diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index c045e06bb..6436e229a 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -27,5 +27,3 @@ da8baf2aa5ce93b958aca90a0ae69f537806324b
 369f9740de4534c28d0e81ab2afc99decbb9a3e6
 # Get rid of .internal.h convention in LIBC_INTRIN
 86d884cce24d773e298a2714c1e3d91ecab9be45
-# Remove .internal from more header filenames
-31194165d2afca36c2315a6e7ca2f0797dde09e3
diff --git a/.gitattributes b/.gitattributes
index ffcf7856a..fa3e12742 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,10 +1,4 @@
 # -*- conf -*-
 *.gz                         binary
-*.so                         binary
-*.dll                        binary
-*.dylib                      binary
-/build/bootstrap/*           binary
-/usr/share/terminfo/*        binary
-/usr/share/terminfo/*/*      binary
+/build/bootstrap/*.com       binary
 /usr/share/zoneinfo/*        binary
-/usr/share/zoneinfo/*/*      binary
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7de803d3a..c558453d5 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,8 +1,5 @@
 name: build
 
-env:
-  COSMOCC_VERSION: 3.9.2
-
 on:
   push:
     branches:
@@ -22,48 +19,13 @@ jobs:
       matrix:
         mode: ["", tiny, rel, tinylinux, optlinux]
     steps:
-      - uses: actions/checkout@v4
-        with:
-          # Full checkout needed for git-restore-mtime-bare.
-          fetch-depth: 0
-
-      # TODO(jart): fork this action.
-      - uses: chetan/git-restore-mtime-action@v2
-
-      - uses: actions/cache/restore@v4
-        id: cache
-        with:
-          path: |
-            .cosmocc
-            o
-          key: ${{ env.COSMOCC_VERSION }}-${{ matrix.mode }}-${{ github.sha }}
-          restore-keys: |
-            ${{ env.COSMOCC_VERSION }}-${{ matrix.mode }}-
-            ${{ env.COSMOCC_VERSION }}-
-
-      - name: Restore mtimes
-        if: steps.cache.outputs.cache-hit == 'true'
-        run: |
-          while read mtime file; do
-            [ -f "$file" ] && touch -d "@$mtime" "$file"
-          done < o/.mtimes
+      - uses: actions/checkout@v3
 
       - name: support ape bins 1
-        run: sudo cp -a build/bootstrap/ape.elf /usr/bin/ape
+        run: sudo cp build/bootstrap/ape.elf /usr/bin/ape
 
       - name: support ape bins 2
         run: sudo sh -c "echo ':APE:M::MZqFpD::/usr/bin/ape:' >/proc/sys/fs/binfmt_misc/register"
 
       - name: make matrix
         run: V=0 make -j2 MODE=${{ matrix.mode }}
-
-      - name: Save mtimes
-        run: |
-          find o -type f -exec stat -c "%Y %n" {} \; > o/.mtimes
-
-      - uses: actions/cache/save@v4
-        with:
-          path: |
-            .cosmocc
-            o
-          key: ${{ env.COSMOCC_VERSION }}-${{ matrix.mode }}-${{ github.sha }}
diff --git a/.github/workflows/nightly-cosmocc.yml b/.github/workflows/nightly-cosmocc.yml
deleted file mode 100644
index 69ccb16d2..000000000
--- a/.github/workflows/nightly-cosmocc.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: Nightly cosmocc
-on:
-  schedule:
-    # https://crontab.guru/#37_4_*_*_*
-    - cron: "37 4 * * *"
-  workflow_dispatch:
-concurrency:
-  group: ${{ github.workflow }}
-  cancel-in-progress: true
-jobs:
-  build-cosmocc:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - run: |
-          sudo cp build/bootstrap/ape.elf /usr/bin/ape
-          sudo sh -c "echo ':APE:M::MZqFpD::/usr/bin/ape:' >/proc/sys/fs/binfmt_misc/register"
-      - run: tool/cosmocc/package.sh
-      # https://github.com/actions/upload-artifact/issues/590
-      - uses: actions/upload-artifact@v4.3.5
-        with:
-          name: cosmocc
-          path: cosmocc
-          compression-level: 9
diff --git a/.gitignore b/.gitignore
index 0c6b21f03..4c767cd51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,3 @@ __pycache__
 /tool/emacs/*.elc
 /perf.data
 /perf.data.old
-/qemu*core
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 6ce0ca591..000000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-    "C_Cpp.default.compilerPath": ".cosmocc/3.9.2/bin/aarch64-linux-cosmo-c++",
-    "C_Cpp.default.compilerArgs": [
-        "-nostdinc",
-        "-nostdlib",
-        "-iquote.",
-        "-isystemlibc/isystem",
-        "-isystemthird_party/libcxx",
-        "-includelibc/integral/normalize.inc",
-        "-D_COSMO_SOURCE",
-        "-D__aarch64__"
-    ],
-    "[c]": {
-        "editor.tabSize": 2,
-        "editor.insertSpaces": true
-    },
-    "[cpp]": {
-        "editor.tabSize": 2,
-        "editor.insertSpaces": true
-    },
-    "[makefile]": {
-        "editor.tabSize": 8,
-        "editor.insertSpaces": false
-    },
-    "[make]": {
-        "editor.tabSize": 8,
-        "editor.insertSpaces": false
-    },
-    "[assembly]": {
-        "editor.tabSize": 8,
-        "editor.insertSpaces": true
-    },
-    "files.associations": {
-        "log.h": "c"
-    }
-}
diff --git a/Makefile b/Makefile
index 33fbcbdae..485a17687 100644
--- a/Makefile
+++ b/Makefile
@@ -77,8 +77,7 @@ COMMA := ,
 PWD := $(shell pwd)
 
 # detect wsl2 running cosmopolitan binaries on the host by checking whether:
-# - user ran .cosmocc/current/bin/make, in which case make's working directory
-#   is in wsl
+# - user ran build/bootstrap/make, in which case make's working directory is in wsl
 # - user ran make, in which case cocmd's working directory is in wsl
 ifneq ($(findstring //wsl.localhost/,$(CURDIR) $(PWD)),)
 $(warning wsl2 interop is enabled)
@@ -90,7 +89,7 @@ UNAME_S := $(shell uname -s)
 
 # apple still distributes a 17 year old version of gnu make
 ifeq ($(MAKE_VERSION), 3.81)
-$(error please use https://cosmo.zip/pub/cosmos/bin/make)
+$(error please use build/bootstrap/make)
 endif
 
 LC_ALL = C
@@ -116,8 +115,10 @@ ZIPCOPY = $(BOOTSTRAP)/zipcopy
 PECHECK = $(BOOTSTRAP)/pecheck
 FIXUPOBJ = $(BOOTSTRAP)/fixupobj
 OBJBINCOPY = $(BOOTSTRAP)/objbincopy
-MKDIR = $(BOOTSTRAP)/mkdir.ape -p
-COMPILE = $(BOOTSTRAP)/compile.ape -V9 -M2048m -P8192 $(QUOTA)
+MKDIR = build/bootstrap/mkdir -p
+COMPILE = build/bootstrap/compile -V9 -M2048m -P8192 $(QUOTA)
+
+IGNORE := $(shell $(MKDIR) $(TMPDIR))
 
 # the default build modes is empty string
 # on x86_64 hosts, MODE= is the same as MODE=x86_64
@@ -133,13 +134,14 @@ endif
 
 ifneq ($(findstring aarch64,$(MODE)),)
 ARCH = aarch64
-HOSTS ?= pi pi5 studio freebsdarm
+HOSTS ?= pi studio freebsdarm
 else
 ARCH = x86_64
-HOSTS ?= freebsd rhel7 xnu openbsd netbsd win10 luna
+HOSTS ?= freebsd rhel7 xnu openbsd netbsd win10
 endif
 
 ZIPOBJ_FLAGS += -a$(ARCH)
+IGNORE := $(shell $(MKDIR) $(TMPDIR))
 
 export ADDR2LINE
 export LC_ALL
@@ -148,12 +150,10 @@ export MODE
 export SOURCE_DATE_EPOCH
 export TMPDIR
 
-COSMOCC = .cosmocc/3.9.2
+COSMOCC = .cosmocc/3.6.0
 BOOTSTRAP = $(COSMOCC)/bin
 TOOLCHAIN = $(COSMOCC)/bin/$(ARCH)-linux-cosmo-
-DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.9.2 f4ff13af65fcd309f3f1cfd04275996fb7f72a4897726628a8c9cf732e850193)
-
-IGNORE := $(shell $(MKDIR) $(TMPDIR))
+DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.6.0 4918c45ac3e0972ff260e2a249e25716881e39fb679d5e714ae216a2ef6c3f7e)
 
 AS = $(TOOLCHAIN)as
 CC = $(TOOLCHAIN)gcc
@@ -275,16 +275,10 @@ include libc/BUILD.mk				#─┘
 include libc/sock/BUILD.mk			#─┐
 include net/http/BUILD.mk			# ├──ONLINE RUNTIME
 include third_party/musl/BUILD.mk		# │  You can communicate with the network
-include third_party/regex/BUILD.mk		# │
-include third_party/tr/BUILD.mk			# │
-include third_party/sed/BUILD.mk		# │
-include libc/system/BUILD.mk			# │
 include libc/x/BUILD.mk				# │
 include dsp/scale/BUILD.mk			# │
 include dsp/mpeg/BUILD.mk			# │
 include dsp/tty/BUILD.mk			# │
-include dsp/audio/BUILD.mk			# │
-include dsp/prog/BUILD.mk			# │
 include dsp/BUILD.mk				# │
 include third_party/stb/BUILD.mk		# │
 include third_party/mbedtls/BUILD.mk		# │
@@ -298,7 +292,8 @@ include third_party/libcxx/BUILD.mk		# │
 include third_party/openmp/BUILD.mk		# │
 include third_party/pcre/BUILD.mk		# │
 include third_party/less/BUILD.mk		# │
-include net/https/BUILD.mk			#─┘
+include net/https/BUILD.mk			# │
+include third_party/regex/BUILD.mk		#─┘
 include third_party/tidy/BUILD.mk
 include third_party/BUILD.mk
 include third_party/nsync/testing/BUILD.mk
@@ -317,6 +312,8 @@ include third_party/double-conversion/test/BUILD.mk
 include third_party/lua/BUILD.mk
 include third_party/tree/BUILD.mk
 include third_party/zstd/BUILD.mk
+include third_party/tr/BUILD.mk
+include third_party/sed/BUILD.mk
 include third_party/awk/BUILD.mk
 include third_party/hiredis/BUILD.mk
 include third_party/make/BUILD.mk
@@ -369,7 +366,6 @@ include test/libc/fmt/BUILD.mk
 include test/libc/time/BUILD.mk
 include test/libc/proc/BUILD.mk
 include test/libc/stdio/BUILD.mk
-include test/libc/system/BUILD.mk
 include test/libc/BUILD.mk
 include test/net/http/BUILD.mk
 include test/net/https/BUILD.mk
@@ -433,71 +429,68 @@ HTAGS:	o/$(MODE)/hdrs-old.txt $(filter-out third_party/libcxx/%,$(HDRS)) #o/$(MO
 
 loc: private .UNSANDBOXED = 1
 loc: o/$(MODE)/tool/build/summy
-	find -name \*.h -or -name \*.hpp -or -name \*.c -or -name \*.cc -or -name \*.cpp -or -name \*.S -or -name \*.mk | \
+	find -name \*.h -or -name \*.c -or -name \*.S | \
 	$(XARGS) wc -l | grep total | awk '{print $$1}' | $<
 
-COSMOPOLITAN =				\
+# PLEASE: MAINTAIN TOPOLOGICAL ORDER
+# FROM HIGHEST LEVEL TO LOWEST LEVEL
+COSMOPOLITAN_OBJECTS =			\
 	CTL				\
-	DSP_AUDIO			\
-	LIBC_CALLS			\
-	LIBC_DLOPEN			\
-	LIBC_ELF			\
-	LIBC_FMT			\
-	LIBC_INTRIN			\
-	LIBC_IRQ			\
-	LIBC_LOG			\
-	LIBC_MEM			\
-	LIBC_NEXGEN32E			\
-	LIBC_NT_ADVAPI32		\
-	LIBC_NT_BCRYPTPRIMITIVES	\
-	LIBC_NT_COMDLG32		\
-	LIBC_NT_GDI32			\
-	LIBC_NT_IPHLPAPI		\
-	LIBC_NT_KERNEL32		\
-	LIBC_NT_NTDLL			\
-	LIBC_NT_PDH			\
-	LIBC_NT_POWRPROF		\
-	LIBC_NT_PSAPI			\
-	LIBC_NT_REALTIME		\
-	LIBC_NT_SHELL32			\
-	LIBC_NT_SYNCHRONIZATION		\
-	LIBC_NT_USER32			\
-	LIBC_NT_WS2_32			\
-	LIBC_PROC			\
-	LIBC_RUNTIME			\
-	LIBC_SOCK			\
-	LIBC_STDIO			\
-	LIBC_STR			\
-	LIBC_SYSTEM			\
-	LIBC_SYSV			\
-	LIBC_SYSV_CALLS			\
-	LIBC_THREAD			\
-	LIBC_TINYMATH			\
-	LIBC_VGA			\
-	LIBC_X				\
-	NET_HTTP			\
-	THIRD_PARTY_COMPILER_RT		\
-	THIRD_PARTY_DLMALLOC		\
 	THIRD_PARTY_DOUBLECONVERSION	\
-	THIRD_PARTY_GDTOA		\
+	THIRD_PARTY_OPENMP		\
+	TOOL_ARGS			\
+	NET_HTTP			\
+	LIBC_SOCK			\
+	LIBC_NT_WS2_32			\
+	LIBC_NT_IPHLPAPI		\
+	LIBC_X				\
 	THIRD_PARTY_GETOPT		\
+	LIBC_LOG			\
+	THIRD_PARTY_TZ			\
+	THIRD_PARTY_MUSL		\
+	THIRD_PARTY_ZLIB_GZ		\
 	THIRD_PARTY_LIBCXXABI		\
 	THIRD_PARTY_LIBUNWIND		\
-	THIRD_PARTY_MUSL		\
-	THIRD_PARTY_NSYNC		\
-	THIRD_PARTY_NSYNC_MEM		\
-	THIRD_PARTY_OPENMP		\
-	THIRD_PARTY_PUFF		\
+	LIBC_STDIO			\
+	THIRD_PARTY_GDTOA		\
 	THIRD_PARTY_REGEX		\
-	THIRD_PARTY_TZ			\
-	THIRD_PARTY_XED			\
+	LIBC_THREAD			\
+	LIBC_PROC			\
+	THIRD_PARTY_NSYNC_MEM		\
+	LIBC_MEM			\
+	THIRD_PARTY_DLMALLOC		\
+	LIBC_DLOPEN			\
+	LIBC_RUNTIME			\
+	THIRD_PARTY_NSYNC		\
+	LIBC_ELF			\
+	LIBC_IRQ			\
+	LIBC_CALLS			\
+	LIBC_SYSV_CALLS			\
+	LIBC_VGA			\
+	LIBC_NT_PSAPI			\
+	LIBC_NT_POWRPROF		\
+	LIBC_NT_PDH			\
+	LIBC_NT_GDI32			\
+	LIBC_NT_COMDLG32		\
+	LIBC_NT_USER32			\
+	LIBC_NT_NTDLL			\
+	LIBC_NT_ADVAPI32		\
+	LIBC_NT_SYNCHRONIZATION		\
+	LIBC_FMT			\
 	THIRD_PARTY_ZLIB		\
-	THIRD_PARTY_ZLIB_GZ		\
-	TOOL_ARGS			\
+	THIRD_PARTY_PUFF		\
+	THIRD_PARTY_COMPILER_RT		\
+	LIBC_TINYMATH			\
+	THIRD_PARTY_XED			\
+	LIBC_STR			\
+	LIBC_SYSV			\
+	LIBC_INTRIN			\
+	LIBC_NT_BCRYPTPRIMITIVES	\
+	LIBC_NT_KERNEL32		\
+	LIBC_NEXGEN32E
 
 COSMOPOLITAN_H_PKGS =			\
 	APE				\
-	DSP_AUDIO			\
 	LIBC				\
 	LIBC_CALLS			\
 	LIBC_ELF			\
@@ -541,14 +534,14 @@ COSMOCC_PKGS =				\
 	THIRD_PARTY_INTEL
 
 o/$(MODE)/cosmopolitan.a:		\
-		$(call reverse,$(call uniq,$(foreach x,$(COSMOPOLITAN),$($(x)))))
+		$(foreach x,$(COSMOPOLITAN_OBJECTS),$($(x)_A_OBJS))
 
 COSMOCC_HDRS =								\
 	$(wildcard libc/integral/*)					\
 	$(foreach x,$(COSMOCC_PKGS),$($(x)_HDRS))			\
 	$(foreach x,$(COSMOCC_PKGS),$($(x)_INCS))
 
-o/cosmocc.h.txt: Makefile libc $(MAKEFILES) $(call uniq,$(foreach x,$(HDRS) $(INCS),$(dir $(x)))) $(HDRS) $(INCS)
+o/cosmocc.h.txt: Makefile
 	$(file >$@, $(call uniq,$(COSMOCC_HDRS)))
 
 COSMOPOLITAN_H_ROOT_HDRS =						\
diff --git a/README.md b/README.md
index 75851c8be..d447a47ad 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,12 @@
 [![build](https://github.com/jart/cosmopolitan/actions/workflows/build.yml/badge.svg)](https://github.com/jart/cosmopolitan/actions/workflows/build.yml)
 # Cosmopolitan
 
-[Cosmopolitan Libc](https://justine.lol/cosmopolitan/index.html) makes C/C++
+[Cosmopolitan Libc](https://justine.lol/cosmopolitan/index.html) makes C
 a build-once run-anywhere language, like Java, except it doesn't need an
 interpreter or virtual machine. Instead, it reconfigures stock GCC and
 Clang to output a POSIX-approved polyglot format that runs natively on
-Linux + Mac + Windows + FreeBSD + OpenBSD 7.3 + NetBSD + BIOS with the
-best possible performance and the tiniest footprint imaginable.
+Linux + Mac + Windows + FreeBSD + OpenBSD + NetBSD + BIOS with the best
+possible performance and the tiniest footprint imaginable.
 
 ## Background
 
@@ -87,22 +87,15 @@ ape/apeinstall.sh
 ```
 
 You can now build the mono repo with any modern version of GNU Make. To
-bootstrap your build, you can install Cosmopolitan Make from this site:
-
-https://cosmo.zip/pub/cosmos/bin/make
-
-E.g.:
+make life easier, we've included one in the cosmocc toolchain, which is
+guaranteed to be compatible and furthermore includes our extensions for
+doing build system sandboxing.
 
 ```sh
-curl -LO https://cosmo.zip/pub/cosmos/bin/make
-./make -j8
+build/bootstrap/make -j8
 o//examples/hello
 ```
 
-After you've built the repo once, you can also use the make from your
-cosmocc at `.cosmocc/current/bin/make`. You might even prefer to alias
-make to `$COSMO/.cosmocc/current/bin/make`.
-
 Since the Cosmopolitan repository is very large, you might only want to
 build one particular thing. Here's an example of a target that can be
 compiled relatively quickly, which is a simple POSIX test that only
@@ -110,7 +103,7 @@ depends on core LIBC packages.
 
 ```sh
 rm -rf o//libc o//test
-.cosmocc/current/bin/make o//test/posix/signal_test
+build/bootstrap/make o//test/posix/signal_test
 o//test/posix/signal_test
 ```
 
@@ -119,21 +112,21 @@ list out each individual one. For example if you wanted to build and run
 all the unit tests in the `TEST_POSIX` package, you could say:
 
 ```sh
-.cosmocc/current/bin/make o//test/posix
+build/bootstrap/make o//test/posix
 ```
 
 Cosmopolitan provides a variety of build modes. For example, if you want
 really tiny binaries (as small as 12kb in size) then you'd say:
 
 ```sh
-.cosmocc/current/bin/make m=tiny
+build/bootstrap/make m=tiny
 ```
 
 You can furthermore cut out the bloat of other operating systems, and
 have Cosmopolitan become much more similar to Musl Libc.
 
 ```sh
-.cosmocc/current/bin/make m=tinylinux
+build/bootstrap/make m=tinylinux
 ```
 
 For further details, see [//build/config.mk](build/config.mk).
@@ -249,16 +242,16 @@ server. You're welcome to join us! <https://discord.gg/FwAVVu7eJ4>
 
 ## Support Vector
 
-| Platform       | Min Version    | Circa |
-| :---           | ---:           | ---:  |
-| AMD            | K8             | 2003  |
-| Intel          | Core           | 2006  |
-| Linux          | 2.6.18         | 2007  |
-| Windows        | 8 [1]          | 2012  |
-| Darwin (macOS) | 23.1.0+        | 2023  |
-| OpenBSD        | 7.3 or earlier | 2023  |
-| FreeBSD        | 13             | 2020  |
-| NetBSD         | 9.2            | 2021  |
+| Platform       | Min Version | Circa |
+| :---           | ---:        | ---:  |
+| AMD            | K8          | 2003  |
+| Intel          | Core        | 2006  |
+| Linux          | 2.6.18      | 2007  |
+| Windows        | 8 [1]       | 2012  |
+| Darwin (macOS) | 23.1.0+     | 2023  |
+| OpenBSD        | 7           | 2021  |
+| FreeBSD        | 13          | 2020  |
+| NetBSD         | 9.2         | 2021  |
 
 [1] See our [vista branch](https://github.com/jart/cosmopolitan/tree/vista)
     for a community supported version of Cosmopolitan that works on Windows
diff --git a/ape/BUILD.mk b/ape/BUILD.mk
index 3e8ea3137..25542e0d5 100644
--- a/ape/BUILD.mk
+++ b/ape/BUILD.mk
@@ -45,10 +45,10 @@ o/$(MODE)/ape: $(APE)
 
 o/$(MODE)/ape/aarch64.lds:			\
 	ape/aarch64.lds				\
-	libc/zip.h				\
+	libc/zip.internal.h			\
 	libc/thread/tls.h			\
 	libc/calls/struct/timespec.h		\
-	libc/macros.h				\
+	libc/macros.internal.h			\
 	libc/str/str.h
 
 APE_LOADER_LDFLAGS =				\
@@ -162,8 +162,8 @@ o/$(MODE)/ape/ape-no-modify-self.o:		\
 		libc/dce.h			\
 		libc/elf/def.h			\
 		libc/thread/tls.h		\
-		libc/macho.h			\
-		libc/macros.h			\
+		libc/macho.internal.h		\
+		libc/macros.internal.h		\
 		libc/nexgen32e/uart.internal.h	\
 		libc/calls/metalfile.internal.h	\
 		libc/nt/pedef.internal.h	\
@@ -188,8 +188,8 @@ o/$(MODE)/ape/ape-copy-self.o:			\
 		libc/dce.h			\
 		libc/elf/def.h			\
 		libc/thread/tls.h		\
-		libc/macho.h			\
-		libc/macros.h			\
+		libc/macho.internal.h		\
+		libc/macros.internal.h		\
 		libc/nexgen32e/uart.internal.h	\
 		libc/calls/metalfile.internal.h	\
 		libc/nt/pedef.internal.h	\
@@ -246,6 +246,8 @@ o/$(MODE)/ape:	$(APE_CHECKS)			\
 		o/$(MODE)/ape/ape.lds		\
 		o/$(MODE)/ape/ape.elf		\
 		o/$(MODE)/ape/ape.macho		\
+		o/$(MODE)/ape/ape-copy-self.o	\
+		o/$(MODE)/ape/ape-no-modify-self.o
 
 endif
 
@@ -259,8 +261,8 @@ o/$(MODE)/ape/ape.o:				\
 		libc/thread/tls.h		\
 		ape/ape.internal.h		\
 		ape/macros.internal.h		\
-		libc/macho.h			\
-		libc/macros.h			\
+		libc/macho.internal.h		\
+		libc/macros.internal.h		\
 		libc/sysv/consts/prot.h		\
 		libc/nt/pedef.internal.h	\
 		libc/runtime/pc.internal.h	\
@@ -281,7 +283,7 @@ o/$(MODE)/ape/ape.lds:				\
 		libc/dce.h			\
 		libc/elf/def.h			\
 		libc/elf/pf2prot.internal.h	\
-		libc/macros.h			\
+		libc/macros.internal.h		\
 		libc/nt/pedef.internal.h	\
 		libc/str/str.h			\
-		libc/zip.h
+		libc/zip.internal.h
diff --git a/ape/aarch64.lds b/ape/aarch64.lds
index 48562d2a2..356ff3ae7 100644
--- a/ape/aarch64.lds
+++ b/ape/aarch64.lds
@@ -103,8 +103,10 @@ SECTIONS {
     *(.eh_frame_entry .eh_frame_entry.*)
   }
 
-  __eh_frame_hdr_start = SIZEOF(.eh_frame_hdr) > 0 ? ADDR(.eh_frame_hdr) : 0;
-  __eh_frame_hdr_end = SIZEOF(.eh_frame_hdr) > 0 ? . : 0;
+  .eh_frame : ONLY_IF_RO {
+    KEEP(*(.eh_frame))
+    *(.eh_frame.*)
+  }
 
   .gcc_except_table : ONLY_IF_RO {
     *(.gcc_except_table .gcc_except_table.*)
@@ -125,11 +127,9 @@ SECTIONS {
   . += CONSTANT(MAXPAGESIZE);
   . = DATA_SEGMENT_ALIGN(CONSTANT(MAXPAGESIZE), CONSTANT(COMMONPAGESIZE));
 
-  .eh_frame : {
-    __eh_frame_start = .;
+  .eh_frame : ONLY_IF_RW {
     KEEP(*(.eh_frame))
     *(.eh_frame.*)
-    __eh_frame_end = .;
   }
 
   .gnu_extab : ONLY_IF_RW {
@@ -259,9 +259,6 @@ SECTIONS {
   .debug_ranges       0 : { *(.debug_ranges) }
   .debug_macro        0 : { *(.debug_macro) }
   .debug_addr         0 : { *(.debug_addr) }
-  .debug_names        0 : { *(.debug_names) }
-  .debug_loclists     0 : { *(.debug_loclists) }
-  .debug_str_offsets  0 : { *(.debug_str_offsets) }
   .ARM.attributes     0 : { KEEP(*(.ARM.attributes)) KEEP(*(.gnu.attributes)) }
   .note.gnu.arm.ident 0 : { KEEP(*(.note.gnu.arm.ident)) }
 
diff --git a/ape/ape-m1.c b/ape/ape-m1.c
index 2d677f22b..8d188404a 100644
--- a/ape/ape-m1.c
+++ b/ape/ape-m1.c
@@ -16,12 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#ifndef __APPLE__
-#error "ape/ape-m1.c is for apple silicon. chances you want ape/loader.c"
-#endif
-#ifndef __aarch64__
-#error "ape/ape-m1.c is for apple silicon; you want: make o//ape/ape.macho"
-#endif
 #include <assert.h>
 #include <dispatch/dispatch.h>
 #include <dlfcn.h>
diff --git a/ape/ape.S b/ape/ape.S
index f274e31f6..2ec05e963 100644
--- a/ape/ape.S
+++ b/ape/ape.S
@@ -37,7 +37,7 @@
 #include "libc/calls/metalfile.internal.h"
 #include "libc/dce.h"
 #include "libc/elf/def.h"
-#include "libc/macho.h"
+#include "libc/macho.internal.h"
 #include "libc/nexgen32e/uart.internal.h"
 #include "libc/nt/pedef.internal.h"
 #include "libc/runtime/pc.internal.h"
diff --git a/ape/ape.lds b/ape/ape.lds
index ac82bde00..4e6db724a 100644
--- a/ape/ape.lds
+++ b/ape/ape.lds
@@ -310,7 +310,7 @@ SECTIONS {
     . = ALIGN(__privileged_end > __privileged_start ? CONSTANT(COMMONPAGESIZE) : 0);
 /*END: morphable code */
     __privileged_start = .;
-    *(.privileged .privileged.*)
+    *(.privileged)
     __privileged_end = .;
 
     KEEP(*(.ape.pad.text))
@@ -329,10 +329,6 @@ SECTIONS {
     *(.ubsan.types)
     *(.ubsan.data)
 
-    __eh_frame_hdr_start_actual = .;
-    *(.eh_frame_hdr)
-    __eh_frame_hdr_end_actual = .;
-
     /* Legal Notices */
     __notices = .;
     KEEP(*(.notice))
@@ -386,13 +382,6 @@ SECTIONS {
     _tbss_end = .;
   } :Tls
 
-  .eh_frame : {
-    __eh_frame_start = .;
-    KEEP(*(.eh_frame))
-    *(.eh_frame.*)
-    __eh_frame_end = .;
-  } :Ram
-
   .data . : {
 /*BEGIN: Read/Write Data */
 #if SupportsWindows()
@@ -441,6 +430,7 @@ SECTIONS {
     KEEP(*(.piro.pad.data))
     *(.igot.plt)
     KEEP(*(.dataepilogue))
+
     . = ALIGN(. != 0 ? CONSTANT(COMMONPAGESIZE) : 0);
 /*END: NT FORK COPYING */
     _edata = .;
@@ -520,9 +510,6 @@ SECTIONS {
   .debug_rnglists    0 : { *(.debug_rnglists) }
   .debug_macro       0 : { *(.debug_macro) }
   .debug_addr        0 : { *(.debug_addr) }
-  .debug_names       0 : { *(.debug_names) }
-  .debug_loclists    0 : { *(.debug_loclists) }
-  .debug_str_offsets 0 : { *(.debug_str_offsets) }
   .gnu.attributes    0 : { KEEP(*(.gnu.attributes)) }
   .GCC.command.line  0 : { *(.GCC.command.line) }
 
@@ -586,11 +573,11 @@ ape_rom_memsz = ape_rom_filesz;
 ape_rom_align = CONSTANT(COMMONPAGESIZE);
 ape_rom_rva = RVA(ape_rom_vaddr);
 
-ape_ram_vaddr = ADDR(.eh_frame);
+ape_ram_vaddr = ADDR(.data);
 ape_ram_offset = ape_ram_vaddr - __executable_start;
-ape_ram_paddr = LOADADDR(.eh_frame);
-ape_ram_filesz = ADDR(.bss) - ADDR(.eh_frame);
-ape_ram_memsz = _end - ADDR(.eh_frame);
+ape_ram_paddr = LOADADDR(.data);
+ape_ram_filesz = ADDR(.bss) - ADDR(.data);
+ape_ram_memsz = _end - ADDR(.data);
 ape_ram_align = CONSTANT(COMMONPAGESIZE);
 ape_ram_rva = RVA(ape_ram_vaddr);
 
@@ -600,7 +587,7 @@ ape_stack_offset = 0;
 ape_stack_vaddr = DEFINED(ape_stack_vaddr) ? ape_stack_vaddr : 0x700000000000;
 ape_stack_paddr = ape_ram_paddr + ape_ram_filesz;
 ape_stack_filesz = 0;
-ape_stack_memsz = DEFINED(ape_stack_memsz) ? ape_stack_memsz : 4 * 1024 * 1024;
+ape_stack_memsz = DEFINED(ape_stack_memsz) ? ape_stack_memsz : 8 * 1024 * 1024;
 
 ape_note_offset = ape_cod_offset + (ape_note - ape_cod_vaddr);
 ape_note_filesz = ape_note_end - ape_note;
@@ -614,9 +601,6 @@ ape_text_memsz = ape_text_filesz;
 ape_text_align = CONSTANT(COMMONPAGESIZE);
 ape_text_rva = RVA(ape_text_vaddr);
 
-__eh_frame_hdr_start = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_start_actual : 0;
-__eh_frame_hdr_end = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_end_actual : 0;
-
 /* we roundup here because xnu wants the file load segments page-aligned */
 /* but we don't want to add the nop padding to the ape program, so we'll */
 /* let ape.S dd read past the end of the file into the wrapping binaries */
diff --git a/ape/apeinstall.sh b/ape/apeinstall.sh
index 73f24965f..2a0a28590 100755
--- a/ape/apeinstall.sh
+++ b/ape/apeinstall.sh
@@ -10,8 +10,8 @@ if [ ! -f ape/loader.c ]; then
   cd "$COSMO" || exit
 fi
 
-if [ -x .cosmocc/current/bin/make ]; then
-  MAKE=.cosmocc/current/bin/make
+if [ -x build/bootstrap/make ]; then
+  MAKE=build/bootstrap/make
 else
   MAKE=make
 fi
diff --git a/ape/launch.S b/ape/launch.S
index f710fdec7..ae2cb58a1 100644
--- a/ape/launch.S
+++ b/ape/launch.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Calls _start() function of loaded program.
 //
diff --git a/ape/loader-macho.S b/ape/loader-macho.S
index e484f0686..bcecf9dac 100644
--- a/ape/loader-macho.S
+++ b/ape/loader-macho.S
@@ -16,10 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macho.h"
+#include "libc/macho.internal.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Apple Mach-O Executable Headers
 //	Fixups are applied by objbincopy
diff --git a/ape/macros.internal.h b/ape/macros.internal.h
index dcfdc75a0..ad354e474 100644
--- a/ape/macros.internal.h
+++ b/ape/macros.internal.h
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #ifndef APE_MACROS_H_
 #define APE_MACROS_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #ifdef __ASSEMBLER__
 /* clang-format off */
 
diff --git a/ape/start.S b/ape/start.S
index e497fc852..c148966e1 100644
--- a/ape/start.S
+++ b/ape/start.S
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "ape/ape.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #ifdef __aarch64__
 
diff --git a/ape/systemcall.S b/ape/systemcall.S
index 91daedc95..c98632fd5 100644
--- a/ape/systemcall.S
+++ b/ape/systemcall.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Invokes system call.
 //
diff --git a/build/bootstrap/ape.aarch64 b/build/bootstrap/ape.aarch64
index 794864ad0..2421e7248 100755
Binary files a/build/bootstrap/ape.aarch64 and b/build/bootstrap/ape.aarch64 differ
diff --git a/build/bootstrap/ape.elf b/build/bootstrap/ape.elf
index db75ffb3a..5ffffe84b 100755
Binary files a/build/bootstrap/ape.elf and b/build/bootstrap/ape.elf differ
diff --git a/build/bootstrap/ape.macho b/build/bootstrap/ape.macho
index 77bcfb233..d6d22567b 100755
Binary files a/build/bootstrap/ape.macho and b/build/bootstrap/ape.macho differ
diff --git a/build/bootstrap/cocmd b/build/bootstrap/cocmd
index f16d26c8d..58ecb0a0d 100755
Binary files a/build/bootstrap/cocmd and b/build/bootstrap/cocmd differ
diff --git a/build/bootstrap/compile b/build/bootstrap/compile
new file mode 100755
index 000000000..3fb641378
Binary files /dev/null and b/build/bootstrap/compile differ
diff --git a/build/bootstrap/make b/build/bootstrap/make
new file mode 100755
index 000000000..48aba621f
Binary files /dev/null and b/build/bootstrap/make differ
diff --git a/build/bootstrap/mkdir b/build/bootstrap/mkdir
new file mode 100755
index 000000000..4c359071e
Binary files /dev/null and b/build/bootstrap/mkdir differ
diff --git a/build/config.mk b/build/config.mk
index c18308725..81ee8b711 100644
--- a/build/config.mk
+++ b/build/config.mk
@@ -82,7 +82,7 @@ ENABLE_FTRACE = 1
 CONFIG_OFLAGS ?= -g -ggdb
 CONFIG_CPPFLAGS += -DNDEBUG -DSYSDEBUG
 CONFIG_CCFLAGS += $(BACKTRACES) -O3 -fmerge-all-constants
-CONFIG_TARGET_ARCH ?= -march=native
+TARGET_ARCH ?= -march=native
 endif
 
 # Optimized Linux Mode
@@ -100,20 +100,7 @@ CONFIG_OFLAGS ?= -g -ggdb
 CONFIG_CPPFLAGS += -DNDEBUG -DSYSDEBUG -DSUPPORT_VECTOR=1
 CONFIG_CCFLAGS += -O3 -fmerge-all-constants
 CONFIG_COPTS += -mred-zone
-CONFIG_TARGET_ARCH ?= -march=native
-endif
-ifeq ($(MODE), x86_64-optlinux)
-CONFIG_OFLAGS ?= -g -ggdb
-CONFIG_CPPFLAGS += -DNDEBUG -DSYSDEBUG -DSUPPORT_VECTOR=1
-CONFIG_CCFLAGS += -O3 -fmerge-all-constants
-CONFIG_COPTS += -mred-zone
-CONFIG_TARGET_ARCH ?= -march=native
-endif
-ifeq ($(MODE), aarch64-optlinux)
-CONFIG_OFLAGS ?= -g -ggdb
-CONFIG_CPPFLAGS += -DNDEBUG -DSYSDEBUG -DSUPPORT_VECTOR=1
-CONFIG_CCFLAGS += -O3 -fmerge-all-constants
-CONFIG_COPTS += -mred-zone
+TARGET_ARCH ?= -march=native
 endif
 
 # Release Mode
@@ -149,21 +136,8 @@ endif
 ifeq ($(MODE), dbg)
 ENABLE_FTRACE = 1
 CONFIG_OFLAGS ?= -g -ggdb
-OVERRIDE_CFLAGS += -O0
-OVERRIDE_CXXFLAGS += -O0
-CONFIG_CPPFLAGS += -DMODE_DBG -D__SANITIZE_UNDEFINED__ -Wno-unused-variable -Wno-unused-but-set-variable
-CONFIG_CCFLAGS += $(BACKTRACES) -DSYSDEBUG
-CONFIG_COPTS += -fsanitize=undefined
-OVERRIDE_CCFLAGS += -fno-pie
-QUOTA ?= -C64 -L300
-endif
-ifeq ($(MODE), x86_64-dbg)
-ENABLE_FTRACE = 1
-CONFIG_OFLAGS ?= -g -ggdb
-OVERRIDE_CFLAGS += -O0
-OVERRIDE_CXXFLAGS += -O0
-CONFIG_CPPFLAGS += -DMODE_DBG -D__SANITIZE_UNDEFINED__ -Wno-unused-variable -Wno-unused-but-set-variable
-CONFIG_CCFLAGS += $(BACKTRACES) -DSYSDEBUG
+CONFIG_CPPFLAGS += -DMODE_DBG -D__SANITIZE_UNDEFINED__
+CONFIG_CCFLAGS += $(BACKTRACES) -DSYSDEBUG -O0 -fno-inline
 CONFIG_COPTS += -fsanitize=undefined
 OVERRIDE_CCFLAGS += -fno-pie
 QUOTA ?= -C64 -L300
@@ -171,10 +145,8 @@ endif
 ifeq ($(MODE), aarch64-dbg)
 ENABLE_FTRACE = 1
 CONFIG_OFLAGS ?= -g -ggdb
-OVERRIDE_CFLAGS += -O0 -fdce
-OVERRIDE_CXXFLAGS += -O0 -fdce
-CONFIG_CPPFLAGS += -DMODE_DBG -D__SANITIZE_UNDEFINED__ -Wno-unused-variable -Wno-unused-but-set-variable
-CONFIG_CCFLAGS += $(BACKTRACES) -DSYSDEBUG
+CONFIG_CPPFLAGS += -DMODE_DBG -D__SANITIZE_UNDEFINED__
+CONFIG_CCFLAGS += $(BACKTRACES) -DSYSDEBUG -O0 -fno-inline -fdce
 CONFIG_COPTS += -fsanitize=undefined
 QUOTA ?= -C64 -L300
 endif
@@ -223,6 +195,8 @@ CONFIG_CCFLAGS +=			\
 	-momit-leaf-frame-pointer	\
 	-foptimize-sibling-calls	\
 	-DDWARFLESS
+TARGET_ARCH ?=				\
+	-msse3
 PYFLAGS +=				\
 	-O2				\
 	-B
@@ -242,6 +216,8 @@ CONFIG_CCFLAGS +=			\
 	-momit-leaf-frame-pointer	\
 	-foptimize-sibling-calls	\
 	-DDWARFLESS
+TARGET_ARCH ?=				\
+	-msse3
 PYFLAGS +=				\
 	-O2				\
 	-B
@@ -293,6 +269,8 @@ CONFIG_CCFLAGS +=			\
 	-fno-align-jumps		\
 	-fno-align-labels		\
 	-fno-align-loops
+TARGET_ARCH ?=				\
+	-msse3
 endif
 
 # Linux+BSD Tiny Mode
@@ -322,6 +300,8 @@ CONFIG_CCFLAGS +=		\
 	-fno-align-jumps	\
 	-fno-align-labels	\
 	-fno-align-loops
+TARGET_ARCH ?=			\
+	-msse3
 endif
 
 # Unix Tiny Mode
@@ -350,6 +330,8 @@ CONFIG_CCFLAGS +=		\
 	-fno-align-jumps	\
 	-fno-align-labels	\
 	-fno-align-loops
+TARGET_ARCH ?=			\
+	-msse3
 endif
 
 # Tiny Metallic Unix Mode
@@ -378,6 +360,8 @@ CONFIG_CCFLAGS +=		\
 	-fno-align-jumps	\
 	-fno-align-labels	\
 	-fno-align-loops
+TARGET_ARCH ?=			\
+	-msse3
 endif
 
 # no x87 instructions mode
@@ -510,5 +494,3 @@ ifeq ($(ARCH), aarch64)
 CONFIG_CCFLAGS += -fpatchable-function-entry=7,6
 endif
 endif
-
-TARGET_ARCH ?= $(CONFIG_TARGET_ARCH)
diff --git a/build/definitions.mk b/build/definitions.mk
index 6682f79b7..774983244 100644
--- a/build/definitions.mk
+++ b/build/definitions.mk
@@ -92,7 +92,10 @@ DEFAULT_COPTS ?=							\
 	-fno-gnu-unique							\
 	-fstrict-aliasing						\
 	-fstrict-overflow						\
-	-fno-semantic-interposition
+	-fno-semantic-interposition					\
+	-fno-dwarf2-cfi-asm						\
+	-fno-unwind-tables						\
+	-fno-asynchronous-unwind-tables
 
 ifeq ($(ARCH), x86_64)
 # Microsoft says "[a]ny memory below the stack beyond the red zone
@@ -112,10 +115,14 @@ ifeq ($(ARCH), aarch64)
 # - Cosmopolitan Libc uses x28 for thread-local storage because Apple
 #   forbids us from using tpidr_el0 too.
 #
+# - Cosmopolitan currently lacks an implementation of the runtime
+#   libraries needed by the -moutline-atomics flag
+#
 DEFAULT_COPTS +=							\
 	-ffixed-x18							\
 	-ffixed-x28							\
-	-fsigned-char
+	-fsigned-char							\
+	-mno-outline-atomics
 endif
 
 MATHEMATICAL =								\
@@ -136,6 +143,8 @@ DEFAULT_CFLAGS =							\
 
 DEFAULT_CXXFLAGS =							\
 	-std=gnu++23							\
+	-fno-rtti							\
+	-fno-exceptions							\
 	-fuse-cxa-atexit						\
 	-Wno-int-in-bool-context					\
 	-Wno-narrowing							\
diff --git a/build/download-cosmocc.sh b/build/download-cosmocc.sh
index 52c89b091..13310a4e4 100755
--- a/build/download-cosmocc.sh
+++ b/build/download-cosmocc.sh
@@ -99,8 +99,3 @@ rm -f cosmocc.zip cosmocc.zip.sha256sum
 # commit output directory
 cd "${OLDPWD}" || die
 mv "${OUTPUT_TMP}" "${OUTPUT_DIR}" || die
-
-# update current symlink
-BASE=$(basename "${OUTPUT_DIR}")
-DIR=$(dirname "${OUTPUT_DIR}")
-ln -sfn "$BASE" "$DIR/current"
diff --git a/build/objdump b/build/objdump
index 358d8f4c8..f1acb58a5 100755
--- a/build/objdump
+++ b/build/objdump
@@ -6,14 +6,14 @@ if [ -n "$OBJDUMP" ]; then
 fi
 
 find_objdump() {
-  if [ -x .cosmocc/3.9.2/bin/$1-linux-cosmo-objdump ]; then
-    OBJDUMP=.cosmocc/3.9.2/bin/$1-linux-cosmo-objdump
-  elif [ -x .cosmocc/3.9.2/bin/$1-linux-musl-objdump ]; then
-    OBJDUMP=.cosmocc/3.9.2/bin/$1-linux-musl-objdump
-  elif [ -x "$COSMO/.cosmocc/3.9.2/bin/$1-linux-cosmo-objdump" ]; then
-    OBJDUMP="$COSMO/.cosmocc/3.9.2/bin/$1-linux-cosmo-objdump"
-  elif [ -x "$COSMO/.cosmocc/3.9.2/bin/$1-linux-musl-objdump" ]; then
-    OBJDUMP="$COSMO/.cosmocc/3.9.2/bin/$1-linux-musl-objdump"
+  if [ -x .cosmocc/3.3.5/bin/$1-linux-cosmo-objdump ]; then
+    OBJDUMP=.cosmocc/3.3.5/bin/$1-linux-cosmo-objdump
+  elif [ -x .cosmocc/3.3.5/bin/$1-linux-musl-objdump ]; then
+    OBJDUMP=.cosmocc/3.3.5/bin/$1-linux-musl-objdump
+  elif [ -x "$COSMO/.cosmocc/3.3.5/bin/$1-linux-cosmo-objdump" ]; then
+    OBJDUMP="$COSMO/.cosmocc/3.3.5/bin/$1-linux-cosmo-objdump"
+  elif [ -x "$COSMO/.cosmocc/3.3.5/bin/$1-linux-musl-objdump" ]; then
+    OBJDUMP="$COSMO/.cosmocc/3.3.5/bin/$1-linux-musl-objdump"
   else
     echo "error: toolchain not found (try running 'cosmocc --update' or 'make' in the cosmo monorepo)" >&2
     exit 1
diff --git a/build/run b/build/run
index 079bc9991..c7fc0c292 100755
--- a/build/run
+++ b/build/run
@@ -4,5 +4,5 @@ UNAMES=$(uname -s)
 if [ x"$UNAMES" = x"Darwin" ] && [ x"$UNAMEM" = x"arm64" ]; then
   exec ape "$@"
 else
-  exec rusage "$@"
+  exec "$@"
 fi
diff --git a/ctl/conditional.h b/ctl/conditional.h
index 5b63eaa85..976143a1d 100644
--- a/ctl/conditional.h
+++ b/ctl/conditional.h
@@ -17,9 +17,6 @@ struct conditional<false, T, F>
     typedef F type;
 };
 
-template<bool B, typename T, typename F>
-using conditional_t = typename conditional<B, T, F>::type;
-
 } // namespace ctl
 
 #endif // CTL_CONDITIONAL_H_
diff --git a/ctl/is_void.h b/ctl/is_void.h
index 275848d81..04c33145c 100644
--- a/ctl/is_void.h
+++ b/ctl/is_void.h
@@ -19,9 +19,6 @@ template<typename _Tp>
 struct is_void : public is_void_<typename ctl::remove_cv<_Tp>::type>::type
 {};
 
-template<typename T>
-inline constexpr bool is_void_v = is_void<T>::value;
-
 } // namespace ctl
 
 #endif // CTL_IS_VOID_H_
diff --git a/ctl/set.h b/ctl/set.h
index 2216ca851..cc951b98c 100644
--- a/ctl/set.h
+++ b/ctl/set.h
@@ -241,9 +241,8 @@ class set
       private:
         friend class set;
         node_type* node_;
-        node_type* root_;
 
-        explicit reverse_iterator(node_type* node, node_type* root) : node_(node), root_(root)
+        explicit reverse_iterator(node_type* node) : node_(node)
         {
         }
     };
@@ -348,17 +347,17 @@ class set
 
     reverse_iterator rbegin()
     {
-        return reverse_iterator(rightmost(root_), root_);
+        return reverse_iterator(rightmost(root_));
     }
 
     const_reverse_iterator rbegin() const
     {
-        return const_reverse_iterator(rightmost(root_), root_);
+        return const_reverse_iterator(rightmost(root_));
     }
 
     const_reverse_iterator crbegin() const
     {
-        return const_reverse_iterator(rightmost(root_), root_);
+        return const_reverse_iterator(rightmost(root_));
     }
 
     iterator end() noexcept
@@ -378,17 +377,17 @@ class set
 
     reverse_iterator rend()
     {
-        return reverse_iterator(nullptr, root_);
+        return reverse_iterator(nullptr);
     }
 
     const_reverse_iterator rend() const
     {
-        return const_reverse_iterator(nullptr, root_);
+        return const_reverse_iterator(nullptr);
     }
 
     const_reverse_iterator crend() const
     {
-        return const_reverse_iterator(nullptr, root_);
+        return const_reverse_iterator(nullptr);
     }
 
     void clear() noexcept
diff --git a/ctl/shared_ptr.h b/ctl/shared_ptr.h
deleted file mode 100644
index 8aac68070..000000000
--- a/ctl/shared_ptr.h
+++ /dev/null
@@ -1,618 +0,0 @@
-// -*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-
-// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
-#ifndef CTL_SHARED_PTR_H_
-#define CTL_SHARED_PTR_H_
-
-#include "exception.h"
-#include "is_base_of.h"
-#include "is_constructible.h"
-#include "is_convertible.h"
-#include "remove_extent.h"
-#include "unique_ptr.h"
-
-// XXX currently needed to use placement-new syntax (move to cxx.inc?)
-void*
-operator new(size_t, void*) noexcept;
-
-namespace ctl {
-
-class bad_weak_ptr : public exception
-{
-  public:
-    const char* what() const noexcept override
-    {
-        return "ctl::bad_weak_ptr";
-    }
-};
-
-namespace __ {
-
-template<typename T>
-struct ptr_ref
-{
-    using type = T&;
-};
-
-template<>
-struct ptr_ref<void>
-{
-    using type = void;
-};
-
-static inline __attribute__((always_inline)) void
-incref(size_t* r) noexcept
-{
-#ifdef NDEBUG
-    __atomic_fetch_add(r, 1, __ATOMIC_RELAXED);
-#else
-    ssize_t refs = __atomic_fetch_add(r, 1, __ATOMIC_RELAXED);
-    if (refs < 0)
-        __builtin_trap();
-#endif
-}
-
-static inline __attribute__((always_inline)) bool
-decref(size_t* r) noexcept
-{
-    if (!__atomic_fetch_sub(r, 1, __ATOMIC_RELEASE)) {
-        __atomic_thread_fence(__ATOMIC_ACQUIRE);
-        return true;
-    }
-    return false;
-}
-
-class shared_ref
-{
-  public:
-    constexpr shared_ref() noexcept = default;
-    shared_ref(const shared_ref&) = delete;
-    shared_ref& operator=(const shared_ref&) = delete;
-
-    virtual ~shared_ref() = default;
-
-    void keep_shared() noexcept
-    {
-        incref(&shared);
-    }
-
-    void drop_shared() noexcept
-    {
-        if (decref(&shared)) {
-            dispose();
-            drop_weak();
-        }
-    }
-
-    void keep_weak() noexcept
-    {
-        incref(&weak);
-    }
-
-    void drop_weak() noexcept
-    {
-        if (decref(&weak)) {
-            delete this;
-        }
-    }
-
-    size_t use_count() const noexcept
-    {
-        return __atomic_load_n(&shared, __ATOMIC_RELAXED) + 1;
-    }
-
-    size_t weak_count() const noexcept
-    {
-        return __atomic_load_n(&weak, __ATOMIC_RELAXED);
-    }
-
-  private:
-    virtual void dispose() noexcept = 0;
-
-    size_t shared = 0;
-    size_t weak = 0;
-};
-
-template<typename T, typename D>
-class shared_pointer : public shared_ref
-{
-  public:
-    static shared_pointer* make(T* const p, D d)
-    {
-        return make(unique_ptr<T, D>(p, move(d)));
-    }
-
-    static shared_pointer* make(unique_ptr<T, D> p)
-    {
-        return new shared_pointer(p.release(), move(p.get_deleter()));
-    }
-
-  private:
-    shared_pointer(T* const p, D d) noexcept : p(p), d(move(d))
-    {
-    }
-
-    void dispose() noexcept override
-    {
-        move(d)(p);
-    }
-
-    T* const p;
-    [[no_unique_address]] D d;
-};
-
-template<typename T>
-class shared_emplace : public shared_ref
-{
-  public:
-    union
-    {
-        T t;
-    };
-
-    ~shared_emplace() override
-    {
-    }
-
-    template<typename... Args>
-    void construct(Args&&... args)
-    {
-        ::new (&t) T(forward<Args>(args)...);
-    }
-
-    static unique_ptr<shared_emplace> make()
-    {
-        return unique_ptr(new shared_emplace());
-    }
-
-  private:
-    explicit constexpr shared_emplace() noexcept
-    {
-    }
-
-    void dispose() noexcept override
-    {
-        t.~T();
-    }
-};
-
-template<typename T, typename U>
-concept shared_ptr_compatible = is_convertible_v<U*, T*>;
-
-} // namespace __
-
-template<typename T>
-class weak_ptr;
-
-template<typename T>
-class shared_ptr
-{
-  public:
-    using element_type = remove_extent_t<T>;
-    using weak_type = weak_ptr<T>;
-
-    constexpr shared_ptr() noexcept = default;
-    constexpr shared_ptr(nullptr_t) noexcept
-    {
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    explicit shared_ptr(U* const p) : shared_ptr(p, default_delete<U>())
-    {
-    }
-
-    template<typename U, typename D>
-        requires __::shared_ptr_compatible<T, U>
-    shared_ptr(U*, D);
-
-    template<typename U>
-    shared_ptr(const shared_ptr<U>& r, element_type* p) noexcept
-      : p(p), rc(r.rc)
-    {
-        if (rc)
-            rc->keep_shared();
-    }
-
-    template<typename U>
-    shared_ptr(shared_ptr<U>&& r, element_type* p) noexcept : p(p), rc(r.rc)
-    {
-        r.p = nullptr;
-        r.rc = nullptr;
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    shared_ptr(const shared_ptr<U>& r) noexcept : p(r.p), rc(r.rc)
-    {
-        if (rc)
-            rc->keep_shared();
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    shared_ptr(shared_ptr<U>&& r) noexcept : p(r.p), rc(r.rc)
-    {
-        r.p = nullptr;
-        r.rc = nullptr;
-    }
-
-    shared_ptr(const shared_ptr& r) noexcept : p(r.p), rc(r.rc)
-    {
-        if (rc)
-            rc->keep_shared();
-    }
-
-    shared_ptr(shared_ptr&& r) noexcept : p(r.p), rc(r.rc)
-    {
-        r.p = nullptr;
-        r.rc = nullptr;
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    explicit shared_ptr(const weak_ptr<U>& r) : p(r.p), rc(r.rc)
-    {
-        if (r.expired()) {
-            throw bad_weak_ptr();
-        }
-        rc->keep_shared();
-    }
-
-    template<typename U, typename D>
-        requires __::shared_ptr_compatible<T, U>
-    shared_ptr(unique_ptr<U, D>&& r)
-      : p(r.p), rc(__::shared_pointer<U, D>::make(move(r)))
-    {
-    }
-
-    ~shared_ptr()
-    {
-        if (rc)
-            rc->drop_shared();
-    }
-
-    shared_ptr& operator=(shared_ptr r) noexcept
-    {
-        swap(r);
-        return *this;
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    shared_ptr& operator=(shared_ptr<U> r) noexcept
-    {
-        shared_ptr<T>(move(r)).swap(*this);
-        return *this;
-    }
-
-    void reset() noexcept
-    {
-        shared_ptr().swap(*this);
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    void reset(U* const p2)
-    {
-        shared_ptr<T>(p2).swap(*this);
-    }
-
-    template<typename U, typename D>
-        requires __::shared_ptr_compatible<T, U>
-    void reset(U* const p2, D d)
-    {
-        shared_ptr<T>(p2, d).swap(*this);
-    }
-
-    void swap(shared_ptr& r) noexcept
-    {
-        using ctl::swap;
-        swap(p, r.p);
-        swap(rc, r.rc);
-    }
-
-    element_type* get() const noexcept
-    {
-        return p;
-    }
-
-    typename __::ptr_ref<T>::type operator*() const noexcept
-    {
-        if (!p)
-            __builtin_trap();
-        return *p;
-    }
-
-    T* operator->() const noexcept
-    {
-        if (!p)
-            __builtin_trap();
-        return p;
-    }
-
-    long use_count() const noexcept
-    {
-        return rc ? rc->use_count() : 0;
-    }
-
-    explicit operator bool() const noexcept
-    {
-        return p;
-    }
-
-    template<typename U>
-    bool owner_before(const shared_ptr<U>& r) const noexcept
-    {
-        return rc < r.rc;
-    }
-
-    template<typename U>
-    bool owner_before(const weak_ptr<U>& r) const noexcept
-    {
-        return rc < r.rc;
-    }
-
-  private:
-    template<typename U>
-    friend class weak_ptr;
-
-    template<typename U>
-    friend class shared_ptr;
-
-    template<typename U, typename... Args>
-    friend shared_ptr<U> make_shared(Args&&... args);
-
-    element_type* p = nullptr;
-    __::shared_ref* rc = nullptr;
-};
-
-template<typename T>
-class weak_ptr
-{
-  public:
-    using element_type = remove_extent_t<T>;
-
-    constexpr weak_ptr() noexcept = default;
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    weak_ptr(const shared_ptr<U>& r) noexcept : p(r.p), rc(r.rc)
-    {
-        if (rc)
-            rc->keep_weak();
-    }
-
-    weak_ptr(const weak_ptr& r) noexcept : p(r.p), rc(r.rc)
-    {
-        if (rc)
-            rc->keep_weak();
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    weak_ptr(const weak_ptr<U>& r) noexcept : p(r.p), rc(r.rc)
-    {
-        if (rc)
-            rc->keep_weak();
-    }
-
-    weak_ptr(weak_ptr&& r) noexcept : p(r.p), rc(r.rc)
-    {
-        r.p = nullptr;
-        r.rc = nullptr;
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    weak_ptr(weak_ptr<U>&& r) noexcept : p(r.p), rc(r.rc)
-    {
-        r.p = nullptr;
-        r.rc = nullptr;
-    }
-
-    ~weak_ptr()
-    {
-        if (rc)
-            rc->drop_weak();
-    }
-
-    long use_count() const noexcept
-    {
-        return rc ? rc->use_count() : 0;
-    }
-
-    bool expired() const noexcept
-    {
-        return !use_count();
-    }
-
-    void reset() noexcept
-    {
-        weak_ptr().swap(*this);
-    }
-
-    void swap(weak_ptr& r) noexcept
-    {
-        using ctl::swap;
-        swap(p, r.p);
-        swap(rc, r.rc);
-    }
-
-    weak_ptr& operator=(weak_ptr r) noexcept
-    {
-        swap(r);
-        return *this;
-    }
-
-    template<typename U>
-        requires __::shared_ptr_compatible<T, U>
-    weak_ptr& operator=(weak_ptr<U> r) noexcept
-    {
-        weak_ptr<T>(move(r)).swap(*this);
-    }
-
-    shared_ptr<T> lock() const noexcept
-    {
-        if (expired())
-            return nullptr;
-        shared_ptr<T> r;
-        r.p = p;
-        r.rc = rc;
-        if (rc)
-            rc->keep_shared();
-        return r;
-    }
-
-    template<typename U>
-    bool owner_before(const weak_ptr<U>& r) const noexcept
-    {
-        return rc < r.rc;
-    }
-
-    template<typename U>
-    bool owner_before(const shared_ptr<U>& r) const noexcept
-    {
-        return rc < r.rc;
-    }
-
-  private:
-    template<typename U>
-    friend class shared_ptr;
-
-    template<typename U, typename... Args>
-    friend shared_ptr<U> make_shared(Args&&...);
-
-    element_type* p = nullptr;
-    __::shared_ref* rc = nullptr;
-};
-
-template<typename T>
-class enable_shared_from_this
-{
-  public:
-    shared_ptr<T> shared_from_this()
-    {
-        return shared_ptr<T>(weak_this);
-    }
-    shared_ptr<T const> shared_from_this() const
-    {
-        return shared_ptr<T>(weak_this);
-    }
-
-    weak_ptr<T> weak_from_this()
-    {
-        return weak_this;
-    }
-    weak_ptr<T const> weak_from_this() const
-    {
-        return weak_this;
-    }
-
-  protected:
-    constexpr enable_shared_from_this() noexcept = default;
-    enable_shared_from_this(const enable_shared_from_this& r) noexcept
-    {
-    }
-    ~enable_shared_from_this() = default;
-
-    enable_shared_from_this& operator=(
-      const enable_shared_from_this& r) noexcept
-    {
-        return *this;
-    }
-
-  private:
-    template<typename U, typename... Args>
-    friend shared_ptr<U> make_shared(Args&&...);
-
-    template<typename U>
-    friend class shared_ptr;
-
-    weak_ptr<T> weak_this;
-};
-
-template<typename T>
-template<typename U, typename D>
-    requires __::shared_ptr_compatible<T, U>
-shared_ptr<T>::shared_ptr(U* const p, D d)
-  : p(p), rc(__::shared_pointer<U, D>::make(p, move(d)))
-{
-    if constexpr (is_base_of_v<enable_shared_from_this<U>, U>) {
-        p->weak_this = *this;
-    }
-}
-
-// Our make_shared supports passing a weak self reference as the first parameter
-// to your constructor, e.g.:
-//
-//     struct Tree : ctl::weak_self_base
-//     {
-//         ctl::shared_ptr<Tree> l, r;
-//         ctl::weak_ptr<Tree> parent;
-//         Tree(weak_ptr<Tree> const& self, auto&& l2, auto&& r2)
-//           : l(ctl::forward<decltype(l2)>(l2)),
-//             r(ctl::forward<decltype(r2)>(r2))
-//         {
-//             if (l) l->parent = self;
-//             if (r) r->parent = self;
-//         }
-//     };
-//
-//     int main() {
-//         auto t = ctl::make_shared<Tree>(
-//             ctl::make_shared<Tree>(nullptr, nullptr), nullptr);
-//         return t->l->parent.lock().get() == t.get() ? 0 : 1;
-//     }
-//
-// As shown, passing the parameter at object construction time lets you complete
-// object construction without needing a separate Init method. But because we go
-// off spec as far as the STL is concerned, there is a potential ambiguity where
-// you might have a constructor with a weak_ptr first parameter that is intended
-// to be something other than a self-reference. So this feature is opt-in by way
-// of inheriting from the following struct.
-struct weak_self_base
-{};
-
-template<typename T, typename... Args>
-shared_ptr<T>
-make_shared(Args&&... args)
-{
-    unique_ptr rc = __::shared_emplace<T>::make();
-    if constexpr (is_base_of_v<weak_self_base, T> &&
-                  is_constructible_v<T, const weak_ptr<T>&, Args...>) {
-        // A __::shared_ref has a virtual weak reference that is owned by all of
-        // the shared references. We can avoid some unnecessary refcount changes
-        // by "borrowing" that reference and passing it to the constructor, then
-        // promoting it to a shared reference by swapping it with the shared_ptr
-        // that we return.
-        weak_ptr<T> w;
-        w.p = &rc->t;
-        w.rc = rc.get();
-        try {
-            rc->construct(const_cast<const weak_ptr<T>&>(w),
-                          forward<Args>(args)...);
-        } catch (...) {
-            w.p = nullptr;
-            w.rc = nullptr;
-            throw;
-        }
-        rc.release();
-        shared_ptr<T> r;
-        swap(r.p, w.p);
-        swap(r.rc, w.rc);
-        return r;
-    } else {
-        rc->construct(forward<Args>(args)...);
-        shared_ptr<T> r;
-        r.p = &rc->t;
-        r.rc = rc.release();
-        if constexpr (is_base_of_v<enable_shared_from_this<T>, T>) {
-            r->weak_this = r;
-        }
-        return r;
-    }
-}
-
-} // namespace ctl
-
-#endif // CTL_SHARED_PTR_H_
diff --git a/ctl/string.cc b/ctl/string.cc
index 5e14220de..c30bf699d 100644
--- a/ctl/string.cc
+++ b/ctl/string.cc
@@ -383,72 +383,4 @@ string::erase(const size_t pos, size_t count) noexcept
     return *this;
 }
 
-void
-string::append(const ctl::string_view& s, size_t pos, size_t count) noexcept
-{
-    append(s.substr(pos, count));
-}
-
-size_t
-string::find_last_of(char c, size_t pos) const noexcept
-{
-    const char* b = data();
-    size_t n = size();
-    if (pos > n)
-        pos = n;
-    const char* p = (const char*)memrchr(b, c, pos);
-    return p ? p - b : npos;
-}
-
-size_t
-string::find_last_of(ctl::string_view set, size_t pos) const noexcept
-{
-    if (empty() || set.empty())
-        return npos;
-    bool lut[256] = {};
-    for (char c : set)
-        lut[c & 255] = true;
-    const char* b = data();
-    size_t last = size() - 1;
-    if (pos > last)
-        pos = last;
-    for (;;) {
-        if (lut[b[pos] & 255])
-            return pos;
-        if (!pos)
-            return npos;
-        --pos;
-    }
-}
-
-size_t
-string::find_first_of(char c, size_t pos) const noexcept
-{
-    size_t n = size();
-    if (pos >= n)
-        return npos;
-    const char* b = data();
-    const char* p = (const char*)memchr(b + pos, c, n - pos);
-    return p ? p - b : npos;
-}
-
-size_t
-string::find_first_of(ctl::string_view set, size_t pos) const noexcept
-{
-    if (set.empty())
-        return npos;
-    bool lut[256] = {};
-    for (char c : set)
-        lut[c & 255] = true;
-    const char* b = data();
-    size_t n = size();
-    for (;;) {
-        if (pos >= n)
-            return npos;
-        if (lut[b[pos] & 255])
-            return pos;
-        ++pos;
-    }
-}
-
 } // namespace ctl
diff --git a/ctl/string.h b/ctl/string.h
index e6e736eec..6c92d8f9f 100644
--- a/ctl/string.h
+++ b/ctl/string.h
@@ -125,7 +125,6 @@ class string
     void append(char, size_t) noexcept;
     void append(unsigned long) noexcept;
     void append(const void*, size_t) noexcept;
-    void append(const ctl::string_view&, size_t, size_t = npos) noexcept;
     string& insert(size_t, ctl::string_view) noexcept;
     string& erase(size_t = 0, size_t = npos) noexcept;
     string substr(size_t = 0, size_t = npos) const noexcept;
@@ -137,10 +136,6 @@ class string
     bool starts_with(ctl::string_view) const noexcept;
     size_t find(char, size_t = 0) const noexcept;
     size_t find(ctl::string_view, size_t = 0) const noexcept;
-    size_t find_first_of(char, size_t = 0) const noexcept;
-    size_t find_first_of(ctl::string_view, size_t = 0) const noexcept;
-    size_t find_last_of(char, size_t = npos) const noexcept;
-    size_t find_last_of(ctl::string_view, size_t = npos) const noexcept;
 
     void swap(string& s) noexcept
     {
@@ -307,7 +302,7 @@ class string
         append(ch);
     }
 
-    void append(const ctl::string_view& s) noexcept
+    void append(const ctl::string_view s) noexcept
     {
         append(s.p, s.n);
     }
diff --git a/ctl/string_view.cc b/ctl/string_view.cc
index 3dbadbe21..951f707a9 100644
--- a/ctl/string_view.cc
+++ b/ctl/string_view.cc
@@ -108,66 +108,4 @@ string_view::starts_with(const string_view s) const noexcept
     return !memcmp(p, s.p, s.n);
 }
 
-size_t
-string_view::find_last_of(char c, size_t pos) const noexcept
-{
-    const char* b = data();
-    size_t n = size();
-    if (pos > n)
-        pos = n;
-    const char* p = (const char*)memrchr(b, c, pos);
-    return p ? p - b : npos;
-}
-
-size_t
-string_view::find_last_of(ctl::string_view set, size_t pos) const noexcept
-{
-    if (empty() || set.empty())
-        return npos;
-    bool lut[256] = {};
-    for (char c : set)
-        lut[c & 255] = true;
-    const char* b = data();
-    size_t last = size() - 1;
-    if (pos > last)
-        pos = last;
-    for (;;) {
-        if (lut[b[pos] & 255])
-            return pos;
-        if (!pos)
-            return npos;
-        --pos;
-    }
-}
-
-size_t
-string_view::find_first_of(char c, size_t pos) const noexcept
-{
-    size_t n = size();
-    if (pos >= n)
-        return npos;
-    const char* b = data();
-    const char* p = (const char*)memchr(b + pos, c, n - pos);
-    return p ? p - b : npos;
-}
-
-size_t
-string_view::find_first_of(ctl::string_view set, size_t pos) const noexcept
-{
-    if (set.empty())
-        return npos;
-    bool lut[256] = {};
-    for (char c : set)
-        lut[c & 255] = true;
-    const char* b = data();
-    size_t n = size();
-    for (;;) {
-        if (pos >= n)
-            return npos;
-        if (lut[b[pos] & 255])
-            return pos;
-        ++pos;
-    }
-}
-
 } // namespace ctl
diff --git a/ctl/string_view.h b/ctl/string_view.h
index 9c5949f02..64a895799 100644
--- a/ctl/string_view.h
+++ b/ctl/string_view.h
@@ -45,10 +45,6 @@ struct string_view
     string_view substr(size_t = 0, size_t = npos) const noexcept;
     size_t find(char, size_t = 0) const noexcept;
     size_t find(string_view, size_t = 0) const noexcept;
-    size_t find_first_of(char, size_t = 0) const noexcept;
-    size_t find_first_of(ctl::string_view, size_t = 0) const noexcept;
-    size_t find_last_of(char, size_t = npos) const noexcept;
-    size_t find_last_of(ctl::string_view, size_t = npos) const noexcept;
 
     constexpr string_view& operator=(const string_view s) noexcept
     {
@@ -113,12 +109,12 @@ struct string_view
         return p[n - 1];
     }
 
-    constexpr const_iterator begin() const noexcept
+    constexpr const_iterator begin() noexcept
     {
         return p;
     }
 
-    constexpr const_iterator end() const noexcept
+    constexpr const_iterator end() noexcept
     {
         return p + n;
     }
diff --git a/dsp/BUILD.mk b/dsp/BUILD.mk
index ae2150196..87e655809 100644
--- a/dsp/BUILD.mk
+++ b/dsp/BUILD.mk
@@ -2,9 +2,7 @@
 #── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
 
 .PHONY:		o/$(MODE)/dsp
-o/$(MODE)/dsp:	o/$(MODE)/dsp/audio	\
-		o/$(MODE)/dsp/core	\
+o/$(MODE)/dsp:	o/$(MODE)/dsp/core	\
 		o/$(MODE)/dsp/mpeg	\
 		o/$(MODE)/dsp/scale	\
-		o/$(MODE)/dsp/prog	\
 		o/$(MODE)/dsp/tty
diff --git a/dsp/audio/BUILD.mk b/dsp/audio/BUILD.mk
deleted file mode 100644
index 8265a040c..000000000
--- a/dsp/audio/BUILD.mk
+++ /dev/null
@@ -1,56 +0,0 @@
-#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
-#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
-
-PKGS += DSP_AUDIO
-
-DSP_AUDIO_ARTIFACTS += DSP_AUDIO_A
-DSP_AUDIO = $(DSP_AUDIO_A_DEPS) $(DSP_AUDIO_A)
-DSP_AUDIO_A = o/$(MODE)/dsp/audio/audio.a
-DSP_AUDIO_A_FILES := $(wildcard dsp/audio/*)
-DSP_AUDIO_A_HDRS = $(filter %.h,$(DSP_AUDIO_A_FILES)) dsp/audio/cosmoaudio/cosmoaudio.h
-DSP_AUDIO_A_SRCS = $(filter %.c,$(DSP_AUDIO_A_FILES))
-
-DSP_AUDIO_A_DATA =				\
-	dsp/audio/cosmoaudio/miniaudio.h	\
-	dsp/audio/cosmoaudio/cosmoaudio.c	\
-	dsp/audio/cosmoaudio/cosmoaudio.h	\
-	dsp/audio/cosmoaudio/cosmoaudio.dll	\
-
-DSP_AUDIO_A_OBJS =				\
-	$(DSP_AUDIO_A_SRCS:%.c=o/$(MODE)/%.o)	\
-	$(DSP_AUDIO_A_DATA:%=o/$(MODE)/%.zip.o)	\
-
-DSP_AUDIO_A_CHECKS =				\
-	$(DSP_AUDIO_A).pkg			\
-	$(DSP_AUDIO_A_HDRS:%=o/$(MODE)/%.ok)
-
-DSP_AUDIO_A_DIRECTDEPS =			\
-	LIBC_CALLS				\
-	LIBC_DLOPEN				\
-	LIBC_INTRIN				\
-	LIBC_NEXGEN32E				\
-	LIBC_STR				\
-	LIBC_SYSV				\
-	LIBC_PROC				\
-	LIBC_THREAD				\
-
-DSP_AUDIO_A_DEPS :=				\
-	$(call uniq,$(foreach x,$(DSP_AUDIO_A_DIRECTDEPS),$($(x))))
-
-$(DSP_AUDIO_A):	dsp/audio/			\
-		$(DSP_AUDIO_A).pkg		\
-		$(DSP_AUDIO_A_OBJS)
-
-$(DSP_AUDIO_A).pkg:				\
-		$(DSP_AUDIO_A_OBJS)		\
-		$(foreach x,$(DSP_AUDIO_A_DIRECTDEPS),$($(x)_A).pkg)
-
-DSP_AUDIO_LIBS = $(foreach x,$(DSP_AUDIO_ARTIFACTS),$($(x)))
-DSP_AUDIO_SRCS = $(foreach x,$(DSP_AUDIO_ARTIFACTS),$($(x)_SRCS))
-DSP_AUDIO_HDRS = $(foreach x,$(DSP_AUDIO_ARTIFACTS),$($(x)_HDRS))
-DSP_AUDIO_CHECKS = $(foreach x,$(DSP_AUDIO_ARTIFACTS),$($(x)_CHECKS))
-DSP_AUDIO_OBJS = $(foreach x,$(DSP_AUDIO_ARTIFACTS),$($(x)_OBJS))
-$(DSP_AUDIO_OBJS): $(BUILD_FILES) dsp/audio/BUILD.mk
-
-.PHONY: o/$(MODE)/dsp/audio
-o/$(MODE)/dsp/audio: $(DSP_AUDIO_CHECKS)
diff --git a/dsp/audio/audio.c b/dsp/audio/audio.c
deleted file mode 100644
index f05a6a3b6..000000000
--- a/dsp/audio/audio.c
+++ /dev/null
@@ -1,358 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "dsp/audio/cosmoaudio/cosmoaudio.h"
-#include "dsp/audio/describe.h"
-#include "libc/calls/blockcancel.internal.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/struct/stat.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/dce.h"
-#include "libc/dlopen/dlfcn.h"
-#include "libc/errno.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/strace.h"
-#include "libc/limits.h"
-#include "libc/macros.h"
-#include "libc/proc/posix_spawn.h"
-#include "libc/runtime/runtime.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/temp.h"
-#include "libc/thread/thread.h"
-
-#define COSMOAUDIO_MINIMUM_VERISON 1
-
-#define COSMOAUDIO_DSO_NAME "cosmoaudio." STRINGIFY(COSMOAUDIO_MINIMUM_VERISON)
-
-__static_yoink("dsp/audio/cosmoaudio/miniaudio.h");
-__static_yoink("dsp/audio/cosmoaudio/cosmoaudio.h");
-__static_yoink("dsp/audio/cosmoaudio/cosmoaudio.c");
-__static_yoink("dsp/audio/cosmoaudio/cosmoaudio.dll");
-
-static const struct Source {
-  const char *zip;
-  const char *name;
-} srcs[] = {
-    {"/zip/dsp/audio/cosmoaudio/miniaudio.h", "miniaudio.h"},
-    {"/zip/dsp/audio/cosmoaudio/cosmoaudio.h", "cosmoaudio.h"},
-    {"/zip/dsp/audio/cosmoaudio/cosmoaudio.c", "cosmoaudio.c"},  // must last
-};
-
-static struct {
-  pthread_once_t once;
-  typeof(cosmoaudio_open) *open;
-  typeof(cosmoaudio_close) *close;
-  typeof(cosmoaudio_write) *write;
-  typeof(cosmoaudio_flush) *flush;
-  typeof(cosmoaudio_read) *read;
-  typeof(cosmoaudio_poll) *poll;
-} g_audio;
-
-static const char *cosmoaudio_tmp_dir(void) {
-  const char *tmpdir;
-  if (!(tmpdir = getenv("TMPDIR")) || !*tmpdir)
-    if (!(tmpdir = getenv("HOME")) || !*tmpdir)
-      tmpdir = ".";
-  return tmpdir;
-}
-
-static bool cosmoaudio_app_dir(char *path, size_t size) {
-  strlcpy(path, cosmoaudio_tmp_dir(), size);
-  strlcat(path, "/.cosmo/", size);
-  if (makedirs(path, 0755))
-    return false;
-  return true;
-}
-
-static bool cosmoaudio_dso_path(char *path, size_t size) {
-  if (!cosmoaudio_app_dir(path, size))
-    return false;
-  strlcat(path, COSMOAUDIO_DSO_NAME, size);
-  if (IsWindows()) {
-    strlcat(path, ".dll", size);
-  } else if (IsXnu()) {
-    strlcat(path, ".dylib", size);
-  } else {
-    strlcat(path, ".so", size);
-  }
-  return true;
-}
-
-static bool cosmoaudio_extract(const char *zip, const char *to) {
-  int fdin, fdout;
-  char stage[PATH_MAX];
-  strlcpy(stage, to, sizeof(stage));
-  if (strlcat(stage, ".XXXXXX", sizeof(stage)) >= sizeof(stage)) {
-    errno = ENAMETOOLONG;
-    return false;
-  }
-  if ((fdout = mkostemp(stage, O_CLOEXEC)) == -1)
-    return false;
-  if ((fdin = open(zip, O_RDONLY | O_CLOEXEC)) == -1) {
-    close(fdout);
-    unlink(stage);
-    return false;
-  }
-  if (copyfd(fdin, fdout, -1) == -1) {
-    close(fdin);
-    close(fdout);
-    unlink(stage);
-    return false;
-  }
-  if (close(fdout)) {
-    close(fdin);
-    unlink(stage);
-    return false;
-  }
-  if (close(fdin)) {
-    unlink(stage);
-    return false;
-  }
-  if (rename(stage, to)) {
-    unlink(stage);
-    return false;
-  }
-  return true;
-}
-
-static bool cosmoaudio_build(const char *dso) {
-
-  // extract sauce
-  char src[PATH_MAX];
-  for (int i = 0; i < sizeof(srcs) / sizeof(*srcs); ++i) {
-    if (!cosmoaudio_app_dir(src, PATH_MAX))
-      return false;
-    strlcat(src, srcs[i].name, sizeof(src));
-    if (!cosmoaudio_extract(srcs[i].zip, src))
-      return false;
-  }
-
-  // create temporary name for compiled dso
-  // it'll ensure build operation is atomic
-  int fd;
-  char tmpdso[PATH_MAX];
-  strlcpy(tmpdso, dso, sizeof(tmpdso));
-  strlcat(tmpdso, ".XXXXXX", sizeof(tmpdso));
-  if ((fd = mkostemp(tmpdso, O_CLOEXEC)) != -1) {
-    close(fd);
-  } else {
-    return false;
-  }
-
-  // build cosmoaudio with host c compiler
-  char *args[] = {
-      "cc",                                       //
-      "-w",                                       //
-      "-I.",                                      //
-      "-O2",                                      //
-      "-fPIC",                                    //
-      "-shared",                                  //
-      "-pthread",                                 //
-      "-DNDEBUG",                                 //
-      IsAarch64() ? "-ffixed-x28" : "-DIGNORE1",  //
-      src,                                        //
-      "-o",                                       //
-      tmpdso,                                     //
-      "-lm",                                      //
-      IsNetbsd() ? 0 : "-ldl",                    //
-      NULL,
-  };
-  int pid, ws;
-  errno_t err = posix_spawnp(&pid, args[0], NULL, NULL, args, environ);
-  if (err)
-    return false;
-  while (waitpid(pid, &ws, 0) == -1)
-    if (errno != EINTR)
-      return false;
-  if (ws)
-    return false;
-
-  // move dso to its final destination
-  if (rename(tmpdso, dso))
-    return false;
-
-  return true;
-}
-
-static void *cosmoaudio_dlopen(const char *name) {
-  void *handle;
-  if ((handle = cosmo_dlopen(name, RTLD_NOW))) {
-    typeof(cosmoaudio_version) *version;
-    if ((version = cosmo_dlsym(handle, "cosmoaudio_version")))
-      if (version() >= COSMOAUDIO_MINIMUM_VERISON)
-        return handle;
-    cosmo_dlclose(handle);
-  }
-  return 0;
-}
-
-static void cosmoaudio_setup_impl(void) {
-  void *handle;
-  if (IsOpenbsd())
-    return;  // no dlopen support yet
-  if (IsXnu() && !IsXnuSilicon())
-    return;  // no dlopen support yet
-  if (!(handle = cosmoaudio_dlopen(COSMOAUDIO_DSO_NAME ".so")) &&
-      !(handle = cosmoaudio_dlopen("lib" COSMOAUDIO_DSO_NAME ".so")) &&
-      !(handle = cosmoaudio_dlopen("cosmoaudio.so")) &&
-      !(handle = cosmoaudio_dlopen("libcosmoaudio.so"))) {
-    char dso[PATH_MAX];
-    if (!cosmoaudio_dso_path(dso, sizeof(dso)))
-      return;
-    if ((handle = cosmoaudio_dlopen(dso)))
-      goto WeAreGood;
-    if (IsWindows()) {
-      if (cosmoaudio_extract("/zip/dsp/audio/cosmoaudio/cosmoaudio.dll", dso)) {
-        if ((handle = cosmoaudio_dlopen(dso))) {
-          goto WeAreGood;
-        } else {
-          return;
-        }
-      }
-    }
-    if (!cosmoaudio_build(dso))
-      return;
-    if (!(handle = cosmoaudio_dlopen(dso)))
-      return;
-  }
-WeAreGood:
-  g_audio.open = cosmo_dlsym(handle, "cosmoaudio_open");
-  g_audio.close = cosmo_dlsym(handle, "cosmoaudio_close");
-  g_audio.write = cosmo_dlsym(handle, "cosmoaudio_write");
-  g_audio.flush = cosmo_dlsym(handle, "cosmoaudio_flush");
-  g_audio.read = cosmo_dlsym(handle, "cosmoaudio_read");
-  g_audio.poll = cosmo_dlsym(handle, "cosmoaudio_poll");
-}
-
-static void cosmoaudio_setup(void) {
-  BLOCK_CANCELATION;
-  cosmoaudio_setup_impl();
-  ALLOW_CANCELATION;
-}
-
-static void cosmoaudio_init(void) {
-  pthread_once(&g_audio.once, cosmoaudio_setup);
-}
-
-COSMOAUDIO_ABI int cosmoaudio_open(
-    struct CosmoAudio **out_ca, const struct CosmoAudioOpenOptions *options) {
-  int status;
-  char sbuf[32];
-  char dbuf[256];
-  cosmoaudio_init();
-  if (g_audio.open) {
-    BLOCK_SIGNALS;
-    status = g_audio.open(out_ca, options);
-    ALLOW_SIGNALS;
-  } else {
-    status = COSMOAUDIO_ELINK;
-  }
-  STRACE("cosmoaudio_open([%p], %s) → %s",
-         out_ca ? *out_ca : (struct CosmoAudio *)-1,
-         cosmoaudio_describe_open_options(dbuf, sizeof(dbuf), options),
-         cosmoaudio_describe_status(sbuf, sizeof(sbuf), status));
-  return status;
-}
-
-COSMOAUDIO_ABI int cosmoaudio_close(struct CosmoAudio *ca) {
-  int status;
-  char sbuf[32];
-  if (g_audio.close) {
-    BLOCK_SIGNALS;
-    status = g_audio.close(ca);
-    ALLOW_SIGNALS;
-  } else {
-    status = COSMOAUDIO_ELINK;
-  }
-  STRACE("cosmoaudio_close(%p) → %s", ca,
-         cosmoaudio_describe_status(sbuf, sizeof(sbuf), status));
-  return status;
-}
-
-COSMOAUDIO_ABI int cosmoaudio_write(struct CosmoAudio *ca, const float *data,
-                                    int frames) {
-  int status;
-  char sbuf[32];
-  if (g_audio.write) {
-    BLOCK_SIGNALS;
-    status = g_audio.write(ca, data, frames);
-    ALLOW_SIGNALS;
-  } else {
-    status = COSMOAUDIO_ELINK;
-  }
-  if (frames <= 0 || frames >= 160)
-    DATATRACE("cosmoaudio_write(%p, %p, %d) → %s", ca, data, frames,
-              cosmoaudio_describe_status(sbuf, sizeof(sbuf), status));
-  return status;
-}
-
-COSMOAUDIO_ABI int cosmoaudio_read(struct CosmoAudio *ca, float *data,
-                                   int frames) {
-  int status;
-  char sbuf[32];
-  if (g_audio.read) {
-    BLOCK_SIGNALS;
-    status = g_audio.read(ca, data, frames);
-    ALLOW_SIGNALS;
-  } else {
-    status = COSMOAUDIO_ELINK;
-  }
-  if (frames <= 0 || frames >= 160)
-    DATATRACE("cosmoaudio_read(%p, %p, %d) → %s", ca, data, frames,
-              cosmoaudio_describe_status(sbuf, sizeof(sbuf), status));
-  return status;
-}
-
-COSMOAUDIO_ABI int cosmoaudio_flush(struct CosmoAudio *ca) {
-  int status;
-  char sbuf[32];
-  if (g_audio.flush) {
-    BLOCK_SIGNALS;
-    status = g_audio.flush(ca);
-    ALLOW_SIGNALS;
-  } else {
-    status = COSMOAUDIO_ELINK;
-  }
-  DATATRACE("cosmoaudio_flush(%p) → %s", ca,
-            cosmoaudio_describe_status(sbuf, sizeof(sbuf), status));
-  return status;
-}
-
-COSMOAUDIO_ABI int cosmoaudio_poll(struct CosmoAudio *ca,
-                                   int *in_out_readFrames,
-                                   int *in_out_writeFrames) {
-  int status;
-  char sbuf[32];
-  char fbuf[2][20];
-  if (g_audio.poll) {
-    BLOCK_SIGNALS;
-    status = g_audio.poll(ca, in_out_readFrames, in_out_writeFrames);
-    ALLOW_SIGNALS;
-  } else {
-    status = COSMOAUDIO_ELINK;
-  }
-  DATATRACE("cosmoaudio_poll(%p, %s, %s) → %s", ca,
-            cosmoaudio_describe_poll_frames(fbuf[0], sizeof(fbuf[0]),
-                                            in_out_readFrames),
-            cosmoaudio_describe_poll_frames(fbuf[1], sizeof(fbuf[1]),
-                                            in_out_writeFrames),
-            cosmoaudio_describe_status(sbuf, sizeof(sbuf), status));
-  return status;
-}
diff --git a/dsp/audio/cosmoaudio/.gitignore b/dsp/audio/cosmoaudio/.gitignore
deleted file mode 100644
index 87bb5b389..000000000
--- a/dsp/audio/cosmoaudio/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*.o
-/Debug
-/Release
diff --git a/dsp/audio/cosmoaudio/Makefile.msvc b/dsp/audio/cosmoaudio/Makefile.msvc
deleted file mode 100644
index 0165c5d16..000000000
--- a/dsp/audio/cosmoaudio/Makefile.msvc
+++ /dev/null
@@ -1,87 +0,0 @@
-# Makefile for MSVC x64 Command Line Developer Tools
-#
-#     nmake /f Makefile.msvc check
-#     nmake /f Makefile.msvc MODE=debug check
-#
-# Note: MSVC 2019 makes the DLL 64kb smaller than MSVC 2022.
-
-# Compiler and linker
-CC=cl
-LINK=link
-
-# Build mode (can be overridden from command line)
-!IFNDEF MODE
-MODE=release
-!ENDIF
-
-# Library dependencies.
-TEST_LIBS=OneCore.lib
-
-# Compiler flags
-CFLAGS_COMMON=/nologo /W4 /Gy /EHsc
-CFLAGS_DEBUG=/Od /Zi /MDd /D_DEBUG
-CFLAGS_RELEASE=/O2 /MT /DNDEBUG
-
-!IF "$(MODE)"=="debug"
-CFLAGS=$(CFLAGS_COMMON) $(CFLAGS_DEBUG)
-LDFLAGS=/DEBUG
-OUT_DIR=Debug
-!ELSE
-CFLAGS=$(CFLAGS_COMMON) $(CFLAGS_RELEASE) /GL
-LDFLAGS=/RELEASE /OPT:REF /OPT:ICF /LTCG /INCREMENTAL:NO
-OUT_DIR=Release
-!ENDIF
-
-# Additional flags for DLL
-DLL_CFLAGS=$(CFLAGS) /D_USRDLL /D_WINDLL
-
-# Linker flags
-LDFLAGS=/NOLOGO /SUBSYSTEM:CONSOLE $(LDFLAGS)
-
-# Output file names
-DLL_TARGET=$(OUT_DIR)\cosmoaudio.dll
-TEST_TARGET=$(OUT_DIR)\test.exe
-
-# Source files
-DLL_SOURCES=cosmoaudio.c
-TEST_SOURCES=test.c
-
-# Object files
-DLL_OBJECTS=$(OUT_DIR)\cosmoaudio.obj
-TEST_OBJECTS=$(OUT_DIR)\test.obj
-
-# Default target
-all: $(OUT_DIR) $(DLL_TARGET) $(TEST_TARGET)
-
-# Create output directory
-$(OUT_DIR):
-    if not exist $(OUT_DIR) mkdir $(OUT_DIR)
-
-# Rule to build the DLL
-$(DLL_TARGET): $(OUT_DIR) $(DLL_OBJECTS)
-    $(LINK) /DLL $(LDFLAGS) /OUT:$(DLL_TARGET) $(DLL_OBJECTS)
-
-# Rule to build the test program
-$(TEST_TARGET): $(OUT_DIR) $(TEST_OBJECTS) $(DLL_TARGET)
-    $(LINK) $(LDFLAGS) /OUT:$(TEST_TARGET) $(TEST_OBJECTS) $(DLL_TARGET:.dll=.lib) $(TEST_LIBS)
-
-# Rules to compile .c files to .obj files with header dependencies
-{.}.c{$(OUT_DIR)}.obj:
-    $(CC) $(DLL_CFLAGS) /c /Fo$(OUT_DIR)\ $<
-
-$(OUT_DIR)\test.obj: $(OUT_DIR) test.c cosmoaudio.h
-    $(CC) $(CFLAGS) /c /Fo$(OUT_DIR)\ test.c
-
-$(OUT_DIR)\cosmoaudio.obj: $(OUT_DIR) cosmoaudio.c miniaudio.h cosmoaudio.h
-    $(CC) $(DLL_CFLAGS) /c /Fo$(OUT_DIR)\ cosmoaudio.c
-
-# Clean target
-clean:
-    if exist $(OUT_DIR) rmdir /s /q $(OUT_DIR)
-
-# Run tests (now called 'check')
-check: $(TEST_TARGET)
-    $(TEST_TARGET)
-
-# Phony targets
-.PHONY: all clean check
diff --git a/dsp/audio/cosmoaudio/cosmoaudio.c b/dsp/audio/cosmoaudio/cosmoaudio.c
deleted file mode 100644
index e518a4852..000000000
--- a/dsp/audio/cosmoaudio/cosmoaudio.c
+++ /dev/null
@@ -1,519 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#define COSMOAUDIO_BUILD
-#include "cosmoaudio.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define MA_DEBUG_OUTPUT
-#define MA_DR_MP3_NO_STDIO
-#define MA_NO_DECODING
-#define MA_NO_ENCODING
-#define MA_NO_ENGINE
-#define MA_NO_GENERATION
-#define MA_NO_NODE_GRAPH
-#define MA_NO_RESOURCE_MANAGER
-#define MA_STATIC
-
-#define MINIAUDIO_IMPLEMENTATION
-#include "miniaudio.h"
-
-struct CosmoAudio {
-  enum CosmoAudioDeviceType deviceType;
-  ma_uint32 outputBufferFrames;
-  ma_uint32 inputBufferFrames;
-  int sampleRate;
-  int channels;
-  int isLeft;
-  ma_context context;
-  ma_device device;
-  ma_pcm_rb output;
-  ma_pcm_rb input;
-  ma_event event;
-  ma_log log;
-};
-
-static int read_ring_buffer(ma_log* log, ma_pcm_rb* rb, float* pOutput,
-                            ma_uint32 frameCount, ma_uint32 channels) {
-  ma_result result;
-  ma_uint32 framesRead;
-  ma_uint32 framesToRead;
-  for (framesRead = 0; framesRead < frameCount; framesRead += framesToRead) {
-    framesToRead = frameCount - framesRead;
-    void* pMappedBuffer;
-    result = ma_pcm_rb_acquire_read(rb, &framesToRead, &pMappedBuffer);
-    if (result != MA_SUCCESS) {
-      ma_log_postf(log, MA_LOG_LEVEL_WARNING,
-                   "ma_pcm_rb_acquire_read failed: %s\n",
-                   ma_result_description(result));
-      return COSMOAUDIO_ERROR;
-    }
-    if (!framesToRead)
-      break;
-    memcpy(pOutput + framesRead * channels, pMappedBuffer,
-           framesToRead * channels * sizeof(float));
-    result = ma_pcm_rb_commit_read(rb, framesToRead);
-    if (result != MA_SUCCESS) {
-      if (result == MA_AT_END) {
-        framesRead += framesToRead;
-        break;
-      }
-      ma_log_postf(log, MA_LOG_LEVEL_WARNING,
-                   "ma_pcm_rb_commit_read failed: %s\n",
-                   ma_result_description(result));
-      return COSMOAUDIO_ERROR;
-    }
-  }
-  return framesRead;
-}
-
-static int write_ring_buffer(ma_log* log, ma_pcm_rb* rb, const float* pInput,
-                             ma_uint32 frameCount, ma_uint32 channels) {
-  ma_result result;
-  ma_uint32 framesWritten;
-  ma_uint32 framesToWrite;
-  for (framesWritten = 0; framesWritten < frameCount;
-       framesWritten += framesToWrite) {
-    framesToWrite = frameCount - framesWritten;
-    void* pMappedBuffer;
-    result = ma_pcm_rb_acquire_write(rb, &framesToWrite, &pMappedBuffer);
-    if (result != MA_SUCCESS) {
-      ma_log_postf(log, MA_LOG_LEVEL_WARNING,
-                   "ma_pcm_rb_acquire_write failed: %s\n",
-                   ma_result_description(result));
-      return COSMOAUDIO_ERROR;
-    }
-    if (!framesToWrite)
-      break;
-    memcpy(pMappedBuffer, pInput + framesWritten * channels,
-           framesToWrite * channels * sizeof(float));
-    result = ma_pcm_rb_commit_write(rb, framesToWrite);
-    if (result != MA_SUCCESS) {
-      if (result == MA_AT_END) {
-        framesWritten += framesToWrite;
-        break;
-      }
-      ma_log_postf(log, MA_LOG_LEVEL_WARNING,
-                   "ma_pcm_rb_commit_write failed: %s\n",
-                   ma_result_description(result));
-      return COSMOAUDIO_ERROR;
-    }
-  }
-  return framesWritten;
-}
-
-static void data_callback_f32(ma_device* pDevice, float* pOutput,
-                              const float* pInput, ma_uint32 frameCount) {
-  struct CosmoAudio* ca = (struct CosmoAudio*)pDevice->pUserData;
-  if (ca->deviceType & kCosmoAudioDeviceTypePlayback) {
-    //
-    // "By default, miniaudio will pre-silence the data callback's
-    //  output buffer. If you know that you will always write valid data
-    //  to the output buffer you can disable pre-silencing by setting
-    //  the noPreSilence config option in the device config to true."
-    //
-    //          —Quoth miniaudio documentation § 16.1. Low Level API
-    //
-    if (ca->isLeft) {
-      int framesCopied = read_ring_buffer(&ca->log, &ca->output, pOutput,
-                                          frameCount, ca->channels);
-      if (framesCopied < (int)frameCount)
-        ca->isLeft = 0;
-    } else {
-      // TODO(jart): Maybe we should stretch the audio too short?
-      int frameOffset;
-      int availableFrames = ma_pcm_rb_available_read(&ca->output);
-      if (availableFrames >= (int)frameCount) {
-        frameOffset = 0;
-      } else {
-        frameOffset = frameCount - availableFrames;
-        frameCount = availableFrames;
-      }
-      read_ring_buffer(&ca->log, &ca->output,
-                       pOutput + frameOffset * ca->channels, frameCount,
-                       ca->channels);
-      ca->isLeft = 1;
-    }
-  }
-  if (ca->deviceType & kCosmoAudioDeviceTypeCapture)
-    write_ring_buffer(&ca->log, &ca->input, pInput, frameCount, ca->channels);
-  ma_event_signal(&ca->event);
-}
-
-static void data_callback(ma_device* pDevice, void* pOutput, const void* pInput,
-                          ma_uint32 frameCount) {
-  data_callback_f32(pDevice, (float*)pOutput, (const float*)pInput, frameCount);
-}
-
-/**
- * Returns current version of cosmo audio library.
- */
-COSMOAUDIO_ABI int cosmoaudio_version(void) {
-  return 1;
-}
-
-/**
- * Opens access to speaker and microphone.
- *
- * @param out_ca will receive pointer to allocated CosmoAudio object,
- *     which must be freed by caller with cosmoaudio_close(); if this
- *     function fails, then this will receive a NULL pointer value so
- *     that cosmoaudio_close(), cosmoaudio_write() etc. can be called
- *     without crashing if no error checking is performed
- * @return 0 on success, or negative error code on failure
- */
-COSMOAUDIO_ABI int cosmoaudio_open(  //
-    struct CosmoAudio** out_ca,      //
-    const struct CosmoAudioOpenOptions* options) {
-
-  // Validate arguments.
-  if (!out_ca)
-    return COSMOAUDIO_EINVAL;
-  *out_ca = NULL;
-  if (!options)
-    return COSMOAUDIO_EINVAL;
-  if (options->sizeofThis < (int)sizeof(struct CosmoAudioOpenOptions))
-    return COSMOAUDIO_EINVAL;
-  if (options->bufferFrames < 0)
-    return COSMOAUDIO_EINVAL;
-  if (options->sampleRate < 8000)
-    return COSMOAUDIO_EINVAL;
-  if (options->channels < 1)
-    return COSMOAUDIO_EINVAL;
-  if (!options->deviceType)
-    return COSMOAUDIO_EINVAL;
-  if (options->deviceType &
-      ~(kCosmoAudioDeviceTypePlayback | kCosmoAudioDeviceTypeCapture))
-    return COSMOAUDIO_EINVAL;
-
-  // Allocate cosmo audio object.
-  struct CosmoAudio* ca;
-  ca = (struct CosmoAudio*)calloc(1, sizeof(struct CosmoAudio));
-  if (!ca)
-    return COSMOAUDIO_ERROR;
-  ca->channels = options->channels;
-  ca->sampleRate = options->sampleRate;
-  ca->deviceType = options->deviceType;
-
-  // Create win32-style condition variable.
-  if (ma_event_init(&ca->event) != MA_SUCCESS) {
-    free(ca);
-    return COSMOAUDIO_ERROR;
-  }
-
-  // Create audio log.
-  if (ma_log_init(NULL, &ca->log) != MA_SUCCESS) {
-    ma_event_uninit(&ca->event);
-    free(ca);
-    return COSMOAUDIO_ERROR;
-  }
-  if (!options->debugLog)
-    ca->log.callbackCount = 0;
-
-  // Create audio context.
-  ma_context_config contextConfig = ma_context_config_init();
-  contextConfig.pLog = &ca->log;
-  if (ma_context_init(NULL, 0, &contextConfig, &ca->context) != MA_SUCCESS) {
-    ma_event_uninit(&ca->event);
-    ma_log_uninit(&ca->log);
-    free(ca);
-    return COSMOAUDIO_ERROR;
-  }
-
-  // Initialize device.
-  ma_result result;
-  ma_device_config deviceConfig;
-  deviceConfig = ma_device_config_init(ca->deviceType);
-  deviceConfig.sampleRate = ca->sampleRate;
-  if (ca->deviceType & kCosmoAudioDeviceTypeCapture) {
-    deviceConfig.capture.channels = ca->channels;
-    deviceConfig.capture.format = ma_format_f32;
-    deviceConfig.capture.shareMode = ma_share_mode_shared;
-  }
-  if (ca->deviceType & kCosmoAudioDeviceTypePlayback) {
-    deviceConfig.playback.channels = ca->channels;
-    deviceConfig.playback.format = ma_format_f32;
-  }
-  deviceConfig.dataCallback = data_callback;
-  deviceConfig.pUserData = ca;
-  result = ma_device_init(&ca->context, &deviceConfig, &ca->device);
-  if (result != MA_SUCCESS) {
-    ma_context_uninit(&ca->context);
-    ma_event_uninit(&ca->event);
-    ma_log_uninit(&ca->log);
-    free(ca);
-    return COSMOAUDIO_ERROR;
-  }
-
-  // Initialize the speaker ring buffer.
-  int period = ca->device.playback.internalPeriodSizeInFrames;
-  if (!options->bufferFrames) {
-    ca->outputBufferFrames = period * 10;
-  } else if (options->bufferFrames < period * 2) {
-    ca->outputBufferFrames = period * 2;
-  } else {
-    ca->outputBufferFrames = options->bufferFrames;
-  }
-  if (ca->deviceType & kCosmoAudioDeviceTypePlayback) {
-    result = ma_pcm_rb_init(ma_format_f32, ca->channels, ca->outputBufferFrames,
-                            NULL, NULL, &ca->output);
-    if (result != MA_SUCCESS) {
-      ma_device_uninit(&ca->device);
-      ma_context_uninit(&ca->context);
-      ma_event_uninit(&ca->event);
-      ma_log_uninit(&ca->log);
-      free(ca);
-      return COSMOAUDIO_ERROR;
-    }
-    ma_pcm_rb_set_sample_rate(&ca->output, ca->sampleRate);
-  }
-
-  // Initialize the microphone ring buffer.
-  period = ca->device.capture.internalPeriodSizeInFrames;
-  if (!options->bufferFrames) {
-    ca->inputBufferFrames = period * 10;
-  } else if (options->bufferFrames < period * 2) {
-    ca->inputBufferFrames = period * 2;
-  } else {
-    ca->inputBufferFrames = options->bufferFrames;
-  }
-  if (ca->deviceType & kCosmoAudioDeviceTypeCapture) {
-    result = ma_pcm_rb_init(ma_format_f32, ca->channels, ca->inputBufferFrames,
-                            NULL, NULL, &ca->input);
-    if (result != MA_SUCCESS) {
-      ma_device_uninit(&ca->device);
-      if (ca->deviceType & kCosmoAudioDeviceTypePlayback)
-        ma_pcm_rb_uninit(&ca->output);
-      ma_context_uninit(&ca->context);
-      ma_event_uninit(&ca->event);
-      ma_log_uninit(&ca->log);
-      free(ca);
-      return COSMOAUDIO_ERROR;
-    }
-    ma_pcm_rb_set_sample_rate(&ca->output, ca->sampleRate);
-  }
-
-  // Start audio playback.
-  if (ma_device_start(&ca->device) != MA_SUCCESS) {
-    ma_device_uninit(&ca->device);
-    if (ca->deviceType & kCosmoAudioDeviceTypePlayback)
-      ma_pcm_rb_uninit(&ca->output);
-    if (ca->deviceType & kCosmoAudioDeviceTypeCapture)
-      ma_pcm_rb_uninit(&ca->input);
-    ma_context_uninit(&ca->context);
-    ma_event_uninit(&ca->event);
-    ma_log_uninit(&ca->log);
-    free(ca);
-    return COSMOAUDIO_ERROR;
-  }
-
-  *out_ca = ca;
-  return COSMOAUDIO_SUCCESS;
-}
-
-/**
- * Closes audio device and frees all associated resources.
- *
- * This function is non-blocking and will drop buffered audio. In
- * playback mode, you need to call cosmoaudio_flush() to ensure data
- * supplied by cosmoaudio_write() gets played on your speaker.
- *
- * Calling this function twice on the same object will result in
- * undefined behavior. Even if this function fails, the `ca` will be
- * freed to the greatest extent possible.
- *
- * @param ca is CosmoAudio object returned earlier by cosmoaudio_open()
- * @return 0 on success, or negative error code on failure
- */
-COSMOAUDIO_ABI int cosmoaudio_close(struct CosmoAudio* ca) {
-  if (!ca)
-    return COSMOAUDIO_EINVAL;
-  ma_device_uninit(&ca->device);  // do this first
-  if (ca->deviceType & kCosmoAudioDeviceTypePlayback)
-    ma_pcm_rb_uninit(&ca->output);
-  if (ca->deviceType & kCosmoAudioDeviceTypeCapture)
-    ma_pcm_rb_uninit(&ca->input);
-  ma_context_uninit(&ca->context);
-  ma_event_uninit(&ca->event);
-  ma_log_uninit(&ca->log);
-  free(ca);
-  return COSMOAUDIO_SUCCESS;
-}
-
-/**
- * Writes raw audio data to speaker.
- *
- * The data is written to a ring buffer in real-time, which is then
- * played back very soon on the audio device. This has tolerence for
- * a certain amount of buffering, but expects that this function is
- * repeatedly called at a regular time interval. The caller should
- * have its own sleep loop for this purpose.
- *
- * This function never blocks. Programs that don't have their own timer
- * can use cosmoaudio_poll() to wait until audio may be written.
- *
- * For any given CosmoAudio object, it's assumed that only a single
- * thread will call this function.
- *
- * @param ca is CosmoAudio object returned earlier by cosmoaudio_open()
- * @param data is pointer to raw audio samples, expected to be in the range
- *     -1.0 to 1.0, where channels are interleaved
- * @param frames is the number of frames (i.e. number of samples divided by
- *     number of channels) from `data` to write to audio device
- * @return number of frames written, or negative error code on failure
- */
-COSMOAUDIO_ABI int cosmoaudio_write(struct CosmoAudio* ca, const float* data,
-                                    int frames) {
-  if (!ca)
-    return COSMOAUDIO_EINVAL;
-  if (frames < 0)
-    return COSMOAUDIO_EINVAL;
-  if (!(ca->deviceType & kCosmoAudioDeviceTypePlayback))
-    return COSMOAUDIO_EINVAL;
-  if (1u + frames > ca->outputBufferFrames)
-    return COSMOAUDIO_ENOBUF;
-  if (!frames)
-    return 0;
-  if (!data)
-    return COSMOAUDIO_EINVAL;
-  return write_ring_buffer(&ca->log, &ca->output, data, frames, ca->channels);
-}
-
-/**
- * Reads raw audio data from microphone.
- *
- * The data is read from a ring buffer in real-time, which is then
- * played back on the audio device. This has tolerence for a certain
- * amount of buffering (based on the `bufferFrames` parameter passed to
- * cosmoaudio_open(), which by default assumes this function will be
- * called at at a regular time interval.
- *
- * This function never blocks. Programs that don't have their own timer
- * can use cosmoaudio_poll() to wait until audio may be read.
- *
- * For any given CosmoAudio object, it's assumed that only a single
- * thread will call this function.
- *
- * @param ca is CosmoAudio object returned earlier by cosmoaudio_open()
- * @param data is pointer to raw audio samples, expected to be in the range
- *     -1.0 to 1.0, where channels are interleaved
- * @param frames is the number of frames (i.e. number of samples divided by
- *     number of channels) from `data` to read from microphone
- * @return number of frames read, or negative error code on failure
- */
-COSMOAUDIO_ABI int cosmoaudio_read(struct CosmoAudio* ca, float* data,
-                                   int frames) {
-  if (!ca)
-    return COSMOAUDIO_EINVAL;
-  if (frames < 0)
-    return COSMOAUDIO_EINVAL;
-  if (!(ca->deviceType & kCosmoAudioDeviceTypeCapture))
-    return COSMOAUDIO_EINVAL;
-  if (!frames)
-    return 0;
-  if (!data)
-    return COSMOAUDIO_EINVAL;
-  return read_ring_buffer(&ca->log, &ca->input, data, frames, ca->channels);
-}
-
-/**
- * Waits until it's possible to read/write audio.
- *
- * This function is uninterruptible. All signals are masked throughout
- * the duration of time this function may block, including cancelation
- * signals, because this is not a cancelation point. Cosmopolitan Libc
- * applies this masking in its dlopen wrapper.
- *
- * @param ca is CosmoAudio object returned earlier by cosmoaudio_open()
- * @param in_out_readFrames if non-NULL specifies how many frames of
- *     capture data be immediately readable by cosmoaudio_read() before
- *     this can return; it must not exceed the buffer size; on return
- *     this will be set to the actual number of frames in the buffer;
- *     if the caller supplies a zero then this call is a non-blocking
- *     way to query buffer sizes
- * @param in_out_writeFrames if non-NULL specifies how many frames of
- *     capture data be immediately writable by cosmoaudio_write() before
- *     this can return; it must not exceed the buffer size; on return
- *     this will be set to the actual number of frames in the buffer;
- *     if the caller supplies a zero then this call is a non-blocking
- *     way to query buffer sizes
- * @return 0 on success, or negative error code on error
- */
-COSMOAUDIO_ABI int cosmoaudio_poll(struct CosmoAudio* ca,
-                                   int* in_out_readFrames,
-                                   int* in_out_writeFrames) {
-  if (!ca)
-    return COSMOAUDIO_EINVAL;
-  if (!in_out_readFrames && !in_out_writeFrames)
-    return COSMOAUDIO_EINVAL;
-  if (in_out_readFrames && !(ca->deviceType & kCosmoAudioDeviceTypeCapture))
-    return COSMOAUDIO_EINVAL;
-  if (in_out_writeFrames && !(ca->deviceType & kCosmoAudioDeviceTypePlayback))
-    return COSMOAUDIO_EINVAL;
-  if (in_out_readFrames && 1u + *in_out_readFrames > ca->inputBufferFrames)
-    return COSMOAUDIO_ENOBUF;
-  if (in_out_writeFrames && 1u + *in_out_writeFrames > ca->outputBufferFrames)
-    return COSMOAUDIO_ENOBUF;
-  for (;;) {
-    int done = 1;
-    ma_uint32 readable = 0;
-    ma_uint32 writable = 0;
-    if (in_out_readFrames) {
-      readable = ma_pcm_rb_available_read(&ca->input);
-      done &= readable >= (ma_uint32)*in_out_readFrames;
-    }
-    if (in_out_writeFrames) {
-      writable = ma_pcm_rb_available_write(&ca->output);
-      done &= writable >= (ma_uint32)*in_out_writeFrames;
-    }
-    if (done) {
-      if (in_out_readFrames)
-        *in_out_readFrames = readable;
-      if (in_out_writeFrames)
-        *in_out_writeFrames = writable;
-      return COSMOAUDIO_SUCCESS;
-    }
-    if (ma_event_wait(&ca->event) != MA_SUCCESS)
-      return COSMOAUDIO_ERROR;
-  }
-}
-
-/**
- * Waits for written samples to be sent to device.
- *
- * This function is only valid to call in playback or duplex mode.
- *
- * This function is uninterruptible. All signals are masked throughout
- * the duration of time this function may block, including cancelation
- * signals, because this is not a cancelation point. Cosmopolitan Libc
- * applies this masking in its dlopen wrapper.
- *
- * @param ca is CosmoAudio object returned earlier by cosmoaudio_open()
- * @return 0 on success, or negative error code on failure
- */
-COSMOAUDIO_ABI int cosmoaudio_flush(struct CosmoAudio* ca) {
-  if (!ca)
-    return COSMOAUDIO_EINVAL;
-  if (!(ca->deviceType & kCosmoAudioDeviceTypePlayback))
-    return COSMOAUDIO_EINVAL;
-  for (;;) {
-    if (!ma_pcm_rb_available_read(&ca->output))
-      return COSMOAUDIO_SUCCESS;
-    if (ma_event_wait(&ca->event) != MA_SUCCESS)
-      return COSMOAUDIO_ERROR;
-  }
-}
diff --git a/dsp/audio/cosmoaudio/cosmoaudio.dll b/dsp/audio/cosmoaudio/cosmoaudio.dll
deleted file mode 100644
index 7ef152aa4..000000000
Binary files a/dsp/audio/cosmoaudio/cosmoaudio.dll and /dev/null differ
diff --git a/dsp/audio/cosmoaudio/cosmoaudio.h b/dsp/audio/cosmoaudio/cosmoaudio.h
deleted file mode 100644
index 40158ab81..000000000
--- a/dsp/audio/cosmoaudio/cosmoaudio.h
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef COSMOAUDIO_H_
-#define COSMOAUDIO_H_
-
-#ifdef _MSC_VER
-#define COSMOAUDIO_ABI
-#ifdef COSMOAUDIO_BUILD
-#define COSMOAUDIO_API __declspec(dllexport)
-#else
-#define COSMOAUDIO_API __declspec(dllimport)
-#endif
-#else
-#define COSMOAUDIO_API
-#ifdef __x86_64__
-#define COSMOAUDIO_ABI __attribute__((__ms_abi__, __visibility__("default")))
-#else
-#define COSMOAUDIO_ABI __attribute__((__visibility__("default")))
-#endif
-#endif
-
-#define COSMOAUDIO_SUCCESS -0  // no error or nothing written
-#define COSMOAUDIO_ERROR   -1  // unspecified error
-#define COSMOAUDIO_EINVAL  -2  // invalid parameters passed to api
-#define COSMOAUDIO_ELINK   -3  // loading cosmoaudio dso failed
-#define COSMOAUDIO_ENOBUF  -4  // invalid buffering parameters
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct CosmoAudio;
-
-enum CosmoAudioDeviceType {
-  kCosmoAudioDeviceTypePlayback = 1,
-  kCosmoAudioDeviceTypeCapture = 2,
-  kCosmoAudioDeviceTypeDuplex =
-      kCosmoAudioDeviceTypePlayback | kCosmoAudioDeviceTypeCapture,
-};
-
-struct CosmoAudioOpenOptions {
-
-  // This field must be set to sizeof(struct CosmoAudioOpenOptions) or
-  // cosmoaudio_open() will return COSMOAUDIO_EINVAL.
-  int sizeofThis;
-
-  // Whether you want this object to open the speaker or microphone.
-  // Please note that asking for microphone access may cause some OSes
-  // like MacOS to show a popup asking the user for permission.
-  enum CosmoAudioDeviceType deviceType;
-
-  // The sample rate can be 44100 for CD quality, 8000 for telephone
-  // quality, etc. Values below 8000 are currently not supported.
-  int sampleRate;
-
-  // The number of audio channels in each interleaved frame. Should be 1
-  // for mono or 2 for stereo.
-  int channels;
-
-  // Number of frames in each ring buffer. A frame consists of a PCM
-  // sample for each channel. Set to 0 for default. If this is less than
-  // the device period size times two, it'll be increased to that value.
-  int bufferFrames;
-
-  // Enables debug logging if non-zero.
-  int debugLog;
-};
-
-COSMOAUDIO_API int cosmoaudio_version(void) COSMOAUDIO_ABI;
-
-COSMOAUDIO_API int cosmoaudio_open(              //
-    struct CosmoAudio **out_ca,                  //
-    const struct CosmoAudioOpenOptions *options  //
-    ) COSMOAUDIO_ABI;
-
-COSMOAUDIO_API int cosmoaudio_close(  //
-    struct CosmoAudio *ca             //
-    ) COSMOAUDIO_ABI;
-
-COSMOAUDIO_API int cosmoaudio_write(  //
-    struct CosmoAudio *ca,            //
-    const float *samples,             //
-    int frameCount                    //
-    ) COSMOAUDIO_ABI;
-
-COSMOAUDIO_API int cosmoaudio_flush(  //
-    struct CosmoAudio *ca             //
-    ) COSMOAUDIO_ABI;
-
-COSMOAUDIO_API int cosmoaudio_read(  //
-    struct CosmoAudio *ca,           //
-    float *out_samples,              //
-    int frameCount                   //
-    ) COSMOAUDIO_ABI;
-
-COSMOAUDIO_API int cosmoaudio_poll(  //
-    struct CosmoAudio *ca,           //
-    int *in_out_readFrames,          //
-    int *in_out_writeFrames          //
-    ) COSMOAUDIO_ABI;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* COSMOAUDIO_H_ */
diff --git a/dsp/audio/cosmoaudio/miniaudio.h b/dsp/audio/cosmoaudio/miniaudio.h
deleted file mode 100644
index 47332e11a..000000000
--- a/dsp/audio/cosmoaudio/miniaudio.h
+++ /dev/null
@@ -1,92621 +0,0 @@
-/*
-Audio playback and capture library. Choice of public domain or MIT-0. See license statements at the end of this file.
-miniaudio - v0.11.21 - 2023-11-15
-
-David Reid - mackron@gmail.com
-
-Website:       https://miniaud.io
-Documentation: https://miniaud.io/docs
-GitHub:        https://github.com/mackron/miniaudio
-*/
-
-/*
-1. Introduction
-===============
-miniaudio is a single file library for audio playback and capture. To use it, do the following in
-one .c file:
-
-    ```c
-    #define MINIAUDIO_IMPLEMENTATION
-    #include "miniaudio.h"
-    ```
-
-You can do `#include "miniaudio.h"` in other parts of the program just like any other header.
-
-miniaudio includes both low level and high level APIs. The low level API is good for those who want
-to do all of their mixing themselves and only require a light weight interface to the underlying
-audio device. The high level API is good for those who have complex mixing and effect requirements.
-
-In miniaudio, objects are transparent structures. Unlike many other libraries, there are no handles
-to opaque objects which means you need to allocate memory for objects yourself. In the examples
-presented in this documentation you will often see objects declared on the stack. You need to be
-careful when translating these examples to your own code so that you don't accidentally declare
-your objects on the stack and then cause them to become invalid once the function returns. In
-addition, you must ensure the memory address of your objects remain the same throughout their
-lifetime. You therefore cannot be making copies of your objects.
-
-A config/init pattern is used throughout the entire library. The idea is that you set up a config
-object and pass that into the initialization routine. The advantage to this system is that the
-config object can be initialized with logical defaults and new properties added to it without
-breaking the API. The config object can be allocated on the stack and does not need to be
-maintained after initialization of the corresponding object.
-
-
-1.1. Low Level API
-------------------
-The low level API gives you access to the raw audio data of an audio device. It supports playback,
-capture, full-duplex and loopback (WASAPI only). You can enumerate over devices to determine which
-physical device(s) you want to connect to.
-
-The low level API uses the concept of a "device" as the abstraction for physical devices. The idea
-is that you choose a physical device to emit or capture audio from, and then move data to/from the
-device when miniaudio tells you to. Data is delivered to and from devices asynchronously via a
-callback which you specify when initializing the device.
-
-When initializing the device you first need to configure it. The device configuration allows you to
-specify things like the format of the data delivered via the callback, the size of the internal
-buffer and the ID of the device you want to emit or capture audio from.
-
-Once you have the device configuration set up you can initialize the device. When initializing a
-device you need to allocate memory for the device object beforehand. This gives the application
-complete control over how the memory is allocated. In the example below we initialize a playback
-device on the stack, but you could allocate it on the heap if that suits your situation better.
-
-    ```c
-    void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount)
-    {
-        // In playback mode copy data to pOutput. In capture mode read data from pInput. In full-duplex mode, both
-        // pOutput and pInput will be valid and you can move data from pInput into pOutput. Never process more than
-        // frameCount frames.
-    }
-
-    int main()
-    {
-        ma_device_config config = ma_device_config_init(ma_device_type_playback);
-        config.playback.format   = ma_format_f32;   // Set to ma_format_unknown to use the device's native format.
-        config.playback.channels = 2;               // Set to 0 to use the device's native channel count.
-        config.sampleRate        = 48000;           // Set to 0 to use the device's native sample rate.
-        config.dataCallback      = data_callback;   // This function will be called when miniaudio needs more data.
-        config.pUserData         = pMyCustomData;   // Can be accessed from the device object (device.pUserData).
-
-        ma_device device;
-        if (ma_device_init(NULL, &config, &device) != MA_SUCCESS) {
-            return -1;  // Failed to initialize the device.
-        }
-
-        ma_device_start(&device);     // The device is sleeping by default so you'll need to start it manually.
-
-        // Do something here. Probably your program's main loop.
-
-        ma_device_uninit(&device);
-        return 0;
-    }
-    ```
-
-In the example above, `data_callback()` is where audio data is written and read from the device.
-The idea is in playback mode you cause sound to be emitted from the speakers by writing audio data
-to the output buffer (`pOutput` in the example). In capture mode you read data from the input
-buffer (`pInput`) to extract sound captured by the microphone. The `frameCount` parameter tells you
-how many frames can be written to the output buffer and read from the input buffer. A "frame" is
-one sample for each channel. For example, in a stereo stream (2 channels), one frame is 2
-samples: one for the left, one for the right. The channel count is defined by the device config.
-The size in bytes of an individual sample is defined by the sample format which is also specified
-in the device config. Multi-channel audio data is always interleaved, which means the samples for
-each frame are stored next to each other in memory. For example, in a stereo stream the first pair
-of samples will be the left and right samples for the first frame, the second pair of samples will
-be the left and right samples for the second frame, etc.
-
-The configuration of the device is defined by the `ma_device_config` structure. The config object
-is always initialized with `ma_device_config_init()`. It's important to always initialize the
-config with this function as it initializes it with logical defaults and ensures your program
-doesn't break when new members are added to the `ma_device_config` structure. The example above
-uses a fairly simple and standard device configuration. The call to `ma_device_config_init()` takes
-a single parameter, which is whether or not the device is a playback, capture, duplex or loopback
-device (loopback devices are not supported on all backends). The `config.playback.format` member
-sets the sample format which can be one of the following (all formats are native-endian):
-
-    +---------------+----------------------------------------+---------------------------+
-    | Symbol        | Description                            | Range                     |
-    +---------------+----------------------------------------+---------------------------+
-    | ma_format_f32 | 32-bit floating point                  | [-1, 1]                   |
-    | ma_format_s16 | 16-bit signed integer                  | [-32768, 32767]           |
-    | ma_format_s24 | 24-bit signed integer (tightly packed) | [-8388608, 8388607]       |
-    | ma_format_s32 | 32-bit signed integer                  | [-2147483648, 2147483647] |
-    | ma_format_u8  | 8-bit unsigned integer                 | [0, 255]                  |
-    +---------------+----------------------------------------+---------------------------+
-
-The `config.playback.channels` member sets the number of channels to use with the device. The
-channel count cannot exceed MA_MAX_CHANNELS. The `config.sampleRate` member sets the sample rate
-(which must be the same for both playback and capture in full-duplex configurations). This is
-usually set to 44100 or 48000, but can be set to anything. It's recommended to keep this between
-8000 and 384000, however.
-
-Note that leaving the format, channel count and/or sample rate at their default values will result
-in the internal device's native configuration being used which is useful if you want to avoid the
-overhead of miniaudio's automatic data conversion.
-
-In addition to the sample format, channel count and sample rate, the data callback and user data
-pointer are also set via the config. The user data pointer is not passed into the callback as a
-parameter, but is instead set to the `pUserData` member of `ma_device` which you can access
-directly since all miniaudio structures are transparent.
-
-Initializing the device is done with `ma_device_init()`. This will return a result code telling you
-what went wrong, if anything. On success it will return `MA_SUCCESS`. After initialization is
-complete the device will be in a stopped state. To start it, use `ma_device_start()`.
-Uninitializing the device will stop it, which is what the example above does, but you can also stop
-the device with `ma_device_stop()`. To resume the device simply call `ma_device_start()` again.
-Note that it's important to never stop or start the device from inside the callback. This will
-result in a deadlock. Instead you set a variable or signal an event indicating that the device
-needs to stop and handle it in a different thread. The following APIs must never be called inside
-the callback:
-
-    ```c
-    ma_device_init()
-    ma_device_init_ex()
-    ma_device_uninit()
-    ma_device_start()
-    ma_device_stop()
-    ```
-
-You must never try uninitializing and reinitializing a device inside the callback. You must also
-never try to stop and start it from inside the callback. There are a few other things you shouldn't
-do in the callback depending on your requirements, however this isn't so much a thread-safety
-thing, but rather a real-time processing thing which is beyond the scope of this introduction.
-
-The example above demonstrates the initialization of a playback device, but it works exactly the
-same for capture. All you need to do is change the device type from `ma_device_type_playback` to
-`ma_device_type_capture` when setting up the config, like so:
-
-    ```c
-    ma_device_config config = ma_device_config_init(ma_device_type_capture);
-    config.capture.format   = MY_FORMAT;
-    config.capture.channels = MY_CHANNEL_COUNT;
-    ```
-
-In the data callback you just read from the input buffer (`pInput` in the example above) and leave
-the output buffer alone (it will be set to NULL when the device type is set to
-`ma_device_type_capture`).
-
-These are the available device types and how you should handle the buffers in the callback:
-
-    +-------------------------+--------------------------------------------------------+
-    | Device Type             | Callback Behavior                                      |
-    +-------------------------+--------------------------------------------------------+
-    | ma_device_type_playback | Write to output buffer, leave input buffer untouched.  |
-    | ma_device_type_capture  | Read from input buffer, leave output buffer untouched. |
-    | ma_device_type_duplex   | Read from input buffer, write to output buffer.        |
-    | ma_device_type_loopback | Read from input buffer, leave output buffer untouched. |
-    +-------------------------+--------------------------------------------------------+
-
-You will notice in the example above that the sample format and channel count is specified
-separately for playback and capture. This is to support different data formats between the playback
-and capture devices in a full-duplex system. An example may be that you want to capture audio data
-as a monaural stream (one channel), but output sound to a stereo speaker system. Note that if you
-use different formats between playback and capture in a full-duplex configuration you will need to
-convert the data yourself. There are functions available to help you do this which will be
-explained later.
-
-The example above did not specify a physical device to connect to which means it will use the
-operating system's default device. If you have multiple physical devices connected and you want to
-use a specific one you will need to specify the device ID in the configuration, like so:
-
-    ```c
-    config.playback.pDeviceID = pMyPlaybackDeviceID;    // Only if requesting a playback or duplex device.
-    config.capture.pDeviceID = pMyCaptureDeviceID;      // Only if requesting a capture, duplex or loopback device.
-    ```
-
-To retrieve the device ID you will need to perform device enumeration, however this requires the
-use of a new concept called the "context". Conceptually speaking the context sits above the device.
-There is one context to many devices. The purpose of the context is to represent the backend at a
-more global level and to perform operations outside the scope of an individual device. Mainly it is
-used for performing run-time linking against backend libraries, initializing backends and
-enumerating devices. The example below shows how to enumerate devices.
-
-    ```c
-    ma_context context;
-    if (ma_context_init(NULL, 0, NULL, &context) != MA_SUCCESS) {
-        // Error.
-    }
-
-    ma_device_info* pPlaybackInfos;
-    ma_uint32 playbackCount;
-    ma_device_info* pCaptureInfos;
-    ma_uint32 captureCount;
-    if (ma_context_get_devices(&context, &pPlaybackInfos, &playbackCount, &pCaptureInfos, &captureCount) != MA_SUCCESS) {
-        // Error.
-    }
-
-    // Loop over each device info and do something with it. Here we just print the name with their index. You may want
-    // to give the user the opportunity to choose which device they'd prefer.
-    for (ma_uint32 iDevice = 0; iDevice < playbackCount; iDevice += 1) {
-        printf("%d - %s\n", iDevice, pPlaybackInfos[iDevice].name);
-    }
-
-    ma_device_config config = ma_device_config_init(ma_device_type_playback);
-    config.playback.pDeviceID = &pPlaybackInfos[chosenPlaybackDeviceIndex].id;
-    config.playback.format    = MY_FORMAT;
-    config.playback.channels  = MY_CHANNEL_COUNT;
-    config.sampleRate         = MY_SAMPLE_RATE;
-    config.dataCallback       = data_callback;
-    config.pUserData          = pMyCustomData;
-
-    ma_device device;
-    if (ma_device_init(&context, &config, &device) != MA_SUCCESS) {
-        // Error
-    }
-
-    ...
-
-    ma_device_uninit(&device);
-    ma_context_uninit(&context);
-    ```
-
-The first thing we do in this example is initialize a `ma_context` object with `ma_context_init()`.
-The first parameter is a pointer to a list of `ma_backend` values which are used to override the
-default backend priorities. When this is NULL, as in this example, miniaudio's default priorities
-are used. The second parameter is the number of backends listed in the array pointed to by the
-first parameter. The third parameter is a pointer to a `ma_context_config` object which can be
-NULL, in which case defaults are used. The context configuration is used for setting the logging
-callback, custom memory allocation callbacks, user-defined data and some backend-specific
-configurations.
-
-Once the context has been initialized you can enumerate devices. In the example above we use the
-simpler `ma_context_get_devices()`, however you can also use a callback for handling devices by
-using `ma_context_enumerate_devices()`. When using `ma_context_get_devices()` you provide a pointer
-to a pointer that will, upon output, be set to a pointer to a buffer containing a list of
-`ma_device_info` structures. You also provide a pointer to an unsigned integer that will receive
-the number of items in the returned buffer. Do not free the returned buffers as their memory is
-managed internally by miniaudio.
-
-The `ma_device_info` structure contains an `id` member which is the ID you pass to the device
-config. It also contains the name of the device which is useful for presenting a list of devices
-to the user via the UI.
-
-When creating your own context you will want to pass it to `ma_device_init()` when initializing the
-device. Passing in NULL, like we do in the first example, will result in miniaudio creating the
-context for you, which you don't want to do since you've already created a context. Note that
-internally the context is only tracked by it's pointer which means you must not change the location
-of the `ma_context` object. If this is an issue, consider using `malloc()` to allocate memory for
-the context.
-
-
-1.2. High Level API
--------------------
-The high level API consists of three main parts:
-
-  * Resource management for loading and streaming sounds.
-  * A node graph for advanced mixing and effect processing.
-  * A high level "engine" that wraps around the resource manager and node graph.
-
-The resource manager (`ma_resource_manager`) is used for loading sounds. It supports loading sounds
-fully into memory and also streaming. It will also deal with reference counting for you which
-avoids the same sound being loaded multiple times.
-
-The node graph is used for mixing and effect processing. The idea is that you connect a number of
-nodes into the graph by connecting each node's outputs to another node's inputs. Each node can
-implement it's own effect. By chaining nodes together, advanced mixing and effect processing can
-be achieved.
-
-The engine encapsulates both the resource manager and the node graph to create a simple, easy to
-use high level API. The resource manager and node graph APIs are covered in more later sections of
-this manual.
-
-The code below shows how you can initialize an engine using it's default configuration.
-
-    ```c
-    ma_result result;
-    ma_engine engine;
-
-    result = ma_engine_init(NULL, &engine);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to initialize the engine.
-    }
-    ```
-
-This creates an engine instance which will initialize a device internally which you can access with
-`ma_engine_get_device()`. It will also initialize a resource manager for you which can be accessed
-with `ma_engine_get_resource_manager()`. The engine itself is a node graph (`ma_node_graph`) which
-means you can pass a pointer to the engine object into any of the `ma_node_graph` APIs (with a
-cast). Alternatively, you can use `ma_engine_get_node_graph()` instead of a cast.
-
-Note that all objects in miniaudio, including the `ma_engine` object in the example above, are
-transparent structures. There are no handles to opaque structures in miniaudio which means you need
-to be mindful of how you declare them. In the example above we are declaring it on the stack, but
-this will result in the struct being invalidated once the function encapsulating it returns. If
-allocating the engine on the heap is more appropriate, you can easily do so with a standard call
-to `malloc()` or whatever heap allocation routine you like:
-
-    ```c
-    ma_engine* pEngine = malloc(sizeof(*pEngine));
-    ```
-
-The `ma_engine` API uses the same config/init pattern used all throughout miniaudio. To configure
-an engine, you can fill out a `ma_engine_config` object and pass it into the first parameter of
-`ma_engine_init()`:
-
-    ```c
-    ma_result result;
-    ma_engine engine;
-    ma_engine_config engineConfig;
-
-    engineConfig = ma_engine_config_init();
-    engineConfig.pResourceManager = &myCustomResourceManager;   // <-- Initialized as some earlier stage.
-
-    result = ma_engine_init(&engineConfig, &engine);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-    ```
-
-This creates an engine instance using a custom config. In this particular example it's showing how
-you can specify a custom resource manager rather than having the engine initialize one internally.
-This is particularly useful if you want to have multiple engine's share the same resource manager.
-
-The engine must be uninitialized with `ma_engine_uninit()` when it's no longer needed.
-
-By default the engine will be started, but nothing will be playing because no sounds have been
-initialized. The easiest but least flexible way of playing a sound is like so:
-
-    ```c
-    ma_engine_play_sound(&engine, "my_sound.wav", NULL);
-    ```
-
-This plays what miniaudio calls an "inline" sound. It plays the sound once, and then puts the
-internal sound up for recycling. The last parameter is used to specify which sound group the sound
-should be associated with which will be explained later. This particular way of playing a sound is
-simple, but lacks flexibility and features. A more flexible way of playing a sound is to first
-initialize a sound:
-
-    ```c
-    ma_result result;
-    ma_sound sound;
-
-    result = ma_sound_init_from_file(&engine, "my_sound.wav", 0, NULL, NULL, &sound);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    ma_sound_start(&sound);
-    ```
-
-This returns a `ma_sound` object which represents a single instance of the specified sound file. If
-you want to play the same file multiple times simultaneously, you need to create one sound for each
-instance.
-
-Sounds should be uninitialized with `ma_sound_uninit()`.
-
-Sounds are not started by default. Start a sound with `ma_sound_start()` and stop it with
-`ma_sound_stop()`. When a sound is stopped, it is not rewound to the start. Use
-`ma_sound_seek_to_pcm_frame(&sound, 0)` to seek back to the start of a sound. By default, starting
-and stopping sounds happens immediately, but sometimes it might be convenient to schedule the sound
-the be started and/or stopped at a specific time. This can be done with the following functions:
-
-    ```c
-    ma_sound_set_start_time_in_pcm_frames()
-    ma_sound_set_start_time_in_milliseconds()
-    ma_sound_set_stop_time_in_pcm_frames()
-    ma_sound_set_stop_time_in_milliseconds()
-    ```
-
-The start/stop time needs to be specified based on the absolute timer which is controlled by the
-engine. The current global time time in PCM frames can be retrieved with
-`ma_engine_get_time_in_pcm_frames()`. The engine's global time can be changed with
-`ma_engine_set_time_in_pcm_frames()` for synchronization purposes if required. Note that scheduling
-a start time still requires an explicit call to `ma_sound_start()` before anything will play:
-
-    ```c
-    ma_sound_set_start_time_in_pcm_frames(&sound, ma_engine_get_time_in_pcm_frames(&engine) + (ma_engine_get_sample_rate(&engine) * 2);
-    ma_sound_start(&sound);
-    ```
-
-The third parameter of `ma_sound_init_from_file()` is a set of flags that control how the sound be
-loaded and a few options on which features should be enabled for that sound. By default, the sound
-is synchronously loaded fully into memory straight from the file system without any kind of
-decoding. If you want to decode the sound before storing it in memory, you need to specify the
-`MA_SOUND_FLAG_DECODE` flag. This is useful if you want to incur the cost of decoding at an earlier
-stage, such as a loading stage. Without this option, decoding will happen dynamically at mixing
-time which might be too expensive on the audio thread.
-
-If you want to load the sound asynchronously, you can specify the `MA_SOUND_FLAG_ASYNC` flag. This
-will result in `ma_sound_init_from_file()` returning quickly, but the sound will not start playing
-until the sound has had some audio decoded.
-
-The fourth parameter is a pointer to sound group. A sound group is used as a mechanism to organise
-sounds into groups which have their own effect processing and volume control. An example is a game
-which might have separate groups for sfx, voice and music. Each of these groups have their own
-independent volume control. Use `ma_sound_group_init()` or `ma_sound_group_init_ex()` to initialize
-a sound group.
-
-Sounds and sound groups are nodes in the engine's node graph and can be plugged into any `ma_node`
-API. This makes it possible to connect sounds and sound groups to effect nodes to produce complex
-effect chains.
-
-A sound can have it's volume changed with `ma_sound_set_volume()`. If you prefer decibel volume
-control you can use `ma_volume_db_to_linear()` to convert from decibel representation to linear.
-
-Panning and pitching is supported with `ma_sound_set_pan()` and `ma_sound_set_pitch()`. If you know
-a sound will never have it's pitch changed with `ma_sound_set_pitch()` or via the doppler effect,
-you can specify the `MA_SOUND_FLAG_NO_PITCH` flag when initializing the sound for an optimization.
-
-By default, sounds and sound groups have spatialization enabled. If you don't ever want to
-spatialize your sounds, initialize the sound with the `MA_SOUND_FLAG_NO_SPATIALIZATION` flag. The
-spatialization model is fairly simple and is roughly on feature parity with OpenAL. HRTF and
-environmental occlusion are not currently supported, but planned for the future. The supported
-features include:
-
-  * Sound and listener positioning and orientation with cones
-  * Attenuation models: none, inverse, linear and exponential
-  * Doppler effect
-
-Sounds can be faded in and out with `ma_sound_set_fade_in_pcm_frames()`.
-
-To check if a sound is currently playing, you can use `ma_sound_is_playing()`. To check if a sound
-is at the end, use `ma_sound_at_end()`. Looping of a sound can be controlled with
-`ma_sound_set_looping()`. Use `ma_sound_is_looping()` to check whether or not the sound is looping.
-
-
-
-2. Building
-===========
-miniaudio should work cleanly out of the box without the need to download or install any
-dependencies. See below for platform-specific details.
-
-Note that GCC and Clang require `-msse2`, `-mavx2`, etc. for SIMD optimizations.
-
-If you get errors about undefined references to `__sync_val_compare_and_swap_8`, `__atomic_load_8`,
-etc. you need to link with `-latomic`.
-
-
-2.1. Windows
-------------
-The Windows build should compile cleanly on all popular compilers without the need to configure any
-include paths nor link to any libraries.
-
-The UWP build may require linking to mmdevapi.lib if you get errors about an unresolved external
-symbol for `ActivateAudioInterfaceAsync()`.
-
-
-2.2. macOS and iOS
-------------------
-The macOS build should compile cleanly without the need to download any dependencies nor link to
-any libraries or frameworks. The iOS build needs to be compiled as Objective-C and will need to
-link the relevant frameworks but should compile cleanly out of the box with Xcode. Compiling
-through the command line requires linking to `-lpthread` and `-lm`.
-
-Due to the way miniaudio links to frameworks at runtime, your application may not pass Apple's
-notarization process. To fix this there are two options. The first is to use the
-`MA_NO_RUNTIME_LINKING` option, like so:
-
-    ```c
-    #ifdef __APPLE__
-        #define MA_NO_RUNTIME_LINKING
-    #endif
-    #define MINIAUDIO_IMPLEMENTATION
-    #include "miniaudio.h"
-    ```
-
-This will require linking with `-framework CoreFoundation -framework CoreAudio -framework AudioToolbox`.
-If you get errors about AudioToolbox, try with `-framework AudioUnit` instead. You may get this when
-using older versions of iOS. Alternatively, if you would rather keep using runtime linking you can
-add the following to your entitlements.xcent file:
-
-    ```
-    <key>com.apple.security.cs.allow-dyld-environment-variables</key>
-    <true/>
-    <key>com.apple.security.cs.allow-unsigned-executable-memory</key>
-    <true/>
-    ```
-
-See this discussion for more info: https://github.com/mackron/miniaudio/issues/203.
-
-
-2.3. Linux
-----------
-The Linux build only requires linking to `-ldl`, `-lpthread` and `-lm`. You do not need any
-development packages. You may need to link with `-latomic` if you're compiling for 32-bit ARM.
-
-
-2.4. BSD
---------
-The BSD build only requires linking to `-lpthread` and `-lm`. NetBSD uses audio(4), OpenBSD uses
-sndio and FreeBSD uses OSS. You may need to link with `-latomic` if you're compiling for 32-bit
-ARM.
-
-
-2.5. Android
-------------
-AAudio is the highest priority backend on Android. This should work out of the box without needing
-any kind of compiler configuration. Support for AAudio starts with Android 8 which means older
-versions will fall back to OpenSL|ES which requires API level 16+.
-
-There have been reports that the OpenSL|ES backend fails to initialize on some Android based
-devices due to `dlopen()` failing to open "libOpenSLES.so". If this happens on your platform
-you'll need to disable run-time linking with `MA_NO_RUNTIME_LINKING` and link with -lOpenSLES.
-
-
-2.6. Emscripten
----------------
-The Emscripten build emits Web Audio JavaScript directly and should compile cleanly out of the box.
-You cannot use `-std=c*` compiler flags, nor `-ansi`.
-
-You can enable the use of AudioWorkets by defining `MA_ENABLE_AUDIO_WORKLETS` and then compiling
-with the following options:
-
-    -sAUDIO_WORKLET=1 -sWASM_WORKERS=1 -sASYNCIFY
-
-An example for compiling with AudioWorklet support might look like this:
-
-    emcc program.c -o bin/program.html -DMA_ENABLE_AUDIO_WORKLETS -sAUDIO_WORKLET=1 -sWASM_WORKERS=1 -sASYNCIFY
-
-To run locally, you'll need to use emrun:
-
-    emrun bin/program.html
-
-
-
-2.7. Build Options
-------------------
-`#define` these options before including miniaudio.h.
-
-    +----------------------------------+--------------------------------------------------------------------+
-    | Option                           | Description                                                        |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_WASAPI                     | Disables the WASAPI backend.                                       |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_DSOUND                     | Disables the DirectSound backend.                                  |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_WINMM                      | Disables the WinMM backend.                                        |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_ALSA                       | Disables the ALSA backend.                                         |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_PULSEAUDIO                 | Disables the PulseAudio backend.                                   |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_JACK                       | Disables the JACK backend.                                         |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_COREAUDIO                  | Disables the Core Audio backend.                                   |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_SNDIO                      | Disables the sndio backend.                                        |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_AUDIO4                     | Disables the audio(4) backend.                                     |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_OSS                        | Disables the OSS backend.                                          |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_AAUDIO                     | Disables the AAudio backend.                                       |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_OPENSL                     | Disables the OpenSL|ES backend.                                    |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_WEBAUDIO                   | Disables the Web Audio backend.                                    |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_NULL                       | Disables the null backend.                                         |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_ONLY_SPECIFIC_BACKENDS | Disables all backends by default and requires `MA_ENABLE_*` to     |
-    |                                  | enable specific backends.                                          |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_WASAPI                 | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the WASAPI backend.                                         |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_DSOUND                 | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the DirectSound backend.                                    |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_WINMM                  | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the WinMM backend.                                          |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_ALSA                   | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the ALSA backend.                                           |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_PULSEAUDIO             | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the PulseAudio backend.                                     |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_JACK                   | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the JACK backend.                                           |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_COREAUDIO              | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the Core Audio backend.                                     |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_SNDIO                  | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the sndio backend.                                          |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_AUDIO4                 | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the audio(4) backend.                                       |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_OSS                    | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the OSS backend.                                            |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_AAUDIO                 | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the AAudio backend.                                         |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_OPENSL                 | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the OpenSL|ES backend.                                      |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_WEBAUDIO               | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the Web Audio backend.                                      |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_ENABLE_NULL                   | Used in conjunction with MA_ENABLE_ONLY_SPECIFIC_BACKENDS to       |
-    |                                  | enable the null backend.                                           |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_DECODING                   | Disables decoding APIs.                                            |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_ENCODING                   | Disables encoding APIs.                                            |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_WAV                        | Disables the built-in WAV decoder and encoder.                     |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_FLAC                       | Disables the built-in FLAC decoder.                                |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_MP3                        | Disables the built-in MP3 decoder.                                 |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_DEVICE_IO                  | Disables playback and recording. This will disable `ma_context`    |
-    |                                  | and `ma_device` APIs. This is useful if you only want to use       |
-    |                                  | miniaudio's data conversion and/or decoding APIs.                  |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_RESOURCE_MANAGER           | Disables the resource manager. When using the engine this will     |
-    |                                  | also disable the following functions:                              |
-    |                                  |                                                                    |
-    |                                  | ```                                                                |
-    |                                  | ma_sound_init_from_file()                                          |
-    |                                  | ma_sound_init_from_file_w()                                        |
-    |                                  | ma_sound_init_copy()                                               |
-    |                                  | ma_engine_play_sound_ex()                                          |
-    |                                  | ma_engine_play_sound()                                             |
-    |                                  | ```                                                                |
-    |                                  |                                                                    |
-    |                                  | The only way to initialize a `ma_sound` object is to initialize it |
-    |                                  | from a data source.                                                |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_NODE_GRAPH                 | Disables the node graph API. This will also disable the engine API |
-    |                                  | because it depends on the node graph.                              |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_ENGINE                     | Disables the engine API.                                           |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_THREADING                  | Disables the `ma_thread`, `ma_mutex`, `ma_semaphore` and           |
-    |                                  | `ma_event` APIs. This option is useful if you only need to use     |
-    |                                  | miniaudio for data conversion, decoding and/or encoding. Some      |
-    |                                  | families of APIs require threading which means the following       |
-    |                                  | options must also be set:                                          |
-    |                                  |                                                                    |
-    |                                  |     ```                                                            |
-    |                                  |     MA_NO_DEVICE_IO                                                |
-    |                                  |     ```                                                            |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_GENERATION                 | Disables generation APIs such a `ma_waveform` and `ma_noise`.      |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_SSE2                       | Disables SSE2 optimizations.                                       |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_AVX2                       | Disables AVX2 optimizations.                                       |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_NEON                       | Disables NEON optimizations.                                       |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_NO_RUNTIME_LINKING            | Disables runtime linking. This is useful for passing Apple's       |
-    |                                  | notarization process. When enabling this, you may need to avoid    |
-    |                                  | using `-std=c89` or `-std=c99` on Linux builds or else you may end |
-    |                                  | up with compilation errors due to conflicts with `timespec` and    |
-    |                                  | `timeval` data types.                                              |
-    |                                  |                                                                    |
-    |                                  | You may need to enable this if your target platform does not allow |
-    |                                  | runtime linking via `dlopen()`.                                    |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_DEBUG_OUTPUT                  | Enable `printf()` output of debug logs (`MA_LOG_LEVEL_DEBUG`).     |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_COINIT_VALUE                  | Windows only. The value to pass to internal calls to               |
-    |                                  | `CoInitializeEx()`. Defaults to `COINIT_MULTITHREADED`.            |
-    +----------------------------------+--------------------------------------------------------------------+
-    | MA_API                           | Controls how public APIs should be decorated. Default is `extern`. |
-    +----------------------------------+--------------------------------------------------------------------+
-
-
-3. Definitions
-==============
-This section defines common terms used throughout miniaudio. Unfortunately there is often ambiguity
-in the use of terms throughout the audio space, so this section is intended to clarify how miniaudio
-uses each term.
-
-3.1. Sample
------------
-A sample is a single unit of audio data. If the sample format is f32, then one sample is one 32-bit
-floating point number.
-
-3.2. Frame / PCM Frame
-----------------------
-A frame is a group of samples equal to the number of channels. For a stereo stream a frame is 2
-samples, a mono frame is 1 sample, a 5.1 surround sound frame is 6 samples, etc. The terms "frame"
-and "PCM frame" are the same thing in miniaudio. Note that this is different to a compressed frame.
-If ever miniaudio needs to refer to a compressed frame, such as a FLAC frame, it will always
-clarify what it's referring to with something like "FLAC frame".
-
-3.3. Channel
-------------
-A stream of monaural audio that is emitted from an individual speaker in a speaker system, or
-received from an individual microphone in a microphone system. A stereo stream has two channels (a
-left channel, and a right channel), a 5.1 surround sound system has 6 channels, etc. Some audio
-systems refer to a channel as a complex audio stream that's mixed with other channels to produce
-the final mix - this is completely different to miniaudio's use of the term "channel" and should
-not be confused.
-
-3.4. Sample Rate
-----------------
-The sample rate in miniaudio is always expressed in Hz, such as 44100, 48000, etc. It's the number
-of PCM frames that are processed per second.
-
-3.5. Formats
-------------
-Throughout miniaudio you will see references to different sample formats:
-
-    +---------------+----------------------------------------+---------------------------+
-    | Symbol        | Description                            | Range                     |
-    +---------------+----------------------------------------+---------------------------+
-    | ma_format_f32 | 32-bit floating point                  | [-1, 1]                   |
-    | ma_format_s16 | 16-bit signed integer                  | [-32768, 32767]           |
-    | ma_format_s24 | 24-bit signed integer (tightly packed) | [-8388608, 8388607]       |
-    | ma_format_s32 | 32-bit signed integer                  | [-2147483648, 2147483647] |
-    | ma_format_u8  | 8-bit unsigned integer                 | [0, 255]                  |
-    +---------------+----------------------------------------+---------------------------+
-
-All formats are native-endian.
-
-
-
-4. Data Sources
-===============
-The data source abstraction in miniaudio is used for retrieving audio data from some source. A few
-examples include `ma_decoder`, `ma_noise` and `ma_waveform`. You will need to be familiar with data
-sources in order to make sense of some of the higher level concepts in miniaudio.
-
-The `ma_data_source` API is a generic interface for reading from a data source. Any object that
-implements the data source interface can be plugged into any `ma_data_source` function.
-
-To read data from a data source:
-
-    ```c
-    ma_result result;
-    ma_uint64 framesRead;
-
-    result = ma_data_source_read_pcm_frames(pDataSource, pFramesOut, frameCount, &framesRead);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to read data from the data source.
-    }
-    ```
-
-If you don't need the number of frames that were successfully read you can pass in `NULL` to the
-`pFramesRead` parameter. If this returns a value less than the number of frames requested it means
-the end of the file has been reached. `MA_AT_END` will be returned only when the number of frames
-read is 0.
-
-When calling any data source function, with the exception of `ma_data_source_init()` and
-`ma_data_source_uninit()`, you can pass in any object that implements a data source. For example,
-you could plug in a decoder like so:
-
-    ```c
-    ma_result result;
-    ma_uint64 framesRead;
-    ma_decoder decoder;   // <-- This would be initialized with `ma_decoder_init_*()`.
-
-    result = ma_data_source_read_pcm_frames(&decoder, pFramesOut, frameCount, &framesRead);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to read data from the decoder.
-    }
-    ```
-
-If you want to seek forward you can pass in `NULL` to the `pFramesOut` parameter. Alternatively you
-can use `ma_data_source_seek_pcm_frames()`.
-
-To seek to a specific PCM frame:
-
-    ```c
-    result = ma_data_source_seek_to_pcm_frame(pDataSource, frameIndex);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to seek to PCM frame.
-    }
-    ```
-
-You can retrieve the total length of a data source in PCM frames, but note that some data sources
-may not have the notion of a length, such as noise and waveforms, and others may just not have a
-way of determining the length such as some decoders. To retrieve the length:
-
-    ```c
-    ma_uint64 length;
-
-    result = ma_data_source_get_length_in_pcm_frames(pDataSource, &length);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to retrieve the length.
-    }
-    ```
-
-Care should be taken when retrieving the length of a data source where the underlying decoder is
-pulling data from a data stream with an undefined length, such as internet radio or some kind of
-broadcast. If you do this, `ma_data_source_get_length_in_pcm_frames()` may never return.
-
-The current position of the cursor in PCM frames can also be retrieved:
-
-    ```c
-    ma_uint64 cursor;
-
-    result = ma_data_source_get_cursor_in_pcm_frames(pDataSource, &cursor);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to retrieve the cursor.
-    }
-    ```
-
-You will often need to know the data format that will be returned after reading. This can be
-retrieved like so:
-
-    ```c
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_channel channelMap[MA_MAX_CHANNELS];
-
-    result = ma_data_source_get_data_format(pDataSource, &format, &channels, &sampleRate, channelMap, MA_MAX_CHANNELS);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to retrieve data format.
-    }
-    ```
-
-If you do not need a specific data format property, just pass in NULL to the respective parameter.
-
-There may be cases where you want to implement something like a sound bank where you only want to
-read data within a certain range of the underlying data. To do this you can use a range:
-
-    ```c
-    result = ma_data_source_set_range_in_pcm_frames(pDataSource, rangeBegInFrames, rangeEndInFrames);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to set the range.
-    }
-    ```
-
-This is useful if you have a sound bank where many sounds are stored in the same file and you want
-the data source to only play one of those sub-sounds. Note that once the range is set, everything
-that takes a position, such as cursors and loop points, should always be relatvie to the start of
-the range. When the range is set, any previously defined loop point will be reset.
-
-Custom loop points can also be used with data sources. By default, data sources will loop after
-they reach the end of the data source, but if you need to loop at a specific location, you can do
-the following:
-
-    ```c
-    result = ma_data_set_loop_point_in_pcm_frames(pDataSource, loopBegInFrames, loopEndInFrames);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to set the loop point.
-    }
-    ```
-
-The loop point is relative to the current range.
-
-It's sometimes useful to chain data sources together so that a seamless transition can be achieved.
-To do this, you can use chaining:
-
-    ```c
-    ma_decoder decoder1;
-    ma_decoder decoder2;
-
-    // ... initialize decoders with ma_decoder_init_*() ...
-
-    result = ma_data_source_set_next(&decoder1, &decoder2);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to set the next data source.
-    }
-
-    result = ma_data_source_read_pcm_frames(&decoder1, pFramesOut, frameCount, pFramesRead);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to read from the decoder.
-    }
-    ```
-
-In the example above we're using decoders. When reading from a chain, you always want to read from
-the top level data source in the chain. In the example above, `decoder1` is the top level data
-source in the chain. When `decoder1` reaches the end, `decoder2` will start seamlessly without any
-gaps.
-
-Note that when looping is enabled, only the current data source will be looped. You can loop the
-entire chain by linking in a loop like so:
-
-    ```c
-    ma_data_source_set_next(&decoder1, &decoder2);  // decoder1 -> decoder2
-    ma_data_source_set_next(&decoder2, &decoder1);  // decoder2 -> decoder1 (loop back to the start).
-    ```
-
-Note that setting up chaining is not thread safe, so care needs to be taken if you're dynamically
-changing links while the audio thread is in the middle of reading.
-
-Do not use `ma_decoder_seek_to_pcm_frame()` as a means to reuse a data source to play multiple
-instances of the same sound simultaneously. This can be extremely inefficient depending on the type
-of data source and can result in glitching due to subtle changes to the state of internal filters.
-Instead, initialize multiple data sources for each instance.
-
-
-4.1. Custom Data Sources
-------------------------
-You can implement a custom data source by implementing the functions in `ma_data_source_vtable`.
-Your custom object must have `ma_data_source_base` as it's first member:
-
-    ```c
-    struct my_data_source
-    {
-        ma_data_source_base base;
-        ...
-    };
-    ```
-
-In your initialization routine, you need to call `ma_data_source_init()` in order to set up the
-base object (`ma_data_source_base`):
-
-    ```c
-    static ma_result my_data_source_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-    {
-        // Read data here. Output in the same format returned by my_data_source_get_data_format().
-    }
-
-    static ma_result my_data_source_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-    {
-        // Seek to a specific PCM frame here. Return MA_NOT_IMPLEMENTED if seeking is not supported.
-    }
-
-    static ma_result my_data_source_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-    {
-        // Return the format of the data here.
-    }
-
-    static ma_result my_data_source_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-    {
-        // Retrieve the current position of the cursor here. Return MA_NOT_IMPLEMENTED and set *pCursor to 0 if there is no notion of a cursor.
-    }
-
-    static ma_result my_data_source_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-    {
-        // Retrieve the length in PCM frames here. Return MA_NOT_IMPLEMENTED and set *pLength to 0 if there is no notion of a length or if the length is unknown.
-    }
-
-    static ma_data_source_vtable g_my_data_source_vtable =
-    {
-        my_data_source_read,
-        my_data_source_seek,
-        my_data_source_get_data_format,
-        my_data_source_get_cursor,
-        my_data_source_get_length
-    };
-
-    ma_result my_data_source_init(my_data_source* pMyDataSource)
-    {
-        ma_result result;
-        ma_data_source_config baseConfig;
-
-        baseConfig = ma_data_source_config_init();
-        baseConfig.vtable = &g_my_data_source_vtable;
-
-        result = ma_data_source_init(&baseConfig, &pMyDataSource->base);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        // ... do the initialization of your custom data source here ...
-
-        return MA_SUCCESS;
-    }
-
-    void my_data_source_uninit(my_data_source* pMyDataSource)
-    {
-        // ... do the uninitialization of your custom data source here ...
-
-        // You must uninitialize the base data source.
-        ma_data_source_uninit(&pMyDataSource->base);
-    }
-    ```
-
-Note that `ma_data_source_init()` and `ma_data_source_uninit()` are never called directly outside
-of the custom data source. It's up to the custom data source itself to call these within their own
-init/uninit functions.
-
-
-
-5. Engine
-=========
-The `ma_engine` API is a high level API for managing and mixing sounds and effect processing. The
-`ma_engine` object encapsulates a resource manager and a node graph, both of which will be
-explained in more detail later.
-
-Sounds are called `ma_sound` and are created from an engine. Sounds can be associated with a mixing
-group called `ma_sound_group` which are also created from the engine. Both `ma_sound` and
-`ma_sound_group` objects are nodes within the engine's node graph.
-
-When the engine is initialized, it will normally create a device internally. If you would rather
-manage the device yourself, you can do so and just pass a pointer to it via the engine config when
-you initialize the engine. You can also just use the engine without a device, which again can be
-configured via the engine config.
-
-The most basic way to initialize the engine is with a default config, like so:
-
-    ```c
-    ma_result result;
-    ma_engine engine;
-
-    result = ma_engine_init(NULL, &engine);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to initialize the engine.
-    }
-    ```
-
-This will result in the engine initializing a playback device using the operating system's default
-device. This will be sufficient for many use cases, but if you need more flexibility you'll want to
-configure the engine with an engine config:
-
-    ```c
-    ma_result result;
-    ma_engine engine;
-    ma_engine_config engineConfig;
-
-    engineConfig = ma_engine_config_init();
-    engineConfig.pDevice = &myDevice;
-
-    result = ma_engine_init(&engineConfig, &engine);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to initialize the engine.
-    }
-    ```
-
-In the example above we're passing in a pre-initialized device. Since the caller is the one in
-control of the device's data callback, it's their responsibility to manually call
-`ma_engine_read_pcm_frames()` from inside their data callback:
-
-    ```c
-    void playback_data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount)
-    {
-        ma_engine_read_pcm_frames(&g_Engine, pOutput, frameCount, NULL);
-    }
-    ```
-
-You can also use the engine independent of a device entirely:
-
-    ```c
-    ma_result result;
-    ma_engine engine;
-    ma_engine_config engineConfig;
-
-    engineConfig = ma_engine_config_init();
-    engineConfig.noDevice   = MA_TRUE;
-    engineConfig.channels   = 2;        // Must be set when not using a device.
-    engineConfig.sampleRate = 48000;    // Must be set when not using a device.
-
-    result = ma_engine_init(&engineConfig, &engine);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to initialize the engine.
-    }
-    ```
-
-Note that when you're not using a device, you must set the channel count and sample rate in the
-config or else miniaudio won't know what to use (miniaudio will use the device to determine this
-normally). When not using a device, you need to use `ma_engine_read_pcm_frames()` to process audio
-data from the engine. This kind of setup is useful if you want to do something like offline
-processing or want to use a different audio system for playback such as SDL.
-
-When a sound is loaded it goes through a resource manager. By default the engine will initialize a
-resource manager internally, but you can also specify a pre-initialized resource manager:
-
-    ```c
-    ma_result result;
-    ma_engine engine1;
-    ma_engine engine2;
-    ma_engine_config engineConfig;
-
-    engineConfig = ma_engine_config_init();
-    engineConfig.pResourceManager = &myResourceManager;
-
-    ma_engine_init(&engineConfig, &engine1);
-    ma_engine_init(&engineConfig, &engine2);
-    ```
-
-In this example we are initializing two engines, both of which are sharing the same resource
-manager. This is especially useful for saving memory when loading the same file across multiple
-engines. If you were not to use a shared resource manager, each engine instance would use their own
-which would result in any sounds that are used between both engine's being loaded twice. By using
-a shared resource manager, it would only be loaded once. Using multiple engine's is useful when you
-need to output to multiple playback devices, such as in a local multiplayer game where each player
-is using their own set of headphones.
-
-By default an engine will be in a started state. To make it so the engine is not automatically
-started you can configure it as such:
-
-    ```c
-    engineConfig.noAutoStart = MA_TRUE;
-
-    // The engine will need to be started manually.
-    ma_engine_start(&engine);
-
-    // Later on the engine can be stopped with ma_engine_stop().
-    ma_engine_stop(&engine);
-    ```
-
-The concept of starting or stopping an engine is only relevant when using the engine with a
-device. Attempting to start or stop an engine that is not associated with a device will result in
-`MA_INVALID_OPERATION`.
-
-The master volume of the engine can be controlled with `ma_engine_set_volume()` which takes a
-linear scale, with 0 resulting in silence and anything above 1 resulting in amplification. If you
-prefer decibel based volume control, use `ma_volume_db_to_linear()` to convert from dB to linear.
-
-When a sound is spatialized, it is done so relative to a listener. An engine can be configured to
-have multiple listeners which can be configured via the config:
-
-    ```c
-    engineConfig.listenerCount = 2;
-    ```
-
-The maximum number of listeners is restricted to `MA_ENGINE_MAX_LISTENERS`. By default, when a
-sound is spatialized, it will be done so relative to the closest listener. You can also pin a sound
-to a specific listener which will be explained later. Listener's have a position, direction, cone,
-and velocity (for doppler effect). A listener is referenced by an index, the meaning of which is up
-to the caller (the index is 0 based and cannot go beyond the listener count, minus 1). The
-position, direction and velocity are all specified in absolute terms:
-
-    ```c
-    ma_engine_listener_set_position(&engine, listenerIndex, worldPosX, worldPosY, worldPosZ);
-    ```
-
-The direction of the listener represents it's forward vector. The listener's up vector can also be
-specified and defaults to +1 on the Y axis.
-
-    ```c
-    ma_engine_listener_set_direction(&engine, listenerIndex, forwardX, forwardY, forwardZ);
-    ma_engine_listener_set_world_up(&engine, listenerIndex, 0, 1, 0);
-    ```
-
-The engine supports directional attenuation. The listener can have a cone the controls how sound is
-attenuated based on the listener's direction. When a sound is between the inner and outer cones, it
-will be attenuated between 1 and the cone's outer gain:
-
-    ```c
-    ma_engine_listener_set_cone(&engine, listenerIndex, innerAngleInRadians, outerAngleInRadians, outerGain);
-    ```
-
-When a sound is inside the inner code, no directional attenuation is applied. When the sound is
-outside of the outer cone, the attenuation will be set to `outerGain` in the example above. When
-the sound is in between the inner and outer cones, the attenuation will be interpolated between 1
-and the outer gain.
-
-The engine's coordinate system follows the OpenGL coordinate system where positive X points right,
-positive Y points up and negative Z points forward.
-
-The simplest and least flexible way to play a sound is like so:
-
-    ```c
-    ma_engine_play_sound(&engine, "my_sound.wav", pGroup);
-    ```
-
-This is a "fire and forget" style of function. The engine will manage the `ma_sound` object
-internally. When the sound finishes playing, it'll be put up for recycling. For more flexibility
-you'll want to initialize a sound object:
-
-    ```c
-    ma_sound sound;
-
-    result = ma_sound_init_from_file(&engine, "my_sound.wav", flags, pGroup, NULL, &sound);
-    if (result != MA_SUCCESS) {
-        return result;  // Failed to load sound.
-    }
-    ```
-
-Sounds need to be uninitialized with `ma_sound_uninit()`.
-
-The example above loads a sound from a file. If the resource manager has been disabled you will not
-be able to use this function and instead you'll need to initialize a sound directly from a data
-source:
-
-    ```c
-    ma_sound sound;
-
-    result = ma_sound_init_from_data_source(&engine, &dataSource, flags, pGroup, &sound);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-    ```
-
-Each `ma_sound` object represents a single instance of the sound. If you want to play the same
-sound multiple times at the same time, you need to initialize a separate `ma_sound` object.
-
-For the most flexibility when initializing sounds, use `ma_sound_init_ex()`. This uses miniaudio's
-standard config/init pattern:
-
-    ```c
-    ma_sound sound;
-    ma_sound_config soundConfig;
-
-    soundConfig = ma_sound_config_init();
-    soundConfig.pFilePath   = NULL; // Set this to load from a file path.
-    soundConfig.pDataSource = NULL; // Set this to initialize from an existing data source.
-    soundConfig.pInitialAttachment = &someNodeInTheNodeGraph;
-    soundConfig.initialAttachmentInputBusIndex = 0;
-    soundConfig.channelsIn  = 1;
-    soundConfig.channelsOut = 0;    // Set to 0 to use the engine's native channel count.
-
-    result = ma_sound_init_ex(&soundConfig, &sound);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-    ```
-
-In the example above, the sound is being initialized without a file nor a data source. This is
-valid, in which case the sound acts as a node in the middle of the node graph. This means you can
-connect other sounds to this sound and allow it to act like a sound group. Indeed, this is exactly
-what a `ma_sound_group` is.
-
-When loading a sound, you specify a set of flags that control how the sound is loaded and what
-features are enabled for that sound. When no flags are set, the sound will be fully loaded into
-memory in exactly the same format as how it's stored on the file system. The resource manager will
-allocate a block of memory and then load the file directly into it. When reading audio data, it
-will be decoded dynamically on the fly. In order to save processing time on the audio thread, it
-might be beneficial to pre-decode the sound. You can do this with the `MA_SOUND_FLAG_DECODE` flag:
-
-    ```c
-    ma_sound_init_from_file(&engine, "my_sound.wav", MA_SOUND_FLAG_DECODE, pGroup, NULL, &sound);
-    ```
-
-By default, sounds will be loaded synchronously, meaning `ma_sound_init_*()` will not return until
-the sound has been fully loaded. If this is prohibitive you can instead load sounds asynchronously
-by specifying the `MA_SOUND_FLAG_ASYNC` flag:
-
-    ```c
-    ma_sound_init_from_file(&engine, "my_sound.wav", MA_SOUND_FLAG_DECODE | MA_SOUND_FLAG_ASYNC, pGroup, NULL, &sound);
-    ```
-
-This will result in `ma_sound_init_*()` returning quickly, but the sound won't yet have been fully
-loaded. When you start the sound, it won't output anything until some sound is available. The sound
-will start outputting audio before the sound has been fully decoded when the `MA_SOUND_FLAG_DECODE`
-is specified.
-
-If you need to wait for an asynchronously loaded sound to be fully loaded, you can use a fence. A
-fence in miniaudio is a simple synchronization mechanism which simply blocks until it's internal
-counter hit's zero. You can specify a fence like so:
-
-    ```c
-    ma_result result;
-    ma_fence fence;
-    ma_sound sounds[4];
-
-    result = ma_fence_init(&fence);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    // Load some sounds asynchronously.
-    for (int iSound = 0; iSound < 4; iSound += 1) {
-        ma_sound_init_from_file(&engine, mySoundFilesPaths[iSound], MA_SOUND_FLAG_DECODE | MA_SOUND_FLAG_ASYNC, pGroup, &fence, &sounds[iSound]);
-    }
-
-    // ... do some other stuff here in the mean time ...
-
-    // Wait for all sounds to finish loading.
-    ma_fence_wait(&fence);
-    ```
-
-If loading the entire sound into memory is prohibitive, you can also configure the engine to stream
-the audio data:
-
-    ```c
-    ma_sound_init_from_file(&engine, "my_sound.wav", MA_SOUND_FLAG_STREAM, pGroup, NULL, &sound);
-    ```
-
-When streaming sounds, 2 seconds worth of audio data is stored in memory. Although it should work
-fine, it's inefficient to use streaming for short sounds. Streaming is useful for things like music
-tracks in games.
-
-When loading a sound from a file path, the engine will reference count the file to prevent it from
-being loaded if it's already in memory. When you uninitialize a sound, the reference count will be
-decremented, and if it hits zero, the sound will be unloaded from memory. This reference counting
-system is not used for streams. The engine will use a 64-bit hash of the file name when comparing
-file paths which means there's a small chance you might encounter a name collision. If this is an
-issue, you'll need to use a different name for one of the colliding file paths, or just not load
-from files and instead load from a data source.
-
-You can use `ma_sound_init_copy()` to initialize a copy of another sound. Note, however, that this
-only works for sounds that were initialized with `ma_sound_init_from_file()` and without the
-`MA_SOUND_FLAG_STREAM` flag.
-
-When you initialize a sound, if you specify a sound group the sound will be attached to that group
-automatically. If you set it to NULL, it will be automatically attached to the engine's endpoint.
-If you would instead rather leave the sound unattached by default, you can can specify the
-`MA_SOUND_FLAG_NO_DEFAULT_ATTACHMENT` flag. This is useful if you want to set up a complex node
-graph.
-
-Sounds are not started by default. To start a sound, use `ma_sound_start()`. Stop a sound with
-`ma_sound_stop()`.
-
-Sounds can have their volume controlled with `ma_sound_set_volume()` in the same way as the
-engine's master volume.
-
-Sounds support stereo panning and pitching. Set the pan with `ma_sound_set_pan()`. Setting the pan
-to 0 will result in an unpanned sound. Setting it to -1 will shift everything to the left, whereas
-+1 will shift it to the right. The pitch can be controlled with `ma_sound_set_pitch()`. A larger
-value will result in a higher pitch. The pitch must be greater than 0.
-
-The engine supports 3D spatialization of sounds. By default sounds will have spatialization
-enabled, but if a sound does not need to be spatialized it's best to disable it. There are two ways
-to disable spatialization of a sound:
-
-    ```c
-    // Disable spatialization at initialization time via a flag:
-    ma_sound_init_from_file(&engine, "my_sound.wav", MA_SOUND_FLAG_NO_SPATIALIZATION, NULL, NULL, &sound);
-
-    // Dynamically disable or enable spatialization post-initialization:
-    ma_sound_set_spatialization_enabled(&sound, isSpatializationEnabled);
-    ```
-
-By default sounds will be spatialized based on the closest listener. If a sound should always be
-spatialized relative to a specific listener it can be pinned to one:
-
-    ```c
-    ma_sound_set_pinned_listener_index(&sound, listenerIndex);
-    ```
-
-Like listeners, sounds have a position. By default, the position of a sound is in absolute space,
-but it can be changed to be relative to a listener:
-
-    ```c
-    ma_sound_set_positioning(&sound, ma_positioning_relative);
-    ```
-
-Note that relative positioning of a sound only makes sense if there is either only one listener, or
-the sound is pinned to a specific listener. To set the position of a sound:
-
-    ```c
-    ma_sound_set_position(&sound, posX, posY, posZ);
-    ```
-
-The direction works the same way as a listener and represents the sound's forward direction:
-
-    ```c
-    ma_sound_set_direction(&sound, forwardX, forwardY, forwardZ);
-    ```
-
-Sound's also have a cone for controlling directional attenuation. This works exactly the same as
-listeners:
-
-    ```c
-    ma_sound_set_cone(&sound, innerAngleInRadians, outerAngleInRadians, outerGain);
-    ```
-
-The velocity of a sound is used for doppler effect and can be set as such:
-
-    ```c
-    ma_sound_set_velocity(&sound, velocityX, velocityY, velocityZ);
-    ```
-
-The engine supports different attenuation models which can be configured on a per-sound basis. By
-default the attenuation model is set to `ma_attenuation_model_inverse` which is the equivalent to
-OpenAL's `AL_INVERSE_DISTANCE_CLAMPED`. Configure the attenuation model like so:
-
-    ```c
-    ma_sound_set_attenuation_model(&sound, ma_attenuation_model_inverse);
-    ```
-
-The supported attenuation models include the following:
-
-    +----------------------------------+----------------------------------------------+
-    | ma_attenuation_model_none        | No distance attenuation.                     |
-    +----------------------------------+----------------------------------------------+
-    | ma_attenuation_model_inverse     | Equivalent to `AL_INVERSE_DISTANCE_CLAMPED`. |
-    +----------------------------------+----------------------------------------------+
-    | ma_attenuation_model_linear      | Linear attenuation.                          |
-    +----------------------------------+----------------------------------------------+
-    | ma_attenuation_model_exponential | Exponential attenuation.                     |
-    +----------------------------------+----------------------------------------------+
-
-To control how quickly a sound rolls off as it moves away from the listener, you need to configure
-the rolloff:
-
-    ```c
-    ma_sound_set_rolloff(&sound, rolloff);
-    ```
-
-You can control the minimum and maximum gain to apply from spatialization:
-
-    ```c
-    ma_sound_set_min_gain(&sound, minGain);
-    ma_sound_set_max_gain(&sound, maxGain);
-    ```
-
-Likewise, in the calculation of attenuation, you can control the minimum and maximum distances for
-the attenuation calculation. This is useful if you want to ensure sounds don't drop below a certain
-volume after the listener moves further away and to have sounds play a maximum volume when the
-listener is within a certain distance:
-
-    ```c
-    ma_sound_set_min_distance(&sound, minDistance);
-    ma_sound_set_max_distance(&sound, maxDistance);
-    ```
-
-The engine's spatialization system supports doppler effect. The doppler factor can be configure on
-a per-sound basis like so:
-
-    ```c
-    ma_sound_set_doppler_factor(&sound, dopplerFactor);
-    ```
-
-You can fade sounds in and out with `ma_sound_set_fade_in_pcm_frames()` and
-`ma_sound_set_fade_in_milliseconds()`. Set the volume to -1 to use the current volume as the
-starting volume:
-
-    ```c
-    // Fade in over 1 second.
-    ma_sound_set_fade_in_milliseconds(&sound, 0, 1, 1000);
-
-    // ... sometime later ...
-
-    // Fade out over 1 second, starting from the current volume.
-    ma_sound_set_fade_in_milliseconds(&sound, -1, 0, 1000);
-    ```
-
-By default sounds will start immediately, but sometimes for timing and synchronization purposes it
-can be useful to schedule a sound to start or stop:
-
-    ```c
-    // Start the sound in 1 second from now.
-    ma_sound_set_start_time_in_pcm_frames(&sound, ma_engine_get_time_in_pcm_frames(&engine) + (ma_engine_get_sample_rate(&engine) * 1));
-
-    // Stop the sound in 2 seconds from now.
-    ma_sound_set_stop_time_in_pcm_frames(&sound, ma_engine_get_time_in_pcm_frames(&engine) + (ma_engine_get_sample_rate(&engine) * 2));
-    ```
-
-Note that scheduling a start time still requires an explicit call to `ma_sound_start()` before
-anything will play.
-
-The time is specified in global time which is controlled by the engine. You can get the engine's
-current time with `ma_engine_get_time_in_pcm_frames()`. The engine's global time is incremented
-automatically as audio data is read, but it can be reset with `ma_engine_set_time_in_pcm_frames()`
-in case it needs to be resynchronized for some reason.
-
-To determine whether or not a sound is currently playing, use `ma_sound_is_playing()`. This will
-take the scheduled start and stop times into account.
-
-Whether or not a sound should loop can be controlled with `ma_sound_set_looping()`. Sounds will not
-be looping by default. Use `ma_sound_is_looping()` to determine whether or not a sound is looping.
-
-Use `ma_sound_at_end()` to determine whether or not a sound is currently at the end. For a looping
-sound this should never return true. Alternatively, you can configure a callback that will be fired
-when the sound reaches the end. Note that the callback is fired from the audio thread which means
-you cannot be uninitializing sound from the callback. To set the callback you can use
-`ma_sound_set_end_callback()`. Alternatively, if you're using `ma_sound_init_ex()`, you can pass it
-into the config like so:
-
-    ```c
-    soundConfig.endCallback = my_end_callback;
-    soundConfig.pEndCallbackUserData = pMyEndCallbackUserData;
-    ```
-
-The end callback is declared like so:
-
-    ```c
-    void my_end_callback(void* pUserData, ma_sound* pSound)
-    {
-        ...
-    }
-    ```
-
-Internally a sound wraps around a data source. Some APIs exist to control the underlying data
-source, mainly for convenience:
-
-    ```c
-    ma_sound_seek_to_pcm_frame(&sound, frameIndex);
-    ma_sound_get_data_format(&sound, &format, &channels, &sampleRate, pChannelMap, channelMapCapacity);
-    ma_sound_get_cursor_in_pcm_frames(&sound, &cursor);
-    ma_sound_get_length_in_pcm_frames(&sound, &length);
-    ```
-
-Sound groups have the same API as sounds, only they are called `ma_sound_group`, and since they do
-not have any notion of a data source, anything relating to a data source is unavailable.
-
-Internally, sound data is loaded via the `ma_decoder` API which means by default it only supports
-file formats that have built-in support in miniaudio. You can extend this to support any kind of
-file format through the use of custom decoders. To do this you'll need to use a self-managed
-resource manager and configure it appropriately. See the "Resource Management" section below for
-details on how to set this up.
-
-
-6. Resource Management
-======================
-Many programs will want to manage sound resources for things such as reference counting and
-streaming. This is supported by miniaudio via the `ma_resource_manager` API.
-
-The resource manager is mainly responsible for the following:
-
-  * Loading of sound files into memory with reference counting.
-  * Streaming of sound data.
-
-When loading a sound file, the resource manager will give you back a `ma_data_source` compatible
-object called `ma_resource_manager_data_source`. This object can be passed into any
-`ma_data_source` API which is how you can read and seek audio data. When loading a sound file, you
-specify whether or not you want the sound to be fully loaded into memory (and optionally
-pre-decoded) or streamed. When loading into memory, you can also specify whether or not you want
-the data to be loaded asynchronously.
-
-The example below is how you can initialize a resource manager using it's default configuration:
-
-    ```c
-    ma_resource_manager_config config;
-    ma_resource_manager resourceManager;
-
-    config = ma_resource_manager_config_init();
-    result = ma_resource_manager_init(&config, &resourceManager);
-    if (result != MA_SUCCESS) {
-        ma_device_uninit(&device);
-        printf("Failed to initialize the resource manager.");
-        return -1;
-    }
-    ```
-
-You can configure the format, channels and sample rate of the decoded audio data. By default it
-will use the file's native data format, but you can configure it to use a consistent format. This
-is useful for offloading the cost of data conversion to load time rather than dynamically
-converting at mixing time. To do this, you configure the decoded format, channels and sample rate
-like the code below:
-
-    ```c
-    config = ma_resource_manager_config_init();
-    config.decodedFormat     = device.playback.format;
-    config.decodedChannels   = device.playback.channels;
-    config.decodedSampleRate = device.sampleRate;
-    ```
-
-In the code above, the resource manager will be configured so that any decoded audio data will be
-pre-converted at load time to the device's native data format. If instead you used defaults and
-the data format of the file did not match the device's data format, you would need to convert the
-data at mixing time which may be prohibitive in high-performance and large scale scenarios like
-games.
-
-Internally the resource manager uses the `ma_decoder` API to load sounds. This means by default it
-only supports decoders that are built into miniaudio. It's possible to support additional encoding
-formats through the use of custom decoders. To do so, pass in your `ma_decoding_backend_vtable`
-vtables into the resource manager config:
-
-    ```c
-    ma_decoding_backend_vtable* pCustomBackendVTables[] =
-    {
-        &g_ma_decoding_backend_vtable_libvorbis,
-        &g_ma_decoding_backend_vtable_libopus
-    };
-
-    ...
-
-    resourceManagerConfig.ppCustomDecodingBackendVTables = pCustomBackendVTables;
-    resourceManagerConfig.customDecodingBackendCount     = sizeof(pCustomBackendVTables) / sizeof(pCustomBackendVTables[0]);
-    resourceManagerConfig.pCustomDecodingBackendUserData = NULL;
-    ```
-
-This system can allow you to support any kind of file format. See the "Decoding" section for
-details on how to implement custom decoders. The miniaudio repository includes examples for Opus
-via libopus and libopusfile and Vorbis via libvorbis and libvorbisfile.
-
-Asynchronicity is achieved via a job system. When an operation needs to be performed, such as the
-decoding of a page, a job will be posted to a queue which will then be processed by a job thread.
-By default there will be only one job thread running, but this can be configured, like so:
-
-    ```c
-    config = ma_resource_manager_config_init();
-    config.jobThreadCount = MY_JOB_THREAD_COUNT;
-    ```
-
-By default job threads are managed internally by the resource manager, however you can also self
-manage your job threads if, for example, you want to integrate the job processing into your
-existing job infrastructure, or if you simply don't like the way the resource manager does it. To
-do this, just set the job thread count to 0 and process jobs manually. To process jobs, you first
-need to retrieve a job using `ma_resource_manager_next_job()` and then process it using
-`ma_job_process()`:
-
-    ```c
-    config = ma_resource_manager_config_init();
-    config.jobThreadCount = 0;                            // Don't manage any job threads internally.
-    config.flags = MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING; // Optional. Makes `ma_resource_manager_next_job()` non-blocking.
-
-    // ... Initialize your custom job threads ...
-
-    void my_custom_job_thread(...)
-    {
-        for (;;) {
-            ma_job job;
-            ma_result result = ma_resource_manager_next_job(pMyResourceManager, &job);
-            if (result != MA_SUCCESS) {
-                if (result == MA_NO_DATA_AVAILABLE) {
-                    // No jobs are available. Keep going. Will only get this if the resource manager was initialized
-                    // with MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING.
-                    continue;
-                } else if (result == MA_CANCELLED) {
-                    // MA_JOB_TYPE_QUIT was posted. Exit.
-                    break;
-                } else {
-                    // Some other error occurred.
-                    break;
-                }
-            }
-
-            ma_job_process(&job);
-        }
-    }
-    ```
-
-In the example above, the `MA_JOB_TYPE_QUIT` event is the used as the termination
-indicator, but you can use whatever you would like to terminate the thread. The call to
-`ma_resource_manager_next_job()` is blocking by default, but can be configured to be non-blocking
-by initializing the resource manager with the `MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING` configuration
-flag. Note that the `MA_JOB_TYPE_QUIT` will never be removed from the job queue. This
-is to give every thread the opportunity to catch the event and terminate naturally.
-
-When loading a file, it's sometimes convenient to be able to customize how files are opened and
-read instead of using standard `fopen()`, `fclose()`, etc. which is what miniaudio will use by
-default. This can be done by setting `pVFS` member of the resource manager's config:
-
-    ```c
-    // Initialize your custom VFS object. See documentation for VFS for information on how to do this.
-    my_custom_vfs vfs = my_custom_vfs_init();
-
-    config = ma_resource_manager_config_init();
-    config.pVFS = &vfs;
-    ```
-
-This is particularly useful in programs like games where you want to read straight from an archive
-rather than the normal file system. If you do not specify a custom VFS, the resource manager will
-use the operating system's normal file operations.
-
-To load a sound file and create a data source, call `ma_resource_manager_data_source_init()`. When
-loading a sound you need to specify the file path and options for how the sounds should be loaded.
-By default a sound will be loaded synchronously. The returned data source is owned by the caller
-which means the caller is responsible for the allocation and freeing of the data source. Below is
-an example for initializing a data source:
-
-    ```c
-    ma_resource_manager_data_source dataSource;
-    ma_result result = ma_resource_manager_data_source_init(pResourceManager, pFilePath, flags, &dataSource);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-
-    // ...
-
-    // A ma_resource_manager_data_source object is compatible with the `ma_data_source` API. To read data, just call
-    // the `ma_data_source_read_pcm_frames()` like you would with any normal data source.
-    result = ma_data_source_read_pcm_frames(&dataSource, pDecodedData, frameCount, &framesRead);
-    if (result != MA_SUCCESS) {
-        // Failed to read PCM frames.
-    }
-
-    // ...
-
-    ma_resource_manager_data_source_uninit(&dataSource);
-    ```
-
-The `flags` parameter specifies how you want to perform loading of the sound file. It can be a
-combination of the following flags:
-
-    ```
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT
-    ```
-
-When no flags are specified (set to 0), the sound will be fully loaded into memory, but not
-decoded, meaning the raw file data will be stored in memory, and then dynamically decoded when
-`ma_data_source_read_pcm_frames()` is called. To instead decode the audio data before storing it in
-memory, use the `MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE` flag. By default, the sound file will
-be loaded synchronously, meaning `ma_resource_manager_data_source_init()` will only return after
-the entire file has been loaded. This is good for simplicity, but can be prohibitively slow. You
-can instead load the sound asynchronously using the `MA_RESOURCE_MANAGER_DATA_SOURCE_ASYNC` flag.
-This will result in `ma_resource_manager_data_source_init()` returning quickly, but no data will be
-returned by `ma_data_source_read_pcm_frames()` until some data is available. When no data is
-available because the asynchronous decoding hasn't caught up, `MA_BUSY` will be returned by
-`ma_data_source_read_pcm_frames()`.
-
-For large sounds, it's often prohibitive to store the entire file in memory. To mitigate this, you
-can instead stream audio data which you can do by specifying the
-`MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM` flag. When streaming, data will be decoded in 1
-second pages. When a new page needs to be decoded, a job will be posted to the job queue and then
-subsequently processed in a job thread.
-
-For in-memory sounds, reference counting is used to ensure the data is loaded only once. This means
-multiple calls to `ma_resource_manager_data_source_init()` with the same file path will result in
-the file data only being loaded once. Each call to `ma_resource_manager_data_source_init()` must be
-matched up with a call to `ma_resource_manager_data_source_uninit()`. Sometimes it can be useful
-for a program to register self-managed raw audio data and associate it with a file path. Use the
-`ma_resource_manager_register_*()` and `ma_resource_manager_unregister_*()` APIs to do this.
-`ma_resource_manager_register_decoded_data()` is used to associate a pointer to raw, self-managed
-decoded audio data in the specified data format with the specified name. Likewise,
-`ma_resource_manager_register_encoded_data()` is used to associate a pointer to raw self-managed
-encoded audio data (the raw file data) with the specified name. Note that these names need not be
-actual file paths. When `ma_resource_manager_data_source_init()` is called (without the
-`MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM` flag), the resource manager will look for these
-explicitly registered data buffers and, if found, will use it as the backing data for the data
-source. Note that the resource manager does *not* make a copy of this data so it is up to the
-caller to ensure the pointer stays valid for it's lifetime. Use
-`ma_resource_manager_unregister_data()` to unregister the self-managed data. You can also use
-`ma_resource_manager_register_file()` and `ma_resource_manager_unregister_file()` to register and
-unregister a file. It does not make sense to use the `MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM`
-flag with a self-managed data pointer.
-
-
-6.1. Asynchronous Loading and Synchronization
----------------------------------------------
-When loading asynchronously, it can be useful to poll whether or not loading has finished. Use
-`ma_resource_manager_data_source_result()` to determine this. For in-memory sounds, this will
-return `MA_SUCCESS` when the file has been *entirely* decoded. If the sound is still being decoded,
-`MA_BUSY` will be returned. Otherwise, some other error code will be returned if the sound failed
-to load. For streaming data sources, `MA_SUCCESS` will be returned when the first page has been
-decoded and the sound is ready to be played. If the first page is still being decoded, `MA_BUSY`
-will be returned. Otherwise, some other error code will be returned if the sound failed to load.
-
-In addition to polling, you can also use a simple synchronization object called a "fence" to wait
-for asynchronously loaded sounds to finish. This is called `ma_fence`. The advantage to using a
-fence is that it can be used to wait for a group of sounds to finish loading rather than waiting
-for sounds on an individual basis. There are two stages to loading a sound:
-
-  * Initialization of the internal decoder; and
-  * Completion of decoding of the file (the file is fully decoded)
-
-You can specify separate fences for each of the different stages. Waiting for the initialization
-of the internal decoder is important for when you need to know the sample format, channels and
-sample rate of the file.
-
-The example below shows how you could use a fence when loading a number of sounds:
-
-    ```c
-    // This fence will be released when all sounds are finished loading entirely.
-    ma_fence fence;
-    ma_fence_init(&fence);
-
-    // This will be passed into the initialization routine for each sound.
-    ma_resource_manager_pipeline_notifications notifications = ma_resource_manager_pipeline_notifications_init();
-    notifications.done.pFence = &fence;
-
-    // Now load a bunch of sounds:
-    for (iSound = 0; iSound < soundCount; iSound += 1) {
-        ma_resource_manager_data_source_init(pResourceManager, pSoundFilePaths[iSound], flags, &notifications, &pSoundSources[iSound]);
-    }
-
-    // ... DO SOMETHING ELSE WHILE SOUNDS ARE LOADING ...
-
-    // Wait for loading of sounds to finish.
-    ma_fence_wait(&fence);
-    ```
-
-In the example above we used a fence for waiting until the entire file has been fully decoded. If
-you only need to wait for the initialization of the internal decoder to complete, you can use the
-`init` member of the `ma_resource_manager_pipeline_notifications` object:
-
-    ```c
-    notifications.init.pFence = &fence;
-    ```
-
-If a fence is not appropriate for your situation, you can instead use a callback that is fired on
-an individual sound basis. This is done in a very similar way to fences:
-
-    ```c
-    typedef struct
-    {
-        ma_async_notification_callbacks cb;
-        void* pMyData;
-    } my_notification;
-
-    void my_notification_callback(ma_async_notification* pNotification)
-    {
-        my_notification* pMyNotification = (my_notification*)pNotification;
-
-        // Do something in response to the sound finishing loading.
-    }
-
-    ...
-
-    my_notification myCallback;
-    myCallback.cb.onSignal = my_notification_callback;
-    myCallback.pMyData     = pMyData;
-
-    ma_resource_manager_pipeline_notifications notifications = ma_resource_manager_pipeline_notifications_init();
-    notifications.done.pNotification = &myCallback;
-
-    ma_resource_manager_data_source_init(pResourceManager, "my_sound.wav", flags, &notifications, &mySound);
-    ```
-
-In the example above we just extend the `ma_async_notification_callbacks` object and pass an
-instantiation into the `ma_resource_manager_pipeline_notifications` in the same way as we did with
-the fence, only we set `pNotification` instead of `pFence`. You can set both of these at the same
-time and they should both work as expected. If using the `pNotification` system, you need to ensure
-your `ma_async_notification_callbacks` object stays valid.
-
-
-
-6.2. Resource Manager Implementation Details
---------------------------------------------
-Resources are managed in two main ways:
-
-  * By storing the entire sound inside an in-memory buffer (referred to as a data buffer)
-  * By streaming audio data on the fly (referred to as a data stream)
-
-A resource managed data source (`ma_resource_manager_data_source`) encapsulates a data buffer or
-data stream, depending on whether or not the data source was initialized with the
-`MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM` flag. If so, it will make use of a
-`ma_resource_manager_data_stream` object. Otherwise it will use a `ma_resource_manager_data_buffer`
-object. Both of these objects are data sources which means they can be used with any
-`ma_data_source_*()` API.
-
-Another major feature of the resource manager is the ability to asynchronously decode audio files.
-This relieves the audio thread of time-consuming decoding which can negatively affect scalability
-due to the audio thread needing to complete it's work extremely quickly to avoid glitching.
-Asynchronous decoding is achieved through a job system. There is a central multi-producer,
-multi-consumer, fixed-capacity job queue. When some asynchronous work needs to be done, a job is
-posted to the queue which is then read by a job thread. The number of job threads can be
-configured for improved scalability, and job threads can all run in parallel without needing to
-worry about the order of execution (how this is achieved is explained below).
-
-When a sound is being loaded asynchronously, playback can begin before the sound has been fully
-decoded. This enables the application to start playback of the sound quickly, while at the same
-time allowing to resource manager to keep loading in the background. Since there may be less
-threads than the number of sounds being loaded at a given time, a simple scheduling system is used
-to keep decoding time balanced and fair. The resource manager solves this by splitting decoding
-into chunks called pages. By default, each page is 1 second long. When a page has been decoded, a
-new job will be posted to start decoding the next page. By dividing up decoding into pages, an
-individual sound shouldn't ever delay every other sound from having their first page decoded. Of
-course, when loading many sounds at the same time, there will always be an amount of time required
-to process jobs in the queue so in heavy load situations there will still be some delay. To
-determine if a data source is ready to have some frames read, use
-`ma_resource_manager_data_source_get_available_frames()`. This will return the number of frames
-available starting from the current position.
-
-
-6.2.1. Job Queue
-----------------
-The resource manager uses a job queue which is multi-producer, multi-consumer, and fixed-capacity.
-This job queue is not currently lock-free, and instead uses a spinlock to achieve thread-safety.
-Only a fixed number of jobs can be allocated and inserted into the queue which is done through a
-lock-free data structure for allocating an index into a fixed sized array, with reference counting
-for mitigation of the ABA problem. The reference count is 32-bit.
-
-For many types of jobs it's important that they execute in a specific order. In these cases, jobs
-are executed serially. For the resource manager, serial execution of jobs is only required on a
-per-object basis (per data buffer or per data stream). Each of these objects stores an execution
-counter. When a job is posted it is associated with an execution counter. When the job is
-processed, it checks if the execution counter of the job equals the execution counter of the
-owning object and if so, processes the job. If the counters are not equal, the job will be posted
-back onto the job queue for later processing. When the job finishes processing the execution order
-of the main object is incremented. This system means the no matter how many job threads are
-executing, decoding of an individual sound will always get processed serially. The advantage to
-having multiple threads comes into play when loading multiple sounds at the same time.
-
-The resource manager's job queue is not 100% lock-free and will use a spinlock to achieve
-thread-safety for a very small section of code. This is only relevant when the resource manager
-uses more than one job thread. If only using a single job thread, which is the default, the
-lock should never actually wait in practice. The amount of time spent locking should be quite
-short, but it's something to be aware of for those who have pedantic lock-free requirements and
-need to use more than one job thread. There are plans to remove this lock in a future version.
-
-In addition, posting a job will release a semaphore, which on Win32 is implemented with
-`ReleaseSemaphore` and on POSIX platforms via a condition variable:
-
-    ```c
-    pthread_mutex_lock(&pSemaphore->lock);
-    {
-        pSemaphore->value += 1;
-        pthread_cond_signal(&pSemaphore->cond);
-    }
-    pthread_mutex_unlock(&pSemaphore->lock);
-    ```
-
-Again, this is relevant for those with strict lock-free requirements in the audio thread. To avoid
-this, you can use non-blocking mode (via the `MA_JOB_QUEUE_FLAG_NON_BLOCKING`
-flag) and implement your own job processing routine (see the "Resource Manager" section above for
-details on how to do this).
-
-
-
-6.2.2. Data Buffers
--------------------
-When the `MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM` flag is excluded at initialization time, the
-resource manager will try to load the data into an in-memory data buffer. Before doing so, however,
-it will first check if the specified file is already loaded. If so, it will increment a reference
-counter and just use the already loaded data. This saves both time and memory. When the data buffer
-is uninitialized, the reference counter will be decremented. If the counter hits zero, the file
-will be unloaded. This is a detail to keep in mind because it could result in excessive loading and
-unloading of a sound. For example, the following sequence will result in a file be loaded twice,
-once after the other:
-
-    ```c
-    ma_resource_manager_data_source_init(pResourceManager, "my_file", ..., &myDataBuffer0); // Refcount = 1. Initial load.
-    ma_resource_manager_data_source_uninit(&myDataBuffer0);                                 // Refcount = 0. Unloaded.
-
-    ma_resource_manager_data_source_init(pResourceManager, "my_file", ..., &myDataBuffer1); // Refcount = 1. Reloaded because previous uninit() unloaded it.
-    ma_resource_manager_data_source_uninit(&myDataBuffer1);                                 // Refcount = 0. Unloaded.
-    ```
-
-A binary search tree (BST) is used for storing data buffers as it has good balance between
-efficiency and simplicity. The key of the BST is a 64-bit hash of the file path that was passed
-into `ma_resource_manager_data_source_init()`. The advantage of using a hash is that it saves
-memory over storing the entire path, has faster comparisons, and results in a mostly balanced BST
-due to the random nature of the hash. The disadvantages are that file names are case-sensitive and
-there's a small chance of name collisions. If case-sensitivity is an issue, you should normalize
-your file names to upper- or lower-case before initializing your data sources. If name collisions
-become an issue, you'll need to change the name of one of the colliding names or just not use the
-resource manager.
-
-When a sound file has not already been loaded and the `MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC`
-flag is excluded, the file will be decoded synchronously by the calling thread. There are two
-options for controlling how the audio is stored in the data buffer - encoded or decoded. When the
-`MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE` option is excluded, the raw file data will be stored
-in memory. Otherwise the sound will be decoded before storing it in memory. Synchronous loading is
-a very simple and standard process of simply adding an item to the BST, allocating a block of
-memory and then decoding (if `MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE` is specified).
-
-When the `MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC` flag is specified, loading of the data buffer
-is done asynchronously. In this case, a job is posted to the queue to start loading and then the
-function immediately returns, setting an internal result code to `MA_BUSY`. This result code is
-returned when the program calls `ma_resource_manager_data_source_result()`. When decoding has fully
-completed `MA_SUCCESS` will be returned. This can be used to know if loading has fully completed.
-
-When loading asynchronously, a single job is posted to the queue of the type
-`MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER_NODE`. This involves making a copy of the file path and
-associating it with job. When the job is processed by the job thread, it will first load the file
-using the VFS associated with the resource manager. When using a custom VFS, it's important that it
-be completely thread-safe because it will be used from one or more job threads at the same time.
-Individual files should only ever be accessed by one thread at a time, however. After opening the
-file via the VFS, the job will determine whether or not the file is being decoded. If not, it
-simply allocates a block of memory and loads the raw file contents into it and returns. On the
-other hand, when the file is being decoded, it will first allocate a decoder on the heap and
-initialize it. Then it will check if the length of the file is known. If so it will allocate a
-block of memory to store the decoded output and initialize it to silence. If the size is unknown,
-it will allocate room for one page. After memory has been allocated, the first page will be
-decoded. If the sound is shorter than a page, the result code will be set to `MA_SUCCESS` and the
-completion event will be signalled and loading is now complete. If, however, there is more to
-decode, a job with the code `MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_BUFFER_NODE` is posted. This job
-will decode the next page and perform the same process if it reaches the end. If there is more to
-decode, the job will post another `MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_BUFFER_NODE` job which will
-keep on happening until the sound has been fully decoded. For sounds of an unknown length, each
-page will be linked together as a linked list. Internally this is implemented via the
-`ma_paged_audio_buffer` object.
-
-
-6.2.3. Data Streams
--------------------
-Data streams only ever store two pages worth of data for each instance. They are most useful for
-large sounds like music tracks in games that would consume too much memory if fully decoded in
-memory. After every frame from a page has been read, a job will be posted to load the next page
-which is done from the VFS.
-
-For data streams, the `MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC` flag will determine whether or
-not initialization of the data source waits until the two pages have been decoded. When unset,
-`ma_resource_manager_data_source_init()` will wait until the two pages have been loaded, otherwise
-it will return immediately.
-
-When frames are read from a data stream using `ma_resource_manager_data_source_read_pcm_frames()`,
-`MA_BUSY` will be returned if there are no frames available. If there are some frames available,
-but less than the number requested, `MA_SUCCESS` will be returned, but the actual number of frames
-read will be less than the number requested. Due to the asynchronous nature of data streams,
-seeking is also asynchronous. If the data stream is in the middle of a seek, `MA_BUSY` will be
-returned when trying to read frames.
-
-When `ma_resource_manager_data_source_read_pcm_frames()` results in a page getting fully consumed
-a job is posted to load the next page. This will be posted from the same thread that called
-`ma_resource_manager_data_source_read_pcm_frames()`.
-
-Data streams are uninitialized by posting a job to the queue, but the function won't return until
-that job has been processed. The reason for this is that the caller owns the data stream object and
-therefore miniaudio needs to ensure everything completes before handing back control to the caller.
-Also, if the data stream is uninitialized while pages are in the middle of decoding, they must
-complete before destroying any underlying object and the job system handles this cleanly.
-
-Note that when a new page needs to be loaded, a job will be posted to the resource manager's job
-thread from the audio thread. You must keep in mind the details mentioned in the "Job Queue"
-section above regarding locking when posting an event if you require a strictly lock-free audio
-thread.
-
-
-
-7. Node Graph
-=============
-miniaudio's routing infrastructure follows a node graph paradigm. The idea is that you create a
-node whose outputs are attached to inputs of another node, thereby creating a graph. There are
-different types of nodes, with each node in the graph processing input data to produce output,
-which is then fed through the chain. Each node in the graph can apply their own custom effects. At
-the start of the graph will usually be one or more data source nodes which have no inputs and
-instead pull their data from a data source. At the end of the graph is an endpoint which represents
-the end of the chain and is where the final output is ultimately extracted from.
-
-Each node has a number of input buses and a number of output buses. An output bus from a node is
-attached to an input bus of another. Multiple nodes can connect their output buses to another
-node's input bus, in which case their outputs will be mixed before processing by the node. Below is
-a diagram that illustrates a hypothetical node graph setup:
-
-    ```
-    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Data flows left to right >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
-    +---------------+                              +-----------------+
-    | Data Source 1 =----+    +----------+    +----= Low Pass Filter =----+
-    +---------------+    |    |          =----+    +-----------------+    |    +----------+
-                         +----= Splitter |                                +----= ENDPOINT |
-    +---------------+    |    |          =----+    +-----------------+    |    +----------+
-    | Data Source 2 =----+    +----------+    +----=  Echo / Delay   =----+
-    +---------------+                              +-----------------+
-    ```
-
-In the above graph, it starts with two data sources whose outputs are attached to the input of a
-splitter node. It's at this point that the two data sources are mixed. After mixing, the splitter
-performs it's processing routine and produces two outputs which is simply a duplication of the
-input stream. One output is attached to a low pass filter, whereas the other output is attached to
-a echo/delay. The outputs of the the low pass filter and the echo are attached to the endpoint, and
-since they're both connected to the same input bus, they'll be mixed.
-
-Each input bus must be configured to accept the same number of channels, but the number of channels
-used by input buses can be different to the number of channels for output buses in which case
-miniaudio will automatically convert the input data to the output channel count before processing.
-The number of channels of an output bus of one node must match the channel count of the input bus
-it's attached to. The channel counts cannot be changed after the node has been initialized. If you
-attempt to attach an output bus to an input bus with a different channel count, attachment will
-fail.
-
-To use a node graph, you first need to initialize a `ma_node_graph` object. This is essentially a
-container around the entire graph. The `ma_node_graph` object is required for some thread-safety
-issues which will be explained later. A `ma_node_graph` object is initialized using miniaudio's
-standard config/init system:
-
-    ```c
-    ma_node_graph_config nodeGraphConfig = ma_node_graph_config_init(myChannelCount);
-
-    result = ma_node_graph_init(&nodeGraphConfig, NULL, &nodeGraph);    // Second parameter is a pointer to allocation callbacks.
-    if (result != MA_SUCCESS) {
-        // Failed to initialize node graph.
-    }
-    ```
-
-When you initialize the node graph, you're specifying the channel count of the endpoint. The
-endpoint is a special node which has one input bus and one output bus, both of which have the
-same channel count, which is specified in the config. Any nodes that connect directly to the
-endpoint must be configured such that their output buses have the same channel count. When you read
-audio data from the node graph, it'll have the channel count you specified in the config. To read
-data from the graph:
-
-    ```c
-    ma_uint32 framesRead;
-    result = ma_node_graph_read_pcm_frames(&nodeGraph, pFramesOut, frameCount, &framesRead);
-    if (result != MA_SUCCESS) {
-        // Failed to read data from the node graph.
-    }
-    ```
-
-When you read audio data, miniaudio starts at the node graph's endpoint node which then pulls in
-data from it's input attachments, which in turn recursively pull in data from their inputs, and so
-on. At the start of the graph there will be some kind of data source node which will have zero
-inputs and will instead read directly from a data source. The base nodes don't literally need to
-read from a `ma_data_source` object, but they will always have some kind of underlying object that
-sources some kind of audio. The `ma_data_source_node` node can be used to read from a
-`ma_data_source`. Data is always in floating-point format and in the number of channels you
-specified when the graph was initialized. The sample rate is defined by the underlying data sources.
-It's up to you to ensure they use a consistent and appropriate sample rate.
-
-The `ma_node` API is designed to allow custom nodes to be implemented with relative ease, but
-miniaudio includes a few stock nodes for common functionality. This is how you would initialize a
-node which reads directly from a data source (`ma_data_source_node`) which is an example of one
-of the stock nodes that comes with miniaudio:
-
-    ```c
-    ma_data_source_node_config config = ma_data_source_node_config_init(pMyDataSource);
-
-    ma_data_source_node dataSourceNode;
-    result = ma_data_source_node_init(&nodeGraph, &config, NULL, &dataSourceNode);
-    if (result != MA_SUCCESS) {
-        // Failed to create data source node.
-    }
-    ```
-
-The data source node will use the output channel count to determine the channel count of the output
-bus. There will be 1 output bus and 0 input buses (data will be drawn directly from the data
-source). The data source must output to floating-point (`ma_format_f32`) or else an error will be
-returned from `ma_data_source_node_init()`.
-
-By default the node will not be attached to the graph. To do so, use `ma_node_attach_output_bus()`:
-
-    ```c
-    result = ma_node_attach_output_bus(&dataSourceNode, 0, ma_node_graph_get_endpoint(&nodeGraph), 0);
-    if (result != MA_SUCCESS) {
-        // Failed to attach node.
-    }
-    ```
-
-The code above connects the data source node directly to the endpoint. Since the data source node
-has only a single output bus, the index will always be 0. Likewise, the endpoint only has a single
-input bus which means the input bus index will also always be 0.
-
-To detach a specific output bus, use `ma_node_detach_output_bus()`. To detach all output buses, use
-`ma_node_detach_all_output_buses()`. If you want to just move the output bus from one attachment to
-another, you do not need to detach first. You can just call `ma_node_attach_output_bus()` and it'll
-deal with it for you.
-
-Less frequently you may want to create a specialized node. This will be a node where you implement
-your own processing callback to apply a custom effect of some kind. This is similar to initializing
-one of the stock node types, only this time you need to specify a pointer to a vtable containing a
-pointer to the processing function and the number of input and output buses. Example:
-
-    ```c
-    static void my_custom_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-    {
-        // Do some processing of ppFramesIn (one stream of audio data per input bus)
-        const float* pFramesIn_0 = ppFramesIn[0]; // Input bus @ index 0.
-        const float* pFramesIn_1 = ppFramesIn[1]; // Input bus @ index 1.
-        float* pFramesOut_0 = ppFramesOut[0];     // Output bus @ index 0.
-
-        // Do some processing. On input, `pFrameCountIn` will be the number of input frames in each
-        // buffer in `ppFramesIn` and `pFrameCountOut` will be the capacity of each of the buffers
-        // in `ppFramesOut`. On output, `pFrameCountIn` should be set to the number of input frames
-        // your node consumed and `pFrameCountOut` should be set the number of output frames that
-        // were produced.
-        //
-        // You should process as many frames as you can. If your effect consumes input frames at the
-        // same rate as output frames (always the case, unless you're doing resampling), you need
-        // only look at `ppFramesOut` and process that exact number of frames. If you're doing
-        // resampling, you'll need to be sure to set both `pFrameCountIn` and `pFrameCountOut`
-        // properly.
-    }
-
-    static ma_node_vtable my_custom_node_vtable =
-    {
-        my_custom_node_process_pcm_frames, // The function that will be called to process your custom node. This is where you'd implement your effect processing.
-        NULL,   // Optional. A callback for calculating the number of input frames that are required to process a specified number of output frames.
-        2,      // 2 input buses.
-        1,      // 1 output bus.
-        0       // Default flags.
-    };
-
-    ...
-
-    // Each bus needs to have a channel count specified. To do this you need to specify the channel
-    // counts in an array and then pass that into the node config.
-    ma_uint32 inputChannels[2];     // Equal in size to the number of input channels specified in the vtable.
-    ma_uint32 outputChannels[1];    // Equal in size to the number of output channels specified in the vtable.
-
-    inputChannels[0]  = channelsIn;
-    inputChannels[1]  = channelsIn;
-    outputChannels[0] = channelsOut;
-
-    ma_node_config nodeConfig = ma_node_config_init();
-    nodeConfig.vtable          = &my_custom_node_vtable;
-    nodeConfig.pInputChannels  = inputChannels;
-    nodeConfig.pOutputChannels = outputChannels;
-
-    ma_node_base node;
-    result = ma_node_init(&nodeGraph, &nodeConfig, NULL, &node);
-    if (result != MA_SUCCESS) {
-        // Failed to initialize node.
-    }
-    ```
-
-When initializing a custom node, as in the code above, you'll normally just place your vtable in
-static space. The number of input and output buses are specified as part of the vtable. If you need
-a variable number of buses on a per-node bases, the vtable should have the relevant bus count set
-to `MA_NODE_BUS_COUNT_UNKNOWN`. In this case, the bus count should be set in the node config:
-
-    ```c
-    static ma_node_vtable my_custom_node_vtable =
-    {
-        my_custom_node_process_pcm_frames, // The function that will be called process your custom node. This is where you'd implement your effect processing.
-        NULL,   // Optional. A callback for calculating the number of input frames that are required to process a specified number of output frames.
-        MA_NODE_BUS_COUNT_UNKNOWN,  // The number of input buses is determined on a per-node basis.
-        1,      // 1 output bus.
-        0       // Default flags.
-    };
-
-    ...
-
-    ma_node_config nodeConfig = ma_node_config_init();
-    nodeConfig.vtable          = &my_custom_node_vtable;
-    nodeConfig.inputBusCount   = myBusCount;        // <-- Since the vtable specifies MA_NODE_BUS_COUNT_UNKNOWN, the input bus count should be set here.
-    nodeConfig.pInputChannels  = inputChannels;     // <-- Make sure there are nodeConfig.inputBusCount elements in this array.
-    nodeConfig.pOutputChannels = outputChannels;    // <-- The vtable specifies 1 output bus, so there must be 1 element in this array.
-    ```
-
-In the above example it's important to never set the `inputBusCount` and `outputBusCount` members
-to anything other than their defaults if the vtable specifies an explicit count. They can only be
-set if the vtable specifies MA_NODE_BUS_COUNT_UNKNOWN in the relevant bus count.
-
-Most often you'll want to create a structure to encapsulate your node with some extra data. You
-need to make sure the `ma_node_base` object is your first member of the structure:
-
-    ```c
-    typedef struct
-    {
-        ma_node_base base; // <-- Make sure this is always the first member.
-        float someCustomData;
-    } my_custom_node;
-    ```
-
-By doing this, your object will be compatible with all `ma_node` APIs and you can attach it to the
-graph just like any other node.
-
-In the custom processing callback (`my_custom_node_process_pcm_frames()` in the example above), the
-number of channels for each bus is what was specified by the config when the node was initialized
-with `ma_node_init()`. In addition, all attachments to each of the input buses will have been
-pre-mixed by miniaudio. The config allows you to specify different channel counts for each
-individual input and output bus. It's up to the effect to handle it appropriate, and if it can't,
-return an error in it's initialization routine.
-
-Custom nodes can be assigned some flags to describe their behaviour. These are set via the vtable
-and include the following:
-
-    +-----------------------------------------+---------------------------------------------------+
-    | Flag Name                               | Description                                       |
-    +-----------------------------------------+---------------------------------------------------+
-    | MA_NODE_FLAG_PASSTHROUGH                | Useful for nodes that do not do any kind of audio |
-    |                                         | processing, but are instead used for tracking     |
-    |                                         | time, handling events, etc. Also used by the      |
-    |                                         | internal endpoint node. It reads directly from    |
-    |                                         | the input bus to the output bus. Nodes with this  |
-    |                                         | flag must have exactly 1 input bus and 1 output   |
-    |                                         | bus, and both buses must have the same channel    |
-    |                                         | counts.                                           |
-    +-----------------------------------------+---------------------------------------------------+
-    | MA_NODE_FLAG_CONTINUOUS_PROCESSING      | Causes the processing callback to be called even  |
-    |                                         | when no data is available to be read from input   |
-    |                                         | attachments. When a node has at least one input   |
-    |                                         | bus, but there are no inputs attached or the      |
-    |                                         | inputs do not deliver any data, the node's        |
-    |                                         | processing callback will not get fired. This flag |
-    |                                         | will make it so the callback is always fired      |
-    |                                         | regardless of whether or not any input data is    |
-    |                                         | received. This is useful for effects like         |
-    |                                         | echos where there will be a tail of audio data    |
-    |                                         | that still needs to be processed even when the    |
-    |                                         | original data sources have reached their ends. It |
-    |                                         | may also be useful for nodes that must always     |
-    |                                         | have their processing callback fired when there   |
-    |                                         | are no inputs attached.                           |
-    +-----------------------------------------+---------------------------------------------------+
-    | MA_NODE_FLAG_ALLOW_NULL_INPUT           | Used in conjunction with                          |
-    |                                         | `MA_NODE_FLAG_CONTINUOUS_PROCESSING`. When this   |
-    |                                         | is set, the `ppFramesIn` parameter of the         |
-    |                                         | processing callback will be set to NULL when      |
-    |                                         | there are no input frames are available. When     |
-    |                                         | this is unset, silence will be posted to the      |
-    |                                         | processing callback.                              |
-    +-----------------------------------------+---------------------------------------------------+
-    | MA_NODE_FLAG_DIFFERENT_PROCESSING_RATES | Used to tell miniaudio that input and output      |
-    |                                         | frames are processed at different rates. You      |
-    |                                         | should set this for any nodes that perform        |
-    |                                         | resampling.                                       |
-    +-----------------------------------------+---------------------------------------------------+
-    | MA_NODE_FLAG_SILENT_OUTPUT              | Used to tell miniaudio that a node produces only  |
-    |                                         | silent output. This is useful for nodes where you |
-    |                                         | don't want the output to contribute to the final  |
-    |                                         | mix. An example might be if you want split your   |
-    |                                         | stream and have one branch be output to a file.   |
-    |                                         | When using this flag, you should avoid writing to |
-    |                                         | the output buffer of the node's processing        |
-    |                                         | callback because miniaudio will ignore it anyway. |
-    +-----------------------------------------+---------------------------------------------------+
-
-
-If you need to make a copy of an audio stream for effect processing you can use a splitter node
-called `ma_splitter_node`. This takes has 1 input bus and splits the stream into 2 output buses.
-You can use it like this:
-
-    ```c
-    ma_splitter_node_config splitterNodeConfig = ma_splitter_node_config_init(channels);
-
-    ma_splitter_node splitterNode;
-    result = ma_splitter_node_init(&nodeGraph, &splitterNodeConfig, NULL, &splitterNode);
-    if (result != MA_SUCCESS) {
-        // Failed to create node.
-    }
-
-    // Attach your output buses to two different input buses (can be on two different nodes).
-    ma_node_attach_output_bus(&splitterNode, 0, ma_node_graph_get_endpoint(&nodeGraph), 0); // Attach directly to the endpoint.
-    ma_node_attach_output_bus(&splitterNode, 1, &myEffectNode,                          0); // Attach to input bus 0 of some effect node.
-    ```
-
-The volume of an output bus can be configured on a per-bus basis:
-
-    ```c
-    ma_node_set_output_bus_volume(&splitterNode, 0, 0.5f);
-    ma_node_set_output_bus_volume(&splitterNode, 1, 0.5f);
-    ```
-
-In the code above we're using the splitter node from before and changing the volume of each of the
-copied streams.
-
-You can start and stop a node with the following:
-
-    ```c
-    ma_node_set_state(&splitterNode, ma_node_state_started);    // The default state.
-    ma_node_set_state(&splitterNode, ma_node_state_stopped);
-    ```
-
-By default the node is in a started state, but since it won't be connected to anything won't
-actually be invoked by the node graph until it's connected. When you stop a node, data will not be
-read from any of it's input connections. You can use this property to stop a group of sounds
-atomically.
-
-You can configure the initial state of a node in it's config:
-
-    ```c
-    nodeConfig.initialState = ma_node_state_stopped;
-    ```
-
-Note that for the stock specialized nodes, all of their configs will have a `nodeConfig` member
-which is the config to use with the base node. This is where the initial state can be configured
-for specialized nodes:
-
-    ```c
-    dataSourceNodeConfig.nodeConfig.initialState = ma_node_state_stopped;
-    ```
-
-When using a specialized node like `ma_data_source_node` or `ma_splitter_node`, be sure to not
-modify the `vtable` member of the `nodeConfig` object.
-
-
-7.1. Timing
------------
-The node graph supports starting and stopping nodes at scheduled times. This is especially useful
-for data source nodes where you want to get the node set up, but only start playback at a specific
-time. There are two clocks: local and global.
-
-A local clock is per-node, whereas the global clock is per graph. Scheduling starts and stops can
-only be done based on the global clock because the local clock will not be running while the node
-is stopped. The global clocks advances whenever `ma_node_graph_read_pcm_frames()` is called. On the
-other hand, the local clock only advances when the node's processing callback is fired, and is
-advanced based on the output frame count.
-
-To retrieve the global time, use `ma_node_graph_get_time()`. The global time can be set with
-`ma_node_graph_set_time()` which might be useful if you want to do seeking on a global timeline.
-Getting and setting the local time is similar. Use `ma_node_get_time()` to retrieve the local time,
-and `ma_node_set_time()` to set the local time. The global and local times will be advanced by the
-audio thread, so care should be taken to avoid data races. Ideally you should avoid calling these
-outside of the node processing callbacks which are always run on the audio thread.
-
-There is basic support for scheduling the starting and stopping of nodes. You can only schedule one
-start and one stop at a time. This is mainly intended for putting nodes into a started or stopped
-state in a frame-exact manner. Without this mechanism, starting and stopping of a node is limited
-to the resolution of a call to `ma_node_graph_read_pcm_frames()` which would typically be in blocks
-of several milliseconds. The following APIs can be used for scheduling node states:
-
-    ```c
-    ma_node_set_state_time()
-    ma_node_get_state_time()
-    ```
-
-The time is absolute and must be based on the global clock. An example is below:
-
-    ```c
-    ma_node_set_state_time(&myNode, ma_node_state_started, sampleRate*1);   // Delay starting to 1 second.
-    ma_node_set_state_time(&myNode, ma_node_state_stopped, sampleRate*5);   // Delay stopping to 5 seconds.
-    ```
-
-An example for changing the state using a relative time.
-
-    ```c
-    ma_node_set_state_time(&myNode, ma_node_state_started, sampleRate*1 + ma_node_graph_get_time(&myNodeGraph));
-    ma_node_set_state_time(&myNode, ma_node_state_stopped, sampleRate*5 + ma_node_graph_get_time(&myNodeGraph));
-    ```
-
-Note that due to the nature of multi-threading the times may not be 100% exact. If this is an
-issue, consider scheduling state changes from within a processing callback. An idea might be to
-have some kind of passthrough trigger node that is used specifically for tracking time and handling
-events.
-
-
-
-7.2. Thread Safety and Locking
-------------------------------
-When processing audio, it's ideal not to have any kind of locking in the audio thread. Since it's
-expected that `ma_node_graph_read_pcm_frames()` would be run on the audio thread, it does so
-without the use of any locks. This section discusses the implementation used by miniaudio and goes
-over some of the compromises employed by miniaudio to achieve this goal. Note that the current
-implementation may not be ideal - feedback and critiques are most welcome.
-
-The node graph API is not *entirely* lock-free. Only `ma_node_graph_read_pcm_frames()` is expected
-to be lock-free. Attachment, detachment and uninitialization of nodes use locks to simplify the
-implementation, but are crafted in a way such that such locking is not required when reading audio
-data from the graph. Locking in these areas are achieved by means of spinlocks.
-
-The main complication with keeping `ma_node_graph_read_pcm_frames()` lock-free stems from the fact
-that a node can be uninitialized, and it's memory potentially freed, while in the middle of being
-processed on the audio thread. There are times when the audio thread will be referencing a node,
-which means the uninitialization process of a node needs to make sure it delays returning until the
-audio thread is finished so that control is not handed back to the caller thereby giving them a
-chance to free the node's memory.
-
-When the audio thread is processing a node, it does so by reading from each of the output buses of
-the node. In order for a node to process data for one of it's output buses, it needs to read from
-each of it's input buses, and so on an so forth. It follows that once all output buses of a node
-are detached, the node as a whole will be disconnected and no further processing will occur unless
-it's output buses are reattached, which won't be happening when the node is being uninitialized.
-By having `ma_node_detach_output_bus()` wait until the audio thread is finished with it, we can
-simplify a few things, at the expense of making `ma_node_detach_output_bus()` a bit slower. By
-doing this, the implementation of `ma_node_uninit()` becomes trivial - just detach all output
-nodes, followed by each of the attachments to each of it's input nodes, and then do any final clean
-up.
-
-With the above design, the worst-case scenario is `ma_node_detach_output_bus()` taking as long as
-it takes to process the output bus being detached. This will happen if it's called at just the
-wrong moment where the audio thread has just iterated it and has just started processing. The
-caller of `ma_node_detach_output_bus()` will stall until the audio thread is finished, which
-includes the cost of recursively processing it's inputs. This is the biggest compromise made with
-the approach taken by miniaudio for it's lock-free processing system. The cost of detaching nodes
-earlier in the pipeline (data sources, for example) will be cheaper than the cost of detaching
-higher level nodes, such as some kind of final post-processing endpoint. If you need to do mass
-detachments, detach starting from the lowest level nodes and work your way towards the final
-endpoint node (but don't try detaching the node graph's endpoint). If the audio thread is not
-running, detachment will be fast and detachment in any order will be the same. The reason nodes
-need to wait for their input attachments to complete is due to the potential for desyncs between
-data sources. If the node was to terminate processing mid way through processing it's inputs,
-there's a chance that some of the underlying data sources will have been read, but then others not.
-That will then result in a potential desynchronization when detaching and reattaching higher-level
-nodes. A possible solution to this is to have an option when detaching to terminate processing
-before processing all input attachments which should be fairly simple.
-
-Another compromise, albeit less significant, is locking when attaching and detaching nodes. This
-locking is achieved by means of a spinlock in order to reduce memory overhead. A lock is present
-for each input bus and output bus. When an output bus is connected to an input bus, both the output
-bus and input bus is locked. This locking is specifically for attaching and detaching across
-different threads and does not affect `ma_node_graph_read_pcm_frames()` in any way. The locking and
-unlocking is mostly self-explanatory, but a slightly less intuitive aspect comes into it when
-considering that iterating over attachments must not break as a result of attaching or detaching a
-node while iteration is occurring.
-
-Attaching and detaching are both quite simple. When an output bus of a node is attached to an input
-bus of another node, it's added to a linked list. Basically, an input bus is a linked list, where
-each item in the list is and output bus. We have some intentional (and convenient) restrictions on
-what can done with the linked list in order to simplify the implementation. First of all, whenever
-something needs to iterate over the list, it must do so in a forward direction. Backwards iteration
-is not supported. Also, items can only be added to the start of the list.
-
-The linked list is a doubly-linked list where each item in the list (an output bus) holds a pointer
-to the next item in the list, and another to the previous item. A pointer to the previous item is
-only required for fast detachment of the node - it is never used in iteration. This is an
-important property because it means from the perspective of iteration, attaching and detaching of
-an item can be done with a single atomic assignment. This is exploited by both the attachment and
-detachment process. When attaching the node, the first thing that is done is the setting of the
-local "next" and "previous" pointers of the node. After that, the item is "attached" to the list
-by simply performing an atomic exchange with the head pointer. After that, the node is "attached"
-to the list from the perspective of iteration. Even though the "previous" pointer of the next item
-hasn't yet been set, from the perspective of iteration it's been attached because iteration will
-only be happening in a forward direction which means the "previous" pointer won't actually ever get
-used. The same general process applies to detachment. See `ma_node_attach_output_bus()` and
-`ma_node_detach_output_bus()` for the implementation of this mechanism.
-
-
-
-8. Decoding
-===========
-The `ma_decoder` API is used for reading audio files. Decoders are completely decoupled from
-devices and can be used independently. Built-in support is included for the following formats:
-
-    +---------+
-    | Format  |
-    +---------+
-    | WAV     |
-    | MP3     |
-    | FLAC    |
-    +---------+
-
-You can disable the built-in decoders by specifying one or more of the following options before the
-miniaudio implementation:
-
-    ```c
-    #define MA_NO_WAV
-    #define MA_NO_MP3
-    #define MA_NO_FLAC
-    ```
-
-miniaudio supports the ability to plug in custom decoders. See the section below for details on how
-to use custom decoders.
-
-A decoder can be initialized from a file with `ma_decoder_init_file()`, a block of memory with
-`ma_decoder_init_memory()`, or from data delivered via callbacks with `ma_decoder_init()`. Here is
-an example for loading a decoder from a file:
-
-    ```c
-    ma_decoder decoder;
-    ma_result result = ma_decoder_init_file("MySong.mp3", NULL, &decoder);
-    if (result != MA_SUCCESS) {
-        return false;   // An error occurred.
-    }
-
-    ...
-
-    ma_decoder_uninit(&decoder);
-    ```
-
-When initializing a decoder, you can optionally pass in a pointer to a `ma_decoder_config` object
-(the `NULL` argument in the example above) which allows you to configure the output format, channel
-count, sample rate and channel map:
-
-    ```c
-    ma_decoder_config config = ma_decoder_config_init(ma_format_f32, 2, 48000);
-    ```
-
-When passing in `NULL` for decoder config in `ma_decoder_init*()`, the output format will be the
-same as that defined by the decoding backend.
-
-Data is read from the decoder as PCM frames. This will output the number of PCM frames actually
-read. If this is less than the requested number of PCM frames it means you've reached the end. The
-return value will be `MA_AT_END` if no samples have been read and the end has been reached.
-
-    ```c
-    ma_result result = ma_decoder_read_pcm_frames(pDecoder, pFrames, framesToRead, &framesRead);
-    if (framesRead < framesToRead) {
-        // Reached the end.
-    }
-    ```
-
-You can also seek to a specific frame like so:
-
-    ```c
-    ma_result result = ma_decoder_seek_to_pcm_frame(pDecoder, targetFrame);
-    if (result != MA_SUCCESS) {
-        return false;   // An error occurred.
-    }
-    ```
-
-If you want to loop back to the start, you can simply seek back to the first PCM frame:
-
-    ```c
-    ma_decoder_seek_to_pcm_frame(pDecoder, 0);
-    ```
-
-When loading a decoder, miniaudio uses a trial and error technique to find the appropriate decoding
-backend. This can be unnecessarily inefficient if the type is already known. In this case you can
-use `encodingFormat` variable in the device config to specify a specific encoding format you want
-to decode:
-
-    ```c
-    decoderConfig.encodingFormat = ma_encoding_format_wav;
-    ```
-
-See the `ma_encoding_format` enum for possible encoding formats.
-
-The `ma_decoder_init_file()` API will try using the file extension to determine which decoding
-backend to prefer.
-
-
-8.1. Custom Decoders
---------------------
-It's possible to implement a custom decoder and plug it into miniaudio. This is extremely useful
-when you want to use the `ma_decoder` API, but need to support an encoding format that's not one of
-the stock formats supported by miniaudio. This can be put to particularly good use when using the
-`ma_engine` and/or `ma_resource_manager` APIs because they use `ma_decoder` internally. If, for
-example, you wanted to support Opus, you can do so with a custom decoder (there if a reference
-Opus decoder in the "extras" folder of the miniaudio repository which uses libopus + libopusfile).
-
-A custom decoder must implement a data source. A vtable called `ma_decoding_backend_vtable` needs
-to be implemented which is then passed into the decoder config:
-
-    ```c
-    ma_decoding_backend_vtable* pCustomBackendVTables[] =
-    {
-        &g_ma_decoding_backend_vtable_libvorbis,
-        &g_ma_decoding_backend_vtable_libopus
-    };
-
-    ...
-
-    decoderConfig = ma_decoder_config_init_default();
-    decoderConfig.pCustomBackendUserData = NULL;
-    decoderConfig.ppCustomBackendVTables = pCustomBackendVTables;
-    decoderConfig.customBackendCount     = sizeof(pCustomBackendVTables) / sizeof(pCustomBackendVTables[0]);
-    ```
-
-The `ma_decoding_backend_vtable` vtable has the following functions:
-
-    ```
-    onInit
-    onInitFile
-    onInitFileW
-    onInitMemory
-    onUninit
-    ```
-
-There are only two functions that must be implemented - `onInit` and `onUninit`. The other
-functions can be implemented for a small optimization for loading from a file path or memory. If
-these are not specified, miniaudio will deal with it for you via a generic implementation.
-
-When you initialize a custom data source (by implementing the `onInit` function in the vtable) you
-will need to output a pointer to a `ma_data_source` which implements your custom decoder. See the
-section about data sources for details on how to implement this. Alternatively, see the
-"custom_decoders" example in the miniaudio repository.
-
-The `onInit` function takes a pointer to some callbacks for the purpose of reading raw audio data
-from some arbitrary source. You'll use these functions to read from the raw data and perform the
-decoding. When you call them, you will pass in the `pReadSeekTellUserData` pointer to the relevant
-parameter.
-
-The `pConfig` parameter in `onInit` can be used to configure the backend if appropriate. It's only
-used as a hint and can be ignored. However, if any of the properties are relevant to your decoder,
-an optimal implementation will handle the relevant properties appropriately.
-
-If memory allocation is required, it should be done so via the specified allocation callbacks if
-possible (the `pAllocationCallbacks` parameter).
-
-If an error occurs when initializing the decoder, you should leave `ppBackend` unset, or set to
-NULL, and make sure everything is cleaned up appropriately and an appropriate result code returned.
-When multiple custom backends are specified, miniaudio will cycle through the vtables in the order
-they're listed in the array that's passed into the decoder config so it's important that your
-initialization routine is clean.
-
-When a decoder is uninitialized, the `onUninit` callback will be fired which will give you an
-opportunity to clean up and internal data.
-
-
-
-9. Encoding
-===========
-The `ma_encoding` API is used for writing audio files. The only supported output format is WAV.
-This can be disabled by specifying the following option before the implementation of miniaudio:
-
-    ```c
-    #define MA_NO_WAV
-    ```
-
-An encoder can be initialized to write to a file with `ma_encoder_init_file()` or from data
-delivered via callbacks with `ma_encoder_init()`. Below is an example for initializing an encoder
-to output to a file.
-
-    ```c
-    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, FORMAT, CHANNELS, SAMPLE_RATE);
-    ma_encoder encoder;
-    ma_result result = ma_encoder_init_file("my_file.wav", &config, &encoder);
-    if (result != MA_SUCCESS) {
-        // Error
-    }
-
-    ...
-
-    ma_encoder_uninit(&encoder);
-    ```
-
-When initializing an encoder you must specify a config which is initialized with
-`ma_encoder_config_init()`. Here you must specify the file type, the output sample format, output
-channel count and output sample rate. The following file types are supported:
-
-    +------------------------+-------------+
-    | Enum                   | Description |
-    +------------------------+-------------+
-    | ma_encoding_format_wav | WAV         |
-    +------------------------+-------------+
-
-If the format, channel count or sample rate is not supported by the output file type an error will
-be returned. The encoder will not perform data conversion so you will need to convert it before
-outputting any audio data. To output audio data, use `ma_encoder_write_pcm_frames()`, like in the
-example below:
-
-    ```c
-    ma_uint64 framesWritten;
-    result = ma_encoder_write_pcm_frames(&encoder, pPCMFramesToWrite, framesToWrite, &framesWritten);
-    if (result != MA_SUCCESS) {
-        ... handle error ...
-    }
-    ```
-
-The `framesWritten` variable will contain the number of PCM frames that were actually written. This
-is optionally and you can pass in `NULL` if you need this.
-
-Encoders must be uninitialized with `ma_encoder_uninit()`.
-
-
-
-10. Data Conversion
-===================
-A data conversion API is included with miniaudio which supports the majority of data conversion
-requirements. This supports conversion between sample formats, channel counts (with channel
-mapping) and sample rates.
-
-
-10.1. Sample Format Conversion
-------------------------------
-Conversion between sample formats is achieved with the `ma_pcm_*_to_*()`, `ma_pcm_convert()` and
-`ma_convert_pcm_frames_format()` APIs. Use `ma_pcm_*_to_*()` to convert between two specific
-formats. Use `ma_pcm_convert()` to convert based on a `ma_format` variable. Use
-`ma_convert_pcm_frames_format()` to convert PCM frames where you want to specify the frame count
-and channel count as a variable instead of the total sample count.
-
-
-10.1.1. Dithering
------------------
-Dithering can be set using the ditherMode parameter.
-
-The different dithering modes include the following, in order of efficiency:
-
-    +-----------+--------------------------+
-    | Type      | Enum Token               |
-    +-----------+--------------------------+
-    | None      | ma_dither_mode_none      |
-    | Rectangle | ma_dither_mode_rectangle |
-    | Triangle  | ma_dither_mode_triangle  |
-    +-----------+--------------------------+
-
-Note that even if the dither mode is set to something other than `ma_dither_mode_none`, it will be
-ignored for conversions where dithering is not needed. Dithering is available for the following
-conversions:
-
-    ```
-    s16 -> u8
-    s24 -> u8
-    s32 -> u8
-    f32 -> u8
-    s24 -> s16
-    s32 -> s16
-    f32 -> s16
-    ```
-
-Note that it is not an error to pass something other than ma_dither_mode_none for conversions where
-dither is not used. It will just be ignored.
-
-
-
-10.2. Channel Conversion
-------------------------
-Channel conversion is used for channel rearrangement and conversion from one channel count to
-another. The `ma_channel_converter` API is used for channel conversion. Below is an example of
-initializing a simple channel converter which converts from mono to stereo.
-
-    ```c
-    ma_channel_converter_config config = ma_channel_converter_config_init(
-        ma_format,                      // Sample format
-        1,                              // Input channels
-        NULL,                           // Input channel map
-        2,                              // Output channels
-        NULL,                           // Output channel map
-        ma_channel_mix_mode_default);   // The mixing algorithm to use when combining channels.
-
-    result = ma_channel_converter_init(&config, NULL, &converter);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-    ```
-
-To perform the conversion simply call `ma_channel_converter_process_pcm_frames()` like so:
-
-    ```c
-    ma_result result = ma_channel_converter_process_pcm_frames(&converter, pFramesOut, pFramesIn, frameCount);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-    ```
-
-It is up to the caller to ensure the output buffer is large enough to accommodate the new PCM
-frames.
-
-Input and output PCM frames are always interleaved. Deinterleaved layouts are not supported.
-
-
-10.2.1. Channel Mapping
------------------------
-In addition to converting from one channel count to another, like the example above, the channel
-converter can also be used to rearrange channels. When initializing the channel converter, you can
-optionally pass in channel maps for both the input and output frames. If the channel counts are the
-same, and each channel map contains the same channel positions with the exception that they're in
-a different order, a simple shuffling of the channels will be performed. If, however, there is not
-a 1:1 mapping of channel positions, or the channel counts differ, the input channels will be mixed
-based on a mixing mode which is specified when initializing the `ma_channel_converter_config`
-object.
-
-When converting from mono to multi-channel, the mono channel is simply copied to each output
-channel. When going the other way around, the audio of each output channel is simply averaged and
-copied to the mono channel.
-
-In more complicated cases blending is used. The `ma_channel_mix_mode_simple` mode will drop excess
-channels and silence extra channels. For example, converting from 4 to 2 channels, the 3rd and 4th
-channels will be dropped, whereas converting from 2 to 4 channels will put silence into the 3rd and
-4th channels.
-
-The `ma_channel_mix_mode_rectangle` mode uses spacial locality based on a rectangle to compute a
-simple distribution between input and output. Imagine sitting in the middle of a room, with
-speakers on the walls representing channel positions. The `MA_CHANNEL_FRONT_LEFT` position can be
-thought of as being in the corner of the front and left walls.
-
-Finally, the `ma_channel_mix_mode_custom_weights` mode can be used to use custom user-defined
-weights. Custom weights can be passed in as the last parameter of
-`ma_channel_converter_config_init()`.
-
-Predefined channel maps can be retrieved with `ma_channel_map_init_standard()`. This takes a
-`ma_standard_channel_map` enum as it's first parameter, which can be one of the following:
-
-    +-----------------------------------+-----------------------------------------------------------+
-    | Name                              | Description                                               |
-    +-----------------------------------+-----------------------------------------------------------+
-    | ma_standard_channel_map_default   | Default channel map used by miniaudio. See below.         |
-    | ma_standard_channel_map_microsoft | Channel map used by Microsoft's bitfield channel maps.    |
-    | ma_standard_channel_map_alsa      | Default ALSA channel map.                                 |
-    | ma_standard_channel_map_rfc3551   | RFC 3551. Based on AIFF.                                  |
-    | ma_standard_channel_map_flac      | FLAC channel map.                                         |
-    | ma_standard_channel_map_vorbis    | Vorbis channel map.                                       |
-    | ma_standard_channel_map_sound4    | FreeBSD's sound(4).                                       |
-    | ma_standard_channel_map_sndio     | sndio channel map. http://www.sndio.org/tips.html.        |
-    | ma_standard_channel_map_webaudio  | https://webaudio.github.io/web-audio-api/#ChannelOrdering |
-    +-----------------------------------+-----------------------------------------------------------+
-
-Below are the channel maps used by default in miniaudio (`ma_standard_channel_map_default`):
-
-    +---------------+---------------------------------+
-    | Channel Count | Mapping                         |
-    +---------------+---------------------------------+
-    | 1 (Mono)      | 0: MA_CHANNEL_MONO              |
-    +---------------+---------------------------------+
-    | 2 (Stereo)    | 0: MA_CHANNEL_FRONT_LEFT   <br> |
-    |               | 1: MA_CHANNEL_FRONT_RIGHT       |
-    +---------------+---------------------------------+
-    | 3             | 0: MA_CHANNEL_FRONT_LEFT   <br> |
-    |               | 1: MA_CHANNEL_FRONT_RIGHT  <br> |
-    |               | 2: MA_CHANNEL_FRONT_CENTER      |
-    +---------------+---------------------------------+
-    | 4 (Surround)  | 0: MA_CHANNEL_FRONT_LEFT   <br> |
-    |               | 1: MA_CHANNEL_FRONT_RIGHT  <br> |
-    |               | 2: MA_CHANNEL_FRONT_CENTER <br> |
-    |               | 3: MA_CHANNEL_BACK_CENTER       |
-    +---------------+---------------------------------+
-    | 5             | 0: MA_CHANNEL_FRONT_LEFT   <br> |
-    |               | 1: MA_CHANNEL_FRONT_RIGHT  <br> |
-    |               | 2: MA_CHANNEL_FRONT_CENTER <br> |
-    |               | 3: MA_CHANNEL_BACK_LEFT    <br> |
-    |               | 4: MA_CHANNEL_BACK_RIGHT        |
-    +---------------+---------------------------------+
-    | 6 (5.1)       | 0: MA_CHANNEL_FRONT_LEFT   <br> |
-    |               | 1: MA_CHANNEL_FRONT_RIGHT  <br> |
-    |               | 2: MA_CHANNEL_FRONT_CENTER <br> |
-    |               | 3: MA_CHANNEL_LFE          <br> |
-    |               | 4: MA_CHANNEL_SIDE_LEFT    <br> |
-    |               | 5: MA_CHANNEL_SIDE_RIGHT        |
-    +---------------+---------------------------------+
-    | 7             | 0: MA_CHANNEL_FRONT_LEFT   <br> |
-    |               | 1: MA_CHANNEL_FRONT_RIGHT  <br> |
-    |               | 2: MA_CHANNEL_FRONT_CENTER <br> |
-    |               | 3: MA_CHANNEL_LFE          <br> |
-    |               | 4: MA_CHANNEL_BACK_CENTER  <br> |
-    |               | 4: MA_CHANNEL_SIDE_LEFT    <br> |
-    |               | 5: MA_CHANNEL_SIDE_RIGHT        |
-    +---------------+---------------------------------+
-    | 8 (7.1)       | 0: MA_CHANNEL_FRONT_LEFT   <br> |
-    |               | 1: MA_CHANNEL_FRONT_RIGHT  <br> |
-    |               | 2: MA_CHANNEL_FRONT_CENTER <br> |
-    |               | 3: MA_CHANNEL_LFE          <br> |
-    |               | 4: MA_CHANNEL_BACK_LEFT    <br> |
-    |               | 5: MA_CHANNEL_BACK_RIGHT   <br> |
-    |               | 6: MA_CHANNEL_SIDE_LEFT    <br> |
-    |               | 7: MA_CHANNEL_SIDE_RIGHT        |
-    +---------------+---------------------------------+
-    | Other         | All channels set to 0. This     |
-    |               | is equivalent to the same       |
-    |               | mapping as the device.          |
-    +---------------+---------------------------------+
-
-
-
-10.3. Resampling
-----------------
-Resampling is achieved with the `ma_resampler` object. To create a resampler object, do something
-like the following:
-
-    ```c
-    ma_resampler_config config = ma_resampler_config_init(
-        ma_format_s16,
-        channels,
-        sampleRateIn,
-        sampleRateOut,
-        ma_resample_algorithm_linear);
-
-    ma_resampler resampler;
-    ma_result result = ma_resampler_init(&config, &resampler);
-    if (result != MA_SUCCESS) {
-        // An error occurred...
-    }
-    ```
-
-Do the following to uninitialize the resampler:
-
-    ```c
-    ma_resampler_uninit(&resampler);
-    ```
-
-The following example shows how data can be processed
-
-    ```c
-    ma_uint64 frameCountIn  = 1000;
-    ma_uint64 frameCountOut = 2000;
-    ma_result result = ma_resampler_process_pcm_frames(&resampler, pFramesIn, &frameCountIn, pFramesOut, &frameCountOut);
-    if (result != MA_SUCCESS) {
-        // An error occurred...
-    }
-
-    // At this point, frameCountIn contains the number of input frames that were consumed and frameCountOut contains the
-    // number of output frames written.
-    ```
-
-To initialize the resampler you first need to set up a config (`ma_resampler_config`) with
-`ma_resampler_config_init()`. You need to specify the sample format you want to use, the number of
-channels, the input and output sample rate, and the algorithm.
-
-The sample format can be either `ma_format_s16` or `ma_format_f32`. If you need a different format
-you will need to perform pre- and post-conversions yourself where necessary. Note that the format
-is the same for both input and output. The format cannot be changed after initialization.
-
-The resampler supports multiple channels and is always interleaved (both input and output). The
-channel count cannot be changed after initialization.
-
-The sample rates can be anything other than zero, and are always specified in hertz. They should be
-set to something like 44100, etc. The sample rate is the only configuration property that can be
-changed after initialization.
-
-The miniaudio resampler has built-in support for the following algorithms:
-
-    +-----------+------------------------------+
-    | Algorithm | Enum Token                   |
-    +-----------+------------------------------+
-    | Linear    | ma_resample_algorithm_linear |
-    | Custom    | ma_resample_algorithm_custom |
-    +-----------+------------------------------+
-
-The algorithm cannot be changed after initialization.
-
-Processing always happens on a per PCM frame basis and always assumes interleaved input and output.
-De-interleaved processing is not supported. To process frames, use
-`ma_resampler_process_pcm_frames()`. On input, this function takes the number of output frames you
-can fit in the output buffer and the number of input frames contained in the input buffer. On
-output these variables contain the number of output frames that were written to the output buffer
-and the number of input frames that were consumed in the process. You can pass in NULL for the
-input buffer in which case it will be treated as an infinitely large buffer of zeros. The output
-buffer can also be NULL, in which case the processing will be treated as seek.
-
-The sample rate can be changed dynamically on the fly. You can change this with explicit sample
-rates with `ma_resampler_set_rate()` and also with a decimal ratio with
-`ma_resampler_set_rate_ratio()`. The ratio is in/out.
-
-Sometimes it's useful to know exactly how many input frames will be required to output a specific
-number of frames. You can calculate this with `ma_resampler_get_required_input_frame_count()`.
-Likewise, it's sometimes useful to know exactly how many frames would be output given a certain
-number of input frames. You can do this with `ma_resampler_get_expected_output_frame_count()`.
-
-Due to the nature of how resampling works, the resampler introduces some latency. This can be
-retrieved in terms of both the input rate and the output rate with
-`ma_resampler_get_input_latency()` and `ma_resampler_get_output_latency()`.
-
-
-10.3.1. Resampling Algorithms
------------------------------
-The choice of resampling algorithm depends on your situation and requirements.
-
-
-10.3.1.1. Linear Resampling
----------------------------
-The linear resampler is the fastest, but comes at the expense of poorer quality. There is, however,
-some control over the quality of the linear resampler which may make it a suitable option depending
-on your requirements.
-
-The linear resampler performs low-pass filtering before or after downsampling or upsampling,
-depending on the sample rates you're converting between. When decreasing the sample rate, the
-low-pass filter will be applied before downsampling. When increasing the rate it will be performed
-after upsampling. By default a fourth order low-pass filter will be applied. This can be configured
-via the `lpfOrder` configuration variable. Setting this to 0 will disable filtering.
-
-The low-pass filter has a cutoff frequency which defaults to half the sample rate of the lowest of
-the input and output sample rates (Nyquist Frequency).
-
-The API for the linear resampler is the same as the main resampler API, only it's called
-`ma_linear_resampler`.
-
-
-10.3.2. Custom Resamplers
--------------------------
-You can implement a custom resampler by using the `ma_resample_algorithm_custom` resampling
-algorithm and setting a vtable in the resampler config:
-
-    ```c
-    ma_resampler_config config = ma_resampler_config_init(..., ma_resample_algorithm_custom);
-    config.pBackendVTable = &g_customResamplerVTable;
-    ```
-
-Custom resamplers are useful if the stock algorithms are not appropriate for your use case. You
-need to implement the required functions in `ma_resampling_backend_vtable`. Note that not all
-functions in the vtable need to be implemented, but if it's possible to implement, they should be.
-
-You can use the `ma_linear_resampler` object for an example on how to implement the vtable. The
-`onGetHeapSize` callback is used to calculate the size of any internal heap allocation the custom
-resampler will need to make given the supplied config. When you initialize the resampler via the
-`onInit` callback, you'll be given a pointer to a heap allocation which is where you should store
-the heap allocated data. You should not free this data in `onUninit` because miniaudio will manage
-it for you.
-
-The `onProcess` callback is where the actual resampling takes place. On input, `pFrameCountIn`
-points to a variable containing the number of frames in the `pFramesIn` buffer and
-`pFrameCountOut` points to a variable containing the capacity in frames of the `pFramesOut` buffer.
-On output, `pFrameCountIn` should be set to the number of input frames that were fully consumed,
-whereas `pFrameCountOut` should be set to the number of frames that were written to `pFramesOut`.
-
-The `onSetRate` callback is optional and is used for dynamically changing the sample rate. If
-dynamic rate changes are not supported, you can set this callback to NULL.
-
-The `onGetInputLatency` and `onGetOutputLatency` functions are used for retrieving the latency in
-input and output rates respectively. These can be NULL in which case latency calculations will be
-assumed to be NULL.
-
-The `onGetRequiredInputFrameCount` callback is used to give miniaudio a hint as to how many input
-frames are required to be available to produce the given number of output frames. Likewise, the
-`onGetExpectedOutputFrameCount` callback is used to determine how many output frames will be
-produced given the specified number of input frames. miniaudio will use these as a hint, but they
-are optional and can be set to NULL if you're unable to implement them.
-
-
-
-10.4. General Data Conversion
------------------------------
-The `ma_data_converter` API can be used to wrap sample format conversion, channel conversion and
-resampling into one operation. This is what miniaudio uses internally to convert between the format
-requested when the device was initialized and the format of the backend's native device. The API
-for general data conversion is very similar to the resampling API. Create a `ma_data_converter`
-object like this:
-
-    ```c
-    ma_data_converter_config config = ma_data_converter_config_init(
-        inputFormat,
-        outputFormat,
-        inputChannels,
-        outputChannels,
-        inputSampleRate,
-        outputSampleRate
-    );
-
-    ma_data_converter converter;
-    ma_result result = ma_data_converter_init(&config, NULL, &converter);
-    if (result != MA_SUCCESS) {
-        // An error occurred...
-    }
-    ```
-
-In the example above we use `ma_data_converter_config_init()` to initialize the config, however
-there's many more properties that can be configured, such as channel maps and resampling quality.
-Something like the following may be more suitable depending on your requirements:
-
-    ```c
-    ma_data_converter_config config = ma_data_converter_config_init_default();
-    config.formatIn = inputFormat;
-    config.formatOut = outputFormat;
-    config.channelsIn = inputChannels;
-    config.channelsOut = outputChannels;
-    config.sampleRateIn = inputSampleRate;
-    config.sampleRateOut = outputSampleRate;
-    ma_channel_map_init_standard(ma_standard_channel_map_flac, config.channelMapIn, sizeof(config.channelMapIn)/sizeof(config.channelMapIn[0]), config.channelCountIn);
-    config.resampling.linear.lpfOrder = MA_MAX_FILTER_ORDER;
-    ```
-
-Do the following to uninitialize the data converter:
-
-    ```c
-    ma_data_converter_uninit(&converter, NULL);
-    ```
-
-The following example shows how data can be processed
-
-    ```c
-    ma_uint64 frameCountIn  = 1000;
-    ma_uint64 frameCountOut = 2000;
-    ma_result result = ma_data_converter_process_pcm_frames(&converter, pFramesIn, &frameCountIn, pFramesOut, &frameCountOut);
-    if (result != MA_SUCCESS) {
-        // An error occurred...
-    }
-
-    // At this point, frameCountIn contains the number of input frames that were consumed and frameCountOut contains the number
-    // of output frames written.
-    ```
-
-The data converter supports multiple channels and is always interleaved (both input and output).
-The channel count cannot be changed after initialization.
-
-Sample rates can be anything other than zero, and are always specified in hertz. They should be set
-to something like 44100, etc. The sample rate is the only configuration property that can be
-changed after initialization, but only if the `resampling.allowDynamicSampleRate` member of
-`ma_data_converter_config` is set to `MA_TRUE`. To change the sample rate, use
-`ma_data_converter_set_rate()` or `ma_data_converter_set_rate_ratio()`. The ratio must be in/out.
-The resampling algorithm cannot be changed after initialization.
-
-Processing always happens on a per PCM frame basis and always assumes interleaved input and output.
-De-interleaved processing is not supported. To process frames, use
-`ma_data_converter_process_pcm_frames()`. On input, this function takes the number of output frames
-you can fit in the output buffer and the number of input frames contained in the input buffer. On
-output these variables contain the number of output frames that were written to the output buffer
-and the number of input frames that were consumed in the process. You can pass in NULL for the
-input buffer in which case it will be treated as an infinitely large
-buffer of zeros. The output buffer can also be NULL, in which case the processing will be treated
-as seek.
-
-Sometimes it's useful to know exactly how many input frames will be required to output a specific
-number of frames. You can calculate this with `ma_data_converter_get_required_input_frame_count()`.
-Likewise, it's sometimes useful to know exactly how many frames would be output given a certain
-number of input frames. You can do this with `ma_data_converter_get_expected_output_frame_count()`.
-
-Due to the nature of how resampling works, the data converter introduces some latency if resampling
-is required. This can be retrieved in terms of both the input rate and the output rate with
-`ma_data_converter_get_input_latency()` and `ma_data_converter_get_output_latency()`.
-
-
-
-11. Filtering
-=============
-
-11.1. Biquad Filtering
-----------------------
-Biquad filtering is achieved with the `ma_biquad` API. Example:
-
-    ```c
-    ma_biquad_config config = ma_biquad_config_init(ma_format_f32, channels, b0, b1, b2, a0, a1, a2);
-    ma_result result = ma_biquad_init(&config, &biquad);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-
-    ...
-
-    ma_biquad_process_pcm_frames(&biquad, pFramesOut, pFramesIn, frameCount);
-    ```
-
-Biquad filtering is implemented using transposed direct form 2. The numerator coefficients are b0,
-b1 and b2, and the denominator coefficients are a0, a1 and a2. The a0 coefficient is required and
-coefficients must not be pre-normalized.
-
-Supported formats are `ma_format_s16` and `ma_format_f32`. If you need to use a different format
-you need to convert it yourself beforehand. When using `ma_format_s16` the biquad filter will use
-fixed point arithmetic. When using `ma_format_f32`, floating point arithmetic will be used.
-
-Input and output frames are always interleaved.
-
-Filtering can be applied in-place by passing in the same pointer for both the input and output
-buffers, like so:
-
-    ```c
-    ma_biquad_process_pcm_frames(&biquad, pMyData, pMyData, frameCount);
-    ```
-
-If you need to change the values of the coefficients, but maintain the values in the registers you
-can do so with `ma_biquad_reinit()`. This is useful if you need to change the properties of the
-filter while keeping the values of registers valid to avoid glitching. Do not use
-`ma_biquad_init()` for this as it will do a full initialization which involves clearing the
-registers to 0. Note that changing the format or channel count after initialization is invalid and
-will result in an error.
-
-
-11.2. Low-Pass Filtering
-------------------------
-Low-pass filtering is achieved with the following APIs:
-
-    +---------+------------------------------------------+
-    | API     | Description                              |
-    +---------+------------------------------------------+
-    | ma_lpf1 | First order low-pass filter              |
-    | ma_lpf2 | Second order low-pass filter             |
-    | ma_lpf  | High order low-pass filter (Butterworth) |
-    +---------+------------------------------------------+
-
-Low-pass filter example:
-
-    ```c
-    ma_lpf_config config = ma_lpf_config_init(ma_format_f32, channels, sampleRate, cutoffFrequency, order);
-    ma_result result = ma_lpf_init(&config, &lpf);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-
-    ...
-
-    ma_lpf_process_pcm_frames(&lpf, pFramesOut, pFramesIn, frameCount);
-    ```
-
-Supported formats are `ma_format_s16` and` ma_format_f32`. If you need to use a different format
-you need to convert it yourself beforehand. Input and output frames are always interleaved.
-
-Filtering can be applied in-place by passing in the same pointer for both the input and output
-buffers, like so:
-
-    ```c
-    ma_lpf_process_pcm_frames(&lpf, pMyData, pMyData, frameCount);
-    ```
-
-The maximum filter order is limited to `MA_MAX_FILTER_ORDER` which is set to 8. If you need more,
-you can chain first and second order filters together.
-
-    ```c
-    for (iFilter = 0; iFilter < filterCount; iFilter += 1) {
-        ma_lpf2_process_pcm_frames(&lpf2[iFilter], pMyData, pMyData, frameCount);
-    }
-    ```
-
-If you need to change the configuration of the filter, but need to maintain the state of internal
-registers you can do so with `ma_lpf_reinit()`. This may be useful if you need to change the sample
-rate and/or cutoff frequency dynamically while maintaining smooth transitions. Note that changing the
-format or channel count after initialization is invalid and will result in an error.
-
-The `ma_lpf` object supports a configurable order, but if you only need a first order filter you
-may want to consider using `ma_lpf1`. Likewise, if you only need a second order filter you can use
-`ma_lpf2`. The advantage of this is that they're lighter weight and a bit more efficient.
-
-If an even filter order is specified, a series of second order filters will be processed in a
-chain. If an odd filter order is specified, a first order filter will be applied, followed by a
-series of second order filters in a chain.
-
-
-11.3. High-Pass Filtering
--------------------------
-High-pass filtering is achieved with the following APIs:
-
-    +---------+-------------------------------------------+
-    | API     | Description                               |
-    +---------+-------------------------------------------+
-    | ma_hpf1 | First order high-pass filter              |
-    | ma_hpf2 | Second order high-pass filter             |
-    | ma_hpf  | High order high-pass filter (Butterworth) |
-    +---------+-------------------------------------------+
-
-High-pass filters work exactly the same as low-pass filters, only the APIs are called `ma_hpf1`,
-`ma_hpf2` and `ma_hpf`. See example code for low-pass filters for example usage.
-
-
-11.4. Band-Pass Filtering
--------------------------
-Band-pass filtering is achieved with the following APIs:
-
-    +---------+-------------------------------+
-    | API     | Description                   |
-    +---------+-------------------------------+
-    | ma_bpf2 | Second order band-pass filter |
-    | ma_bpf  | High order band-pass filter   |
-    +---------+-------------------------------+
-
-Band-pass filters work exactly the same as low-pass filters, only the APIs are called `ma_bpf2` and
-`ma_hpf`. See example code for low-pass filters for example usage. Note that the order for
-band-pass filters must be an even number which means there is no first order band-pass filter,
-unlike low-pass and high-pass filters.
-
-
-11.5. Notch Filtering
----------------------
-Notch filtering is achieved with the following APIs:
-
-    +-----------+------------------------------------------+
-    | API       | Description                              |
-    +-----------+------------------------------------------+
-    | ma_notch2 | Second order notching filter             |
-    +-----------+------------------------------------------+
-
-
-11.6. Peaking EQ Filtering
--------------------------
-Peaking filtering is achieved with the following APIs:
-
-    +----------+------------------------------------------+
-    | API      | Description                              |
-    +----------+------------------------------------------+
-    | ma_peak2 | Second order peaking filter              |
-    +----------+------------------------------------------+
-
-
-11.7. Low Shelf Filtering
--------------------------
-Low shelf filtering is achieved with the following APIs:
-
-    +-------------+------------------------------------------+
-    | API         | Description                              |
-    +-------------+------------------------------------------+
-    | ma_loshelf2 | Second order low shelf filter            |
-    +-------------+------------------------------------------+
-
-Where a high-pass filter is used to eliminate lower frequencies, a low shelf filter can be used to
-just turn them down rather than eliminate them entirely.
-
-
-11.8. High Shelf Filtering
---------------------------
-High shelf filtering is achieved with the following APIs:
-
-    +-------------+------------------------------------------+
-    | API         | Description                              |
-    +-------------+------------------------------------------+
-    | ma_hishelf2 | Second order high shelf filter           |
-    +-------------+------------------------------------------+
-
-The high shelf filter has the same API as the low shelf filter, only you would use `ma_hishelf`
-instead of `ma_loshelf`. Where a low shelf filter is used to adjust the volume of low frequencies,
-the high shelf filter does the same thing for high frequencies.
-
-
-
-
-12. Waveform and Noise Generation
-=================================
-
-12.1. Waveforms
----------------
-miniaudio supports generation of sine, square, triangle and sawtooth waveforms. This is achieved
-with the `ma_waveform` API. Example:
-
-    ```c
-    ma_waveform_config config = ma_waveform_config_init(
-        FORMAT,
-        CHANNELS,
-        SAMPLE_RATE,
-        ma_waveform_type_sine,
-        amplitude,
-        frequency);
-
-    ma_waveform waveform;
-    ma_result result = ma_waveform_init(&config, &waveform);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-
-    ...
-
-    ma_waveform_read_pcm_frames(&waveform, pOutput, frameCount);
-    ```
-
-The amplitude, frequency, type, and sample rate can be changed dynamically with
-`ma_waveform_set_amplitude()`, `ma_waveform_set_frequency()`, `ma_waveform_set_type()`, and
-`ma_waveform_set_sample_rate()` respectively.
-
-You can invert the waveform by setting the amplitude to a negative value. You can use this to
-control whether or not a sawtooth has a positive or negative ramp, for example.
-
-Below are the supported waveform types:
-
-    +---------------------------+
-    | Enum Name                 |
-    +---------------------------+
-    | ma_waveform_type_sine     |
-    | ma_waveform_type_square   |
-    | ma_waveform_type_triangle |
-    | ma_waveform_type_sawtooth |
-    +---------------------------+
-
-
-
-12.2. Noise
------------
-miniaudio supports generation of white, pink and Brownian noise via the `ma_noise` API. Example:
-
-    ```c
-    ma_noise_config config = ma_noise_config_init(
-        FORMAT,
-        CHANNELS,
-        ma_noise_type_white,
-        SEED,
-        amplitude);
-
-    ma_noise noise;
-    ma_result result = ma_noise_init(&config, &noise);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-
-    ...
-
-    ma_noise_read_pcm_frames(&noise, pOutput, frameCount);
-    ```
-
-The noise API uses simple LCG random number generation. It supports a custom seed which is useful
-for things like automated testing requiring reproducibility. Setting the seed to zero will default
-to `MA_DEFAULT_LCG_SEED`.
-
-The amplitude and seed can be changed dynamically with `ma_noise_set_amplitude()` and
-`ma_noise_set_seed()` respectively.
-
-By default, the noise API will use different values for different channels. So, for example, the
-left side in a stereo stream will be different to the right side. To instead have each channel use
-the same random value, set the `duplicateChannels` member of the noise config to true, like so:
-
-    ```c
-    config.duplicateChannels = MA_TRUE;
-    ```
-
-Below are the supported noise types.
-
-    +------------------------+
-    | Enum Name              |
-    +------------------------+
-    | ma_noise_type_white    |
-    | ma_noise_type_pink     |
-    | ma_noise_type_brownian |
-    +------------------------+
-
-
-
-13. Audio Buffers
-=================
-miniaudio supports reading from a buffer of raw audio data via the `ma_audio_buffer` API. This can
-read from memory that's managed by the application, but can also handle the memory management for
-you internally. Memory management is flexible and should support most use cases.
-
-Audio buffers are initialized using the standard configuration system used everywhere in miniaudio:
-
-    ```c
-    ma_audio_buffer_config config = ma_audio_buffer_config_init(
-        format,
-        channels,
-        sizeInFrames,
-        pExistingData,
-        &allocationCallbacks);
-
-    ma_audio_buffer buffer;
-    result = ma_audio_buffer_init(&config, &buffer);
-    if (result != MA_SUCCESS) {
-        // Error.
-    }
-
-    ...
-
-    ma_audio_buffer_uninit(&buffer);
-    ```
-
-In the example above, the memory pointed to by `pExistingData` will *not* be copied and is how an
-application can do self-managed memory allocation. If you would rather make a copy of the data, use
-`ma_audio_buffer_init_copy()`. To uninitialize the buffer, use `ma_audio_buffer_uninit()`.
-
-Sometimes it can be convenient to allocate the memory for the `ma_audio_buffer` structure and the
-raw audio data in a contiguous block of memory. That is, the raw audio data will be located
-immediately after the `ma_audio_buffer` structure. To do this, use
-`ma_audio_buffer_alloc_and_init()`:
-
-    ```c
-    ma_audio_buffer_config config = ma_audio_buffer_config_init(
-        format,
-        channels,
-        sizeInFrames,
-        pExistingData,
-        &allocationCallbacks);
-
-    ma_audio_buffer* pBuffer
-    result = ma_audio_buffer_alloc_and_init(&config, &pBuffer);
-    if (result != MA_SUCCESS) {
-        // Error
-    }
-
-    ...
-
-    ma_audio_buffer_uninit_and_free(&buffer);
-    ```
-
-If you initialize the buffer with `ma_audio_buffer_alloc_and_init()` you should uninitialize it
-with `ma_audio_buffer_uninit_and_free()`. In the example above, the memory pointed to by
-`pExistingData` will be copied into the buffer, which is contrary to the behavior of
-`ma_audio_buffer_init()`.
-
-An audio buffer has a playback cursor just like a decoder. As you read frames from the buffer, the
-cursor moves forward. The last parameter (`loop`) can be used to determine if the buffer should
-loop. The return value is the number of frames actually read. If this is less than the number of
-frames requested it means the end has been reached. This should never happen if the `loop`
-parameter is set to true. If you want to manually loop back to the start, you can do so with with
-`ma_audio_buffer_seek_to_pcm_frame(pAudioBuffer, 0)`. Below is an example for reading data from an
-audio buffer.
-
-    ```c
-    ma_uint64 framesRead = ma_audio_buffer_read_pcm_frames(pAudioBuffer, pFramesOut, desiredFrameCount, isLooping);
-    if (framesRead < desiredFrameCount) {
-        // If not looping, this means the end has been reached. This should never happen in looping mode with valid input.
-    }
-    ```
-
-Sometimes you may want to avoid the cost of data movement between the internal buffer and the
-output buffer. Instead you can use memory mapping to retrieve a pointer to a segment of data:
-
-    ```c
-    void* pMappedFrames;
-    ma_uint64 frameCount = frameCountToTryMapping;
-    ma_result result = ma_audio_buffer_map(pAudioBuffer, &pMappedFrames, &frameCount);
-    if (result == MA_SUCCESS) {
-        // Map was successful. The value in frameCount will be how many frames were _actually_ mapped, which may be
-        // less due to the end of the buffer being reached.
-        ma_copy_pcm_frames(pFramesOut, pMappedFrames, frameCount, pAudioBuffer->format, pAudioBuffer->channels);
-
-        // You must unmap the buffer.
-        ma_audio_buffer_unmap(pAudioBuffer, frameCount);
-    }
-    ```
-
-When you use memory mapping, the read cursor is increment by the frame count passed in to
-`ma_audio_buffer_unmap()`. If you decide not to process every frame you can pass in a value smaller
-than the value returned by `ma_audio_buffer_map()`. The disadvantage to using memory mapping is
-that it does not handle looping for you. You can determine if the buffer is at the end for the
-purpose of looping with `ma_audio_buffer_at_end()` or by inspecting the return value of
-`ma_audio_buffer_unmap()` and checking if it equals `MA_AT_END`. You should not treat `MA_AT_END`
-as an error when returned by `ma_audio_buffer_unmap()`.
-
-
-
-14. Ring Buffers
-================
-miniaudio supports lock free (single producer, single consumer) ring buffers which are exposed via
-the `ma_rb` and `ma_pcm_rb` APIs. The `ma_rb` API operates on bytes, whereas the `ma_pcm_rb`
-operates on PCM frames. They are otherwise identical as `ma_pcm_rb` is just a wrapper around
-`ma_rb`.
-
-Unlike most other APIs in miniaudio, ring buffers support both interleaved and deinterleaved
-streams. The caller can also allocate their own backing memory for the ring buffer to use
-internally for added flexibility. Otherwise the ring buffer will manage it's internal memory for
-you.
-
-The examples below use the PCM frame variant of the ring buffer since that's most likely the one
-you will want to use. To initialize a ring buffer, do something like the following:
-
-    ```c
-    ma_pcm_rb rb;
-    ma_result result = ma_pcm_rb_init(FORMAT, CHANNELS, BUFFER_SIZE_IN_FRAMES, NULL, NULL, &rb);
-    if (result != MA_SUCCESS) {
-        // Error
-    }
-    ```
-
-The `ma_pcm_rb_init()` function takes the sample format and channel count as parameters because
-it's the PCM variant of the ring buffer API. For the regular ring buffer that operates on bytes you
-would call `ma_rb_init()` which leaves these out and just takes the size of the buffer in bytes
-instead of frames. The fourth parameter is an optional pre-allocated buffer and the fifth parameter
-is a pointer to a `ma_allocation_callbacks` structure for custom memory allocation routines.
-Passing in `NULL` for this results in `MA_MALLOC()` and `MA_FREE()` being used.
-
-Use `ma_pcm_rb_init_ex()` if you need a deinterleaved buffer. The data for each sub-buffer is
-offset from each other based on the stride. To manage your sub-buffers you can use
-`ma_pcm_rb_get_subbuffer_stride()`, `ma_pcm_rb_get_subbuffer_offset()` and
-`ma_pcm_rb_get_subbuffer_ptr()`.
-
-Use `ma_pcm_rb_acquire_read()` and `ma_pcm_rb_acquire_write()` to retrieve a pointer to a section
-of the ring buffer. You specify the number of frames you need, and on output it will set to what
-was actually acquired. If the read or write pointer is positioned such that the number of frames
-requested will require a loop, it will be clamped to the end of the buffer. Therefore, the number
-of frames you're given may be less than the number you requested.
-
-After calling `ma_pcm_rb_acquire_read()` or `ma_pcm_rb_acquire_write()`, you do your work on the
-buffer and then "commit" it with `ma_pcm_rb_commit_read()` or `ma_pcm_rb_commit_write()`. This is
-where the read/write pointers are updated. When you commit you need to pass in the buffer that was
-returned by the earlier call to `ma_pcm_rb_acquire_read()` or `ma_pcm_rb_acquire_write()` and is
-only used for validation. The number of frames passed to `ma_pcm_rb_commit_read()` and
-`ma_pcm_rb_commit_write()` is what's used to increment the pointers, and can be less that what was
-originally requested.
-
-If you want to correct for drift between the write pointer and the read pointer you can use a
-combination of `ma_pcm_rb_pointer_distance()`, `ma_pcm_rb_seek_read()` and
-`ma_pcm_rb_seek_write()`. Note that you can only move the pointers forward, and you should only
-move the read pointer forward via the consumer thread, and the write pointer forward by the
-producer thread. If there is too much space between the pointers, move the read pointer forward. If
-there is too little space between the pointers, move the write pointer forward.
-
-You can use a ring buffer at the byte level instead of the PCM frame level by using the `ma_rb`
-API. This is exactly the same, only you will use the `ma_rb` functions instead of `ma_pcm_rb` and
-instead of frame counts you will pass around byte counts.
-
-The maximum size of the buffer in bytes is `0x7FFFFFFF-(MA_SIMD_ALIGNMENT-1)` due to the most
-significant bit being used to encode a loop flag and the internally managed buffers always being
-aligned to `MA_SIMD_ALIGNMENT`.
-
-Note that the ring buffer is only thread safe when used by a single consumer thread and single
-producer thread.
-
-
-
-15. Backends
-============
-The following backends are supported by miniaudio. These are listed in order of default priority.
-When no backend is specified when initializing a context or device, miniaudio will attempt to use
-each of these backends in the order listed in the table below.
-
-Note that backends that are not usable by the build target will not be included in the build. For
-example, ALSA, which is specific to Linux, will not be included in the Windows build.
-
-    +-------------+-----------------------+--------------------------------------------------------+
-    | Name        | Enum Name             | Supported Operating Systems                            |
-    +-------------+-----------------------+--------------------------------------------------------+
-    | WASAPI      | ma_backend_wasapi     | Windows Vista+                                         |
-    | DirectSound | ma_backend_dsound     | Windows XP+                                            |
-    | WinMM       | ma_backend_winmm      | Windows 95+                                            |
-    | Core Audio  | ma_backend_coreaudio  | macOS, iOS                                             |
-    | sndio       | ma_backend_sndio      | OpenBSD                                                |
-    | audio(4)    | ma_backend_audio4     | NetBSD, OpenBSD                                        |
-    | OSS         | ma_backend_oss        | FreeBSD                                                |
-    | PulseAudio  | ma_backend_pulseaudio | Cross Platform (disabled on Windows, BSD and Android)  |
-    | ALSA        | ma_backend_alsa       | Linux                                                  |
-    | JACK        | ma_backend_jack       | Cross Platform (disabled on BSD and Android)           |
-    | AAudio      | ma_backend_aaudio     | Android 8+                                             |
-    | OpenSL ES   | ma_backend_opensl     | Android (API level 16+)                                |
-    | Web Audio   | ma_backend_webaudio   | Web (via Emscripten)                                   |
-    | Custom      | ma_backend_custom     | Cross Platform                                         |
-    | Null        | ma_backend_null       | Cross Platform (not used on Web)                       |
-    +-------------+-----------------------+--------------------------------------------------------+
-
-Some backends have some nuance details you may want to be aware of.
-
-15.1. WASAPI
-------------
-- Low-latency shared mode will be disabled when using an application-defined sample rate which is
-  different to the device's native sample rate. To work around this, set `wasapi.noAutoConvertSRC`
-  to true in the device config. This is due to IAudioClient3_InitializeSharedAudioStream() failing
-  when the `AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM` flag is specified. Setting wasapi.noAutoConvertSRC
-  will result in miniaudio's internal resampler being used instead which will in turn enable the
-  use of low-latency shared mode.
-
-15.2. PulseAudio
-----------------
-- If you experience bad glitching/noise on Arch Linux, consider this fix from the Arch wiki:
-  https://wiki.archlinux.org/index.php/PulseAudio/Troubleshooting#Glitches,_skips_or_crackling.
-  Alternatively, consider using a different backend such as ALSA.
-
-15.3. Android
--------------
-- To capture audio on Android, remember to add the RECORD_AUDIO permission to your manifest:
-  `<uses-permission android:name="android.permission.RECORD_AUDIO" />`
-- With OpenSL|ES, only a single ma_context can be active at any given time. This is due to a
-  limitation with OpenSL|ES.
-- With AAudio, only default devices are enumerated. This is due to AAudio not having an enumeration
-  API (devices are enumerated through Java). You can however perform your own device enumeration
-  through Java and then set the ID in the ma_device_id structure (ma_device_id.aaudio) and pass it
-  to ma_device_init().
-- The backend API will perform resampling where possible. The reason for this as opposed to using
-  miniaudio's built-in resampler is to take advantage of any potential device-specific
-  optimizations the driver may implement.
-
-BSD
----
-- The sndio backend is currently only enabled on OpenBSD builds.
-- The audio(4) backend is supported on OpenBSD, but you may need to disable sndiod before you can
-  use it.
-
-15.4. UWP
----------
-- UWP only supports default playback and capture devices.
-- UWP requires the Microphone capability to be enabled in the application's manifest (Package.appxmanifest):
-
-    ```
-    <Package ...>
-        ...
-        <Capabilities>
-            <DeviceCapability Name="microphone" />
-        </Capabilities>
-    </Package>
-    ```
-
-15.5. Web Audio / Emscripten
-----------------------------
-- You cannot use `-std=c*` compiler flags, nor `-ansi`. This only applies to the Emscripten build.
-- The first time a context is initialized it will create a global object called "miniaudio" whose
-  primary purpose is to act as a factory for device objects.
-- Currently the Web Audio backend uses ScriptProcessorNode's, but this may need to change later as
-  they've been deprecated.
-- Google has implemented a policy in their browsers that prevent automatic media output without
-  first receiving some kind of user input. The following web page has additional details:
-  https://developers.google.com/web/updates/2017/09/autoplay-policy-changes. Starting the device
-  may fail if you try to start playback without first handling some kind of user input.
-
-
-
-16. Optimization Tips
-=====================
-See below for some tips on improving performance.
-
-16.1. Low Level API
--------------------
-- In the data callback, if your data is already clipped prior to copying it into the output buffer,
-  set the `noClip` config option in the device config to true. This will disable miniaudio's built
-  in clipping function.
-- By default, miniaudio will pre-silence the data callback's output buffer. If you know that you
-  will always write valid data to the output buffer you can disable pre-silencing by setting the
-  `noPreSilence` config option in the device config to true.
-
-16.2. High Level API
---------------------
-- If a sound does not require doppler or pitch shifting, consider disabling pitching by
-  initializing the sound with the `MA_SOUND_FLAG_NO_PITCH` flag.
-- If a sound does not require spatialization, disable it by initializing the sound with the
-  `MA_SOUND_FLAG_NO_SPATIALIZATION` flag. It can be re-enabled again post-initialization with
-  `ma_sound_set_spatialization_enabled()`.
-- If you know all of your sounds will always be the same sample rate, set the engine's sample
-  rate to match that of the sounds. Likewise, if you're using a self-managed resource manager,
-  consider setting the decoded sample rate to match your sounds. By configuring everything to
-  use a consistent sample rate, sample rate conversion can be avoided.
-
-
-
-17. Miscellaneous Notes
-=======================
-- Automatic stream routing is enabled on a per-backend basis. Support is explicitly enabled for
-  WASAPI and Core Audio, however other backends such as PulseAudio may naturally support it, though
-  not all have been tested.
-- When compiling with VC6 and earlier, decoding is restricted to files less than 2GB in size. This
-  is due to 64-bit file APIs not being available.
-*/
-
-#ifndef miniaudio_h
-#define miniaudio_h
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define MA_STRINGIFY(x)     #x
-#define MA_XSTRINGIFY(x)    MA_STRINGIFY(x)
-
-#define MA_VERSION_MAJOR    0
-#define MA_VERSION_MINOR    11
-#define MA_VERSION_REVISION 21
-#define MA_VERSION_STRING   MA_XSTRINGIFY(MA_VERSION_MAJOR) "." MA_XSTRINGIFY(MA_VERSION_MINOR) "." MA_XSTRINGIFY(MA_VERSION_REVISION)
-
-#if defined(_MSC_VER) && !defined(__clang__)
-    #pragma warning(push)
-    #pragma warning(disable:4201)   /* nonstandard extension used: nameless struct/union */
-    #pragma warning(disable:4214)   /* nonstandard extension used: bit field types other than int */
-    #pragma warning(disable:4324)   /* structure was padded due to alignment specifier */
-#elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wpedantic" /* For ISO C99 doesn't support unnamed structs/unions [-Wpedantic] */
-    #if defined(__clang__)
-        #pragma GCC diagnostic ignored "-Wc11-extensions"   /* anonymous unions are a C11 extension */
-    #endif
-#endif
-
-
-
-#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined(_M_IA64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
-    #define MA_SIZEOF_PTR   8
-#else
-    #define MA_SIZEOF_PTR   4
-#endif
-
-#include <stddef.h> /* For size_t. */
-
-/* Sized types. */
-#if defined(MA_USE_STDINT)
-    #include <stdint.h>
-    typedef int8_t   ma_int8;
-    typedef uint8_t  ma_uint8;
-    typedef int16_t  ma_int16;
-    typedef uint16_t ma_uint16;
-    typedef int32_t  ma_int32;
-    typedef uint32_t ma_uint32;
-    typedef int64_t  ma_int64;
-    typedef uint64_t ma_uint64;
-#else
-    typedef   signed char           ma_int8;
-    typedef unsigned char           ma_uint8;
-    typedef   signed short          ma_int16;
-    typedef unsigned short          ma_uint16;
-    typedef   signed int            ma_int32;
-    typedef unsigned int            ma_uint32;
-    #if defined(_MSC_VER) && !defined(__clang__)
-        typedef   signed __int64    ma_int64;
-        typedef unsigned __int64    ma_uint64;
-    #else
-        #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-            #pragma GCC diagnostic push
-            #pragma GCC diagnostic ignored "-Wlong-long"
-            #if defined(__clang__)
-                #pragma GCC diagnostic ignored "-Wc++11-long-long"
-            #endif
-        #endif
-        typedef   signed long long  ma_int64;
-        typedef unsigned long long  ma_uint64;
-        #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-            #pragma GCC diagnostic pop
-        #endif
-    #endif
-#endif  /* MA_USE_STDINT */
-
-#if MA_SIZEOF_PTR == 8
-    typedef ma_uint64           ma_uintptr;
-#else
-    typedef ma_uint32           ma_uintptr;
-#endif
-
-typedef ma_uint8    ma_bool8;
-typedef ma_uint32   ma_bool32;
-#define MA_TRUE     1
-#define MA_FALSE    0
-
-/* These float types are not used universally by miniaudio. It's to simplify some macro expansion for atomic types. */
-typedef float       ma_float;
-typedef double      ma_double;
-
-typedef void* ma_handle;
-typedef void* ma_ptr;
-
-/*
-ma_proc is annoying because when compiling with GCC we get pendantic warnings about converting
-between `void*` and `void (*)()`. We can't use `void (*)()` with MSVC however, because we'll get
-warning C4191 about "type cast between incompatible function types". To work around this I'm going
-to use a different data type depending on the compiler.
-*/
-#if defined(__GNUC__)
-typedef void (*ma_proc)(void);
-#else
-typedef void* ma_proc;
-#endif
-
-#if defined(_MSC_VER) && !defined(_WCHAR_T_DEFINED)
-typedef ma_uint16 wchar_t;
-#endif
-
-/* Define NULL for some compilers. */
-#ifndef NULL
-#define NULL 0
-#endif
-
-#if defined(SIZE_MAX)
-    #define MA_SIZE_MAX    SIZE_MAX
-#else
-    #define MA_SIZE_MAX    0xFFFFFFFF  /* When SIZE_MAX is not defined by the standard library just default to the maximum 32-bit unsigned integer. */
-#endif
-
-
-/* Platform/backend detection. */
-#if defined(_WIN32) || defined(__COSMOPOLITAN__)
-    #define MA_WIN32
-    #if defined(MA_FORCE_UWP) || (defined(WINAPI_FAMILY) && ((defined(WINAPI_FAMILY_PC_APP) && WINAPI_FAMILY == WINAPI_FAMILY_PC_APP) || (defined(WINAPI_FAMILY_PHONE_APP) && WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP)))
-        #define MA_WIN32_UWP
-    #elif defined(WINAPI_FAMILY) && (defined(WINAPI_FAMILY_GAMES) && WINAPI_FAMILY == WINAPI_FAMILY_GAMES)
-        #define MA_WIN32_GDK
-    #else
-        #define MA_WIN32_DESKTOP
-    #endif
-#endif
-#if !defined(_WIN32)    /* If it's not Win32, assume POSIX. */
-    #define MA_POSIX
-
-    /*
-    Use the MA_NO_PTHREAD_IN_HEADER option at your own risk. This is intentionally undocumented.
-    You can use this to avoid including pthread.h in the header section. The downside is that it
-    results in some fixed sized structures being declared for the various types that are used in
-    miniaudio. The risk here is that these types might be too small for a given platform. This
-    risk is yours to take and no support will be offered if you enable this option.
-    */
-    #ifndef MA_NO_PTHREAD_IN_HEADER
-        #include <pthread.h>    /* Unfortunate #include, but needed for pthread_t, pthread_mutex_t and pthread_cond_t types. */
-        typedef pthread_t       ma_pthread_t;
-        typedef pthread_mutex_t ma_pthread_mutex_t;
-        typedef pthread_cond_t  ma_pthread_cond_t;
-    #else
-        typedef ma_uintptr      ma_pthread_t;
-        typedef union           ma_pthread_mutex_t { char __data[40]; ma_uint64 __alignment; } ma_pthread_mutex_t;
-        typedef union           ma_pthread_cond_t  { char __data[48]; ma_uint64 __alignment; } ma_pthread_cond_t;
-    #endif
-
-    #if defined(__unix__)
-        #define MA_UNIX
-    #endif
-    #if defined(__linux__)
-        #define MA_LINUX
-    #endif
-    #if defined(__APPLE__)
-        #define MA_APPLE
-    #endif
-    #if defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
-        #define MA_BSD
-    #endif
-    #if defined(__ANDROID__)
-        #define MA_ANDROID
-    #endif
-    #if defined(__EMSCRIPTEN__)
-        #define MA_EMSCRIPTEN
-    #endif
-    #if defined(__ORBIS__)
-        #define MA_ORBIS
-    #endif
-    #if defined(__PROSPERO__)
-        #define MA_PROSPERO
-    #endif
-    #if defined(__NX__)
-        #define MA_NX
-    #endif
-    #if defined(__BEOS__) || defined(__HAIKU__)
-        #define MA_BEOS
-    #endif
-    #if defined(__HAIKU__)
-        #define MA_HAIKU
-    #endif
-#endif
-
-#if defined(__has_c_attribute)
-    #if __has_c_attribute(fallthrough)
-        #define MA_FALLTHROUGH [[fallthrough]]
-    #endif
-#endif
-#if !defined(MA_FALLTHROUGH) && defined(__has_attribute) && (defined(__clang__) || defined(__GNUC__))
-    #if __has_attribute(fallthrough)
-        #define MA_FALLTHROUGH __attribute__((fallthrough))
-    #endif
-#endif
-#if !defined(MA_FALLTHROUGH)
-    #define MA_FALLTHROUGH ((void)0)
-#endif
-
-#ifdef _MSC_VER
-    #define MA_INLINE __forceinline
-
-    /* noinline was introduced in Visual Studio 2005. */
-    #if _MSC_VER >= 1400
-        #define MA_NO_INLINE __declspec(noinline)
-    #else
-        #define MA_NO_INLINE
-    #endif
-#elif defined(__GNUC__)
-    /*
-    I've had a bug report where GCC is emitting warnings about functions possibly not being inlineable. This warning happens when
-    the __attribute__((always_inline)) attribute is defined without an "inline" statement. I think therefore there must be some
-    case where "__inline__" is not always defined, thus the compiler emitting these warnings. When using -std=c89 or -ansi on the
-    command line, we cannot use the "inline" keyword and instead need to use "__inline__". In an attempt to work around this issue
-    I am using "__inline__" only when we're compiling in strict ANSI mode.
-    */
-    #if defined(__STRICT_ANSI__)
-        #define MA_GNUC_INLINE_HINT __inline__
-    #else
-        #define MA_GNUC_INLINE_HINT inline
-    #endif
-
-    #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)) || defined(__clang__)
-        #define MA_INLINE MA_GNUC_INLINE_HINT __attribute__((always_inline))
-        #define MA_NO_INLINE __attribute__((noinline))
-    #else
-        #define MA_INLINE MA_GNUC_INLINE_HINT
-        #define MA_NO_INLINE __attribute__((noinline))
-    #endif
-#elif defined(__WATCOMC__)
-    #define MA_INLINE __inline
-    #define MA_NO_INLINE
-#else
-    #define MA_INLINE
-    #define MA_NO_INLINE
-#endif
-
-/* MA_DLL is not officially supported. You're on your own if you want to use this. */
-#if defined(MA_DLL)
-    #if defined(_WIN32)
-        #define MA_DLL_IMPORT  __declspec(dllimport)
-        #define MA_DLL_EXPORT  __declspec(dllexport)
-        #define MA_DLL_PRIVATE static
-    #else
-        #if defined(__GNUC__) && __GNUC__ >= 4
-            #define MA_DLL_IMPORT  __attribute__((visibility("default")))
-            #define MA_DLL_EXPORT  __attribute__((visibility("default")))
-            #define MA_DLL_PRIVATE __attribute__((visibility("hidden")))
-        #else
-            #define MA_DLL_IMPORT
-            #define MA_DLL_EXPORT
-            #define MA_DLL_PRIVATE static
-        #endif
-    #endif
-#endif
-
-#if !defined(MA_API)
-    #if defined(MA_DLL)
-        #if defined(MINIAUDIO_IMPLEMENTATION) || defined(MA_IMPLEMENTATION)
-            #define MA_API  MA_DLL_EXPORT
-        #else
-            #define MA_API  MA_DLL_IMPORT
-        #endif
-    #else
-        #define MA_API extern
-    #endif
-#endif
-
-#if !defined(MA_STATIC)
-    #if defined(MA_DLL)
-        #define MA_PRIVATE MA_DLL_PRIVATE
-    #else
-        #define MA_PRIVATE static
-    #endif
-#endif
-
-
-/* SIMD alignment in bytes. Currently set to 32 bytes in preparation for future AVX optimizations. */
-#define MA_SIMD_ALIGNMENT  32
-
-/*
-Special wchar_t type to ensure any structures in the public sections that reference it have a
-consistent size across all platforms.
-
-On Windows, wchar_t is 2 bytes, whereas everywhere else it's 4 bytes. Since Windows likes to use
-wchar_t for it's IDs, we need a special explicitly sized wchar type that is always 2 bytes on all
-platforms.
-*/
-#if !defined(MA_POSIX) && defined(MA_WIN32)
-typedef wchar_t     ma_wchar_win32;
-#else
-typedef ma_uint16   ma_wchar_win32;
-#endif
-
-
-
-/*
-Logging Levels
-==============
-Log levels are only used to give logging callbacks some context as to the severity of a log message
-so they can do filtering. All log levels will be posted to registered logging callbacks. If you
-don't want to output a certain log level you can discriminate against the log level in the callback.
-
-MA_LOG_LEVEL_DEBUG
-    Used for debugging. Useful for debug and test builds, but should be disabled in release builds.
-
-MA_LOG_LEVEL_INFO
-    Informational logging. Useful for debugging. This will never be called from within the data
-    callback.
-
-MA_LOG_LEVEL_WARNING
-    Warnings. You should enable this in you development builds and action them when encounted. These
-    logs usually indicate a potential problem or misconfiguration, but still allow you to keep
-    running. This will never be called from within the data callback.
-
-MA_LOG_LEVEL_ERROR
-    Error logging. This will be fired when an operation fails and is subsequently aborted. This can
-    be fired from within the data callback, in which case the device will be stopped. You should
-    always have this log level enabled.
-*/
-typedef enum
-{
-    MA_LOG_LEVEL_DEBUG   = 4,
-    MA_LOG_LEVEL_INFO    = 3,
-    MA_LOG_LEVEL_WARNING = 2,
-    MA_LOG_LEVEL_ERROR   = 1
-} ma_log_level;
-
-/*
-Variables needing to be accessed atomically should be declared with this macro for two reasons:
-
-    1) It allows people who read the code to identify a variable as such; and
-    2) It forces alignment on platforms where it's required or optimal.
-
-Note that for x86/64, alignment is not strictly necessary, but does have some performance
-implications. Where supported by the compiler, alignment will be used, but otherwise if the CPU
-architecture does not require it, it will simply leave it unaligned. This is the case with old
-versions of Visual Studio, which I've confirmed with at least VC6.
-*/
-#if !defined(_MSC_VER) && defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
-    #include <stdalign.h>
-    #define MA_ATOMIC(alignment, type)            _Alignas(alignment) type
-#else
-    #if defined(__GNUC__)
-        /* GCC-style compilers. */
-        #define MA_ATOMIC(alignment, type)        type __attribute__((aligned(alignment)))
-    #elif defined(_MSC_VER) && _MSC_VER > 1200  /* 1200 = VC6. Alignment not supported, but not necessary because x86 is the only supported target. */
-        /* MSVC. */
-        #define MA_ATOMIC(alignment, type)        __declspec(align(alignment)) type
-    #else
-        /* Other compilers. */
-        #define MA_ATOMIC(alignment, type)        type
-    #endif
-#endif
-
-typedef struct ma_context ma_context;
-typedef struct ma_device ma_device;
-
-typedef ma_uint8 ma_channel;
-typedef enum
-{
-    MA_CHANNEL_NONE               = 0,
-    MA_CHANNEL_MONO               = 1,
-    MA_CHANNEL_FRONT_LEFT         = 2,
-    MA_CHANNEL_FRONT_RIGHT        = 3,
-    MA_CHANNEL_FRONT_CENTER       = 4,
-    MA_CHANNEL_LFE                = 5,
-    MA_CHANNEL_BACK_LEFT          = 6,
-    MA_CHANNEL_BACK_RIGHT         = 7,
-    MA_CHANNEL_FRONT_LEFT_CENTER  = 8,
-    MA_CHANNEL_FRONT_RIGHT_CENTER = 9,
-    MA_CHANNEL_BACK_CENTER        = 10,
-    MA_CHANNEL_SIDE_LEFT          = 11,
-    MA_CHANNEL_SIDE_RIGHT         = 12,
-    MA_CHANNEL_TOP_CENTER         = 13,
-    MA_CHANNEL_TOP_FRONT_LEFT     = 14,
-    MA_CHANNEL_TOP_FRONT_CENTER   = 15,
-    MA_CHANNEL_TOP_FRONT_RIGHT    = 16,
-    MA_CHANNEL_TOP_BACK_LEFT      = 17,
-    MA_CHANNEL_TOP_BACK_CENTER    = 18,
-    MA_CHANNEL_TOP_BACK_RIGHT     = 19,
-    MA_CHANNEL_AUX_0              = 20,
-    MA_CHANNEL_AUX_1              = 21,
-    MA_CHANNEL_AUX_2              = 22,
-    MA_CHANNEL_AUX_3              = 23,
-    MA_CHANNEL_AUX_4              = 24,
-    MA_CHANNEL_AUX_5              = 25,
-    MA_CHANNEL_AUX_6              = 26,
-    MA_CHANNEL_AUX_7              = 27,
-    MA_CHANNEL_AUX_8              = 28,
-    MA_CHANNEL_AUX_9              = 29,
-    MA_CHANNEL_AUX_10             = 30,
-    MA_CHANNEL_AUX_11             = 31,
-    MA_CHANNEL_AUX_12             = 32,
-    MA_CHANNEL_AUX_13             = 33,
-    MA_CHANNEL_AUX_14             = 34,
-    MA_CHANNEL_AUX_15             = 35,
-    MA_CHANNEL_AUX_16             = 36,
-    MA_CHANNEL_AUX_17             = 37,
-    MA_CHANNEL_AUX_18             = 38,
-    MA_CHANNEL_AUX_19             = 39,
-    MA_CHANNEL_AUX_20             = 40,
-    MA_CHANNEL_AUX_21             = 41,
-    MA_CHANNEL_AUX_22             = 42,
-    MA_CHANNEL_AUX_23             = 43,
-    MA_CHANNEL_AUX_24             = 44,
-    MA_CHANNEL_AUX_25             = 45,
-    MA_CHANNEL_AUX_26             = 46,
-    MA_CHANNEL_AUX_27             = 47,
-    MA_CHANNEL_AUX_28             = 48,
-    MA_CHANNEL_AUX_29             = 49,
-    MA_CHANNEL_AUX_30             = 50,
-    MA_CHANNEL_AUX_31             = 51,
-    MA_CHANNEL_LEFT               = MA_CHANNEL_FRONT_LEFT,
-    MA_CHANNEL_RIGHT              = MA_CHANNEL_FRONT_RIGHT,
-    MA_CHANNEL_POSITION_COUNT     = (MA_CHANNEL_AUX_31 + 1)
-} _ma_channel_position; /* Do not use `_ma_channel_position` directly. Use `ma_channel` instead. */
-
-typedef enum
-{
-    MA_SUCCESS                        =  0,
-    MA_ERROR                          = -1,  /* A generic error. */
-    MA_INVALID_ARGS                   = -2,
-    MA_INVALID_OPERATION              = -3,
-    MA_OUT_OF_MEMORY                  = -4,
-    MA_OUT_OF_RANGE                   = -5,
-    MA_ACCESS_DENIED                  = -6,
-    MA_DOES_NOT_EXIST                 = -7,
-    MA_ALREADY_EXISTS                 = -8,
-    MA_TOO_MANY_OPEN_FILES            = -9,
-    MA_INVALID_FILE                   = -10,
-    MA_TOO_BIG                        = -11,
-    MA_PATH_TOO_LONG                  = -12,
-    MA_NAME_TOO_LONG                  = -13,
-    MA_NOT_DIRECTORY                  = -14,
-    MA_IS_DIRECTORY                   = -15,
-    MA_DIRECTORY_NOT_EMPTY            = -16,
-    MA_AT_END                         = -17,
-    MA_NO_SPACE                       = -18,
-    MA_BUSY                           = -19,
-    MA_IO_ERROR                       = -20,
-    MA_INTERRUPT                      = -21,
-    MA_UNAVAILABLE                    = -22,
-    MA_ALREADY_IN_USE                 = -23,
-    MA_BAD_ADDRESS                    = -24,
-    MA_BAD_SEEK                       = -25,
-    MA_BAD_PIPE                       = -26,
-    MA_DEADLOCK                       = -27,
-    MA_TOO_MANY_LINKS                 = -28,
-    MA_NOT_IMPLEMENTED                = -29,
-    MA_NO_MESSAGE                     = -30,
-    MA_BAD_MESSAGE                    = -31,
-    MA_NO_DATA_AVAILABLE              = -32,
-    MA_INVALID_DATA                   = -33,
-    MA_TIMEOUT                        = -34,
-    MA_NO_NETWORK                     = -35,
-    MA_NOT_UNIQUE                     = -36,
-    MA_NOT_SOCKET                     = -37,
-    MA_NO_ADDRESS                     = -38,
-    MA_BAD_PROTOCOL                   = -39,
-    MA_PROTOCOL_UNAVAILABLE           = -40,
-    MA_PROTOCOL_NOT_SUPPORTED         = -41,
-    MA_PROTOCOL_FAMILY_NOT_SUPPORTED  = -42,
-    MA_ADDRESS_FAMILY_NOT_SUPPORTED   = -43,
-    MA_SOCKET_NOT_SUPPORTED           = -44,
-    MA_CONNECTION_RESET               = -45,
-    MA_ALREADY_CONNECTED              = -46,
-    MA_NOT_CONNECTED                  = -47,
-    MA_CONNECTION_REFUSED             = -48,
-    MA_NO_HOST                        = -49,
-    MA_IN_PROGRESS                    = -50,
-    MA_CANCELLED                      = -51,
-    MA_MEMORY_ALREADY_MAPPED          = -52,
-
-    /* General non-standard errors. */
-    MA_CRC_MISMATCH                   = -100,
-
-    /* General miniaudio-specific errors. */
-    MA_FORMAT_NOT_SUPPORTED           = -200,
-    MA_DEVICE_TYPE_NOT_SUPPORTED      = -201,
-    MA_SHARE_MODE_NOT_SUPPORTED       = -202,
-    MA_NO_BACKEND                     = -203,
-    MA_NO_DEVICE                      = -204,
-    MA_API_NOT_FOUND                  = -205,
-    MA_INVALID_DEVICE_CONFIG          = -206,
-    MA_LOOP                           = -207,
-    MA_BACKEND_NOT_ENABLED            = -208,
-
-    /* State errors. */
-    MA_DEVICE_NOT_INITIALIZED         = -300,
-    MA_DEVICE_ALREADY_INITIALIZED     = -301,
-    MA_DEVICE_NOT_STARTED             = -302,
-    MA_DEVICE_NOT_STOPPED             = -303,
-
-    /* Operation errors. */
-    MA_FAILED_TO_INIT_BACKEND         = -400,
-    MA_FAILED_TO_OPEN_BACKEND_DEVICE  = -401,
-    MA_FAILED_TO_START_BACKEND_DEVICE = -402,
-    MA_FAILED_TO_STOP_BACKEND_DEVICE  = -403
-} ma_result;
-
-
-#define MA_MIN_CHANNELS                 1
-#ifndef MA_MAX_CHANNELS
-#define MA_MAX_CHANNELS                 254
-#endif
-
-#ifndef MA_MAX_FILTER_ORDER
-#define MA_MAX_FILTER_ORDER             8
-#endif
-
-typedef enum
-{
-    ma_stream_format_pcm = 0
-} ma_stream_format;
-
-typedef enum
-{
-    ma_stream_layout_interleaved = 0,
-    ma_stream_layout_deinterleaved
-} ma_stream_layout;
-
-typedef enum
-{
-    ma_dither_mode_none = 0,
-    ma_dither_mode_rectangle,
-    ma_dither_mode_triangle
-} ma_dither_mode;
-
-typedef enum
-{
-    /*
-    I like to keep these explicitly defined because they're used as a key into a lookup table. When items are
-    added to this, make sure there are no gaps and that they're added to the lookup table in ma_get_bytes_per_sample().
-    */
-    ma_format_unknown = 0,     /* Mainly used for indicating an error, but also used as the default for the output format for decoders. */
-    ma_format_u8      = 1,
-    ma_format_s16     = 2,     /* Seems to be the most widely supported format. */
-    ma_format_s24     = 3,     /* Tightly packed. 3 bytes per sample. */
-    ma_format_s32     = 4,
-    ma_format_f32     = 5,
-    ma_format_count
-} ma_format;
-
-typedef enum
-{
-    /* Standard rates need to be in priority order. */
-    ma_standard_sample_rate_48000  = 48000,     /* Most common */
-    ma_standard_sample_rate_44100  = 44100,
-
-    ma_standard_sample_rate_32000  = 32000,     /* Lows */
-    ma_standard_sample_rate_24000  = 24000,
-    ma_standard_sample_rate_22050  = 22050,
-
-    ma_standard_sample_rate_88200  = 88200,     /* Highs */
-    ma_standard_sample_rate_96000  = 96000,
-    ma_standard_sample_rate_176400 = 176400,
-    ma_standard_sample_rate_192000 = 192000,
-
-    ma_standard_sample_rate_16000  = 16000,     /* Extreme lows */
-    ma_standard_sample_rate_11025  = 11025,
-    ma_standard_sample_rate_8000   = 8000,
-
-    ma_standard_sample_rate_352800 = 352800,    /* Extreme highs */
-    ma_standard_sample_rate_384000 = 384000,
-
-    ma_standard_sample_rate_min    = ma_standard_sample_rate_8000,
-    ma_standard_sample_rate_max    = ma_standard_sample_rate_384000,
-    ma_standard_sample_rate_count  = 14         /* Need to maintain the count manually. Make sure this is updated if items are added to enum. */
-} ma_standard_sample_rate;
-
-
-typedef enum
-{
-    ma_channel_mix_mode_rectangular = 0,   /* Simple averaging based on the plane(s) the channel is sitting on. */
-    ma_channel_mix_mode_simple,            /* Drop excess channels; zeroed out extra channels. */
-    ma_channel_mix_mode_custom_weights,    /* Use custom weights specified in ma_channel_converter_config. */
-    ma_channel_mix_mode_default = ma_channel_mix_mode_rectangular
-} ma_channel_mix_mode;
-
-typedef enum
-{
-    ma_standard_channel_map_microsoft,
-    ma_standard_channel_map_alsa,
-    ma_standard_channel_map_rfc3551,   /* Based off AIFF. */
-    ma_standard_channel_map_flac,
-    ma_standard_channel_map_vorbis,
-    ma_standard_channel_map_sound4,    /* FreeBSD's sound(4). */
-    ma_standard_channel_map_sndio,     /* www.sndio.org/tips.html */
-    ma_standard_channel_map_webaudio = ma_standard_channel_map_flac, /* https://webaudio.github.io/web-audio-api/#ChannelOrdering. Only 1, 2, 4 and 6 channels are defined, but can fill in the gaps with logical assumptions. */
-    ma_standard_channel_map_default = ma_standard_channel_map_microsoft
-} ma_standard_channel_map;
-
-typedef enum
-{
-    ma_performance_profile_low_latency = 0,
-    ma_performance_profile_conservative
-} ma_performance_profile;
-
-
-typedef struct
-{
-    void* pUserData;
-    void* (* onMalloc)(size_t sz, void* pUserData);
-    void* (* onRealloc)(void* p, size_t sz, void* pUserData);
-    void  (* onFree)(void* p, void* pUserData);
-} ma_allocation_callbacks;
-
-typedef struct
-{
-    ma_int32 state;
-} ma_lcg;
-
-
-/*
-Atomics.
-
-These are typesafe structures to prevent errors as a result of forgetting to reference variables atomically. It's too
-easy to introduce subtle bugs where you accidentally do a regular assignment instead of an atomic load/store, etc. By
-using a struct we can enforce the use of atomics at compile time.
-
-These types are declared in the header section because we need to reference them in structs below, but functions for
-using them are only exposed in the implementation section. I do not want these to be part of the public API.
-
-There's a few downsides to this system. The first is that you need to declare a new struct for each type. Below are
-some macros to help with the declarations. They will be named like so:
-
-    ma_atomic_uint32 - atomic ma_uint32
-    ma_atomic_int32  - atomic ma_int32
-    ma_atomic_uint64 - atomic ma_uint64
-    ma_atomic_float  - atomic float
-    ma_atomic_bool32 - atomic ma_bool32
-
-The other downside is that atomic pointers are extremely messy. You need to declare a new struct for each specific
-type of pointer you need to make atomic. For example, an atomic ma_node* will look like this:
-
-    MA_ATOMIC_SAFE_TYPE_IMPL_PTR(node)
-
-Which will declare a type struct that's named like so:
-
-    ma_atomic_ptr_node
-
-Functions to use the atomic types are declared in the implementation section. All atomic functions are prefixed with
-the name of the struct. For example:
-
-    ma_atomic_uint32_set() - Atomic store of ma_uint32
-    ma_atomic_uint32_get() - Atomic load of ma_uint32
-    etc.
-
-For pointer types it's the same, which makes them a bit messy to use due to the length of each function name, but in
-return you get type safety and enforcement of atomic operations.
-*/
-#define MA_ATOMIC_SAFE_TYPE_DECL(c89TypeExtension, typeSize, type) \
-    typedef struct \
-    { \
-        MA_ATOMIC(typeSize, ma_##type) value; \
-    } ma_atomic_##type; \
-
-#define MA_ATOMIC_SAFE_TYPE_DECL_PTR(type) \
-    typedef struct \
-    { \
-        MA_ATOMIC(MA_SIZEOF_PTR, ma_##type*) value; \
-    } ma_atomic_ptr_##type; \
-
-MA_ATOMIC_SAFE_TYPE_DECL(32,  4, uint32)
-MA_ATOMIC_SAFE_TYPE_DECL(i32, 4, int32)
-MA_ATOMIC_SAFE_TYPE_DECL(64,  8, uint64)
-MA_ATOMIC_SAFE_TYPE_DECL(f32, 4, float)
-MA_ATOMIC_SAFE_TYPE_DECL(32,  4, bool32)
-
-
-/* Spinlocks are 32-bit for compatibility reasons. */
-typedef ma_uint32 ma_spinlock;
-
-#ifndef MA_NO_THREADING
-    /* Thread priorities should be ordered such that the default priority of the worker thread is 0. */
-    typedef enum
-    {
-        ma_thread_priority_idle     = -5,
-        ma_thread_priority_lowest   = -4,
-        ma_thread_priority_low      = -3,
-        ma_thread_priority_normal   = -2,
-        ma_thread_priority_high     = -1,
-        ma_thread_priority_highest  =  0,
-        ma_thread_priority_realtime =  1,
-        ma_thread_priority_default  =  0
-    } ma_thread_priority;
-
-    #if defined(MA_POSIX)
-        typedef ma_pthread_t ma_thread;
-    #elif defined(MA_WIN32)
-        typedef ma_handle ma_thread;
-    #endif
-
-    #if defined(MA_POSIX)
-        typedef ma_pthread_mutex_t ma_mutex;
-    #elif defined(MA_WIN32)
-        typedef ma_handle ma_mutex;
-    #endif
-
-    #if defined(MA_POSIX)
-        typedef struct
-        {
-            ma_uint32 value;
-            ma_pthread_mutex_t lock;
-            ma_pthread_cond_t cond;
-        } ma_event;
-    #elif defined(MA_WIN32)
-        typedef ma_handle ma_event;
-    #endif
-
-    #if defined(MA_POSIX)
-        typedef struct
-        {
-            int value;
-            ma_pthread_mutex_t lock;
-            ma_pthread_cond_t cond;
-        } ma_semaphore;
-    #elif defined(MA_WIN32)
-        typedef ma_handle ma_semaphore;
-    #endif
-#else
-    /* MA_NO_THREADING is set which means threading is disabled. Threading is required by some API families. If any of these are enabled we need to throw an error. */
-    #ifndef MA_NO_DEVICE_IO
-        #error "MA_NO_THREADING cannot be used without MA_NO_DEVICE_IO";
-    #endif
-#endif  /* MA_NO_THREADING */
-
-
-/*
-Retrieves the version of miniaudio as separated integers. Each component can be NULL if it's not required.
-*/
-MA_API void ma_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision);
-
-/*
-Retrieves the version of miniaudio as a string which can be useful for logging purposes.
-*/
-MA_API const char* ma_version_string(void);
-
-
-/**************************************************************************************************************************************************************
-
-Logging
-
-**************************************************************************************************************************************************************/
-#include <stdarg.h> /* For va_list. */
-
-#if defined(__has_attribute)
-    #if __has_attribute(format)
-        #define MA_ATTRIBUTE_FORMAT(fmt, va) __attribute__((format(printf, fmt, va)))
-    #endif
-#endif
-#ifndef MA_ATTRIBUTE_FORMAT
-#define MA_ATTRIBUTE_FORMAT(fmt, va)
-#endif
-
-#ifndef MA_MAX_LOG_CALLBACKS
-#define MA_MAX_LOG_CALLBACKS    4
-#endif
-
-
-/*
-The callback for handling log messages.
-
-
-Parameters
-----------
-pUserData (in)
-    The user data pointer that was passed into ma_log_register_callback().
-
-logLevel (in)
-    The log level. This can be one of the following:
-
-    +----------------------+
-    | Log Level            |
-    +----------------------+
-    | MA_LOG_LEVEL_DEBUG   |
-    | MA_LOG_LEVEL_INFO    |
-    | MA_LOG_LEVEL_WARNING |
-    | MA_LOG_LEVEL_ERROR   |
-    +----------------------+
-
-pMessage (in)
-    The log message.
-*/
-typedef void (* ma_log_callback_proc)(void* pUserData, ma_uint32 level, const char* pMessage);
-
-typedef struct
-{
-    ma_log_callback_proc onLog;
-    void* pUserData;
-} ma_log_callback;
-
-MA_API ma_log_callback ma_log_callback_init(ma_log_callback_proc onLog, void* pUserData);
-
-
-typedef struct
-{
-    ma_log_callback callbacks[MA_MAX_LOG_CALLBACKS];
-    ma_uint32 callbackCount;
-    ma_allocation_callbacks allocationCallbacks;    /* Need to store these persistently because ma_log_postv() might need to allocate a buffer on the heap. */
-#ifndef MA_NO_THREADING
-    ma_mutex lock;  /* For thread safety just to make it easier and safer for the logging implementation. */
-#endif
-} ma_log;
-
-MA_API ma_result ma_log_init(const ma_allocation_callbacks* pAllocationCallbacks, ma_log* pLog);
-MA_API void ma_log_uninit(ma_log* pLog);
-MA_API ma_result ma_log_register_callback(ma_log* pLog, ma_log_callback callback);
-MA_API ma_result ma_log_unregister_callback(ma_log* pLog, ma_log_callback callback);
-MA_API ma_result ma_log_post(ma_log* pLog, ma_uint32 level, const char* pMessage);
-MA_API ma_result ma_log_postv(ma_log* pLog, ma_uint32 level, const char* pFormat, va_list args);
-MA_API ma_result ma_log_postf(ma_log* pLog, ma_uint32 level, const char* pFormat, ...) MA_ATTRIBUTE_FORMAT(3, 4);
-
-
-/**************************************************************************************************************************************************************
-
-Biquad Filtering
-
-**************************************************************************************************************************************************************/
-typedef union
-{
-    float    f32;
-    ma_int32 s32;
-} ma_biquad_coefficient;
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    double b0;
-    double b1;
-    double b2;
-    double a0;
-    double a1;
-    double a2;
-} ma_biquad_config;
-
-MA_API ma_biquad_config ma_biquad_config_init(ma_format format, ma_uint32 channels, double b0, double b1, double b2, double a0, double a1, double a2);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_biquad_coefficient b0;
-    ma_biquad_coefficient b1;
-    ma_biquad_coefficient b2;
-    ma_biquad_coefficient a1;
-    ma_biquad_coefficient a2;
-    ma_biquad_coefficient* pR1;
-    ma_biquad_coefficient* pR2;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_biquad;
-
-MA_API ma_result ma_biquad_get_heap_size(const ma_biquad_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_biquad_init_preallocated(const ma_biquad_config* pConfig, void* pHeap, ma_biquad* pBQ);
-MA_API ma_result ma_biquad_init(const ma_biquad_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_biquad* pBQ);
-MA_API void ma_biquad_uninit(ma_biquad* pBQ, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_biquad_reinit(const ma_biquad_config* pConfig, ma_biquad* pBQ);
-MA_API ma_result ma_biquad_clear_cache(ma_biquad* pBQ);
-MA_API ma_result ma_biquad_process_pcm_frames(ma_biquad* pBQ, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_biquad_get_latency(const ma_biquad* pBQ);
-
-
-/**************************************************************************************************************************************************************
-
-Low-Pass Filtering
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double cutoffFrequency;
-    double q;
-} ma_lpf1_config, ma_lpf2_config;
-
-MA_API ma_lpf1_config ma_lpf1_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency);
-MA_API ma_lpf2_config ma_lpf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, double q);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_biquad_coefficient a;
-    ma_biquad_coefficient* pR1;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_lpf1;
-
-MA_API ma_result ma_lpf1_get_heap_size(const ma_lpf1_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_lpf1_init_preallocated(const ma_lpf1_config* pConfig, void* pHeap, ma_lpf1* pLPF);
-MA_API ma_result ma_lpf1_init(const ma_lpf1_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf1* pLPF);
-MA_API void ma_lpf1_uninit(ma_lpf1* pLPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_lpf1_reinit(const ma_lpf1_config* pConfig, ma_lpf1* pLPF);
-MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF);
-MA_API ma_result ma_lpf1_process_pcm_frames(ma_lpf1* pLPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_lpf1_get_latency(const ma_lpf1* pLPF);
-
-typedef struct
-{
-    ma_biquad bq;   /* The second order low-pass filter is implemented as a biquad filter. */
-} ma_lpf2;
-
-MA_API ma_result ma_lpf2_get_heap_size(const ma_lpf2_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_lpf2_init_preallocated(const ma_lpf2_config* pConfig, void* pHeap, ma_lpf2* pHPF);
-MA_API ma_result ma_lpf2_init(const ma_lpf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf2* pLPF);
-MA_API void ma_lpf2_uninit(ma_lpf2* pLPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_lpf2_reinit(const ma_lpf2_config* pConfig, ma_lpf2* pLPF);
-MA_API ma_result ma_lpf2_clear_cache(ma_lpf2* pLPF);
-MA_API ma_result ma_lpf2_process_pcm_frames(ma_lpf2* pLPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_lpf2_get_latency(const ma_lpf2* pLPF);
-
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double cutoffFrequency;
-    ma_uint32 order;    /* If set to 0, will be treated as a passthrough (no filtering will be applied). */
-} ma_lpf_config;
-
-MA_API ma_lpf_config ma_lpf_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_uint32 lpf1Count;
-    ma_uint32 lpf2Count;
-    ma_lpf1* pLPF1;
-    ma_lpf2* pLPF2;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_lpf;
-
-MA_API ma_result ma_lpf_get_heap_size(const ma_lpf_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_lpf_init_preallocated(const ma_lpf_config* pConfig, void* pHeap, ma_lpf* pLPF);
-MA_API ma_result ma_lpf_init(const ma_lpf_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf* pLPF);
-MA_API void ma_lpf_uninit(ma_lpf* pLPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_lpf_reinit(const ma_lpf_config* pConfig, ma_lpf* pLPF);
-MA_API ma_result ma_lpf_clear_cache(ma_lpf* pLPF);
-MA_API ma_result ma_lpf_process_pcm_frames(ma_lpf* pLPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_lpf_get_latency(const ma_lpf* pLPF);
-
-
-/**************************************************************************************************************************************************************
-
-High-Pass Filtering
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double cutoffFrequency;
-    double q;
-} ma_hpf1_config, ma_hpf2_config;
-
-MA_API ma_hpf1_config ma_hpf1_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency);
-MA_API ma_hpf2_config ma_hpf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, double q);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_biquad_coefficient a;
-    ma_biquad_coefficient* pR1;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_hpf1;
-
-MA_API ma_result ma_hpf1_get_heap_size(const ma_hpf1_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_hpf1_init_preallocated(const ma_hpf1_config* pConfig, void* pHeap, ma_hpf1* pLPF);
-MA_API ma_result ma_hpf1_init(const ma_hpf1_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf1* pHPF);
-MA_API void ma_hpf1_uninit(ma_hpf1* pHPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_hpf1_reinit(const ma_hpf1_config* pConfig, ma_hpf1* pHPF);
-MA_API ma_result ma_hpf1_process_pcm_frames(ma_hpf1* pHPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_hpf1_get_latency(const ma_hpf1* pHPF);
-
-typedef struct
-{
-    ma_biquad bq;   /* The second order high-pass filter is implemented as a biquad filter. */
-} ma_hpf2;
-
-MA_API ma_result ma_hpf2_get_heap_size(const ma_hpf2_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_hpf2_init_preallocated(const ma_hpf2_config* pConfig, void* pHeap, ma_hpf2* pHPF);
-MA_API ma_result ma_hpf2_init(const ma_hpf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf2* pHPF);
-MA_API void ma_hpf2_uninit(ma_hpf2* pHPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_hpf2_reinit(const ma_hpf2_config* pConfig, ma_hpf2* pHPF);
-MA_API ma_result ma_hpf2_process_pcm_frames(ma_hpf2* pHPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_hpf2_get_latency(const ma_hpf2* pHPF);
-
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double cutoffFrequency;
-    ma_uint32 order;    /* If set to 0, will be treated as a passthrough (no filtering will be applied). */
-} ma_hpf_config;
-
-MA_API ma_hpf_config ma_hpf_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_uint32 hpf1Count;
-    ma_uint32 hpf2Count;
-    ma_hpf1* pHPF1;
-    ma_hpf2* pHPF2;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_hpf;
-
-MA_API ma_result ma_hpf_get_heap_size(const ma_hpf_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_hpf_init_preallocated(const ma_hpf_config* pConfig, void* pHeap, ma_hpf* pLPF);
-MA_API ma_result ma_hpf_init(const ma_hpf_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf* pHPF);
-MA_API void ma_hpf_uninit(ma_hpf* pHPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_hpf_reinit(const ma_hpf_config* pConfig, ma_hpf* pHPF);
-MA_API ma_result ma_hpf_process_pcm_frames(ma_hpf* pHPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_hpf_get_latency(const ma_hpf* pHPF);
-
-
-/**************************************************************************************************************************************************************
-
-Band-Pass Filtering
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double cutoffFrequency;
-    double q;
-} ma_bpf2_config;
-
-MA_API ma_bpf2_config ma_bpf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, double q);
-
-typedef struct
-{
-    ma_biquad bq;   /* The second order band-pass filter is implemented as a biquad filter. */
-} ma_bpf2;
-
-MA_API ma_result ma_bpf2_get_heap_size(const ma_bpf2_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_bpf2_init_preallocated(const ma_bpf2_config* pConfig, void* pHeap, ma_bpf2* pBPF);
-MA_API ma_result ma_bpf2_init(const ma_bpf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_bpf2* pBPF);
-MA_API void ma_bpf2_uninit(ma_bpf2* pBPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_bpf2_reinit(const ma_bpf2_config* pConfig, ma_bpf2* pBPF);
-MA_API ma_result ma_bpf2_process_pcm_frames(ma_bpf2* pBPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_bpf2_get_latency(const ma_bpf2* pBPF);
-
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double cutoffFrequency;
-    ma_uint32 order;    /* If set to 0, will be treated as a passthrough (no filtering will be applied). */
-} ma_bpf_config;
-
-MA_API ma_bpf_config ma_bpf_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 bpf2Count;
-    ma_bpf2* pBPF2;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_bpf;
-
-MA_API ma_result ma_bpf_get_heap_size(const ma_bpf_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_bpf_init_preallocated(const ma_bpf_config* pConfig, void* pHeap, ma_bpf* pBPF);
-MA_API ma_result ma_bpf_init(const ma_bpf_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_bpf* pBPF);
-MA_API void ma_bpf_uninit(ma_bpf* pBPF, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_bpf_reinit(const ma_bpf_config* pConfig, ma_bpf* pBPF);
-MA_API ma_result ma_bpf_process_pcm_frames(ma_bpf* pBPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_bpf_get_latency(const ma_bpf* pBPF);
-
-
-/**************************************************************************************************************************************************************
-
-Notching Filter
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double q;
-    double frequency;
-} ma_notch2_config, ma_notch_config;
-
-MA_API ma_notch2_config ma_notch2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double q, double frequency);
-
-typedef struct
-{
-    ma_biquad bq;
-} ma_notch2;
-
-MA_API ma_result ma_notch2_get_heap_size(const ma_notch2_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_notch2_init_preallocated(const ma_notch2_config* pConfig, void* pHeap, ma_notch2* pFilter);
-MA_API ma_result ma_notch2_init(const ma_notch2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_notch2* pFilter);
-MA_API void ma_notch2_uninit(ma_notch2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_notch2_reinit(const ma_notch2_config* pConfig, ma_notch2* pFilter);
-MA_API ma_result ma_notch2_process_pcm_frames(ma_notch2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_notch2_get_latency(const ma_notch2* pFilter);
-
-
-/**************************************************************************************************************************************************************
-
-Peaking EQ Filter
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double gainDB;
-    double q;
-    double frequency;
-} ma_peak2_config, ma_peak_config;
-
-MA_API ma_peak2_config ma_peak2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency);
-
-typedef struct
-{
-    ma_biquad bq;
-} ma_peak2;
-
-MA_API ma_result ma_peak2_get_heap_size(const ma_peak2_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_peak2_init_preallocated(const ma_peak2_config* pConfig, void* pHeap, ma_peak2* pFilter);
-MA_API ma_result ma_peak2_init(const ma_peak2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_peak2* pFilter);
-MA_API void ma_peak2_uninit(ma_peak2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_peak2_reinit(const ma_peak2_config* pConfig, ma_peak2* pFilter);
-MA_API ma_result ma_peak2_process_pcm_frames(ma_peak2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_peak2_get_latency(const ma_peak2* pFilter);
-
-
-/**************************************************************************************************************************************************************
-
-Low Shelf Filter
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double gainDB;
-    double shelfSlope;
-    double frequency;
-} ma_loshelf2_config, ma_loshelf_config;
-
-MA_API ma_loshelf2_config ma_loshelf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double shelfSlope, double frequency);
-
-typedef struct
-{
-    ma_biquad bq;
-} ma_loshelf2;
-
-MA_API ma_result ma_loshelf2_get_heap_size(const ma_loshelf2_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_loshelf2_init_preallocated(const ma_loshelf2_config* pConfig, void* pHeap, ma_loshelf2* pFilter);
-MA_API ma_result ma_loshelf2_init(const ma_loshelf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_loshelf2* pFilter);
-MA_API void ma_loshelf2_uninit(ma_loshelf2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_loshelf2_reinit(const ma_loshelf2_config* pConfig, ma_loshelf2* pFilter);
-MA_API ma_result ma_loshelf2_process_pcm_frames(ma_loshelf2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_loshelf2_get_latency(const ma_loshelf2* pFilter);
-
-
-/**************************************************************************************************************************************************************
-
-High Shelf Filter
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double gainDB;
-    double shelfSlope;
-    double frequency;
-} ma_hishelf2_config, ma_hishelf_config;
-
-MA_API ma_hishelf2_config ma_hishelf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double shelfSlope, double frequency);
-
-typedef struct
-{
-    ma_biquad bq;
-} ma_hishelf2;
-
-MA_API ma_result ma_hishelf2_get_heap_size(const ma_hishelf2_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_hishelf2_init_preallocated(const ma_hishelf2_config* pConfig, void* pHeap, ma_hishelf2* pFilter);
-MA_API ma_result ma_hishelf2_init(const ma_hishelf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hishelf2* pFilter);
-MA_API void ma_hishelf2_uninit(ma_hishelf2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_hishelf2_reinit(const ma_hishelf2_config* pConfig, ma_hishelf2* pFilter);
-MA_API ma_result ma_hishelf2_process_pcm_frames(ma_hishelf2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_uint32 ma_hishelf2_get_latency(const ma_hishelf2* pFilter);
-
-
-
-/*
-Delay
-*/
-typedef struct
-{
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_uint32 delayInFrames;
-    ma_bool32 delayStart;       /* Set to true to delay the start of the output; false otherwise. */
-    float wet;                  /* 0..1. Default = 1. */
-    float dry;                  /* 0..1. Default = 1. */
-    float decay;                /* 0..1. Default = 0 (no feedback). Feedback decay. Use this for echo. */
-} ma_delay_config;
-
-MA_API ma_delay_config ma_delay_config_init(ma_uint32 channels, ma_uint32 sampleRate, ma_uint32 delayInFrames, float decay);
-
-
-typedef struct
-{
-    ma_delay_config config;
-    ma_uint32 cursor;               /* Feedback is written to this cursor. Always equal or in front of the read cursor. */
-    ma_uint32 bufferSizeInFrames;
-    float* pBuffer;
-} ma_delay;
-
-MA_API ma_result ma_delay_init(const ma_delay_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_delay* pDelay);
-MA_API void ma_delay_uninit(ma_delay* pDelay, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_delay_process_pcm_frames(ma_delay* pDelay, void* pFramesOut, const void* pFramesIn, ma_uint32 frameCount);
-MA_API void ma_delay_set_wet(ma_delay* pDelay, float value);
-MA_API float ma_delay_get_wet(const ma_delay* pDelay);
-MA_API void ma_delay_set_dry(ma_delay* pDelay, float value);
-MA_API float ma_delay_get_dry(const ma_delay* pDelay);
-MA_API void ma_delay_set_decay(ma_delay* pDelay, float value);
-MA_API float ma_delay_get_decay(const ma_delay* pDelay);
-
-
-/* Gainer for smooth volume changes. */
-typedef struct
-{
-    ma_uint32 channels;
-    ma_uint32 smoothTimeInFrames;
-} ma_gainer_config;
-
-MA_API ma_gainer_config ma_gainer_config_init(ma_uint32 channels, ma_uint32 smoothTimeInFrames);
-
-
-typedef struct
-{
-    ma_gainer_config config;
-    ma_uint32 t;
-    float masterVolume;
-    float* pOldGains;
-    float* pNewGains;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_gainer;
-
-MA_API ma_result ma_gainer_get_heap_size(const ma_gainer_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_gainer_init_preallocated(const ma_gainer_config* pConfig, void* pHeap, ma_gainer* pGainer);
-MA_API ma_result ma_gainer_init(const ma_gainer_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_gainer* pGainer);
-MA_API void ma_gainer_uninit(ma_gainer* pGainer, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_gainer_process_pcm_frames(ma_gainer* pGainer, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_result ma_gainer_set_gain(ma_gainer* pGainer, float newGain);
-MA_API ma_result ma_gainer_set_gains(ma_gainer* pGainer, float* pNewGains);
-MA_API ma_result ma_gainer_set_master_volume(ma_gainer* pGainer, float volume);
-MA_API ma_result ma_gainer_get_master_volume(const ma_gainer* pGainer, float* pVolume);
-
-
-
-/* Stereo panner. */
-typedef enum
-{
-    ma_pan_mode_balance = 0,    /* Does not blend one side with the other. Technically just a balance. Compatible with other popular audio engines and therefore the default. */
-    ma_pan_mode_pan             /* A true pan. The sound from one side will "move" to the other side and blend with it. */
-} ma_pan_mode;
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_pan_mode mode;
-    float pan;
-} ma_panner_config;
-
-MA_API ma_panner_config ma_panner_config_init(ma_format format, ma_uint32 channels);
-
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_pan_mode mode;
-    float pan;  /* -1..1 where 0 is no pan, -1 is left side, +1 is right side. Defaults to 0. */
-} ma_panner;
-
-MA_API ma_result ma_panner_init(const ma_panner_config* pConfig, ma_panner* pPanner);
-MA_API ma_result ma_panner_process_pcm_frames(ma_panner* pPanner, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API void ma_panner_set_mode(ma_panner* pPanner, ma_pan_mode mode);
-MA_API ma_pan_mode ma_panner_get_mode(const ma_panner* pPanner);
-MA_API void ma_panner_set_pan(ma_panner* pPanner, float pan);
-MA_API float ma_panner_get_pan(const ma_panner* pPanner);
-
-
-
-/* Fader. */
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-} ma_fader_config;
-
-MA_API ma_fader_config ma_fader_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate);
-
-typedef struct
-{
-    ma_fader_config config;
-    float volumeBeg;            /* If volumeBeg and volumeEnd is equal to 1, no fading happens (ma_fader_process_pcm_frames() will run as a passthrough). */
-    float volumeEnd;
-    ma_uint64 lengthInFrames;   /* The total length of the fade. */
-    ma_int64  cursorInFrames;   /* The current time in frames. Incremented by ma_fader_process_pcm_frames(). Signed because it'll be offset by startOffsetInFrames in set_fade_ex(). */
-} ma_fader;
-
-MA_API ma_result ma_fader_init(const ma_fader_config* pConfig, ma_fader* pFader);
-MA_API ma_result ma_fader_process_pcm_frames(ma_fader* pFader, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API void ma_fader_get_data_format(const ma_fader* pFader, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate);
-MA_API void ma_fader_set_fade(ma_fader* pFader, float volumeBeg, float volumeEnd, ma_uint64 lengthInFrames);
-MA_API void ma_fader_set_fade_ex(ma_fader* pFader, float volumeBeg, float volumeEnd, ma_uint64 lengthInFrames, ma_int64 startOffsetInFrames);
-MA_API float ma_fader_get_current_volume(const ma_fader* pFader);
-
-
-
-/* Spatializer. */
-typedef struct
-{
-    float x;
-    float y;
-    float z;
-} ma_vec3f;
-
-typedef struct
-{
-    ma_vec3f v;
-    ma_spinlock lock;
-} ma_atomic_vec3f;
-
-typedef enum
-{
-    ma_attenuation_model_none,          /* No distance attenuation and no spatialization. */
-    ma_attenuation_model_inverse,       /* Equivalent to OpenAL's AL_INVERSE_DISTANCE_CLAMPED. */
-    ma_attenuation_model_linear,        /* Linear attenuation. Equivalent to OpenAL's AL_LINEAR_DISTANCE_CLAMPED. */
-    ma_attenuation_model_exponential    /* Exponential attenuation. Equivalent to OpenAL's AL_EXPONENT_DISTANCE_CLAMPED. */
-} ma_attenuation_model;
-
-typedef enum
-{
-    ma_positioning_absolute,
-    ma_positioning_relative
-} ma_positioning;
-
-typedef enum
-{
-    ma_handedness_right,
-    ma_handedness_left
-} ma_handedness;
-
-
-typedef struct
-{
-    ma_uint32 channelsOut;
-    ma_channel* pChannelMapOut;
-    ma_handedness handedness;   /* Defaults to right. Forward is -1 on the Z axis. In a left handed system, forward is +1 on the Z axis. */
-    float coneInnerAngleInRadians;
-    float coneOuterAngleInRadians;
-    float coneOuterGain;
-    float speedOfSound;
-    ma_vec3f worldUp;
-} ma_spatializer_listener_config;
-
-MA_API ma_spatializer_listener_config ma_spatializer_listener_config_init(ma_uint32 channelsOut);
-
-
-typedef struct
-{
-    ma_spatializer_listener_config config;
-    ma_atomic_vec3f position;  /* The absolute position of the listener. */
-    ma_atomic_vec3f direction; /* The direction the listener is facing. The world up vector is config.worldUp. */
-    ma_atomic_vec3f velocity;
-    ma_bool32 isEnabled;
-
-    /* Memory management. */
-    ma_bool32 _ownsHeap;
-    void* _pHeap;
-} ma_spatializer_listener;
-
-MA_API ma_result ma_spatializer_listener_get_heap_size(const ma_spatializer_listener_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_spatializer_listener_init_preallocated(const ma_spatializer_listener_config* pConfig, void* pHeap, ma_spatializer_listener* pListener);
-MA_API ma_result ma_spatializer_listener_init(const ma_spatializer_listener_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_spatializer_listener* pListener);
-MA_API void ma_spatializer_listener_uninit(ma_spatializer_listener* pListener, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_channel* ma_spatializer_listener_get_channel_map(ma_spatializer_listener* pListener);
-MA_API void ma_spatializer_listener_set_cone(ma_spatializer_listener* pListener, float innerAngleInRadians, float outerAngleInRadians, float outerGain);
-MA_API void ma_spatializer_listener_get_cone(const ma_spatializer_listener* pListener, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain);
-MA_API void ma_spatializer_listener_set_position(ma_spatializer_listener* pListener, float x, float y, float z);
-MA_API ma_vec3f ma_spatializer_listener_get_position(const ma_spatializer_listener* pListener);
-MA_API void ma_spatializer_listener_set_direction(ma_spatializer_listener* pListener, float x, float y, float z);
-MA_API ma_vec3f ma_spatializer_listener_get_direction(const ma_spatializer_listener* pListener);
-MA_API void ma_spatializer_listener_set_velocity(ma_spatializer_listener* pListener, float x, float y, float z);
-MA_API ma_vec3f ma_spatializer_listener_get_velocity(const ma_spatializer_listener* pListener);
-MA_API void ma_spatializer_listener_set_speed_of_sound(ma_spatializer_listener* pListener, float speedOfSound);
-MA_API float ma_spatializer_listener_get_speed_of_sound(const ma_spatializer_listener* pListener);
-MA_API void ma_spatializer_listener_set_world_up(ma_spatializer_listener* pListener, float x, float y, float z);
-MA_API ma_vec3f ma_spatializer_listener_get_world_up(const ma_spatializer_listener* pListener);
-MA_API void ma_spatializer_listener_set_enabled(ma_spatializer_listener* pListener, ma_bool32 isEnabled);
-MA_API ma_bool32 ma_spatializer_listener_is_enabled(const ma_spatializer_listener* pListener);
-
-
-typedef struct
-{
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_channel* pChannelMapIn;
-    ma_attenuation_model attenuationModel;
-    ma_positioning positioning;
-    ma_handedness handedness;           /* Defaults to right. Forward is -1 on the Z axis. In a left handed system, forward is +1 on the Z axis. */
-    float minGain;
-    float maxGain;
-    float minDistance;
-    float maxDistance;
-    float rolloff;
-    float coneInnerAngleInRadians;
-    float coneOuterAngleInRadians;
-    float coneOuterGain;
-    float dopplerFactor;                /* Set to 0 to disable doppler effect. */
-    float directionalAttenuationFactor; /* Set to 0 to disable directional attenuation. */
-    float minSpatializationChannelGain; /* The minimal scaling factor to apply to channel gains when accounting for the direction of the sound relative to the listener. Must be in the range of 0..1. Smaller values means more aggressive directional panning, larger values means more subtle directional panning. */
-    ma_uint32 gainSmoothTimeInFrames;   /* When the gain of a channel changes during spatialization, the transition will be linearly interpolated over this number of frames. */
-} ma_spatializer_config;
-
-MA_API ma_spatializer_config ma_spatializer_config_init(ma_uint32 channelsIn, ma_uint32 channelsOut);
-
-
-typedef struct
-{
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_channel* pChannelMapIn;
-    ma_attenuation_model attenuationModel;
-    ma_positioning positioning;
-    ma_handedness handedness;           /* Defaults to right. Forward is -1 on the Z axis. In a left handed system, forward is +1 on the Z axis. */
-    float minGain;
-    float maxGain;
-    float minDistance;
-    float maxDistance;
-    float rolloff;
-    float coneInnerAngleInRadians;
-    float coneOuterAngleInRadians;
-    float coneOuterGain;
-    float dopplerFactor;                /* Set to 0 to disable doppler effect. */
-    float directionalAttenuationFactor; /* Set to 0 to disable directional attenuation. */
-    ma_uint32 gainSmoothTimeInFrames;   /* When the gain of a channel changes during spatialization, the transition will be linearly interpolated over this number of frames. */
-    ma_atomic_vec3f position;
-    ma_atomic_vec3f direction;
-    ma_atomic_vec3f velocity;  /* For doppler effect. */
-    float dopplerPitch; /* Will be updated by ma_spatializer_process_pcm_frames() and can be used by higher level functions to apply a pitch shift for doppler effect. */
-    float minSpatializationChannelGain;
-    ma_gainer gainer;   /* For smooth gain transitions. */
-    float* pNewChannelGainsOut; /* An offset of _pHeap. Used by ma_spatializer_process_pcm_frames() to store new channel gains. The number of elements in this array is equal to config.channelsOut. */
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_spatializer;
-
-MA_API ma_result ma_spatializer_get_heap_size(const ma_spatializer_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_spatializer_init_preallocated(const ma_spatializer_config* pConfig, void* pHeap, ma_spatializer* pSpatializer);
-MA_API ma_result ma_spatializer_init(const ma_spatializer_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_uninit(ma_spatializer* pSpatializer, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_spatializer_process_pcm_frames(ma_spatializer* pSpatializer, ma_spatializer_listener* pListener, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_result ma_spatializer_set_master_volume(ma_spatializer* pSpatializer, float volume);
-MA_API ma_result ma_spatializer_get_master_volume(const ma_spatializer* pSpatializer, float* pVolume);
-MA_API ma_uint32 ma_spatializer_get_input_channels(const ma_spatializer* pSpatializer);
-MA_API ma_uint32 ma_spatializer_get_output_channels(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_attenuation_model(ma_spatializer* pSpatializer, ma_attenuation_model attenuationModel);
-MA_API ma_attenuation_model ma_spatializer_get_attenuation_model(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_positioning(ma_spatializer* pSpatializer, ma_positioning positioning);
-MA_API ma_positioning ma_spatializer_get_positioning(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_rolloff(ma_spatializer* pSpatializer, float rolloff);
-MA_API float ma_spatializer_get_rolloff(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_min_gain(ma_spatializer* pSpatializer, float minGain);
-MA_API float ma_spatializer_get_min_gain(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_max_gain(ma_spatializer* pSpatializer, float maxGain);
-MA_API float ma_spatializer_get_max_gain(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_min_distance(ma_spatializer* pSpatializer, float minDistance);
-MA_API float ma_spatializer_get_min_distance(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_max_distance(ma_spatializer* pSpatializer, float maxDistance);
-MA_API float ma_spatializer_get_max_distance(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_cone(ma_spatializer* pSpatializer, float innerAngleInRadians, float outerAngleInRadians, float outerGain);
-MA_API void ma_spatializer_get_cone(const ma_spatializer* pSpatializer, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain);
-MA_API void ma_spatializer_set_doppler_factor(ma_spatializer* pSpatializer, float dopplerFactor);
-MA_API float ma_spatializer_get_doppler_factor(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_directional_attenuation_factor(ma_spatializer* pSpatializer, float directionalAttenuationFactor);
-MA_API float ma_spatializer_get_directional_attenuation_factor(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_position(ma_spatializer* pSpatializer, float x, float y, float z);
-MA_API ma_vec3f ma_spatializer_get_position(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_direction(ma_spatializer* pSpatializer, float x, float y, float z);
-MA_API ma_vec3f ma_spatializer_get_direction(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_set_velocity(ma_spatializer* pSpatializer, float x, float y, float z);
-MA_API ma_vec3f ma_spatializer_get_velocity(const ma_spatializer* pSpatializer);
-MA_API void ma_spatializer_get_relative_position_and_direction(const ma_spatializer* pSpatializer, const ma_spatializer_listener* pListener, ma_vec3f* pRelativePos, ma_vec3f* pRelativeDir);
-
-
-
-/************************************************************************************************************************************************************
-*************************************************************************************************************************************************************
-
-DATA CONVERSION
-===============
-
-This section contains the APIs for data conversion. You will find everything here for channel mapping, sample format conversion, resampling, etc.
-
-*************************************************************************************************************************************************************
-************************************************************************************************************************************************************/
-
-/**************************************************************************************************************************************************************
-
-Resampling
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRateIn;
-    ma_uint32 sampleRateOut;
-    ma_uint32 lpfOrder;         /* The low-pass filter order. Setting this to 0 will disable low-pass filtering. */
-    double    lpfNyquistFactor; /* 0..1. Defaults to 1. 1 = Half the sampling frequency (Nyquist Frequency), 0.5 = Quarter the sampling frequency (half Nyquest Frequency), etc. */
-} ma_linear_resampler_config;
-
-MA_API ma_linear_resampler_config ma_linear_resampler_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut);
-
-typedef struct
-{
-    ma_linear_resampler_config config;
-    ma_uint32 inAdvanceInt;
-    ma_uint32 inAdvanceFrac;
-    ma_uint32 inTimeInt;
-    ma_uint32 inTimeFrac;
-    union
-    {
-        float* f32;
-        ma_int16* s16;
-    } x0; /* The previous input frame. */
-    union
-    {
-        float* f32;
-        ma_int16* s16;
-    } x1; /* The next input frame. */
-    ma_lpf lpf;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_linear_resampler;
-
-MA_API ma_result ma_linear_resampler_get_heap_size(const ma_linear_resampler_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_linear_resampler_init_preallocated(const ma_linear_resampler_config* pConfig, void* pHeap, ma_linear_resampler* pResampler);
-MA_API ma_result ma_linear_resampler_init(const ma_linear_resampler_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_linear_resampler* pResampler);
-MA_API void ma_linear_resampler_uninit(ma_linear_resampler* pResampler, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_linear_resampler_process_pcm_frames(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut);
-MA_API ma_result ma_linear_resampler_set_rate(ma_linear_resampler* pResampler, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut);
-MA_API ma_result ma_linear_resampler_set_rate_ratio(ma_linear_resampler* pResampler, float ratioInOut);
-MA_API ma_uint64 ma_linear_resampler_get_input_latency(const ma_linear_resampler* pResampler);
-MA_API ma_uint64 ma_linear_resampler_get_output_latency(const ma_linear_resampler* pResampler);
-MA_API ma_result ma_linear_resampler_get_required_input_frame_count(const ma_linear_resampler* pResampler, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount);
-MA_API ma_result ma_linear_resampler_get_expected_output_frame_count(const ma_linear_resampler* pResampler, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount);
-MA_API ma_result ma_linear_resampler_reset(ma_linear_resampler* pResampler);
-
-
-typedef struct ma_resampler_config ma_resampler_config;
-
-typedef void ma_resampling_backend;
-typedef struct
-{
-    ma_result (* onGetHeapSize                )(void* pUserData, const ma_resampler_config* pConfig, size_t* pHeapSizeInBytes);
-    ma_result (* onInit                       )(void* pUserData, const ma_resampler_config* pConfig, void* pHeap, ma_resampling_backend** ppBackend);
-    void      (* onUninit                     )(void* pUserData, ma_resampling_backend* pBackend, const ma_allocation_callbacks* pAllocationCallbacks);
-    ma_result (* onProcess                    )(void* pUserData, ma_resampling_backend* pBackend, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut);
-    ma_result (* onSetRate                    )(void* pUserData, ma_resampling_backend* pBackend, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut);                 /* Optional. Rate changes will be disabled. */
-    ma_uint64 (* onGetInputLatency            )(void* pUserData, const ma_resampling_backend* pBackend);                                                            /* Optional. Latency will be reported as 0. */
-    ma_uint64 (* onGetOutputLatency           )(void* pUserData, const ma_resampling_backend* pBackend);                                                            /* Optional. Latency will be reported as 0. */
-    ma_result (* onGetRequiredInputFrameCount )(void* pUserData, const ma_resampling_backend* pBackend, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount);   /* Optional. Latency mitigation will be disabled. */
-    ma_result (* onGetExpectedOutputFrameCount)(void* pUserData, const ma_resampling_backend* pBackend, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount);   /* Optional. Latency mitigation will be disabled. */
-    ma_result (* onReset                      )(void* pUserData, ma_resampling_backend* pBackend);
-} ma_resampling_backend_vtable;
-
-typedef enum
-{
-    ma_resample_algorithm_linear = 0,    /* Fastest, lowest quality. Optional low-pass filtering. Default. */
-    ma_resample_algorithm_custom,
-} ma_resample_algorithm;
-
-struct ma_resampler_config
-{
-    ma_format format;   /* Must be either ma_format_f32 or ma_format_s16. */
-    ma_uint32 channels;
-    ma_uint32 sampleRateIn;
-    ma_uint32 sampleRateOut;
-    ma_resample_algorithm algorithm;    /* When set to ma_resample_algorithm_custom, pBackendVTable will be used. */
-    ma_resampling_backend_vtable* pBackendVTable;
-    void* pBackendUserData;
-    struct
-    {
-        ma_uint32 lpfOrder;
-    } linear;
-};
-
-MA_API ma_resampler_config ma_resampler_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut, ma_resample_algorithm algorithm);
-
-typedef struct
-{
-    ma_resampling_backend* pBackend;
-    ma_resampling_backend_vtable* pBackendVTable;
-    void* pBackendUserData;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRateIn;
-    ma_uint32 sampleRateOut;
-    union
-    {
-        ma_linear_resampler linear;
-    } state;    /* State for stock resamplers so we can avoid a malloc. For stock resamplers, pBackend will point here. */
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_resampler;
-
-MA_API ma_result ma_resampler_get_heap_size(const ma_resampler_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_resampler_init_preallocated(const ma_resampler_config* pConfig, void* pHeap, ma_resampler* pResampler);
-
-/*
-Initializes a new resampler object from a config.
-*/
-MA_API ma_result ma_resampler_init(const ma_resampler_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_resampler* pResampler);
-
-/*
-Uninitializes a resampler.
-*/
-MA_API void ma_resampler_uninit(ma_resampler* pResampler, const ma_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Converts the given input data.
-
-Both the input and output frames must be in the format specified in the config when the resampler was initialized.
-
-On input, [pFrameCountOut] contains the number of output frames to process. On output it contains the number of output frames that
-were actually processed, which may be less than the requested amount which will happen if there's not enough input data. You can use
-ma_resampler_get_expected_output_frame_count() to know how many output frames will be processed for a given number of input frames.
-
-On input, [pFrameCountIn] contains the number of input frames contained in [pFramesIn]. On output it contains the number of whole
-input frames that were actually processed. You can use ma_resampler_get_required_input_frame_count() to know how many input frames
-you should provide for a given number of output frames. [pFramesIn] can be NULL, in which case zeroes will be used instead.
-
-If [pFramesOut] is NULL, a seek is performed. In this case, if [pFrameCountOut] is not NULL it will seek by the specified number of
-output frames. Otherwise, if [pFramesCountOut] is NULL and [pFrameCountIn] is not NULL, it will seek by the specified number of input
-frames. When seeking, [pFramesIn] is allowed to NULL, in which case the internal timing state will be updated, but no input will be
-processed. In this case, any internal filter state will be updated as if zeroes were passed in.
-
-It is an error for [pFramesOut] to be non-NULL and [pFrameCountOut] to be NULL.
-
-It is an error for both [pFrameCountOut] and [pFrameCountIn] to be NULL.
-*/
-MA_API ma_result ma_resampler_process_pcm_frames(ma_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut);
-
-
-/*
-Sets the input and output sample rate.
-*/
-MA_API ma_result ma_resampler_set_rate(ma_resampler* pResampler, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut);
-
-/*
-Sets the input and output sample rate as a ratio.
-
-The ration is in/out.
-*/
-MA_API ma_result ma_resampler_set_rate_ratio(ma_resampler* pResampler, float ratio);
-
-/*
-Retrieves the latency introduced by the resampler in input frames.
-*/
-MA_API ma_uint64 ma_resampler_get_input_latency(const ma_resampler* pResampler);
-
-/*
-Retrieves the latency introduced by the resampler in output frames.
-*/
-MA_API ma_uint64 ma_resampler_get_output_latency(const ma_resampler* pResampler);
-
-/*
-Calculates the number of whole input frames that would need to be read from the client in order to output the specified
-number of output frames.
-
-The returned value does not include cached input frames. It only returns the number of extra frames that would need to be
-read from the input buffer in order to output the specified number of output frames.
-*/
-MA_API ma_result ma_resampler_get_required_input_frame_count(const ma_resampler* pResampler, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount);
-
-/*
-Calculates the number of whole output frames that would be output after fully reading and consuming the specified number of
-input frames.
-*/
-MA_API ma_result ma_resampler_get_expected_output_frame_count(const ma_resampler* pResampler, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount);
-
-/*
-Resets the resampler's timer and clears it's internal cache.
-*/
-MA_API ma_result ma_resampler_reset(ma_resampler* pResampler);
-
-
-/**************************************************************************************************************************************************************
-
-Channel Conversion
-
-**************************************************************************************************************************************************************/
-typedef enum
-{
-    ma_channel_conversion_path_unknown,
-    ma_channel_conversion_path_passthrough,
-    ma_channel_conversion_path_mono_out,    /* Converting to mono. */
-    ma_channel_conversion_path_mono_in,     /* Converting from mono. */
-    ma_channel_conversion_path_shuffle,     /* Simple shuffle. Will use this when all channels are present in both input and output channel maps, but just in a different order. */
-    ma_channel_conversion_path_weights      /* Blended based on weights. */
-} ma_channel_conversion_path;
-
-typedef enum
-{
-    ma_mono_expansion_mode_duplicate = 0,   /* The default. */
-    ma_mono_expansion_mode_average,         /* Average the mono channel across all channels. */
-    ma_mono_expansion_mode_stereo_only,     /* Duplicate to the left and right channels only and ignore the others. */
-    ma_mono_expansion_mode_default = ma_mono_expansion_mode_duplicate
-} ma_mono_expansion_mode;
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    const ma_channel* pChannelMapIn;
-    const ma_channel* pChannelMapOut;
-    ma_channel_mix_mode mixingMode;
-    ma_bool32 calculateLFEFromSpatialChannels;  /* When an output LFE channel is present, but no input LFE, set to true to set the output LFE to the average of all spatial channels (LR, FR, etc.). Ignored when an input LFE is present. */
-    float** ppWeights;  /* [in][out]. Only used when mixingMode is set to ma_channel_mix_mode_custom_weights. */
-} ma_channel_converter_config;
-
-MA_API ma_channel_converter_config ma_channel_converter_config_init(ma_format format, ma_uint32 channelsIn, const ma_channel* pChannelMapIn, ma_uint32 channelsOut, const ma_channel* pChannelMapOut, ma_channel_mix_mode mixingMode);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_channel_mix_mode mixingMode;
-    ma_channel_conversion_path conversionPath;
-    ma_channel* pChannelMapIn;
-    ma_channel* pChannelMapOut;
-    ma_uint8* pShuffleTable;    /* Indexed by output channel index. */
-    union
-    {
-        float**    f32;
-        ma_int32** s16;
-    } weights;  /* [in][out] */
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_channel_converter;
-
-MA_API ma_result ma_channel_converter_get_heap_size(const ma_channel_converter_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_channel_converter_init_preallocated(const ma_channel_converter_config* pConfig, void* pHeap, ma_channel_converter* pConverter);
-MA_API ma_result ma_channel_converter_init(const ma_channel_converter_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_channel_converter* pConverter);
-MA_API void ma_channel_converter_uninit(ma_channel_converter* pConverter, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_channel_converter_process_pcm_frames(ma_channel_converter* pConverter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount);
-MA_API ma_result ma_channel_converter_get_input_channel_map(const ma_channel_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_channel_converter_get_output_channel_map(const ma_channel_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap);
-
-
-/**************************************************************************************************************************************************************
-
-Data Conversion
-
-**************************************************************************************************************************************************************/
-typedef struct
-{
-    ma_format formatIn;
-    ma_format formatOut;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_uint32 sampleRateIn;
-    ma_uint32 sampleRateOut;
-    ma_channel* pChannelMapIn;
-    ma_channel* pChannelMapOut;
-    ma_dither_mode ditherMode;
-    ma_channel_mix_mode channelMixMode;
-    ma_bool32 calculateLFEFromSpatialChannels;  /* When an output LFE channel is present, but no input LFE, set to true to set the output LFE to the average of all spatial channels (LR, FR, etc.). Ignored when an input LFE is present. */
-    float** ppChannelWeights;  /* [in][out]. Only used when mixingMode is set to ma_channel_mix_mode_custom_weights. */
-    ma_bool32 allowDynamicSampleRate;
-    ma_resampler_config resampling;
-} ma_data_converter_config;
-
-MA_API ma_data_converter_config ma_data_converter_config_init_default(void);
-MA_API ma_data_converter_config ma_data_converter_config_init(ma_format formatIn, ma_format formatOut, ma_uint32 channelsIn, ma_uint32 channelsOut, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut);
-
-
-typedef enum
-{
-    ma_data_converter_execution_path_passthrough,       /* No conversion. */
-    ma_data_converter_execution_path_format_only,       /* Only format conversion. */
-    ma_data_converter_execution_path_channels_only,     /* Only channel conversion. */
-    ma_data_converter_execution_path_resample_only,     /* Only resampling. */
-    ma_data_converter_execution_path_resample_first,    /* All conversions, but resample as the first step. */
-    ma_data_converter_execution_path_channels_first     /* All conversions, but channels as the first step. */
-} ma_data_converter_execution_path;
-
-typedef struct
-{
-    ma_format formatIn;
-    ma_format formatOut;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_uint32 sampleRateIn;
-    ma_uint32 sampleRateOut;
-    ma_dither_mode ditherMode;
-    ma_data_converter_execution_path executionPath; /* The execution path the data converter will follow when processing. */
-    ma_channel_converter channelConverter;
-    ma_resampler resampler;
-    ma_bool8 hasPreFormatConversion;
-    ma_bool8 hasPostFormatConversion;
-    ma_bool8 hasChannelConverter;
-    ma_bool8 hasResampler;
-    ma_bool8 isPassthrough;
-
-    /* Memory management. */
-    ma_bool8 _ownsHeap;
-    void* _pHeap;
-} ma_data_converter;
-
-MA_API ma_result ma_data_converter_get_heap_size(const ma_data_converter_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_data_converter_init_preallocated(const ma_data_converter_config* pConfig, void* pHeap, ma_data_converter* pConverter);
-MA_API ma_result ma_data_converter_init(const ma_data_converter_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_converter* pConverter);
-MA_API void ma_data_converter_uninit(ma_data_converter* pConverter, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_data_converter_process_pcm_frames(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut);
-MA_API ma_result ma_data_converter_set_rate(ma_data_converter* pConverter, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut);
-MA_API ma_result ma_data_converter_set_rate_ratio(ma_data_converter* pConverter, float ratioInOut);
-MA_API ma_uint64 ma_data_converter_get_input_latency(const ma_data_converter* pConverter);
-MA_API ma_uint64 ma_data_converter_get_output_latency(const ma_data_converter* pConverter);
-MA_API ma_result ma_data_converter_get_required_input_frame_count(const ma_data_converter* pConverter, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount);
-MA_API ma_result ma_data_converter_get_expected_output_frame_count(const ma_data_converter* pConverter, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount);
-MA_API ma_result ma_data_converter_get_input_channel_map(const ma_data_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_data_converter_get_output_channel_map(const ma_data_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_data_converter_reset(ma_data_converter* pConverter);
-
-
-/************************************************************************************************************************************************************
-
-Format Conversion
-
-************************************************************************************************************************************************************/
-MA_API void ma_pcm_u8_to_s16(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_u8_to_s24(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_u8_to_s32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_u8_to_f32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s16_to_u8(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s16_to_s24(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s16_to_s32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s16_to_f32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s24_to_u8(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s24_to_s16(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s24_to_s32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s24_to_f32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s32_to_u8(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s32_to_s16(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s32_to_s24(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_s32_to_f32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_f32_to_u8(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_f32_to_s16(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_f32_to_s24(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_f32_to_s32(void* pOut, const void* pIn, ma_uint64 count, ma_dither_mode ditherMode);
-MA_API void ma_pcm_convert(void* pOut, ma_format formatOut, const void* pIn, ma_format formatIn, ma_uint64 sampleCount, ma_dither_mode ditherMode);
-MA_API void ma_convert_pcm_frames_format(void* pOut, ma_format formatOut, const void* pIn, ma_format formatIn, ma_uint64 frameCount, ma_uint32 channels, ma_dither_mode ditherMode);
-
-/*
-Deinterleaves an interleaved buffer.
-*/
-MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void* pInterleavedPCMFrames, void** ppDeinterleavedPCMFrames);
-
-/*
-Interleaves a group of deinterleaved buffers.
-*/
-MA_API void ma_interleave_pcm_frames(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void** ppDeinterleavedPCMFrames, void* pInterleavedPCMFrames);
-
-
-/************************************************************************************************************************************************************
-
-Channel Maps
-
-************************************************************************************************************************************************************/
-/*
-This is used in the shuffle table to indicate that the channel index is undefined and should be ignored.
-*/
-#define MA_CHANNEL_INDEX_NULL   255
-
-/*
-Retrieves the channel position of the specified channel in the given channel map.
-
-The pChannelMap parameter can be null, in which case miniaudio's default channel map will be assumed.
-*/
-MA_API ma_channel ma_channel_map_get_channel(const ma_channel* pChannelMap, ma_uint32 channelCount, ma_uint32 channelIndex);
-
-/*
-Initializes a blank channel map.
-
-When a blank channel map is specified anywhere it indicates that the native channel map should be used.
-*/
-MA_API void ma_channel_map_init_blank(ma_channel* pChannelMap, ma_uint32 channels);
-
-/*
-Helper for retrieving a standard channel map.
-
-The output channel map buffer must have a capacity of at least `channelMapCap`.
-*/
-MA_API void ma_channel_map_init_standard(ma_standard_channel_map standardChannelMap, ma_channel* pChannelMap, size_t channelMapCap, ma_uint32 channels);
-
-/*
-Copies a channel map.
-
-Both input and output channel map buffers must have a capacity of at at least `channels`.
-*/
-MA_API void ma_channel_map_copy(ma_channel* pOut, const ma_channel* pIn, ma_uint32 channels);
-
-/*
-Copies a channel map if one is specified, otherwise copies the default channel map.
-
-The output buffer must have a capacity of at least `channels`. If not NULL, the input channel map must also have a capacity of at least `channels`.
-*/
-MA_API void ma_channel_map_copy_or_default(ma_channel* pOut, size_t channelMapCapOut, const ma_channel* pIn, ma_uint32 channels);
-
-
-/*
-Determines whether or not a channel map is valid.
-
-A blank channel map is valid (all channels set to MA_CHANNEL_NONE). The way a blank channel map is handled is context specific, but
-is usually treated as a passthrough.
-
-Invalid channel maps:
-  - A channel map with no channels
-  - A channel map with more than one channel and a mono channel
-
-The channel map buffer must have a capacity of at least `channels`.
-*/
-MA_API ma_bool32 ma_channel_map_is_valid(const ma_channel* pChannelMap, ma_uint32 channels);
-
-/*
-Helper for comparing two channel maps for equality.
-
-This assumes the channel count is the same between the two.
-
-Both channels map buffers must have a capacity of at least `channels`.
-*/
-MA_API ma_bool32 ma_channel_map_is_equal(const ma_channel* pChannelMapA, const ma_channel* pChannelMapB, ma_uint32 channels);
-
-/*
-Helper for determining if a channel map is blank (all channels set to MA_CHANNEL_NONE).
-
-The channel map buffer must have a capacity of at least `channels`.
-*/
-MA_API ma_bool32 ma_channel_map_is_blank(const ma_channel* pChannelMap, ma_uint32 channels);
-
-/*
-Helper for determining whether or not a channel is present in the given channel map.
-
-The channel map buffer must have a capacity of at least `channels`.
-*/
-MA_API ma_bool32 ma_channel_map_contains_channel_position(ma_uint32 channels, const ma_channel* pChannelMap, ma_channel channelPosition);
-
-/*
-Find a channel position in the given channel map. Returns MA_TRUE if the channel is found; MA_FALSE otherwise. The
-index of the channel is output to `pChannelIndex`.
-
-The channel map buffer must have a capacity of at least `channels`.
-*/
-MA_API ma_bool32 ma_channel_map_find_channel_position(ma_uint32 channels, const ma_channel* pChannelMap, ma_channel channelPosition, ma_uint32* pChannelIndex);
-
-/*
-Generates a string representing the given channel map.
-
-This is for printing and debugging purposes, not serialization/deserialization.
-
-Returns the length of the string, not including the null terminator.
-*/
-MA_API size_t ma_channel_map_to_string(const ma_channel* pChannelMap, ma_uint32 channels, char* pBufferOut, size_t bufferCap);
-
-/*
-Retrieves a human readable version of a channel position.
-*/
-MA_API const char* ma_channel_position_to_string(ma_channel channel);
-
-
-/************************************************************************************************************************************************************
-
-Conversion Helpers
-
-************************************************************************************************************************************************************/
-
-/*
-High-level helper for doing a full format conversion in one go. Returns the number of output frames. Call this with pOut set to NULL to
-determine the required size of the output buffer. frameCountOut should be set to the capacity of pOut. If pOut is NULL, frameCountOut is
-ignored.
-
-A return value of 0 indicates an error.
-
-This function is useful for one-off bulk conversions, but if you're streaming data you should use the ma_data_converter APIs instead.
-*/
-MA_API ma_uint64 ma_convert_frames(void* pOut, ma_uint64 frameCountOut, ma_format formatOut, ma_uint32 channelsOut, ma_uint32 sampleRateOut, const void* pIn, ma_uint64 frameCountIn, ma_format formatIn, ma_uint32 channelsIn, ma_uint32 sampleRateIn);
-MA_API ma_uint64 ma_convert_frames_ex(void* pOut, ma_uint64 frameCountOut, const void* pIn, ma_uint64 frameCountIn, const ma_data_converter_config* pConfig);
-
-
-/************************************************************************************************************************************************************
-
-Data Source
-
-************************************************************************************************************************************************************/
-typedef void ma_data_source;
-
-#define MA_DATA_SOURCE_SELF_MANAGED_RANGE_AND_LOOP_POINT    0x00000001
-
-typedef struct
-{
-    ma_result (* onRead)(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-    ma_result (* onSeek)(ma_data_source* pDataSource, ma_uint64 frameIndex);
-    ma_result (* onGetDataFormat)(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-    ma_result (* onGetCursor)(ma_data_source* pDataSource, ma_uint64* pCursor);
-    ma_result (* onGetLength)(ma_data_source* pDataSource, ma_uint64* pLength);
-    ma_result (* onSetLooping)(ma_data_source* pDataSource, ma_bool32 isLooping);
-    ma_uint32 flags;
-} ma_data_source_vtable;
-
-typedef ma_data_source* (* ma_data_source_get_next_proc)(ma_data_source* pDataSource);
-
-typedef struct
-{
-    const ma_data_source_vtable* vtable;
-} ma_data_source_config;
-
-MA_API ma_data_source_config ma_data_source_config_init(void);
-
-
-typedef struct
-{
-    const ma_data_source_vtable* vtable;
-    ma_uint64 rangeBegInFrames;
-    ma_uint64 rangeEndInFrames;             /* Set to -1 for unranged (default). */
-    ma_uint64 loopBegInFrames;              /* Relative to rangeBegInFrames. */
-    ma_uint64 loopEndInFrames;              /* Relative to rangeBegInFrames. Set to -1 for the end of the range. */
-    ma_data_source* pCurrent;               /* When non-NULL, the data source being initialized will act as a proxy and will route all operations to pCurrent. Used in conjunction with pNext/onGetNext for seamless chaining. */
-    ma_data_source* pNext;                  /* When set to NULL, onGetNext will be used. */
-    ma_data_source_get_next_proc onGetNext; /* Will be used when pNext is NULL. If both are NULL, no next will be used. */
-    MA_ATOMIC(4, ma_bool32) isLooping;
-} ma_data_source_base;
-
-MA_API ma_result ma_data_source_init(const ma_data_source_config* pConfig, ma_data_source* pDataSource);
-MA_API void ma_data_source_uninit(ma_data_source* pDataSource);
-MA_API ma_result ma_data_source_read_pcm_frames(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);   /* Must support pFramesOut = NULL in which case a forward seek should be performed. */
-MA_API ma_result ma_data_source_seek_pcm_frames(ma_data_source* pDataSource, ma_uint64 frameCount, ma_uint64* pFramesSeeked); /* Can only seek forward. Equivalent to ma_data_source_read_pcm_frames(pDataSource, NULL, frameCount, &framesRead); */
-MA_API ma_result ma_data_source_seek_to_pcm_frame(ma_data_source* pDataSource, ma_uint64 frameIndex);
-MA_API ma_result ma_data_source_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_data_source_get_cursor_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pCursor);
-MA_API ma_result ma_data_source_get_length_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pLength);    /* Returns MA_NOT_IMPLEMENTED if the length is unknown or cannot be determined. Decoders can return this. */
-MA_API ma_result ma_data_source_get_cursor_in_seconds(ma_data_source* pDataSource, float* pCursor);
-MA_API ma_result ma_data_source_get_length_in_seconds(ma_data_source* pDataSource, float* pLength);
-MA_API ma_result ma_data_source_set_looping(ma_data_source* pDataSource, ma_bool32 isLooping);
-MA_API ma_bool32 ma_data_source_is_looping(const ma_data_source* pDataSource);
-MA_API ma_result ma_data_source_set_range_in_pcm_frames(ma_data_source* pDataSource, ma_uint64 rangeBegInFrames, ma_uint64 rangeEndInFrames);
-MA_API void ma_data_source_get_range_in_pcm_frames(const ma_data_source* pDataSource, ma_uint64* pRangeBegInFrames, ma_uint64* pRangeEndInFrames);
-MA_API ma_result ma_data_source_set_loop_point_in_pcm_frames(ma_data_source* pDataSource, ma_uint64 loopBegInFrames, ma_uint64 loopEndInFrames);
-MA_API void ma_data_source_get_loop_point_in_pcm_frames(const ma_data_source* pDataSource, ma_uint64* pLoopBegInFrames, ma_uint64* pLoopEndInFrames);
-MA_API ma_result ma_data_source_set_current(ma_data_source* pDataSource, ma_data_source* pCurrentDataSource);
-MA_API ma_data_source* ma_data_source_get_current(const ma_data_source* pDataSource);
-MA_API ma_result ma_data_source_set_next(ma_data_source* pDataSource, ma_data_source* pNextDataSource);
-MA_API ma_data_source* ma_data_source_get_next(const ma_data_source* pDataSource);
-MA_API ma_result ma_data_source_set_next_callback(ma_data_source* pDataSource, ma_data_source_get_next_proc onGetNext);
-MA_API ma_data_source_get_next_proc ma_data_source_get_next_callback(const ma_data_source* pDataSource);
-
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_uint64 cursor;
-    ma_uint64 sizeInFrames;
-    const void* pData;
-} ma_audio_buffer_ref;
-
-MA_API ma_result ma_audio_buffer_ref_init(ma_format format, ma_uint32 channels, const void* pData, ma_uint64 sizeInFrames, ma_audio_buffer_ref* pAudioBufferRef);
-MA_API void ma_audio_buffer_ref_uninit(ma_audio_buffer_ref* pAudioBufferRef);
-MA_API ma_result ma_audio_buffer_ref_set_data(ma_audio_buffer_ref* pAudioBufferRef, const void* pData, ma_uint64 sizeInFrames);
-MA_API ma_uint64 ma_audio_buffer_ref_read_pcm_frames(ma_audio_buffer_ref* pAudioBufferRef, void* pFramesOut, ma_uint64 frameCount, ma_bool32 loop);
-MA_API ma_result ma_audio_buffer_ref_seek_to_pcm_frame(ma_audio_buffer_ref* pAudioBufferRef, ma_uint64 frameIndex);
-MA_API ma_result ma_audio_buffer_ref_map(ma_audio_buffer_ref* pAudioBufferRef, void** ppFramesOut, ma_uint64* pFrameCount);
-MA_API ma_result ma_audio_buffer_ref_unmap(ma_audio_buffer_ref* pAudioBufferRef, ma_uint64 frameCount);    /* Returns MA_AT_END if the end has been reached. This should be considered successful. */
-MA_API ma_bool32 ma_audio_buffer_ref_at_end(const ma_audio_buffer_ref* pAudioBufferRef);
-MA_API ma_result ma_audio_buffer_ref_get_cursor_in_pcm_frames(const ma_audio_buffer_ref* pAudioBufferRef, ma_uint64* pCursor);
-MA_API ma_result ma_audio_buffer_ref_get_length_in_pcm_frames(const ma_audio_buffer_ref* pAudioBufferRef, ma_uint64* pLength);
-MA_API ma_result ma_audio_buffer_ref_get_available_frames(const ma_audio_buffer_ref* pAudioBufferRef, ma_uint64* pAvailableFrames);
-
-
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_uint64 sizeInFrames;
-    const void* pData;  /* If set to NULL, will allocate a block of memory for you. */
-    ma_allocation_callbacks allocationCallbacks;
-} ma_audio_buffer_config;
-
-MA_API ma_audio_buffer_config ma_audio_buffer_config_init(ma_format format, ma_uint32 channels, ma_uint64 sizeInFrames, const void* pData, const ma_allocation_callbacks* pAllocationCallbacks);
-
-typedef struct
-{
-    ma_audio_buffer_ref ref;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_bool32 ownsData;             /* Used to control whether or not miniaudio owns the data buffer. If set to true, pData will be freed in ma_audio_buffer_uninit(). */
-    ma_uint8 _pExtraData[1];        /* For allocating a buffer with the memory located directly after the other memory of the structure. */
-} ma_audio_buffer;
-
-MA_API ma_result ma_audio_buffer_init(const ma_audio_buffer_config* pConfig, ma_audio_buffer* pAudioBuffer);
-MA_API ma_result ma_audio_buffer_init_copy(const ma_audio_buffer_config* pConfig, ma_audio_buffer* pAudioBuffer);
-MA_API ma_result ma_audio_buffer_alloc_and_init(const ma_audio_buffer_config* pConfig, ma_audio_buffer** ppAudioBuffer);  /* Always copies the data. Doesn't make sense to use this otherwise. Use ma_audio_buffer_uninit_and_free() to uninit. */
-MA_API void ma_audio_buffer_uninit(ma_audio_buffer* pAudioBuffer);
-MA_API void ma_audio_buffer_uninit_and_free(ma_audio_buffer* pAudioBuffer);
-MA_API ma_uint64 ma_audio_buffer_read_pcm_frames(ma_audio_buffer* pAudioBuffer, void* pFramesOut, ma_uint64 frameCount, ma_bool32 loop);
-MA_API ma_result ma_audio_buffer_seek_to_pcm_frame(ma_audio_buffer* pAudioBuffer, ma_uint64 frameIndex);
-MA_API ma_result ma_audio_buffer_map(ma_audio_buffer* pAudioBuffer, void** ppFramesOut, ma_uint64* pFrameCount);
-MA_API ma_result ma_audio_buffer_unmap(ma_audio_buffer* pAudioBuffer, ma_uint64 frameCount);    /* Returns MA_AT_END if the end has been reached. This should be considered successful. */
-MA_API ma_bool32 ma_audio_buffer_at_end(const ma_audio_buffer* pAudioBuffer);
-MA_API ma_result ma_audio_buffer_get_cursor_in_pcm_frames(const ma_audio_buffer* pAudioBuffer, ma_uint64* pCursor);
-MA_API ma_result ma_audio_buffer_get_length_in_pcm_frames(const ma_audio_buffer* pAudioBuffer, ma_uint64* pLength);
-MA_API ma_result ma_audio_buffer_get_available_frames(const ma_audio_buffer* pAudioBuffer, ma_uint64* pAvailableFrames);
-
-
-/*
-Paged Audio Buffer
-==================
-A paged audio buffer is made up of a linked list of pages. It's expandable, but not shrinkable. It
-can be used for cases where audio data is streamed in asynchronously while allowing data to be read
-at the same time.
-
-This is lock-free, but not 100% thread safe. You can append a page and read from the buffer across
-simultaneously across different threads, however only one thread at a time can append, and only one
-thread at a time can read and seek.
-*/
-typedef struct ma_paged_audio_buffer_page ma_paged_audio_buffer_page;
-struct ma_paged_audio_buffer_page
-{
-    MA_ATOMIC(MA_SIZEOF_PTR, ma_paged_audio_buffer_page*) pNext;
-    ma_uint64 sizeInFrames;
-    ma_uint8 pAudioData[1];
-};
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_paged_audio_buffer_page head;                                /* Dummy head for the lock-free algorithm. Always has a size of 0. */
-    MA_ATOMIC(MA_SIZEOF_PTR, ma_paged_audio_buffer_page*) pTail;    /* Never null. Initially set to &head. */
-} ma_paged_audio_buffer_data;
-
-MA_API ma_result ma_paged_audio_buffer_data_init(ma_format format, ma_uint32 channels, ma_paged_audio_buffer_data* pData);
-MA_API void ma_paged_audio_buffer_data_uninit(ma_paged_audio_buffer_data* pData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_paged_audio_buffer_page* ma_paged_audio_buffer_data_get_head(ma_paged_audio_buffer_data* pData);
-MA_API ma_paged_audio_buffer_page* ma_paged_audio_buffer_data_get_tail(ma_paged_audio_buffer_data* pData);
-MA_API ma_result ma_paged_audio_buffer_data_get_length_in_pcm_frames(ma_paged_audio_buffer_data* pData, ma_uint64* pLength);
-MA_API ma_result ma_paged_audio_buffer_data_allocate_page(ma_paged_audio_buffer_data* pData, ma_uint64 pageSizeInFrames, const void* pInitialData, const ma_allocation_callbacks* pAllocationCallbacks, ma_paged_audio_buffer_page** ppPage);
-MA_API ma_result ma_paged_audio_buffer_data_free_page(ma_paged_audio_buffer_data* pData, ma_paged_audio_buffer_page* pPage, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_paged_audio_buffer_data_append_page(ma_paged_audio_buffer_data* pData, ma_paged_audio_buffer_page* pPage);
-MA_API ma_result ma_paged_audio_buffer_data_allocate_and_append_page(ma_paged_audio_buffer_data* pData, ma_uint32 pageSizeInFrames, const void* pInitialData, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-typedef struct
-{
-    ma_paged_audio_buffer_data* pData;  /* Must not be null. */
-} ma_paged_audio_buffer_config;
-
-MA_API ma_paged_audio_buffer_config ma_paged_audio_buffer_config_init(ma_paged_audio_buffer_data* pData);
-
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_paged_audio_buffer_data* pData;              /* Audio data is read from here. Cannot be null. */
-    ma_paged_audio_buffer_page* pCurrent;
-    ma_uint64 relativeCursor;                       /* Relative to the current page. */
-    ma_uint64 absoluteCursor;
-} ma_paged_audio_buffer;
-
-MA_API ma_result ma_paged_audio_buffer_init(const ma_paged_audio_buffer_config* pConfig, ma_paged_audio_buffer* pPagedAudioBuffer);
-MA_API void ma_paged_audio_buffer_uninit(ma_paged_audio_buffer* pPagedAudioBuffer);
-MA_API ma_result ma_paged_audio_buffer_read_pcm_frames(ma_paged_audio_buffer* pPagedAudioBuffer, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);   /* Returns MA_AT_END if no more pages available. */
-MA_API ma_result ma_paged_audio_buffer_seek_to_pcm_frame(ma_paged_audio_buffer* pPagedAudioBuffer, ma_uint64 frameIndex);
-MA_API ma_result ma_paged_audio_buffer_get_cursor_in_pcm_frames(ma_paged_audio_buffer* pPagedAudioBuffer, ma_uint64* pCursor);
-MA_API ma_result ma_paged_audio_buffer_get_length_in_pcm_frames(ma_paged_audio_buffer* pPagedAudioBuffer, ma_uint64* pLength);
-
-
-
-/************************************************************************************************************************************************************
-
-Ring Buffer
-
-************************************************************************************************************************************************************/
-typedef struct
-{
-    void* pBuffer;
-    ma_uint32 subbufferSizeInBytes;
-    ma_uint32 subbufferCount;
-    ma_uint32 subbufferStrideInBytes;
-    MA_ATOMIC(4, ma_uint32) encodedReadOffset;  /* Most significant bit is the loop flag. Lower 31 bits contains the actual offset in bytes. Must be used atomically. */
-    MA_ATOMIC(4, ma_uint32) encodedWriteOffset; /* Most significant bit is the loop flag. Lower 31 bits contains the actual offset in bytes. Must be used atomically. */
-    ma_bool8 ownsBuffer;                        /* Used to know whether or not miniaudio is responsible for free()-ing the buffer. */
-    ma_bool8 clearOnWriteAcquire;               /* When set, clears the acquired write buffer before returning from ma_rb_acquire_write(). */
-    ma_allocation_callbacks allocationCallbacks;
-} ma_rb;
-
-MA_API ma_result ma_rb_init_ex(size_t subbufferSizeInBytes, size_t subbufferCount, size_t subbufferStrideInBytes, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_rb* pRB);
-MA_API ma_result ma_rb_init(size_t bufferSizeInBytes, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_rb* pRB);
-MA_API void ma_rb_uninit(ma_rb* pRB);
-MA_API void ma_rb_reset(ma_rb* pRB);
-MA_API ma_result ma_rb_acquire_read(ma_rb* pRB, size_t* pSizeInBytes, void** ppBufferOut);
-MA_API ma_result ma_rb_commit_read(ma_rb* pRB, size_t sizeInBytes);
-MA_API ma_result ma_rb_acquire_write(ma_rb* pRB, size_t* pSizeInBytes, void** ppBufferOut);
-MA_API ma_result ma_rb_commit_write(ma_rb* pRB, size_t sizeInBytes);
-MA_API ma_result ma_rb_seek_read(ma_rb* pRB, size_t offsetInBytes);
-MA_API ma_result ma_rb_seek_write(ma_rb* pRB, size_t offsetInBytes);
-MA_API ma_int32 ma_rb_pointer_distance(ma_rb* pRB);    /* Returns the distance between the write pointer and the read pointer. Should never be negative for a correct program. Will return the number of bytes that can be read before the read pointer hits the write pointer. */
-MA_API ma_uint32 ma_rb_available_read(ma_rb* pRB);
-MA_API ma_uint32 ma_rb_available_write(ma_rb* pRB);
-MA_API size_t ma_rb_get_subbuffer_size(ma_rb* pRB);
-MA_API size_t ma_rb_get_subbuffer_stride(ma_rb* pRB);
-MA_API size_t ma_rb_get_subbuffer_offset(ma_rb* pRB, size_t subbufferIndex);
-MA_API void* ma_rb_get_subbuffer_ptr(ma_rb* pRB, size_t subbufferIndex, void* pBuffer);
-
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_rb rb;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate; /* Not required for the ring buffer itself, but useful for associating the data with some sample rate, particularly for data sources. */
-} ma_pcm_rb;
-
-MA_API ma_result ma_pcm_rb_init_ex(ma_format format, ma_uint32 channels, ma_uint32 subbufferSizeInFrames, ma_uint32 subbufferCount, ma_uint32 subbufferStrideInFrames, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_pcm_rb* pRB);
-MA_API ma_result ma_pcm_rb_init(ma_format format, ma_uint32 channels, ma_uint32 bufferSizeInFrames, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_pcm_rb* pRB);
-MA_API void ma_pcm_rb_uninit(ma_pcm_rb* pRB);
-MA_API void ma_pcm_rb_reset(ma_pcm_rb* pRB);
-MA_API ma_result ma_pcm_rb_acquire_read(ma_pcm_rb* pRB, ma_uint32* pSizeInFrames, void** ppBufferOut);
-MA_API ma_result ma_pcm_rb_commit_read(ma_pcm_rb* pRB, ma_uint32 sizeInFrames);
-MA_API ma_result ma_pcm_rb_acquire_write(ma_pcm_rb* pRB, ma_uint32* pSizeInFrames, void** ppBufferOut);
-MA_API ma_result ma_pcm_rb_commit_write(ma_pcm_rb* pRB, ma_uint32 sizeInFrames);
-MA_API ma_result ma_pcm_rb_seek_read(ma_pcm_rb* pRB, ma_uint32 offsetInFrames);
-MA_API ma_result ma_pcm_rb_seek_write(ma_pcm_rb* pRB, ma_uint32 offsetInFrames);
-MA_API ma_int32 ma_pcm_rb_pointer_distance(ma_pcm_rb* pRB); /* Return value is in frames. */
-MA_API ma_uint32 ma_pcm_rb_available_read(ma_pcm_rb* pRB);
-MA_API ma_uint32 ma_pcm_rb_available_write(ma_pcm_rb* pRB);
-MA_API ma_uint32 ma_pcm_rb_get_subbuffer_size(ma_pcm_rb* pRB);
-MA_API ma_uint32 ma_pcm_rb_get_subbuffer_stride(ma_pcm_rb* pRB);
-MA_API ma_uint32 ma_pcm_rb_get_subbuffer_offset(ma_pcm_rb* pRB, ma_uint32 subbufferIndex);
-MA_API void* ma_pcm_rb_get_subbuffer_ptr(ma_pcm_rb* pRB, ma_uint32 subbufferIndex, void* pBuffer);
-MA_API ma_format ma_pcm_rb_get_format(const ma_pcm_rb* pRB);
-MA_API ma_uint32 ma_pcm_rb_get_channels(const ma_pcm_rb* pRB);
-MA_API ma_uint32 ma_pcm_rb_get_sample_rate(const ma_pcm_rb* pRB);
-MA_API void ma_pcm_rb_set_sample_rate(ma_pcm_rb* pRB, ma_uint32 sampleRate);
-
-
-/*
-The idea of the duplex ring buffer is to act as the intermediary buffer when running two asynchronous devices in a duplex set up. The
-capture device writes to it, and then a playback device reads from it.
-
-At the moment this is just a simple naive implementation, but in the future I want to implement some dynamic resampling to seamlessly
-handle desyncs. Note that the API is work in progress and may change at any time in any version.
-
-The size of the buffer is based on the capture side since that's what'll be written to the buffer. It is based on the capture period size
-in frames. The internal sample rate of the capture device is also needed in order to calculate the size.
-*/
-typedef struct
-{
-    ma_pcm_rb rb;
-} ma_duplex_rb;
-
-MA_API ma_result ma_duplex_rb_init(ma_format captureFormat, ma_uint32 captureChannels, ma_uint32 sampleRate, ma_uint32 captureInternalSampleRate, ma_uint32 captureInternalPeriodSizeInFrames, const ma_allocation_callbacks* pAllocationCallbacks, ma_duplex_rb* pRB);
-MA_API ma_result ma_duplex_rb_uninit(ma_duplex_rb* pRB);
-
-
-/************************************************************************************************************************************************************
-
-Miscellaneous Helpers
-
-************************************************************************************************************************************************************/
-/*
-Retrieves a human readable description of the given result code.
-*/
-MA_API const char* ma_result_description(ma_result result);
-
-/*
-malloc()
-*/
-MA_API void* ma_malloc(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks);
-
-/*
-calloc()
-*/
-MA_API void* ma_calloc(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks);
-
-/*
-realloc()
-*/
-MA_API void* ma_realloc(void* p, size_t sz, const ma_allocation_callbacks* pAllocationCallbacks);
-
-/*
-free()
-*/
-MA_API void ma_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Performs an aligned malloc, with the assumption that the alignment is a power of 2.
-*/
-MA_API void* ma_aligned_malloc(size_t sz, size_t alignment, const ma_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Free's an aligned malloc'd buffer.
-*/
-MA_API void ma_aligned_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Retrieves a friendly name for a format.
-*/
-MA_API const char* ma_get_format_name(ma_format format);
-
-/*
-Blends two frames in floating point format.
-*/
-MA_API void ma_blend_f32(float* pOut, float* pInA, float* pInB, float factor, ma_uint32 channels);
-
-/*
-Retrieves the size of a sample in bytes for the given format.
-
-This API is efficient and is implemented using a lookup table.
-
-Thread Safety: SAFE
-  This API is pure.
-*/
-MA_API ma_uint32 ma_get_bytes_per_sample(ma_format format);
-static MA_INLINE ma_uint32 ma_get_bytes_per_frame(ma_format format, ma_uint32 channels) { return ma_get_bytes_per_sample(format) * channels; }
-
-/*
-Converts a log level to a string.
-*/
-MA_API const char* ma_log_level_to_string(ma_uint32 logLevel);
-
-
-
-
-/************************************************************************************************************************************************************
-
-Synchronization
-
-************************************************************************************************************************************************************/
-/*
-Locks a spinlock.
-*/
-MA_API ma_result ma_spinlock_lock(volatile ma_spinlock* pSpinlock);
-
-/*
-Locks a spinlock, but does not yield() when looping.
-*/
-MA_API ma_result ma_spinlock_lock_noyield(volatile ma_spinlock* pSpinlock);
-
-/*
-Unlocks a spinlock.
-*/
-MA_API ma_result ma_spinlock_unlock(volatile ma_spinlock* pSpinlock);
-
-
-#ifndef MA_NO_THREADING
-
-/*
-Creates a mutex.
-
-A mutex must be created from a valid context. A mutex is initially unlocked.
-*/
-MA_API ma_result ma_mutex_init(ma_mutex* pMutex);
-
-/*
-Deletes a mutex.
-*/
-MA_API void ma_mutex_uninit(ma_mutex* pMutex);
-
-/*
-Locks a mutex with an infinite timeout.
-*/
-MA_API void ma_mutex_lock(ma_mutex* pMutex);
-
-/*
-Unlocks a mutex.
-*/
-MA_API void ma_mutex_unlock(ma_mutex* pMutex);
-
-
-/*
-Initializes an auto-reset event.
-*/
-MA_API ma_result ma_event_init(ma_event* pEvent);
-
-/*
-Uninitializes an auto-reset event.
-*/
-MA_API void ma_event_uninit(ma_event* pEvent);
-
-/*
-Waits for the specified auto-reset event to become signalled.
-*/
-MA_API ma_result ma_event_wait(ma_event* pEvent);
-
-/*
-Signals the specified auto-reset event.
-*/
-MA_API ma_result ma_event_signal(ma_event* pEvent);
-#endif  /* MA_NO_THREADING */
-
-
-/*
-Fence
-=====
-This locks while the counter is larger than 0. Counter can be incremented and decremented by any
-thread, but care needs to be taken when waiting. It is possible for one thread to acquire the
-fence just as another thread returns from ma_fence_wait().
-
-The idea behind a fence is to allow you to wait for a group of operations to complete. When an
-operation starts, the counter is incremented which locks the fence. When the operation completes,
-the fence will be released which decrements the counter. ma_fence_wait() will block until the
-counter hits zero.
-
-If threading is disabled, ma_fence_wait() will spin on the counter.
-*/
-typedef struct
-{
-#ifndef MA_NO_THREADING
-    ma_event e;
-#endif
-    ma_uint32 counter;
-} ma_fence;
-
-MA_API ma_result ma_fence_init(ma_fence* pFence);
-MA_API void ma_fence_uninit(ma_fence* pFence);
-MA_API ma_result ma_fence_acquire(ma_fence* pFence);    /* Increment counter. */
-MA_API ma_result ma_fence_release(ma_fence* pFence);    /* Decrement counter. */
-MA_API ma_result ma_fence_wait(ma_fence* pFence);       /* Wait for counter to reach 0. */
-
-
-
-/*
-Notification callback for asynchronous operations.
-*/
-typedef void ma_async_notification;
-
-typedef struct
-{
-    void (* onSignal)(ma_async_notification* pNotification);
-} ma_async_notification_callbacks;
-
-MA_API ma_result ma_async_notification_signal(ma_async_notification* pNotification);
-
-
-/*
-Simple polling notification.
-
-This just sets a variable when the notification has been signalled which is then polled with ma_async_notification_poll_is_signalled()
-*/
-typedef struct
-{
-    ma_async_notification_callbacks cb;
-    ma_bool32 signalled;
-} ma_async_notification_poll;
-
-MA_API ma_result ma_async_notification_poll_init(ma_async_notification_poll* pNotificationPoll);
-MA_API ma_bool32 ma_async_notification_poll_is_signalled(const ma_async_notification_poll* pNotificationPoll);
-
-
-/*
-Event Notification
-
-This uses an ma_event. If threading is disabled (MA_NO_THREADING), initialization will fail.
-*/
-typedef struct
-{
-    ma_async_notification_callbacks cb;
-#ifndef MA_NO_THREADING
-    ma_event e;
-#endif
-} ma_async_notification_event;
-
-MA_API ma_result ma_async_notification_event_init(ma_async_notification_event* pNotificationEvent);
-MA_API ma_result ma_async_notification_event_uninit(ma_async_notification_event* pNotificationEvent);
-MA_API ma_result ma_async_notification_event_wait(ma_async_notification_event* pNotificationEvent);
-MA_API ma_result ma_async_notification_event_signal(ma_async_notification_event* pNotificationEvent);
-
-
-
-
-/************************************************************************************************************************************************************
-
-Job Queue
-
-************************************************************************************************************************************************************/
-
-/*
-Slot Allocator
---------------
-The idea of the slot allocator is for it to be used in conjunction with a fixed sized buffer. You use the slot allocator to allocator an index that can be used
-as the insertion point for an object.
-
-Slots are reference counted to help mitigate the ABA problem in the lock-free queue we use for tracking jobs.
-
-The slot index is stored in the low 32 bits. The reference counter is stored in the high 32 bits:
-
-    +-----------------+-----------------+
-    | 32 Bits         | 32 Bits         |
-    +-----------------+-----------------+
-    | Reference Count | Slot Index      |
-    +-----------------+-----------------+
-*/
-typedef struct
-{
-    ma_uint32 capacity;    /* The number of slots to make available. */
-} ma_slot_allocator_config;
-
-MA_API ma_slot_allocator_config ma_slot_allocator_config_init(ma_uint32 capacity);
-
-
-typedef struct
-{
-    MA_ATOMIC(4, ma_uint32) bitfield;   /* Must be used atomically because the allocation and freeing routines need to make copies of this which must never be optimized away by the compiler. */
-} ma_slot_allocator_group;
-
-typedef struct
-{
-    ma_slot_allocator_group* pGroups;   /* Slots are grouped in chunks of 32. */
-    ma_uint32* pSlots;                  /* 32 bits for reference counting for ABA mitigation. */
-    ma_uint32 count;                    /* Allocation count. */
-    ma_uint32 capacity;
-
-    /* Memory management. */
-    ma_bool32 _ownsHeap;
-    void* _pHeap;
-} ma_slot_allocator;
-
-MA_API ma_result ma_slot_allocator_get_heap_size(const ma_slot_allocator_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_slot_allocator_init_preallocated(const ma_slot_allocator_config* pConfig, void* pHeap, ma_slot_allocator* pAllocator);
-MA_API ma_result ma_slot_allocator_init(const ma_slot_allocator_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_slot_allocator* pAllocator);
-MA_API void ma_slot_allocator_uninit(ma_slot_allocator* pAllocator, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_slot_allocator_alloc(ma_slot_allocator* pAllocator, ma_uint64* pSlot);
-MA_API ma_result ma_slot_allocator_free(ma_slot_allocator* pAllocator, ma_uint64 slot);
-
-
-typedef struct ma_job ma_job;
-
-/*
-Callback for processing a job. Each job type will have their own processing callback which will be
-called by ma_job_process().
-*/
-typedef ma_result (* ma_job_proc)(ma_job* pJob);
-
-/* When a job type is added here an callback needs to be added go "g_jobVTable" in the implementation section. */
-typedef enum
-{
-    /* Miscellaneous. */
-    MA_JOB_TYPE_QUIT = 0,
-    MA_JOB_TYPE_CUSTOM,
-
-    /* Resource Manager. */
-    MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER_NODE,
-    MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER_NODE,
-    MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_BUFFER_NODE,
-    MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER,
-    MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER,
-    MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_STREAM,
-    MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_STREAM,
-    MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_STREAM,
-    MA_JOB_TYPE_RESOURCE_MANAGER_SEEK_DATA_STREAM,
-
-    /* Device. */
-    MA_JOB_TYPE_DEVICE_AAUDIO_REROUTE,
-
-    /* Count. Must always be last. */
-    MA_JOB_TYPE_COUNT
-} ma_job_type;
-
-struct ma_job
-{
-    union
-    {
-        struct
-        {
-            ma_uint16 code;         /* Job type. */
-            ma_uint16 slot;         /* Index into a ma_slot_allocator. */
-            ma_uint32 refcount;
-        } breakup;
-        ma_uint64 allocation;
-    } toc;  /* 8 bytes. We encode the job code into the slot allocation data to save space. */
-    MA_ATOMIC(8, ma_uint64) next; /* refcount + slot for the next item. Does not include the job code. */
-    ma_uint32 order;    /* Execution order. Used to create a data dependency and ensure a job is executed in order. Usage is contextual depending on the job type. */
-
-    union
-    {
-        /* Miscellaneous. */
-        struct
-        {
-            ma_job_proc proc;
-            ma_uintptr data0;
-            ma_uintptr data1;
-        } custom;
-
-        /* Resource Manager */
-        union
-        {
-            struct
-            {
-                /*ma_resource_manager**/ void* pResourceManager;
-                /*ma_resource_manager_data_buffer_node**/ void* pDataBufferNode;
-                char* pFilePath;
-                wchar_t* pFilePathW;
-                ma_uint32 flags;                                /* Resource manager data source flags that were used when initializing the data buffer. */
-                ma_async_notification* pInitNotification;       /* Signalled when the data buffer has been initialized and the format/channels/rate can be retrieved. */
-                ma_async_notification* pDoneNotification;       /* Signalled when the data buffer has been fully decoded. Will be passed through to MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_BUFFER_NODE when decoding. */
-                ma_fence* pInitFence;                           /* Released when initialization of the decoder is complete. */
-                ma_fence* pDoneFence;                           /* Released if initialization of the decoder fails. Passed through to PAGE_DATA_BUFFER_NODE untouched if init is successful. */
-            } loadDataBufferNode;
-            struct
-            {
-                /*ma_resource_manager**/ void* pResourceManager;
-                /*ma_resource_manager_data_buffer_node**/ void* pDataBufferNode;
-                ma_async_notification* pDoneNotification;
-                ma_fence* pDoneFence;
-            } freeDataBufferNode;
-            struct
-            {
-                /*ma_resource_manager**/ void* pResourceManager;
-                /*ma_resource_manager_data_buffer_node**/ void* pDataBufferNode;
-                /*ma_decoder**/ void* pDecoder;
-                ma_async_notification* pDoneNotification;       /* Signalled when the data buffer has been fully decoded. */
-                ma_fence* pDoneFence;                           /* Passed through from LOAD_DATA_BUFFER_NODE and released when the data buffer completes decoding or an error occurs. */
-            } pageDataBufferNode;
-
-            struct
-            {
-                /*ma_resource_manager_data_buffer**/ void* pDataBuffer;
-                ma_async_notification* pInitNotification;       /* Signalled when the data buffer has been initialized and the format/channels/rate can be retrieved. */
-                ma_async_notification* pDoneNotification;       /* Signalled when the data buffer has been fully decoded. */
-                ma_fence* pInitFence;                           /* Released when the data buffer has been initialized and the format/channels/rate can be retrieved. */
-                ma_fence* pDoneFence;                           /* Released when the data buffer has been fully decoded. */
-                ma_uint64 rangeBegInPCMFrames;
-                ma_uint64 rangeEndInPCMFrames;
-                ma_uint64 loopPointBegInPCMFrames;
-                ma_uint64 loopPointEndInPCMFrames;
-                ma_uint32 isLooping;
-            } loadDataBuffer;
-            struct
-            {
-                /*ma_resource_manager_data_buffer**/ void* pDataBuffer;
-                ma_async_notification* pDoneNotification;
-                ma_fence* pDoneFence;
-            } freeDataBuffer;
-
-            struct
-            {
-                /*ma_resource_manager_data_stream**/ void* pDataStream;
-                char* pFilePath;                            /* Allocated when the job is posted, freed by the job thread after loading. */
-                wchar_t* pFilePathW;                        /* ^ As above ^. Only used if pFilePath is NULL. */
-                ma_uint64 initialSeekPoint;
-                ma_async_notification* pInitNotification;   /* Signalled after the first two pages have been decoded and frames can be read from the stream. */
-                ma_fence* pInitFence;
-            } loadDataStream;
-            struct
-            {
-                /*ma_resource_manager_data_stream**/ void* pDataStream;
-                ma_async_notification* pDoneNotification;
-                ma_fence* pDoneFence;
-            } freeDataStream;
-            struct
-            {
-                /*ma_resource_manager_data_stream**/ void* pDataStream;
-                ma_uint32 pageIndex;                    /* The index of the page to decode into. */
-            } pageDataStream;
-            struct
-            {
-                /*ma_resource_manager_data_stream**/ void* pDataStream;
-                ma_uint64 frameIndex;
-            } seekDataStream;
-        } resourceManager;
-
-        /* Device. */
-        union
-        {
-            union
-            {
-                struct
-                {
-                    /*ma_device**/ void* pDevice;
-                    /*ma_device_type*/ ma_uint32 deviceType;
-                } reroute;
-            } aaudio;
-        } device;
-    } data;
-};
-
-MA_API ma_job ma_job_init(ma_uint16 code);
-MA_API ma_result ma_job_process(ma_job* pJob);
-
-
-/*
-When set, ma_job_queue_next() will not wait and no semaphore will be signaled in
-ma_job_queue_post(). ma_job_queue_next() will return MA_NO_DATA_AVAILABLE if nothing is available.
-
-This flag should always be used for platforms that do not support multithreading.
-*/
-typedef enum
-{
-    MA_JOB_QUEUE_FLAG_NON_BLOCKING = 0x00000001
-} ma_job_queue_flags;
-
-typedef struct
-{
-    ma_uint32 flags;
-    ma_uint32 capacity; /* The maximum number of jobs that can fit in the queue at a time. */
-} ma_job_queue_config;
-
-MA_API ma_job_queue_config ma_job_queue_config_init(ma_uint32 flags, ma_uint32 capacity);
-
-
-typedef struct
-{
-    ma_uint32 flags;                /* Flags passed in at initialization time. */
-    ma_uint32 capacity;             /* The maximum number of jobs that can fit in the queue at a time. Set by the config. */
-    MA_ATOMIC(8, ma_uint64) head;   /* The first item in the list. Required for removing from the top of the list. */
-    MA_ATOMIC(8, ma_uint64) tail;   /* The last item in the list. Required for appending to the end of the list. */
-#ifndef MA_NO_THREADING
-    ma_semaphore sem;               /* Only used when MA_JOB_QUEUE_FLAG_NON_BLOCKING is unset. */
-#endif
-    ma_slot_allocator allocator;
-    ma_job* pJobs;
-#ifndef MA_USE_EXPERIMENTAL_LOCK_FREE_JOB_QUEUE
-    ma_spinlock lock;
-#endif
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_job_queue;
-
-MA_API ma_result ma_job_queue_get_heap_size(const ma_job_queue_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_job_queue_init_preallocated(const ma_job_queue_config* pConfig, void* pHeap, ma_job_queue* pQueue);
-MA_API ma_result ma_job_queue_init(const ma_job_queue_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_job_queue* pQueue);
-MA_API void ma_job_queue_uninit(ma_job_queue* pQueue, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_job_queue_post(ma_job_queue* pQueue, const ma_job* pJob);
-MA_API ma_result ma_job_queue_next(ma_job_queue* pQueue, ma_job* pJob); /* Returns MA_CANCELLED if the next job is a quit job. */
-
-
-
-/************************************************************************************************************************************************************
-*************************************************************************************************************************************************************
-
-DEVICE I/O
-==========
-
-This section contains the APIs for device playback and capture. Here is where you'll find ma_device_init(), etc.
-
-*************************************************************************************************************************************************************
-************************************************************************************************************************************************************/
-#ifndef MA_NO_DEVICE_IO
-/* Some backends are only supported on certain platforms. */
-#if defined(MA_WIN32)
-    #define MA_SUPPORT_WASAPI
-
-    #if defined(MA_WIN32_DESKTOP)   /* DirectSound and WinMM backends are only supported on desktops. */
-        #define MA_SUPPORT_DSOUND
-        #define MA_SUPPORT_WINMM
-
-        /* Don't enable JACK here if compiling with Cosmopolitan. It'll be enabled in the Linux section below. */
-        #if !defined(__COSMOPOLITAN__)
-            #define MA_SUPPORT_JACK    /* JACK is technically supported on Windows, but I don't know how many people use it in practice... */
-        #endif
-    #endif
-#endif
-#if defined(MA_UNIX) && !defined(MA_ORBIS) && !defined(MA_PROSPERO)
-    #if defined(MA_LINUX)
-        #if !defined(MA_ANDROID) && !defined(__COSMOPOLITAN__)   /* ALSA is not supported on Android. */
-            #define MA_SUPPORT_ALSA
-        #endif
-    #endif
-    #if !defined(MA_BSD) && !defined(MA_ANDROID) && !defined(MA_EMSCRIPTEN)
-        #define MA_SUPPORT_PULSEAUDIO
-        #define MA_SUPPORT_JACK
-    #endif
-    #if defined(__OpenBSD__)        /* <-- Change this to "#if defined(MA_BSD)" to enable sndio on all BSD flavors. */
-        #define MA_SUPPORT_SNDIO    /* sndio is only supported on OpenBSD for now. May be expanded later if there's demand. */
-    #endif
-    #if defined(__NetBSD__) || defined(__OpenBSD__)
-        #define MA_SUPPORT_AUDIO4   /* Only support audio(4) on platforms with known support. */
-    #endif
-    #if defined(__FreeBSD__) || defined(__DragonFly__)
-        #define MA_SUPPORT_OSS      /* Only support OSS on specific platforms with known support. */
-    #endif
-#endif
-#if defined(MA_ANDROID)
-    #define MA_SUPPORT_AAUDIO
-    #define MA_SUPPORT_OPENSL
-#endif
-#if defined(MA_APPLE)
-    #define MA_SUPPORT_COREAUDIO
-#endif
-#if defined(MA_EMSCRIPTEN)
-    #define MA_SUPPORT_WEBAUDIO
-#endif
-
-/* All platforms should support custom backends. */
-#define MA_SUPPORT_CUSTOM
-
-/* Explicitly disable the Null backend for Emscripten because it uses a background thread which is not properly supported right now. */
-#if !defined(MA_EMSCRIPTEN)
-#define MA_SUPPORT_NULL
-#endif
-
-
-#if defined(MA_SUPPORT_WASAPI) && !defined(MA_NO_WASAPI) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_WASAPI))
-    #define MA_HAS_WASAPI
-#endif
-#if defined(MA_SUPPORT_DSOUND) && !defined(MA_NO_DSOUND) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_DSOUND))
-    #define MA_HAS_DSOUND
-#endif
-#if defined(MA_SUPPORT_WINMM) && !defined(MA_NO_WINMM) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_WINMM))
-    #define MA_HAS_WINMM
-#endif
-#if defined(MA_SUPPORT_ALSA) && !defined(MA_NO_ALSA) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_ALSA))
-    #define MA_HAS_ALSA
-#endif
-#if defined(MA_SUPPORT_PULSEAUDIO) && !defined(MA_NO_PULSEAUDIO) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_PULSEAUDIO))
-    #define MA_HAS_PULSEAUDIO
-#endif
-#if defined(MA_SUPPORT_JACK) && !defined(MA_NO_JACK) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_JACK))
-    #define MA_HAS_JACK
-#endif
-#if defined(MA_SUPPORT_COREAUDIO) && !defined(MA_NO_COREAUDIO) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_COREAUDIO))
-    #define MA_HAS_COREAUDIO
-#endif
-#if defined(MA_SUPPORT_SNDIO) && !defined(MA_NO_SNDIO) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_SNDIO))
-    #define MA_HAS_SNDIO
-#endif
-#if defined(MA_SUPPORT_AUDIO4) && !defined(MA_NO_AUDIO4) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_AUDIO4))
-    #define MA_HAS_AUDIO4
-#endif
-#if defined(MA_SUPPORT_OSS) && !defined(MA_NO_OSS) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_OSS))
-    #define MA_HAS_OSS
-#endif
-#if defined(MA_SUPPORT_AAUDIO) && !defined(MA_NO_AAUDIO) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_AAUDIO))
-    #define MA_HAS_AAUDIO
-#endif
-#if defined(MA_SUPPORT_OPENSL) && !defined(MA_NO_OPENSL) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_OPENSL))
-    #define MA_HAS_OPENSL
-#endif
-#if defined(MA_SUPPORT_WEBAUDIO) && !defined(MA_NO_WEBAUDIO) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_WEBAUDIO))
-    #define MA_HAS_WEBAUDIO
-#endif
-#if defined(MA_SUPPORT_CUSTOM) && !defined(MA_NO_CUSTOM) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_CUSTOM))
-    #define MA_HAS_CUSTOM
-#endif
-#if defined(MA_SUPPORT_NULL) && !defined(MA_NO_NULL) && (!defined(MA_ENABLE_ONLY_SPECIFIC_BACKENDS) || defined(MA_ENABLE_NULL))
-    #define MA_HAS_NULL
-#endif
-
-typedef enum
-{
-    ma_device_state_uninitialized = 0,
-    ma_device_state_stopped       = 1,  /* The device's default state after initialization. */
-    ma_device_state_started       = 2,  /* The device is started and is requesting and/or delivering audio data. */
-    ma_device_state_starting      = 3,  /* Transitioning from a stopped state to started. */
-    ma_device_state_stopping      = 4   /* Transitioning from a started state to stopped. */
-} ma_device_state;
-
-MA_ATOMIC_SAFE_TYPE_DECL(i32, 4, device_state)
-
-
-#ifdef MA_SUPPORT_WASAPI
-/* We need a IMMNotificationClient object for WASAPI. */
-typedef struct
-{
-    void* lpVtbl;
-    ma_uint32 counter;
-    ma_device* pDevice;
-} ma_IMMNotificationClient;
-#endif
-
-/* Backend enums must be in priority order. */
-typedef enum
-{
-    ma_backend_wasapi,
-    ma_backend_dsound,
-    ma_backend_winmm,
-    ma_backend_coreaudio,
-    ma_backend_sndio,
-    ma_backend_audio4,
-    ma_backend_oss,
-    ma_backend_pulseaudio,
-    ma_backend_alsa,
-    ma_backend_jack,
-    ma_backend_aaudio,
-    ma_backend_opensl,
-    ma_backend_webaudio,
-    ma_backend_custom,  /* <-- Custom backend, with callbacks defined by the context config. */
-    ma_backend_null     /* <-- Must always be the last item. Lowest priority, and used as the terminator for backend enumeration. */
-} ma_backend;
-
-#define MA_BACKEND_COUNT (ma_backend_null+1)
-
-
-/*
-Device job thread. This is used by backends that require asynchronous processing of certain
-operations. It is not used by all backends.
-
-The device job thread is made up of a thread and a job queue. You can post a job to the thread with
-ma_device_job_thread_post(). The thread will do the processing of the job.
-*/
-typedef struct
-{
-    ma_bool32 noThread; /* Set this to true if you want to process jobs yourself. */
-    ma_uint32 jobQueueCapacity;
-    ma_uint32 jobQueueFlags;
-} ma_device_job_thread_config;
-
-MA_API ma_device_job_thread_config ma_device_job_thread_config_init(void);
-
-typedef struct
-{
-    ma_thread thread;
-    ma_job_queue jobQueue;
-    ma_bool32 _hasThread;
-} ma_device_job_thread;
-
-MA_API ma_result ma_device_job_thread_init(const ma_device_job_thread_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_device_job_thread* pJobThread);
-MA_API void ma_device_job_thread_uninit(ma_device_job_thread* pJobThread, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_device_job_thread_post(ma_device_job_thread* pJobThread, const ma_job* pJob);
-MA_API ma_result ma_device_job_thread_next(ma_device_job_thread* pJobThread, ma_job* pJob);
-
-
-
-/* Device notification types. */
-typedef enum
-{
-    ma_device_notification_type_started,
-    ma_device_notification_type_stopped,
-    ma_device_notification_type_rerouted,
-    ma_device_notification_type_interruption_began,
-    ma_device_notification_type_interruption_ended,
-    ma_device_notification_type_unlocked
-} ma_device_notification_type;
-
-typedef struct
-{
-    ma_device* pDevice;
-    ma_device_notification_type type;
-    union
-    {
-        struct
-        {
-            int _unused;
-        } started;
-        struct
-        {
-            int _unused;
-        } stopped;
-        struct
-        {
-            int _unused;
-        } rerouted;
-        struct
-        {
-            int _unused;
-        } interruption;
-    } data;
-} ma_device_notification;
-
-/*
-The notification callback for when the application should be notified of a change to the device.
-
-This callback is used for notifying the application of changes such as when the device has started,
-stopped, rerouted or an interruption has occurred. Note that not all backends will post all
-notification types. For example, some backends will perform automatic stream routing without any
-kind of notification to the host program which means miniaudio will never know about it and will
-never be able to fire the rerouted notification. You should keep this in mind when designing your
-program.
-
-The stopped notification will *not* get fired when a device is rerouted.
-
-
-Parameters
-----------
-pNotification (in)
-    A pointer to a structure containing information about the event. Use the `pDevice` member of
-    this object to retrieve the relevant device. The `type` member can be used to discriminate
-    against each of the notification types.
-
-
-Remarks
--------
-Do not restart or uninitialize the device from the callback.
-
-Not all notifications will be triggered by all backends, however the started and stopped events
-should be reliable for all backends. Some backends do not have a good way to detect device
-stoppages due to unplugging the device which may result in the stopped callback not getting
-fired. This has been observed with at least one BSD variant.
-
-The rerouted notification is fired *after* the reroute has occurred. The stopped notification will
-*not* get fired when a device is rerouted. The following backends are known to do automatic stream
-rerouting, but do not have a way to be notified of the change:
-
-  * DirectSound
-
-The interruption notifications are used on mobile platforms for detecting when audio is interrupted
-due to things like an incoming phone call. Currently this is only implemented on iOS. None of the
-Android backends will report this notification.
-*/
-typedef void (* ma_device_notification_proc)(const ma_device_notification* pNotification);
-
-
-/*
-The callback for processing audio data from the device.
-
-The data callback is fired by miniaudio whenever the device needs to have more data delivered to a playback device, or when a capture device has some data
-available. This is called as soon as the backend asks for more data which means it may be called with inconsistent frame counts. You cannot assume the
-callback will be fired with a consistent frame count.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the relevant device.
-
-pOutput (out)
-    A pointer to the output buffer that will receive audio data that will later be played back through the speakers. This will be non-null for a playback or
-    full-duplex device and null for a capture and loopback device.
-
-pInput (in)
-    A pointer to the buffer containing input data from a recording device. This will be non-null for a capture, full-duplex or loopback device and null for a
-    playback device.
-
-frameCount (in)
-    The number of PCM frames to process. Note that this will not necessarily be equal to what you requested when you initialized the device. The
-    `periodSizeInFrames` and `periodSizeInMilliseconds` members of the device config are just hints, and are not necessarily exactly what you'll get. You must
-    not assume this will always be the same value each time the callback is fired.
-
-
-Remarks
--------
-You cannot stop and start the device from inside the callback or else you'll get a deadlock. You must also not uninitialize the device from inside the
-callback. The following APIs cannot be called from inside the callback:
-
-    ma_device_init()
-    ma_device_init_ex()
-    ma_device_uninit()
-    ma_device_start()
-    ma_device_stop()
-
-The proper way to stop the device is to call `ma_device_stop()` from a different thread, normally the main application thread.
-*/
-typedef void (* ma_device_data_proc)(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount);
-
-
-
-
-/*
-DEPRECATED. Use ma_device_notification_proc instead.
-
-The callback for when the device has been stopped.
-
-This will be called when the device is stopped explicitly with `ma_device_stop()` and also called implicitly when the device is stopped through external forces
-such as being unplugged or an internal error occurring.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device that has just stopped.
-
-
-Remarks
--------
-Do not restart or uninitialize the device from the callback.
-*/
-typedef void (* ma_stop_proc)(ma_device* pDevice);  /* DEPRECATED. Use ma_device_notification_proc instead. */
-
-typedef enum
-{
-    ma_device_type_playback = 1,
-    ma_device_type_capture  = 2,
-    ma_device_type_duplex   = ma_device_type_playback | ma_device_type_capture, /* 3 */
-    ma_device_type_loopback = 4
-} ma_device_type;
-
-typedef enum
-{
-    ma_share_mode_shared = 0,
-    ma_share_mode_exclusive
-} ma_share_mode;
-
-/* iOS/tvOS/watchOS session categories. */
-typedef enum
-{
-    ma_ios_session_category_default = 0,        /* AVAudioSessionCategoryPlayAndRecord. */
-    ma_ios_session_category_none,               /* Leave the session category unchanged. */
-    ma_ios_session_category_ambient,            /* AVAudioSessionCategoryAmbient */
-    ma_ios_session_category_solo_ambient,       /* AVAudioSessionCategorySoloAmbient */
-    ma_ios_session_category_playback,           /* AVAudioSessionCategoryPlayback */
-    ma_ios_session_category_record,             /* AVAudioSessionCategoryRecord */
-    ma_ios_session_category_play_and_record,    /* AVAudioSessionCategoryPlayAndRecord */
-    ma_ios_session_category_multi_route         /* AVAudioSessionCategoryMultiRoute */
-} ma_ios_session_category;
-
-/* iOS/tvOS/watchOS session category options */
-typedef enum
-{
-    ma_ios_session_category_option_mix_with_others                            = 0x01,   /* AVAudioSessionCategoryOptionMixWithOthers */
-    ma_ios_session_category_option_duck_others                                = 0x02,   /* AVAudioSessionCategoryOptionDuckOthers */
-    ma_ios_session_category_option_allow_bluetooth                            = 0x04,   /* AVAudioSessionCategoryOptionAllowBluetooth */
-    ma_ios_session_category_option_default_to_speaker                         = 0x08,   /* AVAudioSessionCategoryOptionDefaultToSpeaker */
-    ma_ios_session_category_option_interrupt_spoken_audio_and_mix_with_others = 0x11,   /* AVAudioSessionCategoryOptionInterruptSpokenAudioAndMixWithOthers */
-    ma_ios_session_category_option_allow_bluetooth_a2dp                       = 0x20,   /* AVAudioSessionCategoryOptionAllowBluetoothA2DP */
-    ma_ios_session_category_option_allow_air_play                             = 0x40,   /* AVAudioSessionCategoryOptionAllowAirPlay */
-} ma_ios_session_category_option;
-
-/* OpenSL stream types. */
-typedef enum
-{
-    ma_opensl_stream_type_default = 0,              /* Leaves the stream type unset. */
-    ma_opensl_stream_type_voice,                    /* SL_ANDROID_STREAM_VOICE */
-    ma_opensl_stream_type_system,                   /* SL_ANDROID_STREAM_SYSTEM */
-    ma_opensl_stream_type_ring,                     /* SL_ANDROID_STREAM_RING */
-    ma_opensl_stream_type_media,                    /* SL_ANDROID_STREAM_MEDIA */
-    ma_opensl_stream_type_alarm,                    /* SL_ANDROID_STREAM_ALARM */
-    ma_opensl_stream_type_notification              /* SL_ANDROID_STREAM_NOTIFICATION */
-} ma_opensl_stream_type;
-
-/* OpenSL recording presets. */
-typedef enum
-{
-    ma_opensl_recording_preset_default = 0,         /* Leaves the input preset unset. */
-    ma_opensl_recording_preset_generic,             /* SL_ANDROID_RECORDING_PRESET_GENERIC */
-    ma_opensl_recording_preset_camcorder,           /* SL_ANDROID_RECORDING_PRESET_CAMCORDER */
-    ma_opensl_recording_preset_voice_recognition,   /* SL_ANDROID_RECORDING_PRESET_VOICE_RECOGNITION */
-    ma_opensl_recording_preset_voice_communication, /* SL_ANDROID_RECORDING_PRESET_VOICE_COMMUNICATION */
-    ma_opensl_recording_preset_voice_unprocessed    /* SL_ANDROID_RECORDING_PRESET_UNPROCESSED */
-} ma_opensl_recording_preset;
-
-/* WASAPI audio thread priority characteristics. */
-typedef enum
-{
-    ma_wasapi_usage_default = 0,
-    ma_wasapi_usage_games,
-    ma_wasapi_usage_pro_audio,
-} ma_wasapi_usage;
-
-/* AAudio usage types. */
-typedef enum
-{
-    ma_aaudio_usage_default = 0,                    /* Leaves the usage type unset. */
-    ma_aaudio_usage_media,                          /* AAUDIO_USAGE_MEDIA */
-    ma_aaudio_usage_voice_communication,            /* AAUDIO_USAGE_VOICE_COMMUNICATION */
-    ma_aaudio_usage_voice_communication_signalling, /* AAUDIO_USAGE_VOICE_COMMUNICATION_SIGNALLING */
-    ma_aaudio_usage_alarm,                          /* AAUDIO_USAGE_ALARM */
-    ma_aaudio_usage_notification,                   /* AAUDIO_USAGE_NOTIFICATION */
-    ma_aaudio_usage_notification_ringtone,          /* AAUDIO_USAGE_NOTIFICATION_RINGTONE */
-    ma_aaudio_usage_notification_event,             /* AAUDIO_USAGE_NOTIFICATION_EVENT */
-    ma_aaudio_usage_assistance_accessibility,       /* AAUDIO_USAGE_ASSISTANCE_ACCESSIBILITY */
-    ma_aaudio_usage_assistance_navigation_guidance, /* AAUDIO_USAGE_ASSISTANCE_NAVIGATION_GUIDANCE */
-    ma_aaudio_usage_assistance_sonification,        /* AAUDIO_USAGE_ASSISTANCE_SONIFICATION */
-    ma_aaudio_usage_game,                           /* AAUDIO_USAGE_GAME */
-    ma_aaudio_usage_assitant,                       /* AAUDIO_USAGE_ASSISTANT */
-    ma_aaudio_usage_emergency,                      /* AAUDIO_SYSTEM_USAGE_EMERGENCY */
-    ma_aaudio_usage_safety,                         /* AAUDIO_SYSTEM_USAGE_SAFETY */
-    ma_aaudio_usage_vehicle_status,                 /* AAUDIO_SYSTEM_USAGE_VEHICLE_STATUS */
-    ma_aaudio_usage_announcement                    /* AAUDIO_SYSTEM_USAGE_ANNOUNCEMENT */
-} ma_aaudio_usage;
-
-/* AAudio content types. */
-typedef enum
-{
-    ma_aaudio_content_type_default = 0,             /* Leaves the content type unset. */
-    ma_aaudio_content_type_speech,                  /* AAUDIO_CONTENT_TYPE_SPEECH */
-    ma_aaudio_content_type_music,                   /* AAUDIO_CONTENT_TYPE_MUSIC */
-    ma_aaudio_content_type_movie,                   /* AAUDIO_CONTENT_TYPE_MOVIE */
-    ma_aaudio_content_type_sonification             /* AAUDIO_CONTENT_TYPE_SONIFICATION */
-} ma_aaudio_content_type;
-
-/* AAudio input presets. */
-typedef enum
-{
-    ma_aaudio_input_preset_default = 0,             /* Leaves the input preset unset. */
-    ma_aaudio_input_preset_generic,                 /* AAUDIO_INPUT_PRESET_GENERIC */
-    ma_aaudio_input_preset_camcorder,               /* AAUDIO_INPUT_PRESET_CAMCORDER */
-    ma_aaudio_input_preset_voice_recognition,       /* AAUDIO_INPUT_PRESET_VOICE_RECOGNITION */
-    ma_aaudio_input_preset_voice_communication,     /* AAUDIO_INPUT_PRESET_VOICE_COMMUNICATION */
-    ma_aaudio_input_preset_unprocessed,             /* AAUDIO_INPUT_PRESET_UNPROCESSED */
-    ma_aaudio_input_preset_voice_performance        /* AAUDIO_INPUT_PRESET_VOICE_PERFORMANCE */
-} ma_aaudio_input_preset;
-
-typedef enum
-{
-    ma_aaudio_allow_capture_default = 0,            /* Leaves the allowed capture policy unset. */
-    ma_aaudio_allow_capture_by_all,                 /* AAUDIO_ALLOW_CAPTURE_BY_ALL */
-    ma_aaudio_allow_capture_by_system,              /* AAUDIO_ALLOW_CAPTURE_BY_SYSTEM */
-    ma_aaudio_allow_capture_by_none                 /* AAUDIO_ALLOW_CAPTURE_BY_NONE */
-} ma_aaudio_allowed_capture_policy;
-
-typedef union
-{
-    ma_int64 counter;
-    double counterD;
-} ma_timer;
-
-typedef union
-{
-    ma_wchar_win32 wasapi[64];      /* WASAPI uses a wchar_t string for identification. */
-    ma_uint8 dsound[16];            /* DirectSound uses a GUID for identification. */
-    /*UINT_PTR*/ ma_uint32 winmm;   /* When creating a device, WinMM expects a Win32 UINT_PTR for device identification. In practice it's actually just a UINT. */
-    char alsa[256];                 /* ALSA uses a name string for identification. */
-    char pulse[256];                /* PulseAudio uses a name string for identification. */
-    int jack;                       /* JACK always uses default devices. */
-    char coreaudio[256];            /* Core Audio uses a string for identification. */
-    char sndio[256];                /* "snd/0", etc. */
-    char audio4[256];               /* "/dev/audio", etc. */
-    char oss[64];                   /* "dev/dsp0", etc. "dev/dsp" for the default device. */
-    ma_int32 aaudio;                /* AAudio uses a 32-bit integer for identification. */
-    ma_uint32 opensl;               /* OpenSL|ES uses a 32-bit unsigned integer for identification. */
-    char webaudio[32];              /* Web Audio always uses default devices for now, but if this changes it'll be a GUID. */
-    union
-    {
-        int i;
-        char s[256];
-        void* p;
-    } custom;                       /* The custom backend could be anything. Give them a few options. */
-    int nullbackend;                /* The null backend uses an integer for device IDs. */
-} ma_device_id;
-
-
-typedef struct ma_context_config    ma_context_config;
-typedef struct ma_device_config     ma_device_config;
-typedef struct ma_backend_callbacks ma_backend_callbacks;
-
-#define MA_DATA_FORMAT_FLAG_EXCLUSIVE_MODE (1U << 1)    /* If set, this is supported in exclusive mode. Otherwise not natively supported by exclusive mode. */
-
-#ifndef MA_MAX_DEVICE_NAME_LENGTH
-#define MA_MAX_DEVICE_NAME_LENGTH   255
-#endif
-
-typedef struct
-{
-    /* Basic info. This is the only information guaranteed to be filled in during device enumeration. */
-    ma_device_id id;
-    char name[MA_MAX_DEVICE_NAME_LENGTH + 1];   /* +1 for null terminator. */
-    ma_bool32 isDefault;
-
-    ma_uint32 nativeDataFormatCount;
-    struct
-    {
-        ma_format format;       /* Sample format. If set to ma_format_unknown, all sample formats are supported. */
-        ma_uint32 channels;     /* If set to 0, all channels are supported. */
-        ma_uint32 sampleRate;   /* If set to 0, all sample rates are supported. */
-        ma_uint32 flags;        /* A combination of MA_DATA_FORMAT_FLAG_* flags. */
-    } nativeDataFormats[/*ma_format_count * ma_standard_sample_rate_count * MA_MAX_CHANNELS*/ 64];  /* Not sure how big to make this. There can be *many* permutations for virtual devices which can support anything. */
-} ma_device_info;
-
-struct ma_device_config
-{
-    ma_device_type deviceType;
-    ma_uint32 sampleRate;
-    ma_uint32 periodSizeInFrames;
-    ma_uint32 periodSizeInMilliseconds;
-    ma_uint32 periods;
-    ma_performance_profile performanceProfile;
-    ma_bool8 noPreSilencedOutputBuffer; /* When set to true, the contents of the output buffer passed into the data callback will be left undefined rather than initialized to silence. */
-    ma_bool8 noClip;                    /* When set to true, the contents of the output buffer passed into the data callback will not be clipped after returning. Only applies when the playback sample format is f32. */
-    ma_bool8 noDisableDenormals;        /* Do not disable denormals when firing the data callback. */
-    ma_bool8 noFixedSizedCallback;      /* Disables strict fixed-sized data callbacks. Setting this to true will result in the period size being treated only as a hint to the backend. This is an optimization for those who don't need fixed sized callbacks. */
-    ma_device_data_proc dataCallback;
-    ma_device_notification_proc notificationCallback;
-    ma_stop_proc stopCallback;
-    void* pUserData;
-    ma_resampler_config resampling;
-    struct
-    {
-        const ma_device_id* pDeviceID;
-        ma_format format;
-        ma_uint32 channels;
-        ma_channel* pChannelMap;
-        ma_channel_mix_mode channelMixMode;
-        ma_bool32 calculateLFEFromSpatialChannels;  /* When an output LFE channel is present, but no input LFE, set to true to set the output LFE to the average of all spatial channels (LR, FR, etc.). Ignored when an input LFE is present. */
-        ma_share_mode shareMode;
-    } playback;
-    struct
-    {
-        const ma_device_id* pDeviceID;
-        ma_format format;
-        ma_uint32 channels;
-        ma_channel* pChannelMap;
-        ma_channel_mix_mode channelMixMode;
-        ma_bool32 calculateLFEFromSpatialChannels;  /* When an output LFE channel is present, but no input LFE, set to true to set the output LFE to the average of all spatial channels (LR, FR, etc.). Ignored when an input LFE is present. */
-        ma_share_mode shareMode;
-    } capture;
-
-    struct
-    {
-        ma_wasapi_usage usage;              /* When configured, uses Avrt APIs to set the thread characteristics. */
-        ma_bool8 noAutoConvertSRC;          /* When set to true, disables the use of AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM. */
-        ma_bool8 noDefaultQualitySRC;       /* When set to true, disables the use of AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY. */
-        ma_bool8 noAutoStreamRouting;       /* Disables automatic stream routing. */
-        ma_bool8 noHardwareOffloading;      /* Disables WASAPI's hardware offloading feature. */
-        ma_uint32 loopbackProcessID;        /* The process ID to include or exclude for loopback mode. Set to 0 to capture audio from all processes. Ignored when an explicit device ID is specified. */
-        ma_bool8 loopbackProcessExclude;    /* When set to true, excludes the process specified by loopbackProcessID. By default, the process will be included. */
-    } wasapi;
-    struct
-    {
-        ma_bool32 noMMap;           /* Disables MMap mode. */
-        ma_bool32 noAutoFormat;     /* Opens the ALSA device with SND_PCM_NO_AUTO_FORMAT. */
-        ma_bool32 noAutoChannels;   /* Opens the ALSA device with SND_PCM_NO_AUTO_CHANNELS. */
-        ma_bool32 noAutoResample;   /* Opens the ALSA device with SND_PCM_NO_AUTO_RESAMPLE. */
-    } alsa;
-    struct
-    {
-        const char* pStreamNamePlayback;
-        const char* pStreamNameCapture;
-    } pulse;
-    struct
-    {
-        ma_bool32 allowNominalSampleRateChange; /* Desktop only. When enabled, allows changing of the sample rate at the operating system level. */
-    } coreaudio;
-    struct
-    {
-        ma_opensl_stream_type streamType;
-        ma_opensl_recording_preset recordingPreset;
-        ma_bool32 enableCompatibilityWorkarounds;
-    } opensl;
-    struct
-    {
-        ma_aaudio_usage usage;
-        ma_aaudio_content_type contentType;
-        ma_aaudio_input_preset inputPreset;
-        ma_aaudio_allowed_capture_policy allowedCapturePolicy;
-        ma_bool32 noAutoStartAfterReroute;
-        ma_bool32 enableCompatibilityWorkarounds;
-    } aaudio;
-};
-
-
-/*
-The callback for handling device enumeration. This is fired from `ma_context_enumerate_devices()`.
-
-
-Parameters
-----------
-pContext (in)
-    A pointer to the context performing the enumeration.
-
-deviceType (in)
-    The type of the device being enumerated. This will always be either `ma_device_type_playback` or `ma_device_type_capture`.
-
-pInfo (in)
-    A pointer to a `ma_device_info` containing the ID and name of the enumerated device. Note that this will not include detailed information about the device,
-    only basic information (ID and name). The reason for this is that it would otherwise require opening the backend device to probe for the information which
-    is too inefficient.
-
-pUserData (in)
-    The user data pointer passed into `ma_context_enumerate_devices()`.
-*/
-typedef ma_bool32 (* ma_enum_devices_callback_proc)(ma_context* pContext, ma_device_type deviceType, const ma_device_info* pInfo, void* pUserData);
-
-
-/*
-Describes some basic details about a playback or capture device.
-*/
-typedef struct
-{
-    const ma_device_id* pDeviceID;
-    ma_share_mode shareMode;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_channel channelMap[MA_MAX_CHANNELS];
-    ma_uint32 periodSizeInFrames;
-    ma_uint32 periodSizeInMilliseconds;
-    ma_uint32 periodCount;
-} ma_device_descriptor;
-
-/*
-These are the callbacks required to be implemented for a backend. These callbacks are grouped into two parts: context and device. There is one context
-to many devices. A device is created from a context.
-
-The general flow goes like this:
-
-  1) A context is created with `onContextInit()`
-     1a) Available devices can be enumerated with `onContextEnumerateDevices()` if required.
-     1b) Detailed information about a device can be queried with `onContextGetDeviceInfo()` if required.
-  2) A device is created from the context that was created in the first step using `onDeviceInit()`, and optionally a device ID that was
-     selected from device enumeration via `onContextEnumerateDevices()`.
-  3) A device is started or stopped with `onDeviceStart()` / `onDeviceStop()`
-  4) Data is delivered to and from the device by the backend. This is always done based on the native format returned by the prior call
-     to `onDeviceInit()`. Conversion between the device's native format and the format requested by the application will be handled by
-     miniaudio internally.
-
-Initialization of the context is quite simple. You need to do any necessary initialization of internal objects and then output the
-callbacks defined in this structure.
-
-Once the context has been initialized you can initialize a device. Before doing so, however, the application may want to know which
-physical devices are available. This is where `onContextEnumerateDevices()` comes in. This is fairly simple. For each device, fire the
-given callback with, at a minimum, the basic information filled out in `ma_device_info`. When the callback returns `MA_FALSE`, enumeration
-needs to stop and the `onContextEnumerateDevices()` function returns with a success code.
-
-Detailed device information can be retrieved from a device ID using `onContextGetDeviceInfo()`. This takes as input the device type and ID,
-and on output returns detailed information about the device in `ma_device_info`. The `onContextGetDeviceInfo()` callback must handle the
-case when the device ID is NULL, in which case information about the default device needs to be retrieved.
-
-Once the context has been created and the device ID retrieved (if using anything other than the default device), the device can be created.
-This is a little bit more complicated than initialization of the context due to it's more complicated configuration. When initializing a
-device, a duplex device may be requested. This means a separate data format needs to be specified for both playback and capture. On input,
-the data format is set to what the application wants. On output it's set to the native format which should match as closely as possible to
-the requested format. The conversion between the format requested by the application and the device's native format will be handled
-internally by miniaudio.
-
-On input, if the sample format is set to `ma_format_unknown`, the backend is free to use whatever sample format it desires, so long as it's
-supported by miniaudio. When the channel count is set to 0, the backend should use the device's native channel count. The same applies for
-sample rate. For the channel map, the default should be used when `ma_channel_map_is_blank()` returns true (all channels set to
-`MA_CHANNEL_NONE`). On input, the `periodSizeInFrames` or `periodSizeInMilliseconds` option should always be set. The backend should
-inspect both of these variables. If `periodSizeInFrames` is set, it should take priority, otherwise it needs to be derived from the period
-size in milliseconds (`periodSizeInMilliseconds`) and the sample rate, keeping in mind that the sample rate may be 0, in which case the
-sample rate will need to be determined before calculating the period size in frames. On output, all members of the `ma_device_descriptor`
-object should be set to a valid value, except for `periodSizeInMilliseconds` which is optional (`periodSizeInFrames` *must* be set).
-
-Starting and stopping of the device is done with `onDeviceStart()` and `onDeviceStop()` and should be self-explanatory. If the backend uses
-asynchronous reading and writing, `onDeviceStart()` and `onDeviceStop()` should always be implemented.
-
-The handling of data delivery between the application and the device is the most complicated part of the process. To make this a bit
-easier, some helper callbacks are available. If the backend uses a blocking read/write style of API, the `onDeviceRead()` and
-`onDeviceWrite()` callbacks can optionally be implemented. These are blocking and work just like reading and writing from a file. If the
-backend uses a callback for data delivery, that callback must call `ma_device_handle_backend_data_callback()` from within it's callback.
-This allows miniaudio to then process any necessary data conversion and then pass it to the miniaudio data callback.
-
-If the backend requires absolute flexibility with it's data delivery, it can optionally implement the `onDeviceDataLoop()` callback
-which will allow it to implement the logic that will run on the audio thread. This is much more advanced and is completely optional.
-
-The audio thread should run data delivery logic in a loop while `ma_device_get_state() == ma_device_state_started` and no errors have been
-encountered. Do not start or stop the device here. That will be handled from outside the `onDeviceDataLoop()` callback.
-
-The invocation of the `onDeviceDataLoop()` callback will be handled by miniaudio. When you start the device, miniaudio will fire this
-callback. When the device is stopped, the `ma_device_get_state() == ma_device_state_started` condition will fail and the loop will be terminated
-which will then fall through to the part that stops the device. For an example on how to implement the `onDeviceDataLoop()` callback,
-look at `ma_device_audio_thread__default_read_write()`. Implement the `onDeviceDataLoopWakeup()` callback if you need a mechanism to
-wake up the audio thread.
-
-If the backend supports an optimized retrieval of device information from an initialized `ma_device` object, it should implement the
-`onDeviceGetInfo()` callback. This is optional, in which case it will fall back to `onContextGetDeviceInfo()` which is less efficient.
-*/
-struct ma_backend_callbacks
-{
-    ma_result (* onContextInit)(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks);
-    ma_result (* onContextUninit)(ma_context* pContext);
-    ma_result (* onContextEnumerateDevices)(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData);
-    ma_result (* onContextGetDeviceInfo)(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo);
-    ma_result (* onDeviceInit)(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture);
-    ma_result (* onDeviceUninit)(ma_device* pDevice);
-    ma_result (* onDeviceStart)(ma_device* pDevice);
-    ma_result (* onDeviceStop)(ma_device* pDevice);
-    ma_result (* onDeviceRead)(ma_device* pDevice, void* pFrames, ma_uint32 frameCount, ma_uint32* pFramesRead);
-    ma_result (* onDeviceWrite)(ma_device* pDevice, const void* pFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten);
-    ma_result (* onDeviceDataLoop)(ma_device* pDevice);
-    ma_result (* onDeviceDataLoopWakeup)(ma_device* pDevice);
-    ma_result (* onDeviceGetInfo)(ma_device* pDevice, ma_device_type type, ma_device_info* pDeviceInfo);
-};
-
-struct ma_context_config
-{
-    ma_log* pLog;
-    ma_thread_priority threadPriority;
-    size_t threadStackSize;
-    void* pUserData;
-    ma_allocation_callbacks allocationCallbacks;
-    struct
-    {
-        ma_bool32 useVerboseDeviceEnumeration;
-    } alsa;
-    struct
-    {
-        const char* pApplicationName;
-        const char* pServerName;
-        ma_bool32 tryAutoSpawn; /* Enables autospawning of the PulseAudio daemon if necessary. */
-    } pulse;
-    struct
-    {
-        ma_ios_session_category sessionCategory;
-        ma_uint32 sessionCategoryOptions;
-        ma_bool32 noAudioSessionActivate;   /* iOS only. When set to true, does not perform an explicit [[AVAudioSession sharedInstace] setActive:true] on initialization. */
-        ma_bool32 noAudioSessionDeactivate; /* iOS only. When set to true, does not perform an explicit [[AVAudioSession sharedInstace] setActive:false] on uninitialization. */
-    } coreaudio;
-    struct
-    {
-        const char* pClientName;
-        ma_bool32 tryStartServer;
-    } jack;
-    ma_backend_callbacks custom;
-};
-
-/* WASAPI specific structure for some commands which must run on a common thread due to bugs in WASAPI. */
-typedef struct
-{
-    int code;
-    ma_event* pEvent;   /* This will be signalled when the event is complete. */
-    union
-    {
-        struct
-        {
-            int _unused;
-        } quit;
-        struct
-        {
-            ma_device_type deviceType;
-            void* pAudioClient;
-            void** ppAudioClientService;
-            ma_result* pResult; /* The result from creating the audio client service. */
-        } createAudioClient;
-        struct
-        {
-            ma_device* pDevice;
-            ma_device_type deviceType;
-        } releaseAudioClient;
-    } data;
-} ma_context_command__wasapi;
-
-struct ma_context
-{
-    ma_backend_callbacks callbacks;
-    ma_backend backend;                 /* DirectSound, ALSA, etc. */
-    ma_log* pLog;
-    ma_log log; /* Only used if the log is owned by the context. The pLog member will be set to &log in this case. */
-    ma_thread_priority threadPriority;
-    size_t threadStackSize;
-    void* pUserData;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_mutex deviceEnumLock;            /* Used to make ma_context_get_devices() thread safe. */
-    ma_mutex deviceInfoLock;            /* Used to make ma_context_get_device_info() thread safe. */
-    ma_uint32 deviceInfoCapacity;       /* Total capacity of pDeviceInfos. */
-    ma_uint32 playbackDeviceInfoCount;
-    ma_uint32 captureDeviceInfoCount;
-    ma_device_info* pDeviceInfos;       /* Playback devices first, then capture. */
-
-    union
-    {
-#ifdef MA_SUPPORT_WASAPI
-        struct
-        {
-            ma_thread commandThread;
-            ma_mutex commandLock;
-            ma_semaphore commandSem;
-            ma_uint32 commandIndex;
-            ma_uint32 commandCount;
-            ma_context_command__wasapi commands[4];
-            ma_handle hAvrt;
-            ma_proc AvSetMmThreadCharacteristicsA;
-            ma_proc AvRevertMmThreadcharacteristics;
-            ma_handle hMMDevapi;
-            ma_proc ActivateAudioInterfaceAsync;
-        } wasapi;
-#endif
-#ifdef MA_SUPPORT_DSOUND
-        struct
-        {
-            ma_handle hDSoundDLL;
-            ma_proc DirectSoundCreate;
-            ma_proc DirectSoundEnumerateA;
-            ma_proc DirectSoundCaptureCreate;
-            ma_proc DirectSoundCaptureEnumerateA;
-        } dsound;
-#endif
-#ifdef MA_SUPPORT_WINMM
-        struct
-        {
-            ma_handle hWinMM;
-            ma_proc waveOutGetNumDevs;
-            ma_proc waveOutGetDevCapsA;
-            ma_proc waveOutOpen;
-            ma_proc waveOutClose;
-            ma_proc waveOutPrepareHeader;
-            ma_proc waveOutUnprepareHeader;
-            ma_proc waveOutWrite;
-            ma_proc waveOutReset;
-            ma_proc waveInGetNumDevs;
-            ma_proc waveInGetDevCapsA;
-            ma_proc waveInOpen;
-            ma_proc waveInClose;
-            ma_proc waveInPrepareHeader;
-            ma_proc waveInUnprepareHeader;
-            ma_proc waveInAddBuffer;
-            ma_proc waveInStart;
-            ma_proc waveInReset;
-        } winmm;
-#endif
-#ifdef MA_SUPPORT_ALSA
-        struct
-        {
-            ma_handle asoundSO;
-            ma_proc snd_pcm_open;
-            ma_proc snd_pcm_close;
-            ma_proc snd_pcm_hw_params_sizeof;
-            ma_proc snd_pcm_hw_params_any;
-            ma_proc snd_pcm_hw_params_set_format;
-            ma_proc snd_pcm_hw_params_set_format_first;
-            ma_proc snd_pcm_hw_params_get_format_mask;
-            ma_proc snd_pcm_hw_params_set_channels;
-            ma_proc snd_pcm_hw_params_set_channels_near;
-            ma_proc snd_pcm_hw_params_set_channels_minmax;
-            ma_proc snd_pcm_hw_params_set_rate_resample;
-            ma_proc snd_pcm_hw_params_set_rate;
-            ma_proc snd_pcm_hw_params_set_rate_near;
-            ma_proc snd_pcm_hw_params_set_buffer_size_near;
-            ma_proc snd_pcm_hw_params_set_periods_near;
-            ma_proc snd_pcm_hw_params_set_access;
-            ma_proc snd_pcm_hw_params_get_format;
-            ma_proc snd_pcm_hw_params_get_channels;
-            ma_proc snd_pcm_hw_params_get_channels_min;
-            ma_proc snd_pcm_hw_params_get_channels_max;
-            ma_proc snd_pcm_hw_params_get_rate;
-            ma_proc snd_pcm_hw_params_get_rate_min;
-            ma_proc snd_pcm_hw_params_get_rate_max;
-            ma_proc snd_pcm_hw_params_get_buffer_size;
-            ma_proc snd_pcm_hw_params_get_periods;
-            ma_proc snd_pcm_hw_params_get_access;
-            ma_proc snd_pcm_hw_params_test_format;
-            ma_proc snd_pcm_hw_params_test_channels;
-            ma_proc snd_pcm_hw_params_test_rate;
-            ma_proc snd_pcm_hw_params;
-            ma_proc snd_pcm_sw_params_sizeof;
-            ma_proc snd_pcm_sw_params_current;
-            ma_proc snd_pcm_sw_params_get_boundary;
-            ma_proc snd_pcm_sw_params_set_avail_min;
-            ma_proc snd_pcm_sw_params_set_start_threshold;
-            ma_proc snd_pcm_sw_params_set_stop_threshold;
-            ma_proc snd_pcm_sw_params;
-            ma_proc snd_pcm_format_mask_sizeof;
-            ma_proc snd_pcm_format_mask_test;
-            ma_proc snd_pcm_get_chmap;
-            ma_proc snd_pcm_state;
-            ma_proc snd_pcm_prepare;
-            ma_proc snd_pcm_start;
-            ma_proc snd_pcm_drop;
-            ma_proc snd_pcm_drain;
-            ma_proc snd_pcm_reset;
-            ma_proc snd_device_name_hint;
-            ma_proc snd_device_name_get_hint;
-            ma_proc snd_card_get_index;
-            ma_proc snd_device_name_free_hint;
-            ma_proc snd_pcm_mmap_begin;
-            ma_proc snd_pcm_mmap_commit;
-            ma_proc snd_pcm_recover;
-            ma_proc snd_pcm_readi;
-            ma_proc snd_pcm_writei;
-            ma_proc snd_pcm_avail;
-            ma_proc snd_pcm_avail_update;
-            ma_proc snd_pcm_wait;
-            ma_proc snd_pcm_nonblock;
-            ma_proc snd_pcm_info;
-            ma_proc snd_pcm_info_sizeof;
-            ma_proc snd_pcm_info_get_name;
-            ma_proc snd_pcm_poll_descriptors;
-            ma_proc snd_pcm_poll_descriptors_count;
-            ma_proc snd_pcm_poll_descriptors_revents;
-            ma_proc snd_config_update_free_global;
-
-            ma_mutex internalDeviceEnumLock;
-            ma_bool32 useVerboseDeviceEnumeration;
-        } alsa;
-#endif
-#ifdef MA_SUPPORT_PULSEAUDIO
-        struct
-        {
-            ma_handle pulseSO;
-            ma_proc pa_mainloop_new;
-            ma_proc pa_mainloop_free;
-            ma_proc pa_mainloop_quit;
-            ma_proc pa_mainloop_get_api;
-            ma_proc pa_mainloop_iterate;
-            ma_proc pa_mainloop_wakeup;
-            ma_proc pa_threaded_mainloop_new;
-            ma_proc pa_threaded_mainloop_free;
-            ma_proc pa_threaded_mainloop_start;
-            ma_proc pa_threaded_mainloop_stop;
-            ma_proc pa_threaded_mainloop_lock;
-            ma_proc pa_threaded_mainloop_unlock;
-            ma_proc pa_threaded_mainloop_wait;
-            ma_proc pa_threaded_mainloop_signal;
-            ma_proc pa_threaded_mainloop_accept;
-            ma_proc pa_threaded_mainloop_get_retval;
-            ma_proc pa_threaded_mainloop_get_api;
-            ma_proc pa_threaded_mainloop_in_thread;
-            ma_proc pa_threaded_mainloop_set_name;
-            ma_proc pa_context_new;
-            ma_proc pa_context_unref;
-            ma_proc pa_context_connect;
-            ma_proc pa_context_disconnect;
-            ma_proc pa_context_set_state_callback;
-            ma_proc pa_context_get_state;
-            ma_proc pa_context_get_sink_info_list;
-            ma_proc pa_context_get_source_info_list;
-            ma_proc pa_context_get_sink_info_by_name;
-            ma_proc pa_context_get_source_info_by_name;
-            ma_proc pa_operation_unref;
-            ma_proc pa_operation_get_state;
-            ma_proc pa_channel_map_init_extend;
-            ma_proc pa_channel_map_valid;
-            ma_proc pa_channel_map_compatible;
-            ma_proc pa_stream_new;
-            ma_proc pa_stream_unref;
-            ma_proc pa_stream_connect_playback;
-            ma_proc pa_stream_connect_record;
-            ma_proc pa_stream_disconnect;
-            ma_proc pa_stream_get_state;
-            ma_proc pa_stream_get_sample_spec;
-            ma_proc pa_stream_get_channel_map;
-            ma_proc pa_stream_get_buffer_attr;
-            ma_proc pa_stream_set_buffer_attr;
-            ma_proc pa_stream_get_device_name;
-            ma_proc pa_stream_set_write_callback;
-            ma_proc pa_stream_set_read_callback;
-            ma_proc pa_stream_set_suspended_callback;
-            ma_proc pa_stream_set_moved_callback;
-            ma_proc pa_stream_is_suspended;
-            ma_proc pa_stream_flush;
-            ma_proc pa_stream_drain;
-            ma_proc pa_stream_is_corked;
-            ma_proc pa_stream_cork;
-            ma_proc pa_stream_trigger;
-            ma_proc pa_stream_begin_write;
-            ma_proc pa_stream_write;
-            ma_proc pa_stream_peek;
-            ma_proc pa_stream_drop;
-            ma_proc pa_stream_writable_size;
-            ma_proc pa_stream_readable_size;
-
-            /*pa_mainloop**/ ma_ptr pMainLoop;
-            /*pa_context**/ ma_ptr pPulseContext;
-            char* pApplicationName; /* Set when the context is initialized. Used by devices for their local pa_context objects. */
-            char* pServerName;      /* Set when the context is initialized. Used by devices for their local pa_context objects. */
-        } pulse;
-#endif
-#ifdef MA_SUPPORT_JACK
-        struct
-        {
-            ma_handle jackSO;
-            ma_proc jack_client_open;
-            ma_proc jack_client_close;
-            ma_proc jack_client_name_size;
-            ma_proc jack_set_process_callback;
-            ma_proc jack_set_buffer_size_callback;
-            ma_proc jack_on_shutdown;
-            ma_proc jack_get_sample_rate;
-            ma_proc jack_get_buffer_size;
-            ma_proc jack_get_ports;
-            ma_proc jack_activate;
-            ma_proc jack_deactivate;
-            ma_proc jack_connect;
-            ma_proc jack_port_register;
-            ma_proc jack_port_name;
-            ma_proc jack_port_get_buffer;
-            ma_proc jack_free;
-
-            char* pClientName;
-            ma_bool32 tryStartServer;
-        } jack;
-#endif
-#ifdef MA_SUPPORT_COREAUDIO
-        struct
-        {
-            ma_handle hCoreFoundation;
-            ma_proc CFStringGetCString;
-            ma_proc CFRelease;
-
-            ma_handle hCoreAudio;
-            ma_proc AudioObjectGetPropertyData;
-            ma_proc AudioObjectGetPropertyDataSize;
-            ma_proc AudioObjectSetPropertyData;
-            ma_proc AudioObjectAddPropertyListener;
-            ma_proc AudioObjectRemovePropertyListener;
-
-            ma_handle hAudioUnit;  /* Could possibly be set to AudioToolbox on later versions of macOS. */
-            ma_proc AudioComponentFindNext;
-            ma_proc AudioComponentInstanceDispose;
-            ma_proc AudioComponentInstanceNew;
-            ma_proc AudioOutputUnitStart;
-            ma_proc AudioOutputUnitStop;
-            ma_proc AudioUnitAddPropertyListener;
-            ma_proc AudioUnitGetPropertyInfo;
-            ma_proc AudioUnitGetProperty;
-            ma_proc AudioUnitSetProperty;
-            ma_proc AudioUnitInitialize;
-            ma_proc AudioUnitRender;
-
-            /*AudioComponent*/ ma_ptr component;
-            ma_bool32 noAudioSessionDeactivate; /* For tracking whether or not the iOS audio session should be explicitly deactivated. Set from the config in ma_context_init__coreaudio(). */
-        } coreaudio;
-#endif
-#ifdef MA_SUPPORT_SNDIO
-        struct
-        {
-            ma_handle sndioSO;
-            ma_proc sio_open;
-            ma_proc sio_close;
-            ma_proc sio_setpar;
-            ma_proc sio_getpar;
-            ma_proc sio_getcap;
-            ma_proc sio_start;
-            ma_proc sio_stop;
-            ma_proc sio_read;
-            ma_proc sio_write;
-            ma_proc sio_onmove;
-            ma_proc sio_nfds;
-            ma_proc sio_pollfd;
-            ma_proc sio_revents;
-            ma_proc sio_eof;
-            ma_proc sio_setvol;
-            ma_proc sio_onvol;
-            ma_proc sio_initpar;
-        } sndio;
-#endif
-#ifdef MA_SUPPORT_AUDIO4
-        struct
-        {
-            int _unused;
-        } audio4;
-#endif
-#ifdef MA_SUPPORT_OSS
-        struct
-        {
-            int versionMajor;
-            int versionMinor;
-        } oss;
-#endif
-#ifdef MA_SUPPORT_AAUDIO
-        struct
-        {
-            ma_handle hAAudio; /* libaaudio.so */
-            ma_proc AAudio_createStreamBuilder;
-            ma_proc AAudioStreamBuilder_delete;
-            ma_proc AAudioStreamBuilder_setDeviceId;
-            ma_proc AAudioStreamBuilder_setDirection;
-            ma_proc AAudioStreamBuilder_setSharingMode;
-            ma_proc AAudioStreamBuilder_setFormat;
-            ma_proc AAudioStreamBuilder_setChannelCount;
-            ma_proc AAudioStreamBuilder_setSampleRate;
-            ma_proc AAudioStreamBuilder_setBufferCapacityInFrames;
-            ma_proc AAudioStreamBuilder_setFramesPerDataCallback;
-            ma_proc AAudioStreamBuilder_setDataCallback;
-            ma_proc AAudioStreamBuilder_setErrorCallback;
-            ma_proc AAudioStreamBuilder_setPerformanceMode;
-            ma_proc AAudioStreamBuilder_setUsage;
-            ma_proc AAudioStreamBuilder_setContentType;
-            ma_proc AAudioStreamBuilder_setInputPreset;
-            ma_proc AAudioStreamBuilder_setAllowedCapturePolicy;
-            ma_proc AAudioStreamBuilder_openStream;
-            ma_proc AAudioStream_close;
-            ma_proc AAudioStream_getState;
-            ma_proc AAudioStream_waitForStateChange;
-            ma_proc AAudioStream_getFormat;
-            ma_proc AAudioStream_getChannelCount;
-            ma_proc AAudioStream_getSampleRate;
-            ma_proc AAudioStream_getBufferCapacityInFrames;
-            ma_proc AAudioStream_getFramesPerDataCallback;
-            ma_proc AAudioStream_getFramesPerBurst;
-            ma_proc AAudioStream_requestStart;
-            ma_proc AAudioStream_requestStop;
-            ma_device_job_thread jobThread; /* For processing operations outside of the error callback, specifically device disconnections and rerouting. */
-        } aaudio;
-#endif
-#ifdef MA_SUPPORT_OPENSL
-        struct
-        {
-            ma_handle libOpenSLES;
-            ma_handle SL_IID_ENGINE;
-            ma_handle SL_IID_AUDIOIODEVICECAPABILITIES;
-            ma_handle SL_IID_ANDROIDSIMPLEBUFFERQUEUE;
-            ma_handle SL_IID_RECORD;
-            ma_handle SL_IID_PLAY;
-            ma_handle SL_IID_OUTPUTMIX;
-            ma_handle SL_IID_ANDROIDCONFIGURATION;
-            ma_proc   slCreateEngine;
-        } opensl;
-#endif
-#ifdef MA_SUPPORT_WEBAUDIO
-        struct
-        {
-            int _unused;
-        } webaudio;
-#endif
-#ifdef MA_SUPPORT_NULL
-        struct
-        {
-            int _unused;
-        } null_backend;
-#endif
-    };
-
-    union
-    {
-#if defined(MA_WIN32)
-        struct
-        {
-            /*HMODULE*/ ma_handle hOle32DLL;
-            ma_proc CoInitialize;
-            ma_proc CoInitializeEx;
-            ma_proc CoUninitialize;
-            ma_proc CoCreateInstance;
-            ma_proc CoTaskMemFree;
-            ma_proc PropVariantClear;
-            ma_proc StringFromGUID2;
-
-            /*HMODULE*/ ma_handle hUser32DLL;
-            ma_proc GetForegroundWindow;
-            ma_proc GetDesktopWindow;
-
-            /*HMODULE*/ ma_handle hAdvapi32DLL;
-            ma_proc RegOpenKeyExA;
-            ma_proc RegCloseKey;
-            ma_proc RegQueryValueExA;
-
-            /*HRESULT*/ long CoInitializeResult;
-        } win32;
-#endif
-#ifdef MA_POSIX
-        struct
-        {
-            int _unused;
-        } posix;
-#endif
-        int _unused;
-    };
-};
-
-struct ma_device
-{
-    ma_context* pContext;
-    ma_device_type type;
-    ma_uint32 sampleRate;
-    ma_atomic_device_state state;               /* The state of the device is variable and can change at any time on any thread. Must be used atomically. */
-    ma_device_data_proc onData;                 /* Set once at initialization time and should not be changed after. */
-    ma_device_notification_proc onNotification; /* Set once at initialization time and should not be changed after. */
-    ma_stop_proc onStop;                        /* DEPRECATED. Use the notification callback instead. Set once at initialization time and should not be changed after. */
-    void* pUserData;                            /* Application defined data. */
-    ma_mutex startStopLock;
-    ma_event wakeupEvent;
-    ma_event startEvent;
-    ma_event stopEvent;
-    ma_thread thread;
-    ma_result workResult;                       /* This is set by the worker thread after it's finished doing a job. */
-    ma_bool8 isOwnerOfContext;                  /* When set to true, uninitializing the device will also uninitialize the context. Set to true when NULL is passed into ma_device_init(). */
-    ma_bool8 noPreSilencedOutputBuffer;
-    ma_bool8 noClip;
-    ma_bool8 noDisableDenormals;
-    ma_bool8 noFixedSizedCallback;
-    ma_atomic_float masterVolumeFactor;         /* Linear 0..1. Can be read and written simultaneously by different threads. Must be used atomically. */
-    ma_duplex_rb duplexRB;                      /* Intermediary buffer for duplex device on asynchronous backends. */
-    struct
-    {
-        ma_resample_algorithm algorithm;
-        ma_resampling_backend_vtable* pBackendVTable;
-        void* pBackendUserData;
-        struct
-        {
-            ma_uint32 lpfOrder;
-        } linear;
-    } resampling;
-    struct
-    {
-        ma_device_id* pID;                  /* Set to NULL if using default ID, otherwise set to the address of "id". */
-        ma_device_id id;                    /* If using an explicit device, will be set to a copy of the ID used for initialization. Otherwise cleared to 0. */
-        char name[MA_MAX_DEVICE_NAME_LENGTH + 1];                     /* Maybe temporary. Likely to be replaced with a query API. */
-        ma_share_mode shareMode;            /* Set to whatever was passed in when the device was initialized. */
-        ma_format format;
-        ma_uint32 channels;
-        ma_channel channelMap[MA_MAX_CHANNELS];
-        ma_format internalFormat;
-        ma_uint32 internalChannels;
-        ma_uint32 internalSampleRate;
-        ma_channel internalChannelMap[MA_MAX_CHANNELS];
-        ma_uint32 internalPeriodSizeInFrames;
-        ma_uint32 internalPeriods;
-        ma_channel_mix_mode channelMixMode;
-        ma_bool32 calculateLFEFromSpatialChannels;
-        ma_data_converter converter;
-        void* pIntermediaryBuffer;          /* For implementing fixed sized buffer callbacks. Will be null if using variable sized callbacks. */
-        ma_uint32 intermediaryBufferCap;
-        ma_uint32 intermediaryBufferLen;    /* How many valid frames are sitting in the intermediary buffer. */
-        void* pInputCache;                  /* In external format. Can be null. */
-        ma_uint64 inputCacheCap;
-        ma_uint64 inputCacheConsumed;
-        ma_uint64 inputCacheRemaining;
-    } playback;
-    struct
-    {
-        ma_device_id* pID;                  /* Set to NULL if using default ID, otherwise set to the address of "id". */
-        ma_device_id id;                    /* If using an explicit device, will be set to a copy of the ID used for initialization. Otherwise cleared to 0. */
-        char name[MA_MAX_DEVICE_NAME_LENGTH + 1];                     /* Maybe temporary. Likely to be replaced with a query API. */
-        ma_share_mode shareMode;            /* Set to whatever was passed in when the device was initialized. */
-        ma_format format;
-        ma_uint32 channels;
-        ma_channel channelMap[MA_MAX_CHANNELS];
-        ma_format internalFormat;
-        ma_uint32 internalChannels;
-        ma_uint32 internalSampleRate;
-        ma_channel internalChannelMap[MA_MAX_CHANNELS];
-        ma_uint32 internalPeriodSizeInFrames;
-        ma_uint32 internalPeriods;
-        ma_channel_mix_mode channelMixMode;
-        ma_bool32 calculateLFEFromSpatialChannels;
-        ma_data_converter converter;
-        void* pIntermediaryBuffer;          /* For implementing fixed sized buffer callbacks. Will be null if using variable sized callbacks. */
-        ma_uint32 intermediaryBufferCap;
-        ma_uint32 intermediaryBufferLen;    /* How many valid frames are sitting in the intermediary buffer. */
-    } capture;
-
-    union
-    {
-#ifdef MA_SUPPORT_WASAPI
-        struct
-        {
-            /*IAudioClient**/ ma_ptr pAudioClientPlayback;
-            /*IAudioClient**/ ma_ptr pAudioClientCapture;
-            /*IAudioRenderClient**/ ma_ptr pRenderClient;
-            /*IAudioCaptureClient**/ ma_ptr pCaptureClient;
-            /*IMMDeviceEnumerator**/ ma_ptr pDeviceEnumerator;      /* Used for IMMNotificationClient notifications. Required for detecting default device changes. */
-            ma_IMMNotificationClient notificationClient;
-            /*HANDLE*/ ma_handle hEventPlayback;                    /* Auto reset. Initialized to signaled. */
-            /*HANDLE*/ ma_handle hEventCapture;                     /* Auto reset. Initialized to unsignaled. */
-            ma_uint32 actualBufferSizeInFramesPlayback;             /* Value from GetBufferSize(). internalPeriodSizeInFrames is not set to the _actual_ buffer size when low-latency shared mode is being used due to the way the IAudioClient3 API works. */
-            ma_uint32 actualBufferSizeInFramesCapture;
-            ma_uint32 originalPeriodSizeInFrames;
-            ma_uint32 originalPeriodSizeInMilliseconds;
-            ma_uint32 originalPeriods;
-            ma_performance_profile originalPerformanceProfile;
-            ma_uint32 periodSizeInFramesPlayback;
-            ma_uint32 periodSizeInFramesCapture;
-            void* pMappedBufferCapture;
-            ma_uint32 mappedBufferCaptureCap;
-            ma_uint32 mappedBufferCaptureLen;
-            void* pMappedBufferPlayback;
-            ma_uint32 mappedBufferPlaybackCap;
-            ma_uint32 mappedBufferPlaybackLen;
-            ma_atomic_bool32 isStartedCapture;                      /* Can be read and written simultaneously across different threads. Must be used atomically, and must be 32-bit. */
-            ma_atomic_bool32 isStartedPlayback;                     /* Can be read and written simultaneously across different threads. Must be used atomically, and must be 32-bit. */
-            ma_uint32 loopbackProcessID;
-            ma_bool8 loopbackProcessExclude;
-            ma_bool8 noAutoConvertSRC;                              /* When set to true, disables the use of AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM. */
-            ma_bool8 noDefaultQualitySRC;                           /* When set to true, disables the use of AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY. */
-            ma_bool8 noHardwareOffloading;
-            ma_bool8 allowCaptureAutoStreamRouting;
-            ma_bool8 allowPlaybackAutoStreamRouting;
-            ma_bool8 isDetachedPlayback;
-            ma_bool8 isDetachedCapture;
-            ma_wasapi_usage usage;
-            void* hAvrtHandle;
-            ma_mutex rerouteLock;
-        } wasapi;
-#endif
-#ifdef MA_SUPPORT_DSOUND
-        struct
-        {
-            /*LPDIRECTSOUND*/ ma_ptr pPlayback;
-            /*LPDIRECTSOUNDBUFFER*/ ma_ptr pPlaybackPrimaryBuffer;
-            /*LPDIRECTSOUNDBUFFER*/ ma_ptr pPlaybackBuffer;
-            /*LPDIRECTSOUNDCAPTURE*/ ma_ptr pCapture;
-            /*LPDIRECTSOUNDCAPTUREBUFFER*/ ma_ptr pCaptureBuffer;
-        } dsound;
-#endif
-#ifdef MA_SUPPORT_WINMM
-        struct
-        {
-            /*HWAVEOUT*/ ma_handle hDevicePlayback;
-            /*HWAVEIN*/ ma_handle hDeviceCapture;
-            /*HANDLE*/ ma_handle hEventPlayback;
-            /*HANDLE*/ ma_handle hEventCapture;
-            ma_uint32 fragmentSizeInFrames;
-            ma_uint32 iNextHeaderPlayback;             /* [0,periods). Used as an index into pWAVEHDRPlayback. */
-            ma_uint32 iNextHeaderCapture;              /* [0,periods). Used as an index into pWAVEHDRCapture. */
-            ma_uint32 headerFramesConsumedPlayback;    /* The number of PCM frames consumed in the buffer in pWAVEHEADER[iNextHeader]. */
-            ma_uint32 headerFramesConsumedCapture;     /* ^^^ */
-            /*WAVEHDR**/ ma_uint8* pWAVEHDRPlayback;   /* One instantiation for each period. */
-            /*WAVEHDR**/ ma_uint8* pWAVEHDRCapture;    /* One instantiation for each period. */
-            ma_uint8* pIntermediaryBufferPlayback;
-            ma_uint8* pIntermediaryBufferCapture;
-            ma_uint8* _pHeapData;                      /* Used internally and is used for the heap allocated data for the intermediary buffer and the WAVEHDR structures. */
-        } winmm;
-#endif
-#ifdef MA_SUPPORT_ALSA
-        struct
-        {
-            /*snd_pcm_t**/ ma_ptr pPCMPlayback;
-            /*snd_pcm_t**/ ma_ptr pPCMCapture;
-            /*struct pollfd**/ void* pPollDescriptorsPlayback;
-            /*struct pollfd**/ void* pPollDescriptorsCapture;
-            int pollDescriptorCountPlayback;
-            int pollDescriptorCountCapture;
-            int wakeupfdPlayback;   /* eventfd for waking up from poll() when the playback device is stopped. */
-            int wakeupfdCapture;    /* eventfd for waking up from poll() when the capture device is stopped. */
-            ma_bool8 isUsingMMapPlayback;
-            ma_bool8 isUsingMMapCapture;
-        } alsa;
-#endif
-#ifdef MA_SUPPORT_PULSEAUDIO
-        struct
-        {
-            /*pa_mainloop**/ ma_ptr pMainLoop;
-            /*pa_context**/ ma_ptr pPulseContext;
-            /*pa_stream**/ ma_ptr pStreamPlayback;
-            /*pa_stream**/ ma_ptr pStreamCapture;
-        } pulse;
-#endif
-#ifdef MA_SUPPORT_JACK
-        struct
-        {
-            /*jack_client_t**/ ma_ptr pClient;
-            /*jack_port_t**/ ma_ptr* ppPortsPlayback;
-            /*jack_port_t**/ ma_ptr* ppPortsCapture;
-            float* pIntermediaryBufferPlayback; /* Typed as a float because JACK is always floating point. */
-            float* pIntermediaryBufferCapture;
-        } jack;
-#endif
-#ifdef MA_SUPPORT_COREAUDIO
-        struct
-        {
-            ma_uint32 deviceObjectIDPlayback;
-            ma_uint32 deviceObjectIDCapture;
-            /*AudioUnit*/ ma_ptr audioUnitPlayback;
-            /*AudioUnit*/ ma_ptr audioUnitCapture;
-            /*AudioBufferList**/ ma_ptr pAudioBufferList;   /* Only used for input devices. */
-            ma_uint32 audioBufferCapInFrames;               /* Only used for input devices. The capacity in frames of each buffer in pAudioBufferList. */
-            ma_event stopEvent;
-            ma_uint32 originalPeriodSizeInFrames;
-            ma_uint32 originalPeriodSizeInMilliseconds;
-            ma_uint32 originalPeriods;
-            ma_performance_profile originalPerformanceProfile;
-            ma_bool32 isDefaultPlaybackDevice;
-            ma_bool32 isDefaultCaptureDevice;
-            ma_bool32 isSwitchingPlaybackDevice;   /* <-- Set to true when the default device has changed and miniaudio is in the process of switching. */
-            ma_bool32 isSwitchingCaptureDevice;    /* <-- Set to true when the default device has changed and miniaudio is in the process of switching. */
-            void* pNotificationHandler;             /* Only used on mobile platforms. Obj-C object for handling route changes. */
-        } coreaudio;
-#endif
-#ifdef MA_SUPPORT_SNDIO
-        struct
-        {
-            ma_ptr handlePlayback;
-            ma_ptr handleCapture;
-            ma_bool32 isStartedPlayback;
-            ma_bool32 isStartedCapture;
-        } sndio;
-#endif
-#ifdef MA_SUPPORT_AUDIO4
-        struct
-        {
-            int fdPlayback;
-            int fdCapture;
-        } audio4;
-#endif
-#ifdef MA_SUPPORT_OSS
-        struct
-        {
-            int fdPlayback;
-            int fdCapture;
-        } oss;
-#endif
-#ifdef MA_SUPPORT_AAUDIO
-        struct
-        {
-            /*AAudioStream**/ ma_ptr pStreamPlayback;
-            /*AAudioStream**/ ma_ptr pStreamCapture;
-            ma_aaudio_usage usage;
-            ma_aaudio_content_type contentType;
-            ma_aaudio_input_preset inputPreset;
-            ma_aaudio_allowed_capture_policy allowedCapturePolicy;
-            ma_bool32 noAutoStartAfterReroute;
-        } aaudio;
-#endif
-#ifdef MA_SUPPORT_OPENSL
-        struct
-        {
-            /*SLObjectItf*/ ma_ptr pOutputMixObj;
-            /*SLOutputMixItf*/ ma_ptr pOutputMix;
-            /*SLObjectItf*/ ma_ptr pAudioPlayerObj;
-            /*SLPlayItf*/ ma_ptr pAudioPlayer;
-            /*SLObjectItf*/ ma_ptr pAudioRecorderObj;
-            /*SLRecordItf*/ ma_ptr pAudioRecorder;
-            /*SLAndroidSimpleBufferQueueItf*/ ma_ptr pBufferQueuePlayback;
-            /*SLAndroidSimpleBufferQueueItf*/ ma_ptr pBufferQueueCapture;
-            ma_bool32 isDrainingCapture;
-            ma_bool32 isDrainingPlayback;
-            ma_uint32 currentBufferIndexPlayback;
-            ma_uint32 currentBufferIndexCapture;
-            ma_uint8* pBufferPlayback;      /* This is malloc()'d and is used for storing audio data. Typed as ma_uint8 for easy offsetting. */
-            ma_uint8* pBufferCapture;
-        } opensl;
-#endif
-#ifdef MA_SUPPORT_WEBAUDIO
-        struct
-        {
-            /* AudioWorklets path. */
-            /* EMSCRIPTEN_WEBAUDIO_T */ int audioContext;
-            /* EMSCRIPTEN_WEBAUDIO_T */ int audioWorklet;
-            float* pIntermediaryBuffer;
-            void* pStackBuffer;
-            ma_result initResult;   /* Set to MA_BUSY while initialization is in progress. */
-            int deviceIndex;        /* We store the device in a list on the JavaScript side. This is used to map our C object to the JS object. */
-        } webaudio;
-#endif
-#ifdef MA_SUPPORT_NULL
-        struct
-        {
-            ma_thread deviceThread;
-            ma_event operationEvent;
-            ma_event operationCompletionEvent;
-            ma_semaphore operationSemaphore;
-            ma_uint32 operation;
-            ma_result operationResult;
-            ma_timer timer;
-            double priorRunTime;
-            ma_uint32 currentPeriodFramesRemainingPlayback;
-            ma_uint32 currentPeriodFramesRemainingCapture;
-            ma_uint64 lastProcessedFramePlayback;
-            ma_uint64 lastProcessedFrameCapture;
-            ma_atomic_bool32 isStarted; /* Read and written by multiple threads. Must be used atomically, and must be 32-bit for compiler compatibility. */
-        } null_device;
-#endif
-    };
-};
-#if defined(_MSC_VER) && !defined(__clang__)
-    #pragma warning(pop)
-#elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-    #pragma GCC diagnostic pop  /* For ISO C99 doesn't support unnamed structs/unions [-Wpedantic] */
-#endif
-
-/*
-Initializes a `ma_context_config` object.
-
-
-Return Value
-------------
-A `ma_context_config` initialized to defaults.
-
-
-Remarks
--------
-You must always use this to initialize the default state of the `ma_context_config` object. Not using this will result in your program breaking when miniaudio
-is updated and new members are added to `ma_context_config`. It also sets logical defaults.
-
-You can override members of the returned object by changing it's members directly.
-
-
-See Also
---------
-ma_context_init()
-*/
-MA_API ma_context_config ma_context_config_init(void);
-
-/*
-Initializes a context.
-
-The context is used for selecting and initializing an appropriate backend and to represent the backend at a more global level than that of an individual
-device. There is one context to many devices, and a device is created from a context. A context is required to enumerate devices.
-
-
-Parameters
-----------
-backends (in, optional)
-    A list of backends to try initializing, in priority order. Can be NULL, in which case it uses default priority order.
-
-backendCount (in, optional)
-    The number of items in `backend`. Ignored if `backend` is NULL.
-
-pConfig (in, optional)
-    The context configuration.
-
-pContext (in)
-    A pointer to the context object being initialized.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Unsafe. Do not call this function across multiple threads as some backends read and write to global state.
-
-
-Remarks
--------
-When `backends` is NULL, the default priority order will be used. Below is a list of backends in priority order:
-
-    |-------------|-----------------------|--------------------------------------------------------|
-    | Name        | Enum Name             | Supported Operating Systems                            |
-    |-------------|-----------------------|--------------------------------------------------------|
-    | WASAPI      | ma_backend_wasapi     | Windows Vista+                                         |
-    | DirectSound | ma_backend_dsound     | Windows XP+                                            |
-    | WinMM       | ma_backend_winmm      | Windows XP+ (may work on older versions, but untested) |
-    | Core Audio  | ma_backend_coreaudio  | macOS, iOS                                             |
-    | ALSA        | ma_backend_alsa       | Linux                                                  |
-    | PulseAudio  | ma_backend_pulseaudio | Cross Platform (disabled on Windows, BSD and Android)  |
-    | JACK        | ma_backend_jack       | Cross Platform (disabled on BSD and Android)           |
-    | sndio       | ma_backend_sndio      | OpenBSD                                                |
-    | audio(4)    | ma_backend_audio4     | NetBSD, OpenBSD                                        |
-    | OSS         | ma_backend_oss        | FreeBSD                                                |
-    | AAudio      | ma_backend_aaudio     | Android 8+                                             |
-    | OpenSL|ES   | ma_backend_opensl     | Android (API level 16+)                                |
-    | Web Audio   | ma_backend_webaudio   | Web (via Emscripten)                                   |
-    | Null        | ma_backend_null       | Cross Platform (not used on Web)                       |
-    |-------------|-----------------------|--------------------------------------------------------|
-
-The context can be configured via the `pConfig` argument. The config object is initialized with `ma_context_config_init()`. Individual configuration settings
-can then be set directly on the structure. Below are the members of the `ma_context_config` object.
-
-    pLog
-        A pointer to the `ma_log` to post log messages to. Can be NULL if the application does not
-        require logging. See the `ma_log` API for details on how to use the logging system.
-
-    threadPriority
-        The desired priority to use for the audio thread. Allowable values include the following:
-
-        |--------------------------------------|
-        | Thread Priority                      |
-        |--------------------------------------|
-        | ma_thread_priority_idle              |
-        | ma_thread_priority_lowest            |
-        | ma_thread_priority_low               |
-        | ma_thread_priority_normal            |
-        | ma_thread_priority_high              |
-        | ma_thread_priority_highest (default) |
-        | ma_thread_priority_realtime          |
-        | ma_thread_priority_default           |
-        |--------------------------------------|
-
-    threadStackSize
-        The desired size of the stack for the audio thread. Defaults to the operating system's default.
-
-    pUserData
-        A pointer to application-defined data. This can be accessed from the context object directly such as `context.pUserData`.
-
-    allocationCallbacks
-        Structure containing custom allocation callbacks. Leaving this at defaults will cause it to use MA_MALLOC, MA_REALLOC and MA_FREE. These allocation
-        callbacks will be used for anything tied to the context, including devices.
-
-    alsa.useVerboseDeviceEnumeration
-        ALSA will typically enumerate many different devices which can be intrusive and not user-friendly. To combat this, miniaudio will enumerate only unique
-        card/device pairs by default. The problem with this is that you lose a bit of flexibility and control. Setting alsa.useVerboseDeviceEnumeration makes
-        it so the ALSA backend includes all devices. Defaults to false.
-
-    pulse.pApplicationName
-        PulseAudio only. The application name to use when initializing the PulseAudio context with `pa_context_new()`.
-
-    pulse.pServerName
-        PulseAudio only. The name of the server to connect to with `pa_context_connect()`.
-
-    pulse.tryAutoSpawn
-        PulseAudio only. Whether or not to try automatically starting the PulseAudio daemon. Defaults to false. If you set this to true, keep in mind that
-        miniaudio uses a trial and error method to find the most appropriate backend, and this will result in the PulseAudio daemon starting which may be
-        intrusive for the end user.
-
-    coreaudio.sessionCategory
-        iOS only. The session category to use for the shared AudioSession instance. Below is a list of allowable values and their Core Audio equivalents.
-
-        |-----------------------------------------|-------------------------------------|
-        | miniaudio Token                         | Core Audio Token                    |
-        |-----------------------------------------|-------------------------------------|
-        | ma_ios_session_category_ambient         | AVAudioSessionCategoryAmbient       |
-        | ma_ios_session_category_solo_ambient    | AVAudioSessionCategorySoloAmbient   |
-        | ma_ios_session_category_playback        | AVAudioSessionCategoryPlayback      |
-        | ma_ios_session_category_record          | AVAudioSessionCategoryRecord        |
-        | ma_ios_session_category_play_and_record | AVAudioSessionCategoryPlayAndRecord |
-        | ma_ios_session_category_multi_route     | AVAudioSessionCategoryMultiRoute    |
-        | ma_ios_session_category_none            | AVAudioSessionCategoryAmbient       |
-        | ma_ios_session_category_default         | AVAudioSessionCategoryAmbient       |
-        |-----------------------------------------|-------------------------------------|
-
-    coreaudio.sessionCategoryOptions
-        iOS only. Session category options to use with the shared AudioSession instance. Below is a list of allowable values and their Core Audio equivalents.
-
-        |---------------------------------------------------------------------------|------------------------------------------------------------------|
-        | miniaudio Token                                                           | Core Audio Token                                                 |
-        |---------------------------------------------------------------------------|------------------------------------------------------------------|
-        | ma_ios_session_category_option_mix_with_others                            | AVAudioSessionCategoryOptionMixWithOthers                        |
-        | ma_ios_session_category_option_duck_others                                | AVAudioSessionCategoryOptionDuckOthers                           |
-        | ma_ios_session_category_option_allow_bluetooth                            | AVAudioSessionCategoryOptionAllowBluetooth                       |
-        | ma_ios_session_category_option_default_to_speaker                         | AVAudioSessionCategoryOptionDefaultToSpeaker                     |
-        | ma_ios_session_category_option_interrupt_spoken_audio_and_mix_with_others | AVAudioSessionCategoryOptionInterruptSpokenAudioAndMixWithOthers |
-        | ma_ios_session_category_option_allow_bluetooth_a2dp                       | AVAudioSessionCategoryOptionAllowBluetoothA2DP                   |
-        | ma_ios_session_category_option_allow_air_play                             | AVAudioSessionCategoryOptionAllowAirPlay                         |
-        |---------------------------------------------------------------------------|------------------------------------------------------------------|
-
-    coreaudio.noAudioSessionActivate
-        iOS only. When set to true, does not perform an explicit [[AVAudioSession sharedInstace] setActive:true] on initialization.
-
-    coreaudio.noAudioSessionDeactivate
-        iOS only. When set to true, does not perform an explicit [[AVAudioSession sharedInstace] setActive:false] on uninitialization.
-
-    jack.pClientName
-        The name of the client to pass to `jack_client_open()`.
-
-    jack.tryStartServer
-        Whether or not to try auto-starting the JACK server. Defaults to false.
-
-
-It is recommended that only a single context is active at any given time because it's a bulky data structure which performs run-time linking for the
-relevant backends every time it's initialized.
-
-The location of the context cannot change throughout it's lifetime. Consider allocating the `ma_context` object with `malloc()` if this is an issue. The
-reason for this is that a pointer to the context is stored in the `ma_device` structure.
-
-
-Example 1 - Default Initialization
-----------------------------------
-The example below shows how to initialize the context using the default configuration.
-
-```c
-ma_context context;
-ma_result result = ma_context_init(NULL, 0, NULL, &context);
-if (result != MA_SUCCESS) {
-    // Error.
-}
-```
-
-
-Example 2 - Custom Configuration
---------------------------------
-The example below shows how to initialize the context using custom backend priorities and a custom configuration. In this hypothetical example, the program
-wants to prioritize ALSA over PulseAudio on Linux. They also want to avoid using the WinMM backend on Windows because it's latency is too high. They also
-want an error to be returned if no valid backend is available which they achieve by excluding the Null backend.
-
-For the configuration, the program wants to capture any log messages so they can, for example, route it to a log file and user interface.
-
-```c
-ma_backend backends[] = {
-    ma_backend_alsa,
-    ma_backend_pulseaudio,
-    ma_backend_wasapi,
-    ma_backend_dsound
-};
-
-ma_log log;
-ma_log_init(&log);
-ma_log_register_callback(&log, ma_log_callback_init(my_log_callbac, pMyLogUserData));
-
-ma_context_config config = ma_context_config_init();
-config.pLog = &log; // Specify a custom log object in the config so any logs that are posted from ma_context_init() are captured.
-
-ma_context context;
-ma_result result = ma_context_init(backends, sizeof(backends)/sizeof(backends[0]), &config, &context);
-if (result != MA_SUCCESS) {
-    // Error.
-    if (result == MA_NO_BACKEND) {
-        // Couldn't find an appropriate backend.
-    }
-}
-
-// You could also attach a log callback post-initialization:
-ma_log_register_callback(ma_context_get_log(&context), ma_log_callback_init(my_log_callback, pMyLogUserData));
-```
-
-
-See Also
---------
-ma_context_config_init()
-ma_context_uninit()
-*/
-MA_API ma_result ma_context_init(const ma_backend backends[], ma_uint32 backendCount, const ma_context_config* pConfig, ma_context* pContext);
-
-/*
-Uninitializes a context.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Unsafe. Do not call this function across multiple threads as some backends read and write to global state.
-
-
-Remarks
--------
-Results are undefined if you call this while any device created by this context is still active.
-
-
-See Also
---------
-ma_context_init()
-*/
-MA_API ma_result ma_context_uninit(ma_context* pContext);
-
-/*
-Retrieves the size of the ma_context object.
-
-This is mainly for the purpose of bindings to know how much memory to allocate.
-*/
-MA_API size_t ma_context_sizeof(void);
-
-/*
-Retrieves a pointer to the log object associated with this context.
-
-
-Remarks
--------
-Pass the returned pointer to `ma_log_post()`, `ma_log_postv()` or `ma_log_postf()` to post a log
-message.
-
-You can attach your own logging callback to the log with `ma_log_register_callback()`
-
-
-Return Value
-------------
-A pointer to the `ma_log` object that the context uses to post log messages. If some error occurs,
-NULL will be returned.
-*/
-MA_API ma_log* ma_context_get_log(ma_context* pContext);
-
-/*
-Enumerates over every device (both playback and capture).
-
-This is a lower-level enumeration function to the easier to use `ma_context_get_devices()`. Use `ma_context_enumerate_devices()` if you would rather not incur
-an internal heap allocation, or it simply suits your code better.
-
-Note that this only retrieves the ID and name/description of the device. The reason for only retrieving basic information is that it would otherwise require
-opening the backend device in order to probe it for more detailed information which can be inefficient. Consider using `ma_context_get_device_info()` for this,
-but don't call it from within the enumeration callback.
-
-Returning false from the callback will stop enumeration. Returning true will continue enumeration.
-
-
-Parameters
-----------
-pContext (in)
-    A pointer to the context performing the enumeration.
-
-callback (in)
-    The callback to fire for each enumerated device.
-
-pUserData (in)
-    A pointer to application-defined data passed to the callback.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Safe. This is guarded using a simple mutex lock.
-
-
-Remarks
--------
-Do _not_ assume the first enumerated device of a given type is the default device.
-
-Some backends and platforms may only support default playback and capture devices.
-
-In general, you should not do anything complicated from within the callback. In particular, do not try initializing a device from within the callback. Also,
-do not try to call `ma_context_get_device_info()` from within the callback.
-
-Consider using `ma_context_get_devices()` for a simpler and safer API, albeit at the expense of an internal heap allocation.
-
-
-Example 1 - Simple Enumeration
-------------------------------
-ma_bool32 ma_device_enum_callback(ma_context* pContext, ma_device_type deviceType, const ma_device_info* pInfo, void* pUserData)
-{
-    printf("Device Name: %s\n", pInfo->name);
-    return MA_TRUE;
-}
-
-ma_result result = ma_context_enumerate_devices(&context, my_device_enum_callback, pMyUserData);
-if (result != MA_SUCCESS) {
-    // Error.
-}
-
-
-See Also
---------
-ma_context_get_devices()
-*/
-MA_API ma_result ma_context_enumerate_devices(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData);
-
-/*
-Retrieves basic information about every active playback and/or capture device.
-
-This function will allocate memory internally for the device lists and return a pointer to them through the `ppPlaybackDeviceInfos` and `ppCaptureDeviceInfos`
-parameters. If you do not want to incur the overhead of these allocations consider using `ma_context_enumerate_devices()` which will instead use a callback.
-
-
-Parameters
-----------
-pContext (in)
-    A pointer to the context performing the enumeration.
-
-ppPlaybackDeviceInfos (out)
-    A pointer to a pointer that will receive the address of a buffer containing the list of `ma_device_info` structures for playback devices.
-
-pPlaybackDeviceCount (out)
-    A pointer to an unsigned integer that will receive the number of playback devices.
-
-ppCaptureDeviceInfos (out)
-    A pointer to a pointer that will receive the address of a buffer containing the list of `ma_device_info` structures for capture devices.
-
-pCaptureDeviceCount (out)
-    A pointer to an unsigned integer that will receive the number of capture devices.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Unsafe. Since each call to this function invalidates the pointers from the previous call, you should not be calling this simultaneously across multiple
-threads. Instead, you need to make a copy of the returned data with your own higher level synchronization.
-
-
-Remarks
--------
-It is _not_ safe to assume the first device in the list is the default device.
-
-You can pass in NULL for the playback or capture lists in which case they'll be ignored.
-
-The returned pointers will become invalid upon the next call this this function, or when the context is uninitialized. Do not free the returned pointers.
-
-
-See Also
---------
-ma_context_get_devices()
-*/
-MA_API ma_result ma_context_get_devices(ma_context* pContext, ma_device_info** ppPlaybackDeviceInfos, ma_uint32* pPlaybackDeviceCount, ma_device_info** ppCaptureDeviceInfos, ma_uint32* pCaptureDeviceCount);
-
-/*
-Retrieves information about a device of the given type, with the specified ID and share mode.
-
-
-Parameters
-----------
-pContext (in)
-    A pointer to the context performing the query.
-
-deviceType (in)
-    The type of the device being queried. Must be either `ma_device_type_playback` or `ma_device_type_capture`.
-
-pDeviceID (in)
-    The ID of the device being queried.
-
-pDeviceInfo (out)
-    A pointer to the `ma_device_info` structure that will receive the device information.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Safe. This is guarded using a simple mutex lock.
-
-
-Remarks
--------
-Do _not_ call this from within the `ma_context_enumerate_devices()` callback.
-
-It's possible for a device to have different information and capabilities depending on whether or not it's opened in shared or exclusive mode. For example, in
-shared mode, WASAPI always uses floating point samples for mixing, but in exclusive mode it can be anything. Therefore, this function allows you to specify
-which share mode you want information for. Note that not all backends and devices support shared or exclusive mode, in which case this function will fail if
-the requested share mode is unsupported.
-
-This leaves pDeviceInfo unmodified in the result of an error.
-*/
-MA_API ma_result ma_context_get_device_info(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo);
-
-/*
-Determines if the given context supports loopback mode.
-
-
-Parameters
-----------
-pContext (in)
-    A pointer to the context getting queried.
-
-
-Return Value
-------------
-MA_TRUE if the context supports loopback mode; MA_FALSE otherwise.
-*/
-MA_API ma_bool32 ma_context_is_loopback_supported(ma_context* pContext);
-
-
-
-/*
-Initializes a device config with default settings.
-
-
-Parameters
-----------
-deviceType (in)
-    The type of the device this config is being initialized for. This must set to one of the following:
-
-    |-------------------------|
-    | Device Type             |
-    |-------------------------|
-    | ma_device_type_playback |
-    | ma_device_type_capture  |
-    | ma_device_type_duplex   |
-    | ma_device_type_loopback |
-    |-------------------------|
-
-
-Return Value
-------------
-A new device config object with default settings. You will typically want to adjust the config after this function returns. See remarks.
-
-
-Thread Safety
--------------
-Safe.
-
-
-Callback Safety
----------------
-Safe, but don't try initializing a device in a callback.
-
-
-Remarks
--------
-The returned config will be initialized to defaults. You will normally want to customize a few variables before initializing the device. See Example 1 for a
-typical configuration which sets the sample format, channel count, sample rate, data callback and user data. These are usually things you will want to change
-before initializing the device.
-
-See `ma_device_init()` for details on specific configuration options.
-
-
-Example 1 - Simple Configuration
---------------------------------
-The example below is what a program will typically want to configure for each device at a minimum. Notice how `ma_device_config_init()` is called first, and
-then the returned object is modified directly. This is important because it ensures that your program continues to work as new configuration options are added
-to the `ma_device_config` structure.
-
-```c
-ma_device_config config = ma_device_config_init(ma_device_type_playback);
-config.playback.format   = ma_format_f32;
-config.playback.channels = 2;
-config.sampleRate        = 48000;
-config.dataCallback      = ma_data_callback;
-config.pUserData         = pMyUserData;
-```
-
-
-See Also
---------
-ma_device_init()
-ma_device_init_ex()
-*/
-MA_API ma_device_config ma_device_config_init(ma_device_type deviceType);
-
-
-/*
-Initializes a device.
-
-A device represents a physical audio device. The idea is you send or receive audio data from the device to either play it back through a speaker, or capture it
-from a microphone. Whether or not you should send or receive data from the device (or both) depends on the type of device you are initializing which can be
-playback, capture, full-duplex or loopback. (Note that loopback mode is only supported on select backends.) Sending and receiving audio data to and from the
-device is done via a callback which is fired by miniaudio at periodic time intervals.
-
-The frequency at which data is delivered to and from a device depends on the size of it's period. The size of the period can be defined in terms of PCM frames
-or milliseconds, whichever is more convenient. Generally speaking, the smaller the period, the lower the latency at the expense of higher CPU usage and
-increased risk of glitching due to the more frequent and granular data deliver intervals. The size of a period will depend on your requirements, but
-miniaudio's defaults should work fine for most scenarios. If you're building a game you should leave this fairly small, whereas if you're building a simple
-media player you can make it larger. Note that the period size you request is actually just a hint - miniaudio will tell the backend what you want, but the
-backend is ultimately responsible for what it gives you. You cannot assume you will get exactly what you ask for.
-
-When delivering data to and from a device you need to make sure it's in the correct format which you can set through the device configuration. You just set the
-format that you want to use and miniaudio will perform all of the necessary conversion for you internally. When delivering data to and from the callback you
-can assume the format is the same as what you requested when you initialized the device. See Remarks for more details on miniaudio's data conversion pipeline.
-
-
-Parameters
-----------
-pContext (in, optional)
-    A pointer to the context that owns the device. This can be null, in which case it creates a default context internally.
-
-pConfig (in)
-    A pointer to the device configuration. Cannot be null. See remarks for details.
-
-pDevice (out)
-    A pointer to the device object being initialized.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Unsafe. It is not safe to call this function simultaneously for different devices because some backends depend on and mutate global state. The same applies to
-calling this at the same time as `ma_device_uninit()`.
-
-
-Callback Safety
----------------
-Unsafe. It is not safe to call this inside any callback.
-
-
-Remarks
--------
-Setting `pContext` to NULL will result in miniaudio creating a default context internally and is equivalent to passing in a context initialized like so:
-
-    ```c
-    ma_context_init(NULL, 0, NULL, &context);
-    ```
-
-Do not set `pContext` to NULL if you are needing to open multiple devices. You can, however, use NULL when initializing the first device, and then use
-device.pContext for the initialization of other devices.
-
-The device can be configured via the `pConfig` argument. The config object is initialized with `ma_device_config_init()`. Individual configuration settings can
-then be set directly on the structure. Below are the members of the `ma_device_config` object.
-
-    deviceType
-        Must be `ma_device_type_playback`, `ma_device_type_capture`, `ma_device_type_duplex` of `ma_device_type_loopback`.
-
-    sampleRate
-        The sample rate, in hertz. The most common sample rates are 48000 and 44100. Setting this to 0 will use the device's native sample rate.
-
-    periodSizeInFrames
-        The desired size of a period in PCM frames. If this is 0, `periodSizeInMilliseconds` will be used instead. If both are 0 the default buffer size will
-        be used depending on the selected performance profile. This value affects latency. See below for details.
-
-    periodSizeInMilliseconds
-        The desired size of a period in milliseconds. If this is 0, `periodSizeInFrames` will be used instead. If both are 0 the default buffer size will be
-        used depending on the selected performance profile. The value affects latency. See below for details.
-
-    periods
-        The number of periods making up the device's entire buffer. The total buffer size is `periodSizeInFrames` or `periodSizeInMilliseconds` multiplied by
-        this value. This is just a hint as backends will be the ones who ultimately decide how your periods will be configured.
-
-    performanceProfile
-        A hint to miniaudio as to the performance requirements of your program. Can be either `ma_performance_profile_low_latency` (default) or
-        `ma_performance_profile_conservative`. This mainly affects the size of default buffers and can usually be left at it's default value.
-
-    noPreSilencedOutputBuffer
-        When set to true, the contents of the output buffer passed into the data callback will be left undefined. When set to false (default), the contents of
-        the output buffer will be cleared the zero. You can use this to avoid the overhead of zeroing out the buffer if you can guarantee that your data
-        callback will write to every sample in the output buffer, or if you are doing your own clearing.
-
-    noClip
-        When set to true, the contents of the output buffer are left alone after returning and it will be left up to the backend itself to decide whether or
-        not to clip. When set to false (default), the contents of the output buffer passed into the data callback will be clipped after returning. This only
-        applies when the playback sample format is f32.
-
-    noDisableDenormals
-        By default, miniaudio will disable denormals when the data callback is called. Setting this to true will prevent the disabling of denormals.
-
-    noFixedSizedCallback
-        Allows miniaudio to fire the data callback with any frame count. When this is set to false (the default), the data callback will be fired with a
-        consistent frame count as specified by `periodSizeInFrames` or `periodSizeInMilliseconds`. When set to true, miniaudio will fire the callback with
-        whatever the backend requests, which could be anything.
-
-    dataCallback
-        The callback to fire whenever data is ready to be delivered to or from the device.
-
-    notificationCallback
-        The callback to fire when something has changed with the device, such as whether or not it has been started or stopped.
-
-    pUserData
-        The user data pointer to use with the device. You can access this directly from the device object like `device.pUserData`.
-
-    resampling.algorithm
-        The resampling algorithm to use when miniaudio needs to perform resampling between the rate specified by `sampleRate` and the device's native rate. The
-        default value is `ma_resample_algorithm_linear`, and the quality can be configured with `resampling.linear.lpfOrder`.
-
-    resampling.pBackendVTable
-        A pointer to an optional vtable that can be used for plugging in a custom resampler.
-
-    resampling.pBackendUserData
-        A pointer that will passed to callbacks in pBackendVTable.
-
-    resampling.linear.lpfOrder
-        The linear resampler applies a low-pass filter as part of it's processing for anti-aliasing. This setting controls the order of the filter. The higher
-        the value, the better the quality, in general. Setting this to 0 will disable low-pass filtering altogether. The maximum value is
-        `MA_MAX_FILTER_ORDER`. The default value is `min(4, MA_MAX_FILTER_ORDER)`.
-
-    playback.pDeviceID
-        A pointer to a `ma_device_id` structure containing the ID of the playback device to initialize. Setting this NULL (default) will use the system's
-        default playback device. Retrieve the device ID from the `ma_device_info` structure, which can be retrieved using device enumeration.
-
-    playback.format
-        The sample format to use for playback. When set to `ma_format_unknown` the device's native format will be used. This can be retrieved after
-        initialization from the device object directly with `device.playback.format`.
-
-    playback.channels
-        The number of channels to use for playback. When set to 0 the device's native channel count will be used. This can be retrieved after initialization
-        from the device object directly with `device.playback.channels`.
-
-    playback.pChannelMap
-        The channel map to use for playback. When left empty, the device's native channel map will be used. This can be retrieved after initialization from the
-        device object direct with `device.playback.pChannelMap`. When set, the buffer should contain `channels` items.
-
-    playback.shareMode
-        The preferred share mode to use for playback. Can be either `ma_share_mode_shared` (default) or `ma_share_mode_exclusive`. Note that if you specify
-        exclusive mode, but it's not supported by the backend, initialization will fail. You can then fall back to shared mode if desired by changing this to
-        ma_share_mode_shared and reinitializing.
-
-    capture.pDeviceID
-        A pointer to a `ma_device_id` structure containing the ID of the capture device to initialize. Setting this NULL (default) will use the system's
-        default capture device. Retrieve the device ID from the `ma_device_info` structure, which can be retrieved using device enumeration.
-
-    capture.format
-        The sample format to use for capture. When set to `ma_format_unknown` the device's native format will be used. This can be retrieved after
-        initialization from the device object directly with `device.capture.format`.
-
-    capture.channels
-        The number of channels to use for capture. When set to 0 the device's native channel count will be used. This can be retrieved after initialization
-        from the device object directly with `device.capture.channels`.
-
-    capture.pChannelMap
-        The channel map to use for capture. When left empty, the device's native channel map will be used. This can be retrieved after initialization from the
-        device object direct with `device.capture.pChannelMap`. When set, the buffer should contain `channels` items.
-
-    capture.shareMode
-        The preferred share mode to use for capture. Can be either `ma_share_mode_shared` (default) or `ma_share_mode_exclusive`. Note that if you specify
-        exclusive mode, but it's not supported by the backend, initialization will fail. You can then fall back to shared mode if desired by changing this to
-        ma_share_mode_shared and reinitializing.
-
-    wasapi.noAutoConvertSRC
-        WASAPI only. When set to true, disables WASAPI's automatic resampling and forces the use of miniaudio's resampler. Defaults to false.
-
-    wasapi.noDefaultQualitySRC
-        WASAPI only. Only used when `wasapi.noAutoConvertSRC` is set to false. When set to true, disables the use of `AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY`.
-        You should usually leave this set to false, which is the default.
-
-    wasapi.noAutoStreamRouting
-        WASAPI only. When set to true, disables automatic stream routing on the WASAPI backend. Defaults to false.
-
-    wasapi.noHardwareOffloading
-        WASAPI only. When set to true, disables the use of WASAPI's hardware offloading feature. Defaults to false.
-
-    alsa.noMMap
-        ALSA only. When set to true, disables MMap mode. Defaults to false.
-
-    alsa.noAutoFormat
-        ALSA only. When set to true, disables ALSA's automatic format conversion by including the SND_PCM_NO_AUTO_FORMAT flag. Defaults to false.
-
-    alsa.noAutoChannels
-        ALSA only. When set to true, disables ALSA's automatic channel conversion by including the SND_PCM_NO_AUTO_CHANNELS flag. Defaults to false.
-
-    alsa.noAutoResample
-        ALSA only. When set to true, disables ALSA's automatic resampling by including the SND_PCM_NO_AUTO_RESAMPLE flag. Defaults to false.
-
-    pulse.pStreamNamePlayback
-        PulseAudio only. Sets the stream name for playback.
-
-    pulse.pStreamNameCapture
-        PulseAudio only. Sets the stream name for capture.
-
-    coreaudio.allowNominalSampleRateChange
-        Core Audio only. Desktop only. When enabled, allows the sample rate of the device to be changed at the operating system level. This
-        is disabled by default in order to prevent intrusive changes to the user's system. This is useful if you want to use a sample rate
-        that is known to be natively supported by the hardware thereby avoiding the cost of resampling. When set to true, miniaudio will
-        find the closest match between the sample rate requested in the device config and the sample rates natively supported by the
-        hardware. When set to false, the sample rate currently set by the operating system will always be used.
-
-    opensl.streamType
-        OpenSL only. Explicitly sets the stream type. If left unset (`ma_opensl_stream_type_default`), the
-        stream type will be left unset. Think of this as the type of audio you're playing.
-
-    opensl.recordingPreset
-        OpenSL only. Explicitly sets the type of recording your program will be doing. When left
-        unset, the recording preset will be left unchanged.
-
-    aaudio.usage
-        AAudio only. Explicitly sets the nature of the audio the program will be consuming. When
-        left unset, the usage will be left unchanged.
-
-    aaudio.contentType
-        AAudio only. Sets the content type. When left unset, the content type will be left unchanged.
-
-    aaudio.inputPreset
-        AAudio only. Explicitly sets the type of recording your program will be doing. When left
-        unset, the input preset will be left unchanged.
-
-    aaudio.noAutoStartAfterReroute
-        AAudio only. Controls whether or not the device should be automatically restarted after a
-        stream reroute. When set to false (default) the device will be restarted automatically;
-        otherwise the device will be stopped.
-
-
-Once initialized, the device's config is immutable. If you need to change the config you will need to initialize a new device.
-
-After initializing the device it will be in a stopped state. To start it, use `ma_device_start()`.
-
-If both `periodSizeInFrames` and `periodSizeInMilliseconds` are set to zero, it will default to `MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_LOW_LATENCY` or
-`MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_CONSERVATIVE`, depending on whether or not `performanceProfile` is set to `ma_performance_profile_low_latency` or
-`ma_performance_profile_conservative`.
-
-If you request exclusive mode and the backend does not support it an error will be returned. For robustness, you may want to first try initializing the device
-in exclusive mode, and then fall back to shared mode if required. Alternatively you can just request shared mode (the default if you leave it unset in the
-config) which is the most reliable option. Some backends do not have a practical way of choosing whether or not the device should be exclusive or not (ALSA,
-for example) in which case it just acts as a hint. Unless you have special requirements you should try avoiding exclusive mode as it's intrusive to the user.
-Starting with Windows 10, miniaudio will use low-latency shared mode where possible which may make exclusive mode unnecessary.
-
-When sending or receiving data to/from a device, miniaudio will internally perform a format conversion to convert between the format specified by the config
-and the format used internally by the backend. If you pass in 0 for the sample format, channel count, sample rate _and_ channel map, data transmission will run
-on an optimized pass-through fast path. You can retrieve the format, channel count and sample rate by inspecting the `playback/capture.format`,
-`playback/capture.channels` and `sampleRate` members of the device object.
-
-When compiling for UWP you must ensure you call this function on the main UI thread because the operating system may need to present the user with a message
-asking for permissions. Please refer to the official documentation for ActivateAudioInterfaceAsync() for more information.
-
-ALSA Specific: When initializing the default device, requesting shared mode will try using the "dmix" device for playback and the "dsnoop" device for capture.
-If these fail it will try falling back to the "hw" device.
-
-
-Example 1 - Simple Initialization
----------------------------------
-This example shows how to initialize a simple playback device using a standard configuration. If you are just needing to do simple playback from the default
-playback device this is usually all you need.
-
-```c
-ma_device_config config = ma_device_config_init(ma_device_type_playback);
-config.playback.format   = ma_format_f32;
-config.playback.channels = 2;
-config.sampleRate        = 48000;
-config.dataCallback      = ma_data_callback;
-config.pMyUserData       = pMyUserData;
-
-ma_device device;
-ma_result result = ma_device_init(NULL, &config, &device);
-if (result != MA_SUCCESS) {
-    // Error
-}
-```
-
-
-Example 2 - Advanced Initialization
------------------------------------
-This example shows how you might do some more advanced initialization. In this hypothetical example we want to control the latency by setting the buffer size
-and period count. We also want to allow the user to be able to choose which device to output from which means we need a context so we can perform device
-enumeration.
-
-```c
-ma_context context;
-ma_result result = ma_context_init(NULL, 0, NULL, &context);
-if (result != MA_SUCCESS) {
-    // Error
-}
-
-ma_device_info* pPlaybackDeviceInfos;
-ma_uint32 playbackDeviceCount;
-result = ma_context_get_devices(&context, &pPlaybackDeviceInfos, &playbackDeviceCount, NULL, NULL);
-if (result != MA_SUCCESS) {
-    // Error
-}
-
-// ... choose a device from pPlaybackDeviceInfos ...
-
-ma_device_config config = ma_device_config_init(ma_device_type_playback);
-config.playback.pDeviceID       = pMyChosenDeviceID;    // <-- Get this from the `id` member of one of the `ma_device_info` objects returned by ma_context_get_devices().
-config.playback.format          = ma_format_f32;
-config.playback.channels        = 2;
-config.sampleRate               = 48000;
-config.dataCallback             = ma_data_callback;
-config.pUserData                = pMyUserData;
-config.periodSizeInMilliseconds = 10;
-config.periods                  = 3;
-
-ma_device device;
-result = ma_device_init(&context, &config, &device);
-if (result != MA_SUCCESS) {
-    // Error
-}
-```
-
-
-See Also
---------
-ma_device_config_init()
-ma_device_uninit()
-ma_device_start()
-ma_context_init()
-ma_context_get_devices()
-ma_context_enumerate_devices()
-*/
-MA_API ma_result ma_device_init(ma_context* pContext, const ma_device_config* pConfig, ma_device* pDevice);
-
-/*
-Initializes a device without a context, with extra parameters for controlling the configuration of the internal self-managed context.
-
-This is the same as `ma_device_init()`, only instead of a context being passed in, the parameters from `ma_context_init()` are passed in instead. This function
-allows you to configure the internally created context.
-
-
-Parameters
-----------
-backends (in, optional)
-    A list of backends to try initializing, in priority order. Can be NULL, in which case it uses default priority order.
-
-backendCount (in, optional)
-    The number of items in `backend`. Ignored if `backend` is NULL.
-
-pContextConfig (in, optional)
-    The context configuration.
-
-pConfig (in)
-    A pointer to the device configuration. Cannot be null. See remarks for details.
-
-pDevice (out)
-    A pointer to the device object being initialized.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Unsafe. It is not safe to call this function simultaneously for different devices because some backends depend on and mutate global state. The same applies to
-calling this at the same time as `ma_device_uninit()`.
-
-
-Callback Safety
----------------
-Unsafe. It is not safe to call this inside any callback.
-
-
-Remarks
--------
-You only need to use this function if you want to configure the context differently to it's defaults. You should never use this function if you want to manage
-your own context.
-
-See the documentation for `ma_context_init()` for information on the different context configuration options.
-
-
-See Also
---------
-ma_device_init()
-ma_device_uninit()
-ma_device_config_init()
-ma_context_init()
-*/
-MA_API ma_result ma_device_init_ex(const ma_backend backends[], ma_uint32 backendCount, const ma_context_config* pContextConfig, const ma_device_config* pConfig, ma_device* pDevice);
-
-/*
-Uninitializes a device.
-
-This will explicitly stop the device. You do not need to call `ma_device_stop()` beforehand, but it's harmless if you do.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device to stop.
-
-
-Return Value
-------------
-Nothing
-
-
-Thread Safety
--------------
-Unsafe. As soon as this API is called the device should be considered undefined.
-
-
-Callback Safety
----------------
-Unsafe. It is not safe to call this inside any callback. Doing this will result in a deadlock.
-
-
-See Also
---------
-ma_device_init()
-ma_device_stop()
-*/
-MA_API void ma_device_uninit(ma_device* pDevice);
-
-
-/*
-Retrieves a pointer to the context that owns the given device.
-*/
-MA_API ma_context* ma_device_get_context(ma_device* pDevice);
-
-/*
-Helper function for retrieving the log object associated with the context that owns this device.
-*/
-MA_API ma_log* ma_device_get_log(ma_device* pDevice);
-
-
-/*
-Retrieves information about the device.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose information is being retrieved.
-
-type (in)
-    The device type. This parameter is required for duplex devices. When retrieving device
-    information, you are doing so for an individual playback or capture device.
-
-pDeviceInfo (out)
-    A pointer to the `ma_device_info` that will receive the device information.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Unsafe. This should be considered unsafe because it may be calling into the backend which may or
-may not be safe.
-
-
-Callback Safety
----------------
-Unsafe. You should avoid calling this in the data callback because it may call into the backend
-which may or may not be safe.
-*/
-MA_API ma_result ma_device_get_info(ma_device* pDevice, ma_device_type type, ma_device_info* pDeviceInfo);
-
-
-/*
-Retrieves the name of the device.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose information is being retrieved.
-
-type (in)
-    The device type. This parameter is required for duplex devices. When retrieving device
-    information, you are doing so for an individual playback or capture device.
-
-pName (out)
-    A pointer to the buffer that will receive the name.
-
-nameCap (in)
-    The capacity of the output buffer, including space for the null terminator.
-
-pLengthNotIncludingNullTerminator (out, optional)
-    A pointer to the variable that will receive the length of the name, not including the null
-    terminator.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Unsafe. This should be considered unsafe because it may be calling into the backend which may or
-may not be safe.
-
-
-Callback Safety
----------------
-Unsafe. You should avoid calling this in the data callback because it may call into the backend
-which may or may not be safe.
-
-
-Remarks
--------
-If the name does not fully fit into the output buffer, it'll be truncated. You can pass in NULL to
-`pName` if you want to first get the length of the name for the purpose of memory allocation of the
-output buffer. Allocating a buffer of size `MA_MAX_DEVICE_NAME_LENGTH + 1` should be enough for
-most cases and will avoid the need for the inefficiency of calling this function twice.
-
-This is implemented in terms of `ma_device_get_info()`.
-*/
-MA_API ma_result ma_device_get_name(ma_device* pDevice, ma_device_type type, char* pName, size_t nameCap, size_t* pLengthNotIncludingNullTerminator);
-
-
-/*
-Starts the device. For playback devices this begins playback. For capture devices it begins recording.
-
-Use `ma_device_stop()` to stop the device.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device to start.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Safe. It's safe to call this from any thread with the exception of the callback thread.
-
-
-Callback Safety
----------------
-Unsafe. It is not safe to call this inside any callback.
-
-
-Remarks
--------
-For a playback device, this will retrieve an initial chunk of audio data from the client before returning. The reason for this is to ensure there is valid
-audio data in the buffer, which needs to be done before the device begins playback.
-
-This API waits until the backend device has been started for real by the worker thread. It also waits on a mutex for thread-safety.
-
-Do not call this in any callback.
-
-
-See Also
---------
-ma_device_stop()
-*/
-MA_API ma_result ma_device_start(ma_device* pDevice);
-
-/*
-Stops the device. For playback devices this stops playback. For capture devices it stops recording.
-
-Use `ma_device_start()` to start the device again.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device to stop.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error code otherwise.
-
-
-Thread Safety
--------------
-Safe. It's safe to call this from any thread with the exception of the callback thread.
-
-
-Callback Safety
----------------
-Unsafe. It is not safe to call this inside any callback. Doing this will result in a deadlock.
-
-
-Remarks
--------
-This API needs to wait on the worker thread to stop the backend device properly before returning. It also waits on a mutex for thread-safety. In addition, some
-backends need to wait for the device to finish playback/recording of the current fragment which can take some time (usually proportionate to the buffer size
-that was specified at initialization time).
-
-Backends are required to either pause the stream in-place or drain the buffer if pausing is not possible. The reason for this is that stopping the device and
-the resuming it with ma_device_start() (which you might do when your program loses focus) may result in a situation where those samples are never output to the
-speakers or received from the microphone which can in turn result in de-syncs.
-
-Do not call this in any callback.
-
-
-See Also
---------
-ma_device_start()
-*/
-MA_API ma_result ma_device_stop(ma_device* pDevice);
-
-/*
-Determines whether or not the device is started.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose start state is being retrieved.
-
-
-Return Value
-------------
-True if the device is started, false otherwise.
-
-
-Thread Safety
--------------
-Safe. If another thread calls `ma_device_start()` or `ma_device_stop()` at this same time as this function is called, there's a very small chance the return
-value will be out of sync.
-
-
-Callback Safety
----------------
-Safe. This is implemented as a simple accessor.
-
-
-See Also
---------
-ma_device_start()
-ma_device_stop()
-*/
-MA_API ma_bool32 ma_device_is_started(const ma_device* pDevice);
-
-
-/*
-Retrieves the state of the device.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose state is being retrieved.
-
-
-Return Value
-------------
-The current state of the device. The return value will be one of the following:
-
-    +-------------------------------+------------------------------------------------------------------------------+
-    | ma_device_state_uninitialized | Will only be returned if the device is in the middle of initialization.      |
-    +-------------------------------+------------------------------------------------------------------------------+
-    | ma_device_state_stopped       | The device is stopped. The initial state of the device after initialization. |
-    +-------------------------------+------------------------------------------------------------------------------+
-    | ma_device_state_started       | The device started and requesting and/or delivering audio data.              |
-    +-------------------------------+------------------------------------------------------------------------------+
-    | ma_device_state_starting      | The device is in the process of starting.                                    |
-    +-------------------------------+------------------------------------------------------------------------------+
-    | ma_device_state_stopping      | The device is in the process of stopping.                                    |
-    +-------------------------------+------------------------------------------------------------------------------+
-
-
-Thread Safety
--------------
-Safe. This is implemented as a simple accessor. Note that if the device is started or stopped at the same time as this function is called,
-there's a possibility the return value could be out of sync. See remarks.
-
-
-Callback Safety
----------------
-Safe. This is implemented as a simple accessor.
-
-
-Remarks
--------
-The general flow of a devices state goes like this:
-
-    ```
-    ma_device_init()  -> ma_device_state_uninitialized -> ma_device_state_stopped
-    ma_device_start() -> ma_device_state_starting      -> ma_device_state_started
-    ma_device_stop()  -> ma_device_state_stopping      -> ma_device_state_stopped
-    ```
-
-When the state of the device is changed with `ma_device_start()` or `ma_device_stop()` at this same time as this function is called, the
-value returned by this function could potentially be out of sync. If this is significant to your program you need to implement your own
-synchronization.
-*/
-MA_API ma_device_state ma_device_get_state(const ma_device* pDevice);
-
-
-/*
-Performs post backend initialization routines for setting up internal data conversion.
-
-This should be called whenever the backend is initialized. The only time this should be called from
-outside of miniaudio is if you're implementing a custom backend, and you would only do it if you
-are reinitializing the backend due to rerouting or reinitializing for some reason.
-
-
-Parameters
-----------
-pDevice [in]
-    A pointer to the device.
-
-deviceType [in]
-    The type of the device that was just reinitialized.
-
-pPlaybackDescriptor [in]
-    The descriptor of the playback device containing the internal data format and buffer sizes.
-
-pPlaybackDescriptor [in]
-    The descriptor of the capture device containing the internal data format and buffer sizes.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other error otherwise.
-
-
-Thread Safety
--------------
-Unsafe. This will be reinitializing internal data converters which may be in use by another thread.
-
-
-Callback Safety
----------------
-Unsafe. This will be reinitializing internal data converters which may be in use by the callback.
-
-
-Remarks
--------
-For a duplex device, you can call this for only one side of the system. This is why the deviceType
-is specified as a parameter rather than deriving it from the device.
-
-You do not need to call this manually unless you are doing a custom backend, in which case you need
-only do it if you're manually performing rerouting or reinitialization.
-*/
-MA_API ma_result ma_device_post_init(ma_device* pDevice, ma_device_type deviceType, const ma_device_descriptor* pPlaybackDescriptor, const ma_device_descriptor* pCaptureDescriptor);
-
-
-/*
-Sets the master volume factor for the device.
-
-The volume factor must be between 0 (silence) and 1 (full volume). Use `ma_device_set_master_volume_db()` to use decibel notation, where 0 is full volume and
-values less than 0 decreases the volume.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose volume is being set.
-
-volume (in)
-    The new volume factor. Must be >= 0.
-
-
-Return Value
-------------
-MA_SUCCESS if the volume was set successfully.
-MA_INVALID_ARGS if pDevice is NULL.
-MA_INVALID_ARGS if volume is negative.
-
-
-Thread Safety
--------------
-Safe. This just sets a local member of the device object.
-
-
-Callback Safety
----------------
-Safe. If you set the volume in the data callback, that data written to the output buffer will have the new volume applied.
-
-
-Remarks
--------
-This applies the volume factor across all channels.
-
-This does not change the operating system's volume. It only affects the volume for the given `ma_device` object's audio stream.
-
-
-See Also
---------
-ma_device_get_master_volume()
-ma_device_set_master_volume_db()
-ma_device_get_master_volume_db()
-*/
-MA_API ma_result ma_device_set_master_volume(ma_device* pDevice, float volume);
-
-/*
-Retrieves the master volume factor for the device.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose volume factor is being retrieved.
-
-pVolume (in)
-    A pointer to the variable that will receive the volume factor. The returned value will be in the range of [0, 1].
-
-
-Return Value
-------------
-MA_SUCCESS if successful.
-MA_INVALID_ARGS if pDevice is NULL.
-MA_INVALID_ARGS if pVolume is NULL.
-
-
-Thread Safety
--------------
-Safe. This just a simple member retrieval.
-
-
-Callback Safety
----------------
-Safe.
-
-
-Remarks
--------
-If an error occurs, `*pVolume` will be set to 0.
-
-
-See Also
---------
-ma_device_set_master_volume()
-ma_device_set_master_volume_gain_db()
-ma_device_get_master_volume_gain_db()
-*/
-MA_API ma_result ma_device_get_master_volume(ma_device* pDevice, float* pVolume);
-
-/*
-Sets the master volume for the device as gain in decibels.
-
-A gain of 0 is full volume, whereas a gain of < 0 will decrease the volume.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose gain is being set.
-
-gainDB (in)
-    The new volume as gain in decibels. Must be less than or equal to 0, where 0 is full volume and anything less than 0 decreases the volume.
-
-
-Return Value
-------------
-MA_SUCCESS if the volume was set successfully.
-MA_INVALID_ARGS if pDevice is NULL.
-MA_INVALID_ARGS if the gain is > 0.
-
-
-Thread Safety
--------------
-Safe. This just sets a local member of the device object.
-
-
-Callback Safety
----------------
-Safe. If you set the volume in the data callback, that data written to the output buffer will have the new volume applied.
-
-
-Remarks
--------
-This applies the gain across all channels.
-
-This does not change the operating system's volume. It only affects the volume for the given `ma_device` object's audio stream.
-
-
-See Also
---------
-ma_device_get_master_volume_gain_db()
-ma_device_set_master_volume()
-ma_device_get_master_volume()
-*/
-MA_API ma_result ma_device_set_master_volume_db(ma_device* pDevice, float gainDB);
-
-/*
-Retrieves the master gain in decibels.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to the device whose gain is being retrieved.
-
-pGainDB (in)
-    A pointer to the variable that will receive the gain in decibels. The returned value will be <= 0.
-
-
-Return Value
-------------
-MA_SUCCESS if successful.
-MA_INVALID_ARGS if pDevice is NULL.
-MA_INVALID_ARGS if pGainDB is NULL.
-
-
-Thread Safety
--------------
-Safe. This just a simple member retrieval.
-
-
-Callback Safety
----------------
-Safe.
-
-
-Remarks
--------
-If an error occurs, `*pGainDB` will be set to 0.
-
-
-See Also
---------
-ma_device_set_master_volume_db()
-ma_device_set_master_volume()
-ma_device_get_master_volume()
-*/
-MA_API ma_result ma_device_get_master_volume_db(ma_device* pDevice, float* pGainDB);
-
-
-/*
-Called from the data callback of asynchronous backends to allow miniaudio to process the data and fire the miniaudio data callback.
-
-
-Parameters
-----------
-pDevice (in)
-    A pointer to device whose processing the data callback.
-
-pOutput (out)
-    A pointer to the buffer that will receive the output PCM frame data. On a playback device this must not be NULL. On a duplex device
-    this can be NULL, in which case pInput must not be NULL.
-
-pInput (in)
-    A pointer to the buffer containing input PCM frame data. On a capture device this must not be NULL. On a duplex device this can be
-    NULL, in which case `pOutput` must not be NULL.
-
-frameCount (in)
-    The number of frames being processed.
-
-
-Return Value
-------------
-MA_SUCCESS if successful; any other result code otherwise.
-
-
-Thread Safety
--------------
-This function should only ever be called from the internal data callback of the backend. It is safe to call this simultaneously between a
-playback and capture device in duplex setups.
-
-
-Callback Safety
----------------
-Do not call this from the miniaudio data callback. It should only ever be called from the internal data callback of the backend.
-
-
-Remarks
--------
-If both `pOutput` and `pInput` are NULL, and error will be returned. In duplex scenarios, both `pOutput` and `pInput` can be non-NULL, in
-which case `pInput` will be processed first, followed by `pOutput`.
-
-If you are implementing a custom backend, and that backend uses a callback for data delivery, you'll need to call this from inside that
-callback.
-*/
-MA_API ma_result ma_device_handle_backend_data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount);
-
-
-/*
-Calculates an appropriate buffer size from a descriptor, native sample rate and performance profile.
-
-This function is used by backends for helping determine an appropriately sized buffer to use with
-the device depending on the values of `periodSizeInFrames` and `periodSizeInMilliseconds` in the
-`pDescriptor` object. Since buffer size calculations based on time depends on the sample rate, a
-best guess at the device's native sample rate is also required which is where `nativeSampleRate`
-comes in. In addition, the performance profile is also needed for cases where both the period size
-in frames and milliseconds are both zero.
-
-
-Parameters
-----------
-pDescriptor (in)
-    A pointer to device descriptor whose `periodSizeInFrames` and `periodSizeInMilliseconds` members
-    will be used for the calculation of the buffer size.
-
-nativeSampleRate (in)
-    The device's native sample rate. This is only ever used when the `periodSizeInFrames` member of
-    `pDescriptor` is zero. In this case, `periodSizeInMilliseconds` will be used instead, in which
-    case a sample rate is required to convert to a size in frames.
-
-performanceProfile (in)
-    When both the `periodSizeInFrames` and `periodSizeInMilliseconds` members of `pDescriptor` are
-    zero, miniaudio will fall back to a buffer size based on the performance profile. The profile
-    to use for this calculation is determine by this parameter.
-
-
-Return Value
-------------
-The calculated buffer size in frames.
-
-
-Thread Safety
--------------
-This is safe so long as nothing modifies `pDescriptor` at the same time. However, this function
-should only ever be called from within the backend's device initialization routine and therefore
-shouldn't have any multithreading concerns.
-
-
-Callback Safety
----------------
-This is safe to call within the data callback, but there is no reason to ever do this.
-
-
-Remarks
--------
-If `nativeSampleRate` is zero, this function will fall back to `pDescriptor->sampleRate`. If that
-is also zero, `MA_DEFAULT_SAMPLE_RATE` will be used instead.
-*/
-MA_API ma_uint32 ma_calculate_buffer_size_in_frames_from_descriptor(const ma_device_descriptor* pDescriptor, ma_uint32 nativeSampleRate, ma_performance_profile performanceProfile);
-
-
-
-/*
-Retrieves a friendly name for a backend.
-*/
-MA_API const char* ma_get_backend_name(ma_backend backend);
-
-/*
-Retrieves the backend enum from the given name.
-*/
-MA_API ma_result ma_get_backend_from_name(const char* pBackendName, ma_backend* pBackend);
-
-/*
-Determines whether or not the given backend is available by the compilation environment.
-*/
-MA_API ma_bool32 ma_is_backend_enabled(ma_backend backend);
-
-/*
-Retrieves compile-time enabled backends.
-
-
-Parameters
-----------
-pBackends (out, optional)
-    A pointer to the buffer that will receive the enabled backends. Set to NULL to retrieve the backend count. Setting
-    the capacity of the buffer to `MA_BUFFER_COUNT` will guarantee it's large enough for all backends.
-
-backendCap (in)
-    The capacity of the `pBackends` buffer.
-
-pBackendCount (out)
-    A pointer to the variable that will receive the enabled backend count.
-
-
-Return Value
-------------
-MA_SUCCESS if successful.
-MA_INVALID_ARGS if `pBackendCount` is NULL.
-MA_NO_SPACE if the capacity of `pBackends` is not large enough.
-
-If `MA_NO_SPACE` is returned, the `pBackends` buffer will be filled with `*pBackendCount` values.
-
-
-Thread Safety
--------------
-Safe.
-
-
-Callback Safety
----------------
-Safe.
-
-
-Remarks
--------
-If you want to retrieve the number of backends so you can determine the capacity of `pBackends` buffer, you can call
-this function with `pBackends` set to NULL.
-
-This will also enumerate the null backend. If you don't want to include this you need to check for `ma_backend_null`
-when you enumerate over the returned backends and handle it appropriately. Alternatively, you can disable it at
-compile time with `MA_NO_NULL`.
-
-The returned backends are determined based on compile time settings, not the platform it's currently running on. For
-example, PulseAudio will be returned if it was enabled at compile time, even when the user doesn't actually have
-PulseAudio installed.
-
-
-Example 1
----------
-The example below retrieves the enabled backend count using a fixed sized buffer allocated on the stack. The buffer is
-given a capacity of `MA_BACKEND_COUNT` which will guarantee it'll be large enough to store all available backends.
-Since `MA_BACKEND_COUNT` is always a relatively small value, this should be suitable for most scenarios.
-
-```
-ma_backend enabledBackends[MA_BACKEND_COUNT];
-size_t enabledBackendCount;
-
-result = ma_get_enabled_backends(enabledBackends, MA_BACKEND_COUNT, &enabledBackendCount);
-if (result != MA_SUCCESS) {
-    // Failed to retrieve enabled backends. Should never happen in this example since all inputs are valid.
-}
-```
-
-
-See Also
---------
-ma_is_backend_enabled()
-*/
-MA_API ma_result ma_get_enabled_backends(ma_backend* pBackends, size_t backendCap, size_t* pBackendCount);
-
-/*
-Determines whether or not loopback mode is support by a backend.
-*/
-MA_API ma_bool32 ma_is_loopback_supported(ma_backend backend);
-
-#endif  /* MA_NO_DEVICE_IO */
-
-
-
-/************************************************************************************************************************************************************
-
-Utilities
-
-************************************************************************************************************************************************************/
-
-/*
-Calculates a buffer size in milliseconds from the specified number of frames and sample rate.
-*/
-MA_API ma_uint32 ma_calculate_buffer_size_in_milliseconds_from_frames(ma_uint32 bufferSizeInFrames, ma_uint32 sampleRate);
-
-/*
-Calculates a buffer size in frames from the specified number of milliseconds and sample rate.
-*/
-MA_API ma_uint32 ma_calculate_buffer_size_in_frames_from_milliseconds(ma_uint32 bufferSizeInMilliseconds, ma_uint32 sampleRate);
-
-/*
-Copies PCM frames from one buffer to another.
-*/
-MA_API void ma_copy_pcm_frames(void* dst, const void* src, ma_uint64 frameCount, ma_format format, ma_uint32 channels);
-
-/*
-Copies silent frames into the given buffer.
-
-Remarks
--------
-For all formats except `ma_format_u8`, the output buffer will be filled with 0. For `ma_format_u8` it will be filled with 128. The reason for this is that it
-makes more sense for the purpose of mixing to initialize it to the center point.
-*/
-MA_API void ma_silence_pcm_frames(void* p, ma_uint64 frameCount, ma_format format, ma_uint32 channels);
-
-
-/*
-Offsets a pointer by the specified number of PCM frames.
-*/
-MA_API void* ma_offset_pcm_frames_ptr(void* p, ma_uint64 offsetInFrames, ma_format format, ma_uint32 channels);
-MA_API const void* ma_offset_pcm_frames_const_ptr(const void* p, ma_uint64 offsetInFrames, ma_format format, ma_uint32 channels);
-static MA_INLINE float* ma_offset_pcm_frames_ptr_f32(float* p, ma_uint64 offsetInFrames, ma_uint32 channels) { return (float*)ma_offset_pcm_frames_ptr((void*)p, offsetInFrames, ma_format_f32, channels); }
-static MA_INLINE const float* ma_offset_pcm_frames_const_ptr_f32(const float* p, ma_uint64 offsetInFrames, ma_uint32 channels) { return (const float*)ma_offset_pcm_frames_const_ptr((const void*)p, offsetInFrames, ma_format_f32, channels); }
-
-
-/*
-Clips samples.
-*/
-MA_API void ma_clip_samples_u8(ma_uint8* pDst, const ma_int16* pSrc, ma_uint64 count);
-MA_API void ma_clip_samples_s16(ma_int16* pDst, const ma_int32* pSrc, ma_uint64 count);
-MA_API void ma_clip_samples_s24(ma_uint8* pDst, const ma_int64* pSrc, ma_uint64 count);
-MA_API void ma_clip_samples_s32(ma_int32* pDst, const ma_int64* pSrc, ma_uint64 count);
-MA_API void ma_clip_samples_f32(float* pDst, const float* pSrc, ma_uint64 count);
-MA_API void ma_clip_pcm_frames(void* pDst, const void* pSrc, ma_uint64 frameCount, ma_format format, ma_uint32 channels);
-
-/*
-Helper for applying a volume factor to samples.
-
-Note that the source and destination buffers can be the same, in which case it'll perform the operation in-place.
-*/
-MA_API void ma_copy_and_apply_volume_factor_u8(ma_uint8* pSamplesOut, const ma_uint8* pSamplesIn, ma_uint64 sampleCount, float factor);
-MA_API void ma_copy_and_apply_volume_factor_s16(ma_int16* pSamplesOut, const ma_int16* pSamplesIn, ma_uint64 sampleCount, float factor);
-MA_API void ma_copy_and_apply_volume_factor_s24(void* pSamplesOut, const void* pSamplesIn, ma_uint64 sampleCount, float factor);
-MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_int32* pSamplesIn, ma_uint64 sampleCount, float factor);
-MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor);
-
-MA_API void ma_apply_volume_factor_u8(ma_uint8* pSamples, ma_uint64 sampleCount, float factor);
-MA_API void ma_apply_volume_factor_s16(ma_int16* pSamples, ma_uint64 sampleCount, float factor);
-MA_API void ma_apply_volume_factor_s24(void* pSamples, ma_uint64 sampleCount, float factor);
-MA_API void ma_apply_volume_factor_s32(ma_int32* pSamples, ma_uint64 sampleCount, float factor);
-MA_API void ma_apply_volume_factor_f32(float* pSamples, ma_uint64 sampleCount, float factor);
-
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_u8(ma_uint8* pFramesOut, const ma_uint8* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_s16(ma_int16* pFramesOut, const ma_int16* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_s24(void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_s32(ma_int32* pFramesOut, const ma_int32* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_f32(float* pFramesOut, const float* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames(void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount, ma_format format, ma_uint32 channels, float factor);
-
-MA_API void ma_apply_volume_factor_pcm_frames_u8(ma_uint8* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_apply_volume_factor_pcm_frames_s16(ma_int16* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_apply_volume_factor_pcm_frames_s24(void* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_apply_volume_factor_pcm_frames_s32(ma_int32* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_apply_volume_factor_pcm_frames_f32(float* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor);
-MA_API void ma_apply_volume_factor_pcm_frames(void* pFrames, ma_uint64 frameCount, ma_format format, ma_uint32 channels, float factor);
-
-MA_API void ma_copy_and_apply_volume_factor_per_channel_f32(float* pFramesOut, const float* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float* pChannelGains);
-
-
-MA_API void ma_copy_and_apply_volume_and_clip_samples_u8(ma_uint8* pDst, const ma_int16* pSrc, ma_uint64 count, float volume);
-MA_API void ma_copy_and_apply_volume_and_clip_samples_s16(ma_int16* pDst, const ma_int32* pSrc, ma_uint64 count, float volume);
-MA_API void ma_copy_and_apply_volume_and_clip_samples_s24(ma_uint8* pDst, const ma_int64* pSrc, ma_uint64 count, float volume);
-MA_API void ma_copy_and_apply_volume_and_clip_samples_s32(ma_int32* pDst, const ma_int64* pSrc, ma_uint64 count, float volume);
-MA_API void ma_copy_and_apply_volume_and_clip_samples_f32(float* pDst, const float* pSrc, ma_uint64 count, float volume);
-MA_API void ma_copy_and_apply_volume_and_clip_pcm_frames(void* pDst, const void* pSrc, ma_uint64 frameCount, ma_format format, ma_uint32 channels, float volume);
-
-
-/*
-Helper for converting a linear factor to gain in decibels.
-*/
-MA_API float ma_volume_linear_to_db(float factor);
-
-/*
-Helper for converting gain in decibels to a linear factor.
-*/
-MA_API float ma_volume_db_to_linear(float gain);
-
-
-/*
-Mixes the specified number of frames in floating point format with a volume factor.
-
-This will run on an optimized path when the volume is equal to 1.
-*/
-MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume);
-
-
-
-
-/************************************************************************************************************************************************************
-
-VFS
-===
-
-The VFS object (virtual file system) is what's used to customize file access. This is useful in cases where stdio FILE* based APIs may not be entirely
-appropriate for a given situation.
-
-************************************************************************************************************************************************************/
-typedef void      ma_vfs;
-typedef ma_handle ma_vfs_file;
-
-typedef enum
-{
-    MA_OPEN_MODE_READ  = 0x00000001,
-    MA_OPEN_MODE_WRITE = 0x00000002
-} ma_open_mode_flags;
-
-typedef enum
-{
-    ma_seek_origin_start,
-    ma_seek_origin_current,
-    ma_seek_origin_end  /* Not used by decoders. */
-} ma_seek_origin;
-
-typedef struct
-{
-    ma_uint64 sizeInBytes;
-} ma_file_info;
-
-typedef struct
-{
-    ma_result (* onOpen) (ma_vfs* pVFS, const char* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile);
-    ma_result (* onOpenW)(ma_vfs* pVFS, const wchar_t* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile);
-    ma_result (* onClose)(ma_vfs* pVFS, ma_vfs_file file);
-    ma_result (* onRead) (ma_vfs* pVFS, ma_vfs_file file, void* pDst, size_t sizeInBytes, size_t* pBytesRead);
-    ma_result (* onWrite)(ma_vfs* pVFS, ma_vfs_file file, const void* pSrc, size_t sizeInBytes, size_t* pBytesWritten);
-    ma_result (* onSeek) (ma_vfs* pVFS, ma_vfs_file file, ma_int64 offset, ma_seek_origin origin);
-    ma_result (* onTell) (ma_vfs* pVFS, ma_vfs_file file, ma_int64* pCursor);
-    ma_result (* onInfo) (ma_vfs* pVFS, ma_vfs_file file, ma_file_info* pInfo);
-} ma_vfs_callbacks;
-
-MA_API ma_result ma_vfs_open(ma_vfs* pVFS, const char* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile);
-MA_API ma_result ma_vfs_open_w(ma_vfs* pVFS, const wchar_t* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile);
-MA_API ma_result ma_vfs_close(ma_vfs* pVFS, ma_vfs_file file);
-MA_API ma_result ma_vfs_read(ma_vfs* pVFS, ma_vfs_file file, void* pDst, size_t sizeInBytes, size_t* pBytesRead);
-MA_API ma_result ma_vfs_write(ma_vfs* pVFS, ma_vfs_file file, const void* pSrc, size_t sizeInBytes, size_t* pBytesWritten);
-MA_API ma_result ma_vfs_seek(ma_vfs* pVFS, ma_vfs_file file, ma_int64 offset, ma_seek_origin origin);
-MA_API ma_result ma_vfs_tell(ma_vfs* pVFS, ma_vfs_file file, ma_int64* pCursor);
-MA_API ma_result ma_vfs_info(ma_vfs* pVFS, ma_vfs_file file, ma_file_info* pInfo);
-MA_API ma_result ma_vfs_open_and_read_file(ma_vfs* pVFS, const char* pFilePath, void** ppData, size_t* pSize, const ma_allocation_callbacks* pAllocationCallbacks);
-
-typedef struct
-{
-    ma_vfs_callbacks cb;
-    ma_allocation_callbacks allocationCallbacks;    /* Only used for the wchar_t version of open() on non-Windows platforms. */
-} ma_default_vfs;
-
-MA_API ma_result ma_default_vfs_init(ma_default_vfs* pVFS, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-
-typedef ma_result (* ma_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead, size_t* pBytesRead);
-typedef ma_result (* ma_seek_proc)(void* pUserData, ma_int64 offset, ma_seek_origin origin);
-typedef ma_result (* ma_tell_proc)(void* pUserData, ma_int64* pCursor);
-
-
-
-#if !defined(MA_NO_DECODING) || !defined(MA_NO_ENCODING)
-typedef enum
-{
-    ma_encoding_format_unknown = 0,
-    ma_encoding_format_wav,
-    ma_encoding_format_flac,
-    ma_encoding_format_mp3,
-    ma_encoding_format_vorbis
-} ma_encoding_format;
-#endif
-
-/************************************************************************************************************************************************************
-
-Decoding
-========
-
-Decoders are independent of the main device API. Decoding APIs can be called freely inside the device's data callback, but they are not thread safe unless
-you do your own synchronization.
-
-************************************************************************************************************************************************************/
-#ifndef MA_NO_DECODING
-typedef struct ma_decoder ma_decoder;
-
-
-typedef struct
-{
-    ma_format preferredFormat;
-    ma_uint32 seekPointCount;   /* Set to > 0 to generate a seektable if the decoding backend supports it. */
-} ma_decoding_backend_config;
-
-MA_API ma_decoding_backend_config ma_decoding_backend_config_init(ma_format preferredFormat, ma_uint32 seekPointCount);
-
-
-typedef struct
-{
-    ma_result (* onInit      )(void* pUserData, ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend);
-    ma_result (* onInitFile  )(void* pUserData, const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend);               /* Optional. */
-    ma_result (* onInitFileW )(void* pUserData, const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend);            /* Optional. */
-    ma_result (* onInitMemory)(void* pUserData, const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend);  /* Optional. */
-    void      (* onUninit    )(void* pUserData, ma_data_source* pBackend, const ma_allocation_callbacks* pAllocationCallbacks);
-} ma_decoding_backend_vtable;
-
-
-typedef ma_result (* ma_decoder_read_proc)(ma_decoder* pDecoder, void* pBufferOut, size_t bytesToRead, size_t* pBytesRead);         /* Returns the number of bytes read. */
-typedef ma_result (* ma_decoder_seek_proc)(ma_decoder* pDecoder, ma_int64 byteOffset, ma_seek_origin origin);
-typedef ma_result (* ma_decoder_tell_proc)(ma_decoder* pDecoder, ma_int64* pCursor);
-
-typedef struct
-{
-    ma_format format;      /* Set to 0 or ma_format_unknown to use the stream's internal format. */
-    ma_uint32 channels;    /* Set to 0 to use the stream's internal channels. */
-    ma_uint32 sampleRate;  /* Set to 0 to use the stream's internal sample rate. */
-    ma_channel* pChannelMap;
-    ma_channel_mix_mode channelMixMode;
-    ma_dither_mode ditherMode;
-    ma_resampler_config resampling;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_encoding_format encodingFormat;
-    ma_uint32 seekPointCount;   /* When set to > 0, specifies the number of seek points to use for the generation of a seek table. Not all decoding backends support this. */
-    ma_decoding_backend_vtable** ppCustomBackendVTables;
-    ma_uint32 customBackendCount;
-    void* pCustomBackendUserData;
-} ma_decoder_config;
-
-struct ma_decoder
-{
-    ma_data_source_base ds;
-    ma_data_source* pBackend;                   /* The decoding backend we'll be pulling data from. */
-    const ma_decoding_backend_vtable* pBackendVTable; /* The vtable for the decoding backend. This needs to be stored so we can access the onUninit() callback. */
-    void* pBackendUserData;
-    ma_decoder_read_proc onRead;
-    ma_decoder_seek_proc onSeek;
-    ma_decoder_tell_proc onTell;
-    void* pUserData;
-    ma_uint64 readPointerInPCMFrames;      /* In output sample rate. Used for keeping track of how many frames are available for decoding. */
-    ma_format outputFormat;
-    ma_uint32 outputChannels;
-    ma_uint32 outputSampleRate;
-    ma_data_converter converter;    /* Data conversion is achieved by running frames through this. */
-    void* pInputCache;              /* In input format. Can be null if it's not needed. */
-    ma_uint64 inputCacheCap;        /* The capacity of the input cache. */
-    ma_uint64 inputCacheConsumed;   /* The number of frames that have been consumed in the cache. Used for determining the next valid frame. */
-    ma_uint64 inputCacheRemaining;  /* The number of valid frames remaining in the cahce. */
-    ma_allocation_callbacks allocationCallbacks;
-    union
-    {
-        struct
-        {
-            ma_vfs* pVFS;
-            ma_vfs_file file;
-        } vfs;
-        struct
-        {
-            const ma_uint8* pData;
-            size_t dataSize;
-            size_t currentReadPos;
-        } memory;               /* Only used for decoders that were opened against a block of memory. */
-    } data;
-};
-
-MA_API ma_decoder_config ma_decoder_config_init(ma_format outputFormat, ma_uint32 outputChannels, ma_uint32 outputSampleRate);
-MA_API ma_decoder_config ma_decoder_config_init_default(void);
-
-MA_API ma_result ma_decoder_init(ma_decoder_read_proc onRead, ma_decoder_seek_proc onSeek, void* pUserData, const ma_decoder_config* pConfig, ma_decoder* pDecoder);
-MA_API ma_result ma_decoder_init_memory(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder);
-MA_API ma_result ma_decoder_init_vfs(ma_vfs* pVFS, const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder);
-MA_API ma_result ma_decoder_init_vfs_w(ma_vfs* pVFS, const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder);
-MA_API ma_result ma_decoder_init_file(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder);
-MA_API ma_result ma_decoder_init_file_w(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder);
-
-/*
-Uninitializes a decoder.
-*/
-MA_API ma_result ma_decoder_uninit(ma_decoder* pDecoder);
-
-/*
-Reads PCM frames from the given decoder.
-
-This is not thread safe without your own synchronization.
-*/
-MA_API ma_result ma_decoder_read_pcm_frames(ma_decoder* pDecoder, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-
-/*
-Seeks to a PCM frame based on it's absolute index.
-
-This is not thread safe without your own synchronization.
-*/
-MA_API ma_result ma_decoder_seek_to_pcm_frame(ma_decoder* pDecoder, ma_uint64 frameIndex);
-
-/*
-Retrieves the decoder's output data format.
-*/
-MA_API ma_result ma_decoder_get_data_format(ma_decoder* pDecoder, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-
-/*
-Retrieves the current position of the read cursor in PCM frames.
-*/
-MA_API ma_result ma_decoder_get_cursor_in_pcm_frames(ma_decoder* pDecoder, ma_uint64* pCursor);
-
-/*
-Retrieves the length of the decoder in PCM frames.
-
-Do not call this on streams of an undefined length, such as internet radio.
-
-If the length is unknown or an error occurs, 0 will be returned.
-
-This will always return 0 for Vorbis decoders. This is due to a limitation with stb_vorbis in push mode which is what miniaudio
-uses internally.
-
-For MP3's, this will decode the entire file. Do not call this in time critical scenarios.
-
-This function is not thread safe without your own synchronization.
-*/
-MA_API ma_result ma_decoder_get_length_in_pcm_frames(ma_decoder* pDecoder, ma_uint64* pLength);
-
-/*
-Retrieves the number of frames that can be read before reaching the end.
-
-This calls `ma_decoder_get_length_in_pcm_frames()` so you need to be aware of the rules for that function, in
-particular ensuring you do not call it on streams of an undefined length, such as internet radio.
-
-If the total length of the decoder cannot be retrieved, such as with Vorbis decoders, `MA_NOT_IMPLEMENTED` will be
-returned.
-*/
-MA_API ma_result ma_decoder_get_available_frames(ma_decoder* pDecoder, ma_uint64* pAvailableFrames);
-
-/*
-Helper for opening and decoding a file into a heap allocated block of memory. Free the returned pointer with ma_free(). On input,
-pConfig should be set to what you want. On output it will be set to what you got.
-*/
-MA_API ma_result ma_decode_from_vfs(ma_vfs* pVFS, const char* pFilePath, ma_decoder_config* pConfig, ma_uint64* pFrameCountOut, void** ppPCMFramesOut);
-MA_API ma_result ma_decode_file(const char* pFilePath, ma_decoder_config* pConfig, ma_uint64* pFrameCountOut, void** ppPCMFramesOut);
-MA_API ma_result ma_decode_memory(const void* pData, size_t dataSize, ma_decoder_config* pConfig, ma_uint64* pFrameCountOut, void** ppPCMFramesOut);
-
-#endif  /* MA_NO_DECODING */
-
-
-/************************************************************************************************************************************************************
-
-Encoding
-========
-
-Encoders do not perform any format conversion for you. If your target format does not support the format, and error will be returned.
-
-************************************************************************************************************************************************************/
-#ifndef MA_NO_ENCODING
-typedef struct ma_encoder ma_encoder;
-
-typedef ma_result (* ma_encoder_write_proc)           (ma_encoder* pEncoder, const void* pBufferIn, size_t bytesToWrite, size_t* pBytesWritten);
-typedef ma_result (* ma_encoder_seek_proc)            (ma_encoder* pEncoder, ma_int64 offset, ma_seek_origin origin);
-typedef ma_result (* ma_encoder_init_proc)            (ma_encoder* pEncoder);
-typedef void      (* ma_encoder_uninit_proc)          (ma_encoder* pEncoder);
-typedef ma_result (* ma_encoder_write_pcm_frames_proc)(ma_encoder* pEncoder, const void* pFramesIn, ma_uint64 frameCount, ma_uint64* pFramesWritten);
-
-typedef struct
-{
-    ma_encoding_format encodingFormat;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_allocation_callbacks allocationCallbacks;
-} ma_encoder_config;
-
-MA_API ma_encoder_config ma_encoder_config_init(ma_encoding_format encodingFormat, ma_format format, ma_uint32 channels, ma_uint32 sampleRate);
-
-struct ma_encoder
-{
-    ma_encoder_config config;
-    ma_encoder_write_proc onWrite;
-    ma_encoder_seek_proc onSeek;
-    ma_encoder_init_proc onInit;
-    ma_encoder_uninit_proc onUninit;
-    ma_encoder_write_pcm_frames_proc onWritePCMFrames;
-    void* pUserData;
-    void* pInternalEncoder;
-    union
-    {
-        struct
-        {
-            ma_vfs* pVFS;
-            ma_vfs_file file;
-        } vfs;
-    } data;
-};
-
-MA_API ma_result ma_encoder_init(ma_encoder_write_proc onWrite, ma_encoder_seek_proc onSeek, void* pUserData, const ma_encoder_config* pConfig, ma_encoder* pEncoder);
-MA_API ma_result ma_encoder_init_vfs(ma_vfs* pVFS, const char* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder);
-MA_API ma_result ma_encoder_init_vfs_w(ma_vfs* pVFS, const wchar_t* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder);
-MA_API ma_result ma_encoder_init_file(const char* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder);
-MA_API ma_result ma_encoder_init_file_w(const wchar_t* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder);
-MA_API void ma_encoder_uninit(ma_encoder* pEncoder);
-MA_API ma_result ma_encoder_write_pcm_frames(ma_encoder* pEncoder, const void* pFramesIn, ma_uint64 frameCount, ma_uint64* pFramesWritten);
-
-#endif /* MA_NO_ENCODING */
-
-
-/************************************************************************************************************************************************************
-
-Generation
-
-************************************************************************************************************************************************************/
-#ifndef MA_NO_GENERATION
-typedef enum
-{
-    ma_waveform_type_sine,
-    ma_waveform_type_square,
-    ma_waveform_type_triangle,
-    ma_waveform_type_sawtooth
-} ma_waveform_type;
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_waveform_type type;
-    double amplitude;
-    double frequency;
-} ma_waveform_config;
-
-MA_API ma_waveform_config ma_waveform_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, ma_waveform_type type, double amplitude, double frequency);
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_waveform_config config;
-    double advance;
-    double time;
-} ma_waveform;
-
-MA_API ma_result ma_waveform_init(const ma_waveform_config* pConfig, ma_waveform* pWaveform);
-MA_API void ma_waveform_uninit(ma_waveform* pWaveform);
-MA_API ma_result ma_waveform_read_pcm_frames(ma_waveform* pWaveform, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_waveform_seek_to_pcm_frame(ma_waveform* pWaveform, ma_uint64 frameIndex);
-MA_API ma_result ma_waveform_set_amplitude(ma_waveform* pWaveform, double amplitude);
-MA_API ma_result ma_waveform_set_frequency(ma_waveform* pWaveform, double frequency);
-MA_API ma_result ma_waveform_set_type(ma_waveform* pWaveform, ma_waveform_type type);
-MA_API ma_result ma_waveform_set_sample_rate(ma_waveform* pWaveform, ma_uint32 sampleRate);
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    double dutyCycle;
-    double amplitude;
-    double frequency;
-} ma_pulsewave_config;
-
-MA_API ma_pulsewave_config ma_pulsewave_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double dutyCycle, double amplitude, double frequency);
-
-typedef struct
-{
-    ma_waveform waveform;
-    ma_pulsewave_config config;
-} ma_pulsewave;
-
-MA_API ma_result ma_pulsewave_init(const ma_pulsewave_config* pConfig, ma_pulsewave* pWaveform);
-MA_API void ma_pulsewave_uninit(ma_pulsewave* pWaveform);
-MA_API ma_result ma_pulsewave_read_pcm_frames(ma_pulsewave* pWaveform, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_pulsewave_seek_to_pcm_frame(ma_pulsewave* pWaveform, ma_uint64 frameIndex);
-MA_API ma_result ma_pulsewave_set_amplitude(ma_pulsewave* pWaveform, double amplitude);
-MA_API ma_result ma_pulsewave_set_frequency(ma_pulsewave* pWaveform, double frequency);
-MA_API ma_result ma_pulsewave_set_sample_rate(ma_pulsewave* pWaveform, ma_uint32 sampleRate);
-MA_API ma_result ma_pulsewave_set_duty_cycle(ma_pulsewave* pWaveform, double dutyCycle);
-
-typedef enum
-{
-    ma_noise_type_white,
-    ma_noise_type_pink,
-    ma_noise_type_brownian
-} ma_noise_type;
-
-
-typedef struct
-{
-    ma_format format;
-    ma_uint32 channels;
-    ma_noise_type type;
-    ma_int32 seed;
-    double amplitude;
-    ma_bool32 duplicateChannels;
-} ma_noise_config;
-
-MA_API ma_noise_config ma_noise_config_init(ma_format format, ma_uint32 channels, ma_noise_type type, ma_int32 seed, double amplitude);
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_noise_config config;
-    ma_lcg lcg;
-    union
-    {
-        struct
-        {
-            double** bin;
-            double* accumulation;
-            ma_uint32* counter;
-        } pink;
-        struct
-        {
-            double* accumulation;
-        } brownian;
-    } state;
-
-    /* Memory management. */
-    void* _pHeap;
-    ma_bool32 _ownsHeap;
-} ma_noise;
-
-MA_API ma_result ma_noise_get_heap_size(const ma_noise_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_noise_init_preallocated(const ma_noise_config* pConfig, void* pHeap, ma_noise* pNoise);
-MA_API ma_result ma_noise_init(const ma_noise_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_noise* pNoise);
-MA_API void ma_noise_uninit(ma_noise* pNoise, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_noise_read_pcm_frames(ma_noise* pNoise, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_noise_set_amplitude(ma_noise* pNoise, double amplitude);
-MA_API ma_result ma_noise_set_seed(ma_noise* pNoise, ma_int32 seed);
-MA_API ma_result ma_noise_set_type(ma_noise* pNoise, ma_noise_type type);
-
-#endif  /* MA_NO_GENERATION */
-
-
-
-/************************************************************************************************************************************************************
-
-Resource Manager
-
-************************************************************************************************************************************************************/
-/* The resource manager cannot be enabled if there is no decoder. */
-#if !defined(MA_NO_RESOURCE_MANAGER) && defined(MA_NO_DECODING)
-#define MA_NO_RESOURCE_MANAGER
-#endif
-
-#ifndef MA_NO_RESOURCE_MANAGER
-typedef struct ma_resource_manager                  ma_resource_manager;
-typedef struct ma_resource_manager_data_buffer_node ma_resource_manager_data_buffer_node;
-typedef struct ma_resource_manager_data_buffer      ma_resource_manager_data_buffer;
-typedef struct ma_resource_manager_data_stream      ma_resource_manager_data_stream;
-typedef struct ma_resource_manager_data_source      ma_resource_manager_data_source;
-
-typedef enum
-{
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM         = 0x00000001,   /* When set, does not load the entire data source in memory. Disk I/O will happen on job threads. */
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE         = 0x00000002,   /* Decode data before storing in memory. When set, decoding is done at the resource manager level rather than the mixing thread. Results in faster mixing, but higher memory usage. */
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC          = 0x00000004,   /* When set, the resource manager will load the data source asynchronously. */
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT      = 0x00000008,   /* When set, waits for initialization of the underlying data source before returning from ma_resource_manager_data_source_init(). */
-    MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_UNKNOWN_LENGTH = 0x00000010    /* Gives the resource manager a hint that the length of the data source is unknown and calling `ma_data_source_get_length_in_pcm_frames()` should be avoided. */
-} ma_resource_manager_data_source_flags;
-
-
-/*
-Pipeline notifications used by the resource manager. Made up of both an async notification and a fence, both of which are optional.
-*/
-typedef struct
-{
-    ma_async_notification* pNotification;
-    ma_fence* pFence;
-} ma_resource_manager_pipeline_stage_notification;
-
-typedef struct
-{
-    ma_resource_manager_pipeline_stage_notification init;    /* Initialization of the decoder. */
-    ma_resource_manager_pipeline_stage_notification done;    /* Decoding fully completed. */
-} ma_resource_manager_pipeline_notifications;
-
-MA_API ma_resource_manager_pipeline_notifications ma_resource_manager_pipeline_notifications_init(void);
-
-
-
-/* BEGIN BACKWARDS COMPATIBILITY */
-/* TODO: Remove this block in version 0.12. */
-#if 1
-#define ma_resource_manager_job                         ma_job
-#define ma_resource_manager_job_init                    ma_job_init
-#define MA_JOB_TYPE_RESOURCE_MANAGER_QUEUE_FLAG_NON_BLOCKING MA_JOB_QUEUE_FLAG_NON_BLOCKING
-#define ma_resource_manager_job_queue_config            ma_job_queue_config
-#define ma_resource_manager_job_queue_config_init       ma_job_queue_config_init
-#define ma_resource_manager_job_queue                   ma_job_queue
-#define ma_resource_manager_job_queue_get_heap_size     ma_job_queue_get_heap_size
-#define ma_resource_manager_job_queue_init_preallocated ma_job_queue_init_preallocated
-#define ma_resource_manager_job_queue_init              ma_job_queue_init
-#define ma_resource_manager_job_queue_uninit            ma_job_queue_uninit
-#define ma_resource_manager_job_queue_post              ma_job_queue_post
-#define ma_resource_manager_job_queue_next              ma_job_queue_next
-#endif
-/* END BACKWARDS COMPATIBILITY */
-
-
-
-
-/* Maximum job thread count will be restricted to this, but this may be removed later and replaced with a heap allocation thereby removing any limitation. */
-#ifndef MA_RESOURCE_MANAGER_MAX_JOB_THREAD_COUNT
-#define MA_RESOURCE_MANAGER_MAX_JOB_THREAD_COUNT    64
-#endif
-
-typedef enum
-{
-    /* Indicates ma_resource_manager_next_job() should not block. Only valid when the job thread count is 0. */
-    MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING = 0x00000001,
-
-    /* Disables any kind of multithreading. Implicitly enables MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING. */
-    MA_RESOURCE_MANAGER_FLAG_NO_THREADING = 0x00000002
-} ma_resource_manager_flags;
-
-typedef struct
-{
-    const char* pFilePath;
-    const wchar_t* pFilePathW;
-    const ma_resource_manager_pipeline_notifications* pNotifications;
-    ma_uint64 initialSeekPointInPCMFrames;
-    ma_uint64 rangeBegInPCMFrames;
-    ma_uint64 rangeEndInPCMFrames;
-    ma_uint64 loopPointBegInPCMFrames;
-    ma_uint64 loopPointEndInPCMFrames;
-    ma_bool32 isLooping;
-    ma_uint32 flags;
-} ma_resource_manager_data_source_config;
-
-MA_API ma_resource_manager_data_source_config ma_resource_manager_data_source_config_init(void);
-
-
-typedef enum
-{
-    ma_resource_manager_data_supply_type_unknown = 0,   /* Used for determining whether or the data supply has been initialized. */
-    ma_resource_manager_data_supply_type_encoded,       /* Data supply is an encoded buffer. Connector is ma_decoder. */
-    ma_resource_manager_data_supply_type_decoded,       /* Data supply is a decoded buffer. Connector is ma_audio_buffer. */
-    ma_resource_manager_data_supply_type_decoded_paged  /* Data supply is a linked list of decoded buffers. Connector is ma_paged_audio_buffer. */
-} ma_resource_manager_data_supply_type;
-
-typedef struct
-{
-    MA_ATOMIC(4, ma_resource_manager_data_supply_type) type;    /* Read and written from different threads so needs to be accessed atomically. */
-    union
-    {
-        struct
-        {
-            const void* pData;
-            size_t sizeInBytes;
-        } encoded;
-        struct
-        {
-            const void* pData;
-            ma_uint64 totalFrameCount;
-            ma_uint64 decodedFrameCount;
-            ma_format format;
-            ma_uint32 channels;
-            ma_uint32 sampleRate;
-        } decoded;
-        struct
-        {
-            ma_paged_audio_buffer_data data;
-            ma_uint64 decodedFrameCount;
-            ma_uint32 sampleRate;
-        } decodedPaged;
-    } backend;
-} ma_resource_manager_data_supply;
-
-struct ma_resource_manager_data_buffer_node
-{
-    ma_uint32 hashedName32;                         /* The hashed name. This is the key. */
-    ma_uint32 refCount;
-    MA_ATOMIC(4, ma_result) result;                 /* Result from asynchronous loading. When loading set to MA_BUSY. When fully loaded set to MA_SUCCESS. When deleting set to MA_UNAVAILABLE. */
-    MA_ATOMIC(4, ma_uint32) executionCounter;       /* For allocating execution orders for jobs. */
-    MA_ATOMIC(4, ma_uint32) executionPointer;       /* For managing the order of execution for asynchronous jobs relating to this object. Incremented as jobs complete processing. */
-    ma_bool32 isDataOwnedByResourceManager;         /* Set to true when the underlying data buffer was allocated the resource manager. Set to false if it is owned by the application (via ma_resource_manager_register_*()). */
-    ma_resource_manager_data_supply data;
-    ma_resource_manager_data_buffer_node* pParent;
-    ma_resource_manager_data_buffer_node* pChildLo;
-    ma_resource_manager_data_buffer_node* pChildHi;
-};
-
-struct ma_resource_manager_data_buffer
-{
-    ma_data_source_base ds;                         /* Base data source. A data buffer is a data source. */
-    ma_resource_manager* pResourceManager;          /* A pointer to the resource manager that owns this buffer. */
-    ma_resource_manager_data_buffer_node* pNode;    /* The data node. This is reference counted and is what supplies the data. */
-    ma_uint32 flags;                                /* The flags that were passed used to initialize the buffer. */
-    MA_ATOMIC(4, ma_uint32) executionCounter;       /* For allocating execution orders for jobs. */
-    MA_ATOMIC(4, ma_uint32) executionPointer;       /* For managing the order of execution for asynchronous jobs relating to this object. Incremented as jobs complete processing. */
-    ma_uint64 seekTargetInPCMFrames;                /* Only updated by the public API. Never written nor read from the job thread. */
-    ma_bool32 seekToCursorOnNextRead;               /* On the next read we need to seek to the frame cursor. */
-    MA_ATOMIC(4, ma_result) result;                 /* Keeps track of a result of decoding. Set to MA_BUSY while the buffer is still loading. Set to MA_SUCCESS when loading is finished successfully. Otherwise set to some other code. */
-    MA_ATOMIC(4, ma_bool32) isLooping;              /* Can be read and written by different threads at the same time. Must be used atomically. */
-    ma_atomic_bool32 isConnectorInitialized;        /* Used for asynchronous loading to ensure we don't try to initialize the connector multiple times while waiting for the node to fully load. */
-    union
-    {
-        ma_decoder decoder;                 /* Supply type is ma_resource_manager_data_supply_type_encoded */
-        ma_audio_buffer buffer;             /* Supply type is ma_resource_manager_data_supply_type_decoded */
-        ma_paged_audio_buffer pagedBuffer;  /* Supply type is ma_resource_manager_data_supply_type_decoded_paged */
-    } connector;    /* Connects this object to the node's data supply. */
-};
-
-struct ma_resource_manager_data_stream
-{
-    ma_data_source_base ds;                     /* Base data source. A data stream is a data source. */
-    ma_resource_manager* pResourceManager;      /* A pointer to the resource manager that owns this data stream. */
-    ma_uint32 flags;                            /* The flags that were passed used to initialize the stream. */
-    ma_decoder decoder;                         /* Used for filling pages with data. This is only ever accessed by the job thread. The public API should never touch this. */
-    ma_bool32 isDecoderInitialized;             /* Required for determining whether or not the decoder should be uninitialized in MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_STREAM. */
-    ma_uint64 totalLengthInPCMFrames;           /* This is calculated when first loaded by the MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_STREAM. */
-    ma_uint32 relativeCursor;                   /* The playback cursor, relative to the current page. Only ever accessed by the public API. Never accessed by the job thread. */
-    MA_ATOMIC(8, ma_uint64) absoluteCursor;     /* The playback cursor, in absolute position starting from the start of the file. */
-    ma_uint32 currentPageIndex;                 /* Toggles between 0 and 1. Index 0 is the first half of pPageData. Index 1 is the second half. Only ever accessed by the public API. Never accessed by the job thread. */
-    MA_ATOMIC(4, ma_uint32) executionCounter;   /* For allocating execution orders for jobs. */
-    MA_ATOMIC(4, ma_uint32) executionPointer;   /* For managing the order of execution for asynchronous jobs relating to this object. Incremented as jobs complete processing. */
-
-    /* Written by the public API, read by the job thread. */
-    MA_ATOMIC(4, ma_bool32) isLooping;          /* Whether or not the stream is looping. It's important to set the looping flag at the data stream level for smooth loop transitions. */
-
-    /* Written by the job thread, read by the public API. */
-    void* pPageData;                            /* Buffer containing the decoded data of each page. Allocated once at initialization time. */
-    MA_ATOMIC(4, ma_uint32) pageFrameCount[2];  /* The number of valid PCM frames in each page. Used to determine the last valid frame. */
-
-    /* Written and read by both the public API and the job thread. These must be atomic. */
-    MA_ATOMIC(4, ma_result) result;             /* Result from asynchronous loading. When loading set to MA_BUSY. When initialized set to MA_SUCCESS. When deleting set to MA_UNAVAILABLE. If an error occurs when loading, set to an error code. */
-    MA_ATOMIC(4, ma_bool32) isDecoderAtEnd;     /* Whether or not the decoder has reached the end. */
-    MA_ATOMIC(4, ma_bool32) isPageValid[2];     /* Booleans to indicate whether or not a page is valid. Set to false by the public API, set to true by the job thread. Set to false as the pages are consumed, true when they are filled. */
-    MA_ATOMIC(4, ma_bool32) seekCounter;        /* When 0, no seeking is being performed. When > 0, a seek is being performed and reading should be delayed with MA_BUSY. */
-};
-
-struct ma_resource_manager_data_source
-{
-    union
-    {
-        ma_resource_manager_data_buffer buffer;
-        ma_resource_manager_data_stream stream;
-    } backend;  /* Must be the first item because we need the first item to be the data source callbacks for the buffer or stream. */
-
-    ma_uint32 flags;                          /* The flags that were passed in to ma_resource_manager_data_source_init(). */
-    MA_ATOMIC(4, ma_uint32) executionCounter;     /* For allocating execution orders for jobs. */
-    MA_ATOMIC(4, ma_uint32) executionPointer;     /* For managing the order of execution for asynchronous jobs relating to this object. Incremented as jobs complete processing. */
-};
-
-typedef struct
-{
-    ma_allocation_callbacks allocationCallbacks;
-    ma_log* pLog;
-    ma_format decodedFormat;        /* The decoded format to use. Set to ma_format_unknown (default) to use the file's native format. */
-    ma_uint32 decodedChannels;      /* The decoded channel count to use. Set to 0 (default) to use the file's native channel count. */
-    ma_uint32 decodedSampleRate;    /* the decoded sample rate to use. Set to 0 (default) to use the file's native sample rate. */
-    ma_uint32 jobThreadCount;       /* Set to 0 if you want to self-manage your job threads. Defaults to 1. */
-    size_t jobThreadStackSize;
-    ma_uint32 jobQueueCapacity;     /* The maximum number of jobs that can fit in the queue at a time. Defaults to MA_JOB_TYPE_RESOURCE_MANAGER_QUEUE_CAPACITY. Cannot be zero. */
-    ma_uint32 flags;
-    ma_vfs* pVFS;                   /* Can be NULL in which case defaults will be used. */
-    ma_decoding_backend_vtable** ppCustomDecodingBackendVTables;
-    ma_uint32 customDecodingBackendCount;
-    void* pCustomDecodingBackendUserData;
-} ma_resource_manager_config;
-
-MA_API ma_resource_manager_config ma_resource_manager_config_init(void);
-
-struct ma_resource_manager
-{
-    ma_resource_manager_config config;
-    ma_resource_manager_data_buffer_node* pRootDataBufferNode;      /* The root buffer in the binary tree. */
-#ifndef MA_NO_THREADING
-    ma_mutex dataBufferBSTLock;                                     /* For synchronizing access to the data buffer binary tree. */
-    ma_thread jobThreads[MA_RESOURCE_MANAGER_MAX_JOB_THREAD_COUNT]; /* The threads for executing jobs. */
-#endif
-    ma_job_queue jobQueue;                                          /* Multi-consumer, multi-producer job queue for managing jobs for asynchronous decoding and streaming. */
-    ma_default_vfs defaultVFS;                                      /* Only used if a custom VFS is not specified. */
-    ma_log log;                                                     /* Only used if no log was specified in the config. */
-};
-
-/* Init. */
-MA_API ma_result ma_resource_manager_init(const ma_resource_manager_config* pConfig, ma_resource_manager* pResourceManager);
-MA_API void ma_resource_manager_uninit(ma_resource_manager* pResourceManager);
-MA_API ma_log* ma_resource_manager_get_log(ma_resource_manager* pResourceManager);
-
-/* Registration. */
-MA_API ma_result ma_resource_manager_register_file(ma_resource_manager* pResourceManager, const char* pFilePath, ma_uint32 flags);
-MA_API ma_result ma_resource_manager_register_file_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath, ma_uint32 flags);
-MA_API ma_result ma_resource_manager_register_decoded_data(ma_resource_manager* pResourceManager, const char* pName, const void* pData, ma_uint64 frameCount, ma_format format, ma_uint32 channels, ma_uint32 sampleRate);  /* Does not copy. Increments the reference count if already exists and returns MA_SUCCESS. */
-MA_API ma_result ma_resource_manager_register_decoded_data_w(ma_resource_manager* pResourceManager, const wchar_t* pName, const void* pData, ma_uint64 frameCount, ma_format format, ma_uint32 channels, ma_uint32 sampleRate);
-MA_API ma_result ma_resource_manager_register_encoded_data(ma_resource_manager* pResourceManager, const char* pName, const void* pData, size_t sizeInBytes);    /* Does not copy. Increments the reference count if already exists and returns MA_SUCCESS. */
-MA_API ma_result ma_resource_manager_register_encoded_data_w(ma_resource_manager* pResourceManager, const wchar_t* pName, const void* pData, size_t sizeInBytes);
-MA_API ma_result ma_resource_manager_unregister_file(ma_resource_manager* pResourceManager, const char* pFilePath);
-MA_API ma_result ma_resource_manager_unregister_file_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath);
-MA_API ma_result ma_resource_manager_unregister_data(ma_resource_manager* pResourceManager, const char* pName);
-MA_API ma_result ma_resource_manager_unregister_data_w(ma_resource_manager* pResourceManager, const wchar_t* pName);
-
-/* Data Buffers. */
-MA_API ma_result ma_resource_manager_data_buffer_init_ex(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_resource_manager_data_buffer* pDataBuffer);
-MA_API ma_result ma_resource_manager_data_buffer_init(ma_resource_manager* pResourceManager, const char* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_buffer* pDataBuffer);
-MA_API ma_result ma_resource_manager_data_buffer_init_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_buffer* pDataBuffer);
-MA_API ma_result ma_resource_manager_data_buffer_init_copy(ma_resource_manager* pResourceManager, const ma_resource_manager_data_buffer* pExistingDataBuffer, ma_resource_manager_data_buffer* pDataBuffer);
-MA_API ma_result ma_resource_manager_data_buffer_uninit(ma_resource_manager_data_buffer* pDataBuffer);
-MA_API ma_result ma_resource_manager_data_buffer_read_pcm_frames(ma_resource_manager_data_buffer* pDataBuffer, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_resource_manager_data_buffer_seek_to_pcm_frame(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64 frameIndex);
-MA_API ma_result ma_resource_manager_data_buffer_get_data_format(ma_resource_manager_data_buffer* pDataBuffer, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_resource_manager_data_buffer_get_cursor_in_pcm_frames(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64* pCursor);
-MA_API ma_result ma_resource_manager_data_buffer_get_length_in_pcm_frames(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64* pLength);
-MA_API ma_result ma_resource_manager_data_buffer_result(const ma_resource_manager_data_buffer* pDataBuffer);
-MA_API ma_result ma_resource_manager_data_buffer_set_looping(ma_resource_manager_data_buffer* pDataBuffer, ma_bool32 isLooping);
-MA_API ma_bool32 ma_resource_manager_data_buffer_is_looping(const ma_resource_manager_data_buffer* pDataBuffer);
-MA_API ma_result ma_resource_manager_data_buffer_get_available_frames(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64* pAvailableFrames);
-
-/* Data Streams. */
-MA_API ma_result ma_resource_manager_data_stream_init_ex(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_resource_manager_data_stream* pDataStream);
-MA_API ma_result ma_resource_manager_data_stream_init(ma_resource_manager* pResourceManager, const char* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_stream* pDataStream);
-MA_API ma_result ma_resource_manager_data_stream_init_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_stream* pDataStream);
-MA_API ma_result ma_resource_manager_data_stream_uninit(ma_resource_manager_data_stream* pDataStream);
-MA_API ma_result ma_resource_manager_data_stream_read_pcm_frames(ma_resource_manager_data_stream* pDataStream, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_resource_manager_data_stream_seek_to_pcm_frame(ma_resource_manager_data_stream* pDataStream, ma_uint64 frameIndex);
-MA_API ma_result ma_resource_manager_data_stream_get_data_format(ma_resource_manager_data_stream* pDataStream, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_resource_manager_data_stream_get_cursor_in_pcm_frames(ma_resource_manager_data_stream* pDataStream, ma_uint64* pCursor);
-MA_API ma_result ma_resource_manager_data_stream_get_length_in_pcm_frames(ma_resource_manager_data_stream* pDataStream, ma_uint64* pLength);
-MA_API ma_result ma_resource_manager_data_stream_result(const ma_resource_manager_data_stream* pDataStream);
-MA_API ma_result ma_resource_manager_data_stream_set_looping(ma_resource_manager_data_stream* pDataStream, ma_bool32 isLooping);
-MA_API ma_bool32 ma_resource_manager_data_stream_is_looping(const ma_resource_manager_data_stream* pDataStream);
-MA_API ma_result ma_resource_manager_data_stream_get_available_frames(ma_resource_manager_data_stream* pDataStream, ma_uint64* pAvailableFrames);
-
-/* Data Sources. */
-MA_API ma_result ma_resource_manager_data_source_init_ex(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_resource_manager_data_source* pDataSource);
-MA_API ma_result ma_resource_manager_data_source_init(ma_resource_manager* pResourceManager, const char* pName, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_source* pDataSource);
-MA_API ma_result ma_resource_manager_data_source_init_w(ma_resource_manager* pResourceManager, const wchar_t* pName, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_source* pDataSource);
-MA_API ma_result ma_resource_manager_data_source_init_copy(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source* pExistingDataSource, ma_resource_manager_data_source* pDataSource);
-MA_API ma_result ma_resource_manager_data_source_uninit(ma_resource_manager_data_source* pDataSource);
-MA_API ma_result ma_resource_manager_data_source_read_pcm_frames(ma_resource_manager_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_resource_manager_data_source_seek_to_pcm_frame(ma_resource_manager_data_source* pDataSource, ma_uint64 frameIndex);
-MA_API ma_result ma_resource_manager_data_source_get_data_format(ma_resource_manager_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_resource_manager_data_source_get_cursor_in_pcm_frames(ma_resource_manager_data_source* pDataSource, ma_uint64* pCursor);
-MA_API ma_result ma_resource_manager_data_source_get_length_in_pcm_frames(ma_resource_manager_data_source* pDataSource, ma_uint64* pLength);
-MA_API ma_result ma_resource_manager_data_source_result(const ma_resource_manager_data_source* pDataSource);
-MA_API ma_result ma_resource_manager_data_source_set_looping(ma_resource_manager_data_source* pDataSource, ma_bool32 isLooping);
-MA_API ma_bool32 ma_resource_manager_data_source_is_looping(const ma_resource_manager_data_source* pDataSource);
-MA_API ma_result ma_resource_manager_data_source_get_available_frames(ma_resource_manager_data_source* pDataSource, ma_uint64* pAvailableFrames);
-
-/* Job management. */
-MA_API ma_result ma_resource_manager_post_job(ma_resource_manager* pResourceManager, const ma_job* pJob);
-MA_API ma_result ma_resource_manager_post_job_quit(ma_resource_manager* pResourceManager);  /* Helper for posting a quit job. */
-MA_API ma_result ma_resource_manager_next_job(ma_resource_manager* pResourceManager, ma_job* pJob);
-MA_API ma_result ma_resource_manager_process_job(ma_resource_manager* pResourceManager, ma_job* pJob);  /* DEPRECATED. Use ma_job_process(). Will be removed in version 0.12. */
-MA_API ma_result ma_resource_manager_process_next_job(ma_resource_manager* pResourceManager);   /* Returns MA_CANCELLED if a MA_JOB_TYPE_QUIT job is found. In non-blocking mode, returns MA_NO_DATA_AVAILABLE if no jobs are available. */
-#endif  /* MA_NO_RESOURCE_MANAGER */
-
-
-
-/************************************************************************************************************************************************************
-
-Node Graph
-
-************************************************************************************************************************************************************/
-#ifndef MA_NO_NODE_GRAPH
-/* Must never exceed 254. */
-#ifndef MA_MAX_NODE_BUS_COUNT
-#define MA_MAX_NODE_BUS_COUNT       254
-#endif
-
-/* Used internally by miniaudio for memory management. Must never exceed MA_MAX_NODE_BUS_COUNT. */
-#ifndef MA_MAX_NODE_LOCAL_BUS_COUNT
-#define MA_MAX_NODE_LOCAL_BUS_COUNT 2
-#endif
-
-/* Use this when the bus count is determined by the node instance rather than the vtable. */
-#define MA_NODE_BUS_COUNT_UNKNOWN   255
-
-typedef struct ma_node_graph ma_node_graph;
-typedef void ma_node;
-
-
-/* Node flags. */
-typedef enum
-{
-    MA_NODE_FLAG_PASSTHROUGH                = 0x00000001,
-    MA_NODE_FLAG_CONTINUOUS_PROCESSING      = 0x00000002,
-    MA_NODE_FLAG_ALLOW_NULL_INPUT           = 0x00000004,
-    MA_NODE_FLAG_DIFFERENT_PROCESSING_RATES = 0x00000008,
-    MA_NODE_FLAG_SILENT_OUTPUT              = 0x00000010
-} ma_node_flags;
-
-
-/* The playback state of a node. Either started or stopped. */
-typedef enum
-{
-    ma_node_state_started = 0,
-    ma_node_state_stopped = 1
-} ma_node_state;
-
-
-typedef struct
-{
-    /*
-    Extended processing callback. This callback is used for effects that process input and output
-    at different rates (i.e. they perform resampling). This is similar to the simple version, only
-    they take two separate frame counts: one for input, and one for output.
-
-    On input, `pFrameCountOut` is equal to the capacity of the output buffer for each bus, whereas
-    `pFrameCountIn` will be equal to the number of PCM frames in each of the buffers in `ppFramesIn`.
-
-    On output, set `pFrameCountOut` to the number of PCM frames that were actually output and set
-    `pFrameCountIn` to the number of input frames that were consumed.
-    */
-    void (* onProcess)(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut);
-
-    /*
-    A callback for retrieving the number of a input frames that are required to output the
-    specified number of output frames. You would only want to implement this when the node performs
-    resampling. This is optional, even for nodes that perform resampling, but it does offer a
-    small reduction in latency as it allows miniaudio to calculate the exact number of input frames
-    to read at a time instead of having to estimate.
-    */
-    ma_result (* onGetRequiredInputFrameCount)(ma_node* pNode, ma_uint32 outputFrameCount, ma_uint32* pInputFrameCount);
-
-    /*
-    The number of input buses. This is how many sub-buffers will be contained in the `ppFramesIn`
-    parameters of the callbacks above.
-    */
-    ma_uint8 inputBusCount;
-
-    /*
-    The number of output buses. This is how many sub-buffers will be contained in the `ppFramesOut`
-    parameters of the callbacks above.
-    */
-    ma_uint8 outputBusCount;
-
-    /*
-    Flags describing characteristics of the node. This is currently just a placeholder for some
-    ideas for later on.
-    */
-    ma_uint32 flags;
-} ma_node_vtable;
-
-typedef struct
-{
-    const ma_node_vtable* vtable;       /* Should never be null. Initialization of the node will fail if so. */
-    ma_node_state initialState;         /* Defaults to ma_node_state_started. */
-    ma_uint32 inputBusCount;            /* Only used if the vtable specifies an input bus count of `MA_NODE_BUS_COUNT_UNKNOWN`, otherwise must be set to `MA_NODE_BUS_COUNT_UNKNOWN` (default). */
-    ma_uint32 outputBusCount;           /* Only used if the vtable specifies an output bus count of `MA_NODE_BUS_COUNT_UNKNOWN`, otherwise  be set to `MA_NODE_BUS_COUNT_UNKNOWN` (default). */
-    const ma_uint32* pInputChannels;    /* The number of elements are determined by the input bus count as determined by the vtable, or `inputBusCount` if the vtable specifies `MA_NODE_BUS_COUNT_UNKNOWN`. */
-    const ma_uint32* pOutputChannels;   /* The number of elements are determined by the output bus count as determined by the vtable, or `outputBusCount` if the vtable specifies `MA_NODE_BUS_COUNT_UNKNOWN`. */
-} ma_node_config;
-
-MA_API ma_node_config ma_node_config_init(void);
-
-
-/*
-A node has multiple output buses. An output bus is attached to an input bus as an item in a linked
-list. Think of the input bus as a linked list, with the output bus being an item in that list.
-*/
-typedef struct ma_node_output_bus ma_node_output_bus;
-struct ma_node_output_bus
-{
-    /* Immutable. */
-    ma_node* pNode;                                         /* The node that owns this output bus. The input node. Will be null for dummy head and tail nodes. */
-    ma_uint8 outputBusIndex;                                /* The index of the output bus on pNode that this output bus represents. */
-    ma_uint8 channels;                                      /* The number of channels in the audio stream for this bus. */
-
-    /* Mutable via multiple threads. Must be used atomically. The weird ordering here is for packing reasons. */
-    ma_uint8 inputNodeInputBusIndex;                        /* The index of the input bus on the input. Required for detaching. Will only be used within the spinlock so does not need to be atomic. */
-    MA_ATOMIC(4, ma_uint32) flags;                          /* Some state flags for tracking the read state of the output buffer. A combination of MA_NODE_OUTPUT_BUS_FLAG_*. */
-    MA_ATOMIC(4, ma_uint32) refCount;                       /* Reference count for some thread-safety when detaching. */
-    MA_ATOMIC(4, ma_bool32) isAttached;                     /* This is used to prevent iteration of nodes that are in the middle of being detached. Used for thread safety. */
-    MA_ATOMIC(4, ma_spinlock) lock;                         /* Unfortunate lock, but significantly simplifies the implementation. Required for thread-safe attaching and detaching. */
-    MA_ATOMIC(4, float) volume;                             /* Linear. */
-    MA_ATOMIC(MA_SIZEOF_PTR, ma_node_output_bus*) pNext;    /* If null, it's the tail node or detached. */
-    MA_ATOMIC(MA_SIZEOF_PTR, ma_node_output_bus*) pPrev;    /* If null, it's the head node or detached. */
-    MA_ATOMIC(MA_SIZEOF_PTR, ma_node*) pInputNode;          /* The node that this output bus is attached to. Required for detaching. */
-};
-
-/*
-A node has multiple input buses. The output buses of a node are connecting to the input busses of
-another. An input bus is essentially just a linked list of output buses.
-*/
-typedef struct ma_node_input_bus ma_node_input_bus;
-struct ma_node_input_bus
-{
-    /* Mutable via multiple threads. */
-    ma_node_output_bus head;                /* Dummy head node for simplifying some lock-free thread-safety stuff. */
-    MA_ATOMIC(4, ma_uint32) nextCounter;    /* This is used to determine whether or not the input bus is finding the next node in the list. Used for thread safety when detaching output buses. */
-    MA_ATOMIC(4, ma_spinlock) lock;         /* Unfortunate lock, but significantly simplifies the implementation. Required for thread-safe attaching and detaching. */
-
-    /* Set once at startup. */
-    ma_uint8 channels;                      /* The number of channels in the audio stream for this bus. */
-};
-
-
-typedef struct ma_node_base ma_node_base;
-struct ma_node_base
-{
-    /* These variables are set once at startup. */
-    ma_node_graph* pNodeGraph;              /* The graph this node belongs to. */
-    const ma_node_vtable* vtable;
-    float* pCachedData;                     /* Allocated on the heap. Fixed size. Needs to be stored on the heap because reading from output buses is done in separate function calls. */
-    ma_uint16 cachedDataCapInFramesPerBus;  /* The capacity of the input data cache in frames, per bus. */
-
-    /* These variables are read and written only from the audio thread. */
-    ma_uint16 cachedFrameCountOut;
-    ma_uint16 cachedFrameCountIn;
-    ma_uint16 consumedFrameCountIn;
-
-    /* These variables are read and written between different threads. */
-    MA_ATOMIC(4, ma_node_state) state;      /* When set to stopped, nothing will be read, regardless of the times in stateTimes. */
-    MA_ATOMIC(8, ma_uint64) stateTimes[2];  /* Indexed by ma_node_state. Specifies the time based on the global clock that a node should be considered to be in the relevant state. */
-    MA_ATOMIC(8, ma_uint64) localTime;      /* The node's local clock. This is just a running sum of the number of output frames that have been processed. Can be modified by any thread with `ma_node_set_time()`. */
-    ma_uint32 inputBusCount;
-    ma_uint32 outputBusCount;
-    ma_node_input_bus* pInputBuses;
-    ma_node_output_bus* pOutputBuses;
-
-    /* Memory management. */
-    ma_node_input_bus _inputBuses[MA_MAX_NODE_LOCAL_BUS_COUNT];
-    ma_node_output_bus _outputBuses[MA_MAX_NODE_LOCAL_BUS_COUNT];
-    void* _pHeap;   /* A heap allocation for internal use only. pInputBuses and/or pOutputBuses will point to this if the bus count exceeds MA_MAX_NODE_LOCAL_BUS_COUNT. */
-    ma_bool32 _ownsHeap;    /* If set to true, the node owns the heap allocation and _pHeap will be freed in ma_node_uninit(). */
-};
-
-MA_API ma_result ma_node_get_heap_size(ma_node_graph* pNodeGraph, const ma_node_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_node_init_preallocated(ma_node_graph* pNodeGraph, const ma_node_config* pConfig, void* pHeap, ma_node* pNode);
-MA_API ma_result ma_node_init(ma_node_graph* pNodeGraph, const ma_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_node* pNode);
-MA_API void ma_node_uninit(ma_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_node_graph* ma_node_get_node_graph(const ma_node* pNode);
-MA_API ma_uint32 ma_node_get_input_bus_count(const ma_node* pNode);
-MA_API ma_uint32 ma_node_get_output_bus_count(const ma_node* pNode);
-MA_API ma_uint32 ma_node_get_input_channels(const ma_node* pNode, ma_uint32 inputBusIndex);
-MA_API ma_uint32 ma_node_get_output_channels(const ma_node* pNode, ma_uint32 outputBusIndex);
-MA_API ma_result ma_node_attach_output_bus(ma_node* pNode, ma_uint32 outputBusIndex, ma_node* pOtherNode, ma_uint32 otherNodeInputBusIndex);
-MA_API ma_result ma_node_detach_output_bus(ma_node* pNode, ma_uint32 outputBusIndex);
-MA_API ma_result ma_node_detach_all_output_buses(ma_node* pNode);
-MA_API ma_result ma_node_set_output_bus_volume(ma_node* pNode, ma_uint32 outputBusIndex, float volume);
-MA_API float ma_node_get_output_bus_volume(const ma_node* pNode, ma_uint32 outputBusIndex);
-MA_API ma_result ma_node_set_state(ma_node* pNode, ma_node_state state);
-MA_API ma_node_state ma_node_get_state(const ma_node* pNode);
-MA_API ma_result ma_node_set_state_time(ma_node* pNode, ma_node_state state, ma_uint64 globalTime);
-MA_API ma_uint64 ma_node_get_state_time(const ma_node* pNode, ma_node_state state);
-MA_API ma_node_state ma_node_get_state_by_time(const ma_node* pNode, ma_uint64 globalTime);
-MA_API ma_node_state ma_node_get_state_by_time_range(const ma_node* pNode, ma_uint64 globalTimeBeg, ma_uint64 globalTimeEnd);
-MA_API ma_uint64 ma_node_get_time(const ma_node* pNode);
-MA_API ma_result ma_node_set_time(ma_node* pNode, ma_uint64 localTime);
-
-
-typedef struct
-{
-    ma_uint32 channels;
-    ma_uint16 nodeCacheCapInFrames;
-} ma_node_graph_config;
-
-MA_API ma_node_graph_config ma_node_graph_config_init(ma_uint32 channels);
-
-
-struct ma_node_graph
-{
-    /* Immutable. */
-    ma_node_base base;                  /* The node graph itself is a node so it can be connected as an input to different node graph. This has zero inputs and calls ma_node_graph_read_pcm_frames() to generate it's output. */
-    ma_node_base endpoint;              /* Special node that all nodes eventually connect to. Data is read from this node in ma_node_graph_read_pcm_frames(). */
-    ma_uint16 nodeCacheCapInFrames;
-
-    /* Read and written by multiple threads. */
-    MA_ATOMIC(4, ma_bool32) isReading;
-};
-
-MA_API ma_result ma_node_graph_init(const ma_node_graph_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_node_graph* pNodeGraph);
-MA_API void ma_node_graph_uninit(ma_node_graph* pNodeGraph, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_node* ma_node_graph_get_endpoint(ma_node_graph* pNodeGraph);
-MA_API ma_result ma_node_graph_read_pcm_frames(ma_node_graph* pNodeGraph, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_uint32 ma_node_graph_get_channels(const ma_node_graph* pNodeGraph);
-MA_API ma_uint64 ma_node_graph_get_time(const ma_node_graph* pNodeGraph);
-MA_API ma_result ma_node_graph_set_time(ma_node_graph* pNodeGraph, ma_uint64 globalTime);
-
-
-
-/* Data source node. 0 input buses, 1 output bus. Used for reading from a data source. */
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_data_source* pDataSource;
-} ma_data_source_node_config;
-
-MA_API ma_data_source_node_config ma_data_source_node_config_init(ma_data_source* pDataSource);
-
-
-typedef struct
-{
-    ma_node_base base;
-    ma_data_source* pDataSource;
-} ma_data_source_node;
-
-MA_API ma_result ma_data_source_node_init(ma_node_graph* pNodeGraph, const ma_data_source_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source_node* pDataSourceNode);
-MA_API void ma_data_source_node_uninit(ma_data_source_node* pDataSourceNode, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_data_source_node_set_looping(ma_data_source_node* pDataSourceNode, ma_bool32 isLooping);
-MA_API ma_bool32 ma_data_source_node_is_looping(ma_data_source_node* pDataSourceNode);
-
-
-/* Splitter Node. 1 input, many outputs. Used for splitting/copying a stream so it can be as input into two separate output nodes. */
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_uint32 channels;
-    ma_uint32 outputBusCount;
-} ma_splitter_node_config;
-
-MA_API ma_splitter_node_config ma_splitter_node_config_init(ma_uint32 channels);
-
-
-typedef struct
-{
-    ma_node_base base;
-} ma_splitter_node;
-
-MA_API ma_result ma_splitter_node_init(ma_node_graph* pNodeGraph, const ma_splitter_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_splitter_node* pSplitterNode);
-MA_API void ma_splitter_node_uninit(ma_splitter_node* pSplitterNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-Biquad Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_biquad_config biquad;
-} ma_biquad_node_config;
-
-MA_API ma_biquad_node_config ma_biquad_node_config_init(ma_uint32 channels, float b0, float b1, float b2, float a0, float a1, float a2);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_biquad biquad;
-} ma_biquad_node;
-
-MA_API ma_result ma_biquad_node_init(ma_node_graph* pNodeGraph, const ma_biquad_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_biquad_node* pNode);
-MA_API ma_result ma_biquad_node_reinit(const ma_biquad_config* pConfig, ma_biquad_node* pNode);
-MA_API void ma_biquad_node_uninit(ma_biquad_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-Low Pass Filter Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_lpf_config lpf;
-} ma_lpf_node_config;
-
-MA_API ma_lpf_node_config ma_lpf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_lpf lpf;
-} ma_lpf_node;
-
-MA_API ma_result ma_lpf_node_init(ma_node_graph* pNodeGraph, const ma_lpf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf_node* pNode);
-MA_API ma_result ma_lpf_node_reinit(const ma_lpf_config* pConfig, ma_lpf_node* pNode);
-MA_API void ma_lpf_node_uninit(ma_lpf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-High Pass Filter Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_hpf_config hpf;
-} ma_hpf_node_config;
-
-MA_API ma_hpf_node_config ma_hpf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_hpf hpf;
-} ma_hpf_node;
-
-MA_API ma_result ma_hpf_node_init(ma_node_graph* pNodeGraph, const ma_hpf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf_node* pNode);
-MA_API ma_result ma_hpf_node_reinit(const ma_hpf_config* pConfig, ma_hpf_node* pNode);
-MA_API void ma_hpf_node_uninit(ma_hpf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-Band Pass Filter Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_bpf_config bpf;
-} ma_bpf_node_config;
-
-MA_API ma_bpf_node_config ma_bpf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_bpf bpf;
-} ma_bpf_node;
-
-MA_API ma_result ma_bpf_node_init(ma_node_graph* pNodeGraph, const ma_bpf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_bpf_node* pNode);
-MA_API ma_result ma_bpf_node_reinit(const ma_bpf_config* pConfig, ma_bpf_node* pNode);
-MA_API void ma_bpf_node_uninit(ma_bpf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-Notching Filter Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_notch_config notch;
-} ma_notch_node_config;
-
-MA_API ma_notch_node_config ma_notch_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double q, double frequency);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_notch2 notch;
-} ma_notch_node;
-
-MA_API ma_result ma_notch_node_init(ma_node_graph* pNodeGraph, const ma_notch_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_notch_node* pNode);
-MA_API ma_result ma_notch_node_reinit(const ma_notch_config* pConfig, ma_notch_node* pNode);
-MA_API void ma_notch_node_uninit(ma_notch_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-Peaking Filter Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_peak_config peak;
-} ma_peak_node_config;
-
-MA_API ma_peak_node_config ma_peak_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_peak2 peak;
-} ma_peak_node;
-
-MA_API ma_result ma_peak_node_init(ma_node_graph* pNodeGraph, const ma_peak_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_peak_node* pNode);
-MA_API ma_result ma_peak_node_reinit(const ma_peak_config* pConfig, ma_peak_node* pNode);
-MA_API void ma_peak_node_uninit(ma_peak_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-Low Shelf Filter Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_loshelf_config loshelf;
-} ma_loshelf_node_config;
-
-MA_API ma_loshelf_node_config ma_loshelf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_loshelf2 loshelf;
-} ma_loshelf_node;
-
-MA_API ma_result ma_loshelf_node_init(ma_node_graph* pNodeGraph, const ma_loshelf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_loshelf_node* pNode);
-MA_API ma_result ma_loshelf_node_reinit(const ma_loshelf_config* pConfig, ma_loshelf_node* pNode);
-MA_API void ma_loshelf_node_uninit(ma_loshelf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-/*
-High Shelf Filter Node
-*/
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_hishelf_config hishelf;
-} ma_hishelf_node_config;
-
-MA_API ma_hishelf_node_config ma_hishelf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_hishelf2 hishelf;
-} ma_hishelf_node;
-
-MA_API ma_result ma_hishelf_node_init(ma_node_graph* pNodeGraph, const ma_hishelf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hishelf_node* pNode);
-MA_API ma_result ma_hishelf_node_reinit(const ma_hishelf_config* pConfig, ma_hishelf_node* pNode);
-MA_API void ma_hishelf_node_uninit(ma_hishelf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-typedef struct
-{
-    ma_node_config nodeConfig;
-    ma_delay_config delay;
-} ma_delay_node_config;
-
-MA_API ma_delay_node_config ma_delay_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, ma_uint32 delayInFrames, float decay);
-
-
-typedef struct
-{
-    ma_node_base baseNode;
-    ma_delay delay;
-} ma_delay_node;
-
-MA_API ma_result ma_delay_node_init(ma_node_graph* pNodeGraph, const ma_delay_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_delay_node* pDelayNode);
-MA_API void ma_delay_node_uninit(ma_delay_node* pDelayNode, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API void ma_delay_node_set_wet(ma_delay_node* pDelayNode, float value);
-MA_API float ma_delay_node_get_wet(const ma_delay_node* pDelayNode);
-MA_API void ma_delay_node_set_dry(ma_delay_node* pDelayNode, float value);
-MA_API float ma_delay_node_get_dry(const ma_delay_node* pDelayNode);
-MA_API void ma_delay_node_set_decay(ma_delay_node* pDelayNode, float value);
-MA_API float ma_delay_node_get_decay(const ma_delay_node* pDelayNode);
-#endif  /* MA_NO_NODE_GRAPH */
-
-
-/* SECTION: miniaudio_engine.h */
-/************************************************************************************************************************************************************
-
-Engine
-
-************************************************************************************************************************************************************/
-#if !defined(MA_NO_ENGINE) && !defined(MA_NO_NODE_GRAPH)
-typedef struct ma_engine ma_engine;
-typedef struct ma_sound  ma_sound;
-
-
-/* Sound flags. */
-typedef enum
-{
-    /* Resource manager flags. */
-    MA_SOUND_FLAG_STREAM                = 0x00000001,   /* MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM */
-    MA_SOUND_FLAG_DECODE                = 0x00000002,   /* MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE */
-    MA_SOUND_FLAG_ASYNC                 = 0x00000004,   /* MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC */
-    MA_SOUND_FLAG_WAIT_INIT             = 0x00000008,   /* MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT */
-    MA_SOUND_FLAG_UNKNOWN_LENGTH        = 0x00000010,   /* MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_UNKNOWN_LENGTH */
-
-    /* ma_sound specific flags. */
-    MA_SOUND_FLAG_NO_DEFAULT_ATTACHMENT = 0x00001000,   /* Do not attach to the endpoint by default. Useful for when setting up nodes in a complex graph system. */
-    MA_SOUND_FLAG_NO_PITCH              = 0x00002000,   /* Disable pitch shifting with ma_sound_set_pitch() and ma_sound_group_set_pitch(). This is an optimization. */
-    MA_SOUND_FLAG_NO_SPATIALIZATION     = 0x00004000    /* Disable spatialization. */
-} ma_sound_flags;
-
-#ifndef MA_ENGINE_MAX_LISTENERS
-#define MA_ENGINE_MAX_LISTENERS             4
-#endif
-
-#define MA_LISTENER_INDEX_CLOSEST           ((ma_uint8)-1)
-
-typedef enum
-{
-    ma_engine_node_type_sound,
-    ma_engine_node_type_group
-} ma_engine_node_type;
-
-typedef struct
-{
-    ma_engine* pEngine;
-    ma_engine_node_type type;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_uint32 sampleRate;               /* Only used when the type is set to ma_engine_node_type_sound. */
-    ma_uint32 volumeSmoothTimeInPCMFrames;  /* The number of frames to smooth over volume changes. Defaults to 0 in which case no smoothing is used. */
-    ma_mono_expansion_mode monoExpansionMode;
-    ma_bool8 isPitchDisabled;           /* Pitching can be explicitly disabled with MA_SOUND_FLAG_NO_PITCH to optimize processing. */
-    ma_bool8 isSpatializationDisabled;  /* Spatialization can be explicitly disabled with MA_SOUND_FLAG_NO_SPATIALIZATION. */
-    ma_uint8 pinnedListenerIndex;       /* The index of the listener this node should always use for spatialization. If set to MA_LISTENER_INDEX_CLOSEST the engine will use the closest listener. */
-} ma_engine_node_config;
-
-MA_API ma_engine_node_config ma_engine_node_config_init(ma_engine* pEngine, ma_engine_node_type type, ma_uint32 flags);
-
-
-/* Base node object for both ma_sound and ma_sound_group. */
-typedef struct
-{
-    ma_node_base baseNode;                              /* Must be the first member for compatiblity with the ma_node API. */
-    ma_engine* pEngine;                                 /* A pointer to the engine. Set based on the value from the config. */
-    ma_uint32 sampleRate;                               /* The sample rate of the input data. For sounds backed by a data source, this will be the data source's sample rate. Otherwise it'll be the engine's sample rate. */
-    ma_uint32 volumeSmoothTimeInPCMFrames;
-    ma_mono_expansion_mode monoExpansionMode;
-    ma_fader fader;
-    ma_linear_resampler resampler;                      /* For pitch shift. */
-    ma_spatializer spatializer;
-    ma_panner panner;
-    ma_gainer volumeGainer;                             /* This will only be used if volumeSmoothTimeInPCMFrames is > 0. */
-    ma_atomic_float volume;                             /* Defaults to 1. */
-    MA_ATOMIC(4, float) pitch;
-    float oldPitch;                                     /* For determining whether or not the resampler needs to be updated to reflect the new pitch. The resampler will be updated on the mixing thread. */
-    float oldDopplerPitch;                              /* For determining whether or not the resampler needs to be updated to take a new doppler pitch into account. */
-    MA_ATOMIC(4, ma_bool32) isPitchDisabled;            /* When set to true, pitching will be disabled which will allow the resampler to be bypassed to save some computation. */
-    MA_ATOMIC(4, ma_bool32) isSpatializationDisabled;   /* Set to false by default. When set to false, will not have spatialisation applied. */
-    MA_ATOMIC(4, ma_uint32) pinnedListenerIndex;        /* The index of the listener this node should always use for spatialization. If set to MA_LISTENER_INDEX_CLOSEST the engine will use the closest listener. */
-
-    /* When setting a fade, it's not done immediately in ma_sound_set_fade(). It's deferred to the audio thread which means we need to store the settings here. */
-    struct
-    {
-        ma_atomic_float volumeBeg;
-        ma_atomic_float volumeEnd;
-        ma_atomic_uint64 fadeLengthInFrames;            /* <-- Defaults to (~(ma_uint64)0) which is used to indicate that no fade should be applied. */
-        ma_atomic_uint64 absoluteGlobalTimeInFrames;    /* <-- The time to start the fade. */
-    } fadeSettings;
-
-    /* Memory management. */
-    ma_bool8 _ownsHeap;
-    void* _pHeap;
-} ma_engine_node;
-
-MA_API ma_result ma_engine_node_get_heap_size(const ma_engine_node_config* pConfig, size_t* pHeapSizeInBytes);
-MA_API ma_result ma_engine_node_init_preallocated(const ma_engine_node_config* pConfig, void* pHeap, ma_engine_node* pEngineNode);
-MA_API ma_result ma_engine_node_init(const ma_engine_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_engine_node* pEngineNode);
-MA_API void ma_engine_node_uninit(ma_engine_node* pEngineNode, const ma_allocation_callbacks* pAllocationCallbacks);
-
-
-#define MA_SOUND_SOURCE_CHANNEL_COUNT   0xFFFFFFFF
-
-/* Callback for when a sound reaches the end. */
-typedef void (* ma_sound_end_proc)(void* pUserData, ma_sound* pSound);
-
-typedef struct
-{
-    const char* pFilePath;                      /* Set this to load from the resource manager. */
-    const wchar_t* pFilePathW;                  /* Set this to load from the resource manager. */
-    ma_data_source* pDataSource;                /* Set this to load from an existing data source. */
-    ma_node* pInitialAttachment;                /* If set, the sound will be attached to an input of this node. This can be set to a ma_sound. If set to NULL, the sound will be attached directly to the endpoint unless MA_SOUND_FLAG_NO_DEFAULT_ATTACHMENT is set in `flags`. */
-    ma_uint32 initialAttachmentInputBusIndex;   /* The index of the input bus of pInitialAttachment to attach the sound to. */
-    ma_uint32 channelsIn;                       /* Ignored if using a data source as input (the data source's channel count will be used always). Otherwise, setting to 0 will cause the engine's channel count to be used. */
-    ma_uint32 channelsOut;                      /* Set this to 0 (default) to use the engine's channel count. Set to MA_SOUND_SOURCE_CHANNEL_COUNT to use the data source's channel count (only used if using a data source as input). */
-    ma_mono_expansion_mode monoExpansionMode;   /* Controls how the mono channel should be expanded to other channels when spatialization is disabled on a sound. */
-    ma_uint32 flags;                            /* A combination of MA_SOUND_FLAG_* flags. */
-    ma_uint32 volumeSmoothTimeInPCMFrames;      /* The number of frames to smooth over volume changes. Defaults to 0 in which case no smoothing is used. */
-    ma_uint64 initialSeekPointInPCMFrames;      /* Initializes the sound such that it's seeked to this location by default. */
-    ma_uint64 rangeBegInPCMFrames;
-    ma_uint64 rangeEndInPCMFrames;
-    ma_uint64 loopPointBegInPCMFrames;
-    ma_uint64 loopPointEndInPCMFrames;
-    ma_bool32 isLooping;
-    ma_sound_end_proc endCallback;              /* Fired when the sound reaches the end. Will be fired from the audio thread. Do not restart, uninitialize or otherwise change the state of the sound from here. Instead fire an event or set a variable to indicate to a different thread to change the start of the sound. Will not be fired in response to a scheduled stop with ma_sound_set_stop_time_*(). */
-    void* pEndCallbackUserData;
-#ifndef MA_NO_RESOURCE_MANAGER
-    ma_resource_manager_pipeline_notifications initNotifications;
-#endif
-    ma_fence* pDoneFence;                       /* Deprecated. Use initNotifications instead. Released when the resource manager has finished decoding the entire sound. Not used with streams. */
-} ma_sound_config;
-
-MA_API ma_sound_config ma_sound_config_init(void);                  /* Deprecated. Will be removed in version 0.12. Use ma_sound_config_2() instead. */
-MA_API ma_sound_config ma_sound_config_init_2(ma_engine* pEngine);  /* Will be renamed to ma_sound_config_init() in version 0.12. */
-
-struct ma_sound
-{
-    ma_engine_node engineNode;          /* Must be the first member for compatibility with the ma_node API. */
-    ma_data_source* pDataSource;
-    MA_ATOMIC(8, ma_uint64) seekTarget; /* The PCM frame index to seek to in the mixing thread. Set to (~(ma_uint64)0) to not perform any seeking. */
-    MA_ATOMIC(4, ma_bool32) atEnd;
-    ma_sound_end_proc endCallback;
-    void* pEndCallbackUserData;
-    ma_bool8 ownsDataSource;
-
-    /*
-    We're declaring a resource manager data source object here to save us a malloc when loading a
-    sound via the resource manager, which I *think* will be the most common scenario.
-    */
-#ifndef MA_NO_RESOURCE_MANAGER
-    ma_resource_manager_data_source* pResourceManagerDataSource;
-#endif
-};
-
-/* Structure specifically for sounds played with ma_engine_play_sound(). Making this a separate structure to reduce overhead. */
-typedef struct ma_sound_inlined ma_sound_inlined;
-struct ma_sound_inlined
-{
-    ma_sound sound;
-    ma_sound_inlined* pNext;
-    ma_sound_inlined* pPrev;
-};
-
-/* A sound group is just a sound. */
-typedef ma_sound_config ma_sound_group_config;
-typedef ma_sound        ma_sound_group;
-
-MA_API ma_sound_group_config ma_sound_group_config_init(void);                  /* Deprecated. Will be removed in version 0.12. Use ma_sound_config_2() instead. */
-MA_API ma_sound_group_config ma_sound_group_config_init_2(ma_engine* pEngine);  /* Will be renamed to ma_sound_config_init() in version 0.12. */
-
-typedef void (* ma_engine_process_proc)(void* pUserData, float* pFramesOut, ma_uint64 frameCount);
-
-typedef struct
-{
-#if !defined(MA_NO_RESOURCE_MANAGER)
-    ma_resource_manager* pResourceManager;          /* Can be null in which case a resource manager will be created for you. */
-#endif
-#if !defined(MA_NO_DEVICE_IO)
-    ma_context* pContext;
-    ma_device* pDevice;                             /* If set, the caller is responsible for calling ma_engine_data_callback() in the device's data callback. */
-    ma_device_id* pPlaybackDeviceID;                /* The ID of the playback device to use with the default listener. */
-    ma_device_data_proc dataCallback;               /* Can be null. Can be used to provide a custom device data callback. */
-    ma_device_notification_proc notificationCallback;
-#endif
-    ma_log* pLog;                                   /* When set to NULL, will use the context's log. */
-    ma_uint32 listenerCount;                        /* Must be between 1 and MA_ENGINE_MAX_LISTENERS. */
-    ma_uint32 channels;                             /* The number of channels to use when mixing and spatializing. When set to 0, will use the native channel count of the device. */
-    ma_uint32 sampleRate;                           /* The sample rate. When set to 0 will use the native channel count of the device. */
-    ma_uint32 periodSizeInFrames;                   /* If set to something other than 0, updates will always be exactly this size. The underlying device may be a different size, but from the perspective of the mixer that won't matter.*/
-    ma_uint32 periodSizeInMilliseconds;             /* Used if periodSizeInFrames is unset. */
-    ma_uint32 gainSmoothTimeInFrames;               /* The number of frames to interpolate the gain of spatialized sounds across. If set to 0, will use gainSmoothTimeInMilliseconds. */
-    ma_uint32 gainSmoothTimeInMilliseconds;         /* When set to 0, gainSmoothTimeInFrames will be used. If both are set to 0, a default value will be used. */
-    ma_uint32 defaultVolumeSmoothTimeInPCMFrames;   /* Defaults to 0. Controls the default amount of smoothing to apply to volume changes to sounds. High values means more smoothing at the expense of high latency (will take longer to reach the new volume). */
-    ma_allocation_callbacks allocationCallbacks;
-    ma_bool32 noAutoStart;                          /* When set to true, requires an explicit call to ma_engine_start(). This is false by default, meaning the engine will be started automatically in ma_engine_init(). */
-    ma_bool32 noDevice;                             /* When set to true, don't create a default device. ma_engine_read_pcm_frames() can be called manually to read data. */
-    ma_mono_expansion_mode monoExpansionMode;       /* Controls how the mono channel should be expanded to other channels when spatialization is disabled on a sound. */
-    ma_vfs* pResourceManagerVFS;                    /* A pointer to a pre-allocated VFS object to use with the resource manager. This is ignored if pResourceManager is not NULL. */
-    ma_engine_process_proc onProcess;               /* Fired at the end of each call to ma_engine_read_pcm_frames(). For engine's that manage their own internal device (the default configuration), this will be fired from the audio thread, and you do not need to call ma_engine_read_pcm_frames() manually in order to trigger this. */
-    void* pProcessUserData;                         /* User data that's passed into onProcess. */
-} ma_engine_config;
-
-MA_API ma_engine_config ma_engine_config_init(void);
-
-
-struct ma_engine
-{
-    ma_node_graph nodeGraph;                /* An engine is a node graph. It should be able to be plugged into any ma_node_graph API (with a cast) which means this must be the first member of this struct. */
-#if !defined(MA_NO_RESOURCE_MANAGER)
-    ma_resource_manager* pResourceManager;
-#endif
-#if !defined(MA_NO_DEVICE_IO)
-    ma_device* pDevice;                     /* Optionally set via the config, otherwise allocated by the engine in ma_engine_init(). */
-#endif
-    ma_log* pLog;
-    ma_uint32 sampleRate;
-    ma_uint32 listenerCount;
-    ma_spatializer_listener listeners[MA_ENGINE_MAX_LISTENERS];
-    ma_allocation_callbacks allocationCallbacks;
-    ma_bool8 ownsResourceManager;
-    ma_bool8 ownsDevice;
-    ma_spinlock inlinedSoundLock;               /* For synchronizing access so the inlined sound list. */
-    ma_sound_inlined* pInlinedSoundHead;        /* The first inlined sound. Inlined sounds are tracked in a linked list. */
-    MA_ATOMIC(4, ma_uint32) inlinedSoundCount;  /* The total number of allocated inlined sound objects. Used for debugging. */
-    ma_uint32 gainSmoothTimeInFrames;           /* The number of frames to interpolate the gain of spatialized sounds across. */
-    ma_uint32 defaultVolumeSmoothTimeInPCMFrames;
-    ma_mono_expansion_mode monoExpansionMode;
-    ma_engine_process_proc onProcess;
-    void* pProcessUserData;
-};
-
-MA_API ma_result ma_engine_init(const ma_engine_config* pConfig, ma_engine* pEngine);
-MA_API void ma_engine_uninit(ma_engine* pEngine);
-MA_API ma_result ma_engine_read_pcm_frames(ma_engine* pEngine, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_node_graph* ma_engine_get_node_graph(ma_engine* pEngine);
-#if !defined(MA_NO_RESOURCE_MANAGER)
-MA_API ma_resource_manager* ma_engine_get_resource_manager(ma_engine* pEngine);
-#endif
-MA_API ma_device* ma_engine_get_device(ma_engine* pEngine);
-MA_API ma_log* ma_engine_get_log(ma_engine* pEngine);
-MA_API ma_node* ma_engine_get_endpoint(ma_engine* pEngine);
-MA_API ma_uint64 ma_engine_get_time_in_pcm_frames(const ma_engine* pEngine);
-MA_API ma_uint64 ma_engine_get_time_in_milliseconds(const ma_engine* pEngine);
-MA_API ma_result ma_engine_set_time_in_pcm_frames(ma_engine* pEngine, ma_uint64 globalTime);
-MA_API ma_result ma_engine_set_time_in_milliseconds(ma_engine* pEngine, ma_uint64 globalTime);
-MA_API ma_uint64 ma_engine_get_time(const ma_engine* pEngine);                  /* Deprecated. Use ma_engine_get_time_in_pcm_frames(). Will be removed in version 0.12. */
-MA_API ma_result ma_engine_set_time(ma_engine* pEngine, ma_uint64 globalTime);  /* Deprecated. Use ma_engine_set_time_in_pcm_frames(). Will be removed in version 0.12. */
-MA_API ma_uint32 ma_engine_get_channels(const ma_engine* pEngine);
-MA_API ma_uint32 ma_engine_get_sample_rate(const ma_engine* pEngine);
-
-MA_API ma_result ma_engine_start(ma_engine* pEngine);
-MA_API ma_result ma_engine_stop(ma_engine* pEngine);
-MA_API ma_result ma_engine_set_volume(ma_engine* pEngine, float volume);
-MA_API float ma_engine_get_volume(ma_engine* pEngine);
-MA_API ma_result ma_engine_set_gain_db(ma_engine* pEngine, float gainDB);
-MA_API float ma_engine_get_gain_db(ma_engine* pEngine);
-
-MA_API ma_uint32 ma_engine_get_listener_count(const ma_engine* pEngine);
-MA_API ma_uint32 ma_engine_find_closest_listener(const ma_engine* pEngine, float absolutePosX, float absolutePosY, float absolutePosZ);
-MA_API void ma_engine_listener_set_position(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z);
-MA_API ma_vec3f ma_engine_listener_get_position(const ma_engine* pEngine, ma_uint32 listenerIndex);
-MA_API void ma_engine_listener_set_direction(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z);
-MA_API ma_vec3f ma_engine_listener_get_direction(const ma_engine* pEngine, ma_uint32 listenerIndex);
-MA_API void ma_engine_listener_set_velocity(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z);
-MA_API ma_vec3f ma_engine_listener_get_velocity(const ma_engine* pEngine, ma_uint32 listenerIndex);
-MA_API void ma_engine_listener_set_cone(ma_engine* pEngine, ma_uint32 listenerIndex, float innerAngleInRadians, float outerAngleInRadians, float outerGain);
-MA_API void ma_engine_listener_get_cone(const ma_engine* pEngine, ma_uint32 listenerIndex, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain);
-MA_API void ma_engine_listener_set_world_up(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z);
-MA_API ma_vec3f ma_engine_listener_get_world_up(const ma_engine* pEngine, ma_uint32 listenerIndex);
-MA_API void ma_engine_listener_set_enabled(ma_engine* pEngine, ma_uint32 listenerIndex, ma_bool32 isEnabled);
-MA_API ma_bool32 ma_engine_listener_is_enabled(const ma_engine* pEngine, ma_uint32 listenerIndex);
-
-#ifndef MA_NO_RESOURCE_MANAGER
-MA_API ma_result ma_engine_play_sound_ex(ma_engine* pEngine, const char* pFilePath, ma_node* pNode, ma_uint32 nodeInputBusIndex);
-MA_API ma_result ma_engine_play_sound(ma_engine* pEngine, const char* pFilePath, ma_sound_group* pGroup);   /* Fire and forget. */
-#endif
-
-#ifndef MA_NO_RESOURCE_MANAGER
-MA_API ma_result ma_sound_init_from_file(ma_engine* pEngine, const char* pFilePath, ma_uint32 flags, ma_sound_group* pGroup, ma_fence* pDoneFence, ma_sound* pSound);
-MA_API ma_result ma_sound_init_from_file_w(ma_engine* pEngine, const wchar_t* pFilePath, ma_uint32 flags, ma_sound_group* pGroup, ma_fence* pDoneFence, ma_sound* pSound);
-MA_API ma_result ma_sound_init_copy(ma_engine* pEngine, const ma_sound* pExistingSound, ma_uint32 flags, ma_sound_group* pGroup, ma_sound* pSound);
-#endif
-MA_API ma_result ma_sound_init_from_data_source(ma_engine* pEngine, ma_data_source* pDataSource, ma_uint32 flags, ma_sound_group* pGroup, ma_sound* pSound);
-MA_API ma_result ma_sound_init_ex(ma_engine* pEngine, const ma_sound_config* pConfig, ma_sound* pSound);
-MA_API void ma_sound_uninit(ma_sound* pSound);
-MA_API ma_engine* ma_sound_get_engine(const ma_sound* pSound);
-MA_API ma_data_source* ma_sound_get_data_source(const ma_sound* pSound);
-MA_API ma_result ma_sound_start(ma_sound* pSound);
-MA_API ma_result ma_sound_stop(ma_sound* pSound);
-MA_API ma_result ma_sound_stop_with_fade_in_pcm_frames(ma_sound* pSound, ma_uint64 fadeLengthInFrames);     /* Will overwrite any scheduled stop and fade. */
-MA_API ma_result ma_sound_stop_with_fade_in_milliseconds(ma_sound* pSound, ma_uint64 fadeLengthInFrames);   /* Will overwrite any scheduled stop and fade. */
-MA_API void ma_sound_set_volume(ma_sound* pSound, float volume);
-MA_API float ma_sound_get_volume(const ma_sound* pSound);
-MA_API void ma_sound_set_pan(ma_sound* pSound, float pan);
-MA_API float ma_sound_get_pan(const ma_sound* pSound);
-MA_API void ma_sound_set_pan_mode(ma_sound* pSound, ma_pan_mode panMode);
-MA_API ma_pan_mode ma_sound_get_pan_mode(const ma_sound* pSound);
-MA_API void ma_sound_set_pitch(ma_sound* pSound, float pitch);
-MA_API float ma_sound_get_pitch(const ma_sound* pSound);
-MA_API void ma_sound_set_spatialization_enabled(ma_sound* pSound, ma_bool32 enabled);
-MA_API ma_bool32 ma_sound_is_spatialization_enabled(const ma_sound* pSound);
-MA_API void ma_sound_set_pinned_listener_index(ma_sound* pSound, ma_uint32 listenerIndex);
-MA_API ma_uint32 ma_sound_get_pinned_listener_index(const ma_sound* pSound);
-MA_API ma_uint32 ma_sound_get_listener_index(const ma_sound* pSound);
-MA_API ma_vec3f ma_sound_get_direction_to_listener(const ma_sound* pSound);
-MA_API void ma_sound_set_position(ma_sound* pSound, float x, float y, float z);
-MA_API ma_vec3f ma_sound_get_position(const ma_sound* pSound);
-MA_API void ma_sound_set_direction(ma_sound* pSound, float x, float y, float z);
-MA_API ma_vec3f ma_sound_get_direction(const ma_sound* pSound);
-MA_API void ma_sound_set_velocity(ma_sound* pSound, float x, float y, float z);
-MA_API ma_vec3f ma_sound_get_velocity(const ma_sound* pSound);
-MA_API void ma_sound_set_attenuation_model(ma_sound* pSound, ma_attenuation_model attenuationModel);
-MA_API ma_attenuation_model ma_sound_get_attenuation_model(const ma_sound* pSound);
-MA_API void ma_sound_set_positioning(ma_sound* pSound, ma_positioning positioning);
-MA_API ma_positioning ma_sound_get_positioning(const ma_sound* pSound);
-MA_API void ma_sound_set_rolloff(ma_sound* pSound, float rolloff);
-MA_API float ma_sound_get_rolloff(const ma_sound* pSound);
-MA_API void ma_sound_set_min_gain(ma_sound* pSound, float minGain);
-MA_API float ma_sound_get_min_gain(const ma_sound* pSound);
-MA_API void ma_sound_set_max_gain(ma_sound* pSound, float maxGain);
-MA_API float ma_sound_get_max_gain(const ma_sound* pSound);
-MA_API void ma_sound_set_min_distance(ma_sound* pSound, float minDistance);
-MA_API float ma_sound_get_min_distance(const ma_sound* pSound);
-MA_API void ma_sound_set_max_distance(ma_sound* pSound, float maxDistance);
-MA_API float ma_sound_get_max_distance(const ma_sound* pSound);
-MA_API void ma_sound_set_cone(ma_sound* pSound, float innerAngleInRadians, float outerAngleInRadians, float outerGain);
-MA_API void ma_sound_get_cone(const ma_sound* pSound, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain);
-MA_API void ma_sound_set_doppler_factor(ma_sound* pSound, float dopplerFactor);
-MA_API float ma_sound_get_doppler_factor(const ma_sound* pSound);
-MA_API void ma_sound_set_directional_attenuation_factor(ma_sound* pSound, float directionalAttenuationFactor);
-MA_API float ma_sound_get_directional_attenuation_factor(const ma_sound* pSound);
-MA_API void ma_sound_set_fade_in_pcm_frames(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInFrames);
-MA_API void ma_sound_set_fade_in_milliseconds(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInMilliseconds);
-MA_API void ma_sound_set_fade_start_in_pcm_frames(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInFrames, ma_uint64 absoluteGlobalTimeInFrames);
-MA_API void ma_sound_set_fade_start_in_milliseconds(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInMilliseconds, ma_uint64 absoluteGlobalTimeInMilliseconds);
-MA_API float ma_sound_get_current_fade_volume(const ma_sound* pSound);
-MA_API void ma_sound_set_start_time_in_pcm_frames(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInFrames);
-MA_API void ma_sound_set_start_time_in_milliseconds(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInMilliseconds);
-MA_API void ma_sound_set_stop_time_in_pcm_frames(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInFrames);
-MA_API void ma_sound_set_stop_time_in_milliseconds(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInMilliseconds);
-MA_API void ma_sound_set_stop_time_with_fade_in_pcm_frames(ma_sound* pSound, ma_uint64 stopAbsoluteGlobalTimeInFrames, ma_uint64 fadeLengthInFrames);
-MA_API void ma_sound_set_stop_time_with_fade_in_milliseconds(ma_sound* pSound, ma_uint64 stopAbsoluteGlobalTimeInMilliseconds, ma_uint64 fadeLengthInMilliseconds);
-MA_API ma_bool32 ma_sound_is_playing(const ma_sound* pSound);
-MA_API ma_uint64 ma_sound_get_time_in_pcm_frames(const ma_sound* pSound);
-MA_API ma_uint64 ma_sound_get_time_in_milliseconds(const ma_sound* pSound);
-MA_API void ma_sound_set_looping(ma_sound* pSound, ma_bool32 isLooping);
-MA_API ma_bool32 ma_sound_is_looping(const ma_sound* pSound);
-MA_API ma_bool32 ma_sound_at_end(const ma_sound* pSound);
-MA_API ma_result ma_sound_seek_to_pcm_frame(ma_sound* pSound, ma_uint64 frameIndex); /* Just a wrapper around ma_data_source_seek_to_pcm_frame(). */
-MA_API ma_result ma_sound_get_data_format(ma_sound* pSound, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_sound_get_cursor_in_pcm_frames(ma_sound* pSound, ma_uint64* pCursor);
-MA_API ma_result ma_sound_get_length_in_pcm_frames(ma_sound* pSound, ma_uint64* pLength);
-MA_API ma_result ma_sound_get_cursor_in_seconds(ma_sound* pSound, float* pCursor);
-MA_API ma_result ma_sound_get_length_in_seconds(ma_sound* pSound, float* pLength);
-MA_API ma_result ma_sound_set_end_callback(ma_sound* pSound, ma_sound_end_proc callback, void* pUserData);
-
-MA_API ma_result ma_sound_group_init(ma_engine* pEngine, ma_uint32 flags, ma_sound_group* pParentGroup, ma_sound_group* pGroup);
-MA_API ma_result ma_sound_group_init_ex(ma_engine* pEngine, const ma_sound_group_config* pConfig, ma_sound_group* pGroup);
-MA_API void ma_sound_group_uninit(ma_sound_group* pGroup);
-MA_API ma_engine* ma_sound_group_get_engine(const ma_sound_group* pGroup);
-MA_API ma_result ma_sound_group_start(ma_sound_group* pGroup);
-MA_API ma_result ma_sound_group_stop(ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_volume(ma_sound_group* pGroup, float volume);
-MA_API float ma_sound_group_get_volume(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_pan(ma_sound_group* pGroup, float pan);
-MA_API float ma_sound_group_get_pan(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_pan_mode(ma_sound_group* pGroup, ma_pan_mode panMode);
-MA_API ma_pan_mode ma_sound_group_get_pan_mode(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_pitch(ma_sound_group* pGroup, float pitch);
-MA_API float ma_sound_group_get_pitch(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_spatialization_enabled(ma_sound_group* pGroup, ma_bool32 enabled);
-MA_API ma_bool32 ma_sound_group_is_spatialization_enabled(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_pinned_listener_index(ma_sound_group* pGroup, ma_uint32 listenerIndex);
-MA_API ma_uint32 ma_sound_group_get_pinned_listener_index(const ma_sound_group* pGroup);
-MA_API ma_uint32 ma_sound_group_get_listener_index(const ma_sound_group* pGroup);
-MA_API ma_vec3f ma_sound_group_get_direction_to_listener(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_position(ma_sound_group* pGroup, float x, float y, float z);
-MA_API ma_vec3f ma_sound_group_get_position(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_direction(ma_sound_group* pGroup, float x, float y, float z);
-MA_API ma_vec3f ma_sound_group_get_direction(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_velocity(ma_sound_group* pGroup, float x, float y, float z);
-MA_API ma_vec3f ma_sound_group_get_velocity(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_attenuation_model(ma_sound_group* pGroup, ma_attenuation_model attenuationModel);
-MA_API ma_attenuation_model ma_sound_group_get_attenuation_model(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_positioning(ma_sound_group* pGroup, ma_positioning positioning);
-MA_API ma_positioning ma_sound_group_get_positioning(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_rolloff(ma_sound_group* pGroup, float rolloff);
-MA_API float ma_sound_group_get_rolloff(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_min_gain(ma_sound_group* pGroup, float minGain);
-MA_API float ma_sound_group_get_min_gain(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_max_gain(ma_sound_group* pGroup, float maxGain);
-MA_API float ma_sound_group_get_max_gain(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_min_distance(ma_sound_group* pGroup, float minDistance);
-MA_API float ma_sound_group_get_min_distance(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_max_distance(ma_sound_group* pGroup, float maxDistance);
-MA_API float ma_sound_group_get_max_distance(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_cone(ma_sound_group* pGroup, float innerAngleInRadians, float outerAngleInRadians, float outerGain);
-MA_API void ma_sound_group_get_cone(const ma_sound_group* pGroup, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain);
-MA_API void ma_sound_group_set_doppler_factor(ma_sound_group* pGroup, float dopplerFactor);
-MA_API float ma_sound_group_get_doppler_factor(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_directional_attenuation_factor(ma_sound_group* pGroup, float directionalAttenuationFactor);
-MA_API float ma_sound_group_get_directional_attenuation_factor(const ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_fade_in_pcm_frames(ma_sound_group* pGroup, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInFrames);
-MA_API void ma_sound_group_set_fade_in_milliseconds(ma_sound_group* pGroup, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInMilliseconds);
-MA_API float ma_sound_group_get_current_fade_volume(ma_sound_group* pGroup);
-MA_API void ma_sound_group_set_start_time_in_pcm_frames(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInFrames);
-MA_API void ma_sound_group_set_start_time_in_milliseconds(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInMilliseconds);
-MA_API void ma_sound_group_set_stop_time_in_pcm_frames(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInFrames);
-MA_API void ma_sound_group_set_stop_time_in_milliseconds(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInMilliseconds);
-MA_API ma_bool32 ma_sound_group_is_playing(const ma_sound_group* pGroup);
-MA_API ma_uint64 ma_sound_group_get_time_in_pcm_frames(const ma_sound_group* pGroup);
-#endif  /* MA_NO_ENGINE */
-/* END SECTION: miniaudio_engine.h */
-
-#ifdef __cplusplus
-}
-#endif
-#endif  /* miniaudio_h */
-
-
-/*
-This is for preventing greying out of the implementation section.
-*/
-#if defined(Q_CREATOR_RUN) || defined(__INTELLISENSE__) || defined(__CDT_PARSER__)
-#define MINIAUDIO_IMPLEMENTATION
-#endif
-
-/************************************************************************************************************************************************************
-*************************************************************************************************************************************************************
-
-IMPLEMENTATION
-
-*************************************************************************************************************************************************************
-************************************************************************************************************************************************************/
-#if defined(MINIAUDIO_IMPLEMENTATION) || defined(MA_IMPLEMENTATION)
-#ifndef miniaudio_c
-#define miniaudio_c
-
-#include <assert.h>
-#include <limits.h>         /* For INT_MAX */
-#include <math.h>           /* sin(), etc. */
-#include <stdlib.h>         /* For malloc(), free(), wcstombs(). */
-#include <string.h>         /* For memset() */
-
-#include <stdarg.h>
-#include <stdio.h>
-#if !defined(_MSC_VER) && !defined(__DMC__)
-    #include <strings.h>    /* For strcasecmp(). */
-    #include <wchar.h>      /* For wcslen(), wcsrtombs() */
-#endif
-#ifdef _MSC_VER
-    #include <float.h>      /* For _controlfp_s constants */
-#endif
-
-#if defined(MA_WIN32)
-    #include <windows.h>
-
-    /*
-    There's a possibility that WIN32_LEAN_AND_MEAN has been defined which will exclude some symbols
-    such as STGM_READ and CLSCTL_ALL. We need to check these and define them ourselves if they're
-    unavailable.
-    */
-    #ifndef STGM_READ
-    #define STGM_READ   0x00000000L
-    #endif
-    #ifndef CLSCTX_ALL
-    #define CLSCTX_ALL  23
-    #endif
-
-    /* IUnknown is used by both the WASAPI and DirectSound backends. It easier to just declare our version here. */
-    typedef struct ma_IUnknown  ma_IUnknown;
-#endif
-
-#if !defined(MA_WIN32)
-#include <sched.h>
-#include <sys/time.h>   /* select() (used for ma_sleep()). */
-#include <pthread.h>
-#endif
-
-#ifdef MA_NX
-#include <time.h>       /* For nanosleep() */
-#endif
-
-#include <sys/stat.h>   /* For fstat(), etc. */
-
-#ifdef MA_EMSCRIPTEN
-#include <emscripten/emscripten.h>
-#endif
-
-
-/* Architecture Detection */
-#if !defined(MA_64BIT) && !defined(MA_32BIT)
-#ifdef _WIN32
-#ifdef _WIN64
-#define MA_64BIT
-#else
-#define MA_32BIT
-#endif
-#endif
-#endif
-
-#if !defined(MA_64BIT) && !defined(MA_32BIT)
-#ifdef __GNUC__
-#ifdef __LP64__
-#define MA_64BIT
-#else
-#define MA_32BIT
-#endif
-#endif
-#endif
-
-#if !defined(MA_64BIT) && !defined(MA_32BIT)
-#include <stdint.h>
-#if INTPTR_MAX == INT64_MAX
-#define MA_64BIT
-#else
-#define MA_32BIT
-#endif
-#endif
-
-#if defined(__arm__) || defined(_M_ARM)
-#define MA_ARM32
-#endif
-#if defined(__arm64) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
-#define MA_ARM64
-#endif
-
-#if defined(__x86_64__) || defined(_M_X64)
-#define MA_X64
-#elif defined(__i386) || defined(_M_IX86)
-#define MA_X86
-#elif defined(MA_ARM32) || defined(MA_ARM64)
-#define MA_ARM
-#endif
-
-/* Intrinsics Support */
-#if (defined(MA_X64) || defined(MA_X86)) && !defined(__COSMOPOLITAN__)
-    #if defined(_MSC_VER) && !defined(__clang__)
-        /* MSVC. */
-        #if _MSC_VER >= 1400 && !defined(MA_NO_SSE2)   /* 2005 */
-            #define MA_SUPPORT_SSE2
-        #endif
-        /*#if _MSC_VER >= 1600 && !defined(MA_NO_AVX)*/    /* 2010 */
-        /*    #define MA_SUPPORT_AVX*/
-        /*#endif*/
-        #if _MSC_VER >= 1700 && !defined(MA_NO_AVX2)   /* 2012 */
-            #define MA_SUPPORT_AVX2
-        #endif
-    #else
-        /* Assume GNUC-style. */
-        #if defined(__SSE2__) && !defined(MA_NO_SSE2)
-            #define MA_SUPPORT_SSE2
-        #endif
-        /*#if defined(__AVX__) && !defined(MA_NO_AVX)*/
-        /*    #define MA_SUPPORT_AVX*/
-        /*#endif*/
-        #if defined(__AVX2__) && !defined(MA_NO_AVX2)
-            #define MA_SUPPORT_AVX2
-        #endif
-    #endif
-
-    /* If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include. */
-    #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
-        #if !defined(MA_SUPPORT_SSE2)   && !defined(MA_NO_SSE2)   && __has_include(<emmintrin.h>)
-            #define MA_SUPPORT_SSE2
-        #endif
-        /*#if !defined(MA_SUPPORT_AVX)    && !defined(MA_NO_AVX)    && __has_include(<immintrin.h>)*/
-        /*    #define MA_SUPPORT_AVX*/
-        /*#endif*/
-        #if !defined(MA_SUPPORT_AVX2)   && !defined(MA_NO_AVX2)   && __has_include(<immintrin.h>)
-            #define MA_SUPPORT_AVX2
-        #endif
-    #endif
-
-    #if defined(MA_SUPPORT_AVX2) || defined(MA_SUPPORT_AVX)
-        #include <immintrin.h>
-    #elif defined(MA_SUPPORT_SSE2)
-        #include <emmintrin.h>
-    #endif
-#endif
-
-#if defined(MA_ARM)
-    #if !defined(MA_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
-        #define MA_SUPPORT_NEON
-        #include <arm_neon.h>
-    #endif
-#endif
-
-/* Begin globally disabled warnings. */
-#if defined(_MSC_VER)
-    #pragma warning(push)
-    #pragma warning(disable:4752)   /* found Intel(R) Advanced Vector Extensions; consider using /arch:AVX */
-    #pragma warning(disable:4049)   /* compiler limit : terminating line number emission */
-#endif
-
-#if defined(MA_X64) || defined(MA_X86)
-    #if defined(_MSC_VER) && !defined(__clang__)
-        #if _MSC_VER >= 1400
-            #include <intrin.h>
-            static MA_INLINE void ma_cpuid(int info[4], int fid)
-            {
-                __cpuid(info, fid);
-            }
-        #else
-            #define MA_NO_CPUID
-        #endif
-
-        #if _MSC_VER >= 1600 && (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
-            static MA_INLINE unsigned __int64 ma_xgetbv(int reg)
-            {
-                return _xgetbv(reg);
-            }
-        #else
-            #define MA_NO_XGETBV
-        #endif
-    #elif (defined(__GNUC__) || defined(__clang__)) && !defined(MA_ANDROID)
-        static MA_INLINE void ma_cpuid(int info[4], int fid)
-        {
-            /*
-            It looks like the -fPIC option uses the ebx register which GCC complains about. We can work around this by just using a different register, the
-            specific register of which I'm letting the compiler decide on. The "k" prefix is used to specify a 32-bit register. The {...} syntax is for
-            supporting different assembly dialects.
-
-            What's basically happening is that we're saving and restoring the ebx register manually.
-            */
-            #if defined(MA_X86) && defined(__PIC__)
-                __asm__ __volatile__ (
-                    "xchg{l} {%%}ebx, %k1;"
-                    "cpuid;"
-                    "xchg{l} {%%}ebx, %k1;"
-                    : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
-                );
-            #else
-                __asm__ __volatile__ (
-                    "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
-                );
-            #endif
-        }
-
-        static MA_INLINE ma_uint64 ma_xgetbv(int reg)
-        {
-            unsigned int hi;
-            unsigned int lo;
-
-            __asm__ __volatile__ (
-                "xgetbv" : "=a"(lo), "=d"(hi) : "c"(reg)
-            );
-
-            return ((ma_uint64)hi << 32) | (ma_uint64)lo;
-        }
-    #else
-        #define MA_NO_CPUID
-        #define MA_NO_XGETBV
-    #endif
-#else
-    #define MA_NO_CPUID
-    #define MA_NO_XGETBV
-#endif
-
-static MA_INLINE ma_bool32 ma_has_sse2(void)
-{
-#if defined(MA_SUPPORT_SSE2)
-    #if (defined(MA_X64) || defined(MA_X86)) && !defined(MA_NO_SSE2)
-        #if defined(MA_X64)
-            return MA_TRUE;    /* 64-bit targets always support SSE2. */
-        #elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
-            return MA_TRUE;    /* If the compiler is allowed to freely generate SSE2 code we can assume support. */
-        #else
-            #if defined(MA_NO_CPUID)
-                return MA_FALSE;
-            #else
-                int info[4];
-                ma_cpuid(info, 1);
-                return (info[3] & (1 << 26)) != 0;
-            #endif
-        #endif
-    #else
-        return MA_FALSE;       /* SSE2 is only supported on x86 and x64 architectures. */
-    #endif
-#else
-    return MA_FALSE;           /* No compiler support. */
-#endif
-}
-
-#if 0
-static MA_INLINE ma_bool32 ma_has_avx()
-{
-#if defined(MA_SUPPORT_AVX)
-    #if (defined(MA_X64) || defined(MA_X86)) && !defined(MA_NO_AVX)
-        #if defined(_AVX_) || defined(__AVX__)
-            return MA_TRUE;    /* If the compiler is allowed to freely generate AVX code we can assume support. */
-        #else
-            /* AVX requires both CPU and OS support. */
-            #if defined(MA_NO_CPUID) || defined(MA_NO_XGETBV)
-                return MA_FALSE;
-            #else
-                int info[4];
-                ma_cpuid(info, 1);
-                if (((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0)) {
-                    ma_uint64 xrc = ma_xgetbv(0);
-                    if ((xrc & 0x06) == 0x06) {
-                        return MA_TRUE;
-                    } else {
-                        return MA_FALSE;
-                    }
-                } else {
-                    return MA_FALSE;
-                }
-            #endif
-        #endif
-    #else
-        return MA_FALSE;       /* AVX is only supported on x86 and x64 architectures. */
-    #endif
-#else
-    return MA_FALSE;           /* No compiler support. */
-#endif
-}
-#endif
-
-static MA_INLINE ma_bool32 ma_has_avx2(void)
-{
-#if defined(MA_SUPPORT_AVX2)
-    #if (defined(MA_X64) || defined(MA_X86)) && !defined(MA_NO_AVX2)
-        #if defined(_AVX2_) || defined(__AVX2__)
-            return MA_TRUE;    /* If the compiler is allowed to freely generate AVX2 code we can assume support. */
-        #else
-            /* AVX2 requires both CPU and OS support. */
-            #if defined(MA_NO_CPUID) || defined(MA_NO_XGETBV)
-                return MA_FALSE;
-            #else
-                int info1[4];
-                int info7[4];
-                ma_cpuid(info1, 1);
-                ma_cpuid(info7, 7);
-                if (((info1[2] & (1 << 27)) != 0) && ((info7[1] & (1 << 5)) != 0)) {
-                    ma_uint64 xrc = ma_xgetbv(0);
-                    if ((xrc & 0x06) == 0x06) {
-                        return MA_TRUE;
-                    } else {
-                        return MA_FALSE;
-                    }
-                } else {
-                    return MA_FALSE;
-                }
-            #endif
-        #endif
-    #else
-        return MA_FALSE;       /* AVX2 is only supported on x86 and x64 architectures. */
-    #endif
-#else
-    return MA_FALSE;           /* No compiler support. */
-#endif
-}
-
-static MA_INLINE ma_bool32 ma_has_neon(void)
-{
-#if defined(MA_SUPPORT_NEON)
-    #if defined(MA_ARM) && !defined(MA_NO_NEON)
-        #if (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
-            return MA_TRUE;    /* If the compiler is allowed to freely generate NEON code we can assume support. */
-        #else
-            /* TODO: Runtime check. */
-            return MA_FALSE;
-        #endif
-    #else
-        return MA_FALSE;       /* NEON is only supported on ARM architectures. */
-    #endif
-#else
-    return MA_FALSE;           /* No compiler support. */
-#endif
-}
-
-#if defined(__has_builtin)
-    #define MA_COMPILER_HAS_BUILTIN(x) __has_builtin(x)
-#else
-    #define MA_COMPILER_HAS_BUILTIN(x) 0
-#endif
-
-#ifndef MA_ASSUME
-    #if MA_COMPILER_HAS_BUILTIN(__builtin_assume)
-        #define MA_ASSUME(x) __builtin_assume(x)
-    #elif MA_COMPILER_HAS_BUILTIN(__builtin_unreachable)
-        #define MA_ASSUME(x) do { if (!(x)) __builtin_unreachable(); } while (0)
-    #elif defined(_MSC_VER)
-        #define MA_ASSUME(x) __assume(x)
-    #else
-        #define MA_ASSUME(x) (void)(x)
-    #endif
-#endif
-
-#ifndef MA_RESTRICT
-    #if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER)
-        #define MA_RESTRICT __restrict
-    #else
-        #define MA_RESTRICT
-    #endif
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-    #define MA_HAS_BYTESWAP16_INTRINSIC
-    #define MA_HAS_BYTESWAP32_INTRINSIC
-    #define MA_HAS_BYTESWAP64_INTRINSIC
-#elif defined(__clang__)
-    #if MA_COMPILER_HAS_BUILTIN(__builtin_bswap16)
-        #define MA_HAS_BYTESWAP16_INTRINSIC
-    #endif
-    #if MA_COMPILER_HAS_BUILTIN(__builtin_bswap32)
-        #define MA_HAS_BYTESWAP32_INTRINSIC
-    #endif
-    #if MA_COMPILER_HAS_BUILTIN(__builtin_bswap64)
-        #define MA_HAS_BYTESWAP64_INTRINSIC
-    #endif
-#elif defined(__GNUC__)
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-        #define MA_HAS_BYTESWAP32_INTRINSIC
-        #define MA_HAS_BYTESWAP64_INTRINSIC
-    #endif
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
-        #define MA_HAS_BYTESWAP16_INTRINSIC
-    #endif
-#endif
-
-
-static MA_INLINE ma_bool32 ma_is_little_endian(void)
-{
-#if defined(MA_X86) || defined(MA_X64)
-    return MA_TRUE;
-#else
-    int n = 1;
-    return (*(char*)&n) == 1;
-#endif
-}
-
-static MA_INLINE ma_bool32 ma_is_big_endian(void)
-{
-    return !ma_is_little_endian();
-}
-
-
-static MA_INLINE ma_uint32 ma_swap_endian_uint32(ma_uint32 n)
-{
-#ifdef MA_HAS_BYTESWAP32_INTRINSIC
-    #if defined(_MSC_VER)
-        return _byteswap_ulong(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        #if defined(MA_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(MA_64BIT)   /* <-- 64-bit inline assembly has not been tested, so disabling for now. */
-            /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */
-            ma_uint32 r;
-            __asm__ __volatile__ (
-            #if defined(MA_64BIT)
-                "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
-            #else
-                "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
-            #endif
-            );
-            return r;
-        #else
-            return __builtin_bswap32(n);
-        #endif
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & 0xFF000000) >> 24) |
-           ((n & 0x00FF0000) >>  8) |
-           ((n & 0x0000FF00) <<  8) |
-           ((n & 0x000000FF) << 24);
-#endif
-}
-
-
-#if !defined(MA_EMSCRIPTEN)
-#ifdef MA_WIN32
-static void ma_sleep__win32(ma_uint32 milliseconds)
-{
-    Sleep((DWORD)milliseconds);
-}
-#endif
-#ifdef MA_POSIX
-static void ma_sleep__posix(ma_uint32 milliseconds)
-{
-#ifdef MA_EMSCRIPTEN
-    (void)milliseconds;
-    MA_ASSERT(MA_FALSE);  /* The Emscripten build should never sleep. */
-#else
-    #if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309L) || defined(MA_NX)
-        struct timespec ts;
-        ts.tv_sec  = milliseconds / 1000;
-        ts.tv_nsec = milliseconds % 1000 * 1000000;
-        nanosleep(&ts, NULL);
-    #else
-        struct timeval tv;
-        tv.tv_sec  = milliseconds / 1000;
-        tv.tv_usec = milliseconds % 1000 * 1000;
-        select(0, NULL, NULL, NULL, &tv);
-    #endif
-#endif
-}
-#endif
-
-static MA_INLINE void ma_sleep(ma_uint32 milliseconds)
-{
-#ifdef MA_WIN32
-    ma_sleep__win32(milliseconds);
-#endif
-#ifdef MA_POSIX
-    ma_sleep__posix(milliseconds);
-#endif
-}
-#endif
-
-static MA_INLINE void ma_yield(void)
-{
-#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
-    /* x86/x64 */
-    #if (defined(_MSC_VER) || defined(__WATCOMC__) || defined(__DMC__)) && !defined(__clang__)
-        #if _MSC_VER >= 1400
-            _mm_pause();
-        #else
-            #if defined(__DMC__)
-                /* Digital Mars does not recognize the PAUSE opcode. Fall back to NOP. */
-                __asm nop;
-            #else
-                __asm pause;
-            #endif
-        #endif
-    #else
-        __asm__ __volatile__ ("pause");
-    #endif
-#elif (defined(__arm__) && defined(__ARM_ARCH) && __ARM_ARCH >= 7) || defined(_M_ARM64) || (defined(_M_ARM) && _M_ARM >= 7) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__)
-    /* ARM */
-    #if defined(_MSC_VER)
-        /* Apparently there is a __yield() intrinsic that's compatible with ARM, but I cannot find documentation for it nor can I find where it's declared. */
-        __yield();
-    #else
-        __asm__ __volatile__ ("yield"); /* ARMv6K/ARMv6T2 and above. */
-    #endif
-#else
-    /* Unknown or unsupported architecture. No-op. */
-#endif
-}
-
-
-#define MA_MM_DENORMALS_ZERO_MASK   0x0040
-#define MA_MM_FLUSH_ZERO_MASK       0x8000
-
-static MA_INLINE unsigned int ma_disable_denormals(void)
-{
-    unsigned int prevState;
-
-    #if defined(_MSC_VER)
-    {
-        /*
-        Older versions of Visual Studio don't support the "safe" versions of _controlfp_s(). I don't
-        know which version of Visual Studio first added support for _controlfp_s(), but I do know
-        that VC6 lacks support. _MSC_VER = 1200 is VC6, but if you get compilation errors on older
-        versions of Visual Studio, let me know and I'll make the necessary adjustment.
-        */
-        #if _MSC_VER <= 1200
-        {
-            prevState = _statusfp();
-            _controlfp(prevState | _DN_FLUSH, _MCW_DN);
-        }
-        #else
-        {
-            unsigned int unused;
-            _controlfp_s(&prevState, 0, 0);
-            _controlfp_s(&unused, prevState | _DN_FLUSH, _MCW_DN);
-        }
-        #endif
-    }
-    #elif defined(MA_X86) || defined(MA_X64)
-    {
-        #if defined(__SSE2__) && !(defined(__TINYC__) || defined(__WATCOMC__) || defined(__COSMOPOLITAN__)) /* <-- Add compilers that lack support for _mm_getcsr() and _mm_setcsr() to this list. */
-        {
-            prevState = _mm_getcsr();
-            _mm_setcsr(prevState | MA_MM_DENORMALS_ZERO_MASK | MA_MM_FLUSH_ZERO_MASK);
-        }
-        #else
-        {
-            /* x88/64, but no support for _mm_getcsr()/_mm_setcsr(). May need to fall back to inlined assembly here. */
-            prevState = 0;
-        }
-        #endif
-    }
-    #else
-    {
-        /* Unknown or unsupported architecture. No-op. */
-        prevState = 0;
-    }
-    #endif
-
-    return prevState;
-}
-
-static MA_INLINE void ma_restore_denormals(unsigned int prevState)
-{
-    #if defined(_MSC_VER)
-    {
-        /* Older versions of Visual Studio do not support _controlfp_s(). See ma_disable_denormals(). */
-        #if _MSC_VER <= 1200
-        {
-            _controlfp(prevState, _MCW_DN);
-        }
-        #else
-        {
-            unsigned int unused;
-            _controlfp_s(&unused, prevState, _MCW_DN);
-        }
-        #endif
-    }
-    #elif defined(MA_X86) || defined(MA_X64)
-    {
-        #if defined(__SSE2__) && !(defined(__TINYC__) || defined(__WATCOMC__) || defined(__COSMOPOLITAN__))   /* <-- Add compilers that lack support for _mm_getcsr() and _mm_setcsr() to this list. */
-        {
-            _mm_setcsr(prevState);
-        }
-        #else
-        {
-            /* x88/64, but no support for _mm_getcsr()/_mm_setcsr(). May need to fall back to inlined assembly here. */
-            (void)prevState;
-        }
-        #endif
-    }
-    #else
-    {
-        /* Unknown or unsupported architecture. No-op. */
-        (void)prevState;
-    }
-    #endif
-}
-
-
-#ifdef MA_ANDROID
-#include <sys/system_properties.h>
-
-int ma_android_sdk_version()
-{
-    char sdkVersion[PROP_VALUE_MAX + 1] = {0, };
-    if (__system_property_get("ro.build.version.sdk", sdkVersion)) {
-        return atoi(sdkVersion);
-    }
-
-    return 0;
-}
-#endif
-
-
-#ifndef MA_COINIT_VALUE
-#define MA_COINIT_VALUE    0   /* 0 = COINIT_MULTITHREADED */
-#endif
-
-
-#ifndef MA_FLT_MAX
-    #ifdef FLT_MAX
-        #define MA_FLT_MAX FLT_MAX
-    #else
-        #define MA_FLT_MAX 3.402823466e+38F
-    #endif
-#endif
-
-
-#ifndef MA_PI
-#define MA_PI      3.14159265358979323846264f
-#endif
-#ifndef MA_PI_D
-#define MA_PI_D    3.14159265358979323846264
-#endif
-#ifndef MA_TAU
-#define MA_TAU     6.28318530717958647693f
-#endif
-#ifndef MA_TAU_D
-#define MA_TAU_D   6.28318530717958647693
-#endif
-
-
-/* The default format when ma_format_unknown (0) is requested when initializing a device. */
-#ifndef MA_DEFAULT_FORMAT
-#define MA_DEFAULT_FORMAT                                   ma_format_f32
-#endif
-
-/* The default channel count to use when 0 is used when initializing a device. */
-#ifndef MA_DEFAULT_CHANNELS
-#define MA_DEFAULT_CHANNELS                                 2
-#endif
-
-/* The default sample rate to use when 0 is used when initializing a device. */
-#ifndef MA_DEFAULT_SAMPLE_RATE
-#define MA_DEFAULT_SAMPLE_RATE                              48000
-#endif
-
-/* Default periods when none is specified in ma_device_init(). More periods means more work on the CPU. */
-#ifndef MA_DEFAULT_PERIODS
-#define MA_DEFAULT_PERIODS                                  3
-#endif
-
-/* The default period size in milliseconds for low latency mode. */
-#ifndef MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_LOW_LATENCY
-#define MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_LOW_LATENCY  10
-#endif
-
-/* The default buffer size in milliseconds for conservative mode. */
-#ifndef MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_CONSERVATIVE
-#define MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_CONSERVATIVE 100
-#endif
-
-/* The default LPF filter order for linear resampling. Note that this is clamped to MA_MAX_FILTER_ORDER. */
-#ifndef MA_DEFAULT_RESAMPLER_LPF_ORDER
-    #if MA_MAX_FILTER_ORDER >= 4
-        #define MA_DEFAULT_RESAMPLER_LPF_ORDER  4
-    #else
-        #define MA_DEFAULT_RESAMPLER_LPF_ORDER  MA_MAX_FILTER_ORDER
-    #endif
-#endif
-
-
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wunused-variable"
-#endif
-
-/* Standard sample rates, in order of priority. */
-static ma_uint32 g_maStandardSampleRatePriorities[] = {
-    (ma_uint32)ma_standard_sample_rate_48000,
-    (ma_uint32)ma_standard_sample_rate_44100,
-
-    (ma_uint32)ma_standard_sample_rate_32000,
-    (ma_uint32)ma_standard_sample_rate_24000,
-    (ma_uint32)ma_standard_sample_rate_22050,
-
-    (ma_uint32)ma_standard_sample_rate_88200,
-    (ma_uint32)ma_standard_sample_rate_96000,
-    (ma_uint32)ma_standard_sample_rate_176400,
-    (ma_uint32)ma_standard_sample_rate_192000,
-
-    (ma_uint32)ma_standard_sample_rate_16000,
-    (ma_uint32)ma_standard_sample_rate_11025,
-    (ma_uint32)ma_standard_sample_rate_8000,
-
-    (ma_uint32)ma_standard_sample_rate_352800,
-    (ma_uint32)ma_standard_sample_rate_384000
-};
-
-static MA_INLINE ma_bool32 ma_is_standard_sample_rate(ma_uint32 sampleRate)
-{
-    ma_uint32 iSampleRate;
-
-    for (iSampleRate = 0; iSampleRate < sizeof(g_maStandardSampleRatePriorities) / sizeof(g_maStandardSampleRatePriorities[0]); iSampleRate += 1) {
-        if (g_maStandardSampleRatePriorities[iSampleRate] == sampleRate) {
-            return MA_TRUE;
-        }
-    }
-
-    /* Getting here means the sample rate is not supported. */
-    return MA_FALSE;
-}
-
-
-static ma_format g_maFormatPriorities[] = {
-    ma_format_s16,         /* Most common */
-    ma_format_f32,
-
-    /*ma_format_s24_32,*/    /* Clean alignment */
-    ma_format_s32,
-
-    ma_format_s24,         /* Unclean alignment */
-
-    ma_format_u8           /* Low quality */
-};
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic pop
-#endif
-
-
-MA_API void ma_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision)
-{
-    if (pMajor) {
-        *pMajor = MA_VERSION_MAJOR;
-    }
-
-    if (pMinor) {
-        *pMinor = MA_VERSION_MINOR;
-    }
-
-    if (pRevision) {
-        *pRevision = MA_VERSION_REVISION;
-    }
-}
-
-MA_API const char* ma_version_string(void)
-{
-    return MA_VERSION_STRING;
-}
-
-
-/******************************************************************************
-
-Standard Library Stuff
-
-******************************************************************************/
-#ifndef MA_ASSERT
-#define MA_ASSERT(condition)            assert(condition)
-#endif
-
-#ifndef MA_MALLOC
-#define MA_MALLOC(sz)                   malloc((sz))
-#endif
-#ifndef MA_REALLOC
-#define MA_REALLOC(p, sz)               realloc((p), (sz))
-#endif
-#ifndef MA_FREE
-#define MA_FREE(p)                      free((p))
-#endif
-
-static MA_INLINE void ma_zero_memory_default(void* p, size_t sz)
-{
-    if (p == NULL) {
-        MA_ASSERT(sz == 0); /* If this is triggered there's an error with the calling code. */
-        return;
-    }
-
-    if (sz > 0) {
-        memset(p, 0, sz);
-    }
-}
-
-
-#ifndef MA_ZERO_MEMORY
-#define MA_ZERO_MEMORY(p, sz)           ma_zero_memory_default((p), (sz))
-#endif
-#ifndef MA_COPY_MEMORY
-#define MA_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
-#endif
-#ifndef MA_MOVE_MEMORY
-#define MA_MOVE_MEMORY(dst, src, sz)    memmove((dst), (src), (sz))
-#endif
-
-#define MA_ZERO_OBJECT(p)               MA_ZERO_MEMORY((p), sizeof(*(p)))
-
-#define ma_countof(x)                   (sizeof(x) / sizeof(x[0]))
-#define ma_max(x, y)                    (((x) > (y)) ? (x) : (y))
-#define ma_min(x, y)                    (((x) < (y)) ? (x) : (y))
-#define ma_abs(x)                       (((x) > 0) ? (x) : -(x))
-#define ma_clamp(x, lo, hi)             (ma_max(lo, ma_min(x, hi)))
-#define ma_offset_ptr(p, offset)        (((ma_uint8*)(p)) + (offset))
-#define ma_align(x, a)                  (((x) + ((a)-1)) & ~((a)-1))
-#define ma_align_64(x)                  ma_align(x, 8)
-
-#define ma_buffer_frame_capacity(buffer, channels, format) (sizeof(buffer) / ma_get_bytes_per_sample(format) / (channels))
-
-static MA_INLINE double ma_sind(double x)
-{
-    /* TODO: Implement custom sin(x). */
-    return sin(x);
-}
-
-static MA_INLINE double ma_expd(double x)
-{
-    /* TODO: Implement custom exp(x). */
-    return exp(x);
-}
-
-static MA_INLINE double ma_logd(double x)
-{
-    /* TODO: Implement custom log(x). */
-    return log(x);
-}
-
-static MA_INLINE double ma_powd(double x, double y)
-{
-    /* TODO: Implement custom pow(x, y). */
-    return pow(x, y);
-}
-
-static MA_INLINE double ma_sqrtd(double x)
-{
-    /* TODO: Implement custom sqrt(x). */
-    return sqrt(x);
-}
-
-
-static MA_INLINE float ma_rsqrtf(float x)
-{
-    #if defined(MA_SUPPORT_SSE2) && !defined(MA_NO_SSE2) && (defined(MA_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__))
-    {
-        /*
-        For SSE we can use RSQRTSS.
-
-        This Stack Overflow post suggests that compilers don't necessarily generate optimal code
-        when using intrinsics:
-
-            https://web.archive.org/web/20221211012522/https://stackoverflow.com/questions/32687079/getting-fewest-instructions-for-rsqrtss-wrapper
-
-        I'm going to do something similar here, but a bit simpler.
-        */
-        #if defined(__GNUC__) || defined(__clang__)
-        {
-            float result;
-            __asm__ __volatile__("rsqrtss %1, %0" : "=x"(result) : "x"(x));
-            return result;
-        }
-        #else
-        {
-            return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ps1(x)));
-        }
-        #endif
-    }
-    #else
-    {
-        return 1 / (float)ma_sqrtd(x);
-    }
-    #endif
-}
-
-
-static MA_INLINE float ma_sinf(float x)
-{
-    return (float)ma_sind((float)x);
-}
-
-static MA_INLINE double ma_cosd(double x)
-{
-    return ma_sind((MA_PI_D*0.5) - x);
-}
-
-static MA_INLINE float ma_cosf(float x)
-{
-    return (float)ma_cosd((float)x);
-}
-
-static MA_INLINE double ma_log10d(double x)
-{
-    return ma_logd(x) * 0.43429448190325182765;
-}
-
-static MA_INLINE float ma_powf(float x, float y)
-{
-    return (float)ma_powd((double)x, (double)y);
-}
-
-static MA_INLINE float ma_log10f(float x)
-{
-    return (float)ma_log10d((double)x);
-}
-
-
-static MA_INLINE double ma_degrees_to_radians(double degrees)
-{
-    return degrees * 0.01745329252;
-}
-
-static MA_INLINE double ma_radians_to_degrees(double radians)
-{
-    return radians * 57.295779512896;
-}
-
-static MA_INLINE float ma_degrees_to_radians_f(float degrees)
-{
-    return degrees * 0.01745329252f;
-}
-
-static MA_INLINE float ma_radians_to_degrees_f(float radians)
-{
-    return radians * 57.295779512896f;
-}
-
-
-/*
-Return Values:
-  0:  Success
-  22: EINVAL
-  34: ERANGE
-
-Not using symbolic constants for errors because I want to avoid #including errno.h
-
-These are marked as no-inline because of some bad code generation by Clang. None of these functions
-are used in any performance-critical code within miniaudio.
-*/
-MA_API MA_NO_INLINE int ma_strcpy_s(char* dst, size_t dstSizeInBytes, const char* src)
-{
-    size_t i;
-
-    if (dst == 0) {
-        return 22;
-    }
-    if (dstSizeInBytes == 0) {
-        return 34;
-    }
-    if (src == 0) {
-        dst[0] = '\0';
-        return 22;
-    }
-
-    for (i = 0; i < dstSizeInBytes && src[i] != '\0'; ++i) {
-        dst[i] = src[i];
-    }
-
-    if (i < dstSizeInBytes) {
-        dst[i] = '\0';
-        return 0;
-    }
-
-    dst[0] = '\0';
-    return 34;
-}
-
-MA_API MA_NO_INLINE int ma_wcscpy_s(wchar_t* dst, size_t dstCap, const wchar_t* src)
-{
-    size_t i;
-
-    if (dst == 0) {
-        return 22;
-    }
-    if (dstCap == 0) {
-        return 34;
-    }
-    if (src == 0) {
-        dst[0] = '\0';
-        return 22;
-    }
-
-    for (i = 0; i < dstCap && src[i] != '\0'; ++i) {
-        dst[i] = src[i];
-    }
-
-    if (i < dstCap) {
-        dst[i] = '\0';
-        return 0;
-    }
-
-    dst[0] = '\0';
-    return 34;
-}
-
-
-MA_API MA_NO_INLINE int ma_strncpy_s(char* dst, size_t dstSizeInBytes, const char* src, size_t count)
-{
-    size_t maxcount;
-    size_t i;
-
-    if (dst == 0) {
-        return 22;
-    }
-    if (dstSizeInBytes == 0) {
-        return 34;
-    }
-    if (src == 0) {
-        dst[0] = '\0';
-        return 22;
-    }
-
-    maxcount = count;
-    if (count == ((size_t)-1) || count >= dstSizeInBytes) {        /* -1 = _TRUNCATE */
-        maxcount = dstSizeInBytes - 1;
-    }
-
-    for (i = 0; i < maxcount && src[i] != '\0'; ++i) {
-        dst[i] = src[i];
-    }
-
-    if (src[i] == '\0' || i == count || count == ((size_t)-1)) {
-        dst[i] = '\0';
-        return 0;
-    }
-
-    dst[0] = '\0';
-    return 34;
-}
-
-MA_API MA_NO_INLINE int ma_strcat_s(char* dst, size_t dstSizeInBytes, const char* src)
-{
-    char* dstorig;
-
-    if (dst == 0) {
-        return 22;
-    }
-    if (dstSizeInBytes == 0) {
-        return 34;
-    }
-    if (src == 0) {
-        dst[0] = '\0';
-        return 22;
-    }
-
-    dstorig = dst;
-
-    while (dstSizeInBytes > 0 && dst[0] != '\0') {
-        dst += 1;
-        dstSizeInBytes -= 1;
-    }
-
-    if (dstSizeInBytes == 0) {
-        return 22;  /* Unterminated. */
-    }
-
-
-    while (dstSizeInBytes > 0 && src[0] != '\0') {
-        *dst++ = *src++;
-        dstSizeInBytes -= 1;
-    }
-
-    if (dstSizeInBytes > 0) {
-        dst[0] = '\0';
-    } else {
-        dstorig[0] = '\0';
-        return 34;
-    }
-
-    return 0;
-}
-
-MA_API MA_NO_INLINE int ma_strncat_s(char* dst, size_t dstSizeInBytes, const char* src, size_t count)
-{
-    char* dstorig;
-
-    if (dst == 0) {
-        return 22;
-    }
-    if (dstSizeInBytes == 0) {
-        return 34;
-    }
-    if (src == 0) {
-        return 22;
-    }
-
-    dstorig = dst;
-
-    while (dstSizeInBytes > 0 && dst[0] != '\0') {
-        dst += 1;
-        dstSizeInBytes -= 1;
-    }
-
-    if (dstSizeInBytes == 0) {
-        return 22;  /* Unterminated. */
-    }
-
-
-    if (count == ((size_t)-1)) {        /* _TRUNCATE */
-        count = dstSizeInBytes - 1;
-    }
-
-    while (dstSizeInBytes > 0 && src[0] != '\0' && count > 0) {
-        *dst++ = *src++;
-        dstSizeInBytes -= 1;
-        count -= 1;
-    }
-
-    if (dstSizeInBytes > 0) {
-        dst[0] = '\0';
-    } else {
-        dstorig[0] = '\0';
-        return 34;
-    }
-
-    return 0;
-}
-
-MA_API MA_NO_INLINE int ma_itoa_s(int value, char* dst, size_t dstSizeInBytes, int radix)
-{
-    int sign;
-    unsigned int valueU;
-    char* dstEnd;
-
-    if (dst == NULL || dstSizeInBytes == 0) {
-        return 22;
-    }
-    if (radix < 2 || radix > 36) {
-        dst[0] = '\0';
-        return 22;
-    }
-
-    sign = (value < 0 && radix == 10) ? -1 : 1;     /* The negative sign is only used when the base is 10. */
-
-    if (value < 0) {
-        valueU = -value;
-    } else {
-        valueU = value;
-    }
-
-    dstEnd = dst;
-    do
-    {
-        int remainder = valueU % radix;
-        if (remainder > 9) {
-            *dstEnd = (char)((remainder - 10) + 'a');
-        } else {
-            *dstEnd = (char)(remainder + '0');
-        }
-
-        dstEnd += 1;
-        dstSizeInBytes -= 1;
-        valueU /= radix;
-    } while (dstSizeInBytes > 0 && valueU > 0);
-
-    if (dstSizeInBytes == 0) {
-        dst[0] = '\0';
-        return 22;  /* Ran out of room in the output buffer. */
-    }
-
-    if (sign < 0) {
-        *dstEnd++ = '-';
-        dstSizeInBytes -= 1;
-    }
-
-    if (dstSizeInBytes == 0) {
-        dst[0] = '\0';
-        return 22;  /* Ran out of room in the output buffer. */
-    }
-
-    *dstEnd = '\0';
-
-
-    /* At this point the string will be reversed. */
-    dstEnd -= 1;
-    while (dst < dstEnd) {
-        char temp = *dst;
-        *dst = *dstEnd;
-        *dstEnd = temp;
-
-        dst += 1;
-        dstEnd -= 1;
-    }
-
-    return 0;
-}
-
-MA_API MA_NO_INLINE int ma_strcmp(const char* str1, const char* str2)
-{
-    if (str1 == str2) return  0;
-
-    /* These checks differ from the standard implementation. It's not important, but I prefer it just for sanity. */
-    if (str1 == NULL) return -1;
-    if (str2 == NULL) return  1;
-
-    for (;;) {
-        if (str1[0] == '\0') {
-            break;
-        }
-        if (str1[0] != str2[0]) {
-            break;
-        }
-
-        str1 += 1;
-        str2 += 1;
-    }
-
-    return ((unsigned char*)str1)[0] - ((unsigned char*)str2)[0];
-}
-
-MA_API MA_NO_INLINE int ma_strappend(char* dst, size_t dstSize, const char* srcA, const char* srcB)
-{
-    int result;
-
-    result = ma_strncpy_s(dst, dstSize, srcA, (size_t)-1);
-    if (result != 0) {
-        return result;
-    }
-
-    result = ma_strncat_s(dst, dstSize, srcB, (size_t)-1);
-    if (result != 0) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API MA_NO_INLINE char* ma_copy_string(const char* src, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    size_t sz;
-    char* dst;
-
-    if (src == NULL) {
-        return NULL;
-    }
-
-    sz = strlen(src)+1;
-    dst = (char*)ma_malloc(sz, pAllocationCallbacks);
-    if (dst == NULL) {
-        return NULL;
-    }
-
-    ma_strcpy_s(dst, sz, src);
-
-    return dst;
-}
-
-MA_API MA_NO_INLINE wchar_t* ma_copy_string_w(const wchar_t* src, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    size_t sz = wcslen(src)+1;
-    wchar_t* dst = (wchar_t*)ma_malloc(sz * sizeof(*dst), pAllocationCallbacks);
-    if (dst == NULL) {
-        return NULL;
-    }
-
-    ma_wcscpy_s(dst, sz, src);
-
-    return dst;
-}
-
-
-
-#include <errno.h>
-static ma_result ma_result_from_errno(int e)
-{
-    if (e == 0) {
-        return MA_SUCCESS;
-    }
-#ifdef EPERM
-    else if (e == EPERM) { return MA_INVALID_OPERATION; }
-#endif
-#ifdef ENOENT
-    else if (e == ENOENT) { return MA_DOES_NOT_EXIST; }
-#endif
-#ifdef ESRCH
-    else if (e == ESRCH) { return MA_DOES_NOT_EXIST; }
-#endif
-#ifdef EINTR
-    else if (e == EINTR) { return MA_INTERRUPT; }
-#endif
-#ifdef EIO
-    else if (e == EIO) { return MA_IO_ERROR; }
-#endif
-#ifdef ENXIO
-    else if (e == ENXIO) { return MA_DOES_NOT_EXIST; }
-#endif
-#ifdef E2BIG
-    else if (e == E2BIG) { return MA_INVALID_ARGS; }
-#endif
-#ifdef ENOEXEC
-    else if (e == ENOEXEC) { return MA_INVALID_FILE; }
-#endif
-#ifdef EBADF
-    else if (e == EBADF) { return MA_INVALID_FILE; }
-#endif
-#ifdef ECHILD
-    else if (e == ECHILD) { return MA_ERROR; }
-#endif
-#ifdef EAGAIN
-    else if (e == EAGAIN) { return MA_UNAVAILABLE; }
-#endif
-#ifdef ENOMEM
-    else if (e == ENOMEM) { return MA_OUT_OF_MEMORY; }
-#endif
-#ifdef EACCES
-    else if (e == EACCES) { return MA_ACCESS_DENIED; }
-#endif
-#ifdef EFAULT
-    else if (e == EFAULT) { return MA_BAD_ADDRESS; }
-#endif
-#ifdef ENOTBLK
-    else if (e == ENOTBLK) { return MA_ERROR; }
-#endif
-#ifdef EBUSY
-    else if (e == EBUSY) { return MA_BUSY; }
-#endif
-#ifdef EEXIST
-    else if (e == EEXIST) { return MA_ALREADY_EXISTS; }
-#endif
-#ifdef EXDEV
-    else if (e == EXDEV) { return MA_ERROR; }
-#endif
-#ifdef ENODEV
-    else if (e == ENODEV) { return MA_DOES_NOT_EXIST; }
-#endif
-#ifdef ENOTDIR
-    else if (e == ENOTDIR) { return MA_NOT_DIRECTORY; }
-#endif
-#ifdef EISDIR
-    else if (e == EISDIR) { return MA_IS_DIRECTORY; }
-#endif
-#ifdef EINVAL
-    else if (e == EINVAL) { return MA_INVALID_ARGS; }
-#endif
-#ifdef ENFILE
-    else if (e == ENFILE) { return MA_TOO_MANY_OPEN_FILES; }
-#endif
-#ifdef EMFILE
-    else if (e == EMFILE) { return MA_TOO_MANY_OPEN_FILES; }
-#endif
-#ifdef ENOTTY
-    else if (e == ENOTTY) { return MA_INVALID_OPERATION; }
-#endif
-#ifdef ETXTBSY
-    else if (e == ETXTBSY) { return MA_BUSY; }
-#endif
-#ifdef EFBIG
-    else if (e == EFBIG) { return MA_TOO_BIG; }
-#endif
-#ifdef ENOSPC
-    else if (e == ENOSPC) { return MA_NO_SPACE; }
-#endif
-#ifdef ESPIPE
-    else if (e == ESPIPE) { return MA_BAD_SEEK; }
-#endif
-#ifdef EROFS
-    else if (e == EROFS) { return MA_ACCESS_DENIED; }
-#endif
-#ifdef EMLINK
-    else if (e == EMLINK) { return MA_TOO_MANY_LINKS; }
-#endif
-#ifdef EPIPE
-    else if (e == EPIPE) { return MA_BAD_PIPE; }
-#endif
-#ifdef EDOM
-    else if (e == EDOM) { return MA_OUT_OF_RANGE; }
-#endif
-#ifdef ERANGE
-    else if (e == ERANGE) { return MA_OUT_OF_RANGE; }
-#endif
-#ifdef EDEADLK
-    else if (e == EDEADLK) { return MA_DEADLOCK; }
-#endif
-#ifdef ENAMETOOLONG
-    else if (e == ENAMETOOLONG) { return MA_PATH_TOO_LONG; }
-#endif
-#ifdef ENOLCK
-    else if (e == ENOLCK) { return MA_ERROR; }
-#endif
-#ifdef ENOSYS
-    else if (e == ENOSYS) { return MA_NOT_IMPLEMENTED; }
-#endif
-#ifdef ENOTEMPTY
-    else if (e == ENOTEMPTY) { return MA_DIRECTORY_NOT_EMPTY; }
-#endif
-#ifdef ELOOP
-    else if (e == ELOOP) { return MA_TOO_MANY_LINKS; }
-#endif
-#ifdef ENOMSG
-    else if (e == ENOMSG) { return MA_NO_MESSAGE; }
-#endif
-#ifdef EIDRM
-    else if (e == EIDRM) { return MA_ERROR; }
-#endif
-#ifdef ECHRNG
-    else if (e == ECHRNG) { return MA_ERROR; }
-#endif
-#ifdef EL2NSYNC
-    else if (e == EL2NSYNC) { return MA_ERROR; }
-#endif
-#ifdef EL3HLT
-    else if (e == EL3HLT) { return MA_ERROR; }
-#endif
-#ifdef EL3RST
-    else if (e == EL3RST) { return MA_ERROR; }
-#endif
-#ifdef ELNRNG
-    else if (e == ELNRNG) { return MA_OUT_OF_RANGE; }
-#endif
-#ifdef EUNATCH
-    else if (e == EUNATCH) { return MA_ERROR; }
-#endif
-#ifdef ENOCSI
-    else if (e == ENOCSI) { return MA_ERROR; }
-#endif
-#ifdef EL2HLT
-    else if (e == EL2HLT) { return MA_ERROR; }
-#endif
-#ifdef EBADE
-    else if (e == EBADE) { return MA_ERROR; }
-#endif
-#ifdef EBADR
-    else if (e == EBADR) { return MA_ERROR; }
-#endif
-#ifdef EXFULL
-    else if (e == EXFULL) { return MA_ERROR; }
-#endif
-#ifdef ENOANO
-    else if (e == ENOANO) { return MA_ERROR; }
-#endif
-#ifdef EBADRQC
-    else if (e == EBADRQC) { return MA_ERROR; }
-#endif
-#ifdef EBADSLT
-    else if (e == EBADSLT) { return MA_ERROR; }
-#endif
-#ifdef EBFONT
-    else if (e == EBFONT) { return MA_INVALID_FILE; }
-#endif
-#ifdef ENOSTR
-    else if (e == ENOSTR) { return MA_ERROR; }
-#endif
-#ifdef ENODATA
-    else if (e == ENODATA) { return MA_NO_DATA_AVAILABLE; }
-#endif
-#ifdef ETIME
-    else if (e == ETIME) { return MA_TIMEOUT; }
-#endif
-#ifdef ENOSR
-    else if (e == ENOSR) { return MA_NO_DATA_AVAILABLE; }
-#endif
-#ifdef ENONET
-    else if (e == ENONET) { return MA_NO_NETWORK; }
-#endif
-#ifdef ENOPKG
-    else if (e == ENOPKG) { return MA_ERROR; }
-#endif
-#ifdef EREMOTE
-    else if (e == EREMOTE) { return MA_ERROR; }
-#endif
-#ifdef ENOLINK
-    else if (e == ENOLINK) { return MA_ERROR; }
-#endif
-#ifdef EADV
-    else if (e == EADV) { return MA_ERROR; }
-#endif
-#ifdef ESRMNT
-    else if (e == ESRMNT) { return MA_ERROR; }
-#endif
-#ifdef ECOMM
-    else if (e == ECOMM) { return MA_ERROR; }
-#endif
-#ifdef EPROTO
-    else if (e == EPROTO) { return MA_ERROR; }
-#endif
-#ifdef EMULTIHOP
-    else if (e == EMULTIHOP) { return MA_ERROR; }
-#endif
-#ifdef EDOTDOT
-    else if (e == EDOTDOT) { return MA_ERROR; }
-#endif
-#ifdef EBADMSG
-    else if (e == EBADMSG) { return MA_BAD_MESSAGE; }
-#endif
-#ifdef EOVERFLOW
-    else if (e == EOVERFLOW) { return MA_TOO_BIG; }
-#endif
-#ifdef ENOTUNIQ
-    else if (e == ENOTUNIQ) { return MA_NOT_UNIQUE; }
-#endif
-#ifdef EBADFD
-    else if (e == EBADFD) { return MA_ERROR; }
-#endif
-#ifdef EREMCHG
-    else if (e == EREMCHG) { return MA_ERROR; }
-#endif
-#ifdef ELIBACC
-    else if (e == ELIBACC) { return MA_ACCESS_DENIED; }
-#endif
-#ifdef ELIBBAD
-    else if (e == ELIBBAD) { return MA_INVALID_FILE; }
-#endif
-#ifdef ELIBSCN
-    else if (e == ELIBSCN) { return MA_INVALID_FILE; }
-#endif
-#ifdef ELIBMAX
-    else if (e == ELIBMAX) { return MA_ERROR; }
-#endif
-#ifdef ELIBEXEC
-    else if (e == ELIBEXEC) { return MA_ERROR; }
-#endif
-#ifdef EILSEQ
-    else if (e == EILSEQ) { return MA_INVALID_DATA; }
-#endif
-#ifdef ERESTART
-    else if (e == ERESTART) { return MA_ERROR; }
-#endif
-#ifdef ESTRPIPE
-    else if (e == ESTRPIPE) { return MA_ERROR; }
-#endif
-#ifdef EUSERS
-    else if (e == EUSERS) { return MA_ERROR; }
-#endif
-#ifdef ENOTSOCK
-    else if (e == ENOTSOCK) { return MA_NOT_SOCKET; }
-#endif
-#ifdef EDESTADDRREQ
-    else if (e == EDESTADDRREQ) { return MA_NO_ADDRESS; }
-#endif
-#ifdef EMSGSIZE
-    else if (e == EMSGSIZE) { return MA_TOO_BIG; }
-#endif
-#ifdef EPROTOTYPE
-    else if (e == EPROTOTYPE) { return MA_BAD_PROTOCOL; }
-#endif
-#ifdef ENOPROTOOPT
-    else if (e == ENOPROTOOPT) { return MA_PROTOCOL_UNAVAILABLE; }
-#endif
-#ifdef EPROTONOSUPPORT
-    else if (e == EPROTONOSUPPORT) { return MA_PROTOCOL_NOT_SUPPORTED; }
-#endif
-#ifdef ESOCKTNOSUPPORT
-    else if (e == ESOCKTNOSUPPORT) { return MA_SOCKET_NOT_SUPPORTED; }
-#endif
-#ifdef EOPNOTSUPP
-    else if (e == EOPNOTSUPP) { return MA_INVALID_OPERATION; }
-#endif
-#ifdef EPFNOSUPPORT
-    else if (e == EPFNOSUPPORT) { return MA_PROTOCOL_FAMILY_NOT_SUPPORTED; }
-#endif
-#ifdef EAFNOSUPPORT
-    else if (e == EAFNOSUPPORT) { return MA_ADDRESS_FAMILY_NOT_SUPPORTED; }
-#endif
-#ifdef EADDRINUSE
-    else if (e == EADDRINUSE) { return MA_ALREADY_IN_USE; }
-#endif
-#ifdef EADDRNOTAVAIL
-    else if (e == EADDRNOTAVAIL) { return MA_ERROR; }
-#endif
-#ifdef ENETDOWN
-    else if (e == ENETDOWN) { return MA_NO_NETWORK; }
-#endif
-#ifdef ENETUNREACH
-    else if (e == ENETUNREACH) { return MA_NO_NETWORK; }
-#endif
-#ifdef ENETRESET
-    else if (e == ENETRESET) { return MA_NO_NETWORK; }
-#endif
-#ifdef ECONNABORTED
-    else if (e == ECONNABORTED) { return MA_NO_NETWORK; }
-#endif
-#ifdef ECONNRESET
-    else if (e == ECONNRESET) { return MA_CONNECTION_RESET; }
-#endif
-#ifdef ENOBUFS
-    else if (e == ENOBUFS) { return MA_NO_SPACE; }
-#endif
-#ifdef EISCONN
-    else if (e == EISCONN) { return MA_ALREADY_CONNECTED; }
-#endif
-#ifdef ENOTCONN
-    else if (e == ENOTCONN) { return MA_NOT_CONNECTED; }
-#endif
-#ifdef ESHUTDOWN
-    else if (e == ESHUTDOWN) { return MA_ERROR; }
-#endif
-#ifdef ETOOMANYREFS
-    else if (e == ETOOMANYREFS) { return MA_ERROR; }
-#endif
-#ifdef ETIMEDOUT
-    else if (e == ETIMEDOUT) { return MA_TIMEOUT; }
-#endif
-#ifdef ECONNREFUSED
-    else if (e == ECONNREFUSED) { return MA_CONNECTION_REFUSED; }
-#endif
-#ifdef EHOSTDOWN
-    else if (e == EHOSTDOWN) { return MA_NO_HOST; }
-#endif
-#ifdef EHOSTUNREACH
-    else if (e == EHOSTUNREACH) { return MA_NO_HOST; }
-#endif
-#ifdef EALREADY
-    else if (e == EALREADY) { return MA_IN_PROGRESS; }
-#endif
-#ifdef EINPROGRESS
-    else if (e == EINPROGRESS) { return MA_IN_PROGRESS; }
-#endif
-#ifdef ESTALE
-    else if (e == ESTALE) { return MA_INVALID_FILE; }
-#endif
-#ifdef EUCLEAN
-    else if (e == EUCLEAN) { return MA_ERROR; }
-#endif
-#ifdef ENOTNAM
-    else if (e == ENOTNAM) { return MA_ERROR; }
-#endif
-#ifdef ENAVAIL
-    else if (e == ENAVAIL) { return MA_ERROR; }
-#endif
-#ifdef EISNAM
-    else if (e == EISNAM) { return MA_ERROR; }
-#endif
-#ifdef EREMOTEIO
-    else if (e == EREMOTEIO) { return MA_IO_ERROR; }
-#endif
-#ifdef EDQUOT
-    else if (e == EDQUOT) { return MA_NO_SPACE; }
-#endif
-#ifdef ENOMEDIUM
-    else if (e == ENOMEDIUM) { return MA_DOES_NOT_EXIST; }
-#endif
-#ifdef EMEDIUMTYPE
-    else if (e == EMEDIUMTYPE) { return MA_ERROR; }
-#endif
-#ifdef ECANCELED
-    else if (e == ECANCELED) { return MA_CANCELLED; }
-#endif
-#ifdef ENOKEY
-    else if (e == ENOKEY) { return MA_ERROR; }
-#endif
-#ifdef EKEYEXPIRED
-    else if (e == EKEYEXPIRED) { return MA_ERROR; }
-#endif
-#ifdef EKEYREVOKED
-    else if (e == EKEYREVOKED) { return MA_ERROR; }
-#endif
-#ifdef EKEYREJECTED
-    else if (e == EKEYREJECTED) { return MA_ERROR; }
-#endif
-#ifdef EOWNERDEAD
-    else if (e == EOWNERDEAD) { return MA_ERROR; }
-#endif
-#ifdef ENOTRECOVERABLE
-    else if (e == ENOTRECOVERABLE) { return MA_ERROR; }
-#endif
-#ifdef ERFKILL
-    else if (e == ERFKILL) { return MA_ERROR; }
-#endif
-#ifdef EHWPOISON
-    else if (e == EHWPOISON) { return MA_ERROR; }
-#endif
-    else {
-        return MA_ERROR;
-    }
-}
-
-MA_API ma_result ma_fopen(FILE** ppFile, const char* pFilePath, const char* pOpenMode)
-{
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-    errno_t err;
-#endif
-
-    if (ppFile != NULL) {
-        *ppFile = NULL;  /* Safety. */
-    }
-
-    if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-    err = fopen_s(ppFile, pFilePath, pOpenMode);
-    if (err != 0) {
-        return ma_result_from_errno(err);
-    }
-#else
-#if defined(_WIN32) || defined(__APPLE__)
-    *ppFile = fopen(pFilePath, pOpenMode);
-#else
-    #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 && defined(_LARGEFILE64_SOURCE)
-        *ppFile = fopen64(pFilePath, pOpenMode);
-    #else
-        *ppFile = fopen(pFilePath, pOpenMode);
-    #endif
-#endif
-    if (*ppFile == NULL) {
-        ma_result result = ma_result_from_errno(errno);
-        if (result == MA_SUCCESS) {
-            result = MA_ERROR;   /* Just a safety check to make sure we never ever return success when pFile == NULL. */
-        }
-
-        return result;
-    }
-#endif
-
-    return MA_SUCCESS;
-}
-
-
-
-/*
-_wfopen() isn't always available in all compilation environments.
-
-    * Windows only.
-    * MSVC seems to support it universally as far back as VC6 from what I can tell (haven't checked further back).
-    * MinGW-64 (both 32- and 64-bit) seems to support it.
-    * MinGW wraps it in !defined(__STRICT_ANSI__).
-    * OpenWatcom wraps it in !defined(_NO_EXT_KEYS).
-
-This can be reviewed as compatibility issues arise. The preference is to use _wfopen_s() and _wfopen() as opposed to the wcsrtombs()
-fallback, so if you notice your compiler not detecting this properly I'm happy to look at adding support.
-*/
-#if defined(_WIN32)
-    #if defined(_MSC_VER) || defined(__MINGW64__) || (!defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS))
-        #define MA_HAS_WFOPEN
-    #endif
-#endif
-
-MA_API ma_result ma_wfopen(FILE** ppFile, const wchar_t* pFilePath, const wchar_t* pOpenMode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (ppFile != NULL) {
-        *ppFile = NULL;  /* Safety. */
-    }
-
-    if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_HAS_WFOPEN)
-    {
-        /* Use _wfopen() on Windows. */
-    #if defined(_MSC_VER) && _MSC_VER >= 1400
-        errno_t err = _wfopen_s(ppFile, pFilePath, pOpenMode);
-        if (err != 0) {
-            return ma_result_from_errno(err);
-        }
-    #else
-        *ppFile = _wfopen(pFilePath, pOpenMode);
-        if (*ppFile == NULL) {
-            return ma_result_from_errno(errno);
-        }
-    #endif
-        (void)pAllocationCallbacks;
-    }
-#else
-    /*
-    Use fopen() on anything other than Windows. Requires a conversion. This is annoying because fopen() is locale specific. The only real way I can
-    think of to do this is with wcsrtombs(). Note that wcstombs() is apparently not thread-safe because it uses a static global mbstate_t object for
-    maintaining state. I've checked this with -std=c89 and it works, but if somebody get's a compiler error I'll look into improving compatibility.
-    */
-    {
-        mbstate_t mbs;
-        size_t lenMB;
-        const wchar_t* pFilePathTemp = pFilePath;
-        char* pFilePathMB = NULL;
-        char pOpenModeMB[32] = {0};
-
-        /* Get the length first. */
-        MA_ZERO_OBJECT(&mbs);
-        lenMB = wcsrtombs(NULL, &pFilePathTemp, 0, &mbs);
-        if (lenMB == (size_t)-1) {
-            return ma_result_from_errno(errno);
-        }
-
-        pFilePathMB = (char*)ma_malloc(lenMB + 1, pAllocationCallbacks);
-        if (pFilePathMB == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        pFilePathTemp = pFilePath;
-        MA_ZERO_OBJECT(&mbs);
-        wcsrtombs(pFilePathMB, &pFilePathTemp, lenMB + 1, &mbs);
-
-        /* The open mode should always consist of ASCII characters so we should be able to do a trivial conversion. */
-        {
-            size_t i = 0;
-            for (;;) {
-                if (pOpenMode[i] == 0) {
-                    pOpenModeMB[i] = '\0';
-                    break;
-                }
-
-                pOpenModeMB[i] = (char)pOpenMode[i];
-                i += 1;
-            }
-        }
-
-        *ppFile = fopen(pFilePathMB, pOpenModeMB);
-
-        ma_free(pFilePathMB, pAllocationCallbacks);
-    }
-
-    if (*ppFile == NULL) {
-        return MA_ERROR;
-    }
-#endif
-
-    return MA_SUCCESS;
-}
-
-
-
-static MA_INLINE void ma_copy_memory_64(void* dst, const void* src, ma_uint64 sizeInBytes)
-{
-#if 0xFFFFFFFFFFFFFFFF <= MA_SIZE_MAX
-    MA_COPY_MEMORY(dst, src, (size_t)sizeInBytes);
-#else
-    while (sizeInBytes > 0) {
-        ma_uint64 bytesToCopyNow = sizeInBytes;
-        if (bytesToCopyNow > MA_SIZE_MAX) {
-            bytesToCopyNow = MA_SIZE_MAX;
-        }
-
-        MA_COPY_MEMORY(dst, src, (size_t)bytesToCopyNow);  /* Safe cast to size_t. */
-
-        sizeInBytes -= bytesToCopyNow;
-        dst = (      void*)((      ma_uint8*)dst + bytesToCopyNow);
-        src = (const void*)((const ma_uint8*)src + bytesToCopyNow);
-    }
-#endif
-}
-
-static MA_INLINE void ma_zero_memory_64(void* dst, ma_uint64 sizeInBytes)
-{
-#if 0xFFFFFFFFFFFFFFFF <= MA_SIZE_MAX
-    MA_ZERO_MEMORY(dst, (size_t)sizeInBytes);
-#else
-    while (sizeInBytes > 0) {
-        ma_uint64 bytesToZeroNow = sizeInBytes;
-        if (bytesToZeroNow > MA_SIZE_MAX) {
-            bytesToZeroNow = MA_SIZE_MAX;
-        }
-
-        MA_ZERO_MEMORY(dst, (size_t)bytesToZeroNow);  /* Safe cast to size_t. */
-
-        sizeInBytes -= bytesToZeroNow;
-        dst = (void*)((ma_uint8*)dst + bytesToZeroNow);
-    }
-#endif
-}
-
-
-/* Thanks to good old Bit Twiddling Hacks for this one: http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
-static MA_INLINE unsigned int ma_next_power_of_2(unsigned int x)
-{
-    x--;
-    x |= x >> 1;
-    x |= x >> 2;
-    x |= x >> 4;
-    x |= x >> 8;
-    x |= x >> 16;
-    x++;
-
-    return x;
-}
-
-static MA_INLINE unsigned int ma_prev_power_of_2(unsigned int x)
-{
-    return ma_next_power_of_2(x) >> 1;
-}
-
-static MA_INLINE unsigned int ma_round_to_power_of_2(unsigned int x)
-{
-    unsigned int prev = ma_prev_power_of_2(x);
-    unsigned int next = ma_next_power_of_2(x);
-    if ((next - x) > (x - prev)) {
-        return prev;
-    } else {
-        return next;
-    }
-}
-
-static MA_INLINE unsigned int ma_count_set_bits(unsigned int x)
-{
-    unsigned int count = 0;
-    while (x != 0) {
-        if (x & 1) {
-            count += 1;
-        }
-
-        x = x >> 1;
-    }
-
-    return count;
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-Allocation Callbacks
-
-**************************************************************************************************************************************************************/
-static void* ma__malloc_default(size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_MALLOC(sz);
-}
-
-static void* ma__realloc_default(void* p, size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_REALLOC(p, sz);
-}
-
-static void ma__free_default(void* p, void* pUserData)
-{
-    (void)pUserData;
-    MA_FREE(p);
-}
-
-static ma_allocation_callbacks ma_allocation_callbacks_init_default(void)
-{
-    ma_allocation_callbacks callbacks;
-    callbacks.pUserData = NULL;
-    callbacks.onMalloc  = ma__malloc_default;
-    callbacks.onRealloc = ma__realloc_default;
-    callbacks.onFree    = ma__free_default;
-
-    return callbacks;
-}
-
-static ma_result ma_allocation_callbacks_init_copy(ma_allocation_callbacks* pDst, const ma_allocation_callbacks* pSrc)
-{
-    if (pDst == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pSrc == NULL) {
-        *pDst = ma_allocation_callbacks_init_default();
-    } else {
-        if (pSrc->pUserData == NULL && pSrc->onFree == NULL && pSrc->onMalloc == NULL && pSrc->onRealloc == NULL) {
-            *pDst = ma_allocation_callbacks_init_default();
-        } else {
-            if (pSrc->onFree == NULL || (pSrc->onMalloc == NULL && pSrc->onRealloc == NULL)) {
-                return MA_INVALID_ARGS;    /* Invalid allocation callbacks. */
-            } else {
-                *pDst = *pSrc;
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-
-
-/**************************************************************************************************************************************************************
-
-Logging
-
-**************************************************************************************************************************************************************/
-MA_API const char* ma_log_level_to_string(ma_uint32 logLevel)
-{
-    switch (logLevel)
-    {
-        case MA_LOG_LEVEL_DEBUG:   return "DEBUG";
-        case MA_LOG_LEVEL_INFO:    return "INFO";
-        case MA_LOG_LEVEL_WARNING: return "WARNING";
-        case MA_LOG_LEVEL_ERROR:   return "ERROR";
-        default:                   return "ERROR";
-    }
-}
-
-#if defined(MA_DEBUG_OUTPUT)
-#if defined(MA_ANDROID)
-    #include <android/log.h>
-#endif
-
-/* Customize this to use a specific tag in __android_log_print() for debug output messages. */
-#ifndef MA_ANDROID_LOG_TAG
-#define MA_ANDROID_LOG_TAG  "miniaudio"
-#endif
-
-void ma_log_callback_debug(void* pUserData, ma_uint32 level, const char* pMessage)
-{
-    (void)pUserData;
-
-    /* Special handling for some platforms. */
-    #if defined(MA_ANDROID)
-    {
-        /* Android. */
-        __android_log_print(ANDROID_LOG_DEBUG, MA_ANDROID_LOG_TAG, "%s: %s", ma_log_level_to_string(level), pMessage);
-    }
-    #else
-    {
-        /* Everything else. */
-        printf("%s: %s", ma_log_level_to_string(level), pMessage);
-    }
-    #endif
-}
-#endif
-
-MA_API ma_log_callback ma_log_callback_init(ma_log_callback_proc onLog, void* pUserData)
-{
-    ma_log_callback callback;
-
-    MA_ZERO_OBJECT(&callback);
-    callback.onLog     = onLog;
-    callback.pUserData = pUserData;
-
-    return callback;
-}
-
-
-MA_API ma_result ma_log_init(const ma_allocation_callbacks* pAllocationCallbacks, ma_log* pLog)
-{
-    if (pLog == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pLog);
-    ma_allocation_callbacks_init_copy(&pLog->allocationCallbacks, pAllocationCallbacks);
-
-    /* We need a mutex for thread safety. */
-    #ifndef MA_NO_THREADING
-    {
-        ma_result result = ma_mutex_init(&pLog->lock);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-    #endif
-
-    /* If we're using debug output, enable it. */
-    #if defined(MA_DEBUG_OUTPUT)
-    {
-        ma_log_register_callback(pLog, ma_log_callback_init(ma_log_callback_debug, NULL)); /* Doesn't really matter if this fails. */
-    }
-    #endif
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_log_uninit(ma_log* pLog)
-{
-    if (pLog == NULL) {
-        return;
-    }
-
-#ifndef MA_NO_THREADING
-    ma_mutex_uninit(&pLog->lock);
-#endif
-}
-
-static void ma_log_lock(ma_log* pLog)
-{
-#ifndef MA_NO_THREADING
-    ma_mutex_lock(&pLog->lock);
-#else
-    (void)pLog;
-#endif
-}
-
-static void ma_log_unlock(ma_log* pLog)
-{
-#ifndef MA_NO_THREADING
-    ma_mutex_unlock(&pLog->lock);
-#else
-    (void)pLog;
-#endif
-}
-
-MA_API ma_result ma_log_register_callback(ma_log* pLog, ma_log_callback callback)
-{
-    ma_result result = MA_SUCCESS;
-
-    if (pLog == NULL || callback.onLog == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_log_lock(pLog);
-    {
-        if (pLog->callbackCount == ma_countof(pLog->callbacks)) {
-            result = MA_OUT_OF_MEMORY;  /* Reached the maximum allowed log callbacks. */
-        } else {
-            pLog->callbacks[pLog->callbackCount] = callback;
-            pLog->callbackCount += 1;
-        }
-    }
-    ma_log_unlock(pLog);
-
-    return result;
-}
-
-MA_API ma_result ma_log_unregister_callback(ma_log* pLog, ma_log_callback callback)
-{
-    if (pLog == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_log_lock(pLog);
-    {
-        ma_uint32 iLog;
-        for (iLog = 0; iLog < pLog->callbackCount; ) {
-            if (pLog->callbacks[iLog].onLog == callback.onLog) {
-                /* Found. Move everything down a slot. */
-                ma_uint32 jLog;
-                for (jLog = iLog; jLog < pLog->callbackCount-1; jLog += 1) {
-                    pLog->callbacks[jLog] = pLog->callbacks[jLog + 1];
-                }
-
-                pLog->callbackCount -= 1;
-            } else {
-                /* Not found. */
-                iLog += 1;
-            }
-        }
-    }
-    ma_log_unlock(pLog);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_log_post(ma_log* pLog, ma_uint32 level, const char* pMessage)
-{
-    if (pLog == NULL || pMessage == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_log_lock(pLog);
-    {
-        ma_uint32 iLog;
-        for (iLog = 0; iLog < pLog->callbackCount; iLog += 1) {
-            if (pLog->callbacks[iLog].onLog) {
-                pLog->callbacks[iLog].onLog(pLog->callbacks[iLog].pUserData, level, pMessage);
-            }
-        }
-    }
-    ma_log_unlock(pLog);
-
-    return MA_SUCCESS;
-}
-
-
-/*
-We need to emulate _vscprintf() for the VC6 build. This can be more efficient, but since it's only VC6, and it's just a
-logging function, I'm happy to keep this simple. In the VC6 build we can implement this in terms of _vsnprintf().
-*/
-#if defined(_MSC_VER) && _MSC_VER < 1900
-static int ma_vscprintf(const ma_allocation_callbacks* pAllocationCallbacks, const char* format, va_list args)
-{
-#if _MSC_VER > 1200
-    return _vscprintf(format, args);
-#else
-    int result;
-    char* pTempBuffer = NULL;
-    size_t tempBufferCap = 1024;
-
-    if (format == NULL) {
-        errno = EINVAL;
-        return -1;
-    }
-
-    for (;;) {
-        char* pNewTempBuffer = (char*)ma_realloc(pTempBuffer, tempBufferCap, pAllocationCallbacks);
-        if (pNewTempBuffer == NULL) {
-            ma_free(pTempBuffer, pAllocationCallbacks);
-            errno = ENOMEM;
-            return -1;  /* Out of memory. */
-        }
-
-        pTempBuffer = pNewTempBuffer;
-
-        result = _vsnprintf(pTempBuffer, tempBufferCap, format, args);
-        ma_free(pTempBuffer, NULL);
-
-        if (result != -1) {
-            break;  /* Got it. */
-        }
-
-        /* Buffer wasn't big enough. Ideally it'd be nice to use an error code to know the reason for sure, but this is reliable enough. */
-        tempBufferCap *= 2;
-    }
-
-    return result;
-#endif
-}
-#endif
-
-MA_API ma_result ma_log_postv(ma_log* pLog, ma_uint32 level, const char* pFormat, va_list args)
-{
-    if (pLog == NULL || pFormat == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || ((!defined(_MSC_VER) || _MSC_VER >= 1900) && !defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS)) || (defined(__cplusplus) && __cplusplus >= 201103L)
-    {
-        ma_result result;
-        int length;
-        char  pFormattedMessageStack[1024];
-        char* pFormattedMessageHeap = NULL;
-
-        /* First try formatting into our fixed sized stack allocated buffer. If this is too small we'll fallback to a heap allocation. */
-        length = vsnprintf(pFormattedMessageStack, sizeof(pFormattedMessageStack), pFormat, args);
-        if (length < 0) {
-            return MA_INVALID_OPERATION;    /* An error occurred when trying to convert the buffer. */
-        }
-
-        if ((size_t)length < sizeof(pFormattedMessageStack)) {
-            /* The string was written to the stack. */
-            result = ma_log_post(pLog, level, pFormattedMessageStack);
-        } else {
-            /* The stack buffer was too small, try the heap. */
-            pFormattedMessageHeap = (char*)ma_malloc(length + 1, &pLog->allocationCallbacks);
-            if (pFormattedMessageHeap == NULL) {
-                return MA_OUT_OF_MEMORY;
-            }
-
-            length = vsnprintf(pFormattedMessageHeap, length + 1, pFormat, args);
-            if (length < 0) {
-                ma_free(pFormattedMessageHeap, &pLog->allocationCallbacks);
-                return MA_INVALID_OPERATION;
-            }
-
-            result = ma_log_post(pLog, level, pFormattedMessageHeap);
-            ma_free(pFormattedMessageHeap, &pLog->allocationCallbacks);
-        }
-
-        return result;
-    }
-    #else
-    {
-        /*
-        Without snprintf() we need to first measure the string and then heap allocate it. I'm only aware of Visual Studio having support for this without snprintf(), so we'll
-        need to restrict this branch to Visual Studio. For other compilers we need to just not support formatted logging because I don't want the security risk of overflowing
-        a fixed sized stack allocated buffer.
-        */
-        #if defined(_MSC_VER) && _MSC_VER >= 1200   /* 1200 = VC6 */
-        {
-            ma_result result;
-            int formattedLen;
-            char* pFormattedMessage = NULL;
-            va_list args2;
-
-            #if _MSC_VER >= 1800
-            {
-                va_copy(args2, args);
-            }
-            #else
-            {
-                args2 = args;
-            }
-            #endif
-
-            formattedLen = ma_vscprintf(&pLog->allocationCallbacks, pFormat, args2);
-            va_end(args2);
-
-            if (formattedLen <= 0) {
-                return MA_INVALID_OPERATION;
-            }
-
-            pFormattedMessage = (char*)ma_malloc(formattedLen + 1, &pLog->allocationCallbacks);
-            if (pFormattedMessage == NULL) {
-                return MA_OUT_OF_MEMORY;
-            }
-
-            /* We'll get errors on newer versions of Visual Studio if we try to use vsprintf().  */
-            #if _MSC_VER >= 1400    /* 1400 = Visual Studio 2005 */
-            {
-                vsprintf_s(pFormattedMessage, formattedLen + 1, pFormat, args);
-            }
-            #else
-            {
-                vsprintf(pFormattedMessage, pFormat, args);
-            }
-            #endif
-
-            result = ma_log_post(pLog, level, pFormattedMessage);
-            ma_free(pFormattedMessage, &pLog->allocationCallbacks);
-
-            return result;
-        }
-        #else
-        {
-            /* Can't do anything because we don't have a safe way of to emulate vsnprintf() without a manual solution. */
-            (void)level;
-            (void)args;
-
-            return MA_INVALID_OPERATION;
-        }
-        #endif
-    }
-    #endif
-}
-
-MA_API ma_result ma_log_postf(ma_log* pLog, ma_uint32 level, const char* pFormat, ...)
-{
-    ma_result result;
-    va_list args;
-
-    if (pLog == NULL || pFormat == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    va_start(args, pFormat);
-    {
-        result = ma_log_postv(pLog, level, pFormat, args);
-    }
-    va_end(args);
-
-    return result;
-}
-
-
-
-static MA_INLINE ma_uint8 ma_clip_u8(ma_int32 x)
-{
-    return (ma_uint8)(ma_clamp(x, -128, 127) + 128);
-}
-
-static MA_INLINE ma_int16 ma_clip_s16(ma_int32 x)
-{
-    return (ma_int16)ma_clamp(x, -32768, 32767);
-}
-
-static MA_INLINE ma_int64 ma_clip_s24(ma_int64 x)
-{
-    return (ma_int64)ma_clamp(x, -8388608, 8388607);
-}
-
-static MA_INLINE ma_int32 ma_clip_s32(ma_int64 x)
-{
-    /* This dance is to silence warnings with -std=c89. A good compiler should be able to optimize this away. */
-    ma_int64 clipMin;
-    ma_int64 clipMax;
-    clipMin = -((ma_int64)2147483647 + 1);
-    clipMax =   (ma_int64)2147483647;
-
-    return (ma_int32)ma_clamp(x, clipMin, clipMax);
-}
-
-static MA_INLINE float ma_clip_f32(float x)
-{
-    if (x < -1) return -1;
-    if (x > +1) return +1;
-    return x;
-}
-
-
-static MA_INLINE float ma_mix_f32(float x, float y, float a)
-{
-    return x*(1-a) + y*a;
-}
-static MA_INLINE float ma_mix_f32_fast(float x, float y, float a)
-{
-    float r0 = (y - x);
-    float r1 = r0*a;
-    return x + r1;
-    /*return x + (y - x)*a;*/
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE __m128 ma_mix_f32_fast__sse2(__m128 x, __m128 y, __m128 a)
-{
-    return _mm_add_ps(x, _mm_mul_ps(_mm_sub_ps(y, x), a));
-}
-#endif
-#if defined(MA_SUPPORT_AVX2)
-static MA_INLINE __m256 ma_mix_f32_fast__avx2(__m256 x, __m256 y, __m256 a)
-{
-    return _mm256_add_ps(x, _mm256_mul_ps(_mm256_sub_ps(y, x), a));
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE float32x4_t ma_mix_f32_fast__neon(float32x4_t x, float32x4_t y, float32x4_t a)
-{
-    return vaddq_f32(x, vmulq_f32(vsubq_f32(y, x), a));
-}
-#endif
-
-
-static MA_INLINE double ma_mix_f64(double x, double y, double a)
-{
-    return x*(1-a) + y*a;
-}
-static MA_INLINE double ma_mix_f64_fast(double x, double y, double a)
-{
-    return x + (y - x)*a;
-}
-
-static MA_INLINE float ma_scale_to_range_f32(float x, float lo, float hi)
-{
-    return lo + x*(hi-lo);
-}
-
-
-/*
-Greatest common factor using Euclid's algorithm iteratively.
-*/
-static MA_INLINE ma_uint32 ma_gcf_u32(ma_uint32 a, ma_uint32 b)
-{
-    for (;;) {
-        if (b == 0) {
-            break;
-        } else {
-            ma_uint32 t = a;
-            a = b;
-            b = t % a;
-        }
-    }
-
-    return a;
-}
-
-
-static ma_uint32 ma_ffs_32(ma_uint32 x)
-{
-    ma_uint32 i;
-
-    /* Just a naive implementation just to get things working for now. Will optimize this later. */
-    for (i = 0; i < 32; i += 1) {
-        if ((x & (1 << i)) != 0) {
-            return i;
-        }
-    }
-
-    return i;
-}
-
-static MA_INLINE ma_int16 ma_float_to_fixed_16(float x)
-{
-    return (ma_int16)(x * (1 << 8));
-}
-
-
-
-/*
-Random Number Generation
-
-miniaudio uses the LCG random number generation algorithm. This is good enough for audio.
-
-Note that miniaudio's global LCG implementation uses global state which is _not_ thread-local. When this is called across
-multiple threads, results will be unpredictable. However, it won't crash and results will still be random enough for
-miniaudio's purposes.
-*/
-#ifndef MA_DEFAULT_LCG_SEED
-#define MA_DEFAULT_LCG_SEED 4321
-#endif
-
-#define MA_LCG_M   2147483647
-#define MA_LCG_A   48271
-#define MA_LCG_C   0
-
-static ma_lcg g_maLCG = {MA_DEFAULT_LCG_SEED}; /* Non-zero initial seed. Use ma_seed() to use an explicit seed. */
-
-static MA_INLINE void ma_lcg_seed(ma_lcg* pLCG, ma_int32 seed)
-{
-    MA_ASSERT(pLCG != NULL);
-    pLCG->state = seed;
-}
-
-static MA_INLINE ma_int32 ma_lcg_rand_s32(ma_lcg* pLCG)
-{
-    pLCG->state = (MA_LCG_A * pLCG->state + MA_LCG_C) % MA_LCG_M;
-    return pLCG->state;
-}
-
-static MA_INLINE ma_uint32 ma_lcg_rand_u32(ma_lcg* pLCG)
-{
-    return (ma_uint32)ma_lcg_rand_s32(pLCG);
-}
-
-static MA_INLINE ma_int16 ma_lcg_rand_s16(ma_lcg* pLCG)
-{
-    return (ma_int16)(ma_lcg_rand_s32(pLCG) & 0xFFFF);
-}
-
-static MA_INLINE double ma_lcg_rand_f64(ma_lcg* pLCG)
-{
-    return ma_lcg_rand_s32(pLCG) / (double)0x7FFFFFFF;
-}
-
-static MA_INLINE float ma_lcg_rand_f32(ma_lcg* pLCG)
-{
-    return (float)ma_lcg_rand_f64(pLCG);
-}
-
-static MA_INLINE float ma_lcg_rand_range_f32(ma_lcg* pLCG, float lo, float hi)
-{
-    return ma_scale_to_range_f32(ma_lcg_rand_f32(pLCG), lo, hi);
-}
-
-static MA_INLINE ma_int32 ma_lcg_rand_range_s32(ma_lcg* pLCG, ma_int32 lo, ma_int32 hi)
-{
-    if (lo == hi) {
-        return lo;
-    }
-
-    return lo + ma_lcg_rand_u32(pLCG) / (0xFFFFFFFF / (hi - lo + 1) + 1);
-}
-
-
-
-static MA_INLINE void ma_seed(ma_int32 seed)
-{
-    ma_lcg_seed(&g_maLCG, seed);
-}
-
-static MA_INLINE ma_int32 ma_rand_s32(void)
-{
-    return ma_lcg_rand_s32(&g_maLCG);
-}
-
-static MA_INLINE ma_uint32 ma_rand_u32(void)
-{
-    return ma_lcg_rand_u32(&g_maLCG);
-}
-
-static MA_INLINE double ma_rand_f64(void)
-{
-    return ma_lcg_rand_f64(&g_maLCG);
-}
-
-static MA_INLINE float ma_rand_f32(void)
-{
-    return ma_lcg_rand_f32(&g_maLCG);
-}
-
-static MA_INLINE float ma_rand_range_f32(float lo, float hi)
-{
-    return ma_lcg_rand_range_f32(&g_maLCG, lo, hi);
-}
-
-static MA_INLINE ma_int32 ma_rand_range_s32(ma_int32 lo, ma_int32 hi)
-{
-    return ma_lcg_rand_range_s32(&g_maLCG, lo, hi);
-}
-
-
-static MA_INLINE float ma_dither_f32_rectangle(float ditherMin, float ditherMax)
-{
-    return ma_rand_range_f32(ditherMin, ditherMax);
-}
-
-static MA_INLINE float ma_dither_f32_triangle(float ditherMin, float ditherMax)
-{
-    float a = ma_rand_range_f32(ditherMin, 0);
-    float b = ma_rand_range_f32(0, ditherMax);
-    return a + b;
-}
-
-static MA_INLINE float ma_dither_f32(ma_dither_mode ditherMode, float ditherMin, float ditherMax)
-{
-    if (ditherMode == ma_dither_mode_rectangle) {
-        return ma_dither_f32_rectangle(ditherMin, ditherMax);
-    }
-    if (ditherMode == ma_dither_mode_triangle) {
-        return ma_dither_f32_triangle(ditherMin, ditherMax);
-    }
-
-    return 0;
-}
-
-static MA_INLINE ma_int32 ma_dither_s32(ma_dither_mode ditherMode, ma_int32 ditherMin, ma_int32 ditherMax)
-{
-    if (ditherMode == ma_dither_mode_rectangle) {
-        ma_int32 a = ma_rand_range_s32(ditherMin, ditherMax);
-        return a;
-    }
-    if (ditherMode == ma_dither_mode_triangle) {
-        ma_int32 a = ma_rand_range_s32(ditherMin, 0);
-        ma_int32 b = ma_rand_range_s32(0, ditherMax);
-        return a + b;
-    }
-
-    return 0;
-}
-
-
-/**************************************************************************************************************************************************************
-
-Atomics
-
-**************************************************************************************************************************************************************/
-/* ma_atomic.h begin */
-#ifndef ma_atomic_h
-#if defined(__cplusplus)
-extern "C" {
-#endif
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wlong-long"
-    #if defined(__clang__)
-        #pragma GCC diagnostic ignored "-Wc++11-long-long"
-    #endif
-#endif
-typedef int ma_atomic_memory_order;
-#define MA_ATOMIC_HAS_8
-#define MA_ATOMIC_HAS_16
-#define MA_ATOMIC_HAS_32
-#define MA_ATOMIC_HAS_64
-#if (defined(_MSC_VER) ) || defined(__WATCOMC__) || defined(__DMC__)
-    #define MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, intrin, ma_atomicType, msvcType)   \
-        ma_atomicType result; \
-        switch (order) \
-        { \
-            case ma_atomic_memory_order_relaxed: \
-            { \
-                result = (ma_atomicType)intrin##_nf((volatile msvcType*)dst, (msvcType)src); \
-            } break; \
-            case ma_atomic_memory_order_consume: \
-            case ma_atomic_memory_order_acquire: \
-            { \
-                result = (ma_atomicType)intrin##_acq((volatile msvcType*)dst, (msvcType)src); \
-            } break; \
-            case ma_atomic_memory_order_release: \
-            { \
-                result = (ma_atomicType)intrin##_rel((volatile msvcType*)dst, (msvcType)src); \
-            } break; \
-            case ma_atomic_memory_order_acq_rel: \
-            case ma_atomic_memory_order_seq_cst: \
-            default: \
-            { \
-                result = (ma_atomicType)intrin((volatile msvcType*)dst, (msvcType)src); \
-            } break; \
-        } \
-        return result;
-    #define MA_ATOMIC_MSVC_ARM_INTRINSIC_COMPARE_EXCHANGE(ptr, expected, desired, order, intrin, ma_atomicType, msvcType)   \
-        ma_atomicType result; \
-        switch (order) \
-        { \
-            case ma_atomic_memory_order_relaxed: \
-            { \
-                result = (ma_atomicType)intrin##_nf((volatile msvcType*)ptr, (msvcType)expected, (msvcType)desired); \
-            } break; \
-            case ma_atomic_memory_order_consume: \
-            case ma_atomic_memory_order_acquire: \
-            { \
-                result = (ma_atomicType)intrin##_acq((volatile msvcType*)ptr, (msvcType)expected, (msvcType)desired); \
-            } break; \
-            case ma_atomic_memory_order_release: \
-            { \
-                result = (ma_atomicType)intrin##_rel((volatile msvcType*)ptr, (msvcType)expected, (msvcType)desired); \
-            } break; \
-            case ma_atomic_memory_order_acq_rel: \
-            case ma_atomic_memory_order_seq_cst: \
-            default: \
-            { \
-                result = (ma_atomicType)intrin((volatile msvcType*)ptr, (msvcType)expected, (msvcType)desired); \
-            } break; \
-        } \
-        return result;
-    #define ma_atomic_memory_order_relaxed  0
-    #define ma_atomic_memory_order_consume  1
-    #define ma_atomic_memory_order_acquire  2
-    #define ma_atomic_memory_order_release  3
-    #define ma_atomic_memory_order_acq_rel  4
-    #define ma_atomic_memory_order_seq_cst  5
-    #if _MSC_VER < 1600 && defined(MA_X86)
-        #define MA_ATOMIC_MSVC_USE_INLINED_ASSEMBLY
-    #endif
-    #if _MSC_VER < 1600
-        #undef MA_ATOMIC_HAS_8
-        #undef MA_ATOMIC_HAS_16
-    #endif
-    #if !defined(MA_ATOMIC_MSVC_USE_INLINED_ASSEMBLY)
-        #include <intrin.h>
-    #endif
-    #if defined(MA_ATOMIC_MSVC_USE_INLINED_ASSEMBLY)
-        #if defined(MA_ATOMIC_HAS_8)
-            static MA_INLINE ma_uint8 __stdcall ma_atomic_compare_and_swap_8(volatile ma_uint8* dst, ma_uint8 expected, ma_uint8 desired)
-            {
-                ma_uint8 result = 0;
-                __asm {
-                    mov ecx, dst
-                    mov al,  expected
-                    mov dl,  desired
-                    lock cmpxchg [ecx], dl
-                    mov result, al
-                }
-                return result;
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_16)
-            static MA_INLINE ma_uint16 __stdcall ma_atomic_compare_and_swap_16(volatile ma_uint16* dst, ma_uint16 expected, ma_uint16 desired)
-            {
-                ma_uint16 result = 0;
-                __asm {
-                    mov ecx, dst
-                    mov ax,  expected
-                    mov dx,  desired
-                    lock cmpxchg [ecx], dx
-                    mov result, ax
-                }
-                return result;
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_32)
-            static MA_INLINE ma_uint32 __stdcall ma_atomic_compare_and_swap_32(volatile ma_uint32* dst, ma_uint32 expected, ma_uint32 desired)
-            {
-                ma_uint32 result = 0;
-                __asm {
-                    mov ecx, dst
-                    mov eax, expected
-                    mov edx, desired
-                    lock cmpxchg [ecx], edx
-                    mov result, eax
-                }
-                return result;
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_64)
-            static MA_INLINE ma_uint64 __stdcall ma_atomic_compare_and_swap_64(volatile ma_uint64* dst, ma_uint64 expected, ma_uint64 desired)
-            {
-                ma_uint32 resultEAX = 0;
-                ma_uint32 resultEDX = 0;
-                __asm {
-                    mov esi, dst
-                    mov eax, dword ptr expected
-                    mov edx, dword ptr expected + 4
-                    mov ebx, dword ptr desired
-                    mov ecx, dword ptr desired + 4
-                    lock cmpxchg8b qword ptr [esi]
-                    mov resultEAX, eax
-                    mov resultEDX, edx
-                }
-                return ((ma_uint64)resultEDX << 32) | resultEAX;
-            }
-        #endif
-    #else
-        #if defined(MA_ATOMIC_HAS_8)
-            #define ma_atomic_compare_and_swap_8( dst, expected, desired) (ma_uint8 )_InterlockedCompareExchange8((volatile char*)dst, (char)desired, (char)expected)
-        #endif
-        #if defined(MA_ATOMIC_HAS_16)
-            #define ma_atomic_compare_and_swap_16(dst, expected, desired) (ma_uint16)_InterlockedCompareExchange16((volatile short*)dst, (short)desired, (short)expected)
-        #endif
-        #if defined(MA_ATOMIC_HAS_32)
-            #define ma_atomic_compare_and_swap_32(dst, expected, desired) (ma_uint32)_InterlockedCompareExchange((volatile long*)dst, (long)desired, (long)expected)
-        #endif
-        #if defined(MA_ATOMIC_HAS_64)
-            #define ma_atomic_compare_and_swap_64(dst, expected, desired) (ma_uint64)_InterlockedCompareExchange64((volatile ma_int64*)dst, (ma_int64)desired, (ma_int64)expected)
-        #endif
-    #endif
-    #if defined(MA_ATOMIC_MSVC_USE_INLINED_ASSEMBLY)
-        #if defined(MA_ATOMIC_HAS_8)
-            static MA_INLINE ma_uint8 __stdcall ma_atomic_exchange_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-            {
-                ma_uint8 result = 0;
-                (void)order;
-                __asm {
-                    mov ecx, dst
-                    mov al,  src
-                    lock xchg [ecx], al
-                    mov result, al
-                }
-                return result;
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_16)
-            static MA_INLINE ma_uint16 __stdcall ma_atomic_exchange_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-            {
-                ma_uint16 result = 0;
-                (void)order;
-                __asm {
-                    mov ecx, dst
-                    mov ax,  src
-                    lock xchg [ecx], ax
-                    mov result, ax
-                }
-                return result;
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_32)
-            static MA_INLINE ma_uint32 __stdcall ma_atomic_exchange_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-            {
-                ma_uint32 result = 0;
-                (void)order;
-                __asm {
-                    mov ecx, dst
-                    mov eax, src
-                    lock xchg [ecx], eax
-                    mov result, eax
-                }
-                return result;
-            }
-        #endif
-    #else
-        #if defined(MA_ATOMIC_HAS_8)
-            static MA_INLINE ma_uint8 __stdcall ma_atomic_exchange_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchange8, ma_uint8, char);
-            #else
-                (void)order;
-                return (ma_uint8)_InterlockedExchange8((volatile char*)dst, (char)src);
-            #endif
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_16)
-            static MA_INLINE ma_uint16 __stdcall ma_atomic_exchange_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchange16, ma_uint16, short);
-            #else
-                (void)order;
-                return (ma_uint16)_InterlockedExchange16((volatile short*)dst, (short)src);
-            #endif
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_32)
-            static MA_INLINE ma_uint32 __stdcall ma_atomic_exchange_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchange, ma_uint32, long);
-            #else
-                (void)order;
-                return (ma_uint32)_InterlockedExchange((volatile long*)dst, (long)src);
-            #endif
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_64) && defined(MA_64BIT)
-            static MA_INLINE ma_uint64 __stdcall ma_atomic_exchange_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchange64, ma_uint64, long long);
-            #else
-                (void)order;
-                return (ma_uint64)_InterlockedExchange64((volatile long long*)dst, (long long)src);
-            #endif
-            }
-        #else
-        #endif
-    #endif
-    #if defined(MA_ATOMIC_HAS_64) && !defined(MA_64BIT)
-        static MA_INLINE ma_uint64 __stdcall ma_atomic_exchange_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            do {
-                oldValue = *dst;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, src) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-    #endif
-    #if defined(MA_ATOMIC_MSVC_USE_INLINED_ASSEMBLY)
-        #if defined(MA_ATOMIC_HAS_8)
-            static MA_INLINE ma_uint8 __stdcall ma_atomic_fetch_add_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-            {
-                ma_uint8 result = 0;
-                (void)order;
-                __asm {
-                    mov ecx, dst
-                    mov al,  src
-                    lock xadd [ecx], al
-                    mov result, al
-                }
-                return result;
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_16)
-            static MA_INLINE ma_uint16 __stdcall ma_atomic_fetch_add_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-            {
-                ma_uint16 result = 0;
-                (void)order;
-                __asm {
-                    mov ecx, dst
-                    mov ax,  src
-                    lock xadd [ecx], ax
-                    mov result, ax
-                }
-                return result;
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_32)
-            static MA_INLINE ma_uint32 __stdcall ma_atomic_fetch_add_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-            {
-                ma_uint32 result = 0;
-                (void)order;
-                __asm {
-                    mov ecx, dst
-                    mov eax, src
-                    lock xadd [ecx], eax
-                    mov result, eax
-                }
-                return result;
-            }
-        #endif
-    #else
-        #if defined(MA_ATOMIC_HAS_8)
-            static MA_INLINE ma_uint8 __stdcall ma_atomic_fetch_add_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchangeAdd8, ma_uint8, char);
-            #else
-                (void)order;
-                return (ma_uint8)_InterlockedExchangeAdd8((volatile char*)dst, (char)src);
-            #endif
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_16)
-            static MA_INLINE ma_uint16 __stdcall ma_atomic_fetch_add_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchangeAdd16, ma_uint16, short);
-            #else
-                (void)order;
-                return (ma_uint16)_InterlockedExchangeAdd16((volatile short*)dst, (short)src);
-            #endif
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_32)
-            static MA_INLINE ma_uint32 __stdcall ma_atomic_fetch_add_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchangeAdd, ma_uint32, long);
-            #else
-                (void)order;
-                return (ma_uint32)_InterlockedExchangeAdd((volatile long*)dst, (long)src);
-            #endif
-            }
-        #endif
-        #if defined(MA_ATOMIC_HAS_64) && defined(MA_64BIT)
-            static MA_INLINE ma_uint64 __stdcall ma_atomic_fetch_add_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-            {
-            #if defined(MA_ARM)
-                MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedExchangeAdd64, ma_uint64, long long);
-            #else
-                (void)order;
-                return (ma_uint64)_InterlockedExchangeAdd64((volatile long long*)dst, (long long)src);
-            #endif
-            }
-        #else
-        #endif
-    #endif
-    #if defined(MA_ATOMIC_HAS_64) && !defined(MA_64BIT)
-        static MA_INLINE ma_uint64 __stdcall ma_atomic_fetch_add_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue + src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-    #endif
-    #if defined(MA_ATOMIC_MSVC_USE_INLINED_ASSEMBLY)
-        static MA_INLINE void __stdcall ma_atomic_thread_fence(ma_atomic_memory_order order)
-        {
-            (void)order;
-            __asm {
-                lock add [esp], 0
-            }
-        }
-    #else
-        #if defined(MA_X64)
-            #define ma_atomic_thread_fence(order)   __faststorefence(), (void)order
-        #elif defined(MA_ARM64)
-            #define ma_atomic_thread_fence(order)   __dmb(_ARM64_BARRIER_ISH), (void)order
-        #else
-            static MA_INLINE void ma_atomic_thread_fence(ma_atomic_memory_order order)
-            {
-                volatile ma_uint32 barrier = 0;
-                ma_atomic_fetch_add_explicit_32(&barrier, 0, order);
-            }
-        #endif
-    #endif
-    #define ma_atomic_compiler_fence()      ma_atomic_thread_fence(ma_atomic_memory_order_seq_cst)
-    #define ma_atomic_signal_fence(order)   ma_atomic_thread_fence(order)
-    #if defined(MA_ATOMIC_HAS_8)
-        static MA_INLINE ma_uint8 ma_atomic_load_explicit_8(volatile const ma_uint8* ptr, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC_COMPARE_EXCHANGE(ptr, 0, 0, order, _InterlockedCompareExchange8, ma_uint8, char);
-        #else
-            (void)order;
-            return ma_atomic_compare_and_swap_8((volatile ma_uint8*)ptr, 0, 0);
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        static MA_INLINE ma_uint16 ma_atomic_load_explicit_16(volatile const ma_uint16* ptr, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC_COMPARE_EXCHANGE(ptr, 0, 0, order, _InterlockedCompareExchange16, ma_uint16, short);
-        #else
-            (void)order;
-            return ma_atomic_compare_and_swap_16((volatile ma_uint16*)ptr, 0, 0);
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        static MA_INLINE ma_uint32 ma_atomic_load_explicit_32(volatile const ma_uint32* ptr, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC_COMPARE_EXCHANGE(ptr, 0, 0, order, _InterlockedCompareExchange, ma_uint32, long);
-        #else
-            (void)order;
-            return ma_atomic_compare_and_swap_32((volatile ma_uint32*)ptr, 0, 0);
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        static MA_INLINE ma_uint64 ma_atomic_load_explicit_64(volatile const ma_uint64* ptr, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC_COMPARE_EXCHANGE(ptr, 0, 0, order, _InterlockedCompareExchange64, ma_uint64, long long);
-        #else
-            (void)order;
-            return ma_atomic_compare_and_swap_64((volatile ma_uint64*)ptr, 0, 0);
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        #define ma_atomic_store_explicit_8( dst, src, order) (void)ma_atomic_exchange_explicit_8 (dst, src, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        #define ma_atomic_store_explicit_16(dst, src, order) (void)ma_atomic_exchange_explicit_16(dst, src, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        #define ma_atomic_store_explicit_32(dst, src, order) (void)ma_atomic_exchange_explicit_32(dst, src, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        #define ma_atomic_store_explicit_64(dst, src, order) (void)ma_atomic_exchange_explicit_64(dst, src, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        static MA_INLINE ma_uint8 __stdcall ma_atomic_fetch_sub_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue - src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        static MA_INLINE ma_uint16 __stdcall ma_atomic_fetch_sub_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue - src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        static MA_INLINE ma_uint32 __stdcall ma_atomic_fetch_sub_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue - src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        static MA_INLINE ma_uint64 __stdcall ma_atomic_fetch_sub_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue - src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        static MA_INLINE ma_uint8 __stdcall ma_atomic_fetch_and_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedAnd8, ma_uint8, char);
-        #else
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue & src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        static MA_INLINE ma_uint16 __stdcall ma_atomic_fetch_and_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedAnd16, ma_uint16, short);
-        #else
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue & src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        static MA_INLINE ma_uint32 __stdcall ma_atomic_fetch_and_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedAnd, ma_uint32, long);
-        #else
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue & src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        static MA_INLINE ma_uint64 __stdcall ma_atomic_fetch_and_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedAnd64, ma_uint64, long long);
-        #else
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue & src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        static MA_INLINE ma_uint8 __stdcall ma_atomic_fetch_xor_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedXor8, ma_uint8, char);
-        #else
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue ^ src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        static MA_INLINE ma_uint16 __stdcall ma_atomic_fetch_xor_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedXor16, ma_uint16, short);
-        #else
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue ^ src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        static MA_INLINE ma_uint32 __stdcall ma_atomic_fetch_xor_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedXor, ma_uint32, long);
-        #else
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue ^ src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        static MA_INLINE ma_uint64 __stdcall ma_atomic_fetch_xor_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedXor64, ma_uint64, long long);
-        #else
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue ^ src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        static MA_INLINE ma_uint8 __stdcall ma_atomic_fetch_or_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedOr8, ma_uint8, char);
-        #else
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue | src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        static MA_INLINE ma_uint16 __stdcall ma_atomic_fetch_or_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedOr16, ma_uint16, short);
-        #else
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue | src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        static MA_INLINE ma_uint32 __stdcall ma_atomic_fetch_or_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedOr, ma_uint32, long);
-        #else
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue | src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        static MA_INLINE ma_uint64 __stdcall ma_atomic_fetch_or_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_ARM)
-            MA_ATOMIC_MSVC_ARM_INTRINSIC(dst, src, order, _InterlockedOr64, ma_uint64, long long);
-        #else
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue | src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        #endif
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        #define ma_atomic_test_and_set_explicit_8( dst, order) ma_atomic_exchange_explicit_8 (dst, 1, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        #define ma_atomic_test_and_set_explicit_16(dst, order) ma_atomic_exchange_explicit_16(dst, 1, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        #define ma_atomic_test_and_set_explicit_32(dst, order) ma_atomic_exchange_explicit_32(dst, 1, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        #define ma_atomic_test_and_set_explicit_64(dst, order) ma_atomic_exchange_explicit_64(dst, 1, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        #define ma_atomic_clear_explicit_8( dst, order) ma_atomic_store_explicit_8 (dst, 0, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        #define ma_atomic_clear_explicit_16(dst, order) ma_atomic_store_explicit_16(dst, 0, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        #define ma_atomic_clear_explicit_32(dst, order) ma_atomic_store_explicit_32(dst, 0, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        #define ma_atomic_clear_explicit_64(dst, order) ma_atomic_store_explicit_64(dst, 0, order)
-    #endif
-    #if defined(MA_ATOMIC_HAS_8)
-        typedef ma_uint8 ma_atomic_flag;
-        #define ma_atomic_flag_test_and_set_explicit(ptr, order)    (ma_bool32)ma_atomic_test_and_set_explicit_8(ptr, order)
-        #define ma_atomic_flag_clear_explicit(ptr, order)           ma_atomic_clear_explicit_8(ptr, order)
-        #define c89atoimc_flag_load_explicit(ptr, order)            ma_atomic_load_explicit_8(ptr, order)
-    #else
-        typedef ma_uint32 ma_atomic_flag;
-        #define ma_atomic_flag_test_and_set_explicit(ptr, order)    (ma_bool32)ma_atomic_test_and_set_explicit_32(ptr, order)
-        #define ma_atomic_flag_clear_explicit(ptr, order)           ma_atomic_clear_explicit_32(ptr, order)
-        #define c89atoimc_flag_load_explicit(ptr, order)            ma_atomic_load_explicit_32(ptr, order)
-    #endif
-#elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
-    #define MA_ATOMIC_HAS_NATIVE_COMPARE_EXCHANGE
-    #define MA_ATOMIC_HAS_NATIVE_IS_LOCK_FREE
-    #define ma_atomic_memory_order_relaxed                          __ATOMIC_RELAXED
-    #define ma_atomic_memory_order_consume                          __ATOMIC_CONSUME
-    #define ma_atomic_memory_order_acquire                          __ATOMIC_ACQUIRE
-    #define ma_atomic_memory_order_release                          __ATOMIC_RELEASE
-    #define ma_atomic_memory_order_acq_rel                          __ATOMIC_ACQ_REL
-    #define ma_atomic_memory_order_seq_cst                          __ATOMIC_SEQ_CST
-    #define ma_atomic_compiler_fence()                              __asm__ __volatile__("":::"memory")
-    #define ma_atomic_thread_fence(order)                           __atomic_thread_fence(order)
-    #define ma_atomic_signal_fence(order)                           __atomic_signal_fence(order)
-    #define ma_atomic_is_lock_free_8(ptr)                           __atomic_is_lock_free(1, ptr)
-    #define ma_atomic_is_lock_free_16(ptr)                          __atomic_is_lock_free(2, ptr)
-    #define ma_atomic_is_lock_free_32(ptr)                          __atomic_is_lock_free(4, ptr)
-    #define ma_atomic_is_lock_free_64(ptr)                          __atomic_is_lock_free(8, ptr)
-    #define ma_atomic_test_and_set_explicit_8( dst, order)          __atomic_exchange_n(dst, 1, order)
-    #define ma_atomic_test_and_set_explicit_16(dst, order)          __atomic_exchange_n(dst, 1, order)
-    #define ma_atomic_test_and_set_explicit_32(dst, order)          __atomic_exchange_n(dst, 1, order)
-    #define ma_atomic_test_and_set_explicit_64(dst, order)          __atomic_exchange_n(dst, 1, order)
-    #define ma_atomic_clear_explicit_8( dst, order)                 __atomic_store_n(dst, 0, order)
-    #define ma_atomic_clear_explicit_16(dst, order)                 __atomic_store_n(dst, 0, order)
-    #define ma_atomic_clear_explicit_32(dst, order)                 __atomic_store_n(dst, 0, order)
-    #define ma_atomic_clear_explicit_64(dst, order)                 __atomic_store_n(dst, 0, order)
-    #define ma_atomic_store_explicit_8( dst, src, order)            __atomic_store_n(dst, src, order)
-    #define ma_atomic_store_explicit_16(dst, src, order)            __atomic_store_n(dst, src, order)
-    #define ma_atomic_store_explicit_32(dst, src, order)            __atomic_store_n(dst, src, order)
-    #define ma_atomic_store_explicit_64(dst, src, order)            __atomic_store_n(dst, src, order)
-    #define ma_atomic_load_explicit_8( dst, order)                  __atomic_load_n(dst, order)
-    #define ma_atomic_load_explicit_16(dst, order)                  __atomic_load_n(dst, order)
-    #define ma_atomic_load_explicit_32(dst, order)                  __atomic_load_n(dst, order)
-    #define ma_atomic_load_explicit_64(dst, order)                  __atomic_load_n(dst, order)
-    #define ma_atomic_exchange_explicit_8( dst, src, order)         __atomic_exchange_n(dst, src, order)
-    #define ma_atomic_exchange_explicit_16(dst, src, order)         __atomic_exchange_n(dst, src, order)
-    #define ma_atomic_exchange_explicit_32(dst, src, order)         __atomic_exchange_n(dst, src, order)
-    #define ma_atomic_exchange_explicit_64(dst, src, order)         __atomic_exchange_n(dst, src, order)
-    #define ma_atomic_compare_exchange_strong_explicit_8( dst, expected, desired, successOrder, failureOrder)   __atomic_compare_exchange_n(dst, expected, desired, 0, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_strong_explicit_16(dst, expected, desired, successOrder, failureOrder)   __atomic_compare_exchange_n(dst, expected, desired, 0, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_strong_explicit_32(dst, expected, desired, successOrder, failureOrder)   __atomic_compare_exchange_n(dst, expected, desired, 0, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_strong_explicit_64(dst, expected, desired, successOrder, failureOrder)   __atomic_compare_exchange_n(dst, expected, desired, 0, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_weak_explicit_8( dst, expected, desired, successOrder, failureOrder)     __atomic_compare_exchange_n(dst, expected, desired, 1, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_weak_explicit_16(dst, expected, desired, successOrder, failureOrder)     __atomic_compare_exchange_n(dst, expected, desired, 1, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_weak_explicit_32(dst, expected, desired, successOrder, failureOrder)     __atomic_compare_exchange_n(dst, expected, desired, 1, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_weak_explicit_64(dst, expected, desired, successOrder, failureOrder)     __atomic_compare_exchange_n(dst, expected, desired, 1, successOrder, failureOrder)
-    #define ma_atomic_fetch_add_explicit_8( dst, src, order)        __atomic_fetch_add(dst, src, order)
-    #define ma_atomic_fetch_add_explicit_16(dst, src, order)        __atomic_fetch_add(dst, src, order)
-    #define ma_atomic_fetch_add_explicit_32(dst, src, order)        __atomic_fetch_add(dst, src, order)
-    #define ma_atomic_fetch_add_explicit_64(dst, src, order)        __atomic_fetch_add(dst, src, order)
-    #define ma_atomic_fetch_sub_explicit_8( dst, src, order)        __atomic_fetch_sub(dst, src, order)
-    #define ma_atomic_fetch_sub_explicit_16(dst, src, order)        __atomic_fetch_sub(dst, src, order)
-    #define ma_atomic_fetch_sub_explicit_32(dst, src, order)        __atomic_fetch_sub(dst, src, order)
-    #define ma_atomic_fetch_sub_explicit_64(dst, src, order)        __atomic_fetch_sub(dst, src, order)
-    #define ma_atomic_fetch_or_explicit_8( dst, src, order)         __atomic_fetch_or(dst, src, order)
-    #define ma_atomic_fetch_or_explicit_16(dst, src, order)         __atomic_fetch_or(dst, src, order)
-    #define ma_atomic_fetch_or_explicit_32(dst, src, order)         __atomic_fetch_or(dst, src, order)
-    #define ma_atomic_fetch_or_explicit_64(dst, src, order)         __atomic_fetch_or(dst, src, order)
-    #define ma_atomic_fetch_xor_explicit_8( dst, src, order)        __atomic_fetch_xor(dst, src, order)
-    #define ma_atomic_fetch_xor_explicit_16(dst, src, order)        __atomic_fetch_xor(dst, src, order)
-    #define ma_atomic_fetch_xor_explicit_32(dst, src, order)        __atomic_fetch_xor(dst, src, order)
-    #define ma_atomic_fetch_xor_explicit_64(dst, src, order)        __atomic_fetch_xor(dst, src, order)
-    #define ma_atomic_fetch_and_explicit_8( dst, src, order)        __atomic_fetch_and(dst, src, order)
-    #define ma_atomic_fetch_and_explicit_16(dst, src, order)        __atomic_fetch_and(dst, src, order)
-    #define ma_atomic_fetch_and_explicit_32(dst, src, order)        __atomic_fetch_and(dst, src, order)
-    #define ma_atomic_fetch_and_explicit_64(dst, src, order)        __atomic_fetch_and(dst, src, order)
-    static MA_INLINE ma_uint8 ma_atomic_compare_and_swap_8(volatile ma_uint8* dst, ma_uint8 expected, ma_uint8 desired)
-    {
-        __atomic_compare_exchange_n(dst, &expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-        return expected;
-    }
-    static MA_INLINE ma_uint16 ma_atomic_compare_and_swap_16(volatile ma_uint16* dst, ma_uint16 expected, ma_uint16 desired)
-    {
-        __atomic_compare_exchange_n(dst, &expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-        return expected;
-    }
-    static MA_INLINE ma_uint32 ma_atomic_compare_and_swap_32(volatile ma_uint32* dst, ma_uint32 expected, ma_uint32 desired)
-    {
-        __atomic_compare_exchange_n(dst, &expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-        return expected;
-    }
-    static MA_INLINE ma_uint64 ma_atomic_compare_and_swap_64(volatile ma_uint64* dst, ma_uint64 expected, ma_uint64 desired)
-    {
-        __atomic_compare_exchange_n(dst, &expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
-        return expected;
-    }
-    typedef ma_uint8 ma_atomic_flag;
-    #define ma_atomic_flag_test_and_set_explicit(dst, order)        (ma_bool32)__atomic_test_and_set(dst, order)
-    #define ma_atomic_flag_clear_explicit(dst, order)               __atomic_clear(dst, order)
-    #define c89atoimc_flag_load_explicit(ptr, order)                ma_atomic_load_explicit_8(ptr, order)
-#else
-    #define ma_atomic_memory_order_relaxed  1
-    #define ma_atomic_memory_order_consume  2
-    #define ma_atomic_memory_order_acquire  3
-    #define ma_atomic_memory_order_release  4
-    #define ma_atomic_memory_order_acq_rel  5
-    #define ma_atomic_memory_order_seq_cst  6
-    #define ma_atomic_compiler_fence() __asm__ __volatile__("":::"memory")
-    #if defined(__GNUC__)
-        #define ma_atomic_thread_fence(order) __sync_synchronize(), (void)order
-        static MA_INLINE ma_uint8 ma_atomic_exchange_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            if (order > ma_atomic_memory_order_acquire) {
-                __sync_synchronize();
-            }
-            return __sync_lock_test_and_set(dst, src);
-        }
-        static MA_INLINE ma_uint16 ma_atomic_exchange_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 oldValue;
-            do {
-                oldValue = *dst;
-            } while (__sync_val_compare_and_swap(dst, oldValue, src) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_exchange_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 oldValue;
-            do {
-                oldValue = *dst;
-            } while (__sync_val_compare_and_swap(dst, oldValue, src) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_exchange_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            do {
-                oldValue = *dst;
-            } while (__sync_val_compare_and_swap(dst, oldValue, src) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_add_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_add(dst, src);
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_add_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_add(dst, src);
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_add_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_add(dst, src);
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_add_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_add(dst, src);
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_sub_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_sub(dst, src);
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_sub_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_sub(dst, src);
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_sub_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_sub(dst, src);
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_sub_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_sub(dst, src);
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_or_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_or(dst, src);
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_or_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_or(dst, src);
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_or_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_or(dst, src);
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_or_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_or(dst, src);
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_xor_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_xor(dst, src);
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_xor_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_xor(dst, src);
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_xor_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_xor(dst, src);
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_xor_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_xor(dst, src);
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_and_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_and(dst, src);
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_and_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_and(dst, src);
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_and_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_and(dst, src);
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_and_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            (void)order;
-            return __sync_fetch_and_and(dst, src);
-        }
-        #define ma_atomic_compare_and_swap_8( dst, expected, desired)   __sync_val_compare_and_swap(dst, expected, desired)
-        #define ma_atomic_compare_and_swap_16(dst, expected, desired)   __sync_val_compare_and_swap(dst, expected, desired)
-        #define ma_atomic_compare_and_swap_32(dst, expected, desired)   __sync_val_compare_and_swap(dst, expected, desired)
-        #define ma_atomic_compare_and_swap_64(dst, expected, desired)   __sync_val_compare_and_swap(dst, expected, desired)
-    #else
-        #if defined(MA_X86)
-            #define ma_atomic_thread_fence(order) __asm__ __volatile__("lock; addl $0, (%%esp)" ::: "memory", "cc")
-        #elif defined(MA_X64)
-            #define ma_atomic_thread_fence(order) __asm__ __volatile__("lock; addq $0, (%%rsp)" ::: "memory", "cc")
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-        static MA_INLINE ma_uint8 ma_atomic_compare_and_swap_8(volatile ma_uint8* dst, ma_uint8 expected, ma_uint8 desired)
-        {
-            ma_uint8 result;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; cmpxchg %3, %0" : "+m"(*dst), "=a"(result) : "a"(expected), "d"(desired) : "cc");
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint16 ma_atomic_compare_and_swap_16(volatile ma_uint16* dst, ma_uint16 expected, ma_uint16 desired)
-        {
-            ma_uint16 result;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; cmpxchg %3, %0" : "+m"(*dst), "=a"(result) : "a"(expected), "d"(desired) : "cc");
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_compare_and_swap_32(volatile ma_uint32* dst, ma_uint32 expected, ma_uint32 desired)
-        {
-            ma_uint32 result;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; cmpxchg %3, %0" : "+m"(*dst), "=a"(result) : "a"(expected), "d"(desired) : "cc");
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_compare_and_swap_64(volatile ma_uint64* dst, ma_uint64 expected, ma_uint64 desired)
-        {
-            volatile ma_uint64 result;
-        #if defined(MA_X86)
-            ma_uint32 resultEAX;
-            ma_uint32 resultEDX;
-            __asm__ __volatile__("push %%ebx; xchg %5, %%ebx; lock; cmpxchg8b %0; pop %%ebx" : "+m"(*dst), "=a"(resultEAX), "=d"(resultEDX) : "a"(expected & 0xFFFFFFFF), "d"(expected >> 32), "r"(desired & 0xFFFFFFFF), "c"(desired >> 32) : "cc");
-            result = ((ma_uint64)resultEDX << 32) | resultEAX;
-        #elif defined(MA_X64)
-            __asm__ __volatile__("lock; cmpxchg %3, %0" : "+m"(*dst), "=a"(result) : "a"(expected), "d"(desired) : "cc");
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint8 ma_atomic_exchange_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            ma_uint8 result = 0;
-            (void)order;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; xchg %1, %0" : "+m"(*dst), "=a"(result) : "a"(src));
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint16 ma_atomic_exchange_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 result = 0;
-            (void)order;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; xchg %1, %0" : "+m"(*dst), "=a"(result) : "a"(src));
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_exchange_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 result;
-            (void)order;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; xchg %1, %0" : "+m"(*dst), "=a"(result) : "a"(src));
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_exchange_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 result;
-            (void)order;
-        #if defined(MA_X86)
-            do {
-                result = *dst;
-            } while (ma_atomic_compare_and_swap_64(dst, result, src) != result);
-        #elif defined(MA_X64)
-            __asm__ __volatile__("lock; xchg %1, %0" : "+m"(*dst), "=a"(result) : "a"(src));
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_add_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            ma_uint8 result;
-            (void)order;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; xadd %1, %0" : "+m"(*dst), "=a"(result) : "a"(src) : "cc");
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_add_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 result;
-            (void)order;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; xadd %1, %0" : "+m"(*dst), "=a"(result) : "a"(src) : "cc");
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_add_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 result;
-            (void)order;
-        #if defined(MA_X86) || defined(MA_X64)
-            __asm__ __volatile__("lock; xadd %1, %0" : "+m"(*dst), "=a"(result) : "a"(src) : "cc");
-        #else
-            #error Unsupported architecture. Please submit a feature request.
-        #endif
-            return result;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_add_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-        #if defined(MA_X86)
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            (void)order;
-            do {
-                oldValue = *dst;
-                newValue = oldValue + src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            return oldValue;
-        #elif defined(MA_X64)
-            ma_uint64 result;
-            (void)order;
-            __asm__ __volatile__("lock; xadd %1, %0" : "+m"(*dst), "=a"(result) : "a"(src) : "cc");
-            return result;
-        #endif
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_sub_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue - src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_sub_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue - src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_sub_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue - src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_sub_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue - src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_and_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue & src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_and_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue & src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_and_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue & src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_and_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue & src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_xor_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue ^ src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_xor_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue ^ src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_xor_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue ^ src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_xor_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue ^ src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint8 ma_atomic_fetch_or_explicit_8(volatile ma_uint8* dst, ma_uint8 src, ma_atomic_memory_order order)
-        {
-            ma_uint8 oldValue;
-            ma_uint8 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint8)(oldValue | src);
-            } while (ma_atomic_compare_and_swap_8(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint16 ma_atomic_fetch_or_explicit_16(volatile ma_uint16* dst, ma_uint16 src, ma_atomic_memory_order order)
-        {
-            ma_uint16 oldValue;
-            ma_uint16 newValue;
-            do {
-                oldValue = *dst;
-                newValue = (ma_uint16)(oldValue | src);
-            } while (ma_atomic_compare_and_swap_16(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint32 ma_atomic_fetch_or_explicit_32(volatile ma_uint32* dst, ma_uint32 src, ma_atomic_memory_order order)
-        {
-            ma_uint32 oldValue;
-            ma_uint32 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue | src;
-            } while (ma_atomic_compare_and_swap_32(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-        static MA_INLINE ma_uint64 ma_atomic_fetch_or_explicit_64(volatile ma_uint64* dst, ma_uint64 src, ma_atomic_memory_order order)
-        {
-            ma_uint64 oldValue;
-            ma_uint64 newValue;
-            do {
-                oldValue = *dst;
-                newValue = oldValue | src;
-            } while (ma_atomic_compare_and_swap_64(dst, oldValue, newValue) != oldValue);
-            (void)order;
-            return oldValue;
-        }
-    #endif
-    #define ma_atomic_signal_fence(order)                           ma_atomic_thread_fence(order)
-    static MA_INLINE ma_uint8 ma_atomic_load_explicit_8(volatile const ma_uint8* ptr, ma_atomic_memory_order order)
-    {
-        (void)order;
-        return ma_atomic_compare_and_swap_8((ma_uint8*)ptr, 0, 0);
-    }
-    static MA_INLINE ma_uint16 ma_atomic_load_explicit_16(volatile const ma_uint16* ptr, ma_atomic_memory_order order)
-    {
-        (void)order;
-        return ma_atomic_compare_and_swap_16((ma_uint16*)ptr, 0, 0);
-    }
-    static MA_INLINE ma_uint32 ma_atomic_load_explicit_32(volatile const ma_uint32* ptr, ma_atomic_memory_order order)
-    {
-        (void)order;
-        return ma_atomic_compare_and_swap_32((ma_uint32*)ptr, 0, 0);
-    }
-    static MA_INLINE ma_uint64 ma_atomic_load_explicit_64(volatile const ma_uint64* ptr, ma_atomic_memory_order order)
-    {
-        (void)order;
-        return ma_atomic_compare_and_swap_64((ma_uint64*)ptr, 0, 0);
-    }
-    #define ma_atomic_store_explicit_8( dst, src, order)            (void)ma_atomic_exchange_explicit_8 (dst, src, order)
-    #define ma_atomic_store_explicit_16(dst, src, order)            (void)ma_atomic_exchange_explicit_16(dst, src, order)
-    #define ma_atomic_store_explicit_32(dst, src, order)            (void)ma_atomic_exchange_explicit_32(dst, src, order)
-    #define ma_atomic_store_explicit_64(dst, src, order)            (void)ma_atomic_exchange_explicit_64(dst, src, order)
-    #define ma_atomic_test_and_set_explicit_8( dst, order)          ma_atomic_exchange_explicit_8 (dst, 1, order)
-    #define ma_atomic_test_and_set_explicit_16(dst, order)          ma_atomic_exchange_explicit_16(dst, 1, order)
-    #define ma_atomic_test_and_set_explicit_32(dst, order)          ma_atomic_exchange_explicit_32(dst, 1, order)
-    #define ma_atomic_test_and_set_explicit_64(dst, order)          ma_atomic_exchange_explicit_64(dst, 1, order)
-    #define ma_atomic_clear_explicit_8( dst, order)                 ma_atomic_store_explicit_8 (dst, 0, order)
-    #define ma_atomic_clear_explicit_16(dst, order)                 ma_atomic_store_explicit_16(dst, 0, order)
-    #define ma_atomic_clear_explicit_32(dst, order)                 ma_atomic_store_explicit_32(dst, 0, order)
-    #define ma_atomic_clear_explicit_64(dst, order)                 ma_atomic_store_explicit_64(dst, 0, order)
-    typedef ma_uint8 ma_atomic_flag;
-    #define ma_atomic_flag_test_and_set_explicit(ptr, order)        (ma_bool32)ma_atomic_test_and_set_explicit_8(ptr, order)
-    #define ma_atomic_flag_clear_explicit(ptr, order)               ma_atomic_clear_explicit_8(ptr, order)
-    #define c89atoimc_flag_load_explicit(ptr, order)                ma_atomic_load_explicit_8(ptr, order)
-#endif
-#if !defined(MA_ATOMIC_HAS_NATIVE_COMPARE_EXCHANGE)
-    #if defined(MA_ATOMIC_HAS_8)
-        static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_8(volatile ma_uint8* dst, ma_uint8* expected, ma_uint8 desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-        {
-            ma_uint8 expectedValue;
-            ma_uint8 result;
-            (void)successOrder;
-            (void)failureOrder;
-            expectedValue = ma_atomic_load_explicit_8(expected, ma_atomic_memory_order_seq_cst);
-            result = ma_atomic_compare_and_swap_8(dst, expectedValue, desired);
-            if (result == expectedValue) {
-                return 1;
-            } else {
-                ma_atomic_store_explicit_8(expected, result, failureOrder);
-                return 0;
-            }
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_16)
-        static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_16(volatile ma_uint16* dst, ma_uint16* expected, ma_uint16 desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-        {
-            ma_uint16 expectedValue;
-            ma_uint16 result;
-            (void)successOrder;
-            (void)failureOrder;
-            expectedValue = ma_atomic_load_explicit_16(expected, ma_atomic_memory_order_seq_cst);
-            result = ma_atomic_compare_and_swap_16(dst, expectedValue, desired);
-            if (result == expectedValue) {
-                return 1;
-            } else {
-                ma_atomic_store_explicit_16(expected, result, failureOrder);
-                return 0;
-            }
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_32)
-        static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_32(volatile ma_uint32* dst, ma_uint32* expected, ma_uint32 desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-        {
-            ma_uint32 expectedValue;
-            ma_uint32 result;
-            (void)successOrder;
-            (void)failureOrder;
-            expectedValue = ma_atomic_load_explicit_32(expected, ma_atomic_memory_order_seq_cst);
-            result = ma_atomic_compare_and_swap_32(dst, expectedValue, desired);
-            if (result == expectedValue) {
-                return 1;
-            } else {
-                ma_atomic_store_explicit_32(expected, result, failureOrder);
-                return 0;
-            }
-        }
-    #endif
-    #if defined(MA_ATOMIC_HAS_64)
-        static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_64(volatile ma_uint64* dst, volatile ma_uint64* expected, ma_uint64 desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-        {
-            ma_uint64 expectedValue;
-            ma_uint64 result;
-            (void)successOrder;
-            (void)failureOrder;
-            expectedValue = ma_atomic_load_explicit_64(expected, ma_atomic_memory_order_seq_cst);
-            result = ma_atomic_compare_and_swap_64(dst, expectedValue, desired);
-            if (result == expectedValue) {
-                return 1;
-            } else {
-                ma_atomic_store_explicit_64(expected, result, failureOrder);
-                return 0;
-            }
-        }
-    #endif
-    #define ma_atomic_compare_exchange_weak_explicit_8( dst, expected, desired, successOrder, failureOrder) ma_atomic_compare_exchange_strong_explicit_8 (dst, expected, desired, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_weak_explicit_16(dst, expected, desired, successOrder, failureOrder) ma_atomic_compare_exchange_strong_explicit_16(dst, expected, desired, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_weak_explicit_32(dst, expected, desired, successOrder, failureOrder) ma_atomic_compare_exchange_strong_explicit_32(dst, expected, desired, successOrder, failureOrder)
-    #define ma_atomic_compare_exchange_weak_explicit_64(dst, expected, desired, successOrder, failureOrder) ma_atomic_compare_exchange_strong_explicit_64(dst, expected, desired, successOrder, failureOrder)
-#endif
-#if !defined(MA_ATOMIC_HAS_NATIVE_IS_LOCK_FREE)
-    static MA_INLINE ma_bool32 ma_atomic_is_lock_free_8(volatile void* ptr)
-    {
-        (void)ptr;
-        return 1;
-    }
-    static MA_INLINE ma_bool32 ma_atomic_is_lock_free_16(volatile void* ptr)
-    {
-        (void)ptr;
-        return 1;
-    }
-    static MA_INLINE ma_bool32 ma_atomic_is_lock_free_32(volatile void* ptr)
-    {
-        (void)ptr;
-        return 1;
-    }
-    static MA_INLINE ma_bool32 ma_atomic_is_lock_free_64(volatile void* ptr)
-    {
-        (void)ptr;
-    #if defined(MA_64BIT)
-        return 1;
-    #else
-        #if defined(MA_X86) || defined(MA_X64)
-            return 1;
-        #else
-            return 0;
-        #endif
-    #endif
-    }
-#endif
-#if defined(MA_64BIT)
-    static MA_INLINE ma_bool32 ma_atomic_is_lock_free_ptr(volatile void** ptr)
-    {
-        return ma_atomic_is_lock_free_64((volatile ma_uint64*)ptr);
-    }
-    static MA_INLINE void* ma_atomic_load_explicit_ptr(volatile void** ptr, ma_atomic_memory_order order)
-    {
-        return (void*)ma_atomic_load_explicit_64((volatile ma_uint64*)ptr, order);
-    }
-    static MA_INLINE void ma_atomic_store_explicit_ptr(volatile void** dst, void* src, ma_atomic_memory_order order)
-    {
-        ma_atomic_store_explicit_64((volatile ma_uint64*)dst, (ma_uint64)src, order);
-    }
-    static MA_INLINE void* ma_atomic_exchange_explicit_ptr(volatile void** dst, void* src, ma_atomic_memory_order order)
-    {
-        return (void*)ma_atomic_exchange_explicit_64((volatile ma_uint64*)dst, (ma_uint64)src, order);
-    }
-    static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_ptr(volatile void** dst, void** expected, void* desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-    {
-        return ma_atomic_compare_exchange_strong_explicit_64((volatile ma_uint64*)dst, (ma_uint64*)expected, (ma_uint64)desired, successOrder, failureOrder);
-    }
-    static MA_INLINE ma_bool32 ma_atomic_compare_exchange_weak_explicit_ptr(volatile void** dst, void** expected, void* desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-    {
-        return ma_atomic_compare_exchange_weak_explicit_64((volatile ma_uint64*)dst, (ma_uint64*)expected, (ma_uint64)desired, successOrder, failureOrder);
-    }
-    static MA_INLINE void* ma_atomic_compare_and_swap_ptr(volatile void** dst, void* expected, void* desired)
-    {
-        return (void*)ma_atomic_compare_and_swap_64((volatile ma_uint64*)dst, (ma_uint64)expected, (ma_uint64)desired);
-    }
-#elif defined(MA_32BIT)
-    static MA_INLINE ma_bool32 ma_atomic_is_lock_free_ptr(volatile void** ptr)
-    {
-        return ma_atomic_is_lock_free_32((volatile ma_uint32*)ptr);
-    }
-    static MA_INLINE void* ma_atomic_load_explicit_ptr(volatile void** ptr, ma_atomic_memory_order order)
-    {
-        return (void*)ma_atomic_load_explicit_32((volatile ma_uint32*)ptr, order);
-    }
-    static MA_INLINE void ma_atomic_store_explicit_ptr(volatile void** dst, void* src, ma_atomic_memory_order order)
-    {
-        ma_atomic_store_explicit_32((volatile ma_uint32*)dst, (ma_uint32)src, order);
-    }
-    static MA_INLINE void* ma_atomic_exchange_explicit_ptr(volatile void** dst, void* src, ma_atomic_memory_order order)
-    {
-        return (void*)ma_atomic_exchange_explicit_32((volatile ma_uint32*)dst, (ma_uint32)src, order);
-    }
-    static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_ptr(volatile void** dst, void** expected, void* desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-    {
-        return ma_atomic_compare_exchange_strong_explicit_32((volatile ma_uint32*)dst, (ma_uint32*)expected, (ma_uint32)desired, successOrder, failureOrder);
-    }
-    static MA_INLINE ma_bool32 ma_atomic_compare_exchange_weak_explicit_ptr(volatile void** dst, void** expected, void* desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-    {
-        return ma_atomic_compare_exchange_weak_explicit_32((volatile ma_uint32*)dst, (ma_uint32*)expected, (ma_uint32)desired, successOrder, failureOrder);
-    }
-    static MA_INLINE void* ma_atomic_compare_and_swap_ptr(volatile void** dst, void* expected, void* desired)
-    {
-        return (void*)ma_atomic_compare_and_swap_32((volatile ma_uint32*)dst, (ma_uint32)expected, (ma_uint32)desired);
-    }
-#else
-    #error Unsupported architecture.
-#endif
-#define ma_atomic_flag_test_and_set(ptr)                                ma_atomic_flag_test_and_set_explicit(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_flag_clear(ptr)                                       ma_atomic_flag_clear_explicit(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_ptr(dst, src)                                   ma_atomic_store_explicit_ptr((volatile void**)dst, (void*)src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_ptr(ptr)                                         ma_atomic_load_explicit_ptr((volatile void**)ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_ptr(dst, src)                                ma_atomic_exchange_explicit_ptr((volatile void**)dst, (void*)src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_ptr(dst, expected, desired)   ma_atomic_compare_exchange_strong_explicit_ptr((volatile void**)dst, (void**)expected, (void*)desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_ptr(dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_ptr((volatile void**)dst, (void**)expected, (void*)desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_8( ptr)                                  ma_atomic_test_and_set_explicit_8( ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_16(ptr)                                  ma_atomic_test_and_set_explicit_16(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_32(ptr)                                  ma_atomic_test_and_set_explicit_32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_64(ptr)                                  ma_atomic_test_and_set_explicit_64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_8( ptr)                                         ma_atomic_clear_explicit_8( ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_16(ptr)                                         ma_atomic_clear_explicit_16(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_32(ptr)                                         ma_atomic_clear_explicit_32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_64(ptr)                                         ma_atomic_clear_explicit_64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_8( dst, src)                                    ma_atomic_store_explicit_8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_16(dst, src)                                    ma_atomic_store_explicit_16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_32(dst, src)                                    ma_atomic_store_explicit_32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_64(dst, src)                                    ma_atomic_store_explicit_64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_8( ptr)                                          ma_atomic_load_explicit_8( ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_16(ptr)                                          ma_atomic_load_explicit_16(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_32(ptr)                                          ma_atomic_load_explicit_32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_64(ptr)                                          ma_atomic_load_explicit_64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_8( dst, src)                                 ma_atomic_exchange_explicit_8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_16(dst, src)                                 ma_atomic_exchange_explicit_16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_32(dst, src)                                 ma_atomic_exchange_explicit_32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_64(dst, src)                                 ma_atomic_exchange_explicit_64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_8( dst, expected, desired)    ma_atomic_compare_exchange_strong_explicit_8( dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_16(dst, expected, desired)    ma_atomic_compare_exchange_strong_explicit_16(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_32(dst, expected, desired)    ma_atomic_compare_exchange_strong_explicit_32(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_64(dst, expected, desired)    ma_atomic_compare_exchange_strong_explicit_64(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_8(  dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_8( dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_16( dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_16(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_32( dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_32(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_64( dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_64(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_8( dst, src)                                ma_atomic_fetch_add_explicit_8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_16(dst, src)                                ma_atomic_fetch_add_explicit_16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_32(dst, src)                                ma_atomic_fetch_add_explicit_32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_64(dst, src)                                ma_atomic_fetch_add_explicit_64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_8( dst, src)                                ma_atomic_fetch_sub_explicit_8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_16(dst, src)                                ma_atomic_fetch_sub_explicit_16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_32(dst, src)                                ma_atomic_fetch_sub_explicit_32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_64(dst, src)                                ma_atomic_fetch_sub_explicit_64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_8( dst, src)                                 ma_atomic_fetch_or_explicit_8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_16(dst, src)                                 ma_atomic_fetch_or_explicit_16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_32(dst, src)                                 ma_atomic_fetch_or_explicit_32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_64(dst, src)                                 ma_atomic_fetch_or_explicit_64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_8( dst, src)                                ma_atomic_fetch_xor_explicit_8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_16(dst, src)                                ma_atomic_fetch_xor_explicit_16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_32(dst, src)                                ma_atomic_fetch_xor_explicit_32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_64(dst, src)                                ma_atomic_fetch_xor_explicit_64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_8( dst, src)                                ma_atomic_fetch_and_explicit_8 (dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_16(dst, src)                                ma_atomic_fetch_and_explicit_16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_32(dst, src)                                ma_atomic_fetch_and_explicit_32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_64(dst, src)                                ma_atomic_fetch_and_explicit_64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_explicit_i8( ptr, order)                 (ma_int8 )ma_atomic_test_and_set_explicit_8( (ma_uint8* )ptr, order)
-#define ma_atomic_test_and_set_explicit_i16(ptr, order)                 (ma_int16)ma_atomic_test_and_set_explicit_16((ma_uint16*)ptr, order)
-#define ma_atomic_test_and_set_explicit_i32(ptr, order)                 (ma_int32)ma_atomic_test_and_set_explicit_32((ma_uint32*)ptr, order)
-#define ma_atomic_test_and_set_explicit_i64(ptr, order)                 (ma_int64)ma_atomic_test_and_set_explicit_64((ma_uint64*)ptr, order)
-#define ma_atomic_clear_explicit_i8( ptr, order)                        ma_atomic_clear_explicit_8( (ma_uint8* )ptr, order)
-#define ma_atomic_clear_explicit_i16(ptr, order)                        ma_atomic_clear_explicit_16((ma_uint16*)ptr, order)
-#define ma_atomic_clear_explicit_i32(ptr, order)                        ma_atomic_clear_explicit_32((ma_uint32*)ptr, order)
-#define ma_atomic_clear_explicit_i64(ptr, order)                        ma_atomic_clear_explicit_64((ma_uint64*)ptr, order)
-#define ma_atomic_store_explicit_i8( dst, src, order)                   ma_atomic_store_explicit_8( (ma_uint8* )dst, (ma_uint8 )src, order)
-#define ma_atomic_store_explicit_i16(dst, src, order)                   ma_atomic_store_explicit_16((ma_uint16*)dst, (ma_uint16)src, order)
-#define ma_atomic_store_explicit_i32(dst, src, order)                   ma_atomic_store_explicit_32((ma_uint32*)dst, (ma_uint32)src, order)
-#define ma_atomic_store_explicit_i64(dst, src, order)                   ma_atomic_store_explicit_64((ma_uint64*)dst, (ma_uint64)src, order)
-#define ma_atomic_load_explicit_i8( ptr, order)                         (ma_int8 )ma_atomic_load_explicit_8( (ma_uint8* )ptr, order)
-#define ma_atomic_load_explicit_i16(ptr, order)                         (ma_int16)ma_atomic_load_explicit_16((ma_uint16*)ptr, order)
-#define ma_atomic_load_explicit_i32(ptr, order)                         (ma_int32)ma_atomic_load_explicit_32((ma_uint32*)ptr, order)
-#define ma_atomic_load_explicit_i64(ptr, order)                         (ma_int64)ma_atomic_load_explicit_64((ma_uint64*)ptr, order)
-#define ma_atomic_exchange_explicit_i8( dst, src, order)                (ma_int8 )ma_atomic_exchange_explicit_8 ((ma_uint8* )dst, (ma_uint8 )src, order)
-#define ma_atomic_exchange_explicit_i16(dst, src, order)                (ma_int16)ma_atomic_exchange_explicit_16((ma_uint16*)dst, (ma_uint16)src, order)
-#define ma_atomic_exchange_explicit_i32(dst, src, order)                (ma_int32)ma_atomic_exchange_explicit_32((ma_uint32*)dst, (ma_uint32)src, order)
-#define ma_atomic_exchange_explicit_i64(dst, src, order)                (ma_int64)ma_atomic_exchange_explicit_64((ma_uint64*)dst, (ma_uint64)src, order)
-#define ma_atomic_compare_exchange_strong_explicit_i8( dst, expected, desired, successOrder, failureOrder)  ma_atomic_compare_exchange_strong_explicit_8( (ma_uint8* )dst, (ma_uint8* )expected, (ma_uint8 )desired, successOrder, failureOrder)
-#define ma_atomic_compare_exchange_strong_explicit_i16(dst, expected, desired, successOrder, failureOrder)  ma_atomic_compare_exchange_strong_explicit_16((ma_uint16*)dst, (ma_uint16*)expected, (ma_uint16)desired, successOrder, failureOrder)
-#define ma_atomic_compare_exchange_strong_explicit_i32(dst, expected, desired, successOrder, failureOrder)  ma_atomic_compare_exchange_strong_explicit_32((ma_uint32*)dst, (ma_uint32*)expected, (ma_uint32)desired, successOrder, failureOrder)
-#define ma_atomic_compare_exchange_strong_explicit_i64(dst, expected, desired, successOrder, failureOrder)  ma_atomic_compare_exchange_strong_explicit_64((ma_uint64*)dst, (ma_uint64*)expected, (ma_uint64)desired, successOrder, failureOrder)
-#define ma_atomic_compare_exchange_weak_explicit_i8( dst, expected, desired, successOrder, failureOrder)    ma_atomic_compare_exchange_weak_explicit_8( (ma_uint8* )dst, (ma_uint8* )expected, (ma_uint8 )desired, successOrder, failureOrder)
-#define ma_atomic_compare_exchange_weak_explicit_i16(dst, expected, desired, successOrder, failureOrder)    ma_atomic_compare_exchange_weak_explicit_16((ma_uint16*)dst, (ma_uint16*)expected, (ma_uint16)desired, successOrder, failureOrder)
-#define ma_atomic_compare_exchange_weak_explicit_i32(dst, expected, desired, successOrder, failureOrder)    ma_atomic_compare_exchange_weak_explicit_32((ma_uint32*)dst, (ma_uint32*)expected, (ma_uint32)desired, successOrder, failureOrder)
-#define ma_atomic_compare_exchange_weak_explicit_i64(dst, expected, desired, successOrder, failureOrder)    ma_atomic_compare_exchange_weak_explicit_64((ma_uint64*)dst, (ma_uint64*)expected, (ma_uint64)desired, successOrder, failureOrder)
-#define ma_atomic_fetch_add_explicit_i8( dst, src, order)               (ma_int8 )ma_atomic_fetch_add_explicit_8( (ma_uint8* )dst, (ma_uint8 )src, order)
-#define ma_atomic_fetch_add_explicit_i16(dst, src, order)               (ma_int16)ma_atomic_fetch_add_explicit_16((ma_uint16*)dst, (ma_uint16)src, order)
-#define ma_atomic_fetch_add_explicit_i32(dst, src, order)               (ma_int32)ma_atomic_fetch_add_explicit_32((ma_uint32*)dst, (ma_uint32)src, order)
-#define ma_atomic_fetch_add_explicit_i64(dst, src, order)               (ma_int64)ma_atomic_fetch_add_explicit_64((ma_uint64*)dst, (ma_uint64)src, order)
-#define ma_atomic_fetch_sub_explicit_i8( dst, src, order)               (ma_int8 )ma_atomic_fetch_sub_explicit_8( (ma_uint8* )dst, (ma_uint8 )src, order)
-#define ma_atomic_fetch_sub_explicit_i16(dst, src, order)               (ma_int16)ma_atomic_fetch_sub_explicit_16((ma_uint16*)dst, (ma_uint16)src, order)
-#define ma_atomic_fetch_sub_explicit_i32(dst, src, order)               (ma_int32)ma_atomic_fetch_sub_explicit_32((ma_uint32*)dst, (ma_uint32)src, order)
-#define ma_atomic_fetch_sub_explicit_i64(dst, src, order)               (ma_int64)ma_atomic_fetch_sub_explicit_64((ma_uint64*)dst, (ma_uint64)src, order)
-#define ma_atomic_fetch_or_explicit_i8( dst, src, order)                (ma_int8 )ma_atomic_fetch_or_explicit_8( (ma_uint8* )dst, (ma_uint8 )src, order)
-#define ma_atomic_fetch_or_explicit_i16(dst, src, order)                (ma_int16)ma_atomic_fetch_or_explicit_16((ma_uint16*)dst, (ma_uint16)src, order)
-#define ma_atomic_fetch_or_explicit_i32(dst, src, order)                (ma_int32)ma_atomic_fetch_or_explicit_32((ma_uint32*)dst, (ma_uint32)src, order)
-#define ma_atomic_fetch_or_explicit_i64(dst, src, order)                (ma_int64)ma_atomic_fetch_or_explicit_64((ma_uint64*)dst, (ma_uint64)src, order)
-#define ma_atomic_fetch_xor_explicit_i8( dst, src, order)               (ma_int8 )ma_atomic_fetch_xor_explicit_8( (ma_uint8* )dst, (ma_uint8 )src, order)
-#define ma_atomic_fetch_xor_explicit_i16(dst, src, order)               (ma_int16)ma_atomic_fetch_xor_explicit_16((ma_uint16*)dst, (ma_uint16)src, order)
-#define ma_atomic_fetch_xor_explicit_i32(dst, src, order)               (ma_int32)ma_atomic_fetch_xor_explicit_32((ma_uint32*)dst, (ma_uint32)src, order)
-#define ma_atomic_fetch_xor_explicit_i64(dst, src, order)               (ma_int64)ma_atomic_fetch_xor_explicit_64((ma_uint64*)dst, (ma_uint64)src, order)
-#define ma_atomic_fetch_and_explicit_i8( dst, src, order)               (ma_int8 )ma_atomic_fetch_and_explicit_8( (ma_uint8* )dst, (ma_uint8 )src, order)
-#define ma_atomic_fetch_and_explicit_i16(dst, src, order)               (ma_int16)ma_atomic_fetch_and_explicit_16((ma_uint16*)dst, (ma_uint16)src, order)
-#define ma_atomic_fetch_and_explicit_i32(dst, src, order)               (ma_int32)ma_atomic_fetch_and_explicit_32((ma_uint32*)dst, (ma_uint32)src, order)
-#define ma_atomic_fetch_and_explicit_i64(dst, src, order)               (ma_int64)ma_atomic_fetch_and_explicit_64((ma_uint64*)dst, (ma_uint64)src, order)
-#define ma_atomic_test_and_set_i8( ptr)                                 ma_atomic_test_and_set_explicit_i8( ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_i16(ptr)                                 ma_atomic_test_and_set_explicit_i16(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_i32(ptr)                                 ma_atomic_test_and_set_explicit_i32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_test_and_set_i64(ptr)                                 ma_atomic_test_and_set_explicit_i64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_i8( ptr)                                        ma_atomic_clear_explicit_i8( ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_i16(ptr)                                        ma_atomic_clear_explicit_i16(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_i32(ptr)                                        ma_atomic_clear_explicit_i32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_i64(ptr)                                        ma_atomic_clear_explicit_i64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_i8( dst, src)                                   ma_atomic_store_explicit_i8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_i16(dst, src)                                   ma_atomic_store_explicit_i16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_i32(dst, src)                                   ma_atomic_store_explicit_i32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_i64(dst, src)                                   ma_atomic_store_explicit_i64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_i8( ptr)                                         ma_atomic_load_explicit_i8( ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_i16(ptr)                                         ma_atomic_load_explicit_i16(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_i32(ptr)                                         ma_atomic_load_explicit_i32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_i64(ptr)                                         ma_atomic_load_explicit_i64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_i8( dst, src)                                ma_atomic_exchange_explicit_i8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_i16(dst, src)                                ma_atomic_exchange_explicit_i16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_i32(dst, src)                                ma_atomic_exchange_explicit_i32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_i64(dst, src)                                ma_atomic_exchange_explicit_i64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_i8( dst, expected, desired)   ma_atomic_compare_exchange_strong_explicit_i8( dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_i16(dst, expected, desired)   ma_atomic_compare_exchange_strong_explicit_i16(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_i32(dst, expected, desired)   ma_atomic_compare_exchange_strong_explicit_i32(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_i64(dst, expected, desired)   ma_atomic_compare_exchange_strong_explicit_i64(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_i8( dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_i8( dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_i16(dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_i16(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_i32(dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_i32(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_i64(dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_i64(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_i8( dst, src)                               ma_atomic_fetch_add_explicit_i8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_i16(dst, src)                               ma_atomic_fetch_add_explicit_i16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_i32(dst, src)                               ma_atomic_fetch_add_explicit_i32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_i64(dst, src)                               ma_atomic_fetch_add_explicit_i64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_i8( dst, src)                               ma_atomic_fetch_sub_explicit_i8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_i16(dst, src)                               ma_atomic_fetch_sub_explicit_i16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_i32(dst, src)                               ma_atomic_fetch_sub_explicit_i32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_i64(dst, src)                               ma_atomic_fetch_sub_explicit_i64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_i8( dst, src)                                ma_atomic_fetch_or_explicit_i8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_i16(dst, src)                                ma_atomic_fetch_or_explicit_i16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_i32(dst, src)                                ma_atomic_fetch_or_explicit_i32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_i64(dst, src)                                ma_atomic_fetch_or_explicit_i64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_i8( dst, src)                               ma_atomic_fetch_xor_explicit_i8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_i16(dst, src)                               ma_atomic_fetch_xor_explicit_i16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_i32(dst, src)                               ma_atomic_fetch_xor_explicit_i32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_i64(dst, src)                               ma_atomic_fetch_xor_explicit_i64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_i8( dst, src)                               ma_atomic_fetch_and_explicit_i8( dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_i16(dst, src)                               ma_atomic_fetch_and_explicit_i16(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_i32(dst, src)                               ma_atomic_fetch_and_explicit_i32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_i64(dst, src)                               ma_atomic_fetch_and_explicit_i64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_and_swap_i8( dst, expected, dedsired)         (ma_int8 )ma_atomic_compare_and_swap_8( (ma_uint8* )dst, (ma_uint8 )expected, (ma_uint8 )dedsired)
-#define ma_atomic_compare_and_swap_i16(dst, expected, dedsired)         (ma_int16)ma_atomic_compare_and_swap_16((ma_uint16*)dst, (ma_uint16)expected, (ma_uint16)dedsired)
-#define ma_atomic_compare_and_swap_i32(dst, expected, dedsired)         (ma_int32)ma_atomic_compare_and_swap_32((ma_uint32*)dst, (ma_uint32)expected, (ma_uint32)dedsired)
-#define ma_atomic_compare_and_swap_i64(dst, expected, dedsired)         (ma_int64)ma_atomic_compare_and_swap_64((ma_uint64*)dst, (ma_uint64)expected, (ma_uint64)dedsired)
-typedef union
-{
-    ma_uint32 i;
-    float f;
-} ma_atomic_if32;
-typedef union
-{
-    ma_uint64 i;
-    double f;
-} ma_atomic_if64;
-#define ma_atomic_clear_explicit_f32(ptr, order)                        ma_atomic_clear_explicit_32((ma_uint32*)ptr, order)
-#define ma_atomic_clear_explicit_f64(ptr, order)                        ma_atomic_clear_explicit_64((ma_uint64*)ptr, order)
-static MA_INLINE void ma_atomic_store_explicit_f32(volatile float* dst, float src, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 x;
-    x.f = src;
-    ma_atomic_store_explicit_32((volatile ma_uint32*)dst, x.i, order);
-}
-static MA_INLINE void ma_atomic_store_explicit_f64(volatile double* dst, double src, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 x;
-    x.f = src;
-    ma_atomic_store_explicit_64((volatile ma_uint64*)dst, x.i, order);
-}
-static MA_INLINE float ma_atomic_load_explicit_f32(volatile const float* ptr, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 r;
-    r.i = ma_atomic_load_explicit_32((volatile const ma_uint32*)ptr, order);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_load_explicit_f64(volatile const double* ptr, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 r;
-    r.i = ma_atomic_load_explicit_64((volatile const ma_uint64*)ptr, order);
-    return r.f;
-}
-static MA_INLINE float ma_atomic_exchange_explicit_f32(volatile float* dst, float src, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 r;
-    ma_atomic_if32 x;
-    x.f = src;
-    r.i = ma_atomic_exchange_explicit_32((volatile ma_uint32*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_exchange_explicit_f64(volatile double* dst, double src, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 r;
-    ma_atomic_if64 x;
-    x.f = src;
-    r.i = ma_atomic_exchange_explicit_64((volatile ma_uint64*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_f32(volatile float* dst, float* expected, float desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-{
-    ma_atomic_if32 d;
-    d.f = desired;
-    return ma_atomic_compare_exchange_strong_explicit_32((volatile ma_uint32*)dst, (ma_uint32*)expected, d.i, successOrder, failureOrder);
-}
-static MA_INLINE ma_bool32 ma_atomic_compare_exchange_strong_explicit_f64(volatile double* dst, double* expected, double desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-{
-    ma_atomic_if64 d;
-    d.f = desired;
-    return ma_atomic_compare_exchange_strong_explicit_64((volatile ma_uint64*)dst, (ma_uint64*)expected, d.i, successOrder, failureOrder);
-}
-static MA_INLINE ma_bool32 ma_atomic_compare_exchange_weak_explicit_f32(volatile float* dst, float* expected, float desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-{
-    ma_atomic_if32 d;
-    d.f = desired;
-    return ma_atomic_compare_exchange_weak_explicit_32((volatile ma_uint32*)dst, (ma_uint32*)expected, d.i, successOrder, failureOrder);
-}
-static MA_INLINE ma_bool32 ma_atomic_compare_exchange_weak_explicit_f64(volatile double* dst, double* expected, double desired, ma_atomic_memory_order successOrder, ma_atomic_memory_order failureOrder)
-{
-    ma_atomic_if64 d;
-    d.f = desired;
-    return ma_atomic_compare_exchange_weak_explicit_64((volatile ma_uint64*)dst, (ma_uint64*)expected, d.i, successOrder, failureOrder);
-}
-static MA_INLINE float ma_atomic_fetch_add_explicit_f32(volatile float* dst, float src, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 r;
-    ma_atomic_if32 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_add_explicit_32((volatile ma_uint32*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_fetch_add_explicit_f64(volatile double* dst, double src, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 r;
-    ma_atomic_if64 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_add_explicit_64((volatile ma_uint64*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE float ma_atomic_fetch_sub_explicit_f32(volatile float* dst, float src, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 r;
-    ma_atomic_if32 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_sub_explicit_32((volatile ma_uint32*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_fetch_sub_explicit_f64(volatile double* dst, double src, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 r;
-    ma_atomic_if64 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_sub_explicit_64((volatile ma_uint64*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE float ma_atomic_fetch_or_explicit_f32(volatile float* dst, float src, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 r;
-    ma_atomic_if32 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_or_explicit_32((volatile ma_uint32*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_fetch_or_explicit_f64(volatile double* dst, double src, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 r;
-    ma_atomic_if64 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_or_explicit_64((volatile ma_uint64*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE float ma_atomic_fetch_xor_explicit_f32(volatile float* dst, float src, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 r;
-    ma_atomic_if32 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_xor_explicit_32((volatile ma_uint32*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_fetch_xor_explicit_f64(volatile double* dst, double src, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 r;
-    ma_atomic_if64 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_xor_explicit_64((volatile ma_uint64*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE float ma_atomic_fetch_and_explicit_f32(volatile float* dst, float src, ma_atomic_memory_order order)
-{
-    ma_atomic_if32 r;
-    ma_atomic_if32 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_and_explicit_32((volatile ma_uint32*)dst, x.i, order);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_fetch_and_explicit_f64(volatile double* dst, double src, ma_atomic_memory_order order)
-{
-    ma_atomic_if64 r;
-    ma_atomic_if64 x;
-    x.f = src;
-    r.i = ma_atomic_fetch_and_explicit_64((volatile ma_uint64*)dst, x.i, order);
-    return r.f;
-}
-#define ma_atomic_clear_f32(ptr)                                        (float )ma_atomic_clear_explicit_f32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_clear_f64(ptr)                                        (double)ma_atomic_clear_explicit_f64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_f32(dst, src)                                   ma_atomic_store_explicit_f32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_store_f64(dst, src)                                   ma_atomic_store_explicit_f64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_f32(ptr)                                         (float )ma_atomic_load_explicit_f32(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_load_f64(ptr)                                         (double)ma_atomic_load_explicit_f64(ptr, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_f32(dst, src)                                (float )ma_atomic_exchange_explicit_f32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_exchange_f64(dst, src)                                (double)ma_atomic_exchange_explicit_f64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_f32(dst, expected, desired)   ma_atomic_compare_exchange_strong_explicit_f32(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_strong_f64(dst, expected, desired)   ma_atomic_compare_exchange_strong_explicit_f64(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_f32(dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_f32(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_compare_exchange_weak_f64(dst, expected, desired)     ma_atomic_compare_exchange_weak_explicit_f64(dst, expected, desired, ma_atomic_memory_order_seq_cst, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_f32(dst, src)                               ma_atomic_fetch_add_explicit_f32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_add_f64(dst, src)                               ma_atomic_fetch_add_explicit_f64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_f32(dst, src)                               ma_atomic_fetch_sub_explicit_f32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_sub_f64(dst, src)                               ma_atomic_fetch_sub_explicit_f64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_f32(dst, src)                                ma_atomic_fetch_or_explicit_f32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_or_f64(dst, src)                                ma_atomic_fetch_or_explicit_f64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_f32(dst, src)                               ma_atomic_fetch_xor_explicit_f32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_xor_f64(dst, src)                               ma_atomic_fetch_xor_explicit_f64(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_f32(dst, src)                               ma_atomic_fetch_and_explicit_f32(dst, src, ma_atomic_memory_order_seq_cst)
-#define ma_atomic_fetch_and_f64(dst, src)                               ma_atomic_fetch_and_explicit_f64(dst, src, ma_atomic_memory_order_seq_cst)
-static MA_INLINE float ma_atomic_compare_and_swap_f32(volatile float* dst, float expected, float desired)
-{
-    ma_atomic_if32 r;
-    ma_atomic_if32 e, d;
-    e.f = expected;
-    d.f = desired;
-    r.i = ma_atomic_compare_and_swap_32((volatile ma_uint32*)dst, e.i, d.i);
-    return r.f;
-}
-static MA_INLINE double ma_atomic_compare_and_swap_f64(volatile double* dst, double expected, double desired)
-{
-    ma_atomic_if64 r;
-    ma_atomic_if64 e, d;
-    e.f = expected;
-    d.f = desired;
-    r.i = ma_atomic_compare_and_swap_64((volatile ma_uint64*)dst, e.i, d.i);
-    return r.f;
-}
-typedef ma_atomic_flag ma_atomic_spinlock;
-static MA_INLINE void ma_atomic_spinlock_lock(volatile ma_atomic_spinlock* pSpinlock)
-{
-    for (;;) {
-        if (ma_atomic_flag_test_and_set_explicit(pSpinlock, ma_atomic_memory_order_acquire) == 0) {
-            break;
-        }
-        while (c89atoimc_flag_load_explicit(pSpinlock, ma_atomic_memory_order_relaxed) == 1) {
-        }
-    }
-}
-static MA_INLINE void ma_atomic_spinlock_unlock(volatile ma_atomic_spinlock* pSpinlock)
-{
-    ma_atomic_flag_clear_explicit(pSpinlock, ma_atomic_memory_order_release);
-}
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic pop
-#endif
-#if defined(__cplusplus)
-}
-#endif
-#endif
-/* ma_atomic.h end */
-
-#define MA_ATOMIC_SAFE_TYPE_IMPL(c89TypeExtension, type) \
-    static MA_INLINE ma_##type ma_atomic_##type##_get(ma_atomic_##type* x) \
-    { \
-        return (ma_##type)ma_atomic_load_##c89TypeExtension(&x->value); \
-    } \
-    static MA_INLINE void ma_atomic_##type##_set(ma_atomic_##type* x, ma_##type value) \
-    { \
-        ma_atomic_store_##c89TypeExtension(&x->value, value); \
-    } \
-    static MA_INLINE ma_##type ma_atomic_##type##_exchange(ma_atomic_##type* x, ma_##type value) \
-    { \
-        return (ma_##type)ma_atomic_exchange_##c89TypeExtension(&x->value, value); \
-    } \
-    static MA_INLINE ma_bool32 ma_atomic_##type##_compare_exchange(ma_atomic_##type* x, ma_##type* expected, ma_##type desired) \
-    { \
-        return ma_atomic_compare_exchange_weak_##c89TypeExtension(&x->value, expected, desired); \
-    } \
-    static MA_INLINE ma_##type ma_atomic_##type##_fetch_add(ma_atomic_##type* x, ma_##type y) \
-    { \
-        return (ma_##type)ma_atomic_fetch_add_##c89TypeExtension(&x->value, y); \
-    } \
-    static MA_INLINE ma_##type ma_atomic_##type##_fetch_sub(ma_atomic_##type* x, ma_##type y) \
-    { \
-        return (ma_##type)ma_atomic_fetch_sub_##c89TypeExtension(&x->value, y); \
-    } \
-    static MA_INLINE ma_##type ma_atomic_##type##_fetch_or(ma_atomic_##type* x, ma_##type y) \
-    { \
-        return (ma_##type)ma_atomic_fetch_or_##c89TypeExtension(&x->value, y); \
-    } \
-    static MA_INLINE ma_##type ma_atomic_##type##_fetch_xor(ma_atomic_##type* x, ma_##type y) \
-    { \
-        return (ma_##type)ma_atomic_fetch_xor_##c89TypeExtension(&x->value, y); \
-    } \
-    static MA_INLINE ma_##type ma_atomic_##type##_fetch_and(ma_atomic_##type* x, ma_##type y) \
-    { \
-        return (ma_##type)ma_atomic_fetch_and_##c89TypeExtension(&x->value, y); \
-    } \
-    static MA_INLINE ma_##type ma_atomic_##type##_compare_and_swap(ma_atomic_##type* x, ma_##type expected, ma_##type desired) \
-    { \
-        return (ma_##type)ma_atomic_compare_and_swap_##c89TypeExtension(&x->value, expected, desired); \
-    } \
-
-#define MA_ATOMIC_SAFE_TYPE_IMPL_PTR(type) \
-    static MA_INLINE ma_##type* ma_atomic_ptr_##type##_get(ma_atomic_ptr_##type* x) \
-    { \
-        return ma_atomic_load_ptr((void**)&x->value); \
-    } \
-    static MA_INLINE void ma_atomic_ptr_##type##_set(ma_atomic_ptr_##type* x, ma_##type* value) \
-    { \
-        ma_atomic_store_ptr((void**)&x->value, (void*)value); \
-    } \
-    static MA_INLINE ma_##type* ma_atomic_ptr_##type##_exchange(ma_atomic_ptr_##type* x, ma_##type* value) \
-    { \
-        return ma_atomic_exchange_ptr((void**)&x->value, (void*)value); \
-    } \
-    static MA_INLINE ma_bool32 ma_atomic_ptr_##type##_compare_exchange(ma_atomic_ptr_##type* x, ma_##type** expected, ma_##type* desired) \
-    { \
-        return ma_atomic_compare_exchange_weak_ptr((void**)&x->value, (void*)expected, (void*)desired); \
-    } \
-    static MA_INLINE ma_##type* ma_atomic_ptr_##type##_compare_and_swap(ma_atomic_ptr_##type* x, ma_##type* expected, ma_##type* desired) \
-    { \
-        return (ma_##type*)ma_atomic_compare_and_swap_ptr((void**)&x->value, (void*)expected, (void*)desired); \
-    } \
-
-MA_ATOMIC_SAFE_TYPE_IMPL(32,  uint32)
-MA_ATOMIC_SAFE_TYPE_IMPL(i32, int32)
-MA_ATOMIC_SAFE_TYPE_IMPL(64,  uint64)
-MA_ATOMIC_SAFE_TYPE_IMPL(f32, float)
-MA_ATOMIC_SAFE_TYPE_IMPL(32,  bool32)
-
-#if !defined(MA_NO_DEVICE_IO)
-MA_ATOMIC_SAFE_TYPE_IMPL(i32, device_state)
-#endif
-
-
-MA_API ma_uint64 ma_calculate_frame_count_after_resampling(ma_uint32 sampleRateOut, ma_uint32 sampleRateIn, ma_uint64 frameCountIn)
-{
-    /* This is based on the calculation in ma_linear_resampler_get_expected_output_frame_count(). */
-    ma_uint64 outputFrameCount;
-    ma_uint64 preliminaryInputFrameCountFromFrac;
-    ma_uint64 preliminaryInputFrameCount;
-
-    if (sampleRateIn == 0 || sampleRateOut == 0 || frameCountIn == 0) {
-        return 0;
-    }
-
-    if (sampleRateOut == sampleRateIn) {
-        return frameCountIn;
-    }
-
-    outputFrameCount = (frameCountIn * sampleRateOut) / sampleRateIn;
-
-    preliminaryInputFrameCountFromFrac = (outputFrameCount * (sampleRateIn / sampleRateOut)) / sampleRateOut;
-    preliminaryInputFrameCount         = (outputFrameCount * (sampleRateIn % sampleRateOut)) + preliminaryInputFrameCountFromFrac;
-
-    if (preliminaryInputFrameCount <= frameCountIn) {
-        outputFrameCount += 1;
-    }
-
-    return outputFrameCount;
-}
-
-#ifndef MA_DATA_CONVERTER_STACK_BUFFER_SIZE
-#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE     4096
-#endif
-
-
-
-#if defined(MA_WIN32)
-static ma_result ma_result_from_GetLastError(DWORD error)
-{
-    switch (error)
-    {
-        case ERROR_SUCCESS:             return MA_SUCCESS;
-        case ERROR_PATH_NOT_FOUND:      return MA_DOES_NOT_EXIST;
-        case ERROR_TOO_MANY_OPEN_FILES: return MA_TOO_MANY_OPEN_FILES;
-        case ERROR_NOT_ENOUGH_MEMORY:   return MA_OUT_OF_MEMORY;
-        case ERROR_DISK_FULL:           return MA_NO_SPACE;
-        case ERROR_HANDLE_EOF:          return MA_AT_END;
-        case ERROR_NEGATIVE_SEEK:       return MA_BAD_SEEK;
-        case ERROR_INVALID_PARAMETER:   return MA_INVALID_ARGS;
-        case ERROR_ACCESS_DENIED:       return MA_ACCESS_DENIED;
-        case ERROR_SEM_TIMEOUT:         return MA_TIMEOUT;
-        case ERROR_FILE_NOT_FOUND:      return MA_DOES_NOT_EXIST;
-        default: break;
-    }
-
-    return MA_ERROR;
-}
-#endif  /* MA_WIN32 */
-
-
-/*******************************************************************************
-
-Threading
-
-*******************************************************************************/
-static MA_INLINE ma_result ma_spinlock_lock_ex(volatile ma_spinlock* pSpinlock, ma_bool32 yield)
-{
-    if (pSpinlock == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (;;) {
-        if (ma_atomic_exchange_explicit_32(pSpinlock, 1, ma_atomic_memory_order_acquire) == 0) {
-            break;
-        }
-
-        while (ma_atomic_load_explicit_32(pSpinlock, ma_atomic_memory_order_relaxed) == 1) {
-            if (yield) {
-                ma_yield();
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_spinlock_lock(volatile ma_spinlock* pSpinlock)
-{
-    return ma_spinlock_lock_ex(pSpinlock, MA_TRUE);
-}
-
-MA_API ma_result ma_spinlock_lock_noyield(volatile ma_spinlock* pSpinlock)
-{
-    return ma_spinlock_lock_ex(pSpinlock, MA_FALSE);
-}
-
-MA_API ma_result ma_spinlock_unlock(volatile ma_spinlock* pSpinlock)
-{
-    if (pSpinlock == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_atomic_store_explicit_32(pSpinlock, 0, ma_atomic_memory_order_release);
-    return MA_SUCCESS;
-}
-
-
-#ifndef MA_NO_THREADING
-#if defined(MA_POSIX)
-    #define MA_THREADCALL
-    typedef void* ma_thread_result;
-#elif defined(MA_WIN32)
-    #define MA_THREADCALL WINAPI
-    typedef unsigned long ma_thread_result;
-#endif
-
-typedef ma_thread_result (MA_THREADCALL * ma_thread_entry_proc)(void* pData);
-
-#ifdef MA_POSIX
-static ma_result ma_thread_create__posix(ma_thread* pThread, ma_thread_priority priority, size_t stackSize, ma_thread_entry_proc entryProc, void* pData)
-{
-    int result;
-    pthread_attr_t* pAttr = NULL;
-
-#if !defined(__EMSCRIPTEN__)
-    /* Try setting the thread priority. It's not critical if anything fails here. */
-    pthread_attr_t attr;
-    if (pthread_attr_init(&attr) == 0) {
-        int scheduler = -1;
-
-        /* We successfully initialized our attributes object so we can assign the pointer so it's passed into pthread_create(). */
-        pAttr = &attr;
-
-        /* We need to set the scheduler policy. Only do this if the OS supports pthread_attr_setschedpolicy() */
-        #if !defined(MA_BEOS)
-        {
-            if (priority == ma_thread_priority_idle) {
-            #ifdef SCHED_IDLE
-                if (pthread_attr_setschedpolicy(&attr, SCHED_IDLE) == 0) {
-                    scheduler = SCHED_IDLE;
-                }
-            #endif
-            } else if (priority == ma_thread_priority_realtime) {
-            #ifdef SCHED_FIFO
-                if (pthread_attr_setschedpolicy(&attr, SCHED_FIFO) == 0) {
-                    scheduler = SCHED_FIFO;
-                }
-            #endif
-            #ifdef MA_LINUX
-            } else {
-                scheduler = sched_getscheduler(0);
-            #endif
-            }
-        }
-        #endif
-
-        if (stackSize > 0) {
-            pthread_attr_setstacksize(&attr, stackSize);
-        }
-
-        if (scheduler != -1) {
-            int priorityMin = sched_get_priority_min(scheduler);
-            int priorityMax = sched_get_priority_max(scheduler);
-            int priorityStep = (priorityMax - priorityMin) / 7;  /* 7 = number of priorities supported by miniaudio. */
-
-            struct sched_param sched;
-            if (pthread_attr_getschedparam(&attr, &sched) == 0) {
-                if (priority == ma_thread_priority_idle) {
-                    sched.sched_priority = priorityMin;
-                } else if (priority == ma_thread_priority_realtime) {
-                    sched.sched_priority = priorityMax;
-                } else {
-                    sched.sched_priority += ((int)priority + 5) * priorityStep;  /* +5 because the lowest priority is -5. */
-                    if (sched.sched_priority < priorityMin) {
-                        sched.sched_priority = priorityMin;
-                    }
-                    if (sched.sched_priority > priorityMax) {
-                        sched.sched_priority = priorityMax;
-                    }
-                }
-
-                /* I'm not treating a failure of setting the priority as a critical error so not checking the return value here. */
-                pthread_attr_setschedparam(&attr, &sched);
-            }
-        }
-    }
-#else
-    /* It's the emscripten build. We'll have a few unused parameters. */
-    (void)priority;
-    (void)stackSize;
-#endif
-
-    result = pthread_create((pthread_t*)pThread, pAttr, entryProc, pData);
-
-    /* The thread attributes object is no longer required. */
-    if (pAttr != NULL) {
-        pthread_attr_destroy(pAttr);
-    }
-
-    if (result != 0) {
-        return ma_result_from_errno(result);
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_thread_wait__posix(ma_thread* pThread)
-{
-    pthread_join((pthread_t)*pThread, NULL);
-}
-
-
-static ma_result ma_mutex_init__posix(ma_mutex* pMutex)
-{
-    int result;
-    
-    if (pMutex == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pMutex);
-
-    result = pthread_mutex_init((pthread_mutex_t*)pMutex, NULL);
-    if (result != 0) {
-        return ma_result_from_errno(result);
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_mutex_uninit__posix(ma_mutex* pMutex)
-{
-    pthread_mutex_destroy((pthread_mutex_t*)pMutex);
-}
-
-static void ma_mutex_lock__posix(ma_mutex* pMutex)
-{
-    pthread_mutex_lock((pthread_mutex_t*)pMutex);
-}
-
-static void ma_mutex_unlock__posix(ma_mutex* pMutex)
-{
-    pthread_mutex_unlock((pthread_mutex_t*)pMutex);
-}
-
-
-static ma_result ma_event_init__posix(ma_event* pEvent)
-{
-    int result;
-
-    result = pthread_mutex_init((pthread_mutex_t*)&pEvent->lock, NULL);
-    if (result != 0) {
-        return ma_result_from_errno(result);
-    }
-
-    result = pthread_cond_init((pthread_cond_t*)&pEvent->cond, NULL);
-    if (result != 0) {
-        pthread_mutex_destroy((pthread_mutex_t*)&pEvent->lock);
-        return ma_result_from_errno(result);
-    }
-
-    pEvent->value = 0;
-    return MA_SUCCESS;
-}
-
-static void ma_event_uninit__posix(ma_event* pEvent)
-{
-    pthread_cond_destroy((pthread_cond_t*)&pEvent->cond);
-    pthread_mutex_destroy((pthread_mutex_t*)&pEvent->lock);
-}
-
-static ma_result ma_event_wait__posix(ma_event* pEvent)
-{
-    pthread_mutex_lock((pthread_mutex_t*)&pEvent->lock);
-    {
-        while (pEvent->value == 0) {
-            pthread_cond_wait((pthread_cond_t*)&pEvent->cond, (pthread_mutex_t*)&pEvent->lock);
-        }
-        pEvent->value = 0;  /* Auto-reset. */
-    }
-    pthread_mutex_unlock((pthread_mutex_t*)&pEvent->lock);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_event_signal__posix(ma_event* pEvent)
-{
-    pthread_mutex_lock((pthread_mutex_t*)&pEvent->lock);
-    {
-        pEvent->value = 1;
-        pthread_cond_signal((pthread_cond_t*)&pEvent->cond);
-    }
-    pthread_mutex_unlock((pthread_mutex_t*)&pEvent->lock);
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_semaphore_init__posix(int initialValue, ma_semaphore* pSemaphore)
-{
-    int result;
-
-    if (pSemaphore == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pSemaphore->value = initialValue;
-
-    result = pthread_mutex_init((pthread_mutex_t*)&pSemaphore->lock, NULL);
-    if (result != 0) {
-        return ma_result_from_errno(result);  /* Failed to create mutex. */
-    }
-
-    result = pthread_cond_init((pthread_cond_t*)&pSemaphore->cond, NULL);
-    if (result != 0) {
-        pthread_mutex_destroy((pthread_mutex_t*)&pSemaphore->lock);
-        return ma_result_from_errno(result);  /* Failed to create condition variable. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_semaphore_uninit__posix(ma_semaphore* pSemaphore)
-{
-    if (pSemaphore == NULL) {
-        return;
-    }
-
-    pthread_cond_destroy((pthread_cond_t*)&pSemaphore->cond);
-    pthread_mutex_destroy((pthread_mutex_t*)&pSemaphore->lock);
-}
-
-static ma_result ma_semaphore_wait__posix(ma_semaphore* pSemaphore)
-{
-    if (pSemaphore == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pthread_mutex_lock((pthread_mutex_t*)&pSemaphore->lock);
-    {
-        /* We need to wait on a condition variable before escaping. We can't return from this function until the semaphore has been signaled. */
-        while (pSemaphore->value == 0) {
-            pthread_cond_wait((pthread_cond_t*)&pSemaphore->cond, (pthread_mutex_t*)&pSemaphore->lock);
-        }
-
-        pSemaphore->value -= 1;
-    }
-    pthread_mutex_unlock((pthread_mutex_t*)&pSemaphore->lock);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_semaphore_release__posix(ma_semaphore* pSemaphore)
-{
-    if (pSemaphore == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pthread_mutex_lock((pthread_mutex_t*)&pSemaphore->lock);
-    {
-        pSemaphore->value += 1;
-        pthread_cond_signal((pthread_cond_t*)&pSemaphore->cond);
-    }
-    pthread_mutex_unlock((pthread_mutex_t*)&pSemaphore->lock);
-
-    return MA_SUCCESS;
-}
-#elif defined(MA_WIN32)
-static int ma_thread_priority_to_win32(ma_thread_priority priority)
-{
-    switch (priority) {
-        case ma_thread_priority_idle:     return THREAD_PRIORITY_IDLE;
-        case ma_thread_priority_lowest:   return THREAD_PRIORITY_LOWEST;
-        case ma_thread_priority_low:      return THREAD_PRIORITY_BELOW_NORMAL;
-        case ma_thread_priority_normal:   return THREAD_PRIORITY_NORMAL;
-        case ma_thread_priority_high:     return THREAD_PRIORITY_ABOVE_NORMAL;
-        case ma_thread_priority_highest:  return THREAD_PRIORITY_HIGHEST;
-        case ma_thread_priority_realtime: return THREAD_PRIORITY_TIME_CRITICAL;
-        default:                          return THREAD_PRIORITY_NORMAL;
-    }
-}
-
-static ma_result ma_thread_create__win32(ma_thread* pThread, ma_thread_priority priority, size_t stackSize, ma_thread_entry_proc entryProc, void* pData)
-{
-    DWORD threadID; /* Not used. Only used for passing into CreateThread() so it doesn't fail on Windows 98. */
-
-    *pThread = CreateThread(NULL, stackSize, entryProc, pData, 0, &threadID);
-    if (*pThread == NULL) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    SetThreadPriority((HANDLE)*pThread, ma_thread_priority_to_win32(priority));
-
-    return MA_SUCCESS;
-}
-
-static void ma_thread_wait__win32(ma_thread* pThread)
-{
-    WaitForSingleObject((HANDLE)*pThread, INFINITE);
-    CloseHandle((HANDLE)*pThread);
-}
-
-
-static ma_result ma_mutex_init__win32(ma_mutex* pMutex)
-{
-    *pMutex = CreateEventA(NULL, FALSE, TRUE, NULL);
-    if (*pMutex == NULL) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_mutex_uninit__win32(ma_mutex* pMutex)
-{
-    CloseHandle((HANDLE)*pMutex);
-}
-
-static void ma_mutex_lock__win32(ma_mutex* pMutex)
-{
-    WaitForSingleObject((HANDLE)*pMutex, INFINITE);
-}
-
-static void ma_mutex_unlock__win32(ma_mutex* pMutex)
-{
-    SetEvent((HANDLE)*pMutex);
-}
-
-
-static ma_result ma_event_init__win32(ma_event* pEvent)
-{
-    *pEvent = CreateEventA(NULL, FALSE, FALSE, NULL);
-    if (*pEvent == NULL) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_event_uninit__win32(ma_event* pEvent)
-{
-    CloseHandle((HANDLE)*pEvent);
-}
-
-static ma_result ma_event_wait__win32(ma_event* pEvent)
-{
-    DWORD result = WaitForSingleObject((HANDLE)*pEvent, INFINITE);
-    if (result == WAIT_OBJECT_0) {
-        return MA_SUCCESS;
-    }
-
-    if (result == WAIT_TIMEOUT) {
-        return MA_TIMEOUT;
-    }
-
-    return ma_result_from_GetLastError(GetLastError());
-}
-
-static ma_result ma_event_signal__win32(ma_event* pEvent)
-{
-    BOOL result = SetEvent((HANDLE)*pEvent);
-    if (result == 0) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_semaphore_init__win32(int initialValue, ma_semaphore* pSemaphore)
-{
-    *pSemaphore = CreateSemaphoreW(NULL, (LONG)initialValue, LONG_MAX, NULL);
-    if (*pSemaphore == NULL) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_semaphore_uninit__win32(ma_semaphore* pSemaphore)
-{
-    CloseHandle((HANDLE)*pSemaphore);
-}
-
-static ma_result ma_semaphore_wait__win32(ma_semaphore* pSemaphore)
-{
-    DWORD result = WaitForSingleObject((HANDLE)*pSemaphore, INFINITE);
-    if (result == WAIT_OBJECT_0) {
-        return MA_SUCCESS;
-    }
-
-    if (result == WAIT_TIMEOUT) {
-        return MA_TIMEOUT;
-    }
-
-    return ma_result_from_GetLastError(GetLastError());
-}
-
-static ma_result ma_semaphore_release__win32(ma_semaphore* pSemaphore)
-{
-    BOOL result = ReleaseSemaphore((HANDLE)*pSemaphore, 1, NULL);
-    if (result == 0) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-#endif
-
-typedef struct
-{
-    ma_thread_entry_proc entryProc;
-    void* pData;
-    ma_allocation_callbacks allocationCallbacks;
-} ma_thread_proxy_data;
-
-static ma_thread_result MA_THREADCALL ma_thread_entry_proxy(void* pData)
-{
-    ma_thread_proxy_data* pProxyData = (ma_thread_proxy_data*)pData;
-    ma_thread_entry_proc entryProc;
-    void* pEntryProcData;
-    ma_thread_result result;
-
-    #if defined(MA_ON_THREAD_ENTRY)
-        MA_ON_THREAD_ENTRY
-    #endif
-
-    entryProc = pProxyData->entryProc;
-    pEntryProcData = pProxyData->pData;
-
-    /* Free the proxy data before getting into the real thread entry proc. */
-    ma_free(pProxyData, &pProxyData->allocationCallbacks);
-
-    result = entryProc(pEntryProcData);
-
-    #if defined(MA_ON_THREAD_EXIT)
-        MA_ON_THREAD_EXIT
-    #endif
-
-    return result;
-}
-
-static ma_result ma_thread_create(ma_thread* pThread, ma_thread_priority priority, size_t stackSize, ma_thread_entry_proc entryProc, void* pData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_result result;
-    ma_thread_proxy_data* pProxyData;
-
-    if (pThread == NULL || entryProc == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pProxyData = (ma_thread_proxy_data*)ma_malloc(sizeof(*pProxyData), pAllocationCallbacks);   /* Will be freed by the proxy entry proc. */
-    if (pProxyData == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-#if defined(MA_THREAD_DEFAULT_STACK_SIZE)
-    if (stackSize == 0) {
-        stackSize = MA_THREAD_DEFAULT_STACK_SIZE;
-    }
-#endif
-
-    pProxyData->entryProc = entryProc;
-    pProxyData->pData     = pData;
-    ma_allocation_callbacks_init_copy(&pProxyData->allocationCallbacks, pAllocationCallbacks);
-
-#if defined(MA_POSIX)
-    result = ma_thread_create__posix(pThread, priority, stackSize, ma_thread_entry_proxy, pProxyData);
-#elif defined(MA_WIN32)
-    result = ma_thread_create__win32(pThread, priority, stackSize, ma_thread_entry_proxy, pProxyData);
-#endif
-
-    if (result != MA_SUCCESS) {
-        ma_free(pProxyData, pAllocationCallbacks);
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_thread_wait(ma_thread* pThread)
-{
-    if (pThread == NULL) {
-        return;
-    }
-
-#if defined(MA_POSIX)
-    ma_thread_wait__posix(pThread);
-#elif defined(MA_WIN32)
-    ma_thread_wait__win32(pThread);
-#endif
-}
-
-
-MA_API ma_result ma_mutex_init(ma_mutex* pMutex)
-{
-    if (pMutex == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_POSIX)
-    return ma_mutex_init__posix(pMutex);
-#elif defined(MA_WIN32)
-    return ma_mutex_init__win32(pMutex);
-#endif
-}
-
-MA_API void ma_mutex_uninit(ma_mutex* pMutex)
-{
-    if (pMutex == NULL) {
-        return;
-    }
-
-#if defined(MA_POSIX)
-    ma_mutex_uninit__posix(pMutex);
-#elif defined(MA_WIN32)
-    ma_mutex_uninit__win32(pMutex);
-#endif
-}
-
-MA_API void ma_mutex_lock(ma_mutex* pMutex)
-{
-    if (pMutex == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return;
-    }
-
-#if defined(MA_POSIX)
-    ma_mutex_lock__posix(pMutex);
-#elif defined(MA_WIN32)
-    ma_mutex_lock__win32(pMutex);
-#endif
-}
-
-MA_API void ma_mutex_unlock(ma_mutex* pMutex)
-{
-    if (pMutex == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return;
-    }
-
-#if defined(MA_POSIX)
-    ma_mutex_unlock__posix(pMutex);
-#elif defined(MA_WIN32)
-    ma_mutex_unlock__win32(pMutex);
-#endif
-}
-
-
-MA_API ma_result ma_event_init(ma_event* pEvent)
-{
-    if (pEvent == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_POSIX)
-    return ma_event_init__posix(pEvent);
-#elif defined(MA_WIN32)
-    return ma_event_init__win32(pEvent);
-#endif
-}
-
-#if 0
-static ma_result ma_event_alloc_and_init(ma_event** ppEvent, ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_result result;
-    ma_event* pEvent;
-
-    if (ppEvent == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *ppEvent = NULL;
-
-    pEvent = ma_malloc(sizeof(*pEvent), pAllocationCallbacks);
-    if (pEvent == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_event_init(pEvent);
-    if (result != MA_SUCCESS) {
-        ma_free(pEvent, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppEvent = pEvent;
-    return result;
-}
-#endif
-
-MA_API void ma_event_uninit(ma_event* pEvent)
-{
-    if (pEvent == NULL) {
-        return;
-    }
-
-#if defined(MA_POSIX)
-    ma_event_uninit__posix(pEvent);
-#elif defined(MA_WIN32)
-    ma_event_uninit__win32(pEvent);
-#endif
-}
-
-#if 0
-static void ma_event_uninit_and_free(ma_event* pEvent, ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pEvent == NULL) {
-        return;
-    }
-
-    ma_event_uninit(pEvent);
-    ma_free(pEvent, pAllocationCallbacks);
-}
-#endif
-
-MA_API ma_result ma_event_wait(ma_event* pEvent)
-{
-    if (pEvent == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert to the caller is aware of this bug. */
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_POSIX)
-    return ma_event_wait__posix(pEvent);
-#elif defined(MA_WIN32)
-    return ma_event_wait__win32(pEvent);
-#endif
-}
-
-MA_API ma_result ma_event_signal(ma_event* pEvent)
-{
-    if (pEvent == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert to the caller is aware of this bug. */
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_POSIX)
-    return ma_event_signal__posix(pEvent);
-#elif defined(MA_WIN32)
-    return ma_event_signal__win32(pEvent);
-#endif
-}
-
-
-MA_API ma_result ma_semaphore_init(int initialValue, ma_semaphore* pSemaphore)
-{
-    if (pSemaphore == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_POSIX)
-    return ma_semaphore_init__posix(initialValue, pSemaphore);
-#elif defined(MA_WIN32)
-    return ma_semaphore_init__win32(initialValue, pSemaphore);
-#endif
-}
-
-MA_API void ma_semaphore_uninit(ma_semaphore* pSemaphore)
-{
-    if (pSemaphore == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return;
-    }
-
-#if defined(MA_POSIX)
-    ma_semaphore_uninit__posix(pSemaphore);
-#elif defined(MA_WIN32)
-    ma_semaphore_uninit__win32(pSemaphore);
-#endif
-}
-
-MA_API ma_result ma_semaphore_wait(ma_semaphore* pSemaphore)
-{
-    if (pSemaphore == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_POSIX)
-    return ma_semaphore_wait__posix(pSemaphore);
-#elif defined(MA_WIN32)
-    return ma_semaphore_wait__win32(pSemaphore);
-#endif
-}
-
-MA_API ma_result ma_semaphore_release(ma_semaphore* pSemaphore)
-{
-    if (pSemaphore == NULL) {
-        MA_ASSERT(MA_FALSE);    /* Fire an assert so the caller is aware of this bug. */
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_POSIX)
-    return ma_semaphore_release__posix(pSemaphore);
-#elif defined(MA_WIN32)
-    return ma_semaphore_release__win32(pSemaphore);
-#endif
-}
-#else
-/* MA_NO_THREADING is set which means threading is disabled. Threading is required by some API families. If any of these are enabled we need to throw an error. */
-#ifndef MA_NO_DEVICE_IO
-#error "MA_NO_THREADING cannot be used without MA_NO_DEVICE_IO";
-#endif
-#endif  /* MA_NO_THREADING */
-
-
-
-#define MA_FENCE_COUNTER_MAX    0x7FFFFFFF
-
-MA_API ma_result ma_fence_init(ma_fence* pFence)
-{
-    if (pFence == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pFence);
-    pFence->counter = 0;
-
-    #ifndef MA_NO_THREADING
-    {
-        ma_result result;
-
-        result = ma_event_init(&pFence->e);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-    #endif
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_fence_uninit(ma_fence* pFence)
-{
-    if (pFence == NULL) {
-        return;
-    }
-
-    #ifndef MA_NO_THREADING
-    {
-        ma_event_uninit(&pFence->e);
-    }
-    #endif
-
-    MA_ZERO_OBJECT(pFence);
-}
-
-MA_API ma_result ma_fence_acquire(ma_fence* pFence)
-{
-    if (pFence == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (;;) {
-        ma_uint32 oldCounter = ma_atomic_load_32(&pFence->counter);
-        ma_uint32 newCounter = oldCounter + 1;
-
-        /* Make sure we're not about to exceed our maximum value. */
-        if (newCounter > MA_FENCE_COUNTER_MAX) {
-            MA_ASSERT(MA_FALSE);
-            return MA_OUT_OF_RANGE;
-        }
-
-        if (ma_atomic_compare_exchange_weak_32(&pFence->counter, &oldCounter, newCounter)) {
-            return MA_SUCCESS;
-        } else {
-            if (oldCounter == MA_FENCE_COUNTER_MAX) {
-                MA_ASSERT(MA_FALSE);
-                return MA_OUT_OF_RANGE; /* The other thread took the last available slot. Abort. */
-            }
-        }
-    }
-
-    /* Should never get here. */
-    /*return MA_SUCCESS;*/
-}
-
-MA_API ma_result ma_fence_release(ma_fence* pFence)
-{
-    if (pFence == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (;;) {
-        ma_uint32 oldCounter = ma_atomic_load_32(&pFence->counter);
-        ma_uint32 newCounter = oldCounter - 1;
-
-        if (oldCounter == 0) {
-            MA_ASSERT(MA_FALSE);
-            return MA_INVALID_OPERATION;    /* Acquire/release mismatch. */
-        }
-
-        if (ma_atomic_compare_exchange_weak_32(&pFence->counter, &oldCounter, newCounter)) {
-            #ifndef MA_NO_THREADING
-            {
-                if (newCounter == 0) {
-                    ma_event_signal(&pFence->e);    /* <-- ma_fence_wait() will be waiting on this. */
-                }
-            }
-            #endif
-
-            return MA_SUCCESS;
-        } else {
-            if (oldCounter == 0) {
-                MA_ASSERT(MA_FALSE);
-                return MA_INVALID_OPERATION;    /* Another thread has taken the 0 slot. Acquire/release mismatch. */
-            }
-        }
-    }
-
-    /* Should never get here. */
-    /*return MA_SUCCESS;*/
-}
-
-MA_API ma_result ma_fence_wait(ma_fence* pFence)
-{
-    if (pFence == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (;;) {
-        ma_uint32 counter;
-
-        counter = ma_atomic_load_32(&pFence->counter);
-        if (counter == 0) {
-            /*
-            Counter has hit zero. By the time we get here some other thread may have acquired the
-            fence again, but that is where the caller needs to take care with how they se the fence.
-            */
-            return MA_SUCCESS;
-        }
-
-        /* Getting here means the counter is > 0. We'll need to wait for something to happen. */
-        #ifndef MA_NO_THREADING
-        {
-            ma_result result;
-
-            result = ma_event_wait(&pFence->e);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-        }
-        #endif
-    }
-
-    /* Should never get here. */
-    /*return MA_INVALID_OPERATION;*/
-}
-
-
-MA_API ma_result ma_async_notification_signal(ma_async_notification* pNotification)
-{
-    ma_async_notification_callbacks* pNotificationCallbacks = (ma_async_notification_callbacks*)pNotification;
-
-    if (pNotification == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pNotificationCallbacks->onSignal == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    pNotificationCallbacks->onSignal(pNotification);
-    return MA_INVALID_ARGS;
-}
-
-
-static void ma_async_notification_poll__on_signal(ma_async_notification* pNotification)
-{
-    ((ma_async_notification_poll*)pNotification)->signalled = MA_TRUE;
-}
-
-MA_API ma_result ma_async_notification_poll_init(ma_async_notification_poll* pNotificationPoll)
-{
-    if (pNotificationPoll == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pNotificationPoll->cb.onSignal = ma_async_notification_poll__on_signal;
-    pNotificationPoll->signalled = MA_FALSE;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_bool32 ma_async_notification_poll_is_signalled(const ma_async_notification_poll* pNotificationPoll)
-{
-    if (pNotificationPoll == NULL) {
-        return MA_FALSE;
-    }
-
-    return pNotificationPoll->signalled;
-}
-
-
-static void ma_async_notification_event__on_signal(ma_async_notification* pNotification)
-{
-    ma_async_notification_event_signal((ma_async_notification_event*)pNotification);
-}
-
-MA_API ma_result ma_async_notification_event_init(ma_async_notification_event* pNotificationEvent)
-{
-    if (pNotificationEvent == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pNotificationEvent->cb.onSignal = ma_async_notification_event__on_signal;
-
-    #ifndef MA_NO_THREADING
-    {
-        ma_result result;
-
-        result = ma_event_init(&pNotificationEvent->e);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        return MA_NOT_IMPLEMENTED;  /* Threading is disabled. */
-    }
-    #endif
-}
-
-MA_API ma_result ma_async_notification_event_uninit(ma_async_notification_event* pNotificationEvent)
-{
-    if (pNotificationEvent == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #ifndef MA_NO_THREADING
-    {
-        ma_event_uninit(&pNotificationEvent->e);
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        return MA_NOT_IMPLEMENTED;  /* Threading is disabled. */
-    }
-    #endif
-}
-
-MA_API ma_result ma_async_notification_event_wait(ma_async_notification_event* pNotificationEvent)
-{
-    if (pNotificationEvent == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #ifndef MA_NO_THREADING
-    {
-        return ma_event_wait(&pNotificationEvent->e);
-    }
-    #else
-    {
-        return MA_NOT_IMPLEMENTED;  /* Threading is disabled. */
-    }
-    #endif
-}
-
-MA_API ma_result ma_async_notification_event_signal(ma_async_notification_event* pNotificationEvent)
-{
-    if (pNotificationEvent == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #ifndef MA_NO_THREADING
-    {
-        return ma_event_signal(&pNotificationEvent->e);
-    }
-    #else
-    {
-        return MA_NOT_IMPLEMENTED;  /* Threading is disabled. */
-    }
-    #endif
-}
-
-
-
-/************************************************************************************************************************************************************
-
-Job Queue
-
-************************************************************************************************************************************************************/
-MA_API ma_slot_allocator_config ma_slot_allocator_config_init(ma_uint32 capacity)
-{
-    ma_slot_allocator_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.capacity = capacity;
-
-    return config;
-}
-
-
-static MA_INLINE ma_uint32 ma_slot_allocator_calculate_group_capacity(ma_uint32 slotCapacity)
-{
-    ma_uint32 cap = slotCapacity / 32;
-    if ((slotCapacity % 32) != 0) {
-        cap += 1;
-    }
-
-    return cap;
-}
-
-static MA_INLINE ma_uint32 ma_slot_allocator_group_capacity(const ma_slot_allocator* pAllocator)
-{
-    return ma_slot_allocator_calculate_group_capacity(pAllocator->capacity);
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t groupsOffset;
-    size_t slotsOffset;
-} ma_slot_allocator_heap_layout;
-
-static ma_result ma_slot_allocator_get_heap_layout(const ma_slot_allocator_config* pConfig, ma_slot_allocator_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->capacity == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Groups. */
-    pHeapLayout->groupsOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(ma_slot_allocator_calculate_group_capacity(pConfig->capacity) * sizeof(ma_slot_allocator_group));
-
-    /* Slots. */
-    pHeapLayout->slotsOffset  = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(pConfig->capacity * sizeof(ma_uint32));
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_slot_allocator_get_heap_size(const ma_slot_allocator_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_slot_allocator_heap_layout layout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_slot_allocator_get_heap_layout(pConfig, &layout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = layout.sizeInBytes;
-
-    return result;
-}
-
-MA_API ma_result ma_slot_allocator_init_preallocated(const ma_slot_allocator_config* pConfig, void* pHeap, ma_slot_allocator* pAllocator)
-{
-    ma_result result;
-    ma_slot_allocator_heap_layout heapLayout;
-
-    if (pAllocator == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pAllocator);
-
-    if (pHeap == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_slot_allocator_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pAllocator->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pAllocator->pGroups  = (ma_slot_allocator_group*)ma_offset_ptr(pHeap, heapLayout.groupsOffset);
-    pAllocator->pSlots   = (ma_uint32*)ma_offset_ptr(pHeap, heapLayout.slotsOffset);
-    pAllocator->capacity = pConfig->capacity;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_slot_allocator_init(const ma_slot_allocator_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_slot_allocator* pAllocator)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_slot_allocator_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to retrieve the size of the heap allocation. */
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_slot_allocator_init_preallocated(pConfig, pHeap, pAllocator);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pAllocator->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_slot_allocator_uninit(ma_slot_allocator* pAllocator, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocator == NULL) {
-        return;
-    }
-
-    if (pAllocator->_ownsHeap) {
-        ma_free(pAllocator->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_slot_allocator_alloc(ma_slot_allocator* pAllocator, ma_uint64* pSlot)
-{
-    ma_uint32 iAttempt;
-    const ma_uint32 maxAttempts = 2;    /* The number of iterations to perform until returning MA_OUT_OF_MEMORY if no slots can be found. */
-
-    if (pAllocator == NULL || pSlot == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (iAttempt = 0; iAttempt < maxAttempts; iAttempt += 1) {
-        /* We need to acquire a suitable bitfield first. This is a bitfield that's got an available slot within it. */
-        ma_uint32 iGroup;
-        for (iGroup = 0; iGroup < ma_slot_allocator_group_capacity(pAllocator); iGroup += 1) {
-            /* CAS */
-            for (;;) {
-                ma_uint32 oldBitfield;
-                ma_uint32 newBitfield;
-                ma_uint32 bitOffset;
-
-                oldBitfield = ma_atomic_load_32(&pAllocator->pGroups[iGroup].bitfield);  /* <-- This copy must happen. The compiler must not optimize this away. */
-
-                /* Fast check to see if anything is available. */
-                if (oldBitfield == 0xFFFFFFFF) {
-                    break;  /* No available bits in this bitfield. */
-                }
-
-                bitOffset = ma_ffs_32(~oldBitfield);
-                MA_ASSERT(bitOffset < 32);
-
-                newBitfield = oldBitfield | (1 << bitOffset);
-
-                if (ma_atomic_compare_and_swap_32(&pAllocator->pGroups[iGroup].bitfield, oldBitfield, newBitfield) == oldBitfield) {
-                    ma_uint32 slotIndex;
-
-                    /* Increment the counter as soon as possible to have other threads report out-of-memory sooner than later. */
-                    ma_atomic_fetch_add_32(&pAllocator->count, 1);
-
-                    /* The slot index is required for constructing the output value. */
-                    slotIndex = (iGroup << 5) + bitOffset;  /* iGroup << 5 = iGroup * 32 */
-                    if (slotIndex >= pAllocator->capacity) {
-                        return MA_OUT_OF_MEMORY;
-                    }
-
-                    /* Increment the reference count before constructing the output value. */
-                    pAllocator->pSlots[slotIndex] += 1;
-
-                    /* Construct the output value. */
-                    *pSlot = (((ma_uint64)pAllocator->pSlots[slotIndex] << 32) | slotIndex);
-
-                    return MA_SUCCESS;
-                }
-            }
-        }
-
-        /* We weren't able to find a slot. If it's because we've reached our capacity we need to return MA_OUT_OF_MEMORY. Otherwise we need to do another iteration and try again. */
-        if (pAllocator->count < pAllocator->capacity) {
-            ma_yield();
-        } else {
-            return MA_OUT_OF_MEMORY;
-        }
-    }
-
-    /* We couldn't find a slot within the maximum number of attempts. */
-    return MA_OUT_OF_MEMORY;
-}
-
-MA_API ma_result ma_slot_allocator_free(ma_slot_allocator* pAllocator, ma_uint64 slot)
-{
-    ma_uint32 iGroup;
-    ma_uint32 iBit;
-
-    if (pAllocator == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    iGroup = (ma_uint32)((slot & 0xFFFFFFFF) >> 5);   /* slot / 32 */
-    iBit   = (ma_uint32)((slot & 0xFFFFFFFF) & 31);   /* slot % 32 */
-
-    if (iGroup >= ma_slot_allocator_group_capacity(pAllocator)) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ASSERT(iBit < 32);   /* This must be true due to the logic we used to actually calculate it. */
-
-    while (ma_atomic_load_32(&pAllocator->count) > 0) {
-        /* CAS */
-        ma_uint32 oldBitfield;
-        ma_uint32 newBitfield;
-
-        oldBitfield = ma_atomic_load_32(&pAllocator->pGroups[iGroup].bitfield);  /* <-- This copy must happen. The compiler must not optimize this away. */
-        newBitfield = oldBitfield & ~(1 << iBit);
-
-        /* Debugging for checking for double-frees. */
-        #if defined(MA_DEBUG_OUTPUT)
-        {
-            if ((oldBitfield & (1 << iBit)) == 0) {
-                MA_ASSERT(MA_FALSE);    /* Double free detected.*/
-            }
-        }
-        #endif
-
-        if (ma_atomic_compare_and_swap_32(&pAllocator->pGroups[iGroup].bitfield, oldBitfield, newBitfield) == oldBitfield) {
-            ma_atomic_fetch_sub_32(&pAllocator->count, 1);
-            return MA_SUCCESS;
-        }
-    }
-
-    /* Getting here means there are no allocations available for freeing. */
-    return MA_INVALID_OPERATION;
-}
-
-
-#define MA_JOB_ID_NONE      ~((ma_uint64)0)
-#define MA_JOB_SLOT_NONE    (ma_uint16)(~0)
-
-static MA_INLINE ma_uint32 ma_job_extract_refcount(ma_uint64 toc)
-{
-    return (ma_uint32)(toc >> 32);
-}
-
-static MA_INLINE ma_uint16 ma_job_extract_slot(ma_uint64 toc)
-{
-    return (ma_uint16)(toc & 0x0000FFFF);
-}
-
-static MA_INLINE ma_uint16 ma_job_extract_code(ma_uint64 toc)
-{
-    return (ma_uint16)((toc & 0xFFFF0000) >> 16);
-}
-
-static MA_INLINE ma_uint64 ma_job_toc_to_allocation(ma_uint64 toc)
-{
-    return ((ma_uint64)ma_job_extract_refcount(toc) << 32) | (ma_uint64)ma_job_extract_slot(toc);
-}
-
-static MA_INLINE ma_uint64 ma_job_set_refcount(ma_uint64 toc, ma_uint32 refcount)
-{
-    /* Clear the reference count first. */
-    toc = toc & ~((ma_uint64)0xFFFFFFFF << 32);
-    toc = toc |  ((ma_uint64)refcount   << 32);
-
-    return toc;
-}
-
-
-MA_API ma_job ma_job_init(ma_uint16 code)
-{
-    ma_job job;
-
-    MA_ZERO_OBJECT(&job);
-    job.toc.breakup.code = code;
-    job.toc.breakup.slot = MA_JOB_SLOT_NONE;    /* Temp value. Will be allocated when posted to a queue. */
-    job.next             = MA_JOB_ID_NONE;
-
-    return job;
-}
-
-
-static ma_result ma_job_process__noop(ma_job* pJob);
-static ma_result ma_job_process__quit(ma_job* pJob);
-static ma_result ma_job_process__custom(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__load_data_buffer_node(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__free_data_buffer_node(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__page_data_buffer_node(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__load_data_buffer(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__free_data_buffer(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__load_data_stream(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__free_data_stream(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__page_data_stream(ma_job* pJob);
-static ma_result ma_job_process__resource_manager__seek_data_stream(ma_job* pJob);
-
-#if !defined(MA_NO_DEVICE_IO)
-static ma_result ma_job_process__device__aaudio_reroute(ma_job* pJob);
-#endif
-
-static ma_job_proc g_jobVTable[MA_JOB_TYPE_COUNT] =
-{
-    /* Miscellaneous. */
-    ma_job_process__quit,                                       /* MA_JOB_TYPE_QUIT */
-    ma_job_process__custom,                                     /* MA_JOB_TYPE_CUSTOM */
-
-    /* Resource Manager. */
-    ma_job_process__resource_manager__load_data_buffer_node,    /* MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER_NODE */
-    ma_job_process__resource_manager__free_data_buffer_node,    /* MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER_NODE */
-    ma_job_process__resource_manager__page_data_buffer_node,    /* MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_BUFFER_NODE */
-    ma_job_process__resource_manager__load_data_buffer,         /* MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER */
-    ma_job_process__resource_manager__free_data_buffer,         /* MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER */
-    ma_job_process__resource_manager__load_data_stream,         /* MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_STREAM */
-    ma_job_process__resource_manager__free_data_stream,         /* MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_STREAM */
-    ma_job_process__resource_manager__page_data_stream,         /* MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_STREAM */
-    ma_job_process__resource_manager__seek_data_stream,         /* MA_JOB_TYPE_RESOURCE_MANAGER_SEEK_DATA_STREAM */
-
-    /* Device. */
-#if !defined(MA_NO_DEVICE_IO)
-    ma_job_process__device__aaudio_reroute                      /*MA_JOB_TYPE_DEVICE_AAUDIO_REROUTE*/
-#endif
-};
-
-MA_API ma_result ma_job_process(ma_job* pJob)
-{
-    if (pJob == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pJob->toc.breakup.code >= MA_JOB_TYPE_COUNT) {
-        return MA_INVALID_OPERATION;
-    }
-
-    return g_jobVTable[pJob->toc.breakup.code](pJob);
-}
-
-static ma_result ma_job_process__noop(ma_job* pJob)
-{
-    MA_ASSERT(pJob != NULL);
-
-    /* No-op. */
-    (void)pJob;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_job_process__quit(ma_job* pJob)
-{
-    return ma_job_process__noop(pJob);
-}
-
-static ma_result ma_job_process__custom(ma_job* pJob)
-{
-    MA_ASSERT(pJob != NULL);
-
-    /* No-op if there's no callback. */
-    if (pJob->data.custom.proc == NULL) {
-        return MA_SUCCESS;
-    }
-
-    return pJob->data.custom.proc(pJob);
-}
-
-
-
-MA_API ma_job_queue_config ma_job_queue_config_init(ma_uint32 flags, ma_uint32 capacity)
-{
-    ma_job_queue_config config;
-
-    config.flags    = flags;
-    config.capacity = capacity;
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t allocatorOffset;
-    size_t jobsOffset;
-} ma_job_queue_heap_layout;
-
-static ma_result ma_job_queue_get_heap_layout(const ma_job_queue_config* pConfig, ma_job_queue_heap_layout* pHeapLayout)
-{
-    ma_result result;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->capacity == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Allocator. */
-    {
-        ma_slot_allocator_config allocatorConfig;
-        size_t allocatorHeapSizeInBytes;
-
-        allocatorConfig = ma_slot_allocator_config_init(pConfig->capacity);
-        result = ma_slot_allocator_get_heap_size(&allocatorConfig, &allocatorHeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->allocatorOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes    += allocatorHeapSizeInBytes;
-    }
-
-    /* Jobs. */
-    pHeapLayout->jobsOffset   = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(pConfig->capacity * sizeof(ma_job));
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_job_queue_get_heap_size(const ma_job_queue_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_job_queue_heap_layout layout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_job_queue_get_heap_layout(pConfig, &layout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = layout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_job_queue_init_preallocated(const ma_job_queue_config* pConfig, void* pHeap, ma_job_queue* pQueue)
-{
-    ma_result result;
-    ma_job_queue_heap_layout heapLayout;
-    ma_slot_allocator_config allocatorConfig;
-
-    if (pQueue == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pQueue);
-
-    result = ma_job_queue_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pQueue->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pQueue->flags    = pConfig->flags;
-    pQueue->capacity = pConfig->capacity;
-    pQueue->pJobs    = (ma_job*)ma_offset_ptr(pHeap, heapLayout.jobsOffset);
-
-    allocatorConfig = ma_slot_allocator_config_init(pConfig->capacity);
-    result = ma_slot_allocator_init_preallocated(&allocatorConfig, ma_offset_ptr(pHeap, heapLayout.allocatorOffset), &pQueue->allocator);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* We need a semaphore if we're running in non-blocking mode. If threading is disabled we need to return an error. */
-    if ((pQueue->flags & MA_JOB_QUEUE_FLAG_NON_BLOCKING) == 0) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_semaphore_init(0, &pQueue->sem);
-        }
-        #else
-        {
-            /* Threading is disabled and we've requested non-blocking mode. */
-            return MA_INVALID_OPERATION;
-        }
-        #endif
-    }
-
-    /*
-    Our queue needs to be initialized with a free standing node. This should always be slot 0. Required for the lock free algorithm. The first job in the queue is
-    just a dummy item for giving us the first item in the list which is stored in the "next" member.
-    */
-    ma_slot_allocator_alloc(&pQueue->allocator, &pQueue->head);  /* Will never fail. */
-    pQueue->pJobs[ma_job_extract_slot(pQueue->head)].next = MA_JOB_ID_NONE;
-    pQueue->tail = pQueue->head;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_job_queue_init(const ma_job_queue_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_job_queue* pQueue)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_job_queue_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_job_queue_init_preallocated(pConfig, pHeap, pQueue);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pQueue->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_job_queue_uninit(ma_job_queue* pQueue, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pQueue == NULL) {
-        return;
-    }
-
-    /* All we need to do is uninitialize the semaphore. */
-    if ((pQueue->flags & MA_JOB_QUEUE_FLAG_NON_BLOCKING) == 0) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_semaphore_uninit(&pQueue->sem);
-        }
-        #else
-        {
-            MA_ASSERT(MA_FALSE);    /* Should never get here. Should have been checked at initialization time. */
-        }
-        #endif
-    }
-
-    ma_slot_allocator_uninit(&pQueue->allocator, pAllocationCallbacks);
-
-    if (pQueue->_ownsHeap) {
-        ma_free(pQueue->_pHeap, pAllocationCallbacks);
-    }
-}
-
-static ma_bool32 ma_job_queue_cas(volatile ma_uint64* dst, ma_uint64 expected, ma_uint64 desired)
-{
-    /* The new counter is taken from the expected value. */
-    return ma_atomic_compare_and_swap_64(dst, expected, ma_job_set_refcount(desired, ma_job_extract_refcount(expected) + 1)) == expected;
-}
-
-MA_API ma_result ma_job_queue_post(ma_job_queue* pQueue, const ma_job* pJob)
-{
-    /*
-    Lock free queue implementation based on the paper by Michael and Scott: Nonblocking Algorithms and Preemption-Safe Locking on Multiprogrammed Shared Memory Multiprocessors
-    */
-    ma_result result;
-    ma_uint64 slot;
-    ma_uint64 tail;
-    ma_uint64 next;
-
-    if (pQueue == NULL || pJob == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* We need a new slot. */
-    result = ma_slot_allocator_alloc(&pQueue->allocator, &slot);
-    if (result != MA_SUCCESS) {
-        return result;  /* Probably ran out of slots. If so, MA_OUT_OF_MEMORY will be returned. */
-    }
-
-    /* At this point we should have a slot to place the job. */
-    MA_ASSERT(ma_job_extract_slot(slot) < pQueue->capacity);
-
-    /* We need to put the job into memory before we do anything. */
-    pQueue->pJobs[ma_job_extract_slot(slot)]                  = *pJob;
-    pQueue->pJobs[ma_job_extract_slot(slot)].toc.allocation   = slot;                    /* This will overwrite the job code. */
-    pQueue->pJobs[ma_job_extract_slot(slot)].toc.breakup.code = pJob->toc.breakup.code;  /* The job code needs to be applied again because the line above overwrote it. */
-    pQueue->pJobs[ma_job_extract_slot(slot)].next             = MA_JOB_ID_NONE;          /* Reset for safety. */
-
-    #ifndef MA_USE_EXPERIMENTAL_LOCK_FREE_JOB_QUEUE
-    ma_spinlock_lock(&pQueue->lock);
-    #endif
-    {
-        /* The job is stored in memory so now we need to add it to our linked list. We only ever add items to the end of the list. */
-        for (;;) {
-            tail = ma_atomic_load_64(&pQueue->tail);
-            next = ma_atomic_load_64(&pQueue->pJobs[ma_job_extract_slot(tail)].next);
-
-            if (ma_job_toc_to_allocation(tail) == ma_job_toc_to_allocation(ma_atomic_load_64(&pQueue->tail))) {
-                if (ma_job_extract_slot(next) == 0xFFFF) {
-                    if (ma_job_queue_cas(&pQueue->pJobs[ma_job_extract_slot(tail)].next, next, slot)) {
-                        break;
-                    }
-                } else {
-                    ma_job_queue_cas(&pQueue->tail, tail, ma_job_extract_slot(next));
-                }
-            }
-        }
-        ma_job_queue_cas(&pQueue->tail, tail, slot);
-    }
-    #ifndef MA_USE_EXPERIMENTAL_LOCK_FREE_JOB_QUEUE
-    ma_spinlock_unlock(&pQueue->lock);
-    #endif
-
-
-    /* Signal the semaphore as the last step if we're using synchronous mode. */
-    if ((pQueue->flags & MA_JOB_QUEUE_FLAG_NON_BLOCKING) == 0) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_semaphore_release(&pQueue->sem);
-        }
-        #else
-        {
-            MA_ASSERT(MA_FALSE);    /* Should never get here. Should have been checked at initialization time. */
-        }
-        #endif
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_job_queue_next(ma_job_queue* pQueue, ma_job* pJob)
-{
-    ma_uint64 head;
-    ma_uint64 tail;
-    ma_uint64 next;
-
-    if (pQueue == NULL || pJob == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If we're running in synchronous mode we'll need to wait on a semaphore. */
-    if ((pQueue->flags & MA_JOB_QUEUE_FLAG_NON_BLOCKING) == 0) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_semaphore_wait(&pQueue->sem);
-        }
-        #else
-        {
-            MA_ASSERT(MA_FALSE);    /* Should never get here. Should have been checked at initialization time. */
-        }
-        #endif
-    }
-
-    #ifndef MA_USE_EXPERIMENTAL_LOCK_FREE_JOB_QUEUE
-    ma_spinlock_lock(&pQueue->lock);
-    #endif
-    {
-        /*
-        BUG: In lock-free mode, multiple threads can be in this section of code. The "head" variable in the loop below
-        is stored. One thread can fall through to the freeing of this item while another is still using "head" for the
-        retrieval of the "next" variable.
-
-        The slot allocator might need to make use of some reference counting to ensure it's only truely freed when
-        there are no more references to the item. This must be fixed before removing these locks.
-        */
-
-        /* Now we need to remove the root item from the list. */
-        for (;;) {
-            head = ma_atomic_load_64(&pQueue->head);
-            tail = ma_atomic_load_64(&pQueue->tail);
-            next = ma_atomic_load_64(&pQueue->pJobs[ma_job_extract_slot(head)].next);
-
-            if (ma_job_toc_to_allocation(head) == ma_job_toc_to_allocation(ma_atomic_load_64(&pQueue->head))) {
-                if (ma_job_extract_slot(head) == ma_job_extract_slot(tail)) {
-                    if (ma_job_extract_slot(next) == 0xFFFF) {
-                        #ifndef MA_USE_EXPERIMENTAL_LOCK_FREE_JOB_QUEUE
-                        ma_spinlock_unlock(&pQueue->lock);
-                        #endif
-                        return MA_NO_DATA_AVAILABLE;
-                    }
-                    ma_job_queue_cas(&pQueue->tail, tail, ma_job_extract_slot(next));
-                } else {
-                    *pJob = pQueue->pJobs[ma_job_extract_slot(next)];
-                    if (ma_job_queue_cas(&pQueue->head, head, ma_job_extract_slot(next))) {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-    #ifndef MA_USE_EXPERIMENTAL_LOCK_FREE_JOB_QUEUE
-    ma_spinlock_unlock(&pQueue->lock);
-    #endif
-
-    ma_slot_allocator_free(&pQueue->allocator, head);
-
-    /*
-    If it's a quit job make sure it's put back on the queue to ensure other threads have an opportunity to detect it and terminate naturally. We
-    could instead just leave it on the queue, but that would involve fiddling with the lock-free code above and I want to keep that as simple as
-    possible.
-    */
-    if (pJob->toc.breakup.code == MA_JOB_TYPE_QUIT) {
-        ma_job_queue_post(pQueue, pJob);
-        return MA_CANCELLED;    /* Return a cancelled status just in case the thread is checking return codes and not properly checking for a quit job. */
-    }
-
-    return MA_SUCCESS;
-}
-
-
-
-/*******************************************************************************
-
-Dynamic Linking
-
-*******************************************************************************/
-#ifdef MA_POSIX
-    /* No need for dlfcn.h if we're not using runtime linking. */
-    #ifndef MA_NO_RUNTIME_LINKING
-        #include <dlfcn.h>
-    #endif
-#endif
-
-MA_API ma_handle ma_dlopen(ma_log* pLog, const char* filename)
-{
-#ifndef MA_NO_RUNTIME_LINKING
-    ma_handle handle;
-
-    ma_log_postf(pLog, MA_LOG_LEVEL_DEBUG, "Loading library: %s\n", filename);
-
-    #ifdef MA_WIN32
-        /* From MSDN: Desktop applications cannot use LoadPackagedLibrary; if a desktop application calls this function it fails with APPMODEL_ERROR_NO_PACKAGE.*/
-        #if !defined(MA_WIN32_UWP) || !(defined(WINAPI_FAMILY) && ((defined(WINAPI_FAMILY_PHONE_APP) && WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP)))
-            handle = (ma_handle)LoadLibraryA(filename);
-        #else
-            /* *sigh* It appears there is no ANSI version of LoadPackagedLibrary()... */
-            WCHAR filenameW[4096];
-            if (MultiByteToWideChar(CP_UTF8, 0, filename, -1, filenameW, sizeof(filenameW)) == 0) {
-                handle = NULL;
-            } else {
-                handle = (ma_handle)LoadPackagedLibrary(filenameW, 0);
-            }
-        #endif
-    #else
-        handle = (ma_handle)dlopen(filename, RTLD_NOW);
-    #endif
-
-    /*
-    I'm not considering failure to load a library an error nor a warning because seamlessly falling through to a lower-priority
-    backend is a deliberate design choice. Instead I'm logging it as an informational message.
-    */
-    if (handle == NULL) {
-        ma_log_postf(pLog, MA_LOG_LEVEL_INFO, "Failed to load library: %s\n", filename);
-    }
-
-    return handle;
-#else
-    /* Runtime linking is disabled. */
-    (void)pLog;
-    (void)filename;
-    return NULL;
-#endif
-}
-
-MA_API void ma_dlclose(ma_log* pLog, ma_handle handle)
-{
-#ifndef MA_NO_RUNTIME_LINKING
-    #ifdef MA_WIN32
-        FreeLibrary((HMODULE)handle);
-    #else
-        dlclose((void*)handle);
-    #endif
-
-    (void)pLog;
-#else
-    /* Runtime linking is disabled. */
-    (void)pLog;
-    (void)handle;
-#endif
-}
-
-MA_API ma_proc ma_dlsym(ma_log* pLog, ma_handle handle, const char* symbol)
-{
-#ifndef MA_NO_RUNTIME_LINKING
-    ma_proc proc;
-
-    ma_log_postf(pLog, MA_LOG_LEVEL_DEBUG, "Loading symbol: %s\n", symbol);
-
-#ifdef _WIN32
-    proc = (ma_proc)GetProcAddress((HMODULE)handle, symbol);
-#else
-#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-    proc = (ma_proc)dlsym((void*)handle, symbol);
-#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
-    #pragma GCC diagnostic pop
-#endif
-#endif
-
-    if (proc == NULL) {
-        ma_log_postf(pLog, MA_LOG_LEVEL_WARNING, "Failed to load symbol: %s\n", symbol);
-    }
-
-    (void)pLog; /* It's possible for pContext to be unused. */
-    return proc;
-#else
-    /* Runtime linking is disabled. */
-    (void)pLog;
-    (void)handle;
-    (void)symbol;
-    return NULL;
-#endif
-}
-
-
-
-/************************************************************************************************************************************************************
-*************************************************************************************************************************************************************
-
-DEVICE I/O
-==========
-
-*************************************************************************************************************************************************************
-************************************************************************************************************************************************************/
-
-/* Disable run-time linking on certain backends and platforms. */
-#ifndef MA_NO_RUNTIME_LINKING
-    #if defined(MA_EMSCRIPTEN) || defined(MA_ORBIS) || defined(MA_PROSPERO)
-        #define MA_NO_RUNTIME_LINKING
-    #endif
-#endif
-
-#ifndef MA_NO_DEVICE_IO
-
-#if defined(MA_APPLE) && (__MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
-    #include <mach/mach_time.h> /* For mach_absolute_time() */
-#endif
-
-#ifdef MA_POSIX
-    #include <sys/types.h>
-    #include <unistd.h>
-
-    /* No need for dlfcn.h if we're not using runtime linking. */
-    #ifndef MA_NO_RUNTIME_LINKING
-        #include <dlfcn.h>
-    #endif
-#endif
-
-
-
-MA_API void ma_device_info_add_native_data_format(ma_device_info* pDeviceInfo, ma_format format, ma_uint32 channels, ma_uint32 sampleRate, ma_uint32 flags)
-{
-    if (pDeviceInfo == NULL) {
-        return;
-    }
-
-    if (pDeviceInfo->nativeDataFormatCount < ma_countof(pDeviceInfo->nativeDataFormats)) {
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].format     = format;
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].channels   = channels;
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].sampleRate = sampleRate;
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].flags      = flags;
-        pDeviceInfo->nativeDataFormatCount += 1;
-    }
-}
-
-
-typedef struct
-{
-    ma_backend backend;
-    const char* pName;
-} ma_backend_info;
-
-static ma_backend_info gBackendInfo[] = /* Indexed by the backend enum. Must be in the order backends are declared in the ma_backend enum. */
-{
-    {ma_backend_wasapi,     "WASAPI"},
-    {ma_backend_dsound,     "DirectSound"},
-    {ma_backend_winmm,      "WinMM"},
-    {ma_backend_coreaudio,  "Core Audio"},
-    {ma_backend_sndio,      "sndio"},
-    {ma_backend_audio4,     "audio(4)"},
-    {ma_backend_oss,        "OSS"},
-    {ma_backend_pulseaudio, "PulseAudio"},
-    {ma_backend_alsa,       "ALSA"},
-    {ma_backend_jack,       "JACK"},
-    {ma_backend_aaudio,     "AAudio"},
-    {ma_backend_opensl,     "OpenSL|ES"},
-    {ma_backend_webaudio,   "Web Audio"},
-    {ma_backend_custom,     "Custom"},
-    {ma_backend_null,       "Null"}
-};
-
-MA_API const char* ma_get_backend_name(ma_backend backend)
-{
-    if (backend < 0 || backend >= (int)ma_countof(gBackendInfo)) {
-        return "Unknown";
-    }
-
-    return gBackendInfo[backend].pName;
-}
-
-MA_API ma_result ma_get_backend_from_name(const char* pBackendName, ma_backend* pBackend)
-{
-    size_t iBackend;
-
-    if (pBackendName == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (iBackend = 0; iBackend < ma_countof(gBackendInfo); iBackend += 1) {
-        if (ma_strcmp(pBackendName, gBackendInfo[iBackend].pName) == 0) {
-            if (pBackend != NULL) {
-                *pBackend = gBackendInfo[iBackend].backend;
-            }
-
-            return MA_SUCCESS;
-        }
-    }
-
-    /* Getting here means the backend name is unknown. */
-    return MA_INVALID_ARGS;
-}
-
-MA_API ma_bool32 ma_is_backend_enabled(ma_backend backend)
-{
-    /*
-    This looks a little bit gross, but we want all backends to be included in the switch to avoid warnings on some compilers
-    about some enums not being handled by the switch statement.
-    */
-    switch (backend)
-    {
-        case ma_backend_wasapi:
-        #if defined(MA_HAS_WASAPI)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_dsound:
-        #if defined(MA_HAS_DSOUND)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_winmm:
-        #if defined(MA_HAS_WINMM)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_coreaudio:
-        #if defined(MA_HAS_COREAUDIO)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_sndio:
-        #if defined(MA_HAS_SNDIO)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_audio4:
-        #if defined(MA_HAS_AUDIO4)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_oss:
-        #if defined(MA_HAS_OSS)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_pulseaudio:
-        #if defined(MA_HAS_PULSEAUDIO)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_alsa:
-        #if defined(MA_HAS_ALSA)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_jack:
-        #if defined(MA_HAS_JACK)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_aaudio:
-        #if defined(MA_HAS_AAUDIO)
-            #if defined(MA_ANDROID)
-            {
-                return ma_android_sdk_version() >= 26;
-            }
-            #else
-                return MA_FALSE;
-            #endif
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_opensl:
-        #if defined(MA_HAS_OPENSL)
-            #if defined(MA_ANDROID)
-            {
-                return ma_android_sdk_version() >= 9;
-            }
-            #else
-                return MA_TRUE;
-            #endif
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_webaudio:
-        #if defined(MA_HAS_WEBAUDIO)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_custom:
-        #if defined(MA_HAS_CUSTOM)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-        case ma_backend_null:
-        #if defined(MA_HAS_NULL)
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-
-        default: return MA_FALSE;
-    }
-}
-
-MA_API ma_result ma_get_enabled_backends(ma_backend* pBackends, size_t backendCap, size_t* pBackendCount)
-{
-    size_t backendCount;
-    size_t iBackend;
-    ma_result result = MA_SUCCESS;
-
-    if (pBackendCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    backendCount = 0;
-
-    for (iBackend = 0; iBackend <= ma_backend_null; iBackend += 1) {
-        ma_backend backend = (ma_backend)iBackend;
-
-        if (ma_is_backend_enabled(backend)) {
-            /* The backend is enabled. Try adding it to the list. If there's no room, MA_NO_SPACE needs to be returned. */
-            if (backendCount == backendCap) {
-                result = MA_NO_SPACE;
-                break;
-            } else {
-                pBackends[backendCount] = backend;
-                backendCount += 1;
-            }
-        }
-    }
-
-    if (pBackendCount != NULL) {
-        *pBackendCount = backendCount;
-    }
-
-    return result;
-}
-
-MA_API ma_bool32 ma_is_loopback_supported(ma_backend backend)
-{
-    switch (backend)
-    {
-        case ma_backend_wasapi:     return MA_TRUE;
-        case ma_backend_dsound:     return MA_FALSE;
-        case ma_backend_winmm:      return MA_FALSE;
-        case ma_backend_coreaudio:  return MA_FALSE;
-        case ma_backend_sndio:      return MA_FALSE;
-        case ma_backend_audio4:     return MA_FALSE;
-        case ma_backend_oss:        return MA_FALSE;
-        case ma_backend_pulseaudio: return MA_FALSE;
-        case ma_backend_alsa:       return MA_FALSE;
-        case ma_backend_jack:       return MA_FALSE;
-        case ma_backend_aaudio:     return MA_FALSE;
-        case ma_backend_opensl:     return MA_FALSE;
-        case ma_backend_webaudio:   return MA_FALSE;
-        case ma_backend_custom:     return MA_FALSE;    /* <-- Will depend on the implementation of the backend. */
-        case ma_backend_null:       return MA_FALSE;
-        default:                    return MA_FALSE;
-    }
-}
-
-
-
-#if defined(MA_WIN32)
-/* WASAPI error codes. */
-#define MA_AUDCLNT_E_NOT_INITIALIZED              ((HRESULT)0x88890001)
-#define MA_AUDCLNT_E_ALREADY_INITIALIZED          ((HRESULT)0x88890002)
-#define MA_AUDCLNT_E_WRONG_ENDPOINT_TYPE          ((HRESULT)0x88890003)
-#define MA_AUDCLNT_E_DEVICE_INVALIDATED           ((HRESULT)0x88890004)
-#define MA_AUDCLNT_E_NOT_STOPPED                  ((HRESULT)0x88890005)
-#define MA_AUDCLNT_E_BUFFER_TOO_LARGE             ((HRESULT)0x88890006)
-#define MA_AUDCLNT_E_OUT_OF_ORDER                 ((HRESULT)0x88890007)
-#define MA_AUDCLNT_E_UNSUPPORTED_FORMAT           ((HRESULT)0x88890008)
-#define MA_AUDCLNT_E_INVALID_SIZE                 ((HRESULT)0x88890009)
-#define MA_AUDCLNT_E_DEVICE_IN_USE                ((HRESULT)0x8889000A)
-#define MA_AUDCLNT_E_BUFFER_OPERATION_PENDING     ((HRESULT)0x8889000B)
-#define MA_AUDCLNT_E_THREAD_NOT_REGISTERED        ((HRESULT)0x8889000C)
-#define MA_AUDCLNT_E_NO_SINGLE_PROCESS            ((HRESULT)0x8889000D)
-#define MA_AUDCLNT_E_EXCLUSIVE_MODE_NOT_ALLOWED   ((HRESULT)0x8889000E)
-#define MA_AUDCLNT_E_ENDPOINT_CREATE_FAILED       ((HRESULT)0x8889000F)
-#define MA_AUDCLNT_E_SERVICE_NOT_RUNNING          ((HRESULT)0x88890010)
-#define MA_AUDCLNT_E_EVENTHANDLE_NOT_EXPECTED     ((HRESULT)0x88890011)
-#define MA_AUDCLNT_E_EXCLUSIVE_MODE_ONLY          ((HRESULT)0x88890012)
-#define MA_AUDCLNT_E_BUFDURATION_PERIOD_NOT_EQUAL ((HRESULT)0x88890013)
-#define MA_AUDCLNT_E_EVENTHANDLE_NOT_SET          ((HRESULT)0x88890014)
-#define MA_AUDCLNT_E_INCORRECT_BUFFER_SIZE        ((HRESULT)0x88890015)
-#define MA_AUDCLNT_E_BUFFER_SIZE_ERROR            ((HRESULT)0x88890016)
-#define MA_AUDCLNT_E_CPUUSAGE_EXCEEDED            ((HRESULT)0x88890017)
-#define MA_AUDCLNT_E_BUFFER_ERROR                 ((HRESULT)0x88890018)
-#define MA_AUDCLNT_E_BUFFER_SIZE_NOT_ALIGNED      ((HRESULT)0x88890019)
-#define MA_AUDCLNT_E_INVALID_DEVICE_PERIOD        ((HRESULT)0x88890020)
-#define MA_AUDCLNT_E_INVALID_STREAM_FLAG          ((HRESULT)0x88890021)
-#define MA_AUDCLNT_E_ENDPOINT_OFFLOAD_NOT_CAPABLE ((HRESULT)0x88890022)
-#define MA_AUDCLNT_E_OUT_OF_OFFLOAD_RESOURCES     ((HRESULT)0x88890023)
-#define MA_AUDCLNT_E_OFFLOAD_MODE_ONLY            ((HRESULT)0x88890024)
-#define MA_AUDCLNT_E_NONOFFLOAD_MODE_ONLY         ((HRESULT)0x88890025)
-#define MA_AUDCLNT_E_RESOURCES_INVALIDATED        ((HRESULT)0x88890026)
-#define MA_AUDCLNT_E_RAW_MODE_UNSUPPORTED         ((HRESULT)0x88890027)
-#define MA_AUDCLNT_E_ENGINE_PERIODICITY_LOCKED    ((HRESULT)0x88890028)
-#define MA_AUDCLNT_E_ENGINE_FORMAT_LOCKED         ((HRESULT)0x88890029)
-#define MA_AUDCLNT_E_HEADTRACKING_ENABLED         ((HRESULT)0x88890030)
-#define MA_AUDCLNT_E_HEADTRACKING_UNSUPPORTED     ((HRESULT)0x88890040)
-#define MA_AUDCLNT_S_BUFFER_EMPTY                 ((HRESULT)0x08890001)
-#define MA_AUDCLNT_S_THREAD_ALREADY_REGISTERED    ((HRESULT)0x08890002)
-#define MA_AUDCLNT_S_POSITION_STALLED             ((HRESULT)0x08890003)
-
-#define MA_DS_OK                                  ((HRESULT)0)
-#define MA_DS_NO_VIRTUALIZATION                   ((HRESULT)0x0878000A)
-#define MA_DSERR_ALLOCATED                        ((HRESULT)0x8878000A)
-#define MA_DSERR_CONTROLUNAVAIL                   ((HRESULT)0x8878001E)
-#define MA_DSERR_INVALIDPARAM                     ((HRESULT)0x80070057) /*E_INVALIDARG*/
-#define MA_DSERR_INVALIDCALL                      ((HRESULT)0x88780032)
-#define MA_DSERR_GENERIC                          ((HRESULT)0x80004005) /*E_FAIL*/
-#define MA_DSERR_PRIOLEVELNEEDED                  ((HRESULT)0x88780046)
-#define MA_DSERR_OUTOFMEMORY                      ((HRESULT)0x8007000E) /*E_OUTOFMEMORY*/
-#define MA_DSERR_BADFORMAT                        ((HRESULT)0x88780064)
-#define MA_DSERR_UNSUPPORTED                      ((HRESULT)0x80004001) /*E_NOTIMPL*/
-#define MA_DSERR_NODRIVER                         ((HRESULT)0x88780078)
-#define MA_DSERR_ALREADYINITIALIZED               ((HRESULT)0x88780082)
-#define MA_DSERR_NOAGGREGATION                    ((HRESULT)0x80040110) /*CLASS_E_NOAGGREGATION*/
-#define MA_DSERR_BUFFERLOST                       ((HRESULT)0x88780096)
-#define MA_DSERR_OTHERAPPHASPRIO                  ((HRESULT)0x887800A0)
-#define MA_DSERR_UNINITIALIZED                    ((HRESULT)0x887800AA)
-#define MA_DSERR_NOINTERFACE                      ((HRESULT)0x80004002) /*E_NOINTERFACE*/
-#define MA_DSERR_ACCESSDENIED                     ((HRESULT)0x80070005) /*E_ACCESSDENIED*/
-#define MA_DSERR_BUFFERTOOSMALL                   ((HRESULT)0x887800B4)
-#define MA_DSERR_DS8_REQUIRED                     ((HRESULT)0x887800BE)
-#define MA_DSERR_SENDLOOP                         ((HRESULT)0x887800C8)
-#define MA_DSERR_BADSENDBUFFERGUID                ((HRESULT)0x887800D2)
-#define MA_DSERR_OBJECTNOTFOUND                   ((HRESULT)0x88781161)
-#define MA_DSERR_FXUNAVAILABLE                    ((HRESULT)0x887800DC)
-
-static ma_result ma_result_from_HRESULT(HRESULT hr)
-{
-    switch (hr)
-    {
-        case NOERROR:                                   return MA_SUCCESS;
-        /*case S_OK:                                      return MA_SUCCESS;*/
-
-        case E_POINTER:                                 return MA_INVALID_ARGS;
-        case E_UNEXPECTED:                              return MA_ERROR;
-        case E_NOTIMPL:                                 return MA_NOT_IMPLEMENTED;
-        case E_OUTOFMEMORY:                             return MA_OUT_OF_MEMORY;
-        case E_INVALIDARG:                              return MA_INVALID_ARGS;
-        case E_NOINTERFACE:                             return MA_API_NOT_FOUND;
-        case E_HANDLE:                                  return MA_INVALID_ARGS;
-        case E_ABORT:                                   return MA_ERROR;
-        case E_FAIL:                                    return MA_ERROR;
-        case E_ACCESSDENIED:                            return MA_ACCESS_DENIED;
-
-        /* WASAPI */
-        case MA_AUDCLNT_E_NOT_INITIALIZED:              return MA_DEVICE_NOT_INITIALIZED;
-        case MA_AUDCLNT_E_ALREADY_INITIALIZED:          return MA_DEVICE_ALREADY_INITIALIZED;
-        case MA_AUDCLNT_E_WRONG_ENDPOINT_TYPE:          return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_DEVICE_INVALIDATED:           return MA_UNAVAILABLE;
-        case MA_AUDCLNT_E_NOT_STOPPED:                  return MA_DEVICE_NOT_STOPPED;
-        case MA_AUDCLNT_E_BUFFER_TOO_LARGE:             return MA_TOO_BIG;
-        case MA_AUDCLNT_E_OUT_OF_ORDER:                 return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_UNSUPPORTED_FORMAT:           return MA_FORMAT_NOT_SUPPORTED;
-        case MA_AUDCLNT_E_INVALID_SIZE:                 return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_DEVICE_IN_USE:                return MA_BUSY;
-        case MA_AUDCLNT_E_BUFFER_OPERATION_PENDING:     return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_THREAD_NOT_REGISTERED:        return MA_DOES_NOT_EXIST;
-        case MA_AUDCLNT_E_NO_SINGLE_PROCESS:            return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_EXCLUSIVE_MODE_NOT_ALLOWED:   return MA_SHARE_MODE_NOT_SUPPORTED;
-        case MA_AUDCLNT_E_ENDPOINT_CREATE_FAILED:       return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-        case MA_AUDCLNT_E_SERVICE_NOT_RUNNING:          return MA_NOT_CONNECTED;
-        case MA_AUDCLNT_E_EVENTHANDLE_NOT_EXPECTED:     return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_EXCLUSIVE_MODE_ONLY:          return MA_SHARE_MODE_NOT_SUPPORTED;
-        case MA_AUDCLNT_E_BUFDURATION_PERIOD_NOT_EQUAL: return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_EVENTHANDLE_NOT_SET:          return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_INCORRECT_BUFFER_SIZE:        return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_BUFFER_SIZE_ERROR:            return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_CPUUSAGE_EXCEEDED:            return MA_ERROR;
-        case MA_AUDCLNT_E_BUFFER_ERROR:                 return MA_ERROR;
-        case MA_AUDCLNT_E_BUFFER_SIZE_NOT_ALIGNED:      return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_INVALID_DEVICE_PERIOD:        return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_INVALID_STREAM_FLAG:          return MA_INVALID_ARGS;
-        case MA_AUDCLNT_E_ENDPOINT_OFFLOAD_NOT_CAPABLE: return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_OUT_OF_OFFLOAD_RESOURCES:     return MA_OUT_OF_MEMORY;
-        case MA_AUDCLNT_E_OFFLOAD_MODE_ONLY:            return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_NONOFFLOAD_MODE_ONLY:         return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_RESOURCES_INVALIDATED:        return MA_INVALID_DATA;
-        case MA_AUDCLNT_E_RAW_MODE_UNSUPPORTED:         return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_ENGINE_PERIODICITY_LOCKED:    return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_ENGINE_FORMAT_LOCKED:         return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_HEADTRACKING_ENABLED:         return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_E_HEADTRACKING_UNSUPPORTED:     return MA_INVALID_OPERATION;
-        case MA_AUDCLNT_S_BUFFER_EMPTY:                 return MA_NO_SPACE;
-        case MA_AUDCLNT_S_THREAD_ALREADY_REGISTERED:    return MA_ALREADY_EXISTS;
-        case MA_AUDCLNT_S_POSITION_STALLED:             return MA_ERROR;
-
-        /* DirectSound */
-        /*case MA_DS_OK:                                  return MA_SUCCESS;*/          /* S_OK */
-        case MA_DS_NO_VIRTUALIZATION:                   return MA_SUCCESS;
-        case MA_DSERR_ALLOCATED:                        return MA_ALREADY_IN_USE;
-        case MA_DSERR_CONTROLUNAVAIL:                   return MA_INVALID_OPERATION;
-        /*case MA_DSERR_INVALIDPARAM:                    return MA_INVALID_ARGS;*/      /* E_INVALIDARG */
-        case MA_DSERR_INVALIDCALL:                      return MA_INVALID_OPERATION;
-        /*case MA_DSERR_GENERIC:                          return MA_ERROR;*/            /* E_FAIL */
-        case MA_DSERR_PRIOLEVELNEEDED:                  return MA_INVALID_OPERATION;
-        /*case MA_DSERR_OUTOFMEMORY:                      return MA_OUT_OF_MEMORY;*/    /* E_OUTOFMEMORY */
-        case MA_DSERR_BADFORMAT:                        return MA_FORMAT_NOT_SUPPORTED;
-        /*case MA_DSERR_UNSUPPORTED:                      return MA_NOT_IMPLEMENTED;*/  /* E_NOTIMPL */
-        case MA_DSERR_NODRIVER:                         return MA_FAILED_TO_INIT_BACKEND;
-        case MA_DSERR_ALREADYINITIALIZED:               return MA_DEVICE_ALREADY_INITIALIZED;
-        case MA_DSERR_NOAGGREGATION:                    return MA_ERROR;
-        case MA_DSERR_BUFFERLOST:                       return MA_UNAVAILABLE;
-        case MA_DSERR_OTHERAPPHASPRIO:                  return MA_ACCESS_DENIED;
-        case MA_DSERR_UNINITIALIZED:                    return MA_DEVICE_NOT_INITIALIZED;
-        /*case MA_DSERR_NOINTERFACE:                      return MA_API_NOT_FOUND;*/    /* E_NOINTERFACE */
-        /*case MA_DSERR_ACCESSDENIED:                     return MA_ACCESS_DENIED;*/    /* E_ACCESSDENIED */
-        case MA_DSERR_BUFFERTOOSMALL:                   return MA_NO_SPACE;
-        case MA_DSERR_DS8_REQUIRED:                     return MA_INVALID_OPERATION;
-        case MA_DSERR_SENDLOOP:                         return MA_DEADLOCK;
-        case MA_DSERR_BADSENDBUFFERGUID:                return MA_INVALID_ARGS;
-        case MA_DSERR_OBJECTNOTFOUND:                   return MA_NO_DEVICE;
-        case MA_DSERR_FXUNAVAILABLE:                    return MA_UNAVAILABLE;
-
-        default:                                        return MA_ERROR;
-    }
-}
-
-/* PROPVARIANT */
-#define MA_VT_LPWSTR    31
-#define MA_VT_BLOB      65
-
-#if defined(_MSC_VER) && !defined(__clang__)
-    #pragma warning(push)
-    #pragma warning(disable:4201)   /* nonstandard extension used: nameless struct/union */
-#elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wpedantic" /* For ISO C99 doesn't support unnamed structs/unions [-Wpedantic] */
-    #if defined(__clang__)
-        #pragma GCC diagnostic ignored "-Wc11-extensions"   /* anonymous unions are a C11 extension */
-    #endif
-#endif
-typedef struct
-{
-    WORD vt;
-    WORD wReserved1;
-    WORD wReserved2;
-    WORD wReserved3;
-    union
-    {
-        struct
-        {
-            ULONG cbSize;
-            BYTE* pBlobData;
-        } blob;
-        WCHAR* pwszVal;
-        char pad[16];   /* Just to ensure the size of the struct matches the official version. */
-    };
-} MA_PROPVARIANT;
-#if defined(_MSC_VER) && !defined(__clang__)
-    #pragma warning(pop)
-#elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-    #pragma GCC diagnostic pop
-#endif
-
-typedef HRESULT (WINAPI * MA_PFN_CoInitialize)(void* pvReserved);
-typedef HRESULT (WINAPI * MA_PFN_CoInitializeEx)(void* pvReserved, DWORD  dwCoInit);
-typedef void    (WINAPI * MA_PFN_CoUninitialize)(void);
-typedef HRESULT (WINAPI * MA_PFN_CoCreateInstance)(const IID* rclsid, void* pUnkOuter, DWORD dwClsContext, const IID* riid, void* ppv);
-typedef void    (WINAPI * MA_PFN_CoTaskMemFree)(void* pv);
-typedef HRESULT (WINAPI * MA_PFN_PropVariantClear)(MA_PROPVARIANT *pvar);
-typedef int     (WINAPI * MA_PFN_StringFromGUID2)(const GUID* const rguid, WCHAR* lpsz, int cchMax);
-
-typedef HWND    (WINAPI * MA_PFN_GetForegroundWindow)(void);
-typedef HWND    (WINAPI * MA_PFN_GetDesktopWindow)(void);
-
-#if defined(MA_WIN32_DESKTOP)
-/* Microsoft documents these APIs as returning LSTATUS, but the Win32 API shipping with some compilers do not define it. It's just a LONG. */
-typedef LONG    (WINAPI * MA_PFN_RegOpenKeyExA)(HKEY hKey, const char* lpSubKey, DWORD ulOptions, DWORD samDesired, HKEY* phkResult);
-typedef LONG    (WINAPI * MA_PFN_RegCloseKey)(HKEY hKey);
-typedef LONG    (WINAPI * MA_PFN_RegQueryValueExA)(HKEY hKey, const char* lpValueName, DWORD* lpReserved, DWORD* lpType, BYTE* lpData, DWORD* lpcbData);
-#endif  /* MA_WIN32_DESKTOP */
-
-
-MA_API size_t ma_strlen_WCHAR(const WCHAR* str)
-{
-    size_t len = 0;
-    while (str[len] != '\0') {
-        len += 1;
-    }
-
-    return len;
-}
-
-MA_API int ma_strcmp_WCHAR(const WCHAR *s1, const WCHAR *s2)
-{
-    while (*s1 != '\0' && *s1 == *s2) {
-        s1 += 1;
-        s2 += 1;
-    }
-
-    return *s1 - *s2;
-}
-
-MA_API int ma_strcpy_s_WCHAR(WCHAR* dst, size_t dstCap, const WCHAR* src)
-{
-    size_t i;
-
-    if (dst == 0) {
-        return 22;
-    }
-    if (dstCap == 0) {
-        return 34;
-    }
-    if (src == 0) {
-        dst[0] = '\0';
-        return 22;
-    }
-
-    for (i = 0; i < dstCap && src[i] != '\0'; ++i) {
-        dst[i] = src[i];
-    }
-
-    if (i < dstCap) {
-        dst[i] = '\0';
-        return 0;
-    }
-
-    dst[0] = '\0';
-    return 34;
-}
-#endif  /* MA_WIN32 */
-
-
-#define MA_DEFAULT_PLAYBACK_DEVICE_NAME    "Default Playback Device"
-#define MA_DEFAULT_CAPTURE_DEVICE_NAME     "Default Capture Device"
-
-
-
-
-/*******************************************************************************
-
-Timing
-
-*******************************************************************************/
-#if defined(MA_WIN32) && !defined(MA_POSIX)
-    static LARGE_INTEGER g_ma_TimerFrequency;   /* <-- Initialized to zero since it's static. */
-    static void ma_timer_init(ma_timer* pTimer)
-    {
-        LARGE_INTEGER counter;
-
-        if (g_ma_TimerFrequency.QuadPart == 0) {
-            QueryPerformanceFrequency(&g_ma_TimerFrequency);
-        }
-
-        QueryPerformanceCounter(&counter);
-        pTimer->counter = counter.QuadPart;
-    }
-
-    static double ma_timer_get_time_in_seconds(ma_timer* pTimer)
-    {
-        LARGE_INTEGER counter;
-        if (!QueryPerformanceCounter(&counter)) {
-            return 0;
-        }
-
-        return (double)(counter.QuadPart - pTimer->counter) / g_ma_TimerFrequency.QuadPart;
-    }
-#elif defined(MA_APPLE) && (__MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
-    static ma_uint64 g_ma_TimerFrequency = 0;
-    static void ma_timer_init(ma_timer* pTimer)
-    {
-        mach_timebase_info_data_t baseTime;
-        mach_timebase_info(&baseTime);
-        g_ma_TimerFrequency = (baseTime.denom * 1e9) / baseTime.numer;
-
-        pTimer->counter = mach_absolute_time();
-    }
-
-    static double ma_timer_get_time_in_seconds(ma_timer* pTimer)
-    {
-        ma_uint64 newTimeCounter = mach_absolute_time();
-        ma_uint64 oldTimeCounter = pTimer->counter;
-
-        return (newTimeCounter - oldTimeCounter) / g_ma_TimerFrequency;
-    }
-#elif defined(MA_EMSCRIPTEN)
-    static MA_INLINE void ma_timer_init(ma_timer* pTimer)
-    {
-        pTimer->counterD = emscripten_get_now();
-    }
-
-    static MA_INLINE double ma_timer_get_time_in_seconds(ma_timer* pTimer)
-    {
-        return (emscripten_get_now() - pTimer->counterD) / 1000;    /* Emscripten is in milliseconds. */
-    }
-#else
-    #if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309L
-        #if defined(CLOCK_MONOTONIC)
-            #define MA_CLOCK_ID CLOCK_MONOTONIC
-        #else
-            #define MA_CLOCK_ID CLOCK_REALTIME
-        #endif
-
-        static void ma_timer_init(ma_timer* pTimer)
-        {
-            struct timespec newTime;
-            clock_gettime(MA_CLOCK_ID, &newTime);
-
-            pTimer->counter = (newTime.tv_sec * 1000000000) + newTime.tv_nsec;
-        }
-
-        static double ma_timer_get_time_in_seconds(ma_timer* pTimer)
-        {
-            ma_uint64 newTimeCounter;
-            ma_uint64 oldTimeCounter;
-
-            struct timespec newTime;
-            clock_gettime(MA_CLOCK_ID, &newTime);
-
-            newTimeCounter = (newTime.tv_sec * 1000000000) + newTime.tv_nsec;
-            oldTimeCounter = pTimer->counter;
-
-            return (newTimeCounter - oldTimeCounter) / 1000000000.0;
-        }
-    #else
-        static void ma_timer_init(ma_timer* pTimer)
-        {
-            struct timeval newTime;
-            gettimeofday(&newTime, NULL);
-
-            pTimer->counter = (newTime.tv_sec * 1000000) + newTime.tv_usec;
-        }
-
-        static double ma_timer_get_time_in_seconds(ma_timer* pTimer)
-        {
-            ma_uint64 newTimeCounter;
-            ma_uint64 oldTimeCounter;
-
-            struct timeval newTime;
-            gettimeofday(&newTime, NULL);
-
-            newTimeCounter = (newTime.tv_sec * 1000000) + newTime.tv_usec;
-            oldTimeCounter = pTimer->counter;
-
-            return (newTimeCounter - oldTimeCounter) / 1000000.0;
-        }
-    #endif
-#endif
-
-
-
-#if 0
-static ma_uint32 ma_get_closest_standard_sample_rate(ma_uint32 sampleRateIn)
-{
-    ma_uint32 closestRate = 0;
-    ma_uint32 closestDiff = 0xFFFFFFFF;
-    size_t iStandardRate;
-
-    for (iStandardRate = 0; iStandardRate < ma_countof(g_maStandardSampleRatePriorities); ++iStandardRate) {
-        ma_uint32 standardRate = g_maStandardSampleRatePriorities[iStandardRate];
-        ma_uint32 diff;
-
-        if (sampleRateIn > standardRate) {
-            diff = sampleRateIn - standardRate;
-        } else {
-            diff = standardRate - sampleRateIn;
-        }
-
-        if (diff == 0) {
-            return standardRate;    /* The input sample rate is a standard rate. */
-        }
-
-        if (closestDiff > diff) {
-            closestDiff = diff;
-            closestRate = standardRate;
-        }
-    }
-
-    return closestRate;
-}
-#endif
-
-
-static MA_INLINE unsigned int ma_device_disable_denormals(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (!pDevice->noDisableDenormals) {
-        return ma_disable_denormals();
-    } else {
-        return 0;
-    }
-}
-
-static MA_INLINE void ma_device_restore_denormals(ma_device* pDevice, unsigned int prevState)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (!pDevice->noDisableDenormals) {
-        ma_restore_denormals(prevState);
-    } else {
-        /* Do nothing. */
-        (void)prevState;
-    }
-}
-
-static ma_device_notification ma_device_notification_init(ma_device* pDevice, ma_device_notification_type type)
-{
-    ma_device_notification notification;
-
-    MA_ZERO_OBJECT(&notification);
-    notification.pDevice = pDevice;
-    notification.type    = type;
-
-    return notification;
-}
-
-static void ma_device__on_notification(ma_device_notification notification)
-{
-    MA_ASSERT(notification.pDevice != NULL);
-
-    if (notification.pDevice->onNotification != NULL) {
-        notification.pDevice->onNotification(&notification);
-    }
-
-    /* TEMP FOR COMPATIBILITY: If it's a stopped notification, fire the onStop callback as well. This is only for backwards compatibility and will be removed. */
-    if (notification.pDevice->onStop != NULL && notification.type == ma_device_notification_type_stopped) {
-        notification.pDevice->onStop(notification.pDevice);
-    }
-}
-
-static void ma_device__on_notification_started(ma_device* pDevice)
-{
-    ma_device__on_notification(ma_device_notification_init(pDevice, ma_device_notification_type_started));
-}
-
-static void ma_device__on_notification_stopped(ma_device* pDevice)
-{
-    ma_device__on_notification(ma_device_notification_init(pDevice, ma_device_notification_type_stopped));
-}
-
-/* Not all platforms support reroute notifications. */
-#if !defined(MA_EMSCRIPTEN)
-static void ma_device__on_notification_rerouted(ma_device* pDevice)
-{
-    ma_device__on_notification(ma_device_notification_init(pDevice, ma_device_notification_type_rerouted));
-}
-#endif
-
-#if defined(MA_EMSCRIPTEN)
-EMSCRIPTEN_KEEPALIVE
-void ma_device__on_notification_unlocked(ma_device* pDevice)
-{
-    ma_device__on_notification(ma_device_notification_init(pDevice, ma_device_notification_type_unlocked));
-}
-#endif
-
-
-static void ma_device__on_data_inner(ma_device* pDevice, void* pFramesOut, const void* pFramesIn, ma_uint32 frameCount)
-{
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pDevice->onData != NULL);
-
-    if (!pDevice->noPreSilencedOutputBuffer && pFramesOut != NULL) {
-        ma_silence_pcm_frames(pFramesOut, frameCount, pDevice->playback.format, pDevice->playback.channels);
-    }
-
-    pDevice->onData(pDevice, pFramesOut, pFramesIn, frameCount);
-}
-
-static void ma_device__on_data(ma_device* pDevice, void* pFramesOut, const void* pFramesIn, ma_uint32 frameCount)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    /* Don't read more data from the client if we're in the process of stopping. */
-    if (ma_device_get_state(pDevice) == ma_device_state_stopping) {
-        return;
-    }
-
-    if (pDevice->noFixedSizedCallback) {
-        /* Fast path. Not using a fixed sized callback. Process directly from the specified buffers. */
-        ma_device__on_data_inner(pDevice, pFramesOut, pFramesIn, frameCount);
-    } else {
-        /* Slow path. Using a fixed sized callback. Need to use the intermediary buffer. */
-        ma_uint32 totalFramesProcessed = 0;
-
-        while (totalFramesProcessed < frameCount) {
-            ma_uint32 totalFramesRemaining = frameCount - totalFramesProcessed;
-            ma_uint32 framesToProcessThisIteration = 0;
-
-            if (pFramesIn != NULL) {
-                /* Capturing. Write to the intermediary buffer. If there's no room, fire the callback to empty it. */
-                if (pDevice->capture.intermediaryBufferLen < pDevice->capture.intermediaryBufferCap) {
-                    /* There's some room left in the intermediary buffer. Write to it without firing the callback. */
-                    framesToProcessThisIteration = totalFramesRemaining;
-                    if (framesToProcessThisIteration > pDevice->capture.intermediaryBufferCap - pDevice->capture.intermediaryBufferLen) {
-                        framesToProcessThisIteration = pDevice->capture.intermediaryBufferCap - pDevice->capture.intermediaryBufferLen;
-                    }
-
-                    ma_copy_pcm_frames(
-                        ma_offset_pcm_frames_ptr(pDevice->capture.pIntermediaryBuffer, pDevice->capture.intermediaryBufferLen, pDevice->capture.format, pDevice->capture.channels),
-                        ma_offset_pcm_frames_const_ptr(pFramesIn, totalFramesProcessed, pDevice->capture.format, pDevice->capture.channels),
-                        framesToProcessThisIteration,
-                        pDevice->capture.format, pDevice->capture.channels);
-
-                    pDevice->capture.intermediaryBufferLen += framesToProcessThisIteration;
-                }
-
-                if (pDevice->capture.intermediaryBufferLen == pDevice->capture.intermediaryBufferCap) {
-                    /* No room left in the intermediary buffer. Fire the data callback. */
-                    if (pDevice->type == ma_device_type_duplex) {
-                        /* We'll do the duplex data callback later after we've processed the playback data. */
-                    } else {
-                        ma_device__on_data_inner(pDevice, NULL, pDevice->capture.pIntermediaryBuffer, pDevice->capture.intermediaryBufferCap);
-
-                        /* The intermediary buffer has just been drained. */
-                        pDevice->capture.intermediaryBufferLen = 0;
-                    }
-                }
-            }
-
-            if (pFramesOut != NULL) {
-                /* Playing back. Read from the intermediary buffer. If there's nothing in it, fire the callback to fill it. */
-                if (pDevice->playback.intermediaryBufferLen > 0) {
-                    /* There's some content in the intermediary buffer. Read from that without firing the callback. */
-                    if (pDevice->type == ma_device_type_duplex) {
-                        /* The frames processed this iteration for a duplex device will always be based on the capture side. Leave it unmodified. */
-                    } else {
-                        framesToProcessThisIteration = totalFramesRemaining;
-                        if (framesToProcessThisIteration > pDevice->playback.intermediaryBufferLen) {
-                            framesToProcessThisIteration = pDevice->playback.intermediaryBufferLen;
-                        }
-                    }
-
-                    ma_copy_pcm_frames(
-                        ma_offset_pcm_frames_ptr(pFramesOut, totalFramesProcessed, pDevice->playback.format, pDevice->playback.channels),
-                        ma_offset_pcm_frames_ptr(pDevice->playback.pIntermediaryBuffer, pDevice->playback.intermediaryBufferCap - pDevice->playback.intermediaryBufferLen, pDevice->playback.format, pDevice->playback.channels),
-                        framesToProcessThisIteration,
-                        pDevice->playback.format, pDevice->playback.channels);
-
-                    pDevice->playback.intermediaryBufferLen -= framesToProcessThisIteration;
-                }
-
-                if (pDevice->playback.intermediaryBufferLen == 0) {
-                    /* There's nothing in the intermediary buffer. Fire the data callback to fill it. */
-                    if (pDevice->type == ma_device_type_duplex) {
-                        /* In duplex mode, the data callback will be fired later. Nothing to do here. */
-                    } else {
-                        ma_device__on_data_inner(pDevice, pDevice->playback.pIntermediaryBuffer, NULL, pDevice->playback.intermediaryBufferCap);
-
-                        /* The intermediary buffer has just been filled. */
-                        pDevice->playback.intermediaryBufferLen = pDevice->playback.intermediaryBufferCap;
-                    }
-                }
-            }
-
-            /* If we're in duplex mode we might need to do a refill of the data. */
-            if (pDevice->type == ma_device_type_duplex) {
-                if (pDevice->capture.intermediaryBufferLen == pDevice->capture.intermediaryBufferCap) {
-                    ma_device__on_data_inner(pDevice, pDevice->playback.pIntermediaryBuffer, pDevice->capture.pIntermediaryBuffer, pDevice->capture.intermediaryBufferCap);
-
-                    pDevice->playback.intermediaryBufferLen = pDevice->playback.intermediaryBufferCap;  /* The playback buffer will have just been filled. */
-                    pDevice->capture.intermediaryBufferLen  = 0;                                        /* The intermediary buffer has just been drained. */
-                }
-            }
-
-            /* Make sure this is only incremented once in the duplex case. */
-            totalFramesProcessed += framesToProcessThisIteration;
-        }
-    }
-}
-
-static void ma_device__handle_data_callback(ma_device* pDevice, void* pFramesOut, const void* pFramesIn, ma_uint32 frameCount)
-{
-    float masterVolumeFactor;
-
-    ma_device_get_master_volume(pDevice, &masterVolumeFactor);  /* Use ma_device_get_master_volume() to ensure the volume is loaded atomically. */
-
-    if (pDevice->onData) {
-        unsigned int prevDenormalState = ma_device_disable_denormals(pDevice);
-        {
-            /* Volume control of input makes things a bit awkward because the input buffer is read-only. We'll need to use a temp buffer and loop in this case. */
-            if (pFramesIn != NULL && masterVolumeFactor < 1) {
-                ma_uint8 tempFramesIn[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-                ma_uint32 bpfCapture  = ma_get_bytes_per_frame(pDevice->capture.format, pDevice->capture.channels);
-                ma_uint32 bpfPlayback = ma_get_bytes_per_frame(pDevice->playback.format, pDevice->playback.channels);
-                ma_uint32 totalFramesProcessed = 0;
-                while (totalFramesProcessed < frameCount) {
-                    ma_uint32 framesToProcessThisIteration = frameCount - totalFramesProcessed;
-                    if (framesToProcessThisIteration > sizeof(tempFramesIn)/bpfCapture) {
-                        framesToProcessThisIteration = sizeof(tempFramesIn)/bpfCapture;
-                    }
-
-                    ma_copy_and_apply_volume_factor_pcm_frames(tempFramesIn, ma_offset_ptr(pFramesIn, totalFramesProcessed*bpfCapture), framesToProcessThisIteration, pDevice->capture.format, pDevice->capture.channels, masterVolumeFactor);
-
-                    ma_device__on_data(pDevice, ma_offset_ptr(pFramesOut, totalFramesProcessed*bpfPlayback), tempFramesIn, framesToProcessThisIteration);
-
-                    totalFramesProcessed += framesToProcessThisIteration;
-                }
-            } else {
-                ma_device__on_data(pDevice, pFramesOut, pFramesIn, frameCount);
-            }
-
-            /* Volume control and clipping for playback devices. */
-            if (pFramesOut != NULL) {
-                if (masterVolumeFactor < 1) {
-                    if (pFramesIn == NULL) {    /* <-- In full-duplex situations, the volume will have been applied to the input samples before the data callback. Applying it again post-callback will incorrectly compound it. */
-                        ma_apply_volume_factor_pcm_frames(pFramesOut, frameCount, pDevice->playback.format, pDevice->playback.channels, masterVolumeFactor);
-                    }
-                }
-
-                if (!pDevice->noClip && pDevice->playback.format == ma_format_f32) {
-                    ma_clip_samples_f32((float*)pFramesOut, (const float*)pFramesOut, frameCount * pDevice->playback.channels);   /* Intentionally specifying the same pointer for both input and output for in-place processing. */
-                }
-            }
-        }
-        ma_device_restore_denormals(pDevice, prevDenormalState);
-    }
-}
-
-
-
-/* A helper function for reading sample data from the client. */
-static void ma_device__read_frames_from_client(ma_device* pDevice, ma_uint32 frameCount, void* pFramesOut)
-{
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(frameCount > 0);
-    MA_ASSERT(pFramesOut != NULL);
-
-    if (pDevice->playback.converter.isPassthrough) {
-        ma_device__handle_data_callback(pDevice, pFramesOut, NULL, frameCount);
-    } else {
-        ma_result result;
-        ma_uint64 totalFramesReadOut;
-        void* pRunningFramesOut;
-
-        totalFramesReadOut = 0;
-        pRunningFramesOut  = pFramesOut;
-
-        /*
-        We run slightly different logic depending on whether or not we're using a heap-allocated
-        buffer for caching input data. This will be the case if the data converter does not have
-        the ability to retrieve the required input frame count for a given output frame count.
-        */
-        if (pDevice->playback.pInputCache != NULL) {
-            while (totalFramesReadOut < frameCount) {
-                ma_uint64 framesToReadThisIterationIn;
-                ma_uint64 framesToReadThisIterationOut;
-
-                /* If there's any data available in the cache, that needs to get processed first. */
-                if (pDevice->playback.inputCacheRemaining > 0) {
-                    framesToReadThisIterationOut = (frameCount - totalFramesReadOut);
-                    framesToReadThisIterationIn  = framesToReadThisIterationOut;
-                    if (framesToReadThisIterationIn > pDevice->playback.inputCacheRemaining) {
-                        framesToReadThisIterationIn = pDevice->playback.inputCacheRemaining;
-                    }
-
-                    result = ma_data_converter_process_pcm_frames(&pDevice->playback.converter, ma_offset_pcm_frames_ptr(pDevice->playback.pInputCache, pDevice->playback.inputCacheConsumed, pDevice->playback.format, pDevice->playback.channels), &framesToReadThisIterationIn, pRunningFramesOut, &framesToReadThisIterationOut);
-                    if (result != MA_SUCCESS) {
-                        break;
-                    }
-
-                    pDevice->playback.inputCacheConsumed  += framesToReadThisIterationIn;
-                    pDevice->playback.inputCacheRemaining -= framesToReadThisIterationIn;
-
-                    totalFramesReadOut += framesToReadThisIterationOut;
-                    pRunningFramesOut   = ma_offset_ptr(pRunningFramesOut, framesToReadThisIterationOut * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels));
-
-                    if (framesToReadThisIterationIn == 0 && framesToReadThisIterationOut == 0) {
-                        break;  /* We're done. */
-                    }
-                }
-
-                /* Getting here means there's no data in the cache and we need to fill it up with data from the client. */
-                if (pDevice->playback.inputCacheRemaining == 0) {
-                    ma_device__handle_data_callback(pDevice, pDevice->playback.pInputCache, NULL, (ma_uint32)pDevice->playback.inputCacheCap);
-
-                    pDevice->playback.inputCacheConsumed  = 0;
-                    pDevice->playback.inputCacheRemaining = pDevice->playback.inputCacheCap;
-                }
-            }
-        } else {
-            while (totalFramesReadOut < frameCount) {
-                ma_uint8 pIntermediaryBuffer[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];  /* In client format. */
-                ma_uint64 intermediaryBufferCap = sizeof(pIntermediaryBuffer) / ma_get_bytes_per_frame(pDevice->playback.format, pDevice->playback.channels);
-                ma_uint64 framesToReadThisIterationIn;
-                ma_uint64 framesReadThisIterationIn;
-                ma_uint64 framesToReadThisIterationOut;
-                ma_uint64 framesReadThisIterationOut;
-                ma_uint64 requiredInputFrameCount;
-
-                framesToReadThisIterationOut = (frameCount - totalFramesReadOut);
-                framesToReadThisIterationIn = framesToReadThisIterationOut;
-                if (framesToReadThisIterationIn > intermediaryBufferCap) {
-                    framesToReadThisIterationIn = intermediaryBufferCap;
-                }
-
-                ma_data_converter_get_required_input_frame_count(&pDevice->playback.converter, framesToReadThisIterationOut, &requiredInputFrameCount);
-                if (framesToReadThisIterationIn > requiredInputFrameCount) {
-                    framesToReadThisIterationIn = requiredInputFrameCount;
-                }
-
-                if (framesToReadThisIterationIn > 0) {
-                    ma_device__handle_data_callback(pDevice, pIntermediaryBuffer, NULL, (ma_uint32)framesToReadThisIterationIn);
-                }
-
-                /*
-                At this point we have our decoded data in input format and now we need to convert to output format. Note that even if we didn't read any
-                input frames, we still want to try processing frames because there may some output frames generated from cached input data.
-                */
-                framesReadThisIterationIn  = framesToReadThisIterationIn;
-                framesReadThisIterationOut = framesToReadThisIterationOut;
-                result = ma_data_converter_process_pcm_frames(&pDevice->playback.converter, pIntermediaryBuffer, &framesReadThisIterationIn, pRunningFramesOut, &framesReadThisIterationOut);
-                if (result != MA_SUCCESS) {
-                    break;
-                }
-
-                totalFramesReadOut += framesReadThisIterationOut;
-                pRunningFramesOut   = ma_offset_ptr(pRunningFramesOut, framesReadThisIterationOut * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels));
-
-                if (framesReadThisIterationIn == 0 && framesReadThisIterationOut == 0) {
-                    break;  /* We're done. */
-                }
-            }
-        }
-    }
-}
-
-/* A helper for sending sample data to the client. */
-static void ma_device__send_frames_to_client(ma_device* pDevice, ma_uint32 frameCountInDeviceFormat, const void* pFramesInDeviceFormat)
-{
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(frameCountInDeviceFormat > 0);
-    MA_ASSERT(pFramesInDeviceFormat != NULL);
-
-    if (pDevice->capture.converter.isPassthrough) {
-        ma_device__handle_data_callback(pDevice, NULL, pFramesInDeviceFormat, frameCountInDeviceFormat);
-    } else {
-        ma_result result;
-        ma_uint8 pFramesInClientFormat[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-        ma_uint64 framesInClientFormatCap = sizeof(pFramesInClientFormat) / ma_get_bytes_per_frame(pDevice->capture.format, pDevice->capture.channels);
-        ma_uint64 totalDeviceFramesProcessed = 0;
-        ma_uint64 totalClientFramesProcessed = 0;
-        const void* pRunningFramesInDeviceFormat = pFramesInDeviceFormat;
-
-        /* We just keep going until we've exhaused all of our input frames and cannot generate any more output frames. */
-        for (;;) {
-            ma_uint64 deviceFramesProcessedThisIteration;
-            ma_uint64 clientFramesProcessedThisIteration;
-
-            deviceFramesProcessedThisIteration = (frameCountInDeviceFormat - totalDeviceFramesProcessed);
-            clientFramesProcessedThisIteration = framesInClientFormatCap;
-
-            result = ma_data_converter_process_pcm_frames(&pDevice->capture.converter, pRunningFramesInDeviceFormat, &deviceFramesProcessedThisIteration, pFramesInClientFormat, &clientFramesProcessedThisIteration);
-            if (result != MA_SUCCESS) {
-                break;
-            }
-
-            if (clientFramesProcessedThisIteration > 0) {
-                ma_device__handle_data_callback(pDevice, NULL, pFramesInClientFormat, (ma_uint32)clientFramesProcessedThisIteration);    /* Safe cast. */
-            }
-
-            pRunningFramesInDeviceFormat = ma_offset_ptr(pRunningFramesInDeviceFormat, deviceFramesProcessedThisIteration * ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels));
-            totalDeviceFramesProcessed  += deviceFramesProcessedThisIteration;
-            totalClientFramesProcessed  += clientFramesProcessedThisIteration;
-
-            /* This is just to silence a warning. I might want to use this variable later so leaving in place for now. */
-            (void)totalClientFramesProcessed;
-
-            if (deviceFramesProcessedThisIteration == 0 && clientFramesProcessedThisIteration == 0) {
-                break;  /* We're done. */
-            }
-        }
-    }
-}
-
-static ma_result ma_device__handle_duplex_callback_capture(ma_device* pDevice, ma_uint32 frameCountInDeviceFormat, const void* pFramesInDeviceFormat, ma_pcm_rb* pRB)
-{
-    ma_result result;
-    ma_uint32 totalDeviceFramesProcessed = 0;
-    const void* pRunningFramesInDeviceFormat = pFramesInDeviceFormat;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(frameCountInDeviceFormat > 0);
-    MA_ASSERT(pFramesInDeviceFormat != NULL);
-    MA_ASSERT(pRB != NULL);
-
-    /* Write to the ring buffer. The ring buffer is in the client format which means we need to convert. */
-    for (;;) {
-        ma_uint32 framesToProcessInDeviceFormat = (frameCountInDeviceFormat - totalDeviceFramesProcessed);
-        ma_uint32 framesToProcessInClientFormat = MA_DATA_CONVERTER_STACK_BUFFER_SIZE / ma_get_bytes_per_frame(pDevice->capture.format, pDevice->capture.channels);
-        ma_uint64 framesProcessedInDeviceFormat;
-        ma_uint64 framesProcessedInClientFormat;
-        void* pFramesInClientFormat;
-
-        result = ma_pcm_rb_acquire_write(pRB, &framesToProcessInClientFormat, &pFramesInClientFormat);
-        if (result != MA_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "Failed to acquire capture PCM frames from ring buffer.");
-            break;
-        }
-
-        if (framesToProcessInClientFormat == 0) {
-            if (ma_pcm_rb_pointer_distance(pRB) == (ma_int32)ma_pcm_rb_get_subbuffer_size(pRB)) {
-                break;  /* Overrun. Not enough room in the ring buffer for input frame. Excess frames are dropped. */
-            }
-        }
-
-        /* Convert. */
-        framesProcessedInDeviceFormat = framesToProcessInDeviceFormat;
-        framesProcessedInClientFormat = framesToProcessInClientFormat;
-        result = ma_data_converter_process_pcm_frames(&pDevice->capture.converter, pRunningFramesInDeviceFormat, &framesProcessedInDeviceFormat, pFramesInClientFormat, &framesProcessedInClientFormat);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        result = ma_pcm_rb_commit_write(pRB, (ma_uint32)framesProcessedInClientFormat);  /* Safe cast. */
-        if (result != MA_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "Failed to commit capture PCM frames to ring buffer.");
-            break;
-        }
-
-        pRunningFramesInDeviceFormat = ma_offset_ptr(pRunningFramesInDeviceFormat, framesProcessedInDeviceFormat * ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels));
-        totalDeviceFramesProcessed += (ma_uint32)framesProcessedInDeviceFormat; /* Safe cast. */
-
-        /* We're done when we're unable to process any client nor device frames. */
-        if (framesProcessedInClientFormat == 0 && framesProcessedInDeviceFormat == 0) {
-            break;  /* Done. */
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device__handle_duplex_callback_playback(ma_device* pDevice, ma_uint32 frameCount, void* pFramesInInternalFormat, ma_pcm_rb* pRB)
-{
-    ma_result result;
-    ma_uint8 silentInputFrames[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-    ma_uint32 totalFramesReadOut = 0;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(frameCount > 0);
-    MA_ASSERT(pFramesInInternalFormat != NULL);
-    MA_ASSERT(pRB != NULL);
-    MA_ASSERT(pDevice->playback.pInputCache != NULL);
-
-    /*
-    Sitting in the ring buffer should be captured data from the capture callback in external format. If there's not enough data in there for
-    the whole frameCount frames we just use silence instead for the input data.
-    */
-    MA_ZERO_MEMORY(silentInputFrames, sizeof(silentInputFrames));
-
-    while (totalFramesReadOut < frameCount && ma_device_is_started(pDevice)) {
-        /*
-        We should have a buffer allocated on the heap. Any playback frames still sitting in there
-        need to be sent to the internal device before we process any more data from the client.
-        */
-        if (pDevice->playback.inputCacheRemaining > 0) {
-            ma_uint64 framesConvertedIn  = pDevice->playback.inputCacheRemaining;
-            ma_uint64 framesConvertedOut = (frameCount - totalFramesReadOut);
-            ma_data_converter_process_pcm_frames(&pDevice->playback.converter, ma_offset_pcm_frames_ptr(pDevice->playback.pInputCache, pDevice->playback.inputCacheConsumed, pDevice->playback.format, pDevice->playback.channels), &framesConvertedIn, pFramesInInternalFormat, &framesConvertedOut);
-
-            pDevice->playback.inputCacheConsumed  += framesConvertedIn;
-            pDevice->playback.inputCacheRemaining -= framesConvertedIn;
-
-            totalFramesReadOut        += (ma_uint32)framesConvertedOut; /* Safe cast. */
-            pFramesInInternalFormat    = ma_offset_ptr(pFramesInInternalFormat, framesConvertedOut * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels));
-        }
-
-        /* If there's no more data in the cache we'll need to fill it with some. */
-        if (totalFramesReadOut < frameCount && pDevice->playback.inputCacheRemaining == 0) {
-            ma_uint32 inputFrameCount;
-            void* pInputFrames;
-
-            inputFrameCount = (ma_uint32)pDevice->playback.inputCacheCap;
-            result = ma_pcm_rb_acquire_read(pRB, &inputFrameCount, &pInputFrames);
-            if (result == MA_SUCCESS) {
-                if (inputFrameCount > 0) {
-                    ma_device__handle_data_callback(pDevice, pDevice->playback.pInputCache, pInputFrames, inputFrameCount);
-                } else {
-                    if (ma_pcm_rb_pointer_distance(pRB) == 0) {
-                        break;  /* Underrun. */
-                    }
-                }
-            } else {
-                /* No capture data available. Feed in silence. */
-                inputFrameCount = (ma_uint32)ma_min(pDevice->playback.inputCacheCap, sizeof(silentInputFrames) / ma_get_bytes_per_frame(pDevice->capture.format, pDevice->capture.channels));
-                ma_device__handle_data_callback(pDevice, pDevice->playback.pInputCache, silentInputFrames, inputFrameCount);
-            }
-
-            pDevice->playback.inputCacheConsumed  = 0;
-            pDevice->playback.inputCacheRemaining = inputFrameCount;
-
-            result = ma_pcm_rb_commit_read(pRB, inputFrameCount);
-            if (result != MA_SUCCESS) {
-                return result;  /* Should never happen. */
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-/* A helper for changing the state of the device. */
-static MA_INLINE void ma_device__set_state(ma_device* pDevice, ma_device_state newState)
-{
-    ma_atomic_device_state_set(&pDevice->state, newState);
-}
-
-
-#if defined(MA_WIN32)
-    static GUID MA_GUID_KSDATAFORMAT_SUBTYPE_PCM        = {0x00000001, 0x0000, 0x0010, {0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}};
-    static GUID MA_GUID_KSDATAFORMAT_SUBTYPE_IEEE_FLOAT = {0x00000003, 0x0000, 0x0010, {0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}};
-    /*static GUID MA_GUID_KSDATAFORMAT_SUBTYPE_ALAW       = {0x00000006, 0x0000, 0x0010, {0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}};*/
-    /*static GUID MA_GUID_KSDATAFORMAT_SUBTYPE_MULAW      = {0x00000007, 0x0000, 0x0010, {0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}};*/
-#endif
-
-
-
-MA_API ma_uint32 ma_get_format_priority_index(ma_format format) /* Lower = better. */
-{
-    ma_uint32 i;
-    for (i = 0; i < ma_countof(g_maFormatPriorities); ++i) {
-        if (g_maFormatPriorities[i] == format) {
-            return i;
-        }
-    }
-
-    /* Getting here means the format could not be found or is equal to ma_format_unknown. */
-    return (ma_uint32)-1;
-}
-
-static ma_result ma_device__post_init_setup(ma_device* pDevice, ma_device_type deviceType);
-
-static ma_bool32 ma_device_descriptor_is_valid(const ma_device_descriptor* pDeviceDescriptor)
-{
-    if (pDeviceDescriptor == NULL) {
-        return MA_FALSE;
-    }
-
-    if (pDeviceDescriptor->format == ma_format_unknown) {
-        return MA_FALSE;
-    }
-
-    if (pDeviceDescriptor->channels == 0 || pDeviceDescriptor->channels > MA_MAX_CHANNELS) {
-        return MA_FALSE;
-    }
-
-    if (pDeviceDescriptor->sampleRate == 0) {
-        return MA_FALSE;
-    }
-
-    return MA_TRUE;
-}
-
-
-static ma_result ma_device_audio_thread__default_read_write(ma_device* pDevice)
-{
-    ma_result result = MA_SUCCESS;
-    ma_bool32 exitLoop = MA_FALSE;
-    ma_uint8  capturedDeviceData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-    ma_uint8  playbackDeviceData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-    ma_uint32 capturedDeviceDataCapInFrames = 0;
-    ma_uint32 playbackDeviceDataCapInFrames = 0;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /* Just some quick validation on the device type and the available callbacks. */
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex || pDevice->type == ma_device_type_loopback) {
-        if (pDevice->pContext->callbacks.onDeviceRead == NULL) {
-            return MA_NOT_IMPLEMENTED;
-        }
-
-        capturedDeviceDataCapInFrames = sizeof(capturedDeviceData) / ma_get_bytes_per_frame(pDevice->capture.internalFormat,  pDevice->capture.internalChannels);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->pContext->callbacks.onDeviceWrite == NULL) {
-            return MA_NOT_IMPLEMENTED;
-        }
-
-        playbackDeviceDataCapInFrames = sizeof(playbackDeviceData) / ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-    }
-
-    /* NOTE: The device was started outside of this function, in the worker thread. */
-
-    while (ma_device_get_state(pDevice) == ma_device_state_started && !exitLoop) {
-        switch (pDevice->type) {
-            case ma_device_type_duplex:
-            {
-                /* The process is: onDeviceRead() -> convert -> callback -> convert -> onDeviceWrite() */
-                ma_uint32 totalCapturedDeviceFramesProcessed = 0;
-                ma_uint32 capturedDevicePeriodSizeInFrames = ma_min(pDevice->capture.internalPeriodSizeInFrames, pDevice->playback.internalPeriodSizeInFrames);
-
-                while (totalCapturedDeviceFramesProcessed < capturedDevicePeriodSizeInFrames) {
-                    ma_uint32 capturedDeviceFramesRemaining;
-                    ma_uint32 capturedDeviceFramesProcessed;
-                    ma_uint32 capturedDeviceFramesToProcess;
-                    ma_uint32 capturedDeviceFramesToTryProcessing = capturedDevicePeriodSizeInFrames - totalCapturedDeviceFramesProcessed;
-                    if (capturedDeviceFramesToTryProcessing > capturedDeviceDataCapInFrames) {
-                        capturedDeviceFramesToTryProcessing = capturedDeviceDataCapInFrames;
-                    }
-
-                    result = pDevice->pContext->callbacks.onDeviceRead(pDevice, capturedDeviceData, capturedDeviceFramesToTryProcessing, &capturedDeviceFramesToProcess);
-                    if (result != MA_SUCCESS) {
-                        exitLoop = MA_TRUE;
-                        break;
-                    }
-
-                    capturedDeviceFramesRemaining = capturedDeviceFramesToProcess;
-                    capturedDeviceFramesProcessed = 0;
-
-                    /* At this point we have our captured data in device format and we now need to convert it to client format. */
-                    for (;;) {
-                        ma_uint8  capturedClientData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-                        ma_uint8  playbackClientData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-                        ma_uint32 capturedClientDataCapInFrames = sizeof(capturedClientData) / ma_get_bytes_per_frame(pDevice->capture.format,  pDevice->capture.channels);
-                        ma_uint32 playbackClientDataCapInFrames = sizeof(playbackClientData) / ma_get_bytes_per_frame(pDevice->playback.format, pDevice->playback.channels);
-                        ma_uint64 capturedClientFramesToProcessThisIteration = ma_min(capturedClientDataCapInFrames, playbackClientDataCapInFrames);
-                        ma_uint64 capturedDeviceFramesToProcessThisIteration = capturedDeviceFramesRemaining;
-                        ma_uint8* pRunningCapturedDeviceFrames = ma_offset_ptr(capturedDeviceData, capturedDeviceFramesProcessed * ma_get_bytes_per_frame(pDevice->capture.internalFormat,  pDevice->capture.internalChannels));
-
-                        /* Convert capture data from device format to client format. */
-                        result = ma_data_converter_process_pcm_frames(&pDevice->capture.converter, pRunningCapturedDeviceFrames, &capturedDeviceFramesToProcessThisIteration, capturedClientData, &capturedClientFramesToProcessThisIteration);
-                        if (result != MA_SUCCESS) {
-                            break;
-                        }
-
-                        /*
-                        If we weren't able to generate any output frames it must mean we've exhaused all of our input. The only time this would not be the case is if capturedClientData was too small
-                        which should never be the case when it's of the size MA_DATA_CONVERTER_STACK_BUFFER_SIZE.
-                        */
-                        if (capturedClientFramesToProcessThisIteration == 0) {
-                            break;
-                        }
-
-                        ma_device__handle_data_callback(pDevice, playbackClientData, capturedClientData, (ma_uint32)capturedClientFramesToProcessThisIteration);    /* Safe cast .*/
-
-                        capturedDeviceFramesProcessed += (ma_uint32)capturedDeviceFramesToProcessThisIteration; /* Safe cast. */
-                        capturedDeviceFramesRemaining -= (ma_uint32)capturedDeviceFramesToProcessThisIteration; /* Safe cast. */
-
-                        /* At this point the playbackClientData buffer should be holding data that needs to be written to the device. */
-                        for (;;) {
-                            ma_uint64 convertedClientFrameCount = capturedClientFramesToProcessThisIteration;
-                            ma_uint64 convertedDeviceFrameCount = playbackDeviceDataCapInFrames;
-                            result = ma_data_converter_process_pcm_frames(&pDevice->playback.converter, playbackClientData, &convertedClientFrameCount, playbackDeviceData, &convertedDeviceFrameCount);
-                            if (result != MA_SUCCESS) {
-                                break;
-                            }
-
-                            result = pDevice->pContext->callbacks.onDeviceWrite(pDevice, playbackDeviceData, (ma_uint32)convertedDeviceFrameCount, NULL);   /* Safe cast. */
-                            if (result != MA_SUCCESS) {
-                                exitLoop = MA_TRUE;
-                                break;
-                            }
-
-                            capturedClientFramesToProcessThisIteration -= (ma_uint32)convertedClientFrameCount;  /* Safe cast. */
-                            if (capturedClientFramesToProcessThisIteration == 0) {
-                                break;
-                            }
-                        }
-
-                        /* In case an error happened from ma_device_write__null()... */
-                        if (result != MA_SUCCESS) {
-                            exitLoop = MA_TRUE;
-                            break;
-                        }
-                    }
-
-                    /* Make sure we don't get stuck in the inner loop. */
-                    if (capturedDeviceFramesProcessed == 0) {
-                        break;
-                    }
-
-                    totalCapturedDeviceFramesProcessed += capturedDeviceFramesProcessed;
-                }
-            } break;
-
-            case ma_device_type_capture:
-            case ma_device_type_loopback:
-            {
-                ma_uint32 periodSizeInFrames = pDevice->capture.internalPeriodSizeInFrames;
-                ma_uint32 framesReadThisPeriod = 0;
-                while (framesReadThisPeriod < periodSizeInFrames) {
-                    ma_uint32 framesRemainingInPeriod = periodSizeInFrames - framesReadThisPeriod;
-                    ma_uint32 framesProcessed;
-                    ma_uint32 framesToReadThisIteration = framesRemainingInPeriod;
-                    if (framesToReadThisIteration > capturedDeviceDataCapInFrames) {
-                        framesToReadThisIteration = capturedDeviceDataCapInFrames;
-                    }
-
-                    result = pDevice->pContext->callbacks.onDeviceRead(pDevice, capturedDeviceData, framesToReadThisIteration, &framesProcessed);
-                    if (result != MA_SUCCESS) {
-                        exitLoop = MA_TRUE;
-                        break;
-                    }
-
-                    /* Make sure we don't get stuck in the inner loop. */
-                    if (framesProcessed == 0) {
-                        break;
-                    }
-
-                    ma_device__send_frames_to_client(pDevice, framesProcessed, capturedDeviceData);
-
-                    framesReadThisPeriod += framesProcessed;
-                }
-            } break;
-
-            case ma_device_type_playback:
-            {
-                /* We write in chunks of the period size, but use a stack allocated buffer for the intermediary. */
-                ma_uint32 periodSizeInFrames = pDevice->playback.internalPeriodSizeInFrames;
-                ma_uint32 framesWrittenThisPeriod = 0;
-                while (framesWrittenThisPeriod < periodSizeInFrames) {
-                    ma_uint32 framesRemainingInPeriod = periodSizeInFrames - framesWrittenThisPeriod;
-                    ma_uint32 framesProcessed;
-                    ma_uint32 framesToWriteThisIteration = framesRemainingInPeriod;
-                    if (framesToWriteThisIteration > playbackDeviceDataCapInFrames) {
-                        framesToWriteThisIteration = playbackDeviceDataCapInFrames;
-                    }
-
-                    ma_device__read_frames_from_client(pDevice, framesToWriteThisIteration, playbackDeviceData);
-
-                    result = pDevice->pContext->callbacks.onDeviceWrite(pDevice, playbackDeviceData, framesToWriteThisIteration, &framesProcessed);
-                    if (result != MA_SUCCESS) {
-                        exitLoop = MA_TRUE;
-                        break;
-                    }
-
-                    /* Make sure we don't get stuck in the inner loop. */
-                    if (framesProcessed == 0) {
-                        break;
-                    }
-
-                    framesWrittenThisPeriod += framesProcessed;
-                }
-            } break;
-
-            /* Should never get here. */
-            default: break;
-        }
-    }
-
-    return result;
-}
-
-
-
-/*******************************************************************************
-
-Null Backend
-
-*******************************************************************************/
-#ifdef MA_HAS_NULL
-
-#define MA_DEVICE_OP_NONE__NULL    0
-#define MA_DEVICE_OP_START__NULL   1
-#define MA_DEVICE_OP_SUSPEND__NULL 2
-#define MA_DEVICE_OP_KILL__NULL    3
-
-static ma_thread_result MA_THREADCALL ma_device_thread__null(void* pData)
-{
-    ma_device* pDevice = (ma_device*)pData;
-    MA_ASSERT(pDevice != NULL);
-
-    for (;;) {  /* Keep the thread alive until the device is uninitialized. */
-        ma_uint32 operation;
-
-        /* Wait for an operation to be requested. */
-        ma_event_wait(&pDevice->null_device.operationEvent);
-
-        /* At this point an event should have been triggered. */
-        operation = pDevice->null_device.operation;
-
-        /* Starting the device needs to put the thread into a loop. */
-        if (operation == MA_DEVICE_OP_START__NULL) {
-            /* Reset the timer just in case. */
-            ma_timer_init(&pDevice->null_device.timer);
-
-            /* Getting here means a suspend or kill operation has been requested. */
-            pDevice->null_device.operationResult = MA_SUCCESS;
-            ma_event_signal(&pDevice->null_device.operationCompletionEvent);
-            ma_semaphore_release(&pDevice->null_device.operationSemaphore);
-            continue;
-        }
-
-        /* Suspending the device means we need to stop the timer and just continue the loop. */
-        if (operation == MA_DEVICE_OP_SUSPEND__NULL) {
-            /* We need to add the current run time to the prior run time, then reset the timer. */
-            pDevice->null_device.priorRunTime += ma_timer_get_time_in_seconds(&pDevice->null_device.timer);
-            ma_timer_init(&pDevice->null_device.timer);
-
-            /* We're done. */
-            pDevice->null_device.operationResult = MA_SUCCESS;
-            ma_event_signal(&pDevice->null_device.operationCompletionEvent);
-            ma_semaphore_release(&pDevice->null_device.operationSemaphore);
-            continue;
-        }
-
-        /* Killing the device means we need to get out of this loop so that this thread can terminate. */
-        if (operation == MA_DEVICE_OP_KILL__NULL) {
-            pDevice->null_device.operationResult = MA_SUCCESS;
-            ma_event_signal(&pDevice->null_device.operationCompletionEvent);
-            ma_semaphore_release(&pDevice->null_device.operationSemaphore);
-            break;
-        }
-
-        /* Getting a signal on a "none" operation probably means an error. Return invalid operation. */
-        if (operation == MA_DEVICE_OP_NONE__NULL) {
-            MA_ASSERT(MA_FALSE);  /* <-- Trigger this in debug mode to ensure developers are aware they're doing something wrong (or there's a bug in a miniaudio). */
-            pDevice->null_device.operationResult = MA_INVALID_OPERATION;
-            ma_event_signal(&pDevice->null_device.operationCompletionEvent);
-            ma_semaphore_release(&pDevice->null_device.operationSemaphore);
-            continue;   /* Continue the loop. Don't terminate. */
-        }
-    }
-
-    return (ma_thread_result)0;
-}
-
-static ma_result ma_device_do_operation__null(ma_device* pDevice, ma_uint32 operation)
-{
-    ma_result result;
-
-    /*
-    TODO: Need to review this and consider just using mutual exclusion. I think the original motivation
-    for this was to just post the event to a queue and return immediately, but that has since changed
-    and now this function is synchronous. I think this can be simplified to just use a mutex.
-    */
-
-    /*
-    The first thing to do is wait for an operation slot to become available. We only have a single slot for this, but we could extend this later
-    to support queing of operations.
-    */
-    result = ma_semaphore_wait(&pDevice->null_device.operationSemaphore);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to wait for the event. */
-    }
-
-    /*
-    When we get here it means the background thread is not referencing the operation code and it can be changed. After changing this we need to
-    signal an event to the worker thread to let it know that it can start work.
-    */
-    pDevice->null_device.operation = operation;
-
-    /* Once the operation code has been set, the worker thread can start work. */
-    if (ma_event_signal(&pDevice->null_device.operationEvent) != MA_SUCCESS) {
-        return MA_ERROR;
-    }
-
-    /* We want everything to be synchronous so we're going to wait for the worker thread to complete it's operation. */
-    if (ma_event_wait(&pDevice->null_device.operationCompletionEvent) != MA_SUCCESS) {
-        return MA_ERROR;
-    }
-
-    return pDevice->null_device.operationResult;
-}
-
-static ma_uint64 ma_device_get_total_run_time_in_frames__null(ma_device* pDevice)
-{
-    ma_uint32 internalSampleRate;
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        internalSampleRate = pDevice->capture.internalSampleRate;
-    } else {
-        internalSampleRate = pDevice->playback.internalSampleRate;
-    }
-
-    return (ma_uint64)((pDevice->null_device.priorRunTime + ma_timer_get_time_in_seconds(&pDevice->null_device.timer)) * internalSampleRate);
-}
-
-static ma_result ma_context_enumerate_devices__null(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_bool32 cbResult = MA_TRUE;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /* Playback. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), "NULL Playback Device", (size_t)-1);
-        deviceInfo.isDefault = MA_TRUE; /* Only one playback and capture device for the null backend, so might as well mark as default. */
-        cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-    }
-
-    /* Capture. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), "NULL Capture Device", (size_t)-1);
-        deviceInfo.isDefault = MA_TRUE; /* Only one playback and capture device for the null backend, so might as well mark as default. */
-        cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-    }
-
-    (void)cbResult; /* Silence a static analysis warning. */
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__null(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    MA_ASSERT(pContext != NULL);
-
-    if (pDeviceID != NULL && pDeviceID->nullbackend != 0) {
-        return MA_NO_DEVICE;   /* Don't know the device. */
-    }
-
-    /* Name / Description */
-    if (deviceType == ma_device_type_playback) {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), "NULL Playback Device", (size_t)-1);
-    } else {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), "NULL Capture Device", (size_t)-1);
-    }
-
-    pDeviceInfo->isDefault = MA_TRUE;   /* Only one playback and capture device for the null backend, so might as well mark as default. */
-
-    /* Support everything on the null backend. */
-    pDeviceInfo->nativeDataFormats[0].format     = ma_format_unknown;
-    pDeviceInfo->nativeDataFormats[0].channels   = 0;
-    pDeviceInfo->nativeDataFormats[0].sampleRate = 0;
-    pDeviceInfo->nativeDataFormats[0].flags      = 0;
-    pDeviceInfo->nativeDataFormatCount = 1;
-
-    (void)pContext;
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_device_uninit__null(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    /* Keep it clean and wait for the device thread to finish before returning. */
-    ma_device_do_operation__null(pDevice, MA_DEVICE_OP_KILL__NULL);
-
-    /* Wait for the thread to finish before continuing. */
-    ma_thread_wait(&pDevice->null_device.deviceThread);
-
-    /* At this point the loop in the device thread is as good as terminated so we can uninitialize our events. */
-    ma_semaphore_uninit(&pDevice->null_device.operationSemaphore);
-    ma_event_uninit(&pDevice->null_device.operationCompletionEvent);
-    ma_event_uninit(&pDevice->null_device.operationEvent);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__null(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->null_device);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /* The null backend supports everything exactly as we specify it. */
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        pDescriptorCapture->format     = (pDescriptorCapture->format     != ma_format_unknown) ? pDescriptorCapture->format     : MA_DEFAULT_FORMAT;
-        pDescriptorCapture->channels   = (pDescriptorCapture->channels   != 0)                 ? pDescriptorCapture->channels   : MA_DEFAULT_CHANNELS;
-        pDescriptorCapture->sampleRate = (pDescriptorCapture->sampleRate != 0)                 ? pDescriptorCapture->sampleRate : MA_DEFAULT_SAMPLE_RATE;
-
-        if (pDescriptorCapture->channelMap[0] == MA_CHANNEL_NONE) {
-            ma_channel_map_init_standard(ma_standard_channel_map_default, pDescriptorCapture->channelMap, ma_countof(pDescriptorCapture->channelMap), pDescriptorCapture->channels);
-        }
-
-        pDescriptorCapture->periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptorCapture, pDescriptorCapture->sampleRate, pConfig->performanceProfile);
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        pDescriptorPlayback->format     = (pDescriptorPlayback->format     != ma_format_unknown) ? pDescriptorPlayback->format     : MA_DEFAULT_FORMAT;
-        pDescriptorPlayback->channels   = (pDescriptorPlayback->channels   != 0)                 ? pDescriptorPlayback->channels   : MA_DEFAULT_CHANNELS;
-        pDescriptorPlayback->sampleRate = (pDescriptorPlayback->sampleRate != 0)                 ? pDescriptorPlayback->sampleRate : MA_DEFAULT_SAMPLE_RATE;
-
-        if (pDescriptorPlayback->channelMap[0] == MA_CHANNEL_NONE) {
-            ma_channel_map_init_standard(ma_standard_channel_map_default, pDescriptorPlayback->channelMap, ma_countof(pDescriptorCapture->channelMap), pDescriptorPlayback->channels);
-        }
-
-        pDescriptorPlayback->periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptorPlayback, pDescriptorPlayback->sampleRate, pConfig->performanceProfile);
-    }
-
-    /*
-    In order to get timing right, we need to create a thread that does nothing but keeps track of the timer. This timer is started when the
-    first period is "written" to it, and then stopped in ma_device_stop__null().
-    */
-    result = ma_event_init(&pDevice->null_device.operationEvent);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_event_init(&pDevice->null_device.operationCompletionEvent);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_semaphore_init(1, &pDevice->null_device.operationSemaphore);    /* <-- It's important that the initial value is set to 1. */
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_thread_create(&pDevice->null_device.deviceThread, pDevice->pContext->threadPriority, 0, ma_device_thread__null, pDevice, &pDevice->pContext->allocationCallbacks);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__null(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    ma_device_do_operation__null(pDevice, MA_DEVICE_OP_START__NULL);
-
-    ma_atomic_bool32_set(&pDevice->null_device.isStarted, MA_TRUE);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__null(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    ma_device_do_operation__null(pDevice, MA_DEVICE_OP_SUSPEND__NULL);
-
-    ma_atomic_bool32_set(&pDevice->null_device.isStarted, MA_FALSE);
-    return MA_SUCCESS;
-}
-
-static ma_bool32 ma_device_is_started__null(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    return ma_atomic_bool32_get(&pDevice->null_device.isStarted);
-}
-
-static ma_result ma_device_write__null(ma_device* pDevice, const void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint32 totalPCMFramesProcessed;
-    ma_bool32 wasStartedOnEntry;
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = 0;
-    }
-
-    wasStartedOnEntry = ma_device_is_started__null(pDevice);
-
-    /* Keep going until everything has been read. */
-    totalPCMFramesProcessed = 0;
-    while (totalPCMFramesProcessed < frameCount) {
-        ma_uint64 targetFrame;
-
-        /* If there are any frames remaining in the current period, consume those first. */
-        if (pDevice->null_device.currentPeriodFramesRemainingPlayback > 0) {
-            ma_uint32 framesRemaining = (frameCount - totalPCMFramesProcessed);
-            ma_uint32 framesToProcess = pDevice->null_device.currentPeriodFramesRemainingPlayback;
-            if (framesToProcess > framesRemaining) {
-                framesToProcess = framesRemaining;
-            }
-
-            /* We don't actually do anything with pPCMFrames, so just mark it as unused to prevent a warning. */
-            (void)pPCMFrames;
-
-            pDevice->null_device.currentPeriodFramesRemainingPlayback -= framesToProcess;
-            totalPCMFramesProcessed += framesToProcess;
-        }
-
-        /* If we've consumed the current period we'll need to mark it as such an ensure the device is started if it's not already. */
-        if (pDevice->null_device.currentPeriodFramesRemainingPlayback == 0) {
-            pDevice->null_device.currentPeriodFramesRemainingPlayback = 0;
-
-            if (!ma_device_is_started__null(pDevice) && !wasStartedOnEntry) {
-                result = ma_device_start__null(pDevice);
-                if (result != MA_SUCCESS) {
-                    break;
-                }
-            }
-        }
-
-        /* If we've consumed the whole buffer we can return now. */
-        MA_ASSERT(totalPCMFramesProcessed <= frameCount);
-        if (totalPCMFramesProcessed == frameCount) {
-            break;
-        }
-
-        /* Getting here means we've still got more frames to consume, we but need to wait for it to become available. */
-        targetFrame = pDevice->null_device.lastProcessedFramePlayback;
-        for (;;) {
-            ma_uint64 currentFrame;
-
-            /* Stop waiting if the device has been stopped. */
-            if (!ma_device_is_started__null(pDevice)) {
-                break;
-            }
-
-            currentFrame = ma_device_get_total_run_time_in_frames__null(pDevice);
-            if (currentFrame >= targetFrame) {
-                break;
-            }
-
-            /* Getting here means we haven't yet reached the target sample, so continue waiting. */
-            ma_sleep(10);
-        }
-
-        pDevice->null_device.lastProcessedFramePlayback          += pDevice->playback.internalPeriodSizeInFrames;
-        pDevice->null_device.currentPeriodFramesRemainingPlayback = pDevice->playback.internalPeriodSizeInFrames;
-    }
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = totalPCMFramesProcessed;
-    }
-
-    return result;
-}
-
-static ma_result ma_device_read__null(ma_device* pDevice, void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint32 totalPCMFramesProcessed;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    /* Keep going until everything has been read. */
-    totalPCMFramesProcessed = 0;
-    while (totalPCMFramesProcessed < frameCount) {
-        ma_uint64 targetFrame;
-
-        /* If there are any frames remaining in the current period, consume those first. */
-        if (pDevice->null_device.currentPeriodFramesRemainingCapture > 0) {
-            ma_uint32 bpf = ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-            ma_uint32 framesRemaining = (frameCount - totalPCMFramesProcessed);
-            ma_uint32 framesToProcess = pDevice->null_device.currentPeriodFramesRemainingCapture;
-            if (framesToProcess > framesRemaining) {
-                framesToProcess = framesRemaining;
-            }
-
-            /* We need to ensure the output buffer is zeroed. */
-            MA_ZERO_MEMORY(ma_offset_ptr(pPCMFrames, totalPCMFramesProcessed*bpf), framesToProcess*bpf);
-
-            pDevice->null_device.currentPeriodFramesRemainingCapture -= framesToProcess;
-            totalPCMFramesProcessed += framesToProcess;
-        }
-
-        /* If we've consumed the current period we'll need to mark it as such an ensure the device is started if it's not already. */
-        if (pDevice->null_device.currentPeriodFramesRemainingCapture == 0) {
-            pDevice->null_device.currentPeriodFramesRemainingCapture = 0;
-        }
-
-        /* If we've consumed the whole buffer we can return now. */
-        MA_ASSERT(totalPCMFramesProcessed <= frameCount);
-        if (totalPCMFramesProcessed == frameCount) {
-            break;
-        }
-
-        /* Getting here means we've still got more frames to consume, we but need to wait for it to become available. */
-        targetFrame = pDevice->null_device.lastProcessedFrameCapture + pDevice->capture.internalPeriodSizeInFrames;
-        for (;;) {
-            ma_uint64 currentFrame;
-
-            /* Stop waiting if the device has been stopped. */
-            if (!ma_device_is_started__null(pDevice)) {
-                break;
-            }
-
-            currentFrame = ma_device_get_total_run_time_in_frames__null(pDevice);
-            if (currentFrame >= targetFrame) {
-                break;
-            }
-
-            /* Getting here means we haven't yet reached the target sample, so continue waiting. */
-            ma_sleep(10);
-        }
-
-        pDevice->null_device.lastProcessedFrameCapture          += pDevice->capture.internalPeriodSizeInFrames;
-        pDevice->null_device.currentPeriodFramesRemainingCapture = pDevice->capture.internalPeriodSizeInFrames;
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalPCMFramesProcessed;
-    }
-
-    return result;
-}
-
-static ma_result ma_context_uninit__null(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_null);
-
-    (void)pContext;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__null(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig;
-    (void)pContext;
-
-    pCallbacks->onContextInit             = ma_context_init__null;
-    pCallbacks->onContextUninit           = ma_context_uninit__null;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__null;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__null;
-    pCallbacks->onDeviceInit              = ma_device_init__null;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__null;
-    pCallbacks->onDeviceStart             = ma_device_start__null;
-    pCallbacks->onDeviceStop              = ma_device_stop__null;
-    pCallbacks->onDeviceRead              = ma_device_read__null;
-    pCallbacks->onDeviceWrite             = ma_device_write__null;
-    pCallbacks->onDeviceDataLoop          = NULL;   /* Our backend is asynchronous with a blocking read-write API which means we can get miniaudio to deal with the audio thread. */
-
-    /* The null backend always works. */
-    return MA_SUCCESS;
-}
-#endif
-
-
-
-/*******************************************************************************
-
-WIN32 COMMON
-
-*******************************************************************************/
-#if defined(MA_WIN32)
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    #define ma_CoInitializeEx(pContext, pvReserved, dwCoInit)                          ((pContext->win32.CoInitializeEx) ? ((MA_PFN_CoInitializeEx)pContext->win32.CoInitializeEx)(pvReserved, dwCoInit) : ((MA_PFN_CoInitialize)pContext->win32.CoInitialize)(pvReserved))
-    #define ma_CoUninitialize(pContext)                                                ((MA_PFN_CoUninitialize)pContext->win32.CoUninitialize)()
-    #define ma_CoCreateInstance(pContext, rclsid, pUnkOuter, dwClsContext, riid, ppv)  ((MA_PFN_CoCreateInstance)pContext->win32.CoCreateInstance)(rclsid, pUnkOuter, dwClsContext, riid, ppv)
-    #define ma_CoTaskMemFree(pContext, pv)                                             ((MA_PFN_CoTaskMemFree)pContext->win32.CoTaskMemFree)(pv)
-    #define ma_PropVariantClear(pContext, pvar)                                        ((MA_PFN_PropVariantClear)pContext->win32.PropVariantClear)(pvar)
-#else
-    #define ma_CoInitializeEx(pContext, pvReserved, dwCoInit)                          CoInitializeEx(pvReserved, dwCoInit)
-    #define ma_CoUninitialize(pContext)                                                CoUninitialize()
-    #define ma_CoCreateInstance(pContext, rclsid, pUnkOuter, dwClsContext, riid, ppv)  CoCreateInstance(rclsid, pUnkOuter, dwClsContext, riid, ppv)
-    #define ma_CoTaskMemFree(pContext, pv)                                             CoTaskMemFree(pv)
-    #define ma_PropVariantClear(pContext, pvar)                                        PropVariantClear(pvar)
-#endif
-
-#if !defined(MAXULONG_PTR) && !defined(__WATCOMC__)
-typedef size_t DWORD_PTR;
-#endif
-
-#if !defined(WAVE_FORMAT_1M08)
-#define WAVE_FORMAT_1M08    0x00000001
-#define WAVE_FORMAT_1S08    0x00000002
-#define WAVE_FORMAT_1M16    0x00000004
-#define WAVE_FORMAT_1S16    0x00000008
-#define WAVE_FORMAT_2M08    0x00000010
-#define WAVE_FORMAT_2S08    0x00000020
-#define WAVE_FORMAT_2M16    0x00000040
-#define WAVE_FORMAT_2S16    0x00000080
-#define WAVE_FORMAT_4M08    0x00000100
-#define WAVE_FORMAT_4S08    0x00000200
-#define WAVE_FORMAT_4M16    0x00000400
-#define WAVE_FORMAT_4S16    0x00000800
-#endif
-
-#if !defined(WAVE_FORMAT_44M08)
-#define WAVE_FORMAT_44M08   0x00000100
-#define WAVE_FORMAT_44S08   0x00000200
-#define WAVE_FORMAT_44M16   0x00000400
-#define WAVE_FORMAT_44S16   0x00000800
-#define WAVE_FORMAT_48M08   0x00001000
-#define WAVE_FORMAT_48S08   0x00002000
-#define WAVE_FORMAT_48M16   0x00004000
-#define WAVE_FORMAT_48S16   0x00008000
-#define WAVE_FORMAT_96M08   0x00010000
-#define WAVE_FORMAT_96S08   0x00020000
-#define WAVE_FORMAT_96M16   0x00040000
-#define WAVE_FORMAT_96S16   0x00080000
-#endif
-
-#ifndef SPEAKER_FRONT_LEFT
-#define SPEAKER_FRONT_LEFT            0x1
-#define SPEAKER_FRONT_RIGHT           0x2
-#define SPEAKER_FRONT_CENTER          0x4
-#define SPEAKER_LOW_FREQUENCY         0x8
-#define SPEAKER_BACK_LEFT             0x10
-#define SPEAKER_BACK_RIGHT            0x20
-#define SPEAKER_FRONT_LEFT_OF_CENTER  0x40
-#define SPEAKER_FRONT_RIGHT_OF_CENTER 0x80
-#define SPEAKER_BACK_CENTER           0x100
-#define SPEAKER_SIDE_LEFT             0x200
-#define SPEAKER_SIDE_RIGHT            0x400
-#define SPEAKER_TOP_CENTER            0x800
-#define SPEAKER_TOP_FRONT_LEFT        0x1000
-#define SPEAKER_TOP_FRONT_CENTER      0x2000
-#define SPEAKER_TOP_FRONT_RIGHT       0x4000
-#define SPEAKER_TOP_BACK_LEFT         0x8000
-#define SPEAKER_TOP_BACK_CENTER       0x10000
-#define SPEAKER_TOP_BACK_RIGHT        0x20000
-#endif
-
-/*
-Implement our own version of MA_WAVEFORMATEXTENSIBLE so we can avoid a header. Be careful with this
-because MA_WAVEFORMATEX has an extra two bytes over standard WAVEFORMATEX due to padding. The
-standard version uses tight packing, but for compiler compatibility we're not doing that with ours.
-*/
-typedef struct
-{
-    WORD wFormatTag;
-    WORD nChannels;
-    DWORD nSamplesPerSec;
-    DWORD nAvgBytesPerSec;
-    WORD nBlockAlign;
-    WORD wBitsPerSample;
-    WORD cbSize;
-} MA_WAVEFORMATEX;
-
-typedef struct
-{
-    WORD wFormatTag;
-    WORD nChannels;
-    DWORD nSamplesPerSec;
-    DWORD nAvgBytesPerSec;
-    WORD nBlockAlign;
-    WORD wBitsPerSample;
-    WORD cbSize;
-    union
-    {
-        WORD wValidBitsPerSample;
-        WORD wSamplesPerBlock;
-        WORD wReserved;
-    } Samples;
-    DWORD dwChannelMask;
-    GUID SubFormat;
-} MA_WAVEFORMATEXTENSIBLE;
-
-
-
-#ifndef WAVE_FORMAT_EXTENSIBLE
-#define WAVE_FORMAT_EXTENSIBLE  0xFFFE
-#endif
-
-#ifndef WAVE_FORMAT_PCM
-#define WAVE_FORMAT_PCM         1
-#endif
-
-#ifndef WAVE_FORMAT_IEEE_FLOAT
-#define WAVE_FORMAT_IEEE_FLOAT  0x0003
-#endif
-
-/* Converts an individual Win32-style channel identifier (SPEAKER_FRONT_LEFT, etc.) to miniaudio. */
-static ma_uint8 ma_channel_id_to_ma__win32(DWORD id)
-{
-    switch (id)
-    {
-        case SPEAKER_FRONT_LEFT:            return MA_CHANNEL_FRONT_LEFT;
-        case SPEAKER_FRONT_RIGHT:           return MA_CHANNEL_FRONT_RIGHT;
-        case SPEAKER_FRONT_CENTER:          return MA_CHANNEL_FRONT_CENTER;
-        case SPEAKER_LOW_FREQUENCY:         return MA_CHANNEL_LFE;
-        case SPEAKER_BACK_LEFT:             return MA_CHANNEL_BACK_LEFT;
-        case SPEAKER_BACK_RIGHT:            return MA_CHANNEL_BACK_RIGHT;
-        case SPEAKER_FRONT_LEFT_OF_CENTER:  return MA_CHANNEL_FRONT_LEFT_CENTER;
-        case SPEAKER_FRONT_RIGHT_OF_CENTER: return MA_CHANNEL_FRONT_RIGHT_CENTER;
-        case SPEAKER_BACK_CENTER:           return MA_CHANNEL_BACK_CENTER;
-        case SPEAKER_SIDE_LEFT:             return MA_CHANNEL_SIDE_LEFT;
-        case SPEAKER_SIDE_RIGHT:            return MA_CHANNEL_SIDE_RIGHT;
-        case SPEAKER_TOP_CENTER:            return MA_CHANNEL_TOP_CENTER;
-        case SPEAKER_TOP_FRONT_LEFT:        return MA_CHANNEL_TOP_FRONT_LEFT;
-        case SPEAKER_TOP_FRONT_CENTER:      return MA_CHANNEL_TOP_FRONT_CENTER;
-        case SPEAKER_TOP_FRONT_RIGHT:       return MA_CHANNEL_TOP_FRONT_RIGHT;
-        case SPEAKER_TOP_BACK_LEFT:         return MA_CHANNEL_TOP_BACK_LEFT;
-        case SPEAKER_TOP_BACK_CENTER:       return MA_CHANNEL_TOP_BACK_CENTER;
-        case SPEAKER_TOP_BACK_RIGHT:        return MA_CHANNEL_TOP_BACK_RIGHT;
-        default: return 0;
-    }
-}
-
-/* Converts an individual miniaudio channel identifier (MA_CHANNEL_FRONT_LEFT, etc.) to Win32-style. */
-static DWORD ma_channel_id_to_win32(DWORD id)
-{
-    switch (id)
-    {
-        case MA_CHANNEL_MONO:               return SPEAKER_FRONT_CENTER;
-        case MA_CHANNEL_FRONT_LEFT:         return SPEAKER_FRONT_LEFT;
-        case MA_CHANNEL_FRONT_RIGHT:        return SPEAKER_FRONT_RIGHT;
-        case MA_CHANNEL_FRONT_CENTER:       return SPEAKER_FRONT_CENTER;
-        case MA_CHANNEL_LFE:                return SPEAKER_LOW_FREQUENCY;
-        case MA_CHANNEL_BACK_LEFT:          return SPEAKER_BACK_LEFT;
-        case MA_CHANNEL_BACK_RIGHT:         return SPEAKER_BACK_RIGHT;
-        case MA_CHANNEL_FRONT_LEFT_CENTER:  return SPEAKER_FRONT_LEFT_OF_CENTER;
-        case MA_CHANNEL_FRONT_RIGHT_CENTER: return SPEAKER_FRONT_RIGHT_OF_CENTER;
-        case MA_CHANNEL_BACK_CENTER:        return SPEAKER_BACK_CENTER;
-        case MA_CHANNEL_SIDE_LEFT:          return SPEAKER_SIDE_LEFT;
-        case MA_CHANNEL_SIDE_RIGHT:         return SPEAKER_SIDE_RIGHT;
-        case MA_CHANNEL_TOP_CENTER:         return SPEAKER_TOP_CENTER;
-        case MA_CHANNEL_TOP_FRONT_LEFT:     return SPEAKER_TOP_FRONT_LEFT;
-        case MA_CHANNEL_TOP_FRONT_CENTER:   return SPEAKER_TOP_FRONT_CENTER;
-        case MA_CHANNEL_TOP_FRONT_RIGHT:    return SPEAKER_TOP_FRONT_RIGHT;
-        case MA_CHANNEL_TOP_BACK_LEFT:      return SPEAKER_TOP_BACK_LEFT;
-        case MA_CHANNEL_TOP_BACK_CENTER:    return SPEAKER_TOP_BACK_CENTER;
-        case MA_CHANNEL_TOP_BACK_RIGHT:     return SPEAKER_TOP_BACK_RIGHT;
-        default: return 0;
-    }
-}
-
-/* Converts a channel mapping to a Win32-style channel mask. */
-static DWORD ma_channel_map_to_channel_mask__win32(const ma_channel* pChannelMap, ma_uint32 channels)
-{
-    DWORD dwChannelMask = 0;
-    ma_uint32 iChannel;
-
-    for (iChannel = 0; iChannel < channels; ++iChannel) {
-        dwChannelMask |= ma_channel_id_to_win32(pChannelMap[iChannel]);
-    }
-
-    return dwChannelMask;
-}
-
-/* Converts a Win32-style channel mask to a miniaudio channel map. */
-static void ma_channel_mask_to_channel_map__win32(DWORD dwChannelMask, ma_uint32 channels, ma_channel* pChannelMap)
-{
-    /* If the channel mask is set to 0, just assume a default Win32 channel map. */
-    if (dwChannelMask == 0) {
-        ma_channel_map_init_standard(ma_standard_channel_map_microsoft, pChannelMap, channels, channels);
-    } else {
-        if (channels == 1 && (dwChannelMask & SPEAKER_FRONT_CENTER) != 0) {
-            pChannelMap[0] = MA_CHANNEL_MONO;
-        } else {
-            /* Just iterate over each bit. */
-            ma_uint32 iChannel = 0;
-            ma_uint32 iBit;
-
-            for (iBit = 0; iBit < 32 && iChannel < channels; ++iBit) {
-                DWORD bitValue = (dwChannelMask & (1UL << iBit));
-                if (bitValue != 0) {
-                    /* The bit is set. */
-                    pChannelMap[iChannel] = ma_channel_id_to_ma__win32(bitValue);
-                    iChannel += 1;
-                }
-            }
-        }
-    }
-}
-
-#ifdef __cplusplus
-static ma_bool32 ma_is_guid_equal(const void* a, const void* b)
-{
-    return IsEqualGUID(*(const GUID*)a, *(const GUID*)b);
-}
-#else
-#define ma_is_guid_equal(a, b) IsEqualGUID((const GUID*)a, (const GUID*)b)
-#endif
-
-static MA_INLINE ma_bool32 ma_is_guid_null(const void* guid)
-{
-    static GUID nullguid = {0x00000000, 0x0000, 0x0000, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
-    return ma_is_guid_equal(guid, &nullguid);
-}
-
-static ma_format ma_format_from_WAVEFORMATEX(const MA_WAVEFORMATEX* pWF)
-{
-    MA_ASSERT(pWF != NULL);
-
-    if (pWF->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
-        const MA_WAVEFORMATEXTENSIBLE* pWFEX = (const MA_WAVEFORMATEXTENSIBLE*)pWF;
-        if (ma_is_guid_equal(&pWFEX->SubFormat, &MA_GUID_KSDATAFORMAT_SUBTYPE_PCM)) {
-            if (pWFEX->Samples.wValidBitsPerSample == 32) {
-                return ma_format_s32;
-            }
-            if (pWFEX->Samples.wValidBitsPerSample == 24) {
-                if (pWFEX->wBitsPerSample == 32) {
-                    return ma_format_s32;
-                }
-                if (pWFEX->wBitsPerSample == 24) {
-                    return ma_format_s24;
-                }
-            }
-            if (pWFEX->Samples.wValidBitsPerSample == 16) {
-                return ma_format_s16;
-            }
-            if (pWFEX->Samples.wValidBitsPerSample == 8) {
-                return ma_format_u8;
-            }
-        }
-        if (ma_is_guid_equal(&pWFEX->SubFormat, &MA_GUID_KSDATAFORMAT_SUBTYPE_IEEE_FLOAT)) {
-            if (pWFEX->Samples.wValidBitsPerSample == 32) {
-                return ma_format_f32;
-            }
-            /*
-            if (pWFEX->Samples.wValidBitsPerSample == 64) {
-                return ma_format_f64;
-            }
-            */
-        }
-    } else {
-        if (pWF->wFormatTag == WAVE_FORMAT_PCM) {
-            if (pWF->wBitsPerSample == 32) {
-                return ma_format_s32;
-            }
-            if (pWF->wBitsPerSample == 24) {
-                return ma_format_s24;
-            }
-            if (pWF->wBitsPerSample == 16) {
-                return ma_format_s16;
-            }
-            if (pWF->wBitsPerSample == 8) {
-                return ma_format_u8;
-            }
-        }
-        if (pWF->wFormatTag == WAVE_FORMAT_IEEE_FLOAT) {
-            if (pWF->wBitsPerSample == 32) {
-                return ma_format_f32;
-            }
-            if (pWF->wBitsPerSample == 64) {
-                /*return ma_format_f64;*/
-            }
-        }
-    }
-
-    return ma_format_unknown;
-}
-#endif
-
-
-/*******************************************************************************
-
-WASAPI Backend
-
-*******************************************************************************/
-#ifdef MA_HAS_WASAPI
-#if 0
-#if defined(_MSC_VER)
-    #pragma warning(push)
-    #pragma warning(disable:4091)   /* 'typedef ': ignored on left of '' when no variable is declared */
-#endif
-#include <audioclient.h>
-#include <mmdeviceapi.h>
-#if defined(_MSC_VER)
-    #pragma warning(pop)
-#endif
-#endif  /* 0 */
-
-static ma_result ma_device_reroute__wasapi(ma_device* pDevice, ma_device_type deviceType);
-
-/* Some compilers don't define VerifyVersionInfoW. Need to write this ourselves. */
-#define MA_WIN32_WINNT_VISTA    0x0600
-#define MA_VER_MINORVERSION     0x01
-#define MA_VER_MAJORVERSION     0x02
-#define MA_VER_SERVICEPACKMAJOR 0x20
-#define MA_VER_GREATER_EQUAL    0x03
-
-typedef struct  {
-    DWORD dwOSVersionInfoSize;
-    DWORD dwMajorVersion;
-    DWORD dwMinorVersion;
-    DWORD dwBuildNumber;
-    DWORD dwPlatformId;
-    WCHAR szCSDVersion[128];
-    WORD  wServicePackMajor;
-    WORD  wServicePackMinor;
-    WORD  wSuiteMask;
-    BYTE  wProductType;
-    BYTE  wReserved;
-} ma_OSVERSIONINFOEXW;
-
-typedef BOOL      (WINAPI * ma_PFNVerifyVersionInfoW) (ma_OSVERSIONINFOEXW* lpVersionInfo, DWORD dwTypeMask, DWORDLONG dwlConditionMask);
-typedef ULONGLONG (WINAPI * ma_PFNVerSetConditionMask)(ULONGLONG dwlConditionMask, DWORD dwTypeBitMask, BYTE dwConditionMask);
-
-
-#ifndef PROPERTYKEY_DEFINED
-#define PROPERTYKEY_DEFINED
-#ifndef __WATCOMC__
-typedef struct
-{
-    GUID fmtid;
-    DWORD pid;
-} PROPERTYKEY;
-#endif
-#endif
-
-/* Some compilers don't define PropVariantInit(). We just do this ourselves since it's just a memset(). */
-static MA_INLINE void ma_PropVariantInit(MA_PROPVARIANT* pProp)
-{
-    MA_ZERO_OBJECT(pProp);
-}
-
-
-static const PROPERTYKEY MA_PKEY_Device_FriendlyName             = {{0xA45C254E, 0xDF1C, 0x4EFD, {0x80, 0x20, 0x67, 0xD1, 0x46, 0xA8, 0x50, 0xE0}}, 14};
-static const PROPERTYKEY MA_PKEY_AudioEngine_DeviceFormat        = {{0xF19F064D, 0x82C,  0x4E27, {0xBC, 0x73, 0x68, 0x82, 0xA1, 0xBB, 0x8E, 0x4C}},  0};
-
-static const IID MA_IID_IUnknown                                 = {0x00000000, 0x0000, 0x0000, {0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46}}; /* 00000000-0000-0000-C000-000000000046 */
-#if !defined(MA_WIN32_DESKTOP) && !defined(MA_WIN32_GDK)
-static const IID MA_IID_IAgileObject                             = {0x94EA2B94, 0xE9CC, 0x49E0, {0xC0, 0xFF, 0xEE, 0x64, 0xCA, 0x8F, 0x5B, 0x90}}; /* 94EA2B94-E9CC-49E0-C0FF-EE64CA8F5B90 */
-#endif
-
-static const IID MA_IID_IAudioClient                             = {0x1CB9AD4C, 0xDBFA, 0x4C32, {0xB1, 0x78, 0xC2, 0xF5, 0x68, 0xA7, 0x03, 0xB2}}; /* 1CB9AD4C-DBFA-4C32-B178-C2F568A703B2 = __uuidof(IAudioClient) */
-static const IID MA_IID_IAudioClient2                            = {0x726778CD, 0xF60A, 0x4EDA, {0x82, 0xDE, 0xE4, 0x76, 0x10, 0xCD, 0x78, 0xAA}}; /* 726778CD-F60A-4EDA-82DE-E47610CD78AA = __uuidof(IAudioClient2) */
-static const IID MA_IID_IAudioClient3                            = {0x7ED4EE07, 0x8E67, 0x4CD4, {0x8C, 0x1A, 0x2B, 0x7A, 0x59, 0x87, 0xAD, 0x42}}; /* 7ED4EE07-8E67-4CD4-8C1A-2B7A5987AD42 = __uuidof(IAudioClient3) */
-static const IID MA_IID_IAudioRenderClient                       = {0xF294ACFC, 0x3146, 0x4483, {0xA7, 0xBF, 0xAD, 0xDC, 0xA7, 0xC2, 0x60, 0xE2}}; /* F294ACFC-3146-4483-A7BF-ADDCA7C260E2 = __uuidof(IAudioRenderClient) */
-static const IID MA_IID_IAudioCaptureClient                      = {0xC8ADBD64, 0xE71E, 0x48A0, {0xA4, 0xDE, 0x18, 0x5C, 0x39, 0x5C, 0xD3, 0x17}}; /* C8ADBD64-E71E-48A0-A4DE-185C395CD317 = __uuidof(IAudioCaptureClient) */
-static const IID MA_IID_IMMNotificationClient                    = {0x7991EEC9, 0x7E89, 0x4D85, {0x83, 0x90, 0x6C, 0x70, 0x3C, 0xEC, 0x60, 0xC0}}; /* 7991EEC9-7E89-4D85-8390-6C703CEC60C0 = __uuidof(IMMNotificationClient) */
-#if !defined(MA_WIN32_DESKTOP) && !defined(MA_WIN32_GDK)
-static const IID MA_IID_DEVINTERFACE_AUDIO_RENDER                = {0xE6327CAD, 0xDCEC, 0x4949, {0xAE, 0x8A, 0x99, 0x1E, 0x97, 0x6A, 0x79, 0xD2}}; /* E6327CAD-DCEC-4949-AE8A-991E976A79D2 */
-static const IID MA_IID_DEVINTERFACE_AUDIO_CAPTURE               = {0x2EEF81BE, 0x33FA, 0x4800, {0x96, 0x70, 0x1C, 0xD4, 0x74, 0x97, 0x2C, 0x3F}}; /* 2EEF81BE-33FA-4800-9670-1CD474972C3F */
-static const IID MA_IID_IActivateAudioInterfaceCompletionHandler = {0x41D949AB, 0x9862, 0x444A, {0x80, 0xF6, 0xC2, 0x61, 0x33, 0x4D, 0xA5, 0xEB}}; /* 41D949AB-9862-444A-80F6-C261334DA5EB */
-#endif
-
-static const IID MA_CLSID_MMDeviceEnumerator                     = {0xBCDE0395, 0xE52F, 0x467C, {0x8E, 0x3D, 0xC4, 0x57, 0x92, 0x91, 0x69, 0x2E}}; /* BCDE0395-E52F-467C-8E3D-C4579291692E = __uuidof(MMDeviceEnumerator) */
-static const IID MA_IID_IMMDeviceEnumerator                      = {0xA95664D2, 0x9614, 0x4F35, {0xA7, 0x46, 0xDE, 0x8D, 0xB6, 0x36, 0x17, 0xE6}}; /* A95664D2-9614-4F35-A746-DE8DB63617E6 = __uuidof(IMMDeviceEnumerator) */
-
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-#define MA_MM_DEVICE_STATE_ACTIVE                          1
-#define MA_MM_DEVICE_STATE_DISABLED                        2
-#define MA_MM_DEVICE_STATE_NOTPRESENT                      4
-#define MA_MM_DEVICE_STATE_UNPLUGGED                       8
-
-typedef struct ma_IMMDeviceEnumerator                      ma_IMMDeviceEnumerator;
-typedef struct ma_IMMDeviceCollection                      ma_IMMDeviceCollection;
-typedef struct ma_IMMDevice                                ma_IMMDevice;
-#else
-typedef struct ma_IActivateAudioInterfaceCompletionHandler ma_IActivateAudioInterfaceCompletionHandler;
-typedef struct ma_IActivateAudioInterfaceAsyncOperation    ma_IActivateAudioInterfaceAsyncOperation;
-#endif
-typedef struct ma_IPropertyStore                           ma_IPropertyStore;
-typedef struct ma_IAudioClient                             ma_IAudioClient;
-typedef struct ma_IAudioClient2                            ma_IAudioClient2;
-typedef struct ma_IAudioClient3                            ma_IAudioClient3;
-typedef struct ma_IAudioRenderClient                       ma_IAudioRenderClient;
-typedef struct ma_IAudioCaptureClient                      ma_IAudioCaptureClient;
-
-typedef ma_int64                                           MA_REFERENCE_TIME;
-
-#define MA_AUDCLNT_STREAMFLAGS_CROSSPROCESS                0x00010000
-#define MA_AUDCLNT_STREAMFLAGS_LOOPBACK                    0x00020000
-#define MA_AUDCLNT_STREAMFLAGS_EVENTCALLBACK               0x00040000
-#define MA_AUDCLNT_STREAMFLAGS_NOPERSIST                   0x00080000
-#define MA_AUDCLNT_STREAMFLAGS_RATEADJUST                  0x00100000
-#define MA_AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY         0x08000000
-#define MA_AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM              0x80000000
-#define MA_AUDCLNT_SESSIONFLAGS_EXPIREWHENUNOWNED          0x10000000
-#define MA_AUDCLNT_SESSIONFLAGS_DISPLAY_HIDE               0x20000000
-#define MA_AUDCLNT_SESSIONFLAGS_DISPLAY_HIDEWHENEXPIRED    0x40000000
-
-/* Buffer flags. */
-#define MA_AUDCLNT_BUFFERFLAGS_DATA_DISCONTINUITY          1
-#define MA_AUDCLNT_BUFFERFLAGS_SILENT                      2
-#define MA_AUDCLNT_BUFFERFLAGS_TIMESTAMP_ERROR             4
-
-typedef enum
-{
-    ma_eRender  = 0,
-    ma_eCapture = 1,
-    ma_eAll     = 2
-} ma_EDataFlow;
-
-typedef enum
-{
-    ma_eConsole        = 0,
-    ma_eMultimedia     = 1,
-    ma_eCommunications = 2
-} ma_ERole;
-
-typedef enum
-{
-    MA_AUDCLNT_SHAREMODE_SHARED,
-    MA_AUDCLNT_SHAREMODE_EXCLUSIVE
-} MA_AUDCLNT_SHAREMODE;
-
-typedef enum
-{
-    MA_AudioCategory_Other = 0  /* <-- miniaudio is only caring about Other. */
-} MA_AUDIO_STREAM_CATEGORY;
-
-typedef struct
-{
-    ma_uint32 cbSize;
-    BOOL bIsOffload;
-    MA_AUDIO_STREAM_CATEGORY eCategory;
-} ma_AudioClientProperties;
-
-/* IUnknown */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IUnknown* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IUnknown* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IUnknown* pThis);
-} ma_IUnknownVtbl;
-struct ma_IUnknown
-{
-    ma_IUnknownVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IUnknown_QueryInterface(ma_IUnknown* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IUnknown_AddRef(ma_IUnknown* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IUnknown_Release(ma_IUnknown* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    /* IMMNotificationClient */
-    typedef struct
-    {
-        /* IUnknown */
-        HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IMMNotificationClient* pThis, const IID* const riid, void** ppObject);
-        ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IMMNotificationClient* pThis);
-        ULONG   (STDMETHODCALLTYPE * Release)       (ma_IMMNotificationClient* pThis);
-
-        /* IMMNotificationClient */
-        HRESULT (STDMETHODCALLTYPE * OnDeviceStateChanged)  (ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID, DWORD dwNewState);
-        HRESULT (STDMETHODCALLTYPE * OnDeviceAdded)         (ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID);
-        HRESULT (STDMETHODCALLTYPE * OnDeviceRemoved)       (ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID);
-        HRESULT (STDMETHODCALLTYPE * OnDefaultDeviceChanged)(ma_IMMNotificationClient* pThis, ma_EDataFlow dataFlow, ma_ERole role, const WCHAR* pDefaultDeviceID);
-        HRESULT (STDMETHODCALLTYPE * OnPropertyValueChanged)(ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID, const PROPERTYKEY key);
-    } ma_IMMNotificationClientVtbl;
-
-    /* IMMDeviceEnumerator */
-    typedef struct
-    {
-        /* IUnknown */
-        HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IMMDeviceEnumerator* pThis, const IID* const riid, void** ppObject);
-        ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IMMDeviceEnumerator* pThis);
-        ULONG   (STDMETHODCALLTYPE * Release)       (ma_IMMDeviceEnumerator* pThis);
-
-        /* IMMDeviceEnumerator */
-        HRESULT (STDMETHODCALLTYPE * EnumAudioEndpoints)                    (ma_IMMDeviceEnumerator* pThis, ma_EDataFlow dataFlow, DWORD dwStateMask, ma_IMMDeviceCollection** ppDevices);
-        HRESULT (STDMETHODCALLTYPE * GetDefaultAudioEndpoint)               (ma_IMMDeviceEnumerator* pThis, ma_EDataFlow dataFlow, ma_ERole role, ma_IMMDevice** ppEndpoint);
-        HRESULT (STDMETHODCALLTYPE * GetDevice)                             (ma_IMMDeviceEnumerator* pThis, const WCHAR* pID, ma_IMMDevice** ppDevice);
-        HRESULT (STDMETHODCALLTYPE * RegisterEndpointNotificationCallback)  (ma_IMMDeviceEnumerator* pThis, ma_IMMNotificationClient* pClient);
-        HRESULT (STDMETHODCALLTYPE * UnregisterEndpointNotificationCallback)(ma_IMMDeviceEnumerator* pThis, ma_IMMNotificationClient* pClient);
-    } ma_IMMDeviceEnumeratorVtbl;
-    struct ma_IMMDeviceEnumerator
-    {
-        ma_IMMDeviceEnumeratorVtbl* lpVtbl;
-    };
-    static MA_INLINE HRESULT ma_IMMDeviceEnumerator_QueryInterface(ma_IMMDeviceEnumerator* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-    static MA_INLINE ULONG   ma_IMMDeviceEnumerator_AddRef(ma_IMMDeviceEnumerator* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-    static MA_INLINE ULONG   ma_IMMDeviceEnumerator_Release(ma_IMMDeviceEnumerator* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-    static MA_INLINE HRESULT ma_IMMDeviceEnumerator_EnumAudioEndpoints(ma_IMMDeviceEnumerator* pThis, ma_EDataFlow dataFlow, DWORD dwStateMask, ma_IMMDeviceCollection** ppDevices) { return pThis->lpVtbl->EnumAudioEndpoints(pThis, dataFlow, dwStateMask, ppDevices); }
-    static MA_INLINE HRESULT ma_IMMDeviceEnumerator_GetDefaultAudioEndpoint(ma_IMMDeviceEnumerator* pThis, ma_EDataFlow dataFlow, ma_ERole role, ma_IMMDevice** ppEndpoint) { return pThis->lpVtbl->GetDefaultAudioEndpoint(pThis, dataFlow, role, ppEndpoint); }
-    static MA_INLINE HRESULT ma_IMMDeviceEnumerator_GetDevice(ma_IMMDeviceEnumerator* pThis, const WCHAR* pID, ma_IMMDevice** ppDevice) { return pThis->lpVtbl->GetDevice(pThis, pID, ppDevice); }
-    static MA_INLINE HRESULT ma_IMMDeviceEnumerator_RegisterEndpointNotificationCallback(ma_IMMDeviceEnumerator* pThis, ma_IMMNotificationClient* pClient) { return pThis->lpVtbl->RegisterEndpointNotificationCallback(pThis, pClient); }
-    static MA_INLINE HRESULT ma_IMMDeviceEnumerator_UnregisterEndpointNotificationCallback(ma_IMMDeviceEnumerator* pThis, ma_IMMNotificationClient* pClient) { return pThis->lpVtbl->UnregisterEndpointNotificationCallback(pThis, pClient); }
-
-
-    /* IMMDeviceCollection */
-    typedef struct
-    {
-        /* IUnknown */
-        HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IMMDeviceCollection* pThis, const IID* const riid, void** ppObject);
-        ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IMMDeviceCollection* pThis);
-        ULONG   (STDMETHODCALLTYPE * Release)       (ma_IMMDeviceCollection* pThis);
-
-        /* IMMDeviceCollection */
-        HRESULT (STDMETHODCALLTYPE * GetCount)(ma_IMMDeviceCollection* pThis, UINT* pDevices);
-        HRESULT (STDMETHODCALLTYPE * Item)    (ma_IMMDeviceCollection* pThis, UINT nDevice, ma_IMMDevice** ppDevice);
-    } ma_IMMDeviceCollectionVtbl;
-    struct ma_IMMDeviceCollection
-    {
-        ma_IMMDeviceCollectionVtbl* lpVtbl;
-    };
-    static MA_INLINE HRESULT ma_IMMDeviceCollection_QueryInterface(ma_IMMDeviceCollection* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-    static MA_INLINE ULONG   ma_IMMDeviceCollection_AddRef(ma_IMMDeviceCollection* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-    static MA_INLINE ULONG   ma_IMMDeviceCollection_Release(ma_IMMDeviceCollection* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-    static MA_INLINE HRESULT ma_IMMDeviceCollection_GetCount(ma_IMMDeviceCollection* pThis, UINT* pDevices)                               { return pThis->lpVtbl->GetCount(pThis, pDevices); }
-    static MA_INLINE HRESULT ma_IMMDeviceCollection_Item(ma_IMMDeviceCollection* pThis, UINT nDevice, ma_IMMDevice** ppDevice)            { return pThis->lpVtbl->Item(pThis, nDevice, ppDevice); }
-
-
-    /* IMMDevice */
-    typedef struct
-    {
-        /* IUnknown */
-        HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IMMDevice* pThis, const IID* const riid, void** ppObject);
-        ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IMMDevice* pThis);
-        ULONG   (STDMETHODCALLTYPE * Release)       (ma_IMMDevice* pThis);
-
-        /* IMMDevice */
-        HRESULT (STDMETHODCALLTYPE * Activate)         (ma_IMMDevice* pThis, const IID* const iid, DWORD dwClsCtx, MA_PROPVARIANT* pActivationParams, void** ppInterface);
-        HRESULT (STDMETHODCALLTYPE * OpenPropertyStore)(ma_IMMDevice* pThis, DWORD stgmAccess, ma_IPropertyStore** ppProperties);
-        HRESULT (STDMETHODCALLTYPE * GetId)            (ma_IMMDevice* pThis, WCHAR** pID);
-        HRESULT (STDMETHODCALLTYPE * GetState)         (ma_IMMDevice* pThis, DWORD *pState);
-    } ma_IMMDeviceVtbl;
-    struct ma_IMMDevice
-    {
-        ma_IMMDeviceVtbl* lpVtbl;
-    };
-    static MA_INLINE HRESULT ma_IMMDevice_QueryInterface(ma_IMMDevice* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-    static MA_INLINE ULONG   ma_IMMDevice_AddRef(ma_IMMDevice* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-    static MA_INLINE ULONG   ma_IMMDevice_Release(ma_IMMDevice* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-    static MA_INLINE HRESULT ma_IMMDevice_Activate(ma_IMMDevice* pThis, const IID* const iid, DWORD dwClsCtx, MA_PROPVARIANT* pActivationParams, void** ppInterface) { return pThis->lpVtbl->Activate(pThis, iid, dwClsCtx, pActivationParams, ppInterface); }
-    static MA_INLINE HRESULT ma_IMMDevice_OpenPropertyStore(ma_IMMDevice* pThis, DWORD stgmAccess, ma_IPropertyStore** ppProperties) { return pThis->lpVtbl->OpenPropertyStore(pThis, stgmAccess, ppProperties); }
-    static MA_INLINE HRESULT ma_IMMDevice_GetId(ma_IMMDevice* pThis, WCHAR** pID)                                     { return pThis->lpVtbl->GetId(pThis, pID); }
-    static MA_INLINE HRESULT ma_IMMDevice_GetState(ma_IMMDevice* pThis, DWORD *pState)                                { return pThis->lpVtbl->GetState(pThis, pState); }
-#else
-    /* IActivateAudioInterfaceAsyncOperation */
-    typedef struct
-    {
-        /* IUnknown */
-        HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IActivateAudioInterfaceAsyncOperation* pThis, const IID* const riid, void** ppObject);
-        ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IActivateAudioInterfaceAsyncOperation* pThis);
-        ULONG   (STDMETHODCALLTYPE * Release)       (ma_IActivateAudioInterfaceAsyncOperation* pThis);
-
-        /* IActivateAudioInterfaceAsyncOperation */
-        HRESULT (STDMETHODCALLTYPE * GetActivateResult)(ma_IActivateAudioInterfaceAsyncOperation* pThis, HRESULT *pActivateResult, ma_IUnknown** ppActivatedInterface);
-    } ma_IActivateAudioInterfaceAsyncOperationVtbl;
-    struct ma_IActivateAudioInterfaceAsyncOperation
-    {
-        ma_IActivateAudioInterfaceAsyncOperationVtbl* lpVtbl;
-    };
-    static MA_INLINE HRESULT ma_IActivateAudioInterfaceAsyncOperation_QueryInterface(ma_IActivateAudioInterfaceAsyncOperation* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-    static MA_INLINE ULONG   ma_IActivateAudioInterfaceAsyncOperation_AddRef(ma_IActivateAudioInterfaceAsyncOperation* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-    static MA_INLINE ULONG   ma_IActivateAudioInterfaceAsyncOperation_Release(ma_IActivateAudioInterfaceAsyncOperation* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-    static MA_INLINE HRESULT ma_IActivateAudioInterfaceAsyncOperation_GetActivateResult(ma_IActivateAudioInterfaceAsyncOperation* pThis, HRESULT *pActivateResult, ma_IUnknown** ppActivatedInterface) { return pThis->lpVtbl->GetActivateResult(pThis, pActivateResult, ppActivatedInterface); }
-#endif
-
-/* IPropertyStore */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IPropertyStore* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IPropertyStore* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IPropertyStore* pThis);
-
-    /* IPropertyStore */
-    HRESULT (STDMETHODCALLTYPE * GetCount)(ma_IPropertyStore* pThis, DWORD* pPropCount);
-    HRESULT (STDMETHODCALLTYPE * GetAt)   (ma_IPropertyStore* pThis, DWORD propIndex, PROPERTYKEY* pPropKey);
-    HRESULT (STDMETHODCALLTYPE * GetValue)(ma_IPropertyStore* pThis, const PROPERTYKEY* const pKey, MA_PROPVARIANT* pPropVar);
-    HRESULT (STDMETHODCALLTYPE * SetValue)(ma_IPropertyStore* pThis, const PROPERTYKEY* const pKey, const MA_PROPVARIANT* const pPropVar);
-    HRESULT (STDMETHODCALLTYPE * Commit)  (ma_IPropertyStore* pThis);
-} ma_IPropertyStoreVtbl;
-struct ma_IPropertyStore
-{
-    ma_IPropertyStoreVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IPropertyStore_QueryInterface(ma_IPropertyStore* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IPropertyStore_AddRef(ma_IPropertyStore* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IPropertyStore_Release(ma_IPropertyStore* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IPropertyStore_GetCount(ma_IPropertyStore* pThis, DWORD* pPropCount)                            { return pThis->lpVtbl->GetCount(pThis, pPropCount); }
-static MA_INLINE HRESULT ma_IPropertyStore_GetAt(ma_IPropertyStore* pThis, DWORD propIndex, PROPERTYKEY* pPropKey)          { return pThis->lpVtbl->GetAt(pThis, propIndex, pPropKey); }
-static MA_INLINE HRESULT ma_IPropertyStore_GetValue(ma_IPropertyStore* pThis, const PROPERTYKEY* const pKey, MA_PROPVARIANT* pPropVar) { return pThis->lpVtbl->GetValue(pThis, pKey, pPropVar); }
-static MA_INLINE HRESULT ma_IPropertyStore_SetValue(ma_IPropertyStore* pThis, const PROPERTYKEY* const pKey, const MA_PROPVARIANT* const pPropVar) { return pThis->lpVtbl->SetValue(pThis, pKey, pPropVar); }
-static MA_INLINE HRESULT ma_IPropertyStore_Commit(ma_IPropertyStore* pThis)                                                 { return pThis->lpVtbl->Commit(pThis); }
-
-
-/* IAudioClient */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IAudioClient* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IAudioClient* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IAudioClient* pThis);
-
-    /* IAudioClient */
-    HRESULT (STDMETHODCALLTYPE * Initialize)       (ma_IAudioClient* pThis, MA_AUDCLNT_SHAREMODE shareMode, DWORD streamFlags, MA_REFERENCE_TIME bufferDuration, MA_REFERENCE_TIME periodicity, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGuid);
-    HRESULT (STDMETHODCALLTYPE * GetBufferSize)    (ma_IAudioClient* pThis, ma_uint32* pNumBufferFrames);
-    HRESULT (STDMETHODCALLTYPE * GetStreamLatency) (ma_IAudioClient* pThis, MA_REFERENCE_TIME* pLatency);
-    HRESULT (STDMETHODCALLTYPE * GetCurrentPadding)(ma_IAudioClient* pThis, ma_uint32* pNumPaddingFrames);
-    HRESULT (STDMETHODCALLTYPE * IsFormatSupported)(ma_IAudioClient* pThis, MA_AUDCLNT_SHAREMODE shareMode, const MA_WAVEFORMATEX* pFormat, MA_WAVEFORMATEX** ppClosestMatch);
-    HRESULT (STDMETHODCALLTYPE * GetMixFormat)     (ma_IAudioClient* pThis, MA_WAVEFORMATEX** ppDeviceFormat);
-    HRESULT (STDMETHODCALLTYPE * GetDevicePeriod)  (ma_IAudioClient* pThis, MA_REFERENCE_TIME* pDefaultDevicePeriod, MA_REFERENCE_TIME* pMinimumDevicePeriod);
-    HRESULT (STDMETHODCALLTYPE * Start)            (ma_IAudioClient* pThis);
-    HRESULT (STDMETHODCALLTYPE * Stop)             (ma_IAudioClient* pThis);
-    HRESULT (STDMETHODCALLTYPE * Reset)            (ma_IAudioClient* pThis);
-    HRESULT (STDMETHODCALLTYPE * SetEventHandle)   (ma_IAudioClient* pThis, HANDLE eventHandle);
-    HRESULT (STDMETHODCALLTYPE * GetService)       (ma_IAudioClient* pThis, const IID* const riid, void** pp);
-} ma_IAudioClientVtbl;
-struct ma_IAudioClient
-{
-    ma_IAudioClientVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IAudioClient_QueryInterface(ma_IAudioClient* pThis, const IID* const riid, void** ppObject)    { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IAudioClient_AddRef(ma_IAudioClient* pThis)                                                    { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IAudioClient_Release(ma_IAudioClient* pThis)                                                   { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient_Initialize(ma_IAudioClient* pThis, MA_AUDCLNT_SHAREMODE shareMode, DWORD streamFlags, MA_REFERENCE_TIME bufferDuration, MA_REFERENCE_TIME periodicity, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGuid) { return pThis->lpVtbl->Initialize(pThis, shareMode, streamFlags, bufferDuration, periodicity, pFormat, pAudioSessionGuid); }
-static MA_INLINE HRESULT ma_IAudioClient_GetBufferSize(ma_IAudioClient* pThis, ma_uint32* pNumBufferFrames)                { return pThis->lpVtbl->GetBufferSize(pThis, pNumBufferFrames); }
-static MA_INLINE HRESULT ma_IAudioClient_GetStreamLatency(ma_IAudioClient* pThis, MA_REFERENCE_TIME* pLatency)             { return pThis->lpVtbl->GetStreamLatency(pThis, pLatency); }
-static MA_INLINE HRESULT ma_IAudioClient_GetCurrentPadding(ma_IAudioClient* pThis, ma_uint32* pNumPaddingFrames)           { return pThis->lpVtbl->GetCurrentPadding(pThis, pNumPaddingFrames); }
-static MA_INLINE HRESULT ma_IAudioClient_IsFormatSupported(ma_IAudioClient* pThis, MA_AUDCLNT_SHAREMODE shareMode, const MA_WAVEFORMATEX* pFormat, MA_WAVEFORMATEX** ppClosestMatch) { return pThis->lpVtbl->IsFormatSupported(pThis, shareMode, pFormat, ppClosestMatch); }
-static MA_INLINE HRESULT ma_IAudioClient_GetMixFormat(ma_IAudioClient* pThis, MA_WAVEFORMATEX** ppDeviceFormat)            { return pThis->lpVtbl->GetMixFormat(pThis, ppDeviceFormat); }
-static MA_INLINE HRESULT ma_IAudioClient_GetDevicePeriod(ma_IAudioClient* pThis, MA_REFERENCE_TIME* pDefaultDevicePeriod, MA_REFERENCE_TIME* pMinimumDevicePeriod) { return pThis->lpVtbl->GetDevicePeriod(pThis, pDefaultDevicePeriod, pMinimumDevicePeriod); }
-static MA_INLINE HRESULT ma_IAudioClient_Start(ma_IAudioClient* pThis)                                                     { return pThis->lpVtbl->Start(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient_Stop(ma_IAudioClient* pThis)                                                      { return pThis->lpVtbl->Stop(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient_Reset(ma_IAudioClient* pThis)                                                     { return pThis->lpVtbl->Reset(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient_SetEventHandle(ma_IAudioClient* pThis, HANDLE eventHandle)                        { return pThis->lpVtbl->SetEventHandle(pThis, eventHandle); }
-static MA_INLINE HRESULT ma_IAudioClient_GetService(ma_IAudioClient* pThis, const IID* const riid, void** pp)              { return pThis->lpVtbl->GetService(pThis, riid, pp); }
-
-/* IAudioClient2 */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IAudioClient2* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IAudioClient2* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IAudioClient2* pThis);
-
-    /* IAudioClient */
-    HRESULT (STDMETHODCALLTYPE * Initialize)       (ma_IAudioClient2* pThis, MA_AUDCLNT_SHAREMODE shareMode, DWORD streamFlags, MA_REFERENCE_TIME bufferDuration, MA_REFERENCE_TIME periodicity, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGuid);
-    HRESULT (STDMETHODCALLTYPE * GetBufferSize)    (ma_IAudioClient2* pThis, ma_uint32* pNumBufferFrames);
-    HRESULT (STDMETHODCALLTYPE * GetStreamLatency) (ma_IAudioClient2* pThis, MA_REFERENCE_TIME* pLatency);
-    HRESULT (STDMETHODCALLTYPE * GetCurrentPadding)(ma_IAudioClient2* pThis, ma_uint32* pNumPaddingFrames);
-    HRESULT (STDMETHODCALLTYPE * IsFormatSupported)(ma_IAudioClient2* pThis, MA_AUDCLNT_SHAREMODE shareMode, const MA_WAVEFORMATEX* pFormat, MA_WAVEFORMATEX** ppClosestMatch);
-    HRESULT (STDMETHODCALLTYPE * GetMixFormat)     (ma_IAudioClient2* pThis, MA_WAVEFORMATEX** ppDeviceFormat);
-    HRESULT (STDMETHODCALLTYPE * GetDevicePeriod)  (ma_IAudioClient2* pThis, MA_REFERENCE_TIME* pDefaultDevicePeriod, MA_REFERENCE_TIME* pMinimumDevicePeriod);
-    HRESULT (STDMETHODCALLTYPE * Start)            (ma_IAudioClient2* pThis);
-    HRESULT (STDMETHODCALLTYPE * Stop)             (ma_IAudioClient2* pThis);
-    HRESULT (STDMETHODCALLTYPE * Reset)            (ma_IAudioClient2* pThis);
-    HRESULT (STDMETHODCALLTYPE * SetEventHandle)   (ma_IAudioClient2* pThis, HANDLE eventHandle);
-    HRESULT (STDMETHODCALLTYPE * GetService)       (ma_IAudioClient2* pThis, const IID* const riid, void** pp);
-
-    /* IAudioClient2 */
-    HRESULT (STDMETHODCALLTYPE * IsOffloadCapable)   (ma_IAudioClient2* pThis, MA_AUDIO_STREAM_CATEGORY category, BOOL* pOffloadCapable);
-    HRESULT (STDMETHODCALLTYPE * SetClientProperties)(ma_IAudioClient2* pThis, const ma_AudioClientProperties* pProperties);
-    HRESULT (STDMETHODCALLTYPE * GetBufferSizeLimits)(ma_IAudioClient2* pThis, const MA_WAVEFORMATEX* pFormat, BOOL eventDriven, MA_REFERENCE_TIME* pMinBufferDuration, MA_REFERENCE_TIME* pMaxBufferDuration);
-} ma_IAudioClient2Vtbl;
-struct ma_IAudioClient2
-{
-    ma_IAudioClient2Vtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IAudioClient2_QueryInterface(ma_IAudioClient2* pThis, const IID* const riid, void** ppObject)    { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IAudioClient2_AddRef(ma_IAudioClient2* pThis)                                                    { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IAudioClient2_Release(ma_IAudioClient2* pThis)                                                   { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient2_Initialize(ma_IAudioClient2* pThis, MA_AUDCLNT_SHAREMODE shareMode, DWORD streamFlags, MA_REFERENCE_TIME bufferDuration, MA_REFERENCE_TIME periodicity, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGuid) { return pThis->lpVtbl->Initialize(pThis, shareMode, streamFlags, bufferDuration, periodicity, pFormat, pAudioSessionGuid); }
-static MA_INLINE HRESULT ma_IAudioClient2_GetBufferSize(ma_IAudioClient2* pThis, ma_uint32* pNumBufferFrames)                { return pThis->lpVtbl->GetBufferSize(pThis, pNumBufferFrames); }
-static MA_INLINE HRESULT ma_IAudioClient2_GetStreamLatency(ma_IAudioClient2* pThis, MA_REFERENCE_TIME* pLatency)             { return pThis->lpVtbl->GetStreamLatency(pThis, pLatency); }
-static MA_INLINE HRESULT ma_IAudioClient2_GetCurrentPadding(ma_IAudioClient2* pThis, ma_uint32* pNumPaddingFrames)           { return pThis->lpVtbl->GetCurrentPadding(pThis, pNumPaddingFrames); }
-static MA_INLINE HRESULT ma_IAudioClient2_IsFormatSupported(ma_IAudioClient2* pThis, MA_AUDCLNT_SHAREMODE shareMode, const MA_WAVEFORMATEX* pFormat, MA_WAVEFORMATEX** ppClosestMatch) { return pThis->lpVtbl->IsFormatSupported(pThis, shareMode, pFormat, ppClosestMatch); }
-static MA_INLINE HRESULT ma_IAudioClient2_GetMixFormat(ma_IAudioClient2* pThis, MA_WAVEFORMATEX** ppDeviceFormat)            { return pThis->lpVtbl->GetMixFormat(pThis, ppDeviceFormat); }
-static MA_INLINE HRESULT ma_IAudioClient2_GetDevicePeriod(ma_IAudioClient2* pThis, MA_REFERENCE_TIME* pDefaultDevicePeriod, MA_REFERENCE_TIME* pMinimumDevicePeriod) { return pThis->lpVtbl->GetDevicePeriod(pThis, pDefaultDevicePeriod, pMinimumDevicePeriod); }
-static MA_INLINE HRESULT ma_IAudioClient2_Start(ma_IAudioClient2* pThis)                                                     { return pThis->lpVtbl->Start(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient2_Stop(ma_IAudioClient2* pThis)                                                      { return pThis->lpVtbl->Stop(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient2_Reset(ma_IAudioClient2* pThis)                                                     { return pThis->lpVtbl->Reset(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient2_SetEventHandle(ma_IAudioClient2* pThis, HANDLE eventHandle)                        { return pThis->lpVtbl->SetEventHandle(pThis, eventHandle); }
-static MA_INLINE HRESULT ma_IAudioClient2_GetService(ma_IAudioClient2* pThis, const IID* const riid, void** pp)              { return pThis->lpVtbl->GetService(pThis, riid, pp); }
-static MA_INLINE HRESULT ma_IAudioClient2_IsOffloadCapable(ma_IAudioClient2* pThis, MA_AUDIO_STREAM_CATEGORY category, BOOL* pOffloadCapable) { return pThis->lpVtbl->IsOffloadCapable(pThis, category, pOffloadCapable); }
-static MA_INLINE HRESULT ma_IAudioClient2_SetClientProperties(ma_IAudioClient2* pThis, const ma_AudioClientProperties* pProperties)           { return pThis->lpVtbl->SetClientProperties(pThis, pProperties); }
-static MA_INLINE HRESULT ma_IAudioClient2_GetBufferSizeLimits(ma_IAudioClient2* pThis, const MA_WAVEFORMATEX* pFormat, BOOL eventDriven, MA_REFERENCE_TIME* pMinBufferDuration, MA_REFERENCE_TIME* pMaxBufferDuration) { return pThis->lpVtbl->GetBufferSizeLimits(pThis, pFormat, eventDriven, pMinBufferDuration, pMaxBufferDuration); }
-
-
-/* IAudioClient3 */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IAudioClient3* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IAudioClient3* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IAudioClient3* pThis);
-
-    /* IAudioClient */
-    HRESULT (STDMETHODCALLTYPE * Initialize)       (ma_IAudioClient3* pThis, MA_AUDCLNT_SHAREMODE shareMode, DWORD streamFlags, MA_REFERENCE_TIME bufferDuration, MA_REFERENCE_TIME periodicity, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGuid);
-    HRESULT (STDMETHODCALLTYPE * GetBufferSize)    (ma_IAudioClient3* pThis, ma_uint32* pNumBufferFrames);
-    HRESULT (STDMETHODCALLTYPE * GetStreamLatency) (ma_IAudioClient3* pThis, MA_REFERENCE_TIME* pLatency);
-    HRESULT (STDMETHODCALLTYPE * GetCurrentPadding)(ma_IAudioClient3* pThis, ma_uint32* pNumPaddingFrames);
-    HRESULT (STDMETHODCALLTYPE * IsFormatSupported)(ma_IAudioClient3* pThis, MA_AUDCLNT_SHAREMODE shareMode, const MA_WAVEFORMATEX* pFormat, MA_WAVEFORMATEX** ppClosestMatch);
-    HRESULT (STDMETHODCALLTYPE * GetMixFormat)     (ma_IAudioClient3* pThis, MA_WAVEFORMATEX** ppDeviceFormat);
-    HRESULT (STDMETHODCALLTYPE * GetDevicePeriod)  (ma_IAudioClient3* pThis, MA_REFERENCE_TIME* pDefaultDevicePeriod, MA_REFERENCE_TIME* pMinimumDevicePeriod);
-    HRESULT (STDMETHODCALLTYPE * Start)            (ma_IAudioClient3* pThis);
-    HRESULT (STDMETHODCALLTYPE * Stop)             (ma_IAudioClient3* pThis);
-    HRESULT (STDMETHODCALLTYPE * Reset)            (ma_IAudioClient3* pThis);
-    HRESULT (STDMETHODCALLTYPE * SetEventHandle)   (ma_IAudioClient3* pThis, HANDLE eventHandle);
-    HRESULT (STDMETHODCALLTYPE * GetService)       (ma_IAudioClient3* pThis, const IID* const riid, void** pp);
-
-    /* IAudioClient2 */
-    HRESULT (STDMETHODCALLTYPE * IsOffloadCapable)   (ma_IAudioClient3* pThis, MA_AUDIO_STREAM_CATEGORY category, BOOL* pOffloadCapable);
-    HRESULT (STDMETHODCALLTYPE * SetClientProperties)(ma_IAudioClient3* pThis, const ma_AudioClientProperties* pProperties);
-    HRESULT (STDMETHODCALLTYPE * GetBufferSizeLimits)(ma_IAudioClient3* pThis, const MA_WAVEFORMATEX* pFormat, BOOL eventDriven, MA_REFERENCE_TIME* pMinBufferDuration, MA_REFERENCE_TIME* pMaxBufferDuration);
-
-    /* IAudioClient3 */
-    HRESULT (STDMETHODCALLTYPE * GetSharedModeEnginePeriod)       (ma_IAudioClient3* pThis, const MA_WAVEFORMATEX* pFormat, ma_uint32* pDefaultPeriodInFrames, ma_uint32* pFundamentalPeriodInFrames, ma_uint32* pMinPeriodInFrames, ma_uint32* pMaxPeriodInFrames);
-    HRESULT (STDMETHODCALLTYPE * GetCurrentSharedModeEnginePeriod)(ma_IAudioClient3* pThis, MA_WAVEFORMATEX** ppFormat, ma_uint32* pCurrentPeriodInFrames);
-    HRESULT (STDMETHODCALLTYPE * InitializeSharedAudioStream)     (ma_IAudioClient3* pThis, DWORD streamFlags, ma_uint32 periodInFrames, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGuid);
-} ma_IAudioClient3Vtbl;
-struct ma_IAudioClient3
-{
-    ma_IAudioClient3Vtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IAudioClient3_QueryInterface(ma_IAudioClient3* pThis, const IID* const riid, void** ppObject)    { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IAudioClient3_AddRef(ma_IAudioClient3* pThis)                                                    { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IAudioClient3_Release(ma_IAudioClient3* pThis)                                                   { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient3_Initialize(ma_IAudioClient3* pThis, MA_AUDCLNT_SHAREMODE shareMode, DWORD streamFlags, MA_REFERENCE_TIME bufferDuration, MA_REFERENCE_TIME periodicity, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGuid) { return pThis->lpVtbl->Initialize(pThis, shareMode, streamFlags, bufferDuration, periodicity, pFormat, pAudioSessionGuid); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetBufferSize(ma_IAudioClient3* pThis, ma_uint32* pNumBufferFrames)                { return pThis->lpVtbl->GetBufferSize(pThis, pNumBufferFrames); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetStreamLatency(ma_IAudioClient3* pThis, MA_REFERENCE_TIME* pLatency)             { return pThis->lpVtbl->GetStreamLatency(pThis, pLatency); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetCurrentPadding(ma_IAudioClient3* pThis, ma_uint32* pNumPaddingFrames)           { return pThis->lpVtbl->GetCurrentPadding(pThis, pNumPaddingFrames); }
-static MA_INLINE HRESULT ma_IAudioClient3_IsFormatSupported(ma_IAudioClient3* pThis, MA_AUDCLNT_SHAREMODE shareMode, const MA_WAVEFORMATEX* pFormat, MA_WAVEFORMATEX** ppClosestMatch) { return pThis->lpVtbl->IsFormatSupported(pThis, shareMode, pFormat, ppClosestMatch); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetMixFormat(ma_IAudioClient3* pThis, MA_WAVEFORMATEX** ppDeviceFormat)               { return pThis->lpVtbl->GetMixFormat(pThis, ppDeviceFormat); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetDevicePeriod(ma_IAudioClient3* pThis, MA_REFERENCE_TIME* pDefaultDevicePeriod, MA_REFERENCE_TIME* pMinimumDevicePeriod) { return pThis->lpVtbl->GetDevicePeriod(pThis, pDefaultDevicePeriod, pMinimumDevicePeriod); }
-static MA_INLINE HRESULT ma_IAudioClient3_Start(ma_IAudioClient3* pThis)                                                     { return pThis->lpVtbl->Start(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient3_Stop(ma_IAudioClient3* pThis)                                                      { return pThis->lpVtbl->Stop(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient3_Reset(ma_IAudioClient3* pThis)                                                     { return pThis->lpVtbl->Reset(pThis); }
-static MA_INLINE HRESULT ma_IAudioClient3_SetEventHandle(ma_IAudioClient3* pThis, HANDLE eventHandle)                        { return pThis->lpVtbl->SetEventHandle(pThis, eventHandle); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetService(ma_IAudioClient3* pThis, const IID* const riid, void** pp)              { return pThis->lpVtbl->GetService(pThis, riid, pp); }
-static MA_INLINE HRESULT ma_IAudioClient3_IsOffloadCapable(ma_IAudioClient3* pThis, MA_AUDIO_STREAM_CATEGORY category, BOOL* pOffloadCapable) { return pThis->lpVtbl->IsOffloadCapable(pThis, category, pOffloadCapable); }
-static MA_INLINE HRESULT ma_IAudioClient3_SetClientProperties(ma_IAudioClient3* pThis, const ma_AudioClientProperties* pProperties)           { return pThis->lpVtbl->SetClientProperties(pThis, pProperties); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetBufferSizeLimits(ma_IAudioClient3* pThis, const MA_WAVEFORMATEX* pFormat, BOOL eventDriven, MA_REFERENCE_TIME* pMinBufferDuration, MA_REFERENCE_TIME* pMaxBufferDuration) { return pThis->lpVtbl->GetBufferSizeLimits(pThis, pFormat, eventDriven, pMinBufferDuration, pMaxBufferDuration); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetSharedModeEnginePeriod(ma_IAudioClient3* pThis, const MA_WAVEFORMATEX* pFormat, ma_uint32* pDefaultPeriodInFrames, ma_uint32* pFundamentalPeriodInFrames, ma_uint32* pMinPeriodInFrames, ma_uint32* pMaxPeriodInFrames) { return pThis->lpVtbl->GetSharedModeEnginePeriod(pThis, pFormat, pDefaultPeriodInFrames, pFundamentalPeriodInFrames, pMinPeriodInFrames, pMaxPeriodInFrames); }
-static MA_INLINE HRESULT ma_IAudioClient3_GetCurrentSharedModeEnginePeriod(ma_IAudioClient3* pThis, MA_WAVEFORMATEX** ppFormat, ma_uint32* pCurrentPeriodInFrames) { return pThis->lpVtbl->GetCurrentSharedModeEnginePeriod(pThis, ppFormat, pCurrentPeriodInFrames); }
-static MA_INLINE HRESULT ma_IAudioClient3_InitializeSharedAudioStream(ma_IAudioClient3* pThis, DWORD streamFlags, ma_uint32 periodInFrames, const MA_WAVEFORMATEX* pFormat, const GUID* pAudioSessionGUID) { return pThis->lpVtbl->InitializeSharedAudioStream(pThis, streamFlags, periodInFrames, pFormat, pAudioSessionGUID); }
-
-
-/* IAudioRenderClient */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IAudioRenderClient* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IAudioRenderClient* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IAudioRenderClient* pThis);
-
-    /* IAudioRenderClient */
-    HRESULT (STDMETHODCALLTYPE * GetBuffer)    (ma_IAudioRenderClient* pThis, ma_uint32 numFramesRequested, BYTE** ppData);
-    HRESULT (STDMETHODCALLTYPE * ReleaseBuffer)(ma_IAudioRenderClient* pThis, ma_uint32 numFramesWritten, DWORD dwFlags);
-} ma_IAudioRenderClientVtbl;
-struct ma_IAudioRenderClient
-{
-    ma_IAudioRenderClientVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IAudioRenderClient_QueryInterface(ma_IAudioRenderClient* pThis, const IID* const riid, void** ppObject)   { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IAudioRenderClient_AddRef(ma_IAudioRenderClient* pThis)                                                   { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IAudioRenderClient_Release(ma_IAudioRenderClient* pThis)                                                  { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IAudioRenderClient_GetBuffer(ma_IAudioRenderClient* pThis, ma_uint32 numFramesRequested, BYTE** ppData)   { return pThis->lpVtbl->GetBuffer(pThis, numFramesRequested, ppData); }
-static MA_INLINE HRESULT ma_IAudioRenderClient_ReleaseBuffer(ma_IAudioRenderClient* pThis, ma_uint32 numFramesWritten, DWORD dwFlags) { return pThis->lpVtbl->ReleaseBuffer(pThis, numFramesWritten, dwFlags); }
-
-
-/* IAudioCaptureClient */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IAudioCaptureClient* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IAudioCaptureClient* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IAudioCaptureClient* pThis);
-
-    /* IAudioRenderClient */
-    HRESULT (STDMETHODCALLTYPE * GetBuffer)        (ma_IAudioCaptureClient* pThis, BYTE** ppData, ma_uint32* pNumFramesToRead, DWORD* pFlags, ma_uint64* pDevicePosition, ma_uint64* pQPCPosition);
-    HRESULT (STDMETHODCALLTYPE * ReleaseBuffer)    (ma_IAudioCaptureClient* pThis, ma_uint32 numFramesRead);
-    HRESULT (STDMETHODCALLTYPE * GetNextPacketSize)(ma_IAudioCaptureClient* pThis, ma_uint32* pNumFramesInNextPacket);
-} ma_IAudioCaptureClientVtbl;
-struct ma_IAudioCaptureClient
-{
-    ma_IAudioCaptureClientVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IAudioCaptureClient_QueryInterface(ma_IAudioCaptureClient* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IAudioCaptureClient_AddRef(ma_IAudioCaptureClient* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IAudioCaptureClient_Release(ma_IAudioCaptureClient* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IAudioCaptureClient_GetBuffer(ma_IAudioCaptureClient* pThis, BYTE** ppData, ma_uint32* pNumFramesToRead, DWORD* pFlags, ma_uint64* pDevicePosition, ma_uint64* pQPCPosition) { return pThis->lpVtbl->GetBuffer(pThis, ppData, pNumFramesToRead, pFlags, pDevicePosition, pQPCPosition); }
-static MA_INLINE HRESULT ma_IAudioCaptureClient_ReleaseBuffer(ma_IAudioCaptureClient* pThis, ma_uint32 numFramesRead)                 { return pThis->lpVtbl->ReleaseBuffer(pThis, numFramesRead); }
-static MA_INLINE HRESULT ma_IAudioCaptureClient_GetNextPacketSize(ma_IAudioCaptureClient* pThis, ma_uint32* pNumFramesInNextPacket)   { return pThis->lpVtbl->GetNextPacketSize(pThis, pNumFramesInNextPacket); }
-
-#if defined(MA_WIN32_UWP)
-/* mmdevapi Functions */
-typedef HRESULT (WINAPI * MA_PFN_ActivateAudioInterfaceAsync)(const wchar_t* deviceInterfacePath, const IID* riid, MA_PROPVARIANT* activationParams, ma_IActivateAudioInterfaceCompletionHandler* completionHandler, ma_IActivateAudioInterfaceAsyncOperation** activationOperation);
-#endif
-
-/* Avrt Functions */
-typedef HANDLE (WINAPI * MA_PFN_AvSetMmThreadCharacteristicsA)(const char* TaskName, DWORD* TaskIndex);
-typedef BOOL   (WINAPI * MA_PFN_AvRevertMmThreadCharacteristics)(HANDLE AvrtHandle);
-
-#if !defined(MA_WIN32_DESKTOP) && !defined(MA_WIN32_GDK)
-typedef struct ma_completion_handler_uwp ma_completion_handler_uwp;
-
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_completion_handler_uwp* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_completion_handler_uwp* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_completion_handler_uwp* pThis);
-
-    /* IActivateAudioInterfaceCompletionHandler */
-    HRESULT (STDMETHODCALLTYPE * ActivateCompleted)(ma_completion_handler_uwp* pThis, ma_IActivateAudioInterfaceAsyncOperation* pActivateOperation);
-} ma_completion_handler_uwp_vtbl;
-struct ma_completion_handler_uwp
-{
-    ma_completion_handler_uwp_vtbl* lpVtbl;
-    MA_ATOMIC(4, ma_uint32) counter;
-    HANDLE hEvent;
-};
-
-static HRESULT STDMETHODCALLTYPE ma_completion_handler_uwp_QueryInterface(ma_completion_handler_uwp* pThis, const IID* const riid, void** ppObject)
-{
-    /*
-    We need to "implement" IAgileObject which is just an indicator that's used internally by WASAPI for some multithreading management. To
-    "implement" this, we just make sure we return pThis when the IAgileObject is requested.
-    */
-    if (!ma_is_guid_equal(riid, &MA_IID_IUnknown) && !ma_is_guid_equal(riid, &MA_IID_IActivateAudioInterfaceCompletionHandler) && !ma_is_guid_equal(riid, &MA_IID_IAgileObject)) {
-        *ppObject = NULL;
-        return E_NOINTERFACE;
-    }
-
-    /* Getting here means the IID is IUnknown or IMMNotificationClient. */
-    *ppObject = (void*)pThis;
-    ((ma_completion_handler_uwp_vtbl*)pThis->lpVtbl)->AddRef(pThis);
-    return S_OK;
-}
-
-static ULONG STDMETHODCALLTYPE ma_completion_handler_uwp_AddRef(ma_completion_handler_uwp* pThis)
-{
-    return (ULONG)ma_atomic_fetch_add_32(&pThis->counter, 1) + 1;
-}
-
-static ULONG STDMETHODCALLTYPE ma_completion_handler_uwp_Release(ma_completion_handler_uwp* pThis)
-{
-    ma_uint32 newRefCount = ma_atomic_fetch_sub_32(&pThis->counter, 1) - 1;
-    if (newRefCount == 0) {
-        return 0;   /* We don't free anything here because we never allocate the object on the heap. */
-    }
-
-    return (ULONG)newRefCount;
-}
-
-static HRESULT STDMETHODCALLTYPE ma_completion_handler_uwp_ActivateCompleted(ma_completion_handler_uwp* pThis, ma_IActivateAudioInterfaceAsyncOperation* pActivateOperation)
-{
-    (void)pActivateOperation;
-    SetEvent(pThis->hEvent);
-    return S_OK;
-}
-
-
-static ma_completion_handler_uwp_vtbl g_maCompletionHandlerVtblInstance = {
-    ma_completion_handler_uwp_QueryInterface,
-    ma_completion_handler_uwp_AddRef,
-    ma_completion_handler_uwp_Release,
-    ma_completion_handler_uwp_ActivateCompleted
-};
-
-static ma_result ma_completion_handler_uwp_init(ma_completion_handler_uwp* pHandler)
-{
-    MA_ASSERT(pHandler != NULL);
-    MA_ZERO_OBJECT(pHandler);
-
-    pHandler->lpVtbl = &g_maCompletionHandlerVtblInstance;
-    pHandler->counter = 1;
-    pHandler->hEvent = CreateEventA(NULL, FALSE, FALSE, NULL);
-    if (pHandler->hEvent == NULL) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_completion_handler_uwp_uninit(ma_completion_handler_uwp* pHandler)
-{
-    if (pHandler->hEvent != NULL) {
-        CloseHandle(pHandler->hEvent);
-    }
-}
-
-static void ma_completion_handler_uwp_wait(ma_completion_handler_uwp* pHandler)
-{
-    WaitForSingleObject((HANDLE)pHandler->hEvent, INFINITE);
-}
-#endif  /* !MA_WIN32_DESKTOP */
-
-/* We need a virtual table for our notification client object that's used for detecting changes to the default device. */
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-static HRESULT STDMETHODCALLTYPE ma_IMMNotificationClient_QueryInterface(ma_IMMNotificationClient* pThis, const IID* const riid, void** ppObject)
-{
-    /*
-    We care about two interfaces - IUnknown and IMMNotificationClient. If the requested IID is something else
-    we just return E_NOINTERFACE. Otherwise we need to increment the reference counter and return S_OK.
-    */
-    if (!ma_is_guid_equal(riid, &MA_IID_IUnknown) && !ma_is_guid_equal(riid, &MA_IID_IMMNotificationClient)) {
-        *ppObject = NULL;
-        return E_NOINTERFACE;
-    }
-
-    /* Getting here means the IID is IUnknown or IMMNotificationClient. */
-    *ppObject = (void*)pThis;
-    ((ma_IMMNotificationClientVtbl*)pThis->lpVtbl)->AddRef(pThis);
-    return S_OK;
-}
-
-static ULONG STDMETHODCALLTYPE ma_IMMNotificationClient_AddRef(ma_IMMNotificationClient* pThis)
-{
-    return (ULONG)ma_atomic_fetch_add_32(&pThis->counter, 1) + 1;
-}
-
-static ULONG STDMETHODCALLTYPE ma_IMMNotificationClient_Release(ma_IMMNotificationClient* pThis)
-{
-    ma_uint32 newRefCount = ma_atomic_fetch_sub_32(&pThis->counter, 1) - 1;
-    if (newRefCount == 0) {
-        return 0;   /* We don't free anything here because we never allocate the object on the heap. */
-    }
-
-    return (ULONG)newRefCount;
-}
-
-static HRESULT STDMETHODCALLTYPE ma_IMMNotificationClient_OnDeviceStateChanged(ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID, DWORD dwNewState)
-{
-    ma_bool32 isThisDevice = MA_FALSE;
-    ma_bool32 isCapture    = MA_FALSE;
-    ma_bool32 isPlayback   = MA_FALSE;
-
-#ifdef MA_DEBUG_OUTPUT
-    /*ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "IMMNotificationClient_OnDeviceStateChanged(pDeviceID=%S, dwNewState=%u)\n", (pDeviceID != NULL) ? pDeviceID : L"(NULL)", (unsigned int)dwNewState);*/
-#endif
-
-    /*
-    There have been reports of a hang when a playback device is disconnected. The idea with this code is to explicitly stop the device if we detect
-    that the device is disabled or has been unplugged.
-    */
-    if (pThis->pDevice->wasapi.allowCaptureAutoStreamRouting && (pThis->pDevice->type == ma_device_type_capture || pThis->pDevice->type == ma_device_type_duplex || pThis->pDevice->type == ma_device_type_loopback)) {
-        isCapture = MA_TRUE;
-        if (ma_strcmp_WCHAR(pThis->pDevice->capture.id.wasapi, pDeviceID) == 0) {
-            isThisDevice = MA_TRUE;
-        }
-    }
-
-    if (pThis->pDevice->wasapi.allowPlaybackAutoStreamRouting && (pThis->pDevice->type == ma_device_type_playback || pThis->pDevice->type == ma_device_type_duplex)) {
-        isPlayback = MA_TRUE;
-        if (ma_strcmp_WCHAR(pThis->pDevice->playback.id.wasapi, pDeviceID) == 0) {
-            isThisDevice = MA_TRUE;
-        }
-    }
-
-
-    /*
-    If the device ID matches our device we need to mark our device as detached and stop it. When a
-    device is added in OnDeviceAdded(), we'll restart it. We only mark it as detached if the device
-    was started at the time of being removed.
-    */
-    if (isThisDevice) {
-        if ((dwNewState & MA_MM_DEVICE_STATE_ACTIVE) == 0) {
-            /*
-            Unplugged or otherwise unavailable. Mark as detached if we were in a playing state. We'll
-            use this to determine whether or not we need to automatically start the device when it's
-            plugged back in again.
-            */
-            if (ma_device_get_state(pThis->pDevice) == ma_device_state_started) {
-                if (isPlayback) {
-                    pThis->pDevice->wasapi.isDetachedPlayback = MA_TRUE;
-                }
-                if (isCapture) {
-                    pThis->pDevice->wasapi.isDetachedCapture = MA_TRUE;
-                }
-
-                ma_device_stop(pThis->pDevice);
-            }
-        }
-
-        if ((dwNewState & MA_MM_DEVICE_STATE_ACTIVE) != 0) {
-            /* The device was activated. If we were detached, we need to start it again. */
-            ma_bool8 tryRestartingDevice = MA_FALSE;
-
-            if (isPlayback) {
-                if (pThis->pDevice->wasapi.isDetachedPlayback) {
-                    pThis->pDevice->wasapi.isDetachedPlayback = MA_FALSE;
-                    ma_device_reroute__wasapi(pThis->pDevice, ma_device_type_playback);
-                    tryRestartingDevice = MA_TRUE;
-                }
-            }
-
-            if (isCapture) {
-                if (pThis->pDevice->wasapi.isDetachedCapture) {
-                    pThis->pDevice->wasapi.isDetachedCapture = MA_FALSE;
-                    ma_device_reroute__wasapi(pThis->pDevice, (pThis->pDevice->type == ma_device_type_loopback) ? ma_device_type_loopback : ma_device_type_capture);
-                    tryRestartingDevice = MA_TRUE;
-                }
-            }
-
-            if (tryRestartingDevice) {
-                if (pThis->pDevice->wasapi.isDetachedPlayback == MA_FALSE && pThis->pDevice->wasapi.isDetachedCapture == MA_FALSE) {
-                    ma_device_start(pThis->pDevice);
-                }
-            }
-        }
-    }
-
-    return S_OK;
-}
-
-static HRESULT STDMETHODCALLTYPE ma_IMMNotificationClient_OnDeviceAdded(ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID)
-{
-#ifdef MA_DEBUG_OUTPUT
-    /*ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "IMMNotificationClient_OnDeviceAdded(pDeviceID=%S)\n", (pDeviceID != NULL) ? pDeviceID : L"(NULL)");*/
-#endif
-
-    /* We don't need to worry about this event for our purposes. */
-    (void)pThis;
-    (void)pDeviceID;
-    return S_OK;
-}
-
-static HRESULT STDMETHODCALLTYPE ma_IMMNotificationClient_OnDeviceRemoved(ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID)
-{
-#ifdef MA_DEBUG_OUTPUT
-    /*ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "IMMNotificationClient_OnDeviceRemoved(pDeviceID=%S)\n", (pDeviceID != NULL) ? pDeviceID : L"(NULL)");*/
-#endif
-
-    /* We don't need to worry about this event for our purposes. */
-    (void)pThis;
-    (void)pDeviceID;
-    return S_OK;
-}
-
-static HRESULT STDMETHODCALLTYPE ma_IMMNotificationClient_OnDefaultDeviceChanged(ma_IMMNotificationClient* pThis, ma_EDataFlow dataFlow, ma_ERole role, const WCHAR* pDefaultDeviceID)
-{
-#ifdef MA_DEBUG_OUTPUT
-    /*ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "IMMNotificationClient_OnDefaultDeviceChanged(dataFlow=%d, role=%d, pDefaultDeviceID=%S)\n", dataFlow, role, (pDefaultDeviceID != NULL) ? pDefaultDeviceID : L"(NULL)");*/
-#endif
-
-    (void)role;
-
-    /* We only care about devices with the same data flow as the current device. */
-    if ((pThis->pDevice->type == ma_device_type_playback && dataFlow != ma_eRender)  ||
-        (pThis->pDevice->type == ma_device_type_capture  && dataFlow != ma_eCapture) ||
-        (pThis->pDevice->type == ma_device_type_loopback && dataFlow != ma_eRender)) {
-        ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Stream rerouting abandoned because dataFlow does match device type.\n");
-        return S_OK;
-    }
-
-    /* We need to consider dataFlow as ma_eCapture if device is ma_device_type_loopback */
-    if (pThis->pDevice->type == ma_device_type_loopback) {
-        dataFlow = ma_eCapture;
-    }
-
-    /* Don't do automatic stream routing if we're not allowed. */
-    if ((dataFlow == ma_eRender  && pThis->pDevice->wasapi.allowPlaybackAutoStreamRouting == MA_FALSE) ||
-        (dataFlow == ma_eCapture && pThis->pDevice->wasapi.allowCaptureAutoStreamRouting  == MA_FALSE)) {
-        ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Stream rerouting abandoned because automatic stream routing has been disabled by the device config.\n");
-        return S_OK;
-    }
-
-    /*
-    Not currently supporting automatic stream routing in exclusive mode. This is not working correctly on my machine due to
-    AUDCLNT_E_DEVICE_IN_USE errors when reinitializing the device. If this is a bug in miniaudio, we can try re-enabling this once
-    it's fixed.
-    */
-    if ((dataFlow == ma_eRender  && pThis->pDevice->playback.shareMode == ma_share_mode_exclusive) ||
-        (dataFlow == ma_eCapture && pThis->pDevice->capture.shareMode  == ma_share_mode_exclusive)) {
-        ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Stream rerouting abandoned because the device shared mode is exclusive.\n");
-        return S_OK;
-    }
-
-
-
-    /*
-    Second attempt at device rerouting. We're going to retrieve the device's state at the time of
-    the route change. We're then going to stop the device, reinitialize the device, and then start
-    it again if the state before stopping was ma_device_state_started.
-    */
-    {
-        ma_uint32 previousState = ma_device_get_state(pThis->pDevice);
-        ma_bool8 restartDevice = MA_FALSE;
-
-        if (previousState == ma_device_state_uninitialized || previousState == ma_device_state_starting) {
-            ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Stream rerouting abandoned because the device is in the process of starting.\n");
-            return S_OK;
-        }
-
-        if (previousState == ma_device_state_started) {
-            ma_device_stop(pThis->pDevice);
-            restartDevice = MA_TRUE;
-        }
-
-        if (pDefaultDeviceID != NULL) { /* <-- The input device ID will be null if there's no other device available. */
-            ma_mutex_lock(&pThis->pDevice->wasapi.rerouteLock);
-            {
-                if (dataFlow == ma_eRender) {
-                    ma_device_reroute__wasapi(pThis->pDevice, ma_device_type_playback);
-
-                    if (pThis->pDevice->wasapi.isDetachedPlayback) {
-                        pThis->pDevice->wasapi.isDetachedPlayback = MA_FALSE;
-
-                        if (pThis->pDevice->type == ma_device_type_duplex && pThis->pDevice->wasapi.isDetachedCapture) {
-                            restartDevice = MA_FALSE;   /* It's a duplex device and the capture side is detached. We cannot be restarting the device just yet. */
-                        }
-                        else {
-                            restartDevice = MA_TRUE;    /* It's not a duplex device, or the capture side is also attached so we can go ahead and restart the device. */
-                        }
-                    }
-                }
-                else {
-                    ma_device_reroute__wasapi(pThis->pDevice, (pThis->pDevice->type == ma_device_type_loopback) ? ma_device_type_loopback : ma_device_type_capture);
-
-                    if (pThis->pDevice->wasapi.isDetachedCapture) {
-                        pThis->pDevice->wasapi.isDetachedCapture = MA_FALSE;
-
-                        if (pThis->pDevice->type == ma_device_type_duplex && pThis->pDevice->wasapi.isDetachedPlayback) {
-                            restartDevice = MA_FALSE;   /* It's a duplex device and the playback side is detached. We cannot be restarting the device just yet. */
-                        }
-                        else {
-                            restartDevice = MA_TRUE;    /* It's not a duplex device, or the playback side is also attached so we can go ahead and restart the device. */
-                        }
-                    }
-                }
-            }
-            ma_mutex_unlock(&pThis->pDevice->wasapi.rerouteLock);
-
-            if (restartDevice) {
-                ma_device_start(pThis->pDevice);
-            }
-        }
-    }
-
-    return S_OK;
-}
-
-static HRESULT STDMETHODCALLTYPE ma_IMMNotificationClient_OnPropertyValueChanged(ma_IMMNotificationClient* pThis, const WCHAR* pDeviceID, const PROPERTYKEY key)
-{
-#ifdef MA_DEBUG_OUTPUT
-    /*ma_log_postf(ma_device_get_log(pThis->pDevice), MA_LOG_LEVEL_DEBUG, "IMMNotificationClient_OnPropertyValueChanged(pDeviceID=%S)\n", (pDeviceID != NULL) ? pDeviceID : L"(NULL)");*/
-#endif
-
-    (void)pThis;
-    (void)pDeviceID;
-    (void)key;
-    return S_OK;
-}
-
-static ma_IMMNotificationClientVtbl g_maNotificationCientVtbl = {
-    ma_IMMNotificationClient_QueryInterface,
-    ma_IMMNotificationClient_AddRef,
-    ma_IMMNotificationClient_Release,
-    ma_IMMNotificationClient_OnDeviceStateChanged,
-    ma_IMMNotificationClient_OnDeviceAdded,
-    ma_IMMNotificationClient_OnDeviceRemoved,
-    ma_IMMNotificationClient_OnDefaultDeviceChanged,
-    ma_IMMNotificationClient_OnPropertyValueChanged
-};
-#endif  /* MA_WIN32_DESKTOP */
-
-static const char* ma_to_usage_string__wasapi(ma_wasapi_usage usage)
-{
-    switch (usage)
-    {
-        case ma_wasapi_usage_default:   return NULL;
-        case ma_wasapi_usage_games:     return "Games";
-        case ma_wasapi_usage_pro_audio: return "Pro Audio";
-        default: break;
-    }
-
-    return NULL;
-}
-
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-typedef ma_IMMDevice ma_WASAPIDeviceInterface;
-#else
-typedef ma_IUnknown ma_WASAPIDeviceInterface;
-#endif
-
-
-#define MA_CONTEXT_COMMAND_QUIT__WASAPI                 1
-#define MA_CONTEXT_COMMAND_CREATE_IAUDIOCLIENT__WASAPI  2
-#define MA_CONTEXT_COMMAND_RELEASE_IAUDIOCLIENT__WASAPI 3
-
-static ma_context_command__wasapi ma_context_init_command__wasapi(int code)
-{
-    ma_context_command__wasapi cmd;
-
-    MA_ZERO_OBJECT(&cmd);
-    cmd.code = code;
-
-    return cmd;
-}
-
-static ma_result ma_context_post_command__wasapi(ma_context* pContext, const ma_context_command__wasapi* pCmd)
-{
-    /* For now we are doing everything synchronously, but I might relax this later if the need arises. */
-    ma_result result;
-    ma_bool32 isUsingLocalEvent = MA_FALSE;
-    ma_event localEvent;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pCmd     != NULL);
-
-    if (pCmd->pEvent == NULL) {
-        isUsingLocalEvent = MA_TRUE;
-
-        result = ma_event_init(&localEvent);
-        if (result != MA_SUCCESS) {
-            return result;  /* Failed to create the event for this command. */
-        }
-    }
-
-    /* Here is where we add the command to the list. If there's not enough room we'll spin until there is. */
-    ma_mutex_lock(&pContext->wasapi.commandLock);
-    {
-        ma_uint32 index;
-
-        /* Spin until we've got some space available. */
-        while (pContext->wasapi.commandCount == ma_countof(pContext->wasapi.commands)) {
-            ma_yield();
-        }
-
-        /* Space is now available. Can safely add to the list. */
-        index = (pContext->wasapi.commandIndex + pContext->wasapi.commandCount) % ma_countof(pContext->wasapi.commands);
-        pContext->wasapi.commands[index]        = *pCmd;
-        pContext->wasapi.commands[index].pEvent = &localEvent;
-        pContext->wasapi.commandCount += 1;
-
-        /* Now that the command has been added, release the semaphore so ma_context_next_command__wasapi() can return. */
-        ma_semaphore_release(&pContext->wasapi.commandSem);
-    }
-    ma_mutex_unlock(&pContext->wasapi.commandLock);
-
-    if (isUsingLocalEvent) {
-        ma_event_wait(&localEvent);
-        ma_event_uninit(&localEvent);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_next_command__wasapi(ma_context* pContext, ma_context_command__wasapi* pCmd)
-{
-    ma_result result = MA_SUCCESS;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pCmd     != NULL);
-
-    result = ma_semaphore_wait(&pContext->wasapi.commandSem);
-    if (result == MA_SUCCESS) {
-        ma_mutex_lock(&pContext->wasapi.commandLock);
-        {
-            *pCmd = pContext->wasapi.commands[pContext->wasapi.commandIndex];
-            pContext->wasapi.commandIndex  = (pContext->wasapi.commandIndex + 1) % ma_countof(pContext->wasapi.commands);
-            pContext->wasapi.commandCount -= 1;
-        }
-        ma_mutex_unlock(&pContext->wasapi.commandLock);
-    }
-
-    return result;
-}
-
-static ma_thread_result MA_THREADCALL ma_context_command_thread__wasapi(void* pUserData)
-{
-    ma_result result;
-    ma_context* pContext = (ma_context*)pUserData;
-    MA_ASSERT(pContext != NULL);
-
-    for (;;) {
-        ma_context_command__wasapi cmd;
-        result = ma_context_next_command__wasapi(pContext, &cmd);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        switch (cmd.code)
-        {
-            case MA_CONTEXT_COMMAND_QUIT__WASAPI:
-            {
-                /* Do nothing. Handled after the switch. */
-            } break;
-
-            case MA_CONTEXT_COMMAND_CREATE_IAUDIOCLIENT__WASAPI:
-            {
-                if (cmd.data.createAudioClient.deviceType == ma_device_type_playback) {
-                    *cmd.data.createAudioClient.pResult = ma_result_from_HRESULT(ma_IAudioClient_GetService((ma_IAudioClient*)cmd.data.createAudioClient.pAudioClient, &MA_IID_IAudioRenderClient, cmd.data.createAudioClient.ppAudioClientService));
-                } else {
-                    *cmd.data.createAudioClient.pResult = ma_result_from_HRESULT(ma_IAudioClient_GetService((ma_IAudioClient*)cmd.data.createAudioClient.pAudioClient, &MA_IID_IAudioCaptureClient, cmd.data.createAudioClient.ppAudioClientService));
-                }
-            } break;
-
-            case MA_CONTEXT_COMMAND_RELEASE_IAUDIOCLIENT__WASAPI:
-            {
-                if (cmd.data.releaseAudioClient.deviceType == ma_device_type_playback) {
-                    if (cmd.data.releaseAudioClient.pDevice->wasapi.pAudioClientPlayback != NULL) {
-                        ma_IAudioClient_Release((ma_IAudioClient*)cmd.data.releaseAudioClient.pDevice->wasapi.pAudioClientPlayback);
-                        cmd.data.releaseAudioClient.pDevice->wasapi.pAudioClientPlayback = NULL;
-                    }
-                }
-
-                if (cmd.data.releaseAudioClient.deviceType == ma_device_type_capture) {
-                    if (cmd.data.releaseAudioClient.pDevice->wasapi.pAudioClientCapture != NULL) {
-                        ma_IAudioClient_Release((ma_IAudioClient*)cmd.data.releaseAudioClient.pDevice->wasapi.pAudioClientCapture);
-                        cmd.data.releaseAudioClient.pDevice->wasapi.pAudioClientCapture = NULL;
-                    }
-                }
-            } break;
-
-            default:
-            {
-                /* Unknown command. Ignore it, but trigger an assert in debug mode so we're aware of it. */
-                MA_ASSERT(MA_FALSE);
-            } break;
-        }
-
-        if (cmd.pEvent != NULL) {
-            ma_event_signal(cmd.pEvent);
-        }
-
-        if (cmd.code == MA_CONTEXT_COMMAND_QUIT__WASAPI) {
-            break;  /* Received a quit message. Get out of here. */
-        }
-    }
-
-    return (ma_thread_result)0;
-}
-
-static ma_result ma_device_create_IAudioClient_service__wasapi(ma_context* pContext, ma_device_type deviceType, ma_IAudioClient* pAudioClient, void** ppAudioClientService)
-{
-    ma_result result;
-    ma_result cmdResult;
-    ma_context_command__wasapi cmd = ma_context_init_command__wasapi(MA_CONTEXT_COMMAND_CREATE_IAUDIOCLIENT__WASAPI);
-    cmd.data.createAudioClient.deviceType           = deviceType;
-    cmd.data.createAudioClient.pAudioClient         = (void*)pAudioClient;
-    cmd.data.createAudioClient.ppAudioClientService = ppAudioClientService;
-    cmd.data.createAudioClient.pResult              = &cmdResult;   /* Declared locally, but won't be dereferenced after this function returns since execution of the command will wait here. */
-
-    result = ma_context_post_command__wasapi(pContext, &cmd);  /* This will not return until the command has actually been run. */
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return *cmd.data.createAudioClient.pResult;
-}
-
-#if 0   /* Not used at the moment, but leaving here for future use. */
-static ma_result ma_device_release_IAudioClient_service__wasapi(ma_device* pDevice, ma_device_type deviceType)
-{
-    ma_result result;
-    ma_context_command__wasapi cmd = ma_context_init_command__wasapi(MA_CONTEXT_COMMAND_RELEASE_IAUDIOCLIENT__WASAPI);
-    cmd.data.releaseAudioClient.pDevice    = pDevice;
-    cmd.data.releaseAudioClient.deviceType = deviceType;
-
-    result = ma_context_post_command__wasapi(pDevice->pContext, &cmd);  /* This will not return until the command has actually been run. */
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-#endif
-
-
-static void ma_add_native_data_format_to_device_info_from_WAVEFORMATEX(const MA_WAVEFORMATEX* pWF, ma_share_mode shareMode, ma_device_info* pInfo)
-{
-    MA_ASSERT(pWF != NULL);
-    MA_ASSERT(pInfo != NULL);
-
-    if (pInfo->nativeDataFormatCount >= ma_countof(pInfo->nativeDataFormats)) {
-        return; /* Too many data formats. Need to ignore this one. Don't think this should ever happen with WASAPI. */
-    }
-
-    pInfo->nativeDataFormats[pInfo->nativeDataFormatCount].format     = ma_format_from_WAVEFORMATEX(pWF);
-    pInfo->nativeDataFormats[pInfo->nativeDataFormatCount].channels   = pWF->nChannels;
-    pInfo->nativeDataFormats[pInfo->nativeDataFormatCount].sampleRate = pWF->nSamplesPerSec;
-    pInfo->nativeDataFormats[pInfo->nativeDataFormatCount].flags      = (shareMode == ma_share_mode_exclusive) ? MA_DATA_FORMAT_FLAG_EXCLUSIVE_MODE : 0;
-    pInfo->nativeDataFormatCount += 1;
-}
-
-static ma_result ma_context_get_device_info_from_IAudioClient__wasapi(ma_context* pContext, /*ma_IMMDevice**/void* pMMDevice, ma_IAudioClient* pAudioClient, ma_device_info* pInfo)
-{
-    HRESULT hr;
-    MA_WAVEFORMATEX* pWF = NULL;
-
-    MA_ASSERT(pAudioClient != NULL);
-    MA_ASSERT(pInfo != NULL);
-
-    /* Shared Mode. We use GetMixFormat() here. */
-    hr = ma_IAudioClient_GetMixFormat((ma_IAudioClient*)pAudioClient, (MA_WAVEFORMATEX**)&pWF);
-    if (SUCCEEDED(hr)) {
-        ma_add_native_data_format_to_device_info_from_WAVEFORMATEX(pWF, ma_share_mode_shared, pInfo);
-    } else {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to retrieve mix format for device info retrieval.");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    /*
-    Exlcusive Mode. We repeatedly call IsFormatSupported() here. This is not currently supported on
-    UWP. Failure to retrieve the exclusive mode format is not considered an error, so from here on
-    out, MA_SUCCESS is guaranteed to be returned.
-    */
-    #if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    {
-        ma_IPropertyStore *pProperties;
-
-        /*
-        The first thing to do is get the format from PKEY_AudioEngine_DeviceFormat. This should give us a channel count we assume is
-        correct which will simplify our searching.
-        */
-        hr = ma_IMMDevice_OpenPropertyStore((ma_IMMDevice*)pMMDevice, STGM_READ, &pProperties);
-        if (SUCCEEDED(hr)) {
-            MA_PROPVARIANT var;
-            ma_PropVariantInit(&var);
-
-            hr = ma_IPropertyStore_GetValue(pProperties, &MA_PKEY_AudioEngine_DeviceFormat, &var);
-            if (SUCCEEDED(hr)) {
-                pWF = (MA_WAVEFORMATEX*)var.blob.pBlobData;
-
-                /*
-                In my testing, the format returned by PKEY_AudioEngine_DeviceFormat is suitable for exclusive mode so we check this format
-                first. If this fails, fall back to a search.
-                */
-                hr = ma_IAudioClient_IsFormatSupported((ma_IAudioClient*)pAudioClient, MA_AUDCLNT_SHAREMODE_EXCLUSIVE, pWF, NULL);
-                if (SUCCEEDED(hr)) {
-                    /* The format returned by PKEY_AudioEngine_DeviceFormat is supported. */
-                    ma_add_native_data_format_to_device_info_from_WAVEFORMATEX(pWF, ma_share_mode_exclusive, pInfo);
-                } else {
-                    /*
-                    The format returned by PKEY_AudioEngine_DeviceFormat is not supported, so fall back to a search. We assume the channel
-                    count returned by MA_PKEY_AudioEngine_DeviceFormat is valid and correct. For simplicity we're only returning one format.
-                    */
-                    ma_uint32 channels = pWF->nChannels;
-                    ma_channel defaultChannelMap[MA_MAX_CHANNELS];
-                    MA_WAVEFORMATEXTENSIBLE wf;
-                    ma_bool32 found;
-                    ma_uint32 iFormat;
-
-                    /* Make sure we don't overflow the channel map. */
-                    if (channels > MA_MAX_CHANNELS) {
-                        channels = MA_MAX_CHANNELS;
-                    }
-
-                    ma_channel_map_init_standard(ma_standard_channel_map_microsoft, defaultChannelMap, ma_countof(defaultChannelMap), channels);
-
-                    MA_ZERO_OBJECT(&wf);
-                    wf.cbSize     = sizeof(wf);
-                    wf.wFormatTag = WAVE_FORMAT_EXTENSIBLE;
-                    wf.nChannels  = (WORD)channels;
-                    wf.dwChannelMask     = ma_channel_map_to_channel_mask__win32(defaultChannelMap, channels);
-
-                    found = MA_FALSE;
-                    for (iFormat = 0; iFormat < ma_countof(g_maFormatPriorities); ++iFormat) {
-                        ma_format format = g_maFormatPriorities[iFormat];
-                        ma_uint32 iSampleRate;
-
-                        wf.wBitsPerSample       = (WORD)(ma_get_bytes_per_sample(format)*8);
-                        wf.nBlockAlign          = (WORD)(wf.nChannels * wf.wBitsPerSample / 8);
-                        wf.nAvgBytesPerSec      = wf.nBlockAlign * wf.nSamplesPerSec;
-                        wf.Samples.wValidBitsPerSample = /*(format == ma_format_s24_32) ? 24 :*/ wf.wBitsPerSample;
-                        if (format == ma_format_f32) {
-                            wf.SubFormat = MA_GUID_KSDATAFORMAT_SUBTYPE_IEEE_FLOAT;
-                        } else {
-                            wf.SubFormat = MA_GUID_KSDATAFORMAT_SUBTYPE_PCM;
-                        }
-
-                        for (iSampleRate = 0; iSampleRate < ma_countof(g_maStandardSampleRatePriorities); ++iSampleRate) {
-                            wf.nSamplesPerSec = g_maStandardSampleRatePriorities[iSampleRate];
-
-                            hr = ma_IAudioClient_IsFormatSupported((ma_IAudioClient*)pAudioClient, MA_AUDCLNT_SHAREMODE_EXCLUSIVE, (MA_WAVEFORMATEX*)&wf, NULL);
-                            if (SUCCEEDED(hr)) {
-                                ma_add_native_data_format_to_device_info_from_WAVEFORMATEX((MA_WAVEFORMATEX*)&wf, ma_share_mode_exclusive, pInfo);
-                                found = MA_TRUE;
-                                break;
-                            }
-                        }
-
-                        if (found) {
-                            break;
-                        }
-                    }
-
-                    ma_PropVariantClear(pContext, &var);
-
-                    if (!found) {
-                        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_WARNING, "[WASAPI] Failed to find suitable device format for device info retrieval.");
-                    }
-                }
-            } else {
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_WARNING, "[WASAPI] Failed to retrieve device format for device info retrieval.");
-            }
-
-            ma_IPropertyStore_Release(pProperties);
-        } else {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_WARNING, "[WASAPI] Failed to open property store for device info retrieval.");
-        }
-    }
-    #else
-    {
-        (void)pMMDevice;    /* Unused. */
-    }
-    #endif
-
-    return MA_SUCCESS;
-}
-
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-static ma_EDataFlow ma_device_type_to_EDataFlow(ma_device_type deviceType)
-{
-    if (deviceType == ma_device_type_playback) {
-        return ma_eRender;
-    } else if (deviceType == ma_device_type_capture) {
-        return ma_eCapture;
-    } else {
-        MA_ASSERT(MA_FALSE);
-        return ma_eRender; /* Should never hit this. */
-    }
-}
-
-static ma_result ma_context_create_IMMDeviceEnumerator__wasapi(ma_context* pContext, ma_IMMDeviceEnumerator** ppDeviceEnumerator)
-{
-    HRESULT hr;
-    ma_IMMDeviceEnumerator* pDeviceEnumerator;
-
-    MA_ASSERT(pContext           != NULL);
-    MA_ASSERT(ppDeviceEnumerator != NULL);
-
-    *ppDeviceEnumerator = NULL; /* Safety. */
-
-    hr = ma_CoCreateInstance(pContext, &MA_CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL, &MA_IID_IMMDeviceEnumerator, (void**)&pDeviceEnumerator);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to create device enumerator.");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    *ppDeviceEnumerator = pDeviceEnumerator;
-
-    return MA_SUCCESS;
-}
-
-static WCHAR* ma_context_get_default_device_id_from_IMMDeviceEnumerator__wasapi(ma_context* pContext, ma_IMMDeviceEnumerator* pDeviceEnumerator, ma_device_type deviceType)
-{
-    HRESULT hr;
-    ma_IMMDevice* pMMDefaultDevice = NULL;
-    WCHAR* pDefaultDeviceID = NULL;
-    ma_EDataFlow dataFlow;
-    ma_ERole role;
-
-    MA_ASSERT(pContext          != NULL);
-    MA_ASSERT(pDeviceEnumerator != NULL);
-
-    (void)pContext;
-
-    /* Grab the EDataFlow type from the device type. */
-    dataFlow = ma_device_type_to_EDataFlow(deviceType);
-
-    /* The role is always eConsole, but we may make this configurable later. */
-    role = ma_eConsole;
-
-    hr = ma_IMMDeviceEnumerator_GetDefaultAudioEndpoint(pDeviceEnumerator, dataFlow, role, &pMMDefaultDevice);
-    if (FAILED(hr)) {
-        return NULL;
-    }
-
-    hr = ma_IMMDevice_GetId(pMMDefaultDevice, &pDefaultDeviceID);
-
-    ma_IMMDevice_Release(pMMDefaultDevice);
-    pMMDefaultDevice = NULL;
-
-    if (FAILED(hr)) {
-        return NULL;
-    }
-
-    return pDefaultDeviceID;
-}
-
-static WCHAR* ma_context_get_default_device_id__wasapi(ma_context* pContext, ma_device_type deviceType)    /* Free the returned pointer with ma_CoTaskMemFree() */
-{
-    ma_result result;
-    ma_IMMDeviceEnumerator* pDeviceEnumerator;
-    WCHAR* pDefaultDeviceID = NULL;
-
-    MA_ASSERT(pContext != NULL);
-
-    result = ma_context_create_IMMDeviceEnumerator__wasapi(pContext, &pDeviceEnumerator);
-    if (result != MA_SUCCESS) {
-        return NULL;
-    }
-
-    pDefaultDeviceID = ma_context_get_default_device_id_from_IMMDeviceEnumerator__wasapi(pContext, pDeviceEnumerator, deviceType);
-
-    ma_IMMDeviceEnumerator_Release(pDeviceEnumerator);
-    return pDefaultDeviceID;
-}
-
-static ma_result ma_context_get_MMDevice__wasapi(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_IMMDevice** ppMMDevice)
-{
-    ma_IMMDeviceEnumerator* pDeviceEnumerator;
-    HRESULT hr;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppMMDevice != NULL);
-
-    hr = ma_CoCreateInstance(pContext, &MA_CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL, &MA_IID_IMMDeviceEnumerator, (void**)&pDeviceEnumerator);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to create IMMDeviceEnumerator.\n");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    if (pDeviceID == NULL) {
-        hr = ma_IMMDeviceEnumerator_GetDefaultAudioEndpoint(pDeviceEnumerator, (deviceType == ma_device_type_capture) ? ma_eCapture : ma_eRender, ma_eConsole, ppMMDevice);
-    } else {
-        hr = ma_IMMDeviceEnumerator_GetDevice(pDeviceEnumerator, pDeviceID->wasapi, ppMMDevice);
-    }
-
-    ma_IMMDeviceEnumerator_Release(pDeviceEnumerator);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to retrieve IMMDevice.\n");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_id_from_MMDevice__wasapi(ma_context* pContext, ma_IMMDevice* pMMDevice, ma_device_id* pDeviceID)
-{
-    WCHAR* pDeviceIDString;
-    HRESULT hr;
-
-    MA_ASSERT(pDeviceID != NULL);
-
-    hr = ma_IMMDevice_GetId(pMMDevice, &pDeviceIDString);
-    if (SUCCEEDED(hr)) {
-        size_t idlen = ma_strlen_WCHAR(pDeviceIDString);
-        if (idlen+1 > ma_countof(pDeviceID->wasapi)) {
-            ma_CoTaskMemFree(pContext, pDeviceIDString);
-            MA_ASSERT(MA_FALSE);  /* NOTE: If this is triggered, please report it. It means the format of the ID must haved change and is too long to fit in our fixed sized buffer. */
-            return MA_ERROR;
-        }
-
-        MA_COPY_MEMORY(pDeviceID->wasapi, pDeviceIDString, idlen * sizeof(wchar_t));
-        pDeviceID->wasapi[idlen] = '\0';
-
-        ma_CoTaskMemFree(pContext, pDeviceIDString);
-
-        return MA_SUCCESS;
-    }
-
-    return MA_ERROR;
-}
-
-static ma_result ma_context_get_device_info_from_MMDevice__wasapi(ma_context* pContext, ma_IMMDevice* pMMDevice, WCHAR* pDefaultDeviceID, ma_bool32 onlySimpleInfo, ma_device_info* pInfo)
-{
-    ma_result result;
-    HRESULT hr;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pMMDevice != NULL);
-    MA_ASSERT(pInfo != NULL);
-
-    /* ID. */
-    result = ma_context_get_device_id_from_MMDevice__wasapi(pContext, pMMDevice, &pInfo->id);
-    if (result == MA_SUCCESS) {
-        if (pDefaultDeviceID != NULL) {
-            if (ma_strcmp_WCHAR(pInfo->id.wasapi, pDefaultDeviceID) == 0) {
-                pInfo->isDefault = MA_TRUE;
-            }
-        }
-    }
-
-    /* Description / Friendly Name */
-    {
-        ma_IPropertyStore *pProperties;
-        hr = ma_IMMDevice_OpenPropertyStore(pMMDevice, STGM_READ, &pProperties);
-        if (SUCCEEDED(hr)) {
-            MA_PROPVARIANT var;
-
-            ma_PropVariantInit(&var);
-            hr = ma_IPropertyStore_GetValue(pProperties, &MA_PKEY_Device_FriendlyName, &var);
-            if (SUCCEEDED(hr)) {
-                WideCharToMultiByte(CP_UTF8, 0, var.pwszVal, -1, pInfo->name, sizeof(pInfo->name), 0, FALSE);
-                ma_PropVariantClear(pContext, &var);
-            }
-
-            ma_IPropertyStore_Release(pProperties);
-        }
-    }
-
-    /* Format */
-    if (!onlySimpleInfo) {
-        ma_IAudioClient* pAudioClient;
-        hr = ma_IMMDevice_Activate(pMMDevice, &MA_IID_IAudioClient, CLSCTX_ALL, NULL, (void**)&pAudioClient);
-        if (SUCCEEDED(hr)) {
-            result = ma_context_get_device_info_from_IAudioClient__wasapi(pContext, pMMDevice, pAudioClient, pInfo);
-
-            ma_IAudioClient_Release(pAudioClient);
-            return result;
-        } else {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to activate audio client for device info retrieval.");
-            return ma_result_from_HRESULT(hr);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_enumerate_devices_by_type__wasapi(ma_context* pContext, ma_IMMDeviceEnumerator* pDeviceEnumerator, ma_device_type deviceType, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_result result = MA_SUCCESS;
-    UINT deviceCount;
-    HRESULT hr;
-    ma_uint32 iDevice;
-    WCHAR* pDefaultDeviceID = NULL;
-    ma_IMMDeviceCollection* pDeviceCollection = NULL;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /* Grab the default device. We use this to know whether or not flag the returned device info as being the default. */
-    pDefaultDeviceID = ma_context_get_default_device_id_from_IMMDeviceEnumerator__wasapi(pContext, pDeviceEnumerator, deviceType);
-
-    /* We need to enumerate the devices which returns a device collection. */
-    hr = ma_IMMDeviceEnumerator_EnumAudioEndpoints(pDeviceEnumerator, ma_device_type_to_EDataFlow(deviceType), MA_MM_DEVICE_STATE_ACTIVE, &pDeviceCollection);
-    if (SUCCEEDED(hr)) {
-        hr = ma_IMMDeviceCollection_GetCount(pDeviceCollection, &deviceCount);
-        if (FAILED(hr)) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to get device count.\n");
-            result = ma_result_from_HRESULT(hr);
-            goto done;
-        }
-
-        for (iDevice = 0; iDevice < deviceCount; ++iDevice) {
-            ma_device_info deviceInfo;
-            ma_IMMDevice* pMMDevice;
-
-            MA_ZERO_OBJECT(&deviceInfo);
-
-            hr = ma_IMMDeviceCollection_Item(pDeviceCollection, iDevice, &pMMDevice);
-            if (SUCCEEDED(hr)) {
-                result = ma_context_get_device_info_from_MMDevice__wasapi(pContext, pMMDevice, pDefaultDeviceID, MA_TRUE, &deviceInfo);   /* MA_TRUE = onlySimpleInfo. */
-
-                ma_IMMDevice_Release(pMMDevice);
-                if (result == MA_SUCCESS) {
-                    ma_bool32 cbResult = callback(pContext, deviceType, &deviceInfo, pUserData);
-                    if (cbResult == MA_FALSE) {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-
-done:
-    if (pDefaultDeviceID != NULL) {
-        ma_CoTaskMemFree(pContext, pDefaultDeviceID);
-        pDefaultDeviceID = NULL;
-    }
-
-    if (pDeviceCollection != NULL) {
-        ma_IMMDeviceCollection_Release(pDeviceCollection);
-        pDeviceCollection = NULL;
-    }
-
-    return result;
-}
-
-static ma_result ma_context_get_IAudioClient_Desktop__wasapi(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, MA_PROPVARIANT* pActivationParams, ma_IAudioClient** ppAudioClient, ma_IMMDevice** ppMMDevice)
-{
-    ma_result result;
-    HRESULT hr;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppAudioClient != NULL);
-    MA_ASSERT(ppMMDevice != NULL);
-
-    result = ma_context_get_MMDevice__wasapi(pContext, deviceType, pDeviceID, ppMMDevice);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    hr = ma_IMMDevice_Activate(*ppMMDevice, &MA_IID_IAudioClient, CLSCTX_ALL, pActivationParams, (void**)ppAudioClient);
-    if (FAILED(hr)) {
-        return ma_result_from_HRESULT(hr);
-    }
-
-    return MA_SUCCESS;
-}
-#else
-static ma_result ma_context_get_IAudioClient_UWP__wasapi(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, MA_PROPVARIANT* pActivationParams, ma_IAudioClient** ppAudioClient, ma_IUnknown** ppActivatedInterface)
-{
-    ma_IActivateAudioInterfaceAsyncOperation *pAsyncOp = NULL;
-    ma_completion_handler_uwp completionHandler;
-    IID iid;
-    WCHAR* iidStr;
-    HRESULT hr;
-    ma_result result;
-    HRESULT activateResult;
-    ma_IUnknown* pActivatedInterface;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppAudioClient != NULL);
-
-    if (pDeviceID != NULL) {
-        iidStr = (WCHAR*)pDeviceID->wasapi;
-    } else {
-        if (deviceType == ma_device_type_capture) {
-            iid = MA_IID_DEVINTERFACE_AUDIO_CAPTURE;
-        } else {
-            iid = MA_IID_DEVINTERFACE_AUDIO_RENDER;
-        }
-
-    #if defined(__cplusplus)
-        hr = StringFromIID(iid, &iidStr);
-    #else
-        hr = StringFromIID(&iid, &iidStr);
-    #endif
-        if (FAILED(hr)) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to convert device IID to string for ActivateAudioInterfaceAsync(). Out of memory.\n");
-            return ma_result_from_HRESULT(hr);
-        }
-    }
-
-    result = ma_completion_handler_uwp_init(&completionHandler);
-    if (result != MA_SUCCESS) {
-        ma_CoTaskMemFree(pContext, iidStr);
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to create event for waiting for ActivateAudioInterfaceAsync().\n");
-        return result;
-    }
-
-    hr = ((MA_PFN_ActivateAudioInterfaceAsync)pContext->wasapi.ActivateAudioInterfaceAsync)(iidStr, &MA_IID_IAudioClient, pActivationParams, (ma_IActivateAudioInterfaceCompletionHandler*)&completionHandler, (ma_IActivateAudioInterfaceAsyncOperation**)&pAsyncOp);
-    if (FAILED(hr)) {
-        ma_completion_handler_uwp_uninit(&completionHandler);
-        ma_CoTaskMemFree(pContext, iidStr);
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] ActivateAudioInterfaceAsync() failed.\n");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    if (pDeviceID == NULL) {
-        ma_CoTaskMemFree(pContext, iidStr);
-    }
-
-    /* Wait for the async operation for finish. */
-    ma_completion_handler_uwp_wait(&completionHandler);
-    ma_completion_handler_uwp_uninit(&completionHandler);
-
-    hr = ma_IActivateAudioInterfaceAsyncOperation_GetActivateResult(pAsyncOp, &activateResult, &pActivatedInterface);
-    ma_IActivateAudioInterfaceAsyncOperation_Release(pAsyncOp);
-
-    if (FAILED(hr) || FAILED(activateResult)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to activate device.\n");
-        return FAILED(hr) ? ma_result_from_HRESULT(hr) : ma_result_from_HRESULT(activateResult);
-    }
-
-    /* Here is where we grab the IAudioClient interface. */
-    hr = ma_IUnknown_QueryInterface(pActivatedInterface, &MA_IID_IAudioClient, (void**)ppAudioClient);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to query IAudioClient interface.\n");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    if (ppActivatedInterface) {
-        *ppActivatedInterface = pActivatedInterface;
-    } else {
-        ma_IUnknown_Release(pActivatedInterface);
-    }
-
-    return MA_SUCCESS;
-}
-#endif
-
-
-/* https://docs.microsoft.com/en-us/windows/win32/api/audioclientactivationparams/ne-audioclientactivationparams-audioclient_activation_type */
-typedef enum
-{
-    MA_AUDIOCLIENT_ACTIVATION_TYPE_DEFAULT,
-    MA_AUDIOCLIENT_ACTIVATION_TYPE_PROCESS_LOOPBACK
-} MA_AUDIOCLIENT_ACTIVATION_TYPE;
-
-/* https://docs.microsoft.com/en-us/windows/win32/api/audioclientactivationparams/ne-audioclientactivationparams-process_loopback_mode */
-typedef enum
-{
-    MA_PROCESS_LOOPBACK_MODE_INCLUDE_TARGET_PROCESS_TREE,
-    MA_PROCESS_LOOPBACK_MODE_EXCLUDE_TARGET_PROCESS_TREE
-} MA_PROCESS_LOOPBACK_MODE;
-
-/* https://docs.microsoft.com/en-us/windows/win32/api/audioclientactivationparams/ns-audioclientactivationparams-audioclient_process_loopback_params */
-typedef struct
-{
-    DWORD TargetProcessId;
-    MA_PROCESS_LOOPBACK_MODE ProcessLoopbackMode;
-} MA_AUDIOCLIENT_PROCESS_LOOPBACK_PARAMS;
-
-#if defined(_MSC_VER) && !defined(__clang__)
-    #pragma warning(push)
-    #pragma warning(disable:4201)   /* nonstandard extension used: nameless struct/union */
-#elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wpedantic" /* For ISO C99 doesn't support unnamed structs/unions [-Wpedantic] */
-    #if defined(__clang__)
-        #pragma GCC diagnostic ignored "-Wc11-extensions"   /* anonymous unions are a C11 extension */
-    #endif
-#endif
-/* https://docs.microsoft.com/en-us/windows/win32/api/audioclientactivationparams/ns-audioclientactivationparams-audioclient_activation_params */
-typedef struct
-{
-    MA_AUDIOCLIENT_ACTIVATION_TYPE ActivationType;
-    union
-    {
-        MA_AUDIOCLIENT_PROCESS_LOOPBACK_PARAMS ProcessLoopbackParams;
-    };
-} MA_AUDIOCLIENT_ACTIVATION_PARAMS;
-#if defined(_MSC_VER) && !defined(__clang__)
-    #pragma warning(pop)
-#elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-    #pragma GCC diagnostic pop
-#endif
-
-#define MA_VIRTUAL_AUDIO_DEVICE_PROCESS_LOOPBACK L"VAD\\Process_Loopback"
-
-static ma_result ma_context_get_IAudioClient__wasapi(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_uint32 loopbackProcessID, ma_bool32 loopbackProcessExclude, ma_IAudioClient** ppAudioClient, ma_WASAPIDeviceInterface** ppDeviceInterface)
-{
-    ma_result result;
-    ma_bool32 usingProcessLoopback = MA_FALSE;
-    MA_AUDIOCLIENT_ACTIVATION_PARAMS audioclientActivationParams;
-    MA_PROPVARIANT activationParams;
-    MA_PROPVARIANT* pActivationParams = NULL;
-    ma_device_id virtualDeviceID;
-
-    /* Activation parameters specific to loopback mode. Note that process-specific loopback will only work when a default device ID is specified. */
-    if (deviceType == ma_device_type_loopback && loopbackProcessID != 0 && pDeviceID == NULL) {
-        usingProcessLoopback = MA_TRUE;
-    }
-
-    if (usingProcessLoopback) {
-        MA_ZERO_OBJECT(&audioclientActivationParams);
-        audioclientActivationParams.ActivationType                            = MA_AUDIOCLIENT_ACTIVATION_TYPE_PROCESS_LOOPBACK;
-        audioclientActivationParams.ProcessLoopbackParams.ProcessLoopbackMode = (loopbackProcessExclude) ? MA_PROCESS_LOOPBACK_MODE_EXCLUDE_TARGET_PROCESS_TREE : MA_PROCESS_LOOPBACK_MODE_INCLUDE_TARGET_PROCESS_TREE;
-        audioclientActivationParams.ProcessLoopbackParams.TargetProcessId     = (DWORD)loopbackProcessID;
-
-        ma_PropVariantInit(&activationParams);
-        activationParams.vt             = MA_VT_BLOB;
-        activationParams.blob.cbSize    = sizeof(audioclientActivationParams);
-        activationParams.blob.pBlobData = (BYTE*)&audioclientActivationParams;
-        pActivationParams = &activationParams;
-
-        /* When requesting a specific device ID we need to use a special device ID. */
-        MA_COPY_MEMORY(virtualDeviceID.wasapi, MA_VIRTUAL_AUDIO_DEVICE_PROCESS_LOOPBACK, (wcslen(MA_VIRTUAL_AUDIO_DEVICE_PROCESS_LOOPBACK) + 1) * sizeof(wchar_t)); /* +1 for the null terminator. */
-        pDeviceID = &virtualDeviceID;
-    } else {
-        pActivationParams = NULL;   /* No activation parameters required. */
-    }
-
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    result = ma_context_get_IAudioClient_Desktop__wasapi(pContext, deviceType, pDeviceID, pActivationParams, ppAudioClient, ppDeviceInterface);
-#else
-    result = ma_context_get_IAudioClient_UWP__wasapi(pContext, deviceType, pDeviceID, pActivationParams, ppAudioClient, ppDeviceInterface);
-#endif
-
-    /*
-    If loopback mode was requested with a process ID and initialization failed, it could be because it's
-    trying to run on an older version of Windows where it's not supported. We need to let the caller
-    know about this with a log message.
-    */
-    if (result != MA_SUCCESS) {
-        if (usingProcessLoopback) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Loopback mode requested to %s process ID %u, but initialization failed. Support for this feature begins with Windows 10 Build 20348. Confirm your version of Windows or consider not using process-specific loopback.\n", (loopbackProcessExclude) ? "exclude" : "include", loopbackProcessID);
-        }
-    }
-
-    return result;
-}
-
-
-static ma_result ma_context_enumerate_devices__wasapi(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    /* Different enumeration for desktop and UWP. */
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    /* Desktop */
-    HRESULT hr;
-    ma_IMMDeviceEnumerator* pDeviceEnumerator;
-
-    hr = ma_CoCreateInstance(pContext, &MA_CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL, &MA_IID_IMMDeviceEnumerator, (void**)&pDeviceEnumerator);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to create device enumerator.");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    ma_context_enumerate_devices_by_type__wasapi(pContext, pDeviceEnumerator, ma_device_type_playback, callback, pUserData);
-    ma_context_enumerate_devices_by_type__wasapi(pContext, pDeviceEnumerator, ma_device_type_capture,  callback, pUserData);
-
-    ma_IMMDeviceEnumerator_Release(pDeviceEnumerator);
-#else
-    /*
-    UWP
-
-    The MMDevice API is only supported on desktop applications. For now, while I'm still figuring out how to properly enumerate
-    over devices without using MMDevice, I'm restricting devices to defaults.
-
-    Hint: DeviceInformation::FindAllAsync() with DeviceClass.AudioCapture/AudioRender. https://blogs.windows.com/buildingapps/2014/05/15/real-time-audio-in-windows-store-and-windows-phone-apps/
-    */
-    if (callback) {
-        ma_bool32 cbResult = MA_TRUE;
-
-        /* Playback. */
-        if (cbResult) {
-            ma_device_info deviceInfo;
-            MA_ZERO_OBJECT(&deviceInfo);
-            ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-            deviceInfo.isDefault = MA_TRUE;
-            cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-        }
-
-        /* Capture. */
-        if (cbResult) {
-            ma_device_info deviceInfo;
-            MA_ZERO_OBJECT(&deviceInfo);
-            ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-            deviceInfo.isDefault = MA_TRUE;
-            cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-        }
-    }
-#endif
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__wasapi(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    ma_result result;
-    ma_IMMDevice* pMMDevice = NULL;
-    WCHAR* pDefaultDeviceID = NULL;
-
-    result = ma_context_get_MMDevice__wasapi(pContext, deviceType, pDeviceID, &pMMDevice);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* We need the default device ID so we can set the isDefault flag in the device info. */
-    pDefaultDeviceID = ma_context_get_default_device_id__wasapi(pContext, deviceType);
-
-    result = ma_context_get_device_info_from_MMDevice__wasapi(pContext, pMMDevice, pDefaultDeviceID, MA_FALSE, pDeviceInfo);   /* MA_FALSE = !onlySimpleInfo. */
-
-    if (pDefaultDeviceID != NULL) {
-        ma_CoTaskMemFree(pContext, pDefaultDeviceID);
-        pDefaultDeviceID = NULL;
-    }
-
-    ma_IMMDevice_Release(pMMDevice);
-
-    return result;
-#else
-    ma_IAudioClient* pAudioClient;
-    ma_result result;
-
-    /* UWP currently only uses default devices. */
-    if (deviceType == ma_device_type_playback) {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-    } else {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-    }
-
-    result = ma_context_get_IAudioClient_UWP__wasapi(pContext, deviceType, pDeviceID, NULL, &pAudioClient, NULL);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_context_get_device_info_from_IAudioClient__wasapi(pContext, NULL, pAudioClient, pDeviceInfo);
-
-    pDeviceInfo->isDefault = MA_TRUE;  /* UWP only supports default devices. */
-
-    ma_IAudioClient_Release(pAudioClient);
-    return result;
-#endif
-}
-
-static ma_result ma_device_uninit__wasapi(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    if (pDevice->wasapi.pDeviceEnumerator) {
-        ((ma_IMMDeviceEnumerator*)pDevice->wasapi.pDeviceEnumerator)->lpVtbl->UnregisterEndpointNotificationCallback((ma_IMMDeviceEnumerator*)pDevice->wasapi.pDeviceEnumerator, &pDevice->wasapi.notificationClient);
-        ma_IMMDeviceEnumerator_Release((ma_IMMDeviceEnumerator*)pDevice->wasapi.pDeviceEnumerator);
-    }
-#endif
-
-    if (pDevice->wasapi.pRenderClient) {
-        if (pDevice->wasapi.pMappedBufferPlayback != NULL) {
-            ma_IAudioRenderClient_ReleaseBuffer((ma_IAudioRenderClient*)pDevice->wasapi.pRenderClient, pDevice->wasapi.mappedBufferPlaybackCap, 0);
-            pDevice->wasapi.pMappedBufferPlayback   = NULL;
-            pDevice->wasapi.mappedBufferPlaybackCap = 0;
-            pDevice->wasapi.mappedBufferPlaybackLen = 0;
-        }
-
-        ma_IAudioRenderClient_Release((ma_IAudioRenderClient*)pDevice->wasapi.pRenderClient);
-    }
-    if (pDevice->wasapi.pCaptureClient) {
-        if (pDevice->wasapi.pMappedBufferCapture != NULL) {
-            ma_IAudioCaptureClient_ReleaseBuffer((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient, pDevice->wasapi.mappedBufferCaptureCap);
-            pDevice->wasapi.pMappedBufferCapture   = NULL;
-            pDevice->wasapi.mappedBufferCaptureCap = 0;
-            pDevice->wasapi.mappedBufferCaptureLen = 0;
-        }
-
-        ma_IAudioCaptureClient_Release((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient);
-    }
-
-    if (pDevice->wasapi.pAudioClientPlayback) {
-        ma_IAudioClient_Release((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback);
-    }
-    if (pDevice->wasapi.pAudioClientCapture) {
-        ma_IAudioClient_Release((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture);
-    }
-
-    if (pDevice->wasapi.hEventPlayback) {
-        CloseHandle((HANDLE)pDevice->wasapi.hEventPlayback);
-    }
-    if (pDevice->wasapi.hEventCapture) {
-        CloseHandle((HANDLE)pDevice->wasapi.hEventCapture);
-    }
-
-    return MA_SUCCESS;
-}
-
-
-typedef struct
-{
-    /* Input. */
-    ma_format formatIn;
-    ma_uint32 channelsIn;
-    ma_uint32 sampleRateIn;
-    ma_channel channelMapIn[MA_MAX_CHANNELS];
-    ma_uint32 periodSizeInFramesIn;
-    ma_uint32 periodSizeInMillisecondsIn;
-    ma_uint32 periodsIn;
-    ma_share_mode shareMode;
-    ma_performance_profile performanceProfile;
-    ma_bool32 noAutoConvertSRC;
-    ma_bool32 noDefaultQualitySRC;
-    ma_bool32 noHardwareOffloading;
-    ma_uint32 loopbackProcessID;
-    ma_bool32 loopbackProcessExclude;
-
-    /* Output. */
-    ma_IAudioClient* pAudioClient;
-    ma_IAudioRenderClient* pRenderClient;
-    ma_IAudioCaptureClient* pCaptureClient;
-    ma_format formatOut;
-    ma_uint32 channelsOut;
-    ma_uint32 sampleRateOut;
-    ma_channel channelMapOut[MA_MAX_CHANNELS];
-    ma_uint32 periodSizeInFramesOut;
-    ma_uint32 periodsOut;
-    ma_bool32 usingAudioClient3;
-    char deviceName[256];
-    ma_device_id id;
-} ma_device_init_internal_data__wasapi;
-
-static ma_result ma_device_init_internal__wasapi(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_init_internal_data__wasapi* pData)
-{
-    HRESULT hr;
-    ma_result result = MA_SUCCESS;
-    const char* errorMsg = "";
-    MA_AUDCLNT_SHAREMODE shareMode = MA_AUDCLNT_SHAREMODE_SHARED;
-    DWORD streamFlags = 0;
-    MA_REFERENCE_TIME periodDurationInMicroseconds;
-    ma_bool32 wasInitializedUsingIAudioClient3 = MA_FALSE;
-    MA_WAVEFORMATEXTENSIBLE wf;
-    ma_WASAPIDeviceInterface* pDeviceInterface = NULL;
-    ma_IAudioClient2* pAudioClient2;
-    ma_uint32 nativeSampleRate;
-    ma_bool32 usingProcessLoopback = MA_FALSE;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pData != NULL);
-
-    /* This function is only used to initialize one device type: either playback, capture or loopback. Never full-duplex. */
-    if (deviceType == ma_device_type_duplex) {
-        return MA_INVALID_ARGS;
-    }
-
-    usingProcessLoopback = deviceType == ma_device_type_loopback && pData->loopbackProcessID != 0 && pDeviceID == NULL;
-
-    pData->pAudioClient = NULL;
-    pData->pRenderClient = NULL;
-    pData->pCaptureClient = NULL;
-
-    streamFlags = MA_AUDCLNT_STREAMFLAGS_EVENTCALLBACK;
-    if (!pData->noAutoConvertSRC && pData->sampleRateIn != 0 && pData->shareMode != ma_share_mode_exclusive) {    /* <-- Exclusive streams must use the native sample rate. */
-        streamFlags |= MA_AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM;
-    }
-    if (!pData->noDefaultQualitySRC && pData->sampleRateIn != 0 && (streamFlags & MA_AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM) != 0) {
-        streamFlags |= MA_AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY;
-    }
-    if (deviceType == ma_device_type_loopback) {
-        streamFlags |= MA_AUDCLNT_STREAMFLAGS_LOOPBACK;
-    }
-
-    result = ma_context_get_IAudioClient__wasapi(pContext, deviceType, pDeviceID, pData->loopbackProcessID, pData->loopbackProcessExclude, &pData->pAudioClient, &pDeviceInterface);
-    if (result != MA_SUCCESS) {
-        goto done;
-    }
-
-    MA_ZERO_OBJECT(&wf);
-
-    /* Try enabling hardware offloading. */
-    if (!pData->noHardwareOffloading) {
-        hr = ma_IAudioClient_QueryInterface(pData->pAudioClient, &MA_IID_IAudioClient2, (void**)&pAudioClient2);
-        if (SUCCEEDED(hr)) {
-            BOOL isHardwareOffloadingSupported = 0;
-            hr = ma_IAudioClient2_IsOffloadCapable(pAudioClient2, MA_AudioCategory_Other, &isHardwareOffloadingSupported);
-            if (SUCCEEDED(hr) && isHardwareOffloadingSupported) {
-                ma_AudioClientProperties clientProperties;
-                MA_ZERO_OBJECT(&clientProperties);
-                clientProperties.cbSize = sizeof(clientProperties);
-                clientProperties.bIsOffload = 1;
-                clientProperties.eCategory = MA_AudioCategory_Other;
-                ma_IAudioClient2_SetClientProperties(pAudioClient2, &clientProperties);
-            }
-
-            pAudioClient2->lpVtbl->Release(pAudioClient2);
-        }
-    }
-
-    /* Here is where we try to determine the best format to use with the device. If the client if wanting exclusive mode, first try finding the best format for that. If this fails, fall back to shared mode. */
-    result = MA_FORMAT_NOT_SUPPORTED;
-    if (pData->shareMode == ma_share_mode_exclusive) {
-    #if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-        /* In exclusive mode on desktop we always use the backend's native format. */
-        ma_IPropertyStore* pStore = NULL;
-        hr = ma_IMMDevice_OpenPropertyStore(pDeviceInterface, STGM_READ, &pStore);
-        if (SUCCEEDED(hr)) {
-            MA_PROPVARIANT prop;
-            ma_PropVariantInit(&prop);
-            hr = ma_IPropertyStore_GetValue(pStore, &MA_PKEY_AudioEngine_DeviceFormat, &prop);
-            if (SUCCEEDED(hr)) {
-                MA_WAVEFORMATEX* pActualFormat = (MA_WAVEFORMATEX*)prop.blob.pBlobData;
-                hr = ma_IAudioClient_IsFormatSupported((ma_IAudioClient*)pData->pAudioClient, MA_AUDCLNT_SHAREMODE_EXCLUSIVE, pActualFormat, NULL);
-                if (SUCCEEDED(hr)) {
-                    MA_COPY_MEMORY(&wf, pActualFormat, sizeof(MA_WAVEFORMATEXTENSIBLE));
-                }
-
-                ma_PropVariantClear(pContext, &prop);
-            }
-
-            ma_IPropertyStore_Release(pStore);
-        }
-    #else
-        /*
-        I do not know how to query the device's native format on UWP so for now I'm just disabling support for
-        exclusive mode. The alternative is to enumerate over different formats and check IsFormatSupported()
-        until you find one that works.
-
-        TODO: Add support for exclusive mode to UWP.
-        */
-        hr = S_FALSE;
-    #endif
-
-        if (hr == S_OK) {
-            shareMode = MA_AUDCLNT_SHAREMODE_EXCLUSIVE;
-            result = MA_SUCCESS;
-        } else {
-            result = MA_SHARE_MODE_NOT_SUPPORTED;
-        }
-    } else {
-        /* In shared mode we are always using the format reported by the operating system. */
-        MA_WAVEFORMATEXTENSIBLE* pNativeFormat = NULL;
-        hr = ma_IAudioClient_GetMixFormat((ma_IAudioClient*)pData->pAudioClient, (MA_WAVEFORMATEX**)&pNativeFormat);
-        if (hr != S_OK) {
-            /* When using process-specific loopback, GetMixFormat() seems to always fail. */
-            if (usingProcessLoopback) {
-                wf.wFormatTag      = WAVE_FORMAT_IEEE_FLOAT;
-                wf.nChannels       = 2;
-                wf.nSamplesPerSec  = 44100;
-                wf.wBitsPerSample  = 32;
-                wf.nBlockAlign     = wf.nChannels * wf.wBitsPerSample / 8;
-                wf.nAvgBytesPerSec = wf.nSamplesPerSec * wf.nBlockAlign;
-                wf.cbSize          = sizeof(MA_WAVEFORMATEX);
-
-                result = MA_SUCCESS;
-            } else {
-                result = MA_FORMAT_NOT_SUPPORTED;
-            }
-        } else {
-            /*
-            I've seen cases where cbSize will be set to sizeof(WAVEFORMATEX) even though the structure itself
-            is given the format tag of WAVE_FORMAT_EXTENSIBLE. If the format tag is WAVE_FORMAT_EXTENSIBLE
-            want to make sure we copy the whole WAVEFORMATEXTENSIBLE structure. Otherwise we'll have to be
-            safe and only copy the WAVEFORMATEX part.
-            */
-            if (pNativeFormat->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
-                MA_COPY_MEMORY(&wf, pNativeFormat, sizeof(MA_WAVEFORMATEXTENSIBLE));
-            } else {
-                /* I've seen a case where cbSize was set to 0. Assume sizeof(WAVEFORMATEX) in this case. */
-                size_t cbSize = pNativeFormat->cbSize;
-                if (cbSize == 0) {
-                    cbSize = sizeof(MA_WAVEFORMATEX);
-                }
-
-                /* Make sure we don't copy more than the capacity of `wf`. */
-                if (cbSize > sizeof(wf)) {
-                    cbSize = sizeof(wf);
-                }
-
-                MA_COPY_MEMORY(&wf, pNativeFormat, cbSize);
-            }
-
-            result = MA_SUCCESS;
-        }
-
-        ma_CoTaskMemFree(pContext, pNativeFormat);
-
-        shareMode = MA_AUDCLNT_SHAREMODE_SHARED;
-    }
-
-    /* Return an error if we still haven't found a format. */
-    if (result != MA_SUCCESS) {
-        errorMsg = "[WASAPI] Failed to find best device mix format.";
-        goto done;
-    }
-
-    /*
-    Override the native sample rate with the one requested by the caller, but only if we're not using the default sample rate. We'll use
-    WASAPI to perform the sample rate conversion.
-    */
-    nativeSampleRate = wf.nSamplesPerSec;
-    if (streamFlags & MA_AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM) {
-        wf.nSamplesPerSec = (pData->sampleRateIn != 0) ? pData->sampleRateIn : MA_DEFAULT_SAMPLE_RATE;
-        wf.nAvgBytesPerSec = wf.nSamplesPerSec * wf.nBlockAlign;
-    }
-
-    pData->formatOut = ma_format_from_WAVEFORMATEX((MA_WAVEFORMATEX*)&wf);
-    if (pData->formatOut == ma_format_unknown) {
-        /*
-        The format isn't supported. This is almost certainly because the exclusive mode format isn't supported by miniaudio. We need to return MA_SHARE_MODE_NOT_SUPPORTED
-        in this case so that the caller can detect it and fall back to shared mode if desired. We should never get here if shared mode was requested, but just for
-        completeness we'll check for it and return MA_FORMAT_NOT_SUPPORTED.
-        */
-        if (shareMode == MA_AUDCLNT_SHAREMODE_EXCLUSIVE) {
-            result = MA_SHARE_MODE_NOT_SUPPORTED;
-        } else {
-            result = MA_FORMAT_NOT_SUPPORTED;
-        }
-
-        errorMsg = "[WASAPI] Native format not supported.";
-        goto done;
-    }
-
-    pData->channelsOut = wf.nChannels;
-    pData->sampleRateOut = wf.nSamplesPerSec;
-
-    /*
-    Get the internal channel map based on the channel mask. There is a possibility that GetMixFormat() returns
-    a WAVEFORMATEX instead of a WAVEFORMATEXTENSIBLE, in which case the channel mask will be undefined. In this
-    case we'll just use the default channel map.
-    */
-    if (wf.wFormatTag == WAVE_FORMAT_EXTENSIBLE || wf.cbSize >= sizeof(MA_WAVEFORMATEXTENSIBLE)) {
-        ma_channel_mask_to_channel_map__win32(wf.dwChannelMask, pData->channelsOut, pData->channelMapOut);
-    } else {
-        ma_channel_map_init_standard(ma_standard_channel_map_microsoft, pData->channelMapOut, ma_countof(pData->channelMapOut), pData->channelsOut);
-    }
-
-    /* Period size. */
-    pData->periodsOut = (pData->periodsIn != 0) ? pData->periodsIn : MA_DEFAULT_PERIODS;
-    pData->periodSizeInFramesOut = pData->periodSizeInFramesIn;
-    if (pData->periodSizeInFramesOut == 0) {
-        if (pData->periodSizeInMillisecondsIn == 0) {
-            if (pData->performanceProfile == ma_performance_profile_low_latency) {
-                pData->periodSizeInFramesOut = ma_calculate_buffer_size_in_frames_from_milliseconds(MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_LOW_LATENCY, wf.nSamplesPerSec);
-            } else {
-                pData->periodSizeInFramesOut = ma_calculate_buffer_size_in_frames_from_milliseconds(MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_CONSERVATIVE, wf.nSamplesPerSec);
-            }
-        } else {
-            pData->periodSizeInFramesOut = ma_calculate_buffer_size_in_frames_from_milliseconds(pData->periodSizeInMillisecondsIn, wf.nSamplesPerSec);
-        }
-    }
-
-    periodDurationInMicroseconds = ((ma_uint64)pData->periodSizeInFramesOut * 1000 * 1000) / wf.nSamplesPerSec;
-
-
-    /* Slightly different initialization for shared and exclusive modes. We try exclusive mode first, and if it fails, fall back to shared mode. */
-    if (shareMode == MA_AUDCLNT_SHAREMODE_EXCLUSIVE) {
-        MA_REFERENCE_TIME bufferDuration = periodDurationInMicroseconds * pData->periodsOut * 10;
-
-        /*
-        If the periodicy is too small, Initialize() will fail with AUDCLNT_E_INVALID_DEVICE_PERIOD. In this case we should just keep increasing
-        it and trying it again.
-        */
-        hr = E_FAIL;
-        for (;;) {
-            hr = ma_IAudioClient_Initialize((ma_IAudioClient*)pData->pAudioClient, shareMode, streamFlags, bufferDuration, bufferDuration, (MA_WAVEFORMATEX*)&wf, NULL);
-            if (hr == MA_AUDCLNT_E_INVALID_DEVICE_PERIOD) {
-                if (bufferDuration > 500*10000) {
-                    break;
-                } else {
-                    if (bufferDuration == 0) {  /* <-- Just a sanity check to prevent an infinit loop. Should never happen, but it makes me feel better. */
-                        break;
-                    }
-
-                    bufferDuration = bufferDuration * 2;
-                    continue;
-                }
-            } else {
-                break;
-            }
-        }
-
-        if (hr == MA_AUDCLNT_E_BUFFER_SIZE_NOT_ALIGNED) {
-            ma_uint32 bufferSizeInFrames;
-            hr = ma_IAudioClient_GetBufferSize((ma_IAudioClient*)pData->pAudioClient, &bufferSizeInFrames);
-            if (SUCCEEDED(hr)) {
-                bufferDuration = (MA_REFERENCE_TIME)((10000.0 * 1000 / wf.nSamplesPerSec * bufferSizeInFrames) + 0.5);
-
-                /* Unfortunately we need to release and re-acquire the audio client according to MSDN. Seems silly - why not just call IAudioClient_Initialize() again?! */
-                ma_IAudioClient_Release((ma_IAudioClient*)pData->pAudioClient);
-
-            #if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-                hr = ma_IMMDevice_Activate(pDeviceInterface, &MA_IID_IAudioClient, CLSCTX_ALL, NULL, (void**)&pData->pAudioClient);
-            #else
-                hr = ma_IUnknown_QueryInterface(pDeviceInterface, &MA_IID_IAudioClient, (void**)&pData->pAudioClient);
-            #endif
-
-                if (SUCCEEDED(hr)) {
-                    hr = ma_IAudioClient_Initialize((ma_IAudioClient*)pData->pAudioClient, shareMode, streamFlags, bufferDuration, bufferDuration, (MA_WAVEFORMATEX*)&wf, NULL);
-                }
-            }
-        }
-
-        if (FAILED(hr)) {
-            /* Failed to initialize in exclusive mode. Don't fall back to shared mode - instead tell the client about it. They can reinitialize in shared mode if they want. */
-            if (hr == E_ACCESSDENIED) {
-                errorMsg = "[WASAPI] Failed to initialize device in exclusive mode. Access denied.", result = MA_ACCESS_DENIED;
-            } else if (hr == MA_AUDCLNT_E_DEVICE_IN_USE) {
-                errorMsg = "[WASAPI] Failed to initialize device in exclusive mode. Device in use.", result = MA_BUSY;
-            } else {
-                errorMsg = "[WASAPI] Failed to initialize device in exclusive mode."; result = ma_result_from_HRESULT(hr);
-            }
-            goto done;
-        }
-    }
-
-    if (shareMode == MA_AUDCLNT_SHAREMODE_SHARED) {
-        /*
-        Low latency shared mode via IAudioClient3.
-
-        NOTE
-        ====
-        Contrary to the documentation on MSDN (https://docs.microsoft.com/en-us/windows/win32/api/audioclient/nf-audioclient-iaudioclient3-initializesharedaudiostream), the
-        use of AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM and AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY with IAudioClient3_InitializeSharedAudioStream() absolutely does not work. Using
-        any of these flags will result in HRESULT code 0x88890021. The other problem is that calling IAudioClient3_GetSharedModeEnginePeriod() with a sample rate different to
-        that returned by IAudioClient_GetMixFormat() also results in an error. I'm therefore disabling low-latency shared mode with AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM.
-        */
-        #ifndef MA_WASAPI_NO_LOW_LATENCY_SHARED_MODE
-        {
-            if ((streamFlags & MA_AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM) == 0 || nativeSampleRate == wf.nSamplesPerSec) {
-                ma_IAudioClient3* pAudioClient3 = NULL;
-                hr = ma_IAudioClient_QueryInterface(pData->pAudioClient, &MA_IID_IAudioClient3, (void**)&pAudioClient3);
-                if (SUCCEEDED(hr)) {
-                    ma_uint32 defaultPeriodInFrames;
-                    ma_uint32 fundamentalPeriodInFrames;
-                    ma_uint32 minPeriodInFrames;
-                    ma_uint32 maxPeriodInFrames;
-                    hr = ma_IAudioClient3_GetSharedModeEnginePeriod(pAudioClient3, (MA_WAVEFORMATEX*)&wf, &defaultPeriodInFrames, &fundamentalPeriodInFrames, &minPeriodInFrames, &maxPeriodInFrames);
-                    if (SUCCEEDED(hr)) {
-                        ma_uint32 desiredPeriodInFrames = pData->periodSizeInFramesOut;
-                        ma_uint32 actualPeriodInFrames  = desiredPeriodInFrames;
-
-                        /* Make sure the period size is a multiple of fundamentalPeriodInFrames. */
-                        actualPeriodInFrames = actualPeriodInFrames / fundamentalPeriodInFrames;
-                        actualPeriodInFrames = actualPeriodInFrames * fundamentalPeriodInFrames;
-
-                        /* The period needs to be clamped between minPeriodInFrames and maxPeriodInFrames. */
-                        actualPeriodInFrames = ma_clamp(actualPeriodInFrames, minPeriodInFrames, maxPeriodInFrames);
-
-                        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "[WASAPI] Trying IAudioClient3_InitializeSharedAudioStream(actualPeriodInFrames=%d)\n", actualPeriodInFrames);
-                        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "    defaultPeriodInFrames=%d\n", defaultPeriodInFrames);
-                        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "    fundamentalPeriodInFrames=%d\n", fundamentalPeriodInFrames);
-                        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "    minPeriodInFrames=%d\n", minPeriodInFrames);
-                        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "    maxPeriodInFrames=%d\n", maxPeriodInFrames);
-
-                        /* If the client requested a largish buffer than we don't actually want to use low latency shared mode because it forces small buffers. */
-                        if (actualPeriodInFrames >= desiredPeriodInFrames) {
-                            /*
-                            MA_AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM | MA_AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY must not be in the stream flags. If either of these are specified,
-                            IAudioClient3_InitializeSharedAudioStream() will fail.
-                            */
-                            hr = ma_IAudioClient3_InitializeSharedAudioStream(pAudioClient3, streamFlags & ~(MA_AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM | MA_AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY), actualPeriodInFrames, (MA_WAVEFORMATEX*)&wf, NULL);
-                            if (SUCCEEDED(hr)) {
-                                wasInitializedUsingIAudioClient3 = MA_TRUE;
-                                pData->periodSizeInFramesOut = actualPeriodInFrames;
-
-                                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "[WASAPI] Using IAudioClient3\n");
-                                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "    periodSizeInFramesOut=%d\n", pData->periodSizeInFramesOut);
-                            } else {
-                                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "[WASAPI] IAudioClient3_InitializeSharedAudioStream failed. Falling back to IAudioClient.\n");
-                            }
-                        } else {
-                            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "[WASAPI] Not using IAudioClient3 because the desired period size is larger than the maximum supported by IAudioClient3.\n");
-                        }
-                    } else {
-                        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "[WASAPI] IAudioClient3_GetSharedModeEnginePeriod failed. Falling back to IAudioClient.\n");
-                    }
-
-                    ma_IAudioClient3_Release(pAudioClient3);
-                    pAudioClient3 = NULL;
-                }
-            }
-        }
-        #else
-        {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "[WASAPI] Not using IAudioClient3 because MA_WASAPI_NO_LOW_LATENCY_SHARED_MODE is enabled.\n");
-        }
-        #endif
-
-        /* If we don't have an IAudioClient3 then we need to use the normal initialization routine. */
-        if (!wasInitializedUsingIAudioClient3) {
-            MA_REFERENCE_TIME bufferDuration = periodDurationInMicroseconds * pData->periodsOut * 10;   /* <-- Multiply by 10 for microseconds to 100-nanoseconds. */
-            hr = ma_IAudioClient_Initialize((ma_IAudioClient*)pData->pAudioClient, shareMode, streamFlags, bufferDuration, 0, (const MA_WAVEFORMATEX*)&wf, NULL);
-            if (FAILED(hr)) {
-                if (hr == E_ACCESSDENIED) {
-                    errorMsg = "[WASAPI] Failed to initialize device. Access denied.", result = MA_ACCESS_DENIED;
-                } else if (hr == MA_AUDCLNT_E_DEVICE_IN_USE) {
-                    errorMsg = "[WASAPI] Failed to initialize device. Device in use.", result = MA_BUSY;
-                } else {
-                    errorMsg = "[WASAPI] Failed to initialize device.", result = ma_result_from_HRESULT(hr);
-                }
-
-                goto done;
-            }
-        }
-    }
-
-    if (!wasInitializedUsingIAudioClient3) {
-        ma_uint32 bufferSizeInFrames = 0;
-        hr = ma_IAudioClient_GetBufferSize((ma_IAudioClient*)pData->pAudioClient, &bufferSizeInFrames);
-        if (FAILED(hr)) {
-            errorMsg = "[WASAPI] Failed to get audio client's actual buffer size.", result = ma_result_from_HRESULT(hr);
-            goto done;
-        }
-
-        /*
-        When using process loopback mode, retrieval of the buffer size seems to result in totally
-        incorrect values. In this case we'll just assume it's the same size as what we requested
-        when we initialized the client.
-        */
-        if (usingProcessLoopback) {
-            bufferSizeInFrames = (ma_uint32)((periodDurationInMicroseconds * pData->periodsOut) * pData->sampleRateOut / 1000000);
-        }
-
-        pData->periodSizeInFramesOut = bufferSizeInFrames / pData->periodsOut;
-    }
-
-    pData->usingAudioClient3 = wasInitializedUsingIAudioClient3;
-
-
-    if (deviceType == ma_device_type_playback) {
-        result = ma_device_create_IAudioClient_service__wasapi(pContext, deviceType, (ma_IAudioClient*)pData->pAudioClient, (void**)&pData->pRenderClient);
-    } else {
-        result = ma_device_create_IAudioClient_service__wasapi(pContext, deviceType, (ma_IAudioClient*)pData->pAudioClient, (void**)&pData->pCaptureClient);
-    }
-
-    /*if (FAILED(hr)) {*/
-    if (result != MA_SUCCESS) {
-        errorMsg = "[WASAPI] Failed to get audio client service.";
-        goto done;
-    }
-
-
-    /* Grab the name of the device. */
-    #if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    {
-        ma_IPropertyStore *pProperties;
-        hr = ma_IMMDevice_OpenPropertyStore(pDeviceInterface, STGM_READ, &pProperties);
-        if (SUCCEEDED(hr)) {
-            MA_PROPVARIANT varName;
-            ma_PropVariantInit(&varName);
-            hr = ma_IPropertyStore_GetValue(pProperties, &MA_PKEY_Device_FriendlyName, &varName);
-            if (SUCCEEDED(hr)) {
-                WideCharToMultiByte(CP_UTF8, 0, varName.pwszVal, -1, pData->deviceName, sizeof(pData->deviceName), 0, FALSE);
-                ma_PropVariantClear(pContext, &varName);
-            }
-
-            ma_IPropertyStore_Release(pProperties);
-        }
-    }
-    #endif
-
-    /*
-    For the WASAPI backend we need to know the actual IDs of the device in order to do automatic
-    stream routing so that IDs can be compared and we can determine which device has been detached
-    and whether or not it matches with our ma_device.
-    */
-    #if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    {
-        /* Desktop */
-        ma_context_get_device_id_from_MMDevice__wasapi(pContext, pDeviceInterface, &pData->id);
-    }
-    #else
-    {
-        /* UWP */
-        /* TODO: Implement me. Need to figure out how to get the ID of the default device. */
-    }
-    #endif
-
-done:
-    /* Clean up. */
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    if (pDeviceInterface != NULL) {
-        ma_IMMDevice_Release(pDeviceInterface);
-    }
-#else
-    if (pDeviceInterface != NULL) {
-        ma_IUnknown_Release(pDeviceInterface);
-    }
-#endif
-
-    if (result != MA_SUCCESS) {
-        if (pData->pRenderClient) {
-            ma_IAudioRenderClient_Release((ma_IAudioRenderClient*)pData->pRenderClient);
-            pData->pRenderClient = NULL;
-        }
-        if (pData->pCaptureClient) {
-            ma_IAudioCaptureClient_Release((ma_IAudioCaptureClient*)pData->pCaptureClient);
-            pData->pCaptureClient = NULL;
-        }
-        if (pData->pAudioClient) {
-            ma_IAudioClient_Release((ma_IAudioClient*)pData->pAudioClient);
-            pData->pAudioClient = NULL;
-        }
-
-        if (errorMsg != NULL && errorMsg[0] != '\0') {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "%s\n", errorMsg);
-        }
-
-        return result;
-    } else {
-        return MA_SUCCESS;
-    }
-}
-
-static ma_result ma_device_reinit__wasapi(ma_device* pDevice, ma_device_type deviceType)
-{
-    ma_device_init_internal_data__wasapi data;
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /* We only re-initialize the playback or capture device. Never a full-duplex device. */
-    if (deviceType == ma_device_type_duplex) {
-        return MA_INVALID_ARGS;
-    }
-
-
-    /*
-    Before reinitializing the device we need to free the previous audio clients.
-
-    There's a known memory leak here. We will be calling this from the routing change callback that
-    is fired by WASAPI. If we attempt to release the IAudioClient we will deadlock. In my opinion
-    this is a bug. I'm not sure what I need to do to handle this cleanly, but I think we'll probably
-    need some system where we post an event, but delay the execution of it until the callback has
-    returned. I'm not sure how to do this reliably, however. I have set up some infrastructure for
-    a command thread which might be useful for this.
-    */
-    if (deviceType == ma_device_type_capture || deviceType == ma_device_type_loopback) {
-        if (pDevice->wasapi.pCaptureClient) {
-            ma_IAudioCaptureClient_Release((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient);
-            pDevice->wasapi.pCaptureClient = NULL;
-        }
-
-        if (pDevice->wasapi.pAudioClientCapture) {
-            /*ma_device_release_IAudioClient_service__wasapi(pDevice, ma_device_type_capture);*/
-            pDevice->wasapi.pAudioClientCapture = NULL;
-        }
-    }
-
-    if (deviceType == ma_device_type_playback) {
-        if (pDevice->wasapi.pRenderClient) {
-            ma_IAudioRenderClient_Release((ma_IAudioRenderClient*)pDevice->wasapi.pRenderClient);
-            pDevice->wasapi.pRenderClient = NULL;
-        }
-
-        if (pDevice->wasapi.pAudioClientPlayback) {
-            /*ma_device_release_IAudioClient_service__wasapi(pDevice, ma_device_type_playback);*/
-            pDevice->wasapi.pAudioClientPlayback = NULL;
-        }
-    }
-
-
-    if (deviceType == ma_device_type_playback) {
-        data.formatIn               = pDevice->playback.format;
-        data.channelsIn             = pDevice->playback.channels;
-        MA_COPY_MEMORY(data.channelMapIn, pDevice->playback.channelMap, sizeof(pDevice->playback.channelMap));
-        data.shareMode              = pDevice->playback.shareMode;
-    } else {
-        data.formatIn               = pDevice->capture.format;
-        data.channelsIn             = pDevice->capture.channels;
-        MA_COPY_MEMORY(data.channelMapIn, pDevice->capture.channelMap, sizeof(pDevice->capture.channelMap));
-        data.shareMode              = pDevice->capture.shareMode;
-    }
-
-    data.sampleRateIn               = pDevice->sampleRate;
-    data.periodSizeInFramesIn       = pDevice->wasapi.originalPeriodSizeInFrames;
-    data.periodSizeInMillisecondsIn = pDevice->wasapi.originalPeriodSizeInMilliseconds;
-    data.periodsIn                  = pDevice->wasapi.originalPeriods;
-    data.performanceProfile         = pDevice->wasapi.originalPerformanceProfile;
-    data.noAutoConvertSRC           = pDevice->wasapi.noAutoConvertSRC;
-    data.noDefaultQualitySRC        = pDevice->wasapi.noDefaultQualitySRC;
-    data.noHardwareOffloading       = pDevice->wasapi.noHardwareOffloading;
-    data.loopbackProcessID          = pDevice->wasapi.loopbackProcessID;
-    data.loopbackProcessExclude     = pDevice->wasapi.loopbackProcessExclude;
-    result = ma_device_init_internal__wasapi(pDevice->pContext, deviceType, NULL, &data);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* At this point we have some new objects ready to go. We need to uninitialize the previous ones and then set the new ones. */
-    if (deviceType == ma_device_type_capture || deviceType == ma_device_type_loopback) {
-        pDevice->wasapi.pAudioClientCapture         = data.pAudioClient;
-        pDevice->wasapi.pCaptureClient              = data.pCaptureClient;
-
-        pDevice->capture.internalFormat             = data.formatOut;
-        pDevice->capture.internalChannels           = data.channelsOut;
-        pDevice->capture.internalSampleRate         = data.sampleRateOut;
-        MA_COPY_MEMORY(pDevice->capture.internalChannelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDevice->capture.internalPeriodSizeInFrames = data.periodSizeInFramesOut;
-        pDevice->capture.internalPeriods            = data.periodsOut;
-        ma_strcpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), data.deviceName);
-
-        ma_IAudioClient_SetEventHandle((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture, (HANDLE)pDevice->wasapi.hEventCapture);
-
-        pDevice->wasapi.periodSizeInFramesCapture = data.periodSizeInFramesOut;
-        ma_IAudioClient_GetBufferSize((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture, &pDevice->wasapi.actualBufferSizeInFramesCapture);
-
-        /* We must always have a valid ID. */
-        ma_strcpy_s_WCHAR(pDevice->capture.id.wasapi, sizeof(pDevice->capture.id.wasapi), data.id.wasapi);
-    }
-
-    if (deviceType == ma_device_type_playback) {
-        pDevice->wasapi.pAudioClientPlayback         = data.pAudioClient;
-        pDevice->wasapi.pRenderClient                = data.pRenderClient;
-
-        pDevice->playback.internalFormat             = data.formatOut;
-        pDevice->playback.internalChannels           = data.channelsOut;
-        pDevice->playback.internalSampleRate         = data.sampleRateOut;
-        MA_COPY_MEMORY(pDevice->playback.internalChannelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDevice->playback.internalPeriodSizeInFrames = data.periodSizeInFramesOut;
-        pDevice->playback.internalPeriods            = data.periodsOut;
-        ma_strcpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), data.deviceName);
-
-        ma_IAudioClient_SetEventHandle((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback, (HANDLE)pDevice->wasapi.hEventPlayback);
-
-        pDevice->wasapi.periodSizeInFramesPlayback = data.periodSizeInFramesOut;
-        ma_IAudioClient_GetBufferSize((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback, &pDevice->wasapi.actualBufferSizeInFramesPlayback);
-
-        /* We must always have a valid ID because rerouting will look at it. */
-        ma_strcpy_s_WCHAR(pDevice->playback.id.wasapi, sizeof(pDevice->playback.id.wasapi), data.id.wasapi);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__wasapi(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    ma_result result = MA_SUCCESS;
-
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    HRESULT hr;
-    ma_IMMDeviceEnumerator* pDeviceEnumerator;
-#endif
-
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->wasapi);
-    pDevice->wasapi.usage                  = pConfig->wasapi.usage;
-    pDevice->wasapi.noAutoConvertSRC       = pConfig->wasapi.noAutoConvertSRC;
-    pDevice->wasapi.noDefaultQualitySRC    = pConfig->wasapi.noDefaultQualitySRC;
-    pDevice->wasapi.noHardwareOffloading   = pConfig->wasapi.noHardwareOffloading;
-    pDevice->wasapi.loopbackProcessID      = pConfig->wasapi.loopbackProcessID;
-    pDevice->wasapi.loopbackProcessExclude = pConfig->wasapi.loopbackProcessExclude;
-
-    /* Exclusive mode is not allowed with loopback. */
-    if (pConfig->deviceType == ma_device_type_loopback && pConfig->playback.shareMode == ma_share_mode_exclusive) {
-        return MA_INVALID_DEVICE_CONFIG;
-    }
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex || pConfig->deviceType == ma_device_type_loopback) {
-        ma_device_init_internal_data__wasapi data;
-        data.formatIn                   = pDescriptorCapture->format;
-        data.channelsIn                 = pDescriptorCapture->channels;
-        data.sampleRateIn               = pDescriptorCapture->sampleRate;
-        MA_COPY_MEMORY(data.channelMapIn, pDescriptorCapture->channelMap, sizeof(pDescriptorCapture->channelMap));
-        data.periodSizeInFramesIn       = pDescriptorCapture->periodSizeInFrames;
-        data.periodSizeInMillisecondsIn = pDescriptorCapture->periodSizeInMilliseconds;
-        data.periodsIn                  = pDescriptorCapture->periodCount;
-        data.shareMode                  = pDescriptorCapture->shareMode;
-        data.performanceProfile         = pConfig->performanceProfile;
-        data.noAutoConvertSRC           = pConfig->wasapi.noAutoConvertSRC;
-        data.noDefaultQualitySRC        = pConfig->wasapi.noDefaultQualitySRC;
-        data.noHardwareOffloading       = pConfig->wasapi.noHardwareOffloading;
-        data.loopbackProcessID          = pConfig->wasapi.loopbackProcessID;
-        data.loopbackProcessExclude     = pConfig->wasapi.loopbackProcessExclude;
-
-        result = ma_device_init_internal__wasapi(pDevice->pContext, (pConfig->deviceType == ma_device_type_loopback) ? ma_device_type_loopback : ma_device_type_capture, pDescriptorCapture->pDeviceID, &data);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pDevice->wasapi.pAudioClientCapture              = data.pAudioClient;
-        pDevice->wasapi.pCaptureClient                   = data.pCaptureClient;
-        pDevice->wasapi.originalPeriodSizeInMilliseconds = pDescriptorCapture->periodSizeInMilliseconds;
-        pDevice->wasapi.originalPeriodSizeInFrames       = pDescriptorCapture->periodSizeInFrames;
-        pDevice->wasapi.originalPeriods                  = pDescriptorCapture->periodCount;
-        pDevice->wasapi.originalPerformanceProfile       = pConfig->performanceProfile;
-
-        /*
-        The event for capture needs to be manual reset for the same reason as playback. We keep the initial state set to unsignaled,
-        however, because we want to block until we actually have something for the first call to ma_device_read().
-        */
-        pDevice->wasapi.hEventCapture = (ma_handle)CreateEventA(NULL, FALSE, FALSE, NULL);  /* Auto reset, unsignaled by default. */
-        if (pDevice->wasapi.hEventCapture == NULL) {
-            result = ma_result_from_GetLastError(GetLastError());
-
-            if (pDevice->wasapi.pCaptureClient != NULL) {
-                ma_IAudioCaptureClient_Release((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient);
-                pDevice->wasapi.pCaptureClient = NULL;
-            }
-            if (pDevice->wasapi.pAudioClientCapture != NULL) {
-                ma_IAudioClient_Release((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture);
-                pDevice->wasapi.pAudioClientCapture = NULL;
-            }
-
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to create event for capture.");
-            return result;
-        }
-        ma_IAudioClient_SetEventHandle((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture, (HANDLE)pDevice->wasapi.hEventCapture);
-
-        pDevice->wasapi.periodSizeInFramesCapture = data.periodSizeInFramesOut;
-        ma_IAudioClient_GetBufferSize((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture, &pDevice->wasapi.actualBufferSizeInFramesCapture);
-
-        /* We must always have a valid ID. */
-        ma_strcpy_s_WCHAR(pDevice->capture.id.wasapi, sizeof(pDevice->capture.id.wasapi), data.id.wasapi);
-
-        /* The descriptor needs to be updated with actual values. */
-        pDescriptorCapture->format             = data.formatOut;
-        pDescriptorCapture->channels           = data.channelsOut;
-        pDescriptorCapture->sampleRate         = data.sampleRateOut;
-        MA_COPY_MEMORY(pDescriptorCapture->channelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDescriptorCapture->periodSizeInFrames = data.periodSizeInFramesOut;
-        pDescriptorCapture->periodCount        = data.periodsOut;
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_device_init_internal_data__wasapi data;
-        data.formatIn                   = pDescriptorPlayback->format;
-        data.channelsIn                 = pDescriptorPlayback->channels;
-        data.sampleRateIn               = pDescriptorPlayback->sampleRate;
-        MA_COPY_MEMORY(data.channelMapIn, pDescriptorPlayback->channelMap, sizeof(pDescriptorPlayback->channelMap));
-        data.periodSizeInFramesIn       = pDescriptorPlayback->periodSizeInFrames;
-        data.periodSizeInMillisecondsIn = pDescriptorPlayback->periodSizeInMilliseconds;
-        data.periodsIn                  = pDescriptorPlayback->periodCount;
-        data.shareMode                  = pDescriptorPlayback->shareMode;
-        data.performanceProfile         = pConfig->performanceProfile;
-        data.noAutoConvertSRC           = pConfig->wasapi.noAutoConvertSRC;
-        data.noDefaultQualitySRC        = pConfig->wasapi.noDefaultQualitySRC;
-        data.noHardwareOffloading       = pConfig->wasapi.noHardwareOffloading;
-        data.loopbackProcessID          = pConfig->wasapi.loopbackProcessID;
-        data.loopbackProcessExclude     = pConfig->wasapi.loopbackProcessExclude;
-
-        result = ma_device_init_internal__wasapi(pDevice->pContext, ma_device_type_playback, pDescriptorPlayback->pDeviceID, &data);
-        if (result != MA_SUCCESS) {
-            if (pConfig->deviceType == ma_device_type_duplex) {
-                if (pDevice->wasapi.pCaptureClient != NULL) {
-                    ma_IAudioCaptureClient_Release((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient);
-                    pDevice->wasapi.pCaptureClient = NULL;
-                }
-                if (pDevice->wasapi.pAudioClientCapture != NULL) {
-                    ma_IAudioClient_Release((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture);
-                    pDevice->wasapi.pAudioClientCapture = NULL;
-                }
-
-                CloseHandle((HANDLE)pDevice->wasapi.hEventCapture);
-                pDevice->wasapi.hEventCapture = NULL;
-            }
-            return result;
-        }
-
-        pDevice->wasapi.pAudioClientPlayback             = data.pAudioClient;
-        pDevice->wasapi.pRenderClient                    = data.pRenderClient;
-        pDevice->wasapi.originalPeriodSizeInMilliseconds = pDescriptorPlayback->periodSizeInMilliseconds;
-        pDevice->wasapi.originalPeriodSizeInFrames       = pDescriptorPlayback->periodSizeInFrames;
-        pDevice->wasapi.originalPeriods                  = pDescriptorPlayback->periodCount;
-        pDevice->wasapi.originalPerformanceProfile       = pConfig->performanceProfile;
-
-        /*
-        The event for playback is needs to be manual reset because we want to explicitly control the fact that it becomes signalled
-        only after the whole available space has been filled, never before.
-
-        The playback event also needs to be initially set to a signaled state so that the first call to ma_device_write() is able
-        to get passed WaitForMultipleObjects().
-        */
-        pDevice->wasapi.hEventPlayback = (ma_handle)CreateEventA(NULL, FALSE, TRUE, NULL);  /* Auto reset, signaled by default. */
-        if (pDevice->wasapi.hEventPlayback == NULL) {
-            result = ma_result_from_GetLastError(GetLastError());
-
-            if (pConfig->deviceType == ma_device_type_duplex) {
-                if (pDevice->wasapi.pCaptureClient != NULL) {
-                    ma_IAudioCaptureClient_Release((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient);
-                    pDevice->wasapi.pCaptureClient = NULL;
-                }
-                if (pDevice->wasapi.pAudioClientCapture != NULL) {
-                    ma_IAudioClient_Release((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture);
-                    pDevice->wasapi.pAudioClientCapture = NULL;
-                }
-
-                CloseHandle((HANDLE)pDevice->wasapi.hEventCapture);
-                pDevice->wasapi.hEventCapture = NULL;
-            }
-
-            if (pDevice->wasapi.pRenderClient != NULL) {
-                ma_IAudioRenderClient_Release((ma_IAudioRenderClient*)pDevice->wasapi.pRenderClient);
-                pDevice->wasapi.pRenderClient = NULL;
-            }
-            if (pDevice->wasapi.pAudioClientPlayback != NULL) {
-                ma_IAudioClient_Release((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback);
-                pDevice->wasapi.pAudioClientPlayback = NULL;
-            }
-
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to create event for playback.");
-            return result;
-        }
-        ma_IAudioClient_SetEventHandle((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback, (HANDLE)pDevice->wasapi.hEventPlayback);
-
-        pDevice->wasapi.periodSizeInFramesPlayback = data.periodSizeInFramesOut;
-        ma_IAudioClient_GetBufferSize((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback, &pDevice->wasapi.actualBufferSizeInFramesPlayback);
-
-        /* We must always have a valid ID because rerouting will look at it. */
-        ma_strcpy_s_WCHAR(pDevice->playback.id.wasapi, sizeof(pDevice->playback.id.wasapi), data.id.wasapi);
-
-        /* The descriptor needs to be updated with actual values. */
-        pDescriptorPlayback->format             = data.formatOut;
-        pDescriptorPlayback->channels           = data.channelsOut;
-        pDescriptorPlayback->sampleRate         = data.sampleRateOut;
-        MA_COPY_MEMORY(pDescriptorPlayback->channelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDescriptorPlayback->periodSizeInFrames = data.periodSizeInFramesOut;
-        pDescriptorPlayback->periodCount        = data.periodsOut;
-    }
-
-    /*
-    We need to register a notification client to detect when the device has been disabled, unplugged or re-routed (when the default device changes). When
-    we are connecting to the default device we want to do automatic stream routing when the device is disabled or unplugged. Otherwise we want to just
-    stop the device outright and let the application handle it.
-    */
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    if (pConfig->wasapi.noAutoStreamRouting == MA_FALSE) {
-        if ((pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex || pConfig->deviceType == ma_device_type_loopback) && pConfig->capture.pDeviceID == NULL) {
-            pDevice->wasapi.allowCaptureAutoStreamRouting = MA_TRUE;
-        }
-        if ((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pConfig->playback.pDeviceID == NULL) {
-            pDevice->wasapi.allowPlaybackAutoStreamRouting = MA_TRUE;
-        }
-    }
-
-    ma_mutex_init(&pDevice->wasapi.rerouteLock);
-
-    hr = ma_CoCreateInstance(pDevice->pContext, &MA_CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL, &MA_IID_IMMDeviceEnumerator, (void**)&pDeviceEnumerator);
-    if (FAILED(hr)) {
-        ma_device_uninit__wasapi(pDevice);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to create device enumerator.");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    pDevice->wasapi.notificationClient.lpVtbl  = (void*)&g_maNotificationCientVtbl;
-    pDevice->wasapi.notificationClient.counter = 1;
-    pDevice->wasapi.notificationClient.pDevice = pDevice;
-
-    hr = pDeviceEnumerator->lpVtbl->RegisterEndpointNotificationCallback(pDeviceEnumerator, &pDevice->wasapi.notificationClient);
-    if (SUCCEEDED(hr)) {
-        pDevice->wasapi.pDeviceEnumerator = (ma_ptr)pDeviceEnumerator;
-    } else {
-        /* Not the end of the world if we fail to register the notification callback. We just won't support automatic stream routing. */
-        ma_IMMDeviceEnumerator_Release(pDeviceEnumerator);
-    }
-#endif
-
-    ma_atomic_bool32_set(&pDevice->wasapi.isStartedCapture,  MA_FALSE);
-    ma_atomic_bool32_set(&pDevice->wasapi.isStartedPlayback, MA_FALSE);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device__get_available_frames__wasapi(ma_device* pDevice, ma_IAudioClient* pAudioClient, ma_uint32* pFrameCount)
-{
-    ma_uint32 paddingFramesCount;
-    HRESULT hr;
-    ma_share_mode shareMode;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pFrameCount != NULL);
-
-    *pFrameCount = 0;
-
-    if ((ma_ptr)pAudioClient != pDevice->wasapi.pAudioClientPlayback && (ma_ptr)pAudioClient != pDevice->wasapi.pAudioClientCapture) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /*
-    I've had a report that GetCurrentPadding() is returning a frame count of 0 which is preventing
-    higher level function calls from doing anything because it thinks nothing is available. I have
-    taken a look at the documentation and it looks like this is unnecessary in exclusive mode.
-
-    From Microsoft's documentation:
-
-        For an exclusive-mode rendering or capture stream that was initialized with the
-        AUDCLNT_STREAMFLAGS_EVENTCALLBACK flag, the client typically has no use for the padding
-        value reported by GetCurrentPadding. Instead, the client accesses an entire buffer during
-        each processing pass.
-
-    Considering this, I'm going to skip GetCurrentPadding() for exclusive mode and just report the
-    entire buffer. This depends on the caller making sure they wait on the event handler.
-    */
-    shareMode = ((ma_ptr)pAudioClient == pDevice->wasapi.pAudioClientPlayback) ? pDevice->playback.shareMode : pDevice->capture.shareMode;
-    if (shareMode == ma_share_mode_shared) {
-        /* Shared mode. */
-        hr = ma_IAudioClient_GetCurrentPadding(pAudioClient, &paddingFramesCount);
-        if (FAILED(hr)) {
-            return ma_result_from_HRESULT(hr);
-        }
-
-        if ((ma_ptr)pAudioClient == pDevice->wasapi.pAudioClientPlayback) {
-            *pFrameCount = pDevice->wasapi.actualBufferSizeInFramesPlayback - paddingFramesCount;
-        } else {
-            *pFrameCount = paddingFramesCount;
-        }
-    } else {
-        /* Exclusive mode. */
-        if ((ma_ptr)pAudioClient == pDevice->wasapi.pAudioClientPlayback) {
-            *pFrameCount = pDevice->wasapi.actualBufferSizeInFramesPlayback;
-        } else {
-            *pFrameCount = pDevice->wasapi.actualBufferSizeInFramesCapture;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_device_reroute__wasapi(ma_device* pDevice, ma_device_type deviceType)
-{
-    ma_result result;
-
-    if (deviceType == ma_device_type_duplex) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "=== CHANGING DEVICE ===\n");
-
-    result = ma_device_reinit__wasapi(pDevice, deviceType);
-    if (result != MA_SUCCESS) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[WASAPI] Reinitializing device after route change failed.\n");
-        return result;
-    }
-
-    ma_device__post_init_setup(pDevice, deviceType);
-    ma_device__on_notification_rerouted(pDevice);
-
-    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "=== DEVICE CHANGED ===\n");
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__wasapi_nolock(ma_device* pDevice)
-{
-    HRESULT hr;
-
-    if (pDevice->pContext->wasapi.hAvrt) {
-        const char* pTaskName = ma_to_usage_string__wasapi(pDevice->wasapi.usage);
-        if (pTaskName) {
-            DWORD idx = 0;
-            pDevice->wasapi.hAvrtHandle = (ma_handle)((MA_PFN_AvSetMmThreadCharacteristicsA)pDevice->pContext->wasapi.AvSetMmThreadCharacteristicsA)(pTaskName, &idx);
-        }
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex || pDevice->type == ma_device_type_loopback) {
-        hr = ma_IAudioClient_Start((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture);
-        if (FAILED(hr)) {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to start internal capture device. HRESULT = %d.", (int)hr);
-            return ma_result_from_HRESULT(hr);
-        }
-
-        ma_atomic_bool32_set(&pDevice->wasapi.isStartedCapture, MA_TRUE);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        hr = ma_IAudioClient_Start((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback);
-        if (FAILED(hr)) {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to start internal playback device. HRESULT = %d.", (int)hr);
-            return ma_result_from_HRESULT(hr);
-        }
-
-        ma_atomic_bool32_set(&pDevice->wasapi.isStartedPlayback, MA_TRUE);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__wasapi(ma_device* pDevice)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /* Wait for any rerouting to finish before attempting to start the device. */
-    ma_mutex_lock(&pDevice->wasapi.rerouteLock);
-    {
-        result = ma_device_start__wasapi_nolock(pDevice);
-    }
-    ma_mutex_unlock(&pDevice->wasapi.rerouteLock);
-
-    return result;
-}
-
-static ma_result ma_device_stop__wasapi_nolock(ma_device* pDevice)
-{
-    ma_result result;
-    HRESULT hr;
-
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->wasapi.hAvrtHandle) {
-        ((MA_PFN_AvRevertMmThreadCharacteristics)pDevice->pContext->wasapi.AvRevertMmThreadcharacteristics)((HANDLE)pDevice->wasapi.hAvrtHandle);
-        pDevice->wasapi.hAvrtHandle = NULL;
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex || pDevice->type == ma_device_type_loopback) {
-        hr = ma_IAudioClient_Stop((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture);
-        if (FAILED(hr)) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to stop internal capture device.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        /* The audio client needs to be reset otherwise restarting will fail. */
-        hr = ma_IAudioClient_Reset((ma_IAudioClient*)pDevice->wasapi.pAudioClientCapture);
-        if (FAILED(hr)) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to reset internal capture device.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        /* If we have a mapped buffer we need to release it. */
-        if (pDevice->wasapi.pMappedBufferCapture != NULL) {
-            ma_IAudioCaptureClient_ReleaseBuffer((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient, pDevice->wasapi.mappedBufferCaptureCap);
-            pDevice->wasapi.pMappedBufferCapture = NULL;
-            pDevice->wasapi.mappedBufferCaptureCap = 0;
-            pDevice->wasapi.mappedBufferCaptureLen = 0;
-        }
-
-        ma_atomic_bool32_set(&pDevice->wasapi.isStartedCapture, MA_FALSE);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        /*
-        The buffer needs to be drained before stopping the device. Not doing this will result in the last few frames not getting output to
-        the speakers. This is a problem for very short sounds because it'll result in a significant portion of it not getting played.
-        */
-        if (ma_atomic_bool32_get(&pDevice->wasapi.isStartedPlayback)) {
-            /* We need to make sure we put a timeout here or else we'll risk getting stuck in a deadlock in some cases. */
-            DWORD waitTime = pDevice->wasapi.actualBufferSizeInFramesPlayback / pDevice->playback.internalSampleRate;
-
-            if (pDevice->playback.shareMode == ma_share_mode_exclusive) {
-                WaitForSingleObject((HANDLE)pDevice->wasapi.hEventPlayback, waitTime);
-            }
-            else {
-                ma_uint32 prevFramesAvaialablePlayback = (ma_uint32)-1;
-                ma_uint32 framesAvailablePlayback;
-                for (;;) {
-                    result = ma_device__get_available_frames__wasapi(pDevice, (ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback, &framesAvailablePlayback);
-                    if (result != MA_SUCCESS) {
-                        break;
-                    }
-
-                    if (framesAvailablePlayback >= pDevice->wasapi.actualBufferSizeInFramesPlayback) {
-                        break;
-                    }
-
-                    /*
-                    Just a safety check to avoid an infinite loop. If this iteration results in a situation where the number of available frames
-                    has not changed, get out of the loop. I don't think this should ever happen, but I think it's nice to have just in case.
-                    */
-                    if (framesAvailablePlayback == prevFramesAvaialablePlayback) {
-                        break;
-                    }
-                    prevFramesAvaialablePlayback = framesAvailablePlayback;
-
-                    WaitForSingleObject((HANDLE)pDevice->wasapi.hEventPlayback, waitTime * 1000);
-                    ResetEvent((HANDLE)pDevice->wasapi.hEventPlayback); /* Manual reset. */
-                }
-            }
-        }
-
-        hr = ma_IAudioClient_Stop((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback);
-        if (FAILED(hr)) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to stop internal playback device.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        /* The audio client needs to be reset otherwise restarting will fail. */
-        hr = ma_IAudioClient_Reset((ma_IAudioClient*)pDevice->wasapi.pAudioClientPlayback);
-        if (FAILED(hr)) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to reset internal playback device.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        if (pDevice->wasapi.pMappedBufferPlayback != NULL) {
-            ma_IAudioRenderClient_ReleaseBuffer((ma_IAudioRenderClient*)pDevice->wasapi.pRenderClient, pDevice->wasapi.mappedBufferPlaybackCap, 0);
-            pDevice->wasapi.pMappedBufferPlayback = NULL;
-            pDevice->wasapi.mappedBufferPlaybackCap = 0;
-            pDevice->wasapi.mappedBufferPlaybackLen = 0;
-        }
-
-        ma_atomic_bool32_set(&pDevice->wasapi.isStartedPlayback, MA_FALSE);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__wasapi(ma_device* pDevice)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /* Wait for any rerouting to finish before attempting to stop the device. */
-    ma_mutex_lock(&pDevice->wasapi.rerouteLock);
-    {
-        result = ma_device_stop__wasapi_nolock(pDevice);
-    }
-    ma_mutex_unlock(&pDevice->wasapi.rerouteLock);
-
-    return result;
-}
-
-
-#ifndef MA_WASAPI_WAIT_TIMEOUT_MILLISECONDS
-#define MA_WASAPI_WAIT_TIMEOUT_MILLISECONDS 5000
-#endif
-
-static ma_result ma_device_read__wasapi(ma_device* pDevice, void* pFrames, ma_uint32 frameCount, ma_uint32* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint32 totalFramesProcessed = 0;
-
-    /*
-    When reading, we need to get a buffer and process all of it before releasing it. Because the
-    frame count (frameCount) can be different to the size of the buffer, we'll need to cache the
-    pointer to the buffer.
-    */
-
-    /* Keep running until we've processed the requested number of frames. */
-    while (ma_device_get_state(pDevice) == ma_device_state_started && totalFramesProcessed < frameCount) {
-        ma_uint32 framesRemaining = frameCount - totalFramesProcessed;
-
-        /* If we have a mapped data buffer, consume that first. */
-        if (pDevice->wasapi.pMappedBufferCapture != NULL) {
-            /* We have a cached data pointer so consume that before grabbing another one from WASAPI. */
-            ma_uint32 framesToProcessNow = framesRemaining;
-            if (framesToProcessNow > pDevice->wasapi.mappedBufferCaptureLen) {
-                framesToProcessNow = pDevice->wasapi.mappedBufferCaptureLen;
-            }
-
-            /* Now just copy the data over to the output buffer. */
-            ma_copy_pcm_frames(
-                ma_offset_pcm_frames_ptr(pFrames, totalFramesProcessed, pDevice->capture.internalFormat, pDevice->capture.internalChannels),
-                ma_offset_pcm_frames_const_ptr(pDevice->wasapi.pMappedBufferCapture, pDevice->wasapi.mappedBufferCaptureCap - pDevice->wasapi.mappedBufferCaptureLen, pDevice->capture.internalFormat, pDevice->capture.internalChannels),
-                framesToProcessNow,
-                pDevice->capture.internalFormat, pDevice->capture.internalChannels
-            );
-
-            totalFramesProcessed                   += framesToProcessNow;
-            pDevice->wasapi.mappedBufferCaptureLen -= framesToProcessNow;
-
-            /* If the data buffer has been fully consumed we need to release it. */
-            if (pDevice->wasapi.mappedBufferCaptureLen == 0) {
-                ma_IAudioCaptureClient_ReleaseBuffer((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient, pDevice->wasapi.mappedBufferCaptureCap);
-                pDevice->wasapi.pMappedBufferCapture   = NULL;
-                pDevice->wasapi.mappedBufferCaptureCap = 0;
-            }
-        } else {
-            /* We don't have any cached data pointer, so grab another one. */
-            HRESULT hr;
-            DWORD flags = 0;
-
-            /* First just ask WASAPI for a data buffer. If it's not available, we'll wait for more. */
-            hr = ma_IAudioCaptureClient_GetBuffer((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient, (BYTE**)&pDevice->wasapi.pMappedBufferCapture, &pDevice->wasapi.mappedBufferCaptureCap, &flags, NULL, NULL);
-            if (hr == S_OK) {
-                /* We got a data buffer. Continue to the next loop iteration which will then read from the mapped pointer. */
-                pDevice->wasapi.mappedBufferCaptureLen = pDevice->wasapi.mappedBufferCaptureCap;
-
-                /*
-                There have been reports that indicate that at times the AUDCLNT_BUFFERFLAGS_DATA_DISCONTINUITY is reported for every
-                call to IAudioCaptureClient_GetBuffer() above which results in spamming of the debug messages below. To partially
-                work around this, I'm only outputting these messages when MA_DEBUG_OUTPUT is explicitly defined. The better solution
-                would be to figure out why the flag is always getting reported.
-                */
-                #if defined(MA_DEBUG_OUTPUT)
-                {
-                    if (flags != 0) {
-                        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Capture Flags: %ld\n", flags);
-
-                        if ((flags & MA_AUDCLNT_BUFFERFLAGS_DATA_DISCONTINUITY) != 0) {
-                            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Data discontinuity (possible overrun). Attempting recovery. mappedBufferCaptureCap=%d\n", pDevice->wasapi.mappedBufferCaptureCap);
-                        }
-                    }
-                }
-                #endif
-
-                /* Overrun detection. */
-                if ((flags & MA_AUDCLNT_BUFFERFLAGS_DATA_DISCONTINUITY) != 0) {
-                    /* Glitched. Probably due to an overrun. */
-
-                    /*
-                    If we got an overrun it probably means we're straddling the end of the buffer. In normal capture
-                    mode this is the fault of the client application because they're responsible for ensuring data is
-                    processed fast enough. In duplex mode, however, the processing of audio is tied to the playback
-                    device, so this can possibly be the result of a timing de-sync.
-
-                    In capture mode we're not going to do any kind of recovery because the real fix is for the client
-                    application to process faster. In duplex mode, we'll treat this as a desync and reset the buffers
-                    to prevent a never-ending sequence of glitches due to straddling the end of the buffer.
-                    */
-                    if (pDevice->type == ma_device_type_duplex) {
-                        /*
-                        Experiment:
-
-                        If we empty out the *entire* buffer we may end up putting ourselves into an underrun position
-                        which isn't really any better than the overrun we're probably in right now. Instead we'll just
-                        empty out about half.
-                        */
-                        ma_uint32 i;
-                        ma_uint32 periodCount = (pDevice->wasapi.actualBufferSizeInFramesCapture / pDevice->wasapi.periodSizeInFramesCapture);
-                        ma_uint32 iterationCount = periodCount / 2;
-                        if ((periodCount % 2) > 0) {
-                            iterationCount += 1;
-                        }
-
-                        for (i = 0; i < iterationCount; i += 1) {
-                            hr = ma_IAudioCaptureClient_ReleaseBuffer((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient, pDevice->wasapi.mappedBufferCaptureCap);
-                            if (FAILED(hr)) {
-                                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Data discontinuity recovery: IAudioCaptureClient_ReleaseBuffer() failed with %ld.\n", hr);
-                                break;
-                            }
-
-                            flags = 0;
-                            hr = ma_IAudioCaptureClient_GetBuffer((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient, (BYTE**)&pDevice->wasapi.pMappedBufferCapture, &pDevice->wasapi.mappedBufferCaptureCap, &flags, NULL, NULL);
-                            if (hr == MA_AUDCLNT_S_BUFFER_EMPTY || FAILED(hr)) {
-                                /*
-                                The buffer has been completely emptied or an error occurred. In this case we'll need
-                                to reset the state of the mapped buffer which will trigger the next iteration to get
-                                a fresh buffer from WASAPI.
-                                */
-                                pDevice->wasapi.pMappedBufferCapture   = NULL;
-                                pDevice->wasapi.mappedBufferCaptureCap = 0;
-                                pDevice->wasapi.mappedBufferCaptureLen = 0;
-
-                                if (hr == MA_AUDCLNT_S_BUFFER_EMPTY) {
-                                    if ((flags & MA_AUDCLNT_BUFFERFLAGS_DATA_DISCONTINUITY) != 0) {
-                                        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Data discontinuity recovery: Buffer emptied, and data discontinuity still reported.\n");
-                                    } else {
-                                        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Data discontinuity recovery: Buffer emptied.\n");
-                                    }
-                                }
-
-                                if (FAILED(hr)) {
-                                    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[WASAPI] Data discontinuity recovery: IAudioCaptureClient_GetBuffer() failed with %ld.\n", hr);
-                                }
-
-                                break;
-                            }
-                        }
-
-                        /* If at this point we have a valid buffer mapped, make sure the buffer length is set appropriately. */
-                        if (pDevice->wasapi.pMappedBufferCapture != NULL) {
-                            pDevice->wasapi.mappedBufferCaptureLen = pDevice->wasapi.mappedBufferCaptureCap;
-                        }
-                    }
-                }
-
-                continue;
-            } else {
-                if (hr == MA_AUDCLNT_S_BUFFER_EMPTY || hr == MA_AUDCLNT_E_BUFFER_ERROR) {
-                    /*
-                    No data is available. We need to wait for more. There's two situations to consider
-                    here. The first is normal capture mode. If this times out it probably means the
-                    microphone isn't delivering data for whatever reason. In this case we'll just
-                    abort the read and return whatever we were able to get. The other situations is
-                    loopback mode, in which case a timeout probably just means the nothing is playing
-                    through the speakers.
-                    */
-
-                    /* Experiment: Use a shorter timeout for loopback mode. */
-                    DWORD timeoutInMilliseconds = MA_WASAPI_WAIT_TIMEOUT_MILLISECONDS;
-                    if (pDevice->type == ma_device_type_loopback) {
-                        timeoutInMilliseconds = 10;
-                    }
-
-                    if (WaitForSingleObject((HANDLE)pDevice->wasapi.hEventCapture, timeoutInMilliseconds) != WAIT_OBJECT_0) {
-                        if (pDevice->type == ma_device_type_loopback) {
-                            continue;   /* Keep waiting in loopback mode. */
-                        } else {
-                            result = MA_ERROR;
-                            break;      /* Wait failed. */
-                        }
-                    }
-
-                    /* At this point we should be able to loop back to the start of the loop and try retrieving a data buffer again. */
-                } else {
-                    /* An error occurred and we need to abort. */
-                    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to retrieve internal buffer from capture device in preparation for reading from the device. HRESULT = %d. Stopping device.\n", (int)hr);
-                    result = ma_result_from_HRESULT(hr);
-                    break;
-                }
-            }
-        }
-    }
-
-    /*
-    If we were unable to process the entire requested frame count, but we still have a mapped buffer,
-    there's a good chance either an error occurred or the device was stopped mid-read. In this case
-    we'll need to make sure the buffer is released.
-    */
-    if (totalFramesProcessed < frameCount && pDevice->wasapi.pMappedBufferCapture != NULL) {
-        ma_IAudioCaptureClient_ReleaseBuffer((ma_IAudioCaptureClient*)pDevice->wasapi.pCaptureClient, pDevice->wasapi.mappedBufferCaptureCap);
-        pDevice->wasapi.pMappedBufferCapture   = NULL;
-        pDevice->wasapi.mappedBufferCaptureCap = 0;
-        pDevice->wasapi.mappedBufferCaptureLen = 0;
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalFramesProcessed;
-    }
-
-    return result;
-}
-
-static ma_result ma_device_write__wasapi(ma_device* pDevice, const void* pFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint32 totalFramesProcessed = 0;
-
-    /* Keep writing to the device until it's stopped or we've consumed all of our input. */
-    while (ma_device_get_state(pDevice) == ma_device_state_started && totalFramesProcessed < frameCount) {
-        ma_uint32 framesRemaining = frameCount - totalFramesProcessed;
-
-        /*
-        We're going to do this in a similar way to capture. We'll first check if the cached data pointer
-        is valid, and if so, read from that. Otherwise We will call IAudioRenderClient_GetBuffer() with
-        a requested buffer size equal to our actual period size. If it returns AUDCLNT_E_BUFFER_TOO_LARGE
-        it means we need to wait for some data to become available.
-        */
-        if (pDevice->wasapi.pMappedBufferPlayback != NULL) {
-            /* We still have some space available in the mapped data buffer. Write to it. */
-            ma_uint32 framesToProcessNow = framesRemaining;
-            if (framesToProcessNow > (pDevice->wasapi.mappedBufferPlaybackCap - pDevice->wasapi.mappedBufferPlaybackLen)) {
-                framesToProcessNow = (pDevice->wasapi.mappedBufferPlaybackCap - pDevice->wasapi.mappedBufferPlaybackLen);
-            }
-
-            /* Now just copy the data over to the output buffer. */
-            ma_copy_pcm_frames(
-                ma_offset_pcm_frames_ptr(pDevice->wasapi.pMappedBufferPlayback, pDevice->wasapi.mappedBufferPlaybackLen, pDevice->playback.internalFormat, pDevice->playback.internalChannels),
-                ma_offset_pcm_frames_const_ptr(pFrames, totalFramesProcessed, pDevice->playback.internalFormat, pDevice->playback.internalChannels),
-                framesToProcessNow,
-                pDevice->playback.internalFormat, pDevice->playback.internalChannels
-            );
-
-            totalFramesProcessed                    += framesToProcessNow;
-            pDevice->wasapi.mappedBufferPlaybackLen += framesToProcessNow;
-
-            /* If the data buffer has been fully consumed we need to release it. */
-            if (pDevice->wasapi.mappedBufferPlaybackLen == pDevice->wasapi.mappedBufferPlaybackCap) {
-                ma_IAudioRenderClient_ReleaseBuffer((ma_IAudioRenderClient*)pDevice->wasapi.pRenderClient, pDevice->wasapi.mappedBufferPlaybackCap, 0);
-                pDevice->wasapi.pMappedBufferPlayback   = NULL;
-                pDevice->wasapi.mappedBufferPlaybackCap = 0;
-                pDevice->wasapi.mappedBufferPlaybackLen = 0;
-
-                /*
-                In exclusive mode we need to wait here. Exclusive mode is weird because GetBuffer() never
-                seems to return AUDCLNT_E_BUFFER_TOO_LARGE, which is what we normally use to determine
-                whether or not we need to wait for more data.
-                */
-                if (pDevice->playback.shareMode == ma_share_mode_exclusive) {
-                    if (WaitForSingleObject((HANDLE)pDevice->wasapi.hEventPlayback, MA_WASAPI_WAIT_TIMEOUT_MILLISECONDS) != WAIT_OBJECT_0) {
-                        result = MA_ERROR;
-                        break;   /* Wait failed. Probably timed out. */
-                    }
-                }
-            }
-        } else {
-            /* We don't have a mapped data buffer so we'll need to get one. */
-            HRESULT hr;
-            ma_uint32 bufferSizeInFrames;
-
-            /* Special rules for exclusive mode. */
-            if (pDevice->playback.shareMode == ma_share_mode_exclusive) {
-                bufferSizeInFrames = pDevice->wasapi.actualBufferSizeInFramesPlayback;
-            } else {
-                bufferSizeInFrames = pDevice->wasapi.periodSizeInFramesPlayback;
-            }
-
-            hr = ma_IAudioRenderClient_GetBuffer((ma_IAudioRenderClient*)pDevice->wasapi.pRenderClient, bufferSizeInFrames, (BYTE**)&pDevice->wasapi.pMappedBufferPlayback);
-            if (hr == S_OK) {
-                /* We have data available. */
-                pDevice->wasapi.mappedBufferPlaybackCap = bufferSizeInFrames;
-                pDevice->wasapi.mappedBufferPlaybackLen = 0;
-            } else {
-                if (hr == MA_AUDCLNT_E_BUFFER_TOO_LARGE || hr == MA_AUDCLNT_E_BUFFER_ERROR) {
-                    /* Not enough data available. We need to wait for more. */
-                    if (WaitForSingleObject((HANDLE)pDevice->wasapi.hEventPlayback, MA_WASAPI_WAIT_TIMEOUT_MILLISECONDS) != WAIT_OBJECT_0) {
-                        result = MA_ERROR;
-                        break;   /* Wait failed. Probably timed out. */
-                    }
-                } else {
-                    /* Some error occurred. We'll need to abort. */
-                    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WASAPI] Failed to retrieve internal buffer from playback device in preparation for writing to the device. HRESULT = %d. Stopping device.\n", (int)hr);
-                    result = ma_result_from_HRESULT(hr);
-                    break;
-                }
-            }
-        }
-    }
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = totalFramesProcessed;
-    }
-
-    return result;
-}
-
-static ma_result ma_device_data_loop_wakeup__wasapi(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex || pDevice->type == ma_device_type_loopback) {
-        SetEvent((HANDLE)pDevice->wasapi.hEventCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        SetEvent((HANDLE)pDevice->wasapi.hEventPlayback);
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_uninit__wasapi(ma_context* pContext)
-{
-    ma_context_command__wasapi cmd = ma_context_init_command__wasapi(MA_CONTEXT_COMMAND_QUIT__WASAPI);
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_wasapi);
-
-    ma_context_post_command__wasapi(pContext, &cmd);
-    ma_thread_wait(&pContext->wasapi.commandThread);
-
-    if (pContext->wasapi.hAvrt) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->wasapi.hAvrt);
-        pContext->wasapi.hAvrt = NULL;
-    }
-
-    #if defined(MA_WIN32_UWP)
-    {
-        if (pContext->wasapi.hMMDevapi) {
-            ma_dlclose(ma_context_get_log(pContext), pContext->wasapi.hMMDevapi);
-            pContext->wasapi.hMMDevapi = NULL;
-        }
-    }
-    #endif
-
-    /* Only after the thread has been terminated can we uninitialize the sync objects for the command thread. */
-    ma_semaphore_uninit(&pContext->wasapi.commandSem);
-    ma_mutex_uninit(&pContext->wasapi.commandLock);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__wasapi(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    ma_result result = MA_SUCCESS;
-
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig;
-
-#ifdef MA_WIN32_DESKTOP
-    /*
-    WASAPI is only supported in Vista SP1 and newer. The reason for SP1 and not the base version of Vista is that event-driven
-    exclusive mode does not work until SP1.
-
-    Unfortunately older compilers don't define these functions so we need to dynamically load them in order to avoid a link error.
-    */
-    {
-        ma_OSVERSIONINFOEXW osvi;
-        ma_handle kernel32DLL;
-        ma_PFNVerifyVersionInfoW _VerifyVersionInfoW;
-        ma_PFNVerSetConditionMask _VerSetConditionMask;
-
-        kernel32DLL = ma_dlopen(ma_context_get_log(pContext), "kernel32.dll");
-        if (kernel32DLL == NULL) {
-            return MA_NO_BACKEND;
-        }
-
-        _VerifyVersionInfoW  = (ma_PFNVerifyVersionInfoW )ma_dlsym(ma_context_get_log(pContext), kernel32DLL, "VerifyVersionInfoW");
-        _VerSetConditionMask = (ma_PFNVerSetConditionMask)ma_dlsym(ma_context_get_log(pContext), kernel32DLL, "VerSetConditionMask");
-        if (_VerifyVersionInfoW == NULL || _VerSetConditionMask == NULL) {
-            ma_dlclose(ma_context_get_log(pContext), kernel32DLL);
-            return MA_NO_BACKEND;
-        }
-
-        MA_ZERO_OBJECT(&osvi);
-        osvi.dwOSVersionInfoSize = sizeof(osvi);
-        osvi.dwMajorVersion = ((MA_WIN32_WINNT_VISTA >> 8) & 0xFF);
-        osvi.dwMinorVersion = ((MA_WIN32_WINNT_VISTA >> 0) & 0xFF);
-        osvi.wServicePackMajor = 1;
-        if (_VerifyVersionInfoW(&osvi, MA_VER_MAJORVERSION | MA_VER_MINORVERSION | MA_VER_SERVICEPACKMAJOR, _VerSetConditionMask(_VerSetConditionMask(_VerSetConditionMask(0, MA_VER_MAJORVERSION, MA_VER_GREATER_EQUAL), MA_VER_MINORVERSION, MA_VER_GREATER_EQUAL), MA_VER_SERVICEPACKMAJOR, MA_VER_GREATER_EQUAL))) {
-            result = MA_SUCCESS;
-        } else {
-            result = MA_NO_BACKEND;
-        }
-
-        ma_dlclose(ma_context_get_log(pContext), kernel32DLL);
-    }
-#endif
-
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    MA_ZERO_OBJECT(&pContext->wasapi);
-
-
-    #if defined(MA_WIN32_UWP)
-    {
-        /* Link to mmdevapi so we can get access to ActivateAudioInterfaceAsync(). */
-        pContext->wasapi.hMMDevapi = ma_dlopen(ma_context_get_log(pContext), "mmdevapi.dll");
-        if (pContext->wasapi.hMMDevapi) {
-            pContext->wasapi.ActivateAudioInterfaceAsync = ma_dlsym(ma_context_get_log(pContext), pContext->wasapi.hMMDevapi, "ActivateAudioInterfaceAsync");
-            if (pContext->wasapi.ActivateAudioInterfaceAsync == NULL) {
-                ma_dlclose(ma_context_get_log(pContext), pContext->wasapi.hMMDevapi);
-                return MA_NO_BACKEND;   /* ActivateAudioInterfaceAsync() could not be loaded. */
-            }
-        } else {
-            return MA_NO_BACKEND;   /* Failed to load mmdevapi.dll which is required for ActivateAudioInterfaceAsync() */
-        }
-    }
-    #endif
-
-    /* Optionally use the Avrt API to specify the audio thread's latency sensitivity requirements */
-    pContext->wasapi.hAvrt = ma_dlopen(ma_context_get_log(pContext), "avrt.dll");
-    if (pContext->wasapi.hAvrt) {
-        pContext->wasapi.AvSetMmThreadCharacteristicsA   = ma_dlsym(ma_context_get_log(pContext), pContext->wasapi.hAvrt, "AvSetMmThreadCharacteristicsA");
-        pContext->wasapi.AvRevertMmThreadcharacteristics = ma_dlsym(ma_context_get_log(pContext), pContext->wasapi.hAvrt, "AvRevertMmThreadCharacteristics");
-
-        /* If either function could not be found, disable use of avrt entirely. */
-        if (!pContext->wasapi.AvSetMmThreadCharacteristicsA || !pContext->wasapi.AvRevertMmThreadcharacteristics) {
-            pContext->wasapi.AvSetMmThreadCharacteristicsA   = NULL;
-            pContext->wasapi.AvRevertMmThreadcharacteristics = NULL;
-            ma_dlclose(ma_context_get_log(pContext), pContext->wasapi.hAvrt);
-            pContext->wasapi.hAvrt = NULL;
-        }
-    }
-
-
-    /*
-    Annoyingly, WASAPI does not allow you to release an IAudioClient object from a different thread
-    than the one that retrieved it with GetService(). This can result in a deadlock in two
-    situations:
-
-        1) When calling ma_device_uninit() from a different thread to ma_device_init(); and
-        2) When uninitializing and reinitializing the internal IAudioClient object in response to
-           automatic stream routing.
-
-    We could define ma_device_uninit() such that it must be called on the same thread as
-    ma_device_init(). We could also just not release the IAudioClient when performing automatic
-    stream routing to avoid the deadlock. Neither of these are acceptable solutions in my view so
-    we're going to have to work around this with a worker thread. This is not ideal, but I can't
-    think of a better way to do this.
-
-    More information about this can be found here:
-
-        https://docs.microsoft.com/en-us/windows/win32/api/audioclient/nn-audioclient-iaudiorenderclient
-
-    Note this section:
-
-        When releasing an IAudioRenderClient interface instance, the client must call the interface's
-        Release method from the same thread as the call to IAudioClient::GetService that created the
-        object.
-    */
-    {
-        result = ma_mutex_init(&pContext->wasapi.commandLock);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_semaphore_init(0, &pContext->wasapi.commandSem);
-        if (result != MA_SUCCESS) {
-            ma_mutex_uninit(&pContext->wasapi.commandLock);
-            return result;
-        }
-
-        result = ma_thread_create(&pContext->wasapi.commandThread, ma_thread_priority_normal, 0, ma_context_command_thread__wasapi, pContext, &pContext->allocationCallbacks);
-        if (result != MA_SUCCESS) {
-            ma_semaphore_uninit(&pContext->wasapi.commandSem);
-            ma_mutex_uninit(&pContext->wasapi.commandLock);
-            return result;
-        }
-    }
-
-
-    pCallbacks->onContextInit             = ma_context_init__wasapi;
-    pCallbacks->onContextUninit           = ma_context_uninit__wasapi;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__wasapi;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__wasapi;
-    pCallbacks->onDeviceInit              = ma_device_init__wasapi;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__wasapi;
-    pCallbacks->onDeviceStart             = ma_device_start__wasapi;
-    pCallbacks->onDeviceStop              = ma_device_stop__wasapi;
-    pCallbacks->onDeviceRead              = ma_device_read__wasapi;
-    pCallbacks->onDeviceWrite             = ma_device_write__wasapi;
-    pCallbacks->onDeviceDataLoop          = NULL;
-    pCallbacks->onDeviceDataLoopWakeup    = ma_device_data_loop_wakeup__wasapi;
-
-    return MA_SUCCESS;
-}
-#endif
-
-/******************************************************************************
-
-DirectSound Backend
-
-******************************************************************************/
-#ifdef MA_HAS_DSOUND
-/*#include <dsound.h>*/
-
-/*static const GUID MA_GUID_IID_DirectSoundNotify = {0xb0210783, 0x89cd, 0x11d0, {0xaf, 0x08, 0x00, 0xa0, 0xc9, 0x25, 0xcd, 0x16}};*/
-
-/* miniaudio only uses priority or exclusive modes. */
-#define MA_DSSCL_NORMAL                 1
-#define MA_DSSCL_PRIORITY               2
-#define MA_DSSCL_EXCLUSIVE              3
-#define MA_DSSCL_WRITEPRIMARY           4
-
-#define MA_DSCAPS_PRIMARYMONO           0x00000001
-#define MA_DSCAPS_PRIMARYSTEREO         0x00000002
-#define MA_DSCAPS_PRIMARY8BIT           0x00000004
-#define MA_DSCAPS_PRIMARY16BIT          0x00000008
-#define MA_DSCAPS_CONTINUOUSRATE        0x00000010
-#define MA_DSCAPS_EMULDRIVER            0x00000020
-#define MA_DSCAPS_CERTIFIED             0x00000040
-#define MA_DSCAPS_SECONDARYMONO         0x00000100
-#define MA_DSCAPS_SECONDARYSTEREO       0x00000200
-#define MA_DSCAPS_SECONDARY8BIT         0x00000400
-#define MA_DSCAPS_SECONDARY16BIT        0x00000800
-
-#define MA_DSBCAPS_PRIMARYBUFFER        0x00000001
-#define MA_DSBCAPS_STATIC               0x00000002
-#define MA_DSBCAPS_LOCHARDWARE          0x00000004
-#define MA_DSBCAPS_LOCSOFTWARE          0x00000008
-#define MA_DSBCAPS_CTRL3D               0x00000010
-#define MA_DSBCAPS_CTRLFREQUENCY        0x00000020
-#define MA_DSBCAPS_CTRLPAN              0x00000040
-#define MA_DSBCAPS_CTRLVOLUME           0x00000080
-#define MA_DSBCAPS_CTRLPOSITIONNOTIFY   0x00000100
-#define MA_DSBCAPS_CTRLFX               0x00000200
-#define MA_DSBCAPS_STICKYFOCUS          0x00004000
-#define MA_DSBCAPS_GLOBALFOCUS          0x00008000
-#define MA_DSBCAPS_GETCURRENTPOSITION2  0x00010000
-#define MA_DSBCAPS_MUTE3DATMAXDISTANCE  0x00020000
-#define MA_DSBCAPS_LOCDEFER             0x00040000
-#define MA_DSBCAPS_TRUEPLAYPOSITION     0x00080000
-
-#define MA_DSBPLAY_LOOPING              0x00000001
-#define MA_DSBPLAY_LOCHARDWARE          0x00000002
-#define MA_DSBPLAY_LOCSOFTWARE          0x00000004
-#define MA_DSBPLAY_TERMINATEBY_TIME     0x00000008
-#define MA_DSBPLAY_TERMINATEBY_DISTANCE 0x00000010
-#define MA_DSBPLAY_TERMINATEBY_PRIORITY 0x00000020
-
-#define MA_DSCBSTART_LOOPING            0x00000001
-
-typedef struct
-{
-    DWORD dwSize;
-    DWORD dwFlags;
-    DWORD dwBufferBytes;
-    DWORD dwReserved;
-    MA_WAVEFORMATEX* lpwfxFormat;
-    GUID guid3DAlgorithm;
-} MA_DSBUFFERDESC;
-
-typedef struct
-{
-    DWORD dwSize;
-    DWORD dwFlags;
-    DWORD dwBufferBytes;
-    DWORD dwReserved;
-    MA_WAVEFORMATEX* lpwfxFormat;
-    DWORD dwFXCount;
-    void* lpDSCFXDesc;  /* <-- miniaudio doesn't use this, so set to void*. */
-} MA_DSCBUFFERDESC;
-
-typedef struct
-{
-    DWORD dwSize;
-    DWORD dwFlags;
-    DWORD dwMinSecondarySampleRate;
-    DWORD dwMaxSecondarySampleRate;
-    DWORD dwPrimaryBuffers;
-    DWORD dwMaxHwMixingAllBuffers;
-    DWORD dwMaxHwMixingStaticBuffers;
-    DWORD dwMaxHwMixingStreamingBuffers;
-    DWORD dwFreeHwMixingAllBuffers;
-    DWORD dwFreeHwMixingStaticBuffers;
-    DWORD dwFreeHwMixingStreamingBuffers;
-    DWORD dwMaxHw3DAllBuffers;
-    DWORD dwMaxHw3DStaticBuffers;
-    DWORD dwMaxHw3DStreamingBuffers;
-    DWORD dwFreeHw3DAllBuffers;
-    DWORD dwFreeHw3DStaticBuffers;
-    DWORD dwFreeHw3DStreamingBuffers;
-    DWORD dwTotalHwMemBytes;
-    DWORD dwFreeHwMemBytes;
-    DWORD dwMaxContigFreeHwMemBytes;
-    DWORD dwUnlockTransferRateHwBuffers;
-    DWORD dwPlayCpuOverheadSwBuffers;
-    DWORD dwReserved1;
-    DWORD dwReserved2;
-} MA_DSCAPS;
-
-typedef struct
-{
-    DWORD dwSize;
-    DWORD dwFlags;
-    DWORD dwBufferBytes;
-    DWORD dwUnlockTransferRate;
-    DWORD dwPlayCpuOverhead;
-} MA_DSBCAPS;
-
-typedef struct
-{
-    DWORD dwSize;
-    DWORD dwFlags;
-    DWORD dwFormats;
-    DWORD dwChannels;
-} MA_DSCCAPS;
-
-typedef struct
-{
-    DWORD dwSize;
-    DWORD dwFlags;
-    DWORD dwBufferBytes;
-    DWORD dwReserved;
-} MA_DSCBCAPS;
-
-typedef struct
-{
-    DWORD  dwOffset;
-    HANDLE hEventNotify;
-} MA_DSBPOSITIONNOTIFY;
-
-typedef struct ma_IDirectSound              ma_IDirectSound;
-typedef struct ma_IDirectSoundBuffer        ma_IDirectSoundBuffer;
-typedef struct ma_IDirectSoundCapture       ma_IDirectSoundCapture;
-typedef struct ma_IDirectSoundCaptureBuffer ma_IDirectSoundCaptureBuffer;
-typedef struct ma_IDirectSoundNotify        ma_IDirectSoundNotify;
-
-
-/*
-COM objects. The way these work is that you have a vtable (a list of function pointers, kind of
-like how C++ works internally), and then you have a structure with a single member, which is a
-pointer to the vtable. The vtable is where the methods of the object are defined. Methods need
-to be in a specific order, and parent classes need to have their methods declared first.
-*/
-
-/* IDirectSound */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IDirectSound* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IDirectSound* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IDirectSound* pThis);
-
-    /* IDirectSound */
-    HRESULT (STDMETHODCALLTYPE * CreateSoundBuffer)   (ma_IDirectSound* pThis, const MA_DSBUFFERDESC* pDSBufferDesc, ma_IDirectSoundBuffer** ppDSBuffer, void* pUnkOuter);
-    HRESULT (STDMETHODCALLTYPE * GetCaps)             (ma_IDirectSound* pThis, MA_DSCAPS* pDSCaps);
-    HRESULT (STDMETHODCALLTYPE * DuplicateSoundBuffer)(ma_IDirectSound* pThis, ma_IDirectSoundBuffer* pDSBufferOriginal, ma_IDirectSoundBuffer** ppDSBufferDuplicate);
-    HRESULT (STDMETHODCALLTYPE * SetCooperativeLevel) (ma_IDirectSound* pThis, HWND hwnd, DWORD dwLevel);
-    HRESULT (STDMETHODCALLTYPE * Compact)             (ma_IDirectSound* pThis);
-    HRESULT (STDMETHODCALLTYPE * GetSpeakerConfig)    (ma_IDirectSound* pThis, DWORD* pSpeakerConfig);
-    HRESULT (STDMETHODCALLTYPE * SetSpeakerConfig)    (ma_IDirectSound* pThis, DWORD dwSpeakerConfig);
-    HRESULT (STDMETHODCALLTYPE * Initialize)          (ma_IDirectSound* pThis, const GUID* pGuidDevice);
-} ma_IDirectSoundVtbl;
-struct ma_IDirectSound
-{
-    ma_IDirectSoundVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IDirectSound_QueryInterface(ma_IDirectSound* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IDirectSound_AddRef(ma_IDirectSound* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IDirectSound_Release(ma_IDirectSound* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IDirectSound_CreateSoundBuffer(ma_IDirectSound* pThis, const MA_DSBUFFERDESC* pDSBufferDesc, ma_IDirectSoundBuffer** ppDSBuffer, void* pUnkOuter) { return pThis->lpVtbl->CreateSoundBuffer(pThis, pDSBufferDesc, ppDSBuffer, pUnkOuter); }
-static MA_INLINE HRESULT ma_IDirectSound_GetCaps(ma_IDirectSound* pThis, MA_DSCAPS* pDSCaps)                            { return pThis->lpVtbl->GetCaps(pThis, pDSCaps); }
-static MA_INLINE HRESULT ma_IDirectSound_DuplicateSoundBuffer(ma_IDirectSound* pThis, ma_IDirectSoundBuffer* pDSBufferOriginal, ma_IDirectSoundBuffer** ppDSBufferDuplicate) { return pThis->lpVtbl->DuplicateSoundBuffer(pThis, pDSBufferOriginal, ppDSBufferDuplicate); }
-static MA_INLINE HRESULT ma_IDirectSound_SetCooperativeLevel(ma_IDirectSound* pThis, HWND hwnd, DWORD dwLevel)          { return pThis->lpVtbl->SetCooperativeLevel(pThis, hwnd, dwLevel); }
-static MA_INLINE HRESULT ma_IDirectSound_Compact(ma_IDirectSound* pThis)                                                { return pThis->lpVtbl->Compact(pThis); }
-static MA_INLINE HRESULT ma_IDirectSound_GetSpeakerConfig(ma_IDirectSound* pThis, DWORD* pSpeakerConfig)                { return pThis->lpVtbl->GetSpeakerConfig(pThis, pSpeakerConfig); }
-static MA_INLINE HRESULT ma_IDirectSound_SetSpeakerConfig(ma_IDirectSound* pThis, DWORD dwSpeakerConfig)                { return pThis->lpVtbl->SetSpeakerConfig(pThis, dwSpeakerConfig); }
-static MA_INLINE HRESULT ma_IDirectSound_Initialize(ma_IDirectSound* pThis, const GUID* pGuidDevice)                    { return pThis->lpVtbl->Initialize(pThis, pGuidDevice); }
-
-
-/* IDirectSoundBuffer */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IDirectSoundBuffer* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IDirectSoundBuffer* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IDirectSoundBuffer* pThis);
-
-    /* IDirectSoundBuffer */
-    HRESULT (STDMETHODCALLTYPE * GetCaps)           (ma_IDirectSoundBuffer* pThis, MA_DSBCAPS* pDSBufferCaps);
-    HRESULT (STDMETHODCALLTYPE * GetCurrentPosition)(ma_IDirectSoundBuffer* pThis, DWORD* pCurrentPlayCursor, DWORD* pCurrentWriteCursor);
-    HRESULT (STDMETHODCALLTYPE * GetFormat)         (ma_IDirectSoundBuffer* pThis, MA_WAVEFORMATEX* pFormat, DWORD dwSizeAllocated, DWORD* pSizeWritten);
-    HRESULT (STDMETHODCALLTYPE * GetVolume)         (ma_IDirectSoundBuffer* pThis, LONG* pVolume);
-    HRESULT (STDMETHODCALLTYPE * GetPan)            (ma_IDirectSoundBuffer* pThis, LONG* pPan);
-    HRESULT (STDMETHODCALLTYPE * GetFrequency)      (ma_IDirectSoundBuffer* pThis, DWORD* pFrequency);
-    HRESULT (STDMETHODCALLTYPE * GetStatus)         (ma_IDirectSoundBuffer* pThis, DWORD* pStatus);
-    HRESULT (STDMETHODCALLTYPE * Initialize)        (ma_IDirectSoundBuffer* pThis, ma_IDirectSound* pDirectSound, const MA_DSBUFFERDESC* pDSBufferDesc);
-    HRESULT (STDMETHODCALLTYPE * Lock)              (ma_IDirectSoundBuffer* pThis, DWORD dwOffset, DWORD dwBytes, void** ppAudioPtr1, DWORD* pAudioBytes1, void** ppAudioPtr2, DWORD* pAudioBytes2, DWORD dwFlags);
-    HRESULT (STDMETHODCALLTYPE * Play)              (ma_IDirectSoundBuffer* pThis, DWORD dwReserved1, DWORD dwPriority, DWORD dwFlags);
-    HRESULT (STDMETHODCALLTYPE * SetCurrentPosition)(ma_IDirectSoundBuffer* pThis, DWORD dwNewPosition);
-    HRESULT (STDMETHODCALLTYPE * SetFormat)         (ma_IDirectSoundBuffer* pThis, const MA_WAVEFORMATEX* pFormat);
-    HRESULT (STDMETHODCALLTYPE * SetVolume)         (ma_IDirectSoundBuffer* pThis, LONG volume);
-    HRESULT (STDMETHODCALLTYPE * SetPan)            (ma_IDirectSoundBuffer* pThis, LONG pan);
-    HRESULT (STDMETHODCALLTYPE * SetFrequency)      (ma_IDirectSoundBuffer* pThis, DWORD dwFrequency);
-    HRESULT (STDMETHODCALLTYPE * Stop)              (ma_IDirectSoundBuffer* pThis);
-    HRESULT (STDMETHODCALLTYPE * Unlock)            (ma_IDirectSoundBuffer* pThis, void* pAudioPtr1, DWORD dwAudioBytes1, void* pAudioPtr2, DWORD dwAudioBytes2);
-    HRESULT (STDMETHODCALLTYPE * Restore)           (ma_IDirectSoundBuffer* pThis);
-} ma_IDirectSoundBufferVtbl;
-struct ma_IDirectSoundBuffer
-{
-    ma_IDirectSoundBufferVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_QueryInterface(ma_IDirectSoundBuffer* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IDirectSoundBuffer_AddRef(ma_IDirectSoundBuffer* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IDirectSoundBuffer_Release(ma_IDirectSoundBuffer* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_GetCaps(ma_IDirectSoundBuffer* pThis, MA_DSBCAPS* pDSBufferCaps)                     { return pThis->lpVtbl->GetCaps(pThis, pDSBufferCaps); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_GetCurrentPosition(ma_IDirectSoundBuffer* pThis, DWORD* pCurrentPlayCursor, DWORD* pCurrentWriteCursor) { return pThis->lpVtbl->GetCurrentPosition(pThis, pCurrentPlayCursor, pCurrentWriteCursor); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_GetFormat(ma_IDirectSoundBuffer* pThis, MA_WAVEFORMATEX* pFormat, DWORD dwSizeAllocated, DWORD* pSizeWritten) { return pThis->lpVtbl->GetFormat(pThis, pFormat, dwSizeAllocated, pSizeWritten); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_GetVolume(ma_IDirectSoundBuffer* pThis, LONG* pVolume)                               { return pThis->lpVtbl->GetVolume(pThis, pVolume); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_GetPan(ma_IDirectSoundBuffer* pThis, LONG* pPan)                                     { return pThis->lpVtbl->GetPan(pThis, pPan); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_GetFrequency(ma_IDirectSoundBuffer* pThis, DWORD* pFrequency)                        { return pThis->lpVtbl->GetFrequency(pThis, pFrequency); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_GetStatus(ma_IDirectSoundBuffer* pThis, DWORD* pStatus)                              { return pThis->lpVtbl->GetStatus(pThis, pStatus); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_Initialize(ma_IDirectSoundBuffer* pThis, ma_IDirectSound* pDirectSound, const MA_DSBUFFERDESC* pDSBufferDesc) { return pThis->lpVtbl->Initialize(pThis, pDirectSound, pDSBufferDesc); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_Lock(ma_IDirectSoundBuffer* pThis, DWORD dwOffset, DWORD dwBytes, void** ppAudioPtr1, DWORD* pAudioBytes1, void** ppAudioPtr2, DWORD* pAudioBytes2, DWORD dwFlags) { return pThis->lpVtbl->Lock(pThis, dwOffset, dwBytes, ppAudioPtr1, pAudioBytes1, ppAudioPtr2, pAudioBytes2, dwFlags); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_Play(ma_IDirectSoundBuffer* pThis, DWORD dwReserved1, DWORD dwPriority, DWORD dwFlags) { return pThis->lpVtbl->Play(pThis, dwReserved1, dwPriority, dwFlags); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_SetCurrentPosition(ma_IDirectSoundBuffer* pThis, DWORD dwNewPosition)                { return pThis->lpVtbl->SetCurrentPosition(pThis, dwNewPosition); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_SetFormat(ma_IDirectSoundBuffer* pThis, const MA_WAVEFORMATEX* pFormat)              { return pThis->lpVtbl->SetFormat(pThis, pFormat); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_SetVolume(ma_IDirectSoundBuffer* pThis, LONG volume)                                 { return pThis->lpVtbl->SetVolume(pThis, volume); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_SetPan(ma_IDirectSoundBuffer* pThis, LONG pan)                                       { return pThis->lpVtbl->SetPan(pThis, pan); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_SetFrequency(ma_IDirectSoundBuffer* pThis, DWORD dwFrequency)                        { return pThis->lpVtbl->SetFrequency(pThis, dwFrequency); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_Stop(ma_IDirectSoundBuffer* pThis)                                                   { return pThis->lpVtbl->Stop(pThis); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_Unlock(ma_IDirectSoundBuffer* pThis, void* pAudioPtr1, DWORD dwAudioBytes1, void* pAudioPtr2, DWORD dwAudioBytes2) { return pThis->lpVtbl->Unlock(pThis, pAudioPtr1, dwAudioBytes1, pAudioPtr2, dwAudioBytes2); }
-static MA_INLINE HRESULT ma_IDirectSoundBuffer_Restore(ma_IDirectSoundBuffer* pThis)                                                { return pThis->lpVtbl->Restore(pThis); }
-
-
-/* IDirectSoundCapture */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IDirectSoundCapture* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IDirectSoundCapture* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IDirectSoundCapture* pThis);
-
-    /* IDirectSoundCapture */
-    HRESULT (STDMETHODCALLTYPE * CreateCaptureBuffer)(ma_IDirectSoundCapture* pThis, const MA_DSCBUFFERDESC* pDSCBufferDesc, ma_IDirectSoundCaptureBuffer** ppDSCBuffer, void* pUnkOuter);
-    HRESULT (STDMETHODCALLTYPE * GetCaps)            (ma_IDirectSoundCapture* pThis, MA_DSCCAPS* pDSCCaps);
-    HRESULT (STDMETHODCALLTYPE * Initialize)         (ma_IDirectSoundCapture* pThis, const GUID* pGuidDevice);
-} ma_IDirectSoundCaptureVtbl;
-struct ma_IDirectSoundCapture
-{
-    ma_IDirectSoundCaptureVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IDirectSoundCapture_QueryInterface     (ma_IDirectSoundCapture* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IDirectSoundCapture_AddRef             (ma_IDirectSoundCapture* pThis)                                    { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IDirectSoundCapture_Release            (ma_IDirectSoundCapture* pThis)                                    { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IDirectSoundCapture_CreateCaptureBuffer(ma_IDirectSoundCapture* pThis, const MA_DSCBUFFERDESC* pDSCBufferDesc, ma_IDirectSoundCaptureBuffer** ppDSCBuffer, void* pUnkOuter) { return pThis->lpVtbl->CreateCaptureBuffer(pThis, pDSCBufferDesc, ppDSCBuffer, pUnkOuter); }
-static MA_INLINE HRESULT ma_IDirectSoundCapture_GetCaps            (ma_IDirectSoundCapture* pThis, MA_DSCCAPS* pDSCCaps)              { return pThis->lpVtbl->GetCaps(pThis, pDSCCaps); }
-static MA_INLINE HRESULT ma_IDirectSoundCapture_Initialize         (ma_IDirectSoundCapture* pThis, const GUID* pGuidDevice)           { return pThis->lpVtbl->Initialize(pThis, pGuidDevice); }
-
-
-/* IDirectSoundCaptureBuffer */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IDirectSoundCaptureBuffer* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IDirectSoundCaptureBuffer* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IDirectSoundCaptureBuffer* pThis);
-
-    /* IDirectSoundCaptureBuffer */
-    HRESULT (STDMETHODCALLTYPE * GetCaps)           (ma_IDirectSoundCaptureBuffer* pThis, MA_DSCBCAPS* pDSCBCaps);
-    HRESULT (STDMETHODCALLTYPE * GetCurrentPosition)(ma_IDirectSoundCaptureBuffer* pThis, DWORD* pCapturePosition, DWORD* pReadPosition);
-    HRESULT (STDMETHODCALLTYPE * GetFormat)         (ma_IDirectSoundCaptureBuffer* pThis, MA_WAVEFORMATEX* pFormat, DWORD dwSizeAllocated, DWORD* pSizeWritten);
-    HRESULT (STDMETHODCALLTYPE * GetStatus)         (ma_IDirectSoundCaptureBuffer* pThis, DWORD* pStatus);
-    HRESULT (STDMETHODCALLTYPE * Initialize)        (ma_IDirectSoundCaptureBuffer* pThis, ma_IDirectSoundCapture* pDirectSoundCapture, const MA_DSCBUFFERDESC* pDSCBufferDesc);
-    HRESULT (STDMETHODCALLTYPE * Lock)              (ma_IDirectSoundCaptureBuffer* pThis, DWORD dwOffset, DWORD dwBytes, void** ppAudioPtr1, DWORD* pAudioBytes1, void** ppAudioPtr2, DWORD* pAudioBytes2, DWORD dwFlags);
-    HRESULT (STDMETHODCALLTYPE * Start)             (ma_IDirectSoundCaptureBuffer* pThis, DWORD dwFlags);
-    HRESULT (STDMETHODCALLTYPE * Stop)              (ma_IDirectSoundCaptureBuffer* pThis);
-    HRESULT (STDMETHODCALLTYPE * Unlock)            (ma_IDirectSoundCaptureBuffer* pThis, void* pAudioPtr1, DWORD dwAudioBytes1, void* pAudioPtr2, DWORD dwAudioBytes2);
-} ma_IDirectSoundCaptureBufferVtbl;
-struct ma_IDirectSoundCaptureBuffer
-{
-    ma_IDirectSoundCaptureBufferVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_QueryInterface(ma_IDirectSoundCaptureBuffer* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IDirectSoundCaptureBuffer_AddRef(ma_IDirectSoundCaptureBuffer* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IDirectSoundCaptureBuffer_Release(ma_IDirectSoundCaptureBuffer* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_GetCaps(ma_IDirectSoundCaptureBuffer* pThis, MA_DSCBCAPS* pDSCBCaps)                        { return pThis->lpVtbl->GetCaps(pThis, pDSCBCaps); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_GetCurrentPosition(ma_IDirectSoundCaptureBuffer* pThis, DWORD* pCapturePosition, DWORD* pReadPosition) { return pThis->lpVtbl->GetCurrentPosition(pThis, pCapturePosition, pReadPosition); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_GetFormat(ma_IDirectSoundCaptureBuffer* pThis, MA_WAVEFORMATEX* pFormat, DWORD dwSizeAllocated, DWORD* pSizeWritten) { return pThis->lpVtbl->GetFormat(pThis, pFormat, dwSizeAllocated, pSizeWritten); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_GetStatus(ma_IDirectSoundCaptureBuffer* pThis, DWORD* pStatus)                              { return pThis->lpVtbl->GetStatus(pThis, pStatus); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_Initialize(ma_IDirectSoundCaptureBuffer* pThis, ma_IDirectSoundCapture* pDirectSoundCapture, const MA_DSCBUFFERDESC* pDSCBufferDesc) { return pThis->lpVtbl->Initialize(pThis, pDirectSoundCapture, pDSCBufferDesc); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_Lock(ma_IDirectSoundCaptureBuffer* pThis, DWORD dwOffset, DWORD dwBytes, void** ppAudioPtr1, DWORD* pAudioBytes1, void** ppAudioPtr2, DWORD* pAudioBytes2, DWORD dwFlags) { return pThis->lpVtbl->Lock(pThis, dwOffset, dwBytes, ppAudioPtr1, pAudioBytes1, ppAudioPtr2, pAudioBytes2, dwFlags); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_Start(ma_IDirectSoundCaptureBuffer* pThis, DWORD dwFlags)                                   { return pThis->lpVtbl->Start(pThis, dwFlags); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_Stop(ma_IDirectSoundCaptureBuffer* pThis)                                                   { return pThis->lpVtbl->Stop(pThis); }
-static MA_INLINE HRESULT ma_IDirectSoundCaptureBuffer_Unlock(ma_IDirectSoundCaptureBuffer* pThis, void* pAudioPtr1, DWORD dwAudioBytes1, void* pAudioPtr2, DWORD dwAudioBytes2) { return pThis->lpVtbl->Unlock(pThis, pAudioPtr1, dwAudioBytes1, pAudioPtr2, dwAudioBytes2); }
-
-
-/* IDirectSoundNotify */
-typedef struct
-{
-    /* IUnknown */
-    HRESULT (STDMETHODCALLTYPE * QueryInterface)(ma_IDirectSoundNotify* pThis, const IID* const riid, void** ppObject);
-    ULONG   (STDMETHODCALLTYPE * AddRef)        (ma_IDirectSoundNotify* pThis);
-    ULONG   (STDMETHODCALLTYPE * Release)       (ma_IDirectSoundNotify* pThis);
-
-    /* IDirectSoundNotify */
-    HRESULT (STDMETHODCALLTYPE * SetNotificationPositions)(ma_IDirectSoundNotify* pThis, DWORD dwPositionNotifies, const MA_DSBPOSITIONNOTIFY* pPositionNotifies);
-} ma_IDirectSoundNotifyVtbl;
-struct ma_IDirectSoundNotify
-{
-    ma_IDirectSoundNotifyVtbl* lpVtbl;
-};
-static MA_INLINE HRESULT ma_IDirectSoundNotify_QueryInterface(ma_IDirectSoundNotify* pThis, const IID* const riid, void** ppObject) { return pThis->lpVtbl->QueryInterface(pThis, riid, ppObject); }
-static MA_INLINE ULONG   ma_IDirectSoundNotify_AddRef(ma_IDirectSoundNotify* pThis)                                                 { return pThis->lpVtbl->AddRef(pThis); }
-static MA_INLINE ULONG   ma_IDirectSoundNotify_Release(ma_IDirectSoundNotify* pThis)                                                { return pThis->lpVtbl->Release(pThis); }
-static MA_INLINE HRESULT ma_IDirectSoundNotify_SetNotificationPositions(ma_IDirectSoundNotify* pThis, DWORD dwPositionNotifies, const MA_DSBPOSITIONNOTIFY* pPositionNotifies) { return pThis->lpVtbl->SetNotificationPositions(pThis, dwPositionNotifies, pPositionNotifies); }
-
-
-typedef BOOL    (CALLBACK * ma_DSEnumCallbackAProc)             (GUID* pDeviceGUID, const char* pDeviceDescription, const char* pModule, void* pContext);
-typedef HRESULT (WINAPI   * ma_DirectSoundCreateProc)           (const GUID* pcGuidDevice, ma_IDirectSound** ppDS8, ma_IUnknown* pUnkOuter);
-typedef HRESULT (WINAPI   * ma_DirectSoundEnumerateAProc)       (ma_DSEnumCallbackAProc pDSEnumCallback, void* pContext);
-typedef HRESULT (WINAPI   * ma_DirectSoundCaptureCreateProc)    (const GUID* pcGuidDevice, ma_IDirectSoundCapture** ppDSC8, ma_IUnknown* pUnkOuter);
-typedef HRESULT (WINAPI   * ma_DirectSoundCaptureEnumerateAProc)(ma_DSEnumCallbackAProc pDSEnumCallback, void* pContext);
-
-static ma_uint32 ma_get_best_sample_rate_within_range(ma_uint32 sampleRateMin, ma_uint32 sampleRateMax)
-{
-    /* Normalize the range in case we were given something stupid. */
-    if (sampleRateMin < (ma_uint32)ma_standard_sample_rate_min) {
-        sampleRateMin = (ma_uint32)ma_standard_sample_rate_min;
-    }
-    if (sampleRateMax > (ma_uint32)ma_standard_sample_rate_max) {
-        sampleRateMax = (ma_uint32)ma_standard_sample_rate_max;
-    }
-    if (sampleRateMin > sampleRateMax) {
-        sampleRateMin = sampleRateMax;
-    }
-
-    if (sampleRateMin == sampleRateMax) {
-        return sampleRateMax;
-    } else {
-        size_t iStandardRate;
-        for (iStandardRate = 0; iStandardRate < ma_countof(g_maStandardSampleRatePriorities); ++iStandardRate) {
-            ma_uint32 standardRate = g_maStandardSampleRatePriorities[iStandardRate];
-            if (standardRate >= sampleRateMin && standardRate <= sampleRateMax) {
-                return standardRate;
-            }
-        }
-    }
-
-    /* Should never get here. */
-    MA_ASSERT(MA_FALSE);
-    return 0;
-}
-
-/*
-Retrieves the channel count and channel map for the given speaker configuration. If the speaker configuration is unknown,
-the channel count and channel map will be left unmodified.
-*/
-static void ma_get_channels_from_speaker_config__dsound(DWORD speakerConfig, WORD* pChannelsOut, DWORD* pChannelMapOut)
-{
-    WORD  channels;
-    DWORD channelMap;
-
-    channels = 0;
-    if (pChannelsOut != NULL) {
-        channels = *pChannelsOut;
-    }
-
-    channelMap = 0;
-    if (pChannelMapOut != NULL) {
-        channelMap = *pChannelMapOut;
-    }
-
-    /*
-    The speaker configuration is a combination of speaker config and speaker geometry. The lower 8 bits is what we care about. The upper
-    16 bits is for the geometry.
-    */
-    switch ((BYTE)(speakerConfig)) {
-        case 1 /*DSSPEAKER_HEADPHONE*/:                          channels = 2; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT; break;
-        case 2 /*DSSPEAKER_MONO*/:                               channels = 1; channelMap = SPEAKER_FRONT_CENTER; break;
-        case 3 /*DSSPEAKER_QUAD*/:                               channels = 4; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT; break;
-        case 4 /*DSSPEAKER_STEREO*/:                             channels = 2; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT; break;
-        case 5 /*DSSPEAKER_SURROUND*/:                           channels = 4; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | SPEAKER_BACK_CENTER; break;
-        case 6 /*DSSPEAKER_5POINT1_BACK*/ /*DSSPEAKER_5POINT1*/: channels = 6; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | SPEAKER_LOW_FREQUENCY | SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT; break;
-        case 7 /*DSSPEAKER_7POINT1_WIDE*/ /*DSSPEAKER_7POINT1*/: channels = 8; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | SPEAKER_LOW_FREQUENCY | SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT | SPEAKER_FRONT_LEFT_OF_CENTER | SPEAKER_FRONT_RIGHT_OF_CENTER; break;
-        case 8 /*DSSPEAKER_7POINT1_SURROUND*/:                   channels = 8; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | SPEAKER_LOW_FREQUENCY | SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT | SPEAKER_SIDE_LEFT | SPEAKER_SIDE_RIGHT; break;
-        case 9 /*DSSPEAKER_5POINT1_SURROUND*/:                   channels = 6; channelMap = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | SPEAKER_LOW_FREQUENCY | SPEAKER_SIDE_LEFT | SPEAKER_SIDE_RIGHT; break;
-        default: break;
-    }
-
-    if (pChannelsOut != NULL) {
-        *pChannelsOut = channels;
-    }
-
-    if (pChannelMapOut != NULL) {
-        *pChannelMapOut = channelMap;
-    }
-}
-
-
-static ma_result ma_context_create_IDirectSound__dsound(ma_context* pContext, ma_share_mode shareMode, const ma_device_id* pDeviceID, ma_IDirectSound** ppDirectSound)
-{
-    ma_IDirectSound* pDirectSound;
-    HWND hWnd;
-    HRESULT hr;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppDirectSound != NULL);
-
-    *ppDirectSound = NULL;
-    pDirectSound = NULL;
-
-    if (FAILED(((ma_DirectSoundCreateProc)pContext->dsound.DirectSoundCreate)((pDeviceID == NULL) ? NULL : (const GUID*)pDeviceID->dsound, &pDirectSound, NULL))) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[DirectSound] DirectSoundCreate() failed for playback device.");
-        return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-    }
-
-    /* The cooperative level must be set before doing anything else. */
-    hWnd = ((MA_PFN_GetForegroundWindow)pContext->win32.GetForegroundWindow)();
-    if (hWnd == 0) {
-        hWnd = ((MA_PFN_GetDesktopWindow)pContext->win32.GetDesktopWindow)();
-    }
-
-    hr = ma_IDirectSound_SetCooperativeLevel(pDirectSound, hWnd, (shareMode == ma_share_mode_exclusive) ? MA_DSSCL_EXCLUSIVE : MA_DSSCL_PRIORITY);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSound_SetCooperateiveLevel() failed for playback device.");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    *ppDirectSound = pDirectSound;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_create_IDirectSoundCapture__dsound(ma_context* pContext, ma_share_mode shareMode, const ma_device_id* pDeviceID, ma_IDirectSoundCapture** ppDirectSoundCapture)
-{
-    ma_IDirectSoundCapture* pDirectSoundCapture;
-    HRESULT hr;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppDirectSoundCapture != NULL);
-
-    /* DirectSound does not support exclusive mode for capture. */
-    if (shareMode == ma_share_mode_exclusive) {
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-
-    *ppDirectSoundCapture = NULL;
-    pDirectSoundCapture = NULL;
-
-    hr = ((ma_DirectSoundCaptureCreateProc)pContext->dsound.DirectSoundCaptureCreate)((pDeviceID == NULL) ? NULL : (const GUID*)pDeviceID->dsound, &pDirectSoundCapture, NULL);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[DirectSound] DirectSoundCaptureCreate() failed for capture device.");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    *ppDirectSoundCapture = pDirectSoundCapture;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_format_info_for_IDirectSoundCapture__dsound(ma_context* pContext, ma_IDirectSoundCapture* pDirectSoundCapture, WORD* pChannels, WORD* pBitsPerSample, DWORD* pSampleRate)
-{
-    HRESULT hr;
-    MA_DSCCAPS caps;
-    WORD bitsPerSample;
-    DWORD sampleRate;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pDirectSoundCapture != NULL);
-
-    if (pChannels) {
-        *pChannels = 0;
-    }
-    if (pBitsPerSample) {
-        *pBitsPerSample = 0;
-    }
-    if (pSampleRate) {
-        *pSampleRate = 0;
-    }
-
-    MA_ZERO_OBJECT(&caps);
-    caps.dwSize = sizeof(caps);
-    hr = ma_IDirectSoundCapture_GetCaps(pDirectSoundCapture, &caps);
-    if (FAILED(hr)) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundCapture_GetCaps() failed for capture device.");
-        return ma_result_from_HRESULT(hr);
-    }
-
-    if (pChannels) {
-        *pChannels = (WORD)caps.dwChannels;
-    }
-
-    /* The device can support multiple formats. We just go through the different formats in order of priority and pick the first one. This the same type of system as the WinMM backend. */
-    bitsPerSample = 16;
-    sampleRate = 48000;
-
-    if (caps.dwChannels == 1) {
-        if ((caps.dwFormats & WAVE_FORMAT_48M16) != 0) {
-            sampleRate = 48000;
-        } else if ((caps.dwFormats & WAVE_FORMAT_44M16) != 0) {
-            sampleRate = 44100;
-        } else if ((caps.dwFormats & WAVE_FORMAT_2M16) != 0) {
-            sampleRate = 22050;
-        } else if ((caps.dwFormats & WAVE_FORMAT_1M16) != 0) {
-            sampleRate = 11025;
-        } else if ((caps.dwFormats & WAVE_FORMAT_96M16) != 0) {
-            sampleRate = 96000;
-        } else {
-            bitsPerSample = 8;
-            if ((caps.dwFormats & WAVE_FORMAT_48M08) != 0) {
-                sampleRate = 48000;
-            } else if ((caps.dwFormats & WAVE_FORMAT_44M08) != 0) {
-                sampleRate = 44100;
-            } else if ((caps.dwFormats & WAVE_FORMAT_2M08) != 0) {
-                sampleRate = 22050;
-            } else if ((caps.dwFormats & WAVE_FORMAT_1M08) != 0) {
-                sampleRate = 11025;
-            } else if ((caps.dwFormats & WAVE_FORMAT_96M08) != 0) {
-                sampleRate = 96000;
-            } else {
-                bitsPerSample = 16;  /* Didn't find it. Just fall back to 16-bit. */
-            }
-        }
-    } else if (caps.dwChannels == 2) {
-        if ((caps.dwFormats & WAVE_FORMAT_48S16) != 0) {
-            sampleRate = 48000;
-        } else if ((caps.dwFormats & WAVE_FORMAT_44S16) != 0) {
-            sampleRate = 44100;
-        } else if ((caps.dwFormats & WAVE_FORMAT_2S16) != 0) {
-            sampleRate = 22050;
-        } else if ((caps.dwFormats & WAVE_FORMAT_1S16) != 0) {
-            sampleRate = 11025;
-        } else if ((caps.dwFormats & WAVE_FORMAT_96S16) != 0) {
-            sampleRate = 96000;
-        } else {
-            bitsPerSample = 8;
-            if ((caps.dwFormats & WAVE_FORMAT_48S08) != 0) {
-                sampleRate = 48000;
-            } else if ((caps.dwFormats & WAVE_FORMAT_44S08) != 0) {
-                sampleRate = 44100;
-            } else if ((caps.dwFormats & WAVE_FORMAT_2S08) != 0) {
-                sampleRate = 22050;
-            } else if ((caps.dwFormats & WAVE_FORMAT_1S08) != 0) {
-                sampleRate = 11025;
-            } else if ((caps.dwFormats & WAVE_FORMAT_96S08) != 0) {
-                sampleRate = 96000;
-            } else {
-                bitsPerSample = 16;  /* Didn't find it. Just fall back to 16-bit. */
-            }
-        }
-    }
-
-    if (pBitsPerSample) {
-        *pBitsPerSample = bitsPerSample;
-    }
-    if (pSampleRate) {
-        *pSampleRate = sampleRate;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-typedef struct
-{
-    ma_context* pContext;
-    ma_device_type deviceType;
-    ma_enum_devices_callback_proc callback;
-    void* pUserData;
-    ma_bool32 terminated;
-} ma_context_enumerate_devices_callback_data__dsound;
-
-static BOOL CALLBACK ma_context_enumerate_devices_callback__dsound(GUID* lpGuid, const char* lpcstrDescription, const char* lpcstrModule, void* lpContext)
-{
-    ma_context_enumerate_devices_callback_data__dsound* pData = (ma_context_enumerate_devices_callback_data__dsound*)lpContext;
-    ma_device_info deviceInfo;
-
-    (void)lpcstrModule;
-
-    MA_ZERO_OBJECT(&deviceInfo);
-
-    /* ID. */
-    if (lpGuid != NULL) {
-        MA_COPY_MEMORY(deviceInfo.id.dsound, lpGuid, 16);
-    } else {
-        MA_ZERO_MEMORY(deviceInfo.id.dsound, 16);
-        deviceInfo.isDefault = MA_TRUE;
-    }
-
-    /* Name / Description */
-    ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), lpcstrDescription, (size_t)-1);
-
-
-    /* Call the callback function, but make sure we stop enumerating if the callee requested so. */
-    MA_ASSERT(pData != NULL);
-    pData->terminated = (pData->callback(pData->pContext, pData->deviceType, &deviceInfo, pData->pUserData) == MA_FALSE);
-    if (pData->terminated) {
-        return FALSE;   /* Stop enumeration. */
-    } else {
-        return TRUE;    /* Continue enumeration. */
-    }
-}
-
-static ma_result ma_context_enumerate_devices__dsound(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_context_enumerate_devices_callback_data__dsound data;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    data.pContext = pContext;
-    data.callback = callback;
-    data.pUserData = pUserData;
-    data.terminated = MA_FALSE;
-
-    /* Playback. */
-    if (!data.terminated) {
-        data.deviceType = ma_device_type_playback;
-        ((ma_DirectSoundEnumerateAProc)pContext->dsound.DirectSoundEnumerateA)(ma_context_enumerate_devices_callback__dsound, &data);
-    }
-
-    /* Capture. */
-    if (!data.terminated) {
-        data.deviceType = ma_device_type_capture;
-        ((ma_DirectSoundCaptureEnumerateAProc)pContext->dsound.DirectSoundCaptureEnumerateA)(ma_context_enumerate_devices_callback__dsound, &data);
-    }
-
-    return MA_SUCCESS;
-}
-
-
-typedef struct
-{
-    const ma_device_id* pDeviceID;
-    ma_device_info* pDeviceInfo;
-    ma_bool32 found;
-} ma_context_get_device_info_callback_data__dsound;
-
-static BOOL CALLBACK ma_context_get_device_info_callback__dsound(GUID* lpGuid, const char* lpcstrDescription, const char* lpcstrModule, void* lpContext)
-{
-    ma_context_get_device_info_callback_data__dsound* pData = (ma_context_get_device_info_callback_data__dsound*)lpContext;
-    MA_ASSERT(pData != NULL);
-
-    if ((pData->pDeviceID == NULL || ma_is_guid_null(pData->pDeviceID->dsound)) && (lpGuid == NULL || ma_is_guid_null(lpGuid))) {
-        /* Default device. */
-        ma_strncpy_s(pData->pDeviceInfo->name, sizeof(pData->pDeviceInfo->name), lpcstrDescription, (size_t)-1);
-        pData->pDeviceInfo->isDefault = MA_TRUE;
-        pData->found = MA_TRUE;
-        return FALSE;   /* Stop enumeration. */
-    } else {
-        /* Not the default device. */
-        if (lpGuid != NULL && pData->pDeviceID != NULL) {
-            if (memcmp(pData->pDeviceID->dsound, lpGuid, sizeof(pData->pDeviceID->dsound)) == 0) {
-                ma_strncpy_s(pData->pDeviceInfo->name, sizeof(pData->pDeviceInfo->name), lpcstrDescription, (size_t)-1);
-                pData->found = MA_TRUE;
-                return FALSE;   /* Stop enumeration. */
-            }
-        }
-    }
-
-    (void)lpcstrModule;
-    return TRUE;
-}
-
-static ma_result ma_context_get_device_info__dsound(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_result result;
-    HRESULT hr;
-
-    if (pDeviceID != NULL) {
-        ma_context_get_device_info_callback_data__dsound data;
-
-        /* ID. */
-        MA_COPY_MEMORY(pDeviceInfo->id.dsound, pDeviceID->dsound, 16);
-
-        /* Name / Description. This is retrieved by enumerating over each device until we find that one that matches the input ID. */
-        data.pDeviceID = pDeviceID;
-        data.pDeviceInfo = pDeviceInfo;
-        data.found = MA_FALSE;
-        if (deviceType == ma_device_type_playback) {
-            ((ma_DirectSoundEnumerateAProc)pContext->dsound.DirectSoundEnumerateA)(ma_context_get_device_info_callback__dsound, &data);
-        } else {
-            ((ma_DirectSoundCaptureEnumerateAProc)pContext->dsound.DirectSoundCaptureEnumerateA)(ma_context_get_device_info_callback__dsound, &data);
-        }
-
-        if (!data.found) {
-            return MA_NO_DEVICE;
-        }
-    } else {
-        /* I don't think there's a way to get the name of the default device with DirectSound. In this case we just need to use defaults. */
-
-        /* ID */
-        MA_ZERO_MEMORY(pDeviceInfo->id.dsound, 16);
-
-        /* Name / Description */
-        if (deviceType == ma_device_type_playback) {
-            ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-        } else {
-            ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-        }
-
-        pDeviceInfo->isDefault = MA_TRUE;
-    }
-
-    /* Retrieving detailed information is slightly different depending on the device type. */
-    if (deviceType == ma_device_type_playback) {
-        /* Playback. */
-        ma_IDirectSound* pDirectSound;
-        MA_DSCAPS caps;
-        WORD channels;
-
-        result = ma_context_create_IDirectSound__dsound(pContext, ma_share_mode_shared, pDeviceID, &pDirectSound);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        MA_ZERO_OBJECT(&caps);
-        caps.dwSize = sizeof(caps);
-        hr = ma_IDirectSound_GetCaps(pDirectSound, &caps);
-        if (FAILED(hr)) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSound_GetCaps() failed for playback device.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-
-        /* Channels. Only a single channel count is reported for DirectSound. */
-        if ((caps.dwFlags & MA_DSCAPS_PRIMARYSTEREO) != 0) {
-            /* It supports at least stereo, but could support more. */
-            DWORD speakerConfig;
-
-            channels = 2;
-
-            /* Look at the speaker configuration to get a better idea on the channel count. */
-            hr = ma_IDirectSound_GetSpeakerConfig(pDirectSound, &speakerConfig);
-            if (SUCCEEDED(hr)) {
-                ma_get_channels_from_speaker_config__dsound(speakerConfig, &channels, NULL);
-            }
-        } else {
-            /* It does not support stereo, which means we are stuck with mono. */
-            channels = 1;
-        }
-
-
-        /*
-        In DirectSound, our native formats are centered around sample rates. All formats are supported, and we're only reporting a single channel
-        count. However, DirectSound can report a range of supported sample rates. We're only going to include standard rates known by miniaudio
-        in order to keep the size of this within reason.
-        */
-        if ((caps.dwFlags & MA_DSCAPS_CONTINUOUSRATE) != 0) {
-            /* Multiple sample rates are supported. We'll report in order of our preferred sample rates. */
-            size_t iStandardSampleRate;
-            for (iStandardSampleRate = 0; iStandardSampleRate < ma_countof(g_maStandardSampleRatePriorities); iStandardSampleRate += 1) {
-                ma_uint32 sampleRate = g_maStandardSampleRatePriorities[iStandardSampleRate];
-                if (sampleRate >= caps.dwMinSecondarySampleRate && sampleRate <= caps.dwMaxSecondarySampleRate) {
-                    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].format     = ma_format_unknown;
-                    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].channels   = channels;
-                    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].sampleRate = sampleRate;
-                    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].flags      = 0;
-                    pDeviceInfo->nativeDataFormatCount += 1;
-                }
-            }
-        } else {
-            /* Only a single sample rate is supported. */
-            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].format     = ma_format_unknown;
-            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].channels   = channels;
-            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].sampleRate = caps.dwMaxSecondarySampleRate;
-            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].flags      = 0;
-            pDeviceInfo->nativeDataFormatCount += 1;
-        }
-
-        ma_IDirectSound_Release(pDirectSound);
-    } else {
-        /*
-        Capture. This is a little different to playback due to the say the supported formats are reported. Technically capture
-        devices can support a number of different formats, but for simplicity and consistency with ma_device_init() I'm just
-        reporting the best format.
-        */
-        ma_IDirectSoundCapture* pDirectSoundCapture;
-        WORD channels;
-        WORD bitsPerSample;
-        DWORD sampleRate;
-
-        result = ma_context_create_IDirectSoundCapture__dsound(pContext, ma_share_mode_shared, pDeviceID, &pDirectSoundCapture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_context_get_format_info_for_IDirectSoundCapture__dsound(pContext, pDirectSoundCapture, &channels, &bitsPerSample, &sampleRate);
-        if (result != MA_SUCCESS) {
-            ma_IDirectSoundCapture_Release(pDirectSoundCapture);
-            return result;
-        }
-
-        ma_IDirectSoundCapture_Release(pDirectSoundCapture);
-
-        /* The format is always an integer format and is based on the bits per sample. */
-        if (bitsPerSample == 8) {
-            pDeviceInfo->nativeDataFormats[0].format = ma_format_u8;
-        } else if (bitsPerSample == 16) {
-            pDeviceInfo->nativeDataFormats[0].format = ma_format_s16;
-        } else if (bitsPerSample == 24) {
-            pDeviceInfo->nativeDataFormats[0].format = ma_format_s24;
-        } else if (bitsPerSample == 32) {
-            pDeviceInfo->nativeDataFormats[0].format = ma_format_s32;
-        } else {
-            return MA_FORMAT_NOT_SUPPORTED;
-        }
-
-        pDeviceInfo->nativeDataFormats[0].channels   = channels;
-        pDeviceInfo->nativeDataFormats[0].sampleRate = sampleRate;
-        pDeviceInfo->nativeDataFormats[0].flags      = 0;
-        pDeviceInfo->nativeDataFormatCount = 1;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-
-static ma_result ma_device_uninit__dsound(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->dsound.pCaptureBuffer != NULL) {
-        ma_IDirectSoundCaptureBuffer_Release((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer);
-    }
-    if (pDevice->dsound.pCapture != NULL) {
-        ma_IDirectSoundCapture_Release((ma_IDirectSoundCapture*)pDevice->dsound.pCapture);
-    }
-
-    if (pDevice->dsound.pPlaybackBuffer != NULL) {
-        ma_IDirectSoundBuffer_Release((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer);
-    }
-    if (pDevice->dsound.pPlaybackPrimaryBuffer != NULL) {
-        ma_IDirectSoundBuffer_Release((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackPrimaryBuffer);
-    }
-    if (pDevice->dsound.pPlayback != NULL) {
-        ma_IDirectSound_Release((ma_IDirectSound*)pDevice->dsound.pPlayback);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_config_to_WAVEFORMATEXTENSIBLE(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, const ma_channel* pChannelMap, MA_WAVEFORMATEXTENSIBLE* pWF)
-{
-    GUID subformat;
-
-    if (format == ma_format_unknown) {
-        format = MA_DEFAULT_FORMAT;
-    }
-
-    if (channels == 0) {
-        channels = MA_DEFAULT_CHANNELS;
-    }
-
-    if (sampleRate == 0) {
-        sampleRate = MA_DEFAULT_SAMPLE_RATE;
-    }
-
-    switch (format)
-    {
-        case ma_format_u8:
-        case ma_format_s16:
-        case ma_format_s24:
-        /*case ma_format_s24_32:*/
-        case ma_format_s32:
-        {
-            subformat = MA_GUID_KSDATAFORMAT_SUBTYPE_PCM;
-        } break;
-
-        case ma_format_f32:
-        {
-            subformat = MA_GUID_KSDATAFORMAT_SUBTYPE_IEEE_FLOAT;
-        } break;
-
-        default:
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-
-    MA_ZERO_OBJECT(pWF);
-    pWF->cbSize                      = sizeof(*pWF);
-    pWF->wFormatTag                  = WAVE_FORMAT_EXTENSIBLE;
-    pWF->nChannels                   = (WORD)channels;
-    pWF->nSamplesPerSec              = (DWORD)sampleRate;
-    pWF->wBitsPerSample              = (WORD)(ma_get_bytes_per_sample(format)*8);
-    pWF->nBlockAlign                 = (WORD)(pWF->nChannels * pWF->wBitsPerSample / 8);
-    pWF->nAvgBytesPerSec             = pWF->nBlockAlign * pWF->nSamplesPerSec;
-    pWF->Samples.wValidBitsPerSample = pWF->wBitsPerSample;
-    pWF->dwChannelMask               = ma_channel_map_to_channel_mask__win32(pChannelMap, channels);
-    pWF->SubFormat                   = subformat;
-
-    return MA_SUCCESS;
-}
-
-static ma_uint32 ma_calculate_period_size_in_frames_from_descriptor__dsound(const ma_device_descriptor* pDescriptor, ma_uint32 nativeSampleRate, ma_performance_profile performanceProfile)
-{
-    /*
-    DirectSound has a minimum period size of 20ms. In practice, this doesn't seem to be enough for
-    reliable glitch-free processing so going to use 30ms instead.
-    */
-    ma_uint32 minPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(30, nativeSampleRate);
-    ma_uint32 periodSizeInFrames;
-
-    periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, nativeSampleRate, performanceProfile);
-    if (periodSizeInFrames < minPeriodSizeInFrames) {
-        periodSizeInFrames = minPeriodSizeInFrames;
-    }
-
-    return periodSizeInFrames;
-}
-
-static ma_result ma_device_init__dsound(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    ma_result result;
-    HRESULT hr;
-
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->dsound);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /*
-    Unfortunately DirectSound uses different APIs and data structures for playback and catpure devices. We need to initialize
-    the capture device first because we'll want to match it's buffer size and period count on the playback side if we're using
-    full-duplex mode.
-    */
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        MA_WAVEFORMATEXTENSIBLE wf;
-        MA_DSCBUFFERDESC descDS;
-        ma_uint32 periodSizeInFrames;
-        ma_uint32 periodCount;
-        char rawdata[1024]; /* <-- Ugly hack to avoid a malloc() due to a crappy DirectSound API. */
-        MA_WAVEFORMATEXTENSIBLE* pActualFormat;
-
-        result = ma_config_to_WAVEFORMATEXTENSIBLE(pDescriptorCapture->format, pDescriptorCapture->channels, pDescriptorCapture->sampleRate, pDescriptorCapture->channelMap, &wf);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_context_create_IDirectSoundCapture__dsound(pDevice->pContext, pDescriptorCapture->shareMode, pDescriptorCapture->pDeviceID, (ma_IDirectSoundCapture**)&pDevice->dsound.pCapture);
-        if (result != MA_SUCCESS) {
-            ma_device_uninit__dsound(pDevice);
-            return result;
-        }
-
-        result = ma_context_get_format_info_for_IDirectSoundCapture__dsound(pDevice->pContext, (ma_IDirectSoundCapture*)pDevice->dsound.pCapture, &wf.nChannels, &wf.wBitsPerSample, &wf.nSamplesPerSec);
-        if (result != MA_SUCCESS) {
-            ma_device_uninit__dsound(pDevice);
-            return result;
-        }
-
-        wf.nBlockAlign                 = (WORD)(wf.nChannels * wf.wBitsPerSample / 8);
-        wf.nAvgBytesPerSec             = wf.nBlockAlign * wf.nSamplesPerSec;
-        wf.Samples.wValidBitsPerSample = wf.wBitsPerSample;
-        wf.SubFormat                   = MA_GUID_KSDATAFORMAT_SUBTYPE_PCM;
-
-        /* The size of the buffer must be a clean multiple of the period count. */
-        periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__dsound(pDescriptorCapture, wf.nSamplesPerSec, pConfig->performanceProfile);
-        periodCount = (pDescriptorCapture->periodCount > 0) ? pDescriptorCapture->periodCount : MA_DEFAULT_PERIODS;
-
-        MA_ZERO_OBJECT(&descDS);
-        descDS.dwSize        = sizeof(descDS);
-        descDS.dwFlags       = 0;
-        descDS.dwBufferBytes = periodSizeInFrames * periodCount * wf.nBlockAlign;
-        descDS.lpwfxFormat   = (MA_WAVEFORMATEX*)&wf;
-        hr = ma_IDirectSoundCapture_CreateCaptureBuffer((ma_IDirectSoundCapture*)pDevice->dsound.pCapture, &descDS, (ma_IDirectSoundCaptureBuffer**)&pDevice->dsound.pCaptureBuffer, NULL);
-        if (FAILED(hr)) {
-            ma_device_uninit__dsound(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundCapture_CreateCaptureBuffer() failed for capture device.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        /* Get the _actual_ properties of the buffer. */
-        pActualFormat = (MA_WAVEFORMATEXTENSIBLE*)rawdata;
-        hr = ma_IDirectSoundCaptureBuffer_GetFormat((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, (MA_WAVEFORMATEX*)pActualFormat, sizeof(rawdata), NULL);
-        if (FAILED(hr)) {
-            ma_device_uninit__dsound(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to retrieve the actual format of the capture device's buffer.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        /* We can now start setting the output data formats. */
-        pDescriptorCapture->format     = ma_format_from_WAVEFORMATEX((MA_WAVEFORMATEX*)pActualFormat);
-        pDescriptorCapture->channels   = pActualFormat->nChannels;
-        pDescriptorCapture->sampleRate = pActualFormat->nSamplesPerSec;
-
-        /* Get the native channel map based on the channel mask. */
-        if (pActualFormat->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
-            ma_channel_mask_to_channel_map__win32(pActualFormat->dwChannelMask, pDescriptorCapture->channels, pDescriptorCapture->channelMap);
-        } else {
-            ma_channel_mask_to_channel_map__win32(wf.dwChannelMask, pDescriptorCapture->channels, pDescriptorCapture->channelMap);
-        }
-
-        /*
-        After getting the actual format the size of the buffer in frames may have actually changed. However, we want this to be as close to what the
-        user has asked for as possible, so let's go ahead and release the old capture buffer and create a new one in this case.
-        */
-        if (periodSizeInFrames != (descDS.dwBufferBytes / ma_get_bytes_per_frame(pDescriptorCapture->format, pDescriptorCapture->channels) / periodCount)) {
-            descDS.dwBufferBytes = periodSizeInFrames * ma_get_bytes_per_frame(pDescriptorCapture->format, pDescriptorCapture->channels) * periodCount;
-            ma_IDirectSoundCaptureBuffer_Release((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer);
-
-            hr = ma_IDirectSoundCapture_CreateCaptureBuffer((ma_IDirectSoundCapture*)pDevice->dsound.pCapture, &descDS, (ma_IDirectSoundCaptureBuffer**)&pDevice->dsound.pCaptureBuffer, NULL);
-            if (FAILED(hr)) {
-                ma_device_uninit__dsound(pDevice);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Second attempt at IDirectSoundCapture_CreateCaptureBuffer() failed for capture device.");
-                return ma_result_from_HRESULT(hr);
-            }
-        }
-
-        /* DirectSound should give us a buffer exactly the size we asked for. */
-        pDescriptorCapture->periodSizeInFrames = periodSizeInFrames;
-        pDescriptorCapture->periodCount        = periodCount;
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        MA_WAVEFORMATEXTENSIBLE wf;
-        MA_DSBUFFERDESC descDSPrimary;
-        MA_DSCAPS caps;
-        char rawdata[1024]; /* <-- Ugly hack to avoid a malloc() due to a crappy DirectSound API. */
-        MA_WAVEFORMATEXTENSIBLE* pActualFormat;
-        ma_uint32 periodSizeInFrames;
-        ma_uint32 periodCount;
-        MA_DSBUFFERDESC descDS;
-        WORD nativeChannelCount;
-        DWORD nativeChannelMask = 0;
-
-        result = ma_config_to_WAVEFORMATEXTENSIBLE(pDescriptorPlayback->format, pDescriptorPlayback->channels, pDescriptorPlayback->sampleRate, pDescriptorPlayback->channelMap, &wf);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_context_create_IDirectSound__dsound(pDevice->pContext, pDescriptorPlayback->shareMode, pDescriptorPlayback->pDeviceID, (ma_IDirectSound**)&pDevice->dsound.pPlayback);
-        if (result != MA_SUCCESS) {
-            ma_device_uninit__dsound(pDevice);
-            return result;
-        }
-
-        MA_ZERO_OBJECT(&descDSPrimary);
-        descDSPrimary.dwSize  = sizeof(MA_DSBUFFERDESC);
-        descDSPrimary.dwFlags = MA_DSBCAPS_PRIMARYBUFFER | MA_DSBCAPS_CTRLVOLUME;
-        hr = ma_IDirectSound_CreateSoundBuffer((ma_IDirectSound*)pDevice->dsound.pPlayback, &descDSPrimary, (ma_IDirectSoundBuffer**)&pDevice->dsound.pPlaybackPrimaryBuffer, NULL);
-        if (FAILED(hr)) {
-            ma_device_uninit__dsound(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSound_CreateSoundBuffer() failed for playback device's primary buffer.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-
-        /* We may want to make some adjustments to the format if we are using defaults. */
-        MA_ZERO_OBJECT(&caps);
-        caps.dwSize = sizeof(caps);
-        hr = ma_IDirectSound_GetCaps((ma_IDirectSound*)pDevice->dsound.pPlayback, &caps);
-        if (FAILED(hr)) {
-            ma_device_uninit__dsound(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSound_GetCaps() failed for playback device.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        if ((caps.dwFlags & MA_DSCAPS_PRIMARYSTEREO) != 0) {
-            DWORD speakerConfig;
-
-            /* It supports at least stereo, but could support more. */
-            nativeChannelCount = 2;
-
-            /* Look at the speaker configuration to get a better idea on the channel count. */
-            if (SUCCEEDED(ma_IDirectSound_GetSpeakerConfig((ma_IDirectSound*)pDevice->dsound.pPlayback, &speakerConfig))) {
-                ma_get_channels_from_speaker_config__dsound(speakerConfig, &nativeChannelCount, &nativeChannelMask);
-            }
-        } else {
-            /* It does not support stereo, which means we are stuck with mono. */
-            nativeChannelCount = 1;
-            nativeChannelMask  = 0x00000001;
-        }
-
-        if (pDescriptorPlayback->channels == 0) {
-            wf.nChannels = nativeChannelCount;
-            wf.dwChannelMask    = nativeChannelMask;
-        }
-
-        if (pDescriptorPlayback->sampleRate == 0) {
-            /* We base the sample rate on the values returned by GetCaps(). */
-            if ((caps.dwFlags & MA_DSCAPS_CONTINUOUSRATE) != 0) {
-                wf.nSamplesPerSec = ma_get_best_sample_rate_within_range(caps.dwMinSecondarySampleRate, caps.dwMaxSecondarySampleRate);
-            } else {
-                wf.nSamplesPerSec = caps.dwMaxSecondarySampleRate;
-            }
-        }
-
-        wf.nBlockAlign     = (WORD)(wf.nChannels * wf.wBitsPerSample / 8);
-        wf.nAvgBytesPerSec = wf.nBlockAlign * wf.nSamplesPerSec;
-
-        /*
-        From MSDN:
-
-        The method succeeds even if the hardware does not support the requested format; DirectSound sets the buffer to the closest
-        supported format. To determine whether this has happened, an application can call the GetFormat method for the primary buffer
-        and compare the result with the format that was requested with the SetFormat method.
-        */
-        hr = ma_IDirectSoundBuffer_SetFormat((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackPrimaryBuffer, (MA_WAVEFORMATEX*)&wf);
-        if (FAILED(hr)) {
-            /*
-            If setting of the format failed we'll try again with some fallback settings. On Windows 98 I have
-            observed that IEEE_FLOAT does not work. We'll therefore enforce PCM. I also had issues where a
-            sample rate of 48000 did not work correctly. Not sure if it was a driver issue or not, but will
-            use 44100 for the sample rate.
-            */
-            wf.cbSize          = 18;    /* NOTE: Don't use sizeof(MA_WAVEFORMATEX) here because it's got an extra 2 bytes due to padding. */
-            wf.wFormatTag      = WAVE_FORMAT_PCM;
-            wf.wBitsPerSample  = 16;
-            wf.nChannels       = nativeChannelCount;
-            wf.nSamplesPerSec  = 44100;
-            wf.nBlockAlign     = wf.nChannels * (wf.wBitsPerSample / 8);
-            wf.nAvgBytesPerSec = wf.nSamplesPerSec * wf.nBlockAlign;
-
-            hr = ma_IDirectSoundBuffer_SetFormat((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackPrimaryBuffer, (MA_WAVEFORMATEX*)&wf);
-            if (FAILED(hr)) {
-                ma_device_uninit__dsound(pDevice);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to set format of playback device's primary buffer.");
-                return ma_result_from_HRESULT(hr);
-            }
-        }
-
-        /* Get the _actual_ properties of the buffer. */
-        pActualFormat = (MA_WAVEFORMATEXTENSIBLE*)rawdata;
-        hr = ma_IDirectSoundBuffer_GetFormat((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackPrimaryBuffer, (MA_WAVEFORMATEX*)pActualFormat, sizeof(rawdata), NULL);
-        if (FAILED(hr)) {
-            ma_device_uninit__dsound(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to retrieve the actual format of the playback device's primary buffer.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        /* We now have enough information to start setting some output properties. */
-        pDescriptorPlayback->format     = ma_format_from_WAVEFORMATEX((MA_WAVEFORMATEX*)pActualFormat);
-        pDescriptorPlayback->channels   = pActualFormat->nChannels;
-        pDescriptorPlayback->sampleRate = pActualFormat->nSamplesPerSec;
-
-        /* Get the internal channel map based on the channel mask. */
-        if (pActualFormat->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
-            ma_channel_mask_to_channel_map__win32(pActualFormat->dwChannelMask, pDescriptorPlayback->channels, pDescriptorPlayback->channelMap);
-        } else {
-            ma_channel_mask_to_channel_map__win32(wf.dwChannelMask, pDescriptorPlayback->channels, pDescriptorPlayback->channelMap);
-        }
-
-        /* The size of the buffer must be a clean multiple of the period count. */
-        periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__dsound(pDescriptorPlayback, pDescriptorPlayback->sampleRate, pConfig->performanceProfile);
-        periodCount = (pDescriptorPlayback->periodCount > 0) ? pDescriptorPlayback->periodCount : MA_DEFAULT_PERIODS;
-
-        /*
-        Meaning of dwFlags (from MSDN):
-
-        DSBCAPS_CTRLPOSITIONNOTIFY
-          The buffer has position notification capability.
-
-        DSBCAPS_GLOBALFOCUS
-          With this flag set, an application using DirectSound can continue to play its buffers if the user switches focus to
-          another application, even if the new application uses DirectSound.
-
-        DSBCAPS_GETCURRENTPOSITION2
-          In the first version of DirectSound, the play cursor was significantly ahead of the actual playing sound on emulated
-          sound cards; it was directly behind the write cursor. Now, if the DSBCAPS_GETCURRENTPOSITION2 flag is specified, the
-          application can get a more accurate play cursor.
-        */
-        MA_ZERO_OBJECT(&descDS);
-        descDS.dwSize = sizeof(descDS);
-        descDS.dwFlags = MA_DSBCAPS_CTRLPOSITIONNOTIFY | MA_DSBCAPS_GLOBALFOCUS | MA_DSBCAPS_GETCURRENTPOSITION2;
-        descDS.dwBufferBytes = periodSizeInFrames * periodCount * ma_get_bytes_per_frame(pDescriptorPlayback->format, pDescriptorPlayback->channels);
-        descDS.lpwfxFormat = (MA_WAVEFORMATEX*)pActualFormat;
-        hr = ma_IDirectSound_CreateSoundBuffer((ma_IDirectSound*)pDevice->dsound.pPlayback, &descDS, (ma_IDirectSoundBuffer**)&pDevice->dsound.pPlaybackBuffer, NULL);
-        if (FAILED(hr)) {
-            ma_device_uninit__dsound(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSound_CreateSoundBuffer() failed for playback device's secondary buffer.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        /* DirectSound should give us a buffer exactly the size we asked for. */
-        pDescriptorPlayback->periodSizeInFrames = periodSizeInFrames;
-        pDescriptorPlayback->periodCount        = periodCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_device_data_loop__dsound(ma_device* pDevice)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint32 bpfDeviceCapture  = ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-    ma_uint32 bpfDevicePlayback = ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-    HRESULT hr;
-    DWORD lockOffsetInBytesCapture;
-    DWORD lockSizeInBytesCapture;
-    DWORD mappedSizeInBytesCapture;
-    DWORD mappedDeviceFramesProcessedCapture;
-    void* pMappedDeviceBufferCapture;
-    DWORD lockOffsetInBytesPlayback;
-    DWORD lockSizeInBytesPlayback;
-    DWORD mappedSizeInBytesPlayback;
-    void* pMappedDeviceBufferPlayback;
-    DWORD prevReadCursorInBytesCapture = 0;
-    DWORD prevPlayCursorInBytesPlayback = 0;
-    ma_bool32 physicalPlayCursorLoopFlagPlayback = 0;
-    DWORD virtualWriteCursorInBytesPlayback = 0;
-    ma_bool32 virtualWriteCursorLoopFlagPlayback = 0;
-    ma_bool32 isPlaybackDeviceStarted = MA_FALSE;
-    ma_uint32 framesWrittenToPlaybackDevice = 0;   /* For knowing whether or not the playback device needs to be started. */
-    ma_uint32 waitTimeInMilliseconds = 1;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /* The first thing to do is start the capture device. The playback device is only started after the first period is written. */
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        hr = ma_IDirectSoundCaptureBuffer_Start((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, MA_DSCBSTART_LOOPING);
-        if (FAILED(hr)) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundCaptureBuffer_Start() failed.");
-            return ma_result_from_HRESULT(hr);
-        }
-    }
-
-    while (ma_device_get_state(pDevice) == ma_device_state_started) {
-        switch (pDevice->type)
-        {
-            case ma_device_type_duplex:
-            {
-                DWORD physicalCaptureCursorInBytes;
-                DWORD physicalReadCursorInBytes;
-                hr = ma_IDirectSoundCaptureBuffer_GetCurrentPosition((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, &physicalCaptureCursorInBytes, &physicalReadCursorInBytes);
-                if (FAILED(hr)) {
-                    return ma_result_from_HRESULT(hr);
-                }
-
-                /* If nothing is available we just sleep for a bit and return from this iteration. */
-                if (physicalReadCursorInBytes == prevReadCursorInBytesCapture) {
-                    ma_sleep(waitTimeInMilliseconds);
-                    continue; /* Nothing is available in the capture buffer. */
-                }
-
-                /*
-                The current position has moved. We need to map all of the captured samples and write them to the playback device, making sure
-                we don't return until every frame has been copied over.
-                */
-                if (prevReadCursorInBytesCapture < physicalReadCursorInBytes) {
-                    /* The capture position has not looped. This is the simple case. */
-                    lockOffsetInBytesCapture = prevReadCursorInBytesCapture;
-                    lockSizeInBytesCapture   = (physicalReadCursorInBytes - prevReadCursorInBytesCapture);
-                } else {
-                    /*
-                    The capture position has looped. This is the more complex case. Map to the end of the buffer. If this does not return anything,
-                    do it again from the start.
-                    */
-                    if (prevReadCursorInBytesCapture < pDevice->capture.internalPeriodSizeInFrames*pDevice->capture.internalPeriods*bpfDeviceCapture) {
-                        /* Lock up to the end of the buffer. */
-                        lockOffsetInBytesCapture = prevReadCursorInBytesCapture;
-                        lockSizeInBytesCapture   = (pDevice->capture.internalPeriodSizeInFrames*pDevice->capture.internalPeriods*bpfDeviceCapture) - prevReadCursorInBytesCapture;
-                    } else {
-                        /* Lock starting from the start of the buffer. */
-                        lockOffsetInBytesCapture = 0;
-                        lockSizeInBytesCapture   = physicalReadCursorInBytes;
-                    }
-                }
-
-                if (lockSizeInBytesCapture == 0) {
-                    ma_sleep(waitTimeInMilliseconds);
-                    continue; /* Nothing is available in the capture buffer. */
-                }
-
-                hr = ma_IDirectSoundCaptureBuffer_Lock((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, lockOffsetInBytesCapture, lockSizeInBytesCapture, &pMappedDeviceBufferCapture, &mappedSizeInBytesCapture, NULL, NULL, 0);
-                if (FAILED(hr)) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to map buffer from capture device in preparation for writing to the device.");
-                    return ma_result_from_HRESULT(hr);
-                }
-
-
-                /* At this point we have some input data that we need to output. We do not return until every mapped frame of the input data is written to the playback device. */
-                mappedDeviceFramesProcessedCapture = 0;
-
-                for (;;) {  /* Keep writing to the playback device. */
-                    ma_uint8  inputFramesInClientFormat[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-                    ma_uint32 inputFramesInClientFormatCap = sizeof(inputFramesInClientFormat) / ma_get_bytes_per_frame(pDevice->capture.format, pDevice->capture.channels);
-                    ma_uint8  outputFramesInClientFormat[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-                    ma_uint32 outputFramesInClientFormatCap = sizeof(outputFramesInClientFormat) / ma_get_bytes_per_frame(pDevice->playback.format, pDevice->playback.channels);
-                    ma_uint32 outputFramesInClientFormatCount;
-                    ma_uint32 outputFramesInClientFormatConsumed = 0;
-                    ma_uint64 clientCapturedFramesToProcess = ma_min(inputFramesInClientFormatCap, outputFramesInClientFormatCap);
-                    ma_uint64 deviceCapturedFramesToProcess = (mappedSizeInBytesCapture / bpfDeviceCapture) - mappedDeviceFramesProcessedCapture;
-                    void* pRunningMappedDeviceBufferCapture = ma_offset_ptr(pMappedDeviceBufferCapture, mappedDeviceFramesProcessedCapture * bpfDeviceCapture);
-
-                    result = ma_data_converter_process_pcm_frames(&pDevice->capture.converter, pRunningMappedDeviceBufferCapture, &deviceCapturedFramesToProcess, inputFramesInClientFormat, &clientCapturedFramesToProcess);
-                    if (result != MA_SUCCESS) {
-                        break;
-                    }
-
-                    outputFramesInClientFormatCount     = (ma_uint32)clientCapturedFramesToProcess;
-                    mappedDeviceFramesProcessedCapture += (ma_uint32)deviceCapturedFramesToProcess;
-
-                    ma_device__handle_data_callback(pDevice, outputFramesInClientFormat, inputFramesInClientFormat, (ma_uint32)clientCapturedFramesToProcess);
-
-                    /* At this point we have input and output data in client format. All we need to do now is convert it to the output device format. This may take a few passes. */
-                    for (;;) {
-                        ma_uint32 framesWrittenThisIteration;
-                        DWORD physicalPlayCursorInBytes;
-                        DWORD physicalWriteCursorInBytes;
-                        DWORD availableBytesPlayback;
-                        DWORD silentPaddingInBytes = 0; /* <-- Must be initialized to 0. */
-
-                        /* We need the physical play and write cursors. */
-                        if (FAILED(ma_IDirectSoundBuffer_GetCurrentPosition((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, &physicalPlayCursorInBytes, &physicalWriteCursorInBytes))) {
-                            break;
-                        }
-
-                        if (physicalPlayCursorInBytes < prevPlayCursorInBytesPlayback) {
-                            physicalPlayCursorLoopFlagPlayback = !physicalPlayCursorLoopFlagPlayback;
-                        }
-                        prevPlayCursorInBytesPlayback  = physicalPlayCursorInBytes;
-
-                        /* If there's any bytes available for writing we can do that now. The space between the virtual cursor position and play cursor. */
-                        if (physicalPlayCursorLoopFlagPlayback == virtualWriteCursorLoopFlagPlayback) {
-                            /* Same loop iteration. The available bytes wraps all the way around from the virtual write cursor to the physical play cursor. */
-                            if (physicalPlayCursorInBytes <= virtualWriteCursorInBytesPlayback) {
-                                availableBytesPlayback  = (pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback) - virtualWriteCursorInBytesPlayback;
-                                availableBytesPlayback += physicalPlayCursorInBytes;    /* Wrap around. */
-                            } else {
-                                /* This is an error. */
-                                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[DirectSound] (Duplex/Playback): Play cursor has moved in front of the write cursor (same loop iteration). physicalPlayCursorInBytes=%ld, virtualWriteCursorInBytes=%ld.\n", physicalPlayCursorInBytes, virtualWriteCursorInBytesPlayback);
-                                availableBytesPlayback = 0;
-                            }
-                        } else {
-                            /* Different loop iterations. The available bytes only goes from the virtual write cursor to the physical play cursor. */
-                            if (physicalPlayCursorInBytes >= virtualWriteCursorInBytesPlayback) {
-                                availableBytesPlayback = physicalPlayCursorInBytes - virtualWriteCursorInBytesPlayback;
-                            } else {
-                                /* This is an error. */
-                                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[DirectSound] (Duplex/Playback): Write cursor has moved behind the play cursor (different loop iterations). physicalPlayCursorInBytes=%ld, virtualWriteCursorInBytes=%ld.\n", physicalPlayCursorInBytes, virtualWriteCursorInBytesPlayback);
-                                availableBytesPlayback = 0;
-                            }
-                        }
-
-                        /* If there's no room available for writing we need to wait for more. */
-                        if (availableBytesPlayback == 0) {
-                            /* If we haven't started the device yet, this will never get beyond 0. In this case we need to get the device started. */
-                            if (!isPlaybackDeviceStarted) {
-                                hr = ma_IDirectSoundBuffer_Play((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, 0, 0, MA_DSBPLAY_LOOPING);
-                                if (FAILED(hr)) {
-                                    ma_IDirectSoundCaptureBuffer_Stop((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer);
-                                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundBuffer_Play() failed.");
-                                    return ma_result_from_HRESULT(hr);
-                                }
-                                isPlaybackDeviceStarted = MA_TRUE;
-                            } else {
-                                ma_sleep(waitTimeInMilliseconds);
-                                continue;
-                            }
-                        }
-
-
-                        /* Getting here means there room available somewhere. We limit this to either the end of the buffer or the physical play cursor, whichever is closest. */
-                        lockOffsetInBytesPlayback = virtualWriteCursorInBytesPlayback;
-                        if (physicalPlayCursorLoopFlagPlayback == virtualWriteCursorLoopFlagPlayback) {
-                            /* Same loop iteration. Go up to the end of the buffer. */
-                            lockSizeInBytesPlayback = (pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback) - virtualWriteCursorInBytesPlayback;
-                        } else {
-                            /* Different loop iterations. Go up to the physical play cursor. */
-                            lockSizeInBytesPlayback = physicalPlayCursorInBytes - virtualWriteCursorInBytesPlayback;
-                        }
-
-                        hr = ma_IDirectSoundBuffer_Lock((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, lockOffsetInBytesPlayback, lockSizeInBytesPlayback, &pMappedDeviceBufferPlayback, &mappedSizeInBytesPlayback, NULL, NULL, 0);
-                        if (FAILED(hr)) {
-                            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to map buffer from playback device in preparation for writing to the device.");
-                            result = ma_result_from_HRESULT(hr);
-                            break;
-                        }
-
-                        /*
-                        Experiment: If the playback buffer is being starved, pad it with some silence to get it back in sync. This will cause a glitch, but it may prevent
-                        endless glitching due to it constantly running out of data.
-                        */
-                        if (isPlaybackDeviceStarted) {
-                            DWORD bytesQueuedForPlayback = (pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback) - availableBytesPlayback;
-                            if (bytesQueuedForPlayback < (pDevice->playback.internalPeriodSizeInFrames*bpfDevicePlayback)) {
-                                silentPaddingInBytes   = (pDevice->playback.internalPeriodSizeInFrames*2*bpfDevicePlayback) - bytesQueuedForPlayback;
-                                if (silentPaddingInBytes > lockSizeInBytesPlayback) {
-                                    silentPaddingInBytes = lockSizeInBytesPlayback;
-                                }
-
-                                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[DirectSound] (Duplex/Playback) Playback buffer starved. availableBytesPlayback=%ld, silentPaddingInBytes=%ld\n", availableBytesPlayback, silentPaddingInBytes);
-                            }
-                        }
-
-                        /* At this point we have a buffer for output. */
-                        if (silentPaddingInBytes > 0) {
-                            MA_ZERO_MEMORY(pMappedDeviceBufferPlayback, silentPaddingInBytes);
-                            framesWrittenThisIteration = silentPaddingInBytes/bpfDevicePlayback;
-                        } else {
-                            ma_uint64 convertedFrameCountIn  = (outputFramesInClientFormatCount - outputFramesInClientFormatConsumed);
-                            ma_uint64 convertedFrameCountOut = mappedSizeInBytesPlayback/bpfDevicePlayback;
-                            void* pConvertedFramesIn  = ma_offset_ptr(outputFramesInClientFormat, outputFramesInClientFormatConsumed * bpfDevicePlayback);
-                            void* pConvertedFramesOut = pMappedDeviceBufferPlayback;
-
-                            result = ma_data_converter_process_pcm_frames(&pDevice->playback.converter, pConvertedFramesIn, &convertedFrameCountIn, pConvertedFramesOut, &convertedFrameCountOut);
-                            if (result != MA_SUCCESS) {
-                                break;
-                            }
-
-                            outputFramesInClientFormatConsumed += (ma_uint32)convertedFrameCountOut;
-                            framesWrittenThisIteration          = (ma_uint32)convertedFrameCountOut;
-                        }
-
-
-                        hr = ma_IDirectSoundBuffer_Unlock((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, pMappedDeviceBufferPlayback, framesWrittenThisIteration*bpfDevicePlayback, NULL, 0);
-                        if (FAILED(hr)) {
-                            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to unlock internal buffer from playback device after writing to the device.");
-                            result = ma_result_from_HRESULT(hr);
-                            break;
-                        }
-
-                        virtualWriteCursorInBytesPlayback += framesWrittenThisIteration*bpfDevicePlayback;
-                        if ((virtualWriteCursorInBytesPlayback/bpfDevicePlayback) == pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods) {
-                            virtualWriteCursorInBytesPlayback  = 0;
-                            virtualWriteCursorLoopFlagPlayback = !virtualWriteCursorLoopFlagPlayback;
-                        }
-
-                        /*
-                        We may need to start the device. We want two full periods to be written before starting the playback device. Having an extra period adds
-                        a bit of a buffer to prevent the playback buffer from getting starved.
-                        */
-                        framesWrittenToPlaybackDevice += framesWrittenThisIteration;
-                        if (!isPlaybackDeviceStarted && framesWrittenToPlaybackDevice >= (pDevice->playback.internalPeriodSizeInFrames*2)) {
-                            hr = ma_IDirectSoundBuffer_Play((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, 0, 0, MA_DSBPLAY_LOOPING);
-                            if (FAILED(hr)) {
-                                ma_IDirectSoundCaptureBuffer_Stop((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer);
-                                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundBuffer_Play() failed.");
-                                return ma_result_from_HRESULT(hr);
-                            }
-                            isPlaybackDeviceStarted = MA_TRUE;
-                        }
-
-                        if (framesWrittenThisIteration < mappedSizeInBytesPlayback/bpfDevicePlayback) {
-                            break;  /* We're finished with the output data.*/
-                        }
-                    }
-
-                    if (clientCapturedFramesToProcess == 0) {
-                        break;  /* We just consumed every input sample. */
-                    }
-                }
-
-
-                /* At this point we're done with the mapped portion of the capture buffer. */
-                hr = ma_IDirectSoundCaptureBuffer_Unlock((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, pMappedDeviceBufferCapture, mappedSizeInBytesCapture, NULL, 0);
-                if (FAILED(hr)) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to unlock internal buffer from capture device after reading from the device.");
-                    return ma_result_from_HRESULT(hr);
-                }
-                prevReadCursorInBytesCapture = (lockOffsetInBytesCapture + mappedSizeInBytesCapture);
-            } break;
-
-
-
-            case ma_device_type_capture:
-            {
-                DWORD physicalCaptureCursorInBytes;
-                DWORD physicalReadCursorInBytes;
-                hr = ma_IDirectSoundCaptureBuffer_GetCurrentPosition((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, &physicalCaptureCursorInBytes, &physicalReadCursorInBytes);
-                if (FAILED(hr)) {
-                    return MA_ERROR;
-                }
-
-                /* If the previous capture position is the same as the current position we need to wait a bit longer. */
-                if (prevReadCursorInBytesCapture == physicalReadCursorInBytes) {
-                    ma_sleep(waitTimeInMilliseconds);
-                    continue;
-                }
-
-                /* Getting here means we have capture data available. */
-                if (prevReadCursorInBytesCapture < physicalReadCursorInBytes) {
-                    /* The capture position has not looped. This is the simple case. */
-                    lockOffsetInBytesCapture = prevReadCursorInBytesCapture;
-                    lockSizeInBytesCapture   = (physicalReadCursorInBytes - prevReadCursorInBytesCapture);
-                } else {
-                    /*
-                    The capture position has looped. This is the more complex case. Map to the end of the buffer. If this does not return anything,
-                    do it again from the start.
-                    */
-                    if (prevReadCursorInBytesCapture < pDevice->capture.internalPeriodSizeInFrames*pDevice->capture.internalPeriods*bpfDeviceCapture) {
-                        /* Lock up to the end of the buffer. */
-                        lockOffsetInBytesCapture = prevReadCursorInBytesCapture;
-                        lockSizeInBytesCapture   = (pDevice->capture.internalPeriodSizeInFrames*pDevice->capture.internalPeriods*bpfDeviceCapture) - prevReadCursorInBytesCapture;
-                    } else {
-                        /* Lock starting from the start of the buffer. */
-                        lockOffsetInBytesCapture = 0;
-                        lockSizeInBytesCapture   = physicalReadCursorInBytes;
-                    }
-                }
-
-                if (lockSizeInBytesCapture < pDevice->capture.internalPeriodSizeInFrames) {
-                    ma_sleep(waitTimeInMilliseconds);
-                    continue; /* Nothing is available in the capture buffer. */
-                }
-
-                hr = ma_IDirectSoundCaptureBuffer_Lock((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, lockOffsetInBytesCapture, lockSizeInBytesCapture, &pMappedDeviceBufferCapture, &mappedSizeInBytesCapture, NULL, NULL, 0);
-                if (FAILED(hr)) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to map buffer from capture device in preparation for writing to the device.");
-                    result = ma_result_from_HRESULT(hr);
-                }
-
-                if (lockSizeInBytesCapture != mappedSizeInBytesCapture) {
-                    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[DirectSound] (Capture) lockSizeInBytesCapture=%ld != mappedSizeInBytesCapture=%ld\n", lockSizeInBytesCapture, mappedSizeInBytesCapture);
-                }
-
-                ma_device__send_frames_to_client(pDevice, mappedSizeInBytesCapture/bpfDeviceCapture, pMappedDeviceBufferCapture);
-
-                hr = ma_IDirectSoundCaptureBuffer_Unlock((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer, pMappedDeviceBufferCapture, mappedSizeInBytesCapture, NULL, 0);
-                if (FAILED(hr)) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to unlock internal buffer from capture device after reading from the device.");
-                    return ma_result_from_HRESULT(hr);
-                }
-                prevReadCursorInBytesCapture = lockOffsetInBytesCapture + mappedSizeInBytesCapture;
-
-                if (prevReadCursorInBytesCapture == (pDevice->capture.internalPeriodSizeInFrames*pDevice->capture.internalPeriods*bpfDeviceCapture)) {
-                    prevReadCursorInBytesCapture = 0;
-                }
-            } break;
-
-
-
-            case ma_device_type_playback:
-            {
-                DWORD availableBytesPlayback;
-                DWORD physicalPlayCursorInBytes;
-                DWORD physicalWriteCursorInBytes;
-                hr = ma_IDirectSoundBuffer_GetCurrentPosition((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, &physicalPlayCursorInBytes, &physicalWriteCursorInBytes);
-                if (FAILED(hr)) {
-                    break;
-                }
-
-                if (physicalPlayCursorInBytes < prevPlayCursorInBytesPlayback) {
-                    physicalPlayCursorLoopFlagPlayback = !physicalPlayCursorLoopFlagPlayback;
-                }
-                prevPlayCursorInBytesPlayback  = physicalPlayCursorInBytes;
-
-                /* If there's any bytes available for writing we can do that now. The space between the virtual cursor position and play cursor. */
-                if (physicalPlayCursorLoopFlagPlayback == virtualWriteCursorLoopFlagPlayback) {
-                    /* Same loop iteration. The available bytes wraps all the way around from the virtual write cursor to the physical play cursor. */
-                    if (physicalPlayCursorInBytes <= virtualWriteCursorInBytesPlayback) {
-                        availableBytesPlayback  = (pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback) - virtualWriteCursorInBytesPlayback;
-                        availableBytesPlayback += physicalPlayCursorInBytes;    /* Wrap around. */
-                    } else {
-                        /* This is an error. */
-                        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[DirectSound] (Playback): Play cursor has moved in front of the write cursor (same loop iterations). physicalPlayCursorInBytes=%ld, virtualWriteCursorInBytes=%ld.\n", physicalPlayCursorInBytes, virtualWriteCursorInBytesPlayback);
-                        availableBytesPlayback = 0;
-                    }
-                } else {
-                    /* Different loop iterations. The available bytes only goes from the virtual write cursor to the physical play cursor. */
-                    if (physicalPlayCursorInBytes >= virtualWriteCursorInBytesPlayback) {
-                        availableBytesPlayback = physicalPlayCursorInBytes - virtualWriteCursorInBytesPlayback;
-                    } else {
-                        /* This is an error. */
-                        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[DirectSound] (Playback): Write cursor has moved behind the play cursor (different loop iterations). physicalPlayCursorInBytes=%ld, virtualWriteCursorInBytes=%ld.\n", physicalPlayCursorInBytes, virtualWriteCursorInBytesPlayback);
-                        availableBytesPlayback = 0;
-                    }
-                }
-
-                /* If there's no room available for writing we need to wait for more. */
-                if (availableBytesPlayback < pDevice->playback.internalPeriodSizeInFrames) {
-                    /* If we haven't started the device yet, this will never get beyond 0. In this case we need to get the device started. */
-                    if (availableBytesPlayback == 0 && !isPlaybackDeviceStarted) {
-                        hr = ma_IDirectSoundBuffer_Play((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, 0, 0, MA_DSBPLAY_LOOPING);
-                        if (FAILED(hr)) {
-                            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundBuffer_Play() failed.");
-                            return ma_result_from_HRESULT(hr);
-                        }
-                        isPlaybackDeviceStarted = MA_TRUE;
-                    } else {
-                        ma_sleep(waitTimeInMilliseconds);
-                        continue;
-                    }
-                }
-
-                /* Getting here means there room available somewhere. We limit this to either the end of the buffer or the physical play cursor, whichever is closest. */
-                lockOffsetInBytesPlayback = virtualWriteCursorInBytesPlayback;
-                if (physicalPlayCursorLoopFlagPlayback == virtualWriteCursorLoopFlagPlayback) {
-                    /* Same loop iteration. Go up to the end of the buffer. */
-                    lockSizeInBytesPlayback = (pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback) - virtualWriteCursorInBytesPlayback;
-                } else {
-                    /* Different loop iterations. Go up to the physical play cursor. */
-                    lockSizeInBytesPlayback = physicalPlayCursorInBytes - virtualWriteCursorInBytesPlayback;
-                }
-
-                hr = ma_IDirectSoundBuffer_Lock((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, lockOffsetInBytesPlayback, lockSizeInBytesPlayback, &pMappedDeviceBufferPlayback, &mappedSizeInBytesPlayback, NULL, NULL, 0);
-                if (FAILED(hr)) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to map buffer from playback device in preparation for writing to the device.");
-                    result = ma_result_from_HRESULT(hr);
-                    break;
-                }
-
-                /* At this point we have a buffer for output. */
-                ma_device__read_frames_from_client(pDevice, (mappedSizeInBytesPlayback/bpfDevicePlayback), pMappedDeviceBufferPlayback);
-
-                hr = ma_IDirectSoundBuffer_Unlock((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, pMappedDeviceBufferPlayback, mappedSizeInBytesPlayback, NULL, 0);
-                if (FAILED(hr)) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] Failed to unlock internal buffer from playback device after writing to the device.");
-                    result = ma_result_from_HRESULT(hr);
-                    break;
-                }
-
-                virtualWriteCursorInBytesPlayback += mappedSizeInBytesPlayback;
-                if (virtualWriteCursorInBytesPlayback == pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback) {
-                    virtualWriteCursorInBytesPlayback  = 0;
-                    virtualWriteCursorLoopFlagPlayback = !virtualWriteCursorLoopFlagPlayback;
-                }
-
-                /*
-                We may need to start the device. We want two full periods to be written before starting the playback device. Having an extra period adds
-                a bit of a buffer to prevent the playback buffer from getting starved.
-                */
-                framesWrittenToPlaybackDevice += mappedSizeInBytesPlayback/bpfDevicePlayback;
-                if (!isPlaybackDeviceStarted && framesWrittenToPlaybackDevice >= pDevice->playback.internalPeriodSizeInFrames) {
-                    hr = ma_IDirectSoundBuffer_Play((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, 0, 0, MA_DSBPLAY_LOOPING);
-                    if (FAILED(hr)) {
-                        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundBuffer_Play() failed.");
-                        return ma_result_from_HRESULT(hr);
-                    }
-                    isPlaybackDeviceStarted = MA_TRUE;
-                }
-            } break;
-
-
-            default: return MA_INVALID_ARGS;   /* Invalid device type. */
-        }
-
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    /* Getting here means the device is being stopped. */
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        hr = ma_IDirectSoundCaptureBuffer_Stop((ma_IDirectSoundCaptureBuffer*)pDevice->dsound.pCaptureBuffer);
-        if (FAILED(hr)) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundCaptureBuffer_Stop() failed.");
-            return ma_result_from_HRESULT(hr);
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        /* The playback device should be drained before stopping. All we do is wait until the available bytes is equal to the size of the buffer. */
-        if (isPlaybackDeviceStarted) {
-            for (;;) {
-                DWORD availableBytesPlayback = 0;
-                DWORD physicalPlayCursorInBytes;
-                DWORD physicalWriteCursorInBytes;
-                hr = ma_IDirectSoundBuffer_GetCurrentPosition((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, &physicalPlayCursorInBytes, &physicalWriteCursorInBytes);
-                if (FAILED(hr)) {
-                    break;
-                }
-
-                if (physicalPlayCursorInBytes < prevPlayCursorInBytesPlayback) {
-                    physicalPlayCursorLoopFlagPlayback = !physicalPlayCursorLoopFlagPlayback;
-                }
-                prevPlayCursorInBytesPlayback  = physicalPlayCursorInBytes;
-
-                if (physicalPlayCursorLoopFlagPlayback == virtualWriteCursorLoopFlagPlayback) {
-                    /* Same loop iteration. The available bytes wraps all the way around from the virtual write cursor to the physical play cursor. */
-                    if (physicalPlayCursorInBytes <= virtualWriteCursorInBytesPlayback) {
-                        availableBytesPlayback  = (pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback) - virtualWriteCursorInBytesPlayback;
-                        availableBytesPlayback += physicalPlayCursorInBytes;    /* Wrap around. */
-                    } else {
-                        break;
-                    }
-                } else {
-                    /* Different loop iterations. The available bytes only goes from the virtual write cursor to the physical play cursor. */
-                    if (physicalPlayCursorInBytes >= virtualWriteCursorInBytesPlayback) {
-                        availableBytesPlayback = physicalPlayCursorInBytes - virtualWriteCursorInBytesPlayback;
-                    } else {
-                        break;
-                    }
-                }
-
-                if (availableBytesPlayback >= (pDevice->playback.internalPeriodSizeInFrames*pDevice->playback.internalPeriods*bpfDevicePlayback)) {
-                    break;
-                }
-
-                ma_sleep(waitTimeInMilliseconds);
-            }
-        }
-
-        hr = ma_IDirectSoundBuffer_Stop((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer);
-        if (FAILED(hr)) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[DirectSound] IDirectSoundBuffer_Stop() failed.");
-            return ma_result_from_HRESULT(hr);
-        }
-
-        ma_IDirectSoundBuffer_SetCurrentPosition((ma_IDirectSoundBuffer*)pDevice->dsound.pPlaybackBuffer, 0);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_uninit__dsound(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_dsound);
-
-    ma_dlclose(ma_context_get_log(pContext), pContext->dsound.hDSoundDLL);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__dsound(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig;
-
-    pContext->dsound.hDSoundDLL = ma_dlopen(ma_context_get_log(pContext), "dsound.dll");
-    if (pContext->dsound.hDSoundDLL == NULL) {
-        return MA_API_NOT_FOUND;
-    }
-
-    pContext->dsound.DirectSoundCreate            = ma_dlsym(ma_context_get_log(pContext), pContext->dsound.hDSoundDLL, "DirectSoundCreate");
-    pContext->dsound.DirectSoundEnumerateA        = ma_dlsym(ma_context_get_log(pContext), pContext->dsound.hDSoundDLL, "DirectSoundEnumerateA");
-    pContext->dsound.DirectSoundCaptureCreate     = ma_dlsym(ma_context_get_log(pContext), pContext->dsound.hDSoundDLL, "DirectSoundCaptureCreate");
-    pContext->dsound.DirectSoundCaptureEnumerateA = ma_dlsym(ma_context_get_log(pContext), pContext->dsound.hDSoundDLL, "DirectSoundCaptureEnumerateA");
-
-    /*
-    We need to support all functions or nothing. DirectSound with Windows 95 seems to not work too
-    well in my testing. For example, it's missing DirectSoundCaptureEnumerateA(). This is a convenient
-    place to just disable the DirectSound backend for Windows 95.
-    */
-    if (pContext->dsound.DirectSoundCreate            == NULL ||
-        pContext->dsound.DirectSoundEnumerateA        == NULL ||
-        pContext->dsound.DirectSoundCaptureCreate     == NULL ||
-        pContext->dsound.DirectSoundCaptureEnumerateA == NULL) {
-        return MA_API_NOT_FOUND;
-    }
-
-    pCallbacks->onContextInit             = ma_context_init__dsound;
-    pCallbacks->onContextUninit           = ma_context_uninit__dsound;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__dsound;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__dsound;
-    pCallbacks->onDeviceInit              = ma_device_init__dsound;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__dsound;
-    pCallbacks->onDeviceStart             = NULL;   /* Not used. Started in onDeviceDataLoop. */
-    pCallbacks->onDeviceStop              = NULL;   /* Not used. Stopped in onDeviceDataLoop. */
-    pCallbacks->onDeviceRead              = NULL;   /* Not used. Data is read directly in onDeviceDataLoop. */
-    pCallbacks->onDeviceWrite             = NULL;   /* Not used. Data is written directly in onDeviceDataLoop. */
-    pCallbacks->onDeviceDataLoop          = ma_device_data_loop__dsound;
-
-    return MA_SUCCESS;
-}
-#endif
-
-
-
-/******************************************************************************
-
-WinMM Backend
-
-******************************************************************************/
-#ifdef MA_HAS_WINMM
-
-/*
-Some build configurations will exclude the WinMM API. An example is when WIN32_LEAN_AND_MEAN
-is defined. We need to define the types and functions we need manually.
-*/
-#define MA_MMSYSERR_NOERROR     0
-#define MA_MMSYSERR_ERROR       1
-#define MA_MMSYSERR_BADDEVICEID 2
-#define MA_MMSYSERR_INVALHANDLE 5
-#define MA_MMSYSERR_NOMEM       7
-#define MA_MMSYSERR_INVALFLAG   10
-#define MA_MMSYSERR_INVALPARAM  11
-#define MA_MMSYSERR_HANDLEBUSY  12
-
-#define MA_CALLBACK_EVENT       0x00050000
-#define MA_WAVE_ALLOWSYNC       0x0002
-
-#define MA_WHDR_DONE            0x00000001
-#define MA_WHDR_PREPARED        0x00000002
-#define MA_WHDR_BEGINLOOP       0x00000004
-#define MA_WHDR_ENDLOOP         0x00000008
-#define MA_WHDR_INQUEUE         0x00000010
-
-#define MA_MAXPNAMELEN          32
-
-typedef void* MA_HWAVEIN;
-typedef void* MA_HWAVEOUT;
-typedef UINT MA_MMRESULT;
-typedef UINT MA_MMVERSION;
-
-typedef struct
-{
-    WORD wMid;
-    WORD wPid;
-    MA_MMVERSION vDriverVersion;
-    CHAR szPname[MA_MAXPNAMELEN];
-    DWORD dwFormats;
-    WORD wChannels;
-    WORD wReserved1;
-} MA_WAVEINCAPSA;
-
-typedef struct
-{
-    WORD wMid;
-    WORD wPid;
-    MA_MMVERSION vDriverVersion;
-    CHAR szPname[MA_MAXPNAMELEN];
-    DWORD dwFormats;
-    WORD wChannels;
-    WORD wReserved1;
-    DWORD dwSupport;
-} MA_WAVEOUTCAPSA;
-
-typedef struct tagWAVEHDR
-{
-    char* lpData;
-    DWORD dwBufferLength;
-    DWORD dwBytesRecorded;
-    DWORD_PTR dwUser;
-    DWORD dwFlags;
-    DWORD dwLoops;
-    struct tagWAVEHDR* lpNext;
-    DWORD_PTR reserved;
-} MA_WAVEHDR;
-
-typedef struct
-{
-    WORD wMid;
-    WORD wPid;
-    MA_MMVERSION vDriverVersion;
-    CHAR szPname[MA_MAXPNAMELEN];
-    DWORD dwFormats;
-    WORD wChannels;
-    WORD wReserved1;
-    DWORD dwSupport;
-    GUID ManufacturerGuid;
-    GUID ProductGuid;
-    GUID NameGuid;
-} MA_WAVEOUTCAPS2A;
-
-typedef struct
-{
-    WORD wMid;
-    WORD wPid;
-    MA_MMVERSION vDriverVersion;
-    CHAR szPname[MA_MAXPNAMELEN];
-    DWORD dwFormats;
-    WORD wChannels;
-    WORD wReserved1;
-    GUID ManufacturerGuid;
-    GUID ProductGuid;
-    GUID NameGuid;
-} MA_WAVEINCAPS2A;
-
-typedef UINT        (WINAPI * MA_PFN_waveOutGetNumDevs)(void);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveOutGetDevCapsA)(ma_uintptr uDeviceID, MA_WAVEOUTCAPSA* pwoc, UINT cbwoc);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveOutOpen)(MA_HWAVEOUT* phwo, UINT uDeviceID, const MA_WAVEFORMATEX* pwfx, DWORD_PTR dwCallback, DWORD_PTR dwInstance, DWORD fdwOpen);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveOutClose)(MA_HWAVEOUT hwo);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveOutPrepareHeader)(MA_HWAVEOUT hwo, MA_WAVEHDR* pwh, UINT cbwh);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveOutUnprepareHeader)(MA_HWAVEOUT hwo, MA_WAVEHDR* pwh, UINT cbwh);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveOutWrite)(MA_HWAVEOUT hwo, MA_WAVEHDR* pwh, UINT cbwh);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveOutReset)(MA_HWAVEOUT hwo);
-typedef UINT        (WINAPI * MA_PFN_waveInGetNumDevs)(void);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInGetDevCapsA)(ma_uintptr uDeviceID, MA_WAVEINCAPSA* pwic, UINT cbwic);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInOpen)(MA_HWAVEIN* phwi, UINT uDeviceID, const MA_WAVEFORMATEX* pwfx, DWORD_PTR dwCallback, DWORD_PTR dwInstance, DWORD fdwOpen);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInClose)(MA_HWAVEIN hwi);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInPrepareHeader)(MA_HWAVEIN hwi, MA_WAVEHDR* pwh, UINT cbwh);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInUnprepareHeader)(MA_HWAVEIN hwi, MA_WAVEHDR* pwh, UINT cbwh);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInAddBuffer)(MA_HWAVEIN hwi, MA_WAVEHDR* pwh, UINT cbwh);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInStart)(MA_HWAVEIN hwi);
-typedef MA_MMRESULT (WINAPI * MA_PFN_waveInReset)(MA_HWAVEIN hwi);
-
-static ma_result ma_result_from_MMRESULT(MA_MMRESULT resultMM)
-{
-    switch (resultMM)
-    {
-        case MA_MMSYSERR_NOERROR:       return MA_SUCCESS;
-        case MA_MMSYSERR_BADDEVICEID:   return MA_INVALID_ARGS;
-        case MA_MMSYSERR_INVALHANDLE:   return MA_INVALID_ARGS;
-        case MA_MMSYSERR_NOMEM:         return MA_OUT_OF_MEMORY;
-        case MA_MMSYSERR_INVALFLAG:     return MA_INVALID_ARGS;
-        case MA_MMSYSERR_INVALPARAM:    return MA_INVALID_ARGS;
-        case MA_MMSYSERR_HANDLEBUSY:    return MA_BUSY;
-        case MA_MMSYSERR_ERROR:         return MA_ERROR;
-        default:                        return MA_ERROR;
-    }
-}
-
-static char* ma_find_last_character(char* str, char ch)
-{
-    char* last;
-
-    if (str == NULL) {
-        return NULL;
-    }
-
-    last = NULL;
-    while (*str != '\0') {
-        if (*str == ch) {
-            last = str;
-        }
-
-        str += 1;
-    }
-
-    return last;
-}
-
-static ma_uint32 ma_get_period_size_in_bytes(ma_uint32 periodSizeInFrames, ma_format format, ma_uint32 channels)
-{
-    return periodSizeInFrames * ma_get_bytes_per_frame(format, channels);
-}
-
-
-/*
-Our own "WAVECAPS" structure that contains generic information shared between WAVEOUTCAPS2 and WAVEINCAPS2 so
-we can do things generically and typesafely. Names are being kept the same for consistency.
-*/
-typedef struct
-{
-    CHAR szPname[MA_MAXPNAMELEN];
-    DWORD dwFormats;
-    WORD wChannels;
-    GUID NameGuid;
-} MA_WAVECAPSA;
-
-static ma_result ma_get_best_info_from_formats_flags__winmm(DWORD dwFormats, WORD channels, WORD* pBitsPerSample, DWORD* pSampleRate)
-{
-    WORD bitsPerSample = 0;
-    DWORD sampleRate = 0;
-
-    if (pBitsPerSample) {
-        *pBitsPerSample = 0;
-    }
-    if (pSampleRate) {
-        *pSampleRate = 0;
-    }
-
-    if (channels == 1) {
-        bitsPerSample = 16;
-        if ((dwFormats & WAVE_FORMAT_48M16) != 0) {
-            sampleRate = 48000;
-        } else if ((dwFormats & WAVE_FORMAT_44M16) != 0) {
-            sampleRate = 44100;
-        } else if ((dwFormats & WAVE_FORMAT_2M16) != 0) {
-            sampleRate = 22050;
-        } else if ((dwFormats & WAVE_FORMAT_1M16) != 0) {
-            sampleRate = 11025;
-        } else if ((dwFormats & WAVE_FORMAT_96M16) != 0) {
-            sampleRate = 96000;
-        } else {
-            bitsPerSample = 8;
-            if ((dwFormats & WAVE_FORMAT_48M08) != 0) {
-                sampleRate = 48000;
-            } else if ((dwFormats & WAVE_FORMAT_44M08) != 0) {
-                sampleRate = 44100;
-            } else if ((dwFormats & WAVE_FORMAT_2M08) != 0) {
-                sampleRate = 22050;
-            } else if ((dwFormats & WAVE_FORMAT_1M08) != 0) {
-                sampleRate = 11025;
-            } else if ((dwFormats & WAVE_FORMAT_96M08) != 0) {
-                sampleRate = 96000;
-            } else {
-                return MA_FORMAT_NOT_SUPPORTED;
-            }
-        }
-    } else {
-        bitsPerSample = 16;
-        if ((dwFormats & WAVE_FORMAT_48S16) != 0) {
-            sampleRate = 48000;
-        } else if ((dwFormats & WAVE_FORMAT_44S16) != 0) {
-            sampleRate = 44100;
-        } else if ((dwFormats & WAVE_FORMAT_2S16) != 0) {
-            sampleRate = 22050;
-        } else if ((dwFormats & WAVE_FORMAT_1S16) != 0) {
-            sampleRate = 11025;
-        } else if ((dwFormats & WAVE_FORMAT_96S16) != 0) {
-            sampleRate = 96000;
-        } else {
-            bitsPerSample = 8;
-            if ((dwFormats & WAVE_FORMAT_48S08) != 0) {
-                sampleRate = 48000;
-            } else if ((dwFormats & WAVE_FORMAT_44S08) != 0) {
-                sampleRate = 44100;
-            } else if ((dwFormats & WAVE_FORMAT_2S08) != 0) {
-                sampleRate = 22050;
-            } else if ((dwFormats & WAVE_FORMAT_1S08) != 0) {
-                sampleRate = 11025;
-            } else if ((dwFormats & WAVE_FORMAT_96S08) != 0) {
-                sampleRate = 96000;
-            } else {
-                return MA_FORMAT_NOT_SUPPORTED;
-            }
-        }
-    }
-
-    if (pBitsPerSample) {
-        *pBitsPerSample = bitsPerSample;
-    }
-    if (pSampleRate) {
-        *pSampleRate = sampleRate;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_formats_flags_to_WAVEFORMATEX__winmm(DWORD dwFormats, WORD channels, MA_WAVEFORMATEX* pWF)
-{
-    ma_result result;
-
-    MA_ASSERT(pWF != NULL);
-
-    MA_ZERO_OBJECT(pWF);
-    pWF->cbSize     = sizeof(*pWF);
-    pWF->wFormatTag = WAVE_FORMAT_PCM;
-    pWF->nChannels  = (WORD)channels;
-    if (pWF->nChannels > 2) {
-        pWF->nChannels = 2;
-    }
-
-    result = ma_get_best_info_from_formats_flags__winmm(dwFormats, channels, &pWF->wBitsPerSample, &pWF->nSamplesPerSec);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pWF->nBlockAlign     = (WORD)(pWF->nChannels * pWF->wBitsPerSample / 8);
-    pWF->nAvgBytesPerSec = pWF->nBlockAlign * pWF->nSamplesPerSec;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info_from_WAVECAPS(ma_context* pContext, MA_WAVECAPSA* pCaps, ma_device_info* pDeviceInfo)
-{
-    WORD bitsPerSample;
-    DWORD sampleRate;
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pCaps != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    /*
-    Name / Description
-
-    Unfortunately the name specified in WAVE(OUT/IN)CAPS2 is limited to 31 characters. This results in an unprofessional looking
-    situation where the names of the devices are truncated. To help work around this, we need to look at the name GUID and try
-    looking in the registry for the full name. If we can't find it there, we need to just fall back to the default name.
-    */
-
-    /* Set the default to begin with. */
-    ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), pCaps->szPname, (size_t)-1);
-
-    /*
-    Now try the registry. There's a few things to consider here:
-    - The name GUID can be null, in which we case we just need to stick to the original 31 characters.
-    - If the name GUID is not present in the registry we'll also need to stick to the original 31 characters.
-    - I like consistency, so I want the returned device names to be consistent with those returned by WASAPI and DirectSound. The
-      problem, however is that WASAPI and DirectSound use "<component> (<name>)" format (such as "Speakers (High Definition Audio)"),
-      but WinMM does not specificy the component name. From my admittedly limited testing, I've notice the component name seems to
-      usually fit within the 31 characters of the fixed sized buffer, so what I'm going to do is parse that string for the component
-      name, and then concatenate the name from the registry.
-    */
-    if (!ma_is_guid_null(&pCaps->NameGuid)) {
-        WCHAR guidStrW[256];
-        if (((MA_PFN_StringFromGUID2)pContext->win32.StringFromGUID2)(&pCaps->NameGuid, guidStrW, ma_countof(guidStrW)) > 0) {
-            char guidStr[256];
-            char keyStr[1024];
-            HKEY hKey;
-
-            WideCharToMultiByte(CP_UTF8, 0, guidStrW, -1, guidStr, sizeof(guidStr), 0, FALSE);
-
-            ma_strcpy_s(keyStr, sizeof(keyStr), "SYSTEM\\CurrentControlSet\\Control\\MediaCategories\\");
-            ma_strcat_s(keyStr, sizeof(keyStr), guidStr);
-
-            if (((MA_PFN_RegOpenKeyExA)pContext->win32.RegOpenKeyExA)(HKEY_LOCAL_MACHINE, keyStr, 0, KEY_READ, &hKey) == ERROR_SUCCESS) {
-                BYTE nameFromReg[512];
-                DWORD nameFromRegSize = sizeof(nameFromReg);
-                LONG resultWin32 = ((MA_PFN_RegQueryValueExA)pContext->win32.RegQueryValueExA)(hKey, "Name", 0, NULL, (BYTE*)nameFromReg, (DWORD*)&nameFromRegSize);
-                ((MA_PFN_RegCloseKey)pContext->win32.RegCloseKey)(hKey);
-
-                if (resultWin32 == ERROR_SUCCESS) {
-                    /* We have the value from the registry, so now we need to construct the name string. */
-                    char name[1024];
-                    if (ma_strcpy_s(name, sizeof(name), pDeviceInfo->name) == 0) {
-                        char* nameBeg = ma_find_last_character(name, '(');
-                        if (nameBeg != NULL) {
-                            size_t leadingLen = (nameBeg - name);
-                            ma_strncpy_s(nameBeg + 1, sizeof(name) - leadingLen, (const char*)nameFromReg, (size_t)-1);
-
-                            /* The closing ")", if it can fit. */
-                            if (leadingLen + nameFromRegSize < sizeof(name)-1) {
-                                ma_strcat_s(name, sizeof(name), ")");
-                            }
-
-                            ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), name, (size_t)-1);
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-
-    result = ma_get_best_info_from_formats_flags__winmm(pCaps->dwFormats, pCaps->wChannels, &bitsPerSample, &sampleRate);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (bitsPerSample == 8) {
-        pDeviceInfo->nativeDataFormats[0].format = ma_format_u8;
-    } else if (bitsPerSample == 16) {
-        pDeviceInfo->nativeDataFormats[0].format = ma_format_s16;
-    } else if (bitsPerSample == 24) {
-        pDeviceInfo->nativeDataFormats[0].format = ma_format_s24;
-    } else if (bitsPerSample == 32) {
-        pDeviceInfo->nativeDataFormats[0].format = ma_format_s32;
-    } else {
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-    pDeviceInfo->nativeDataFormats[0].channels   = pCaps->wChannels;
-    pDeviceInfo->nativeDataFormats[0].sampleRate = sampleRate;
-    pDeviceInfo->nativeDataFormats[0].flags      = 0;
-    pDeviceInfo->nativeDataFormatCount = 1;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info_from_WAVEOUTCAPS2(ma_context* pContext, MA_WAVEOUTCAPS2A* pCaps, ma_device_info* pDeviceInfo)
-{
-    MA_WAVECAPSA caps;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pCaps != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    MA_COPY_MEMORY(caps.szPname, pCaps->szPname, sizeof(caps.szPname));
-    caps.dwFormats = pCaps->dwFormats;
-    caps.wChannels = pCaps->wChannels;
-    caps.NameGuid  = pCaps->NameGuid;
-    return ma_context_get_device_info_from_WAVECAPS(pContext, &caps, pDeviceInfo);
-}
-
-static ma_result ma_context_get_device_info_from_WAVEINCAPS2(ma_context* pContext, MA_WAVEINCAPS2A* pCaps, ma_device_info* pDeviceInfo)
-{
-    MA_WAVECAPSA caps;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pCaps != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    MA_COPY_MEMORY(caps.szPname, pCaps->szPname, sizeof(caps.szPname));
-    caps.dwFormats = pCaps->dwFormats;
-    caps.wChannels = pCaps->wChannels;
-    caps.NameGuid  = pCaps->NameGuid;
-    return ma_context_get_device_info_from_WAVECAPS(pContext, &caps, pDeviceInfo);
-}
-
-
-static ma_result ma_context_enumerate_devices__winmm(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    UINT playbackDeviceCount;
-    UINT captureDeviceCount;
-    UINT iPlaybackDevice;
-    UINT iCaptureDevice;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /* Playback. */
-    playbackDeviceCount = ((MA_PFN_waveOutGetNumDevs)pContext->winmm.waveOutGetNumDevs)();
-    for (iPlaybackDevice = 0; iPlaybackDevice < playbackDeviceCount; ++iPlaybackDevice) {
-        MA_MMRESULT result;
-        MA_WAVEOUTCAPS2A caps;
-
-        MA_ZERO_OBJECT(&caps);
-
-        result = ((MA_PFN_waveOutGetDevCapsA)pContext->winmm.waveOutGetDevCapsA)(iPlaybackDevice, (MA_WAVEOUTCAPSA*)&caps, sizeof(caps));
-        if (result == MA_MMSYSERR_NOERROR) {
-            ma_device_info deviceInfo;
-
-            MA_ZERO_OBJECT(&deviceInfo);
-            deviceInfo.id.winmm = iPlaybackDevice;
-
-            /* The first enumerated device is the default device. */
-            if (iPlaybackDevice == 0) {
-                deviceInfo.isDefault = MA_TRUE;
-            }
-
-            if (ma_context_get_device_info_from_WAVEOUTCAPS2(pContext, &caps, &deviceInfo) == MA_SUCCESS) {
-                ma_bool32 cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-                if (cbResult == MA_FALSE) {
-                    return MA_SUCCESS; /* Enumeration was stopped. */
-                }
-            }
-        }
-    }
-
-    /* Capture. */
-    captureDeviceCount = ((MA_PFN_waveInGetNumDevs)pContext->winmm.waveInGetNumDevs)();
-    for (iCaptureDevice = 0; iCaptureDevice < captureDeviceCount; ++iCaptureDevice) {
-        MA_MMRESULT result;
-        MA_WAVEINCAPS2A caps;
-
-        MA_ZERO_OBJECT(&caps);
-
-        result = ((MA_PFN_waveInGetDevCapsA)pContext->winmm.waveInGetDevCapsA)(iCaptureDevice, (MA_WAVEINCAPSA*)&caps, sizeof(caps));
-        if (result == MA_MMSYSERR_NOERROR) {
-            ma_device_info deviceInfo;
-
-            MA_ZERO_OBJECT(&deviceInfo);
-            deviceInfo.id.winmm = iCaptureDevice;
-
-            /* The first enumerated device is the default device. */
-            if (iCaptureDevice == 0) {
-                deviceInfo.isDefault = MA_TRUE;
-            }
-
-            if (ma_context_get_device_info_from_WAVEINCAPS2(pContext, &caps, &deviceInfo) == MA_SUCCESS) {
-                ma_bool32 cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-                if (cbResult == MA_FALSE) {
-                    return MA_SUCCESS; /* Enumeration was stopped. */
-                }
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__winmm(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    UINT winMMDeviceID;
-
-    MA_ASSERT(pContext != NULL);
-
-    winMMDeviceID = 0;
-    if (pDeviceID != NULL) {
-        winMMDeviceID = (UINT)pDeviceID->winmm;
-    }
-
-    pDeviceInfo->id.winmm = winMMDeviceID;
-
-    /* The first ID is the default device. */
-    if (winMMDeviceID == 0) {
-        pDeviceInfo->isDefault = MA_TRUE;
-    }
-
-    if (deviceType == ma_device_type_playback) {
-        MA_MMRESULT result;
-        MA_WAVEOUTCAPS2A caps;
-
-        MA_ZERO_OBJECT(&caps);
-
-        result = ((MA_PFN_waveOutGetDevCapsA)pContext->winmm.waveOutGetDevCapsA)(winMMDeviceID, (MA_WAVEOUTCAPSA*)&caps, sizeof(caps));
-        if (result == MA_MMSYSERR_NOERROR) {
-            return ma_context_get_device_info_from_WAVEOUTCAPS2(pContext, &caps, pDeviceInfo);
-        }
-    } else {
-        MA_MMRESULT result;
-        MA_WAVEINCAPS2A caps;
-
-        MA_ZERO_OBJECT(&caps);
-
-        result = ((MA_PFN_waveInGetDevCapsA)pContext->winmm.waveInGetDevCapsA)(winMMDeviceID, (MA_WAVEINCAPSA*)&caps, sizeof(caps));
-        if (result == MA_MMSYSERR_NOERROR) {
-            return ma_context_get_device_info_from_WAVEINCAPS2(pContext, &caps, pDeviceInfo);
-        }
-    }
-
-    return MA_NO_DEVICE;
-}
-
-
-static ma_result ma_device_uninit__winmm(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ((MA_PFN_waveInClose)pDevice->pContext->winmm.waveInClose)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture);
-        CloseHandle((HANDLE)pDevice->winmm.hEventCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ((MA_PFN_waveOutReset)pDevice->pContext->winmm.waveOutReset)((MA_HWAVEOUT)pDevice->winmm.hDevicePlayback);
-        ((MA_PFN_waveOutClose)pDevice->pContext->winmm.waveOutClose)((MA_HWAVEOUT)pDevice->winmm.hDevicePlayback);
-        CloseHandle((HANDLE)pDevice->winmm.hEventPlayback);
-    }
-
-    ma_free(pDevice->winmm._pHeapData, &pDevice->pContext->allocationCallbacks);
-
-    MA_ZERO_OBJECT(&pDevice->winmm);   /* Safety. */
-
-    return MA_SUCCESS;
-}
-
-static ma_uint32 ma_calculate_period_size_in_frames_from_descriptor__winmm(const ma_device_descriptor* pDescriptor, ma_uint32 nativeSampleRate, ma_performance_profile performanceProfile)
-{
-    /* WinMM has a minimum period size of 40ms. */
-    ma_uint32 minPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(40, nativeSampleRate);
-    ma_uint32 periodSizeInFrames;
-
-    periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, nativeSampleRate, performanceProfile);
-    if (periodSizeInFrames < minPeriodSizeInFrames) {
-        periodSizeInFrames = minPeriodSizeInFrames;
-    }
-
-    return periodSizeInFrames;
-}
-
-static ma_result ma_device_init__winmm(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    const char* errorMsg = "";
-    ma_result errorCode = MA_ERROR;
-    ma_result result = MA_SUCCESS;
-    ma_uint32 heapSize;
-    UINT winMMDeviceIDPlayback = 0;
-    UINT winMMDeviceIDCapture  = 0;
-
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->winmm);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /* No exlusive mode with WinMM. */
-    if (((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pDescriptorPlayback->shareMode == ma_share_mode_exclusive) ||
-        ((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pDescriptorCapture->shareMode  == ma_share_mode_exclusive)) {
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-
-    if (pDescriptorPlayback->pDeviceID != NULL) {
-        winMMDeviceIDPlayback = (UINT)pDescriptorPlayback->pDeviceID->winmm;
-    }
-    if (pDescriptorCapture->pDeviceID != NULL) {
-        winMMDeviceIDCapture = (UINT)pDescriptorCapture->pDeviceID->winmm;
-    }
-
-    /* The capture device needs to be initialized first. */
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        MA_WAVEINCAPSA caps;
-        MA_WAVEFORMATEX wf;
-        MA_MMRESULT resultMM;
-
-        /* We use an event to know when a new fragment needs to be enqueued. */
-        pDevice->winmm.hEventCapture = (ma_handle)CreateEventA(NULL, TRUE, TRUE, NULL);
-        if (pDevice->winmm.hEventCapture == NULL) {
-            errorMsg = "[WinMM] Failed to create event for fragment enqueing for the capture device.", errorCode = ma_result_from_GetLastError(GetLastError());
-            goto on_error;
-        }
-
-        /* The format should be based on the device's actual format. */
-        if (((MA_PFN_waveInGetDevCapsA)pDevice->pContext->winmm.waveInGetDevCapsA)(winMMDeviceIDCapture, &caps, sizeof(caps)) != MA_MMSYSERR_NOERROR) {
-            errorMsg = "[WinMM] Failed to retrieve internal device caps.", errorCode = MA_FORMAT_NOT_SUPPORTED;
-            goto on_error;
-        }
-
-        result = ma_formats_flags_to_WAVEFORMATEX__winmm(caps.dwFormats, caps.wChannels, &wf);
-        if (result != MA_SUCCESS) {
-            errorMsg = "[WinMM] Could not find appropriate format for internal device.", errorCode = result;
-            goto on_error;
-        }
-
-        resultMM = ((MA_PFN_waveInOpen)pDevice->pContext->winmm.waveInOpen)((MA_HWAVEIN*)&pDevice->winmm.hDeviceCapture, winMMDeviceIDCapture, &wf, (DWORD_PTR)pDevice->winmm.hEventCapture, (DWORD_PTR)pDevice, MA_CALLBACK_EVENT | MA_WAVE_ALLOWSYNC);
-        if (resultMM != MA_MMSYSERR_NOERROR) {
-            errorMsg = "[WinMM] Failed to open capture device.", errorCode = MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-            goto on_error;
-        }
-
-        pDescriptorCapture->format             = ma_format_from_WAVEFORMATEX(&wf);
-        pDescriptorCapture->channels           = wf.nChannels;
-        pDescriptorCapture->sampleRate         = wf.nSamplesPerSec;
-        ma_channel_map_init_standard(ma_standard_channel_map_microsoft, pDescriptorCapture->channelMap, ma_countof(pDescriptorCapture->channelMap), pDescriptorCapture->channels);
-        pDescriptorCapture->periodCount        = pDescriptorCapture->periodCount;
-        pDescriptorCapture->periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__winmm(pDescriptorCapture, pDescriptorCapture->sampleRate, pConfig->performanceProfile);
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        MA_WAVEOUTCAPSA caps;
-        MA_WAVEFORMATEX wf;
-        MA_MMRESULT resultMM;
-
-        /* We use an event to know when a new fragment needs to be enqueued. */
-        pDevice->winmm.hEventPlayback = (ma_handle)CreateEventA(NULL, TRUE, TRUE, NULL);
-        if (pDevice->winmm.hEventPlayback == NULL) {
-            errorMsg = "[WinMM] Failed to create event for fragment enqueing for the playback device.", errorCode = ma_result_from_GetLastError(GetLastError());
-            goto on_error;
-        }
-
-        /* The format should be based on the device's actual format. */
-        if (((MA_PFN_waveOutGetDevCapsA)pDevice->pContext->winmm.waveOutGetDevCapsA)(winMMDeviceIDPlayback, &caps, sizeof(caps)) != MA_MMSYSERR_NOERROR) {
-            errorMsg = "[WinMM] Failed to retrieve internal device caps.", errorCode = MA_FORMAT_NOT_SUPPORTED;
-            goto on_error;
-        }
-
-        result = ma_formats_flags_to_WAVEFORMATEX__winmm(caps.dwFormats, caps.wChannels, &wf);
-        if (result != MA_SUCCESS) {
-            errorMsg = "[WinMM] Could not find appropriate format for internal device.", errorCode = result;
-            goto on_error;
-        }
-
-        resultMM = ((MA_PFN_waveOutOpen)pDevice->pContext->winmm.waveOutOpen)((MA_HWAVEOUT*)&pDevice->winmm.hDevicePlayback, winMMDeviceIDPlayback, &wf, (DWORD_PTR)pDevice->winmm.hEventPlayback, (DWORD_PTR)pDevice, MA_CALLBACK_EVENT | MA_WAVE_ALLOWSYNC);
-        if (resultMM != MA_MMSYSERR_NOERROR) {
-            errorMsg = "[WinMM] Failed to open playback device.", errorCode = MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-            goto on_error;
-        }
-
-        pDescriptorPlayback->format             = ma_format_from_WAVEFORMATEX(&wf);
-        pDescriptorPlayback->channels           = wf.nChannels;
-        pDescriptorPlayback->sampleRate         = wf.nSamplesPerSec;
-        ma_channel_map_init_standard(ma_standard_channel_map_microsoft, pDescriptorPlayback->channelMap, ma_countof(pDescriptorPlayback->channelMap), pDescriptorPlayback->channels);
-        pDescriptorPlayback->periodCount        = pDescriptorPlayback->periodCount;
-        pDescriptorPlayback->periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__winmm(pDescriptorPlayback, pDescriptorPlayback->sampleRate, pConfig->performanceProfile);
-    }
-
-    /*
-    The heap allocated data is allocated like so:
-
-    [Capture WAVEHDRs][Playback WAVEHDRs][Capture Intermediary Buffer][Playback Intermediary Buffer]
-    */
-    heapSize = 0;
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        heapSize += sizeof(MA_WAVEHDR)*pDescriptorCapture->periodCount + (pDescriptorCapture->periodSizeInFrames * pDescriptorCapture->periodCount * ma_get_bytes_per_frame(pDescriptorCapture->format, pDescriptorCapture->channels));
-    }
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        heapSize += sizeof(MA_WAVEHDR)*pDescriptorPlayback->periodCount + (pDescriptorPlayback->periodSizeInFrames * pDescriptorPlayback->periodCount * ma_get_bytes_per_frame(pDescriptorPlayback->format, pDescriptorPlayback->channels));
-    }
-
-    pDevice->winmm._pHeapData = (ma_uint8*)ma_calloc(heapSize, &pDevice->pContext->allocationCallbacks);
-    if (pDevice->winmm._pHeapData == NULL) {
-        errorMsg = "[WinMM] Failed to allocate memory for the intermediary buffer.", errorCode = MA_OUT_OF_MEMORY;
-        goto on_error;
-    }
-
-    MA_ZERO_MEMORY(pDevice->winmm._pHeapData, heapSize);
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_uint32 iPeriod;
-
-        if (pConfig->deviceType == ma_device_type_capture) {
-            pDevice->winmm.pWAVEHDRCapture            = pDevice->winmm._pHeapData;
-            pDevice->winmm.pIntermediaryBufferCapture = pDevice->winmm._pHeapData + (sizeof(MA_WAVEHDR)*(pDescriptorCapture->periodCount));
-        } else {
-            pDevice->winmm.pWAVEHDRCapture            = pDevice->winmm._pHeapData;
-            pDevice->winmm.pIntermediaryBufferCapture = pDevice->winmm._pHeapData + (sizeof(MA_WAVEHDR)*(pDescriptorCapture->periodCount + pDescriptorPlayback->periodCount));
-        }
-
-        /* Prepare headers. */
-        for (iPeriod = 0; iPeriod < pDescriptorCapture->periodCount; ++iPeriod) {
-            ma_uint32 periodSizeInBytes = ma_get_period_size_in_bytes(pDescriptorCapture->periodSizeInFrames, pDescriptorCapture->format, pDescriptorCapture->channels);
-
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod].lpData         = (char*)(pDevice->winmm.pIntermediaryBufferCapture + (periodSizeInBytes*iPeriod));
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod].dwBufferLength = periodSizeInBytes;
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod].dwFlags        = 0L;
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod].dwLoops        = 0L;
-            ((MA_PFN_waveInPrepareHeader)pDevice->pContext->winmm.waveInPrepareHeader)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture, &((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod], sizeof(MA_WAVEHDR));
-
-            /*
-            The user data of the MA_WAVEHDR structure is a single flag the controls whether or not it is ready for writing. Consider it to be named "isLocked". A value of 0 means
-            it's unlocked and available for writing. A value of 1 means it's locked.
-            */
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod].dwUser = 0;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_uint32 iPeriod;
-
-        if (pConfig->deviceType == ma_device_type_playback) {
-            pDevice->winmm.pWAVEHDRPlayback            = pDevice->winmm._pHeapData;
-            pDevice->winmm.pIntermediaryBufferPlayback = pDevice->winmm._pHeapData + (sizeof(MA_WAVEHDR)*pDescriptorPlayback->periodCount);
-        } else {
-            pDevice->winmm.pWAVEHDRPlayback            = pDevice->winmm._pHeapData + (sizeof(MA_WAVEHDR)*(pDescriptorCapture->periodCount));
-            pDevice->winmm.pIntermediaryBufferPlayback = pDevice->winmm._pHeapData + (sizeof(MA_WAVEHDR)*(pDescriptorCapture->periodCount + pDescriptorPlayback->periodCount)) + (pDescriptorCapture->periodSizeInFrames*pDescriptorCapture->periodCount*ma_get_bytes_per_frame(pDescriptorCapture->format, pDescriptorCapture->channels));
-        }
-
-        /* Prepare headers. */
-        for (iPeriod = 0; iPeriod < pDescriptorPlayback->periodCount; ++iPeriod) {
-            ma_uint32 periodSizeInBytes = ma_get_period_size_in_bytes(pDescriptorPlayback->periodSizeInFrames, pDescriptorPlayback->format, pDescriptorPlayback->channels);
-
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback)[iPeriod].lpData         = (char*)(pDevice->winmm.pIntermediaryBufferPlayback + (periodSizeInBytes*iPeriod));
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback)[iPeriod].dwBufferLength = periodSizeInBytes;
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback)[iPeriod].dwFlags        = 0L;
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback)[iPeriod].dwLoops        = 0L;
-            ((MA_PFN_waveOutPrepareHeader)pDevice->pContext->winmm.waveOutPrepareHeader)((MA_HWAVEOUT)pDevice->winmm.hDevicePlayback, &((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback)[iPeriod], sizeof(MA_WAVEHDR));
-
-            /*
-            The user data of the MA_WAVEHDR structure is a single flag the controls whether or not it is ready for writing. Consider it to be named "isLocked". A value of 0 means
-            it's unlocked and available for writing. A value of 1 means it's locked.
-            */
-            ((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback)[iPeriod].dwUser = 0;
-        }
-    }
-
-    return MA_SUCCESS;
-
-on_error:
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->winmm.pWAVEHDRCapture != NULL) {
-            ma_uint32 iPeriod;
-            for (iPeriod = 0; iPeriod < pDescriptorCapture->periodCount; ++iPeriod) {
-                ((MA_PFN_waveInUnprepareHeader)pDevice->pContext->winmm.waveInUnprepareHeader)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture, &((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod], sizeof(MA_WAVEHDR));
-            }
-        }
-
-        ((MA_PFN_waveInClose)pDevice->pContext->winmm.waveInClose)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->winmm.pWAVEHDRCapture != NULL) {
-            ma_uint32 iPeriod;
-            for (iPeriod = 0; iPeriod < pDescriptorPlayback->periodCount; ++iPeriod) {
-                ((MA_PFN_waveOutUnprepareHeader)pDevice->pContext->winmm.waveOutUnprepareHeader)((MA_HWAVEOUT)pDevice->winmm.hDevicePlayback, &((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback)[iPeriod], sizeof(MA_WAVEHDR));
-            }
-        }
-
-        ((MA_PFN_waveOutClose)pDevice->pContext->winmm.waveOutClose)((MA_HWAVEOUT)pDevice->winmm.hDevicePlayback);
-    }
-
-    ma_free(pDevice->winmm._pHeapData, &pDevice->pContext->allocationCallbacks);
-
-    if (errorMsg != NULL && errorMsg[0] != '\0') {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "%s", errorMsg);
-    }
-
-    return errorCode;
-}
-
-static ma_result ma_device_start__winmm(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        MA_MMRESULT resultMM;
-        MA_WAVEHDR* pWAVEHDR;
-        ma_uint32 iPeriod;
-
-        pWAVEHDR = (MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture;
-
-        /* Make sure the event is reset to a non-signaled state to ensure we don't prematurely return from WaitForSingleObject(). */
-        ResetEvent((HANDLE)pDevice->winmm.hEventCapture);
-
-        /* To start the device we attach all of the buffers and then start it. As the buffers are filled with data we will get notifications. */
-        for (iPeriod = 0; iPeriod < pDevice->capture.internalPeriods; ++iPeriod) {
-            resultMM = ((MA_PFN_waveInAddBuffer)pDevice->pContext->winmm.waveInAddBuffer)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture, &((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[iPeriod], sizeof(MA_WAVEHDR));
-            if (resultMM != MA_MMSYSERR_NOERROR) {
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WinMM] Failed to attach input buffers to capture device in preparation for capture.");
-                return ma_result_from_MMRESULT(resultMM);
-            }
-
-            /* Make sure all of the buffers start out locked. We don't want to access them until the backend tells us we can. */
-            pWAVEHDR[iPeriod].dwUser = 1;   /* 1 = locked. */
-        }
-
-        /* Capture devices need to be explicitly started, unlike playback devices. */
-        resultMM = ((MA_PFN_waveInStart)pDevice->pContext->winmm.waveInStart)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture);
-        if (resultMM != MA_MMSYSERR_NOERROR) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WinMM] Failed to start backend device.");
-            return ma_result_from_MMRESULT(resultMM);
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        /* Don't need to do anything for playback. It'll be started automatically in ma_device_start__winmm(). */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__winmm(ma_device* pDevice)
-{
-    MA_MMRESULT resultMM;
-
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->winmm.hDeviceCapture == NULL) {
-            return MA_INVALID_ARGS;
-        }
-
-        resultMM = ((MA_PFN_waveInReset)pDevice->pContext->winmm.waveInReset)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture);
-        if (resultMM != MA_MMSYSERR_NOERROR) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[WinMM] WARNING: Failed to reset capture device.");
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_uint32 iPeriod;
-        MA_WAVEHDR* pWAVEHDR;
-
-        if (pDevice->winmm.hDevicePlayback == NULL) {
-            return MA_INVALID_ARGS;
-        }
-
-        /* We need to drain the device. To do this we just loop over each header and if it's locked just wait for the event. */
-        pWAVEHDR = (MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback;
-        for (iPeriod = 0; iPeriod < pDevice->playback.internalPeriods; iPeriod += 1) {
-            if (pWAVEHDR[iPeriod].dwUser == 1) { /* 1 = locked. */
-                if (WaitForSingleObject((HANDLE)pDevice->winmm.hEventPlayback, INFINITE) != WAIT_OBJECT_0) {
-                    break;  /* An error occurred so just abandon ship and stop the device without draining. */
-                }
-
-                pWAVEHDR[iPeriod].dwUser = 0;
-            }
-        }
-
-        resultMM = ((MA_PFN_waveOutReset)pDevice->pContext->winmm.waveOutReset)((MA_HWAVEOUT)pDevice->winmm.hDevicePlayback);
-        if (resultMM != MA_MMSYSERR_NOERROR) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[WinMM] WARNING: Failed to reset playback device.");
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_write__winmm(ma_device* pDevice, const void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten)
-{
-    ma_result result = MA_SUCCESS;
-    MA_MMRESULT resultMM;
-    ma_uint32 totalFramesWritten;
-    MA_WAVEHDR* pWAVEHDR;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pPCMFrames != NULL);
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = 0;
-    }
-
-    pWAVEHDR = (MA_WAVEHDR*)pDevice->winmm.pWAVEHDRPlayback;
-
-    /* Keep processing as much data as possible. */
-    totalFramesWritten = 0;
-    while (totalFramesWritten < frameCount) {
-        /* If the current header has some space available we need to write part of it. */
-        if (pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].dwUser == 0) { /* 0 = unlocked. */
-            /*
-            This header has room in it. We copy as much of it as we can. If we end up fully consuming the buffer we need to
-            write it out and move on to the next iteration.
-            */
-            ma_uint32 bpf = ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-            ma_uint32 framesRemainingInHeader = (pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].dwBufferLength/bpf) - pDevice->winmm.headerFramesConsumedPlayback;
-
-            ma_uint32 framesToCopy = ma_min(framesRemainingInHeader, (frameCount - totalFramesWritten));
-            const void* pSrc = ma_offset_ptr(pPCMFrames, totalFramesWritten*bpf);
-            void* pDst = ma_offset_ptr(pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].lpData, pDevice->winmm.headerFramesConsumedPlayback*bpf);
-            MA_COPY_MEMORY(pDst, pSrc, framesToCopy*bpf);
-
-            pDevice->winmm.headerFramesConsumedPlayback += framesToCopy;
-            totalFramesWritten += framesToCopy;
-
-            /* If we've consumed the buffer entirely we need to write it out to the device. */
-            if (pDevice->winmm.headerFramesConsumedPlayback == (pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].dwBufferLength/bpf)) {
-                pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].dwUser = 1;            /* 1 = locked. */
-                pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].dwFlags &= ~MA_WHDR_DONE; /* <-- Need to make sure the WHDR_DONE flag is unset. */
-
-                /* Make sure the event is reset to a non-signaled state to ensure we don't prematurely return from WaitForSingleObject(). */
-                ResetEvent((HANDLE)pDevice->winmm.hEventPlayback);
-
-                /* The device will be started here. */
-                resultMM = ((MA_PFN_waveOutWrite)pDevice->pContext->winmm.waveOutWrite)((MA_HWAVEOUT)pDevice->winmm.hDevicePlayback, &pWAVEHDR[pDevice->winmm.iNextHeaderPlayback], sizeof(MA_WAVEHDR));
-                if (resultMM != MA_MMSYSERR_NOERROR) {
-                    result = ma_result_from_MMRESULT(resultMM);
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WinMM] waveOutWrite() failed.");
-                    break;
-                }
-
-                /* Make sure we move to the next header. */
-                pDevice->winmm.iNextHeaderPlayback = (pDevice->winmm.iNextHeaderPlayback + 1) % pDevice->playback.internalPeriods;
-                pDevice->winmm.headerFramesConsumedPlayback = 0;
-            }
-
-            /* If at this point we have consumed the entire input buffer we can return. */
-            MA_ASSERT(totalFramesWritten <= frameCount);
-            if (totalFramesWritten == frameCount) {
-                break;
-            }
-
-            /* Getting here means there's more to process. */
-            continue;
-        }
-
-        /* Getting here means there isn't enough room in the buffer and we need to wait for one to become available. */
-        if (WaitForSingleObject((HANDLE)pDevice->winmm.hEventPlayback, INFINITE) != WAIT_OBJECT_0) {
-            result = MA_ERROR;
-            break;
-        }
-
-        /* Something happened. If the next buffer has been marked as done we need to reset a bit of state. */
-        if ((pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].dwFlags & MA_WHDR_DONE) != 0) {
-            pWAVEHDR[pDevice->winmm.iNextHeaderPlayback].dwUser = 0;    /* 0 = unlocked (make it available for writing). */
-            pDevice->winmm.headerFramesConsumedPlayback = 0;
-        }
-
-        /* If the device has been stopped we need to break. */
-        if (ma_device_get_state(pDevice) != ma_device_state_started) {
-            break;
-        }
-    }
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = totalFramesWritten;
-    }
-
-    return result;
-}
-
-static ma_result ma_device_read__winmm(ma_device* pDevice, void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    MA_MMRESULT resultMM;
-    ma_uint32 totalFramesRead;
-    MA_WAVEHDR* pWAVEHDR;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pPCMFrames != NULL);
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    pWAVEHDR = (MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture;
-
-    /* Keep processing as much data as possible. */
-    totalFramesRead = 0;
-    while (totalFramesRead < frameCount) {
-        /* If the current header has some space available we need to write part of it. */
-        if (pWAVEHDR[pDevice->winmm.iNextHeaderCapture].dwUser == 0) { /* 0 = unlocked. */
-            /* The buffer is available for reading. If we fully consume it we need to add it back to the buffer. */
-            ma_uint32 bpf = ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-            ma_uint32 framesRemainingInHeader = (pWAVEHDR[pDevice->winmm.iNextHeaderCapture].dwBufferLength/bpf) - pDevice->winmm.headerFramesConsumedCapture;
-
-            ma_uint32 framesToCopy = ma_min(framesRemainingInHeader, (frameCount - totalFramesRead));
-            const void* pSrc = ma_offset_ptr(pWAVEHDR[pDevice->winmm.iNextHeaderCapture].lpData, pDevice->winmm.headerFramesConsumedCapture*bpf);
-            void* pDst = ma_offset_ptr(pPCMFrames, totalFramesRead*bpf);
-            MA_COPY_MEMORY(pDst, pSrc, framesToCopy*bpf);
-
-            pDevice->winmm.headerFramesConsumedCapture += framesToCopy;
-            totalFramesRead += framesToCopy;
-
-            /* If we've consumed the buffer entirely we need to add it back to the device. */
-            if (pDevice->winmm.headerFramesConsumedCapture == (pWAVEHDR[pDevice->winmm.iNextHeaderCapture].dwBufferLength/bpf)) {
-                pWAVEHDR[pDevice->winmm.iNextHeaderCapture].dwUser = 1;            /* 1 = locked. */
-                pWAVEHDR[pDevice->winmm.iNextHeaderCapture].dwFlags &= ~MA_WHDR_DONE; /* <-- Need to make sure the WHDR_DONE flag is unset. */
-
-                /* Make sure the event is reset to a non-signaled state to ensure we don't prematurely return from WaitForSingleObject(). */
-                ResetEvent((HANDLE)pDevice->winmm.hEventCapture);
-
-                /* The device will be started here. */
-                resultMM = ((MA_PFN_waveInAddBuffer)pDevice->pContext->winmm.waveInAddBuffer)((MA_HWAVEIN)pDevice->winmm.hDeviceCapture, &((MA_WAVEHDR*)pDevice->winmm.pWAVEHDRCapture)[pDevice->winmm.iNextHeaderCapture], sizeof(MA_WAVEHDR));
-                if (resultMM != MA_MMSYSERR_NOERROR) {
-                    result = ma_result_from_MMRESULT(resultMM);
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[WinMM] waveInAddBuffer() failed.");
-                    break;
-                }
-
-                /* Make sure we move to the next header. */
-                pDevice->winmm.iNextHeaderCapture = (pDevice->winmm.iNextHeaderCapture + 1) % pDevice->capture.internalPeriods;
-                pDevice->winmm.headerFramesConsumedCapture = 0;
-            }
-
-            /* If at this point we have filled the entire input buffer we can return. */
-            MA_ASSERT(totalFramesRead <= frameCount);
-            if (totalFramesRead == frameCount) {
-                break;
-            }
-
-            /* Getting here means there's more to process. */
-            continue;
-        }
-
-        /* Getting here means there isn't enough any data left to send to the client which means we need to wait for more. */
-        if (WaitForSingleObject((HANDLE)pDevice->winmm.hEventCapture, INFINITE) != WAIT_OBJECT_0) {
-            result = MA_ERROR;
-            break;
-        }
-
-        /* Something happened. If the next buffer has been marked as done we need to reset a bit of state. */
-        if ((pWAVEHDR[pDevice->winmm.iNextHeaderCapture].dwFlags & MA_WHDR_DONE) != 0) {
-            pWAVEHDR[pDevice->winmm.iNextHeaderCapture].dwUser = 0;    /* 0 = unlocked (make it available for reading). */
-            pDevice->winmm.headerFramesConsumedCapture = 0;
-        }
-
-        /* If the device has been stopped we need to break. */
-        if (ma_device_get_state(pDevice) != ma_device_state_started) {
-            break;
-        }
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalFramesRead;
-    }
-
-    return result;
-}
-
-static ma_result ma_context_uninit__winmm(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_winmm);
-
-    ma_dlclose(ma_context_get_log(pContext), pContext->winmm.hWinMM);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__winmm(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig;
-
-    pContext->winmm.hWinMM = ma_dlopen(ma_context_get_log(pContext), "winmm.dll");
-    if (pContext->winmm.hWinMM == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    pContext->winmm.waveOutGetNumDevs      = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutGetNumDevs");
-    pContext->winmm.waveOutGetDevCapsA     = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutGetDevCapsA");
-    pContext->winmm.waveOutOpen            = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutOpen");
-    pContext->winmm.waveOutClose           = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutClose");
-    pContext->winmm.waveOutPrepareHeader   = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutPrepareHeader");
-    pContext->winmm.waveOutUnprepareHeader = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutUnprepareHeader");
-    pContext->winmm.waveOutWrite           = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutWrite");
-    pContext->winmm.waveOutReset           = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveOutReset");
-    pContext->winmm.waveInGetNumDevs       = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInGetNumDevs");
-    pContext->winmm.waveInGetDevCapsA      = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInGetDevCapsA");
-    pContext->winmm.waveInOpen             = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInOpen");
-    pContext->winmm.waveInClose            = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInClose");
-    pContext->winmm.waveInPrepareHeader    = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInPrepareHeader");
-    pContext->winmm.waveInUnprepareHeader  = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInUnprepareHeader");
-    pContext->winmm.waveInAddBuffer        = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInAddBuffer");
-    pContext->winmm.waveInStart            = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInStart");
-    pContext->winmm.waveInReset            = ma_dlsym(ma_context_get_log(pContext), pContext->winmm.hWinMM, "waveInReset");
-
-    pCallbacks->onContextInit             = ma_context_init__winmm;
-    pCallbacks->onContextUninit           = ma_context_uninit__winmm;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__winmm;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__winmm;
-    pCallbacks->onDeviceInit              = ma_device_init__winmm;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__winmm;
-    pCallbacks->onDeviceStart             = ma_device_start__winmm;
-    pCallbacks->onDeviceStop              = ma_device_stop__winmm;
-    pCallbacks->onDeviceRead              = ma_device_read__winmm;
-    pCallbacks->onDeviceWrite             = ma_device_write__winmm;
-    pCallbacks->onDeviceDataLoop          = NULL;   /* This is a blocking read-write API, so this can be NULL since miniaudio will manage the audio thread for us. */
-
-    return MA_SUCCESS;
-}
-#endif
-
-
-
-
-/******************************************************************************
-
-ALSA Backend
-
-******************************************************************************/
-#ifdef MA_HAS_ALSA
-
-#include <poll.h>           /* poll(), struct pollfd */
-#include <sys/eventfd.h>    /* eventfd() */
-
-#ifdef MA_NO_RUNTIME_LINKING
-
-/* asoundlib.h marks some functions with "inline" which isn't always supported. Need to emulate it. */
-#if !defined(__cplusplus)
-    #if defined(__STRICT_ANSI__)
-        #if !defined(inline)
-            #define inline __inline__ __attribute__((always_inline))
-            #define MA_INLINE_DEFINED
-        #endif
-    #endif
-#endif
-#include <alsa/asoundlib.h>
-#if defined(MA_INLINE_DEFINED)
-    #undef inline
-    #undef MA_INLINE_DEFINED
-#endif
-
-typedef snd_pcm_uframes_t                       ma_snd_pcm_uframes_t;
-typedef snd_pcm_sframes_t                       ma_snd_pcm_sframes_t;
-typedef snd_pcm_stream_t                        ma_snd_pcm_stream_t;
-typedef snd_pcm_format_t                        ma_snd_pcm_format_t;
-typedef snd_pcm_access_t                        ma_snd_pcm_access_t;
-typedef snd_pcm_t                               ma_snd_pcm_t;
-typedef snd_pcm_hw_params_t                     ma_snd_pcm_hw_params_t;
-typedef snd_pcm_sw_params_t                     ma_snd_pcm_sw_params_t;
-typedef snd_pcm_format_mask_t                   ma_snd_pcm_format_mask_t;
-typedef snd_pcm_info_t                          ma_snd_pcm_info_t;
-typedef snd_pcm_channel_area_t                  ma_snd_pcm_channel_area_t;
-typedef snd_pcm_chmap_t                         ma_snd_pcm_chmap_t;
-typedef snd_pcm_state_t                         ma_snd_pcm_state_t;
-
-/* snd_pcm_stream_t */
-#define MA_SND_PCM_STREAM_PLAYBACK              SND_PCM_STREAM_PLAYBACK
-#define MA_SND_PCM_STREAM_CAPTURE               SND_PCM_STREAM_CAPTURE
-
-/* snd_pcm_format_t */
-#define MA_SND_PCM_FORMAT_UNKNOWN               SND_PCM_FORMAT_UNKNOWN
-#define MA_SND_PCM_FORMAT_U8                    SND_PCM_FORMAT_U8
-#define MA_SND_PCM_FORMAT_S16_LE                SND_PCM_FORMAT_S16_LE
-#define MA_SND_PCM_FORMAT_S16_BE                SND_PCM_FORMAT_S16_BE
-#define MA_SND_PCM_FORMAT_S24_LE                SND_PCM_FORMAT_S24_LE
-#define MA_SND_PCM_FORMAT_S24_BE                SND_PCM_FORMAT_S24_BE
-#define MA_SND_PCM_FORMAT_S32_LE                SND_PCM_FORMAT_S32_LE
-#define MA_SND_PCM_FORMAT_S32_BE                SND_PCM_FORMAT_S32_BE
-#define MA_SND_PCM_FORMAT_FLOAT_LE              SND_PCM_FORMAT_FLOAT_LE
-#define MA_SND_PCM_FORMAT_FLOAT_BE              SND_PCM_FORMAT_FLOAT_BE
-#define MA_SND_PCM_FORMAT_FLOAT64_LE            SND_PCM_FORMAT_FLOAT64_LE
-#define MA_SND_PCM_FORMAT_FLOAT64_BE            SND_PCM_FORMAT_FLOAT64_BE
-#define MA_SND_PCM_FORMAT_MU_LAW                SND_PCM_FORMAT_MU_LAW
-#define MA_SND_PCM_FORMAT_A_LAW                 SND_PCM_FORMAT_A_LAW
-#define MA_SND_PCM_FORMAT_S24_3LE               SND_PCM_FORMAT_S24_3LE
-#define MA_SND_PCM_FORMAT_S24_3BE               SND_PCM_FORMAT_S24_3BE
-
-/* ma_snd_pcm_access_t */
-#define MA_SND_PCM_ACCESS_MMAP_INTERLEAVED      SND_PCM_ACCESS_MMAP_INTERLEAVED
-#define MA_SND_PCM_ACCESS_MMAP_NONINTERLEAVED   SND_PCM_ACCESS_MMAP_NONINTERLEAVED
-#define MA_SND_PCM_ACCESS_MMAP_COMPLEX          SND_PCM_ACCESS_MMAP_COMPLEX
-#define MA_SND_PCM_ACCESS_RW_INTERLEAVED        SND_PCM_ACCESS_RW_INTERLEAVED
-#define MA_SND_PCM_ACCESS_RW_NONINTERLEAVED     SND_PCM_ACCESS_RW_NONINTERLEAVED
-
-/* Channel positions. */
-#define MA_SND_CHMAP_UNKNOWN                    SND_CHMAP_UNKNOWN
-#define MA_SND_CHMAP_NA                         SND_CHMAP_NA
-#define MA_SND_CHMAP_MONO                       SND_CHMAP_MONO
-#define MA_SND_CHMAP_FL                         SND_CHMAP_FL
-#define MA_SND_CHMAP_FR                         SND_CHMAP_FR
-#define MA_SND_CHMAP_RL                         SND_CHMAP_RL
-#define MA_SND_CHMAP_RR                         SND_CHMAP_RR
-#define MA_SND_CHMAP_FC                         SND_CHMAP_FC
-#define MA_SND_CHMAP_LFE                        SND_CHMAP_LFE
-#define MA_SND_CHMAP_SL                         SND_CHMAP_SL
-#define MA_SND_CHMAP_SR                         SND_CHMAP_SR
-#define MA_SND_CHMAP_RC                         SND_CHMAP_RC
-#define MA_SND_CHMAP_FLC                        SND_CHMAP_FLC
-#define MA_SND_CHMAP_FRC                        SND_CHMAP_FRC
-#define MA_SND_CHMAP_RLC                        SND_CHMAP_RLC
-#define MA_SND_CHMAP_RRC                        SND_CHMAP_RRC
-#define MA_SND_CHMAP_FLW                        SND_CHMAP_FLW
-#define MA_SND_CHMAP_FRW                        SND_CHMAP_FRW
-#define MA_SND_CHMAP_FLH                        SND_CHMAP_FLH
-#define MA_SND_CHMAP_FCH                        SND_CHMAP_FCH
-#define MA_SND_CHMAP_FRH                        SND_CHMAP_FRH
-#define MA_SND_CHMAP_TC                         SND_CHMAP_TC
-#define MA_SND_CHMAP_TFL                        SND_CHMAP_TFL
-#define MA_SND_CHMAP_TFR                        SND_CHMAP_TFR
-#define MA_SND_CHMAP_TFC                        SND_CHMAP_TFC
-#define MA_SND_CHMAP_TRL                        SND_CHMAP_TRL
-#define MA_SND_CHMAP_TRR                        SND_CHMAP_TRR
-#define MA_SND_CHMAP_TRC                        SND_CHMAP_TRC
-#define MA_SND_CHMAP_TFLC                       SND_CHMAP_TFLC
-#define MA_SND_CHMAP_TFRC                       SND_CHMAP_TFRC
-#define MA_SND_CHMAP_TSL                        SND_CHMAP_TSL
-#define MA_SND_CHMAP_TSR                        SND_CHMAP_TSR
-#define MA_SND_CHMAP_LLFE                       SND_CHMAP_LLFE
-#define MA_SND_CHMAP_RLFE                       SND_CHMAP_RLFE
-#define MA_SND_CHMAP_BC                         SND_CHMAP_BC
-#define MA_SND_CHMAP_BLC                        SND_CHMAP_BLC
-#define MA_SND_CHMAP_BRC                        SND_CHMAP_BRC
-
-/* Open mode flags. */
-#define MA_SND_PCM_NO_AUTO_RESAMPLE             SND_PCM_NO_AUTO_RESAMPLE
-#define MA_SND_PCM_NO_AUTO_CHANNELS             SND_PCM_NO_AUTO_CHANNELS
-#define MA_SND_PCM_NO_AUTO_FORMAT               SND_PCM_NO_AUTO_FORMAT
-#else
-#include <errno.h>  /* For EPIPE, etc. */
-typedef unsigned long                           ma_snd_pcm_uframes_t;
-typedef long                                    ma_snd_pcm_sframes_t;
-typedef int                                     ma_snd_pcm_stream_t;
-typedef int                                     ma_snd_pcm_format_t;
-typedef int                                     ma_snd_pcm_access_t;
-typedef int                                     ma_snd_pcm_state_t;
-typedef struct ma_snd_pcm_t                     ma_snd_pcm_t;
-typedef struct ma_snd_pcm_hw_params_t           ma_snd_pcm_hw_params_t;
-typedef struct ma_snd_pcm_sw_params_t           ma_snd_pcm_sw_params_t;
-typedef struct ma_snd_pcm_format_mask_t         ma_snd_pcm_format_mask_t;
-typedef struct ma_snd_pcm_info_t                ma_snd_pcm_info_t;
-typedef struct
-{
-    void* addr;
-    unsigned int first;
-    unsigned int step;
-} ma_snd_pcm_channel_area_t;
-typedef struct
-{
-    unsigned int channels;
-    unsigned int pos[1];
-} ma_snd_pcm_chmap_t;
-
-/* snd_pcm_state_t */
-#define MA_SND_PCM_STATE_OPEN                  0
-#define MA_SND_PCM_STATE_SETUP                 1
-#define MA_SND_PCM_STATE_PREPARED              2
-#define MA_SND_PCM_STATE_RUNNING               3
-#define MA_SND_PCM_STATE_XRUN                  4
-#define MA_SND_PCM_STATE_DRAINING              5
-#define MA_SND_PCM_STATE_PAUSED                6
-#define MA_SND_PCM_STATE_SUSPENDED             7
-#define MA_SND_PCM_STATE_DISCONNECTED          8
-
-/* snd_pcm_stream_t */
-#define MA_SND_PCM_STREAM_PLAYBACK             0
-#define MA_SND_PCM_STREAM_CAPTURE              1
-
-/* snd_pcm_format_t */
-#define MA_SND_PCM_FORMAT_UNKNOWN              -1
-#define MA_SND_PCM_FORMAT_U8                   1
-#define MA_SND_PCM_FORMAT_S16_LE               2
-#define MA_SND_PCM_FORMAT_S16_BE               3
-#define MA_SND_PCM_FORMAT_S24_LE               6
-#define MA_SND_PCM_FORMAT_S24_BE               7
-#define MA_SND_PCM_FORMAT_S32_LE               10
-#define MA_SND_PCM_FORMAT_S32_BE               11
-#define MA_SND_PCM_FORMAT_FLOAT_LE             14
-#define MA_SND_PCM_FORMAT_FLOAT_BE             15
-#define MA_SND_PCM_FORMAT_FLOAT64_LE           16
-#define MA_SND_PCM_FORMAT_FLOAT64_BE           17
-#define MA_SND_PCM_FORMAT_MU_LAW               20
-#define MA_SND_PCM_FORMAT_A_LAW                21
-#define MA_SND_PCM_FORMAT_S24_3LE              32
-#define MA_SND_PCM_FORMAT_S24_3BE              33
-
-/* snd_pcm_access_t */
-#define MA_SND_PCM_ACCESS_MMAP_INTERLEAVED     0
-#define MA_SND_PCM_ACCESS_MMAP_NONINTERLEAVED  1
-#define MA_SND_PCM_ACCESS_MMAP_COMPLEX         2
-#define MA_SND_PCM_ACCESS_RW_INTERLEAVED       3
-#define MA_SND_PCM_ACCESS_RW_NONINTERLEAVED    4
-
-/* Channel positions. */
-#define MA_SND_CHMAP_UNKNOWN                   0
-#define MA_SND_CHMAP_NA                        1
-#define MA_SND_CHMAP_MONO                      2
-#define MA_SND_CHMAP_FL                        3
-#define MA_SND_CHMAP_FR                        4
-#define MA_SND_CHMAP_RL                        5
-#define MA_SND_CHMAP_RR                        6
-#define MA_SND_CHMAP_FC                        7
-#define MA_SND_CHMAP_LFE                       8
-#define MA_SND_CHMAP_SL                        9
-#define MA_SND_CHMAP_SR                        10
-#define MA_SND_CHMAP_RC                        11
-#define MA_SND_CHMAP_FLC                       12
-#define MA_SND_CHMAP_FRC                       13
-#define MA_SND_CHMAP_RLC                       14
-#define MA_SND_CHMAP_RRC                       15
-#define MA_SND_CHMAP_FLW                       16
-#define MA_SND_CHMAP_FRW                       17
-#define MA_SND_CHMAP_FLH                       18
-#define MA_SND_CHMAP_FCH                       19
-#define MA_SND_CHMAP_FRH                       20
-#define MA_SND_CHMAP_TC                        21
-#define MA_SND_CHMAP_TFL                       22
-#define MA_SND_CHMAP_TFR                       23
-#define MA_SND_CHMAP_TFC                       24
-#define MA_SND_CHMAP_TRL                       25
-#define MA_SND_CHMAP_TRR                       26
-#define MA_SND_CHMAP_TRC                       27
-#define MA_SND_CHMAP_TFLC                      28
-#define MA_SND_CHMAP_TFRC                      29
-#define MA_SND_CHMAP_TSL                       30
-#define MA_SND_CHMAP_TSR                       31
-#define MA_SND_CHMAP_LLFE                      32
-#define MA_SND_CHMAP_RLFE                      33
-#define MA_SND_CHMAP_BC                        34
-#define MA_SND_CHMAP_BLC                       35
-#define MA_SND_CHMAP_BRC                       36
-
-/* Open mode flags. */
-#define MA_SND_PCM_NO_AUTO_RESAMPLE            0x00010000
-#define MA_SND_PCM_NO_AUTO_CHANNELS            0x00020000
-#define MA_SND_PCM_NO_AUTO_FORMAT              0x00040000
-#endif
-
-typedef int                  (* ma_snd_pcm_open_proc)                          (ma_snd_pcm_t **pcm, const char *name, ma_snd_pcm_stream_t stream, int mode);
-typedef int                  (* ma_snd_pcm_close_proc)                         (ma_snd_pcm_t *pcm);
-typedef size_t               (* ma_snd_pcm_hw_params_sizeof_proc)              (void);
-typedef int                  (* ma_snd_pcm_hw_params_any_proc)                 (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params);
-typedef int                  (* ma_snd_pcm_hw_params_set_format_proc)          (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, ma_snd_pcm_format_t val);
-typedef int                  (* ma_snd_pcm_hw_params_set_format_first_proc)    (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, ma_snd_pcm_format_t *format);
-typedef void                 (* ma_snd_pcm_hw_params_get_format_mask_proc)     (ma_snd_pcm_hw_params_t *params, ma_snd_pcm_format_mask_t *mask);
-typedef int                  (* ma_snd_pcm_hw_params_set_channels_proc)        (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int val);
-typedef int                  (* ma_snd_pcm_hw_params_set_channels_near_proc)   (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int *val);
-typedef int                  (* ma_snd_pcm_hw_params_set_channels_minmax_proc) (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int *minimum, unsigned int *maximum);
-typedef int                  (* ma_snd_pcm_hw_params_set_rate_resample_proc)   (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int val);
-typedef int                  (* ma_snd_pcm_hw_params_set_rate_proc)            (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int val, int dir);
-typedef int                  (* ma_snd_pcm_hw_params_set_rate_near_proc)       (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int *val, int *dir);
-typedef int                  (* ma_snd_pcm_hw_params_set_buffer_size_near_proc)(ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, ma_snd_pcm_uframes_t *val);
-typedef int                  (* ma_snd_pcm_hw_params_set_periods_near_proc)    (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int *val, int *dir);
-typedef int                  (* ma_snd_pcm_hw_params_set_access_proc)          (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, ma_snd_pcm_access_t _access);
-typedef int                  (* ma_snd_pcm_hw_params_get_format_proc)          (const ma_snd_pcm_hw_params_t *params, ma_snd_pcm_format_t *format);
-typedef int                  (* ma_snd_pcm_hw_params_get_channels_proc)        (const ma_snd_pcm_hw_params_t *params, unsigned int *val);
-typedef int                  (* ma_snd_pcm_hw_params_get_channels_min_proc)    (const ma_snd_pcm_hw_params_t *params, unsigned int *val);
-typedef int                  (* ma_snd_pcm_hw_params_get_channels_max_proc)    (const ma_snd_pcm_hw_params_t *params, unsigned int *val);
-typedef int                  (* ma_snd_pcm_hw_params_get_rate_proc)            (const ma_snd_pcm_hw_params_t *params, unsigned int *rate, int *dir);
-typedef int                  (* ma_snd_pcm_hw_params_get_rate_min_proc)        (const ma_snd_pcm_hw_params_t *params, unsigned int *rate, int *dir);
-typedef int                  (* ma_snd_pcm_hw_params_get_rate_max_proc)        (const ma_snd_pcm_hw_params_t *params, unsigned int *rate, int *dir);
-typedef int                  (* ma_snd_pcm_hw_params_get_buffer_size_proc)     (const ma_snd_pcm_hw_params_t *params, ma_snd_pcm_uframes_t *val);
-typedef int                  (* ma_snd_pcm_hw_params_get_periods_proc)         (const ma_snd_pcm_hw_params_t *params, unsigned int *val, int *dir);
-typedef int                  (* ma_snd_pcm_hw_params_get_access_proc)          (const ma_snd_pcm_hw_params_t *params, ma_snd_pcm_access_t *_access);
-typedef int                  (* ma_snd_pcm_hw_params_test_format_proc)         (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, ma_snd_pcm_format_t val);
-typedef int                  (* ma_snd_pcm_hw_params_test_channels_proc)       (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int val);
-typedef int                  (* ma_snd_pcm_hw_params_test_rate_proc)           (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params, unsigned int val, int dir);
-typedef int                  (* ma_snd_pcm_hw_params_proc)                     (ma_snd_pcm_t *pcm, ma_snd_pcm_hw_params_t *params);
-typedef size_t               (* ma_snd_pcm_sw_params_sizeof_proc)              (void);
-typedef int                  (* ma_snd_pcm_sw_params_current_proc)             (ma_snd_pcm_t *pcm, ma_snd_pcm_sw_params_t *params);
-typedef int                  (* ma_snd_pcm_sw_params_get_boundary_proc)        (const ma_snd_pcm_sw_params_t *params, ma_snd_pcm_uframes_t* val);
-typedef int                  (* ma_snd_pcm_sw_params_set_avail_min_proc)       (ma_snd_pcm_t *pcm, ma_snd_pcm_sw_params_t *params, ma_snd_pcm_uframes_t val);
-typedef int                  (* ma_snd_pcm_sw_params_set_start_threshold_proc) (ma_snd_pcm_t *pcm, ma_snd_pcm_sw_params_t *params, ma_snd_pcm_uframes_t val);
-typedef int                  (* ma_snd_pcm_sw_params_set_stop_threshold_proc)  (ma_snd_pcm_t *pcm, ma_snd_pcm_sw_params_t *params, ma_snd_pcm_uframes_t val);
-typedef int                  (* ma_snd_pcm_sw_params_proc)                     (ma_snd_pcm_t *pcm, ma_snd_pcm_sw_params_t *params);
-typedef size_t               (* ma_snd_pcm_format_mask_sizeof_proc)            (void);
-typedef int                  (* ma_snd_pcm_format_mask_test_proc)              (const ma_snd_pcm_format_mask_t *mask, ma_snd_pcm_format_t val);
-typedef ma_snd_pcm_chmap_t * (* ma_snd_pcm_get_chmap_proc)                     (ma_snd_pcm_t *pcm);
-typedef ma_snd_pcm_state_t   (* ma_snd_pcm_state_proc)                         (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_pcm_prepare_proc)                       (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_pcm_start_proc)                         (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_pcm_drop_proc)                          (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_pcm_drain_proc)                         (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_pcm_reset_proc)                         (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_device_name_hint_proc)                  (int card, const char *iface, void ***hints);
-typedef char *               (* ma_snd_device_name_get_hint_proc)              (const void *hint, const char *id);
-typedef int                  (* ma_snd_card_get_index_proc)                    (const char *name);
-typedef int                  (* ma_snd_device_name_free_hint_proc)             (void **hints);
-typedef int                  (* ma_snd_pcm_mmap_begin_proc)                    (ma_snd_pcm_t *pcm, const ma_snd_pcm_channel_area_t **areas, ma_snd_pcm_uframes_t *offset, ma_snd_pcm_uframes_t *frames);
-typedef ma_snd_pcm_sframes_t (* ma_snd_pcm_mmap_commit_proc)                   (ma_snd_pcm_t *pcm, ma_snd_pcm_uframes_t offset, ma_snd_pcm_uframes_t frames);
-typedef int                  (* ma_snd_pcm_recover_proc)                       (ma_snd_pcm_t *pcm, int err, int silent);
-typedef ma_snd_pcm_sframes_t (* ma_snd_pcm_readi_proc)                         (ma_snd_pcm_t *pcm, void *buffer, ma_snd_pcm_uframes_t size);
-typedef ma_snd_pcm_sframes_t (* ma_snd_pcm_writei_proc)                        (ma_snd_pcm_t *pcm, const void *buffer, ma_snd_pcm_uframes_t size);
-typedef ma_snd_pcm_sframes_t (* ma_snd_pcm_avail_proc)                         (ma_snd_pcm_t *pcm);
-typedef ma_snd_pcm_sframes_t (* ma_snd_pcm_avail_update_proc)                  (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_pcm_wait_proc)                          (ma_snd_pcm_t *pcm, int timeout);
-typedef int                  (* ma_snd_pcm_nonblock_proc)                      (ma_snd_pcm_t *pcm, int nonblock);
-typedef int                  (* ma_snd_pcm_info_proc)                          (ma_snd_pcm_t *pcm, ma_snd_pcm_info_t* info);
-typedef size_t               (* ma_snd_pcm_info_sizeof_proc)                   (void);
-typedef const char*          (* ma_snd_pcm_info_get_name_proc)                 (const ma_snd_pcm_info_t* info);
-typedef int                  (* ma_snd_pcm_poll_descriptors_proc)              (ma_snd_pcm_t *pcm, struct pollfd *pfds, unsigned int space);
-typedef int                  (* ma_snd_pcm_poll_descriptors_count_proc)        (ma_snd_pcm_t *pcm);
-typedef int                  (* ma_snd_pcm_poll_descriptors_revents_proc)      (ma_snd_pcm_t *pcm, struct pollfd *pfds, unsigned int nfds, unsigned short *revents);
-typedef int                  (* ma_snd_config_update_free_global_proc)         (void);
-
-/* This array specifies each of the common devices that can be used for both playback and capture. */
-static const char* g_maCommonDeviceNamesALSA[] = {
-    "default",
-    "null",
-    "pulse",
-    "jack"
-};
-
-/* This array allows us to blacklist specific playback devices. */
-static const char* g_maBlacklistedPlaybackDeviceNamesALSA[] = {
-    ""
-};
-
-/* This array allows us to blacklist specific capture devices. */
-static const char* g_maBlacklistedCaptureDeviceNamesALSA[] = {
-    ""
-};
-
-
-static ma_snd_pcm_format_t ma_convert_ma_format_to_alsa_format(ma_format format)
-{
-    ma_snd_pcm_format_t ALSAFormats[] = {
-        MA_SND_PCM_FORMAT_UNKNOWN,     /* ma_format_unknown */
-        MA_SND_PCM_FORMAT_U8,          /* ma_format_u8 */
-        MA_SND_PCM_FORMAT_S16_LE,      /* ma_format_s16 */
-        MA_SND_PCM_FORMAT_S24_3LE,     /* ma_format_s24 */
-        MA_SND_PCM_FORMAT_S32_LE,      /* ma_format_s32 */
-        MA_SND_PCM_FORMAT_FLOAT_LE     /* ma_format_f32 */
-    };
-
-    if (ma_is_big_endian()) {
-        ALSAFormats[0] = MA_SND_PCM_FORMAT_UNKNOWN;
-        ALSAFormats[1] = MA_SND_PCM_FORMAT_U8;
-        ALSAFormats[2] = MA_SND_PCM_FORMAT_S16_BE;
-        ALSAFormats[3] = MA_SND_PCM_FORMAT_S24_3BE;
-        ALSAFormats[4] = MA_SND_PCM_FORMAT_S32_BE;
-        ALSAFormats[5] = MA_SND_PCM_FORMAT_FLOAT_BE;
-    }
-
-    return ALSAFormats[format];
-}
-
-static ma_format ma_format_from_alsa(ma_snd_pcm_format_t formatALSA)
-{
-    if (ma_is_little_endian()) {
-        switch (formatALSA) {
-            case MA_SND_PCM_FORMAT_S16_LE:   return ma_format_s16;
-            case MA_SND_PCM_FORMAT_S24_3LE:  return ma_format_s24;
-            case MA_SND_PCM_FORMAT_S32_LE:   return ma_format_s32;
-            case MA_SND_PCM_FORMAT_FLOAT_LE: return ma_format_f32;
-            default: break;
-        }
-    } else {
-        switch (formatALSA) {
-            case MA_SND_PCM_FORMAT_S16_BE:   return ma_format_s16;
-            case MA_SND_PCM_FORMAT_S24_3BE:  return ma_format_s24;
-            case MA_SND_PCM_FORMAT_S32_BE:   return ma_format_s32;
-            case MA_SND_PCM_FORMAT_FLOAT_BE: return ma_format_f32;
-            default: break;
-        }
-    }
-
-    /* Endian agnostic. */
-    switch (formatALSA) {
-        case MA_SND_PCM_FORMAT_U8: return ma_format_u8;
-        default: return ma_format_unknown;
-    }
-}
-
-static ma_channel ma_convert_alsa_channel_position_to_ma_channel(unsigned int alsaChannelPos)
-{
-    switch (alsaChannelPos)
-    {
-        case MA_SND_CHMAP_MONO: return MA_CHANNEL_MONO;
-        case MA_SND_CHMAP_FL:   return MA_CHANNEL_FRONT_LEFT;
-        case MA_SND_CHMAP_FR:   return MA_CHANNEL_FRONT_RIGHT;
-        case MA_SND_CHMAP_RL:   return MA_CHANNEL_BACK_LEFT;
-        case MA_SND_CHMAP_RR:   return MA_CHANNEL_BACK_RIGHT;
-        case MA_SND_CHMAP_FC:   return MA_CHANNEL_FRONT_CENTER;
-        case MA_SND_CHMAP_LFE:  return MA_CHANNEL_LFE;
-        case MA_SND_CHMAP_SL:   return MA_CHANNEL_SIDE_LEFT;
-        case MA_SND_CHMAP_SR:   return MA_CHANNEL_SIDE_RIGHT;
-        case MA_SND_CHMAP_RC:   return MA_CHANNEL_BACK_CENTER;
-        case MA_SND_CHMAP_FLC:  return MA_CHANNEL_FRONT_LEFT_CENTER;
-        case MA_SND_CHMAP_FRC:  return MA_CHANNEL_FRONT_RIGHT_CENTER;
-        case MA_SND_CHMAP_RLC:  return 0;
-        case MA_SND_CHMAP_RRC:  return 0;
-        case MA_SND_CHMAP_FLW:  return 0;
-        case MA_SND_CHMAP_FRW:  return 0;
-        case MA_SND_CHMAP_FLH:  return 0;
-        case MA_SND_CHMAP_FCH:  return 0;
-        case MA_SND_CHMAP_FRH:  return 0;
-        case MA_SND_CHMAP_TC:   return MA_CHANNEL_TOP_CENTER;
-        case MA_SND_CHMAP_TFL:  return MA_CHANNEL_TOP_FRONT_LEFT;
-        case MA_SND_CHMAP_TFR:  return MA_CHANNEL_TOP_FRONT_RIGHT;
-        case MA_SND_CHMAP_TFC:  return MA_CHANNEL_TOP_FRONT_CENTER;
-        case MA_SND_CHMAP_TRL:  return MA_CHANNEL_TOP_BACK_LEFT;
-        case MA_SND_CHMAP_TRR:  return MA_CHANNEL_TOP_BACK_RIGHT;
-        case MA_SND_CHMAP_TRC:  return MA_CHANNEL_TOP_BACK_CENTER;
-        default: break;
-    }
-
-    return 0;
-}
-
-static ma_bool32 ma_is_common_device_name__alsa(const char* name)
-{
-    size_t iName;
-    for (iName = 0; iName < ma_countof(g_maCommonDeviceNamesALSA); ++iName) {
-        if (ma_strcmp(name, g_maCommonDeviceNamesALSA[iName]) == 0) {
-            return MA_TRUE;
-        }
-    }
-
-    return MA_FALSE;
-}
-
-
-static ma_bool32 ma_is_playback_device_blacklisted__alsa(const char* name)
-{
-    size_t iName;
-    for (iName = 0; iName < ma_countof(g_maBlacklistedPlaybackDeviceNamesALSA); ++iName) {
-        if (ma_strcmp(name, g_maBlacklistedPlaybackDeviceNamesALSA[iName]) == 0) {
-            return MA_TRUE;
-        }
-    }
-
-    return MA_FALSE;
-}
-
-static ma_bool32 ma_is_capture_device_blacklisted__alsa(const char* name)
-{
-    size_t iName;
-    for (iName = 0; iName < ma_countof(g_maBlacklistedCaptureDeviceNamesALSA); ++iName) {
-        if (ma_strcmp(name, g_maBlacklistedCaptureDeviceNamesALSA[iName]) == 0) {
-            return MA_TRUE;
-        }
-    }
-
-    return MA_FALSE;
-}
-
-static ma_bool32 ma_is_device_blacklisted__alsa(ma_device_type deviceType, const char* name)
-{
-    if (deviceType == ma_device_type_playback) {
-        return ma_is_playback_device_blacklisted__alsa(name);
-    } else {
-        return ma_is_capture_device_blacklisted__alsa(name);
-    }
-}
-
-
-static const char* ma_find_char(const char* str, char c, int* index)
-{
-    int i = 0;
-    for (;;) {
-        if (str[i] == '\0') {
-            if (index) *index = -1;
-            return NULL;
-        }
-
-        if (str[i] == c) {
-            if (index) *index = i;
-            return str + i;
-        }
-
-        i += 1;
-    }
-
-    /* Should never get here, but treat it as though the character was not found to make me feel better inside. */
-    if (index) *index = -1;
-    return NULL;
-}
-
-static ma_bool32 ma_is_device_name_in_hw_format__alsa(const char* hwid)
-{
-    /* This function is just checking whether or not hwid is in "hw:%d,%d" format. */
-
-    int commaPos;
-    const char* dev;
-    int i;
-
-    if (hwid == NULL) {
-        return MA_FALSE;
-    }
-
-    if (hwid[0] != 'h' || hwid[1] != 'w' || hwid[2] != ':') {
-        return MA_FALSE;
-    }
-
-    hwid += 3;
-
-    dev = ma_find_char(hwid, ',', &commaPos);
-    if (dev == NULL) {
-        return MA_FALSE;
-    } else {
-        dev += 1;   /* Skip past the ",". */
-    }
-
-    /* Check if the part between the ":" and the "," contains only numbers. If not, return false. */
-    for (i = 0; i < commaPos; ++i) {
-        if (hwid[i] < '0' || hwid[i] > '9') {
-            return MA_FALSE;
-        }
-    }
-
-    /* Check if everything after the "," is numeric. If not, return false. */
-    i = 0;
-    while (dev[i] != '\0') {
-        if (dev[i] < '0' || dev[i] > '9') {
-            return MA_FALSE;
-        }
-        i += 1;
-    }
-
-    return MA_TRUE;
-}
-
-static int ma_convert_device_name_to_hw_format__alsa(ma_context* pContext, char* dst, size_t dstSize, const char* src)  /* Returns 0 on success, non-0 on error. */
-{
-    /* src should look something like this: "hw:CARD=I82801AAICH,DEV=0" */
-
-    int colonPos;
-    int commaPos;
-    char card[256];
-    const char* dev;
-    int cardIndex;
-
-    if (dst == NULL) {
-        return -1;
-    }
-    if (dstSize < 7) {
-        return -1;     /* Absolute minimum size of the output buffer is 7 bytes. */
-    }
-
-    *dst = '\0';    /* Safety. */
-    if (src == NULL) {
-        return -1;
-    }
-
-    /* If the input name is already in "hw:%d,%d" format, just return that verbatim. */
-    if (ma_is_device_name_in_hw_format__alsa(src)) {
-        return ma_strcpy_s(dst, dstSize, src);
-    }
-
-    src = ma_find_char(src, ':', &colonPos);
-    if (src == NULL) {
-        return -1;  /* Couldn't find a colon */
-    }
-
-    dev = ma_find_char(src, ',', &commaPos);
-    if (dev == NULL) {
-        dev = "0";
-        ma_strncpy_s(card, sizeof(card), src+6, (size_t)-1);   /* +6 = ":CARD=" */
-    } else {
-        dev = dev + 5;  /* +5 = ",DEV=" */
-        ma_strncpy_s(card, sizeof(card), src+6, commaPos-6);   /* +6 = ":CARD=" */
-    }
-
-    cardIndex = ((ma_snd_card_get_index_proc)pContext->alsa.snd_card_get_index)(card);
-    if (cardIndex < 0) {
-        return -2;  /* Failed to retrieve the card index. */
-    }
-
-
-    /* Construction. */
-    dst[0] = 'h'; dst[1] = 'w'; dst[2] = ':';
-    if (ma_itoa_s(cardIndex, dst+3, dstSize-3, 10) != 0) {
-        return -3;
-    }
-    if (ma_strcat_s(dst, dstSize, ",") != 0) {
-        return -3;
-    }
-    if (ma_strcat_s(dst, dstSize, dev) != 0) {
-        return -3;
-    }
-
-    return 0;
-}
-
-static ma_bool32 ma_does_id_exist_in_list__alsa(ma_device_id* pUniqueIDs, ma_uint32 count, const char* pHWID)
-{
-    ma_uint32 i;
-
-    MA_ASSERT(pHWID != NULL);
-
-    for (i = 0; i < count; ++i) {
-        if (ma_strcmp(pUniqueIDs[i].alsa, pHWID) == 0) {
-            return MA_TRUE;
-        }
-    }
-
-    return MA_FALSE;
-}
-
-
-static ma_result ma_context_open_pcm__alsa(ma_context* pContext, ma_share_mode shareMode, ma_device_type deviceType, const ma_device_id* pDeviceID, int openMode, ma_snd_pcm_t** ppPCM)
-{
-    ma_snd_pcm_t* pPCM;
-    ma_snd_pcm_stream_t stream;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppPCM != NULL);
-
-    *ppPCM = NULL;
-    pPCM = NULL;
-
-    stream = (deviceType == ma_device_type_playback) ? MA_SND_PCM_STREAM_PLAYBACK : MA_SND_PCM_STREAM_CAPTURE;
-
-    if (pDeviceID == NULL) {
-        ma_bool32 isDeviceOpen;
-        size_t i;
-
-        /*
-        We're opening the default device. I don't know if trying anything other than "default" is necessary, but it makes
-        me feel better to try as hard as we can get to get _something_ working.
-        */
-        const char* defaultDeviceNames[] = {
-            "default",
-            NULL,
-            NULL,
-            NULL,
-            NULL,
-            NULL,
-            NULL
-        };
-
-        if (shareMode == ma_share_mode_exclusive) {
-            defaultDeviceNames[1] = "hw";
-            defaultDeviceNames[2] = "hw:0";
-            defaultDeviceNames[3] = "hw:0,0";
-        } else {
-            if (deviceType == ma_device_type_playback) {
-                defaultDeviceNames[1] = "dmix";
-                defaultDeviceNames[2] = "dmix:0";
-                defaultDeviceNames[3] = "dmix:0,0";
-            } else {
-                defaultDeviceNames[1] = "dsnoop";
-                defaultDeviceNames[2] = "dsnoop:0";
-                defaultDeviceNames[3] = "dsnoop:0,0";
-            }
-            defaultDeviceNames[4] = "hw";
-            defaultDeviceNames[5] = "hw:0";
-            defaultDeviceNames[6] = "hw:0,0";
-        }
-
-        isDeviceOpen = MA_FALSE;
-        for (i = 0; i < ma_countof(defaultDeviceNames); ++i) {
-            if (defaultDeviceNames[i] != NULL && defaultDeviceNames[i][0] != '\0') {
-                if (((ma_snd_pcm_open_proc)pContext->alsa.snd_pcm_open)(&pPCM, defaultDeviceNames[i], stream, openMode) == 0) {
-                    isDeviceOpen = MA_TRUE;
-                    break;
-                }
-            }
-        }
-
-        if (!isDeviceOpen) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[ALSA] snd_pcm_open() failed when trying to open an appropriate default device.");
-            return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-        }
-    } else {
-        /*
-        We're trying to open a specific device. There's a few things to consider here:
-
-        miniaudio recongnizes a special format of device id that excludes the "hw", "dmix", etc. prefix. It looks like this: ":0,0", ":0,1", etc. When
-        an ID of this format is specified, it indicates to miniaudio that it can try different combinations of plugins ("hw", "dmix", etc.) until it
-        finds an appropriate one that works. This comes in very handy when trying to open a device in shared mode ("dmix"), vs exclusive mode ("hw").
-        */
-
-        /* May end up needing to make small adjustments to the ID, so make a copy. */
-        ma_device_id deviceID = *pDeviceID;
-        int resultALSA = -ENODEV;
-
-        if (deviceID.alsa[0] != ':') {
-            /* The ID is not in ":0,0" format. Use the ID exactly as-is. */
-            resultALSA = ((ma_snd_pcm_open_proc)pContext->alsa.snd_pcm_open)(&pPCM, deviceID.alsa, stream, openMode);
-        } else {
-            char hwid[256];
-
-            /* The ID is in ":0,0" format. Try different plugins depending on the shared mode. */
-            if (deviceID.alsa[1] == '\0') {
-                deviceID.alsa[0] = '\0';  /* An ID of ":" should be converted to "". */
-            }
-
-            if (shareMode == ma_share_mode_shared) {
-                if (deviceType == ma_device_type_playback) {
-                    ma_strcpy_s(hwid, sizeof(hwid), "dmix");
-                } else {
-                    ma_strcpy_s(hwid, sizeof(hwid), "dsnoop");
-                }
-
-                if (ma_strcat_s(hwid, sizeof(hwid), deviceID.alsa) == 0) {
-                    resultALSA = ((ma_snd_pcm_open_proc)pContext->alsa.snd_pcm_open)(&pPCM, hwid, stream, openMode);
-                }
-            }
-
-            /* If at this point we still don't have an open device it means we're either preferencing exclusive mode or opening with "dmix"/"dsnoop" failed. */
-            if (resultALSA != 0) {
-                ma_strcpy_s(hwid, sizeof(hwid), "hw");
-                if (ma_strcat_s(hwid, sizeof(hwid), deviceID.alsa) == 0) {
-                    resultALSA = ((ma_snd_pcm_open_proc)pContext->alsa.snd_pcm_open)(&pPCM, hwid, stream, openMode);
-                }
-            }
-        }
-
-        if (resultALSA < 0) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[ALSA] snd_pcm_open() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-    }
-
-    *ppPCM = pPCM;
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_enumerate_devices__alsa(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    int resultALSA;
-    ma_bool32 cbResult = MA_TRUE;
-    char** ppDeviceHints;
-    ma_device_id* pUniqueIDs = NULL;
-    ma_uint32 uniqueIDCount = 0;
-    char** ppNextDeviceHint;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    ma_mutex_lock(&pContext->alsa.internalDeviceEnumLock);
-
-    resultALSA = ((ma_snd_device_name_hint_proc)pContext->alsa.snd_device_name_hint)(-1, "pcm", (void***)&ppDeviceHints);
-    if (resultALSA < 0) {
-        ma_mutex_unlock(&pContext->alsa.internalDeviceEnumLock);
-        return ma_result_from_errno(-resultALSA);
-    }
-
-    ppNextDeviceHint = ppDeviceHints;
-    while (*ppNextDeviceHint != NULL) {
-        char* NAME = ((ma_snd_device_name_get_hint_proc)pContext->alsa.snd_device_name_get_hint)(*ppNextDeviceHint, "NAME");
-        char* DESC = ((ma_snd_device_name_get_hint_proc)pContext->alsa.snd_device_name_get_hint)(*ppNextDeviceHint, "DESC");
-        char* IOID = ((ma_snd_device_name_get_hint_proc)pContext->alsa.snd_device_name_get_hint)(*ppNextDeviceHint, "IOID");
-        ma_device_type deviceType = ma_device_type_playback;
-        ma_bool32 stopEnumeration = MA_FALSE;
-        char hwid[sizeof(pUniqueIDs->alsa)];
-        ma_device_info deviceInfo;
-
-        if ((IOID == NULL || ma_strcmp(IOID, "Output") == 0)) {
-            deviceType = ma_device_type_playback;
-        }
-        if ((IOID != NULL && ma_strcmp(IOID, "Input" ) == 0)) {
-            deviceType = ma_device_type_capture;
-        }
-
-        if (NAME != NULL) {
-            if (pContext->alsa.useVerboseDeviceEnumeration) {
-                /* Verbose mode. Use the name exactly as-is. */
-                ma_strncpy_s(hwid, sizeof(hwid), NAME, (size_t)-1);
-            } else {
-                /* Simplified mode. Use ":%d,%d" format. */
-                if (ma_convert_device_name_to_hw_format__alsa(pContext, hwid, sizeof(hwid), NAME) == 0) {
-                    /*
-                    At this point, hwid looks like "hw:0,0". In simplified enumeration mode, we actually want to strip off the
-                    plugin name so it looks like ":0,0". The reason for this is that this special format is detected at device
-                    initialization time and is used as an indicator to try and use the most appropriate plugin depending on the
-                    device type and sharing mode.
-                    */
-                    char* dst = hwid;
-                    char* src = hwid+2;
-                    while ((*dst++ = *src++));
-                } else {
-                    /* Conversion to "hw:%d,%d" failed. Just use the name as-is. */
-                    ma_strncpy_s(hwid, sizeof(hwid), NAME, (size_t)-1);
-                }
-
-                if (ma_does_id_exist_in_list__alsa(pUniqueIDs, uniqueIDCount, hwid)) {
-                    goto next_device;   /* The device has already been enumerated. Move on to the next one. */
-                } else {
-                    /* The device has not yet been enumerated. Make sure it's added to our list so that it's not enumerated again. */
-                    size_t newCapacity = sizeof(*pUniqueIDs) * (uniqueIDCount + 1);
-                    ma_device_id* pNewUniqueIDs = (ma_device_id*)ma_realloc(pUniqueIDs, newCapacity, &pContext->allocationCallbacks);
-                    if (pNewUniqueIDs == NULL) {
-                        goto next_device;   /* Failed to allocate memory. */
-                    }
-
-                    pUniqueIDs = pNewUniqueIDs;
-                    MA_COPY_MEMORY(pUniqueIDs[uniqueIDCount].alsa, hwid, sizeof(hwid));
-                    uniqueIDCount += 1;
-                }
-            }
-        } else {
-            MA_ZERO_MEMORY(hwid, sizeof(hwid));
-        }
-
-        MA_ZERO_OBJECT(&deviceInfo);
-        ma_strncpy_s(deviceInfo.id.alsa, sizeof(deviceInfo.id.alsa), hwid, (size_t)-1);
-
-        /*
-        There's no good way to determine whether or not a device is the default on Linux. We're just going to do something simple and
-        just use the name of "default" as the indicator.
-        */
-        if (ma_strcmp(deviceInfo.id.alsa, "default") == 0) {
-            deviceInfo.isDefault = MA_TRUE;
-        }
-
-
-        /*
-        DESC is the friendly name. We treat this slightly differently depending on whether or not we are using verbose
-        device enumeration. In verbose mode we want to take the entire description so that the end-user can distinguish
-        between the subdevices of each card/dev pair. In simplified mode, however, we only want the first part of the
-        description.
-
-        The value in DESC seems to be split into two lines, with the first line being the name of the device and the
-        second line being a description of the device. I don't like having the description be across two lines because
-        it makes formatting ugly and annoying. I'm therefore deciding to put it all on a single line with the second line
-        being put into parentheses. In simplified mode I'm just stripping the second line entirely.
-        */
-        if (DESC != NULL) {
-            int lfPos;
-            const char* line2 = ma_find_char(DESC, '\n', &lfPos);
-            if (line2 != NULL) {
-                line2 += 1; /* Skip past the new-line character. */
-
-                if (pContext->alsa.useVerboseDeviceEnumeration) {
-                    /* Verbose mode. Put the second line in brackets. */
-                    ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), DESC, lfPos);
-                    ma_strcat_s (deviceInfo.name, sizeof(deviceInfo.name), " (");
-                    ma_strcat_s (deviceInfo.name, sizeof(deviceInfo.name), line2);
-                    ma_strcat_s (deviceInfo.name, sizeof(deviceInfo.name), ")");
-                } else {
-                    /* Simplified mode. Strip the second line entirely. */
-                    ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), DESC, lfPos);
-                }
-            } else {
-                /* There's no second line. Just copy the whole description. */
-                ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), DESC, (size_t)-1);
-            }
-        }
-
-        if (!ma_is_device_blacklisted__alsa(deviceType, NAME)) {
-            cbResult = callback(pContext, deviceType, &deviceInfo, pUserData);
-        }
-
-        /*
-        Some devices are both playback and capture, but they are only enumerated by ALSA once. We need to fire the callback
-        again for the other device type in this case. We do this for known devices and where the IOID hint is NULL, which
-        means both Input and Output.
-        */
-        if (cbResult) {
-            if (ma_is_common_device_name__alsa(NAME) || IOID == NULL) {
-                if (deviceType == ma_device_type_playback) {
-                    if (!ma_is_capture_device_blacklisted__alsa(NAME)) {
-                        cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-                    }
-                } else {
-                    if (!ma_is_playback_device_blacklisted__alsa(NAME)) {
-                        cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-                    }
-                }
-            }
-        }
-
-        if (cbResult == MA_FALSE) {
-            stopEnumeration = MA_TRUE;
-        }
-
-    next_device:
-        free(NAME);
-        free(DESC);
-        free(IOID);
-        ppNextDeviceHint += 1;
-
-        /* We need to stop enumeration if the callback returned false. */
-        if (stopEnumeration) {
-            break;
-        }
-    }
-
-    ma_free(pUniqueIDs, &pContext->allocationCallbacks);
-    ((ma_snd_device_name_free_hint_proc)pContext->alsa.snd_device_name_free_hint)((void**)ppDeviceHints);
-
-    ma_mutex_unlock(&pContext->alsa.internalDeviceEnumLock);
-
-    return MA_SUCCESS;
-}
-
-
-typedef struct
-{
-    ma_device_type deviceType;
-    const ma_device_id* pDeviceID;
-    ma_share_mode shareMode;
-    ma_device_info* pDeviceInfo;
-    ma_bool32 foundDevice;
-} ma_context_get_device_info_enum_callback_data__alsa;
-
-static ma_bool32 ma_context_get_device_info_enum_callback__alsa(ma_context* pContext, ma_device_type deviceType, const ma_device_info* pDeviceInfo, void* pUserData)
-{
-    ma_context_get_device_info_enum_callback_data__alsa* pData = (ma_context_get_device_info_enum_callback_data__alsa*)pUserData;
-    MA_ASSERT(pData != NULL);
-
-    (void)pContext;
-
-    if (pData->pDeviceID == NULL && ma_strcmp(pDeviceInfo->id.alsa, "default") == 0) {
-        ma_strncpy_s(pData->pDeviceInfo->name, sizeof(pData->pDeviceInfo->name), pDeviceInfo->name, (size_t)-1);
-        pData->foundDevice = MA_TRUE;
-    } else {
-        if (pData->deviceType == deviceType && (pData->pDeviceID != NULL && ma_strcmp(pData->pDeviceID->alsa, pDeviceInfo->id.alsa) == 0)) {
-            ma_strncpy_s(pData->pDeviceInfo->name, sizeof(pData->pDeviceInfo->name), pDeviceInfo->name, (size_t)-1);
-            pData->foundDevice = MA_TRUE;
-        }
-    }
-
-    /* Keep enumerating until we have found the device. */
-    return !pData->foundDevice;
-}
-
-static void ma_context_test_rate_and_add_native_data_format__alsa(ma_context* pContext, ma_snd_pcm_t* pPCM, ma_snd_pcm_hw_params_t* pHWParams, ma_format format, ma_uint32 channels, ma_uint32 sampleRate, ma_uint32 flags, ma_device_info* pDeviceInfo)
-{
-    MA_ASSERT(pPCM        != NULL);
-    MA_ASSERT(pHWParams   != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    if (pDeviceInfo->nativeDataFormatCount < ma_countof(pDeviceInfo->nativeDataFormats) && ((ma_snd_pcm_hw_params_test_rate_proc)pContext->alsa.snd_pcm_hw_params_test_rate)(pPCM, pHWParams, sampleRate, 0) == 0) {
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].format     = format;
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].channels   = channels;
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].sampleRate = sampleRate;
-        pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].flags      = flags;
-        pDeviceInfo->nativeDataFormatCount += 1;
-    }
-}
-
-static void ma_context_iterate_rates_and_add_native_data_format__alsa(ma_context* pContext, ma_snd_pcm_t* pPCM, ma_snd_pcm_hw_params_t* pHWParams, ma_format format, ma_uint32 channels, ma_uint32 flags, ma_device_info* pDeviceInfo)
-{
-    ma_uint32 iSampleRate;
-    unsigned int minSampleRate;
-    unsigned int maxSampleRate;
-    int sampleRateDir;  /* Not used. Just passed into snd_pcm_hw_params_get_rate_min/max(). */
-
-    /* There could be a range. */
-    ((ma_snd_pcm_hw_params_get_rate_min_proc)pContext->alsa.snd_pcm_hw_params_get_rate_min)(pHWParams, &minSampleRate, &sampleRateDir);
-    ((ma_snd_pcm_hw_params_get_rate_max_proc)pContext->alsa.snd_pcm_hw_params_get_rate_max)(pHWParams, &maxSampleRate, &sampleRateDir);
-
-    /* Make sure our sample rates are clamped to sane values. Stupid devices like "pulse" will reports rates like "1" which is ridiculus. */
-    minSampleRate = ma_clamp(minSampleRate, (unsigned int)ma_standard_sample_rate_min, (unsigned int)ma_standard_sample_rate_max);
-    maxSampleRate = ma_clamp(maxSampleRate, (unsigned int)ma_standard_sample_rate_min, (unsigned int)ma_standard_sample_rate_max);
-
-    for (iSampleRate = 0; iSampleRate < ma_countof(g_maStandardSampleRatePriorities); iSampleRate += 1) {
-        ma_uint32 standardSampleRate = g_maStandardSampleRatePriorities[iSampleRate];
-
-        if (standardSampleRate >= minSampleRate && standardSampleRate <= maxSampleRate) {
-            ma_context_test_rate_and_add_native_data_format__alsa(pContext, pPCM, pHWParams, format, channels, standardSampleRate, flags, pDeviceInfo);
-        }
-    }
-
-    /* Now make sure our min and max rates are included just in case they aren't in the range of our standard rates. */
-    if (!ma_is_standard_sample_rate(minSampleRate)) {
-        ma_context_test_rate_and_add_native_data_format__alsa(pContext, pPCM, pHWParams, format, channels, minSampleRate, flags, pDeviceInfo);
-    }
-
-    if (!ma_is_standard_sample_rate(maxSampleRate) && maxSampleRate != minSampleRate) {
-        ma_context_test_rate_and_add_native_data_format__alsa(pContext, pPCM, pHWParams, format, channels, maxSampleRate, flags, pDeviceInfo);
-    }
-}
-
-static ma_result ma_context_get_device_info__alsa(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_context_get_device_info_enum_callback_data__alsa data;
-    ma_result result;
-    int resultALSA;
-    ma_snd_pcm_t* pPCM;
-    ma_snd_pcm_hw_params_t* pHWParams;
-    ma_uint32 iFormat;
-    ma_uint32 iChannel;
-
-    MA_ASSERT(pContext != NULL);
-
-    /* We just enumerate to find basic information about the device. */
-    data.deviceType  = deviceType;
-    data.pDeviceID   = pDeviceID;
-    data.pDeviceInfo = pDeviceInfo;
-    data.foundDevice = MA_FALSE;
-    result = ma_context_enumerate_devices__alsa(pContext, ma_context_get_device_info_enum_callback__alsa, &data);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (!data.foundDevice) {
-        return MA_NO_DEVICE;
-    }
-
-    if (ma_strcmp(pDeviceInfo->id.alsa, "default") == 0) {
-        pDeviceInfo->isDefault = MA_TRUE;
-    }
-
-    /* For detailed info we need to open the device. */
-    result = ma_context_open_pcm__alsa(pContext, ma_share_mode_shared, deviceType, pDeviceID, 0, &pPCM);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* We need to initialize a HW parameters object in order to know what formats are supported. */
-    pHWParams = (ma_snd_pcm_hw_params_t*)ma_calloc(((ma_snd_pcm_hw_params_sizeof_proc)pContext->alsa.snd_pcm_hw_params_sizeof)(), &pContext->allocationCallbacks);
-    if (pHWParams == NULL) {
-        ((ma_snd_pcm_close_proc)pContext->alsa.snd_pcm_close)(pPCM);
-        return MA_OUT_OF_MEMORY;
-    }
-
-    resultALSA = ((ma_snd_pcm_hw_params_any_proc)pContext->alsa.snd_pcm_hw_params_any)(pPCM, pHWParams);
-    if (resultALSA < 0) {
-        ma_free(pHWParams, &pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to initialize hardware parameters. snd_pcm_hw_params_any() failed.");
-        return ma_result_from_errno(-resultALSA);
-    }
-
-    /*
-    Some ALSA devices can support many permutations of formats, channels and rates. We only support
-    a fixed number of permutations which means we need to employ some strategies to ensure the best
-    combinations are returned. An example is the "pulse" device which can do it's own data conversion
-    in software and as a result can support any combination of format, channels and rate.
-
-    We want to ensure the the first data formats are the best. We have a list of favored sample
-    formats and sample rates, so these will be the basis of our iteration.
-    */
-
-    /* Formats. We just iterate over our standard formats and test them, making sure we reset the configuration space each iteration. */
-    for (iFormat = 0; iFormat < ma_countof(g_maFormatPriorities); iFormat += 1) {
-        ma_format format = g_maFormatPriorities[iFormat];
-
-        /*
-        For each format we need to make sure we reset the configuration space so we don't return
-        channel counts and rates that aren't compatible with a format.
-        */
-        ((ma_snd_pcm_hw_params_any_proc)pContext->alsa.snd_pcm_hw_params_any)(pPCM, pHWParams);
-
-        /* Test the format first. If this fails it means the format is not supported and we can skip it. */
-        if (((ma_snd_pcm_hw_params_test_format_proc)pContext->alsa.snd_pcm_hw_params_test_format)(pPCM, pHWParams, ma_convert_ma_format_to_alsa_format(format)) == 0) {
-            /* The format is supported. */
-            unsigned int minChannels;
-            unsigned int maxChannels;
-
-            /*
-            The configuration space needs to be restricted to this format so we can get an accurate
-            picture of which sample rates and channel counts are support with this format.
-            */
-            ((ma_snd_pcm_hw_params_set_format_proc)pContext->alsa.snd_pcm_hw_params_set_format)(pPCM, pHWParams, ma_convert_ma_format_to_alsa_format(format));
-
-            /* Now we need to check for supported channels. */
-            ((ma_snd_pcm_hw_params_get_channels_min_proc)pContext->alsa.snd_pcm_hw_params_get_channels_min)(pHWParams, &minChannels);
-            ((ma_snd_pcm_hw_params_get_channels_max_proc)pContext->alsa.snd_pcm_hw_params_get_channels_max)(pHWParams, &maxChannels);
-
-            if (minChannels > MA_MAX_CHANNELS) {
-                continue;   /* Too many channels. */
-            }
-            if (maxChannels < MA_MIN_CHANNELS) {
-                continue;   /* Not enough channels. */
-            }
-
-            /*
-            Make sure the channel count is clamped. This is mainly intended for the max channels
-            because some devices can report an unbound maximum.
-            */
-            minChannels = ma_clamp(minChannels, MA_MIN_CHANNELS, MA_MAX_CHANNELS);
-            maxChannels = ma_clamp(maxChannels, MA_MIN_CHANNELS, MA_MAX_CHANNELS);
-
-            if (minChannels == MA_MIN_CHANNELS && maxChannels == MA_MAX_CHANNELS) {
-                /* The device supports all channels. Don't iterate over every single one. Instead just set the channels to 0 which means all channels are supported. */
-                ma_context_iterate_rates_and_add_native_data_format__alsa(pContext, pPCM, pHWParams, format, 0, 0, pDeviceInfo);    /* Intentionally setting the channel count to 0 as that means all channels are supported. */
-            } else {
-                /* The device only supports a specific set of channels. We need to iterate over all of them. */
-                for (iChannel = minChannels; iChannel <= maxChannels; iChannel += 1) {
-                    /* Test the channel before applying it to the configuration space. */
-                    unsigned int channels = iChannel;
-
-                    /* Make sure our channel range is reset before testing again or else we'll always fail the test. */
-                    ((ma_snd_pcm_hw_params_any_proc)pContext->alsa.snd_pcm_hw_params_any)(pPCM, pHWParams);
-                    ((ma_snd_pcm_hw_params_set_format_proc)pContext->alsa.snd_pcm_hw_params_set_format)(pPCM, pHWParams, ma_convert_ma_format_to_alsa_format(format));
-
-                    if (((ma_snd_pcm_hw_params_test_channels_proc)pContext->alsa.snd_pcm_hw_params_test_channels)(pPCM, pHWParams, channels) == 0) {
-                        /* The channel count is supported. */
-
-                        /* The configuration space now needs to be restricted to the channel count before extracting the sample rate. */
-                        ((ma_snd_pcm_hw_params_set_channels_proc)pContext->alsa.snd_pcm_hw_params_set_channels)(pPCM, pHWParams, channels);
-
-                        /* Only after the configuration space has been restricted to the specific channel count should we iterate over our sample rates. */
-                        ma_context_iterate_rates_and_add_native_data_format__alsa(pContext, pPCM, pHWParams, format, channels, 0, pDeviceInfo);
-                    } else {
-                        /* The channel count is not supported. Skip. */
-                    }
-                }
-            }
-        } else {
-            /* The format is not supported. Skip. */
-        }
-    }
-
-    ma_free(pHWParams, &pContext->allocationCallbacks);
-
-    ((ma_snd_pcm_close_proc)pContext->alsa.snd_pcm_close)(pPCM);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_uninit__alsa(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if ((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture) {
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture);
-        close(pDevice->alsa.wakeupfdCapture);
-        ma_free(pDevice->alsa.pPollDescriptorsCapture, &pDevice->pContext->allocationCallbacks);
-    }
-
-    if ((ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback) {
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)((ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback);
-        close(pDevice->alsa.wakeupfdPlayback);
-        ma_free(pDevice->alsa.pPollDescriptorsPlayback, &pDevice->pContext->allocationCallbacks);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init_by_type__alsa(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptor, ma_device_type deviceType)
-{
-    ma_result result;
-    int resultALSA;
-    ma_snd_pcm_t* pPCM;
-    ma_bool32 isUsingMMap;
-    ma_snd_pcm_format_t formatALSA;
-    ma_format internalFormat;
-    ma_uint32 internalChannels;
-    ma_uint32 internalSampleRate;
-    ma_channel internalChannelMap[MA_MAX_CHANNELS];
-    ma_uint32 internalPeriodSizeInFrames;
-    ma_uint32 internalPeriods;
-    int openMode;
-    ma_snd_pcm_hw_params_t* pHWParams;
-    ma_snd_pcm_sw_params_t* pSWParams;
-    ma_snd_pcm_uframes_t bufferBoundary;
-    int pollDescriptorCount;
-    struct pollfd* pPollDescriptors;
-    int wakeupfd;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(deviceType != ma_device_type_duplex); /* This function should only be called for playback _or_ capture, never duplex. */
-    MA_ASSERT(pDevice != NULL);
-
-    formatALSA = ma_convert_ma_format_to_alsa_format(pDescriptor->format);
-
-    openMode = 0;
-    if (pConfig->alsa.noAutoResample) {
-        openMode |= MA_SND_PCM_NO_AUTO_RESAMPLE;
-    }
-    if (pConfig->alsa.noAutoChannels) {
-        openMode |= MA_SND_PCM_NO_AUTO_CHANNELS;
-    }
-    if (pConfig->alsa.noAutoFormat) {
-        openMode |= MA_SND_PCM_NO_AUTO_FORMAT;
-    }
-
-    result = ma_context_open_pcm__alsa(pDevice->pContext, pDescriptor->shareMode, deviceType, pDescriptor->pDeviceID, openMode, &pPCM);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-
-    /* Hardware parameters. */
-    pHWParams = (ma_snd_pcm_hw_params_t*)ma_calloc(((ma_snd_pcm_hw_params_sizeof_proc)pDevice->pContext->alsa.snd_pcm_hw_params_sizeof)(), &pDevice->pContext->allocationCallbacks);
-    if (pHWParams == NULL) {
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to allocate memory for hardware parameters.");
-        return MA_OUT_OF_MEMORY;
-    }
-
-    resultALSA = ((ma_snd_pcm_hw_params_any_proc)pDevice->pContext->alsa.snd_pcm_hw_params_any)(pPCM, pHWParams);
-    if (resultALSA < 0) {
-        ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to initialize hardware parameters. snd_pcm_hw_params_any() failed.");
-        return ma_result_from_errno(-resultALSA);
-    }
-
-    /* MMAP Mode. Try using interleaved MMAP access. If this fails, fall back to standard readi/writei. */
-    isUsingMMap = MA_FALSE;
-#if 0   /* NOTE: MMAP mode temporarily disabled. */
-    if (deviceType != ma_device_type_capture) {    /* <-- Disabling MMAP mode for capture devices because I apparently do not have a device that supports it which means I can't test it... Contributions welcome. */
-        if (!pConfig->alsa.noMMap) {
-            if (((ma_snd_pcm_hw_params_set_access_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_access)(pPCM, pHWParams, MA_SND_PCM_ACCESS_MMAP_INTERLEAVED) == 0) {
-                pDevice->alsa.isUsingMMap = MA_TRUE;
-            }
-        }
-    }
-#endif
-
-    if (!isUsingMMap) {
-        resultALSA = ((ma_snd_pcm_hw_params_set_access_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_access)(pPCM, pHWParams, MA_SND_PCM_ACCESS_RW_INTERLEAVED);
-        if (resultALSA < 0) {
-            ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set access mode to neither SND_PCM_ACCESS_MMAP_INTERLEAVED nor SND_PCM_ACCESS_RW_INTERLEAVED. snd_pcm_hw_params_set_access() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-    }
-
-    /*
-    Most important properties first. The documentation for OSS (yes, I know this is ALSA!) recommends format, channels, then sample rate. I can't
-    find any documentation for ALSA specifically, so I'm going to copy the recommendation for OSS.
-    */
-
-    /* Format. */
-    {
-        /*
-        At this point we should have a list of supported formats, so now we need to find the best one. We first check if the requested format is
-        supported, and if so, use that one. If it's not supported, we just run though a list of formats and try to find the best one.
-        */
-        if (formatALSA == MA_SND_PCM_FORMAT_UNKNOWN || ((ma_snd_pcm_hw_params_test_format_proc)pDevice->pContext->alsa.snd_pcm_hw_params_test_format)(pPCM, pHWParams, formatALSA) != 0) {
-            /* We're either requesting the native format or the specified format is not supported. */
-            size_t iFormat;
-
-            formatALSA = MA_SND_PCM_FORMAT_UNKNOWN;
-            for (iFormat = 0; iFormat < ma_countof(g_maFormatPriorities); ++iFormat) {
-                if (((ma_snd_pcm_hw_params_test_format_proc)pDevice->pContext->alsa.snd_pcm_hw_params_test_format)(pPCM, pHWParams, ma_convert_ma_format_to_alsa_format(g_maFormatPriorities[iFormat])) == 0) {
-                    formatALSA = ma_convert_ma_format_to_alsa_format(g_maFormatPriorities[iFormat]);
-                    break;
-                }
-            }
-
-            if (formatALSA == MA_SND_PCM_FORMAT_UNKNOWN) {
-                ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-                ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Format not supported. The device does not support any miniaudio formats.");
-                return MA_FORMAT_NOT_SUPPORTED;
-            }
-        }
-
-        resultALSA = ((ma_snd_pcm_hw_params_set_format_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_format)(pPCM, pHWParams, formatALSA);
-        if (resultALSA < 0) {
-            ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Format not supported. snd_pcm_hw_params_set_format() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-
-        internalFormat = ma_format_from_alsa(formatALSA);
-        if (internalFormat == ma_format_unknown) {
-            ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] The chosen format is not supported by miniaudio.");
-            return MA_FORMAT_NOT_SUPPORTED;
-        }
-    }
-
-    /* Channels. */
-    {
-        unsigned int channels = pDescriptor->channels;
-        if (channels == 0) {
-            channels = MA_DEFAULT_CHANNELS;
-        }
-
-        resultALSA = ((ma_snd_pcm_hw_params_set_channels_near_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_channels_near)(pPCM, pHWParams, &channels);
-        if (resultALSA < 0) {
-            ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set channel count. snd_pcm_hw_params_set_channels_near() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-
-        internalChannels = (ma_uint32)channels;
-    }
-
-    /* Sample Rate */
-    {
-        unsigned int sampleRate;
-
-        /*
-        It appears there's either a bug in ALSA, a bug in some drivers, or I'm doing something silly; but having resampling enabled causes
-        problems with some device configurations when used in conjunction with MMAP access mode. To fix this problem we need to disable
-        resampling.
-
-        To reproduce this problem, open the "plug:dmix" device, and set the sample rate to 44100. Internally, it looks like dmix uses a
-        sample rate of 48000. The hardware parameters will get set correctly with no errors, but it looks like the 44100 -> 48000 resampling
-        doesn't work properly - but only with MMAP access mode. You will notice skipping/crackling in the audio, and it'll run at a slightly
-        faster rate.
-
-        miniaudio has built-in support for sample rate conversion (albeit low quality at the moment), so disabling resampling should be fine
-        for us. The only problem is that it won't be taking advantage of any kind of hardware-accelerated resampling and it won't be very
-        good quality until I get a chance to improve the quality of miniaudio's software sample rate conversion.
-
-        I don't currently know if the dmix plugin is the only one with this error. Indeed, this is the only one I've been able to reproduce
-        this error with. In the future, we may want to restrict the disabling of resampling to only known bad plugins.
-        */
-        ((ma_snd_pcm_hw_params_set_rate_resample_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_rate_resample)(pPCM, pHWParams, 0);
-
-        sampleRate = pDescriptor->sampleRate;
-        if (sampleRate == 0) {
-            sampleRate = MA_DEFAULT_SAMPLE_RATE;
-        }
-
-        resultALSA = ((ma_snd_pcm_hw_params_set_rate_near_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_rate_near)(pPCM, pHWParams, &sampleRate, 0);
-        if (resultALSA < 0) {
-            ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Sample rate not supported. snd_pcm_hw_params_set_rate_near() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-
-        internalSampleRate = (ma_uint32)sampleRate;
-    }
-
-    /* Periods. */
-    {
-        ma_uint32 periods = pDescriptor->periodCount;
-
-        resultALSA = ((ma_snd_pcm_hw_params_set_periods_near_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_periods_near)(pPCM, pHWParams, &periods, NULL);
-        if (resultALSA < 0) {
-            ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set period count. snd_pcm_hw_params_set_periods_near() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-
-        internalPeriods = periods;
-    }
-
-    /* Buffer Size */
-    {
-        ma_snd_pcm_uframes_t actualBufferSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, internalSampleRate, pConfig->performanceProfile) * internalPeriods;
-
-        resultALSA = ((ma_snd_pcm_hw_params_set_buffer_size_near_proc)pDevice->pContext->alsa.snd_pcm_hw_params_set_buffer_size_near)(pPCM, pHWParams, &actualBufferSizeInFrames);
-        if (resultALSA < 0) {
-            ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set buffer size for device. snd_pcm_hw_params_set_buffer_size() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-
-        internalPeriodSizeInFrames = actualBufferSizeInFrames / internalPeriods;
-    }
-
-    /* Apply hardware parameters. */
-    resultALSA = ((ma_snd_pcm_hw_params_proc)pDevice->pContext->alsa.snd_pcm_hw_params)(pPCM, pHWParams);
-    if (resultALSA < 0) {
-        ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set hardware parameters. snd_pcm_hw_params() failed.");
-        return ma_result_from_errno(-resultALSA);
-    }
-
-    ma_free(pHWParams, &pDevice->pContext->allocationCallbacks);
-    pHWParams = NULL;
-
-
-    /* Software parameters. */
-    pSWParams = (ma_snd_pcm_sw_params_t*)ma_calloc(((ma_snd_pcm_sw_params_sizeof_proc)pDevice->pContext->alsa.snd_pcm_sw_params_sizeof)(), &pDevice->pContext->allocationCallbacks);
-    if (pSWParams == NULL) {
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to allocate memory for software parameters.");
-        return MA_OUT_OF_MEMORY;
-    }
-
-    resultALSA = ((ma_snd_pcm_sw_params_current_proc)pDevice->pContext->alsa.snd_pcm_sw_params_current)(pPCM, pSWParams);
-    if (resultALSA < 0) {
-        ma_free(pSWParams, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to initialize software parameters. snd_pcm_sw_params_current() failed.");
-        return ma_result_from_errno(-resultALSA);
-    }
-
-    resultALSA = ((ma_snd_pcm_sw_params_set_avail_min_proc)pDevice->pContext->alsa.snd_pcm_sw_params_set_avail_min)(pPCM, pSWParams, ma_prev_power_of_2(internalPeriodSizeInFrames));
-    if (resultALSA < 0) {
-        ma_free(pSWParams, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] snd_pcm_sw_params_set_avail_min() failed.");
-        return ma_result_from_errno(-resultALSA);
-    }
-
-    resultALSA = ((ma_snd_pcm_sw_params_get_boundary_proc)pDevice->pContext->alsa.snd_pcm_sw_params_get_boundary)(pSWParams, &bufferBoundary);
-    if (resultALSA < 0) {
-        bufferBoundary = internalPeriodSizeInFrames * internalPeriods;
-    }
-
-    if (deviceType == ma_device_type_playback && !isUsingMMap) {   /* Only playback devices in writei/readi mode need a start threshold. */
-        /*
-        Subtle detail here with the start threshold. When in playback-only mode (no full-duplex) we can set the start threshold to
-        the size of a period. But for full-duplex we need to set it such that it is at least two periods.
-        */
-        resultALSA = ((ma_snd_pcm_sw_params_set_start_threshold_proc)pDevice->pContext->alsa.snd_pcm_sw_params_set_start_threshold)(pPCM, pSWParams, internalPeriodSizeInFrames*2);
-        if (resultALSA < 0) {
-            ma_free(pSWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set start threshold for playback device. snd_pcm_sw_params_set_start_threshold() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-
-        resultALSA = ((ma_snd_pcm_sw_params_set_stop_threshold_proc)pDevice->pContext->alsa.snd_pcm_sw_params_set_stop_threshold)(pPCM, pSWParams, bufferBoundary);
-        if (resultALSA < 0) { /* Set to boundary to loop instead of stop in the event of an xrun. */
-            ma_free(pSWParams, &pDevice->pContext->allocationCallbacks);
-            ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set stop threshold for playback device. snd_pcm_sw_params_set_stop_threshold() failed.");
-            return ma_result_from_errno(-resultALSA);
-        }
-    }
-
-    resultALSA = ((ma_snd_pcm_sw_params_proc)pDevice->pContext->alsa.snd_pcm_sw_params)(pPCM, pSWParams);
-    if (resultALSA < 0) {
-        ma_free(pSWParams, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to set software parameters. snd_pcm_sw_params() failed.");
-        return ma_result_from_errno(-resultALSA);
-    }
-
-    ma_free(pSWParams, &pDevice->pContext->allocationCallbacks);
-    pSWParams = NULL;
-
-
-    /* Grab the internal channel map. For now we're not going to bother trying to change the channel map and instead just do it ourselves. */
-    {
-        ma_snd_pcm_chmap_t* pChmap = NULL;
-        if (pDevice->pContext->alsa.snd_pcm_get_chmap != NULL) {
-            pChmap = ((ma_snd_pcm_get_chmap_proc)pDevice->pContext->alsa.snd_pcm_get_chmap)(pPCM);
-        }
-
-        if (pChmap != NULL) {
-            ma_uint32 iChannel;
-
-            /* There are cases where the returned channel map can have a different channel count than was returned by snd_pcm_hw_params_set_channels_near(). */
-            if (pChmap->channels >= internalChannels) {
-                /* Drop excess channels. */
-                for (iChannel = 0; iChannel < internalChannels; ++iChannel) {
-                    internalChannelMap[iChannel] = ma_convert_alsa_channel_position_to_ma_channel(pChmap->pos[iChannel]);
-                }
-            } else {
-                ma_uint32 i;
-
-                /*
-                Excess channels use defaults. Do an initial fill with defaults, overwrite the first pChmap->channels, validate to ensure there are no duplicate
-                channels. If validation fails, fall back to defaults.
-                */
-                ma_bool32 isValid = MA_TRUE;
-
-                /* Fill with defaults. */
-                ma_channel_map_init_standard(ma_standard_channel_map_alsa, internalChannelMap, ma_countof(internalChannelMap), internalChannels);
-
-                /* Overwrite first pChmap->channels channels. */
-                for (iChannel = 0; iChannel < pChmap->channels; ++iChannel) {
-                    internalChannelMap[iChannel] = ma_convert_alsa_channel_position_to_ma_channel(pChmap->pos[iChannel]);
-                }
-
-                /* Validate. */
-                for (i = 0; i < internalChannels && isValid; ++i) {
-                    ma_uint32 j;
-                    for (j = i+1; j < internalChannels; ++j) {
-                        if (internalChannelMap[i] == internalChannelMap[j]) {
-                            isValid = MA_FALSE;
-                            break;
-                        }
-                    }
-                }
-
-                /* If our channel map is invalid, fall back to defaults. */
-                if (!isValid) {
-                    ma_channel_map_init_standard(ma_standard_channel_map_alsa, internalChannelMap, ma_countof(internalChannelMap), internalChannels);
-                }
-            }
-
-            free(pChmap);
-            pChmap = NULL;
-        } else {
-            /* Could not retrieve the channel map. Fall back to a hard-coded assumption. */
-            ma_channel_map_init_standard(ma_standard_channel_map_alsa, internalChannelMap, ma_countof(internalChannelMap), internalChannels);
-        }
-    }
-
-
-    /*
-    We need to retrieve the poll descriptors so we can use poll() to wait for data to become
-    available for reading or writing. There's no well defined maximum for this so we're just going
-    to allocate this on the heap.
-    */
-    pollDescriptorCount = ((ma_snd_pcm_poll_descriptors_count_proc)pDevice->pContext->alsa.snd_pcm_poll_descriptors_count)(pPCM);
-    if (pollDescriptorCount <= 0) {
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to retrieve poll descriptors count.");
-        return MA_ERROR;
-    }
-
-    pPollDescriptors = (struct pollfd*)ma_malloc(sizeof(*pPollDescriptors) * (pollDescriptorCount + 1), &pDevice->pContext->allocationCallbacks);   /* +1 because we want room for the wakeup descriptor. */
-    if (pPollDescriptors == NULL) {
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to allocate memory for poll descriptors.");
-        return MA_OUT_OF_MEMORY;
-    }
-
-    /*
-    We need an eventfd to wakeup from poll() and avoid a deadlock in situations where the driver
-    never returns from writei() and readi(). This has been observed with the "pulse" device.
-    */
-    wakeupfd = eventfd(0, 0);
-    if (wakeupfd < 0) {
-        ma_free(pPollDescriptors, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to create eventfd for poll wakeup.");
-        return ma_result_from_errno(errno);
-    }
-
-    /* We'll place the wakeup fd at the start of the buffer. */
-    pPollDescriptors[0].fd      = wakeupfd;
-    pPollDescriptors[0].events  = POLLIN;    /* We only care about waiting to read from the wakeup file descriptor. */
-    pPollDescriptors[0].revents = 0;
-
-    /* We can now extract the PCM poll descriptors which we place after the wakeup descriptor. */
-    pollDescriptorCount = ((ma_snd_pcm_poll_descriptors_proc)pDevice->pContext->alsa.snd_pcm_poll_descriptors)(pPCM, pPollDescriptors + 1, pollDescriptorCount);    /* +1 because we want to place these descriptors after the wakeup descriptor. */
-    if (pollDescriptorCount <= 0) {
-        close(wakeupfd);
-        ma_free(pPollDescriptors, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to retrieve poll descriptors.");
-        return MA_ERROR;
-    }
-
-    if (deviceType == ma_device_type_capture) {
-        pDevice->alsa.pollDescriptorCountCapture = pollDescriptorCount;
-        pDevice->alsa.pPollDescriptorsCapture = pPollDescriptors;
-        pDevice->alsa.wakeupfdCapture = wakeupfd;
-    } else {
-        pDevice->alsa.pollDescriptorCountPlayback = pollDescriptorCount;
-        pDevice->alsa.pPollDescriptorsPlayback = pPollDescriptors;
-        pDevice->alsa.wakeupfdPlayback = wakeupfd;
-    }
-
-
-    /* We're done. Prepare the device. */
-    resultALSA = ((ma_snd_pcm_prepare_proc)pDevice->pContext->alsa.snd_pcm_prepare)(pPCM);
-    if (resultALSA < 0) {
-        close(wakeupfd);
-        ma_free(pPollDescriptors, &pDevice->pContext->allocationCallbacks);
-        ((ma_snd_pcm_close_proc)pDevice->pContext->alsa.snd_pcm_close)(pPCM);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to prepare device.");
-        return ma_result_from_errno(-resultALSA);
-    }
-
-
-    if (deviceType == ma_device_type_capture) {
-        pDevice->alsa.pPCMCapture         = (ma_ptr)pPCM;
-        pDevice->alsa.isUsingMMapCapture  = isUsingMMap;
-    } else {
-        pDevice->alsa.pPCMPlayback        = (ma_ptr)pPCM;
-        pDevice->alsa.isUsingMMapPlayback = isUsingMMap;
-    }
-
-    pDescriptor->format             = internalFormat;
-    pDescriptor->channels           = internalChannels;
-    pDescriptor->sampleRate         = internalSampleRate;
-    ma_channel_map_copy(pDescriptor->channelMap, internalChannelMap, ma_min(internalChannels, MA_MAX_CHANNELS));
-    pDescriptor->periodSizeInFrames = internalPeriodSizeInFrames;
-    pDescriptor->periodCount        = internalPeriods;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__alsa(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->alsa);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_by_type__alsa(pDevice, pConfig, pDescriptorCapture, ma_device_type_capture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_by_type__alsa(pDevice, pConfig, pDescriptorPlayback, ma_device_type_playback);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__alsa(ma_device* pDevice)
-{
-    int resultALSA;
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        resultALSA = ((ma_snd_pcm_start_proc)pDevice->pContext->alsa.snd_pcm_start)((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture);
-        if (resultALSA < 0) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to start capture device.");
-            return ma_result_from_errno(-resultALSA);
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        /* Don't need to do anything for playback because it'll be started automatically when enough data has been written. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__alsa(ma_device* pDevice)
-{
-    /*
-    The stop callback will get called on the worker thread after read/write__alsa() has returned. At this point there is
-    a small chance that our wakeupfd has not been cleared. We'll clear that out now if applicable.
-    */
-    int resultPoll;
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Dropping capture device...\n");
-        ((ma_snd_pcm_drop_proc)pDevice->pContext->alsa.snd_pcm_drop)((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture);
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Dropping capture device successful.\n");
-
-        /* We need to prepare the device again, otherwise we won't be able to restart the device. */
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Preparing capture device...\n");
-        if (((ma_snd_pcm_prepare_proc)pDevice->pContext->alsa.snd_pcm_prepare)((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture) < 0) {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Preparing capture device failed.\n");
-        } else {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Preparing capture device successful.\n");
-        }
-
-    /* Clear the wakeupfd. */
-    resultPoll = poll((struct pollfd*)pDevice->alsa.pPollDescriptorsCapture, 1, 0);
-    if (resultPoll > 0) {
-        ma_uint64 t;
-        read(((struct pollfd*)pDevice->alsa.pPollDescriptorsCapture)[0].fd, &t, sizeof(t));
-    }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Dropping playback device...\n");
-        ((ma_snd_pcm_drop_proc)pDevice->pContext->alsa.snd_pcm_drop)((ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback);
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Dropping playback device successful.\n");
-
-        /* We need to prepare the device again, otherwise we won't be able to restart the device. */
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Preparing playback device...\n");
-        if (((ma_snd_pcm_prepare_proc)pDevice->pContext->alsa.snd_pcm_prepare)((ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback) < 0) {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Preparing playback device failed.\n");
-        } else {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Preparing playback device successful.\n");
-        }
-
-        /* Clear the wakeupfd. */
-    resultPoll = poll((struct pollfd*)pDevice->alsa.pPollDescriptorsPlayback, 1, 0);
-    if (resultPoll > 0) {
-        ma_uint64 t;
-        read(((struct pollfd*)pDevice->alsa.pPollDescriptorsPlayback)[0].fd, &t, sizeof(t));
-    }
-
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_wait__alsa(ma_device* pDevice, ma_snd_pcm_t* pPCM, struct pollfd* pPollDescriptors, int pollDescriptorCount, short requiredEvent)
-{
-    for (;;) {
-        unsigned short revents;
-        int resultALSA;
-        int resultPoll = poll(pPollDescriptors, pollDescriptorCount, -1);
-        if (resultPoll < 0) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] poll() failed.\n");
-            return ma_result_from_errno(errno);
-        }
-
-        /*
-        Before checking the ALSA poll descriptor flag we need to check if the wakeup descriptor
-        has had it's POLLIN flag set. If so, we need to actually read the data and then exit
-        function. The wakeup descriptor will be the first item in the descriptors buffer.
-        */
-        if ((pPollDescriptors[0].revents & POLLIN) != 0) {
-            ma_uint64 t;
-            int resultRead = read(pPollDescriptors[0].fd, &t, sizeof(t));    /* <-- Important that we read here so that the next write() does not block. */
-            if (resultRead < 0) {
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] read() failed.\n");
-                return ma_result_from_errno(errno);
-            }
-
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] POLLIN set for wakeupfd\n");
-            return MA_DEVICE_NOT_STARTED;
-        }
-
-        /*
-        Getting here means that some data should be able to be read. We need to use ALSA to
-        translate the revents flags for us.
-        */
-        resultALSA = ((ma_snd_pcm_poll_descriptors_revents_proc)pDevice->pContext->alsa.snd_pcm_poll_descriptors_revents)(pPCM, pPollDescriptors + 1, pollDescriptorCount - 1, &revents);   /* +1, -1 to ignore the wakeup descriptor. */
-        if (resultALSA < 0) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] snd_pcm_poll_descriptors_revents() failed.\n");
-            return ma_result_from_errno(-resultALSA);
-        }
-
-        if ((revents & POLLERR) != 0) {
-            ma_snd_pcm_state_t state = ((ma_snd_pcm_state_proc)pDevice->pContext->alsa.snd_pcm_state)(pPCM);
-            if (state == MA_SND_PCM_STATE_XRUN) {
-                /* The PCM is in a xrun state. This will be recovered from at a higher level. We can disregard this. */
-        } else {
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_WARNING, "[ALSA] POLLERR detected. status = %d\n", ((ma_snd_pcm_state_proc)pDevice->pContext->alsa.snd_pcm_state)(pPCM));
-            }
-        }
-
-        if ((revents & requiredEvent) == requiredEvent) {
-            break;  /* We're done. Data available for reading or writing. */
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_wait_read__alsa(ma_device* pDevice)
-{
-    return ma_device_wait__alsa(pDevice, (ma_snd_pcm_t*)pDevice->alsa.pPCMCapture, (struct pollfd*)pDevice->alsa.pPollDescriptorsCapture, pDevice->alsa.pollDescriptorCountCapture + 1, POLLIN); /* +1 to account for the wakeup descriptor. */
-}
-
-static ma_result ma_device_wait_write__alsa(ma_device* pDevice)
-{
-    return ma_device_wait__alsa(pDevice, (ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback, (struct pollfd*)pDevice->alsa.pPollDescriptorsPlayback, pDevice->alsa.pollDescriptorCountPlayback + 1, POLLOUT); /* +1 to account for the wakeup descriptor. */
-}
-
-static ma_result ma_device_read__alsa(ma_device* pDevice, void* pFramesOut, ma_uint32 frameCount, ma_uint32* pFramesRead)
-{
-    ma_snd_pcm_sframes_t resultALSA = 0;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    while (ma_device_get_state(pDevice) == ma_device_state_started) {
-        ma_result result;
-
-        /* The first thing to do is wait for data to become available for reading. This will return an error code if the device has been stopped. */
-        result = ma_device_wait_read__alsa(pDevice);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        /* Getting here means we should have data available. */
-        resultALSA = ((ma_snd_pcm_readi_proc)pDevice->pContext->alsa.snd_pcm_readi)((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture, pFramesOut, frameCount);
-        if (resultALSA >= 0) {
-            break;  /* Success. */
-        } else {
-            if (resultALSA == -EAGAIN) {
-                /*ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "EGAIN (read)\n");*/
-                continue;   /* Try again. */
-            } else if (resultALSA == -EPIPE) {
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "EPIPE (read)\n");
-
-                /* Overrun. Recover and try again. If this fails we need to return an error. */
-                resultALSA = ((ma_snd_pcm_recover_proc)pDevice->pContext->alsa.snd_pcm_recover)((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture, resultALSA, MA_TRUE);
-                if (resultALSA < 0) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to recover device after overrun.");
-                    return ma_result_from_errno((int)-resultALSA);
-                }
-
-                resultALSA = ((ma_snd_pcm_start_proc)pDevice->pContext->alsa.snd_pcm_start)((ma_snd_pcm_t*)pDevice->alsa.pPCMCapture);
-                if (resultALSA < 0) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to start device after underrun.");
-                    return ma_result_from_errno((int)-resultALSA);
-                }
-
-                continue;   /* Try reading again. */
-            }
-        }
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = resultALSA;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_write__alsa(ma_device* pDevice, const void* pFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten)
-{
-    ma_snd_pcm_sframes_t resultALSA = 0;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pFrames != NULL);
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = 0;
-    }
-
-    while (ma_device_get_state(pDevice) == ma_device_state_started) {
-        ma_result result;
-
-        /* The first thing to do is wait for space to become available for writing. This will return an error code if the device has been stopped. */
-        result = ma_device_wait_write__alsa(pDevice);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        resultALSA = ((ma_snd_pcm_writei_proc)pDevice->pContext->alsa.snd_pcm_writei)((ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback, pFrames, frameCount);
-        if (resultALSA >= 0) {
-            break;  /* Success. */
-        } else {
-            if (resultALSA == -EAGAIN) {
-                /*ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "EGAIN (write)\n");*/
-                continue;   /* Try again. */
-            } else if (resultALSA == -EPIPE) {
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "EPIPE (write)\n");
-
-                /* Underrun. Recover and try again. If this fails we need to return an error. */
-                resultALSA = ((ma_snd_pcm_recover_proc)pDevice->pContext->alsa.snd_pcm_recover)((ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback, resultALSA, MA_TRUE);    /* MA_TRUE=silent (don't print anything on error). */
-                if (resultALSA < 0) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to recover device after underrun.");
-                    return ma_result_from_errno((int)-resultALSA);
-                }
-
-                /*
-                In my testing I have had a situation where writei() does not automatically restart the device even though I've set it
-                up as such in the software parameters. What will happen is writei() will block indefinitely even though the number of
-                frames is well beyond the auto-start threshold. To work around this I've needed to add an explicit start here. Not sure
-                if this is me just being stupid and not recovering the device properly, but this definitely feels like something isn't
-                quite right here.
-                */
-                resultALSA = ((ma_snd_pcm_start_proc)pDevice->pContext->alsa.snd_pcm_start)((ma_snd_pcm_t*)pDevice->alsa.pPCMPlayback);
-                if (resultALSA < 0) {
-                    ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] Failed to start device after underrun.");
-                    return ma_result_from_errno((int)-resultALSA);
-                }
-
-                continue;   /* Try writing again. */
-            }
-        }
-    }
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = resultALSA;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_data_loop_wakeup__alsa(ma_device* pDevice)
-{
-    ma_uint64 t = 1;
-    int resultWrite = 0;
-
-    MA_ASSERT(pDevice != NULL);
-
-    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Waking up...\n");
-
-    /* Write to an eventfd to trigger a wakeup from poll() and abort any reading or writing. */
-    if (pDevice->alsa.pPollDescriptorsCapture != NULL) {
-        resultWrite = write(pDevice->alsa.wakeupfdCapture, &t, sizeof(t));
-    }
-    if (pDevice->alsa.pPollDescriptorsPlayback != NULL) {
-        resultWrite = write(pDevice->alsa.wakeupfdPlayback, &t, sizeof(t));
-    }
-
-    if (resultWrite < 0) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[ALSA] write() failed.\n");
-        return ma_result_from_errno(errno);
-    }
-
-    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[ALSA] Waking up completed successfully.\n");
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_uninit__alsa(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_alsa);
-
-    /* Clean up memory for memory leak checkers. */
-    ((ma_snd_config_update_free_global_proc)pContext->alsa.snd_config_update_free_global)();
-
-#ifndef MA_NO_RUNTIME_LINKING
-    ma_dlclose(ma_context_get_log(pContext), pContext->alsa.asoundSO);
-#endif
-
-    ma_mutex_uninit(&pContext->alsa.internalDeviceEnumLock);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__alsa(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    ma_result result;
-#ifndef MA_NO_RUNTIME_LINKING
-    const char* libasoundNames[] = {
-        "libasound.so.2",
-        "libasound.so"
-    };
-    size_t i;
-
-    for (i = 0; i < ma_countof(libasoundNames); ++i) {
-        pContext->alsa.asoundSO = ma_dlopen(ma_context_get_log(pContext), libasoundNames[i]);
-        if (pContext->alsa.asoundSO != NULL) {
-            break;
-        }
-    }
-
-    if (pContext->alsa.asoundSO == NULL) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "[ALSA] Failed to open shared object.\n");
-        return MA_NO_BACKEND;
-    }
-
-    pContext->alsa.snd_pcm_open                           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_open");
-    pContext->alsa.snd_pcm_close                          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_close");
-    pContext->alsa.snd_pcm_hw_params_sizeof               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_sizeof");
-    pContext->alsa.snd_pcm_hw_params_any                  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_any");
-    pContext->alsa.snd_pcm_hw_params_set_format           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_format");
-    pContext->alsa.snd_pcm_hw_params_set_format_first     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_format_first");
-    pContext->alsa.snd_pcm_hw_params_get_format_mask      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_format_mask");
-    pContext->alsa.snd_pcm_hw_params_set_channels         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_channels");
-    pContext->alsa.snd_pcm_hw_params_set_channels_near    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_channels_near");
-    pContext->alsa.snd_pcm_hw_params_set_channels_minmax  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_channels_minmax");
-    pContext->alsa.snd_pcm_hw_params_set_rate_resample    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_rate_resample");
-    pContext->alsa.snd_pcm_hw_params_set_rate             = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_rate");
-    pContext->alsa.snd_pcm_hw_params_set_rate_near        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_rate_near");
-    pContext->alsa.snd_pcm_hw_params_set_buffer_size_near = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_buffer_size_near");
-    pContext->alsa.snd_pcm_hw_params_set_periods_near     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_periods_near");
-    pContext->alsa.snd_pcm_hw_params_set_access           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_set_access");
-    pContext->alsa.snd_pcm_hw_params_get_format           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_format");
-    pContext->alsa.snd_pcm_hw_params_get_channels         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_channels");
-    pContext->alsa.snd_pcm_hw_params_get_channels_min     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_channels_min");
-    pContext->alsa.snd_pcm_hw_params_get_channels_max     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_channels_max");
-    pContext->alsa.snd_pcm_hw_params_get_rate             = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_rate");
-    pContext->alsa.snd_pcm_hw_params_get_rate_min         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_rate_min");
-    pContext->alsa.snd_pcm_hw_params_get_rate_max         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_rate_max");
-    pContext->alsa.snd_pcm_hw_params_get_buffer_size      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_buffer_size");
-    pContext->alsa.snd_pcm_hw_params_get_periods          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_periods");
-    pContext->alsa.snd_pcm_hw_params_get_access           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_get_access");
-    pContext->alsa.snd_pcm_hw_params_test_format          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_test_format");
-    pContext->alsa.snd_pcm_hw_params_test_channels        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_test_channels");
-    pContext->alsa.snd_pcm_hw_params_test_rate            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params_test_rate");
-    pContext->alsa.snd_pcm_hw_params                      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_hw_params");
-    pContext->alsa.snd_pcm_sw_params_sizeof               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_sw_params_sizeof");
-    pContext->alsa.snd_pcm_sw_params_current              = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_sw_params_current");
-    pContext->alsa.snd_pcm_sw_params_get_boundary         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_sw_params_get_boundary");
-    pContext->alsa.snd_pcm_sw_params_set_avail_min        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_sw_params_set_avail_min");
-    pContext->alsa.snd_pcm_sw_params_set_start_threshold  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_sw_params_set_start_threshold");
-    pContext->alsa.snd_pcm_sw_params_set_stop_threshold   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_sw_params_set_stop_threshold");
-    pContext->alsa.snd_pcm_sw_params                      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_sw_params");
-    pContext->alsa.snd_pcm_format_mask_sizeof             = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_format_mask_sizeof");
-    pContext->alsa.snd_pcm_format_mask_test               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_format_mask_test");
-    pContext->alsa.snd_pcm_get_chmap                      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_get_chmap");
-    pContext->alsa.snd_pcm_state                          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_state");
-    pContext->alsa.snd_pcm_prepare                        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_prepare");
-    pContext->alsa.snd_pcm_start                          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_start");
-    pContext->alsa.snd_pcm_drop                           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_drop");
-    pContext->alsa.snd_pcm_drain                          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_drain");
-    pContext->alsa.snd_pcm_reset                          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_reset");
-    pContext->alsa.snd_device_name_hint                   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_device_name_hint");
-    pContext->alsa.snd_device_name_get_hint               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_device_name_get_hint");
-    pContext->alsa.snd_card_get_index                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_card_get_index");
-    pContext->alsa.snd_device_name_free_hint              = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_device_name_free_hint");
-    pContext->alsa.snd_pcm_mmap_begin                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_mmap_begin");
-    pContext->alsa.snd_pcm_mmap_commit                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_mmap_commit");
-    pContext->alsa.snd_pcm_recover                        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_recover");
-    pContext->alsa.snd_pcm_readi                          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_readi");
-    pContext->alsa.snd_pcm_writei                         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_writei");
-    pContext->alsa.snd_pcm_avail                          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_avail");
-    pContext->alsa.snd_pcm_avail_update                   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_avail_update");
-    pContext->alsa.snd_pcm_wait                           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_wait");
-    pContext->alsa.snd_pcm_nonblock                       = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_nonblock");
-    pContext->alsa.snd_pcm_info                           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_info");
-    pContext->alsa.snd_pcm_info_sizeof                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_info_sizeof");
-    pContext->alsa.snd_pcm_info_get_name                  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_info_get_name");
-    pContext->alsa.snd_pcm_poll_descriptors               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_poll_descriptors");
-    pContext->alsa.snd_pcm_poll_descriptors_count         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_poll_descriptors_count");
-    pContext->alsa.snd_pcm_poll_descriptors_revents       = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_pcm_poll_descriptors_revents");
-    pContext->alsa.snd_config_update_free_global          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->alsa.asoundSO, "snd_config_update_free_global");
-#else
-    /* The system below is just for type safety. */
-    ma_snd_pcm_open_proc                           _snd_pcm_open                           = snd_pcm_open;
-    ma_snd_pcm_close_proc                          _snd_pcm_close                          = snd_pcm_close;
-    ma_snd_pcm_hw_params_sizeof_proc               _snd_pcm_hw_params_sizeof               = snd_pcm_hw_params_sizeof;
-    ma_snd_pcm_hw_params_any_proc                  _snd_pcm_hw_params_any                  = snd_pcm_hw_params_any;
-    ma_snd_pcm_hw_params_set_format_proc           _snd_pcm_hw_params_set_format           = snd_pcm_hw_params_set_format;
-    ma_snd_pcm_hw_params_set_format_first_proc     _snd_pcm_hw_params_set_format_first     = snd_pcm_hw_params_set_format_first;
-    ma_snd_pcm_hw_params_get_format_mask_proc      _snd_pcm_hw_params_get_format_mask      = snd_pcm_hw_params_get_format_mask;
-    ma_snd_pcm_hw_params_set_channels_proc         _snd_pcm_hw_params_set_channels         = snd_pcm_hw_params_set_channels;
-    ma_snd_pcm_hw_params_set_channels_near_proc    _snd_pcm_hw_params_set_channels_near    = snd_pcm_hw_params_set_channels_near;
-    ma_snd_pcm_hw_params_set_rate_resample_proc    _snd_pcm_hw_params_set_rate_resample    = snd_pcm_hw_params_set_rate_resample;
-    ma_snd_pcm_hw_params_set_rate_near             _snd_pcm_hw_params_set_rate             = snd_pcm_hw_params_set_rate;
-    ma_snd_pcm_hw_params_set_rate_near_proc        _snd_pcm_hw_params_set_rate_near        = snd_pcm_hw_params_set_rate_near;
-    ma_snd_pcm_hw_params_set_rate_minmax_proc      _snd_pcm_hw_params_set_rate_minmax      = snd_pcm_hw_params_set_rate_minmax;
-    ma_snd_pcm_hw_params_set_buffer_size_near_proc _snd_pcm_hw_params_set_buffer_size_near = snd_pcm_hw_params_set_buffer_size_near;
-    ma_snd_pcm_hw_params_set_periods_near_proc     _snd_pcm_hw_params_set_periods_near     = snd_pcm_hw_params_set_periods_near;
-    ma_snd_pcm_hw_params_set_access_proc           _snd_pcm_hw_params_set_access           = snd_pcm_hw_params_set_access;
-    ma_snd_pcm_hw_params_get_format_proc           _snd_pcm_hw_params_get_format           = snd_pcm_hw_params_get_format;
-    ma_snd_pcm_hw_params_get_channels_proc         _snd_pcm_hw_params_get_channels         = snd_pcm_hw_params_get_channels;
-    ma_snd_pcm_hw_params_get_channels_min_proc     _snd_pcm_hw_params_get_channels_min     = snd_pcm_hw_params_get_channels_min;
-    ma_snd_pcm_hw_params_get_channels_max_proc     _snd_pcm_hw_params_get_channels_max     = snd_pcm_hw_params_get_channels_max;
-    ma_snd_pcm_hw_params_get_rate_proc             _snd_pcm_hw_params_get_rate             = snd_pcm_hw_params_get_rate;
-    ma_snd_pcm_hw_params_get_rate_min_proc         _snd_pcm_hw_params_get_rate_min         = snd_pcm_hw_params_get_rate_min;
-    ma_snd_pcm_hw_params_get_rate_max_proc         _snd_pcm_hw_params_get_rate_max         = snd_pcm_hw_params_get_rate_max;
-    ma_snd_pcm_hw_params_get_buffer_size_proc      _snd_pcm_hw_params_get_buffer_size      = snd_pcm_hw_params_get_buffer_size;
-    ma_snd_pcm_hw_params_get_periods_proc          _snd_pcm_hw_params_get_periods          = snd_pcm_hw_params_get_periods;
-    ma_snd_pcm_hw_params_get_access_proc           _snd_pcm_hw_params_get_access           = snd_pcm_hw_params_get_access;
-    ma_snd_pcm_hw_params_test_format_proc          _snd_pcm_hw_params_test_format          = snd_pcm_hw_params_test_format;
-    ma_snd_pcm_hw_params_test_channels_proc        _snd_pcm_hw_params_test_channels        = snd_pcm_hw_params_test_channels;
-    ma_snd_pcm_hw_params_test_rate_proc            _snd_pcm_hw_params_test_rate            = snd_pcm_hw_params_test_rate;
-    ma_snd_pcm_hw_params_proc                      _snd_pcm_hw_params                      = snd_pcm_hw_params;
-    ma_snd_pcm_sw_params_sizeof_proc               _snd_pcm_sw_params_sizeof               = snd_pcm_sw_params_sizeof;
-    ma_snd_pcm_sw_params_current_proc              _snd_pcm_sw_params_current              = snd_pcm_sw_params_current;
-    ma_snd_pcm_sw_params_get_boundary_proc         _snd_pcm_sw_params_get_boundary         = snd_pcm_sw_params_get_boundary;
-    ma_snd_pcm_sw_params_set_avail_min_proc        _snd_pcm_sw_params_set_avail_min        = snd_pcm_sw_params_set_avail_min;
-    ma_snd_pcm_sw_params_set_start_threshold_proc  _snd_pcm_sw_params_set_start_threshold  = snd_pcm_sw_params_set_start_threshold;
-    ma_snd_pcm_sw_params_set_stop_threshold_proc   _snd_pcm_sw_params_set_stop_threshold   = snd_pcm_sw_params_set_stop_threshold;
-    ma_snd_pcm_sw_params_proc                      _snd_pcm_sw_params                      = snd_pcm_sw_params;
-    ma_snd_pcm_format_mask_sizeof_proc             _snd_pcm_format_mask_sizeof             = snd_pcm_format_mask_sizeof;
-    ma_snd_pcm_format_mask_test_proc               _snd_pcm_format_mask_test               = snd_pcm_format_mask_test;
-    ma_snd_pcm_get_chmap_proc                      _snd_pcm_get_chmap                      = snd_pcm_get_chmap;
-    ma_snd_pcm_state_proc                          _snd_pcm_state                          = snd_pcm_state;
-    ma_snd_pcm_prepare_proc                        _snd_pcm_prepare                        = snd_pcm_prepare;
-    ma_snd_pcm_start_proc                          _snd_pcm_start                          = snd_pcm_start;
-    ma_snd_pcm_drop_proc                           _snd_pcm_drop                           = snd_pcm_drop;
-    ma_snd_pcm_drain_proc                          _snd_pcm_drain                          = snd_pcm_drain;
-    ma_snd_pcm_reset_proc                          _snd_pcm_reset                          = snd_pcm_reset;
-    ma_snd_device_name_hint_proc                   _snd_device_name_hint                   = snd_device_name_hint;
-    ma_snd_device_name_get_hint_proc               _snd_device_name_get_hint               = snd_device_name_get_hint;
-    ma_snd_card_get_index_proc                     _snd_card_get_index                     = snd_card_get_index;
-    ma_snd_device_name_free_hint_proc              _snd_device_name_free_hint              = snd_device_name_free_hint;
-    ma_snd_pcm_mmap_begin_proc                     _snd_pcm_mmap_begin                     = snd_pcm_mmap_begin;
-    ma_snd_pcm_mmap_commit_proc                    _snd_pcm_mmap_commit                    = snd_pcm_mmap_commit;
-    ma_snd_pcm_recover_proc                        _snd_pcm_recover                        = snd_pcm_recover;
-    ma_snd_pcm_readi_proc                          _snd_pcm_readi                          = snd_pcm_readi;
-    ma_snd_pcm_writei_proc                         _snd_pcm_writei                         = snd_pcm_writei;
-    ma_snd_pcm_avail_proc                          _snd_pcm_avail                          = snd_pcm_avail;
-    ma_snd_pcm_avail_update_proc                   _snd_pcm_avail_update                   = snd_pcm_avail_update;
-    ma_snd_pcm_wait_proc                           _snd_pcm_wait                           = snd_pcm_wait;
-    ma_snd_pcm_nonblock_proc                       _snd_pcm_nonblock                       = snd_pcm_nonblock;
-    ma_snd_pcm_info_proc                           _snd_pcm_info                           = snd_pcm_info;
-    ma_snd_pcm_info_sizeof_proc                    _snd_pcm_info_sizeof                    = snd_pcm_info_sizeof;
-    ma_snd_pcm_info_get_name_proc                  _snd_pcm_info_get_name                  = snd_pcm_info_get_name;
-    ma_snd_pcm_poll_descriptors                    _snd_pcm_poll_descriptors               = snd_pcm_poll_descriptors;
-    ma_snd_pcm_poll_descriptors_count              _snd_pcm_poll_descriptors_count         = snd_pcm_poll_descriptors_count;
-    ma_snd_pcm_poll_descriptors_revents            _snd_pcm_poll_descriptors_revents       = snd_pcm_poll_descriptors_revents;
-    ma_snd_config_update_free_global_proc          _snd_config_update_free_global          = snd_config_update_free_global;
-
-    pContext->alsa.snd_pcm_open                           = (ma_proc)_snd_pcm_open;
-    pContext->alsa.snd_pcm_close                          = (ma_proc)_snd_pcm_close;
-    pContext->alsa.snd_pcm_hw_params_sizeof               = (ma_proc)_snd_pcm_hw_params_sizeof;
-    pContext->alsa.snd_pcm_hw_params_any                  = (ma_proc)_snd_pcm_hw_params_any;
-    pContext->alsa.snd_pcm_hw_params_set_format           = (ma_proc)_snd_pcm_hw_params_set_format;
-    pContext->alsa.snd_pcm_hw_params_set_format_first     = (ma_proc)_snd_pcm_hw_params_set_format_first;
-    pContext->alsa.snd_pcm_hw_params_get_format_mask      = (ma_proc)_snd_pcm_hw_params_get_format_mask;
-    pContext->alsa.snd_pcm_hw_params_set_channels         = (ma_proc)_snd_pcm_hw_params_set_channels;
-    pContext->alsa.snd_pcm_hw_params_set_channels_near    = (ma_proc)_snd_pcm_hw_params_set_channels_near;
-    pContext->alsa.snd_pcm_hw_params_set_channels_minmax  = (ma_proc)_snd_pcm_hw_params_set_channels_minmax;
-    pContext->alsa.snd_pcm_hw_params_set_rate_resample    = (ma_proc)_snd_pcm_hw_params_set_rate_resample;
-    pContext->alsa.snd_pcm_hw_params_set_rate             = (ma_proc)_snd_pcm_hw_params_set_rate;
-    pContext->alsa.snd_pcm_hw_params_set_rate_near        = (ma_proc)_snd_pcm_hw_params_set_rate_near;
-    pContext->alsa.snd_pcm_hw_params_set_buffer_size_near = (ma_proc)_snd_pcm_hw_params_set_buffer_size_near;
-    pContext->alsa.snd_pcm_hw_params_set_periods_near     = (ma_proc)_snd_pcm_hw_params_set_periods_near;
-    pContext->alsa.snd_pcm_hw_params_set_access           = (ma_proc)_snd_pcm_hw_params_set_access;
-    pContext->alsa.snd_pcm_hw_params_get_format           = (ma_proc)_snd_pcm_hw_params_get_format;
-    pContext->alsa.snd_pcm_hw_params_get_channels         = (ma_proc)_snd_pcm_hw_params_get_channels;
-    pContext->alsa.snd_pcm_hw_params_get_channels_min     = (ma_proc)_snd_pcm_hw_params_get_channels_min;
-    pContext->alsa.snd_pcm_hw_params_get_channels_max     = (ma_proc)_snd_pcm_hw_params_get_channels_max;
-    pContext->alsa.snd_pcm_hw_params_get_rate             = (ma_proc)_snd_pcm_hw_params_get_rate;
-    pContext->alsa.snd_pcm_hw_params_get_rate_min         = (ma_proc)_snd_pcm_hw_params_get_rate_min;
-    pContext->alsa.snd_pcm_hw_params_get_rate_max         = (ma_proc)_snd_pcm_hw_params_get_rate_max;
-    pContext->alsa.snd_pcm_hw_params_get_buffer_size      = (ma_proc)_snd_pcm_hw_params_get_buffer_size;
-    pContext->alsa.snd_pcm_hw_params_get_periods          = (ma_proc)_snd_pcm_hw_params_get_periods;
-    pContext->alsa.snd_pcm_hw_params_get_access           = (ma_proc)_snd_pcm_hw_params_get_access;
-    pContext->alsa.snd_pcm_hw_params_test_format          = (ma_proc)_snd_pcm_hw_params_test_format;
-    pContext->alsa.snd_pcm_hw_params_test_channels        = (ma_proc)_snd_pcm_hw_params_test_channels;
-    pContext->alsa.snd_pcm_hw_params_test_rate            = (ma_proc)_snd_pcm_hw_params_test_rate;
-    pContext->alsa.snd_pcm_hw_params                      = (ma_proc)_snd_pcm_hw_params;
-    pContext->alsa.snd_pcm_sw_params_sizeof               = (ma_proc)_snd_pcm_sw_params_sizeof;
-    pContext->alsa.snd_pcm_sw_params_current              = (ma_proc)_snd_pcm_sw_params_current;
-    pContext->alsa.snd_pcm_sw_params_get_boundary         = (ma_proc)_snd_pcm_sw_params_get_boundary;
-    pContext->alsa.snd_pcm_sw_params_set_avail_min        = (ma_proc)_snd_pcm_sw_params_set_avail_min;
-    pContext->alsa.snd_pcm_sw_params_set_start_threshold  = (ma_proc)_snd_pcm_sw_params_set_start_threshold;
-    pContext->alsa.snd_pcm_sw_params_set_stop_threshold   = (ma_proc)_snd_pcm_sw_params_set_stop_threshold;
-    pContext->alsa.snd_pcm_sw_params                      = (ma_proc)_snd_pcm_sw_params;
-    pContext->alsa.snd_pcm_format_mask_sizeof             = (ma_proc)_snd_pcm_format_mask_sizeof;
-    pContext->alsa.snd_pcm_format_mask_test               = (ma_proc)_snd_pcm_format_mask_test;
-    pContext->alsa.snd_pcm_get_chmap                      = (ma_proc)_snd_pcm_get_chmap;
-    pContext->alsa.snd_pcm_state                          = (ma_proc)_snd_pcm_state;
-    pContext->alsa.snd_pcm_prepare                        = (ma_proc)_snd_pcm_prepare;
-    pContext->alsa.snd_pcm_start                          = (ma_proc)_snd_pcm_start;
-    pContext->alsa.snd_pcm_drop                           = (ma_proc)_snd_pcm_drop;
-    pContext->alsa.snd_pcm_drain                          = (ma_proc)_snd_pcm_drain;
-    pContext->alsa.snd_pcm_reset                          = (ma_proc)_snd_pcm_reset;
-    pContext->alsa.snd_device_name_hint                   = (ma_proc)_snd_device_name_hint;
-    pContext->alsa.snd_device_name_get_hint               = (ma_proc)_snd_device_name_get_hint;
-    pContext->alsa.snd_card_get_index                     = (ma_proc)_snd_card_get_index;
-    pContext->alsa.snd_device_name_free_hint              = (ma_proc)_snd_device_name_free_hint;
-    pContext->alsa.snd_pcm_mmap_begin                     = (ma_proc)_snd_pcm_mmap_begin;
-    pContext->alsa.snd_pcm_mmap_commit                    = (ma_proc)_snd_pcm_mmap_commit;
-    pContext->alsa.snd_pcm_recover                        = (ma_proc)_snd_pcm_recover;
-    pContext->alsa.snd_pcm_readi                          = (ma_proc)_snd_pcm_readi;
-    pContext->alsa.snd_pcm_writei                         = (ma_proc)_snd_pcm_writei;
-    pContext->alsa.snd_pcm_avail                          = (ma_proc)_snd_pcm_avail;
-    pContext->alsa.snd_pcm_avail_update                   = (ma_proc)_snd_pcm_avail_update;
-    pContext->alsa.snd_pcm_wait                           = (ma_proc)_snd_pcm_wait;
-    pContext->alsa.snd_pcm_nonblock                       = (ma_proc)_snd_pcm_nonblock;
-    pContext->alsa.snd_pcm_info                           = (ma_proc)_snd_pcm_info;
-    pContext->alsa.snd_pcm_info_sizeof                    = (ma_proc)_snd_pcm_info_sizeof;
-    pContext->alsa.snd_pcm_info_get_name                  = (ma_proc)_snd_pcm_info_get_name;
-    pContext->alsa.snd_pcm_poll_descriptors               = (ma_proc)_snd_pcm_poll_descriptors;
-    pContext->alsa.snd_pcm_poll_descriptors_count         = (ma_proc)_snd_pcm_poll_descriptors_count;
-    pContext->alsa.snd_pcm_poll_descriptors_revents       = (ma_proc)_snd_pcm_poll_descriptors_revents;
-    pContext->alsa.snd_config_update_free_global          = (ma_proc)_snd_config_update_free_global;
-#endif
-
-    pContext->alsa.useVerboseDeviceEnumeration = pConfig->alsa.useVerboseDeviceEnumeration;
-
-    result = ma_mutex_init(&pContext->alsa.internalDeviceEnumLock);
-    if (result != MA_SUCCESS) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[ALSA] WARNING: Failed to initialize mutex for internal device enumeration.");
-        return result;
-    }
-
-    pCallbacks->onContextInit             = ma_context_init__alsa;
-    pCallbacks->onContextUninit           = ma_context_uninit__alsa;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__alsa;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__alsa;
-    pCallbacks->onDeviceInit              = ma_device_init__alsa;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__alsa;
-    pCallbacks->onDeviceStart             = ma_device_start__alsa;
-    pCallbacks->onDeviceStop              = ma_device_stop__alsa;
-    pCallbacks->onDeviceRead              = ma_device_read__alsa;
-    pCallbacks->onDeviceWrite             = ma_device_write__alsa;
-    pCallbacks->onDeviceDataLoop          = NULL;
-    pCallbacks->onDeviceDataLoopWakeup    = ma_device_data_loop_wakeup__alsa;
-
-    return MA_SUCCESS;
-}
-#endif  /* ALSA */
-
-
-
-/******************************************************************************
-
-PulseAudio Backend
-
-******************************************************************************/
-#ifdef MA_HAS_PULSEAUDIO
-/*
-The PulseAudio API, along with Apple's Core Audio, is the worst of the maintream audio APIs. This is a brief description of what's going on
-in the PulseAudio backend. I apologize if this gets a bit ranty for your liking - you might want to skip this discussion.
-
-PulseAudio has something they call the "Simple API", which unfortunately isn't suitable for miniaudio. I've not seen anywhere where it
-allows you to enumerate over devices, nor does it seem to support the ability to stop and start streams. Looking at the documentation, it
-appears as though the stream is constantly running and you prevent sound from being emitted or captured by simply not calling the read or
-write functions. This is not a professional solution as it would be much better to *actually* stop the underlying stream. Perhaps the
-simple API has some smarts to do this automatically, but I'm not sure. Another limitation with the simple API is that it seems inefficient
-when you want to have multiple streams to a single context. For these reasons, miniaudio is not using the simple API.
-
-Since we're not using the simple API, we're left with the asynchronous API as our only other option. And boy, is this where it starts to
-get fun, and I don't mean that in a good way...
-
-The problems start with the very name of the API - "asynchronous". Yes, this is an asynchronous oriented API which means your commands
-don't immediately take effect. You instead need to issue your commands, and then wait for them to complete. The waiting mechanism is
-enabled through the use of a "main loop". In the asychronous API you cannot get away from the main loop, and the main loop is where almost
-all of PulseAudio's problems stem from.
-
-When you first initialize PulseAudio you need an object referred to as "main loop". You can implement this yourself by defining your own
-vtable, but it's much easier to just use one of the built-in main loop implementations. There's two generic implementations called
-pa_mainloop and pa_threaded_mainloop, and another implementation specific to GLib called pa_glib_mainloop. We're using pa_threaded_mainloop
-because it simplifies management of the worker thread. The idea of the main loop object is pretty self explanatory - you're supposed to use
-it to implement a worker thread which runs in a loop. The main loop is where operations are actually executed.
-
-To initialize the main loop, you just use `pa_threaded_mainloop_new()`. This is the first function you'll call. You can then get a pointer
-to the vtable with `pa_threaded_mainloop_get_api()` (the main loop vtable is called `pa_mainloop_api`). Again, you can bypass the threaded
-main loop object entirely and just implement `pa_mainloop_api` directly, but there's no need for it unless you're doing something extremely
-specialized such as if you want to integrate it into your application's existing main loop infrastructure.
-
-(EDIT 2021-01-26: miniaudio is no longer using `pa_threaded_mainloop` due to this issue: https://github.com/mackron/miniaudio/issues/262.
-It is now using `pa_mainloop` which turns out to be a simpler solution anyway. The rest of this rant still applies, however.)
-
-Once you have your main loop vtable (the `pa_mainloop_api` object) you can create the PulseAudio context. This is very similar to
-miniaudio's context and they map to each other quite well. You have one context to many streams, which is basically the same as miniaudio's
-one `ma_context` to many `ma_device`s. Here's where it starts to get annoying, however. When you first create the PulseAudio context, which
-is done with `pa_context_new()`, it's not actually connected to anything. When you connect, you call `pa_context_connect()`. However, if
-you remember, PulseAudio is an asynchronous API. That means you cannot just assume the context is connected after `pa_context_context()`
-has returned. You instead need to wait for it to connect. To do this, you need to either wait for a callback to get fired, which you can
-set with `pa_context_set_state_callback()`, or you can continuously poll the context's state. Either way, you need to run this in a loop.
-All objects from here out are created from the context, and, I believe, you can't be creating these objects until the context is connected.
-This waiting loop is therefore unavoidable. In order for the waiting to ever complete, however, the main loop needs to be running. Before
-attempting to connect the context, the main loop needs to be started with `pa_threaded_mainloop_start()`.
-
-The reason for this asynchronous design is to support cases where you're connecting to a remote server, say through a local network or an
-internet connection. However, the *VAST* majority of cases don't involve this at all - they just connect to a local "server" running on the
-host machine. The fact that this would be the default rather than making `pa_context_connect()` synchronous tends to boggle the mind.
-
-Once the context has been created and connected you can start creating a stream. A PulseAudio stream is analogous to miniaudio's device.
-The initialization of a stream is fairly standard - you configure some attributes (analogous to miniaudio's device config) and then call
-`pa_stream_new()` to actually create it. Here is where we start to get into "operations". When configuring the stream, you can get
-information about the source (such as sample format, sample rate, etc.), however it's not synchronous. Instead, a `pa_operation` object
-is returned from `pa_context_get_source_info_by_name()` (capture) or `pa_context_get_sink_info_by_name()` (playback). Then, you need to
-run a loop (again!) to wait for the operation to complete which you can determine via a callback or polling, just like we did with the
-context. Then, as an added bonus, you need to decrement the reference counter of the `pa_operation` object to ensure memory is cleaned up.
-All of that just to retrieve basic information about a device!
-
-Once the basic information about the device has been retrieved, miniaudio can now create the stream with `ma_stream_new()`. Like the
-context, this needs to be connected. But we need to be careful here, because we're now about to introduce one of the most horrific design
-choices in PulseAudio.
-
-PulseAudio allows you to specify a callback that is fired when data can be written to or read from a stream. The language is important here
-because PulseAudio takes it literally, specifically the "can be". You would think these callbacks would be appropriate as the place for
-writing and reading data to and from the stream, and that would be right, except when it's not. When you initialize the stream, you can
-set a flag that tells PulseAudio to not start the stream automatically. This is required because miniaudio does not auto-start devices
-straight after initialization - you need to call `ma_device_start()` manually. The problem is that even when this flag is specified,
-PulseAudio will immediately fire it's write or read callback. This is *technically* correct (based on the wording in the documentation)
-because indeed, data *can* be written at this point. The problem is that it's not *practical*. It makes sense that the write/read callback
-would be where a program will want to write or read data to or from the stream, but when it's called before the application has even
-requested that the stream be started, it's just not practical because the program probably isn't ready for any kind of data delivery at
-that point (it may still need to load files or whatnot). Instead, this callback should only be fired when the application requests the
-stream be started which is how it works with literally *every* other callback-based audio API. Since miniaudio forbids firing of the data
-callback until the device has been started (as it should be with *all* callback based APIs), logic needs to be added to ensure miniaudio
-doesn't just blindly fire the application-defined data callback from within the PulseAudio callback before the stream has actually been
-started. The device state is used for this - if the state is anything other than `ma_device_state_starting` or `ma_device_state_started`, the main data
-callback is not fired.
-
-This, unfortunately, is not the end of the problems with the PulseAudio write callback. Any normal callback based audio API will
-continuously fire the callback at regular intervals based on the size of the internal buffer. This will only ever be fired when the device
-is running, and will be fired regardless of whether or not the user actually wrote anything to the device/stream. This not the case in
-PulseAudio. In PulseAudio, the data callback will *only* be called if you wrote something to it previously. That means, if you don't call
-`pa_stream_write()`, the callback will not get fired. On the surface you wouldn't think this would matter because you should be always
-writing data, and if you don't have anything to write, just write silence. That's fine until you want to drain the stream. You see, if
-you're continuously writing data to the stream, the stream will never get drained! That means in order to drain the stream, you need to
-*not* write data to it! But remember, when you don't write data to the stream, the callback won't get fired again! Why is draining
-important? Because that's how we've defined stopping to work in miniaudio. In miniaudio, stopping the device requires it to be drained
-before returning from ma_device_stop(). So we've stopped the device, which requires us to drain, but draining requires us to *not* write
-data to the stream (or else it won't ever complete draining), but not writing to the stream means the callback won't get fired again!
-
-This becomes a problem when stopping and then restarting the device. When the device is stopped, it's drained, which requires us to *not*
-write anything to the stream. But then, since we didn't write anything to it, the write callback will *never* get called again if we just
-resume the stream naively. This means that starting the stream requires us to write data to the stream from outside the callback. This
-disconnect is something PulseAudio has got seriously wrong - there should only ever be a single source of data delivery, that being the
-callback. (I have tried using `pa_stream_flush()` to trigger the write callback to fire, but this just doesn't work for some reason.)
-
-Once you've created the stream, you need to connect it which involves the whole waiting procedure. This is the same process as the context,
-only this time you'll poll for the state with `pa_stream_get_status()`. The starting and stopping of a streaming is referred to as
-"corking" in PulseAudio. The analogy is corking a barrel. To start the stream, you uncork it, to stop it you cork it. Personally I think
-it's silly - why would you not just call it "starting" and "stopping" like any other normal audio API? Anyway, the act of corking is, you
-guessed it, asynchronous. This means you'll need our waiting loop as usual. Again, why this asynchronous design is the default is
-absolutely beyond me. Would it really be that hard to just make it run synchronously?
-
-Teardown is pretty simple (what?!). It's just a matter of calling the relevant `_unref()` function on each object in reverse order that
-they were initialized in.
-
-That's about it from the PulseAudio side. A bit ranty, I know, but they really need to fix that main loop and callback system. They're
-embarrassingly unpractical. The main loop thing is an easy fix - have synchronous versions of all APIs. If an application wants these to
-run asynchronously, they can execute them in a separate thread themselves. The desire to run these asynchronously is such a niche
-requirement - it makes no sense to make it the default. The stream write callback needs to be change, or an alternative provided, that is
-constantly fired, regardless of whether or not `pa_stream_write()` has been called, and it needs to take a pointer to a buffer as a
-parameter which the program just writes to directly rather than having to call `pa_stream_writable_size()` and `pa_stream_write()`. These
-changes alone will change PulseAudio from one of the worst audio APIs to one of the best.
-*/
-
-
-/*
-It is assumed pulseaudio.h is available when linking at compile time. When linking at compile time, we use the declarations in the header
-to check for type safety. We cannot do this when linking at run time because the header might not be available.
-*/
-#ifdef MA_NO_RUNTIME_LINKING
-
-/* pulseaudio.h marks some functions with "inline" which isn't always supported. Need to emulate it. */
-#if !defined(__cplusplus)
-    #if defined(__STRICT_ANSI__)
-        #if !defined(inline)
-            #define inline __inline__ __attribute__((always_inline))
-            #define MA_INLINE_DEFINED
-        #endif
-    #endif
-#endif
-#include <pulse/pulseaudio.h>
-#if defined(MA_INLINE_DEFINED)
-    #undef inline
-    #undef MA_INLINE_DEFINED
-#endif
-
-#define MA_PA_OK                                       PA_OK
-#define MA_PA_ERR_ACCESS                               PA_ERR_ACCESS
-#define MA_PA_ERR_INVALID                              PA_ERR_INVALID
-#define MA_PA_ERR_NOENTITY                             PA_ERR_NOENTITY
-#define MA_PA_ERR_NOTSUPPORTED                         PA_ERR_NOTSUPPORTED
-
-#define MA_PA_CHANNELS_MAX                             PA_CHANNELS_MAX
-#define MA_PA_RATE_MAX                                 PA_RATE_MAX
-
-typedef pa_context_flags_t ma_pa_context_flags_t;
-#define MA_PA_CONTEXT_NOFLAGS                          PA_CONTEXT_NOFLAGS
-#define MA_PA_CONTEXT_NOAUTOSPAWN                      PA_CONTEXT_NOAUTOSPAWN
-#define MA_PA_CONTEXT_NOFAIL                           PA_CONTEXT_NOFAIL
-
-typedef pa_stream_flags_t ma_pa_stream_flags_t;
-#define MA_PA_STREAM_NOFLAGS                           PA_STREAM_NOFLAGS
-#define MA_PA_STREAM_START_CORKED                      PA_STREAM_START_CORKED
-#define MA_PA_STREAM_INTERPOLATE_TIMING                PA_STREAM_INTERPOLATE_TIMING
-#define MA_PA_STREAM_NOT_MONOTONIC                     PA_STREAM_NOT_MONOTONIC
-#define MA_PA_STREAM_AUTO_TIMING_UPDATE                PA_STREAM_AUTO_TIMING_UPDATE
-#define MA_PA_STREAM_NO_REMAP_CHANNELS                 PA_STREAM_NO_REMAP_CHANNELS
-#define MA_PA_STREAM_NO_REMIX_CHANNELS                 PA_STREAM_NO_REMIX_CHANNELS
-#define MA_PA_STREAM_FIX_FORMAT                        PA_STREAM_FIX_FORMAT
-#define MA_PA_STREAM_FIX_RATE                          PA_STREAM_FIX_RATE
-#define MA_PA_STREAM_FIX_CHANNELS                      PA_STREAM_FIX_CHANNELS
-#define MA_PA_STREAM_DONT_MOVE                         PA_STREAM_DONT_MOVE
-#define MA_PA_STREAM_VARIABLE_RATE                     PA_STREAM_VARIABLE_RATE
-#define MA_PA_STREAM_PEAK_DETECT                       PA_STREAM_PEAK_DETECT
-#define MA_PA_STREAM_START_MUTED                       PA_STREAM_START_MUTED
-#define MA_PA_STREAM_ADJUST_LATENCY                    PA_STREAM_ADJUST_LATENCY
-#define MA_PA_STREAM_EARLY_REQUESTS                    PA_STREAM_EARLY_REQUESTS
-#define MA_PA_STREAM_DONT_INHIBIT_AUTO_SUSPEND         PA_STREAM_DONT_INHIBIT_AUTO_SUSPEND
-#define MA_PA_STREAM_START_UNMUTED                     PA_STREAM_START_UNMUTED
-#define MA_PA_STREAM_FAIL_ON_SUSPEND                   PA_STREAM_FAIL_ON_SUSPEND
-#define MA_PA_STREAM_RELATIVE_VOLUME                   PA_STREAM_RELATIVE_VOLUME
-#define MA_PA_STREAM_PASSTHROUGH                       PA_STREAM_PASSTHROUGH
-
-typedef pa_sink_flags_t ma_pa_sink_flags_t;
-#define MA_PA_SINK_NOFLAGS                             PA_SINK_NOFLAGS
-#define MA_PA_SINK_HW_VOLUME_CTRL                      PA_SINK_HW_VOLUME_CTRL
-#define MA_PA_SINK_LATENCY                             PA_SINK_LATENCY
-#define MA_PA_SINK_HARDWARE                            PA_SINK_HARDWARE
-#define MA_PA_SINK_NETWORK                             PA_SINK_NETWORK
-#define MA_PA_SINK_HW_MUTE_CTRL                        PA_SINK_HW_MUTE_CTRL
-#define MA_PA_SINK_DECIBEL_VOLUME                      PA_SINK_DECIBEL_VOLUME
-#define MA_PA_SINK_FLAT_VOLUME                         PA_SINK_FLAT_VOLUME
-#define MA_PA_SINK_DYNAMIC_LATENCY                     PA_SINK_DYNAMIC_LATENCY
-#define MA_PA_SINK_SET_FORMATS                         PA_SINK_SET_FORMATS
-
-typedef pa_source_flags_t ma_pa_source_flags_t;
-#define MA_PA_SOURCE_NOFLAGS                           PA_SOURCE_NOFLAGS
-#define MA_PA_SOURCE_HW_VOLUME_CTRL                    PA_SOURCE_HW_VOLUME_CTRL
-#define MA_PA_SOURCE_LATENCY                           PA_SOURCE_LATENCY
-#define MA_PA_SOURCE_HARDWARE                          PA_SOURCE_HARDWARE
-#define MA_PA_SOURCE_NETWORK                           PA_SOURCE_NETWORK
-#define MA_PA_SOURCE_HW_MUTE_CTRL                      PA_SOURCE_HW_MUTE_CTRL
-#define MA_PA_SOURCE_DECIBEL_VOLUME                    PA_SOURCE_DECIBEL_VOLUME
-#define MA_PA_SOURCE_DYNAMIC_LATENCY                   PA_SOURCE_DYNAMIC_LATENCY
-#define MA_PA_SOURCE_FLAT_VOLUME                       PA_SOURCE_FLAT_VOLUME
-
-typedef pa_context_state_t ma_pa_context_state_t;
-#define MA_PA_CONTEXT_UNCONNECTED                      PA_CONTEXT_UNCONNECTED
-#define MA_PA_CONTEXT_CONNECTING                       PA_CONTEXT_CONNECTING
-#define MA_PA_CONTEXT_AUTHORIZING                      PA_CONTEXT_AUTHORIZING
-#define MA_PA_CONTEXT_SETTING_NAME                     PA_CONTEXT_SETTING_NAME
-#define MA_PA_CONTEXT_READY                            PA_CONTEXT_READY
-#define MA_PA_CONTEXT_FAILED                           PA_CONTEXT_FAILED
-#define MA_PA_CONTEXT_TERMINATED                       PA_CONTEXT_TERMINATED
-
-typedef pa_stream_state_t ma_pa_stream_state_t;
-#define MA_PA_STREAM_UNCONNECTED                       PA_STREAM_UNCONNECTED
-#define MA_PA_STREAM_CREATING                          PA_STREAM_CREATING
-#define MA_PA_STREAM_READY                             PA_STREAM_READY
-#define MA_PA_STREAM_FAILED                            PA_STREAM_FAILED
-#define MA_PA_STREAM_TERMINATED                        PA_STREAM_TERMINATED
-
-typedef pa_operation_state_t ma_pa_operation_state_t;
-#define MA_PA_OPERATION_RUNNING                        PA_OPERATION_RUNNING
-#define MA_PA_OPERATION_DONE                           PA_OPERATION_DONE
-#define MA_PA_OPERATION_CANCELLED                      PA_OPERATION_CANCELLED
-
-typedef pa_sink_state_t ma_pa_sink_state_t;
-#define MA_PA_SINK_INVALID_STATE                       PA_SINK_INVALID_STATE
-#define MA_PA_SINK_RUNNING                             PA_SINK_RUNNING
-#define MA_PA_SINK_IDLE                                PA_SINK_IDLE
-#define MA_PA_SINK_SUSPENDED                           PA_SINK_SUSPENDED
-
-typedef pa_source_state_t ma_pa_source_state_t;
-#define MA_PA_SOURCE_INVALID_STATE                     PA_SOURCE_INVALID_STATE
-#define MA_PA_SOURCE_RUNNING                           PA_SOURCE_RUNNING
-#define MA_PA_SOURCE_IDLE                              PA_SOURCE_IDLE
-#define MA_PA_SOURCE_SUSPENDED                         PA_SOURCE_SUSPENDED
-
-typedef pa_seek_mode_t ma_pa_seek_mode_t;
-#define MA_PA_SEEK_RELATIVE                            PA_SEEK_RELATIVE
-#define MA_PA_SEEK_ABSOLUTE                            PA_SEEK_ABSOLUTE
-#define MA_PA_SEEK_RELATIVE_ON_READ                    PA_SEEK_RELATIVE_ON_READ
-#define MA_PA_SEEK_RELATIVE_END                        PA_SEEK_RELATIVE_END
-
-typedef pa_channel_position_t ma_pa_channel_position_t;
-#define MA_PA_CHANNEL_POSITION_INVALID                 PA_CHANNEL_POSITION_INVALID
-#define MA_PA_CHANNEL_POSITION_MONO                    PA_CHANNEL_POSITION_MONO
-#define MA_PA_CHANNEL_POSITION_FRONT_LEFT              PA_CHANNEL_POSITION_FRONT_LEFT
-#define MA_PA_CHANNEL_POSITION_FRONT_RIGHT             PA_CHANNEL_POSITION_FRONT_RIGHT
-#define MA_PA_CHANNEL_POSITION_FRONT_CENTER            PA_CHANNEL_POSITION_FRONT_CENTER
-#define MA_PA_CHANNEL_POSITION_REAR_CENTER             PA_CHANNEL_POSITION_REAR_CENTER
-#define MA_PA_CHANNEL_POSITION_REAR_LEFT               PA_CHANNEL_POSITION_REAR_LEFT
-#define MA_PA_CHANNEL_POSITION_REAR_RIGHT              PA_CHANNEL_POSITION_REAR_RIGHT
-#define MA_PA_CHANNEL_POSITION_LFE                     PA_CHANNEL_POSITION_LFE
-#define MA_PA_CHANNEL_POSITION_FRONT_LEFT_OF_CENTER    PA_CHANNEL_POSITION_FRONT_LEFT_OF_CENTER
-#define MA_PA_CHANNEL_POSITION_FRONT_RIGHT_OF_CENTER   PA_CHANNEL_POSITION_FRONT_RIGHT_OF_CENTER
-#define MA_PA_CHANNEL_POSITION_SIDE_LEFT               PA_CHANNEL_POSITION_SIDE_LEFT
-#define MA_PA_CHANNEL_POSITION_SIDE_RIGHT              PA_CHANNEL_POSITION_SIDE_RIGHT
-#define MA_PA_CHANNEL_POSITION_AUX0                    PA_CHANNEL_POSITION_AUX0
-#define MA_PA_CHANNEL_POSITION_AUX1                    PA_CHANNEL_POSITION_AUX1
-#define MA_PA_CHANNEL_POSITION_AUX2                    PA_CHANNEL_POSITION_AUX2
-#define MA_PA_CHANNEL_POSITION_AUX3                    PA_CHANNEL_POSITION_AUX3
-#define MA_PA_CHANNEL_POSITION_AUX4                    PA_CHANNEL_POSITION_AUX4
-#define MA_PA_CHANNEL_POSITION_AUX5                    PA_CHANNEL_POSITION_AUX5
-#define MA_PA_CHANNEL_POSITION_AUX6                    PA_CHANNEL_POSITION_AUX6
-#define MA_PA_CHANNEL_POSITION_AUX7                    PA_CHANNEL_POSITION_AUX7
-#define MA_PA_CHANNEL_POSITION_AUX8                    PA_CHANNEL_POSITION_AUX8
-#define MA_PA_CHANNEL_POSITION_AUX9                    PA_CHANNEL_POSITION_AUX9
-#define MA_PA_CHANNEL_POSITION_AUX10                   PA_CHANNEL_POSITION_AUX10
-#define MA_PA_CHANNEL_POSITION_AUX11                   PA_CHANNEL_POSITION_AUX11
-#define MA_PA_CHANNEL_POSITION_AUX12                   PA_CHANNEL_POSITION_AUX12
-#define MA_PA_CHANNEL_POSITION_AUX13                   PA_CHANNEL_POSITION_AUX13
-#define MA_PA_CHANNEL_POSITION_AUX14                   PA_CHANNEL_POSITION_AUX14
-#define MA_PA_CHANNEL_POSITION_AUX15                   PA_CHANNEL_POSITION_AUX15
-#define MA_PA_CHANNEL_POSITION_AUX16                   PA_CHANNEL_POSITION_AUX16
-#define MA_PA_CHANNEL_POSITION_AUX17                   PA_CHANNEL_POSITION_AUX17
-#define MA_PA_CHANNEL_POSITION_AUX18                   PA_CHANNEL_POSITION_AUX18
-#define MA_PA_CHANNEL_POSITION_AUX19                   PA_CHANNEL_POSITION_AUX19
-#define MA_PA_CHANNEL_POSITION_AUX20                   PA_CHANNEL_POSITION_AUX20
-#define MA_PA_CHANNEL_POSITION_AUX21                   PA_CHANNEL_POSITION_AUX21
-#define MA_PA_CHANNEL_POSITION_AUX22                   PA_CHANNEL_POSITION_AUX22
-#define MA_PA_CHANNEL_POSITION_AUX23                   PA_CHANNEL_POSITION_AUX23
-#define MA_PA_CHANNEL_POSITION_AUX24                   PA_CHANNEL_POSITION_AUX24
-#define MA_PA_CHANNEL_POSITION_AUX25                   PA_CHANNEL_POSITION_AUX25
-#define MA_PA_CHANNEL_POSITION_AUX26                   PA_CHANNEL_POSITION_AUX26
-#define MA_PA_CHANNEL_POSITION_AUX27                   PA_CHANNEL_POSITION_AUX27
-#define MA_PA_CHANNEL_POSITION_AUX28                   PA_CHANNEL_POSITION_AUX28
-#define MA_PA_CHANNEL_POSITION_AUX29                   PA_CHANNEL_POSITION_AUX29
-#define MA_PA_CHANNEL_POSITION_AUX30                   PA_CHANNEL_POSITION_AUX30
-#define MA_PA_CHANNEL_POSITION_AUX31                   PA_CHANNEL_POSITION_AUX31
-#define MA_PA_CHANNEL_POSITION_TOP_CENTER              PA_CHANNEL_POSITION_TOP_CENTER
-#define MA_PA_CHANNEL_POSITION_TOP_FRONT_LEFT          PA_CHANNEL_POSITION_TOP_FRONT_LEFT
-#define MA_PA_CHANNEL_POSITION_TOP_FRONT_RIGHT         PA_CHANNEL_POSITION_TOP_FRONT_RIGHT
-#define MA_PA_CHANNEL_POSITION_TOP_FRONT_CENTER        PA_CHANNEL_POSITION_TOP_FRONT_CENTER
-#define MA_PA_CHANNEL_POSITION_TOP_REAR_LEFT           PA_CHANNEL_POSITION_TOP_REAR_LEFT
-#define MA_PA_CHANNEL_POSITION_TOP_REAR_RIGHT          PA_CHANNEL_POSITION_TOP_REAR_RIGHT
-#define MA_PA_CHANNEL_POSITION_TOP_REAR_CENTER         PA_CHANNEL_POSITION_TOP_REAR_CENTER
-#define MA_PA_CHANNEL_POSITION_LEFT                    PA_CHANNEL_POSITION_LEFT
-#define MA_PA_CHANNEL_POSITION_RIGHT                   PA_CHANNEL_POSITION_RIGHT
-#define MA_PA_CHANNEL_POSITION_CENTER                  PA_CHANNEL_POSITION_CENTER
-#define MA_PA_CHANNEL_POSITION_SUBWOOFER               PA_CHANNEL_POSITION_SUBWOOFER
-
-typedef pa_channel_map_def_t ma_pa_channel_map_def_t;
-#define MA_PA_CHANNEL_MAP_AIFF                         PA_CHANNEL_MAP_AIFF
-#define MA_PA_CHANNEL_MAP_ALSA                         PA_CHANNEL_MAP_ALSA
-#define MA_PA_CHANNEL_MAP_AUX                          PA_CHANNEL_MAP_AUX
-#define MA_PA_CHANNEL_MAP_WAVEEX                       PA_CHANNEL_MAP_WAVEEX
-#define MA_PA_CHANNEL_MAP_OSS                          PA_CHANNEL_MAP_OSS
-#define MA_PA_CHANNEL_MAP_DEFAULT                      PA_CHANNEL_MAP_DEFAULT
-
-typedef pa_sample_format_t ma_pa_sample_format_t;
-#define MA_PA_SAMPLE_INVALID                           PA_SAMPLE_INVALID
-#define MA_PA_SAMPLE_U8                                PA_SAMPLE_U8
-#define MA_PA_SAMPLE_ALAW                              PA_SAMPLE_ALAW
-#define MA_PA_SAMPLE_ULAW                              PA_SAMPLE_ULAW
-#define MA_PA_SAMPLE_S16LE                             PA_SAMPLE_S16LE
-#define MA_PA_SAMPLE_S16BE                             PA_SAMPLE_S16BE
-#define MA_PA_SAMPLE_FLOAT32LE                         PA_SAMPLE_FLOAT32LE
-#define MA_PA_SAMPLE_FLOAT32BE                         PA_SAMPLE_FLOAT32BE
-#define MA_PA_SAMPLE_S32LE                             PA_SAMPLE_S32LE
-#define MA_PA_SAMPLE_S32BE                             PA_SAMPLE_S32BE
-#define MA_PA_SAMPLE_S24LE                             PA_SAMPLE_S24LE
-#define MA_PA_SAMPLE_S24BE                             PA_SAMPLE_S24BE
-#define MA_PA_SAMPLE_S24_32LE                          PA_SAMPLE_S24_32LE
-#define MA_PA_SAMPLE_S24_32BE                          PA_SAMPLE_S24_32BE
-
-typedef pa_mainloop             ma_pa_mainloop;
-typedef pa_threaded_mainloop    ma_pa_threaded_mainloop;
-typedef pa_mainloop_api         ma_pa_mainloop_api;
-typedef pa_context              ma_pa_context;
-typedef pa_operation            ma_pa_operation;
-typedef pa_stream               ma_pa_stream;
-typedef pa_spawn_api            ma_pa_spawn_api;
-typedef pa_buffer_attr          ma_pa_buffer_attr;
-typedef pa_channel_map          ma_pa_channel_map;
-typedef pa_cvolume              ma_pa_cvolume;
-typedef pa_sample_spec          ma_pa_sample_spec;
-typedef pa_sink_info            ma_pa_sink_info;
-typedef pa_source_info          ma_pa_source_info;
-
-typedef pa_context_notify_cb_t  ma_pa_context_notify_cb_t;
-typedef pa_sink_info_cb_t       ma_pa_sink_info_cb_t;
-typedef pa_source_info_cb_t     ma_pa_source_info_cb_t;
-typedef pa_stream_success_cb_t  ma_pa_stream_success_cb_t;
-typedef pa_stream_request_cb_t  ma_pa_stream_request_cb_t;
-typedef pa_stream_notify_cb_t   ma_pa_stream_notify_cb_t;
-typedef pa_free_cb_t            ma_pa_free_cb_t;
-#else
-#define MA_PA_OK                                       0
-#define MA_PA_ERR_ACCESS                               1
-#define MA_PA_ERR_INVALID                              2
-#define MA_PA_ERR_NOENTITY                             5
-#define MA_PA_ERR_NOTSUPPORTED                         19
-
-#define MA_PA_CHANNELS_MAX                             32
-#define MA_PA_RATE_MAX                                 384000
-
-typedef int ma_pa_context_flags_t;
-#define MA_PA_CONTEXT_NOFLAGS                          0x00000000
-#define MA_PA_CONTEXT_NOAUTOSPAWN                      0x00000001
-#define MA_PA_CONTEXT_NOFAIL                           0x00000002
-
-typedef int ma_pa_stream_flags_t;
-#define MA_PA_STREAM_NOFLAGS                           0x00000000
-#define MA_PA_STREAM_START_CORKED                      0x00000001
-#define MA_PA_STREAM_INTERPOLATE_TIMING                0x00000002
-#define MA_PA_STREAM_NOT_MONOTONIC                     0x00000004
-#define MA_PA_STREAM_AUTO_TIMING_UPDATE                0x00000008
-#define MA_PA_STREAM_NO_REMAP_CHANNELS                 0x00000010
-#define MA_PA_STREAM_NO_REMIX_CHANNELS                 0x00000020
-#define MA_PA_STREAM_FIX_FORMAT                        0x00000040
-#define MA_PA_STREAM_FIX_RATE                          0x00000080
-#define MA_PA_STREAM_FIX_CHANNELS                      0x00000100
-#define MA_PA_STREAM_DONT_MOVE                         0x00000200
-#define MA_PA_STREAM_VARIABLE_RATE                     0x00000400
-#define MA_PA_STREAM_PEAK_DETECT                       0x00000800
-#define MA_PA_STREAM_START_MUTED                       0x00001000
-#define MA_PA_STREAM_ADJUST_LATENCY                    0x00002000
-#define MA_PA_STREAM_EARLY_REQUESTS                    0x00004000
-#define MA_PA_STREAM_DONT_INHIBIT_AUTO_SUSPEND         0x00008000
-#define MA_PA_STREAM_START_UNMUTED                     0x00010000
-#define MA_PA_STREAM_FAIL_ON_SUSPEND                   0x00020000
-#define MA_PA_STREAM_RELATIVE_VOLUME                   0x00040000
-#define MA_PA_STREAM_PASSTHROUGH                       0x00080000
-
-typedef int ma_pa_sink_flags_t;
-#define MA_PA_SINK_NOFLAGS                             0x00000000
-#define MA_PA_SINK_HW_VOLUME_CTRL                      0x00000001
-#define MA_PA_SINK_LATENCY                             0x00000002
-#define MA_PA_SINK_HARDWARE                            0x00000004
-#define MA_PA_SINK_NETWORK                             0x00000008
-#define MA_PA_SINK_HW_MUTE_CTRL                        0x00000010
-#define MA_PA_SINK_DECIBEL_VOLUME                      0x00000020
-#define MA_PA_SINK_FLAT_VOLUME                         0x00000040
-#define MA_PA_SINK_DYNAMIC_LATENCY                     0x00000080
-#define MA_PA_SINK_SET_FORMATS                         0x00000100
-
-typedef int ma_pa_source_flags_t;
-#define MA_PA_SOURCE_NOFLAGS                           0x00000000
-#define MA_PA_SOURCE_HW_VOLUME_CTRL                    0x00000001
-#define MA_PA_SOURCE_LATENCY                           0x00000002
-#define MA_PA_SOURCE_HARDWARE                          0x00000004
-#define MA_PA_SOURCE_NETWORK                           0x00000008
-#define MA_PA_SOURCE_HW_MUTE_CTRL                      0x00000010
-#define MA_PA_SOURCE_DECIBEL_VOLUME                    0x00000020
-#define MA_PA_SOURCE_DYNAMIC_LATENCY                   0x00000040
-#define MA_PA_SOURCE_FLAT_VOLUME                       0x00000080
-
-typedef int ma_pa_context_state_t;
-#define MA_PA_CONTEXT_UNCONNECTED                      0
-#define MA_PA_CONTEXT_CONNECTING                       1
-#define MA_PA_CONTEXT_AUTHORIZING                      2
-#define MA_PA_CONTEXT_SETTING_NAME                     3
-#define MA_PA_CONTEXT_READY                            4
-#define MA_PA_CONTEXT_FAILED                           5
-#define MA_PA_CONTEXT_TERMINATED                       6
-
-typedef int ma_pa_stream_state_t;
-#define MA_PA_STREAM_UNCONNECTED                       0
-#define MA_PA_STREAM_CREATING                          1
-#define MA_PA_STREAM_READY                             2
-#define MA_PA_STREAM_FAILED                            3
-#define MA_PA_STREAM_TERMINATED                        4
-
-typedef int ma_pa_operation_state_t;
-#define MA_PA_OPERATION_RUNNING                        0
-#define MA_PA_OPERATION_DONE                           1
-#define MA_PA_OPERATION_CANCELLED                      2
-
-typedef int ma_pa_sink_state_t;
-#define MA_PA_SINK_INVALID_STATE                       -1
-#define MA_PA_SINK_RUNNING                             0
-#define MA_PA_SINK_IDLE                                1
-#define MA_PA_SINK_SUSPENDED                           2
-
-typedef int ma_pa_source_state_t;
-#define MA_PA_SOURCE_INVALID_STATE                     -1
-#define MA_PA_SOURCE_RUNNING                           0
-#define MA_PA_SOURCE_IDLE                              1
-#define MA_PA_SOURCE_SUSPENDED                         2
-
-typedef int ma_pa_seek_mode_t;
-#define MA_PA_SEEK_RELATIVE                            0
-#define MA_PA_SEEK_ABSOLUTE                            1
-#define MA_PA_SEEK_RELATIVE_ON_READ                    2
-#define MA_PA_SEEK_RELATIVE_END                        3
-
-typedef int ma_pa_channel_position_t;
-#define MA_PA_CHANNEL_POSITION_INVALID                 -1
-#define MA_PA_CHANNEL_POSITION_MONO                    0
-#define MA_PA_CHANNEL_POSITION_FRONT_LEFT              1
-#define MA_PA_CHANNEL_POSITION_FRONT_RIGHT             2
-#define MA_PA_CHANNEL_POSITION_FRONT_CENTER            3
-#define MA_PA_CHANNEL_POSITION_REAR_CENTER             4
-#define MA_PA_CHANNEL_POSITION_REAR_LEFT               5
-#define MA_PA_CHANNEL_POSITION_REAR_RIGHT              6
-#define MA_PA_CHANNEL_POSITION_LFE                     7
-#define MA_PA_CHANNEL_POSITION_FRONT_LEFT_OF_CENTER    8
-#define MA_PA_CHANNEL_POSITION_FRONT_RIGHT_OF_CENTER   9
-#define MA_PA_CHANNEL_POSITION_SIDE_LEFT               10
-#define MA_PA_CHANNEL_POSITION_SIDE_RIGHT              11
-#define MA_PA_CHANNEL_POSITION_AUX0                    12
-#define MA_PA_CHANNEL_POSITION_AUX1                    13
-#define MA_PA_CHANNEL_POSITION_AUX2                    14
-#define MA_PA_CHANNEL_POSITION_AUX3                    15
-#define MA_PA_CHANNEL_POSITION_AUX4                    16
-#define MA_PA_CHANNEL_POSITION_AUX5                    17
-#define MA_PA_CHANNEL_POSITION_AUX6                    18
-#define MA_PA_CHANNEL_POSITION_AUX7                    19
-#define MA_PA_CHANNEL_POSITION_AUX8                    20
-#define MA_PA_CHANNEL_POSITION_AUX9                    21
-#define MA_PA_CHANNEL_POSITION_AUX10                   22
-#define MA_PA_CHANNEL_POSITION_AUX11                   23
-#define MA_PA_CHANNEL_POSITION_AUX12                   24
-#define MA_PA_CHANNEL_POSITION_AUX13                   25
-#define MA_PA_CHANNEL_POSITION_AUX14                   26
-#define MA_PA_CHANNEL_POSITION_AUX15                   27
-#define MA_PA_CHANNEL_POSITION_AUX16                   28
-#define MA_PA_CHANNEL_POSITION_AUX17                   29
-#define MA_PA_CHANNEL_POSITION_AUX18                   30
-#define MA_PA_CHANNEL_POSITION_AUX19                   31
-#define MA_PA_CHANNEL_POSITION_AUX20                   32
-#define MA_PA_CHANNEL_POSITION_AUX21                   33
-#define MA_PA_CHANNEL_POSITION_AUX22                   34
-#define MA_PA_CHANNEL_POSITION_AUX23                   35
-#define MA_PA_CHANNEL_POSITION_AUX24                   36
-#define MA_PA_CHANNEL_POSITION_AUX25                   37
-#define MA_PA_CHANNEL_POSITION_AUX26                   38
-#define MA_PA_CHANNEL_POSITION_AUX27                   39
-#define MA_PA_CHANNEL_POSITION_AUX28                   40
-#define MA_PA_CHANNEL_POSITION_AUX29                   41
-#define MA_PA_CHANNEL_POSITION_AUX30                   42
-#define MA_PA_CHANNEL_POSITION_AUX31                   43
-#define MA_PA_CHANNEL_POSITION_TOP_CENTER              44
-#define MA_PA_CHANNEL_POSITION_TOP_FRONT_LEFT          45
-#define MA_PA_CHANNEL_POSITION_TOP_FRONT_RIGHT         46
-#define MA_PA_CHANNEL_POSITION_TOP_FRONT_CENTER        47
-#define MA_PA_CHANNEL_POSITION_TOP_REAR_LEFT           48
-#define MA_PA_CHANNEL_POSITION_TOP_REAR_RIGHT          49
-#define MA_PA_CHANNEL_POSITION_TOP_REAR_CENTER         50
-#define MA_PA_CHANNEL_POSITION_LEFT                    MA_PA_CHANNEL_POSITION_FRONT_LEFT
-#define MA_PA_CHANNEL_POSITION_RIGHT                   MA_PA_CHANNEL_POSITION_FRONT_RIGHT
-#define MA_PA_CHANNEL_POSITION_CENTER                  MA_PA_CHANNEL_POSITION_FRONT_CENTER
-#define MA_PA_CHANNEL_POSITION_SUBWOOFER               MA_PA_CHANNEL_POSITION_LFE
-
-typedef int ma_pa_channel_map_def_t;
-#define MA_PA_CHANNEL_MAP_AIFF                         0
-#define MA_PA_CHANNEL_MAP_ALSA                         1
-#define MA_PA_CHANNEL_MAP_AUX                          2
-#define MA_PA_CHANNEL_MAP_WAVEEX                       3
-#define MA_PA_CHANNEL_MAP_OSS                          4
-#define MA_PA_CHANNEL_MAP_DEFAULT                      MA_PA_CHANNEL_MAP_AIFF
-
-typedef int ma_pa_sample_format_t;
-#define MA_PA_SAMPLE_INVALID                           -1
-#define MA_PA_SAMPLE_U8                                0
-#define MA_PA_SAMPLE_ALAW                              1
-#define MA_PA_SAMPLE_ULAW                              2
-#define MA_PA_SAMPLE_S16LE                             3
-#define MA_PA_SAMPLE_S16BE                             4
-#define MA_PA_SAMPLE_FLOAT32LE                         5
-#define MA_PA_SAMPLE_FLOAT32BE                         6
-#define MA_PA_SAMPLE_S32LE                             7
-#define MA_PA_SAMPLE_S32BE                             8
-#define MA_PA_SAMPLE_S24LE                             9
-#define MA_PA_SAMPLE_S24BE                             10
-#define MA_PA_SAMPLE_S24_32LE                          11
-#define MA_PA_SAMPLE_S24_32BE                          12
-
-typedef struct ma_pa_mainloop           ma_pa_mainloop;
-typedef struct ma_pa_threaded_mainloop  ma_pa_threaded_mainloop;
-typedef struct ma_pa_mainloop_api       ma_pa_mainloop_api;
-typedef struct ma_pa_context            ma_pa_context;
-typedef struct ma_pa_operation          ma_pa_operation;
-typedef struct ma_pa_stream             ma_pa_stream;
-typedef struct ma_pa_spawn_api          ma_pa_spawn_api;
-
-typedef struct
-{
-    ma_uint32 maxlength;
-    ma_uint32 tlength;
-    ma_uint32 prebuf;
-    ma_uint32 minreq;
-    ma_uint32 fragsize;
-} ma_pa_buffer_attr;
-
-typedef struct
-{
-    ma_uint8 channels;
-    ma_pa_channel_position_t map[MA_PA_CHANNELS_MAX];
-} ma_pa_channel_map;
-
-typedef struct
-{
-    ma_uint8 channels;
-    ma_uint32 values[MA_PA_CHANNELS_MAX];
-} ma_pa_cvolume;
-
-typedef struct
-{
-    ma_pa_sample_format_t format;
-    ma_uint32 rate;
-    ma_uint8 channels;
-} ma_pa_sample_spec;
-
-typedef struct
-{
-    const char* name;
-    ma_uint32 index;
-    const char* description;
-    ma_pa_sample_spec sample_spec;
-    ma_pa_channel_map channel_map;
-    ma_uint32 owner_module;
-    ma_pa_cvolume volume;
-    int mute;
-    ma_uint32 monitor_source;
-    const char* monitor_source_name;
-    ma_uint64 latency;
-    const char* driver;
-    ma_pa_sink_flags_t flags;
-    void* proplist;
-    ma_uint64 configured_latency;
-    ma_uint32 base_volume;
-    ma_pa_sink_state_t state;
-    ma_uint32 n_volume_steps;
-    ma_uint32 card;
-    ma_uint32 n_ports;
-    void** ports;
-    void* active_port;
-    ma_uint8 n_formats;
-    void** formats;
-} ma_pa_sink_info;
-
-typedef struct
-{
-    const char *name;
-    ma_uint32 index;
-    const char *description;
-    ma_pa_sample_spec sample_spec;
-    ma_pa_channel_map channel_map;
-    ma_uint32 owner_module;
-    ma_pa_cvolume volume;
-    int mute;
-    ma_uint32 monitor_of_sink;
-    const char *monitor_of_sink_name;
-    ma_uint64 latency;
-    const char *driver;
-    ma_pa_source_flags_t flags;
-    void* proplist;
-    ma_uint64 configured_latency;
-    ma_uint32 base_volume;
-    ma_pa_source_state_t state;
-    ma_uint32 n_volume_steps;
-    ma_uint32 card;
-    ma_uint32 n_ports;
-    void** ports;
-    void* active_port;
-    ma_uint8 n_formats;
-    void** formats;
-} ma_pa_source_info;
-
-typedef void (* ma_pa_context_notify_cb_t)(ma_pa_context* c, void* userdata);
-typedef void (* ma_pa_sink_info_cb_t)     (ma_pa_context* c, const ma_pa_sink_info* i, int eol, void* userdata);
-typedef void (* ma_pa_source_info_cb_t)   (ma_pa_context* c, const ma_pa_source_info* i, int eol, void* userdata);
-typedef void (* ma_pa_stream_success_cb_t)(ma_pa_stream* s, int success, void* userdata);
-typedef void (* ma_pa_stream_request_cb_t)(ma_pa_stream* s, size_t nbytes, void* userdata);
-typedef void (* ma_pa_stream_notify_cb_t) (ma_pa_stream* s, void* userdata);
-typedef void (* ma_pa_free_cb_t)          (void* p);
-#endif
-
-
-typedef ma_pa_mainloop*          (* ma_pa_mainloop_new_proc)                   (void);
-typedef void                     (* ma_pa_mainloop_free_proc)                  (ma_pa_mainloop* m);
-typedef void                     (* ma_pa_mainloop_quit_proc)                  (ma_pa_mainloop* m, int retval);
-typedef ma_pa_mainloop_api*      (* ma_pa_mainloop_get_api_proc)               (ma_pa_mainloop* m);
-typedef int                      (* ma_pa_mainloop_iterate_proc)               (ma_pa_mainloop* m, int block, int* retval);
-typedef void                     (* ma_pa_mainloop_wakeup_proc)                (ma_pa_mainloop* m);
-typedef ma_pa_threaded_mainloop* (* ma_pa_threaded_mainloop_new_proc)          (void);
-typedef void                     (* ma_pa_threaded_mainloop_free_proc)         (ma_pa_threaded_mainloop* m);
-typedef int                      (* ma_pa_threaded_mainloop_start_proc)        (ma_pa_threaded_mainloop* m);
-typedef void                     (* ma_pa_threaded_mainloop_stop_proc)         (ma_pa_threaded_mainloop* m);
-typedef void                     (* ma_pa_threaded_mainloop_lock_proc)         (ma_pa_threaded_mainloop* m);
-typedef void                     (* ma_pa_threaded_mainloop_unlock_proc)       (ma_pa_threaded_mainloop* m);
-typedef void                     (* ma_pa_threaded_mainloop_wait_proc)         (ma_pa_threaded_mainloop* m);
-typedef void                     (* ma_pa_threaded_mainloop_signal_proc)       (ma_pa_threaded_mainloop* m, int wait_for_accept);
-typedef void                     (* ma_pa_threaded_mainloop_accept_proc)       (ma_pa_threaded_mainloop* m);
-typedef int                      (* ma_pa_threaded_mainloop_get_retval_proc)   (ma_pa_threaded_mainloop* m);
-typedef ma_pa_mainloop_api*      (* ma_pa_threaded_mainloop_get_api_proc)      (ma_pa_threaded_mainloop* m);
-typedef int                      (* ma_pa_threaded_mainloop_in_thread_proc)    (ma_pa_threaded_mainloop* m);
-typedef void                     (* ma_pa_threaded_mainloop_set_name_proc)     (ma_pa_threaded_mainloop* m, const char* name);
-typedef ma_pa_context*           (* ma_pa_context_new_proc)                    (ma_pa_mainloop_api* mainloop, const char* name);
-typedef void                     (* ma_pa_context_unref_proc)                  (ma_pa_context* c);
-typedef int                      (* ma_pa_context_connect_proc)                (ma_pa_context* c, const char* server, ma_pa_context_flags_t flags, const ma_pa_spawn_api* api);
-typedef void                     (* ma_pa_context_disconnect_proc)             (ma_pa_context* c);
-typedef void                     (* ma_pa_context_set_state_callback_proc)     (ma_pa_context* c, ma_pa_context_notify_cb_t cb, void* userdata);
-typedef ma_pa_context_state_t    (* ma_pa_context_get_state_proc)              (ma_pa_context* c);
-typedef ma_pa_operation*         (* ma_pa_context_get_sink_info_list_proc)     (ma_pa_context* c, ma_pa_sink_info_cb_t cb, void* userdata);
-typedef ma_pa_operation*         (* ma_pa_context_get_source_info_list_proc)   (ma_pa_context* c, ma_pa_source_info_cb_t cb, void* userdata);
-typedef ma_pa_operation*         (* ma_pa_context_get_sink_info_by_name_proc)  (ma_pa_context* c, const char* name, ma_pa_sink_info_cb_t cb, void* userdata);
-typedef ma_pa_operation*         (* ma_pa_context_get_source_info_by_name_proc)(ma_pa_context* c, const char* name, ma_pa_source_info_cb_t cb, void* userdata);
-typedef void                     (* ma_pa_operation_unref_proc)                (ma_pa_operation* o);
-typedef ma_pa_operation_state_t  (* ma_pa_operation_get_state_proc)            (ma_pa_operation* o);
-typedef ma_pa_channel_map*       (* ma_pa_channel_map_init_extend_proc)        (ma_pa_channel_map* m, unsigned channels, ma_pa_channel_map_def_t def);
-typedef int                      (* ma_pa_channel_map_valid_proc)              (const ma_pa_channel_map* m);
-typedef int                      (* ma_pa_channel_map_compatible_proc)         (const ma_pa_channel_map* m, const ma_pa_sample_spec* ss);
-typedef ma_pa_stream*            (* ma_pa_stream_new_proc)                     (ma_pa_context* c, const char* name, const ma_pa_sample_spec* ss, const ma_pa_channel_map* map);
-typedef void                     (* ma_pa_stream_unref_proc)                   (ma_pa_stream* s);
-typedef int                      (* ma_pa_stream_connect_playback_proc)        (ma_pa_stream* s, const char* dev, const ma_pa_buffer_attr* attr, ma_pa_stream_flags_t flags, const ma_pa_cvolume* volume, ma_pa_stream* sync_stream);
-typedef int                      (* ma_pa_stream_connect_record_proc)          (ma_pa_stream* s, const char* dev, const ma_pa_buffer_attr* attr, ma_pa_stream_flags_t flags);
-typedef int                      (* ma_pa_stream_disconnect_proc)              (ma_pa_stream* s);
-typedef ma_pa_stream_state_t     (* ma_pa_stream_get_state_proc)               (ma_pa_stream* s);
-typedef const ma_pa_sample_spec* (* ma_pa_stream_get_sample_spec_proc)         (ma_pa_stream* s);
-typedef const ma_pa_channel_map* (* ma_pa_stream_get_channel_map_proc)         (ma_pa_stream* s);
-typedef const ma_pa_buffer_attr* (* ma_pa_stream_get_buffer_attr_proc)         (ma_pa_stream* s);
-typedef ma_pa_operation*         (* ma_pa_stream_set_buffer_attr_proc)         (ma_pa_stream* s, const ma_pa_buffer_attr* attr, ma_pa_stream_success_cb_t cb, void* userdata);
-typedef const char*              (* ma_pa_stream_get_device_name_proc)         (ma_pa_stream* s);
-typedef void                     (* ma_pa_stream_set_write_callback_proc)      (ma_pa_stream* s, ma_pa_stream_request_cb_t cb, void* userdata);
-typedef void                     (* ma_pa_stream_set_read_callback_proc)       (ma_pa_stream* s, ma_pa_stream_request_cb_t cb, void* userdata);
-typedef void                     (* ma_pa_stream_set_suspended_callback_proc)  (ma_pa_stream* s, ma_pa_stream_notify_cb_t cb, void* userdata);
-typedef void                     (* ma_pa_stream_set_moved_callback_proc)      (ma_pa_stream* s, ma_pa_stream_notify_cb_t cb, void* userdata);
-typedef int                      (* ma_pa_stream_is_suspended_proc)            (const ma_pa_stream* s);
-typedef ma_pa_operation*         (* ma_pa_stream_flush_proc)                   (ma_pa_stream* s, ma_pa_stream_success_cb_t cb, void* userdata);
-typedef ma_pa_operation*         (* ma_pa_stream_drain_proc)                   (ma_pa_stream* s, ma_pa_stream_success_cb_t cb, void* userdata);
-typedef int                      (* ma_pa_stream_is_corked_proc)               (ma_pa_stream* s);
-typedef ma_pa_operation*         (* ma_pa_stream_cork_proc)                    (ma_pa_stream* s, int b, ma_pa_stream_success_cb_t cb, void* userdata);
-typedef ma_pa_operation*         (* ma_pa_stream_trigger_proc)                 (ma_pa_stream* s, ma_pa_stream_success_cb_t cb, void* userdata);
-typedef int                      (* ma_pa_stream_begin_write_proc)             (ma_pa_stream* s, void** data, size_t* nbytes);
-typedef int                      (* ma_pa_stream_write_proc)                   (ma_pa_stream* s, const void* data, size_t nbytes, ma_pa_free_cb_t free_cb, int64_t offset, ma_pa_seek_mode_t seek);
-typedef int                      (* ma_pa_stream_peek_proc)                    (ma_pa_stream* s, const void** data, size_t* nbytes);
-typedef int                      (* ma_pa_stream_drop_proc)                    (ma_pa_stream* s);
-typedef size_t                   (* ma_pa_stream_writable_size_proc)           (ma_pa_stream* s);
-typedef size_t                   (* ma_pa_stream_readable_size_proc)           (ma_pa_stream* s);
-
-typedef struct
-{
-    ma_uint32 count;
-    ma_uint32 capacity;
-    ma_device_info* pInfo;
-} ma_pulse_device_enum_data;
-
-static ma_result ma_result_from_pulse(int result)
-{
-    if (result < 0) {
-        return MA_ERROR;
-    }
-
-    switch (result) {
-        case MA_PA_OK:           return MA_SUCCESS;
-        case MA_PA_ERR_ACCESS:   return MA_ACCESS_DENIED;
-        case MA_PA_ERR_INVALID:  return MA_INVALID_ARGS;
-        case MA_PA_ERR_NOENTITY: return MA_NO_DEVICE;
-        default:                 return MA_ERROR;
-    }
-}
-
-#if 0
-static ma_pa_sample_format_t ma_format_to_pulse(ma_format format)
-{
-    if (ma_is_little_endian()) {
-        switch (format) {
-            case ma_format_s16: return MA_PA_SAMPLE_S16LE;
-            case ma_format_s24: return MA_PA_SAMPLE_S24LE;
-            case ma_format_s32: return MA_PA_SAMPLE_S32LE;
-            case ma_format_f32: return MA_PA_SAMPLE_FLOAT32LE;
-            default: break;
-        }
-    } else {
-        switch (format) {
-            case ma_format_s16: return MA_PA_SAMPLE_S16BE;
-            case ma_format_s24: return MA_PA_SAMPLE_S24BE;
-            case ma_format_s32: return MA_PA_SAMPLE_S32BE;
-            case ma_format_f32: return MA_PA_SAMPLE_FLOAT32BE;
-            default: break;
-        }
-    }
-
-    /* Endian agnostic. */
-    switch (format) {
-        case ma_format_u8: return MA_PA_SAMPLE_U8;
-        default: return MA_PA_SAMPLE_INVALID;
-    }
-}
-#endif
-
-static ma_format ma_format_from_pulse(ma_pa_sample_format_t format)
-{
-    if (ma_is_little_endian()) {
-        switch (format) {
-            case MA_PA_SAMPLE_S16LE:     return ma_format_s16;
-            case MA_PA_SAMPLE_S24LE:     return ma_format_s24;
-            case MA_PA_SAMPLE_S32LE:     return ma_format_s32;
-            case MA_PA_SAMPLE_FLOAT32LE: return ma_format_f32;
-            default: break;
-        }
-    } else {
-        switch (format) {
-            case MA_PA_SAMPLE_S16BE:     return ma_format_s16;
-            case MA_PA_SAMPLE_S24BE:     return ma_format_s24;
-            case MA_PA_SAMPLE_S32BE:     return ma_format_s32;
-            case MA_PA_SAMPLE_FLOAT32BE: return ma_format_f32;
-            default: break;
-        }
-    }
-
-    /* Endian agnostic. */
-    switch (format) {
-        case MA_PA_SAMPLE_U8: return ma_format_u8;
-        default: return ma_format_unknown;
-    }
-}
-
-static ma_channel ma_channel_position_from_pulse(ma_pa_channel_position_t position)
-{
-    switch (position)
-    {
-        case MA_PA_CHANNEL_POSITION_INVALID:               return MA_CHANNEL_NONE;
-        case MA_PA_CHANNEL_POSITION_MONO:                  return MA_CHANNEL_MONO;
-        case MA_PA_CHANNEL_POSITION_FRONT_LEFT:            return MA_CHANNEL_FRONT_LEFT;
-        case MA_PA_CHANNEL_POSITION_FRONT_RIGHT:           return MA_CHANNEL_FRONT_RIGHT;
-        case MA_PA_CHANNEL_POSITION_FRONT_CENTER:          return MA_CHANNEL_FRONT_CENTER;
-        case MA_PA_CHANNEL_POSITION_REAR_CENTER:           return MA_CHANNEL_BACK_CENTER;
-        case MA_PA_CHANNEL_POSITION_REAR_LEFT:             return MA_CHANNEL_BACK_LEFT;
-        case MA_PA_CHANNEL_POSITION_REAR_RIGHT:            return MA_CHANNEL_BACK_RIGHT;
-        case MA_PA_CHANNEL_POSITION_LFE:                   return MA_CHANNEL_LFE;
-        case MA_PA_CHANNEL_POSITION_FRONT_LEFT_OF_CENTER:  return MA_CHANNEL_FRONT_LEFT_CENTER;
-        case MA_PA_CHANNEL_POSITION_FRONT_RIGHT_OF_CENTER: return MA_CHANNEL_FRONT_RIGHT_CENTER;
-        case MA_PA_CHANNEL_POSITION_SIDE_LEFT:             return MA_CHANNEL_SIDE_LEFT;
-        case MA_PA_CHANNEL_POSITION_SIDE_RIGHT:            return MA_CHANNEL_SIDE_RIGHT;
-        case MA_PA_CHANNEL_POSITION_AUX0:                  return MA_CHANNEL_AUX_0;
-        case MA_PA_CHANNEL_POSITION_AUX1:                  return MA_CHANNEL_AUX_1;
-        case MA_PA_CHANNEL_POSITION_AUX2:                  return MA_CHANNEL_AUX_2;
-        case MA_PA_CHANNEL_POSITION_AUX3:                  return MA_CHANNEL_AUX_3;
-        case MA_PA_CHANNEL_POSITION_AUX4:                  return MA_CHANNEL_AUX_4;
-        case MA_PA_CHANNEL_POSITION_AUX5:                  return MA_CHANNEL_AUX_5;
-        case MA_PA_CHANNEL_POSITION_AUX6:                  return MA_CHANNEL_AUX_6;
-        case MA_PA_CHANNEL_POSITION_AUX7:                  return MA_CHANNEL_AUX_7;
-        case MA_PA_CHANNEL_POSITION_AUX8:                  return MA_CHANNEL_AUX_8;
-        case MA_PA_CHANNEL_POSITION_AUX9:                  return MA_CHANNEL_AUX_9;
-        case MA_PA_CHANNEL_POSITION_AUX10:                 return MA_CHANNEL_AUX_10;
-        case MA_PA_CHANNEL_POSITION_AUX11:                 return MA_CHANNEL_AUX_11;
-        case MA_PA_CHANNEL_POSITION_AUX12:                 return MA_CHANNEL_AUX_12;
-        case MA_PA_CHANNEL_POSITION_AUX13:                 return MA_CHANNEL_AUX_13;
-        case MA_PA_CHANNEL_POSITION_AUX14:                 return MA_CHANNEL_AUX_14;
-        case MA_PA_CHANNEL_POSITION_AUX15:                 return MA_CHANNEL_AUX_15;
-        case MA_PA_CHANNEL_POSITION_AUX16:                 return MA_CHANNEL_AUX_16;
-        case MA_PA_CHANNEL_POSITION_AUX17:                 return MA_CHANNEL_AUX_17;
-        case MA_PA_CHANNEL_POSITION_AUX18:                 return MA_CHANNEL_AUX_18;
-        case MA_PA_CHANNEL_POSITION_AUX19:                 return MA_CHANNEL_AUX_19;
-        case MA_PA_CHANNEL_POSITION_AUX20:                 return MA_CHANNEL_AUX_20;
-        case MA_PA_CHANNEL_POSITION_AUX21:                 return MA_CHANNEL_AUX_21;
-        case MA_PA_CHANNEL_POSITION_AUX22:                 return MA_CHANNEL_AUX_22;
-        case MA_PA_CHANNEL_POSITION_AUX23:                 return MA_CHANNEL_AUX_23;
-        case MA_PA_CHANNEL_POSITION_AUX24:                 return MA_CHANNEL_AUX_24;
-        case MA_PA_CHANNEL_POSITION_AUX25:                 return MA_CHANNEL_AUX_25;
-        case MA_PA_CHANNEL_POSITION_AUX26:                 return MA_CHANNEL_AUX_26;
-        case MA_PA_CHANNEL_POSITION_AUX27:                 return MA_CHANNEL_AUX_27;
-        case MA_PA_CHANNEL_POSITION_AUX28:                 return MA_CHANNEL_AUX_28;
-        case MA_PA_CHANNEL_POSITION_AUX29:                 return MA_CHANNEL_AUX_29;
-        case MA_PA_CHANNEL_POSITION_AUX30:                 return MA_CHANNEL_AUX_30;
-        case MA_PA_CHANNEL_POSITION_AUX31:                 return MA_CHANNEL_AUX_31;
-        case MA_PA_CHANNEL_POSITION_TOP_CENTER:            return MA_CHANNEL_TOP_CENTER;
-        case MA_PA_CHANNEL_POSITION_TOP_FRONT_LEFT:        return MA_CHANNEL_TOP_FRONT_LEFT;
-        case MA_PA_CHANNEL_POSITION_TOP_FRONT_RIGHT:       return MA_CHANNEL_TOP_FRONT_RIGHT;
-        case MA_PA_CHANNEL_POSITION_TOP_FRONT_CENTER:      return MA_CHANNEL_TOP_FRONT_CENTER;
-        case MA_PA_CHANNEL_POSITION_TOP_REAR_LEFT:         return MA_CHANNEL_TOP_BACK_LEFT;
-        case MA_PA_CHANNEL_POSITION_TOP_REAR_RIGHT:        return MA_CHANNEL_TOP_BACK_RIGHT;
-        case MA_PA_CHANNEL_POSITION_TOP_REAR_CENTER:       return MA_CHANNEL_TOP_BACK_CENTER;
-        default: return MA_CHANNEL_NONE;
-    }
-}
-
-#if 0
-static ma_pa_channel_position_t ma_channel_position_to_pulse(ma_channel position)
-{
-    switch (position)
-    {
-        case MA_CHANNEL_NONE:               return MA_PA_CHANNEL_POSITION_INVALID;
-        case MA_CHANNEL_FRONT_LEFT:         return MA_PA_CHANNEL_POSITION_FRONT_LEFT;
-        case MA_CHANNEL_FRONT_RIGHT:        return MA_PA_CHANNEL_POSITION_FRONT_RIGHT;
-        case MA_CHANNEL_FRONT_CENTER:       return MA_PA_CHANNEL_POSITION_FRONT_CENTER;
-        case MA_CHANNEL_LFE:                return MA_PA_CHANNEL_POSITION_LFE;
-        case MA_CHANNEL_BACK_LEFT:          return MA_PA_CHANNEL_POSITION_REAR_LEFT;
-        case MA_CHANNEL_BACK_RIGHT:         return MA_PA_CHANNEL_POSITION_REAR_RIGHT;
-        case MA_CHANNEL_FRONT_LEFT_CENTER:  return MA_PA_CHANNEL_POSITION_FRONT_LEFT_OF_CENTER;
-        case MA_CHANNEL_FRONT_RIGHT_CENTER: return MA_PA_CHANNEL_POSITION_FRONT_RIGHT_OF_CENTER;
-        case MA_CHANNEL_BACK_CENTER:        return MA_PA_CHANNEL_POSITION_REAR_CENTER;
-        case MA_CHANNEL_SIDE_LEFT:          return MA_PA_CHANNEL_POSITION_SIDE_LEFT;
-        case MA_CHANNEL_SIDE_RIGHT:         return MA_PA_CHANNEL_POSITION_SIDE_RIGHT;
-        case MA_CHANNEL_TOP_CENTER:         return MA_PA_CHANNEL_POSITION_TOP_CENTER;
-        case MA_CHANNEL_TOP_FRONT_LEFT:     return MA_PA_CHANNEL_POSITION_TOP_FRONT_LEFT;
-        case MA_CHANNEL_TOP_FRONT_CENTER:   return MA_PA_CHANNEL_POSITION_TOP_FRONT_CENTER;
-        case MA_CHANNEL_TOP_FRONT_RIGHT:    return MA_PA_CHANNEL_POSITION_TOP_FRONT_RIGHT;
-        case MA_CHANNEL_TOP_BACK_LEFT:      return MA_PA_CHANNEL_POSITION_TOP_REAR_LEFT;
-        case MA_CHANNEL_TOP_BACK_CENTER:    return MA_PA_CHANNEL_POSITION_TOP_REAR_CENTER;
-        case MA_CHANNEL_TOP_BACK_RIGHT:     return MA_PA_CHANNEL_POSITION_TOP_REAR_RIGHT;
-        case MA_CHANNEL_19:                 return MA_PA_CHANNEL_POSITION_AUX18;
-        case MA_CHANNEL_20:                 return MA_PA_CHANNEL_POSITION_AUX19;
-        case MA_CHANNEL_21:                 return MA_PA_CHANNEL_POSITION_AUX20;
-        case MA_CHANNEL_22:                 return MA_PA_CHANNEL_POSITION_AUX21;
-        case MA_CHANNEL_23:                 return MA_PA_CHANNEL_POSITION_AUX22;
-        case MA_CHANNEL_24:                 return MA_PA_CHANNEL_POSITION_AUX23;
-        case MA_CHANNEL_25:                 return MA_PA_CHANNEL_POSITION_AUX24;
-        case MA_CHANNEL_26:                 return MA_PA_CHANNEL_POSITION_AUX25;
-        case MA_CHANNEL_27:                 return MA_PA_CHANNEL_POSITION_AUX26;
-        case MA_CHANNEL_28:                 return MA_PA_CHANNEL_POSITION_AUX27;
-        case MA_CHANNEL_29:                 return MA_PA_CHANNEL_POSITION_AUX28;
-        case MA_CHANNEL_30:                 return MA_PA_CHANNEL_POSITION_AUX29;
-        case MA_CHANNEL_31:                 return MA_PA_CHANNEL_POSITION_AUX30;
-        case MA_CHANNEL_32:                 return MA_PA_CHANNEL_POSITION_AUX31;
-        default: return (ma_pa_channel_position_t)position;
-    }
-}
-#endif
-
-static ma_result ma_wait_for_operation__pulse(ma_context* pContext, ma_ptr pMainLoop, ma_pa_operation* pOP)
-{
-    int resultPA;
-    ma_pa_operation_state_t state;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pOP != NULL);
-
-    for (;;) {
-        state = ((ma_pa_operation_get_state_proc)pContext->pulse.pa_operation_get_state)(pOP);
-        if (state != MA_PA_OPERATION_RUNNING) {
-            break;  /* Done. */
-        }
-
-        resultPA = ((ma_pa_mainloop_iterate_proc)pContext->pulse.pa_mainloop_iterate)((ma_pa_mainloop*)pMainLoop, 1, NULL);
-        if (resultPA < 0) {
-            return ma_result_from_pulse(resultPA);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_wait_for_operation_and_unref__pulse(ma_context* pContext, ma_ptr pMainLoop, ma_pa_operation* pOP)
-{
-    ma_result result;
-
-    if (pOP == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_wait_for_operation__pulse(pContext, pMainLoop, pOP);
-    ((ma_pa_operation_unref_proc)pContext->pulse.pa_operation_unref)(pOP);
-
-    return result;
-}
-
-static ma_result ma_wait_for_pa_context_to_connect__pulse(ma_context* pContext, ma_ptr pMainLoop, ma_ptr pPulseContext)
-{
-    int resultPA;
-    ma_pa_context_state_t state;
-
-    for (;;) {
-        state = ((ma_pa_context_get_state_proc)pContext->pulse.pa_context_get_state)((ma_pa_context*)pPulseContext);
-        if (state == MA_PA_CONTEXT_READY) {
-            break;  /* Done. */
-        }
-
-        if (state == MA_PA_CONTEXT_FAILED || state == MA_PA_CONTEXT_TERMINATED) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[PulseAudio] An error occurred while connecting the PulseAudio context.");
-            return MA_ERROR;
-        }
-
-        resultPA = ((ma_pa_mainloop_iterate_proc)pContext->pulse.pa_mainloop_iterate)((ma_pa_mainloop*)pMainLoop, 1, NULL);
-        if (resultPA < 0) {
-            return ma_result_from_pulse(resultPA);
-        }
-    }
-
-    /* Should never get here. */
-    return MA_SUCCESS;
-}
-
-static ma_result ma_wait_for_pa_stream_to_connect__pulse(ma_context* pContext, ma_ptr pMainLoop, ma_ptr pStream)
-{
-    int resultPA;
-    ma_pa_stream_state_t state;
-
-    for (;;) {
-        state = ((ma_pa_stream_get_state_proc)pContext->pulse.pa_stream_get_state)((ma_pa_stream*)pStream);
-        if (state == MA_PA_STREAM_READY) {
-            break;  /* Done. */
-        }
-
-        if (state == MA_PA_STREAM_FAILED || state == MA_PA_STREAM_TERMINATED) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[PulseAudio] An error occurred while connecting the PulseAudio stream.");
-            return MA_ERROR;
-        }
-
-        resultPA = ((ma_pa_mainloop_iterate_proc)pContext->pulse.pa_mainloop_iterate)((ma_pa_mainloop*)pMainLoop, 1, NULL);
-        if (resultPA < 0) {
-            return ma_result_from_pulse(resultPA);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_init_pa_mainloop_and_pa_context__pulse(ma_context* pContext, const char* pApplicationName, const char* pServerName, ma_bool32 tryAutoSpawn, ma_ptr* ppMainLoop, ma_ptr* ppPulseContext)
-{
-    ma_result result;
-    ma_ptr pMainLoop;
-    ma_ptr pPulseContext;
-
-    MA_ASSERT(ppMainLoop     != NULL);
-    MA_ASSERT(ppPulseContext != NULL);
-
-    /* The PulseAudio context maps well to miniaudio's notion of a context. The pa_context object will be initialized as part of the ma_context. */
-    pMainLoop = ((ma_pa_mainloop_new_proc)pContext->pulse.pa_mainloop_new)();
-    if (pMainLoop == NULL) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to create mainloop.");
-        return MA_FAILED_TO_INIT_BACKEND;
-    }
-
-    pPulseContext = ((ma_pa_context_new_proc)pContext->pulse.pa_context_new)(((ma_pa_mainloop_get_api_proc)pContext->pulse.pa_mainloop_get_api)((ma_pa_mainloop*)pMainLoop), pApplicationName);
-    if (pPulseContext == NULL) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to create PulseAudio context.");
-        ((ma_pa_mainloop_free_proc)pContext->pulse.pa_mainloop_free)((ma_pa_mainloop*)(pMainLoop));
-        return MA_FAILED_TO_INIT_BACKEND;
-    }
-
-    /* Now we need to connect to the context. Everything is asynchronous so we need to wait for it to connect before returning. */
-    result = ma_result_from_pulse(((ma_pa_context_connect_proc)pContext->pulse.pa_context_connect)((ma_pa_context*)pPulseContext, pServerName, (tryAutoSpawn) ? 0 : MA_PA_CONTEXT_NOAUTOSPAWN, NULL));
-    if (result != MA_SUCCESS) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to connect PulseAudio context.");
-        ((ma_pa_mainloop_free_proc)pContext->pulse.pa_mainloop_free)((ma_pa_mainloop*)(pMainLoop));
-        return result;
-    }
-
-    /* Since ma_context_init() runs synchronously we need to wait for the PulseAudio context to connect before we return. */
-    result = ma_wait_for_pa_context_to_connect__pulse(pContext, pMainLoop, pPulseContext);
-    if (result != MA_SUCCESS) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[PulseAudio] Waiting for connection failed.");
-        ((ma_pa_mainloop_free_proc)pContext->pulse.pa_mainloop_free)((ma_pa_mainloop*)(pMainLoop));
-        return result;
-    }
-
-    *ppMainLoop     = pMainLoop;
-    *ppPulseContext = pPulseContext;
-
-    return MA_SUCCESS;
-}
-
-
-static void ma_device_sink_info_callback(ma_pa_context* pPulseContext, const ma_pa_sink_info* pInfo, int endOfList, void* pUserData)
-{
-    ma_pa_sink_info* pInfoOut;
-
-    if (endOfList > 0) {
-        return;
-    }
-
-    /*
-    There has been a report that indicates that pInfo can be null which results
-    in a null pointer dereference below. We'll check for this for safety.
-    */
-    if (pInfo == NULL) {
-        return;
-    }
-
-    pInfoOut = (ma_pa_sink_info*)pUserData;
-    MA_ASSERT(pInfoOut != NULL);
-
-    *pInfoOut = *pInfo;
-
-    (void)pPulseContext; /* Unused. */
-}
-
-static void ma_device_source_info_callback(ma_pa_context* pPulseContext, const ma_pa_source_info* pInfo, int endOfList, void* pUserData)
-{
-    ma_pa_source_info* pInfoOut;
-
-    if (endOfList > 0) {
-        return;
-    }
-
-    /*
-    There has been a report that indicates that pInfo can be null which results
-    in a null pointer dereference below. We'll check for this for safety.
-    */
-    if (pInfo == NULL) {
-        return;
-    }
-
-    pInfoOut = (ma_pa_source_info*)pUserData;
-    MA_ASSERT(pInfoOut != NULL);
-
-    *pInfoOut = *pInfo;
-
-    (void)pPulseContext; /* Unused. */
-}
-
-#if 0
-static void ma_device_sink_name_callback(ma_pa_context* pPulseContext, const ma_pa_sink_info* pInfo, int endOfList, void* pUserData)
-{
-    ma_device* pDevice;
-
-    if (endOfList > 0) {
-        return;
-    }
-
-    pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    ma_strncpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), pInfo->description, (size_t)-1);
-
-    (void)pPulseContext; /* Unused. */
-}
-
-static void ma_device_source_name_callback(ma_pa_context* pPulseContext, const ma_pa_source_info* pInfo, int endOfList, void* pUserData)
-{
-    ma_device* pDevice;
-
-    if (endOfList > 0) {
-        return;
-    }
-
-    pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    ma_strncpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), pInfo->description, (size_t)-1);
-
-    (void)pPulseContext; /* Unused. */
-}
-#endif
-
-static ma_result ma_context_get_sink_info__pulse(ma_context* pContext, const char* pDeviceName, ma_pa_sink_info* pSinkInfo)
-{
-    ma_pa_operation* pOP;
-
-    pOP = ((ma_pa_context_get_sink_info_by_name_proc)pContext->pulse.pa_context_get_sink_info_by_name)((ma_pa_context*)pContext->pulse.pPulseContext, pDeviceName, ma_device_sink_info_callback, pSinkInfo);
-    if (pOP == NULL) {
-        return MA_ERROR;
-    }
-
-    return ma_wait_for_operation_and_unref__pulse(pContext, pContext->pulse.pMainLoop, pOP);
-}
-
-static ma_result ma_context_get_source_info__pulse(ma_context* pContext, const char* pDeviceName, ma_pa_source_info* pSourceInfo)
-{
-    ma_pa_operation* pOP;
-
-    pOP = ((ma_pa_context_get_source_info_by_name_proc)pContext->pulse.pa_context_get_source_info_by_name)((ma_pa_context*)pContext->pulse.pPulseContext, pDeviceName, ma_device_source_info_callback, pSourceInfo);
-    if (pOP == NULL) {
-        return MA_ERROR;
-    }
-
-    return ma_wait_for_operation_and_unref__pulse(pContext, pContext->pulse.pMainLoop, pOP);
-}
-
-static ma_result ma_context_get_default_device_index__pulse(ma_context* pContext, ma_device_type deviceType, ma_uint32* pIndex)
-{
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pIndex   != NULL);
-
-    if (pIndex != NULL) {
-        *pIndex = (ma_uint32)-1;
-    }
-
-    if (deviceType == ma_device_type_playback) {
-        ma_pa_sink_info sinkInfo;
-        result = ma_context_get_sink_info__pulse(pContext, NULL, &sinkInfo);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        if (pIndex != NULL) {
-            *pIndex = sinkInfo.index;
-        }
-    }
-
-    if (deviceType == ma_device_type_capture) {
-        ma_pa_source_info sourceInfo;
-        result = ma_context_get_source_info__pulse(pContext, NULL, &sourceInfo);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        if (pIndex != NULL) {
-            *pIndex = sourceInfo.index;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-typedef struct
-{
-    ma_context* pContext;
-    ma_enum_devices_callback_proc callback;
-    void* pUserData;
-    ma_bool32 isTerminated;
-    ma_uint32 defaultDeviceIndexPlayback;
-    ma_uint32 defaultDeviceIndexCapture;
-} ma_context_enumerate_devices_callback_data__pulse;
-
-static void ma_context_enumerate_devices_sink_callback__pulse(ma_pa_context* pPulseContext, const ma_pa_sink_info* pSinkInfo, int endOfList, void* pUserData)
-{
-    ma_context_enumerate_devices_callback_data__pulse* pData = (ma_context_enumerate_devices_callback_data__pulse*)pUserData;
-    ma_device_info deviceInfo;
-
-    MA_ASSERT(pData != NULL);
-
-    if (endOfList || pData->isTerminated) {
-        return;
-    }
-
-    MA_ZERO_OBJECT(&deviceInfo);
-
-    /* The name from PulseAudio is the ID for miniaudio. */
-    if (pSinkInfo->name != NULL) {
-        ma_strncpy_s(deviceInfo.id.pulse, sizeof(deviceInfo.id.pulse), pSinkInfo->name, (size_t)-1);
-    }
-
-    /* The description from PulseAudio is the name for miniaudio. */
-    if (pSinkInfo->description != NULL) {
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), pSinkInfo->description, (size_t)-1);
-    }
-
-    if (pSinkInfo->index == pData->defaultDeviceIndexPlayback) {
-        deviceInfo.isDefault = MA_TRUE;
-    }
-
-    pData->isTerminated = !pData->callback(pData->pContext, ma_device_type_playback, &deviceInfo, pData->pUserData);
-
-    (void)pPulseContext; /* Unused. */
-}
-
-static void ma_context_enumerate_devices_source_callback__pulse(ma_pa_context* pPulseContext, const ma_pa_source_info* pSourceInfo, int endOfList, void* pUserData)
-{
-    ma_context_enumerate_devices_callback_data__pulse* pData = (ma_context_enumerate_devices_callback_data__pulse*)pUserData;
-    ma_device_info deviceInfo;
-
-    MA_ASSERT(pData != NULL);
-
-    if (endOfList || pData->isTerminated) {
-        return;
-    }
-
-    MA_ZERO_OBJECT(&deviceInfo);
-
-    /* The name from PulseAudio is the ID for miniaudio. */
-    if (pSourceInfo->name != NULL) {
-        ma_strncpy_s(deviceInfo.id.pulse, sizeof(deviceInfo.id.pulse), pSourceInfo->name, (size_t)-1);
-    }
-
-    /* The description from PulseAudio is the name for miniaudio. */
-    if (pSourceInfo->description != NULL) {
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), pSourceInfo->description, (size_t)-1);
-    }
-
-    if (pSourceInfo->index == pData->defaultDeviceIndexCapture) {
-        deviceInfo.isDefault = MA_TRUE;
-    }
-
-    pData->isTerminated = !pData->callback(pData->pContext, ma_device_type_capture, &deviceInfo, pData->pUserData);
-
-    (void)pPulseContext; /* Unused. */
-}
-
-static ma_result ma_context_enumerate_devices__pulse(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_result result = MA_SUCCESS;
-    ma_context_enumerate_devices_callback_data__pulse callbackData;
-    ma_pa_operation* pOP = NULL;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    callbackData.pContext = pContext;
-    callbackData.callback = callback;
-    callbackData.pUserData = pUserData;
-    callbackData.isTerminated = MA_FALSE;
-    callbackData.defaultDeviceIndexPlayback = (ma_uint32)-1;
-    callbackData.defaultDeviceIndexCapture  = (ma_uint32)-1;
-
-    /* We need to get the index of the default devices. */
-    ma_context_get_default_device_index__pulse(pContext, ma_device_type_playback, &callbackData.defaultDeviceIndexPlayback);
-    ma_context_get_default_device_index__pulse(pContext, ma_device_type_capture,  &callbackData.defaultDeviceIndexCapture);
-
-    /* Playback. */
-    if (!callbackData.isTerminated) {
-        pOP = ((ma_pa_context_get_sink_info_list_proc)pContext->pulse.pa_context_get_sink_info_list)((ma_pa_context*)(pContext->pulse.pPulseContext), ma_context_enumerate_devices_sink_callback__pulse, &callbackData);
-        if (pOP == NULL) {
-            result = MA_ERROR;
-            goto done;
-        }
-
-        result = ma_wait_for_operation__pulse(pContext, pContext->pulse.pMainLoop, pOP);
-        ((ma_pa_operation_unref_proc)pContext->pulse.pa_operation_unref)(pOP);
-
-        if (result != MA_SUCCESS) {
-            goto done;
-        }
-    }
-
-
-    /* Capture. */
-    if (!callbackData.isTerminated) {
-        pOP = ((ma_pa_context_get_source_info_list_proc)pContext->pulse.pa_context_get_source_info_list)((ma_pa_context*)(pContext->pulse.pPulseContext), ma_context_enumerate_devices_source_callback__pulse, &callbackData);
-        if (pOP == NULL) {
-            result = MA_ERROR;
-            goto done;
-        }
-
-        result = ma_wait_for_operation__pulse(pContext, pContext->pulse.pMainLoop, pOP);
-        ((ma_pa_operation_unref_proc)pContext->pulse.pa_operation_unref)(pOP);
-
-        if (result != MA_SUCCESS) {
-            goto done;
-        }
-    }
-
-done:
-    return result;
-}
-
-
-typedef struct
-{
-    ma_device_info* pDeviceInfo;
-    ma_uint32 defaultDeviceIndex;
-    ma_bool32 foundDevice;
-} ma_context_get_device_info_callback_data__pulse;
-
-static void ma_context_get_device_info_sink_callback__pulse(ma_pa_context* pPulseContext, const ma_pa_sink_info* pInfo, int endOfList, void* pUserData)
-{
-    ma_context_get_device_info_callback_data__pulse* pData = (ma_context_get_device_info_callback_data__pulse*)pUserData;
-
-    if (endOfList > 0) {
-        return;
-    }
-
-    MA_ASSERT(pData != NULL);
-    pData->foundDevice = MA_TRUE;
-
-    if (pInfo->name != NULL) {
-        ma_strncpy_s(pData->pDeviceInfo->id.pulse, sizeof(pData->pDeviceInfo->id.pulse), pInfo->name, (size_t)-1);
-    }
-
-    if (pInfo->description != NULL) {
-        ma_strncpy_s(pData->pDeviceInfo->name, sizeof(pData->pDeviceInfo->name), pInfo->description, (size_t)-1);
-    }
-
-    /*
-    We're just reporting a single data format here. I think technically PulseAudio might support
-    all formats, but I don't trust that PulseAudio will do *anything* right, so I'm just going to
-    report the "native" device format.
-    */
-    pData->pDeviceInfo->nativeDataFormats[0].format     = ma_format_from_pulse(pInfo->sample_spec.format);
-    pData->pDeviceInfo->nativeDataFormats[0].channels   = pInfo->sample_spec.channels;
-    pData->pDeviceInfo->nativeDataFormats[0].sampleRate = pInfo->sample_spec.rate;
-    pData->pDeviceInfo->nativeDataFormats[0].flags      = 0;
-    pData->pDeviceInfo->nativeDataFormatCount = 1;
-
-    if (pData->defaultDeviceIndex == pInfo->index) {
-        pData->pDeviceInfo->isDefault = MA_TRUE;
-    }
-
-    (void)pPulseContext; /* Unused. */
-}
-
-static void ma_context_get_device_info_source_callback__pulse(ma_pa_context* pPulseContext, const ma_pa_source_info* pInfo, int endOfList, void* pUserData)
-{
-    ma_context_get_device_info_callback_data__pulse* pData = (ma_context_get_device_info_callback_data__pulse*)pUserData;
-
-    if (endOfList > 0) {
-        return;
-    }
-
-    MA_ASSERT(pData != NULL);
-    pData->foundDevice = MA_TRUE;
-
-    if (pInfo->name != NULL) {
-        ma_strncpy_s(pData->pDeviceInfo->id.pulse, sizeof(pData->pDeviceInfo->id.pulse), pInfo->name, (size_t)-1);
-    }
-
-    if (pInfo->description != NULL) {
-        ma_strncpy_s(pData->pDeviceInfo->name, sizeof(pData->pDeviceInfo->name), pInfo->description, (size_t)-1);
-    }
-
-    /*
-    We're just reporting a single data format here. I think technically PulseAudio might support
-    all formats, but I don't trust that PulseAudio will do *anything* right, so I'm just going to
-    report the "native" device format.
-    */
-    pData->pDeviceInfo->nativeDataFormats[0].format     = ma_format_from_pulse(pInfo->sample_spec.format);
-    pData->pDeviceInfo->nativeDataFormats[0].channels   = pInfo->sample_spec.channels;
-    pData->pDeviceInfo->nativeDataFormats[0].sampleRate = pInfo->sample_spec.rate;
-    pData->pDeviceInfo->nativeDataFormats[0].flags      = 0;
-    pData->pDeviceInfo->nativeDataFormatCount = 1;
-
-    if (pData->defaultDeviceIndex == pInfo->index) {
-        pData->pDeviceInfo->isDefault = MA_TRUE;
-    }
-
-    (void)pPulseContext; /* Unused. */
-}
-
-static ma_result ma_context_get_device_info__pulse(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_result result = MA_SUCCESS;
-    ma_context_get_device_info_callback_data__pulse callbackData;
-    ma_pa_operation* pOP = NULL;
-    const char* pDeviceName = NULL;
-
-    MA_ASSERT(pContext != NULL);
-
-    callbackData.pDeviceInfo = pDeviceInfo;
-    callbackData.foundDevice = MA_FALSE;
-
-    if (pDeviceID != NULL) {
-        pDeviceName = pDeviceID->pulse;
-    } else {
-        pDeviceName = NULL;
-    }
-
-    result = ma_context_get_default_device_index__pulse(pContext, deviceType, &callbackData.defaultDeviceIndex);
-
-    if (deviceType == ma_device_type_playback) {
-        pOP = ((ma_pa_context_get_sink_info_by_name_proc)pContext->pulse.pa_context_get_sink_info_by_name)((ma_pa_context*)(pContext->pulse.pPulseContext), pDeviceName, ma_context_get_device_info_sink_callback__pulse, &callbackData);
-    } else {
-        pOP = ((ma_pa_context_get_source_info_by_name_proc)pContext->pulse.pa_context_get_source_info_by_name)((ma_pa_context*)(pContext->pulse.pPulseContext), pDeviceName, ma_context_get_device_info_source_callback__pulse, &callbackData);
-    }
-
-    if (pOP != NULL) {
-        ma_wait_for_operation_and_unref__pulse(pContext, pContext->pulse.pMainLoop, pOP);
-    } else {
-        result = MA_ERROR;
-        goto done;
-    }
-
-    if (!callbackData.foundDevice) {
-        result = MA_NO_DEVICE;
-        goto done;
-    }
-
-done:
-    return result;
-}
-
-static ma_result ma_device_uninit__pulse(ma_device* pDevice)
-{
-    ma_context* pContext;
-
-    MA_ASSERT(pDevice != NULL);
-
-    pContext = pDevice->pContext;
-    MA_ASSERT(pContext != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ((ma_pa_stream_disconnect_proc)pContext->pulse.pa_stream_disconnect)((ma_pa_stream*)pDevice->pulse.pStreamCapture);
-        ((ma_pa_stream_unref_proc)pContext->pulse.pa_stream_unref)((ma_pa_stream*)pDevice->pulse.pStreamCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ((ma_pa_stream_disconnect_proc)pContext->pulse.pa_stream_disconnect)((ma_pa_stream*)pDevice->pulse.pStreamPlayback);
-        ((ma_pa_stream_unref_proc)pContext->pulse.pa_stream_unref)((ma_pa_stream*)pDevice->pulse.pStreamPlayback);
-    }
-
-    if (pDevice->type == ma_device_type_duplex) {
-        ma_duplex_rb_uninit(&pDevice->duplexRB);
-    }
-
-    ((ma_pa_context_disconnect_proc)pContext->pulse.pa_context_disconnect)((ma_pa_context*)pDevice->pulse.pPulseContext);
-    ((ma_pa_context_unref_proc)pContext->pulse.pa_context_unref)((ma_pa_context*)pDevice->pulse.pPulseContext);
-    ((ma_pa_mainloop_free_proc)pContext->pulse.pa_mainloop_free)((ma_pa_mainloop*)pDevice->pulse.pMainLoop);
-
-    return MA_SUCCESS;
-}
-
-static ma_pa_buffer_attr ma_device__pa_buffer_attr_new(ma_uint32 periodSizeInFrames, ma_uint32 periods, const ma_pa_sample_spec* ss)
-{
-    ma_pa_buffer_attr attr;
-    attr.maxlength = periodSizeInFrames * periods * ma_get_bytes_per_frame(ma_format_from_pulse(ss->format), ss->channels);
-    attr.tlength   = attr.maxlength / periods;
-    attr.prebuf    = (ma_uint32)-1;
-    attr.minreq    = (ma_uint32)-1;
-    attr.fragsize  = attr.maxlength / periods;
-
-    return attr;
-}
-
-static ma_pa_stream* ma_device__pa_stream_new__pulse(ma_device* pDevice, const char* pStreamName, const ma_pa_sample_spec* ss, const ma_pa_channel_map* cmap)
-{
-    static int g_StreamCounter = 0;
-    char actualStreamName[256];
-
-    if (pStreamName != NULL) {
-        ma_strncpy_s(actualStreamName, sizeof(actualStreamName), pStreamName, (size_t)-1);
-    } else {
-        ma_strcpy_s(actualStreamName, sizeof(actualStreamName), "miniaudio:");
-        ma_itoa_s(g_StreamCounter, actualStreamName + 8, sizeof(actualStreamName)-8, 10);  /* 8 = strlen("miniaudio:") */
-    }
-    g_StreamCounter += 1;
-
-    return ((ma_pa_stream_new_proc)pDevice->pContext->pulse.pa_stream_new)((ma_pa_context*)pDevice->pulse.pPulseContext, actualStreamName, ss, cmap);
-}
-
-
-static void ma_device_on_read__pulse(ma_pa_stream* pStream, size_t byteCount, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    ma_uint32 bpf;
-    ma_uint32 deviceState;
-    ma_uint64 frameCount;
-    ma_uint64 framesProcessed;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /*
-    Don't do anything if the device isn't initialized yet. Yes, this can happen because PulseAudio
-    can fire this callback before the stream has even started. Ridiculous.
-    */
-    deviceState = ma_device_get_state(pDevice);
-    if (deviceState != ma_device_state_starting && deviceState != ma_device_state_started) {
-        return;
-    }
-
-    bpf = ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-    MA_ASSERT(bpf > 0);
-
-    frameCount = byteCount / bpf;
-    framesProcessed = 0;
-
-    while (ma_device_get_state(pDevice) == ma_device_state_started && framesProcessed < frameCount) {
-        const void* pMappedPCMFrames;
-        size_t bytesMapped;
-        ma_uint64 framesMapped;
-
-        int pulseResult = ((ma_pa_stream_peek_proc)pDevice->pContext->pulse.pa_stream_peek)(pStream, &pMappedPCMFrames, &bytesMapped);
-        if (pulseResult < 0) {
-            break; /* Failed to map. Abort. */
-        }
-
-        framesMapped = bytesMapped / bpf;
-        if (framesMapped > 0) {
-            if (pMappedPCMFrames != NULL) {
-                ma_device_handle_backend_data_callback(pDevice, NULL, pMappedPCMFrames, framesMapped);
-            } else {
-                /* It's a hole. */
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[PulseAudio] ma_device_on_read__pulse: Hole.\n");
-            }
-
-            pulseResult = ((ma_pa_stream_drop_proc)pDevice->pContext->pulse.pa_stream_drop)(pStream);
-            if (pulseResult < 0) {
-                break;  /* Failed to drop the buffer. */
-            }
-
-            framesProcessed += framesMapped;
-
-        } else {
-            /* Nothing was mapped. Just abort. */
-            break;
-        }
-    }
-}
-
-static ma_result ma_device_write_to_stream__pulse(ma_device* pDevice, ma_pa_stream* pStream, ma_uint64* pFramesProcessed)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 framesProcessed = 0;
-    size_t bytesMapped;
-    ma_uint32 bpf;
-    ma_uint32 deviceState;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pStream != NULL);
-
-    bpf = ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-    MA_ASSERT(bpf > 0);
-
-    deviceState = ma_device_get_state(pDevice);
-
-    bytesMapped = ((ma_pa_stream_writable_size_proc)pDevice->pContext->pulse.pa_stream_writable_size)(pStream);
-    if (bytesMapped != (size_t)-1) {
-        if (bytesMapped > 0) {
-            ma_uint64 framesMapped;
-            void* pMappedPCMFrames;
-            int pulseResult = ((ma_pa_stream_begin_write_proc)pDevice->pContext->pulse.pa_stream_begin_write)(pStream, &pMappedPCMFrames, &bytesMapped);
-            if (pulseResult < 0) {
-                result = ma_result_from_pulse(pulseResult);
-                goto done;
-            }
-
-            framesMapped = bytesMapped / bpf;
-
-            if (deviceState == ma_device_state_started || deviceState == ma_device_state_starting) {  /* Check for starting state just in case this is being used to do the initial fill. */
-                ma_device_handle_backend_data_callback(pDevice, pMappedPCMFrames, NULL, framesMapped);
-            } else {
-                /* Device is not started. Write silence. */
-                ma_silence_pcm_frames(pMappedPCMFrames, framesMapped, pDevice->playback.format, pDevice->playback.channels);
-            }
-
-            pulseResult = ((ma_pa_stream_write_proc)pDevice->pContext->pulse.pa_stream_write)(pStream, pMappedPCMFrames, bytesMapped, NULL, 0, MA_PA_SEEK_RELATIVE);
-            if (pulseResult < 0) {
-                result = ma_result_from_pulse(pulseResult);
-                goto done;  /* Failed to write data to stream. */
-            }
-
-            framesProcessed += framesMapped;
-        } else {
-            result = MA_SUCCESS;  /* No data available for writing. */
-            goto done;
-        }
-    } else {
-        result = MA_ERROR;  /* Failed to retrieve the writable size. Abort. */
-        goto done;
-    }
-
-done:
-    if (pFramesProcessed != NULL) {
-        *pFramesProcessed = framesProcessed;
-    }
-
-    return result;
-}
-
-static void ma_device_on_write__pulse(ma_pa_stream* pStream, size_t byteCount, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    ma_uint32 bpf;
-    ma_uint64 frameCount;
-    ma_uint64 framesProcessed;
-    ma_uint32 deviceState;
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /*
-    Don't do anything if the device isn't initialized yet. Yes, this can happen because PulseAudio
-    can fire this callback before the stream has even started. Ridiculous.
-    */
-    deviceState = ma_device_get_state(pDevice);
-    if (deviceState != ma_device_state_starting && deviceState != ma_device_state_started) {
-        return;
-    }
-
-    bpf = ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-    MA_ASSERT(bpf > 0);
-
-    frameCount = byteCount / bpf;
-    framesProcessed = 0;
-
-    while (framesProcessed < frameCount) {
-        ma_uint64 framesProcessedThisIteration;
-
-        /* Don't keep trying to process frames if the device isn't started. */
-        deviceState = ma_device_get_state(pDevice);
-        if (deviceState != ma_device_state_starting && deviceState != ma_device_state_started) {
-            break;
-        }
-
-        result = ma_device_write_to_stream__pulse(pDevice, pStream, &framesProcessedThisIteration);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        framesProcessed += framesProcessedThisIteration;
-    }
-}
-
-static void ma_device_on_suspended__pulse(ma_pa_stream* pStream, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    int suspended;
-
-    (void)pStream;
-
-    suspended = ((ma_pa_stream_is_suspended_proc)pDevice->pContext->pulse.pa_stream_is_suspended)(pStream);
-    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[Pulse] Device suspended state changed. pa_stream_is_suspended() returned %d.\n", suspended);
-
-    if (suspended < 0) {
-        return;
-    }
-
-    if (suspended == 1) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[Pulse] Device suspended state changed. Suspended.\n");
-        ma_device__on_notification_stopped(pDevice);
-    } else {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "[Pulse] Device suspended state changed. Resumed.\n");
-        ma_device__on_notification_started(pDevice);
-    }
-}
-
-static void ma_device_on_rerouted__pulse(ma_pa_stream* pStream, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-
-    (void)pStream;
-    (void)pUserData;
-
-    ma_device__on_notification_rerouted(pDevice);
-}
-
-static ma_uint32 ma_calculate_period_size_in_frames_from_descriptor__pulse(const ma_device_descriptor* pDescriptor, ma_uint32 nativeSampleRate, ma_performance_profile performanceProfile)
-{
-    /*
-    There have been reports from users where buffers of < ~20ms result glitches when running through
-    PipeWire. To work around this we're going to have to use a different default buffer size.
-    */
-    const ma_uint32 defaultPeriodSizeInMilliseconds_LowLatency   = 25;
-    const ma_uint32 defaultPeriodSizeInMilliseconds_Conservative = MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_CONSERVATIVE;
-
-    MA_ASSERT(nativeSampleRate != 0);
-
-    if (pDescriptor->periodSizeInFrames == 0) {
-        if (pDescriptor->periodSizeInMilliseconds == 0) {
-            if (performanceProfile == ma_performance_profile_low_latency) {
-                return ma_calculate_buffer_size_in_frames_from_milliseconds(defaultPeriodSizeInMilliseconds_LowLatency, nativeSampleRate);
-            } else {
-                return ma_calculate_buffer_size_in_frames_from_milliseconds(defaultPeriodSizeInMilliseconds_Conservative, nativeSampleRate);
-            }
-        } else {
-            return ma_calculate_buffer_size_in_frames_from_milliseconds(pDescriptor->periodSizeInMilliseconds, nativeSampleRate);
-        }
-    } else {
-        return pDescriptor->periodSizeInFrames;
-    }
-}
-
-static ma_result ma_device_init__pulse(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    /*
-    Notes for PulseAudio:
-
-      - When both the period size in frames and milliseconds are 0, we default to miniaudio's
-        default buffer sizes rather than leaving it up to PulseAudio because I don't trust
-        PulseAudio to give us any kind of reasonable latency by default.
-
-      - Do not ever, *ever* forget to use MA_PA_STREAM_ADJUST_LATENCY. If you don't specify this
-        flag, capture mode will just not work properly until you open another PulseAudio app.
-    */
-
-    ma_result result = MA_SUCCESS;
-    int error = 0;
-    const char* devPlayback = NULL;
-    const char* devCapture  = NULL;
-    ma_format format = ma_format_unknown;
-    ma_uint32 channels = 0;
-    ma_uint32 sampleRate = 0;
-    ma_pa_sink_info sinkInfo;
-    ma_pa_source_info sourceInfo;
-    ma_pa_sample_spec ss;
-    ma_pa_channel_map cmap;
-    ma_pa_buffer_attr attr;
-    const ma_pa_sample_spec* pActualSS   = NULL;
-    const ma_pa_buffer_attr* pActualAttr = NULL;
-    ma_uint32 iChannel;
-    ma_pa_stream_flags_t streamFlags;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ZERO_OBJECT(&pDevice->pulse);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /* No exclusive mode with the PulseAudio backend. */
-    if (((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pConfig->playback.shareMode == ma_share_mode_exclusive) ||
-        ((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pConfig->capture.shareMode  == ma_share_mode_exclusive)) {
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        if (pDescriptorPlayback->pDeviceID != NULL) {
-            devPlayback = pDescriptorPlayback->pDeviceID->pulse;
-        }
-
-        format     = pDescriptorPlayback->format;
-        channels   = pDescriptorPlayback->channels;
-        sampleRate = pDescriptorPlayback->sampleRate;
-    }
-
-    if (pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) {
-        if (pDescriptorCapture->pDeviceID != NULL) {
-            devCapture = pDescriptorCapture->pDeviceID->pulse;
-        }
-
-        format     = pDescriptorCapture->format;
-        channels   = pDescriptorCapture->channels;
-        sampleRate = pDescriptorCapture->sampleRate;
-    }
-
-
-
-    result = ma_init_pa_mainloop_and_pa_context__pulse(pDevice->pContext, pDevice->pContext->pulse.pApplicationName, pDevice->pContext->pulse.pServerName, MA_FALSE, &pDevice->pulse.pMainLoop, &pDevice->pulse.pPulseContext);
-    if (result != MA_SUCCESS) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to initialize PA mainloop and context for device.\n");
-        return result;
-    }
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        result = ma_context_get_source_info__pulse(pDevice->pContext, devCapture, &sourceInfo);
-        if (result != MA_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to retrieve source info for capture device.");
-            goto on_error0;
-        }
-
-        ss   = sourceInfo.sample_spec;
-        cmap = sourceInfo.channel_map;
-
-        /* Use the requested channel count if we have one. */
-        if (pDescriptorCapture->channels != 0) {
-            ss.channels = pDescriptorCapture->channels;
-        }
-
-        /* Use a default channel map. */
-        ((ma_pa_channel_map_init_extend_proc)pDevice->pContext->pulse.pa_channel_map_init_extend)(&cmap, ss.channels, MA_PA_CHANNEL_MAP_DEFAULT);
-
-        /* Use the requested sample rate if one was specified. */
-        if (pDescriptorCapture->sampleRate != 0) {
-            ss.rate = pDescriptorCapture->sampleRate;
-        }
-        streamFlags = MA_PA_STREAM_START_CORKED | MA_PA_STREAM_ADJUST_LATENCY;
-
-        if (ma_format_from_pulse(ss.format) == ma_format_unknown) {
-            if (ma_is_little_endian()) {
-                ss.format = MA_PA_SAMPLE_FLOAT32LE;
-            } else {
-                ss.format = MA_PA_SAMPLE_FLOAT32BE;
-            }
-            streamFlags |= MA_PA_STREAM_FIX_FORMAT;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] sample_spec.format not supported by miniaudio. Defaulting to PA_SAMPLE_FLOAT32.\n");
-        }
-        if (ss.rate == 0) {
-            ss.rate = MA_DEFAULT_SAMPLE_RATE;
-            streamFlags |= MA_PA_STREAM_FIX_RATE;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] sample_spec.rate = 0. Defaulting to %d.\n", ss.rate);
-        }
-        if (ss.channels == 0) {
-            ss.channels = MA_DEFAULT_CHANNELS;
-            streamFlags |= MA_PA_STREAM_FIX_CHANNELS;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] sample_spec.channels = 0. Defaulting to %d.\n", ss.channels);
-        }
-
-        /* We now have enough information to calculate our actual period size in frames. */
-        pDescriptorCapture->periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__pulse(pDescriptorCapture, ss.rate, pConfig->performanceProfile);
-
-        attr = ma_device__pa_buffer_attr_new(pDescriptorCapture->periodSizeInFrames, pDescriptorCapture->periodCount, &ss);
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Capture attr: maxlength=%d, tlength=%d, prebuf=%d, minreq=%d, fragsize=%d; periodSizeInFrames=%d\n", attr.maxlength, attr.tlength, attr.prebuf, attr.minreq, attr.fragsize, pDescriptorCapture->periodSizeInFrames);
-
-        pDevice->pulse.pStreamCapture = ma_device__pa_stream_new__pulse(pDevice, pConfig->pulse.pStreamNameCapture, &ss, &cmap);
-        if (pDevice->pulse.pStreamCapture == NULL) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to create PulseAudio capture stream.\n");
-            result = MA_ERROR;
-            goto on_error0;
-        }
-
-
-        /* The callback needs to be set before connecting the stream. */
-        ((ma_pa_stream_set_read_callback_proc)pDevice->pContext->pulse.pa_stream_set_read_callback)((ma_pa_stream*)pDevice->pulse.pStreamCapture, ma_device_on_read__pulse, pDevice);
-
-        /* State callback for checking when the device has been corked. */
-        ((ma_pa_stream_set_suspended_callback_proc)pDevice->pContext->pulse.pa_stream_set_suspended_callback)((ma_pa_stream*)pDevice->pulse.pStreamCapture, ma_device_on_suspended__pulse, pDevice);
-
-        /* Rerouting notification. */
-        ((ma_pa_stream_set_moved_callback_proc)pDevice->pContext->pulse.pa_stream_set_moved_callback)((ma_pa_stream*)pDevice->pulse.pStreamCapture, ma_device_on_rerouted__pulse, pDevice);
-
-
-        /* Connect after we've got all of our internal state set up. */
-        if (devCapture != NULL) {
-            streamFlags |= MA_PA_STREAM_DONT_MOVE;
-        }
-
-        error = ((ma_pa_stream_connect_record_proc)pDevice->pContext->pulse.pa_stream_connect_record)((ma_pa_stream*)pDevice->pulse.pStreamCapture, devCapture, &attr, streamFlags);
-        if (error != MA_PA_OK) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to connect PulseAudio capture stream.");
-            result = ma_result_from_pulse(error);
-            goto on_error1;
-        }
-
-        result = ma_wait_for_pa_stream_to_connect__pulse(pDevice->pContext, pDevice->pulse.pMainLoop, (ma_pa_stream*)pDevice->pulse.pStreamCapture);
-        if (result != MA_SUCCESS) {
-            goto on_error2;
-        }
-
-
-        /* Internal format. */
-        pActualSS = ((ma_pa_stream_get_sample_spec_proc)pDevice->pContext->pulse.pa_stream_get_sample_spec)((ma_pa_stream*)pDevice->pulse.pStreamCapture);
-        if (pActualSS != NULL) {
-            ss = *pActualSS;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Capture sample spec: format=%s, channels=%d, rate=%d\n", ma_get_format_name(ma_format_from_pulse(ss.format)), ss.channels, ss.rate);
-        } else {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Failed to retrieve capture sample spec.\n");
-        }
-
-        pDescriptorCapture->format     = ma_format_from_pulse(ss.format);
-        pDescriptorCapture->channels   = ss.channels;
-        pDescriptorCapture->sampleRate = ss.rate;
-
-        if (pDescriptorCapture->format == ma_format_unknown || pDescriptorCapture->channels == 0 || pDescriptorCapture->sampleRate == 0) {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Capture sample spec is invalid. Device unusable by miniaudio. format=%s, channels=%d, sampleRate=%d.\n", ma_get_format_name(pDescriptorCapture->format), pDescriptorCapture->channels, pDescriptorCapture->sampleRate);
-            result = MA_ERROR;
-            goto on_error4;
-        }
-
-        /* Internal channel map. */
-
-        /*
-        Bug in PipeWire. There have been reports that PipeWire is returning AUX channels when reporting
-        the channel map. To somewhat workaround this, I'm hacking in a hard coded channel map for mono
-        and stereo. In this case it should be safe to assume mono = MONO and stereo = LEFT/RIGHT. For
-        all other channel counts we need to just put up with whatever PipeWire reports and hope it gets
-        fixed sooner than later. I might remove this hack later.
-        */
-        if (pDescriptorCapture->channels > 2) {
-            for (iChannel = 0; iChannel < pDescriptorCapture->channels; ++iChannel) {
-                pDescriptorCapture->channelMap[iChannel] = ma_channel_position_from_pulse(cmap.map[iChannel]);
-            }
-        } else {
-            /* Hack for mono and stereo. */
-            if (pDescriptorCapture->channels == 1) {
-                pDescriptorCapture->channelMap[0] = MA_CHANNEL_MONO;
-            } else if (pDescriptorCapture->channels == 2) {
-                pDescriptorCapture->channelMap[0] = MA_CHANNEL_FRONT_LEFT;
-                pDescriptorCapture->channelMap[1] = MA_CHANNEL_FRONT_RIGHT;
-            } else {
-                MA_ASSERT(MA_FALSE);    /* Should never hit this. */
-            }
-        }
-
-
-        /* Buffer. */
-        pActualAttr = ((ma_pa_stream_get_buffer_attr_proc)pDevice->pContext->pulse.pa_stream_get_buffer_attr)((ma_pa_stream*)pDevice->pulse.pStreamCapture);
-        if (pActualAttr != NULL) {
-            attr = *pActualAttr;
-        }
-
-        if (attr.fragsize > 0) {
-            pDescriptorCapture->periodCount = ma_max(attr.maxlength / attr.fragsize, 1);
-        } else {
-            pDescriptorCapture->periodCount = 1;
-        }
-
-        pDescriptorCapture->periodSizeInFrames = attr.maxlength / ma_get_bytes_per_frame(pDescriptorCapture->format, pDescriptorCapture->channels) / pDescriptorCapture->periodCount;
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Capture actual attr: maxlength=%d, tlength=%d, prebuf=%d, minreq=%d, fragsize=%d; periodSizeInFrames=%d\n", attr.maxlength, attr.tlength, attr.prebuf, attr.minreq, attr.fragsize, pDescriptorCapture->periodSizeInFrames);
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        result = ma_context_get_sink_info__pulse(pDevice->pContext, devPlayback, &sinkInfo);
-        if (result != MA_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to retrieve sink info for playback device.\n");
-            goto on_error2;
-        }
-
-        ss   = sinkInfo.sample_spec;
-        cmap = sinkInfo.channel_map;
-
-        /* Use the requested channel count if we have one. */
-        if (pDescriptorPlayback->channels != 0) {
-            ss.channels = pDescriptorPlayback->channels;
-        }
-
-        /* Use a default channel map. */
-        ((ma_pa_channel_map_init_extend_proc)pDevice->pContext->pulse.pa_channel_map_init_extend)(&cmap, ss.channels, MA_PA_CHANNEL_MAP_DEFAULT);
-
-
-        /* Use the requested sample rate if one was specified. */
-        if (pDescriptorPlayback->sampleRate != 0) {
-            ss.rate = pDescriptorPlayback->sampleRate;
-        }
-
-        streamFlags = MA_PA_STREAM_START_CORKED | MA_PA_STREAM_ADJUST_LATENCY;
-        if (ma_format_from_pulse(ss.format) == ma_format_unknown) {
-            if (ma_is_little_endian()) {
-                ss.format = MA_PA_SAMPLE_FLOAT32LE;
-            } else {
-                ss.format = MA_PA_SAMPLE_FLOAT32BE;
-            }
-            streamFlags |= MA_PA_STREAM_FIX_FORMAT;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] sample_spec.format not supported by miniaudio. Defaulting to PA_SAMPLE_FLOAT32.\n");
-        }
-        if (ss.rate == 0) {
-            ss.rate = MA_DEFAULT_SAMPLE_RATE;
-            streamFlags |= MA_PA_STREAM_FIX_RATE;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] sample_spec.rate = 0. Defaulting to %d.\n", ss.rate);
-        }
-        if (ss.channels == 0) {
-            ss.channels = MA_DEFAULT_CHANNELS;
-            streamFlags |= MA_PA_STREAM_FIX_CHANNELS;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] sample_spec.channels = 0. Defaulting to %d.\n", ss.channels);
-        }
-
-        /* We now have enough information to calculate the actual buffer size in frames. */
-        pDescriptorPlayback->periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__pulse(pDescriptorPlayback, ss.rate, pConfig->performanceProfile);
-
-        attr = ma_device__pa_buffer_attr_new(pDescriptorPlayback->periodSizeInFrames, pDescriptorPlayback->periodCount, &ss);
-
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Playback attr: maxlength=%d, tlength=%d, prebuf=%d, minreq=%d, fragsize=%d; periodSizeInFrames=%d\n", attr.maxlength, attr.tlength, attr.prebuf, attr.minreq, attr.fragsize, pDescriptorPlayback->periodSizeInFrames);
-
-        pDevice->pulse.pStreamPlayback = ma_device__pa_stream_new__pulse(pDevice, pConfig->pulse.pStreamNamePlayback, &ss, &cmap);
-        if (pDevice->pulse.pStreamPlayback == NULL) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to create PulseAudio playback stream.\n");
-            result = MA_ERROR;
-            goto on_error2;
-        }
-
-
-        /*
-        Note that this callback will be fired as soon as the stream is connected, even though it's started as corked. The callback needs to handle a
-        device state of ma_device_state_uninitialized.
-        */
-        ((ma_pa_stream_set_write_callback_proc)pDevice->pContext->pulse.pa_stream_set_write_callback)((ma_pa_stream*)pDevice->pulse.pStreamPlayback, ma_device_on_write__pulse, pDevice);
-
-        /* State callback for checking when the device has been corked. */
-        ((ma_pa_stream_set_suspended_callback_proc)pDevice->pContext->pulse.pa_stream_set_suspended_callback)((ma_pa_stream*)pDevice->pulse.pStreamPlayback, ma_device_on_suspended__pulse, pDevice);
-
-        /* Rerouting notification. */
-        ((ma_pa_stream_set_moved_callback_proc)pDevice->pContext->pulse.pa_stream_set_moved_callback)((ma_pa_stream*)pDevice->pulse.pStreamPlayback, ma_device_on_rerouted__pulse, pDevice);
-
-
-        /* Connect after we've got all of our internal state set up. */
-        if (devPlayback != NULL) {
-            streamFlags |= MA_PA_STREAM_DONT_MOVE;
-        }
-
-        error = ((ma_pa_stream_connect_playback_proc)pDevice->pContext->pulse.pa_stream_connect_playback)((ma_pa_stream*)pDevice->pulse.pStreamPlayback, devPlayback, &attr, streamFlags, NULL, NULL);
-        if (error != MA_PA_OK) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to connect PulseAudio playback stream.");
-            result = ma_result_from_pulse(error);
-            goto on_error3;
-        }
-
-        result = ma_wait_for_pa_stream_to_connect__pulse(pDevice->pContext, pDevice->pulse.pMainLoop, (ma_pa_stream*)pDevice->pulse.pStreamPlayback);
-        if (result != MA_SUCCESS) {
-            goto on_error3;
-        }
-
-
-        /* Internal format. */
-        pActualSS = ((ma_pa_stream_get_sample_spec_proc)pDevice->pContext->pulse.pa_stream_get_sample_spec)((ma_pa_stream*)pDevice->pulse.pStreamPlayback);
-        if (pActualSS != NULL) {
-            ss = *pActualSS;
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Playback sample spec: format=%s, channels=%d, rate=%d\n", ma_get_format_name(ma_format_from_pulse(ss.format)), ss.channels, ss.rate);
-        } else {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Failed to retrieve playback sample spec.\n");
-        }
-
-        pDescriptorPlayback->format     = ma_format_from_pulse(ss.format);
-        pDescriptorPlayback->channels   = ss.channels;
-        pDescriptorPlayback->sampleRate = ss.rate;
-
-        if (pDescriptorPlayback->format == ma_format_unknown || pDescriptorPlayback->channels == 0 || pDescriptorPlayback->sampleRate == 0) {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Playback sample spec is invalid. Device unusable by miniaudio. format=%s, channels=%d, sampleRate=%d.\n", ma_get_format_name(pDescriptorPlayback->format), pDescriptorPlayback->channels, pDescriptorPlayback->sampleRate);
-            result = MA_ERROR;
-            goto on_error4;
-        }
-
-        /* Internal channel map. */
-
-        /*
-        Bug in PipeWire. There have been reports that PipeWire is returning AUX channels when reporting
-        the channel map. To somewhat workaround this, I'm hacking in a hard coded channel map for mono
-        and stereo. In this case it should be safe to assume mono = MONO and stereo = LEFT/RIGHT. For
-        all other channel counts we need to just put up with whatever PipeWire reports and hope it gets
-        fixed sooner than later. I might remove this hack later.
-        */
-        if (pDescriptorPlayback->channels > 2) {
-            for (iChannel = 0; iChannel < pDescriptorPlayback->channels; ++iChannel) {
-                pDescriptorPlayback->channelMap[iChannel] = ma_channel_position_from_pulse(cmap.map[iChannel]);
-            }
-        } else {
-            /* Hack for mono and stereo. */
-            if (pDescriptorPlayback->channels == 1) {
-                pDescriptorPlayback->channelMap[0] = MA_CHANNEL_MONO;
-            } else if (pDescriptorPlayback->channels == 2) {
-                pDescriptorPlayback->channelMap[0] = MA_CHANNEL_FRONT_LEFT;
-                pDescriptorPlayback->channelMap[1] = MA_CHANNEL_FRONT_RIGHT;
-            } else {
-                MA_ASSERT(MA_FALSE);    /* Should never hit this. */
-            }
-        }
-
-
-        /* Buffer. */
-        pActualAttr = ((ma_pa_stream_get_buffer_attr_proc)pDevice->pContext->pulse.pa_stream_get_buffer_attr)((ma_pa_stream*)pDevice->pulse.pStreamPlayback);
-        if (pActualAttr != NULL) {
-            attr = *pActualAttr;
-        }
-
-        if (attr.tlength > 0) {
-            pDescriptorPlayback->periodCount = ma_max(attr.maxlength / attr.tlength, 1);
-        } else {
-            pDescriptorPlayback->periodCount = 1;
-        }
-
-        pDescriptorPlayback->periodSizeInFrames = attr.maxlength / ma_get_bytes_per_frame(pDescriptorPlayback->format, pDescriptorPlayback->channels) / pDescriptorPlayback->periodCount;
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[PulseAudio] Playback actual attr: maxlength=%d, tlength=%d, prebuf=%d, minreq=%d, fragsize=%d; internalPeriodSizeInFrames=%d\n", attr.maxlength, attr.tlength, attr.prebuf, attr.minreq, attr.fragsize, pDescriptorPlayback->periodSizeInFrames);
-    }
-
-
-    /*
-    We need a ring buffer for handling duplex mode. We can use the main duplex ring buffer in the main
-    part of the ma_device struct. We cannot, however, depend on ma_device_init() initializing this for
-    us later on because that will only do it if it's a fully asynchronous backend - i.e. the
-    onDeviceDataLoop callback is NULL, which is not the case for PulseAudio.
-    */
-    if (pConfig->deviceType == ma_device_type_duplex) {
-        ma_format rbFormat     = (format != ma_format_unknown) ? format     : pDescriptorCapture->format;
-        ma_uint32 rbChannels   = (channels   > 0)              ? channels   : pDescriptorCapture->channels;
-        ma_uint32 rbSampleRate = (sampleRate > 0)              ? sampleRate : pDescriptorCapture->sampleRate;
-
-        result = ma_duplex_rb_init(rbFormat, rbChannels, rbSampleRate, pDescriptorCapture->sampleRate, pDescriptorCapture->periodSizeInFrames, &pDevice->pContext->allocationCallbacks, &pDevice->duplexRB);
-        if (result != MA_SUCCESS) {
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to initialize ring buffer. %s.\n", ma_result_description(result));
-            goto on_error4;
-        }
-    }
-
-    return MA_SUCCESS;
-
-
-on_error4:
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ((ma_pa_stream_disconnect_proc)pDevice->pContext->pulse.pa_stream_disconnect)((ma_pa_stream*)pDevice->pulse.pStreamPlayback);
-    }
-on_error3:
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ((ma_pa_stream_unref_proc)pDevice->pContext->pulse.pa_stream_unref)((ma_pa_stream*)pDevice->pulse.pStreamPlayback);
-    }
-on_error2:
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ((ma_pa_stream_disconnect_proc)pDevice->pContext->pulse.pa_stream_disconnect)((ma_pa_stream*)pDevice->pulse.pStreamCapture);
-    }
-on_error1:
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ((ma_pa_stream_unref_proc)pDevice->pContext->pulse.pa_stream_unref)((ma_pa_stream*)pDevice->pulse.pStreamCapture);
-    }
-on_error0:
-    return result;
-}
-
-
-static void ma_pulse_operation_complete_callback(ma_pa_stream* pStream, int success, void* pUserData)
-{
-    ma_bool32* pIsSuccessful = (ma_bool32*)pUserData;
-    MA_ASSERT(pIsSuccessful != NULL);
-
-    *pIsSuccessful = (ma_bool32)success;
-
-    (void)pStream; /* Unused. */
-}
-
-static ma_result ma_device__cork_stream__pulse(ma_device* pDevice, ma_device_type deviceType, int cork)
-{
-    ma_context* pContext = pDevice->pContext;
-    ma_bool32 wasSuccessful;
-    ma_pa_stream* pStream;
-    ma_pa_operation* pOP;
-    ma_result result;
-
-    /* This should not be called with a duplex device type. */
-    if (deviceType == ma_device_type_duplex) {
-        return MA_INVALID_ARGS;
-    }
-
-    wasSuccessful = MA_FALSE;
-
-    pStream = (ma_pa_stream*)((deviceType == ma_device_type_capture) ? pDevice->pulse.pStreamCapture : pDevice->pulse.pStreamPlayback);
-    MA_ASSERT(pStream != NULL);
-
-    pOP = ((ma_pa_stream_cork_proc)pContext->pulse.pa_stream_cork)(pStream, cork, ma_pulse_operation_complete_callback, &wasSuccessful);
-    if (pOP == NULL) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to cork PulseAudio stream.");
-        return MA_ERROR;
-    }
-
-    result = ma_wait_for_operation_and_unref__pulse(pDevice->pContext, pDevice->pulse.pMainLoop, pOP);
-    if (result != MA_SUCCESS) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] An error occurred while waiting for the PulseAudio stream to cork.");
-        return result;
-    }
-
-    if (!wasSuccessful) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[PulseAudio] Failed to %s PulseAudio stream.", (cork) ? "stop" : "start");
-        return MA_ERROR;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__pulse(ma_device* pDevice)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        result = ma_device__cork_stream__pulse(pDevice, ma_device_type_capture, 0);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        /*
-        We need to fill some data before uncorking. Not doing this will result in the write callback
-        never getting fired. We're not going to abort if writing fails because I still want the device
-        to get uncorked.
-        */
-        ma_device_write_to_stream__pulse(pDevice, (ma_pa_stream*)(pDevice->pulse.pStreamPlayback), NULL);   /* No need to check the result here. Always want to fall through an uncork.*/
-
-        result = ma_device__cork_stream__pulse(pDevice, ma_device_type_playback, 0);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__pulse(ma_device* pDevice)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        result = ma_device__cork_stream__pulse(pDevice, ma_device_type_capture, 1);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        /*
-        Ideally we would drain the device here, but there's been cases where PulseAudio seems to be
-        broken on some systems to the point where no audio processing seems to happen. When this
-        happens, draining never completes and we get stuck here. For now I'm disabling draining of
-        the device so we don't just freeze the application.
-        */
-    #if 0
-        ma_pa_operation* pOP = ((ma_pa_stream_drain_proc)pDevice->pContext->pulse.pa_stream_drain)((ma_pa_stream*)pDevice->pulse.pStreamPlayback, ma_pulse_operation_complete_callback, &wasSuccessful);
-        ma_wait_for_operation_and_unref__pulse(pDevice->pContext, pDevice->pulse.pMainLoop, pOP);
-    #endif
-
-        result = ma_device__cork_stream__pulse(pDevice, ma_device_type_playback, 1);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_data_loop__pulse(ma_device* pDevice)
-{
-    int resultPA;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /* NOTE: Don't start the device here. It'll be done at a higher level. */
-
-    /*
-    All data is handled through callbacks. All we need to do is iterate over the main loop and let
-    the callbacks deal with it.
-    */
-    while (ma_device_get_state(pDevice) == ma_device_state_started) {
-        resultPA = ((ma_pa_mainloop_iterate_proc)pDevice->pContext->pulse.pa_mainloop_iterate)((ma_pa_mainloop*)pDevice->pulse.pMainLoop, 1, NULL);
-        if (resultPA < 0) {
-            break;
-        }
-    }
-
-    /* NOTE: Don't stop the device here. It'll be done at a higher level. */
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_data_loop_wakeup__pulse(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    ((ma_pa_mainloop_wakeup_proc)pDevice->pContext->pulse.pa_mainloop_wakeup)((ma_pa_mainloop*)pDevice->pulse.pMainLoop);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_uninit__pulse(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_pulseaudio);
-
-    ((ma_pa_context_disconnect_proc)pContext->pulse.pa_context_disconnect)((ma_pa_context*)pContext->pulse.pPulseContext);
-    ((ma_pa_context_unref_proc)pContext->pulse.pa_context_unref)((ma_pa_context*)pContext->pulse.pPulseContext);
-    ((ma_pa_mainloop_free_proc)pContext->pulse.pa_mainloop_free)((ma_pa_mainloop*)pContext->pulse.pMainLoop);
-
-    ma_free(pContext->pulse.pServerName, &pContext->allocationCallbacks);
-    ma_free(pContext->pulse.pApplicationName, &pContext->allocationCallbacks);
-
-#ifndef MA_NO_RUNTIME_LINKING
-    ma_dlclose(ma_context_get_log(pContext), pContext->pulse.pulseSO);
-#endif
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__pulse(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    ma_result result;
-#ifndef MA_NO_RUNTIME_LINKING
-    const char* libpulseNames[] = {
-        "libpulse.so",
-        "libpulse.so.0"
-    };
-    size_t i;
-
-    for (i = 0; i < ma_countof(libpulseNames); ++i) {
-        pContext->pulse.pulseSO = ma_dlopen(ma_context_get_log(pContext), libpulseNames[i]);
-        if (pContext->pulse.pulseSO != NULL) {
-            break;
-        }
-    }
-
-    if (pContext->pulse.pulseSO == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    pContext->pulse.pa_mainloop_new                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_mainloop_new");
-    pContext->pulse.pa_mainloop_free                   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_mainloop_free");
-    pContext->pulse.pa_mainloop_quit                   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_mainloop_quit");
-    pContext->pulse.pa_mainloop_get_api                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_mainloop_get_api");
-    pContext->pulse.pa_mainloop_iterate                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_mainloop_iterate");
-    pContext->pulse.pa_mainloop_wakeup                 = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_mainloop_wakeup");
-    pContext->pulse.pa_threaded_mainloop_new           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_new");
-    pContext->pulse.pa_threaded_mainloop_free          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_free");
-    pContext->pulse.pa_threaded_mainloop_start         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_start");
-    pContext->pulse.pa_threaded_mainloop_stop          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_stop");
-    pContext->pulse.pa_threaded_mainloop_lock          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_lock");
-    pContext->pulse.pa_threaded_mainloop_unlock        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_unlock");
-    pContext->pulse.pa_threaded_mainloop_wait          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_wait");
-    pContext->pulse.pa_threaded_mainloop_signal        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_signal");
-    pContext->pulse.pa_threaded_mainloop_accept        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_accept");
-    pContext->pulse.pa_threaded_mainloop_get_retval    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_get_retval");
-    pContext->pulse.pa_threaded_mainloop_get_api       = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_get_api");
-    pContext->pulse.pa_threaded_mainloop_in_thread     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_in_thread");
-    pContext->pulse.pa_threaded_mainloop_set_name      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_threaded_mainloop_set_name");
-    pContext->pulse.pa_context_new                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_new");
-    pContext->pulse.pa_context_unref                   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_unref");
-    pContext->pulse.pa_context_connect                 = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_connect");
-    pContext->pulse.pa_context_disconnect              = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_disconnect");
-    pContext->pulse.pa_context_set_state_callback      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_set_state_callback");
-    pContext->pulse.pa_context_get_state               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_get_state");
-    pContext->pulse.pa_context_get_sink_info_list      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_get_sink_info_list");
-    pContext->pulse.pa_context_get_source_info_list    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_get_source_info_list");
-    pContext->pulse.pa_context_get_sink_info_by_name   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_get_sink_info_by_name");
-    pContext->pulse.pa_context_get_source_info_by_name = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_context_get_source_info_by_name");
-    pContext->pulse.pa_operation_unref                 = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_operation_unref");
-    pContext->pulse.pa_operation_get_state             = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_operation_get_state");
-    pContext->pulse.pa_channel_map_init_extend         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_channel_map_init_extend");
-    pContext->pulse.pa_channel_map_valid               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_channel_map_valid");
-    pContext->pulse.pa_channel_map_compatible          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_channel_map_compatible");
-    pContext->pulse.pa_stream_new                      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_new");
-    pContext->pulse.pa_stream_unref                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_unref");
-    pContext->pulse.pa_stream_connect_playback         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_connect_playback");
-    pContext->pulse.pa_stream_connect_record           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_connect_record");
-    pContext->pulse.pa_stream_disconnect               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_disconnect");
-    pContext->pulse.pa_stream_get_state                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_get_state");
-    pContext->pulse.pa_stream_get_sample_spec          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_get_sample_spec");
-    pContext->pulse.pa_stream_get_channel_map          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_get_channel_map");
-    pContext->pulse.pa_stream_get_buffer_attr          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_get_buffer_attr");
-    pContext->pulse.pa_stream_set_buffer_attr          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_set_buffer_attr");
-    pContext->pulse.pa_stream_get_device_name          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_get_device_name");
-    pContext->pulse.pa_stream_set_write_callback       = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_set_write_callback");
-    pContext->pulse.pa_stream_set_read_callback        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_set_read_callback");
-    pContext->pulse.pa_stream_set_suspended_callback   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_set_suspended_callback");
-    pContext->pulse.pa_stream_set_moved_callback       = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_set_moved_callback");
-    pContext->pulse.pa_stream_is_suspended             = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_is_suspended");
-    pContext->pulse.pa_stream_flush                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_flush");
-    pContext->pulse.pa_stream_drain                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_drain");
-    pContext->pulse.pa_stream_is_corked                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_is_corked");
-    pContext->pulse.pa_stream_cork                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_cork");
-    pContext->pulse.pa_stream_trigger                  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_trigger");
-    pContext->pulse.pa_stream_begin_write              = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_begin_write");
-    pContext->pulse.pa_stream_write                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_write");
-    pContext->pulse.pa_stream_peek                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_peek");
-    pContext->pulse.pa_stream_drop                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_drop");
-    pContext->pulse.pa_stream_writable_size            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_writable_size");
-    pContext->pulse.pa_stream_readable_size            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->pulse.pulseSO, "pa_stream_readable_size");
-#else
-    /* This strange assignment system is just for type safety. */
-    ma_pa_mainloop_new_proc                    _pa_mainloop_new                   = pa_mainloop_new;
-    ma_pa_mainloop_free_proc                   _pa_mainloop_free                  = pa_mainloop_free;
-    ma_pa_mainloop_quit_proc                   _pa_mainloop_quit                  = pa_mainloop_quit;
-    ma_pa_mainloop_get_api_proc                _pa_mainloop_get_api               = pa_mainloop_get_api;
-    ma_pa_mainloop_iterate_proc                _pa_mainloop_iterate               = pa_mainloop_iterate;
-    ma_pa_mainloop_wakeup_proc                 _pa_mainloop_wakeup                = pa_mainloop_wakeup;
-    ma_pa_threaded_mainloop_new_proc           _pa_threaded_mainloop_new          = pa_threaded_mainloop_new;
-    ma_pa_threaded_mainloop_free_proc          _pa_threaded_mainloop_free         = pa_threaded_mainloop_free;
-    ma_pa_threaded_mainloop_start_proc         _pa_threaded_mainloop_start        = pa_threaded_mainloop_start;
-    ma_pa_threaded_mainloop_stop_proc          _pa_threaded_mainloop_stop         = pa_threaded_mainloop_stop;
-    ma_pa_threaded_mainloop_lock_proc          _pa_threaded_mainloop_lock         = pa_threaded_mainloop_lock;
-    ma_pa_threaded_mainloop_unlock_proc        _pa_threaded_mainloop_unlock       = pa_threaded_mainloop_unlock;
-    ma_pa_threaded_mainloop_wait_proc          _pa_threaded_mainloop_wait         = pa_threaded_mainloop_wait;
-    ma_pa_threaded_mainloop_signal_proc        _pa_threaded_mainloop_signal       = pa_threaded_mainloop_signal;
-    ma_pa_threaded_mainloop_accept_proc        _pa_threaded_mainloop_accept       = pa_threaded_mainloop_accept;
-    ma_pa_threaded_mainloop_get_retval_proc    _pa_threaded_mainloop_get_retval   = pa_threaded_mainloop_get_retval;
-    ma_pa_threaded_mainloop_get_api_proc       _pa_threaded_mainloop_get_api      = pa_threaded_mainloop_get_api;
-    ma_pa_threaded_mainloop_in_thread_proc     _pa_threaded_mainloop_in_thread    = pa_threaded_mainloop_in_thread;
-    ma_pa_threaded_mainloop_set_name_proc      _pa_threaded_mainloop_set_name     = pa_threaded_mainloop_set_name;
-    ma_pa_context_new_proc                     _pa_context_new                    = pa_context_new;
-    ma_pa_context_unref_proc                   _pa_context_unref                  = pa_context_unref;
-    ma_pa_context_connect_proc                 _pa_context_connect                = pa_context_connect;
-    ma_pa_context_disconnect_proc              _pa_context_disconnect             = pa_context_disconnect;
-    ma_pa_context_set_state_callback_proc      _pa_context_set_state_callback     = pa_context_set_state_callback;
-    ma_pa_context_get_state_proc               _pa_context_get_state              = pa_context_get_state;
-    ma_pa_context_get_sink_info_list_proc      _pa_context_get_sink_info_list     = pa_context_get_sink_info_list;
-    ma_pa_context_get_source_info_list_proc    _pa_context_get_source_info_list   = pa_context_get_source_info_list;
-    ma_pa_context_get_sink_info_by_name_proc   _pa_context_get_sink_info_by_name  = pa_context_get_sink_info_by_name;
-    ma_pa_context_get_source_info_by_name_proc _pa_context_get_source_info_by_name= pa_context_get_source_info_by_name;
-    ma_pa_operation_unref_proc                 _pa_operation_unref                = pa_operation_unref;
-    ma_pa_operation_get_state_proc             _pa_operation_get_state            = pa_operation_get_state;
-    ma_pa_channel_map_init_extend_proc         _pa_channel_map_init_extend        = pa_channel_map_init_extend;
-    ma_pa_channel_map_valid_proc               _pa_channel_map_valid              = pa_channel_map_valid;
-    ma_pa_channel_map_compatible_proc          _pa_channel_map_compatible         = pa_channel_map_compatible;
-    ma_pa_stream_new_proc                      _pa_stream_new                     = pa_stream_new;
-    ma_pa_stream_unref_proc                    _pa_stream_unref                   = pa_stream_unref;
-    ma_pa_stream_connect_playback_proc         _pa_stream_connect_playback        = pa_stream_connect_playback;
-    ma_pa_stream_connect_record_proc           _pa_stream_connect_record          = pa_stream_connect_record;
-    ma_pa_stream_disconnect_proc               _pa_stream_disconnect              = pa_stream_disconnect;
-    ma_pa_stream_get_state_proc                _pa_stream_get_state               = pa_stream_get_state;
-    ma_pa_stream_get_sample_spec_proc          _pa_stream_get_sample_spec         = pa_stream_get_sample_spec;
-    ma_pa_stream_get_channel_map_proc          _pa_stream_get_channel_map         = pa_stream_get_channel_map;
-    ma_pa_stream_get_buffer_attr_proc          _pa_stream_get_buffer_attr         = pa_stream_get_buffer_attr;
-    ma_pa_stream_set_buffer_attr_proc          _pa_stream_set_buffer_attr         = pa_stream_set_buffer_attr;
-    ma_pa_stream_get_device_name_proc          _pa_stream_get_device_name         = pa_stream_get_device_name;
-    ma_pa_stream_set_write_callback_proc       _pa_stream_set_write_callback      = pa_stream_set_write_callback;
-    ma_pa_stream_set_read_callback_proc        _pa_stream_set_read_callback       = pa_stream_set_read_callback;
-    ma_pa_stream_set_suspended_callback_proc   _pa_stream_set_suspended_callback  = pa_stream_set_suspended_callback;
-    ma_pa_stream_set_moved_callback_proc       _pa_stream_set_moved_callback      = pa_stream_set_moved_callback;
-    ma_pa_stream_is_suspended_proc             _pa_stream_is_suspended            = pa_stream_is_suspended;
-    ma_pa_stream_flush_proc                    _pa_stream_flush                   = pa_stream_flush;
-    ma_pa_stream_drain_proc                    _pa_stream_drain                   = pa_stream_drain;
-    ma_pa_stream_is_corked_proc                _pa_stream_is_corked               = pa_stream_is_corked;
-    ma_pa_stream_cork_proc                     _pa_stream_cork                    = pa_stream_cork;
-    ma_pa_stream_trigger_proc                  _pa_stream_trigger                 = pa_stream_trigger;
-    ma_pa_stream_begin_write_proc              _pa_stream_begin_write             = pa_stream_begin_write;
-    ma_pa_stream_write_proc                    _pa_stream_write                   = pa_stream_write;
-    ma_pa_stream_peek_proc                     _pa_stream_peek                    = pa_stream_peek;
-    ma_pa_stream_drop_proc                     _pa_stream_drop                    = pa_stream_drop;
-    ma_pa_stream_writable_size_proc            _pa_stream_writable_size           = pa_stream_writable_size;
-    ma_pa_stream_readable_size_proc            _pa_stream_readable_size           = pa_stream_readable_size;
-
-    pContext->pulse.pa_mainloop_new                    = (ma_proc)_pa_mainloop_new;
-    pContext->pulse.pa_mainloop_free                   = (ma_proc)_pa_mainloop_free;
-    pContext->pulse.pa_mainloop_quit                   = (ma_proc)_pa_mainloop_quit;
-    pContext->pulse.pa_mainloop_get_api                = (ma_proc)_pa_mainloop_get_api;
-    pContext->pulse.pa_mainloop_iterate                = (ma_proc)_pa_mainloop_iterate;
-    pContext->pulse.pa_mainloop_wakeup                 = (ma_proc)_pa_mainloop_wakeup;
-    pContext->pulse.pa_threaded_mainloop_new           = (ma_proc)_pa_threaded_mainloop_new;
-    pContext->pulse.pa_threaded_mainloop_free          = (ma_proc)_pa_threaded_mainloop_free;
-    pContext->pulse.pa_threaded_mainloop_start         = (ma_proc)_pa_threaded_mainloop_start;
-    pContext->pulse.pa_threaded_mainloop_stop          = (ma_proc)_pa_threaded_mainloop_stop;
-    pContext->pulse.pa_threaded_mainloop_lock          = (ma_proc)_pa_threaded_mainloop_lock;
-    pContext->pulse.pa_threaded_mainloop_unlock        = (ma_proc)_pa_threaded_mainloop_unlock;
-    pContext->pulse.pa_threaded_mainloop_wait          = (ma_proc)_pa_threaded_mainloop_wait;
-    pContext->pulse.pa_threaded_mainloop_signal        = (ma_proc)_pa_threaded_mainloop_signal;
-    pContext->pulse.pa_threaded_mainloop_accept        = (ma_proc)_pa_threaded_mainloop_accept;
-    pContext->pulse.pa_threaded_mainloop_get_retval    = (ma_proc)_pa_threaded_mainloop_get_retval;
-    pContext->pulse.pa_threaded_mainloop_get_api       = (ma_proc)_pa_threaded_mainloop_get_api;
-    pContext->pulse.pa_threaded_mainloop_in_thread     = (ma_proc)_pa_threaded_mainloop_in_thread;
-    pContext->pulse.pa_threaded_mainloop_set_name      = (ma_proc)_pa_threaded_mainloop_set_name;
-    pContext->pulse.pa_context_new                     = (ma_proc)_pa_context_new;
-    pContext->pulse.pa_context_unref                   = (ma_proc)_pa_context_unref;
-    pContext->pulse.pa_context_connect                 = (ma_proc)_pa_context_connect;
-    pContext->pulse.pa_context_disconnect              = (ma_proc)_pa_context_disconnect;
-    pContext->pulse.pa_context_set_state_callback      = (ma_proc)_pa_context_set_state_callback;
-    pContext->pulse.pa_context_get_state               = (ma_proc)_pa_context_get_state;
-    pContext->pulse.pa_context_get_sink_info_list      = (ma_proc)_pa_context_get_sink_info_list;
-    pContext->pulse.pa_context_get_source_info_list    = (ma_proc)_pa_context_get_source_info_list;
-    pContext->pulse.pa_context_get_sink_info_by_name   = (ma_proc)_pa_context_get_sink_info_by_name;
-    pContext->pulse.pa_context_get_source_info_by_name = (ma_proc)_pa_context_get_source_info_by_name;
-    pContext->pulse.pa_operation_unref                 = (ma_proc)_pa_operation_unref;
-    pContext->pulse.pa_operation_get_state             = (ma_proc)_pa_operation_get_state;
-    pContext->pulse.pa_channel_map_init_extend         = (ma_proc)_pa_channel_map_init_extend;
-    pContext->pulse.pa_channel_map_valid               = (ma_proc)_pa_channel_map_valid;
-    pContext->pulse.pa_channel_map_compatible          = (ma_proc)_pa_channel_map_compatible;
-    pContext->pulse.pa_stream_new                      = (ma_proc)_pa_stream_new;
-    pContext->pulse.pa_stream_unref                    = (ma_proc)_pa_stream_unref;
-    pContext->pulse.pa_stream_connect_playback         = (ma_proc)_pa_stream_connect_playback;
-    pContext->pulse.pa_stream_connect_record           = (ma_proc)_pa_stream_connect_record;
-    pContext->pulse.pa_stream_disconnect               = (ma_proc)_pa_stream_disconnect;
-    pContext->pulse.pa_stream_get_state                = (ma_proc)_pa_stream_get_state;
-    pContext->pulse.pa_stream_get_sample_spec          = (ma_proc)_pa_stream_get_sample_spec;
-    pContext->pulse.pa_stream_get_channel_map          = (ma_proc)_pa_stream_get_channel_map;
-    pContext->pulse.pa_stream_get_buffer_attr          = (ma_proc)_pa_stream_get_buffer_attr;
-    pContext->pulse.pa_stream_set_buffer_attr          = (ma_proc)_pa_stream_set_buffer_attr;
-    pContext->pulse.pa_stream_get_device_name          = (ma_proc)_pa_stream_get_device_name;
-    pContext->pulse.pa_stream_set_write_callback       = (ma_proc)_pa_stream_set_write_callback;
-    pContext->pulse.pa_stream_set_read_callback        = (ma_proc)_pa_stream_set_read_callback;
-    pContext->pulse.pa_stream_set_suspended_callback   = (ma_proc)_pa_stream_set_suspended_callback;
-    pContext->pulse.pa_stream_set_moved_callback       = (ma_proc)_pa_stream_set_moved_callback;
-    pContext->pulse.pa_stream_is_suspended             = (ma_proc)_pa_stream_is_suspended;
-    pContext->pulse.pa_stream_flush                    = (ma_proc)_pa_stream_flush;
-    pContext->pulse.pa_stream_drain                    = (ma_proc)_pa_stream_drain;
-    pContext->pulse.pa_stream_is_corked                = (ma_proc)_pa_stream_is_corked;
-    pContext->pulse.pa_stream_cork                     = (ma_proc)_pa_stream_cork;
-    pContext->pulse.pa_stream_trigger                  = (ma_proc)_pa_stream_trigger;
-    pContext->pulse.pa_stream_begin_write              = (ma_proc)_pa_stream_begin_write;
-    pContext->pulse.pa_stream_write                    = (ma_proc)_pa_stream_write;
-    pContext->pulse.pa_stream_peek                     = (ma_proc)_pa_stream_peek;
-    pContext->pulse.pa_stream_drop                     = (ma_proc)_pa_stream_drop;
-    pContext->pulse.pa_stream_writable_size            = (ma_proc)_pa_stream_writable_size;
-    pContext->pulse.pa_stream_readable_size            = (ma_proc)_pa_stream_readable_size;
-#endif
-
-    /* We need to make a copy of the application and server names so we can pass them to the pa_context of each device. */
-    pContext->pulse.pApplicationName = ma_copy_string(pConfig->pulse.pApplicationName, &pContext->allocationCallbacks);
-    if (pContext->pulse.pApplicationName == NULL && pConfig->pulse.pApplicationName != NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    pContext->pulse.pServerName = ma_copy_string(pConfig->pulse.pServerName, &pContext->allocationCallbacks);
-    if (pContext->pulse.pServerName == NULL && pConfig->pulse.pServerName != NULL) {
-        ma_free(pContext->pulse.pApplicationName, &pContext->allocationCallbacks);
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_init_pa_mainloop_and_pa_context__pulse(pContext, pConfig->pulse.pApplicationName, pConfig->pulse.pServerName, pConfig->pulse.tryAutoSpawn, &pContext->pulse.pMainLoop, &pContext->pulse.pPulseContext);
-    if (result != MA_SUCCESS) {
-        ma_free(pContext->pulse.pServerName, &pContext->allocationCallbacks);
-        ma_free(pContext->pulse.pApplicationName, &pContext->allocationCallbacks);
-    #ifndef MA_NO_RUNTIME_LINKING
-        ma_dlclose(ma_context_get_log(pContext), pContext->pulse.pulseSO);
-    #endif
-        return result;
-    }
-
-    /* With pa_mainloop we run a synchronous backend, but we implement our own main loop. */
-    pCallbacks->onContextInit             = ma_context_init__pulse;
-    pCallbacks->onContextUninit           = ma_context_uninit__pulse;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__pulse;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__pulse;
-    pCallbacks->onDeviceInit              = ma_device_init__pulse;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__pulse;
-    pCallbacks->onDeviceStart             = ma_device_start__pulse;
-    pCallbacks->onDeviceStop              = ma_device_stop__pulse;
-    pCallbacks->onDeviceRead              = NULL;   /* Not used because we're implementing onDeviceDataLoop. */
-    pCallbacks->onDeviceWrite             = NULL;   /* Not used because we're implementing onDeviceDataLoop. */
-    pCallbacks->onDeviceDataLoop          = ma_device_data_loop__pulse;
-    pCallbacks->onDeviceDataLoopWakeup    = ma_device_data_loop_wakeup__pulse;
-
-    return MA_SUCCESS;
-}
-#endif
-
-
-/******************************************************************************
-
-JACK Backend
-
-******************************************************************************/
-#ifdef MA_HAS_JACK
-
-/* It is assumed jack.h is available when compile-time linking is being used. */
-#ifdef MA_NO_RUNTIME_LINKING
-#include <jack/jack.h>
-
-typedef jack_nframes_t              ma_jack_nframes_t;
-typedef jack_options_t              ma_jack_options_t;
-typedef jack_status_t               ma_jack_status_t;
-typedef jack_client_t               ma_jack_client_t;
-typedef jack_port_t                 ma_jack_port_t;
-typedef JackProcessCallback         ma_JackProcessCallback;
-typedef JackBufferSizeCallback      ma_JackBufferSizeCallback;
-typedef JackShutdownCallback        ma_JackShutdownCallback;
-#define MA_JACK_DEFAULT_AUDIO_TYPE  JACK_DEFAULT_AUDIO_TYPE
-#define ma_JackNoStartServer        JackNoStartServer
-#define ma_JackPortIsInput          JackPortIsInput
-#define ma_JackPortIsOutput         JackPortIsOutput
-#define ma_JackPortIsPhysical       JackPortIsPhysical
-#else
-typedef ma_uint32               ma_jack_nframes_t;
-typedef int                     ma_jack_options_t;
-typedef int                     ma_jack_status_t;
-typedef struct ma_jack_client_t ma_jack_client_t;
-typedef struct ma_jack_port_t   ma_jack_port_t;
-typedef int  (* ma_JackProcessCallback)   (ma_jack_nframes_t nframes, void* arg);
-typedef int  (* ma_JackBufferSizeCallback)(ma_jack_nframes_t nframes, void* arg);
-typedef void (* ma_JackShutdownCallback)  (void* arg);
-#define MA_JACK_DEFAULT_AUDIO_TYPE "32 bit float mono audio"
-#define ma_JackNoStartServer       1
-#define ma_JackPortIsInput         1
-#define ma_JackPortIsOutput        2
-#define ma_JackPortIsPhysical      4
-#endif
-
-typedef ma_jack_client_t* (* ma_jack_client_open_proc)             (const char* client_name, ma_jack_options_t options, ma_jack_status_t* status, ...);
-typedef int               (* ma_jack_client_close_proc)            (ma_jack_client_t* client);
-typedef int               (* ma_jack_client_name_size_proc)        (void);
-typedef int               (* ma_jack_set_process_callback_proc)    (ma_jack_client_t* client, ma_JackProcessCallback process_callback, void* arg);
-typedef int               (* ma_jack_set_buffer_size_callback_proc)(ma_jack_client_t* client, ma_JackBufferSizeCallback bufsize_callback, void* arg);
-typedef void              (* ma_jack_on_shutdown_proc)             (ma_jack_client_t* client, ma_JackShutdownCallback function, void* arg);
-typedef ma_jack_nframes_t (* ma_jack_get_sample_rate_proc)         (ma_jack_client_t* client);
-typedef ma_jack_nframes_t (* ma_jack_get_buffer_size_proc)         (ma_jack_client_t* client);
-typedef const char**      (* ma_jack_get_ports_proc)               (ma_jack_client_t* client, const char* port_name_pattern, const char* type_name_pattern, unsigned long flags);
-typedef int               (* ma_jack_activate_proc)                (ma_jack_client_t* client);
-typedef int               (* ma_jack_deactivate_proc)              (ma_jack_client_t* client);
-typedef int               (* ma_jack_connect_proc)                 (ma_jack_client_t* client, const char* source_port, const char* destination_port);
-typedef ma_jack_port_t*   (* ma_jack_port_register_proc)           (ma_jack_client_t* client, const char* port_name, const char* port_type, unsigned long flags, unsigned long buffer_size);
-typedef const char*       (* ma_jack_port_name_proc)               (const ma_jack_port_t* port);
-typedef void*             (* ma_jack_port_get_buffer_proc)         (ma_jack_port_t* port, ma_jack_nframes_t nframes);
-typedef void              (* ma_jack_free_proc)                    (void* ptr);
-
-static ma_result ma_context_open_client__jack(ma_context* pContext, ma_jack_client_t** ppClient)
-{
-    size_t maxClientNameSize;
-    char clientName[256];
-    ma_jack_status_t status;
-    ma_jack_client_t* pClient;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppClient != NULL);
-
-    if (ppClient) {
-        *ppClient = NULL;
-    }
-
-    maxClientNameSize = ((ma_jack_client_name_size_proc)pContext->jack.jack_client_name_size)(); /* Includes null terminator. */
-    ma_strncpy_s(clientName, ma_min(sizeof(clientName), maxClientNameSize), (pContext->jack.pClientName != NULL) ? pContext->jack.pClientName : "miniaudio", (size_t)-1);
-
-    pClient = ((ma_jack_client_open_proc)pContext->jack.jack_client_open)(clientName, (pContext->jack.tryStartServer) ? 0 : ma_JackNoStartServer, &status, NULL);
-    if (pClient == NULL) {
-        return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-    }
-
-    if (ppClient) {
-        *ppClient = pClient;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_enumerate_devices__jack(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_bool32 cbResult = MA_TRUE;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /* Playback. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-        deviceInfo.isDefault = MA_TRUE;    /* JACK only uses default devices. */
-        cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-    }
-
-    /* Capture. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-        deviceInfo.isDefault = MA_TRUE;    /* JACK only uses default devices. */
-        cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-    }
-
-    (void)cbResult; /* For silencing a static analysis warning. */
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__jack(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_jack_client_t* pClient;
-    ma_result result;
-    const char** ppPorts;
-
-    MA_ASSERT(pContext != NULL);
-
-    if (pDeviceID != NULL && pDeviceID->jack != 0) {
-        return MA_NO_DEVICE;   /* Don't know the device. */
-    }
-
-    /* Name / Description */
-    if (deviceType == ma_device_type_playback) {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-    } else {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-    }
-
-    /* Jack only uses default devices. */
-    pDeviceInfo->isDefault = MA_TRUE;
-
-    /* Jack only supports f32 and has a specific channel count and sample rate. */
-    pDeviceInfo->nativeDataFormats[0].format = ma_format_f32;
-
-    /* The channel count and sample rate can only be determined by opening the device. */
-    result = ma_context_open_client__jack(pContext, &pClient);
-    if (result != MA_SUCCESS) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[JACK] Failed to open client.");
-        return result;
-    }
-
-    pDeviceInfo->nativeDataFormats[0].sampleRate = ((ma_jack_get_sample_rate_proc)pContext->jack.jack_get_sample_rate)((ma_jack_client_t*)pClient);
-    pDeviceInfo->nativeDataFormats[0].channels   = 0;
-
-    ppPorts = ((ma_jack_get_ports_proc)pContext->jack.jack_get_ports)((ma_jack_client_t*)pClient, NULL, MA_JACK_DEFAULT_AUDIO_TYPE, ma_JackPortIsPhysical | ((deviceType == ma_device_type_playback) ? ma_JackPortIsInput : ma_JackPortIsOutput));
-    if (ppPorts == NULL) {
-        ((ma_jack_client_close_proc)pContext->jack.jack_client_close)((ma_jack_client_t*)pClient);
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[JACK] Failed to query physical ports.");
-        return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-    }
-
-    while (ppPorts[pDeviceInfo->nativeDataFormats[0].channels] != NULL) {
-        pDeviceInfo->nativeDataFormats[0].channels += 1;
-    }
-
-    pDeviceInfo->nativeDataFormats[0].flags = 0;
-    pDeviceInfo->nativeDataFormatCount = 1;
-
-    ((ma_jack_free_proc)pContext->jack.jack_free)((void*)ppPorts);
-    ((ma_jack_client_close_proc)pContext->jack.jack_client_close)((ma_jack_client_t*)pClient);
-
-    (void)pContext;
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_device_uninit__jack(ma_device* pDevice)
-{
-    ma_context* pContext;
-
-    MA_ASSERT(pDevice != NULL);
-
-    pContext = pDevice->pContext;
-    MA_ASSERT(pContext != NULL);
-
-    if (pDevice->jack.pClient != NULL) {
-        ((ma_jack_client_close_proc)pContext->jack.jack_client_close)((ma_jack_client_t*)pDevice->jack.pClient);
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ma_free(pDevice->jack.pIntermediaryBufferCapture, &pDevice->pContext->allocationCallbacks);
-        ma_free(pDevice->jack.ppPortsCapture, &pDevice->pContext->allocationCallbacks);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_free(pDevice->jack.pIntermediaryBufferPlayback, &pDevice->pContext->allocationCallbacks);
-        ma_free(pDevice->jack.ppPortsPlayback, &pDevice->pContext->allocationCallbacks);
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_device__jack_shutdown_callback(void* pUserData)
-{
-    /* JACK died. Stop the device. */
-    ma_device* pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    ma_device_stop(pDevice);
-}
-
-static int ma_device__jack_buffer_size_callback(ma_jack_nframes_t frameCount, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        size_t newBufferSize = frameCount * (pDevice->capture.internalChannels * ma_get_bytes_per_sample(pDevice->capture.internalFormat));
-        float* pNewBuffer = (float*)ma_calloc(newBufferSize, &pDevice->pContext->allocationCallbacks);
-        if (pNewBuffer == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        ma_free(pDevice->jack.pIntermediaryBufferCapture, &pDevice->pContext->allocationCallbacks);
-
-        pDevice->jack.pIntermediaryBufferCapture = pNewBuffer;
-        pDevice->playback.internalPeriodSizeInFrames = frameCount;
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        size_t newBufferSize = frameCount * (pDevice->playback.internalChannels * ma_get_bytes_per_sample(pDevice->playback.internalFormat));
-        float* pNewBuffer = (float*)ma_calloc(newBufferSize, &pDevice->pContext->allocationCallbacks);
-        if (pNewBuffer == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        ma_free(pDevice->jack.pIntermediaryBufferPlayback, &pDevice->pContext->allocationCallbacks);
-
-        pDevice->jack.pIntermediaryBufferPlayback = pNewBuffer;
-        pDevice->playback.internalPeriodSizeInFrames = frameCount;
-    }
-
-    return 0;
-}
-
-static int ma_device__jack_process_callback(ma_jack_nframes_t frameCount, void* pUserData)
-{
-    ma_device* pDevice;
-    ma_context* pContext;
-    ma_uint32 iChannel;
-
-    pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    pContext = pDevice->pContext;
-    MA_ASSERT(pContext != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        /* Channels need to be interleaved. */
-        for (iChannel = 0; iChannel < pDevice->capture.internalChannels; ++iChannel) {
-            const float* pSrc = (const float*)((ma_jack_port_get_buffer_proc)pContext->jack.jack_port_get_buffer)((ma_jack_port_t*)pDevice->jack.ppPortsCapture[iChannel], frameCount);
-            if (pSrc != NULL) {
-                float* pDst = pDevice->jack.pIntermediaryBufferCapture + iChannel;
-                ma_jack_nframes_t iFrame;
-                for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                    *pDst = *pSrc;
-
-                    pDst += pDevice->capture.internalChannels;
-                    pSrc += 1;
-                }
-            }
-        }
-
-        ma_device_handle_backend_data_callback(pDevice, NULL, pDevice->jack.pIntermediaryBufferCapture, frameCount);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_device_handle_backend_data_callback(pDevice, pDevice->jack.pIntermediaryBufferPlayback, NULL, frameCount);
-
-        /* Channels need to be deinterleaved. */
-        for (iChannel = 0; iChannel < pDevice->playback.internalChannels; ++iChannel) {
-            float* pDst = (float*)((ma_jack_port_get_buffer_proc)pContext->jack.jack_port_get_buffer)((ma_jack_port_t*)pDevice->jack.ppPortsPlayback[iChannel], frameCount);
-            if (pDst != NULL) {
-                const float* pSrc = pDevice->jack.pIntermediaryBufferPlayback + iChannel;
-                ma_jack_nframes_t iFrame;
-                for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                    *pDst = *pSrc;
-
-                    pDst += 1;
-                    pSrc += pDevice->playback.internalChannels;
-                }
-            }
-        }
-    }
-
-    return 0;
-}
-
-static ma_result ma_device_init__jack(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    ma_result result;
-    ma_uint32 periodSizeInFrames;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pDevice != NULL);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Loopback mode not supported.");
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /* Only supporting default devices with JACK. */
-    if (((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pDescriptorPlayback->pDeviceID != NULL && pDescriptorPlayback->pDeviceID->jack != 0) ||
-        ((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pDescriptorCapture->pDeviceID  != NULL && pDescriptorCapture->pDeviceID->jack  != 0)) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Only default devices are supported.");
-        return MA_NO_DEVICE;
-    }
-
-    /* No exclusive mode with the JACK backend. */
-    if (((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pDescriptorPlayback->shareMode == ma_share_mode_exclusive) ||
-        ((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pDescriptorCapture->shareMode  == ma_share_mode_exclusive)) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Exclusive mode not supported.");
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-
-    /* Open the client. */
-    result = ma_context_open_client__jack(pDevice->pContext, (ma_jack_client_t**)&pDevice->jack.pClient);
-    if (result != MA_SUCCESS) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to open client.");
-        return result;
-    }
-
-    /* Callbacks. */
-    if (((ma_jack_set_process_callback_proc)pDevice->pContext->jack.jack_set_process_callback)((ma_jack_client_t*)pDevice->jack.pClient, ma_device__jack_process_callback, pDevice) != 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to set process callback.");
-        return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-    }
-    if (((ma_jack_set_buffer_size_callback_proc)pDevice->pContext->jack.jack_set_buffer_size_callback)((ma_jack_client_t*)pDevice->jack.pClient, ma_device__jack_buffer_size_callback, pDevice) != 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to set buffer size callback.");
-        return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-    }
-
-    ((ma_jack_on_shutdown_proc)pDevice->pContext->jack.jack_on_shutdown)((ma_jack_client_t*)pDevice->jack.pClient, ma_device__jack_shutdown_callback, pDevice);
-
-
-    /* The buffer size in frames can change. */
-    periodSizeInFrames = ((ma_jack_get_buffer_size_proc)pDevice->pContext->jack.jack_get_buffer_size)((ma_jack_client_t*)pDevice->jack.pClient);
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_uint32 iPort;
-        const char** ppPorts;
-
-        pDescriptorCapture->format     = ma_format_f32;
-        pDescriptorCapture->channels   = 0;
-        pDescriptorCapture->sampleRate = ((ma_jack_get_sample_rate_proc)pDevice->pContext->jack.jack_get_sample_rate)((ma_jack_client_t*)pDevice->jack.pClient);
-        ma_channel_map_init_standard(ma_standard_channel_map_alsa, pDescriptorCapture->channelMap, ma_countof(pDescriptorCapture->channelMap), pDescriptorCapture->channels);
-
-        ppPorts = ((ma_jack_get_ports_proc)pDevice->pContext->jack.jack_get_ports)((ma_jack_client_t*)pDevice->jack.pClient, NULL, MA_JACK_DEFAULT_AUDIO_TYPE, ma_JackPortIsPhysical | ma_JackPortIsOutput);
-        if (ppPorts == NULL) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to query physical ports.");
-            return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-        }
-
-        /* Need to count the number of ports first so we can allocate some memory. */
-        while (ppPorts[pDescriptorCapture->channels] != NULL) {
-            pDescriptorCapture->channels += 1;
-        }
-
-        pDevice->jack.ppPortsCapture = (ma_ptr*)ma_malloc(sizeof(*pDevice->jack.ppPortsCapture) * pDescriptorCapture->channels, &pDevice->pContext->allocationCallbacks);
-        if (pDevice->jack.ppPortsCapture == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        for (iPort = 0; iPort < pDescriptorCapture->channels; iPort += 1) {
-            char name[64];
-            ma_strcpy_s(name, sizeof(name), "capture");
-            ma_itoa_s((int)iPort, name+7, sizeof(name)-7, 10); /* 7 = length of "capture" */
-
-            pDevice->jack.ppPortsCapture[iPort] = ((ma_jack_port_register_proc)pDevice->pContext->jack.jack_port_register)((ma_jack_client_t*)pDevice->jack.pClient, name, MA_JACK_DEFAULT_AUDIO_TYPE, ma_JackPortIsInput, 0);
-            if (pDevice->jack.ppPortsCapture[iPort] == NULL) {
-                ((ma_jack_free_proc)pDevice->pContext->jack.jack_free)((void*)ppPorts);
-                ma_device_uninit__jack(pDevice);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to register ports.");
-                return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-            }
-        }
-
-        ((ma_jack_free_proc)pDevice->pContext->jack.jack_free)((void*)ppPorts);
-
-        pDescriptorCapture->periodSizeInFrames = periodSizeInFrames;
-        pDescriptorCapture->periodCount        = 1; /* There's no notion of a period in JACK. Just set to 1. */
-
-        pDevice->jack.pIntermediaryBufferCapture = (float*)ma_calloc(pDescriptorCapture->periodSizeInFrames * ma_get_bytes_per_frame(pDescriptorCapture->format, pDescriptorCapture->channels), &pDevice->pContext->allocationCallbacks);
-        if (pDevice->jack.pIntermediaryBufferCapture == NULL) {
-            ma_device_uninit__jack(pDevice);
-            return MA_OUT_OF_MEMORY;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_uint32 iPort;
-        const char** ppPorts;
-
-        pDescriptorPlayback->format     = ma_format_f32;
-        pDescriptorPlayback->channels   = 0;
-        pDescriptorPlayback->sampleRate = ((ma_jack_get_sample_rate_proc)pDevice->pContext->jack.jack_get_sample_rate)((ma_jack_client_t*)pDevice->jack.pClient);
-        ma_channel_map_init_standard(ma_standard_channel_map_alsa, pDescriptorPlayback->channelMap, ma_countof(pDescriptorPlayback->channelMap), pDescriptorPlayback->channels);
-
-        ppPorts = ((ma_jack_get_ports_proc)pDevice->pContext->jack.jack_get_ports)((ma_jack_client_t*)pDevice->jack.pClient, NULL, MA_JACK_DEFAULT_AUDIO_TYPE, ma_JackPortIsPhysical | ma_JackPortIsInput);
-        if (ppPorts == NULL) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to query physical ports.");
-            return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-        }
-
-        /* Need to count the number of ports first so we can allocate some memory. */
-        while (ppPorts[pDescriptorPlayback->channels] != NULL) {
-            pDescriptorPlayback->channels += 1;
-        }
-
-        pDevice->jack.ppPortsPlayback = (ma_ptr*)ma_malloc(sizeof(*pDevice->jack.ppPortsPlayback) * pDescriptorPlayback->channels, &pDevice->pContext->allocationCallbacks);
-        if (pDevice->jack.ppPortsPlayback == NULL) {
-            ma_free(pDevice->jack.ppPortsCapture, &pDevice->pContext->allocationCallbacks);
-            return MA_OUT_OF_MEMORY;
-        }
-
-        for (iPort = 0; iPort < pDescriptorPlayback->channels; iPort += 1) {
-            char name[64];
-            ma_strcpy_s(name, sizeof(name), "playback");
-            ma_itoa_s((int)iPort, name+8, sizeof(name)-8, 10); /* 8 = length of "playback" */
-
-            pDevice->jack.ppPortsPlayback[iPort] = ((ma_jack_port_register_proc)pDevice->pContext->jack.jack_port_register)((ma_jack_client_t*)pDevice->jack.pClient, name, MA_JACK_DEFAULT_AUDIO_TYPE, ma_JackPortIsOutput, 0);
-            if (pDevice->jack.ppPortsPlayback[iPort] == NULL) {
-                ((ma_jack_free_proc)pDevice->pContext->jack.jack_free)((void*)ppPorts);
-                ma_device_uninit__jack(pDevice);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to register ports.");
-                return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-            }
-        }
-
-        ((ma_jack_free_proc)pDevice->pContext->jack.jack_free)((void*)ppPorts);
-
-        pDescriptorPlayback->periodSizeInFrames = periodSizeInFrames;
-        pDescriptorPlayback->periodCount        = 1;   /* There's no notion of a period in JACK. Just set to 1. */
-
-        pDevice->jack.pIntermediaryBufferPlayback = (float*)ma_calloc(pDescriptorPlayback->periodSizeInFrames * ma_get_bytes_per_frame(pDescriptorPlayback->format, pDescriptorPlayback->channels), &pDevice->pContext->allocationCallbacks);
-        if (pDevice->jack.pIntermediaryBufferPlayback == NULL) {
-            ma_device_uninit__jack(pDevice);
-            return MA_OUT_OF_MEMORY;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_device_start__jack(ma_device* pDevice)
-{
-    ma_context* pContext = pDevice->pContext;
-    int resultJACK;
-    size_t i;
-
-    resultJACK = ((ma_jack_activate_proc)pContext->jack.jack_activate)((ma_jack_client_t*)pDevice->jack.pClient);
-    if (resultJACK != 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to activate the JACK client.");
-        return MA_FAILED_TO_START_BACKEND_DEVICE;
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        const char** ppServerPorts = ((ma_jack_get_ports_proc)pContext->jack.jack_get_ports)((ma_jack_client_t*)pDevice->jack.pClient, NULL, MA_JACK_DEFAULT_AUDIO_TYPE, ma_JackPortIsPhysical | ma_JackPortIsOutput);
-        if (ppServerPorts == NULL) {
-            ((ma_jack_deactivate_proc)pContext->jack.jack_deactivate)((ma_jack_client_t*)pDevice->jack.pClient);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to retrieve physical ports.");
-            return MA_ERROR;
-        }
-
-        for (i = 0; ppServerPorts[i] != NULL; ++i) {
-            const char* pServerPort = ppServerPorts[i];
-            const char* pClientPort = ((ma_jack_port_name_proc)pContext->jack.jack_port_name)((ma_jack_port_t*)pDevice->jack.ppPortsCapture[i]);
-
-            resultJACK = ((ma_jack_connect_proc)pContext->jack.jack_connect)((ma_jack_client_t*)pDevice->jack.pClient, pServerPort, pClientPort);
-            if (resultJACK != 0) {
-                ((ma_jack_free_proc)pContext->jack.jack_free)((void*)ppServerPorts);
-                ((ma_jack_deactivate_proc)pContext->jack.jack_deactivate)((ma_jack_client_t*)pDevice->jack.pClient);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to connect ports.");
-                return MA_ERROR;
-            }
-        }
-
-        ((ma_jack_free_proc)pContext->jack.jack_free)((void*)ppServerPorts);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        const char** ppServerPorts = ((ma_jack_get_ports_proc)pContext->jack.jack_get_ports)((ma_jack_client_t*)pDevice->jack.pClient, NULL, MA_JACK_DEFAULT_AUDIO_TYPE, ma_JackPortIsPhysical | ma_JackPortIsInput);
-        if (ppServerPorts == NULL) {
-            ((ma_jack_deactivate_proc)pContext->jack.jack_deactivate)((ma_jack_client_t*)pDevice->jack.pClient);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to retrieve physical ports.");
-            return MA_ERROR;
-        }
-
-        for (i = 0; ppServerPorts[i] != NULL; ++i) {
-            const char* pServerPort = ppServerPorts[i];
-            const char* pClientPort = ((ma_jack_port_name_proc)pContext->jack.jack_port_name)((ma_jack_port_t*)pDevice->jack.ppPortsPlayback[i]);
-
-            resultJACK = ((ma_jack_connect_proc)pContext->jack.jack_connect)((ma_jack_client_t*)pDevice->jack.pClient, pClientPort, pServerPort);
-            if (resultJACK != 0) {
-                ((ma_jack_free_proc)pContext->jack.jack_free)((void*)ppServerPorts);
-                ((ma_jack_deactivate_proc)pContext->jack.jack_deactivate)((ma_jack_client_t*)pDevice->jack.pClient);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] Failed to connect ports.");
-                return MA_ERROR;
-            }
-        }
-
-        ((ma_jack_free_proc)pContext->jack.jack_free)((void*)ppServerPorts);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__jack(ma_device* pDevice)
-{
-    ma_context* pContext = pDevice->pContext;
-
-    if (((ma_jack_deactivate_proc)pContext->jack.jack_deactivate)((ma_jack_client_t*)pDevice->jack.pClient) != 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[JACK] An error occurred when deactivating the JACK client.");
-        return MA_ERROR;
-    }
-
-    ma_device__on_notification_stopped(pDevice);
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_uninit__jack(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_jack);
-
-    ma_free(pContext->jack.pClientName, &pContext->allocationCallbacks);
-    pContext->jack.pClientName = NULL;
-
-#ifndef MA_NO_RUNTIME_LINKING
-    ma_dlclose(ma_context_get_log(pContext), pContext->jack.jackSO);
-#endif
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__jack(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-#ifndef MA_NO_RUNTIME_LINKING
-    const char* libjackNames[] = {
-#if defined(MA_WIN32)
-        "libjack.dll",
-        "libjack64.dll"
-#endif
-#if defined(MA_UNIX)
-        "libjack.so",
-        "libjack.so.0"
-#endif
-    };
-    size_t i;
-
-    for (i = 0; i < ma_countof(libjackNames); ++i) {
-        pContext->jack.jackSO = ma_dlopen(ma_context_get_log(pContext), libjackNames[i]);
-        if (pContext->jack.jackSO != NULL) {
-            break;
-        }
-    }
-
-    if (pContext->jack.jackSO == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    pContext->jack.jack_client_open              = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_client_open");
-    pContext->jack.jack_client_close             = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_client_close");
-    pContext->jack.jack_client_name_size         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_client_name_size");
-    pContext->jack.jack_set_process_callback     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_set_process_callback");
-    pContext->jack.jack_set_buffer_size_callback = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_set_buffer_size_callback");
-    pContext->jack.jack_on_shutdown              = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_on_shutdown");
-    pContext->jack.jack_get_sample_rate          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_get_sample_rate");
-    pContext->jack.jack_get_buffer_size          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_get_buffer_size");
-    pContext->jack.jack_get_ports                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_get_ports");
-    pContext->jack.jack_activate                 = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_activate");
-    pContext->jack.jack_deactivate               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_deactivate");
-    pContext->jack.jack_connect                  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_connect");
-    pContext->jack.jack_port_register            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_port_register");
-    pContext->jack.jack_port_name                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_port_name");
-    pContext->jack.jack_port_get_buffer          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_port_get_buffer");
-    pContext->jack.jack_free                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->jack.jackSO, "jack_free");
-#else
-    /*
-    This strange assignment system is here just to ensure type safety of miniaudio's function pointer
-    types. If anything differs slightly the compiler should throw a warning.
-    */
-    ma_jack_client_open_proc              _jack_client_open              = jack_client_open;
-    ma_jack_client_close_proc             _jack_client_close             = jack_client_close;
-    ma_jack_client_name_size_proc         _jack_client_name_size         = jack_client_name_size;
-    ma_jack_set_process_callback_proc     _jack_set_process_callback     = jack_set_process_callback;
-    ma_jack_set_buffer_size_callback_proc _jack_set_buffer_size_callback = jack_set_buffer_size_callback;
-    ma_jack_on_shutdown_proc              _jack_on_shutdown              = jack_on_shutdown;
-    ma_jack_get_sample_rate_proc          _jack_get_sample_rate          = jack_get_sample_rate;
-    ma_jack_get_buffer_size_proc          _jack_get_buffer_size          = jack_get_buffer_size;
-    ma_jack_get_ports_proc                _jack_get_ports                = jack_get_ports;
-    ma_jack_activate_proc                 _jack_activate                 = jack_activate;
-    ma_jack_deactivate_proc               _jack_deactivate               = jack_deactivate;
-    ma_jack_connect_proc                  _jack_connect                  = jack_connect;
-    ma_jack_port_register_proc            _jack_port_register            = jack_port_register;
-    ma_jack_port_name_proc                _jack_port_name                = jack_port_name;
-    ma_jack_port_get_buffer_proc          _jack_port_get_buffer          = jack_port_get_buffer;
-    ma_jack_free_proc                     _jack_free                     = jack_free;
-
-    pContext->jack.jack_client_open              = (ma_proc)_jack_client_open;
-    pContext->jack.jack_client_close             = (ma_proc)_jack_client_close;
-    pContext->jack.jack_client_name_size         = (ma_proc)_jack_client_name_size;
-    pContext->jack.jack_set_process_callback     = (ma_proc)_jack_set_process_callback;
-    pContext->jack.jack_set_buffer_size_callback = (ma_proc)_jack_set_buffer_size_callback;
-    pContext->jack.jack_on_shutdown              = (ma_proc)_jack_on_shutdown;
-    pContext->jack.jack_get_sample_rate          = (ma_proc)_jack_get_sample_rate;
-    pContext->jack.jack_get_buffer_size          = (ma_proc)_jack_get_buffer_size;
-    pContext->jack.jack_get_ports                = (ma_proc)_jack_get_ports;
-    pContext->jack.jack_activate                 = (ma_proc)_jack_activate;
-    pContext->jack.jack_deactivate               = (ma_proc)_jack_deactivate;
-    pContext->jack.jack_connect                  = (ma_proc)_jack_connect;
-    pContext->jack.jack_port_register            = (ma_proc)_jack_port_register;
-    pContext->jack.jack_port_name                = (ma_proc)_jack_port_name;
-    pContext->jack.jack_port_get_buffer          = (ma_proc)_jack_port_get_buffer;
-    pContext->jack.jack_free                     = (ma_proc)_jack_free;
-#endif
-
-    if (pConfig->jack.pClientName != NULL) {
-        pContext->jack.pClientName = ma_copy_string(pConfig->jack.pClientName, &pContext->allocationCallbacks);
-    }
-    pContext->jack.tryStartServer = pConfig->jack.tryStartServer;
-
-    /*
-    Getting here means the JACK library is installed, but it doesn't necessarily mean it's usable. We need to quickly test this by connecting
-    a temporary client.
-    */
-    {
-        ma_jack_client_t* pDummyClient;
-        ma_result result = ma_context_open_client__jack(pContext, &pDummyClient);
-        if (result != MA_SUCCESS) {
-            ma_free(pContext->jack.pClientName, &pContext->allocationCallbacks);
-        #ifndef MA_NO_RUNTIME_LINKING
-            ma_dlclose(ma_context_get_log(pContext), pContext->jack.jackSO);
-        #endif
-            return MA_NO_BACKEND;
-        }
-
-        ((ma_jack_client_close_proc)pContext->jack.jack_client_close)((ma_jack_client_t*)pDummyClient);
-    }
-
-
-    pCallbacks->onContextInit             = ma_context_init__jack;
-    pCallbacks->onContextUninit           = ma_context_uninit__jack;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__jack;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__jack;
-    pCallbacks->onDeviceInit              = ma_device_init__jack;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__jack;
-    pCallbacks->onDeviceStart             = ma_device_start__jack;
-    pCallbacks->onDeviceStop              = ma_device_stop__jack;
-    pCallbacks->onDeviceRead              = NULL;   /* Not used because JACK is asynchronous. */
-    pCallbacks->onDeviceWrite             = NULL;   /* Not used because JACK is asynchronous. */
-    pCallbacks->onDeviceDataLoop          = NULL;   /* Not used because JACK is asynchronous. */
-
-    return MA_SUCCESS;
-}
-#endif  /* JACK */
-
-
-
-/******************************************************************************
-
-Core Audio Backend
-
-References
-==========
-- Technical Note TN2091: Device input using the HAL Output Audio Unit
-    https://developer.apple.com/library/archive/technotes/tn2091/_index.html
-
-******************************************************************************/
-#ifdef MA_HAS_COREAUDIO
-#include <TargetConditionals.h>
-
-#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE == 1
-    #define MA_APPLE_MOBILE
-    #if defined(TARGET_OS_TV) && TARGET_OS_TV == 1
-        #define MA_APPLE_TV
-    #endif
-    #if defined(TARGET_OS_WATCH) && TARGET_OS_WATCH == 1
-        #define MA_APPLE_WATCH
-    #endif
-    #if __has_feature(objc_arc)
-        #define MA_BRIDGE_TRANSFER  __bridge_transfer
-        #define MA_BRIDGE_RETAINED  __bridge_retained
-    #else
-        #define MA_BRIDGE_TRANSFER
-        #define MA_BRIDGE_RETAINED
-    #endif
-#else
-    #define MA_APPLE_DESKTOP
-#endif
-
-#if defined(MA_APPLE_DESKTOP)
-#include <CoreAudio/CoreAudio.h>
-#else
-#include <AVFoundation/AVFoundation.h>
-#endif
-
-#include <AudioToolbox/AudioToolbox.h>
-
-/* CoreFoundation */
-typedef Boolean (* ma_CFStringGetCString_proc)(CFStringRef theString, char* buffer, CFIndex bufferSize, CFStringEncoding encoding);
-typedef void (* ma_CFRelease_proc)(CFTypeRef cf);
-
-/* CoreAudio */
-#if defined(MA_APPLE_DESKTOP)
-typedef OSStatus (* ma_AudioObjectGetPropertyData_proc)(AudioObjectID inObjectID, const AudioObjectPropertyAddress* inAddress, UInt32 inQualifierDataSize, const void* inQualifierData, UInt32* ioDataSize, void* outData);
-typedef OSStatus (* ma_AudioObjectGetPropertyDataSize_proc)(AudioObjectID inObjectID, const AudioObjectPropertyAddress* inAddress, UInt32 inQualifierDataSize, const void* inQualifierData, UInt32* outDataSize);
-typedef OSStatus (* ma_AudioObjectSetPropertyData_proc)(AudioObjectID inObjectID, const AudioObjectPropertyAddress* inAddress, UInt32 inQualifierDataSize, const void* inQualifierData, UInt32 inDataSize, const void* inData);
-typedef OSStatus (* ma_AudioObjectAddPropertyListener_proc)(AudioObjectID inObjectID, const AudioObjectPropertyAddress* inAddress, AudioObjectPropertyListenerProc inListener, void* inClientData);
-typedef OSStatus (* ma_AudioObjectRemovePropertyListener_proc)(AudioObjectID inObjectID, const AudioObjectPropertyAddress* inAddress, AudioObjectPropertyListenerProc inListener, void* inClientData);
-#endif
-
-/* AudioToolbox */
-typedef AudioComponent (* ma_AudioComponentFindNext_proc)(AudioComponent inComponent, const AudioComponentDescription* inDesc);
-typedef OSStatus (* ma_AudioComponentInstanceDispose_proc)(AudioComponentInstance inInstance);
-typedef OSStatus (* ma_AudioComponentInstanceNew_proc)(AudioComponent inComponent, AudioComponentInstance* outInstance);
-typedef OSStatus (* ma_AudioOutputUnitStart_proc)(AudioUnit inUnit);
-typedef OSStatus (* ma_AudioOutputUnitStop_proc)(AudioUnit inUnit);
-typedef OSStatus (* ma_AudioUnitAddPropertyListener_proc)(AudioUnit inUnit, AudioUnitPropertyID inID, AudioUnitPropertyListenerProc inProc, void* inProcUserData);
-typedef OSStatus (* ma_AudioUnitGetPropertyInfo_proc)(AudioUnit inUnit, AudioUnitPropertyID inID, AudioUnitScope inScope, AudioUnitElement inElement, UInt32* outDataSize, Boolean* outWriteable);
-typedef OSStatus (* ma_AudioUnitGetProperty_proc)(AudioUnit inUnit, AudioUnitPropertyID inID, AudioUnitScope inScope, AudioUnitElement inElement, void* outData, UInt32* ioDataSize);
-typedef OSStatus (* ma_AudioUnitSetProperty_proc)(AudioUnit inUnit, AudioUnitPropertyID inID, AudioUnitScope inScope, AudioUnitElement inElement, const void* inData, UInt32 inDataSize);
-typedef OSStatus (* ma_AudioUnitInitialize_proc)(AudioUnit inUnit);
-typedef OSStatus (* ma_AudioUnitRender_proc)(AudioUnit inUnit, AudioUnitRenderActionFlags* ioActionFlags, const AudioTimeStamp* inTimeStamp, UInt32 inOutputBusNumber, UInt32 inNumberFrames, AudioBufferList* ioData);
-
-
-#define MA_COREAUDIO_OUTPUT_BUS    0
-#define MA_COREAUDIO_INPUT_BUS     1
-
-#if defined(MA_APPLE_DESKTOP)
-static ma_result ma_device_reinit_internal__coreaudio(ma_device* pDevice, ma_device_type deviceType, ma_bool32 disposePreviousAudioUnit);
-#endif
-
-/*
-Core Audio
-
-So far, Core Audio has been the worst backend to work with due to being both unintuitive and having almost no documentation
-apart from comments in the headers (which admittedly are quite good). For my own purposes, and for anybody out there whose
-needing to figure out how this darn thing works, I'm going to outline a few things here.
-
-Since miniaudio is a fairly low-level API, one of the things it needs is control over specific devices, and it needs to be
-able to identify whether or not it can be used as playback and/or capture. The AudioObject API is the only one I've seen
-that supports this level of detail. There was some public domain sample code I stumbled across that used the AudioComponent
-and AudioUnit APIs, but I couldn't see anything that gave low-level control over device selection and capabilities (the
-distinction between playback and capture in particular). Therefore, miniaudio is using the AudioObject API.
-
-Most (all?) functions in the AudioObject API take a AudioObjectID as it's input. This is the device identifier. When
-retrieving global information, such as the device list, you use kAudioObjectSystemObject. When retrieving device-specific
-data, you pass in the ID for that device. In order to retrieve device-specific IDs you need to enumerate over each of the
-devices. This is done using the AudioObjectGetPropertyDataSize() and AudioObjectGetPropertyData() APIs which seem to be
-the central APIs for retrieving information about the system and specific devices.
-
-To use the AudioObjectGetPropertyData() API you need to use the notion of a property address. A property address is a
-structure with three variables and is used to identify which property you are getting or setting. The first is the "selector"
-which is basically the specific property that you're wanting to retrieve or set. The second is the "scope", which is
-typically set to kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyScopeInput for input-specific properties and
-kAudioObjectPropertyScopeOutput for output-specific properties. The last is the "element" which is always set to
-kAudioObjectPropertyElementMain in miniaudio's case. I don't know of any cases where this would be set to anything different.
-
-Back to the earlier issue of device retrieval, you first use the AudioObjectGetPropertyDataSize() API to retrieve the size
-of the raw data which is just a list of AudioDeviceID's. You use the kAudioObjectSystemObject AudioObjectID, and a property
-address with the kAudioHardwarePropertyDevices selector and the kAudioObjectPropertyScopeGlobal scope. Once you have the
-size, allocate a block of memory of that size and then call AudioObjectGetPropertyData(). The data is just a list of
-AudioDeviceID's so just do "dataSize/sizeof(AudioDeviceID)" to know the device count.
-*/
-
-#if defined(MA_APPLE_MOBILE)
-static void ma_device__on_notification_interruption_began(ma_device* pDevice)
-{
-    ma_device__on_notification(ma_device_notification_init(pDevice, ma_device_notification_type_interruption_began));
-}
-
-static void ma_device__on_notification_interruption_ended(ma_device* pDevice)
-{
-    ma_device__on_notification(ma_device_notification_init(pDevice, ma_device_notification_type_interruption_ended));
-}
-#endif
-
-static ma_result ma_result_from_OSStatus(OSStatus status)
-{
-    switch (status)
-    {
-        case noErr:                                   return MA_SUCCESS;
-    #if defined(MA_APPLE_DESKTOP)
-        case kAudioHardwareNotRunningError:           return MA_DEVICE_NOT_STARTED;
-        case kAudioHardwareUnspecifiedError:          return MA_ERROR;
-        case kAudioHardwareUnknownPropertyError:      return MA_INVALID_ARGS;
-        case kAudioHardwareBadPropertySizeError:      return MA_INVALID_OPERATION;
-        case kAudioHardwareIllegalOperationError:     return MA_INVALID_OPERATION;
-        case kAudioHardwareBadObjectError:            return MA_INVALID_ARGS;
-        case kAudioHardwareBadDeviceError:            return MA_INVALID_ARGS;
-        case kAudioHardwareBadStreamError:            return MA_INVALID_ARGS;
-        case kAudioHardwareUnsupportedOperationError: return MA_INVALID_OPERATION;
-        case kAudioDeviceUnsupportedFormatError:      return MA_FORMAT_NOT_SUPPORTED;
-        case kAudioDevicePermissionsError:            return MA_ACCESS_DENIED;
-    #endif
-        default:                                      return MA_ERROR;
-    }
-}
-
-#if 0
-static ma_channel ma_channel_from_AudioChannelBitmap(AudioChannelBitmap bit)
-{
-    switch (bit)
-    {
-        case kAudioChannelBit_Left:                 return MA_CHANNEL_LEFT;
-        case kAudioChannelBit_Right:                return MA_CHANNEL_RIGHT;
-        case kAudioChannelBit_Center:               return MA_CHANNEL_FRONT_CENTER;
-        case kAudioChannelBit_LFEScreen:            return MA_CHANNEL_LFE;
-        case kAudioChannelBit_LeftSurround:         return MA_CHANNEL_BACK_LEFT;
-        case kAudioChannelBit_RightSurround:        return MA_CHANNEL_BACK_RIGHT;
-        case kAudioChannelBit_LeftCenter:           return MA_CHANNEL_FRONT_LEFT_CENTER;
-        case kAudioChannelBit_RightCenter:          return MA_CHANNEL_FRONT_RIGHT_CENTER;
-        case kAudioChannelBit_CenterSurround:       return MA_CHANNEL_BACK_CENTER;
-        case kAudioChannelBit_LeftSurroundDirect:   return MA_CHANNEL_SIDE_LEFT;
-        case kAudioChannelBit_RightSurroundDirect:  return MA_CHANNEL_SIDE_RIGHT;
-        case kAudioChannelBit_TopCenterSurround:    return MA_CHANNEL_TOP_CENTER;
-        case kAudioChannelBit_VerticalHeightLeft:   return MA_CHANNEL_TOP_FRONT_LEFT;
-        case kAudioChannelBit_VerticalHeightCenter: return MA_CHANNEL_TOP_FRONT_CENTER;
-        case kAudioChannelBit_VerticalHeightRight:  return MA_CHANNEL_TOP_FRONT_RIGHT;
-        case kAudioChannelBit_TopBackLeft:          return MA_CHANNEL_TOP_BACK_LEFT;
-        case kAudioChannelBit_TopBackCenter:        return MA_CHANNEL_TOP_BACK_CENTER;
-        case kAudioChannelBit_TopBackRight:         return MA_CHANNEL_TOP_BACK_RIGHT;
-        default:                                    return MA_CHANNEL_NONE;
-    }
-}
-#endif
-
-static ma_result ma_format_from_AudioStreamBasicDescription(const AudioStreamBasicDescription* pDescription, ma_format* pFormatOut)
-{
-    MA_ASSERT(pDescription != NULL);
-    MA_ASSERT(pFormatOut != NULL);
-
-    *pFormatOut = ma_format_unknown;   /* Safety. */
-
-    /* There's a few things miniaudio doesn't support. */
-    if (pDescription->mFormatID != kAudioFormatLinearPCM) {
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-
-    /* We don't support any non-packed formats that are aligned high. */
-    if ((pDescription->mFormatFlags & kLinearPCMFormatFlagIsAlignedHigh) != 0) {
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-
-    /* Only supporting native-endian. */
-    if ((ma_is_little_endian() && (pDescription->mFormatFlags & kAudioFormatFlagIsBigEndian) != 0) || (ma_is_big_endian() && (pDescription->mFormatFlags & kAudioFormatFlagIsBigEndian) == 0)) {
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-
-    /* We are not currently supporting non-interleaved formats (this will be added in a future version of miniaudio). */
-    /*if ((pDescription->mFormatFlags & kAudioFormatFlagIsNonInterleaved) != 0) {
-        return MA_FORMAT_NOT_SUPPORTED;
-    }*/
-
-    if ((pDescription->mFormatFlags & kLinearPCMFormatFlagIsFloat) != 0) {
-        if (pDescription->mBitsPerChannel == 32) {
-            *pFormatOut = ma_format_f32;
-            return MA_SUCCESS;
-        }
-    } else {
-        if ((pDescription->mFormatFlags & kLinearPCMFormatFlagIsSignedInteger) != 0) {
-            if (pDescription->mBitsPerChannel == 16) {
-                *pFormatOut = ma_format_s16;
-                return MA_SUCCESS;
-            } else if (pDescription->mBitsPerChannel == 24) {
-                if (pDescription->mBytesPerFrame == (pDescription->mBitsPerChannel/8 * pDescription->mChannelsPerFrame)) {
-                    *pFormatOut = ma_format_s24;
-                    return MA_SUCCESS;
-                } else {
-                    if (pDescription->mBytesPerFrame/pDescription->mChannelsPerFrame == sizeof(ma_int32)) {
-                        /* TODO: Implement ma_format_s24_32. */
-                        /**pFormatOut = ma_format_s24_32;*/
-                        /*return MA_SUCCESS;*/
-                        return MA_FORMAT_NOT_SUPPORTED;
-                    }
-                }
-            } else if (pDescription->mBitsPerChannel == 32) {
-                *pFormatOut = ma_format_s32;
-                return MA_SUCCESS;
-            }
-        } else {
-            if (pDescription->mBitsPerChannel == 8) {
-                *pFormatOut = ma_format_u8;
-                return MA_SUCCESS;
-            }
-        }
-    }
-
-    /* Getting here means the format is not supported. */
-    return MA_FORMAT_NOT_SUPPORTED;
-}
-
-#if defined(MA_APPLE_DESKTOP)
-static ma_channel ma_channel_from_AudioChannelLabel(AudioChannelLabel label)
-{
-    switch (label)
-    {
-        case kAudioChannelLabel_Unknown:              return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Unused:               return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_UseCoordinates:       return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Left:                 return MA_CHANNEL_LEFT;
-        case kAudioChannelLabel_Right:                return MA_CHANNEL_RIGHT;
-        case kAudioChannelLabel_Center:               return MA_CHANNEL_FRONT_CENTER;
-        case kAudioChannelLabel_LFEScreen:            return MA_CHANNEL_LFE;
-        case kAudioChannelLabel_LeftSurround:         return MA_CHANNEL_BACK_LEFT;
-        case kAudioChannelLabel_RightSurround:        return MA_CHANNEL_BACK_RIGHT;
-        case kAudioChannelLabel_LeftCenter:           return MA_CHANNEL_FRONT_LEFT_CENTER;
-        case kAudioChannelLabel_RightCenter:          return MA_CHANNEL_FRONT_RIGHT_CENTER;
-        case kAudioChannelLabel_CenterSurround:       return MA_CHANNEL_BACK_CENTER;
-        case kAudioChannelLabel_LeftSurroundDirect:   return MA_CHANNEL_SIDE_LEFT;
-        case kAudioChannelLabel_RightSurroundDirect:  return MA_CHANNEL_SIDE_RIGHT;
-        case kAudioChannelLabel_TopCenterSurround:    return MA_CHANNEL_TOP_CENTER;
-        case kAudioChannelLabel_VerticalHeightLeft:   return MA_CHANNEL_TOP_FRONT_LEFT;
-        case kAudioChannelLabel_VerticalHeightCenter: return MA_CHANNEL_TOP_FRONT_CENTER;
-        case kAudioChannelLabel_VerticalHeightRight:  return MA_CHANNEL_TOP_FRONT_RIGHT;
-        case kAudioChannelLabel_TopBackLeft:          return MA_CHANNEL_TOP_BACK_LEFT;
-        case kAudioChannelLabel_TopBackCenter:        return MA_CHANNEL_TOP_BACK_CENTER;
-        case kAudioChannelLabel_TopBackRight:         return MA_CHANNEL_TOP_BACK_RIGHT;
-        case kAudioChannelLabel_RearSurroundLeft:     return MA_CHANNEL_BACK_LEFT;
-        case kAudioChannelLabel_RearSurroundRight:    return MA_CHANNEL_BACK_RIGHT;
-        case kAudioChannelLabel_LeftWide:             return MA_CHANNEL_SIDE_LEFT;
-        case kAudioChannelLabel_RightWide:            return MA_CHANNEL_SIDE_RIGHT;
-        case kAudioChannelLabel_LFE2:                 return MA_CHANNEL_LFE;
-        case kAudioChannelLabel_LeftTotal:            return MA_CHANNEL_LEFT;
-        case kAudioChannelLabel_RightTotal:           return MA_CHANNEL_RIGHT;
-        case kAudioChannelLabel_HearingImpaired:      return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Narration:            return MA_CHANNEL_MONO;
-        case kAudioChannelLabel_Mono:                 return MA_CHANNEL_MONO;
-        case kAudioChannelLabel_DialogCentricMix:     return MA_CHANNEL_MONO;
-        case kAudioChannelLabel_CenterSurroundDirect: return MA_CHANNEL_BACK_CENTER;
-        case kAudioChannelLabel_Haptic:               return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Ambisonic_W:          return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Ambisonic_X:          return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Ambisonic_Y:          return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Ambisonic_Z:          return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_MS_Mid:               return MA_CHANNEL_LEFT;
-        case kAudioChannelLabel_MS_Side:              return MA_CHANNEL_RIGHT;
-        case kAudioChannelLabel_XY_X:                 return MA_CHANNEL_LEFT;
-        case kAudioChannelLabel_XY_Y:                 return MA_CHANNEL_RIGHT;
-        case kAudioChannelLabel_HeadphonesLeft:       return MA_CHANNEL_LEFT;
-        case kAudioChannelLabel_HeadphonesRight:      return MA_CHANNEL_RIGHT;
-        case kAudioChannelLabel_ClickTrack:           return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_ForeignLanguage:      return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Discrete:             return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_Discrete_0:           return MA_CHANNEL_AUX_0;
-        case kAudioChannelLabel_Discrete_1:           return MA_CHANNEL_AUX_1;
-        case kAudioChannelLabel_Discrete_2:           return MA_CHANNEL_AUX_2;
-        case kAudioChannelLabel_Discrete_3:           return MA_CHANNEL_AUX_3;
-        case kAudioChannelLabel_Discrete_4:           return MA_CHANNEL_AUX_4;
-        case kAudioChannelLabel_Discrete_5:           return MA_CHANNEL_AUX_5;
-        case kAudioChannelLabel_Discrete_6:           return MA_CHANNEL_AUX_6;
-        case kAudioChannelLabel_Discrete_7:           return MA_CHANNEL_AUX_7;
-        case kAudioChannelLabel_Discrete_8:           return MA_CHANNEL_AUX_8;
-        case kAudioChannelLabel_Discrete_9:           return MA_CHANNEL_AUX_9;
-        case kAudioChannelLabel_Discrete_10:          return MA_CHANNEL_AUX_10;
-        case kAudioChannelLabel_Discrete_11:          return MA_CHANNEL_AUX_11;
-        case kAudioChannelLabel_Discrete_12:          return MA_CHANNEL_AUX_12;
-        case kAudioChannelLabel_Discrete_13:          return MA_CHANNEL_AUX_13;
-        case kAudioChannelLabel_Discrete_14:          return MA_CHANNEL_AUX_14;
-        case kAudioChannelLabel_Discrete_15:          return MA_CHANNEL_AUX_15;
-        case kAudioChannelLabel_Discrete_65535:       return MA_CHANNEL_NONE;
-
-    #if 0   /* Introduced in a later version of macOS. */
-        case kAudioChannelLabel_HOA_ACN:              return MA_CHANNEL_NONE;
-        case kAudioChannelLabel_HOA_ACN_0:            return MA_CHANNEL_AUX_0;
-        case kAudioChannelLabel_HOA_ACN_1:            return MA_CHANNEL_AUX_1;
-        case kAudioChannelLabel_HOA_ACN_2:            return MA_CHANNEL_AUX_2;
-        case kAudioChannelLabel_HOA_ACN_3:            return MA_CHANNEL_AUX_3;
-        case kAudioChannelLabel_HOA_ACN_4:            return MA_CHANNEL_AUX_4;
-        case kAudioChannelLabel_HOA_ACN_5:            return MA_CHANNEL_AUX_5;
-        case kAudioChannelLabel_HOA_ACN_6:            return MA_CHANNEL_AUX_6;
-        case kAudioChannelLabel_HOA_ACN_7:            return MA_CHANNEL_AUX_7;
-        case kAudioChannelLabel_HOA_ACN_8:            return MA_CHANNEL_AUX_8;
-        case kAudioChannelLabel_HOA_ACN_9:            return MA_CHANNEL_AUX_9;
-        case kAudioChannelLabel_HOA_ACN_10:           return MA_CHANNEL_AUX_10;
-        case kAudioChannelLabel_HOA_ACN_11:           return MA_CHANNEL_AUX_11;
-        case kAudioChannelLabel_HOA_ACN_12:           return MA_CHANNEL_AUX_12;
-        case kAudioChannelLabel_HOA_ACN_13:           return MA_CHANNEL_AUX_13;
-        case kAudioChannelLabel_HOA_ACN_14:           return MA_CHANNEL_AUX_14;
-        case kAudioChannelLabel_HOA_ACN_15:           return MA_CHANNEL_AUX_15;
-        case kAudioChannelLabel_HOA_ACN_65024:        return MA_CHANNEL_NONE;
-    #endif
-
-        default:                                      return MA_CHANNEL_NONE;
-    }
-}
-
-static ma_result ma_get_channel_map_from_AudioChannelLayout(AudioChannelLayout* pChannelLayout, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    MA_ASSERT(pChannelLayout != NULL);
-
-    if (pChannelLayout->mChannelLayoutTag == kAudioChannelLayoutTag_UseChannelDescriptions) {
-        UInt32 iChannel;
-        for (iChannel = 0; iChannel < pChannelLayout->mNumberChannelDescriptions && iChannel < channelMapCap; ++iChannel) {
-            pChannelMap[iChannel] = ma_channel_from_AudioChannelLabel(pChannelLayout->mChannelDescriptions[iChannel].mChannelLabel);
-        }
-    } else
-#if 0
-    if (pChannelLayout->mChannelLayoutTag == kAudioChannelLayoutTag_UseChannelBitmap) {
-        /* This is the same kind of system that's used by Windows audio APIs. */
-        UInt32 iChannel = 0;
-        UInt32 iBit;
-        AudioChannelBitmap bitmap = pChannelLayout->mChannelBitmap;
-        for (iBit = 0; iBit < 32 && iChannel < channelMapCap; ++iBit) {
-            AudioChannelBitmap bit = bitmap & (1 << iBit);
-            if (bit != 0) {
-                pChannelMap[iChannel++] = ma_channel_from_AudioChannelBit(bit);
-            }
-        }
-    } else
-#endif
-    {
-        /*
-        Need to use the tag to determine the channel map. For now I'm just assuming a default channel map, but later on this should
-        be updated to determine the mapping based on the tag.
-        */
-        UInt32 channelCount;
-
-        /* Our channel map retrieval APIs below take 32-bit integers, so we'll want to clamp the channel map capacity. */
-        if (channelMapCap > 0xFFFFFFFF) {
-            channelMapCap = 0xFFFFFFFF;
-        }
-
-        channelCount = ma_min(AudioChannelLayoutTag_GetNumberOfChannels(pChannelLayout->mChannelLayoutTag), (UInt32)channelMapCap);
-
-        switch (pChannelLayout->mChannelLayoutTag)
-        {
-            case kAudioChannelLayoutTag_Mono:
-            case kAudioChannelLayoutTag_Stereo:
-            case kAudioChannelLayoutTag_StereoHeadphones:
-            case kAudioChannelLayoutTag_MatrixStereo:
-            case kAudioChannelLayoutTag_MidSide:
-            case kAudioChannelLayoutTag_XY:
-            case kAudioChannelLayoutTag_Binaural:
-            case kAudioChannelLayoutTag_Ambisonic_B_Format:
-            {
-                ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, channelCount);
-            } break;
-
-            case kAudioChannelLayoutTag_Octagonal:
-            {
-                pChannelMap[7] = MA_CHANNEL_SIDE_RIGHT;
-                pChannelMap[6] = MA_CHANNEL_SIDE_LEFT;
-            } MA_FALLTHROUGH; /* Intentional fallthrough. */
-            case kAudioChannelLayoutTag_Hexagonal:
-            {
-                pChannelMap[5] = MA_CHANNEL_BACK_CENTER;
-            } MA_FALLTHROUGH; /* Intentional fallthrough. */
-            case kAudioChannelLayoutTag_Pentagonal:
-            {
-                pChannelMap[4] = MA_CHANNEL_FRONT_CENTER;
-            } MA_FALLTHROUGH; /* Intentional fallthrough. */
-            case kAudioChannelLayoutTag_Quadraphonic:
-            {
-                pChannelMap[3] = MA_CHANNEL_BACK_RIGHT;
-                pChannelMap[2] = MA_CHANNEL_BACK_LEFT;
-                pChannelMap[1] = MA_CHANNEL_RIGHT;
-                pChannelMap[0] = MA_CHANNEL_LEFT;
-            } break;
-
-            /* TODO: Add support for more tags here. */
-
-            default:
-            {
-                ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, channelCount);
-            } break;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-#if (defined(MAC_OS_VERSION_12_0) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_12_0) || \
-    (defined(__IPHONE_15_0) && __IPHONE_OS_VERSION_MAX_ALLOWED >= __IPHONE_15_0)
-#define AUDIO_OBJECT_PROPERTY_ELEMENT kAudioObjectPropertyElementMain
-#else
-/* kAudioObjectPropertyElementMaster is deprecated. */
-#define AUDIO_OBJECT_PROPERTY_ELEMENT kAudioObjectPropertyElementMaster
-#endif
-
-static ma_result ma_get_device_object_ids__coreaudio(ma_context* pContext, UInt32* pDeviceCount, AudioObjectID** ppDeviceObjectIDs) /* NOTE: Free the returned buffer with ma_free(). */
-{
-    AudioObjectPropertyAddress propAddressDevices;
-    UInt32 deviceObjectsDataSize;
-    OSStatus status;
-    AudioObjectID* pDeviceObjectIDs;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pDeviceCount != NULL);
-    MA_ASSERT(ppDeviceObjectIDs != NULL);
-
-    /* Safety. */
-    *pDeviceCount = 0;
-    *ppDeviceObjectIDs = NULL;
-
-    propAddressDevices.mSelector = kAudioHardwarePropertyDevices;
-    propAddressDevices.mScope    = kAudioObjectPropertyScopeGlobal;
-    propAddressDevices.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    status = ((ma_AudioObjectGetPropertyDataSize_proc)pContext->coreaudio.AudioObjectGetPropertyDataSize)(kAudioObjectSystemObject, &propAddressDevices, 0, NULL, &deviceObjectsDataSize);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    pDeviceObjectIDs = (AudioObjectID*)ma_malloc(deviceObjectsDataSize, &pContext->allocationCallbacks);
-    if (pDeviceObjectIDs == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(kAudioObjectSystemObject, &propAddressDevices, 0, NULL, &deviceObjectsDataSize, pDeviceObjectIDs);
-    if (status != noErr) {
-        ma_free(pDeviceObjectIDs, &pContext->allocationCallbacks);
-        return ma_result_from_OSStatus(status);
-    }
-
-    *pDeviceCount = deviceObjectsDataSize / sizeof(AudioObjectID);
-    *ppDeviceObjectIDs = pDeviceObjectIDs;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_get_AudioObject_uid_as_CFStringRef(ma_context* pContext, AudioObjectID objectID, CFStringRef* pUID)
-{
-    AudioObjectPropertyAddress propAddress;
-    UInt32 dataSize;
-    OSStatus status;
-
-    MA_ASSERT(pContext != NULL);
-
-    propAddress.mSelector = kAudioDevicePropertyDeviceUID;
-    propAddress.mScope    = kAudioObjectPropertyScopeGlobal;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    dataSize = sizeof(*pUID);
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(objectID, &propAddress, 0, NULL, &dataSize, pUID);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_get_AudioObject_uid(ma_context* pContext, AudioObjectID objectID, size_t bufferSize, char* bufferOut)
-{
-    CFStringRef uid;
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-
-    result = ma_get_AudioObject_uid_as_CFStringRef(pContext, objectID, &uid);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (!((ma_CFStringGetCString_proc)pContext->coreaudio.CFStringGetCString)(uid, bufferOut, bufferSize, kCFStringEncodingUTF8)) {
-        return MA_ERROR;
-    }
-
-    ((ma_CFRelease_proc)pContext->coreaudio.CFRelease)(uid);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_get_AudioObject_name(ma_context* pContext, AudioObjectID objectID, size_t bufferSize, char* bufferOut)
-{
-    AudioObjectPropertyAddress propAddress;
-    CFStringRef deviceName = NULL;
-    UInt32 dataSize;
-    OSStatus status;
-
-    MA_ASSERT(pContext != NULL);
-
-    propAddress.mSelector = kAudioDevicePropertyDeviceNameCFString;
-    propAddress.mScope    = kAudioObjectPropertyScopeGlobal;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    dataSize = sizeof(deviceName);
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(objectID, &propAddress, 0, NULL, &dataSize, &deviceName);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    if (!((ma_CFStringGetCString_proc)pContext->coreaudio.CFStringGetCString)(deviceName, bufferOut, bufferSize, kCFStringEncodingUTF8)) {
-        return MA_ERROR;
-    }
-
-    ((ma_CFRelease_proc)pContext->coreaudio.CFRelease)(deviceName);
-    return MA_SUCCESS;
-}
-
-static ma_bool32 ma_does_AudioObject_support_scope(ma_context* pContext, AudioObjectID deviceObjectID, AudioObjectPropertyScope scope)
-{
-    AudioObjectPropertyAddress propAddress;
-    UInt32 dataSize;
-    OSStatus status;
-    AudioBufferList* pBufferList;
-    ma_bool32 isSupported;
-
-    MA_ASSERT(pContext != NULL);
-
-    /* To know whether or not a device is an input device we need ot look at the stream configuration. If it has an output channel it's a playback device. */
-    propAddress.mSelector = kAudioDevicePropertyStreamConfiguration;
-    propAddress.mScope    = scope;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    status = ((ma_AudioObjectGetPropertyDataSize_proc)pContext->coreaudio.AudioObjectGetPropertyDataSize)(deviceObjectID, &propAddress, 0, NULL, &dataSize);
-    if (status != noErr) {
-        return MA_FALSE;
-    }
-
-    pBufferList = (AudioBufferList*)ma_malloc(dataSize, &pContext->allocationCallbacks);
-    if (pBufferList == NULL) {
-        return MA_FALSE;   /* Out of memory. */
-    }
-
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(deviceObjectID, &propAddress, 0, NULL, &dataSize, pBufferList);
-    if (status != noErr) {
-        ma_free(pBufferList, &pContext->allocationCallbacks);
-        return MA_FALSE;
-    }
-
-    isSupported = MA_FALSE;
-    if (pBufferList->mNumberBuffers > 0) {
-        isSupported = MA_TRUE;
-    }
-
-    ma_free(pBufferList, &pContext->allocationCallbacks);
-    return isSupported;
-}
-
-static ma_bool32 ma_does_AudioObject_support_playback(ma_context* pContext, AudioObjectID deviceObjectID)
-{
-    return ma_does_AudioObject_support_scope(pContext, deviceObjectID, kAudioObjectPropertyScopeOutput);
-}
-
-static ma_bool32 ma_does_AudioObject_support_capture(ma_context* pContext, AudioObjectID deviceObjectID)
-{
-    return ma_does_AudioObject_support_scope(pContext, deviceObjectID, kAudioObjectPropertyScopeInput);
-}
-
-
-static ma_result ma_get_AudioObject_stream_descriptions(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, UInt32* pDescriptionCount, AudioStreamRangedDescription** ppDescriptions) /* NOTE: Free the returned pointer with ma_free(). */
-{
-    AudioObjectPropertyAddress propAddress;
-    UInt32 dataSize;
-    OSStatus status;
-    AudioStreamRangedDescription* pDescriptions;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pDescriptionCount != NULL);
-    MA_ASSERT(ppDescriptions != NULL);
-
-    /*
-    TODO: Experiment with kAudioStreamPropertyAvailablePhysicalFormats instead of (or in addition to) kAudioStreamPropertyAvailableVirtualFormats. My
-          MacBook Pro uses s24/32 format, however, which miniaudio does not currently support.
-    */
-    propAddress.mSelector = kAudioStreamPropertyAvailableVirtualFormats; /*kAudioStreamPropertyAvailablePhysicalFormats;*/
-    propAddress.mScope    = (deviceType == ma_device_type_playback) ? kAudioObjectPropertyScopeOutput : kAudioObjectPropertyScopeInput;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    status = ((ma_AudioObjectGetPropertyDataSize_proc)pContext->coreaudio.AudioObjectGetPropertyDataSize)(deviceObjectID, &propAddress, 0, NULL, &dataSize);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    pDescriptions = (AudioStreamRangedDescription*)ma_malloc(dataSize, &pContext->allocationCallbacks);
-    if (pDescriptions == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(deviceObjectID, &propAddress, 0, NULL, &dataSize, pDescriptions);
-    if (status != noErr) {
-        ma_free(pDescriptions, &pContext->allocationCallbacks);
-        return ma_result_from_OSStatus(status);
-    }
-
-    *pDescriptionCount = dataSize / sizeof(*pDescriptions);
-    *ppDescriptions = pDescriptions;
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_get_AudioObject_channel_layout(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, AudioChannelLayout** ppChannelLayout)   /* NOTE: Free the returned pointer with ma_free(). */
-{
-    AudioObjectPropertyAddress propAddress;
-    UInt32 dataSize;
-    OSStatus status;
-    AudioChannelLayout* pChannelLayout;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(ppChannelLayout != NULL);
-
-    *ppChannelLayout = NULL;    /* Safety. */
-
-    propAddress.mSelector = kAudioDevicePropertyPreferredChannelLayout;
-    propAddress.mScope    = (deviceType == ma_device_type_playback) ? kAudioObjectPropertyScopeOutput : kAudioObjectPropertyScopeInput;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    status = ((ma_AudioObjectGetPropertyDataSize_proc)pContext->coreaudio.AudioObjectGetPropertyDataSize)(deviceObjectID, &propAddress, 0, NULL, &dataSize);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    pChannelLayout = (AudioChannelLayout*)ma_malloc(dataSize, &pContext->allocationCallbacks);
-    if (pChannelLayout == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(deviceObjectID, &propAddress, 0, NULL, &dataSize, pChannelLayout);
-    if (status != noErr) {
-        ma_free(pChannelLayout, &pContext->allocationCallbacks);
-        return ma_result_from_OSStatus(status);
-    }
-
-    *ppChannelLayout = pChannelLayout;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_get_AudioObject_channel_count(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, ma_uint32* pChannelCount)
-{
-    AudioChannelLayout* pChannelLayout;
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pChannelCount != NULL);
-
-    *pChannelCount = 0; /* Safety. */
-
-    result = ma_get_AudioObject_channel_layout(pContext, deviceObjectID, deviceType, &pChannelLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pChannelLayout->mChannelLayoutTag == kAudioChannelLayoutTag_UseChannelDescriptions) {
-        *pChannelCount = pChannelLayout->mNumberChannelDescriptions;
-    } else if (pChannelLayout->mChannelLayoutTag == kAudioChannelLayoutTag_UseChannelBitmap) {
-        *pChannelCount = ma_count_set_bits(pChannelLayout->mChannelBitmap);
-    } else {
-        *pChannelCount = AudioChannelLayoutTag_GetNumberOfChannels(pChannelLayout->mChannelLayoutTag);
-    }
-
-    ma_free(pChannelLayout, &pContext->allocationCallbacks);
-    return MA_SUCCESS;
-}
-
-#if 0
-static ma_result ma_get_AudioObject_channel_map(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    AudioChannelLayout* pChannelLayout;
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-
-    result = ma_get_AudioObject_channel_layout(pContext, deviceObjectID, deviceType, &pChannelLayout);
-    if (result != MA_SUCCESS) {
-        return result;  /* Rather than always failing here, would it be more robust to simply assume a default? */
-    }
-
-    result = ma_get_channel_map_from_AudioChannelLayout(pChannelLayout, pChannelMap, channelMapCap);
-    if (result != MA_SUCCESS) {
-        ma_free(pChannelLayout, &pContext->allocationCallbacks);
-        return result;
-    }
-
-    ma_free(pChannelLayout, &pContext->allocationCallbacks);
-    return result;
-}
-#endif
-
-static ma_result ma_get_AudioObject_sample_rates(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, UInt32* pSampleRateRangesCount, AudioValueRange** ppSampleRateRanges)   /* NOTE: Free the returned pointer with ma_free(). */
-{
-    AudioObjectPropertyAddress propAddress;
-    UInt32 dataSize;
-    OSStatus status;
-    AudioValueRange* pSampleRateRanges;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pSampleRateRangesCount != NULL);
-    MA_ASSERT(ppSampleRateRanges != NULL);
-
-    /* Safety. */
-    *pSampleRateRangesCount = 0;
-    *ppSampleRateRanges = NULL;
-
-    propAddress.mSelector = kAudioDevicePropertyAvailableNominalSampleRates;
-    propAddress.mScope    = (deviceType == ma_device_type_playback) ? kAudioObjectPropertyScopeOutput : kAudioObjectPropertyScopeInput;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    status = ((ma_AudioObjectGetPropertyDataSize_proc)pContext->coreaudio.AudioObjectGetPropertyDataSize)(deviceObjectID, &propAddress, 0, NULL, &dataSize);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    pSampleRateRanges = (AudioValueRange*)ma_malloc(dataSize, &pContext->allocationCallbacks);
-    if (pSampleRateRanges == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(deviceObjectID, &propAddress, 0, NULL, &dataSize, pSampleRateRanges);
-    if (status != noErr) {
-        ma_free(pSampleRateRanges, &pContext->allocationCallbacks);
-        return ma_result_from_OSStatus(status);
-    }
-
-    *pSampleRateRangesCount = dataSize / sizeof(*pSampleRateRanges);
-    *ppSampleRateRanges = pSampleRateRanges;
-    return MA_SUCCESS;
-}
-
-#if 0
-static ma_result ma_get_AudioObject_get_closest_sample_rate(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, ma_uint32 sampleRateIn, ma_uint32* pSampleRateOut)
-{
-    UInt32 sampleRateRangeCount;
-    AudioValueRange* pSampleRateRanges;
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pSampleRateOut != NULL);
-
-    *pSampleRateOut = 0;    /* Safety. */
-
-    result = ma_get_AudioObject_sample_rates(pContext, deviceObjectID, deviceType, &sampleRateRangeCount, &pSampleRateRanges);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (sampleRateRangeCount == 0) {
-        ma_free(pSampleRateRanges, &pContext->allocationCallbacks);
-        return MA_ERROR;   /* Should never hit this case should we? */
-    }
-
-    if (sampleRateIn == 0) {
-        /* Search in order of miniaudio's preferred priority. */
-        UInt32 iMALSampleRate;
-        for (iMALSampleRate = 0; iMALSampleRate < ma_countof(g_maStandardSampleRatePriorities); ++iMALSampleRate) {
-            ma_uint32 malSampleRate = g_maStandardSampleRatePriorities[iMALSampleRate];
-            UInt32 iCASampleRate;
-            for (iCASampleRate = 0; iCASampleRate < sampleRateRangeCount; ++iCASampleRate) {
-                AudioValueRange caSampleRate = pSampleRateRanges[iCASampleRate];
-                if (caSampleRate.mMinimum <= malSampleRate && caSampleRate.mMaximum >= malSampleRate) {
-                    *pSampleRateOut = malSampleRate;
-                    ma_free(pSampleRateRanges, &pContext->allocationCallbacks);
-                    return MA_SUCCESS;
-                }
-            }
-        }
-
-        /*
-        If we get here it means none of miniaudio's standard sample rates matched any of the supported sample rates from the device. In this
-        case we just fall back to the first one reported by Core Audio.
-        */
-        MA_ASSERT(sampleRateRangeCount > 0);
-
-        *pSampleRateOut = pSampleRateRanges[0].mMinimum;
-        ma_free(pSampleRateRanges, &pContext->allocationCallbacks);
-        return MA_SUCCESS;
-    } else {
-        /* Find the closest match to this sample rate. */
-        UInt32 currentAbsoluteDifference = INT32_MAX;
-        UInt32 iCurrentClosestRange = (UInt32)-1;
-        UInt32 iRange;
-        for (iRange = 0; iRange < sampleRateRangeCount; ++iRange) {
-            if (pSampleRateRanges[iRange].mMinimum <= sampleRateIn && pSampleRateRanges[iRange].mMaximum >= sampleRateIn) {
-                *pSampleRateOut = sampleRateIn;
-                ma_free(pSampleRateRanges, &pContext->allocationCallbacks);
-                return MA_SUCCESS;
-            } else {
-                UInt32 absoluteDifference;
-                if (pSampleRateRanges[iRange].mMinimum > sampleRateIn) {
-                    absoluteDifference = pSampleRateRanges[iRange].mMinimum - sampleRateIn;
-                } else {
-                    absoluteDifference = sampleRateIn - pSampleRateRanges[iRange].mMaximum;
-                }
-
-                if (currentAbsoluteDifference > absoluteDifference) {
-                    currentAbsoluteDifference = absoluteDifference;
-                    iCurrentClosestRange = iRange;
-                }
-            }
-        }
-
-        MA_ASSERT(iCurrentClosestRange != (UInt32)-1);
-
-        *pSampleRateOut = pSampleRateRanges[iCurrentClosestRange].mMinimum;
-        ma_free(pSampleRateRanges, &pContext->allocationCallbacks);
-        return MA_SUCCESS;
-    }
-
-    /* Should never get here, but it would mean we weren't able to find any suitable sample rates. */
-    /*ma_free(pSampleRateRanges, &pContext->allocationCallbacks);*/
-    /*return MA_ERROR;*/
-}
-#endif
-
-static ma_result ma_get_AudioObject_closest_buffer_size_in_frames(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, ma_uint32 bufferSizeInFramesIn, ma_uint32* pBufferSizeInFramesOut)
-{
-    AudioObjectPropertyAddress propAddress;
-    AudioValueRange bufferSizeRange;
-    UInt32 dataSize;
-    OSStatus status;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pBufferSizeInFramesOut != NULL);
-
-    *pBufferSizeInFramesOut = 0;    /* Safety. */
-
-    propAddress.mSelector = kAudioDevicePropertyBufferFrameSizeRange;
-    propAddress.mScope    = (deviceType == ma_device_type_playback) ? kAudioObjectPropertyScopeOutput : kAudioObjectPropertyScopeInput;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    dataSize = sizeof(bufferSizeRange);
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(deviceObjectID, &propAddress, 0, NULL, &dataSize, &bufferSizeRange);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    /* This is just a clamp. */
-    if (bufferSizeInFramesIn < bufferSizeRange.mMinimum) {
-        *pBufferSizeInFramesOut = (ma_uint32)bufferSizeRange.mMinimum;
-    } else if (bufferSizeInFramesIn > bufferSizeRange.mMaximum) {
-        *pBufferSizeInFramesOut = (ma_uint32)bufferSizeRange.mMaximum;
-    } else {
-        *pBufferSizeInFramesOut = bufferSizeInFramesIn;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_set_AudioObject_buffer_size_in_frames(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, ma_uint32* pPeriodSizeInOut)
-{
-    ma_result result;
-    ma_uint32 chosenBufferSizeInFrames;
-    AudioObjectPropertyAddress propAddress;
-    UInt32 dataSize;
-    OSStatus status;
-
-    MA_ASSERT(pContext != NULL);
-
-    result = ma_get_AudioObject_closest_buffer_size_in_frames(pContext, deviceObjectID, deviceType, *pPeriodSizeInOut, &chosenBufferSizeInFrames);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* Try setting the size of the buffer... If this fails we just use whatever is currently set. */
-    propAddress.mSelector = kAudioDevicePropertyBufferFrameSize;
-    propAddress.mScope    = (deviceType == ma_device_type_playback) ? kAudioObjectPropertyScopeOutput : kAudioObjectPropertyScopeInput;
-    propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-    ((ma_AudioObjectSetPropertyData_proc)pContext->coreaudio.AudioObjectSetPropertyData)(deviceObjectID, &propAddress, 0, NULL, sizeof(chosenBufferSizeInFrames), &chosenBufferSizeInFrames);
-
-    /* Get the actual size of the buffer. */
-    dataSize = sizeof(*pPeriodSizeInOut);
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(deviceObjectID, &propAddress, 0, NULL, &dataSize, &chosenBufferSizeInFrames);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    *pPeriodSizeInOut = chosenBufferSizeInFrames;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_find_default_AudioObjectID(ma_context* pContext, ma_device_type deviceType, AudioObjectID* pDeviceObjectID)
-{
-    AudioObjectPropertyAddress propAddressDefaultDevice;
-    UInt32 defaultDeviceObjectIDSize = sizeof(AudioObjectID);
-    AudioObjectID defaultDeviceObjectID;
-    OSStatus status;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pDeviceObjectID != NULL);
-
-    /* Safety. */
-    *pDeviceObjectID = 0;
-
-    propAddressDefaultDevice.mScope = kAudioObjectPropertyScopeGlobal;
-    propAddressDefaultDevice.mElement = AUDIO_OBJECT_PROPERTY_ELEMENT;
-    if (deviceType == ma_device_type_playback) {
-        propAddressDefaultDevice.mSelector = kAudioHardwarePropertyDefaultOutputDevice;
-    } else {
-        propAddressDefaultDevice.mSelector = kAudioHardwarePropertyDefaultInputDevice;
-    }
-
-    defaultDeviceObjectIDSize = sizeof(AudioObjectID);
-    status = ((ma_AudioObjectGetPropertyData_proc)pContext->coreaudio.AudioObjectGetPropertyData)(kAudioObjectSystemObject, &propAddressDefaultDevice, 0, NULL, &defaultDeviceObjectIDSize, &defaultDeviceObjectID);
-    if (status == noErr) {
-        *pDeviceObjectID = defaultDeviceObjectID;
-        return MA_SUCCESS;
-    }
-
-    /* If we get here it means we couldn't find the device. */
-    return MA_NO_DEVICE;
-}
-
-static ma_result ma_find_AudioObjectID(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, AudioObjectID* pDeviceObjectID)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pDeviceObjectID != NULL);
-
-    /* Safety. */
-    *pDeviceObjectID = 0;
-
-    if (pDeviceID == NULL) {
-        /* Default device. */
-        return ma_find_default_AudioObjectID(pContext, deviceType, pDeviceObjectID);
-    } else {
-        /* Explicit device. */
-        UInt32 deviceCount;
-        AudioObjectID* pDeviceObjectIDs;
-        ma_result result;
-        UInt32 iDevice;
-
-        result = ma_get_device_object_ids__coreaudio(pContext, &deviceCount, &pDeviceObjectIDs);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        for (iDevice = 0; iDevice < deviceCount; ++iDevice) {
-            AudioObjectID deviceObjectID = pDeviceObjectIDs[iDevice];
-
-            char uid[256];
-            if (ma_get_AudioObject_uid(pContext, deviceObjectID, sizeof(uid), uid) != MA_SUCCESS) {
-                continue;
-            }
-
-            if (deviceType == ma_device_type_playback) {
-                if (ma_does_AudioObject_support_playback(pContext, deviceObjectID)) {
-                    if (strcmp(uid, pDeviceID->coreaudio) == 0) {
-                        *pDeviceObjectID = deviceObjectID;
-                        ma_free(pDeviceObjectIDs, &pContext->allocationCallbacks);
-                        return MA_SUCCESS;
-                    }
-                }
-            } else {
-                if (ma_does_AudioObject_support_capture(pContext, deviceObjectID)) {
-                    if (strcmp(uid, pDeviceID->coreaudio) == 0) {
-                        *pDeviceObjectID = deviceObjectID;
-                        ma_free(pDeviceObjectIDs, &pContext->allocationCallbacks);
-                        return MA_SUCCESS;
-                    }
-                }
-            }
-        }
-
-        ma_free(pDeviceObjectIDs, &pContext->allocationCallbacks);
-    }
-
-    /* If we get here it means we couldn't find the device. */
-    return MA_NO_DEVICE;
-}
-
-
-static ma_result ma_find_best_format__coreaudio(ma_context* pContext, AudioObjectID deviceObjectID, ma_device_type deviceType, ma_format format, ma_uint32 channels, ma_uint32 sampleRate, const AudioStreamBasicDescription* pOrigFormat, AudioStreamBasicDescription* pFormat)
-{
-    UInt32 deviceFormatDescriptionCount;
-    AudioStreamRangedDescription* pDeviceFormatDescriptions;
-    ma_result result;
-    ma_uint32 desiredSampleRate;
-    ma_uint32 desiredChannelCount;
-    ma_format desiredFormat;
-    AudioStreamBasicDescription bestDeviceFormatSoFar;
-    ma_bool32 hasSupportedFormat;
-    UInt32 iFormat;
-
-    result = ma_get_AudioObject_stream_descriptions(pContext, deviceObjectID, deviceType, &deviceFormatDescriptionCount, &pDeviceFormatDescriptions);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    desiredSampleRate = sampleRate;
-    if (desiredSampleRate == 0) {
-        desiredSampleRate = pOrigFormat->mSampleRate;
-    }
-
-    desiredChannelCount = channels;
-    if (desiredChannelCount == 0) {
-        desiredChannelCount = pOrigFormat->mChannelsPerFrame;
-    }
-
-    desiredFormat = format;
-    if (desiredFormat == ma_format_unknown) {
-        result = ma_format_from_AudioStreamBasicDescription(pOrigFormat, &desiredFormat);
-        if (result != MA_SUCCESS || desiredFormat == ma_format_unknown) {
-            desiredFormat = g_maFormatPriorities[0];
-        }
-    }
-
-    /*
-    If we get here it means we don't have an exact match to what the client is asking for. We'll need to find the closest one. The next
-    loop will check for formats that have the same sample rate to what we're asking for. If there is, we prefer that one in all cases.
-    */
-    MA_ZERO_OBJECT(&bestDeviceFormatSoFar);
-
-    hasSupportedFormat = MA_FALSE;
-    for (iFormat = 0; iFormat < deviceFormatDescriptionCount; ++iFormat) {
-        ma_format formatFromDescription;
-        ma_result formatResult = ma_format_from_AudioStreamBasicDescription(&pDeviceFormatDescriptions[iFormat].mFormat, &formatFromDescription);
-        if (formatResult == MA_SUCCESS && formatFromDescription != ma_format_unknown) {
-            hasSupportedFormat = MA_TRUE;
-            bestDeviceFormatSoFar = pDeviceFormatDescriptions[iFormat].mFormat;
-            break;
-        }
-    }
-
-    if (!hasSupportedFormat) {
-        ma_free(pDeviceFormatDescriptions, &pContext->allocationCallbacks);
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-
-
-    for (iFormat = 0; iFormat < deviceFormatDescriptionCount; ++iFormat) {
-        AudioStreamBasicDescription thisDeviceFormat = pDeviceFormatDescriptions[iFormat].mFormat;
-        ma_format thisSampleFormat;
-        ma_result formatResult;
-        ma_format bestSampleFormatSoFar;
-
-        /* If the format is not supported by miniaudio we need to skip this one entirely. */
-        formatResult = ma_format_from_AudioStreamBasicDescription(&pDeviceFormatDescriptions[iFormat].mFormat, &thisSampleFormat);
-        if (formatResult != MA_SUCCESS || thisSampleFormat == ma_format_unknown) {
-            continue;   /* The format is not supported by miniaudio. Skip. */
-        }
-
-        ma_format_from_AudioStreamBasicDescription(&bestDeviceFormatSoFar, &bestSampleFormatSoFar);
-
-        /* Getting here means the format is supported by miniaudio which makes this format a candidate. */
-        if (thisDeviceFormat.mSampleRate != desiredSampleRate) {
-            /*
-            The sample rate does not match, but this format could still be usable, although it's a very low priority. If the best format
-            so far has an equal sample rate we can just ignore this one.
-            */
-            if (bestDeviceFormatSoFar.mSampleRate == desiredSampleRate) {
-                continue;   /* The best sample rate so far has the same sample rate as what we requested which means it's still the best so far. Skip this format. */
-            } else {
-                /* In this case, neither the best format so far nor this one have the same sample rate. Check the channel count next. */
-                if (thisDeviceFormat.mChannelsPerFrame != desiredChannelCount) {
-                    /* This format has a different sample rate _and_ a different channel count. */
-                    if (bestDeviceFormatSoFar.mChannelsPerFrame == desiredChannelCount) {
-                        continue;   /* No change to the best format. */
-                    } else {
-                        /*
-                        Both this format and the best so far have different sample rates and different channel counts. Whichever has the
-                        best format is the new best.
-                        */
-                        if (ma_get_format_priority_index(thisSampleFormat) < ma_get_format_priority_index(bestSampleFormatSoFar)) {
-                            bestDeviceFormatSoFar = thisDeviceFormat;
-                            continue;
-                        } else {
-                            continue;   /* No change to the best format. */
-                        }
-                    }
-                } else {
-                    /* This format has a different sample rate but the desired channel count. */
-                    if (bestDeviceFormatSoFar.mChannelsPerFrame == desiredChannelCount) {
-                        /* Both this format and the best so far have the desired channel count. Whichever has the best format is the new best. */
-                        if (ma_get_format_priority_index(thisSampleFormat) < ma_get_format_priority_index(bestSampleFormatSoFar)) {
-                            bestDeviceFormatSoFar = thisDeviceFormat;
-                            continue;
-                        } else {
-                            continue;   /* No change to the best format for now. */
-                        }
-                    } else {
-                        /* This format has the desired channel count, but the best so far does not. We have a new best. */
-                        bestDeviceFormatSoFar = thisDeviceFormat;
-                        continue;
-                    }
-                }
-            }
-        } else {
-            /*
-            The sample rates match which makes this format a very high priority contender. If the best format so far has a different
-            sample rate it needs to be replaced with this one.
-            */
-            if (bestDeviceFormatSoFar.mSampleRate != desiredSampleRate) {
-                bestDeviceFormatSoFar = thisDeviceFormat;
-                continue;
-            } else {
-                /* In this case both this format and the best format so far have the same sample rate. Check the channel count next. */
-                if (thisDeviceFormat.mChannelsPerFrame == desiredChannelCount) {
-                    /*
-                    In this case this format has the same channel count as what the client is requesting. If the best format so far has
-                    a different count, this one becomes the new best.
-                    */
-                    if (bestDeviceFormatSoFar.mChannelsPerFrame != desiredChannelCount) {
-                        bestDeviceFormatSoFar = thisDeviceFormat;
-                        continue;
-                    } else {
-                        /* In this case both this format and the best so far have the ideal sample rate and channel count. Check the format. */
-                        if (thisSampleFormat == desiredFormat) {
-                            bestDeviceFormatSoFar = thisDeviceFormat;
-                            break;  /* Found the exact match. */
-                        } else {
-                            /* The formats are different. The new best format is the one with the highest priority format according to miniaudio. */
-                            if (ma_get_format_priority_index(thisSampleFormat) < ma_get_format_priority_index(bestSampleFormatSoFar)) {
-                                bestDeviceFormatSoFar = thisDeviceFormat;
-                                continue;
-                            } else {
-                                continue;   /* No change to the best format for now. */
-                            }
-                        }
-                    }
-                } else {
-                    /*
-                    In this case the channel count is different to what the client has requested. If the best so far has the same channel
-                    count as the requested count then it remains the best.
-                    */
-                    if (bestDeviceFormatSoFar.mChannelsPerFrame == desiredChannelCount) {
-                        continue;
-                    } else {
-                        /*
-                        This is the case where both have the same sample rate (good) but different channel counts. Right now both have about
-                        the same priority, but we need to compare the format now.
-                        */
-                        if (thisSampleFormat == bestSampleFormatSoFar) {
-                            if (ma_get_format_priority_index(thisSampleFormat) < ma_get_format_priority_index(bestSampleFormatSoFar)) {
-                                bestDeviceFormatSoFar = thisDeviceFormat;
-                                continue;
-                            } else {
-                                continue;   /* No change to the best format for now. */
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    *pFormat = bestDeviceFormatSoFar;
-
-    ma_free(pDeviceFormatDescriptions, &pContext->allocationCallbacks);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_get_AudioUnit_channel_map(ma_context* pContext, AudioUnit audioUnit, ma_device_type deviceType, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    AudioUnitScope deviceScope;
-    AudioUnitElement deviceBus;
-    UInt32 channelLayoutSize;
-    OSStatus status;
-    AudioChannelLayout* pChannelLayout;
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-
-    if (deviceType == ma_device_type_playback) {
-        deviceScope = kAudioUnitScope_Input;
-        deviceBus = MA_COREAUDIO_OUTPUT_BUS;
-    } else {
-        deviceScope = kAudioUnitScope_Output;
-        deviceBus = MA_COREAUDIO_INPUT_BUS;
-    }
-
-    status = ((ma_AudioUnitGetPropertyInfo_proc)pContext->coreaudio.AudioUnitGetPropertyInfo)(audioUnit, kAudioUnitProperty_AudioChannelLayout, deviceScope, deviceBus, &channelLayoutSize, NULL);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-    pChannelLayout = (AudioChannelLayout*)ma_malloc(channelLayoutSize, &pContext->allocationCallbacks);
-    if (pChannelLayout == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    status = ((ma_AudioUnitGetProperty_proc)pContext->coreaudio.AudioUnitGetProperty)(audioUnit, kAudioUnitProperty_AudioChannelLayout, deviceScope, deviceBus, pChannelLayout, &channelLayoutSize);
-    if (status != noErr) {
-        ma_free(pChannelLayout, &pContext->allocationCallbacks);
-        return ma_result_from_OSStatus(status);
-    }
-
-    result = ma_get_channel_map_from_AudioChannelLayout(pChannelLayout, pChannelMap, channelMapCap);
-    if (result != MA_SUCCESS) {
-        ma_free(pChannelLayout, &pContext->allocationCallbacks);
-        return result;
-    }
-
-    ma_free(pChannelLayout, &pContext->allocationCallbacks);
-    return MA_SUCCESS;
-}
-#endif /* MA_APPLE_DESKTOP */
-
-
-#if !defined(MA_APPLE_DESKTOP)
-static void ma_AVAudioSessionPortDescription_to_device_info(AVAudioSessionPortDescription* pPortDesc, ma_device_info* pInfo)
-{
-    MA_ZERO_OBJECT(pInfo);
-    ma_strncpy_s(pInfo->name,         sizeof(pInfo->name),         [pPortDesc.portName UTF8String], (size_t)-1);
-    ma_strncpy_s(pInfo->id.coreaudio, sizeof(pInfo->id.coreaudio), [pPortDesc.UID      UTF8String], (size_t)-1);
-}
-#endif
-
-static ma_result ma_context_enumerate_devices__coreaudio(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-#if defined(MA_APPLE_DESKTOP)
-    UInt32 deviceCount;
-    AudioObjectID* pDeviceObjectIDs;
-    AudioObjectID defaultDeviceObjectIDPlayback;
-    AudioObjectID defaultDeviceObjectIDCapture;
-    ma_result result;
-    UInt32 iDevice;
-
-    ma_find_default_AudioObjectID(pContext, ma_device_type_playback, &defaultDeviceObjectIDPlayback);   /* OK if this fails. */
-    ma_find_default_AudioObjectID(pContext, ma_device_type_capture,  &defaultDeviceObjectIDCapture);    /* OK if this fails. */
-
-    result = ma_get_device_object_ids__coreaudio(pContext, &deviceCount, &pDeviceObjectIDs);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    for (iDevice = 0; iDevice < deviceCount; ++iDevice) {
-        AudioObjectID deviceObjectID = pDeviceObjectIDs[iDevice];
-        ma_device_info info;
-
-        MA_ZERO_OBJECT(&info);
-        if (ma_get_AudioObject_uid(pContext, deviceObjectID, sizeof(info.id.coreaudio), info.id.coreaudio) != MA_SUCCESS) {
-            continue;
-        }
-        if (ma_get_AudioObject_name(pContext, deviceObjectID, sizeof(info.name), info.name) != MA_SUCCESS) {
-            continue;
-        }
-
-        if (ma_does_AudioObject_support_playback(pContext, deviceObjectID)) {
-            if (deviceObjectID == defaultDeviceObjectIDPlayback) {
-                info.isDefault = MA_TRUE;
-            }
-
-            if (!callback(pContext, ma_device_type_playback, &info, pUserData)) {
-                break;
-            }
-        }
-        if (ma_does_AudioObject_support_capture(pContext, deviceObjectID)) {
-            if (deviceObjectID == defaultDeviceObjectIDCapture) {
-                info.isDefault = MA_TRUE;
-            }
-
-            if (!callback(pContext, ma_device_type_capture, &info, pUserData)) {
-                break;
-            }
-        }
-    }
-
-    ma_free(pDeviceObjectIDs, &pContext->allocationCallbacks);
-#else
-    ma_device_info info;
-    NSArray *pInputs  = [[[AVAudioSession sharedInstance] currentRoute] inputs];
-    NSArray *pOutputs = [[[AVAudioSession sharedInstance] currentRoute] outputs];
-
-    for (AVAudioSessionPortDescription* pPortDesc in pOutputs) {
-        ma_AVAudioSessionPortDescription_to_device_info(pPortDesc, &info);
-        if (!callback(pContext, ma_device_type_playback, &info, pUserData)) {
-            return MA_SUCCESS;
-        }
-    }
-
-    for (AVAudioSessionPortDescription* pPortDesc in pInputs) {
-        ma_AVAudioSessionPortDescription_to_device_info(pPortDesc, &info);
-        if (!callback(pContext, ma_device_type_capture, &info, pUserData)) {
-            return MA_SUCCESS;
-        }
-    }
-#endif
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__coreaudio(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-
-#if defined(MA_APPLE_DESKTOP)
-    /* Desktop */
-    {
-        AudioObjectID deviceObjectID;
-        AudioObjectID defaultDeviceObjectID;
-        UInt32 streamDescriptionCount;
-        AudioStreamRangedDescription* pStreamDescriptions;
-        UInt32 iStreamDescription;
-        UInt32 sampleRateRangeCount;
-        AudioValueRange* pSampleRateRanges;
-
-        ma_find_default_AudioObjectID(pContext, deviceType, &defaultDeviceObjectID);     /* OK if this fails. */
-
-        result = ma_find_AudioObjectID(pContext, deviceType, pDeviceID, &deviceObjectID);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_get_AudioObject_uid(pContext, deviceObjectID, sizeof(pDeviceInfo->id.coreaudio), pDeviceInfo->id.coreaudio);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_get_AudioObject_name(pContext, deviceObjectID, sizeof(pDeviceInfo->name), pDeviceInfo->name);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        if (deviceObjectID == defaultDeviceObjectID) {
-            pDeviceInfo->isDefault = MA_TRUE;
-        }
-
-        /*
-        There could be a large number of permutations here. Fortunately there is only a single channel count
-        being reported which reduces this quite a bit. For sample rates we're only reporting those that are
-        one of miniaudio's recognized "standard" rates. If there are still more formats than can fit into
-        our fixed sized array we'll just need to truncate them. This is unlikely and will probably only happen
-        if some driver performs software data conversion and therefore reports every possible format and
-        sample rate.
-        */
-        pDeviceInfo->nativeDataFormatCount = 0;
-
-        /* Formats. */
-        {
-            ma_format uniqueFormats[ma_format_count];
-            ma_uint32 uniqueFormatCount = 0;
-            ma_uint32 channels;
-
-            /* Channels. */
-            result = ma_get_AudioObject_channel_count(pContext, deviceObjectID, deviceType, &channels);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-
-            /* Formats. */
-            result = ma_get_AudioObject_stream_descriptions(pContext, deviceObjectID, deviceType, &streamDescriptionCount, &pStreamDescriptions);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-
-            for (iStreamDescription = 0; iStreamDescription < streamDescriptionCount; ++iStreamDescription) {
-                ma_format format;
-                ma_bool32 hasFormatBeenHandled = MA_FALSE;
-                ma_uint32 iOutputFormat;
-                ma_uint32 iSampleRate;
-
-                result = ma_format_from_AudioStreamBasicDescription(&pStreamDescriptions[iStreamDescription].mFormat, &format);
-                if (result != MA_SUCCESS) {
-                    continue;
-                }
-
-                MA_ASSERT(format != ma_format_unknown);
-
-                /* Make sure the format isn't already in the output list. */
-                for (iOutputFormat = 0; iOutputFormat < uniqueFormatCount; ++iOutputFormat) {
-                    if (uniqueFormats[iOutputFormat] == format) {
-                        hasFormatBeenHandled = MA_TRUE;
-                        break;
-                    }
-                }
-
-                /* If we've already handled this format just skip it. */
-                if (hasFormatBeenHandled) {
-                    continue;
-                }
-
-                uniqueFormats[uniqueFormatCount] = format;
-                uniqueFormatCount += 1;
-
-                /* Sample Rates */
-                result = ma_get_AudioObject_sample_rates(pContext, deviceObjectID, deviceType, &sampleRateRangeCount, &pSampleRateRanges);
-                if (result != MA_SUCCESS) {
-                    return result;
-                }
-
-                /*
-                Annoyingly Core Audio reports a sample rate range. We just get all the standard rates that are
-                between this range.
-                */
-                for (iSampleRate = 0; iSampleRate < sampleRateRangeCount; ++iSampleRate) {
-                    ma_uint32 iStandardSampleRate;
-                    for (iStandardSampleRate = 0; iStandardSampleRate < ma_countof(g_maStandardSampleRatePriorities); iStandardSampleRate += 1) {
-                        ma_uint32 standardSampleRate = g_maStandardSampleRatePriorities[iStandardSampleRate];
-                        if (standardSampleRate >= pSampleRateRanges[iSampleRate].mMinimum && standardSampleRate <= pSampleRateRanges[iSampleRate].mMaximum) {
-                            /* We have a new data format. Add it to the list. */
-                            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].format     = format;
-                            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].channels   = channels;
-                            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].sampleRate = standardSampleRate;
-                            pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].flags      = 0;
-                            pDeviceInfo->nativeDataFormatCount += 1;
-
-                            if (pDeviceInfo->nativeDataFormatCount >= ma_countof(pDeviceInfo->nativeDataFormats)) {
-                                break;  /* No more room for any more formats. */
-                            }
-                        }
-                    }
-                }
-
-                ma_free(pSampleRateRanges, &pContext->allocationCallbacks);
-
-                if (pDeviceInfo->nativeDataFormatCount >= ma_countof(pDeviceInfo->nativeDataFormats)) {
-                    break;  /* No more room for any more formats. */
-                }
-            }
-
-            ma_free(pStreamDescriptions, &pContext->allocationCallbacks);
-        }
-    }
-#else
-    /* Mobile */
-    {
-        AudioComponentDescription desc;
-        AudioComponent component;
-        AudioUnit audioUnit;
-        OSStatus status;
-        AudioUnitScope formatScope;
-        AudioUnitElement formatElement;
-        AudioStreamBasicDescription bestFormat;
-        UInt32 propSize;
-
-        /* We want to ensure we use a consistent device name to device enumeration. */
-        if (pDeviceID != NULL && pDeviceID->coreaudio[0] != '\0') {
-            ma_bool32 found = MA_FALSE;
-            if (deviceType == ma_device_type_playback) {
-                NSArray *pOutputs = [[[AVAudioSession sharedInstance] currentRoute] outputs];
-                for (AVAudioSessionPortDescription* pPortDesc in pOutputs) {
-                    if (strcmp(pDeviceID->coreaudio, [pPortDesc.UID UTF8String]) == 0) {
-                        ma_AVAudioSessionPortDescription_to_device_info(pPortDesc, pDeviceInfo);
-                        found = MA_TRUE;
-                        break;
-                    }
-                }
-            } else {
-                NSArray *pInputs = [[[AVAudioSession sharedInstance] currentRoute] inputs];
-                for (AVAudioSessionPortDescription* pPortDesc in pInputs) {
-                    if (strcmp(pDeviceID->coreaudio, [pPortDesc.UID UTF8String]) == 0) {
-                        ma_AVAudioSessionPortDescription_to_device_info(pPortDesc, pDeviceInfo);
-                        found = MA_TRUE;
-                        break;
-                    }
-                }
-            }
-
-            if (!found) {
-                return MA_DOES_NOT_EXIST;
-            }
-        } else {
-            if (deviceType == ma_device_type_playback) {
-                ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-            } else {
-                ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-            }
-        }
-
-
-        /*
-        Retrieving device information is more annoying on mobile than desktop. For simplicity I'm locking this down to whatever format is
-        reported on a temporary I/O unit. The problem, however, is that this doesn't return a value for the sample rate which we need to
-        retrieve from the AVAudioSession shared instance.
-        */
-        desc.componentType = kAudioUnitType_Output;
-        desc.componentSubType = kAudioUnitSubType_RemoteIO;
-        desc.componentManufacturer = kAudioUnitManufacturer_Apple;
-        desc.componentFlags = 0;
-        desc.componentFlagsMask = 0;
-
-        component = ((ma_AudioComponentFindNext_proc)pContext->coreaudio.AudioComponentFindNext)(NULL, &desc);
-        if (component == NULL) {
-            return MA_FAILED_TO_INIT_BACKEND;
-        }
-
-        status = ((ma_AudioComponentInstanceNew_proc)pContext->coreaudio.AudioComponentInstanceNew)(component, &audioUnit);
-        if (status != noErr) {
-            return ma_result_from_OSStatus(status);
-        }
-
-        formatScope   = (deviceType == ma_device_type_playback) ? kAudioUnitScope_Input : kAudioUnitScope_Output;
-        formatElement = (deviceType == ma_device_type_playback) ? MA_COREAUDIO_OUTPUT_BUS : MA_COREAUDIO_INPUT_BUS;
-
-        propSize = sizeof(bestFormat);
-        status = ((ma_AudioUnitGetProperty_proc)pContext->coreaudio.AudioUnitGetProperty)(audioUnit, kAudioUnitProperty_StreamFormat, formatScope, formatElement, &bestFormat, &propSize);
-        if (status != noErr) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(audioUnit);
-            return ma_result_from_OSStatus(status);
-        }
-
-        ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(audioUnit);
-        audioUnit = NULL;
-
-        /* Only a single format is being reported for iOS. */
-        pDeviceInfo->nativeDataFormatCount = 1;
-
-        result = ma_format_from_AudioStreamBasicDescription(&bestFormat, &pDeviceInfo->nativeDataFormats[0].format);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pDeviceInfo->nativeDataFormats[0].channels = bestFormat.mChannelsPerFrame;
-
-        /*
-        It looks like Apple are wanting to push the whole AVAudioSession thing. Thus, we need to use that to determine device settings. To do
-        this we just get the shared instance and inspect.
-        */
-        @autoreleasepool {
-            AVAudioSession* pAudioSession = [AVAudioSession sharedInstance];
-            MA_ASSERT(pAudioSession != NULL);
-
-            pDeviceInfo->nativeDataFormats[0].sampleRate = (ma_uint32)pAudioSession.sampleRate;
-        }
-    }
-#endif
-
-    (void)pDeviceInfo; /* Unused. */
-    return MA_SUCCESS;
-}
-
-static AudioBufferList* ma_allocate_AudioBufferList__coreaudio(ma_uint32 sizeInFrames, ma_format format, ma_uint32 channels, ma_stream_layout layout, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    AudioBufferList* pBufferList;
-    UInt32 audioBufferSizeInBytes;
-    size_t allocationSize;
-
-    MA_ASSERT(sizeInFrames > 0);
-    MA_ASSERT(format != ma_format_unknown);
-    MA_ASSERT(channels > 0);
-
-    allocationSize = sizeof(AudioBufferList) - sizeof(AudioBuffer);  /* Subtract sizeof(AudioBuffer) because that part is dynamically sized. */
-    if (layout == ma_stream_layout_interleaved) {
-        /* Interleaved case. This is the simple case because we just have one buffer. */
-        allocationSize += sizeof(AudioBuffer) * 1;
-    } else {
-        /* Non-interleaved case. This is the more complex case because there's more than one buffer. */
-        allocationSize += sizeof(AudioBuffer) * channels;
-    }
-
-    allocationSize += sizeInFrames * ma_get_bytes_per_frame(format, channels);
-
-    pBufferList = (AudioBufferList*)ma_malloc(allocationSize, pAllocationCallbacks);
-    if (pBufferList == NULL) {
-        return NULL;
-    }
-
-    audioBufferSizeInBytes = (UInt32)(sizeInFrames * ma_get_bytes_per_sample(format));
-
-    if (layout == ma_stream_layout_interleaved) {
-        pBufferList->mNumberBuffers = 1;
-        pBufferList->mBuffers[0].mNumberChannels = channels;
-        pBufferList->mBuffers[0].mDataByteSize   = audioBufferSizeInBytes * channels;
-        pBufferList->mBuffers[0].mData           = (ma_uint8*)pBufferList + sizeof(AudioBufferList);
-    } else {
-        ma_uint32 iBuffer;
-        pBufferList->mNumberBuffers = channels;
-        for (iBuffer = 0; iBuffer < pBufferList->mNumberBuffers; ++iBuffer) {
-            pBufferList->mBuffers[iBuffer].mNumberChannels = 1;
-            pBufferList->mBuffers[iBuffer].mDataByteSize   = audioBufferSizeInBytes;
-            pBufferList->mBuffers[iBuffer].mData           = (ma_uint8*)pBufferList + ((sizeof(AudioBufferList) - sizeof(AudioBuffer)) + (sizeof(AudioBuffer) * channels)) + (audioBufferSizeInBytes * iBuffer);
-        }
-    }
-
-    return pBufferList;
-}
-
-static ma_result ma_device_realloc_AudioBufferList__coreaudio(ma_device* pDevice, ma_uint32 sizeInFrames, ma_format format, ma_uint32 channels, ma_stream_layout layout)
-{
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(format != ma_format_unknown);
-    MA_ASSERT(channels > 0);
-
-    /* Only resize the buffer if necessary. */
-    if (pDevice->coreaudio.audioBufferCapInFrames < sizeInFrames) {
-        AudioBufferList* pNewAudioBufferList;
-
-        pNewAudioBufferList = ma_allocate_AudioBufferList__coreaudio(sizeInFrames, format, channels, layout, &pDevice->pContext->allocationCallbacks);
-        if (pNewAudioBufferList == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        /* At this point we'll have a new AudioBufferList and we can free the old one. */
-        ma_free(pDevice->coreaudio.pAudioBufferList, &pDevice->pContext->allocationCallbacks);
-        pDevice->coreaudio.pAudioBufferList = pNewAudioBufferList;
-        pDevice->coreaudio.audioBufferCapInFrames = sizeInFrames;
-    }
-
-    /* Getting here means the capacity of the audio is fine. */
-    return MA_SUCCESS;
-}
-
-
-static OSStatus ma_on_output__coreaudio(void* pUserData, AudioUnitRenderActionFlags* pActionFlags, const AudioTimeStamp* pTimeStamp, UInt32 busNumber, UInt32 frameCount, AudioBufferList* pBufferList)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    ma_stream_layout layout;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /*ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "INFO: Output Callback: busNumber=%d, frameCount=%d, mNumberBuffers=%d\n", (int)busNumber, (int)frameCount, (int)pBufferList->mNumberBuffers);*/
-
-    /* We need to check whether or not we are outputting interleaved or non-interleaved samples. The way we do this is slightly different for each type. */
-    layout = ma_stream_layout_interleaved;
-    if (pBufferList->mBuffers[0].mNumberChannels != pDevice->playback.internalChannels) {
-        layout = ma_stream_layout_deinterleaved;
-    }
-
-    if (layout == ma_stream_layout_interleaved) {
-        /* For now we can assume everything is interleaved. */
-        UInt32 iBuffer;
-        for (iBuffer = 0; iBuffer < pBufferList->mNumberBuffers; ++iBuffer) {
-            if (pBufferList->mBuffers[iBuffer].mNumberChannels == pDevice->playback.internalChannels) {
-                ma_uint32 frameCountForThisBuffer = pBufferList->mBuffers[iBuffer].mDataByteSize / ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-                if (frameCountForThisBuffer > 0) {
-                    ma_device_handle_backend_data_callback(pDevice, pBufferList->mBuffers[iBuffer].mData, NULL, frameCountForThisBuffer);
-                }
-
-                /*a_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "  frameCount=%d, mNumberChannels=%d, mDataByteSize=%d\n", (int)frameCount, (int)pBufferList->mBuffers[iBuffer].mNumberChannels, (int)pBufferList->mBuffers[iBuffer].mDataByteSize);*/
-            } else {
-                /*
-                This case is where the number of channels in the output buffer do not match our internal channels. It could mean that it's
-                not interleaved, in which case we can't handle right now since miniaudio does not yet support non-interleaved streams. We just
-                output silence here.
-                */
-                MA_ZERO_MEMORY(pBufferList->mBuffers[iBuffer].mData, pBufferList->mBuffers[iBuffer].mDataByteSize);
-                /*ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "  WARNING: Outputting silence. frameCount=%d, mNumberChannels=%d, mDataByteSize=%d\n", (int)frameCount, (int)pBufferList->mBuffers[iBuffer].mNumberChannels, (int)pBufferList->mBuffers[iBuffer].mDataByteSize);*/
-            }
-        }
-    } else {
-        /* This is the deinterleaved case. We need to update each buffer in groups of internalChannels. This assumes each buffer is the same size. */
-        MA_ASSERT(pDevice->playback.internalChannels <= MA_MAX_CHANNELS);   /* This should heve been validated at initialization time. */
-
-        /*
-        For safety we'll check that the internal channels is a multiple of the buffer count. If it's not it means something
-        very strange has happened and we're not going to support it.
-        */
-        if ((pBufferList->mNumberBuffers % pDevice->playback.internalChannels) == 0) {
-            ma_uint8 tempBuffer[4096];
-            UInt32 iBuffer;
-
-            for (iBuffer = 0; iBuffer < pBufferList->mNumberBuffers; iBuffer += pDevice->playback.internalChannels) {
-                ma_uint32 frameCountPerBuffer = pBufferList->mBuffers[iBuffer].mDataByteSize / ma_get_bytes_per_sample(pDevice->playback.internalFormat);
-                ma_uint32 framesRemaining = frameCountPerBuffer;
-
-                while (framesRemaining > 0) {
-                    void* ppDeinterleavedBuffers[MA_MAX_CHANNELS];
-                    ma_uint32 iChannel;
-                    ma_uint32 framesToRead = sizeof(tempBuffer) / ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-                    if (framesToRead > framesRemaining) {
-                        framesToRead = framesRemaining;
-                    }
-
-                    ma_device_handle_backend_data_callback(pDevice, tempBuffer, NULL, framesToRead);
-
-                    for (iChannel = 0; iChannel < pDevice->playback.internalChannels; ++iChannel) {
-                        ppDeinterleavedBuffers[iChannel] = (void*)ma_offset_ptr(pBufferList->mBuffers[iBuffer+iChannel].mData, (frameCountPerBuffer - framesRemaining) * ma_get_bytes_per_sample(pDevice->playback.internalFormat));
-                    }
-
-                    ma_deinterleave_pcm_frames(pDevice->playback.internalFormat, pDevice->playback.internalChannels, framesToRead, tempBuffer, ppDeinterleavedBuffers);
-
-                    framesRemaining -= framesToRead;
-                }
-            }
-        }
-    }
-
-    (void)pActionFlags;
-    (void)pTimeStamp;
-    (void)busNumber;
-    (void)frameCount;
-
-    return noErr;
-}
-
-static OSStatus ma_on_input__coreaudio(void* pUserData, AudioUnitRenderActionFlags* pActionFlags, const AudioTimeStamp* pTimeStamp, UInt32 busNumber, UInt32 frameCount, AudioBufferList* pUnusedBufferList)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    AudioBufferList* pRenderedBufferList;
-    ma_result result;
-    ma_stream_layout layout;
-    ma_uint32 iBuffer;
-    OSStatus status;
-
-    MA_ASSERT(pDevice != NULL);
-
-    pRenderedBufferList = (AudioBufferList*)pDevice->coreaudio.pAudioBufferList;
-    MA_ASSERT(pRenderedBufferList);
-
-    /* We need to check whether or not we are outputting interleaved or non-interleaved samples. The way we do this is slightly different for each type. */
-    layout = ma_stream_layout_interleaved;
-    if (pRenderedBufferList->mBuffers[0].mNumberChannels != pDevice->capture.internalChannels) {
-        layout = ma_stream_layout_deinterleaved;
-    }
-
-    /*ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "INFO: Input Callback: busNumber=%d, frameCount=%d, mNumberBuffers=%d\n", (int)busNumber, (int)frameCount, (int)pRenderedBufferList->mNumberBuffers);*/
-
-    /*
-    There has been a situation reported where frame count passed into this function is greater than the capacity of
-    our capture buffer. There doesn't seem to be a reliable way to determine what the maximum frame count will be,
-    so we need to instead resort to dynamically reallocating our buffer to ensure it's large enough to capture the
-    number of frames requested by this callback.
-    */
-    result = ma_device_realloc_AudioBufferList__coreaudio(pDevice, frameCount, pDevice->capture.internalFormat, pDevice->capture.internalChannels, layout);
-    if (result != MA_SUCCESS) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "Failed to allocate AudioBufferList for capture.\n");
-        return noErr;
-    }
-
-    pRenderedBufferList = (AudioBufferList*)pDevice->coreaudio.pAudioBufferList;
-    MA_ASSERT(pRenderedBufferList);
-
-    /*
-    When you call AudioUnitRender(), Core Audio tries to be helpful by setting the mDataByteSize to the number of bytes
-    that were actually rendered. The problem with this is that the next call can fail with -50 due to the size no longer
-    being set to the capacity of the buffer, but instead the size in bytes of the previous render. This will cause a
-    problem when a future call to this callback specifies a larger number of frames.
-
-    To work around this we need to explicitly set the size of each buffer to their respective size in bytes.
-    */
-    for (iBuffer = 0; iBuffer < pRenderedBufferList->mNumberBuffers; ++iBuffer) {
-        pRenderedBufferList->mBuffers[iBuffer].mDataByteSize = pDevice->coreaudio.audioBufferCapInFrames * ma_get_bytes_per_sample(pDevice->capture.internalFormat) * pRenderedBufferList->mBuffers[iBuffer].mNumberChannels;
-    }
-
-    status = ((ma_AudioUnitRender_proc)pDevice->pContext->coreaudio.AudioUnitRender)((AudioUnit)pDevice->coreaudio.audioUnitCapture, pActionFlags, pTimeStamp, busNumber, frameCount, pRenderedBufferList);
-    if (status != noErr) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "  ERROR: AudioUnitRender() failed with %d.\n", (int)status);
-        return status;
-    }
-
-    if (layout == ma_stream_layout_interleaved) {
-        for (iBuffer = 0; iBuffer < pRenderedBufferList->mNumberBuffers; ++iBuffer) {
-            if (pRenderedBufferList->mBuffers[iBuffer].mNumberChannels == pDevice->capture.internalChannels) {
-                ma_device_handle_backend_data_callback(pDevice, NULL, pRenderedBufferList->mBuffers[iBuffer].mData, frameCount);
-                /*ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "  mDataByteSize=%d.\n", (int)pRenderedBufferList->mBuffers[iBuffer].mDataByteSize);*/
-            } else {
-                /*
-                This case is where the number of channels in the output buffer do not match our internal channels. It could mean that it's
-                not interleaved, in which case we can't handle right now since miniaudio does not yet support non-interleaved streams.
-                */
-                ma_uint8 silentBuffer[4096];
-                ma_uint32 framesRemaining;
-
-                MA_ZERO_MEMORY(silentBuffer, sizeof(silentBuffer));
-
-                framesRemaining = frameCount;
-                while (framesRemaining > 0) {
-                    ma_uint32 framesToSend = sizeof(silentBuffer) / ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-                    if (framesToSend > framesRemaining) {
-                        framesToSend = framesRemaining;
-                    }
-
-                    ma_device_handle_backend_data_callback(pDevice, NULL, silentBuffer, framesToSend);
-
-                    framesRemaining -= framesToSend;
-                }
-
-                /*ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_DEBUG, "  WARNING: Outputting silence. frameCount=%d, mNumberChannels=%d, mDataByteSize=%d\n", (int)frameCount, (int)pRenderedBufferList->mBuffers[iBuffer].mNumberChannels, (int)pRenderedBufferList->mBuffers[iBuffer].mDataByteSize);*/
-            }
-        }
-    } else {
-        /* This is the deinterleaved case. We need to interleave the audio data before sending it to the client. This assumes each buffer is the same size. */
-        MA_ASSERT(pDevice->capture.internalChannels <= MA_MAX_CHANNELS);    /* This should have been validated at initialization time. */
-
-        /*
-        For safety we'll check that the internal channels is a multiple of the buffer count. If it's not it means something
-        very strange has happened and we're not going to support it.
-        */
-        if ((pRenderedBufferList->mNumberBuffers % pDevice->capture.internalChannels) == 0) {
-            ma_uint8 tempBuffer[4096];
-            for (iBuffer = 0; iBuffer < pRenderedBufferList->mNumberBuffers; iBuffer += pDevice->capture.internalChannels) {
-                ma_uint32 framesRemaining = frameCount;
-                while (framesRemaining > 0) {
-                    void* ppDeinterleavedBuffers[MA_MAX_CHANNELS];
-                    ma_uint32 iChannel;
-                    ma_uint32 framesToSend = sizeof(tempBuffer) / ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-                    if (framesToSend > framesRemaining) {
-                        framesToSend = framesRemaining;
-                    }
-
-                    for (iChannel = 0; iChannel < pDevice->capture.internalChannels; ++iChannel) {
-                        ppDeinterleavedBuffers[iChannel] = (void*)ma_offset_ptr(pRenderedBufferList->mBuffers[iBuffer+iChannel].mData, (frameCount - framesRemaining) * ma_get_bytes_per_sample(pDevice->capture.internalFormat));
-                    }
-
-                    ma_interleave_pcm_frames(pDevice->capture.internalFormat, pDevice->capture.internalChannels, framesToSend, (const void**)ppDeinterleavedBuffers, tempBuffer);
-                    ma_device_handle_backend_data_callback(pDevice, NULL, tempBuffer, framesToSend);
-
-                    framesRemaining -= framesToSend;
-                }
-            }
-        }
-    }
-
-    (void)pActionFlags;
-    (void)pTimeStamp;
-    (void)busNumber;
-    (void)frameCount;
-    (void)pUnusedBufferList;
-
-    return noErr;
-}
-
-static void on_start_stop__coreaudio(void* pUserData, AudioUnit audioUnit, AudioUnitPropertyID propertyID, AudioUnitScope scope, AudioUnitElement element)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    /* Don't do anything if it looks like we're just reinitializing due to a device switch. */
-    if (((audioUnit == pDevice->coreaudio.audioUnitPlayback) && pDevice->coreaudio.isSwitchingPlaybackDevice) ||
-        ((audioUnit == pDevice->coreaudio.audioUnitCapture)  && pDevice->coreaudio.isSwitchingCaptureDevice)) {
-        return;
-    }
-
-    /*
-    There's been a report of a deadlock here when triggered by ma_device_uninit(). It looks like
-    AudioUnitGetProprty (called below) and AudioComponentInstanceDispose (called in ma_device_uninit)
-    can try waiting on the same lock. I'm going to try working around this by not calling any Core
-    Audio APIs in the callback when the device has been stopped or uninitialized.
-    */
-    if (ma_device_get_state(pDevice) == ma_device_state_uninitialized || ma_device_get_state(pDevice) == ma_device_state_stopping || ma_device_get_state(pDevice) == ma_device_state_stopped) {
-        ma_device__on_notification_stopped(pDevice);
-    } else {
-        UInt32 isRunning;
-        UInt32 isRunningSize = sizeof(isRunning);
-        OSStatus status = ((ma_AudioUnitGetProperty_proc)pDevice->pContext->coreaudio.AudioUnitGetProperty)(audioUnit, kAudioOutputUnitProperty_IsRunning, scope, element, &isRunning, &isRunningSize);
-        if (status != noErr) {
-            goto done; /* Don't really know what to do in this case... just ignore it, I suppose... */
-        }
-
-        if (!isRunning) {
-            /*
-            The stop event is a bit annoying in Core Audio because it will be called when we automatically switch the default device. Some scenarios to consider:
-
-            1) When the device is unplugged, this will be called _before_ the default device change notification.
-            2) When the device is changed via the default device change notification, this will be called _after_ the switch.
-
-            For case #1, we just check if there's a new default device available. If so, we just ignore the stop event. For case #2 we check a flag.
-            */
-            if (((audioUnit == pDevice->coreaudio.audioUnitPlayback) && pDevice->coreaudio.isDefaultPlaybackDevice) ||
-                ((audioUnit == pDevice->coreaudio.audioUnitCapture)  && pDevice->coreaudio.isDefaultCaptureDevice)) {
-                /*
-                It looks like the device is switching through an external event, such as the user unplugging the device or changing the default device
-                via the operating system's sound settings. If we're re-initializing the device, we just terminate because we want the stopping of the
-                device to be seamless to the client (we don't want them receiving the stopped event and thinking that the device has stopped when it
-                hasn't!).
-                */
-                if (((audioUnit == pDevice->coreaudio.audioUnitPlayback) && pDevice->coreaudio.isSwitchingPlaybackDevice) ||
-                    ((audioUnit == pDevice->coreaudio.audioUnitCapture)  && pDevice->coreaudio.isSwitchingCaptureDevice)) {
-                    goto done;
-                }
-
-                /*
-                Getting here means the device is not reinitializing which means it may have been unplugged. From what I can see, it looks like Core Audio
-                will try switching to the new default device seamlessly. We need to somehow find a way to determine whether or not Core Audio will most
-                likely be successful in switching to the new device.
-
-                TODO: Try to predict if Core Audio will switch devices. If not, the stopped callback needs to be posted.
-                */
-                goto done;
-            }
-
-            /* Getting here means we need to stop the device. */
-            ma_device__on_notification_stopped(pDevice);
-        }
-    }
-
-    (void)propertyID; /* Unused. */
-
-done:
-    /* Always signal the stop event. It's possible for the "else" case to get hit which can happen during an interruption. */
-    ma_event_signal(&pDevice->coreaudio.stopEvent);
-}
-
-#if defined(MA_APPLE_DESKTOP)
-static ma_spinlock g_DeviceTrackingInitLock_CoreAudio = 0;  /* A spinlock for mutal exclusion of the init/uninit of the global tracking data. Initialization to 0 is what we need. */
-static ma_uint32   g_DeviceTrackingInitCounter_CoreAudio = 0;
-static ma_mutex    g_DeviceTrackingMutex_CoreAudio;
-static ma_device** g_ppTrackedDevices_CoreAudio = NULL;
-static ma_uint32   g_TrackedDeviceCap_CoreAudio = 0;
-static ma_uint32   g_TrackedDeviceCount_CoreAudio = 0;
-
-static OSStatus ma_default_device_changed__coreaudio(AudioObjectID objectID, UInt32 addressCount, const AudioObjectPropertyAddress* pAddresses, void* pUserData)
-{
-    ma_device_type deviceType;
-
-    /* Not sure if I really need to check this, but it makes me feel better. */
-    if (addressCount == 0) {
-        return noErr;
-    }
-
-    if (pAddresses[0].mSelector == kAudioHardwarePropertyDefaultOutputDevice) {
-        deviceType = ma_device_type_playback;
-    } else if (pAddresses[0].mSelector == kAudioHardwarePropertyDefaultInputDevice) {
-        deviceType = ma_device_type_capture;
-    } else {
-        return noErr;   /* Should never hit this. */
-    }
-
-    ma_mutex_lock(&g_DeviceTrackingMutex_CoreAudio);
-    {
-        ma_uint32 iDevice;
-        for (iDevice = 0; iDevice < g_TrackedDeviceCount_CoreAudio; iDevice += 1) {
-            ma_result reinitResult;
-            ma_device* pDevice;
-
-            pDevice = g_ppTrackedDevices_CoreAudio[iDevice];
-            if (pDevice->type == deviceType || pDevice->type == ma_device_type_duplex) {
-                if (deviceType == ma_device_type_playback) {
-                    pDevice->coreaudio.isSwitchingPlaybackDevice = MA_TRUE;
-                    reinitResult = ma_device_reinit_internal__coreaudio(pDevice, deviceType, MA_TRUE);
-                    pDevice->coreaudio.isSwitchingPlaybackDevice = MA_FALSE;
-                } else {
-                    pDevice->coreaudio.isSwitchingCaptureDevice = MA_TRUE;
-                    reinitResult = ma_device_reinit_internal__coreaudio(pDevice, deviceType, MA_TRUE);
-                    pDevice->coreaudio.isSwitchingCaptureDevice = MA_FALSE;
-                }
-
-                if (reinitResult == MA_SUCCESS) {
-                    ma_device__post_init_setup(pDevice, deviceType);
-
-                    /* Restart the device if required. If this fails we need to stop the device entirely. */
-                    if (ma_device_get_state(pDevice) == ma_device_state_started) {
-                        OSStatus status;
-                        if (deviceType == ma_device_type_playback) {
-                            status = ((ma_AudioOutputUnitStart_proc)pDevice->pContext->coreaudio.AudioOutputUnitStart)((AudioUnit)pDevice->coreaudio.audioUnitPlayback);
-                            if (status != noErr) {
-                                if (pDevice->type == ma_device_type_duplex) {
-                                    ((ma_AudioOutputUnitStop_proc)pDevice->pContext->coreaudio.AudioOutputUnitStop)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-                                }
-                                ma_device__set_state(pDevice, ma_device_state_stopped);
-                            }
-                        } else if (deviceType == ma_device_type_capture) {
-                            status = ((ma_AudioOutputUnitStart_proc)pDevice->pContext->coreaudio.AudioOutputUnitStart)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-                            if (status != noErr) {
-                                if (pDevice->type == ma_device_type_duplex) {
-                                    ((ma_AudioOutputUnitStop_proc)pDevice->pContext->coreaudio.AudioOutputUnitStop)((AudioUnit)pDevice->coreaudio.audioUnitPlayback);
-                                }
-                                ma_device__set_state(pDevice, ma_device_state_stopped);
-                            }
-                        }
-                    }
-
-                    ma_device__on_notification_rerouted(pDevice);
-                }
-            }
-        }
-    }
-    ma_mutex_unlock(&g_DeviceTrackingMutex_CoreAudio);
-
-    /* Unused parameters. */
-    (void)objectID;
-    (void)pUserData;
-
-    return noErr;
-}
-
-static ma_result ma_context__init_device_tracking__coreaudio(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-
-    ma_spinlock_lock(&g_DeviceTrackingInitLock_CoreAudio);
-    {
-        /* Don't do anything if we've already initializd device tracking. */
-        if (g_DeviceTrackingInitCounter_CoreAudio == 0) {
-            AudioObjectPropertyAddress propAddress;
-            propAddress.mScope    = kAudioObjectPropertyScopeGlobal;
-            propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-            ma_mutex_init(&g_DeviceTrackingMutex_CoreAudio);
-
-            propAddress.mSelector = kAudioHardwarePropertyDefaultInputDevice;
-            ((ma_AudioObjectAddPropertyListener_proc)pContext->coreaudio.AudioObjectAddPropertyListener)(kAudioObjectSystemObject, &propAddress, &ma_default_device_changed__coreaudio, NULL);
-
-            propAddress.mSelector = kAudioHardwarePropertyDefaultOutputDevice;
-            ((ma_AudioObjectAddPropertyListener_proc)pContext->coreaudio.AudioObjectAddPropertyListener)(kAudioObjectSystemObject, &propAddress, &ma_default_device_changed__coreaudio, NULL);
-
-        }
-        g_DeviceTrackingInitCounter_CoreAudio += 1;
-    }
-    ma_spinlock_unlock(&g_DeviceTrackingInitLock_CoreAudio);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context__uninit_device_tracking__coreaudio(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-
-    ma_spinlock_lock(&g_DeviceTrackingInitLock_CoreAudio);
-    {
-        if (g_DeviceTrackingInitCounter_CoreAudio > 0)
-            g_DeviceTrackingInitCounter_CoreAudio -= 1;
-
-        if (g_DeviceTrackingInitCounter_CoreAudio == 0) {
-            AudioObjectPropertyAddress propAddress;
-            propAddress.mScope    = kAudioObjectPropertyScopeGlobal;
-            propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-            propAddress.mSelector = kAudioHardwarePropertyDefaultInputDevice;
-            ((ma_AudioObjectRemovePropertyListener_proc)pContext->coreaudio.AudioObjectRemovePropertyListener)(kAudioObjectSystemObject, &propAddress, &ma_default_device_changed__coreaudio, NULL);
-
-            propAddress.mSelector = kAudioHardwarePropertyDefaultOutputDevice;
-            ((ma_AudioObjectRemovePropertyListener_proc)pContext->coreaudio.AudioObjectRemovePropertyListener)(kAudioObjectSystemObject, &propAddress, &ma_default_device_changed__coreaudio, NULL);
-
-            /* At this point there should be no tracked devices. If not there's an error somewhere. */
-            if (g_ppTrackedDevices_CoreAudio != NULL) {
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_WARNING, "You have uninitialized all contexts while an associated device is still active.");
-                ma_spinlock_unlock(&g_DeviceTrackingInitLock_CoreAudio);
-                return MA_INVALID_OPERATION;
-            }
-
-            ma_mutex_uninit(&g_DeviceTrackingMutex_CoreAudio);
-        }
-    }
-    ma_spinlock_unlock(&g_DeviceTrackingInitLock_CoreAudio);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device__track__coreaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    ma_mutex_lock(&g_DeviceTrackingMutex_CoreAudio);
-    {
-        /* Allocate memory if required. */
-        if (g_TrackedDeviceCap_CoreAudio <= g_TrackedDeviceCount_CoreAudio) {
-            ma_uint32 newCap;
-            ma_device** ppNewDevices;
-
-            newCap = g_TrackedDeviceCap_CoreAudio * 2;
-            if (newCap == 0) {
-                newCap = 1;
-            }
-
-            ppNewDevices = (ma_device**)ma_realloc(g_ppTrackedDevices_CoreAudio, sizeof(*g_ppTrackedDevices_CoreAudio)*newCap, &pDevice->pContext->allocationCallbacks);
-            if (ppNewDevices == NULL) {
-                ma_mutex_unlock(&g_DeviceTrackingMutex_CoreAudio);
-                return MA_OUT_OF_MEMORY;
-            }
-
-            g_ppTrackedDevices_CoreAudio = ppNewDevices;
-            g_TrackedDeviceCap_CoreAudio = newCap;
-        }
-
-        g_ppTrackedDevices_CoreAudio[g_TrackedDeviceCount_CoreAudio] = pDevice;
-        g_TrackedDeviceCount_CoreAudio += 1;
-    }
-    ma_mutex_unlock(&g_DeviceTrackingMutex_CoreAudio);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device__untrack__coreaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    ma_mutex_lock(&g_DeviceTrackingMutex_CoreAudio);
-    {
-        ma_uint32 iDevice;
-        for (iDevice = 0; iDevice < g_TrackedDeviceCount_CoreAudio; iDevice += 1) {
-            if (g_ppTrackedDevices_CoreAudio[iDevice] == pDevice) {
-                /* We've found the device. We now need to remove it from the list. */
-                ma_uint32 jDevice;
-                for (jDevice = iDevice; jDevice < g_TrackedDeviceCount_CoreAudio-1; jDevice += 1) {
-                    g_ppTrackedDevices_CoreAudio[jDevice] = g_ppTrackedDevices_CoreAudio[jDevice+1];
-                }
-
-                g_TrackedDeviceCount_CoreAudio -= 1;
-
-                /* If there's nothing else in the list we need to free memory. */
-                if (g_TrackedDeviceCount_CoreAudio == 0) {
-                    ma_free(g_ppTrackedDevices_CoreAudio, &pDevice->pContext->allocationCallbacks);
-                    g_ppTrackedDevices_CoreAudio = NULL;
-                    g_TrackedDeviceCap_CoreAudio = 0;
-                }
-
-                break;
-            }
-        }
-    }
-    ma_mutex_unlock(&g_DeviceTrackingMutex_CoreAudio);
-
-    return MA_SUCCESS;
-}
-#endif
-
-#if defined(MA_APPLE_MOBILE)
-@interface ma_ios_notification_handler:NSObject {
-    ma_device* m_pDevice;
-}
-@end
-
-@implementation ma_ios_notification_handler
--(id)init:(ma_device*)pDevice
-{
-    self = [super init];
-    m_pDevice = pDevice;
-
-    /* For route changes. */
-    [[NSNotificationCenter defaultCenter] addObserver:self selector:@selector(handle_route_change:) name:AVAudioSessionRouteChangeNotification object:[AVAudioSession sharedInstance]];
-
-    /* For interruptions. */
-    [[NSNotificationCenter defaultCenter] addObserver:self selector:@selector(handle_interruption:) name:AVAudioSessionInterruptionNotification object:[AVAudioSession sharedInstance]];
-
-    return self;
-}
-
--(void)dealloc
-{
-    [self remove_handler];
-
-    #if defined(__has_feature)
-        #if !__has_feature(objc_arc)
-            [super dealloc];
-        #endif
-    #endif
-}
-
--(void)remove_handler
-{
-    [[NSNotificationCenter defaultCenter] removeObserver:self name:AVAudioSessionRouteChangeNotification object:nil];
-    [[NSNotificationCenter defaultCenter] removeObserver:self name:AVAudioSessionInterruptionNotification object:nil];
-}
-
--(void)handle_interruption:(NSNotification*)pNotification
-{
-    NSInteger type = [[[pNotification userInfo] objectForKey:AVAudioSessionInterruptionTypeKey] integerValue];
-    switch (type)
-    {
-        case AVAudioSessionInterruptionTypeBegan:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Interruption: AVAudioSessionInterruptionTypeBegan\n");
-
-            /*
-            Core Audio will have stopped the internal device automatically, but we need explicitly
-            stop it at a higher level to ensure miniaudio-specific state is updated for consistency.
-            */
-            ma_device_stop(m_pDevice);
-
-            /*
-            Fire the notification after the device has been stopped to ensure it's in the correct
-            state when the notification handler is invoked.
-            */
-            ma_device__on_notification_interruption_began(m_pDevice);
-        } break;
-
-        case AVAudioSessionInterruptionTypeEnded:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Interruption: AVAudioSessionInterruptionTypeEnded\n");
-            ma_device__on_notification_interruption_ended(m_pDevice);
-        } break;
-    }
-}
-
--(void)handle_route_change:(NSNotification*)pNotification
-{
-    AVAudioSession* pSession = [AVAudioSession sharedInstance];
-
-    NSInteger reason = [[[pNotification userInfo] objectForKey:AVAudioSessionRouteChangeReasonKey] integerValue];
-    switch (reason)
-    {
-        case AVAudioSessionRouteChangeReasonOldDeviceUnavailable:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Route Changed: AVAudioSessionRouteChangeReasonOldDeviceUnavailable\n");
-        } break;
-
-        case AVAudioSessionRouteChangeReasonNewDeviceAvailable:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Route Changed: AVAudioSessionRouteChangeReasonNewDeviceAvailable\n");
-        } break;
-
-        case AVAudioSessionRouteChangeReasonNoSuitableRouteForCategory:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Route Changed: AVAudioSessionRouteChangeReasonNoSuitableRouteForCategory\n");
-        } break;
-
-        case AVAudioSessionRouteChangeReasonWakeFromSleep:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Route Changed: AVAudioSessionRouteChangeReasonWakeFromSleep\n");
-        } break;
-
-        case AVAudioSessionRouteChangeReasonOverride:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Route Changed: AVAudioSessionRouteChangeReasonOverride\n");
-        } break;
-
-        case AVAudioSessionRouteChangeReasonCategoryChange:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Route Changed: AVAudioSessionRouteChangeReasonCategoryChange\n");
-        } break;
-
-        case AVAudioSessionRouteChangeReasonUnknown:
-        default:
-        {
-            ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_INFO, "[Core Audio] Route Changed: AVAudioSessionRouteChangeReasonUnknown\n");
-        } break;
-    }
-
-    ma_log_postf(ma_device_get_log(m_pDevice), MA_LOG_LEVEL_DEBUG, "[Core Audio] Changing Route. inputNumberChannels=%d; outputNumberOfChannels=%d\n", (int)pSession.inputNumberOfChannels, (int)pSession.outputNumberOfChannels);
-
-    /* Let the application know about the route change. */
-    ma_device__on_notification_rerouted(m_pDevice);
-}
-@end
-#endif
-
-static ma_result ma_device_uninit__coreaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(ma_device_get_state(pDevice) == ma_device_state_uninitialized);
-
-#if defined(MA_APPLE_DESKTOP)
-    /*
-    Make sure we're no longer tracking the device. It doesn't matter if we call this for a non-default device because it'll
-    just gracefully ignore it.
-    */
-    ma_device__untrack__coreaudio(pDevice);
-#endif
-#if defined(MA_APPLE_MOBILE)
-    if (pDevice->coreaudio.pNotificationHandler != NULL) {
-        ma_ios_notification_handler* pNotificationHandler = (MA_BRIDGE_TRANSFER ma_ios_notification_handler*)pDevice->coreaudio.pNotificationHandler;
-        [pNotificationHandler remove_handler];
-    }
-#endif
-
-    if (pDevice->coreaudio.audioUnitCapture != NULL) {
-        ((ma_AudioComponentInstanceDispose_proc)pDevice->pContext->coreaudio.AudioComponentInstanceDispose)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-    }
-    if (pDevice->coreaudio.audioUnitPlayback != NULL) {
-        ((ma_AudioComponentInstanceDispose_proc)pDevice->pContext->coreaudio.AudioComponentInstanceDispose)((AudioUnit)pDevice->coreaudio.audioUnitPlayback);
-    }
-
-    if (pDevice->coreaudio.pAudioBufferList) {
-        ma_free(pDevice->coreaudio.pAudioBufferList, &pDevice->pContext->allocationCallbacks);
-    }
-
-    return MA_SUCCESS;
-}
-
-typedef struct
-{
-    ma_bool32 allowNominalSampleRateChange;
-
-    /* Input. */
-    ma_format formatIn;
-    ma_uint32 channelsIn;
-    ma_uint32 sampleRateIn;
-    ma_channel channelMapIn[MA_MAX_CHANNELS];
-    ma_uint32 periodSizeInFramesIn;
-    ma_uint32 periodSizeInMillisecondsIn;
-    ma_uint32 periodsIn;
-    ma_share_mode shareMode;
-    ma_performance_profile performanceProfile;
-    ma_bool32 registerStopEvent;
-
-    /* Output. */
-#if defined(MA_APPLE_DESKTOP)
-    AudioObjectID deviceObjectID;
-#endif
-    AudioComponent component;
-    AudioUnit audioUnit;
-    AudioBufferList* pAudioBufferList;  /* Only used for input devices. */
-    ma_format formatOut;
-    ma_uint32 channelsOut;
-    ma_uint32 sampleRateOut;
-    ma_channel channelMapOut[MA_MAX_CHANNELS];
-    ma_uint32 periodSizeInFramesOut;
-    ma_uint32 periodsOut;
-    char deviceName[256];
-} ma_device_init_internal_data__coreaudio;
-
-static ma_result ma_device_init_internal__coreaudio(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_init_internal_data__coreaudio* pData, void* pDevice_DoNotReference)   /* <-- pDevice is typed as void* intentionally so as to avoid accidentally referencing it. */
-{
-    ma_result result;
-    OSStatus status;
-    UInt32 enableIOFlag;
-    AudioStreamBasicDescription bestFormat;
-    UInt32 actualPeriodSizeInFrames;
-    AURenderCallbackStruct callbackInfo;
-#if defined(MA_APPLE_DESKTOP)
-    AudioObjectID deviceObjectID;
-#endif
-
-    /* This API should only be used for a single device type: playback or capture. No full-duplex mode. */
-    if (deviceType == ma_device_type_duplex) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(deviceType == ma_device_type_playback || deviceType == ma_device_type_capture);
-
-#if defined(MA_APPLE_DESKTOP)
-    pData->deviceObjectID = 0;
-#endif
-    pData->component = NULL;
-    pData->audioUnit = NULL;
-    pData->pAudioBufferList = NULL;
-
-#if defined(MA_APPLE_DESKTOP)
-    result = ma_find_AudioObjectID(pContext, deviceType, pDeviceID, &deviceObjectID);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pData->deviceObjectID = deviceObjectID;
-#endif
-
-    /* Core audio doesn't really use the notion of a period so we can leave this unmodified, but not too over the top. */
-    pData->periodsOut = pData->periodsIn;
-    if (pData->periodsOut == 0) {
-        pData->periodsOut = MA_DEFAULT_PERIODS;
-    }
-    if (pData->periodsOut > 16) {
-        pData->periodsOut = 16;
-    }
-
-
-    /* Audio unit. */
-    status = ((ma_AudioComponentInstanceNew_proc)pContext->coreaudio.AudioComponentInstanceNew)((AudioComponent)pContext->coreaudio.component, (AudioUnit*)&pData->audioUnit);
-    if (status != noErr) {
-        return ma_result_from_OSStatus(status);
-    }
-
-
-    /* The input/output buses need to be explicitly enabled and disabled. We set the flag based on the output unit first, then we just swap it for input. */
-    enableIOFlag = 1;
-    if (deviceType == ma_device_type_capture) {
-        enableIOFlag = 0;
-    }
-
-    status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Output, MA_COREAUDIO_OUTPUT_BUS, &enableIOFlag, sizeof(enableIOFlag));
-    if (status != noErr) {
-        ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-        return ma_result_from_OSStatus(status);
-    }
-
-    enableIOFlag = (enableIOFlag == 0) ? 1 : 0;
-    status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Input, MA_COREAUDIO_INPUT_BUS, &enableIOFlag, sizeof(enableIOFlag));
-    if (status != noErr) {
-        ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-        return ma_result_from_OSStatus(status);
-    }
-
-
-    /* Set the device to use with this audio unit. This is only used on desktop since we are using defaults on mobile. */
-#if defined(MA_APPLE_DESKTOP)
-    status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0, &deviceObjectID, sizeof(deviceObjectID));
-    if (status != noErr) {
-        ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-        return ma_result_from_OSStatus(result);
-    }
-#else
-    /*
-    For some reason it looks like Apple is only allowing selection of the input device. There does not appear to be any way to change
-    the default output route. I have no idea why this is like this, but for now we'll only be able to configure capture devices.
-    */
-    if (pDeviceID != NULL) {
-        if (deviceType == ma_device_type_capture) {
-            ma_bool32 found = MA_FALSE;
-            NSArray *pInputs = [[[AVAudioSession sharedInstance] currentRoute] inputs];
-            for (AVAudioSessionPortDescription* pPortDesc in pInputs) {
-                if (strcmp(pDeviceID->coreaudio, [pPortDesc.UID UTF8String]) == 0) {
-                    [[AVAudioSession sharedInstance] setPreferredInput:pPortDesc error:nil];
-                    found = MA_TRUE;
-                    break;
-                }
-            }
-
-            if (found == MA_FALSE) {
-                return MA_DOES_NOT_EXIST;
-            }
-        }
-    }
-#endif
-
-    /*
-    Format. This is the hardest part of initialization because there's a few variables to take into account.
-      1) The format must be supported by the device.
-      2) The format must be supported miniaudio.
-      3) There's a priority that miniaudio prefers.
-
-    Ideally we would like to use a format that's as close to the hardware as possible so we can get as close to a passthrough as possible. The
-    most important property is the sample rate. miniaudio can do format conversion for any sample rate and channel count, but cannot do the same
-    for the sample data format. If the sample data format is not supported by miniaudio it must be ignored completely.
-
-    On mobile platforms this is a bit different. We just force the use of whatever the audio unit's current format is set to.
-    */
-    {
-        AudioStreamBasicDescription origFormat;
-        UInt32 origFormatSize = sizeof(origFormat);
-        AudioUnitScope   formatScope   = (deviceType == ma_device_type_playback) ? kAudioUnitScope_Input : kAudioUnitScope_Output;
-        AudioUnitElement formatElement = (deviceType == ma_device_type_playback) ? MA_COREAUDIO_OUTPUT_BUS : MA_COREAUDIO_INPUT_BUS;
-
-        if (deviceType == ma_device_type_playback) {
-            status = ((ma_AudioUnitGetProperty_proc)pContext->coreaudio.AudioUnitGetProperty)(pData->audioUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, MA_COREAUDIO_OUTPUT_BUS, &origFormat, &origFormatSize);
-        } else {
-            status = ((ma_AudioUnitGetProperty_proc)pContext->coreaudio.AudioUnitGetProperty)(pData->audioUnit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, MA_COREAUDIO_INPUT_BUS, &origFormat, &origFormatSize);
-        }
-        if (status != noErr) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return ma_result_from_OSStatus(status);
-        }
-
-    #if defined(MA_APPLE_DESKTOP)
-        result = ma_find_best_format__coreaudio(pContext, deviceObjectID, deviceType, pData->formatIn, pData->channelsIn, pData->sampleRateIn, &origFormat, &bestFormat);
-        if (result != MA_SUCCESS) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return result;
-        }
-
-        /*
-        Technical Note TN2091: Device input using the HAL Output Audio Unit
-            https://developer.apple.com/library/archive/technotes/tn2091/_index.html
-
-        This documentation says the following:
-
-            The internal AudioConverter can handle any *simple* conversion. Typically, this means that a client can specify ANY
-            variant of the PCM formats. Consequently, the device's sample rate should match the desired sample rate. If sample rate
-            conversion is needed, it can be accomplished by buffering the input and converting the data on a separate thread with
-            another AudioConverter.
-
-        The important part here is the mention that it can handle *simple* conversions, which does *not* include sample rate. We
-        therefore want to ensure the sample rate stays consistent. This document is specifically for input, but I'm going to play it
-        safe and apply the same rule to output as well.
-
-        I have tried going against the documentation by setting the sample rate anyway, but this just results in AudioUnitRender()
-        returning a result code of -10863. I have also tried changing the format directly on the input scope on the input bus, but
-        this just results in `ca_require: IsStreamFormatWritable(inScope, inElement) NotWritable` when trying to set the format.
-
-        Something that does seem to work, however, has been setting the nominal sample rate on the deivce object. The problem with
-        this, however, is that it actually changes the sample rate at the operating system level and not just the application. This
-        could be intrusive to the user, however, so I don't think it's wise to make this the default. Instead I'm making this a
-        configuration option. When the `coreaudio.allowNominalSampleRateChange` config option is set to true, changing the sample
-        rate will be allowed. Otherwise it'll be fixed to the current sample rate. To check the system-defined sample rate, run
-        the Audio MIDI Setup program that comes installed on macOS and observe how the sample rate changes as the sample rate is
-        changed by miniaudio.
-        */
-        if (pData->allowNominalSampleRateChange) {
-            AudioValueRange sampleRateRange;
-            AudioObjectPropertyAddress propAddress;
-
-            sampleRateRange.mMinimum = bestFormat.mSampleRate;
-            sampleRateRange.mMaximum = bestFormat.mSampleRate;
-
-            propAddress.mSelector = kAudioDevicePropertyNominalSampleRate;
-            propAddress.mScope    = (deviceType == ma_device_type_playback) ? kAudioObjectPropertyScopeOutput : kAudioObjectPropertyScopeInput;
-            propAddress.mElement  = AUDIO_OBJECT_PROPERTY_ELEMENT;
-
-            status = ((ma_AudioObjectSetPropertyData_proc)pContext->coreaudio.AudioObjectSetPropertyData)(deviceObjectID, &propAddress, 0, NULL, sizeof(sampleRateRange), &sampleRateRange);
-            if (status != noErr) {
-                bestFormat.mSampleRate = origFormat.mSampleRate;
-            }
-        } else {
-            bestFormat.mSampleRate = origFormat.mSampleRate;
-        }
-
-        status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioUnitProperty_StreamFormat, formatScope, formatElement, &bestFormat, sizeof(bestFormat));
-        if (status != noErr) {
-            /* We failed to set the format, so fall back to the current format of the audio unit. */
-            bestFormat = origFormat;
-        }
-    #else
-        bestFormat = origFormat;
-
-        /*
-        Sample rate is a little different here because for some reason kAudioUnitProperty_StreamFormat returns 0... Oh well. We need to instead try
-        setting the sample rate to what the user has requested and then just see the results of it. Need to use some Objective-C here for this since
-        it depends on Apple's AVAudioSession API. To do this we just get the shared AVAudioSession instance and then set it. Note that from what I
-        can tell, it looks like the sample rate is shared between playback and capture for everything.
-        */
-        @autoreleasepool {
-            AVAudioSession* pAudioSession = [AVAudioSession sharedInstance];
-            MA_ASSERT(pAudioSession != NULL);
-
-            [pAudioSession setPreferredSampleRate:(double)pData->sampleRateIn error:nil];
-            bestFormat.mSampleRate = pAudioSession.sampleRate;
-
-            /*
-            I've had a report that the channel count returned by AudioUnitGetProperty above is inconsistent with
-            AVAudioSession outputNumberOfChannels. I'm going to try using the AVAudioSession values instead.
-            */
-            if (deviceType == ma_device_type_playback) {
-                bestFormat.mChannelsPerFrame = (UInt32)pAudioSession.outputNumberOfChannels;
-            }
-            if (deviceType == ma_device_type_capture) {
-                bestFormat.mChannelsPerFrame = (UInt32)pAudioSession.inputNumberOfChannels;
-            }
-        }
-
-        status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioUnitProperty_StreamFormat, formatScope, formatElement, &bestFormat, sizeof(bestFormat));
-        if (status != noErr) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return ma_result_from_OSStatus(status);
-        }
-    #endif
-
-        result = ma_format_from_AudioStreamBasicDescription(&bestFormat, &pData->formatOut);
-        if (result != MA_SUCCESS) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return result;
-        }
-
-        if (pData->formatOut == ma_format_unknown) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return MA_FORMAT_NOT_SUPPORTED;
-        }
-
-        pData->channelsOut   = bestFormat.mChannelsPerFrame;
-        pData->sampleRateOut = bestFormat.mSampleRate;
-    }
-
-    /* Clamp the channel count for safety. */
-    if (pData->channelsOut > MA_MAX_CHANNELS) {
-        pData->channelsOut = MA_MAX_CHANNELS;
-    }
-
-    /*
-    Internal channel map. This is weird in my testing. If I use the AudioObject to get the
-    channel map, the channel descriptions are set to "Unknown" for some reason. To work around
-    this it looks like retrieving it from the AudioUnit will work. However, and this is where
-    it gets weird, it doesn't seem to work with capture devices, nor at all on iOS... Therefore
-    I'm going to fall back to a default assumption in these cases.
-    */
-#if defined(MA_APPLE_DESKTOP)
-    result = ma_get_AudioUnit_channel_map(pContext, pData->audioUnit, deviceType, pData->channelMapOut, pData->channelsOut);
-    if (result != MA_SUCCESS) {
-    #if 0
-        /* Try falling back to the channel map from the AudioObject. */
-        result = ma_get_AudioObject_channel_map(pContext, deviceObjectID, deviceType, pData->channelMapOut, pData->channelsOut);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    #else
-        /* Fall back to default assumptions. */
-        ma_channel_map_init_standard(ma_standard_channel_map_default, pData->channelMapOut, ma_countof(pData->channelMapOut), pData->channelsOut);
-    #endif
-    }
-#else
-    /* TODO: Figure out how to get the channel map using AVAudioSession. */
-    ma_channel_map_init_standard(ma_standard_channel_map_default, pData->channelMapOut, ma_countof(pData->channelMapOut), pData->channelsOut);
-#endif
-
-
-    /* Buffer size. Not allowing this to be configurable on iOS. */
-    if (pData->periodSizeInFramesIn == 0) {
-        if (pData->periodSizeInMillisecondsIn == 0) {
-            if (pData->performanceProfile == ma_performance_profile_low_latency) {
-                actualPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_LOW_LATENCY, pData->sampleRateOut);
-            } else {
-                actualPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_CONSERVATIVE, pData->sampleRateOut);
-            }
-        } else {
-            actualPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(pData->periodSizeInMillisecondsIn, pData->sampleRateOut);
-        }
-    } else {
-        actualPeriodSizeInFrames = pData->periodSizeInFramesIn;
-    }
-
-#if defined(MA_APPLE_DESKTOP)
-    result = ma_set_AudioObject_buffer_size_in_frames(pContext, deviceObjectID, deviceType, &actualPeriodSizeInFrames);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-#else
-    /*
-    On iOS, the size of the IO buffer needs to be specified in seconds and is a floating point
-    number. I don't trust any potential truncation errors due to converting from float to integer
-    so I'm going to explicitly set the actual period size to the next power of 2.
-    */
-    @autoreleasepool {
-        AVAudioSession* pAudioSession = [AVAudioSession sharedInstance];
-        MA_ASSERT(pAudioSession != NULL);
-
-        [pAudioSession setPreferredIOBufferDuration:((float)actualPeriodSizeInFrames / pAudioSession.sampleRate) error:nil];
-        actualPeriodSizeInFrames = ma_next_power_of_2((ma_uint32)(pAudioSession.IOBufferDuration * pAudioSession.sampleRate));
-    }
-#endif
-
-
-    /*
-    During testing I discovered that the buffer size can be too big. You'll get an error like this:
-
-      kAudioUnitErr_TooManyFramesToProcess : inFramesToProcess=4096, mMaxFramesPerSlice=512
-
-    Note how inFramesToProcess is smaller than mMaxFramesPerSlice. To fix, we need to set kAudioUnitProperty_MaximumFramesPerSlice to that
-    of the size of our buffer, or do it the other way around and set our buffer size to the kAudioUnitProperty_MaximumFramesPerSlice.
-    */
-    status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioUnitProperty_MaximumFramesPerSlice, kAudioUnitScope_Global, 0, &actualPeriodSizeInFrames, sizeof(actualPeriodSizeInFrames));
-    if (status != noErr) {
-        ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-        return ma_result_from_OSStatus(status);
-    }
-
-    pData->periodSizeInFramesOut = (ma_uint32)actualPeriodSizeInFrames;
-
-    /* We need a buffer list if this is an input device. We render into this in the input callback. */
-    if (deviceType == ma_device_type_capture) {
-        ma_bool32 isInterleaved = (bestFormat.mFormatFlags & kAudioFormatFlagIsNonInterleaved) == 0;
-        AudioBufferList* pBufferList;
-
-        pBufferList = ma_allocate_AudioBufferList__coreaudio(pData->periodSizeInFramesOut, pData->formatOut, pData->channelsOut, (isInterleaved) ? ma_stream_layout_interleaved : ma_stream_layout_deinterleaved, &pContext->allocationCallbacks);
-        if (pBufferList == NULL) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return MA_OUT_OF_MEMORY;
-        }
-
-        pData->pAudioBufferList = pBufferList;
-    }
-
-    /* Callbacks. */
-    callbackInfo.inputProcRefCon = pDevice_DoNotReference;
-    if (deviceType == ma_device_type_playback) {
-        callbackInfo.inputProc = ma_on_output__coreaudio;
-        status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Global, 0, &callbackInfo, sizeof(callbackInfo));
-        if (status != noErr) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return ma_result_from_OSStatus(status);
-        }
-    } else {
-        callbackInfo.inputProc = ma_on_input__coreaudio;
-        status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioOutputUnitProperty_SetInputCallback, kAudioUnitScope_Global, 0, &callbackInfo, sizeof(callbackInfo));
-        if (status != noErr) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return ma_result_from_OSStatus(status);
-        }
-    }
-
-    /* We need to listen for stop events. */
-    if (pData->registerStopEvent) {
-        status = ((ma_AudioUnitAddPropertyListener_proc)pContext->coreaudio.AudioUnitAddPropertyListener)(pData->audioUnit, kAudioOutputUnitProperty_IsRunning, on_start_stop__coreaudio, pDevice_DoNotReference);
-        if (status != noErr) {
-            ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-            return ma_result_from_OSStatus(status);
-        }
-    }
-
-    /* Initialize the audio unit. */
-    status = ((ma_AudioUnitInitialize_proc)pContext->coreaudio.AudioUnitInitialize)(pData->audioUnit);
-    if (status != noErr) {
-        ma_free(pData->pAudioBufferList, &pContext->allocationCallbacks);
-        pData->pAudioBufferList = NULL;
-        ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit);
-        return ma_result_from_OSStatus(status);
-    }
-
-    /* Grab the name. */
-#if defined(MA_APPLE_DESKTOP)
-    ma_get_AudioObject_name(pContext, deviceObjectID, sizeof(pData->deviceName), pData->deviceName);
-#else
-    if (deviceType == ma_device_type_playback) {
-        ma_strcpy_s(pData->deviceName, sizeof(pData->deviceName), MA_DEFAULT_PLAYBACK_DEVICE_NAME);
-    } else {
-        ma_strcpy_s(pData->deviceName, sizeof(pData->deviceName), MA_DEFAULT_CAPTURE_DEVICE_NAME);
-    }
-#endif
-
-    return result;
-}
-
-#if defined(MA_APPLE_DESKTOP)
-static ma_result ma_device_reinit_internal__coreaudio(ma_device* pDevice, ma_device_type deviceType, ma_bool32 disposePreviousAudioUnit)
-{
-    ma_device_init_internal_data__coreaudio data;
-    ma_result result;
-
-    /* This should only be called for playback or capture, not duplex. */
-    if (deviceType == ma_device_type_duplex) {
-        return MA_INVALID_ARGS;
-    }
-
-    data.allowNominalSampleRateChange = MA_FALSE;   /* Don't change the nominal sample rate when switching devices. */
-
-    if (deviceType == ma_device_type_capture) {
-        data.formatIn               = pDevice->capture.format;
-        data.channelsIn             = pDevice->capture.channels;
-        data.sampleRateIn           = pDevice->sampleRate;
-        MA_COPY_MEMORY(data.channelMapIn, pDevice->capture.channelMap, sizeof(pDevice->capture.channelMap));
-        data.shareMode              = pDevice->capture.shareMode;
-        data.performanceProfile     = pDevice->coreaudio.originalPerformanceProfile;
-        data.registerStopEvent      = MA_TRUE;
-
-        if (disposePreviousAudioUnit) {
-            ((ma_AudioOutputUnitStop_proc)pDevice->pContext->coreaudio.AudioOutputUnitStop)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-            ((ma_AudioComponentInstanceDispose_proc)pDevice->pContext->coreaudio.AudioComponentInstanceDispose)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-        }
-        if (pDevice->coreaudio.pAudioBufferList) {
-            ma_free(pDevice->coreaudio.pAudioBufferList, &pDevice->pContext->allocationCallbacks);
-        }
-    } else if (deviceType == ma_device_type_playback) {
-        data.formatIn               = pDevice->playback.format;
-        data.channelsIn             = pDevice->playback.channels;
-        data.sampleRateIn           = pDevice->sampleRate;
-        MA_COPY_MEMORY(data.channelMapIn, pDevice->playback.channelMap, sizeof(pDevice->playback.channelMap));
-        data.shareMode              = pDevice->playback.shareMode;
-        data.performanceProfile     = pDevice->coreaudio.originalPerformanceProfile;
-        data.registerStopEvent      = (pDevice->type != ma_device_type_duplex);
-
-        if (disposePreviousAudioUnit) {
-            ((ma_AudioOutputUnitStop_proc)pDevice->pContext->coreaudio.AudioOutputUnitStop)((AudioUnit)pDevice->coreaudio.audioUnitPlayback);
-            ((ma_AudioComponentInstanceDispose_proc)pDevice->pContext->coreaudio.AudioComponentInstanceDispose)((AudioUnit)pDevice->coreaudio.audioUnitPlayback);
-        }
-    }
-    data.periodSizeInFramesIn       = pDevice->coreaudio.originalPeriodSizeInFrames;
-    data.periodSizeInMillisecondsIn = pDevice->coreaudio.originalPeriodSizeInMilliseconds;
-    data.periodsIn                  = pDevice->coreaudio.originalPeriods;
-
-    /* Need at least 3 periods for duplex. */
-    if (data.periodsIn < 3 && pDevice->type == ma_device_type_duplex) {
-        data.periodsIn = 3;
-    }
-
-    result = ma_device_init_internal__coreaudio(pDevice->pContext, deviceType, NULL, &data, (void*)pDevice);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (deviceType == ma_device_type_capture) {
-    #if defined(MA_APPLE_DESKTOP)
-        pDevice->coreaudio.deviceObjectIDCapture     = (ma_uint32)data.deviceObjectID;
-        ma_get_AudioObject_uid(pDevice->pContext, pDevice->coreaudio.deviceObjectIDCapture, sizeof(pDevice->capture.id.coreaudio), pDevice->capture.id.coreaudio);
-    #endif
-        pDevice->coreaudio.audioUnitCapture          = (ma_ptr)data.audioUnit;
-        pDevice->coreaudio.pAudioBufferList          = (ma_ptr)data.pAudioBufferList;
-        pDevice->coreaudio.audioBufferCapInFrames    = data.periodSizeInFramesOut;
-
-        pDevice->capture.internalFormat              = data.formatOut;
-        pDevice->capture.internalChannels            = data.channelsOut;
-        pDevice->capture.internalSampleRate          = data.sampleRateOut;
-        MA_COPY_MEMORY(pDevice->capture.internalChannelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDevice->capture.internalPeriodSizeInFrames  = data.periodSizeInFramesOut;
-        pDevice->capture.internalPeriods             = data.periodsOut;
-    } else if (deviceType == ma_device_type_playback) {
-    #if defined(MA_APPLE_DESKTOP)
-        pDevice->coreaudio.deviceObjectIDPlayback    = (ma_uint32)data.deviceObjectID;
-        ma_get_AudioObject_uid(pDevice->pContext, pDevice->coreaudio.deviceObjectIDPlayback, sizeof(pDevice->playback.id.coreaudio), pDevice->playback.id.coreaudio);
-    #endif
-        pDevice->coreaudio.audioUnitPlayback         = (ma_ptr)data.audioUnit;
-
-        pDevice->playback.internalFormat             = data.formatOut;
-        pDevice->playback.internalChannels           = data.channelsOut;
-        pDevice->playback.internalSampleRate         = data.sampleRateOut;
-        MA_COPY_MEMORY(pDevice->playback.internalChannelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDevice->playback.internalPeriodSizeInFrames = data.periodSizeInFramesOut;
-        pDevice->playback.internalPeriods            = data.periodsOut;
-    }
-
-    return MA_SUCCESS;
-}
-#endif /* MA_APPLE_DESKTOP */
-
-static ma_result ma_device_init__coreaudio(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pConfig != NULL);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /* No exclusive mode with the Core Audio backend for now. */
-    if (((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pDescriptorCapture->shareMode  == ma_share_mode_exclusive) ||
-        ((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pDescriptorPlayback->shareMode == ma_share_mode_exclusive)) {
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-
-    /* Capture needs to be initialized first. */
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_device_init_internal_data__coreaudio data;
-        data.allowNominalSampleRateChange = pConfig->coreaudio.allowNominalSampleRateChange;
-        data.formatIn                     = pDescriptorCapture->format;
-        data.channelsIn                   = pDescriptorCapture->channels;
-        data.sampleRateIn                 = pDescriptorCapture->sampleRate;
-        MA_COPY_MEMORY(data.channelMapIn, pDescriptorCapture->channelMap, sizeof(pDescriptorCapture->channelMap));
-        data.periodSizeInFramesIn         = pDescriptorCapture->periodSizeInFrames;
-        data.periodSizeInMillisecondsIn   = pDescriptorCapture->periodSizeInMilliseconds;
-        data.periodsIn                    = pDescriptorCapture->periodCount;
-        data.shareMode                    = pDescriptorCapture->shareMode;
-        data.performanceProfile           = pConfig->performanceProfile;
-        data.registerStopEvent            = MA_TRUE;
-
-        /* Need at least 3 periods for duplex. */
-        if (data.periodsIn < 3 && pConfig->deviceType == ma_device_type_duplex) {
-            data.periodsIn = 3;
-        }
-
-        result = ma_device_init_internal__coreaudio(pDevice->pContext, ma_device_type_capture, pDescriptorCapture->pDeviceID, &data, (void*)pDevice);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pDevice->coreaudio.isDefaultCaptureDevice           = (pConfig->capture.pDeviceID == NULL);
-    #if defined(MA_APPLE_DESKTOP)
-        pDevice->coreaudio.deviceObjectIDCapture            = (ma_uint32)data.deviceObjectID;
-    #endif
-        pDevice->coreaudio.audioUnitCapture                 = (ma_ptr)data.audioUnit;
-        pDevice->coreaudio.pAudioBufferList                 = (ma_ptr)data.pAudioBufferList;
-        pDevice->coreaudio.audioBufferCapInFrames           = data.periodSizeInFramesOut;
-        pDevice->coreaudio.originalPeriodSizeInFrames       = pDescriptorCapture->periodSizeInFrames;
-        pDevice->coreaudio.originalPeriodSizeInMilliseconds = pDescriptorCapture->periodSizeInMilliseconds;
-        pDevice->coreaudio.originalPeriods                  = pDescriptorCapture->periodCount;
-        pDevice->coreaudio.originalPerformanceProfile       = pConfig->performanceProfile;
-
-        pDescriptorCapture->format                          = data.formatOut;
-        pDescriptorCapture->channels                        = data.channelsOut;
-        pDescriptorCapture->sampleRate                      = data.sampleRateOut;
-        MA_COPY_MEMORY(pDescriptorCapture->channelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDescriptorCapture->periodSizeInFrames              = data.periodSizeInFramesOut;
-        pDescriptorCapture->periodCount                     = data.periodsOut;
-
-    #if defined(MA_APPLE_DESKTOP)
-        ma_get_AudioObject_uid(pDevice->pContext, pDevice->coreaudio.deviceObjectIDCapture, sizeof(pDevice->capture.id.coreaudio), pDevice->capture.id.coreaudio);
-
-        /*
-        If we are using the default device we'll need to listen for changes to the system's default device so we can seemlessly
-        switch the device in the background.
-        */
-        if (pConfig->capture.pDeviceID == NULL) {
-            ma_device__track__coreaudio(pDevice);
-        }
-    #endif
-    }
-
-    /* Playback. */
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_device_init_internal_data__coreaudio data;
-        data.allowNominalSampleRateChange   = pConfig->coreaudio.allowNominalSampleRateChange;
-        data.formatIn                       = pDescriptorPlayback->format;
-        data.channelsIn                     = pDescriptorPlayback->channels;
-        data.sampleRateIn                   = pDescriptorPlayback->sampleRate;
-        MA_COPY_MEMORY(data.channelMapIn, pDescriptorPlayback->channelMap, sizeof(pDescriptorPlayback->channelMap));
-        data.shareMode                      = pDescriptorPlayback->shareMode;
-        data.performanceProfile             = pConfig->performanceProfile;
-
-        /* In full-duplex mode we want the playback buffer to be the same size as the capture buffer. */
-        if (pConfig->deviceType == ma_device_type_duplex) {
-            data.periodSizeInFramesIn       = pDescriptorCapture->periodSizeInFrames;
-            data.periodsIn                  = pDescriptorCapture->periodCount;
-            data.registerStopEvent          = MA_FALSE;
-        } else {
-            data.periodSizeInFramesIn       = pDescriptorPlayback->periodSizeInFrames;
-            data.periodSizeInMillisecondsIn = pDescriptorPlayback->periodSizeInMilliseconds;
-            data.periodsIn                  = pDescriptorPlayback->periodCount;
-            data.registerStopEvent          = MA_TRUE;
-        }
-
-        result = ma_device_init_internal__coreaudio(pDevice->pContext, ma_device_type_playback, pDescriptorPlayback->pDeviceID, &data, (void*)pDevice);
-        if (result != MA_SUCCESS) {
-            if (pConfig->deviceType == ma_device_type_duplex) {
-                ((ma_AudioComponentInstanceDispose_proc)pDevice->pContext->coreaudio.AudioComponentInstanceDispose)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-                if (pDevice->coreaudio.pAudioBufferList) {
-                    ma_free(pDevice->coreaudio.pAudioBufferList, &pDevice->pContext->allocationCallbacks);
-                }
-            }
-            return result;
-        }
-
-        pDevice->coreaudio.isDefaultPlaybackDevice          = (pConfig->playback.pDeviceID == NULL);
-    #if defined(MA_APPLE_DESKTOP)
-        pDevice->coreaudio.deviceObjectIDPlayback           = (ma_uint32)data.deviceObjectID;
-    #endif
-        pDevice->coreaudio.audioUnitPlayback                = (ma_ptr)data.audioUnit;
-        pDevice->coreaudio.originalPeriodSizeInFrames       = pDescriptorPlayback->periodSizeInFrames;
-        pDevice->coreaudio.originalPeriodSizeInMilliseconds = pDescriptorPlayback->periodSizeInMilliseconds;
-        pDevice->coreaudio.originalPeriods                  = pDescriptorPlayback->periodCount;
-        pDevice->coreaudio.originalPerformanceProfile       = pConfig->performanceProfile;
-
-        pDescriptorPlayback->format                         = data.formatOut;
-        pDescriptorPlayback->channels                       = data.channelsOut;
-        pDescriptorPlayback->sampleRate                     = data.sampleRateOut;
-        MA_COPY_MEMORY(pDescriptorPlayback->channelMap, data.channelMapOut, sizeof(data.channelMapOut));
-        pDescriptorPlayback->periodSizeInFrames             = data.periodSizeInFramesOut;
-        pDescriptorPlayback->periodCount                    = data.periodsOut;
-
-    #if defined(MA_APPLE_DESKTOP)
-        ma_get_AudioObject_uid(pDevice->pContext, pDevice->coreaudio.deviceObjectIDPlayback, sizeof(pDevice->playback.id.coreaudio), pDevice->playback.id.coreaudio);
-
-        /*
-        If we are using the default device we'll need to listen for changes to the system's default device so we can seemlessly
-        switch the device in the background.
-        */
-        if (pDescriptorPlayback->pDeviceID == NULL && (pConfig->deviceType != ma_device_type_duplex || pDescriptorCapture->pDeviceID != NULL)) {
-            ma_device__track__coreaudio(pDevice);
-        }
-    #endif
-    }
-
-
-
-    /*
-    When stopping the device, a callback is called on another thread. We need to wait for this callback
-    before returning from ma_device_stop(). This event is used for this.
-    */
-    ma_event_init(&pDevice->coreaudio.stopEvent);
-
-    /*
-    We need to detect when a route has changed so we can update the data conversion pipeline accordingly. This is done
-    differently on non-Desktop Apple platforms.
-    */
-#if defined(MA_APPLE_MOBILE)
-    pDevice->coreaudio.pNotificationHandler = (MA_BRIDGE_RETAINED void*)[[ma_ios_notification_handler alloc] init:pDevice];
-#endif
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_device_start__coreaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        OSStatus status = ((ma_AudioOutputUnitStart_proc)pDevice->pContext->coreaudio.AudioOutputUnitStart)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-        if (status != noErr) {
-            return ma_result_from_OSStatus(status);
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        OSStatus status = ((ma_AudioOutputUnitStart_proc)pDevice->pContext->coreaudio.AudioOutputUnitStart)((AudioUnit)pDevice->coreaudio.audioUnitPlayback);
-        if (status != noErr) {
-            if (pDevice->type == ma_device_type_duplex) {
-                ((ma_AudioOutputUnitStop_proc)pDevice->pContext->coreaudio.AudioOutputUnitStop)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-            }
-            return ma_result_from_OSStatus(status);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__coreaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    /* It's not clear from the documentation whether or not AudioOutputUnitStop() actually drains the device or not. */
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        OSStatus status = ((ma_AudioOutputUnitStop_proc)pDevice->pContext->coreaudio.AudioOutputUnitStop)((AudioUnit)pDevice->coreaudio.audioUnitCapture);
-        if (status != noErr) {
-            return ma_result_from_OSStatus(status);
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        OSStatus status = ((ma_AudioOutputUnitStop_proc)pDevice->pContext->coreaudio.AudioOutputUnitStop)((AudioUnit)pDevice->coreaudio.audioUnitPlayback);
-        if (status != noErr) {
-            return ma_result_from_OSStatus(status);
-        }
-    }
-
-    /* We need to wait for the callback to finish before returning. */
-    ma_event_wait(&pDevice->coreaudio.stopEvent);
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_uninit__coreaudio(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_coreaudio);
-
-#if defined(MA_APPLE_MOBILE)
-    if (!pContext->coreaudio.noAudioSessionDeactivate) {
-        if (![[AVAudioSession sharedInstance] setActive:false error:nil]) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "Failed to deactivate audio session.");
-            return MA_FAILED_TO_INIT_BACKEND;
-        }
-    }
-#endif
-
-#if !defined(MA_NO_RUNTIME_LINKING) && !defined(MA_APPLE_MOBILE)
-    ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit);
-    ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio);
-    ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation);
-#endif
-
-#if !defined(MA_APPLE_MOBILE)
-    ma_context__uninit_device_tracking__coreaudio(pContext);
-#endif
-
-    (void)pContext;
-    return MA_SUCCESS;
-}
-
-#if defined(MA_APPLE_MOBILE) && defined(__IPHONE_12_0)
-static AVAudioSessionCategory ma_to_AVAudioSessionCategory(ma_ios_session_category category)
-{
-    /* The "default" and "none" categories are treated different and should not be used as an input into this function. */
-    MA_ASSERT(category != ma_ios_session_category_default);
-    MA_ASSERT(category != ma_ios_session_category_none);
-
-    switch (category) {
-        case ma_ios_session_category_ambient:         return AVAudioSessionCategoryAmbient;
-        case ma_ios_session_category_solo_ambient:    return AVAudioSessionCategorySoloAmbient;
-        case ma_ios_session_category_playback:        return AVAudioSessionCategoryPlayback;
-        case ma_ios_session_category_record:          return AVAudioSessionCategoryRecord;
-        case ma_ios_session_category_play_and_record: return AVAudioSessionCategoryPlayAndRecord;
-        case ma_ios_session_category_multi_route:     return AVAudioSessionCategoryMultiRoute;
-        case ma_ios_session_category_none:            return AVAudioSessionCategoryAmbient;
-        case ma_ios_session_category_default:         return AVAudioSessionCategoryAmbient;
-        default:                                      return AVAudioSessionCategoryAmbient;
-    }
-}
-#endif
-
-static ma_result ma_context_init__coreaudio(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-#if !defined(MA_APPLE_MOBILE)
-    ma_result result;
-#endif
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pContext != NULL);
-
-#if defined(MA_APPLE_MOBILE)
-    @autoreleasepool {
-        AVAudioSession* pAudioSession = [AVAudioSession sharedInstance];
-        AVAudioSessionCategoryOptions options = pConfig->coreaudio.sessionCategoryOptions;
-
-        MA_ASSERT(pAudioSession != NULL);
-
-        if (pConfig->coreaudio.sessionCategory == ma_ios_session_category_default) {
-            /*
-            I'm going to use trial and error to determine our default session category. First we'll try PlayAndRecord. If that fails
-            we'll try Playback and if that fails we'll try record. If all of these fail we'll just not set the category.
-            */
-        #if !defined(MA_APPLE_TV) && !defined(MA_APPLE_WATCH)
-            options |= AVAudioSessionCategoryOptionDefaultToSpeaker;
-        #endif
-
-            if ([pAudioSession setCategory: AVAudioSessionCategoryPlayAndRecord withOptions:options error:nil]) {
-                /* Using PlayAndRecord */
-            } else if ([pAudioSession setCategory: AVAudioSessionCategoryPlayback withOptions:options error:nil]) {
-                /* Using Playback */
-            } else if ([pAudioSession setCategory: AVAudioSessionCategoryRecord withOptions:options error:nil]) {
-                /* Using Record */
-            } else {
-                /* Leave as default? */
-            }
-        } else {
-            if (pConfig->coreaudio.sessionCategory != ma_ios_session_category_none) {
-            #if defined(__IPHONE_12_0)
-                if (![pAudioSession setCategory: ma_to_AVAudioSessionCategory(pConfig->coreaudio.sessionCategory) withOptions:options error:nil]) {
-                    return MA_INVALID_OPERATION;    /* Failed to set session category. */
-                }
-            #else
-                /* Ignore the session category on version 11 and older, but post a warning. */
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_WARNING, "Session category only supported in iOS 12 and newer.");
-            #endif
-            }
-        }
-
-        if (!pConfig->coreaudio.noAudioSessionActivate) {
-            if (![pAudioSession setActive:true error:nil]) {
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "Failed to activate audio session.");
-                return MA_FAILED_TO_INIT_BACKEND;
-            }
-        }
-    }
-#endif
-
-#if !defined(MA_NO_RUNTIME_LINKING) && !defined(MA_APPLE_MOBILE)
-    pContext->coreaudio.hCoreFoundation = ma_dlopen(ma_context_get_log(pContext), "/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation");
-    if (pContext->coreaudio.hCoreFoundation == NULL) {
-        return MA_API_NOT_FOUND;
-    }
-
-    pContext->coreaudio.CFStringGetCString = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation, "CFStringGetCString");
-    pContext->coreaudio.CFRelease          = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation, "CFRelease");
-
-
-    pContext->coreaudio.hCoreAudio = ma_dlopen(ma_context_get_log(pContext), "/System/Library/Frameworks/CoreAudio.framework/CoreAudio");
-    if (pContext->coreaudio.hCoreAudio == NULL) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation);
-        return MA_API_NOT_FOUND;
-    }
-
-    pContext->coreaudio.AudioObjectGetPropertyData        = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio, "AudioObjectGetPropertyData");
-    pContext->coreaudio.AudioObjectGetPropertyDataSize    = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio, "AudioObjectGetPropertyDataSize");
-    pContext->coreaudio.AudioObjectSetPropertyData        = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio, "AudioObjectSetPropertyData");
-    pContext->coreaudio.AudioObjectAddPropertyListener    = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio, "AudioObjectAddPropertyListener");
-    pContext->coreaudio.AudioObjectRemovePropertyListener = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio, "AudioObjectRemovePropertyListener");
-
-    /*
-    It looks like Apple has moved some APIs from AudioUnit into AudioToolbox on more recent versions of macOS. They are still
-    defined in AudioUnit, but just in case they decide to remove them from there entirely I'm going to implement a fallback.
-    The way it'll work is that it'll first try AudioUnit, and if the required symbols are not present there we'll fall back to
-    AudioToolbox.
-    */
-    pContext->coreaudio.hAudioUnit = ma_dlopen(ma_context_get_log(pContext), "/System/Library/Frameworks/AudioUnit.framework/AudioUnit");
-    if (pContext->coreaudio.hAudioUnit == NULL) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio);
-        ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation);
-        return MA_API_NOT_FOUND;
-    }
-
-    if (ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioComponentFindNext") == NULL) {
-        /* Couldn't find the required symbols in AudioUnit, so fall back to AudioToolbox. */
-        ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit);
-        pContext->coreaudio.hAudioUnit = ma_dlopen(ma_context_get_log(pContext), "/System/Library/Frameworks/AudioToolbox.framework/AudioToolbox");
-        if (pContext->coreaudio.hAudioUnit == NULL) {
-            ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio);
-            ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation);
-            return MA_API_NOT_FOUND;
-        }
-    }
-
-    pContext->coreaudio.AudioComponentFindNext            = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioComponentFindNext");
-    pContext->coreaudio.AudioComponentInstanceDispose     = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioComponentInstanceDispose");
-    pContext->coreaudio.AudioComponentInstanceNew         = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioComponentInstanceNew");
-    pContext->coreaudio.AudioOutputUnitStart              = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioOutputUnitStart");
-    pContext->coreaudio.AudioOutputUnitStop               = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioOutputUnitStop");
-    pContext->coreaudio.AudioUnitAddPropertyListener      = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioUnitAddPropertyListener");
-    pContext->coreaudio.AudioUnitGetPropertyInfo          = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioUnitGetPropertyInfo");
-    pContext->coreaudio.AudioUnitGetProperty              = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioUnitGetProperty");
-    pContext->coreaudio.AudioUnitSetProperty              = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioUnitSetProperty");
-    pContext->coreaudio.AudioUnitInitialize               = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioUnitInitialize");
-    pContext->coreaudio.AudioUnitRender                   = ma_dlsym(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit, "AudioUnitRender");
-#else
-    pContext->coreaudio.CFStringGetCString                = (ma_proc)CFStringGetCString;
-    pContext->coreaudio.CFRelease                         = (ma_proc)CFRelease;
-
-    #if defined(MA_APPLE_DESKTOP)
-    pContext->coreaudio.AudioObjectGetPropertyData        = (ma_proc)AudioObjectGetPropertyData;
-    pContext->coreaudio.AudioObjectGetPropertyDataSize    = (ma_proc)AudioObjectGetPropertyDataSize;
-    pContext->coreaudio.AudioObjectSetPropertyData        = (ma_proc)AudioObjectSetPropertyData;
-    pContext->coreaudio.AudioObjectAddPropertyListener    = (ma_proc)AudioObjectAddPropertyListener;
-    pContext->coreaudio.AudioObjectRemovePropertyListener = (ma_proc)AudioObjectRemovePropertyListener;
-    #endif
-
-    pContext->coreaudio.AudioComponentFindNext            = (ma_proc)AudioComponentFindNext;
-    pContext->coreaudio.AudioComponentInstanceDispose     = (ma_proc)AudioComponentInstanceDispose;
-    pContext->coreaudio.AudioComponentInstanceNew         = (ma_proc)AudioComponentInstanceNew;
-    pContext->coreaudio.AudioOutputUnitStart              = (ma_proc)AudioOutputUnitStart;
-    pContext->coreaudio.AudioOutputUnitStop               = (ma_proc)AudioOutputUnitStop;
-    pContext->coreaudio.AudioUnitAddPropertyListener      = (ma_proc)AudioUnitAddPropertyListener;
-    pContext->coreaudio.AudioUnitGetPropertyInfo          = (ma_proc)AudioUnitGetPropertyInfo;
-    pContext->coreaudio.AudioUnitGetProperty              = (ma_proc)AudioUnitGetProperty;
-    pContext->coreaudio.AudioUnitSetProperty              = (ma_proc)AudioUnitSetProperty;
-    pContext->coreaudio.AudioUnitInitialize               = (ma_proc)AudioUnitInitialize;
-    pContext->coreaudio.AudioUnitRender                   = (ma_proc)AudioUnitRender;
-#endif
-
-    /* Audio component. */
-    {
-        AudioComponentDescription desc;
-        desc.componentType         = kAudioUnitType_Output;
-    #if defined(MA_APPLE_DESKTOP)
-        desc.componentSubType      = kAudioUnitSubType_HALOutput;
-    #else
-        desc.componentSubType      = kAudioUnitSubType_RemoteIO;
-    #endif
-        desc.componentManufacturer = kAudioUnitManufacturer_Apple;
-        desc.componentFlags        = 0;
-        desc.componentFlagsMask    = 0;
-
-        pContext->coreaudio.component = ((ma_AudioComponentFindNext_proc)pContext->coreaudio.AudioComponentFindNext)(NULL, &desc);
-        if (pContext->coreaudio.component == NULL) {
-        #if !defined(MA_NO_RUNTIME_LINKING) && !defined(MA_APPLE_MOBILE)
-            ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit);
-            ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio);
-            ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation);
-        #endif
-            return MA_FAILED_TO_INIT_BACKEND;
-        }
-    }
-
-#if !defined(MA_APPLE_MOBILE)
-    result = ma_context__init_device_tracking__coreaudio(pContext);
-    if (result != MA_SUCCESS) {
-    #if !defined(MA_NO_RUNTIME_LINKING) && !defined(MA_APPLE_MOBILE)
-        ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hAudioUnit);
-        ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreAudio);
-        ma_dlclose(ma_context_get_log(pContext), pContext->coreaudio.hCoreFoundation);
-    #endif
-        return result;
-    }
-#endif
-
-    pContext->coreaudio.noAudioSessionDeactivate = pConfig->coreaudio.noAudioSessionDeactivate;
-
-    pCallbacks->onContextInit             = ma_context_init__coreaudio;
-    pCallbacks->onContextUninit           = ma_context_uninit__coreaudio;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__coreaudio;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__coreaudio;
-    pCallbacks->onDeviceInit              = ma_device_init__coreaudio;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__coreaudio;
-    pCallbacks->onDeviceStart             = ma_device_start__coreaudio;
-    pCallbacks->onDeviceStop              = ma_device_stop__coreaudio;
-    pCallbacks->onDeviceRead              = NULL;
-    pCallbacks->onDeviceWrite             = NULL;
-    pCallbacks->onDeviceDataLoop          = NULL;
-
-    return MA_SUCCESS;
-}
-#endif  /* Core Audio */
-
-
-
-/******************************************************************************
-
-sndio Backend
-
-******************************************************************************/
-#ifdef MA_HAS_SNDIO
-#include <fcntl.h>
-
-/*
-Only supporting OpenBSD. This did not work very well at all on FreeBSD when I tried it. Not sure if this is due
-to miniaudio's implementation or if it's some kind of system configuration issue, but basically the default device
-just doesn't emit any sound, or at times you'll hear tiny pieces. I will consider enabling this when there's
-demand for it or if I can get it tested and debugged more thoroughly.
-*/
-#if 0
-#if defined(__NetBSD__) || defined(__OpenBSD__)
-#include <sys/audioio.h>
-#endif
-#if defined(__FreeBSD__) || defined(__DragonFly__)
-#include <sys/soundcard.h>
-#endif
-#endif
-
-#define MA_SIO_DEVANY   "default"
-#define MA_SIO_PLAY     1
-#define MA_SIO_REC      2
-#define MA_SIO_NENC     8
-#define MA_SIO_NCHAN    8
-#define MA_SIO_NRATE    16
-#define MA_SIO_NCONF    4
-
-struct ma_sio_hdl; /* <-- Opaque */
-
-struct ma_sio_par
-{
-    unsigned int bits;
-    unsigned int bps;
-    unsigned int sig;
-    unsigned int le;
-    unsigned int msb;
-    unsigned int rchan;
-    unsigned int pchan;
-    unsigned int rate;
-    unsigned int bufsz;
-    unsigned int xrun;
-    unsigned int round;
-    unsigned int appbufsz;
-    int __pad[3];
-    unsigned int __magic;
-};
-
-struct ma_sio_enc
-{
-    unsigned int bits;
-    unsigned int bps;
-    unsigned int sig;
-    unsigned int le;
-    unsigned int msb;
-};
-
-struct ma_sio_conf
-{
-    unsigned int enc;
-    unsigned int rchan;
-    unsigned int pchan;
-    unsigned int rate;
-};
-
-struct ma_sio_cap
-{
-    struct ma_sio_enc enc[MA_SIO_NENC];
-    unsigned int rchan[MA_SIO_NCHAN];
-    unsigned int pchan[MA_SIO_NCHAN];
-    unsigned int rate[MA_SIO_NRATE];
-    int __pad[7];
-    unsigned int nconf;
-    struct ma_sio_conf confs[MA_SIO_NCONF];
-};
-
-typedef struct ma_sio_hdl* (* ma_sio_open_proc)   (const char*, unsigned int, int);
-typedef void               (* ma_sio_close_proc)  (struct ma_sio_hdl*);
-typedef int                (* ma_sio_setpar_proc) (struct ma_sio_hdl*, struct ma_sio_par*);
-typedef int                (* ma_sio_getpar_proc) (struct ma_sio_hdl*, struct ma_sio_par*);
-typedef int                (* ma_sio_getcap_proc) (struct ma_sio_hdl*, struct ma_sio_cap*);
-typedef size_t             (* ma_sio_write_proc)  (struct ma_sio_hdl*, const void*, size_t);
-typedef size_t             (* ma_sio_read_proc)   (struct ma_sio_hdl*, void*, size_t);
-typedef int                (* ma_sio_start_proc)  (struct ma_sio_hdl*);
-typedef int                (* ma_sio_stop_proc)   (struct ma_sio_hdl*);
-typedef int                (* ma_sio_initpar_proc)(struct ma_sio_par*);
-
-static ma_uint32 ma_get_standard_sample_rate_priority_index__sndio(ma_uint32 sampleRate)   /* Lower = higher priority */
-{
-    ma_uint32 i;
-    for (i = 0; i < ma_countof(g_maStandardSampleRatePriorities); ++i) {
-        if (g_maStandardSampleRatePriorities[i] == sampleRate) {
-            return i;
-        }
-    }
-
-    return (ma_uint32)-1;
-}
-
-static ma_format ma_format_from_sio_enc__sndio(unsigned int bits, unsigned int bps, unsigned int sig, unsigned int le, unsigned int msb)
-{
-    /* We only support native-endian right now. */
-    if ((ma_is_little_endian() && le == 0) || (ma_is_big_endian() && le == 1)) {
-        return ma_format_unknown;
-    }
-
-    if (bits ==  8 && bps == 1 && sig == 0) {
-        return ma_format_u8;
-    }
-    if (bits == 16 && bps == 2 && sig == 1) {
-        return ma_format_s16;
-    }
-    if (bits == 24 && bps == 3 && sig == 1) {
-        return ma_format_s24;
-    }
-    if (bits == 24 && bps == 4 && sig == 1 && msb == 0) {
-        /*return ma_format_s24_32;*/
-    }
-    if (bits == 32 && bps == 4 && sig == 1) {
-        return ma_format_s32;
-    }
-
-    return ma_format_unknown;
-}
-
-static ma_format ma_find_best_format_from_sio_cap__sndio(struct ma_sio_cap* caps)
-{
-    ma_format bestFormat;
-    unsigned int iConfig;
-
-    MA_ASSERT(caps != NULL);
-
-    bestFormat = ma_format_unknown;
-    for (iConfig = 0; iConfig < caps->nconf; iConfig += 1) {
-        unsigned int iEncoding;
-        for (iEncoding = 0; iEncoding < MA_SIO_NENC; iEncoding += 1) {
-            unsigned int bits;
-            unsigned int bps;
-            unsigned int sig;
-            unsigned int le;
-            unsigned int msb;
-            ma_format format;
-
-            if ((caps->confs[iConfig].enc & (1UL << iEncoding)) == 0) {
-                continue;
-            }
-
-            bits = caps->enc[iEncoding].bits;
-            bps  = caps->enc[iEncoding].bps;
-            sig  = caps->enc[iEncoding].sig;
-            le   = caps->enc[iEncoding].le;
-            msb  = caps->enc[iEncoding].msb;
-            format = ma_format_from_sio_enc__sndio(bits, bps, sig, le, msb);
-            if (format == ma_format_unknown) {
-                continue;   /* Format not supported. */
-            }
-
-            if (bestFormat == ma_format_unknown) {
-                bestFormat = format;
-            } else {
-                if (ma_get_format_priority_index(bestFormat) > ma_get_format_priority_index(format)) {    /* <-- Lower = better. */
-                    bestFormat = format;
-                }
-            }
-        }
-    }
-
-    return bestFormat;
-}
-
-static ma_uint32 ma_find_best_channels_from_sio_cap__sndio(struct ma_sio_cap* caps, ma_device_type deviceType, ma_format requiredFormat)
-{
-    ma_uint32 maxChannels;
-    unsigned int iConfig;
-
-    MA_ASSERT(caps != NULL);
-    MA_ASSERT(requiredFormat != ma_format_unknown);
-
-    /* Just pick whatever configuration has the most channels. */
-    maxChannels = 0;
-    for (iConfig = 0; iConfig < caps->nconf; iConfig += 1) {
-        /* The encoding should be of requiredFormat. */
-        unsigned int iEncoding;
-        for (iEncoding = 0; iEncoding < MA_SIO_NENC; iEncoding += 1) {
-            unsigned int iChannel;
-            unsigned int bits;
-            unsigned int bps;
-            unsigned int sig;
-            unsigned int le;
-            unsigned int msb;
-            ma_format format;
-
-            if ((caps->confs[iConfig].enc & (1UL << iEncoding)) == 0) {
-                continue;
-            }
-
-            bits = caps->enc[iEncoding].bits;
-            bps  = caps->enc[iEncoding].bps;
-            sig  = caps->enc[iEncoding].sig;
-            le   = caps->enc[iEncoding].le;
-            msb  = caps->enc[iEncoding].msb;
-            format = ma_format_from_sio_enc__sndio(bits, bps, sig, le, msb);
-            if (format != requiredFormat) {
-                continue;
-            }
-
-            /* Getting here means the format is supported. Iterate over each channel count and grab the biggest one. */
-            for (iChannel = 0; iChannel < MA_SIO_NCHAN; iChannel += 1) {
-                unsigned int chan = 0;
-                unsigned int channels;
-
-                if (deviceType == ma_device_type_playback) {
-                    chan = caps->confs[iConfig].pchan;
-                } else {
-                    chan = caps->confs[iConfig].rchan;
-                }
-
-                if ((chan & (1UL << iChannel)) == 0) {
-                    continue;
-                }
-
-                if (deviceType == ma_device_type_playback) {
-                    channels = caps->pchan[iChannel];
-                } else {
-                    channels = caps->rchan[iChannel];
-                }
-
-                if (maxChannels < channels) {
-                    maxChannels = channels;
-                }
-            }
-        }
-    }
-
-    return maxChannels;
-}
-
-static ma_uint32 ma_find_best_sample_rate_from_sio_cap__sndio(struct ma_sio_cap* caps, ma_device_type deviceType, ma_format requiredFormat, ma_uint32 requiredChannels)
-{
-    ma_uint32 firstSampleRate;
-    ma_uint32 bestSampleRate;
-    unsigned int iConfig;
-
-    MA_ASSERT(caps != NULL);
-    MA_ASSERT(requiredFormat != ma_format_unknown);
-    MA_ASSERT(requiredChannels > 0);
-    MA_ASSERT(requiredChannels <= MA_MAX_CHANNELS);
-
-    firstSampleRate = 0; /* <-- If the device does not support a standard rate we'll fall back to the first one that's found. */
-    bestSampleRate  = 0;
-
-    for (iConfig = 0; iConfig < caps->nconf; iConfig += 1) {
-        /* The encoding should be of requiredFormat. */
-        unsigned int iEncoding;
-        for (iEncoding = 0; iEncoding < MA_SIO_NENC; iEncoding += 1) {
-            unsigned int iChannel;
-            unsigned int bits;
-            unsigned int bps;
-            unsigned int sig;
-            unsigned int le;
-            unsigned int msb;
-            ma_format format;
-
-            if ((caps->confs[iConfig].enc & (1UL << iEncoding)) == 0) {
-                continue;
-            }
-
-            bits = caps->enc[iEncoding].bits;
-            bps  = caps->enc[iEncoding].bps;
-            sig  = caps->enc[iEncoding].sig;
-            le   = caps->enc[iEncoding].le;
-            msb  = caps->enc[iEncoding].msb;
-            format = ma_format_from_sio_enc__sndio(bits, bps, sig, le, msb);
-            if (format != requiredFormat) {
-                continue;
-            }
-
-            /* Getting here means the format is supported. Iterate over each channel count and grab the biggest one. */
-            for (iChannel = 0; iChannel < MA_SIO_NCHAN; iChannel += 1) {
-                unsigned int chan = 0;
-                unsigned int channels;
-                unsigned int iRate;
-
-                if (deviceType == ma_device_type_playback) {
-                    chan = caps->confs[iConfig].pchan;
-                } else {
-                    chan = caps->confs[iConfig].rchan;
-                }
-
-                if ((chan & (1UL << iChannel)) == 0) {
-                    continue;
-                }
-
-                if (deviceType == ma_device_type_playback) {
-                    channels = caps->pchan[iChannel];
-                } else {
-                    channels = caps->rchan[iChannel];
-                }
-
-                if (channels != requiredChannels) {
-                    continue;
-                }
-
-                /* Getting here means we have found a compatible encoding/channel pair. */
-                for (iRate = 0; iRate < MA_SIO_NRATE; iRate += 1) {
-                    ma_uint32 rate = (ma_uint32)caps->rate[iRate];
-                    ma_uint32 ratePriority;
-
-                    if (firstSampleRate == 0) {
-                        firstSampleRate = rate;
-                    }
-
-                    /* Disregard this rate if it's not a standard one. */
-                    ratePriority = ma_get_standard_sample_rate_priority_index__sndio(rate);
-                    if (ratePriority == (ma_uint32)-1) {
-                        continue;
-                    }
-
-                    if (ma_get_standard_sample_rate_priority_index__sndio(bestSampleRate) > ratePriority) {   /* Lower = better. */
-                        bestSampleRate = rate;
-                    }
-                }
-            }
-        }
-    }
-
-    /* If a standard sample rate was not found just fall back to the first one that was iterated. */
-    if (bestSampleRate == 0) {
-        bestSampleRate = firstSampleRate;
-    }
-
-    return bestSampleRate;
-}
-
-
-static ma_result ma_context_enumerate_devices__sndio(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_bool32 isTerminating = MA_FALSE;
-    struct ma_sio_hdl* handle;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /* sndio doesn't seem to have a good device enumeration API, so I'm therefore only enumerating over default devices for now. */
-
-    /* Playback. */
-    if (!isTerminating) {
-        handle = ((ma_sio_open_proc)pContext->sndio.sio_open)(MA_SIO_DEVANY, MA_SIO_PLAY, 0);
-        if (handle != NULL) {
-            /* Supports playback. */
-            ma_device_info deviceInfo;
-            MA_ZERO_OBJECT(&deviceInfo);
-            ma_strcpy_s(deviceInfo.id.sndio, sizeof(deviceInfo.id.sndio), MA_SIO_DEVANY);
-            ma_strcpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME);
-
-            isTerminating = !callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-
-            ((ma_sio_close_proc)pContext->sndio.sio_close)(handle);
-        }
-    }
-
-    /* Capture. */
-    if (!isTerminating) {
-        handle = ((ma_sio_open_proc)pContext->sndio.sio_open)(MA_SIO_DEVANY, MA_SIO_REC, 0);
-        if (handle != NULL) {
-            /* Supports capture. */
-            ma_device_info deviceInfo;
-            MA_ZERO_OBJECT(&deviceInfo);
-            ma_strcpy_s(deviceInfo.id.sndio, sizeof(deviceInfo.id.sndio), "default");
-            ma_strcpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_CAPTURE_DEVICE_NAME);
-
-            isTerminating = !callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-
-            ((ma_sio_close_proc)pContext->sndio.sio_close)(handle);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__sndio(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    char devid[256];
-    struct ma_sio_hdl* handle;
-    struct ma_sio_cap caps;
-    unsigned int iConfig;
-
-    MA_ASSERT(pContext != NULL);
-
-    /* We need to open the device before we can get information about it. */
-    if (pDeviceID == NULL) {
-        ma_strcpy_s(devid, sizeof(devid), MA_SIO_DEVANY);
-        ma_strcpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), (deviceType == ma_device_type_playback) ? MA_DEFAULT_PLAYBACK_DEVICE_NAME : MA_DEFAULT_CAPTURE_DEVICE_NAME);
-    } else {
-        ma_strcpy_s(devid, sizeof(devid), pDeviceID->sndio);
-        ma_strcpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), devid);
-    }
-
-    handle = ((ma_sio_open_proc)pContext->sndio.sio_open)(devid, (deviceType == ma_device_type_playback) ? MA_SIO_PLAY : MA_SIO_REC, 0);
-    if (handle == NULL) {
-        return MA_NO_DEVICE;
-    }
-
-    if (((ma_sio_getcap_proc)pContext->sndio.sio_getcap)(handle, &caps) == 0) {
-        return MA_ERROR;
-    }
-
-    pDeviceInfo->nativeDataFormatCount = 0;
-
-    for (iConfig = 0; iConfig < caps.nconf; iConfig += 1) {
-        /*
-        The main thing we care about is that the encoding is supported by miniaudio. If it is, we want to give
-        preference to some formats over others.
-        */
-        unsigned int iEncoding;
-        unsigned int iChannel;
-        unsigned int iRate;
-
-        for (iEncoding = 0; iEncoding < MA_SIO_NENC; iEncoding += 1) {
-            unsigned int bits;
-            unsigned int bps;
-            unsigned int sig;
-            unsigned int le;
-            unsigned int msb;
-            ma_format format;
-
-            if ((caps.confs[iConfig].enc & (1UL << iEncoding)) == 0) {
-                continue;
-            }
-
-            bits = caps.enc[iEncoding].bits;
-            bps  = caps.enc[iEncoding].bps;
-            sig  = caps.enc[iEncoding].sig;
-            le   = caps.enc[iEncoding].le;
-            msb  = caps.enc[iEncoding].msb;
-            format = ma_format_from_sio_enc__sndio(bits, bps, sig, le, msb);
-            if (format == ma_format_unknown) {
-                continue;   /* Format not supported. */
-            }
-
-
-            /* Channels. */
-            for (iChannel = 0; iChannel < MA_SIO_NCHAN; iChannel += 1) {
-                unsigned int chan = 0;
-                unsigned int channels;
-
-                if (deviceType == ma_device_type_playback) {
-                    chan = caps.confs[iConfig].pchan;
-                } else {
-                    chan = caps.confs[iConfig].rchan;
-                }
-
-                if ((chan & (1UL << iChannel)) == 0) {
-                    continue;
-                }
-
-                if (deviceType == ma_device_type_playback) {
-                    channels = caps.pchan[iChannel];
-                } else {
-                    channels = caps.rchan[iChannel];
-                }
-
-
-                /* Sample Rates. */
-                for (iRate = 0; iRate < MA_SIO_NRATE; iRate += 1) {
-                    if ((caps.confs[iConfig].rate & (1UL << iRate)) != 0) {
-                        ma_device_info_add_native_data_format(pDeviceInfo, format, channels, caps.rate[iRate], 0);
-                    }
-                }
-            }
-        }
-    }
-
-    ((ma_sio_close_proc)pContext->sndio.sio_close)(handle);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_uninit__sndio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ((ma_sio_close_proc)pDevice->pContext->sndio.sio_close)((struct ma_sio_hdl*)pDevice->sndio.handleCapture);
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ((ma_sio_close_proc)pDevice->pContext->sndio.sio_close)((struct ma_sio_hdl*)pDevice->sndio.handlePlayback);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init_handle__sndio(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptor, ma_device_type deviceType)
-{
-    const char* pDeviceName;
-    ma_ptr handle;
-    int openFlags = 0;
-    struct ma_sio_cap caps;
-    struct ma_sio_par par;
-    const ma_device_id* pDeviceID;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_format internalFormat;
-    ma_uint32 internalChannels;
-    ma_uint32 internalSampleRate;
-    ma_uint32 internalPeriodSizeInFrames;
-    ma_uint32 internalPeriods;
-
-    MA_ASSERT(pConfig    != NULL);
-    MA_ASSERT(deviceType != ma_device_type_duplex);
-    MA_ASSERT(pDevice    != NULL);
-
-    if (deviceType == ma_device_type_capture) {
-        openFlags = MA_SIO_REC;
-    } else {
-        openFlags = MA_SIO_PLAY;
-    }
-
-    pDeviceID  = pDescriptor->pDeviceID;
-    format     = pDescriptor->format;
-    channels   = pDescriptor->channels;
-    sampleRate = pDescriptor->sampleRate;
-
-    pDeviceName = MA_SIO_DEVANY;
-    if (pDeviceID != NULL) {
-        pDeviceName = pDeviceID->sndio;
-    }
-
-    handle = (ma_ptr)((ma_sio_open_proc)pDevice->pContext->sndio.sio_open)(pDeviceName, openFlags, 0);
-    if (handle == NULL) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[sndio] Failed to open device.");
-        return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-    }
-
-    /* We need to retrieve the device caps to determine the most appropriate format to use. */
-    if (((ma_sio_getcap_proc)pDevice->pContext->sndio.sio_getcap)((struct ma_sio_hdl*)handle, &caps) == 0) {
-        ((ma_sio_close_proc)pDevice->pContext->sndio.sio_close)((struct ma_sio_hdl*)handle);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[sndio] Failed to retrieve device caps.");
-        return MA_ERROR;
-    }
-
-    /*
-    Note: sndio reports a huge range of available channels. This is inconvenient for us because there's no real
-    way, as far as I can tell, to get the _actual_ channel count of the device. I'm therefore restricting this
-    to the requested channels, regardless of whether or not the default channel count is requested.
-
-    For hardware devices, I'm suspecting only a single channel count will be reported and we can safely use the
-    value returned by ma_find_best_channels_from_sio_cap__sndio().
-    */
-    if (deviceType == ma_device_type_capture) {
-        if (format == ma_format_unknown) {
-            format = ma_find_best_format_from_sio_cap__sndio(&caps);
-        }
-
-        if (channels == 0) {
-            if (strlen(pDeviceName) > strlen("rsnd/") && strncmp(pDeviceName, "rsnd/", strlen("rsnd/")) == 0) {
-                channels = ma_find_best_channels_from_sio_cap__sndio(&caps, deviceType, format);
-            } else {
-                channels = MA_DEFAULT_CHANNELS;
-            }
-        }
-    } else {
-        if (format == ma_format_unknown) {
-            format = ma_find_best_format_from_sio_cap__sndio(&caps);
-        }
-
-        if (channels == 0) {
-            if (strlen(pDeviceName) > strlen("rsnd/") && strncmp(pDeviceName, "rsnd/", strlen("rsnd/")) == 0) {
-                channels = ma_find_best_channels_from_sio_cap__sndio(&caps, deviceType, format);
-            } else {
-                channels = MA_DEFAULT_CHANNELS;
-            }
-        }
-    }
-
-    if (sampleRate == 0) {
-        sampleRate = ma_find_best_sample_rate_from_sio_cap__sndio(&caps, pConfig->deviceType, format, channels);
-    }
-
-
-    ((ma_sio_initpar_proc)pDevice->pContext->sndio.sio_initpar)(&par);
-    par.msb = 0;
-    par.le  = ma_is_little_endian();
-
-    switch (format) {
-        case ma_format_u8:
-        {
-            par.bits = 8;
-            par.bps  = 1;
-            par.sig  = 0;
-        } break;
-
-        case ma_format_s24:
-        {
-            par.bits = 24;
-            par.bps  = 3;
-            par.sig  = 1;
-        } break;
-
-        case ma_format_s32:
-        {
-            par.bits = 32;
-            par.bps  = 4;
-            par.sig  = 1;
-        } break;
-
-        case ma_format_s16:
-        case ma_format_f32:
-        case ma_format_unknown:
-        default:
-        {
-            par.bits = 16;
-            par.bps  = 2;
-            par.sig  = 1;
-        } break;
-    }
-
-    if (deviceType == ma_device_type_capture) {
-        par.rchan = channels;
-    } else {
-        par.pchan = channels;
-    }
-
-    par.rate = sampleRate;
-
-    internalPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, par.rate, pConfig->performanceProfile);
-
-    par.round    = internalPeriodSizeInFrames;
-    par.appbufsz = par.round * pDescriptor->periodCount;
-
-    if (((ma_sio_setpar_proc)pDevice->pContext->sndio.sio_setpar)((struct ma_sio_hdl*)handle, &par) == 0) {
-        ((ma_sio_close_proc)pDevice->pContext->sndio.sio_close)((struct ma_sio_hdl*)handle);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[sndio] Failed to set buffer size.");
-        return MA_ERROR;
-    }
-
-    if (((ma_sio_getpar_proc)pDevice->pContext->sndio.sio_getpar)((struct ma_sio_hdl*)handle, &par) == 0) {
-        ((ma_sio_close_proc)pDevice->pContext->sndio.sio_close)((struct ma_sio_hdl*)handle);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[sndio] Failed to retrieve buffer size.");
-        return MA_ERROR;
-    }
-
-    internalFormat             = ma_format_from_sio_enc__sndio(par.bits, par.bps, par.sig, par.le, par.msb);
-    internalChannels           = (deviceType == ma_device_type_capture) ? par.rchan : par.pchan;
-    internalSampleRate         = par.rate;
-    internalPeriods            = par.appbufsz / par.round;
-    internalPeriodSizeInFrames = par.round;
-
-    if (deviceType == ma_device_type_capture) {
-        pDevice->sndio.handleCapture  = handle;
-    } else {
-        pDevice->sndio.handlePlayback = handle;
-    }
-
-    pDescriptor->format             = internalFormat;
-    pDescriptor->channels           = internalChannels;
-    pDescriptor->sampleRate         = internalSampleRate;
-    ma_channel_map_init_standard(ma_standard_channel_map_sndio, pDescriptor->channelMap, ma_countof(pDescriptor->channelMap), internalChannels);
-    pDescriptor->periodSizeInFrames = internalPeriodSizeInFrames;
-    pDescriptor->periodCount        = internalPeriods;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__sndio(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->sndio);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_handle__sndio(pDevice, pConfig, pDescriptorCapture, ma_device_type_capture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_handle__sndio(pDevice, pConfig, pDescriptorPlayback, ma_device_type_playback);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__sndio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ((ma_sio_start_proc)pDevice->pContext->sndio.sio_start)((struct ma_sio_hdl*)pDevice->sndio.handleCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ((ma_sio_start_proc)pDevice->pContext->sndio.sio_start)((struct ma_sio_hdl*)pDevice->sndio.handlePlayback);   /* <-- Doesn't actually playback until data is written. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__sndio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    /*
-    From the documentation:
-
-        The sio_stop() function puts the audio subsystem in the same state as before sio_start() is called. It stops recording, drains the play buffer and then
-        stops playback. If samples to play are queued but playback hasn't started yet then playback is forced immediately; playback will actually stop once the
-        buffer is drained. In no case are samples in the play buffer discarded.
-
-    Therefore, sio_stop() performs all of the necessary draining for us.
-    */
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ((ma_sio_stop_proc)pDevice->pContext->sndio.sio_stop)((struct ma_sio_hdl*)pDevice->sndio.handleCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ((ma_sio_stop_proc)pDevice->pContext->sndio.sio_stop)((struct ma_sio_hdl*)pDevice->sndio.handlePlayback);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_write__sndio(ma_device* pDevice, const void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten)
-{
-    int result;
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = 0;
-    }
-
-    result = ((ma_sio_write_proc)pDevice->pContext->sndio.sio_write)((struct ma_sio_hdl*)pDevice->sndio.handlePlayback, pPCMFrames, frameCount * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels));
-    if (result == 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[sndio] Failed to send data from the client to the device.");
-        return MA_IO_ERROR;
-    }
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = frameCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_read__sndio(ma_device* pDevice, void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesRead)
-{
-    int result;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    result = ((ma_sio_read_proc)pDevice->pContext->sndio.sio_read)((struct ma_sio_hdl*)pDevice->sndio.handleCapture, pPCMFrames, frameCount * ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels));
-    if (result == 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[sndio] Failed to read data from the device to be sent to the device.");
-        return MA_IO_ERROR;
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = frameCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_uninit__sndio(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_sndio);
-
-    (void)pContext;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__sndio(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-#ifndef MA_NO_RUNTIME_LINKING
-    const char* libsndioNames[] = {
-        "libsndio.so"
-    };
-    size_t i;
-
-    for (i = 0; i < ma_countof(libsndioNames); ++i) {
-        pContext->sndio.sndioSO = ma_dlopen(ma_context_get_log(pContext), libsndioNames[i]);
-        if (pContext->sndio.sndioSO != NULL) {
-            break;
-        }
-    }
-
-    if (pContext->sndio.sndioSO == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    pContext->sndio.sio_open    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_open");
-    pContext->sndio.sio_close   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_close");
-    pContext->sndio.sio_setpar  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_setpar");
-    pContext->sndio.sio_getpar  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_getpar");
-    pContext->sndio.sio_getcap  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_getcap");
-    pContext->sndio.sio_write   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_write");
-    pContext->sndio.sio_read    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_read");
-    pContext->sndio.sio_start   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_start");
-    pContext->sndio.sio_stop    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_stop");
-    pContext->sndio.sio_initpar = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->sndio.sndioSO, "sio_initpar");
-#else
-    pContext->sndio.sio_open    = sio_open;
-    pContext->sndio.sio_close   = sio_close;
-    pContext->sndio.sio_setpar  = sio_setpar;
-    pContext->sndio.sio_getpar  = sio_getpar;
-    pContext->sndio.sio_getcap  = sio_getcap;
-    pContext->sndio.sio_write   = sio_write;
-    pContext->sndio.sio_read    = sio_read;
-    pContext->sndio.sio_start   = sio_start;
-    pContext->sndio.sio_stop    = sio_stop;
-    pContext->sndio.sio_initpar = sio_initpar;
-#endif
-
-    pCallbacks->onContextInit             = ma_context_init__sndio;
-    pCallbacks->onContextUninit           = ma_context_uninit__sndio;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__sndio;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__sndio;
-    pCallbacks->onDeviceInit              = ma_device_init__sndio;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__sndio;
-    pCallbacks->onDeviceStart             = ma_device_start__sndio;
-    pCallbacks->onDeviceStop              = ma_device_stop__sndio;
-    pCallbacks->onDeviceRead              = ma_device_read__sndio;
-    pCallbacks->onDeviceWrite             = ma_device_write__sndio;
-    pCallbacks->onDeviceDataLoop          = NULL;
-
-    (void)pConfig;
-    return MA_SUCCESS;
-}
-#endif  /* sndio */
-
-
-
-/******************************************************************************
-
-audio(4) Backend
-
-******************************************************************************/
-#ifdef MA_HAS_AUDIO4
-#include <fcntl.h>
-#include <poll.h>
-#include <errno.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/ioctl.h>
-#include <sys/audioio.h>
-
-#if defined(__OpenBSD__)
-    #include <sys/param.h>
-    #if defined(OpenBSD) && OpenBSD >= 201709
-        #define MA_AUDIO4_USE_NEW_API
-    #endif
-#endif
-
-static void ma_construct_device_id__audio4(char* id, size_t idSize, const char* base, int deviceIndex)
-{
-    size_t baseLen;
-
-    MA_ASSERT(id != NULL);
-    MA_ASSERT(idSize > 0);
-    MA_ASSERT(deviceIndex >= 0);
-
-    baseLen = strlen(base);
-    MA_ASSERT(idSize > baseLen);
-
-    ma_strcpy_s(id, idSize, base);
-    ma_itoa_s(deviceIndex, id+baseLen, idSize-baseLen, 10);
-}
-
-static ma_result ma_extract_device_index_from_id__audio4(const char* id, const char* base, int* pIndexOut)
-{
-    size_t idLen;
-    size_t baseLen;
-    const char* deviceIndexStr;
-
-    MA_ASSERT(id != NULL);
-    MA_ASSERT(base != NULL);
-    MA_ASSERT(pIndexOut != NULL);
-
-    idLen = strlen(id);
-    baseLen = strlen(base);
-    if (idLen <= baseLen) {
-        return MA_ERROR;   /* Doesn't look like the id starts with the base. */
-    }
-
-    if (strncmp(id, base, baseLen) != 0) {
-        return MA_ERROR;   /* ID does not begin with base. */
-    }
-
-    deviceIndexStr = id + baseLen;
-    if (deviceIndexStr[0] == '\0') {
-        return MA_ERROR;   /* No index specified in the ID. */
-    }
-
-    if (pIndexOut) {
-        *pIndexOut = atoi(deviceIndexStr);
-    }
-
-    return MA_SUCCESS;
-}
-
-
-#if !defined(MA_AUDIO4_USE_NEW_API)    /* Old API */
-static ma_format ma_format_from_encoding__audio4(unsigned int encoding, unsigned int precision)
-{
-    if (precision == 8 && (encoding == AUDIO_ENCODING_ULINEAR || encoding == AUDIO_ENCODING_ULINEAR || encoding == AUDIO_ENCODING_ULINEAR_LE || encoding == AUDIO_ENCODING_ULINEAR_BE)) {
-        return ma_format_u8;
-    } else {
-        if (ma_is_little_endian() && encoding == AUDIO_ENCODING_SLINEAR_LE) {
-            if (precision == 16) {
-                return ma_format_s16;
-            } else if (precision == 24) {
-                return ma_format_s24;
-            } else if (precision == 32) {
-                return ma_format_s32;
-            }
-        } else if (ma_is_big_endian() && encoding == AUDIO_ENCODING_SLINEAR_BE) {
-            if (precision == 16) {
-                return ma_format_s16;
-            } else if (precision == 24) {
-                return ma_format_s24;
-            } else if (precision == 32) {
-                return ma_format_s32;
-            }
-        }
-    }
-
-    return ma_format_unknown;  /* Encoding not supported. */
-}
-
-static void ma_encoding_from_format__audio4(ma_format format, unsigned int* pEncoding, unsigned int* pPrecision)
-{
-    MA_ASSERT(pEncoding  != NULL);
-    MA_ASSERT(pPrecision != NULL);
-
-    switch (format)
-    {
-        case ma_format_u8:
-        {
-            *pEncoding = AUDIO_ENCODING_ULINEAR;
-            *pPrecision = 8;
-        } break;
-
-        case ma_format_s24:
-        {
-            *pEncoding = (ma_is_little_endian()) ? AUDIO_ENCODING_SLINEAR_LE : AUDIO_ENCODING_SLINEAR_BE;
-            *pPrecision = 24;
-        } break;
-
-        case ma_format_s32:
-        {
-            *pEncoding = (ma_is_little_endian()) ? AUDIO_ENCODING_SLINEAR_LE : AUDIO_ENCODING_SLINEAR_BE;
-            *pPrecision = 32;
-        } break;
-
-        case ma_format_s16:
-        case ma_format_f32:
-        case ma_format_unknown:
-        default:
-        {
-            *pEncoding = (ma_is_little_endian()) ? AUDIO_ENCODING_SLINEAR_LE : AUDIO_ENCODING_SLINEAR_BE;
-            *pPrecision = 16;
-        } break;
-    }
-}
-
-static ma_format ma_format_from_prinfo__audio4(struct audio_prinfo* prinfo)
-{
-    return ma_format_from_encoding__audio4(prinfo->encoding, prinfo->precision);
-}
-
-static ma_format ma_best_format_from_fd__audio4(int fd, ma_format preferredFormat)
-{
-    audio_encoding_t encoding;
-    ma_uint32 iFormat;
-    int counter = 0;
-
-    /* First check to see if the preferred format is supported. */
-    if (preferredFormat != ma_format_unknown) {
-        counter = 0;
-        for (;;) {
-            MA_ZERO_OBJECT(&encoding);
-            encoding.index = counter;
-            if (ioctl(fd, AUDIO_GETENC, &encoding) < 0) {
-                break;
-            }
-
-            if (preferredFormat == ma_format_from_encoding__audio4(encoding.encoding, encoding.precision)) {
-                return preferredFormat;  /* Found the preferred format. */
-            }
-
-            /* Getting here means this encoding does not match our preferred format so we need to more on to the next encoding. */
-            counter += 1;
-        }
-    }
-
-    /* Getting here means our preferred format is not supported, so fall back to our standard priorities. */
-    for (iFormat = 0; iFormat < ma_countof(g_maFormatPriorities); iFormat += 1) {
-        ma_format format = g_maFormatPriorities[iFormat];
-
-        counter = 0;
-        for (;;) {
-            MA_ZERO_OBJECT(&encoding);
-            encoding.index = counter;
-            if (ioctl(fd, AUDIO_GETENC, &encoding) < 0) {
-                break;
-            }
-
-            if (format == ma_format_from_encoding__audio4(encoding.encoding, encoding.precision)) {
-                return format;  /* Found a workable format. */
-            }
-
-            /* Getting here means this encoding does not match our preferred format so we need to more on to the next encoding. */
-            counter += 1;
-        }
-    }
-
-    /* Getting here means not appropriate format was found. */
-    return ma_format_unknown;
-}
-#else
-static ma_format ma_format_from_swpar__audio4(struct audio_swpar* par)
-{
-    if (par->bits == 8 && par->bps == 1 && par->sig == 0) {
-        return ma_format_u8;
-    }
-    if (par->bits == 16 && par->bps == 2 && par->sig == 1 && par->le == ma_is_little_endian()) {
-        return ma_format_s16;
-    }
-    if (par->bits == 24 && par->bps == 3 && par->sig == 1 && par->le == ma_is_little_endian()) {
-        return ma_format_s24;
-    }
-    if (par->bits == 32 && par->bps == 4 && par->sig == 1 && par->le == ma_is_little_endian()) {
-        return ma_format_f32;
-    }
-
-    /* Format not supported. */
-    return ma_format_unknown;
-}
-#endif
-
-static ma_result ma_context_get_device_info_from_fd__audio4(ma_context* pContext, ma_device_type deviceType, int fd, ma_device_info* pDeviceInfo)
-{
-    audio_device_t fdDevice;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(fd >= 0);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    (void)pContext;
-    (void)deviceType;
-
-    if (ioctl(fd, AUDIO_GETDEV, &fdDevice) < 0) {
-        return MA_ERROR;   /* Failed to retrieve device info. */
-    }
-
-    /* Name. */
-    ma_strcpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), fdDevice.name);
-
-    #if !defined(MA_AUDIO4_USE_NEW_API)
-    {
-        audio_info_t fdInfo;
-        int counter = 0;
-        ma_uint32 channels;
-        ma_uint32 sampleRate;
-
-        if (ioctl(fd, AUDIO_GETINFO, &fdInfo) < 0) {
-            return MA_ERROR;
-        }
-
-        if (deviceType == ma_device_type_playback) {
-            channels   = fdInfo.play.channels;
-            sampleRate = fdInfo.play.sample_rate;
-        } else {
-            channels   = fdInfo.record.channels;
-            sampleRate = fdInfo.record.sample_rate;
-        }
-
-        /* Supported formats. We get this by looking at the encodings. */
-        pDeviceInfo->nativeDataFormatCount = 0;
-        for (;;) {
-            audio_encoding_t encoding;
-            ma_format format;
-
-            MA_ZERO_OBJECT(&encoding);
-            encoding.index = counter;
-            if (ioctl(fd, AUDIO_GETENC, &encoding) < 0) {
-                break;
-            }
-
-            format = ma_format_from_encoding__audio4(encoding.encoding, encoding.precision);
-            if (format != ma_format_unknown) {
-                ma_device_info_add_native_data_format(pDeviceInfo, format, channels, sampleRate, 0);
-            }
-
-            counter += 1;
-        }
-    }
-    #else
-    {
-        struct audio_swpar fdPar;
-        ma_format format;
-        ma_uint32 channels;
-        ma_uint32 sampleRate;
-
-        if (ioctl(fd, AUDIO_GETPAR, &fdPar) < 0) {
-            return MA_ERROR;
-        }
-
-        format = ma_format_from_swpar__audio4(&fdPar);
-        if (format == ma_format_unknown) {
-            return MA_FORMAT_NOT_SUPPORTED;
-        }
-
-        if (deviceType == ma_device_type_playback) {
-            channels = fdPar.pchan;
-        } else {
-            channels = fdPar.rchan;
-        }
-
-        sampleRate = fdPar.rate;
-
-        pDeviceInfo->nativeDataFormatCount = 0;
-        ma_device_info_add_native_data_format(pDeviceInfo, format, channels, sampleRate, 0);
-    }
-    #endif
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_enumerate_devices__audio4(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    const int maxDevices = 64;
-    char devpath[256];
-    int iDevice;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /*
-    Every device will be named "/dev/audioN", with a "/dev/audioctlN" equivalent. We use the "/dev/audioctlN"
-    version here since we can open it even when another process has control of the "/dev/audioN" device.
-    */
-    for (iDevice = 0; iDevice < maxDevices; ++iDevice) {
-        struct stat st;
-        int fd;
-        ma_bool32 isTerminating = MA_FALSE;
-
-        ma_strcpy_s(devpath, sizeof(devpath), "/dev/audioctl");
-        ma_itoa_s(iDevice, devpath+strlen(devpath), sizeof(devpath)-strlen(devpath), 10);
-
-        if (stat(devpath, &st) < 0) {
-            break;
-        }
-
-        /* The device exists, but we need to check if it's usable as playback and/or capture. */
-
-        /* Playback. */
-        if (!isTerminating) {
-            fd = open(devpath, O_RDONLY, 0);
-            if (fd >= 0) {
-                /* Supports playback. */
-                ma_device_info deviceInfo;
-                MA_ZERO_OBJECT(&deviceInfo);
-                ma_construct_device_id__audio4(deviceInfo.id.audio4, sizeof(deviceInfo.id.audio4), "/dev/audio", iDevice);
-                if (ma_context_get_device_info_from_fd__audio4(pContext, ma_device_type_playback, fd, &deviceInfo) == MA_SUCCESS) {
-                    isTerminating = !callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-                }
-
-                close(fd);
-            }
-        }
-
-        /* Capture. */
-        if (!isTerminating) {
-            fd = open(devpath, O_WRONLY, 0);
-            if (fd >= 0) {
-                /* Supports capture. */
-                ma_device_info deviceInfo;
-                MA_ZERO_OBJECT(&deviceInfo);
-                ma_construct_device_id__audio4(deviceInfo.id.audio4, sizeof(deviceInfo.id.audio4), "/dev/audio", iDevice);
-                if (ma_context_get_device_info_from_fd__audio4(pContext, ma_device_type_capture, fd, &deviceInfo) == MA_SUCCESS) {
-                    isTerminating = !callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-                }
-
-                close(fd);
-            }
-        }
-
-        if (isTerminating) {
-            break;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__audio4(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    int fd = -1;
-    int deviceIndex = -1;
-    char ctlid[256];
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-
-    /*
-    We need to open the "/dev/audioctlN" device to get the info. To do this we need to extract the number
-    from the device ID which will be in "/dev/audioN" format.
-    */
-    if (pDeviceID == NULL) {
-        /* Default device. */
-        ma_strcpy_s(ctlid, sizeof(ctlid), "/dev/audioctl");
-    } else {
-        /* Specific device. We need to convert from "/dev/audioN" to "/dev/audioctlN". */
-        result = ma_extract_device_index_from_id__audio4(pDeviceID->audio4, "/dev/audio", &deviceIndex);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        ma_construct_device_id__audio4(ctlid, sizeof(ctlid), "/dev/audioctl", deviceIndex);
-    }
-
-    fd = open(ctlid, (deviceType == ma_device_type_playback) ? O_WRONLY : O_RDONLY, 0);
-    if (fd == -1) {
-        return MA_NO_DEVICE;
-    }
-
-    if (deviceIndex == -1) {
-        ma_strcpy_s(pDeviceInfo->id.audio4, sizeof(pDeviceInfo->id.audio4), "/dev/audio");
-    } else {
-        ma_construct_device_id__audio4(pDeviceInfo->id.audio4, sizeof(pDeviceInfo->id.audio4), "/dev/audio", deviceIndex);
-    }
-
-    result = ma_context_get_device_info_from_fd__audio4(pContext, deviceType, fd, pDeviceInfo);
-
-    close(fd);
-    return result;
-}
-
-static ma_result ma_device_uninit__audio4(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        close(pDevice->audio4.fdCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        close(pDevice->audio4.fdPlayback);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init_fd__audio4(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptor, ma_device_type deviceType)
-{
-    const char* pDefaultDeviceNames[] = {
-        "/dev/audio",
-        "/dev/audio0"
-    };
-    const char* pDefaultDeviceCtlNames[] = {
-        "/dev/audioctl",
-        "/dev/audioctl0"
-    };
-    int fd;
-    int fdFlags = 0;
-    size_t iDefaultDevice = (size_t)-1;
-    ma_format internalFormat;
-    ma_uint32 internalChannels;
-    ma_uint32 internalSampleRate;
-    ma_uint32 internalPeriodSizeInFrames;
-    ma_uint32 internalPeriods;
-
-    MA_ASSERT(pConfig    != NULL);
-    MA_ASSERT(deviceType != ma_device_type_duplex);
-    MA_ASSERT(pDevice    != NULL);
-
-    /* The first thing to do is open the file. */
-    if (deviceType == ma_device_type_capture) {
-        fdFlags = O_RDONLY;
-    } else {
-        fdFlags = O_WRONLY;
-    }
-    /*fdFlags |= O_NONBLOCK;*/
-
-    /* Find the index of the default device as a start. We'll use this index later. Set it to (size_t)-1 otherwise. */
-    if (pDescriptor->pDeviceID == NULL) {
-        /* Default device. */
-        for (iDefaultDevice = 0; iDefaultDevice < ma_countof(pDefaultDeviceNames); ++iDefaultDevice) {
-            fd = open(pDefaultDeviceNames[iDefaultDevice], fdFlags, 0);
-            if (fd != -1) {
-                break;
-            }
-        }
-    } else {
-        /* Specific device. */
-        fd = open(pDescriptor->pDeviceID->audio4, fdFlags, 0);
-
-        for (iDefaultDevice = 0; iDefaultDevice < ma_countof(pDefaultDeviceNames); iDefaultDevice += 1) {
-            if (ma_strcmp(pDefaultDeviceNames[iDefaultDevice], pDescriptor->pDeviceID->audio4) == 0) {
-                break;
-            }
-        }
-
-        if (iDefaultDevice == ma_countof(pDefaultDeviceNames)) {
-            iDefaultDevice = (size_t)-1;
-        }
-    }
-
-    if (fd == -1) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to open device.");
-        return ma_result_from_errno(errno);
-    }
-
-    #if !defined(MA_AUDIO4_USE_NEW_API)    /* Old API */
-    {
-        audio_info_t fdInfo;
-        int fdInfoResult = -1;
-
-        /*
-        The documentation is a little bit unclear to me as to how it handles formats. It says the
-        following:
-
-            Regardless of formats supported by underlying driver, the audio driver accepts the
-            following formats.
-
-        By then the next sentence says this:
-
-            `encoding` and `precision` are one of the values obtained by AUDIO_GETENC.
-
-        It sounds like a direct contradiction to me. I'm going to play this safe any only use the
-        best sample format returned by AUDIO_GETENC. If the requested format is supported we'll
-        use that, but otherwise we'll just use our standard format priorities to pick an
-        appropriate one.
-        */
-        AUDIO_INITINFO(&fdInfo);
-
-        /*
-        Get the default format from the audioctl file if we're asking for a default device. If we
-        retrieve it from /dev/audio it'll default to mono 8000Hz.
-        */
-        if (iDefaultDevice != (size_t)-1) {
-            /* We're using a default device. Get the info from the /dev/audioctl file instead of /dev/audio. */
-            int fdctl = open(pDefaultDeviceCtlNames[iDefaultDevice], fdFlags, 0);
-            if (fdctl != -1) {
-                fdInfoResult = ioctl(fdctl, AUDIO_GETINFO, &fdInfo);
-                close(fdctl);
-            }
-        }
-
-        if (fdInfoResult == -1) {
-            /* We still don't have the default device info so just retrieve it from the main audio device. */
-            if (ioctl(fd, AUDIO_GETINFO, &fdInfo) < 0) {
-                close(fd);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] AUDIO_GETINFO failed.");
-                return ma_result_from_errno(errno);
-            }
-        }
-
-        /* We get the driver to do as much of the data conversion as possible. */
-        if (deviceType == ma_device_type_capture) {
-            fdInfo.mode = AUMODE_RECORD;
-            ma_encoding_from_format__audio4(ma_best_format_from_fd__audio4(fd, pDescriptor->format), &fdInfo.record.encoding, &fdInfo.record.precision);
-
-            if (pDescriptor->channels != 0) {
-                fdInfo.record.channels = ma_clamp(pDescriptor->channels, 1, 12);    /* From the documentation: `channels` ranges from 1 to 12. */
-            }
-
-            if (pDescriptor->sampleRate != 0) {
-                fdInfo.record.sample_rate = ma_clamp(pDescriptor->sampleRate, 1000, 192000);    /* From the documentation: `frequency` ranges from 1000Hz to 192000Hz. (They mean `sample_rate` instead of `frequency`.) */
-            }
-        } else {
-            fdInfo.mode = AUMODE_PLAY;
-            ma_encoding_from_format__audio4(ma_best_format_from_fd__audio4(fd, pDescriptor->format), &fdInfo.play.encoding, &fdInfo.play.precision);
-
-            if (pDescriptor->channels != 0) {
-                fdInfo.play.channels = ma_clamp(pDescriptor->channels, 1, 12);    /* From the documentation: `channels` ranges from 1 to 12. */
-            }
-
-            if (pDescriptor->sampleRate != 0) {
-                fdInfo.play.sample_rate = ma_clamp(pDescriptor->sampleRate, 1000, 192000);    /* From the documentation: `frequency` ranges from 1000Hz to 192000Hz. (They mean `sample_rate` instead of `frequency`.) */
-            }
-        }
-
-        if (ioctl(fd, AUDIO_SETINFO, &fdInfo) < 0) {
-            close(fd);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to set device format. AUDIO_SETINFO failed.");
-            return ma_result_from_errno(errno);
-        }
-
-        if (ioctl(fd, AUDIO_GETINFO, &fdInfo) < 0) {
-            close(fd);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] AUDIO_GETINFO failed.");
-            return ma_result_from_errno(errno);
-        }
-
-        if (deviceType == ma_device_type_capture) {
-            internalFormat     = ma_format_from_prinfo__audio4(&fdInfo.record);
-            internalChannels   = fdInfo.record.channels;
-            internalSampleRate = fdInfo.record.sample_rate;
-        } else {
-            internalFormat     = ma_format_from_prinfo__audio4(&fdInfo.play);
-            internalChannels   = fdInfo.play.channels;
-            internalSampleRate = fdInfo.play.sample_rate;
-        }
-
-        if (internalFormat == ma_format_unknown) {
-            close(fd);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] The device's internal device format is not supported by miniaudio. The device is unusable.");
-            return MA_FORMAT_NOT_SUPPORTED;
-        }
-
-        /* Buffer. */
-        {
-            ma_uint32 internalPeriodSizeInBytes;
-
-            internalPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, internalSampleRate, pConfig->performanceProfile);
-
-            internalPeriodSizeInBytes = internalPeriodSizeInFrames * ma_get_bytes_per_frame(internalFormat, internalChannels);
-            if (internalPeriodSizeInBytes < 16) {
-                internalPeriodSizeInBytes = 16;
-            }
-
-            internalPeriods = pDescriptor->periodCount;
-            if (internalPeriods < 2) {
-                internalPeriods = 2;
-            }
-
-            /* What miniaudio calls a period, audio4 calls a block. */
-            AUDIO_INITINFO(&fdInfo);
-            fdInfo.hiwat     = internalPeriods;
-            fdInfo.lowat     = internalPeriods-1;
-            fdInfo.blocksize = internalPeriodSizeInBytes;
-            if (ioctl(fd, AUDIO_SETINFO, &fdInfo) < 0) {
-                close(fd);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to set internal buffer size. AUDIO_SETINFO failed.");
-                return ma_result_from_errno(errno);
-            }
-
-            internalPeriods            = fdInfo.hiwat;
-            internalPeriodSizeInFrames = fdInfo.blocksize / ma_get_bytes_per_frame(internalFormat, internalChannels);
-        }
-    }
-    #else
-    {
-        struct audio_swpar fdPar;
-
-        /* We need to retrieve the format of the device so we can know the channel count and sample rate. Then we can calculate the buffer size. */
-        if (ioctl(fd, AUDIO_GETPAR, &fdPar) < 0) {
-            close(fd);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to retrieve initial device parameters.");
-            return ma_result_from_errno(errno);
-        }
-
-        internalFormat     = ma_format_from_swpar__audio4(&fdPar);
-        internalChannels   = (deviceType == ma_device_type_capture) ? fdPar.rchan : fdPar.pchan;
-        internalSampleRate = fdPar.rate;
-
-        if (internalFormat == ma_format_unknown) {
-            close(fd);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] The device's internal device format is not supported by miniaudio. The device is unusable.");
-            return MA_FORMAT_NOT_SUPPORTED;
-        }
-
-        /* Buffer. */
-        {
-            ma_uint32 internalPeriodSizeInBytes;
-
-            internalPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, internalSampleRate, pConfig->performanceProfile);
-
-            /* What miniaudio calls a period, audio4 calls a block. */
-            internalPeriodSizeInBytes = internalPeriodSizeInFrames * ma_get_bytes_per_frame(internalFormat, internalChannels);
-            if (internalPeriodSizeInBytes < 16) {
-                internalPeriodSizeInBytes = 16;
-            }
-
-            fdPar.nblks = pDescriptor->periodCount;
-            fdPar.round = internalPeriodSizeInBytes;
-
-            if (ioctl(fd, AUDIO_SETPAR, &fdPar) < 0) {
-                close(fd);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to set device parameters.");
-                return ma_result_from_errno(errno);
-            }
-
-            if (ioctl(fd, AUDIO_GETPAR, &fdPar) < 0) {
-                close(fd);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to retrieve actual device parameters.");
-                return ma_result_from_errno(errno);
-            }
-        }
-
-        internalFormat             = ma_format_from_swpar__audio4(&fdPar);
-        internalChannels           = (deviceType == ma_device_type_capture) ? fdPar.rchan : fdPar.pchan;
-        internalSampleRate         = fdPar.rate;
-        internalPeriods            = fdPar.nblks;
-        internalPeriodSizeInFrames = fdPar.round / ma_get_bytes_per_frame(internalFormat, internalChannels);
-    }
-    #endif
-
-    if (internalFormat == ma_format_unknown) {
-        close(fd);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] The device's internal device format is not supported by miniaudio. The device is unusable.");
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-
-    if (deviceType == ma_device_type_capture) {
-        pDevice->audio4.fdCapture  = fd;
-    } else {
-        pDevice->audio4.fdPlayback = fd;
-    }
-
-    pDescriptor->format             = internalFormat;
-    pDescriptor->channels           = internalChannels;
-    pDescriptor->sampleRate         = internalSampleRate;
-    ma_channel_map_init_standard(ma_standard_channel_map_sound4, pDescriptor->channelMap, ma_countof(pDescriptor->channelMap), internalChannels);
-    pDescriptor->periodSizeInFrames = internalPeriodSizeInFrames;
-    pDescriptor->periodCount        = internalPeriods;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__audio4(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->audio4);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    pDevice->audio4.fdCapture  = -1;
-    pDevice->audio4.fdPlayback = -1;
-
-    /*
-    The version of the operating system dictates whether or not the device is exclusive or shared. NetBSD
-    introduced in-kernel mixing which means it's shared. All other BSD flavours are exclusive as far as
-    I'm aware.
-    */
-#if defined(__NetBSD_Version__) && __NetBSD_Version__ >= 800000000
-    /* NetBSD 8.0+ */
-    if (((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pDescriptorPlayback->shareMode == ma_share_mode_exclusive) ||
-        ((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pDescriptorCapture->shareMode  == ma_share_mode_exclusive)) {
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-#else
-    /* All other flavors. */
-#endif
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_fd__audio4(pDevice, pConfig, pDescriptorCapture, ma_device_type_capture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_fd__audio4(pDevice, pConfig, pDescriptorPlayback, ma_device_type_playback);
-        if (result != MA_SUCCESS) {
-            if (pConfig->deviceType == ma_device_type_duplex) {
-                close(pDevice->audio4.fdCapture);
-            }
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__audio4(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->audio4.fdCapture == -1) {
-            return MA_INVALID_ARGS;
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->audio4.fdPlayback == -1) {
-            return MA_INVALID_ARGS;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop_fd__audio4(ma_device* pDevice, int fd)
-{
-    if (fd == -1) {
-        return MA_INVALID_ARGS;
-    }
-
-#if !defined(MA_AUDIO4_USE_NEW_API)
-    if (ioctl(fd, AUDIO_FLUSH, 0) < 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to stop device. AUDIO_FLUSH failed.");
-        return ma_result_from_errno(errno);
-    }
-#else
-    if (ioctl(fd, AUDIO_STOP, 0) < 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to stop device. AUDIO_STOP failed.");
-        return ma_result_from_errno(errno);
-    }
-#endif
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__audio4(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ma_result result;
-
-        result = ma_device_stop_fd__audio4(pDevice, pDevice->audio4.fdCapture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_result result;
-
-        /* Drain the device first. If this fails we'll just need to flush without draining. Unfortunately draining isn't available on newer version of OpenBSD. */
-    #if !defined(MA_AUDIO4_USE_NEW_API)
-        ioctl(pDevice->audio4.fdPlayback, AUDIO_DRAIN, 0);
-    #endif
-
-        /* Here is where the device is stopped immediately. */
-        result = ma_device_stop_fd__audio4(pDevice, pDevice->audio4.fdPlayback);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_write__audio4(ma_device* pDevice, const void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten)
-{
-    int result;
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = 0;
-    }
-
-    result = write(pDevice->audio4.fdPlayback, pPCMFrames, frameCount * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels));
-    if (result < 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to write data to the device.");
-        return ma_result_from_errno(errno);
-    }
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = (ma_uint32)result / ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_read__audio4(ma_device* pDevice, void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesRead)
-{
-    int result;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    result = read(pDevice->audio4.fdCapture, pPCMFrames, frameCount * ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels));
-    if (result < 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[audio4] Failed to read data from the device.");
-        return ma_result_from_errno(errno);
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = (ma_uint32)result / ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_uninit__audio4(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_audio4);
-
-    (void)pContext;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__audio4(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig;
-
-    pCallbacks->onContextInit             = ma_context_init__audio4;
-    pCallbacks->onContextUninit           = ma_context_uninit__audio4;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__audio4;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__audio4;
-    pCallbacks->onDeviceInit              = ma_device_init__audio4;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__audio4;
-    pCallbacks->onDeviceStart             = ma_device_start__audio4;
-    pCallbacks->onDeviceStop              = ma_device_stop__audio4;
-    pCallbacks->onDeviceRead              = ma_device_read__audio4;
-    pCallbacks->onDeviceWrite             = ma_device_write__audio4;
-    pCallbacks->onDeviceDataLoop          = NULL;
-
-    return MA_SUCCESS;
-}
-#endif  /* audio4 */
-
-
-/******************************************************************************
-
-OSS Backend
-
-******************************************************************************/
-#ifdef MA_HAS_OSS
-#include <sys/ioctl.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/soundcard.h>
-
-#ifndef SNDCTL_DSP_HALT
-#define SNDCTL_DSP_HALT SNDCTL_DSP_RESET
-#endif
-
-#define MA_OSS_DEFAULT_DEVICE_NAME  "/dev/dsp"
-
-static int ma_open_temp_device__oss()
-{
-    /* The OSS sample code uses "/dev/mixer" as the device for getting system properties so I'm going to do the same. */
-    int fd = open("/dev/mixer", O_RDONLY, 0);
-    if (fd >= 0) {
-        return fd;
-    }
-
-    return -1;
-}
-
-static ma_result ma_context_open_device__oss(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_share_mode shareMode, int* pfd)
-{
-    const char* deviceName;
-    int flags;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pfd != NULL);
-    (void)pContext;
-
-    *pfd = -1;
-
-    /* This function should only be called for playback or capture, not duplex. */
-    if (deviceType == ma_device_type_duplex) {
-        return MA_INVALID_ARGS;
-    }
-
-    deviceName = MA_OSS_DEFAULT_DEVICE_NAME;
-    if (pDeviceID != NULL) {
-        deviceName = pDeviceID->oss;
-    }
-
-    flags = (deviceType == ma_device_type_playback) ? O_WRONLY : O_RDONLY;
-    if (shareMode == ma_share_mode_exclusive) {
-        flags |= O_EXCL;
-    }
-
-    *pfd = open(deviceName, flags, 0);
-    if (*pfd == -1) {
-        return ma_result_from_errno(errno);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_enumerate_devices__oss(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    int fd;
-    oss_sysinfo si;
-    int result;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    fd = ma_open_temp_device__oss();
-    if (fd == -1) {
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[OSS] Failed to open a temporary device for retrieving system information used for device enumeration.");
-        return MA_NO_BACKEND;
-    }
-
-    result = ioctl(fd, SNDCTL_SYSINFO, &si);
-    if (result != -1) {
-        int iAudioDevice;
-        for (iAudioDevice = 0; iAudioDevice < si.numaudios; ++iAudioDevice) {
-            oss_audioinfo ai;
-            ai.dev = iAudioDevice;
-            result = ioctl(fd, SNDCTL_AUDIOINFO, &ai);
-            if (result != -1) {
-                if (ai.devnode[0] != '\0') {    /* <-- Can be blank, according to documentation. */
-                    ma_device_info deviceInfo;
-                    ma_bool32 isTerminating = MA_FALSE;
-
-                    MA_ZERO_OBJECT(&deviceInfo);
-
-                    /* ID */
-                    ma_strncpy_s(deviceInfo.id.oss, sizeof(deviceInfo.id.oss), ai.devnode, (size_t)-1);
-
-                    /*
-                    The human readable device name should be in the "ai.handle" variable, but it can
-                    sometimes be empty in which case we just fall back to "ai.name" which is less user
-                    friendly, but usually has a value.
-                    */
-                    if (ai.handle[0] != '\0') {
-                        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), ai.handle, (size_t)-1);
-                    } else {
-                        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), ai.name, (size_t)-1);
-                    }
-
-                    /* The device can be both playback and capture. */
-                    if (!isTerminating && (ai.caps & PCM_CAP_OUTPUT) != 0) {
-                        isTerminating = !callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-                    }
-                    if (!isTerminating && (ai.caps & PCM_CAP_INPUT) != 0) {
-                        isTerminating = !callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-                    }
-
-                    if (isTerminating) {
-                        break;
-                    }
-                }
-            }
-        }
-    } else {
-        close(fd);
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[OSS] Failed to retrieve system information for device enumeration.");
-        return MA_NO_BACKEND;
-    }
-
-    close(fd);
-    return MA_SUCCESS;
-}
-
-static void ma_context_add_native_data_format__oss(ma_context* pContext, oss_audioinfo* pAudioInfo, ma_format format, ma_device_info* pDeviceInfo)
-{
-    unsigned int minChannels;
-    unsigned int maxChannels;
-    unsigned int iRate;
-
-    MA_ASSERT(pContext    != NULL);
-    MA_ASSERT(pAudioInfo  != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    /* If we support all channels we just report 0. */
-    minChannels = ma_clamp(pAudioInfo->min_channels, MA_MIN_CHANNELS, MA_MAX_CHANNELS);
-    maxChannels = ma_clamp(pAudioInfo->max_channels, MA_MIN_CHANNELS, MA_MAX_CHANNELS);
-
-    /*
-    OSS has this annoying thing where sample rates can be reported in two ways. We prefer explicitness,
-    which OSS has in the form of nrates/rates, however there are times where nrates can be 0, in which
-    case we'll need to use min_rate and max_rate and report only standard rates.
-    */
-    if (pAudioInfo->nrates > 0) {
-        for (iRate = 0; iRate < pAudioInfo->nrates; iRate += 1) {
-            unsigned int rate = pAudioInfo->rates[iRate];
-
-            if (minChannels == MA_MIN_CHANNELS && maxChannels == MA_MAX_CHANNELS) {
-                ma_device_info_add_native_data_format(pDeviceInfo, format, 0, rate, 0);   /* Set the channel count to 0 to indicate that all channel counts are supported. */
-            } else {
-                unsigned int iChannel;
-                for (iChannel = minChannels; iChannel <= maxChannels; iChannel += 1) {
-                     ma_device_info_add_native_data_format(pDeviceInfo, format, iChannel, rate, 0);
-                }
-            }
-        }
-    } else {
-        for (iRate = 0; iRate < ma_countof(g_maStandardSampleRatePriorities); iRate += 1) {
-            ma_uint32 standardRate = g_maStandardSampleRatePriorities[iRate];
-
-            if (standardRate >= (ma_uint32)pAudioInfo->min_rate && standardRate <= (ma_uint32)pAudioInfo->max_rate) {
-                if (minChannels == MA_MIN_CHANNELS && maxChannels == MA_MAX_CHANNELS) {
-                    ma_device_info_add_native_data_format(pDeviceInfo, format, 0, standardRate, 0);   /* Set the channel count to 0 to indicate that all channel counts are supported. */
-                } else {
-                    unsigned int iChannel;
-                    for (iChannel = minChannels; iChannel <= maxChannels; iChannel += 1) {
-                         ma_device_info_add_native_data_format(pDeviceInfo, format, iChannel, standardRate, 0);
-                    }
-                }
-            }
-        }
-    }
-}
-
-static ma_result ma_context_get_device_info__oss(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_bool32 foundDevice;
-    int fdTemp;
-    oss_sysinfo si;
-    int result;
-
-    MA_ASSERT(pContext != NULL);
-
-    /* Handle the default device a little differently. */
-    if (pDeviceID == NULL) {
-        if (deviceType == ma_device_type_playback) {
-            ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-        } else {
-            ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-        }
-
-        return MA_SUCCESS;
-    }
-
-
-    /* If we get here it means we are _not_ using the default device. */
-    foundDevice = MA_FALSE;
-
-    fdTemp = ma_open_temp_device__oss();
-    if (fdTemp == -1) {
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[OSS] Failed to open a temporary device for retrieving system information used for device enumeration.");
-        return MA_NO_BACKEND;
-    }
-
-    result = ioctl(fdTemp, SNDCTL_SYSINFO, &si);
-    if (result != -1) {
-        int iAudioDevice;
-        for (iAudioDevice = 0; iAudioDevice < si.numaudios; ++iAudioDevice) {
-            oss_audioinfo ai;
-            ai.dev = iAudioDevice;
-            result = ioctl(fdTemp, SNDCTL_AUDIOINFO, &ai);
-            if (result != -1) {
-                if (ma_strcmp(ai.devnode, pDeviceID->oss) == 0) {
-                    /* It has the same name, so now just confirm the type. */
-                    if ((deviceType == ma_device_type_playback && ((ai.caps & PCM_CAP_OUTPUT) != 0)) ||
-                        (deviceType == ma_device_type_capture  && ((ai.caps & PCM_CAP_INPUT)  != 0))) {
-                        unsigned int formatMask;
-
-                        /* ID */
-                        ma_strncpy_s(pDeviceInfo->id.oss, sizeof(pDeviceInfo->id.oss), ai.devnode, (size_t)-1);
-
-                        /*
-                        The human readable device name should be in the "ai.handle" variable, but it can
-                        sometimes be empty in which case we just fall back to "ai.name" which is less user
-                        friendly, but usually has a value.
-                        */
-                        if (ai.handle[0] != '\0') {
-                            ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), ai.handle, (size_t)-1);
-                        } else {
-                            ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), ai.name, (size_t)-1);
-                        }
-
-
-                        pDeviceInfo->nativeDataFormatCount = 0;
-
-                        if (deviceType == ma_device_type_playback) {
-                            formatMask = ai.oformats;
-                        } else {
-                            formatMask = ai.iformats;
-                        }
-
-                        if (((formatMask & AFMT_S16_LE) != 0 && ma_is_little_endian()) || (AFMT_S16_BE && ma_is_big_endian())) {
-                            ma_context_add_native_data_format__oss(pContext, &ai, ma_format_s16, pDeviceInfo);
-                        }
-                        if (((formatMask & AFMT_S32_LE) != 0 && ma_is_little_endian()) || (AFMT_S32_BE && ma_is_big_endian())) {
-                            ma_context_add_native_data_format__oss(pContext, &ai, ma_format_s32, pDeviceInfo);
-                        }
-                        if ((formatMask & AFMT_U8) != 0) {
-                            ma_context_add_native_data_format__oss(pContext, &ai, ma_format_u8, pDeviceInfo);
-                        }
-
-                        foundDevice = MA_TRUE;
-                        break;
-                    }
-                }
-            }
-        }
-    } else {
-        close(fdTemp);
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[OSS] Failed to retrieve system information for device enumeration.");
-        return MA_NO_BACKEND;
-    }
-
-
-    close(fdTemp);
-
-    if (!foundDevice) {
-        return MA_NO_DEVICE;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_uninit__oss(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        close(pDevice->oss.fdCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        close(pDevice->oss.fdPlayback);
-    }
-
-    return MA_SUCCESS;
-}
-
-static int ma_format_to_oss(ma_format format)
-{
-    int ossFormat = AFMT_U8;
-    switch (format) {
-        case ma_format_s16: ossFormat = (ma_is_little_endian()) ? AFMT_S16_LE : AFMT_S16_BE; break;
-        case ma_format_s24: ossFormat = (ma_is_little_endian()) ? AFMT_S32_LE : AFMT_S32_BE; break;
-        case ma_format_s32: ossFormat = (ma_is_little_endian()) ? AFMT_S32_LE : AFMT_S32_BE; break;
-        case ma_format_f32: ossFormat = (ma_is_little_endian()) ? AFMT_S16_LE : AFMT_S16_BE; break;
-        case ma_format_u8:
-        default: ossFormat = AFMT_U8; break;
-    }
-
-    return ossFormat;
-}
-
-static ma_format ma_format_from_oss(int ossFormat)
-{
-    if (ossFormat == AFMT_U8) {
-        return ma_format_u8;
-    } else {
-        if (ma_is_little_endian()) {
-            switch (ossFormat) {
-                case AFMT_S16_LE: return ma_format_s16;
-                case AFMT_S32_LE: return ma_format_s32;
-                default: return ma_format_unknown;
-            }
-        } else {
-            switch (ossFormat) {
-                case AFMT_S16_BE: return ma_format_s16;
-                case AFMT_S32_BE: return ma_format_s32;
-                default: return ma_format_unknown;
-            }
-        }
-    }
-
-    return ma_format_unknown;
-}
-
-static ma_result ma_device_init_fd__oss(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptor, ma_device_type deviceType)
-{
-    ma_result result;
-    int ossResult;
-    int fd;
-    const ma_device_id* pDeviceID = NULL;
-    ma_share_mode shareMode;
-    int ossFormat;
-    int ossChannels;
-    int ossSampleRate;
-    int ossFragment;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(deviceType != ma_device_type_duplex);
-
-    pDeviceID     = pDescriptor->pDeviceID;
-    shareMode     = pDescriptor->shareMode;
-    ossFormat     = ma_format_to_oss((pDescriptor->format != ma_format_unknown) ? pDescriptor->format : ma_format_s16); /* Use s16 by default because OSS doesn't like floating point. */
-    ossChannels   = (int)(pDescriptor->channels   > 0) ? pDescriptor->channels   : MA_DEFAULT_CHANNELS;
-    ossSampleRate = (int)(pDescriptor->sampleRate > 0) ? pDescriptor->sampleRate : MA_DEFAULT_SAMPLE_RATE;
-
-    result = ma_context_open_device__oss(pDevice->pContext, deviceType, pDeviceID, shareMode, &fd);
-    if (result != MA_SUCCESS) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to open device.");
-        return result;
-    }
-
-    /*
-    The OSS documantation is very clear about the order we should be initializing the device's properties:
-      1) Format
-      2) Channels
-      3) Sample rate.
-    */
-
-    /* Format. */
-    ossResult = ioctl(fd, SNDCTL_DSP_SETFMT, &ossFormat);
-    if (ossResult == -1) {
-        close(fd);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to set format.");
-        return ma_result_from_errno(errno);
-    }
-
-    /* Channels. */
-    ossResult = ioctl(fd, SNDCTL_DSP_CHANNELS, &ossChannels);
-    if (ossResult == -1) {
-        close(fd);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to set channel count.");
-        return ma_result_from_errno(errno);
-    }
-
-    /* Sample Rate. */
-    ossResult = ioctl(fd, SNDCTL_DSP_SPEED, &ossSampleRate);
-    if (ossResult == -1) {
-        close(fd);
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to set sample rate.");
-        return ma_result_from_errno(errno);
-    }
-
-    /*
-    Buffer.
-
-    The documentation says that the fragment settings should be set as soon as possible, but I'm not sure if
-    it should be done before or after format/channels/rate.
-
-    OSS wants the fragment size in bytes and a power of 2. When setting, we specify the power, not the actual
-    value.
-    */
-    {
-        ma_uint32 periodSizeInFrames;
-        ma_uint32 periodSizeInBytes;
-        ma_uint32 ossFragmentSizePower;
-
-        periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, (ma_uint32)ossSampleRate, pConfig->performanceProfile);
-
-        periodSizeInBytes = ma_round_to_power_of_2(periodSizeInFrames * ma_get_bytes_per_frame(ma_format_from_oss(ossFormat), ossChannels));
-        if (periodSizeInBytes < 16) {
-            periodSizeInBytes = 16;
-        }
-
-        ossFragmentSizePower = 4;
-        periodSizeInBytes >>= 4;
-        while (periodSizeInBytes >>= 1) {
-            ossFragmentSizePower += 1;
-        }
-
-        ossFragment = (int)((pConfig->periods << 16) | ossFragmentSizePower);
-        ossResult = ioctl(fd, SNDCTL_DSP_SETFRAGMENT, &ossFragment);
-        if (ossResult == -1) {
-            close(fd);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to set fragment size and period count.");
-            return ma_result_from_errno(errno);
-        }
-    }
-
-    /* Internal settings. */
-    if (deviceType == ma_device_type_capture) {
-        pDevice->oss.fdCapture  = fd;
-    } else {
-        pDevice->oss.fdPlayback = fd;
-    }
-
-    pDescriptor->format             = ma_format_from_oss(ossFormat);
-    pDescriptor->channels           = ossChannels;
-    pDescriptor->sampleRate         = ossSampleRate;
-    ma_channel_map_init_standard(ma_standard_channel_map_sound4, pDescriptor->channelMap, ma_countof(pDescriptor->channelMap), pDescriptor->channels);
-    pDescriptor->periodCount        = (ma_uint32)(ossFragment >> 16);
-    pDescriptor->periodSizeInFrames = (ma_uint32)(1 << (ossFragment & 0xFFFF)) / ma_get_bytes_per_frame(pDescriptor->format, pDescriptor->channels);
-
-    if (pDescriptor->format == ma_format_unknown) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] The device's internal format is not supported by miniaudio.");
-        return MA_FORMAT_NOT_SUPPORTED;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__oss(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    MA_ASSERT(pDevice  != NULL);
-    MA_ASSERT(pConfig  != NULL);
-
-    MA_ZERO_OBJECT(&pDevice->oss);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_fd__oss(pDevice, pConfig, pDescriptorCapture, ma_device_type_capture);
-        if (result != MA_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to open device.");
-            return result;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_result result = ma_device_init_fd__oss(pDevice, pConfig, pDescriptorPlayback, ma_device_type_playback);
-        if (result != MA_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to open device.");
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-/*
-Note on Starting and Stopping
-=============================
-In the past I was using SNDCTL_DSP_HALT to stop the device, however this results in issues when
-trying to resume the device again. If we use SNDCTL_DSP_HALT, the next write() or read() will
-fail. Instead what we need to do is just not write or read to and from the device when the
-device is not running.
-
-As a result, both the start and stop functions for OSS are just empty stubs. The starting and
-stopping logic is handled by ma_device_write__oss() and ma_device_read__oss(). These will check
-the device state, and if the device is stopped they will simply not do any kind of processing.
-
-The downside to this technique is that I've noticed a fairly lengthy delay in stopping the
-device, up to a second. This is on a virtual machine, and as such might just be due to the
-virtual drivers, but I'm not fully sure. I am not sure how to work around this problem so for
-the moment that's just how it's going to have to be.
-
-When starting the device, OSS will automatically start it when write() or read() is called.
-*/
-static ma_result ma_device_start__oss(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    /* The device is automatically started with reading and writing. */
-    (void)pDevice;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__oss(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    /* See note above on why this is empty. */
-    (void)pDevice;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_write__oss(ma_device* pDevice, const void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesWritten)
-{
-    int resultOSS;
-    ma_uint32 deviceState;
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = 0;
-    }
-
-    /* Don't do any processing if the device is stopped. */
-    deviceState = ma_device_get_state(pDevice);
-    if (deviceState != ma_device_state_started && deviceState != ma_device_state_starting) {
-        return MA_SUCCESS;
-    }
-
-    resultOSS = write(pDevice->oss.fdPlayback, pPCMFrames, frameCount * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels));
-    if (resultOSS < 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to send data from the client to the device.");
-        return ma_result_from_errno(errno);
-    }
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = (ma_uint32)resultOSS / ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_read__oss(ma_device* pDevice, void* pPCMFrames, ma_uint32 frameCount, ma_uint32* pFramesRead)
-{
-    int resultOSS;
-    ma_uint32 deviceState;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    /* Don't do any processing if the device is stopped. */
-    deviceState = ma_device_get_state(pDevice);
-    if (deviceState != ma_device_state_started && deviceState != ma_device_state_starting) {
-        return MA_SUCCESS;
-    }
-
-    resultOSS = read(pDevice->oss.fdCapture, pPCMFrames, frameCount * ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels));
-    if (resultOSS < 0) {
-        ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OSS] Failed to read data from the device to be sent to the client.");
-        return ma_result_from_errno(errno);
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = (ma_uint32)resultOSS / ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_uninit__oss(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_oss);
-
-    (void)pContext;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__oss(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    int fd;
-    int ossVersion;
-    int result;
-
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig;
-
-    /* Try opening a temporary device first so we can get version information. This is closed at the end. */
-    fd = ma_open_temp_device__oss();
-    if (fd == -1) {
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[OSS] Failed to open temporary device for retrieving system properties.");   /* Looks liks OSS isn't installed, or there are no available devices. */
-        return MA_NO_BACKEND;
-    }
-
-    /* Grab the OSS version. */
-    ossVersion = 0;
-    result = ioctl(fd, OSS_GETVERSION, &ossVersion);
-    if (result == -1) {
-        close(fd);
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_ERROR, "[OSS] Failed to retrieve OSS version.");
-        return MA_NO_BACKEND;
-    }
-
-    /* The file handle to temp device is no longer needed. Close ASAP. */
-    close(fd);
-
-    pContext->oss.versionMajor = ((ossVersion & 0xFF0000) >> 16);
-    pContext->oss.versionMinor = ((ossVersion & 0x00FF00) >> 8);
-
-    pCallbacks->onContextInit             = ma_context_init__oss;
-    pCallbacks->onContextUninit           = ma_context_uninit__oss;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__oss;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__oss;
-    pCallbacks->onDeviceInit              = ma_device_init__oss;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__oss;
-    pCallbacks->onDeviceStart             = ma_device_start__oss;
-    pCallbacks->onDeviceStop              = ma_device_stop__oss;
-    pCallbacks->onDeviceRead              = ma_device_read__oss;
-    pCallbacks->onDeviceWrite             = ma_device_write__oss;
-    pCallbacks->onDeviceDataLoop          = NULL;
-
-    return MA_SUCCESS;
-}
-#endif  /* OSS */
-
-
-
-
-
-/******************************************************************************
-
-AAudio Backend
-
-******************************************************************************/
-#ifdef MA_HAS_AAUDIO
-
-/*#include <AAudio/AAudio.h>*/
-
-typedef int32_t                                         ma_aaudio_result_t;
-typedef int32_t                                         ma_aaudio_direction_t;
-typedef int32_t                                         ma_aaudio_sharing_mode_t;
-typedef int32_t                                         ma_aaudio_format_t;
-typedef int32_t                                         ma_aaudio_stream_state_t;
-typedef int32_t                                         ma_aaudio_performance_mode_t;
-typedef int32_t                                         ma_aaudio_usage_t;
-typedef int32_t                                         ma_aaudio_content_type_t;
-typedef int32_t                                         ma_aaudio_input_preset_t;
-typedef int32_t                                         ma_aaudio_allowed_capture_policy_t;
-typedef int32_t                                         ma_aaudio_data_callback_result_t;
-typedef struct ma_AAudioStreamBuilder_t*                ma_AAudioStreamBuilder;
-typedef struct ma_AAudioStream_t*                       ma_AAudioStream;
-
-#define MA_AAUDIO_UNSPECIFIED                           0
-
-/* Result codes. miniaudio only cares about the success code. */
-#define MA_AAUDIO_OK                                    0
-
-/* Directions. */
-#define MA_AAUDIO_DIRECTION_OUTPUT                      0
-#define MA_AAUDIO_DIRECTION_INPUT                       1
-
-/* Sharing modes. */
-#define MA_AAUDIO_SHARING_MODE_EXCLUSIVE                0
-#define MA_AAUDIO_SHARING_MODE_SHARED                   1
-
-/* Formats. */
-#define MA_AAUDIO_FORMAT_PCM_I16                        1
-#define MA_AAUDIO_FORMAT_PCM_FLOAT                      2
-
-/* Stream states. */
-#define MA_AAUDIO_STREAM_STATE_UNINITIALIZED            0
-#define MA_AAUDIO_STREAM_STATE_UNKNOWN                  1
-#define MA_AAUDIO_STREAM_STATE_OPEN                     2
-#define MA_AAUDIO_STREAM_STATE_STARTING                 3
-#define MA_AAUDIO_STREAM_STATE_STARTED                  4
-#define MA_AAUDIO_STREAM_STATE_PAUSING                  5
-#define MA_AAUDIO_STREAM_STATE_PAUSED                   6
-#define MA_AAUDIO_STREAM_STATE_FLUSHING                 7
-#define MA_AAUDIO_STREAM_STATE_FLUSHED                  8
-#define MA_AAUDIO_STREAM_STATE_STOPPING                 9
-#define MA_AAUDIO_STREAM_STATE_STOPPED                  10
-#define MA_AAUDIO_STREAM_STATE_CLOSING                  11
-#define MA_AAUDIO_STREAM_STATE_CLOSED                   12
-#define MA_AAUDIO_STREAM_STATE_DISCONNECTED             13
-
-/* Performance modes. */
-#define MA_AAUDIO_PERFORMANCE_MODE_NONE                 10
-#define MA_AAUDIO_PERFORMANCE_MODE_POWER_SAVING         11
-#define MA_AAUDIO_PERFORMANCE_MODE_LOW_LATENCY          12
-
-/* Usage types. */
-#define MA_AAUDIO_USAGE_MEDIA                           1
-#define MA_AAUDIO_USAGE_VOICE_COMMUNICATION             2
-#define MA_AAUDIO_USAGE_VOICE_COMMUNICATION_SIGNALLING  3
-#define MA_AAUDIO_USAGE_ALARM                           4
-#define MA_AAUDIO_USAGE_NOTIFICATION                    5
-#define MA_AAUDIO_USAGE_NOTIFICATION_RINGTONE           6
-#define MA_AAUDIO_USAGE_NOTIFICATION_EVENT              10
-#define MA_AAUDIO_USAGE_ASSISTANCE_ACCESSIBILITY        11
-#define MA_AAUDIO_USAGE_ASSISTANCE_NAVIGATION_GUIDANCE  12
-#define MA_AAUDIO_USAGE_ASSISTANCE_SONIFICATION         13
-#define MA_AAUDIO_USAGE_GAME                            14
-#define MA_AAUDIO_USAGE_ASSISTANT                       16
-#define MA_AAUDIO_SYSTEM_USAGE_EMERGENCY                1000
-#define MA_AAUDIO_SYSTEM_USAGE_SAFETY                   1001
-#define MA_AAUDIO_SYSTEM_USAGE_VEHICLE_STATUS           1002
-#define MA_AAUDIO_SYSTEM_USAGE_ANNOUNCEMENT             1003
-
-/* Content types. */
-#define MA_AAUDIO_CONTENT_TYPE_SPEECH                   1
-#define MA_AAUDIO_CONTENT_TYPE_MUSIC                    2
-#define MA_AAUDIO_CONTENT_TYPE_MOVIE                    3
-#define MA_AAUDIO_CONTENT_TYPE_SONIFICATION             4
-
-/* Input presets. */
-#define MA_AAUDIO_INPUT_PRESET_GENERIC                  1
-#define MA_AAUDIO_INPUT_PRESET_CAMCORDER                5
-#define MA_AAUDIO_INPUT_PRESET_VOICE_RECOGNITION        6
-#define MA_AAUDIO_INPUT_PRESET_VOICE_COMMUNICATION      7
-#define MA_AAUDIO_INPUT_PRESET_UNPROCESSED              9
-#define MA_AAUDIO_INPUT_PRESET_VOICE_PERFORMANCE        10
-
-/* Allowed Capture Policies */
-#define MA_AAUDIO_ALLOW_CAPTURE_BY_ALL                  1
-#define MA_AAUDIO_ALLOW_CAPTURE_BY_SYSTEM               2
-#define MA_AAUDIO_ALLOW_CAPTURE_BY_NONE                 3
-
-/* Callback results. */
-#define MA_AAUDIO_CALLBACK_RESULT_CONTINUE              0
-#define MA_AAUDIO_CALLBACK_RESULT_STOP                  1
-
-
-typedef ma_aaudio_data_callback_result_t (* ma_AAudioStream_dataCallback) (ma_AAudioStream* pStream, void* pUserData, void* pAudioData, int32_t numFrames);
-typedef void                             (* ma_AAudioStream_errorCallback)(ma_AAudioStream *pStream, void *pUserData, ma_aaudio_result_t error);
-
-typedef ma_aaudio_result_t       (* MA_PFN_AAudio_createStreamBuilder)                   (ma_AAudioStreamBuilder** ppBuilder);
-typedef ma_aaudio_result_t       (* MA_PFN_AAudioStreamBuilder_delete)                   (ma_AAudioStreamBuilder* pBuilder);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setDeviceId)              (ma_AAudioStreamBuilder* pBuilder, int32_t deviceId);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setDirection)             (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_direction_t direction);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setSharingMode)           (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_sharing_mode_t sharingMode);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setFormat)                (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_format_t format);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setChannelCount)          (ma_AAudioStreamBuilder* pBuilder, int32_t channelCount);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setSampleRate)            (ma_AAudioStreamBuilder* pBuilder, int32_t sampleRate);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setBufferCapacityInFrames)(ma_AAudioStreamBuilder* pBuilder, int32_t numFrames);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setFramesPerDataCallback) (ma_AAudioStreamBuilder* pBuilder, int32_t numFrames);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setDataCallback)          (ma_AAudioStreamBuilder* pBuilder, ma_AAudioStream_dataCallback callback, void* pUserData);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setErrorCallback)         (ma_AAudioStreamBuilder* pBuilder, ma_AAudioStream_errorCallback callback, void* pUserData);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setPerformanceMode)       (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_performance_mode_t mode);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setUsage)                 (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_usage_t contentType);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setContentType)           (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_content_type_t contentType);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setInputPreset)           (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_input_preset_t inputPreset);
-typedef void                     (* MA_PFN_AAudioStreamBuilder_setAllowedCapturePolicy)  (ma_AAudioStreamBuilder* pBuilder, ma_aaudio_allowed_capture_policy_t policy);
-typedef ma_aaudio_result_t       (* MA_PFN_AAudioStreamBuilder_openStream)               (ma_AAudioStreamBuilder* pBuilder, ma_AAudioStream** ppStream);
-typedef ma_aaudio_result_t       (* MA_PFN_AAudioStream_close)                           (ma_AAudioStream* pStream);
-typedef ma_aaudio_stream_state_t (* MA_PFN_AAudioStream_getState)                        (ma_AAudioStream* pStream);
-typedef ma_aaudio_result_t       (* MA_PFN_AAudioStream_waitForStateChange)              (ma_AAudioStream* pStream, ma_aaudio_stream_state_t inputState, ma_aaudio_stream_state_t* pNextState, int64_t timeoutInNanoseconds);
-typedef ma_aaudio_format_t       (* MA_PFN_AAudioStream_getFormat)                       (ma_AAudioStream* pStream);
-typedef int32_t                  (* MA_PFN_AAudioStream_getChannelCount)                 (ma_AAudioStream* pStream);
-typedef int32_t                  (* MA_PFN_AAudioStream_getSampleRate)                   (ma_AAudioStream* pStream);
-typedef int32_t                  (* MA_PFN_AAudioStream_getBufferCapacityInFrames)       (ma_AAudioStream* pStream);
-typedef int32_t                  (* MA_PFN_AAudioStream_getFramesPerDataCallback)        (ma_AAudioStream* pStream);
-typedef int32_t                  (* MA_PFN_AAudioStream_getFramesPerBurst)               (ma_AAudioStream* pStream);
-typedef ma_aaudio_result_t       (* MA_PFN_AAudioStream_requestStart)                    (ma_AAudioStream* pStream);
-typedef ma_aaudio_result_t       (* MA_PFN_AAudioStream_requestStop)                     (ma_AAudioStream* pStream);
-
-static ma_result ma_result_from_aaudio(ma_aaudio_result_t resultAA)
-{
-    switch (resultAA)
-    {
-        case MA_AAUDIO_OK: return MA_SUCCESS;
-        default: break;
-    }
-
-    return MA_ERROR;
-}
-
-static ma_aaudio_usage_t ma_to_usage__aaudio(ma_aaudio_usage usage)
-{
-    switch (usage) {
-        case ma_aaudio_usage_media:                          return MA_AAUDIO_USAGE_MEDIA;
-        case ma_aaudio_usage_voice_communication:            return MA_AAUDIO_USAGE_VOICE_COMMUNICATION;
-        case ma_aaudio_usage_voice_communication_signalling: return MA_AAUDIO_USAGE_VOICE_COMMUNICATION_SIGNALLING;
-        case ma_aaudio_usage_alarm:                          return MA_AAUDIO_USAGE_ALARM;
-        case ma_aaudio_usage_notification:                   return MA_AAUDIO_USAGE_NOTIFICATION;
-        case ma_aaudio_usage_notification_ringtone:          return MA_AAUDIO_USAGE_NOTIFICATION_RINGTONE;
-        case ma_aaudio_usage_notification_event:             return MA_AAUDIO_USAGE_NOTIFICATION_EVENT;
-        case ma_aaudio_usage_assistance_accessibility:       return MA_AAUDIO_USAGE_ASSISTANCE_ACCESSIBILITY;
-        case ma_aaudio_usage_assistance_navigation_guidance: return MA_AAUDIO_USAGE_ASSISTANCE_NAVIGATION_GUIDANCE;
-        case ma_aaudio_usage_assistance_sonification:        return MA_AAUDIO_USAGE_ASSISTANCE_SONIFICATION;
-        case ma_aaudio_usage_game:                           return MA_AAUDIO_USAGE_GAME;
-        case ma_aaudio_usage_assitant:                       return MA_AAUDIO_USAGE_ASSISTANT;
-        case ma_aaudio_usage_emergency:                      return MA_AAUDIO_SYSTEM_USAGE_EMERGENCY;
-        case ma_aaudio_usage_safety:                         return MA_AAUDIO_SYSTEM_USAGE_SAFETY;
-        case ma_aaudio_usage_vehicle_status:                 return MA_AAUDIO_SYSTEM_USAGE_VEHICLE_STATUS;
-        case ma_aaudio_usage_announcement:                   return MA_AAUDIO_SYSTEM_USAGE_ANNOUNCEMENT;
-        default: break;
-    }
-
-    return MA_AAUDIO_USAGE_MEDIA;
-}
-
-static ma_aaudio_content_type_t ma_to_content_type__aaudio(ma_aaudio_content_type contentType)
-{
-    switch (contentType) {
-        case ma_aaudio_content_type_speech:       return MA_AAUDIO_CONTENT_TYPE_SPEECH;
-        case ma_aaudio_content_type_music:        return MA_AAUDIO_CONTENT_TYPE_MUSIC;
-        case ma_aaudio_content_type_movie:        return MA_AAUDIO_CONTENT_TYPE_MOVIE;
-        case ma_aaudio_content_type_sonification: return MA_AAUDIO_CONTENT_TYPE_SONIFICATION;
-        default: break;
-    }
-
-    return MA_AAUDIO_CONTENT_TYPE_SPEECH;
-}
-
-static ma_aaudio_input_preset_t ma_to_input_preset__aaudio(ma_aaudio_input_preset inputPreset)
-{
-    switch (inputPreset) {
-        case ma_aaudio_input_preset_generic:             return MA_AAUDIO_INPUT_PRESET_GENERIC;
-        case ma_aaudio_input_preset_camcorder:           return MA_AAUDIO_INPUT_PRESET_CAMCORDER;
-        case ma_aaudio_input_preset_voice_recognition:   return MA_AAUDIO_INPUT_PRESET_VOICE_RECOGNITION;
-        case ma_aaudio_input_preset_voice_communication: return MA_AAUDIO_INPUT_PRESET_VOICE_COMMUNICATION;
-        case ma_aaudio_input_preset_unprocessed:         return MA_AAUDIO_INPUT_PRESET_UNPROCESSED;
-        case ma_aaudio_input_preset_voice_performance:   return MA_AAUDIO_INPUT_PRESET_VOICE_PERFORMANCE;
-        default: break;
-    }
-
-    return MA_AAUDIO_INPUT_PRESET_GENERIC;
-}
-
-static ma_aaudio_allowed_capture_policy_t ma_to_allowed_capture_policy__aaudio(ma_aaudio_allowed_capture_policy allowedCapturePolicy)
-{
-    switch (allowedCapturePolicy) {
-        case ma_aaudio_allow_capture_by_all:    return MA_AAUDIO_ALLOW_CAPTURE_BY_ALL;
-        case ma_aaudio_allow_capture_by_system: return MA_AAUDIO_ALLOW_CAPTURE_BY_SYSTEM;
-        case ma_aaudio_allow_capture_by_none:   return MA_AAUDIO_ALLOW_CAPTURE_BY_NONE;
-        default: break;
-    }
-
-    return MA_AAUDIO_ALLOW_CAPTURE_BY_ALL;
-}
-
-static void ma_stream_error_callback__aaudio(ma_AAudioStream* pStream, void* pUserData, ma_aaudio_result_t error)
-{
-    ma_result result;
-    ma_job job;
-    ma_device* pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    (void)error;
-
-    ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[AAudio] ERROR CALLBACK: error=%d, AAudioStream_getState()=%d\n", error, ((MA_PFN_AAudioStream_getState)pDevice->pContext->aaudio.AAudioStream_getState)(pStream));
-
-    /*
-    When we get an error, we'll assume that the stream is in an erroneous state and needs to be restarted. From the documentation,
-    we cannot do this from the error callback. Therefore we are going to use an event thread for the AAudio backend to do this
-    cleanly and safely.
-    */
-    job = ma_job_init(MA_JOB_TYPE_DEVICE_AAUDIO_REROUTE);
-    job.data.device.aaudio.reroute.pDevice = pDevice;
-
-    if (pStream == pDevice->aaudio.pStreamCapture) {
-        job.data.device.aaudio.reroute.deviceType = ma_device_type_capture;
-    }
-    else {
-        job.data.device.aaudio.reroute.deviceType = ma_device_type_playback;
-    }
-
-    result = ma_device_job_thread_post(&pDevice->pContext->aaudio.jobThread, &job);
-    if (result != MA_SUCCESS) {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[AAudio] Device Disconnected. Failed to post job for rerouting.\n");
-        return;
-    }
-}
-
-static ma_aaudio_data_callback_result_t ma_stream_data_callback_capture__aaudio(ma_AAudioStream* pStream, void* pUserData, void* pAudioData, int32_t frameCount)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    ma_device_handle_backend_data_callback(pDevice, NULL, pAudioData, frameCount);
-
-    (void)pStream;
-    return MA_AAUDIO_CALLBACK_RESULT_CONTINUE;
-}
-
-static ma_aaudio_data_callback_result_t ma_stream_data_callback_playback__aaudio(ma_AAudioStream* pStream, void* pUserData, void* pAudioData, int32_t frameCount)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    MA_ASSERT(pDevice != NULL);
-
-    ma_device_handle_backend_data_callback(pDevice, pAudioData, NULL, frameCount);
-
-    (void)pStream;
-    return MA_AAUDIO_CALLBACK_RESULT_CONTINUE;
-}
-
-static ma_result ma_create_and_configure_AAudioStreamBuilder__aaudio(ma_context* pContext, const ma_device_id* pDeviceID, ma_device_type deviceType, ma_share_mode shareMode, const ma_device_descriptor* pDescriptor, const ma_device_config* pConfig, ma_device* pDevice, ma_AAudioStreamBuilder** ppBuilder)
-{
-    ma_AAudioStreamBuilder* pBuilder;
-    ma_aaudio_result_t resultAA;
-
-    /* Safety. */
-    *ppBuilder = NULL;
-
-    resultAA = ((MA_PFN_AAudio_createStreamBuilder)pContext->aaudio.AAudio_createStreamBuilder)(&pBuilder);
-    if (resultAA != MA_AAUDIO_OK) {
-        return ma_result_from_aaudio(resultAA);
-    }
-
-    if (pDeviceID != NULL) {
-        ((MA_PFN_AAudioStreamBuilder_setDeviceId)pContext->aaudio.AAudioStreamBuilder_setDeviceId)(pBuilder, pDeviceID->aaudio);
-    }
-
-    ((MA_PFN_AAudioStreamBuilder_setDirection)pContext->aaudio.AAudioStreamBuilder_setDirection)(pBuilder, (deviceType == ma_device_type_playback) ? MA_AAUDIO_DIRECTION_OUTPUT : MA_AAUDIO_DIRECTION_INPUT);
-    ((MA_PFN_AAudioStreamBuilder_setSharingMode)pContext->aaudio.AAudioStreamBuilder_setSharingMode)(pBuilder, (shareMode == ma_share_mode_shared) ? MA_AAUDIO_SHARING_MODE_SHARED : MA_AAUDIO_SHARING_MODE_EXCLUSIVE);
-
-
-    /* If we have a device descriptor make sure we configure the stream builder to take our requested parameters. */
-    if (pDescriptor != NULL) {
-        MA_ASSERT(pConfig != NULL); /* We must have a device config if we also have a descriptor. The config is required for AAudio specific configuration options. */
-
-        if (pDescriptor->sampleRate != 0) {
-            ((MA_PFN_AAudioStreamBuilder_setSampleRate)pContext->aaudio.AAudioStreamBuilder_setSampleRate)(pBuilder, pDescriptor->sampleRate);
-        }
-
-        if (deviceType == ma_device_type_capture) {
-            if (pDescriptor->channels != 0) {
-                ((MA_PFN_AAudioStreamBuilder_setChannelCount)pContext->aaudio.AAudioStreamBuilder_setChannelCount)(pBuilder, pDescriptor->channels);
-            }
-            if (pDescriptor->format != ma_format_unknown) {
-                ((MA_PFN_AAudioStreamBuilder_setFormat)pContext->aaudio.AAudioStreamBuilder_setFormat)(pBuilder, (pDescriptor->format == ma_format_s16) ? MA_AAUDIO_FORMAT_PCM_I16 : MA_AAUDIO_FORMAT_PCM_FLOAT);
-            }
-        } else {
-            if (pDescriptor->channels != 0) {
-                ((MA_PFN_AAudioStreamBuilder_setChannelCount)pContext->aaudio.AAudioStreamBuilder_setChannelCount)(pBuilder, pDescriptor->channels);
-            }
-            if (pDescriptor->format != ma_format_unknown) {
-                ((MA_PFN_AAudioStreamBuilder_setFormat)pContext->aaudio.AAudioStreamBuilder_setFormat)(pBuilder, (pDescriptor->format == ma_format_s16) ? MA_AAUDIO_FORMAT_PCM_I16 : MA_AAUDIO_FORMAT_PCM_FLOAT);
-            }
-        }
-
-
-        /*
-        There have been reports where setting the frames per data callback results in an error
-        later on from Android. To address this, I'm experimenting with simply not setting it on
-        anything from Android 11 and earlier. Suggestions welcome on how we might be able to make
-        this more targetted.
-        */
-        if (!pConfig->aaudio.enableCompatibilityWorkarounds || ma_android_sdk_version() > 30) {
-            /*
-            AAudio is annoying when it comes to it's buffer calculation stuff because it doesn't let you
-            retrieve the actual sample rate until after you've opened the stream. But you need to configure
-            the buffer capacity before you open the stream... :/
-
-            To solve, we're just going to assume MA_DEFAULT_SAMPLE_RATE (48000) and move on.
-            */
-            ma_uint32 bufferCapacityInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptor, pDescriptor->sampleRate, pConfig->performanceProfile) * pDescriptor->periodCount;
-
-            ((MA_PFN_AAudioStreamBuilder_setBufferCapacityInFrames)pContext->aaudio.AAudioStreamBuilder_setBufferCapacityInFrames)(pBuilder, bufferCapacityInFrames);
-            ((MA_PFN_AAudioStreamBuilder_setFramesPerDataCallback)pContext->aaudio.AAudioStreamBuilder_setFramesPerDataCallback)(pBuilder, bufferCapacityInFrames / pDescriptor->periodCount);
-        }
-
-        if (deviceType == ma_device_type_capture) {
-            if (pConfig->aaudio.inputPreset != ma_aaudio_input_preset_default && pContext->aaudio.AAudioStreamBuilder_setInputPreset != NULL) {
-                ((MA_PFN_AAudioStreamBuilder_setInputPreset)pContext->aaudio.AAudioStreamBuilder_setInputPreset)(pBuilder, ma_to_input_preset__aaudio(pConfig->aaudio.inputPreset));
-            }
-
-            ((MA_PFN_AAudioStreamBuilder_setDataCallback)pContext->aaudio.AAudioStreamBuilder_setDataCallback)(pBuilder, ma_stream_data_callback_capture__aaudio, (void*)pDevice);
-        } else {
-            if (pConfig->aaudio.usage != ma_aaudio_usage_default && pContext->aaudio.AAudioStreamBuilder_setUsage != NULL) {
-                ((MA_PFN_AAudioStreamBuilder_setUsage)pContext->aaudio.AAudioStreamBuilder_setUsage)(pBuilder, ma_to_usage__aaudio(pConfig->aaudio.usage));
-            }
-
-            if (pConfig->aaudio.contentType != ma_aaudio_content_type_default && pContext->aaudio.AAudioStreamBuilder_setContentType != NULL) {
-                ((MA_PFN_AAudioStreamBuilder_setContentType)pContext->aaudio.AAudioStreamBuilder_setContentType)(pBuilder, ma_to_content_type__aaudio(pConfig->aaudio.contentType));
-            }
-
-            if (pConfig->aaudio.allowedCapturePolicy != ma_aaudio_allow_capture_default && pContext->aaudio.AAudioStreamBuilder_setAllowedCapturePolicy != NULL) {
-                ((MA_PFN_AAudioStreamBuilder_setAllowedCapturePolicy)pContext->aaudio.AAudioStreamBuilder_setAllowedCapturePolicy)(pBuilder, ma_to_allowed_capture_policy__aaudio(pConfig->aaudio.allowedCapturePolicy));
-            }
-
-            ((MA_PFN_AAudioStreamBuilder_setDataCallback)pContext->aaudio.AAudioStreamBuilder_setDataCallback)(pBuilder, ma_stream_data_callback_playback__aaudio, (void*)pDevice);
-        }
-
-        /* Not sure how this affects things, but since there's a mapping between miniaudio's performance profiles and AAudio's performance modes, let go ahead and set it. */
-        ((MA_PFN_AAudioStreamBuilder_setPerformanceMode)pContext->aaudio.AAudioStreamBuilder_setPerformanceMode)(pBuilder, (pConfig->performanceProfile == ma_performance_profile_low_latency) ? MA_AAUDIO_PERFORMANCE_MODE_LOW_LATENCY : MA_AAUDIO_PERFORMANCE_MODE_NONE);
-
-        /* We need to set an error callback to detect device changes. */
-        if (pDevice != NULL) {  /* <-- pDevice should never be null if pDescriptor is not null, which is always the case if we hit this branch. Check anyway for safety. */
-            ((MA_PFN_AAudioStreamBuilder_setErrorCallback)pContext->aaudio.AAudioStreamBuilder_setErrorCallback)(pBuilder, ma_stream_error_callback__aaudio, (void*)pDevice);
-        }
-    }
-
-    *ppBuilder = pBuilder;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_open_stream_and_close_builder__aaudio(ma_context* pContext, ma_AAudioStreamBuilder* pBuilder, ma_AAudioStream** ppStream)
-{
-    ma_result result;
-
-    result = ma_result_from_aaudio(((MA_PFN_AAudioStreamBuilder_openStream)pContext->aaudio.AAudioStreamBuilder_openStream)(pBuilder, ppStream));
-    ((MA_PFN_AAudioStreamBuilder_delete)pContext->aaudio.AAudioStreamBuilder_delete)(pBuilder);
-
-    return result;
-}
-
-static ma_result ma_open_stream_basic__aaudio(ma_context* pContext, const ma_device_id* pDeviceID, ma_device_type deviceType, ma_share_mode shareMode, ma_AAudioStream** ppStream)
-{
-    ma_result result;
-    ma_AAudioStreamBuilder* pBuilder;
-
-    *ppStream = NULL;
-
-    result = ma_create_and_configure_AAudioStreamBuilder__aaudio(pContext, pDeviceID, deviceType, shareMode, NULL, NULL, NULL, &pBuilder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_open_stream_and_close_builder__aaudio(pContext, pBuilder, ppStream);
-}
-
-static ma_result ma_open_stream__aaudio(ma_device* pDevice, const ma_device_config* pConfig, ma_device_type deviceType, const ma_device_descriptor* pDescriptor, ma_AAudioStream** ppStream)
-{
-    ma_result result;
-    ma_AAudioStreamBuilder* pBuilder;
-
-    MA_ASSERT(pDevice != NULL);
-    MA_ASSERT(pDescriptor != NULL);
-    MA_ASSERT(deviceType != ma_device_type_duplex);   /* This function should not be called for a full-duplex device type. */
-
-    *ppStream = NULL;
-
-    result = ma_create_and_configure_AAudioStreamBuilder__aaudio(pDevice->pContext, pDescriptor->pDeviceID, deviceType, pDescriptor->shareMode, pDescriptor, pConfig, pDevice, &pBuilder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_open_stream_and_close_builder__aaudio(pDevice->pContext, pBuilder, ppStream);
-}
-
-static ma_result ma_close_stream__aaudio(ma_context* pContext, ma_AAudioStream* pStream)
-{
-    return ma_result_from_aaudio(((MA_PFN_AAudioStream_close)pContext->aaudio.AAudioStream_close)(pStream));
-}
-
-static ma_bool32 ma_has_default_device__aaudio(ma_context* pContext, ma_device_type deviceType)
-{
-    /* The only way to know this is to try creating a stream. */
-    ma_AAudioStream* pStream;
-    ma_result result = ma_open_stream_basic__aaudio(pContext, NULL, deviceType, ma_share_mode_shared, &pStream);
-    if (result != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-
-    ma_close_stream__aaudio(pContext, pStream);
-    return MA_TRUE;
-}
-
-static ma_result ma_wait_for_simple_state_transition__aaudio(ma_context* pContext, ma_AAudioStream* pStream, ma_aaudio_stream_state_t oldState, ma_aaudio_stream_state_t newState)
-{
-    ma_aaudio_stream_state_t actualNewState;
-    ma_aaudio_result_t resultAA = ((MA_PFN_AAudioStream_waitForStateChange)pContext->aaudio.AAudioStream_waitForStateChange)(pStream, oldState, &actualNewState, 5000000000); /* 5 second timeout. */
-    if (resultAA != MA_AAUDIO_OK) {
-        return ma_result_from_aaudio(resultAA);
-    }
-
-    if (newState != actualNewState) {
-        return MA_ERROR;   /* Failed to transition into the expected state. */
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_enumerate_devices__aaudio(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_bool32 cbResult = MA_TRUE;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /* Unfortunately AAudio does not have an enumeration API. Therefore I'm only going to report default devices, but only if it can instantiate a stream. */
-
-    /* Playback. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        deviceInfo.id.aaudio = MA_AAUDIO_UNSPECIFIED;
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-
-        if (ma_has_default_device__aaudio(pContext, ma_device_type_playback)) {
-            cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-        }
-    }
-
-    /* Capture. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        deviceInfo.id.aaudio = MA_AAUDIO_UNSPECIFIED;
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-
-        if (ma_has_default_device__aaudio(pContext, ma_device_type_capture)) {
-            cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_context_add_native_data_format_from_AAudioStream_ex__aaudio(ma_context* pContext, ma_AAudioStream* pStream, ma_format format, ma_uint32 flags, ma_device_info* pDeviceInfo)
-{
-    MA_ASSERT(pContext    != NULL);
-    MA_ASSERT(pStream     != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].format     = format;
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].channels   = ((MA_PFN_AAudioStream_getChannelCount)pContext->aaudio.AAudioStream_getChannelCount)(pStream);
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].sampleRate = ((MA_PFN_AAudioStream_getSampleRate)pContext->aaudio.AAudioStream_getSampleRate)(pStream);
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].flags      = flags;
-    pDeviceInfo->nativeDataFormatCount += 1;
-}
-
-static void ma_context_add_native_data_format_from_AAudioStream__aaudio(ma_context* pContext, ma_AAudioStream* pStream, ma_uint32 flags, ma_device_info* pDeviceInfo)
-{
-    /* AAudio supports s16 and f32. */
-    ma_context_add_native_data_format_from_AAudioStream_ex__aaudio(pContext, pStream, ma_format_f32, flags, pDeviceInfo);
-    ma_context_add_native_data_format_from_AAudioStream_ex__aaudio(pContext, pStream, ma_format_s16, flags, pDeviceInfo);
-}
-
-static ma_result ma_context_get_device_info__aaudio(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_AAudioStream* pStream;
-    ma_result result;
-
-    MA_ASSERT(pContext != NULL);
-
-    /* ID */
-    if (pDeviceID != NULL) {
-        pDeviceInfo->id.aaudio = pDeviceID->aaudio;
-    } else {
-        pDeviceInfo->id.aaudio = MA_AAUDIO_UNSPECIFIED;
-    }
-
-    /* Name */
-    if (deviceType == ma_device_type_playback) {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-    } else {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-    }
-
-
-    pDeviceInfo->nativeDataFormatCount = 0;
-
-    /* We'll need to open the device to get accurate sample rate and channel count information. */
-    result = ma_open_stream_basic__aaudio(pContext, pDeviceID, deviceType, ma_share_mode_shared, &pStream);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    ma_context_add_native_data_format_from_AAudioStream__aaudio(pContext, pStream, 0, pDeviceInfo);
-
-    ma_close_stream__aaudio(pContext, pStream);
-    pStream = NULL;
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_device_uninit__aaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ma_close_stream__aaudio(pDevice->pContext, (ma_AAudioStream*)pDevice->aaudio.pStreamCapture);
-        pDevice->aaudio.pStreamCapture = NULL;
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_close_stream__aaudio(pDevice->pContext, (ma_AAudioStream*)pDevice->aaudio.pStreamPlayback);
-        pDevice->aaudio.pStreamPlayback = NULL;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init_by_type__aaudio(ma_device* pDevice, const ma_device_config* pConfig, ma_device_type deviceType, ma_device_descriptor* pDescriptor, ma_AAudioStream** ppStream)
-{
-    ma_result result;
-    int32_t bufferCapacityInFrames;
-    int32_t framesPerDataCallback;
-    ma_AAudioStream* pStream;
-
-    MA_ASSERT(pDevice     != NULL);
-    MA_ASSERT(pConfig     != NULL);
-    MA_ASSERT(pDescriptor != NULL);
-
-    *ppStream = NULL;   /* Safety. */
-
-    /* First step is to open the stream. From there we'll be able to extract the internal configuration. */
-    result = ma_open_stream__aaudio(pDevice, pConfig, deviceType, pDescriptor, &pStream);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to open the AAudio stream. */
-    }
-
-    /* Now extract the internal configuration. */
-    pDescriptor->format     = (((MA_PFN_AAudioStream_getFormat)pDevice->pContext->aaudio.AAudioStream_getFormat)(pStream) == MA_AAUDIO_FORMAT_PCM_I16) ? ma_format_s16 : ma_format_f32;
-    pDescriptor->channels   = ((MA_PFN_AAudioStream_getChannelCount)pDevice->pContext->aaudio.AAudioStream_getChannelCount)(pStream);
-    pDescriptor->sampleRate = ((MA_PFN_AAudioStream_getSampleRate)pDevice->pContext->aaudio.AAudioStream_getSampleRate)(pStream);
-
-    /* For the channel map we need to be sure we don't overflow any buffers. */
-    if (pDescriptor->channels <= MA_MAX_CHANNELS) {
-        ma_channel_map_init_standard(ma_standard_channel_map_default, pDescriptor->channelMap, ma_countof(pDescriptor->channelMap), pDescriptor->channels); /* <-- Cannot find info on channel order, so assuming a default. */
-    } else {
-        ma_channel_map_init_blank(pDescriptor->channelMap, MA_MAX_CHANNELS); /* Too many channels. Use a blank channel map. */
-    }
-
-    bufferCapacityInFrames = ((MA_PFN_AAudioStream_getBufferCapacityInFrames)pDevice->pContext->aaudio.AAudioStream_getBufferCapacityInFrames)(pStream);
-    framesPerDataCallback = ((MA_PFN_AAudioStream_getFramesPerDataCallback)pDevice->pContext->aaudio.AAudioStream_getFramesPerDataCallback)(pStream);
-
-    if (framesPerDataCallback > 0) {
-        pDescriptor->periodSizeInFrames = framesPerDataCallback;
-        pDescriptor->periodCount        = bufferCapacityInFrames / framesPerDataCallback;
-    } else {
-        pDescriptor->periodSizeInFrames = bufferCapacityInFrames;
-        pDescriptor->periodCount        = 1;
-    }
-
-    *ppStream = pStream;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__aaudio(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    pDevice->aaudio.usage                   = pConfig->aaudio.usage;
-    pDevice->aaudio.contentType             = pConfig->aaudio.contentType;
-    pDevice->aaudio.inputPreset             = pConfig->aaudio.inputPreset;
-    pDevice->aaudio.allowedCapturePolicy    = pConfig->aaudio.allowedCapturePolicy;
-    pDevice->aaudio.noAutoStartAfterReroute = pConfig->aaudio.noAutoStartAfterReroute;
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        result = ma_device_init_by_type__aaudio(pDevice, pConfig, ma_device_type_capture, pDescriptorCapture, (ma_AAudioStream**)&pDevice->aaudio.pStreamCapture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        result = ma_device_init_by_type__aaudio(pDevice, pConfig, ma_device_type_playback, pDescriptorPlayback, (ma_AAudioStream**)&pDevice->aaudio.pStreamPlayback);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start_stream__aaudio(ma_device* pDevice, ma_AAudioStream* pStream)
-{
-    ma_aaudio_result_t resultAA;
-    ma_aaudio_stream_state_t currentState;
-
-    MA_ASSERT(pDevice != NULL);
-
-    resultAA = ((MA_PFN_AAudioStream_requestStart)pDevice->pContext->aaudio.AAudioStream_requestStart)(pStream);
-    if (resultAA != MA_AAUDIO_OK) {
-        return ma_result_from_aaudio(resultAA);
-    }
-
-    /* Do we actually need to wait for the device to transition into it's started state? */
-
-    /* The device should be in either a starting or started state. If it's not set to started we need to wait for it to transition. It should go from starting to started. */
-    currentState = ((MA_PFN_AAudioStream_getState)pDevice->pContext->aaudio.AAudioStream_getState)(pStream);
-    if (currentState != MA_AAUDIO_STREAM_STATE_STARTED) {
-        ma_result result;
-
-        if (currentState != MA_AAUDIO_STREAM_STATE_STARTING) {
-            return MA_ERROR;   /* Expecting the stream to be a starting or started state. */
-        }
-
-        result = ma_wait_for_simple_state_transition__aaudio(pDevice->pContext, pStream, currentState, MA_AAUDIO_STREAM_STATE_STARTED);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop_stream__aaudio(ma_device* pDevice, ma_AAudioStream* pStream)
-{
-    ma_aaudio_result_t resultAA;
-    ma_aaudio_stream_state_t currentState;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /*
-    From the AAudio documentation:
-
-        The stream will stop after all of the data currently buffered has been played.
-
-    This maps with miniaudio's requirement that device's be drained which means we don't need to implement any draining logic.
-    */
-    currentState = ((MA_PFN_AAudioStream_getState)pDevice->pContext->aaudio.AAudioStream_getState)(pStream);
-    if (currentState == MA_AAUDIO_STREAM_STATE_DISCONNECTED) {
-        return MA_SUCCESS;  /* The device is disconnected. Don't try stopping it. */
-    }
-
-    resultAA = ((MA_PFN_AAudioStream_requestStop)pDevice->pContext->aaudio.AAudioStream_requestStop)(pStream);
-    if (resultAA != MA_AAUDIO_OK) {
-        return ma_result_from_aaudio(resultAA);
-    }
-
-    /* The device should be in either a stopping or stopped state. If it's not set to started we need to wait for it to transition. It should go from stopping to stopped. */
-    currentState = ((MA_PFN_AAudioStream_getState)pDevice->pContext->aaudio.AAudioStream_getState)(pStream);
-    if (currentState != MA_AAUDIO_STREAM_STATE_STOPPED) {
-        ma_result result;
-
-        if (currentState != MA_AAUDIO_STREAM_STATE_STOPPING) {
-            return MA_ERROR;   /* Expecting the stream to be a stopping or stopped state. */
-        }
-
-        result = ma_wait_for_simple_state_transition__aaudio(pDevice->pContext, pStream, currentState, MA_AAUDIO_STREAM_STATE_STOPPED);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_start__aaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ma_result result = ma_device_start_stream__aaudio(pDevice, (ma_AAudioStream*)pDevice->aaudio.pStreamCapture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_result result = ma_device_start_stream__aaudio(pDevice, (ma_AAudioStream*)pDevice->aaudio.pStreamPlayback);
-        if (result != MA_SUCCESS) {
-            if (pDevice->type == ma_device_type_duplex) {
-                ma_device_stop_stream__aaudio(pDevice, (ma_AAudioStream*)pDevice->aaudio.pStreamCapture);
-            }
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__aaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ma_result result = ma_device_stop_stream__aaudio(pDevice, (ma_AAudioStream*)pDevice->aaudio.pStreamCapture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_result result = ma_device_stop_stream__aaudio(pDevice, (ma_AAudioStream*)pDevice->aaudio.pStreamPlayback);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    ma_device__on_notification_stopped(pDevice);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_reinit__aaudio(ma_device* pDevice, ma_device_type deviceType)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    /* The first thing to do is close the streams. */
-    if (deviceType == ma_device_type_capture || deviceType == ma_device_type_duplex) {
-        ma_close_stream__aaudio(pDevice->pContext, (ma_AAudioStream*)pDevice->aaudio.pStreamCapture);
-        pDevice->aaudio.pStreamCapture = NULL;
-    }
-
-    if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-        ma_close_stream__aaudio(pDevice->pContext, (ma_AAudioStream*)pDevice->aaudio.pStreamPlayback);
-        pDevice->aaudio.pStreamPlayback = NULL;
-    }
-
-    /* Now we need to reinitialize each streams. The hardest part with this is just filling output the config and descriptors. */
-    {
-        ma_device_config deviceConfig;
-        ma_device_descriptor descriptorPlayback;
-        ma_device_descriptor descriptorCapture;
-
-        deviceConfig = ma_device_config_init(deviceType);
-        deviceConfig.playback.pDeviceID             = NULL; /* Only doing rerouting with default devices. */
-        deviceConfig.playback.shareMode             = pDevice->playback.shareMode;
-        deviceConfig.playback.format                = pDevice->playback.format;
-        deviceConfig.playback.channels              = pDevice->playback.channels;
-        deviceConfig.capture.pDeviceID              = NULL; /* Only doing rerouting with default devices. */
-        deviceConfig.capture.shareMode              = pDevice->capture.shareMode;
-        deviceConfig.capture.format                 = pDevice->capture.format;
-        deviceConfig.capture.channels               = pDevice->capture.channels;
-        deviceConfig.sampleRate                     = pDevice->sampleRate;
-        deviceConfig.aaudio.usage                   = pDevice->aaudio.usage;
-        deviceConfig.aaudio.contentType             = pDevice->aaudio.contentType;
-        deviceConfig.aaudio.inputPreset             = pDevice->aaudio.inputPreset;
-        deviceConfig.aaudio.allowedCapturePolicy    = pDevice->aaudio.allowedCapturePolicy;
-        deviceConfig.aaudio.noAutoStartAfterReroute = pDevice->aaudio.noAutoStartAfterReroute;
-        deviceConfig.periods                        = 1;
-
-        /* Try to get an accurate period size. */
-        if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-            deviceConfig.periodSizeInFrames = pDevice->playback.internalPeriodSizeInFrames;
-        } else {
-            deviceConfig.periodSizeInFrames = pDevice->capture.internalPeriodSizeInFrames;
-        }
-
-        if (deviceType == ma_device_type_capture || deviceType == ma_device_type_duplex || deviceType == ma_device_type_loopback) {
-            descriptorCapture.pDeviceID           = deviceConfig.capture.pDeviceID;
-            descriptorCapture.shareMode           = deviceConfig.capture.shareMode;
-            descriptorCapture.format              = deviceConfig.capture.format;
-            descriptorCapture.channels            = deviceConfig.capture.channels;
-            descriptorCapture.sampleRate          = deviceConfig.sampleRate;
-            descriptorCapture.periodSizeInFrames  = deviceConfig.periodSizeInFrames;
-            descriptorCapture.periodCount         = deviceConfig.periods;
-        }
-
-        if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-            descriptorPlayback.pDeviceID          = deviceConfig.playback.pDeviceID;
-            descriptorPlayback.shareMode          = deviceConfig.playback.shareMode;
-            descriptorPlayback.format             = deviceConfig.playback.format;
-            descriptorPlayback.channels           = deviceConfig.playback.channels;
-            descriptorPlayback.sampleRate         = deviceConfig.sampleRate;
-            descriptorPlayback.periodSizeInFrames = deviceConfig.periodSizeInFrames;
-            descriptorPlayback.periodCount        = deviceConfig.periods;
-        }
-
-        result = ma_device_init__aaudio(pDevice, &deviceConfig, &descriptorPlayback, &descriptorCapture);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_device_post_init(pDevice, deviceType, &descriptorPlayback, &descriptorCapture);
-        if (result != MA_SUCCESS) {
-            ma_device_uninit__aaudio(pDevice);
-            return result;
-        }
-
-        /* We'll only ever do this in response to a reroute. */
-        ma_device__on_notification_rerouted(pDevice);
-
-        /* If the device is started, start the streams. Maybe make this configurable? */
-        if (ma_device_get_state(pDevice) == ma_device_state_started) {
-            if (pDevice->aaudio.noAutoStartAfterReroute == MA_FALSE) {
-                ma_device_start__aaudio(pDevice);
-            } else {
-                ma_device_stop(pDevice);    /* Do a full device stop so we set internal state correctly. */
-            }
-        }
-
-        return MA_SUCCESS;
-    }
-}
-
-static ma_result ma_device_get_info__aaudio(ma_device* pDevice, ma_device_type type, ma_device_info* pDeviceInfo)
-{
-    ma_AAudioStream* pStream = NULL;
-
-    MA_ASSERT(pDevice     != NULL);
-    MA_ASSERT(type        != ma_device_type_duplex);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    if (type == ma_device_type_playback) {
-        pStream = (ma_AAudioStream*)pDevice->aaudio.pStreamCapture;
-        pDeviceInfo->id.aaudio = pDevice->capture.id.aaudio;
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);     /* Only supporting default devices. */
-    }
-    if (type == ma_device_type_capture) {
-        pStream = (ma_AAudioStream*)pDevice->aaudio.pStreamPlayback;
-        pDeviceInfo->id.aaudio = pDevice->playback.id.aaudio;
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);    /* Only supporting default devices. */
-    }
-
-    /* Safety. Should never happen. */
-    if (pStream == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    pDeviceInfo->nativeDataFormatCount = 0;
-    ma_context_add_native_data_format_from_AAudioStream__aaudio(pDevice->pContext, pStream, 0, pDeviceInfo);
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_uninit__aaudio(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_aaudio);
-
-    ma_device_job_thread_uninit(&pContext->aaudio.jobThread, &pContext->allocationCallbacks);
-
-    ma_dlclose(ma_context_get_log(pContext), pContext->aaudio.hAAudio);
-    pContext->aaudio.hAAudio = NULL;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__aaudio(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    size_t i;
-    const char* libNames[] = {
-        "libaaudio.so"
-    };
-
-    for (i = 0; i < ma_countof(libNames); ++i) {
-        pContext->aaudio.hAAudio = ma_dlopen(ma_context_get_log(pContext), libNames[i]);
-        if (pContext->aaudio.hAAudio != NULL) {
-            break;
-        }
-    }
-
-    if (pContext->aaudio.hAAudio == NULL) {
-        return MA_FAILED_TO_INIT_BACKEND;
-    }
-
-    pContext->aaudio.AAudio_createStreamBuilder                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudio_createStreamBuilder");
-    pContext->aaudio.AAudioStreamBuilder_delete                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_delete");
-    pContext->aaudio.AAudioStreamBuilder_setDeviceId               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setDeviceId");
-    pContext->aaudio.AAudioStreamBuilder_setDirection              = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setDirection");
-    pContext->aaudio.AAudioStreamBuilder_setSharingMode            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setSharingMode");
-    pContext->aaudio.AAudioStreamBuilder_setFormat                 = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setFormat");
-    pContext->aaudio.AAudioStreamBuilder_setChannelCount           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setChannelCount");
-    pContext->aaudio.AAudioStreamBuilder_setSampleRate             = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setSampleRate");
-    pContext->aaudio.AAudioStreamBuilder_setBufferCapacityInFrames = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setBufferCapacityInFrames");
-    pContext->aaudio.AAudioStreamBuilder_setFramesPerDataCallback  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setFramesPerDataCallback");
-    pContext->aaudio.AAudioStreamBuilder_setDataCallback           = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setDataCallback");
-    pContext->aaudio.AAudioStreamBuilder_setErrorCallback          = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setErrorCallback");
-    pContext->aaudio.AAudioStreamBuilder_setPerformanceMode        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setPerformanceMode");
-    pContext->aaudio.AAudioStreamBuilder_setUsage                  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setUsage");
-    pContext->aaudio.AAudioStreamBuilder_setContentType            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setContentType");
-    pContext->aaudio.AAudioStreamBuilder_setInputPreset            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setInputPreset");
-    pContext->aaudio.AAudioStreamBuilder_setAllowedCapturePolicy   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_setAllowedCapturePolicy");
-    pContext->aaudio.AAudioStreamBuilder_openStream                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStreamBuilder_openStream");
-    pContext->aaudio.AAudioStream_close                            = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_close");
-    pContext->aaudio.AAudioStream_getState                         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_getState");
-    pContext->aaudio.AAudioStream_waitForStateChange               = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_waitForStateChange");
-    pContext->aaudio.AAudioStream_getFormat                        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_getFormat");
-    pContext->aaudio.AAudioStream_getChannelCount                  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_getChannelCount");
-    pContext->aaudio.AAudioStream_getSampleRate                    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_getSampleRate");
-    pContext->aaudio.AAudioStream_getBufferCapacityInFrames        = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_getBufferCapacityInFrames");
-    pContext->aaudio.AAudioStream_getFramesPerDataCallback         = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_getFramesPerDataCallback");
-    pContext->aaudio.AAudioStream_getFramesPerBurst                = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_getFramesPerBurst");
-    pContext->aaudio.AAudioStream_requestStart                     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_requestStart");
-    pContext->aaudio.AAudioStream_requestStop                      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->aaudio.hAAudio, "AAudioStream_requestStop");
-
-
-    pCallbacks->onContextInit             = ma_context_init__aaudio;
-    pCallbacks->onContextUninit           = ma_context_uninit__aaudio;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__aaudio;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__aaudio;
-    pCallbacks->onDeviceInit              = ma_device_init__aaudio;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__aaudio;
-    pCallbacks->onDeviceStart             = ma_device_start__aaudio;
-    pCallbacks->onDeviceStop              = ma_device_stop__aaudio;
-    pCallbacks->onDeviceRead              = NULL;   /* Not used because AAudio is asynchronous. */
-    pCallbacks->onDeviceWrite             = NULL;   /* Not used because AAudio is asynchronous. */
-    pCallbacks->onDeviceDataLoop          = NULL;   /* Not used because AAudio is asynchronous. */
-    pCallbacks->onDeviceGetInfo           = ma_device_get_info__aaudio;
-
-
-    /* We need a job thread so we can deal with rerouting. */
-    {
-        ma_result result;
-        ma_device_job_thread_config jobThreadConfig;
-
-        jobThreadConfig = ma_device_job_thread_config_init();
-
-        result = ma_device_job_thread_init(&jobThreadConfig, &pContext->allocationCallbacks, &pContext->aaudio.jobThread);
-        if (result != MA_SUCCESS) {
-            ma_dlclose(ma_context_get_log(pContext), pContext->aaudio.hAAudio);
-            pContext->aaudio.hAAudio = NULL;
-            return result;
-        }
-    }
-
-
-    (void)pConfig;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_job_process__device__aaudio_reroute(ma_job* pJob)
-{
-    ma_device* pDevice;
-
-    MA_ASSERT(pJob != NULL);
-
-    pDevice = (ma_device*)pJob->data.device.aaudio.reroute.pDevice;
-    MA_ASSERT(pDevice != NULL);
-
-    /* Here is where we need to reroute the device. To do this we need to uninitialize the stream and reinitialize it. */
-    return ma_device_reinit__aaudio(pDevice, (ma_device_type)pJob->data.device.aaudio.reroute.deviceType);
-}
-#else
-/* Getting here means there is no AAudio backend so we need a no-op job implementation. */
-static ma_result ma_job_process__device__aaudio_reroute(ma_job* pJob)
-{
-    return ma_job_process__noop(pJob);
-}
-#endif  /* AAudio */
-
-
-/******************************************************************************
-
-OpenSL|ES Backend
-
-******************************************************************************/
-#ifdef MA_HAS_OPENSL
-#include <SLES/OpenSLES.h>
-#ifdef MA_ANDROID
-#include <SLES/OpenSLES_Android.h>
-#endif
-
-typedef SLresult (SLAPIENTRY * ma_slCreateEngine_proc)(SLObjectItf* pEngine, SLuint32 numOptions, SLEngineOption* pEngineOptions, SLuint32 numInterfaces, SLInterfaceID* pInterfaceIds, SLboolean* pInterfaceRequired);
-
-/* OpenSL|ES has one-per-application objects :( */
-static SLObjectItf g_maEngineObjectSL    = NULL;
-static SLEngineItf g_maEngineSL          = NULL;
-static ma_uint32   g_maOpenSLInitCounter = 0;
-static ma_spinlock g_maOpenSLSpinlock    = 0;   /* For init/uninit. */
-
-#define MA_OPENSL_OBJ(p)         (*((SLObjectItf)(p)))
-#define MA_OPENSL_OUTPUTMIX(p)   (*((SLOutputMixItf)(p)))
-#define MA_OPENSL_PLAY(p)        (*((SLPlayItf)(p)))
-#define MA_OPENSL_RECORD(p)      (*((SLRecordItf)(p)))
-
-#ifdef MA_ANDROID
-#define MA_OPENSL_BUFFERQUEUE(p) (*((SLAndroidSimpleBufferQueueItf)(p)))
-#else
-#define MA_OPENSL_BUFFERQUEUE(p) (*((SLBufferQueueItf)(p)))
-#endif
-
-static ma_result ma_result_from_OpenSL(SLuint32 result)
-{
-    switch (result)
-    {
-        case SL_RESULT_SUCCESS:                 return MA_SUCCESS;
-        case SL_RESULT_PRECONDITIONS_VIOLATED:  return MA_ERROR;
-        case SL_RESULT_PARAMETER_INVALID:       return MA_INVALID_ARGS;
-        case SL_RESULT_MEMORY_FAILURE:          return MA_OUT_OF_MEMORY;
-        case SL_RESULT_RESOURCE_ERROR:          return MA_INVALID_DATA;
-        case SL_RESULT_RESOURCE_LOST:           return MA_ERROR;
-        case SL_RESULT_IO_ERROR:                return MA_IO_ERROR;
-        case SL_RESULT_BUFFER_INSUFFICIENT:     return MA_NO_SPACE;
-        case SL_RESULT_CONTENT_CORRUPTED:       return MA_INVALID_DATA;
-        case SL_RESULT_CONTENT_UNSUPPORTED:     return MA_FORMAT_NOT_SUPPORTED;
-        case SL_RESULT_CONTENT_NOT_FOUND:       return MA_ERROR;
-        case SL_RESULT_PERMISSION_DENIED:       return MA_ACCESS_DENIED;
-        case SL_RESULT_FEATURE_UNSUPPORTED:     return MA_NOT_IMPLEMENTED;
-        case SL_RESULT_INTERNAL_ERROR:          return MA_ERROR;
-        case SL_RESULT_UNKNOWN_ERROR:           return MA_ERROR;
-        case SL_RESULT_OPERATION_ABORTED:       return MA_ERROR;
-        case SL_RESULT_CONTROL_LOST:            return MA_ERROR;
-        default:                                return MA_ERROR;
-    }
-}
-
-/* Converts an individual OpenSL-style channel identifier (SL_SPEAKER_FRONT_LEFT, etc.) to miniaudio. */
-static ma_uint8 ma_channel_id_to_ma__opensl(SLuint32 id)
-{
-    switch (id)
-    {
-        case SL_SPEAKER_FRONT_LEFT:            return MA_CHANNEL_FRONT_LEFT;
-        case SL_SPEAKER_FRONT_RIGHT:           return MA_CHANNEL_FRONT_RIGHT;
-        case SL_SPEAKER_FRONT_CENTER:          return MA_CHANNEL_FRONT_CENTER;
-        case SL_SPEAKER_LOW_FREQUENCY:         return MA_CHANNEL_LFE;
-        case SL_SPEAKER_BACK_LEFT:             return MA_CHANNEL_BACK_LEFT;
-        case SL_SPEAKER_BACK_RIGHT:            return MA_CHANNEL_BACK_RIGHT;
-        case SL_SPEAKER_FRONT_LEFT_OF_CENTER:  return MA_CHANNEL_FRONT_LEFT_CENTER;
-        case SL_SPEAKER_FRONT_RIGHT_OF_CENTER: return MA_CHANNEL_FRONT_RIGHT_CENTER;
-        case SL_SPEAKER_BACK_CENTER:           return MA_CHANNEL_BACK_CENTER;
-        case SL_SPEAKER_SIDE_LEFT:             return MA_CHANNEL_SIDE_LEFT;
-        case SL_SPEAKER_SIDE_RIGHT:            return MA_CHANNEL_SIDE_RIGHT;
-        case SL_SPEAKER_TOP_CENTER:            return MA_CHANNEL_TOP_CENTER;
-        case SL_SPEAKER_TOP_FRONT_LEFT:        return MA_CHANNEL_TOP_FRONT_LEFT;
-        case SL_SPEAKER_TOP_FRONT_CENTER:      return MA_CHANNEL_TOP_FRONT_CENTER;
-        case SL_SPEAKER_TOP_FRONT_RIGHT:       return MA_CHANNEL_TOP_FRONT_RIGHT;
-        case SL_SPEAKER_TOP_BACK_LEFT:         return MA_CHANNEL_TOP_BACK_LEFT;
-        case SL_SPEAKER_TOP_BACK_CENTER:       return MA_CHANNEL_TOP_BACK_CENTER;
-        case SL_SPEAKER_TOP_BACK_RIGHT:        return MA_CHANNEL_TOP_BACK_RIGHT;
-        default: return 0;
-    }
-}
-
-/* Converts an individual miniaudio channel identifier (MA_CHANNEL_FRONT_LEFT, etc.) to OpenSL-style. */
-static SLuint32 ma_channel_id_to_opensl(ma_uint8 id)
-{
-    switch (id)
-    {
-        case MA_CHANNEL_MONO:               return SL_SPEAKER_FRONT_CENTER;
-        case MA_CHANNEL_FRONT_LEFT:         return SL_SPEAKER_FRONT_LEFT;
-        case MA_CHANNEL_FRONT_RIGHT:        return SL_SPEAKER_FRONT_RIGHT;
-        case MA_CHANNEL_FRONT_CENTER:       return SL_SPEAKER_FRONT_CENTER;
-        case MA_CHANNEL_LFE:                return SL_SPEAKER_LOW_FREQUENCY;
-        case MA_CHANNEL_BACK_LEFT:          return SL_SPEAKER_BACK_LEFT;
-        case MA_CHANNEL_BACK_RIGHT:         return SL_SPEAKER_BACK_RIGHT;
-        case MA_CHANNEL_FRONT_LEFT_CENTER:  return SL_SPEAKER_FRONT_LEFT_OF_CENTER;
-        case MA_CHANNEL_FRONT_RIGHT_CENTER: return SL_SPEAKER_FRONT_RIGHT_OF_CENTER;
-        case MA_CHANNEL_BACK_CENTER:        return SL_SPEAKER_BACK_CENTER;
-        case MA_CHANNEL_SIDE_LEFT:          return SL_SPEAKER_SIDE_LEFT;
-        case MA_CHANNEL_SIDE_RIGHT:         return SL_SPEAKER_SIDE_RIGHT;
-        case MA_CHANNEL_TOP_CENTER:         return SL_SPEAKER_TOP_CENTER;
-        case MA_CHANNEL_TOP_FRONT_LEFT:     return SL_SPEAKER_TOP_FRONT_LEFT;
-        case MA_CHANNEL_TOP_FRONT_CENTER:   return SL_SPEAKER_TOP_FRONT_CENTER;
-        case MA_CHANNEL_TOP_FRONT_RIGHT:    return SL_SPEAKER_TOP_FRONT_RIGHT;
-        case MA_CHANNEL_TOP_BACK_LEFT:      return SL_SPEAKER_TOP_BACK_LEFT;
-        case MA_CHANNEL_TOP_BACK_CENTER:    return SL_SPEAKER_TOP_BACK_CENTER;
-        case MA_CHANNEL_TOP_BACK_RIGHT:     return SL_SPEAKER_TOP_BACK_RIGHT;
-        default: return 0;
-    }
-}
-
-/* Converts a channel mapping to an OpenSL-style channel mask. */
-static SLuint32 ma_channel_map_to_channel_mask__opensl(const ma_channel* pChannelMap, ma_uint32 channels)
-{
-    SLuint32 channelMask = 0;
-    ma_uint32 iChannel;
-    for (iChannel = 0; iChannel < channels; ++iChannel) {
-        channelMask |= ma_channel_id_to_opensl(pChannelMap[iChannel]);
-    }
-
-    return channelMask;
-}
-
-/* Converts an OpenSL-style channel mask to a miniaudio channel map. */
-static void ma_channel_mask_to_channel_map__opensl(SLuint32 channelMask, ma_uint32 channels, ma_channel* pChannelMap)
-{
-    if (channels == 1 && channelMask == 0) {
-        pChannelMap[0] = MA_CHANNEL_MONO;
-    } else if (channels == 2 && channelMask == 0) {
-        pChannelMap[0] = MA_CHANNEL_FRONT_LEFT;
-        pChannelMap[1] = MA_CHANNEL_FRONT_RIGHT;
-    } else {
-        if (channels == 1 && (channelMask & SL_SPEAKER_FRONT_CENTER) != 0) {
-            pChannelMap[0] = MA_CHANNEL_MONO;
-        } else {
-            /* Just iterate over each bit. */
-            ma_uint32 iChannel = 0;
-            ma_uint32 iBit;
-            for (iBit = 0; iBit < 32 && iChannel < channels; ++iBit) {
-                SLuint32 bitValue = (channelMask & (1UL << iBit));
-                if (bitValue != 0) {
-                    /* The bit is set. */
-                    pChannelMap[iChannel] = ma_channel_id_to_ma__opensl(bitValue);
-                    iChannel += 1;
-                }
-            }
-        }
-    }
-}
-
-static SLuint32 ma_round_to_standard_sample_rate__opensl(SLuint32 samplesPerSec)
-{
-    if (samplesPerSec <= SL_SAMPLINGRATE_8) {
-        return SL_SAMPLINGRATE_8;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_11_025) {
-        return SL_SAMPLINGRATE_11_025;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_12) {
-        return SL_SAMPLINGRATE_12;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_16) {
-        return SL_SAMPLINGRATE_16;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_22_05) {
-        return SL_SAMPLINGRATE_22_05;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_24) {
-        return SL_SAMPLINGRATE_24;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_32) {
-        return SL_SAMPLINGRATE_32;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_44_1) {
-        return SL_SAMPLINGRATE_44_1;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_48) {
-        return SL_SAMPLINGRATE_48;
-    }
-
-    /* Android doesn't support more than 48000. */
-#ifndef MA_ANDROID
-    if (samplesPerSec <= SL_SAMPLINGRATE_64) {
-        return SL_SAMPLINGRATE_64;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_88_2) {
-        return SL_SAMPLINGRATE_88_2;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_96) {
-        return SL_SAMPLINGRATE_96;
-    }
-    if (samplesPerSec <= SL_SAMPLINGRATE_192) {
-        return SL_SAMPLINGRATE_192;
-    }
-#endif
-
-    return SL_SAMPLINGRATE_16;
-}
-
-
-static SLint32 ma_to_stream_type__opensl(ma_opensl_stream_type streamType)
-{
-    switch (streamType) {
-        case ma_opensl_stream_type_voice:        return SL_ANDROID_STREAM_VOICE;
-        case ma_opensl_stream_type_system:       return SL_ANDROID_STREAM_SYSTEM;
-        case ma_opensl_stream_type_ring:         return SL_ANDROID_STREAM_RING;
-        case ma_opensl_stream_type_media:        return SL_ANDROID_STREAM_MEDIA;
-        case ma_opensl_stream_type_alarm:        return SL_ANDROID_STREAM_ALARM;
-        case ma_opensl_stream_type_notification: return SL_ANDROID_STREAM_NOTIFICATION;
-        default: break;
-    }
-
-    return SL_ANDROID_STREAM_VOICE;
-}
-
-static SLint32 ma_to_recording_preset__opensl(ma_opensl_recording_preset recordingPreset)
-{
-    switch (recordingPreset) {
-        case ma_opensl_recording_preset_generic:             return SL_ANDROID_RECORDING_PRESET_GENERIC;
-        case ma_opensl_recording_preset_camcorder:           return SL_ANDROID_RECORDING_PRESET_CAMCORDER;
-        case ma_opensl_recording_preset_voice_recognition:   return SL_ANDROID_RECORDING_PRESET_VOICE_RECOGNITION;
-        case ma_opensl_recording_preset_voice_communication: return SL_ANDROID_RECORDING_PRESET_VOICE_COMMUNICATION;
-        case ma_opensl_recording_preset_voice_unprocessed:   return SL_ANDROID_RECORDING_PRESET_UNPROCESSED;
-        default: break;
-    }
-
-    return SL_ANDROID_RECORDING_PRESET_NONE;
-}
-
-
-static ma_result ma_context_enumerate_devices__opensl(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_bool32 cbResult;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    MA_ASSERT(g_maOpenSLInitCounter > 0); /* <-- If you trigger this it means you've either not initialized the context, or you've uninitialized it and then attempted to enumerate devices. */
-    if (g_maOpenSLInitCounter == 0) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /*
-    TODO: Test Me.
-
-    This is currently untested, so for now we are just returning default devices.
-    */
-#if 0 && !defined(MA_ANDROID)
-    ma_bool32 isTerminated = MA_FALSE;
-
-    SLuint32 pDeviceIDs[128];
-    SLint32 deviceCount = sizeof(pDeviceIDs) / sizeof(pDeviceIDs[0]);
-
-    SLAudioIODeviceCapabilitiesItf deviceCaps;
-    SLresult resultSL = (*g_maEngineObjectSL)->GetInterface(g_maEngineObjectSL, (SLInterfaceID)pContext->opensl.SL_IID_AUDIOIODEVICECAPABILITIES, &deviceCaps);
-    if (resultSL != SL_RESULT_SUCCESS) {
-        /* The interface may not be supported so just report a default device. */
-        goto return_default_device;
-    }
-
-    /* Playback */
-    if (!isTerminated) {
-        resultSL = (*deviceCaps)->GetAvailableAudioOutputs(deviceCaps, &deviceCount, pDeviceIDs);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        for (SLint32 iDevice = 0; iDevice < deviceCount; ++iDevice) {
-            ma_device_info deviceInfo;
-            MA_ZERO_OBJECT(&deviceInfo);
-            deviceInfo.id.opensl = pDeviceIDs[iDevice];
-
-            SLAudioOutputDescriptor desc;
-            resultSL = (*deviceCaps)->QueryAudioOutputCapabilities(deviceCaps, deviceInfo.id.opensl, &desc);
-            if (resultSL == SL_RESULT_SUCCESS) {
-                ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), (const char*)desc.pDeviceName, (size_t)-1);
-
-                ma_bool32 cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-                if (cbResult == MA_FALSE) {
-                    isTerminated = MA_TRUE;
-                    break;
-                }
-            }
-        }
-    }
-
-    /* Capture */
-    if (!isTerminated) {
-        resultSL = (*deviceCaps)->GetAvailableAudioInputs(deviceCaps, &deviceCount, pDeviceIDs);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        for (SLint32 iDevice = 0; iDevice < deviceCount; ++iDevice) {
-            ma_device_info deviceInfo;
-            MA_ZERO_OBJECT(&deviceInfo);
-            deviceInfo.id.opensl = pDeviceIDs[iDevice];
-
-            SLAudioInputDescriptor desc;
-            resultSL = (*deviceCaps)->QueryAudioInputCapabilities(deviceCaps, deviceInfo.id.opensl, &desc);
-            if (resultSL == SL_RESULT_SUCCESS) {
-                ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), (const char*)desc.deviceName, (size_t)-1);
-
-                ma_bool32 cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-                if (cbResult == MA_FALSE) {
-                    isTerminated = MA_TRUE;
-                    break;
-                }
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-#else
-    goto return_default_device;
-#endif
-
-return_default_device:;
-    cbResult = MA_TRUE;
-
-    /* Playback. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        deviceInfo.id.opensl = SL_DEFAULTDEVICEID_AUDIOOUTPUT;
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-        cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-    }
-
-    /* Capture. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        deviceInfo.id.opensl = SL_DEFAULTDEVICEID_AUDIOINPUT;
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-        cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_context_add_data_format_ex__opensl(ma_context* pContext, ma_format format, ma_uint32 channels, ma_uint32 sampleRate, ma_device_info* pDeviceInfo)
-{
-    MA_ASSERT(pContext    != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].format     = format;
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].channels   = channels;
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].sampleRate = sampleRate;
-    pDeviceInfo->nativeDataFormats[pDeviceInfo->nativeDataFormatCount].flags      = 0;
-    pDeviceInfo->nativeDataFormatCount += 1;
-}
-
-static void ma_context_add_data_format__opensl(ma_context* pContext, ma_format format, ma_device_info* pDeviceInfo)
-{
-    ma_uint32 minChannels   = 1;
-    ma_uint32 maxChannels   = 2;
-    ma_uint32 minSampleRate = (ma_uint32)ma_standard_sample_rate_8000;
-    ma_uint32 maxSampleRate = (ma_uint32)ma_standard_sample_rate_48000;
-    ma_uint32 iChannel;
-    ma_uint32 iSampleRate;
-
-    MA_ASSERT(pContext    != NULL);
-    MA_ASSERT(pDeviceInfo != NULL);
-
-    /*
-    Each sample format can support mono and stereo, and we'll support a small subset of standard
-    rates (up to 48000). A better solution would be to somehow find a native sample rate.
-    */
-    for (iChannel = minChannels; iChannel < maxChannels; iChannel += 1) {
-        for (iSampleRate = 0; iSampleRate < ma_countof(g_maStandardSampleRatePriorities); iSampleRate += 1) {
-            ma_uint32 standardSampleRate = g_maStandardSampleRatePriorities[iSampleRate];
-            if (standardSampleRate >= minSampleRate && standardSampleRate <= maxSampleRate) {
-                ma_context_add_data_format_ex__opensl(pContext, format, iChannel, standardSampleRate, pDeviceInfo);
-            }
-        }
-    }
-}
-
-static ma_result ma_context_get_device_info__opensl(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    MA_ASSERT(pContext != NULL);
-
-    MA_ASSERT(g_maOpenSLInitCounter > 0); /* <-- If you trigger this it means you've either not initialized the context, or you've uninitialized it and then attempted to get device info. */
-    if (g_maOpenSLInitCounter == 0) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /*
-    TODO: Test Me.
-
-    This is currently untested, so for now we are just returning default devices.
-    */
-#if 0 && !defined(MA_ANDROID)
-    SLAudioIODeviceCapabilitiesItf deviceCaps;
-    SLresult resultSL = (*g_maEngineObjectSL)->GetInterface(g_maEngineObjectSL, (SLInterfaceID)pContext->opensl.SL_IID_AUDIOIODEVICECAPABILITIES, &deviceCaps);
-    if (resultSL != SL_RESULT_SUCCESS) {
-        /* The interface may not be supported so just report a default device. */
-        goto return_default_device;
-    }
-
-    if (deviceType == ma_device_type_playback) {
-        SLAudioOutputDescriptor desc;
-        resultSL = (*deviceCaps)->QueryAudioOutputCapabilities(deviceCaps, pDeviceID->opensl, &desc);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), (const char*)desc.pDeviceName, (size_t)-1);
-    } else {
-        SLAudioInputDescriptor desc;
-        resultSL = (*deviceCaps)->QueryAudioInputCapabilities(deviceCaps, pDeviceID->opensl, &desc);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), (const char*)desc.deviceName, (size_t)-1);
-    }
-
-    goto return_detailed_info;
-#else
-    goto return_default_device;
-#endif
-
-return_default_device:
-    if (pDeviceID != NULL) {
-        if ((deviceType == ma_device_type_playback && pDeviceID->opensl != SL_DEFAULTDEVICEID_AUDIOOUTPUT) ||
-            (deviceType == ma_device_type_capture  && pDeviceID->opensl != SL_DEFAULTDEVICEID_AUDIOINPUT)) {
-            return MA_NO_DEVICE;   /* Don't know the device. */
-        }
-    }
-
-    /* ID and Name / Description */
-    if (deviceType == ma_device_type_playback) {
-        pDeviceInfo->id.opensl = SL_DEFAULTDEVICEID_AUDIOOUTPUT;
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-    } else {
-        pDeviceInfo->id.opensl = SL_DEFAULTDEVICEID_AUDIOINPUT;
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-    }
-
-    pDeviceInfo->isDefault = MA_TRUE;
-
-    goto return_detailed_info;
-
-
-return_detailed_info:
-
-    /*
-    For now we're just outputting a set of values that are supported by the API but not necessarily supported
-    by the device natively. Later on we should work on this so that it more closely reflects the device's
-    actual native format.
-    */
-    pDeviceInfo->nativeDataFormatCount = 0;
-#if defined(MA_ANDROID) && __ANDROID_API__ >= 21
-    ma_context_add_data_format__opensl(pContext, ma_format_f32, pDeviceInfo);
-#endif
-    ma_context_add_data_format__opensl(pContext, ma_format_s16, pDeviceInfo);
-    ma_context_add_data_format__opensl(pContext, ma_format_u8,  pDeviceInfo);
-
-    return MA_SUCCESS;
-}
-
-
-#ifdef MA_ANDROID
-/*void ma_buffer_queue_callback_capture__opensl_android(SLAndroidSimpleBufferQueueItf pBufferQueue, SLuint32 eventFlags, const void* pBuffer, SLuint32 bufferSize, SLuint32 dataUsed, void* pContext)*/
-static void ma_buffer_queue_callback_capture__opensl_android(SLAndroidSimpleBufferQueueItf pBufferQueue, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    size_t periodSizeInBytes;
-    ma_uint8* pBuffer;
-    SLresult resultSL;
-
-    MA_ASSERT(pDevice != NULL);
-
-    (void)pBufferQueue;
-
-    /*
-    For now, don't do anything unless the buffer was fully processed. From what I can tell, it looks like
-    OpenSL|ES 1.1 improves on buffer queues to the point that we could much more intelligently handle this,
-    but unfortunately it looks like Android is only supporting OpenSL|ES 1.0.1 for now :(
-    */
-
-    /* Don't do anything if the device is not started. */
-    if (ma_device_get_state(pDevice) != ma_device_state_started) {
-        return;
-    }
-
-    /* Don't do anything if the device is being drained. */
-    if (pDevice->opensl.isDrainingCapture) {
-        return;
-    }
-
-    periodSizeInBytes = pDevice->capture.internalPeriodSizeInFrames * ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-    pBuffer = pDevice->opensl.pBufferCapture + (pDevice->opensl.currentBufferIndexCapture * periodSizeInBytes);
-
-    ma_device_handle_backend_data_callback(pDevice, NULL, pBuffer, pDevice->capture.internalPeriodSizeInFrames);
-
-    resultSL = MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueueCapture)->Enqueue((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueueCapture, pBuffer, periodSizeInBytes);
-    if (resultSL != SL_RESULT_SUCCESS) {
-        return;
-    }
-
-    pDevice->opensl.currentBufferIndexCapture = (pDevice->opensl.currentBufferIndexCapture + 1) % pDevice->capture.internalPeriods;
-}
-
-static void ma_buffer_queue_callback_playback__opensl_android(SLAndroidSimpleBufferQueueItf pBufferQueue, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    size_t periodSizeInBytes;
-    ma_uint8* pBuffer;
-    SLresult resultSL;
-
-    MA_ASSERT(pDevice != NULL);
-
-    (void)pBufferQueue;
-
-    /* Don't do anything if the device is not started. */
-    if (ma_device_get_state(pDevice) != ma_device_state_started) {
-        return;
-    }
-
-    /* Don't do anything if the device is being drained. */
-    if (pDevice->opensl.isDrainingPlayback) {
-        return;
-    }
-
-    periodSizeInBytes = pDevice->playback.internalPeriodSizeInFrames * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-    pBuffer = pDevice->opensl.pBufferPlayback + (pDevice->opensl.currentBufferIndexPlayback * periodSizeInBytes);
-
-    ma_device_handle_backend_data_callback(pDevice, pBuffer, NULL, pDevice->playback.internalPeriodSizeInFrames);
-
-    resultSL = MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueuePlayback)->Enqueue((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueuePlayback, pBuffer, periodSizeInBytes);
-    if (resultSL != SL_RESULT_SUCCESS) {
-        return;
-    }
-
-    pDevice->opensl.currentBufferIndexPlayback = (pDevice->opensl.currentBufferIndexPlayback + 1) % pDevice->playback.internalPeriods;
-}
-#endif
-
-static ma_result ma_device_uninit__opensl(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ASSERT(g_maOpenSLInitCounter > 0); /* <-- If you trigger this it means you've either not initialized the context, or you've uninitialized it before uninitializing the device. */
-    if (g_maOpenSLInitCounter == 0) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->opensl.pAudioRecorderObj) {
-            MA_OPENSL_OBJ(pDevice->opensl.pAudioRecorderObj)->Destroy((SLObjectItf)pDevice->opensl.pAudioRecorderObj);
-        }
-
-        ma_free(pDevice->opensl.pBufferCapture, &pDevice->pContext->allocationCallbacks);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        if (pDevice->opensl.pAudioPlayerObj) {
-            MA_OPENSL_OBJ(pDevice->opensl.pAudioPlayerObj)->Destroy((SLObjectItf)pDevice->opensl.pAudioPlayerObj);
-        }
-        if (pDevice->opensl.pOutputMixObj) {
-            MA_OPENSL_OBJ(pDevice->opensl.pOutputMixObj)->Destroy((SLObjectItf)pDevice->opensl.pOutputMixObj);
-        }
-
-        ma_free(pDevice->opensl.pBufferPlayback, &pDevice->pContext->allocationCallbacks);
-    }
-
-    return MA_SUCCESS;
-}
-
-#if defined(MA_ANDROID) && __ANDROID_API__ >= 21
-typedef SLAndroidDataFormat_PCM_EX  ma_SLDataFormat_PCM;
-#else
-typedef SLDataFormat_PCM            ma_SLDataFormat_PCM;
-#endif
-
-static ma_result ma_SLDataFormat_PCM_init__opensl(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, const ma_channel* channelMap, ma_SLDataFormat_PCM* pDataFormat)
-{
-    /* We need to convert our format/channels/rate so that they aren't set to default. */
-    if (format == ma_format_unknown) {
-        format = MA_DEFAULT_FORMAT;
-    }
-    if (channels == 0) {
-        channels = MA_DEFAULT_CHANNELS;
-    }
-    if (sampleRate == 0) {
-        sampleRate = MA_DEFAULT_SAMPLE_RATE;
-    }
-
-#if defined(MA_ANDROID) && __ANDROID_API__ >= 21
-    if (format == ma_format_f32) {
-        pDataFormat->formatType     = SL_ANDROID_DATAFORMAT_PCM_EX;
-        pDataFormat->representation = SL_ANDROID_PCM_REPRESENTATION_FLOAT;
-    } else {
-        pDataFormat->formatType = SL_DATAFORMAT_PCM;
-    }
-#else
-    pDataFormat->formatType = SL_DATAFORMAT_PCM;
-#endif
-
-    pDataFormat->numChannels   = channels;
-    ((SLDataFormat_PCM*)pDataFormat)->samplesPerSec = ma_round_to_standard_sample_rate__opensl(sampleRate * 1000);  /* In millihertz. Annoyingly, the sample rate variable is named differently between SLAndroidDataFormat_PCM_EX and SLDataFormat_PCM */
-    pDataFormat->bitsPerSample = ma_get_bytes_per_sample(format) * 8;
-    pDataFormat->channelMask   = ma_channel_map_to_channel_mask__opensl(channelMap, channels);
-    pDataFormat->endianness    = (ma_is_little_endian()) ? SL_BYTEORDER_LITTLEENDIAN : SL_BYTEORDER_BIGENDIAN;
-
-    /*
-    Android has a few restrictions on the format as documented here: https://developer.android.com/ndk/guides/audio/opensl-for-android.html
-     - Only mono and stereo is supported.
-     - Only u8 and s16 formats are supported.
-     - Maximum sample rate of 48000.
-    */
-#ifdef MA_ANDROID
-    if (pDataFormat->numChannels > 2) {
-        pDataFormat->numChannels = 2;
-    }
-#if __ANDROID_API__ >= 21
-    if (pDataFormat->formatType == SL_ANDROID_DATAFORMAT_PCM_EX) {
-        /* It's floating point. */
-        MA_ASSERT(pDataFormat->representation == SL_ANDROID_PCM_REPRESENTATION_FLOAT);
-        if (pDataFormat->bitsPerSample > 32) {
-            pDataFormat->bitsPerSample = 32;
-        }
-    } else {
-        if (pDataFormat->bitsPerSample > 16) {
-            pDataFormat->bitsPerSample = 16;
-        }
-    }
-#else
-    if (pDataFormat->bitsPerSample > 16) {
-        pDataFormat->bitsPerSample = 16;
-    }
-#endif
-    if (((SLDataFormat_PCM*)pDataFormat)->samplesPerSec > SL_SAMPLINGRATE_48) {
-        ((SLDataFormat_PCM*)pDataFormat)->samplesPerSec = SL_SAMPLINGRATE_48;
-    }
-#endif
-
-    pDataFormat->containerSize = pDataFormat->bitsPerSample;  /* Always tightly packed for now. */
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_deconstruct_SLDataFormat_PCM__opensl(ma_SLDataFormat_PCM* pDataFormat, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    ma_bool32 isFloatingPoint = MA_FALSE;
-#if defined(MA_ANDROID) && __ANDROID_API__ >= 21
-    if (pDataFormat->formatType == SL_ANDROID_DATAFORMAT_PCM_EX) {
-        MA_ASSERT(pDataFormat->representation == SL_ANDROID_PCM_REPRESENTATION_FLOAT);
-        isFloatingPoint = MA_TRUE;
-    }
-#endif
-    if (isFloatingPoint) {
-        if (pDataFormat->bitsPerSample == 32) {
-            *pFormat = ma_format_f32;
-        }
-    } else {
-        if (pDataFormat->bitsPerSample == 8) {
-            *pFormat = ma_format_u8;
-        } else if (pDataFormat->bitsPerSample == 16) {
-            *pFormat = ma_format_s16;
-        } else if (pDataFormat->bitsPerSample == 24) {
-            *pFormat = ma_format_s24;
-        } else if (pDataFormat->bitsPerSample == 32) {
-            *pFormat = ma_format_s32;
-        }
-    }
-
-    *pChannels   = pDataFormat->numChannels;
-    *pSampleRate = ((SLDataFormat_PCM*)pDataFormat)->samplesPerSec / 1000;
-    ma_channel_mask_to_channel_map__opensl(pDataFormat->channelMask, ma_min(pDataFormat->numChannels, channelMapCap), pChannelMap);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_init__opensl(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-#ifdef MA_ANDROID
-    SLDataLocator_AndroidSimpleBufferQueue queue;
-    SLresult resultSL;
-    size_t bufferSizeInBytes;
-    SLInterfaceID itfIDs[2];
-    const SLboolean itfIDsRequired[] = {
-        SL_BOOLEAN_TRUE,    /* SL_IID_ANDROIDSIMPLEBUFFERQUEUE */
-        SL_BOOLEAN_FALSE    /* SL_IID_ANDROIDCONFIGURATION */
-    };
-#endif
-
-    MA_ASSERT(g_maOpenSLInitCounter > 0); /* <-- If you trigger this it means you've either not initialized the context, or you've uninitialized it and then attempted to initialize a new device. */
-    if (g_maOpenSLInitCounter == 0) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /*
-    For now, only supporting Android implementations of OpenSL|ES since that's the only one I've
-    been able to test with and I currently depend on Android-specific extensions (simple buffer
-    queues).
-    */
-#ifdef MA_ANDROID
-    itfIDs[0] = (SLInterfaceID)pDevice->pContext->opensl.SL_IID_ANDROIDSIMPLEBUFFERQUEUE;
-    itfIDs[1] = (SLInterfaceID)pDevice->pContext->opensl.SL_IID_ANDROIDCONFIGURATION;
-
-    /* No exclusive mode with OpenSL|ES. */
-    if (((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pDescriptorPlayback->shareMode == ma_share_mode_exclusive) ||
-        ((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pDescriptorCapture->shareMode  == ma_share_mode_exclusive)) {
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-
-    /* Now we can start initializing the device properly. */
-    MA_ASSERT(pDevice != NULL);
-    MA_ZERO_OBJECT(&pDevice->opensl);
-
-    queue.locatorType = SL_DATALOCATOR_ANDROIDSIMPLEBUFFERQUEUE;
-
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        ma_SLDataFormat_PCM pcm;
-        SLDataLocator_IODevice locatorDevice;
-        SLDataSource source;
-        SLDataSink sink;
-        SLAndroidConfigurationItf pRecorderConfig;
-
-        ma_SLDataFormat_PCM_init__opensl(pDescriptorCapture->format, pDescriptorCapture->channels, pDescriptorCapture->sampleRate, pDescriptorCapture->channelMap, &pcm);
-
-        locatorDevice.locatorType = SL_DATALOCATOR_IODEVICE;
-        locatorDevice.deviceType  = SL_IODEVICE_AUDIOINPUT;
-        locatorDevice.deviceID    = SL_DEFAULTDEVICEID_AUDIOINPUT;  /* Must always use the default device with Android. */
-        locatorDevice.device      = NULL;
-
-        source.pLocator = &locatorDevice;
-        source.pFormat  = NULL;
-
-        queue.numBuffers = pDescriptorCapture->periodCount;
-
-        sink.pLocator = &queue;
-        sink.pFormat  = (SLDataFormat_PCM*)&pcm;
-
-        resultSL = (*g_maEngineSL)->CreateAudioRecorder(g_maEngineSL, (SLObjectItf*)&pDevice->opensl.pAudioRecorderObj, &source, &sink, ma_countof(itfIDs), itfIDs, itfIDsRequired);
-        if (resultSL == SL_RESULT_CONTENT_UNSUPPORTED || resultSL == SL_RESULT_PARAMETER_INVALID) {
-            /* Unsupported format. Fall back to something safer and try again. If this fails, just abort. */
-            pcm.formatType    = SL_DATAFORMAT_PCM;
-            pcm.numChannels   = 1;
-            ((SLDataFormat_PCM*)&pcm)->samplesPerSec = SL_SAMPLINGRATE_16;  /* The name of the sample rate variable is different between SLAndroidDataFormat_PCM_EX and SLDataFormat_PCM. */
-            pcm.bitsPerSample = 16;
-            pcm.containerSize = pcm.bitsPerSample;  /* Always tightly packed for now. */
-            pcm.channelMask   = 0;
-            resultSL = (*g_maEngineSL)->CreateAudioRecorder(g_maEngineSL, (SLObjectItf*)&pDevice->opensl.pAudioRecorderObj, &source, &sink, ma_countof(itfIDs), itfIDs, itfIDsRequired);
-        }
-
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to create audio recorder.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-
-        /* Set the recording preset before realizing the player. */
-        if (pConfig->opensl.recordingPreset != ma_opensl_recording_preset_default) {
-            resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioRecorderObj)->GetInterface((SLObjectItf)pDevice->opensl.pAudioRecorderObj, (SLInterfaceID)pDevice->pContext->opensl.SL_IID_ANDROIDCONFIGURATION, &pRecorderConfig);
-            if (resultSL == SL_RESULT_SUCCESS) {
-                SLint32 recordingPreset = ma_to_recording_preset__opensl(pConfig->opensl.recordingPreset);
-                resultSL = (*pRecorderConfig)->SetConfiguration(pRecorderConfig, SL_ANDROID_KEY_RECORDING_PRESET, &recordingPreset, sizeof(SLint32));
-                if (resultSL != SL_RESULT_SUCCESS) {
-                    /* Failed to set the configuration. Just keep going. */
-                }
-            }
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioRecorderObj)->Realize((SLObjectItf)pDevice->opensl.pAudioRecorderObj, SL_BOOLEAN_FALSE);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to realize audio recorder.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioRecorderObj)->GetInterface((SLObjectItf)pDevice->opensl.pAudioRecorderObj, (SLInterfaceID)pDevice->pContext->opensl.SL_IID_RECORD, &pDevice->opensl.pAudioRecorder);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to retrieve SL_IID_RECORD interface.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioRecorderObj)->GetInterface((SLObjectItf)pDevice->opensl.pAudioRecorderObj, (SLInterfaceID)pDevice->pContext->opensl.SL_IID_ANDROIDSIMPLEBUFFERQUEUE, &pDevice->opensl.pBufferQueueCapture);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to retrieve SL_IID_ANDROIDSIMPLEBUFFERQUEUE interface.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueueCapture)->RegisterCallback((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueueCapture, ma_buffer_queue_callback_capture__opensl_android, pDevice);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to register buffer queue callback.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        /* The internal format is determined by the "pcm" object. */
-        ma_deconstruct_SLDataFormat_PCM__opensl(&pcm, &pDescriptorCapture->format, &pDescriptorCapture->channels, &pDescriptorCapture->sampleRate, pDescriptorCapture->channelMap, ma_countof(pDescriptorCapture->channelMap));
-
-        /* Buffer. */
-        pDescriptorCapture->periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptorCapture, pDescriptorCapture->sampleRate, pConfig->performanceProfile);
-        pDevice->opensl.currentBufferIndexCapture = 0;
-
-        bufferSizeInBytes = pDescriptorCapture->periodSizeInFrames * ma_get_bytes_per_frame(pDescriptorCapture->format, pDescriptorCapture->channels) * pDescriptorCapture->periodCount;
-        pDevice->opensl.pBufferCapture = (ma_uint8*)ma_calloc(bufferSizeInBytes, &pDevice->pContext->allocationCallbacks);
-        if (pDevice->opensl.pBufferCapture == NULL) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to allocate memory for data buffer.");
-            return MA_OUT_OF_MEMORY;
-        }
-        MA_ZERO_MEMORY(pDevice->opensl.pBufferCapture, bufferSizeInBytes);
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        ma_SLDataFormat_PCM pcm;
-        SLDataSource source;
-        SLDataLocator_OutputMix outmixLocator;
-        SLDataSink sink;
-        SLAndroidConfigurationItf pPlayerConfig;
-
-        ma_SLDataFormat_PCM_init__opensl(pDescriptorPlayback->format, pDescriptorPlayback->channels, pDescriptorPlayback->sampleRate, pDescriptorPlayback->channelMap, &pcm);
-
-        resultSL = (*g_maEngineSL)->CreateOutputMix(g_maEngineSL, (SLObjectItf*)&pDevice->opensl.pOutputMixObj, 0, NULL, NULL);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to create output mix.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pOutputMixObj)->Realize((SLObjectItf)pDevice->opensl.pOutputMixObj, SL_BOOLEAN_FALSE);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to realize output mix object.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pOutputMixObj)->GetInterface((SLObjectItf)pDevice->opensl.pOutputMixObj, (SLInterfaceID)pDevice->pContext->opensl.SL_IID_OUTPUTMIX, &pDevice->opensl.pOutputMix);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to retrieve SL_IID_OUTPUTMIX interface.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        /* Set the output device. */
-        if (pDescriptorPlayback->pDeviceID != NULL) {
-            SLuint32 deviceID_OpenSL = pDescriptorPlayback->pDeviceID->opensl;
-            MA_OPENSL_OUTPUTMIX(pDevice->opensl.pOutputMix)->ReRoute((SLOutputMixItf)pDevice->opensl.pOutputMix, 1, &deviceID_OpenSL);
-        }
-
-        queue.numBuffers = pDescriptorPlayback->periodCount;
-
-        source.pLocator = &queue;
-        source.pFormat  = (SLDataFormat_PCM*)&pcm;
-
-        outmixLocator.locatorType = SL_DATALOCATOR_OUTPUTMIX;
-        outmixLocator.outputMix   = (SLObjectItf)pDevice->opensl.pOutputMixObj;
-
-        sink.pLocator = &outmixLocator;
-        sink.pFormat  = NULL;
-
-        resultSL = (*g_maEngineSL)->CreateAudioPlayer(g_maEngineSL, (SLObjectItf*)&pDevice->opensl.pAudioPlayerObj, &source, &sink, ma_countof(itfIDs), itfIDs, itfIDsRequired);
-        if (resultSL == SL_RESULT_CONTENT_UNSUPPORTED || resultSL == SL_RESULT_PARAMETER_INVALID) {
-            /* Unsupported format. Fall back to something safer and try again. If this fails, just abort. */
-            pcm.formatType = SL_DATAFORMAT_PCM;
-            pcm.numChannels = 2;
-            ((SLDataFormat_PCM*)&pcm)->samplesPerSec = SL_SAMPLINGRATE_16;
-            pcm.bitsPerSample = 16;
-            pcm.containerSize = pcm.bitsPerSample;  /* Always tightly packed for now. */
-            pcm.channelMask = SL_SPEAKER_FRONT_LEFT | SL_SPEAKER_FRONT_RIGHT;
-            resultSL = (*g_maEngineSL)->CreateAudioPlayer(g_maEngineSL, (SLObjectItf*)&pDevice->opensl.pAudioPlayerObj, &source, &sink, ma_countof(itfIDs), itfIDs, itfIDsRequired);
-        }
-
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to create audio player.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-
-        /* Set the stream type before realizing the player. */
-        if (pConfig->opensl.streamType != ma_opensl_stream_type_default) {
-            resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioPlayerObj)->GetInterface((SLObjectItf)pDevice->opensl.pAudioPlayerObj, (SLInterfaceID)pDevice->pContext->opensl.SL_IID_ANDROIDCONFIGURATION, &pPlayerConfig);
-            if (resultSL == SL_RESULT_SUCCESS) {
-                SLint32 streamType = ma_to_stream_type__opensl(pConfig->opensl.streamType);
-                resultSL = (*pPlayerConfig)->SetConfiguration(pPlayerConfig, SL_ANDROID_KEY_STREAM_TYPE, &streamType, sizeof(SLint32));
-                if (resultSL != SL_RESULT_SUCCESS) {
-                    /* Failed to set the configuration. Just keep going. */
-                }
-            }
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioPlayerObj)->Realize((SLObjectItf)pDevice->opensl.pAudioPlayerObj, SL_BOOLEAN_FALSE);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to realize audio player.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioPlayerObj)->GetInterface((SLObjectItf)pDevice->opensl.pAudioPlayerObj, (SLInterfaceID)pDevice->pContext->opensl.SL_IID_PLAY, &pDevice->opensl.pAudioPlayer);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to retrieve SL_IID_PLAY interface.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_OBJ(pDevice->opensl.pAudioPlayerObj)->GetInterface((SLObjectItf)pDevice->opensl.pAudioPlayerObj, (SLInterfaceID)pDevice->pContext->opensl.SL_IID_ANDROIDSIMPLEBUFFERQUEUE, &pDevice->opensl.pBufferQueuePlayback);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to retrieve SL_IID_ANDROIDSIMPLEBUFFERQUEUE interface.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        resultSL = MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueuePlayback)->RegisterCallback((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueuePlayback, ma_buffer_queue_callback_playback__opensl_android, pDevice);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to register buffer queue callback.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        /* The internal format is determined by the "pcm" object. */
-        ma_deconstruct_SLDataFormat_PCM__opensl(&pcm, &pDescriptorPlayback->format, &pDescriptorPlayback->channels, &pDescriptorPlayback->sampleRate, pDescriptorPlayback->channelMap, ma_countof(pDescriptorPlayback->channelMap));
-
-        /* Buffer. */
-        pDescriptorPlayback->periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_descriptor(pDescriptorPlayback, pDescriptorPlayback->sampleRate, pConfig->performanceProfile);
-        pDevice->opensl.currentBufferIndexPlayback   = 0;
-
-        bufferSizeInBytes = pDescriptorPlayback->periodSizeInFrames * ma_get_bytes_per_frame(pDescriptorPlayback->format, pDescriptorPlayback->channels) * pDescriptorPlayback->periodCount;
-        pDevice->opensl.pBufferPlayback = (ma_uint8*)ma_calloc(bufferSizeInBytes, &pDevice->pContext->allocationCallbacks);
-        if (pDevice->opensl.pBufferPlayback == NULL) {
-            ma_device_uninit__opensl(pDevice);
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to allocate memory for data buffer.");
-            return MA_OUT_OF_MEMORY;
-        }
-        MA_ZERO_MEMORY(pDevice->opensl.pBufferPlayback, bufferSizeInBytes);
-    }
-
-    return MA_SUCCESS;
-#else
-    return MA_NO_BACKEND;   /* Non-Android implementations are not supported. */
-#endif
-}
-
-static ma_result ma_device_start__opensl(ma_device* pDevice)
-{
-    SLresult resultSL;
-    size_t periodSizeInBytes;
-    ma_uint32 iPeriod;
-
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ASSERT(g_maOpenSLInitCounter > 0); /* <-- If you trigger this it means you've either not initialized the context, or you've uninitialized it and then attempted to start the device. */
-    if (g_maOpenSLInitCounter == 0) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        resultSL = MA_OPENSL_RECORD(pDevice->opensl.pAudioRecorder)->SetRecordState((SLRecordItf)pDevice->opensl.pAudioRecorder, SL_RECORDSTATE_RECORDING);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to start internal capture device.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        periodSizeInBytes = pDevice->capture.internalPeriodSizeInFrames * ma_get_bytes_per_frame(pDevice->capture.internalFormat, pDevice->capture.internalChannels);
-        for (iPeriod = 0; iPeriod < pDevice->capture.internalPeriods; ++iPeriod) {
-            resultSL = MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueueCapture)->Enqueue((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueueCapture, pDevice->opensl.pBufferCapture + (periodSizeInBytes * iPeriod), periodSizeInBytes);
-            if (resultSL != SL_RESULT_SUCCESS) {
-                MA_OPENSL_RECORD(pDevice->opensl.pAudioRecorder)->SetRecordState((SLRecordItf)pDevice->opensl.pAudioRecorder, SL_RECORDSTATE_STOPPED);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to enqueue buffer for capture device.");
-                return ma_result_from_OpenSL(resultSL);
-            }
-        }
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        resultSL = MA_OPENSL_PLAY(pDevice->opensl.pAudioPlayer)->SetPlayState((SLPlayItf)pDevice->opensl.pAudioPlayer, SL_PLAYSTATE_PLAYING);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to start internal playback device.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        /* In playback mode (no duplex) we need to load some initial buffers. In duplex mode we need to enqueue silent buffers. */
-        if (pDevice->type == ma_device_type_duplex) {
-            MA_ZERO_MEMORY(pDevice->opensl.pBufferPlayback, pDevice->playback.internalPeriodSizeInFrames * pDevice->playback.internalPeriods * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels));
-        } else {
-            ma_device__read_frames_from_client(pDevice, pDevice->playback.internalPeriodSizeInFrames * pDevice->playback.internalPeriods, pDevice->opensl.pBufferPlayback);
-        }
-
-        periodSizeInBytes = pDevice->playback.internalPeriodSizeInFrames * ma_get_bytes_per_frame(pDevice->playback.internalFormat, pDevice->playback.internalChannels);
-        for (iPeriod = 0; iPeriod < pDevice->playback.internalPeriods; ++iPeriod) {
-            resultSL = MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueuePlayback)->Enqueue((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueuePlayback, pDevice->opensl.pBufferPlayback + (periodSizeInBytes * iPeriod), periodSizeInBytes);
-            if (resultSL != SL_RESULT_SUCCESS) {
-                MA_OPENSL_PLAY(pDevice->opensl.pAudioPlayer)->SetPlayState((SLPlayItf)pDevice->opensl.pAudioPlayer, SL_PLAYSTATE_STOPPED);
-                ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to enqueue buffer for playback device.");
-                return ma_result_from_OpenSL(resultSL);
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_drain__opensl(ma_device* pDevice, ma_device_type deviceType)
-{
-    SLAndroidSimpleBufferQueueItf pBufferQueue;
-
-    MA_ASSERT(deviceType == ma_device_type_capture || deviceType == ma_device_type_playback);
-
-    if (pDevice->type == ma_device_type_capture) {
-        pBufferQueue = (SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueueCapture;
-        pDevice->opensl.isDrainingCapture  = MA_TRUE;
-    } else {
-        pBufferQueue = (SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueuePlayback;
-        pDevice->opensl.isDrainingPlayback = MA_TRUE;
-    }
-
-    for (;;) {
-        SLAndroidSimpleBufferQueueState state;
-
-        MA_OPENSL_BUFFERQUEUE(pBufferQueue)->GetState(pBufferQueue, &state);
-        if (state.count == 0) {
-            break;
-        }
-
-        ma_sleep(10);
-    }
-
-    if (pDevice->type == ma_device_type_capture) {
-        pDevice->opensl.isDrainingCapture  = MA_FALSE;
-    } else {
-        pDevice->opensl.isDrainingPlayback = MA_FALSE;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__opensl(ma_device* pDevice)
-{
-    SLresult resultSL;
-
-    MA_ASSERT(pDevice != NULL);
-
-    MA_ASSERT(g_maOpenSLInitCounter > 0); /* <-- If you trigger this it means you've either not initialized the context, or you've uninitialized it before stopping/uninitializing the device. */
-    if (g_maOpenSLInitCounter == 0) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex) {
-        ma_device_drain__opensl(pDevice, ma_device_type_capture);
-
-        resultSL = MA_OPENSL_RECORD(pDevice->opensl.pAudioRecorder)->SetRecordState((SLRecordItf)pDevice->opensl.pAudioRecorder, SL_RECORDSTATE_STOPPED);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to stop internal capture device.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueueCapture)->Clear((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueueCapture);
-    }
-
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_device_drain__opensl(pDevice, ma_device_type_playback);
-
-        resultSL = MA_OPENSL_PLAY(pDevice->opensl.pAudioPlayer)->SetPlayState((SLPlayItf)pDevice->opensl.pAudioPlayer, SL_PLAYSTATE_STOPPED);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            ma_log_post(ma_device_get_log(pDevice), MA_LOG_LEVEL_ERROR, "[OpenSL] Failed to stop internal playback device.");
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        MA_OPENSL_BUFFERQUEUE(pDevice->opensl.pBufferQueuePlayback)->Clear((SLAndroidSimpleBufferQueueItf)pDevice->opensl.pBufferQueuePlayback);
-    }
-
-    /* Make sure the client is aware that the device has stopped. There may be an OpenSL|ES callback for this, but I haven't found it. */
-    ma_device__on_notification_stopped(pDevice);
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_context_uninit__opensl(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_opensl);
-    (void)pContext;
-
-    /* Uninit global data. */
-    ma_spinlock_lock(&g_maOpenSLSpinlock);
-    {
-        MA_ASSERT(g_maOpenSLInitCounter > 0);   /* If you've triggered this, it means you have ma_context_init/uninit mismatch. Each successful call to ma_context_init() must be matched up with a call to ma_context_uninit(). */
-
-        g_maOpenSLInitCounter -= 1;
-        if (g_maOpenSLInitCounter == 0) {
-            (*g_maEngineObjectSL)->Destroy(g_maEngineObjectSL);
-        }
-    }
-    ma_spinlock_unlock(&g_maOpenSLSpinlock);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_dlsym_SLInterfaceID__opensl(ma_context* pContext, const char* pName, ma_handle* pHandle)
-{
-    /* We need to return an error if the symbol cannot be found. This is important because there have been reports that some symbols do not exist. */
-    ma_handle* p = (ma_handle*)ma_dlsym(ma_context_get_log(pContext), pContext->opensl.libOpenSLES, pName);
-    if (p == NULL) {
-        ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_INFO, "[OpenSL] Cannot find symbol %s", pName);
-        return MA_NO_BACKEND;
-    }
-
-    *pHandle = *p;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init_engine_nolock__opensl(ma_context* pContext)
-{
-    g_maOpenSLInitCounter += 1;
-    if (g_maOpenSLInitCounter == 1) {
-        SLresult resultSL;
-
-        resultSL = ((ma_slCreateEngine_proc)pContext->opensl.slCreateEngine)(&g_maEngineObjectSL, 0, NULL, 0, NULL, NULL);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            g_maOpenSLInitCounter -= 1;
-            return ma_result_from_OpenSL(resultSL);
-        }
-
-        (*g_maEngineObjectSL)->Realize(g_maEngineObjectSL, SL_BOOLEAN_FALSE);
-
-        resultSL = (*g_maEngineObjectSL)->GetInterface(g_maEngineObjectSL, (SLInterfaceID)pContext->opensl.SL_IID_ENGINE, &g_maEngineSL);
-        if (resultSL != SL_RESULT_SUCCESS) {
-            (*g_maEngineObjectSL)->Destroy(g_maEngineObjectSL);
-            g_maOpenSLInitCounter -= 1;
-            return ma_result_from_OpenSL(resultSL);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__opensl(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    ma_result result;
-
-#if !defined(MA_NO_RUNTIME_LINKING)
-    size_t i;
-    const char* libOpenSLESNames[] = {
-        "libOpenSLES.so"
-    };
-#endif
-
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig;
-
-#if !defined(MA_NO_RUNTIME_LINKING)
-    /*
-    Dynamically link against libOpenSLES.so. I have now had multiple reports that SL_IID_ANDROIDSIMPLEBUFFERQUEUE cannot be found. One
-    report was happening at compile time and another at runtime. To try working around this, I'm going to link to libOpenSLES at runtime
-    and extract the symbols rather than reference them directly. This should, hopefully, fix these issues as the compiler won't see any
-    references to the symbols and will hopefully skip the checks.
-    */
-    for (i = 0; i < ma_countof(libOpenSLESNames); i += 1) {
-        pContext->opensl.libOpenSLES = ma_dlopen(ma_context_get_log(pContext), libOpenSLESNames[i]);
-        if (pContext->opensl.libOpenSLES != NULL) {
-            break;
-        }
-    }
-
-    if (pContext->opensl.libOpenSLES == NULL) {
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_INFO, "[OpenSL] Could not find libOpenSLES.so");
-        return MA_NO_BACKEND;
-    }
-
-    result = ma_dlsym_SLInterfaceID__opensl(pContext, "SL_IID_ENGINE", &pContext->opensl.SL_IID_ENGINE);
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        return result;
-    }
-
-    result = ma_dlsym_SLInterfaceID__opensl(pContext, "SL_IID_AUDIOIODEVICECAPABILITIES", &pContext->opensl.SL_IID_AUDIOIODEVICECAPABILITIES);
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        return result;
-    }
-
-    result = ma_dlsym_SLInterfaceID__opensl(pContext, "SL_IID_ANDROIDSIMPLEBUFFERQUEUE", &pContext->opensl.SL_IID_ANDROIDSIMPLEBUFFERQUEUE);
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        return result;
-    }
-
-    result = ma_dlsym_SLInterfaceID__opensl(pContext, "SL_IID_RECORD", &pContext->opensl.SL_IID_RECORD);
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        return result;
-    }
-
-    result = ma_dlsym_SLInterfaceID__opensl(pContext, "SL_IID_PLAY", &pContext->opensl.SL_IID_PLAY);
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        return result;
-    }
-
-    result = ma_dlsym_SLInterfaceID__opensl(pContext, "SL_IID_OUTPUTMIX", &pContext->opensl.SL_IID_OUTPUTMIX);
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        return result;
-    }
-
-    result = ma_dlsym_SLInterfaceID__opensl(pContext, "SL_IID_ANDROIDCONFIGURATION", &pContext->opensl.SL_IID_ANDROIDCONFIGURATION);
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        return result;
-    }
-
-    pContext->opensl.slCreateEngine = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->opensl.libOpenSLES, "slCreateEngine");
-    if (pContext->opensl.slCreateEngine == NULL) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_INFO, "[OpenSL] Cannot find symbol slCreateEngine.");
-        return MA_NO_BACKEND;
-    }
-#else
-    pContext->opensl.SL_IID_ENGINE                    = (ma_handle)SL_IID_ENGINE;
-    pContext->opensl.SL_IID_AUDIOIODEVICECAPABILITIES = (ma_handle)SL_IID_AUDIOIODEVICECAPABILITIES;
-    pContext->opensl.SL_IID_ANDROIDSIMPLEBUFFERQUEUE  = (ma_handle)SL_IID_ANDROIDSIMPLEBUFFERQUEUE;
-    pContext->opensl.SL_IID_RECORD                    = (ma_handle)SL_IID_RECORD;
-    pContext->opensl.SL_IID_PLAY                      = (ma_handle)SL_IID_PLAY;
-    pContext->opensl.SL_IID_OUTPUTMIX                 = (ma_handle)SL_IID_OUTPUTMIX;
-    pContext->opensl.SL_IID_ANDROIDCONFIGURATION      = (ma_handle)SL_IID_ANDROIDCONFIGURATION;
-    pContext->opensl.slCreateEngine                   = (ma_proc)slCreateEngine;
-#endif
-
-
-    /* Initialize global data first if applicable. */
-    ma_spinlock_lock(&g_maOpenSLSpinlock);
-    {
-        result = ma_context_init_engine_nolock__opensl(pContext);
-    }
-    ma_spinlock_unlock(&g_maOpenSLSpinlock);
-
-    if (result != MA_SUCCESS) {
-        ma_dlclose(ma_context_get_log(pContext), pContext->opensl.libOpenSLES);
-        ma_log_post(ma_context_get_log(pContext), MA_LOG_LEVEL_INFO, "[OpenSL] Failed to initialize OpenSL engine.");
-        return result;
-    }
-
-    pCallbacks->onContextInit             = ma_context_init__opensl;
-    pCallbacks->onContextUninit           = ma_context_uninit__opensl;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__opensl;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__opensl;
-    pCallbacks->onDeviceInit              = ma_device_init__opensl;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__opensl;
-    pCallbacks->onDeviceStart             = ma_device_start__opensl;
-    pCallbacks->onDeviceStop              = ma_device_stop__opensl;
-    pCallbacks->onDeviceRead              = NULL;   /* Not needed because OpenSL|ES is asynchronous. */
-    pCallbacks->onDeviceWrite             = NULL;   /* Not needed because OpenSL|ES is asynchronous. */
-    pCallbacks->onDeviceDataLoop          = NULL;   /* Not needed because OpenSL|ES is asynchronous. */
-
-    return MA_SUCCESS;
-}
-#endif  /* OpenSL|ES */
-
-
-/******************************************************************************
-
-Web Audio Backend
-
-******************************************************************************/
-#ifdef MA_HAS_WEBAUDIO
-#include <emscripten/emscripten.h>
-
-#if (__EMSCRIPTEN_major__ > 3) || (__EMSCRIPTEN_major__ == 3 && (__EMSCRIPTEN_minor__ > 1 || (__EMSCRIPTEN_minor__ == 1 && __EMSCRIPTEN_tiny__ >= 32)))
-    #include <emscripten/webaudio.h>
-    #define MA_SUPPORT_AUDIO_WORKLETS
-#endif
-
-/*
-TODO: Version 0.12: Swap this logic around so that AudioWorklets are used by default. Add MA_NO_AUDIO_WORKLETS.
-*/
-#if defined(MA_ENABLE_AUDIO_WORKLETS) && defined(MA_SUPPORT_AUDIO_WORKLETS)
-    #define MA_USE_AUDIO_WORKLETS
-#endif
-
-/* The thread stack size must be a multiple of 16. */
-#ifndef MA_AUDIO_WORKLETS_THREAD_STACK_SIZE
-#define MA_AUDIO_WORKLETS_THREAD_STACK_SIZE 16384
-#endif
-
-#if defined(MA_USE_AUDIO_WORKLETS)
-#define MA_WEBAUDIO_LATENCY_HINT_BALANCED       "balanced"
-#define MA_WEBAUDIO_LATENCY_HINT_INTERACTIVE    "interactive"
-#define MA_WEBAUDIO_LATENCY_HINT_PLAYBACK       "playback"
-#endif
-
-static ma_bool32 ma_is_capture_supported__webaudio()
-{
-    return EM_ASM_INT({
-        return (navigator.mediaDevices !== undefined && navigator.mediaDevices.getUserMedia !== undefined);
-    }, 0) != 0; /* Must pass in a dummy argument for C99 compatibility. */
-}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-void* EMSCRIPTEN_KEEPALIVE ma_malloc_emscripten(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_malloc(sz, pAllocationCallbacks);
-}
-
-void EMSCRIPTEN_KEEPALIVE ma_free_emscripten(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_free(p, pAllocationCallbacks);
-}
-
-void EMSCRIPTEN_KEEPALIVE ma_device_process_pcm_frames_capture__webaudio(ma_device* pDevice, int frameCount, float* pFrames)
-{
-    ma_device_handle_backend_data_callback(pDevice, NULL, pFrames, (ma_uint32)frameCount);
-}
-
-void EMSCRIPTEN_KEEPALIVE ma_device_process_pcm_frames_playback__webaudio(ma_device* pDevice, int frameCount, float* pFrames)
-{
-    ma_device_handle_backend_data_callback(pDevice, pFrames, NULL, (ma_uint32)frameCount);
-}
-#ifdef __cplusplus
-}
-#endif
-
-static ma_result ma_context_enumerate_devices__webaudio(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_bool32 cbResult = MA_TRUE;
-
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(callback != NULL);
-
-    /* Only supporting default devices for now. */
-
-    /* Playback. */
-    if (cbResult) {
-        ma_device_info deviceInfo;
-        MA_ZERO_OBJECT(&deviceInfo);
-        ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-        deviceInfo.isDefault = MA_TRUE;    /* Only supporting default devices. */
-        cbResult = callback(pContext, ma_device_type_playback, &deviceInfo, pUserData);
-    }
-
-    /* Capture. */
-    if (cbResult) {
-        if (ma_is_capture_supported__webaudio()) {
-            ma_device_info deviceInfo;
-            MA_ZERO_OBJECT(&deviceInfo);
-            ma_strncpy_s(deviceInfo.name, sizeof(deviceInfo.name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-            deviceInfo.isDefault = MA_TRUE;    /* Only supporting default devices. */
-            cbResult = callback(pContext, ma_device_type_capture, &deviceInfo, pUserData);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_get_device_info__webaudio(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    MA_ASSERT(pContext != NULL);
-
-    if (deviceType == ma_device_type_capture && !ma_is_capture_supported__webaudio()) {
-        return MA_NO_DEVICE;
-    }
-
-    MA_ZERO_MEMORY(pDeviceInfo->id.webaudio, sizeof(pDeviceInfo->id.webaudio));
-
-    /* Only supporting default devices for now. */
-    (void)pDeviceID;
-    if (deviceType == ma_device_type_playback) {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-    } else {
-        ma_strncpy_s(pDeviceInfo->name, sizeof(pDeviceInfo->name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-    }
-
-    /* Only supporting default devices. */
-    pDeviceInfo->isDefault = MA_TRUE;
-
-    /* Web Audio can support any number of channels and sample rates. It only supports f32 formats, however. */
-    pDeviceInfo->nativeDataFormats[0].flags      = 0;
-    pDeviceInfo->nativeDataFormats[0].format     = ma_format_unknown;
-    pDeviceInfo->nativeDataFormats[0].channels   = 0; /* All channels are supported. */
-    pDeviceInfo->nativeDataFormats[0].sampleRate = EM_ASM_INT({
-        try {
-            var temp = new (window.AudioContext || window.webkitAudioContext)();
-            var sampleRate = temp.sampleRate;
-            temp.close();
-            return sampleRate;
-        } catch(e) {
-            return 0;
-        }
-    }, 0);  /* Must pass in a dummy argument for C99 compatibility. */
-
-    if (pDeviceInfo->nativeDataFormats[0].sampleRate == 0) {
-        return MA_NO_DEVICE;
-    }
-
-    pDeviceInfo->nativeDataFormatCount = 1;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_uninit__webaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    #if defined(MA_USE_AUDIO_WORKLETS)
-    {
-        EM_ASM({
-            var device = miniaudio.get_device_by_index($0);
-
-            if (device.streamNode !== undefined) {
-                device.streamNode.disconnect();
-                device.streamNode = undefined;
-            }
-        }, pDevice->webaudio.deviceIndex);
-
-        emscripten_destroy_web_audio_node(pDevice->webaudio.audioWorklet);
-        emscripten_destroy_audio_context(pDevice->webaudio.audioContext);
-        ma_free(pDevice->webaudio.pStackBuffer, &pDevice->pContext->allocationCallbacks);
-    }
-    #else
-    {
-        EM_ASM({
-            var device = miniaudio.get_device_by_index($0);
-
-            /* Make sure all nodes are disconnected and marked for collection. */
-            if (device.scriptNode !== undefined) {
-                device.scriptNode.onaudioprocess = function(e) {};  /* We want to reset the callback to ensure it doesn't get called after AudioContext.close() has returned. Shouldn't happen since we're disconnecting, but just to be safe... */
-                device.scriptNode.disconnect();
-                device.scriptNode = undefined;
-            }
-
-            if (device.streamNode !== undefined) {
-                device.streamNode.disconnect();
-                device.streamNode = undefined;
-            }
-
-            /*
-            Stop the device. I think there is a chance the callback could get fired after calling this, hence why we want
-            to clear the callback before closing.
-            */
-            device.webaudio.close();
-            device.webaudio = undefined;
-            device.pDevice = undefined;
-        }, pDevice->webaudio.deviceIndex);
-    }
-    #endif
-
-    /* Clean up the device on the JS side. */
-    EM_ASM({
-        miniaudio.untrack_device_by_index($0);
-    }, pDevice->webaudio.deviceIndex);
-
-    ma_free(pDevice->webaudio.pIntermediaryBuffer, &pDevice->pContext->allocationCallbacks);
-
-    return MA_SUCCESS;
-}
-
-#if !defined(MA_USE_AUDIO_WORKLETS)
-static ma_uint32 ma_calculate_period_size_in_frames_from_descriptor__webaudio(const ma_device_descriptor* pDescriptor, ma_uint32 nativeSampleRate, ma_performance_profile performanceProfile)
-{
-    /*
-    There have been reports of the default buffer size being too small on some browsers. If we're using
-    the default buffer size, we'll make sure the period size is bigger than our standard defaults.
-    */
-    ma_uint32 periodSizeInFrames;
-
-    if (nativeSampleRate == 0) {
-        nativeSampleRate = MA_DEFAULT_SAMPLE_RATE;
-    }
-
-    if (pDescriptor->periodSizeInFrames == 0) {
-        if (pDescriptor->periodSizeInMilliseconds == 0) {
-            if (performanceProfile == ma_performance_profile_low_latency) {
-                periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(33, nativeSampleRate);  /* 1 frame @ 30 FPS */
-            } else {
-                periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(333, nativeSampleRate);
-            }
-        } else {
-            periodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(pDescriptor->periodSizeInMilliseconds, nativeSampleRate);
-        }
-    } else {
-        periodSizeInFrames = pDescriptor->periodSizeInFrames;
-    }
-
-    /* The size of the buffer must be a power of 2 and between 256 and 16384. */
-    if (periodSizeInFrames < 256) {
-        periodSizeInFrames = 256;
-    } else if (periodSizeInFrames > 16384) {
-        periodSizeInFrames = 16384;
-    } else {
-        periodSizeInFrames = ma_next_power_of_2(periodSizeInFrames);
-    }
-
-    return periodSizeInFrames;
-}
-#endif
-
-
-#if defined(MA_USE_AUDIO_WORKLETS)
-typedef struct
-{
-    ma_device* pDevice;
-    const ma_device_config* pConfig;
-    ma_device_descriptor* pDescriptorPlayback;
-    ma_device_descriptor* pDescriptorCapture;
-} ma_audio_worklet_thread_initialized_data;
-
-static EM_BOOL ma_audio_worklet_process_callback__webaudio(int inputCount, const AudioSampleFrame* pInputs, int outputCount, AudioSampleFrame* pOutputs, int paramCount, const AudioParamFrame* pParams, void* pUserData)
-{
-    ma_device* pDevice = (ma_device*)pUserData;
-    ma_uint32 frameCount;
-
-    (void)paramCount;
-    (void)pParams;
-
-    if (ma_device_get_state(pDevice) != ma_device_state_started) {
-        return EM_TRUE;
-    }
-
-    /*
-    The Emscripten documentation says that it'll always be 128 frames being passed in. Hard coding it like that feels
-    like a very bad idea to me. Even if it's hard coded in the backend, the API and documentation should always refer
-    to variables instead of a hard coded number. In any case, will follow along for the time being.
-
-    Unfortunately the audio data is not interleaved so we'll need to convert it before we give the data to miniaudio
-    for further processing.
-    */
-    frameCount = 128;
-
-    if (inputCount > 0) {
-        /* Input data needs to be interleaved before we hand it to the client. */
-        for (ma_uint32 iChannel = 0; iChannel < pDevice->capture.internalChannels; iChannel += 1) {
-            for (ma_uint32 iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                pDevice->webaudio.pIntermediaryBuffer[iFrame*pDevice->capture.internalChannels + iChannel] = pInputs[0].data[frameCount*iChannel + iFrame];
-            }
-        }
-
-        ma_device_process_pcm_frames_capture__webaudio(pDevice, frameCount, pDevice->webaudio.pIntermediaryBuffer);
-    }
-
-    if (outputCount > 0) {
-        /* If it's a capture-only device, we'll need to output silence. */
-        if (pDevice->type == ma_device_type_capture) {
-            MA_ZERO_MEMORY(pOutputs[0].data, frameCount * pDevice->playback.internalChannels * sizeof(float));
-        } else {
-            ma_device_process_pcm_frames_playback__webaudio(pDevice, frameCount, pDevice->webaudio.pIntermediaryBuffer);
-
-            /* We've read the data from the client. Now we need to deinterleave the buffer and output to the output buffer. */
-            for (ma_uint32 iChannel = 0; iChannel < pDevice->playback.internalChannels; iChannel += 1) {
-                for (ma_uint32 iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                    pOutputs[0].data[frameCount*iChannel + iFrame] = pDevice->webaudio.pIntermediaryBuffer[iFrame*pDevice->playback.internalChannels + iChannel];
-                }
-            }
-        }
-    }
-
-    return EM_TRUE;
-}
-
-
-static void ma_audio_worklet_processor_created__webaudio(EMSCRIPTEN_WEBAUDIO_T audioContext, EM_BOOL success, void* pUserData)
-{
-    ma_audio_worklet_thread_initialized_data* pParameters = (ma_audio_worklet_thread_initialized_data*)pUserData;
-    EmscriptenAudioWorkletNodeCreateOptions audioWorkletOptions;
-    int channels = 0;
-    size_t intermediaryBufferSizeInFrames;
-    int sampleRate;
-
-    if (success == EM_FALSE) {
-        pParameters->pDevice->webaudio.initResult = MA_ERROR;
-        ma_free(pParameters, &pParameters->pDevice->pContext->allocationCallbacks);
-        return;
-    }
-
-    /* The next step is to initialize the audio worklet node. */
-    MA_ZERO_OBJECT(&audioWorkletOptions);
-
-    /*
-    The way channel counts work with Web Audio is confusing. As far as I can tell, there's no way to know the channel
-    count from MediaStreamAudioSourceNode (what we use for capture)? The only way to have control is to configure an
-    output channel count on the capture side. This is slightly confusing for capture mode because intuitively you
-    wouldn't actually connect an output to an input-only node, but this is what we'll have to do in order to have
-    proper control over the channel count. In the capture case, we'll have to output silence to it's output node.
-    */
-    if (pParameters->pConfig->deviceType == ma_device_type_capture) {
-        channels = (int)((pParameters->pDescriptorCapture->channels > 0) ? pParameters->pDescriptorCapture->channels : MA_DEFAULT_CHANNELS);
-        audioWorkletOptions.numberOfInputs = 1;
-    } else {
-        channels = (int)((pParameters->pDescriptorPlayback->channels > 0) ? pParameters->pDescriptorPlayback->channels : MA_DEFAULT_CHANNELS);
-
-        if (pParameters->pConfig->deviceType == ma_device_type_duplex) {
-            audioWorkletOptions.numberOfInputs = 1;
-        } else {
-            audioWorkletOptions.numberOfInputs = 0;
-        }
-    }
-
-    audioWorkletOptions.numberOfOutputs = 1;
-    audioWorkletOptions.outputChannelCounts = &channels;
-
-
-    /*
-    Now that we know the channel count to use we can allocate the intermediary buffer. The
-    intermediary buffer is used for interleaving and deinterleaving.
-    */
-    intermediaryBufferSizeInFrames = 128;
-
-    pParameters->pDevice->webaudio.pIntermediaryBuffer = (float*)ma_malloc(intermediaryBufferSizeInFrames * (ma_uint32)channels * sizeof(float), &pParameters->pDevice->pContext->allocationCallbacks);
-    if (pParameters->pDevice->webaudio.pIntermediaryBuffer == NULL) {
-        pParameters->pDevice->webaudio.initResult = MA_OUT_OF_MEMORY;
-        ma_free(pParameters, &pParameters->pDevice->pContext->allocationCallbacks);
-        return;
-    }
-
-
-    pParameters->pDevice->webaudio.audioWorklet = emscripten_create_wasm_audio_worklet_node(audioContext, "miniaudio", &audioWorkletOptions, &ma_audio_worklet_process_callback__webaudio, pParameters->pDevice);
-
-    /* With the audio worklet initialized we can now attach it to the graph. */
-    if (pParameters->pConfig->deviceType == ma_device_type_capture || pParameters->pConfig->deviceType == ma_device_type_duplex) {
-        ma_result attachmentResult = (ma_result)EM_ASM_INT({
-            var getUserMediaResult = 0;
-            var audioWorklet = emscriptenGetAudioObject($0);
-            var audioContext = emscriptenGetAudioObject($1);
-
-            navigator.mediaDevices.getUserMedia({audio:true, video:false})
-                .then(function(stream) {
-                    audioContext.streamNode = audioContext.createMediaStreamSource(stream);
-                    audioContext.streamNode.connect(audioWorklet);
-                    audioWorklet.connect(audioContext.destination);
-                    getUserMediaResult = 0;   /* 0 = MA_SUCCESS */
-                })
-                .catch(function(error) {
-                    console.log("navigator.mediaDevices.getUserMedia Failed: " + error);
-                    getUserMediaResult = -1;  /* -1 = MA_ERROR */
-                });
-
-            return getUserMediaResult;
-        }, pParameters->pDevice->webaudio.audioWorklet, audioContext);
-
-        if (attachmentResult != MA_SUCCESS) {
-            ma_log_postf(ma_device_get_log(pParameters->pDevice), MA_LOG_LEVEL_ERROR, "Web Audio: Failed to connect capture node.");
-            emscripten_destroy_web_audio_node(pParameters->pDevice->webaudio.audioWorklet);
-            pParameters->pDevice->webaudio.initResult = attachmentResult;
-            ma_free(pParameters, &pParameters->pDevice->pContext->allocationCallbacks);
-            return;
-        }
-    }
-
-    /* If it's playback only we can now attach the worklet node to the graph. This has already been done for the duplex case. */
-    if (pParameters->pConfig->deviceType == ma_device_type_playback) {
-        ma_result attachmentResult = (ma_result)EM_ASM_INT({
-            var audioWorklet = emscriptenGetAudioObject($0);
-            var audioContext = emscriptenGetAudioObject($1);
-            audioWorklet.connect(audioContext.destination);
-            return 0;   /* 0 = MA_SUCCESS */
-        }, pParameters->pDevice->webaudio.audioWorklet, audioContext);
-
-        if (attachmentResult != MA_SUCCESS) {
-            ma_log_postf(ma_device_get_log(pParameters->pDevice), MA_LOG_LEVEL_ERROR, "Web Audio: Failed to connect playback node.");
-            pParameters->pDevice->webaudio.initResult = attachmentResult;
-            ma_free(pParameters, &pParameters->pDevice->pContext->allocationCallbacks);
-            return;
-        }
-    }
-
-    /* We need to update the descriptors so that they reflect the internal data format. Both capture and playback should be the same. */
-    sampleRate = EM_ASM_INT({ return emscriptenGetAudioObject($0).sampleRate; }, audioContext);
-
-    if (pParameters->pDescriptorCapture != NULL) {
-        pParameters->pDescriptorCapture->format              = ma_format_f32;
-        pParameters->pDescriptorCapture->channels            = (ma_uint32)channels;
-        pParameters->pDescriptorCapture->sampleRate          = (ma_uint32)sampleRate;
-        ma_channel_map_init_standard(ma_standard_channel_map_webaudio, pParameters->pDescriptorCapture->channelMap, ma_countof(pParameters->pDescriptorCapture->channelMap), pParameters->pDescriptorCapture->channels);
-        pParameters->pDescriptorCapture->periodSizeInFrames  = intermediaryBufferSizeInFrames;
-        pParameters->pDescriptorCapture->periodCount         = 1;
-    }
-
-    if (pParameters->pDescriptorPlayback != NULL) {
-        pParameters->pDescriptorPlayback->format             = ma_format_f32;
-        pParameters->pDescriptorPlayback->channels           = (ma_uint32)channels;
-        pParameters->pDescriptorPlayback->sampleRate         = (ma_uint32)sampleRate;
-        ma_channel_map_init_standard(ma_standard_channel_map_webaudio, pParameters->pDescriptorPlayback->channelMap, ma_countof(pParameters->pDescriptorPlayback->channelMap), pParameters->pDescriptorPlayback->channels);
-        pParameters->pDescriptorPlayback->periodSizeInFrames = intermediaryBufferSizeInFrames;
-        pParameters->pDescriptorPlayback->periodCount        = 1;
-    }
-
-    /* At this point we're done and we can return. */
-    ma_log_postf(ma_device_get_log(pParameters->pDevice), MA_LOG_LEVEL_DEBUG, "AudioWorklets: Created worklet node: %d\n", pParameters->pDevice->webaudio.audioWorklet);
-    pParameters->pDevice->webaudio.initResult = MA_SUCCESS;
-    ma_free(pParameters, &pParameters->pDevice->pContext->allocationCallbacks);
-}
-
-static void ma_audio_worklet_thread_initialized__webaudio(EMSCRIPTEN_WEBAUDIO_T audioContext, EM_BOOL success, void* pUserData)
-{
-    ma_audio_worklet_thread_initialized_data* pParameters = (ma_audio_worklet_thread_initialized_data*)pUserData;
-    WebAudioWorkletProcessorCreateOptions workletProcessorOptions;
-
-    MA_ASSERT(pParameters != NULL);
-
-    if (success == EM_FALSE) {
-        pParameters->pDevice->webaudio.initResult = MA_ERROR;
-        return;
-    }
-
-    MA_ZERO_OBJECT(&workletProcessorOptions);
-    workletProcessorOptions.name = "miniaudio"; /* I'm not entirely sure what to call this. Does this need to be globally unique, or does it need only be unique for a given AudioContext? */
-
-    emscripten_create_wasm_audio_worklet_processor_async(audioContext, &workletProcessorOptions, ma_audio_worklet_processor_created__webaudio, pParameters);
-}
-#endif
-
-static ma_result ma_device_init__webaudio(ma_device* pDevice, const ma_device_config* pConfig, ma_device_descriptor* pDescriptorPlayback, ma_device_descriptor* pDescriptorCapture)
-{
-    if (pConfig->deviceType == ma_device_type_loopback) {
-        return MA_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-    /* No exclusive mode with Web Audio. */
-    if (((pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) && pDescriptorPlayback->shareMode == ma_share_mode_exclusive) ||
-        ((pConfig->deviceType == ma_device_type_capture  || pConfig->deviceType == ma_device_type_duplex) && pDescriptorCapture->shareMode  == ma_share_mode_exclusive)) {
-        return MA_SHARE_MODE_NOT_SUPPORTED;
-    }
-
-    /*
-    With AudioWorklets we'll have just a single AudioContext. I'm not sure why I'm not doing this for ScriptProcessorNode so
-    it might be worthwhile to look into that as well.
-    */
-    #if defined(MA_USE_AUDIO_WORKLETS)
-    {
-        EmscriptenWebAudioCreateAttributes audioContextAttributes;
-        ma_audio_worklet_thread_initialized_data* pInitParameters;
-        void* pStackBuffer;
-
-        if (pConfig->performanceProfile == ma_performance_profile_conservative) {
-            audioContextAttributes.latencyHint = MA_WEBAUDIO_LATENCY_HINT_PLAYBACK;
-        } else {
-            audioContextAttributes.latencyHint = MA_WEBAUDIO_LATENCY_HINT_INTERACTIVE;
-        }
-
-        /*
-        In my testing, Firefox does not seem to capture audio data properly if the sample rate is set
-        to anything other than 48K. This does not seem to be the case for other browsers. For this reason,
-        if the device type is anything other than playback, we'll leave the sample rate as-is and let the
-        browser pick the appropriate rate for us.
-        */
-        if (pConfig->deviceType == ma_device_type_playback) {
-            audioContextAttributes.sampleRate = pDescriptorPlayback->sampleRate;
-        } else {
-            audioContextAttributes.sampleRate = 0;
-        }
-
-        /* It's not clear if this can return an error. None of the tests in the Emscripten repository check for this, so neither am I for now. */
-        pDevice->webaudio.audioContext = emscripten_create_audio_context(&audioContextAttributes);
-
-
-        /*
-        With the context created we can now create the worklet. We can only have a single worklet per audio
-        context which means we'll need to craft this appropriately to handle duplex devices correctly.
-        */
-
-        /*
-        We now need to create a worker thread. This is a bit weird because we need to allocate our
-        own buffer for the thread's stack. The stack needs to be aligned to 16 bytes. I'm going to
-        allocate this on the heap to keep it simple.
-        */
-        pStackBuffer = ma_aligned_malloc(MA_AUDIO_WORKLETS_THREAD_STACK_SIZE, 16, &pDevice->pContext->allocationCallbacks);
-        if (pStackBuffer == NULL) {
-            emscripten_destroy_audio_context(pDevice->webaudio.audioContext);
-            return MA_OUT_OF_MEMORY;
-        }
-
-        /* Our thread initialization parameters need to be allocated on the heap so they don't go out of scope. */
-        pInitParameters = (ma_audio_worklet_thread_initialized_data*)ma_malloc(sizeof(*pInitParameters), &pDevice->pContext->allocationCallbacks);
-        if (pInitParameters == NULL) {
-            ma_free(pStackBuffer, &pDevice->pContext->allocationCallbacks);
-            emscripten_destroy_audio_context(pDevice->webaudio.audioContext);
-            return MA_OUT_OF_MEMORY;
-        }
-
-        pInitParameters->pDevice = pDevice;
-        pInitParameters->pConfig = pConfig;
-        pInitParameters->pDescriptorPlayback = pDescriptorPlayback;
-        pInitParameters->pDescriptorCapture  = pDescriptorCapture;
-
-        /*
-        We need to flag the device as not yet initialized so we can wait on it later. Unfortunately all of
-        the Emscripten WebAudio stuff is asynchronous.
-        */
-        pDevice->webaudio.initResult = MA_BUSY;
-        {
-            emscripten_start_wasm_audio_worklet_thread_async(pDevice->webaudio.audioContext, pStackBuffer, MA_AUDIO_WORKLETS_THREAD_STACK_SIZE, ma_audio_worklet_thread_initialized__webaudio, pInitParameters);
-        }
-        while (pDevice->webaudio.initResult == MA_BUSY) { emscripten_sleep(1); }    /* We must wait for initialization to complete. We're just spinning here. The emscripten_sleep() call is why we need to build with `-sASYNCIFY`. */
-
-        /* Initialization is now complete. Descriptors were updated when the worklet was initialized. */
-        if (pDevice->webaudio.initResult != MA_SUCCESS) {
-            ma_free(pStackBuffer, &pDevice->pContext->allocationCallbacks);
-            emscripten_destroy_audio_context(pDevice->webaudio.audioContext);
-            return pDevice->webaudio.initResult;
-        }
-
-        /* We need to add an entry to the miniaudio.devices list on the JS side so we can do some JS/C interop. */
-        pDevice->webaudio.deviceIndex = EM_ASM_INT({
-            return miniaudio.track_device({
-                webaudio: emscriptenGetAudioObject($0),
-                state:    1 /* 1 = ma_device_state_stopped */
-            });
-        }, pDevice->webaudio.audioContext);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* ScriptProcessorNode. This path requires us to do almost everything in JS, but we'll do as much as we can in C. */
-        ma_uint32 deviceIndex;
-        ma_uint32 channels;
-        ma_uint32 sampleRate;
-        ma_uint32 periodSizeInFrames;
-
-        /* The channel count will depend on the device type. If it's a capture, use it's, otherwise use the playback side. */
-        if (pConfig->deviceType == ma_device_type_capture) {
-            channels = (pDescriptorCapture->channels  > 0) ? pDescriptorCapture->channels  : MA_DEFAULT_CHANNELS;
-        } else {
-            channels = (pDescriptorPlayback->channels > 0) ? pDescriptorPlayback->channels : MA_DEFAULT_CHANNELS;
-        }
-
-        /*
-        When testing in Firefox, I've seen it where capture mode fails if the sample rate is changed to anything other than it's
-        native rate. For this reason we're leaving the sample rate untouched for capture devices.
-        */
-        if (pConfig->deviceType == ma_device_type_playback) {
-            sampleRate = pDescriptorPlayback->sampleRate;
-        } else {
-            sampleRate = 0; /* Let the browser decide when capturing. */
-        }
-
-        /* The period size needs to be a power of 2. */
-        if (pConfig->deviceType == ma_device_type_capture) {
-            periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__webaudio(pDescriptorCapture, sampleRate, pConfig->performanceProfile);
-        } else {
-            periodSizeInFrames = ma_calculate_period_size_in_frames_from_descriptor__webaudio(pDescriptorPlayback, sampleRate, pConfig->performanceProfile);
-        }
-
-        /* We need an intermediary buffer for doing interleaving and deinterleaving. */
-        pDevice->webaudio.pIntermediaryBuffer = (float*)ma_malloc(periodSizeInFrames * channels * sizeof(float), &pDevice->pContext->allocationCallbacks);
-        if (pDevice->webaudio.pIntermediaryBuffer == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        deviceIndex = EM_ASM_INT({
-            var deviceType = $0;
-            var channels   = $1;
-            var sampleRate = $2;
-            var bufferSize = $3;
-            var pIntermediaryBuffer = $4;
-            var pDevice    = $5;
-
-            if (typeof(window.miniaudio) === 'undefined') {
-                return -1;  /* Context not initialized. */
-            }
-
-            var device = {};
-
-            /* First thing we need is an AudioContext. */
-            var audioContextOptions = {};
-            if (deviceType == window.miniaudio.device_type.playback && sampleRate != 0) {
-                audioContextOptions.sampleRate = sampleRate;
-            }
-
-            device.webaudio = new (window.AudioContext || window.webkitAudioContext)(audioContextOptions);
-            device.webaudio.suspend();  /* The AudioContext must be created in a suspended state. */
-            device.state = window.miniaudio.device_state.stopped;
-
-            /*
-            We need to create a ScriptProcessorNode. The channel situation is the same as the AudioWorklet path in that we
-            need to specify an output and configure the channel count there.
-            */
-            var channelCountIn  = 0;
-            var channelCountOut = channels;
-            if (deviceType != window.miniaudio.device_type.playback) {
-                channelCountIn  = channels;
-            }
-
-            device.scriptNode = device.webaudio.createScriptProcessor(bufferSize, channelCountIn, channelCountOut);
-
-            /* The node processing callback. */
-            device.scriptNode.onaudioprocess = function(e) {
-                if (device.intermediaryBufferView == null || device.intermediaryBufferView.length == 0) {
-                    device.intermediaryBufferView = new Float32Array(Module.HEAPF32.buffer, pIntermediaryBuffer, bufferSize * channels);
-                }
-
-                /* Do the capture side first. */
-                if (deviceType == miniaudio.device_type.capture || deviceType == miniaudio.device_type.duplex) {
-                    /* The data must be interleaved before being processed miniaudio. */
-                    for (var iChannel = 0; iChannel < channels; iChannel += 1) {
-                        var inputBuffer = e.inputBuffer.getChannelData(iChannel);
-                        var intermediaryBuffer = device.intermediaryBufferView;
-
-                        for (var iFrame = 0; iFrame < bufferSize; iFrame += 1) {
-                            intermediaryBuffer[iFrame*channels + iChannel] = inputBuffer[iFrame];
-                        }
-                    }
-
-                    _ma_device_process_pcm_frames_capture__webaudio(pDevice, bufferSize, pIntermediaryBuffer);
-                }
-
-                if (deviceType == miniaudio.device_type.playback || deviceType == miniaudio.device_type.duplex) {
-                    _ma_device_process_pcm_frames_playback__webaudio(pDevice, bufferSize, pIntermediaryBuffer);
-
-                    for (var iChannel = 0; iChannel < e.outputBuffer.numberOfChannels; ++iChannel) {
-                        var outputBuffer = e.outputBuffer.getChannelData(iChannel);
-                        var intermediaryBuffer = device.intermediaryBufferView;
-
-                        for (var iFrame = 0; iFrame < bufferSize; iFrame += 1) {
-                            outputBuffer[iFrame] = intermediaryBuffer[iFrame*channels + iChannel];
-                        }
-                    }
-                } else {
-                    /* It's a capture-only device. Make sure the output is silenced. */
-                    for (var iChannel = 0; iChannel < e.outputBuffer.numberOfChannels; ++iChannel) {
-                        e.outputBuffer.getChannelData(iChannel).fill(0.0);
-                    }
-                }
-            };
-
-            /* Now we need to connect our node to the graph. */
-            if (deviceType == miniaudio.device_type.capture || deviceType == miniaudio.device_type.duplex) {
-                navigator.mediaDevices.getUserMedia({audio:true, video:false})
-                    .then(function(stream) {
-                        device.streamNode = device.webaudio.createMediaStreamSource(stream);
-                        device.streamNode.connect(device.scriptNode);
-                        device.scriptNode.connect(device.webaudio.destination);
-                    })
-                    .catch(function(error) {
-                        console.log("Failed to get user media: " + error);
-                    });
-            }
-
-            if (deviceType == miniaudio.device_type.playback) {
-                device.scriptNode.connect(device.webaudio.destination);
-            }
-
-            device.pDevice = pDevice;
-
-            return miniaudio.track_device(device);
-        }, pConfig->deviceType, channels, sampleRate, periodSizeInFrames, pDevice->webaudio.pIntermediaryBuffer, pDevice);
-
-        if (deviceIndex < 0) {
-            return MA_FAILED_TO_OPEN_BACKEND_DEVICE;
-        }
-
-        pDevice->webaudio.deviceIndex = deviceIndex;
-
-        /* Grab the sample rate from the audio context directly. */
-        sampleRate = (ma_uint32)EM_ASM_INT({ return miniaudio.get_device_by_index($0).webaudio.sampleRate; }, deviceIndex);
-
-        if (pDescriptorCapture != NULL) {
-            pDescriptorCapture->format              = ma_format_f32;
-            pDescriptorCapture->channels            = channels;
-            pDescriptorCapture->sampleRate          = sampleRate;
-            ma_channel_map_init_standard(ma_standard_channel_map_webaudio, pDescriptorCapture->channelMap, ma_countof(pDescriptorCapture->channelMap), pDescriptorCapture->channels);
-            pDescriptorCapture->periodSizeInFrames  = periodSizeInFrames;
-            pDescriptorCapture->periodCount         = 1;
-        }
-
-        if (pDescriptorPlayback != NULL) {
-            pDescriptorPlayback->format             = ma_format_f32;
-            pDescriptorPlayback->channels           = channels;
-            pDescriptorPlayback->sampleRate         = sampleRate;
-            ma_channel_map_init_standard(ma_standard_channel_map_webaudio, pDescriptorPlayback->channelMap, ma_countof(pDescriptorPlayback->channelMap), pDescriptorPlayback->channels);
-            pDescriptorPlayback->periodSizeInFrames = periodSizeInFrames;
-            pDescriptorPlayback->periodCount        = 1;
-        }
-
-        return MA_SUCCESS;
-    }
-    #endif
-}
-
-static ma_result ma_device_start__webaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    EM_ASM({
-        var device = miniaudio.get_device_by_index($0);
-        device.webaudio.resume();
-        device.state = miniaudio.device_state.started;
-    }, pDevice->webaudio.deviceIndex);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_device_stop__webaudio(ma_device* pDevice)
-{
-    MA_ASSERT(pDevice != NULL);
-
-    /*
-    From the WebAudio API documentation for AudioContext.suspend():
-
-        Suspends the progression of AudioContext's currentTime, allows any current context processing blocks that are already processed to be played to the
-        destination, and then allows the system to release its claim on audio hardware.
-
-    I read this to mean that "any current context processing blocks" are processed by suspend() - i.e. They they are drained. We therefore shouldn't need to
-    do any kind of explicit draining.
-    */
-    EM_ASM({
-        var device = miniaudio.get_device_by_index($0);
-        device.webaudio.suspend();
-        device.state = miniaudio.device_state.stopped;
-    }, pDevice->webaudio.deviceIndex);
-
-    ma_device__on_notification_stopped(pDevice);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_uninit__webaudio(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-    MA_ASSERT(pContext->backend == ma_backend_webaudio);
-
-    (void)pContext; /* Unused. */
-
-    /* Remove the global miniaudio object from window if there are no more references to it. */
-    EM_ASM({
-        if (typeof(window.miniaudio) !== 'undefined') {
-            window.miniaudio.referenceCount -= 1;
-            if (window.miniaudio.referenceCount === 0) {
-                delete window.miniaudio;
-            }
-        }
-    });
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init__webaudio(ma_context* pContext, const ma_context_config* pConfig, ma_backend_callbacks* pCallbacks)
-{
-    int resultFromJS;
-
-    MA_ASSERT(pContext != NULL);
-
-    (void)pConfig; /* Unused. */
-
-    /* Here is where our global JavaScript object is initialized. */
-    resultFromJS = EM_ASM_INT({
-        if (typeof window === 'undefined' || (window.AudioContext || window.webkitAudioContext) === undefined) {
-            return 0;   /* Web Audio not supported. */
-        }
-
-        if (typeof(window.miniaudio) === 'undefined') {
-            window.miniaudio = {
-                referenceCount: 0
-            };
-
-            /* Device types. */
-            window.miniaudio.device_type = {};
-            window.miniaudio.device_type.playback = $0;
-            window.miniaudio.device_type.capture  = $1;
-            window.miniaudio.device_type.duplex   = $2;
-
-            /* Device states. */
-            window.miniaudio.device_state = {};
-            window.miniaudio.device_state.stopped = $3;
-            window.miniaudio.device_state.started = $4;
-
-            /* Device cache for mapping devices to indexes for JavaScript/C interop. */
-            miniaudio.devices = [];
-
-            miniaudio.track_device = function(device) {
-                /* Try inserting into a free slot first. */
-                for (var iDevice = 0; iDevice < miniaudio.devices.length; ++iDevice) {
-                    if (miniaudio.devices[iDevice] == null) {
-                        miniaudio.devices[iDevice] = device;
-                        return iDevice;
-                    }
-                }
-
-                /* Getting here means there is no empty slots in the array so we just push to the end. */
-                miniaudio.devices.push(device);
-                return miniaudio.devices.length - 1;
-            };
-
-            miniaudio.untrack_device_by_index = function(deviceIndex) {
-                /* We just set the device's slot to null. The slot will get reused in the next call to ma_track_device. */
-                miniaudio.devices[deviceIndex] = null;
-
-                /* Trim the array if possible. */
-                while (miniaudio.devices.length > 0) {
-                    if (miniaudio.devices[miniaudio.devices.length-1] == null) {
-                        miniaudio.devices.pop();
-                    } else {
-                        break;
-                    }
-                }
-            };
-
-            miniaudio.untrack_device = function(device) {
-                for (var iDevice = 0; iDevice < miniaudio.devices.length; ++iDevice) {
-                    if (miniaudio.devices[iDevice] == device) {
-                        return miniaudio.untrack_device_by_index(iDevice);
-                    }
-                }
-            };
-
-            miniaudio.get_device_by_index = function(deviceIndex) {
-                return miniaudio.devices[deviceIndex];
-            };
-
-            miniaudio.unlock_event_types = (function(){
-                return ['touchend', 'click'];
-            })();
-
-            miniaudio.unlock = function() {
-                for(var i = 0; i < miniaudio.devices.length; ++i) {
-                    var device = miniaudio.devices[i];
-                    if (device != null &&
-                        device.webaudio != null &&
-                        device.state === window.miniaudio.device_state.started) {
-
-                        device.webaudio.resume().then(() => {
-                                Module._ma_device__on_notification_unlocked(device.pDevice);
-                            },
-                            (error) => {console.error("Failed to resume audiocontext", error);
-                            });
-                    }
-                }
-                miniaudio.unlock_event_types.map(function(event_type) {
-                    document.removeEventListener(event_type, miniaudio.unlock, true);
-                });
-            };
-
-            miniaudio.unlock_event_types.map(function(event_type) {
-                document.addEventListener(event_type, miniaudio.unlock, true);
-            });
-        }
-
-        window.miniaudio.referenceCount += 1;
-
-        return 1;
-    }, ma_device_type_playback, ma_device_type_capture, ma_device_type_duplex, ma_device_state_stopped, ma_device_state_started);
-
-    if (resultFromJS != 1) {
-        return MA_FAILED_TO_INIT_BACKEND;
-    }
-
-    pCallbacks->onContextInit             = ma_context_init__webaudio;
-    pCallbacks->onContextUninit           = ma_context_uninit__webaudio;
-    pCallbacks->onContextEnumerateDevices = ma_context_enumerate_devices__webaudio;
-    pCallbacks->onContextGetDeviceInfo    = ma_context_get_device_info__webaudio;
-    pCallbacks->onDeviceInit              = ma_device_init__webaudio;
-    pCallbacks->onDeviceUninit            = ma_device_uninit__webaudio;
-    pCallbacks->onDeviceStart             = ma_device_start__webaudio;
-    pCallbacks->onDeviceStop              = ma_device_stop__webaudio;
-    pCallbacks->onDeviceRead              = NULL;   /* Not needed because WebAudio is asynchronous. */
-    pCallbacks->onDeviceWrite             = NULL;   /* Not needed because WebAudio is asynchronous. */
-    pCallbacks->onDeviceDataLoop          = NULL;   /* Not needed because WebAudio is asynchronous. */
-
-    return MA_SUCCESS;
-}
-#endif  /* Web Audio */
-
-
-
-static ma_bool32 ma__is_channel_map_valid(const ma_channel* pChannelMap, ma_uint32 channels)
-{
-    /* A blank channel map should be allowed, in which case it should use an appropriate default which will depend on context. */
-    if (pChannelMap != NULL && pChannelMap[0] != MA_CHANNEL_NONE) {
-        ma_uint32 iChannel;
-
-        if (channels == 0 || channels > MA_MAX_CHANNELS) {
-            return MA_FALSE;   /* Channel count out of range. */
-        }
-
-        /* A channel cannot be present in the channel map more than once. */
-        for (iChannel = 0; iChannel < channels; ++iChannel) {
-            ma_uint32 jChannel;
-            for (jChannel = iChannel + 1; jChannel < channels; ++jChannel) {
-                if (pChannelMap[iChannel] == pChannelMap[jChannel]) {
-                    return MA_FALSE;
-                }
-            }
-        }
-    }
-
-    return MA_TRUE;
-}
-
-
-static ma_bool32 ma_context_is_backend_asynchronous(ma_context* pContext)
-{
-    MA_ASSERT(pContext != NULL);
-
-    if (pContext->callbacks.onDeviceRead == NULL && pContext->callbacks.onDeviceWrite == NULL) {
-        if (pContext->callbacks.onDeviceDataLoop == NULL) {
-            return MA_TRUE;
-        } else {
-            return MA_FALSE;
-        }
-    } else {
-        return MA_FALSE;
-    }
-}
-
-
-static ma_result ma_device__post_init_setup(ma_device* pDevice, ma_device_type deviceType)
-{
-    ma_result result;
-
-    MA_ASSERT(pDevice != NULL);
-
-    if (deviceType == ma_device_type_capture || deviceType == ma_device_type_duplex || deviceType == ma_device_type_loopback) {
-        if (pDevice->capture.format == ma_format_unknown) {
-            pDevice->capture.format = pDevice->capture.internalFormat;
-        }
-        if (pDevice->capture.channels == 0) {
-            pDevice->capture.channels = pDevice->capture.internalChannels;
-        }
-        if (pDevice->capture.channelMap[0] == MA_CHANNEL_NONE) {
-            MA_ASSERT(pDevice->capture.channels <= MA_MAX_CHANNELS);
-            if (pDevice->capture.internalChannels == pDevice->capture.channels) {
-                ma_channel_map_copy(pDevice->capture.channelMap, pDevice->capture.internalChannelMap, pDevice->capture.channels);
-            } else {
-                if (pDevice->capture.channelMixMode == ma_channel_mix_mode_simple) {
-                    ma_channel_map_init_blank(pDevice->capture.channelMap, pDevice->capture.channels);
-                } else {
-                    ma_channel_map_init_standard(ma_standard_channel_map_default, pDevice->capture.channelMap, ma_countof(pDevice->capture.channelMap), pDevice->capture.channels);
-                }
-            }
-        }
-    }
-
-    if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-        if (pDevice->playback.format == ma_format_unknown) {
-            pDevice->playback.format = pDevice->playback.internalFormat;
-        }
-        if (pDevice->playback.channels == 0) {
-            pDevice->playback.channels = pDevice->playback.internalChannels;
-        }
-        if (pDevice->playback.channelMap[0] == MA_CHANNEL_NONE) {
-            MA_ASSERT(pDevice->playback.channels <= MA_MAX_CHANNELS);
-            if (pDevice->playback.internalChannels == pDevice->playback.channels) {
-                ma_channel_map_copy(pDevice->playback.channelMap, pDevice->playback.internalChannelMap, pDevice->playback.channels);
-            } else {
-                if (pDevice->playback.channelMixMode == ma_channel_mix_mode_simple) {
-                    ma_channel_map_init_blank(pDevice->playback.channelMap, pDevice->playback.channels);
-                } else {
-                    ma_channel_map_init_standard(ma_standard_channel_map_default, pDevice->playback.channelMap, ma_countof(pDevice->playback.channelMap), pDevice->playback.channels);
-                }
-            }
-        }
-    }
-
-    if (pDevice->sampleRate == 0) {
-        if (deviceType == ma_device_type_capture || deviceType == ma_device_type_duplex || deviceType == ma_device_type_loopback) {
-            pDevice->sampleRate = pDevice->capture.internalSampleRate;
-        } else {
-            pDevice->sampleRate = pDevice->playback.internalSampleRate;
-        }
-    }
-
-    /* Data converters. */
-    if (deviceType == ma_device_type_capture || deviceType == ma_device_type_duplex || deviceType == ma_device_type_loopback) {
-        /* Converting from internal device format to client format. */
-        ma_data_converter_config converterConfig = ma_data_converter_config_init_default();
-        converterConfig.formatIn                        = pDevice->capture.internalFormat;
-        converterConfig.channelsIn                      = pDevice->capture.internalChannels;
-        converterConfig.sampleRateIn                    = pDevice->capture.internalSampleRate;
-        converterConfig.pChannelMapIn                   = pDevice->capture.internalChannelMap;
-        converterConfig.formatOut                       = pDevice->capture.format;
-        converterConfig.channelsOut                     = pDevice->capture.channels;
-        converterConfig.sampleRateOut                   = pDevice->sampleRate;
-        converterConfig.pChannelMapOut                  = pDevice->capture.channelMap;
-        converterConfig.channelMixMode                  = pDevice->capture.channelMixMode;
-        converterConfig.calculateLFEFromSpatialChannels = pDevice->capture.calculateLFEFromSpatialChannels;
-        converterConfig.allowDynamicSampleRate          = MA_FALSE;
-        converterConfig.resampling.algorithm            = pDevice->resampling.algorithm;
-        converterConfig.resampling.linear.lpfOrder      = pDevice->resampling.linear.lpfOrder;
-        converterConfig.resampling.pBackendVTable       = pDevice->resampling.pBackendVTable;
-        converterConfig.resampling.pBackendUserData     = pDevice->resampling.pBackendUserData;
-
-        /* Make sure the old converter is uninitialized first. */
-        if (ma_device_get_state(pDevice) != ma_device_state_uninitialized) {
-            ma_data_converter_uninit(&pDevice->capture.converter, &pDevice->pContext->allocationCallbacks);
-        }
-
-        result = ma_data_converter_init(&converterConfig, &pDevice->pContext->allocationCallbacks, &pDevice->capture.converter);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-        /* Converting from client format to device format. */
-        ma_data_converter_config converterConfig = ma_data_converter_config_init_default();
-        converterConfig.formatIn                        = pDevice->playback.format;
-        converterConfig.channelsIn                      = pDevice->playback.channels;
-        converterConfig.sampleRateIn                    = pDevice->sampleRate;
-        converterConfig.pChannelMapIn                   = pDevice->playback.channelMap;
-        converterConfig.formatOut                       = pDevice->playback.internalFormat;
-        converterConfig.channelsOut                     = pDevice->playback.internalChannels;
-        converterConfig.sampleRateOut                   = pDevice->playback.internalSampleRate;
-        converterConfig.pChannelMapOut                  = pDevice->playback.internalChannelMap;
-        converterConfig.channelMixMode                  = pDevice->playback.channelMixMode;
-        converterConfig.calculateLFEFromSpatialChannels = pDevice->playback.calculateLFEFromSpatialChannels;
-        converterConfig.allowDynamicSampleRate          = MA_FALSE;
-        converterConfig.resampling.algorithm            = pDevice->resampling.algorithm;
-        converterConfig.resampling.linear.lpfOrder      = pDevice->resampling.linear.lpfOrder;
-        converterConfig.resampling.pBackendVTable       = pDevice->resampling.pBackendVTable;
-        converterConfig.resampling.pBackendUserData     = pDevice->resampling.pBackendUserData;
-
-        /* Make sure the old converter is uninitialized first. */
-        if (ma_device_get_state(pDevice) != ma_device_state_uninitialized) {
-            ma_data_converter_uninit(&pDevice->playback.converter, &pDevice->pContext->allocationCallbacks);
-        }
-
-        result = ma_data_converter_init(&converterConfig, &pDevice->pContext->allocationCallbacks, &pDevice->playback.converter);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-
-    /*
-    If the device is doing playback (ma_device_type_playback or ma_device_type_duplex), there's
-    a couple of situations where we'll need a heap allocated cache.
-
-    The first is a duplex device for backends that use a callback for data delivery. The reason
-    this is needed is that the input stage needs to have a buffer to place the input data while it
-    waits for the playback stage, after which the miniaudio data callback will get fired. This is
-    not needed for backends that use a blocking API because miniaudio manages temporary buffers on
-    the stack to achieve this.
-
-    The other situation is when the data converter does not have the ability to query the number
-    of input frames that are required in order to process a given number of output frames. When
-    performing data conversion, it's useful if miniaudio know exactly how many frames it needs
-    from the client in order to generate a given number of output frames. This way, only exactly
-    the number of frames are needed to be read from the client which means no cache is necessary.
-    On the other hand, if miniaudio doesn't know how many frames to read, it is forced to read
-    in fixed sized chunks and then cache any residual unused input frames, those of which will be
-    processed at a later stage.
-    */
-    if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-        ma_uint64 unused;
-
-        pDevice->playback.inputCacheConsumed  = 0;
-        pDevice->playback.inputCacheRemaining = 0;
-
-        if (pDevice->type == ma_device_type_duplex ||                                                                       /* Duplex. backend may decide to use ma_device_handle_backend_data_callback() which will require this cache. */
-            ma_data_converter_get_required_input_frame_count(&pDevice->playback.converter, 1, &unused) != MA_SUCCESS)       /* Data conversion required input frame calculation not supported. */
-        {
-            /* We need a heap allocated cache. We want to size this based on the period size. */
-            void* pNewInputCache;
-            ma_uint64 newInputCacheCap;
-            ma_uint64 newInputCacheSizeInBytes;
-
-            newInputCacheCap = ma_calculate_frame_count_after_resampling(pDevice->playback.internalSampleRate, pDevice->sampleRate, pDevice->playback.internalPeriodSizeInFrames);
-
-            newInputCacheSizeInBytes = newInputCacheCap * ma_get_bytes_per_frame(pDevice->playback.format, pDevice->playback.channels);
-            if (newInputCacheSizeInBytes > MA_SIZE_MAX) {
-                ma_free(pDevice->playback.pInputCache, &pDevice->pContext->allocationCallbacks);
-                pDevice->playback.pInputCache   = NULL;
-                pDevice->playback.inputCacheCap = 0;
-                return MA_OUT_OF_MEMORY;    /* Allocation too big. Should never hit this, but makes the cast below safer for 32-bit builds. */
-            }
-
-            pNewInputCache = ma_realloc(pDevice->playback.pInputCache, (size_t)newInputCacheSizeInBytes, &pDevice->pContext->allocationCallbacks);
-            if (pNewInputCache == NULL) {
-                ma_free(pDevice->playback.pInputCache, &pDevice->pContext->allocationCallbacks);
-                pDevice->playback.pInputCache   = NULL;
-                pDevice->playback.inputCacheCap = 0;
-                return MA_OUT_OF_MEMORY;
-            }
-
-            pDevice->playback.pInputCache   = pNewInputCache;
-            pDevice->playback.inputCacheCap = newInputCacheCap;
-        } else {
-            /* Heap allocation not required. Make sure we clear out the old cache just in case this function was called in response to a route change. */
-            ma_free(pDevice->playback.pInputCache, &pDevice->pContext->allocationCallbacks);
-            pDevice->playback.pInputCache   = NULL;
-            pDevice->playback.inputCacheCap = 0;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_device_post_init(ma_device* pDevice, ma_device_type deviceType, const ma_device_descriptor* pDescriptorPlayback, const ma_device_descriptor* pDescriptorCapture)
-{
-    ma_result result;
-
-    if (pDevice == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Capture. */
-    if (deviceType == ma_device_type_capture || deviceType == ma_device_type_duplex || deviceType == ma_device_type_loopback) {
-        if (ma_device_descriptor_is_valid(pDescriptorCapture) == MA_FALSE) {
-            return MA_INVALID_ARGS;
-        }
-
-        pDevice->capture.internalFormat             = pDescriptorCapture->format;
-        pDevice->capture.internalChannels           = pDescriptorCapture->channels;
-        pDevice->capture.internalSampleRate         = pDescriptorCapture->sampleRate;
-        MA_COPY_MEMORY(pDevice->capture.internalChannelMap, pDescriptorCapture->channelMap, sizeof(pDescriptorCapture->channelMap));
-        pDevice->capture.internalPeriodSizeInFrames = pDescriptorCapture->periodSizeInFrames;
-        pDevice->capture.internalPeriods            = pDescriptorCapture->periodCount;
-
-        if (pDevice->capture.internalPeriodSizeInFrames == 0) {
-            pDevice->capture.internalPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(pDescriptorCapture->periodSizeInMilliseconds, pDescriptorCapture->sampleRate);
-        }
-    }
-
-    /* Playback. */
-    if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-        if (ma_device_descriptor_is_valid(pDescriptorPlayback) == MA_FALSE) {
-            return MA_INVALID_ARGS;
-        }
-
-        pDevice->playback.internalFormat             = pDescriptorPlayback->format;
-        pDevice->playback.internalChannels           = pDescriptorPlayback->channels;
-        pDevice->playback.internalSampleRate         = pDescriptorPlayback->sampleRate;
-        MA_COPY_MEMORY(pDevice->playback.internalChannelMap, pDescriptorPlayback->channelMap, sizeof(pDescriptorPlayback->channelMap));
-        pDevice->playback.internalPeriodSizeInFrames = pDescriptorPlayback->periodSizeInFrames;
-        pDevice->playback.internalPeriods            = pDescriptorPlayback->periodCount;
-
-        if (pDevice->playback.internalPeriodSizeInFrames == 0) {
-            pDevice->playback.internalPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(pDescriptorPlayback->periodSizeInMilliseconds, pDescriptorPlayback->sampleRate);
-        }
-    }
-
-    /*
-    The name of the device can be retrieved from device info. This may be temporary and replaced with a `ma_device_get_info(pDevice, deviceType)` instead.
-    For loopback devices, we need to retrieve the name of the playback device.
-    */
-    {
-        ma_device_info deviceInfo;
-
-        if (deviceType == ma_device_type_capture || deviceType == ma_device_type_duplex || deviceType == ma_device_type_loopback) {
-            result = ma_device_get_info(pDevice, (deviceType == ma_device_type_loopback) ? ma_device_type_playback : ma_device_type_capture, &deviceInfo);
-            if (result == MA_SUCCESS) {
-                ma_strncpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), deviceInfo.name, (size_t)-1);
-            } else {
-                /* We failed to retrieve the device info. Fall back to a default name. */
-                if (pDescriptorCapture->pDeviceID == NULL) {
-                    ma_strncpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-                } else {
-                    ma_strncpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), "Capture Device", (size_t)-1);
-                }
-            }
-        }
-
-        if (deviceType == ma_device_type_playback || deviceType == ma_device_type_duplex) {
-            result = ma_device_get_info(pDevice, ma_device_type_playback, &deviceInfo);
-            if (result == MA_SUCCESS) {
-                ma_strncpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), deviceInfo.name, (size_t)-1);
-            } else {
-                /* We failed to retrieve the device info. Fall back to a default name. */
-                if (pDescriptorPlayback->pDeviceID == NULL) {
-                    ma_strncpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-                } else {
-                    ma_strncpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), "Playback Device", (size_t)-1);
-                }
-            }
-        }
-    }
-
-    /* Update data conversion. */
-    return ma_device__post_init_setup(pDevice, deviceType); /* TODO: Should probably rename ma_device__post_init_setup() to something better. */
-}
-
-
-static ma_thread_result MA_THREADCALL ma_worker_thread(void* pData)
-{
-    ma_device* pDevice = (ma_device*)pData;
-#ifdef MA_WIN32
-    HRESULT CoInitializeResult;
-#endif
-
-    MA_ASSERT(pDevice != NULL);
-
-#ifdef MA_WIN32
-    CoInitializeResult = ma_CoInitializeEx(pDevice->pContext, NULL, MA_COINIT_VALUE);
-#endif
-
-    /*
-    When the device is being initialized it's initial state is set to ma_device_state_uninitialized. Before returning from
-    ma_device_init(), the state needs to be set to something valid. In miniaudio the device's default state immediately
-    after initialization is stopped, so therefore we need to mark the device as such. miniaudio will wait on the worker
-    thread to signal an event to know when the worker thread is ready for action.
-    */
-    ma_device__set_state(pDevice, ma_device_state_stopped);
-    ma_event_signal(&pDevice->stopEvent);
-
-    for (;;) {  /* <-- This loop just keeps the thread alive. The main audio loop is inside. */
-        ma_result startResult;
-        ma_result stopResult;   /* <-- This will store the result from onDeviceStop(). If it returns an error, we don't fire the stopped notification callback. */
-
-        /* We wait on an event to know when something has requested that the device be started and the main loop entered. */
-        ma_event_wait(&pDevice->wakeupEvent);
-
-        /* Default result code. */
-        pDevice->workResult = MA_SUCCESS;
-
-        /* If the reason for the wake up is that we are terminating, just break from the loop. */
-        if (ma_device_get_state(pDevice) == ma_device_state_uninitialized) {
-            break;
-        }
-
-        /*
-        Getting to this point means the device is wanting to get started. The function that has requested that the device
-        be started will be waiting on an event (pDevice->startEvent) which means we need to make sure we signal the event
-        in both the success and error case. It's important that the state of the device is set _before_ signaling the event.
-        */
-        MA_ASSERT(ma_device_get_state(pDevice) == ma_device_state_starting);
-
-        /* If the device has a start callback, start it now. */
-        if (pDevice->pContext->callbacks.onDeviceStart != NULL) {
-            startResult = pDevice->pContext->callbacks.onDeviceStart(pDevice);
-        } else {
-            startResult = MA_SUCCESS;
-        }
-
-        /*
-        If starting was not successful we'll need to loop back to the start and wait for something
-        to happen (pDevice->wakeupEvent).
-        */
-        if (startResult != MA_SUCCESS) {
-            pDevice->workResult = startResult;
-            ma_event_signal(&pDevice->startEvent);  /* <-- Always signal the start event so ma_device_start() can return as it'll be waiting on it. */
-            continue;
-        }
-
-        /* Make sure the state is set appropriately. */
-        ma_device__set_state(pDevice, ma_device_state_started); /* <-- Set this before signaling the event so that the state is always guaranteed to be good after ma_device_start() has returned. */
-        ma_event_signal(&pDevice->startEvent);
-
-        ma_device__on_notification_started(pDevice);
-
-        if (pDevice->pContext->callbacks.onDeviceDataLoop != NULL) {
-            pDevice->pContext->callbacks.onDeviceDataLoop(pDevice);
-        } else {
-            /* The backend is not using a custom main loop implementation, so now fall back to the blocking read-write implementation. */
-            ma_device_audio_thread__default_read_write(pDevice);
-        }
-
-        /* Getting here means we have broken from the main loop which happens the application has requested that device be stopped. */
-        if (pDevice->pContext->callbacks.onDeviceStop != NULL) {
-            stopResult = pDevice->pContext->callbacks.onDeviceStop(pDevice);
-        } else {
-            stopResult = MA_SUCCESS;    /* No stop callback with the backend. Just assume successful. */
-        }
-
-        /*
-        After the device has stopped, make sure an event is posted. Don't post a stopped event if
-        stopping failed. This can happen on some backends when the underlying stream has been
-        stopped due to the device being physically unplugged or disabled via an OS setting.
-        */
-        if (stopResult == MA_SUCCESS) {
-            ma_device__on_notification_stopped(pDevice);
-        }
-
-        /* If we stopped because the device has been uninitialized, abort now. */
-        if (ma_device_get_state(pDevice) == ma_device_state_uninitialized) {
-            break;
-        }
-
-        /* A function somewhere is waiting for the device to have stopped for real so we need to signal an event to allow it to continue. */
-        ma_device__set_state(pDevice, ma_device_state_stopped);
-        ma_event_signal(&pDevice->stopEvent);
-    }
-
-#ifdef MA_WIN32
-    if (CoInitializeResult == S_OK) {
-        ma_CoUninitialize(pDevice->pContext);
-    }
-#endif
-
-    return (ma_thread_result)0;
-}
-
-
-/* Helper for determining whether or not the given device is initialized. */
-static ma_bool32 ma_device__is_initialized(ma_device* pDevice)
-{
-    if (pDevice == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_device_get_state(pDevice) != ma_device_state_uninitialized;
-}
-
-
-#ifdef MA_WIN32
-static ma_result ma_context_uninit_backend_apis__win32(ma_context* pContext)
-{
-    /* For some reason UWP complains when CoUninitialize() is called. I'm just not going to call it on UWP. */
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    if (pContext->win32.CoInitializeResult == S_OK) {
-        ma_CoUninitialize(pContext);
-    }
-
-    #if defined(MA_WIN32_DESKTOP)
-        ma_dlclose(ma_context_get_log(pContext), pContext->win32.hUser32DLL);
-        ma_dlclose(ma_context_get_log(pContext), pContext->win32.hAdvapi32DLL);
-    #endif
-
-    ma_dlclose(ma_context_get_log(pContext), pContext->win32.hOle32DLL);
-#else
-    (void)pContext;
-#endif
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init_backend_apis__win32(ma_context* pContext)
-{
-#if defined(MA_WIN32_DESKTOP) || defined(MA_WIN32_GDK)
-    #if defined(MA_WIN32_DESKTOP)
-        /* User32.dll */
-        pContext->win32.hUser32DLL = ma_dlopen(ma_context_get_log(pContext), "user32.dll");
-        if (pContext->win32.hUser32DLL == NULL) {
-            return MA_FAILED_TO_INIT_BACKEND;
-        }
-
-        pContext->win32.GetForegroundWindow = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hUser32DLL, "GetForegroundWindow");
-        pContext->win32.GetDesktopWindow    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hUser32DLL, "GetDesktopWindow");
-
-
-        /* Advapi32.dll */
-        pContext->win32.hAdvapi32DLL = ma_dlopen(ma_context_get_log(pContext), "advapi32.dll");
-        if (pContext->win32.hAdvapi32DLL == NULL) {
-            return MA_FAILED_TO_INIT_BACKEND;
-        }
-
-        pContext->win32.RegOpenKeyExA    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hAdvapi32DLL, "RegOpenKeyExA");
-        pContext->win32.RegCloseKey      = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hAdvapi32DLL, "RegCloseKey");
-        pContext->win32.RegQueryValueExA = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hAdvapi32DLL, "RegQueryValueExA");
-    #endif
-
-    /* Ole32.dll */
-    pContext->win32.hOle32DLL = ma_dlopen(ma_context_get_log(pContext), "ole32.dll");
-    if (pContext->win32.hOle32DLL == NULL) {
-        return MA_FAILED_TO_INIT_BACKEND;
-    }
-
-    pContext->win32.CoInitialize     = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hOle32DLL, "CoInitialize");
-    pContext->win32.CoInitializeEx   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hOle32DLL, "CoInitializeEx");
-    pContext->win32.CoUninitialize   = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hOle32DLL, "CoUninitialize");
-    pContext->win32.CoCreateInstance = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hOle32DLL, "CoCreateInstance");
-    pContext->win32.CoTaskMemFree    = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hOle32DLL, "CoTaskMemFree");
-    pContext->win32.PropVariantClear = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hOle32DLL, "PropVariantClear");
-    pContext->win32.StringFromGUID2  = (ma_proc)ma_dlsym(ma_context_get_log(pContext), pContext->win32.hOle32DLL, "StringFromGUID2");
-#else
-    (void)pContext; /* Unused. */
-#endif
-
-    pContext->win32.CoInitializeResult = ma_CoInitializeEx(pContext, NULL, MA_COINIT_VALUE);
-    return MA_SUCCESS;
-}
-#else
-static ma_result ma_context_uninit_backend_apis__nix(ma_context* pContext)
-{
-    (void)pContext;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_context_init_backend_apis__nix(ma_context* pContext)
-{
-    (void)pContext;
-
-    return MA_SUCCESS;
-}
-#endif
-
-static ma_result ma_context_init_backend_apis(ma_context* pContext)
-{
-    ma_result result;
-#ifdef MA_WIN32
-    result = ma_context_init_backend_apis__win32(pContext);
-#else
-    result = ma_context_init_backend_apis__nix(pContext);
-#endif
-
-    return result;
-}
-
-static ma_result ma_context_uninit_backend_apis(ma_context* pContext)
-{
-    ma_result result;
-#ifdef MA_WIN32
-    result = ma_context_uninit_backend_apis__win32(pContext);
-#else
-    result = ma_context_uninit_backend_apis__nix(pContext);
-#endif
-
-    return result;
-}
-
-
-/* The default capacity doesn't need to be too big. */
-#ifndef MA_DEFAULT_DEVICE_JOB_QUEUE_CAPACITY
-#define MA_DEFAULT_DEVICE_JOB_QUEUE_CAPACITY    32
-#endif
-
-MA_API ma_device_job_thread_config ma_device_job_thread_config_init(void)
-{
-    ma_device_job_thread_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.noThread         = MA_FALSE;
-    config.jobQueueCapacity = MA_DEFAULT_DEVICE_JOB_QUEUE_CAPACITY;
-    config.jobQueueFlags    = 0;
-
-    return config;
-}
-
-
-static ma_thread_result MA_THREADCALL ma_device_job_thread_entry(void* pUserData)
-{
-    ma_device_job_thread* pJobThread = (ma_device_job_thread*)pUserData;
-    MA_ASSERT(pJobThread != NULL);
-
-    for (;;) {
-        ma_result result;
-        ma_job job;
-
-        result = ma_device_job_thread_next(pJobThread, &job);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        if (job.toc.breakup.code == MA_JOB_TYPE_QUIT) {
-            break;
-        }
-
-        ma_job_process(&job);
-    }
-
-    return (ma_thread_result)0;
-}
-
-MA_API ma_result ma_device_job_thread_init(const ma_device_job_thread_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_device_job_thread* pJobThread)
-{
-    ma_result result;
-    ma_job_queue_config jobQueueConfig;
-
-    if (pJobThread == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pJobThread);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-
-    /* Initialize the job queue before the thread to ensure it's in a valid state. */
-    jobQueueConfig = ma_job_queue_config_init(pConfig->jobQueueFlags, pConfig->jobQueueCapacity);
-
-    result = ma_job_queue_init(&jobQueueConfig, pAllocationCallbacks, &pJobThread->jobQueue);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize job queue. */
-    }
-
-
-    /* The thread needs to be initialized after the job queue to ensure the thread doesn't try to access it prematurely. */
-    if (pConfig->noThread == MA_FALSE) {
-        result = ma_thread_create(&pJobThread->thread, ma_thread_priority_normal, 0, ma_device_job_thread_entry, pJobThread, pAllocationCallbacks);
-        if (result != MA_SUCCESS) {
-            ma_job_queue_uninit(&pJobThread->jobQueue, pAllocationCallbacks);
-            return result;  /* Failed to create the job thread. */
-        }
-
-        pJobThread->_hasThread = MA_TRUE;
-    } else {
-        pJobThread->_hasThread = MA_FALSE;
-    }
-
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_device_job_thread_uninit(ma_device_job_thread* pJobThread, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pJobThread == NULL) {
-        return;
-    }
-
-    /* The first thing to do is post a quit message to the job queue. If we're using a thread we'll need to wait for it. */
-    {
-        ma_job job = ma_job_init(MA_JOB_TYPE_QUIT);
-        ma_device_job_thread_post(pJobThread, &job);
-    }
-
-    /* Wait for the thread to terminate naturally. */
-    if (pJobThread->_hasThread) {
-        ma_thread_wait(&pJobThread->thread);
-    }
-
-    /* At this point the thread should be terminated so we can safely uninitialize the job queue. */
-    ma_job_queue_uninit(&pJobThread->jobQueue, pAllocationCallbacks);
-}
-
-MA_API ma_result ma_device_job_thread_post(ma_device_job_thread* pJobThread, const ma_job* pJob)
-{
-    if (pJobThread == NULL || pJob == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_job_queue_post(&pJobThread->jobQueue, pJob);
-}
-
-MA_API ma_result ma_device_job_thread_next(ma_device_job_thread* pJobThread, ma_job* pJob)
-{
-    if (pJob == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pJob);
-
-    if (pJobThread == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_job_queue_next(&pJobThread->jobQueue, pJob);
-}
-
-
-
-MA_API ma_context_config ma_context_config_init(void)
-{
-    ma_context_config config;
-    MA_ZERO_OBJECT(&config);
-
-    return config;
-}
-
-MA_API ma_result ma_context_init(const ma_backend backends[], ma_uint32 backendCount, const ma_context_config* pConfig, ma_context* pContext)
-{
-    ma_result result;
-    ma_context_config defaultConfig;
-    ma_backend defaultBackends[ma_backend_null+1];
-    ma_uint32 iBackend;
-    ma_backend* pBackendsToIterate;
-    ma_uint32 backendsToIterateCount;
-
-    if (pContext == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pContext);
-
-    /* Always make sure the config is set first to ensure properties are available as soon as possible. */
-    if (pConfig == NULL) {
-        defaultConfig = ma_context_config_init();
-        pConfig = &defaultConfig;
-    }
-
-    /* Allocation callbacks need to come first because they'll be passed around to other areas. */
-    result = ma_allocation_callbacks_init_copy(&pContext->allocationCallbacks, &pConfig->allocationCallbacks);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* Get a lot set up first so we can start logging ASAP. */
-    if (pConfig->pLog != NULL) {
-        pContext->pLog = pConfig->pLog;
-    } else {
-        result = ma_log_init(&pContext->allocationCallbacks, &pContext->log);
-        if (result == MA_SUCCESS) {
-            pContext->pLog = &pContext->log;
-        } else {
-            pContext->pLog = NULL;  /* Logging is not available. */
-        }
-    }
-
-    pContext->threadPriority  = pConfig->threadPriority;
-    pContext->threadStackSize = pConfig->threadStackSize;
-    pContext->pUserData       = pConfig->pUserData;
-
-    /* Backend APIs need to be initialized first. This is where external libraries will be loaded and linked. */
-    result = ma_context_init_backend_apis(pContext);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    for (iBackend = 0; iBackend <= ma_backend_null; ++iBackend) {
-        defaultBackends[iBackend] = (ma_backend)iBackend;
-    }
-
-    pBackendsToIterate = (ma_backend*)backends;
-    backendsToIterateCount = backendCount;
-    if (pBackendsToIterate == NULL) {
-        pBackendsToIterate = (ma_backend*)defaultBackends;
-        backendsToIterateCount = ma_countof(defaultBackends);
-    }
-
-    MA_ASSERT(pBackendsToIterate != NULL);
-
-    for (iBackend = 0; iBackend < backendsToIterateCount; iBackend += 1) {
-        ma_backend backend = pBackendsToIterate[iBackend];
-
-        /* Make sure all callbacks are reset so we don't accidentally drag in any from previously failed initialization attempts. */
-        MA_ZERO_OBJECT(&pContext->callbacks);
-
-        /* These backends are using the new callback system. */
-        switch (backend) {
-        #ifdef MA_HAS_WASAPI
-            case ma_backend_wasapi:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__wasapi;
-            } break;
-        #endif
-        #ifdef MA_HAS_DSOUND
-            case ma_backend_dsound:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__dsound;
-            } break;
-        #endif
-        #ifdef MA_HAS_WINMM
-            case ma_backend_winmm:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__winmm;
-            } break;
-        #endif
-        #ifdef MA_HAS_COREAUDIO
-            case ma_backend_coreaudio:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__coreaudio;
-            } break;
-        #endif
-        #ifdef MA_HAS_SNDIO
-            case ma_backend_sndio:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__sndio;
-            } break;
-        #endif
-        #ifdef MA_HAS_AUDIO4
-            case ma_backend_audio4:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__audio4;
-            } break;
-        #endif
-        #ifdef MA_HAS_OSS
-            case ma_backend_oss:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__oss;
-            } break;
-        #endif
-        #ifdef MA_HAS_PULSEAUDIO
-            case ma_backend_pulseaudio:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__pulse;
-            } break;
-        #endif
-        #ifdef MA_HAS_ALSA
-            case ma_backend_alsa:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__alsa;
-            } break;
-        #endif
-        #ifdef MA_HAS_JACK
-            case ma_backend_jack:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__jack;
-            } break;
-        #endif
-        #ifdef MA_HAS_AAUDIO
-            case ma_backend_aaudio:
-            {
-                if (ma_is_backend_enabled(backend)) {
-                    pContext->callbacks.onContextInit = ma_context_init__aaudio;
-                }
-            } break;
-        #endif
-        #ifdef MA_HAS_OPENSL
-            case ma_backend_opensl:
-            {
-                if (ma_is_backend_enabled(backend)) {
-                    pContext->callbacks.onContextInit = ma_context_init__opensl;
-                }
-            } break;
-        #endif
-        #ifdef MA_HAS_WEBAUDIO
-            case ma_backend_webaudio:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__webaudio;
-            } break;
-        #endif
-        #ifdef MA_HAS_CUSTOM
-            case ma_backend_custom:
-            {
-                /* Slightly different logic for custom backends. Custom backends can optionally set all of their callbacks in the config. */
-                pContext->callbacks = pConfig->custom;
-            } break;
-        #endif
-        #ifdef MA_HAS_NULL
-            case ma_backend_null:
-            {
-                pContext->callbacks.onContextInit = ma_context_init__null;
-            } break;
-        #endif
-
-            default: break;
-        }
-
-        if (pContext->callbacks.onContextInit != NULL) {
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "Attempting to initialize %s backend...\n", ma_get_backend_name(backend));
-            result = pContext->callbacks.onContextInit(pContext, pConfig, &pContext->callbacks);
-        } else {
-            /* Getting here means the onContextInit callback is not set which means the backend is not enabled. Special case for the custom backend. */
-            if (backend != ma_backend_custom) {
-                result = MA_BACKEND_NOT_ENABLED;
-            } else {
-            #if !defined(MA_HAS_CUSTOM)
-                result = MA_BACKEND_NOT_ENABLED;
-            #else
-                result = MA_NO_BACKEND;
-            #endif
-            }
-        }
-
-        /* If this iteration was successful, return. */
-        if (result == MA_SUCCESS) {
-            result = ma_mutex_init(&pContext->deviceEnumLock);
-            if (result != MA_SUCCESS) {
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_WARNING, "Failed to initialize mutex for device enumeration. ma_context_get_devices() is not thread safe.\n");
-            }
-
-            result = ma_mutex_init(&pContext->deviceInfoLock);
-            if (result != MA_SUCCESS) {
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_WARNING, "Failed to initialize mutex for device info retrieval. ma_context_get_device_info() is not thread safe.\n");
-            }
-
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "System Architecture:\n");
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "  Endian: %s\n", ma_is_little_endian() ? "LE"  : "BE");
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "  SSE2:   %s\n", ma_has_sse2()         ? "YES" : "NO");
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "  AVX2:   %s\n", ma_has_avx2()         ? "YES" : "NO");
-            ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "  NEON:   %s\n", ma_has_neon()         ? "YES" : "NO");
-
-            pContext->backend = backend;
-            return result;
-        } else {
-            if (result == MA_BACKEND_NOT_ENABLED) {
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "%s backend is disabled.\n", ma_get_backend_name(backend));
-            } else {
-                ma_log_postf(ma_context_get_log(pContext), MA_LOG_LEVEL_DEBUG, "Failed to initialize %s backend.\n", ma_get_backend_name(backend));
-            }
-        }
-    }
-
-    /* If we get here it means an error occurred. */
-    MA_ZERO_OBJECT(pContext);  /* Safety. */
-    return MA_NO_BACKEND;
-}
-
-MA_API ma_result ma_context_uninit(ma_context* pContext)
-{
-    if (pContext == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pContext->callbacks.onContextUninit != NULL) {
-        pContext->callbacks.onContextUninit(pContext);
-    }
-
-    ma_mutex_uninit(&pContext->deviceEnumLock);
-    ma_mutex_uninit(&pContext->deviceInfoLock);
-    ma_free(pContext->pDeviceInfos, &pContext->allocationCallbacks);
-    ma_context_uninit_backend_apis(pContext);
-
-    if (pContext->pLog == &pContext->log) {
-        ma_log_uninit(&pContext->log);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API size_t ma_context_sizeof(void)
-{
-    return sizeof(ma_context);
-}
-
-
-MA_API ma_log* ma_context_get_log(ma_context* pContext)
-{
-    if (pContext == NULL) {
-        return NULL;
-    }
-
-    return pContext->pLog;
-}
-
-
-MA_API ma_result ma_context_enumerate_devices(ma_context* pContext, ma_enum_devices_callback_proc callback, void* pUserData)
-{
-    ma_result result;
-
-    if (pContext == NULL || callback == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pContext->callbacks.onContextEnumerateDevices == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    ma_mutex_lock(&pContext->deviceEnumLock);
-    {
-        result = pContext->callbacks.onContextEnumerateDevices(pContext, callback, pUserData);
-    }
-    ma_mutex_unlock(&pContext->deviceEnumLock);
-
-    return result;
-}
-
-
-static ma_bool32 ma_context_get_devices__enum_callback(ma_context* pContext, ma_device_type deviceType, const ma_device_info* pInfo, void* pUserData)
-{
-    /*
-    We need to insert the device info into our main internal buffer. Where it goes depends on the device type. If it's a capture device
-    it's just appended to the end. If it's a playback device it's inserted just before the first capture device.
-    */
-
-    /*
-    First make sure we have room. Since the number of devices we add to the list is usually relatively small I've decided to use a
-    simple fixed size increment for buffer expansion.
-    */
-    const ma_uint32 bufferExpansionCount = 2;
-    const ma_uint32 totalDeviceInfoCount = pContext->playbackDeviceInfoCount + pContext->captureDeviceInfoCount;
-
-    if (totalDeviceInfoCount >= pContext->deviceInfoCapacity) {
-        ma_uint32 newCapacity = pContext->deviceInfoCapacity + bufferExpansionCount;
-        ma_device_info* pNewInfos = (ma_device_info*)ma_realloc(pContext->pDeviceInfos, sizeof(*pContext->pDeviceInfos)*newCapacity, &pContext->allocationCallbacks);
-        if (pNewInfos == NULL) {
-            return MA_FALSE;   /* Out of memory. */
-        }
-
-        pContext->pDeviceInfos = pNewInfos;
-        pContext->deviceInfoCapacity = newCapacity;
-    }
-
-    if (deviceType == ma_device_type_playback) {
-        /* Playback. Insert just before the first capture device. */
-
-        /* The first thing to do is move all of the capture devices down a slot. */
-        ma_uint32 iFirstCaptureDevice = pContext->playbackDeviceInfoCount;
-        size_t iCaptureDevice;
-        for (iCaptureDevice = totalDeviceInfoCount; iCaptureDevice > iFirstCaptureDevice; --iCaptureDevice) {
-            pContext->pDeviceInfos[iCaptureDevice] = pContext->pDeviceInfos[iCaptureDevice-1];
-        }
-
-        /* Now just insert where the first capture device was before moving it down a slot. */
-        pContext->pDeviceInfos[iFirstCaptureDevice] = *pInfo;
-        pContext->playbackDeviceInfoCount += 1;
-    } else {
-        /* Capture. Insert at the end. */
-        pContext->pDeviceInfos[totalDeviceInfoCount] = *pInfo;
-        pContext->captureDeviceInfoCount += 1;
-    }
-
-    (void)pUserData;
-    return MA_TRUE;
-}
-
-MA_API ma_result ma_context_get_devices(ma_context* pContext, ma_device_info** ppPlaybackDeviceInfos, ma_uint32* pPlaybackDeviceCount, ma_device_info** ppCaptureDeviceInfos, ma_uint32* pCaptureDeviceCount)
-{
-    ma_result result;
-
-    /* Safety. */
-    if (ppPlaybackDeviceInfos != NULL) *ppPlaybackDeviceInfos = NULL;
-    if (pPlaybackDeviceCount  != NULL) *pPlaybackDeviceCount  = 0;
-    if (ppCaptureDeviceInfos  != NULL) *ppCaptureDeviceInfos  = NULL;
-    if (pCaptureDeviceCount   != NULL) *pCaptureDeviceCount   = 0;
-
-    if (pContext == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pContext->callbacks.onContextEnumerateDevices == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* Note that we don't use ma_context_enumerate_devices() here because we want to do locking at a higher level. */
-    ma_mutex_lock(&pContext->deviceEnumLock);
-    {
-        /* Reset everything first. */
-        pContext->playbackDeviceInfoCount = 0;
-        pContext->captureDeviceInfoCount = 0;
-
-        /* Now enumerate over available devices. */
-        result = pContext->callbacks.onContextEnumerateDevices(pContext, ma_context_get_devices__enum_callback, NULL);
-        if (result == MA_SUCCESS) {
-            /* Playback devices. */
-            if (ppPlaybackDeviceInfos != NULL) {
-                *ppPlaybackDeviceInfos = pContext->pDeviceInfos;
-            }
-            if (pPlaybackDeviceCount != NULL) {
-                *pPlaybackDeviceCount = pContext->playbackDeviceInfoCount;
-            }
-
-            /* Capture devices. */
-            if (ppCaptureDeviceInfos != NULL) {
-                *ppCaptureDeviceInfos = pContext->pDeviceInfos;
-                /* Capture devices come after playback devices. */
-                if (pContext->playbackDeviceInfoCount > 0) {
-                    /* Conditional, because NULL+0 is undefined behavior. */
-                    *ppCaptureDeviceInfos += pContext->playbackDeviceInfoCount;
-                }
-            }
-            if (pCaptureDeviceCount != NULL) {
-                *pCaptureDeviceCount = pContext->captureDeviceInfoCount;
-            }
-        }
-    }
-    ma_mutex_unlock(&pContext->deviceEnumLock);
-
-    return result;
-}
-
-MA_API ma_result ma_context_get_device_info(ma_context* pContext, ma_device_type deviceType, const ma_device_id* pDeviceID, ma_device_info* pDeviceInfo)
-{
-    ma_result result;
-    ma_device_info deviceInfo;
-
-    /* NOTE: Do not clear pDeviceInfo on entry. The reason is the pDeviceID may actually point to pDeviceInfo->id which will break things. */
-    if (pContext == NULL || pDeviceInfo == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(&deviceInfo);
-
-    /* Help the backend out by copying over the device ID if we have one. */
-    if (pDeviceID != NULL) {
-        MA_COPY_MEMORY(&deviceInfo.id, pDeviceID, sizeof(*pDeviceID));
-    }
-
-    if (pContext->callbacks.onContextGetDeviceInfo == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    ma_mutex_lock(&pContext->deviceInfoLock);
-    {
-        result = pContext->callbacks.onContextGetDeviceInfo(pContext, deviceType, pDeviceID, &deviceInfo);
-    }
-    ma_mutex_unlock(&pContext->deviceInfoLock);
-
-    *pDeviceInfo = deviceInfo;
-    return result;
-}
-
-MA_API ma_bool32 ma_context_is_loopback_supported(ma_context* pContext)
-{
-    if (pContext == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_is_loopback_supported(pContext->backend);
-}
-
-
-MA_API ma_device_config ma_device_config_init(ma_device_type deviceType)
-{
-    ma_device_config config;
-    MA_ZERO_OBJECT(&config);
-    config.deviceType = deviceType;
-    config.resampling = ma_resampler_config_init(ma_format_unknown, 0, 0, 0, ma_resample_algorithm_linear); /* Format/channels/rate don't matter here. */
-
-    return config;
-}
-
-MA_API ma_result ma_device_init(ma_context* pContext, const ma_device_config* pConfig, ma_device* pDevice)
-{
-    ma_result result;
-    ma_device_descriptor descriptorPlayback;
-    ma_device_descriptor descriptorCapture;
-
-    /* The context can be null, in which case we self-manage it. */
-    if (pContext == NULL) {
-        return ma_device_init_ex(NULL, 0, NULL, pConfig, pDevice);
-    }
-
-    if (pDevice == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDevice);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Check that we have our callbacks defined. */
-    if (pContext->callbacks.onDeviceInit == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* Basic config validation. */
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex) {
-        if (pConfig->capture.channels > MA_MAX_CHANNELS) {
-            return MA_INVALID_ARGS;
-        }
-
-        if (!ma__is_channel_map_valid(pConfig->capture.pChannelMap, pConfig->capture.channels)) {
-            return MA_INVALID_ARGS;
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex || pConfig->deviceType == ma_device_type_loopback) {
-        if (pConfig->playback.channels > MA_MAX_CHANNELS) {
-            return MA_INVALID_ARGS;
-        }
-
-        if (!ma__is_channel_map_valid(pConfig->playback.pChannelMap, pConfig->playback.channels)) {
-            return MA_INVALID_ARGS;
-        }
-    }
-
-    pDevice->pContext = pContext;
-
-    /* Set the user data and log callback ASAP to ensure it is available for the entire initialization process. */
-    pDevice->pUserData      = pConfig->pUserData;
-    pDevice->onData         = pConfig->dataCallback;
-    pDevice->onNotification = pConfig->notificationCallback;
-    pDevice->onStop         = pConfig->stopCallback;
-
-    if (pConfig->playback.pDeviceID != NULL) {
-        MA_COPY_MEMORY(&pDevice->playback.id, pConfig->playback.pDeviceID, sizeof(pDevice->playback.id));
-        pDevice->playback.pID = &pDevice->playback.id;
-    } else {
-        pDevice->playback.pID = NULL;
-    }
-
-    if (pConfig->capture.pDeviceID != NULL) {
-        MA_COPY_MEMORY(&pDevice->capture.id, pConfig->capture.pDeviceID, sizeof(pDevice->capture.id));
-        pDevice->capture.pID = &pDevice->capture.id;
-    } else {
-        pDevice->capture.pID = NULL;
-    }
-
-    pDevice->noPreSilencedOutputBuffer   = pConfig->noPreSilencedOutputBuffer;
-    pDevice->noClip                      = pConfig->noClip;
-    pDevice->noDisableDenormals          = pConfig->noDisableDenormals;
-    pDevice->noFixedSizedCallback        = pConfig->noFixedSizedCallback;
-    ma_atomic_float_set(&pDevice->masterVolumeFactor, 1);
-
-    pDevice->type                        = pConfig->deviceType;
-    pDevice->sampleRate                  = pConfig->sampleRate;
-    pDevice->resampling.algorithm        = pConfig->resampling.algorithm;
-    pDevice->resampling.linear.lpfOrder  = pConfig->resampling.linear.lpfOrder;
-    pDevice->resampling.pBackendVTable   = pConfig->resampling.pBackendVTable;
-    pDevice->resampling.pBackendUserData = pConfig->resampling.pBackendUserData;
-
-    pDevice->capture.shareMode           = pConfig->capture.shareMode;
-    pDevice->capture.format              = pConfig->capture.format;
-    pDevice->capture.channels            = pConfig->capture.channels;
-    ma_channel_map_copy_or_default(pDevice->capture.channelMap, ma_countof(pDevice->capture.channelMap), pConfig->capture.pChannelMap, pConfig->capture.channels);
-    pDevice->capture.channelMixMode      = pConfig->capture.channelMixMode;
-    pDevice->capture.calculateLFEFromSpatialChannels = pConfig->capture.calculateLFEFromSpatialChannels;
-
-    pDevice->playback.shareMode          = pConfig->playback.shareMode;
-    pDevice->playback.format             = pConfig->playback.format;
-    pDevice->playback.channels           = pConfig->playback.channels;
-    ma_channel_map_copy_or_default(pDevice->playback.channelMap, ma_countof(pDevice->playback.channelMap), pConfig->playback.pChannelMap, pConfig->playback.channels);
-    pDevice->playback.channelMixMode     = pConfig->playback.channelMixMode;
-    pDevice->playback.calculateLFEFromSpatialChannels = pConfig->playback.calculateLFEFromSpatialChannels;
-
-    result = ma_mutex_init(&pDevice->startStopLock);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /*
-    When the device is started, the worker thread is the one that does the actual startup of the backend device. We
-    use a semaphore to wait for the background thread to finish the work. The same applies for stopping the device.
-
-    Each of these semaphores is released internally by the worker thread when the work is completed. The start
-    semaphore is also used to wake up the worker thread.
-    */
-    result = ma_event_init(&pDevice->wakeupEvent);
-    if (result != MA_SUCCESS) {
-        ma_mutex_uninit(&pDevice->startStopLock);
-        return result;
-    }
-
-    result = ma_event_init(&pDevice->startEvent);
-    if (result != MA_SUCCESS) {
-        ma_event_uninit(&pDevice->wakeupEvent);
-        ma_mutex_uninit(&pDevice->startStopLock);
-        return result;
-    }
-
-    result = ma_event_init(&pDevice->stopEvent);
-    if (result != MA_SUCCESS) {
-        ma_event_uninit(&pDevice->startEvent);
-        ma_event_uninit(&pDevice->wakeupEvent);
-        ma_mutex_uninit(&pDevice->startStopLock);
-        return result;
-    }
-
-
-    MA_ZERO_OBJECT(&descriptorPlayback);
-    descriptorPlayback.pDeviceID                = pConfig->playback.pDeviceID;
-    descriptorPlayback.shareMode                = pConfig->playback.shareMode;
-    descriptorPlayback.format                   = pConfig->playback.format;
-    descriptorPlayback.channels                 = pConfig->playback.channels;
-    descriptorPlayback.sampleRate               = pConfig->sampleRate;
-    ma_channel_map_copy_or_default(descriptorPlayback.channelMap, ma_countof(descriptorPlayback.channelMap), pConfig->playback.pChannelMap, pConfig->playback.channels);
-    descriptorPlayback.periodSizeInFrames       = pConfig->periodSizeInFrames;
-    descriptorPlayback.periodSizeInMilliseconds = pConfig->periodSizeInMilliseconds;
-    descriptorPlayback.periodCount              = pConfig->periods;
-
-    if (descriptorPlayback.periodCount == 0) {
-        descriptorPlayback.periodCount = MA_DEFAULT_PERIODS;
-    }
-
-
-    MA_ZERO_OBJECT(&descriptorCapture);
-    descriptorCapture.pDeviceID                 = pConfig->capture.pDeviceID;
-    descriptorCapture.shareMode                 = pConfig->capture.shareMode;
-    descriptorCapture.format                    = pConfig->capture.format;
-    descriptorCapture.channels                  = pConfig->capture.channels;
-    descriptorCapture.sampleRate                = pConfig->sampleRate;
-    ma_channel_map_copy_or_default(descriptorCapture.channelMap, ma_countof(descriptorCapture.channelMap), pConfig->capture.pChannelMap, pConfig->capture.channels);
-    descriptorCapture.periodSizeInFrames        = pConfig->periodSizeInFrames;
-    descriptorCapture.periodSizeInMilliseconds  = pConfig->periodSizeInMilliseconds;
-    descriptorCapture.periodCount               = pConfig->periods;
-
-    if (descriptorCapture.periodCount == 0) {
-        descriptorCapture.periodCount = MA_DEFAULT_PERIODS;
-    }
-
-
-    result = pContext->callbacks.onDeviceInit(pDevice, pConfig, &descriptorPlayback, &descriptorCapture);
-    if (result != MA_SUCCESS) {
-        ma_event_uninit(&pDevice->startEvent);
-        ma_event_uninit(&pDevice->wakeupEvent);
-        ma_mutex_uninit(&pDevice->startStopLock);
-        return result;
-    }
-
-#if 0
-    /*
-    On output the descriptors will contain the *actual* data format of the device. We need this to know how to convert the data between
-    the requested format and the internal format.
-    */
-    if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex || pConfig->deviceType == ma_device_type_loopback) {
-        if (!ma_device_descriptor_is_valid(&descriptorCapture)) {
-            ma_device_uninit(pDevice);
-            return MA_INVALID_ARGS;
-        }
-
-        pDevice->capture.internalFormat             = descriptorCapture.format;
-        pDevice->capture.internalChannels           = descriptorCapture.channels;
-        pDevice->capture.internalSampleRate         = descriptorCapture.sampleRate;
-        ma_channel_map_copy(pDevice->capture.internalChannelMap, descriptorCapture.channelMap, descriptorCapture.channels);
-        pDevice->capture.internalPeriodSizeInFrames = descriptorCapture.periodSizeInFrames;
-        pDevice->capture.internalPeriods            = descriptorCapture.periodCount;
-
-        if (pDevice->capture.internalPeriodSizeInFrames == 0) {
-            pDevice->capture.internalPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(descriptorCapture.periodSizeInMilliseconds, descriptorCapture.sampleRate);
-        }
-    }
-
-    if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-        if (!ma_device_descriptor_is_valid(&descriptorPlayback)) {
-            ma_device_uninit(pDevice);
-            return MA_INVALID_ARGS;
-        }
-
-        pDevice->playback.internalFormat             = descriptorPlayback.format;
-        pDevice->playback.internalChannels           = descriptorPlayback.channels;
-        pDevice->playback.internalSampleRate         = descriptorPlayback.sampleRate;
-        ma_channel_map_copy(pDevice->playback.internalChannelMap, descriptorPlayback.channelMap, descriptorPlayback.channels);
-        pDevice->playback.internalPeriodSizeInFrames = descriptorPlayback.periodSizeInFrames;
-        pDevice->playback.internalPeriods            = descriptorPlayback.periodCount;
-
-        if (pDevice->playback.internalPeriodSizeInFrames == 0) {
-            pDevice->playback.internalPeriodSizeInFrames = ma_calculate_buffer_size_in_frames_from_milliseconds(descriptorPlayback.periodSizeInMilliseconds, descriptorPlayback.sampleRate);
-        }
-    }
-
-
-    /*
-    The name of the device can be retrieved from device info. This may be temporary and replaced with a `ma_device_get_info(pDevice, deviceType)` instead.
-    For loopback devices, we need to retrieve the name of the playback device.
-    */
-    {
-        ma_device_info deviceInfo;
-
-        if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex || pConfig->deviceType == ma_device_type_loopback) {
-            result = ma_device_get_info(pDevice, (pConfig->deviceType == ma_device_type_loopback) ? ma_device_type_playback : ma_device_type_capture, &deviceInfo);
-            if (result == MA_SUCCESS) {
-                ma_strncpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), deviceInfo.name, (size_t)-1);
-            } else {
-                /* We failed to retrieve the device info. Fall back to a default name. */
-                if (descriptorCapture.pDeviceID == NULL) {
-                    ma_strncpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), MA_DEFAULT_CAPTURE_DEVICE_NAME, (size_t)-1);
-                } else {
-                    ma_strncpy_s(pDevice->capture.name, sizeof(pDevice->capture.name), "Capture Device", (size_t)-1);
-                }
-            }
-        }
-
-        if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-            result = ma_device_get_info(pDevice, ma_device_type_playback, &deviceInfo);
-            if (result == MA_SUCCESS) {
-                ma_strncpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), deviceInfo.name, (size_t)-1);
-            } else {
-                /* We failed to retrieve the device info. Fall back to a default name. */
-                if (descriptorPlayback.pDeviceID == NULL) {
-                    ma_strncpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), MA_DEFAULT_PLAYBACK_DEVICE_NAME, (size_t)-1);
-                } else {
-                    ma_strncpy_s(pDevice->playback.name, sizeof(pDevice->playback.name), "Playback Device", (size_t)-1);
-                }
-            }
-        }
-    }
-
-
-    ma_device__post_init_setup(pDevice, pConfig->deviceType);
-#endif
-
-    result = ma_device_post_init(pDevice, pConfig->deviceType, &descriptorPlayback, &descriptorCapture);
-    if (result != MA_SUCCESS) {
-        ma_device_uninit(pDevice);
-        return result;
-    }
-
-
-    /*
-    If we're using fixed sized callbacks we'll need to make use of an intermediary buffer. Needs to
-    be done after post_init_setup() because we'll need access to the sample rate.
-    */
-    if (pConfig->noFixedSizedCallback == MA_FALSE) {
-        /* We're using a fixed sized data callback so we'll need an intermediary buffer. */
-        ma_uint32 intermediaryBufferCap = pConfig->periodSizeInFrames;
-        if (intermediaryBufferCap == 0) {
-            intermediaryBufferCap = ma_calculate_buffer_size_in_frames_from_milliseconds(pConfig->periodSizeInMilliseconds, pDevice->sampleRate);
-        }
-
-        if (pConfig->deviceType == ma_device_type_capture || pConfig->deviceType == ma_device_type_duplex || pConfig->deviceType == ma_device_type_loopback) {
-            ma_uint32 intermediaryBufferSizeInBytes;
-
-            pDevice->capture.intermediaryBufferLen = 0;
-            pDevice->capture.intermediaryBufferCap = intermediaryBufferCap;
-            if (pDevice->capture.intermediaryBufferCap == 0) {
-                pDevice->capture.intermediaryBufferCap = pDevice->capture.internalPeriodSizeInFrames;
-            }
-
-            intermediaryBufferSizeInBytes = pDevice->capture.intermediaryBufferCap * ma_get_bytes_per_frame(pDevice->capture.format, pDevice->capture.channels);
-
-            pDevice->capture.pIntermediaryBuffer = ma_malloc((size_t)intermediaryBufferSizeInBytes, &pContext->allocationCallbacks);
-            if (pDevice->capture.pIntermediaryBuffer == NULL) {
-                ma_device_uninit(pDevice);
-                return MA_OUT_OF_MEMORY;
-            }
-
-            /* Silence the buffer for safety. */
-            ma_silence_pcm_frames(pDevice->capture.pIntermediaryBuffer, pDevice->capture.intermediaryBufferCap, pDevice->capture.format, pDevice->capture.channels);
-            pDevice->capture.intermediaryBufferLen = pDevice->capture.intermediaryBufferCap;
-        }
-
-        if (pConfig->deviceType == ma_device_type_playback || pConfig->deviceType == ma_device_type_duplex) {
-            ma_uint64 intermediaryBufferSizeInBytes;
-
-            pDevice->playback.intermediaryBufferLen = 0;
-            if (pConfig->deviceType == ma_device_type_duplex) {
-                pDevice->playback.intermediaryBufferCap = pDevice->capture.intermediaryBufferCap;   /* In duplex mode, make sure the intermediary buffer is always the same size as the capture side. */
-            } else {
-                pDevice->playback.intermediaryBufferCap = intermediaryBufferCap;
-                if (pDevice->playback.intermediaryBufferCap == 0) {
-                    pDevice->playback.intermediaryBufferCap = pDevice->playback.internalPeriodSizeInFrames;
-                }
-            }
-
-            intermediaryBufferSizeInBytes = pDevice->playback.intermediaryBufferCap * ma_get_bytes_per_frame(pDevice->playback.format, pDevice->playback.channels);
-
-            pDevice->playback.pIntermediaryBuffer = ma_malloc((size_t)intermediaryBufferSizeInBytes, &pContext->allocationCallbacks);
-            if (pDevice->playback.pIntermediaryBuffer == NULL) {
-                ma_device_uninit(pDevice);
-                return MA_OUT_OF_MEMORY;
-            }
-
-            /* Silence the buffer for safety. */
-            ma_silence_pcm_frames(pDevice->playback.pIntermediaryBuffer, pDevice->playback.intermediaryBufferCap, pDevice->playback.format, pDevice->playback.channels);
-            pDevice->playback.intermediaryBufferLen = 0;
-        }
-    } else {
-        /* Not using a fixed sized data callback so no need for an intermediary buffer. */
-    }
-
-
-    /* Some backends don't require the worker thread. */
-    if (!ma_context_is_backend_asynchronous(pContext)) {
-        /* The worker thread. */
-        result = ma_thread_create(&pDevice->thread, pContext->threadPriority, pContext->threadStackSize, ma_worker_thread, pDevice, &pContext->allocationCallbacks);
-        if (result != MA_SUCCESS) {
-            ma_device_uninit(pDevice);
-            return result;
-        }
-
-        /* Wait for the worker thread to put the device into it's stopped state for real. */
-        ma_event_wait(&pDevice->stopEvent);
-        MA_ASSERT(ma_device_get_state(pDevice) == ma_device_state_stopped);
-    } else {
-        /*
-        If the backend is asynchronous and the device is duplex, we'll need an intermediary ring buffer. Note that this needs to be done
-        after ma_device__post_init_setup().
-        */
-        if (ma_context_is_backend_asynchronous(pContext)) {
-            if (pConfig->deviceType == ma_device_type_duplex) {
-                result = ma_duplex_rb_init(pDevice->capture.format, pDevice->capture.channels, pDevice->sampleRate, pDevice->capture.internalSampleRate, pDevice->capture.internalPeriodSizeInFrames, &pDevice->pContext->allocationCallbacks, &pDevice->duplexRB);
-                if (result != MA_SUCCESS) {
-                    ma_device_uninit(pDevice);
-                    return result;
-                }
-            }
-        }
-
-        ma_device__set_state(pDevice, ma_device_state_stopped);
-    }
-
-    /* Log device information. */
-    {
-        ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[%s]\n", ma_get_backend_name(pDevice->pContext->backend));
-        if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex || pDevice->type == ma_device_type_loopback) {
-            char name[MA_MAX_DEVICE_NAME_LENGTH + 1];
-            ma_device_get_name(pDevice, (pDevice->type == ma_device_type_loopback) ? ma_device_type_playback : ma_device_type_capture, name, sizeof(name), NULL);
-
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "  %s (%s)\n", name, "Capture");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Format:      %s -> %s\n", ma_get_format_name(pDevice->capture.internalFormat), ma_get_format_name(pDevice->capture.format));
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Channels:    %d -> %d\n", pDevice->capture.internalChannels, pDevice->capture.channels);
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Sample Rate: %d -> %d\n", pDevice->capture.internalSampleRate, pDevice->sampleRate);
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Buffer Size: %d*%d (%d)\n", pDevice->capture.internalPeriodSizeInFrames, pDevice->capture.internalPeriods, (pDevice->capture.internalPeriodSizeInFrames * pDevice->capture.internalPeriods));
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Conversion:\n");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Pre Format Conversion:  %s\n", pDevice->capture.converter.hasPreFormatConversion  ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Post Format Conversion: %s\n", pDevice->capture.converter.hasPostFormatConversion ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Channel Routing:        %s\n", pDevice->capture.converter.hasChannelConverter     ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Resampling:             %s\n", pDevice->capture.converter.hasResampler            ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Passthrough:            %s\n", pDevice->capture.converter.isPassthrough           ? "YES" : "NO");
-            {
-                char channelMapStr[1024];
-                ma_channel_map_to_string(pDevice->capture.internalChannelMap, pDevice->capture.internalChannels, channelMapStr, sizeof(channelMapStr));
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Channel Map In:         {%s}\n", channelMapStr);
-
-                ma_channel_map_to_string(pDevice->capture.channelMap, pDevice->capture.channels, channelMapStr, sizeof(channelMapStr));
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Channel Map Out:        {%s}\n", channelMapStr);
-            }
-        }
-        if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-            char name[MA_MAX_DEVICE_NAME_LENGTH + 1];
-            ma_device_get_name(pDevice, ma_device_type_playback, name, sizeof(name), NULL);
-
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "  %s (%s)\n", name, "Playback");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Format:      %s -> %s\n", ma_get_format_name(pDevice->playback.format), ma_get_format_name(pDevice->playback.internalFormat));
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Channels:    %d -> %d\n", pDevice->playback.channels, pDevice->playback.internalChannels);
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Sample Rate: %d -> %d\n", pDevice->sampleRate, pDevice->playback.internalSampleRate);
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Buffer Size: %d*%d (%d)\n", pDevice->playback.internalPeriodSizeInFrames, pDevice->playback.internalPeriods, (pDevice->playback.internalPeriodSizeInFrames * pDevice->playback.internalPeriods));
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "    Conversion:\n");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Pre Format Conversion:  %s\n", pDevice->playback.converter.hasPreFormatConversion  ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Post Format Conversion: %s\n", pDevice->playback.converter.hasPostFormatConversion ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Channel Routing:        %s\n", pDevice->playback.converter.hasChannelConverter     ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Resampling:             %s\n", pDevice->playback.converter.hasResampler            ? "YES" : "NO");
-            ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Passthrough:            %s\n", pDevice->playback.converter.isPassthrough           ? "YES" : "NO");
-            {
-                char channelMapStr[1024];
-                ma_channel_map_to_string(pDevice->playback.channelMap, pDevice->playback.channels, channelMapStr, sizeof(channelMapStr));
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Channel Map In:         {%s}\n", channelMapStr);
-
-                ma_channel_map_to_string(pDevice->playback.internalChannelMap, pDevice->playback.internalChannels, channelMapStr, sizeof(channelMapStr));
-                ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "      Channel Map Out:        {%s}\n", channelMapStr);
-            }
-        }
-    }
-
-    MA_ASSERT(ma_device_get_state(pDevice) == ma_device_state_stopped);
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_device_init_ex(const ma_backend backends[], ma_uint32 backendCount, const ma_context_config* pContextConfig, const ma_device_config* pConfig, ma_device* pDevice)
-{
-    ma_result result;
-    ma_context* pContext;
-    ma_backend defaultBackends[ma_backend_null+1];
-    ma_uint32 iBackend;
-    ma_backend* pBackendsToIterate;
-    ma_uint32 backendsToIterateCount;
-    ma_allocation_callbacks allocationCallbacks;
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pContextConfig != NULL) {
-        result = ma_allocation_callbacks_init_copy(&allocationCallbacks, &pContextConfig->allocationCallbacks);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    } else {
-        allocationCallbacks = ma_allocation_callbacks_init_default();
-    }
-
-    pContext = (ma_context*)ma_malloc(sizeof(*pContext), &allocationCallbacks);
-    if (pContext == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    for (iBackend = 0; iBackend <= ma_backend_null; ++iBackend) {
-        defaultBackends[iBackend] = (ma_backend)iBackend;
-    }
-
-    pBackendsToIterate = (ma_backend*)backends;
-    backendsToIterateCount = backendCount;
-    if (pBackendsToIterate == NULL) {
-        pBackendsToIterate = (ma_backend*)defaultBackends;
-        backendsToIterateCount = ma_countof(defaultBackends);
-    }
-
-    result = MA_NO_BACKEND;
-
-    for (iBackend = 0; iBackend < backendsToIterateCount; ++iBackend) {
-        /*
-        This is a hack for iOS. If the context config is null, there's a good chance the
-        `ma_device_init(NULL, &deviceConfig, pDevice);` pattern is being used. In this
-        case, set the session category based on the device type.
-        */
-    #if defined(MA_APPLE_MOBILE)
-        ma_context_config contextConfig;
-
-        if (pContextConfig == NULL) {
-            contextConfig = ma_context_config_init();
-            switch (pConfig->deviceType) {
-                case ma_device_type_duplex: {
-                    contextConfig.coreaudio.sessionCategory = ma_ios_session_category_play_and_record;
-                } break;
-                case ma_device_type_capture: {
-                    contextConfig.coreaudio.sessionCategory = ma_ios_session_category_record;
-                } break;
-                case ma_device_type_playback:
-                default: {
-                    contextConfig.coreaudio.sessionCategory = ma_ios_session_category_playback;
-                } break;
-            }
-
-            pContextConfig = &contextConfig;
-        }
-    #endif
-
-        result = ma_context_init(&pBackendsToIterate[iBackend], 1, pContextConfig, pContext);
-        if (result == MA_SUCCESS) {
-            result = ma_device_init(pContext, pConfig, pDevice);
-            if (result == MA_SUCCESS) {
-                break;  /* Success. */
-            } else {
-                ma_context_uninit(pContext);   /* Failure. */
-            }
-        }
-    }
-
-    if (result != MA_SUCCESS) {
-        ma_free(pContext, &allocationCallbacks);
-        return result;
-    }
-
-    pDevice->isOwnerOfContext = MA_TRUE;
-    return result;
-}
-
-MA_API void ma_device_uninit(ma_device* pDevice)
-{
-    if (!ma_device__is_initialized(pDevice)) {
-        return;
-    }
-
-    /*
-    It's possible for the miniaudio side of the device and the backend to not be in sync due to
-    system-level situations such as the computer being put into sleep mode and the backend not
-    notifying miniaudio of the fact the device has stopped. It's possible for this to result in a
-    deadlock due to miniaudio thinking the device is in a running state, when in fact it's not
-    running at all. For this reason I am no longer explicitly stopping the device. I don't think
-    this should affect anyone in practice since uninitializing the backend will naturally stop the
-    device anyway.
-    */
-    #if 0
-    {
-        /* Make sure the device is stopped first. The backends will probably handle this naturally, but I like to do it explicitly for my own sanity. */
-        if (ma_device_is_started(pDevice)) {
-            ma_device_stop(pDevice);
-        }
-    }
-    #endif
-
-    /* Putting the device into an uninitialized state will make the worker thread return. */
-    ma_device__set_state(pDevice, ma_device_state_uninitialized);
-
-    /* Wake up the worker thread and wait for it to properly terminate. */
-    if (!ma_context_is_backend_asynchronous(pDevice->pContext)) {
-        ma_event_signal(&pDevice->wakeupEvent);
-        ma_thread_wait(&pDevice->thread);
-    }
-
-    if (pDevice->pContext->callbacks.onDeviceUninit != NULL) {
-        pDevice->pContext->callbacks.onDeviceUninit(pDevice);
-    }
-
-
-    ma_event_uninit(&pDevice->stopEvent);
-    ma_event_uninit(&pDevice->startEvent);
-    ma_event_uninit(&pDevice->wakeupEvent);
-    ma_mutex_uninit(&pDevice->startStopLock);
-
-    if (ma_context_is_backend_asynchronous(pDevice->pContext)) {
-        if (pDevice->type == ma_device_type_duplex) {
-            ma_duplex_rb_uninit(&pDevice->duplexRB);
-        }
-    }
-
-    if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_duplex || pDevice->type == ma_device_type_loopback) {
-        ma_data_converter_uninit(&pDevice->capture.converter, &pDevice->pContext->allocationCallbacks);
-    }
-    if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) {
-        ma_data_converter_uninit(&pDevice->playback.converter, &pDevice->pContext->allocationCallbacks);
-    }
-
-    if (pDevice->playback.pInputCache != NULL) {
-        ma_free(pDevice->playback.pInputCache, &pDevice->pContext->allocationCallbacks);
-    }
-
-    if (pDevice->capture.pIntermediaryBuffer != NULL) {
-        ma_free(pDevice->capture.pIntermediaryBuffer, &pDevice->pContext->allocationCallbacks);
-    }
-    if (pDevice->playback.pIntermediaryBuffer != NULL) {
-        ma_free(pDevice->playback.pIntermediaryBuffer, &pDevice->pContext->allocationCallbacks);
-    }
-
-    if (pDevice->isOwnerOfContext) {
-        ma_allocation_callbacks allocationCallbacks = pDevice->pContext->allocationCallbacks;
-
-        ma_context_uninit(pDevice->pContext);
-        ma_free(pDevice->pContext, &allocationCallbacks);
-    }
-
-    MA_ZERO_OBJECT(pDevice);
-}
-
-MA_API ma_context* ma_device_get_context(ma_device* pDevice)
-{
-    if (pDevice == NULL) {
-        return NULL;
-    }
-
-    return pDevice->pContext;
-}
-
-MA_API ma_log* ma_device_get_log(ma_device* pDevice)
-{
-    return ma_context_get_log(ma_device_get_context(pDevice));
-}
-
-MA_API ma_result ma_device_get_info(ma_device* pDevice, ma_device_type type, ma_device_info* pDeviceInfo)
-{
-    if (pDeviceInfo == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDeviceInfo);
-
-    if (pDevice == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If the onDeviceGetInfo() callback is set, use that. Otherwise we'll fall back to ma_context_get_device_info(). */
-    if (pDevice->pContext->callbacks.onDeviceGetInfo != NULL) {
-        return pDevice->pContext->callbacks.onDeviceGetInfo(pDevice, type, pDeviceInfo);
-    }
-
-    /* Getting here means onDeviceGetInfo is not implemented so we need to fall back to an alternative. */
-    if (type == ma_device_type_playback) {
-        return ma_context_get_device_info(pDevice->pContext, type, pDevice->playback.pID, pDeviceInfo);
-    } else {
-        return ma_context_get_device_info(pDevice->pContext, type, pDevice->capture.pID, pDeviceInfo);
-    }
-}
-
-MA_API ma_result ma_device_get_name(ma_device* pDevice, ma_device_type type, char* pName, size_t nameCap, size_t* pLengthNotIncludingNullTerminator)
-{
-    ma_result result;
-    ma_device_info deviceInfo;
-
-    if (pLengthNotIncludingNullTerminator != NULL) {
-        *pLengthNotIncludingNullTerminator = 0;
-    }
-
-    if (pName != NULL && nameCap > 0) {
-        pName[0] = '\0';
-    }
-
-    result = ma_device_get_info(pDevice, type, &deviceInfo);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pName != NULL) {
-        ma_strncpy_s(pName, nameCap, deviceInfo.name, (size_t)-1);
-
-        /*
-        For safety, make sure the length is based on the truncated output string rather than the
-        source. Otherwise the caller might assume the output buffer contains more content than it
-        actually does.
-        */
-        if (pLengthNotIncludingNullTerminator != NULL) {
-            *pLengthNotIncludingNullTerminator = strlen(pName);
-        }
-    } else {
-        /* Name not specified. Just report the length of the source string. */
-        if (pLengthNotIncludingNullTerminator != NULL) {
-            *pLengthNotIncludingNullTerminator = strlen(deviceInfo.name);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_device_start(ma_device* pDevice)
-{
-    ma_result result;
-
-    if (pDevice == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_device_get_state(pDevice) == ma_device_state_uninitialized) {
-        return MA_INVALID_OPERATION;    /* Not initialized. */
-    }
-
-    if (ma_device_get_state(pDevice) == ma_device_state_started) {
-        return MA_SUCCESS;  /* Already started. */
-    }
-
-    ma_mutex_lock(&pDevice->startStopLock);
-    {
-        /* Starting and stopping are wrapped in a mutex which means we can assert that the device is in a stopped or paused state. */
-        MA_ASSERT(ma_device_get_state(pDevice) == ma_device_state_stopped);
-
-        ma_device__set_state(pDevice, ma_device_state_starting);
-
-        /* Asynchronous backends need to be handled differently. */
-        if (ma_context_is_backend_asynchronous(pDevice->pContext)) {
-            if (pDevice->pContext->callbacks.onDeviceStart != NULL) {
-                result = pDevice->pContext->callbacks.onDeviceStart(pDevice);
-            } else {
-                result = MA_INVALID_OPERATION;
-            }
-
-            if (result == MA_SUCCESS) {
-                ma_device__set_state(pDevice, ma_device_state_started);
-                ma_device__on_notification_started(pDevice);
-            }
-        } else {
-            /*
-            Synchronous backends are started by signaling an event that's being waited on in the worker thread. We first wake up the
-            thread and then wait for the start event.
-            */
-            ma_event_signal(&pDevice->wakeupEvent);
-
-            /*
-            Wait for the worker thread to finish starting the device. Note that the worker thread will be the one who puts the device
-            into the started state. Don't call ma_device__set_state() here.
-            */
-            ma_event_wait(&pDevice->startEvent);
-            result = pDevice->workResult;
-        }
-
-        /* We changed the state from stopped to started, so if we failed, make sure we put the state back to stopped. */
-        if (result != MA_SUCCESS) {
-            ma_device__set_state(pDevice, ma_device_state_stopped);
-        }
-    }
-    ma_mutex_unlock(&pDevice->startStopLock);
-
-    return result;
-}
-
-MA_API ma_result ma_device_stop(ma_device* pDevice)
-{
-    ma_result result;
-
-    if (pDevice == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_device_get_state(pDevice) == ma_device_state_uninitialized) {
-        return MA_INVALID_OPERATION;    /* Not initialized. */
-    }
-
-    if (ma_device_get_state(pDevice) == ma_device_state_stopped) {
-        return MA_SUCCESS;  /* Already stopped. */
-    }
-
-    ma_mutex_lock(&pDevice->startStopLock);
-    {
-        /* Starting and stopping are wrapped in a mutex which means we can assert that the device is in a started or paused state. */
-        MA_ASSERT(ma_device_get_state(pDevice) == ma_device_state_started);
-
-        ma_device__set_state(pDevice, ma_device_state_stopping);
-
-        /* Asynchronous backends need to be handled differently. */
-        if (ma_context_is_backend_asynchronous(pDevice->pContext)) {
-            /* Asynchronous backends must have a stop operation. */
-            if (pDevice->pContext->callbacks.onDeviceStop != NULL) {
-                result = pDevice->pContext->callbacks.onDeviceStop(pDevice);
-            } else {
-                result = MA_INVALID_OPERATION;
-            }
-
-            ma_device__set_state(pDevice, ma_device_state_stopped);
-        } else {
-            /*
-            Synchronous backends. The stop callback is always called from the worker thread. Do not call the stop callback here. If
-            the backend is implementing it's own audio thread loop we'll need to wake it up if required. Note that we need to make
-            sure the state of the device is *not* playing right now, which it shouldn't be since we set it above. This is super
-            important though, so I'm asserting it here as well for extra safety in case we accidentally change something later.
-            */
-            MA_ASSERT(ma_device_get_state(pDevice) != ma_device_state_started);
-
-            if (pDevice->pContext->callbacks.onDeviceDataLoopWakeup != NULL) {
-                pDevice->pContext->callbacks.onDeviceDataLoopWakeup(pDevice);
-            }
-
-            /*
-            We need to wait for the worker thread to become available for work before returning. Note that the worker thread will be
-            the one who puts the device into the stopped state. Don't call ma_device__set_state() here.
-            */
-            ma_event_wait(&pDevice->stopEvent);
-            result = MA_SUCCESS;
-        }
-
-        /*
-        This is a safety measure to ensure the internal buffer has been cleared so any leftover
-        does not get played the next time the device starts. Ideally this should be drained by
-        the backend first.
-        */
-        pDevice->playback.intermediaryBufferLen = 0;
-        pDevice->playback.inputCacheConsumed    = 0;
-        pDevice->playback.inputCacheRemaining   = 0;
-    }
-    ma_mutex_unlock(&pDevice->startStopLock);
-
-    return result;
-}
-
-MA_API ma_bool32 ma_device_is_started(const ma_device* pDevice)
-{
-    return ma_device_get_state(pDevice) == ma_device_state_started;
-}
-
-MA_API ma_device_state ma_device_get_state(const ma_device* pDevice)
-{
-    if (pDevice == NULL) {
-        return ma_device_state_uninitialized;
-    }
-
-    return ma_atomic_device_state_get((ma_atomic_device_state*)&pDevice->state);   /* Naughty cast to get rid of a const warning. */
-}
-
-MA_API ma_result ma_device_set_master_volume(ma_device* pDevice, float volume)
-{
-    if (pDevice == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (volume < 0.0f) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_atomic_float_set(&pDevice->masterVolumeFactor, volume);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_device_get_master_volume(ma_device* pDevice, float* pVolume)
-{
-    if (pVolume == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDevice == NULL) {
-        *pVolume = 0;
-        return MA_INVALID_ARGS;
-    }
-
-    *pVolume = ma_atomic_float_get(&pDevice->masterVolumeFactor);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_device_set_master_volume_db(ma_device* pDevice, float gainDB)
-{
-    if (gainDB > 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_device_set_master_volume(pDevice, ma_volume_db_to_linear(gainDB));
-}
-
-MA_API ma_result ma_device_get_master_volume_db(ma_device* pDevice, float* pGainDB)
-{
-    float factor;
-    ma_result result;
-
-    if (pGainDB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_device_get_master_volume(pDevice, &factor);
-    if (result != MA_SUCCESS) {
-        *pGainDB = 0;
-        return result;
-    }
-
-    *pGainDB = ma_volume_linear_to_db(factor);
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_device_handle_backend_data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount)
-{
-    if (pDevice == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pOutput == NULL && pInput == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDevice->type == ma_device_type_duplex) {
-        if (pInput != NULL) {
-            ma_device__handle_duplex_callback_capture(pDevice, frameCount, pInput, &pDevice->duplexRB.rb);
-        }
-
-        if (pOutput != NULL) {
-            ma_device__handle_duplex_callback_playback(pDevice, frameCount, pOutput, &pDevice->duplexRB.rb);
-        }
-    } else {
-        if (pDevice->type == ma_device_type_capture || pDevice->type == ma_device_type_loopback) {
-            if (pInput == NULL) {
-                return MA_INVALID_ARGS;
-            }
-
-            ma_device__send_frames_to_client(pDevice, frameCount, pInput);
-        }
-
-        if (pDevice->type == ma_device_type_playback) {
-            if (pOutput == NULL) {
-                return MA_INVALID_ARGS;
-            }
-
-            ma_device__read_frames_from_client(pDevice, frameCount, pOutput);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint32 ma_calculate_buffer_size_in_frames_from_descriptor(const ma_device_descriptor* pDescriptor, ma_uint32 nativeSampleRate, ma_performance_profile performanceProfile)
-{
-    if (pDescriptor == NULL) {
-        return 0;
-    }
-
-    /*
-    We must have a non-0 native sample rate, but some backends don't allow retrieval of this at the
-    time when the size of the buffer needs to be determined. In this case we need to just take a best
-    guess and move on. We'll try using the sample rate in pDescriptor first. If that's not set we'll
-    just fall back to MA_DEFAULT_SAMPLE_RATE.
-    */
-    if (nativeSampleRate == 0) {
-        nativeSampleRate = pDescriptor->sampleRate;
-    }
-    if (nativeSampleRate == 0) {
-        nativeSampleRate = MA_DEFAULT_SAMPLE_RATE;
-    }
-
-    MA_ASSERT(nativeSampleRate != 0);
-
-    if (pDescriptor->periodSizeInFrames == 0) {
-        if (pDescriptor->periodSizeInMilliseconds == 0) {
-            if (performanceProfile == ma_performance_profile_low_latency) {
-                return ma_calculate_buffer_size_in_frames_from_milliseconds(MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_LOW_LATENCY, nativeSampleRate);
-            } else {
-                return ma_calculate_buffer_size_in_frames_from_milliseconds(MA_DEFAULT_PERIOD_SIZE_IN_MILLISECONDS_CONSERVATIVE, nativeSampleRate);
-            }
-        } else {
-            return ma_calculate_buffer_size_in_frames_from_milliseconds(pDescriptor->periodSizeInMilliseconds, nativeSampleRate);
-        }
-    } else {
-        return pDescriptor->periodSizeInFrames;
-    }
-}
-#endif  /* MA_NO_DEVICE_IO */
-
-
-MA_API ma_uint32 ma_calculate_buffer_size_in_milliseconds_from_frames(ma_uint32 bufferSizeInFrames, ma_uint32 sampleRate)
-{
-    /* Prevent a division by zero. */
-    if (sampleRate == 0) {
-        return 0;
-    }
-
-    return bufferSizeInFrames*1000 / sampleRate;
-}
-
-MA_API ma_uint32 ma_calculate_buffer_size_in_frames_from_milliseconds(ma_uint32 bufferSizeInMilliseconds, ma_uint32 sampleRate)
-{
-    /* Prevent a division by zero. */
-    if (sampleRate == 0) {
-        return 0;
-    }
-
-    return bufferSizeInMilliseconds*sampleRate / 1000;
-}
-
-MA_API void ma_copy_pcm_frames(void* dst, const void* src, ma_uint64 frameCount, ma_format format, ma_uint32 channels)
-{
-    if (dst == src) {
-        return; /* No-op. */
-    }
-
-    ma_copy_memory_64(dst, src, frameCount * ma_get_bytes_per_frame(format, channels));
-}
-
-MA_API void ma_silence_pcm_frames(void* p, ma_uint64 frameCount, ma_format format, ma_uint32 channels)
-{
-    if (format == ma_format_u8) {
-        ma_uint64 sampleCount = frameCount * channels;
-        ma_uint64 iSample;
-        for (iSample = 0; iSample < sampleCount; iSample += 1) {
-            ((ma_uint8*)p)[iSample] = 128;
-        }
-    } else {
-        ma_zero_memory_64(p, frameCount * ma_get_bytes_per_frame(format, channels));
-    }
-}
-
-MA_API void* ma_offset_pcm_frames_ptr(void* p, ma_uint64 offsetInFrames, ma_format format, ma_uint32 channels)
-{
-    return ma_offset_ptr(p, offsetInFrames * ma_get_bytes_per_frame(format, channels));
-}
-
-MA_API const void* ma_offset_pcm_frames_const_ptr(const void* p, ma_uint64 offsetInFrames, ma_format format, ma_uint32 channels)
-{
-    return ma_offset_ptr(p, offsetInFrames * ma_get_bytes_per_frame(format, channels));
-}
-
-
-MA_API void ma_clip_samples_u8(ma_uint8* pDst, const ma_int16* pSrc, ma_uint64 count)
-{
-    ma_uint64 iSample;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_u8(pSrc[iSample]);
-    }
-}
-
-MA_API void ma_clip_samples_s16(ma_int16* pDst, const ma_int32* pSrc, ma_uint64 count)
-{
-    ma_uint64 iSample;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_s16(pSrc[iSample]);
-    }
-}
-
-MA_API void ma_clip_samples_s24(ma_uint8* pDst, const ma_int64* pSrc, ma_uint64 count)
-{
-    ma_uint64 iSample;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        ma_int64 s = ma_clip_s24(pSrc[iSample]);
-        pDst[iSample*3 + 0] = (ma_uint8)((s & 0x000000FF) >>  0);
-        pDst[iSample*3 + 1] = (ma_uint8)((s & 0x0000FF00) >>  8);
-        pDst[iSample*3 + 2] = (ma_uint8)((s & 0x00FF0000) >> 16);
-    }
-}
-
-MA_API void ma_clip_samples_s32(ma_int32* pDst, const ma_int64* pSrc, ma_uint64 count)
-{
-    ma_uint64 iSample;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_s32(pSrc[iSample]);
-    }
-}
-
-MA_API void ma_clip_samples_f32(float* pDst, const float* pSrc, ma_uint64 count)
-{
-    ma_uint64 iSample;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_f32(pSrc[iSample]);
-    }
-}
-
-MA_API void ma_clip_pcm_frames(void* pDst, const void* pSrc, ma_uint64 frameCount, ma_format format, ma_uint32 channels)
-{
-    ma_uint64 sampleCount;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    sampleCount = frameCount * channels;
-
-    switch (format) {
-        case ma_format_u8:  ma_clip_samples_u8( (ma_uint8*)pDst, (const ma_int16*)pSrc, sampleCount); break;
-        case ma_format_s16: ma_clip_samples_s16((ma_int16*)pDst, (const ma_int32*)pSrc, sampleCount); break;
-        case ma_format_s24: ma_clip_samples_s24((ma_uint8*)pDst, (const ma_int64*)pSrc, sampleCount); break;
-        case ma_format_s32: ma_clip_samples_s32((ma_int32*)pDst, (const ma_int64*)pSrc, sampleCount); break;
-        case ma_format_f32: ma_clip_samples_f32((   float*)pDst, (const    float*)pSrc, sampleCount); break;
-
-        /* Do nothing if we don't know the format. We're including these here to silence a compiler warning about enums not being handled by the switch. */
-        case ma_format_unknown:
-        case ma_format_count:
-            break;
-    }
-}
-
-
-MA_API void ma_copy_and_apply_volume_factor_u8(ma_uint8* pSamplesOut, const ma_uint8* pSamplesIn, ma_uint64 sampleCount, float factor)
-{
-    ma_uint64 iSample;
-
-    if (pSamplesOut == NULL || pSamplesIn == NULL) {
-        return;
-    }
-
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamplesOut[iSample] = (ma_uint8)(pSamplesIn[iSample] * factor);
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_factor_s16(ma_int16* pSamplesOut, const ma_int16* pSamplesIn, ma_uint64 sampleCount, float factor)
-{
-    ma_uint64 iSample;
-
-    if (pSamplesOut == NULL || pSamplesIn == NULL) {
-        return;
-    }
-
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamplesOut[iSample] = (ma_int16)(pSamplesIn[iSample] * factor);
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_factor_s24(void* pSamplesOut, const void* pSamplesIn, ma_uint64 sampleCount, float factor)
-{
-    ma_uint64 iSample;
-    ma_uint8* pSamplesOut8;
-    ma_uint8* pSamplesIn8;
-
-    if (pSamplesOut == NULL || pSamplesIn == NULL) {
-        return;
-    }
-
-    pSamplesOut8 = (ma_uint8*)pSamplesOut;
-    pSamplesIn8  = (ma_uint8*)pSamplesIn;
-
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        ma_int32 sampleS32;
-
-        sampleS32 = (ma_int32)(((ma_uint32)(pSamplesIn8[iSample*3+0]) << 8) | ((ma_uint32)(pSamplesIn8[iSample*3+1]) << 16) | ((ma_uint32)(pSamplesIn8[iSample*3+2])) << 24);
-        sampleS32 = (ma_int32)(sampleS32 * factor);
-
-        pSamplesOut8[iSample*3+0] = (ma_uint8)(((ma_uint32)sampleS32 & 0x0000FF00) >>  8);
-        pSamplesOut8[iSample*3+1] = (ma_uint8)(((ma_uint32)sampleS32 & 0x00FF0000) >> 16);
-        pSamplesOut8[iSample*3+2] = (ma_uint8)(((ma_uint32)sampleS32 & 0xFF000000) >> 24);
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_int32* pSamplesIn, ma_uint64 sampleCount, float factor)
-{
-    ma_uint64 iSample;
-
-    if (pSamplesOut == NULL || pSamplesIn == NULL) {
-        return;
-    }
-
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamplesOut[iSample] = (ma_int32)(pSamplesIn[iSample] * factor);
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
-{
-    ma_uint64 iSample;
-
-    if (pSamplesOut == NULL || pSamplesIn == NULL) {
-        return;
-    }
-
-    if (factor == 1) {
-        if (pSamplesOut == pSamplesIn) {
-            /* In place. No-op. */
-        } else {
-            /* Just a copy. */
-            for (iSample = 0; iSample < sampleCount; iSample += 1) {
-                pSamplesOut[iSample] = pSamplesIn[iSample];
-            }
-        }
-    } else {
-        for (iSample = 0; iSample < sampleCount; iSample += 1) {
-            pSamplesOut[iSample] = pSamplesIn[iSample] * factor;
-        }
-    }
-}
-
-MA_API void ma_apply_volume_factor_u8(ma_uint8* pSamples, ma_uint64 sampleCount, float factor)
-{
-    ma_copy_and_apply_volume_factor_u8(pSamples, pSamples, sampleCount, factor);
-}
-
-MA_API void ma_apply_volume_factor_s16(ma_int16* pSamples, ma_uint64 sampleCount, float factor)
-{
-    ma_copy_and_apply_volume_factor_s16(pSamples, pSamples, sampleCount, factor);
-}
-
-MA_API void ma_apply_volume_factor_s24(void* pSamples, ma_uint64 sampleCount, float factor)
-{
-    ma_copy_and_apply_volume_factor_s24(pSamples, pSamples, sampleCount, factor);
-}
-
-MA_API void ma_apply_volume_factor_s32(ma_int32* pSamples, ma_uint64 sampleCount, float factor)
-{
-    ma_copy_and_apply_volume_factor_s32(pSamples, pSamples, sampleCount, factor);
-}
-
-MA_API void ma_apply_volume_factor_f32(float* pSamples, ma_uint64 sampleCount, float factor)
-{
-    ma_copy_and_apply_volume_factor_f32(pSamples, pSamples, sampleCount, factor);
-}
-
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_u8(ma_uint8* pFramesOut, const ma_uint8* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_u8(pFramesOut, pFramesIn, frameCount*channels, factor);
-}
-
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_s16(ma_int16* pFramesOut, const ma_int16* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_s16(pFramesOut, pFramesIn, frameCount*channels, factor);
-}
-
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_s24(void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_s24(pFramesOut, pFramesIn, frameCount*channels, factor);
-}
-
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_s32(ma_int32* pFramesOut, const ma_int32* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_s32(pFramesOut, pFramesIn, frameCount*channels, factor);
-}
-
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames_f32(float* pFramesOut, const float* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_f32(pFramesOut, pFramesIn, frameCount*channels, factor);
-}
-
-MA_API void ma_copy_and_apply_volume_factor_pcm_frames(void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount, ma_format format, ma_uint32 channels, float factor)
-{
-    switch (format)
-    {
-    case ma_format_u8:  ma_copy_and_apply_volume_factor_pcm_frames_u8 ((ma_uint8*)pFramesOut, (const ma_uint8*)pFramesIn, frameCount, channels, factor); return;
-    case ma_format_s16: ma_copy_and_apply_volume_factor_pcm_frames_s16((ma_int16*)pFramesOut, (const ma_int16*)pFramesIn, frameCount, channels, factor); return;
-    case ma_format_s24: ma_copy_and_apply_volume_factor_pcm_frames_s24(           pFramesOut,                  pFramesIn, frameCount, channels, factor); return;
-    case ma_format_s32: ma_copy_and_apply_volume_factor_pcm_frames_s32((ma_int32*)pFramesOut, (const ma_int32*)pFramesIn, frameCount, channels, factor); return;
-    case ma_format_f32: ma_copy_and_apply_volume_factor_pcm_frames_f32(   (float*)pFramesOut,    (const float*)pFramesIn, frameCount, channels, factor); return;
-    default: return;    /* Do nothing. */
-    }
-}
-
-MA_API void ma_apply_volume_factor_pcm_frames_u8(ma_uint8* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_pcm_frames_u8(pFrames, pFrames, frameCount, channels, factor);
-}
-
-MA_API void ma_apply_volume_factor_pcm_frames_s16(ma_int16* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_pcm_frames_s16(pFrames, pFrames, frameCount, channels, factor);
-}
-
-MA_API void ma_apply_volume_factor_pcm_frames_s24(void* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_pcm_frames_s24(pFrames, pFrames, frameCount, channels, factor);
-}
-
-MA_API void ma_apply_volume_factor_pcm_frames_s32(ma_int32* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_pcm_frames_s32(pFrames, pFrames, frameCount, channels, factor);
-}
-
-MA_API void ma_apply_volume_factor_pcm_frames_f32(float* pFrames, ma_uint64 frameCount, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_pcm_frames_f32(pFrames, pFrames, frameCount, channels, factor);
-}
-
-MA_API void ma_apply_volume_factor_pcm_frames(void* pFramesOut, ma_uint64 frameCount, ma_format format, ma_uint32 channels, float factor)
-{
-    ma_copy_and_apply_volume_factor_pcm_frames(pFramesOut, pFramesOut, frameCount, format, channels, factor);
-}
-
-
-MA_API void ma_copy_and_apply_volume_factor_per_channel_f32(float* pFramesOut, const float* pFramesIn, ma_uint64 frameCount, ma_uint32 channels, float* pChannelGains)
-{
-    ma_uint64 iFrame;
-
-    if (channels == 2) {
-        /* TODO: Do an optimized implementation for stereo and mono. Can do a SIMD optimized implementation as well. */
-    }
-
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            pFramesOut[iFrame * channels + iChannel] = pFramesIn[iFrame * channels + iChannel] * pChannelGains[iChannel];
-        }
-    }
-}
-
-
-
-static MA_INLINE ma_int16 ma_apply_volume_unclipped_u8(ma_int16 x, ma_int16 volume)
-{
-    return (ma_int16)(((ma_int32)x * (ma_int32)volume) >> 8);
-}
-
-static MA_INLINE ma_int32 ma_apply_volume_unclipped_s16(ma_int32 x, ma_int16 volume)
-{
-    return (ma_int32)((x * volume) >> 8);
-}
-
-static MA_INLINE ma_int64 ma_apply_volume_unclipped_s24(ma_int64 x, ma_int16 volume)
-{
-    return (ma_int64)((x * volume) >> 8);
-}
-
-static MA_INLINE ma_int64 ma_apply_volume_unclipped_s32(ma_int64 x, ma_int16 volume)
-{
-    return (ma_int64)((x * volume) >> 8);
-}
-
-static MA_INLINE float ma_apply_volume_unclipped_f32(float x, float volume)
-{
-    return x * volume;
-}
-
-
-MA_API void ma_copy_and_apply_volume_and_clip_samples_u8(ma_uint8* pDst, const ma_int16* pSrc, ma_uint64 count, float volume)
-{
-    ma_uint64 iSample;
-    ma_int16  volumeFixed;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    volumeFixed = ma_float_to_fixed_16(volume);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_u8(ma_apply_volume_unclipped_u8(pSrc[iSample], volumeFixed));
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_and_clip_samples_s16(ma_int16* pDst, const ma_int32* pSrc, ma_uint64 count, float volume)
-{
-    ma_uint64 iSample;
-    ma_int16  volumeFixed;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    volumeFixed = ma_float_to_fixed_16(volume);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_s16(ma_apply_volume_unclipped_s16(pSrc[iSample], volumeFixed));
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_and_clip_samples_s24(ma_uint8* pDst, const ma_int64* pSrc, ma_uint64 count, float volume)
-{
-    ma_uint64 iSample;
-    ma_int16  volumeFixed;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    volumeFixed = ma_float_to_fixed_16(volume);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        ma_int64 s = ma_clip_s24(ma_apply_volume_unclipped_s24(pSrc[iSample], volumeFixed));
-        pDst[iSample*3 + 0] = (ma_uint8)((s & 0x000000FF) >>  0);
-        pDst[iSample*3 + 1] = (ma_uint8)((s & 0x0000FF00) >>  8);
-        pDst[iSample*3 + 2] = (ma_uint8)((s & 0x00FF0000) >> 16);
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_and_clip_samples_s32(ma_int32* pDst, const ma_int64* pSrc, ma_uint64 count, float volume)
-{
-    ma_uint64 iSample;
-    ma_int16  volumeFixed;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    volumeFixed = ma_float_to_fixed_16(volume);
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_s32(ma_apply_volume_unclipped_s32(pSrc[iSample], volumeFixed));
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_and_clip_samples_f32(float* pDst, const float* pSrc, ma_uint64 count, float volume)
-{
-    ma_uint64 iSample;
-
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    /* For the f32 case we need to make sure this supports in-place processing where the input and output buffers are the same. */
-
-    for (iSample = 0; iSample < count; iSample += 1) {
-        pDst[iSample] = ma_clip_f32(ma_apply_volume_unclipped_f32(pSrc[iSample], volume));
-    }
-}
-
-MA_API void ma_copy_and_apply_volume_and_clip_pcm_frames(void* pDst, const void* pSrc, ma_uint64 frameCount, ma_format format, ma_uint32 channels, float volume)
-{
-    MA_ASSERT(pDst != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    if (volume == 1) {
-        ma_clip_pcm_frames(pDst, pSrc, frameCount, format, channels);   /* Optimized case for volume = 1. */
-    } else if (volume == 0) {
-        ma_silence_pcm_frames(pDst, frameCount, format, channels);      /* Optimized case for volume = 0. */
-    } else {
-        ma_uint64 sampleCount = frameCount * channels;
-
-        switch (format) {
-            case ma_format_u8:  ma_copy_and_apply_volume_and_clip_samples_u8( (ma_uint8*)pDst, (const ma_int16*)pSrc, sampleCount, volume); break;
-            case ma_format_s16: ma_copy_and_apply_volume_and_clip_samples_s16((ma_int16*)pDst, (const ma_int32*)pSrc, sampleCount, volume); break;
-            case ma_format_s24: ma_copy_and_apply_volume_and_clip_samples_s24((ma_uint8*)pDst, (const ma_int64*)pSrc, sampleCount, volume); break;
-            case ma_format_s32: ma_copy_and_apply_volume_and_clip_samples_s32((ma_int32*)pDst, (const ma_int64*)pSrc, sampleCount, volume); break;
-            case ma_format_f32: ma_copy_and_apply_volume_and_clip_samples_f32((   float*)pDst, (const    float*)pSrc, sampleCount, volume); break;
-
-            /* Do nothing if we don't know the format. We're including these here to silence a compiler warning about enums not being handled by the switch. */
-            case ma_format_unknown:
-            case ma_format_count:
-                break;
-        }
-    }
-}
-
-
-
-MA_API float ma_volume_linear_to_db(float factor)
-{
-    return 20*ma_log10f(factor);
-}
-
-MA_API float ma_volume_db_to_linear(float gain)
-{
-    return ma_powf(10, gain/20.0f);
-}
-
-
-MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
-{
-    ma_uint64 iSample;
-    ma_uint64 sampleCount;
-
-    if (pDst == NULL || pSrc == NULL || channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (volume == 0) {
-        return MA_SUCCESS;  /* No changes if the volume is 0. */
-    }
-
-    sampleCount = frameCount * channels;
-
-    if (volume == 1) {
-        for (iSample = 0; iSample < sampleCount; iSample += 1) {
-            pDst[iSample] += pSrc[iSample];
-        }
-    } else {
-        for (iSample = 0; iSample < sampleCount; iSample += 1) {
-            pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-Format Conversion
-
-**************************************************************************************************************************************************************/
-
-static MA_INLINE ma_int16 ma_pcm_sample_f32_to_s16(float x)
-{
-    return (ma_int16)(x * 32767.0f);
-}
-
-static MA_INLINE ma_int16 ma_pcm_sample_u8_to_s16_no_scale(ma_uint8 x)
-{
-    return (ma_int16)((ma_int16)x - 128);
-}
-
-static MA_INLINE ma_int64 ma_pcm_sample_s24_to_s32_no_scale(const ma_uint8* x)
-{
-    return (ma_int64)(((ma_uint64)x[0] << 40) | ((ma_uint64)x[1] << 48) | ((ma_uint64)x[2] << 56)) >> 40;  /* Make sure the sign bits are maintained. */
-}
-
-static MA_INLINE void ma_pcm_sample_s32_to_s24_no_scale(ma_int64 x, ma_uint8* s24)
-{
-    s24[0] = (ma_uint8)((x & 0x000000FF) >>  0);
-    s24[1] = (ma_uint8)((x & 0x0000FF00) >>  8);
-    s24[2] = (ma_uint8)((x & 0x00FF0000) >> 16);
-}
-
-
-/* u8 */
-MA_API void ma_pcm_u8_to_u8(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    (void)ditherMode;
-    ma_copy_memory_64(dst, src, count * sizeof(ma_uint8));
-}
-
-
-static MA_INLINE void ma_pcm_u8_to_s16__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_int16* dst_s16 = (ma_int16*)dst;
-    const ma_uint8* src_u8 = (const ma_uint8*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        ma_int16 x = src_u8[i];
-        x = (ma_int16)(x - 128);
-        x = (ma_int16)(x << 8);
-        dst_s16[i] = x;
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_u8_to_s16__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s16__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_u8_to_s16__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_u8_to_s16__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_u8_to_s16(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_u8_to_s16__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_u8_to_s16__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_u8_to_s16__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_u8_to_s24__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint8* dst_s24 = (ma_uint8*)dst;
-    const ma_uint8* src_u8 = (const ma_uint8*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        ma_int16 x = src_u8[i];
-        x = (ma_int16)(x - 128);
-
-        dst_s24[i*3+0] = 0;
-        dst_s24[i*3+1] = 0;
-        dst_s24[i*3+2] = (ma_uint8)((ma_int8)x);
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_u8_to_s24__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s24__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_u8_to_s24__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_u8_to_s24__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_u8_to_s24(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_u8_to_s24__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_u8_to_s24__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_u8_to_s24__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_u8_to_s32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_int32* dst_s32 = (ma_int32*)dst;
-    const ma_uint8* src_u8 = (const ma_uint8*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        ma_int32 x = src_u8[i];
-        x = x - 128;
-        x = x << 24;
-        dst_s32[i] = x;
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_u8_to_s32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_u8_to_s32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_u8_to_s32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_u8_to_s32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_u8_to_s32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_u8_to_s32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_u8_to_s32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_u8_to_f32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    float* dst_f32 = (float*)dst;
-    const ma_uint8* src_u8 = (const ma_uint8*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        float x = (float)src_u8[i];
-        x = x * 0.00784313725490196078f;    /* 0..255 to 0..2 */
-        x = x - 1;                          /* 0..2 to -1..1 */
-
-        dst_f32[i] = x;
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_u8_to_f32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_f32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_u8_to_f32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_u8_to_f32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_u8_to_f32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_u8_to_f32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_u8_to_f32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_u8_to_f32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-static MA_INLINE void ma_pcm_interleave_u8__reference(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_uint8* dst_u8 = (ma_uint8*)dst;
-    const ma_uint8** src_u8 = (const ma_uint8**)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_u8[iFrame*channels + iChannel] = src_u8[iChannel][iFrame];
-        }
-    }
-}
-#else
-static MA_INLINE void ma_pcm_interleave_u8__optimized(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_uint8* dst_u8 = (ma_uint8*)dst;
-    const ma_uint8** src_u8 = (const ma_uint8**)src;
-
-    if (channels == 1) {
-        ma_copy_memory_64(dst, src[0], frameCount * sizeof(ma_uint8));
-    } else if (channels == 2) {
-        ma_uint64 iFrame;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            dst_u8[iFrame*2 + 0] = src_u8[0][iFrame];
-            dst_u8[iFrame*2 + 1] = src_u8[1][iFrame];
-        }
-    } else {
-        ma_uint64 iFrame;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            ma_uint32 iChannel;
-            for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                dst_u8[iFrame*channels + iChannel] = src_u8[iChannel][iFrame];
-            }
-        }
-    }
-}
-#endif
-
-MA_API void ma_pcm_interleave_u8(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_interleave_u8__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_interleave_u8__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_deinterleave_u8__reference(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_uint8** dst_u8 = (ma_uint8**)dst;
-    const ma_uint8* src_u8 = (const ma_uint8*)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_u8[iChannel][iFrame] = src_u8[iFrame*channels + iChannel];
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_deinterleave_u8__optimized(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_deinterleave_u8__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_deinterleave_u8(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_deinterleave_u8__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_deinterleave_u8__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-/* s16 */
-static MA_INLINE void ma_pcm_s16_to_u8__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint8* dst_u8 = (ma_uint8*)dst;
-    const ma_int16* src_s16 = (const ma_int16*)src;
-
-    if (ditherMode == ma_dither_mode_none) {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int16 x = src_s16[i];
-            x = (ma_int16)(x >> 8);
-            x = (ma_int16)(x + 128);
-            dst_u8[i] = (ma_uint8)x;
-        }
-    } else {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int16 x = src_s16[i];
-
-            /* Dither. Don't overflow. */
-            ma_int32 dither = ma_dither_s32(ditherMode, -0x80, 0x7F);
-            if ((x + dither) <= 0x7FFF) {
-                x = (ma_int16)(x + dither);
-            } else {
-                x = 0x7FFF;
-            }
-
-            x = (ma_int16)(x >> 8);
-            x = (ma_int16)(x + 128);
-            dst_u8[i] = (ma_uint8)x;
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_s16_to_u8__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_u8__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s16_to_u8__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s16_to_u8__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s16_to_u8(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s16_to_u8__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s16_to_u8__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s16_to_u8__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-MA_API void ma_pcm_s16_to_s16(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    (void)ditherMode;
-    ma_copy_memory_64(dst, src, count * sizeof(ma_int16));
-}
-
-
-static MA_INLINE void ma_pcm_s16_to_s24__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint8* dst_s24 = (ma_uint8*)dst;
-    const ma_int16* src_s16 = (const ma_int16*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        dst_s24[i*3+0] = 0;
-        dst_s24[i*3+1] = (ma_uint8)(src_s16[i] & 0xFF);
-        dst_s24[i*3+2] = (ma_uint8)(src_s16[i] >> 8);
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_s16_to_s24__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_s24__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s16_to_s24__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s16_to_s24__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s16_to_s24(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s16_to_s24__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s16_to_s24__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s16_to_s24__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_s16_to_s32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_int32* dst_s32 = (ma_int32*)dst;
-    const ma_int16* src_s16 = (const ma_int16*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        dst_s32[i] = src_s16[i] << 16;
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_s16_to_s32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_s32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s16_to_s32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s16_to_s32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s16_to_s32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s16_to_s32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s16_to_s32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s16_to_s32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_s16_to_f32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    float* dst_f32 = (float*)dst;
-    const ma_int16* src_s16 = (const ma_int16*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        float x = (float)src_s16[i];
-
-#if 0
-        /* The accurate way. */
-        x = x + 32768.0f;                   /* -32768..32767 to 0..65535 */
-        x = x * 0.00003051804379339284f;    /* 0..65535 to 0..2 */
-        x = x - 1;                          /* 0..2 to -1..1 */
-#else
-        /* The fast way. */
-        x = x * 0.000030517578125f;         /* -32768..32767 to -1..0.999969482421875 */
-#endif
-
-        dst_f32[i] = x;
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_s16_to_f32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_f32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s16_to_f32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s16_to_f32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s16_to_f32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s16_to_f32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s16_to_f32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s16_to_f32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_interleave_s16__reference(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_int16* dst_s16 = (ma_int16*)dst;
-    const ma_int16** src_s16 = (const ma_int16**)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_s16[iFrame*channels + iChannel] = src_s16[iChannel][iFrame];
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_interleave_s16__optimized(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_interleave_s16__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_interleave_s16(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_interleave_s16__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_interleave_s16__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_deinterleave_s16__reference(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_int16** dst_s16 = (ma_int16**)dst;
-    const ma_int16* src_s16 = (const ma_int16*)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_s16[iChannel][iFrame] = src_s16[iFrame*channels + iChannel];
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_deinterleave_s16__optimized(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_deinterleave_s16__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_deinterleave_s16(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_deinterleave_s16__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_deinterleave_s16__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-/* s24 */
-static MA_INLINE void ma_pcm_s24_to_u8__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint8* dst_u8 = (ma_uint8*)dst;
-    const ma_uint8* src_s24 = (const ma_uint8*)src;
-
-    if (ditherMode == ma_dither_mode_none) {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            dst_u8[i] = (ma_uint8)((ma_int8)src_s24[i*3 + 2] + 128);
-        }
-    } else {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int32 x = (ma_int32)(((ma_uint32)(src_s24[i*3+0]) << 8) | ((ma_uint32)(src_s24[i*3+1]) << 16) | ((ma_uint32)(src_s24[i*3+2])) << 24);
-
-            /* Dither. Don't overflow. */
-            ma_int32 dither = ma_dither_s32(ditherMode, -0x800000, 0x7FFFFF);
-            if ((ma_int64)x + dither <= 0x7FFFFFFF) {
-                x = x + dither;
-            } else {
-                x = 0x7FFFFFFF;
-            }
-
-            x = x >> 24;
-            x = x + 128;
-            dst_u8[i] = (ma_uint8)x;
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_s24_to_u8__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_u8__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s24_to_u8__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s24_to_u8__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s24_to_u8(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s24_to_u8__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s24_to_u8__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s24_to_u8__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_s24_to_s16__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_int16* dst_s16 = (ma_int16*)dst;
-    const ma_uint8* src_s24 = (const ma_uint8*)src;
-
-    if (ditherMode == ma_dither_mode_none) {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_uint16 dst_lo =            ((ma_uint16)src_s24[i*3 + 1]);
-            ma_uint16 dst_hi = (ma_uint16)((ma_uint16)src_s24[i*3 + 2] << 8);
-            dst_s16[i] = (ma_int16)(dst_lo | dst_hi);
-        }
-    } else {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int32 x = (ma_int32)(((ma_uint32)(src_s24[i*3+0]) << 8) | ((ma_uint32)(src_s24[i*3+1]) << 16) | ((ma_uint32)(src_s24[i*3+2])) << 24);
-
-            /* Dither. Don't overflow. */
-            ma_int32 dither = ma_dither_s32(ditherMode, -0x8000, 0x7FFF);
-            if ((ma_int64)x + dither <= 0x7FFFFFFF) {
-                x = x + dither;
-            } else {
-                x = 0x7FFFFFFF;
-            }
-
-            x = x >> 16;
-            dst_s16[i] = (ma_int16)x;
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_s24_to_s16__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_s16__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s24_to_s16__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s24_to_s16__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s24_to_s16(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s24_to_s16__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s24_to_s16__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s24_to_s16__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-MA_API void ma_pcm_s24_to_s24(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    (void)ditherMode;
-
-    ma_copy_memory_64(dst, src, count * 3);
-}
-
-
-static MA_INLINE void ma_pcm_s24_to_s32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_int32* dst_s32 = (ma_int32*)dst;
-    const ma_uint8* src_s24 = (const ma_uint8*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        dst_s32[i] = (ma_int32)(((ma_uint32)(src_s24[i*3+0]) << 8) | ((ma_uint32)(src_s24[i*3+1]) << 16) | ((ma_uint32)(src_s24[i*3+2])) << 24);
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_s24_to_s32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_s32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s24_to_s32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s24_to_s32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s24_to_s32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s24_to_s32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s24_to_s32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s24_to_s32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_s24_to_f32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    float* dst_f32 = (float*)dst;
-    const ma_uint8* src_s24 = (const ma_uint8*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        float x = (float)(((ma_int32)(((ma_uint32)(src_s24[i*3+0]) << 8) | ((ma_uint32)(src_s24[i*3+1]) << 16) | ((ma_uint32)(src_s24[i*3+2])) << 24)) >> 8);
-
-#if 0
-        /* The accurate way. */
-        x = x + 8388608.0f;                 /* -8388608..8388607 to 0..16777215 */
-        x = x * 0.00000011920929665621f;    /* 0..16777215 to 0..2 */
-        x = x - 1;                          /* 0..2 to -1..1 */
-#else
-        /* The fast way. */
-        x = x * 0.00000011920928955078125f; /* -8388608..8388607 to -1..0.999969482421875 */
-#endif
-
-        dst_f32[i] = x;
-    }
-
-    (void)ditherMode;
-}
-
-static MA_INLINE void ma_pcm_s24_to_f32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_f32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s24_to_f32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s24_to_f32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s24_to_f32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s24_to_f32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s24_to_f32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s24_to_f32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_interleave_s24__reference(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_uint8* dst8 = (ma_uint8*)dst;
-    const ma_uint8** src8 = (const ma_uint8**)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst8[iFrame*3*channels + iChannel*3 + 0] = src8[iChannel][iFrame*3 + 0];
-            dst8[iFrame*3*channels + iChannel*3 + 1] = src8[iChannel][iFrame*3 + 1];
-            dst8[iFrame*3*channels + iChannel*3 + 2] = src8[iChannel][iFrame*3 + 2];
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_interleave_s24__optimized(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_interleave_s24__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_interleave_s24(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_interleave_s24__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_interleave_s24__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_deinterleave_s24__reference(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_uint8** dst8 = (ma_uint8**)dst;
-    const ma_uint8* src8 = (const ma_uint8*)src;
-
-    ma_uint32 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst8[iChannel][iFrame*3 + 0] = src8[iFrame*3*channels + iChannel*3 + 0];
-            dst8[iChannel][iFrame*3 + 1] = src8[iFrame*3*channels + iChannel*3 + 1];
-            dst8[iChannel][iFrame*3 + 2] = src8[iFrame*3*channels + iChannel*3 + 2];
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_deinterleave_s24__optimized(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_deinterleave_s24__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_deinterleave_s24(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_deinterleave_s24__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_deinterleave_s24__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-
-/* s32 */
-static MA_INLINE void ma_pcm_s32_to_u8__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint8* dst_u8 = (ma_uint8*)dst;
-    const ma_int32* src_s32 = (const ma_int32*)src;
-
-    if (ditherMode == ma_dither_mode_none) {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int32 x = src_s32[i];
-            x = x >> 24;
-            x = x + 128;
-            dst_u8[i] = (ma_uint8)x;
-        }
-    } else {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int32 x = src_s32[i];
-
-            /* Dither. Don't overflow. */
-            ma_int32 dither = ma_dither_s32(ditherMode, -0x800000, 0x7FFFFF);
-            if ((ma_int64)x + dither <= 0x7FFFFFFF) {
-                x = x + dither;
-            } else {
-                x = 0x7FFFFFFF;
-            }
-
-            x = x >> 24;
-            x = x + 128;
-            dst_u8[i] = (ma_uint8)x;
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_s32_to_u8__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_u8__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s32_to_u8__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s32_to_u8__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s32_to_u8(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s32_to_u8__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s32_to_u8__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s32_to_u8__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_s32_to_s16__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_int16* dst_s16 = (ma_int16*)dst;
-    const ma_int32* src_s32 = (const ma_int32*)src;
-
-    if (ditherMode == ma_dither_mode_none) {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int32 x = src_s32[i];
-            x = x >> 16;
-            dst_s16[i] = (ma_int16)x;
-        }
-    } else {
-        ma_uint64 i;
-        for (i = 0; i < count; i += 1) {
-            ma_int32 x = src_s32[i];
-
-            /* Dither. Don't overflow. */
-            ma_int32 dither = ma_dither_s32(ditherMode, -0x8000, 0x7FFF);
-            if ((ma_int64)x + dither <= 0x7FFFFFFF) {
-                x = x + dither;
-            } else {
-                x = 0x7FFFFFFF;
-            }
-
-            x = x >> 16;
-            dst_s16[i] = (ma_int16)x;
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_s32_to_s16__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_s16__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s32_to_s16__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s32_to_s16__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s32_to_s16(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s32_to_s16__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s32_to_s16__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s32_to_s16__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_s32_to_s24__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint8* dst_s24 = (ma_uint8*)dst;
-    const ma_int32* src_s32 = (const ma_int32*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        ma_uint32 x = (ma_uint32)src_s32[i];
-        dst_s24[i*3+0] = (ma_uint8)((x & 0x0000FF00) >>  8);
-        dst_s24[i*3+1] = (ma_uint8)((x & 0x00FF0000) >> 16);
-        dst_s24[i*3+2] = (ma_uint8)((x & 0xFF000000) >> 24);
-    }
-
-    (void)ditherMode;   /* No dithering for s32 -> s24. */
-}
-
-static MA_INLINE void ma_pcm_s32_to_s24__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_s24__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s32_to_s24__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s32_to_s24__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s32_to_s24(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s32_to_s24__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s32_to_s24__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s32_to_s24__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-MA_API void ma_pcm_s32_to_s32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    (void)ditherMode;
-
-    ma_copy_memory_64(dst, src, count * sizeof(ma_int32));
-}
-
-
-static MA_INLINE void ma_pcm_s32_to_f32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    float* dst_f32 = (float*)dst;
-    const ma_int32* src_s32 = (const ma_int32*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        double x = src_s32[i];
-
-#if 0
-        x = x + 2147483648.0;
-        x = x * 0.0000000004656612873077392578125;
-        x = x - 1;
-#else
-        x = x / 2147483648.0;
-#endif
-
-        dst_f32[i] = (float)x;
-    }
-
-    (void)ditherMode;   /* No dithering for s32 -> f32. */
-}
-
-static MA_INLINE void ma_pcm_s32_to_f32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_f32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_s32_to_f32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_s32_to_f32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_s32_to_f32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_s32_to_f32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_s32_to_f32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_s32_to_f32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_interleave_s32__reference(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_int32* dst_s32 = (ma_int32*)dst;
-    const ma_int32** src_s32 = (const ma_int32**)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_s32[iFrame*channels + iChannel] = src_s32[iChannel][iFrame];
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_interleave_s32__optimized(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_interleave_s32__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_interleave_s32(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_interleave_s32__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_interleave_s32__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_deinterleave_s32__reference(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_int32** dst_s32 = (ma_int32**)dst;
-    const ma_int32* src_s32 = (const ma_int32*)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_s32[iChannel][iFrame] = src_s32[iFrame*channels + iChannel];
-        }
-    }
-}
-
-static MA_INLINE void ma_pcm_deinterleave_s32__optimized(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_deinterleave_s32__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_deinterleave_s32(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_deinterleave_s32__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_deinterleave_s32__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-/* f32 */
-static MA_INLINE void ma_pcm_f32_to_u8__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint64 i;
-
-    ma_uint8* dst_u8 = (ma_uint8*)dst;
-    const float* src_f32 = (const float*)src;
-
-    float ditherMin = 0;
-    float ditherMax = 0;
-    if (ditherMode != ma_dither_mode_none) {
-        ditherMin = 1.0f / -128;
-        ditherMax = 1.0f /  127;
-    }
-
-    for (i = 0; i < count; i += 1) {
-        float x = src_f32[i];
-        x = x + ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    /* clip */
-        x = x + 1;                                  /* -1..1 to 0..2 */
-        x = x * 127.5f;                             /* 0..2 to 0..255 */
-
-        dst_u8[i] = (ma_uint8)x;
-    }
-}
-
-static MA_INLINE void ma_pcm_f32_to_u8__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_u8__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_f32_to_u8__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_f32_to_u8__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_f32_to_u8(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_f32_to_u8__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_f32_to_u8__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_f32_to_u8__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-static MA_INLINE void ma_pcm_f32_to_s16__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint64 i;
-
-    ma_int16* dst_s16 = (ma_int16*)dst;
-    const float* src_f32 = (const float*)src;
-
-    float ditherMin = 0;
-    float ditherMax = 0;
-    if (ditherMode != ma_dither_mode_none) {
-        ditherMin = 1.0f / -32768;
-        ditherMax = 1.0f /  32767;
-    }
-
-    for (i = 0; i < count; i += 1) {
-        float x = src_f32[i];
-        x = x + ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    /* clip */
-
-#if 0
-        /* The accurate way. */
-        x = x + 1;                                  /* -1..1 to 0..2 */
-        x = x * 32767.5f;                           /* 0..2 to 0..65535 */
-        x = x - 32768.0f;                           /* 0...65535 to -32768..32767 */
-#else
-        /* The fast way. */
-        x = x * 32767.0f;                           /* -1..1 to -32767..32767 */
-#endif
-
-        dst_s16[i] = (ma_int16)x;
-    }
-}
-#else
-static MA_INLINE void ma_pcm_f32_to_s16__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint64 i;
-    ma_uint64 i4;
-    ma_uint64 count4;
-
-    ma_int16* dst_s16 = (ma_int16*)dst;
-    const float* src_f32 = (const float*)src;
-
-    float ditherMin = 0;
-    float ditherMax = 0;
-    if (ditherMode != ma_dither_mode_none) {
-        ditherMin = 1.0f / -32768;
-        ditherMax = 1.0f /  32767;
-    }
-
-    /* Unrolled. */
-    i = 0;
-    count4 = count >> 2;
-    for (i4 = 0; i4 < count4; i4 += 1) {
-        float d0 = ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        float d1 = ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        float d2 = ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        float d3 = ma_dither_f32(ditherMode, ditherMin, ditherMax);
-
-        float x0 = src_f32[i+0];
-        float x1 = src_f32[i+1];
-        float x2 = src_f32[i+2];
-        float x3 = src_f32[i+3];
-
-        x0 = x0 + d0;
-        x1 = x1 + d1;
-        x2 = x2 + d2;
-        x3 = x3 + d3;
-
-        x0 = ((x0 < -1) ? -1 : ((x0 > 1) ? 1 : x0));
-        x1 = ((x1 < -1) ? -1 : ((x1 > 1) ? 1 : x1));
-        x2 = ((x2 < -1) ? -1 : ((x2 > 1) ? 1 : x2));
-        x3 = ((x3 < -1) ? -1 : ((x3 > 1) ? 1 : x3));
-
-        x0 = x0 * 32767.0f;
-        x1 = x1 * 32767.0f;
-        x2 = x2 * 32767.0f;
-        x3 = x3 * 32767.0f;
-
-        dst_s16[i+0] = (ma_int16)x0;
-        dst_s16[i+1] = (ma_int16)x1;
-        dst_s16[i+2] = (ma_int16)x2;
-        dst_s16[i+3] = (ma_int16)x3;
-
-        i += 4;
-    }
-
-    /* Leftover. */
-    for (; i < count; i += 1) {
-        float x = src_f32[i];
-        x = x + ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    /* clip */
-        x = x * 32767.0f;                           /* -1..1 to -32767..32767 */
-
-        dst_s16[i] = (ma_int16)x;
-    }
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_f32_to_s16__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint64 i;
-    ma_uint64 i8;
-    ma_uint64 count8;
-    ma_int16* dst_s16;
-    const float* src_f32;
-    float ditherMin;
-    float ditherMax;
-
-    /* Both the input and output buffers need to be aligned to 16 bytes. */
-    if ((((ma_uintptr)dst & 15) != 0) || (((ma_uintptr)src & 15) != 0)) {
-        ma_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
-        return;
-    }
-
-    dst_s16 = (ma_int16*)dst;
-    src_f32 = (const float*)src;
-
-    ditherMin = 0;
-    ditherMax = 0;
-    if (ditherMode != ma_dither_mode_none) {
-        ditherMin = 1.0f / -32768;
-        ditherMax = 1.0f /  32767;
-    }
-
-    i = 0;
-
-    /* SSE2. SSE allows us to output 8 s16's at a time which means our loop is unrolled 8 times. */
-    count8 = count >> 3;
-    for (i8 = 0; i8 < count8; i8 += 1) {
-        __m128 d0;
-        __m128 d1;
-        __m128 x0;
-        __m128 x1;
-
-        if (ditherMode == ma_dither_mode_none) {
-            d0 = _mm_set1_ps(0);
-            d1 = _mm_set1_ps(0);
-        } else if (ditherMode == ma_dither_mode_rectangle) {
-            d0 = _mm_set_ps(
-                ma_dither_f32_rectangle(ditherMin, ditherMax),
-                ma_dither_f32_rectangle(ditherMin, ditherMax),
-                ma_dither_f32_rectangle(ditherMin, ditherMax),
-                ma_dither_f32_rectangle(ditherMin, ditherMax)
-            );
-            d1 = _mm_set_ps(
-                ma_dither_f32_rectangle(ditherMin, ditherMax),
-                ma_dither_f32_rectangle(ditherMin, ditherMax),
-                ma_dither_f32_rectangle(ditherMin, ditherMax),
-                ma_dither_f32_rectangle(ditherMin, ditherMax)
-            );
-        } else {
-            d0 = _mm_set_ps(
-                ma_dither_f32_triangle(ditherMin, ditherMax),
-                ma_dither_f32_triangle(ditherMin, ditherMax),
-                ma_dither_f32_triangle(ditherMin, ditherMax),
-                ma_dither_f32_triangle(ditherMin, ditherMax)
-            );
-            d1 = _mm_set_ps(
-                ma_dither_f32_triangle(ditherMin, ditherMax),
-                ma_dither_f32_triangle(ditherMin, ditherMax),
-                ma_dither_f32_triangle(ditherMin, ditherMax),
-                ma_dither_f32_triangle(ditherMin, ditherMax)
-            );
-        }
-
-        x0 = *((__m128*)(src_f32 + i) + 0);
-        x1 = *((__m128*)(src_f32 + i) + 1);
-
-        x0 = _mm_add_ps(x0, d0);
-        x1 = _mm_add_ps(x1, d1);
-
-        x0 = _mm_mul_ps(x0, _mm_set1_ps(32767.0f));
-        x1 = _mm_mul_ps(x1, _mm_set1_ps(32767.0f));
-
-        _mm_stream_si128(((__m128i*)(dst_s16 + i)), _mm_packs_epi32(_mm_cvttps_epi32(x0), _mm_cvttps_epi32(x1)));
-
-        i += 8;
-    }
-
-
-    /* Leftover. */
-    for (; i < count; i += 1) {
-        float x = src_f32[i];
-        x = x + ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    /* clip */
-        x = x * 32767.0f;                           /* -1..1 to -32767..32767 */
-
-        dst_s16[i] = (ma_int16)x;
-    }
-}
-#endif  /* SSE2 */
-
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_f32_to_s16__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint64 i;
-    ma_uint64 i8;
-    ma_uint64 count8;
-    ma_int16* dst_s16;
-    const float* src_f32;
-    float ditherMin;
-    float ditherMax;
-
-    if (!ma_has_neon()) {
-        ma_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
-        return;
-    }
-
-    /* Both the input and output buffers need to be aligned to 16 bytes. */
-    if ((((ma_uintptr)dst & 15) != 0) || (((ma_uintptr)src & 15) != 0)) {
-        ma_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
-        return;
-    }
-
-    dst_s16 = (ma_int16*)dst;
-    src_f32 = (const float*)src;
-
-    ditherMin = 0;
-    ditherMax = 0;
-    if (ditherMode != ma_dither_mode_none) {
-        ditherMin = 1.0f / -32768;
-        ditherMax = 1.0f /  32767;
-    }
-
-    i = 0;
-
-    /* NEON. NEON allows us to output 8 s16's at a time which means our loop is unrolled 8 times. */
-    count8 = count >> 3;
-    for (i8 = 0; i8 < count8; i8 += 1) {
-        float32x4_t d0;
-        float32x4_t d1;
-        float32x4_t x0;
-        float32x4_t x1;
-        int32x4_t i0;
-        int32x4_t i1;
-
-        if (ditherMode == ma_dither_mode_none) {
-            d0 = vmovq_n_f32(0);
-            d1 = vmovq_n_f32(0);
-        } else if (ditherMode == ma_dither_mode_rectangle) {
-            float d0v[4];
-            float d1v[4];
-
-            d0v[0] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d0v[1] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d0v[2] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d0v[3] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d0 = vld1q_f32(d0v);
-
-            d1v[0] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d1v[1] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d1v[2] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d1v[3] = ma_dither_f32_rectangle(ditherMin, ditherMax);
-            d1 = vld1q_f32(d1v);
-        } else {
-            float d0v[4];
-            float d1v[4];
-
-            d0v[0] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d0v[1] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d0v[2] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d0v[3] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d0 = vld1q_f32(d0v);
-
-            d1v[0] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d1v[1] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d1v[2] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d1v[3] = ma_dither_f32_triangle(ditherMin, ditherMax);
-            d1 = vld1q_f32(d1v);
-        }
-
-        x0 = *((float32x4_t*)(src_f32 + i) + 0);
-        x1 = *((float32x4_t*)(src_f32 + i) + 1);
-
-        x0 = vaddq_f32(x0, d0);
-        x1 = vaddq_f32(x1, d1);
-
-        x0 = vmulq_n_f32(x0, 32767.0f);
-        x1 = vmulq_n_f32(x1, 32767.0f);
-
-        i0 = vcvtq_s32_f32(x0);
-        i1 = vcvtq_s32_f32(x1);
-        *((int16x8_t*)(dst_s16 + i)) = vcombine_s16(vqmovn_s32(i0), vqmovn_s32(i1));
-
-        i += 8;
-    }
-
-
-    /* Leftover. */
-    for (; i < count; i += 1) {
-        float x = src_f32[i];
-        x = x + ma_dither_f32(ditherMode, ditherMin, ditherMax);
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    /* clip */
-        x = x * 32767.0f;                           /* -1..1 to -32767..32767 */
-
-        dst_s16[i] = (ma_int16)x;
-    }
-}
-#endif  /* Neon */
-#endif  /* MA_USE_REFERENCE_CONVERSION_APIS */
-
-MA_API void ma_pcm_f32_to_s16(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_f32_to_s16__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_f32_to_s16__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_f32_to_s16__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_f32_to_s24__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_uint8* dst_s24 = (ma_uint8*)dst;
-    const float* src_f32 = (const float*)src;
-
-    ma_uint64 i;
-    for (i = 0; i < count; i += 1) {
-        ma_int32 r;
-        float x = src_f32[i];
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    /* clip */
-
-#if 0
-        /* The accurate way. */
-        x = x + 1;                                  /* -1..1 to 0..2 */
-        x = x * 8388607.5f;                         /* 0..2 to 0..16777215 */
-        x = x - 8388608.0f;                         /* 0..16777215 to -8388608..8388607 */
-#else
-        /* The fast way. */
-        x = x * 8388607.0f;                         /* -1..1 to -8388607..8388607 */
-#endif
-
-        r = (ma_int32)x;
-        dst_s24[(i*3)+0] = (ma_uint8)((r & 0x0000FF) >>  0);
-        dst_s24[(i*3)+1] = (ma_uint8)((r & 0x00FF00) >>  8);
-        dst_s24[(i*3)+2] = (ma_uint8)((r & 0xFF0000) >> 16);
-    }
-
-    (void)ditherMode;   /* No dithering for f32 -> s24. */
-}
-
-static MA_INLINE void ma_pcm_f32_to_s24__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_s24__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_f32_to_s24__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_f32_to_s24__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_f32_to_s24(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_f32_to_s24__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_f32_to_s24__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_f32_to_s24__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-static MA_INLINE void ma_pcm_f32_to_s32__reference(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_int32* dst_s32 = (ma_int32*)dst;
-    const float* src_f32 = (const float*)src;
-
-    ma_uint32 i;
-    for (i = 0; i < count; i += 1) {
-        double x = src_f32[i];
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    /* clip */
-
-#if 0
-        /* The accurate way. */
-        x = x + 1;                                  /* -1..1 to 0..2 */
-        x = x * 2147483647.5;                       /* 0..2 to 0..4294967295 */
-        x = x - 2147483648.0;                       /* 0...4294967295 to -2147483648..2147483647 */
-#else
-        /* The fast way. */
-        x = x * 2147483647.0;                       /* -1..1 to -2147483647..2147483647 */
-#endif
-
-        dst_s32[i] = (ma_int32)x;
-    }
-
-    (void)ditherMode;   /* No dithering for f32 -> s32. */
-}
-
-static MA_INLINE void ma_pcm_f32_to_s32__optimized(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_s32__reference(dst, src, count, ditherMode);
-}
-
-#if defined(MA_SUPPORT_SSE2)
-static MA_INLINE void ma_pcm_f32_to_s32__sse2(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-#if defined(MA_SUPPORT_NEON)
-static MA_INLINE void ma_pcm_f32_to_s32__neon(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    ma_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
-}
-#endif
-
-MA_API void ma_pcm_f32_to_s32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_f32_to_s32__reference(dst, src, count, ditherMode);
-#else
-    #  if defined(MA_SUPPORT_SSE2)
-        if (ma_has_sse2()) {
-            ma_pcm_f32_to_s32__sse2(dst, src, count, ditherMode);
-        } else
-    #elif defined(MA_SUPPORT_NEON)
-        if (ma_has_neon()) {
-            ma_pcm_f32_to_s32__neon(dst, src, count, ditherMode);
-        } else
-    #endif
-        {
-            ma_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
-        }
-#endif
-}
-
-
-MA_API void ma_pcm_f32_to_f32(void* dst, const void* src, ma_uint64 count, ma_dither_mode ditherMode)
-{
-    (void)ditherMode;
-
-    ma_copy_memory_64(dst, src, count * sizeof(float));
-}
-
-
-static void ma_pcm_interleave_f32__reference(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    float* dst_f32 = (float*)dst;
-    const float** src_f32 = (const float**)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_f32[iFrame*channels + iChannel] = src_f32[iChannel][iFrame];
-        }
-    }
-}
-
-static void ma_pcm_interleave_f32__optimized(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_interleave_f32__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_interleave_f32(void* dst, const void** src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_interleave_f32__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_interleave_f32__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-static void ma_pcm_deinterleave_f32__reference(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    float** dst_f32 = (float**)dst;
-    const float* src_f32 = (const float*)src;
-
-    ma_uint64 iFrame;
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; iChannel += 1) {
-            dst_f32[iChannel][iFrame] = src_f32[iFrame*channels + iChannel];
-        }
-    }
-}
-
-static void ma_pcm_deinterleave_f32__optimized(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-    ma_pcm_deinterleave_f32__reference(dst, src, frameCount, channels);
-}
-
-MA_API void ma_pcm_deinterleave_f32(void** dst, const void* src, ma_uint64 frameCount, ma_uint32 channels)
-{
-#ifdef MA_USE_REFERENCE_CONVERSION_APIS
-    ma_pcm_deinterleave_f32__reference(dst, src, frameCount, channels);
-#else
-    ma_pcm_deinterleave_f32__optimized(dst, src, frameCount, channels);
-#endif
-}
-
-
-MA_API void ma_pcm_convert(void* pOut, ma_format formatOut, const void* pIn, ma_format formatIn, ma_uint64 sampleCount, ma_dither_mode ditherMode)
-{
-    if (formatOut == formatIn) {
-        ma_copy_memory_64(pOut, pIn, sampleCount * ma_get_bytes_per_sample(formatOut));
-        return;
-    }
-
-    switch (formatIn)
-    {
-        case ma_format_u8:
-        {
-            switch (formatOut)
-            {
-                case ma_format_s16: ma_pcm_u8_to_s16(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s24: ma_pcm_u8_to_s24(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s32: ma_pcm_u8_to_s32(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_f32: ma_pcm_u8_to_f32(pOut, pIn, sampleCount, ditherMode); return;
-                default: break;
-            }
-        } break;
-
-        case ma_format_s16:
-        {
-            switch (formatOut)
-            {
-                case ma_format_u8:  ma_pcm_s16_to_u8( pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s24: ma_pcm_s16_to_s24(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s32: ma_pcm_s16_to_s32(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_f32: ma_pcm_s16_to_f32(pOut, pIn, sampleCount, ditherMode); return;
-                default: break;
-            }
-        } break;
-
-        case ma_format_s24:
-        {
-            switch (formatOut)
-            {
-                case ma_format_u8:  ma_pcm_s24_to_u8( pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s16: ma_pcm_s24_to_s16(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s32: ma_pcm_s24_to_s32(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_f32: ma_pcm_s24_to_f32(pOut, pIn, sampleCount, ditherMode); return;
-                default: break;
-            }
-        } break;
-
-        case ma_format_s32:
-        {
-            switch (formatOut)
-            {
-                case ma_format_u8:  ma_pcm_s32_to_u8( pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s16: ma_pcm_s32_to_s16(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s24: ma_pcm_s32_to_s24(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_f32: ma_pcm_s32_to_f32(pOut, pIn, sampleCount, ditherMode); return;
-                default: break;
-            }
-        } break;
-
-        case ma_format_f32:
-        {
-            switch (formatOut)
-            {
-                case ma_format_u8:  ma_pcm_f32_to_u8( pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s16: ma_pcm_f32_to_s16(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s24: ma_pcm_f32_to_s24(pOut, pIn, sampleCount, ditherMode); return;
-                case ma_format_s32: ma_pcm_f32_to_s32(pOut, pIn, sampleCount, ditherMode); return;
-                default: break;
-            }
-        } break;
-
-        default: break;
-    }
-}
-
-MA_API void ma_convert_pcm_frames_format(void* pOut, ma_format formatOut, const void* pIn, ma_format formatIn, ma_uint64 frameCount, ma_uint32 channels, ma_dither_mode ditherMode)
-{
-    ma_pcm_convert(pOut, formatOut, pIn, formatIn, frameCount * channels, ditherMode);
-}
-
-MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void* pInterleavedPCMFrames, void** ppDeinterleavedPCMFrames)
-{
-    if (pInterleavedPCMFrames == NULL || ppDeinterleavedPCMFrames == NULL) {
-        return; /* Invalid args. */
-    }
-
-    /* For efficiency we do this per format. */
-    switch (format) {
-        case ma_format_s16:
-        {
-            const ma_int16* pSrcS16 = (const ma_int16*)pInterleavedPCMFrames;
-            ma_uint64 iPCMFrame;
-            for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < channels; ++iChannel) {
-                    ma_int16* pDstS16 = (ma_int16*)ppDeinterleavedPCMFrames[iChannel];
-                    pDstS16[iPCMFrame] = pSrcS16[iPCMFrame*channels+iChannel];
-                }
-            }
-        } break;
-
-        case ma_format_f32:
-        {
-            const float* pSrcF32 = (const float*)pInterleavedPCMFrames;
-            ma_uint64 iPCMFrame;
-            for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < channels; ++iChannel) {
-                    float* pDstF32 = (float*)ppDeinterleavedPCMFrames[iChannel];
-                    pDstF32[iPCMFrame] = pSrcF32[iPCMFrame*channels+iChannel];
-                }
-            }
-        } break;
-
-        default:
-        {
-            ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format);
-            ma_uint64 iPCMFrame;
-            for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < channels; ++iChannel) {
-                          void* pDst = ma_offset_ptr(ppDeinterleavedPCMFrames[iChannel], iPCMFrame*sampleSizeInBytes);
-                    const void* pSrc = ma_offset_ptr(pInterleavedPCMFrames, (iPCMFrame*channels+iChannel)*sampleSizeInBytes);
-                    memcpy(pDst, pSrc, sampleSizeInBytes);
-                }
-            }
-        } break;
-    }
-}
-
-MA_API void ma_interleave_pcm_frames(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void** ppDeinterleavedPCMFrames, void* pInterleavedPCMFrames)
-{
-    switch (format)
-    {
-        case ma_format_s16:
-        {
-            ma_int16* pDstS16 = (ma_int16*)pInterleavedPCMFrames;
-            ma_uint64 iPCMFrame;
-            for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < channels; ++iChannel) {
-                    const ma_int16* pSrcS16 = (const ma_int16*)ppDeinterleavedPCMFrames[iChannel];
-                    pDstS16[iPCMFrame*channels+iChannel] = pSrcS16[iPCMFrame];
-                }
-            }
-        } break;
-
-        case ma_format_f32:
-        {
-            float* pDstF32 = (float*)pInterleavedPCMFrames;
-            ma_uint64 iPCMFrame;
-            for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < channels; ++iChannel) {
-                    const float* pSrcF32 = (const float*)ppDeinterleavedPCMFrames[iChannel];
-                    pDstF32[iPCMFrame*channels+iChannel] = pSrcF32[iPCMFrame];
-                }
-            }
-        } break;
-
-        default:
-        {
-            ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format);
-            ma_uint64 iPCMFrame;
-            for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < channels; ++iChannel) {
-                          void* pDst = ma_offset_ptr(pInterleavedPCMFrames, (iPCMFrame*channels+iChannel)*sampleSizeInBytes);
-                    const void* pSrc = ma_offset_ptr(ppDeinterleavedPCMFrames[iChannel], iPCMFrame*sampleSizeInBytes);
-                    memcpy(pDst, pSrc, sampleSizeInBytes);
-                }
-            }
-        } break;
-    }
-}
-
-
-/**************************************************************************************************************************************************************
-
-Biquad Filter
-
-**************************************************************************************************************************************************************/
-#ifndef MA_BIQUAD_FIXED_POINT_SHIFT
-#define MA_BIQUAD_FIXED_POINT_SHIFT 14
-#endif
-
-static ma_int32 ma_biquad_float_to_fp(double x)
-{
-    return (ma_int32)(x * (1 << MA_BIQUAD_FIXED_POINT_SHIFT));
-}
-
-MA_API ma_biquad_config ma_biquad_config_init(ma_format format, ma_uint32 channels, double b0, double b1, double b2, double a0, double a1, double a2)
-{
-    ma_biquad_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format = format;
-    config.channels = channels;
-    config.b0 = b0;
-    config.b1 = b1;
-    config.b2 = b2;
-    config.a0 = a0;
-    config.a1 = a1;
-    config.a2 = a2;
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t r1Offset;
-    size_t r2Offset;
-} ma_biquad_heap_layout;
-
-static ma_result ma_biquad_get_heap_layout(const ma_biquad_config* pConfig, ma_biquad_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* R0 */
-    pHeapLayout->r1Offset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += sizeof(ma_biquad_coefficient) * pConfig->channels;
-
-    /* R1 */
-    pHeapLayout->r2Offset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += sizeof(ma_biquad_coefficient) * pConfig->channels;
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_biquad_get_heap_size(const ma_biquad_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_biquad_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_biquad_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_biquad_init_preallocated(const ma_biquad_config* pConfig, void* pHeap, ma_biquad* pBQ)
-{
-    ma_result result;
-    ma_biquad_heap_layout heapLayout;
-
-    if (pBQ == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pBQ);
-
-    result = ma_biquad_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pBQ->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pBQ->pR1 = (ma_biquad_coefficient*)ma_offset_ptr(pHeap, heapLayout.r1Offset);
-    pBQ->pR2 = (ma_biquad_coefficient*)ma_offset_ptr(pHeap, heapLayout.r2Offset);
-
-    return ma_biquad_reinit(pConfig, pBQ);
-}
-
-MA_API ma_result ma_biquad_init(const ma_biquad_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_biquad* pBQ)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_biquad_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_biquad_init_preallocated(pConfig, pHeap, pBQ);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pBQ->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_biquad_uninit(ma_biquad* pBQ, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pBQ == NULL) {
-        return;
-    }
-
-    if (pBQ->_ownsHeap) {
-        ma_free(pBQ->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_biquad_reinit(const ma_biquad_config* pConfig, ma_biquad* pBQ)
-{
-    if (pBQ == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->a0 == 0) {
-        return MA_INVALID_ARGS; /* Division by zero. */
-    }
-
-    /* Only supporting f32 and s16. */
-    if (pConfig->format != ma_format_f32 && pConfig->format != ma_format_s16) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The format cannot be changed after initialization. */
-    if (pBQ->format != ma_format_unknown && pBQ->format != pConfig->format) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* The channel count cannot be changed after initialization. */
-    if (pBQ->channels != 0 && pBQ->channels != pConfig->channels) {
-        return MA_INVALID_OPERATION;
-    }
-
-
-    pBQ->format   = pConfig->format;
-    pBQ->channels = pConfig->channels;
-
-    /* Normalize. */
-    if (pConfig->format == ma_format_f32) {
-        pBQ->b0.f32 = (float)(pConfig->b0 / pConfig->a0);
-        pBQ->b1.f32 = (float)(pConfig->b1 / pConfig->a0);
-        pBQ->b2.f32 = (float)(pConfig->b2 / pConfig->a0);
-        pBQ->a1.f32 = (float)(pConfig->a1 / pConfig->a0);
-        pBQ->a2.f32 = (float)(pConfig->a2 / pConfig->a0);
-    } else {
-        pBQ->b0.s32 = ma_biquad_float_to_fp(pConfig->b0 / pConfig->a0);
-        pBQ->b1.s32 = ma_biquad_float_to_fp(pConfig->b1 / pConfig->a0);
-        pBQ->b2.s32 = ma_biquad_float_to_fp(pConfig->b2 / pConfig->a0);
-        pBQ->a1.s32 = ma_biquad_float_to_fp(pConfig->a1 / pConfig->a0);
-        pBQ->a2.s32 = ma_biquad_float_to_fp(pConfig->a2 / pConfig->a0);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_biquad_clear_cache(ma_biquad* pBQ)
-{
-    if (pBQ == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pBQ->format == ma_format_f32) {
-        pBQ->pR1->f32 = 0;
-        pBQ->pR2->f32 = 0;
-    } else {
-        pBQ->pR1->s32 = 0;
-        pBQ->pR2->s32 = 0;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(ma_biquad* pBQ, float* pY, const float* pX)
-{
-    ma_uint32 c;
-    const ma_uint32 channels = pBQ->channels;
-    const float b0 = pBQ->b0.f32;
-    const float b1 = pBQ->b1.f32;
-    const float b2 = pBQ->b2.f32;
-    const float a1 = pBQ->a1.f32;
-    const float a2 = pBQ->a2.f32;
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        float r1 = pBQ->pR1[c].f32;
-        float r2 = pBQ->pR2[c].f32;
-        float x  = pX[c];
-        float y;
-
-        y  = b0*x        + r1;
-        r1 = b1*x - a1*y + r2;
-        r2 = b2*x - a2*y;
-
-        pY[c]           = y;
-        pBQ->pR1[c].f32 = r1;
-        pBQ->pR2[c].f32 = r2;
-    }
-}
-
-static MA_INLINE void ma_biquad_process_pcm_frame_f32(ma_biquad* pBQ, float* pY, const float* pX)
-{
-    ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(pBQ, pY, pX);
-}
-
-static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(ma_biquad* pBQ, ma_int16* pY, const ma_int16* pX)
-{
-    ma_uint32 c;
-    const ma_uint32 channels = pBQ->channels;
-    const ma_int32 b0 = pBQ->b0.s32;
-    const ma_int32 b1 = pBQ->b1.s32;
-    const ma_int32 b2 = pBQ->b2.s32;
-    const ma_int32 a1 = pBQ->a1.s32;
-    const ma_int32 a2 = pBQ->a2.s32;
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        ma_int32 r1 = pBQ->pR1[c].s32;
-        ma_int32 r2 = pBQ->pR2[c].s32;
-        ma_int32 x  = pX[c];
-        ma_int32 y;
-
-        y  = (b0*x        + r1) >> MA_BIQUAD_FIXED_POINT_SHIFT;
-        r1 = (b1*x - a1*y + r2);
-        r2 = (b2*x - a2*y);
-
-        pY[c]           = (ma_int16)ma_clamp(y, -32768, 32767);
-        pBQ->pR1[c].s32 = r1;
-        pBQ->pR2[c].s32 = r2;
-    }
-}
-
-static MA_INLINE void ma_biquad_process_pcm_frame_s16(ma_biquad* pBQ, ma_int16* pY, const ma_int16* pX)
-{
-    ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(pBQ, pY, pX);
-}
-
-MA_API ma_result ma_biquad_process_pcm_frames(ma_biquad* pBQ, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_uint32 n;
-
-    if (pBQ == NULL || pFramesOut == NULL || pFramesIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Note that the logic below needs to support in-place filtering. That is, it must support the case where pFramesOut and pFramesIn are the same. */
-
-    if (pBQ->format == ma_format_f32) {
-        /* */ float* pY = (      float*)pFramesOut;
-        const float* pX = (const float*)pFramesIn;
-
-        for (n = 0; n < frameCount; n += 1) {
-            ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(pBQ, pY, pX);
-            pY += pBQ->channels;
-            pX += pBQ->channels;
-        }
-    } else if (pBQ->format == ma_format_s16) {
-        /* */ ma_int16* pY = (      ma_int16*)pFramesOut;
-        const ma_int16* pX = (const ma_int16*)pFramesIn;
-
-        for (n = 0; n < frameCount; n += 1) {
-            ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(pBQ, pY, pX);
-            pY += pBQ->channels;
-            pX += pBQ->channels;
-        }
-    } else {
-        MA_ASSERT(MA_FALSE);
-        return MA_INVALID_ARGS; /* Format not supported. Should never hit this because it's checked in ma_biquad_init() and ma_biquad_reinit(). */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint32 ma_biquad_get_latency(const ma_biquad* pBQ)
-{
-    if (pBQ == NULL) {
-        return 0;
-    }
-
-    return 2;
-}
-
-
-/**************************************************************************************************************************************************************
-
-Low-Pass Filter
-
-**************************************************************************************************************************************************************/
-MA_API ma_lpf1_config ma_lpf1_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency)
-{
-    ma_lpf1_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format = format;
-    config.channels = channels;
-    config.sampleRate = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-    config.q = 0.5;
-
-    return config;
-}
-
-MA_API ma_lpf2_config ma_lpf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, double q)
-{
-    ma_lpf2_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format = format;
-    config.channels = channels;
-    config.sampleRate = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-    config.q = q;
-
-    /* Q cannot be 0 or else it'll result in a division by 0. In this case just default to 0.707107. */
-    if (config.q == 0) {
-        config.q = 0.707107;
-    }
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t r1Offset;
-} ma_lpf1_heap_layout;
-
-static ma_result ma_lpf1_get_heap_layout(const ma_lpf1_config* pConfig, ma_lpf1_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* R1 */
-    pHeapLayout->r1Offset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += sizeof(ma_biquad_coefficient) * pConfig->channels;
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_lpf1_get_heap_size(const ma_lpf1_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_lpf1_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_lpf1_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_lpf1_init_preallocated(const ma_lpf1_config* pConfig, void* pHeap, ma_lpf1* pLPF)
-{
-    ma_result result;
-    ma_lpf1_heap_layout heapLayout;
-
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pLPF);
-
-    result = ma_lpf1_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pLPF->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pLPF->pR1 = (ma_biquad_coefficient*)ma_offset_ptr(pHeap, heapLayout.r1Offset);
-
-    return ma_lpf1_reinit(pConfig, pLPF);
-}
-
-MA_API ma_result ma_lpf1_init(const ma_lpf1_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf1* pLPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_lpf1_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_lpf1_init_preallocated(pConfig, pHeap, pLPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pLPF->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_lpf1_uninit(ma_lpf1* pLPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pLPF == NULL) {
-        return;
-    }
-
-    if (pLPF->_ownsHeap) {
-        ma_free(pLPF->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_lpf1_reinit(const ma_lpf1_config* pConfig, ma_lpf1* pLPF)
-{
-    double a;
-
-    if (pLPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Only supporting f32 and s16. */
-    if (pConfig->format != ma_format_f32 && pConfig->format != ma_format_s16) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The format cannot be changed after initialization. */
-    if (pLPF->format != ma_format_unknown && pLPF->format != pConfig->format) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* The channel count cannot be changed after initialization. */
-    if (pLPF->channels != 0 && pLPF->channels != pConfig->channels) {
-        return MA_INVALID_OPERATION;
-    }
-
-    pLPF->format   = pConfig->format;
-    pLPF->channels = pConfig->channels;
-
-    a = ma_expd(-2 * MA_PI_D * pConfig->cutoffFrequency / pConfig->sampleRate);
-    if (pConfig->format == ma_format_f32) {
-        pLPF->a.f32 = (float)a;
-    } else {
-        pLPF->a.s32 = ma_biquad_float_to_fp(a);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
-{
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pLPF->format == ma_format_f32) {
-        pLPF->a.f32 = 0;
-    } else {
-        pLPF->a.s32 = 0;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
-{
-    ma_uint32 c;
-    const ma_uint32 channels = pLPF->channels;
-    const float a = pLPF->a.f32;
-    const float b = 1 - a;
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        float r1 = pLPF->pR1[c].f32;
-        float x  = pX[c];
-        float y;
-
-        y = b*x + a*r1;
-
-        pY[c]           = y;
-        pLPF->pR1[c].f32 = y;
-    }
-}
-
-static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY, const ma_int16* pX)
-{
-    ma_uint32 c;
-    const ma_uint32 channels = pLPF->channels;
-    const ma_int32 a = pLPF->a.s32;
-    const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        ma_int32 r1 = pLPF->pR1[c].s32;
-        ma_int32 x  = pX[c];
-        ma_int32 y;
-
-        y = (b*x + a*r1) >> MA_BIQUAD_FIXED_POINT_SHIFT;
-
-        pY[c]            = (ma_int16)y;
-        pLPF->pR1[c].s32 = (ma_int32)y;
-    }
-}
-
-MA_API ma_result ma_lpf1_process_pcm_frames(ma_lpf1* pLPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_uint32 n;
-
-    if (pLPF == NULL || pFramesOut == NULL || pFramesIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Note that the logic below needs to support in-place filtering. That is, it must support the case where pFramesOut and pFramesIn are the same. */
-
-    if (pLPF->format == ma_format_f32) {
-        /* */ float* pY = (      float*)pFramesOut;
-        const float* pX = (const float*)pFramesIn;
-
-        for (n = 0; n < frameCount; n += 1) {
-            ma_lpf1_process_pcm_frame_f32(pLPF, pY, pX);
-            pY += pLPF->channels;
-            pX += pLPF->channels;
-        }
-    } else if (pLPF->format == ma_format_s16) {
-        /* */ ma_int16* pY = (      ma_int16*)pFramesOut;
-        const ma_int16* pX = (const ma_int16*)pFramesIn;
-
-        for (n = 0; n < frameCount; n += 1) {
-            ma_lpf1_process_pcm_frame_s16(pLPF, pY, pX);
-            pY += pLPF->channels;
-            pX += pLPF->channels;
-        }
-    } else {
-        MA_ASSERT(MA_FALSE);
-        return MA_INVALID_ARGS; /* Format not supported. Should never hit this because it's checked in ma_biquad_init() and ma_biquad_reinit(). */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint32 ma_lpf1_get_latency(const ma_lpf1* pLPF)
-{
-    if (pLPF == NULL) {
-        return 0;
-    }
-
-    return 1;
-}
-
-
-static MA_INLINE ma_biquad_config ma_lpf2__get_biquad_config(const ma_lpf2_config* pConfig)
-{
-    ma_biquad_config bqConfig;
-    double q;
-    double w;
-    double s;
-    double c;
-    double a;
-
-    MA_ASSERT(pConfig != NULL);
-
-    q = pConfig->q;
-    w = 2 * MA_PI_D * pConfig->cutoffFrequency / pConfig->sampleRate;
-    s = ma_sind(w);
-    c = ma_cosd(w);
-    a = s / (2*q);
-
-    bqConfig.b0 = (1 - c) / 2;
-    bqConfig.b1 =  1 - c;
-    bqConfig.b2 = (1 - c) / 2;
-    bqConfig.a0 =  1 + a;
-    bqConfig.a1 = -2 * c;
-    bqConfig.a2 =  1 - a;
-
-    bqConfig.format   = pConfig->format;
-    bqConfig.channels = pConfig->channels;
-
-    return bqConfig;
-}
-
-MA_API ma_result ma_lpf2_get_heap_size(const ma_lpf2_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_biquad_config bqConfig;
-    bqConfig = ma_lpf2__get_biquad_config(pConfig);
-
-    return ma_biquad_get_heap_size(&bqConfig, pHeapSizeInBytes);
-}
-
-MA_API ma_result ma_lpf2_init_preallocated(const ma_lpf2_config* pConfig, void* pHeap, ma_lpf2* pLPF)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pLPF);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_lpf2__get_biquad_config(pConfig);
-    result = ma_biquad_init_preallocated(&bqConfig, pHeap, &pLPF->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_lpf2_init(const ma_lpf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf2* pLPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_lpf2_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_lpf2_init_preallocated(pConfig, pHeap, pLPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pLPF->bq._ownsHeap = MA_TRUE;    /* <-- This will cause the biquad to take ownership of the heap and free it when it's uninitialized. */
-    return MA_SUCCESS;
-}
-
-MA_API void ma_lpf2_uninit(ma_lpf2* pLPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pLPF == NULL) {
-        return;
-    }
-
-    ma_biquad_uninit(&pLPF->bq, pAllocationCallbacks);   /* <-- This will free the heap allocation. */
-}
-
-MA_API ma_result ma_lpf2_reinit(const ma_lpf2_config* pConfig, ma_lpf2* pLPF)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pLPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_lpf2__get_biquad_config(pConfig);
-    result = ma_biquad_reinit(&bqConfig, &pLPF->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_lpf2_clear_cache(ma_lpf2* pLPF)
-{
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_biquad_clear_cache(&pLPF->bq);
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_lpf2_process_pcm_frame_s16(ma_lpf2* pLPF, ma_int16* pFrameOut, const ma_int16* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_s16(&pLPF->bq, pFrameOut, pFrameIn);
-}
-
-static MA_INLINE void ma_lpf2_process_pcm_frame_f32(ma_lpf2* pLPF, float* pFrameOut, const float* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_f32(&pLPF->bq, pFrameOut, pFrameIn);
-}
-
-MA_API ma_result ma_lpf2_process_pcm_frames(ma_lpf2* pLPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_biquad_process_pcm_frames(&pLPF->bq, pFramesOut, pFramesIn, frameCount);
-}
-
-MA_API ma_uint32 ma_lpf2_get_latency(const ma_lpf2* pLPF)
-{
-    if (pLPF == NULL) {
-        return 0;
-    }
-
-    return ma_biquad_get_latency(&pLPF->bq);
-}
-
-
-MA_API ma_lpf_config ma_lpf_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order)
-{
-    ma_lpf_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format          = format;
-    config.channels        = channels;
-    config.sampleRate      = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-    config.order           = ma_min(order, MA_MAX_FILTER_ORDER);
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t lpf1Offset;
-    size_t lpf2Offset;  /* Offset of the first second order filter. Subsequent filters will come straight after, and will each have the same heap size. */
-} ma_lpf_heap_layout;
-
-static void ma_lpf_calculate_sub_lpf_counts(ma_uint32 order, ma_uint32* pLPF1Count, ma_uint32* pLPF2Count)
-{
-    MA_ASSERT(pLPF1Count != NULL);
-    MA_ASSERT(pLPF2Count != NULL);
-
-    *pLPF1Count = order % 2;
-    *pLPF2Count = order / 2;
-}
-
-static ma_result ma_lpf_get_heap_layout(const ma_lpf_config* pConfig, ma_lpf_heap_layout* pHeapLayout)
-{
-    ma_result result;
-    ma_uint32 lpf1Count;
-    ma_uint32 lpf2Count;
-    ma_uint32 ilpf1;
-    ma_uint32 ilpf2;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->order > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_lpf_calculate_sub_lpf_counts(pConfig->order, &lpf1Count, &lpf2Count);
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* LPF 1 */
-    pHeapLayout->lpf1Offset = pHeapLayout->sizeInBytes;
-    for (ilpf1 = 0; ilpf1 < lpf1Count; ilpf1 += 1) {
-        size_t lpf1HeapSizeInBytes;
-        ma_lpf1_config lpf1Config = ma_lpf1_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency);
-
-        result = ma_lpf1_get_heap_size(&lpf1Config, &lpf1HeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += sizeof(ma_lpf1) + lpf1HeapSizeInBytes;
-    }
-
-    /* LPF 2*/
-    pHeapLayout->lpf2Offset = pHeapLayout->sizeInBytes;
-    for (ilpf2 = 0; ilpf2 < lpf2Count; ilpf2 += 1) {
-        size_t lpf2HeapSizeInBytes;
-        ma_lpf2_config lpf2Config = ma_lpf2_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency, 0.707107);   /* <-- The "q" parameter does not matter for the purpose of calculating the heap size. */
-
-        result = ma_lpf2_get_heap_size(&lpf2Config, &lpf2HeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += sizeof(ma_lpf2) + lpf2HeapSizeInBytes;
-    }
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_lpf_reinit__internal(const ma_lpf_config* pConfig, void* pHeap, ma_lpf* pLPF, ma_bool32 isNew)
-{
-    ma_result result;
-    ma_uint32 lpf1Count;
-    ma_uint32 lpf2Count;
-    ma_uint32 ilpf1;
-    ma_uint32 ilpf2;
-    ma_lpf_heap_layout heapLayout;  /* Only used if isNew is true. */
-
-    if (pLPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Only supporting f32 and s16. */
-    if (pConfig->format != ma_format_f32 && pConfig->format != ma_format_s16) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The format cannot be changed after initialization. */
-    if (pLPF->format != ma_format_unknown && pLPF->format != pConfig->format) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* The channel count cannot be changed after initialization. */
-    if (pLPF->channels != 0 && pLPF->channels != pConfig->channels) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pConfig->order > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_lpf_calculate_sub_lpf_counts(pConfig->order, &lpf1Count, &lpf2Count);
-
-    /* The filter order can't change between reinits. */
-    if (!isNew) {
-        if (pLPF->lpf1Count != lpf1Count || pLPF->lpf2Count != lpf2Count) {
-            return MA_INVALID_OPERATION;
-        }
-    }
-
-    if (isNew) {
-        result = ma_lpf_get_heap_layout(pConfig, &heapLayout);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pLPF->_pHeap = pHeap;
-        MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-        pLPF->pLPF1 = (ma_lpf1*)ma_offset_ptr(pHeap, heapLayout.lpf1Offset);
-        pLPF->pLPF2 = (ma_lpf2*)ma_offset_ptr(pHeap, heapLayout.lpf2Offset);
-    } else {
-        MA_ZERO_OBJECT(&heapLayout);    /* To silence a compiler warning. */
-    }
-
-    for (ilpf1 = 0; ilpf1 < lpf1Count; ilpf1 += 1) {
-        ma_lpf1_config lpf1Config = ma_lpf1_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency);
-
-        if (isNew) {
-            size_t lpf1HeapSizeInBytes;
-
-            result = ma_lpf1_get_heap_size(&lpf1Config, &lpf1HeapSizeInBytes);
-            if (result == MA_SUCCESS) {
-                result = ma_lpf1_init_preallocated(&lpf1Config, ma_offset_ptr(pHeap, heapLayout.lpf1Offset + (sizeof(ma_lpf1) * lpf1Count) + (ilpf1 * lpf1HeapSizeInBytes)), &pLPF->pLPF1[ilpf1]);
-            }
-        } else {
-            result = ma_lpf1_reinit(&lpf1Config, &pLPF->pLPF1[ilpf1]);
-        }
-
-        if (result != MA_SUCCESS) {
-            ma_uint32 jlpf1;
-
-            for (jlpf1 = 0; jlpf1 < ilpf1; jlpf1 += 1) {
-                ma_lpf1_uninit(&pLPF->pLPF1[jlpf1], NULL);  /* No need for allocation callbacks here since we used a preallocated heap allocation. */
-            }
-
-            return result;
-        }
-    }
-
-    for (ilpf2 = 0; ilpf2 < lpf2Count; ilpf2 += 1) {
-        ma_lpf2_config lpf2Config;
-        double q;
-        double a;
-
-        /* Tempting to use 0.707107, but won't result in a Butterworth filter if the order is > 2. */
-        if (lpf1Count == 1) {
-            a = (1 + ilpf2*1) * (MA_PI_D/(pConfig->order*1));   /* Odd order. */
-        } else {
-            a = (1 + ilpf2*2) * (MA_PI_D/(pConfig->order*2));   /* Even order. */
-        }
-        q = 1 / (2*ma_cosd(a));
-
-        lpf2Config = ma_lpf2_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency, q);
-
-        if (isNew) {
-            size_t lpf2HeapSizeInBytes;
-
-            result = ma_lpf2_get_heap_size(&lpf2Config, &lpf2HeapSizeInBytes);
-            if (result == MA_SUCCESS) {
-                result = ma_lpf2_init_preallocated(&lpf2Config, ma_offset_ptr(pHeap, heapLayout.lpf2Offset + (sizeof(ma_lpf2) * lpf2Count) + (ilpf2 * lpf2HeapSizeInBytes)), &pLPF->pLPF2[ilpf2]);
-            }
-        } else {
-            result = ma_lpf2_reinit(&lpf2Config, &pLPF->pLPF2[ilpf2]);
-        }
-
-        if (result != MA_SUCCESS) {
-            ma_uint32 jlpf1;
-            ma_uint32 jlpf2;
-
-            for (jlpf1 = 0; jlpf1 < lpf1Count; jlpf1 += 1) {
-                ma_lpf1_uninit(&pLPF->pLPF1[jlpf1], NULL);  /* No need for allocation callbacks here since we used a preallocated heap allocation. */
-            }
-
-            for (jlpf2 = 0; jlpf2 < ilpf2; jlpf2 += 1) {
-                ma_lpf2_uninit(&pLPF->pLPF2[jlpf2], NULL);  /* No need for allocation callbacks here since we used a preallocated heap allocation. */
-            }
-
-            return result;
-        }
-    }
-
-    pLPF->lpf1Count  = lpf1Count;
-    pLPF->lpf2Count  = lpf2Count;
-    pLPF->format     = pConfig->format;
-    pLPF->channels   = pConfig->channels;
-    pLPF->sampleRate = pConfig->sampleRate;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_lpf_get_heap_size(const ma_lpf_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_lpf_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_lpf_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return result;
-}
-
-MA_API ma_result ma_lpf_init_preallocated(const ma_lpf_config* pConfig, void* pHeap, ma_lpf* pLPF)
-{
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pLPF);
-
-    return ma_lpf_reinit__internal(pConfig, pHeap, pLPF, /*isNew*/MA_TRUE);
-}
-
-MA_API ma_result ma_lpf_init(const ma_lpf_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf* pLPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_lpf_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_lpf_init_preallocated(pConfig, pHeap, pLPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pLPF->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_lpf_uninit(ma_lpf* pLPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_uint32 ilpf1;
-    ma_uint32 ilpf2;
-
-    if (pLPF == NULL) {
-        return;
-    }
-
-    for (ilpf1 = 0; ilpf1 < pLPF->lpf1Count; ilpf1 += 1) {
-        ma_lpf1_uninit(&pLPF->pLPF1[ilpf1], pAllocationCallbacks);
-    }
-
-    for (ilpf2 = 0; ilpf2 < pLPF->lpf2Count; ilpf2 += 1) {
-        ma_lpf2_uninit(&pLPF->pLPF2[ilpf2], pAllocationCallbacks);
-    }
-
-    if (pLPF->_ownsHeap) {
-        ma_free(pLPF->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_lpf_reinit(const ma_lpf_config* pConfig, ma_lpf* pLPF)
-{
-    return ma_lpf_reinit__internal(pConfig, NULL, pLPF, /*isNew*/MA_FALSE);
-}
-
-MA_API ma_result ma_lpf_clear_cache(ma_lpf* pLPF)
-{
-    ma_uint32 ilpf1;
-    ma_uint32 ilpf2;
-
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (ilpf1 = 0; ilpf1 < pLPF->lpf1Count; ilpf1 += 1) {
-        ma_lpf1_clear_cache(&pLPF->pLPF1[ilpf1]);
-    }
-
-    for (ilpf2 = 0; ilpf2 < pLPF->lpf2Count; ilpf2 += 1) {
-        ma_lpf2_clear_cache(&pLPF->pLPF2[ilpf2]);
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_lpf_process_pcm_frame_f32(ma_lpf* pLPF, float* pY, const void* pX)
-{
-    ma_uint32 ilpf1;
-    ma_uint32 ilpf2;
-
-    MA_ASSERT(pLPF->format == ma_format_f32);
-
-    MA_MOVE_MEMORY(pY, pX, ma_get_bytes_per_frame(pLPF->format, pLPF->channels));
-
-    for (ilpf1 = 0; ilpf1 < pLPF->lpf1Count; ilpf1 += 1) {
-        ma_lpf1_process_pcm_frame_f32(&pLPF->pLPF1[ilpf1], pY, pY);
-    }
-
-    for (ilpf2 = 0; ilpf2 < pLPF->lpf2Count; ilpf2 += 1) {
-        ma_lpf2_process_pcm_frame_f32(&pLPF->pLPF2[ilpf2], pY, pY);
-    }
-}
-
-static MA_INLINE void ma_lpf_process_pcm_frame_s16(ma_lpf* pLPF, ma_int16* pY, const ma_int16* pX)
-{
-    ma_uint32 ilpf1;
-    ma_uint32 ilpf2;
-
-    MA_ASSERT(pLPF->format == ma_format_s16);
-
-    MA_MOVE_MEMORY(pY, pX, ma_get_bytes_per_frame(pLPF->format, pLPF->channels));
-
-    for (ilpf1 = 0; ilpf1 < pLPF->lpf1Count; ilpf1 += 1) {
-        ma_lpf1_process_pcm_frame_s16(&pLPF->pLPF1[ilpf1], pY, pY);
-    }
-
-    for (ilpf2 = 0; ilpf2 < pLPF->lpf2Count; ilpf2 += 1) {
-        ma_lpf2_process_pcm_frame_s16(&pLPF->pLPF2[ilpf2], pY, pY);
-    }
-}
-
-MA_API ma_result ma_lpf_process_pcm_frames(ma_lpf* pLPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_result result;
-    ma_uint32 ilpf1;
-    ma_uint32 ilpf2;
-
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Faster path for in-place. */
-    if (pFramesOut == pFramesIn) {
-        for (ilpf1 = 0; ilpf1 < pLPF->lpf1Count; ilpf1 += 1) {
-            result = ma_lpf1_process_pcm_frames(&pLPF->pLPF1[ilpf1], pFramesOut, pFramesOut, frameCount);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-        }
-
-        for (ilpf2 = 0; ilpf2 < pLPF->lpf2Count; ilpf2 += 1) {
-            result = ma_lpf2_process_pcm_frames(&pLPF->pLPF2[ilpf2], pFramesOut, pFramesOut, frameCount);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-        }
-    }
-
-    /* Slightly slower path for copying. */
-    if (pFramesOut != pFramesIn) {
-        ma_uint32 iFrame;
-
-        /*  */ if (pLPF->format == ma_format_f32) {
-            /* */ float* pFramesOutF32 = (      float*)pFramesOut;
-            const float* pFramesInF32  = (const float*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                ma_lpf_process_pcm_frame_f32(pLPF, pFramesOutF32, pFramesInF32);
-                pFramesOutF32 += pLPF->channels;
-                pFramesInF32  += pLPF->channels;
-            }
-        } else if (pLPF->format == ma_format_s16) {
-            /* */ ma_int16* pFramesOutS16 = (      ma_int16*)pFramesOut;
-            const ma_int16* pFramesInS16  = (const ma_int16*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                ma_lpf_process_pcm_frame_s16(pLPF, pFramesOutS16, pFramesInS16);
-                pFramesOutS16 += pLPF->channels;
-                pFramesInS16  += pLPF->channels;
-            }
-        } else {
-            MA_ASSERT(MA_FALSE);
-            return MA_INVALID_OPERATION;    /* Should never hit this. */
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint32 ma_lpf_get_latency(const ma_lpf* pLPF)
-{
-    if (pLPF == NULL) {
-        return 0;
-    }
-
-    return pLPF->lpf2Count*2 + pLPF->lpf1Count;
-}
-
-
-/**************************************************************************************************************************************************************
-
-High-Pass Filtering
-
-**************************************************************************************************************************************************************/
-MA_API ma_hpf1_config ma_hpf1_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency)
-{
-    ma_hpf1_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format = format;
-    config.channels = channels;
-    config.sampleRate = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-
-    return config;
-}
-
-MA_API ma_hpf2_config ma_hpf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, double q)
-{
-    ma_hpf2_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format = format;
-    config.channels = channels;
-    config.sampleRate = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-    config.q = q;
-
-    /* Q cannot be 0 or else it'll result in a division by 0. In this case just default to 0.707107. */
-    if (config.q == 0) {
-        config.q = 0.707107;
-    }
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t r1Offset;
-} ma_hpf1_heap_layout;
-
-static ma_result ma_hpf1_get_heap_layout(const ma_hpf1_config* pConfig, ma_hpf1_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* R1 */
-    pHeapLayout->r1Offset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += sizeof(ma_biquad_coefficient) * pConfig->channels;
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_hpf1_get_heap_size(const ma_hpf1_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_hpf1_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_hpf1_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_hpf1_init_preallocated(const ma_hpf1_config* pConfig, void* pHeap, ma_hpf1* pLPF)
-{
-    ma_result result;
-    ma_hpf1_heap_layout heapLayout;
-
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pLPF);
-
-    result = ma_hpf1_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pLPF->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pLPF->pR1 = (ma_biquad_coefficient*)ma_offset_ptr(pHeap, heapLayout.r1Offset);
-
-    return ma_hpf1_reinit(pConfig, pLPF);
-}
-
-MA_API ma_result ma_hpf1_init(const ma_hpf1_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf1* pLPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_hpf1_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_hpf1_init_preallocated(pConfig, pHeap, pLPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pLPF->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_hpf1_uninit(ma_hpf1* pHPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pHPF == NULL) {
-        return;
-    }
-
-    if (pHPF->_ownsHeap) {
-        ma_free(pHPF->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_hpf1_reinit(const ma_hpf1_config* pConfig, ma_hpf1* pHPF)
-{
-    double a;
-
-    if (pHPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Only supporting f32 and s16. */
-    if (pConfig->format != ma_format_f32 && pConfig->format != ma_format_s16) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The format cannot be changed after initialization. */
-    if (pHPF->format != ma_format_unknown && pHPF->format != pConfig->format) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* The channel count cannot be changed after initialization. */
-    if (pHPF->channels != 0 && pHPF->channels != pConfig->channels) {
-        return MA_INVALID_OPERATION;
-    }
-
-    pHPF->format   = pConfig->format;
-    pHPF->channels = pConfig->channels;
-
-    a = ma_expd(-2 * MA_PI_D * pConfig->cutoffFrequency / pConfig->sampleRate);
-    if (pConfig->format == ma_format_f32) {
-        pHPF->a.f32 = (float)a;
-    } else {
-        pHPF->a.s32 = ma_biquad_float_to_fp(a);
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, const float* pX)
-{
-    ma_uint32 c;
-    const ma_uint32 channels = pHPF->channels;
-    const float a = 1 - pHPF->a.f32;
-    const float b = 1 - a;
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        float r1 = pHPF->pR1[c].f32;
-        float x  = pX[c];
-        float y;
-
-        y = b*x - a*r1;
-
-        pY[c]            = y;
-        pHPF->pR1[c].f32 = y;
-    }
-}
-
-static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY, const ma_int16* pX)
-{
-    ma_uint32 c;
-    const ma_uint32 channels = pHPF->channels;
-    const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
-    const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        ma_int32 r1 = pHPF->pR1[c].s32;
-        ma_int32 x  = pX[c];
-        ma_int32 y;
-
-        y = (b*x - a*r1) >> MA_BIQUAD_FIXED_POINT_SHIFT;
-
-        pY[c]            = (ma_int16)y;
-        pHPF->pR1[c].s32 = (ma_int32)y;
-    }
-}
-
-MA_API ma_result ma_hpf1_process_pcm_frames(ma_hpf1* pHPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_uint32 n;
-
-    if (pHPF == NULL || pFramesOut == NULL || pFramesIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Note that the logic below needs to support in-place filtering. That is, it must support the case where pFramesOut and pFramesIn are the same. */
-
-    if (pHPF->format == ma_format_f32) {
-        /* */ float* pY = (      float*)pFramesOut;
-        const float* pX = (const float*)pFramesIn;
-
-        for (n = 0; n < frameCount; n += 1) {
-            ma_hpf1_process_pcm_frame_f32(pHPF, pY, pX);
-            pY += pHPF->channels;
-            pX += pHPF->channels;
-        }
-    } else if (pHPF->format == ma_format_s16) {
-        /* */ ma_int16* pY = (      ma_int16*)pFramesOut;
-        const ma_int16* pX = (const ma_int16*)pFramesIn;
-
-        for (n = 0; n < frameCount; n += 1) {
-            ma_hpf1_process_pcm_frame_s16(pHPF, pY, pX);
-            pY += pHPF->channels;
-            pX += pHPF->channels;
-        }
-    } else {
-        MA_ASSERT(MA_FALSE);
-        return MA_INVALID_ARGS; /* Format not supported. Should never hit this because it's checked in ma_biquad_init() and ma_biquad_reinit(). */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint32 ma_hpf1_get_latency(const ma_hpf1* pHPF)
-{
-    if (pHPF == NULL) {
-        return 0;
-    }
-
-    return 1;
-}
-
-
-static MA_INLINE ma_biquad_config ma_hpf2__get_biquad_config(const ma_hpf2_config* pConfig)
-{
-    ma_biquad_config bqConfig;
-    double q;
-    double w;
-    double s;
-    double c;
-    double a;
-
-    MA_ASSERT(pConfig != NULL);
-
-    q = pConfig->q;
-    w = 2 * MA_PI_D * pConfig->cutoffFrequency / pConfig->sampleRate;
-    s = ma_sind(w);
-    c = ma_cosd(w);
-    a = s / (2*q);
-
-    bqConfig.b0 =  (1 + c) / 2;
-    bqConfig.b1 = -(1 + c);
-    bqConfig.b2 =  (1 + c) / 2;
-    bqConfig.a0 =   1 + a;
-    bqConfig.a1 =  -2 * c;
-    bqConfig.a2 =   1 - a;
-
-    bqConfig.format   = pConfig->format;
-    bqConfig.channels = pConfig->channels;
-
-    return bqConfig;
-}
-
-MA_API ma_result ma_hpf2_get_heap_size(const ma_hpf2_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_biquad_config bqConfig;
-    bqConfig = ma_hpf2__get_biquad_config(pConfig);
-
-    return ma_biquad_get_heap_size(&bqConfig, pHeapSizeInBytes);
-}
-
-MA_API ma_result ma_hpf2_init_preallocated(const ma_hpf2_config* pConfig, void* pHeap, ma_hpf2* pHPF)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pHPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pHPF);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_hpf2__get_biquad_config(pConfig);
-    result = ma_biquad_init_preallocated(&bqConfig, pHeap, &pHPF->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_hpf2_init(const ma_hpf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf2* pHPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_hpf2_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_hpf2_init_preallocated(pConfig, pHeap, pHPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pHPF->bq._ownsHeap = MA_TRUE;    /* <-- This will cause the biquad to take ownership of the heap and free it when it's uninitialized. */
-    return MA_SUCCESS;
-}
-
-MA_API void ma_hpf2_uninit(ma_hpf2* pHPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pHPF == NULL) {
-        return;
-    }
-
-    ma_biquad_uninit(&pHPF->bq, pAllocationCallbacks);   /* <-- This will free the heap allocation. */
-}
-
-MA_API ma_result ma_hpf2_reinit(const ma_hpf2_config* pConfig, ma_hpf2* pHPF)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pHPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_hpf2__get_biquad_config(pConfig);
-    result = ma_biquad_reinit(&bqConfig, &pHPF->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_hpf2_process_pcm_frame_s16(ma_hpf2* pHPF, ma_int16* pFrameOut, const ma_int16* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_s16(&pHPF->bq, pFrameOut, pFrameIn);
-}
-
-static MA_INLINE void ma_hpf2_process_pcm_frame_f32(ma_hpf2* pHPF, float* pFrameOut, const float* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_f32(&pHPF->bq, pFrameOut, pFrameIn);
-}
-
-MA_API ma_result ma_hpf2_process_pcm_frames(ma_hpf2* pHPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pHPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_biquad_process_pcm_frames(&pHPF->bq, pFramesOut, pFramesIn, frameCount);
-}
-
-MA_API ma_uint32 ma_hpf2_get_latency(const ma_hpf2* pHPF)
-{
-    if (pHPF == NULL) {
-        return 0;
-    }
-
-    return ma_biquad_get_latency(&pHPF->bq);
-}
-
-
-MA_API ma_hpf_config ma_hpf_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order)
-{
-    ma_hpf_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format          = format;
-    config.channels        = channels;
-    config.sampleRate      = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-    config.order           = ma_min(order, MA_MAX_FILTER_ORDER);
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t hpf1Offset;
-    size_t hpf2Offset;  /* Offset of the first second order filter. Subsequent filters will come straight after, and will each have the same heap size. */
-} ma_hpf_heap_layout;
-
-static void ma_hpf_calculate_sub_hpf_counts(ma_uint32 order, ma_uint32* pHPF1Count, ma_uint32* pHPF2Count)
-{
-    MA_ASSERT(pHPF1Count != NULL);
-    MA_ASSERT(pHPF2Count != NULL);
-
-    *pHPF1Count = order % 2;
-    *pHPF2Count = order / 2;
-}
-
-static ma_result ma_hpf_get_heap_layout(const ma_hpf_config* pConfig, ma_hpf_heap_layout* pHeapLayout)
-{
-    ma_result result;
-    ma_uint32 hpf1Count;
-    ma_uint32 hpf2Count;
-    ma_uint32 ihpf1;
-    ma_uint32 ihpf2;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->order > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_hpf_calculate_sub_hpf_counts(pConfig->order, &hpf1Count, &hpf2Count);
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* HPF 1 */
-    pHeapLayout->hpf1Offset = pHeapLayout->sizeInBytes;
-    for (ihpf1 = 0; ihpf1 < hpf1Count; ihpf1 += 1) {
-        size_t hpf1HeapSizeInBytes;
-        ma_hpf1_config hpf1Config = ma_hpf1_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency);
-
-        result = ma_hpf1_get_heap_size(&hpf1Config, &hpf1HeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += sizeof(ma_hpf1) + hpf1HeapSizeInBytes;
-    }
-
-    /* HPF 2*/
-    pHeapLayout->hpf2Offset = pHeapLayout->sizeInBytes;
-    for (ihpf2 = 0; ihpf2 < hpf2Count; ihpf2 += 1) {
-        size_t hpf2HeapSizeInBytes;
-        ma_hpf2_config hpf2Config = ma_hpf2_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency, 0.707107);   /* <-- The "q" parameter does not matter for the purpose of calculating the heap size. */
-
-        result = ma_hpf2_get_heap_size(&hpf2Config, &hpf2HeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += sizeof(ma_hpf2) + hpf2HeapSizeInBytes;
-    }
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_hpf_reinit__internal(const ma_hpf_config* pConfig, void* pHeap, ma_hpf* pHPF, ma_bool32 isNew)
-{
-    ma_result result;
-    ma_uint32 hpf1Count;
-    ma_uint32 hpf2Count;
-    ma_uint32 ihpf1;
-    ma_uint32 ihpf2;
-    ma_hpf_heap_layout heapLayout;  /* Only used if isNew is true. */
-
-    if (pHPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Only supporting f32 and s16. */
-    if (pConfig->format != ma_format_f32 && pConfig->format != ma_format_s16) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The format cannot be changed after initialization. */
-    if (pHPF->format != ma_format_unknown && pHPF->format != pConfig->format) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* The channel count cannot be changed after initialization. */
-    if (pHPF->channels != 0 && pHPF->channels != pConfig->channels) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pConfig->order > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_hpf_calculate_sub_hpf_counts(pConfig->order, &hpf1Count, &hpf2Count);
-
-    /* The filter order can't change between reinits. */
-    if (!isNew) {
-        if (pHPF->hpf1Count != hpf1Count || pHPF->hpf2Count != hpf2Count) {
-            return MA_INVALID_OPERATION;
-        }
-    }
-
-    if (isNew) {
-        result = ma_hpf_get_heap_layout(pConfig, &heapLayout);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHPF->_pHeap = pHeap;
-        MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-        pHPF->pHPF1 = (ma_hpf1*)ma_offset_ptr(pHeap, heapLayout.hpf1Offset);
-        pHPF->pHPF2 = (ma_hpf2*)ma_offset_ptr(pHeap, heapLayout.hpf2Offset);
-    } else {
-        MA_ZERO_OBJECT(&heapLayout);    /* To silence a compiler warning. */
-    }
-
-    for (ihpf1 = 0; ihpf1 < hpf1Count; ihpf1 += 1) {
-        ma_hpf1_config hpf1Config = ma_hpf1_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency);
-
-        if (isNew) {
-            size_t hpf1HeapSizeInBytes;
-
-            result = ma_hpf1_get_heap_size(&hpf1Config, &hpf1HeapSizeInBytes);
-            if (result == MA_SUCCESS) {
-                result = ma_hpf1_init_preallocated(&hpf1Config, ma_offset_ptr(pHeap, heapLayout.hpf1Offset + (sizeof(ma_hpf1) * hpf1Count) + (ihpf1 * hpf1HeapSizeInBytes)), &pHPF->pHPF1[ihpf1]);
-            }
-        } else {
-            result = ma_hpf1_reinit(&hpf1Config, &pHPF->pHPF1[ihpf1]);
-        }
-
-        if (result != MA_SUCCESS) {
-            ma_uint32 jhpf1;
-
-            for (jhpf1 = 0; jhpf1 < ihpf1; jhpf1 += 1) {
-                ma_hpf1_uninit(&pHPF->pHPF1[jhpf1], NULL);  /* No need for allocation callbacks here since we used a preallocated heap allocation. */
-            }
-
-            return result;
-        }
-    }
-
-    for (ihpf2 = 0; ihpf2 < hpf2Count; ihpf2 += 1) {
-        ma_hpf2_config hpf2Config;
-        double q;
-        double a;
-
-        /* Tempting to use 0.707107, but won't result in a Butterworth filter if the order is > 2. */
-        if (hpf1Count == 1) {
-            a = (1 + ihpf2*1) * (MA_PI_D/(pConfig->order*1));   /* Odd order. */
-        } else {
-            a = (1 + ihpf2*2) * (MA_PI_D/(pConfig->order*2));   /* Even order. */
-        }
-        q = 1 / (2*ma_cosd(a));
-
-        hpf2Config = ma_hpf2_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency, q);
-
-        if (isNew) {
-            size_t hpf2HeapSizeInBytes;
-
-            result = ma_hpf2_get_heap_size(&hpf2Config, &hpf2HeapSizeInBytes);
-            if (result == MA_SUCCESS) {
-                result = ma_hpf2_init_preallocated(&hpf2Config, ma_offset_ptr(pHeap, heapLayout.hpf2Offset + (sizeof(ma_hpf2) * hpf2Count) + (ihpf2 * hpf2HeapSizeInBytes)), &pHPF->pHPF2[ihpf2]);
-            }
-        } else {
-            result = ma_hpf2_reinit(&hpf2Config, &pHPF->pHPF2[ihpf2]);
-        }
-
-        if (result != MA_SUCCESS) {
-            ma_uint32 jhpf1;
-            ma_uint32 jhpf2;
-
-            for (jhpf1 = 0; jhpf1 < hpf1Count; jhpf1 += 1) {
-                ma_hpf1_uninit(&pHPF->pHPF1[jhpf1], NULL);  /* No need for allocation callbacks here since we used a preallocated heap allocation. */
-            }
-
-            for (jhpf2 = 0; jhpf2 < ihpf2; jhpf2 += 1) {
-                ma_hpf2_uninit(&pHPF->pHPF2[jhpf2], NULL);  /* No need for allocation callbacks here since we used a preallocated heap allocation. */
-            }
-
-            return result;
-        }
-    }
-
-    pHPF->hpf1Count  = hpf1Count;
-    pHPF->hpf2Count  = hpf2Count;
-    pHPF->format     = pConfig->format;
-    pHPF->channels   = pConfig->channels;
-    pHPF->sampleRate = pConfig->sampleRate;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_hpf_get_heap_size(const ma_hpf_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_hpf_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_hpf_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return result;
-}
-
-MA_API ma_result ma_hpf_init_preallocated(const ma_hpf_config* pConfig, void* pHeap, ma_hpf* pLPF)
-{
-    if (pLPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pLPF);
-
-    return ma_hpf_reinit__internal(pConfig, pHeap, pLPF, /*isNew*/MA_TRUE);
-}
-
-MA_API ma_result ma_hpf_init(const ma_hpf_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf* pHPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_hpf_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_hpf_init_preallocated(pConfig, pHeap, pHPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pHPF->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_hpf_uninit(ma_hpf* pHPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_uint32 ihpf1;
-    ma_uint32 ihpf2;
-
-    if (pHPF == NULL) {
-        return;
-    }
-
-    for (ihpf1 = 0; ihpf1 < pHPF->hpf1Count; ihpf1 += 1) {
-        ma_hpf1_uninit(&pHPF->pHPF1[ihpf1], pAllocationCallbacks);
-    }
-
-    for (ihpf2 = 0; ihpf2 < pHPF->hpf2Count; ihpf2 += 1) {
-        ma_hpf2_uninit(&pHPF->pHPF2[ihpf2], pAllocationCallbacks);
-    }
-
-    if (pHPF->_ownsHeap) {
-        ma_free(pHPF->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_hpf_reinit(const ma_hpf_config* pConfig, ma_hpf* pHPF)
-{
-    return ma_hpf_reinit__internal(pConfig, NULL, pHPF, /*isNew*/MA_FALSE);
-}
-
-MA_API ma_result ma_hpf_process_pcm_frames(ma_hpf* pHPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_result result;
-    ma_uint32 ihpf1;
-    ma_uint32 ihpf2;
-
-    if (pHPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Faster path for in-place. */
-    if (pFramesOut == pFramesIn) {
-        for (ihpf1 = 0; ihpf1 < pHPF->hpf1Count; ihpf1 += 1) {
-            result = ma_hpf1_process_pcm_frames(&pHPF->pHPF1[ihpf1], pFramesOut, pFramesOut, frameCount);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-        }
-
-        for (ihpf2 = 0; ihpf2 < pHPF->hpf2Count; ihpf2 += 1) {
-            result = ma_hpf2_process_pcm_frames(&pHPF->pHPF2[ihpf2], pFramesOut, pFramesOut, frameCount);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-        }
-    }
-
-    /* Slightly slower path for copying. */
-    if (pFramesOut != pFramesIn) {
-        ma_uint32 iFrame;
-
-        /*  */ if (pHPF->format == ma_format_f32) {
-            /* */ float* pFramesOutF32 = (      float*)pFramesOut;
-            const float* pFramesInF32  = (const float*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                MA_COPY_MEMORY(pFramesOutF32, pFramesInF32, ma_get_bytes_per_frame(pHPF->format, pHPF->channels));
-
-                for (ihpf1 = 0; ihpf1 < pHPF->hpf1Count; ihpf1 += 1) {
-                    ma_hpf1_process_pcm_frame_f32(&pHPF->pHPF1[ihpf1], pFramesOutF32, pFramesOutF32);
-                }
-
-                for (ihpf2 = 0; ihpf2 < pHPF->hpf2Count; ihpf2 += 1) {
-                    ma_hpf2_process_pcm_frame_f32(&pHPF->pHPF2[ihpf2], pFramesOutF32, pFramesOutF32);
-                }
-
-                pFramesOutF32 += pHPF->channels;
-                pFramesInF32  += pHPF->channels;
-            }
-        } else if (pHPF->format == ma_format_s16) {
-            /* */ ma_int16* pFramesOutS16 = (      ma_int16*)pFramesOut;
-            const ma_int16* pFramesInS16  = (const ma_int16*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                MA_COPY_MEMORY(pFramesOutS16, pFramesInS16, ma_get_bytes_per_frame(pHPF->format, pHPF->channels));
-
-                for (ihpf1 = 0; ihpf1 < pHPF->hpf1Count; ihpf1 += 1) {
-                    ma_hpf1_process_pcm_frame_s16(&pHPF->pHPF1[ihpf1], pFramesOutS16, pFramesOutS16);
-                }
-
-                for (ihpf2 = 0; ihpf2 < pHPF->hpf2Count; ihpf2 += 1) {
-                    ma_hpf2_process_pcm_frame_s16(&pHPF->pHPF2[ihpf2], pFramesOutS16, pFramesOutS16);
-                }
-
-                pFramesOutS16 += pHPF->channels;
-                pFramesInS16  += pHPF->channels;
-            }
-        } else {
-            MA_ASSERT(MA_FALSE);
-            return MA_INVALID_OPERATION;    /* Should never hit this. */
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint32 ma_hpf_get_latency(const ma_hpf* pHPF)
-{
-    if (pHPF == NULL) {
-        return 0;
-    }
-
-    return pHPF->hpf2Count*2 + pHPF->hpf1Count;
-}
-
-
-/**************************************************************************************************************************************************************
-
-Band-Pass Filtering
-
-**************************************************************************************************************************************************************/
-MA_API ma_bpf2_config ma_bpf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, double q)
-{
-    ma_bpf2_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format = format;
-    config.channels = channels;
-    config.sampleRate = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-    config.q = q;
-
-    /* Q cannot be 0 or else it'll result in a division by 0. In this case just default to 0.707107. */
-    if (config.q == 0) {
-        config.q = 0.707107;
-    }
-
-    return config;
-}
-
-
-static MA_INLINE ma_biquad_config ma_bpf2__get_biquad_config(const ma_bpf2_config* pConfig)
-{
-    ma_biquad_config bqConfig;
-    double q;
-    double w;
-    double s;
-    double c;
-    double a;
-
-    MA_ASSERT(pConfig != NULL);
-
-    q = pConfig->q;
-    w = 2 * MA_PI_D * pConfig->cutoffFrequency / pConfig->sampleRate;
-    s = ma_sind(w);
-    c = ma_cosd(w);
-    a = s / (2*q);
-
-    bqConfig.b0 =  q * a;
-    bqConfig.b1 =  0;
-    bqConfig.b2 = -q * a;
-    bqConfig.a0 =  1 + a;
-    bqConfig.a1 = -2 * c;
-    bqConfig.a2 =  1 - a;
-
-    bqConfig.format   = pConfig->format;
-    bqConfig.channels = pConfig->channels;
-
-    return bqConfig;
-}
-
-MA_API ma_result ma_bpf2_get_heap_size(const ma_bpf2_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_biquad_config bqConfig;
-    bqConfig = ma_bpf2__get_biquad_config(pConfig);
-
-    return ma_biquad_get_heap_size(&bqConfig, pHeapSizeInBytes);
-}
-
-MA_API ma_result ma_bpf2_init_preallocated(const ma_bpf2_config* pConfig, void* pHeap, ma_bpf2* pBPF)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pBPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pBPF);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_bpf2__get_biquad_config(pConfig);
-    result = ma_biquad_init_preallocated(&bqConfig, pHeap, &pBPF->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_bpf2_init(const ma_bpf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_bpf2* pBPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_bpf2_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_bpf2_init_preallocated(pConfig, pHeap, pBPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pBPF->bq._ownsHeap = MA_TRUE;    /* <-- This will cause the biquad to take ownership of the heap and free it when it's uninitialized. */
-    return MA_SUCCESS;
-}
-
-MA_API void ma_bpf2_uninit(ma_bpf2* pBPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pBPF == NULL) {
-        return;
-    }
-
-    ma_biquad_uninit(&pBPF->bq, pAllocationCallbacks);   /* <-- This will free the heap allocation. */
-}
-
-MA_API ma_result ma_bpf2_reinit(const ma_bpf2_config* pConfig, ma_bpf2* pBPF)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pBPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_bpf2__get_biquad_config(pConfig);
-    result = ma_biquad_reinit(&bqConfig, &pBPF->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_bpf2_process_pcm_frame_s16(ma_bpf2* pBPF, ma_int16* pFrameOut, const ma_int16* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_s16(&pBPF->bq, pFrameOut, pFrameIn);
-}
-
-static MA_INLINE void ma_bpf2_process_pcm_frame_f32(ma_bpf2* pBPF, float* pFrameOut, const float* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_f32(&pBPF->bq, pFrameOut, pFrameIn);
-}
-
-MA_API ma_result ma_bpf2_process_pcm_frames(ma_bpf2* pBPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pBPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_biquad_process_pcm_frames(&pBPF->bq, pFramesOut, pFramesIn, frameCount);
-}
-
-MA_API ma_uint32 ma_bpf2_get_latency(const ma_bpf2* pBPF)
-{
-    if (pBPF == NULL) {
-        return 0;
-    }
-
-    return ma_biquad_get_latency(&pBPF->bq);
-}
-
-
-MA_API ma_bpf_config ma_bpf_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order)
-{
-    ma_bpf_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format          = format;
-    config.channels        = channels;
-    config.sampleRate      = sampleRate;
-    config.cutoffFrequency = cutoffFrequency;
-    config.order           = ma_min(order, MA_MAX_FILTER_ORDER);
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t bpf2Offset;
-} ma_bpf_heap_layout;
-
-static ma_result ma_bpf_get_heap_layout(const ma_bpf_config* pConfig, ma_bpf_heap_layout* pHeapLayout)
-{
-    ma_result result;
-    ma_uint32 bpf2Count;
-    ma_uint32 ibpf2;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->order > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* We must have an even number of order. */
-    if ((pConfig->order & 0x1) != 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    bpf2Count = pConfig->channels / 2;
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* BPF 2 */
-    pHeapLayout->bpf2Offset = pHeapLayout->sizeInBytes;
-    for (ibpf2 = 0; ibpf2 < bpf2Count; ibpf2 += 1) {
-        size_t bpf2HeapSizeInBytes;
-        ma_bpf2_config bpf2Config = ma_bpf2_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency, 0.707107);   /* <-- The "q" parameter does not matter for the purpose of calculating the heap size. */
-
-        result = ma_bpf2_get_heap_size(&bpf2Config, &bpf2HeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += sizeof(ma_bpf2) + bpf2HeapSizeInBytes;
-    }
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_bpf_reinit__internal(const ma_bpf_config* pConfig, void* pHeap, ma_bpf* pBPF, ma_bool32 isNew)
-{
-    ma_result result;
-    ma_uint32 bpf2Count;
-    ma_uint32 ibpf2;
-    ma_bpf_heap_layout heapLayout;  /* Only used if isNew is true. */
-
-    if (pBPF == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Only supporting f32 and s16. */
-    if (pConfig->format != ma_format_f32 && pConfig->format != ma_format_s16) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The format cannot be changed after initialization. */
-    if (pBPF->format != ma_format_unknown && pBPF->format != pConfig->format) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* The channel count cannot be changed after initialization. */
-    if (pBPF->channels != 0 && pBPF->channels != pConfig->channels) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pConfig->order > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* We must have an even number of order. */
-    if ((pConfig->order & 0x1) != 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    bpf2Count = pConfig->order / 2;
-
-    /* The filter order can't change between reinits. */
-    if (!isNew) {
-        if (pBPF->bpf2Count != bpf2Count) {
-            return MA_INVALID_OPERATION;
-        }
-    }
-
-    if (isNew) {
-        result = ma_bpf_get_heap_layout(pConfig, &heapLayout);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pBPF->_pHeap = pHeap;
-        MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-        pBPF->pBPF2 = (ma_bpf2*)ma_offset_ptr(pHeap, heapLayout.bpf2Offset);
-    } else {
-        MA_ZERO_OBJECT(&heapLayout);
-    }
-
-    for (ibpf2 = 0; ibpf2 < bpf2Count; ibpf2 += 1) {
-        ma_bpf2_config bpf2Config;
-        double q;
-
-        /* TODO: Calculate Q to make this a proper Butterworth filter. */
-        q = 0.707107;
-
-        bpf2Config = ma_bpf2_config_init(pConfig->format, pConfig->channels, pConfig->sampleRate, pConfig->cutoffFrequency, q);
-
-        if (isNew) {
-            size_t bpf2HeapSizeInBytes;
-
-            result = ma_bpf2_get_heap_size(&bpf2Config, &bpf2HeapSizeInBytes);
-            if (result == MA_SUCCESS) {
-                result = ma_bpf2_init_preallocated(&bpf2Config, ma_offset_ptr(pHeap, heapLayout.bpf2Offset + (sizeof(ma_bpf2) * bpf2Count) + (ibpf2 * bpf2HeapSizeInBytes)), &pBPF->pBPF2[ibpf2]);
-            }
-        } else {
-            result = ma_bpf2_reinit(&bpf2Config, &pBPF->pBPF2[ibpf2]);
-        }
-
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    pBPF->bpf2Count = bpf2Count;
-    pBPF->format    = pConfig->format;
-    pBPF->channels  = pConfig->channels;
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_bpf_get_heap_size(const ma_bpf_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_bpf_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_bpf_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_bpf_init_preallocated(const ma_bpf_config* pConfig, void* pHeap, ma_bpf* pBPF)
-{
-    if (pBPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pBPF);
-
-    return ma_bpf_reinit__internal(pConfig, pHeap, pBPF, /*isNew*/MA_TRUE);
-}
-
-MA_API ma_result ma_bpf_init(const ma_bpf_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_bpf* pBPF)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_bpf_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_bpf_init_preallocated(pConfig, pHeap, pBPF);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pBPF->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_bpf_uninit(ma_bpf* pBPF, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_uint32 ibpf2;
-
-    if (pBPF == NULL) {
-        return;
-    }
-
-    for (ibpf2 = 0; ibpf2 < pBPF->bpf2Count; ibpf2 += 1) {
-        ma_bpf2_uninit(&pBPF->pBPF2[ibpf2], pAllocationCallbacks);
-    }
-
-    if (pBPF->_ownsHeap) {
-        ma_free(pBPF->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_bpf_reinit(const ma_bpf_config* pConfig, ma_bpf* pBPF)
-{
-    return ma_bpf_reinit__internal(pConfig, NULL, pBPF, /*isNew*/MA_FALSE);
-}
-
-MA_API ma_result ma_bpf_process_pcm_frames(ma_bpf* pBPF, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_result result;
-    ma_uint32 ibpf2;
-
-    if (pBPF == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Faster path for in-place. */
-    if (pFramesOut == pFramesIn) {
-        for (ibpf2 = 0; ibpf2 < pBPF->bpf2Count; ibpf2 += 1) {
-            result = ma_bpf2_process_pcm_frames(&pBPF->pBPF2[ibpf2], pFramesOut, pFramesOut, frameCount);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-        }
-    }
-
-    /* Slightly slower path for copying. */
-    if (pFramesOut != pFramesIn) {
-        ma_uint32 iFrame;
-
-        /*  */ if (pBPF->format == ma_format_f32) {
-            /* */ float* pFramesOutF32 = (      float*)pFramesOut;
-            const float* pFramesInF32  = (const float*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                MA_COPY_MEMORY(pFramesOutF32, pFramesInF32, ma_get_bytes_per_frame(pBPF->format, pBPF->channels));
-
-                for (ibpf2 = 0; ibpf2 < pBPF->bpf2Count; ibpf2 += 1) {
-                    ma_bpf2_process_pcm_frame_f32(&pBPF->pBPF2[ibpf2], pFramesOutF32, pFramesOutF32);
-                }
-
-                pFramesOutF32 += pBPF->channels;
-                pFramesInF32  += pBPF->channels;
-            }
-        } else if (pBPF->format == ma_format_s16) {
-            /* */ ma_int16* pFramesOutS16 = (      ma_int16*)pFramesOut;
-            const ma_int16* pFramesInS16  = (const ma_int16*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                MA_COPY_MEMORY(pFramesOutS16, pFramesInS16, ma_get_bytes_per_frame(pBPF->format, pBPF->channels));
-
-                for (ibpf2 = 0; ibpf2 < pBPF->bpf2Count; ibpf2 += 1) {
-                    ma_bpf2_process_pcm_frame_s16(&pBPF->pBPF2[ibpf2], pFramesOutS16, pFramesOutS16);
-                }
-
-                pFramesOutS16 += pBPF->channels;
-                pFramesInS16  += pBPF->channels;
-            }
-        } else {
-            MA_ASSERT(MA_FALSE);
-            return MA_INVALID_OPERATION;    /* Should never hit this. */
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint32 ma_bpf_get_latency(const ma_bpf* pBPF)
-{
-    if (pBPF == NULL) {
-        return 0;
-    }
-
-    return pBPF->bpf2Count*2;
-}
-
-
-/**************************************************************************************************************************************************************
-
-Notching Filter
-
-**************************************************************************************************************************************************************/
-MA_API ma_notch2_config ma_notch2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double q, double frequency)
-{
-    ma_notch2_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format     = format;
-    config.channels   = channels;
-    config.sampleRate = sampleRate;
-    config.q          = q;
-    config.frequency  = frequency;
-
-    if (config.q == 0) {
-        config.q = 0.707107;
-    }
-
-    return config;
-}
-
-
-static MA_INLINE ma_biquad_config ma_notch2__get_biquad_config(const ma_notch2_config* pConfig)
-{
-    ma_biquad_config bqConfig;
-    double q;
-    double w;
-    double s;
-    double c;
-    double a;
-
-    MA_ASSERT(pConfig != NULL);
-
-    q = pConfig->q;
-    w = 2 * MA_PI_D * pConfig->frequency / pConfig->sampleRate;
-    s = ma_sind(w);
-    c = ma_cosd(w);
-    a = s / (2*q);
-
-    bqConfig.b0 =  1;
-    bqConfig.b1 = -2 * c;
-    bqConfig.b2 =  1;
-    bqConfig.a0 =  1 + a;
-    bqConfig.a1 = -2 * c;
-    bqConfig.a2 =  1 - a;
-
-    bqConfig.format   = pConfig->format;
-    bqConfig.channels = pConfig->channels;
-
-    return bqConfig;
-}
-
-MA_API ma_result ma_notch2_get_heap_size(const ma_notch2_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_biquad_config bqConfig;
-    bqConfig = ma_notch2__get_biquad_config(pConfig);
-
-    return ma_biquad_get_heap_size(&bqConfig, pHeapSizeInBytes);
-}
-
-MA_API ma_result ma_notch2_init_preallocated(const ma_notch2_config* pConfig, void* pHeap, ma_notch2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pFilter);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_notch2__get_biquad_config(pConfig);
-    result = ma_biquad_init_preallocated(&bqConfig, pHeap, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_notch2_init(const ma_notch2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_notch2* pFilter)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_notch2_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_notch2_init_preallocated(pConfig, pHeap, pFilter);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pFilter->bq._ownsHeap = MA_TRUE;    /* <-- This will cause the biquad to take ownership of the heap and free it when it's uninitialized. */
-    return MA_SUCCESS;
-}
-
-MA_API void ma_notch2_uninit(ma_notch2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFilter == NULL) {
-        return;
-    }
-
-    ma_biquad_uninit(&pFilter->bq, pAllocationCallbacks);   /* <-- This will free the heap allocation. */
-}
-
-MA_API ma_result ma_notch2_reinit(const ma_notch2_config* pConfig, ma_notch2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_notch2__get_biquad_config(pConfig);
-    result = ma_biquad_reinit(&bqConfig, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_notch2_process_pcm_frame_s16(ma_notch2* pFilter, ma_int16* pFrameOut, const ma_int16* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_s16(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-static MA_INLINE void ma_notch2_process_pcm_frame_f32(ma_notch2* pFilter, float* pFrameOut, const float* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_f32(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-MA_API ma_result ma_notch2_process_pcm_frames(ma_notch2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_biquad_process_pcm_frames(&pFilter->bq, pFramesOut, pFramesIn, frameCount);
-}
-
-MA_API ma_uint32 ma_notch2_get_latency(const ma_notch2* pFilter)
-{
-    if (pFilter == NULL) {
-        return 0;
-    }
-
-    return ma_biquad_get_latency(&pFilter->bq);
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-Peaking EQ Filter
-
-**************************************************************************************************************************************************************/
-MA_API ma_peak2_config ma_peak2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency)
-{
-    ma_peak2_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format     = format;
-    config.channels   = channels;
-    config.sampleRate = sampleRate;
-    config.gainDB     = gainDB;
-    config.q          = q;
-    config.frequency  = frequency;
-
-    if (config.q == 0) {
-        config.q = 0.707107;
-    }
-
-    return config;
-}
-
-
-static MA_INLINE ma_biquad_config ma_peak2__get_biquad_config(const ma_peak2_config* pConfig)
-{
-    ma_biquad_config bqConfig;
-    double q;
-    double w;
-    double s;
-    double c;
-    double a;
-    double A;
-
-    MA_ASSERT(pConfig != NULL);
-
-    q = pConfig->q;
-    w = 2 * MA_PI_D * pConfig->frequency / pConfig->sampleRate;
-    s = ma_sind(w);
-    c = ma_cosd(w);
-    a = s / (2*q);
-    A = ma_powd(10, (pConfig->gainDB / 40));
-
-    bqConfig.b0 =  1 + (a * A);
-    bqConfig.b1 = -2 * c;
-    bqConfig.b2 =  1 - (a * A);
-    bqConfig.a0 =  1 + (a / A);
-    bqConfig.a1 = -2 * c;
-    bqConfig.a2 =  1 - (a / A);
-
-    bqConfig.format   = pConfig->format;
-    bqConfig.channels = pConfig->channels;
-
-    return bqConfig;
-}
-
-MA_API ma_result ma_peak2_get_heap_size(const ma_peak2_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_biquad_config bqConfig;
-    bqConfig = ma_peak2__get_biquad_config(pConfig);
-
-    return ma_biquad_get_heap_size(&bqConfig, pHeapSizeInBytes);
-}
-
-MA_API ma_result ma_peak2_init_preallocated(const ma_peak2_config* pConfig, void* pHeap, ma_peak2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pFilter);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_peak2__get_biquad_config(pConfig);
-    result = ma_biquad_init_preallocated(&bqConfig, pHeap, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_peak2_init(const ma_peak2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_peak2* pFilter)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_peak2_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_peak2_init_preallocated(pConfig, pHeap, pFilter);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pFilter->bq._ownsHeap = MA_TRUE;    /* <-- This will cause the biquad to take ownership of the heap and free it when it's uninitialized. */
-    return MA_SUCCESS;
-}
-
-MA_API void ma_peak2_uninit(ma_peak2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFilter == NULL) {
-        return;
-    }
-
-    ma_biquad_uninit(&pFilter->bq, pAllocationCallbacks);   /* <-- This will free the heap allocation. */
-}
-
-MA_API ma_result ma_peak2_reinit(const ma_peak2_config* pConfig, ma_peak2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_peak2__get_biquad_config(pConfig);
-    result = ma_biquad_reinit(&bqConfig, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_peak2_process_pcm_frame_s16(ma_peak2* pFilter, ma_int16* pFrameOut, const ma_int16* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_s16(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-static MA_INLINE void ma_peak2_process_pcm_frame_f32(ma_peak2* pFilter, float* pFrameOut, const float* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_f32(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-MA_API ma_result ma_peak2_process_pcm_frames(ma_peak2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_biquad_process_pcm_frames(&pFilter->bq, pFramesOut, pFramesIn, frameCount);
-}
-
-MA_API ma_uint32 ma_peak2_get_latency(const ma_peak2* pFilter)
-{
-    if (pFilter == NULL) {
-        return 0;
-    }
-
-    return ma_biquad_get_latency(&pFilter->bq);
-}
-
-
-/**************************************************************************************************************************************************************
-
-Low Shelf Filter
-
-**************************************************************************************************************************************************************/
-MA_API ma_loshelf2_config ma_loshelf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double shelfSlope, double frequency)
-{
-    ma_loshelf2_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format     = format;
-    config.channels   = channels;
-    config.sampleRate = sampleRate;
-    config.gainDB     = gainDB;
-    config.shelfSlope = shelfSlope;
-    config.frequency  = frequency;
-
-    return config;
-}
-
-
-static MA_INLINE ma_biquad_config ma_loshelf2__get_biquad_config(const ma_loshelf2_config* pConfig)
-{
-    ma_biquad_config bqConfig;
-    double w;
-    double s;
-    double c;
-    double A;
-    double S;
-    double a;
-    double sqrtA;
-
-    MA_ASSERT(pConfig != NULL);
-
-    w = 2 * MA_PI_D * pConfig->frequency / pConfig->sampleRate;
-    s = ma_sind(w);
-    c = ma_cosd(w);
-    A = ma_powd(10, (pConfig->gainDB / 40));
-    S = pConfig->shelfSlope;
-    a = s/2 * ma_sqrtd((A + 1/A) * (1/S - 1) + 2);
-    sqrtA = 2*ma_sqrtd(A)*a;
-
-    bqConfig.b0 =  A * ((A + 1) - (A - 1)*c + sqrtA);
-    bqConfig.b1 =  2 * A * ((A - 1) - (A + 1)*c);
-    bqConfig.b2 =  A * ((A + 1) - (A - 1)*c - sqrtA);
-    bqConfig.a0 =  (A + 1) + (A - 1)*c + sqrtA;
-    bqConfig.a1 = -2 * ((A - 1) + (A + 1)*c);
-    bqConfig.a2 =  (A + 1) + (A - 1)*c - sqrtA;
-
-    bqConfig.format   = pConfig->format;
-    bqConfig.channels = pConfig->channels;
-
-    return bqConfig;
-}
-
-MA_API ma_result ma_loshelf2_get_heap_size(const ma_loshelf2_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_biquad_config bqConfig;
-    bqConfig = ma_loshelf2__get_biquad_config(pConfig);
-
-    return ma_biquad_get_heap_size(&bqConfig, pHeapSizeInBytes);
-}
-
-MA_API ma_result ma_loshelf2_init_preallocated(const ma_loshelf2_config* pConfig, void* pHeap, ma_loshelf2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pFilter);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_loshelf2__get_biquad_config(pConfig);
-    result = ma_biquad_init_preallocated(&bqConfig, pHeap, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_loshelf2_init(const ma_loshelf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_loshelf2* pFilter)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_loshelf2_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_loshelf2_init_preallocated(pConfig, pHeap, pFilter);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pFilter->bq._ownsHeap = MA_TRUE;    /* <-- This will cause the biquad to take ownership of the heap and free it when it's uninitialized. */
-    return MA_SUCCESS;
-}
-
-MA_API void ma_loshelf2_uninit(ma_loshelf2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFilter == NULL) {
-        return;
-    }
-
-    ma_biquad_uninit(&pFilter->bq, pAllocationCallbacks);   /* <-- This will free the heap allocation. */
-}
-
-MA_API ma_result ma_loshelf2_reinit(const ma_loshelf2_config* pConfig, ma_loshelf2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_loshelf2__get_biquad_config(pConfig);
-    result = ma_biquad_reinit(&bqConfig, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_loshelf2_process_pcm_frame_s16(ma_loshelf2* pFilter, ma_int16* pFrameOut, const ma_int16* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_s16(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-static MA_INLINE void ma_loshelf2_process_pcm_frame_f32(ma_loshelf2* pFilter, float* pFrameOut, const float* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_f32(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-MA_API ma_result ma_loshelf2_process_pcm_frames(ma_loshelf2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_biquad_process_pcm_frames(&pFilter->bq, pFramesOut, pFramesIn, frameCount);
-}
-
-MA_API ma_uint32 ma_loshelf2_get_latency(const ma_loshelf2* pFilter)
-{
-    if (pFilter == NULL) {
-        return 0;
-    }
-
-    return ma_biquad_get_latency(&pFilter->bq);
-}
-
-
-/**************************************************************************************************************************************************************
-
-High Shelf Filter
-
-**************************************************************************************************************************************************************/
-MA_API ma_hishelf2_config ma_hishelf2_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double shelfSlope, double frequency)
-{
-    ma_hishelf2_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format     = format;
-    config.channels   = channels;
-    config.sampleRate = sampleRate;
-    config.gainDB     = gainDB;
-    config.shelfSlope = shelfSlope;
-    config.frequency  = frequency;
-
-    return config;
-}
-
-
-static MA_INLINE ma_biquad_config ma_hishelf2__get_biquad_config(const ma_hishelf2_config* pConfig)
-{
-    ma_biquad_config bqConfig;
-    double w;
-    double s;
-    double c;
-    double A;
-    double S;
-    double a;
-    double sqrtA;
-
-    MA_ASSERT(pConfig != NULL);
-
-    w = 2 * MA_PI_D * pConfig->frequency / pConfig->sampleRate;
-    s = ma_sind(w);
-    c = ma_cosd(w);
-    A = ma_powd(10, (pConfig->gainDB / 40));
-    S = pConfig->shelfSlope;
-    a = s/2 * ma_sqrtd((A + 1/A) * (1/S - 1) + 2);
-    sqrtA = 2*ma_sqrtd(A)*a;
-
-    bqConfig.b0 =  A * ((A + 1) + (A - 1)*c + sqrtA);
-    bqConfig.b1 = -2 * A * ((A - 1) + (A + 1)*c);
-    bqConfig.b2 =  A * ((A + 1) + (A - 1)*c - sqrtA);
-    bqConfig.a0 =  (A + 1) - (A - 1)*c + sqrtA;
-    bqConfig.a1 =  2 * ((A - 1) - (A + 1)*c);
-    bqConfig.a2 =  (A + 1) - (A - 1)*c - sqrtA;
-
-    bqConfig.format   = pConfig->format;
-    bqConfig.channels = pConfig->channels;
-
-    return bqConfig;
-}
-
-MA_API ma_result ma_hishelf2_get_heap_size(const ma_hishelf2_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_biquad_config bqConfig;
-    bqConfig = ma_hishelf2__get_biquad_config(pConfig);
-
-    return ma_biquad_get_heap_size(&bqConfig, pHeapSizeInBytes);
-}
-
-MA_API ma_result ma_hishelf2_init_preallocated(const ma_hishelf2_config* pConfig, void* pHeap, ma_hishelf2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pFilter);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_hishelf2__get_biquad_config(pConfig);
-    result = ma_biquad_init_preallocated(&bqConfig, pHeap, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_hishelf2_init(const ma_hishelf2_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hishelf2* pFilter)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_hishelf2_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_hishelf2_init_preallocated(pConfig, pHeap, pFilter);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pFilter->bq._ownsHeap = MA_TRUE;    /* <-- This will cause the biquad to take ownership of the heap and free it when it's uninitialized. */
-    return MA_SUCCESS;
-}
-
-MA_API void ma_hishelf2_uninit(ma_hishelf2* pFilter, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFilter == NULL) {
-        return;
-    }
-
-    ma_biquad_uninit(&pFilter->bq, pAllocationCallbacks);   /* <-- This will free the heap allocation. */
-}
-
-MA_API ma_result ma_hishelf2_reinit(const ma_hishelf2_config* pConfig, ma_hishelf2* pFilter)
-{
-    ma_result result;
-    ma_biquad_config bqConfig;
-
-    if (pFilter == NULL || pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    bqConfig = ma_hishelf2__get_biquad_config(pConfig);
-    result = ma_biquad_reinit(&bqConfig, &pFilter->bq);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static MA_INLINE void ma_hishelf2_process_pcm_frame_s16(ma_hishelf2* pFilter, ma_int16* pFrameOut, const ma_int16* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_s16(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-static MA_INLINE void ma_hishelf2_process_pcm_frame_f32(ma_hishelf2* pFilter, float* pFrameOut, const float* pFrameIn)
-{
-    ma_biquad_process_pcm_frame_f32(&pFilter->bq, pFrameOut, pFrameIn);
-}
-
-MA_API ma_result ma_hishelf2_process_pcm_frames(ma_hishelf2* pFilter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pFilter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_biquad_process_pcm_frames(&pFilter->bq, pFramesOut, pFramesIn, frameCount);
-}
-
-MA_API ma_uint32 ma_hishelf2_get_latency(const ma_hishelf2* pFilter)
-{
-    if (pFilter == NULL) {
-        return 0;
-    }
-
-    return ma_biquad_get_latency(&pFilter->bq);
-}
-
-
-
-/*
-Delay
-*/
-MA_API ma_delay_config ma_delay_config_init(ma_uint32 channels, ma_uint32 sampleRate, ma_uint32 delayInFrames, float decay)
-{
-    ma_delay_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.channels      = channels;
-    config.sampleRate    = sampleRate;
-    config.delayInFrames = delayInFrames;
-    config.delayStart    = (decay == 0) ? MA_TRUE : MA_FALSE;   /* Delay the start if it looks like we're not configuring an echo. */
-    config.wet           = 1;
-    config.dry           = 1;
-    config.decay         = decay;
-
-    return config;
-}
-
-
-MA_API ma_result ma_delay_init(const ma_delay_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_delay* pDelay)
-{
-    if (pDelay == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDelay);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->decay < 0 || pConfig->decay > 1) {
-        return MA_INVALID_ARGS;
-    }
-
-    pDelay->config             = *pConfig;
-    pDelay->bufferSizeInFrames = pConfig->delayInFrames;
-    pDelay->cursor             = 0;
-
-    pDelay->pBuffer = (float*)ma_malloc((size_t)(pDelay->bufferSizeInFrames * ma_get_bytes_per_frame(ma_format_f32, pConfig->channels)), pAllocationCallbacks);
-    if (pDelay->pBuffer == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    ma_silence_pcm_frames(pDelay->pBuffer, pDelay->bufferSizeInFrames, ma_format_f32, pConfig->channels);
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_delay_uninit(ma_delay* pDelay, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pDelay == NULL) {
-        return;
-    }
-
-    ma_free(pDelay->pBuffer, pAllocationCallbacks);
-}
-
-MA_API ma_result ma_delay_process_pcm_frames(ma_delay* pDelay, void* pFramesOut, const void* pFramesIn, ma_uint32 frameCount)
-{
-    ma_uint32 iFrame;
-    ma_uint32 iChannel;
-    float* pFramesOutF32 = (float*)pFramesOut;
-    const float* pFramesInF32 = (const float*)pFramesIn;
-
-    if (pDelay == NULL || pFramesOut == NULL || pFramesIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        for (iChannel = 0; iChannel < pDelay->config.channels; iChannel += 1) {
-            ma_uint32 iBuffer = (pDelay->cursor * pDelay->config.channels) + iChannel;
-
-            if (pDelay->config.delayStart) {
-                /* Delayed start. */
-
-                /* Read */
-                pFramesOutF32[iChannel] = pDelay->pBuffer[iBuffer] * pDelay->config.wet;
-
-                /* Feedback */
-                pDelay->pBuffer[iBuffer] = (pDelay->pBuffer[iBuffer] * pDelay->config.decay) + (pFramesInF32[iChannel] * pDelay->config.dry);
-            } else {
-                /* Immediate start */
-
-                /* Feedback */
-                pDelay->pBuffer[iBuffer] = (pDelay->pBuffer[iBuffer] * pDelay->config.decay) + (pFramesInF32[iChannel] * pDelay->config.dry);
-
-                /* Read */
-                pFramesOutF32[iChannel] = pDelay->pBuffer[iBuffer] * pDelay->config.wet;
-            }
-        }
-
-        pDelay->cursor = (pDelay->cursor + 1) % pDelay->bufferSizeInFrames;
-
-        pFramesOutF32 += pDelay->config.channels;
-        pFramesInF32  += pDelay->config.channels;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_delay_set_wet(ma_delay* pDelay, float value)
-{
-    if (pDelay == NULL) {
-        return;
-    }
-
-    pDelay->config.wet = value;
-}
-
-MA_API float ma_delay_get_wet(const ma_delay* pDelay)
-{
-    if (pDelay == NULL) {
-        return 0;
-    }
-
-    return pDelay->config.wet;
-}
-
-MA_API void ma_delay_set_dry(ma_delay* pDelay, float value)
-{
-    if (pDelay == NULL) {
-        return;
-    }
-
-    pDelay->config.dry = value;
-}
-
-MA_API float ma_delay_get_dry(const ma_delay* pDelay)
-{
-    if (pDelay == NULL) {
-        return 0;
-    }
-
-    return pDelay->config.dry;
-}
-
-MA_API void ma_delay_set_decay(ma_delay* pDelay, float value)
-{
-    if (pDelay == NULL) {
-        return;
-    }
-
-    pDelay->config.decay = value;
-}
-
-MA_API float ma_delay_get_decay(const ma_delay* pDelay)
-{
-    if (pDelay == NULL) {
-        return 0;
-    }
-
-    return pDelay->config.decay;
-}
-
-
-MA_API ma_gainer_config ma_gainer_config_init(ma_uint32 channels, ma_uint32 smoothTimeInFrames)
-{
-    ma_gainer_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.channels           = channels;
-    config.smoothTimeInFrames = smoothTimeInFrames;
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t oldGainsOffset;
-    size_t newGainsOffset;
-} ma_gainer_heap_layout;
-
-static ma_result ma_gainer_get_heap_layout(const ma_gainer_config* pConfig, ma_gainer_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Old gains. */
-    pHeapLayout->oldGainsOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += sizeof(float) * pConfig->channels;
-
-    /* New gains. */
-    pHeapLayout->newGainsOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += sizeof(float) * pConfig->channels;
-
-    /* Alignment. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_gainer_get_heap_size(const ma_gainer_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_gainer_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_gainer_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_gainer_init_preallocated(const ma_gainer_config* pConfig, void* pHeap, ma_gainer* pGainer)
-{
-    ma_result result;
-    ma_gainer_heap_layout heapLayout;
-    ma_uint32 iChannel;
-
-    if (pGainer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pGainer);
-
-    if (pConfig == NULL || pHeap == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_gainer_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pGainer->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pGainer->pOldGains = (float*)ma_offset_ptr(pHeap, heapLayout.oldGainsOffset);
-    pGainer->pNewGains = (float*)ma_offset_ptr(pHeap, heapLayout.newGainsOffset);
-    pGainer->masterVolume = 1;
-
-    pGainer->config = *pConfig;
-    pGainer->t      = (ma_uint32)-1;  /* No interpolation by default. */
-
-    for (iChannel = 0; iChannel < pConfig->channels; iChannel += 1) {
-        pGainer->pOldGains[iChannel] = 1;
-        pGainer->pNewGains[iChannel] = 1;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_gainer_init(const ma_gainer_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_gainer* pGainer)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_gainer_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to retrieve the size of the heap allocation. */
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_gainer_init_preallocated(pConfig, pHeap, pGainer);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pGainer->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_gainer_uninit(ma_gainer* pGainer, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pGainer == NULL) {
-        return;
-    }
-
-    if (pGainer->_ownsHeap) {
-        ma_free(pGainer->_pHeap, pAllocationCallbacks);
-    }
-}
-
-static float ma_gainer_calculate_current_gain(const ma_gainer* pGainer, ma_uint32 channel)
-{
-    float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
-    return ma_mix_f32_fast(pGainer->pOldGains[channel], pGainer->pNewGains[channel], a);
-}
-
-static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_internal(ma_gainer * pGainer, void* MA_RESTRICT pFramesOut, const void* MA_RESTRICT pFramesIn, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannel;
-    ma_uint64 interpolatedFrameCount;
-
-    MA_ASSERT(pGainer != NULL);
-
-    /*
-    We don't necessarily need to apply a linear interpolation for the entire frameCount frames. When
-    linear interpolation is not needed we can do a simple volume adjustment which will be more
-    efficient than a lerp with an alpha value of 1.
-
-    To do this, all we need to do is determine how many frames need to have a lerp applied. Then we
-    just process that number of frames with linear interpolation. After that we run on an optimized
-    path which just applies the new gains without a lerp.
-    */
-    if (pGainer->t >= pGainer->config.smoothTimeInFrames) {
-        interpolatedFrameCount = 0;
-    } else {
-        interpolatedFrameCount = pGainer->t - pGainer->config.smoothTimeInFrames;
-        if (interpolatedFrameCount > frameCount) {
-            interpolatedFrameCount = frameCount;
-        }
-    }
-
-    /*
-    Start off with our interpolated frames. When we do this, we'll adjust frameCount and our pointers
-    so that the fast path can work naturally without consideration of the interpolated path.
-    */
-    if (interpolatedFrameCount > 0) {
-        /* We can allow the input and output buffers to be null in which case we'll just update the internal timer. */
-        if (pFramesOut != NULL && pFramesIn != NULL) {
-            /*
-            All we're really doing here is moving the old gains towards the new gains. We don't want to
-            be modifying the gains inside the ma_gainer object because that will break things. Instead
-            we can make a copy here on the stack. For extreme channel counts we can fall back to a slower
-            implementation which just uses a standard lerp.
-            */
-            float* pFramesOutF32 = (float*)pFramesOut;
-            const float* pFramesInF32 = (const float*)pFramesIn;
-            float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
-            float d = 1.0f / pGainer->config.smoothTimeInFrames;
-
-            if (pGainer->config.channels <= 32) {
-                float pRunningGain[32];
-                float pRunningGainDelta[32];    /* Could this be heap-allocated as part of the ma_gainer object? */
-
-                /* Initialize the running gain. */
-                for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                    float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
-                    pRunningGainDelta[iChannel] = t * d;
-                    pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
-                }
-
-                iFrame = 0;
-
-                /* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
-                if (pGainer->config.channels == 2) {
-                #if defined(MA_SUPPORT_SSE2)
-                    if (ma_has_sse2()) {
-                        ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
-
-                        /* Expand some arrays so we can have a clean SIMD loop below. */
-                        __m128 runningGainDelta0 = _mm_set_ps(pRunningGainDelta[1], pRunningGainDelta[0], pRunningGainDelta[1], pRunningGainDelta[0]);
-                        __m128 runningGain0      = _mm_set_ps(pRunningGain[1] + pRunningGainDelta[1], pRunningGain[0] + pRunningGainDelta[0], pRunningGain[1], pRunningGain[0]);
-
-                        for (; iFrame < unrolledLoopCount; iFrame += 1) {
-                            _mm_storeu_ps(&pFramesOutF32[iFrame*4 + 0], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*4 + 0]), runningGain0));
-                            runningGain0 = _mm_add_ps(runningGain0, runningGainDelta0);
-                        }
-
-                        iFrame = unrolledLoopCount << 1;
-                    } else
-                #endif
-                    {
-                        /*
-                        Two different scalar implementations here. Clang (and I assume GCC) will vectorize
-                        both of these, but the bottom version results in a nicer vectorization with less
-                        instructions emitted. The problem, however, is that the bottom version runs slower
-                        when compiled with MSVC. The top version will be partially vectorized by MSVC.
-                        */
-                    #if defined(_MSC_VER) && !defined(__clang__)
-                        ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
-
-                        /* Expand some arrays so we can have a clean 4x SIMD operation in the loop. */
-                        pRunningGainDelta[2] = pRunningGainDelta[0];
-                        pRunningGainDelta[3] = pRunningGainDelta[1];
-                        pRunningGain[2] = pRunningGain[0] + pRunningGainDelta[0];
-                        pRunningGain[3] = pRunningGain[1] + pRunningGainDelta[1];
-
-                        for (; iFrame < unrolledLoopCount; iFrame += 1) {
-                            pFramesOutF32[iFrame*4 + 0] = pFramesInF32[iFrame*4 + 0] * pRunningGain[0];
-                            pFramesOutF32[iFrame*4 + 1] = pFramesInF32[iFrame*4 + 1] * pRunningGain[1];
-                            pFramesOutF32[iFrame*4 + 2] = pFramesInF32[iFrame*4 + 2] * pRunningGain[2];
-                            pFramesOutF32[iFrame*4 + 3] = pFramesInF32[iFrame*4 + 3] * pRunningGain[3];
-
-                            /* Move the running gain forward towards the new gain. */
-                            pRunningGain[0] += pRunningGainDelta[0];
-                            pRunningGain[1] += pRunningGainDelta[1];
-                            pRunningGain[2] += pRunningGainDelta[2];
-                            pRunningGain[3] += pRunningGainDelta[3];
-                        }
-
-                        iFrame = unrolledLoopCount << 1;
-                    #else
-                        for (; iFrame < interpolatedFrameCount; iFrame += 1) {
-                            for (iChannel = 0; iChannel < 2; iChannel += 1) {
-                                pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
-                            }
-
-                            for (iChannel = 0; iChannel < 2; iChannel += 1) {
-                                pRunningGain[iChannel] += pRunningGainDelta[iChannel];
-                            }
-                        }
-                    #endif
-                    }
-                } else if (pGainer->config.channels == 6) {
-                #if defined(MA_SUPPORT_SSE2)
-                    if (ma_has_sse2()) {
-                        /*
-                        For 6 channels things are a bit more complicated because 6 isn't cleanly divisible by 4. We need to do 2 frames
-                        at a time, meaning we'll be doing 12 samples in a group. Like the stereo case we'll need to expand some arrays
-                        so we can do clean 4x SIMD operations.
-                        */
-                        ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
-
-                        /* Expand some arrays so we can have a clean SIMD loop below. */
-                        __m128 runningGainDelta0 = _mm_set_ps(pRunningGainDelta[3], pRunningGainDelta[2], pRunningGainDelta[1], pRunningGainDelta[0]);
-                        __m128 runningGainDelta1 = _mm_set_ps(pRunningGainDelta[1], pRunningGainDelta[0], pRunningGainDelta[5], pRunningGainDelta[4]);
-                        __m128 runningGainDelta2 = _mm_set_ps(pRunningGainDelta[5], pRunningGainDelta[4], pRunningGainDelta[3], pRunningGainDelta[2]);
-
-                        __m128 runningGain0      = _mm_set_ps(pRunningGain[3],                        pRunningGain[2],                        pRunningGain[1],                        pRunningGain[0]);
-                        __m128 runningGain1      = _mm_set_ps(pRunningGain[1] + pRunningGainDelta[1], pRunningGain[0] + pRunningGainDelta[0], pRunningGain[5],                        pRunningGain[4]);
-                        __m128 runningGain2      = _mm_set_ps(pRunningGain[5] + pRunningGainDelta[5], pRunningGain[4] + pRunningGainDelta[4], pRunningGain[3] + pRunningGainDelta[3], pRunningGain[2] + pRunningGainDelta[2]);
-
-                        for (; iFrame < unrolledLoopCount; iFrame += 1) {
-                            _mm_storeu_ps(&pFramesOutF32[iFrame*12 + 0], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*12 + 0]), runningGain0));
-                            _mm_storeu_ps(&pFramesOutF32[iFrame*12 + 4], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*12 + 4]), runningGain1));
-                            _mm_storeu_ps(&pFramesOutF32[iFrame*12 + 8], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*12 + 8]), runningGain2));
-
-                            runningGain0 = _mm_add_ps(runningGain0, runningGainDelta0);
-                            runningGain1 = _mm_add_ps(runningGain1, runningGainDelta1);
-                            runningGain2 = _mm_add_ps(runningGain2, runningGainDelta2);
-                        }
-
-                        iFrame = unrolledLoopCount << 1;
-                    } else
-                #endif
-                    {
-                        for (; iFrame < interpolatedFrameCount; iFrame += 1) {
-                            for (iChannel = 0; iChannel < 6; iChannel += 1) {
-                                pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
-                            }
-
-                            /* Move the running gain forward towards the new gain. */
-                            for (iChannel = 0; iChannel < 6; iChannel += 1) {
-                                pRunningGain[iChannel] += pRunningGainDelta[iChannel];
-                            }
-                        }
-                    }
-                } else if (pGainer->config.channels == 8) {
-                    /* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
-                #if defined(MA_SUPPORT_SSE2)
-                    if (ma_has_sse2()) {
-                        __m128 runningGainDelta0 = _mm_loadu_ps(&pRunningGainDelta[0]);
-                        __m128 runningGainDelta1 = _mm_loadu_ps(&pRunningGainDelta[4]);
-                        __m128 runningGain0      = _mm_loadu_ps(&pRunningGain[0]);
-                        __m128 runningGain1      = _mm_loadu_ps(&pRunningGain[4]);
-
-                        for (; iFrame < interpolatedFrameCount; iFrame += 1) {
-                            _mm_storeu_ps(&pFramesOutF32[iFrame*8 + 0], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*8 + 0]), runningGain0));
-                            _mm_storeu_ps(&pFramesOutF32[iFrame*8 + 4], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*8 + 4]), runningGain1));
-
-                            runningGain0 = _mm_add_ps(runningGain0, runningGainDelta0);
-                            runningGain1 = _mm_add_ps(runningGain1, runningGainDelta1);
-                        }
-                    } else
-                #endif
-                    {
-                        /* This is crafted so that it auto-vectorizes when compiled with Clang. */
-                        for (; iFrame < interpolatedFrameCount; iFrame += 1) {
-                            for (iChannel = 0; iChannel < 8; iChannel += 1) {
-                                pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
-                            }
-
-                            /* Move the running gain forward towards the new gain. */
-                            for (iChannel = 0; iChannel < 8; iChannel += 1) {
-                                pRunningGain[iChannel] += pRunningGainDelta[iChannel];
-                            }
-                        }
-                    }
-                }
-
-                for (; iFrame < interpolatedFrameCount; iFrame += 1) {
-                    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                        pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
-                        pRunningGain[iChannel] += pRunningGainDelta[iChannel];
-                    }
-                }
-            } else {
-                /* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
-                for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
-                    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                        pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
-                    }
-
-                    a += d;
-                }
-            }
-        }
-
-        /* Make sure the timer is updated. */
-        pGainer->t = (ma_uint32)ma_min(pGainer->t + interpolatedFrameCount, pGainer->config.smoothTimeInFrames);
-
-        /* Adjust our arguments so the next part can work normally. */
-        frameCount -= interpolatedFrameCount;
-        pFramesOut  = ma_offset_ptr(pFramesOut, interpolatedFrameCount * sizeof(float));
-        pFramesIn   = ma_offset_ptr(pFramesIn,  interpolatedFrameCount * sizeof(float));
-    }
-
-    /* All we need to do here is apply the new gains using an optimized path. */
-    if (pFramesOut != NULL && pFramesIn != NULL) {
-        if (pGainer->config.channels <= 32) {
-            float gains[32];
-            for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
-            }
-
-            ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
-        } else {
-            /* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                    ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
-                }
-            }
-        }
-    }
-
-    /* Now that some frames have been processed we need to make sure future changes to the gain are interpolated. */
-    if (pGainer->t == (ma_uint32)-1) {
-        pGainer->t  = (ma_uint32)ma_min(pGainer->config.smoothTimeInFrames, frameCount);
-    }
-
-#if 0
-    if (pGainer->t >= pGainer->config.smoothTimeInFrames) {
-        /* Fast path. No gain calculation required. */
-        ma_copy_and_apply_volume_factor_per_channel_f32(pFramesOutF32, pFramesInF32, frameCount, pGainer->config.channels, pGainer->pNewGains);
-        ma_apply_volume_factor_f32(pFramesOutF32, frameCount * pGainer->config.channels, pGainer->masterVolume);
-
-        /* Now that some frames have been processed we need to make sure future changes to the gain are interpolated. */
-        if (pGainer->t == (ma_uint32)-1) {
-            pGainer->t = pGainer->config.smoothTimeInFrames;
-        }
-    } else {
-        /* Slow path. Need to interpolate the gain for each channel individually. */
-
-        /* We can allow the input and output buffers to be null in which case we'll just update the internal timer. */
-        if (pFramesOut != NULL && pFramesIn != NULL) {
-            float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
-            float d = 1.0f / pGainer->config.smoothTimeInFrames;
-            ma_uint32 channelCount = pGainer->config.channels;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channelCount; iChannel += 1) {
-                    pFramesOutF32[iChannel] = pFramesInF32[iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
-                }
-
-                pFramesOutF32 += channelCount;
-                pFramesInF32  += channelCount;
-
-                a += d;
-                if (a > 1) {
-                    a = 1;
-                }
-            }
-        }
-
-        pGainer->t = (ma_uint32)ma_min(pGainer->t + frameCount, pGainer->config.smoothTimeInFrames);
-
-    #if 0   /* Reference implementation. */
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            /* We can allow the input and output buffers to be null in which case we'll just update the internal timer. */
-            if (pFramesOut != NULL && pFramesIn != NULL) {
-                for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-                    pFramesOutF32[iFrame * pGainer->config.channels + iChannel] = pFramesInF32[iFrame * pGainer->config.channels + iChannel] * ma_gainer_calculate_current_gain(pGainer, iChannel) * pGainer->masterVolume;
-                }
-            }
-
-            /* Move interpolation time forward, but don't go beyond our smoothing time. */
-            pGainer->t = ma_min(pGainer->t + 1, pGainer->config.smoothTimeInFrames);
-        }
-    #endif
-    }
-#endif
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_gainer_process_pcm_frames(ma_gainer* pGainer, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pGainer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    ma_gainer_process_pcm_frames_internal() marks pFramesOut and pFramesIn with MA_RESTRICT which
-    helps with auto-vectorization.
-    */
-    return ma_gainer_process_pcm_frames_internal(pGainer, pFramesOut, pFramesIn, frameCount);
-}
-
-static void ma_gainer_set_gain_by_index(ma_gainer* pGainer, float newGain, ma_uint32 iChannel)
-{
-    pGainer->pOldGains[iChannel] = ma_gainer_calculate_current_gain(pGainer, iChannel);
-    pGainer->pNewGains[iChannel] = newGain;
-}
-
-static void ma_gainer_reset_smoothing_time(ma_gainer* pGainer)
-{
-    if (pGainer->t == (ma_uint32)-1) {
-        pGainer->t = pGainer->config.smoothTimeInFrames;    /* No smoothing required for initial gains setting. */
-    } else {
-        pGainer->t = 0;
-    }
-}
-
-MA_API ma_result ma_gainer_set_gain(ma_gainer* pGainer, float newGain)
-{
-    ma_uint32 iChannel;
-
-    if (pGainer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-        ma_gainer_set_gain_by_index(pGainer, newGain, iChannel);
-    }
-
-    /* The smoothing time needs to be reset to ensure we always interpolate by the configured smoothing time, but only if it's not the first setting. */
-    ma_gainer_reset_smoothing_time(pGainer);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_gainer_set_gains(ma_gainer* pGainer, float* pNewGains)
-{
-    ma_uint32 iChannel;
-
-    if (pGainer == NULL || pNewGains == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
-        ma_gainer_set_gain_by_index(pGainer, pNewGains[iChannel], iChannel);
-    }
-
-    /* The smoothing time needs to be reset to ensure we always interpolate by the configured smoothing time, but only if it's not the first setting. */
-    ma_gainer_reset_smoothing_time(pGainer);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_gainer_set_master_volume(ma_gainer* pGainer, float volume)
-{
-    if (pGainer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pGainer->masterVolume = volume;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_gainer_get_master_volume(const ma_gainer* pGainer, float* pVolume)
-{
-    if (pGainer == NULL || pVolume == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pVolume = pGainer->masterVolume;
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_panner_config ma_panner_config_init(ma_format format, ma_uint32 channels)
-{
-    ma_panner_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format   = format;
-    config.channels = channels;
-    config.mode     = ma_pan_mode_balance;  /* Set to balancing mode by default because it's consistent with other audio engines and most likely what the caller is expecting. */
-    config.pan      = 0;
-
-    return config;
-}
-
-
-MA_API ma_result ma_panner_init(const ma_panner_config* pConfig, ma_panner* pPanner)
-{
-    if (pPanner == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pPanner);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pPanner->format   = pConfig->format;
-    pPanner->channels = pConfig->channels;
-    pPanner->mode     = pConfig->mode;
-    pPanner->pan      = pConfig->pan;
-
-    return MA_SUCCESS;
-}
-
-static void ma_stereo_balance_pcm_frames_f32(float* pFramesOut, const float* pFramesIn, ma_uint64 frameCount, float pan)
-{
-    ma_uint64 iFrame;
-
-    if (pan > 0) {
-        float factor = 1.0f - pan;
-        if (pFramesOut == pFramesIn) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                pFramesOut[iFrame*2 + 0] = pFramesIn[iFrame*2 + 0] * factor;
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                pFramesOut[iFrame*2 + 0] = pFramesIn[iFrame*2 + 0] * factor;
-                pFramesOut[iFrame*2 + 1] = pFramesIn[iFrame*2 + 1];
-            }
-        }
-    } else {
-        float factor = 1.0f + pan;
-        if (pFramesOut == pFramesIn) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                pFramesOut[iFrame*2 + 1] = pFramesIn[iFrame*2 + 1] * factor;
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                pFramesOut[iFrame*2 + 0] = pFramesIn[iFrame*2 + 0];
-                pFramesOut[iFrame*2 + 1] = pFramesIn[iFrame*2 + 1] * factor;
-            }
-        }
-    }
-}
-
-static void ma_stereo_balance_pcm_frames(void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount, ma_format format, float pan)
-{
-    if (pan == 0) {
-        /* Fast path. No panning required. */
-        if (pFramesOut == pFramesIn) {
-            /* No-op */
-        } else {
-            ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, format, 2);
-        }
-
-        return;
-    }
-
-    switch (format) {
-        case ma_format_f32: ma_stereo_balance_pcm_frames_f32((float*)pFramesOut, (float*)pFramesIn, frameCount, pan); break;
-
-        /* Unknown format. Just copy. */
-        default:
-        {
-            ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, format, 2);
-        } break;
-    }
-}
-
-
-static void ma_stereo_pan_pcm_frames_f32(float* pFramesOut, const float* pFramesIn, ma_uint64 frameCount, float pan)
-{
-    ma_uint64 iFrame;
-
-    if (pan > 0) {
-        float factorL0 = 1.0f - pan;
-        float factorL1 = 0.0f + pan;
-
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float sample0 = (pFramesIn[iFrame*2 + 0] * factorL0);
-            float sample1 = (pFramesIn[iFrame*2 + 0] * factorL1) + pFramesIn[iFrame*2 + 1];
-
-            pFramesOut[iFrame*2 + 0] = sample0;
-            pFramesOut[iFrame*2 + 1] = sample1;
-        }
-    } else {
-        float factorR0 = 0.0f - pan;
-        float factorR1 = 1.0f + pan;
-
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float sample0 = pFramesIn[iFrame*2 + 0] + (pFramesIn[iFrame*2 + 1] * factorR0);
-            float sample1 =                           (pFramesIn[iFrame*2 + 1] * factorR1);
-
-            pFramesOut[iFrame*2 + 0] = sample0;
-            pFramesOut[iFrame*2 + 1] = sample1;
-        }
-    }
-}
-
-static void ma_stereo_pan_pcm_frames(void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount, ma_format format, float pan)
-{
-    if (pan == 0) {
-        /* Fast path. No panning required. */
-        if (pFramesOut == pFramesIn) {
-            /* No-op */
-        } else {
-            ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, format, 2);
-        }
-
-        return;
-    }
-
-    switch (format) {
-        case ma_format_f32: ma_stereo_pan_pcm_frames_f32((float*)pFramesOut, (float*)pFramesIn, frameCount, pan); break;
-
-        /* Unknown format. Just copy. */
-        default:
-        {
-            ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, format, 2);
-        } break;
-    }
-}
-
-MA_API ma_result ma_panner_process_pcm_frames(ma_panner* pPanner, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pPanner == NULL || pFramesOut == NULL || pFramesIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pPanner->channels == 2) {
-        /* Stereo case. For now assume channel 0 is left and channel right is 1, but should probably add support for a channel map. */
-        if (pPanner->mode == ma_pan_mode_balance) {
-            ma_stereo_balance_pcm_frames(pFramesOut, pFramesIn, frameCount, pPanner->format, pPanner->pan);
-        } else {
-            ma_stereo_pan_pcm_frames(pFramesOut, pFramesIn, frameCount, pPanner->format, pPanner->pan);
-        }
-    } else {
-        if (pPanner->channels == 1) {
-            /* Panning has no effect on mono streams. */
-            ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, pPanner->format, pPanner->channels);
-        } else {
-            /* For now we're not going to support non-stereo set ups. Not sure how I want to handle this case just yet. */
-            ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, pPanner->format, pPanner->channels);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_panner_set_mode(ma_panner* pPanner, ma_pan_mode mode)
-{
-    if (pPanner == NULL) {
-        return;
-    }
-
-    pPanner->mode = mode;
-}
-
-MA_API ma_pan_mode ma_panner_get_mode(const ma_panner* pPanner)
-{
-    if (pPanner == NULL) {
-        return ma_pan_mode_balance;
-    }
-
-    return pPanner->mode;
-}
-
-MA_API void ma_panner_set_pan(ma_panner* pPanner, float pan)
-{
-    if (pPanner == NULL) {
-        return;
-    }
-
-    pPanner->pan = ma_clamp(pan, -1.0f, 1.0f);
-}
-
-MA_API float ma_panner_get_pan(const ma_panner* pPanner)
-{
-    if (pPanner == NULL) {
-        return 0;
-    }
-
-    return pPanner->pan;
-}
-
-
-
-
-MA_API ma_fader_config ma_fader_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate)
-{
-    ma_fader_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format     = format;
-    config.channels   = channels;
-    config.sampleRate = sampleRate;
-
-    return config;
-}
-
-
-MA_API ma_result ma_fader_init(const ma_fader_config* pConfig, ma_fader* pFader)
-{
-    if (pFader == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pFader);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Only f32 is supported for now. */
-    if (pConfig->format != ma_format_f32) {
-        return MA_INVALID_ARGS;
-    }
-
-    pFader->config         = *pConfig;
-    pFader->volumeBeg      = 1;
-    pFader->volumeEnd      = 1;
-    pFader->lengthInFrames = 0;
-    pFader->cursorInFrames = 0;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_fader_process_pcm_frames(ma_fader* pFader, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pFader == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If the cursor is still negative we need to just copy the absolute number of those frames, but no more than frameCount. */
-    if (pFader->cursorInFrames < 0) {
-        ma_uint64 absCursorInFrames = (ma_uint64)0 - pFader->cursorInFrames;
-        if (absCursorInFrames > frameCount) {
-            absCursorInFrames = frameCount;
-        }
-
-        ma_copy_pcm_frames(pFramesOut, pFramesIn, absCursorInFrames, pFader->config.format, pFader->config.channels);
-
-        pFader->cursorInFrames += absCursorInFrames;
-        frameCount -= absCursorInFrames;
-        pFramesOut  = ma_offset_ptr(pFramesOut, ma_get_bytes_per_frame(pFader->config.format, pFader->config.channels)*absCursorInFrames);
-        pFramesIn   = ma_offset_ptr(pFramesIn,  ma_get_bytes_per_frame(pFader->config.format, pFader->config.channels)*absCursorInFrames);
-    }
-
-    if (pFader->cursorInFrames >= 0) {
-        /*
-        For now we need to clamp frameCount so that the cursor never overflows 32-bits. This is required for
-        the conversion to a float which we use for the linear interpolation. This might be changed later.
-        */
-        if (frameCount + pFader->cursorInFrames > UINT_MAX) {
-            frameCount = UINT_MAX - pFader->cursorInFrames;
-        }
-
-        /* Optimized path if volumeBeg and volumeEnd are equal. */
-        if (pFader->volumeBeg == pFader->volumeEnd) {
-            if (pFader->volumeBeg == 1) {
-                /* Straight copy. */
-                ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, pFader->config.format, pFader->config.channels);
-            } else {
-                /* Copy with volume. */
-                ma_copy_and_apply_volume_and_clip_pcm_frames(pFramesOut, pFramesIn, frameCount, pFader->config.format, pFader->config.channels, pFader->volumeBeg);
-            }
-        } else {
-            /* Slower path. Volumes are different, so may need to do an interpolation. */
-            if ((ma_uint64)pFader->cursorInFrames >= pFader->lengthInFrames) {
-                /* Fast path. We've gone past the end of the fade period so just apply the end volume to all samples. */
-                ma_copy_and_apply_volume_and_clip_pcm_frames(pFramesOut, pFramesIn, frameCount, pFader->config.format, pFader->config.channels, pFader->volumeEnd);
-            } else {
-                /* Slow path. This is where we do the actual fading. */
-                ma_uint64 iFrame;
-                ma_uint32 iChannel;
-
-                /* For now we only support f32. Support for other formats might be added later. */
-                if (pFader->config.format == ma_format_f32) {
-                    const float* pFramesInF32  = (const float*)pFramesIn;
-                    /* */ float* pFramesOutF32 = (      float*)pFramesOut;
-
-                    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                        float a = (ma_uint32)ma_min(pFader->cursorInFrames + iFrame, pFader->lengthInFrames) / (float)((ma_uint32)pFader->lengthInFrames);   /* Safe cast due to the frameCount clamp at the top of this function. */
-                        float volume = ma_mix_f32_fast(pFader->volumeBeg, pFader->volumeEnd, a);
-
-                        for (iChannel = 0; iChannel < pFader->config.channels; iChannel += 1) {
-                            pFramesOutF32[iFrame*pFader->config.channels + iChannel] = pFramesInF32[iFrame*pFader->config.channels + iChannel] * volume;
-                        }
-                    }
-                } else {
-                    return MA_NOT_IMPLEMENTED;
-                }
-            }
-        }
-    }
-
-    pFader->cursorInFrames += frameCount;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_fader_get_data_format(const ma_fader* pFader, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate)
-{
-    if (pFader == NULL) {
-        return;
-    }
-
-    if (pFormat != NULL) {
-        *pFormat = pFader->config.format;
-    }
-
-    if (pChannels != NULL) {
-        *pChannels = pFader->config.channels;
-    }
-
-    if (pSampleRate != NULL) {
-        *pSampleRate = pFader->config.sampleRate;
-    }
-}
-
-MA_API void ma_fader_set_fade(ma_fader* pFader, float volumeBeg, float volumeEnd, ma_uint64 lengthInFrames)
-{
-    ma_fader_set_fade_ex(pFader, volumeBeg, volumeEnd, lengthInFrames, 0);
-}
-
-MA_API void ma_fader_set_fade_ex(ma_fader* pFader, float volumeBeg, float volumeEnd, ma_uint64 lengthInFrames, ma_int64 startOffsetInFrames)
-{
-    if (pFader == NULL) {
-        return;
-    }
-
-    /* If the volume is negative, use current volume. */
-    if (volumeBeg < 0) {
-        volumeBeg = ma_fader_get_current_volume(pFader);
-    }
-
-    /*
-    The length needs to be clamped to 32-bits due to how we convert it to a float for linear
-    interpolation reasons. I might change this requirement later, but for now it's not important.
-    */
-    if (lengthInFrames > UINT_MAX) {
-        lengthInFrames = UINT_MAX;
-    }
-
-    /* The start offset needs to be clamped to ensure it doesn't overflow a signed number. */
-    if (startOffsetInFrames > INT_MAX) {
-        startOffsetInFrames = INT_MAX;
-    }
-
-    pFader->volumeBeg      = volumeBeg;
-    pFader->volumeEnd      = volumeEnd;
-    pFader->lengthInFrames = lengthInFrames;
-    pFader->cursorInFrames = -startOffsetInFrames;
-}
-
-MA_API float ma_fader_get_current_volume(const ma_fader* pFader)
-{
-    if (pFader == NULL) {
-        return 0.0f;
-    }
-
-    /* Any frames prior to the start of the fade period will be at unfaded volume. */
-    if (pFader->cursorInFrames < 0) {
-        return 1.0f;
-    }
-
-    /* The current volume depends on the position of the cursor. */
-    if (pFader->cursorInFrames == 0) {
-        return pFader->volumeBeg;
-    } else if ((ma_uint64)pFader->cursorInFrames >= pFader->lengthInFrames) {   /* Safe case because the < 0 case was checked above. */
-        return pFader->volumeEnd;
-    } else {
-        /* The cursor is somewhere inside the fading period. We can figure this out with a simple linear interpoluation between volumeBeg and volumeEnd based on our cursor position. */
-        return ma_mix_f32_fast(pFader->volumeBeg, pFader->volumeEnd, (ma_uint32)pFader->cursorInFrames / (float)((ma_uint32)pFader->lengthInFrames));    /* Safe cast to uint32 because we clamp it in ma_fader_process_pcm_frames(). */
-    }
-}
-
-
-
-
-
-MA_API ma_vec3f ma_vec3f_init_3f(float x, float y, float z)
-{
-    ma_vec3f v;
-
-    v.x = x;
-    v.y = y;
-    v.z = z;
-
-    return v;
-}
-
-MA_API ma_vec3f ma_vec3f_sub(ma_vec3f a, ma_vec3f b)
-{
-    return ma_vec3f_init_3f(
-        a.x - b.x,
-        a.y - b.y,
-        a.z - b.z
-    );
-}
-
-MA_API ma_vec3f ma_vec3f_neg(ma_vec3f a)
-{
-    return ma_vec3f_init_3f(
-        -a.x,
-        -a.y,
-        -a.z
-    );
-}
-
-MA_API float ma_vec3f_dot(ma_vec3f a, ma_vec3f b)
-{
-    return a.x*b.x + a.y*b.y + a.z*b.z;
-}
-
-MA_API float ma_vec3f_len2(ma_vec3f v)
-{
-    return ma_vec3f_dot(v, v);
-}
-
-MA_API float ma_vec3f_len(ma_vec3f v)
-{
-    return (float)ma_sqrtd(ma_vec3f_len2(v));
-}
-
-
-
-MA_API float ma_vec3f_dist(ma_vec3f a, ma_vec3f b)
-{
-    return ma_vec3f_len(ma_vec3f_sub(a, b));
-}
-
-MA_API ma_vec3f ma_vec3f_normalize(ma_vec3f v)
-{
-    float invLen;
-    float len2 = ma_vec3f_len2(v);
-    if (len2 == 0) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    invLen = ma_rsqrtf(len2);
-    v.x *= invLen;
-    v.y *= invLen;
-    v.z *= invLen;
-
-    return v;
-}
-
-MA_API ma_vec3f ma_vec3f_cross(ma_vec3f a, ma_vec3f b)
-{
-    return ma_vec3f_init_3f(
-        a.y*b.z - a.z*b.y,
-        a.z*b.x - a.x*b.z,
-        a.x*b.y - a.y*b.x
-    );
-}
-
-
-MA_API void ma_atomic_vec3f_init(ma_atomic_vec3f* v, ma_vec3f value)
-{
-    v->v = value;
-    v->lock = 0;    /* Important this is initialized to 0. */
-}
-
-MA_API void ma_atomic_vec3f_set(ma_atomic_vec3f* v, ma_vec3f value)
-{
-    ma_spinlock_lock(&v->lock);
-    {
-        v->v = value;
-    }
-    ma_spinlock_unlock(&v->lock);
-}
-
-MA_API ma_vec3f ma_atomic_vec3f_get(ma_atomic_vec3f* v)
-{
-    ma_vec3f r;
-
-    ma_spinlock_lock(&v->lock);
-    {
-        r = v->v;
-    }
-    ma_spinlock_unlock(&v->lock);
-
-    return r;
-}
-
-
-
-static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChannelMapOut, ma_uint32 channelsOut, const float* pFramesIn, const ma_channel* pChannelMapIn, ma_uint32 channelsIn, ma_uint64 frameCount, ma_channel_mix_mode mode, ma_mono_expansion_mode monoExpansionMode);
-static ma_bool32 ma_is_spatial_channel_position(ma_channel channelPosition);
-
-
-#ifndef MA_DEFAULT_SPEED_OF_SOUND
-#define MA_DEFAULT_SPEED_OF_SOUND   343.3f
-#endif
-
-/*
-These vectors represent the direction that speakers are facing from the center point. They're used
-for panning in the spatializer. Must be normalized.
-*/
-static ma_vec3f g_maChannelDirections[MA_CHANNEL_POSITION_COUNT] = {
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_NONE */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_MONO */
-    {-0.7071f,  0.0f,    -0.7071f },  /* MA_CHANNEL_FRONT_LEFT */
-    {+0.7071f,  0.0f,    -0.7071f },  /* MA_CHANNEL_FRONT_RIGHT */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_FRONT_CENTER */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_LFE */
-    {-0.7071f,  0.0f,    +0.7071f },  /* MA_CHANNEL_BACK_LEFT */
-    {+0.7071f,  0.0f,    +0.7071f },  /* MA_CHANNEL_BACK_RIGHT */
-    {-0.3162f,  0.0f,    -0.9487f },  /* MA_CHANNEL_FRONT_LEFT_CENTER */
-    {+0.3162f,  0.0f,    -0.9487f },  /* MA_CHANNEL_FRONT_RIGHT_CENTER */
-    { 0.0f,     0.0f,    +1.0f    },  /* MA_CHANNEL_BACK_CENTER */
-    {-1.0f,     0.0f,     0.0f    },  /* MA_CHANNEL_SIDE_LEFT */
-    {+1.0f,     0.0f,     0.0f    },  /* MA_CHANNEL_SIDE_RIGHT */
-    { 0.0f,    +1.0f,     0.0f    },  /* MA_CHANNEL_TOP_CENTER */
-    {-0.5774f, +0.5774f, -0.5774f },  /* MA_CHANNEL_TOP_FRONT_LEFT */
-    { 0.0f,    +0.7071f, -0.7071f },  /* MA_CHANNEL_TOP_FRONT_CENTER */
-    {+0.5774f, +0.5774f, -0.5774f },  /* MA_CHANNEL_TOP_FRONT_RIGHT */
-    {-0.5774f, +0.5774f, +0.5774f },  /* MA_CHANNEL_TOP_BACK_LEFT */
-    { 0.0f,    +0.7071f, +0.7071f },  /* MA_CHANNEL_TOP_BACK_CENTER */
-    {+0.5774f, +0.5774f, +0.5774f },  /* MA_CHANNEL_TOP_BACK_RIGHT */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_0 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_1 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_2 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_3 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_4 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_5 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_6 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_7 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_8 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_9 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_10 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_11 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_12 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_13 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_14 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_15 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_16 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_17 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_18 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_19 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_20 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_21 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_22 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_23 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_24 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_25 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_26 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_27 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_28 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_29 */
-    { 0.0f,     0.0f,    -1.0f    },  /* MA_CHANNEL_AUX_30 */
-    { 0.0f,     0.0f,    -1.0f    }   /* MA_CHANNEL_AUX_31 */
-};
-
-static ma_vec3f ma_get_channel_direction(ma_channel channel)
-{
-    if (channel >= MA_CHANNEL_POSITION_COUNT) {
-        return ma_vec3f_init_3f(0, 0, -1);
-    } else {
-        return g_maChannelDirections[channel];
-    }
-}
-
-
-
-static float ma_attenuation_inverse(float distance, float minDistance, float maxDistance, float rolloff)
-{
-    if (minDistance >= maxDistance) {
-        return 1;   /* To avoid division by zero. Do not attenuate. */
-    }
-
-    return minDistance / (minDistance + rolloff * (ma_clamp(distance, minDistance, maxDistance) - minDistance));
-}
-
-static float ma_attenuation_linear(float distance, float minDistance, float maxDistance, float rolloff)
-{
-    if (minDistance >= maxDistance) {
-        return 1;   /* To avoid division by zero. Do not attenuate. */
-    }
-
-    return 1 - rolloff * (ma_clamp(distance, minDistance, maxDistance) - minDistance) / (maxDistance - minDistance);
-}
-
-static float ma_attenuation_exponential(float distance, float minDistance, float maxDistance, float rolloff)
-{
-    if (minDistance >= maxDistance) {
-        return 1;   /* To avoid division by zero. Do not attenuate. */
-    }
-
-    return (float)ma_powd(ma_clamp(distance, minDistance, maxDistance) / minDistance, -rolloff);
-}
-
-
-/*
-Dopper Effect calculation taken from the OpenAL spec, with two main differences:
-
-  1) The source to listener vector will have already been calcualted at an earlier step so we can
-     just use that directly. We need only the position of the source relative to the origin.
-
-  2) We don't scale by a frequency because we actually just want the ratio which we'll plug straight
-     into the resampler directly.
-*/
-static float ma_doppler_pitch(ma_vec3f relativePosition, ma_vec3f sourceVelocity, ma_vec3f listenVelocity, float speedOfSound, float dopplerFactor)
-{
-    float len;
-    float vls;
-    float vss;
-
-    len = ma_vec3f_len(relativePosition);
-
-    /*
-    There's a case where the position of the source will be right on top of the listener in which
-    case the length will be 0 and we'll end up with a division by zero. We can just return a ratio
-    of 1.0 in this case. This is not considered in the OpenAL spec, but is necessary.
-    */
-    if (len == 0) {
-        return 1.0;
-    }
-
-    vls = ma_vec3f_dot(relativePosition, listenVelocity) / len;
-    vss = ma_vec3f_dot(relativePosition, sourceVelocity) / len;
-
-    vls = ma_min(vls, speedOfSound / dopplerFactor);
-    vss = ma_min(vss, speedOfSound / dopplerFactor);
-
-    return (speedOfSound - dopplerFactor*vls) / (speedOfSound - dopplerFactor*vss);
-}
-
-
-static void ma_get_default_channel_map_for_spatializer(ma_channel* pChannelMap, size_t channelMapCap, ma_uint32 channelCount)
-{
-    /*
-    Special case for stereo. Want to default the left and right speakers to side left and side
-    right so that they're facing directly down the X axis rather than slightly forward. Not
-    doing this will result in sounds being quieter when behind the listener. This might
-    actually be good for some scenerios, but I don't think it's an appropriate default because
-    it can be a bit unexpected.
-    */
-    if (channelCount == 2) {
-        pChannelMap[0] = MA_CHANNEL_SIDE_LEFT;
-        pChannelMap[1] = MA_CHANNEL_SIDE_RIGHT;
-    } else {
-        ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, channelCount);
-    }
-}
-
-
-MA_API ma_spatializer_listener_config ma_spatializer_listener_config_init(ma_uint32 channelsOut)
-{
-    ma_spatializer_listener_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.channelsOut             = channelsOut;
-    config.pChannelMapOut          = NULL;
-    config.handedness              = ma_handedness_right;
-    config.worldUp                 = ma_vec3f_init_3f(0, 1,  0);
-    config.coneInnerAngleInRadians = 6.283185f; /* 360 degrees. */
-    config.coneOuterAngleInRadians = 6.283185f; /* 360 degrees. */
-    config.coneOuterGain           = 0;
-    config.speedOfSound            = 343.3f;    /* Same as OpenAL. Used for doppler effect. */
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t channelMapOutOffset;
-} ma_spatializer_listener_heap_layout;
-
-static ma_result ma_spatializer_listener_get_heap_layout(const ma_spatializer_listener_config* pConfig, ma_spatializer_listener_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channelsOut == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Channel map. We always need this, even for passthroughs. */
-    pHeapLayout->channelMapOutOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(sizeof(*pConfig->pChannelMapOut) * pConfig->channelsOut);
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_spatializer_listener_get_heap_size(const ma_spatializer_listener_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_spatializer_listener_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_spatializer_listener_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_spatializer_listener_init_preallocated(const ma_spatializer_listener_config* pConfig, void* pHeap, ma_spatializer_listener* pListener)
-{
-    ma_result result;
-    ma_spatializer_listener_heap_layout heapLayout;
-
-    if (pListener == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pListener);
-
-    result = ma_spatializer_listener_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pListener->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pListener->config    = *pConfig;
-    ma_atomic_vec3f_init(&pListener->position,  ma_vec3f_init_3f(0, 0, 0));
-    ma_atomic_vec3f_init(&pListener->direction, ma_vec3f_init_3f(0, 0, -1));
-    ma_atomic_vec3f_init(&pListener->velocity,  ma_vec3f_init_3f(0, 0,  0));
-    pListener->isEnabled = MA_TRUE;
-
-    /* Swap the forward direction if we're left handed (it was initialized based on right handed). */
-    if (pListener->config.handedness == ma_handedness_left) {
-        ma_vec3f negDir = ma_vec3f_neg(ma_spatializer_listener_get_direction(pListener));
-        ma_spatializer_listener_set_direction(pListener, negDir.x, negDir.y, negDir.z);
-    }
-
-
-    /* We must always have a valid channel map. */
-    pListener->config.pChannelMapOut = (ma_channel*)ma_offset_ptr(pHeap, heapLayout.channelMapOutOffset);
-
-    /* Use a slightly different default channel map for stereo. */
-    if (pConfig->pChannelMapOut == NULL) {
-        ma_get_default_channel_map_for_spatializer(pListener->config.pChannelMapOut, pConfig->channelsOut, pConfig->channelsOut);
-    } else {
-        ma_channel_map_copy_or_default(pListener->config.pChannelMapOut, pConfig->channelsOut, pConfig->pChannelMapOut, pConfig->channelsOut);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_spatializer_listener_init(const ma_spatializer_listener_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_spatializer_listener* pListener)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_spatializer_listener_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_spatializer_listener_init_preallocated(pConfig, pHeap, pListener);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pListener->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_spatializer_listener_uninit(ma_spatializer_listener* pListener, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    if (pListener->_ownsHeap) {
-        ma_free(pListener->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_channel* ma_spatializer_listener_get_channel_map(ma_spatializer_listener* pListener)
-{
-    if (pListener == NULL) {
-        return NULL;
-    }
-
-    return pListener->config.pChannelMapOut;
-}
-
-MA_API void ma_spatializer_listener_set_cone(ma_spatializer_listener* pListener, float innerAngleInRadians, float outerAngleInRadians, float outerGain)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    pListener->config.coneInnerAngleInRadians = innerAngleInRadians;
-    pListener->config.coneOuterAngleInRadians = outerAngleInRadians;
-    pListener->config.coneOuterGain           = outerGain;
-}
-
-MA_API void ma_spatializer_listener_get_cone(const ma_spatializer_listener* pListener, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    if (pInnerAngleInRadians != NULL) {
-        *pInnerAngleInRadians = pListener->config.coneInnerAngleInRadians;
-    }
-
-    if (pOuterAngleInRadians != NULL) {
-        *pOuterAngleInRadians = pListener->config.coneOuterAngleInRadians;
-    }
-
-    if (pOuterGain != NULL) {
-        *pOuterGain = pListener->config.coneOuterGain;
-    }
-}
-
-MA_API void ma_spatializer_listener_set_position(ma_spatializer_listener* pListener, float x, float y, float z)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    ma_atomic_vec3f_set(&pListener->position, ma_vec3f_init_3f(x, y, z));
-}
-
-MA_API ma_vec3f ma_spatializer_listener_get_position(const ma_spatializer_listener* pListener)
-{
-    if (pListener == NULL) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_atomic_vec3f_get((ma_atomic_vec3f*)&pListener->position); /* Naughty const-cast. It's just for atomically loading the vec3 which should be safe. */
-}
-
-MA_API void ma_spatializer_listener_set_direction(ma_spatializer_listener* pListener, float x, float y, float z)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    ma_atomic_vec3f_set(&pListener->direction, ma_vec3f_init_3f(x, y, z));
-}
-
-MA_API ma_vec3f ma_spatializer_listener_get_direction(const ma_spatializer_listener* pListener)
-{
-    if (pListener == NULL) {
-        return ma_vec3f_init_3f(0, 0, -1);
-    }
-
-    return ma_atomic_vec3f_get((ma_atomic_vec3f*)&pListener->direction);    /* Naughty const-cast. It's just for atomically loading the vec3 which should be safe. */
-}
-
-MA_API void ma_spatializer_listener_set_velocity(ma_spatializer_listener* pListener, float x, float y, float z)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    ma_atomic_vec3f_set(&pListener->velocity, ma_vec3f_init_3f(x, y, z));
-}
-
-MA_API ma_vec3f ma_spatializer_listener_get_velocity(const ma_spatializer_listener* pListener)
-{
-    if (pListener == NULL) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_atomic_vec3f_get((ma_atomic_vec3f*)&pListener->velocity); /* Naughty const-cast. It's just for atomically loading the vec3 which should be safe. */
-}
-
-MA_API void ma_spatializer_listener_set_speed_of_sound(ma_spatializer_listener* pListener, float speedOfSound)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    pListener->config.speedOfSound = speedOfSound;
-}
-
-MA_API float ma_spatializer_listener_get_speed_of_sound(const ma_spatializer_listener* pListener)
-{
-    if (pListener == NULL) {
-        return 0;
-    }
-
-    return pListener->config.speedOfSound;
-}
-
-MA_API void ma_spatializer_listener_set_world_up(ma_spatializer_listener* pListener, float x, float y, float z)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    pListener->config.worldUp = ma_vec3f_init_3f(x, y, z);
-}
-
-MA_API ma_vec3f ma_spatializer_listener_get_world_up(const ma_spatializer_listener* pListener)
-{
-    if (pListener == NULL) {
-        return ma_vec3f_init_3f(0, 1, 0);
-    }
-
-    return pListener->config.worldUp;
-}
-
-MA_API void ma_spatializer_listener_set_enabled(ma_spatializer_listener* pListener, ma_bool32 isEnabled)
-{
-    if (pListener == NULL) {
-        return;
-    }
-
-    pListener->isEnabled = isEnabled;
-}
-
-MA_API ma_bool32 ma_spatializer_listener_is_enabled(const ma_spatializer_listener* pListener)
-{
-    if (pListener == NULL) {
-        return MA_FALSE;
-    }
-
-    return pListener->isEnabled;
-}
-
-
-
-
-MA_API ma_spatializer_config ma_spatializer_config_init(ma_uint32 channelsIn, ma_uint32 channelsOut)
-{
-    ma_spatializer_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.channelsIn                   = channelsIn;
-    config.channelsOut                  = channelsOut;
-    config.pChannelMapIn                = NULL;
-    config.attenuationModel             = ma_attenuation_model_inverse;
-    config.positioning                  = ma_positioning_absolute;
-    config.handedness                   = ma_handedness_right;
-    config.minGain                      = 0;
-    config.maxGain                      = 1;
-    config.minDistance                  = 1;
-    config.maxDistance                  = MA_FLT_MAX;
-    config.rolloff                      = 1;
-    config.coneInnerAngleInRadians      = 6.283185f; /* 360 degrees. */
-    config.coneOuterAngleInRadians      = 6.283185f; /* 360 degress. */
-    config.coneOuterGain                = 0.0f;
-    config.dopplerFactor                = 1;
-    config.directionalAttenuationFactor = 1;
-    config.minSpatializationChannelGain = 0.2f;
-    config.gainSmoothTimeInFrames       = 360;       /* 7.5ms @ 48K. */
-
-    return config;
-}
-
-
-static ma_gainer_config ma_spatializer_gainer_config_init(const ma_spatializer_config* pConfig)
-{
-    MA_ASSERT(pConfig != NULL);
-    return ma_gainer_config_init(pConfig->channelsOut, pConfig->gainSmoothTimeInFrames);
-}
-
-static ma_result ma_spatializer_validate_config(const ma_spatializer_config* pConfig)
-{
-    MA_ASSERT(pConfig != NULL);
-
-    if (pConfig->channelsIn == 0 || pConfig->channelsOut == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    return MA_SUCCESS;
-}
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t channelMapInOffset;
-    size_t newChannelGainsOffset;
-    size_t gainerOffset;
-} ma_spatializer_heap_layout;
-
-static ma_result ma_spatializer_get_heap_layout(const ma_spatializer_config* pConfig, ma_spatializer_heap_layout* pHeapLayout)
-{
-    ma_result result;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_spatializer_validate_config(pConfig);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Channel map. */
-    pHeapLayout->channelMapInOffset = MA_SIZE_MAX;  /* <-- MA_SIZE_MAX indicates no allocation necessary. */
-    if (pConfig->pChannelMapIn != NULL) {
-        pHeapLayout->channelMapInOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += ma_align_64(sizeof(*pConfig->pChannelMapIn) * pConfig->channelsIn);
-    }
-
-    /* New channel gains for output. */
-    pHeapLayout->newChannelGainsOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(sizeof(float) * pConfig->channelsOut);
-
-    /* Gainer. */
-    {
-        size_t gainerHeapSizeInBytes;
-        ma_gainer_config gainerConfig;
-
-        gainerConfig = ma_spatializer_gainer_config_init(pConfig);
-
-        result = ma_gainer_get_heap_size(&gainerConfig, &gainerHeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->gainerOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += ma_align_64(gainerHeapSizeInBytes);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_spatializer_get_heap_size(const ma_spatializer_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_spatializer_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;  /* Safety. */
-
-    result = ma_spatializer_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_spatializer_init_preallocated(const ma_spatializer_config* pConfig, void* pHeap, ma_spatializer* pSpatializer)
-{
-    ma_result result;
-    ma_spatializer_heap_layout heapLayout;
-    ma_gainer_config gainerConfig;
-
-    if (pSpatializer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pSpatializer);
-
-    if (pConfig == NULL || pHeap == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_spatializer_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pSpatializer->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pSpatializer->channelsIn                   = pConfig->channelsIn;
-    pSpatializer->channelsOut                  = pConfig->channelsOut;
-    pSpatializer->attenuationModel             = pConfig->attenuationModel;
-    pSpatializer->positioning                  = pConfig->positioning;
-    pSpatializer->handedness                   = pConfig->handedness;
-    pSpatializer->minGain                      = pConfig->minGain;
-    pSpatializer->maxGain                      = pConfig->maxGain;
-    pSpatializer->minDistance                  = pConfig->minDistance;
-    pSpatializer->maxDistance                  = pConfig->maxDistance;
-    pSpatializer->rolloff                      = pConfig->rolloff;
-    pSpatializer->coneInnerAngleInRadians      = pConfig->coneInnerAngleInRadians;
-    pSpatializer->coneOuterAngleInRadians      = pConfig->coneOuterAngleInRadians;
-    pSpatializer->coneOuterGain                = pConfig->coneOuterGain;
-    pSpatializer->dopplerFactor                = pConfig->dopplerFactor;
-    pSpatializer->minSpatializationChannelGain = pConfig->minSpatializationChannelGain;
-    pSpatializer->directionalAttenuationFactor = pConfig->directionalAttenuationFactor;
-    pSpatializer->gainSmoothTimeInFrames       = pConfig->gainSmoothTimeInFrames;
-    ma_atomic_vec3f_init(&pSpatializer->position,  ma_vec3f_init_3f(0, 0,  0));
-    ma_atomic_vec3f_init(&pSpatializer->direction, ma_vec3f_init_3f(0, 0, -1));
-    ma_atomic_vec3f_init(&pSpatializer->velocity,  ma_vec3f_init_3f(0, 0,  0));
-    pSpatializer->dopplerPitch                 = 1;
-
-    /* Swap the forward direction if we're left handed (it was initialized based on right handed). */
-    if (pSpatializer->handedness == ma_handedness_left) {
-        ma_vec3f negDir = ma_vec3f_neg(ma_spatializer_get_direction(pSpatializer));
-        ma_spatializer_set_direction(pSpatializer, negDir.x, negDir.y, negDir.z);
-    }
-
-    /* Channel map. This will be on the heap. */
-    if (pConfig->pChannelMapIn != NULL) {
-        pSpatializer->pChannelMapIn = (ma_channel*)ma_offset_ptr(pHeap, heapLayout.channelMapInOffset);
-        ma_channel_map_copy_or_default(pSpatializer->pChannelMapIn, pSpatializer->channelsIn, pConfig->pChannelMapIn, pSpatializer->channelsIn);
-    }
-
-    /* New channel gains for output channels. */
-    pSpatializer->pNewChannelGainsOut = (float*)ma_offset_ptr(pHeap, heapLayout.newChannelGainsOffset);
-
-    /* Gainer. */
-    gainerConfig = ma_spatializer_gainer_config_init(pConfig);
-
-    result = ma_gainer_init_preallocated(&gainerConfig, ma_offset_ptr(pHeap, heapLayout.gainerOffset), &pSpatializer->gainer);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the gainer. */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_spatializer_init(const ma_spatializer_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_spatializer* pSpatializer)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    /* We'll need a heap allocation to retrieve the size. */
-    result = ma_spatializer_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_spatializer_init_preallocated(pConfig, pHeap, pSpatializer);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pSpatializer->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_spatializer_uninit(ma_spatializer* pSpatializer, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_gainer_uninit(&pSpatializer->gainer, pAllocationCallbacks);
-
-    if (pSpatializer->_ownsHeap) {
-        ma_free(pSpatializer->_pHeap, pAllocationCallbacks);
-    }
-}
-
-static float ma_calculate_angular_gain(ma_vec3f dirA, ma_vec3f dirB, float coneInnerAngleInRadians, float coneOuterAngleInRadians, float coneOuterGain)
-{
-    /*
-    Angular attenuation.
-
-    Unlike distance gain, the math for this is not specified by the OpenAL spec so we'll just go ahead and figure
-    this out for ourselves at the expense of possibly being inconsistent with other implementations.
-
-    To do cone attenuation, I'm just using the same math that we'd use to implement a basic spotlight in OpenGL. We
-    just need to get the direction from the source to the listener and then do a dot product against that and the
-    direction of the spotlight. Then we just compare that dot product against the cosine of the inner and outer
-    angles. If the dot product is greater than the the outer angle, we just use coneOuterGain. If it's less than
-    the inner angle, we just use a gain of 1. Otherwise we linearly interpolate between 1 and coneOuterGain.
-    */
-    if (coneInnerAngleInRadians < 6.283185f) {
-        float angularGain = 1;
-        float cutoffInner = (float)ma_cosd(coneInnerAngleInRadians*0.5f);
-        float cutoffOuter = (float)ma_cosd(coneOuterAngleInRadians*0.5f);
-        float d;
-
-        d = ma_vec3f_dot(dirA, dirB);
-
-        if (d > cutoffInner) {
-            /* It's inside the inner angle. */
-            angularGain = 1;
-        } else {
-            /* It's outside the inner angle. */
-            if (d > cutoffOuter) {
-                /* It's between the inner and outer angle. We need to linearly interpolate between 1 and coneOuterGain. */
-                angularGain = ma_mix_f32(coneOuterGain, 1, (d - cutoffOuter) / (cutoffInner - cutoffOuter));
-            } else {
-                /* It's outside the outer angle. */
-                angularGain = coneOuterGain;
-            }
-        }
-
-        /*printf("d = %f; cutoffInner = %f; cutoffOuter = %f; angularGain = %f\n", d, cutoffInner, cutoffOuter, angularGain);*/
-        return angularGain;
-    } else {
-        /* Inner angle is 360 degrees so no need to do any attenuation. */
-        return 1;
-    }
-}
-
-MA_API ma_result ma_spatializer_process_pcm_frames(ma_spatializer* pSpatializer, ma_spatializer_listener* pListener, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_channel* pChannelMapIn  = pSpatializer->pChannelMapIn;
-    ma_channel* pChannelMapOut = pListener->config.pChannelMapOut;
-
-    if (pSpatializer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If we're not spatializing we need to run an optimized path. */
-    if (ma_atomic_load_i32(&pSpatializer->attenuationModel) == ma_attenuation_model_none) {
-        if (ma_spatializer_listener_is_enabled(pListener)) {
-            /* No attenuation is required, but we'll need to do some channel conversion. */
-            if (pSpatializer->channelsIn == pSpatializer->channelsOut) {
-                ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, ma_format_f32, pSpatializer->channelsIn);
-            } else {
-                ma_channel_map_apply_f32((float*)pFramesOut, pChannelMapOut, pSpatializer->channelsOut, (const float*)pFramesIn, pChannelMapIn, pSpatializer->channelsIn, frameCount, ma_channel_mix_mode_rectangular, ma_mono_expansion_mode_default);   /* Safe casts to float* because f32 is the only supported format. */
-            }
-        } else {
-            /* The listener is disabled. Output silence. */
-            ma_silence_pcm_frames(pFramesOut, frameCount, ma_format_f32, pSpatializer->channelsOut);
-        }
-
-        /*
-        We're not doing attenuation so don't bother with doppler for now. I'm not sure if this is
-        the correct thinking so might need to review this later.
-        */
-        pSpatializer->dopplerPitch = 1;
-    } else {
-        /*
-        Let's first determine which listener the sound is closest to. Need to keep in mind that we
-        might not have a world or any listeners, in which case we just spatializer based on the
-        listener being positioned at the origin (0, 0, 0).
-        */
-        ma_vec3f relativePosNormalized;
-        ma_vec3f relativePos;   /* The position relative to the listener. */
-        ma_vec3f relativeDir;   /* The direction of the sound, relative to the listener. */
-        ma_vec3f listenerVel;   /* The volocity of the listener. For doppler pitch calculation. */
-        float speedOfSound;
-        float distance = 0;
-        float gain = 1;
-        ma_uint32 iChannel;
-        const ma_uint32 channelsOut = pSpatializer->channelsOut;
-        const ma_uint32 channelsIn  = pSpatializer->channelsIn;
-        float minDistance = ma_spatializer_get_min_distance(pSpatializer);
-        float maxDistance = ma_spatializer_get_max_distance(pSpatializer);
-        float rolloff = ma_spatializer_get_rolloff(pSpatializer);
-        float dopplerFactor = ma_spatializer_get_doppler_factor(pSpatializer);
-
-        /*
-        We'll need the listener velocity for doppler pitch calculations. The speed of sound is
-        defined by the listener, so we'll grab that here too.
-        */
-        if (pListener != NULL) {
-            listenerVel  = ma_spatializer_listener_get_velocity(pListener);
-            speedOfSound = pListener->config.speedOfSound;
-        } else {
-            listenerVel  = ma_vec3f_init_3f(0, 0, 0);
-            speedOfSound = MA_DEFAULT_SPEED_OF_SOUND;
-        }
-
-        if (pListener == NULL || ma_spatializer_get_positioning(pSpatializer) == ma_positioning_relative) {
-            /* There's no listener or we're using relative positioning. */
-            relativePos = ma_spatializer_get_position(pSpatializer);
-            relativeDir = ma_spatializer_get_direction(pSpatializer);
-        } else {
-            /*
-            We've found a listener and we're using absolute positioning. We need to transform the
-            sound's position and direction so that it's relative to listener. Later on we'll use
-            this for determining the factors to apply to each channel to apply the panning effect.
-            */
-            ma_spatializer_get_relative_position_and_direction(pSpatializer, pListener, &relativePos, &relativeDir);
-        }
-
-        distance = ma_vec3f_len(relativePos);
-
-        /* We've gathered the data, so now we can apply some spatialization. */
-        switch (ma_spatializer_get_attenuation_model(pSpatializer)) {
-            case ma_attenuation_model_inverse:
-            {
-                gain = ma_attenuation_inverse(distance, minDistance, maxDistance, rolloff);
-            } break;
-            case ma_attenuation_model_linear:
-            {
-                gain = ma_attenuation_linear(distance, minDistance, maxDistance, rolloff);
-            } break;
-            case ma_attenuation_model_exponential:
-            {
-                gain = ma_attenuation_exponential(distance, minDistance, maxDistance, rolloff);
-            } break;
-            case ma_attenuation_model_none:
-            default:
-            {
-                gain = 1;
-            } break;
-        }
-
-        /* Normalize the position. */
-        if (distance > 0.001f) {
-            float distanceInv = 1/distance;
-            relativePosNormalized    = relativePos;
-            relativePosNormalized.x *= distanceInv;
-            relativePosNormalized.y *= distanceInv;
-            relativePosNormalized.z *= distanceInv;
-        } else {
-            distance = 0;
-            relativePosNormalized = ma_vec3f_init_3f(0, 0, 0);
-        }
-
-        /*
-        Angular attenuation.
-
-        Unlike distance gain, the math for this is not specified by the OpenAL spec so we'll just go ahead and figure
-        this out for ourselves at the expense of possibly being inconsistent with other implementations.
-
-        To do cone attenuation, I'm just using the same math that we'd use to implement a basic spotlight in OpenGL. We
-        just need to get the direction from the source to the listener and then do a dot product against that and the
-        direction of the spotlight. Then we just compare that dot product against the cosine of the inner and outer
-        angles. If the dot product is greater than the the outer angle, we just use coneOuterGain. If it's less than
-        the inner angle, we just use a gain of 1. Otherwise we linearly interpolate between 1 and coneOuterGain.
-        */
-        if (distance > 0) {
-            /* Source anglular gain. */
-            float spatializerConeInnerAngle;
-            float spatializerConeOuterAngle;
-            float spatializerConeOuterGain;
-            ma_spatializer_get_cone(pSpatializer, &spatializerConeInnerAngle, &spatializerConeOuterAngle, &spatializerConeOuterGain);
-
-            gain *= ma_calculate_angular_gain(relativeDir, ma_vec3f_neg(relativePosNormalized), spatializerConeInnerAngle, spatializerConeOuterAngle, spatializerConeOuterGain);
-
-            /*
-            We're supporting angular gain on the listener as well for those who want to reduce the volume of sounds that
-            are positioned behind the listener. On default settings, this will have no effect.
-            */
-            if (pListener != NULL && pListener->config.coneInnerAngleInRadians < 6.283185f) {
-                ma_vec3f listenerDirection;
-                float listenerInnerAngle;
-                float listenerOuterAngle;
-                float listenerOuterGain;
-
-                if (pListener->config.handedness == ma_handedness_right) {
-                    listenerDirection = ma_vec3f_init_3f(0, 0, -1);
-                } else {
-                    listenerDirection = ma_vec3f_init_3f(0, 0, +1);
-                }
-
-                listenerInnerAngle = pListener->config.coneInnerAngleInRadians;
-                listenerOuterAngle = pListener->config.coneOuterAngleInRadians;
-                listenerOuterGain  = pListener->config.coneOuterGain;
-
-                gain *= ma_calculate_angular_gain(listenerDirection, relativePosNormalized, listenerInnerAngle, listenerOuterAngle, listenerOuterGain);
-            }
-        } else {
-            /* The sound is right on top of the listener. Don't do any angular attenuation. */
-        }
-
-
-        /* Clamp the gain. */
-        gain = ma_clamp(gain, ma_spatializer_get_min_gain(pSpatializer), ma_spatializer_get_max_gain(pSpatializer));
-
-        /*
-        The gain needs to be applied per-channel here. The spatialization code below will be changing the per-channel
-        gains which will then eventually be passed into the gainer which will deal with smoothing the gain transitions
-        to avoid harsh changes in gain.
-        */
-        for (iChannel = 0; iChannel < channelsOut; iChannel += 1) {
-            pSpatializer->pNewChannelGainsOut[iChannel] = gain;
-        }
-
-        /*
-        Convert to our output channel count. If the listener is disabled we just output silence here. We cannot ignore
-        the whole section of code here because we need to update some internal spatialization state.
-        */
-        if (ma_spatializer_listener_is_enabled(pListener)) {
-            ma_channel_map_apply_f32((float*)pFramesOut, pChannelMapOut, channelsOut, (const float*)pFramesIn, pChannelMapIn, channelsIn, frameCount, ma_channel_mix_mode_rectangular, ma_mono_expansion_mode_default);
-        } else {
-            ma_silence_pcm_frames(pFramesOut, frameCount, ma_format_f32, pSpatializer->channelsOut);
-        }
-
-
-        /*
-        Panning. This is where we'll apply the gain and convert to the output channel count. We have an optimized path for
-        when we're converting to a mono stream. In that case we don't really need to do any panning - we just apply the
-        gain to the final output.
-        */
-        /*printf("distance=%f; gain=%f\n", distance, gain);*/
-
-        /* We must have a valid channel map here to ensure we spatialize properly. */
-        MA_ASSERT(pChannelMapOut != NULL);
-
-        /*
-        We're not converting to mono so we'll want to apply some panning. This is where the feeling of something being
-        to the left, right, infront or behind the listener is calculated. I'm just using a basic model here. Note that
-        the code below is not based on any specific algorithm. I'm just implementing this off the top of my head and
-        seeing how it goes. There might be better ways to do this.
-
-        To determine the direction of the sound relative to a speaker I'm using dot products. Each speaker is given a
-        direction. For example, the left channel in a stereo system will be -1 on the X axis and the right channel will
-        be +1 on the X axis. A dot product is performed against the direction vector of the channel and the normalized
-        position of the sound.
-        */
-
-        /*
-        Calculate our per-channel gains. We do this based on the normalized relative position of the sound and it's
-        relation to the direction of the channel.
-        */
-        if (distance > 0) {
-            ma_vec3f unitPos = relativePos;
-            float distanceInv = 1/distance;
-            unitPos.x *= distanceInv;
-            unitPos.y *= distanceInv;
-            unitPos.z *= distanceInv;
-
-            for (iChannel = 0; iChannel < channelsOut; iChannel += 1) {
-                ma_channel channelOut;
-                float d;
-                float dMin;
-
-                channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannel);
-                if (ma_is_spatial_channel_position(channelOut)) {
-                    d = ma_mix_f32_fast(1, ma_vec3f_dot(unitPos, ma_get_channel_direction(channelOut)), ma_spatializer_get_directional_attenuation_factor(pSpatializer));
-                } else {
-                    d = 1;  /* It's not a spatial channel so there's no real notion of direction. */
-                }
-
-                /*
-                In my testing, if the panning effect is too aggressive it makes spatialization feel uncomfortable.
-                The "dMin" variable below is used to control the aggressiveness of the panning effect. When set to
-                0, panning will be most extreme and any sounds that are positioned on the opposite side of the
-                speaker will be completely silent from that speaker. Not only does this feel uncomfortable, it
-                doesn't even remotely represent the real world at all because sounds that come from your right side
-                are still clearly audible from your left side. Setting "dMin" to 1 will result in no panning at
-                all, which is also not ideal. By setting it to something greater than 0, the spatialization effect
-                becomes much less dramatic and a lot more bearable.
-
-                Summary: 0 = more extreme panning; 1 = no panning.
-                */
-                dMin = pSpatializer->minSpatializationChannelGain;
-
-                /*
-                At this point, "d" will be positive if the sound is on the same side as the channel and negative if
-                it's on the opposite side. It will be in the range of -1..1. There's two ways I can think of to
-                calculate a panning value. The first is to simply convert it to 0..1, however this has a problem
-                which I'm not entirely happy with. Considering a stereo system, when a sound is positioned right
-                in front of the listener it'll result in each speaker getting a gain of 0.5. I don't know if I like
-                the idea of having a scaling factor of 0.5 being applied to a sound when it's sitting right in front
-                of the listener. I would intuitively expect that to be played at full volume, or close to it.
-
-                The second idea I think of is to only apply a reduction in gain when the sound is on the opposite
-                side of the speaker. That is, reduce the gain only when the dot product is negative. The problem
-                with this is that there will not be any attenuation as the sound sweeps around the 180 degrees
-                where the dot product is positive. The idea with this option is that you leave the gain at 1 when
-                the sound is being played on the same side as the speaker and then you just reduce the volume when
-                the sound is on the other side.
-
-                The summarize, I think the first option should give a better sense of spatialization, but the second
-                option is better for preserving the sound's power.
-
-                UPDATE: In my testing, I find the first option to sound better. You can feel the sense of space a
-                bit better, but you can also hear the reduction in volume when it's right in front.
-                */
-                #if 1
-                {
-                    /*
-                    Scale the dot product from -1..1 to 0..1. Will result in a sound directly in front losing power
-                    by being played at 0.5 gain.
-                    */
-                    d = (d + 1) * 0.5f;  /* -1..1 to 0..1 */
-                    d = ma_max(d, dMin);
-                    pSpatializer->pNewChannelGainsOut[iChannel] *= d;
-                }
-                #else
-                {
-                    /*
-                    Only reduce the volume of the sound if it's on the opposite side. This path keeps the volume more
-                    consistent, but comes at the expense of a worse sense of space and positioning.
-                    */
-                    if (d < 0) {
-                        d += 1; /* Move into the positive range. */
-                        d = ma_max(d, dMin);
-                        channelGainsOut[iChannel] *= d;
-                    }
-                }
-                #endif
-            }
-        } else {
-            /* Assume the sound is right on top of us. Don't do any panning. */
-        }
-
-        /* Now we need to apply the volume to each channel. This needs to run through the gainer to ensure we get a smooth volume transition. */
-        ma_gainer_set_gains(&pSpatializer->gainer, pSpatializer->pNewChannelGainsOut);
-        ma_gainer_process_pcm_frames(&pSpatializer->gainer, pFramesOut, pFramesOut, frameCount);
-
-        /*
-        Before leaving we'll want to update our doppler pitch so that the caller can apply some
-        pitch shifting if they desire. Note that we need to negate the relative position here
-        because the doppler calculation needs to be source-to-listener, but ours is listener-to-
-        source.
-        */
-        if (dopplerFactor > 0) {
-            pSpatializer->dopplerPitch = ma_doppler_pitch(ma_vec3f_sub(ma_spatializer_listener_get_position(pListener), ma_spatializer_get_position(pSpatializer)), ma_spatializer_get_velocity(pSpatializer), listenerVel, speedOfSound, dopplerFactor);
-        } else {
-            pSpatializer->dopplerPitch = 1;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_spatializer_set_master_volume(ma_spatializer* pSpatializer, float volume)
-{
-    if (pSpatializer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_gainer_set_master_volume(&pSpatializer->gainer, volume);
-}
-
-MA_API ma_result ma_spatializer_get_master_volume(const ma_spatializer* pSpatializer, float* pVolume)
-{
-    if (pSpatializer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_gainer_get_master_volume(&pSpatializer->gainer, pVolume);
-}
-
-MA_API ma_uint32 ma_spatializer_get_input_channels(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 0;
-    }
-
-    return pSpatializer->channelsIn;
-}
-
-MA_API ma_uint32 ma_spatializer_get_output_channels(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 0;
-    }
-
-    return pSpatializer->channelsOut;
-}
-
-MA_API void ma_spatializer_set_attenuation_model(ma_spatializer* pSpatializer, ma_attenuation_model attenuationModel)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_i32(&pSpatializer->attenuationModel, attenuationModel);
-}
-
-MA_API ma_attenuation_model ma_spatializer_get_attenuation_model(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return ma_attenuation_model_none;
-    }
-
-    return (ma_attenuation_model)ma_atomic_load_i32(&pSpatializer->attenuationModel);
-}
-
-MA_API void ma_spatializer_set_positioning(ma_spatializer* pSpatializer, ma_positioning positioning)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_i32(&pSpatializer->positioning, positioning);
-}
-
-MA_API ma_positioning ma_spatializer_get_positioning(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return ma_positioning_absolute;
-    }
-
-    return (ma_positioning)ma_atomic_load_i32(&pSpatializer->positioning);
-}
-
-MA_API void ma_spatializer_set_rolloff(ma_spatializer* pSpatializer, float rolloff)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->rolloff, rolloff);
-}
-
-MA_API float ma_spatializer_get_rolloff(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 0;
-    }
-
-    return ma_atomic_load_f32(&pSpatializer->rolloff);
-}
-
-MA_API void ma_spatializer_set_min_gain(ma_spatializer* pSpatializer, float minGain)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->minGain, minGain);
-}
-
-MA_API float ma_spatializer_get_min_gain(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 0;
-    }
-
-    return ma_atomic_load_f32(&pSpatializer->minGain);
-}
-
-MA_API void ma_spatializer_set_max_gain(ma_spatializer* pSpatializer, float maxGain)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->maxGain, maxGain);
-}
-
-MA_API float ma_spatializer_get_max_gain(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 0;
-    }
-
-    return ma_atomic_load_f32(&pSpatializer->maxGain);
-}
-
-MA_API void ma_spatializer_set_min_distance(ma_spatializer* pSpatializer, float minDistance)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->minDistance, minDistance);
-}
-
-MA_API float ma_spatializer_get_min_distance(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 0;
-    }
-
-    return ma_atomic_load_f32(&pSpatializer->minDistance);
-}
-
-MA_API void ma_spatializer_set_max_distance(ma_spatializer* pSpatializer, float maxDistance)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->maxDistance, maxDistance);
-}
-
-MA_API float ma_spatializer_get_max_distance(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 0;
-    }
-
-    return ma_atomic_load_f32(&pSpatializer->maxDistance);
-}
-
-MA_API void ma_spatializer_set_cone(ma_spatializer* pSpatializer, float innerAngleInRadians, float outerAngleInRadians, float outerGain)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->coneInnerAngleInRadians, innerAngleInRadians);
-    ma_atomic_exchange_f32(&pSpatializer->coneOuterAngleInRadians, outerAngleInRadians);
-    ma_atomic_exchange_f32(&pSpatializer->coneOuterGain,           outerGain);
-}
-
-MA_API void ma_spatializer_get_cone(const ma_spatializer* pSpatializer, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    if (pInnerAngleInRadians != NULL) {
-        *pInnerAngleInRadians = ma_atomic_load_f32(&pSpatializer->coneInnerAngleInRadians);
-    }
-
-    if (pOuterAngleInRadians != NULL) {
-        *pOuterAngleInRadians = ma_atomic_load_f32(&pSpatializer->coneOuterAngleInRadians);
-    }
-
-    if (pOuterGain != NULL) {
-        *pOuterGain = ma_atomic_load_f32(&pSpatializer->coneOuterGain);
-    }
-}
-
-MA_API void ma_spatializer_set_doppler_factor(ma_spatializer* pSpatializer, float dopplerFactor)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->dopplerFactor, dopplerFactor);
-}
-
-MA_API float ma_spatializer_get_doppler_factor(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 1;
-    }
-
-    return ma_atomic_load_f32(&pSpatializer->dopplerFactor);
-}
-
-MA_API void ma_spatializer_set_directional_attenuation_factor(ma_spatializer* pSpatializer, float directionalAttenuationFactor)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_f32(&pSpatializer->directionalAttenuationFactor, directionalAttenuationFactor);
-}
-
-MA_API float ma_spatializer_get_directional_attenuation_factor(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return 1;
-    }
-
-    return ma_atomic_load_f32(&pSpatializer->directionalAttenuationFactor);
-}
-
-MA_API void ma_spatializer_set_position(ma_spatializer* pSpatializer, float x, float y, float z)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_vec3f_set(&pSpatializer->position, ma_vec3f_init_3f(x, y, z));
-}
-
-MA_API ma_vec3f ma_spatializer_get_position(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_atomic_vec3f_get((ma_atomic_vec3f*)&pSpatializer->position);  /* Naughty const-cast. It's just for atomically loading the vec3 which should be safe. */
-}
-
-MA_API void ma_spatializer_set_direction(ma_spatializer* pSpatializer, float x, float y, float z)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_vec3f_set(&pSpatializer->direction, ma_vec3f_init_3f(x, y, z));
-}
-
-MA_API ma_vec3f ma_spatializer_get_direction(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return ma_vec3f_init_3f(0, 0, -1);
-    }
-
-    return ma_atomic_vec3f_get((ma_atomic_vec3f*)&pSpatializer->direction); /* Naughty const-cast. It's just for atomically loading the vec3 which should be safe. */
-}
-
-MA_API void ma_spatializer_set_velocity(ma_spatializer* pSpatializer, float x, float y, float z)
-{
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    ma_atomic_vec3f_set(&pSpatializer->velocity, ma_vec3f_init_3f(x, y, z));
-}
-
-MA_API ma_vec3f ma_spatializer_get_velocity(const ma_spatializer* pSpatializer)
-{
-    if (pSpatializer == NULL) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_atomic_vec3f_get((ma_atomic_vec3f*)&pSpatializer->velocity);  /* Naughty const-cast. It's just for atomically loading the vec3 which should be safe. */
-}
-
-MA_API void ma_spatializer_get_relative_position_and_direction(const ma_spatializer* pSpatializer, const ma_spatializer_listener* pListener, ma_vec3f* pRelativePos, ma_vec3f* pRelativeDir)
-{
-    if (pRelativePos != NULL) {
-        pRelativePos->x = 0;
-        pRelativePos->y = 0;
-        pRelativePos->z = 0;
-    }
-
-    if (pRelativeDir != NULL) {
-        pRelativeDir->x = 0;
-        pRelativeDir->y = 0;
-        pRelativeDir->z = -1;
-    }
-
-    if (pSpatializer == NULL) {
-        return;
-    }
-
-    if (pListener == NULL || ma_spatializer_get_positioning(pSpatializer) == ma_positioning_relative) {
-        /* There's no listener or we're using relative positioning. */
-        if (pRelativePos != NULL) {
-            *pRelativePos = ma_spatializer_get_position(pSpatializer);
-        }
-        if (pRelativeDir != NULL) {
-            *pRelativeDir = ma_spatializer_get_direction(pSpatializer);
-        }
-    } else {
-        ma_vec3f spatializerPosition;
-        ma_vec3f spatializerDirection;
-        ma_vec3f listenerPosition;
-        ma_vec3f listenerDirection;
-        ma_vec3f v;
-        ma_vec3f axisX;
-        ma_vec3f axisY;
-        ma_vec3f axisZ;
-        float m[4][4];
-
-        spatializerPosition  = ma_spatializer_get_position(pSpatializer);
-        spatializerDirection = ma_spatializer_get_direction(pSpatializer);
-        listenerPosition     = ma_spatializer_listener_get_position(pListener);
-        listenerDirection    = ma_spatializer_listener_get_direction(pListener);
-
-        /*
-        We need to calcualte the right vector from our forward and up vectors. This is done with
-        a cross product.
-        */
-        axisZ = ma_vec3f_normalize(listenerDirection);                                  /* Normalization required here because we can't trust the caller. */
-        axisX = ma_vec3f_normalize(ma_vec3f_cross(axisZ, pListener->config.worldUp));   /* Normalization required here because the world up vector may not be perpendicular with the forward vector. */
-
-        /*
-        The calculation of axisX above can result in a zero-length vector if the listener is
-        looking straight up on the Y axis. We'll need to fall back to a +X in this case so that
-        the calculations below don't fall apart. This is where a quaternion based listener and
-        sound orientation would come in handy.
-        */
-        if (ma_vec3f_len2(axisX) == 0) {
-            axisX = ma_vec3f_init_3f(1, 0, 0);
-        }
-
-        axisY = ma_vec3f_cross(axisX, axisZ);                                           /* No normalization is required here because axisX and axisZ are unit length and perpendicular. */
-
-        /*
-        We need to swap the X axis if we're left handed because otherwise the cross product above
-        will have resulted in it pointing in the wrong direction (right handed was assumed in the
-        cross products above).
-        */
-        if (pListener->config.handedness == ma_handedness_left) {
-            axisX = ma_vec3f_neg(axisX);
-        }
-
-        /* Lookat. */
-        m[0][0] =  axisX.x; m[1][0] =  axisX.y; m[2][0] =  axisX.z; m[3][0] = -ma_vec3f_dot(axisX,               listenerPosition);
-        m[0][1] =  axisY.x; m[1][1] =  axisY.y; m[2][1] =  axisY.z; m[3][1] = -ma_vec3f_dot(axisY,               listenerPosition);
-        m[0][2] = -axisZ.x; m[1][2] = -axisZ.y; m[2][2] = -axisZ.z; m[3][2] = -ma_vec3f_dot(ma_vec3f_neg(axisZ), listenerPosition);
-        m[0][3] = 0;        m[1][3] = 0;        m[2][3] = 0;        m[3][3] = 1;
-
-        /*
-        Multiply the lookat matrix by the spatializer position to transform it to listener
-        space. This allows calculations to work based on the sound being relative to the
-        origin which makes things simpler.
-        */
-        if (pRelativePos != NULL) {
-            v = spatializerPosition;
-            pRelativePos->x = m[0][0] * v.x + m[1][0] * v.y + m[2][0] * v.z + m[3][0] * 1;
-            pRelativePos->y = m[0][1] * v.x + m[1][1] * v.y + m[2][1] * v.z + m[3][1] * 1;
-            pRelativePos->z = m[0][2] * v.x + m[1][2] * v.y + m[2][2] * v.z + m[3][2] * 1;
-        }
-
-        /*
-        The direction of the sound needs to also be transformed so that it's relative to the
-        rotation of the listener.
-        */
-        if (pRelativeDir != NULL) {
-            v = spatializerDirection;
-            pRelativeDir->x = m[0][0] * v.x + m[1][0] * v.y + m[2][0] * v.z;
-            pRelativeDir->y = m[0][1] * v.x + m[1][1] * v.y + m[2][1] * v.z;
-            pRelativeDir->z = m[0][2] * v.x + m[1][2] * v.y + m[2][2] * v.z;
-        }
-    }
-}
-
-
-
-
-/**************************************************************************************************************************************************************
-
-Resampling
-
-**************************************************************************************************************************************************************/
-MA_API ma_linear_resampler_config ma_linear_resampler_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut)
-{
-    ma_linear_resampler_config config;
-    MA_ZERO_OBJECT(&config);
-    config.format           = format;
-    config.channels         = channels;
-    config.sampleRateIn     = sampleRateIn;
-    config.sampleRateOut    = sampleRateOut;
-    config.lpfOrder         = ma_min(MA_DEFAULT_RESAMPLER_LPF_ORDER, MA_MAX_FILTER_ORDER);
-    config.lpfNyquistFactor = 1;
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t x0Offset;
-    size_t x1Offset;
-    size_t lpfOffset;
-} ma_linear_resampler_heap_layout;
-
-
-static void ma_linear_resampler_adjust_timer_for_new_rate(ma_linear_resampler* pResampler, ma_uint32 oldSampleRateOut, ma_uint32 newSampleRateOut)
-{
-    /*
-    So what's happening here? Basically we need to adjust the fractional component of the time advance based on the new rate. The old time advance will
-    be based on the old sample rate, but we are needing to adjust it to that it's based on the new sample rate.
-    */
-    ma_uint32 oldRateTimeWhole = pResampler->inTimeFrac / oldSampleRateOut;  /* <-- This should almost never be anything other than 0, but leaving it here to make this more general and robust just in case. */
-    ma_uint32 oldRateTimeFract = pResampler->inTimeFrac % oldSampleRateOut;
-
-    pResampler->inTimeFrac =
-         (oldRateTimeWhole * newSampleRateOut) +
-        ((oldRateTimeFract * newSampleRateOut) / oldSampleRateOut);
-
-    /* Make sure the fractional part is less than the output sample rate. */
-    pResampler->inTimeInt += pResampler->inTimeFrac / pResampler->config.sampleRateOut;
-    pResampler->inTimeFrac = pResampler->inTimeFrac % pResampler->config.sampleRateOut;
-}
-
-static ma_result ma_linear_resampler_set_rate_internal(ma_linear_resampler* pResampler, void* pHeap, ma_linear_resampler_heap_layout* pHeapLayout, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut, ma_bool32 isResamplerAlreadyInitialized)
-{
-    ma_result result;
-    ma_uint32 gcf;
-    ma_uint32 lpfSampleRate;
-    double lpfCutoffFrequency;
-    ma_lpf_config lpfConfig;
-    ma_uint32 oldSampleRateOut; /* Required for adjusting time advance down the bottom. */
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (sampleRateIn == 0 || sampleRateOut == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    oldSampleRateOut = pResampler->config.sampleRateOut;
-
-    pResampler->config.sampleRateIn  = sampleRateIn;
-    pResampler->config.sampleRateOut = sampleRateOut;
-
-    /* Simplify the sample rate. */
-    gcf = ma_gcf_u32(pResampler->config.sampleRateIn, pResampler->config.sampleRateOut);
-    pResampler->config.sampleRateIn  /= gcf;
-    pResampler->config.sampleRateOut /= gcf;
-
-    /* Always initialize the low-pass filter, even when the order is 0. */
-    if (pResampler->config.lpfOrder > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
-    }
-
-    lpfSampleRate      = (ma_uint32)(ma_max(pResampler->config.sampleRateIn, pResampler->config.sampleRateOut));
-    lpfCutoffFrequency = (   double)(ma_min(pResampler->config.sampleRateIn, pResampler->config.sampleRateOut) * 0.5 * pResampler->config.lpfNyquistFactor);
-
-    lpfConfig = ma_lpf_config_init(pResampler->config.format, pResampler->config.channels, lpfSampleRate, lpfCutoffFrequency, pResampler->config.lpfOrder);
-
-    /*
-    If the resampler is alreay initialized we don't want to do a fresh initialization of the low-pass filter because it will result in the cached frames
-    getting cleared. Instead we re-initialize the filter which will maintain any cached frames.
-    */
-    if (isResamplerAlreadyInitialized) {
-        result = ma_lpf_reinit(&lpfConfig, &pResampler->lpf);
-    } else {
-        result = ma_lpf_init_preallocated(&lpfConfig, ma_offset_ptr(pHeap, pHeapLayout->lpfOffset), &pResampler->lpf);
-    }
-
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-
-    pResampler->inAdvanceInt  = pResampler->config.sampleRateIn / pResampler->config.sampleRateOut;
-    pResampler->inAdvanceFrac = pResampler->config.sampleRateIn % pResampler->config.sampleRateOut;
-
-    /* Our timer was based on the old rate. We need to adjust it so that it's based on the new rate. */
-    ma_linear_resampler_adjust_timer_for_new_rate(pResampler, oldSampleRateOut, pResampler->config.sampleRateOut);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_linear_resampler_get_heap_layout(const ma_linear_resampler_config* pConfig, ma_linear_resampler_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->format != ma_format_f32 && pConfig->format != ma_format_s16) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* x0 */
-    pHeapLayout->x0Offset = pHeapLayout->sizeInBytes;
-    if (pConfig->format == ma_format_f32) {
-        pHeapLayout->sizeInBytes += sizeof(float) * pConfig->channels;
-    } else {
-        pHeapLayout->sizeInBytes += sizeof(ma_int16) * pConfig->channels;
-    }
-
-    /* x1 */
-    pHeapLayout->x1Offset = pHeapLayout->sizeInBytes;
-    if (pConfig->format == ma_format_f32) {
-        pHeapLayout->sizeInBytes += sizeof(float) * pConfig->channels;
-    } else {
-        pHeapLayout->sizeInBytes += sizeof(ma_int16) * pConfig->channels;
-    }
-
-    /* LPF */
-    pHeapLayout->lpfOffset = ma_align_64(pHeapLayout->sizeInBytes);
-    {
-        ma_result result;
-        size_t lpfHeapSizeInBytes;
-        ma_lpf_config lpfConfig = ma_lpf_config_init(pConfig->format, pConfig->channels, 1, 1, pConfig->lpfOrder);  /* Sample rate and cutoff frequency do not matter. */
-
-        result = ma_lpf_get_heap_size(&lpfConfig, &lpfHeapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += lpfHeapSizeInBytes;
-    }
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_linear_resampler_get_heap_size(const ma_linear_resampler_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_linear_resampler_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_linear_resampler_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_linear_resampler_init_preallocated(const ma_linear_resampler_config* pConfig, void* pHeap, ma_linear_resampler* pResampler)
-{
-    ma_result result;
-    ma_linear_resampler_heap_layout heapLayout;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pResampler);
-
-    result = ma_linear_resampler_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pResampler->config = *pConfig;
-
-    pResampler->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    if (pConfig->format == ma_format_f32) {
-        pResampler->x0.f32 = (float*)ma_offset_ptr(pHeap, heapLayout.x0Offset);
-        pResampler->x1.f32 = (float*)ma_offset_ptr(pHeap, heapLayout.x1Offset);
-    } else {
-        pResampler->x0.s16 = (ma_int16*)ma_offset_ptr(pHeap, heapLayout.x0Offset);
-        pResampler->x1.s16 = (ma_int16*)ma_offset_ptr(pHeap, heapLayout.x1Offset);
-    }
-
-    /* Setting the rate will set up the filter and time advances for us. */
-    result = ma_linear_resampler_set_rate_internal(pResampler, pHeap, &heapLayout, pConfig->sampleRateIn, pConfig->sampleRateOut, /* isResamplerAlreadyInitialized = */ MA_FALSE);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pResampler->inTimeInt  = 1;  /* Set this to one to force an input sample to always be loaded for the first output frame. */
-    pResampler->inTimeFrac = 0;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_linear_resampler_init(const ma_linear_resampler_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_linear_resampler* pResampler)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_linear_resampler_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_linear_resampler_init_preallocated(pConfig, pHeap, pResampler);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pResampler->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_linear_resampler_uninit(ma_linear_resampler* pResampler, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pResampler == NULL) {
-        return;
-    }
-
-    ma_lpf_uninit(&pResampler->lpf, pAllocationCallbacks);
-
-    if (pResampler->_ownsHeap) {
-        ma_free(pResampler->_pHeap, pAllocationCallbacks);
-    }
-}
-
-static MA_INLINE ma_int16 ma_linear_resampler_mix_s16(ma_int16 x, ma_int16 y, ma_int32 a, const ma_int32 shift)
-{
-    ma_int32 b;
-    ma_int32 c;
-    ma_int32 r;
-
-    MA_ASSERT(a <= (1<<shift));
-
-    b = x * ((1<<shift) - a);
-    c = y * a;
-    r = b + c;
-
-    return (ma_int16)(r >> shift);
-}
-
-static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResampler, ma_int16* MA_RESTRICT pFrameOut)
-{
-    ma_uint32 c;
-    ma_uint32 a;
-    const ma_uint32 channels = pResampler->config.channels;
-    const ma_uint32 shift = 12;
-
-    MA_ASSERT(pResampler != NULL);
-    MA_ASSERT(pFrameOut  != NULL);
-
-    a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
-        pFrameOut[c] = s;
-    }
-}
-
-
-static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResampler, float* MA_RESTRICT pFrameOut)
-{
-    ma_uint32 c;
-    float a;
-    const ma_uint32 channels = pResampler->config.channels;
-
-    MA_ASSERT(pResampler != NULL);
-    MA_ASSERT(pFrameOut  != NULL);
-
-    a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
-
-    MA_ASSUME(channels > 0);
-    for (c = 0; c < channels; c += 1) {
-        float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
-        pFrameOut[c] = s;
-    }
-}
-
-static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    const ma_int16* pFramesInS16;
-    /* */ ma_int16* pFramesOutS16;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 framesProcessedIn;
-    ma_uint64 framesProcessedOut;
-
-    MA_ASSERT(pResampler     != NULL);
-    MA_ASSERT(pFrameCountIn  != NULL);
-    MA_ASSERT(pFrameCountOut != NULL);
-
-    pFramesInS16       = (const ma_int16*)pFramesIn;
-    pFramesOutS16      = (      ma_int16*)pFramesOut;
-    frameCountIn       = *pFrameCountIn;
-    frameCountOut      = *pFrameCountOut;
-    framesProcessedIn  = 0;
-    framesProcessedOut = 0;
-
-    while (framesProcessedOut < frameCountOut) {
-        /* Before interpolating we need to load the buffers. When doing this we need to ensure we run every input sample through the filter. */
-        while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
-            ma_uint32 iChannel;
-
-            if (pFramesInS16 != NULL) {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                    pResampler->x1.s16[iChannel] = pFramesInS16[iChannel];
-                }
-                pFramesInS16 += pResampler->config.channels;
-            } else {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                    pResampler->x1.s16[iChannel] = 0;
-                }
-            }
-
-            /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-            if (pResampler->config.sampleRateIn != pResampler->config.sampleRateOut) {
-                ma_lpf_process_pcm_frame_s16(&pResampler->lpf, pResampler->x1.s16, pResampler->x1.s16);
-            }
-
-            framesProcessedIn     += 1;
-            pResampler->inTimeInt -= 1;
-        }
-
-        if (pResampler->inTimeInt > 0) {
-            break;  /* Ran out of input data. */
-        }
-
-        /* Getting here means the frames have been loaded and filtered and we can generate the next output frame. */
-        if (pFramesOutS16 != NULL) {
-            MA_ASSERT(pResampler->inTimeInt == 0);
-            ma_linear_resampler_interpolate_frame_s16(pResampler, pFramesOutS16);
-
-            pFramesOutS16 += pResampler->config.channels;
-        }
-
-        framesProcessedOut += 1;
-
-        /* Advance time forward. */
-        pResampler->inTimeInt  += pResampler->inAdvanceInt;
-        pResampler->inTimeFrac += pResampler->inAdvanceFrac;
-        if (pResampler->inTimeFrac >= pResampler->config.sampleRateOut) {
-            pResampler->inTimeFrac -= pResampler->config.sampleRateOut;
-            pResampler->inTimeInt  += 1;
-        }
-    }
-
-    *pFrameCountIn  = framesProcessedIn;
-    *pFrameCountOut = framesProcessedOut;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    const ma_int16* pFramesInS16;
-    /* */ ma_int16* pFramesOutS16;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 framesProcessedIn;
-    ma_uint64 framesProcessedOut;
-
-    MA_ASSERT(pResampler     != NULL);
-    MA_ASSERT(pFrameCountIn  != NULL);
-    MA_ASSERT(pFrameCountOut != NULL);
-
-    pFramesInS16       = (const ma_int16*)pFramesIn;
-    pFramesOutS16      = (      ma_int16*)pFramesOut;
-    frameCountIn       = *pFrameCountIn;
-    frameCountOut      = *pFrameCountOut;
-    framesProcessedIn  = 0;
-    framesProcessedOut = 0;
-
-    while (framesProcessedOut < frameCountOut) {
-        /* Before interpolating we need to load the buffers. */
-        while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
-            ma_uint32 iChannel;
-
-            if (pFramesInS16 != NULL) {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                    pResampler->x1.s16[iChannel] = pFramesInS16[iChannel];
-                }
-                pFramesInS16 += pResampler->config.channels;
-            } else {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                    pResampler->x1.s16[iChannel] = 0;
-                }
-            }
-
-            framesProcessedIn     += 1;
-            pResampler->inTimeInt -= 1;
-        }
-
-        if (pResampler->inTimeInt > 0) {
-            break;  /* Ran out of input data. */
-        }
-
-        /* Getting here means the frames have been loaded and we can generate the next output frame. */
-        if (pFramesOutS16 != NULL) {
-            MA_ASSERT(pResampler->inTimeInt == 0);
-            ma_linear_resampler_interpolate_frame_s16(pResampler, pFramesOutS16);
-
-            /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-            if (pResampler->config.sampleRateIn != pResampler->config.sampleRateOut) {
-                ma_lpf_process_pcm_frame_s16(&pResampler->lpf, pFramesOutS16, pFramesOutS16);
-            }
-
-            pFramesOutS16 += pResampler->config.channels;
-        }
-
-        framesProcessedOut += 1;
-
-        /* Advance time forward. */
-        pResampler->inTimeInt  += pResampler->inAdvanceInt;
-        pResampler->inTimeFrac += pResampler->inAdvanceFrac;
-        if (pResampler->inTimeFrac >= pResampler->config.sampleRateOut) {
-            pResampler->inTimeFrac -= pResampler->config.sampleRateOut;
-            pResampler->inTimeInt  += 1;
-        }
-    }
-
-    *pFrameCountIn  = framesProcessedIn;
-    *pFrameCountOut = framesProcessedOut;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    MA_ASSERT(pResampler != NULL);
-
-    if (pResampler->config.sampleRateIn > pResampler->config.sampleRateOut) {
-        return ma_linear_resampler_process_pcm_frames_s16_downsample(pResampler, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    } else {
-        return ma_linear_resampler_process_pcm_frames_s16_upsample(pResampler, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    }
-}
-
-
-static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    const float* pFramesInF32;
-    /* */ float* pFramesOutF32;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 framesProcessedIn;
-    ma_uint64 framesProcessedOut;
-
-    MA_ASSERT(pResampler     != NULL);
-    MA_ASSERT(pFrameCountIn  != NULL);
-    MA_ASSERT(pFrameCountOut != NULL);
-
-    pFramesInF32       = (const float*)pFramesIn;
-    pFramesOutF32      = (      float*)pFramesOut;
-    frameCountIn       = *pFrameCountIn;
-    frameCountOut      = *pFrameCountOut;
-    framesProcessedIn  = 0;
-    framesProcessedOut = 0;
-
-    while (framesProcessedOut < frameCountOut) {
-        /* Before interpolating we need to load the buffers. When doing this we need to ensure we run every input sample through the filter. */
-        while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
-            ma_uint32 iChannel;
-
-            if (pFramesInF32 != NULL) {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                    pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
-                }
-                pFramesInF32 += pResampler->config.channels;
-            } else {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                    pResampler->x1.f32[iChannel] = 0;
-                }
-            }
-
-            /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-            if (pResampler->config.sampleRateIn != pResampler->config.sampleRateOut) {
-                ma_lpf_process_pcm_frame_f32(&pResampler->lpf, pResampler->x1.f32, pResampler->x1.f32);
-            }
-
-            framesProcessedIn     += 1;
-            pResampler->inTimeInt -= 1;
-        }
-
-        if (pResampler->inTimeInt > 0) {
-            break;  /* Ran out of input data. */
-        }
-
-        /* Getting here means the frames have been loaded and filtered and we can generate the next output frame. */
-        if (pFramesOutF32 != NULL) {
-            MA_ASSERT(pResampler->inTimeInt == 0);
-            ma_linear_resampler_interpolate_frame_f32(pResampler, pFramesOutF32);
-
-            pFramesOutF32 += pResampler->config.channels;
-        }
-
-        framesProcessedOut += 1;
-
-        /* Advance time forward. */
-        pResampler->inTimeInt  += pResampler->inAdvanceInt;
-        pResampler->inTimeFrac += pResampler->inAdvanceFrac;
-        if (pResampler->inTimeFrac >= pResampler->config.sampleRateOut) {
-            pResampler->inTimeFrac -= pResampler->config.sampleRateOut;
-            pResampler->inTimeInt  += 1;
-        }
-    }
-
-    *pFrameCountIn  = framesProcessedIn;
-    *pFrameCountOut = framesProcessedOut;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    const float* pFramesInF32;
-    /* */ float* pFramesOutF32;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 framesProcessedIn;
-    ma_uint64 framesProcessedOut;
-
-    MA_ASSERT(pResampler     != NULL);
-    MA_ASSERT(pFrameCountIn  != NULL);
-    MA_ASSERT(pFrameCountOut != NULL);
-
-    pFramesInF32       = (const float*)pFramesIn;
-    pFramesOutF32      = (      float*)pFramesOut;
-    frameCountIn       = *pFrameCountIn;
-    frameCountOut      = *pFrameCountOut;
-    framesProcessedIn  = 0;
-    framesProcessedOut = 0;
-
-    while (framesProcessedOut < frameCountOut) {
-        /* Before interpolating we need to load the buffers. */
-        while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
-            ma_uint32 iChannel;
-
-            if (pFramesInF32 != NULL) {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                    pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
-                }
-                pFramesInF32 += pResampler->config.channels;
-            } else {
-                for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-                    pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                    pResampler->x1.f32[iChannel] = 0;
-                }
-            }
-
-            framesProcessedIn     += 1;
-            pResampler->inTimeInt -= 1;
-        }
-
-        if (pResampler->inTimeInt > 0) {
-            break;  /* Ran out of input data. */
-        }
-
-        /* Getting here means the frames have been loaded and we can generate the next output frame. */
-        if (pFramesOutF32 != NULL) {
-            MA_ASSERT(pResampler->inTimeInt == 0);
-            ma_linear_resampler_interpolate_frame_f32(pResampler, pFramesOutF32);
-
-            /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-            if (pResampler->config.sampleRateIn != pResampler->config.sampleRateOut) {
-                ma_lpf_process_pcm_frame_f32(&pResampler->lpf, pFramesOutF32, pFramesOutF32);
-            }
-
-            pFramesOutF32 += pResampler->config.channels;
-        }
-
-        framesProcessedOut += 1;
-
-        /* Advance time forward. */
-        pResampler->inTimeInt  += pResampler->inAdvanceInt;
-        pResampler->inTimeFrac += pResampler->inAdvanceFrac;
-        if (pResampler->inTimeFrac >= pResampler->config.sampleRateOut) {
-            pResampler->inTimeFrac -= pResampler->config.sampleRateOut;
-            pResampler->inTimeInt  += 1;
-        }
-    }
-
-    *pFrameCountIn  = framesProcessedIn;
-    *pFrameCountOut = framesProcessedOut;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_linear_resampler_process_pcm_frames_f32(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    MA_ASSERT(pResampler != NULL);
-
-    if (pResampler->config.sampleRateIn > pResampler->config.sampleRateOut) {
-        return ma_linear_resampler_process_pcm_frames_f32_downsample(pResampler, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    } else {
-        return ma_linear_resampler_process_pcm_frames_f32_upsample(pResampler, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    }
-}
-
-
-MA_API ma_result ma_linear_resampler_process_pcm_frames(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*  */ if (pResampler->config.format == ma_format_s16) {
-        return ma_linear_resampler_process_pcm_frames_s16(pResampler, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    } else if (pResampler->config.format == ma_format_f32) {
-        return ma_linear_resampler_process_pcm_frames_f32(pResampler, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    } else {
-        /* Should never get here. Getting here means the format is not supported and you didn't check the return value of ma_linear_resampler_init(). */
-        MA_ASSERT(MA_FALSE);
-        return MA_INVALID_ARGS;
-    }
-}
-
-
-MA_API ma_result ma_linear_resampler_set_rate(ma_linear_resampler* pResampler, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut)
-{
-    return ma_linear_resampler_set_rate_internal(pResampler, NULL, NULL, sampleRateIn, sampleRateOut, /* isResamplerAlreadyInitialized = */ MA_TRUE);
-}
-
-MA_API ma_result ma_linear_resampler_set_rate_ratio(ma_linear_resampler* pResampler, float ratioInOut)
-{
-    ma_uint32 n;
-    ma_uint32 d;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ratioInOut <= 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    d = 1000000;
-    n = (ma_uint32)(ratioInOut * d);
-
-    if (n == 0) {
-        return MA_INVALID_ARGS; /* Ratio too small. */
-    }
-
-    MA_ASSERT(n != 0);
-
-    return ma_linear_resampler_set_rate(pResampler, n, d);
-}
-
-MA_API ma_uint64 ma_linear_resampler_get_input_latency(const ma_linear_resampler* pResampler)
-{
-    if (pResampler == NULL) {
-        return 0;
-    }
-
-    return 1 + ma_lpf_get_latency(&pResampler->lpf);
-}
-
-MA_API ma_uint64 ma_linear_resampler_get_output_latency(const ma_linear_resampler* pResampler)
-{
-    if (pResampler == NULL) {
-        return 0;
-    }
-
-    return ma_linear_resampler_get_input_latency(pResampler) * pResampler->config.sampleRateOut / pResampler->config.sampleRateIn;
-}
-
-MA_API ma_result ma_linear_resampler_get_required_input_frame_count(const ma_linear_resampler* pResampler, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount)
-{
-    ma_uint64 inputFrameCount;
-
-    if (pInputFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pInputFrameCount = 0;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (outputFrameCount == 0) {
-        return MA_SUCCESS;
-    }
-
-    /* Any whole input frames are consumed before the first output frame is generated. */
-    inputFrameCount = pResampler->inTimeInt;
-    outputFrameCount -= 1;
-
-    /* The rest of the output frames can be calculated in constant time. */
-    inputFrameCount += outputFrameCount * pResampler->inAdvanceInt;
-    inputFrameCount += (pResampler->inTimeFrac + (outputFrameCount * pResampler->inAdvanceFrac)) / pResampler->config.sampleRateOut;
-
-    *pInputFrameCount = inputFrameCount;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_linear_resampler_get_expected_output_frame_count(const ma_linear_resampler* pResampler, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount)
-{
-    ma_uint64 outputFrameCount;
-    ma_uint64 preliminaryInputFrameCountFromFrac;
-    ma_uint64 preliminaryInputFrameCount;
-
-    if (pOutputFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pOutputFrameCount = 0;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    The first step is to get a preliminary output frame count. This will either be exactly equal to what we need, or less by 1. We need to
-    determine how many input frames will be consumed by this value. If it's greater than our original input frame count it means we won't
-    be able to generate an extra frame because we will have run out of input data. Otherwise we will have enough input for the generation
-    of an extra output frame. This add-by-one logic is necessary due to how the data loading logic works when processing frames.
-    */
-    outputFrameCount = (inputFrameCount * pResampler->config.sampleRateOut) / pResampler->config.sampleRateIn;
-
-    /*
-    We need to determine how many *whole* input frames will have been processed to generate our preliminary output frame count. This is
-    used in the logic below to determine whether or not we need to add an extra output frame.
-    */
-    preliminaryInputFrameCountFromFrac = (pResampler->inTimeFrac + outputFrameCount*pResampler->inAdvanceFrac) / pResampler->config.sampleRateOut;
-    preliminaryInputFrameCount         = (pResampler->inTimeInt  + outputFrameCount*pResampler->inAdvanceInt ) + preliminaryInputFrameCountFromFrac;
-
-    /*
-    If the total number of *whole* input frames that would be required to generate our preliminary output frame count is greather than
-    the amount of whole input frames we have available as input we need to *not* add an extra output frame as there won't be enough data
-    to actually process. Otherwise we need to add the extra output frame.
-    */
-    if (preliminaryInputFrameCount <= inputFrameCount) {
-        outputFrameCount += 1;
-    }
-
-    *pOutputFrameCount = outputFrameCount;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_linear_resampler_reset(ma_linear_resampler* pResampler)
-{
-    ma_uint32 iChannel;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Timers need to be cleared back to zero. */
-    pResampler->inTimeInt  = 1;  /* Set this to one to force an input sample to always be loaded for the first output frame. */
-    pResampler->inTimeFrac = 0;
-
-    /* Cached samples need to be cleared. */
-    if (pResampler->config.format == ma_format_f32) {
-        for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-            pResampler->x0.f32[iChannel] = 0;
-            pResampler->x1.f32[iChannel] = 0;
-        }
-    } else {
-        for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
-            pResampler->x0.s16[iChannel] = 0;
-            pResampler->x1.s16[iChannel] = 0;
-        }
-    }
-
-    /* The low pass filter needs to have it's cache reset. */
-    ma_lpf_clear_cache(&pResampler->lpf);
-
-    return MA_SUCCESS;
-}
-
-
-
-/* Linear resampler backend vtable. */
-static ma_linear_resampler_config ma_resampling_backend_get_config__linear(const ma_resampler_config* pConfig)
-{
-    ma_linear_resampler_config linearConfig;
-
-    linearConfig = ma_linear_resampler_config_init(pConfig->format, pConfig->channels, pConfig->sampleRateIn, pConfig->sampleRateOut);
-    linearConfig.lpfOrder = pConfig->linear.lpfOrder;
-
-    return linearConfig;
-}
-
-static ma_result ma_resampling_backend_get_heap_size__linear(void* pUserData, const ma_resampler_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_linear_resampler_config linearConfig;
-
-    (void)pUserData;
-
-    linearConfig = ma_resampling_backend_get_config__linear(pConfig);
-
-    return ma_linear_resampler_get_heap_size(&linearConfig, pHeapSizeInBytes);
-}
-
-static ma_result ma_resampling_backend_init__linear(void* pUserData, const ma_resampler_config* pConfig, void* pHeap, ma_resampling_backend** ppBackend)
-{
-    ma_resampler* pResampler = (ma_resampler*)pUserData;
-    ma_result result;
-    ma_linear_resampler_config linearConfig;
-
-    (void)pUserData;
-
-    linearConfig = ma_resampling_backend_get_config__linear(pConfig);
-
-    result = ma_linear_resampler_init_preallocated(&linearConfig, pHeap, &pResampler->state.linear);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *ppBackend = &pResampler->state.linear;
-
-    return MA_SUCCESS;
-}
-
-static void ma_resampling_backend_uninit__linear(void* pUserData, ma_resampling_backend* pBackend, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    (void)pUserData;
-
-    ma_linear_resampler_uninit((ma_linear_resampler*)pBackend, pAllocationCallbacks);
-}
-
-static ma_result ma_resampling_backend_process__linear(void* pUserData, ma_resampling_backend* pBackend, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    (void)pUserData;
-
-    return ma_linear_resampler_process_pcm_frames((ma_linear_resampler*)pBackend, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-}
-
-static ma_result ma_resampling_backend_set_rate__linear(void* pUserData, ma_resampling_backend* pBackend, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut)
-{
-    (void)pUserData;
-
-    return ma_linear_resampler_set_rate((ma_linear_resampler*)pBackend, sampleRateIn, sampleRateOut);
-}
-
-static ma_uint64 ma_resampling_backend_get_input_latency__linear(void* pUserData, const ma_resampling_backend* pBackend)
-{
-    (void)pUserData;
-
-    return ma_linear_resampler_get_input_latency((const ma_linear_resampler*)pBackend);
-}
-
-static ma_uint64 ma_resampling_backend_get_output_latency__linear(void* pUserData, const ma_resampling_backend* pBackend)
-{
-    (void)pUserData;
-
-    return ma_linear_resampler_get_output_latency((const ma_linear_resampler*)pBackend);
-}
-
-static ma_result ma_resampling_backend_get_required_input_frame_count__linear(void* pUserData, const ma_resampling_backend* pBackend, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount)
-{
-    (void)pUserData;
-
-    return ma_linear_resampler_get_required_input_frame_count((const ma_linear_resampler*)pBackend, outputFrameCount, pInputFrameCount);
-}
-
-static ma_result ma_resampling_backend_get_expected_output_frame_count__linear(void* pUserData, const ma_resampling_backend* pBackend, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount)
-{
-    (void)pUserData;
-
-    return ma_linear_resampler_get_expected_output_frame_count((const ma_linear_resampler*)pBackend, inputFrameCount, pOutputFrameCount);
-}
-
-static ma_result ma_resampling_backend_reset__linear(void* pUserData, ma_resampling_backend* pBackend)
-{
-    (void)pUserData;
-
-    return ma_linear_resampler_reset((ma_linear_resampler*)pBackend);
-}
-
-static ma_resampling_backend_vtable g_ma_linear_resampler_vtable =
-{
-    ma_resampling_backend_get_heap_size__linear,
-    ma_resampling_backend_init__linear,
-    ma_resampling_backend_uninit__linear,
-    ma_resampling_backend_process__linear,
-    ma_resampling_backend_set_rate__linear,
-    ma_resampling_backend_get_input_latency__linear,
-    ma_resampling_backend_get_output_latency__linear,
-    ma_resampling_backend_get_required_input_frame_count__linear,
-    ma_resampling_backend_get_expected_output_frame_count__linear,
-    ma_resampling_backend_reset__linear
-};
-
-
-
-MA_API ma_resampler_config ma_resampler_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut, ma_resample_algorithm algorithm)
-{
-    ma_resampler_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format = format;
-    config.channels = channels;
-    config.sampleRateIn = sampleRateIn;
-    config.sampleRateOut = sampleRateOut;
-    config.algorithm = algorithm;
-
-    /* Linear. */
-    config.linear.lpfOrder = ma_min(MA_DEFAULT_RESAMPLER_LPF_ORDER, MA_MAX_FILTER_ORDER);
-
-    return config;
-}
-
-static ma_result ma_resampler_get_vtable(const ma_resampler_config* pConfig, ma_resampler* pResampler, ma_resampling_backend_vtable** ppVTable, void** ppUserData)
-{
-    MA_ASSERT(pConfig    != NULL);
-    MA_ASSERT(ppVTable   != NULL);
-    MA_ASSERT(ppUserData != NULL);
-
-    /* Safety. */
-    *ppVTable   = NULL;
-    *ppUserData = NULL;
-
-    switch (pConfig->algorithm)
-    {
-        case ma_resample_algorithm_linear:
-        {
-            *ppVTable   = &g_ma_linear_resampler_vtable;
-            *ppUserData = pResampler;
-        } break;
-
-        case ma_resample_algorithm_custom:
-        {
-            *ppVTable   = pConfig->pBackendVTable;
-            *ppUserData = pConfig->pBackendUserData;
-        } break;
-
-        default: return MA_INVALID_ARGS;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resampler_get_heap_size(const ma_resampler_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_resampling_backend_vtable* pVTable;
-    void* pVTableUserData;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_resampler_get_vtable(pConfig, NULL, &pVTable, &pVTableUserData);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pVTable == NULL || pVTable->onGetHeapSize == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    result = pVTable->onGetHeapSize(pVTableUserData, pConfig, pHeapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resampler_init_preallocated(const ma_resampler_config* pConfig, void* pHeap, ma_resampler* pResampler)
-{
-    ma_result result;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pResampler);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pResampler->_pHeap        = pHeap;
-    pResampler->format        = pConfig->format;
-    pResampler->channels      = pConfig->channels;
-    pResampler->sampleRateIn  = pConfig->sampleRateIn;
-    pResampler->sampleRateOut = pConfig->sampleRateOut;
-
-    result = ma_resampler_get_vtable(pConfig, pResampler, &pResampler->pBackendVTable, &pResampler->pBackendUserData);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onInit == NULL) {
-        return MA_NOT_IMPLEMENTED;  /* onInit not implemented. */
-    }
-
-    result = pResampler->pBackendVTable->onInit(pResampler->pBackendUserData, pConfig, pHeap, &pResampler->pBackend);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resampler_init(const ma_resampler_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_resampler* pResampler)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_resampler_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_resampler_init_preallocated(pConfig, pHeap, pResampler);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pResampler->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_resampler_uninit(ma_resampler* pResampler, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pResampler == NULL) {
-        return;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onUninit == NULL) {
-        return;
-    }
-
-    pResampler->pBackendVTable->onUninit(pResampler->pBackendUserData, pResampler->pBackend, pAllocationCallbacks);
-
-    if (pResampler->_ownsHeap) {
-        ma_free(pResampler->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_resampler_process_pcm_frames(ma_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFrameCountOut == NULL && pFrameCountIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onProcess == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pResampler->pBackendVTable->onProcess(pResampler->pBackendUserData, pResampler->pBackend, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-}
-
-MA_API ma_result ma_resampler_set_rate(ma_resampler* pResampler, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut)
-{
-    ma_result result;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (sampleRateIn == 0 || sampleRateOut == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onSetRate == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    result = pResampler->pBackendVTable->onSetRate(pResampler->pBackendUserData, pResampler->pBackend, sampleRateIn, sampleRateOut);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pResampler->sampleRateIn  = sampleRateIn;
-    pResampler->sampleRateOut = sampleRateOut;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resampler_set_rate_ratio(ma_resampler* pResampler, float ratio)
-{
-    ma_uint32 n;
-    ma_uint32 d;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ratio <= 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    d = 1000;
-    n = (ma_uint32)(ratio * d);
-
-    if (n == 0) {
-        return MA_INVALID_ARGS; /* Ratio too small. */
-    }
-
-    MA_ASSERT(n != 0);
-
-    return ma_resampler_set_rate(pResampler, n, d);
-}
-
-MA_API ma_uint64 ma_resampler_get_input_latency(const ma_resampler* pResampler)
-{
-    if (pResampler == NULL) {
-        return 0;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onGetInputLatency == NULL) {
-        return 0;
-    }
-
-    return pResampler->pBackendVTable->onGetInputLatency(pResampler->pBackendUserData, pResampler->pBackend);
-}
-
-MA_API ma_uint64 ma_resampler_get_output_latency(const ma_resampler* pResampler)
-{
-    if (pResampler == NULL) {
-        return 0;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onGetOutputLatency == NULL) {
-        return 0;
-    }
-
-    return pResampler->pBackendVTable->onGetOutputLatency(pResampler->pBackendUserData, pResampler->pBackend);
-}
-
-MA_API ma_result ma_resampler_get_required_input_frame_count(const ma_resampler* pResampler, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount)
-{
-    if (pInputFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pInputFrameCount = 0;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onGetRequiredInputFrameCount == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pResampler->pBackendVTable->onGetRequiredInputFrameCount(pResampler->pBackendUserData, pResampler->pBackend, outputFrameCount, pInputFrameCount);
-}
-
-MA_API ma_result ma_resampler_get_expected_output_frame_count(const ma_resampler* pResampler, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount)
-{
-    if (pOutputFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pOutputFrameCount = 0;
-
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onGetExpectedOutputFrameCount == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pResampler->pBackendVTable->onGetExpectedOutputFrameCount(pResampler->pBackendUserData, pResampler->pBackend, inputFrameCount, pOutputFrameCount);
-}
-
-MA_API ma_result ma_resampler_reset(ma_resampler* pResampler)
-{
-    if (pResampler == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pResampler->pBackendVTable == NULL || pResampler->pBackendVTable->onReset == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pResampler->pBackendVTable->onReset(pResampler->pBackendUserData, pResampler->pBackend);
-}
-
-/**************************************************************************************************************************************************************
-
-Channel Conversion
-
-**************************************************************************************************************************************************************/
-#ifndef MA_CHANNEL_CONVERTER_FIXED_POINT_SHIFT
-#define MA_CHANNEL_CONVERTER_FIXED_POINT_SHIFT  12
-#endif
-
-#define MA_PLANE_LEFT      0
-#define MA_PLANE_RIGHT     1
-#define MA_PLANE_FRONT     2
-#define MA_PLANE_BACK      3
-#define MA_PLANE_BOTTOM    4
-#define MA_PLANE_TOP       5
-
-static float g_maChannelPlaneRatios[MA_CHANNEL_POSITION_COUNT][6] = {
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_NONE */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_MONO */
-    { 0.5f,  0.0f,  0.5f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_FRONT_LEFT */
-    { 0.0f,  0.5f,  0.5f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_FRONT_RIGHT */
-    { 0.0f,  0.0f,  1.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_FRONT_CENTER */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_LFE */
-    { 0.5f,  0.0f,  0.0f,  0.5f,  0.0f,  0.0f},  /* MA_CHANNEL_BACK_LEFT */
-    { 0.0f,  0.5f,  0.0f,  0.5f,  0.0f,  0.0f},  /* MA_CHANNEL_BACK_RIGHT */
-    { 0.25f, 0.0f,  0.75f, 0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_FRONT_LEFT_CENTER */
-    { 0.0f,  0.25f, 0.75f, 0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_FRONT_RIGHT_CENTER */
-    { 0.0f,  0.0f,  0.0f,  1.0f,  0.0f,  0.0f},  /* MA_CHANNEL_BACK_CENTER */
-    { 1.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_SIDE_LEFT */
-    { 0.0f,  1.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_SIDE_RIGHT */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  1.0f},  /* MA_CHANNEL_TOP_CENTER */
-    { 0.33f, 0.0f,  0.33f, 0.0f,  0.0f,  0.34f}, /* MA_CHANNEL_TOP_FRONT_LEFT */
-    { 0.0f,  0.0f,  0.5f,  0.0f,  0.0f,  0.5f},  /* MA_CHANNEL_TOP_FRONT_CENTER */
-    { 0.0f,  0.33f, 0.33f, 0.0f,  0.0f,  0.34f}, /* MA_CHANNEL_TOP_FRONT_RIGHT */
-    { 0.33f, 0.0f,  0.0f,  0.33f, 0.0f,  0.34f}, /* MA_CHANNEL_TOP_BACK_LEFT */
-    { 0.0f,  0.0f,  0.0f,  0.5f,  0.0f,  0.5f},  /* MA_CHANNEL_TOP_BACK_CENTER */
-    { 0.0f,  0.33f, 0.0f,  0.33f, 0.0f,  0.34f}, /* MA_CHANNEL_TOP_BACK_RIGHT */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_0 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_1 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_2 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_3 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_4 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_5 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_6 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_7 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_8 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_9 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_10 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_11 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_12 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_13 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_14 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_15 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_16 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_17 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_18 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_19 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_20 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_21 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_22 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_23 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_24 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_25 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_26 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_27 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_28 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_29 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_30 */
-    { 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f},  /* MA_CHANNEL_AUX_31 */
-};
-
-static float ma_calculate_channel_position_rectangular_weight(ma_channel channelPositionA, ma_channel channelPositionB)
-{
-    /*
-    Imagine the following simplified example: You have a single input speaker which is the front/left speaker which you want to convert to
-    the following output configuration:
-
-     - front/left
-     - side/left
-     - back/left
-
-    The front/left output is easy - it the same speaker position so it receives the full contribution of the front/left input. The amount
-    of contribution to apply to the side/left and back/left speakers, however, is a bit more complicated.
-
-    Imagine the front/left speaker as emitting audio from two planes - the front plane and the left plane. You can think of the front/left
-    speaker emitting half of it's total volume from the front, and the other half from the left. Since part of it's volume is being emitted
-    from the left side, and the side/left and back/left channels also emit audio from the left plane, one would expect that they would
-    receive some amount of contribution from front/left speaker. The amount of contribution depends on how many planes are shared between
-    the two speakers. Note that in the examples below I've added a top/front/left speaker as an example just to show how the math works
-    across 3 spatial dimensions.
-
-    The first thing to do is figure out how each speaker's volume is spread over each of plane:
-     - front/left:     2 planes (front and left)      = 1/2 = half it's total volume on each plane
-     - side/left:      1 plane (left only)            = 1/1 = entire volume from left plane
-     - back/left:      2 planes (back and left)       = 1/2 = half it's total volume on each plane
-     - top/front/left: 3 planes (top, front and left) = 1/3 = one third it's total volume on each plane
-
-    The amount of volume each channel contributes to each of it's planes is what controls how much it is willing to given and take to other
-    channels on the same plane. The volume that is willing to the given by one channel is multiplied by the volume that is willing to be
-    taken by the other to produce the final contribution.
-    */
-
-    /* Contribution = Sum(Volume to Give * Volume to Take) */
-    float contribution =
-        g_maChannelPlaneRatios[channelPositionA][0] * g_maChannelPlaneRatios[channelPositionB][0] +
-        g_maChannelPlaneRatios[channelPositionA][1] * g_maChannelPlaneRatios[channelPositionB][1] +
-        g_maChannelPlaneRatios[channelPositionA][2] * g_maChannelPlaneRatios[channelPositionB][2] +
-        g_maChannelPlaneRatios[channelPositionA][3] * g_maChannelPlaneRatios[channelPositionB][3] +
-        g_maChannelPlaneRatios[channelPositionA][4] * g_maChannelPlaneRatios[channelPositionB][4] +
-        g_maChannelPlaneRatios[channelPositionA][5] * g_maChannelPlaneRatios[channelPositionB][5];
-
-    return contribution;
-}
-
-MA_API ma_channel_converter_config ma_channel_converter_config_init(ma_format format, ma_uint32 channelsIn, const ma_channel* pChannelMapIn, ma_uint32 channelsOut, const ma_channel* pChannelMapOut, ma_channel_mix_mode mixingMode)
-{
-    ma_channel_converter_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format         = format;
-    config.channelsIn     = channelsIn;
-    config.channelsOut    = channelsOut;
-    config.pChannelMapIn  = pChannelMapIn;
-    config.pChannelMapOut = pChannelMapOut;
-    config.mixingMode     = mixingMode;
-
-    return config;
-}
-
-static ma_int32 ma_channel_converter_float_to_fixed(float x)
-{
-    return (ma_int32)(x * (1<<MA_CHANNEL_CONVERTER_FIXED_POINT_SHIFT));
-}
-
-static ma_uint32 ma_channel_map_get_spatial_channel_count(const ma_channel* pChannelMap, ma_uint32 channels)
-{
-    ma_uint32 spatialChannelCount = 0;
-    ma_uint32 iChannel;
-
-    MA_ASSERT(pChannelMap != NULL);
-    MA_ASSERT(channels > 0);
-
-    for (iChannel = 0; iChannel < channels; ++iChannel) {
-        if (ma_is_spatial_channel_position(ma_channel_map_get_channel(pChannelMap, channels, iChannel))) {
-            spatialChannelCount++;
-        }
-    }
-
-    return spatialChannelCount;
-}
-
-static ma_bool32 ma_is_spatial_channel_position(ma_channel channelPosition)
-{
-    int i;
-
-    if (channelPosition == MA_CHANNEL_NONE || channelPosition == MA_CHANNEL_MONO || channelPosition == MA_CHANNEL_LFE) {
-        return MA_FALSE;
-    }
-
-    if (channelPosition >= MA_CHANNEL_AUX_0 && channelPosition <= MA_CHANNEL_AUX_31) {
-        return MA_FALSE;
-    }
-
-    for (i = 0; i < 6; ++i) {   /* Each side of a cube. */
-        if (g_maChannelPlaneRatios[channelPosition][i] != 0) {
-            return MA_TRUE;
-        }
-    }
-
-    return MA_FALSE;
-}
-
-
-static ma_bool32 ma_channel_map_is_passthrough(const ma_channel* pChannelMapIn, ma_uint32 channelsIn, const ma_channel* pChannelMapOut, ma_uint32 channelsOut)
-{
-    if (channelsOut == channelsIn) {
-        return ma_channel_map_is_equal(pChannelMapOut, pChannelMapIn, channelsOut);
-    } else {
-        return MA_FALSE;    /* Channel counts differ, so cannot be a passthrough. */
-    }
-}
-
-static ma_channel_conversion_path ma_channel_map_get_conversion_path(const ma_channel* pChannelMapIn, ma_uint32 channelsIn, const ma_channel* pChannelMapOut, ma_uint32 channelsOut, ma_channel_mix_mode mode)
-{
-    if (ma_channel_map_is_passthrough(pChannelMapIn, channelsIn, pChannelMapOut, channelsOut)) {
-        return ma_channel_conversion_path_passthrough;
-    }
-
-    if (channelsOut == 1 && (pChannelMapOut == NULL || pChannelMapOut[0] == MA_CHANNEL_MONO)) {
-        return ma_channel_conversion_path_mono_out;
-    }
-
-    if (channelsIn == 1 && (pChannelMapIn == NULL || pChannelMapIn[0] == MA_CHANNEL_MONO)) {
-        return ma_channel_conversion_path_mono_in;
-    }
-
-    if (mode == ma_channel_mix_mode_custom_weights) {
-        return ma_channel_conversion_path_weights;
-    }
-
-    /*
-    We can use a simple shuffle if both channel maps have the same channel count and all channel
-    positions are present in both.
-    */
-    if (channelsIn == channelsOut) {
-        ma_uint32 iChannelIn;
-        ma_bool32 areAllChannelPositionsPresent = MA_TRUE;
-        for (iChannelIn = 0; iChannelIn < channelsIn; ++iChannelIn) {
-            ma_bool32 isInputChannelPositionInOutput = MA_FALSE;
-            if (ma_channel_map_contains_channel_position(channelsOut, pChannelMapOut, ma_channel_map_get_channel(pChannelMapIn, channelsIn, iChannelIn))) {
-                isInputChannelPositionInOutput = MA_TRUE;
-                break;
-            }
-
-            if (!isInputChannelPositionInOutput) {
-                areAllChannelPositionsPresent = MA_FALSE;
-                break;
-            }
-        }
-
-        if (areAllChannelPositionsPresent) {
-            return ma_channel_conversion_path_shuffle;
-        }
-    }
-
-    /* Getting here means we'll need to use weights. */
-    return ma_channel_conversion_path_weights;
-}
-
-
-static ma_result ma_channel_map_build_shuffle_table(const ma_channel* pChannelMapIn, ma_uint32 channelCountIn, const ma_channel* pChannelMapOut, ma_uint32 channelCountOut, ma_uint8* pShuffleTable)
-{
-    ma_uint32 iChannelIn;
-    ma_uint32 iChannelOut;
-
-    if (pShuffleTable == NULL || channelCountIn == 0 || channelCountOut == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    When building the shuffle table we just do a 1:1 mapping based on the first occurance of a channel. If the
-    input channel has more than one occurance of a channel position, the second one will be ignored.
-    */
-    for (iChannelOut = 0; iChannelOut < channelCountOut; iChannelOut += 1) {
-        ma_channel channelOut;
-
-        /* Default to MA_CHANNEL_INDEX_NULL so that if a mapping is not found it'll be set appropriately. */
-        pShuffleTable[iChannelOut] = MA_CHANNEL_INDEX_NULL;
-
-        channelOut = ma_channel_map_get_channel(pChannelMapOut, channelCountOut, iChannelOut);
-        for (iChannelIn = 0; iChannelIn < channelCountIn; iChannelIn += 1) {
-            ma_channel channelIn;
-
-            channelIn = ma_channel_map_get_channel(pChannelMapIn, channelCountIn, iChannelIn);
-            if (channelOut == channelIn) {
-                pShuffleTable[iChannelOut] = (ma_uint8)iChannelIn;
-                break;
-            }
-
-            /*
-            Getting here means the channels don't exactly match, but we are going to support some
-            relaxed matching for practicality. If, for example, there are two stereo channel maps,
-            but one uses front left/right and the other uses side left/right, it makes logical
-            sense to just map these. The way we'll do it is we'll check if there is a logical
-            corresponding mapping, and if so, apply it, but we will *not* break from the loop,
-            thereby giving the loop a chance to find an exact match later which will take priority.
-            */
-            switch (channelOut)
-            {
-                /* Left channels. */
-                case MA_CHANNEL_FRONT_LEFT:
-                case MA_CHANNEL_SIDE_LEFT:
-                {
-                    switch (channelIn) {
-                        case MA_CHANNEL_FRONT_LEFT:
-                        case MA_CHANNEL_SIDE_LEFT:
-                        {
-                            pShuffleTable[iChannelOut] = (ma_uint8)iChannelIn;
-                        } break;
-                    }
-                } break;
-
-                /* Right channels. */
-                case MA_CHANNEL_FRONT_RIGHT:
-                case MA_CHANNEL_SIDE_RIGHT:
-                {
-                    switch (channelIn) {
-                        case MA_CHANNEL_FRONT_RIGHT:
-                        case MA_CHANNEL_SIDE_RIGHT:
-                        {
-                            pShuffleTable[iChannelOut] = (ma_uint8)iChannelIn;
-                        } break;
-                    }
-                } break;
-
-                default: break;
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint32 channelsOut, const ma_uint8* pFramesIn, ma_uint32 channelsIn, ma_uint64 frameCount, const ma_uint8* pShuffleTable)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannelOut;
-
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-            ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
-            if (iChannelIn < channelsIn) {  /* For safety, and to deal with MA_CHANNEL_INDEX_NULL. */
-                pFramesOut[iChannelOut] = pFramesIn[iChannelIn];
-            } else {
-                pFramesOut[iChannelOut] = 0;
-            }
-        }
-
-        pFramesOut += channelsOut;
-        pFramesIn  += channelsIn;
-    }
-}
-
-static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint32 channelsOut, const ma_int16* pFramesIn, ma_uint32 channelsIn, ma_uint64 frameCount, const ma_uint8* pShuffleTable)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannelOut;
-
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-            ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
-            if (iChannelIn < channelsIn) {  /* For safety, and to deal with MA_CHANNEL_INDEX_NULL. */
-                pFramesOut[iChannelOut] = pFramesIn[iChannelIn];
-            } else {
-                pFramesOut[iChannelOut] = 0;
-            }
-        }
-
-        pFramesOut += channelsOut;
-        pFramesIn  += channelsIn;
-    }
-}
-
-static void ma_channel_map_apply_shuffle_table_s24(ma_uint8* pFramesOut, ma_uint32 channelsOut, const ma_uint8* pFramesIn, ma_uint32 channelsIn, ma_uint64 frameCount, const ma_uint8* pShuffleTable)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannelOut;
-
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-            ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
-            if (iChannelIn < channelsIn) {  /* For safety, and to deal with MA_CHANNEL_INDEX_NULL. */
-                pFramesOut[iChannelOut*3 + 0] = pFramesIn[iChannelIn*3 + 0];
-                pFramesOut[iChannelOut*3 + 1] = pFramesIn[iChannelIn*3 + 1];
-                pFramesOut[iChannelOut*3 + 2] = pFramesIn[iChannelIn*3 + 2];
-            } else {
-                pFramesOut[iChannelOut*3 + 0] = 0;
-            }   pFramesOut[iChannelOut*3 + 1] = 0;
-        }       pFramesOut[iChannelOut*3 + 2] = 0;
-
-        pFramesOut += channelsOut*3;
-        pFramesIn  += channelsIn*3;
-    }
-}
-
-static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint32 channelsOut, const ma_int32* pFramesIn, ma_uint32 channelsIn, ma_uint64 frameCount, const ma_uint8* pShuffleTable)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannelOut;
-
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-            ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
-            if (iChannelIn < channelsIn) {  /* For safety, and to deal with MA_CHANNEL_INDEX_NULL. */
-                pFramesOut[iChannelOut] = pFramesIn[iChannelIn];
-            } else {
-                pFramesOut[iChannelOut] = 0;
-            }
-        }
-
-        pFramesOut += channelsOut;
-        pFramesIn  += channelsIn;
-    }
-}
-
-static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32 channelsOut, const float* pFramesIn, ma_uint32 channelsIn, ma_uint64 frameCount, const ma_uint8* pShuffleTable)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannelOut;
-
-    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-        for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-            ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
-            if (iChannelIn < channelsIn) {  /* For safety, and to deal with MA_CHANNEL_INDEX_NULL. */
-                pFramesOut[iChannelOut] = pFramesIn[iChannelIn];
-            } else {
-                pFramesOut[iChannelOut] = 0;
-            }
-        }
-
-        pFramesOut += channelsOut;
-        pFramesIn  += channelsIn;
-    }
-}
-
-static ma_result ma_channel_map_apply_shuffle_table(void* pFramesOut, ma_uint32 channelsOut, const void* pFramesIn, ma_uint32 channelsIn, ma_uint64 frameCount, const ma_uint8* pShuffleTable, ma_format format)
-{
-    if (pFramesOut == NULL || pFramesIn == NULL || channelsOut == 0 || pShuffleTable == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    switch (format)
-    {
-        case ma_format_u8:
-        {
-            ma_channel_map_apply_shuffle_table_u8((ma_uint8*)pFramesOut, channelsOut, (const ma_uint8*)pFramesIn, channelsIn, frameCount, pShuffleTable);
-        } break;
-
-        case ma_format_s16:
-        {
-            ma_channel_map_apply_shuffle_table_s16((ma_int16*)pFramesOut, channelsOut, (const ma_int16*)pFramesIn, channelsIn, frameCount, pShuffleTable);
-        } break;
-
-        case ma_format_s24:
-        {
-            ma_channel_map_apply_shuffle_table_s24((ma_uint8*)pFramesOut, channelsOut, (const ma_uint8*)pFramesIn, channelsIn, frameCount, pShuffleTable);
-        } break;
-
-        case ma_format_s32:
-        {
-            ma_channel_map_apply_shuffle_table_s32((ma_int32*)pFramesOut, channelsOut, (const ma_int32*)pFramesIn, channelsIn, frameCount, pShuffleTable);
-        } break;
-
-        case ma_format_f32:
-        {
-            ma_channel_map_apply_shuffle_table_f32((float*)pFramesOut, channelsOut, (const float*)pFramesIn, channelsIn, frameCount, pShuffleTable);
-        } break;
-
-        default: return MA_INVALID_ARGS;    /* Unknown format. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_channel_map_apply_mono_out_f32(float* pFramesOut, const float* pFramesIn, const ma_channel* pChannelMapIn, ma_uint32 channelsIn, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannelIn;
-    ma_uint32 accumulationCount;
-
-    if (pFramesOut == NULL || pFramesIn == NULL || channelsIn == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* In this case the output stream needs to be the average of all channels, ignoring NONE. */
-
-    /* A quick pre-processing step to get the accumulation counter since we're ignoring NONE channels. */
-    accumulationCount = 0;
-    for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
-        if (ma_channel_map_get_channel(pChannelMapIn, channelsIn, iChannelIn) != MA_CHANNEL_NONE) {
-            accumulationCount += 1;
-        }
-    }
-
-    if (accumulationCount > 0) {    /* <-- Prevent a division by zero. */
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float accumulation = 0;
-
-            for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
-                ma_channel channelIn = ma_channel_map_get_channel(pChannelMapIn, channelsIn, iChannelIn);
-                if (channelIn != MA_CHANNEL_NONE) {
-                    accumulation += pFramesIn[iChannelIn];
-                }
-            }
-
-            pFramesOut[0] = accumulation / accumulationCount;
-            pFramesOut += 1;
-            pFramesIn  += channelsIn;
-        }
-    } else {
-        ma_silence_pcm_frames(pFramesOut, frameCount, ma_format_f32, 1);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut, const ma_channel* pChannelMapOut, ma_uint32 channelsOut, const float* MA_RESTRICT pFramesIn, ma_uint64 frameCount, ma_mono_expansion_mode monoExpansionMode)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannelOut;
-
-    if (pFramesOut == NULL || channelsOut == 0 || pFramesIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Note that the MA_CHANNEL_NONE channel must be ignored in all cases. */
-    switch (monoExpansionMode)
-    {
-        case ma_mono_expansion_mode_average:
-        {
-            float weight;
-            ma_uint32 validChannelCount = 0;
-
-            for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                if (channelOut != MA_CHANNEL_NONE) {
-                    validChannelCount += 1;
-                }
-            }
-
-            weight = 1.0f / validChannelCount;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                    ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                    if (channelOut != MA_CHANNEL_NONE) {
-                        pFramesOut[iChannelOut] = pFramesIn[0] * weight;
-                    }
-                }
-
-                pFramesOut += channelsOut;
-                pFramesIn  += 1;
-            }
-        } break;
-
-        case ma_mono_expansion_mode_stereo_only:
-        {
-            if (channelsOut >= 2) {
-                ma_uint32 iChannelLeft  = (ma_uint32)-1;
-                ma_uint32 iChannelRight = (ma_uint32)-1;
-
-                /*
-                We first need to find our stereo channels. We prefer front-left and front-right, but
-                if they're not available, we'll also try side-left and side-right. If neither are
-                available we'll fall through to the default case below.
-                */
-                for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                    ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                    if (channelOut == MA_CHANNEL_SIDE_LEFT) {
-                        iChannelLeft  = iChannelOut;
-                    }
-                    if (channelOut == MA_CHANNEL_SIDE_RIGHT) {
-                        iChannelRight = iChannelOut;
-                    }
-                }
-
-                for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                    ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                    if (channelOut == MA_CHANNEL_FRONT_LEFT) {
-                        iChannelLeft  = iChannelOut;
-                    }
-                    if (channelOut == MA_CHANNEL_FRONT_RIGHT) {
-                        iChannelRight = iChannelOut;
-                    }
-                }
-
-
-                if (iChannelLeft != (ma_uint32)-1 && iChannelRight != (ma_uint32)-1) {
-                    /* We found our stereo channels so we can duplicate the signal across those channels. */
-                    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                        for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                            ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                            if (channelOut != MA_CHANNEL_NONE) {
-                                if (iChannelOut == iChannelLeft || iChannelOut == iChannelRight) {
-                                    pFramesOut[iChannelOut] = pFramesIn[0];
-                                } else {
-                                    pFramesOut[iChannelOut] = 0.0f;
-                                }
-                            }
-                        }
-
-                        pFramesOut += channelsOut;
-                        pFramesIn  += 1;
-                    }
-
-                    break;  /* Get out of the switch. */
-                } else {
-                    /* Fallthrough. Does not have left and right channels. */
-                    goto default_handler;
-                }
-            } else {
-                /* Fallthrough. Does not have stereo channels. */
-                goto default_handler;
-            }
-        };  /* Fallthrough. See comments above. */
-
-        case ma_mono_expansion_mode_duplicate:
-        default:
-        {
-            default_handler:
-            {
-                if (channelsOut <= MA_MAX_CHANNELS) {
-                    ma_bool32 hasEmptyChannel = MA_FALSE;
-                    ma_channel channelPositions[MA_MAX_CHANNELS];
-                    for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                        channelPositions[iChannelOut] = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                        if (channelPositions[iChannelOut] == MA_CHANNEL_NONE) {
-                            hasEmptyChannel = MA_TRUE;
-                        }
-                    }
-
-                    if (hasEmptyChannel == MA_FALSE) {
-                        /*
-                        Faster path when there's no MA_CHANNEL_NONE channel positions. This should hopefully
-                        help the compiler with auto-vectorization.m
-                        */
-                        if (channelsOut == 2) {
-                        #if defined(MA_SUPPORT_SSE2)
-                            if (ma_has_sse2()) {
-                                /* We want to do two frames in each iteration. */
-                                ma_uint64 unrolledFrameCount = frameCount >> 1;
-
-                                for (iFrame = 0; iFrame < unrolledFrameCount; iFrame += 1) {
-                                    __m128 in0 = _mm_set1_ps(pFramesIn[iFrame*2 + 0]);
-                                    __m128 in1 = _mm_set1_ps(pFramesIn[iFrame*2 + 1]);
-                                    _mm_storeu_ps(&pFramesOut[iFrame*4 + 0], _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(0, 0, 0, 0)));
-                                }
-
-                                /* Tail. */
-                                iFrame = unrolledFrameCount << 1;
-                                goto generic_on_fastpath;
-                            } else
-                        #endif
-                            {
-                                for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                                    for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
-                                        pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
-                                    }
-                                }
-                            }
-                        } else if (channelsOut == 6) {
-                        #if defined(MA_SUPPORT_SSE2)
-                            if (ma_has_sse2()) {
-                                /* We want to do two frames in each iteration so we can have a multiple of 4 samples. */
-                                ma_uint64 unrolledFrameCount = frameCount >> 1;
-
-                                for (iFrame = 0; iFrame < unrolledFrameCount; iFrame += 1) {
-                                    __m128 in0 = _mm_set1_ps(pFramesIn[iFrame*2 + 0]);
-                                    __m128 in1 = _mm_set1_ps(pFramesIn[iFrame*2 + 1]);
-
-                                    _mm_storeu_ps(&pFramesOut[iFrame*12 + 0], in0);
-                                    _mm_storeu_ps(&pFramesOut[iFrame*12 + 4], _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(0, 0, 0, 0)));
-                                    _mm_storeu_ps(&pFramesOut[iFrame*12 + 8], in1);
-                                }
-
-                                /* Tail. */
-                                iFrame = unrolledFrameCount << 1;
-                                goto generic_on_fastpath;
-                            } else
-                        #endif
-                            {
-                                for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                                    for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
-                                        pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
-                                    }
-                                }
-                            }
-                        } else if (channelsOut == 8) {
-                        #if defined(MA_SUPPORT_SSE2)
-                            if (ma_has_sse2()) {
-                                for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                                    __m128 in = _mm_set1_ps(pFramesIn[iFrame]);
-                                    _mm_storeu_ps(&pFramesOut[iFrame*8 + 0], in);
-                                    _mm_storeu_ps(&pFramesOut[iFrame*8 + 4], in);
-                                }
-                            } else
-                        #endif
-                            {
-                                for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                                    for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
-                                        pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
-                                    }
-                                }
-                            }
-                        } else {
-                            iFrame = 0;
-
-                            #if defined(MA_SUPPORT_SSE2)    /* For silencing a warning with non-x86 builds. */
-                            generic_on_fastpath:
-                            #endif
-                            {
-                                for (; iFrame < frameCount; iFrame += 1) {
-                                    for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                                        pFramesOut[iFrame*channelsOut + iChannelOut] = pFramesIn[iFrame];
-                                    }
-                                }
-                            }
-                        }
-                    } else {
-                        /* Slow path. Need to handle MA_CHANNEL_NONE. */
-                        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                            for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                                if (channelPositions[iChannelOut] != MA_CHANNEL_NONE) {
-                                    pFramesOut[iFrame*channelsOut + iChannelOut] = pFramesIn[iFrame];
-                                }
-                            }
-                        }
-                    }
-                } else {
-                    /* Slow path. Too many channels to store on the stack. */
-                    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                        for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                            ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                            if (channelOut != MA_CHANNEL_NONE) {
-                                pFramesOut[iFrame*channelsOut + iChannelOut] = pFramesIn[iFrame];
-                            }
-                        }
-                    }
-                }
-            }
-        } break;
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChannelMapOut, ma_uint32 channelsOut, const float* pFramesIn, const ma_channel* pChannelMapIn, ma_uint32 channelsIn, ma_uint64 frameCount, ma_channel_mix_mode mode, ma_mono_expansion_mode monoExpansionMode)
-{
-    ma_channel_conversion_path conversionPath = ma_channel_map_get_conversion_path(pChannelMapIn, channelsIn, pChannelMapOut, channelsOut, mode);
-
-    /* Optimized Path: Passthrough */
-    if (conversionPath == ma_channel_conversion_path_passthrough) {
-        ma_copy_pcm_frames(pFramesOut, pFramesIn, frameCount, ma_format_f32, channelsOut);
-        return;
-    }
-
-    /* Special Path: Mono Output. */
-    if (conversionPath == ma_channel_conversion_path_mono_out) {
-        ma_channel_map_apply_mono_out_f32(pFramesOut, pFramesIn, pChannelMapIn, channelsIn, frameCount);
-        return;
-    }
-
-    /* Special Path: Mono Input. */
-    if (conversionPath == ma_channel_conversion_path_mono_in) {
-        ma_channel_map_apply_mono_in_f32(pFramesOut, pChannelMapOut, channelsOut, pFramesIn, frameCount, monoExpansionMode);
-        return;
-    }
-
-    /* Getting here means we aren't running on an optimized conversion path. */
-    if (channelsOut <= MA_MAX_CHANNELS) {
-        ma_result result;
-
-        if (mode == ma_channel_mix_mode_simple) {
-            ma_channel shuffleTable[MA_MAX_CHANNELS];
-
-            result = ma_channel_map_build_shuffle_table(pChannelMapIn, channelsIn, pChannelMapOut, channelsOut, shuffleTable);
-            if (result != MA_SUCCESS) {
-                return;
-            }
-
-            result = ma_channel_map_apply_shuffle_table(pFramesOut, channelsOut, pFramesIn, channelsIn, frameCount, shuffleTable, ma_format_f32);
-            if (result != MA_SUCCESS) {
-                return;
-            }
-        } else {
-            ma_uint32 iFrame;
-            ma_uint32 iChannelOut;
-            ma_uint32 iChannelIn;
-            float weights[32][32];  /* Do not use MA_MAX_CHANNELS here! */
-
-            /*
-            If we have a small enough number of channels, pre-compute the weights. Otherwise we'll just need to
-            fall back to a slower path because otherwise we'll run out of stack space.
-            */
-            if (channelsIn <= ma_countof(weights) && channelsOut <= ma_countof(weights)) {
-                /* Pre-compute weights. */
-                for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                    ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-                    for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
-                        ma_channel channelIn = ma_channel_map_get_channel(pChannelMapIn, channelsIn, iChannelIn);
-                        weights[iChannelOut][iChannelIn] = ma_calculate_channel_position_rectangular_weight(channelOut, channelIn);
-                    }
-                }
-
-                iFrame = 0;
-
-                /* Experiment: Try an optimized unroll for some specific cases to see how it improves performance. RESULT: Good gains. */
-                if (channelsOut == 8) {
-                    /* Experiment 2: Expand the inner loop to see what kind of different it makes. RESULT: Small, but worthwhile gain. */
-                    if (channelsIn == 2) {
-                        for (; iFrame < frameCount; iFrame += 1) {
-                            float accumulation[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-                            accumulation[0] += pFramesIn[iFrame*2 + 0] * weights[0][0];
-                            accumulation[1] += pFramesIn[iFrame*2 + 0] * weights[1][0];
-                            accumulation[2] += pFramesIn[iFrame*2 + 0] * weights[2][0];
-                            accumulation[3] += pFramesIn[iFrame*2 + 0] * weights[3][0];
-                            accumulation[4] += pFramesIn[iFrame*2 + 0] * weights[4][0];
-                            accumulation[5] += pFramesIn[iFrame*2 + 0] * weights[5][0];
-                            accumulation[6] += pFramesIn[iFrame*2 + 0] * weights[6][0];
-                            accumulation[7] += pFramesIn[iFrame*2 + 0] * weights[7][0];
-
-                            accumulation[0] += pFramesIn[iFrame*2 + 1] * weights[0][1];
-                            accumulation[1] += pFramesIn[iFrame*2 + 1] * weights[1][1];
-                            accumulation[2] += pFramesIn[iFrame*2 + 1] * weights[2][1];
-                            accumulation[3] += pFramesIn[iFrame*2 + 1] * weights[3][1];
-                            accumulation[4] += pFramesIn[iFrame*2 + 1] * weights[4][1];
-                            accumulation[5] += pFramesIn[iFrame*2 + 1] * weights[5][1];
-                            accumulation[6] += pFramesIn[iFrame*2 + 1] * weights[6][1];
-                            accumulation[7] += pFramesIn[iFrame*2 + 1] * weights[7][1];
-
-                            pFramesOut[iFrame*8 + 0] = accumulation[0];
-                            pFramesOut[iFrame*8 + 1] = accumulation[1];
-                            pFramesOut[iFrame*8 + 2] = accumulation[2];
-                            pFramesOut[iFrame*8 + 3] = accumulation[3];
-                            pFramesOut[iFrame*8 + 4] = accumulation[4];
-                            pFramesOut[iFrame*8 + 5] = accumulation[5];
-                            pFramesOut[iFrame*8 + 6] = accumulation[6];
-                            pFramesOut[iFrame*8 + 7] = accumulation[7];
-                        }
-                    } else {
-                        /* When outputting to 8 channels, we can do everything in groups of two 4x SIMD operations. */
-                        for (; iFrame < frameCount; iFrame += 1) {
-                            float accumulation[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-                            for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
-                                accumulation[0] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[0][iChannelIn];
-                                accumulation[1] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[1][iChannelIn];
-                                accumulation[2] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[2][iChannelIn];
-                                accumulation[3] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[3][iChannelIn];
-                                accumulation[4] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[4][iChannelIn];
-                                accumulation[5] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[5][iChannelIn];
-                                accumulation[6] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[6][iChannelIn];
-                                accumulation[7] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[7][iChannelIn];
-                            }
-
-                            pFramesOut[iFrame*8 + 0] = accumulation[0];
-                            pFramesOut[iFrame*8 + 1] = accumulation[1];
-                            pFramesOut[iFrame*8 + 2] = accumulation[2];
-                            pFramesOut[iFrame*8 + 3] = accumulation[3];
-                            pFramesOut[iFrame*8 + 4] = accumulation[4];
-                            pFramesOut[iFrame*8 + 5] = accumulation[5];
-                            pFramesOut[iFrame*8 + 6] = accumulation[6];
-                            pFramesOut[iFrame*8 + 7] = accumulation[7];
-                        }
-                    }
-                } else if (channelsOut == 6) {
-                    /*
-                    When outputting to 6 channels we unfortunately don't have a nice multiple of 4 to do 4x SIMD operations. Instead we'll
-                    expand our weights and do two frames at a time.
-                    */
-                    for (; iFrame < frameCount; iFrame += 1) {
-                        float accumulation[12] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-
-                        for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
-                            accumulation[0] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[0][iChannelIn];
-                            accumulation[1] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[1][iChannelIn];
-                            accumulation[2] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[2][iChannelIn];
-                            accumulation[3] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[3][iChannelIn];
-                            accumulation[4] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[4][iChannelIn];
-                            accumulation[5] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[5][iChannelIn];
-                        }
-
-                        pFramesOut[iFrame*6 + 0] = accumulation[0];
-                        pFramesOut[iFrame*6 + 1] = accumulation[1];
-                        pFramesOut[iFrame*6 + 2] = accumulation[2];
-                        pFramesOut[iFrame*6 + 3] = accumulation[3];
-                        pFramesOut[iFrame*6 + 4] = accumulation[4];
-                        pFramesOut[iFrame*6 + 5] = accumulation[5];
-                    }
-                }
-
-                /* Leftover frames. */
-                for (; iFrame < frameCount; iFrame += 1) {
-                    for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                        float accumulation = 0;
-
-                        for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
-                            accumulation += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[iChannelOut][iChannelIn];
-                        }
-
-                        pFramesOut[iFrame*channelsOut + iChannelOut] = accumulation;
-                    }
-                }
-            } else {
-                /* Cannot pre-compute weights because not enough room in stack-allocated buffer. */
-                for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                    for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
-                        float accumulation = 0;
-                        ma_channel channelOut = ma_channel_map_get_channel(pChannelMapOut, channelsOut, iChannelOut);
-
-                        for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) {
-                            ma_channel channelIn = ma_channel_map_get_channel(pChannelMapIn, channelsIn, iChannelIn);
-                            accumulation += pFramesIn[iFrame*channelsIn + iChannelIn] * ma_calculate_channel_position_rectangular_weight(channelOut, channelIn);
-                        }
-
-                        pFramesOut[iFrame*channelsOut + iChannelOut] = accumulation;
-                    }
-                }
-            }
-        }
-    } else {
-        /* Fall back to silence. If you hit this, what are you doing with so many channels?! */
-        ma_silence_pcm_frames(pFramesOut, frameCount, ma_format_f32, channelsOut);
-    }
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t channelMapInOffset;
-    size_t channelMapOutOffset;
-    size_t shuffleTableOffset;
-    size_t weightsOffset;
-} ma_channel_converter_heap_layout;
-
-static ma_channel_conversion_path ma_channel_converter_config_get_conversion_path(const ma_channel_converter_config* pConfig)
-{
-    return ma_channel_map_get_conversion_path(pConfig->pChannelMapIn, pConfig->channelsIn, pConfig->pChannelMapOut, pConfig->channelsOut, pConfig->mixingMode);
-}
-
-static ma_result ma_channel_converter_get_heap_layout(const ma_channel_converter_config* pConfig, ma_channel_converter_heap_layout* pHeapLayout)
-{
-    ma_channel_conversion_path conversionPath;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channelsIn == 0 || pConfig->channelsOut == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (!ma_channel_map_is_valid(pConfig->pChannelMapIn, pConfig->channelsIn)) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (!ma_channel_map_is_valid(pConfig->pChannelMapOut, pConfig->channelsOut)) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Input channel map. Only need to allocate this if we have an input channel map (otherwise default channel map is assumed). */
-    pHeapLayout->channelMapInOffset = pHeapLayout->sizeInBytes;
-    if (pConfig->pChannelMapIn != NULL) {
-        pHeapLayout->sizeInBytes += sizeof(ma_channel) * pConfig->channelsIn;
-    }
-
-    /* Output channel map. Only need to allocate this if we have an output channel map (otherwise default channel map is assumed). */
-    pHeapLayout->channelMapOutOffset = pHeapLayout->sizeInBytes;
-    if (pConfig->pChannelMapOut != NULL) {
-        pHeapLayout->sizeInBytes += sizeof(ma_channel) * pConfig->channelsOut;
-    }
-
-    /* Alignment for the next section. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    /* Whether or not we use weights of a shuffle table depends on the channel map themselves and the algorithm we've chosen. */
-    conversionPath = ma_channel_converter_config_get_conversion_path(pConfig);
-
-    /* Shuffle table */
-    pHeapLayout->shuffleTableOffset = pHeapLayout->sizeInBytes;
-    if (conversionPath == ma_channel_conversion_path_shuffle) {
-        pHeapLayout->sizeInBytes += sizeof(ma_uint8) * pConfig->channelsOut;
-    }
-
-    /* Weights */
-    pHeapLayout->weightsOffset = pHeapLayout->sizeInBytes;
-    if (conversionPath == ma_channel_conversion_path_weights) {
-        pHeapLayout->sizeInBytes += sizeof(float*) * pConfig->channelsIn;
-        pHeapLayout->sizeInBytes += sizeof(float ) * pConfig->channelsIn * pConfig->channelsOut;
-    }
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_channel_converter_get_heap_size(const ma_channel_converter_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_channel_converter_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_channel_converter_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_channel_converter_init_preallocated(const ma_channel_converter_config* pConfig, void* pHeap, ma_channel_converter* pConverter)
-{
-    ma_result result;
-    ma_channel_converter_heap_layout heapLayout;
-
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pConverter);
-
-    result = ma_channel_converter_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pConverter->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pConverter->_pHeap, heapLayout.sizeInBytes);
-
-    pConverter->format      = pConfig->format;
-    pConverter->channelsIn  = pConfig->channelsIn;
-    pConverter->channelsOut = pConfig->channelsOut;
-    pConverter->mixingMode  = pConfig->mixingMode;
-
-    if (pConfig->pChannelMapIn != NULL) {
-        pConverter->pChannelMapIn = (ma_channel*)ma_offset_ptr(pHeap, heapLayout.channelMapInOffset);
-        ma_channel_map_copy_or_default(pConverter->pChannelMapIn, pConfig->channelsIn, pConfig->pChannelMapIn, pConfig->channelsIn);
-    } else {
-        pConverter->pChannelMapIn = NULL;   /* Use default channel map. */
-    }
-
-    if (pConfig->pChannelMapOut != NULL) {
-        pConverter->pChannelMapOut = (ma_channel*)ma_offset_ptr(pHeap, heapLayout.channelMapOutOffset);
-        ma_channel_map_copy_or_default(pConverter->pChannelMapOut, pConfig->channelsOut, pConfig->pChannelMapOut, pConfig->channelsOut);
-    } else {
-        pConverter->pChannelMapOut = NULL;  /* Use default channel map. */
-    }
-
-    pConverter->conversionPath = ma_channel_converter_config_get_conversion_path(pConfig);
-
-    if (pConverter->conversionPath == ma_channel_conversion_path_shuffle) {
-        pConverter->pShuffleTable = (ma_uint8*)ma_offset_ptr(pHeap, heapLayout.shuffleTableOffset);
-        ma_channel_map_build_shuffle_table(pConverter->pChannelMapIn, pConverter->channelsIn, pConverter->pChannelMapOut, pConverter->channelsOut, pConverter->pShuffleTable);
-    }
-
-    if (pConverter->conversionPath == ma_channel_conversion_path_weights) {
-        ma_uint32 iChannelIn;
-        ma_uint32 iChannelOut;
-
-        if (pConverter->format == ma_format_f32) {
-            pConverter->weights.f32 = (float**   )ma_offset_ptr(pHeap, heapLayout.weightsOffset);
-            for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; iChannelIn += 1) {
-                pConverter->weights.f32[iChannelIn] = (float*)ma_offset_ptr(pHeap, heapLayout.weightsOffset + ((sizeof(float*) * pConverter->channelsIn) + (sizeof(float) * pConverter->channelsOut * iChannelIn)));
-            }
-        } else {
-            pConverter->weights.s16 = (ma_int32**)ma_offset_ptr(pHeap, heapLayout.weightsOffset);
-            for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; iChannelIn += 1) {
-                pConverter->weights.s16[iChannelIn] = (ma_int32*)ma_offset_ptr(pHeap, heapLayout.weightsOffset + ((sizeof(ma_int32*) * pConverter->channelsIn) + (sizeof(ma_int32) * pConverter->channelsOut * iChannelIn)));
-            }
-        }
-
-        /* Silence our weights by default. */
-        for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; iChannelIn += 1) {
-            for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; iChannelOut += 1) {
-                if (pConverter->format == ma_format_f32) {
-                    pConverter->weights.f32[iChannelIn][iChannelOut] = 0.0f;
-                } else {
-                    pConverter->weights.s16[iChannelIn][iChannelOut] = 0;
-                }
-            }
-        }
-
-        /*
-        We now need to fill out our weights table. This is determined by the mixing mode.
-        */
-
-        /* In all cases we need to make sure all channels that are present in both channel maps have a 1:1 mapping. */
-        for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-            ma_channel channelPosIn = ma_channel_map_get_channel(pConverter->pChannelMapIn, pConverter->channelsIn, iChannelIn);
-
-            for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                ma_channel channelPosOut = ma_channel_map_get_channel(pConverter->pChannelMapOut, pConverter->channelsOut, iChannelOut);
-
-                if (channelPosIn == channelPosOut) {
-                    float weight = 1;
-
-                    if (pConverter->format == ma_format_f32) {
-                        pConverter->weights.f32[iChannelIn][iChannelOut] = weight;
-                    } else {
-                        pConverter->weights.s16[iChannelIn][iChannelOut] = ma_channel_converter_float_to_fixed(weight);
-                    }
-                }
-            }
-        }
-
-        switch (pConverter->mixingMode)
-        {
-            case ma_channel_mix_mode_custom_weights:
-            {
-                if (pConfig->ppWeights == NULL) {
-                    return MA_INVALID_ARGS; /* Config specified a custom weights mixing mode, but no custom weights have been specified. */
-                }
-
-                for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; iChannelIn += 1) {
-                    for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; iChannelOut += 1) {
-                        float weight = pConfig->ppWeights[iChannelIn][iChannelOut];
-
-                        if (pConverter->format == ma_format_f32) {
-                            pConverter->weights.f32[iChannelIn][iChannelOut] = weight;
-                        } else {
-                            pConverter->weights.s16[iChannelIn][iChannelOut] = ma_channel_converter_float_to_fixed(weight);
-                        }
-                    }
-                }
-            } break;
-
-            case ma_channel_mix_mode_simple:
-            {
-                /*
-                In simple mode, only set weights for channels that have exactly matching types, leave the rest at
-                zero. The 1:1 mappings have already been covered before this switch statement.
-                */
-            } break;
-
-            case ma_channel_mix_mode_rectangular:
-            default:
-            {
-                /* Unmapped input channels. */
-                for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                    ma_channel channelPosIn = ma_channel_map_get_channel(pConverter->pChannelMapIn, pConverter->channelsIn, iChannelIn);
-
-                    if (ma_is_spatial_channel_position(channelPosIn)) {
-                        if (!ma_channel_map_contains_channel_position(pConverter->channelsOut, pConverter->pChannelMapOut, channelPosIn)) {
-                            for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                                ma_channel channelPosOut = ma_channel_map_get_channel(pConverter->pChannelMapOut, pConverter->channelsOut, iChannelOut);
-
-                                if (ma_is_spatial_channel_position(channelPosOut)) {
-                                    float weight = 0;
-                                    if (pConverter->mixingMode == ma_channel_mix_mode_rectangular) {
-                                        weight = ma_calculate_channel_position_rectangular_weight(channelPosIn, channelPosOut);
-                                    }
-
-                                    /* Only apply the weight if we haven't already got some contribution from the respective channels. */
-                                    if (pConverter->format == ma_format_f32) {
-                                        if (pConverter->weights.f32[iChannelIn][iChannelOut] == 0) {
-                                            pConverter->weights.f32[iChannelIn][iChannelOut] = weight;
-                                        }
-                                    } else {
-                                        if (pConverter->weights.s16[iChannelIn][iChannelOut] == 0) {
-                                            pConverter->weights.s16[iChannelIn][iChannelOut] = ma_channel_converter_float_to_fixed(weight);
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-
-                /* Unmapped output channels. */
-                for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                    ma_channel channelPosOut = ma_channel_map_get_channel(pConverter->pChannelMapOut, pConverter->channelsOut, iChannelOut);
-
-                    if (ma_is_spatial_channel_position(channelPosOut)) {
-                        if (!ma_channel_map_contains_channel_position(pConverter->channelsIn, pConverter->pChannelMapIn, channelPosOut)) {
-                            for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                                ma_channel channelPosIn = ma_channel_map_get_channel(pConverter->pChannelMapIn, pConverter->channelsIn, iChannelIn);
-
-                                if (ma_is_spatial_channel_position(channelPosIn)) {
-                                    float weight = 0;
-                                    if (pConverter->mixingMode == ma_channel_mix_mode_rectangular) {
-                                        weight = ma_calculate_channel_position_rectangular_weight(channelPosIn, channelPosOut);
-                                    }
-
-                                    /* Only apply the weight if we haven't already got some contribution from the respective channels. */
-                                    if (pConverter->format == ma_format_f32) {
-                                        if (pConverter->weights.f32[iChannelIn][iChannelOut] == 0) {
-                                            pConverter->weights.f32[iChannelIn][iChannelOut] = weight;
-                                        }
-                                    } else {
-                                        if (pConverter->weights.s16[iChannelIn][iChannelOut] == 0) {
-                                            pConverter->weights.s16[iChannelIn][iChannelOut] = ma_channel_converter_float_to_fixed(weight);
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-
-                /* If LFE is in the output channel map but was not present in the input channel map, configure its weight now */
-                if (pConfig->calculateLFEFromSpatialChannels) {
-                    if (!ma_channel_map_contains_channel_position(pConverter->channelsIn, pConverter->pChannelMapIn, MA_CHANNEL_LFE)) {
-                        ma_uint32 spatialChannelCount = ma_channel_map_get_spatial_channel_count(pConverter->pChannelMapIn, pConverter->channelsIn);
-                        ma_uint32 iChannelOutLFE;
-
-                        if (spatialChannelCount > 0 && ma_channel_map_find_channel_position(pConverter->channelsOut, pConverter->pChannelMapOut, MA_CHANNEL_LFE, &iChannelOutLFE)) {
-                            const float weightForLFE = 1.0f / spatialChannelCount;
-                            for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                                const ma_channel channelPosIn = ma_channel_map_get_channel(pConverter->pChannelMapIn, pConverter->channelsIn, iChannelIn);
-                                if (ma_is_spatial_channel_position(channelPosIn)) {
-                                    if (pConverter->format == ma_format_f32) {
-                                        if (pConverter->weights.f32[iChannelIn][iChannelOutLFE] == 0) {
-                                            pConverter->weights.f32[iChannelIn][iChannelOutLFE] = weightForLFE;
-                                        }
-                                    } else {
-                                        if (pConverter->weights.s16[iChannelIn][iChannelOutLFE] == 0) {
-                                            pConverter->weights.s16[iChannelIn][iChannelOutLFE] = ma_channel_converter_float_to_fixed(weightForLFE);
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            } break;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_channel_converter_init(const ma_channel_converter_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_channel_converter* pConverter)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_channel_converter_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_channel_converter_init_preallocated(pConfig, pHeap, pConverter);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pConverter->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_channel_converter_uninit(ma_channel_converter* pConverter, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pConverter == NULL) {
-        return;
-    }
-
-    if (pConverter->_ownsHeap) {
-        ma_free(pConverter->_pHeap, pAllocationCallbacks);
-    }
-}
-
-static ma_result ma_channel_converter_process_pcm_frames__passthrough(ma_channel_converter* pConverter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    MA_ASSERT(pConverter != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-    MA_ASSERT(pFramesIn  != NULL);
-
-    ma_copy_memory_64(pFramesOut, pFramesIn, frameCount * ma_get_bytes_per_frame(pConverter->format, pConverter->channelsOut));
-    return MA_SUCCESS;
-}
-
-static ma_result ma_channel_converter_process_pcm_frames__shuffle(ma_channel_converter* pConverter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    MA_ASSERT(pConverter != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-    MA_ASSERT(pFramesIn  != NULL);
-    MA_ASSERT(pConverter->channelsIn == pConverter->channelsOut);
-
-    return ma_channel_map_apply_shuffle_table(pFramesOut, pConverter->channelsOut, pFramesIn, pConverter->channelsIn, frameCount, pConverter->pShuffleTable, pConverter->format);
-}
-
-static ma_result ma_channel_converter_process_pcm_frames__mono_in(ma_channel_converter* pConverter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-
-    MA_ASSERT(pConverter != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-    MA_ASSERT(pFramesIn  != NULL);
-    MA_ASSERT(pConverter->channelsIn == 1);
-
-    switch (pConverter->format)
-    {
-        case ma_format_u8:
-        {
-            /* */ ma_uint8* pFramesOutU8 = (      ma_uint8*)pFramesOut;
-            const ma_uint8* pFramesInU8  = (const ma_uint8*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < pConverter->channelsOut; iChannel += 1) {
-                    pFramesOutU8[iFrame*pConverter->channelsOut + iChannel] = pFramesInU8[iFrame];
-                }
-            }
-        } break;
-
-        case ma_format_s16:
-        {
-            /* */ ma_int16* pFramesOutS16 = (      ma_int16*)pFramesOut;
-            const ma_int16* pFramesInS16  = (const ma_int16*)pFramesIn;
-
-            if (pConverter->channelsOut == 2) {
-                for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                    pFramesOutS16[iFrame*2 + 0] = pFramesInS16[iFrame];
-                    pFramesOutS16[iFrame*2 + 1] = pFramesInS16[iFrame];
-                }
-            } else {
-                for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                    ma_uint32 iChannel;
-                    for (iChannel = 0; iChannel < pConverter->channelsOut; iChannel += 1) {
-                        pFramesOutS16[iFrame*pConverter->channelsOut + iChannel] = pFramesInS16[iFrame];
-                    }
-                }
-            }
-        } break;
-
-        case ma_format_s24:
-        {
-            /* */ ma_uint8* pFramesOutS24 = (      ma_uint8*)pFramesOut;
-            const ma_uint8* pFramesInS24  = (const ma_uint8*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < pConverter->channelsOut; iChannel += 1) {
-                    ma_uint64 iSampleOut = iFrame*pConverter->channelsOut + iChannel;
-                    ma_uint64 iSampleIn  = iFrame;
-                    pFramesOutS24[iSampleOut*3 + 0] = pFramesInS24[iSampleIn*3 + 0];
-                    pFramesOutS24[iSampleOut*3 + 1] = pFramesInS24[iSampleIn*3 + 1];
-                    pFramesOutS24[iSampleOut*3 + 2] = pFramesInS24[iSampleIn*3 + 2];
-                }
-            }
-        } break;
-
-        case ma_format_s32:
-        {
-            /* */ ma_int32* pFramesOutS32 = (      ma_int32*)pFramesOut;
-            const ma_int32* pFramesInS32  = (const ma_int32*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                ma_uint32 iChannel;
-                for (iChannel = 0; iChannel < pConverter->channelsOut; iChannel += 1) {
-                    pFramesOutS32[iFrame*pConverter->channelsOut + iChannel] = pFramesInS32[iFrame];
-                }
-            }
-        } break;
-
-        case ma_format_f32:
-        {
-            /* */ float* pFramesOutF32 = (      float*)pFramesOut;
-            const float* pFramesInF32  = (const float*)pFramesIn;
-
-            if (pConverter->channelsOut == 2) {
-                for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                    pFramesOutF32[iFrame*2 + 0] = pFramesInF32[iFrame];
-                    pFramesOutF32[iFrame*2 + 1] = pFramesInF32[iFrame];
-                }
-            } else {
-                for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                    ma_uint32 iChannel;
-                    for (iChannel = 0; iChannel < pConverter->channelsOut; iChannel += 1) {
-                        pFramesOutF32[iFrame*pConverter->channelsOut + iChannel] = pFramesInF32[iFrame];
-                    }
-                }
-            }
-        } break;
-
-        default: return MA_INVALID_OPERATION;   /* Unknown format. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_channel_converter_process_pcm_frames__mono_out(ma_channel_converter* pConverter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannel;
-
-    MA_ASSERT(pConverter != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-    MA_ASSERT(pFramesIn  != NULL);
-    MA_ASSERT(pConverter->channelsOut == 1);
-
-    switch (pConverter->format)
-    {
-        case ma_format_u8:
-        {
-            /* */ ma_uint8* pFramesOutU8 = (      ma_uint8*)pFramesOut;
-            const ma_uint8* pFramesInU8  = (const ma_uint8*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                ma_int32 t = 0;
-                for (iChannel = 0; iChannel < pConverter->channelsIn; iChannel += 1) {
-                    t += ma_pcm_sample_u8_to_s16_no_scale(pFramesInU8[iFrame*pConverter->channelsIn + iChannel]);
-                }
-
-                pFramesOutU8[iFrame] = ma_clip_u8(t / pConverter->channelsOut);
-            }
-        } break;
-
-        case ma_format_s16:
-        {
-            /* */ ma_int16* pFramesOutS16 = (      ma_int16*)pFramesOut;
-            const ma_int16* pFramesInS16  = (const ma_int16*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                ma_int32 t = 0;
-                for (iChannel = 0; iChannel < pConverter->channelsIn; iChannel += 1) {
-                    t += pFramesInS16[iFrame*pConverter->channelsIn + iChannel];
-                }
-
-                pFramesOutS16[iFrame] = (ma_int16)(t / pConverter->channelsIn);
-            }
-        } break;
-
-        case ma_format_s24:
-        {
-            /* */ ma_uint8* pFramesOutS24 = (      ma_uint8*)pFramesOut;
-            const ma_uint8* pFramesInS24  = (const ma_uint8*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                ma_int64 t = 0;
-                for (iChannel = 0; iChannel < pConverter->channelsIn; iChannel += 1) {
-                    t += ma_pcm_sample_s24_to_s32_no_scale(&pFramesInS24[(iFrame*pConverter->channelsIn + iChannel)*3]);
-                }
-
-                ma_pcm_sample_s32_to_s24_no_scale(t / pConverter->channelsIn, &pFramesOutS24[iFrame*3]);
-            }
-        } break;
-
-        case ma_format_s32:
-        {
-            /* */ ma_int32* pFramesOutS32 = (      ma_int32*)pFramesOut;
-            const ma_int32* pFramesInS32  = (const ma_int32*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                ma_int64 t = 0;
-                for (iChannel = 0; iChannel < pConverter->channelsIn; iChannel += 1) {
-                    t += pFramesInS32[iFrame*pConverter->channelsIn + iChannel];
-                }
-
-                pFramesOutS32[iFrame] = (ma_int32)(t / pConverter->channelsIn);
-            }
-        } break;
-
-        case ma_format_f32:
-        {
-            /* */ float* pFramesOutF32 = (      float*)pFramesOut;
-            const float* pFramesInF32  = (const float*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; ++iFrame) {
-                float t = 0;
-                for (iChannel = 0; iChannel < pConverter->channelsIn; iChannel += 1) {
-                    t += pFramesInF32[iFrame*pConverter->channelsIn + iChannel];
-                }
-
-                pFramesOutF32[iFrame] = t / pConverter->channelsIn;
-            }
-        } break;
-
-        default: return MA_INVALID_OPERATION;   /* Unknown format. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_channel_converter_process_pcm_frames__weights(ma_channel_converter* pConverter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    ma_uint32 iFrame;
-    ma_uint32 iChannelIn;
-    ma_uint32 iChannelOut;
-
-    MA_ASSERT(pConverter != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-    MA_ASSERT(pFramesIn  != NULL);
-
-    /* This is the more complicated case. Each of the output channels is accumulated with 0 or more input channels. */
-
-    /* Clear. */
-    ma_zero_memory_64(pFramesOut, frameCount * ma_get_bytes_per_frame(pConverter->format, pConverter->channelsOut));
-
-    /* Accumulate. */
-    switch (pConverter->format)
-    {
-        case ma_format_u8:
-        {
-            /* */ ma_uint8* pFramesOutU8 = (      ma_uint8*)pFramesOut;
-            const ma_uint8* pFramesInU8  = (const ma_uint8*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                    for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                        ma_int16 u8_O = ma_pcm_sample_u8_to_s16_no_scale(pFramesOutU8[iFrame*pConverter->channelsOut + iChannelOut]);
-                        ma_int16 u8_I = ma_pcm_sample_u8_to_s16_no_scale(pFramesInU8 [iFrame*pConverter->channelsIn  + iChannelIn ]);
-                        ma_int32 s    = (ma_int32)ma_clamp(u8_O + ((u8_I * pConverter->weights.s16[iChannelIn][iChannelOut]) >> MA_CHANNEL_CONVERTER_FIXED_POINT_SHIFT), -128, 127);
-                        pFramesOutU8[iFrame*pConverter->channelsOut + iChannelOut] = ma_clip_u8((ma_int16)s);
-                    }
-                }
-            }
-        } break;
-
-        case ma_format_s16:
-        {
-            /* */ ma_int16* pFramesOutS16 = (      ma_int16*)pFramesOut;
-            const ma_int16* pFramesInS16  = (const ma_int16*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                    for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                        ma_int32 s = pFramesOutS16[iFrame*pConverter->channelsOut + iChannelOut];
-                        s += (pFramesInS16[iFrame*pConverter->channelsIn + iChannelIn] * pConverter->weights.s16[iChannelIn][iChannelOut]) >> MA_CHANNEL_CONVERTER_FIXED_POINT_SHIFT;
-
-                        pFramesOutS16[iFrame*pConverter->channelsOut + iChannelOut] = (ma_int16)ma_clamp(s, -32768, 32767);
-                    }
-                }
-            }
-        } break;
-
-        case ma_format_s24:
-        {
-            /* */ ma_uint8* pFramesOutS24 = (      ma_uint8*)pFramesOut;
-            const ma_uint8* pFramesInS24  = (const ma_uint8*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                    for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                        ma_int64 s24_O = ma_pcm_sample_s24_to_s32_no_scale(&pFramesOutS24[(iFrame*pConverter->channelsOut + iChannelOut)*3]);
-                        ma_int64 s24_I = ma_pcm_sample_s24_to_s32_no_scale(&pFramesInS24 [(iFrame*pConverter->channelsIn  + iChannelIn )*3]);
-                        ma_int64 s24   = (ma_int32)ma_clamp(s24_O + ((s24_I * pConverter->weights.s16[iChannelIn][iChannelOut]) >> MA_CHANNEL_CONVERTER_FIXED_POINT_SHIFT), -8388608, 8388607);
-                        ma_pcm_sample_s32_to_s24_no_scale(s24, &pFramesOutS24[(iFrame*pConverter->channelsOut + iChannelOut)*3]);
-                    }
-                }
-            }
-        } break;
-
-        case ma_format_s32:
-        {
-            /* */ ma_int32* pFramesOutS32 = (      ma_int32*)pFramesOut;
-            const ma_int32* pFramesInS32  = (const ma_int32*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                    for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                        ma_int64 s = pFramesOutS32[iFrame*pConverter->channelsOut + iChannelOut];
-                        s += ((ma_int64)pFramesInS32[iFrame*pConverter->channelsIn + iChannelIn] * pConverter->weights.s16[iChannelIn][iChannelOut]) >> MA_CHANNEL_CONVERTER_FIXED_POINT_SHIFT;
-
-                        pFramesOutS32[iFrame*pConverter->channelsOut + iChannelOut] = ma_clip_s32(s);
-                    }
-                }
-            }
-        } break;
-
-        case ma_format_f32:
-        {
-            /* */ float* pFramesOutF32 = (      float*)pFramesOut;
-            const float* pFramesInF32  = (const float*)pFramesIn;
-
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannelIn = 0; iChannelIn < pConverter->channelsIn; ++iChannelIn) {
-                    for (iChannelOut = 0; iChannelOut < pConverter->channelsOut; ++iChannelOut) {
-                        pFramesOutF32[iFrame*pConverter->channelsOut + iChannelOut] += pFramesInF32[iFrame*pConverter->channelsIn + iChannelIn] * pConverter->weights.f32[iChannelIn][iChannelOut];
-                    }
-                }
-            }
-        } break;
-
-        default: return MA_INVALID_OPERATION;   /* Unknown format. */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_channel_converter_process_pcm_frames(ma_channel_converter* pConverter, void* pFramesOut, const void* pFramesIn, ma_uint64 frameCount)
-{
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFramesOut == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFramesIn == NULL) {
-        ma_zero_memory_64(pFramesOut, frameCount * ma_get_bytes_per_frame(pConverter->format, pConverter->channelsOut));
-        return MA_SUCCESS;
-    }
-
-    switch (pConverter->conversionPath)
-    {
-        case ma_channel_conversion_path_passthrough: return ma_channel_converter_process_pcm_frames__passthrough(pConverter, pFramesOut, pFramesIn, frameCount);
-        case ma_channel_conversion_path_mono_out:    return ma_channel_converter_process_pcm_frames__mono_out(pConverter, pFramesOut, pFramesIn, frameCount);
-        case ma_channel_conversion_path_mono_in:     return ma_channel_converter_process_pcm_frames__mono_in(pConverter, pFramesOut, pFramesIn, frameCount);
-        case ma_channel_conversion_path_shuffle:     return ma_channel_converter_process_pcm_frames__shuffle(pConverter, pFramesOut, pFramesIn, frameCount);
-        case ma_channel_conversion_path_weights:
-        default:
-        {
-            return ma_channel_converter_process_pcm_frames__weights(pConverter, pFramesOut, pFramesIn, frameCount);
-        }
-    }
-}
-
-MA_API ma_result ma_channel_converter_get_input_channel_map(const ma_channel_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    if (pConverter == NULL || pChannelMap == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_channel_map_copy_or_default(pChannelMap, channelMapCap, pConverter->pChannelMapIn, pConverter->channelsIn);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_channel_converter_get_output_channel_map(const ma_channel_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    if (pConverter == NULL || pChannelMap == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_channel_map_copy_or_default(pChannelMap, channelMapCap, pConverter->pChannelMapOut, pConverter->channelsOut);
-
-    return MA_SUCCESS;
-}
-
-
-/**************************************************************************************************************************************************************
-
-Data Conversion
-
-**************************************************************************************************************************************************************/
-MA_API ma_data_converter_config ma_data_converter_config_init_default(void)
-{
-    ma_data_converter_config config;
-    MA_ZERO_OBJECT(&config);
-
-    config.ditherMode = ma_dither_mode_none;
-    config.resampling.algorithm = ma_resample_algorithm_linear;
-    config.allowDynamicSampleRate = MA_FALSE; /* Disable dynamic sample rates by default because dynamic rate adjustments should be quite rare and it allows an optimization for cases when the in and out sample rates are the same. */
-
-    /* Linear resampling defaults. */
-    config.resampling.linear.lpfOrder = 1;
-
-    return config;
-}
-
-MA_API ma_data_converter_config ma_data_converter_config_init(ma_format formatIn, ma_format formatOut, ma_uint32 channelsIn, ma_uint32 channelsOut, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut)
-{
-    ma_data_converter_config config = ma_data_converter_config_init_default();
-    config.formatIn      = formatIn;
-    config.formatOut     = formatOut;
-    config.channelsIn    = channelsIn;
-    config.channelsOut   = channelsOut;
-    config.sampleRateIn  = sampleRateIn;
-    config.sampleRateOut = sampleRateOut;
-
-    return config;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t channelConverterOffset;
-    size_t resamplerOffset;
-} ma_data_converter_heap_layout;
-
-static ma_bool32 ma_data_converter_config_is_resampler_required(const ma_data_converter_config* pConfig)
-{
-    MA_ASSERT(pConfig != NULL);
-
-    return pConfig->allowDynamicSampleRate || pConfig->sampleRateIn != pConfig->sampleRateOut;
-}
-
-static ma_format ma_data_converter_config_get_mid_format(const ma_data_converter_config* pConfig)
-{
-    MA_ASSERT(pConfig != NULL);
-
-    /*
-    We want to avoid as much data conversion as possible. The channel converter and linear
-    resampler both support s16 and f32 natively. We need to decide on the format to use for this
-    stage. We call this the mid format because it's used in the middle stage of the conversion
-    pipeline. If the output format is either s16 or f32 we use that one. If that is not the case it
-    will do the same thing for the input format. If it's neither we just use f32. If we are using a
-    custom resampling backend, we can only guarantee that f32 will be supported so we'll be forced
-    to use that if resampling is required.
-    */
-    if (ma_data_converter_config_is_resampler_required(pConfig) && pConfig->resampling.algorithm != ma_resample_algorithm_linear) {
-        return ma_format_f32;  /* <-- Force f32 since that is the only one we can guarantee will be supported by the resampler. */
-    } else {
-        /*  */ if (pConfig->formatOut == ma_format_s16 || pConfig->formatOut == ma_format_f32) {
-            return pConfig->formatOut;
-        } else if (pConfig->formatIn  == ma_format_s16 || pConfig->formatIn  == ma_format_f32) {
-            return pConfig->formatIn;
-        } else {
-            return ma_format_f32;
-        }
-    }
-}
-
-static ma_channel_converter_config ma_channel_converter_config_init_from_data_converter_config(const ma_data_converter_config* pConfig)
-{
-    ma_channel_converter_config channelConverterConfig;
-
-    MA_ASSERT(pConfig != NULL);
-
-    channelConverterConfig = ma_channel_converter_config_init(ma_data_converter_config_get_mid_format(pConfig), pConfig->channelsIn, pConfig->pChannelMapIn, pConfig->channelsOut, pConfig->pChannelMapOut, pConfig->channelMixMode);
-    channelConverterConfig.ppWeights = pConfig->ppChannelWeights;
-    channelConverterConfig.calculateLFEFromSpatialChannels = pConfig->calculateLFEFromSpatialChannels;
-
-    return channelConverterConfig;
-}
-
-static ma_resampler_config ma_resampler_config_init_from_data_converter_config(const ma_data_converter_config* pConfig)
-{
-    ma_resampler_config resamplerConfig;
-    ma_uint32 resamplerChannels;
-
-    MA_ASSERT(pConfig != NULL);
-
-    /* The resampler is the most expensive part of the conversion process, so we need to do it at the stage where the channel count is at it's lowest. */
-    if (pConfig->channelsIn < pConfig->channelsOut) {
-        resamplerChannels = pConfig->channelsIn;
-    } else {
-        resamplerChannels = pConfig->channelsOut;
-    }
-
-    resamplerConfig = ma_resampler_config_init(ma_data_converter_config_get_mid_format(pConfig), resamplerChannels, pConfig->sampleRateIn, pConfig->sampleRateOut, pConfig->resampling.algorithm);
-    resamplerConfig.linear           = pConfig->resampling.linear;
-    resamplerConfig.pBackendVTable   = pConfig->resampling.pBackendVTable;
-    resamplerConfig.pBackendUserData = pConfig->resampling.pBackendUserData;
-
-    return resamplerConfig;
-}
-
-static ma_result ma_data_converter_get_heap_layout(const ma_data_converter_config* pConfig, ma_data_converter_heap_layout* pHeapLayout)
-{
-    ma_result result;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channelsIn == 0 || pConfig->channelsOut == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Channel converter. */
-    pHeapLayout->channelConverterOffset = pHeapLayout->sizeInBytes;
-    {
-        size_t heapSizeInBytes;
-        ma_channel_converter_config channelConverterConfig = ma_channel_converter_config_init_from_data_converter_config(pConfig);
-
-        result = ma_channel_converter_get_heap_size(&channelConverterConfig, &heapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += heapSizeInBytes;
-    }
-
-    /* Resampler. */
-    pHeapLayout->resamplerOffset = pHeapLayout->sizeInBytes;
-    if (ma_data_converter_config_is_resampler_required(pConfig)) {
-        size_t heapSizeInBytes;
-        ma_resampler_config resamplerConfig = ma_resampler_config_init_from_data_converter_config(pConfig);
-
-        result = ma_resampler_get_heap_size(&resamplerConfig, &heapSizeInBytes);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->sizeInBytes += heapSizeInBytes;
-    }
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_converter_get_heap_size(const ma_data_converter_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_data_converter_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_data_converter_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_converter_init_preallocated(const ma_data_converter_config* pConfig, void* pHeap, ma_data_converter* pConverter)
-{
-    ma_result result;
-    ma_data_converter_heap_layout heapLayout;
-    ma_format midFormat;
-    ma_bool32 isResamplingRequired;
-
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pConverter);
-
-    result = ma_data_converter_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pConverter->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pConverter->formatIn      = pConfig->formatIn;
-    pConverter->formatOut     = pConfig->formatOut;
-    pConverter->channelsIn    = pConfig->channelsIn;
-    pConverter->channelsOut   = pConfig->channelsOut;
-    pConverter->sampleRateIn  = pConfig->sampleRateIn;
-    pConverter->sampleRateOut = pConfig->sampleRateOut;
-    pConverter->ditherMode    = pConfig->ditherMode;
-
-    /*
-    Determine if resampling is required. We need to do this so we can determine an appropriate
-    mid format to use. If resampling is required, the mid format must be ma_format_f32 since
-    that is the only one that is guaranteed to supported by custom resampling backends.
-    */
-    isResamplingRequired = ma_data_converter_config_is_resampler_required(pConfig);
-    midFormat = ma_data_converter_config_get_mid_format(pConfig);
-
-
-    /* Channel converter. We always initialize this, but we check if it configures itself as a passthrough to determine whether or not it's needed. */
-    {
-        ma_channel_converter_config channelConverterConfig = ma_channel_converter_config_init_from_data_converter_config(pConfig);
-
-        result = ma_channel_converter_init_preallocated(&channelConverterConfig, ma_offset_ptr(pHeap, heapLayout.channelConverterOffset), &pConverter->channelConverter);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        /* If the channel converter is not a passthrough we need to enable it. Otherwise we can skip it. */
-        if (pConverter->channelConverter.conversionPath != ma_channel_conversion_path_passthrough) {
-            pConverter->hasChannelConverter = MA_TRUE;
-        }
-    }
-
-
-    /* Resampler. */
-    if (isResamplingRequired) {
-        ma_resampler_config resamplerConfig = ma_resampler_config_init_from_data_converter_config(pConfig);
-
-        result = ma_resampler_init_preallocated(&resamplerConfig, ma_offset_ptr(pHeap, heapLayout.resamplerOffset), &pConverter->resampler);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pConverter->hasResampler = MA_TRUE;
-    }
-
-
-    /* We can simplify pre- and post-format conversion if we have neither channel conversion nor resampling. */
-    if (pConverter->hasChannelConverter == MA_FALSE && pConverter->hasResampler == MA_FALSE) {
-        /* We have neither channel conversion nor resampling so we'll only need one of pre- or post-format conversion, or none if the input and output formats are the same. */
-        if (pConverter->formatIn == pConverter->formatOut) {
-            /* The formats are the same so we can just pass through. */
-            pConverter->hasPreFormatConversion  = MA_FALSE;
-            pConverter->hasPostFormatConversion = MA_FALSE;
-        } else {
-            /* The formats are different so we need to do either pre- or post-format conversion. It doesn't matter which. */
-            pConverter->hasPreFormatConversion  = MA_FALSE;
-            pConverter->hasPostFormatConversion = MA_TRUE;
-        }
-    } else {
-        /* We have a channel converter and/or resampler so we'll need channel conversion based on the mid format. */
-        if (pConverter->formatIn != midFormat) {
-            pConverter->hasPreFormatConversion  = MA_TRUE;
-        }
-        if (pConverter->formatOut != midFormat) {
-            pConverter->hasPostFormatConversion = MA_TRUE;
-        }
-    }
-
-    /* We can enable passthrough optimizations if applicable. Note that we'll only be able to do this if the sample rate is static. */
-    if (pConverter->hasPreFormatConversion  == MA_FALSE &&
-        pConverter->hasPostFormatConversion == MA_FALSE &&
-        pConverter->hasChannelConverter     == MA_FALSE &&
-        pConverter->hasResampler            == MA_FALSE) {
-        pConverter->isPassthrough = MA_TRUE;
-    }
-
-
-    /* We now need to determine our execution path. */
-    if (pConverter->isPassthrough) {
-        pConverter->executionPath = ma_data_converter_execution_path_passthrough;
-    } else {
-        if (pConverter->channelsIn < pConverter->channelsOut) {
-            /* Do resampling first, if necessary. */
-            MA_ASSERT(pConverter->hasChannelConverter == MA_TRUE);
-
-            if (pConverter->hasResampler) {
-                pConverter->executionPath = ma_data_converter_execution_path_resample_first;
-            } else {
-                pConverter->executionPath = ma_data_converter_execution_path_channels_only;
-            }
-        } else {
-            /* Do channel conversion first, if necessary. */
-            if (pConverter->hasChannelConverter) {
-                if (pConverter->hasResampler) {
-                    pConverter->executionPath = ma_data_converter_execution_path_channels_first;
-                } else {
-                    pConverter->executionPath = ma_data_converter_execution_path_channels_only;
-                }
-            } else {
-                /* Channel routing not required. */
-                if (pConverter->hasResampler) {
-                    pConverter->executionPath = ma_data_converter_execution_path_resample_only;
-                } else {
-                    pConverter->executionPath = ma_data_converter_execution_path_format_only;
-                }
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_converter_init(const ma_data_converter_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_converter* pConverter)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_data_converter_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_data_converter_init_preallocated(pConfig, pHeap, pConverter);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pConverter->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_data_converter_uninit(ma_data_converter* pConverter, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pConverter == NULL) {
-        return;
-    }
-
-    if (pConverter->hasResampler) {
-        ma_resampler_uninit(&pConverter->resampler, pAllocationCallbacks);
-    }
-
-    ma_channel_converter_uninit(&pConverter->channelConverter, pAllocationCallbacks);
-
-    if (pConverter->_ownsHeap) {
-        ma_free(pConverter->_pHeap, pAllocationCallbacks);
-    }
-}
-
-static ma_result ma_data_converter_process_pcm_frames__passthrough(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 frameCount;
-
-    MA_ASSERT(pConverter != NULL);
-
-    frameCountIn = 0;
-    if (pFrameCountIn != NULL) {
-        frameCountIn = *pFrameCountIn;
-    }
-
-    frameCountOut = 0;
-    if (pFrameCountOut != NULL) {
-        frameCountOut = *pFrameCountOut;
-    }
-
-    frameCount = ma_min(frameCountIn, frameCountOut);
-
-    if (pFramesOut != NULL) {
-        if (pFramesIn != NULL) {
-            ma_copy_memory_64(pFramesOut, pFramesIn, frameCount * ma_get_bytes_per_frame(pConverter->formatOut, pConverter->channelsOut));
-        } else {
-            ma_zero_memory_64(pFramesOut,            frameCount * ma_get_bytes_per_frame(pConverter->formatOut, pConverter->channelsOut));
-        }
-    }
-
-    if (pFrameCountIn != NULL) {
-        *pFrameCountIn = frameCount;
-    }
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = frameCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_data_converter_process_pcm_frames__format_only(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 frameCount;
-
-    MA_ASSERT(pConverter != NULL);
-
-    frameCountIn = 0;
-    if (pFrameCountIn != NULL) {
-        frameCountIn = *pFrameCountIn;
-    }
-
-    frameCountOut = 0;
-    if (pFrameCountOut != NULL) {
-        frameCountOut = *pFrameCountOut;
-    }
-
-    frameCount = ma_min(frameCountIn, frameCountOut);
-
-    if (pFramesOut != NULL) {
-        if (pFramesIn != NULL) {
-            ma_convert_pcm_frames_format(pFramesOut, pConverter->formatOut, pFramesIn, pConverter->formatIn, frameCount, pConverter->channelsIn, pConverter->ditherMode);
-        } else {
-            ma_zero_memory_64(pFramesOut, frameCount * ma_get_bytes_per_frame(pConverter->formatOut, pConverter->channelsOut));
-        }
-    }
-
-    if (pFrameCountIn != NULL) {
-        *pFrameCountIn = frameCount;
-    }
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = frameCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_data_converter_process_pcm_frames__resample_with_format_conversion(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 framesProcessedIn;
-    ma_uint64 framesProcessedOut;
-
-    MA_ASSERT(pConverter != NULL);
-
-    frameCountIn = 0;
-    if (pFrameCountIn != NULL) {
-        frameCountIn = *pFrameCountIn;
-    }
-
-    frameCountOut = 0;
-    if (pFrameCountOut != NULL) {
-        frameCountOut = *pFrameCountOut;
-    }
-
-    framesProcessedIn  = 0;
-    framesProcessedOut = 0;
-
-    while (framesProcessedOut < frameCountOut) {
-        ma_uint8 pTempBufferOut[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-        const ma_uint32 tempBufferOutCap = sizeof(pTempBufferOut) / ma_get_bytes_per_frame(pConverter->resampler.format, pConverter->resampler.channels);
-        const void* pFramesInThisIteration;
-        /* */ void* pFramesOutThisIteration;
-        ma_uint64 frameCountInThisIteration;
-        ma_uint64 frameCountOutThisIteration;
-
-        if (pFramesIn != NULL) {
-            pFramesInThisIteration = ma_offset_ptr(pFramesIn, framesProcessedIn * ma_get_bytes_per_frame(pConverter->formatIn, pConverter->channelsIn));
-        } else {
-            pFramesInThisIteration = NULL;
-        }
-
-        if (pFramesOut != NULL) {
-            pFramesOutThisIteration = ma_offset_ptr(pFramesOut, framesProcessedOut * ma_get_bytes_per_frame(pConverter->formatOut, pConverter->channelsOut));
-        } else {
-            pFramesOutThisIteration = NULL;
-        }
-
-        /* Do a pre format conversion if necessary. */
-        if (pConverter->hasPreFormatConversion) {
-            ma_uint8 pTempBufferIn[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-            const ma_uint32 tempBufferInCap = sizeof(pTempBufferIn) / ma_get_bytes_per_frame(pConverter->resampler.format, pConverter->resampler.channels);
-
-            frameCountInThisIteration  = (frameCountIn - framesProcessedIn);
-            if (frameCountInThisIteration > tempBufferInCap) {
-                frameCountInThisIteration = tempBufferInCap;
-            }
-
-            if (pConverter->hasPostFormatConversion) {
-               if (frameCountInThisIteration > tempBufferOutCap) {
-                   frameCountInThisIteration = tempBufferOutCap;
-               }
-            }
-
-            if (pFramesInThisIteration != NULL) {
-                ma_convert_pcm_frames_format(pTempBufferIn, pConverter->resampler.format, pFramesInThisIteration, pConverter->formatIn, frameCountInThisIteration, pConverter->channelsIn, pConverter->ditherMode);
-            } else {
-                MA_ZERO_MEMORY(pTempBufferIn, sizeof(pTempBufferIn));
-            }
-
-            frameCountOutThisIteration = (frameCountOut - framesProcessedOut);
-
-            if (pConverter->hasPostFormatConversion) {
-                /* Both input and output conversion required. Output to the temp buffer. */
-                if (frameCountOutThisIteration > tempBufferOutCap) {
-                    frameCountOutThisIteration = tempBufferOutCap;
-                }
-
-                result = ma_resampler_process_pcm_frames(&pConverter->resampler, pTempBufferIn, &frameCountInThisIteration, pTempBufferOut, &frameCountOutThisIteration);
-            } else {
-                /* Only pre-format required. Output straight to the output buffer. */
-                result = ma_resampler_process_pcm_frames(&pConverter->resampler, pTempBufferIn, &frameCountInThisIteration, pFramesOutThisIteration, &frameCountOutThisIteration);
-            }
-
-            if (result != MA_SUCCESS) {
-                break;
-            }
-        } else {
-            /* No pre-format required. Just read straight from the input buffer. */
-            MA_ASSERT(pConverter->hasPostFormatConversion == MA_TRUE);
-
-            frameCountInThisIteration  = (frameCountIn  - framesProcessedIn);
-            frameCountOutThisIteration = (frameCountOut - framesProcessedOut);
-            if (frameCountOutThisIteration > tempBufferOutCap) {
-                frameCountOutThisIteration = tempBufferOutCap;
-            }
-
-            result = ma_resampler_process_pcm_frames(&pConverter->resampler, pFramesInThisIteration, &frameCountInThisIteration, pTempBufferOut, &frameCountOutThisIteration);
-            if (result != MA_SUCCESS) {
-                break;
-            }
-        }
-
-        /* If we are doing a post format conversion we need to do that now. */
-        if (pConverter->hasPostFormatConversion) {
-            if (pFramesOutThisIteration != NULL) {
-                ma_convert_pcm_frames_format(pFramesOutThisIteration, pConverter->formatOut, pTempBufferOut, pConverter->resampler.format, frameCountOutThisIteration, pConverter->resampler.channels, pConverter->ditherMode);
-            }
-        }
-
-        framesProcessedIn  += frameCountInThisIteration;
-        framesProcessedOut += frameCountOutThisIteration;
-
-        MA_ASSERT(framesProcessedIn  <= frameCountIn);
-        MA_ASSERT(framesProcessedOut <= frameCountOut);
-
-        if (frameCountOutThisIteration == 0) {
-            break;  /* Consumed all of our input data. */
-        }
-    }
-
-    if (pFrameCountIn != NULL) {
-        *pFrameCountIn = framesProcessedIn;
-    }
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = framesProcessedOut;
-    }
-
-    return result;
-}
-
-static ma_result ma_data_converter_process_pcm_frames__resample_only(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    MA_ASSERT(pConverter != NULL);
-
-    if (pConverter->hasPreFormatConversion == MA_FALSE && pConverter->hasPostFormatConversion == MA_FALSE) {
-        /* Neither pre- nor post-format required. This is simple case where only resampling is required. */
-        return ma_resampler_process_pcm_frames(&pConverter->resampler, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    } else {
-        /* Format conversion required. */
-        return ma_data_converter_process_pcm_frames__resample_with_format_conversion(pConverter, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-    }
-}
-
-static ma_result ma_data_converter_process_pcm_frames__channels_only(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    ma_result result;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 frameCount;
-
-    MA_ASSERT(pConverter != NULL);
-
-    frameCountIn = 0;
-    if (pFrameCountIn != NULL) {
-        frameCountIn = *pFrameCountIn;
-    }
-
-    frameCountOut = 0;
-    if (pFrameCountOut != NULL) {
-        frameCountOut = *pFrameCountOut;
-    }
-
-    frameCount = ma_min(frameCountIn, frameCountOut);
-
-    if (pConverter->hasPreFormatConversion == MA_FALSE && pConverter->hasPostFormatConversion == MA_FALSE) {
-        /* No format conversion required. */
-        result = ma_channel_converter_process_pcm_frames(&pConverter->channelConverter, pFramesOut, pFramesIn, frameCount);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    } else {
-        /* Format conversion required. */
-        ma_uint64 framesProcessed = 0;
-
-        while (framesProcessed < frameCount) {
-            ma_uint8 pTempBufferOut[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-            const ma_uint32 tempBufferOutCap = sizeof(pTempBufferOut) / ma_get_bytes_per_frame(pConverter->channelConverter.format, pConverter->channelConverter.channelsOut);
-            const void* pFramesInThisIteration;
-            /* */ void* pFramesOutThisIteration;
-            ma_uint64 frameCountThisIteration;
-
-            if (pFramesIn != NULL) {
-                pFramesInThisIteration = ma_offset_ptr(pFramesIn, framesProcessed * ma_get_bytes_per_frame(pConverter->formatIn, pConverter->channelsIn));
-            } else {
-                pFramesInThisIteration = NULL;
-            }
-
-            if (pFramesOut != NULL) {
-                pFramesOutThisIteration = ma_offset_ptr(pFramesOut, framesProcessed * ma_get_bytes_per_frame(pConverter->formatOut, pConverter->channelsOut));
-            } else {
-                pFramesOutThisIteration = NULL;
-            }
-
-            /* Do a pre format conversion if necessary. */
-            if (pConverter->hasPreFormatConversion) {
-                ma_uint8 pTempBufferIn[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-                const ma_uint32 tempBufferInCap = sizeof(pTempBufferIn) / ma_get_bytes_per_frame(pConverter->channelConverter.format, pConverter->channelConverter.channelsIn);
-
-                frameCountThisIteration = (frameCount - framesProcessed);
-                if (frameCountThisIteration > tempBufferInCap) {
-                    frameCountThisIteration = tempBufferInCap;
-                }
-
-                if (pConverter->hasPostFormatConversion) {
-                    if (frameCountThisIteration > tempBufferOutCap) {
-                        frameCountThisIteration = tempBufferOutCap;
-                    }
-                }
-
-                if (pFramesInThisIteration != NULL) {
-                    ma_convert_pcm_frames_format(pTempBufferIn, pConverter->channelConverter.format, pFramesInThisIteration, pConverter->formatIn, frameCountThisIteration, pConverter->channelsIn, pConverter->ditherMode);
-                } else {
-                    MA_ZERO_MEMORY(pTempBufferIn, sizeof(pTempBufferIn));
-                }
-
-                if (pConverter->hasPostFormatConversion) {
-                    /* Both input and output conversion required. Output to the temp buffer. */
-                    result = ma_channel_converter_process_pcm_frames(&pConverter->channelConverter, pTempBufferOut, pTempBufferIn, frameCountThisIteration);
-                } else {
-                    /* Only pre-format required. Output straight to the output buffer. */
-                    result = ma_channel_converter_process_pcm_frames(&pConverter->channelConverter, pFramesOutThisIteration, pTempBufferIn, frameCountThisIteration);
-                }
-
-                if (result != MA_SUCCESS) {
-                    break;
-                }
-            } else {
-                /* No pre-format required. Just read straight from the input buffer. */
-                MA_ASSERT(pConverter->hasPostFormatConversion == MA_TRUE);
-
-                frameCountThisIteration = (frameCount - framesProcessed);
-                if (frameCountThisIteration > tempBufferOutCap) {
-                    frameCountThisIteration = tempBufferOutCap;
-                }
-
-                result = ma_channel_converter_process_pcm_frames(&pConverter->channelConverter, pTempBufferOut, pFramesInThisIteration, frameCountThisIteration);
-                if (result != MA_SUCCESS) {
-                    break;
-                }
-            }
-
-            /* If we are doing a post format conversion we need to do that now. */
-            if (pConverter->hasPostFormatConversion) {
-                if (pFramesOutThisIteration != NULL) {
-                    ma_convert_pcm_frames_format(pFramesOutThisIteration, pConverter->formatOut, pTempBufferOut, pConverter->channelConverter.format, frameCountThisIteration, pConverter->channelConverter.channelsOut, pConverter->ditherMode);
-                }
-            }
-
-            framesProcessed += frameCountThisIteration;
-        }
-    }
-
-    if (pFrameCountIn != NULL) {
-        *pFrameCountIn = frameCount;
-    }
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = frameCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_data_converter_process_pcm_frames__resample_first(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    ma_result result;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 framesProcessedIn;
-    ma_uint64 framesProcessedOut;
-    ma_uint8  pTempBufferIn[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];   /* In resampler format. */
-    ma_uint64 tempBufferInCap;
-    ma_uint8  pTempBufferMid[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];  /* In resampler format, channel converter input format. */
-    ma_uint64 tempBufferMidCap;
-    ma_uint8  pTempBufferOut[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];  /* In channel converter output format. */
-    ma_uint64 tempBufferOutCap;
-
-    MA_ASSERT(pConverter != NULL);
-    MA_ASSERT(pConverter->resampler.format   == pConverter->channelConverter.format);
-    MA_ASSERT(pConverter->resampler.channels == pConverter->channelConverter.channelsIn);
-    MA_ASSERT(pConverter->resampler.channels <  pConverter->channelConverter.channelsOut);
-
-    frameCountIn = 0;
-    if (pFrameCountIn != NULL) {
-        frameCountIn = *pFrameCountIn;
-    }
-
-    frameCountOut = 0;
-    if (pFrameCountOut != NULL) {
-        frameCountOut = *pFrameCountOut;
-    }
-
-    framesProcessedIn  = 0;
-    framesProcessedOut = 0;
-
-    tempBufferInCap  = sizeof(pTempBufferIn)  / ma_get_bytes_per_frame(pConverter->resampler.format, pConverter->resampler.channels);
-    tempBufferMidCap = sizeof(pTempBufferIn)  / ma_get_bytes_per_frame(pConverter->resampler.format, pConverter->resampler.channels);
-    tempBufferOutCap = sizeof(pTempBufferOut) / ma_get_bytes_per_frame(pConverter->channelConverter.format, pConverter->channelConverter.channelsOut);
-
-    while (framesProcessedOut < frameCountOut) {
-        ma_uint64 frameCountInThisIteration;
-        ma_uint64 frameCountOutThisIteration;
-        const void* pRunningFramesIn = NULL;
-        void* pRunningFramesOut = NULL;
-        const void* pResampleBufferIn;
-        void* pChannelsBufferOut;
-
-        if (pFramesIn != NULL) {
-            pRunningFramesIn  = ma_offset_ptr(pFramesIn,  framesProcessedIn  * ma_get_bytes_per_frame(pConverter->formatIn, pConverter->channelsIn));
-        }
-        if (pFramesOut != NULL) {
-            pRunningFramesOut = ma_offset_ptr(pFramesOut, framesProcessedOut * ma_get_bytes_per_frame(pConverter->formatOut, pConverter->channelsOut));
-        }
-
-        /* Run input data through the resampler and output it to the temporary buffer. */
-        frameCountInThisIteration = (frameCountIn - framesProcessedIn);
-
-        if (pConverter->hasPreFormatConversion) {
-            if (frameCountInThisIteration > tempBufferInCap) {
-                frameCountInThisIteration = tempBufferInCap;
-            }
-        }
-
-        frameCountOutThisIteration = (frameCountOut - framesProcessedOut);
-        if (frameCountOutThisIteration > tempBufferMidCap) {
-            frameCountOutThisIteration = tempBufferMidCap;
-        }
-
-        /* We can't read more frames than can fit in the output buffer. */
-        if (pConverter->hasPostFormatConversion) {
-            if (frameCountOutThisIteration > tempBufferOutCap) {
-                frameCountOutThisIteration = tempBufferOutCap;
-            }
-        }
-
-        /* We need to ensure we don't try to process too many input frames that we run out of room in the output buffer. If this happens we'll end up glitching. */
-
-        /*
-        We need to try to predict how many input frames will be required for the resampler. If the
-        resampler can tell us, we'll use that. Otherwise we'll need to make a best guess. The further
-        off we are from this, the more wasted format conversions we'll end up doing.
-        */
-        #if 1
-        {
-            ma_uint64 requiredInputFrameCount;
-
-            result = ma_resampler_get_required_input_frame_count(&pConverter->resampler, frameCountOutThisIteration, &requiredInputFrameCount);
-            if (result != MA_SUCCESS) {
-                /* Fall back to a best guess. */
-                requiredInputFrameCount = (frameCountOutThisIteration * pConverter->resampler.sampleRateIn) / pConverter->resampler.sampleRateOut;
-            }
-
-            if (frameCountInThisIteration > requiredInputFrameCount) {
-                frameCountInThisIteration = requiredInputFrameCount;
-            }
-        }
-        #endif
-
-        if (pConverter->hasPreFormatConversion) {
-            if (pFramesIn != NULL) {
-                ma_convert_pcm_frames_format(pTempBufferIn, pConverter->resampler.format, pRunningFramesIn, pConverter->formatIn, frameCountInThisIteration, pConverter->channelsIn, pConverter->ditherMode);
-                pResampleBufferIn = pTempBufferIn;
-            } else {
-                pResampleBufferIn = NULL;
-            }
-        } else {
-            pResampleBufferIn = pRunningFramesIn;
-        }
-
-        result = ma_resampler_process_pcm_frames(&pConverter->resampler, pResampleBufferIn, &frameCountInThisIteration, pTempBufferMid, &frameCountOutThisIteration);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-
-        /*
-        The input data has been resampled so now we need to run it through the channel converter. The input data is always contained in pTempBufferMid. We only need to do
-        this part if we have an output buffer.
-        */
-        if (pFramesOut != NULL) {
-            if (pConverter->hasPostFormatConversion) {
-                pChannelsBufferOut = pTempBufferOut;
-            } else {
-                pChannelsBufferOut = pRunningFramesOut;
-            }
-
-            result = ma_channel_converter_process_pcm_frames(&pConverter->channelConverter, pChannelsBufferOut, pTempBufferMid, frameCountOutThisIteration);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-
-            /* Finally we do post format conversion. */
-            if (pConverter->hasPostFormatConversion) {
-                ma_convert_pcm_frames_format(pRunningFramesOut, pConverter->formatOut, pChannelsBufferOut, pConverter->channelConverter.format, frameCountOutThisIteration, pConverter->channelConverter.channelsOut, pConverter->ditherMode);
-            }
-        }
-
-
-        framesProcessedIn  += frameCountInThisIteration;
-        framesProcessedOut += frameCountOutThisIteration;
-
-        MA_ASSERT(framesProcessedIn  <= frameCountIn);
-        MA_ASSERT(framesProcessedOut <= frameCountOut);
-
-        if (frameCountOutThisIteration == 0) {
-            break;  /* Consumed all of our input data. */
-        }
-    }
-
-    if (pFrameCountIn != NULL) {
-        *pFrameCountIn = framesProcessedIn;
-    }
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = framesProcessedOut;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_data_converter_process_pcm_frames__channels_first(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    ma_result result;
-    ma_uint64 frameCountIn;
-    ma_uint64 frameCountOut;
-    ma_uint64 framesProcessedIn;
-    ma_uint64 framesProcessedOut;
-    ma_uint8  pTempBufferIn[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];   /* In resampler format. */
-    ma_uint64 tempBufferInCap;
-    ma_uint8  pTempBufferMid[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];  /* In resampler format, channel converter input format. */
-    ma_uint64 tempBufferMidCap;
-    ma_uint8  pTempBufferOut[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];  /* In channel converter output format. */
-    ma_uint64 tempBufferOutCap;
-
-    MA_ASSERT(pConverter != NULL);
-    MA_ASSERT(pConverter->resampler.format   == pConverter->channelConverter.format);
-    MA_ASSERT(pConverter->resampler.channels == pConverter->channelConverter.channelsOut);
-    MA_ASSERT(pConverter->resampler.channels <= pConverter->channelConverter.channelsIn);
-
-    frameCountIn = 0;
-    if (pFrameCountIn != NULL) {
-        frameCountIn = *pFrameCountIn;
-    }
-
-    frameCountOut = 0;
-    if (pFrameCountOut != NULL) {
-        frameCountOut = *pFrameCountOut;
-    }
-
-    framesProcessedIn  = 0;
-    framesProcessedOut = 0;
-
-    tempBufferInCap  = sizeof(pTempBufferIn)  / ma_get_bytes_per_frame(pConverter->channelConverter.format, pConverter->channelConverter.channelsIn);
-    tempBufferMidCap = sizeof(pTempBufferIn)  / ma_get_bytes_per_frame(pConverter->channelConverter.format, pConverter->channelConverter.channelsOut);
-    tempBufferOutCap = sizeof(pTempBufferOut) / ma_get_bytes_per_frame(pConverter->resampler.format, pConverter->resampler.channels);
-
-    while (framesProcessedOut < frameCountOut) {
-        ma_uint64 frameCountInThisIteration;
-        ma_uint64 frameCountOutThisIteration;
-        const void* pRunningFramesIn = NULL;
-        void* pRunningFramesOut = NULL;
-        const void* pChannelsBufferIn;
-        void* pResampleBufferOut;
-
-        if (pFramesIn != NULL) {
-            pRunningFramesIn  = ma_offset_ptr(pFramesIn,  framesProcessedIn  * ma_get_bytes_per_frame(pConverter->formatIn, pConverter->channelsIn));
-        }
-        if (pFramesOut != NULL) {
-            pRunningFramesOut = ma_offset_ptr(pFramesOut, framesProcessedOut * ma_get_bytes_per_frame(pConverter->formatOut, pConverter->channelsOut));
-        }
-
-        /*
-        Before doing any processing we need to determine how many frames we should try processing
-        this iteration, for both input and output. The resampler requires us to perform format and
-        channel conversion before passing any data into it. If we get our input count wrong, we'll
-        end up peforming redundant pre-processing. This isn't the end of the world, but it does
-        result in some inefficiencies proportionate to how far our estimates are off.
-
-        If the resampler has a means to calculate exactly how much we'll need, we'll use that.
-        Otherwise we'll make a best guess. In order to do this, we'll need to calculate the output
-        frame count first.
-        */
-        frameCountOutThisIteration = (frameCountOut - framesProcessedOut);
-        if (frameCountOutThisIteration > tempBufferMidCap) {
-            frameCountOutThisIteration = tempBufferMidCap;
-        }
-
-        if (pConverter->hasPostFormatConversion) {
-            if (frameCountOutThisIteration > tempBufferOutCap) {
-                frameCountOutThisIteration = tempBufferOutCap;
-            }
-        }
-
-        /* Now that we have the output frame count we can determine the input frame count. */
-        frameCountInThisIteration = (frameCountIn - framesProcessedIn);
-        if (pConverter->hasPreFormatConversion) {
-            if (frameCountInThisIteration > tempBufferInCap) {
-                frameCountInThisIteration = tempBufferInCap;
-            }
-        }
-
-        if (frameCountInThisIteration > tempBufferMidCap) {
-            frameCountInThisIteration = tempBufferMidCap;
-        }
-
-        #if 1
-        {
-            ma_uint64 requiredInputFrameCount;
-
-            result = ma_resampler_get_required_input_frame_count(&pConverter->resampler, frameCountOutThisIteration, &requiredInputFrameCount);
-            if (result != MA_SUCCESS) {
-                /* Fall back to a best guess. */
-                requiredInputFrameCount = (frameCountOutThisIteration * pConverter->resampler.sampleRateIn) / pConverter->resampler.sampleRateOut;
-            }
-
-            if (frameCountInThisIteration > requiredInputFrameCount) {
-                frameCountInThisIteration = requiredInputFrameCount;
-            }
-        }
-        #endif
-
-
-        /* Pre format conversion. */
-        if (pConverter->hasPreFormatConversion) {
-            if (pRunningFramesIn != NULL) {
-                ma_convert_pcm_frames_format(pTempBufferIn, pConverter->channelConverter.format, pRunningFramesIn, pConverter->formatIn, frameCountInThisIteration, pConverter->channelsIn, pConverter->ditherMode);
-                pChannelsBufferIn = pTempBufferIn;
-            } else {
-                pChannelsBufferIn = NULL;
-            }
-        } else {
-            pChannelsBufferIn = pRunningFramesIn;
-        }
-
-
-        /* Channel conversion. */
-        result = ma_channel_converter_process_pcm_frames(&pConverter->channelConverter, pTempBufferMid, pChannelsBufferIn, frameCountInThisIteration);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-
-        /* Resampling. */
-        if (pConverter->hasPostFormatConversion) {
-            pResampleBufferOut = pTempBufferOut;
-        } else {
-            pResampleBufferOut = pRunningFramesOut;
-        }
-
-        result = ma_resampler_process_pcm_frames(&pConverter->resampler, pTempBufferMid, &frameCountInThisIteration, pResampleBufferOut, &frameCountOutThisIteration);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-
-        /* Post format conversion. */
-        if (pConverter->hasPostFormatConversion) {
-            if (pRunningFramesOut != NULL) {
-                ma_convert_pcm_frames_format(pRunningFramesOut, pConverter->formatOut, pResampleBufferOut, pConverter->resampler.format, frameCountOutThisIteration, pConverter->channelsOut, pConverter->ditherMode);
-            }
-        }
-
-
-        framesProcessedIn  += frameCountInThisIteration;
-        framesProcessedOut += frameCountOutThisIteration;
-
-        MA_ASSERT(framesProcessedIn  <= frameCountIn);
-        MA_ASSERT(framesProcessedOut <= frameCountOut);
-
-        if (frameCountOutThisIteration == 0) {
-            break;  /* Consumed all of our input data. */
-        }
-    }
-
-    if (pFrameCountIn != NULL) {
-        *pFrameCountIn = framesProcessedIn;
-    }
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = framesProcessedOut;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_converter_process_pcm_frames(ma_data_converter* pConverter, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
-{
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    switch (pConverter->executionPath)
-    {
-        case ma_data_converter_execution_path_passthrough:    return ma_data_converter_process_pcm_frames__passthrough(pConverter, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-        case ma_data_converter_execution_path_format_only:    return ma_data_converter_process_pcm_frames__format_only(pConverter, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-        case ma_data_converter_execution_path_channels_only:  return ma_data_converter_process_pcm_frames__channels_only(pConverter, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-        case ma_data_converter_execution_path_resample_only:  return ma_data_converter_process_pcm_frames__resample_only(pConverter, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-        case ma_data_converter_execution_path_resample_first: return ma_data_converter_process_pcm_frames__resample_first(pConverter, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-        case ma_data_converter_execution_path_channels_first: return ma_data_converter_process_pcm_frames__channels_first(pConverter, pFramesIn, pFrameCountIn, pFramesOut, pFrameCountOut);
-        default: return MA_INVALID_OPERATION;   /* Should never hit this. */
-    }
-}
-
-MA_API ma_result ma_data_converter_set_rate(ma_data_converter* pConverter, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut)
-{
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConverter->hasResampler == MA_FALSE) {
-        return MA_INVALID_OPERATION;    /* Dynamic resampling not enabled. */
-    }
-
-    return ma_resampler_set_rate(&pConverter->resampler, sampleRateIn, sampleRateOut);
-}
-
-MA_API ma_result ma_data_converter_set_rate_ratio(ma_data_converter* pConverter, float ratioInOut)
-{
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConverter->hasResampler == MA_FALSE) {
-        return MA_INVALID_OPERATION;    /* Dynamic resampling not enabled. */
-    }
-
-    return ma_resampler_set_rate_ratio(&pConverter->resampler, ratioInOut);
-}
-
-MA_API ma_uint64 ma_data_converter_get_input_latency(const ma_data_converter* pConverter)
-{
-    if (pConverter == NULL) {
-        return 0;
-    }
-
-    if (pConverter->hasResampler) {
-        return ma_resampler_get_input_latency(&pConverter->resampler);
-    }
-
-    return 0;   /* No latency without a resampler. */
-}
-
-MA_API ma_uint64 ma_data_converter_get_output_latency(const ma_data_converter* pConverter)
-{
-    if (pConverter == NULL) {
-        return 0;
-    }
-
-    if (pConverter->hasResampler) {
-        return ma_resampler_get_output_latency(&pConverter->resampler);
-    }
-
-    return 0;   /* No latency without a resampler. */
-}
-
-MA_API ma_result ma_data_converter_get_required_input_frame_count(const ma_data_converter* pConverter, ma_uint64 outputFrameCount, ma_uint64* pInputFrameCount)
-{
-    if (pInputFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pInputFrameCount = 0;
-
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConverter->hasResampler) {
-        return ma_resampler_get_required_input_frame_count(&pConverter->resampler, outputFrameCount, pInputFrameCount);
-    } else {
-        *pInputFrameCount = outputFrameCount;   /* 1:1 */
-        return MA_SUCCESS;
-    }
-}
-
-MA_API ma_result ma_data_converter_get_expected_output_frame_count(const ma_data_converter* pConverter, ma_uint64 inputFrameCount, ma_uint64* pOutputFrameCount)
-{
-    if (pOutputFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pOutputFrameCount = 0;
-
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConverter->hasResampler) {
-        return ma_resampler_get_expected_output_frame_count(&pConverter->resampler, inputFrameCount, pOutputFrameCount);
-    } else {
-        *pOutputFrameCount = inputFrameCount;   /* 1:1 */
-        return MA_SUCCESS;
-    }
-}
-
-MA_API ma_result ma_data_converter_get_input_channel_map(const ma_data_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    if (pConverter == NULL || pChannelMap == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConverter->hasChannelConverter) {
-        ma_channel_converter_get_output_channel_map(&pConverter->channelConverter, pChannelMap, channelMapCap);
-    } else {
-        ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pConverter->channelsOut);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_converter_get_output_channel_map(const ma_data_converter* pConverter, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    if (pConverter == NULL || pChannelMap == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConverter->hasChannelConverter) {
-        ma_channel_converter_get_input_channel_map(&pConverter->channelConverter, pChannelMap, channelMapCap);
-    } else {
-        ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pConverter->channelsIn);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_converter_reset(ma_data_converter* pConverter)
-{
-    if (pConverter == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* There's nothing to do if we're not resampling. */
-    if (pConverter->hasResampler == MA_FALSE) {
-        return MA_SUCCESS;
-    }
-
-    return ma_resampler_reset(&pConverter->resampler);
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-Channel Maps
-
-**************************************************************************************************************************************************************/
-static ma_channel ma_channel_map_init_standard_channel(ma_standard_channel_map standardChannelMap, ma_uint32 channelCount, ma_uint32 channelIndex);
-
-MA_API ma_channel ma_channel_map_get_channel(const ma_channel* pChannelMap, ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    if (pChannelMap == NULL) {
-        return ma_channel_map_init_standard_channel(ma_standard_channel_map_default, channelCount, channelIndex);
-    } else {
-        if (channelIndex >= channelCount) {
-            return MA_CHANNEL_NONE;
-        }
-
-        return pChannelMap[channelIndex];
-    }
-}
-
-MA_API void ma_channel_map_init_blank(ma_channel* pChannelMap, ma_uint32 channels)
-{
-    if (pChannelMap == NULL) {
-        return;
-    }
-
-    MA_ZERO_MEMORY(pChannelMap, sizeof(*pChannelMap) * channels);
-}
-
-
-static ma_channel ma_channel_map_init_standard_channel_microsoft(ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    if (channelCount == 0 || channelIndex >= channelCount) {
-        return MA_CHANNEL_NONE;
-    }
-
-    /* This is the Microsoft channel map. Based off the speaker configurations mentioned here: https://docs.microsoft.com/en-us/windows-hardware/drivers/ddi/content/ksmedia/ns-ksmedia-ksaudio_channel_config */
-    switch (channelCount)
-    {
-        case 0: return MA_CHANNEL_NONE;
-
-        case 1:
-        {
-            return MA_CHANNEL_MONO;
-        } break;
-
-        case 2:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 3: /* No defined, but best guess. */
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 4:
-        {
-            switch (channelIndex) {
-            #ifndef MA_USE_QUAD_MICROSOFT_CHANNEL_MAP
-                /* Surround. Using the Surround profile has the advantage of the 3rd channel (MA_CHANNEL_FRONT_CENTER) mapping nicely with higher channel counts. */
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_BACK_CENTER;
-            #else
-                /* Quad. */
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-            #endif
-            }
-        } break;
-
-        case 5: /* Not defined, but best guess. */
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_BACK_LEFT;
-                case 4: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 6:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_LFE;
-                case 4: return MA_CHANNEL_SIDE_LEFT;
-                case 5: return MA_CHANNEL_SIDE_RIGHT;
-            }
-        } break;
-
-        case 7: /* Not defined, but best guess. */
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_LFE;
-                case 4: return MA_CHANNEL_BACK_CENTER;
-                case 5: return MA_CHANNEL_SIDE_LEFT;
-                case 6: return MA_CHANNEL_SIDE_RIGHT;
-            }
-        } break;
-
-        case 8:
-        default:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_LFE;
-                case 4: return MA_CHANNEL_BACK_LEFT;
-                case 5: return MA_CHANNEL_BACK_RIGHT;
-                case 6: return MA_CHANNEL_SIDE_LEFT;
-                case 7: return MA_CHANNEL_SIDE_RIGHT;
-            }
-        } break;
-    }
-
-    if (channelCount > 8) {
-        if (channelIndex < 32) {    /* We have 32 AUX channels. */
-            return (ma_channel)(MA_CHANNEL_AUX_0 + (channelIndex - 8));
-        }
-    }
-
-    /* Getting here means we don't know how to map the channel position so just return MA_CHANNEL_NONE. */
-    return MA_CHANNEL_NONE;
-}
-
-static ma_channel ma_channel_map_init_standard_channel_alsa(ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    switch (channelCount)
-    {
-        case 0: return MA_CHANNEL_NONE;
-
-        case 1:
-        {
-            return MA_CHANNEL_MONO;
-        } break;
-
-        case 2:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 3:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 4:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 5:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-                case 4: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 6:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-                case 4: return MA_CHANNEL_FRONT_CENTER;
-                case 5: return MA_CHANNEL_LFE;
-            }
-        } break;
-
-        case 7:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-                case 4: return MA_CHANNEL_FRONT_CENTER;
-                case 5: return MA_CHANNEL_LFE;
-                case 6: return MA_CHANNEL_BACK_CENTER;
-            }
-        } break;
-
-        case 8:
-        default:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-                case 4: return MA_CHANNEL_FRONT_CENTER;
-                case 5: return MA_CHANNEL_LFE;
-                case 6: return MA_CHANNEL_SIDE_LEFT;
-                case 7: return MA_CHANNEL_SIDE_RIGHT;
-            }
-        } break;
-    }
-
-    if (channelCount > 8) {
-        if (channelIndex < 32) {    /* We have 32 AUX channels. */
-            return (ma_channel)(MA_CHANNEL_AUX_0 + (channelIndex - 8));
-        }
-    }
-
-    /* Getting here means we don't know how to map the channel position so just return MA_CHANNEL_NONE. */
-    return MA_CHANNEL_NONE;
-}
-
-static ma_channel ma_channel_map_init_standard_channel_rfc3551(ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    switch (channelCount)
-    {
-        case 0: return MA_CHANNEL_NONE;
-
-        case 1:
-        {
-            return MA_CHANNEL_MONO;
-        } break;
-
-        case 2:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 3:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 4:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_BACK_CENTER;
-            }
-        } break;
-
-        case 5:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_BACK_LEFT;
-                case 4: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 6:
-        default:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_SIDE_LEFT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_FRONT_RIGHT;
-                case 4: return MA_CHANNEL_SIDE_RIGHT;
-                case 5: return MA_CHANNEL_BACK_CENTER;
-            }
-        } break;
-    }
-
-    if (channelCount > 6) {
-        if (channelIndex < 32) {    /* We have 32 AUX channels. */
-            return (ma_channel)(MA_CHANNEL_AUX_0 + (channelIndex - 6));
-        }
-    }
-
-    /* Getting here means we don't know how to map the channel position so just return MA_CHANNEL_NONE. */
-    return MA_CHANNEL_NONE;
-}
-
-static ma_channel ma_channel_map_init_standard_channel_flac(ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    switch (channelCount)
-    {
-        case 0: return MA_CHANNEL_NONE;
-
-        case 1:
-        {
-            return MA_CHANNEL_MONO;
-        } break;
-
-        case 2:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 3:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 4:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 5:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_BACK_LEFT;
-                case 4: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 6:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_LFE;
-                case 4: return MA_CHANNEL_BACK_LEFT;
-                case 5: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 7:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_LFE;
-                case 4: return MA_CHANNEL_BACK_CENTER;
-                case 5: return MA_CHANNEL_SIDE_LEFT;
-                case 6: return MA_CHANNEL_SIDE_RIGHT;
-            }
-        } break;
-
-        case 8:
-        default:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_LFE;
-                case 4: return MA_CHANNEL_BACK_LEFT;
-                case 5: return MA_CHANNEL_BACK_RIGHT;
-                case 6: return MA_CHANNEL_SIDE_LEFT;
-                case 7: return MA_CHANNEL_SIDE_RIGHT;
-            }
-        } break;
-    }
-
-    if (channelCount > 8) {
-        if (channelIndex < 32) {    /* We have 32 AUX channels. */
-            return (ma_channel)(MA_CHANNEL_AUX_0 + (channelIndex - 8));
-        }
-    }
-
-    /* Getting here means we don't know how to map the channel position so just return MA_CHANNEL_NONE. */
-    return MA_CHANNEL_NONE;
-}
-
-static ma_channel ma_channel_map_init_standard_channel_vorbis(ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    switch (channelCount)
-    {
-        case 0: return MA_CHANNEL_NONE;
-
-        case 1:
-        {
-            return MA_CHANNEL_MONO;
-        } break;
-
-        case 2:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 3:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 4:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 5:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_BACK_LEFT;
-                case 4: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 6:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_BACK_LEFT;
-                case 4: return MA_CHANNEL_BACK_RIGHT;
-                case 5: return MA_CHANNEL_LFE;
-            }
-        } break;
-
-        case 7:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_SIDE_LEFT;
-                case 4: return MA_CHANNEL_SIDE_RIGHT;
-                case 5: return MA_CHANNEL_BACK_CENTER;
-                case 6: return MA_CHANNEL_LFE;
-            }
-        } break;
-
-        case 8:
-        default:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_SIDE_LEFT;
-                case 4: return MA_CHANNEL_SIDE_RIGHT;
-                case 5: return MA_CHANNEL_BACK_LEFT;
-                case 6: return MA_CHANNEL_BACK_RIGHT;
-                case 7: return MA_CHANNEL_LFE;
-            }
-        } break;
-    }
-
-    if (channelCount > 8) {
-        if (channelIndex < 32) {    /* We have 32 AUX channels. */
-            return (ma_channel)(MA_CHANNEL_AUX_0 + (channelIndex - 8));
-        }
-    }
-
-    /* Getting here means we don't know how to map the channel position so just return MA_CHANNEL_NONE. */
-    return MA_CHANNEL_NONE;
-}
-
-static ma_channel ma_channel_map_init_standard_channel_sound4(ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    switch (channelCount)
-    {
-        case 0: return MA_CHANNEL_NONE;
-
-        case 1:
-        {
-            return MA_CHANNEL_MONO;
-        } break;
-
-        case 2:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 3:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 4:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 5:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-                case 3: return MA_CHANNEL_BACK_LEFT;
-                case 4: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 6:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_BACK_LEFT;
-                case 4: return MA_CHANNEL_BACK_RIGHT;
-                case 5: return MA_CHANNEL_LFE;
-            }
-        } break;
-
-        case 7:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_SIDE_LEFT;
-                case 4: return MA_CHANNEL_SIDE_RIGHT;
-                case 5: return MA_CHANNEL_BACK_CENTER;
-                case 6: return MA_CHANNEL_LFE;
-            }
-        } break;
-
-        case 8:
-        default:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_CENTER;
-                case 2: return MA_CHANNEL_FRONT_RIGHT;
-                case 3: return MA_CHANNEL_SIDE_LEFT;
-                case 4: return MA_CHANNEL_SIDE_RIGHT;
-                case 5: return MA_CHANNEL_BACK_LEFT;
-                case 6: return MA_CHANNEL_BACK_RIGHT;
-                case 7: return MA_CHANNEL_LFE;
-            }
-        } break;
-    }
-
-    if (channelCount > 8) {
-        if (channelIndex < 32) {    /* We have 32 AUX channels. */
-            return (ma_channel)(MA_CHANNEL_AUX_0 + (channelIndex - 8));
-        }
-    }
-
-    /* Getting here means we don't know how to map the channel position so just return MA_CHANNEL_NONE. */
-    return MA_CHANNEL_NONE;
-}
-
-static ma_channel ma_channel_map_init_standard_channel_sndio(ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    switch (channelCount)
-    {
-        case 0: return MA_CHANNEL_NONE;
-
-        case 1:
-        {
-            return MA_CHANNEL_MONO;
-        } break;
-
-        case 2:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-            }
-        } break;
-
-        case 3: /* No defined, but best guess. */
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 4:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-            }
-        } break;
-
-        case 5: /* Not defined, but best guess. */
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-                case 4: return MA_CHANNEL_FRONT_CENTER;
-            }
-        } break;
-
-        case 6:
-        default:
-        {
-            switch (channelIndex) {
-                case 0: return MA_CHANNEL_FRONT_LEFT;
-                case 1: return MA_CHANNEL_FRONT_RIGHT;
-                case 2: return MA_CHANNEL_BACK_LEFT;
-                case 3: return MA_CHANNEL_BACK_RIGHT;
-                case 4: return MA_CHANNEL_FRONT_CENTER;
-                case 5: return MA_CHANNEL_LFE;
-            }
-        } break;
-    }
-
-    if (channelCount > 6) {
-        if (channelIndex < 32) {    /* We have 32 AUX channels. */
-            return (ma_channel)(MA_CHANNEL_AUX_0 + (channelIndex - 6));
-        }
-    }
-
-    /* Getting here means we don't know how to map the channel position so just return MA_CHANNEL_NONE. */
-    return MA_CHANNEL_NONE;
-}
-
-
-static ma_channel ma_channel_map_init_standard_channel(ma_standard_channel_map standardChannelMap, ma_uint32 channelCount, ma_uint32 channelIndex)
-{
-    if (channelCount == 0 || channelIndex >= channelCount) {
-        return MA_CHANNEL_NONE;
-    }
-
-    switch (standardChannelMap)
-    {
-        case ma_standard_channel_map_alsa:
-        {
-            return ma_channel_map_init_standard_channel_alsa(channelCount, channelIndex);
-        } break;
-
-        case ma_standard_channel_map_rfc3551:
-        {
-            return ma_channel_map_init_standard_channel_rfc3551(channelCount, channelIndex);
-        } break;
-
-        case ma_standard_channel_map_flac:
-        {
-            return ma_channel_map_init_standard_channel_flac(channelCount, channelIndex);
-        } break;
-
-        case ma_standard_channel_map_vorbis:
-        {
-            return ma_channel_map_init_standard_channel_vorbis(channelCount, channelIndex);
-        } break;
-
-        case ma_standard_channel_map_sound4:
-        {
-            return ma_channel_map_init_standard_channel_sound4(channelCount, channelIndex);
-        } break;
-
-        case ma_standard_channel_map_sndio:
-        {
-            return ma_channel_map_init_standard_channel_sndio(channelCount, channelIndex);
-        } break;
-
-        case ma_standard_channel_map_microsoft: /* Also default. */
-        /*case ma_standard_channel_map_default;*/
-        default:
-        {
-            return ma_channel_map_init_standard_channel_microsoft(channelCount, channelIndex);
-        } break;
-    }
-}
-
-MA_API void ma_channel_map_init_standard(ma_standard_channel_map standardChannelMap, ma_channel* pChannelMap, size_t channelMapCap, ma_uint32 channels)
-{
-    ma_uint32 iChannel;
-
-    if (pChannelMap == NULL || channelMapCap == 0 || channels == 0) {
-        return;
-    }
-
-    for (iChannel = 0; iChannel < channels; iChannel += 1) {
-        if (channelMapCap == 0) {
-            break;  /* Ran out of room. */
-        }
-
-        pChannelMap[0] = ma_channel_map_init_standard_channel(standardChannelMap, channels, iChannel);
-        pChannelMap   += 1;
-        channelMapCap -= 1;
-    }
-}
-
-MA_API void ma_channel_map_copy(ma_channel* pOut, const ma_channel* pIn, ma_uint32 channels)
-{
-    if (pOut != NULL && pIn != NULL && channels > 0) {
-        MA_COPY_MEMORY(pOut, pIn, sizeof(*pOut) * channels);
-    }
-}
-
-MA_API void ma_channel_map_copy_or_default(ma_channel* pOut, size_t channelMapCapOut, const ma_channel* pIn, ma_uint32 channels)
-{
-    if (pOut == NULL || channels == 0) {
-        return;
-    }
-
-    if (pIn != NULL) {
-        ma_channel_map_copy(pOut, pIn, channels);
-    } else {
-        ma_channel_map_init_standard(ma_standard_channel_map_default, pOut, channelMapCapOut, channels);
-    }
-}
-
-MA_API ma_bool32 ma_channel_map_is_valid(const ma_channel* pChannelMap, ma_uint32 channels)
-{
-    /* A channel count of 0 is invalid. */
-    if (channels == 0) {
-        return MA_FALSE;
-    }
-
-    /* It does not make sense to have a mono channel when there is more than 1 channel. */
-    if (channels > 1) {
-        ma_uint32 iChannel;
-        for (iChannel = 0; iChannel < channels; ++iChannel) {
-            if (ma_channel_map_get_channel(pChannelMap, channels, iChannel) == MA_CHANNEL_MONO) {
-                return MA_FALSE;
-            }
-        }
-    }
-
-    return MA_TRUE;
-}
-
-MA_API ma_bool32 ma_channel_map_is_equal(const ma_channel* pChannelMapA, const ma_channel* pChannelMapB, ma_uint32 channels)
-{
-    ma_uint32 iChannel;
-
-    if (pChannelMapA == pChannelMapB) {
-        return MA_TRUE;
-    }
-
-    for (iChannel = 0; iChannel < channels; ++iChannel) {
-        if (ma_channel_map_get_channel(pChannelMapA, channels, iChannel) != ma_channel_map_get_channel(pChannelMapB, channels, iChannel)) {
-            return MA_FALSE;
-        }
-    }
-
-    return MA_TRUE;
-}
-
-MA_API ma_bool32 ma_channel_map_is_blank(const ma_channel* pChannelMap, ma_uint32 channels)
-{
-    ma_uint32 iChannel;
-
-    /* A null channel map is equivalent to the default channel map. */
-    if (pChannelMap == NULL) {
-        return MA_FALSE;
-    }
-
-    for (iChannel = 0; iChannel < channels; ++iChannel) {
-        if (pChannelMap[iChannel] != MA_CHANNEL_NONE) {
-            return MA_FALSE;
-        }
-    }
-
-    return MA_TRUE;
-}
-
-MA_API ma_bool32 ma_channel_map_contains_channel_position(ma_uint32 channels, const ma_channel* pChannelMap, ma_channel channelPosition)
-{
-    return ma_channel_map_find_channel_position(channels, pChannelMap, channelPosition, NULL);
-}
-
-MA_API ma_bool32 ma_channel_map_find_channel_position(ma_uint32 channels, const ma_channel* pChannelMap, ma_channel channelPosition, ma_uint32* pChannelIndex)
-{
-    ma_uint32 iChannel;
-
-    if (pChannelIndex != NULL) {
-        *pChannelIndex = (ma_uint32)-1;
-    }
-
-    for (iChannel = 0; iChannel < channels; ++iChannel) {
-        if (ma_channel_map_get_channel(pChannelMap, channels, iChannel) == channelPosition) {
-            if (pChannelIndex != NULL) {
-                *pChannelIndex = iChannel;
-            }
-
-            return MA_TRUE;
-        }
-    }
-
-    /* Getting here means the channel position was not found. */
-    return MA_FALSE;
-}
-
-MA_API size_t ma_channel_map_to_string(const ma_channel* pChannelMap, ma_uint32 channels, char* pBufferOut, size_t bufferCap)
-{
-    size_t len;
-    ma_uint32 iChannel;
-
-    len = 0;
-
-    for (iChannel = 0; iChannel < channels; iChannel += 1) {
-        const char* pChannelStr = ma_channel_position_to_string(ma_channel_map_get_channel(pChannelMap, channels, iChannel));
-        size_t channelStrLen = strlen(pChannelStr);
-
-        /* Append the string if necessary. */
-        if (pBufferOut != NULL && bufferCap > len + channelStrLen) {
-            MA_COPY_MEMORY(pBufferOut + len, pChannelStr, channelStrLen);
-        }
-        len += channelStrLen;
-
-        /* Append a space if it's not the last item. */
-        if (iChannel+1 < channels) {
-            if (pBufferOut != NULL && bufferCap > len + 1) {
-                pBufferOut[len] = ' ';
-            }
-            len += 1;
-        }
-    }
-
-    /* Null terminate. Don't increment the length here. */
-    if (pBufferOut != NULL && bufferCap > len + 1) {
-        pBufferOut[len] = '\0';
-    }
-
-    return len;
-}
-
-MA_API const char* ma_channel_position_to_string(ma_channel channel)
-{
-    switch (channel)
-    {
-        case MA_CHANNEL_NONE              : return "CHANNEL_NONE";
-        case MA_CHANNEL_MONO              : return "CHANNEL_MONO";
-        case MA_CHANNEL_FRONT_LEFT        : return "CHANNEL_FRONT_LEFT";
-        case MA_CHANNEL_FRONT_RIGHT       : return "CHANNEL_FRONT_RIGHT";
-        case MA_CHANNEL_FRONT_CENTER      : return "CHANNEL_FRONT_CENTER";
-        case MA_CHANNEL_LFE               : return "CHANNEL_LFE";
-        case MA_CHANNEL_BACK_LEFT         : return "CHANNEL_BACK_LEFT";
-        case MA_CHANNEL_BACK_RIGHT        : return "CHANNEL_BACK_RIGHT";
-        case MA_CHANNEL_FRONT_LEFT_CENTER : return "CHANNEL_FRONT_LEFT_CENTER ";
-        case MA_CHANNEL_FRONT_RIGHT_CENTER: return "CHANNEL_FRONT_RIGHT_CENTER";
-        case MA_CHANNEL_BACK_CENTER       : return "CHANNEL_BACK_CENTER";
-        case MA_CHANNEL_SIDE_LEFT         : return "CHANNEL_SIDE_LEFT";
-        case MA_CHANNEL_SIDE_RIGHT        : return "CHANNEL_SIDE_RIGHT";
-        case MA_CHANNEL_TOP_CENTER        : return "CHANNEL_TOP_CENTER";
-        case MA_CHANNEL_TOP_FRONT_LEFT    : return "CHANNEL_TOP_FRONT_LEFT";
-        case MA_CHANNEL_TOP_FRONT_CENTER  : return "CHANNEL_TOP_FRONT_CENTER";
-        case MA_CHANNEL_TOP_FRONT_RIGHT   : return "CHANNEL_TOP_FRONT_RIGHT";
-        case MA_CHANNEL_TOP_BACK_LEFT     : return "CHANNEL_TOP_BACK_LEFT";
-        case MA_CHANNEL_TOP_BACK_CENTER   : return "CHANNEL_TOP_BACK_CENTER";
-        case MA_CHANNEL_TOP_BACK_RIGHT    : return "CHANNEL_TOP_BACK_RIGHT";
-        case MA_CHANNEL_AUX_0             : return "CHANNEL_AUX_0";
-        case MA_CHANNEL_AUX_1             : return "CHANNEL_AUX_1";
-        case MA_CHANNEL_AUX_2             : return "CHANNEL_AUX_2";
-        case MA_CHANNEL_AUX_3             : return "CHANNEL_AUX_3";
-        case MA_CHANNEL_AUX_4             : return "CHANNEL_AUX_4";
-        case MA_CHANNEL_AUX_5             : return "CHANNEL_AUX_5";
-        case MA_CHANNEL_AUX_6             : return "CHANNEL_AUX_6";
-        case MA_CHANNEL_AUX_7             : return "CHANNEL_AUX_7";
-        case MA_CHANNEL_AUX_8             : return "CHANNEL_AUX_8";
-        case MA_CHANNEL_AUX_9             : return "CHANNEL_AUX_9";
-        case MA_CHANNEL_AUX_10            : return "CHANNEL_AUX_10";
-        case MA_CHANNEL_AUX_11            : return "CHANNEL_AUX_11";
-        case MA_CHANNEL_AUX_12            : return "CHANNEL_AUX_12";
-        case MA_CHANNEL_AUX_13            : return "CHANNEL_AUX_13";
-        case MA_CHANNEL_AUX_14            : return "CHANNEL_AUX_14";
-        case MA_CHANNEL_AUX_15            : return "CHANNEL_AUX_15";
-        case MA_CHANNEL_AUX_16            : return "CHANNEL_AUX_16";
-        case MA_CHANNEL_AUX_17            : return "CHANNEL_AUX_17";
-        case MA_CHANNEL_AUX_18            : return "CHANNEL_AUX_18";
-        case MA_CHANNEL_AUX_19            : return "CHANNEL_AUX_19";
-        case MA_CHANNEL_AUX_20            : return "CHANNEL_AUX_20";
-        case MA_CHANNEL_AUX_21            : return "CHANNEL_AUX_21";
-        case MA_CHANNEL_AUX_22            : return "CHANNEL_AUX_22";
-        case MA_CHANNEL_AUX_23            : return "CHANNEL_AUX_23";
-        case MA_CHANNEL_AUX_24            : return "CHANNEL_AUX_24";
-        case MA_CHANNEL_AUX_25            : return "CHANNEL_AUX_25";
-        case MA_CHANNEL_AUX_26            : return "CHANNEL_AUX_26";
-        case MA_CHANNEL_AUX_27            : return "CHANNEL_AUX_27";
-        case MA_CHANNEL_AUX_28            : return "CHANNEL_AUX_28";
-        case MA_CHANNEL_AUX_29            : return "CHANNEL_AUX_29";
-        case MA_CHANNEL_AUX_30            : return "CHANNEL_AUX_30";
-        case MA_CHANNEL_AUX_31            : return "CHANNEL_AUX_31";
-        default: break;
-    }
-
-    return "UNKNOWN";
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-Conversion Helpers
-
-**************************************************************************************************************************************************************/
-MA_API ma_uint64 ma_convert_frames(void* pOut, ma_uint64 frameCountOut, ma_format formatOut, ma_uint32 channelsOut, ma_uint32 sampleRateOut, const void* pIn, ma_uint64 frameCountIn, ma_format formatIn, ma_uint32 channelsIn, ma_uint32 sampleRateIn)
-{
-    ma_data_converter_config config;
-
-    config = ma_data_converter_config_init(formatIn, formatOut, channelsIn, channelsOut, sampleRateIn, sampleRateOut);
-    config.resampling.linear.lpfOrder = ma_min(MA_DEFAULT_RESAMPLER_LPF_ORDER, MA_MAX_FILTER_ORDER);
-
-    return ma_convert_frames_ex(pOut, frameCountOut, pIn, frameCountIn, &config);
-}
-
-MA_API ma_uint64 ma_convert_frames_ex(void* pOut, ma_uint64 frameCountOut, const void* pIn, ma_uint64 frameCountIn, const ma_data_converter_config* pConfig)
-{
-    ma_result result;
-    ma_data_converter converter;
-
-    if (frameCountIn == 0 || pConfig == NULL) {
-        return 0;
-    }
-
-    result = ma_data_converter_init(pConfig, NULL, &converter);
-    if (result != MA_SUCCESS) {
-        return 0;   /* Failed to initialize the data converter. */
-    }
-
-    if (pOut == NULL) {
-        result = ma_data_converter_get_expected_output_frame_count(&converter, frameCountIn, &frameCountOut);
-        if (result != MA_SUCCESS) {
-            if (result == MA_NOT_IMPLEMENTED) {
-                /* No way to calculate the number of frames, so we'll need to brute force it and loop. */
-                frameCountOut = 0;
-
-                while (frameCountIn > 0) {
-                    ma_uint64 framesProcessedIn  = frameCountIn;
-                    ma_uint64 framesProcessedOut = 0xFFFFFFFF;
-
-                    result = ma_data_converter_process_pcm_frames(&converter, pIn, &framesProcessedIn, NULL, &framesProcessedOut);
-                    if (result != MA_SUCCESS) {
-                        break;
-                    }
-
-                    frameCountIn  -= framesProcessedIn;
-                }
-            }
-        }
-    } else {
-        result = ma_data_converter_process_pcm_frames(&converter, pIn, &frameCountIn, pOut, &frameCountOut);
-        if (result != MA_SUCCESS) {
-            frameCountOut = 0;
-        }
-    }
-
-    ma_data_converter_uninit(&converter, NULL);
-    return frameCountOut;
-}
-
-
-/**************************************************************************************************************************************************************
-
-Ring Buffer
-
-**************************************************************************************************************************************************************/
-static MA_INLINE ma_uint32 ma_rb__extract_offset_in_bytes(ma_uint32 encodedOffset)
-{
-    return encodedOffset & 0x7FFFFFFF;
-}
-
-static MA_INLINE ma_uint32 ma_rb__extract_offset_loop_flag(ma_uint32 encodedOffset)
-{
-    return encodedOffset & 0x80000000;
-}
-
-static MA_INLINE void* ma_rb__get_read_ptr(ma_rb* pRB)
-{
-    MA_ASSERT(pRB != NULL);
-    return ma_offset_ptr(pRB->pBuffer, ma_rb__extract_offset_in_bytes(ma_atomic_load_32(&pRB->encodedReadOffset)));
-}
-
-static MA_INLINE void* ma_rb__get_write_ptr(ma_rb* pRB)
-{
-    MA_ASSERT(pRB != NULL);
-    return ma_offset_ptr(pRB->pBuffer, ma_rb__extract_offset_in_bytes(ma_atomic_load_32(&pRB->encodedWriteOffset)));
-}
-
-static MA_INLINE ma_uint32 ma_rb__construct_offset(ma_uint32 offsetInBytes, ma_uint32 offsetLoopFlag)
-{
-    return offsetLoopFlag | offsetInBytes;
-}
-
-static MA_INLINE void ma_rb__deconstruct_offset(ma_uint32 encodedOffset, ma_uint32* pOffsetInBytes, ma_uint32* pOffsetLoopFlag)
-{
-    MA_ASSERT(pOffsetInBytes != NULL);
-    MA_ASSERT(pOffsetLoopFlag != NULL);
-
-    *pOffsetInBytes  = ma_rb__extract_offset_in_bytes(encodedOffset);
-    *pOffsetLoopFlag = ma_rb__extract_offset_loop_flag(encodedOffset);
-}
-
-
-MA_API ma_result ma_rb_init_ex(size_t subbufferSizeInBytes, size_t subbufferCount, size_t subbufferStrideInBytes, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_rb* pRB)
-{
-    ma_result result;
-    const ma_uint32 maxSubBufferSize = 0x7FFFFFFF - (MA_SIMD_ALIGNMENT-1);
-
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (subbufferSizeInBytes == 0 || subbufferCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (subbufferSizeInBytes > maxSubBufferSize) {
-        return MA_INVALID_ARGS;    /* Maximum buffer size is ~2GB. The most significant bit is a flag for use internally. */
-    }
-
-
-    MA_ZERO_OBJECT(pRB);
-
-    result = ma_allocation_callbacks_init_copy(&pRB->allocationCallbacks, pAllocationCallbacks);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pRB->subbufferSizeInBytes = (ma_uint32)subbufferSizeInBytes;
-    pRB->subbufferCount = (ma_uint32)subbufferCount;
-
-    if (pOptionalPreallocatedBuffer != NULL) {
-        pRB->subbufferStrideInBytes = (ma_uint32)subbufferStrideInBytes;
-        pRB->pBuffer = pOptionalPreallocatedBuffer;
-    } else {
-        size_t bufferSizeInBytes;
-
-        /*
-        Here is where we allocate our own buffer. We always want to align this to MA_SIMD_ALIGNMENT for future SIMD optimization opportunity. To do this
-        we need to make sure the stride is a multiple of MA_SIMD_ALIGNMENT.
-        */
-        pRB->subbufferStrideInBytes = (pRB->subbufferSizeInBytes + (MA_SIMD_ALIGNMENT-1)) & ~MA_SIMD_ALIGNMENT;
-
-        bufferSizeInBytes = (size_t)pRB->subbufferCount*pRB->subbufferStrideInBytes;
-        pRB->pBuffer = ma_aligned_malloc(bufferSizeInBytes, MA_SIMD_ALIGNMENT, &pRB->allocationCallbacks);
-        if (pRB->pBuffer == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        MA_ZERO_MEMORY(pRB->pBuffer, bufferSizeInBytes);
-        pRB->ownsBuffer = MA_TRUE;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_rb_init(size_t bufferSizeInBytes, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_rb* pRB)
-{
-    return ma_rb_init_ex(bufferSizeInBytes, 1, 0, pOptionalPreallocatedBuffer, pAllocationCallbacks, pRB);
-}
-
-MA_API void ma_rb_uninit(ma_rb* pRB)
-{
-    if (pRB == NULL) {
-        return;
-    }
-
-    if (pRB->ownsBuffer) {
-        ma_aligned_free(pRB->pBuffer, &pRB->allocationCallbacks);
-    }
-}
-
-MA_API void ma_rb_reset(ma_rb* pRB)
-{
-    if (pRB == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_32(&pRB->encodedReadOffset, 0);
-    ma_atomic_exchange_32(&pRB->encodedWriteOffset, 0);
-}
-
-MA_API ma_result ma_rb_acquire_read(ma_rb* pRB, size_t* pSizeInBytes, void** ppBufferOut)
-{
-    ma_uint32 writeOffset;
-    ma_uint32 writeOffsetInBytes;
-    ma_uint32 writeOffsetLoopFlag;
-    ma_uint32 readOffset;
-    ma_uint32 readOffsetInBytes;
-    ma_uint32 readOffsetLoopFlag;
-    size_t bytesAvailable;
-    size_t bytesRequested;
-
-    if (pRB == NULL || pSizeInBytes == NULL || ppBufferOut == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The returned buffer should never move ahead of the write pointer. */
-    writeOffset = ma_atomic_load_32(&pRB->encodedWriteOffset);
-    ma_rb__deconstruct_offset(writeOffset, &writeOffsetInBytes, &writeOffsetLoopFlag);
-
-    readOffset = ma_atomic_load_32(&pRB->encodedReadOffset);
-    ma_rb__deconstruct_offset(readOffset, &readOffsetInBytes, &readOffsetLoopFlag);
-
-    /*
-    The number of bytes available depends on whether or not the read and write pointers are on the same loop iteration. If so, we
-    can only read up to the write pointer. If not, we can only read up to the end of the buffer.
-    */
-    if (readOffsetLoopFlag == writeOffsetLoopFlag) {
-        bytesAvailable = writeOffsetInBytes - readOffsetInBytes;
-    } else {
-        bytesAvailable = pRB->subbufferSizeInBytes - readOffsetInBytes;
-    }
-
-    bytesRequested = *pSizeInBytes;
-    if (bytesRequested > bytesAvailable) {
-        bytesRequested = bytesAvailable;
-    }
-
-    *pSizeInBytes = bytesRequested;
-    (*ppBufferOut) = ma_rb__get_read_ptr(pRB);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_rb_commit_read(ma_rb* pRB, size_t sizeInBytes)
-{
-    ma_uint32 readOffset;
-    ma_uint32 readOffsetInBytes;
-    ma_uint32 readOffsetLoopFlag;
-    ma_uint32 newReadOffsetInBytes;
-    ma_uint32 newReadOffsetLoopFlag;
-
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    readOffset = ma_atomic_load_32(&pRB->encodedReadOffset);
-    ma_rb__deconstruct_offset(readOffset, &readOffsetInBytes, &readOffsetLoopFlag);
-
-    /* Check that sizeInBytes is correct. It should never go beyond the end of the buffer. */
-    newReadOffsetInBytes = (ma_uint32)(readOffsetInBytes + sizeInBytes);
-    if (newReadOffsetInBytes > pRB->subbufferSizeInBytes) {
-        return MA_INVALID_ARGS;    /* <-- sizeInBytes will cause the read offset to overflow. */
-    }
-
-    /* Move the read pointer back to the start if necessary. */
-    newReadOffsetLoopFlag = readOffsetLoopFlag;
-    if (newReadOffsetInBytes == pRB->subbufferSizeInBytes) {
-        newReadOffsetInBytes = 0;
-        newReadOffsetLoopFlag ^= 0x80000000;
-    }
-
-    ma_atomic_exchange_32(&pRB->encodedReadOffset, ma_rb__construct_offset(newReadOffsetLoopFlag, newReadOffsetInBytes));
-
-    if (ma_rb_pointer_distance(pRB) == 0) {
-        return MA_AT_END;
-    } else {
-        return MA_SUCCESS;
-    }
-}
-
-MA_API ma_result ma_rb_acquire_write(ma_rb* pRB, size_t* pSizeInBytes, void** ppBufferOut)
-{
-    ma_uint32 readOffset;
-    ma_uint32 readOffsetInBytes;
-    ma_uint32 readOffsetLoopFlag;
-    ma_uint32 writeOffset;
-    ma_uint32 writeOffsetInBytes;
-    ma_uint32 writeOffsetLoopFlag;
-    size_t bytesAvailable;
-    size_t bytesRequested;
-
-    if (pRB == NULL || pSizeInBytes == NULL || ppBufferOut == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The returned buffer should never overtake the read buffer. */
-    readOffset = ma_atomic_load_32(&pRB->encodedReadOffset);
-    ma_rb__deconstruct_offset(readOffset, &readOffsetInBytes, &readOffsetLoopFlag);
-
-    writeOffset = ma_atomic_load_32(&pRB->encodedWriteOffset);
-    ma_rb__deconstruct_offset(writeOffset, &writeOffsetInBytes, &writeOffsetLoopFlag);
-
-    /*
-    In the case of writing, if the write pointer and the read pointer are on the same loop iteration we can only
-    write up to the end of the buffer. Otherwise we can only write up to the read pointer. The write pointer should
-    never overtake the read pointer.
-    */
-    if (writeOffsetLoopFlag == readOffsetLoopFlag) {
-        bytesAvailable = pRB->subbufferSizeInBytes - writeOffsetInBytes;
-    } else {
-        bytesAvailable = readOffsetInBytes - writeOffsetInBytes;
-    }
-
-    bytesRequested = *pSizeInBytes;
-    if (bytesRequested > bytesAvailable) {
-        bytesRequested = bytesAvailable;
-    }
-
-    *pSizeInBytes = bytesRequested;
-    *ppBufferOut  = ma_rb__get_write_ptr(pRB);
-
-    /* Clear the buffer if desired. */
-    if (pRB->clearOnWriteAcquire) {
-        MA_ZERO_MEMORY(*ppBufferOut, *pSizeInBytes);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_rb_commit_write(ma_rb* pRB, size_t sizeInBytes)
-{
-    ma_uint32 writeOffset;
-    ma_uint32 writeOffsetInBytes;
-    ma_uint32 writeOffsetLoopFlag;
-    ma_uint32 newWriteOffsetInBytes;
-    ma_uint32 newWriteOffsetLoopFlag;
-
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    writeOffset = ma_atomic_load_32(&pRB->encodedWriteOffset);
-    ma_rb__deconstruct_offset(writeOffset, &writeOffsetInBytes, &writeOffsetLoopFlag);
-
-    /* Check that sizeInBytes is correct. It should never go beyond the end of the buffer. */
-    newWriteOffsetInBytes = (ma_uint32)(writeOffsetInBytes + sizeInBytes);
-    if (newWriteOffsetInBytes > pRB->subbufferSizeInBytes) {
-        return MA_INVALID_ARGS;    /* <-- sizeInBytes will cause the read offset to overflow. */
-    }
-
-    /* Move the read pointer back to the start if necessary. */
-    newWriteOffsetLoopFlag = writeOffsetLoopFlag;
-    if (newWriteOffsetInBytes == pRB->subbufferSizeInBytes) {
-        newWriteOffsetInBytes = 0;
-        newWriteOffsetLoopFlag ^= 0x80000000;
-    }
-
-    ma_atomic_exchange_32(&pRB->encodedWriteOffset, ma_rb__construct_offset(newWriteOffsetLoopFlag, newWriteOffsetInBytes));
-
-    if (ma_rb_pointer_distance(pRB) == 0) {
-        return MA_AT_END;
-    } else {
-        return MA_SUCCESS;
-    }
-}
-
-MA_API ma_result ma_rb_seek_read(ma_rb* pRB, size_t offsetInBytes)
-{
-    ma_uint32 readOffset;
-    ma_uint32 readOffsetInBytes;
-    ma_uint32 readOffsetLoopFlag;
-    ma_uint32 writeOffset;
-    ma_uint32 writeOffsetInBytes;
-    ma_uint32 writeOffsetLoopFlag;
-    ma_uint32 newReadOffsetInBytes;
-    ma_uint32 newReadOffsetLoopFlag;
-
-    if (pRB == NULL || offsetInBytes > pRB->subbufferSizeInBytes) {
-        return MA_INVALID_ARGS;
-    }
-
-    readOffset = ma_atomic_load_32(&pRB->encodedReadOffset);
-    ma_rb__deconstruct_offset(readOffset, &readOffsetInBytes, &readOffsetLoopFlag);
-
-    writeOffset = ma_atomic_load_32(&pRB->encodedWriteOffset);
-    ma_rb__deconstruct_offset(writeOffset, &writeOffsetInBytes, &writeOffsetLoopFlag);
-
-    newReadOffsetLoopFlag = readOffsetLoopFlag;
-
-    /* We cannot go past the write buffer. */
-    if (readOffsetLoopFlag == writeOffsetLoopFlag) {
-        if ((readOffsetInBytes + offsetInBytes) > writeOffsetInBytes) {
-            newReadOffsetInBytes = writeOffsetInBytes;
-        } else {
-            newReadOffsetInBytes = (ma_uint32)(readOffsetInBytes + offsetInBytes);
-        }
-    } else {
-        /* May end up looping. */
-        if ((readOffsetInBytes + offsetInBytes) >= pRB->subbufferSizeInBytes) {
-            newReadOffsetInBytes = (ma_uint32)(readOffsetInBytes + offsetInBytes) - pRB->subbufferSizeInBytes;
-            newReadOffsetLoopFlag ^= 0x80000000;    /* <-- Looped. */
-        } else {
-            newReadOffsetInBytes = (ma_uint32)(readOffsetInBytes + offsetInBytes);
-        }
-    }
-
-    ma_atomic_exchange_32(&pRB->encodedReadOffset, ma_rb__construct_offset(newReadOffsetInBytes, newReadOffsetLoopFlag));
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_rb_seek_write(ma_rb* pRB, size_t offsetInBytes)
-{
-    ma_uint32 readOffset;
-    ma_uint32 readOffsetInBytes;
-    ma_uint32 readOffsetLoopFlag;
-    ma_uint32 writeOffset;
-    ma_uint32 writeOffsetInBytes;
-    ma_uint32 writeOffsetLoopFlag;
-    ma_uint32 newWriteOffsetInBytes;
-    ma_uint32 newWriteOffsetLoopFlag;
-
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    readOffset = ma_atomic_load_32(&pRB->encodedReadOffset);
-    ma_rb__deconstruct_offset(readOffset, &readOffsetInBytes, &readOffsetLoopFlag);
-
-    writeOffset = ma_atomic_load_32(&pRB->encodedWriteOffset);
-    ma_rb__deconstruct_offset(writeOffset, &writeOffsetInBytes, &writeOffsetLoopFlag);
-
-    newWriteOffsetLoopFlag = writeOffsetLoopFlag;
-
-    /* We cannot go past the write buffer. */
-    if (readOffsetLoopFlag == writeOffsetLoopFlag) {
-        /* May end up looping. */
-        if ((writeOffsetInBytes + offsetInBytes) >= pRB->subbufferSizeInBytes) {
-            newWriteOffsetInBytes = (ma_uint32)(writeOffsetInBytes + offsetInBytes) - pRB->subbufferSizeInBytes;
-            newWriteOffsetLoopFlag ^= 0x80000000;    /* <-- Looped. */
-        } else {
-            newWriteOffsetInBytes = (ma_uint32)(writeOffsetInBytes + offsetInBytes);
-        }
-    } else {
-        if ((writeOffsetInBytes + offsetInBytes) > readOffsetInBytes) {
-            newWriteOffsetInBytes = readOffsetInBytes;
-        } else {
-            newWriteOffsetInBytes = (ma_uint32)(writeOffsetInBytes + offsetInBytes);
-        }
-    }
-
-    ma_atomic_exchange_32(&pRB->encodedWriteOffset, ma_rb__construct_offset(newWriteOffsetInBytes, newWriteOffsetLoopFlag));
-    return MA_SUCCESS;
-}
-
-MA_API ma_int32 ma_rb_pointer_distance(ma_rb* pRB)
-{
-    ma_uint32 readOffset;
-    ma_uint32 readOffsetInBytes;
-    ma_uint32 readOffsetLoopFlag;
-    ma_uint32 writeOffset;
-    ma_uint32 writeOffsetInBytes;
-    ma_uint32 writeOffsetLoopFlag;
-
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    readOffset = ma_atomic_load_32(&pRB->encodedReadOffset);
-    ma_rb__deconstruct_offset(readOffset, &readOffsetInBytes, &readOffsetLoopFlag);
-
-    writeOffset = ma_atomic_load_32(&pRB->encodedWriteOffset);
-    ma_rb__deconstruct_offset(writeOffset, &writeOffsetInBytes, &writeOffsetLoopFlag);
-
-    if (readOffsetLoopFlag == writeOffsetLoopFlag) {
-        return writeOffsetInBytes - readOffsetInBytes;
-    } else {
-        return writeOffsetInBytes + (pRB->subbufferSizeInBytes - readOffsetInBytes);
-    }
-}
-
-MA_API ma_uint32 ma_rb_available_read(ma_rb* pRB)
-{
-    ma_int32 dist;
-
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    dist = ma_rb_pointer_distance(pRB);
-    if (dist < 0) {
-        return 0;
-    }
-
-    return dist;
-}
-
-MA_API ma_uint32 ma_rb_available_write(ma_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return (ma_uint32)(ma_rb_get_subbuffer_size(pRB) - ma_rb_pointer_distance(pRB));
-}
-
-MA_API size_t ma_rb_get_subbuffer_size(ma_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return pRB->subbufferSizeInBytes;
-}
-
-MA_API size_t ma_rb_get_subbuffer_stride(ma_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    if (pRB->subbufferStrideInBytes == 0) {
-        return (size_t)pRB->subbufferSizeInBytes;
-    }
-
-    return (size_t)pRB->subbufferStrideInBytes;
-}
-
-MA_API size_t ma_rb_get_subbuffer_offset(ma_rb* pRB, size_t subbufferIndex)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return subbufferIndex * ma_rb_get_subbuffer_stride(pRB);
-}
-
-MA_API void* ma_rb_get_subbuffer_ptr(ma_rb* pRB, size_t subbufferIndex, void* pBuffer)
-{
-    if (pRB == NULL) {
-        return NULL;
-    }
-
-    return ma_offset_ptr(pBuffer, ma_rb_get_subbuffer_offset(pRB, subbufferIndex));
-}
-
-
-
-static ma_result ma_pcm_rb_data_source__on_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    /* Since there's no notion of an end, we don't ever want to return MA_AT_END here. But it is possible to return 0. */
-    ma_pcm_rb* pRB = (ma_pcm_rb*)pDataSource;
-    ma_result result;
-    ma_uint64 totalFramesRead;
-
-    MA_ASSERT(pRB != NULL);
-
-    /* We need to run this in a loop since the ring buffer itself may loop. */
-    totalFramesRead = 0;
-    while (totalFramesRead < frameCount) {
-        void* pMappedBuffer;
-        ma_uint32 mappedFrameCount;
-        ma_uint64 framesToRead = frameCount - totalFramesRead;
-        if (framesToRead > 0xFFFFFFFF) {
-            framesToRead = 0xFFFFFFFF;
-        }
-
-        mappedFrameCount = (ma_uint32)framesToRead;
-        result = ma_pcm_rb_acquire_read(pRB, &mappedFrameCount, &pMappedBuffer);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        if (mappedFrameCount == 0) {
-            break;  /* <-- End of ring buffer. */
-        }
-
-        ma_copy_pcm_frames(ma_offset_pcm_frames_ptr(pFramesOut, totalFramesRead, pRB->format, pRB->channels), pMappedBuffer, mappedFrameCount, pRB->format, pRB->channels);
-
-        result = ma_pcm_rb_commit_read(pRB, mappedFrameCount);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        totalFramesRead += mappedFrameCount;
-    }
-
-    *pFramesRead = totalFramesRead;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_pcm_rb_data_source__on_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    ma_pcm_rb* pRB = (ma_pcm_rb*)pDataSource;
-    MA_ASSERT(pRB != NULL);
-
-    if (pFormat != NULL) {
-        *pFormat = pRB->format;
-    }
-
-    if (pChannels != NULL) {
-        *pChannels = pRB->channels;
-    }
-
-    if (pSampleRate != NULL) {
-        *pSampleRate = pRB->sampleRate;
-    }
-
-    /* Just assume the default channel map. */
-    if (pChannelMap != NULL) {
-        ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pRB->channels);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_data_source_vtable ma_gRBDataSourceVTable =
-{
-    ma_pcm_rb_data_source__on_read,
-    NULL,   /* onSeek */
-    ma_pcm_rb_data_source__on_get_data_format,
-    NULL,   /* onGetCursor */
-    NULL,   /* onGetLength */
-    NULL,   /* onSetLooping */
-    0
-};
-
-static MA_INLINE ma_uint32 ma_pcm_rb_get_bpf(ma_pcm_rb* pRB)
-{
-    MA_ASSERT(pRB != NULL);
-
-    return ma_get_bytes_per_frame(pRB->format, pRB->channels);
-}
-
-MA_API ma_result ma_pcm_rb_init_ex(ma_format format, ma_uint32 channels, ma_uint32 subbufferSizeInFrames, ma_uint32 subbufferCount, ma_uint32 subbufferStrideInFrames, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_pcm_rb* pRB)
-{
-    ma_uint32 bpf;
-    ma_result result;
-
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pRB);
-
-    bpf = ma_get_bytes_per_frame(format, channels);
-    if (bpf == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_rb_init_ex(subbufferSizeInFrames*bpf, subbufferCount, subbufferStrideInFrames*bpf, pOptionalPreallocatedBuffer, pAllocationCallbacks, &pRB->rb);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pRB->format     = format;
-    pRB->channels   = channels;
-    pRB->sampleRate = 0;    /* The sample rate is not passed in as a parameter. */
-
-    /* The PCM ring buffer is a data source. We need to get that set up as well. */
-    {
-        ma_data_source_config dataSourceConfig = ma_data_source_config_init();
-        dataSourceConfig.vtable = &ma_gRBDataSourceVTable;
-
-        result = ma_data_source_init(&dataSourceConfig, &pRB->ds);
-        if (result != MA_SUCCESS) {
-            ma_rb_uninit(&pRB->rb);
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pcm_rb_init(ma_format format, ma_uint32 channels, ma_uint32 bufferSizeInFrames, void* pOptionalPreallocatedBuffer, const ma_allocation_callbacks* pAllocationCallbacks, ma_pcm_rb* pRB)
-{
-    return ma_pcm_rb_init_ex(format, channels, bufferSizeInFrames, 1, 0, pOptionalPreallocatedBuffer, pAllocationCallbacks, pRB);
-}
-
-MA_API void ma_pcm_rb_uninit(ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return;
-    }
-
-    ma_data_source_uninit(&pRB->ds);
-    ma_rb_uninit(&pRB->rb);
-}
-
-MA_API void ma_pcm_rb_reset(ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return;
-    }
-
-    ma_rb_reset(&pRB->rb);
-}
-
-MA_API ma_result ma_pcm_rb_acquire_read(ma_pcm_rb* pRB, ma_uint32* pSizeInFrames, void** ppBufferOut)
-{
-    size_t sizeInBytes;
-    ma_result result;
-
-    if (pRB == NULL || pSizeInFrames == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    sizeInBytes = *pSizeInFrames * ma_pcm_rb_get_bpf(pRB);
-
-    result = ma_rb_acquire_read(&pRB->rb, &sizeInBytes, ppBufferOut);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pSizeInFrames = (ma_uint32)(sizeInBytes / (size_t)ma_pcm_rb_get_bpf(pRB));
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pcm_rb_commit_read(ma_pcm_rb* pRB, ma_uint32 sizeInFrames)
-{
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_rb_commit_read(&pRB->rb, sizeInFrames * ma_pcm_rb_get_bpf(pRB));
-}
-
-MA_API ma_result ma_pcm_rb_acquire_write(ma_pcm_rb* pRB, ma_uint32* pSizeInFrames, void** ppBufferOut)
-{
-    size_t sizeInBytes;
-    ma_result result;
-
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    sizeInBytes = *pSizeInFrames * ma_pcm_rb_get_bpf(pRB);
-
-    result = ma_rb_acquire_write(&pRB->rb, &sizeInBytes, ppBufferOut);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pSizeInFrames = (ma_uint32)(sizeInBytes / ma_pcm_rb_get_bpf(pRB));
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pcm_rb_commit_write(ma_pcm_rb* pRB, ma_uint32 sizeInFrames)
-{
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_rb_commit_write(&pRB->rb, sizeInFrames * ma_pcm_rb_get_bpf(pRB));
-}
-
-MA_API ma_result ma_pcm_rb_seek_read(ma_pcm_rb* pRB, ma_uint32 offsetInFrames)
-{
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_rb_seek_read(&pRB->rb, offsetInFrames * ma_pcm_rb_get_bpf(pRB));
-}
-
-MA_API ma_result ma_pcm_rb_seek_write(ma_pcm_rb* pRB, ma_uint32 offsetInFrames)
-{
-    if (pRB == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_rb_seek_write(&pRB->rb, offsetInFrames * ma_pcm_rb_get_bpf(pRB));
-}
-
-MA_API ma_int32 ma_pcm_rb_pointer_distance(ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return ma_rb_pointer_distance(&pRB->rb) / ma_pcm_rb_get_bpf(pRB);
-}
-
-MA_API ma_uint32 ma_pcm_rb_available_read(ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return ma_rb_available_read(&pRB->rb) / ma_pcm_rb_get_bpf(pRB);
-}
-
-MA_API ma_uint32 ma_pcm_rb_available_write(ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return ma_rb_available_write(&pRB->rb) / ma_pcm_rb_get_bpf(pRB);
-}
-
-MA_API ma_uint32 ma_pcm_rb_get_subbuffer_size(ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return (ma_uint32)(ma_rb_get_subbuffer_size(&pRB->rb) / ma_pcm_rb_get_bpf(pRB));
-}
-
-MA_API ma_uint32 ma_pcm_rb_get_subbuffer_stride(ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return (ma_uint32)(ma_rb_get_subbuffer_stride(&pRB->rb) / ma_pcm_rb_get_bpf(pRB));
-}
-
-MA_API ma_uint32 ma_pcm_rb_get_subbuffer_offset(ma_pcm_rb* pRB, ma_uint32 subbufferIndex)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return (ma_uint32)(ma_rb_get_subbuffer_offset(&pRB->rb, subbufferIndex) / ma_pcm_rb_get_bpf(pRB));
-}
-
-MA_API void* ma_pcm_rb_get_subbuffer_ptr(ma_pcm_rb* pRB, ma_uint32 subbufferIndex, void* pBuffer)
-{
-    if (pRB == NULL) {
-        return NULL;
-    }
-
-    return ma_rb_get_subbuffer_ptr(&pRB->rb, subbufferIndex, pBuffer);
-}
-
-MA_API ma_format ma_pcm_rb_get_format(const ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return ma_format_unknown;
-    }
-
-    return pRB->format;
-}
-
-MA_API ma_uint32 ma_pcm_rb_get_channels(const ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return pRB->channels;
-}
-
-MA_API ma_uint32 ma_pcm_rb_get_sample_rate(const ma_pcm_rb* pRB)
-{
-    if (pRB == NULL) {
-        return 0;
-    }
-
-    return pRB->sampleRate;
-}
-
-MA_API void ma_pcm_rb_set_sample_rate(ma_pcm_rb* pRB, ma_uint32 sampleRate)
-{
-    if (pRB == NULL) {
-        return;
-    }
-
-    pRB->sampleRate = sampleRate;
-}
-
-
-
-MA_API ma_result ma_duplex_rb_init(ma_format captureFormat, ma_uint32 captureChannels, ma_uint32 sampleRate, ma_uint32 captureInternalSampleRate, ma_uint32 captureInternalPeriodSizeInFrames, const ma_allocation_callbacks* pAllocationCallbacks, ma_duplex_rb* pRB)
-{
-    ma_result result;
-    ma_uint32 sizeInFrames;
-
-    sizeInFrames = (ma_uint32)ma_calculate_frame_count_after_resampling(sampleRate, captureInternalSampleRate, captureInternalPeriodSizeInFrames * 5);
-    if (sizeInFrames == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_pcm_rb_init(captureFormat, captureChannels, sizeInFrames, NULL, pAllocationCallbacks, &pRB->rb);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* Seek forward a bit so we have a bit of a buffer in case of desyncs. */
-    ma_pcm_rb_seek_write((ma_pcm_rb*)pRB, captureInternalPeriodSizeInFrames * 2);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_duplex_rb_uninit(ma_duplex_rb* pRB)
-{
-    ma_pcm_rb_uninit((ma_pcm_rb*)pRB);
-    return MA_SUCCESS;
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-Miscellaneous Helpers
-
-**************************************************************************************************************************************************************/
-MA_API const char* ma_result_description(ma_result result)
-{
-    switch (result)
-    {
-        case MA_SUCCESS:                       return "No error";
-        case MA_ERROR:                         return "Unknown error";
-        case MA_INVALID_ARGS:                  return "Invalid argument";
-        case MA_INVALID_OPERATION:             return "Invalid operation";
-        case MA_OUT_OF_MEMORY:                 return "Out of memory";
-        case MA_OUT_OF_RANGE:                  return "Out of range";
-        case MA_ACCESS_DENIED:                 return "Permission denied";
-        case MA_DOES_NOT_EXIST:                return "Resource does not exist";
-        case MA_ALREADY_EXISTS:                return "Resource already exists";
-        case MA_TOO_MANY_OPEN_FILES:           return "Too many open files";
-        case MA_INVALID_FILE:                  return "Invalid file";
-        case MA_TOO_BIG:                       return "Too large";
-        case MA_PATH_TOO_LONG:                 return "Path too long";
-        case MA_NAME_TOO_LONG:                 return "Name too long";
-        case MA_NOT_DIRECTORY:                 return "Not a directory";
-        case MA_IS_DIRECTORY:                  return "Is a directory";
-        case MA_DIRECTORY_NOT_EMPTY:           return "Directory not empty";
-        case MA_AT_END:                        return "At end";
-        case MA_NO_SPACE:                      return "No space available";
-        case MA_BUSY:                          return "Device or resource busy";
-        case MA_IO_ERROR:                      return "Input/output error";
-        case MA_INTERRUPT:                     return "Interrupted";
-        case MA_UNAVAILABLE:                   return "Resource unavailable";
-        case MA_ALREADY_IN_USE:                return "Resource already in use";
-        case MA_BAD_ADDRESS:                   return "Bad address";
-        case MA_BAD_SEEK:                      return "Illegal seek";
-        case MA_BAD_PIPE:                      return "Broken pipe";
-        case MA_DEADLOCK:                      return "Deadlock";
-        case MA_TOO_MANY_LINKS:                return "Too many links";
-        case MA_NOT_IMPLEMENTED:               return "Not implemented";
-        case MA_NO_MESSAGE:                    return "No message of desired type";
-        case MA_BAD_MESSAGE:                   return "Invalid message";
-        case MA_NO_DATA_AVAILABLE:             return "No data available";
-        case MA_INVALID_DATA:                  return "Invalid data";
-        case MA_TIMEOUT:                       return "Timeout";
-        case MA_NO_NETWORK:                    return "Network unavailable";
-        case MA_NOT_UNIQUE:                    return "Not unique";
-        case MA_NOT_SOCKET:                    return "Socket operation on non-socket";
-        case MA_NO_ADDRESS:                    return "Destination address required";
-        case MA_BAD_PROTOCOL:                  return "Protocol wrong type for socket";
-        case MA_PROTOCOL_UNAVAILABLE:          return "Protocol not available";
-        case MA_PROTOCOL_NOT_SUPPORTED:        return "Protocol not supported";
-        case MA_PROTOCOL_FAMILY_NOT_SUPPORTED: return "Protocol family not supported";
-        case MA_ADDRESS_FAMILY_NOT_SUPPORTED:  return "Address family not supported";
-        case MA_SOCKET_NOT_SUPPORTED:          return "Socket type not supported";
-        case MA_CONNECTION_RESET:              return "Connection reset";
-        case MA_ALREADY_CONNECTED:             return "Already connected";
-        case MA_NOT_CONNECTED:                 return "Not connected";
-        case MA_CONNECTION_REFUSED:            return "Connection refused";
-        case MA_NO_HOST:                       return "No host";
-        case MA_IN_PROGRESS:                   return "Operation in progress";
-        case MA_CANCELLED:                     return "Operation cancelled";
-        case MA_MEMORY_ALREADY_MAPPED:         return "Memory already mapped";
-
-        case MA_FORMAT_NOT_SUPPORTED:          return "Format not supported";
-        case MA_DEVICE_TYPE_NOT_SUPPORTED:     return "Device type not supported";
-        case MA_SHARE_MODE_NOT_SUPPORTED:      return "Share mode not supported";
-        case MA_NO_BACKEND:                    return "No backend";
-        case MA_NO_DEVICE:                     return "No device";
-        case MA_API_NOT_FOUND:                 return "API not found";
-        case MA_INVALID_DEVICE_CONFIG:         return "Invalid device config";
-
-        case MA_DEVICE_NOT_INITIALIZED:        return "Device not initialized";
-        case MA_DEVICE_NOT_STARTED:            return "Device not started";
-
-        case MA_FAILED_TO_INIT_BACKEND:        return "Failed to initialize backend";
-        case MA_FAILED_TO_OPEN_BACKEND_DEVICE: return "Failed to open backend device";
-        case MA_FAILED_TO_START_BACKEND_DEVICE: return "Failed to start backend device";
-        case MA_FAILED_TO_STOP_BACKEND_DEVICE: return "Failed to stop backend device";
-
-        default:                               return "Unknown error";
-    }
-}
-
-MA_API void* ma_malloc(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        if (pAllocationCallbacks->onMalloc != NULL) {
-            return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
-        } else {
-            return NULL;    /* Do not fall back to the default implementation. */
-        }
-    } else {
-        return ma__malloc_default(sz, NULL);
-    }
-}
-
-MA_API void* ma_calloc(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    void* p = ma_malloc(sz, pAllocationCallbacks);
-    if (p != NULL) {
-        MA_ZERO_MEMORY(p, sz);
-    }
-
-    return p;
-}
-
-MA_API void* ma_realloc(void* p, size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        if (pAllocationCallbacks->onRealloc != NULL) {
-            return pAllocationCallbacks->onRealloc(p, sz, pAllocationCallbacks->pUserData);
-        } else {
-            return NULL;    /* Do not fall back to the default implementation. */
-        }
-    } else {
-        return ma__realloc_default(p, sz, NULL);
-    }
-}
-
-MA_API void ma_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (p == NULL) {
-        return;
-    }
-
-    if (pAllocationCallbacks != NULL) {
-        if (pAllocationCallbacks->onFree != NULL) {
-            pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-        } else {
-            return; /* Do no fall back to the default implementation. */
-        }
-    } else {
-        ma__free_default(p, NULL);
-    }
-}
-
-MA_API void* ma_aligned_malloc(size_t sz, size_t alignment, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    size_t extraBytes;
-    void* pUnaligned;
-    void* pAligned;
-
-    if (alignment == 0) {
-        return 0;
-    }
-
-    extraBytes = alignment-1 + sizeof(void*);
-
-    pUnaligned = ma_malloc(sz + extraBytes, pAllocationCallbacks);
-    if (pUnaligned == NULL) {
-        return NULL;
-    }
-
-    pAligned = (void*)(((ma_uintptr)pUnaligned + extraBytes) & ~((ma_uintptr)(alignment-1)));
-    ((void**)pAligned)[-1] = pUnaligned;
-
-    return pAligned;
-}
-
-MA_API void ma_aligned_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_free(((void**)p)[-1], pAllocationCallbacks);
-}
-
-MA_API const char* ma_get_format_name(ma_format format)
-{
-    switch (format)
-    {
-        case ma_format_unknown: return "Unknown";
-        case ma_format_u8:      return "8-bit Unsigned Integer";
-        case ma_format_s16:     return "16-bit Signed Integer";
-        case ma_format_s24:     return "24-bit Signed Integer (Tightly Packed)";
-        case ma_format_s32:     return "32-bit Signed Integer";
-        case ma_format_f32:     return "32-bit IEEE Floating Point";
-        default:                return "Invalid";
-    }
-}
-
-MA_API void ma_blend_f32(float* pOut, float* pInA, float* pInB, float factor, ma_uint32 channels)
-{
-    ma_uint32 i;
-    for (i = 0; i < channels; ++i) {
-        pOut[i] = ma_mix_f32(pInA[i], pInB[i], factor);
-    }
-}
-
-
-MA_API ma_uint32 ma_get_bytes_per_sample(ma_format format)
-{
-    ma_uint32 sizes[] = {
-        0,  /* unknown */
-        1,  /* u8 */
-        2,  /* s16 */
-        3,  /* s24 */
-        4,  /* s32 */
-        4,  /* f32 */
-    };
-    return sizes[format];
-}
-
-
-
-#define MA_DATA_SOURCE_DEFAULT_RANGE_BEG        0
-#define MA_DATA_SOURCE_DEFAULT_RANGE_END        ~((ma_uint64)0)
-#define MA_DATA_SOURCE_DEFAULT_LOOP_POINT_BEG   0
-#define MA_DATA_SOURCE_DEFAULT_LOOP_POINT_END   ~((ma_uint64)0)
-
-MA_API ma_data_source_config ma_data_source_config_init(void)
-{
-    ma_data_source_config config;
-
-    MA_ZERO_OBJECT(&config);
-
-    return config;
-}
-
-
-MA_API ma_result ma_data_source_init(const ma_data_source_config* pConfig, ma_data_source* pDataSource)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDataSourceBase);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pDataSourceBase->vtable           = pConfig->vtable;
-    pDataSourceBase->rangeBegInFrames = MA_DATA_SOURCE_DEFAULT_RANGE_BEG;
-    pDataSourceBase->rangeEndInFrames = MA_DATA_SOURCE_DEFAULT_RANGE_END;
-    pDataSourceBase->loopBegInFrames  = MA_DATA_SOURCE_DEFAULT_LOOP_POINT_BEG;
-    pDataSourceBase->loopEndInFrames  = MA_DATA_SOURCE_DEFAULT_LOOP_POINT_END;
-    pDataSourceBase->pCurrent         = pDataSource;    /* Always read from ourself by default. */
-    pDataSourceBase->pNext            = NULL;
-    pDataSourceBase->onGetNext        = NULL;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_data_source_uninit(ma_data_source* pDataSource)
-{
-    if (pDataSource == NULL) {
-        return;
-    }
-
-    /*
-    This is placeholder in case we need this later. Data sources need to call this in their
-    uninitialization routine to ensure things work later on if something is added here.
-    */
-}
-
-static ma_result ma_data_source_resolve_current(ma_data_source* pDataSource, ma_data_source** ppCurrentDataSource)
-{
-    ma_data_source_base* pCurrentDataSource = (ma_data_source_base*)pDataSource;
-
-    MA_ASSERT(pDataSource         != NULL);
-    MA_ASSERT(ppCurrentDataSource != NULL);
-
-    if (pCurrentDataSource->pCurrent == NULL) {
-        /*
-        The current data source is NULL. If we're using this in the context of a chain we need to return NULL
-        here so that we don't end up looping. Otherwise we just return the data source itself.
-        */
-        if (pCurrentDataSource->pNext != NULL || pCurrentDataSource->onGetNext != NULL) {
-            pCurrentDataSource = NULL;
-        } else {
-            pCurrentDataSource = (ma_data_source_base*)pDataSource; /* Not being used in a chain. Make sure we just always read from the data source itself at all times. */
-        }
-    } else {
-        pCurrentDataSource = (ma_data_source_base*)pCurrentDataSource->pCurrent;
-    }
-
-    *ppCurrentDataSource = pCurrentDataSource;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_data_source_read_pcm_frames_within_range(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-    ma_result result;
-    ma_uint64 framesRead = 0;
-    ma_bool32 loop = ma_data_source_is_looping(pDataSource);
-
-    if (pDataSourceBase == NULL) {
-        return MA_AT_END;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSourceBase->vtable->flags & MA_DATA_SOURCE_SELF_MANAGED_RANGE_AND_LOOP_POINT) != 0 || (pDataSourceBase->rangeEndInFrames == ~((ma_uint64)0) && (pDataSourceBase->loopEndInFrames == ~((ma_uint64)0) || loop == MA_FALSE))) {
-        /* Either the data source is self-managing the range, or no range is set - just read like normal. The data source itself will tell us when the end is reached. */
-        result = pDataSourceBase->vtable->onRead(pDataSourceBase, pFramesOut, frameCount, &framesRead);
-    } else {
-        /* Need to clamp to within the range. */
-        ma_uint64 relativeCursor;
-        ma_uint64 absoluteCursor;
-
-        result = ma_data_source_get_cursor_in_pcm_frames(pDataSourceBase, &relativeCursor);
-        if (result != MA_SUCCESS) {
-            /* Failed to retrieve the cursor. Cannot read within a range or loop points. Just read like normal - this may happen for things like noise data sources where it doesn't really matter. */
-            result = pDataSourceBase->vtable->onRead(pDataSourceBase, pFramesOut, frameCount, &framesRead);
-        } else {
-            ma_uint64 rangeBeg;
-            ma_uint64 rangeEnd;
-
-            /* We have the cursor. We need to make sure we don't read beyond our range. */
-            rangeBeg = pDataSourceBase->rangeBegInFrames;
-            rangeEnd = pDataSourceBase->rangeEndInFrames;
-
-            absoluteCursor = rangeBeg + relativeCursor;
-
-            /* If looping, make sure we're within range. */
-            if (loop) {
-                if (pDataSourceBase->loopEndInFrames != ~((ma_uint64)0)) {
-                    rangeEnd = ma_min(rangeEnd, pDataSourceBase->rangeBegInFrames + pDataSourceBase->loopEndInFrames);
-                }
-            }
-
-            if (frameCount > (rangeEnd - absoluteCursor) && rangeEnd != ~((ma_uint64)0)) {
-                frameCount = (rangeEnd - absoluteCursor);
-            }
-
-            /*
-            If the cursor is sitting on the end of the range the frame count will be set to 0 which can
-            result in MA_INVALID_ARGS. In this case, we don't want to try reading, but instead return
-            MA_AT_END so the higher level function can know about it.
-            */
-            if (frameCount > 0) {
-                result = pDataSourceBase->vtable->onRead(pDataSourceBase, pFramesOut, frameCount, &framesRead);
-            } else {
-                result = MA_AT_END; /* The cursor is sitting on the end of the range which means we're at the end. */
-            }
-        }
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = framesRead;
-    }
-
-    /* We need to make sure MA_AT_END is returned if we hit the end of the range. */
-    if (result == MA_SUCCESS && framesRead == 0) {
-        result  = MA_AT_END;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_data_source_read_pcm_frames(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-    ma_data_source_base* pCurrentDataSource;
-    void* pRunningFramesOut = pFramesOut;
-    ma_uint64 totalFramesProcessed = 0;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 emptyLoopCounter = 0; /* Keeps track of how many times 0 frames have been read. For infinite loop detection of sounds with no audio data. */
-    ma_bool32 loop;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDataSourceBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    loop = ma_data_source_is_looping(pDataSource);
-
-    /*
-    We need to know the data format so we can advance the output buffer as we read frames. If this
-    fails, chaining will not work and we'll just read as much as we can from the current source.
-    */
-    if (ma_data_source_get_data_format(pDataSource, &format, &channels, NULL, NULL, 0) != MA_SUCCESS) {
-        result = ma_data_source_resolve_current(pDataSource, (ma_data_source**)&pCurrentDataSource);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        return ma_data_source_read_pcm_frames_within_range(pCurrentDataSource, pFramesOut, frameCount, pFramesRead);
-    }
-
-    /*
-    Looping is a bit of a special case. When the `loop` argument is true, chaining will not work and
-    only the current data source will be read from.
-    */
-
-    /* Keep reading until we've read as many frames as possible. */
-    while (totalFramesProcessed < frameCount) {
-        ma_uint64 framesProcessed;
-        ma_uint64 framesRemaining = frameCount - totalFramesProcessed;
-
-        /* We need to resolve the data source that we'll actually be reading from. */
-        result = ma_data_source_resolve_current(pDataSource, (ma_data_source**)&pCurrentDataSource);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        if (pCurrentDataSource == NULL) {
-            break;
-        }
-
-        result = ma_data_source_read_pcm_frames_within_range(pCurrentDataSource, pRunningFramesOut, framesRemaining, &framesProcessed);
-        totalFramesProcessed += framesProcessed;
-
-        /*
-        If we encounted an error from the read callback, make sure it's propagated to the caller. The caller may need to know whether or not MA_BUSY is returned which is
-        not necessarily considered an error.
-        */
-        if (result != MA_SUCCESS && result != MA_AT_END) {
-            break;
-        }
-
-        /*
-        We can determine if we've reached the end by checking if ma_data_source_read_pcm_frames_within_range() returned
-        MA_AT_END. To loop back to the start, all we need to do is seek back to the first frame.
-        */
-        if (result == MA_AT_END) {
-            /*
-            The result needs to be reset back to MA_SUCCESS (from MA_AT_END) so that we don't
-            accidentally return MA_AT_END when data has been read in prior loop iterations. at the
-            end of this function, the result will be checked for MA_SUCCESS, and if the total
-            number of frames processed is 0, will be explicitly set to MA_AT_END.
-            */
-            result = MA_SUCCESS;
-
-            /*
-            We reached the end. If we're looping, we just loop back to the start of the current
-            data source. If we're not looping we need to check if we have another in the chain, and
-            if so, switch to it.
-            */
-            if (loop) {
-                if (framesProcessed == 0) {
-                    emptyLoopCounter += 1;
-                    if (emptyLoopCounter > 1) {
-                        break;  /* Infinite loop detected. Get out. */
-                    }
-                } else {
-                    emptyLoopCounter = 0;
-                }
-
-                result = ma_data_source_seek_to_pcm_frame(pCurrentDataSource, pCurrentDataSource->loopBegInFrames);
-                if (result != MA_SUCCESS) {
-                    break;  /* Failed to loop. Abort. */
-                }
-
-                /* Don't return MA_AT_END for looping sounds. */
-                result = MA_SUCCESS;
-            } else {
-                if (pCurrentDataSource->pNext != NULL) {
-                    pDataSourceBase->pCurrent = pCurrentDataSource->pNext;
-                } else if (pCurrentDataSource->onGetNext != NULL) {
-                    pDataSourceBase->pCurrent = pCurrentDataSource->onGetNext(pCurrentDataSource);
-                    if (pDataSourceBase->pCurrent == NULL) {
-                        break;  /* Our callback did not return a next data source. We're done. */
-                    }
-                } else {
-                    /* Reached the end of the chain. We're done. */
-                    break;
-                }
-
-                /* The next data source needs to be rewound to ensure data is read in looping scenarios. */
-                result = ma_data_source_seek_to_pcm_frame(pDataSourceBase->pCurrent, 0);
-                if (result != MA_SUCCESS) {
-                    break;
-                }
-            }
-        }
-
-        if (pRunningFramesOut != NULL) {
-            pRunningFramesOut = ma_offset_ptr(pRunningFramesOut, framesProcessed * ma_get_bytes_per_frame(format, channels));
-        }
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalFramesProcessed;
-    }
-
-    MA_ASSERT(!(result == MA_AT_END && totalFramesProcessed > 0));  /* We should never be returning MA_AT_END if we read some data. */
-
-    if (result == MA_SUCCESS && totalFramesProcessed == 0) {
-        result  = MA_AT_END;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_data_source_seek_pcm_frames(ma_data_source* pDataSource, ma_uint64 frameCount, ma_uint64* pFramesSeeked)
-{
-    return ma_data_source_read_pcm_frames(pDataSource, NULL, frameCount, pFramesSeeked);
-}
-
-MA_API ma_result ma_data_source_seek_to_pcm_frame(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pDataSourceBase == NULL) {
-        return MA_SUCCESS;
-    }
-
-    if (pDataSourceBase->vtable->onSeek == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    if (frameIndex > pDataSourceBase->rangeEndInFrames) {
-        return MA_INVALID_OPERATION;    /* Trying to seek to far forward. */
-    }
-
-    return pDataSourceBase->vtable->onSeek(pDataSource, pDataSourceBase->rangeBegInFrames + frameIndex);
-}
-
-MA_API ma_result ma_data_source_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-    ma_result result;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-
-    /* Initialize to defaults for safety just in case the data source does not implement this callback. */
-    if (pFormat != NULL) {
-        *pFormat = ma_format_unknown;
-    }
-    if (pChannels != NULL) {
-        *pChannels = 0;
-    }
-    if (pSampleRate != NULL) {
-        *pSampleRate = 0;
-    }
-    if (pChannelMap != NULL) {
-        MA_ZERO_MEMORY(pChannelMap, sizeof(*pChannelMap) * channelMapCap);
-    }
-
-    if (pDataSourceBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDataSourceBase->vtable->onGetDataFormat == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    result = pDataSourceBase->vtable->onGetDataFormat(pDataSource, &format, &channels, &sampleRate, pChannelMap, channelMapCap);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pFormat != NULL) {
-        *pFormat = format;
-    }
-    if (pChannels != NULL) {
-        *pChannels = channels;
-    }
-    if (pSampleRate != NULL) {
-        *pSampleRate = sampleRate;
-    }
-
-    /* Channel map was passed in directly to the callback. This is safe due to the channelMapCap parameter. */
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_source_get_cursor_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-    ma_result result;
-    ma_uint64 cursor;
-
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    if (pDataSourceBase == NULL) {
-        return MA_SUCCESS;
-    }
-
-    if (pDataSourceBase->vtable->onGetCursor == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    result = pDataSourceBase->vtable->onGetCursor(pDataSourceBase, &cursor);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* The cursor needs to be made relative to the start of the range. */
-    if (cursor < pDataSourceBase->rangeBegInFrames) {   /* Safety check so we don't return some huge number. */
-        *pCursor = 0;
-    } else {
-        *pCursor = cursor - pDataSourceBase->rangeBegInFrames;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_source_get_length_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;
-
-    if (pDataSourceBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    If we have a range defined we'll use that to determine the length. This is one of rare times
-    where we'll actually trust the caller. If they've set the range, I think it's mostly safe to
-    assume they've set it based on some higher level knowledge of the structure of the sound bank.
-    */
-    if (pDataSourceBase->rangeEndInFrames != ~((ma_uint64)0)) {
-        *pLength = pDataSourceBase->rangeEndInFrames - pDataSourceBase->rangeBegInFrames;
-        return MA_SUCCESS;
-    }
-
-    /*
-    Getting here means a range is not defined so we'll need to get the data source itself to tell
-    us the length.
-    */
-    if (pDataSourceBase->vtable->onGetLength == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pDataSourceBase->vtable->onGetLength(pDataSource, pLength);
-}
-
-MA_API ma_result ma_data_source_get_cursor_in_seconds(ma_data_source* pDataSource, float* pCursor)
-{
-    ma_result result;
-    ma_uint64 cursorInPCMFrames;
-    ma_uint32 sampleRate;
-
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    result = ma_data_source_get_cursor_in_pcm_frames(pDataSource, &cursorInPCMFrames);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_data_source_get_data_format(pDataSource, NULL, NULL, &sampleRate, NULL, 0);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* VC6 does not support division of unsigned 64-bit integers with floating point numbers. Need to use a signed number. This shouldn't effect anything in practice. */
-    *pCursor = (ma_int64)cursorInPCMFrames / (float)sampleRate;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_source_get_length_in_seconds(ma_data_source* pDataSource, float* pLength)
-{
-    ma_result result;
-    ma_uint64 lengthInPCMFrames;
-    ma_uint32 sampleRate;
-
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;
-
-    result = ma_data_source_get_length_in_pcm_frames(pDataSource, &lengthInPCMFrames);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_data_source_get_data_format(pDataSource, NULL, NULL, &sampleRate, NULL, 0);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* VC6 does not support division of unsigned 64-bit integers with floating point numbers. Need to use a signed number. This shouldn't effect anything in practice. */
-    *pLength = (ma_int64)lengthInPCMFrames / (float)sampleRate;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_data_source_set_looping(ma_data_source* pDataSource, ma_bool32 isLooping)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_atomic_exchange_32(&pDataSourceBase->isLooping, isLooping);
-
-    /* If there's no callback for this just treat it as a successful no-op. */
-    if (pDataSourceBase->vtable->onSetLooping == NULL) {
-        return MA_SUCCESS;
-    }
-
-    return pDataSourceBase->vtable->onSetLooping(pDataSource, isLooping);
-}
-
-MA_API ma_bool32 ma_data_source_is_looping(const ma_data_source* pDataSource)
-{
-    const ma_data_source_base* pDataSourceBase = (const ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_atomic_load_32(&pDataSourceBase->isLooping);
-}
-
-MA_API ma_result ma_data_source_set_range_in_pcm_frames(ma_data_source* pDataSource, ma_uint64 rangeBegInFrames, ma_uint64 rangeEndInFrames)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-    ma_result result;
-    ma_uint64 relativeCursor;
-    ma_uint64 absoluteCursor;
-    ma_bool32 doSeekAdjustment = MA_FALSE;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (rangeEndInFrames < rangeBegInFrames) {
-        return MA_INVALID_ARGS; /* The end of the range must come after the beginning. */
-    }
-
-    /*
-    We may need to adjust the position of the cursor to ensure it's clamped to the range. Grab it now
-    so we can calculate it's absolute position before we change the range.
-    */
-    result = ma_data_source_get_cursor_in_pcm_frames(pDataSource, &relativeCursor);
-    if (result == MA_SUCCESS) {
-        doSeekAdjustment = MA_TRUE;
-        absoluteCursor = relativeCursor + pDataSourceBase->rangeBegInFrames;
-    } else {
-        /*
-        We couldn't get the position of the cursor. It probably means the data source has no notion
-        of a cursor. We'll just leave it at position 0. Don't treat this as an error.
-        */
-        doSeekAdjustment = MA_FALSE;
-        relativeCursor = 0;
-        absoluteCursor = 0;
-    }
-
-    pDataSourceBase->rangeBegInFrames = rangeBegInFrames;
-    pDataSourceBase->rangeEndInFrames = rangeEndInFrames;
-
-    /*
-    The commented out logic below was intended to maintain loop points in response to a change in the
-    range. However, this is not useful because it results in the sound breaking when you move the range
-    outside of the old loop points. I'm simplifying this by simply resetting the loop points. The
-    caller is expected to update their loop points if they change the range.
-
-    In practice this should be mostly a non-issue because the majority of the time the range will be
-    set once right after initialization.
-    */
-    pDataSourceBase->loopBegInFrames = 0;
-    pDataSourceBase->loopEndInFrames = ~((ma_uint64)0);
-
-
-    /*
-    Seek to within range. Note that our seek positions here are relative to the new range. We don't want
-    do do this if we failed to retrieve the cursor earlier on because it probably means the data source
-    has no notion of a cursor. In practice the seek would probably fail (which we silently ignore), but
-    I'm just not even going to attempt it.
-    */
-    if (doSeekAdjustment) {
-        if (absoluteCursor < rangeBegInFrames) {
-            ma_data_source_seek_to_pcm_frame(pDataSource, 0);
-        } else if (absoluteCursor > rangeEndInFrames) {
-            ma_data_source_seek_to_pcm_frame(pDataSource, rangeEndInFrames - rangeBegInFrames);
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_data_source_get_range_in_pcm_frames(const ma_data_source* pDataSource, ma_uint64* pRangeBegInFrames, ma_uint64* pRangeEndInFrames)
-{
-    const ma_data_source_base* pDataSourceBase = (const ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return;
-    }
-
-    if (pRangeBegInFrames != NULL) {
-        *pRangeBegInFrames = pDataSourceBase->rangeBegInFrames;
-    }
-
-    if (pRangeEndInFrames != NULL) {
-        *pRangeEndInFrames = pDataSourceBase->rangeEndInFrames;
-    }
-}
-
-MA_API ma_result ma_data_source_set_loop_point_in_pcm_frames(ma_data_source* pDataSource, ma_uint64 loopBegInFrames, ma_uint64 loopEndInFrames)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (loopEndInFrames < loopBegInFrames) {
-        return MA_INVALID_ARGS; /* The end of the loop point must come after the beginning. */
-    }
-
-    if (loopEndInFrames > pDataSourceBase->rangeEndInFrames && loopEndInFrames != ~((ma_uint64)0)) {
-        return MA_INVALID_ARGS; /* The end of the loop point must not go beyond the range. */
-    }
-
-    pDataSourceBase->loopBegInFrames = loopBegInFrames;
-    pDataSourceBase->loopEndInFrames = loopEndInFrames;
-
-    /* The end cannot exceed the range. */
-    if (pDataSourceBase->loopEndInFrames > (pDataSourceBase->rangeEndInFrames - pDataSourceBase->rangeBegInFrames) && pDataSourceBase->loopEndInFrames != ~((ma_uint64)0)) {
-        pDataSourceBase->loopEndInFrames = (pDataSourceBase->rangeEndInFrames - pDataSourceBase->rangeBegInFrames);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_data_source_get_loop_point_in_pcm_frames(const ma_data_source* pDataSource, ma_uint64* pLoopBegInFrames, ma_uint64* pLoopEndInFrames)
-{
-    const ma_data_source_base* pDataSourceBase = (const ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return;
-    }
-
-    if (pLoopBegInFrames != NULL) {
-        *pLoopBegInFrames = pDataSourceBase->loopBegInFrames;
-    }
-
-    if (pLoopEndInFrames != NULL) {
-        *pLoopEndInFrames = pDataSourceBase->loopEndInFrames;
-    }
-}
-
-MA_API ma_result ma_data_source_set_current(ma_data_source* pDataSource, ma_data_source* pCurrentDataSource)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pDataSourceBase->pCurrent = pCurrentDataSource;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_data_source* ma_data_source_get_current(const ma_data_source* pDataSource)
-{
-    const ma_data_source_base* pDataSourceBase = (const ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return NULL;
-    }
-
-    return pDataSourceBase->pCurrent;
-}
-
-MA_API ma_result ma_data_source_set_next(ma_data_source* pDataSource, ma_data_source* pNextDataSource)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pDataSourceBase->pNext = pNextDataSource;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_data_source* ma_data_source_get_next(const ma_data_source* pDataSource)
-{
-    const ma_data_source_base* pDataSourceBase = (const ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return NULL;
-    }
-
-    return pDataSourceBase->pNext;
-}
-
-MA_API ma_result ma_data_source_set_next_callback(ma_data_source* pDataSource, ma_data_source_get_next_proc onGetNext)
-{
-    ma_data_source_base* pDataSourceBase = (ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pDataSourceBase->onGetNext = onGetNext;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_data_source_get_next_proc ma_data_source_get_next_callback(const ma_data_source* pDataSource)
-{
-    const ma_data_source_base* pDataSourceBase = (const ma_data_source_base*)pDataSource;
-
-    if (pDataSource == NULL) {
-        return NULL;
-    }
-
-    return pDataSourceBase->onGetNext;
-}
-
-
-static ma_result ma_audio_buffer_ref__data_source_on_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_audio_buffer_ref* pAudioBufferRef = (ma_audio_buffer_ref*)pDataSource;
-    ma_uint64 framesRead = ma_audio_buffer_ref_read_pcm_frames(pAudioBufferRef, pFramesOut, frameCount, MA_FALSE);
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = framesRead;
-    }
-
-    if (framesRead < frameCount || framesRead == 0) {
-        return MA_AT_END;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_audio_buffer_ref__data_source_on_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_audio_buffer_ref_seek_to_pcm_frame((ma_audio_buffer_ref*)pDataSource, frameIndex);
-}
-
-static ma_result ma_audio_buffer_ref__data_source_on_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    ma_audio_buffer_ref* pAudioBufferRef = (ma_audio_buffer_ref*)pDataSource;
-
-    *pFormat     = pAudioBufferRef->format;
-    *pChannels   = pAudioBufferRef->channels;
-    *pSampleRate = pAudioBufferRef->sampleRate;
-    ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pAudioBufferRef->channels);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_audio_buffer_ref__data_source_on_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    ma_audio_buffer_ref* pAudioBufferRef = (ma_audio_buffer_ref*)pDataSource;
-
-    *pCursor = pAudioBufferRef->cursor;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_audio_buffer_ref__data_source_on_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    ma_audio_buffer_ref* pAudioBufferRef = (ma_audio_buffer_ref*)pDataSource;
-
-    *pLength = pAudioBufferRef->sizeInFrames;
-
-    return MA_SUCCESS;
-}
-
-static ma_data_source_vtable g_ma_audio_buffer_ref_data_source_vtable =
-{
-    ma_audio_buffer_ref__data_source_on_read,
-    ma_audio_buffer_ref__data_source_on_seek,
-    ma_audio_buffer_ref__data_source_on_get_data_format,
-    ma_audio_buffer_ref__data_source_on_get_cursor,
-    ma_audio_buffer_ref__data_source_on_get_length,
-    NULL,   /* onSetLooping */
-    0
-};
-
-MA_API ma_result ma_audio_buffer_ref_init(ma_format format, ma_uint32 channels, const void* pData, ma_uint64 sizeInFrames, ma_audio_buffer_ref* pAudioBufferRef)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    if (pAudioBufferRef == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pAudioBufferRef);
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_audio_buffer_ref_data_source_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pAudioBufferRef->ds);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pAudioBufferRef->format       = format;
-    pAudioBufferRef->channels     = channels;
-    pAudioBufferRef->sampleRate   = 0;  /* TODO: Version 0.12. Set this to sampleRate. */
-    pAudioBufferRef->cursor       = 0;
-    pAudioBufferRef->sizeInFrames = sizeInFrames;
-    pAudioBufferRef->pData        = pData;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_audio_buffer_ref_uninit(ma_audio_buffer_ref* pAudioBufferRef)
-{
-    if (pAudioBufferRef == NULL) {
-        return;
-    }
-
-    ma_data_source_uninit(&pAudioBufferRef->ds);
-}
-
-MA_API ma_result ma_audio_buffer_ref_set_data(ma_audio_buffer_ref* pAudioBufferRef, const void* pData, ma_uint64 sizeInFrames)
-{
-    if (pAudioBufferRef == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pAudioBufferRef->cursor       = 0;
-    pAudioBufferRef->sizeInFrames = sizeInFrames;
-    pAudioBufferRef->pData        = pData;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint64 ma_audio_buffer_ref_read_pcm_frames(ma_audio_buffer_ref* pAudioBufferRef, void* pFramesOut, ma_uint64 frameCount, ma_bool32 loop)
-{
-    ma_uint64 totalFramesRead = 0;
-
-    if (pAudioBufferRef == NULL) {
-        return 0;
-    }
-
-    if (frameCount == 0) {
-        return 0;
-    }
-
-    while (totalFramesRead < frameCount) {
-        ma_uint64 framesAvailable = pAudioBufferRef->sizeInFrames - pAudioBufferRef->cursor;
-        ma_uint64 framesRemaining = frameCount - totalFramesRead;
-        ma_uint64 framesToRead;
-
-        framesToRead = framesRemaining;
-        if (framesToRead > framesAvailable) {
-            framesToRead = framesAvailable;
-        }
-
-        if (pFramesOut != NULL) {
-            ma_copy_pcm_frames(ma_offset_ptr(pFramesOut, totalFramesRead * ma_get_bytes_per_frame(pAudioBufferRef->format, pAudioBufferRef->channels)), ma_offset_ptr(pAudioBufferRef->pData, pAudioBufferRef->cursor * ma_get_bytes_per_frame(pAudioBufferRef->format, pAudioBufferRef->channels)), framesToRead, pAudioBufferRef->format, pAudioBufferRef->channels);
-        }
-
-        totalFramesRead += framesToRead;
-
-        pAudioBufferRef->cursor += framesToRead;
-        if (pAudioBufferRef->cursor == pAudioBufferRef->sizeInFrames) {
-            if (loop) {
-                pAudioBufferRef->cursor = 0;
-            } else {
-                break;  /* We've reached the end and we're not looping. Done. */
-            }
-        }
-
-        MA_ASSERT(pAudioBufferRef->cursor < pAudioBufferRef->sizeInFrames);
-    }
-
-    return totalFramesRead;
-}
-
-MA_API ma_result ma_audio_buffer_ref_seek_to_pcm_frame(ma_audio_buffer_ref* pAudioBufferRef, ma_uint64 frameIndex)
-{
-    if (pAudioBufferRef == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (frameIndex > pAudioBufferRef->sizeInFrames) {
-        return MA_INVALID_ARGS;
-    }
-
-    pAudioBufferRef->cursor = (size_t)frameIndex;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_audio_buffer_ref_map(ma_audio_buffer_ref* pAudioBufferRef, void** ppFramesOut, ma_uint64* pFrameCount)
-{
-    ma_uint64 framesAvailable;
-    ma_uint64 frameCount = 0;
-
-    if (ppFramesOut != NULL) {
-        *ppFramesOut = NULL;    /* Safety. */
-    }
-
-    if (pFrameCount != NULL) {
-        frameCount = *pFrameCount;
-        *pFrameCount = 0;       /* Safety. */
-    }
-
-    if (pAudioBufferRef == NULL || ppFramesOut == NULL || pFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    framesAvailable = pAudioBufferRef->sizeInFrames - pAudioBufferRef->cursor;
-    if (frameCount > framesAvailable) {
-        frameCount = framesAvailable;
-    }
-
-    *ppFramesOut = ma_offset_ptr(pAudioBufferRef->pData, pAudioBufferRef->cursor * ma_get_bytes_per_frame(pAudioBufferRef->format, pAudioBufferRef->channels));
-    *pFrameCount = frameCount;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_audio_buffer_ref_unmap(ma_audio_buffer_ref* pAudioBufferRef, ma_uint64 frameCount)
-{
-    ma_uint64 framesAvailable;
-
-    if (pAudioBufferRef == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    framesAvailable = pAudioBufferRef->sizeInFrames - pAudioBufferRef->cursor;
-    if (frameCount > framesAvailable) {
-        return MA_INVALID_ARGS;   /* The frame count was too big. This should never happen in an unmapping. Need to make sure the caller is aware of this. */
-    }
-
-    pAudioBufferRef->cursor += frameCount;
-
-    if (pAudioBufferRef->cursor == pAudioBufferRef->sizeInFrames) {
-        return MA_AT_END;   /* Successful. Need to tell the caller that the end has been reached so that it can loop if desired. */
-    } else {
-        return MA_SUCCESS;
-    }
-}
-
-MA_API ma_bool32 ma_audio_buffer_ref_at_end(const ma_audio_buffer_ref* pAudioBufferRef)
-{
-    if (pAudioBufferRef == NULL) {
-        return MA_FALSE;
-    }
-
-    return pAudioBufferRef->cursor == pAudioBufferRef->sizeInFrames;
-}
-
-MA_API ma_result ma_audio_buffer_ref_get_cursor_in_pcm_frames(const ma_audio_buffer_ref* pAudioBufferRef, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    if (pAudioBufferRef == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = pAudioBufferRef->cursor;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_audio_buffer_ref_get_length_in_pcm_frames(const ma_audio_buffer_ref* pAudioBufferRef, ma_uint64* pLength)
-{
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;
-
-    if (pAudioBufferRef == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = pAudioBufferRef->sizeInFrames;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_audio_buffer_ref_get_available_frames(const ma_audio_buffer_ref* pAudioBufferRef, ma_uint64* pAvailableFrames)
-{
-    if (pAvailableFrames == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pAvailableFrames = 0;
-
-    if (pAudioBufferRef == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pAudioBufferRef->sizeInFrames <= pAudioBufferRef->cursor) {
-        *pAvailableFrames = 0;
-    } else {
-        *pAvailableFrames = pAudioBufferRef->sizeInFrames - pAudioBufferRef->cursor;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-
-
-MA_API ma_audio_buffer_config ma_audio_buffer_config_init(ma_format format, ma_uint32 channels, ma_uint64 sizeInFrames, const void* pData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_audio_buffer_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format       = format;
-    config.channels     = channels;
-    config.sampleRate   = 0;    /* TODO: Version 0.12. Set this to sampleRate. */
-    config.sizeInFrames = sizeInFrames;
-    config.pData        = pData;
-    ma_allocation_callbacks_init_copy(&config.allocationCallbacks, pAllocationCallbacks);
-
-    return config;
-}
-
-static ma_result ma_audio_buffer_init_ex(const ma_audio_buffer_config* pConfig, ma_bool32 doCopy, ma_audio_buffer* pAudioBuffer)
-{
-    ma_result result;
-
-    if (pAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_MEMORY(pAudioBuffer, sizeof(*pAudioBuffer) - sizeof(pAudioBuffer->_pExtraData));   /* Safety. Don't overwrite the extra data. */
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->sizeInFrames == 0) {
-        return MA_INVALID_ARGS; /* Not allowing buffer sizes of 0 frames. */
-    }
-
-    result = ma_audio_buffer_ref_init(pConfig->format, pConfig->channels, NULL, 0, &pAudioBuffer->ref);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* TODO: Version 0.12. Set this in ma_audio_buffer_ref_init() instead of here. */
-    pAudioBuffer->ref.sampleRate = pConfig->sampleRate;
-
-    ma_allocation_callbacks_init_copy(&pAudioBuffer->allocationCallbacks, &pConfig->allocationCallbacks);
-
-    if (doCopy) {
-        ma_uint64 allocationSizeInBytes;
-        void* pData;
-
-        allocationSizeInBytes = pConfig->sizeInFrames * ma_get_bytes_per_frame(pConfig->format, pConfig->channels);
-        if (allocationSizeInBytes > MA_SIZE_MAX) {
-            return MA_OUT_OF_MEMORY;    /* Too big. */
-        }
-
-        pData = ma_malloc((size_t)allocationSizeInBytes, &pAudioBuffer->allocationCallbacks);   /* Safe cast to size_t. */
-        if (pData == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        if (pConfig->pData != NULL) {
-            ma_copy_pcm_frames(pData, pConfig->pData, pConfig->sizeInFrames, pConfig->format, pConfig->channels);
-        } else {
-            ma_silence_pcm_frames(pData, pConfig->sizeInFrames, pConfig->format, pConfig->channels);
-        }
-
-        ma_audio_buffer_ref_set_data(&pAudioBuffer->ref, pData, pConfig->sizeInFrames);
-        pAudioBuffer->ownsData = MA_TRUE;
-    } else {
-        ma_audio_buffer_ref_set_data(&pAudioBuffer->ref, pConfig->pData, pConfig->sizeInFrames);
-        pAudioBuffer->ownsData = MA_FALSE;
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_audio_buffer_uninit_ex(ma_audio_buffer* pAudioBuffer, ma_bool32 doFree)
-{
-    if (pAudioBuffer == NULL) {
-        return;
-    }
-
-    if (pAudioBuffer->ownsData && pAudioBuffer->ref.pData != &pAudioBuffer->_pExtraData[0]) {
-        ma_free((void*)pAudioBuffer->ref.pData, &pAudioBuffer->allocationCallbacks);    /* Naugty const cast, but OK in this case since we've guarded it with the ownsData check. */
-    }
-
-    if (doFree) {
-        ma_free(pAudioBuffer, &pAudioBuffer->allocationCallbacks);
-    }
-
-    ma_audio_buffer_ref_uninit(&pAudioBuffer->ref);
-}
-
-MA_API ma_result ma_audio_buffer_init(const ma_audio_buffer_config* pConfig, ma_audio_buffer* pAudioBuffer)
-{
-    return ma_audio_buffer_init_ex(pConfig, MA_FALSE, pAudioBuffer);
-}
-
-MA_API ma_result ma_audio_buffer_init_copy(const ma_audio_buffer_config* pConfig, ma_audio_buffer* pAudioBuffer)
-{
-    return ma_audio_buffer_init_ex(pConfig, MA_TRUE, pAudioBuffer);
-}
-
-MA_API ma_result ma_audio_buffer_alloc_and_init(const ma_audio_buffer_config* pConfig, ma_audio_buffer** ppAudioBuffer)
-{
-    ma_result result;
-    ma_audio_buffer* pAudioBuffer;
-    ma_audio_buffer_config innerConfig; /* We'll be making some changes to the config, so need to make a copy. */
-    ma_uint64 allocationSizeInBytes;
-
-    if (ppAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *ppAudioBuffer = NULL;  /* Safety. */
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    innerConfig = *pConfig;
-    ma_allocation_callbacks_init_copy(&innerConfig.allocationCallbacks, &pConfig->allocationCallbacks);
-
-    allocationSizeInBytes = sizeof(*pAudioBuffer) - sizeof(pAudioBuffer->_pExtraData) + (pConfig->sizeInFrames * ma_get_bytes_per_frame(pConfig->format, pConfig->channels));
-    if (allocationSizeInBytes > MA_SIZE_MAX) {
-        return MA_OUT_OF_MEMORY;    /* Too big. */
-    }
-
-    pAudioBuffer = (ma_audio_buffer*)ma_malloc((size_t)allocationSizeInBytes, &innerConfig.allocationCallbacks);  /* Safe cast to size_t. */
-    if (pAudioBuffer == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    if (pConfig->pData != NULL) {
-        ma_copy_pcm_frames(&pAudioBuffer->_pExtraData[0], pConfig->pData, pConfig->sizeInFrames, pConfig->format, pConfig->channels);
-    } else {
-        ma_silence_pcm_frames(&pAudioBuffer->_pExtraData[0], pConfig->sizeInFrames, pConfig->format, pConfig->channels);
-    }
-
-    innerConfig.pData = &pAudioBuffer->_pExtraData[0];
-
-    result = ma_audio_buffer_init_ex(&innerConfig, MA_FALSE, pAudioBuffer);
-    if (result != MA_SUCCESS) {
-        ma_free(pAudioBuffer, &innerConfig.allocationCallbacks);
-        return result;
-    }
-
-    *ppAudioBuffer = pAudioBuffer;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_audio_buffer_uninit(ma_audio_buffer* pAudioBuffer)
-{
-    ma_audio_buffer_uninit_ex(pAudioBuffer, MA_FALSE);
-}
-
-MA_API void ma_audio_buffer_uninit_and_free(ma_audio_buffer* pAudioBuffer)
-{
-    ma_audio_buffer_uninit_ex(pAudioBuffer, MA_TRUE);
-}
-
-MA_API ma_uint64 ma_audio_buffer_read_pcm_frames(ma_audio_buffer* pAudioBuffer, void* pFramesOut, ma_uint64 frameCount, ma_bool32 loop)
-{
-    if (pAudioBuffer == NULL) {
-        return 0;
-    }
-
-    return ma_audio_buffer_ref_read_pcm_frames(&pAudioBuffer->ref, pFramesOut, frameCount, loop);
-}
-
-MA_API ma_result ma_audio_buffer_seek_to_pcm_frame(ma_audio_buffer* pAudioBuffer, ma_uint64 frameIndex)
-{
-    if (pAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_audio_buffer_ref_seek_to_pcm_frame(&pAudioBuffer->ref, frameIndex);
-}
-
-MA_API ma_result ma_audio_buffer_map(ma_audio_buffer* pAudioBuffer, void** ppFramesOut, ma_uint64* pFrameCount)
-{
-    if (ppFramesOut != NULL) {
-        *ppFramesOut = NULL;    /* Safety. */
-    }
-
-    if (pAudioBuffer == NULL) {
-        if (pFrameCount != NULL) {
-            *pFrameCount = 0;
-        }
-
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_audio_buffer_ref_map(&pAudioBuffer->ref, ppFramesOut, pFrameCount);
-}
-
-MA_API ma_result ma_audio_buffer_unmap(ma_audio_buffer* pAudioBuffer, ma_uint64 frameCount)
-{
-    if (pAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_audio_buffer_ref_unmap(&pAudioBuffer->ref, frameCount);
-}
-
-MA_API ma_bool32 ma_audio_buffer_at_end(const ma_audio_buffer* pAudioBuffer)
-{
-    if (pAudioBuffer == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_audio_buffer_ref_at_end(&pAudioBuffer->ref);
-}
-
-MA_API ma_result ma_audio_buffer_get_cursor_in_pcm_frames(const ma_audio_buffer* pAudioBuffer, ma_uint64* pCursor)
-{
-    if (pAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_audio_buffer_ref_get_cursor_in_pcm_frames(&pAudioBuffer->ref, pCursor);
-}
-
-MA_API ma_result ma_audio_buffer_get_length_in_pcm_frames(const ma_audio_buffer* pAudioBuffer, ma_uint64* pLength)
-{
-    if (pAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_audio_buffer_ref_get_length_in_pcm_frames(&pAudioBuffer->ref, pLength);
-}
-
-MA_API ma_result ma_audio_buffer_get_available_frames(const ma_audio_buffer* pAudioBuffer, ma_uint64* pAvailableFrames)
-{
-    if (pAvailableFrames == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pAvailableFrames = 0;
-
-    if (pAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_audio_buffer_ref_get_available_frames(&pAudioBuffer->ref, pAvailableFrames);
-}
-
-
-
-
-
-MA_API ma_result ma_paged_audio_buffer_data_init(ma_format format, ma_uint32 channels, ma_paged_audio_buffer_data* pData)
-{
-    if (pData == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pData);
-
-    pData->format   = format;
-    pData->channels = channels;
-    pData->pTail    = &pData->head;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_paged_audio_buffer_data_uninit(ma_paged_audio_buffer_data* pData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_paged_audio_buffer_page* pPage;
-
-    if (pData == NULL) {
-        return;
-    }
-
-    /* All pages need to be freed. */
-    pPage = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&pData->head.pNext);
-    while (pPage != NULL) {
-        ma_paged_audio_buffer_page* pNext = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&pPage->pNext);
-
-        ma_free(pPage, pAllocationCallbacks);
-        pPage = pNext;
-    }
-}
-
-MA_API ma_paged_audio_buffer_page* ma_paged_audio_buffer_data_get_head(ma_paged_audio_buffer_data* pData)
-{
-    if (pData == NULL) {
-        return NULL;
-    }
-
-    return &pData->head;
-}
-
-MA_API ma_paged_audio_buffer_page* ma_paged_audio_buffer_data_get_tail(ma_paged_audio_buffer_data* pData)
-{
-    if (pData == NULL) {
-        return NULL;
-    }
-
-    return pData->pTail;
-}
-
-MA_API ma_result ma_paged_audio_buffer_data_get_length_in_pcm_frames(ma_paged_audio_buffer_data* pData, ma_uint64* pLength)
-{
-    ma_paged_audio_buffer_page* pPage;
-
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;
-
-    if (pData == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Calculate the length from the linked list. */
-    for (pPage = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&pData->head.pNext); pPage != NULL; pPage = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&pPage->pNext)) {
-        *pLength += pPage->sizeInFrames;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_paged_audio_buffer_data_allocate_page(ma_paged_audio_buffer_data* pData, ma_uint64 pageSizeInFrames, const void* pInitialData, const ma_allocation_callbacks* pAllocationCallbacks, ma_paged_audio_buffer_page** ppPage)
-{
-    ma_paged_audio_buffer_page* pPage;
-    ma_uint64 allocationSize;
-
-    if (ppPage == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *ppPage = NULL;
-
-    if (pData == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    allocationSize = sizeof(*pPage) + (pageSizeInFrames * ma_get_bytes_per_frame(pData->format, pData->channels));
-    if (allocationSize > MA_SIZE_MAX) {
-        return MA_OUT_OF_MEMORY;    /* Too big. */
-    }
-
-    pPage = (ma_paged_audio_buffer_page*)ma_malloc((size_t)allocationSize, pAllocationCallbacks);   /* Safe cast to size_t. */
-    if (pPage == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    pPage->pNext = NULL;
-    pPage->sizeInFrames = pageSizeInFrames;
-
-    if (pInitialData != NULL) {
-        ma_copy_pcm_frames(pPage->pAudioData, pInitialData, pageSizeInFrames, pData->format, pData->channels);
-    }
-
-    *ppPage = pPage;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_paged_audio_buffer_data_free_page(ma_paged_audio_buffer_data* pData, ma_paged_audio_buffer_page* pPage, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pData == NULL || pPage == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* It's assumed the page is not attached to the list. */
-    ma_free(pPage, pAllocationCallbacks);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_paged_audio_buffer_data_append_page(ma_paged_audio_buffer_data* pData, ma_paged_audio_buffer_page* pPage)
-{
-    if (pData == NULL || pPage == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* This function assumes the page has been filled with audio data by this point. As soon as we append, the page will be available for reading. */
-
-    /* First thing to do is update the tail. */
-    for (;;) {
-        ma_paged_audio_buffer_page* pOldTail = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&pData->pTail);
-        ma_paged_audio_buffer_page* pNewTail = pPage;
-
-        if (ma_atomic_compare_exchange_weak_ptr((volatile void**)&pData->pTail, (void**)&pOldTail, pNewTail)) {
-            /* Here is where we append the page to the list. After this, the page is attached to the list and ready to be read from. */
-            ma_atomic_exchange_ptr(&pOldTail->pNext, pPage);
-            break;  /* Done. */
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_paged_audio_buffer_data_allocate_and_append_page(ma_paged_audio_buffer_data* pData, ma_uint32 pageSizeInFrames, const void* pInitialData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_result result;
-    ma_paged_audio_buffer_page* pPage;
-
-    result = ma_paged_audio_buffer_data_allocate_page(pData, pageSizeInFrames, pInitialData, pAllocationCallbacks, &pPage);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_paged_audio_buffer_data_append_page(pData, pPage);    /* <-- Should never fail. */
-}
-
-
-MA_API ma_paged_audio_buffer_config ma_paged_audio_buffer_config_init(ma_paged_audio_buffer_data* pData)
-{
-    ma_paged_audio_buffer_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.pData = pData;
-
-    return config;
-}
-
-
-static ma_result ma_paged_audio_buffer__data_source_on_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_paged_audio_buffer_read_pcm_frames((ma_paged_audio_buffer*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_paged_audio_buffer__data_source_on_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_paged_audio_buffer_seek_to_pcm_frame((ma_paged_audio_buffer*)pDataSource, frameIndex);
-}
-
-static ma_result ma_paged_audio_buffer__data_source_on_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    ma_paged_audio_buffer* pPagedAudioBuffer = (ma_paged_audio_buffer*)pDataSource;
-
-    *pFormat     = pPagedAudioBuffer->pData->format;
-    *pChannels   = pPagedAudioBuffer->pData->channels;
-    *pSampleRate = 0;   /* There is no notion of a sample rate with audio buffers. */
-    ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pPagedAudioBuffer->pData->channels);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_paged_audio_buffer__data_source_on_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_paged_audio_buffer_get_cursor_in_pcm_frames((ma_paged_audio_buffer*)pDataSource, pCursor);
-}
-
-static ma_result ma_paged_audio_buffer__data_source_on_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_paged_audio_buffer_get_length_in_pcm_frames((ma_paged_audio_buffer*)pDataSource, pLength);
-}
-
-static ma_data_source_vtable g_ma_paged_audio_buffer_data_source_vtable =
-{
-    ma_paged_audio_buffer__data_source_on_read,
-    ma_paged_audio_buffer__data_source_on_seek,
-    ma_paged_audio_buffer__data_source_on_get_data_format,
-    ma_paged_audio_buffer__data_source_on_get_cursor,
-    ma_paged_audio_buffer__data_source_on_get_length,
-    NULL,   /* onSetLooping */
-    0
-};
-
-MA_API ma_result ma_paged_audio_buffer_init(const ma_paged_audio_buffer_config* pConfig, ma_paged_audio_buffer* pPagedAudioBuffer)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    if (pPagedAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pPagedAudioBuffer);
-
-    /* A config is required for the format and channel count. */
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->pData == NULL) {
-        return MA_INVALID_ARGS; /* No underlying data specified. */
-    }
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_paged_audio_buffer_data_source_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pPagedAudioBuffer->ds);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pPagedAudioBuffer->pData          = pConfig->pData;
-    pPagedAudioBuffer->pCurrent       = ma_paged_audio_buffer_data_get_head(pConfig->pData);
-    pPagedAudioBuffer->relativeCursor = 0;
-    pPagedAudioBuffer->absoluteCursor = 0;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_paged_audio_buffer_uninit(ma_paged_audio_buffer* pPagedAudioBuffer)
-{
-    if (pPagedAudioBuffer == NULL) {
-        return;
-    }
-
-    /* Nothing to do. The data needs to be deleted separately. */
-}
-
-MA_API ma_result ma_paged_audio_buffer_read_pcm_frames(ma_paged_audio_buffer* pPagedAudioBuffer, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 totalFramesRead = 0;
-    ma_format format;
-    ma_uint32 channels;
-
-    if (pPagedAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    format   = pPagedAudioBuffer->pData->format;
-    channels = pPagedAudioBuffer->pData->channels;
-
-    while (totalFramesRead < frameCount) {
-        /* Read from the current page. The buffer should never be in a state where this is NULL. */
-        ma_uint64 framesRemainingInCurrentPage;
-        ma_uint64 framesRemainingToRead = frameCount - totalFramesRead;
-        ma_uint64 framesToReadThisIteration;
-
-        MA_ASSERT(pPagedAudioBuffer->pCurrent != NULL);
-
-        framesRemainingInCurrentPage = pPagedAudioBuffer->pCurrent->sizeInFrames - pPagedAudioBuffer->relativeCursor;
-
-        framesToReadThisIteration = ma_min(framesRemainingInCurrentPage, framesRemainingToRead);
-        ma_copy_pcm_frames(ma_offset_pcm_frames_ptr(pFramesOut, totalFramesRead, format, channels), ma_offset_pcm_frames_ptr(pPagedAudioBuffer->pCurrent->pAudioData, pPagedAudioBuffer->relativeCursor, format, channels), framesToReadThisIteration, format, channels);
-        totalFramesRead += framesToReadThisIteration;
-
-        pPagedAudioBuffer->absoluteCursor += framesToReadThisIteration;
-        pPagedAudioBuffer->relativeCursor += framesToReadThisIteration;
-
-        /* Move to the next page if necessary. If there's no more pages, we need to return MA_AT_END. */
-        MA_ASSERT(pPagedAudioBuffer->relativeCursor <= pPagedAudioBuffer->pCurrent->sizeInFrames);
-
-        if (pPagedAudioBuffer->relativeCursor == pPagedAudioBuffer->pCurrent->sizeInFrames) {
-            /* We reached the end of the page. Need to move to the next. If there's no more pages, we're done. */
-            ma_paged_audio_buffer_page* pNext = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&pPagedAudioBuffer->pCurrent->pNext);
-            if (pNext == NULL) {
-                result = MA_AT_END;
-                break;  /* We've reached the end. */
-            } else {
-                pPagedAudioBuffer->pCurrent       = pNext;
-                pPagedAudioBuffer->relativeCursor = 0;
-            }
-        }
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalFramesRead;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_paged_audio_buffer_seek_to_pcm_frame(ma_paged_audio_buffer* pPagedAudioBuffer, ma_uint64 frameIndex)
-{
-    if (pPagedAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (frameIndex == pPagedAudioBuffer->absoluteCursor) {
-        return MA_SUCCESS;  /* Nothing to do. */
-    }
-
-    if (frameIndex < pPagedAudioBuffer->absoluteCursor) {
-        /* Moving backwards. Need to move the cursor back to the start, and then move forward. */
-        pPagedAudioBuffer->pCurrent       = ma_paged_audio_buffer_data_get_head(pPagedAudioBuffer->pData);
-        pPagedAudioBuffer->absoluteCursor = 0;
-        pPagedAudioBuffer->relativeCursor = 0;
-
-        /* Fall through to the forward seeking section below. */
-    }
-
-    if (frameIndex > pPagedAudioBuffer->absoluteCursor) {
-        /* Moving forward. */
-        ma_paged_audio_buffer_page* pPage;
-        ma_uint64 runningCursor = 0;
-
-        for (pPage = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&ma_paged_audio_buffer_data_get_head(pPagedAudioBuffer->pData)->pNext); pPage != NULL; pPage = (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(&pPage->pNext)) {
-            ma_uint64 pageRangeBeg = runningCursor;
-            ma_uint64 pageRangeEnd = pageRangeBeg + pPage->sizeInFrames;
-
-            if (frameIndex >= pageRangeBeg) {
-                if (frameIndex < pageRangeEnd || (frameIndex == pageRangeEnd && pPage == (ma_paged_audio_buffer_page*)ma_atomic_load_ptr(ma_paged_audio_buffer_data_get_tail(pPagedAudioBuffer->pData)))) {  /* A small edge case - allow seeking to the very end of the buffer. */
-                    /* We found the page. */
-                    pPagedAudioBuffer->pCurrent       = pPage;
-                    pPagedAudioBuffer->absoluteCursor = frameIndex;
-                    pPagedAudioBuffer->relativeCursor = frameIndex - pageRangeBeg;
-                    return MA_SUCCESS;
-                }
-            }
-
-            runningCursor = pageRangeEnd;
-        }
-
-        /* Getting here means we tried seeking too far forward. Don't change any state. */
-        return MA_BAD_SEEK;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_paged_audio_buffer_get_cursor_in_pcm_frames(ma_paged_audio_buffer* pPagedAudioBuffer, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;   /* Safety. */
-
-    if (pPagedAudioBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = pPagedAudioBuffer->absoluteCursor;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_paged_audio_buffer_get_length_in_pcm_frames(ma_paged_audio_buffer* pPagedAudioBuffer, ma_uint64* pLength)
-{
-    return ma_paged_audio_buffer_data_get_length_in_pcm_frames(pPagedAudioBuffer->pData, pLength);
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-VFS
-
-**************************************************************************************************************************************************************/
-MA_API ma_result ma_vfs_open(ma_vfs* pVFS, const char* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-
-    if (pFile == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pFile = NULL;
-
-    if (pVFS == NULL || pFilePath == NULL || openMode == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onOpen == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pCallbacks->onOpen(pVFS, pFilePath, openMode, pFile);
-}
-
-MA_API ma_result ma_vfs_open_w(ma_vfs* pVFS, const wchar_t* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-
-    if (pFile == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pFile = NULL;
-
-    if (pVFS == NULL || pFilePath == NULL || openMode == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onOpenW == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pCallbacks->onOpenW(pVFS, pFilePath, openMode, pFile);
-}
-
-MA_API ma_result ma_vfs_close(ma_vfs* pVFS, ma_vfs_file file)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-
-    if (pVFS == NULL || file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onClose == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pCallbacks->onClose(pVFS, file);
-}
-
-MA_API ma_result ma_vfs_read(ma_vfs* pVFS, ma_vfs_file file, void* pDst, size_t sizeInBytes, size_t* pBytesRead)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-    ma_result result;
-    size_t bytesRead = 0;
-
-    if (pBytesRead != NULL) {
-        *pBytesRead = 0;
-    }
-
-    if (pVFS == NULL || file == NULL || pDst == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onRead == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    result = pCallbacks->onRead(pVFS, file, pDst, sizeInBytes, &bytesRead);
-
-    if (pBytesRead != NULL) {
-        *pBytesRead = bytesRead;
-    }
-
-    if (result == MA_SUCCESS && bytesRead == 0 && sizeInBytes > 0) {
-        result  = MA_AT_END;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_vfs_write(ma_vfs* pVFS, ma_vfs_file file, const void* pSrc, size_t sizeInBytes, size_t* pBytesWritten)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-
-    if (pBytesWritten != NULL) {
-        *pBytesWritten = 0;
-    }
-
-    if (pVFS == NULL || file == NULL || pSrc == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onWrite == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pCallbacks->onWrite(pVFS, file, pSrc, sizeInBytes, pBytesWritten);
-}
-
-MA_API ma_result ma_vfs_seek(ma_vfs* pVFS, ma_vfs_file file, ma_int64 offset, ma_seek_origin origin)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-
-    if (pVFS == NULL || file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onSeek == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pCallbacks->onSeek(pVFS, file, offset, origin);
-}
-
-MA_API ma_result ma_vfs_tell(ma_vfs* pVFS, ma_vfs_file file, ma_int64* pCursor)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    if (pVFS == NULL || file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onTell == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pCallbacks->onTell(pVFS, file, pCursor);
-}
-
-MA_API ma_result ma_vfs_info(ma_vfs* pVFS, ma_vfs_file file, ma_file_info* pInfo)
-{
-    ma_vfs_callbacks* pCallbacks = (ma_vfs_callbacks*)pVFS;
-
-    if (pInfo == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pInfo);
-
-    if (pVFS == NULL || file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pCallbacks->onInfo == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pCallbacks->onInfo(pVFS, file, pInfo);
-}
-
-
-#if !defined(MA_USE_WIN32_FILEIO) && (defined(MA_WIN32) && defined(MA_WIN32_DESKTOP) && !defined(MA_NO_WIN32_FILEIO) && !defined(MA_POSIX))
-    #define MA_USE_WIN32_FILEIO
-#endif
-
-#if defined(MA_USE_WIN32_FILEIO)
-/*
-We need to dynamically load SetFilePointer or SetFilePointerEx because older versions of Windows do
-not have the Ex version. We therefore need to do some dynamic branching depending on what's available.
-
-We load these when we load our first file from the default VFS. It's left open for the life of the
-program and is left to the OS to uninitialize when the program terminates.
-*/
-typedef DWORD (__stdcall * ma_SetFilePointer_proc)(HANDLE hFile, LONG lDistanceToMove, LONG* lpDistanceToMoveHigh, DWORD dwMoveMethod);
-typedef BOOL  (__stdcall * ma_SetFilePointerEx_proc)(HANDLE hFile, LARGE_INTEGER liDistanceToMove, LARGE_INTEGER* lpNewFilePointer, DWORD dwMoveMethod);
-
-static ma_handle hKernel32DLL = NULL;
-static ma_SetFilePointer_proc   ma_SetFilePointer   = NULL;
-static ma_SetFilePointerEx_proc ma_SetFilePointerEx = NULL;
-
-static void ma_win32_fileio_init(void)
-{
-    if (hKernel32DLL == NULL) {
-        hKernel32DLL = ma_dlopen(NULL, "kernel32.dll");
-        if (hKernel32DLL != NULL) {
-            ma_SetFilePointer   = (ma_SetFilePointer_proc)  ma_dlsym(NULL, hKernel32DLL, "SetFilePointer");
-            ma_SetFilePointerEx = (ma_SetFilePointerEx_proc)ma_dlsym(NULL, hKernel32DLL, "SetFilePointerEx");
-        }
-    }
-}
-
-static void ma_default_vfs__get_open_settings_win32(ma_uint32 openMode, DWORD* pDesiredAccess, DWORD* pShareMode, DWORD* pCreationDisposition)
-{
-    *pDesiredAccess = 0;
-    if ((openMode & MA_OPEN_MODE_READ) != 0) {
-        *pDesiredAccess |= GENERIC_READ;
-    }
-    if ((openMode & MA_OPEN_MODE_WRITE) != 0) {
-        *pDesiredAccess |= GENERIC_WRITE;
-    }
-
-    *pShareMode = 0;
-    if ((openMode & MA_OPEN_MODE_READ) != 0) {
-        *pShareMode |= FILE_SHARE_READ;
-    }
-
-    if ((openMode & MA_OPEN_MODE_WRITE) != 0) {
-        *pCreationDisposition = CREATE_ALWAYS;  /* Opening in write mode. Truncate. */
-    } else {
-        *pCreationDisposition = OPEN_EXISTING;  /* Opening in read mode. File must exist. */
-    }
-}
-
-static ma_result ma_default_vfs_open__win32(ma_vfs* pVFS, const char* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    HANDLE hFile;
-    DWORD dwDesiredAccess;
-    DWORD dwShareMode;
-    DWORD dwCreationDisposition;
-
-    (void)pVFS;
-
-    /* Load some Win32 symbols dynamically so we can dynamically check for the existence of SetFilePointerEx. */
-    ma_win32_fileio_init();
-
-    ma_default_vfs__get_open_settings_win32(openMode, &dwDesiredAccess, &dwShareMode, &dwCreationDisposition);
-
-    hFile = CreateFileA(pFilePath, dwDesiredAccess, dwShareMode, NULL, dwCreationDisposition, FILE_ATTRIBUTE_NORMAL, NULL);
-    if (hFile == INVALID_HANDLE_VALUE) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    *pFile = hFile;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_open_w__win32(ma_vfs* pVFS, const wchar_t* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    HANDLE hFile;
-    DWORD dwDesiredAccess;
-    DWORD dwShareMode;
-    DWORD dwCreationDisposition;
-
-    (void)pVFS;
-
-    /* Load some Win32 symbols dynamically so we can dynamically check for the existence of SetFilePointerEx. */
-    ma_win32_fileio_init();
-
-    ma_default_vfs__get_open_settings_win32(openMode, &dwDesiredAccess, &dwShareMode, &dwCreationDisposition);
-
-    hFile = CreateFileW(pFilePath, dwDesiredAccess, dwShareMode, NULL, dwCreationDisposition, FILE_ATTRIBUTE_NORMAL, NULL);
-    if (hFile == INVALID_HANDLE_VALUE) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    *pFile = hFile;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_close__win32(ma_vfs* pVFS, ma_vfs_file file)
-{
-    (void)pVFS;
-
-    if (CloseHandle((HANDLE)file) == 0) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_default_vfs_read__win32(ma_vfs* pVFS, ma_vfs_file file, void* pDst, size_t sizeInBytes, size_t* pBytesRead)
-{
-    ma_result result = MA_SUCCESS;
-    size_t totalBytesRead;
-
-    (void)pVFS;
-
-    totalBytesRead = 0;
-    while (totalBytesRead < sizeInBytes) {
-        size_t bytesRemaining;
-        DWORD bytesToRead;
-        DWORD bytesRead;
-        BOOL readResult;
-
-        bytesRemaining = sizeInBytes - totalBytesRead;
-        if (bytesRemaining >= 0xFFFFFFFF) {
-            bytesToRead = 0xFFFFFFFF;
-        } else {
-            bytesToRead = (DWORD)bytesRemaining;
-        }
-
-        readResult = ReadFile((HANDLE)file, ma_offset_ptr(pDst, totalBytesRead), bytesToRead, &bytesRead, NULL);
-        if (readResult == 1 && bytesRead == 0) {
-            result = MA_AT_END;
-            break;  /* EOF */
-        }
-
-        totalBytesRead += bytesRead;
-
-        if (bytesRead < bytesToRead) {
-            break;  /* EOF */
-        }
-
-        if (readResult == 0) {
-            result = ma_result_from_GetLastError(GetLastError());
-            break;
-        }
-    }
-
-    if (pBytesRead != NULL) {
-        *pBytesRead = totalBytesRead;
-    }
-
-    return result;
-}
-
-static ma_result ma_default_vfs_write__win32(ma_vfs* pVFS, ma_vfs_file file, const void* pSrc, size_t sizeInBytes, size_t* pBytesWritten)
-{
-    ma_result result = MA_SUCCESS;
-    size_t totalBytesWritten;
-
-    (void)pVFS;
-
-    totalBytesWritten = 0;
-    while (totalBytesWritten < sizeInBytes) {
-        size_t bytesRemaining;
-        DWORD bytesToWrite;
-        DWORD bytesWritten;
-        BOOL writeResult;
-
-        bytesRemaining = sizeInBytes - totalBytesWritten;
-        if (bytesRemaining >= 0xFFFFFFFF) {
-            bytesToWrite = 0xFFFFFFFF;
-        } else {
-            bytesToWrite = (DWORD)bytesRemaining;
-        }
-
-        writeResult = WriteFile((HANDLE)file, ma_offset_ptr(pSrc, totalBytesWritten), bytesToWrite, &bytesWritten, NULL);
-        totalBytesWritten += bytesWritten;
-
-        if (writeResult == 0) {
-            result = ma_result_from_GetLastError(GetLastError());
-            break;
-        }
-    }
-
-    if (pBytesWritten != NULL) {
-        *pBytesWritten = totalBytesWritten;
-    }
-
-    return result;
-}
-
-
-static ma_result ma_default_vfs_seek__win32(ma_vfs* pVFS, ma_vfs_file file, ma_int64 offset, ma_seek_origin origin)
-{
-    LARGE_INTEGER liDistanceToMove;
-    DWORD dwMoveMethod;
-    BOOL result;
-
-    (void)pVFS;
-
-    liDistanceToMove.QuadPart = offset;
-
-    /*  */ if (origin == ma_seek_origin_current) {
-        dwMoveMethod = FILE_CURRENT;
-    } else if (origin == ma_seek_origin_end) {
-        dwMoveMethod = FILE_END;
-    } else {
-        dwMoveMethod = FILE_BEGIN;
-    }
-
-    if (ma_SetFilePointerEx != NULL) {
-        result = ma_SetFilePointerEx((HANDLE)file, liDistanceToMove, NULL, dwMoveMethod);
-    } else if (ma_SetFilePointer != NULL) {
-        /* No SetFilePointerEx() so restrict to 31 bits. */
-        if (origin > 0x7FFFFFFF) {
-            return MA_OUT_OF_RANGE;
-        }
-
-        result = ma_SetFilePointer((HANDLE)file, (LONG)liDistanceToMove.QuadPart, NULL, dwMoveMethod);
-    } else {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    if (result == 0) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_tell__win32(ma_vfs* pVFS, ma_vfs_file file, ma_int64* pCursor)
-{
-    LARGE_INTEGER liZero;
-    LARGE_INTEGER liTell;
-    BOOL result;
-
-    (void)pVFS;
-
-    liZero.QuadPart = 0;
-
-    if (ma_SetFilePointerEx != NULL) {
-        result = ma_SetFilePointerEx((HANDLE)file, liZero, &liTell, FILE_CURRENT);
-    } else if (ma_SetFilePointer != NULL) {
-        LONG tell;
-
-        result = ma_SetFilePointer((HANDLE)file, (LONG)liZero.QuadPart, &tell, FILE_CURRENT);
-        liTell.QuadPart = tell;
-    } else {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    if (result == 0) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    if (pCursor != NULL) {
-        *pCursor = liTell.QuadPart;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_info__win32(ma_vfs* pVFS, ma_vfs_file file, ma_file_info* pInfo)
-{
-    BY_HANDLE_FILE_INFORMATION fi;
-    BOOL result;
-
-    (void)pVFS;
-
-    result = GetFileInformationByHandle((HANDLE)file, &fi);
-    if (result == 0) {
-        return ma_result_from_GetLastError(GetLastError());
-    }
-
-    pInfo->sizeInBytes = ((ma_uint64)fi.nFileSizeHigh << 32) | ((ma_uint64)fi.nFileSizeLow);
-
-    return MA_SUCCESS;
-}
-#else
-static ma_result ma_default_vfs_open__stdio(ma_vfs* pVFS, const char* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    ma_result result;
-    FILE* pFileStd;
-    const char* pOpenModeStr;
-
-    MA_ASSERT(pFilePath != NULL);
-    MA_ASSERT(openMode  != 0);
-    MA_ASSERT(pFile     != NULL);
-
-    (void)pVFS;
-
-    if ((openMode & MA_OPEN_MODE_READ) != 0) {
-        if ((openMode & MA_OPEN_MODE_WRITE) != 0) {
-            pOpenModeStr = "r+";
-        } else {
-            pOpenModeStr = "rb";
-        }
-    } else {
-        pOpenModeStr = "wb";
-    }
-
-    result = ma_fopen(&pFileStd, pFilePath, pOpenModeStr);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pFile = pFileStd;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_open_w__stdio(ma_vfs* pVFS, const wchar_t* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    ma_result result;
-    FILE* pFileStd;
-    const wchar_t* pOpenModeStr;
-
-    MA_ASSERT(pFilePath != NULL);
-    MA_ASSERT(openMode  != 0);
-    MA_ASSERT(pFile     != NULL);
-
-    (void)pVFS;
-
-    if ((openMode & MA_OPEN_MODE_READ) != 0) {
-        if ((openMode & MA_OPEN_MODE_WRITE) != 0) {
-            pOpenModeStr = L"r+";
-        } else {
-            pOpenModeStr = L"rb";
-        }
-    } else {
-        pOpenModeStr = L"wb";
-    }
-
-    result = ma_wfopen(&pFileStd, pFilePath, pOpenModeStr, (pVFS != NULL) ? &((ma_default_vfs*)pVFS)->allocationCallbacks : NULL);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pFile = pFileStd;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_close__stdio(ma_vfs* pVFS, ma_vfs_file file)
-{
-    MA_ASSERT(file != NULL);
-
-    (void)pVFS;
-
-    fclose((FILE*)file);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_read__stdio(ma_vfs* pVFS, ma_vfs_file file, void* pDst, size_t sizeInBytes, size_t* pBytesRead)
-{
-    size_t result;
-
-    MA_ASSERT(file != NULL);
-    MA_ASSERT(pDst != NULL);
-
-    (void)pVFS;
-
-    result = fread(pDst, 1, sizeInBytes, (FILE*)file);
-
-    if (pBytesRead != NULL) {
-        *pBytesRead = result;
-    }
-
-    if (result != sizeInBytes) {
-        if (result == 0 && feof((FILE*)file)) {
-            return MA_AT_END;
-        } else {
-            return ma_result_from_errno(ferror((FILE*)file));
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_write__stdio(ma_vfs* pVFS, ma_vfs_file file, const void* pSrc, size_t sizeInBytes, size_t* pBytesWritten)
-{
-    size_t result;
-
-    MA_ASSERT(file != NULL);
-    MA_ASSERT(pSrc != NULL);
-
-    (void)pVFS;
-
-    result = fwrite(pSrc, 1, sizeInBytes, (FILE*)file);
-
-    if (pBytesWritten != NULL) {
-        *pBytesWritten = result;
-    }
-
-    if (result != sizeInBytes) {
-        return ma_result_from_errno(ferror((FILE*)file));
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_seek__stdio(ma_vfs* pVFS, ma_vfs_file file, ma_int64 offset, ma_seek_origin origin)
-{
-    int result;
-    int whence;
-
-    MA_ASSERT(file != NULL);
-
-    (void)pVFS;
-
-    if (origin == ma_seek_origin_start) {
-        whence = SEEK_SET;
-    } else if (origin == ma_seek_origin_end) {
-        whence = SEEK_END;
-    } else {
-        whence = SEEK_CUR;
-    }
-
-#if defined(_WIN32)
-    #if defined(_MSC_VER) && _MSC_VER > 1200
-        result = _fseeki64((FILE*)file, offset, whence);
-    #else
-        /* No _fseeki64() so restrict to 31 bits. */
-        if (origin > 0x7FFFFFFF) {
-            return MA_OUT_OF_RANGE;
-        }
-
-        result = fseek((FILE*)file, (int)offset, whence);
-    #endif
-#else
-    result = fseek((FILE*)file, (long int)offset, whence);
-#endif
-    if (result != 0) {
-        return MA_ERROR;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_default_vfs_tell__stdio(ma_vfs* pVFS, ma_vfs_file file, ma_int64* pCursor)
-{
-    ma_int64 result;
-
-    MA_ASSERT(file    != NULL);
-    MA_ASSERT(pCursor != NULL);
-
-    (void)pVFS;
-
-#if defined(_WIN32)
-    #if defined(_MSC_VER) && _MSC_VER > 1200
-        result = _ftelli64((FILE*)file);
-    #else
-        result = ftell((FILE*)file);
-    #endif
-#else
-    result = ftell((FILE*)file);
-#endif
-
-    *pCursor = result;
-
-    return MA_SUCCESS;
-}
-
-#if !defined(_MSC_VER) && !((defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 1) || defined(_XOPEN_SOURCE) || defined(_POSIX_SOURCE)) && !defined(MA_BSD)
-int fileno(FILE *stream);
-#endif
-
-static ma_result ma_default_vfs_info__stdio(ma_vfs* pVFS, ma_vfs_file file, ma_file_info* pInfo)
-{
-    int fd;
-    struct stat info;
-
-    MA_ASSERT(file  != NULL);
-    MA_ASSERT(pInfo != NULL);
-
-    (void)pVFS;
-
-#if defined(_MSC_VER)
-    fd = _fileno((FILE*)file);
-#else
-    fd =  fileno((FILE*)file);
-#endif
-
-    if (fstat(fd, &info) != 0) {
-        return ma_result_from_errno(errno);
-    }
-
-    pInfo->sizeInBytes = info.st_size;
-
-    return MA_SUCCESS;
-}
-#endif
-
-
-static ma_result ma_default_vfs_open(ma_vfs* pVFS, const char* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    if (pFile == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pFile = NULL;
-
-    if (pFilePath == NULL || openMode == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_open__win32(pVFS, pFilePath, openMode, pFile);
-#else
-    return ma_default_vfs_open__stdio(pVFS, pFilePath, openMode, pFile);
-#endif
-}
-
-static ma_result ma_default_vfs_open_w(ma_vfs* pVFS, const wchar_t* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    if (pFile == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pFile = NULL;
-
-    if (pFilePath == NULL || openMode == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_open_w__win32(pVFS, pFilePath, openMode, pFile);
-#else
-    return ma_default_vfs_open_w__stdio(pVFS, pFilePath, openMode, pFile);
-#endif
-}
-
-static ma_result ma_default_vfs_close(ma_vfs* pVFS, ma_vfs_file file)
-{
-    if (file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_close__win32(pVFS, file);
-#else
-    return ma_default_vfs_close__stdio(pVFS, file);
-#endif
-}
-
-static ma_result ma_default_vfs_read(ma_vfs* pVFS, ma_vfs_file file, void* pDst, size_t sizeInBytes, size_t* pBytesRead)
-{
-    if (pBytesRead != NULL) {
-        *pBytesRead = 0;
-    }
-
-    if (file == NULL || pDst == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_read__win32(pVFS, file, pDst, sizeInBytes, pBytesRead);
-#else
-    return ma_default_vfs_read__stdio(pVFS, file, pDst, sizeInBytes, pBytesRead);
-#endif
-}
-
-static ma_result ma_default_vfs_write(ma_vfs* pVFS, ma_vfs_file file, const void* pSrc, size_t sizeInBytes, size_t* pBytesWritten)
-{
-    if (pBytesWritten != NULL) {
-        *pBytesWritten = 0;
-    }
-
-    if (file == NULL || pSrc == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_write__win32(pVFS, file, pSrc, sizeInBytes, pBytesWritten);
-#else
-    return ma_default_vfs_write__stdio(pVFS, file, pSrc, sizeInBytes, pBytesWritten);
-#endif
-}
-
-static ma_result ma_default_vfs_seek(ma_vfs* pVFS, ma_vfs_file file, ma_int64 offset, ma_seek_origin origin)
-{
-    if (file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_seek__win32(pVFS, file, offset, origin);
-#else
-    return ma_default_vfs_seek__stdio(pVFS, file, offset, origin);
-#endif
-}
-
-static ma_result ma_default_vfs_tell(ma_vfs* pVFS, ma_vfs_file file, ma_int64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    if (file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_tell__win32(pVFS, file, pCursor);
-#else
-    return ma_default_vfs_tell__stdio(pVFS, file, pCursor);
-#endif
-}
-
-static ma_result ma_default_vfs_info(ma_vfs* pVFS, ma_vfs_file file, ma_file_info* pInfo)
-{
-    if (pInfo == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pInfo);
-
-    if (file == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-#if defined(MA_USE_WIN32_FILEIO)
-    return ma_default_vfs_info__win32(pVFS, file, pInfo);
-#else
-    return ma_default_vfs_info__stdio(pVFS, file, pInfo);
-#endif
-}
-
-
-MA_API ma_result ma_default_vfs_init(ma_default_vfs* pVFS, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pVFS == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pVFS->cb.onOpen  = ma_default_vfs_open;
-    pVFS->cb.onOpenW = ma_default_vfs_open_w;
-    pVFS->cb.onClose = ma_default_vfs_close;
-    pVFS->cb.onRead  = ma_default_vfs_read;
-    pVFS->cb.onWrite = ma_default_vfs_write;
-    pVFS->cb.onSeek  = ma_default_vfs_seek;
-    pVFS->cb.onTell  = ma_default_vfs_tell;
-    pVFS->cb.onInfo  = ma_default_vfs_info;
-    ma_allocation_callbacks_init_copy(&pVFS->allocationCallbacks, pAllocationCallbacks);
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_vfs_or_default_open(ma_vfs* pVFS, const char* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_open(pVFS, pFilePath, openMode, pFile);
-    } else {
-        return ma_default_vfs_open(pVFS, pFilePath, openMode, pFile);
-    }
-}
-
-MA_API ma_result ma_vfs_or_default_open_w(ma_vfs* pVFS, const wchar_t* pFilePath, ma_uint32 openMode, ma_vfs_file* pFile)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_open_w(pVFS, pFilePath, openMode, pFile);
-    } else {
-        return ma_default_vfs_open_w(pVFS, pFilePath, openMode, pFile);
-    }
-}
-
-MA_API ma_result ma_vfs_or_default_close(ma_vfs* pVFS, ma_vfs_file file)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_close(pVFS, file);
-    } else {
-        return ma_default_vfs_close(pVFS, file);
-    }
-}
-
-MA_API ma_result ma_vfs_or_default_read(ma_vfs* pVFS, ma_vfs_file file, void* pDst, size_t sizeInBytes, size_t* pBytesRead)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_read(pVFS, file, pDst, sizeInBytes, pBytesRead);
-    } else {
-        return ma_default_vfs_read(pVFS, file, pDst, sizeInBytes, pBytesRead);
-    }
-}
-
-MA_API ma_result ma_vfs_or_default_write(ma_vfs* pVFS, ma_vfs_file file, const void* pSrc, size_t sizeInBytes, size_t* pBytesWritten)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_write(pVFS, file, pSrc, sizeInBytes, pBytesWritten);
-    } else {
-        return ma_default_vfs_write(pVFS, file, pSrc, sizeInBytes, pBytesWritten);
-    }
-}
-
-MA_API ma_result ma_vfs_or_default_seek(ma_vfs* pVFS, ma_vfs_file file, ma_int64 offset, ma_seek_origin origin)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_seek(pVFS, file, offset, origin);
-    } else {
-        return ma_default_vfs_seek(pVFS, file, offset, origin);
-    }
-}
-
-MA_API ma_result ma_vfs_or_default_tell(ma_vfs* pVFS, ma_vfs_file file, ma_int64* pCursor)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_tell(pVFS, file, pCursor);
-    } else {
-        return ma_default_vfs_tell(pVFS, file, pCursor);
-    }
-}
-
-MA_API ma_result ma_vfs_or_default_info(ma_vfs* pVFS, ma_vfs_file file, ma_file_info* pInfo)
-{
-    if (pVFS != NULL) {
-        return ma_vfs_info(pVFS, file, pInfo);
-    } else {
-        return ma_default_vfs_info(pVFS, file, pInfo);
-    }
-}
-
-
-
-static ma_result ma_vfs_open_and_read_file_ex(ma_vfs* pVFS, const char* pFilePath, const wchar_t* pFilePathW, void** ppData, size_t* pSize, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_result result;
-    ma_vfs_file file;
-    ma_file_info info;
-    void* pData;
-    size_t bytesRead;
-
-    if (ppData != NULL) {
-        *ppData = NULL;
-    }
-    if (pSize != NULL) {
-        *pSize = 0;
-    }
-
-    if (ppData == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFilePath != NULL) {
-        result = ma_vfs_or_default_open(pVFS, pFilePath, MA_OPEN_MODE_READ, &file);
-    } else {
-        result = ma_vfs_or_default_open_w(pVFS, pFilePathW, MA_OPEN_MODE_READ, &file);
-    }
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_vfs_or_default_info(pVFS, file, &info);
-    if (result != MA_SUCCESS) {
-        ma_vfs_or_default_close(pVFS, file);
-        return result;
-    }
-
-    if (info.sizeInBytes > MA_SIZE_MAX) {
-        ma_vfs_or_default_close(pVFS, file);
-        return MA_TOO_BIG;
-    }
-
-    pData = ma_malloc((size_t)info.sizeInBytes, pAllocationCallbacks);  /* Safe cast. */
-    if (pData == NULL) {
-        ma_vfs_or_default_close(pVFS, file);
-        return result;
-    }
-
-    result = ma_vfs_or_default_read(pVFS, file, pData, (size_t)info.sizeInBytes, &bytesRead);  /* Safe cast. */
-    ma_vfs_or_default_close(pVFS, file);
-
-    if (result != MA_SUCCESS) {
-        ma_free(pData, pAllocationCallbacks);
-        return result;
-    }
-
-    if (pSize != NULL) {
-        *pSize = bytesRead;
-    }
-
-    MA_ASSERT(ppData != NULL);
-    *ppData = pData;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_vfs_open_and_read_file(ma_vfs* pVFS, const char* pFilePath, void** ppData, size_t* pSize, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_vfs_open_and_read_file_ex(pVFS, pFilePath, NULL, ppData, pSize, pAllocationCallbacks);
-}
-
-MA_API ma_result ma_vfs_open_and_read_file_w(ma_vfs* pVFS, const wchar_t* pFilePath, void** ppData, size_t* pSize, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_vfs_open_and_read_file_ex(pVFS, NULL, pFilePath, ppData, pSize, pAllocationCallbacks);
-}
-
-
-
-/**************************************************************************************************************************************************************
-
-Decoding and Encoding Headers. These are auto-generated from a tool.
-
-**************************************************************************************************************************************************************/
-#if !defined(MA_NO_WAV) && (!defined(MA_NO_DECODING) || !defined(MA_NO_ENCODING))
-/* dr_wav_h begin */
-#ifndef ma_dr_wav_h
-#define ma_dr_wav_h
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define MA_DR_WAV_STRINGIFY(x)      #x
-#define MA_DR_WAV_XSTRINGIFY(x)     MA_DR_WAV_STRINGIFY(x)
-#define MA_DR_WAV_VERSION_MAJOR     0
-#define MA_DR_WAV_VERSION_MINOR     13
-#define MA_DR_WAV_VERSION_REVISION  13
-#define MA_DR_WAV_VERSION_STRING    MA_DR_WAV_XSTRINGIFY(MA_DR_WAV_VERSION_MAJOR) "." MA_DR_WAV_XSTRINGIFY(MA_DR_WAV_VERSION_MINOR) "." MA_DR_WAV_XSTRINGIFY(MA_DR_WAV_VERSION_REVISION)
-#include <stddef.h>
-#define MA_DR_WAVE_FORMAT_PCM          0x1
-#define MA_DR_WAVE_FORMAT_ADPCM        0x2
-#define MA_DR_WAVE_FORMAT_IEEE_FLOAT   0x3
-#define MA_DR_WAVE_FORMAT_ALAW         0x6
-#define MA_DR_WAVE_FORMAT_MULAW        0x7
-#define MA_DR_WAVE_FORMAT_DVI_ADPCM    0x11
-#define MA_DR_WAVE_FORMAT_EXTENSIBLE   0xFFFE
-#define MA_DR_WAV_SEQUENTIAL            0x00000001
-#define MA_DR_WAV_WITH_METADATA         0x00000002
-MA_API void ma_dr_wav_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision);
-MA_API const char* ma_dr_wav_version_string(void);
-typedef enum
-{
-    ma_dr_wav_seek_origin_start,
-    ma_dr_wav_seek_origin_current
-} ma_dr_wav_seek_origin;
-typedef enum
-{
-    ma_dr_wav_container_riff,
-    ma_dr_wav_container_rifx,
-    ma_dr_wav_container_w64,
-    ma_dr_wav_container_rf64,
-    ma_dr_wav_container_aiff
-} ma_dr_wav_container;
-typedef struct
-{
-    union
-    {
-        ma_uint8 fourcc[4];
-        ma_uint8 guid[16];
-    } id;
-    ma_uint64 sizeInBytes;
-    unsigned int paddingSize;
-} ma_dr_wav_chunk_header;
-typedef struct
-{
-    ma_uint16 formatTag;
-    ma_uint16 channels;
-    ma_uint32 sampleRate;
-    ma_uint32 avgBytesPerSec;
-    ma_uint16 blockAlign;
-    ma_uint16 bitsPerSample;
-    ma_uint16 extendedSize;
-    ma_uint16 validBitsPerSample;
-    ma_uint32 channelMask;
-    ma_uint8 subFormat[16];
-} ma_dr_wav_fmt;
-MA_API ma_uint16 ma_dr_wav_fmt_get_format(const ma_dr_wav_fmt* pFMT);
-typedef size_t (* ma_dr_wav_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
-typedef size_t (* ma_dr_wav_write_proc)(void* pUserData, const void* pData, size_t bytesToWrite);
-typedef ma_bool32 (* ma_dr_wav_seek_proc)(void* pUserData, int offset, ma_dr_wav_seek_origin origin);
-typedef ma_uint64 (* ma_dr_wav_chunk_proc)(void* pChunkUserData, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pReadSeekUserData, const ma_dr_wav_chunk_header* pChunkHeader, ma_dr_wav_container container, const ma_dr_wav_fmt* pFMT);
-typedef struct
-{
-    const ma_uint8* data;
-    size_t dataSize;
-    size_t currentReadPos;
-} ma_dr_wav__memory_stream;
-typedef struct
-{
-    void** ppData;
-    size_t* pDataSize;
-    size_t dataSize;
-    size_t dataCapacity;
-    size_t currentWritePos;
-} ma_dr_wav__memory_stream_write;
-typedef struct
-{
-    ma_dr_wav_container container;
-    ma_uint32 format;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_uint32 bitsPerSample;
-} ma_dr_wav_data_format;
-typedef enum
-{
-    ma_dr_wav_metadata_type_none                        = 0,
-    ma_dr_wav_metadata_type_unknown                     = 1 << 0,
-    ma_dr_wav_metadata_type_smpl                        = 1 << 1,
-    ma_dr_wav_metadata_type_inst                        = 1 << 2,
-    ma_dr_wav_metadata_type_cue                         = 1 << 3,
-    ma_dr_wav_metadata_type_acid                        = 1 << 4,
-    ma_dr_wav_metadata_type_bext                        = 1 << 5,
-    ma_dr_wav_metadata_type_list_label                  = 1 << 6,
-    ma_dr_wav_metadata_type_list_note                   = 1 << 7,
-    ma_dr_wav_metadata_type_list_labelled_cue_region    = 1 << 8,
-    ma_dr_wav_metadata_type_list_info_software          = 1 << 9,
-    ma_dr_wav_metadata_type_list_info_copyright         = 1 << 10,
-    ma_dr_wav_metadata_type_list_info_title             = 1 << 11,
-    ma_dr_wav_metadata_type_list_info_artist            = 1 << 12,
-    ma_dr_wav_metadata_type_list_info_comment           = 1 << 13,
-    ma_dr_wav_metadata_type_list_info_date              = 1 << 14,
-    ma_dr_wav_metadata_type_list_info_genre             = 1 << 15,
-    ma_dr_wav_metadata_type_list_info_album             = 1 << 16,
-    ma_dr_wav_metadata_type_list_info_tracknumber       = 1 << 17,
-    ma_dr_wav_metadata_type_list_all_info_strings       = ma_dr_wav_metadata_type_list_info_software
-                                                    | ma_dr_wav_metadata_type_list_info_copyright
-                                                    | ma_dr_wav_metadata_type_list_info_title
-                                                    | ma_dr_wav_metadata_type_list_info_artist
-                                                    | ma_dr_wav_metadata_type_list_info_comment
-                                                    | ma_dr_wav_metadata_type_list_info_date
-                                                    | ma_dr_wav_metadata_type_list_info_genre
-                                                    | ma_dr_wav_metadata_type_list_info_album
-                                                    | ma_dr_wav_metadata_type_list_info_tracknumber,
-    ma_dr_wav_metadata_type_list_all_adtl               = ma_dr_wav_metadata_type_list_label
-                                                    | ma_dr_wav_metadata_type_list_note
-                                                    | ma_dr_wav_metadata_type_list_labelled_cue_region,
-    ma_dr_wav_metadata_type_all                         = -2,
-    ma_dr_wav_metadata_type_all_including_unknown       = -1
-} ma_dr_wav_metadata_type;
-typedef enum
-{
-    ma_dr_wav_smpl_loop_type_forward  = 0,
-    ma_dr_wav_smpl_loop_type_pingpong = 1,
-    ma_dr_wav_smpl_loop_type_backward = 2
-} ma_dr_wav_smpl_loop_type;
-typedef struct
-{
-    ma_uint32 cuePointId;
-    ma_uint32 type;
-    ma_uint32 firstSampleByteOffset;
-    ma_uint32 lastSampleByteOffset;
-    ma_uint32 sampleFraction;
-    ma_uint32 playCount;
-} ma_dr_wav_smpl_loop;
-typedef struct
-{
-    ma_uint32 manufacturerId;
-    ma_uint32 productId;
-    ma_uint32 samplePeriodNanoseconds;
-    ma_uint32 midiUnityNote;
-    ma_uint32 midiPitchFraction;
-    ma_uint32 smpteFormat;
-    ma_uint32 smpteOffset;
-    ma_uint32 sampleLoopCount;
-    ma_uint32 samplerSpecificDataSizeInBytes;
-    ma_dr_wav_smpl_loop* pLoops;
-    ma_uint8* pSamplerSpecificData;
-} ma_dr_wav_smpl;
-typedef struct
-{
-    ma_int8 midiUnityNote;
-    ma_int8 fineTuneCents;
-    ma_int8 gainDecibels;
-    ma_int8 lowNote;
-    ma_int8 highNote;
-    ma_int8 lowVelocity;
-    ma_int8 highVelocity;
-} ma_dr_wav_inst;
-typedef struct
-{
-    ma_uint32 id;
-    ma_uint32 playOrderPosition;
-    ma_uint8 dataChunkId[4];
-    ma_uint32 chunkStart;
-    ma_uint32 blockStart;
-    ma_uint32 sampleByteOffset;
-} ma_dr_wav_cue_point;
-typedef struct
-{
-    ma_uint32 cuePointCount;
-    ma_dr_wav_cue_point *pCuePoints;
-} ma_dr_wav_cue;
-typedef enum
-{
-    ma_dr_wav_acid_flag_one_shot      = 1,
-    ma_dr_wav_acid_flag_root_note_set = 2,
-    ma_dr_wav_acid_flag_stretch       = 4,
-    ma_dr_wav_acid_flag_disk_based    = 8,
-    ma_dr_wav_acid_flag_acidizer      = 16
-} ma_dr_wav_acid_flag;
-typedef struct
-{
-    ma_uint32 flags;
-    ma_uint16 midiUnityNote;
-    ma_uint16 reserved1;
-    float reserved2;
-    ma_uint32 numBeats;
-    ma_uint16 meterDenominator;
-    ma_uint16 meterNumerator;
-    float tempo;
-} ma_dr_wav_acid;
-typedef struct
-{
-    ma_uint32 cuePointId;
-    ma_uint32 stringLength;
-    char* pString;
-} ma_dr_wav_list_label_or_note;
-typedef struct
-{
-    char* pDescription;
-    char* pOriginatorName;
-    char* pOriginatorReference;
-    char  pOriginationDate[10];
-    char  pOriginationTime[8];
-    ma_uint64 timeReference;
-    ma_uint16 version;
-    char* pCodingHistory;
-    ma_uint32 codingHistorySize;
-    ma_uint8* pUMID;
-    ma_uint16 loudnessValue;
-    ma_uint16 loudnessRange;
-    ma_uint16 maxTruePeakLevel;
-    ma_uint16 maxMomentaryLoudness;
-    ma_uint16 maxShortTermLoudness;
-} ma_dr_wav_bext;
-typedef struct
-{
-    ma_uint32 stringLength;
-    char* pString;
-} ma_dr_wav_list_info_text;
-typedef struct
-{
-    ma_uint32 cuePointId;
-    ma_uint32 sampleLength;
-    ma_uint8 purposeId[4];
-    ma_uint16 country;
-    ma_uint16 language;
-    ma_uint16 dialect;
-    ma_uint16 codePage;
-    ma_uint32 stringLength;
-    char* pString;
-} ma_dr_wav_list_labelled_cue_region;
-typedef enum
-{
-    ma_dr_wav_metadata_location_invalid,
-    ma_dr_wav_metadata_location_top_level,
-    ma_dr_wav_metadata_location_inside_info_list,
-    ma_dr_wav_metadata_location_inside_adtl_list
-} ma_dr_wav_metadata_location;
-typedef struct
-{
-    ma_uint8 id[4];
-    ma_dr_wav_metadata_location chunkLocation;
-    ma_uint32 dataSizeInBytes;
-    ma_uint8* pData;
-} ma_dr_wav_unknown_metadata;
-typedef struct
-{
-    ma_dr_wav_metadata_type type;
-    union
-    {
-        ma_dr_wav_cue cue;
-        ma_dr_wav_smpl smpl;
-        ma_dr_wav_acid acid;
-        ma_dr_wav_inst inst;
-        ma_dr_wav_bext bext;
-        ma_dr_wav_list_label_or_note labelOrNote;
-        ma_dr_wav_list_labelled_cue_region labelledCueRegion;
-        ma_dr_wav_list_info_text infoText;
-        ma_dr_wav_unknown_metadata unknown;
-    } data;
-} ma_dr_wav_metadata;
-typedef struct
-{
-    ma_dr_wav_read_proc onRead;
-    ma_dr_wav_write_proc onWrite;
-    ma_dr_wav_seek_proc onSeek;
-    void* pUserData;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_dr_wav_container container;
-    ma_dr_wav_fmt fmt;
-    ma_uint32 sampleRate;
-    ma_uint16 channels;
-    ma_uint16 bitsPerSample;
-    ma_uint16 translatedFormatTag;
-    ma_uint64 totalPCMFrameCount;
-    ma_uint64 dataChunkDataSize;
-    ma_uint64 dataChunkDataPos;
-    ma_uint64 bytesRemaining;
-    ma_uint64 readCursorInPCMFrames;
-    ma_uint64 dataChunkDataSizeTargetWrite;
-    ma_bool32 isSequentialWrite;
-    ma_dr_wav_metadata* pMetadata;
-    ma_uint32 metadataCount;
-    ma_dr_wav__memory_stream memoryStream;
-    ma_dr_wav__memory_stream_write memoryStreamWrite;
-    struct
-    {
-        ma_uint32 bytesRemainingInBlock;
-        ma_uint16 predictor[2];
-        ma_int32  delta[2];
-        ma_int32  cachedFrames[4];
-        ma_uint32 cachedFrameCount;
-        ma_int32  prevFrames[2][2];
-    } msadpcm;
-    struct
-    {
-        ma_uint32 bytesRemainingInBlock;
-        ma_int32  predictor[2];
-        ma_int32  stepIndex[2];
-        ma_int32  cachedFrames[16];
-        ma_uint32 cachedFrameCount;
-    } ima;
-    struct
-    {
-        ma_bool8 isLE;
-        ma_bool8 isUnsigned;
-    } aiff;
-} ma_dr_wav;
-MA_API ma_bool32 ma_dr_wav_init(ma_dr_wav* pWav, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_ex(ma_dr_wav* pWav, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, ma_dr_wav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_with_metadata(ma_dr_wav* pWav, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_write(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_dr_wav_write_proc onWrite, ma_dr_wav_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_write_sequential(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, ma_dr_wav_write_proc onWrite, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_write_sequential_pcm_frames(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, ma_dr_wav_write_proc onWrite, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_write_with_metadata(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_dr_wav_write_proc onWrite, ma_dr_wav_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks, ma_dr_wav_metadata* pMetadata, ma_uint32 metadataCount);
-MA_API ma_uint64 ma_dr_wav_target_write_size_bytes(const ma_dr_wav_data_format* pFormat, ma_uint64 totalFrameCount, ma_dr_wav_metadata* pMetadata, ma_uint32 metadataCount);
-MA_API ma_dr_wav_metadata* ma_dr_wav_take_ownership_of_metadata(ma_dr_wav* pWav);
-MA_API ma_result ma_dr_wav_uninit(ma_dr_wav* pWav);
-MA_API size_t ma_dr_wav_read_raw(ma_dr_wav* pWav, size_t bytesToRead, void* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames(ma_dr_wav* pWav, ma_uint64 framesToRead, void* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_le(ma_dr_wav* pWav, ma_uint64 framesToRead, void* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_be(ma_dr_wav* pWav, ma_uint64 framesToRead, void* pBufferOut);
-MA_API ma_bool32 ma_dr_wav_seek_to_pcm_frame(ma_dr_wav* pWav, ma_uint64 targetFrameIndex);
-MA_API ma_result ma_dr_wav_get_cursor_in_pcm_frames(ma_dr_wav* pWav, ma_uint64* pCursor);
-MA_API ma_result ma_dr_wav_get_length_in_pcm_frames(ma_dr_wav* pWav, ma_uint64* pLength);
-MA_API size_t ma_dr_wav_write_raw(ma_dr_wav* pWav, size_t bytesToWrite, const void* pData);
-MA_API ma_uint64 ma_dr_wav_write_pcm_frames(ma_dr_wav* pWav, ma_uint64 framesToWrite, const void* pData);
-MA_API ma_uint64 ma_dr_wav_write_pcm_frames_le(ma_dr_wav* pWav, ma_uint64 framesToWrite, const void* pData);
-MA_API ma_uint64 ma_dr_wav_write_pcm_frames_be(ma_dr_wav* pWav, ma_uint64 framesToWrite, const void* pData);
-#ifndef MA_DR_WAV_NO_CONVERSION_API
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s16(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s16le(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s16be(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut);
-MA_API void ma_dr_wav_u8_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_s24_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_s32_to_s16(ma_int16* pOut, const ma_int32* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_f32_to_s16(ma_int16* pOut, const float* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_f64_to_s16(ma_int16* pOut, const double* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_alaw_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_mulaw_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_f32(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_f32le(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_f32be(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut);
-MA_API void ma_dr_wav_u8_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_s16_to_f32(float* pOut, const ma_int16* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_s24_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_s32_to_f32(float* pOut, const ma_int32* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_alaw_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_mulaw_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s32(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s32le(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut);
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s32be(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut);
-MA_API void ma_dr_wav_u8_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_s16_to_s32(ma_int32* pOut, const ma_int16* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_s24_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_f32_to_s32(ma_int32* pOut, const float* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_f64_to_s32(ma_int32* pOut, const double* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_alaw_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount);
-MA_API void ma_dr_wav_mulaw_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount);
-#endif
-#ifndef MA_DR_WAV_NO_STDIO
-MA_API ma_bool32 ma_dr_wav_init_file(ma_dr_wav* pWav, const char* filename, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_ex(ma_dr_wav* pWav, const char* filename, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_ex_w(ma_dr_wav* pWav, const wchar_t* filename, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_with_metadata(ma_dr_wav* pWav, const char* filename, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_with_metadata_w(ma_dr_wav* pWav, const wchar_t* filename, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_write(ma_dr_wav* pWav, const char* filename, const ma_dr_wav_data_format* pFormat, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential(ma_dr_wav* pWav, const char* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential_pcm_frames(ma_dr_wav* pWav, const char* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_write_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_dr_wav_data_format* pFormat, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential_pcm_frames_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-#endif
-MA_API ma_bool32 ma_dr_wav_init_memory(ma_dr_wav* pWav, const void* data, size_t dataSize, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_memory_ex(ma_dr_wav* pWav, const void* data, size_t dataSize, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_memory_with_metadata(ma_dr_wav* pWav, const void* data, size_t dataSize, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_memory_write(ma_dr_wav* pWav, void** ppData, size_t* pDataSize, const ma_dr_wav_data_format* pFormat, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_memory_write_sequential(ma_dr_wav* pWav, void** ppData, size_t* pDataSize, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_wav_init_memory_write_sequential_pcm_frames(ma_dr_wav* pWav, void** ppData, size_t* pDataSize, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-#ifndef MA_DR_WAV_NO_CONVERSION_API
-MA_API ma_int16* ma_dr_wav_open_and_read_pcm_frames_s16(ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_wav_open_and_read_pcm_frames_f32(ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int32* ma_dr_wav_open_and_read_pcm_frames_s32(ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-#ifndef MA_DR_WAV_NO_STDIO
-MA_API ma_int16* ma_dr_wav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_wav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int32* ma_dr_wav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int16* ma_dr_wav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_wav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int32* ma_dr_wav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-#endif
-MA_API ma_int16* ma_dr_wav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_wav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int32* ma_dr_wav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks);
-#endif
-MA_API void ma_dr_wav_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_uint16 ma_dr_wav_bytes_to_u16(const ma_uint8* data);
-MA_API ma_int16 ma_dr_wav_bytes_to_s16(const ma_uint8* data);
-MA_API ma_uint32 ma_dr_wav_bytes_to_u32(const ma_uint8* data);
-MA_API ma_int32 ma_dr_wav_bytes_to_s32(const ma_uint8* data);
-MA_API ma_uint64 ma_dr_wav_bytes_to_u64(const ma_uint8* data);
-MA_API ma_int64 ma_dr_wav_bytes_to_s64(const ma_uint8* data);
-MA_API float ma_dr_wav_bytes_to_f32(const ma_uint8* data);
-MA_API ma_bool32 ma_dr_wav_guid_equal(const ma_uint8 a[16], const ma_uint8 b[16]);
-MA_API ma_bool32 ma_dr_wav_fourcc_equal(const ma_uint8* a, const char* b);
-#ifdef __cplusplus
-}
-#endif
-#endif
-/* dr_wav_h end */
-#endif  /* MA_NO_WAV */
-
-#if !defined(MA_NO_FLAC) && !defined(MA_NO_DECODING)
-/* dr_flac_h begin */
-#ifndef ma_dr_flac_h
-#define ma_dr_flac_h
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define MA_DR_FLAC_STRINGIFY(x)      #x
-#define MA_DR_FLAC_XSTRINGIFY(x)     MA_DR_FLAC_STRINGIFY(x)
-#define MA_DR_FLAC_VERSION_MAJOR     0
-#define MA_DR_FLAC_VERSION_MINOR     12
-#define MA_DR_FLAC_VERSION_REVISION  42
-#define MA_DR_FLAC_VERSION_STRING    MA_DR_FLAC_XSTRINGIFY(MA_DR_FLAC_VERSION_MAJOR) "." MA_DR_FLAC_XSTRINGIFY(MA_DR_FLAC_VERSION_MINOR) "." MA_DR_FLAC_XSTRINGIFY(MA_DR_FLAC_VERSION_REVISION)
-#include <stddef.h>
-#if defined(_MSC_VER) && _MSC_VER >= 1700
-    #define MA_DR_FLAC_DEPRECATED       __declspec(deprecated)
-#elif (defined(__GNUC__) && __GNUC__ >= 4)
-    #define MA_DR_FLAC_DEPRECATED       __attribute__((deprecated))
-#elif defined(__has_feature)
-    #if __has_feature(attribute_deprecated)
-        #define MA_DR_FLAC_DEPRECATED   __attribute__((deprecated))
-    #else
-        #define MA_DR_FLAC_DEPRECATED
-    #endif
-#else
-    #define MA_DR_FLAC_DEPRECATED
-#endif
-MA_API void ma_dr_flac_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision);
-MA_API const char* ma_dr_flac_version_string(void);
-#ifndef MA_DR_FLAC_BUFFER_SIZE
-#define MA_DR_FLAC_BUFFER_SIZE   4096
-#endif
-#ifdef MA_64BIT
-typedef ma_uint64 ma_dr_flac_cache_t;
-#else
-typedef ma_uint32 ma_dr_flac_cache_t;
-#endif
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_STREAMINFO       0
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_PADDING          1
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_APPLICATION      2
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_SEEKTABLE        3
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT   4
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_CUESHEET         5
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_PICTURE          6
-#define MA_DR_FLAC_METADATA_BLOCK_TYPE_INVALID          127
-#define MA_DR_FLAC_PICTURE_TYPE_OTHER                   0
-#define MA_DR_FLAC_PICTURE_TYPE_FILE_ICON               1
-#define MA_DR_FLAC_PICTURE_TYPE_OTHER_FILE_ICON         2
-#define MA_DR_FLAC_PICTURE_TYPE_COVER_FRONT             3
-#define MA_DR_FLAC_PICTURE_TYPE_COVER_BACK              4
-#define MA_DR_FLAC_PICTURE_TYPE_LEAFLET_PAGE            5
-#define MA_DR_FLAC_PICTURE_TYPE_MEDIA                   6
-#define MA_DR_FLAC_PICTURE_TYPE_LEAD_ARTIST             7
-#define MA_DR_FLAC_PICTURE_TYPE_ARTIST                  8
-#define MA_DR_FLAC_PICTURE_TYPE_CONDUCTOR               9
-#define MA_DR_FLAC_PICTURE_TYPE_BAND                    10
-#define MA_DR_FLAC_PICTURE_TYPE_COMPOSER                11
-#define MA_DR_FLAC_PICTURE_TYPE_LYRICIST                12
-#define MA_DR_FLAC_PICTURE_TYPE_RECORDING_LOCATION      13
-#define MA_DR_FLAC_PICTURE_TYPE_DURING_RECORDING        14
-#define MA_DR_FLAC_PICTURE_TYPE_DURING_PERFORMANCE      15
-#define MA_DR_FLAC_PICTURE_TYPE_SCREEN_CAPTURE          16
-#define MA_DR_FLAC_PICTURE_TYPE_BRIGHT_COLORED_FISH     17
-#define MA_DR_FLAC_PICTURE_TYPE_ILLUSTRATION            18
-#define MA_DR_FLAC_PICTURE_TYPE_BAND_LOGOTYPE           19
-#define MA_DR_FLAC_PICTURE_TYPE_PUBLISHER_LOGOTYPE      20
-typedef enum
-{
-    ma_dr_flac_container_native,
-    ma_dr_flac_container_ogg,
-    ma_dr_flac_container_unknown
-} ma_dr_flac_container;
-typedef enum
-{
-    ma_dr_flac_seek_origin_start,
-    ma_dr_flac_seek_origin_current
-} ma_dr_flac_seek_origin;
-typedef struct
-{
-    ma_uint64 firstPCMFrame;
-    ma_uint64 flacFrameOffset;
-    ma_uint16 pcmFrameCount;
-} ma_dr_flac_seekpoint;
-typedef struct
-{
-    ma_uint16 minBlockSizeInPCMFrames;
-    ma_uint16 maxBlockSizeInPCMFrames;
-    ma_uint32 minFrameSizeInPCMFrames;
-    ma_uint32 maxFrameSizeInPCMFrames;
-    ma_uint32 sampleRate;
-    ma_uint8  channels;
-    ma_uint8  bitsPerSample;
-    ma_uint64 totalPCMFrameCount;
-    ma_uint8  md5[16];
-} ma_dr_flac_streaminfo;
-typedef struct
-{
-    ma_uint32 type;
-    const void* pRawData;
-    ma_uint32 rawDataSize;
-    union
-    {
-        ma_dr_flac_streaminfo streaminfo;
-        struct
-        {
-            int unused;
-        } padding;
-        struct
-        {
-            ma_uint32 id;
-            const void* pData;
-            ma_uint32 dataSize;
-        } application;
-        struct
-        {
-            ma_uint32 seekpointCount;
-            const ma_dr_flac_seekpoint* pSeekpoints;
-        } seektable;
-        struct
-        {
-            ma_uint32 vendorLength;
-            const char* vendor;
-            ma_uint32 commentCount;
-            const void* pComments;
-        } vorbis_comment;
-        struct
-        {
-            char catalog[128];
-            ma_uint64 leadInSampleCount;
-            ma_bool32 isCD;
-            ma_uint8 trackCount;
-            const void* pTrackData;
-        } cuesheet;
-        struct
-        {
-            ma_uint32 type;
-            ma_uint32 mimeLength;
-            const char* mime;
-            ma_uint32 descriptionLength;
-            const char* description;
-            ma_uint32 width;
-            ma_uint32 height;
-            ma_uint32 colorDepth;
-            ma_uint32 indexColorCount;
-            ma_uint32 pictureDataSize;
-            const ma_uint8* pPictureData;
-        } picture;
-    } data;
-} ma_dr_flac_metadata;
-typedef size_t (* ma_dr_flac_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
-typedef ma_bool32 (* ma_dr_flac_seek_proc)(void* pUserData, int offset, ma_dr_flac_seek_origin origin);
-typedef void (* ma_dr_flac_meta_proc)(void* pUserData, ma_dr_flac_metadata* pMetadata);
-typedef struct
-{
-    const ma_uint8* data;
-    size_t dataSize;
-    size_t currentReadPos;
-} ma_dr_flac__memory_stream;
-typedef struct
-{
-    ma_dr_flac_read_proc onRead;
-    ma_dr_flac_seek_proc onSeek;
-    void* pUserData;
-    size_t unalignedByteCount;
-    ma_dr_flac_cache_t unalignedCache;
-    ma_uint32 nextL2Line;
-    ma_uint32 consumedBits;
-    ma_dr_flac_cache_t cacheL2[MA_DR_FLAC_BUFFER_SIZE/sizeof(ma_dr_flac_cache_t)];
-    ma_dr_flac_cache_t cache;
-    ma_uint16 crc16;
-    ma_dr_flac_cache_t crc16Cache;
-    ma_uint32 crc16CacheIgnoredBytes;
-} ma_dr_flac_bs;
-typedef struct
-{
-    ma_uint8 subframeType;
-    ma_uint8 wastedBitsPerSample;
-    ma_uint8 lpcOrder;
-    ma_int32* pSamplesS32;
-} ma_dr_flac_subframe;
-typedef struct
-{
-    ma_uint64 pcmFrameNumber;
-    ma_uint32 flacFrameNumber;
-    ma_uint32 sampleRate;
-    ma_uint16 blockSizeInPCMFrames;
-    ma_uint8 channelAssignment;
-    ma_uint8 bitsPerSample;
-    ma_uint8 crc8;
-} ma_dr_flac_frame_header;
-typedef struct
-{
-    ma_dr_flac_frame_header header;
-    ma_uint32 pcmFramesRemaining;
-    ma_dr_flac_subframe subframes[8];
-} ma_dr_flac_frame;
-typedef struct
-{
-    ma_dr_flac_meta_proc onMeta;
-    void* pUserDataMD;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_uint32 sampleRate;
-    ma_uint8 channels;
-    ma_uint8 bitsPerSample;
-    ma_uint16 maxBlockSizeInPCMFrames;
-    ma_uint64 totalPCMFrameCount;
-    ma_dr_flac_container container;
-    ma_uint32 seekpointCount;
-    ma_dr_flac_frame currentFLACFrame;
-    ma_uint64 currentPCMFrame;
-    ma_uint64 firstFLACFramePosInBytes;
-    ma_dr_flac__memory_stream memoryStream;
-    ma_int32* pDecodedSamples;
-    ma_dr_flac_seekpoint* pSeekpoints;
-    void* _oggbs;
-    ma_bool32 _noSeekTableSeek    : 1;
-    ma_bool32 _noBinarySearchSeek : 1;
-    ma_bool32 _noBruteForceSeek   : 1;
-    ma_dr_flac_bs bs;
-    ma_uint8 pExtraData[1];
-} ma_dr_flac;
-MA_API ma_dr_flac* ma_dr_flac_open(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_dr_flac* ma_dr_flac_open_relaxed(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_container container, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_dr_flac* ma_dr_flac_open_with_metadata(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_dr_flac* ma_dr_flac_open_with_metadata_relaxed(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, ma_dr_flac_container container, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API void ma_dr_flac_close(ma_dr_flac* pFlac);
-MA_API ma_uint64 ma_dr_flac_read_pcm_frames_s32(ma_dr_flac* pFlac, ma_uint64 framesToRead, ma_int32* pBufferOut);
-MA_API ma_uint64 ma_dr_flac_read_pcm_frames_s16(ma_dr_flac* pFlac, ma_uint64 framesToRead, ma_int16* pBufferOut);
-MA_API ma_uint64 ma_dr_flac_read_pcm_frames_f32(ma_dr_flac* pFlac, ma_uint64 framesToRead, float* pBufferOut);
-MA_API ma_bool32 ma_dr_flac_seek_to_pcm_frame(ma_dr_flac* pFlac, ma_uint64 pcmFrameIndex);
-#ifndef MA_DR_FLAC_NO_STDIO
-MA_API ma_dr_flac* ma_dr_flac_open_file(const char* pFileName, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_dr_flac* ma_dr_flac_open_file_w(const wchar_t* pFileName, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_dr_flac* ma_dr_flac_open_file_with_metadata(const char* pFileName, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_dr_flac* ma_dr_flac_open_file_with_metadata_w(const wchar_t* pFileName, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-#endif
-MA_API ma_dr_flac* ma_dr_flac_open_memory(const void* pData, size_t dataSize, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_dr_flac* ma_dr_flac_open_memory_with_metadata(const void* pData, size_t dataSize, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int32* ma_dr_flac_open_and_read_pcm_frames_s32(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int16* ma_dr_flac_open_and_read_pcm_frames_s16(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_flac_open_and_read_pcm_frames_f32(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-#ifndef MA_DR_FLAC_NO_STDIO
-MA_API ma_int32* ma_dr_flac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int16* ma_dr_flac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_flac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-#endif
-MA_API ma_int32* ma_dr_flac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int16* ma_dr_flac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_flac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API void ma_dr_flac_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks);
-typedef struct
-{
-    ma_uint32 countRemaining;
-    const char* pRunningData;
-} ma_dr_flac_vorbis_comment_iterator;
-MA_API void ma_dr_flac_init_vorbis_comment_iterator(ma_dr_flac_vorbis_comment_iterator* pIter, ma_uint32 commentCount, const void* pComments);
-MA_API const char* ma_dr_flac_next_vorbis_comment(ma_dr_flac_vorbis_comment_iterator* pIter, ma_uint32* pCommentLengthOut);
-typedef struct
-{
-    ma_uint32 countRemaining;
-    const char* pRunningData;
-} ma_dr_flac_cuesheet_track_iterator;
-typedef struct
-{
-    ma_uint64 offset;
-    ma_uint8 index;
-    ma_uint8 reserved[3];
-} ma_dr_flac_cuesheet_track_index;
-typedef struct
-{
-    ma_uint64 offset;
-    ma_uint8 trackNumber;
-    char ISRC[12];
-    ma_bool8 isAudio;
-    ma_bool8 preEmphasis;
-    ma_uint8 indexCount;
-    const ma_dr_flac_cuesheet_track_index* pIndexPoints;
-} ma_dr_flac_cuesheet_track;
-MA_API void ma_dr_flac_init_cuesheet_track_iterator(ma_dr_flac_cuesheet_track_iterator* pIter, ma_uint32 trackCount, const void* pTrackData);
-MA_API ma_bool32 ma_dr_flac_next_cuesheet_track(ma_dr_flac_cuesheet_track_iterator* pIter, ma_dr_flac_cuesheet_track* pCuesheetTrack);
-#ifdef __cplusplus
-}
-#endif
-#endif
-/* dr_flac_h end */
-#endif  /* MA_NO_FLAC */
-
-#if !defined(MA_NO_MP3) && !defined(MA_NO_DECODING)
-/* dr_mp3_h begin */
-#ifndef ma_dr_mp3_h
-#define ma_dr_mp3_h
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define MA_DR_MP3_STRINGIFY(x)      #x
-#define MA_DR_MP3_XSTRINGIFY(x)     MA_DR_MP3_STRINGIFY(x)
-#define MA_DR_MP3_VERSION_MAJOR     0
-#define MA_DR_MP3_VERSION_MINOR     6
-#define MA_DR_MP3_VERSION_REVISION  38
-#define MA_DR_MP3_VERSION_STRING    MA_DR_MP3_XSTRINGIFY(MA_DR_MP3_VERSION_MAJOR) "." MA_DR_MP3_XSTRINGIFY(MA_DR_MP3_VERSION_MINOR) "." MA_DR_MP3_XSTRINGIFY(MA_DR_MP3_VERSION_REVISION)
-#include <stddef.h>
-#define MA_DR_MP3_MAX_PCM_FRAMES_PER_MP3_FRAME  1152
-#define MA_DR_MP3_MAX_SAMPLES_PER_FRAME         (MA_DR_MP3_MAX_PCM_FRAMES_PER_MP3_FRAME*2)
-MA_API void ma_dr_mp3_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision);
-MA_API const char* ma_dr_mp3_version_string(void);
-typedef struct
-{
-    int frame_bytes, channels, hz, layer, bitrate_kbps;
-} ma_dr_mp3dec_frame_info;
-typedef struct
-{
-    float mdct_overlap[2][9*32], qmf_state[15*2*32];
-    int reserv, free_format_bytes;
-    ma_uint8 header[4], reserv_buf[511];
-} ma_dr_mp3dec;
-MA_API void ma_dr_mp3dec_init(ma_dr_mp3dec *dec);
-MA_API int ma_dr_mp3dec_decode_frame(ma_dr_mp3dec *dec, const ma_uint8 *mp3, int mp3_bytes, void *pcm, ma_dr_mp3dec_frame_info *info);
-MA_API void ma_dr_mp3dec_f32_to_s16(const float *in, ma_int16 *out, size_t num_samples);
-typedef enum
-{
-    ma_dr_mp3_seek_origin_start,
-    ma_dr_mp3_seek_origin_current
-} ma_dr_mp3_seek_origin;
-typedef struct
-{
-    ma_uint64 seekPosInBytes;
-    ma_uint64 pcmFrameIndex;
-    ma_uint16 mp3FramesToDiscard;
-    ma_uint16 pcmFramesToDiscard;
-} ma_dr_mp3_seek_point;
-typedef size_t (* ma_dr_mp3_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
-typedef ma_bool32 (* ma_dr_mp3_seek_proc)(void* pUserData, int offset, ma_dr_mp3_seek_origin origin);
-typedef struct
-{
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-} ma_dr_mp3_config;
-typedef struct
-{
-    ma_dr_mp3dec decoder;
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_dr_mp3_read_proc onRead;
-    ma_dr_mp3_seek_proc onSeek;
-    void* pUserData;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_uint32 mp3FrameChannels;
-    ma_uint32 mp3FrameSampleRate;
-    ma_uint32 pcmFramesConsumedInMP3Frame;
-    ma_uint32 pcmFramesRemainingInMP3Frame;
-    ma_uint8 pcmFrames[sizeof(float)*MA_DR_MP3_MAX_SAMPLES_PER_FRAME];
-    ma_uint64 currentPCMFrame;
-    ma_uint64 streamCursor;
-    ma_dr_mp3_seek_point* pSeekPoints;
-    ma_uint32 seekPointCount;
-    size_t dataSize;
-    size_t dataCapacity;
-    size_t dataConsumed;
-    ma_uint8* pData;
-    ma_bool32 atEnd : 1;
-    struct
-    {
-        const ma_uint8* pData;
-        size_t dataSize;
-        size_t currentReadPos;
-    } memory;
-} ma_dr_mp3;
-MA_API ma_bool32 ma_dr_mp3_init(ma_dr_mp3* pMP3, ma_dr_mp3_read_proc onRead, ma_dr_mp3_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_mp3_init_memory(ma_dr_mp3* pMP3, const void* pData, size_t dataSize, const ma_allocation_callbacks* pAllocationCallbacks);
-#ifndef MA_DR_MP3_NO_STDIO
-MA_API ma_bool32 ma_dr_mp3_init_file(ma_dr_mp3* pMP3, const char* pFilePath, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_bool32 ma_dr_mp3_init_file_w(ma_dr_mp3* pMP3, const wchar_t* pFilePath, const ma_allocation_callbacks* pAllocationCallbacks);
-#endif
-MA_API void ma_dr_mp3_uninit(ma_dr_mp3* pMP3);
-MA_API ma_uint64 ma_dr_mp3_read_pcm_frames_f32(ma_dr_mp3* pMP3, ma_uint64 framesToRead, float* pBufferOut);
-MA_API ma_uint64 ma_dr_mp3_read_pcm_frames_s16(ma_dr_mp3* pMP3, ma_uint64 framesToRead, ma_int16* pBufferOut);
-MA_API ma_bool32 ma_dr_mp3_seek_to_pcm_frame(ma_dr_mp3* pMP3, ma_uint64 frameIndex);
-MA_API ma_uint64 ma_dr_mp3_get_pcm_frame_count(ma_dr_mp3* pMP3);
-MA_API ma_uint64 ma_dr_mp3_get_mp3_frame_count(ma_dr_mp3* pMP3);
-MA_API ma_bool32 ma_dr_mp3_get_mp3_and_pcm_frame_count(ma_dr_mp3* pMP3, ma_uint64* pMP3FrameCount, ma_uint64* pPCMFrameCount);
-MA_API ma_bool32 ma_dr_mp3_calculate_seek_points(ma_dr_mp3* pMP3, ma_uint32* pSeekPointCount, ma_dr_mp3_seek_point* pSeekPoints);
-MA_API ma_bool32 ma_dr_mp3_bind_seek_table(ma_dr_mp3* pMP3, ma_uint32 seekPointCount, ma_dr_mp3_seek_point* pSeekPoints);
-MA_API float* ma_dr_mp3_open_and_read_pcm_frames_f32(ma_dr_mp3_read_proc onRead, ma_dr_mp3_seek_proc onSeek, void* pUserData, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int16* ma_dr_mp3_open_and_read_pcm_frames_s16(ma_dr_mp3_read_proc onRead, ma_dr_mp3_seek_proc onSeek, void* pUserData, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API float* ma_dr_mp3_open_memory_and_read_pcm_frames_f32(const void* pData, size_t dataSize, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int16* ma_dr_mp3_open_memory_and_read_pcm_frames_s16(const void* pData, size_t dataSize, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-#ifndef MA_DR_MP3_NO_STDIO
-MA_API float* ma_dr_mp3_open_file_and_read_pcm_frames_f32(const char* filePath, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_int16* ma_dr_mp3_open_file_and_read_pcm_frames_s16(const char* filePath, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks);
-#endif
-MA_API void* ma_dr_mp3_malloc(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API void ma_dr_mp3_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks);
-#ifdef __cplusplus
-}
-#endif
-#endif
-/* dr_mp3_h end */
-#endif  /* MA_NO_MP3 */
-
-
-/**************************************************************************************************************************************************************
-
-Decoding
-
-**************************************************************************************************************************************************************/
-#ifndef MA_NO_DECODING
-
-static ma_result ma_decoder_read_bytes(ma_decoder* pDecoder, void* pBufferOut, size_t bytesToRead, size_t* pBytesRead)
-{
-    MA_ASSERT(pDecoder != NULL);
-
-    return pDecoder->onRead(pDecoder, pBufferOut, bytesToRead, pBytesRead);
-}
-
-static ma_result ma_decoder_seek_bytes(ma_decoder* pDecoder, ma_int64 byteOffset, ma_seek_origin origin)
-{
-    MA_ASSERT(pDecoder != NULL);
-
-    return pDecoder->onSeek(pDecoder, byteOffset, origin);
-}
-
-static ma_result ma_decoder_tell_bytes(ma_decoder* pDecoder, ma_int64* pCursor)
-{
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pDecoder->onTell == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    return pDecoder->onTell(pDecoder, pCursor);
-}
-
-
-MA_API ma_decoding_backend_config ma_decoding_backend_config_init(ma_format preferredFormat, ma_uint32 seekPointCount)
-{
-    ma_decoding_backend_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.preferredFormat = preferredFormat;
-    config.seekPointCount  = seekPointCount;
-
-    return config;
-}
-
-
-MA_API ma_decoder_config ma_decoder_config_init(ma_format outputFormat, ma_uint32 outputChannels, ma_uint32 outputSampleRate)
-{
-    ma_decoder_config config;
-    MA_ZERO_OBJECT(&config);
-    config.format         = outputFormat;
-    config.channels       = outputChannels;
-    config.sampleRate     = outputSampleRate;
-    config.resampling     = ma_resampler_config_init(ma_format_unknown, 0, 0, 0, ma_resample_algorithm_linear); /* Format/channels/rate doesn't matter here. */
-    config.encodingFormat = ma_encoding_format_unknown;
-
-    /* Note that we are intentionally leaving the channel map empty here which will cause the default channel map to be used. */
-
-    return config;
-}
-
-MA_API ma_decoder_config ma_decoder_config_init_default()
-{
-    return ma_decoder_config_init(ma_format_unknown, 0, 0);
-}
-
-MA_API ma_decoder_config ma_decoder_config_init_copy(const ma_decoder_config* pConfig)
-{
-    ma_decoder_config config;
-    if (pConfig != NULL) {
-        config = *pConfig;
-    } else {
-        MA_ZERO_OBJECT(&config);
-    }
-
-    return config;
-}
-
-static ma_result ma_decoder__init_data_converter(ma_decoder* pDecoder, const ma_decoder_config* pConfig)
-{
-    ma_result result;
-    ma_data_converter_config converterConfig;
-    ma_format internalFormat;
-    ma_uint32 internalChannels;
-    ma_uint32 internalSampleRate;
-    ma_channel internalChannelMap[MA_MAX_CHANNELS];
-
-    MA_ASSERT(pDecoder != NULL);
-    MA_ASSERT(pConfig  != NULL);
-
-    result = ma_data_source_get_data_format(pDecoder->pBackend, &internalFormat, &internalChannels, &internalSampleRate, internalChannelMap, ma_countof(internalChannelMap));
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to retrieve the internal data format. */
-    }
-
-
-    /* Make sure we're not asking for too many channels. */
-    if (pConfig->channels > MA_MAX_CHANNELS) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The internal channels should have already been validated at a higher level, but we'll do it again explicitly here for safety. */
-    if (internalChannels > MA_MAX_CHANNELS) {
-        return MA_INVALID_ARGS;
-    }
-
-
-    /* Output format. */
-    if (pConfig->format == ma_format_unknown) {
-        pDecoder->outputFormat = internalFormat;
-    } else {
-        pDecoder->outputFormat = pConfig->format;
-    }
-
-    if (pConfig->channels == 0) {
-        pDecoder->outputChannels = internalChannels;
-    } else {
-        pDecoder->outputChannels = pConfig->channels;
-    }
-
-    if (pConfig->sampleRate == 0) {
-        pDecoder->outputSampleRate = internalSampleRate;
-    } else {
-        pDecoder->outputSampleRate = pConfig->sampleRate;
-    }
-
-    converterConfig = ma_data_converter_config_init(
-        internalFormat,     pDecoder->outputFormat,
-        internalChannels,   pDecoder->outputChannels,
-        internalSampleRate, pDecoder->outputSampleRate
-    );
-    converterConfig.pChannelMapIn          = internalChannelMap;
-    converterConfig.pChannelMapOut         = pConfig->pChannelMap;
-    converterConfig.channelMixMode         = pConfig->channelMixMode;
-    converterConfig.ditherMode             = pConfig->ditherMode;
-    converterConfig.allowDynamicSampleRate = MA_FALSE;   /* Never allow dynamic sample rate conversion. Setting this to true will disable passthrough optimizations. */
-    converterConfig.resampling             = pConfig->resampling;
-
-    result = ma_data_converter_init(&converterConfig, &pDecoder->allocationCallbacks, &pDecoder->converter);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /*
-    Now that we have the decoder we need to determine whether or not we need a heap-allocated cache. We'll
-    need this if the data converter does not support calculation of the required input frame count. To
-    determine support for this we'll just run a test.
-    */
-    {
-        ma_uint64 unused;
-
-        result = ma_data_converter_get_required_input_frame_count(&pDecoder->converter, 1, &unused);
-        if (result != MA_SUCCESS) {
-            /*
-            We were unable to calculate the required input frame count which means we'll need to use
-            a heap-allocated cache.
-            */
-            ma_uint64 inputCacheCapSizeInBytes;
-
-            pDecoder->inputCacheCap = MA_DATA_CONVERTER_STACK_BUFFER_SIZE / ma_get_bytes_per_frame(internalFormat, internalChannels);
-
-            /* Not strictly necessary, but keeping here for safety in case we change the default value of pDecoder->inputCacheCap. */
-            inputCacheCapSizeInBytes = pDecoder->inputCacheCap * ma_get_bytes_per_frame(internalFormat, internalChannels);
-            if (inputCacheCapSizeInBytes > MA_SIZE_MAX) {
-                ma_data_converter_uninit(&pDecoder->converter, &pDecoder->allocationCallbacks);
-                return MA_OUT_OF_MEMORY;
-            }
-
-            pDecoder->pInputCache = ma_malloc((size_t)inputCacheCapSizeInBytes, &pDecoder->allocationCallbacks);    /* Safe cast to size_t. */
-            if (pDecoder->pInputCache == NULL) {
-                ma_data_converter_uninit(&pDecoder->converter, &pDecoder->allocationCallbacks);
-                return MA_OUT_OF_MEMORY;
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-
-static ma_result ma_decoder_internal_on_read__custom(void* pUserData, void* pBufferOut, size_t bytesToRead, size_t* pBytesRead)
-{
-    ma_decoder* pDecoder = (ma_decoder*)pUserData;
-    MA_ASSERT(pDecoder != NULL);
-
-    return ma_decoder_read_bytes(pDecoder, pBufferOut, bytesToRead, pBytesRead);
-}
-
-static ma_result ma_decoder_internal_on_seek__custom(void* pUserData, ma_int64 offset, ma_seek_origin origin)
-{
-    ma_decoder* pDecoder = (ma_decoder*)pUserData;
-    MA_ASSERT(pDecoder != NULL);
-
-    return ma_decoder_seek_bytes(pDecoder, offset, origin);
-}
-
-static ma_result ma_decoder_internal_on_tell__custom(void* pUserData, ma_int64* pCursor)
-{
-    ma_decoder* pDecoder = (ma_decoder*)pUserData;
-    MA_ASSERT(pDecoder != NULL);
-
-    return ma_decoder_tell_bytes(pDecoder, pCursor);
-}
-
-
-static ma_result ma_decoder_init_from_vtable__internal(const ma_decoding_backend_vtable* pVTable, void* pVTableUserData, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoding_backend_config backendConfig;
-    ma_data_source* pBackend;
-
-    MA_ASSERT(pVTable  != NULL);
-    MA_ASSERT(pConfig  != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pVTable->onInit == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    backendConfig = ma_decoding_backend_config_init(pConfig->format, pConfig->seekPointCount);
-
-    result = pVTable->onInit(pVTableUserData, ma_decoder_internal_on_read__custom, ma_decoder_internal_on_seek__custom, ma_decoder_internal_on_tell__custom, pDecoder, &backendConfig, &pDecoder->allocationCallbacks, &pBackend);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the backend from this vtable. */
-    }
-
-    /* Getting here means we were able to initialize the backend so we can now initialize the decoder. */
-    pDecoder->pBackend         = pBackend;
-    pDecoder->pBackendVTable   = pVTable;
-    pDecoder->pBackendUserData = pConfig->pCustomBackendUserData;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder_init_from_file__internal(const ma_decoding_backend_vtable* pVTable, void* pVTableUserData, const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoding_backend_config backendConfig;
-    ma_data_source* pBackend;
-
-    MA_ASSERT(pVTable  != NULL);
-    MA_ASSERT(pConfig  != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pVTable->onInitFile == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    backendConfig = ma_decoding_backend_config_init(pConfig->format, pConfig->seekPointCount);
-
-    result = pVTable->onInitFile(pVTableUserData, pFilePath, &backendConfig, &pDecoder->allocationCallbacks, &pBackend);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the backend from this vtable. */
-    }
-
-    /* Getting here means we were able to initialize the backend so we can now initialize the decoder. */
-    pDecoder->pBackend         = pBackend;
-    pDecoder->pBackendVTable   = pVTable;
-    pDecoder->pBackendUserData = pConfig->pCustomBackendUserData;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder_init_from_file_w__internal(const ma_decoding_backend_vtable* pVTable, void* pVTableUserData, const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoding_backend_config backendConfig;
-    ma_data_source* pBackend;
-
-    MA_ASSERT(pVTable  != NULL);
-    MA_ASSERT(pConfig  != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pVTable->onInitFileW == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    backendConfig = ma_decoding_backend_config_init(pConfig->format, pConfig->seekPointCount);
-
-    result = pVTable->onInitFileW(pVTableUserData, pFilePath, &backendConfig, &pDecoder->allocationCallbacks, &pBackend);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the backend from this vtable. */
-    }
-
-    /* Getting here means we were able to initialize the backend so we can now initialize the decoder. */
-    pDecoder->pBackend         = pBackend;
-    pDecoder->pBackendVTable   = pVTable;
-    pDecoder->pBackendUserData = pConfig->pCustomBackendUserData;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder_init_from_memory__internal(const ma_decoding_backend_vtable* pVTable, void* pVTableUserData, const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoding_backend_config backendConfig;
-    ma_data_source* pBackend;
-
-    MA_ASSERT(pVTable  != NULL);
-    MA_ASSERT(pConfig  != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pVTable->onInitMemory == NULL) {
-        return MA_NOT_IMPLEMENTED;
-    }
-
-    backendConfig = ma_decoding_backend_config_init(pConfig->format, pConfig->seekPointCount);
-
-    result = pVTable->onInitMemory(pVTableUserData, pData, dataSize, &backendConfig, &pDecoder->allocationCallbacks, &pBackend);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the backend from this vtable. */
-    }
-
-    /* Getting here means we were able to initialize the backend so we can now initialize the decoder. */
-    pDecoder->pBackend         = pBackend;
-    pDecoder->pBackendVTable   = pVTable;
-    pDecoder->pBackendUserData = pConfig->pCustomBackendUserData;
-
-    return MA_SUCCESS;
-}
-
-
-
-static ma_result ma_decoder_init_custom__internal(const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result = MA_NO_BACKEND;
-    size_t ivtable;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pConfig->ppCustomBackendVTables == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    /* The order each backend is listed is what defines the priority. */
-    for (ivtable = 0; ivtable < pConfig->customBackendCount; ivtable += 1) {
-        const ma_decoding_backend_vtable* pVTable = pConfig->ppCustomBackendVTables[ivtable];
-        if (pVTable != NULL) {
-            result = ma_decoder_init_from_vtable__internal(pVTable, pConfig->pCustomBackendUserData, pConfig, pDecoder);
-            if (result == MA_SUCCESS) {
-                return MA_SUCCESS;
-            } else {
-                /* Initialization failed. Move on to the next one, but seek back to the start first so the next vtable starts from the first byte of the file. */
-                result = ma_decoder_seek_bytes(pDecoder, 0, ma_seek_origin_start);
-                if (result != MA_SUCCESS) {
-                    return result;  /* Failed to seek back to the start. */
-                }
-            }
-        } else {
-            /* No vtable. */
-        }
-    }
-
-    /* Getting here means we couldn't find a backend. */
-    return MA_NO_BACKEND;
-}
-
-static ma_result ma_decoder_init_custom_from_file__internal(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result = MA_NO_BACKEND;
-    size_t ivtable;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pConfig->ppCustomBackendVTables == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    /* The order each backend is listed is what defines the priority. */
-    for (ivtable = 0; ivtable < pConfig->customBackendCount; ivtable += 1) {
-        const ma_decoding_backend_vtable* pVTable = pConfig->ppCustomBackendVTables[ivtable];
-        if (pVTable != NULL) {
-            result = ma_decoder_init_from_file__internal(pVTable, pConfig->pCustomBackendUserData, pFilePath, pConfig, pDecoder);
-            if (result == MA_SUCCESS) {
-                return MA_SUCCESS;
-            }
-        } else {
-            /* No vtable. */
-        }
-    }
-
-    /* Getting here means we couldn't find a backend. */
-    return MA_NO_BACKEND;
-}
-
-static ma_result ma_decoder_init_custom_from_file_w__internal(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result = MA_NO_BACKEND;
-    size_t ivtable;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pConfig->ppCustomBackendVTables == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    /* The order each backend is listed is what defines the priority. */
-    for (ivtable = 0; ivtable < pConfig->customBackendCount; ivtable += 1) {
-        const ma_decoding_backend_vtable* pVTable = pConfig->ppCustomBackendVTables[ivtable];
-        if (pVTable != NULL) {
-            result = ma_decoder_init_from_file_w__internal(pVTable, pConfig->pCustomBackendUserData, pFilePath, pConfig, pDecoder);
-            if (result == MA_SUCCESS) {
-                return MA_SUCCESS;
-            }
-        } else {
-            /* No vtable. */
-        }
-    }
-
-    /* Getting here means we couldn't find a backend. */
-    return MA_NO_BACKEND;
-}
-
-static ma_result ma_decoder_init_custom_from_memory__internal(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result = MA_NO_BACKEND;
-    size_t ivtable;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pConfig->ppCustomBackendVTables == NULL) {
-        return MA_NO_BACKEND;
-    }
-
-    /* The order each backend is listed is what defines the priority. */
-    for (ivtable = 0; ivtable < pConfig->customBackendCount; ivtable += 1) {
-        const ma_decoding_backend_vtable* pVTable = pConfig->ppCustomBackendVTables[ivtable];
-        if (pVTable != NULL) {
-            result = ma_decoder_init_from_memory__internal(pVTable, pConfig->pCustomBackendUserData, pData, dataSize, pConfig, pDecoder);
-            if (result == MA_SUCCESS) {
-                return MA_SUCCESS;
-            }
-        } else {
-            /* No vtable. */
-        }
-    }
-
-    /* Getting here means we couldn't find a backend. */
-    return MA_NO_BACKEND;
-}
-
-
-/* WAV */
-#ifdef ma_dr_wav_h
-#define MA_HAS_WAV
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_read_proc onRead;
-    ma_seek_proc onSeek;
-    ma_tell_proc onTell;
-    void* pReadSeekTellUserData;
-    ma_format format;           /* Can be f32, s16 or s32. */
-#if !defined(MA_NO_WAV)
-    ma_dr_wav dr;
-#endif
-} ma_wav;
-
-MA_API ma_result ma_wav_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav);
-MA_API ma_result ma_wav_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav);
-MA_API ma_result ma_wav_init_file_w(const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav);
-MA_API ma_result ma_wav_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav);
-MA_API void ma_wav_uninit(ma_wav* pWav, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_wav_read_pcm_frames(ma_wav* pWav, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_wav_seek_to_pcm_frame(ma_wav* pWav, ma_uint64 frameIndex);
-MA_API ma_result ma_wav_get_data_format(ma_wav* pWav, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_wav_get_cursor_in_pcm_frames(ma_wav* pWav, ma_uint64* pCursor);
-MA_API ma_result ma_wav_get_length_in_pcm_frames(ma_wav* pWav, ma_uint64* pLength);
-
-
-static ma_result ma_wav_ds_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_wav_read_pcm_frames((ma_wav*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_wav_ds_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_wav_seek_to_pcm_frame((ma_wav*)pDataSource, frameIndex);
-}
-
-static ma_result ma_wav_ds_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    return ma_wav_get_data_format((ma_wav*)pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-static ma_result ma_wav_ds_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_wav_get_cursor_in_pcm_frames((ma_wav*)pDataSource, pCursor);
-}
-
-static ma_result ma_wav_ds_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_wav_get_length_in_pcm_frames((ma_wav*)pDataSource, pLength);
-}
-
-static ma_data_source_vtable g_ma_wav_ds_vtable =
-{
-    ma_wav_ds_read,
-    ma_wav_ds_seek,
-    ma_wav_ds_get_data_format,
-    ma_wav_ds_get_cursor,
-    ma_wav_ds_get_length,
-    NULL,   /* onSetLooping */
-    0
-};
-
-
-#if !defined(MA_NO_WAV)
-static size_t ma_wav_dr_callback__read(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    ma_wav* pWav = (ma_wav*)pUserData;
-    ma_result result;
-    size_t bytesRead;
-
-    MA_ASSERT(pWav != NULL);
-
-    result = pWav->onRead(pWav->pReadSeekTellUserData, pBufferOut, bytesToRead, &bytesRead);
-    (void)result;
-
-    return bytesRead;
-}
-
-static ma_bool32 ma_wav_dr_callback__seek(void* pUserData, int offset, ma_dr_wav_seek_origin origin)
-{
-    ma_wav* pWav = (ma_wav*)pUserData;
-    ma_result result;
-    ma_seek_origin maSeekOrigin;
-
-    MA_ASSERT(pWav != NULL);
-
-    maSeekOrigin = ma_seek_origin_start;
-    if (origin == ma_dr_wav_seek_origin_current) {
-        maSeekOrigin =  ma_seek_origin_current;
-    }
-
-    result = pWav->onSeek(pWav->pReadSeekTellUserData, offset, maSeekOrigin);
-    if (result != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-
-    return MA_TRUE;
-}
-#endif
-
-static ma_result ma_wav_init_internal(const ma_decoding_backend_config* pConfig, ma_wav* pWav)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pWav);
-    pWav->format = ma_format_unknown;   /* Use closest match to source file by default. */
-
-    if (pConfig != NULL && (pConfig->preferredFormat == ma_format_f32 || pConfig->preferredFormat == ma_format_s16 || pConfig->preferredFormat == ma_format_s32)) {
-        pWav->format = pConfig->preferredFormat;
-    } else {
-        /* Getting here means something other than f32 and s16 was specified. Just leave this unset to use the default format. */
-    }
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_wav_ds_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pWav->ds);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the base data source. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_wav_post_init(ma_wav* pWav)
-{
-    /*
-    If an explicit format was not specified, try picking the closest match based on the internal
-    format. The format needs to be supported by miniaudio.
-    */
-    if (pWav->format == ma_format_unknown) {
-        switch (pWav->dr.translatedFormatTag)
-        {
-            case MA_DR_WAVE_FORMAT_PCM:
-            {
-                if (pWav->dr.bitsPerSample == 8) {
-                    pWav->format = ma_format_u8;
-                } else if (pWav->dr.bitsPerSample == 16) {
-                    pWav->format = ma_format_s16;
-                } else if (pWav->dr.bitsPerSample == 24) {
-                    pWav->format = ma_format_s24;
-                } else if (pWav->dr.bitsPerSample == 32) {
-                    pWav->format = ma_format_s32;
-                }
-            } break;
-
-            case MA_DR_WAVE_FORMAT_IEEE_FLOAT:
-            {
-                if (pWav->dr.bitsPerSample == 32) {
-                    pWav->format = ma_format_f32;
-                }
-            } break;
-
-            default: break;
-        }
-
-        /* Fall back to f32 if we couldn't find anything. */
-        if (pWav->format == ma_format_unknown) {
-            pWav->format =  ma_format_f32;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_wav_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav)
-{
-    ma_result result;
-
-    result = ma_wav_init_internal(pConfig, pWav);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (onRead == NULL || onSeek == NULL) {
-        return MA_INVALID_ARGS; /* onRead and onSeek are mandatory. */
-    }
-
-    pWav->onRead = onRead;
-    pWav->onSeek = onSeek;
-    pWav->onTell = onTell;
-    pWav->pReadSeekTellUserData = pReadSeekTellUserData;
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_bool32 wavResult;
-
-        wavResult = ma_dr_wav_init(&pWav->dr, ma_wav_dr_callback__read, ma_wav_dr_callback__seek, pWav, pAllocationCallbacks);
-        if (wavResult != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_wav_post_init(pWav);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. */
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_wav_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav)
-{
-    ma_result result;
-
-    result = ma_wav_init_internal(pConfig, pWav);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_bool32 wavResult;
-
-        wavResult = ma_dr_wav_init_file(&pWav->dr, pFilePath, pAllocationCallbacks);
-        if (wavResult != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_wav_post_init(pWav);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. */
-        (void)pFilePath;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_wav_init_file_w(const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav)
-{
-    ma_result result;
-
-    result = ma_wav_init_internal(pConfig, pWav);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_bool32 wavResult;
-
-        wavResult = ma_dr_wav_init_file_w(&pWav->dr, pFilePath, pAllocationCallbacks);
-        if (wavResult != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_wav_post_init(pWav);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. */
-        (void)pFilePath;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_wav_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_wav* pWav)
-{
-    ma_result result;
-
-    result = ma_wav_init_internal(pConfig, pWav);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_bool32 wavResult;
-
-        wavResult = ma_dr_wav_init_memory(&pWav->dr, pData, dataSize, pAllocationCallbacks);
-        if (wavResult != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_wav_post_init(pWav);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. */
-        (void)pData;
-        (void)dataSize;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API void ma_wav_uninit(ma_wav* pWav, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pWav == NULL) {
-        return;
-    }
-
-    (void)pAllocationCallbacks;
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_dr_wav_uninit(&pWav->dr);
-    }
-    #else
-    {
-        /* wav is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-    }
-    #endif
-
-    ma_data_source_uninit(&pWav->ds);
-}
-
-MA_API ma_result ma_wav_read_pcm_frames(ma_wav* pWav, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        /* We always use floating point format. */
-        ma_result result = MA_SUCCESS;  /* Must be initialized to MA_SUCCESS. */
-        ma_uint64 totalFramesRead = 0;
-        ma_format format;
-
-        ma_wav_get_data_format(pWav, &format, NULL, NULL, NULL, 0);
-
-        switch (format)
-        {
-            case ma_format_f32:
-            {
-                totalFramesRead = ma_dr_wav_read_pcm_frames_f32(&pWav->dr, frameCount, (float*)pFramesOut);
-            } break;
-
-            case ma_format_s16:
-            {
-                totalFramesRead = ma_dr_wav_read_pcm_frames_s16(&pWav->dr, frameCount, (ma_int16*)pFramesOut);
-            } break;
-
-            case ma_format_s32:
-            {
-                totalFramesRead = ma_dr_wav_read_pcm_frames_s32(&pWav->dr, frameCount, (ma_int32*)pFramesOut);
-            } break;
-
-            /* Fallback to a raw read. */
-            case ma_format_unknown: return MA_INVALID_OPERATION; /* <-- this should never be hit because initialization would just fall back to a supported format. */
-            default:
-            {
-                totalFramesRead = ma_dr_wav_read_pcm_frames(&pWav->dr, frameCount, pFramesOut);
-            } break;
-        }
-
-        /* In the future we'll update ma_dr_wav to return MA_AT_END for us. */
-        if (totalFramesRead == 0) {
-            result = MA_AT_END;
-        }
-
-        if (pFramesRead != NULL) {
-            *pFramesRead = totalFramesRead;
-        }
-
-        if (result == MA_SUCCESS && totalFramesRead == 0) {
-            result  = MA_AT_END;
-        }
-
-        return result;
-    }
-    #else
-    {
-        /* wav is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)pFramesOut;
-        (void)frameCount;
-        (void)pFramesRead;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_wav_seek_to_pcm_frame(ma_wav* pWav, ma_uint64 frameIndex)
-{
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_bool32 wavResult;
-
-        wavResult = ma_dr_wav_seek_to_pcm_frame(&pWav->dr, frameIndex);
-        if (wavResult != MA_TRUE) {
-            return MA_ERROR;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)frameIndex;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_wav_get_data_format(ma_wav* pWav, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    /* Defaults for safety. */
-    if (pFormat != NULL) {
-        *pFormat = ma_format_unknown;
-    }
-    if (pChannels != NULL) {
-        *pChannels = 0;
-    }
-    if (pSampleRate != NULL) {
-        *pSampleRate = 0;
-    }
-    if (pChannelMap != NULL) {
-        MA_ZERO_MEMORY(pChannelMap, sizeof(*pChannelMap) * channelMapCap);
-    }
-
-    if (pWav == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pFormat != NULL) {
-        *pFormat = pWav->format;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        if (pChannels != NULL) {
-            *pChannels = pWav->dr.channels;
-        }
-
-        if (pSampleRate != NULL) {
-            *pSampleRate = pWav->dr.sampleRate;
-        }
-
-        if (pChannelMap != NULL) {
-            ma_channel_map_init_standard(ma_standard_channel_map_microsoft, pChannelMap, channelMapCap, pWav->dr.channels);
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_wav_get_cursor_in_pcm_frames(ma_wav* pWav, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;   /* Safety. */
-
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_result wavResult = ma_dr_wav_get_cursor_in_pcm_frames(&pWav->dr, pCursor);
-        if (wavResult != MA_SUCCESS) {
-            return (ma_result)wavResult;    /* ma_dr_wav result codes map to miniaudio's. */
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_wav_get_length_in_pcm_frames(ma_wav* pWav, ma_uint64* pLength)
-{
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;   /* Safety. */
-
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_WAV)
-    {
-        ma_result wavResult = ma_dr_wav_get_length_in_pcm_frames(&pWav->dr, pLength);
-        if (wavResult != MA_SUCCESS) {
-            return (ma_result)wavResult;    /* ma_dr_wav result codes map to miniaudio's. */
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* wav is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-
-static ma_result ma_decoding_backend_init__wav(void* pUserData, ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_wav* pWav;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pWav = (ma_wav*)ma_malloc(sizeof(*pWav), pAllocationCallbacks);
-    if (pWav == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_wav_init(onRead, onSeek, onTell, pReadSeekTellUserData, pConfig, pAllocationCallbacks, pWav);
-    if (result != MA_SUCCESS) {
-        ma_free(pWav, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pWav;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_file__wav(void* pUserData, const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_wav* pWav;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pWav = (ma_wav*)ma_malloc(sizeof(*pWav), pAllocationCallbacks);
-    if (pWav == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_wav_init_file(pFilePath, pConfig, pAllocationCallbacks, pWav);
-    if (result != MA_SUCCESS) {
-        ma_free(pWav, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pWav;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_file_w__wav(void* pUserData, const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_wav* pWav;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pWav = (ma_wav*)ma_malloc(sizeof(*pWav), pAllocationCallbacks);
-    if (pWav == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_wav_init_file_w(pFilePath, pConfig, pAllocationCallbacks, pWav);
-    if (result != MA_SUCCESS) {
-        ma_free(pWav, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pWav;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_memory__wav(void* pUserData, const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_wav* pWav;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pWav = (ma_wav*)ma_malloc(sizeof(*pWav), pAllocationCallbacks);
-    if (pWav == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_wav_init_memory(pData, dataSize, pConfig, pAllocationCallbacks, pWav);
-    if (result != MA_SUCCESS) {
-        ma_free(pWav, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pWav;
-
-    return MA_SUCCESS;
-}
-
-static void ma_decoding_backend_uninit__wav(void* pUserData, ma_data_source* pBackend, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_wav* pWav = (ma_wav*)pBackend;
-
-    (void)pUserData;
-
-    ma_wav_uninit(pWav, pAllocationCallbacks);
-    ma_free(pWav, pAllocationCallbacks);
-}
-
-static ma_decoding_backend_vtable g_ma_decoding_backend_vtable_wav =
-{
-    ma_decoding_backend_init__wav,
-    ma_decoding_backend_init_file__wav,
-    ma_decoding_backend_init_file_w__wav,
-    ma_decoding_backend_init_memory__wav,
-    ma_decoding_backend_uninit__wav
-};
-
-static ma_result ma_decoder_init_wav__internal(const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_vtable__internal(&g_ma_decoding_backend_vtable_wav, NULL, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_wav_from_file__internal(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file__internal(&g_ma_decoding_backend_vtable_wav, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_wav_from_file_w__internal(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file_w__internal(&g_ma_decoding_backend_vtable_wav, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_wav_from_memory__internal(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_memory__internal(&g_ma_decoding_backend_vtable_wav, NULL, pData, dataSize, pConfig, pDecoder);
-}
-#endif  /* ma_dr_wav_h */
-
-/* FLAC */
-#ifdef ma_dr_flac_h
-#define MA_HAS_FLAC
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_read_proc onRead;
-    ma_seek_proc onSeek;
-    ma_tell_proc onTell;
-    void* pReadSeekTellUserData;
-    ma_format format;           /* Can be f32, s16 or s32. */
-#if !defined(MA_NO_FLAC)
-    ma_dr_flac* dr;
-#endif
-} ma_flac;
-
-MA_API ma_result ma_flac_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac);
-MA_API ma_result ma_flac_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac);
-MA_API ma_result ma_flac_init_file_w(const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac);
-MA_API ma_result ma_flac_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac);
-MA_API void ma_flac_uninit(ma_flac* pFlac, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_flac_read_pcm_frames(ma_flac* pFlac, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_flac_seek_to_pcm_frame(ma_flac* pFlac, ma_uint64 frameIndex);
-MA_API ma_result ma_flac_get_data_format(ma_flac* pFlac, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_flac_get_cursor_in_pcm_frames(ma_flac* pFlac, ma_uint64* pCursor);
-MA_API ma_result ma_flac_get_length_in_pcm_frames(ma_flac* pFlac, ma_uint64* pLength);
-
-
-static ma_result ma_flac_ds_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_flac_read_pcm_frames((ma_flac*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_flac_ds_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_flac_seek_to_pcm_frame((ma_flac*)pDataSource, frameIndex);
-}
-
-static ma_result ma_flac_ds_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    return ma_flac_get_data_format((ma_flac*)pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-static ma_result ma_flac_ds_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_flac_get_cursor_in_pcm_frames((ma_flac*)pDataSource, pCursor);
-}
-
-static ma_result ma_flac_ds_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_flac_get_length_in_pcm_frames((ma_flac*)pDataSource, pLength);
-}
-
-static ma_data_source_vtable g_ma_flac_ds_vtable =
-{
-    ma_flac_ds_read,
-    ma_flac_ds_seek,
-    ma_flac_ds_get_data_format,
-    ma_flac_ds_get_cursor,
-    ma_flac_ds_get_length,
-    NULL,   /* onSetLooping */
-    0
-};
-
-
-#if !defined(MA_NO_FLAC)
-static size_t ma_flac_dr_callback__read(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    ma_flac* pFlac = (ma_flac*)pUserData;
-    ma_result result;
-    size_t bytesRead;
-
-    MA_ASSERT(pFlac != NULL);
-
-    result = pFlac->onRead(pFlac->pReadSeekTellUserData, pBufferOut, bytesToRead, &bytesRead);
-    (void)result;
-
-    return bytesRead;
-}
-
-static ma_bool32 ma_flac_dr_callback__seek(void* pUserData, int offset, ma_dr_flac_seek_origin origin)
-{
-    ma_flac* pFlac = (ma_flac*)pUserData;
-    ma_result result;
-    ma_seek_origin maSeekOrigin;
-
-    MA_ASSERT(pFlac != NULL);
-
-    maSeekOrigin = ma_seek_origin_start;
-    if (origin == ma_dr_flac_seek_origin_current) {
-        maSeekOrigin =  ma_seek_origin_current;
-    }
-
-    result = pFlac->onSeek(pFlac->pReadSeekTellUserData, offset, maSeekOrigin);
-    if (result != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-
-    return MA_TRUE;
-}
-#endif
-
-static ma_result ma_flac_init_internal(const ma_decoding_backend_config* pConfig, ma_flac* pFlac)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    if (pFlac == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pFlac);
-    pFlac->format = ma_format_f32;    /* f32 by default. */
-
-    if (pConfig != NULL && (pConfig->preferredFormat == ma_format_f32 || pConfig->preferredFormat == ma_format_s16 || pConfig->preferredFormat == ma_format_s32)) {
-        pFlac->format = pConfig->preferredFormat;
-    } else {
-        /* Getting here means something other than f32 and s16 was specified. Just leave this unset to use the default format. */
-    }
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_flac_ds_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pFlac->ds);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the base data source. */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_flac_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac)
-{
-    ma_result result;
-
-    result = ma_flac_init_internal(pConfig, pFlac);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (onRead == NULL || onSeek == NULL) {
-        return MA_INVALID_ARGS; /* onRead and onSeek are mandatory. */
-    }
-
-    pFlac->onRead = onRead;
-    pFlac->onSeek = onSeek;
-    pFlac->onTell = onTell;
-    pFlac->pReadSeekTellUserData = pReadSeekTellUserData;
-
-    #if !defined(MA_NO_FLAC)
-    {
-        pFlac->dr = ma_dr_flac_open(ma_flac_dr_callback__read, ma_flac_dr_callback__seek, pFlac, pAllocationCallbacks);
-        if (pFlac->dr == NULL) {
-            return MA_INVALID_FILE;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. */
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_flac_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac)
-{
-    ma_result result;
-
-    result = ma_flac_init_internal(pConfig, pFlac);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        pFlac->dr = ma_dr_flac_open_file(pFilePath, pAllocationCallbacks);
-        if (pFlac->dr == NULL) {
-            return MA_INVALID_FILE;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. */
-        (void)pFilePath;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_flac_init_file_w(const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac)
-{
-    ma_result result;
-
-    result = ma_flac_init_internal(pConfig, pFlac);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        pFlac->dr = ma_dr_flac_open_file_w(pFilePath, pAllocationCallbacks);
-        if (pFlac->dr == NULL) {
-            return MA_INVALID_FILE;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. */
-        (void)pFilePath;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_flac_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_flac* pFlac)
-{
-    ma_result result;
-
-    result = ma_flac_init_internal(pConfig, pFlac);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        pFlac->dr = ma_dr_flac_open_memory(pData, dataSize, pAllocationCallbacks);
-        if (pFlac->dr == NULL) {
-            return MA_INVALID_FILE;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. */
-        (void)pData;
-        (void)dataSize;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API void ma_flac_uninit(ma_flac* pFlac, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFlac == NULL) {
-        return;
-    }
-
-    (void)pAllocationCallbacks;
-
-    #if !defined(MA_NO_FLAC)
-    {
-        ma_dr_flac_close(pFlac->dr);
-    }
-    #else
-    {
-        /* flac is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-    }
-    #endif
-
-    ma_data_source_uninit(&pFlac->ds);
-}
-
-MA_API ma_result ma_flac_read_pcm_frames(ma_flac* pFlac, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFlac == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        /* We always use floating point format. */
-        ma_result result = MA_SUCCESS;  /* Must be initialized to MA_SUCCESS. */
-        ma_uint64 totalFramesRead = 0;
-        ma_format format;
-
-        ma_flac_get_data_format(pFlac, &format, NULL, NULL, NULL, 0);
-
-        switch (format)
-        {
-            case ma_format_f32:
-            {
-                totalFramesRead = ma_dr_flac_read_pcm_frames_f32(pFlac->dr, frameCount, (float*)pFramesOut);
-            } break;
-
-            case ma_format_s16:
-            {
-                totalFramesRead = ma_dr_flac_read_pcm_frames_s16(pFlac->dr, frameCount, (ma_int16*)pFramesOut);
-            } break;
-
-            case ma_format_s32:
-            {
-                totalFramesRead = ma_dr_flac_read_pcm_frames_s32(pFlac->dr, frameCount, (ma_int32*)pFramesOut);
-            } break;
-
-            case ma_format_u8:
-            case ma_format_s24:
-            case ma_format_unknown:
-            default:
-            {
-                return MA_INVALID_OPERATION;
-            };
-        }
-
-        /* In the future we'll update ma_dr_flac to return MA_AT_END for us. */
-        if (totalFramesRead == 0) {
-            result = MA_AT_END;
-        }
-
-        if (pFramesRead != NULL) {
-            *pFramesRead = totalFramesRead;
-        }
-
-        if (result == MA_SUCCESS && totalFramesRead == 0) {
-            result  = MA_AT_END;
-        }
-
-        return result;
-    }
-    #else
-    {
-        /* flac is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)pFramesOut;
-        (void)frameCount;
-        (void)pFramesRead;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_flac_seek_to_pcm_frame(ma_flac* pFlac, ma_uint64 frameIndex)
-{
-    if (pFlac == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        ma_bool32 flacResult;
-
-        flacResult = ma_dr_flac_seek_to_pcm_frame(pFlac->dr, frameIndex);
-        if (flacResult != MA_TRUE) {
-            return MA_ERROR;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)frameIndex;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_flac_get_data_format(ma_flac* pFlac, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    /* Defaults for safety. */
-    if (pFormat != NULL) {
-        *pFormat = ma_format_unknown;
-    }
-    if (pChannels != NULL) {
-        *pChannels = 0;
-    }
-    if (pSampleRate != NULL) {
-        *pSampleRate = 0;
-    }
-    if (pChannelMap != NULL) {
-        MA_ZERO_MEMORY(pChannelMap, sizeof(*pChannelMap) * channelMapCap);
-    }
-
-    if (pFlac == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pFormat != NULL) {
-        *pFormat = pFlac->format;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        if (pChannels != NULL) {
-            *pChannels = pFlac->dr->channels;
-        }
-
-        if (pSampleRate != NULL) {
-            *pSampleRate = pFlac->dr->sampleRate;
-        }
-
-        if (pChannelMap != NULL) {
-            ma_channel_map_init_standard(ma_standard_channel_map_microsoft, pChannelMap, channelMapCap, pFlac->dr->channels);
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_flac_get_cursor_in_pcm_frames(ma_flac* pFlac, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;   /* Safety. */
-
-    if (pFlac == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        *pCursor = pFlac->dr->currentPCMFrame;
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_flac_get_length_in_pcm_frames(ma_flac* pFlac, ma_uint64* pLength)
-{
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;   /* Safety. */
-
-    if (pFlac == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_FLAC)
-    {
-        *pLength = pFlac->dr->totalPCMFrameCount;
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* flac is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-
-static ma_result ma_decoding_backend_init__flac(void* pUserData, ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_flac* pFlac;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pFlac = (ma_flac*)ma_malloc(sizeof(*pFlac), pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_flac_init(onRead, onSeek, onTell, pReadSeekTellUserData, pConfig, pAllocationCallbacks, pFlac);
-    if (result != MA_SUCCESS) {
-        ma_free(pFlac, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pFlac;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_file__flac(void* pUserData, const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_flac* pFlac;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pFlac = (ma_flac*)ma_malloc(sizeof(*pFlac), pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_flac_init_file(pFilePath, pConfig, pAllocationCallbacks, pFlac);
-    if (result != MA_SUCCESS) {
-        ma_free(pFlac, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pFlac;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_file_w__flac(void* pUserData, const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_flac* pFlac;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pFlac = (ma_flac*)ma_malloc(sizeof(*pFlac), pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_flac_init_file_w(pFilePath, pConfig, pAllocationCallbacks, pFlac);
-    if (result != MA_SUCCESS) {
-        ma_free(pFlac, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pFlac;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_memory__flac(void* pUserData, const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_flac* pFlac;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pFlac = (ma_flac*)ma_malloc(sizeof(*pFlac), pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_flac_init_memory(pData, dataSize, pConfig, pAllocationCallbacks, pFlac);
-    if (result != MA_SUCCESS) {
-        ma_free(pFlac, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pFlac;
-
-    return MA_SUCCESS;
-}
-
-static void ma_decoding_backend_uninit__flac(void* pUserData, ma_data_source* pBackend, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_flac* pFlac = (ma_flac*)pBackend;
-
-    (void)pUserData;
-
-    ma_flac_uninit(pFlac, pAllocationCallbacks);
-    ma_free(pFlac, pAllocationCallbacks);
-}
-
-static ma_decoding_backend_vtable g_ma_decoding_backend_vtable_flac =
-{
-    ma_decoding_backend_init__flac,
-    ma_decoding_backend_init_file__flac,
-    ma_decoding_backend_init_file_w__flac,
-    ma_decoding_backend_init_memory__flac,
-    ma_decoding_backend_uninit__flac
-};
-
-static ma_result ma_decoder_init_flac__internal(const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_vtable__internal(&g_ma_decoding_backend_vtable_flac, NULL, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_flac_from_file__internal(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file__internal(&g_ma_decoding_backend_vtable_flac, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_flac_from_file_w__internal(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file_w__internal(&g_ma_decoding_backend_vtable_flac, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_flac_from_memory__internal(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_memory__internal(&g_ma_decoding_backend_vtable_flac, NULL, pData, dataSize, pConfig, pDecoder);
-}
-#endif  /* ma_dr_flac_h */
-
-/* MP3 */
-#ifdef ma_dr_mp3_h
-#define MA_HAS_MP3
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_read_proc onRead;
-    ma_seek_proc onSeek;
-    ma_tell_proc onTell;
-    void* pReadSeekTellUserData;
-    ma_format format;           /* Can be f32 or s16. */
-#if !defined(MA_NO_MP3)
-    ma_dr_mp3 dr;
-    ma_uint32 seekPointCount;
-    ma_dr_mp3_seek_point* pSeekPoints;  /* Only used if seek table generation is used. */
-#endif
-} ma_mp3;
-
-MA_API ma_result ma_mp3_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3);
-MA_API ma_result ma_mp3_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3);
-MA_API ma_result ma_mp3_init_file_w(const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3);
-MA_API ma_result ma_mp3_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3);
-MA_API void ma_mp3_uninit(ma_mp3* pMP3, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_mp3_read_pcm_frames(ma_mp3* pMP3, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_mp3_seek_to_pcm_frame(ma_mp3* pMP3, ma_uint64 frameIndex);
-MA_API ma_result ma_mp3_get_data_format(ma_mp3* pMP3, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_mp3_get_cursor_in_pcm_frames(ma_mp3* pMP3, ma_uint64* pCursor);
-MA_API ma_result ma_mp3_get_length_in_pcm_frames(ma_mp3* pMP3, ma_uint64* pLength);
-
-
-static ma_result ma_mp3_ds_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_mp3_read_pcm_frames((ma_mp3*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_mp3_ds_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_mp3_seek_to_pcm_frame((ma_mp3*)pDataSource, frameIndex);
-}
-
-static ma_result ma_mp3_ds_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    return ma_mp3_get_data_format((ma_mp3*)pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-static ma_result ma_mp3_ds_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_mp3_get_cursor_in_pcm_frames((ma_mp3*)pDataSource, pCursor);
-}
-
-static ma_result ma_mp3_ds_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_mp3_get_length_in_pcm_frames((ma_mp3*)pDataSource, pLength);
-}
-
-static ma_data_source_vtable g_ma_mp3_ds_vtable =
-{
-    ma_mp3_ds_read,
-    ma_mp3_ds_seek,
-    ma_mp3_ds_get_data_format,
-    ma_mp3_ds_get_cursor,
-    ma_mp3_ds_get_length,
-    NULL,   /* onSetLooping */
-    0
-};
-
-
-#if !defined(MA_NO_MP3)
-static size_t ma_mp3_dr_callback__read(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    ma_mp3* pMP3 = (ma_mp3*)pUserData;
-    ma_result result;
-    size_t bytesRead;
-
-    MA_ASSERT(pMP3 != NULL);
-
-    result = pMP3->onRead(pMP3->pReadSeekTellUserData, pBufferOut, bytesToRead, &bytesRead);
-    (void)result;
-
-    return bytesRead;
-}
-
-static ma_bool32 ma_mp3_dr_callback__seek(void* pUserData, int offset, ma_dr_mp3_seek_origin origin)
-{
-    ma_mp3* pMP3 = (ma_mp3*)pUserData;
-    ma_result result;
-    ma_seek_origin maSeekOrigin;
-
-    MA_ASSERT(pMP3 != NULL);
-
-    maSeekOrigin = ma_seek_origin_start;
-    if (origin == ma_dr_mp3_seek_origin_current) {
-        maSeekOrigin =  ma_seek_origin_current;
-    }
-
-    result = pMP3->onSeek(pMP3->pReadSeekTellUserData, offset, maSeekOrigin);
-    if (result != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-
-    return MA_TRUE;
-}
-#endif
-
-static ma_result ma_mp3_init_internal(const ma_decoding_backend_config* pConfig, ma_mp3* pMP3)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    if (pMP3 == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pMP3);
-    pMP3->format = ma_format_f32;    /* f32 by default. */
-
-    if (pConfig != NULL && (pConfig->preferredFormat == ma_format_f32 || pConfig->preferredFormat == ma_format_s16)) {
-        pMP3->format = pConfig->preferredFormat;
-    } else {
-        /* Getting here means something other than f32 and s16 was specified. Just leave this unset to use the default format. */
-    }
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_mp3_ds_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pMP3->ds);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the base data source. */
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_mp3_generate_seek_table(ma_mp3* pMP3, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_bool32 mp3Result;
-    ma_uint32 seekPointCount = 0;
-    ma_dr_mp3_seek_point* pSeekPoints = NULL;
-
-    MA_ASSERT(pMP3    != NULL);
-    MA_ASSERT(pConfig != NULL);
-
-    seekPointCount = pConfig->seekPointCount;
-    if (seekPointCount > 0) {
-        pSeekPoints = (ma_dr_mp3_seek_point*)ma_malloc(sizeof(*pMP3->pSeekPoints) * seekPointCount, pAllocationCallbacks);
-        if (pSeekPoints == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    }
-
-    mp3Result = ma_dr_mp3_calculate_seek_points(&pMP3->dr, &seekPointCount, pSeekPoints);
-    if (mp3Result != MA_TRUE) {
-        ma_free(pSeekPoints, pAllocationCallbacks);
-        return MA_ERROR;
-    }
-
-    mp3Result = ma_dr_mp3_bind_seek_table(&pMP3->dr, seekPointCount, pSeekPoints);
-    if (mp3Result != MA_TRUE) {
-        ma_free(pSeekPoints, pAllocationCallbacks);
-        return MA_ERROR;
-    }
-
-    pMP3->seekPointCount = seekPointCount;
-    pMP3->pSeekPoints    = pSeekPoints;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_mp3_post_init(ma_mp3* pMP3, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_result result;
-
-    result = ma_mp3_generate_seek_table(pMP3, pConfig, pAllocationCallbacks);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_mp3_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3)
-{
-    ma_result result;
-
-    result = ma_mp3_init_internal(pConfig, pMP3);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (onRead == NULL || onSeek == NULL) {
-        return MA_INVALID_ARGS; /* onRead and onSeek are mandatory. */
-    }
-
-    pMP3->onRead = onRead;
-    pMP3->onSeek = onSeek;
-    pMP3->onTell = onTell;
-    pMP3->pReadSeekTellUserData = pReadSeekTellUserData;
-
-    #if !defined(MA_NO_MP3)
-    {
-        ma_bool32 mp3Result;
-
-        mp3Result = ma_dr_mp3_init(&pMP3->dr, ma_mp3_dr_callback__read, ma_mp3_dr_callback__seek, pMP3, pAllocationCallbacks);
-        if (mp3Result != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_mp3_post_init(pMP3, pConfig, pAllocationCallbacks);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. */
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_mp3_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3)
-{
-    ma_result result;
-
-    result = ma_mp3_init_internal(pConfig, pMP3);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        ma_bool32 mp3Result;
-
-        mp3Result = ma_dr_mp3_init_file(&pMP3->dr, pFilePath, pAllocationCallbacks);
-        if (mp3Result != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_mp3_post_init(pMP3, pConfig, pAllocationCallbacks);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. */
-        (void)pFilePath;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_mp3_init_file_w(const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3)
-{
-    ma_result result;
-
-    result = ma_mp3_init_internal(pConfig, pMP3);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        ma_bool32 mp3Result;
-
-        mp3Result = ma_dr_mp3_init_file_w(&pMP3->dr, pFilePath, pAllocationCallbacks);
-        if (mp3Result != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_mp3_post_init(pMP3, pConfig, pAllocationCallbacks);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. */
-        (void)pFilePath;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_mp3_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_mp3* pMP3)
-{
-    ma_result result;
-
-    result = ma_mp3_init_internal(pConfig, pMP3);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        ma_bool32 mp3Result;
-
-        mp3Result = ma_dr_mp3_init_memory(&pMP3->dr, pData, dataSize, pAllocationCallbacks);
-        if (mp3Result != MA_TRUE) {
-            return MA_INVALID_FILE;
-        }
-
-        ma_mp3_post_init(pMP3, pConfig, pAllocationCallbacks);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. */
-        (void)pData;
-        (void)dataSize;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API void ma_mp3_uninit(ma_mp3* pMP3, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pMP3 == NULL) {
-        return;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        ma_dr_mp3_uninit(&pMP3->dr);
-    }
-    #else
-    {
-        /* mp3 is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-    }
-    #endif
-
-    /* Seek points need to be freed after the MP3 decoder has been uninitialized to ensure they're no longer being referenced. */
-    ma_free(pMP3->pSeekPoints, pAllocationCallbacks);
-
-    ma_data_source_uninit(&pMP3->ds);
-}
-
-MA_API ma_result ma_mp3_read_pcm_frames(ma_mp3* pMP3, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pMP3 == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        /* We always use floating point format. */
-        ma_result result = MA_SUCCESS;  /* Must be initialized to MA_SUCCESS. */
-        ma_uint64 totalFramesRead = 0;
-        ma_format format;
-
-        ma_mp3_get_data_format(pMP3, &format, NULL, NULL, NULL, 0);
-
-        switch (format)
-        {
-            case ma_format_f32:
-            {
-                totalFramesRead = ma_dr_mp3_read_pcm_frames_f32(&pMP3->dr, frameCount, (float*)pFramesOut);
-            } break;
-
-            case ma_format_s16:
-            {
-                totalFramesRead = ma_dr_mp3_read_pcm_frames_s16(&pMP3->dr, frameCount, (ma_int16*)pFramesOut);
-            } break;
-
-            case ma_format_u8:
-            case ma_format_s24:
-            case ma_format_s32:
-            case ma_format_unknown:
-            default:
-            {
-                return MA_INVALID_OPERATION;
-            };
-        }
-
-        /* In the future we'll update ma_dr_mp3 to return MA_AT_END for us. */
-        if (totalFramesRead == 0) {
-            result = MA_AT_END;
-        }
-
-        if (pFramesRead != NULL) {
-            *pFramesRead = totalFramesRead;
-        }
-
-        return result;
-    }
-    #else
-    {
-        /* mp3 is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)pFramesOut;
-        (void)frameCount;
-        (void)pFramesRead;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_mp3_seek_to_pcm_frame(ma_mp3* pMP3, ma_uint64 frameIndex)
-{
-    if (pMP3 == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        ma_bool32 mp3Result;
-
-        mp3Result = ma_dr_mp3_seek_to_pcm_frame(&pMP3->dr, frameIndex);
-        if (mp3Result != MA_TRUE) {
-            return MA_ERROR;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)frameIndex;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_mp3_get_data_format(ma_mp3* pMP3, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    /* Defaults for safety. */
-    if (pFormat != NULL) {
-        *pFormat = ma_format_unknown;
-    }
-    if (pChannels != NULL) {
-        *pChannels = 0;
-    }
-    if (pSampleRate != NULL) {
-        *pSampleRate = 0;
-    }
-    if (pChannelMap != NULL) {
-        MA_ZERO_MEMORY(pChannelMap, sizeof(*pChannelMap) * channelMapCap);
-    }
-
-    if (pMP3 == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pFormat != NULL) {
-        *pFormat = pMP3->format;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        if (pChannels != NULL) {
-            *pChannels = pMP3->dr.channels;
-        }
-
-        if (pSampleRate != NULL) {
-            *pSampleRate = pMP3->dr.sampleRate;
-        }
-
-        if (pChannelMap != NULL) {
-            ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pMP3->dr.channels);
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_mp3_get_cursor_in_pcm_frames(ma_mp3* pMP3, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;   /* Safety. */
-
-    if (pMP3 == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        *pCursor = pMP3->dr.currentPCMFrame;
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_mp3_get_length_in_pcm_frames(ma_mp3* pMP3, ma_uint64* pLength)
-{
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;   /* Safety. */
-
-    if (pMP3 == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_MP3)
-    {
-        *pLength = ma_dr_mp3_get_pcm_frame_count(&pMP3->dr);
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* mp3 is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-
-static ma_result ma_decoding_backend_init__mp3(void* pUserData, ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_mp3* pMP3;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pMP3 = (ma_mp3*)ma_malloc(sizeof(*pMP3), pAllocationCallbacks);
-    if (pMP3 == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_mp3_init(onRead, onSeek, onTell, pReadSeekTellUserData, pConfig, pAllocationCallbacks, pMP3);
-    if (result != MA_SUCCESS) {
-        ma_free(pMP3, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pMP3;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_file__mp3(void* pUserData, const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_mp3* pMP3;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pMP3 = (ma_mp3*)ma_malloc(sizeof(*pMP3), pAllocationCallbacks);
-    if (pMP3 == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_mp3_init_file(pFilePath, pConfig, pAllocationCallbacks, pMP3);
-    if (result != MA_SUCCESS) {
-        ma_free(pMP3, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pMP3;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_file_w__mp3(void* pUserData, const wchar_t* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_mp3* pMP3;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pMP3 = (ma_mp3*)ma_malloc(sizeof(*pMP3), pAllocationCallbacks);
-    if (pMP3 == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_mp3_init_file_w(pFilePath, pConfig, pAllocationCallbacks, pMP3);
-    if (result != MA_SUCCESS) {
-        ma_free(pMP3, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pMP3;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_memory__mp3(void* pUserData, const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_mp3* pMP3;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pMP3 = (ma_mp3*)ma_malloc(sizeof(*pMP3), pAllocationCallbacks);
-    if (pMP3 == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_mp3_init_memory(pData, dataSize, pConfig, pAllocationCallbacks, pMP3);
-    if (result != MA_SUCCESS) {
-        ma_free(pMP3, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pMP3;
-
-    return MA_SUCCESS;
-}
-
-static void ma_decoding_backend_uninit__mp3(void* pUserData, ma_data_source* pBackend, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_mp3* pMP3 = (ma_mp3*)pBackend;
-
-    (void)pUserData;
-
-    ma_mp3_uninit(pMP3, pAllocationCallbacks);
-    ma_free(pMP3, pAllocationCallbacks);
-}
-
-static ma_decoding_backend_vtable g_ma_decoding_backend_vtable_mp3 =
-{
-    ma_decoding_backend_init__mp3,
-    ma_decoding_backend_init_file__mp3,
-    ma_decoding_backend_init_file_w__mp3,
-    ma_decoding_backend_init_memory__mp3,
-    ma_decoding_backend_uninit__mp3
-};
-
-static ma_result ma_decoder_init_mp3__internal(const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_vtable__internal(&g_ma_decoding_backend_vtable_mp3, NULL, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_mp3_from_file__internal(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file__internal(&g_ma_decoding_backend_vtable_mp3, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_mp3_from_file_w__internal(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file_w__internal(&g_ma_decoding_backend_vtable_mp3, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_mp3_from_memory__internal(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_memory__internal(&g_ma_decoding_backend_vtable_mp3, NULL, pData, dataSize, pConfig, pDecoder);
-}
-#endif  /* ma_dr_mp3_h */
-
-/* Vorbis */
-#ifdef STB_VORBIS_INCLUDE_STB_VORBIS_H
-#define MA_HAS_VORBIS
-
-/* The size in bytes of each chunk of data to read from the Vorbis stream. */
-#define MA_VORBIS_DATA_CHUNK_SIZE  4096
-
-typedef struct
-{
-    ma_data_source_base ds;
-    ma_read_proc onRead;
-    ma_seek_proc onSeek;
-    ma_tell_proc onTell;
-    void* pReadSeekTellUserData;
-    ma_allocation_callbacks allocationCallbacks;    /* Store the allocation callbacks within the structure because we may need to dynamically expand a buffer in ma_stbvorbis_read_pcm_frames() when using push mode. */
-    ma_format format;               /* Only f32 is allowed with stb_vorbis. */
-    ma_uint32 channels;
-    ma_uint32 sampleRate;
-    ma_uint64 cursor;
-#if !defined(MA_NO_VORBIS)
-    stb_vorbis* stb;
-    ma_bool32 usingPushMode;
-    struct
-    {
-        ma_uint8* pData;
-        size_t dataSize;
-        size_t dataCapacity;
-        size_t audioStartOffsetInBytes;
-        ma_uint32 framesConsumed;   /* The number of frames consumed in ppPacketData. */
-        ma_uint32 framesRemaining;  /* The number of frames remaining in ppPacketData. */
-        float** ppPacketData;
-    } push;
-#endif
-} ma_stbvorbis;
-
-MA_API ma_result ma_stbvorbis_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_stbvorbis* pVorbis);
-MA_API ma_result ma_stbvorbis_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_stbvorbis* pVorbis);
-MA_API ma_result ma_stbvorbis_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_stbvorbis* pVorbis);
-MA_API void ma_stbvorbis_uninit(ma_stbvorbis* pVorbis, const ma_allocation_callbacks* pAllocationCallbacks);
-MA_API ma_result ma_stbvorbis_read_pcm_frames(ma_stbvorbis* pVorbis, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead);
-MA_API ma_result ma_stbvorbis_seek_to_pcm_frame(ma_stbvorbis* pVorbis, ma_uint64 frameIndex);
-MA_API ma_result ma_stbvorbis_get_data_format(ma_stbvorbis* pVorbis, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap);
-MA_API ma_result ma_stbvorbis_get_cursor_in_pcm_frames(ma_stbvorbis* pVorbis, ma_uint64* pCursor);
-MA_API ma_result ma_stbvorbis_get_length_in_pcm_frames(ma_stbvorbis* pVorbis, ma_uint64* pLength);
-
-
-static ma_result ma_stbvorbis_ds_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_stbvorbis_read_pcm_frames((ma_stbvorbis*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_stbvorbis_ds_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_stbvorbis_seek_to_pcm_frame((ma_stbvorbis*)pDataSource, frameIndex);
-}
-
-static ma_result ma_stbvorbis_ds_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    return ma_stbvorbis_get_data_format((ma_stbvorbis*)pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-static ma_result ma_stbvorbis_ds_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_stbvorbis_get_cursor_in_pcm_frames((ma_stbvorbis*)pDataSource, pCursor);
-}
-
-static ma_result ma_stbvorbis_ds_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_stbvorbis_get_length_in_pcm_frames((ma_stbvorbis*)pDataSource, pLength);
-}
-
-static ma_data_source_vtable g_ma_stbvorbis_ds_vtable =
-{
-    ma_stbvorbis_ds_read,
-    ma_stbvorbis_ds_seek,
-    ma_stbvorbis_ds_get_data_format,
-    ma_stbvorbis_ds_get_cursor,
-    ma_stbvorbis_ds_get_length,
-    NULL,   /* onSetLooping */
-    0
-};
-
-
-static ma_result ma_stbvorbis_init_internal(const ma_decoding_backend_config* pConfig, ma_stbvorbis* pVorbis)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    (void)pConfig;
-
-    if (pVorbis == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pVorbis);
-    pVorbis->format = ma_format_f32;    /* Only supporting f32. */
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_stbvorbis_ds_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pVorbis->ds);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the base data source. */
-    }
-
-    return MA_SUCCESS;
-}
-
-#if !defined(MA_NO_VORBIS)
-static ma_result ma_stbvorbis_post_init(ma_stbvorbis* pVorbis)
-{
-    stb_vorbis_info info;
-
-    MA_ASSERT(pVorbis != NULL);
-
-    info = stb_vorbis_get_info(pVorbis->stb);
-
-    pVorbis->channels   = info.channels;
-    pVorbis->sampleRate = info.sample_rate;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_stbvorbis_init_internal_decoder_push(ma_stbvorbis* pVorbis)
-{
-    ma_result result;
-    stb_vorbis* stb;
-    size_t dataSize = 0;
-    size_t dataCapacity = 0;
-    ma_uint8* pData = NULL; /* <-- Must be initialized to NULL. */
-
-    for (;;) {
-        int vorbisError;
-        int consumedDataSize;   /* <-- Fill by stb_vorbis_open_pushdata(). */
-        size_t bytesRead;
-        ma_uint8* pNewData;
-
-        /* Allocate memory for the new chunk. */
-        dataCapacity += MA_VORBIS_DATA_CHUNK_SIZE;
-        pNewData = (ma_uint8*)ma_realloc(pData, dataCapacity, &pVorbis->allocationCallbacks);
-        if (pNewData == NULL) {
-            ma_free(pData, &pVorbis->allocationCallbacks);
-            return MA_OUT_OF_MEMORY;
-        }
-
-        pData = pNewData;
-
-        /* Read in the next chunk. */
-        result = pVorbis->onRead(pVorbis->pReadSeekTellUserData, ma_offset_ptr(pData, dataSize), (dataCapacity - dataSize), &bytesRead);
-        dataSize += bytesRead;
-
-        if (result != MA_SUCCESS) {
-            ma_free(pData, &pVorbis->allocationCallbacks);
-            return result;
-        }
-
-        /* We have a maximum of 31 bits with stb_vorbis. */
-        if (dataSize > INT_MAX) {
-            ma_free(pData, &pVorbis->allocationCallbacks);
-            return MA_TOO_BIG;
-        }
-
-        stb = stb_vorbis_open_pushdata(pData, (int)dataSize, &consumedDataSize, &vorbisError, NULL);
-        if (stb != NULL) {
-            /*
-            Successfully opened the Vorbis decoder. We might have some leftover unprocessed
-            data so we'll need to move that down to the front.
-            */
-            dataSize -= (size_t)consumedDataSize;   /* Consume the data. */
-            MA_MOVE_MEMORY(pData, ma_offset_ptr(pData, consumedDataSize), dataSize);
-
-            /*
-            We need to track the start point so we can seek back to the start of the audio
-            data when seeking.
-            */
-            pVorbis->push.audioStartOffsetInBytes = consumedDataSize;
-
-            break;
-        } else {
-            /* Failed to open the decoder. */
-            if (vorbisError == VORBIS_need_more_data) {
-                continue;
-            } else {
-                ma_free(pData, &pVorbis->allocationCallbacks);
-                return MA_ERROR;   /* Failed to open the stb_vorbis decoder. */
-            }
-        }
-    }
-
-    MA_ASSERT(stb != NULL);
-    pVorbis->stb = stb;
-    pVorbis->push.pData = pData;
-    pVorbis->push.dataSize = dataSize;
-    pVorbis->push.dataCapacity = dataCapacity;
-
-    return MA_SUCCESS;
-}
-#endif
-
-MA_API ma_result ma_stbvorbis_init(ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_stbvorbis* pVorbis)
-{
-    ma_result result;
-
-    result = ma_stbvorbis_init_internal(pConfig, pVorbis);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (onRead == NULL || onSeek == NULL) {
-        return MA_INVALID_ARGS; /* onRead and onSeek are mandatory. */
-    }
-
-    pVorbis->onRead = onRead;
-    pVorbis->onSeek = onSeek;
-    pVorbis->onTell = onTell;
-    pVorbis->pReadSeekTellUserData = pReadSeekTellUserData;
-    ma_allocation_callbacks_init_copy(&pVorbis->allocationCallbacks, pAllocationCallbacks);
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        /*
-        stb_vorbis lacks a callback based API for it's pulling API which means we're stuck with the
-        pushing API. In order for us to be able to successfully initialize the decoder we need to
-        supply it with enough data. We need to keep loading data until we have enough.
-        */
-        result = ma_stbvorbis_init_internal_decoder_push(pVorbis);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pVorbis->usingPushMode = MA_TRUE;
-
-        result = ma_stbvorbis_post_init(pVorbis);
-        if (result != MA_SUCCESS) {
-            stb_vorbis_close(pVorbis->stb);
-            ma_free(pVorbis->push.pData, pAllocationCallbacks);
-            return result;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* vorbis is disabled. */
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_stbvorbis_init_file(const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_stbvorbis* pVorbis)
-{
-    ma_result result;
-
-    result = ma_stbvorbis_init_internal(pConfig, pVorbis);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        (void)pAllocationCallbacks; /* Don't know how to make use of this with stb_vorbis. */
-
-        /* We can use stb_vorbis' pull mode for file based streams. */
-        pVorbis->stb = stb_vorbis_open_filename(pFilePath, NULL, NULL);
-        if (pVorbis->stb == NULL) {
-            return MA_INVALID_FILE;
-        }
-
-        pVorbis->usingPushMode = MA_FALSE;
-
-        result = ma_stbvorbis_post_init(pVorbis);
-        if (result != MA_SUCCESS) {
-            stb_vorbis_close(pVorbis->stb);
-            return result;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* vorbis is disabled. */
-        (void)pFilePath;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_stbvorbis_init_memory(const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_stbvorbis* pVorbis)
-{
-    ma_result result;
-
-    result = ma_stbvorbis_init_internal(pConfig, pVorbis);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        (void)pAllocationCallbacks;
-
-        /* stb_vorbis uses an int as it's size specifier, restricting it to 32-bit even on 64-bit systems. *sigh*. */
-        if (dataSize > INT_MAX) {
-            return MA_TOO_BIG;
-        }
-
-        pVorbis->stb = stb_vorbis_open_memory((const unsigned char*)pData, (int)dataSize, NULL, NULL);
-        if (pVorbis->stb == NULL) {
-            return MA_INVALID_FILE;
-        }
-
-        pVorbis->usingPushMode = MA_FALSE;
-
-        result = ma_stbvorbis_post_init(pVorbis);
-        if (result != MA_SUCCESS) {
-            stb_vorbis_close(pVorbis->stb);
-            return result;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* vorbis is disabled. */
-        (void)pData;
-        (void)dataSize;
-        (void)pAllocationCallbacks;
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API void ma_stbvorbis_uninit(ma_stbvorbis* pVorbis, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pVorbis == NULL) {
-        return;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        stb_vorbis_close(pVorbis->stb);
-
-        /* We'll have to clear some memory if we're using push mode. */
-        if (pVorbis->usingPushMode) {
-            ma_free(pVorbis->push.pData, pAllocationCallbacks);
-        }
-    }
-    #else
-    {
-        /* vorbis is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-    }
-    #endif
-
-    ma_data_source_uninit(&pVorbis->ds);
-}
-
-MA_API ma_result ma_stbvorbis_read_pcm_frames(ma_stbvorbis* pVorbis, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pVorbis == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        /* We always use floating point format. */
-        ma_result result = MA_SUCCESS;  /* Must be initialized to MA_SUCCESS. */
-        ma_uint64 totalFramesRead = 0;
-        ma_format format;
-        ma_uint32 channels;
-
-        ma_stbvorbis_get_data_format(pVorbis, &format, &channels, NULL, NULL, 0);
-
-        if (format == ma_format_f32) {
-            /* We read differently depending on whether or not we're using push mode. */
-            if (pVorbis->usingPushMode) {
-                /* Push mode. This is the complex case. */
-                float* pFramesOutF32 = (float*)pFramesOut;
-
-                while (totalFramesRead < frameCount) {
-                    /* The first thing to do is read from any already-cached frames. */
-                    ma_uint32 framesToReadFromCache = (ma_uint32)ma_min(pVorbis->push.framesRemaining, (frameCount - totalFramesRead));  /* Safe cast because pVorbis->framesRemaining is 32-bit. */
-
-                    /* The output pointer can be null in which case we just treate it as a seek. */
-                    if (pFramesOut != NULL) {
-                        ma_uint64 iFrame;
-                        for (iFrame = 0; iFrame < framesToReadFromCache; iFrame += 1) {
-                            ma_uint32 iChannel;
-                            for (iChannel = 0; iChannel < pVorbis->channels; iChannel += 1) {
-                                pFramesOutF32[iChannel] = pVorbis->push.ppPacketData[iChannel][pVorbis->push.framesConsumed + iFrame];
-                            }
-
-                            pFramesOutF32 += pVorbis->channels;
-                        }
-                    }
-
-                    /* Update pointers and counters. */
-                    pVorbis->push.framesConsumed  += framesToReadFromCache;
-                    pVorbis->push.framesRemaining -= framesToReadFromCache;
-                    totalFramesRead               += framesToReadFromCache;
-
-                    /* Don't bother reading any more frames right now if we've just finished loading. */
-                    if (totalFramesRead == frameCount) {
-                        break;
-                    }
-
-                    MA_ASSERT(pVorbis->push.framesRemaining == 0);
-
-                    /* Getting here means we've run out of cached frames. We'll need to load some more. */
-                    for (;;) {
-                        int samplesRead = 0;
-                        int consumedDataSize;
-
-                        /* We need to case dataSize to an int, so make sure we can do it safely. */
-                        if (pVorbis->push.dataSize > INT_MAX) {
-                            break;  /* Too big. */
-                        }
-
-                        consumedDataSize = stb_vorbis_decode_frame_pushdata(pVorbis->stb, pVorbis->push.pData, (int)pVorbis->push.dataSize, NULL, &pVorbis->push.ppPacketData, &samplesRead);
-                        if (consumedDataSize != 0) {
-                            /* Successfully decoded a Vorbis frame. Consume the data. */
-                            pVorbis->push.dataSize -= (size_t)consumedDataSize;
-                            MA_MOVE_MEMORY(pVorbis->push.pData, ma_offset_ptr(pVorbis->push.pData, consumedDataSize), pVorbis->push.dataSize);
-
-                            pVorbis->push.framesConsumed  = 0;
-                            pVorbis->push.framesRemaining = samplesRead;
-
-                            break;
-                        } else {
-                            /* Not enough data. Read more. */
-                            size_t bytesRead;
-
-                            /* Expand the data buffer if necessary. */
-                            if (pVorbis->push.dataCapacity == pVorbis->push.dataSize) {
-                                size_t newCap = pVorbis->push.dataCapacity + MA_VORBIS_DATA_CHUNK_SIZE;
-                                ma_uint8* pNewData;
-
-                                pNewData = (ma_uint8*)ma_realloc(pVorbis->push.pData, newCap, &pVorbis->allocationCallbacks);
-                                if (pNewData == NULL) {
-                                    result = MA_OUT_OF_MEMORY;
-                                    break;
-                                }
-
-                                pVorbis->push.pData = pNewData;
-                                pVorbis->push.dataCapacity = newCap;
-                            }
-
-                            /* We should have enough room to load some data. */
-                            result = pVorbis->onRead(pVorbis->pReadSeekTellUserData, ma_offset_ptr(pVorbis->push.pData, pVorbis->push.dataSize), (pVorbis->push.dataCapacity - pVorbis->push.dataSize), &bytesRead);
-                            pVorbis->push.dataSize += bytesRead;
-
-                            if (result != MA_SUCCESS) {
-                                break;  /* Failed to read any data. Get out. */
-                            }
-                        }
-                    }
-
-                    /* If we don't have a success code at this point it means we've encounted an error or the end of the file has been reached (probably the latter). */
-                    if (result != MA_SUCCESS) {
-                        break;
-                    }
-                }
-            } else {
-                /* Pull mode. This is the simple case, but we still need to run in a loop because stb_vorbis loves using 32-bit instead of 64-bit. */
-                while (totalFramesRead < frameCount) {
-                    ma_uint64 framesRemaining = (frameCount - totalFramesRead);
-                    int framesRead;
-
-                    if (framesRemaining > INT_MAX) {
-                        framesRemaining = INT_MAX;
-                    }
-
-                    framesRead = stb_vorbis_get_samples_float_interleaved(pVorbis->stb, channels, (float*)ma_offset_pcm_frames_ptr(pFramesOut, totalFramesRead, format, channels), (int)framesRemaining * channels);   /* Safe cast. */
-                    totalFramesRead += framesRead;
-
-                    if (framesRead < (int)framesRemaining) {
-                        break;  /* Nothing left to read. Get out. */
-                    }
-                }
-            }
-        } else {
-            result = MA_INVALID_ARGS;
-        }
-
-        pVorbis->cursor += totalFramesRead;
-
-        if (totalFramesRead == 0) {
-            result = MA_AT_END;
-        }
-
-        if (pFramesRead != NULL) {
-            *pFramesRead = totalFramesRead;
-        }
-
-        if (result == MA_SUCCESS && totalFramesRead == 0) {
-            result  = MA_AT_END;
-        }
-
-        return result;
-    }
-    #else
-    {
-        /* vorbis is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)pFramesOut;
-        (void)frameCount;
-        (void)pFramesRead;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_stbvorbis_seek_to_pcm_frame(ma_stbvorbis* pVorbis, ma_uint64 frameIndex)
-{
-    if (pVorbis == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        /* Different seeking methods depending on whether or not we're using push mode. */
-        if (pVorbis->usingPushMode) {
-            /* Push mode. This is the complex case. */
-            ma_result result;
-            float buffer[4096];
-
-            /* If we're seeking backwards, we need to seek back to the start and then brute-force forward. */
-            if (frameIndex < pVorbis->cursor) {
-                if (frameIndex > 0x7FFFFFFF) {
-                    return MA_INVALID_ARGS; /* Trying to seek beyond the 32-bit maximum of stb_vorbis. */
-                }
-
-                /*
-                This is wildly inefficient due to me having trouble getting sample exact seeking working
-                robustly with stb_vorbis_flush_pushdata(). The only way I can think to make this work
-                perfectly is to reinitialize the decoder. Note that we only enter this path when seeking
-                backwards. This will hopefully be removed once we get our own Vorbis decoder implemented.
-                */
-                stb_vorbis_close(pVorbis->stb);
-                ma_free(pVorbis->push.pData, &pVorbis->allocationCallbacks);
-
-                MA_ZERO_OBJECT(&pVorbis->push);
-
-                /* Seek to the start of the file. */
-                result = pVorbis->onSeek(pVorbis->pReadSeekTellUserData, 0, ma_seek_origin_start);
-                if (result != MA_SUCCESS) {
-                    return result;
-                }
-
-                result = ma_stbvorbis_init_internal_decoder_push(pVorbis);
-                if (result != MA_SUCCESS) {
-                    return result;
-                }
-
-                /* At this point we should be sitting on the first frame. */
-                pVorbis->cursor = 0;
-            }
-
-            /* We're just brute-forcing this for now. */
-            while (pVorbis->cursor < frameIndex) {
-                ma_uint64 framesRead;
-                ma_uint64 framesToRead = ma_countof(buffer)/pVorbis->channels;
-                if (framesToRead > (frameIndex - pVorbis->cursor)) {
-                    framesToRead = (frameIndex - pVorbis->cursor);
-                }
-
-                result = ma_stbvorbis_read_pcm_frames(pVorbis, buffer, framesToRead, &framesRead);
-                if (result != MA_SUCCESS) {
-                    return result;
-                }
-            }
-        } else {
-            /* Pull mode. This is the simple case. */
-            int vorbisResult;
-
-            if (frameIndex > UINT_MAX) {
-                return MA_INVALID_ARGS; /* Trying to seek beyond the 32-bit maximum of stb_vorbis. */
-            }
-
-            vorbisResult = stb_vorbis_seek(pVorbis->stb, (unsigned int)frameIndex);  /* Safe cast. */
-            if (vorbisResult == 0) {
-                return MA_ERROR;    /* See failed. */
-            }
-
-            pVorbis->cursor = frameIndex;
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* vorbis is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-
-        (void)frameIndex;
-
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_stbvorbis_get_data_format(ma_stbvorbis* pVorbis, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    /* Defaults for safety. */
-    if (pFormat != NULL) {
-        *pFormat = ma_format_unknown;
-    }
-    if (pChannels != NULL) {
-        *pChannels = 0;
-    }
-    if (pSampleRate != NULL) {
-        *pSampleRate = 0;
-    }
-    if (pChannelMap != NULL) {
-        MA_ZERO_MEMORY(pChannelMap, sizeof(*pChannelMap) * channelMapCap);
-    }
-
-    if (pVorbis == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    if (pFormat != NULL) {
-        *pFormat = pVorbis->format;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        if (pChannels != NULL) {
-            *pChannels = pVorbis->channels;
-        }
-
-        if (pSampleRate != NULL) {
-            *pSampleRate = pVorbis->sampleRate;
-        }
-
-        if (pChannelMap != NULL) {
-            ma_channel_map_init_standard(ma_standard_channel_map_vorbis, pChannelMap, channelMapCap, pVorbis->channels);
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* vorbis is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_stbvorbis_get_cursor_in_pcm_frames(ma_stbvorbis* pVorbis, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;   /* Safety. */
-
-    if (pVorbis == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        *pCursor = pVorbis->cursor;
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* vorbis is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-MA_API ma_result ma_stbvorbis_get_length_in_pcm_frames(ma_stbvorbis* pVorbis, ma_uint64* pLength)
-{
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;   /* Safety. */
-
-    if (pVorbis == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_VORBIS)
-    {
-        if (pVorbis->usingPushMode) {
-            *pLength = 0;   /* I don't know of a good way to determine this reliably with stb_vorbis and push mode. */
-        } else {
-            *pLength = stb_vorbis_stream_length_in_samples(pVorbis->stb);
-        }
-
-        return MA_SUCCESS;
-    }
-    #else
-    {
-        /* vorbis is disabled. Should never hit this since initialization would have failed. */
-        MA_ASSERT(MA_FALSE);
-        return MA_NOT_IMPLEMENTED;
-    }
-    #endif
-}
-
-
-static ma_result ma_decoding_backend_init__stbvorbis(void* pUserData, ma_read_proc onRead, ma_seek_proc onSeek, ma_tell_proc onTell, void* pReadSeekTellUserData, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_stbvorbis* pVorbis;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pVorbis = (ma_stbvorbis*)ma_malloc(sizeof(*pVorbis), pAllocationCallbacks);
-    if (pVorbis == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_stbvorbis_init(onRead, onSeek, onTell, pReadSeekTellUserData, pConfig, pAllocationCallbacks, pVorbis);
-    if (result != MA_SUCCESS) {
-        ma_free(pVorbis, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pVorbis;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_file__stbvorbis(void* pUserData, const char* pFilePath, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_stbvorbis* pVorbis;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pVorbis = (ma_stbvorbis*)ma_malloc(sizeof(*pVorbis), pAllocationCallbacks);
-    if (pVorbis == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_stbvorbis_init_file(pFilePath, pConfig, pAllocationCallbacks, pVorbis);
-    if (result != MA_SUCCESS) {
-        ma_free(pVorbis, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pVorbis;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoding_backend_init_memory__stbvorbis(void* pUserData, const void* pData, size_t dataSize, const ma_decoding_backend_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source** ppBackend)
-{
-    ma_result result;
-    ma_stbvorbis* pVorbis;
-
-    (void)pUserData;    /* For now not using pUserData, but once we start storing the vorbis decoder state within the ma_decoder structure this will be set to the decoder so we can avoid a malloc. */
-
-    /* For now we're just allocating the decoder backend on the heap. */
-    pVorbis = (ma_stbvorbis*)ma_malloc(sizeof(*pVorbis), pAllocationCallbacks);
-    if (pVorbis == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_stbvorbis_init_memory(pData, dataSize, pConfig, pAllocationCallbacks, pVorbis);
-    if (result != MA_SUCCESS) {
-        ma_free(pVorbis, pAllocationCallbacks);
-        return result;
-    }
-
-    *ppBackend = pVorbis;
-
-    return MA_SUCCESS;
-}
-
-static void ma_decoding_backend_uninit__stbvorbis(void* pUserData, ma_data_source* pBackend, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_stbvorbis* pVorbis = (ma_stbvorbis*)pBackend;
-
-    (void)pUserData;
-
-    ma_stbvorbis_uninit(pVorbis, pAllocationCallbacks);
-    ma_free(pVorbis, pAllocationCallbacks);
-}
-
-static ma_decoding_backend_vtable g_ma_decoding_backend_vtable_stbvorbis =
-{
-    ma_decoding_backend_init__stbvorbis,
-    ma_decoding_backend_init_file__stbvorbis,
-    NULL, /* onInitFileW() */
-    ma_decoding_backend_init_memory__stbvorbis,
-    ma_decoding_backend_uninit__stbvorbis
-};
-
-static ma_result ma_decoder_init_vorbis__internal(const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_vtable__internal(&g_ma_decoding_backend_vtable_stbvorbis, NULL, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_vorbis_from_file__internal(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file__internal(&g_ma_decoding_backend_vtable_stbvorbis, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_vorbis_from_file_w__internal(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_file_w__internal(&g_ma_decoding_backend_vtable_stbvorbis, NULL, pFilePath, pConfig, pDecoder);
-}
-
-static ma_result ma_decoder_init_vorbis_from_memory__internal(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    return ma_decoder_init_from_memory__internal(&g_ma_decoding_backend_vtable_stbvorbis, NULL, pData, dataSize, pConfig, pDecoder);
-}
-#endif  /* STB_VORBIS_INCLUDE_STB_VORBIS_H */
-
-
-
-static ma_result ma_decoder__init_allocation_callbacks(const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    MA_ASSERT(pDecoder != NULL);
-
-    if (pConfig != NULL) {
-        return ma_allocation_callbacks_init_copy(&pDecoder->allocationCallbacks, &pConfig->allocationCallbacks);
-    } else {
-        pDecoder->allocationCallbacks = ma_allocation_callbacks_init_default();
-        return MA_SUCCESS;
-    }
-}
-
-static ma_result ma_decoder__data_source_on_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_decoder_read_pcm_frames((ma_decoder*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_decoder__data_source_on_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_decoder_seek_to_pcm_frame((ma_decoder*)pDataSource, frameIndex);
-}
-
-static ma_result ma_decoder__data_source_on_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    return ma_decoder_get_data_format((ma_decoder*)pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-static ma_result ma_decoder__data_source_on_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_decoder_get_cursor_in_pcm_frames((ma_decoder*)pDataSource, pCursor);
-}
-
-static ma_result ma_decoder__data_source_on_get_length(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_decoder_get_length_in_pcm_frames((ma_decoder*)pDataSource, pLength);
-}
-
-static ma_data_source_vtable g_ma_decoder_data_source_vtable =
-{
-    ma_decoder__data_source_on_read,
-    ma_decoder__data_source_on_seek,
-    ma_decoder__data_source_on_get_data_format,
-    ma_decoder__data_source_on_get_cursor,
-    ma_decoder__data_source_on_get_length,
-    NULL,   /* onSetLooping */
-    0
-};
-
-static ma_result ma_decoder__preinit(ma_decoder_read_proc onRead, ma_decoder_seek_proc onSeek, ma_decoder_tell_proc onTell, void* pUserData, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    MA_ASSERT(pConfig != NULL);
-
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDecoder);
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_decoder_data_source_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pDecoder->ds);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pDecoder->onRead    = onRead;
-    pDecoder->onSeek    = onSeek;
-    pDecoder->onTell    = onTell;
-    pDecoder->pUserData = pUserData;
-
-    result = ma_decoder__init_allocation_callbacks(pConfig, pDecoder);
-    if (result != MA_SUCCESS) {
-        ma_data_source_uninit(&pDecoder->ds);
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder__postinit(const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-
-    result = ma_decoder__init_data_converter(pDecoder, pConfig);
-
-    /* If we failed post initialization we need to uninitialize the decoder before returning to prevent a memory leak. */
-    if (result != MA_SUCCESS) {
-        ma_decoder_uninit(pDecoder);
-        return result;
-    }
-
-    return result;
-}
-
-
-static ma_result ma_decoder_init__internal(ma_decoder_read_proc onRead, ma_decoder_seek_proc onSeek, void* pUserData, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result = MA_NO_BACKEND;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pDecoder != NULL);
-
-    /* Silence some warnings in the case that we don't have any decoder backends enabled. */
-    (void)onRead;
-    (void)onSeek;
-    (void)pUserData;
-
-
-    /* If we've specified a specific encoding type, try that first. */
-    if (pConfig->encodingFormat != ma_encoding_format_unknown) {
-    #ifdef MA_HAS_WAV
-        if (pConfig->encodingFormat == ma_encoding_format_wav) {
-            result = ma_decoder_init_wav__internal(pConfig, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (pConfig->encodingFormat == ma_encoding_format_flac) {
-            result = ma_decoder_init_flac__internal(pConfig, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (pConfig->encodingFormat == ma_encoding_format_mp3) {
-            result = ma_decoder_init_mp3__internal(pConfig, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (pConfig->encodingFormat == ma_encoding_format_vorbis) {
-            result = ma_decoder_init_vorbis__internal(pConfig, pDecoder);
-        }
-    #endif
-
-        /* If we weren't able to initialize the decoder, seek back to the start to give the next attempts a clean start. */
-        if (result != MA_SUCCESS) {
-            onSeek(pDecoder, 0, ma_seek_origin_start);
-        }
-    }
-
-    if (result != MA_SUCCESS) {
-        /* Getting here means we couldn't load a specific decoding backend based on the encoding format. */
-
-        /*
-        We use trial and error to open a decoder. We prioritize custom decoders so that if they
-        implement the same encoding format they take priority over the built-in decoders.
-        */
-        if (result != MA_SUCCESS) {
-            result = ma_decoder_init_custom__internal(pConfig, pDecoder);
-            if (result != MA_SUCCESS) {
-                onSeek(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-
-        /*
-        If we get to this point and we still haven't found a decoder, and the caller has requested a
-        specific encoding format, there's no hope for it. Abort.
-        */
-        if (pConfig->encodingFormat != ma_encoding_format_unknown) {
-            return MA_NO_BACKEND;
-        }
-
-    #ifdef MA_HAS_WAV
-        if (result != MA_SUCCESS) {
-            result = ma_decoder_init_wav__internal(pConfig, pDecoder);
-            if (result != MA_SUCCESS) {
-                onSeek(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (result != MA_SUCCESS) {
-            result = ma_decoder_init_flac__internal(pConfig, pDecoder);
-            if (result != MA_SUCCESS) {
-                onSeek(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (result != MA_SUCCESS) {
-            result = ma_decoder_init_mp3__internal(pConfig, pDecoder);
-            if (result != MA_SUCCESS) {
-                onSeek(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (result != MA_SUCCESS) {
-            result = ma_decoder_init_vorbis__internal(pConfig, pDecoder);
-            if (result != MA_SUCCESS) {
-                onSeek(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    }
-
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_decoder__postinit(pConfig, pDecoder);
-}
-
-MA_API ma_result ma_decoder_init(ma_decoder_read_proc onRead, ma_decoder_seek_proc onSeek, void* pUserData, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_decoder_config config;
-    ma_result result;
-
-    config = ma_decoder_config_init_copy(pConfig);
-
-    result = ma_decoder__preinit(onRead, onSeek, NULL, pUserData, &config, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_decoder_init__internal(onRead, onSeek, pUserData, &config, pDecoder);
-}
-
-
-static ma_result ma_decoder__on_read_memory(ma_decoder* pDecoder, void* pBufferOut, size_t bytesToRead, size_t* pBytesRead)
-{
-    size_t bytesRemaining;
-
-    MA_ASSERT(pDecoder->data.memory.dataSize >= pDecoder->data.memory.currentReadPos);
-
-    if (pBytesRead != NULL) {
-        *pBytesRead = 0;
-    }
-
-    bytesRemaining = pDecoder->data.memory.dataSize - pDecoder->data.memory.currentReadPos;
-    if (bytesToRead > bytesRemaining) {
-        bytesToRead = bytesRemaining;
-    }
-
-    if (bytesRemaining == 0) {
-        return MA_AT_END;
-    }
-
-    if (bytesToRead > 0) {
-        MA_COPY_MEMORY(pBufferOut, pDecoder->data.memory.pData + pDecoder->data.memory.currentReadPos, bytesToRead);
-        pDecoder->data.memory.currentReadPos += bytesToRead;
-    }
-
-    if (pBytesRead != NULL) {
-        *pBytesRead = bytesToRead;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder__on_seek_memory(ma_decoder* pDecoder, ma_int64 byteOffset, ma_seek_origin origin)
-{
-    if (byteOffset > 0 && (ma_uint64)byteOffset > MA_SIZE_MAX) {
-        return MA_BAD_SEEK;
-    }
-
-    if (origin == ma_seek_origin_current) {
-        if (byteOffset > 0) {
-            if (pDecoder->data.memory.currentReadPos + byteOffset > pDecoder->data.memory.dataSize) {
-                byteOffset = (ma_int64)(pDecoder->data.memory.dataSize - pDecoder->data.memory.currentReadPos);  /* Trying to seek too far forward. */
-            }
-
-            pDecoder->data.memory.currentReadPos += (size_t)byteOffset;
-        } else {
-            if (pDecoder->data.memory.currentReadPos < (size_t)-byteOffset) {
-                byteOffset = -(ma_int64)pDecoder->data.memory.currentReadPos;  /* Trying to seek too far backwards. */
-            }
-
-            pDecoder->data.memory.currentReadPos -= (size_t)-byteOffset;
-        }
-    } else {
-        if (origin == ma_seek_origin_end) {
-            if (byteOffset < 0) {
-                byteOffset = -byteOffset;
-            }
-
-            if (byteOffset > (ma_int64)pDecoder->data.memory.dataSize) {
-                pDecoder->data.memory.currentReadPos = 0;   /* Trying to seek too far back. */
-            } else {
-                pDecoder->data.memory.currentReadPos = pDecoder->data.memory.dataSize - (size_t)byteOffset;
-            }
-        } else {
-            if ((size_t)byteOffset <= pDecoder->data.memory.dataSize) {
-                pDecoder->data.memory.currentReadPos = (size_t)byteOffset;
-            } else {
-                pDecoder->data.memory.currentReadPos = pDecoder->data.memory.dataSize;  /* Trying to seek too far forward. */
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder__on_tell_memory(ma_decoder* pDecoder, ma_int64* pCursor)
-{
-    MA_ASSERT(pDecoder != NULL);
-    MA_ASSERT(pCursor  != NULL);
-
-    *pCursor = (ma_int64)pDecoder->data.memory.currentReadPos;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder__preinit_memory_wrapper(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result = ma_decoder__preinit(ma_decoder__on_read_memory, ma_decoder__on_seek_memory, ma_decoder__on_tell_memory, NULL, pConfig, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pData == NULL || dataSize == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pDecoder->data.memory.pData = (const ma_uint8*)pData;
-    pDecoder->data.memory.dataSize = dataSize;
-    pDecoder->data.memory.currentReadPos = 0;
-
-    (void)pConfig;
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_init_memory(const void* pData, size_t dataSize, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoder_config config;
-
-    config = ma_decoder_config_init_copy(pConfig);
-
-    result = ma_decoder__preinit(NULL, NULL, NULL, NULL, &config, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pData == NULL || dataSize == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If the backend has support for loading from a file path we'll want to use that. If that all fails we'll fall back to the VFS path. */
-    result = MA_NO_BACKEND;
-
-    if (config.encodingFormat != ma_encoding_format_unknown) {
-    #ifdef MA_HAS_WAV
-        if (config.encodingFormat == ma_encoding_format_wav) {
-            result = ma_decoder_init_wav_from_memory__internal(pData, dataSize, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (config.encodingFormat == ma_encoding_format_flac) {
-            result = ma_decoder_init_flac_from_memory__internal(pData, dataSize, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (config.encodingFormat == ma_encoding_format_mp3) {
-            result = ma_decoder_init_mp3_from_memory__internal(pData, dataSize, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (config.encodingFormat == ma_encoding_format_vorbis) {
-            result = ma_decoder_init_vorbis_from_memory__internal(pData, dataSize, &config, pDecoder);
-        }
-    #endif
-    }
-
-    if (result != MA_SUCCESS) {
-        /* Getting here means we weren't able to initialize a decoder of a specific encoding format. */
-
-        /*
-        We use trial and error to open a decoder. We prioritize custom decoders so that if they
-        implement the same encoding format they take priority over the built-in decoders.
-        */
-        result = ma_decoder_init_custom_from_memory__internal(pData, dataSize, &config, pDecoder);
-
-        /*
-        If we get to this point and we still haven't found a decoder, and the caller has requested a
-        specific encoding format, there's no hope for it. Abort.
-        */
-        if (result != MA_SUCCESS && config.encodingFormat != ma_encoding_format_unknown) {
-            return MA_NO_BACKEND;
-        }
-
-        /* Use trial and error for stock decoders. */
-        if (result != MA_SUCCESS) {
-        #ifdef MA_HAS_WAV
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_wav_from_memory__internal(pData, dataSize, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_FLAC
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_flac_from_memory__internal(pData, dataSize, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_MP3
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_mp3_from_memory__internal(pData, dataSize, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_VORBIS
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_vorbis_from_memory__internal(pData, dataSize, &config, pDecoder);
-            }
-        #endif
-        }
-    }
-
-    /*
-    If at this point we still haven't successfully initialized the decoder it most likely means
-    the backend doesn't have an implementation for loading from a file path. We'll try using
-    miniaudio's built-in file IO for loading file.
-    */
-    if (result == MA_SUCCESS) {
-        /* Initialization was successful. Finish up. */
-        result = ma_decoder__postinit(&config, pDecoder);
-        if (result != MA_SUCCESS) {
-            /*
-            The backend was initialized successfully, but for some reason post-initialization failed. This is most likely
-            due to an out of memory error. We're going to abort with an error here and not try to recover.
-            */
-            if (pDecoder->pBackendVTable != NULL && pDecoder->pBackendVTable->onUninit != NULL) {
-                pDecoder->pBackendVTable->onUninit(pDecoder->pBackendUserData, &pDecoder->pBackend, &pDecoder->allocationCallbacks);
-            }
-
-            return result;
-        }
-    } else {
-        /* Probably no implementation for loading from a block of memory. Use miniaudio's abstraction instead. */
-        result = ma_decoder__preinit_memory_wrapper(pData, dataSize, &config, pDecoder);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        result = ma_decoder_init__internal(ma_decoder__on_read_memory, ma_decoder__on_seek_memory, NULL, &config, pDecoder);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-
-#if defined(MA_HAS_WAV)    || \
-    defined(MA_HAS_MP3)    || \
-    defined(MA_HAS_FLAC)   || \
-    defined(MA_HAS_VORBIS) || \
-    defined(MA_HAS_OPUS)
-#define MA_HAS_PATH_API
-#endif
-
-#if defined(MA_HAS_PATH_API)
-static const char* ma_path_file_name(const char* path)
-{
-    const char* fileName;
-
-    if (path == NULL) {
-        return NULL;
-    }
-
-    fileName = path;
-
-    /* We just loop through the path until we find the last slash. */
-    while (path[0] != '\0') {
-        if (path[0] == '/' || path[0] == '\\') {
-            fileName = path;
-        }
-
-        path += 1;
-    }
-
-    /* At this point the file name is sitting on a slash, so just move forward. */
-    while (fileName[0] != '\0' && (fileName[0] == '/' || fileName[0] == '\\')) {
-        fileName += 1;
-    }
-
-    return fileName;
-}
-
-static const wchar_t* ma_path_file_name_w(const wchar_t* path)
-{
-    const wchar_t* fileName;
-
-    if (path == NULL) {
-        return NULL;
-    }
-
-    fileName = path;
-
-    /* We just loop through the path until we find the last slash. */
-    while (path[0] != '\0') {
-        if (path[0] == '/' || path[0] == '\\') {
-            fileName = path;
-        }
-
-        path += 1;
-    }
-
-    /* At this point the file name is sitting on a slash, so just move forward. */
-    while (fileName[0] != '\0' && (fileName[0] == '/' || fileName[0] == '\\')) {
-        fileName += 1;
-    }
-
-    return fileName;
-}
-
-
-static const char* ma_path_extension(const char* path)
-{
-    const char* extension;
-    const char* lastOccurance;
-
-    if (path == NULL) {
-        path = "";
-    }
-
-    extension = ma_path_file_name(path);
-    lastOccurance = NULL;
-
-    /* Just find the last '.' and return. */
-    while (extension[0] != '\0') {
-        if (extension[0] == '.') {
-            extension += 1;
-            lastOccurance = extension;
-        }
-
-        extension += 1;
-    }
-
-    return (lastOccurance != NULL) ? lastOccurance : extension;
-}
-
-static const wchar_t* ma_path_extension_w(const wchar_t* path)
-{
-    const wchar_t* extension;
-    const wchar_t* lastOccurance;
-
-    if (path == NULL) {
-        path = L"";
-    }
-
-    extension = ma_path_file_name_w(path);
-    lastOccurance = NULL;
-
-    /* Just find the last '.' and return. */
-    while (extension[0] != '\0') {
-        if (extension[0] == '.') {
-            extension += 1;
-            lastOccurance = extension;
-        }
-
-        extension += 1;
-    }
-
-    return (lastOccurance != NULL) ? lastOccurance : extension;
-}
-
-
-static ma_bool32 ma_path_extension_equal(const char* path, const char* extension)
-{
-    const char* ext1;
-    const char* ext2;
-
-    if (path == NULL || extension == NULL) {
-        return MA_FALSE;
-    }
-
-    ext1 = extension;
-    ext2 = ma_path_extension(path);
-
-#if defined(_MSC_VER) || defined(__DMC__)
-    return _stricmp(ext1, ext2) == 0;
-#else
-    return strcasecmp(ext1, ext2) == 0;
-#endif
-}
-
-static ma_bool32 ma_path_extension_equal_w(const wchar_t* path, const wchar_t* extension)
-{
-    const wchar_t* ext1;
-    const wchar_t* ext2;
-
-    if (path == NULL || extension == NULL) {
-        return MA_FALSE;
-    }
-
-    ext1 = extension;
-    ext2 = ma_path_extension_w(path);
-
-#if defined(_MSC_VER) || defined(__WATCOMC__) || defined(__DMC__)
-    return _wcsicmp(ext1, ext2) == 0;
-#else
-    /*
-    I'm not aware of a wide character version of strcasecmp(). I'm therefore converting the extensions to multibyte strings and comparing those. This
-    isn't the most efficient way to do it, but it should work OK.
-    */
-    {
-        char ext1MB[4096];
-        char ext2MB[4096];
-        const wchar_t* pext1 = ext1;
-        const wchar_t* pext2 = ext2;
-        mbstate_t mbs1;
-        mbstate_t mbs2;
-
-        MA_ZERO_OBJECT(&mbs1);
-        MA_ZERO_OBJECT(&mbs2);
-
-        if (wcsrtombs(ext1MB, &pext1, sizeof(ext1MB), &mbs1) == (size_t)-1) {
-            return MA_FALSE;
-        }
-        if (wcsrtombs(ext2MB, &pext2, sizeof(ext2MB), &mbs2) == (size_t)-1) {
-            return MA_FALSE;
-        }
-
-        return strcasecmp(ext1MB, ext2MB) == 0;
-    }
-#endif
-}
-#endif  /* MA_HAS_PATH_API */
-
-
-
-static ma_result ma_decoder__on_read_vfs(ma_decoder* pDecoder, void* pBufferOut, size_t bytesToRead, size_t* pBytesRead)
-{
-    MA_ASSERT(pDecoder   != NULL);
-    MA_ASSERT(pBufferOut != NULL);
-
-    return ma_vfs_or_default_read(pDecoder->data.vfs.pVFS, pDecoder->data.vfs.file, pBufferOut, bytesToRead, pBytesRead);
-}
-
-static ma_result ma_decoder__on_seek_vfs(ma_decoder* pDecoder, ma_int64 offset, ma_seek_origin origin)
-{
-    MA_ASSERT(pDecoder != NULL);
-
-    return ma_vfs_or_default_seek(pDecoder->data.vfs.pVFS, pDecoder->data.vfs.file, offset, origin);
-}
-
-static ma_result ma_decoder__on_tell_vfs(ma_decoder* pDecoder, ma_int64* pCursor)
-{
-    MA_ASSERT(pDecoder != NULL);
-
-    return ma_vfs_or_default_tell(pDecoder->data.vfs.pVFS, pDecoder->data.vfs.file, pCursor);
-}
-
-static ma_result ma_decoder__preinit_vfs(ma_vfs* pVFS, const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_vfs_file file;
-
-    result = ma_decoder__preinit(ma_decoder__on_read_vfs, ma_decoder__on_seek_vfs, ma_decoder__on_tell_vfs, NULL, pConfig, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pFilePath == NULL || pFilePath[0] == '\0') {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_vfs_or_default_open(pVFS, pFilePath, MA_OPEN_MODE_READ, &file);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pDecoder->data.vfs.pVFS = pVFS;
-    pDecoder->data.vfs.file = file;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_init_vfs(ma_vfs* pVFS, const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoder_config config;
-
-    config = ma_decoder_config_init_copy(pConfig);
-    result = ma_decoder__preinit_vfs(pVFS, pFilePath, &config, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = MA_NO_BACKEND;
-
-    if (config.encodingFormat != ma_encoding_format_unknown) {
-    #ifdef MA_HAS_WAV
-        if (config.encodingFormat == ma_encoding_format_wav) {
-            result = ma_decoder_init_wav__internal(&config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (config.encodingFormat == ma_encoding_format_flac) {
-            result = ma_decoder_init_flac__internal(&config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (config.encodingFormat == ma_encoding_format_mp3) {
-            result = ma_decoder_init_mp3__internal(&config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (config.encodingFormat == ma_encoding_format_vorbis) {
-            result = ma_decoder_init_vorbis__internal(&config, pDecoder);
-        }
-    #endif
-
-        /* Make sure we seek back to the start if we didn't initialize a decoder successfully so the next attempts have a fresh start. */
-        if (result != MA_SUCCESS) {
-            ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-        }
-    }
-
-    if (result != MA_SUCCESS) {
-        /* Getting here means we weren't able to initialize a decoder of a specific encoding format. */
-
-        /*
-        We use trial and error to open a decoder. We prioritize custom decoders so that if they
-        implement the same encoding format they take priority over the built-in decoders.
-        */
-        if (result != MA_SUCCESS) {
-            result = ma_decoder_init_custom__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-
-        /*
-        If we get to this point and we still haven't found a decoder, and the caller has requested a
-        specific encoding format, there's no hope for it. Abort.
-        */
-        if (config.encodingFormat != ma_encoding_format_unknown) {
-            return MA_NO_BACKEND;
-        }
-
-    #ifdef MA_HAS_WAV
-        if (result != MA_SUCCESS && ma_path_extension_equal(pFilePath, "wav")) {
-            result = ma_decoder_init_wav__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (result != MA_SUCCESS && ma_path_extension_equal(pFilePath, "flac")) {
-            result = ma_decoder_init_flac__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (result != MA_SUCCESS && ma_path_extension_equal(pFilePath, "mp3")) {
-            result = ma_decoder_init_mp3__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    }
-
-    /* If we still haven't got a result just use trial and error. Otherwise we can finish up. */
-    if (result != MA_SUCCESS) {
-        result = ma_decoder_init__internal(ma_decoder__on_read_vfs, ma_decoder__on_seek_vfs, NULL, &config, pDecoder);
-    } else {
-        result = ma_decoder__postinit(&config, pDecoder);
-    }
-
-    if (result != MA_SUCCESS) {
-        if (pDecoder->data.vfs.file != NULL) {   /* <-- Will be reset to NULL if ma_decoder_uninit() is called in one of the steps above which allows us to avoid a double close of the file. */
-            ma_vfs_or_default_close(pVFS, pDecoder->data.vfs.file);
-        }
-
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_decoder__preinit_vfs_w(ma_vfs* pVFS, const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_vfs_file file;
-
-    result = ma_decoder__preinit(ma_decoder__on_read_vfs, ma_decoder__on_seek_vfs, ma_decoder__on_tell_vfs, NULL, pConfig, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pFilePath == NULL || pFilePath[0] == '\0') {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_vfs_or_default_open_w(pVFS, pFilePath, MA_OPEN_MODE_READ, &file);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pDecoder->data.vfs.pVFS = pVFS;
-    pDecoder->data.vfs.file = file;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_init_vfs_w(ma_vfs* pVFS, const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoder_config config;
-
-    config = ma_decoder_config_init_copy(pConfig);
-    result = ma_decoder__preinit_vfs_w(pVFS, pFilePath, &config, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = MA_NO_BACKEND;
-
-    if (config.encodingFormat != ma_encoding_format_unknown) {
-    #ifdef MA_HAS_WAV
-        if (config.encodingFormat == ma_encoding_format_wav) {
-            result = ma_decoder_init_wav__internal(&config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (config.encodingFormat == ma_encoding_format_flac) {
-            result = ma_decoder_init_flac__internal(&config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (config.encodingFormat == ma_encoding_format_mp3) {
-            result = ma_decoder_init_mp3__internal(&config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (config.encodingFormat == ma_encoding_format_vorbis) {
-            result = ma_decoder_init_vorbis__internal(&config, pDecoder);
-        }
-    #endif
-
-        /* Make sure we seek back to the start if we didn't initialize a decoder successfully so the next attempts have a fresh start. */
-        if (result != MA_SUCCESS) {
-            ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-        }
-    }
-
-    if (result != MA_SUCCESS) {
-        /* Getting here means we weren't able to initialize a decoder of a specific encoding format. */
-
-        /*
-        We use trial and error to open a decoder. We prioritize custom decoders so that if they
-        implement the same encoding format they take priority over the built-in decoders.
-        */
-        if (result != MA_SUCCESS) {
-            result = ma_decoder_init_custom__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-
-        /*
-        If we get to this point and we still haven't found a decoder, and the caller has requested a
-        specific encoding format, there's no hope for it. Abort.
-        */
-        if (config.encodingFormat != ma_encoding_format_unknown) {
-            return MA_NO_BACKEND;
-        }
-
-    #ifdef MA_HAS_WAV
-        if (result != MA_SUCCESS && ma_path_extension_equal_w(pFilePath, L"wav")) {
-            result = ma_decoder_init_wav__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (result != MA_SUCCESS && ma_path_extension_equal_w(pFilePath, L"flac")) {
-            result = ma_decoder_init_flac__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (result != MA_SUCCESS && ma_path_extension_equal_w(pFilePath, L"mp3")) {
-            result = ma_decoder_init_mp3__internal(&config, pDecoder);
-            if (result != MA_SUCCESS) {
-                ma_decoder__on_seek_vfs(pDecoder, 0, ma_seek_origin_start);
-            }
-        }
-    #endif
-    }
-
-    /* If we still haven't got a result just use trial and error. Otherwise we can finish up. */
-    if (result != MA_SUCCESS) {
-        result = ma_decoder_init__internal(ma_decoder__on_read_vfs, ma_decoder__on_seek_vfs, NULL, &config, pDecoder);
-    } else {
-        result = ma_decoder__postinit(&config, pDecoder);
-    }
-
-    if (result != MA_SUCCESS) {
-        ma_vfs_or_default_close(pVFS, pDecoder->data.vfs.file);
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_decoder__preinit_file(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-
-    result = ma_decoder__preinit(NULL, NULL, NULL, NULL, pConfig, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pFilePath == NULL || pFilePath[0] == '\0') {
-        return MA_INVALID_ARGS;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_init_file(const char* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoder_config config;
-
-    config = ma_decoder_config_init_copy(pConfig);
-    result = ma_decoder__preinit_file(pFilePath, &config, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* If the backend has support for loading from a file path we'll want to use that. If that all fails we'll fall back to the VFS path. */
-    result = MA_NO_BACKEND;
-
-    if (config.encodingFormat != ma_encoding_format_unknown) {
-    #ifdef MA_HAS_WAV
-        if (config.encodingFormat == ma_encoding_format_wav) {
-            result = ma_decoder_init_wav_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (config.encodingFormat == ma_encoding_format_flac) {
-            result = ma_decoder_init_flac_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (config.encodingFormat == ma_encoding_format_mp3) {
-            result = ma_decoder_init_mp3_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (config.encodingFormat == ma_encoding_format_vorbis) {
-            result = ma_decoder_init_vorbis_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    }
-
-    if (result != MA_SUCCESS) {
-        /* Getting here means we weren't able to initialize a decoder of a specific encoding format. */
-
-        /*
-        We use trial and error to open a decoder. We prioritize custom decoders so that if they
-        implement the same encoding format they take priority over the built-in decoders.
-        */
-        result = ma_decoder_init_custom_from_file__internal(pFilePath, &config, pDecoder);
-
-        /*
-        If we get to this point and we still haven't found a decoder, and the caller has requested a
-        specific encoding format, there's no hope for it. Abort.
-        */
-        if (result != MA_SUCCESS && config.encodingFormat != ma_encoding_format_unknown) {
-            return MA_NO_BACKEND;
-        }
-
-        /* First try loading based on the file extension so we don't waste time opening and closing files. */
-    #ifdef MA_HAS_WAV
-        if (result != MA_SUCCESS && ma_path_extension_equal(pFilePath, "wav")) {
-            result = ma_decoder_init_wav_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (result != MA_SUCCESS && ma_path_extension_equal(pFilePath, "flac")) {
-            result = ma_decoder_init_flac_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (result != MA_SUCCESS && ma_path_extension_equal(pFilePath, "mp3")) {
-            result = ma_decoder_init_mp3_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (result != MA_SUCCESS && ma_path_extension_equal(pFilePath, "ogg")) {
-            result = ma_decoder_init_vorbis_from_file__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-
-        /*
-        If we still haven't got a result just use trial and error. Custom decoders have already been attempted, so here we
-        need only iterate over our stock decoders.
-        */
-        if (result != MA_SUCCESS) {
-        #ifdef MA_HAS_WAV
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_wav_from_file__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_FLAC
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_flac_from_file__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_MP3
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_mp3_from_file__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_VORBIS
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_vorbis_from_file__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        }
-    }
-
-    /*
-    If at this point we still haven't successfully initialized the decoder it most likely means
-    the backend doesn't have an implementation for loading from a file path. We'll try using
-    miniaudio's built-in file IO for loading file.
-    */
-    if (result == MA_SUCCESS) {
-        /* Initialization was successful. Finish up. */
-        result = ma_decoder__postinit(&config, pDecoder);
-        if (result != MA_SUCCESS) {
-            /*
-            The backend was initialized successfully, but for some reason post-initialization failed. This is most likely
-            due to an out of memory error. We're going to abort with an error here and not try to recover.
-            */
-            if (pDecoder->pBackendVTable != NULL && pDecoder->pBackendVTable->onUninit != NULL) {
-                pDecoder->pBackendVTable->onUninit(pDecoder->pBackendUserData, &pDecoder->pBackend, &pDecoder->allocationCallbacks);
-            }
-
-            return result;
-        }
-    } else {
-        /* Probably no implementation for loading from a file path. Use miniaudio's file IO instead. */
-        result = ma_decoder_init_vfs(NULL, pFilePath, pConfig, pDecoder);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_decoder__preinit_file_w(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-
-    result = ma_decoder__preinit(NULL, NULL, NULL, NULL, pConfig, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pFilePath == NULL || pFilePath[0] == '\0') {
-        return MA_INVALID_ARGS;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_init_file_w(const wchar_t* pFilePath, const ma_decoder_config* pConfig, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoder_config config;
-
-    config = ma_decoder_config_init_copy(pConfig);
-    result = ma_decoder__preinit_file_w(pFilePath, &config, pDecoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* If the backend has support for loading from a file path we'll want to use that. If that all fails we'll fall back to the VFS path. */
-    result = MA_NO_BACKEND;
-
-    if (config.encodingFormat != ma_encoding_format_unknown) {
-    #ifdef MA_HAS_WAV
-        if (config.encodingFormat == ma_encoding_format_wav) {
-            result = ma_decoder_init_wav_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (config.encodingFormat == ma_encoding_format_flac) {
-            result = ma_decoder_init_flac_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (config.encodingFormat == ma_encoding_format_mp3) {
-            result = ma_decoder_init_mp3_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (config.encodingFormat == ma_encoding_format_vorbis) {
-            result = ma_decoder_init_vorbis_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    }
-
-    if (result != MA_SUCCESS) {
-        /* Getting here means we weren't able to initialize a decoder of a specific encoding format. */
-
-        /*
-        We use trial and error to open a decoder. We prioritize custom decoders so that if they
-        implement the same encoding format they take priority over the built-in decoders.
-        */
-        result = ma_decoder_init_custom_from_file_w__internal(pFilePath, &config, pDecoder);
-
-        /*
-        If we get to this point and we still haven't found a decoder, and the caller has requested a
-        specific encoding format, there's no hope for it. Abort.
-        */
-        if (result != MA_SUCCESS && config.encodingFormat != ma_encoding_format_unknown) {
-            return MA_NO_BACKEND;
-        }
-
-        /* First try loading based on the file extension so we don't waste time opening and closing files. */
-    #ifdef MA_HAS_WAV
-        if (result != MA_SUCCESS && ma_path_extension_equal_w(pFilePath, L"wav")) {
-            result = ma_decoder_init_wav_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_FLAC
-        if (result != MA_SUCCESS && ma_path_extension_equal_w(pFilePath, L"flac")) {
-            result = ma_decoder_init_flac_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_MP3
-        if (result != MA_SUCCESS && ma_path_extension_equal_w(pFilePath, L"mp3")) {
-            result = ma_decoder_init_mp3_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-    #ifdef MA_HAS_VORBIS
-        if (result != MA_SUCCESS && ma_path_extension_equal_w(pFilePath, L"ogg")) {
-            result = ma_decoder_init_vorbis_from_file_w__internal(pFilePath, &config, pDecoder);
-        }
-    #endif
-
-        /*
-        If we still haven't got a result just use trial and error. Custom decoders have already been attempted, so here we
-        need only iterate over our stock decoders.
-        */
-        if (result != MA_SUCCESS) {
-        #ifdef MA_HAS_WAV
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_wav_from_file_w__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_FLAC
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_flac_from_file_w__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_MP3
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_mp3_from_file_w__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        #ifdef MA_HAS_VORBIS
-            if (result != MA_SUCCESS) {
-                result = ma_decoder_init_vorbis_from_file_w__internal(pFilePath, &config, pDecoder);
-            }
-        #endif
-        }
-    }
-
-    /*
-    If at this point we still haven't successfully initialized the decoder it most likely means
-    the backend doesn't have an implementation for loading from a file path. We'll try using
-    miniaudio's built-in file IO for loading file.
-    */
-    if (result == MA_SUCCESS) {
-        /* Initialization was successful. Finish up. */
-        result = ma_decoder__postinit(&config, pDecoder);
-        if (result != MA_SUCCESS) {
-            /*
-            The backend was initialized successfully, but for some reason post-initialization failed. This is most likely
-            due to an out of memory error. We're going to abort with an error here and not try to recover.
-            */
-            if (pDecoder->pBackendVTable != NULL && pDecoder->pBackendVTable->onUninit != NULL) {
-                pDecoder->pBackendVTable->onUninit(pDecoder->pBackendUserData, &pDecoder->pBackend, &pDecoder->allocationCallbacks);
-            }
-
-            return result;
-        }
-    } else {
-        /* Probably no implementation for loading from a file path. Use miniaudio's file IO instead. */
-        result = ma_decoder_init_vfs_w(NULL, pFilePath, pConfig, pDecoder);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_uninit(ma_decoder* pDecoder)
-{
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDecoder->pBackend != NULL) {
-        if (pDecoder->pBackendVTable != NULL && pDecoder->pBackendVTable->onUninit != NULL) {
-            pDecoder->pBackendVTable->onUninit(pDecoder->pBackendUserData, pDecoder->pBackend, &pDecoder->allocationCallbacks);
-        }
-    }
-
-    if (pDecoder->onRead == ma_decoder__on_read_vfs) {
-        ma_vfs_or_default_close(pDecoder->data.vfs.pVFS, pDecoder->data.vfs.file);
-        pDecoder->data.vfs.file = NULL;
-    }
-
-    ma_data_converter_uninit(&pDecoder->converter, &pDecoder->allocationCallbacks);
-    ma_data_source_uninit(&pDecoder->ds);
-
-    if (pDecoder->pInputCache != NULL) {
-        ma_free(pDecoder->pInputCache, &pDecoder->allocationCallbacks);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_read_pcm_frames(ma_decoder* pDecoder, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 totalFramesReadOut;
-    void* pRunningFramesOut;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;   /* Safety. */
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDecoder->pBackend == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* Fast path. */
-    if (pDecoder->converter.isPassthrough) {
-        result = ma_data_source_read_pcm_frames(pDecoder->pBackend, pFramesOut, frameCount, &totalFramesReadOut);
-    } else {
-        /*
-        Getting here means we need to do data conversion. If we're seeking forward and are _not_ doing resampling we can run this in a fast path. If we're doing resampling we
-        need to run through each sample because we need to ensure it's internal cache is updated.
-        */
-        if (pFramesOut == NULL && pDecoder->converter.hasResampler == MA_FALSE) {
-            result = ma_data_source_read_pcm_frames(pDecoder->pBackend, NULL, frameCount, &totalFramesReadOut);
-        } else {
-            /* Slow path. Need to run everything through the data converter. */
-            ma_format internalFormat;
-            ma_uint32 internalChannels;
-
-            totalFramesReadOut = 0;
-            pRunningFramesOut  = pFramesOut;
-
-            result = ma_data_source_get_data_format(pDecoder->pBackend, &internalFormat, &internalChannels, NULL, NULL, 0);
-            if (result != MA_SUCCESS) {
-                return result;   /* Failed to retrieve the internal format and channel count. */
-            }
-
-            /*
-            We run a different path depending on whether or not we are using a heap-allocated
-            intermediary buffer or not. If the data converter does not support the calculation of
-            the required number of input frames, we'll use the heap-allocated path. Otherwise we'll
-            use the stack-allocated path.
-            */
-            if (pDecoder->pInputCache != NULL) {
-                /* We don't have a way of determining the required number of input frames, so need to persistently store input data in a cache. */
-                while (totalFramesReadOut < frameCount) {
-                    ma_uint64 framesToReadThisIterationIn;
-                    ma_uint64 framesToReadThisIterationOut;
-
-                    /* If there's any data available in the cache, that needs to get processed first. */
-                    if (pDecoder->inputCacheRemaining > 0) {
-                        framesToReadThisIterationOut = (frameCount - totalFramesReadOut);
-                        framesToReadThisIterationIn  = framesToReadThisIterationOut;
-                        if (framesToReadThisIterationIn > pDecoder->inputCacheRemaining) {
-                            framesToReadThisIterationIn = pDecoder->inputCacheRemaining;
-                        }
-
-                        result = ma_data_converter_process_pcm_frames(&pDecoder->converter, ma_offset_pcm_frames_ptr(pDecoder->pInputCache, pDecoder->inputCacheConsumed, internalFormat, internalChannels), &framesToReadThisIterationIn, pRunningFramesOut, &framesToReadThisIterationOut);
-                        if (result != MA_SUCCESS) {
-                            break;
-                        }
-
-                        pDecoder->inputCacheConsumed  += framesToReadThisIterationIn;
-                        pDecoder->inputCacheRemaining -= framesToReadThisIterationIn;
-
-                        totalFramesReadOut += framesToReadThisIterationOut;
-
-                        if (pRunningFramesOut != NULL) {
-                            pRunningFramesOut = ma_offset_ptr(pRunningFramesOut, framesToReadThisIterationOut * ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels));
-                        }
-
-                        if (framesToReadThisIterationIn == 0 && framesToReadThisIterationOut == 0) {
-                            break;  /* We're done. */
-                        }
-                    }
-
-                    /* Getting here means there's no data in the cache and we need to fill it up from the data source. */
-                    if (pDecoder->inputCacheRemaining == 0) {
-                        pDecoder->inputCacheConsumed = 0;
-
-                        result = ma_data_source_read_pcm_frames(pDecoder->pBackend, pDecoder->pInputCache, pDecoder->inputCacheCap, &pDecoder->inputCacheRemaining);
-                        if (result != MA_SUCCESS) {
-                            break;
-                        }
-                    }
-                }
-            } else {
-                /* We have a way of determining the required number of input frames so just use the stack. */
-                while (totalFramesReadOut < frameCount) {
-                    ma_uint8 pIntermediaryBuffer[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];  /* In internal format. */
-                    ma_uint64 intermediaryBufferCap = sizeof(pIntermediaryBuffer) / ma_get_bytes_per_frame(internalFormat, internalChannels);
-                    ma_uint64 framesToReadThisIterationIn;
-                    ma_uint64 framesReadThisIterationIn;
-                    ma_uint64 framesToReadThisIterationOut;
-                    ma_uint64 framesReadThisIterationOut;
-                    ma_uint64 requiredInputFrameCount;
-
-                    framesToReadThisIterationOut = (frameCount - totalFramesReadOut);
-                    framesToReadThisIterationIn = framesToReadThisIterationOut;
-                    if (framesToReadThisIterationIn > intermediaryBufferCap) {
-                        framesToReadThisIterationIn = intermediaryBufferCap;
-                    }
-
-                    ma_data_converter_get_required_input_frame_count(&pDecoder->converter, framesToReadThisIterationOut, &requiredInputFrameCount);
-                    if (framesToReadThisIterationIn > requiredInputFrameCount) {
-                        framesToReadThisIterationIn = requiredInputFrameCount;
-                    }
-
-                    if (requiredInputFrameCount > 0) {
-                        result = ma_data_source_read_pcm_frames(pDecoder->pBackend, pIntermediaryBuffer, framesToReadThisIterationIn, &framesReadThisIterationIn);
-                    } else {
-                        framesReadThisIterationIn = 0;
-                    }
-
-                    /*
-                    At this point we have our decoded data in input format and now we need to convert to output format. Note that even if we didn't read any
-                    input frames, we still want to try processing frames because there may some output frames generated from cached input data.
-                    */
-                    framesReadThisIterationOut = framesToReadThisIterationOut;
-                    result = ma_data_converter_process_pcm_frames(&pDecoder->converter, pIntermediaryBuffer, &framesReadThisIterationIn, pRunningFramesOut, &framesReadThisIterationOut);
-                    if (result != MA_SUCCESS) {
-                        break;
-                    }
-
-                    totalFramesReadOut += framesReadThisIterationOut;
-
-                    if (pRunningFramesOut != NULL) {
-                        pRunningFramesOut = ma_offset_ptr(pRunningFramesOut, framesReadThisIterationOut * ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels));
-                    }
-
-                    if (framesReadThisIterationIn == 0 && framesReadThisIterationOut == 0) {
-                        break;  /* We're done. */
-                    }
-                }
-            }
-        }
-    }
-
-    pDecoder->readPointerInPCMFrames += totalFramesReadOut;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalFramesReadOut;
-    }
-
-    if (result == MA_SUCCESS && totalFramesReadOut == 0) {
-        result =  MA_AT_END;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_decoder_seek_to_pcm_frame(ma_decoder* pDecoder, ma_uint64 frameIndex)
-{
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDecoder->pBackend != NULL) {
-        ma_result result;
-        ma_uint64 internalFrameIndex;
-        ma_uint32 internalSampleRate;
-        ma_uint64 currentFrameIndex;
-
-        result = ma_data_source_get_data_format(pDecoder->pBackend, NULL, NULL, &internalSampleRate, NULL, 0);
-        if (result != MA_SUCCESS) {
-            return result;  /* Failed to retrieve the internal sample rate. */
-        }
-
-        if (internalSampleRate == pDecoder->outputSampleRate) {
-            internalFrameIndex = frameIndex;
-        } else {
-            internalFrameIndex = ma_calculate_frame_count_after_resampling(internalSampleRate, pDecoder->outputSampleRate, frameIndex);
-        }
-
-        /* Only seek if we're requesting a different frame to what we're currently sitting on. */
-        ma_data_source_get_cursor_in_pcm_frames(pDecoder->pBackend, &currentFrameIndex);
-        if (currentFrameIndex != internalFrameIndex) {
-            result = ma_data_source_seek_to_pcm_frame(pDecoder->pBackend, internalFrameIndex);
-            if (result == MA_SUCCESS) {
-                pDecoder->readPointerInPCMFrames = frameIndex;
-            }
-
-            /* Reset the data converter so that any cached data in the resampler is cleared. */
-            ma_data_converter_reset(&pDecoder->converter);
-        }
-
-        return result;
-    }
-
-    /* Should never get here, but if we do it means onSeekToPCMFrame was not set by the backend. */
-    return MA_INVALID_ARGS;
-}
-
-MA_API ma_result ma_decoder_get_data_format(ma_decoder* pDecoder, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFormat != NULL) {
-        *pFormat = pDecoder->outputFormat;
-    }
-
-    if (pChannels != NULL) {
-        *pChannels = pDecoder->outputChannels;
-    }
-
-    if (pSampleRate != NULL) {
-        *pSampleRate = pDecoder->outputSampleRate;
-    }
-
-    if (pChannelMap != NULL) {
-        ma_data_converter_get_output_channel_map(&pDecoder->converter, pChannelMap, channelMapCap);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_get_cursor_in_pcm_frames(ma_decoder* pDecoder, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = pDecoder->readPointerInPCMFrames;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decoder_get_length_in_pcm_frames(ma_decoder* pDecoder, ma_uint64* pLength)
-{
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;
-
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDecoder->pBackend != NULL) {
-        ma_result result;
-        ma_uint64 internalLengthInPCMFrames;
-        ma_uint32 internalSampleRate;
-
-        result = ma_data_source_get_length_in_pcm_frames(pDecoder->pBackend, &internalLengthInPCMFrames);
-        if (result != MA_SUCCESS) {
-            return result;  /* Failed to retrieve the internal length. */
-        }
-
-        result = ma_data_source_get_data_format(pDecoder->pBackend, NULL, NULL, &internalSampleRate, NULL, 0);
-        if (result != MA_SUCCESS) {
-            return result;   /* Failed to retrieve the internal sample rate. */
-        }
-
-        if (internalSampleRate == pDecoder->outputSampleRate) {
-            *pLength = internalLengthInPCMFrames;
-        } else {
-            *pLength = ma_calculate_frame_count_after_resampling(pDecoder->outputSampleRate, internalSampleRate, internalLengthInPCMFrames);
-        }
-
-        return MA_SUCCESS;
-    } else {
-        return MA_NO_BACKEND;
-    }
-}
-
-MA_API ma_result ma_decoder_get_available_frames(ma_decoder* pDecoder, ma_uint64* pAvailableFrames)
-{
-    ma_result result;
-    ma_uint64 totalFrameCount;
-
-    if (pAvailableFrames == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pAvailableFrames = 0;
-
-    if (pDecoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_decoder_get_length_in_pcm_frames(pDecoder, &totalFrameCount);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (totalFrameCount <= pDecoder->readPointerInPCMFrames) {
-        *pAvailableFrames = 0;
-    } else {
-        *pAvailableFrames = totalFrameCount - pDecoder->readPointerInPCMFrames;
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_decoder__full_decode_and_uninit(ma_decoder* pDecoder, ma_decoder_config* pConfigOut, ma_uint64* pFrameCountOut, void** ppPCMFramesOut)
-{
-    ma_result result;
-    ma_uint64 totalFrameCount;
-    ma_uint64 bpf;
-    ma_uint64 dataCapInFrames;
-    void* pPCMFramesOut;
-
-    MA_ASSERT(pDecoder != NULL);
-
-    totalFrameCount = 0;
-    bpf = ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
-
-    /* The frame count is unknown until we try reading. Thus, we just run in a loop. */
-    dataCapInFrames = 0;
-    pPCMFramesOut = NULL;
-    for (;;) {
-        ma_uint64 frameCountToTryReading;
-        ma_uint64 framesJustRead;
-
-        /* Make room if there's not enough. */
-        if (totalFrameCount == dataCapInFrames) {
-            void* pNewPCMFramesOut;
-            ma_uint64 newDataCapInFrames = dataCapInFrames*2;
-            if (newDataCapInFrames == 0) {
-                newDataCapInFrames = 4096;
-            }
-
-            if ((newDataCapInFrames * bpf) > MA_SIZE_MAX) {
-                ma_free(pPCMFramesOut, &pDecoder->allocationCallbacks);
-                return MA_TOO_BIG;
-            }
-
-            pNewPCMFramesOut = (void*)ma_realloc(pPCMFramesOut, (size_t)(newDataCapInFrames * bpf), &pDecoder->allocationCallbacks);
-            if (pNewPCMFramesOut == NULL) {
-                ma_free(pPCMFramesOut, &pDecoder->allocationCallbacks);
-                return MA_OUT_OF_MEMORY;
-            }
-
-            dataCapInFrames = newDataCapInFrames;
-            pPCMFramesOut = pNewPCMFramesOut;
-        }
-
-        frameCountToTryReading = dataCapInFrames - totalFrameCount;
-        MA_ASSERT(frameCountToTryReading > 0);
-
-        result = ma_decoder_read_pcm_frames(pDecoder, (ma_uint8*)pPCMFramesOut + (totalFrameCount * bpf), frameCountToTryReading, &framesJustRead);
-        totalFrameCount += framesJustRead;
-
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        if (framesJustRead < frameCountToTryReading) {
-            break;
-        }
-    }
-
-
-    if (pConfigOut != NULL) {
-        pConfigOut->format     = pDecoder->outputFormat;
-        pConfigOut->channels   = pDecoder->outputChannels;
-        pConfigOut->sampleRate = pDecoder->outputSampleRate;
-    }
-
-    if (ppPCMFramesOut != NULL) {
-        *ppPCMFramesOut = pPCMFramesOut;
-    } else {
-        ma_free(pPCMFramesOut, &pDecoder->allocationCallbacks);
-    }
-
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = totalFrameCount;
-    }
-
-    ma_decoder_uninit(pDecoder);
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_decode_from_vfs(ma_vfs* pVFS, const char* pFilePath, ma_decoder_config* pConfig, ma_uint64* pFrameCountOut, void** ppPCMFramesOut)
-{
-    ma_result result;
-    ma_decoder_config config;
-    ma_decoder decoder;
-
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = 0;
-    }
-    if (ppPCMFramesOut != NULL) {
-        *ppPCMFramesOut = NULL;
-    }
-
-    config = ma_decoder_config_init_copy(pConfig);
-
-    result = ma_decoder_init_vfs(pVFS, pFilePath, &config, &decoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_decoder__full_decode_and_uninit(&decoder, pConfig, pFrameCountOut, ppPCMFramesOut);
-
-    return result;
-}
-
-MA_API ma_result ma_decode_file(const char* pFilePath, ma_decoder_config* pConfig, ma_uint64* pFrameCountOut, void** ppPCMFramesOut)
-{
-    return ma_decode_from_vfs(NULL, pFilePath, pConfig, pFrameCountOut, ppPCMFramesOut);
-}
-
-MA_API ma_result ma_decode_memory(const void* pData, size_t dataSize, ma_decoder_config* pConfig, ma_uint64* pFrameCountOut, void** ppPCMFramesOut)
-{
-    ma_decoder_config config;
-    ma_decoder decoder;
-    ma_result result;
-
-    if (pFrameCountOut != NULL) {
-        *pFrameCountOut = 0;
-    }
-    if (ppPCMFramesOut != NULL) {
-        *ppPCMFramesOut = NULL;
-    }
-
-    if (pData == NULL || dataSize == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    config = ma_decoder_config_init_copy(pConfig);
-
-    result = ma_decoder_init_memory(pData, dataSize, &config, &decoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_decoder__full_decode_and_uninit(&decoder, pConfig, pFrameCountOut, ppPCMFramesOut);
-}
-#endif  /* MA_NO_DECODING */
-
-
-#ifndef MA_NO_ENCODING
-
-#if defined(MA_HAS_WAV)
-static size_t ma_encoder__internal_on_write_wav(void* pUserData, const void* pData, size_t bytesToWrite)
-{
-    ma_encoder* pEncoder = (ma_encoder*)pUserData;
-    size_t bytesWritten = 0;
-
-    MA_ASSERT(pEncoder != NULL);
-
-    pEncoder->onWrite(pEncoder, pData, bytesToWrite, &bytesWritten);
-    return bytesWritten;
-}
-
-static ma_bool32 ma_encoder__internal_on_seek_wav(void* pUserData, int offset, ma_dr_wav_seek_origin origin)
-{
-    ma_encoder* pEncoder = (ma_encoder*)pUserData;
-    ma_result result;
-
-    MA_ASSERT(pEncoder != NULL);
-
-    result = pEncoder->onSeek(pEncoder, offset, (origin == ma_dr_wav_seek_origin_start) ? ma_seek_origin_start : ma_seek_origin_current);
-    if (result != MA_SUCCESS) {
-        return MA_FALSE;
-    } else {
-        return MA_TRUE;
-    }
-}
-
-static ma_result ma_encoder__on_init_wav(ma_encoder* pEncoder)
-{
-    ma_dr_wav_data_format wavFormat;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_dr_wav* pWav;
-
-    MA_ASSERT(pEncoder != NULL);
-
-    pWav = (ma_dr_wav*)ma_malloc(sizeof(*pWav), &pEncoder->config.allocationCallbacks);
-    if (pWav == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    wavFormat.container     = ma_dr_wav_container_riff;
-    wavFormat.channels      = pEncoder->config.channels;
-    wavFormat.sampleRate    = pEncoder->config.sampleRate;
-    wavFormat.bitsPerSample = ma_get_bytes_per_sample(pEncoder->config.format) * 8;
-    if (pEncoder->config.format == ma_format_f32) {
-        wavFormat.format    = MA_DR_WAVE_FORMAT_IEEE_FLOAT;
-    } else {
-        wavFormat.format    = MA_DR_WAVE_FORMAT_PCM;
-    }
-
-    allocationCallbacks.pUserData = pEncoder->config.allocationCallbacks.pUserData;
-    allocationCallbacks.onMalloc  = pEncoder->config.allocationCallbacks.onMalloc;
-    allocationCallbacks.onRealloc = pEncoder->config.allocationCallbacks.onRealloc;
-    allocationCallbacks.onFree    = pEncoder->config.allocationCallbacks.onFree;
-
-    if (!ma_dr_wav_init_write(pWav, &wavFormat, ma_encoder__internal_on_write_wav, ma_encoder__internal_on_seek_wav, pEncoder, &allocationCallbacks)) {
-        return MA_ERROR;
-    }
-
-    pEncoder->pInternalEncoder = pWav;
-
-    return MA_SUCCESS;
-}
-
-static void ma_encoder__on_uninit_wav(ma_encoder* pEncoder)
-{
-    ma_dr_wav* pWav;
-
-    MA_ASSERT(pEncoder != NULL);
-
-    pWav = (ma_dr_wav*)pEncoder->pInternalEncoder;
-    MA_ASSERT(pWav != NULL);
-
-    ma_dr_wav_uninit(pWav);
-    ma_free(pWav, &pEncoder->config.allocationCallbacks);
-}
-
-static ma_result ma_encoder__on_write_pcm_frames_wav(ma_encoder* pEncoder, const void* pFramesIn, ma_uint64 frameCount, ma_uint64* pFramesWritten)
-{
-    ma_dr_wav* pWav;
-    ma_uint64 framesWritten;
-
-    MA_ASSERT(pEncoder != NULL);
-
-    pWav = (ma_dr_wav*)pEncoder->pInternalEncoder;
-    MA_ASSERT(pWav != NULL);
-
-    framesWritten = ma_dr_wav_write_pcm_frames(pWav, frameCount, pFramesIn);
-
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = framesWritten;
-    }
-
-    return MA_SUCCESS;
-}
-#endif
-
-MA_API ma_encoder_config ma_encoder_config_init(ma_encoding_format encodingFormat, ma_format format, ma_uint32 channels, ma_uint32 sampleRate)
-{
-    ma_encoder_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.encodingFormat = encodingFormat;
-    config.format = format;
-    config.channels = channels;
-    config.sampleRate = sampleRate;
-
-    return config;
-}
-
-MA_API ma_result ma_encoder_preinit(const ma_encoder_config* pConfig, ma_encoder* pEncoder)
-{
-    ma_result result;
-
-    if (pEncoder == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pEncoder);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->format == ma_format_unknown || pConfig->channels == 0 || pConfig->sampleRate == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pEncoder->config = *pConfig;
-
-    result = ma_allocation_callbacks_init_copy(&pEncoder->config.allocationCallbacks, &pConfig->allocationCallbacks);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_encoder_init__internal(ma_encoder_write_proc onWrite, ma_encoder_seek_proc onSeek, void* pUserData, ma_encoder* pEncoder)
-{
-    ma_result result = MA_SUCCESS;
-
-    /* This assumes ma_encoder_preinit() has been called prior. */
-    MA_ASSERT(pEncoder != NULL);
-
-    if (onWrite == NULL || onSeek == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pEncoder->onWrite   = onWrite;
-    pEncoder->onSeek    = onSeek;
-    pEncoder->pUserData = pUserData;
-
-    switch (pEncoder->config.encodingFormat)
-    {
-        case ma_encoding_format_wav:
-        {
-        #if defined(MA_HAS_WAV)
-            pEncoder->onInit           = ma_encoder__on_init_wav;
-            pEncoder->onUninit         = ma_encoder__on_uninit_wav;
-            pEncoder->onWritePCMFrames = ma_encoder__on_write_pcm_frames_wav;
-        #else
-            result = MA_NO_BACKEND;
-        #endif
-        } break;
-
-        default:
-        {
-            result = MA_INVALID_ARGS;
-        } break;
-    }
-
-    /* Getting here means we should have our backend callbacks set up. */
-    if (result == MA_SUCCESS) {
-        result = pEncoder->onInit(pEncoder);
-    }
-
-    return result;
-}
-
-static ma_result ma_encoder__on_write_vfs(ma_encoder* pEncoder, const void* pBufferIn, size_t bytesToWrite, size_t* pBytesWritten)
-{
-    return ma_vfs_or_default_write(pEncoder->data.vfs.pVFS, pEncoder->data.vfs.file, pBufferIn, bytesToWrite, pBytesWritten);
-}
-
-static ma_result ma_encoder__on_seek_vfs(ma_encoder* pEncoder, ma_int64 offset, ma_seek_origin origin)
-{
-    return ma_vfs_or_default_seek(pEncoder->data.vfs.pVFS, pEncoder->data.vfs.file, offset, origin);
-}
-
-MA_API ma_result ma_encoder_init_vfs(ma_vfs* pVFS, const char* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder)
-{
-    ma_result result;
-    ma_vfs_file file;
-
-    result = ma_encoder_preinit(pConfig, pEncoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* Now open the file. If this fails we don't need to uninitialize the encoder. */
-    result = ma_vfs_or_default_open(pVFS, pFilePath, MA_OPEN_MODE_WRITE, &file);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pEncoder->data.vfs.pVFS = pVFS;
-    pEncoder->data.vfs.file = file;
-
-    result = ma_encoder_init__internal(ma_encoder__on_write_vfs, ma_encoder__on_seek_vfs, NULL, pEncoder);
-    if (result != MA_SUCCESS) {
-        ma_vfs_or_default_close(pVFS, file);
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_encoder_init_vfs_w(ma_vfs* pVFS, const wchar_t* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder)
-{
-    ma_result result;
-    ma_vfs_file file;
-
-    result = ma_encoder_preinit(pConfig, pEncoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* Now open the file. If this fails we don't need to uninitialize the encoder. */
-    result = ma_vfs_or_default_open_w(pVFS, pFilePath, MA_OPEN_MODE_WRITE, &file);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pEncoder->data.vfs.pVFS = pVFS;
-    pEncoder->data.vfs.file = file;
-
-    result = ma_encoder_init__internal(ma_encoder__on_write_vfs, ma_encoder__on_seek_vfs, NULL, pEncoder);
-    if (result != MA_SUCCESS) {
-        ma_vfs_or_default_close(pVFS, file);
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_encoder_init_file(const char* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder)
-{
-    return ma_encoder_init_vfs(NULL, pFilePath, pConfig, pEncoder);
-}
-
-MA_API ma_result ma_encoder_init_file_w(const wchar_t* pFilePath, const ma_encoder_config* pConfig, ma_encoder* pEncoder)
-{
-    return ma_encoder_init_vfs_w(NULL, pFilePath, pConfig, pEncoder);
-}
-
-MA_API ma_result ma_encoder_init(ma_encoder_write_proc onWrite, ma_encoder_seek_proc onSeek, void* pUserData, const ma_encoder_config* pConfig, ma_encoder* pEncoder)
-{
-    ma_result result;
-
-    result = ma_encoder_preinit(pConfig, pEncoder);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_encoder_init__internal(onWrite, onSeek, pUserData, pEncoder);
-}
-
-
-MA_API void ma_encoder_uninit(ma_encoder* pEncoder)
-{
-    if (pEncoder == NULL) {
-        return;
-    }
-
-    if (pEncoder->onUninit) {
-        pEncoder->onUninit(pEncoder);
-    }
-
-    /* If we have a file handle, close it. */
-    if (pEncoder->onWrite == ma_encoder__on_write_vfs) {
-        ma_vfs_or_default_close(pEncoder->data.vfs.pVFS, pEncoder->data.vfs.file);
-        pEncoder->data.vfs.file = NULL;
-    }
-}
-
-
-MA_API ma_result ma_encoder_write_pcm_frames(ma_encoder* pEncoder, const void* pFramesIn, ma_uint64 frameCount, ma_uint64* pFramesWritten)
-{
-    if (pFramesWritten != NULL) {
-        *pFramesWritten = 0;
-    }
-
-    if (pEncoder == NULL || pFramesIn == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return pEncoder->onWritePCMFrames(pEncoder, pFramesIn, frameCount, pFramesWritten);
-}
-#endif  /* MA_NO_ENCODING */
-
-
-
-/**************************************************************************************************************************************************************
-
-Generation
-
-**************************************************************************************************************************************************************/
-#ifndef MA_NO_GENERATION
-MA_API ma_waveform_config ma_waveform_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, ma_waveform_type type, double amplitude, double frequency)
-{
-    ma_waveform_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format     = format;
-    config.channels   = channels;
-    config.sampleRate = sampleRate;
-    config.type       = type;
-    config.amplitude  = amplitude;
-    config.frequency  = frequency;
-
-    return config;
-}
-
-static ma_result ma_waveform__data_source_on_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_waveform_read_pcm_frames((ma_waveform*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_waveform__data_source_on_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_waveform_seek_to_pcm_frame((ma_waveform*)pDataSource, frameIndex);
-}
-
-static ma_result ma_waveform__data_source_on_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    ma_waveform* pWaveform = (ma_waveform*)pDataSource;
-
-    *pFormat     = pWaveform->config.format;
-    *pChannels   = pWaveform->config.channels;
-    *pSampleRate = pWaveform->config.sampleRate;
-    ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pWaveform->config.channels);
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_waveform__data_source_on_get_cursor(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    ma_waveform* pWaveform = (ma_waveform*)pDataSource;
-
-    *pCursor = (ma_uint64)(pWaveform->time / pWaveform->advance);
-
-    return MA_SUCCESS;
-}
-
-static double ma_waveform__calculate_advance(ma_uint32 sampleRate, double frequency)
-{
-    return (1.0 / (sampleRate / frequency));
-}
-
-static void ma_waveform__update_advance(ma_waveform* pWaveform)
-{
-    pWaveform->advance = ma_waveform__calculate_advance(pWaveform->config.sampleRate, pWaveform->config.frequency);
-}
-
-static ma_data_source_vtable g_ma_waveform_data_source_vtable =
-{
-    ma_waveform__data_source_on_read,
-    ma_waveform__data_source_on_seek,
-    ma_waveform__data_source_on_get_data_format,
-    ma_waveform__data_source_on_get_cursor,
-    NULL,   /* onGetLength. There's no notion of a length in waveforms. */
-    NULL,   /* onSetLooping */
-    0
-};
-
-MA_API ma_result ma_waveform_init(const ma_waveform_config* pConfig, ma_waveform* pWaveform)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pWaveform);
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_waveform_data_source_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pWaveform->ds);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pWaveform->config  = *pConfig;
-    pWaveform->advance = ma_waveform__calculate_advance(pWaveform->config.sampleRate, pWaveform->config.frequency);
-    pWaveform->time    = 0;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_waveform_uninit(ma_waveform* pWaveform)
-{
-    if (pWaveform == NULL) {
-        return;
-    }
-
-    ma_data_source_uninit(&pWaveform->ds);
-}
-
-MA_API ma_result ma_waveform_set_amplitude(ma_waveform* pWaveform, double amplitude)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.amplitude = amplitude;
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_waveform_set_frequency(ma_waveform* pWaveform, double frequency)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.frequency = frequency;
-    ma_waveform__update_advance(pWaveform);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_waveform_set_type(ma_waveform* pWaveform, ma_waveform_type type)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.type = type;
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_waveform_set_sample_rate(ma_waveform* pWaveform, ma_uint32 sampleRate)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.sampleRate = sampleRate;
-    ma_waveform__update_advance(pWaveform);
-
-    return MA_SUCCESS;
-}
-
-static float ma_waveform_sine_f32(double time, double amplitude)
-{
-    return (float)(ma_sind(MA_TAU_D * time) * amplitude);
-}
-
-static ma_int16 ma_waveform_sine_s16(double time, double amplitude)
-{
-    return ma_pcm_sample_f32_to_s16(ma_waveform_sine_f32(time, amplitude));
-}
-
-static float ma_waveform_square_f32(double time, double dutyCycle, double amplitude)
-{
-    double f = time - (ma_int64)time;
-    double r;
-
-    if (f < dutyCycle) {
-        r =  amplitude;
-    } else {
-        r = -amplitude;
-    }
-
-    return (float)r;
-}
-
-static ma_int16 ma_waveform_square_s16(double time, double dutyCycle, double amplitude)
-{
-    return ma_pcm_sample_f32_to_s16(ma_waveform_square_f32(time, dutyCycle, amplitude));
-}
-
-static float ma_waveform_triangle_f32(double time, double amplitude)
-{
-    double f = time - (ma_int64)time;
-    double r;
-
-    r = 2 * ma_abs(2 * (f - 0.5)) - 1;
-
-    return (float)(r * amplitude);
-}
-
-static ma_int16 ma_waveform_triangle_s16(double time, double amplitude)
-{
-    return ma_pcm_sample_f32_to_s16(ma_waveform_triangle_f32(time, amplitude));
-}
-
-static float ma_waveform_sawtooth_f32(double time, double amplitude)
-{
-    double f = time - (ma_int64)time;
-    double r;
-
-    r = 2 * (f - 0.5);
-
-    return (float)(r * amplitude);
-}
-
-static ma_int16 ma_waveform_sawtooth_s16(double time, double amplitude)
-{
-    return ma_pcm_sample_f32_to_s16(ma_waveform_sawtooth_f32(time, amplitude));
-}
-
-static void ma_waveform_read_pcm_frames__sine(ma_waveform* pWaveform, void* pFramesOut, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint64 iChannel;
-    ma_uint32 bps = ma_get_bytes_per_sample(pWaveform->config.format);
-    ma_uint32 bpf = bps * pWaveform->config.channels;
-
-    MA_ASSERT(pWaveform  != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-
-    if (pWaveform->config.format == ma_format_f32) {
-        float* pFramesOutF32 = (float*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_sine_f32(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutF32[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else if (pWaveform->config.format == ma_format_s16) {
-        ma_int16* pFramesOutS16 = (ma_int16*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            ma_int16 s = ma_waveform_sine_s16(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutS16[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else {
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_sine_f32(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pWaveform->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-            }
-        }
-    }
-}
-
-static void ma_waveform_read_pcm_frames__square(ma_waveform* pWaveform, double dutyCycle, void* pFramesOut, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint64 iChannel;
-    ma_uint32 bps = ma_get_bytes_per_sample(pWaveform->config.format);
-    ma_uint32 bpf = bps * pWaveform->config.channels;
-
-    MA_ASSERT(pWaveform  != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-
-    if (pWaveform->config.format == ma_format_f32) {
-        float* pFramesOutF32 = (float*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_square_f32(pWaveform->time, dutyCycle, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutF32[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else if (pWaveform->config.format == ma_format_s16) {
-        ma_int16* pFramesOutS16 = (ma_int16*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            ma_int16 s = ma_waveform_square_s16(pWaveform->time, dutyCycle, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutS16[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else {
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_square_f32(pWaveform->time, dutyCycle, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pWaveform->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-            }
-        }
-    }
-}
-
-static void ma_waveform_read_pcm_frames__triangle(ma_waveform* pWaveform, void* pFramesOut, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint64 iChannel;
-    ma_uint32 bps = ma_get_bytes_per_sample(pWaveform->config.format);
-    ma_uint32 bpf = bps * pWaveform->config.channels;
-
-    MA_ASSERT(pWaveform  != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-
-    if (pWaveform->config.format == ma_format_f32) {
-        float* pFramesOutF32 = (float*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_triangle_f32(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutF32[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else if (pWaveform->config.format == ma_format_s16) {
-        ma_int16* pFramesOutS16 = (ma_int16*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            ma_int16 s = ma_waveform_triangle_s16(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutS16[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else {
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_triangle_f32(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pWaveform->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-            }
-        }
-    }
-}
-
-static void ma_waveform_read_pcm_frames__sawtooth(ma_waveform* pWaveform, void* pFramesOut, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint64 iChannel;
-    ma_uint32 bps = ma_get_bytes_per_sample(pWaveform->config.format);
-    ma_uint32 bpf = bps * pWaveform->config.channels;
-
-    MA_ASSERT(pWaveform  != NULL);
-    MA_ASSERT(pFramesOut != NULL);
-
-    if (pWaveform->config.format == ma_format_f32) {
-        float* pFramesOutF32 = (float*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_sawtooth_f32(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutF32[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else if (pWaveform->config.format == ma_format_s16) {
-        ma_int16* pFramesOutS16 = (ma_int16*)pFramesOut;
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            ma_int16 s = ma_waveform_sawtooth_s16(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                pFramesOutS16[iFrame*pWaveform->config.channels + iChannel] = s;
-            }
-        }
-    } else {
-        for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-            float s = ma_waveform_sawtooth_f32(pWaveform->time, pWaveform->config.amplitude);
-            pWaveform->time += pWaveform->advance;
-
-            for (iChannel = 0; iChannel < pWaveform->config.channels; iChannel += 1) {
-                ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pWaveform->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-            }
-        }
-    }
-}
-
-MA_API ma_result ma_waveform_read_pcm_frames(ma_waveform* pWaveform, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFramesOut != NULL) {
-        switch (pWaveform->config.type)
-        {
-            case ma_waveform_type_sine:
-            {
-                ma_waveform_read_pcm_frames__sine(pWaveform, pFramesOut, frameCount);
-            } break;
-
-            case ma_waveform_type_square:
-            {
-                ma_waveform_read_pcm_frames__square(pWaveform, 0.5, pFramesOut, frameCount);
-            } break;
-
-            case ma_waveform_type_triangle:
-            {
-                ma_waveform_read_pcm_frames__triangle(pWaveform, pFramesOut, frameCount);
-            } break;
-
-            case ma_waveform_type_sawtooth:
-            {
-                ma_waveform_read_pcm_frames__sawtooth(pWaveform, pFramesOut, frameCount);
-            } break;
-
-            default: return MA_INVALID_OPERATION;   /* Unknown waveform type. */
-        }
-    } else {
-        pWaveform->time += pWaveform->advance * (ma_int64)frameCount; /* Cast to int64 required for VC6. Won't affect anything in practice. */
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = frameCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_waveform_seek_to_pcm_frame(ma_waveform* pWaveform, ma_uint64 frameIndex)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->time = pWaveform->advance * (ma_int64)frameIndex;    /* Casting for VC6. Won't be an issue in practice. */
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_pulsewave_config ma_pulsewave_config_init(ma_format format, ma_uint32 channels, ma_uint32 sampleRate, double dutyCycle, double amplitude, double frequency)
-{
-    ma_pulsewave_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.format     = format;
-    config.channels   = channels;
-    config.sampleRate = sampleRate;
-    config.dutyCycle  = dutyCycle;
-    config.amplitude  = amplitude;
-    config.frequency  = frequency;
-
-    return config;
-}
-
-MA_API ma_result ma_pulsewave_init(const ma_pulsewave_config* pConfig, ma_pulsewave* pWaveform)
-{
-    ma_result result;
-    ma_waveform_config config;
-
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pWaveform);
-
-    config = ma_waveform_config_init(
-        pConfig->format,
-        pConfig->channels,
-        pConfig->sampleRate,
-        ma_waveform_type_square,
-        pConfig->amplitude,
-        pConfig->frequency
-    );
-
-    result = ma_waveform_init(&config, &pWaveform->waveform);
-    ma_pulsewave_set_duty_cycle(pWaveform, pConfig->dutyCycle);
-
-    return result;
-}
-
-MA_API void ma_pulsewave_uninit(ma_pulsewave* pWaveform)
-{
-    if (pWaveform == NULL) {
-        return;
-    }
-
-    ma_waveform_uninit(&pWaveform->waveform);
-}
-
-MA_API ma_result ma_pulsewave_read_pcm_frames(ma_pulsewave* pWaveform, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pFramesOut != NULL) {
-        ma_waveform_read_pcm_frames__square(&pWaveform->waveform, pWaveform->config.dutyCycle, pFramesOut, frameCount);
-    } else {
-        pWaveform->waveform.time += pWaveform->waveform.advance * (ma_int64)frameCount; /* Cast to int64 required for VC6. Won't affect anything in practice. */
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = frameCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pulsewave_seek_to_pcm_frame(ma_pulsewave* pWaveform, ma_uint64 frameIndex)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_waveform_seek_to_pcm_frame(&pWaveform->waveform, frameIndex);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pulsewave_set_amplitude(ma_pulsewave* pWaveform, double amplitude)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.amplitude = amplitude;
-    ma_waveform_set_amplitude(&pWaveform->waveform, amplitude);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pulsewave_set_frequency(ma_pulsewave* pWaveform, double frequency)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.frequency = frequency;
-    ma_waveform_set_frequency(&pWaveform->waveform, frequency);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pulsewave_set_sample_rate(ma_pulsewave* pWaveform, ma_uint32 sampleRate)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.sampleRate = sampleRate;
-    ma_waveform_set_sample_rate(&pWaveform->waveform, sampleRate);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_pulsewave_set_duty_cycle(ma_pulsewave* pWaveform, double dutyCycle)
-{
-    if (pWaveform == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pWaveform->config.dutyCycle = dutyCycle;
-
-    return MA_SUCCESS;
-}
-
-
-
-MA_API ma_noise_config ma_noise_config_init(ma_format format, ma_uint32 channels, ma_noise_type type, ma_int32 seed, double amplitude)
-{
-    ma_noise_config config;
-    MA_ZERO_OBJECT(&config);
-
-    config.format    = format;
-    config.channels  = channels;
-    config.type      = type;
-    config.seed      = seed;
-    config.amplitude = amplitude;
-
-    if (config.seed == 0) {
-        config.seed = MA_DEFAULT_LCG_SEED;
-    }
-
-    return config;
-}
-
-
-static ma_result ma_noise__data_source_on_read(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_noise_read_pcm_frames((ma_noise*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_noise__data_source_on_seek(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    /* No-op. Just pretend to be successful. */
-    (void)pDataSource;
-    (void)frameIndex;
-    return MA_SUCCESS;
-}
-
-static ma_result ma_noise__data_source_on_get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    ma_noise* pNoise = (ma_noise*)pDataSource;
-
-    *pFormat     = pNoise->config.format;
-    *pChannels   = pNoise->config.channels;
-    *pSampleRate = 0;   /* There is no notion of sample rate with noise generation. */
-    ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pNoise->config.channels);
-
-    return MA_SUCCESS;
-}
-
-static ma_data_source_vtable g_ma_noise_data_source_vtable =
-{
-    ma_noise__data_source_on_read,
-    ma_noise__data_source_on_seek,  /* No-op for noise. */
-    ma_noise__data_source_on_get_data_format,
-    NULL,   /* onGetCursor. No notion of a cursor for noise. */
-    NULL,   /* onGetLength. No notion of a length for noise. */
-    NULL,   /* onSetLooping */
-    0
-};
-
-
-#ifndef MA_PINK_NOISE_BIN_SIZE
-#define MA_PINK_NOISE_BIN_SIZE 16
-#endif
-
-typedef struct
-{
-    size_t sizeInBytes;
-    struct
-    {
-        size_t binOffset;
-        size_t accumulationOffset;
-        size_t counterOffset;
-    } pink;
-    struct
-    {
-        size_t accumulationOffset;
-    } brownian;
-} ma_noise_heap_layout;
-
-static ma_result ma_noise_get_heap_layout(const ma_noise_config* pConfig, ma_noise_heap_layout* pHeapLayout)
-{
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Pink. */
-    if (pConfig->type == ma_noise_type_pink) {
-        /* bin */
-        pHeapLayout->pink.binOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += sizeof(double*) * pConfig->channels;
-        pHeapLayout->sizeInBytes += sizeof(double ) * pConfig->channels * MA_PINK_NOISE_BIN_SIZE;
-
-        /* accumulation */
-        pHeapLayout->pink.accumulationOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += sizeof(double) * pConfig->channels;
-
-        /* counter */
-        pHeapLayout->pink.counterOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += sizeof(ma_uint32) * pConfig->channels;
-    }
-
-    /* Brownian. */
-    if (pConfig->type == ma_noise_type_brownian) {
-        /* accumulation */
-        pHeapLayout->brownian.accumulationOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += sizeof(double) * pConfig->channels;
-    }
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_noise_get_heap_size(const ma_noise_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_noise_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_noise_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_noise_init_preallocated(const ma_noise_config* pConfig, void* pHeap, ma_noise* pNoise)
-{
-    ma_result result;
-    ma_noise_heap_layout heapLayout;
-    ma_data_source_config dataSourceConfig;
-    ma_uint32 iChannel;
-
-    if (pNoise == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNoise);
-
-    result = ma_noise_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pNoise->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pNoise->_pHeap, heapLayout.sizeInBytes);
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_noise_data_source_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pNoise->ds);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pNoise->config = *pConfig;
-    ma_lcg_seed(&pNoise->lcg, pConfig->seed);
-
-    if (pNoise->config.type == ma_noise_type_pink) {
-        pNoise->state.pink.bin          = (double**  )ma_offset_ptr(pHeap, heapLayout.pink.binOffset);
-        pNoise->state.pink.accumulation = (double*   )ma_offset_ptr(pHeap, heapLayout.pink.accumulationOffset);
-        pNoise->state.pink.counter      = (ma_uint32*)ma_offset_ptr(pHeap, heapLayout.pink.counterOffset);
-
-        for (iChannel = 0; iChannel < pConfig->channels; iChannel += 1) {
-            pNoise->state.pink.bin[iChannel]          = (double*)ma_offset_ptr(pHeap, heapLayout.pink.binOffset + (sizeof(double*) * pConfig->channels) + (sizeof(double) * MA_PINK_NOISE_BIN_SIZE * iChannel));
-            pNoise->state.pink.accumulation[iChannel] = 0;
-            pNoise->state.pink.counter[iChannel]      = 1;
-        }
-    }
-
-    if (pNoise->config.type == ma_noise_type_brownian) {
-        pNoise->state.brownian.accumulation = (double*)ma_offset_ptr(pHeap, heapLayout.brownian.accumulationOffset);
-
-        for (iChannel = 0; iChannel < pConfig->channels; iChannel += 1) {
-            pNoise->state.brownian.accumulation[iChannel] = 0;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_noise_init(const ma_noise_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_noise* pNoise)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_noise_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_noise_init_preallocated(pConfig, pHeap, pNoise);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pNoise->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_noise_uninit(ma_noise* pNoise, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pNoise == NULL) {
-        return;
-    }
-
-    ma_data_source_uninit(&pNoise->ds);
-
-    if (pNoise->_ownsHeap) {
-        ma_free(pNoise->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_result ma_noise_set_amplitude(ma_noise* pNoise, double amplitude)
-{
-    if (pNoise == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pNoise->config.amplitude = amplitude;
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_noise_set_seed(ma_noise* pNoise, ma_int32 seed)
-{
-    if (pNoise == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pNoise->lcg.state = seed;
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_noise_set_type(ma_noise* pNoise, ma_noise_type type)
-{
-    if (pNoise == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    This function should never have been implemented in the first place. Changing the type dynamically is not
-    supported. Instead you need to uninitialize and reinitiailize a fresh `ma_noise` object. This function
-    will be removed in version 0.12.
-    */
-    MA_ASSERT(MA_FALSE);
-    (void)type;
-
-    return MA_INVALID_OPERATION;
-}
-
-static MA_INLINE float ma_noise_f32_white(ma_noise* pNoise)
-{
-    return (float)(ma_lcg_rand_f64(&pNoise->lcg) * pNoise->config.amplitude);
-}
-
-static MA_INLINE ma_int16 ma_noise_s16_white(ma_noise* pNoise)
-{
-    return ma_pcm_sample_f32_to_s16(ma_noise_f32_white(pNoise));
-}
-
-static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, void* pFramesOut, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannel;
-    const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
-
-    if (pNoise->config.format == ma_format_f32) {
-        float* pFramesOutF32 = (float*)pFramesOut;
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                float s = ma_noise_f32_white(pNoise);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*channels + iChannel] = s;
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*channels + iChannel] = ma_noise_f32_white(pNoise);
-                }
-            }
-        }
-    } else if (pNoise->config.format == ma_format_s16) {
-        ma_int16* pFramesOutS16 = (ma_int16*)pFramesOut;
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                ma_int16 s = ma_noise_s16_white(pNoise);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*channels + iChannel] = s;
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*channels + iChannel] = ma_noise_s16_white(pNoise);
-                }
-            }
-        }
-    } else {
-        const ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
-        const ma_uint32 bpf = bps * channels;
-
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                float s = ma_noise_f32_white(pNoise);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    float s = ma_noise_f32_white(pNoise);
-                    ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-                }
-            }
-        }
-    }
-
-    return frameCount;
-}
-
-
-static MA_INLINE unsigned int ma_tzcnt32(unsigned int x)
-{
-    unsigned int n;
-
-    /* Special case for odd numbers since they should happen about half the time. */
-    if (x & 0x1)  {
-        return 0;
-    }
-
-    if (x == 0) {
-        return sizeof(x) << 3;
-    }
-
-    n = 1;
-    if ((x & 0x0000FFFF) == 0) { x >>= 16; n += 16; }
-    if ((x & 0x000000FF) == 0) { x >>=  8; n +=  8; }
-    if ((x & 0x0000000F) == 0) { x >>=  4; n +=  4; }
-    if ((x & 0x00000003) == 0) { x >>=  2; n +=  2; }
-    n -= x & 0x00000001;
-
-    return n;
-}
-
-/*
-Pink noise generation based on Tonic (public domain) with modifications. https://github.com/TonicAudio/Tonic/blob/master/src/Tonic/Noise.h
-
-This is basically _the_ reference for pink noise from what I've found: http://www.firstpr.com.au/dsp/pink-noise/
-*/
-static MA_INLINE float ma_noise_f32_pink(ma_noise* pNoise, ma_uint32 iChannel)
-{
-    double result;
-    double binPrev;
-    double binNext;
-    unsigned int ibin;
-
-    ibin = ma_tzcnt32(pNoise->state.pink.counter[iChannel]) & (MA_PINK_NOISE_BIN_SIZE - 1);
-
-    binPrev = pNoise->state.pink.bin[iChannel][ibin];
-    binNext = ma_lcg_rand_f64(&pNoise->lcg);
-    pNoise->state.pink.bin[iChannel][ibin] = binNext;
-
-    pNoise->state.pink.accumulation[iChannel] += (binNext - binPrev);
-    pNoise->state.pink.counter[iChannel]      += 1;
-
-    result = (ma_lcg_rand_f64(&pNoise->lcg) + pNoise->state.pink.accumulation[iChannel]);
-    result /= 10;
-
-    return (float)(result * pNoise->config.amplitude);
-}
-
-static MA_INLINE ma_int16 ma_noise_s16_pink(ma_noise* pNoise, ma_uint32 iChannel)
-{
-    return ma_pcm_sample_f32_to_s16(ma_noise_f32_pink(pNoise, iChannel));
-}
-
-static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void* pFramesOut, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannel;
-    const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
-
-    if (pNoise->config.format == ma_format_f32) {
-        float* pFramesOutF32 = (float*)pFramesOut;
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                float s = ma_noise_f32_pink(pNoise, 0);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*channels + iChannel] = s;
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*channels + iChannel] = ma_noise_f32_pink(pNoise, iChannel);
-                }
-            }
-        }
-    } else if (pNoise->config.format == ma_format_s16) {
-        ma_int16* pFramesOutS16 = (ma_int16*)pFramesOut;
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                ma_int16 s = ma_noise_s16_pink(pNoise, 0);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*channels + iChannel] = s;
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*channels + iChannel] = ma_noise_s16_pink(pNoise, iChannel);
-                }
-            }
-        }
-    } else {
-        const ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
-        const ma_uint32 bpf = bps * channels;
-
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                float s = ma_noise_f32_pink(pNoise, 0);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    float s = ma_noise_f32_pink(pNoise, iChannel);
-                    ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-                }
-            }
-        }
-    }
-
-    return frameCount;
-}
-
-
-static MA_INLINE float ma_noise_f32_brownian(ma_noise* pNoise, ma_uint32 iChannel)
-{
-    double result;
-
-    result = (ma_lcg_rand_f64(&pNoise->lcg) + pNoise->state.brownian.accumulation[iChannel]);
-    result /= 1.005; /* Don't escape the -1..1 range on average. */
-
-    pNoise->state.brownian.accumulation[iChannel] = result;
-    result /= 20;
-
-    return (float)(result * pNoise->config.amplitude);
-}
-
-static MA_INLINE ma_int16 ma_noise_s16_brownian(ma_noise* pNoise, ma_uint32 iChannel)
-{
-    return ma_pcm_sample_f32_to_s16(ma_noise_f32_brownian(pNoise, iChannel));
-}
-
-static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise, void* pFramesOut, ma_uint64 frameCount)
-{
-    ma_uint64 iFrame;
-    ma_uint32 iChannel;
-    const ma_uint32 channels = pNoise->config.channels;
-    MA_ASSUME(channels > 0);
-
-    if (pNoise->config.format == ma_format_f32) {
-        float* pFramesOutF32 = (float*)pFramesOut;
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                float s = ma_noise_f32_brownian(pNoise, 0);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*channels + iChannel] = s;
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*channels + iChannel] = ma_noise_f32_brownian(pNoise, iChannel);
-                }
-            }
-        }
-    } else if (pNoise->config.format == ma_format_s16) {
-        ma_int16* pFramesOutS16 = (ma_int16*)pFramesOut;
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                ma_int16 s = ma_noise_s16_brownian(pNoise, 0);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*channels + iChannel] = s;
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*channels + iChannel] = ma_noise_s16_brownian(pNoise, iChannel);
-                }
-            }
-        }
-    } else {
-        const ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
-        const ma_uint32 bpf = bps * channels;
-
-        if (pNoise->config.duplicateChannels) {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                float s = ma_noise_f32_brownian(pNoise, 0);
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-                }
-            }
-        } else {
-            for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < channels; iChannel += 1) {
-                    float s = ma_noise_f32_brownian(pNoise, iChannel);
-                    ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
-                }
-            }
-        }
-    }
-
-    return frameCount;
-}
-
-MA_API ma_result ma_noise_read_pcm_frames(ma_noise* pNoise, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_uint64 framesRead = 0;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pNoise == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The output buffer is allowed to be NULL. Since we aren't tracking cursors or anything we can just do nothing and pretend to be successful. */
-    if (pFramesOut == NULL) {
-        framesRead = frameCount;
-    } else {
-        switch (pNoise->config.type) {
-            case ma_noise_type_white:    framesRead = ma_noise_read_pcm_frames__white   (pNoise, pFramesOut, frameCount); break;
-            case ma_noise_type_pink:     framesRead = ma_noise_read_pcm_frames__pink    (pNoise, pFramesOut, frameCount); break;
-            case ma_noise_type_brownian: framesRead = ma_noise_read_pcm_frames__brownian(pNoise, pFramesOut, frameCount); break;
-            default: return MA_INVALID_OPERATION;   /* Unknown noise type. */
-        }
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = framesRead;
-    }
-
-    return MA_SUCCESS;
-}
-#endif /* MA_NO_GENERATION */
-
-
-
-#ifndef MA_NO_RESOURCE_MANAGER
-#ifndef MA_RESOURCE_MANAGER_PAGE_SIZE_IN_MILLISECONDS
-#define MA_RESOURCE_MANAGER_PAGE_SIZE_IN_MILLISECONDS   1000
-#endif
-
-#ifndef MA_JOB_TYPE_RESOURCE_MANAGER_QUEUE_CAPACITY
-#define MA_JOB_TYPE_RESOURCE_MANAGER_QUEUE_CAPACITY          1024
-#endif
-
-MA_API ma_resource_manager_pipeline_notifications ma_resource_manager_pipeline_notifications_init(void)
-{
-    ma_resource_manager_pipeline_notifications notifications;
-
-    MA_ZERO_OBJECT(&notifications);
-
-    return notifications;
-}
-
-static void ma_resource_manager_pipeline_notifications_signal_all_notifications(const ma_resource_manager_pipeline_notifications* pPipelineNotifications)
-{
-    if (pPipelineNotifications == NULL) {
-        return;
-    }
-
-    if (pPipelineNotifications->init.pNotification) { ma_async_notification_signal(pPipelineNotifications->init.pNotification); }
-    if (pPipelineNotifications->done.pNotification) { ma_async_notification_signal(pPipelineNotifications->done.pNotification); }
-}
-
-static void ma_resource_manager_pipeline_notifications_acquire_all_fences(const ma_resource_manager_pipeline_notifications* pPipelineNotifications)
-{
-    if (pPipelineNotifications == NULL) {
-        return;
-    }
-
-    if (pPipelineNotifications->init.pFence != NULL) { ma_fence_acquire(pPipelineNotifications->init.pFence); }
-    if (pPipelineNotifications->done.pFence != NULL) { ma_fence_acquire(pPipelineNotifications->done.pFence); }
-}
-
-static void ma_resource_manager_pipeline_notifications_release_all_fences(const ma_resource_manager_pipeline_notifications* pPipelineNotifications)
-{
-    if (pPipelineNotifications == NULL) {
-        return;
-    }
-
-    if (pPipelineNotifications->init.pFence != NULL) { ma_fence_release(pPipelineNotifications->init.pFence); }
-    if (pPipelineNotifications->done.pFence != NULL) { ma_fence_release(pPipelineNotifications->done.pFence); }
-}
-
-
-
-#ifndef MA_DEFAULT_HASH_SEED
-#define MA_DEFAULT_HASH_SEED    42
-#endif
-
-/* MurmurHash3. Based on code from https://github.com/PeterScott/murmur3/blob/master/murmur3.c (public domain). */
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic push
-    #if __GNUC__ >= 7
-    #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
-    #endif
-#endif
-
-static MA_INLINE ma_uint32 ma_rotl32(ma_uint32 x, ma_int8 r)
-{
-    return (x << r) | (x >> (32 - r));
-}
-
-static MA_INLINE ma_uint32 ma_hash_getblock(const ma_uint32* blocks, int i)
-{
-    ma_uint32 block;
-
-    /* Try silencing a sanitization warning about unaligned access by doing a memcpy() instead of assignment. */
-    MA_COPY_MEMORY(&block, ma_offset_ptr(blocks, i * sizeof(block)), sizeof(block));
-
-    if (ma_is_little_endian()) {
-        return block;
-    } else {
-        return ma_swap_endian_uint32(block);
-    }
-}
-
-static MA_INLINE ma_uint32 ma_hash_fmix32(ma_uint32 h)
-{
-    h ^= h >> 16;
-    h *= 0x85ebca6b;
-    h ^= h >> 13;
-    h *= 0xc2b2ae35;
-    h ^= h >> 16;
-
-    return h;
-}
-
-static ma_uint32 ma_hash_32(const void* key, int len, ma_uint32 seed)
-{
-    const ma_uint8* data = (const ma_uint8*)key;
-    const ma_uint32* blocks;
-    const ma_uint8* tail;
-    const int nblocks = len / 4;
-    ma_uint32 h1 = seed;
-    ma_uint32 c1 = 0xcc9e2d51;
-    ma_uint32 c2 = 0x1b873593;
-    ma_uint32 k1;
-    int i;
-
-    blocks = (const ma_uint32 *)(data + nblocks*4);
-
-    for(i = -nblocks; i; i++) {
-        k1 = ma_hash_getblock(blocks,i);
-
-        k1 *= c1;
-        k1 = ma_rotl32(k1, 15);
-        k1 *= c2;
-
-        h1 ^= k1;
-        h1 = ma_rotl32(h1, 13);
-        h1 = h1*5 + 0xe6546b64;
-    }
-
-
-    tail = (const ma_uint8*)(data + nblocks*4);
-
-    k1 = 0;
-    switch(len & 3) {
-        case 3: k1 ^= tail[2] << 16;
-        case 2: k1 ^= tail[1] << 8;
-        case 1: k1 ^= tail[0];
-                k1 *= c1; k1 = ma_rotl32(k1, 15); k1 *= c2; h1 ^= k1;
-    };
-
-
-    h1 ^= len;
-    h1  = ma_hash_fmix32(h1);
-
-    return h1;
-}
-
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic push
-#endif
-/* End MurmurHash3 */
-
-static ma_uint32 ma_hash_string_32(const char* str)
-{
-    return ma_hash_32(str, (int)strlen(str), MA_DEFAULT_HASH_SEED);
-}
-
-static ma_uint32 ma_hash_string_w_32(const wchar_t* str)
-{
-    return ma_hash_32(str, (int)wcslen(str) * sizeof(*str), MA_DEFAULT_HASH_SEED);
-}
-
-
-
-
-/*
-Basic BST Functions
-*/
-static ma_result ma_resource_manager_data_buffer_node_search(ma_resource_manager* pResourceManager, ma_uint32 hashedName32, ma_resource_manager_data_buffer_node** ppDataBufferNode)
-{
-    ma_resource_manager_data_buffer_node* pCurrentNode;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(ppDataBufferNode != NULL);
-
-    pCurrentNode = pResourceManager->pRootDataBufferNode;
-    while (pCurrentNode != NULL) {
-        if (hashedName32 == pCurrentNode->hashedName32) {
-            break;  /* Found. */
-        } else if (hashedName32 < pCurrentNode->hashedName32) {
-            pCurrentNode = pCurrentNode->pChildLo;
-        } else {
-            pCurrentNode = pCurrentNode->pChildHi;
-        }
-    }
-
-    *ppDataBufferNode = pCurrentNode;
-
-    if (pCurrentNode == NULL) {
-        return MA_DOES_NOT_EXIST;
-    } else {
-        return MA_SUCCESS;
-    }
-}
-
-static ma_result ma_resource_manager_data_buffer_node_insert_point(ma_resource_manager* pResourceManager, ma_uint32 hashedName32, ma_resource_manager_data_buffer_node** ppInsertPoint)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager_data_buffer_node* pCurrentNode;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(ppInsertPoint    != NULL);
-
-    *ppInsertPoint = NULL;
-
-    if (pResourceManager->pRootDataBufferNode == NULL) {
-        return MA_SUCCESS;  /* No items. */
-    }
-
-    /* We need to find the node that will become the parent of the new node. If a node is found that already has the same hashed name we need to return MA_ALREADY_EXISTS. */
-    pCurrentNode = pResourceManager->pRootDataBufferNode;
-    while (pCurrentNode != NULL) {
-        if (hashedName32 == pCurrentNode->hashedName32) {
-            result = MA_ALREADY_EXISTS;
-            break;
-        } else {
-            if (hashedName32 < pCurrentNode->hashedName32) {
-                if (pCurrentNode->pChildLo == NULL) {
-                    result = MA_SUCCESS;
-                    break;
-                } else {
-                    pCurrentNode = pCurrentNode->pChildLo;
-                }
-            } else {
-                if (pCurrentNode->pChildHi == NULL) {
-                    result = MA_SUCCESS;
-                    break;
-                } else {
-                    pCurrentNode = pCurrentNode->pChildHi;
-                }
-            }
-        }
-    }
-
-    *ppInsertPoint = pCurrentNode;
-    return result;
-}
-
-static ma_result ma_resource_manager_data_buffer_node_insert_at(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode, ma_resource_manager_data_buffer_node* pInsertPoint)
-{
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-
-    /* The key must have been set before calling this function. */
-    MA_ASSERT(pDataBufferNode->hashedName32 != 0);
-
-    if (pInsertPoint == NULL) {
-        /* It's the first node. */
-        pResourceManager->pRootDataBufferNode = pDataBufferNode;
-    } else {
-        /* It's not the first node. It needs to be inserted. */
-        if (pDataBufferNode->hashedName32 < pInsertPoint->hashedName32) {
-            MA_ASSERT(pInsertPoint->pChildLo == NULL);
-            pInsertPoint->pChildLo = pDataBufferNode;
-        } else {
-            MA_ASSERT(pInsertPoint->pChildHi == NULL);
-            pInsertPoint->pChildHi = pDataBufferNode;
-        }
-    }
-
-    pDataBufferNode->pParent = pInsertPoint;
-
-    return MA_SUCCESS;
-}
-
-#if 0   /* Unused for now. */
-static ma_result ma_resource_manager_data_buffer_node_insert(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    ma_result result;
-    ma_resource_manager_data_buffer_node* pInsertPoint;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-
-    result = ma_resource_manager_data_buffer_node_insert_point(pResourceManager, pDataBufferNode->hashedName32, &pInsertPoint);
-    if (result != MA_SUCCESS) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_resource_manager_data_buffer_node_insert_at(pResourceManager, pDataBufferNode, pInsertPoint);
-}
-#endif
-
-static MA_INLINE ma_resource_manager_data_buffer_node* ma_resource_manager_data_buffer_node_find_min(ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    ma_resource_manager_data_buffer_node* pCurrentNode;
-
-    MA_ASSERT(pDataBufferNode != NULL);
-
-    pCurrentNode = pDataBufferNode;
-    while (pCurrentNode->pChildLo != NULL) {
-        pCurrentNode = pCurrentNode->pChildLo;
-    }
-
-    return pCurrentNode;
-}
-
-static MA_INLINE ma_resource_manager_data_buffer_node* ma_resource_manager_data_buffer_node_find_max(ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    ma_resource_manager_data_buffer_node* pCurrentNode;
-
-    MA_ASSERT(pDataBufferNode != NULL);
-
-    pCurrentNode = pDataBufferNode;
-    while (pCurrentNode->pChildHi != NULL) {
-        pCurrentNode = pCurrentNode->pChildHi;
-    }
-
-    return pCurrentNode;
-}
-
-static MA_INLINE ma_resource_manager_data_buffer_node* ma_resource_manager_data_buffer_node_find_inorder_successor(ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    MA_ASSERT(pDataBufferNode           != NULL);
-    MA_ASSERT(pDataBufferNode->pChildHi != NULL);
-
-    return ma_resource_manager_data_buffer_node_find_min(pDataBufferNode->pChildHi);
-}
-
-static MA_INLINE ma_resource_manager_data_buffer_node* ma_resource_manager_data_buffer_node_find_inorder_predecessor(ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    MA_ASSERT(pDataBufferNode           != NULL);
-    MA_ASSERT(pDataBufferNode->pChildLo != NULL);
-
-    return ma_resource_manager_data_buffer_node_find_max(pDataBufferNode->pChildLo);
-}
-
-static ma_result ma_resource_manager_data_buffer_node_remove(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-
-    if (pDataBufferNode->pChildLo == NULL) {
-        if (pDataBufferNode->pChildHi == NULL) {
-            /* Simple case - deleting a buffer with no children. */
-            if (pDataBufferNode->pParent == NULL) {
-                MA_ASSERT(pResourceManager->pRootDataBufferNode == pDataBufferNode);    /* There is only a single buffer in the tree which should be equal to the root node. */
-                pResourceManager->pRootDataBufferNode = NULL;
-            } else {
-                if (pDataBufferNode->pParent->pChildLo == pDataBufferNode) {
-                    pDataBufferNode->pParent->pChildLo = NULL;
-                } else {
-                    pDataBufferNode->pParent->pChildHi = NULL;
-                }
-            }
-        } else {
-            /* Node has one child - pChildHi != NULL. */
-            pDataBufferNode->pChildHi->pParent = pDataBufferNode->pParent;
-
-            if (pDataBufferNode->pParent == NULL) {
-                MA_ASSERT(pResourceManager->pRootDataBufferNode == pDataBufferNode);
-                pResourceManager->pRootDataBufferNode = pDataBufferNode->pChildHi;
-            } else {
-                if (pDataBufferNode->pParent->pChildLo == pDataBufferNode) {
-                    pDataBufferNode->pParent->pChildLo = pDataBufferNode->pChildHi;
-                } else {
-                    pDataBufferNode->pParent->pChildHi = pDataBufferNode->pChildHi;
-                }
-            }
-        }
-    } else {
-        if (pDataBufferNode->pChildHi == NULL) {
-            /* Node has one child - pChildLo != NULL. */
-            pDataBufferNode->pChildLo->pParent = pDataBufferNode->pParent;
-
-            if (pDataBufferNode->pParent == NULL) {
-                MA_ASSERT(pResourceManager->pRootDataBufferNode == pDataBufferNode);
-                pResourceManager->pRootDataBufferNode = pDataBufferNode->pChildLo;
-            } else {
-                if (pDataBufferNode->pParent->pChildLo == pDataBufferNode) {
-                    pDataBufferNode->pParent->pChildLo = pDataBufferNode->pChildLo;
-                } else {
-                    pDataBufferNode->pParent->pChildHi = pDataBufferNode->pChildLo;
-                }
-            }
-        } else {
-            /* Complex case - deleting a node with two children. */
-            ma_resource_manager_data_buffer_node* pReplacementDataBufferNode;
-
-            /* For now we are just going to use the in-order successor as the replacement, but we may want to try to keep this balanced by switching between the two. */
-            pReplacementDataBufferNode = ma_resource_manager_data_buffer_node_find_inorder_successor(pDataBufferNode);
-            MA_ASSERT(pReplacementDataBufferNode != NULL);
-
-            /*
-            Now that we have our replacement node we can make the change. The simple way to do this would be to just exchange the values, and then remove the replacement
-            node, however we track specific nodes via pointers which means we can't just swap out the values. We need to instead just change the pointers around. The
-            replacement node should have at most 1 child. Therefore, we can detach it in terms of our simpler cases above. What we're essentially doing is detaching the
-            replacement node and reinserting it into the same position as the deleted node.
-            */
-            MA_ASSERT(pReplacementDataBufferNode->pParent  != NULL);  /* The replacement node should never be the root which means it should always have a parent. */
-            MA_ASSERT(pReplacementDataBufferNode->pChildLo == NULL);  /* Because we used in-order successor. This would be pChildHi == NULL if we used in-order predecessor. */
-
-            if (pReplacementDataBufferNode->pChildHi == NULL) {
-                if (pReplacementDataBufferNode->pParent->pChildLo == pReplacementDataBufferNode) {
-                    pReplacementDataBufferNode->pParent->pChildLo = NULL;
-                } else {
-                    pReplacementDataBufferNode->pParent->pChildHi = NULL;
-                }
-            } else {
-                pReplacementDataBufferNode->pChildHi->pParent = pReplacementDataBufferNode->pParent;
-                if (pReplacementDataBufferNode->pParent->pChildLo == pReplacementDataBufferNode) {
-                    pReplacementDataBufferNode->pParent->pChildLo = pReplacementDataBufferNode->pChildHi;
-                } else {
-                    pReplacementDataBufferNode->pParent->pChildHi = pReplacementDataBufferNode->pChildHi;
-                }
-            }
-
-
-            /* The replacement node has essentially been detached from the binary tree, so now we need to replace the old data buffer with it. The first thing to update is the parent */
-            if (pDataBufferNode->pParent != NULL) {
-                if (pDataBufferNode->pParent->pChildLo == pDataBufferNode) {
-                    pDataBufferNode->pParent->pChildLo = pReplacementDataBufferNode;
-                } else {
-                    pDataBufferNode->pParent->pChildHi = pReplacementDataBufferNode;
-                }
-            }
-
-            /* Now need to update the replacement node's pointers. */
-            pReplacementDataBufferNode->pParent  = pDataBufferNode->pParent;
-            pReplacementDataBufferNode->pChildLo = pDataBufferNode->pChildLo;
-            pReplacementDataBufferNode->pChildHi = pDataBufferNode->pChildHi;
-
-            /* Now the children of the replacement node need to have their parent pointers updated. */
-            if (pReplacementDataBufferNode->pChildLo != NULL) {
-                pReplacementDataBufferNode->pChildLo->pParent = pReplacementDataBufferNode;
-            }
-            if (pReplacementDataBufferNode->pChildHi != NULL) {
-                pReplacementDataBufferNode->pChildHi->pParent = pReplacementDataBufferNode;
-            }
-
-            /* Now the root node needs to be updated. */
-            if (pResourceManager->pRootDataBufferNode == pDataBufferNode) {
-                pResourceManager->pRootDataBufferNode = pReplacementDataBufferNode;
-            }
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-#if 0   /* Unused for now. */
-static ma_result ma_resource_manager_data_buffer_node_remove_by_key(ma_resource_manager* pResourceManager, ma_uint32 hashedName32)
-{
-    ma_result result;
-    ma_resource_manager_data_buffer_node* pDataBufferNode;
-
-    result = ma_resource_manager_data_buffer_search(pResourceManager, hashedName32, &pDataBufferNode);
-    if (result != MA_SUCCESS) {
-        return result;  /* Could not find the data buffer. */
-    }
-
-    return ma_resource_manager_data_buffer_remove(pResourceManager, pDataBufferNode);
-}
-#endif
-
-static ma_resource_manager_data_supply_type ma_resource_manager_data_buffer_node_get_data_supply_type(ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    return (ma_resource_manager_data_supply_type)ma_atomic_load_i32(&pDataBufferNode->data.type);
-}
-
-static void ma_resource_manager_data_buffer_node_set_data_supply_type(ma_resource_manager_data_buffer_node* pDataBufferNode, ma_resource_manager_data_supply_type supplyType)
-{
-    ma_atomic_exchange_i32(&pDataBufferNode->data.type, supplyType);
-}
-
-static ma_result ma_resource_manager_data_buffer_node_increment_ref(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode, ma_uint32* pNewRefCount)
-{
-    ma_uint32 refCount;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-
-    (void)pResourceManager;
-
-    refCount = ma_atomic_fetch_add_32(&pDataBufferNode->refCount, 1) + 1;
-
-    if (pNewRefCount != NULL) {
-        *pNewRefCount = refCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_resource_manager_data_buffer_node_decrement_ref(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode, ma_uint32* pNewRefCount)
-{
-    ma_uint32 refCount;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-
-    (void)pResourceManager;
-
-    refCount = ma_atomic_fetch_sub_32(&pDataBufferNode->refCount, 1) - 1;
-
-    if (pNewRefCount != NULL) {
-        *pNewRefCount = refCount;
-    }
-
-    return MA_SUCCESS;
-}
-
-static void ma_resource_manager_data_buffer_node_free(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-
-    if (pDataBufferNode->isDataOwnedByResourceManager) {
-        if (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBufferNode) == ma_resource_manager_data_supply_type_encoded) {
-            ma_free((void*)pDataBufferNode->data.backend.encoded.pData, &pResourceManager->config.allocationCallbacks);
-            pDataBufferNode->data.backend.encoded.pData       = NULL;
-            pDataBufferNode->data.backend.encoded.sizeInBytes = 0;
-        } else if (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBufferNode) == ma_resource_manager_data_supply_type_decoded) {
-            ma_free((void*)pDataBufferNode->data.backend.decoded.pData, &pResourceManager->config.allocationCallbacks);
-            pDataBufferNode->data.backend.decoded.pData           = NULL;
-            pDataBufferNode->data.backend.decoded.totalFrameCount = 0;
-        } else if (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBufferNode) == ma_resource_manager_data_supply_type_decoded_paged) {
-            ma_paged_audio_buffer_data_uninit(&pDataBufferNode->data.backend.decodedPaged.data, &pResourceManager->config.allocationCallbacks);
-        } else {
-            /* Should never hit this if the node was successfully initialized. */
-            MA_ASSERT(pDataBufferNode->result != MA_SUCCESS);
-        }
-    }
-
-    /* The data buffer itself needs to be freed. */
-    ma_free(pDataBufferNode, &pResourceManager->config.allocationCallbacks);
-}
-
-static ma_result ma_resource_manager_data_buffer_node_result(const ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    MA_ASSERT(pDataBufferNode != NULL);
-
-    return (ma_result)ma_atomic_load_i32((ma_result*)&pDataBufferNode->result);    /* Need a naughty const-cast here. */
-}
-
-
-static ma_bool32 ma_resource_manager_is_threading_enabled(const ma_resource_manager* pResourceManager)
-{
-    MA_ASSERT(pResourceManager != NULL);
-
-    return (pResourceManager->config.flags & MA_RESOURCE_MANAGER_FLAG_NO_THREADING) == 0;
-}
-
-
-typedef struct
-{
-    union
-    {
-        ma_async_notification_event e;
-        ma_async_notification_poll p;
-    } backend;  /* Must be the first member. */
-    ma_resource_manager* pResourceManager;
-} ma_resource_manager_inline_notification;
-
-static ma_result ma_resource_manager_inline_notification_init(ma_resource_manager* pResourceManager, ma_resource_manager_inline_notification* pNotification)
-{
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pNotification    != NULL);
-
-    pNotification->pResourceManager = pResourceManager;
-
-    if (ma_resource_manager_is_threading_enabled(pResourceManager)) {
-        return ma_async_notification_event_init(&pNotification->backend.e);
-    } else {
-        return ma_async_notification_poll_init(&pNotification->backend.p);
-    }
-}
-
-static void ma_resource_manager_inline_notification_uninit(ma_resource_manager_inline_notification* pNotification)
-{
-    MA_ASSERT(pNotification != NULL);
-
-    if (ma_resource_manager_is_threading_enabled(pNotification->pResourceManager)) {
-        ma_async_notification_event_uninit(&pNotification->backend.e);
-    } else {
-        /* No need to uninitialize a polling notification. */
-    }
-}
-
-static void ma_resource_manager_inline_notification_wait(ma_resource_manager_inline_notification* pNotification)
-{
-    MA_ASSERT(pNotification != NULL);
-
-    if (ma_resource_manager_is_threading_enabled(pNotification->pResourceManager)) {
-        ma_async_notification_event_wait(&pNotification->backend.e);
-    } else {
-        while (ma_async_notification_poll_is_signalled(&pNotification->backend.p) == MA_FALSE) {
-            ma_result result = ma_resource_manager_process_next_job(pNotification->pResourceManager);
-            if (result == MA_NO_DATA_AVAILABLE || result == MA_CANCELLED) {
-                break;
-            }
-        }
-    }
-}
-
-static void ma_resource_manager_inline_notification_wait_and_uninit(ma_resource_manager_inline_notification* pNotification)
-{
-    ma_resource_manager_inline_notification_wait(pNotification);
-    ma_resource_manager_inline_notification_uninit(pNotification);
-}
-
-
-static void ma_resource_manager_data_buffer_bst_lock(ma_resource_manager* pResourceManager)
-{
-    MA_ASSERT(pResourceManager != NULL);
-
-    if (ma_resource_manager_is_threading_enabled(pResourceManager)) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_mutex_lock(&pResourceManager->dataBufferBSTLock);
-        }
-        #else
-        {
-            MA_ASSERT(MA_FALSE);    /* Should never hit this. */
-        }
-        #endif
-    } else {
-        /* Threading not enabled. Do nothing. */
-    }
-}
-
-static void ma_resource_manager_data_buffer_bst_unlock(ma_resource_manager* pResourceManager)
-{
-    MA_ASSERT(pResourceManager != NULL);
-
-    if (ma_resource_manager_is_threading_enabled(pResourceManager)) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_mutex_unlock(&pResourceManager->dataBufferBSTLock);
-        }
-        #else
-        {
-            MA_ASSERT(MA_FALSE);    /* Should never hit this. */
-        }
-        #endif
-    } else {
-        /* Threading not enabled. Do nothing. */
-    }
-}
-
-#ifndef MA_NO_THREADING
-static ma_thread_result MA_THREADCALL ma_resource_manager_job_thread(void* pUserData)
-{
-    ma_resource_manager* pResourceManager = (ma_resource_manager*)pUserData;
-    MA_ASSERT(pResourceManager != NULL);
-
-    for (;;) {
-        ma_result result;
-        ma_job job;
-
-        result = ma_resource_manager_next_job(pResourceManager, &job);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        /* Terminate if we got a quit message. */
-        if (job.toc.breakup.code == MA_JOB_TYPE_QUIT) {
-            break;
-        }
-
-        ma_job_process(&job);
-    }
-
-    return (ma_thread_result)0;
-}
-#endif
-
-MA_API ma_resource_manager_config ma_resource_manager_config_init(void)
-{
-    ma_resource_manager_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.decodedFormat     = ma_format_unknown;
-    config.decodedChannels   = 0;
-    config.decodedSampleRate = 0;
-    config.jobThreadCount    = 1;   /* A single miniaudio-managed job thread by default. */
-    config.jobQueueCapacity  = MA_JOB_TYPE_RESOURCE_MANAGER_QUEUE_CAPACITY;
-
-    /* Flags. */
-    config.flags = 0;
-    #ifdef MA_NO_THREADING
-    {
-        /* Threading is disabled at compile time so disable threading at runtime as well by default. */
-        config.flags |= MA_RESOURCE_MANAGER_FLAG_NO_THREADING;
-        config.jobThreadCount = 0;
-    }
-    #endif
-
-    return config;
-}
-
-
-MA_API ma_result ma_resource_manager_init(const ma_resource_manager_config* pConfig, ma_resource_manager* pResourceManager)
-{
-    ma_result result;
-    ma_job_queue_config jobQueueConfig;
-
-    if (pResourceManager == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pResourceManager);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #ifndef MA_NO_THREADING
-    {
-        if (pConfig->jobThreadCount > ma_countof(pResourceManager->jobThreads)) {
-            return MA_INVALID_ARGS; /* Requesting too many job threads. */
-        }
-    }
-    #endif
-
-    pResourceManager->config = *pConfig;
-    ma_allocation_callbacks_init_copy(&pResourceManager->config.allocationCallbacks, &pConfig->allocationCallbacks);
-
-    /* Get the log set up early so we can start using it as soon as possible. */
-    if (pResourceManager->config.pLog == NULL) {
-        result = ma_log_init(&pResourceManager->config.allocationCallbacks, &pResourceManager->log);
-        if (result == MA_SUCCESS) {
-            pResourceManager->config.pLog = &pResourceManager->log;
-        } else {
-            pResourceManager->config.pLog = NULL;   /* Logging is unavailable. */
-        }
-    }
-
-    if (pResourceManager->config.pVFS == NULL) {
-        result = ma_default_vfs_init(&pResourceManager->defaultVFS, &pResourceManager->config.allocationCallbacks);
-        if (result != MA_SUCCESS) {
-            return result;  /* Failed to initialize the default file system. */
-        }
-
-        pResourceManager->config.pVFS = &pResourceManager->defaultVFS;
-    }
-
-    /* If threading has been disabled at compile time, enfore it at run time as well. */
-    #ifdef MA_NO_THREADING
-    {
-        pResourceManager->config.flags |= MA_RESOURCE_MANAGER_FLAG_NO_THREADING;
-    }
-    #endif
-
-    /* We need to force MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING if MA_RESOURCE_MANAGER_FLAG_NO_THREADING is set. */
-    if ((pResourceManager->config.flags & MA_RESOURCE_MANAGER_FLAG_NO_THREADING) != 0) {
-        pResourceManager->config.flags |= MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING;
-
-        /* We cannot allow job threads when MA_RESOURCE_MANAGER_FLAG_NO_THREADING has been set. This is an invalid use case. */
-        if (pResourceManager->config.jobThreadCount > 0) {
-            return MA_INVALID_ARGS;
-        }
-    }
-
-    /* Job queue. */
-    jobQueueConfig.capacity = pResourceManager->config.jobQueueCapacity;
-    jobQueueConfig.flags    = 0;
-    if ((pResourceManager->config.flags & MA_RESOURCE_MANAGER_FLAG_NON_BLOCKING) != 0) {
-        if (pResourceManager->config.jobThreadCount > 0) {
-            return MA_INVALID_ARGS; /* Non-blocking mode is only valid for self-managed job threads. */
-        }
-
-        jobQueueConfig.flags |= MA_JOB_QUEUE_FLAG_NON_BLOCKING;
-    }
-
-    result = ma_job_queue_init(&jobQueueConfig, &pResourceManager->config.allocationCallbacks, &pResourceManager->jobQueue);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-
-    /* Custom decoding backends. */
-    if (pConfig->ppCustomDecodingBackendVTables != NULL && pConfig->customDecodingBackendCount > 0) {
-        size_t sizeInBytes = sizeof(*pResourceManager->config.ppCustomDecodingBackendVTables) * pConfig->customDecodingBackendCount;
-
-        pResourceManager->config.ppCustomDecodingBackendVTables = (ma_decoding_backend_vtable**)ma_malloc(sizeInBytes, &pResourceManager->config.allocationCallbacks);
-        if (pResourceManager->config.ppCustomDecodingBackendVTables == NULL) {
-            ma_job_queue_uninit(&pResourceManager->jobQueue, &pResourceManager->config.allocationCallbacks);
-            return MA_OUT_OF_MEMORY;
-        }
-
-        MA_COPY_MEMORY(pResourceManager->config.ppCustomDecodingBackendVTables, pConfig->ppCustomDecodingBackendVTables, sizeInBytes);
-
-        pResourceManager->config.customDecodingBackendCount     = pConfig->customDecodingBackendCount;
-        pResourceManager->config.pCustomDecodingBackendUserData = pConfig->pCustomDecodingBackendUserData;
-    }
-
-
-
-    /* Here is where we initialize our threading stuff. We don't do this if we don't support threading. */
-    if (ma_resource_manager_is_threading_enabled(pResourceManager)) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_uint32 iJobThread;
-
-            /* Data buffer lock. */
-            result = ma_mutex_init(&pResourceManager->dataBufferBSTLock);
-            if (result != MA_SUCCESS) {
-                ma_job_queue_uninit(&pResourceManager->jobQueue, &pResourceManager->config.allocationCallbacks);
-                return result;
-            }
-
-            /* Create the job threads last to ensure the threads has access to valid data. */
-            for (iJobThread = 0; iJobThread < pResourceManager->config.jobThreadCount; iJobThread += 1) {
-                result = ma_thread_create(&pResourceManager->jobThreads[iJobThread], ma_thread_priority_normal, pResourceManager->config.jobThreadStackSize, ma_resource_manager_job_thread, pResourceManager, &pResourceManager->config.allocationCallbacks);
-                if (result != MA_SUCCESS) {
-                    ma_mutex_uninit(&pResourceManager->dataBufferBSTLock);
-                    ma_job_queue_uninit(&pResourceManager->jobQueue, &pResourceManager->config.allocationCallbacks);
-                    return result;
-                }
-            }
-        }
-        #else
-        {
-            /* Threading is disabled at compile time. We should never get here because validation checks should have already been performed. */
-            MA_ASSERT(MA_FALSE);
-        }
-        #endif
-    }
-
-    return MA_SUCCESS;
-}
-
-
-static void ma_resource_manager_delete_all_data_buffer_nodes(ma_resource_manager* pResourceManager)
-{
-    MA_ASSERT(pResourceManager);
-
-    /* If everything was done properly, there shouldn't be any active data buffers. */
-    while (pResourceManager->pRootDataBufferNode != NULL) {
-        ma_resource_manager_data_buffer_node* pDataBufferNode = pResourceManager->pRootDataBufferNode;
-        ma_resource_manager_data_buffer_node_remove(pResourceManager, pDataBufferNode);
-
-        /* The data buffer has been removed from the BST, so now we need to free it's data. */
-        ma_resource_manager_data_buffer_node_free(pResourceManager, pDataBufferNode);
-    }
-}
-
-MA_API void ma_resource_manager_uninit(ma_resource_manager* pResourceManager)
-{
-    if (pResourceManager == NULL) {
-        return;
-    }
-
-    /*
-    Job threads need to be killed first. To do this we need to post a quit message to the message queue and then wait for the thread. The quit message will never be removed from the
-    queue which means it will never not be returned after being encounted for the first time which means all threads will eventually receive it.
-    */
-    ma_resource_manager_post_job_quit(pResourceManager);
-
-    /* Wait for every job to finish before continuing to ensure nothing is sill trying to access any of our objects below. */
-    if (ma_resource_manager_is_threading_enabled(pResourceManager)) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_uint32 iJobThread;
-
-            for (iJobThread = 0; iJobThread < pResourceManager->config.jobThreadCount; iJobThread += 1) {
-                ma_thread_wait(&pResourceManager->jobThreads[iJobThread]);
-            }
-        }
-        #else
-        {
-            MA_ASSERT(MA_FALSE);    /* Should never hit this. */
-        }
-        #endif
-    }
-
-    /* At this point the thread should have returned and no other thread should be accessing our data. We can now delete all data buffers. */
-    ma_resource_manager_delete_all_data_buffer_nodes(pResourceManager);
-
-    /* The job queue is no longer needed. */
-    ma_job_queue_uninit(&pResourceManager->jobQueue, &pResourceManager->config.allocationCallbacks);
-
-    /* We're no longer doing anything with data buffers so the lock can now be uninitialized. */
-    if (ma_resource_manager_is_threading_enabled(pResourceManager)) {
-        #ifndef MA_NO_THREADING
-        {
-            ma_mutex_uninit(&pResourceManager->dataBufferBSTLock);
-        }
-        #else
-        {
-            MA_ASSERT(MA_FALSE);    /* Should never hit this. */
-        }
-        #endif
-    }
-
-    ma_free(pResourceManager->config.ppCustomDecodingBackendVTables, &pResourceManager->config.allocationCallbacks);
-
-    if (pResourceManager->config.pLog == &pResourceManager->log) {
-        ma_log_uninit(&pResourceManager->log);
-    }
-}
-
-MA_API ma_log* ma_resource_manager_get_log(ma_resource_manager* pResourceManager)
-{
-    if (pResourceManager == NULL) {
-        return NULL;
-    }
-
-    return pResourceManager->config.pLog;
-}
-
-
-
-MA_API ma_resource_manager_data_source_config ma_resource_manager_data_source_config_init(void)
-{
-    ma_resource_manager_data_source_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.rangeBegInPCMFrames     = MA_DATA_SOURCE_DEFAULT_RANGE_BEG;
-    config.rangeEndInPCMFrames     = MA_DATA_SOURCE_DEFAULT_RANGE_END;
-    config.loopPointBegInPCMFrames = MA_DATA_SOURCE_DEFAULT_LOOP_POINT_BEG;
-    config.loopPointEndInPCMFrames = MA_DATA_SOURCE_DEFAULT_LOOP_POINT_END;
-    config.isLooping               = MA_FALSE;
-
-    return config;
-}
-
-
-static ma_decoder_config ma_resource_manager__init_decoder_config(ma_resource_manager* pResourceManager)
-{
-    ma_decoder_config config;
-
-    config = ma_decoder_config_init(pResourceManager->config.decodedFormat, pResourceManager->config.decodedChannels, pResourceManager->config.decodedSampleRate);
-    config.allocationCallbacks    = pResourceManager->config.allocationCallbacks;
-    config.ppCustomBackendVTables = pResourceManager->config.ppCustomDecodingBackendVTables;
-    config.customBackendCount     = pResourceManager->config.customDecodingBackendCount;
-    config.pCustomBackendUserData = pResourceManager->config.pCustomDecodingBackendUserData;
-
-    return config;
-}
-
-static ma_result ma_resource_manager__init_decoder(ma_resource_manager* pResourceManager, const char* pFilePath, const wchar_t* pFilePathW, ma_decoder* pDecoder)
-{
-    ma_result result;
-    ma_decoder_config config;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pFilePath        != NULL || pFilePathW != NULL);
-    MA_ASSERT(pDecoder         != NULL);
-
-    config = ma_resource_manager__init_decoder_config(pResourceManager);
-
-    if (pFilePath != NULL) {
-        result = ma_decoder_init_vfs(pResourceManager->config.pVFS, pFilePath, &config, pDecoder);
-        if (result != MA_SUCCESS) {
-            ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_WARNING, "Failed to load file \"%s\". %s.\n", pFilePath, ma_result_description(result));
-            return result;
-        }
-    } else {
-        result = ma_decoder_init_vfs_w(pResourceManager->config.pVFS, pFilePathW, &config, pDecoder);
-        if (result != MA_SUCCESS) {
-            #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(_MSC_VER)
-                ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_WARNING, "Failed to load file \"%ls\". %s.\n", pFilePathW, ma_result_description(result));
-            #endif
-            return result;
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_bool32 ma_resource_manager_data_buffer_has_connector(ma_resource_manager_data_buffer* pDataBuffer)
-{
-    return ma_atomic_bool32_get(&pDataBuffer->isConnectorInitialized);
-}
-
-static ma_data_source* ma_resource_manager_data_buffer_get_connector(ma_resource_manager_data_buffer* pDataBuffer)
-{
-    if (ma_resource_manager_data_buffer_has_connector(pDataBuffer) == MA_FALSE) {
-        return NULL;    /* Connector not yet initialized. */
-    }
-
-    switch (pDataBuffer->pNode->data.type)
-    {
-        case ma_resource_manager_data_supply_type_encoded:       return &pDataBuffer->connector.decoder;
-        case ma_resource_manager_data_supply_type_decoded:       return &pDataBuffer->connector.buffer;
-        case ma_resource_manager_data_supply_type_decoded_paged: return &pDataBuffer->connector.pagedBuffer;
-
-        case ma_resource_manager_data_supply_type_unknown:
-        default:
-        {
-            ma_log_postf(ma_resource_manager_get_log(pDataBuffer->pResourceManager), MA_LOG_LEVEL_ERROR, "Failed to retrieve data buffer connector. Unknown data supply type.\n");
-            return NULL;
-        };
-    };
-}
-
-static ma_result ma_resource_manager_data_buffer_init_connector(ma_resource_manager_data_buffer* pDataBuffer, const ma_resource_manager_data_source_config* pConfig, ma_async_notification* pInitNotification, ma_fence* pInitFence)
-{
-    ma_result result;
-
-    MA_ASSERT(pDataBuffer != NULL);
-    MA_ASSERT(pConfig     != NULL);
-    MA_ASSERT(ma_resource_manager_data_buffer_has_connector(pDataBuffer) == MA_FALSE);
-
-    /* The underlying data buffer must be initialized before we'll be able to know how to initialize the backend. */
-    result = ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode);
-    if (result != MA_SUCCESS && result != MA_BUSY) {
-        return result;  /* The data buffer is in an erroneous state. */
-    }
-
-    /*
-    We need to initialize either a ma_decoder or an ma_audio_buffer depending on whether or not the backing data is encoded or decoded. These act as the
-    "instance" to the data and are used to form the connection between underlying data buffer and the data source. If the data buffer is decoded, we can use
-    an ma_audio_buffer. This enables us to use memory mapping when mixing which saves us a bit of data movement overhead.
-    */
-    switch (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode))
-    {
-        case ma_resource_manager_data_supply_type_encoded:          /* Connector is a decoder. */
-        {
-            ma_decoder_config config;
-            config = ma_resource_manager__init_decoder_config(pDataBuffer->pResourceManager);
-            result = ma_decoder_init_memory(pDataBuffer->pNode->data.backend.encoded.pData, pDataBuffer->pNode->data.backend.encoded.sizeInBytes, &config, &pDataBuffer->connector.decoder);
-        } break;
-
-        case ma_resource_manager_data_supply_type_decoded:          /* Connector is an audio buffer. */
-        {
-            ma_audio_buffer_config config;
-            config = ma_audio_buffer_config_init(pDataBuffer->pNode->data.backend.decoded.format, pDataBuffer->pNode->data.backend.decoded.channels, pDataBuffer->pNode->data.backend.decoded.totalFrameCount, pDataBuffer->pNode->data.backend.decoded.pData, NULL);
-            result = ma_audio_buffer_init(&config, &pDataBuffer->connector.buffer);
-        } break;
-
-        case ma_resource_manager_data_supply_type_decoded_paged:    /* Connector is a paged audio buffer. */
-        {
-            ma_paged_audio_buffer_config config;
-            config = ma_paged_audio_buffer_config_init(&pDataBuffer->pNode->data.backend.decodedPaged.data);
-            result = ma_paged_audio_buffer_init(&config, &pDataBuffer->connector.pagedBuffer);
-        } break;
-
-        case ma_resource_manager_data_supply_type_unknown:
-        default:
-        {
-            /* Unknown data supply type. Should never happen. Need to post an error here. */
-            return MA_INVALID_ARGS;
-        };
-    }
-
-    /*
-    Initialization of the connector is when we can fire the init notification. This will give the application access to
-    the format/channels/rate of the data source.
-    */
-    if (result == MA_SUCCESS) {
-        /*
-        The resource manager supports the ability to set the range and loop settings via a config at
-        initialization time. This results in an case where the ranges could be set explicitly via
-        ma_data_source_set_*() before we get to this point here. If this happens, we'll end up
-        hitting a case where we just override those settings which results in what feels like a bug.
-
-        To address this we only change the relevant properties if they're not equal to defaults. If
-        they're equal to defaults there's no need to change them anyway. If they're *not* set to the
-        default values, we can assume the user has set the range and loop settings via the config. If
-        they're doing their own calls to ma_data_source_set_*() in addition to setting them via the
-        config, that's entirely on the caller and any synchronization issue becomes their problem.
-        */
-        if (pConfig->rangeBegInPCMFrames != MA_DATA_SOURCE_DEFAULT_RANGE_BEG || pConfig->rangeEndInPCMFrames != MA_DATA_SOURCE_DEFAULT_RANGE_END) {
-            ma_data_source_set_range_in_pcm_frames(pDataBuffer, pConfig->rangeBegInPCMFrames, pConfig->rangeEndInPCMFrames);
-        }
-
-        if (pConfig->loopPointBegInPCMFrames != MA_DATA_SOURCE_DEFAULT_LOOP_POINT_BEG || pConfig->loopPointEndInPCMFrames != MA_DATA_SOURCE_DEFAULT_LOOP_POINT_END) {
-            ma_data_source_set_loop_point_in_pcm_frames(pDataBuffer, pConfig->loopPointBegInPCMFrames, pConfig->loopPointEndInPCMFrames);
-        }
-
-        if (pConfig->isLooping != MA_FALSE) {
-            ma_data_source_set_looping(pDataBuffer, pConfig->isLooping);
-        }
-
-        ma_atomic_bool32_set(&pDataBuffer->isConnectorInitialized, MA_TRUE);
-
-        if (pInitNotification != NULL) {
-            ma_async_notification_signal(pInitNotification);
-        }
-
-        if (pInitFence != NULL) {
-            ma_fence_release(pInitFence);
-        }
-    }
-
-    /* At this point the backend should be initialized. We do *not* want to set pDataSource->result here - that needs to be done at a higher level to ensure it's done as the last step. */
-    return result;
-}
-
-static ma_result ma_resource_manager_data_buffer_uninit_connector(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer* pDataBuffer)
-{
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBuffer      != NULL);
-
-    (void)pResourceManager;
-
-    switch (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode))
-    {
-        case ma_resource_manager_data_supply_type_encoded:          /* Connector is a decoder. */
-        {
-            ma_decoder_uninit(&pDataBuffer->connector.decoder);
-        } break;
-
-        case ma_resource_manager_data_supply_type_decoded:          /* Connector is an audio buffer. */
-        {
-            ma_audio_buffer_uninit(&pDataBuffer->connector.buffer);
-        } break;
-
-        case ma_resource_manager_data_supply_type_decoded_paged:    /* Connector is a paged audio buffer. */
-        {
-            ma_paged_audio_buffer_uninit(&pDataBuffer->connector.pagedBuffer);
-        } break;
-
-        case ma_resource_manager_data_supply_type_unknown:
-        default:
-        {
-            /* Unknown data supply type. Should never happen. Need to post an error here. */
-            return MA_INVALID_ARGS;
-        };
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_uint32 ma_resource_manager_data_buffer_node_next_execution_order(ma_resource_manager_data_buffer_node* pDataBufferNode)
-{
-    MA_ASSERT(pDataBufferNode != NULL);
-    return ma_atomic_fetch_add_32(&pDataBufferNode->executionCounter, 1);
-}
-
-static ma_result ma_resource_manager_data_buffer_node_init_supply_encoded(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode, const char* pFilePath, const wchar_t* pFilePathW)
-{
-    ma_result result;
-    size_t dataSizeInBytes;
-    void* pData;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-    MA_ASSERT(pFilePath != NULL || pFilePathW != NULL);
-
-    result = ma_vfs_open_and_read_file_ex(pResourceManager->config.pVFS, pFilePath, pFilePathW, &pData, &dataSizeInBytes, &pResourceManager->config.allocationCallbacks);
-    if (result != MA_SUCCESS) {
-        if (pFilePath != NULL) {
-            ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_WARNING, "Failed to load file \"%s\". %s.\n", pFilePath, ma_result_description(result));
-        } else {
-            #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(_MSC_VER)
-                ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_WARNING, "Failed to load file \"%ls\". %s.\n", pFilePathW, ma_result_description(result));
-            #endif
-        }
-
-        return result;
-    }
-
-    pDataBufferNode->data.backend.encoded.pData       = pData;
-    pDataBufferNode->data.backend.encoded.sizeInBytes = dataSizeInBytes;
-    ma_resource_manager_data_buffer_node_set_data_supply_type(pDataBufferNode, ma_resource_manager_data_supply_type_encoded);  /* <-- Must be set last. */
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_resource_manager_data_buffer_node_init_supply_decoded(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode, const char* pFilePath, const wchar_t* pFilePathW, ma_uint32 flags, ma_decoder** ppDecoder)
-{
-    ma_result result = MA_SUCCESS;
-    ma_decoder* pDecoder;
-    ma_uint64 totalFrameCount;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-    MA_ASSERT(ppDecoder         != NULL);
-    MA_ASSERT(pFilePath != NULL || pFilePathW != NULL);
-
-    *ppDecoder = NULL;  /* For safety. */
-
-    pDecoder = (ma_decoder*)ma_malloc(sizeof(*pDecoder), &pResourceManager->config.allocationCallbacks);
-    if (pDecoder == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_resource_manager__init_decoder(pResourceManager, pFilePath, pFilePathW, pDecoder);
-    if (result != MA_SUCCESS) {
-        ma_free(pDecoder, &pResourceManager->config.allocationCallbacks);
-        return result;
-    }
-
-    /*
-    At this point we have the decoder and we now need to initialize the data supply. This will
-    be either a decoded buffer, or a decoded paged buffer. A regular buffer is just one big heap
-    allocated buffer, whereas a paged buffer is a linked list of paged-sized buffers. The latter
-    is used when the length of a sound is unknown until a full decode has been performed.
-    */
-    if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_UNKNOWN_LENGTH) == 0) {
-        result = ma_decoder_get_length_in_pcm_frames(pDecoder, &totalFrameCount);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    } else {
-        totalFrameCount = 0;
-    }
-
-    if (totalFrameCount > 0) {
-        /* It's a known length. The data supply is a regular decoded buffer. */
-        ma_uint64 dataSizeInBytes;
-        void* pData;
-
-        dataSizeInBytes = totalFrameCount * ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
-        if (dataSizeInBytes > MA_SIZE_MAX) {
-            ma_decoder_uninit(pDecoder);
-            ma_free(pDecoder, &pResourceManager->config.allocationCallbacks);
-            return MA_TOO_BIG;
-        }
-
-        pData = ma_malloc((size_t)dataSizeInBytes, &pResourceManager->config.allocationCallbacks);
-        if (pData == NULL) {
-            ma_decoder_uninit(pDecoder);
-            ma_free(pDecoder, &pResourceManager->config.allocationCallbacks);
-            return MA_OUT_OF_MEMORY;
-        }
-
-        /* The buffer needs to be initialized to silence in case the caller reads from it. */
-        ma_silence_pcm_frames(pData, totalFrameCount, pDecoder->outputFormat, pDecoder->outputChannels);
-
-        /* Data has been allocated and the data supply can now be initialized. */
-        pDataBufferNode->data.backend.decoded.pData             = pData;
-        pDataBufferNode->data.backend.decoded.totalFrameCount   = totalFrameCount;
-        pDataBufferNode->data.backend.decoded.format            = pDecoder->outputFormat;
-        pDataBufferNode->data.backend.decoded.channels          = pDecoder->outputChannels;
-        pDataBufferNode->data.backend.decoded.sampleRate        = pDecoder->outputSampleRate;
-        pDataBufferNode->data.backend.decoded.decodedFrameCount = 0;
-        ma_resource_manager_data_buffer_node_set_data_supply_type(pDataBufferNode, ma_resource_manager_data_supply_type_decoded);  /* <-- Must be set last. */
-    } else {
-        /*
-        It's an unknown length. The data supply is a paged decoded buffer. Setting this up is
-        actually easier than the non-paged decoded buffer because we just need to initialize
-        a ma_paged_audio_buffer object.
-        */
-        result = ma_paged_audio_buffer_data_init(pDecoder->outputFormat, pDecoder->outputChannels, &pDataBufferNode->data.backend.decodedPaged.data);
-        if (result != MA_SUCCESS) {
-            ma_decoder_uninit(pDecoder);
-            ma_free(pDecoder, &pResourceManager->config.allocationCallbacks);
-            return result;
-        }
-
-        pDataBufferNode->data.backend.decodedPaged.sampleRate        = pDecoder->outputSampleRate;
-        pDataBufferNode->data.backend.decodedPaged.decodedFrameCount = 0;
-        ma_resource_manager_data_buffer_node_set_data_supply_type(pDataBufferNode, ma_resource_manager_data_supply_type_decoded_paged);  /* <-- Must be set last. */
-    }
-
-    *ppDecoder = pDecoder;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_resource_manager_data_buffer_node_decode_next_page(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode, ma_decoder* pDecoder)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 pageSizeInFrames;
-    ma_uint64 framesToTryReading;
-    ma_uint64 framesRead;
-
-    MA_ASSERT(pResourceManager != NULL);
-    MA_ASSERT(pDataBufferNode  != NULL);
-    MA_ASSERT(pDecoder         != NULL);
-
-    /* We need to know the size of a page in frames to know how many frames to decode. */
-    pageSizeInFrames = MA_RESOURCE_MANAGER_PAGE_SIZE_IN_MILLISECONDS * (pDecoder->outputSampleRate/1000);
-    framesToTryReading = pageSizeInFrames;
-
-    /*
-    Here is where we do the decoding of the next page. We'll run a slightly different path depending
-    on whether or not we're using a flat or paged buffer because the allocation of the page differs
-    between the two. For a flat buffer it's an offset to an already-allocated buffer. For a paged
-    buffer, we need to allocate a new page and attach it to the linked list.
-    */
-    switch (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBufferNode))
-    {
-        case ma_resource_manager_data_supply_type_decoded:
-        {
-            /* The destination buffer is an offset to the existing buffer. Don't read more than we originally retrieved when we first initialized the decoder. */
-            void* pDst;
-            ma_uint64 framesRemaining = pDataBufferNode->data.backend.decoded.totalFrameCount - pDataBufferNode->data.backend.decoded.decodedFrameCount;
-            if (framesToTryReading > framesRemaining) {
-                framesToTryReading = framesRemaining;
-            }
-
-            if (framesToTryReading > 0) {
-                pDst = ma_offset_ptr(
-                    pDataBufferNode->data.backend.decoded.pData,
-                    pDataBufferNode->data.backend.decoded.decodedFrameCount * ma_get_bytes_per_frame(pDataBufferNode->data.backend.decoded.format, pDataBufferNode->data.backend.decoded.channels)
-                );
-                MA_ASSERT(pDst != NULL);
-
-                result = ma_decoder_read_pcm_frames(pDecoder, pDst, framesToTryReading, &framesRead);
-                if (framesRead > 0) {
-                    pDataBufferNode->data.backend.decoded.decodedFrameCount += framesRead;
-                }
-            } else {
-                framesRead = 0;
-            }
-        } break;
-
-        case ma_resource_manager_data_supply_type_decoded_paged:
-        {
-            /* The destination buffer is a freshly allocated page. */
-            ma_paged_audio_buffer_page* pPage;
-
-            result = ma_paged_audio_buffer_data_allocate_page(&pDataBufferNode->data.backend.decodedPaged.data, framesToTryReading, NULL, &pResourceManager->config.allocationCallbacks, &pPage);
-            if (result != MA_SUCCESS) {
-                return result;
-            }
-
-            result = ma_decoder_read_pcm_frames(pDecoder, pPage->pAudioData, framesToTryReading, &framesRead);
-            if (framesRead > 0) {
-                pPage->sizeInFrames = framesRead;
-
-                result = ma_paged_audio_buffer_data_append_page(&pDataBufferNode->data.backend.decodedPaged.data, pPage);
-                if (result == MA_SUCCESS) {
-                    pDataBufferNode->data.backend.decodedPaged.decodedFrameCount += framesRead;
-                } else {
-                    /* Failed to append the page. Just abort and set the status to MA_AT_END. */
-                    ma_paged_audio_buffer_data_free_page(&pDataBufferNode->data.backend.decodedPaged.data, pPage, &pResourceManager->config.allocationCallbacks);
-                    result = MA_AT_END;
-                }
-            } else {
-                /* No frames were read. Free the page and just set the status to MA_AT_END. */
-                ma_paged_audio_buffer_data_free_page(&pDataBufferNode->data.backend.decodedPaged.data, pPage, &pResourceManager->config.allocationCallbacks);
-                result = MA_AT_END;
-            }
-        } break;
-
-        case ma_resource_manager_data_supply_type_encoded:
-        case ma_resource_manager_data_supply_type_unknown:
-        default:
-        {
-            /* Unexpected data supply type. */
-            ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_ERROR, "Unexpected data supply type (%d) when decoding page.", ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBufferNode));
-            return MA_ERROR;
-        };
-    }
-
-    if (result == MA_SUCCESS && framesRead == 0) {
-        result = MA_AT_END;
-    }
-
-    return result;
-}
-
-static ma_result ma_resource_manager_data_buffer_node_acquire_critical_section(ma_resource_manager* pResourceManager, const char* pFilePath, const wchar_t* pFilePathW, ma_uint32 hashedName32, ma_uint32 flags, const ma_resource_manager_data_supply* pExistingData, ma_fence* pInitFence, ma_fence* pDoneFence, ma_resource_manager_inline_notification* pInitNotification, ma_resource_manager_data_buffer_node** ppDataBufferNode)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager_data_buffer_node* pDataBufferNode = NULL;
-    ma_resource_manager_data_buffer_node* pInsertPoint;
-
-    if (ppDataBufferNode != NULL) {
-        *ppDataBufferNode = NULL;
-    }
-
-    result = ma_resource_manager_data_buffer_node_insert_point(pResourceManager, hashedName32, &pInsertPoint);
-    if (result == MA_ALREADY_EXISTS) {
-        /* The node already exists. We just need to increment the reference count. */
-        pDataBufferNode = pInsertPoint;
-
-        result = ma_resource_manager_data_buffer_node_increment_ref(pResourceManager, pDataBufferNode, NULL);
-        if (result != MA_SUCCESS) {
-            return result;  /* Should never happen. Failed to increment the reference count. */
-        }
-
-        result = MA_ALREADY_EXISTS;
-        goto done;
-    } else {
-        /*
-        The node does not already exist. We need to post a LOAD_DATA_BUFFER_NODE job here. This
-        needs to be done inside the critical section to ensure an uninitialization of the node
-        does not occur before initialization on another thread.
-        */
-        pDataBufferNode = (ma_resource_manager_data_buffer_node*)ma_malloc(sizeof(*pDataBufferNode), &pResourceManager->config.allocationCallbacks);
-        if (pDataBufferNode == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-
-        MA_ZERO_OBJECT(pDataBufferNode);
-        pDataBufferNode->hashedName32 = hashedName32;
-        pDataBufferNode->refCount     = 1;        /* Always set to 1 by default (this is our first reference). */
-
-        if (pExistingData == NULL) {
-            pDataBufferNode->data.type    = ma_resource_manager_data_supply_type_unknown;    /* <-- We won't know this until we start decoding. */
-            pDataBufferNode->result       = MA_BUSY;  /* Must be set to MA_BUSY before we leave the critical section, so might as well do it now. */
-            pDataBufferNode->isDataOwnedByResourceManager = MA_TRUE;
-        } else {
-            pDataBufferNode->data         = *pExistingData;
-            pDataBufferNode->result       = MA_SUCCESS;   /* Not loading asynchronously, so just set the status */
-            pDataBufferNode->isDataOwnedByResourceManager = MA_FALSE;
-        }
-
-        result = ma_resource_manager_data_buffer_node_insert_at(pResourceManager, pDataBufferNode, pInsertPoint);
-        if (result != MA_SUCCESS) {
-            ma_free(pDataBufferNode, &pResourceManager->config.allocationCallbacks);
-            return result;  /* Should never happen. Failed to insert the data buffer into the BST. */
-        }
-
-        /*
-        Here is where we'll post the job, but only if we're loading asynchronously. If we're
-        loading synchronously we'll defer loading to a later stage, outside of the critical
-        section.
-        */
-        if (pDataBufferNode->isDataOwnedByResourceManager && (flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC) != 0) {
-            /* Loading asynchronously. Post the job. */
-            ma_job job;
-            char* pFilePathCopy = NULL;
-            wchar_t* pFilePathWCopy = NULL;
-
-            /* We need a copy of the file path. We should probably make this more efficient, but for now we'll do a transient memory allocation. */
-            if (pFilePath != NULL) {
-                pFilePathCopy = ma_copy_string(pFilePath, &pResourceManager->config.allocationCallbacks);
-            } else {
-                pFilePathWCopy = ma_copy_string_w(pFilePathW, &pResourceManager->config.allocationCallbacks);
-            }
-
-            if (pFilePathCopy == NULL && pFilePathWCopy == NULL) {
-                ma_resource_manager_data_buffer_node_remove(pResourceManager, pDataBufferNode);
-                ma_free(pDataBufferNode, &pResourceManager->config.allocationCallbacks);
-                return MA_OUT_OF_MEMORY;
-            }
-
-            if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                ma_resource_manager_inline_notification_init(pResourceManager, pInitNotification);
-            }
-
-            /* Acquire init and done fences before posting the job. These will be unacquired by the job thread. */
-            if (pInitFence != NULL) { ma_fence_acquire(pInitFence); }
-            if (pDoneFence != NULL) { ma_fence_acquire(pDoneFence); }
-
-            /* We now have everything we need to post the job to the job thread. */
-            job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER_NODE);
-            job.order = ma_resource_manager_data_buffer_node_next_execution_order(pDataBufferNode);
-            job.data.resourceManager.loadDataBufferNode.pResourceManager  = pResourceManager;
-            job.data.resourceManager.loadDataBufferNode.pDataBufferNode   = pDataBufferNode;
-            job.data.resourceManager.loadDataBufferNode.pFilePath         = pFilePathCopy;
-            job.data.resourceManager.loadDataBufferNode.pFilePathW        = pFilePathWCopy;
-            job.data.resourceManager.loadDataBufferNode.flags             = flags;
-            job.data.resourceManager.loadDataBufferNode.pInitNotification = ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) ? pInitNotification : NULL;
-            job.data.resourceManager.loadDataBufferNode.pDoneNotification = NULL;
-            job.data.resourceManager.loadDataBufferNode.pInitFence        = pInitFence;
-            job.data.resourceManager.loadDataBufferNode.pDoneFence        = pDoneFence;
-
-            if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                result = ma_job_process(&job);
-            } else {
-                result = ma_resource_manager_post_job(pResourceManager, &job);
-            }
-
-            if (result != MA_SUCCESS) {
-                /* Failed to post job. Probably ran out of memory. */
-                ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_ERROR, "Failed to post MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER_NODE job. %s.\n", ma_result_description(result));
-
-                /*
-                Fences were acquired before posting the job, but since the job was not able to
-                be posted, we need to make sure we release them so nothing gets stuck waiting.
-                */
-                if (pInitFence != NULL) { ma_fence_release(pInitFence); }
-                if (pDoneFence != NULL) { ma_fence_release(pDoneFence); }
-
-                if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                    ma_resource_manager_inline_notification_uninit(pInitNotification);
-                } else {
-                    /* These will have been freed by the job thread, but with WAIT_INIT they will already have happend sinced the job has already been handled. */
-                    ma_free(pFilePathCopy,  &pResourceManager->config.allocationCallbacks);
-                    ma_free(pFilePathWCopy, &pResourceManager->config.allocationCallbacks);
-                }
-
-                ma_resource_manager_data_buffer_node_remove(pResourceManager, pDataBufferNode);
-                ma_free(pDataBufferNode, &pResourceManager->config.allocationCallbacks);
-
-                return result;
-            }
-        }
-    }
-
-done:
-    if (ppDataBufferNode != NULL) {
-        *ppDataBufferNode = pDataBufferNode;
-    }
-
-    return result;
-}
-
-static ma_result ma_resource_manager_data_buffer_node_acquire(ma_resource_manager* pResourceManager, const char* pFilePath, const wchar_t* pFilePathW, ma_uint32 hashedName32, ma_uint32 flags, const ma_resource_manager_data_supply* pExistingData, ma_fence* pInitFence, ma_fence* pDoneFence, ma_resource_manager_data_buffer_node** ppDataBufferNode)
-{
-    ma_result result = MA_SUCCESS;
-    ma_bool32 nodeAlreadyExists = MA_FALSE;
-    ma_resource_manager_data_buffer_node* pDataBufferNode = NULL;
-    ma_resource_manager_inline_notification initNotification;   /* Used when the WAIT_INIT flag is set. */
-
-    if (ppDataBufferNode != NULL) {
-        *ppDataBufferNode = NULL;   /* Safety. */
-    }
-
-    if (pResourceManager == NULL || (pFilePath == NULL && pFilePathW == NULL && hashedName32 == 0)) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If we're specifying existing data, it must be valid. */
-    if (pExistingData != NULL && pExistingData->type == ma_resource_manager_data_supply_type_unknown) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If we don't support threading, remove the ASYNC flag to make the rest of this a bit simpler. */
-    if (ma_resource_manager_is_threading_enabled(pResourceManager) == MA_FALSE) {
-        flags &= ~MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC;
-    }
-
-    if (hashedName32 == 0) {
-        if (pFilePath != NULL) {
-            hashedName32 = ma_hash_string_32(pFilePath);
-        } else {
-            hashedName32 = ma_hash_string_w_32(pFilePathW);
-        }
-    }
-
-    /*
-    Here is where we either increment the node's reference count or allocate a new one and add it
-    to the BST. When allocating a new node, we need to make sure the LOAD_DATA_BUFFER_NODE job is
-    posted inside the critical section just in case the caller immediately uninitializes the node
-    as this will ensure the FREE_DATA_BUFFER_NODE job is given an execution order such that the
-    node is not uninitialized before initialization.
-    */
-    ma_resource_manager_data_buffer_bst_lock(pResourceManager);
-    {
-        result = ma_resource_manager_data_buffer_node_acquire_critical_section(pResourceManager, pFilePath, pFilePathW, hashedName32, flags, pExistingData, pInitFence, pDoneFence, &initNotification, &pDataBufferNode);
-    }
-    ma_resource_manager_data_buffer_bst_unlock(pResourceManager);
-
-    if (result == MA_ALREADY_EXISTS) {
-        nodeAlreadyExists = MA_TRUE;
-        result = MA_SUCCESS;
-    } else {
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    /*
-    If we're loading synchronously, we'll need to load everything now. When loading asynchronously,
-    a job will have been posted inside the BST critical section so that an uninitialization can be
-    allocated an appropriate execution order thereby preventing it from being uninitialized before
-    the node is initialized by the decoding thread(s).
-    */
-    if (nodeAlreadyExists == MA_FALSE) {    /* Don't need to try loading anything if the node already exists. */
-        if (pFilePath == NULL && pFilePathW == NULL) {
-            /*
-            If this path is hit, it means a buffer is being copied (i.e. initialized from only the
-            hashed name), but that node has been freed in the meantime, probably from some other
-            thread. This is an invalid operation.
-            */
-            ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_WARNING, "Cloning data buffer node failed because the source node was released. The source node must remain valid until the cloning has completed.\n");
-            result = MA_INVALID_OPERATION;
-            goto done;
-        }
-
-        if (pDataBufferNode->isDataOwnedByResourceManager) {
-            if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC) == 0) {
-                /* Loading synchronously. Load the sound in it's entirety here. */
-                if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE) == 0) {
-                    /* No decoding. This is the simple case - just store the file contents in memory. */
-                    result = ma_resource_manager_data_buffer_node_init_supply_encoded(pResourceManager, pDataBufferNode, pFilePath, pFilePathW);
-                    if (result != MA_SUCCESS) {
-                        goto done;
-                    }
-                } else {
-                    /* Decoding. We do this the same way as we do when loading asynchronously. */
-                    ma_decoder* pDecoder;
-                    result = ma_resource_manager_data_buffer_node_init_supply_decoded(pResourceManager, pDataBufferNode, pFilePath, pFilePathW, flags, &pDecoder);
-                    if (result != MA_SUCCESS) {
-                        goto done;
-                    }
-
-                    /* We have the decoder, now decode page by page just like we do when loading asynchronously. */
-                    for (;;) {
-                        /* Decode next page. */
-                        result = ma_resource_manager_data_buffer_node_decode_next_page(pResourceManager, pDataBufferNode, pDecoder);
-                        if (result != MA_SUCCESS) {
-                            break;  /* Will return MA_AT_END when the last page has been decoded. */
-                        }
-                    }
-
-                    /* Reaching the end needs to be considered successful. */
-                    if (result == MA_AT_END) {
-                        result  = MA_SUCCESS;
-                    }
-
-                    /*
-                    At this point the data buffer is either fully decoded or some error occurred. Either
-                    way, the decoder is no longer necessary.
-                    */
-                    ma_decoder_uninit(pDecoder);
-                    ma_free(pDecoder, &pResourceManager->config.allocationCallbacks);
-                }
-
-                /* Getting here means we were successful. Make sure the status of the node is updated accordingly. */
-                ma_atomic_exchange_i32(&pDataBufferNode->result, result);
-            } else {
-                /* Loading asynchronously. We may need to wait for initialization. */
-                if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                    ma_resource_manager_inline_notification_wait(&initNotification);
-                }
-            }
-        } else {
-            /* The data is not managed by the resource manager so there's nothing else to do. */
-            MA_ASSERT(pExistingData != NULL);
-        }
-    }
-
-done:
-    /* If we failed to initialize the data buffer we need to free it. */
-    if (result != MA_SUCCESS) {
-        if (nodeAlreadyExists == MA_FALSE) {
-            ma_resource_manager_data_buffer_node_remove(pResourceManager, pDataBufferNode);
-            ma_free(pDataBufferNode, &pResourceManager->config.allocationCallbacks);
-        }
-    }
-
-    /*
-    The init notification needs to be uninitialized. This will be used if the node does not already
-    exist, and we've specified ASYNC | WAIT_INIT.
-    */
-    if (nodeAlreadyExists == MA_FALSE && pDataBufferNode->isDataOwnedByResourceManager && (flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC) != 0) {
-        if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-            ma_resource_manager_inline_notification_uninit(&initNotification);
-        }
-    }
-
-    if (ppDataBufferNode != NULL) {
-        *ppDataBufferNode = pDataBufferNode;
-    }
-
-    return result;
-}
-
-static ma_result ma_resource_manager_data_buffer_node_unacquire(ma_resource_manager* pResourceManager, ma_resource_manager_data_buffer_node* pDataBufferNode, const char* pName, const wchar_t* pNameW)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint32 refCount = 0xFFFFFFFF; /* The new reference count of the node after decrementing. Initialize to non-0 to be safe we don't fall into the freeing path. */
-    ma_uint32 hashedName32 = 0;
-
-    if (pResourceManager == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pDataBufferNode == NULL) {
-        if (pName == NULL && pNameW == NULL) {
-            return MA_INVALID_ARGS;
-        }
-
-        if (pName != NULL) {
-            hashedName32 = ma_hash_string_32(pName);
-        } else {
-            hashedName32 = ma_hash_string_w_32(pNameW);
-        }
-    }
-
-    /*
-    The first thing to do is decrement the reference counter of the node. Then, if the reference
-    count is zero, we need to free the node. If the node is still in the process of loading, we'll
-    need to post a job to the job queue to free the node. Otherwise we'll just do it here.
-    */
-    ma_resource_manager_data_buffer_bst_lock(pResourceManager);
-    {
-        /* Might need to find the node. Must be done inside the critical section. */
-        if (pDataBufferNode == NULL) {
-            result = ma_resource_manager_data_buffer_node_search(pResourceManager, hashedName32, &pDataBufferNode);
-            if (result != MA_SUCCESS) {
-                goto stage2;    /* Couldn't find the node. */
-            }
-        }
-
-        result = ma_resource_manager_data_buffer_node_decrement_ref(pResourceManager, pDataBufferNode, &refCount);
-        if (result != MA_SUCCESS) {
-            goto stage2;    /* Should never happen. */
-        }
-
-        if (refCount == 0) {
-            result = ma_resource_manager_data_buffer_node_remove(pResourceManager, pDataBufferNode);
-            if (result != MA_SUCCESS) {
-                goto stage2;  /* An error occurred when trying to remove the data buffer. This should never happen. */
-            }
-        }
-    }
-    ma_resource_manager_data_buffer_bst_unlock(pResourceManager);
-
-stage2:
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /*
-    Here is where we need to free the node. We don't want to do this inside the critical section
-    above because we want to keep that as small as possible for multi-threaded efficiency.
-    */
-    if (refCount == 0) {
-        if (ma_resource_manager_data_buffer_node_result(pDataBufferNode) == MA_BUSY) {
-            /* The sound is still loading. We need to delay the freeing of the node to a safe time. */
-            ma_job job;
-
-            /* We need to mark the node as unavailable for the sake of the resource manager worker threads. */
-            ma_atomic_exchange_i32(&pDataBufferNode->result, MA_UNAVAILABLE);
-
-            job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER_NODE);
-            job.order = ma_resource_manager_data_buffer_node_next_execution_order(pDataBufferNode);
-            job.data.resourceManager.freeDataBufferNode.pResourceManager = pResourceManager;
-            job.data.resourceManager.freeDataBufferNode.pDataBufferNode  = pDataBufferNode;
-
-            result = ma_resource_manager_post_job(pResourceManager, &job);
-            if (result != MA_SUCCESS) {
-                ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_ERROR, "Failed to post MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER_NODE job. %s.\n", ma_result_description(result));
-                return result;
-            }
-
-            /* If we don't support threading, process the job queue here. */
-            if (ma_resource_manager_is_threading_enabled(pResourceManager) == MA_FALSE) {
-                while (ma_resource_manager_data_buffer_node_result(pDataBufferNode) == MA_BUSY) {
-                    result = ma_resource_manager_process_next_job(pResourceManager);
-                    if (result == MA_NO_DATA_AVAILABLE || result == MA_CANCELLED) {
-                        result = MA_SUCCESS;
-                        break;
-                    }
-                }
-            } else {
-                /* Threading is enabled. The job queue will deal with the rest of the cleanup from here. */
-            }
-        } else {
-            /* The sound isn't loading so we can just free the node here. */
-            ma_resource_manager_data_buffer_node_free(pResourceManager, pDataBufferNode);
-        }
-    }
-
-    return result;
-}
-
-
-
-static ma_uint32 ma_resource_manager_data_buffer_next_execution_order(ma_resource_manager_data_buffer* pDataBuffer)
-{
-    MA_ASSERT(pDataBuffer != NULL);
-    return ma_atomic_fetch_add_32(&pDataBuffer->executionCounter, 1);
-}
-
-static ma_result ma_resource_manager_data_buffer_cb__read_pcm_frames(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_resource_manager_data_buffer_read_pcm_frames((ma_resource_manager_data_buffer*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_resource_manager_data_buffer_cb__seek_to_pcm_frame(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_resource_manager_data_buffer_seek_to_pcm_frame((ma_resource_manager_data_buffer*)pDataSource, frameIndex);
-}
-
-static ma_result ma_resource_manager_data_buffer_cb__get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    return ma_resource_manager_data_buffer_get_data_format((ma_resource_manager_data_buffer*)pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-static ma_result ma_resource_manager_data_buffer_cb__get_cursor_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_resource_manager_data_buffer_get_cursor_in_pcm_frames((ma_resource_manager_data_buffer*)pDataSource, pCursor);
-}
-
-static ma_result ma_resource_manager_data_buffer_cb__get_length_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_resource_manager_data_buffer_get_length_in_pcm_frames((ma_resource_manager_data_buffer*)pDataSource, pLength);
-}
-
-static ma_result ma_resource_manager_data_buffer_cb__set_looping(ma_data_source* pDataSource, ma_bool32 isLooping)
-{
-    ma_resource_manager_data_buffer* pDataBuffer = (ma_resource_manager_data_buffer*)pDataSource;
-    MA_ASSERT(pDataBuffer != NULL);
-
-    ma_atomic_exchange_32(&pDataBuffer->isLooping, isLooping);
-
-    /* The looping state needs to be set on the connector as well or else looping won't work when we read audio data. */
-    ma_data_source_set_looping(ma_resource_manager_data_buffer_get_connector(pDataBuffer), isLooping);
-
-    return MA_SUCCESS;
-}
-
-static ma_data_source_vtable g_ma_resource_manager_data_buffer_vtable =
-{
-    ma_resource_manager_data_buffer_cb__read_pcm_frames,
-    ma_resource_manager_data_buffer_cb__seek_to_pcm_frame,
-    ma_resource_manager_data_buffer_cb__get_data_format,
-    ma_resource_manager_data_buffer_cb__get_cursor_in_pcm_frames,
-    ma_resource_manager_data_buffer_cb__get_length_in_pcm_frames,
-    ma_resource_manager_data_buffer_cb__set_looping,
-    0
-};
-
-static ma_result ma_resource_manager_data_buffer_init_ex_internal(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_uint32 hashedName32, ma_resource_manager_data_buffer* pDataBuffer)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager_data_buffer_node* pDataBufferNode;
-    ma_data_source_config dataSourceConfig;
-    ma_bool32 async;
-    ma_uint32 flags;
-    ma_resource_manager_pipeline_notifications notifications;
-
-    if (pDataBuffer == NULL) {
-        if (pConfig != NULL && pConfig->pNotifications != NULL) {
-            ma_resource_manager_pipeline_notifications_signal_all_notifications(pConfig->pNotifications);
-        }
-
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDataBuffer);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->pNotifications != NULL) {
-        notifications = *pConfig->pNotifications;   /* From here on out we should be referencing `notifications` instead of `pNotifications`. Set this to NULL to catch errors at testing time. */
-    } else {
-        MA_ZERO_OBJECT(&notifications);
-    }
-
-    /* For safety, always remove the ASYNC flag if threading is disabled on the resource manager. */
-    flags = pConfig->flags;
-    if (ma_resource_manager_is_threading_enabled(pResourceManager) == MA_FALSE) {
-        flags &= ~MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC;
-    }
-
-    async = (flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC) != 0;
-
-    /*
-    Fences need to be acquired before doing anything. These must be acquired and released outside of
-    the node to ensure there's no holes where ma_fence_wait() could prematurely return before the
-    data buffer has completed initialization.
-
-    When loading asynchronously, the node acquisition routine below will acquire the fences on this
-    thread and then release them on the async thread when the operation is complete.
-
-    These fences are always released at the "done" tag at the end of this function. They'll be
-    acquired a second if loading asynchronously. This double acquisition system is just done to
-    simplify code maintanence.
-    */
-    ma_resource_manager_pipeline_notifications_acquire_all_fences(&notifications);
-    {
-        /* We first need to acquire a node. If ASYNC is not set, this will not return until the entire sound has been loaded. */
-        result = ma_resource_manager_data_buffer_node_acquire(pResourceManager, pConfig->pFilePath, pConfig->pFilePathW, hashedName32, flags, NULL, notifications.init.pFence, notifications.done.pFence, &pDataBufferNode);
-        if (result != MA_SUCCESS) {
-            ma_resource_manager_pipeline_notifications_signal_all_notifications(&notifications);
-            goto done;
-        }
-
-        dataSourceConfig = ma_data_source_config_init();
-        dataSourceConfig.vtable = &g_ma_resource_manager_data_buffer_vtable;
-
-        result = ma_data_source_init(&dataSourceConfig, &pDataBuffer->ds);
-        if (result != MA_SUCCESS) {
-            ma_resource_manager_data_buffer_node_unacquire(pResourceManager, pDataBufferNode, NULL, NULL);
-            ma_resource_manager_pipeline_notifications_signal_all_notifications(&notifications);
-            goto done;
-        }
-
-        pDataBuffer->pResourceManager = pResourceManager;
-        pDataBuffer->pNode  = pDataBufferNode;
-        pDataBuffer->flags  = flags;
-        pDataBuffer->result = MA_BUSY;  /* Always default to MA_BUSY for safety. It'll be overwritten when loading completes or an error occurs. */
-
-        /* If we're loading asynchronously we need to post a job to the job queue to initialize the connector. */
-        if (async == MA_FALSE || ma_resource_manager_data_buffer_node_result(pDataBufferNode) == MA_SUCCESS) {
-            /* Loading synchronously or the data has already been fully loaded. We can just initialize the connector from here without a job. */
-            result = ma_resource_manager_data_buffer_init_connector(pDataBuffer, pConfig, NULL, NULL);
-            ma_atomic_exchange_i32(&pDataBuffer->result, result);
-
-            ma_resource_manager_pipeline_notifications_signal_all_notifications(&notifications);
-            goto done;
-        } else {
-            /* The node's data supply isn't initialized yet. The caller has requested that we load asynchronously so we need to post a job to do this. */
-            ma_job job;
-            ma_resource_manager_inline_notification initNotification;   /* Used when the WAIT_INIT flag is set. */
-
-            if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                ma_resource_manager_inline_notification_init(pResourceManager, &initNotification);
-            }
-
-            /*
-            The status of the data buffer needs to be set to MA_BUSY before posting the job so that the
-            worker thread is aware of it's busy state. If the LOAD_DATA_BUFFER job sees a status other
-            than MA_BUSY, it'll assume an error and fall through to an early exit.
-            */
-            ma_atomic_exchange_i32(&pDataBuffer->result, MA_BUSY);
-
-            /* Acquire fences a second time. These will be released by the async thread. */
-            ma_resource_manager_pipeline_notifications_acquire_all_fences(&notifications);
-
-            job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER);
-            job.order = ma_resource_manager_data_buffer_next_execution_order(pDataBuffer);
-            job.data.resourceManager.loadDataBuffer.pDataBuffer             = pDataBuffer;
-            job.data.resourceManager.loadDataBuffer.pInitNotification       = ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) ? &initNotification : notifications.init.pNotification;
-            job.data.resourceManager.loadDataBuffer.pDoneNotification       = notifications.done.pNotification;
-            job.data.resourceManager.loadDataBuffer.pInitFence              = notifications.init.pFence;
-            job.data.resourceManager.loadDataBuffer.pDoneFence              = notifications.done.pFence;
-            job.data.resourceManager.loadDataBuffer.rangeBegInPCMFrames     = pConfig->rangeBegInPCMFrames;
-            job.data.resourceManager.loadDataBuffer.rangeEndInPCMFrames     = pConfig->rangeEndInPCMFrames;
-            job.data.resourceManager.loadDataBuffer.loopPointBegInPCMFrames = pConfig->loopPointBegInPCMFrames;
-            job.data.resourceManager.loadDataBuffer.loopPointEndInPCMFrames = pConfig->loopPointEndInPCMFrames;
-            job.data.resourceManager.loadDataBuffer.isLooping               = pConfig->isLooping;
-
-            /* If we need to wait for initialization to complete we can just process the job in place. */
-            if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                result = ma_job_process(&job);
-            } else {
-                result = ma_resource_manager_post_job(pResourceManager, &job);
-            }
-
-            if (result != MA_SUCCESS) {
-                /* We failed to post the job. Most likely there isn't enough room in the queue's buffer. */
-                ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_ERROR, "Failed to post MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_BUFFER job. %s.\n", ma_result_description(result));
-                ma_atomic_exchange_i32(&pDataBuffer->result, result);
-
-                /* Release the fences after the result has been set on the data buffer. */
-                ma_resource_manager_pipeline_notifications_release_all_fences(&notifications);
-            } else {
-                if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                    ma_resource_manager_inline_notification_wait(&initNotification);
-
-                    if (notifications.init.pNotification != NULL) {
-                        ma_async_notification_signal(notifications.init.pNotification);
-                    }
-
-                    /* NOTE: Do not release the init fence here. It will have been done by the job. */
-
-                    /* Make sure we return an error if initialization failed on the async thread. */
-                    result = ma_resource_manager_data_buffer_result(pDataBuffer);
-                    if (result == MA_BUSY) {
-                        result  = MA_SUCCESS;
-                    }
-                }
-            }
-
-            if ((flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-                ma_resource_manager_inline_notification_uninit(&initNotification);
-            }
-        }
-
-        if (result != MA_SUCCESS) {
-            ma_resource_manager_data_buffer_node_unacquire(pResourceManager, pDataBufferNode, NULL, NULL);
-            goto done;
-        }
-    }
-done:
-    if (result == MA_SUCCESS) {
-        if (pConfig->initialSeekPointInPCMFrames > 0) {
-            ma_resource_manager_data_buffer_seek_to_pcm_frame(pDataBuffer, pConfig->initialSeekPointInPCMFrames);
-        }
-    }
-
-    ma_resource_manager_pipeline_notifications_release_all_fences(&notifications);
-
-    return result;
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_init_ex(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_resource_manager_data_buffer* pDataBuffer)
-{
-    return ma_resource_manager_data_buffer_init_ex_internal(pResourceManager, pConfig, 0, pDataBuffer);
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_init(ma_resource_manager* pResourceManager, const char* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_buffer* pDataBuffer)
-{
-    ma_resource_manager_data_source_config config;
-
-    config = ma_resource_manager_data_source_config_init();
-    config.pFilePath      = pFilePath;
-    config.flags          = flags;
-    config.pNotifications = pNotifications;
-
-    return ma_resource_manager_data_buffer_init_ex(pResourceManager, &config, pDataBuffer);
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_init_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_buffer* pDataBuffer)
-{
-    ma_resource_manager_data_source_config config;
-
-    config = ma_resource_manager_data_source_config_init();
-    config.pFilePathW     = pFilePath;
-    config.flags          = flags;
-    config.pNotifications = pNotifications;
-
-    return ma_resource_manager_data_buffer_init_ex(pResourceManager, &config, pDataBuffer);
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_init_copy(ma_resource_manager* pResourceManager, const ma_resource_manager_data_buffer* pExistingDataBuffer, ma_resource_manager_data_buffer* pDataBuffer)
-{
-    ma_resource_manager_data_source_config config;
-
-    if (pExistingDataBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ASSERT(pExistingDataBuffer->pNode != NULL);  /* <-- If you've triggered this, you've passed in an invalid existing data buffer. */
-
-    config = ma_resource_manager_data_source_config_init();
-    config.flags = pExistingDataBuffer->flags;
-
-    return ma_resource_manager_data_buffer_init_ex_internal(pResourceManager, &config, pExistingDataBuffer->pNode->hashedName32, pDataBuffer);
-}
-
-static ma_result ma_resource_manager_data_buffer_uninit_internal(ma_resource_manager_data_buffer* pDataBuffer)
-{
-    MA_ASSERT(pDataBuffer != NULL);
-
-    /* The connector should be uninitialized first. */
-    ma_resource_manager_data_buffer_uninit_connector(pDataBuffer->pResourceManager, pDataBuffer);
-
-    /* With the connector uninitialized we can unacquire the node. */
-    ma_resource_manager_data_buffer_node_unacquire(pDataBuffer->pResourceManager, pDataBuffer->pNode, NULL, NULL);
-
-    /* The base data source needs to be uninitialized as well. */
-    ma_data_source_uninit(&pDataBuffer->ds);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_uninit(ma_resource_manager_data_buffer* pDataBuffer)
-{
-    ma_result result;
-
-    if (pDataBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_resource_manager_data_buffer_result(pDataBuffer) == MA_SUCCESS) {
-        /* The data buffer can be deleted synchronously. */
-        return ma_resource_manager_data_buffer_uninit_internal(pDataBuffer);
-    } else {
-        /*
-        The data buffer needs to be deleted asynchronously because it's still loading. With the status set to MA_UNAVAILABLE, no more pages will
-        be loaded and the uninitialization should happen fairly quickly. Since the caller owns the data buffer, we need to wait for this event
-        to get processed before returning.
-        */
-        ma_resource_manager_inline_notification notification;
-        ma_job job;
-
-        /*
-        We need to mark the node as unavailable so we don't try reading from it anymore, but also to
-        let the loading thread know that it needs to abort it's loading procedure.
-        */
-        ma_atomic_exchange_i32(&pDataBuffer->result, MA_UNAVAILABLE);
-
-        result = ma_resource_manager_inline_notification_init(pDataBuffer->pResourceManager, &notification);
-        if (result != MA_SUCCESS) {
-            return result;  /* Failed to create the notification. This should rarely, if ever, happen. */
-        }
-
-        job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER);
-        job.order = ma_resource_manager_data_buffer_next_execution_order(pDataBuffer);
-        job.data.resourceManager.freeDataBuffer.pDataBuffer       = pDataBuffer;
-        job.data.resourceManager.freeDataBuffer.pDoneNotification = &notification;
-        job.data.resourceManager.freeDataBuffer.pDoneFence        = NULL;
-
-        result = ma_resource_manager_post_job(pDataBuffer->pResourceManager, &job);
-        if (result != MA_SUCCESS) {
-            ma_resource_manager_inline_notification_uninit(&notification);
-            return result;
-        }
-
-        ma_resource_manager_inline_notification_wait_and_uninit(&notification);
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_read_pcm_frames(ma_resource_manager_data_buffer* pDataBuffer, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 framesRead = 0;
-    ma_bool32 isDecodedBufferBusy = MA_FALSE;
-
-    /* Safety. */
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    We cannot be using the data buffer after it's been uninitialized. If you trigger this assert it means you're trying to read from the data buffer after
-    it's been uninitialized or is in the process of uninitializing.
-    */
-    MA_ASSERT(ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) != MA_UNAVAILABLE);
-
-    /* If the node is not initialized we need to abort with a busy code. */
-    if (ma_resource_manager_data_buffer_has_connector(pDataBuffer) == MA_FALSE) {
-        return MA_BUSY; /* Still loading. */
-    }
-
-    /*
-    If we've got a seek scheduled we'll want to do that before reading. However, for paged buffers, there's
-    a chance that the sound hasn't yet been decoded up to the seek point will result in the seek failing. If
-    this happens, we need to keep the seek scheduled and return MA_BUSY.
-    */
-    if (pDataBuffer->seekToCursorOnNextRead) {
-        pDataBuffer->seekToCursorOnNextRead = MA_FALSE;
-
-        result = ma_data_source_seek_to_pcm_frame(ma_resource_manager_data_buffer_get_connector(pDataBuffer), pDataBuffer->seekTargetInPCMFrames);
-        if (result != MA_SUCCESS) {
-            if (result == MA_BAD_SEEK && ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode) == ma_resource_manager_data_supply_type_decoded_paged) {
-                pDataBuffer->seekToCursorOnNextRead = MA_TRUE;  /* Keep the seek scheduled. We just haven't loaded enough data yet to do the seek properly. */
-                return MA_BUSY;
-            }
-
-            return result;
-        }
-    }
-
-    /*
-    For decoded buffers (not paged) we need to check beforehand how many frames we have available. We cannot
-    exceed this amount. We'll read as much as we can, and then return MA_BUSY.
-    */
-    if (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode) == ma_resource_manager_data_supply_type_decoded) {
-        ma_uint64 availableFrames;
-
-        isDecodedBufferBusy = (ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) == MA_BUSY);
-
-        if (ma_resource_manager_data_buffer_get_available_frames(pDataBuffer, &availableFrames) == MA_SUCCESS) {
-            /* Don't try reading more than the available frame count. */
-            if (frameCount > availableFrames) {
-                frameCount = availableFrames;
-
-                /*
-                If there's no frames available we want to set the status to MA_AT_END. The logic below
-                will check if the node is busy, and if so, change it to MA_BUSY. The reason we do this
-                is because we don't want to call `ma_data_source_read_pcm_frames()` if the frame count
-                is 0 because that'll result in a situation where it's possible MA_AT_END won't get
-                returned.
-                */
-                if (frameCount == 0) {
-                    result = MA_AT_END;
-                }
-            } else {
-                isDecodedBufferBusy = MA_FALSE; /* We have enough frames available in the buffer to avoid a MA_BUSY status. */
-            }
-        }
-    }
-
-    /* Don't attempt to read anything if we've got no frames available. */
-    if (frameCount > 0) {
-        result = ma_data_source_read_pcm_frames(ma_resource_manager_data_buffer_get_connector(pDataBuffer), pFramesOut, frameCount, &framesRead);
-    }
-
-    /*
-    If we returned MA_AT_END, but the node is still loading, we don't want to return that code or else the caller will interpret the sound
-    as at the end and terminate decoding.
-    */
-    if (result == MA_AT_END) {
-        if (ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) == MA_BUSY) {
-            result = MA_BUSY;
-        }
-    }
-
-    if (isDecodedBufferBusy) {
-        result = MA_BUSY;
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = framesRead;
-    }
-
-    if (result == MA_SUCCESS && framesRead == 0) {
-        result  = MA_AT_END;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_seek_to_pcm_frame(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64 frameIndex)
-{
-    ma_result result;
-
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) != MA_UNAVAILABLE);
-
-    /* If we haven't yet got a connector we need to abort. */
-    if (ma_resource_manager_data_buffer_has_connector(pDataBuffer) == MA_FALSE) {
-        pDataBuffer->seekTargetInPCMFrames = frameIndex;
-        pDataBuffer->seekToCursorOnNextRead = MA_TRUE;
-        return MA_BUSY; /* Still loading. */
-    }
-
-    result = ma_data_source_seek_to_pcm_frame(ma_resource_manager_data_buffer_get_connector(pDataBuffer), frameIndex);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pDataBuffer->seekTargetInPCMFrames = ~(ma_uint64)0; /* <-- For identification purposes. */
-    pDataBuffer->seekToCursorOnNextRead = MA_FALSE;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_get_data_format(ma_resource_manager_data_buffer* pDataBuffer, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) != MA_UNAVAILABLE);
-
-    switch (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode))
-    {
-        case ma_resource_manager_data_supply_type_encoded:
-        {
-            return ma_data_source_get_data_format(&pDataBuffer->connector.decoder, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-        };
-
-        case ma_resource_manager_data_supply_type_decoded:
-        {
-            *pFormat     = pDataBuffer->pNode->data.backend.decoded.format;
-            *pChannels   = pDataBuffer->pNode->data.backend.decoded.channels;
-            *pSampleRate = pDataBuffer->pNode->data.backend.decoded.sampleRate;
-            ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pDataBuffer->pNode->data.backend.decoded.channels);
-            return MA_SUCCESS;
-        };
-
-        case ma_resource_manager_data_supply_type_decoded_paged:
-        {
-            *pFormat     = pDataBuffer->pNode->data.backend.decodedPaged.data.format;
-            *pChannels   = pDataBuffer->pNode->data.backend.decodedPaged.data.channels;
-            *pSampleRate = pDataBuffer->pNode->data.backend.decodedPaged.sampleRate;
-            ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, pDataBuffer->pNode->data.backend.decoded.channels);
-            return MA_SUCCESS;
-        };
-
-        case ma_resource_manager_data_supply_type_unknown:
-        {
-            return MA_BUSY; /* Still loading. */
-        };
-
-        default:
-        {
-            /* Unknown supply type. Should never hit this. */
-            return MA_INVALID_ARGS;
-        }
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_get_cursor_in_pcm_frames(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64* pCursor)
-{
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) != MA_UNAVAILABLE);
-
-    if (pDataBuffer == NULL || pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    switch (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode))
-    {
-        case ma_resource_manager_data_supply_type_encoded:
-        {
-            return ma_decoder_get_cursor_in_pcm_frames(&pDataBuffer->connector.decoder, pCursor);
-        };
-
-        case ma_resource_manager_data_supply_type_decoded:
-        {
-            return ma_audio_buffer_get_cursor_in_pcm_frames(&pDataBuffer->connector.buffer, pCursor);
-        };
-
-        case ma_resource_manager_data_supply_type_decoded_paged:
-        {
-            return ma_paged_audio_buffer_get_cursor_in_pcm_frames(&pDataBuffer->connector.pagedBuffer, pCursor);
-        };
-
-        case ma_resource_manager_data_supply_type_unknown:
-        {
-            return MA_BUSY;
-        };
-
-        default:
-        {
-            return MA_INVALID_ARGS;
-        }
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_get_length_in_pcm_frames(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64* pLength)
-{
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) != MA_UNAVAILABLE);
-
-    if (pDataBuffer == NULL || pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode) == ma_resource_manager_data_supply_type_unknown) {
-        return MA_BUSY; /* Still loading. */
-    }
-
-    return ma_data_source_get_length_in_pcm_frames(ma_resource_manager_data_buffer_get_connector(pDataBuffer), pLength);
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_result(const ma_resource_manager_data_buffer* pDataBuffer)
-{
-    if (pDataBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return (ma_result)ma_atomic_load_i32((ma_result*)&pDataBuffer->result);    /* Need a naughty const-cast here. */
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_set_looping(ma_resource_manager_data_buffer* pDataBuffer, ma_bool32 isLooping)
-{
-    return ma_data_source_set_looping(pDataBuffer, isLooping);
-}
-
-MA_API ma_bool32 ma_resource_manager_data_buffer_is_looping(const ma_resource_manager_data_buffer* pDataBuffer)
-{
-    return ma_data_source_is_looping(pDataBuffer);
-}
-
-MA_API ma_result ma_resource_manager_data_buffer_get_available_frames(ma_resource_manager_data_buffer* pDataBuffer, ma_uint64* pAvailableFrames)
-{
-    if (pAvailableFrames == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pAvailableFrames = 0;
-
-    if (pDataBuffer == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode) == ma_resource_manager_data_supply_type_unknown) {
-        if (ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode) == MA_BUSY) {
-            return MA_BUSY;
-        } else {
-            return MA_INVALID_OPERATION;    /* No connector. */
-        }
-    }
-
-    switch (ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode))
-    {
-        case ma_resource_manager_data_supply_type_encoded:
-        {
-            return ma_decoder_get_available_frames(&pDataBuffer->connector.decoder, pAvailableFrames);
-        };
-
-        case ma_resource_manager_data_supply_type_decoded:
-        {
-            return ma_audio_buffer_get_available_frames(&pDataBuffer->connector.buffer, pAvailableFrames);
-        };
-
-        case ma_resource_manager_data_supply_type_decoded_paged:
-        {
-            ma_uint64 cursor;
-            ma_paged_audio_buffer_get_cursor_in_pcm_frames(&pDataBuffer->connector.pagedBuffer, &cursor);
-
-            if (pDataBuffer->pNode->data.backend.decodedPaged.decodedFrameCount > cursor) {
-                *pAvailableFrames = pDataBuffer->pNode->data.backend.decodedPaged.decodedFrameCount - cursor;
-            } else {
-                *pAvailableFrames = 0;
-            }
-
-            return MA_SUCCESS;
-        };
-
-        case ma_resource_manager_data_supply_type_unknown:
-        default:
-        {
-            /* Unknown supply type. Should never hit this. */
-            return MA_INVALID_ARGS;
-        }
-    }
-}
-
-MA_API ma_result ma_resource_manager_register_file(ma_resource_manager* pResourceManager, const char* pFilePath, ma_uint32 flags)
-{
-    return ma_resource_manager_data_buffer_node_acquire(pResourceManager, pFilePath, NULL, 0, flags, NULL, NULL, NULL, NULL);
-}
-
-MA_API ma_result ma_resource_manager_register_file_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath, ma_uint32 flags)
-{
-    return ma_resource_manager_data_buffer_node_acquire(pResourceManager, NULL, pFilePath, 0, flags, NULL, NULL, NULL, NULL);
-}
-
-
-static ma_result ma_resource_manager_register_data(ma_resource_manager* pResourceManager, const char* pName, const wchar_t* pNameW, ma_resource_manager_data_supply* pExistingData)
-{
-    return ma_resource_manager_data_buffer_node_acquire(pResourceManager, pName, pNameW, 0, 0, pExistingData, NULL, NULL, NULL);
-}
-
-static ma_result ma_resource_manager_register_decoded_data_internal(ma_resource_manager* pResourceManager, const char* pName, const wchar_t* pNameW, const void* pData, ma_uint64 frameCount, ma_format format, ma_uint32 channels, ma_uint32 sampleRate)
-{
-    ma_resource_manager_data_supply data;
-    data.type                            = ma_resource_manager_data_supply_type_decoded;
-    data.backend.decoded.pData           = pData;
-    data.backend.decoded.totalFrameCount = frameCount;
-    data.backend.decoded.format          = format;
-    data.backend.decoded.channels        = channels;
-    data.backend.decoded.sampleRate      = sampleRate;
-
-    return ma_resource_manager_register_data(pResourceManager, pName, pNameW, &data);
-}
-
-MA_API ma_result ma_resource_manager_register_decoded_data(ma_resource_manager* pResourceManager, const char* pName, const void* pData, ma_uint64 frameCount, ma_format format, ma_uint32 channels, ma_uint32 sampleRate)
-{
-    return ma_resource_manager_register_decoded_data_internal(pResourceManager, pName, NULL, pData, frameCount, format, channels, sampleRate);
-}
-
-MA_API ma_result ma_resource_manager_register_decoded_data_w(ma_resource_manager* pResourceManager, const wchar_t* pName, const void* pData, ma_uint64 frameCount, ma_format format, ma_uint32 channels, ma_uint32 sampleRate)
-{
-    return ma_resource_manager_register_decoded_data_internal(pResourceManager, NULL, pName, pData, frameCount, format, channels, sampleRate);
-}
-
-
-static ma_result ma_resource_manager_register_encoded_data_internal(ma_resource_manager* pResourceManager, const char* pName, const wchar_t* pNameW, const void* pData, size_t sizeInBytes)
-{
-    ma_resource_manager_data_supply data;
-    data.type                        = ma_resource_manager_data_supply_type_encoded;
-    data.backend.encoded.pData       = pData;
-    data.backend.encoded.sizeInBytes = sizeInBytes;
-
-    return ma_resource_manager_register_data(pResourceManager, pName, pNameW, &data);
-}
-
-MA_API ma_result ma_resource_manager_register_encoded_data(ma_resource_manager* pResourceManager, const char* pName, const void* pData, size_t sizeInBytes)
-{
-    return ma_resource_manager_register_encoded_data_internal(pResourceManager, pName, NULL, pData, sizeInBytes);
-}
-
-MA_API ma_result ma_resource_manager_register_encoded_data_w(ma_resource_manager* pResourceManager, const wchar_t* pName, const void* pData, size_t sizeInBytes)
-{
-    return ma_resource_manager_register_encoded_data_internal(pResourceManager, NULL, pName, pData, sizeInBytes);
-}
-
-
-MA_API ma_result ma_resource_manager_unregister_file(ma_resource_manager* pResourceManager, const char* pFilePath)
-{
-    return ma_resource_manager_unregister_data(pResourceManager, pFilePath);
-}
-
-MA_API ma_result ma_resource_manager_unregister_file_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath)
-{
-    return ma_resource_manager_unregister_data_w(pResourceManager, pFilePath);
-}
-
-MA_API ma_result ma_resource_manager_unregister_data(ma_resource_manager* pResourceManager, const char* pName)
-{
-    return ma_resource_manager_data_buffer_node_unacquire(pResourceManager, NULL, pName, NULL);
-}
-
-MA_API ma_result ma_resource_manager_unregister_data_w(ma_resource_manager* pResourceManager, const wchar_t* pName)
-{
-    return ma_resource_manager_data_buffer_node_unacquire(pResourceManager, NULL, NULL, pName);
-}
-
-
-static ma_uint32 ma_resource_manager_data_stream_next_execution_order(ma_resource_manager_data_stream* pDataStream)
-{
-    MA_ASSERT(pDataStream != NULL);
-    return ma_atomic_fetch_add_32(&pDataStream->executionCounter, 1);
-}
-
-static ma_bool32 ma_resource_manager_data_stream_is_decoder_at_end(const ma_resource_manager_data_stream* pDataStream)
-{
-    MA_ASSERT(pDataStream != NULL);
-    return ma_atomic_load_32((ma_bool32*)&pDataStream->isDecoderAtEnd);
-}
-
-static ma_uint32 ma_resource_manager_data_stream_seek_counter(const ma_resource_manager_data_stream* pDataStream)
-{
-    MA_ASSERT(pDataStream != NULL);
-    return ma_atomic_load_32((ma_uint32*)&pDataStream->seekCounter);
-}
-
-
-static ma_result ma_resource_manager_data_stream_cb__read_pcm_frames(ma_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    return ma_resource_manager_data_stream_read_pcm_frames((ma_resource_manager_data_stream*)pDataSource, pFramesOut, frameCount, pFramesRead);
-}
-
-static ma_result ma_resource_manager_data_stream_cb__seek_to_pcm_frame(ma_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    return ma_resource_manager_data_stream_seek_to_pcm_frame((ma_resource_manager_data_stream*)pDataSource, frameIndex);
-}
-
-static ma_result ma_resource_manager_data_stream_cb__get_data_format(ma_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    return ma_resource_manager_data_stream_get_data_format((ma_resource_manager_data_stream*)pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-static ma_result ma_resource_manager_data_stream_cb__get_cursor_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pCursor)
-{
-    return ma_resource_manager_data_stream_get_cursor_in_pcm_frames((ma_resource_manager_data_stream*)pDataSource, pCursor);
-}
-
-static ma_result ma_resource_manager_data_stream_cb__get_length_in_pcm_frames(ma_data_source* pDataSource, ma_uint64* pLength)
-{
-    return ma_resource_manager_data_stream_get_length_in_pcm_frames((ma_resource_manager_data_stream*)pDataSource, pLength);
-}
-
-static ma_result ma_resource_manager_data_stream_cb__set_looping(ma_data_source* pDataSource, ma_bool32 isLooping)
-{
-    ma_resource_manager_data_stream* pDataStream = (ma_resource_manager_data_stream*)pDataSource;
-    MA_ASSERT(pDataStream != NULL);
-
-    ma_atomic_exchange_32(&pDataStream->isLooping, isLooping);
-
-    return MA_SUCCESS;
-}
-
-static ma_data_source_vtable g_ma_resource_manager_data_stream_vtable =
-{
-    ma_resource_manager_data_stream_cb__read_pcm_frames,
-    ma_resource_manager_data_stream_cb__seek_to_pcm_frame,
-    ma_resource_manager_data_stream_cb__get_data_format,
-    ma_resource_manager_data_stream_cb__get_cursor_in_pcm_frames,
-    ma_resource_manager_data_stream_cb__get_length_in_pcm_frames,
-    ma_resource_manager_data_stream_cb__set_looping,
-    0 /*MA_DATA_SOURCE_SELF_MANAGED_RANGE_AND_LOOP_POINT*/
-};
-
-static void ma_resource_manager_data_stream_set_absolute_cursor(ma_resource_manager_data_stream* pDataStream, ma_uint64 absoluteCursor)
-{
-    /* Loop if possible. */
-    if (absoluteCursor > pDataStream->totalLengthInPCMFrames && pDataStream->totalLengthInPCMFrames > 0) {
-        absoluteCursor = absoluteCursor % pDataStream->totalLengthInPCMFrames;
-    }
-
-    ma_atomic_exchange_64(&pDataStream->absoluteCursor, absoluteCursor);
-}
-
-MA_API ma_result ma_resource_manager_data_stream_init_ex(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_resource_manager_data_stream* pDataStream)
-{
-    ma_result result;
-    ma_data_source_config dataSourceConfig;
-    char* pFilePathCopy = NULL;
-    wchar_t* pFilePathWCopy = NULL;
-    ma_job job;
-    ma_bool32 waitBeforeReturning = MA_FALSE;
-    ma_resource_manager_inline_notification waitNotification;
-    ma_resource_manager_pipeline_notifications notifications;
-
-    if (pDataStream == NULL) {
-        if (pConfig != NULL && pConfig->pNotifications != NULL) {
-            ma_resource_manager_pipeline_notifications_signal_all_notifications(pConfig->pNotifications);
-        }
-
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDataStream);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->pNotifications != NULL) {
-        notifications = *pConfig->pNotifications;    /* From here on out, `notifications` should be used instead of `pNotifications`. Setting this to NULL to catch any errors at testing time. */
-    } else {
-        MA_ZERO_OBJECT(&notifications);
-    }
-
-    dataSourceConfig = ma_data_source_config_init();
-    dataSourceConfig.vtable = &g_ma_resource_manager_data_stream_vtable;
-
-    result = ma_data_source_init(&dataSourceConfig, &pDataStream->ds);
-    if (result != MA_SUCCESS) {
-        ma_resource_manager_pipeline_notifications_signal_all_notifications(&notifications);
-        return result;
-    }
-
-    pDataStream->pResourceManager = pResourceManager;
-    pDataStream->flags            = pConfig->flags;
-    pDataStream->result           = MA_BUSY;
-
-    ma_data_source_set_range_in_pcm_frames(pDataStream, pConfig->rangeBegInPCMFrames, pConfig->rangeEndInPCMFrames);
-    ma_data_source_set_loop_point_in_pcm_frames(pDataStream, pConfig->loopPointBegInPCMFrames, pConfig->loopPointEndInPCMFrames);
-    ma_data_source_set_looping(pDataStream, pConfig->isLooping);
-
-    if (pResourceManager == NULL || (pConfig->pFilePath == NULL && pConfig->pFilePathW == NULL)) {
-        ma_resource_manager_pipeline_notifications_signal_all_notifications(&notifications);
-        return MA_INVALID_ARGS;
-    }
-
-    /* We want all access to the VFS and the internal decoder to happen on the job thread just to keep things easier to manage for the VFS.  */
-
-    /* We need a copy of the file path. We should probably make this more efficient, but for now we'll do a transient memory allocation. */
-    if (pConfig->pFilePath != NULL) {
-        pFilePathCopy  = ma_copy_string(pConfig->pFilePath, &pResourceManager->config.allocationCallbacks);
-    } else {
-        pFilePathWCopy = ma_copy_string_w(pConfig->pFilePathW, &pResourceManager->config.allocationCallbacks);
-    }
-
-    if (pFilePathCopy == NULL && pFilePathWCopy == NULL) {
-        ma_resource_manager_pipeline_notifications_signal_all_notifications(&notifications);
-        return MA_OUT_OF_MEMORY;
-    }
-
-    /*
-    We need to check for the presence of MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC. If it's not set, we need to wait before returning. Otherwise we
-    can return immediately. Likewise, we'll also check for MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT and do the same.
-    */
-    if ((pConfig->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_ASYNC) == 0 || (pConfig->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT) != 0) {
-        waitBeforeReturning = MA_TRUE;
-        ma_resource_manager_inline_notification_init(pResourceManager, &waitNotification);
-    }
-
-    ma_resource_manager_pipeline_notifications_acquire_all_fences(&notifications);
-
-    /* Set the absolute cursor to our initial seek position so retrieval of the cursor returns a good value. */
-    ma_resource_manager_data_stream_set_absolute_cursor(pDataStream, pConfig->initialSeekPointInPCMFrames);
-
-    /* We now have everything we need to post the job. This is the last thing we need to do from here. The rest will be done by the job thread. */
-    job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_LOAD_DATA_STREAM);
-    job.order = ma_resource_manager_data_stream_next_execution_order(pDataStream);
-    job.data.resourceManager.loadDataStream.pDataStream       = pDataStream;
-    job.data.resourceManager.loadDataStream.pFilePath         = pFilePathCopy;
-    job.data.resourceManager.loadDataStream.pFilePathW        = pFilePathWCopy;
-    job.data.resourceManager.loadDataStream.initialSeekPoint  = pConfig->initialSeekPointInPCMFrames;
-    job.data.resourceManager.loadDataStream.pInitNotification = (waitBeforeReturning == MA_TRUE) ? &waitNotification : notifications.init.pNotification;
-    job.data.resourceManager.loadDataStream.pInitFence        = notifications.init.pFence;
-    result = ma_resource_manager_post_job(pResourceManager, &job);
-    if (result != MA_SUCCESS) {
-        ma_resource_manager_pipeline_notifications_signal_all_notifications(&notifications);
-        ma_resource_manager_pipeline_notifications_release_all_fences(&notifications);
-
-        if (waitBeforeReturning) {
-            ma_resource_manager_inline_notification_uninit(&waitNotification);
-        }
-
-        ma_free(pFilePathCopy,  &pResourceManager->config.allocationCallbacks);
-        ma_free(pFilePathWCopy, &pResourceManager->config.allocationCallbacks);
-        return result;
-    }
-
-    /* Wait if needed. */
-    if (waitBeforeReturning) {
-        ma_resource_manager_inline_notification_wait_and_uninit(&waitNotification);
-
-        if (notifications.init.pNotification != NULL) {
-            ma_async_notification_signal(notifications.init.pNotification);
-        }
-
-        /*
-        If there was an error during initialization make sure we return that result here. We don't want to do this
-        if we're not waiting because it will most likely be in a busy state.
-        */
-        if (pDataStream->result != MA_SUCCESS) {
-            return pDataStream->result;
-        }
-
-        /* NOTE: Do not release pInitFence here. That will be done by the job. */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resource_manager_data_stream_init(ma_resource_manager* pResourceManager, const char* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_stream* pDataStream)
-{
-    ma_resource_manager_data_source_config config;
-
-    config = ma_resource_manager_data_source_config_init();
-    config.pFilePath      = pFilePath;
-    config.flags          = flags;
-    config.pNotifications = pNotifications;
-
-    return ma_resource_manager_data_stream_init_ex(pResourceManager, &config, pDataStream);
-}
-
-MA_API ma_result ma_resource_manager_data_stream_init_w(ma_resource_manager* pResourceManager, const wchar_t* pFilePath, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_stream* pDataStream)
-{
-    ma_resource_manager_data_source_config config;
-
-    config = ma_resource_manager_data_source_config_init();
-    config.pFilePathW     = pFilePath;
-    config.flags          = flags;
-    config.pNotifications = pNotifications;
-
-    return ma_resource_manager_data_stream_init_ex(pResourceManager, &config, pDataStream);
-}
-
-MA_API ma_result ma_resource_manager_data_stream_uninit(ma_resource_manager_data_stream* pDataStream)
-{
-    ma_resource_manager_inline_notification freeEvent;
-    ma_job job;
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The first thing to do is set the result to unavailable. This will prevent future page decoding. */
-    ma_atomic_exchange_i32(&pDataStream->result, MA_UNAVAILABLE);
-
-    /*
-    We need to post a job to ensure we're not in the middle or decoding or anything. Because the object is owned by the caller, we'll need
-    to wait for it to complete before returning which means we need an event.
-    */
-    ma_resource_manager_inline_notification_init(pDataStream->pResourceManager, &freeEvent);
-
-    job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_STREAM);
-    job.order = ma_resource_manager_data_stream_next_execution_order(pDataStream);
-    job.data.resourceManager.freeDataStream.pDataStream       = pDataStream;
-    job.data.resourceManager.freeDataStream.pDoneNotification = &freeEvent;
-    job.data.resourceManager.freeDataStream.pDoneFence        = NULL;
-    ma_resource_manager_post_job(pDataStream->pResourceManager, &job);
-
-    /* We need to wait for the job to finish processing before we return. */
-    ma_resource_manager_inline_notification_wait_and_uninit(&freeEvent);
-
-    return MA_SUCCESS;
-}
-
-
-static ma_uint32 ma_resource_manager_data_stream_get_page_size_in_frames(ma_resource_manager_data_stream* pDataStream)
-{
-    MA_ASSERT(pDataStream != NULL);
-    MA_ASSERT(pDataStream->isDecoderInitialized == MA_TRUE);
-
-    return MA_RESOURCE_MANAGER_PAGE_SIZE_IN_MILLISECONDS * (pDataStream->decoder.outputSampleRate/1000);
-}
-
-static void* ma_resource_manager_data_stream_get_page_data_pointer(ma_resource_manager_data_stream* pDataStream, ma_uint32 pageIndex, ma_uint32 relativeCursor)
-{
-    MA_ASSERT(pDataStream != NULL);
-    MA_ASSERT(pDataStream->isDecoderInitialized == MA_TRUE);
-    MA_ASSERT(pageIndex == 0 || pageIndex == 1);
-
-    return ma_offset_ptr(pDataStream->pPageData, ((ma_resource_manager_data_stream_get_page_size_in_frames(pDataStream) * pageIndex) + relativeCursor) * ma_get_bytes_per_frame(pDataStream->decoder.outputFormat, pDataStream->decoder.outputChannels));
-}
-
-static void ma_resource_manager_data_stream_fill_page(ma_resource_manager_data_stream* pDataStream, ma_uint32 pageIndex)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 pageSizeInFrames;
-    ma_uint64 totalFramesReadForThisPage = 0;
-    void* pPageData = ma_resource_manager_data_stream_get_page_data_pointer(pDataStream, pageIndex, 0);
-
-    pageSizeInFrames = ma_resource_manager_data_stream_get_page_size_in_frames(pDataStream);
-
-    /* The decoder needs to inherit the stream's looping and range state. */
-    {
-        ma_uint64 rangeBeg;
-        ma_uint64 rangeEnd;
-        ma_uint64 loopPointBeg;
-        ma_uint64 loopPointEnd;
-
-        ma_data_source_set_looping(&pDataStream->decoder, ma_resource_manager_data_stream_is_looping(pDataStream));
-
-        ma_data_source_get_range_in_pcm_frames(pDataStream, &rangeBeg, &rangeEnd);
-        ma_data_source_set_range_in_pcm_frames(&pDataStream->decoder, rangeBeg, rangeEnd);
-
-        ma_data_source_get_loop_point_in_pcm_frames(pDataStream, &loopPointBeg, &loopPointEnd);
-        ma_data_source_set_loop_point_in_pcm_frames(&pDataStream->decoder, loopPointBeg, loopPointEnd);
-    }
-
-    /* Just read straight from the decoder. It will deal with ranges and looping for us. */
-    result = ma_data_source_read_pcm_frames(&pDataStream->decoder, pPageData, pageSizeInFrames, &totalFramesReadForThisPage);
-    if (result == MA_AT_END || totalFramesReadForThisPage < pageSizeInFrames) {
-        ma_atomic_exchange_32(&pDataStream->isDecoderAtEnd, MA_TRUE);
-    }
-
-    ma_atomic_exchange_32(&pDataStream->pageFrameCount[pageIndex], (ma_uint32)totalFramesReadForThisPage);
-    ma_atomic_exchange_32(&pDataStream->isPageValid[pageIndex], MA_TRUE);
-}
-
-static void ma_resource_manager_data_stream_fill_pages(ma_resource_manager_data_stream* pDataStream)
-{
-    ma_uint32 iPage;
-
-    MA_ASSERT(pDataStream != NULL);
-
-    for (iPage = 0; iPage < 2; iPage += 1) {
-        ma_resource_manager_data_stream_fill_page(pDataStream, iPage);
-    }
-}
-
-
-static ma_result ma_resource_manager_data_stream_map(ma_resource_manager_data_stream* pDataStream, void** ppFramesOut, ma_uint64* pFrameCount)
-{
-    ma_uint64 framesAvailable;
-    ma_uint64 frameCount = 0;
-
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_stream_result(pDataStream) != MA_UNAVAILABLE);
-
-    if (pFrameCount != NULL) {
-        frameCount = *pFrameCount;
-        *pFrameCount = 0;
-    }
-    if (ppFramesOut != NULL) {
-        *ppFramesOut = NULL;
-    }
-
-    if (pDataStream == NULL || ppFramesOut == NULL || pFrameCount == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_resource_manager_data_stream_result(pDataStream) != MA_SUCCESS) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* Don't attempt to read while we're in the middle of seeking. Tell the caller that we're busy. */
-    if (ma_resource_manager_data_stream_seek_counter(pDataStream) > 0) {
-        return MA_BUSY;
-    }
-
-    /* If the page we're on is invalid it means we've caught up to the job thread. */
-    if (ma_atomic_load_32(&pDataStream->isPageValid[pDataStream->currentPageIndex]) == MA_FALSE) {
-        framesAvailable = 0;
-    } else {
-        /*
-        The page we're on is valid so we must have some frames available. We need to make sure that we don't overflow into the next page, even if it's valid. The reason is
-        that the unmap process will only post an update for one page at a time. Keeping mapping tied to page boundaries makes this simpler.
-        */
-        ma_uint32 currentPageFrameCount = ma_atomic_load_32(&pDataStream->pageFrameCount[pDataStream->currentPageIndex]);
-        MA_ASSERT(currentPageFrameCount >= pDataStream->relativeCursor);
-
-        framesAvailable = currentPageFrameCount - pDataStream->relativeCursor;
-    }
-
-    /* If there's no frames available and the result is set to MA_AT_END we need to return MA_AT_END. */
-    if (framesAvailable == 0) {
-        if (ma_resource_manager_data_stream_is_decoder_at_end(pDataStream)) {
-            return MA_AT_END;
-        } else {
-            return MA_BUSY; /* There are no frames available, but we're not marked as EOF so we might have caught up to the job thread. Need to return MA_BUSY and wait for more data. */
-        }
-    }
-
-    MA_ASSERT(framesAvailable > 0);
-
-    if (frameCount > framesAvailable) {
-        frameCount = framesAvailable;
-    }
-
-    *ppFramesOut = ma_resource_manager_data_stream_get_page_data_pointer(pDataStream, pDataStream->currentPageIndex, pDataStream->relativeCursor);
-    *pFrameCount = frameCount;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_resource_manager_data_stream_unmap(ma_resource_manager_data_stream* pDataStream, ma_uint64 frameCount)
-{
-    ma_uint32 newRelativeCursor;
-    ma_uint32 pageSizeInFrames;
-    ma_job job;
-
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_stream_result(pDataStream) != MA_UNAVAILABLE);
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_resource_manager_data_stream_result(pDataStream) != MA_SUCCESS) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* The frame count should always fit inside a 32-bit integer. */
-    if (frameCount > 0xFFFFFFFF) {
-        return MA_INVALID_ARGS;
-    }
-
-    pageSizeInFrames = ma_resource_manager_data_stream_get_page_size_in_frames(pDataStream);
-
-    /* The absolute cursor needs to be updated for ma_resource_manager_data_stream_get_cursor_in_pcm_frames(). */
-    ma_resource_manager_data_stream_set_absolute_cursor(pDataStream, ma_atomic_load_64(&pDataStream->absoluteCursor) + frameCount);
-
-    /* Here is where we need to check if we need to load a new page, and if so, post a job to load it. */
-    newRelativeCursor = pDataStream->relativeCursor + (ma_uint32)frameCount;
-
-    /* If the new cursor has flowed over to the next page we need to mark the old one as invalid and post an event for it. */
-    if (newRelativeCursor >= pageSizeInFrames) {
-        newRelativeCursor -= pageSizeInFrames;
-
-        /* Here is where we post the job start decoding. */
-        job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_STREAM);
-        job.order = ma_resource_manager_data_stream_next_execution_order(pDataStream);
-        job.data.resourceManager.pageDataStream.pDataStream = pDataStream;
-        job.data.resourceManager.pageDataStream.pageIndex   = pDataStream->currentPageIndex;
-
-        /* The page needs to be marked as invalid so that the public API doesn't try reading from it. */
-        ma_atomic_exchange_32(&pDataStream->isPageValid[pDataStream->currentPageIndex], MA_FALSE);
-
-        /* Before posting the job we need to make sure we set some state. */
-        pDataStream->relativeCursor   = newRelativeCursor;
-        pDataStream->currentPageIndex = (pDataStream->currentPageIndex + 1) & 0x01;
-        return ma_resource_manager_post_job(pDataStream->pResourceManager, &job);
-    } else {
-        /* We haven't moved into a new page so we can just move the cursor forward. */
-        pDataStream->relativeCursor = newRelativeCursor;
-        return MA_SUCCESS;
-    }
-}
-
-
-MA_API ma_result ma_resource_manager_data_stream_read_pcm_frames(ma_resource_manager_data_stream* pDataStream, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 totalFramesProcessed;
-    ma_format format;
-    ma_uint32 channels;
-
-    /* Safety. */
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (frameCount == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_stream_result(pDataStream) != MA_UNAVAILABLE);
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_resource_manager_data_stream_result(pDataStream) != MA_SUCCESS) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* Don't attempt to read while we're in the middle of seeking. Tell the caller that we're busy. */
-    if (ma_resource_manager_data_stream_seek_counter(pDataStream) > 0) {
-        return MA_BUSY;
-    }
-
-    ma_resource_manager_data_stream_get_data_format(pDataStream, &format, &channels, NULL, NULL, 0);
-
-    /* Reading is implemented in terms of map/unmap. We need to run this in a loop because mapping is clamped against page boundaries. */
-    totalFramesProcessed = 0;
-    while (totalFramesProcessed < frameCount) {
-        void* pMappedFrames;
-        ma_uint64 mappedFrameCount;
-
-        mappedFrameCount = frameCount - totalFramesProcessed;
-        result = ma_resource_manager_data_stream_map(pDataStream, &pMappedFrames, &mappedFrameCount);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        /* Copy the mapped data to the output buffer if we have one. It's allowed for pFramesOut to be NULL in which case a relative forward seek is performed. */
-        if (pFramesOut != NULL) {
-            ma_copy_pcm_frames(ma_offset_pcm_frames_ptr(pFramesOut, totalFramesProcessed, format, channels), pMappedFrames, mappedFrameCount, format, channels);
-        }
-
-        totalFramesProcessed += mappedFrameCount;
-
-        result = ma_resource_manager_data_stream_unmap(pDataStream, mappedFrameCount);
-        if (result != MA_SUCCESS) {
-            break;  /* This is really bad - will only get an error here if we failed to post a job to the queue for loading the next page. */
-        }
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalFramesProcessed;
-    }
-
-    if (result == MA_SUCCESS && totalFramesProcessed == 0) {
-        result  = MA_AT_END;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_resource_manager_data_stream_seek_to_pcm_frame(ma_resource_manager_data_stream* pDataStream, ma_uint64 frameIndex)
-{
-    ma_job job;
-    ma_result streamResult;
-
-    streamResult = ma_resource_manager_data_stream_result(pDataStream);
-
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(streamResult != MA_UNAVAILABLE);
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (streamResult != MA_SUCCESS && streamResult != MA_BUSY) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* If we're not already seeking and we're sitting on the same frame, just make this a no-op. */
-    if (ma_atomic_load_32(&pDataStream->seekCounter) == 0) {
-        if (ma_atomic_load_64(&pDataStream->absoluteCursor) == frameIndex) {
-            return MA_SUCCESS;
-        }
-    }
-
-
-    /* Increment the seek counter first to indicate to read_paged_pcm_frames() and map_paged_pcm_frames() that we are in the middle of a seek and MA_BUSY should be returned. */
-    ma_atomic_fetch_add_32(&pDataStream->seekCounter, 1);
-
-    /* Update the absolute cursor so that ma_resource_manager_data_stream_get_cursor_in_pcm_frames() returns the new position. */
-    ma_resource_manager_data_stream_set_absolute_cursor(pDataStream, frameIndex);
-
-    /*
-    We need to clear our currently loaded pages so that the stream starts playback from the new seek point as soon as possible. These are for the purpose of the public
-    API and will be ignored by the seek job. The seek job will operate on the assumption that both pages have been marked as invalid and the cursor is at the start of
-    the first page.
-    */
-    pDataStream->relativeCursor   = 0;
-    pDataStream->currentPageIndex = 0;
-    ma_atomic_exchange_32(&pDataStream->isPageValid[0], MA_FALSE);
-    ma_atomic_exchange_32(&pDataStream->isPageValid[1], MA_FALSE);
-
-    /* Make sure the data stream is not marked as at the end or else if we seek in response to hitting the end, we won't be able to read any more data. */
-    ma_atomic_exchange_32(&pDataStream->isDecoderAtEnd, MA_FALSE);
-
-    /*
-    The public API is not allowed to touch the internal decoder so we need to use a job to perform the seek. When seeking, the job thread will assume both pages
-    are invalid and any content contained within them will be discarded and replaced with newly decoded data.
-    */
-    job = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_SEEK_DATA_STREAM);
-    job.order = ma_resource_manager_data_stream_next_execution_order(pDataStream);
-    job.data.resourceManager.seekDataStream.pDataStream = pDataStream;
-    job.data.resourceManager.seekDataStream.frameIndex  = frameIndex;
-    return ma_resource_manager_post_job(pDataStream->pResourceManager, &job);
-}
-
-MA_API ma_result ma_resource_manager_data_stream_get_data_format(ma_resource_manager_data_stream* pDataStream, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_stream_result(pDataStream) != MA_UNAVAILABLE);
-
-    if (pFormat != NULL) {
-        *pFormat = ma_format_unknown;
-    }
-
-    if (pChannels != NULL) {
-        *pChannels = 0;
-    }
-
-    if (pSampleRate != NULL) {
-        *pSampleRate = 0;
-    }
-
-    if (pChannelMap != NULL) {
-        MA_ZERO_MEMORY(pChannelMap, sizeof(*pChannelMap) * channelMapCap);
-    }
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (ma_resource_manager_data_stream_result(pDataStream) != MA_SUCCESS) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /*
-    We're being a little bit naughty here and accessing the internal decoder from the public API. The output data format is constant, and we've defined this function
-    such that the application is responsible for ensuring it's not called while uninitializing so it should be safe.
-    */
-    return ma_data_source_get_data_format(&pDataStream->decoder, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-}
-
-MA_API ma_result ma_resource_manager_data_stream_get_cursor_in_pcm_frames(ma_resource_manager_data_stream* pDataStream, ma_uint64* pCursor)
-{
-    ma_result result;
-
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pCursor = 0;
-
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(ma_resource_manager_data_stream_result(pDataStream) != MA_UNAVAILABLE);
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    If the stream is in an erroneous state we need to return an invalid operation. We can allow
-    this to be called when the data stream is in a busy state because the caller may have asked
-    for an initial seek position and it's convenient to return that as the cursor position.
-    */
-    result = ma_resource_manager_data_stream_result(pDataStream);
-    if (result != MA_SUCCESS && result != MA_BUSY) {
-        return MA_INVALID_OPERATION;
-    }
-
-    *pCursor = ma_atomic_load_64(&pDataStream->absoluteCursor);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resource_manager_data_stream_get_length_in_pcm_frames(ma_resource_manager_data_stream* pDataStream, ma_uint64* pLength)
-{
-    ma_result streamResult;
-
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pLength = 0;
-
-    streamResult = ma_resource_manager_data_stream_result(pDataStream);
-
-    /* We cannot be using the data source after it's been uninitialized. */
-    MA_ASSERT(streamResult != MA_UNAVAILABLE);
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (streamResult != MA_SUCCESS) {
-        return streamResult;
-    }
-
-    /*
-    We most definitely do not want to be calling ma_decoder_get_length_in_pcm_frames() directly. Instead we want to use a cached value that we
-    calculated when we initialized it on the job thread.
-    */
-    *pLength = pDataStream->totalLengthInPCMFrames;
-    if (*pLength == 0) {
-        return MA_NOT_IMPLEMENTED;  /* Some decoders may not have a known length. */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resource_manager_data_stream_result(const ma_resource_manager_data_stream* pDataStream)
-{
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return (ma_result)ma_atomic_load_i32(&pDataStream->result);
-}
-
-MA_API ma_result ma_resource_manager_data_stream_set_looping(ma_resource_manager_data_stream* pDataStream, ma_bool32 isLooping)
-{
-    return ma_data_source_set_looping(pDataStream, isLooping);
-}
-
-MA_API ma_bool32 ma_resource_manager_data_stream_is_looping(const ma_resource_manager_data_stream* pDataStream)
-{
-    if (pDataStream == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_atomic_load_32((ma_bool32*)&pDataStream->isLooping);   /* Naughty const-cast. Value won't change from here in practice (maybe from another thread). */
-}
-
-MA_API ma_result ma_resource_manager_data_stream_get_available_frames(ma_resource_manager_data_stream* pDataStream, ma_uint64* pAvailableFrames)
-{
-    ma_uint32 pageIndex0;
-    ma_uint32 pageIndex1;
-    ma_uint32 relativeCursor;
-    ma_uint64 availableFrames;
-
-    if (pAvailableFrames == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pAvailableFrames = 0;
-
-    if (pDataStream == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pageIndex0     =  pDataStream->currentPageIndex;
-    pageIndex1     = (pDataStream->currentPageIndex + 1) & 0x01;
-    relativeCursor =  pDataStream->relativeCursor;
-
-    availableFrames = 0;
-    if (ma_atomic_load_32(&pDataStream->isPageValid[pageIndex0])) {
-        availableFrames += ma_atomic_load_32(&pDataStream->pageFrameCount[pageIndex0]) - relativeCursor;
-        if (ma_atomic_load_32(&pDataStream->isPageValid[pageIndex1])) {
-            availableFrames += ma_atomic_load_32(&pDataStream->pageFrameCount[pageIndex1]);
-        }
-    }
-
-    *pAvailableFrames = availableFrames;
-    return MA_SUCCESS;
-}
-
-
-static ma_result ma_resource_manager_data_source_preinit(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_resource_manager_data_source* pDataSource)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDataSource);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pResourceManager == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pDataSource->flags = pConfig->flags;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_resource_manager_data_source_init_ex(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source_config* pConfig, ma_resource_manager_data_source* pDataSource)
-{
-    ma_result result;
-
-    result = ma_resource_manager_data_source_preinit(pResourceManager, pConfig, pDataSource);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* The data source itself is just a data stream or a data buffer. */
-    if ((pConfig->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_init_ex(pResourceManager, pConfig, &pDataSource->backend.stream);
-    } else {
-        return ma_resource_manager_data_buffer_init_ex(pResourceManager, pConfig, &pDataSource->backend.buffer);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_init(ma_resource_manager* pResourceManager, const char* pName, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_source* pDataSource)
-{
-    ma_resource_manager_data_source_config config;
-
-    config = ma_resource_manager_data_source_config_init();
-    config.pFilePath      = pName;
-    config.flags          = flags;
-    config.pNotifications = pNotifications;
-
-    return ma_resource_manager_data_source_init_ex(pResourceManager, &config, pDataSource);
-}
-
-MA_API ma_result ma_resource_manager_data_source_init_w(ma_resource_manager* pResourceManager, const wchar_t* pName, ma_uint32 flags, const ma_resource_manager_pipeline_notifications* pNotifications, ma_resource_manager_data_source* pDataSource)
-{
-    ma_resource_manager_data_source_config config;
-
-    config = ma_resource_manager_data_source_config_init();
-    config.pFilePathW     = pName;
-    config.flags          = flags;
-    config.pNotifications = pNotifications;
-
-    return ma_resource_manager_data_source_init_ex(pResourceManager, &config, pDataSource);
-}
-
-MA_API ma_result ma_resource_manager_data_source_init_copy(ma_resource_manager* pResourceManager, const ma_resource_manager_data_source* pExistingDataSource, ma_resource_manager_data_source* pDataSource)
-{
-    ma_result result;
-    ma_resource_manager_data_source_config config;
-
-    if (pExistingDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    config = ma_resource_manager_data_source_config_init();
-    config.flags = pExistingDataSource->flags;
-
-    result = ma_resource_manager_data_source_preinit(pResourceManager, &config, pDataSource);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* Copying can only be done from data buffers. Streams cannot be copied. */
-    if ((pExistingDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return MA_INVALID_OPERATION;
-    }
-
-    return ma_resource_manager_data_buffer_init_copy(pResourceManager, &pExistingDataSource->backend.buffer, &pDataSource->backend.buffer);
-}
-
-MA_API ma_result ma_resource_manager_data_source_uninit(ma_resource_manager_data_source* pDataSource)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* All we need to is uninitialize the underlying data buffer or data stream. */
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_uninit(&pDataSource->backend.stream);
-    } else {
-        return ma_resource_manager_data_buffer_uninit(&pDataSource->backend.buffer);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_read_pcm_frames(ma_resource_manager_data_source* pDataSource, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    /* Safety. */
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_read_pcm_frames(&pDataSource->backend.stream, pFramesOut, frameCount, pFramesRead);
-    } else {
-        return ma_resource_manager_data_buffer_read_pcm_frames(&pDataSource->backend.buffer, pFramesOut, frameCount, pFramesRead);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_seek_to_pcm_frame(ma_resource_manager_data_source* pDataSource, ma_uint64 frameIndex)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_seek_to_pcm_frame(&pDataSource->backend.stream, frameIndex);
-    } else {
-        return ma_resource_manager_data_buffer_seek_to_pcm_frame(&pDataSource->backend.buffer, frameIndex);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_map(ma_resource_manager_data_source* pDataSource, void** ppFramesOut, ma_uint64* pFrameCount)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_map(&pDataSource->backend.stream, ppFramesOut, pFrameCount);
-    } else {
-        return MA_NOT_IMPLEMENTED;  /* Mapping not supported with data buffers. */
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_unmap(ma_resource_manager_data_source* pDataSource, ma_uint64 frameCount)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_unmap(&pDataSource->backend.stream, frameCount);
-    } else {
-        return MA_NOT_IMPLEMENTED;  /* Mapping not supported with data buffers. */
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_get_data_format(ma_resource_manager_data_source* pDataSource, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_get_data_format(&pDataSource->backend.stream, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-    } else {
-        return ma_resource_manager_data_buffer_get_data_format(&pDataSource->backend.buffer, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_get_cursor_in_pcm_frames(ma_resource_manager_data_source* pDataSource, ma_uint64* pCursor)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_get_cursor_in_pcm_frames(&pDataSource->backend.stream, pCursor);
-    } else {
-        return ma_resource_manager_data_buffer_get_cursor_in_pcm_frames(&pDataSource->backend.buffer, pCursor);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_get_length_in_pcm_frames(ma_resource_manager_data_source* pDataSource, ma_uint64* pLength)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_get_length_in_pcm_frames(&pDataSource->backend.stream, pLength);
-    } else {
-        return ma_resource_manager_data_buffer_get_length_in_pcm_frames(&pDataSource->backend.buffer, pLength);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_result(const ma_resource_manager_data_source* pDataSource)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_result(&pDataSource->backend.stream);
-    } else {
-        return ma_resource_manager_data_buffer_result(&pDataSource->backend.buffer);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_set_looping(ma_resource_manager_data_source* pDataSource, ma_bool32 isLooping)
-{
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_set_looping(&pDataSource->backend.stream, isLooping);
-    } else {
-        return ma_resource_manager_data_buffer_set_looping(&pDataSource->backend.buffer, isLooping);
-    }
-}
-
-MA_API ma_bool32 ma_resource_manager_data_source_is_looping(const ma_resource_manager_data_source* pDataSource)
-{
-    if (pDataSource == NULL) {
-        return MA_FALSE;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_is_looping(&pDataSource->backend.stream);
-    } else {
-        return ma_resource_manager_data_buffer_is_looping(&pDataSource->backend.buffer);
-    }
-}
-
-MA_API ma_result ma_resource_manager_data_source_get_available_frames(ma_resource_manager_data_source* pDataSource, ma_uint64* pAvailableFrames)
-{
-    if (pAvailableFrames == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pAvailableFrames = 0;
-
-    if (pDataSource == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if ((pDataSource->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_STREAM) != 0) {
-        return ma_resource_manager_data_stream_get_available_frames(&pDataSource->backend.stream, pAvailableFrames);
-    } else {
-        return ma_resource_manager_data_buffer_get_available_frames(&pDataSource->backend.buffer, pAvailableFrames);
-    }
-}
-
-
-MA_API ma_result ma_resource_manager_post_job(ma_resource_manager* pResourceManager, const ma_job* pJob)
-{
-    if (pResourceManager == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_job_queue_post(&pResourceManager->jobQueue, pJob);
-}
-
-MA_API ma_result ma_resource_manager_post_job_quit(ma_resource_manager* pResourceManager)
-{
-    ma_job job = ma_job_init(MA_JOB_TYPE_QUIT);
-    return ma_resource_manager_post_job(pResourceManager, &job);
-}
-
-MA_API ma_result ma_resource_manager_next_job(ma_resource_manager* pResourceManager, ma_job* pJob)
-{
-    if (pResourceManager == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_job_queue_next(&pResourceManager->jobQueue, pJob);
-}
-
-
-static ma_result ma_job_process__resource_manager__load_data_buffer_node(ma_job* pJob)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_buffer_node* pDataBufferNode;
-
-    MA_ASSERT(pJob != NULL);
-
-    pResourceManager = (ma_resource_manager*)pJob->data.resourceManager.loadDataBufferNode.pResourceManager;
-    MA_ASSERT(pResourceManager != NULL);
-
-    pDataBufferNode = (ma_resource_manager_data_buffer_node*)pJob->data.resourceManager.loadDataBufferNode.pDataBufferNode;
-    MA_ASSERT(pDataBufferNode != NULL);
-    MA_ASSERT(pDataBufferNode->isDataOwnedByResourceManager == MA_TRUE);  /* The data should always be owned by the resource manager. */
-
-    /* The data buffer is not getting deleted, but we may be getting executed out of order. If so, we need to push the job back onto the queue and return. */
-    if (pJob->order != ma_atomic_load_32(&pDataBufferNode->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Attempting to execute out of order. Probably interleaved with a MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER job. */
-    }
-
-    /* First thing we need to do is check whether or not the data buffer is getting deleted. If so we just abort. */
-    if (ma_resource_manager_data_buffer_node_result(pDataBufferNode) != MA_BUSY) {
-        result = ma_resource_manager_data_buffer_node_result(pDataBufferNode);    /* The data buffer may be getting deleted before it's even been loaded. */
-        goto done;
-    }
-
-    /*
-    We're ready to start loading. Essentially what we're doing here is initializing the data supply
-    of the node. Once this is complete, data buffers can have their connectors initialized which
-    will allow then to have audio data read from them.
-
-    Note that when the data supply type has been moved away from "unknown", that is when other threads
-    will determine that the node is available for data delivery and the data buffer connectors can be
-    initialized. Therefore, it's important that it is set after the data supply has been initialized.
-    */
-    if ((pJob->data.resourceManager.loadDataBufferNode.flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_DECODE) != 0) {
-        /*
-        Decoding. This is the complex case because we're not going to be doing the entire decoding
-        process here. Instead it's going to be split of multiple jobs and loaded in pages. The
-        reason for this is to evenly distribute decoding time across multiple sounds, rather than
-        having one huge sound hog all the available processing resources.
-
-        The first thing we do is initialize a decoder. This is allocated on the heap and is passed
-        around to the paging jobs. When the last paging job has completed it's processing, it'll
-        free the decoder for us.
-
-        This job does not do any actual decoding. It instead just posts a PAGE_DATA_BUFFER_NODE job
-        which is where the actual decoding work will be done. However, once this job is complete,
-        the node will be in a state where data buffer connectors can be initialized.
-        */
-        ma_decoder* pDecoder;   /* <-- Free'd on the last page decode. */
-        ma_job pageDataBufferNodeJob;
-
-        /* Allocate the decoder by initializing a decoded data supply. */
-        result = ma_resource_manager_data_buffer_node_init_supply_decoded(pResourceManager, pDataBufferNode, pJob->data.resourceManager.loadDataBufferNode.pFilePath, pJob->data.resourceManager.loadDataBufferNode.pFilePathW, pJob->data.resourceManager.loadDataBufferNode.flags, &pDecoder);
-
-        /*
-        Don't ever propagate an MA_BUSY result code or else the resource manager will think the
-        node is just busy decoding rather than in an error state. This should never happen, but
-        including this logic for safety just in case.
-        */
-        if (result == MA_BUSY) {
-            result  = MA_ERROR;
-        }
-
-        if (result != MA_SUCCESS) {
-            if (pJob->data.resourceManager.loadDataBufferNode.pFilePath != NULL) {
-                ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_WARNING, "Failed to initialize data supply for \"%s\". %s.\n", pJob->data.resourceManager.loadDataBufferNode.pFilePath, ma_result_description(result));
-            } else {
-                #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(_MSC_VER)
-                    ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_WARNING, "Failed to initialize data supply for \"%ls\", %s.\n", pJob->data.resourceManager.loadDataBufferNode.pFilePathW, ma_result_description(result));
-                #endif
-            }
-
-            goto done;
-        }
-
-        /*
-        At this point the node's data supply is initialized and other threads can start initializing
-        their data buffer connectors. However, no data will actually be available until we start to
-        actually decode it. To do this, we need to post a paging job which is where the decoding
-        work is done.
-
-        Note that if an error occurred at an earlier point, this section will have been skipped.
-        */
-        pageDataBufferNodeJob = ma_job_init(MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_BUFFER_NODE);
-        pageDataBufferNodeJob.order = ma_resource_manager_data_buffer_node_next_execution_order(pDataBufferNode);
-        pageDataBufferNodeJob.data.resourceManager.pageDataBufferNode.pResourceManager  = pResourceManager;
-        pageDataBufferNodeJob.data.resourceManager.pageDataBufferNode.pDataBufferNode   = pDataBufferNode;
-        pageDataBufferNodeJob.data.resourceManager.pageDataBufferNode.pDecoder          = pDecoder;
-        pageDataBufferNodeJob.data.resourceManager.pageDataBufferNode.pDoneNotification = pJob->data.resourceManager.loadDataBufferNode.pDoneNotification;
-        pageDataBufferNodeJob.data.resourceManager.pageDataBufferNode.pDoneFence        = pJob->data.resourceManager.loadDataBufferNode.pDoneFence;
-
-        /* The job has been set up so it can now be posted. */
-        result = ma_resource_manager_post_job(pResourceManager, &pageDataBufferNodeJob);
-
-        /*
-        When we get here, we want to make sure the result code is set to MA_BUSY. The reason for
-        this is that the result will be copied over to the node's internal result variable. In
-        this case, since the decoding is still in-progress, we need to make sure the result code
-        is set to MA_BUSY.
-        */
-        if (result != MA_SUCCESS) {
-            ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_ERROR, "Failed to post MA_JOB_TYPE_RESOURCE_MANAGER_PAGE_DATA_BUFFER_NODE job. %s\n", ma_result_description(result));
-            ma_decoder_uninit(pDecoder);
-            ma_free(pDecoder, &pResourceManager->config.allocationCallbacks);
-        } else {
-            result = MA_BUSY;
-        }
-    } else {
-        /* No decoding. This is the simple case. We need only read the file content into memory and we're done. */
-        result = ma_resource_manager_data_buffer_node_init_supply_encoded(pResourceManager, pDataBufferNode, pJob->data.resourceManager.loadDataBufferNode.pFilePath, pJob->data.resourceManager.loadDataBufferNode.pFilePathW);
-    }
-
-
-done:
-    /* File paths are no longer needed. */
-    ma_free(pJob->data.resourceManager.loadDataBufferNode.pFilePath,  &pResourceManager->config.allocationCallbacks);
-    ma_free(pJob->data.resourceManager.loadDataBufferNode.pFilePathW, &pResourceManager->config.allocationCallbacks);
-
-    /*
-    We need to set the result to at the very end to ensure no other threads try reading the data before we've fully initialized the object. Other threads
-    are going to be inspecting this variable to determine whether or not they're ready to read data. We can only change the result if it's set to MA_BUSY
-    because otherwise we may be changing away from an error code which would be bad. An example is if the application creates a data buffer, but then
-    immediately deletes it before we've got to this point. In this case, pDataBuffer->result will be MA_UNAVAILABLE, and setting it to MA_SUCCESS or any
-    other error code would cause the buffer to look like it's in a state that it's not.
-    */
-    ma_atomic_compare_and_swap_i32(&pDataBufferNode->result, MA_BUSY, result);
-
-    /* At this point initialization is complete and we can signal the notification if any. */
-    if (pJob->data.resourceManager.loadDataBufferNode.pInitNotification != NULL) {
-        ma_async_notification_signal(pJob->data.resourceManager.loadDataBufferNode.pInitNotification);
-    }
-    if (pJob->data.resourceManager.loadDataBufferNode.pInitFence != NULL) {
-        ma_fence_release(pJob->data.resourceManager.loadDataBufferNode.pInitFence);
-    }
-
-    /* If we have a success result it means we've fully loaded the buffer. This will happen in the non-decoding case. */
-    if (result != MA_BUSY) {
-        if (pJob->data.resourceManager.loadDataBufferNode.pDoneNotification != NULL) {
-            ma_async_notification_signal(pJob->data.resourceManager.loadDataBufferNode.pDoneNotification);
-        }
-        if (pJob->data.resourceManager.loadDataBufferNode.pDoneFence != NULL) {
-            ma_fence_release(pJob->data.resourceManager.loadDataBufferNode.pDoneFence);
-        }
-    }
-
-    /* Increment the node's execution pointer so that the next jobs can be processed. This is how we keep decoding of pages in-order. */
-    ma_atomic_fetch_add_32(&pDataBufferNode->executionPointer, 1);
-
-    /* A busy result should be considered successful from the point of view of the job system. */
-    if (result == MA_BUSY) {
-        result  = MA_SUCCESS;
-    }
-
-    return result;
-}
-
-static ma_result ma_job_process__resource_manager__free_data_buffer_node(ma_job* pJob)
-{
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_buffer_node* pDataBufferNode;
-
-    MA_ASSERT(pJob != NULL);
-
-    pResourceManager = (ma_resource_manager*)pJob->data.resourceManager.freeDataBufferNode.pResourceManager;
-    MA_ASSERT(pResourceManager != NULL);
-
-    pDataBufferNode = (ma_resource_manager_data_buffer_node*)pJob->data.resourceManager.freeDataBufferNode.pDataBufferNode;
-    MA_ASSERT(pDataBufferNode != NULL);
-
-    if (pJob->order != ma_atomic_load_32(&pDataBufferNode->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Out of order. */
-    }
-
-    ma_resource_manager_data_buffer_node_free(pResourceManager, pDataBufferNode);
-
-    /* The event needs to be signalled last. */
-    if (pJob->data.resourceManager.freeDataBufferNode.pDoneNotification != NULL) {
-        ma_async_notification_signal(pJob->data.resourceManager.freeDataBufferNode.pDoneNotification);
-    }
-
-    if (pJob->data.resourceManager.freeDataBufferNode.pDoneFence != NULL) {
-        ma_fence_release(pJob->data.resourceManager.freeDataBufferNode.pDoneFence);
-    }
-
-    ma_atomic_fetch_add_32(&pDataBufferNode->executionPointer, 1);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_job_process__resource_manager__page_data_buffer_node(ma_job* pJob)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_buffer_node* pDataBufferNode;
-
-    MA_ASSERT(pJob != NULL);
-
-    pResourceManager = (ma_resource_manager*)pJob->data.resourceManager.pageDataBufferNode.pResourceManager;
-    MA_ASSERT(pResourceManager != NULL);
-
-    pDataBufferNode = (ma_resource_manager_data_buffer_node*)pJob->data.resourceManager.pageDataBufferNode.pDataBufferNode;
-    MA_ASSERT(pDataBufferNode != NULL);
-
-    if (pJob->order != ma_atomic_load_32(&pDataBufferNode->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Out of order. */
-    }
-
-    /* Don't do any more decoding if the data buffer has started the uninitialization process. */
-    result = ma_resource_manager_data_buffer_node_result(pDataBufferNode);
-    if (result != MA_BUSY) {
-        goto done;
-    }
-
-    /* We're ready to decode the next page. */
-    result = ma_resource_manager_data_buffer_node_decode_next_page(pResourceManager, pDataBufferNode, (ma_decoder*)pJob->data.resourceManager.pageDataBufferNode.pDecoder);
-
-    /*
-    If we have a success code by this point, we want to post another job. We're going to set the
-    result back to MA_BUSY to make it clear that there's still more to load.
-    */
-    if (result == MA_SUCCESS) {
-        ma_job newJob;
-        newJob = *pJob; /* Everything is the same as the input job, except the execution order. */
-        newJob.order = ma_resource_manager_data_buffer_node_next_execution_order(pDataBufferNode);   /* We need a fresh execution order. */
-
-        result = ma_resource_manager_post_job(pResourceManager, &newJob);
-
-        /* Since the sound isn't yet fully decoded we want the status to be set to busy. */
-        if (result == MA_SUCCESS) {
-            result  = MA_BUSY;
-        }
-    }
-
-done:
-    /* If there's still more to decode the result will be set to MA_BUSY. Otherwise we can free the decoder. */
-    if (result != MA_BUSY) {
-        ma_decoder_uninit((ma_decoder*)pJob->data.resourceManager.pageDataBufferNode.pDecoder);
-        ma_free(pJob->data.resourceManager.pageDataBufferNode.pDecoder, &pResourceManager->config.allocationCallbacks);
-    }
-
-    /* If we reached the end we need to treat it as successful. */
-    if (result == MA_AT_END) {
-        result  = MA_SUCCESS;
-    }
-
-    /* Make sure we set the result of node in case some error occurred. */
-    ma_atomic_compare_and_swap_i32(&pDataBufferNode->result, MA_BUSY, result);
-
-    /* Signal the notification after setting the result in case the notification callback wants to inspect the result code. */
-    if (result != MA_BUSY) {
-        if (pJob->data.resourceManager.pageDataBufferNode.pDoneNotification != NULL) {
-            ma_async_notification_signal(pJob->data.resourceManager.pageDataBufferNode.pDoneNotification);
-        }
-
-        if (pJob->data.resourceManager.pageDataBufferNode.pDoneFence != NULL) {
-            ma_fence_release(pJob->data.resourceManager.pageDataBufferNode.pDoneFence);
-        }
-    }
-
-    ma_atomic_fetch_add_32(&pDataBufferNode->executionPointer, 1);
-    return result;
-}
-
-
-static ma_result ma_job_process__resource_manager__load_data_buffer(ma_job* pJob)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_buffer* pDataBuffer;
-    ma_resource_manager_data_supply_type dataSupplyType = ma_resource_manager_data_supply_type_unknown;
-    ma_bool32 isConnectorInitialized = MA_FALSE;
-
-    /*
-    All we're doing here is checking if the node has finished loading. If not, we just re-post the job
-    and keep waiting. Otherwise we increment the execution counter and set the buffer's result code.
-    */
-    MA_ASSERT(pJob != NULL);
-
-    pDataBuffer = (ma_resource_manager_data_buffer*)pJob->data.resourceManager.loadDataBuffer.pDataBuffer;
-    MA_ASSERT(pDataBuffer != NULL);
-
-    pResourceManager = pDataBuffer->pResourceManager;
-
-    if (pJob->order != ma_atomic_load_32(&pDataBuffer->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Attempting to execute out of order. Probably interleaved with a MA_JOB_TYPE_RESOURCE_MANAGER_FREE_DATA_BUFFER job. */
-    }
-
-    /*
-    First thing we need to do is check whether or not the data buffer is getting deleted. If so we
-    just abort, but making sure we increment the execution pointer.
-    */
-    result = ma_resource_manager_data_buffer_result(pDataBuffer);
-    if (result != MA_BUSY) {
-        goto done;  /* <-- This will ensure the exucution pointer is incremented. */
-    } else {
-        result = MA_SUCCESS;    /* <-- Make sure this is reset. */
-    }
-
-    /* Try initializing the connector if we haven't already. */
-    isConnectorInitialized = ma_resource_manager_data_buffer_has_connector(pDataBuffer);
-    if (isConnectorInitialized == MA_FALSE) {
-        dataSupplyType = ma_resource_manager_data_buffer_node_get_data_supply_type(pDataBuffer->pNode);
-
-        if (dataSupplyType != ma_resource_manager_data_supply_type_unknown) {
-            /* We can now initialize the connector. If this fails, we need to abort. It's very rare for this to fail. */
-            ma_resource_manager_data_source_config dataSourceConfig;    /* For setting initial looping state and range. */
-            dataSourceConfig = ma_resource_manager_data_source_config_init();
-            dataSourceConfig.rangeBegInPCMFrames     = pJob->data.resourceManager.loadDataBuffer.rangeBegInPCMFrames;
-            dataSourceConfig.rangeEndInPCMFrames     = pJob->data.resourceManager.loadDataBuffer.rangeEndInPCMFrames;
-            dataSourceConfig.loopPointBegInPCMFrames = pJob->data.resourceManager.loadDataBuffer.loopPointBegInPCMFrames;
-            dataSourceConfig.loopPointEndInPCMFrames = pJob->data.resourceManager.loadDataBuffer.loopPointEndInPCMFrames;
-            dataSourceConfig.isLooping               = pJob->data.resourceManager.loadDataBuffer.isLooping;
-
-            result = ma_resource_manager_data_buffer_init_connector(pDataBuffer, &dataSourceConfig, pJob->data.resourceManager.loadDataBuffer.pInitNotification, pJob->data.resourceManager.loadDataBuffer.pInitFence);
-            if (result != MA_SUCCESS) {
-                ma_log_postf(ma_resource_manager_get_log(pResourceManager), MA_LOG_LEVEL_ERROR, "Failed to initialize connector for data buffer. %s.\n", ma_result_description(result));
-                goto done;
-            }
-        } else {
-            /* Don't have a known data supply type. Most likely the data buffer node is still loading, but it could be that an error occurred. */
-        }
-    } else {
-        /* The connector is already initialized. Nothing to do here. */
-    }
-
-    /*
-    If the data node is still loading, we need to repost the job and *not* increment the execution
-    pointer (i.e. we need to not fall through to the "done" label).
-
-    There is a hole between here and the where the data connector is initialized where the data
-    buffer node may have finished initializing. We need to check for this by checking the result of
-    the data buffer node and whether or not we had an unknown data supply type at the time of
-    trying to initialize the data connector.
-    */
-    result = ma_resource_manager_data_buffer_node_result(pDataBuffer->pNode);
-    if (result == MA_BUSY || (result == MA_SUCCESS && isConnectorInitialized == MA_FALSE && dataSupplyType == ma_resource_manager_data_supply_type_unknown)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);
-    }
-
-done:
-    /* Only move away from a busy code so that we don't trash any existing error codes. */
-    ma_atomic_compare_and_swap_i32(&pDataBuffer->result, MA_BUSY, result);
-
-    /* Only signal the other threads after the result has been set just for cleanliness sake. */
-    if (pJob->data.resourceManager.loadDataBuffer.pDoneNotification != NULL) {
-        ma_async_notification_signal(pJob->data.resourceManager.loadDataBuffer.pDoneNotification);
-    }
-    if (pJob->data.resourceManager.loadDataBuffer.pDoneFence != NULL) {
-        ma_fence_release(pJob->data.resourceManager.loadDataBuffer.pDoneFence);
-    }
-
-    /*
-    If at this point the data buffer has not had it's connector initialized, it means the
-    notification event was never signalled which means we need to signal it here.
-    */
-    if (ma_resource_manager_data_buffer_has_connector(pDataBuffer) == MA_FALSE && result != MA_SUCCESS) {
-        if (pJob->data.resourceManager.loadDataBuffer.pInitNotification != NULL) {
-            ma_async_notification_signal(pJob->data.resourceManager.loadDataBuffer.pInitNotification);
-        }
-        if (pJob->data.resourceManager.loadDataBuffer.pInitFence != NULL) {
-            ma_fence_release(pJob->data.resourceManager.loadDataBuffer.pInitFence);
-        }
-    }
-
-    ma_atomic_fetch_add_32(&pDataBuffer->executionPointer, 1);
-    return result;
-}
-
-static ma_result ma_job_process__resource_manager__free_data_buffer(ma_job* pJob)
-{
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_buffer* pDataBuffer;
-
-    MA_ASSERT(pJob != NULL);
-
-    pDataBuffer = (ma_resource_manager_data_buffer*)pJob->data.resourceManager.freeDataBuffer.pDataBuffer;
-    MA_ASSERT(pDataBuffer != NULL);
-
-    pResourceManager = pDataBuffer->pResourceManager;
-
-    if (pJob->order != ma_atomic_load_32(&pDataBuffer->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Out of order. */
-    }
-
-    ma_resource_manager_data_buffer_uninit_internal(pDataBuffer);
-
-    /* The event needs to be signalled last. */
-    if (pJob->data.resourceManager.freeDataBuffer.pDoneNotification != NULL) {
-        ma_async_notification_signal(pJob->data.resourceManager.freeDataBuffer.pDoneNotification);
-    }
-
-    if (pJob->data.resourceManager.freeDataBuffer.pDoneFence != NULL) {
-        ma_fence_release(pJob->data.resourceManager.freeDataBuffer.pDoneFence);
-    }
-
-    ma_atomic_fetch_add_32(&pDataBuffer->executionPointer, 1);
-    return MA_SUCCESS;
-}
-
-static ma_result ma_job_process__resource_manager__load_data_stream(ma_job* pJob)
-{
-    ma_result result = MA_SUCCESS;
-    ma_decoder_config decoderConfig;
-    ma_uint32 pageBufferSizeInBytes;
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_stream* pDataStream;
-
-    MA_ASSERT(pJob != NULL);
-
-    pDataStream = (ma_resource_manager_data_stream*)pJob->data.resourceManager.loadDataStream.pDataStream;
-    MA_ASSERT(pDataStream != NULL);
-
-    pResourceManager = pDataStream->pResourceManager;
-
-    if (pJob->order != ma_atomic_load_32(&pDataStream->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Out of order. */
-    }
-
-    if (ma_resource_manager_data_stream_result(pDataStream) != MA_BUSY) {
-        result = MA_INVALID_OPERATION;  /* Most likely the data stream is being uninitialized. */
-        goto done;
-    }
-
-    /* We need to initialize the decoder first so we can determine the size of the pages. */
-    decoderConfig = ma_resource_manager__init_decoder_config(pResourceManager);
-
-    if (pJob->data.resourceManager.loadDataStream.pFilePath != NULL) {
-        result = ma_decoder_init_vfs(pResourceManager->config.pVFS, pJob->data.resourceManager.loadDataStream.pFilePath, &decoderConfig, &pDataStream->decoder);
-    } else {
-        result = ma_decoder_init_vfs_w(pResourceManager->config.pVFS, pJob->data.resourceManager.loadDataStream.pFilePathW, &decoderConfig, &pDataStream->decoder);
-    }
-    if (result != MA_SUCCESS) {
-        goto done;
-    }
-
-    /* Retrieve the total length of the file before marking the decoder as loaded. */
-    if ((pDataStream->flags & MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_UNKNOWN_LENGTH) == 0) {
-        result = ma_decoder_get_length_in_pcm_frames(&pDataStream->decoder, &pDataStream->totalLengthInPCMFrames);
-        if (result != MA_SUCCESS) {
-            goto done;  /* Failed to retrieve the length. */
-        }
-    } else {
-        pDataStream->totalLengthInPCMFrames = 0;
-    }
-
-    /*
-    Only mark the decoder as initialized when the length of the decoder has been retrieved because that can possibly require a scan over the whole file
-    and we don't want to have another thread trying to access the decoder while it's scanning.
-    */
-    pDataStream->isDecoderInitialized = MA_TRUE;
-
-    /* We have the decoder so we can now initialize our page buffer. */
-    pageBufferSizeInBytes = ma_resource_manager_data_stream_get_page_size_in_frames(pDataStream) * 2 * ma_get_bytes_per_frame(pDataStream->decoder.outputFormat, pDataStream->decoder.outputChannels);
-
-    pDataStream->pPageData = ma_malloc(pageBufferSizeInBytes, &pResourceManager->config.allocationCallbacks);
-    if (pDataStream->pPageData == NULL) {
-        ma_decoder_uninit(&pDataStream->decoder);
-        result = MA_OUT_OF_MEMORY;
-        goto done;
-    }
-
-    /* Seek to our initial seek point before filling the initial pages. */
-    ma_decoder_seek_to_pcm_frame(&pDataStream->decoder, pJob->data.resourceManager.loadDataStream.initialSeekPoint);
-
-    /* We have our decoder and our page buffer, so now we need to fill our pages. */
-    ma_resource_manager_data_stream_fill_pages(pDataStream);
-
-    /* And now we're done. We want to make sure the result is MA_SUCCESS. */
-    result = MA_SUCCESS;
-
-done:
-    ma_free(pJob->data.resourceManager.loadDataStream.pFilePath,  &pResourceManager->config.allocationCallbacks);
-    ma_free(pJob->data.resourceManager.loadDataStream.pFilePathW, &pResourceManager->config.allocationCallbacks);
-
-    /* We can only change the status away from MA_BUSY. If it's set to anything else it means an error has occurred somewhere or the uninitialization process has started (most likely). */
-    ma_atomic_compare_and_swap_i32(&pDataStream->result, MA_BUSY, result);
-
-    /* Only signal the other threads after the result has been set just for cleanliness sake. */
-    if (pJob->data.resourceManager.loadDataStream.pInitNotification != NULL) {
-        ma_async_notification_signal(pJob->data.resourceManager.loadDataStream.pInitNotification);
-    }
-    if (pJob->data.resourceManager.loadDataStream.pInitFence != NULL) {
-        ma_fence_release(pJob->data.resourceManager.loadDataStream.pInitFence);
-    }
-
-    ma_atomic_fetch_add_32(&pDataStream->executionPointer, 1);
-    return result;
-}
-
-static ma_result ma_job_process__resource_manager__free_data_stream(ma_job* pJob)
-{
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_stream* pDataStream;
-
-    MA_ASSERT(pJob != NULL);
-
-    pDataStream = (ma_resource_manager_data_stream*)pJob->data.resourceManager.freeDataStream.pDataStream;
-    MA_ASSERT(pDataStream != NULL);
-
-    pResourceManager = pDataStream->pResourceManager;
-
-    if (pJob->order != ma_atomic_load_32(&pDataStream->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Out of order. */
-    }
-
-    /* If our status is not MA_UNAVAILABLE we have a bug somewhere. */
-    MA_ASSERT(ma_resource_manager_data_stream_result(pDataStream) == MA_UNAVAILABLE);
-
-    if (pDataStream->isDecoderInitialized) {
-        ma_decoder_uninit(&pDataStream->decoder);
-    }
-
-    if (pDataStream->pPageData != NULL) {
-        ma_free(pDataStream->pPageData, &pResourceManager->config.allocationCallbacks);
-        pDataStream->pPageData = NULL;  /* Just in case... */
-    }
-
-    ma_data_source_uninit(&pDataStream->ds);
-
-    /* The event needs to be signalled last. */
-    if (pJob->data.resourceManager.freeDataStream.pDoneNotification != NULL) {
-        ma_async_notification_signal(pJob->data.resourceManager.freeDataStream.pDoneNotification);
-    }
-    if (pJob->data.resourceManager.freeDataStream.pDoneFence != NULL) {
-        ma_fence_release(pJob->data.resourceManager.freeDataStream.pDoneFence);
-    }
-
-    /*ma_atomic_fetch_add_32(&pDataStream->executionPointer, 1);*/
-    return MA_SUCCESS;
-}
-
-static ma_result ma_job_process__resource_manager__page_data_stream(ma_job* pJob)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_stream* pDataStream;
-
-    MA_ASSERT(pJob != NULL);
-
-    pDataStream = (ma_resource_manager_data_stream*)pJob->data.resourceManager.pageDataStream.pDataStream;
-    MA_ASSERT(pDataStream != NULL);
-
-    pResourceManager = pDataStream->pResourceManager;
-
-    if (pJob->order != ma_atomic_load_32(&pDataStream->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Out of order. */
-    }
-
-    /* For streams, the status should be MA_SUCCESS. */
-    if (ma_resource_manager_data_stream_result(pDataStream) != MA_SUCCESS) {
-        result = MA_INVALID_OPERATION;
-        goto done;
-    }
-
-    ma_resource_manager_data_stream_fill_page(pDataStream, pJob->data.resourceManager.pageDataStream.pageIndex);
-
-done:
-    ma_atomic_fetch_add_32(&pDataStream->executionPointer, 1);
-    return result;
-}
-
-static ma_result ma_job_process__resource_manager__seek_data_stream(ma_job* pJob)
-{
-    ma_result result = MA_SUCCESS;
-    ma_resource_manager* pResourceManager;
-    ma_resource_manager_data_stream* pDataStream;
-
-    MA_ASSERT(pJob != NULL);
-
-    pDataStream = (ma_resource_manager_data_stream*)pJob->data.resourceManager.seekDataStream.pDataStream;
-    MA_ASSERT(pDataStream != NULL);
-
-    pResourceManager = pDataStream->pResourceManager;
-
-    if (pJob->order != ma_atomic_load_32(&pDataStream->executionPointer)) {
-        return ma_resource_manager_post_job(pResourceManager, pJob);    /* Out of order. */
-    }
-
-    /* For streams the status should be MA_SUCCESS for this to do anything. */
-    if (ma_resource_manager_data_stream_result(pDataStream) != MA_SUCCESS || pDataStream->isDecoderInitialized == MA_FALSE) {
-        result = MA_INVALID_OPERATION;
-        goto done;
-    }
-
-    /*
-    With seeking we just assume both pages are invalid and the relative frame cursor at position 0. This is basically exactly the same as loading, except
-    instead of initializing the decoder, we seek to a frame.
-    */
-    ma_decoder_seek_to_pcm_frame(&pDataStream->decoder, pJob->data.resourceManager.seekDataStream.frameIndex);
-
-    /* After seeking we'll need to reload the pages. */
-    ma_resource_manager_data_stream_fill_pages(pDataStream);
-
-    /* We need to let the public API know that we're done seeking. */
-    ma_atomic_fetch_sub_32(&pDataStream->seekCounter, 1);
-
-done:
-    ma_atomic_fetch_add_32(&pDataStream->executionPointer, 1);
-    return result;
-}
-
-MA_API ma_result ma_resource_manager_process_job(ma_resource_manager* pResourceManager, ma_job* pJob)
-{
-    if (pResourceManager == NULL || pJob == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_job_process(pJob);
-}
-
-MA_API ma_result ma_resource_manager_process_next_job(ma_resource_manager* pResourceManager)
-{
-    ma_result result;
-    ma_job job;
-
-    if (pResourceManager == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* This will return MA_CANCELLED if the next job is a quit job. */
-    result = ma_resource_manager_next_job(pResourceManager, &job);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return ma_job_process(&job);
-}
-#else
-/* We'll get here if the resource manager is being excluded from the build. We need to define the job processing callbacks as no-ops. */
-static ma_result ma_job_process__resource_manager__load_data_buffer_node(ma_job* pJob) { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__free_data_buffer_node(ma_job* pJob) { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__page_data_buffer_node(ma_job* pJob) { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__load_data_buffer(ma_job* pJob)      { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__free_data_buffer(ma_job* pJob)      { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__load_data_stream(ma_job* pJob)      { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__free_data_stream(ma_job* pJob)      { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__page_data_stream(ma_job* pJob)      { return ma_job_process__noop(pJob); }
-static ma_result ma_job_process__resource_manager__seek_data_stream(ma_job* pJob)      { return ma_job_process__noop(pJob); }
-#endif  /* MA_NO_RESOURCE_MANAGER */
-
-
-#ifndef MA_NO_NODE_GRAPH
-/* 10ms @ 48K = 480. Must never exceed 65535. */
-#ifndef MA_DEFAULT_NODE_CACHE_CAP_IN_FRAMES_PER_BUS
-#define MA_DEFAULT_NODE_CACHE_CAP_IN_FRAMES_PER_BUS 480
-#endif
-
-
-static ma_result ma_node_read_pcm_frames(ma_node* pNode, ma_uint32 outputBusIndex, float* pFramesOut, ma_uint32 frameCount, ma_uint32* pFramesRead, ma_uint64 globalTime);
-
-MA_API void ma_debug_fill_pcm_frames_with_sine_wave(float* pFramesOut, ma_uint32 frameCount, ma_format format, ma_uint32 channels, ma_uint32 sampleRate)
-{
-    #ifndef MA_NO_GENERATION
-    {
-        ma_waveform_config waveformConfig;
-        ma_waveform waveform;
-
-        waveformConfig = ma_waveform_config_init(format, channels, sampleRate, ma_waveform_type_sine, 1.0, 400);
-        ma_waveform_init(&waveformConfig, &waveform);
-        ma_waveform_read_pcm_frames(&waveform, pFramesOut, frameCount, NULL);
-    }
-    #else
-    {
-        (void)pFramesOut;
-        (void)frameCount;
-        (void)format;
-        (void)channels;
-        (void)sampleRate;
-        #if defined(MA_DEBUG_OUTPUT)
-        {
-            #if _MSC_VER
-                #pragma message ("ma_debug_fill_pcm_frames_with_sine_wave() will do nothing because MA_NO_GENERATION is enabled.")
-            #endif
-        }
-        #endif
-    }
-    #endif
-}
-
-
-
-MA_API ma_node_graph_config ma_node_graph_config_init(ma_uint32 channels)
-{
-    ma_node_graph_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.channels             = channels;
-    config.nodeCacheCapInFrames = MA_DEFAULT_NODE_CACHE_CAP_IN_FRAMES_PER_BUS;
-
-    return config;
-}
-
-
-static void ma_node_graph_set_is_reading(ma_node_graph* pNodeGraph, ma_bool32 isReading)
-{
-    MA_ASSERT(pNodeGraph != NULL);
-    ma_atomic_exchange_32(&pNodeGraph->isReading, isReading);
-}
-
-#if 0
-static ma_bool32 ma_node_graph_is_reading(ma_node_graph* pNodeGraph)
-{
-    MA_ASSERT(pNodeGraph != NULL);
-    return ma_atomic_load_32(&pNodeGraph->isReading);
-}
-#endif
-
-
-static void ma_node_graph_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_node_graph* pNodeGraph = (ma_node_graph*)pNode;
-    ma_uint64 framesRead;
-
-    ma_node_graph_read_pcm_frames(pNodeGraph, ppFramesOut[0], *pFrameCountOut, &framesRead);
-
-    *pFrameCountOut = (ma_uint32)framesRead;    /* Safe cast. */
-
-    (void)ppFramesIn;
-    (void)pFrameCountIn;
-}
-
-static ma_node_vtable g_node_graph_node_vtable =
-{
-    ma_node_graph_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    0,      /* 0 input buses. */
-    1,      /* 1 output bus. */
-    0       /* Flags. */
-};
-
-static void ma_node_graph_endpoint_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    MA_ASSERT(pNode != NULL);
-    MA_ASSERT(ma_node_get_input_bus_count(pNode)  == 1);
-    MA_ASSERT(ma_node_get_output_bus_count(pNode) == 1);
-
-    /* Input channel count needs to be the same as the output channel count. */
-    MA_ASSERT(ma_node_get_input_channels(pNode, 0) == ma_node_get_output_channels(pNode, 0));
-
-    /* We don't need to do anything here because it's a passthrough. */
-    (void)pNode;
-    (void)ppFramesIn;
-    (void)pFrameCountIn;
-    (void)ppFramesOut;
-    (void)pFrameCountOut;
-
-#if 0
-    /* The data has already been mixed. We just need to move it to the output buffer. */
-    if (ppFramesIn != NULL) {
-        ma_copy_pcm_frames(ppFramesOut[0], ppFramesIn[0], *pFrameCountOut, ma_format_f32, ma_node_get_output_channels(pNode, 0));
-    }
-#endif
-}
-
-static ma_node_vtable g_node_graph_endpoint_vtable =
-{
-    ma_node_graph_endpoint_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* 1 input bus. */
-    1,      /* 1 output bus. */
-    MA_NODE_FLAG_PASSTHROUGH    /* Flags. The endpoint is a passthrough. */
-};
-
-MA_API ma_result ma_node_graph_init(const ma_node_graph_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_node_graph* pNodeGraph)
-{
-    ma_result result;
-    ma_node_config baseConfig;
-    ma_node_config endpointConfig;
-
-    if (pNodeGraph == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNodeGraph);
-    pNodeGraph->nodeCacheCapInFrames = pConfig->nodeCacheCapInFrames;
-    if (pNodeGraph->nodeCacheCapInFrames == 0) {
-        pNodeGraph->nodeCacheCapInFrames = MA_DEFAULT_NODE_CACHE_CAP_IN_FRAMES_PER_BUS;
-    }
-
-
-    /* Base node so we can use the node graph as a node into another graph. */
-    baseConfig = ma_node_config_init();
-    baseConfig.vtable = &g_node_graph_node_vtable;
-    baseConfig.pOutputChannels = &pConfig->channels;
-
-    result = ma_node_init(pNodeGraph, &baseConfig, pAllocationCallbacks, &pNodeGraph->base);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-
-    /* Endpoint. */
-    endpointConfig = ma_node_config_init();
-    endpointConfig.vtable          = &g_node_graph_endpoint_vtable;
-    endpointConfig.pInputChannels  = &pConfig->channels;
-    endpointConfig.pOutputChannels = &pConfig->channels;
-
-    result = ma_node_init(pNodeGraph, &endpointConfig, pAllocationCallbacks, &pNodeGraph->endpoint);
-    if (result != MA_SUCCESS) {
-        ma_node_uninit(&pNodeGraph->base, pAllocationCallbacks);
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_node_graph_uninit(ma_node_graph* pNodeGraph, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pNodeGraph == NULL) {
-        return;
-    }
-
-    ma_node_uninit(&pNodeGraph->endpoint, pAllocationCallbacks);
-}
-
-MA_API ma_node* ma_node_graph_get_endpoint(ma_node_graph* pNodeGraph)
-{
-    if (pNodeGraph == NULL) {
-        return NULL;
-    }
-
-    return &pNodeGraph->endpoint;
-}
-
-MA_API ma_result ma_node_graph_read_pcm_frames(ma_node_graph* pNodeGraph, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint64 totalFramesRead;
-    ma_uint32 channels;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;   /* Safety. */
-    }
-
-    if (pNodeGraph == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    channels = ma_node_get_output_channels(&pNodeGraph->endpoint, 0);
-
-
-    /* We'll be nice and try to do a full read of all frameCount frames. */
-    totalFramesRead = 0;
-    while (totalFramesRead < frameCount) {
-        ma_uint32 framesJustRead;
-        ma_uint64 framesToRead = frameCount - totalFramesRead;
-
-        if (framesToRead > 0xFFFFFFFF) {
-            framesToRead = 0xFFFFFFFF;
-        }
-
-        ma_node_graph_set_is_reading(pNodeGraph, MA_TRUE);
-        {
-            result = ma_node_read_pcm_frames(&pNodeGraph->endpoint, 0, (float*)ma_offset_pcm_frames_ptr(pFramesOut, totalFramesRead, ma_format_f32, channels), (ma_uint32)framesToRead, &framesJustRead, ma_node_get_time(&pNodeGraph->endpoint));
-        }
-        ma_node_graph_set_is_reading(pNodeGraph, MA_FALSE);
-
-        totalFramesRead += framesJustRead;
-
-        if (result != MA_SUCCESS) {
-            break;
-        }
-
-        /* Abort if we weren't able to read any frames or else we risk getting stuck in a loop. */
-        if (framesJustRead == 0) {
-            break;
-        }
-    }
-
-    /* Let's go ahead and silence any leftover frames just for some added safety to ensure the caller doesn't try emitting garbage out of the speakers. */
-    if (totalFramesRead < frameCount) {
-        ma_silence_pcm_frames(ma_offset_pcm_frames_ptr(pFramesOut, totalFramesRead, ma_format_f32, channels), (frameCount - totalFramesRead), ma_format_f32, channels);
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = totalFramesRead;
-    }
-
-    return result;
-}
-
-MA_API ma_uint32 ma_node_graph_get_channels(const ma_node_graph* pNodeGraph)
-{
-    if (pNodeGraph == NULL) {
-        return 0;
-    }
-
-    return ma_node_get_output_channels(&pNodeGraph->endpoint, 0);
-}
-
-MA_API ma_uint64 ma_node_graph_get_time(const ma_node_graph* pNodeGraph)
-{
-    if (pNodeGraph == NULL) {
-        return 0;
-    }
-
-    return ma_node_get_time(&pNodeGraph->endpoint); /* Global time is just the local time of the endpoint. */
-}
-
-MA_API ma_result ma_node_graph_set_time(ma_node_graph* pNodeGraph, ma_uint64 globalTime)
-{
-    if (pNodeGraph == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_node_set_time(&pNodeGraph->endpoint, globalTime); /* Global time is just the local time of the endpoint. */
-}
-
-
-#define MA_NODE_OUTPUT_BUS_FLAG_HAS_READ    0x01    /* Whether or not this bus ready to read more data. Only used on nodes with multiple output buses. */
-
-static ma_result ma_node_output_bus_init(ma_node* pNode, ma_uint32 outputBusIndex, ma_uint32 channels, ma_node_output_bus* pOutputBus)
-{
-    MA_ASSERT(pOutputBus != NULL);
-    MA_ASSERT(outputBusIndex < MA_MAX_NODE_BUS_COUNT);
-    MA_ASSERT(outputBusIndex < ma_node_get_output_bus_count(pNode));
-    MA_ASSERT(channels < 256);
-
-    MA_ZERO_OBJECT(pOutputBus);
-
-    if (channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pOutputBus->pNode          = pNode;
-    pOutputBus->outputBusIndex = (ma_uint8)outputBusIndex;
-    pOutputBus->channels       = (ma_uint8)channels;
-    pOutputBus->flags          = MA_NODE_OUTPUT_BUS_FLAG_HAS_READ; /* <-- Important that this flag is set by default. */
-    pOutputBus->volume         = 1;
-
-    return MA_SUCCESS;
-}
-
-static void ma_node_output_bus_lock(ma_node_output_bus* pOutputBus)
-{
-    ma_spinlock_lock(&pOutputBus->lock);
-}
-
-static void ma_node_output_bus_unlock(ma_node_output_bus* pOutputBus)
-{
-    ma_spinlock_unlock(&pOutputBus->lock);
-}
-
-
-static ma_uint32 ma_node_output_bus_get_channels(const ma_node_output_bus* pOutputBus)
-{
-    return pOutputBus->channels;
-}
-
-
-static void ma_node_output_bus_set_has_read(ma_node_output_bus* pOutputBus, ma_bool32 hasRead)
-{
-    if (hasRead) {
-        ma_atomic_fetch_or_32(&pOutputBus->flags, MA_NODE_OUTPUT_BUS_FLAG_HAS_READ);
-    } else {
-        ma_atomic_fetch_and_32(&pOutputBus->flags, (ma_uint32)~MA_NODE_OUTPUT_BUS_FLAG_HAS_READ);
-    }
-}
-
-static ma_bool32 ma_node_output_bus_has_read(ma_node_output_bus* pOutputBus)
-{
-    return (ma_atomic_load_32(&pOutputBus->flags) & MA_NODE_OUTPUT_BUS_FLAG_HAS_READ) != 0;
-}
-
-
-static void ma_node_output_bus_set_is_attached(ma_node_output_bus* pOutputBus, ma_bool32 isAttached)
-{
-    ma_atomic_exchange_32(&pOutputBus->isAttached, isAttached);
-}
-
-static ma_bool32 ma_node_output_bus_is_attached(ma_node_output_bus* pOutputBus)
-{
-    return ma_atomic_load_32(&pOutputBus->isAttached);
-}
-
-
-static ma_result ma_node_output_bus_set_volume(ma_node_output_bus* pOutputBus, float volume)
-{
-    MA_ASSERT(pOutputBus != NULL);
-
-    if (volume < 0.0f) {
-        volume = 0.0f;
-    }
-
-    ma_atomic_exchange_f32(&pOutputBus->volume, volume);
-
-    return MA_SUCCESS;
-}
-
-static float ma_node_output_bus_get_volume(const ma_node_output_bus* pOutputBus)
-{
-    return ma_atomic_load_f32((float*)&pOutputBus->volume);
-}
-
-
-static ma_result ma_node_input_bus_init(ma_uint32 channels, ma_node_input_bus* pInputBus)
-{
-    MA_ASSERT(pInputBus != NULL);
-    MA_ASSERT(channels < 256);
-
-    MA_ZERO_OBJECT(pInputBus);
-
-    if (channels == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pInputBus->channels = (ma_uint8)channels;
-
-    return MA_SUCCESS;
-}
-
-static void ma_node_input_bus_lock(ma_node_input_bus* pInputBus)
-{
-    MA_ASSERT(pInputBus != NULL);
-
-    ma_spinlock_lock(&pInputBus->lock);
-}
-
-static void ma_node_input_bus_unlock(ma_node_input_bus* pInputBus)
-{
-    MA_ASSERT(pInputBus != NULL);
-
-    ma_spinlock_unlock(&pInputBus->lock);
-}
-
-
-static void ma_node_input_bus_next_begin(ma_node_input_bus* pInputBus)
-{
-    ma_atomic_fetch_add_32(&pInputBus->nextCounter, 1);
-}
-
-static void ma_node_input_bus_next_end(ma_node_input_bus* pInputBus)
-{
-    ma_atomic_fetch_sub_32(&pInputBus->nextCounter, 1);
-}
-
-static ma_uint32 ma_node_input_bus_get_next_counter(ma_node_input_bus* pInputBus)
-{
-    return ma_atomic_load_32(&pInputBus->nextCounter);
-}
-
-
-static ma_uint32 ma_node_input_bus_get_channels(const ma_node_input_bus* pInputBus)
-{
-    return pInputBus->channels;
-}
-
-
-static void ma_node_input_bus_detach__no_output_bus_lock(ma_node_input_bus* pInputBus, ma_node_output_bus* pOutputBus)
-{
-    MA_ASSERT(pInputBus  != NULL);
-    MA_ASSERT(pOutputBus != NULL);
-
-    /*
-    Mark the output bus as detached first. This will prevent future iterations on the audio thread
-    from iterating this output bus.
-    */
-    ma_node_output_bus_set_is_attached(pOutputBus, MA_FALSE);
-
-    /*
-    We cannot use the output bus lock here since it'll be getting used at a higher level, but we do
-    still need to use the input bus lock since we'll be updating pointers on two different output
-    buses. The same rules apply here as the attaching case. Although we're using a lock here, we're
-    *not* using a lock when iterating over the list in the audio thread. We therefore need to craft
-    this in a way such that the iteration on the audio thread doesn't break.
-
-    The the first thing to do is swap out the "next" pointer of the previous output bus with the
-    new "next" output bus. This is the operation that matters for iteration on the audio thread.
-    After that, the previous pointer on the new "next" pointer needs to be updated, after which
-    point the linked list will be in a good state.
-    */
-    ma_node_input_bus_lock(pInputBus);
-    {
-        ma_node_output_bus* pOldPrev = (ma_node_output_bus*)ma_atomic_load_ptr(&pOutputBus->pPrev);
-        ma_node_output_bus* pOldNext = (ma_node_output_bus*)ma_atomic_load_ptr(&pOutputBus->pNext);
-
-        if (pOldPrev != NULL) {
-            ma_atomic_exchange_ptr(&pOldPrev->pNext, pOldNext); /* <-- This is where the output bus is detached from the list. */
-        }
-        if (pOldNext != NULL) {
-            ma_atomic_exchange_ptr(&pOldNext->pPrev, pOldPrev); /* <-- This is required for detachment. */
-        }
-    }
-    ma_node_input_bus_unlock(pInputBus);
-
-    /* At this point the output bus is detached and the linked list is completely unaware of it. Reset some data for safety. */
-    ma_atomic_exchange_ptr(&pOutputBus->pNext, NULL);   /* Using atomic exchanges here, mainly for the benefit of analysis tools which don't always recognize spinlocks. */
-    ma_atomic_exchange_ptr(&pOutputBus->pPrev, NULL);   /* As above. */
-    pOutputBus->pInputNode             = NULL;
-    pOutputBus->inputNodeInputBusIndex = 0;
-
-
-    /*
-    For thread-safety reasons, we don't want to be returning from this straight away. We need to
-    wait for the audio thread to finish with the output bus. There's two things we need to wait
-    for. The first is the part that selects the next output bus in the list, and the other is the
-    part that reads from the output bus. Basically all we're doing is waiting for the input bus
-    to stop referencing the output bus.
-
-    We're doing this part last because we want the section above to run while the audio thread
-    is finishing up with the output bus, just for efficiency reasons. We marked the output bus as
-    detached right at the top of this function which is going to prevent the audio thread from
-    iterating the output bus again.
-    */
-
-    /* Part 1: Wait for the current iteration to complete. */
-    while (ma_node_input_bus_get_next_counter(pInputBus) > 0) {
-        ma_yield();
-    }
-
-    /* Part 2: Wait for any reads to complete. */
-    while (ma_atomic_load_32(&pOutputBus->refCount) > 0) {
-        ma_yield();
-    }
-
-    /*
-    At this point we're done detaching and we can be guaranteed that the audio thread is not going
-    to attempt to reference this output bus again (until attached again).
-    */
-}
-
-#if 0   /* Not used at the moment, but leaving here in case I need it later. */
-static void ma_node_input_bus_detach(ma_node_input_bus* pInputBus, ma_node_output_bus* pOutputBus)
-{
-    MA_ASSERT(pInputBus  != NULL);
-    MA_ASSERT(pOutputBus != NULL);
-
-    ma_node_output_bus_lock(pOutputBus);
-    {
-        ma_node_input_bus_detach__no_output_bus_lock(pInputBus, pOutputBus);
-    }
-    ma_node_output_bus_unlock(pOutputBus);
-}
-#endif
-
-static void ma_node_input_bus_attach(ma_node_input_bus* pInputBus, ma_node_output_bus* pOutputBus, ma_node* pNewInputNode, ma_uint32 inputNodeInputBusIndex)
-{
-    MA_ASSERT(pInputBus  != NULL);
-    MA_ASSERT(pOutputBus != NULL);
-
-    ma_node_output_bus_lock(pOutputBus);
-    {
-        ma_node_output_bus* pOldInputNode = (ma_node_output_bus*)ma_atomic_load_ptr(&pOutputBus->pInputNode);
-
-        /* Detach from any existing attachment first if necessary. */
-        if (pOldInputNode != NULL) {
-            ma_node_input_bus_detach__no_output_bus_lock(pInputBus, pOutputBus);
-        }
-
-        /*
-        At this point we can be sure the output bus is not attached to anything. The linked list in the
-        old input bus has been updated so that pOutputBus will not get iterated again.
-        */
-        pOutputBus->pInputNode             = pNewInputNode;                     /* No need for an atomic assignment here because modification of this variable always happens within a lock. */
-        pOutputBus->inputNodeInputBusIndex = (ma_uint8)inputNodeInputBusIndex;
-
-        /*
-        Now we need to attach the output bus to the linked list. This involves updating two pointers on
-        two different output buses so I'm going to go ahead and keep this simple and just use a lock.
-        There are ways to do this without a lock, but it's just too hard to maintain for it's value.
-
-        Although we're locking here, it's important to remember that we're *not* locking when iterating
-        and reading audio data since that'll be running on the audio thread. As a result we need to be
-        careful how we craft this so that we don't break iteration. What we're going to do is always
-        attach the new item so that it becomes the first item in the list. That way, as we're iterating
-        we won't break any links in the list and iteration will continue safely. The detaching case will
-        also be crafted in a way as to not break list iteration. It's important to remember to use
-        atomic exchanges here since no locking is happening on the audio thread during iteration.
-        */
-        ma_node_input_bus_lock(pInputBus);
-        {
-            ma_node_output_bus* pNewPrev = &pInputBus->head;
-            ma_node_output_bus* pNewNext = (ma_node_output_bus*)ma_atomic_load_ptr(&pInputBus->head.pNext);
-
-            /* Update the local output bus. */
-            ma_atomic_exchange_ptr(&pOutputBus->pPrev, pNewPrev);
-            ma_atomic_exchange_ptr(&pOutputBus->pNext, pNewNext);
-
-            /* Update the other output buses to point back to the local output bus. */
-            ma_atomic_exchange_ptr(&pInputBus->head.pNext, pOutputBus); /* <-- This is where the output bus is actually attached to the input bus. */
-
-            /* Do the previous pointer last. This is only used for detachment. */
-            if (pNewNext != NULL) {
-                ma_atomic_exchange_ptr(&pNewNext->pPrev,  pOutputBus);
-            }
-        }
-        ma_node_input_bus_unlock(pInputBus);
-
-        /*
-        Mark the node as attached last. This is used to controlling whether or the output bus will be
-        iterated on the audio thread. Mainly required for detachment purposes.
-        */
-        ma_node_output_bus_set_is_attached(pOutputBus, MA_TRUE);
-    }
-    ma_node_output_bus_unlock(pOutputBus);
-}
-
-static ma_node_output_bus* ma_node_input_bus_next(ma_node_input_bus* pInputBus, ma_node_output_bus* pOutputBus)
-{
-    ma_node_output_bus* pNext;
-
-    MA_ASSERT(pInputBus != NULL);
-
-    if (pOutputBus == NULL) {
-        return NULL;
-    }
-
-    ma_node_input_bus_next_begin(pInputBus);
-    {
-        pNext = pOutputBus;
-        for (;;) {
-            pNext = (ma_node_output_bus*)ma_atomic_load_ptr(&pNext->pNext);
-            if (pNext == NULL) {
-                break;      /* Reached the end. */
-            }
-
-            if (ma_node_output_bus_is_attached(pNext) == MA_FALSE) {
-                continue;   /* The node is not attached. Keep checking. */
-            }
-
-            /* The next node has been selected. */
-            break;
-        }
-
-        /* We need to increment the reference count of the selected node. */
-        if (pNext != NULL) {
-            ma_atomic_fetch_add_32(&pNext->refCount, 1);
-        }
-
-        /* The previous node is no longer being referenced. */
-        ma_atomic_fetch_sub_32(&pOutputBus->refCount, 1);
-    }
-    ma_node_input_bus_next_end(pInputBus);
-
-    return pNext;
-}
-
-static ma_node_output_bus* ma_node_input_bus_first(ma_node_input_bus* pInputBus)
-{
-    return ma_node_input_bus_next(pInputBus, &pInputBus->head);
-}
-
-
-
-static ma_result ma_node_input_bus_read_pcm_frames(ma_node* pInputNode, ma_node_input_bus* pInputBus, float* pFramesOut, ma_uint32 frameCount, ma_uint32* pFramesRead, ma_uint64 globalTime)
-{
-    ma_result result = MA_SUCCESS;
-    ma_node_output_bus* pOutputBus;
-    ma_node_output_bus* pFirst;
-    ma_uint32 inputChannels;
-    ma_bool32 doesOutputBufferHaveContent = MA_FALSE;
-
-    (void)pInputNode;   /* Not currently used. */
-
-    /*
-    This will be called from the audio thread which means we can't be doing any locking. Basically,
-    this function will not perfom any locking, whereas attaching and detaching will, but crafted in
-    such a way that we don't need to perform any locking here. The important thing to remember is
-    to always iterate in a forward direction.
-
-    In order to process any data we need to first read from all input buses. That's where this
-    function comes in. This iterates over each of the attachments and accumulates/mixes them. We
-    also convert the channels to the nodes output channel count before mixing. We want to do this
-    channel conversion so that the caller of this function can invoke the processing callback
-    without having to do it themselves.
-
-    When we iterate over each of the attachments on the input bus, we need to read as much data as
-    we can from each of them so that we don't end up with holes between each of the attachments. To
-    do this, we need to read from each attachment in a loop and read as many frames as we can, up
-    to `frameCount`.
-    */
-    MA_ASSERT(pInputNode  != NULL);
-    MA_ASSERT(pFramesRead != NULL); /* pFramesRead is critical and must always be specified. On input it's undefined and on output it'll be set to the number of frames actually read. */
-
-    *pFramesRead = 0;   /* Safety. */
-
-    inputChannels = ma_node_input_bus_get_channels(pInputBus);
-
-    /*
-    We need to be careful with how we call ma_node_input_bus_first() and ma_node_input_bus_next(). They
-    are both critical to our lock-free thread-safety system. We can only call ma_node_input_bus_first()
-    once per iteration, however we have an optimization to checks whether or not it's the first item in
-    the list. We therefore need to store a pointer to the first item rather than repeatedly calling
-    ma_node_input_bus_first(). It's safe to keep hold of this pointer, so long as we don't dereference it
-    after calling ma_node_input_bus_next(), which we won't be.
-    */
-    pFirst = ma_node_input_bus_first(pInputBus);
-    if (pFirst == NULL) {
-        return MA_SUCCESS;  /* No attachments. Read nothing. */
-    }
-
-    for (pOutputBus = pFirst; pOutputBus != NULL; pOutputBus = ma_node_input_bus_next(pInputBus, pOutputBus)) {
-        ma_uint32 framesProcessed = 0;
-        ma_bool32 isSilentOutput = MA_FALSE;
-
-        MA_ASSERT(pOutputBus->pNode != NULL);
-        MA_ASSERT(((ma_node_base*)pOutputBus->pNode)->vtable != NULL);
-
-        isSilentOutput = (((ma_node_base*)pOutputBus->pNode)->vtable->flags & MA_NODE_FLAG_SILENT_OUTPUT) != 0;
-
-        if (pFramesOut != NULL) {
-            /* Read. */
-            float temp[MA_DATA_CONVERTER_STACK_BUFFER_SIZE / sizeof(float)];
-            ma_uint32 tempCapInFrames = ma_countof(temp) / inputChannels;
-
-            while (framesProcessed < frameCount) {
-                float* pRunningFramesOut;
-                ma_uint32 framesToRead;
-                ma_uint32 framesJustRead;
-
-                framesToRead = frameCount - framesProcessed;
-                if (framesToRead > tempCapInFrames) {
-                    framesToRead = tempCapInFrames;
-                }
-
-                pRunningFramesOut = ma_offset_pcm_frames_ptr_f32(pFramesOut, framesProcessed, inputChannels);
-
-                if (doesOutputBufferHaveContent == MA_FALSE) {
-                    /* Fast path. First attachment. We just read straight into the output buffer (no mixing required). */
-                    result = ma_node_read_pcm_frames(pOutputBus->pNode, pOutputBus->outputBusIndex, pRunningFramesOut, framesToRead, &framesJustRead, globalTime + framesProcessed);
-                } else {
-                    /* Slow path. Not the first attachment. Mixing required. */
-                    result = ma_node_read_pcm_frames(pOutputBus->pNode, pOutputBus->outputBusIndex, temp, framesToRead, &framesJustRead, globalTime + framesProcessed);
-                    if (result == MA_SUCCESS || result == MA_AT_END) {
-                        if (isSilentOutput == MA_FALSE) {   /* Don't mix if the node outputs silence. */
-                            ma_mix_pcm_frames_f32(pRunningFramesOut, temp, framesJustRead, inputChannels, /*volume*/1);
-                        }
-                    }
-                }
-
-                framesProcessed += framesJustRead;
-
-                /* If we reached the end or otherwise failed to read any data we need to finish up with this output node. */
-                if (result != MA_SUCCESS) {
-                    break;
-                }
-
-                /* If we didn't read anything, abort so we don't get stuck in a loop. */
-                if (framesJustRead == 0) {
-                    break;
-                }
-            }
-
-            /* If it's the first attachment we didn't do any mixing. Any leftover samples need to be silenced. */
-            if (pOutputBus == pFirst && framesProcessed < frameCount) {
-                ma_silence_pcm_frames(ma_offset_pcm_frames_ptr(pFramesOut, framesProcessed, ma_format_f32, inputChannels), (frameCount - framesProcessed), ma_format_f32, inputChannels);
-            }
-
-            if (isSilentOutput == MA_FALSE) {
-                doesOutputBufferHaveContent = MA_TRUE;
-            }
-        } else {
-            /* Seek. */
-            ma_node_read_pcm_frames(pOutputBus->pNode, pOutputBus->outputBusIndex, NULL, frameCount, &framesProcessed, globalTime);
-        }
-    }
-
-    /* If we didn't output anything, output silence. */
-    if (doesOutputBufferHaveContent == MA_FALSE && pFramesOut != NULL) {
-        ma_silence_pcm_frames(pFramesOut, frameCount, ma_format_f32, inputChannels);
-    }
-
-    /* In this path we always "process" the entire amount. */
-    *pFramesRead = frameCount;
-
-    return result;
-}
-
-
-MA_API ma_node_config ma_node_config_init(void)
-{
-    ma_node_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.initialState   = ma_node_state_started;    /* Nodes are started by default. */
-    config.inputBusCount  = MA_NODE_BUS_COUNT_UNKNOWN;
-    config.outputBusCount = MA_NODE_BUS_COUNT_UNKNOWN;
-
-    return config;
-}
-
-
-
-static ma_result ma_node_detach_full(ma_node* pNode);
-
-static float* ma_node_get_cached_input_ptr(ma_node* pNode, ma_uint32 inputBusIndex)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-    ma_uint32 iInputBus;
-    float* pBasePtr;
-
-    MA_ASSERT(pNodeBase != NULL);
-
-    /* Input data is stored at the front of the buffer. */
-    pBasePtr = pNodeBase->pCachedData;
-    for (iInputBus = 0; iInputBus < inputBusIndex; iInputBus += 1) {
-        pBasePtr += pNodeBase->cachedDataCapInFramesPerBus * ma_node_input_bus_get_channels(&pNodeBase->pInputBuses[iInputBus]);
-    }
-
-    return pBasePtr;
-}
-
-static float* ma_node_get_cached_output_ptr(ma_node* pNode, ma_uint32 outputBusIndex)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-    ma_uint32 iInputBus;
-    ma_uint32 iOutputBus;
-    float* pBasePtr;
-
-    MA_ASSERT(pNodeBase != NULL);
-
-    /* Cached output data starts after the input data. */
-    pBasePtr = pNodeBase->pCachedData;
-    for (iInputBus = 0; iInputBus < ma_node_get_input_bus_count(pNodeBase); iInputBus += 1) {
-        pBasePtr += pNodeBase->cachedDataCapInFramesPerBus * ma_node_input_bus_get_channels(&pNodeBase->pInputBuses[iInputBus]);
-    }
-
-    for (iOutputBus = 0; iOutputBus < outputBusIndex; iOutputBus += 1) {
-        pBasePtr += pNodeBase->cachedDataCapInFramesPerBus * ma_node_output_bus_get_channels(&pNodeBase->pOutputBuses[iOutputBus]);
-    }
-
-    return pBasePtr;
-}
-
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t inputBusOffset;
-    size_t outputBusOffset;
-    size_t cachedDataOffset;
-    ma_uint32 inputBusCount;    /* So it doesn't have to be calculated twice. */
-    ma_uint32 outputBusCount;   /* So it doesn't have to be calculated twice. */
-} ma_node_heap_layout;
-
-static ma_result ma_node_translate_bus_counts(const ma_node_config* pConfig, ma_uint32* pInputBusCount, ma_uint32* pOutputBusCount)
-{
-    ma_uint32 inputBusCount;
-    ma_uint32 outputBusCount;
-
-    MA_ASSERT(pConfig != NULL);
-    MA_ASSERT(pInputBusCount  != NULL);
-    MA_ASSERT(pOutputBusCount != NULL);
-
-    /* Bus counts are determined by the vtable, unless they're set to `MA_NODE_BUS_COUNT_UNKNWON`, in which case they're taken from the config. */
-    if (pConfig->vtable->inputBusCount == MA_NODE_BUS_COUNT_UNKNOWN) {
-        inputBusCount = pConfig->inputBusCount;
-    } else {
-        inputBusCount = pConfig->vtable->inputBusCount;
-
-        if (pConfig->inputBusCount != MA_NODE_BUS_COUNT_UNKNOWN && pConfig->inputBusCount != pConfig->vtable->inputBusCount) {
-            return MA_INVALID_ARGS; /* Invalid configuration. You must not specify a conflicting bus count between the node's config and the vtable. */
-        }
-    }
-
-    if (pConfig->vtable->outputBusCount == MA_NODE_BUS_COUNT_UNKNOWN) {
-        outputBusCount = pConfig->outputBusCount;
-    } else {
-        outputBusCount = pConfig->vtable->outputBusCount;
-
-        if (pConfig->outputBusCount != MA_NODE_BUS_COUNT_UNKNOWN && pConfig->outputBusCount != pConfig->vtable->outputBusCount) {
-            return MA_INVALID_ARGS; /* Invalid configuration. You must not specify a conflicting bus count between the node's config and the vtable. */
-        }
-    }
-
-    /* Bus counts must be within limits. */
-    if (inputBusCount > MA_MAX_NODE_BUS_COUNT || outputBusCount > MA_MAX_NODE_BUS_COUNT) {
-        return MA_INVALID_ARGS;
-    }
-
-
-    /* We must have channel counts for each bus. */
-    if ((inputBusCount > 0 && pConfig->pInputChannels == NULL) || (outputBusCount > 0 && pConfig->pOutputChannels == NULL)) {
-        return MA_INVALID_ARGS; /* You must specify channel counts for each input and output bus. */
-    }
-
-
-    /* Some special rules for passthrough nodes. */
-    if ((pConfig->vtable->flags & MA_NODE_FLAG_PASSTHROUGH) != 0) {
-        if ((pConfig->vtable->inputBusCount != 0 && pConfig->vtable->inputBusCount != 1) || pConfig->vtable->outputBusCount != 1) {
-            return MA_INVALID_ARGS; /* Passthrough nodes must have exactly 1 output bus and either 0 or 1 input bus. */
-        }
-
-        if (pConfig->pInputChannels[0] != pConfig->pOutputChannels[0]) {
-            return MA_INVALID_ARGS; /* Passthrough nodes must have the same number of channels between input and output nodes. */
-        }
-    }
-
-
-    *pInputBusCount  = inputBusCount;
-    *pOutputBusCount = outputBusCount;
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_node_get_heap_layout(ma_node_graph* pNodeGraph, const ma_node_config* pConfig, ma_node_heap_layout* pHeapLayout)
-{
-    ma_result result;
-    ma_uint32 inputBusCount;
-    ma_uint32 outputBusCount;
-
-    MA_ASSERT(pHeapLayout != NULL);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL || pConfig->vtable == NULL || pConfig->vtable->onProcess == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_node_translate_bus_counts(pConfig, &inputBusCount, &outputBusCount);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    /* Input buses. */
-    if (inputBusCount > MA_MAX_NODE_LOCAL_BUS_COUNT) {
-        pHeapLayout->inputBusOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += ma_align_64(sizeof(ma_node_input_bus) * inputBusCount);
-    } else {
-        pHeapLayout->inputBusOffset = MA_SIZE_MAX;  /* MA_SIZE_MAX indicates that no heap allocation is required for the input bus. */
-    }
-
-    /* Output buses. */
-    if (outputBusCount > MA_MAX_NODE_LOCAL_BUS_COUNT) {
-        pHeapLayout->outputBusOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += ma_align_64(sizeof(ma_node_output_bus) * outputBusCount);
-    } else {
-        pHeapLayout->outputBusOffset = MA_SIZE_MAX;
-    }
-
-    /*
-    Cached audio data.
-
-    We need to allocate memory for a caching both input and output data. We have an optimization
-    where no caching is necessary for specific conditions:
-
-        - The node has 0 inputs and 1 output.
-
-    When a node meets the above conditions, no cache is allocated.
-
-    The size choice for this buffer is a little bit finicky. We don't want to be too wasteful by
-    allocating too much, but at the same time we want it be large enough so that enough frames can
-    be processed for each call to ma_node_read_pcm_frames() so that it keeps things efficient. For
-    now I'm going with 10ms @ 48K which is 480 frames per bus. This is configurable at compile
-    time. It might also be worth investigating whether or not this can be configured at run time.
-    */
-    if (inputBusCount == 0 && outputBusCount == 1) {
-        /* Fast path. No cache needed. */
-        pHeapLayout->cachedDataOffset = MA_SIZE_MAX;
-    } else {
-        /* Slow path. Cache needed. */
-        size_t cachedDataSizeInBytes = 0;
-        ma_uint32 iBus;
-
-        for (iBus = 0; iBus < inputBusCount; iBus += 1) {
-            cachedDataSizeInBytes += pNodeGraph->nodeCacheCapInFrames * ma_get_bytes_per_frame(ma_format_f32, pConfig->pInputChannels[iBus]);
-        }
-
-        for (iBus = 0; iBus < outputBusCount; iBus += 1) {
-            cachedDataSizeInBytes += pNodeGraph->nodeCacheCapInFrames * ma_get_bytes_per_frame(ma_format_f32, pConfig->pOutputChannels[iBus]);
-        }
-
-        pHeapLayout->cachedDataOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += ma_align_64(cachedDataSizeInBytes);
-    }
-
-
-    /*
-    Not technically part of the heap, but we can output the input and output bus counts so we can
-    avoid a redundant call to ma_node_translate_bus_counts().
-    */
-    pHeapLayout->inputBusCount  = inputBusCount;
-    pHeapLayout->outputBusCount = outputBusCount;
-
-    /* Make sure allocation size is aligned. */
-    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_node_get_heap_size(ma_node_graph* pNodeGraph, const ma_node_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_node_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_node_get_heap_layout(pNodeGraph, pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_node_init_preallocated(ma_node_graph* pNodeGraph, const ma_node_config* pConfig, void* pHeap, ma_node* pNode)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-    ma_result result;
-    ma_node_heap_layout heapLayout;
-    ma_uint32 iInputBus;
-    ma_uint32 iOutputBus;
-
-    if (pNodeBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNodeBase);
-
-    result = ma_node_get_heap_layout(pNodeGraph, pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pNodeBase->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pNodeBase->pNodeGraph     = pNodeGraph;
-    pNodeBase->vtable         = pConfig->vtable;
-    pNodeBase->state          = pConfig->initialState;
-    pNodeBase->stateTimes[ma_node_state_started] = 0;
-    pNodeBase->stateTimes[ma_node_state_stopped] = (ma_uint64)(ma_int64)-1; /* Weird casting for VC6 compatibility. */
-    pNodeBase->inputBusCount  = heapLayout.inputBusCount;
-    pNodeBase->outputBusCount = heapLayout.outputBusCount;
-
-    if (heapLayout.inputBusOffset != MA_SIZE_MAX) {
-        pNodeBase->pInputBuses = (ma_node_input_bus*)ma_offset_ptr(pHeap, heapLayout.inputBusOffset);
-    } else {
-        pNodeBase->pInputBuses = pNodeBase->_inputBuses;
-    }
-
-    if (heapLayout.outputBusOffset != MA_SIZE_MAX) {
-        pNodeBase->pOutputBuses = (ma_node_output_bus*)ma_offset_ptr(pHeap, heapLayout.outputBusOffset);
-    } else {
-        pNodeBase->pOutputBuses = pNodeBase->_outputBuses;
-    }
-
-    if (heapLayout.cachedDataOffset != MA_SIZE_MAX) {
-        pNodeBase->pCachedData = (float*)ma_offset_ptr(pHeap, heapLayout.cachedDataOffset);
-        pNodeBase->cachedDataCapInFramesPerBus = pNodeGraph->nodeCacheCapInFrames;
-    } else {
-        pNodeBase->pCachedData = NULL;
-    }
-
-
-
-    /* We need to run an initialization step for each input and output bus. */
-    for (iInputBus = 0; iInputBus < ma_node_get_input_bus_count(pNodeBase); iInputBus += 1) {
-        result = ma_node_input_bus_init(pConfig->pInputChannels[iInputBus], &pNodeBase->pInputBuses[iInputBus]);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-    for (iOutputBus = 0; iOutputBus < ma_node_get_output_bus_count(pNodeBase); iOutputBus += 1) {
-        result = ma_node_output_bus_init(pNodeBase, iOutputBus, pConfig->pOutputChannels[iOutputBus], &pNodeBase->pOutputBuses[iOutputBus]);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-    }
-
-
-    /* The cached data needs to be initialized to silence (or a sine wave tone if we're debugging). */
-    if (pNodeBase->pCachedData != NULL) {
-        ma_uint32 iBus;
-
-    #if 1   /* Toggle this between 0 and 1 to turn debugging on or off. 1 = fill with a sine wave for debugging; 0 = fill with silence. */
-        /* For safety we'll go ahead and default the buffer to silence. */
-        for (iBus = 0; iBus < ma_node_get_input_bus_count(pNodeBase); iBus += 1) {
-            ma_silence_pcm_frames(ma_node_get_cached_input_ptr(pNode, iBus), pNodeBase->cachedDataCapInFramesPerBus, ma_format_f32, ma_node_input_bus_get_channels(&pNodeBase->pInputBuses[iBus]));
-        }
-        for (iBus = 0; iBus < ma_node_get_output_bus_count(pNodeBase); iBus += 1) {
-            ma_silence_pcm_frames(ma_node_get_cached_output_ptr(pNode, iBus), pNodeBase->cachedDataCapInFramesPerBus, ma_format_f32, ma_node_output_bus_get_channels(&pNodeBase->pOutputBuses[iBus]));
-        }
-    #else
-        /* For debugging. Default to a sine wave. */
-        for (iBus = 0; iBus < ma_node_get_input_bus_count(pNodeBase); iBus += 1) {
-            ma_debug_fill_pcm_frames_with_sine_wave(ma_node_get_cached_input_ptr(pNode, iBus), pNodeBase->cachedDataCapInFramesPerBus, ma_format_f32, ma_node_input_bus_get_channels(&pNodeBase->pInputBuses[iBus]), 48000);
-        }
-        for (iBus = 0; iBus < ma_node_get_output_bus_count(pNodeBase); iBus += 1) {
-            ma_debug_fill_pcm_frames_with_sine_wave(ma_node_get_cached_output_ptr(pNode, iBus), pNodeBase->cachedDataCapInFramesPerBus, ma_format_f32, ma_node_output_bus_get_channels(&pNodeBase->pOutputBuses[iBus]), 48000);
-        }
-    #endif
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_node_init(ma_node_graph* pNodeGraph, const ma_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_node* pNode)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_node_get_heap_size(pNodeGraph, pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_node_init_preallocated(pNodeGraph, pConfig, pHeap, pNode);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    ((ma_node_base*)pNode)->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_node_uninit(ma_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-
-    if (pNodeBase == NULL) {
-        return;
-    }
-
-    /*
-    The first thing we need to do is fully detach the node. This will detach all inputs and
-    outputs. We need to do this first because it will sever the connection with the node graph and
-    allow us to complete uninitialization without needing to worry about thread-safety with the
-    audio thread. The detachment process will wait for any local processing of the node to finish.
-    */
-    ma_node_detach_full(pNode);
-
-    /*
-    At this point the node should be completely unreferenced by the node graph and we can finish up
-    the uninitialization process without needing to worry about thread-safety.
-    */
-    if (pNodeBase->_ownsHeap) {
-        ma_free(pNodeBase->_pHeap, pAllocationCallbacks);
-    }
-}
-
-MA_API ma_node_graph* ma_node_get_node_graph(const ma_node* pNode)
-{
-    if (pNode == NULL) {
-        return NULL;
-    }
-
-    return ((const ma_node_base*)pNode)->pNodeGraph;
-}
-
-MA_API ma_uint32 ma_node_get_input_bus_count(const ma_node* pNode)
-{
-    if (pNode == NULL) {
-        return 0;
-    }
-
-    return ((ma_node_base*)pNode)->inputBusCount;
-}
-
-MA_API ma_uint32 ma_node_get_output_bus_count(const ma_node* pNode)
-{
-    if (pNode == NULL) {
-        return 0;
-    }
-
-    return ((ma_node_base*)pNode)->outputBusCount;
-}
-
-
-MA_API ma_uint32 ma_node_get_input_channels(const ma_node* pNode, ma_uint32 inputBusIndex)
-{
-    const ma_node_base* pNodeBase = (const ma_node_base*)pNode;
-
-    if (pNode == NULL) {
-        return 0;
-    }
-
-    if (inputBusIndex >= ma_node_get_input_bus_count(pNode)) {
-        return 0;   /* Invalid bus index. */
-    }
-
-    return ma_node_input_bus_get_channels(&pNodeBase->pInputBuses[inputBusIndex]);
-}
-
-MA_API ma_uint32 ma_node_get_output_channels(const ma_node* pNode, ma_uint32 outputBusIndex)
-{
-    const ma_node_base* pNodeBase = (const ma_node_base*)pNode;
-
-    if (pNode == NULL) {
-        return 0;
-    }
-
-    if (outputBusIndex >= ma_node_get_output_bus_count(pNode)) {
-        return 0;   /* Invalid bus index. */
-    }
-
-    return ma_node_output_bus_get_channels(&pNodeBase->pOutputBuses[outputBusIndex]);
-}
-
-
-static ma_result ma_node_detach_full(ma_node* pNode)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-    ma_uint32 iInputBus;
-
-    if (pNodeBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /*
-    Make sure the node is completely detached first. This will not return until the output bus is
-    guaranteed to no longer be referenced by the audio thread.
-    */
-    ma_node_detach_all_output_buses(pNode);
-
-    /*
-    At this point all output buses will have been detached from the graph and we can be guaranteed
-    that none of it's input nodes will be getting processed by the graph. We can detach these
-    without needing to worry about the audio thread touching them.
-    */
-    for (iInputBus = 0; iInputBus < ma_node_get_input_bus_count(pNode); iInputBus += 1) {
-        ma_node_input_bus* pInputBus;
-        ma_node_output_bus* pOutputBus;
-
-        pInputBus = &pNodeBase->pInputBuses[iInputBus];
-
-        /*
-        This is important. We cannot be using ma_node_input_bus_first() or ma_node_input_bus_next(). Those
-        functions are specifically for the audio thread. We'll instead just manually iterate using standard
-        linked list logic. We don't need to worry about the audio thread referencing these because the step
-        above severed the connection to the graph.
-        */
-        for (pOutputBus = (ma_node_output_bus*)ma_atomic_load_ptr(&pInputBus->head.pNext); pOutputBus != NULL; pOutputBus = (ma_node_output_bus*)ma_atomic_load_ptr(&pOutputBus->pNext)) {
-            ma_node_detach_output_bus(pOutputBus->pNode, pOutputBus->outputBusIndex);   /* This won't do any waiting in practice and should be efficient. */
-        }
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_node_detach_output_bus(ma_node* pNode, ma_uint32 outputBusIndex)
-{
-    ma_result result = MA_SUCCESS;
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-    ma_node_base* pInputNodeBase;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (outputBusIndex >= ma_node_get_output_bus_count(pNode)) {
-        return MA_INVALID_ARGS; /* Invalid output bus index. */
-    }
-
-    /* We need to lock the output bus because we need to inspect the input node and grab it's input bus. */
-    ma_node_output_bus_lock(&pNodeBase->pOutputBuses[outputBusIndex]);
-    {
-        pInputNodeBase = (ma_node_base*)pNodeBase->pOutputBuses[outputBusIndex].pInputNode;
-        if (pInputNodeBase != NULL) {
-            ma_node_input_bus_detach__no_output_bus_lock(&pInputNodeBase->pInputBuses[pNodeBase->pOutputBuses[outputBusIndex].inputNodeInputBusIndex], &pNodeBase->pOutputBuses[outputBusIndex]);
-        }
-    }
-    ma_node_output_bus_unlock(&pNodeBase->pOutputBuses[outputBusIndex]);
-
-    return result;
-}
-
-MA_API ma_result ma_node_detach_all_output_buses(ma_node* pNode)
-{
-    ma_uint32 iOutputBus;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    for (iOutputBus = 0; iOutputBus < ma_node_get_output_bus_count(pNode); iOutputBus += 1) {
-        ma_node_detach_output_bus(pNode, iOutputBus);
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_node_attach_output_bus(ma_node* pNode, ma_uint32 outputBusIndex, ma_node* pOtherNode, ma_uint32 otherNodeInputBusIndex)
-{
-    ma_node_base* pNodeBase  = (ma_node_base*)pNode;
-    ma_node_base* pOtherNodeBase = (ma_node_base*)pOtherNode;
-
-    if (pNodeBase == NULL || pOtherNodeBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pNodeBase == pOtherNodeBase) {
-        return MA_INVALID_OPERATION;    /* Cannot attach a node to itself. */
-    }
-
-    if (outputBusIndex >= ma_node_get_output_bus_count(pNode) || otherNodeInputBusIndex >= ma_node_get_input_bus_count(pOtherNode)) {
-        return MA_INVALID_OPERATION;    /* Invalid bus index. */
-    }
-
-    /* The output channel count of the output node must be the same as the input channel count of the input node. */
-    if (ma_node_get_output_channels(pNode, outputBusIndex) != ma_node_get_input_channels(pOtherNode, otherNodeInputBusIndex)) {
-        return MA_INVALID_OPERATION;    /* Channel count is incompatible. */
-    }
-
-    /* This will deal with detaching if the output bus is already attached to something. */
-    ma_node_input_bus_attach(&pOtherNodeBase->pInputBuses[otherNodeInputBusIndex], &pNodeBase->pOutputBuses[outputBusIndex], pOtherNode, otherNodeInputBusIndex);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_node_set_output_bus_volume(ma_node* pNode, ma_uint32 outputBusIndex, float volume)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-
-    if (pNodeBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (outputBusIndex >= ma_node_get_output_bus_count(pNode)) {
-        return MA_INVALID_ARGS; /* Invalid bus index. */
-    }
-
-    return ma_node_output_bus_set_volume(&pNodeBase->pOutputBuses[outputBusIndex], volume);
-}
-
-MA_API float ma_node_get_output_bus_volume(const ma_node* pNode, ma_uint32 outputBusIndex)
-{
-    const ma_node_base* pNodeBase = (const ma_node_base*)pNode;
-
-    if (pNodeBase == NULL) {
-        return 0;
-    }
-
-    if (outputBusIndex >= ma_node_get_output_bus_count(pNode)) {
-        return 0;   /* Invalid bus index. */
-    }
-
-    return ma_node_output_bus_get_volume(&pNodeBase->pOutputBuses[outputBusIndex]);
-}
-
-MA_API ma_result ma_node_set_state(ma_node* pNode, ma_node_state state)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-
-    if (pNodeBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_atomic_exchange_i32(&pNodeBase->state, state);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_node_state ma_node_get_state(const ma_node* pNode)
-{
-    const ma_node_base* pNodeBase = (const ma_node_base*)pNode;
-
-    if (pNodeBase == NULL) {
-        return ma_node_state_stopped;
-    }
-
-    return (ma_node_state)ma_atomic_load_i32(&pNodeBase->state);
-}
-
-MA_API ma_result ma_node_set_state_time(ma_node* pNode, ma_node_state state, ma_uint64 globalTime)
-{
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Validation check for safety since we'll be using this as an index into stateTimes[]. */
-    if (state != ma_node_state_started && state != ma_node_state_stopped) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_atomic_exchange_64(&((ma_node_base*)pNode)->stateTimes[state], globalTime);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_uint64 ma_node_get_state_time(const ma_node* pNode, ma_node_state state)
-{
-    if (pNode == NULL) {
-        return 0;
-    }
-
-    /* Validation check for safety since we'll be using this as an index into stateTimes[]. */
-    if (state != ma_node_state_started && state != ma_node_state_stopped) {
-        return 0;
-    }
-
-    return ma_atomic_load_64(&((ma_node_base*)pNode)->stateTimes[state]);
-}
-
-MA_API ma_node_state ma_node_get_state_by_time(const ma_node* pNode, ma_uint64 globalTime)
-{
-    if (pNode == NULL) {
-        return ma_node_state_stopped;
-    }
-
-    return ma_node_get_state_by_time_range(pNode, globalTime, globalTime);
-}
-
-MA_API ma_node_state ma_node_get_state_by_time_range(const ma_node* pNode, ma_uint64 globalTimeBeg, ma_uint64 globalTimeEnd)
-{
-    ma_node_state state;
-
-    if (pNode == NULL) {
-        return ma_node_state_stopped;
-    }
-
-    state = ma_node_get_state(pNode);
-
-    /* An explicitly stopped node is always stopped. */
-    if (state == ma_node_state_stopped) {
-        return ma_node_state_stopped;
-    }
-
-    /*
-    Getting here means the node is marked as started, but it may still not be truly started due to
-    it's start time not having been reached yet. Also, the stop time may have also been reached in
-    which case it'll be considered stopped.
-    */
-    if (ma_node_get_state_time(pNode, ma_node_state_started) > globalTimeBeg) {
-        return ma_node_state_stopped;   /* Start time has not yet been reached. */
-    }
-
-    if (ma_node_get_state_time(pNode, ma_node_state_stopped) <= globalTimeEnd) {
-        return ma_node_state_stopped;   /* Stop time has been reached. */
-    }
-
-    /* Getting here means the node is marked as started and is within it's start/stop times. */
-    return ma_node_state_started;
-}
-
-MA_API ma_uint64 ma_node_get_time(const ma_node* pNode)
-{
-    if (pNode == NULL) {
-        return 0;
-    }
-
-    return ma_atomic_load_64(&((ma_node_base*)pNode)->localTime);
-}
-
-MA_API ma_result ma_node_set_time(ma_node* pNode, ma_uint64 localTime)
-{
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_atomic_exchange_64(&((ma_node_base*)pNode)->localTime, localTime);
-
-    return MA_SUCCESS;
-}
-
-
-
-static void ma_node_process_pcm_frames_internal(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-
-    if (pNodeBase->vtable->onProcess) {
-        pNodeBase->vtable->onProcess(pNode, ppFramesIn, pFrameCountIn, ppFramesOut, pFrameCountOut);
-    }
-}
-
-static ma_result ma_node_read_pcm_frames(ma_node* pNode, ma_uint32 outputBusIndex, float* pFramesOut, ma_uint32 frameCount, ma_uint32* pFramesRead, ma_uint64 globalTime)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-    ma_result result = MA_SUCCESS;
-    ma_uint32 iInputBus;
-    ma_uint32 iOutputBus;
-    ma_uint32 inputBusCount;
-    ma_uint32 outputBusCount;
-    ma_uint32 totalFramesRead = 0;
-    float* ppFramesIn[MA_MAX_NODE_BUS_COUNT];
-    float* ppFramesOut[MA_MAX_NODE_BUS_COUNT];
-    ma_uint64 globalTimeBeg;
-    ma_uint64 globalTimeEnd;
-    ma_uint64 startTime;
-    ma_uint64 stopTime;
-    ma_uint32 timeOffsetBeg;
-    ma_uint32 timeOffsetEnd;
-    ma_uint32 frameCountIn;
-    ma_uint32 frameCountOut;
-
-    /*
-    pFramesRead is mandatory. It must be used to determine how many frames were read. It's normal and
-    expected that the number of frames read may be different to that requested. Therefore, the caller
-    must look at this value to correctly determine how many frames were read.
-    */
-    MA_ASSERT(pFramesRead != NULL); /* <-- If you've triggered this assert, you're using this function wrong. You *must* use this variable and inspect it after the call returns. */
-    if (pFramesRead == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pFramesRead = 0;   /* Safety. */
-
-    if (pNodeBase == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (outputBusIndex >= ma_node_get_output_bus_count(pNodeBase)) {
-        return MA_INVALID_ARGS; /* Invalid output bus index. */
-    }
-
-    /* Don't do anything if we're in a stopped state. */
-    if (ma_node_get_state_by_time_range(pNode, globalTime, globalTime + frameCount) != ma_node_state_started) {
-        return MA_SUCCESS;  /* We're in a stopped state. This is not an error - we just need to not read anything. */
-    }
-
-
-    globalTimeBeg = globalTime;
-    globalTimeEnd = globalTime + frameCount;
-    startTime = ma_node_get_state_time(pNode, ma_node_state_started);
-    stopTime  = ma_node_get_state_time(pNode, ma_node_state_stopped);
-
-    /*
-    At this point we know that we are inside our start/stop times. However, we may need to adjust
-    our frame count and output pointer to accommodate since we could be straddling the time period
-    that this function is getting called for.
-
-    It's possible (and likely) that the start time does not line up with the output buffer. We
-    therefore need to offset it by a number of frames to accommodate. The same thing applies for
-    the stop time.
-    */
-    timeOffsetBeg = (globalTimeBeg < startTime) ? (ma_uint32)(globalTimeEnd - startTime) : 0;
-    timeOffsetEnd = (globalTimeEnd > stopTime)  ? (ma_uint32)(globalTimeEnd - stopTime)  : 0;
-
-    /* Trim based on the start offset. We need to silence the start of the buffer. */
-    if (timeOffsetBeg > 0) {
-        ma_silence_pcm_frames(pFramesOut, timeOffsetBeg, ma_format_f32, ma_node_get_output_channels(pNode, outputBusIndex));
-        pFramesOut += timeOffsetBeg * ma_node_get_output_channels(pNode, outputBusIndex);
-        frameCount -= timeOffsetBeg;
-    }
-
-    /* Trim based on the end offset. We don't need to silence the tail section because we'll just have a reduced value written to pFramesRead. */
-    if (timeOffsetEnd > 0) {
-        frameCount -= timeOffsetEnd;
-    }
-
-
-    /* We run on different paths depending on the bus counts. */
-    inputBusCount  = ma_node_get_input_bus_count(pNode);
-    outputBusCount = ma_node_get_output_bus_count(pNode);
-
-    /*
-    Run a simplified path when there are no inputs and one output. In this case there's nothing to
-    actually read and we can go straight to output. This is a very common scenario because the vast
-    majority of data source nodes will use this setup so this optimization I think is worthwhile.
-    */
-    if (inputBusCount == 0 && outputBusCount == 1) {
-        /* Fast path. No need to read from input and no need for any caching. */
-        frameCountIn  = 0;
-        frameCountOut = frameCount;    /* Just read as much as we can. The callback will return what was actually read. */
-
-        ppFramesOut[0] = pFramesOut;
-
-        /*
-        If it's a passthrough we won't be expecting the callback to output anything, so we'll
-        need to pre-silence the output buffer.
-        */
-        if ((pNodeBase->vtable->flags & MA_NODE_FLAG_PASSTHROUGH) != 0) {
-            ma_silence_pcm_frames(pFramesOut, frameCount, ma_format_f32, ma_node_get_output_channels(pNode, outputBusIndex));
-        }
-
-        ma_node_process_pcm_frames_internal(pNode, NULL, &frameCountIn, ppFramesOut, &frameCountOut);
-        totalFramesRead = frameCountOut;
-    } else {
-        /* Slow path. Need to read input data. */
-        if ((pNodeBase->vtable->flags & MA_NODE_FLAG_PASSTHROUGH) != 0) {
-            /*
-            Fast path. We're running a passthrough. We need to read directly into the output buffer, but
-            still fire the callback so that event handling and trigger nodes can do their thing. Since
-            it's a passthrough there's no need for any kind of caching logic.
-            */
-            MA_ASSERT(outputBusCount == inputBusCount);
-            MA_ASSERT(outputBusCount == 1);
-            MA_ASSERT(outputBusIndex == 0);
-
-            /* We just read directly from input bus to output buffer, and then afterwards fire the callback. */
-            ppFramesOut[0] = pFramesOut;
-            ppFramesIn[0] = ppFramesOut[0];
-
-            result = ma_node_input_bus_read_pcm_frames(pNodeBase, &pNodeBase->pInputBuses[0], ppFramesIn[0], frameCount, &totalFramesRead, globalTime);
-            if (result == MA_SUCCESS) {
-                /* Even though it's a passthrough, we still need to fire the callback. */
-                frameCountIn  = totalFramesRead;
-                frameCountOut = totalFramesRead;
-
-                if (totalFramesRead > 0) {
-                    ma_node_process_pcm_frames_internal(pNode, (const float**)ppFramesIn, &frameCountIn, ppFramesOut, &frameCountOut);  /* From GCC: expected 'const float **' but argument is of type 'float **'. Shouldn't this be implicit? Excplicit cast to silence the warning. */
-                }
-
-                /*
-                A passthrough should never have modified the input and output frame counts. If you're
-                triggering these assers you need to fix your processing callback.
-                */
-                MA_ASSERT(frameCountIn  == totalFramesRead);
-                MA_ASSERT(frameCountOut == totalFramesRead);
-            }
-        } else {
-            /* Slow path. Need to do caching. */
-            ma_uint32 framesToProcessIn;
-            ma_uint32 framesToProcessOut;
-            ma_bool32 consumeNullInput = MA_FALSE;
-
-            /*
-            We use frameCount as a basis for the number of frames to read since that's what's being
-            requested, however we still need to clamp it to whatever can fit in the cache.
-
-            This will also be used as the basis for determining how many input frames to read. This is
-            not ideal because it can result in too many input frames being read which introduces latency.
-            To solve this, nodes can implement an optional callback called onGetRequiredInputFrameCount
-            which is used as hint to miniaudio as to how many input frames it needs to read at a time. This
-            callback is completely optional, and if it's not set, miniaudio will assume `frameCount`.
-
-            This function will be called multiple times for each period of time, once for each output node.
-            We cannot read from each input node each time this function is called. Instead we need to check
-            whether or not this is first output bus to be read from for this time period, and if so, read
-            from our input data.
-
-            To determine whether or not we're ready to read data, we check a flag. There will be one flag
-            for each output. When the flag is set, it means data has been read previously and that we're
-            ready to advance time forward for our input nodes by reading fresh data.
-            */
-            framesToProcessOut = frameCount;
-            if (framesToProcessOut > pNodeBase->cachedDataCapInFramesPerBus) {
-                framesToProcessOut = pNodeBase->cachedDataCapInFramesPerBus;
-            }
-
-            framesToProcessIn  = frameCount;
-            if (pNodeBase->vtable->onGetRequiredInputFrameCount) {
-                pNodeBase->vtable->onGetRequiredInputFrameCount(pNode, framesToProcessOut, &framesToProcessIn); /* <-- It does not matter if this fails. */
-            }
-            if (framesToProcessIn > pNodeBase->cachedDataCapInFramesPerBus) {
-                framesToProcessIn = pNodeBase->cachedDataCapInFramesPerBus;
-            }
-
-
-            MA_ASSERT(framesToProcessIn  <= 0xFFFF);
-            MA_ASSERT(framesToProcessOut <= 0xFFFF);
-
-            if (ma_node_output_bus_has_read(&pNodeBase->pOutputBuses[outputBusIndex])) {
-                /* Getting here means we need to do another round of processing. */
-                pNodeBase->cachedFrameCountOut = 0;
-
-                for (;;) {
-                    frameCountOut = 0;
-
-                    /*
-                    We need to prepare our output frame pointers for processing. In the same iteration we need
-                    to mark every output bus as unread so that future calls to this function for different buses
-                    for the current time period don't pull in data when they should instead be reading from cache.
-                    */
-                    for (iOutputBus = 0; iOutputBus < outputBusCount; iOutputBus += 1) {
-                        ma_node_output_bus_set_has_read(&pNodeBase->pOutputBuses[iOutputBus], MA_FALSE); /* <-- This is what tells the next calls to this function for other output buses for this time period to read from cache instead of pulling in more data. */
-                        ppFramesOut[iOutputBus] = ma_node_get_cached_output_ptr(pNode, iOutputBus);
-                    }
-
-                    /* We only need to read from input buses if there isn't already some data in the cache. */
-                    if (pNodeBase->cachedFrameCountIn == 0) {
-                        ma_uint32 maxFramesReadIn = 0;
-
-                        /* Here is where we pull in data from the input buses. This is what will trigger an advance in time. */
-                        for (iInputBus = 0; iInputBus < inputBusCount; iInputBus += 1) {
-                            ma_uint32 framesRead;
-
-                            /* The first thing to do is get the offset within our bulk allocation to store this input data. */
-                            ppFramesIn[iInputBus] = ma_node_get_cached_input_ptr(pNode, iInputBus);
-
-                            /* Once we've determined our destination pointer we can read. Note that we must inspect the number of frames read and fill any leftovers with silence for safety. */
-                            result = ma_node_input_bus_read_pcm_frames(pNodeBase, &pNodeBase->pInputBuses[iInputBus], ppFramesIn[iInputBus], framesToProcessIn, &framesRead, globalTime);
-                            if (result != MA_SUCCESS) {
-                                /* It doesn't really matter if we fail because we'll just fill with silence. */
-                                framesRead = 0; /* Just for safety, but I don't think it's really needed. */
-                            }
-
-                            /* TODO: Minor optimization opportunity here. If no frames were read and the buffer is already filled with silence, no need to re-silence it. */
-                            /* Any leftover frames need to silenced for safety. */
-                            if (framesRead < framesToProcessIn) {
-                                ma_silence_pcm_frames(ppFramesIn[iInputBus] + (framesRead * ma_node_get_input_channels(pNodeBase, iInputBus)), (framesToProcessIn - framesRead), ma_format_f32, ma_node_get_input_channels(pNodeBase, iInputBus));
-                            }
-
-                            maxFramesReadIn = ma_max(maxFramesReadIn, framesRead);
-                        }
-
-                        /* This was a fresh load of input data so reset our consumption counter. */
-                        pNodeBase->consumedFrameCountIn = 0;
-
-                        /*
-                        We don't want to keep processing if there's nothing to process, so set the number of cached
-                        input frames to the maximum number we read from each attachment (the lesser will be padded
-                        with silence). If we didn't read anything, this will be set to 0 and the entire buffer will
-                        have been assigned to silence. This being equal to 0 is an important property for us because
-                        it allows us to detect when NULL can be passed into the processing callback for the input
-                        buffer for the purpose of continuous processing.
-                        */
-                        pNodeBase->cachedFrameCountIn = (ma_uint16)maxFramesReadIn;
-                    } else {
-                        /* We don't need to read anything, but we do need to prepare our input frame pointers. */
-                        for (iInputBus = 0; iInputBus < inputBusCount; iInputBus += 1) {
-                            ppFramesIn[iInputBus] = ma_node_get_cached_input_ptr(pNode, iInputBus) + (pNodeBase->consumedFrameCountIn * ma_node_get_input_channels(pNodeBase, iInputBus));
-                        }
-                    }
-
-                    /*
-                    At this point we have our input data so now we need to do some processing. Sneaky little
-                    optimization here - we can set the pointer to the output buffer for this output bus so
-                    that the final copy into the output buffer is done directly by onProcess().
-                    */
-                    if (pFramesOut != NULL) {
-                        ppFramesOut[outputBusIndex] = ma_offset_pcm_frames_ptr_f32(pFramesOut, pNodeBase->cachedFrameCountOut, ma_node_get_output_channels(pNode, outputBusIndex));
-                    }
-
-
-                    /* Give the processing function the entire capacity of the output buffer. */
-                    frameCountOut = (framesToProcessOut - pNodeBase->cachedFrameCountOut);
-
-                    /*
-                    We need to treat nodes with continuous processing a little differently. For these ones,
-                    we always want to fire the callback with the requested number of frames, regardless of
-                    pNodeBase->cachedFrameCountIn, which could be 0. Also, we want to check if we can pass
-                    in NULL for the input buffer to the callback.
-                    */
-                    if ((pNodeBase->vtable->flags & MA_NODE_FLAG_CONTINUOUS_PROCESSING) != 0) {
-                        /* We're using continuous processing. Make sure we specify the whole frame count at all times. */
-                        frameCountIn = framesToProcessIn;    /* Give the processing function as much input data as we've got in the buffer, including any silenced padding from short reads. */
-
-                        if ((pNodeBase->vtable->flags & MA_NODE_FLAG_ALLOW_NULL_INPUT) != 0 && pNodeBase->consumedFrameCountIn == 0 && pNodeBase->cachedFrameCountIn == 0) {
-                            consumeNullInput = MA_TRUE;
-                        } else {
-                            consumeNullInput = MA_FALSE;
-                        }
-
-                        /*
-                        Since we're using continuous processing we're always passing in a full frame count
-                        regardless of how much input data was read. If this is greater than what we read as
-                        input, we'll end up with an underflow. We instead need to make sure our cached frame
-                        count is set to the number of frames we'll be passing to the data callback. Not
-                        doing this will result in an underflow when we "consume" the cached data later on.
-
-                        Note that this check needs to be done after the "consumeNullInput" check above because
-                        we use the property of cachedFrameCountIn being 0 to determine whether or not we
-                        should be passing in a null pointer to the processing callback for when the node is
-                        configured with MA_NODE_FLAG_ALLOW_NULL_INPUT.
-                        */
-                        if (pNodeBase->cachedFrameCountIn < (ma_uint16)frameCountIn) {
-                            pNodeBase->cachedFrameCountIn = (ma_uint16)frameCountIn;
-                        }
-                    } else {
-                        frameCountIn = pNodeBase->cachedFrameCountIn;  /* Give the processing function as much valid input data as we've got. */
-                        consumeNullInput = MA_FALSE;
-                    }
-
-                    /*
-                    Process data slightly differently depending on whether or not we're consuming NULL
-                    input (checked just above).
-                    */
-                    if (consumeNullInput) {
-                        ma_node_process_pcm_frames_internal(pNode, NULL, &frameCountIn, ppFramesOut, &frameCountOut);
-                    } else {
-                        /*
-                        We want to skip processing if there's no input data, but we can only do that safely if
-                        we know that there is no chance of any output frames being produced. If continuous
-                        processing is being used, this won't be a problem because the input frame count will
-                        always be non-0. However, if continuous processing is *not* enabled and input and output
-                        data is processed at different rates, we still need to process that last input frame
-                        because there could be a few excess output frames needing to be produced from cached
-                        data. The `MA_NODE_FLAG_DIFFERENT_PROCESSING_RATES` flag is used as the indicator for
-                        determining whether or not we need to process the node even when there are no input
-                        frames available right now.
-                        */
-                        if (frameCountIn > 0 || (pNodeBase->vtable->flags & MA_NODE_FLAG_DIFFERENT_PROCESSING_RATES) != 0) {
-                            ma_node_process_pcm_frames_internal(pNode, (const float**)ppFramesIn, &frameCountIn, ppFramesOut, &frameCountOut);    /* From GCC: expected 'const float **' but argument is of type 'float **'. Shouldn't this be implicit? Excplicit cast to silence the warning. */
-                        } else {
-                            frameCountOut = 0;  /* No data was processed. */
-                        }
-                    }
-
-                    /*
-                    Thanks to our sneaky optimization above we don't need to do any data copying directly into
-                    the output buffer - the onProcess() callback just did that for us. We do, however, need to
-                    apply the number of input and output frames that were processed. Note that due to continuous
-                    processing above, we need to do explicit checks here. If we just consumed a NULL input
-                    buffer it means that no actual input data was processed from the internal buffers and we
-                    don't want to be modifying any counters.
-                    */
-                    if (consumeNullInput == MA_FALSE) {
-                        pNodeBase->consumedFrameCountIn += (ma_uint16)frameCountIn;
-                        pNodeBase->cachedFrameCountIn   -= (ma_uint16)frameCountIn;
-                    }
-
-                    /* The cached output frame count is always equal to what we just read. */
-                    pNodeBase->cachedFrameCountOut += (ma_uint16)frameCountOut;
-
-                    /* If we couldn't process any data, we're done. The loop needs to be terminated here or else we'll get stuck in a loop. */
-                    if (pNodeBase->cachedFrameCountOut == framesToProcessOut || (frameCountOut == 0 && frameCountIn == 0)) {
-                        break;
-                    }
-                }
-            } else {
-                /*
-                We're not needing to read anything from the input buffer so just read directly from our
-                already-processed data.
-                */
-                if (pFramesOut != NULL) {
-                    ma_copy_pcm_frames(pFramesOut, ma_node_get_cached_output_ptr(pNodeBase, outputBusIndex), pNodeBase->cachedFrameCountOut, ma_format_f32, ma_node_get_output_channels(pNodeBase, outputBusIndex));
-                }
-            }
-
-            /* The number of frames read is always equal to the number of cached output frames. */
-            totalFramesRead = pNodeBase->cachedFrameCountOut;
-
-            /* Now that we've read the data, make sure our read flag is set. */
-            ma_node_output_bus_set_has_read(&pNodeBase->pOutputBuses[outputBusIndex], MA_TRUE);
-        }
-    }
-
-    /* Apply volume, if necessary. */
-    ma_apply_volume_factor_f32(pFramesOut, totalFramesRead * ma_node_get_output_channels(pNodeBase, outputBusIndex), ma_node_output_bus_get_volume(&pNodeBase->pOutputBuses[outputBusIndex]));
-
-    /* Advance our local time forward. */
-    ma_atomic_fetch_add_64(&pNodeBase->localTime, (ma_uint64)totalFramesRead);
-
-    *pFramesRead = totalFramesRead + timeOffsetBeg; /* Must include the silenced section at the start of the buffer. */
-    return result;
-}
-
-
-
-
-/* Data source node. */
-MA_API ma_data_source_node_config ma_data_source_node_config_init(ma_data_source* pDataSource)
-{
-    ma_data_source_node_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.nodeConfig  = ma_node_config_init();
-    config.pDataSource = pDataSource;
-
-    return config;
-}
-
-
-static void ma_data_source_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_data_source_node* pDataSourceNode = (ma_data_source_node*)pNode;
-    ma_format format;
-    ma_uint32 channels;
-    ma_uint32 frameCount;
-    ma_uint64 framesRead = 0;
-
-    MA_ASSERT(pDataSourceNode != NULL);
-    MA_ASSERT(pDataSourceNode->pDataSource != NULL);
-    MA_ASSERT(ma_node_get_input_bus_count(pDataSourceNode)  == 0);
-    MA_ASSERT(ma_node_get_output_bus_count(pDataSourceNode) == 1);
-
-    /* We don't want to read from ppFramesIn at all. Instead we read from the data source. */
-    (void)ppFramesIn;
-    (void)pFrameCountIn;
-
-    frameCount = *pFrameCountOut;
-
-    /* miniaudio should never be calling this with a frame count of zero. */
-    MA_ASSERT(frameCount > 0);
-
-    if (ma_data_source_get_data_format(pDataSourceNode->pDataSource, &format, &channels, NULL, NULL, 0) == MA_SUCCESS) { /* <-- Don't care about sample rate here. */
-        /* The node graph system requires samples be in floating point format. This is checked in ma_data_source_node_init(). */
-        MA_ASSERT(format == ma_format_f32);
-        (void)format;   /* Just to silence some static analysis tools. */
-
-        ma_data_source_read_pcm_frames(pDataSourceNode->pDataSource, ppFramesOut[0], frameCount, &framesRead);
-    }
-
-    *pFrameCountOut = (ma_uint32)framesRead;
-}
-
-static ma_node_vtable g_ma_data_source_node_vtable =
-{
-    ma_data_source_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    0,      /* 0 input buses. */
-    1,      /* 1 output bus. */
-    0
-};
-
-MA_API ma_result ma_data_source_node_init(ma_node_graph* pNodeGraph, const ma_data_source_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_data_source_node* pDataSourceNode)
-{
-    ma_result result;
-    ma_format format;   /* For validating the format, which must be ma_format_f32. */
-    ma_uint32 channels; /* For specifying the channel count of the output bus. */
-    ma_node_config baseConfig;
-
-    if (pDataSourceNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDataSourceNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    result = ma_data_source_get_data_format(pConfig->pDataSource, &format, &channels, NULL, NULL, 0);    /* Don't care about sample rate. This will check pDataSource for NULL. */
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    MA_ASSERT(format == ma_format_f32); /* <-- If you've triggered this it means your data source is not outputting floating-point samples. You must configure your data source to use ma_format_f32. */
-    if (format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* Invalid format. */
-    }
-
-    /* The channel count is defined by the data source. If the caller has manually changed the channels we just ignore it. */
-    baseConfig = pConfig->nodeConfig;
-    baseConfig.vtable = &g_ma_data_source_node_vtable;  /* Explicitly set the vtable here to prevent callers from setting it incorrectly. */
-
-    /*
-    The channel count is defined by the data source. It is invalid for the caller to manually set
-    the channel counts in the config. `ma_data_source_node_config_init()` will have defaulted the
-    channel count pointer to NULL which is how it must remain. If you trigger any of these asserts
-    it means you're explicitly setting the channel count. Instead, configure the output channel
-    count of your data source to be the necessary channel count.
-    */
-    if (baseConfig.pOutputChannels != NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    baseConfig.pOutputChannels = &channels;
-
-    result = ma_node_init(pNodeGraph, &baseConfig, pAllocationCallbacks, &pDataSourceNode->base);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    pDataSourceNode->pDataSource = pConfig->pDataSource;
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_data_source_node_uninit(ma_data_source_node* pDataSourceNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_node_uninit(&pDataSourceNode->base, pAllocationCallbacks);
-}
-
-MA_API ma_result ma_data_source_node_set_looping(ma_data_source_node* pDataSourceNode, ma_bool32 isLooping)
-{
-    if (pDataSourceNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_data_source_set_looping(pDataSourceNode->pDataSource, isLooping);
-}
-
-MA_API ma_bool32 ma_data_source_node_is_looping(ma_data_source_node* pDataSourceNode)
-{
-    if (pDataSourceNode == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_data_source_is_looping(pDataSourceNode->pDataSource);
-}
-
-
-
-/* Splitter Node. */
-MA_API ma_splitter_node_config ma_splitter_node_config_init(ma_uint32 channels)
-{
-    ma_splitter_node_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.nodeConfig     = ma_node_config_init();
-    config.channels       = channels;
-    config.outputBusCount = 2;
-
-    return config;
-}
-
-
-static void ma_splitter_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_node_base* pNodeBase = (ma_node_base*)pNode;
-    ma_uint32 iOutputBus;
-    ma_uint32 channels;
-
-    MA_ASSERT(pNodeBase != NULL);
-    MA_ASSERT(ma_node_get_input_bus_count(pNodeBase) == 1);
-
-    /* We don't need to consider the input frame count - it'll be the same as the output frame count and we process everything. */
-    (void)pFrameCountIn;
-
-    /* NOTE: This assumes the same number of channels for all inputs and outputs. This was checked in ma_splitter_node_init(). */
-    channels = ma_node_get_input_channels(pNodeBase, 0);
-
-    /* Splitting is just copying the first input bus and copying it over to each output bus. */
-    for (iOutputBus = 0; iOutputBus < ma_node_get_output_bus_count(pNodeBase); iOutputBus += 1) {
-        ma_copy_pcm_frames(ppFramesOut[iOutputBus], ppFramesIn[0], *pFrameCountOut, ma_format_f32, channels);
-    }
-}
-
-static ma_node_vtable g_ma_splitter_node_vtable =
-{
-    ma_splitter_node_process_pcm_frames,
-    NULL,                       /* onGetRequiredInputFrameCount */
-    1,                          /* 1 input bus. */
-    MA_NODE_BUS_COUNT_UNKNOWN,  /* The output bus count is specified on a per-node basis. */
-    0
-};
-
-MA_API ma_result ma_splitter_node_init(ma_node_graph* pNodeGraph, const ma_splitter_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_splitter_node* pSplitterNode)
-{
-    ma_result result;
-    ma_node_config baseConfig;
-    ma_uint32 pInputChannels[1];
-    ma_uint32 pOutputChannels[MA_MAX_NODE_BUS_COUNT];
-    ma_uint32 iOutputBus;
-
-    if (pSplitterNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pSplitterNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->outputBusCount > MA_MAX_NODE_BUS_COUNT) {
-        return MA_INVALID_ARGS; /* Too many output buses. */
-    }
-
-    /* Splitters require the same number of channels between inputs and outputs. */
-    pInputChannels[0]  = pConfig->channels;
-    for (iOutputBus = 0; iOutputBus < pConfig->outputBusCount; iOutputBus += 1) {
-        pOutputChannels[iOutputBus] = pConfig->channels;
-    }
-
-    baseConfig = pConfig->nodeConfig;
-    baseConfig.vtable = &g_ma_splitter_node_vtable;
-    baseConfig.pInputChannels  = pInputChannels;
-    baseConfig.pOutputChannels = pOutputChannels;
-    baseConfig.outputBusCount  = pConfig->outputBusCount;
-
-    result = ma_node_init(pNodeGraph, &baseConfig, pAllocationCallbacks, &pSplitterNode->base);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to initialize the base node. */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API void ma_splitter_node_uninit(ma_splitter_node* pSplitterNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_node_uninit(pSplitterNode, pAllocationCallbacks);
-}
-
-
-/*
-Biquad Node
-*/
-MA_API ma_biquad_node_config ma_biquad_node_config_init(ma_uint32 channels, float b0, float b1, float b2, float a0, float a1, float a2)
-{
-    ma_biquad_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.biquad = ma_biquad_config_init(ma_format_f32, channels, b0, b1, b2, a0, a1, a2);
-
-    return config;
-}
-
-static void ma_biquad_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_biquad_node* pLPFNode = (ma_biquad_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_biquad_process_pcm_frames(&pLPFNode->biquad, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_biquad_node_vtable =
-{
-    ma_biquad_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_biquad_node_init(ma_node_graph* pNodeGraph, const ma_biquad_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_biquad_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->biquad.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_biquad_init(&pConfig->biquad, pAllocationCallbacks, &pNode->biquad);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_biquad_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->biquad.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->biquad.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_biquad_node_reinit(const ma_biquad_config* pConfig, ma_biquad_node* pNode)
-{
-    ma_biquad_node* pLPFNode = (ma_biquad_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-
-    return ma_biquad_reinit(pConfig, &pLPFNode->biquad);
-}
-
-MA_API void ma_biquad_node_uninit(ma_biquad_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_biquad_node* pLPFNode = (ma_biquad_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_biquad_uninit(&pLPFNode->biquad, pAllocationCallbacks);
-}
-
-
-
-/*
-Low Pass Filter Node
-*/
-MA_API ma_lpf_node_config ma_lpf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order)
-{
-    ma_lpf_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.lpf = ma_lpf_config_init(ma_format_f32, channels, sampleRate, cutoffFrequency, order);
-
-    return config;
-}
-
-static void ma_lpf_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_lpf_node* pLPFNode = (ma_lpf_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_lpf_process_pcm_frames(&pLPFNode->lpf, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_lpf_node_vtable =
-{
-    ma_lpf_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_lpf_node_init(ma_node_graph* pNodeGraph, const ma_lpf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_lpf_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->lpf.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_lpf_init(&pConfig->lpf, pAllocationCallbacks, &pNode->lpf);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_lpf_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->lpf.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->lpf.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_lpf_node_reinit(const ma_lpf_config* pConfig, ma_lpf_node* pNode)
-{
-    ma_lpf_node* pLPFNode = (ma_lpf_node*)pNode;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_lpf_reinit(pConfig, &pLPFNode->lpf);
-}
-
-MA_API void ma_lpf_node_uninit(ma_lpf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_lpf_node* pLPFNode = (ma_lpf_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_lpf_uninit(&pLPFNode->lpf, pAllocationCallbacks);
-}
-
-
-
-/*
-High Pass Filter Node
-*/
-MA_API ma_hpf_node_config ma_hpf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order)
-{
-    ma_hpf_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.hpf = ma_hpf_config_init(ma_format_f32, channels, sampleRate, cutoffFrequency, order);
-
-    return config;
-}
-
-static void ma_hpf_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_hpf_node* pHPFNode = (ma_hpf_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_hpf_process_pcm_frames(&pHPFNode->hpf, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_hpf_node_vtable =
-{
-    ma_hpf_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_hpf_node_init(ma_node_graph* pNodeGraph, const ma_hpf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hpf_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->hpf.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_hpf_init(&pConfig->hpf, pAllocationCallbacks, &pNode->hpf);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_hpf_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->hpf.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->hpf.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_hpf_node_reinit(const ma_hpf_config* pConfig, ma_hpf_node* pNode)
-{
-    ma_hpf_node* pHPFNode = (ma_hpf_node*)pNode;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_hpf_reinit(pConfig, &pHPFNode->hpf);
-}
-
-MA_API void ma_hpf_node_uninit(ma_hpf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_hpf_node* pHPFNode = (ma_hpf_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_hpf_uninit(&pHPFNode->hpf, pAllocationCallbacks);
-}
-
-
-
-
-/*
-Band Pass Filter Node
-*/
-MA_API ma_bpf_node_config ma_bpf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double cutoffFrequency, ma_uint32 order)
-{
-    ma_bpf_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.bpf = ma_bpf_config_init(ma_format_f32, channels, sampleRate, cutoffFrequency, order);
-
-    return config;
-}
-
-static void ma_bpf_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_bpf_node* pBPFNode = (ma_bpf_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_bpf_process_pcm_frames(&pBPFNode->bpf, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_bpf_node_vtable =
-{
-    ma_bpf_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_bpf_node_init(ma_node_graph* pNodeGraph, const ma_bpf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_bpf_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->bpf.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_bpf_init(&pConfig->bpf, pAllocationCallbacks, &pNode->bpf);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_bpf_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->bpf.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->bpf.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_bpf_node_reinit(const ma_bpf_config* pConfig, ma_bpf_node* pNode)
-{
-    ma_bpf_node* pBPFNode = (ma_bpf_node*)pNode;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_bpf_reinit(pConfig, &pBPFNode->bpf);
-}
-
-MA_API void ma_bpf_node_uninit(ma_bpf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_bpf_node* pBPFNode = (ma_bpf_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_bpf_uninit(&pBPFNode->bpf, pAllocationCallbacks);
-}
-
-
-
-/*
-Notching Filter Node
-*/
-MA_API ma_notch_node_config ma_notch_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double q, double frequency)
-{
-    ma_notch_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.notch = ma_notch2_config_init(ma_format_f32, channels, sampleRate, q, frequency);
-
-    return config;
-}
-
-static void ma_notch_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_notch_node* pBPFNode = (ma_notch_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_notch2_process_pcm_frames(&pBPFNode->notch, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_notch_node_vtable =
-{
-    ma_notch_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_notch_node_init(ma_node_graph* pNodeGraph, const ma_notch_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_notch_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->notch.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_notch2_init(&pConfig->notch, pAllocationCallbacks, &pNode->notch);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_notch_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->notch.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->notch.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_notch_node_reinit(const ma_notch_config* pConfig, ma_notch_node* pNode)
-{
-    ma_notch_node* pNotchNode = (ma_notch_node*)pNode;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_notch2_reinit(pConfig, &pNotchNode->notch);
-}
-
-MA_API void ma_notch_node_uninit(ma_notch_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_notch_node* pNotchNode = (ma_notch_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_notch2_uninit(&pNotchNode->notch, pAllocationCallbacks);
-}
-
-
-
-/*
-Peaking Filter Node
-*/
-MA_API ma_peak_node_config ma_peak_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency)
-{
-    ma_peak_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.peak = ma_peak2_config_init(ma_format_f32, channels, sampleRate, gainDB, q, frequency);
-
-    return config;
-}
-
-static void ma_peak_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_peak_node* pBPFNode = (ma_peak_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_peak2_process_pcm_frames(&pBPFNode->peak, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_peak_node_vtable =
-{
-    ma_peak_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_peak_node_init(ma_node_graph* pNodeGraph, const ma_peak_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_peak_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->peak.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_peak2_init(&pConfig->peak, pAllocationCallbacks, &pNode->peak);
-    if (result != MA_SUCCESS) {
-        ma_node_uninit(pNode, pAllocationCallbacks);
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_peak_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->peak.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->peak.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_peak_node_reinit(const ma_peak_config* pConfig, ma_peak_node* pNode)
-{
-    ma_peak_node* pPeakNode = (ma_peak_node*)pNode;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_peak2_reinit(pConfig, &pPeakNode->peak);
-}
-
-MA_API void ma_peak_node_uninit(ma_peak_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_peak_node* pPeakNode = (ma_peak_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_peak2_uninit(&pPeakNode->peak, pAllocationCallbacks);
-}
-
-
-
-/*
-Low Shelf Filter Node
-*/
-MA_API ma_loshelf_node_config ma_loshelf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency)
-{
-    ma_loshelf_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.loshelf = ma_loshelf2_config_init(ma_format_f32, channels, sampleRate, gainDB, q, frequency);
-
-    return config;
-}
-
-static void ma_loshelf_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_loshelf_node* pBPFNode = (ma_loshelf_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_loshelf2_process_pcm_frames(&pBPFNode->loshelf, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_loshelf_node_vtable =
-{
-    ma_loshelf_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_loshelf_node_init(ma_node_graph* pNodeGraph, const ma_loshelf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_loshelf_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->loshelf.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_loshelf2_init(&pConfig->loshelf, pAllocationCallbacks, &pNode->loshelf);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_loshelf_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->loshelf.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->loshelf.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_loshelf_node_reinit(const ma_loshelf_config* pConfig, ma_loshelf_node* pNode)
-{
-    ma_loshelf_node* pLoshelfNode = (ma_loshelf_node*)pNode;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_loshelf2_reinit(pConfig, &pLoshelfNode->loshelf);
-}
-
-MA_API void ma_loshelf_node_uninit(ma_loshelf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_loshelf_node* pLoshelfNode = (ma_loshelf_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_loshelf2_uninit(&pLoshelfNode->loshelf, pAllocationCallbacks);
-}
-
-
-
-/*
-High Shelf Filter Node
-*/
-MA_API ma_hishelf_node_config ma_hishelf_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, double gainDB, double q, double frequency)
-{
-    ma_hishelf_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.hishelf = ma_hishelf2_config_init(ma_format_f32, channels, sampleRate, gainDB, q, frequency);
-
-    return config;
-}
-
-static void ma_hishelf_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_hishelf_node* pBPFNode = (ma_hishelf_node*)pNode;
-
-    MA_ASSERT(pNode != NULL);
-    (void)pFrameCountIn;
-
-    ma_hishelf2_process_pcm_frames(&pBPFNode->hishelf, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_hishelf_node_vtable =
-{
-    ma_hishelf_node_process_pcm_frames,
-    NULL,   /* onGetRequiredInputFrameCount */
-    1,      /* One input. */
-    1,      /* One output. */
-    0       /* Default flags. */
-};
-
-MA_API ma_result ma_hishelf_node_init(ma_node_graph* pNodeGraph, const ma_hishelf_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_hishelf_node* pNode)
-{
-    ma_result result;
-    ma_node_config baseNodeConfig;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pNode);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->hishelf.format != ma_format_f32) {
-        return MA_INVALID_ARGS; /* The format must be f32. */
-    }
-
-    result = ma_hishelf2_init(&pConfig->hishelf, pAllocationCallbacks, &pNode->hishelf);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseNodeConfig = ma_node_config_init();
-    baseNodeConfig.vtable          = &g_ma_hishelf_node_vtable;
-    baseNodeConfig.pInputChannels  = &pConfig->hishelf.channels;
-    baseNodeConfig.pOutputChannels = &pConfig->hishelf.channels;
-
-    result = ma_node_init(pNodeGraph, &baseNodeConfig, pAllocationCallbacks, pNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return result;
-}
-
-MA_API ma_result ma_hishelf_node_reinit(const ma_hishelf_config* pConfig, ma_hishelf_node* pNode)
-{
-    ma_hishelf_node* pHishelfNode = (ma_hishelf_node*)pNode;
-
-    if (pNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_hishelf2_reinit(pConfig, &pHishelfNode->hishelf);
-}
-
-MA_API void ma_hishelf_node_uninit(ma_hishelf_node* pNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_hishelf_node* pHishelfNode = (ma_hishelf_node*)pNode;
-
-    if (pNode == NULL) {
-        return;
-    }
-
-    ma_node_uninit(pNode, pAllocationCallbacks);
-    ma_hishelf2_uninit(&pHishelfNode->hishelf, pAllocationCallbacks);
-}
-
-
-
-
-MA_API ma_delay_node_config ma_delay_node_config_init(ma_uint32 channels, ma_uint32 sampleRate, ma_uint32 delayInFrames, float decay)
-{
-    ma_delay_node_config config;
-
-    config.nodeConfig = ma_node_config_init();
-    config.delay = ma_delay_config_init(channels, sampleRate, delayInFrames, decay);
-
-    return config;
-}
-
-
-static void ma_delay_node_process_pcm_frames(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_delay_node* pDelayNode = (ma_delay_node*)pNode;
-
-    (void)pFrameCountIn;
-
-    ma_delay_process_pcm_frames(&pDelayNode->delay, ppFramesOut[0], ppFramesIn[0], *pFrameCountOut);
-}
-
-static ma_node_vtable g_ma_delay_node_vtable =
-{
-    ma_delay_node_process_pcm_frames,
-    NULL,
-    1,  /* 1 input channels. */
-    1,  /* 1 output channel. */
-    MA_NODE_FLAG_CONTINUOUS_PROCESSING  /* Delay requires continuous processing to ensure the tail get's processed. */
-};
-
-MA_API ma_result ma_delay_node_init(ma_node_graph* pNodeGraph, const ma_delay_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_delay_node* pDelayNode)
-{
-    ma_result result;
-    ma_node_config baseConfig;
-
-    if (pDelayNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pDelayNode);
-
-    result = ma_delay_init(&pConfig->delay, pAllocationCallbacks, &pDelayNode->delay);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    baseConfig = pConfig->nodeConfig;
-    baseConfig.vtable          = &g_ma_delay_node_vtable;
-    baseConfig.pInputChannels  = &pConfig->delay.channels;
-    baseConfig.pOutputChannels = &pConfig->delay.channels;
-
-    result = ma_node_init(pNodeGraph, &baseConfig, pAllocationCallbacks, &pDelayNode->baseNode);
-    if (result != MA_SUCCESS) {
-        ma_delay_uninit(&pDelayNode->delay, pAllocationCallbacks);
-        return result;
-    }
-
-    return result;
-}
-
-MA_API void ma_delay_node_uninit(ma_delay_node* pDelayNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pDelayNode == NULL) {
-        return;
-    }
-
-    /* The base node is always uninitialized first. */
-    ma_node_uninit(pDelayNode, pAllocationCallbacks);
-    ma_delay_uninit(&pDelayNode->delay, pAllocationCallbacks);
-}
-
-MA_API void ma_delay_node_set_wet(ma_delay_node* pDelayNode, float value)
-{
-    if (pDelayNode == NULL) {
-        return;
-    }
-
-    ma_delay_set_wet(&pDelayNode->delay, value);
-}
-
-MA_API float ma_delay_node_get_wet(const ma_delay_node* pDelayNode)
-{
-    if (pDelayNode == NULL) {
-        return 0;
-    }
-
-    return ma_delay_get_wet(&pDelayNode->delay);
-}
-
-MA_API void ma_delay_node_set_dry(ma_delay_node* pDelayNode, float value)
-{
-    if (pDelayNode == NULL) {
-        return;
-    }
-
-    ma_delay_set_dry(&pDelayNode->delay, value);
-}
-
-MA_API float ma_delay_node_get_dry(const ma_delay_node* pDelayNode)
-{
-    if (pDelayNode == NULL) {
-        return 0;
-    }
-
-    return ma_delay_get_dry(&pDelayNode->delay);
-}
-
-MA_API void ma_delay_node_set_decay(ma_delay_node* pDelayNode, float value)
-{
-    if (pDelayNode == NULL) {
-        return;
-    }
-
-    ma_delay_set_decay(&pDelayNode->delay, value);
-}
-
-MA_API float ma_delay_node_get_decay(const ma_delay_node* pDelayNode)
-{
-    if (pDelayNode == NULL) {
-        return 0;
-    }
-
-    return ma_delay_get_decay(&pDelayNode->delay);
-}
-#endif  /* MA_NO_NODE_GRAPH */
-
-
-/* SECTION: miniaudio_engine.c */
-#if !defined(MA_NO_ENGINE) && !defined(MA_NO_NODE_GRAPH)
-/**************************************************************************************************************************************************************
-
-Engine
-
-**************************************************************************************************************************************************************/
-#define MA_SEEK_TARGET_NONE         (~(ma_uint64)0)
-
-
-static void ma_sound_set_at_end(ma_sound* pSound, ma_bool32 atEnd)
-{
-    MA_ASSERT(pSound != NULL);
-    ma_atomic_exchange_32(&pSound->atEnd, atEnd);
-
-    /* Fire any callbacks or events. */
-    if (atEnd) {
-        if (pSound->endCallback != NULL) {
-            pSound->endCallback(pSound->pEndCallbackUserData, pSound);
-        }
-    }
-}
-
-static ma_bool32 ma_sound_get_at_end(const ma_sound* pSound)
-{
-    MA_ASSERT(pSound != NULL);
-    return ma_atomic_load_32(&pSound->atEnd);
-}
-
-
-MA_API ma_engine_node_config ma_engine_node_config_init(ma_engine* pEngine, ma_engine_node_type type, ma_uint32 flags)
-{
-    ma_engine_node_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.pEngine                  = pEngine;
-    config.type                     = type;
-    config.isPitchDisabled          = (flags & MA_SOUND_FLAG_NO_PITCH) != 0;
-    config.isSpatializationDisabled = (flags & MA_SOUND_FLAG_NO_SPATIALIZATION) != 0;
-    config.monoExpansionMode        = pEngine->monoExpansionMode;
-
-    return config;
-}
-
-
-static void ma_engine_node_update_pitch_if_required(ma_engine_node* pEngineNode)
-{
-    ma_bool32 isUpdateRequired = MA_FALSE;
-    float newPitch;
-
-    MA_ASSERT(pEngineNode != NULL);
-
-    newPitch = ma_atomic_load_explicit_f32(&pEngineNode->pitch, ma_atomic_memory_order_acquire);
-
-    if (pEngineNode->oldPitch != newPitch) {
-        pEngineNode->oldPitch  = newPitch;
-        isUpdateRequired = MA_TRUE;
-    }
-
-    if (pEngineNode->oldDopplerPitch != pEngineNode->spatializer.dopplerPitch) {
-        pEngineNode->oldDopplerPitch  = pEngineNode->spatializer.dopplerPitch;
-        isUpdateRequired = MA_TRUE;
-    }
-
-    if (isUpdateRequired) {
-        float basePitch = (float)pEngineNode->sampleRate / ma_engine_get_sample_rate(pEngineNode->pEngine);
-        ma_linear_resampler_set_rate_ratio(&pEngineNode->resampler, basePitch * pEngineNode->oldPitch * pEngineNode->oldDopplerPitch);
-    }
-}
-
-static ma_bool32 ma_engine_node_is_pitching_enabled(const ma_engine_node* pEngineNode)
-{
-    MA_ASSERT(pEngineNode != NULL);
-
-    /* Don't try to be clever by skiping resampling in the pitch=1 case or else you'll glitch when moving away from 1. */
-    return !ma_atomic_load_explicit_32(&pEngineNode->isPitchDisabled, ma_atomic_memory_order_acquire);
-}
-
-static ma_bool32 ma_engine_node_is_spatialization_enabled(const ma_engine_node* pEngineNode)
-{
-    MA_ASSERT(pEngineNode != NULL);
-
-    return !ma_atomic_load_explicit_32(&pEngineNode->isSpatializationDisabled, ma_atomic_memory_order_acquire);
-}
-
-static ma_uint64 ma_engine_node_get_required_input_frame_count(const ma_engine_node* pEngineNode, ma_uint64 outputFrameCount)
-{
-    ma_uint64 inputFrameCount = 0;
-
-    if (ma_engine_node_is_pitching_enabled(pEngineNode)) {
-        ma_result result = ma_linear_resampler_get_required_input_frame_count(&pEngineNode->resampler, outputFrameCount, &inputFrameCount);
-        if (result != MA_SUCCESS) {
-            inputFrameCount = 0;
-        }
-    } else {
-        inputFrameCount = outputFrameCount;    /* No resampling, so 1:1. */
-    }
-
-    return inputFrameCount;
-}
-
-static ma_result ma_engine_node_set_volume(ma_engine_node* pEngineNode, float volume)
-{
-    if (pEngineNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    ma_atomic_float_set(&pEngineNode->volume, volume);
-
-    /* If we're not smoothing we should bypass the volume gainer entirely. */
-    if (pEngineNode->volumeSmoothTimeInPCMFrames == 0) {
-        /* We should always have an active spatializer because it can be enabled and disabled dynamically. We can just use that for hodling our volume. */
-        ma_spatializer_set_master_volume(&pEngineNode->spatializer, volume);
-    } else {
-        /* We're using volume smoothing, so apply the master volume to the gainer. */
-        ma_gainer_set_gain(&pEngineNode->volumeGainer, volume);
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_engine_node_get_volume(const ma_engine_node* pEngineNode, float* pVolume)
-{
-    if (pVolume == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pVolume = 0.0f;
-
-    if (pEngineNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pVolume = ma_atomic_float_get((ma_atomic_float*)&pEngineNode->volume);
-
-    return MA_SUCCESS;
-}
-
-
-static void ma_engine_node_process_pcm_frames__general(ma_engine_node* pEngineNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    ma_uint32 frameCountIn;
-    ma_uint32 frameCountOut;
-    ma_uint32 totalFramesProcessedIn;
-    ma_uint32 totalFramesProcessedOut;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_bool32 isPitchingEnabled;
-    ma_bool32 isFadingEnabled;
-    ma_bool32 isSpatializationEnabled;
-    ma_bool32 isPanningEnabled;
-    ma_bool32 isVolumeSmoothingEnabled;
-
-    frameCountIn  = *pFrameCountIn;
-    frameCountOut = *pFrameCountOut;
-
-    channelsIn  = ma_spatializer_get_input_channels(&pEngineNode->spatializer);
-    channelsOut = ma_spatializer_get_output_channels(&pEngineNode->spatializer);
-
-    totalFramesProcessedIn  = 0;
-    totalFramesProcessedOut = 0;
-
-    /* Update the fader if applicable. */
-    {
-        ma_uint64 fadeLengthInFrames = ma_atomic_uint64_get(&pEngineNode->fadeSettings.fadeLengthInFrames);
-        if (fadeLengthInFrames != ~(ma_uint64)0) {
-            float fadeVolumeBeg = ma_atomic_float_get(&pEngineNode->fadeSettings.volumeBeg);
-            float fadeVolumeEnd = ma_atomic_float_get(&pEngineNode->fadeSettings.volumeEnd);
-            ma_int64 fadeStartOffsetInFrames = (ma_int64)ma_atomic_uint64_get(&pEngineNode->fadeSettings.absoluteGlobalTimeInFrames);
-            if (fadeStartOffsetInFrames == (ma_int64)(~(ma_uint64)0)) {
-                fadeStartOffsetInFrames = 0;
-            } else {
-                fadeStartOffsetInFrames -= ma_engine_get_time_in_pcm_frames(pEngineNode->pEngine);
-            }
-
-            ma_fader_set_fade_ex(&pEngineNode->fader, fadeVolumeBeg, fadeVolumeEnd, fadeLengthInFrames, fadeStartOffsetInFrames);
-
-            /* Reset the fade length so we don't erroneously apply it again. */
-            ma_atomic_uint64_set(&pEngineNode->fadeSettings.fadeLengthInFrames, ~(ma_uint64)0);
-        }
-    }
-
-    isPitchingEnabled        = ma_engine_node_is_pitching_enabled(pEngineNode);
-    isFadingEnabled          = pEngineNode->fader.volumeBeg != 1 || pEngineNode->fader.volumeEnd != 1;
-    isSpatializationEnabled  = ma_engine_node_is_spatialization_enabled(pEngineNode);
-    isPanningEnabled         = pEngineNode->panner.pan != 0 && channelsOut != 1;
-    isVolumeSmoothingEnabled = pEngineNode->volumeSmoothTimeInPCMFrames > 0;
-
-    /* Keep going while we've still got data available for processing. */
-    while (totalFramesProcessedOut < frameCountOut) {
-        /*
-        We need to process in a specific order. We always do resampling first because it's likely
-        we're going to be increasing the channel count after spatialization. Also, I want to do
-        fading based on the output sample rate.
-
-        We'll first read into a buffer from the resampler. Then we'll do all processing that
-        operates on the on the input channel count. We'll then get the spatializer to output to
-        the output buffer and then do all effects from that point directly in the output buffer
-        in-place.
-
-        Note that we're always running the resampler if pitching is enabled, even when the pitch
-        is 1. If we try to be clever and skip resampling when the pitch is 1, we'll get a glitch
-        when we move away from 1, back to 1, and then away from 1 again. We'll want to implement
-        any pitch=1 optimizations in the resampler itself.
-
-        There's a small optimization here that we'll utilize since it might be a fairly common
-        case. When the input and output channel counts are the same, we'll read straight into the
-        output buffer from the resampler and do everything in-place.
-        */
-        const float* pRunningFramesIn;
-        float* pRunningFramesOut;
-        float* pWorkingBuffer;   /* This is the buffer that we'll be processing frames in. This is in input channels. */
-        float temp[MA_DATA_CONVERTER_STACK_BUFFER_SIZE / sizeof(float)];
-        ma_uint32 tempCapInFrames = ma_countof(temp) / channelsIn;
-        ma_uint32 framesAvailableIn;
-        ma_uint32 framesAvailableOut;
-        ma_uint32 framesJustProcessedIn;
-        ma_uint32 framesJustProcessedOut;
-        ma_bool32 isWorkingBufferValid = MA_FALSE;
-
-        framesAvailableIn  = frameCountIn  - totalFramesProcessedIn;
-        framesAvailableOut = frameCountOut - totalFramesProcessedOut;
-
-        pRunningFramesIn  = ma_offset_pcm_frames_const_ptr_f32(ppFramesIn[0], totalFramesProcessedIn, channelsIn);
-        pRunningFramesOut = ma_offset_pcm_frames_ptr_f32(ppFramesOut[0], totalFramesProcessedOut, channelsOut);
-
-        if (channelsIn == channelsOut) {
-            /* Fast path. Channel counts are the same. No need for an intermediary input buffer. */
-            pWorkingBuffer = pRunningFramesOut;
-        } else {
-            /* Slow path. Channel counts are different. Need to use an intermediary input buffer. */
-            pWorkingBuffer = temp;
-            if (framesAvailableOut > tempCapInFrames) {
-                framesAvailableOut = tempCapInFrames;
-            }
-        }
-
-        /* First is resampler. */
-        if (isPitchingEnabled) {
-            ma_uint64 resampleFrameCountIn  = framesAvailableIn;
-            ma_uint64 resampleFrameCountOut = framesAvailableOut;
-
-            ma_linear_resampler_process_pcm_frames(&pEngineNode->resampler, pRunningFramesIn, &resampleFrameCountIn, pWorkingBuffer, &resampleFrameCountOut);
-            isWorkingBufferValid = MA_TRUE;
-
-            framesJustProcessedIn  = (ma_uint32)resampleFrameCountIn;
-            framesJustProcessedOut = (ma_uint32)resampleFrameCountOut;
-        } else {
-            framesJustProcessedIn  = ma_min(framesAvailableIn, framesAvailableOut);
-            framesJustProcessedOut = framesJustProcessedIn; /* When no resampling is being performed, the number of output frames is the same as input frames. */
-        }
-
-        /* Fading. */
-        if (isFadingEnabled) {
-            if (isWorkingBufferValid) {
-                ma_fader_process_pcm_frames(&pEngineNode->fader, pWorkingBuffer, pWorkingBuffer, framesJustProcessedOut);   /* In-place processing. */
-            } else {
-                ma_fader_process_pcm_frames(&pEngineNode->fader, pWorkingBuffer, pRunningFramesIn, framesJustProcessedOut);
-                isWorkingBufferValid = MA_TRUE;
-            }
-        }
-
-        /*
-        If we're using smoothing, we won't be applying volume via the spatializer, but instead from a ma_gainer. In this case
-        we'll want to apply our volume now.
-        */
-        if (isVolumeSmoothingEnabled) {
-            if (isWorkingBufferValid) {
-                ma_gainer_process_pcm_frames(&pEngineNode->volumeGainer, pWorkingBuffer, pWorkingBuffer, framesJustProcessedOut);
-            } else {
-                ma_gainer_process_pcm_frames(&pEngineNode->volumeGainer, pWorkingBuffer, pRunningFramesIn, framesJustProcessedOut);
-                isWorkingBufferValid = MA_TRUE;
-            }
-        }
-
-        /*
-        If at this point we still haven't actually done anything with the working buffer we need
-        to just read straight from the input buffer.
-        */
-        if (isWorkingBufferValid == MA_FALSE) {
-            pWorkingBuffer = (float*)pRunningFramesIn;  /* Naughty const cast, but it's safe at this point because we won't ever be writing to it from this point out. */
-        }
-
-        /* Spatialization. */
-        if (isSpatializationEnabled) {
-            ma_uint32 iListener;
-
-            /*
-            When determining the listener to use, we first check to see if the sound is pinned to a
-            specific listener. If so, we use that. Otherwise we just use the closest listener.
-            */
-            if (pEngineNode->pinnedListenerIndex != MA_LISTENER_INDEX_CLOSEST && pEngineNode->pinnedListenerIndex < ma_engine_get_listener_count(pEngineNode->pEngine)) {
-                iListener = pEngineNode->pinnedListenerIndex;
-            } else {
-                ma_vec3f spatializerPosition = ma_spatializer_get_position(&pEngineNode->spatializer);
-                iListener = ma_engine_find_closest_listener(pEngineNode->pEngine, spatializerPosition.x, spatializerPosition.y, spatializerPosition.z);
-            }
-
-            ma_spatializer_process_pcm_frames(&pEngineNode->spatializer, &pEngineNode->pEngine->listeners[iListener], pRunningFramesOut, pWorkingBuffer, framesJustProcessedOut);
-        } else {
-            /* No spatialization, but we still need to do channel conversion and master volume. */
-            float volume;
-            ma_engine_node_get_volume(pEngineNode, &volume);    /* Should never fail. */
-
-            if (channelsIn == channelsOut) {
-                /* No channel conversion required. Just copy straight to the output buffer. */
-                if (isVolumeSmoothingEnabled) {
-                    /* Volume has already been applied. Just copy straight to the output buffer. */
-                    ma_copy_pcm_frames(pRunningFramesOut, pWorkingBuffer, framesJustProcessedOut * channelsOut, ma_format_f32, channelsOut);
-                } else {
-                    /* Volume has not been applied yet. Copy and apply volume in the same pass. */
-                    ma_copy_and_apply_volume_factor_f32(pRunningFramesOut, pWorkingBuffer, framesJustProcessedOut * channelsOut, volume);
-                }
-            } else {
-                /* Channel conversion required. TODO: Add support for channel maps here. */
-                ma_channel_map_apply_f32(pRunningFramesOut, NULL, channelsOut, pWorkingBuffer, NULL, channelsIn, framesJustProcessedOut, ma_channel_mix_mode_simple, pEngineNode->monoExpansionMode);
-
-                /* If we're using smoothing, the volume will have already been applied. */
-                if (!isVolumeSmoothingEnabled) {
-                    ma_apply_volume_factor_f32(pRunningFramesOut, framesJustProcessedOut * channelsOut, volume);
-                }
-            }
-        }
-
-        /* At this point we can guarantee that the output buffer contains valid data. We can process everything in place now. */
-
-        /* Panning. */
-        if (isPanningEnabled) {
-            ma_panner_process_pcm_frames(&pEngineNode->panner, pRunningFramesOut, pRunningFramesOut, framesJustProcessedOut);   /* In-place processing. */
-        }
-
-        /* We're done for this chunk. */
-        totalFramesProcessedIn  += framesJustProcessedIn;
-        totalFramesProcessedOut += framesJustProcessedOut;
-
-        /* If we didn't process any output frames this iteration it means we've either run out of input data, or run out of room in the output buffer. */
-        if (framesJustProcessedOut == 0) {
-            break;
-        }
-    }
-
-    /* At this point we're done processing. */
-    *pFrameCountIn  = totalFramesProcessedIn;
-    *pFrameCountOut = totalFramesProcessedOut;
-}
-
-static void ma_engine_node_process_pcm_frames__sound(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    /* For sounds, we need to first read from the data source. Then we need to apply the engine effects (pan, pitch, fades, etc.). */
-    ma_result result = MA_SUCCESS;
-    ma_sound* pSound = (ma_sound*)pNode;
-    ma_uint32 frameCount = *pFrameCountOut;
-    ma_uint32 totalFramesRead = 0;
-    ma_format dataSourceFormat;
-    ma_uint32 dataSourceChannels;
-    ma_uint8 temp[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-    ma_uint32 tempCapInFrames;
-    ma_uint64 seekTarget;
-
-    /* This is a data source node which means no input buses. */
-    (void)ppFramesIn;
-    (void)pFrameCountIn;
-
-    /* If we're marked at the end we need to stop the sound and do nothing. */
-    if (ma_sound_at_end(pSound)) {
-        ma_sound_stop(pSound);
-        *pFrameCountOut = 0;
-        return;
-    }
-
-    /* If we're seeking, do so now before reading. */
-    seekTarget = ma_atomic_load_64(&pSound->seekTarget);
-    if (seekTarget != MA_SEEK_TARGET_NONE) {
-        ma_data_source_seek_to_pcm_frame(pSound->pDataSource, seekTarget);
-
-        /* Any time-dependant effects need to have their times updated. */
-        ma_node_set_time(pSound, seekTarget);
-
-        ma_atomic_exchange_64(&pSound->seekTarget, MA_SEEK_TARGET_NONE);
-    }
-
-    /*
-    We want to update the pitch once. For sounds, this can be either at the start or at the end. If
-    we don't force this to only ever be updating once, we could end up in a situation where
-    retrieving the required input frame count ends up being different to what we actually retrieve.
-    What could happen is that the required input frame count is calculated, the pitch is update,
-    and then this processing function is called resulting in a different number of input frames
-    being processed. Do not call this in ma_engine_node_process_pcm_frames__general() or else
-    you'll hit the aforementioned bug.
-    */
-    ma_engine_node_update_pitch_if_required(&pSound->engineNode);
-
-    /*
-    For the convenience of the caller, we're doing to allow data sources to use non-floating-point formats and channel counts that differ
-    from the main engine.
-    */
-    result = ma_data_source_get_data_format(pSound->pDataSource, &dataSourceFormat, &dataSourceChannels, NULL, NULL, 0);
-    if (result == MA_SUCCESS) {
-        tempCapInFrames = sizeof(temp) / ma_get_bytes_per_frame(dataSourceFormat, dataSourceChannels);
-
-        /* Keep reading until we've read as much as was requested or we reach the end of the data source. */
-        while (totalFramesRead < frameCount) {
-            ma_uint32 framesRemaining = frameCount - totalFramesRead;
-            ma_uint32 framesToRead;
-            ma_uint64 framesJustRead;
-            ma_uint32 frameCountIn;
-            ma_uint32 frameCountOut;
-            const float* pRunningFramesIn;
-            float* pRunningFramesOut;
-
-            /*
-            The first thing we need to do is read into the temporary buffer. We can calculate exactly
-            how many input frames we'll need after resampling.
-            */
-            framesToRead = (ma_uint32)ma_engine_node_get_required_input_frame_count(&pSound->engineNode, framesRemaining);
-            if (framesToRead > tempCapInFrames) {
-                framesToRead = tempCapInFrames;
-            }
-
-            result = ma_data_source_read_pcm_frames(pSound->pDataSource, temp, framesToRead, &framesJustRead);
-
-            /* If we reached the end of the sound we'll want to mark it as at the end and stop it. This should never be returned for looping sounds. */
-            if (result == MA_AT_END) {
-                ma_sound_set_at_end(pSound, MA_TRUE);   /* This will be set to false in ma_sound_start(). */
-            }
-
-            pRunningFramesOut = ma_offset_pcm_frames_ptr_f32(ppFramesOut[0], totalFramesRead, ma_engine_get_channels(ma_sound_get_engine(pSound)));
-
-            frameCountIn = (ma_uint32)framesJustRead;
-            frameCountOut = framesRemaining;
-
-            /* Convert if necessary. */
-            if (dataSourceFormat == ma_format_f32) {
-                /* Fast path. No data conversion necessary. */
-                pRunningFramesIn = (float*)temp;
-                ma_engine_node_process_pcm_frames__general(&pSound->engineNode, &pRunningFramesIn, &frameCountIn, &pRunningFramesOut, &frameCountOut);
-            } else {
-                /* Slow path. Need to do sample format conversion to f32. If we give the f32 buffer the same count as the first temp buffer, we're guaranteed it'll be large enough. */
-                float tempf32[MA_DATA_CONVERTER_STACK_BUFFER_SIZE]; /* Do not do `MA_DATA_CONVERTER_STACK_BUFFER_SIZE/sizeof(float)` here like we've done in other places. */
-                ma_convert_pcm_frames_format(tempf32, ma_format_f32, temp, dataSourceFormat, framesJustRead, dataSourceChannels, ma_dither_mode_none);
-
-                /* Now that we have our samples in f32 format we can process like normal. */
-                pRunningFramesIn = tempf32;
-                ma_engine_node_process_pcm_frames__general(&pSound->engineNode, &pRunningFramesIn, &frameCountIn, &pRunningFramesOut, &frameCountOut);
-            }
-
-            /* We should have processed all of our input frames since we calculated the required number of input frames at the top. */
-            MA_ASSERT(frameCountIn == framesJustRead);
-            totalFramesRead += (ma_uint32)frameCountOut;   /* Safe cast. */
-
-            if (result != MA_SUCCESS || ma_sound_at_end(pSound)) {
-                break;  /* Might have reached the end. */
-            }
-        }
-    }
-
-    *pFrameCountOut = totalFramesRead;
-}
-
-static void ma_engine_node_process_pcm_frames__group(ma_node* pNode, const float** ppFramesIn, ma_uint32* pFrameCountIn, float** ppFramesOut, ma_uint32* pFrameCountOut)
-{
-    /*
-    Make sure the pitch is updated before trying to read anything. It's important that this is done
-    only once and not in ma_engine_node_process_pcm_frames__general(). The reason for this is that
-    ma_engine_node_process_pcm_frames__general() will call ma_engine_node_get_required_input_frame_count(),
-    and if another thread modifies the pitch just after that call it can result in a glitch due to
-    the input rate changing.
-    */
-    ma_engine_node_update_pitch_if_required((ma_engine_node*)pNode);
-
-    /* For groups, the input data has already been read and we just need to apply the effect. */
-    ma_engine_node_process_pcm_frames__general((ma_engine_node*)pNode, ppFramesIn, pFrameCountIn, ppFramesOut, pFrameCountOut);
-}
-
-static ma_result ma_engine_node_get_required_input_frame_count__group(ma_node* pNode, ma_uint32 outputFrameCount, ma_uint32* pInputFrameCount)
-{
-    ma_uint64 inputFrameCount;
-
-    MA_ASSERT(pInputFrameCount != NULL);
-
-    /* Our pitch will affect this calculation. We need to update it. */
-    ma_engine_node_update_pitch_if_required((ma_engine_node*)pNode);
-
-    inputFrameCount = ma_engine_node_get_required_input_frame_count((ma_engine_node*)pNode, outputFrameCount);
-    if (inputFrameCount > 0xFFFFFFFF) {
-        inputFrameCount = 0xFFFFFFFF;    /* Will never happen because miniaudio will only ever process in relatively small chunks. */
-    }
-
-    *pInputFrameCount = (ma_uint32)inputFrameCount;
-
-    return MA_SUCCESS;
-}
-
-
-static ma_node_vtable g_ma_engine_node_vtable__sound =
-{
-    ma_engine_node_process_pcm_frames__sound,
-    NULL,   /* onGetRequiredInputFrameCount */
-    0,      /* Sounds are data source nodes which means they have zero inputs (their input is drawn from the data source itself). */
-    1,      /* Sounds have one output bus. */
-    0       /* Default flags. */
-};
-
-static ma_node_vtable g_ma_engine_node_vtable__group =
-{
-    ma_engine_node_process_pcm_frames__group,
-    ma_engine_node_get_required_input_frame_count__group,
-    1,      /* Groups have one input bus. */
-    1,      /* Groups have one output bus. */
-    MA_NODE_FLAG_DIFFERENT_PROCESSING_RATES /* The engine node does resampling so should let miniaudio know about it. */
-};
-
-
-
-static ma_node_config ma_engine_node_base_node_config_init(const ma_engine_node_config* pConfig)
-{
-    ma_node_config baseNodeConfig;
-
-    if (pConfig->type == ma_engine_node_type_sound) {
-        /* Sound. */
-        baseNodeConfig = ma_node_config_init();
-        baseNodeConfig.vtable       = &g_ma_engine_node_vtable__sound;
-        baseNodeConfig.initialState = ma_node_state_stopped;    /* Sounds are stopped by default. */
-    } else {
-        /* Group. */
-        baseNodeConfig = ma_node_config_init();
-        baseNodeConfig.vtable       = &g_ma_engine_node_vtable__group;
-        baseNodeConfig.initialState = ma_node_state_started;    /* Groups are started by default. */
-    }
-
-    return baseNodeConfig;
-}
-
-static ma_spatializer_config ma_engine_node_spatializer_config_init(const ma_node_config* pBaseNodeConfig)
-{
-    return ma_spatializer_config_init(pBaseNodeConfig->pInputChannels[0], pBaseNodeConfig->pOutputChannels[0]);
-}
-
-typedef struct
-{
-    size_t sizeInBytes;
-    size_t baseNodeOffset;
-    size_t resamplerOffset;
-    size_t spatializerOffset;
-    size_t gainerOffset;
-} ma_engine_node_heap_layout;
-
-static ma_result ma_engine_node_get_heap_layout(const ma_engine_node_config* pConfig, ma_engine_node_heap_layout* pHeapLayout)
-{
-    ma_result result;
-    size_t tempHeapSize;
-    ma_node_config baseNodeConfig;
-    ma_linear_resampler_config resamplerConfig;
-    ma_spatializer_config spatializerConfig;
-    ma_gainer_config gainerConfig;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_channel defaultStereoChannelMap[2] = {MA_CHANNEL_SIDE_LEFT, MA_CHANNEL_SIDE_RIGHT};  /* <-- Consistent with the default channel map of a stereo listener. Means channel conversion can run on a fast path. */
-
-    MA_ASSERT(pHeapLayout);
-
-    MA_ZERO_OBJECT(pHeapLayout);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    if (pConfig->pEngine == NULL) {
-        return MA_INVALID_ARGS; /* An engine must be specified. */
-    }
-
-    pHeapLayout->sizeInBytes = 0;
-
-    channelsIn  = (pConfig->channelsIn  != 0) ? pConfig->channelsIn  : ma_engine_get_channels(pConfig->pEngine);
-    channelsOut = (pConfig->channelsOut != 0) ? pConfig->channelsOut : ma_engine_get_channels(pConfig->pEngine);
-
-
-    /* Base node. */
-    baseNodeConfig = ma_engine_node_base_node_config_init(pConfig);
-    baseNodeConfig.pInputChannels  = &channelsIn;
-    baseNodeConfig.pOutputChannels = &channelsOut;
-
-    result = ma_node_get_heap_size(ma_engine_get_node_graph(pConfig->pEngine), &baseNodeConfig, &tempHeapSize);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to retrieve the size of the heap for the base node. */
-    }
-
-    pHeapLayout->baseNodeOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(tempHeapSize);
-
-
-    /* Resmapler. */
-    resamplerConfig = ma_linear_resampler_config_init(ma_format_f32, channelsIn, 1, 1); /* Input and output sample rates don't affect the calculation of the heap size. */
-    resamplerConfig.lpfOrder = 0;
-
-    result = ma_linear_resampler_get_heap_size(&resamplerConfig, &tempHeapSize);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to retrieve the size of the heap for the resampler. */
-    }
-
-    pHeapLayout->resamplerOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(tempHeapSize);
-
-
-    /* Spatializer. */
-    spatializerConfig = ma_engine_node_spatializer_config_init(&baseNodeConfig);
-
-    if (spatializerConfig.channelsIn == 2) {
-        spatializerConfig.pChannelMapIn = defaultStereoChannelMap;
-    }
-
-    result = ma_spatializer_get_heap_size(&spatializerConfig, &tempHeapSize);
-    if (result != MA_SUCCESS) {
-        return result;  /* Failed to retrieve the size of the heap for the spatializer. */
-    }
-
-    pHeapLayout->spatializerOffset = pHeapLayout->sizeInBytes;
-    pHeapLayout->sizeInBytes += ma_align_64(tempHeapSize);
-
-
-    /* Gainer. Will not be used if we are not using smoothing. */
-    if (pConfig->volumeSmoothTimeInPCMFrames > 0) {
-        gainerConfig = ma_gainer_config_init(channelsIn, pConfig->volumeSmoothTimeInPCMFrames);
-
-        result = ma_gainer_get_heap_size(&gainerConfig, &tempHeapSize);
-        if (result != MA_SUCCESS) {
-            return result;
-        }
-
-        pHeapLayout->gainerOffset = pHeapLayout->sizeInBytes;
-        pHeapLayout->sizeInBytes += ma_align_64(tempHeapSize);
-    }
-
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_engine_node_get_heap_size(const ma_engine_node_config* pConfig, size_t* pHeapSizeInBytes)
-{
-    ma_result result;
-    ma_engine_node_heap_layout heapLayout;
-
-    if (pHeapSizeInBytes == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    *pHeapSizeInBytes = 0;
-
-    result = ma_engine_node_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    *pHeapSizeInBytes = heapLayout.sizeInBytes;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_engine_node_init_preallocated(const ma_engine_node_config* pConfig, void* pHeap, ma_engine_node* pEngineNode)
-{
-    ma_result result;
-    ma_engine_node_heap_layout heapLayout;
-    ma_node_config baseNodeConfig;
-    ma_linear_resampler_config resamplerConfig;
-    ma_fader_config faderConfig;
-    ma_spatializer_config spatializerConfig;
-    ma_panner_config pannerConfig;
-    ma_gainer_config gainerConfig;
-    ma_uint32 channelsIn;
-    ma_uint32 channelsOut;
-    ma_channel defaultStereoChannelMap[2] = {MA_CHANNEL_SIDE_LEFT, MA_CHANNEL_SIDE_RIGHT};  /* <-- Consistent with the default channel map of a stereo listener. Means channel conversion can run on a fast path. */
-
-    if (pEngineNode == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pEngineNode);
-
-    result = ma_engine_node_get_heap_layout(pConfig, &heapLayout);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pConfig->pinnedListenerIndex != MA_LISTENER_INDEX_CLOSEST && pConfig->pinnedListenerIndex >= ma_engine_get_listener_count(pConfig->pEngine)) {
-        return MA_INVALID_ARGS; /* Invalid listener. */
-    }
-
-    pEngineNode->_pHeap = pHeap;
-    MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
-
-    pEngineNode->pEngine                     = pConfig->pEngine;
-    pEngineNode->sampleRate                  = (pConfig->sampleRate > 0) ? pConfig->sampleRate : ma_engine_get_sample_rate(pEngineNode->pEngine);
-    pEngineNode->volumeSmoothTimeInPCMFrames = pConfig->volumeSmoothTimeInPCMFrames;
-    pEngineNode->monoExpansionMode           = pConfig->monoExpansionMode;
-    ma_atomic_float_set(&pEngineNode->volume, 1);
-    pEngineNode->pitch                       = 1;
-    pEngineNode->oldPitch                    = 1;
-    pEngineNode->oldDopplerPitch             = 1;
-    pEngineNode->isPitchDisabled             = pConfig->isPitchDisabled;
-    pEngineNode->isSpatializationDisabled    = pConfig->isSpatializationDisabled;
-    pEngineNode->pinnedListenerIndex         = pConfig->pinnedListenerIndex;
-    ma_atomic_float_set(&pEngineNode->fadeSettings.volumeBeg, 1);
-    ma_atomic_float_set(&pEngineNode->fadeSettings.volumeEnd, 1);
-    ma_atomic_uint64_set(&pEngineNode->fadeSettings.fadeLengthInFrames, (~(ma_uint64)0));
-    ma_atomic_uint64_set(&pEngineNode->fadeSettings.absoluteGlobalTimeInFrames, (~(ma_uint64)0));   /* <-- Indicates that the fade should start immediately. */
-
-    channelsIn  = (pConfig->channelsIn  != 0) ? pConfig->channelsIn  : ma_engine_get_channels(pConfig->pEngine);
-    channelsOut = (pConfig->channelsOut != 0) ? pConfig->channelsOut : ma_engine_get_channels(pConfig->pEngine);
-
-    /*
-    If the sample rate of the sound is different to the engine, make sure pitching is enabled so that the resampler
-    is activated. Not doing this will result in the sound not being resampled if MA_SOUND_FLAG_NO_PITCH is used.
-    */
-    if (pEngineNode->sampleRate != ma_engine_get_sample_rate(pEngineNode->pEngine)) {
-        pEngineNode->isPitchDisabled = MA_FALSE;
-    }
-
-
-    /* Base node. */
-    baseNodeConfig = ma_engine_node_base_node_config_init(pConfig);
-    baseNodeConfig.pInputChannels  = &channelsIn;
-    baseNodeConfig.pOutputChannels = &channelsOut;
-
-    result = ma_node_init_preallocated(&pConfig->pEngine->nodeGraph, &baseNodeConfig, ma_offset_ptr(pHeap, heapLayout.baseNodeOffset), &pEngineNode->baseNode);
-    if (result != MA_SUCCESS) {
-        goto error0;
-    }
-
-
-    /*
-    We can now initialize the effects we need in order to implement the engine node. There's a
-    defined order of operations here, mainly centered around when we convert our channels from the
-    data source's native channel count to the engine's channel count. As a rule, we want to do as
-    much computation as possible before spatialization because there's a chance that will increase
-    the channel count, thereby increasing the amount of work needing to be done to process.
-    */
-
-    /* We'll always do resampling first. */
-    resamplerConfig = ma_linear_resampler_config_init(ma_format_f32, baseNodeConfig.pInputChannels[0], pEngineNode->sampleRate, ma_engine_get_sample_rate(pEngineNode->pEngine));
-    resamplerConfig.lpfOrder = 0;    /* <-- Need to disable low-pass filtering for pitch shifting for now because there's cases where the biquads are becoming unstable. Need to figure out a better fix for this. */
-
-    result = ma_linear_resampler_init_preallocated(&resamplerConfig, ma_offset_ptr(pHeap, heapLayout.resamplerOffset), &pEngineNode->resampler);
-    if (result != MA_SUCCESS) {
-        goto error1;
-    }
-
-
-    /* After resampling will come the fader. */
-    faderConfig = ma_fader_config_init(ma_format_f32, baseNodeConfig.pInputChannels[0], ma_engine_get_sample_rate(pEngineNode->pEngine));
-
-    result = ma_fader_init(&faderConfig, &pEngineNode->fader);
-    if (result != MA_SUCCESS) {
-        goto error2;
-    }
-
-
-    /*
-    Spatialization comes next. We spatialize based ont he node's output channel count. It's up the caller to
-    ensure channels counts link up correctly in the node graph.
-    */
-    spatializerConfig = ma_engine_node_spatializer_config_init(&baseNodeConfig);
-    spatializerConfig.gainSmoothTimeInFrames = pEngineNode->pEngine->gainSmoothTimeInFrames;
-
-    if (spatializerConfig.channelsIn == 2) {
-        spatializerConfig.pChannelMapIn = defaultStereoChannelMap;
-    }
-
-    result = ma_spatializer_init_preallocated(&spatializerConfig, ma_offset_ptr(pHeap, heapLayout.spatializerOffset), &pEngineNode->spatializer);
-    if (result != MA_SUCCESS) {
-        goto error2;
-    }
-
-
-    /*
-    After spatialization comes panning. We need to do this after spatialization because otherwise we wouldn't
-    be able to pan mono sounds.
-    */
-    pannerConfig = ma_panner_config_init(ma_format_f32, baseNodeConfig.pOutputChannels[0]);
-
-    result = ma_panner_init(&pannerConfig, &pEngineNode->panner);
-    if (result != MA_SUCCESS) {
-        goto error3;
-    }
-
-
-    /* We'll need a gainer for smoothing out volume changes if we have a non-zero smooth time. We apply this before converting to the output channel count. */
-    if (pConfig->volumeSmoothTimeInPCMFrames > 0) {
-        gainerConfig = ma_gainer_config_init(channelsIn, pConfig->volumeSmoothTimeInPCMFrames);
-
-        result = ma_gainer_init_preallocated(&gainerConfig, ma_offset_ptr(pHeap, heapLayout.gainerOffset), &pEngineNode->volumeGainer);
-        if (result != MA_SUCCESS) {
-            goto error3;
-        }
-    }
-
-
-    return MA_SUCCESS;
-
-    /* No need for allocation callbacks here because we use a preallocated heap. */
-error3: ma_spatializer_uninit(&pEngineNode->spatializer, NULL);
-error2: ma_linear_resampler_uninit(&pEngineNode->resampler, NULL);
-error1: ma_node_uninit(&pEngineNode->baseNode, NULL);
-error0: return result;
-}
-
-MA_API ma_result ma_engine_node_init(const ma_engine_node_config* pConfig, const ma_allocation_callbacks* pAllocationCallbacks, ma_engine_node* pEngineNode)
-{
-    ma_result result;
-    size_t heapSizeInBytes;
-    void* pHeap;
-
-    result = ma_engine_node_get_heap_size(pConfig, &heapSizeInBytes);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (heapSizeInBytes > 0) {
-        pHeap = ma_malloc(heapSizeInBytes, pAllocationCallbacks);
-        if (pHeap == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-    } else {
-        pHeap = NULL;
-    }
-
-    result = ma_engine_node_init_preallocated(pConfig, pHeap, pEngineNode);
-    if (result != MA_SUCCESS) {
-        ma_free(pHeap, pAllocationCallbacks);
-        return result;
-    }
-
-    pEngineNode->_ownsHeap = MA_TRUE;
-    return MA_SUCCESS;
-}
-
-MA_API void ma_engine_node_uninit(ma_engine_node* pEngineNode, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    /*
-    The base node always needs to be uninitialized first to ensure it's detached from the graph completely before we
-    destroy anything that might be in the middle of being used by the processing function.
-    */
-    ma_node_uninit(&pEngineNode->baseNode, pAllocationCallbacks);
-
-    /* Now that the node has been uninitialized we can safely uninitialize the rest. */
-    if (pEngineNode->volumeSmoothTimeInPCMFrames > 0) {
-        ma_gainer_uninit(&pEngineNode->volumeGainer, pAllocationCallbacks);
-    }
-
-    ma_spatializer_uninit(&pEngineNode->spatializer, pAllocationCallbacks);
-    ma_linear_resampler_uninit(&pEngineNode->resampler, pAllocationCallbacks);
-
-    /* Free the heap last. */
-    if (pEngineNode->_ownsHeap) {
-        ma_free(pEngineNode->_pHeap, pAllocationCallbacks);
-    }
-}
-
-
-MA_API ma_sound_config ma_sound_config_init(void)
-{
-    return ma_sound_config_init_2(NULL);
-}
-
-MA_API ma_sound_config ma_sound_config_init_2(ma_engine* pEngine)
-{
-    ma_sound_config config;
-
-    MA_ZERO_OBJECT(&config);
-
-    if (pEngine != NULL) {
-        config.monoExpansionMode = pEngine->monoExpansionMode;
-    } else {
-        config.monoExpansionMode = ma_mono_expansion_mode_default;
-    }
-
-    config.rangeEndInPCMFrames     = ~((ma_uint64)0);
-    config.loopPointEndInPCMFrames = ~((ma_uint64)0);
-
-    return config;
-}
-
-MA_API ma_sound_group_config ma_sound_group_config_init(void)
-{
-    return ma_sound_group_config_init_2(NULL);
-}
-
-MA_API ma_sound_group_config ma_sound_group_config_init_2(ma_engine* pEngine)
-{
-    ma_sound_group_config config;
-
-    MA_ZERO_OBJECT(&config);
-
-    if (pEngine != NULL) {
-        config.monoExpansionMode = pEngine->monoExpansionMode;
-    } else {
-        config.monoExpansionMode = ma_mono_expansion_mode_default;
-    }
-
-    return config;
-}
-
-
-MA_API ma_engine_config ma_engine_config_init(void)
-{
-    ma_engine_config config;
-
-    MA_ZERO_OBJECT(&config);
-    config.listenerCount     = 1;   /* Always want at least one listener. */
-    config.monoExpansionMode = ma_mono_expansion_mode_default;
-
-    return config;
-}
-
-
-#if !defined(MA_NO_DEVICE_IO)
-static void ma_engine_data_callback_internal(ma_device* pDevice, void* pFramesOut, const void* pFramesIn, ma_uint32 frameCount)
-{
-    ma_engine* pEngine = (ma_engine*)pDevice->pUserData;
-
-    (void)pFramesIn;
-
-    /*
-    Experiment: Try processing a resource manager job if we're on the Emscripten build.
-
-    This serves two purposes:
-
-        1) It ensures jobs are actually processed at some point since we cannot guarantee that the
-           caller is doing the right thing and calling ma_resource_manager_process_next_job(); and
-
-        2) It's an attempt at working around an issue where processing jobs on the Emscripten main
-           loop doesn't work as well as it should. When trying to load sounds without the `DECODE`
-           flag or with the `ASYNC` flag, the sound data is just not able to be loaded in time
-           before the callback is processed. I think it's got something to do with the single-
-           threaded nature of Web, but I'm not entirely sure.
-    */
-    #if !defined(MA_NO_RESOURCE_MANAGER) && defined(MA_EMSCRIPTEN)
-    {
-        if (pEngine->pResourceManager != NULL) {
-            if ((pEngine->pResourceManager->config.flags & MA_RESOURCE_MANAGER_FLAG_NO_THREADING) != 0) {
-                ma_resource_manager_process_next_job(pEngine->pResourceManager);
-            }
-        }
-    }
-    #endif
-
-    ma_engine_read_pcm_frames(pEngine, pFramesOut, frameCount, NULL);
-}
-#endif
-
-MA_API ma_result ma_engine_init(const ma_engine_config* pConfig, ma_engine* pEngine)
-{
-    ma_result result;
-    ma_node_graph_config nodeGraphConfig;
-    ma_engine_config engineConfig;
-    ma_spatializer_listener_config listenerConfig;
-    ma_uint32 iListener;
-
-    if (pEngine == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pEngine);
-
-    /* The config is allowed to be NULL in which case we use defaults for everything. */
-    if (pConfig != NULL) {
-        engineConfig = *pConfig;
-    } else {
-        engineConfig = ma_engine_config_init();
-    }
-
-    pEngine->monoExpansionMode = engineConfig.monoExpansionMode;
-    pEngine->defaultVolumeSmoothTimeInPCMFrames = engineConfig.defaultVolumeSmoothTimeInPCMFrames;
-    pEngine->onProcess = engineConfig.onProcess;
-    pEngine->pProcessUserData = engineConfig.pProcessUserData;
-    ma_allocation_callbacks_init_copy(&pEngine->allocationCallbacks, &engineConfig.allocationCallbacks);
-
-    #if !defined(MA_NO_RESOURCE_MANAGER)
-    {
-        pEngine->pResourceManager = engineConfig.pResourceManager;
-    }
-    #endif
-
-    #if !defined(MA_NO_DEVICE_IO)
-    {
-        pEngine->pDevice = engineConfig.pDevice;
-
-        /* If we don't have a device, we need one. */
-        if (pEngine->pDevice == NULL && engineConfig.noDevice == MA_FALSE) {
-            ma_device_config deviceConfig;
-
-            pEngine->pDevice = (ma_device*)ma_malloc(sizeof(*pEngine->pDevice), &pEngine->allocationCallbacks);
-            if (pEngine->pDevice == NULL) {
-                return MA_OUT_OF_MEMORY;
-            }
-
-            deviceConfig = ma_device_config_init(ma_device_type_playback);
-            deviceConfig.playback.pDeviceID        = engineConfig.pPlaybackDeviceID;
-            deviceConfig.playback.format           = ma_format_f32;
-            deviceConfig.playback.channels         = engineConfig.channels;
-            deviceConfig.sampleRate                = engineConfig.sampleRate;
-            deviceConfig.dataCallback              = (engineConfig.dataCallback != NULL) ? engineConfig.dataCallback : ma_engine_data_callback_internal;
-            deviceConfig.pUserData                 = pEngine;
-            deviceConfig.notificationCallback      = engineConfig.notificationCallback;
-            deviceConfig.periodSizeInFrames        = engineConfig.periodSizeInFrames;
-            deviceConfig.periodSizeInMilliseconds  = engineConfig.periodSizeInMilliseconds;
-            deviceConfig.noPreSilencedOutputBuffer = MA_TRUE;    /* We'll always be outputting to every frame in the callback so there's no need for a pre-silenced buffer. */
-            deviceConfig.noClip                    = MA_TRUE;    /* The engine will do clipping itself. */
-
-            if (engineConfig.pContext == NULL) {
-                ma_context_config contextConfig = ma_context_config_init();
-                contextConfig.allocationCallbacks = pEngine->allocationCallbacks;
-                contextConfig.pLog = engineConfig.pLog;
-
-                /* If the engine config does not specify a log, use the resource manager's if we have one. */
-                #ifndef MA_NO_RESOURCE_MANAGER
-                {
-                    if (contextConfig.pLog == NULL && engineConfig.pResourceManager != NULL) {
-                        contextConfig.pLog = ma_resource_manager_get_log(engineConfig.pResourceManager);
-                    }
-                }
-                #endif
-
-                result = ma_device_init_ex(NULL, 0, &contextConfig, &deviceConfig, pEngine->pDevice);
-            } else {
-                result = ma_device_init(engineConfig.pContext, &deviceConfig, pEngine->pDevice);
-            }
-
-            if (result != MA_SUCCESS) {
-                ma_free(pEngine->pDevice, &pEngine->allocationCallbacks);
-                pEngine->pDevice = NULL;
-                return result;
-            }
-
-            pEngine->ownsDevice = MA_TRUE;
-        }
-
-        /* Update the channel count and sample rate of the engine config so we can reference it below. */
-        if (pEngine->pDevice != NULL) {
-            engineConfig.channels   = pEngine->pDevice->playback.channels;
-            engineConfig.sampleRate = pEngine->pDevice->sampleRate;
-        }
-    }
-    #endif
-
-    if (engineConfig.channels == 0 || engineConfig.sampleRate == 0) {
-        return MA_INVALID_ARGS;
-    }
-
-    pEngine->sampleRate = engineConfig.sampleRate;
-
-    /* The engine always uses either the log that was passed into the config, or the context's log is available. */
-    if (engineConfig.pLog != NULL) {
-        pEngine->pLog = engineConfig.pLog;
-    } else {
-        #if !defined(MA_NO_DEVICE_IO)
-        {
-            pEngine->pLog = ma_device_get_log(pEngine->pDevice);
-        }
-        #else
-        {
-            pEngine->pLog = NULL;
-        }
-        #endif
-    }
-
-
-    /* The engine is a node graph. This needs to be initialized after we have the device so we can can determine the channel count. */
-    nodeGraphConfig = ma_node_graph_config_init(engineConfig.channels);
-    nodeGraphConfig.nodeCacheCapInFrames = (engineConfig.periodSizeInFrames > 0xFFFF) ? 0xFFFF : (ma_uint16)engineConfig.periodSizeInFrames;
-
-    result = ma_node_graph_init(&nodeGraphConfig, &pEngine->allocationCallbacks, &pEngine->nodeGraph);
-    if (result != MA_SUCCESS) {
-        goto on_error_1;
-    }
-
-
-    /* We need at least one listener. */
-    if (engineConfig.listenerCount == 0) {
-        engineConfig.listenerCount = 1;
-    }
-
-    if (engineConfig.listenerCount > MA_ENGINE_MAX_LISTENERS) {
-        result = MA_INVALID_ARGS;   /* Too many listeners. */
-        goto on_error_1;
-    }
-
-    for (iListener = 0; iListener < engineConfig.listenerCount; iListener += 1) {
-        listenerConfig = ma_spatializer_listener_config_init(ma_node_graph_get_channels(&pEngine->nodeGraph));
-
-        /*
-        If we're using a device, use the device's channel map for the listener. Otherwise just use
-        miniaudio's default channel map.
-        */
-        #if !defined(MA_NO_DEVICE_IO)
-        {
-            if (pEngine->pDevice != NULL) {
-                /*
-                Temporarily disabled. There is a subtle bug here where front-left and front-right
-                will be used by the device's channel map, but this is not what we want to use for
-                spatialization. Instead we want to use side-left and side-right. I need to figure
-                out a better solution for this. For now, disabling the use of device channel maps.
-                */
-                /*listenerConfig.pChannelMapOut = pEngine->pDevice->playback.channelMap;*/
-            }
-        }
-        #endif
-
-        result = ma_spatializer_listener_init(&listenerConfig, &pEngine->allocationCallbacks, &pEngine->listeners[iListener]);  /* TODO: Change this to a pre-allocated heap. */
-        if (result != MA_SUCCESS) {
-            goto on_error_2;
-        }
-
-        pEngine->listenerCount += 1;
-    }
-
-
-    /* Gain smoothing for spatialized sounds. */
-    pEngine->gainSmoothTimeInFrames = engineConfig.gainSmoothTimeInFrames;
-    if (pEngine->gainSmoothTimeInFrames == 0) {
-        ma_uint32 gainSmoothTimeInMilliseconds = engineConfig.gainSmoothTimeInMilliseconds;
-        if (gainSmoothTimeInMilliseconds == 0) {
-            gainSmoothTimeInMilliseconds = 8;
-        }
-
-        pEngine->gainSmoothTimeInFrames = (gainSmoothTimeInMilliseconds * ma_engine_get_sample_rate(pEngine)) / 1000;  /* 8ms by default. */
-    }
-
-
-    /* We need a resource manager. */
-    #ifndef MA_NO_RESOURCE_MANAGER
-    {
-        if (pEngine->pResourceManager == NULL) {
-            ma_resource_manager_config resourceManagerConfig;
-
-            pEngine->pResourceManager = (ma_resource_manager*)ma_malloc(sizeof(*pEngine->pResourceManager), &pEngine->allocationCallbacks);
-            if (pEngine->pResourceManager == NULL) {
-                result = MA_OUT_OF_MEMORY;
-                goto on_error_2;
-            }
-
-            resourceManagerConfig = ma_resource_manager_config_init();
-            resourceManagerConfig.pLog              = pEngine->pLog;    /* Always use the engine's log for internally-managed resource managers. */
-            resourceManagerConfig.decodedFormat     = ma_format_f32;
-            resourceManagerConfig.decodedChannels   = 0;  /* Leave the decoded channel count as 0 so we can get good spatialization. */
-            resourceManagerConfig.decodedSampleRate = ma_engine_get_sample_rate(pEngine);
-            ma_allocation_callbacks_init_copy(&resourceManagerConfig.allocationCallbacks, &pEngine->allocationCallbacks);
-            resourceManagerConfig.pVFS              = engineConfig.pResourceManagerVFS;
-
-            /* The Emscripten build cannot use threads. */
-            #if defined(MA_EMSCRIPTEN)
-            {
-                resourceManagerConfig.jobThreadCount = 0;
-                resourceManagerConfig.flags |= MA_RESOURCE_MANAGER_FLAG_NO_THREADING;
-            }
-            #endif
-
-            result = ma_resource_manager_init(&resourceManagerConfig, pEngine->pResourceManager);
-            if (result != MA_SUCCESS) {
-                goto on_error_3;
-            }
-
-            pEngine->ownsResourceManager = MA_TRUE;
-        }
-    }
-    #endif
-
-    /* Setup some stuff for inlined sounds. That is sounds played with ma_engine_play_sound(). */
-    pEngine->inlinedSoundLock  = 0;
-    pEngine->pInlinedSoundHead = NULL;
-
-    /* Start the engine if required. This should always be the last step. */
-    #if !defined(MA_NO_DEVICE_IO)
-    {
-        if (engineConfig.noAutoStart == MA_FALSE && pEngine->pDevice != NULL) {
-            result = ma_engine_start(pEngine);
-            if (result != MA_SUCCESS) {
-                goto on_error_4;    /* Failed to start the engine. */
-            }
-        }
-    }
-    #endif
-
-    return MA_SUCCESS;
-
-#if !defined(MA_NO_DEVICE_IO)
-on_error_4:
-#endif
-#if !defined(MA_NO_RESOURCE_MANAGER)
-on_error_3:
-    if (pEngine->ownsResourceManager) {
-        ma_free(pEngine->pResourceManager, &pEngine->allocationCallbacks);
-    }
-#endif  /* MA_NO_RESOURCE_MANAGER */
-on_error_2:
-    for (iListener = 0; iListener < pEngine->listenerCount; iListener += 1) {
-        ma_spatializer_listener_uninit(&pEngine->listeners[iListener], &pEngine->allocationCallbacks);
-    }
-
-    ma_node_graph_uninit(&pEngine->nodeGraph, &pEngine->allocationCallbacks);
-on_error_1:
-    #if !defined(MA_NO_DEVICE_IO)
-    {
-        if (pEngine->ownsDevice) {
-            ma_device_uninit(pEngine->pDevice);
-            ma_free(pEngine->pDevice, &pEngine->allocationCallbacks);
-        }
-    }
-    #endif
-
-    return result;
-}
-
-MA_API void ma_engine_uninit(ma_engine* pEngine)
-{
-    ma_uint32 iListener;
-
-    if (pEngine == NULL) {
-        return;
-    }
-
-    /* The device must be uninitialized before the node graph to ensure the audio thread doesn't try accessing it. */
-    #if !defined(MA_NO_DEVICE_IO)
-    {
-        if (pEngine->ownsDevice) {
-            ma_device_uninit(pEngine->pDevice);
-            ma_free(pEngine->pDevice, &pEngine->allocationCallbacks);
-        } else {
-            if (pEngine->pDevice != NULL) {
-                ma_device_stop(pEngine->pDevice);
-            }
-        }
-    }
-    #endif
-
-    /*
-    All inlined sounds need to be deleted. I'm going to use a lock here just to future proof in case
-    I want to do some kind of garbage collection later on.
-    */
-    ma_spinlock_lock(&pEngine->inlinedSoundLock);
-    {
-        for (;;) {
-            ma_sound_inlined* pSoundToDelete = pEngine->pInlinedSoundHead;
-            if (pSoundToDelete == NULL) {
-                break;  /* Done. */
-            }
-
-            pEngine->pInlinedSoundHead = pSoundToDelete->pNext;
-
-            ma_sound_uninit(&pSoundToDelete->sound);
-            ma_free(pSoundToDelete, &pEngine->allocationCallbacks);
-        }
-    }
-    ma_spinlock_unlock(&pEngine->inlinedSoundLock);
-
-    for (iListener = 0; iListener < pEngine->listenerCount; iListener += 1) {
-        ma_spatializer_listener_uninit(&pEngine->listeners[iListener], &pEngine->allocationCallbacks);
-    }
-
-    /* Make sure the node graph is uninitialized after the audio thread has been shutdown to prevent accessing of the node graph after being uninitialized. */
-    ma_node_graph_uninit(&pEngine->nodeGraph, &pEngine->allocationCallbacks);
-
-    /* Uninitialize the resource manager last to ensure we don't have a thread still trying to access it. */
-#ifndef MA_NO_RESOURCE_MANAGER
-    if (pEngine->ownsResourceManager) {
-        ma_resource_manager_uninit(pEngine->pResourceManager);
-        ma_free(pEngine->pResourceManager, &pEngine->allocationCallbacks);
-    }
-#endif
-}
-
-MA_API ma_result ma_engine_read_pcm_frames(ma_engine* pEngine, void* pFramesOut, ma_uint64 frameCount, ma_uint64* pFramesRead)
-{
-    ma_result result;
-    ma_uint64 framesRead = 0;
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = 0;
-    }
-
-    result = ma_node_graph_read_pcm_frames(&pEngine->nodeGraph, pFramesOut, frameCount, &framesRead);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pFramesRead != NULL) {
-        *pFramesRead = framesRead;
-    }
-
-    if (pEngine->onProcess) {
-        pEngine->onProcess(pEngine->pProcessUserData, (float*)pFramesOut, framesRead);  /* Safe cast to float* because the engine always works on floating point samples. */
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_node_graph* ma_engine_get_node_graph(ma_engine* pEngine)
-{
-    if (pEngine == NULL) {
-        return NULL;
-    }
-
-    return &pEngine->nodeGraph;
-}
-
-#if !defined(MA_NO_RESOURCE_MANAGER)
-MA_API ma_resource_manager* ma_engine_get_resource_manager(ma_engine* pEngine)
-{
-    if (pEngine == NULL) {
-        return NULL;
-    }
-
-    #if !defined(MA_NO_RESOURCE_MANAGER)
-    {
-        return pEngine->pResourceManager;
-    }
-    #else
-    {
-        return NULL;
-    }
-    #endif
-}
-#endif
-
-MA_API ma_device* ma_engine_get_device(ma_engine* pEngine)
-{
-    if (pEngine == NULL) {
-        return NULL;
-    }
-
-    #if !defined(MA_NO_DEVICE_IO)
-    {
-        return pEngine->pDevice;
-    }
-    #else
-    {
-        return NULL;
-    }
-    #endif
-}
-
-MA_API ma_log* ma_engine_get_log(ma_engine* pEngine)
-{
-    if (pEngine == NULL) {
-        return NULL;
-    }
-
-    if (pEngine->pLog != NULL) {
-        return pEngine->pLog;
-    } else {
-        #if !defined(MA_NO_DEVICE_IO)
-        {
-            return ma_device_get_log(ma_engine_get_device(pEngine));
-        }
-        #else
-        {
-            return NULL;
-        }
-        #endif
-    }
-}
-
-MA_API ma_node* ma_engine_get_endpoint(ma_engine* pEngine)
-{
-    return ma_node_graph_get_endpoint(&pEngine->nodeGraph);
-}
-
-MA_API ma_uint64 ma_engine_get_time_in_pcm_frames(const ma_engine* pEngine)
-{
-    return ma_node_graph_get_time(&pEngine->nodeGraph);
-}
-
-MA_API ma_uint64 ma_engine_get_time_in_milliseconds(const ma_engine* pEngine)
-{
-    return ma_engine_get_time_in_pcm_frames(pEngine) * 1000 / ma_engine_get_sample_rate(pEngine);
-}
-
-MA_API ma_result ma_engine_set_time_in_pcm_frames(ma_engine* pEngine, ma_uint64 globalTime)
-{
-    return ma_node_graph_set_time(&pEngine->nodeGraph, globalTime);
-}
-
-MA_API ma_result ma_engine_set_time_in_milliseconds(ma_engine* pEngine, ma_uint64 globalTime)
-{
-    return ma_engine_set_time_in_pcm_frames(pEngine, globalTime * ma_engine_get_sample_rate(pEngine) / 1000);
-}
-
-MA_API ma_uint64 ma_engine_get_time(const ma_engine* pEngine)
-{
-    return ma_engine_get_time_in_pcm_frames(pEngine);
-}
-
-MA_API ma_result ma_engine_set_time(ma_engine* pEngine, ma_uint64 globalTime)
-{
-    return ma_engine_set_time_in_pcm_frames(pEngine, globalTime);
-}
-
-MA_API ma_uint32 ma_engine_get_channels(const ma_engine* pEngine)
-{
-    return ma_node_graph_get_channels(&pEngine->nodeGraph);
-}
-
-MA_API ma_uint32 ma_engine_get_sample_rate(const ma_engine* pEngine)
-{
-    if (pEngine == NULL) {
-        return 0;
-    }
-
-    return pEngine->sampleRate;
-}
-
-
-MA_API ma_result ma_engine_start(ma_engine* pEngine)
-{
-    ma_result result;
-
-    if (pEngine == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_DEVICE_IO)
-    {
-        if (pEngine->pDevice != NULL) {
-            result = ma_device_start(pEngine->pDevice);
-        } else {
-            result = MA_INVALID_OPERATION;  /* The engine is running without a device which means there's no real notion of "starting" the engine. */
-        }
-    }
-    #else
-    {
-        result = MA_INVALID_OPERATION;  /* Device IO is disabled, so there's no real notion of "starting" the engine. */
-    }
-    #endif
-
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_engine_stop(ma_engine* pEngine)
-{
-    ma_result result;
-
-    if (pEngine == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    #if !defined(MA_NO_DEVICE_IO)
-    {
-        if (pEngine->pDevice != NULL) {
-            result = ma_device_stop(pEngine->pDevice);
-        } else {
-            result = MA_INVALID_OPERATION;  /* The engine is running without a device which means there's no real notion of "stopping" the engine. */
-        }
-    }
-    #else
-    {
-        result = MA_INVALID_OPERATION;  /* Device IO is disabled, so there's no real notion of "stopping" the engine. */
-    }
-    #endif
-
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_engine_set_volume(ma_engine* pEngine, float volume)
-{
-    if (pEngine == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_node_set_output_bus_volume(ma_node_graph_get_endpoint(&pEngine->nodeGraph), 0, volume);
-}
-
-MA_API float ma_engine_get_volume(ma_engine* pEngine)
-{
-    if (pEngine == NULL) {
-        return 0;
-    }
-
-    return ma_node_get_output_bus_volume(ma_node_graph_get_endpoint(&pEngine->nodeGraph), 0);
-}
-
-MA_API ma_result ma_engine_set_gain_db(ma_engine* pEngine, float gainDB)
-{
-    return ma_engine_set_volume(pEngine, ma_volume_db_to_linear(gainDB));
-}
-
-MA_API float ma_engine_get_gain_db(ma_engine* pEngine)
-{
-    return ma_volume_linear_to_db(ma_engine_get_volume(pEngine));
-}
-
-
-MA_API ma_uint32 ma_engine_get_listener_count(const ma_engine* pEngine)
-{
-    if (pEngine == NULL) {
-        return 0;
-    }
-
-    return pEngine->listenerCount;
-}
-
-MA_API ma_uint32 ma_engine_find_closest_listener(const ma_engine* pEngine, float absolutePosX, float absolutePosY, float absolutePosZ)
-{
-    ma_uint32 iListener;
-    ma_uint32 iListenerClosest;
-    float closestLen2 = MA_FLT_MAX;
-
-    if (pEngine == NULL || pEngine->listenerCount == 1) {
-        return 0;
-    }
-
-    iListenerClosest = 0;
-    for (iListener = 0; iListener < pEngine->listenerCount; iListener += 1) {
-        if (ma_engine_listener_is_enabled(pEngine, iListener)) {
-            float len2 = ma_vec3f_len2(ma_vec3f_sub(ma_spatializer_listener_get_position(&pEngine->listeners[iListener]), ma_vec3f_init_3f(absolutePosX, absolutePosY, absolutePosZ)));
-            if (closestLen2 > len2) {
-                closestLen2 = len2;
-                iListenerClosest = iListener;
-            }
-        }
-    }
-
-    MA_ASSERT(iListenerClosest < 255);
-    return iListenerClosest;
-}
-
-MA_API void ma_engine_listener_set_position(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return;
-    }
-
-    ma_spatializer_listener_set_position(&pEngine->listeners[listenerIndex], x, y, z);
-}
-
-MA_API ma_vec3f ma_engine_listener_get_position(const ma_engine* pEngine, ma_uint32 listenerIndex)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_spatializer_listener_get_position(&pEngine->listeners[listenerIndex]);
-}
-
-MA_API void ma_engine_listener_set_direction(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return;
-    }
-
-    ma_spatializer_listener_set_direction(&pEngine->listeners[listenerIndex], x, y, z);
-}
-
-MA_API ma_vec3f ma_engine_listener_get_direction(const ma_engine* pEngine, ma_uint32 listenerIndex)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return ma_vec3f_init_3f(0, 0, -1);
-    }
-
-    return ma_spatializer_listener_get_direction(&pEngine->listeners[listenerIndex]);
-}
-
-MA_API void ma_engine_listener_set_velocity(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return;
-    }
-
-    ma_spatializer_listener_set_velocity(&pEngine->listeners[listenerIndex], x, y, z);
-}
-
-MA_API ma_vec3f ma_engine_listener_get_velocity(const ma_engine* pEngine, ma_uint32 listenerIndex)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_spatializer_listener_get_velocity(&pEngine->listeners[listenerIndex]);
-}
-
-MA_API void ma_engine_listener_set_cone(ma_engine* pEngine, ma_uint32 listenerIndex, float innerAngleInRadians, float outerAngleInRadians, float outerGain)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return;
-    }
-
-    ma_spatializer_listener_set_cone(&pEngine->listeners[listenerIndex], innerAngleInRadians, outerAngleInRadians, outerGain);
-}
-
-MA_API void ma_engine_listener_get_cone(const ma_engine* pEngine, ma_uint32 listenerIndex, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain)
-{
-    if (pInnerAngleInRadians != NULL) {
-        *pInnerAngleInRadians = 0;
-    }
-
-    if (pOuterAngleInRadians != NULL) {
-        *pOuterAngleInRadians = 0;
-    }
-
-    if (pOuterGain != NULL) {
-        *pOuterGain = 0;
-    }
-
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return;
-    }
-
-    ma_spatializer_listener_get_cone(&pEngine->listeners[listenerIndex], pInnerAngleInRadians, pOuterAngleInRadians, pOuterGain);
-}
-
-MA_API void ma_engine_listener_set_world_up(ma_engine* pEngine, ma_uint32 listenerIndex, float x, float y, float z)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return;
-    }
-
-    ma_spatializer_listener_set_world_up(&pEngine->listeners[listenerIndex], x, y, z);
-}
-
-MA_API ma_vec3f ma_engine_listener_get_world_up(const ma_engine* pEngine, ma_uint32 listenerIndex)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return ma_vec3f_init_3f(0, 1, 0);
-    }
-
-    return ma_spatializer_listener_get_world_up(&pEngine->listeners[listenerIndex]);
-}
-
-MA_API void ma_engine_listener_set_enabled(ma_engine* pEngine, ma_uint32 listenerIndex, ma_bool32 isEnabled)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return;
-    }
-
-    ma_spatializer_listener_set_enabled(&pEngine->listeners[listenerIndex], isEnabled);
-}
-
-MA_API ma_bool32 ma_engine_listener_is_enabled(const ma_engine* pEngine, ma_uint32 listenerIndex)
-{
-    if (pEngine == NULL || listenerIndex >= pEngine->listenerCount) {
-        return MA_FALSE;
-    }
-
-    return ma_spatializer_listener_is_enabled(&pEngine->listeners[listenerIndex]);
-}
-
-
-#ifndef MA_NO_RESOURCE_MANAGER
-MA_API ma_result ma_engine_play_sound_ex(ma_engine* pEngine, const char* pFilePath, ma_node* pNode, ma_uint32 nodeInputBusIndex)
-{
-    ma_result result = MA_SUCCESS;
-    ma_sound_inlined* pSound = NULL;
-    ma_sound_inlined* pNextSound = NULL;
-
-    if (pEngine == NULL || pFilePath == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Attach to the endpoint node if nothing is specicied. */
-    if (pNode == NULL) {
-        pNode = ma_node_graph_get_endpoint(&pEngine->nodeGraph);
-        nodeInputBusIndex = 0;
-    }
-
-    /*
-    We want to check if we can recycle an already-allocated inlined sound. Since this is just a
-    helper I'm not *too* concerned about performance here and I'm happy to use a lock to keep
-    the implementation simple. Maybe this can be optimized later if there's enough demand, but
-    if this function is being used it probably means the caller doesn't really care too much.
-
-    What we do is check the atEnd flag. When this is true, we can recycle the sound. Otherwise
-    we just keep iterating. If we reach the end without finding a sound to recycle we just
-    allocate a new one. This doesn't scale well for a massive number of sounds being played
-    simultaneously as we don't ever actually free the sound objects. Some kind of garbage
-    collection routine might be valuable for this which I'll think about.
-    */
-    ma_spinlock_lock(&pEngine->inlinedSoundLock);
-    {
-        ma_uint32 soundFlags = 0;
-
-        for (pNextSound = pEngine->pInlinedSoundHead; pNextSound != NULL; pNextSound = pNextSound->pNext) {
-            if (ma_sound_at_end(&pNextSound->sound)) {
-                /*
-                The sound is at the end which means it's available for recycling. All we need to do
-                is uninitialize it and reinitialize it. All we're doing is recycling memory.
-                */
-                pSound = pNextSound;
-                ma_atomic_fetch_sub_32(&pEngine->inlinedSoundCount, 1);
-                break;
-            }
-        }
-
-        if (pSound != NULL) {
-            /*
-            We actually want to detach the sound from the list here. The reason is because we want the sound
-            to be in a consistent state at the non-recycled case to simplify the logic below.
-            */
-            if (pEngine->pInlinedSoundHead == pSound) {
-                pEngine->pInlinedSoundHead =  pSound->pNext;
-            }
-
-            if (pSound->pPrev != NULL) {
-                pSound->pPrev->pNext = pSound->pNext;
-            }
-            if (pSound->pNext != NULL) {
-                pSound->pNext->pPrev = pSound->pPrev;
-            }
-
-            /* Now the previous sound needs to be uninitialized. */
-            ma_sound_uninit(&pNextSound->sound);
-        } else {
-            /* No sound available for recycling. Allocate one now. */
-            pSound = (ma_sound_inlined*)ma_malloc(sizeof(*pSound), &pEngine->allocationCallbacks);
-        }
-
-        if (pSound != NULL) {   /* Safety check for the allocation above. */
-            /*
-            At this point we should have memory allocated for the inlined sound. We just need
-            to initialize it like a normal sound now.
-            */
-            soundFlags |= MA_SOUND_FLAG_ASYNC;                 /* For inlined sounds we don't want to be sitting around waiting for stuff to load so force an async load. */
-            soundFlags |= MA_SOUND_FLAG_NO_DEFAULT_ATTACHMENT; /* We want specific control over where the sound is attached in the graph. We'll attach it manually just before playing the sound. */
-            soundFlags |= MA_SOUND_FLAG_NO_PITCH;              /* Pitching isn't usable with inlined sounds, so disable it to save on speed. */
-            soundFlags |= MA_SOUND_FLAG_NO_SPATIALIZATION;     /* Not currently doing spatialization with inlined sounds, but this might actually change later. For now disable spatialization. Will be removed if we ever add support for spatialization here. */
-
-            result = ma_sound_init_from_file(pEngine, pFilePath, soundFlags, NULL, NULL, &pSound->sound);
-            if (result == MA_SUCCESS) {
-                /* Now attach the sound to the graph. */
-                result = ma_node_attach_output_bus(pSound, 0, pNode, nodeInputBusIndex);
-                if (result == MA_SUCCESS) {
-                    /* At this point the sound should be loaded and we can go ahead and add it to the list. The new item becomes the new head. */
-                    pSound->pNext = pEngine->pInlinedSoundHead;
-                    pSound->pPrev = NULL;
-
-                    pEngine->pInlinedSoundHead = pSound;    /* <-- This is what attaches the sound to the list. */
-                    if (pSound->pNext != NULL) {
-                        pSound->pNext->pPrev = pSound;
-                    }
-                } else {
-                    ma_free(pSound, &pEngine->allocationCallbacks);
-                }
-            } else {
-                ma_free(pSound, &pEngine->allocationCallbacks);
-            }
-        } else {
-            result = MA_OUT_OF_MEMORY;
-        }
-    }
-    ma_spinlock_unlock(&pEngine->inlinedSoundLock);
-
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* Finally we can start playing the sound. */
-    result = ma_sound_start(&pSound->sound);
-    if (result != MA_SUCCESS) {
-        /* Failed to start the sound. We need to mark it for recycling and return an error. */
-        ma_atomic_exchange_32(&pSound->sound.atEnd, MA_TRUE);
-        return result;
-    }
-
-    ma_atomic_fetch_add_32(&pEngine->inlinedSoundCount, 1);
-    return result;
-}
-
-MA_API ma_result ma_engine_play_sound(ma_engine* pEngine, const char* pFilePath, ma_sound_group* pGroup)
-{
-    return ma_engine_play_sound_ex(pEngine, pFilePath, pGroup, 0);
-}
-#endif
-
-
-static ma_result ma_sound_preinit(ma_engine* pEngine, ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pSound);
-    pSound->seekTarget = MA_SEEK_TARGET_NONE;
-
-    if (pEngine == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return MA_SUCCESS;
-}
-
-static ma_result ma_sound_init_from_data_source_internal(ma_engine* pEngine, const ma_sound_config* pConfig, ma_sound* pSound)
-{
-    ma_result result;
-    ma_engine_node_config engineNodeConfig;
-    ma_engine_node_type type;   /* Will be set to ma_engine_node_type_group if no data source is specified. */
-
-    /* Do not clear pSound to zero here - that's done at a higher level with ma_sound_preinit(). */
-    MA_ASSERT(pEngine != NULL);
-    MA_ASSERT(pSound  != NULL);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pSound->pDataSource = pConfig->pDataSource;
-
-    if (pConfig->pDataSource != NULL) {
-        type = ma_engine_node_type_sound;
-    } else {
-        type = ma_engine_node_type_group;
-    }
-
-    /*
-    Sounds are engine nodes. Before we can initialize this we need to determine the channel count.
-    If we can't do this we need to abort. It's up to the caller to ensure they're using a data
-    source that provides this information upfront.
-    */
-    engineNodeConfig = ma_engine_node_config_init(pEngine, type, pConfig->flags);
-    engineNodeConfig.channelsIn                  = pConfig->channelsIn;
-    engineNodeConfig.channelsOut                 = pConfig->channelsOut;
-    engineNodeConfig.volumeSmoothTimeInPCMFrames = pConfig->volumeSmoothTimeInPCMFrames;
-    engineNodeConfig.monoExpansionMode           = pConfig->monoExpansionMode;
-
-    if (engineNodeConfig.volumeSmoothTimeInPCMFrames == 0) {
-        engineNodeConfig.volumeSmoothTimeInPCMFrames = pEngine->defaultVolumeSmoothTimeInPCMFrames;
-    }
-
-    /* If we're loading from a data source the input channel count needs to be the data source's native channel count. */
-    if (pConfig->pDataSource != NULL) {
-        result = ma_data_source_get_data_format(pConfig->pDataSource, NULL, &engineNodeConfig.channelsIn, &engineNodeConfig.sampleRate, NULL, 0);
-        if (result != MA_SUCCESS) {
-            return result;  /* Failed to retrieve the channel count. */
-        }
-
-        if (engineNodeConfig.channelsIn == 0) {
-            return MA_INVALID_OPERATION;    /* Invalid channel count. */
-        }
-
-        if (engineNodeConfig.channelsOut == MA_SOUND_SOURCE_CHANNEL_COUNT) {
-            engineNodeConfig.channelsOut = engineNodeConfig.channelsIn;
-        }
-    }
-
-
-    /* Getting here means we should have a valid channel count and we can initialize the engine node. */
-    result = ma_engine_node_init(&engineNodeConfig, &pEngine->allocationCallbacks, &pSound->engineNode);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* If no attachment is specified, attach the sound straight to the endpoint. */
-    if (pConfig->pInitialAttachment == NULL) {
-        /* No group. Attach straight to the endpoint by default, unless the caller has requested that it not. */
-        if ((pConfig->flags & MA_SOUND_FLAG_NO_DEFAULT_ATTACHMENT) == 0) {
-            result = ma_node_attach_output_bus(pSound, 0, ma_node_graph_get_endpoint(&pEngine->nodeGraph), 0);
-        }
-    } else {
-        /* An attachment is specified. Attach to it by default. The sound has only a single output bus, and the config will specify which input bus to attach to. */
-        result = ma_node_attach_output_bus(pSound, 0, pConfig->pInitialAttachment, pConfig->initialAttachmentInputBusIndex);
-    }
-
-    if (result != MA_SUCCESS) {
-        ma_engine_node_uninit(&pSound->engineNode, &pEngine->allocationCallbacks);
-        return result;
-    }
-
-
-    /* Apply initial range and looping state to the data source if applicable. */
-    if (pConfig->rangeBegInPCMFrames != 0 || pConfig->rangeEndInPCMFrames != ~((ma_uint64)0)) {
-        ma_data_source_set_range_in_pcm_frames(ma_sound_get_data_source(pSound), pConfig->rangeBegInPCMFrames, pConfig->rangeEndInPCMFrames);
-    }
-
-    if (pConfig->loopPointBegInPCMFrames != 0 || pConfig->loopPointEndInPCMFrames != ~((ma_uint64)0)) {
-        ma_data_source_set_range_in_pcm_frames(ma_sound_get_data_source(pSound), pConfig->loopPointBegInPCMFrames, pConfig->loopPointEndInPCMFrames);
-    }
-
-    ma_sound_set_looping(pSound, pConfig->isLooping);
-
-    return MA_SUCCESS;
-}
-
-#ifndef MA_NO_RESOURCE_MANAGER
-MA_API ma_result ma_sound_init_from_file_internal(ma_engine* pEngine, const ma_sound_config* pConfig, ma_sound* pSound)
-{
-    ma_result result = MA_SUCCESS;
-    ma_uint32 flags;
-    ma_sound_config config;
-    ma_resource_manager_pipeline_notifications notifications;
-
-    /*
-    The engine requires knowledge of the channel count of the underlying data source before it can
-    initialize the sound. Therefore, we need to make the resource manager wait until initialization
-    of the underlying data source to be initialized so we can get access to the channel count. To
-    do this, the MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT is forced.
-
-    Because we're initializing the data source before the sound, there's a chance the notification
-    will get triggered before this function returns. This is OK, so long as the caller is aware of
-    it and can avoid accessing the sound from within the notification.
-    */
-    flags = pConfig->flags | MA_RESOURCE_MANAGER_DATA_SOURCE_FLAG_WAIT_INIT;
-
-    pSound->pResourceManagerDataSource = (ma_resource_manager_data_source*)ma_malloc(sizeof(*pSound->pResourceManagerDataSource), &pEngine->allocationCallbacks);
-    if (pSound->pResourceManagerDataSource == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    /* Removed in 0.12. Set pDoneFence on the notifications. */
-    notifications = pConfig->initNotifications;
-    if (pConfig->pDoneFence != NULL && notifications.done.pFence == NULL) {
-        notifications.done.pFence = pConfig->pDoneFence;
-    }
-
-    /*
-    We must wrap everything around the fence if one was specified. This ensures ma_fence_wait() does
-    not return prematurely before the sound has finished initializing.
-    */
-    if (notifications.done.pFence) { ma_fence_acquire(notifications.done.pFence); }
-    {
-        ma_resource_manager_data_source_config resourceManagerDataSourceConfig = ma_resource_manager_data_source_config_init();
-        resourceManagerDataSourceConfig.pFilePath                   = pConfig->pFilePath;
-        resourceManagerDataSourceConfig.pFilePathW                  = pConfig->pFilePathW;
-        resourceManagerDataSourceConfig.flags                       = flags;
-        resourceManagerDataSourceConfig.pNotifications              = &notifications;
-        resourceManagerDataSourceConfig.initialSeekPointInPCMFrames = pConfig->initialSeekPointInPCMFrames;
-        resourceManagerDataSourceConfig.rangeBegInPCMFrames         = pConfig->rangeBegInPCMFrames;
-        resourceManagerDataSourceConfig.rangeEndInPCMFrames         = pConfig->rangeEndInPCMFrames;
-        resourceManagerDataSourceConfig.loopPointBegInPCMFrames     = pConfig->loopPointBegInPCMFrames;
-        resourceManagerDataSourceConfig.loopPointEndInPCMFrames     = pConfig->loopPointEndInPCMFrames;
-        resourceManagerDataSourceConfig.isLooping                   = pConfig->isLooping;
-
-        result = ma_resource_manager_data_source_init_ex(pEngine->pResourceManager, &resourceManagerDataSourceConfig, pSound->pResourceManagerDataSource);
-        if (result != MA_SUCCESS) {
-            goto done;
-        }
-
-        pSound->ownsDataSource = MA_TRUE;   /* <-- Important. Not setting this will result in the resource manager data source never getting uninitialized. */
-
-        /* We need to use a slightly customized version of the config so we'll need to make a copy. */
-        config = *pConfig;
-        config.pFilePath   = NULL;
-        config.pFilePathW  = NULL;
-        config.pDataSource = pSound->pResourceManagerDataSource;
-
-        result = ma_sound_init_from_data_source_internal(pEngine, &config, pSound);
-        if (result != MA_SUCCESS) {
-            ma_resource_manager_data_source_uninit(pSound->pResourceManagerDataSource);
-            ma_free(pSound->pResourceManagerDataSource, &pEngine->allocationCallbacks);
-            MA_ZERO_OBJECT(pSound);
-            goto done;
-        }
-    }
-done:
-    if (notifications.done.pFence) { ma_fence_release(notifications.done.pFence); }
-    return result;
-}
-
-MA_API ma_result ma_sound_init_from_file(ma_engine* pEngine, const char* pFilePath, ma_uint32 flags, ma_sound_group* pGroup, ma_fence* pDoneFence, ma_sound* pSound)
-{
-    ma_sound_config config;
-
-    if (pFilePath == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    config = ma_sound_config_init_2(pEngine);
-    config.pFilePath          = pFilePath;
-    config.flags              = flags;
-    config.pInitialAttachment = pGroup;
-    config.pDoneFence         = pDoneFence;
-
-    return ma_sound_init_ex(pEngine, &config, pSound);
-}
-
-MA_API ma_result ma_sound_init_from_file_w(ma_engine* pEngine, const wchar_t* pFilePath, ma_uint32 flags, ma_sound_group* pGroup, ma_fence* pDoneFence, ma_sound* pSound)
-{
-    ma_sound_config config;
-
-    if (pFilePath == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    config = ma_sound_config_init_2(pEngine);
-    config.pFilePathW         = pFilePath;
-    config.flags              = flags;
-    config.pInitialAttachment = pGroup;
-    config.pDoneFence         = pDoneFence;
-
-    return ma_sound_init_ex(pEngine, &config, pSound);
-}
-
-MA_API ma_result ma_sound_init_copy(ma_engine* pEngine, const ma_sound* pExistingSound, ma_uint32 flags, ma_sound_group* pGroup, ma_sound* pSound)
-{
-    ma_result result;
-    ma_sound_config config;
-
-    result = ma_sound_preinit(pEngine, pSound);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pExistingSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Cloning only works for data buffers (not streams) that are loaded from the resource manager. */
-    if (pExistingSound->pResourceManagerDataSource == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /*
-    We need to make a clone of the data source. If the data source is not a data buffer (i.e. a stream)
-    this will fail.
-    */
-    pSound->pResourceManagerDataSource = (ma_resource_manager_data_source*)ma_malloc(sizeof(*pSound->pResourceManagerDataSource), &pEngine->allocationCallbacks);
-    if (pSound->pResourceManagerDataSource == NULL) {
-        return MA_OUT_OF_MEMORY;
-    }
-
-    result = ma_resource_manager_data_source_init_copy(pEngine->pResourceManager, pExistingSound->pResourceManagerDataSource, pSound->pResourceManagerDataSource);
-    if (result != MA_SUCCESS) {
-        ma_free(pSound->pResourceManagerDataSource, &pEngine->allocationCallbacks);
-        return result;
-    }
-
-    config = ma_sound_config_init_2(pEngine);
-    config.pDataSource                 = pSound->pResourceManagerDataSource;
-    config.flags                       = flags;
-    config.pInitialAttachment          = pGroup;
-    config.monoExpansionMode           = pExistingSound->engineNode.monoExpansionMode;
-    config.volumeSmoothTimeInPCMFrames = pExistingSound->engineNode.volumeSmoothTimeInPCMFrames;
-
-    result = ma_sound_init_from_data_source_internal(pEngine, &config, pSound);
-    if (result != MA_SUCCESS) {
-        ma_resource_manager_data_source_uninit(pSound->pResourceManagerDataSource);
-        ma_free(pSound->pResourceManagerDataSource, &pEngine->allocationCallbacks);
-        MA_ZERO_OBJECT(pSound);
-        return result;
-    }
-
-    /* Make sure the sound is marked as the owner of the data source or else it will never get uninitialized. */
-    pSound->ownsDataSource = MA_TRUE;
-
-    return MA_SUCCESS;
-}
-#endif
-
-MA_API ma_result ma_sound_init_from_data_source(ma_engine* pEngine, ma_data_source* pDataSource, ma_uint32 flags, ma_sound_group* pGroup, ma_sound* pSound)
-{
-    ma_sound_config config = ma_sound_config_init_2(pEngine);
-    config.pDataSource        = pDataSource;
-    config.flags              = flags;
-    config.pInitialAttachment = pGroup;
-    return ma_sound_init_ex(pEngine, &config, pSound);
-}
-
-MA_API ma_result ma_sound_init_ex(ma_engine* pEngine, const ma_sound_config* pConfig, ma_sound* pSound)
-{
-    ma_result result;
-
-    result = ma_sound_preinit(pEngine, pSound);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    pSound->endCallback          = pConfig->endCallback;
-    pSound->pEndCallbackUserData = pConfig->pEndCallbackUserData;
-
-    /* We need to load the sound differently depending on whether or not we're loading from a file. */
-#ifndef MA_NO_RESOURCE_MANAGER
-    if (pConfig->pFilePath != NULL || pConfig->pFilePathW != NULL) {
-        return ma_sound_init_from_file_internal(pEngine, pConfig, pSound);
-    } else
-#endif
-    {
-        /*
-        Getting here means we're not loading from a file. We may be loading from an already-initialized
-        data source, or none at all. If we aren't specifying any data source, we'll be initializing the
-        the equivalent to a group. ma_data_source_init_from_data_source_internal() will deal with this
-        for us, so no special treatment required here.
-        */
-        return ma_sound_init_from_data_source_internal(pEngine, pConfig, pSound);
-    }
-}
-
-MA_API void ma_sound_uninit(ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    /*
-    Always uninitialize the node first. This ensures it's detached from the graph and does not return until it has done
-    so which makes thread safety beyond this point trivial.
-    */
-    ma_engine_node_uninit(&pSound->engineNode, &pSound->engineNode.pEngine->allocationCallbacks);
-
-    /* Once the sound is detached from the group we can guarantee that it won't be referenced by the mixer thread which means it's safe for us to destroy the data source. */
-#ifndef MA_NO_RESOURCE_MANAGER
-    if (pSound->ownsDataSource) {
-        ma_resource_manager_data_source_uninit(pSound->pResourceManagerDataSource);
-        ma_free(pSound->pResourceManagerDataSource, &pSound->engineNode.pEngine->allocationCallbacks);
-        pSound->pDataSource = NULL;
-    }
-#else
-    MA_ASSERT(pSound->ownsDataSource == MA_FALSE);
-#endif
-}
-
-MA_API ma_engine* ma_sound_get_engine(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return NULL;
-    }
-
-    return pSound->engineNode.pEngine;
-}
-
-MA_API ma_data_source* ma_sound_get_data_source(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return NULL;
-    }
-
-    return pSound->pDataSource;
-}
-
-MA_API ma_result ma_sound_start(ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* If the sound is already playing, do nothing. */
-    if (ma_sound_is_playing(pSound)) {
-        return MA_SUCCESS;
-    }
-
-    /* If the sound is at the end it means we want to start from the start again. */
-    if (ma_sound_at_end(pSound)) {
-        ma_result result = ma_data_source_seek_to_pcm_frame(pSound->pDataSource, 0);
-        if (result != MA_SUCCESS && result != MA_NOT_IMPLEMENTED) {
-            return result;  /* Failed to seek back to the start. */
-        }
-
-        /* Make sure we clear the end indicator. */
-        ma_atomic_exchange_32(&pSound->atEnd, MA_FALSE);
-    }
-
-    /* Make sure the sound is started. If there's a start delay, the sound won't actually start until the start time is reached. */
-    ma_node_set_state(pSound, ma_node_state_started);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_sound_stop(ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* This will stop the sound immediately. Use ma_sound_set_stop_time() to stop the sound at a specific time. */
-    ma_node_set_state(pSound, ma_node_state_stopped);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_sound_stop_with_fade_in_pcm_frames(ma_sound* pSound, ma_uint64 fadeLengthInFrames)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Stopping with a fade out requires us to schedule the stop into the future by the fade length. */
-    ma_sound_set_stop_time_with_fade_in_pcm_frames(pSound, ma_engine_get_time_in_pcm_frames(ma_sound_get_engine(pSound)) + fadeLengthInFrames, fadeLengthInFrames);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_sound_stop_with_fade_in_milliseconds(ma_sound* pSound, ma_uint64 fadeLengthInMilliseconds)
-{
-    ma_uint64 sampleRate;
-
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    sampleRate = ma_engine_get_sample_rate(ma_sound_get_engine(pSound));
-
-    return ma_sound_stop_with_fade_in_pcm_frames(pSound, (fadeLengthInMilliseconds * sampleRate) / 1000);
-}
-
-MA_API void ma_sound_set_volume(ma_sound* pSound, float volume)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_engine_node_set_volume(&pSound->engineNode, volume);
-}
-
-MA_API float ma_sound_get_volume(const ma_sound* pSound)
-{
-    float volume = 0;
-
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    ma_engine_node_get_volume(&pSound->engineNode, &volume);
-
-    return volume;
-}
-
-MA_API void ma_sound_set_pan(ma_sound* pSound, float pan)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_panner_set_pan(&pSound->engineNode.panner, pan);
-}
-
-MA_API float ma_sound_get_pan(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_panner_get_pan(&pSound->engineNode.panner);
-}
-
-MA_API void ma_sound_set_pan_mode(ma_sound* pSound, ma_pan_mode panMode)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_panner_set_mode(&pSound->engineNode.panner, panMode);
-}
-
-MA_API ma_pan_mode ma_sound_get_pan_mode(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return ma_pan_mode_balance;
-    }
-
-    return ma_panner_get_mode(&pSound->engineNode.panner);
-}
-
-MA_API void ma_sound_set_pitch(ma_sound* pSound, float pitch)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    if (pitch <= 0) {
-        return;
-    }
-
-    ma_atomic_exchange_explicit_f32(&pSound->engineNode.pitch, pitch, ma_atomic_memory_order_release);
-}
-
-MA_API float ma_sound_get_pitch(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_atomic_load_f32(&pSound->engineNode.pitch);    /* Naughty const-cast for this. */
-}
-
-MA_API void ma_sound_set_spatialization_enabled(ma_sound* pSound, ma_bool32 enabled)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_atomic_exchange_explicit_32(&pSound->engineNode.isSpatializationDisabled, !enabled, ma_atomic_memory_order_release);
-}
-
-MA_API ma_bool32 ma_sound_is_spatialization_enabled(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_engine_node_is_spatialization_enabled(&pSound->engineNode);
-}
-
-MA_API void ma_sound_set_pinned_listener_index(ma_sound* pSound, ma_uint32 listenerIndex)
-{
-    if (pSound == NULL || listenerIndex >= ma_engine_get_listener_count(ma_sound_get_engine(pSound))) {
-        return;
-    }
-
-    ma_atomic_exchange_explicit_32(&pSound->engineNode.pinnedListenerIndex, listenerIndex, ma_atomic_memory_order_release);
-}
-
-MA_API ma_uint32 ma_sound_get_pinned_listener_index(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_LISTENER_INDEX_CLOSEST;
-    }
-
-    return ma_atomic_load_explicit_32(&pSound->engineNode.pinnedListenerIndex, ma_atomic_memory_order_acquire);
-}
-
-MA_API ma_uint32 ma_sound_get_listener_index(const ma_sound* pSound)
-{
-    ma_uint32 listenerIndex;
-
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    listenerIndex = ma_sound_get_pinned_listener_index(pSound);
-    if (listenerIndex == MA_LISTENER_INDEX_CLOSEST) {
-        ma_vec3f position = ma_sound_get_position(pSound);
-        return ma_engine_find_closest_listener(ma_sound_get_engine(pSound), position.x, position.y, position.z);
-    }
-
-    return listenerIndex;
-}
-
-MA_API ma_vec3f ma_sound_get_direction_to_listener(const ma_sound* pSound)
-{
-    ma_vec3f relativePos;
-    ma_engine* pEngine;
-
-    if (pSound == NULL) {
-        return ma_vec3f_init_3f(0, 0, -1);
-    }
-
-    pEngine = ma_sound_get_engine(pSound);
-    if (pEngine == NULL) {
-        return ma_vec3f_init_3f(0, 0, -1);
-    }
-
-    ma_spatializer_get_relative_position_and_direction(&pSound->engineNode.spatializer, &pEngine->listeners[ma_sound_get_listener_index(pSound)], &relativePos, NULL);
-
-    return ma_vec3f_normalize(ma_vec3f_neg(relativePos));
-}
-
-MA_API void ma_sound_set_position(ma_sound* pSound, float x, float y, float z)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_position(&pSound->engineNode.spatializer, x, y, z);
-}
-
-MA_API ma_vec3f ma_sound_get_position(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_spatializer_get_position(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_direction(ma_sound* pSound, float x, float y, float z)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_direction(&pSound->engineNode.spatializer, x, y, z);
-}
-
-MA_API ma_vec3f ma_sound_get_direction(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_spatializer_get_direction(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_velocity(ma_sound* pSound, float x, float y, float z)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_velocity(&pSound->engineNode.spatializer, x, y, z);
-}
-
-MA_API ma_vec3f ma_sound_get_velocity(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return ma_vec3f_init_3f(0, 0, 0);
-    }
-
-    return ma_spatializer_get_velocity(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_attenuation_model(ma_sound* pSound, ma_attenuation_model attenuationModel)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_attenuation_model(&pSound->engineNode.spatializer, attenuationModel);
-}
-
-MA_API ma_attenuation_model ma_sound_get_attenuation_model(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return ma_attenuation_model_none;
-    }
-
-    return ma_spatializer_get_attenuation_model(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_positioning(ma_sound* pSound, ma_positioning positioning)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_positioning(&pSound->engineNode.spatializer, positioning);
-}
-
-MA_API ma_positioning ma_sound_get_positioning(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return ma_positioning_absolute;
-    }
-
-    return ma_spatializer_get_positioning(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_rolloff(ma_sound* pSound, float rolloff)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_rolloff(&pSound->engineNode.spatializer, rolloff);
-}
-
-MA_API float ma_sound_get_rolloff(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_spatializer_get_rolloff(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_min_gain(ma_sound* pSound, float minGain)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_min_gain(&pSound->engineNode.spatializer, minGain);
-}
-
-MA_API float ma_sound_get_min_gain(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_spatializer_get_min_gain(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_max_gain(ma_sound* pSound, float maxGain)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_max_gain(&pSound->engineNode.spatializer, maxGain);
-}
-
-MA_API float ma_sound_get_max_gain(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_spatializer_get_max_gain(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_min_distance(ma_sound* pSound, float minDistance)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_min_distance(&pSound->engineNode.spatializer, minDistance);
-}
-
-MA_API float ma_sound_get_min_distance(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_spatializer_get_min_distance(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_max_distance(ma_sound* pSound, float maxDistance)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_max_distance(&pSound->engineNode.spatializer, maxDistance);
-}
-
-MA_API float ma_sound_get_max_distance(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_spatializer_get_max_distance(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_cone(ma_sound* pSound, float innerAngleInRadians, float outerAngleInRadians, float outerGain)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_cone(&pSound->engineNode.spatializer, innerAngleInRadians, outerAngleInRadians, outerGain);
-}
-
-MA_API void ma_sound_get_cone(const ma_sound* pSound, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain)
-{
-    if (pInnerAngleInRadians != NULL) {
-        *pInnerAngleInRadians = 0;
-    }
-
-    if (pOuterAngleInRadians != NULL) {
-        *pOuterAngleInRadians = 0;
-    }
-
-    if (pOuterGain != NULL) {
-        *pOuterGain = 0;
-    }
-
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_get_cone(&pSound->engineNode.spatializer, pInnerAngleInRadians, pOuterAngleInRadians, pOuterGain);
-}
-
-MA_API void ma_sound_set_doppler_factor(ma_sound* pSound, float dopplerFactor)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_doppler_factor(&pSound->engineNode.spatializer, dopplerFactor);
-}
-
-MA_API float ma_sound_get_doppler_factor(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_spatializer_get_doppler_factor(&pSound->engineNode.spatializer);
-}
-
-MA_API void ma_sound_set_directional_attenuation_factor(ma_sound* pSound, float directionalAttenuationFactor)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_spatializer_set_directional_attenuation_factor(&pSound->engineNode.spatializer, directionalAttenuationFactor);
-}
-
-MA_API float ma_sound_get_directional_attenuation_factor(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 1;
-    }
-
-    return ma_spatializer_get_directional_attenuation_factor(&pSound->engineNode.spatializer);
-}
-
-
-MA_API void ma_sound_set_fade_in_pcm_frames(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInFrames)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_sound_set_fade_start_in_pcm_frames(pSound, volumeBeg, volumeEnd, fadeLengthInFrames, (~(ma_uint64)0));
-}
-
-MA_API void ma_sound_set_fade_in_milliseconds(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInMilliseconds)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_sound_set_fade_in_pcm_frames(pSound, volumeBeg, volumeEnd, (fadeLengthInMilliseconds * pSound->engineNode.fader.config.sampleRate) / 1000);
-}
-
-MA_API void ma_sound_set_fade_start_in_pcm_frames(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInFrames, ma_uint64 absoluteGlobalTimeInFrames)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    /*
-    We don't want to update the fader at this point because we need to use the engine's current time
-    to derive the fader's start offset. The timer is being updated on the audio thread so in order to
-    do this as accurately as possible we'll need to defer this to the audio thread.
-    */
-    ma_atomic_float_set(&pSound->engineNode.fadeSettings.volumeBeg, volumeBeg);
-    ma_atomic_float_set(&pSound->engineNode.fadeSettings.volumeEnd, volumeEnd);
-    ma_atomic_uint64_set(&pSound->engineNode.fadeSettings.fadeLengthInFrames, fadeLengthInFrames);
-    ma_atomic_uint64_set(&pSound->engineNode.fadeSettings.absoluteGlobalTimeInFrames, absoluteGlobalTimeInFrames);
-}
-
-MA_API void ma_sound_set_fade_start_in_milliseconds(ma_sound* pSound, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInMilliseconds, ma_uint64 absoluteGlobalTimeInMilliseconds)
-{
-    ma_uint32 sampleRate;
-
-    if (pSound == NULL) {
-        return;
-    }
-
-    sampleRate = ma_engine_get_sample_rate(ma_sound_get_engine(pSound));
-
-    ma_sound_set_fade_start_in_pcm_frames(pSound, volumeBeg, volumeEnd, (fadeLengthInMilliseconds * sampleRate) / 1000, (absoluteGlobalTimeInMilliseconds * sampleRate) / 1000);
-}
-
-MA_API float ma_sound_get_current_fade_volume(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    return ma_fader_get_current_volume(&pSound->engineNode.fader);
-}
-
-MA_API void ma_sound_set_start_time_in_pcm_frames(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInFrames)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_node_set_state_time(pSound, ma_node_state_started, absoluteGlobalTimeInFrames);
-}
-
-MA_API void ma_sound_set_start_time_in_milliseconds(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInMilliseconds)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_sound_set_start_time_in_pcm_frames(pSound, absoluteGlobalTimeInMilliseconds * ma_engine_get_sample_rate(ma_sound_get_engine(pSound)) / 1000);
-}
-
-MA_API void ma_sound_set_stop_time_in_pcm_frames(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInFrames)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_sound_set_stop_time_with_fade_in_pcm_frames(pSound, absoluteGlobalTimeInFrames, 0);
-}
-
-MA_API void ma_sound_set_stop_time_in_milliseconds(ma_sound* pSound, ma_uint64 absoluteGlobalTimeInMilliseconds)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    ma_sound_set_stop_time_in_pcm_frames(pSound, absoluteGlobalTimeInMilliseconds * ma_engine_get_sample_rate(ma_sound_get_engine(pSound)) / 1000);
-}
-
-MA_API void ma_sound_set_stop_time_with_fade_in_pcm_frames(ma_sound* pSound, ma_uint64 stopAbsoluteGlobalTimeInFrames, ma_uint64 fadeLengthInFrames)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    if (fadeLengthInFrames > 0) {
-        if (fadeLengthInFrames > stopAbsoluteGlobalTimeInFrames) {
-            fadeLengthInFrames = stopAbsoluteGlobalTimeInFrames;
-        }
-
-        ma_sound_set_fade_start_in_pcm_frames(pSound, -1, 0, fadeLengthInFrames, stopAbsoluteGlobalTimeInFrames - fadeLengthInFrames);
-    }
-
-    ma_node_set_state_time(pSound, ma_node_state_stopped, stopAbsoluteGlobalTimeInFrames);
-}
-
-MA_API void ma_sound_set_stop_time_with_fade_in_milliseconds(ma_sound* pSound, ma_uint64 stopAbsoluteGlobalTimeInMilliseconds, ma_uint64 fadeLengthInMilliseconds)
-{
-    ma_uint32 sampleRate;
-
-    if (pSound == NULL) {
-        return;
-    }
-
-    sampleRate = ma_engine_get_sample_rate(ma_sound_get_engine(pSound));
-
-    ma_sound_set_stop_time_with_fade_in_pcm_frames(pSound, (stopAbsoluteGlobalTimeInMilliseconds * sampleRate) / 1000, (fadeLengthInMilliseconds * sampleRate) / 1000);
-}
-
-MA_API ma_bool32 ma_sound_is_playing(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_node_get_state_by_time(pSound, ma_engine_get_time_in_pcm_frames(ma_sound_get_engine(pSound))) == ma_node_state_started;
-}
-
-MA_API ma_uint64 ma_sound_get_time_in_pcm_frames(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return 0;
-    }
-
-    return ma_node_get_time(pSound);
-}
-
-MA_API ma_uint64 ma_sound_get_time_in_milliseconds(const ma_sound* pSound)
-{
-    return ma_sound_get_time_in_pcm_frames(pSound) * 1000 / ma_engine_get_sample_rate(ma_sound_get_engine(pSound));
-}
-
-MA_API void ma_sound_set_looping(ma_sound* pSound, ma_bool32 isLooping)
-{
-    if (pSound == NULL) {
-        return;
-    }
-
-    /* Looping is only a valid concept if the sound is backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return;
-    }
-
-    /* The looping state needs to be applied to the data source in order for any looping to actually happen. */
-    ma_data_source_set_looping(pSound->pDataSource, isLooping);
-}
-
-MA_API ma_bool32 ma_sound_is_looping(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_FALSE;
-    }
-
-    /* There is no notion of looping for sounds that are not backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_data_source_is_looping(pSound->pDataSource);
-}
-
-MA_API ma_bool32 ma_sound_at_end(const ma_sound* pSound)
-{
-    if (pSound == NULL) {
-        return MA_FALSE;
-    }
-
-    /* There is no notion of an end of a sound if it's not backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return MA_FALSE;
-    }
-
-    return ma_sound_get_at_end(pSound);
-}
-
-MA_API ma_result ma_sound_seek_to_pcm_frame(ma_sound* pSound, ma_uint64 frameIndex)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* Seeking is only valid for sounds that are backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    /* We can't be seeking while reading at the same time. We just set the seek target and get the mixing thread to do the actual seek. */
-    ma_atomic_exchange_64(&pSound->seekTarget, frameIndex);
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_sound_get_data_format(ma_sound* pSound, ma_format* pFormat, ma_uint32* pChannels, ma_uint32* pSampleRate, ma_channel* pChannelMap, size_t channelMapCap)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The data format is retrieved directly from the data source if the sound is backed by one. Otherwise we pull it from the node. */
-    if (pSound->pDataSource == NULL) {
-        ma_uint32 channels;
-
-        if (pFormat != NULL) {
-            *pFormat = ma_format_f32;
-        }
-
-        channels = ma_node_get_input_channels(&pSound->engineNode, 0);
-        if (pChannels != NULL) {
-            *pChannels = channels;
-        }
-
-        if (pSampleRate != NULL) {
-            *pSampleRate = pSound->engineNode.resampler.config.sampleRateIn;
-        }
-
-        if (pChannelMap != NULL) {
-            ma_channel_map_init_standard(ma_standard_channel_map_default, pChannelMap, channelMapCap, channels);
-        }
-
-        return MA_SUCCESS;
-    } else {
-        return ma_data_source_get_data_format(pSound->pDataSource, pFormat, pChannels, pSampleRate, pChannelMap, channelMapCap);
-    }
-}
-
-MA_API ma_result ma_sound_get_cursor_in_pcm_frames(ma_sound* pSound, ma_uint64* pCursor)
-{
-    ma_uint64 seekTarget;
-
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The notion of a cursor is only valid for sounds that are backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    seekTarget = ma_atomic_load_64(&pSound->seekTarget);
-    if (seekTarget != MA_SEEK_TARGET_NONE) {
-        *pCursor = seekTarget;
-        return MA_SUCCESS;
-    } else {
-        return ma_data_source_get_cursor_in_pcm_frames(pSound->pDataSource, pCursor);
-    }
-}
-
-MA_API ma_result ma_sound_get_length_in_pcm_frames(ma_sound* pSound, ma_uint64* pLength)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The notion of a sound length is only valid for sounds that are backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    return ma_data_source_get_length_in_pcm_frames(pSound->pDataSource, pLength);
-}
-
-MA_API ma_result ma_sound_get_cursor_in_seconds(ma_sound* pSound, float* pCursor)
-{
-    ma_result result;
-    ma_uint64 cursorInPCMFrames;
-    ma_uint32 sampleRate;
-
-    if (pCursor != NULL) {
-        *pCursor = 0;
-    }
-
-    result = ma_sound_get_cursor_in_pcm_frames(pSound, &cursorInPCMFrames);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    result = ma_sound_get_data_format(pSound, NULL, NULL, &sampleRate, NULL, 0);
-    if (result != MA_SUCCESS) {
-        return result;
-    }
-
-    /* VC6 does not support division of unsigned 64-bit integers with floating point numbers. Need to use a signed number. This shouldn't effect anything in practice. */
-    *pCursor = (ma_int64)cursorInPCMFrames / (float)sampleRate;
-
-    return MA_SUCCESS;
-}
-
-MA_API ma_result ma_sound_get_length_in_seconds(ma_sound* pSound, float* pLength)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The notion of a sound length is only valid for sounds that are backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    return ma_data_source_get_length_in_seconds(pSound->pDataSource, pLength);
-}
-
-MA_API ma_result ma_sound_set_end_callback(ma_sound* pSound, ma_sound_end_proc callback, void* pUserData)
-{
-    if (pSound == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* The notion of an end is only valid for sounds that are backed by a data source. */
-    if (pSound->pDataSource == NULL) {
-        return MA_INVALID_OPERATION;
-    }
-
-    pSound->endCallback          = callback;
-    pSound->pEndCallbackUserData = pUserData;
-
-    return MA_SUCCESS;
-}
-
-
-MA_API ma_result ma_sound_group_init(ma_engine* pEngine, ma_uint32 flags, ma_sound_group* pParentGroup, ma_sound_group* pGroup)
-{
-    ma_sound_group_config config = ma_sound_group_config_init_2(pEngine);
-    config.flags              = flags;
-    config.pInitialAttachment = pParentGroup;
-    return ma_sound_group_init_ex(pEngine, &config, pGroup);
-}
-
-MA_API ma_result ma_sound_group_init_ex(ma_engine* pEngine, const ma_sound_group_config* pConfig, ma_sound_group* pGroup)
-{
-    ma_sound_config soundConfig;
-
-    if (pGroup == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    MA_ZERO_OBJECT(pGroup);
-
-    if (pConfig == NULL) {
-        return MA_INVALID_ARGS;
-    }
-
-    /* A sound group is just a sound without a data source. */
-    soundConfig = *pConfig;
-    soundConfig.pFilePath   = NULL;
-    soundConfig.pFilePathW  = NULL;
-    soundConfig.pDataSource = NULL;
-
-    /*
-    Groups need to have spatialization disabled by default because I think it'll be pretty rare
-    that programs will want to spatialize groups (but not unheard of). Certainly it feels like
-    disabling this by default feels like the right option. Spatialization can be enabled with a
-    call to ma_sound_group_set_spatialization_enabled().
-    */
-    soundConfig.flags |= MA_SOUND_FLAG_NO_SPATIALIZATION;
-
-    return ma_sound_init_ex(pEngine, &soundConfig, pGroup);
-}
-
-MA_API void ma_sound_group_uninit(ma_sound_group* pGroup)
-{
-    ma_sound_uninit(pGroup);
-}
-
-MA_API ma_engine* ma_sound_group_get_engine(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_engine(pGroup);
-}
-
-MA_API ma_result ma_sound_group_start(ma_sound_group* pGroup)
-{
-    return ma_sound_start(pGroup);
-}
-
-MA_API ma_result ma_sound_group_stop(ma_sound_group* pGroup)
-{
-    return ma_sound_stop(pGroup);
-}
-
-MA_API void ma_sound_group_set_volume(ma_sound_group* pGroup, float volume)
-{
-    ma_sound_set_volume(pGroup, volume);
-}
-
-MA_API float ma_sound_group_get_volume(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_volume(pGroup);
-}
-
-MA_API void ma_sound_group_set_pan(ma_sound_group* pGroup, float pan)
-{
-    ma_sound_set_pan(pGroup, pan);
-}
-
-MA_API float ma_sound_group_get_pan(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_pan(pGroup);
-}
-
-MA_API void ma_sound_group_set_pan_mode(ma_sound_group* pGroup, ma_pan_mode panMode)
-{
-    ma_sound_set_pan_mode(pGroup, panMode);
-}
-
-MA_API ma_pan_mode ma_sound_group_get_pan_mode(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_pan_mode(pGroup);
-}
-
-MA_API void ma_sound_group_set_pitch(ma_sound_group* pGroup, float pitch)
-{
-    ma_sound_set_pitch(pGroup, pitch);
-}
-
-MA_API float ma_sound_group_get_pitch(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_pitch(pGroup);
-}
-
-MA_API void ma_sound_group_set_spatialization_enabled(ma_sound_group* pGroup, ma_bool32 enabled)
-{
-    ma_sound_set_spatialization_enabled(pGroup, enabled);
-}
-
-MA_API ma_bool32 ma_sound_group_is_spatialization_enabled(const ma_sound_group* pGroup)
-{
-    return ma_sound_is_spatialization_enabled(pGroup);
-}
-
-MA_API void ma_sound_group_set_pinned_listener_index(ma_sound_group* pGroup, ma_uint32 listenerIndex)
-{
-    ma_sound_set_pinned_listener_index(pGroup, listenerIndex);
-}
-
-MA_API ma_uint32 ma_sound_group_get_pinned_listener_index(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_pinned_listener_index(pGroup);
-}
-
-MA_API ma_uint32 ma_sound_group_get_listener_index(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_listener_index(pGroup);
-}
-
-MA_API ma_vec3f ma_sound_group_get_direction_to_listener(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_direction_to_listener(pGroup);
-}
-
-MA_API void ma_sound_group_set_position(ma_sound_group* pGroup, float x, float y, float z)
-{
-    ma_sound_set_position(pGroup, x, y, z);
-}
-
-MA_API ma_vec3f ma_sound_group_get_position(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_position(pGroup);
-}
-
-MA_API void ma_sound_group_set_direction(ma_sound_group* pGroup, float x, float y, float z)
-{
-    ma_sound_set_direction(pGroup, x, y, z);
-}
-
-MA_API ma_vec3f ma_sound_group_get_direction(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_direction(pGroup);
-}
-
-MA_API void ma_sound_group_set_velocity(ma_sound_group* pGroup, float x, float y, float z)
-{
-    ma_sound_set_velocity(pGroup, x, y, z);
-}
-
-MA_API ma_vec3f ma_sound_group_get_velocity(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_velocity(pGroup);
-}
-
-MA_API void ma_sound_group_set_attenuation_model(ma_sound_group* pGroup, ma_attenuation_model attenuationModel)
-{
-    ma_sound_set_attenuation_model(pGroup, attenuationModel);
-}
-
-MA_API ma_attenuation_model ma_sound_group_get_attenuation_model(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_attenuation_model(pGroup);
-}
-
-MA_API void ma_sound_group_set_positioning(ma_sound_group* pGroup, ma_positioning positioning)
-{
-    ma_sound_set_positioning(pGroup, positioning);
-}
-
-MA_API ma_positioning ma_sound_group_get_positioning(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_positioning(pGroup);
-}
-
-MA_API void ma_sound_group_set_rolloff(ma_sound_group* pGroup, float rolloff)
-{
-    ma_sound_set_rolloff(pGroup, rolloff);
-}
-
-MA_API float ma_sound_group_get_rolloff(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_rolloff(pGroup);
-}
-
-MA_API void ma_sound_group_set_min_gain(ma_sound_group* pGroup, float minGain)
-{
-    ma_sound_set_min_gain(pGroup, minGain);
-}
-
-MA_API float ma_sound_group_get_min_gain(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_min_gain(pGroup);
-}
-
-MA_API void ma_sound_group_set_max_gain(ma_sound_group* pGroup, float maxGain)
-{
-    ma_sound_set_max_gain(pGroup, maxGain);
-}
-
-MA_API float ma_sound_group_get_max_gain(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_max_gain(pGroup);
-}
-
-MA_API void ma_sound_group_set_min_distance(ma_sound_group* pGroup, float minDistance)
-{
-    ma_sound_set_min_distance(pGroup, minDistance);
-}
-
-MA_API float ma_sound_group_get_min_distance(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_min_distance(pGroup);
-}
-
-MA_API void ma_sound_group_set_max_distance(ma_sound_group* pGroup, float maxDistance)
-{
-    ma_sound_set_max_distance(pGroup, maxDistance);
-}
-
-MA_API float ma_sound_group_get_max_distance(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_max_distance(pGroup);
-}
-
-MA_API void ma_sound_group_set_cone(ma_sound_group* pGroup, float innerAngleInRadians, float outerAngleInRadians, float outerGain)
-{
-    ma_sound_set_cone(pGroup, innerAngleInRadians, outerAngleInRadians, outerGain);
-}
-
-MA_API void ma_sound_group_get_cone(const ma_sound_group* pGroup, float* pInnerAngleInRadians, float* pOuterAngleInRadians, float* pOuterGain)
-{
-    ma_sound_get_cone(pGroup, pInnerAngleInRadians, pOuterAngleInRadians, pOuterGain);
-}
-
-MA_API void ma_sound_group_set_doppler_factor(ma_sound_group* pGroup, float dopplerFactor)
-{
-    ma_sound_set_doppler_factor(pGroup, dopplerFactor);
-}
-
-MA_API float ma_sound_group_get_doppler_factor(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_doppler_factor(pGroup);
-}
-
-MA_API void ma_sound_group_set_directional_attenuation_factor(ma_sound_group* pGroup, float directionalAttenuationFactor)
-{
-    ma_sound_set_directional_attenuation_factor(pGroup, directionalAttenuationFactor);
-}
-
-MA_API float ma_sound_group_get_directional_attenuation_factor(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_directional_attenuation_factor(pGroup);
-}
-
-MA_API void ma_sound_group_set_fade_in_pcm_frames(ma_sound_group* pGroup, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInFrames)
-{
-    ma_sound_set_fade_in_pcm_frames(pGroup, volumeBeg, volumeEnd, fadeLengthInFrames);
-}
-
-MA_API void ma_sound_group_set_fade_in_milliseconds(ma_sound_group* pGroup, float volumeBeg, float volumeEnd, ma_uint64 fadeLengthInMilliseconds)
-{
-    ma_sound_set_fade_in_milliseconds(pGroup, volumeBeg, volumeEnd, fadeLengthInMilliseconds);
-}
-
-MA_API float ma_sound_group_get_current_fade_volume(ma_sound_group* pGroup)
-{
-    return ma_sound_get_current_fade_volume(pGroup);
-}
-
-MA_API void ma_sound_group_set_start_time_in_pcm_frames(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInFrames)
-{
-    ma_sound_set_start_time_in_pcm_frames(pGroup, absoluteGlobalTimeInFrames);
-}
-
-MA_API void ma_sound_group_set_start_time_in_milliseconds(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInMilliseconds)
-{
-    ma_sound_set_start_time_in_milliseconds(pGroup, absoluteGlobalTimeInMilliseconds);
-}
-
-MA_API void ma_sound_group_set_stop_time_in_pcm_frames(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInFrames)
-{
-    ma_sound_set_stop_time_in_pcm_frames(pGroup, absoluteGlobalTimeInFrames);
-}
-
-MA_API void ma_sound_group_set_stop_time_in_milliseconds(ma_sound_group* pGroup, ma_uint64 absoluteGlobalTimeInMilliseconds)
-{
-    ma_sound_set_stop_time_in_milliseconds(pGroup, absoluteGlobalTimeInMilliseconds);
-}
-
-MA_API ma_bool32 ma_sound_group_is_playing(const ma_sound_group* pGroup)
-{
-    return ma_sound_is_playing(pGroup);
-}
-
-MA_API ma_uint64 ma_sound_group_get_time_in_pcm_frames(const ma_sound_group* pGroup)
-{
-    return ma_sound_get_time_in_pcm_frames(pGroup);
-}
-#endif  /* MA_NO_ENGINE */
-/* END SECTION: miniaudio_engine.c */
-
-
-
-/**************************************************************************************************************************************************************
-***************************************************************************************************************************************************************
-
-Auto Generated
-==============
-All code below is auto-generated from a tool. This mostly consists of decoding backend implementations such as ma_dr_wav, ma_dr_flac, etc. If you find a bug in the
-code below please report the bug to the respective repository for the relevant project (probably dr_libs).
-
-***************************************************************************************************************************************************************
-**************************************************************************************************************************************************************/
-#if !defined(MA_NO_WAV) && (!defined(MA_NO_DECODING) || !defined(MA_NO_ENCODING))
-#if !defined(MA_DR_WAV_IMPLEMENTATION) && !defined(MA_DR_WAV_IMPLEMENTATION) /* For backwards compatibility. Will be removed in version 0.11 for cleanliness. */
-/* dr_wav_c begin */
-#ifndef ma_dr_wav_c
-#define ma_dr_wav_c
-#ifdef __MRC__
-#pragma options opt off
-#endif
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#ifndef MA_DR_WAV_NO_STDIO
-#include <stdio.h>
-#ifndef MA_DR_WAV_NO_WCHAR
-#include <wchar.h>
-#endif
-#endif
-#ifndef MA_DR_WAV_ASSERT
-#include <assert.h>
-#define MA_DR_WAV_ASSERT(expression)           assert(expression)
-#endif
-#ifndef MA_DR_WAV_MALLOC
-#define MA_DR_WAV_MALLOC(sz)                   malloc((sz))
-#endif
-#ifndef MA_DR_WAV_REALLOC
-#define MA_DR_WAV_REALLOC(p, sz)               realloc((p), (sz))
-#endif
-#ifndef MA_DR_WAV_FREE
-#define MA_DR_WAV_FREE(p)                      free((p))
-#endif
-#ifndef MA_DR_WAV_COPY_MEMORY
-#define MA_DR_WAV_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
-#endif
-#ifndef MA_DR_WAV_ZERO_MEMORY
-#define MA_DR_WAV_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
-#endif
-#ifndef MA_DR_WAV_ZERO_OBJECT
-#define MA_DR_WAV_ZERO_OBJECT(p)               MA_DR_WAV_ZERO_MEMORY((p), sizeof(*p))
-#endif
-#define ma_dr_wav_countof(x)                   (sizeof(x) / sizeof(x[0]))
-#define ma_dr_wav_align(x, a)                  ((((x) + (a) - 1) / (a)) * (a))
-#define ma_dr_wav_min(a, b)                    (((a) < (b)) ? (a) : (b))
-#define ma_dr_wav_max(a, b)                    (((a) > (b)) ? (a) : (b))
-#define ma_dr_wav_clamp(x, lo, hi)             (ma_dr_wav_max((lo), ma_dr_wav_min((hi), (x))))
-#define ma_dr_wav_offset_ptr(p, offset)        (((ma_uint8*)(p)) + (offset))
-#define MA_DR_WAV_MAX_SIMD_VECTOR_SIZE         32
-#define MA_DR_WAV_INT64_MIN ((ma_int64) ((ma_uint64)0x80000000 << 32))
-#define MA_DR_WAV_INT64_MAX ((ma_int64)(((ma_uint64)0x7FFFFFFF << 32) | 0xFFFFFFFF))
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-    #define MA_DR_WAV_HAS_BYTESWAP16_INTRINSIC
-    #define MA_DR_WAV_HAS_BYTESWAP32_INTRINSIC
-    #define MA_DR_WAV_HAS_BYTESWAP64_INTRINSIC
-#elif defined(__clang__)
-    #if defined(__has_builtin)
-        #if __has_builtin(__builtin_bswap16)
-            #define MA_DR_WAV_HAS_BYTESWAP16_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap32)
-            #define MA_DR_WAV_HAS_BYTESWAP32_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap64)
-            #define MA_DR_WAV_HAS_BYTESWAP64_INTRINSIC
-        #endif
-    #endif
-#elif defined(__GNUC__)
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-        #define MA_DR_WAV_HAS_BYTESWAP32_INTRINSIC
-        #define MA_DR_WAV_HAS_BYTESWAP64_INTRINSIC
-    #endif
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
-        #define MA_DR_WAV_HAS_BYTESWAP16_INTRINSIC
-    #endif
-#endif
-MA_API void ma_dr_wav_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision)
-{
-    if (pMajor) {
-        *pMajor = MA_DR_WAV_VERSION_MAJOR;
-    }
-    if (pMinor) {
-        *pMinor = MA_DR_WAV_VERSION_MINOR;
-    }
-    if (pRevision) {
-        *pRevision = MA_DR_WAV_VERSION_REVISION;
-    }
-}
-MA_API const char* ma_dr_wav_version_string(void)
-{
-    return MA_DR_WAV_VERSION_STRING;
-}
-#ifndef MA_DR_WAV_MAX_SAMPLE_RATE
-#define MA_DR_WAV_MAX_SAMPLE_RATE       384000
-#endif
-#ifndef MA_DR_WAV_MAX_CHANNELS
-#define MA_DR_WAV_MAX_CHANNELS          256
-#endif
-#ifndef MA_DR_WAV_MAX_BITS_PER_SAMPLE
-#define MA_DR_WAV_MAX_BITS_PER_SAMPLE   64
-#endif
-static const ma_uint8 ma_dr_wavGUID_W64_RIFF[16] = {0x72,0x69,0x66,0x66, 0x2E,0x91, 0xCF,0x11, 0xA5,0xD6, 0x28,0xDB,0x04,0xC1,0x00,0x00};
-static const ma_uint8 ma_dr_wavGUID_W64_WAVE[16] = {0x77,0x61,0x76,0x65, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};
-static const ma_uint8 ma_dr_wavGUID_W64_FMT [16] = {0x66,0x6D,0x74,0x20, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};
-static const ma_uint8 ma_dr_wavGUID_W64_FACT[16] = {0x66,0x61,0x63,0x74, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};
-static const ma_uint8 ma_dr_wavGUID_W64_DATA[16] = {0x64,0x61,0x74,0x61, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};
-static MA_INLINE int ma_dr_wav__is_little_endian(void)
-{
-#if defined(MA_X86) || defined(MA_X64)
-    return MA_TRUE;
-#elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
-    return MA_TRUE;
-#else
-    int n = 1;
-    return (*(char*)&n) == 1;
-#endif
-}
-static MA_INLINE void ma_dr_wav_bytes_to_guid(const ma_uint8* data, ma_uint8* guid)
-{
-    int i;
-    for (i = 0; i < 16; ++i) {
-        guid[i] = data[i];
-    }
-}
-static MA_INLINE ma_uint16 ma_dr_wav__bswap16(ma_uint16 n)
-{
-#ifdef MA_DR_WAV_HAS_BYTESWAP16_INTRINSIC
-    #if defined(_MSC_VER)
-        return _byteswap_ushort(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        return __builtin_bswap16(n);
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & 0xFF00) >> 8) |
-           ((n & 0x00FF) << 8);
-#endif
-}
-static MA_INLINE ma_uint32 ma_dr_wav__bswap32(ma_uint32 n)
-{
-#ifdef MA_DR_WAV_HAS_BYTESWAP32_INTRINSIC
-    #if defined(_MSC_VER)
-        return _byteswap_ulong(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        #if defined(MA_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(MA_64BIT)
-            ma_uint32 r;
-            __asm__ __volatile__ (
-            #if defined(MA_64BIT)
-                "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)
-            #else
-                "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
-            #endif
-            );
-            return r;
-        #else
-            return __builtin_bswap32(n);
-        #endif
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & 0xFF000000) >> 24) |
-           ((n & 0x00FF0000) >>  8) |
-           ((n & 0x0000FF00) <<  8) |
-           ((n & 0x000000FF) << 24);
-#endif
-}
-static MA_INLINE ma_uint64 ma_dr_wav__bswap64(ma_uint64 n)
-{
-#ifdef MA_DR_WAV_HAS_BYTESWAP64_INTRINSIC
-    #if defined(_MSC_VER)
-        return _byteswap_uint64(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        return __builtin_bswap64(n);
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & ((ma_uint64)0xFF000000 << 32)) >> 56) |
-           ((n & ((ma_uint64)0x00FF0000 << 32)) >> 40) |
-           ((n & ((ma_uint64)0x0000FF00 << 32)) >> 24) |
-           ((n & ((ma_uint64)0x000000FF << 32)) >>  8) |
-           ((n & ((ma_uint64)0xFF000000      )) <<  8) |
-           ((n & ((ma_uint64)0x00FF0000      )) << 24) |
-           ((n & ((ma_uint64)0x0000FF00      )) << 40) |
-           ((n & ((ma_uint64)0x000000FF      )) << 56);
-#endif
-}
-static MA_INLINE ma_int16 ma_dr_wav__bswap_s16(ma_int16 n)
-{
-    return (ma_int16)ma_dr_wav__bswap16((ma_uint16)n);
-}
-static MA_INLINE void ma_dr_wav__bswap_samples_s16(ma_int16* pSamples, ma_uint64 sampleCount)
-{
-    ma_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = ma_dr_wav__bswap_s16(pSamples[iSample]);
-    }
-}
-static MA_INLINE void ma_dr_wav__bswap_s24(ma_uint8* p)
-{
-    ma_uint8 t;
-    t = p[0];
-    p[0] = p[2];
-    p[2] = t;
-}
-static MA_INLINE void ma_dr_wav__bswap_samples_s24(ma_uint8* pSamples, ma_uint64 sampleCount)
-{
-    ma_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        ma_uint8* pSample = pSamples + (iSample*3);
-        ma_dr_wav__bswap_s24(pSample);
-    }
-}
-static MA_INLINE ma_int32 ma_dr_wav__bswap_s32(ma_int32 n)
-{
-    return (ma_int32)ma_dr_wav__bswap32((ma_uint32)n);
-}
-static MA_INLINE void ma_dr_wav__bswap_samples_s32(ma_int32* pSamples, ma_uint64 sampleCount)
-{
-    ma_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = ma_dr_wav__bswap_s32(pSamples[iSample]);
-    }
-}
-static MA_INLINE ma_int64 ma_dr_wav__bswap_s64(ma_int64 n)
-{
-    return (ma_int64)ma_dr_wav__bswap64((ma_uint64)n);
-}
-static MA_INLINE void ma_dr_wav__bswap_samples_s64(ma_int64* pSamples, ma_uint64 sampleCount)
-{
-    ma_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = ma_dr_wav__bswap_s64(pSamples[iSample]);
-    }
-}
-static MA_INLINE float ma_dr_wav__bswap_f32(float n)
-{
-    union {
-        ma_uint32 i;
-        float f;
-    } x;
-    x.f = n;
-    x.i = ma_dr_wav__bswap32(x.i);
-    return x.f;
-}
-static MA_INLINE void ma_dr_wav__bswap_samples_f32(float* pSamples, ma_uint64 sampleCount)
-{
-    ma_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = ma_dr_wav__bswap_f32(pSamples[iSample]);
-    }
-}
-static MA_INLINE void ma_dr_wav__bswap_samples(void* pSamples, ma_uint64 sampleCount, ma_uint32 bytesPerSample)
-{
-    switch (bytesPerSample)
-    {
-        case 1:
-        {
-        } break;
-        case 2:
-        {
-            ma_dr_wav__bswap_samples_s16((ma_int16*)pSamples, sampleCount);
-        } break;
-        case 3:
-        {
-            ma_dr_wav__bswap_samples_s24((ma_uint8*)pSamples, sampleCount);
-        } break;
-        case 4:
-        {
-            ma_dr_wav__bswap_samples_s32((ma_int32*)pSamples, sampleCount);
-        } break;
-        case 8:
-        {
-            ma_dr_wav__bswap_samples_s64((ma_int64*)pSamples, sampleCount);
-        } break;
-        default:
-        {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-        } break;
-    }
-}
-MA_PRIVATE MA_INLINE ma_bool32 ma_dr_wav_is_container_be(ma_dr_wav_container container)
-{
-    if (container == ma_dr_wav_container_rifx || container == ma_dr_wav_container_aiff) {
-        return MA_TRUE;
-    } else {
-        return MA_FALSE;
-    }
-}
-MA_PRIVATE MA_INLINE ma_uint16 ma_dr_wav_bytes_to_u16_le(const ma_uint8* data)
-{
-    return ((ma_uint16)data[0] << 0) | ((ma_uint16)data[1] << 8);
-}
-MA_PRIVATE MA_INLINE ma_uint16 ma_dr_wav_bytes_to_u16_be(const ma_uint8* data)
-{
-    return ((ma_uint16)data[1] << 0) | ((ma_uint16)data[0] << 8);
-}
-MA_PRIVATE MA_INLINE ma_uint16 ma_dr_wav_bytes_to_u16_ex(const ma_uint8* data, ma_dr_wav_container container)
-{
-    if (ma_dr_wav_is_container_be(container)) {
-        return ma_dr_wav_bytes_to_u16_be(data);
-    } else {
-        return ma_dr_wav_bytes_to_u16_le(data);
-    }
-}
-MA_PRIVATE MA_INLINE ma_uint32 ma_dr_wav_bytes_to_u32_le(const ma_uint8* data)
-{
-    return ((ma_uint32)data[0] << 0) | ((ma_uint32)data[1] << 8) | ((ma_uint32)data[2] << 16) | ((ma_uint32)data[3] << 24);
-}
-MA_PRIVATE MA_INLINE ma_uint32 ma_dr_wav_bytes_to_u32_be(const ma_uint8* data)
-{
-    return ((ma_uint32)data[3] << 0) | ((ma_uint32)data[2] << 8) | ((ma_uint32)data[1] << 16) | ((ma_uint32)data[0] << 24);
-}
-MA_PRIVATE MA_INLINE ma_uint32 ma_dr_wav_bytes_to_u32_ex(const ma_uint8* data, ma_dr_wav_container container)
-{
-    if (ma_dr_wav_is_container_be(container)) {
-        return ma_dr_wav_bytes_to_u32_be(data);
-    } else {
-        return ma_dr_wav_bytes_to_u32_le(data);
-    }
-}
-MA_PRIVATE ma_int64 ma_dr_wav_aiff_extented_to_s64(const ma_uint8* data)
-{
-    ma_uint32 exponent = ((ma_uint32)data[0] << 8) | data[1];
-    ma_uint64 hi = ((ma_uint64)data[2] << 24) | ((ma_uint64)data[3] << 16) | ((ma_uint64)data[4] <<  8) | ((ma_uint64)data[5] <<  0);
-    ma_uint64 lo = ((ma_uint64)data[6] << 24) | ((ma_uint64)data[7] << 16) | ((ma_uint64)data[8] <<  8) | ((ma_uint64)data[9] <<  0);
-    ma_uint64 significand = (hi << 32) | lo;
-    int sign = exponent >> 15;
-    exponent &= 0x7FFF;
-    if (exponent == 0 && significand == 0) {
-        return 0;
-    } else if (exponent == 0x7FFF) {
-        return sign ? MA_DR_WAV_INT64_MIN : MA_DR_WAV_INT64_MAX;
-    }
-    exponent -= 16383;
-    if (exponent > 63) {
-        return sign ? MA_DR_WAV_INT64_MIN : MA_DR_WAV_INT64_MAX;
-    } else if (exponent < 1) {
-        return 0;
-    }
-    significand >>= (63 - exponent);
-    if (sign) {
-        return -(ma_int64)significand;
-    } else {
-        return  (ma_int64)significand;
-    }
-}
-MA_PRIVATE void* ma_dr_wav__malloc_default(size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_DR_WAV_MALLOC(sz);
-}
-MA_PRIVATE void* ma_dr_wav__realloc_default(void* p, size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_DR_WAV_REALLOC(p, sz);
-}
-MA_PRIVATE void ma_dr_wav__free_default(void* p, void* pUserData)
-{
-    (void)pUserData;
-    MA_DR_WAV_FREE(p);
-}
-MA_PRIVATE void* ma_dr_wav__malloc_from_callbacks(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-    if (pAllocationCallbacks->onMalloc != NULL) {
-        return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
-    }
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
-    }
-    return NULL;
-}
-MA_PRIVATE void* ma_dr_wav__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
-    }
-    if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
-        void* p2;
-        p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
-        if (p2 == NULL) {
-            return NULL;
-        }
-        if (p != NULL) {
-            MA_DR_WAV_COPY_MEMORY(p2, p, szOld);
-            pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-        }
-        return p2;
-    }
-    return NULL;
-}
-MA_PRIVATE void ma_dr_wav__free_from_callbacks(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (p == NULL || pAllocationCallbacks == NULL) {
-        return;
-    }
-    if (pAllocationCallbacks->onFree != NULL) {
-        pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-    }
-}
-MA_PRIVATE ma_allocation_callbacks ma_dr_wav_copy_allocation_callbacks_or_defaults(const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        return *pAllocationCallbacks;
-    } else {
-        ma_allocation_callbacks allocationCallbacks;
-        allocationCallbacks.pUserData = NULL;
-        allocationCallbacks.onMalloc  = ma_dr_wav__malloc_default;
-        allocationCallbacks.onRealloc = ma_dr_wav__realloc_default;
-        allocationCallbacks.onFree    = ma_dr_wav__free_default;
-        return allocationCallbacks;
-    }
-}
-static MA_INLINE ma_bool32 ma_dr_wav__is_compressed_format_tag(ma_uint16 formatTag)
-{
-    return
-        formatTag == MA_DR_WAVE_FORMAT_ADPCM ||
-        formatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM;
-}
-MA_PRIVATE unsigned int ma_dr_wav__chunk_padding_size_riff(ma_uint64 chunkSize)
-{
-    return (unsigned int)(chunkSize % 2);
-}
-MA_PRIVATE unsigned int ma_dr_wav__chunk_padding_size_w64(ma_uint64 chunkSize)
-{
-    return (unsigned int)(chunkSize % 8);
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__msadpcm(ma_dr_wav* pWav, ma_uint64 samplesToRead, ma_int16* pBufferOut);
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__ima(ma_dr_wav* pWav, ma_uint64 samplesToRead, ma_int16* pBufferOut);
-MA_PRIVATE ma_bool32 ma_dr_wav_init_write__internal(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount);
-MA_PRIVATE ma_result ma_dr_wav__read_chunk_header(ma_dr_wav_read_proc onRead, void* pUserData, ma_dr_wav_container container, ma_uint64* pRunningBytesReadOut, ma_dr_wav_chunk_header* pHeaderOut)
-{
-    if (container == ma_dr_wav_container_riff || container == ma_dr_wav_container_rifx || container == ma_dr_wav_container_rf64 || container == ma_dr_wav_container_aiff) {
-        ma_uint8 sizeInBytes[4];
-        if (onRead(pUserData, pHeaderOut->id.fourcc, 4) != 4) {
-            return MA_AT_END;
-        }
-        if (onRead(pUserData, sizeInBytes, 4) != 4) {
-            return MA_INVALID_FILE;
-        }
-        pHeaderOut->sizeInBytes = ma_dr_wav_bytes_to_u32_ex(sizeInBytes, container);
-        pHeaderOut->paddingSize = ma_dr_wav__chunk_padding_size_riff(pHeaderOut->sizeInBytes);
-        *pRunningBytesReadOut += 8;
-    } else if (container == ma_dr_wav_container_w64) {
-        ma_uint8 sizeInBytes[8];
-        if (onRead(pUserData, pHeaderOut->id.guid, 16) != 16) {
-            return MA_AT_END;
-        }
-        if (onRead(pUserData, sizeInBytes, 8) != 8) {
-            return MA_INVALID_FILE;
-        }
-        pHeaderOut->sizeInBytes = ma_dr_wav_bytes_to_u64(sizeInBytes) - 24;
-        pHeaderOut->paddingSize = ma_dr_wav__chunk_padding_size_w64(pHeaderOut->sizeInBytes);
-        *pRunningBytesReadOut += 24;
-    } else {
-        return MA_INVALID_FILE;
-    }
-    return MA_SUCCESS;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav__seek_forward(ma_dr_wav_seek_proc onSeek, ma_uint64 offset, void* pUserData)
-{
-    ma_uint64 bytesRemainingToSeek = offset;
-    while (bytesRemainingToSeek > 0) {
-        if (bytesRemainingToSeek > 0x7FFFFFFF) {
-            if (!onSeek(pUserData, 0x7FFFFFFF, ma_dr_wav_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            bytesRemainingToSeek -= 0x7FFFFFFF;
-        } else {
-            if (!onSeek(pUserData, (int)bytesRemainingToSeek, ma_dr_wav_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            bytesRemainingToSeek = 0;
-        }
-    }
-    return MA_TRUE;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav__seek_from_start(ma_dr_wav_seek_proc onSeek, ma_uint64 offset, void* pUserData)
-{
-    if (offset <= 0x7FFFFFFF) {
-        return onSeek(pUserData, (int)offset, ma_dr_wav_seek_origin_start);
-    }
-    if (!onSeek(pUserData, 0x7FFFFFFF, ma_dr_wav_seek_origin_start)) {
-        return MA_FALSE;
-    }
-    offset -= 0x7FFFFFFF;
-    for (;;) {
-        if (offset <= 0x7FFFFFFF) {
-            return onSeek(pUserData, (int)offset, ma_dr_wav_seek_origin_current);
-        }
-        if (!onSeek(pUserData, 0x7FFFFFFF, ma_dr_wav_seek_origin_current)) {
-            return MA_FALSE;
-        }
-        offset -= 0x7FFFFFFF;
-    }
-}
-MA_PRIVATE size_t ma_dr_wav__on_read(ma_dr_wav_read_proc onRead, void* pUserData, void* pBufferOut, size_t bytesToRead, ma_uint64* pCursor)
-{
-    size_t bytesRead;
-    MA_DR_WAV_ASSERT(onRead != NULL);
-    MA_DR_WAV_ASSERT(pCursor != NULL);
-    bytesRead = onRead(pUserData, pBufferOut, bytesToRead);
-    *pCursor += bytesRead;
-    return bytesRead;
-}
-#if 0
-MA_PRIVATE ma_bool32 ma_dr_wav__on_seek(ma_dr_wav_seek_proc onSeek, void* pUserData, int offset, ma_dr_wav_seek_origin origin, ma_uint64* pCursor)
-{
-    MA_DR_WAV_ASSERT(onSeek != NULL);
-    MA_DR_WAV_ASSERT(pCursor != NULL);
-    if (!onSeek(pUserData, offset, origin)) {
-        return MA_FALSE;
-    }
-    if (origin == ma_dr_wav_seek_origin_start) {
-        *pCursor = offset;
-    } else {
-        *pCursor += offset;
-    }
-    return MA_TRUE;
-}
-#endif
-#define MA_DR_WAV_SMPL_BYTES                    36
-#define MA_DR_WAV_SMPL_LOOP_BYTES               24
-#define MA_DR_WAV_INST_BYTES                    7
-#define MA_DR_WAV_ACID_BYTES                    24
-#define MA_DR_WAV_CUE_BYTES                     4
-#define MA_DR_WAV_BEXT_BYTES                    602
-#define MA_DR_WAV_BEXT_DESCRIPTION_BYTES        256
-#define MA_DR_WAV_BEXT_ORIGINATOR_NAME_BYTES    32
-#define MA_DR_WAV_BEXT_ORIGINATOR_REF_BYTES     32
-#define MA_DR_WAV_BEXT_RESERVED_BYTES           180
-#define MA_DR_WAV_BEXT_UMID_BYTES               64
-#define MA_DR_WAV_CUE_POINT_BYTES               24
-#define MA_DR_WAV_LIST_LABEL_OR_NOTE_BYTES      4
-#define MA_DR_WAV_LIST_LABELLED_TEXT_BYTES      20
-#define MA_DR_WAV_METADATA_ALIGNMENT            8
-typedef enum
-{
-    ma_dr_wav__metadata_parser_stage_count,
-    ma_dr_wav__metadata_parser_stage_read
-} ma_dr_wav__metadata_parser_stage;
-typedef struct
-{
-    ma_dr_wav_read_proc onRead;
-    ma_dr_wav_seek_proc onSeek;
-    void *pReadSeekUserData;
-    ma_dr_wav__metadata_parser_stage stage;
-    ma_dr_wav_metadata *pMetadata;
-    ma_uint32 metadataCount;
-    ma_uint8 *pData;
-    ma_uint8 *pDataCursor;
-    ma_uint64 metadataCursor;
-    ma_uint64 extraCapacity;
-} ma_dr_wav__metadata_parser;
-MA_PRIVATE size_t ma_dr_wav__metadata_memory_capacity(ma_dr_wav__metadata_parser* pParser)
-{
-    ma_uint64 cap = sizeof(ma_dr_wav_metadata) * (ma_uint64)pParser->metadataCount + pParser->extraCapacity;
-    if (cap > MA_SIZE_MAX) {
-        return 0;
-    }
-    return (size_t)cap;
-}
-MA_PRIVATE ma_uint8* ma_dr_wav__metadata_get_memory(ma_dr_wav__metadata_parser* pParser, size_t size, size_t align)
-{
-    ma_uint8* pResult;
-    if (align) {
-        ma_uintptr modulo = (ma_uintptr)pParser->pDataCursor % align;
-        if (modulo != 0) {
-            pParser->pDataCursor += align - modulo;
-        }
-    }
-    pResult = pParser->pDataCursor;
-    MA_DR_WAV_ASSERT((pResult + size) <= (pParser->pData + ma_dr_wav__metadata_memory_capacity(pParser)));
-    pParser->pDataCursor += size;
-    return pResult;
-}
-MA_PRIVATE void ma_dr_wav__metadata_request_extra_memory_for_stage_2(ma_dr_wav__metadata_parser* pParser, size_t bytes, size_t align)
-{
-    size_t extra = bytes + (align ? (align - 1) : 0);
-    pParser->extraCapacity += extra;
-}
-MA_PRIVATE ma_result ma_dr_wav__metadata_alloc(ma_dr_wav__metadata_parser* pParser, ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pParser->extraCapacity != 0 || pParser->metadataCount != 0) {
-        pAllocationCallbacks->onFree(pParser->pData, pAllocationCallbacks->pUserData);
-        pParser->pData = (ma_uint8*)pAllocationCallbacks->onMalloc(ma_dr_wav__metadata_memory_capacity(pParser), pAllocationCallbacks->pUserData);
-        pParser->pDataCursor = pParser->pData;
-        if (pParser->pData == NULL) {
-            return MA_OUT_OF_MEMORY;
-        }
-        pParser->pMetadata = (ma_dr_wav_metadata*)ma_dr_wav__metadata_get_memory(pParser, sizeof(ma_dr_wav_metadata) * pParser->metadataCount, 1);
-        pParser->metadataCursor = 0;
-    }
-    return MA_SUCCESS;
-}
-MA_PRIVATE size_t ma_dr_wav__metadata_parser_read(ma_dr_wav__metadata_parser* pParser, void* pBufferOut, size_t bytesToRead, ma_uint64* pCursor)
-{
-    if (pCursor != NULL) {
-        return ma_dr_wav__on_read(pParser->onRead, pParser->pReadSeekUserData, pBufferOut, bytesToRead, pCursor);
-    } else {
-        return pParser->onRead(pParser->pReadSeekUserData, pBufferOut, bytesToRead);
-    }
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__read_smpl_to_metadata_obj(ma_dr_wav__metadata_parser* pParser, const ma_dr_wav_chunk_header* pChunkHeader, ma_dr_wav_metadata* pMetadata)
-{
-    ma_uint8 smplHeaderData[MA_DR_WAV_SMPL_BYTES];
-    ma_uint64 totalBytesRead = 0;
-    size_t bytesJustRead;
-    if (pMetadata == NULL) {
-        return 0;
-    }
-    bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, smplHeaderData, sizeof(smplHeaderData), &totalBytesRead);
-    MA_DR_WAV_ASSERT(pParser->stage == ma_dr_wav__metadata_parser_stage_read);
-    MA_DR_WAV_ASSERT(pChunkHeader != NULL);
-    if (pMetadata != NULL && bytesJustRead == sizeof(smplHeaderData)) {
-        ma_uint32 iSampleLoop;
-        pMetadata->type                                     = ma_dr_wav_metadata_type_smpl;
-        pMetadata->data.smpl.manufacturerId                 = ma_dr_wav_bytes_to_u32(smplHeaderData + 0);
-        pMetadata->data.smpl.productId                      = ma_dr_wav_bytes_to_u32(smplHeaderData + 4);
-        pMetadata->data.smpl.samplePeriodNanoseconds        = ma_dr_wav_bytes_to_u32(smplHeaderData + 8);
-        pMetadata->data.smpl.midiUnityNote                  = ma_dr_wav_bytes_to_u32(smplHeaderData + 12);
-        pMetadata->data.smpl.midiPitchFraction              = ma_dr_wav_bytes_to_u32(smplHeaderData + 16);
-        pMetadata->data.smpl.smpteFormat                    = ma_dr_wav_bytes_to_u32(smplHeaderData + 20);
-        pMetadata->data.smpl.smpteOffset                    = ma_dr_wav_bytes_to_u32(smplHeaderData + 24);
-        pMetadata->data.smpl.sampleLoopCount                = ma_dr_wav_bytes_to_u32(smplHeaderData + 28);
-        pMetadata->data.smpl.samplerSpecificDataSizeInBytes = ma_dr_wav_bytes_to_u32(smplHeaderData + 32);
-        if (pMetadata->data.smpl.sampleLoopCount == (pChunkHeader->sizeInBytes - MA_DR_WAV_SMPL_BYTES) / MA_DR_WAV_SMPL_LOOP_BYTES) {
-            pMetadata->data.smpl.pLoops = (ma_dr_wav_smpl_loop*)ma_dr_wav__metadata_get_memory(pParser, sizeof(ma_dr_wav_smpl_loop) * pMetadata->data.smpl.sampleLoopCount, MA_DR_WAV_METADATA_ALIGNMENT);
-            for (iSampleLoop = 0; iSampleLoop < pMetadata->data.smpl.sampleLoopCount; ++iSampleLoop) {
-                ma_uint8 smplLoopData[MA_DR_WAV_SMPL_LOOP_BYTES];
-                bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, smplLoopData, sizeof(smplLoopData), &totalBytesRead);
-                if (bytesJustRead == sizeof(smplLoopData)) {
-                    pMetadata->data.smpl.pLoops[iSampleLoop].cuePointId            = ma_dr_wav_bytes_to_u32(smplLoopData + 0);
-                    pMetadata->data.smpl.pLoops[iSampleLoop].type                  = ma_dr_wav_bytes_to_u32(smplLoopData + 4);
-                    pMetadata->data.smpl.pLoops[iSampleLoop].firstSampleByteOffset = ma_dr_wav_bytes_to_u32(smplLoopData + 8);
-                    pMetadata->data.smpl.pLoops[iSampleLoop].lastSampleByteOffset  = ma_dr_wav_bytes_to_u32(smplLoopData + 12);
-                    pMetadata->data.smpl.pLoops[iSampleLoop].sampleFraction        = ma_dr_wav_bytes_to_u32(smplLoopData + 16);
-                    pMetadata->data.smpl.pLoops[iSampleLoop].playCount             = ma_dr_wav_bytes_to_u32(smplLoopData + 20);
-                } else {
-                    break;
-                }
-            }
-            if (pMetadata->data.smpl.samplerSpecificDataSizeInBytes > 0) {
-                pMetadata->data.smpl.pSamplerSpecificData = ma_dr_wav__metadata_get_memory(pParser, pMetadata->data.smpl.samplerSpecificDataSizeInBytes, 1);
-                MA_DR_WAV_ASSERT(pMetadata->data.smpl.pSamplerSpecificData != NULL);
-                ma_dr_wav__metadata_parser_read(pParser, pMetadata->data.smpl.pSamplerSpecificData, pMetadata->data.smpl.samplerSpecificDataSizeInBytes, &totalBytesRead);
-            }
-        }
-    }
-    return totalBytesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__read_cue_to_metadata_obj(ma_dr_wav__metadata_parser* pParser, const ma_dr_wav_chunk_header* pChunkHeader, ma_dr_wav_metadata* pMetadata)
-{
-    ma_uint8 cueHeaderSectionData[MA_DR_WAV_CUE_BYTES];
-    ma_uint64 totalBytesRead = 0;
-    size_t bytesJustRead;
-    if (pMetadata == NULL) {
-        return 0;
-    }
-    bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, cueHeaderSectionData, sizeof(cueHeaderSectionData), &totalBytesRead);
-    MA_DR_WAV_ASSERT(pParser->stage == ma_dr_wav__metadata_parser_stage_read);
-    if (bytesJustRead == sizeof(cueHeaderSectionData)) {
-        pMetadata->type                   = ma_dr_wav_metadata_type_cue;
-        pMetadata->data.cue.cuePointCount = ma_dr_wav_bytes_to_u32(cueHeaderSectionData);
-        if (pMetadata->data.cue.cuePointCount == (pChunkHeader->sizeInBytes - MA_DR_WAV_CUE_BYTES) / MA_DR_WAV_CUE_POINT_BYTES) {
-            pMetadata->data.cue.pCuePoints    = (ma_dr_wav_cue_point*)ma_dr_wav__metadata_get_memory(pParser, sizeof(ma_dr_wav_cue_point) * pMetadata->data.cue.cuePointCount, MA_DR_WAV_METADATA_ALIGNMENT);
-            MA_DR_WAV_ASSERT(pMetadata->data.cue.pCuePoints != NULL);
-            if (pMetadata->data.cue.cuePointCount > 0) {
-                ma_uint32 iCuePoint;
-                for (iCuePoint = 0; iCuePoint < pMetadata->data.cue.cuePointCount; ++iCuePoint) {
-                    ma_uint8 cuePointData[MA_DR_WAV_CUE_POINT_BYTES];
-                    bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, cuePointData, sizeof(cuePointData), &totalBytesRead);
-                    if (bytesJustRead == sizeof(cuePointData)) {
-                        pMetadata->data.cue.pCuePoints[iCuePoint].id                = ma_dr_wav_bytes_to_u32(cuePointData + 0);
-                        pMetadata->data.cue.pCuePoints[iCuePoint].playOrderPosition = ma_dr_wav_bytes_to_u32(cuePointData + 4);
-                        pMetadata->data.cue.pCuePoints[iCuePoint].dataChunkId[0]    = cuePointData[8];
-                        pMetadata->data.cue.pCuePoints[iCuePoint].dataChunkId[1]    = cuePointData[9];
-                        pMetadata->data.cue.pCuePoints[iCuePoint].dataChunkId[2]    = cuePointData[10];
-                        pMetadata->data.cue.pCuePoints[iCuePoint].dataChunkId[3]    = cuePointData[11];
-                        pMetadata->data.cue.pCuePoints[iCuePoint].chunkStart        = ma_dr_wav_bytes_to_u32(cuePointData + 12);
-                        pMetadata->data.cue.pCuePoints[iCuePoint].blockStart        = ma_dr_wav_bytes_to_u32(cuePointData + 16);
-                        pMetadata->data.cue.pCuePoints[iCuePoint].sampleByteOffset  = ma_dr_wav_bytes_to_u32(cuePointData + 20);
-                    } else {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-    return totalBytesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__read_inst_to_metadata_obj(ma_dr_wav__metadata_parser* pParser, ma_dr_wav_metadata* pMetadata)
-{
-    ma_uint8 instData[MA_DR_WAV_INST_BYTES];
-    ma_uint64 bytesRead;
-    if (pMetadata == NULL) {
-        return 0;
-    }
-    bytesRead = ma_dr_wav__metadata_parser_read(pParser, instData, sizeof(instData), NULL);
-    MA_DR_WAV_ASSERT(pParser->stage == ma_dr_wav__metadata_parser_stage_read);
-    if (bytesRead == sizeof(instData)) {
-        pMetadata->type                    = ma_dr_wav_metadata_type_inst;
-        pMetadata->data.inst.midiUnityNote = (ma_int8)instData[0];
-        pMetadata->data.inst.fineTuneCents = (ma_int8)instData[1];
-        pMetadata->data.inst.gainDecibels  = (ma_int8)instData[2];
-        pMetadata->data.inst.lowNote       = (ma_int8)instData[3];
-        pMetadata->data.inst.highNote      = (ma_int8)instData[4];
-        pMetadata->data.inst.lowVelocity   = (ma_int8)instData[5];
-        pMetadata->data.inst.highVelocity  = (ma_int8)instData[6];
-    }
-    return bytesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__read_acid_to_metadata_obj(ma_dr_wav__metadata_parser* pParser, ma_dr_wav_metadata* pMetadata)
-{
-    ma_uint8 acidData[MA_DR_WAV_ACID_BYTES];
-    ma_uint64 bytesRead;
-    if (pMetadata == NULL) {
-        return 0;
-    }
-    bytesRead = ma_dr_wav__metadata_parser_read(pParser, acidData, sizeof(acidData), NULL);
-    MA_DR_WAV_ASSERT(pParser->stage == ma_dr_wav__metadata_parser_stage_read);
-    if (bytesRead == sizeof(acidData)) {
-        pMetadata->type                       = ma_dr_wav_metadata_type_acid;
-        pMetadata->data.acid.flags            = ma_dr_wav_bytes_to_u32(acidData + 0);
-        pMetadata->data.acid.midiUnityNote    = ma_dr_wav_bytes_to_u16(acidData + 4);
-        pMetadata->data.acid.reserved1        = ma_dr_wav_bytes_to_u16(acidData + 6);
-        pMetadata->data.acid.reserved2        = ma_dr_wav_bytes_to_f32(acidData + 8);
-        pMetadata->data.acid.numBeats         = ma_dr_wav_bytes_to_u32(acidData + 12);
-        pMetadata->data.acid.meterDenominator = ma_dr_wav_bytes_to_u16(acidData + 16);
-        pMetadata->data.acid.meterNumerator   = ma_dr_wav_bytes_to_u16(acidData + 18);
-        pMetadata->data.acid.tempo            = ma_dr_wav_bytes_to_f32(acidData + 20);
-    }
-    return bytesRead;
-}
-MA_PRIVATE size_t ma_dr_wav__strlen(const char* str)
-{
-    size_t result = 0;
-    while (*str++) {
-        result += 1;
-    }
-    return result;
-}
-MA_PRIVATE size_t ma_dr_wav__strlen_clamped(const char* str, size_t maxToRead)
-{
-    size_t result = 0;
-    while (*str++ && result < maxToRead) {
-        result += 1;
-    }
-    return result;
-}
-MA_PRIVATE char* ma_dr_wav__metadata_copy_string(ma_dr_wav__metadata_parser* pParser, const char* str, size_t maxToRead)
-{
-    size_t len = ma_dr_wav__strlen_clamped(str, maxToRead);
-    if (len) {
-        char* result = (char*)ma_dr_wav__metadata_get_memory(pParser, len + 1, 1);
-        MA_DR_WAV_ASSERT(result != NULL);
-        MA_DR_WAV_COPY_MEMORY(result, str, len);
-        result[len] = '\0';
-        return result;
-    } else {
-        return NULL;
-    }
-}
-typedef struct
-{
-    const void* pBuffer;
-    size_t sizeInBytes;
-    size_t cursor;
-} ma_dr_wav_buffer_reader;
-MA_PRIVATE ma_result ma_dr_wav_buffer_reader_init(const void* pBuffer, size_t sizeInBytes, ma_dr_wav_buffer_reader* pReader)
-{
-    MA_DR_WAV_ASSERT(pBuffer != NULL);
-    MA_DR_WAV_ASSERT(pReader != NULL);
-    MA_DR_WAV_ZERO_OBJECT(pReader);
-    pReader->pBuffer     = pBuffer;
-    pReader->sizeInBytes = sizeInBytes;
-    pReader->cursor      = 0;
-    return MA_SUCCESS;
-}
-MA_PRIVATE const void* ma_dr_wav_buffer_reader_ptr(const ma_dr_wav_buffer_reader* pReader)
-{
-    MA_DR_WAV_ASSERT(pReader != NULL);
-    return ma_dr_wav_offset_ptr(pReader->pBuffer, pReader->cursor);
-}
-MA_PRIVATE ma_result ma_dr_wav_buffer_reader_seek(ma_dr_wav_buffer_reader* pReader, size_t bytesToSeek)
-{
-    MA_DR_WAV_ASSERT(pReader != NULL);
-    if (pReader->cursor + bytesToSeek > pReader->sizeInBytes) {
-        return MA_BAD_SEEK;
-    }
-    pReader->cursor += bytesToSeek;
-    return MA_SUCCESS;
-}
-MA_PRIVATE ma_result ma_dr_wav_buffer_reader_read(ma_dr_wav_buffer_reader* pReader, void* pDst, size_t bytesToRead, size_t* pBytesRead)
-{
-    ma_result result = MA_SUCCESS;
-    size_t bytesRemaining;
-    MA_DR_WAV_ASSERT(pReader != NULL);
-    if (pBytesRead != NULL) {
-        *pBytesRead = 0;
-    }
-    bytesRemaining = (pReader->sizeInBytes - pReader->cursor);
-    if (bytesToRead > bytesRemaining) {
-        bytesToRead = bytesRemaining;
-    }
-    if (pDst == NULL) {
-        result = ma_dr_wav_buffer_reader_seek(pReader, bytesToRead);
-    } else {
-        MA_DR_WAV_COPY_MEMORY(pDst, ma_dr_wav_buffer_reader_ptr(pReader), bytesToRead);
-        pReader->cursor += bytesToRead;
-    }
-    MA_DR_WAV_ASSERT(pReader->cursor <= pReader->sizeInBytes);
-    if (result == MA_SUCCESS) {
-        if (pBytesRead != NULL) {
-            *pBytesRead = bytesToRead;
-        }
-    }
-    return MA_SUCCESS;
-}
-MA_PRIVATE ma_result ma_dr_wav_buffer_reader_read_u16(ma_dr_wav_buffer_reader* pReader, ma_uint16* pDst)
-{
-    ma_result result;
-    size_t bytesRead;
-    ma_uint8 data[2];
-    MA_DR_WAV_ASSERT(pReader != NULL);
-    MA_DR_WAV_ASSERT(pDst != NULL);
-    *pDst = 0;
-    result = ma_dr_wav_buffer_reader_read(pReader, data, sizeof(*pDst), &bytesRead);
-    if (result != MA_SUCCESS || bytesRead != sizeof(*pDst)) {
-        return result;
-    }
-    *pDst = ma_dr_wav_bytes_to_u16(data);
-    return MA_SUCCESS;
-}
-MA_PRIVATE ma_result ma_dr_wav_buffer_reader_read_u32(ma_dr_wav_buffer_reader* pReader, ma_uint32* pDst)
-{
-    ma_result result;
-    size_t bytesRead;
-    ma_uint8 data[4];
-    MA_DR_WAV_ASSERT(pReader != NULL);
-    MA_DR_WAV_ASSERT(pDst != NULL);
-    *pDst = 0;
-    result = ma_dr_wav_buffer_reader_read(pReader, data, sizeof(*pDst), &bytesRead);
-    if (result != MA_SUCCESS || bytesRead != sizeof(*pDst)) {
-        return result;
-    }
-    *pDst = ma_dr_wav_bytes_to_u32(data);
-    return MA_SUCCESS;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__read_bext_to_metadata_obj(ma_dr_wav__metadata_parser* pParser, ma_dr_wav_metadata* pMetadata, ma_uint64 chunkSize)
-{
-    ma_uint8 bextData[MA_DR_WAV_BEXT_BYTES];
-    size_t bytesRead = ma_dr_wav__metadata_parser_read(pParser, bextData, sizeof(bextData), NULL);
-    MA_DR_WAV_ASSERT(pParser->stage == ma_dr_wav__metadata_parser_stage_read);
-    if (bytesRead == sizeof(bextData)) {
-        ma_dr_wav_buffer_reader reader;
-        ma_uint32 timeReferenceLow;
-        ma_uint32 timeReferenceHigh;
-        size_t extraBytes;
-        pMetadata->type = ma_dr_wav_metadata_type_bext;
-        if (ma_dr_wav_buffer_reader_init(bextData, bytesRead, &reader) == MA_SUCCESS) {
-            pMetadata->data.bext.pDescription = ma_dr_wav__metadata_copy_string(pParser, (const char*)ma_dr_wav_buffer_reader_ptr(&reader), MA_DR_WAV_BEXT_DESCRIPTION_BYTES);
-            ma_dr_wav_buffer_reader_seek(&reader, MA_DR_WAV_BEXT_DESCRIPTION_BYTES);
-            pMetadata->data.bext.pOriginatorName = ma_dr_wav__metadata_copy_string(pParser, (const char*)ma_dr_wav_buffer_reader_ptr(&reader), MA_DR_WAV_BEXT_ORIGINATOR_NAME_BYTES);
-            ma_dr_wav_buffer_reader_seek(&reader, MA_DR_WAV_BEXT_ORIGINATOR_NAME_BYTES);
-            pMetadata->data.bext.pOriginatorReference = ma_dr_wav__metadata_copy_string(pParser, (const char*)ma_dr_wav_buffer_reader_ptr(&reader), MA_DR_WAV_BEXT_ORIGINATOR_REF_BYTES);
-            ma_dr_wav_buffer_reader_seek(&reader, MA_DR_WAV_BEXT_ORIGINATOR_REF_BYTES);
-            ma_dr_wav_buffer_reader_read(&reader, pMetadata->data.bext.pOriginationDate, sizeof(pMetadata->data.bext.pOriginationDate), NULL);
-            ma_dr_wav_buffer_reader_read(&reader, pMetadata->data.bext.pOriginationTime, sizeof(pMetadata->data.bext.pOriginationTime), NULL);
-            ma_dr_wav_buffer_reader_read_u32(&reader, &timeReferenceLow);
-            ma_dr_wav_buffer_reader_read_u32(&reader, &timeReferenceHigh);
-            pMetadata->data.bext.timeReference = ((ma_uint64)timeReferenceHigh << 32) + timeReferenceLow;
-            ma_dr_wav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.version);
-            pMetadata->data.bext.pUMID = ma_dr_wav__metadata_get_memory(pParser, MA_DR_WAV_BEXT_UMID_BYTES, 1);
-            ma_dr_wav_buffer_reader_read(&reader, pMetadata->data.bext.pUMID, MA_DR_WAV_BEXT_UMID_BYTES, NULL);
-            ma_dr_wav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.loudnessValue);
-            ma_dr_wav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.loudnessRange);
-            ma_dr_wav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.maxTruePeakLevel);
-            ma_dr_wav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.maxMomentaryLoudness);
-            ma_dr_wav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.maxShortTermLoudness);
-            MA_DR_WAV_ASSERT((ma_dr_wav_offset_ptr(ma_dr_wav_buffer_reader_ptr(&reader), MA_DR_WAV_BEXT_RESERVED_BYTES)) == (bextData + MA_DR_WAV_BEXT_BYTES));
-            extraBytes = (size_t)(chunkSize - MA_DR_WAV_BEXT_BYTES);
-            if (extraBytes > 0) {
-                pMetadata->data.bext.pCodingHistory = (char*)ma_dr_wav__metadata_get_memory(pParser, extraBytes + 1, 1);
-                MA_DR_WAV_ASSERT(pMetadata->data.bext.pCodingHistory != NULL);
-                bytesRead += ma_dr_wav__metadata_parser_read(pParser, pMetadata->data.bext.pCodingHistory, extraBytes, NULL);
-                pMetadata->data.bext.codingHistorySize = (ma_uint32)ma_dr_wav__strlen(pMetadata->data.bext.pCodingHistory);
-            } else {
-                pMetadata->data.bext.pCodingHistory    = NULL;
-                pMetadata->data.bext.codingHistorySize = 0;
-            }
-        }
-    }
-    return bytesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__read_list_label_or_note_to_metadata_obj(ma_dr_wav__metadata_parser* pParser, ma_dr_wav_metadata* pMetadata, ma_uint64 chunkSize, ma_dr_wav_metadata_type type)
-{
-    ma_uint8 cueIDBuffer[MA_DR_WAV_LIST_LABEL_OR_NOTE_BYTES];
-    ma_uint64 totalBytesRead = 0;
-    size_t bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, cueIDBuffer, sizeof(cueIDBuffer), &totalBytesRead);
-    MA_DR_WAV_ASSERT(pParser->stage == ma_dr_wav__metadata_parser_stage_read);
-    if (bytesJustRead == sizeof(cueIDBuffer)) {
-        ma_uint32 sizeIncludingNullTerminator;
-        pMetadata->type = type;
-        pMetadata->data.labelOrNote.cuePointId = ma_dr_wav_bytes_to_u32(cueIDBuffer);
-        sizeIncludingNullTerminator = (ma_uint32)chunkSize - MA_DR_WAV_LIST_LABEL_OR_NOTE_BYTES;
-        if (sizeIncludingNullTerminator > 0) {
-            pMetadata->data.labelOrNote.stringLength = sizeIncludingNullTerminator - 1;
-            pMetadata->data.labelOrNote.pString      = (char*)ma_dr_wav__metadata_get_memory(pParser, sizeIncludingNullTerminator, 1);
-            MA_DR_WAV_ASSERT(pMetadata->data.labelOrNote.pString != NULL);
-            ma_dr_wav__metadata_parser_read(pParser, pMetadata->data.labelOrNote.pString, sizeIncludingNullTerminator, &totalBytesRead);
-        } else {
-            pMetadata->data.labelOrNote.stringLength = 0;
-            pMetadata->data.labelOrNote.pString      = NULL;
-        }
-    }
-    return totalBytesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__read_list_labelled_cue_region_to_metadata_obj(ma_dr_wav__metadata_parser* pParser, ma_dr_wav_metadata* pMetadata, ma_uint64 chunkSize)
-{
-    ma_uint8 buffer[MA_DR_WAV_LIST_LABELLED_TEXT_BYTES];
-    ma_uint64 totalBytesRead = 0;
-    size_t bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, buffer, sizeof(buffer), &totalBytesRead);
-    MA_DR_WAV_ASSERT(pParser->stage == ma_dr_wav__metadata_parser_stage_read);
-    if (bytesJustRead == sizeof(buffer)) {
-        ma_uint32 sizeIncludingNullTerminator;
-        pMetadata->type                                = ma_dr_wav_metadata_type_list_labelled_cue_region;
-        pMetadata->data.labelledCueRegion.cuePointId   = ma_dr_wav_bytes_to_u32(buffer + 0);
-        pMetadata->data.labelledCueRegion.sampleLength = ma_dr_wav_bytes_to_u32(buffer + 4);
-        pMetadata->data.labelledCueRegion.purposeId[0] = buffer[8];
-        pMetadata->data.labelledCueRegion.purposeId[1] = buffer[9];
-        pMetadata->data.labelledCueRegion.purposeId[2] = buffer[10];
-        pMetadata->data.labelledCueRegion.purposeId[3] = buffer[11];
-        pMetadata->data.labelledCueRegion.country      = ma_dr_wav_bytes_to_u16(buffer + 12);
-        pMetadata->data.labelledCueRegion.language     = ma_dr_wav_bytes_to_u16(buffer + 14);
-        pMetadata->data.labelledCueRegion.dialect      = ma_dr_wav_bytes_to_u16(buffer + 16);
-        pMetadata->data.labelledCueRegion.codePage     = ma_dr_wav_bytes_to_u16(buffer + 18);
-        sizeIncludingNullTerminator = (ma_uint32)chunkSize - MA_DR_WAV_LIST_LABELLED_TEXT_BYTES;
-        if (sizeIncludingNullTerminator > 0) {
-            pMetadata->data.labelledCueRegion.stringLength = sizeIncludingNullTerminator - 1;
-            pMetadata->data.labelledCueRegion.pString      = (char*)ma_dr_wav__metadata_get_memory(pParser, sizeIncludingNullTerminator, 1);
-            MA_DR_WAV_ASSERT(pMetadata->data.labelledCueRegion.pString != NULL);
-            ma_dr_wav__metadata_parser_read(pParser, pMetadata->data.labelledCueRegion.pString, sizeIncludingNullTerminator, &totalBytesRead);
-        } else {
-            pMetadata->data.labelledCueRegion.stringLength = 0;
-            pMetadata->data.labelledCueRegion.pString      = NULL;
-        }
-    }
-    return totalBytesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__metadata_process_info_text_chunk(ma_dr_wav__metadata_parser* pParser, ma_uint64 chunkSize, ma_dr_wav_metadata_type type)
-{
-    ma_uint64 bytesRead = 0;
-    ma_uint32 stringSizeWithNullTerminator = (ma_uint32)chunkSize;
-    if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-        pParser->metadataCount += 1;
-        ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, stringSizeWithNullTerminator, 1);
-    } else {
-        ma_dr_wav_metadata* pMetadata = &pParser->pMetadata[pParser->metadataCursor];
-        pMetadata->type = type;
-        if (stringSizeWithNullTerminator > 0) {
-            pMetadata->data.infoText.stringLength = stringSizeWithNullTerminator - 1;
-            pMetadata->data.infoText.pString = (char*)ma_dr_wav__metadata_get_memory(pParser, stringSizeWithNullTerminator, 1);
-            MA_DR_WAV_ASSERT(pMetadata->data.infoText.pString != NULL);
-            bytesRead = ma_dr_wav__metadata_parser_read(pParser, pMetadata->data.infoText.pString, (size_t)stringSizeWithNullTerminator, NULL);
-            if (bytesRead == chunkSize) {
-                pParser->metadataCursor += 1;
-            } else {
-            }
-        } else {
-            pMetadata->data.infoText.stringLength = 0;
-            pMetadata->data.infoText.pString      = NULL;
-            pParser->metadataCursor += 1;
-        }
-    }
-    return bytesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__metadata_process_unknown_chunk(ma_dr_wav__metadata_parser* pParser, const ma_uint8* pChunkId, ma_uint64 chunkSize, ma_dr_wav_metadata_location location)
-{
-    ma_uint64 bytesRead = 0;
-    if (location == ma_dr_wav_metadata_location_invalid) {
-        return 0;
-    }
-    if (ma_dr_wav_fourcc_equal(pChunkId, "data") || ma_dr_wav_fourcc_equal(pChunkId, "fmt ") || ma_dr_wav_fourcc_equal(pChunkId, "fact")) {
-        return 0;
-    }
-    if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-        pParser->metadataCount += 1;
-        ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, (size_t)chunkSize, 1);
-    } else {
-        ma_dr_wav_metadata* pMetadata = &pParser->pMetadata[pParser->metadataCursor];
-        pMetadata->type                         = ma_dr_wav_metadata_type_unknown;
-        pMetadata->data.unknown.chunkLocation   = location;
-        pMetadata->data.unknown.id[0]           = pChunkId[0];
-        pMetadata->data.unknown.id[1]           = pChunkId[1];
-        pMetadata->data.unknown.id[2]           = pChunkId[2];
-        pMetadata->data.unknown.id[3]           = pChunkId[3];
-        pMetadata->data.unknown.dataSizeInBytes = (ma_uint32)chunkSize;
-        pMetadata->data.unknown.pData           = (ma_uint8 *)ma_dr_wav__metadata_get_memory(pParser, (size_t)chunkSize, 1);
-        MA_DR_WAV_ASSERT(pMetadata->data.unknown.pData != NULL);
-        bytesRead = ma_dr_wav__metadata_parser_read(pParser, pMetadata->data.unknown.pData, pMetadata->data.unknown.dataSizeInBytes, NULL);
-        if (bytesRead == pMetadata->data.unknown.dataSizeInBytes) {
-            pParser->metadataCursor += 1;
-        } else {
-        }
-    }
-    return bytesRead;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav__chunk_matches(ma_dr_wav_metadata_type allowedMetadataTypes, const ma_uint8* pChunkID, ma_dr_wav_metadata_type type, const char* pID)
-{
-    return (allowedMetadataTypes & type) && ma_dr_wav_fourcc_equal(pChunkID, pID);
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__metadata_process_chunk(ma_dr_wav__metadata_parser* pParser, const ma_dr_wav_chunk_header* pChunkHeader, ma_dr_wav_metadata_type allowedMetadataTypes)
-{
-    const ma_uint8 *pChunkID = pChunkHeader->id.fourcc;
-    ma_uint64 bytesRead = 0;
-    if (ma_dr_wav__chunk_matches(allowedMetadataTypes, pChunkID, ma_dr_wav_metadata_type_smpl, "smpl")) {
-        if (pChunkHeader->sizeInBytes >= MA_DR_WAV_SMPL_BYTES) {
-            if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-                ma_uint8 buffer[4];
-                size_t bytesJustRead;
-                if (!pParser->onSeek(pParser->pReadSeekUserData, 28, ma_dr_wav_seek_origin_current)) {
-                    return bytesRead;
-                }
-                bytesRead += 28;
-                bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, buffer, sizeof(buffer), &bytesRead);
-                if (bytesJustRead == sizeof(buffer)) {
-                    ma_uint32 loopCount = ma_dr_wav_bytes_to_u32(buffer);
-                    ma_uint64 calculatedLoopCount;
-                    calculatedLoopCount = (pChunkHeader->sizeInBytes - MA_DR_WAV_SMPL_BYTES) / MA_DR_WAV_SMPL_LOOP_BYTES;
-                    if (calculatedLoopCount == loopCount) {
-                        bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, buffer, sizeof(buffer), &bytesRead);
-                        if (bytesJustRead == sizeof(buffer)) {
-                            ma_uint32 samplerSpecificDataSizeInBytes = ma_dr_wav_bytes_to_u32(buffer);
-                            pParser->metadataCount += 1;
-                            ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, sizeof(ma_dr_wav_smpl_loop) * loopCount, MA_DR_WAV_METADATA_ALIGNMENT);
-                            ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, samplerSpecificDataSizeInBytes, 1);
-                        }
-                    } else {
-                    }
-                }
-            } else {
-                bytesRead = ma_dr_wav__read_smpl_to_metadata_obj(pParser, pChunkHeader, &pParser->pMetadata[pParser->metadataCursor]);
-                if (bytesRead == pChunkHeader->sizeInBytes) {
-                    pParser->metadataCursor += 1;
-                } else {
-                }
-            }
-        } else {
-        }
-    } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, pChunkID, ma_dr_wav_metadata_type_inst, "inst")) {
-        if (pChunkHeader->sizeInBytes == MA_DR_WAV_INST_BYTES) {
-            if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-                pParser->metadataCount += 1;
-            } else {
-                bytesRead = ma_dr_wav__read_inst_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor]);
-                if (bytesRead == pChunkHeader->sizeInBytes) {
-                    pParser->metadataCursor += 1;
-                } else {
-                }
-            }
-        } else {
-        }
-    } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, pChunkID, ma_dr_wav_metadata_type_acid, "acid")) {
-        if (pChunkHeader->sizeInBytes == MA_DR_WAV_ACID_BYTES) {
-            if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-                pParser->metadataCount += 1;
-            } else {
-                bytesRead = ma_dr_wav__read_acid_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor]);
-                if (bytesRead == pChunkHeader->sizeInBytes) {
-                    pParser->metadataCursor += 1;
-                } else {
-                }
-            }
-        } else {
-        }
-    } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, pChunkID, ma_dr_wav_metadata_type_cue, "cue ")) {
-        if (pChunkHeader->sizeInBytes >= MA_DR_WAV_CUE_BYTES) {
-            if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-                size_t cueCount;
-                pParser->metadataCount += 1;
-                cueCount = (size_t)(pChunkHeader->sizeInBytes - MA_DR_WAV_CUE_BYTES) / MA_DR_WAV_CUE_POINT_BYTES;
-                ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, sizeof(ma_dr_wav_cue_point) * cueCount, MA_DR_WAV_METADATA_ALIGNMENT);
-            } else {
-                bytesRead = ma_dr_wav__read_cue_to_metadata_obj(pParser, pChunkHeader, &pParser->pMetadata[pParser->metadataCursor]);
-                if (bytesRead == pChunkHeader->sizeInBytes) {
-                    pParser->metadataCursor += 1;
-                } else {
-                }
-            }
-        } else {
-        }
-    } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, pChunkID, ma_dr_wav_metadata_type_bext, "bext")) {
-        if (pChunkHeader->sizeInBytes >= MA_DR_WAV_BEXT_BYTES) {
-            if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-                char buffer[MA_DR_WAV_BEXT_DESCRIPTION_BYTES + 1];
-                size_t allocSizeNeeded = MA_DR_WAV_BEXT_UMID_BYTES;
-                size_t bytesJustRead;
-                buffer[MA_DR_WAV_BEXT_DESCRIPTION_BYTES] = '\0';
-                bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, buffer, MA_DR_WAV_BEXT_DESCRIPTION_BYTES, &bytesRead);
-                if (bytesJustRead != MA_DR_WAV_BEXT_DESCRIPTION_BYTES) {
-                    return bytesRead;
-                }
-                allocSizeNeeded += ma_dr_wav__strlen(buffer) + 1;
-                buffer[MA_DR_WAV_BEXT_ORIGINATOR_NAME_BYTES] = '\0';
-                bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, buffer, MA_DR_WAV_BEXT_ORIGINATOR_NAME_BYTES, &bytesRead);
-                if (bytesJustRead != MA_DR_WAV_BEXT_ORIGINATOR_NAME_BYTES) {
-                    return bytesRead;
-                }
-                allocSizeNeeded += ma_dr_wav__strlen(buffer) + 1;
-                buffer[MA_DR_WAV_BEXT_ORIGINATOR_REF_BYTES] = '\0';
-                bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, buffer, MA_DR_WAV_BEXT_ORIGINATOR_REF_BYTES, &bytesRead);
-                if (bytesJustRead != MA_DR_WAV_BEXT_ORIGINATOR_REF_BYTES) {
-                    return bytesRead;
-                }
-                allocSizeNeeded += ma_dr_wav__strlen(buffer) + 1;
-                allocSizeNeeded += (size_t)pChunkHeader->sizeInBytes - MA_DR_WAV_BEXT_BYTES;
-                ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, allocSizeNeeded, 1);
-                pParser->metadataCount += 1;
-            } else {
-                bytesRead = ma_dr_wav__read_bext_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor], pChunkHeader->sizeInBytes);
-                if (bytesRead == pChunkHeader->sizeInBytes) {
-                    pParser->metadataCursor += 1;
-                } else {
-                }
-            }
-        } else {
-        }
-    } else if (ma_dr_wav_fourcc_equal(pChunkID, "LIST") || ma_dr_wav_fourcc_equal(pChunkID, "list")) {
-        ma_dr_wav_metadata_location listType = ma_dr_wav_metadata_location_invalid;
-        while (bytesRead < pChunkHeader->sizeInBytes) {
-            ma_uint8 subchunkId[4];
-            ma_uint8 subchunkSizeBuffer[4];
-            ma_uint64 subchunkDataSize;
-            ma_uint64 subchunkBytesRead = 0;
-            ma_uint64 bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, subchunkId, sizeof(subchunkId), &bytesRead);
-            if (bytesJustRead != sizeof(subchunkId)) {
-                break;
-            }
-            if (ma_dr_wav_fourcc_equal(subchunkId, "adtl")) {
-                listType = ma_dr_wav_metadata_location_inside_adtl_list;
-                continue;
-            } else if (ma_dr_wav_fourcc_equal(subchunkId, "INFO")) {
-                listType = ma_dr_wav_metadata_location_inside_info_list;
-                continue;
-            }
-            bytesJustRead = ma_dr_wav__metadata_parser_read(pParser, subchunkSizeBuffer, sizeof(subchunkSizeBuffer), &bytesRead);
-            if (bytesJustRead != sizeof(subchunkSizeBuffer)) {
-                break;
-            }
-            subchunkDataSize = ma_dr_wav_bytes_to_u32(subchunkSizeBuffer);
-            if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_label, "labl") || ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_note, "note")) {
-                if (subchunkDataSize >= MA_DR_WAV_LIST_LABEL_OR_NOTE_BYTES) {
-                    ma_uint64 stringSizeWithNullTerm = subchunkDataSize - MA_DR_WAV_LIST_LABEL_OR_NOTE_BYTES;
-                    if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-                        pParser->metadataCount += 1;
-                        ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, (size_t)stringSizeWithNullTerm, 1);
-                    } else {
-                        subchunkBytesRead = ma_dr_wav__read_list_label_or_note_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor], subchunkDataSize, ma_dr_wav_fourcc_equal(subchunkId, "labl") ? ma_dr_wav_metadata_type_list_label : ma_dr_wav_metadata_type_list_note);
-                        if (subchunkBytesRead == subchunkDataSize) {
-                            pParser->metadataCursor += 1;
-                        } else {
-                        }
-                    }
-                } else {
-                }
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_labelled_cue_region, "ltxt")) {
-                if (subchunkDataSize >= MA_DR_WAV_LIST_LABELLED_TEXT_BYTES) {
-                    ma_uint64 stringSizeWithNullTerminator = subchunkDataSize - MA_DR_WAV_LIST_LABELLED_TEXT_BYTES;
-                    if (pParser->stage == ma_dr_wav__metadata_parser_stage_count) {
-                        pParser->metadataCount += 1;
-                        ma_dr_wav__metadata_request_extra_memory_for_stage_2(pParser, (size_t)stringSizeWithNullTerminator, 1);
-                    } else {
-                        subchunkBytesRead = ma_dr_wav__read_list_labelled_cue_region_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor], subchunkDataSize);
-                        if (subchunkBytesRead == subchunkDataSize) {
-                            pParser->metadataCursor += 1;
-                        } else {
-                        }
-                    }
-                } else {
-                }
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_software, "ISFT")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_software);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_copyright, "ICOP")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_copyright);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_title, "INAM")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_title);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_artist, "IART")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_artist);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_comment, "ICMT")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_comment);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_date, "ICRD")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_date);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_genre, "IGNR")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_genre);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_album, "IPRD")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_album);
-            } else if (ma_dr_wav__chunk_matches(allowedMetadataTypes, subchunkId, ma_dr_wav_metadata_type_list_info_tracknumber, "ITRK")) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  ma_dr_wav_metadata_type_list_info_tracknumber);
-            } else if ((allowedMetadataTypes & ma_dr_wav_metadata_type_unknown) != 0) {
-                subchunkBytesRead = ma_dr_wav__metadata_process_unknown_chunk(pParser, subchunkId, subchunkDataSize, listType);
-            }
-            bytesRead += subchunkBytesRead;
-            MA_DR_WAV_ASSERT(subchunkBytesRead <= subchunkDataSize);
-            if (subchunkBytesRead < subchunkDataSize) {
-                ma_uint64 bytesToSeek = subchunkDataSize - subchunkBytesRead;
-                if (!pParser->onSeek(pParser->pReadSeekUserData, (int)bytesToSeek, ma_dr_wav_seek_origin_current)) {
-                    break;
-                }
-                bytesRead += bytesToSeek;
-            }
-            if ((subchunkDataSize % 2) == 1) {
-                if (!pParser->onSeek(pParser->pReadSeekUserData, 1, ma_dr_wav_seek_origin_current)) {
-                    break;
-                }
-                bytesRead += 1;
-            }
-        }
-    } else if ((allowedMetadataTypes & ma_dr_wav_metadata_type_unknown) != 0) {
-        bytesRead = ma_dr_wav__metadata_process_unknown_chunk(pParser, pChunkID, pChunkHeader->sizeInBytes, ma_dr_wav_metadata_location_top_level);
-    }
-    return bytesRead;
-}
-MA_PRIVATE ma_uint32 ma_dr_wav_get_bytes_per_pcm_frame(ma_dr_wav* pWav)
-{
-    ma_uint32 bytesPerFrame;
-    if ((pWav->bitsPerSample & 0x7) == 0) {
-        bytesPerFrame = (pWav->bitsPerSample * pWav->fmt.channels) >> 3;
-    } else {
-        bytesPerFrame = pWav->fmt.blockAlign;
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ALAW || pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_MULAW) {
-        if (bytesPerFrame != pWav->fmt.channels) {
-            return 0;
-        }
-    }
-    return bytesPerFrame;
-}
-MA_API ma_uint16 ma_dr_wav_fmt_get_format(const ma_dr_wav_fmt* pFMT)
-{
-    if (pFMT == NULL) {
-        return 0;
-    }
-    if (pFMT->formatTag != MA_DR_WAVE_FORMAT_EXTENSIBLE) {
-        return pFMT->formatTag;
-    } else {
-        return ma_dr_wav_bytes_to_u16(pFMT->subFormat);
-    }
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_preinit(ma_dr_wav* pWav, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pReadSeekUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pWav == NULL || onRead == NULL || onSeek == NULL) {
-        return MA_FALSE;
-    }
-    MA_DR_WAV_ZERO_MEMORY(pWav, sizeof(*pWav));
-    pWav->onRead    = onRead;
-    pWav->onSeek    = onSeek;
-    pWav->pUserData = pReadSeekUserData;
-    pWav->allocationCallbacks = ma_dr_wav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks);
-    if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) {
-        return MA_FALSE;
-    }
-    return MA_TRUE;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_init__internal(ma_dr_wav* pWav, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags)
-{
-    ma_result result;
-    ma_uint64 cursor;
-    ma_bool32 sequential;
-    ma_uint8 riff[4];
-    ma_dr_wav_fmt fmt;
-    unsigned short translatedFormatTag;
-    ma_uint64 dataChunkSize = 0;
-    ma_uint64 sampleCountFromFactChunk = 0;
-    ma_uint64 metadataStartPos;
-    ma_dr_wav__metadata_parser metadataParser;
-    ma_bool8 isProcessingMetadata = MA_FALSE;
-    ma_bool8 foundChunk_fmt  = MA_FALSE;
-    ma_bool8 foundChunk_data = MA_FALSE;
-    ma_bool8 isAIFCFormType = MA_FALSE;
-    ma_uint64 aiffFrameCount = 0;
-    cursor = 0;
-    sequential = (flags & MA_DR_WAV_SEQUENTIAL) != 0;
-    MA_DR_WAV_ZERO_OBJECT(&fmt);
-    if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, riff, sizeof(riff), &cursor) != sizeof(riff)) {
-        return MA_FALSE;
-    }
-    if (ma_dr_wav_fourcc_equal(riff, "RIFF")) {
-        pWav->container = ma_dr_wav_container_riff;
-    } else if (ma_dr_wav_fourcc_equal(riff, "RIFX")) {
-        pWav->container = ma_dr_wav_container_rifx;
-    } else if (ma_dr_wav_fourcc_equal(riff, "riff")) {
-        int i;
-        ma_uint8 riff2[12];
-        pWav->container = ma_dr_wav_container_w64;
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, riff2, sizeof(riff2), &cursor) != sizeof(riff2)) {
-            return MA_FALSE;
-        }
-        for (i = 0; i < 12; ++i) {
-            if (riff2[i] != ma_dr_wavGUID_W64_RIFF[i+4]) {
-                return MA_FALSE;
-            }
-        }
-    } else if (ma_dr_wav_fourcc_equal(riff, "RF64")) {
-        pWav->container = ma_dr_wav_container_rf64;
-    } else if (ma_dr_wav_fourcc_equal(riff, "FORM")) {
-        pWav->container = ma_dr_wav_container_aiff;
-    } else {
-        return MA_FALSE;
-    }
-    if (pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rifx || pWav->container == ma_dr_wav_container_rf64) {
-        ma_uint8 chunkSizeBytes[4];
-        ma_uint8 wave[4];
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
-            return MA_FALSE;
-        }
-        if (pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rifx) {
-            if (ma_dr_wav_bytes_to_u32_ex(chunkSizeBytes, pWav->container) < 36) {
-                return MA_FALSE;
-            }
-        } else if (pWav->container == ma_dr_wav_container_rf64) {
-            if (ma_dr_wav_bytes_to_u32_le(chunkSizeBytes) != 0xFFFFFFFF) {
-                return MA_FALSE;
-            }
-        } else {
-            return MA_FALSE;
-        }
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_wav_fourcc_equal(wave, "WAVE")) {
-            return MA_FALSE;
-        }
-    } else if (pWav->container == ma_dr_wav_container_w64) {
-        ma_uint8 chunkSizeBytes[8];
-        ma_uint8 wave[16];
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
-            return MA_FALSE;
-        }
-        if (ma_dr_wav_bytes_to_u64(chunkSizeBytes) < 80) {
-            return MA_FALSE;
-        }
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_wav_guid_equal(wave, ma_dr_wavGUID_W64_WAVE)) {
-            return MA_FALSE;
-        }
-    } else if (pWav->container == ma_dr_wav_container_aiff) {
-        ma_uint8 chunkSizeBytes[4];
-        ma_uint8 aiff[4];
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
-            return MA_FALSE;
-        }
-        if (ma_dr_wav_bytes_to_u32_be(chunkSizeBytes) < 18) {
-            return MA_FALSE;
-        }
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, aiff, sizeof(aiff), &cursor) != sizeof(aiff)) {
-            return MA_FALSE;
-        }
-        if (ma_dr_wav_fourcc_equal(aiff, "AIFF")) {
-            isAIFCFormType = MA_FALSE;
-        } else if (ma_dr_wav_fourcc_equal(aiff, "AIFC")) {
-            isAIFCFormType = MA_TRUE;
-        } else {
-            return MA_FALSE;
-        }
-    } else {
-        return MA_FALSE;
-    }
-    if (pWav->container == ma_dr_wav_container_rf64) {
-        ma_uint8 sizeBytes[8];
-        ma_uint64 bytesRemainingInChunk;
-        ma_dr_wav_chunk_header header;
-        result = ma_dr_wav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
-        if (result != MA_SUCCESS) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_wav_fourcc_equal(header.id.fourcc, "ds64")) {
-            return MA_FALSE;
-        }
-        bytesRemainingInChunk = header.sizeInBytes + header.paddingSize;
-        if (!ma_dr_wav__seek_forward(pWav->onSeek, 8, pWav->pUserData)) {
-            return MA_FALSE;
-        }
-        bytesRemainingInChunk -= 8;
-        cursor += 8;
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
-            return MA_FALSE;
-        }
-        bytesRemainingInChunk -= 8;
-        dataChunkSize = ma_dr_wav_bytes_to_u64(sizeBytes);
-        if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
-            return MA_FALSE;
-        }
-        bytesRemainingInChunk -= 8;
-        sampleCountFromFactChunk = ma_dr_wav_bytes_to_u64(sizeBytes);
-        if (!ma_dr_wav__seek_forward(pWav->onSeek, bytesRemainingInChunk, pWav->pUserData)) {
-            return MA_FALSE;
-        }
-        cursor += bytesRemainingInChunk;
-    }
-    metadataStartPos = cursor;
-    isProcessingMetadata = !sequential && ((flags & MA_DR_WAV_WITH_METADATA) != 0);
-    if (pWav->container != ma_dr_wav_container_riff && pWav->container != ma_dr_wav_container_rf64) {
-        isProcessingMetadata = MA_FALSE;
-    }
-    MA_DR_WAV_ZERO_MEMORY(&metadataParser, sizeof(metadataParser));
-    if (isProcessingMetadata) {
-        metadataParser.onRead = pWav->onRead;
-        metadataParser.onSeek = pWav->onSeek;
-        metadataParser.pReadSeekUserData = pWav->pUserData;
-        metadataParser.stage  = ma_dr_wav__metadata_parser_stage_count;
-    }
-    for (;;) {
-        ma_dr_wav_chunk_header header;
-        ma_uint64 chunkSize;
-        result = ma_dr_wav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
-        if (result != MA_SUCCESS) {
-            break;
-        }
-        chunkSize = header.sizeInBytes;
-        if (!sequential && onChunk != NULL) {
-            ma_uint64 callbackBytesRead = onChunk(pChunkUserData, pWav->onRead, pWav->onSeek, pWav->pUserData, &header, pWav->container, &fmt);
-            if (callbackBytesRead > 0) {
-                if (ma_dr_wav__seek_from_start(pWav->onSeek, cursor, pWav->pUserData) == MA_FALSE) {
-                    return MA_FALSE;
-                }
-            }
-        }
-        if (((pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rifx || pWav->container == ma_dr_wav_container_rf64) && ma_dr_wav_fourcc_equal(header.id.fourcc, "fmt ")) ||
-            ((pWav->container == ma_dr_wav_container_w64) && ma_dr_wav_guid_equal(header.id.guid, ma_dr_wavGUID_W64_FMT))) {
-            ma_uint8 fmtData[16];
-            foundChunk_fmt = MA_TRUE;
-            if (pWav->onRead(pWav->pUserData, fmtData, sizeof(fmtData)) != sizeof(fmtData)) {
-                return MA_FALSE;
-            }
-            cursor += sizeof(fmtData);
-            fmt.formatTag      = ma_dr_wav_bytes_to_u16_ex(fmtData + 0,  pWav->container);
-            fmt.channels       = ma_dr_wav_bytes_to_u16_ex(fmtData + 2,  pWav->container);
-            fmt.sampleRate     = ma_dr_wav_bytes_to_u32_ex(fmtData + 4,  pWav->container);
-            fmt.avgBytesPerSec = ma_dr_wav_bytes_to_u32_ex(fmtData + 8,  pWav->container);
-            fmt.blockAlign     = ma_dr_wav_bytes_to_u16_ex(fmtData + 12, pWav->container);
-            fmt.bitsPerSample  = ma_dr_wav_bytes_to_u16_ex(fmtData + 14, pWav->container);
-            fmt.extendedSize       = 0;
-            fmt.validBitsPerSample = 0;
-            fmt.channelMask        = 0;
-            MA_DR_WAV_ZERO_MEMORY(fmt.subFormat, sizeof(fmt.subFormat));
-            if (header.sizeInBytes > 16) {
-                ma_uint8 fmt_cbSize[2];
-                int bytesReadSoFar = 0;
-                if (pWav->onRead(pWav->pUserData, fmt_cbSize, sizeof(fmt_cbSize)) != sizeof(fmt_cbSize)) {
-                    return MA_FALSE;
-                }
-                cursor += sizeof(fmt_cbSize);
-                bytesReadSoFar = 18;
-                fmt.extendedSize = ma_dr_wav_bytes_to_u16_ex(fmt_cbSize, pWav->container);
-                if (fmt.extendedSize > 0) {
-                    if (fmt.formatTag == MA_DR_WAVE_FORMAT_EXTENSIBLE) {
-                        if (fmt.extendedSize != 22) {
-                            return MA_FALSE;
-                        }
-                    }
-                    if (fmt.formatTag == MA_DR_WAVE_FORMAT_EXTENSIBLE) {
-                        ma_uint8 fmtext[22];
-                        if (pWav->onRead(pWav->pUserData, fmtext, fmt.extendedSize) != fmt.extendedSize) {
-                            return MA_FALSE;
-                        }
-                        fmt.validBitsPerSample = ma_dr_wav_bytes_to_u16_ex(fmtext + 0, pWav->container);
-                        fmt.channelMask        = ma_dr_wav_bytes_to_u32_ex(fmtext + 2, pWav->container);
-                        ma_dr_wav_bytes_to_guid(fmtext + 6, fmt.subFormat);
-                    } else {
-                        if (pWav->onSeek(pWav->pUserData, fmt.extendedSize, ma_dr_wav_seek_origin_current) == MA_FALSE) {
-                            return MA_FALSE;
-                        }
-                    }
-                    cursor += fmt.extendedSize;
-                    bytesReadSoFar += fmt.extendedSize;
-                }
-                if (pWav->onSeek(pWav->pUserData, (int)(header.sizeInBytes - bytesReadSoFar), ma_dr_wav_seek_origin_current) == MA_FALSE) {
-                    return MA_FALSE;
-                }
-                cursor += (header.sizeInBytes - bytesReadSoFar);
-            }
-            if (header.paddingSize > 0) {
-                if (ma_dr_wav__seek_forward(pWav->onSeek, header.paddingSize, pWav->pUserData) == MA_FALSE) {
-                    break;
-                }
-                cursor += header.paddingSize;
-            }
-            continue;
-        }
-        if (((pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rifx || pWav->container == ma_dr_wav_container_rf64) && ma_dr_wav_fourcc_equal(header.id.fourcc, "data")) ||
-            ((pWav->container == ma_dr_wav_container_w64) && ma_dr_wav_guid_equal(header.id.guid, ma_dr_wavGUID_W64_DATA))) {
-            foundChunk_data = MA_TRUE;
-            pWav->dataChunkDataPos  = cursor;
-            if (pWav->container != ma_dr_wav_container_rf64) {
-                dataChunkSize = chunkSize;
-            }
-            if (sequential || !isProcessingMetadata) {
-                break;
-            } else {
-                chunkSize += header.paddingSize;
-                if (ma_dr_wav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == MA_FALSE) {
-                    break;
-                }
-                cursor += chunkSize;
-                continue;
-            }
-        }
-        if (((pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rifx || pWav->container == ma_dr_wav_container_rf64) && ma_dr_wav_fourcc_equal(header.id.fourcc, "fact")) ||
-            ((pWav->container == ma_dr_wav_container_w64) && ma_dr_wav_guid_equal(header.id.guid, ma_dr_wavGUID_W64_FACT))) {
-            if (pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rifx) {
-                ma_uint8 sampleCount[4];
-                if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, &sampleCount, 4, &cursor) != 4) {
-                    return MA_FALSE;
-                }
-                chunkSize -= 4;
-                if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM) {
-                    sampleCountFromFactChunk = ma_dr_wav_bytes_to_u32_ex(sampleCount, pWav->container);
-                } else {
-                    sampleCountFromFactChunk = 0;
-                }
-            } else if (pWav->container == ma_dr_wav_container_w64) {
-                if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, &sampleCountFromFactChunk, 8, &cursor) != 8) {
-                    return MA_FALSE;
-                }
-                chunkSize -= 8;
-            } else if (pWav->container == ma_dr_wav_container_rf64) {
-            }
-            chunkSize += header.paddingSize;
-            if (ma_dr_wav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == MA_FALSE) {
-                break;
-            }
-            cursor += chunkSize;
-            continue;
-        }
-        if (pWav->container == ma_dr_wav_container_aiff && ma_dr_wav_fourcc_equal(header.id.fourcc, "COMM")) {
-            ma_uint8 commData[24];
-            ma_uint32 commDataBytesToRead;
-            ma_uint16 channels;
-            ma_uint32 frameCount;
-            ma_uint16 sampleSizeInBits;
-            ma_int64  sampleRate;
-            ma_uint16 compressionFormat;
-            foundChunk_fmt = MA_TRUE;
-            if (isAIFCFormType) {
-                commDataBytesToRead = 24;
-                if (header.sizeInBytes < commDataBytesToRead) {
-                    return MA_FALSE;
-                }
-            } else {
-                commDataBytesToRead = 18;
-                if (header.sizeInBytes != commDataBytesToRead) {
-                    return MA_FALSE;
-                }
-            }
-            if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, commData, commDataBytesToRead, &cursor) != commDataBytesToRead) {
-                return MA_FALSE;
-            }
-            channels         = ma_dr_wav_bytes_to_u16_ex     (commData + 0, pWav->container);
-            frameCount       = ma_dr_wav_bytes_to_u32_ex     (commData + 2, pWav->container);
-            sampleSizeInBits = ma_dr_wav_bytes_to_u16_ex     (commData + 6, pWav->container);
-            sampleRate       = ma_dr_wav_aiff_extented_to_s64(commData + 8);
-            if (sampleRate < 0 || sampleRate > 0xFFFFFFFF) {
-                return MA_FALSE;
-            }
-            if (isAIFCFormType) {
-                const ma_uint8* type = commData + 18;
-                if (ma_dr_wav_fourcc_equal(type, "NONE")) {
-                    compressionFormat = MA_DR_WAVE_FORMAT_PCM;
-                } else if (ma_dr_wav_fourcc_equal(type, "raw ")) {
-                    compressionFormat = MA_DR_WAVE_FORMAT_PCM;
-                    if (sampleSizeInBits == 8) {
-                        pWav->aiff.isUnsigned = MA_TRUE;
-                    }
-                } else if (ma_dr_wav_fourcc_equal(type, "sowt")) {
-                    compressionFormat = MA_DR_WAVE_FORMAT_PCM;
-                    pWav->aiff.isLE = MA_TRUE;
-                } else if (ma_dr_wav_fourcc_equal(type, "fl32") || ma_dr_wav_fourcc_equal(type, "fl64") || ma_dr_wav_fourcc_equal(type, "FL32") || ma_dr_wav_fourcc_equal(type, "FL64")) {
-                    compressionFormat = MA_DR_WAVE_FORMAT_IEEE_FLOAT;
-                } else if (ma_dr_wav_fourcc_equal(type, "alaw") || ma_dr_wav_fourcc_equal(type, "ALAW")) {
-                    compressionFormat = MA_DR_WAVE_FORMAT_ALAW;
-                } else if (ma_dr_wav_fourcc_equal(type, "ulaw") || ma_dr_wav_fourcc_equal(type, "ULAW")) {
-                    compressionFormat = MA_DR_WAVE_FORMAT_MULAW;
-                } else if (ma_dr_wav_fourcc_equal(type, "ima4")) {
-                    compressionFormat = MA_DR_WAVE_FORMAT_DVI_ADPCM;
-                    sampleSizeInBits = 4;
-                    return MA_FALSE;
-                } else {
-                    return MA_FALSE;
-                }
-            } else {
-                compressionFormat = MA_DR_WAVE_FORMAT_PCM;
-            }
-            aiffFrameCount = frameCount;
-            fmt.formatTag      = compressionFormat;
-            fmt.channels       = channels;
-            fmt.sampleRate     = (ma_uint32)sampleRate;
-            fmt.bitsPerSample  = sampleSizeInBits;
-            fmt.blockAlign     = (ma_uint16)(fmt.channels * fmt.bitsPerSample / 8);
-            fmt.avgBytesPerSec = fmt.blockAlign * fmt.sampleRate;
-            if (fmt.blockAlign == 0 && compressionFormat == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-                fmt.blockAlign = 34 * fmt.channels;
-            }
-            if (compressionFormat == MA_DR_WAVE_FORMAT_ALAW || compressionFormat == MA_DR_WAVE_FORMAT_MULAW) {
-                if (fmt.bitsPerSample > 8) {
-                    fmt.bitsPerSample = 8;
-                    fmt.blockAlign = fmt.channels;
-                }
-            }
-            fmt.bitsPerSample += (fmt.bitsPerSample & 7);
-            if (isAIFCFormType) {
-                if (ma_dr_wav__seek_forward(pWav->onSeek, (chunkSize - commDataBytesToRead), pWav->pUserData) == MA_FALSE) {
-                    return MA_FALSE;
-                }
-                cursor += (chunkSize - commDataBytesToRead);
-            }
-            continue;
-        }
-        if (pWav->container == ma_dr_wav_container_aiff && ma_dr_wav_fourcc_equal(header.id.fourcc, "SSND")) {
-            ma_uint8 offsetAndBlockSizeData[8];
-            ma_uint32 offset;
-            foundChunk_data = MA_TRUE;
-            if (ma_dr_wav__on_read(pWav->onRead, pWav->pUserData, offsetAndBlockSizeData, sizeof(offsetAndBlockSizeData), &cursor) != sizeof(offsetAndBlockSizeData)) {
-                return MA_FALSE;
-            }
-            offset = ma_dr_wav_bytes_to_u32_ex(offsetAndBlockSizeData + 0, pWav->container);
-            if (ma_dr_wav__seek_forward(pWav->onSeek, offset, pWav->pUserData) == MA_FALSE) {
-                return MA_FALSE;
-            }
-            cursor += offset;
-            pWav->dataChunkDataPos = cursor;
-            dataChunkSize = chunkSize;
-            if (sequential || !isProcessingMetadata) {
-                break;
-            } else {
-                if (ma_dr_wav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == MA_FALSE) {
-                    break;
-                }
-                cursor += chunkSize;
-                continue;
-            }
-        }
-        if (isProcessingMetadata) {
-            ma_uint64 metadataBytesRead;
-            metadataBytesRead = ma_dr_wav__metadata_process_chunk(&metadataParser, &header, ma_dr_wav_metadata_type_all_including_unknown);
-            MA_DR_WAV_ASSERT(metadataBytesRead <= header.sizeInBytes);
-            if (ma_dr_wav__seek_from_start(pWav->onSeek, cursor, pWav->pUserData) == MA_FALSE) {
-                break;
-            }
-        }
-        chunkSize += header.paddingSize;
-        if (ma_dr_wav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == MA_FALSE) {
-            break;
-        }
-        cursor += chunkSize;
-    }
-    if (!foundChunk_fmt || !foundChunk_data) {
-        return MA_FALSE;
-    }
-    if ((fmt.sampleRate    == 0 || fmt.sampleRate    > MA_DR_WAV_MAX_SAMPLE_RATE    ) ||
-        (fmt.channels      == 0 || fmt.channels      > MA_DR_WAV_MAX_CHANNELS       ) ||
-        (fmt.bitsPerSample == 0 || fmt.bitsPerSample > MA_DR_WAV_MAX_BITS_PER_SAMPLE) ||
-        fmt.blockAlign == 0) {
-        return MA_FALSE;
-    }
-    translatedFormatTag = fmt.formatTag;
-    if (translatedFormatTag == MA_DR_WAVE_FORMAT_EXTENSIBLE) {
-        translatedFormatTag = ma_dr_wav_bytes_to_u16_ex(fmt.subFormat + 0, pWav->container);
-    }
-    if (!sequential) {
-        if (!ma_dr_wav__seek_from_start(pWav->onSeek, pWav->dataChunkDataPos, pWav->pUserData)) {
-            return MA_FALSE;
-        }
-        cursor = pWav->dataChunkDataPos;
-    }
-    if (isProcessingMetadata && metadataParser.metadataCount > 0) {
-        if (ma_dr_wav__seek_from_start(pWav->onSeek, metadataStartPos, pWav->pUserData) == MA_FALSE) {
-            return MA_FALSE;
-        }
-        result = ma_dr_wav__metadata_alloc(&metadataParser, &pWav->allocationCallbacks);
-        if (result != MA_SUCCESS) {
-            return MA_FALSE;
-        }
-        metadataParser.stage = ma_dr_wav__metadata_parser_stage_read;
-        for (;;) {
-            ma_dr_wav_chunk_header header;
-            ma_uint64 metadataBytesRead;
-            result = ma_dr_wav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
-            if (result != MA_SUCCESS) {
-                break;
-            }
-            metadataBytesRead = ma_dr_wav__metadata_process_chunk(&metadataParser, &header, ma_dr_wav_metadata_type_all_including_unknown);
-            if (ma_dr_wav__seek_forward(pWav->onSeek, (header.sizeInBytes + header.paddingSize) - metadataBytesRead, pWav->pUserData) == MA_FALSE) {
-                ma_dr_wav_free(metadataParser.pMetadata, &pWav->allocationCallbacks);
-                return MA_FALSE;
-            }
-        }
-        pWav->pMetadata     = metadataParser.pMetadata;
-        pWav->metadataCount = metadataParser.metadataCount;
-    }
-    if (dataChunkSize == 0xFFFFFFFF && (pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rifx) && pWav->isSequentialWrite == MA_FALSE) {
-        dataChunkSize = 0;
-        for (;;) {
-            ma_uint8 temp[4096];
-            size_t bytesRead = pWav->onRead(pWav->pUserData, temp, sizeof(temp));
-            dataChunkSize += bytesRead;
-            if (bytesRead < sizeof(temp)) {
-                break;
-            }
-        }
-    }
-    if (ma_dr_wav__seek_from_start(pWav->onSeek, pWav->dataChunkDataPos, pWav->pUserData) == MA_FALSE) {
-        ma_dr_wav_free(pWav->pMetadata, &pWav->allocationCallbacks);
-        return MA_FALSE;
-    }
-    pWav->fmt                 = fmt;
-    pWav->sampleRate          = fmt.sampleRate;
-    pWav->channels            = fmt.channels;
-    pWav->bitsPerSample       = fmt.bitsPerSample;
-    pWav->bytesRemaining      = dataChunkSize;
-    pWav->translatedFormatTag = translatedFormatTag;
-    pWav->dataChunkDataSize   = dataChunkSize;
-    if (sampleCountFromFactChunk != 0) {
-        pWav->totalPCMFrameCount = sampleCountFromFactChunk;
-    } else if (aiffFrameCount != 0) {
-        pWav->totalPCMFrameCount = aiffFrameCount;
-    } else {
-        ma_uint32 bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-        if (bytesPerFrame == 0) {
-            ma_dr_wav_free(pWav->pMetadata, &pWav->allocationCallbacks);
-            return MA_FALSE;
-        }
-        pWav->totalPCMFrameCount = dataChunkSize / bytesPerFrame;
-        if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM) {
-            ma_uint64 totalBlockHeaderSizeInBytes;
-            ma_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
-                blockCount += 1;
-            }
-            totalBlockHeaderSizeInBytes = blockCount * (6*fmt.channels);
-            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
-        }
-        if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-            ma_uint64 totalBlockHeaderSizeInBytes;
-            ma_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
-                blockCount += 1;
-            }
-            totalBlockHeaderSizeInBytes = blockCount * (4*fmt.channels);
-            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
-            pWav->totalPCMFrameCount += blockCount;
-        }
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-        if (pWav->channels > 2) {
-            ma_dr_wav_free(pWav->pMetadata, &pWav->allocationCallbacks);
-            return MA_FALSE;
-        }
-    }
-    if (ma_dr_wav_get_bytes_per_pcm_frame(pWav) == 0) {
-        ma_dr_wav_free(pWav->pMetadata, &pWav->allocationCallbacks);
-        return MA_FALSE;
-    }
-#ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM) {
-        ma_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-        pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (6*pWav->channels))) * 2)) / fmt.channels;
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-        ma_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-        pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (4*pWav->channels))) * 2) + (blockCount * pWav->channels)) / fmt.channels;
-    }
-#endif
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_wav_init(ma_dr_wav* pWav, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_ex(ma_dr_wav* pWav, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, ma_dr_wav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (!ma_dr_wav_preinit(pWav, onRead, onSeek, pReadSeekUserData, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init__internal(pWav, onChunk, pChunkUserData, flags);
-}
-MA_API ma_bool32 ma_dr_wav_init_with_metadata(ma_dr_wav* pWav, ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (!ma_dr_wav_preinit(pWav, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init__internal(pWav, NULL, NULL, flags | MA_DR_WAV_WITH_METADATA);
-}
-MA_API ma_dr_wav_metadata* ma_dr_wav_take_ownership_of_metadata(ma_dr_wav* pWav)
-{
-    ma_dr_wav_metadata *result = pWav->pMetadata;
-    pWav->pMetadata     = NULL;
-    pWav->metadataCount = 0;
-    return result;
-}
-MA_PRIVATE size_t ma_dr_wav__write(ma_dr_wav* pWav, const void* pData, size_t dataSize)
-{
-    MA_DR_WAV_ASSERT(pWav          != NULL);
-    MA_DR_WAV_ASSERT(pWav->onWrite != NULL);
-    return pWav->onWrite(pWav->pUserData, pData, dataSize);
-}
-MA_PRIVATE size_t ma_dr_wav__write_byte(ma_dr_wav* pWav, ma_uint8 byte)
-{
-    MA_DR_WAV_ASSERT(pWav          != NULL);
-    MA_DR_WAV_ASSERT(pWav->onWrite != NULL);
-    return pWav->onWrite(pWav->pUserData, &byte, 1);
-}
-MA_PRIVATE size_t ma_dr_wav__write_u16ne_to_le(ma_dr_wav* pWav, ma_uint16 value)
-{
-    MA_DR_WAV_ASSERT(pWav          != NULL);
-    MA_DR_WAV_ASSERT(pWav->onWrite != NULL);
-    if (!ma_dr_wav__is_little_endian()) {
-        value = ma_dr_wav__bswap16(value);
-    }
-    return ma_dr_wav__write(pWav, &value, 2);
-}
-MA_PRIVATE size_t ma_dr_wav__write_u32ne_to_le(ma_dr_wav* pWav, ma_uint32 value)
-{
-    MA_DR_WAV_ASSERT(pWav          != NULL);
-    MA_DR_WAV_ASSERT(pWav->onWrite != NULL);
-    if (!ma_dr_wav__is_little_endian()) {
-        value = ma_dr_wav__bswap32(value);
-    }
-    return ma_dr_wav__write(pWav, &value, 4);
-}
-MA_PRIVATE size_t ma_dr_wav__write_u64ne_to_le(ma_dr_wav* pWav, ma_uint64 value)
-{
-    MA_DR_WAV_ASSERT(pWav          != NULL);
-    MA_DR_WAV_ASSERT(pWav->onWrite != NULL);
-    if (!ma_dr_wav__is_little_endian()) {
-        value = ma_dr_wav__bswap64(value);
-    }
-    return ma_dr_wav__write(pWav, &value, 8);
-}
-MA_PRIVATE size_t ma_dr_wav__write_f32ne_to_le(ma_dr_wav* pWav, float value)
-{
-    union {
-       ma_uint32 u32;
-       float f32;
-    } u;
-    MA_DR_WAV_ASSERT(pWav          != NULL);
-    MA_DR_WAV_ASSERT(pWav->onWrite != NULL);
-    u.f32 = value;
-    if (!ma_dr_wav__is_little_endian()) {
-        u.u32 = ma_dr_wav__bswap32(u.u32);
-    }
-    return ma_dr_wav__write(pWav, &u.u32, 4);
-}
-MA_PRIVATE size_t ma_dr_wav__write_or_count(ma_dr_wav* pWav, const void* pData, size_t dataSize)
-{
-    if (pWav == NULL) {
-        return dataSize;
-    }
-    return ma_dr_wav__write(pWav, pData, dataSize);
-}
-MA_PRIVATE size_t ma_dr_wav__write_or_count_byte(ma_dr_wav* pWav, ma_uint8 byte)
-{
-    if (pWav == NULL) {
-        return 1;
-    }
-    return ma_dr_wav__write_byte(pWav, byte);
-}
-MA_PRIVATE size_t ma_dr_wav__write_or_count_u16ne_to_le(ma_dr_wav* pWav, ma_uint16 value)
-{
-    if (pWav == NULL) {
-        return 2;
-    }
-    return ma_dr_wav__write_u16ne_to_le(pWav, value);
-}
-MA_PRIVATE size_t ma_dr_wav__write_or_count_u32ne_to_le(ma_dr_wav* pWav, ma_uint32 value)
-{
-    if (pWav == NULL) {
-        return 4;
-    }
-    return ma_dr_wav__write_u32ne_to_le(pWav, value);
-}
-#if 0
-MA_PRIVATE size_t ma_dr_wav__write_or_count_u64ne_to_le(ma_dr_wav* pWav, ma_uint64 value)
-{
-    if (pWav == NULL) {
-        return 8;
-    }
-    return ma_dr_wav__write_u64ne_to_le(pWav, value);
-}
-#endif
-MA_PRIVATE size_t ma_dr_wav__write_or_count_f32ne_to_le(ma_dr_wav* pWav, float value)
-{
-    if (pWav == NULL) {
-        return 4;
-    }
-    return ma_dr_wav__write_f32ne_to_le(pWav, value);
-}
-MA_PRIVATE size_t ma_dr_wav__write_or_count_string_to_fixed_size_buf(ma_dr_wav* pWav, char* str, size_t bufFixedSize)
-{
-    size_t len;
-    if (pWav == NULL) {
-        return bufFixedSize;
-    }
-    len = ma_dr_wav__strlen_clamped(str, bufFixedSize);
-    ma_dr_wav__write_or_count(pWav, str, len);
-    if (len < bufFixedSize) {
-        size_t i;
-        for (i = 0; i < bufFixedSize - len; ++i) {
-            ma_dr_wav__write_byte(pWav, 0);
-        }
-    }
-    return bufFixedSize;
-}
-MA_PRIVATE size_t ma_dr_wav__write_or_count_metadata(ma_dr_wav* pWav, ma_dr_wav_metadata* pMetadatas, ma_uint32 metadataCount)
-{
-    size_t bytesWritten = 0;
-    ma_bool32 hasListAdtl = MA_FALSE;
-    ma_bool32 hasListInfo = MA_FALSE;
-    ma_uint32 iMetadata;
-    if (pMetadatas == NULL || metadataCount == 0) {
-        return 0;
-    }
-    for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
-        ma_dr_wav_metadata* pMetadata = &pMetadatas[iMetadata];
-        ma_uint32 chunkSize = 0;
-        if ((pMetadata->type & ma_dr_wav_metadata_type_list_all_info_strings) || (pMetadata->type == ma_dr_wav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == ma_dr_wav_metadata_location_inside_info_list)) {
-            hasListInfo = MA_TRUE;
-        }
-        if ((pMetadata->type & ma_dr_wav_metadata_type_list_all_adtl) || (pMetadata->type == ma_dr_wav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == ma_dr_wav_metadata_location_inside_adtl_list)) {
-            hasListAdtl = MA_TRUE;
-        }
-        switch (pMetadata->type) {
-            case ma_dr_wav_metadata_type_smpl:
-            {
-                ma_uint32 iLoop;
-                chunkSize = MA_DR_WAV_SMPL_BYTES + MA_DR_WAV_SMPL_LOOP_BYTES * pMetadata->data.smpl.sampleLoopCount + pMetadata->data.smpl.samplerSpecificDataSizeInBytes;
-                bytesWritten += ma_dr_wav__write_or_count(pWav, "smpl", 4);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.manufacturerId);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.productId);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.samplePeriodNanoseconds);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.midiUnityNote);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.midiPitchFraction);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.smpteFormat);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.smpteOffset);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.sampleLoopCount);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.samplerSpecificDataSizeInBytes);
-                for (iLoop = 0; iLoop < pMetadata->data.smpl.sampleLoopCount; ++iLoop) {
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].cuePointId);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].type);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].firstSampleByteOffset);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].lastSampleByteOffset);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].sampleFraction);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].playCount);
-                }
-                if (pMetadata->data.smpl.samplerSpecificDataSizeInBytes > 0) {
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.smpl.pSamplerSpecificData, pMetadata->data.smpl.samplerSpecificDataSizeInBytes);
-                }
-            } break;
-            case ma_dr_wav_metadata_type_inst:
-            {
-                chunkSize = MA_DR_WAV_INST_BYTES;
-                bytesWritten += ma_dr_wav__write_or_count(pWav, "inst", 4);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, &pMetadata->data.inst.midiUnityNote, 1);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, &pMetadata->data.inst.fineTuneCents, 1);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, &pMetadata->data.inst.gainDecibels, 1);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, &pMetadata->data.inst.lowNote, 1);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, &pMetadata->data.inst.highNote, 1);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, &pMetadata->data.inst.lowVelocity, 1);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, &pMetadata->data.inst.highVelocity, 1);
-            } break;
-            case ma_dr_wav_metadata_type_cue:
-            {
-                ma_uint32 iCuePoint;
-                chunkSize = MA_DR_WAV_CUE_BYTES + MA_DR_WAV_CUE_POINT_BYTES * pMetadata->data.cue.cuePointCount;
-                bytesWritten += ma_dr_wav__write_or_count(pWav, "cue ", 4);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.cuePointCount);
-                for (iCuePoint = 0; iCuePoint < pMetadata->data.cue.cuePointCount; ++iCuePoint) {
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].id);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].playOrderPosition);
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].dataChunkId, 4);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].chunkStart);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].blockStart);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].sampleByteOffset);
-                }
-            } break;
-            case ma_dr_wav_metadata_type_acid:
-            {
-                chunkSize = MA_DR_WAV_ACID_BYTES;
-                bytesWritten += ma_dr_wav__write_or_count(pWav, "acid", 4);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.acid.flags);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.midiUnityNote);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.reserved1);
-                bytesWritten += ma_dr_wav__write_or_count_f32ne_to_le(pWav, pMetadata->data.acid.reserved2);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.acid.numBeats);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.meterDenominator);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.meterNumerator);
-                bytesWritten += ma_dr_wav__write_or_count_f32ne_to_le(pWav, pMetadata->data.acid.tempo);
-            } break;
-            case ma_dr_wav_metadata_type_bext:
-            {
-                char reservedBuf[MA_DR_WAV_BEXT_RESERVED_BYTES];
-                ma_uint32 timeReferenceLow;
-                ma_uint32 timeReferenceHigh;
-                chunkSize = MA_DR_WAV_BEXT_BYTES + pMetadata->data.bext.codingHistorySize;
-                bytesWritten += ma_dr_wav__write_or_count(pWav, "bext", 4);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-                bytesWritten += ma_dr_wav__write_or_count_string_to_fixed_size_buf(pWav, pMetadata->data.bext.pDescription, MA_DR_WAV_BEXT_DESCRIPTION_BYTES);
-                bytesWritten += ma_dr_wav__write_or_count_string_to_fixed_size_buf(pWav, pMetadata->data.bext.pOriginatorName, MA_DR_WAV_BEXT_ORIGINATOR_NAME_BYTES);
-                bytesWritten += ma_dr_wav__write_or_count_string_to_fixed_size_buf(pWav, pMetadata->data.bext.pOriginatorReference, MA_DR_WAV_BEXT_ORIGINATOR_REF_BYTES);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.bext.pOriginationDate, sizeof(pMetadata->data.bext.pOriginationDate));
-                bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.bext.pOriginationTime, sizeof(pMetadata->data.bext.pOriginationTime));
-                timeReferenceLow  = (ma_uint32)(pMetadata->data.bext.timeReference & 0xFFFFFFFF);
-                timeReferenceHigh = (ma_uint32)(pMetadata->data.bext.timeReference >> 32);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, timeReferenceLow);
-                bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, timeReferenceHigh);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.version);
-                bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.bext.pUMID, MA_DR_WAV_BEXT_UMID_BYTES);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.loudnessValue);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.loudnessRange);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.maxTruePeakLevel);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.maxMomentaryLoudness);
-                bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.maxShortTermLoudness);
-                MA_DR_WAV_ZERO_MEMORY(reservedBuf, sizeof(reservedBuf));
-                bytesWritten += ma_dr_wav__write_or_count(pWav, reservedBuf, sizeof(reservedBuf));
-                if (pMetadata->data.bext.codingHistorySize > 0) {
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.bext.pCodingHistory, pMetadata->data.bext.codingHistorySize);
-                }
-            } break;
-            case ma_dr_wav_metadata_type_unknown:
-            {
-                if (pMetadata->data.unknown.chunkLocation == ma_dr_wav_metadata_location_top_level) {
-                    chunkSize = pMetadata->data.unknown.dataSizeInBytes;
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.unknown.id, 4);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.unknown.pData, pMetadata->data.unknown.dataSizeInBytes);
-                }
-            } break;
-            default: break;
-        }
-        if ((chunkSize % 2) != 0) {
-            bytesWritten += ma_dr_wav__write_or_count_byte(pWav, 0);
-        }
-    }
-    if (hasListInfo) {
-        ma_uint32 chunkSize = 4;
-        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
-            ma_dr_wav_metadata* pMetadata = &pMetadatas[iMetadata];
-            if ((pMetadata->type & ma_dr_wav_metadata_type_list_all_info_strings)) {
-                chunkSize += 8;
-                chunkSize += pMetadata->data.infoText.stringLength + 1;
-            } else if (pMetadata->type == ma_dr_wav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == ma_dr_wav_metadata_location_inside_info_list) {
-                chunkSize += 8;
-                chunkSize += pMetadata->data.unknown.dataSizeInBytes;
-            }
-            if ((chunkSize % 2) != 0) {
-                chunkSize += 1;
-            }
-        }
-        bytesWritten += ma_dr_wav__write_or_count(pWav, "LIST", 4);
-        bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-        bytesWritten += ma_dr_wav__write_or_count(pWav, "INFO", 4);
-        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
-            ma_dr_wav_metadata* pMetadata = &pMetadatas[iMetadata];
-            ma_uint32 subchunkSize = 0;
-            if (pMetadata->type & ma_dr_wav_metadata_type_list_all_info_strings) {
-                const char* pID = NULL;
-                switch (pMetadata->type) {
-                    case ma_dr_wav_metadata_type_list_info_software:    pID = "ISFT"; break;
-                    case ma_dr_wav_metadata_type_list_info_copyright:   pID = "ICOP"; break;
-                    case ma_dr_wav_metadata_type_list_info_title:       pID = "INAM"; break;
-                    case ma_dr_wav_metadata_type_list_info_artist:      pID = "IART"; break;
-                    case ma_dr_wav_metadata_type_list_info_comment:     pID = "ICMT"; break;
-                    case ma_dr_wav_metadata_type_list_info_date:        pID = "ICRD"; break;
-                    case ma_dr_wav_metadata_type_list_info_genre:       pID = "IGNR"; break;
-                    case ma_dr_wav_metadata_type_list_info_album:       pID = "IPRD"; break;
-                    case ma_dr_wav_metadata_type_list_info_tracknumber: pID = "ITRK"; break;
-                    default: break;
-                }
-                MA_DR_WAV_ASSERT(pID != NULL);
-                if (pMetadata->data.infoText.stringLength) {
-                    subchunkSize = pMetadata->data.infoText.stringLength + 1;
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pID, 4);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, subchunkSize);
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.infoText.pString, pMetadata->data.infoText.stringLength);
-                    bytesWritten += ma_dr_wav__write_or_count_byte(pWav, '\0');
-                }
-            } else if (pMetadata->type == ma_dr_wav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == ma_dr_wav_metadata_location_inside_info_list) {
-                if (pMetadata->data.unknown.dataSizeInBytes) {
-                    subchunkSize = pMetadata->data.unknown.dataSizeInBytes;
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.unknown.id, 4);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.unknown.dataSizeInBytes);
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.unknown.pData, subchunkSize);
-                }
-            }
-            if ((subchunkSize % 2) != 0) {
-                bytesWritten += ma_dr_wav__write_or_count_byte(pWav, 0);
-            }
-        }
-    }
-    if (hasListAdtl) {
-        ma_uint32 chunkSize = 4;
-        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
-            ma_dr_wav_metadata* pMetadata = &pMetadatas[iMetadata];
-            switch (pMetadata->type)
-            {
-                case ma_dr_wav_metadata_type_list_label:
-                case ma_dr_wav_metadata_type_list_note:
-                {
-                    chunkSize += 8;
-                    chunkSize += MA_DR_WAV_LIST_LABEL_OR_NOTE_BYTES;
-                    if (pMetadata->data.labelOrNote.stringLength > 0) {
-                        chunkSize += pMetadata->data.labelOrNote.stringLength + 1;
-                    }
-                } break;
-                case ma_dr_wav_metadata_type_list_labelled_cue_region:
-                {
-                    chunkSize += 8;
-                    chunkSize += MA_DR_WAV_LIST_LABELLED_TEXT_BYTES;
-                    if (pMetadata->data.labelledCueRegion.stringLength > 0) {
-                        chunkSize += pMetadata->data.labelledCueRegion.stringLength + 1;
-                    }
-                } break;
-                case ma_dr_wav_metadata_type_unknown:
-                {
-                    if (pMetadata->data.unknown.chunkLocation == ma_dr_wav_metadata_location_inside_adtl_list) {
-                        chunkSize += 8;
-                        chunkSize += pMetadata->data.unknown.dataSizeInBytes;
-                    }
-                } break;
-                default: break;
-            }
-            if ((chunkSize % 2) != 0) {
-                chunkSize += 1;
-            }
-        }
-        bytesWritten += ma_dr_wav__write_or_count(pWav, "LIST", 4);
-        bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, chunkSize);
-        bytesWritten += ma_dr_wav__write_or_count(pWav, "adtl", 4);
-        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
-            ma_dr_wav_metadata* pMetadata = &pMetadatas[iMetadata];
-            ma_uint32 subchunkSize = 0;
-            switch (pMetadata->type)
-            {
-                case ma_dr_wav_metadata_type_list_label:
-                case ma_dr_wav_metadata_type_list_note:
-                {
-                    if (pMetadata->data.labelOrNote.stringLength > 0) {
-                        const char *pID = NULL;
-                        if (pMetadata->type == ma_dr_wav_metadata_type_list_label) {
-                            pID = "labl";
-                        }
-                        else if (pMetadata->type == ma_dr_wav_metadata_type_list_note) {
-                            pID = "note";
-                        }
-                        MA_DR_WAV_ASSERT(pID != NULL);
-                        MA_DR_WAV_ASSERT(pMetadata->data.labelOrNote.pString != NULL);
-                        subchunkSize = MA_DR_WAV_LIST_LABEL_OR_NOTE_BYTES;
-                        bytesWritten += ma_dr_wav__write_or_count(pWav, pID, 4);
-                        subchunkSize += pMetadata->data.labelOrNote.stringLength + 1;
-                        bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, subchunkSize);
-                        bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.labelOrNote.cuePointId);
-                        bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.labelOrNote.pString, pMetadata->data.labelOrNote.stringLength);
-                        bytesWritten += ma_dr_wav__write_or_count_byte(pWav, '\0');
-                    }
-                } break;
-                case ma_dr_wav_metadata_type_list_labelled_cue_region:
-                {
-                    subchunkSize = MA_DR_WAV_LIST_LABELLED_TEXT_BYTES;
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, "ltxt", 4);
-                    if (pMetadata->data.labelledCueRegion.stringLength > 0) {
-                        subchunkSize += pMetadata->data.labelledCueRegion.stringLength + 1;
-                    }
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, subchunkSize);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.labelledCueRegion.cuePointId);
-                    bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, pMetadata->data.labelledCueRegion.sampleLength);
-                    bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.labelledCueRegion.purposeId, 4);
-                    bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.country);
-                    bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.language);
-                    bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.dialect);
-                    bytesWritten += ma_dr_wav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.codePage);
-                    if (pMetadata->data.labelledCueRegion.stringLength > 0) {
-                        MA_DR_WAV_ASSERT(pMetadata->data.labelledCueRegion.pString != NULL);
-                        bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.labelledCueRegion.pString, pMetadata->data.labelledCueRegion.stringLength);
-                        bytesWritten += ma_dr_wav__write_or_count_byte(pWav, '\0');
-                    }
-                } break;
-                case ma_dr_wav_metadata_type_unknown:
-                {
-                    if (pMetadata->data.unknown.chunkLocation == ma_dr_wav_metadata_location_inside_adtl_list) {
-                        subchunkSize = pMetadata->data.unknown.dataSizeInBytes;
-                        MA_DR_WAV_ASSERT(pMetadata->data.unknown.pData != NULL);
-                        bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.unknown.id, 4);
-                        bytesWritten += ma_dr_wav__write_or_count_u32ne_to_le(pWav, subchunkSize);
-                        bytesWritten += ma_dr_wav__write_or_count(pWav, pMetadata->data.unknown.pData, subchunkSize);
-                    }
-                } break;
-                default: break;
-            }
-            if ((subchunkSize % 2) != 0) {
-                bytesWritten += ma_dr_wav__write_or_count_byte(pWav, 0);
-            }
-        }
-    }
-    MA_DR_WAV_ASSERT((bytesWritten % 2) == 0);
-    return bytesWritten;
-}
-MA_PRIVATE ma_uint32 ma_dr_wav__riff_chunk_size_riff(ma_uint64 dataChunkSize, ma_dr_wav_metadata* pMetadata, ma_uint32 metadataCount)
-{
-    ma_uint64 chunkSize = 4 + 24 + (ma_uint64)ma_dr_wav__write_or_count_metadata(NULL, pMetadata, metadataCount) + 8 + dataChunkSize + ma_dr_wav__chunk_padding_size_riff(dataChunkSize);
-    if (chunkSize > 0xFFFFFFFFUL) {
-        chunkSize = 0xFFFFFFFFUL;
-    }
-    return (ma_uint32)chunkSize;
-}
-MA_PRIVATE ma_uint32 ma_dr_wav__data_chunk_size_riff(ma_uint64 dataChunkSize)
-{
-    if (dataChunkSize <= 0xFFFFFFFFUL) {
-        return (ma_uint32)dataChunkSize;
-    } else {
-        return 0xFFFFFFFFUL;
-    }
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__riff_chunk_size_w64(ma_uint64 dataChunkSize)
-{
-    ma_uint64 dataSubchunkPaddingSize = ma_dr_wav__chunk_padding_size_w64(dataChunkSize);
-    return 80 + 24 + dataChunkSize + dataSubchunkPaddingSize;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__data_chunk_size_w64(ma_uint64 dataChunkSize)
-{
-    return 24 + dataChunkSize;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__riff_chunk_size_rf64(ma_uint64 dataChunkSize, ma_dr_wav_metadata *metadata, ma_uint32 numMetadata)
-{
-    ma_uint64 chunkSize = 4 + 36 + 24 + (ma_uint64)ma_dr_wav__write_or_count_metadata(NULL, metadata, numMetadata) + 8 + dataChunkSize + ma_dr_wav__chunk_padding_size_riff(dataChunkSize);
-    if (chunkSize > 0xFFFFFFFFUL) {
-        chunkSize = 0xFFFFFFFFUL;
-    }
-    return chunkSize;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav__data_chunk_size_rf64(ma_uint64 dataChunkSize)
-{
-    return dataChunkSize;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_preinit_write(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_bool32 isSequential, ma_dr_wav_write_proc onWrite, ma_dr_wav_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pWav == NULL || onWrite == NULL) {
-        return MA_FALSE;
-    }
-    if (!isSequential && onSeek == NULL) {
-        return MA_FALSE;
-    }
-    if (pFormat->format == MA_DR_WAVE_FORMAT_EXTENSIBLE) {
-        return MA_FALSE;
-    }
-    if (pFormat->format == MA_DR_WAVE_FORMAT_ADPCM || pFormat->format == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-        return MA_FALSE;
-    }
-    MA_DR_WAV_ZERO_MEMORY(pWav, sizeof(*pWav));
-    pWav->onWrite   = onWrite;
-    pWav->onSeek    = onSeek;
-    pWav->pUserData = pUserData;
-    pWav->allocationCallbacks = ma_dr_wav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks);
-    if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) {
-        return MA_FALSE;
-    }
-    pWav->fmt.formatTag = (ma_uint16)pFormat->format;
-    pWav->fmt.channels = (ma_uint16)pFormat->channels;
-    pWav->fmt.sampleRate = pFormat->sampleRate;
-    pWav->fmt.avgBytesPerSec = (ma_uint32)((pFormat->bitsPerSample * pFormat->sampleRate * pFormat->channels) / 8);
-    pWav->fmt.blockAlign = (ma_uint16)((pFormat->channels * pFormat->bitsPerSample) / 8);
-    pWav->fmt.bitsPerSample = (ma_uint16)pFormat->bitsPerSample;
-    pWav->fmt.extendedSize = 0;
-    pWav->isSequentialWrite = isSequential;
-    return MA_TRUE;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_init_write__internal(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount)
-{
-    size_t runningPos = 0;
-    ma_uint64 initialDataChunkSize = 0;
-    ma_uint64 chunkSizeFMT;
-    if (pWav->isSequentialWrite) {
-        initialDataChunkSize = (totalSampleCount * pWav->fmt.bitsPerSample) / 8;
-        if (pFormat->container == ma_dr_wav_container_riff) {
-            if (initialDataChunkSize > (0xFFFFFFFFUL - 36)) {
-                return MA_FALSE;
-            }
-        }
-    }
-    pWav->dataChunkDataSizeTargetWrite = initialDataChunkSize;
-    if (pFormat->container == ma_dr_wav_container_riff) {
-        ma_uint32 chunkSizeRIFF = 28 + (ma_uint32)initialDataChunkSize;
-        runningPos += ma_dr_wav__write(pWav, "RIFF", 4);
-        runningPos += ma_dr_wav__write_u32ne_to_le(pWav, chunkSizeRIFF);
-        runningPos += ma_dr_wav__write(pWav, "WAVE", 4);
-    } else if (pFormat->container == ma_dr_wav_container_w64) {
-        ma_uint64 chunkSizeRIFF = 80 + 24 + initialDataChunkSize;
-        runningPos += ma_dr_wav__write(pWav, ma_dr_wavGUID_W64_RIFF, 16);
-        runningPos += ma_dr_wav__write_u64ne_to_le(pWav, chunkSizeRIFF);
-        runningPos += ma_dr_wav__write(pWav, ma_dr_wavGUID_W64_WAVE, 16);
-    } else if (pFormat->container == ma_dr_wav_container_rf64) {
-        runningPos += ma_dr_wav__write(pWav, "RF64", 4);
-        runningPos += ma_dr_wav__write_u32ne_to_le(pWav, 0xFFFFFFFF);
-        runningPos += ma_dr_wav__write(pWav, "WAVE", 4);
-    } else {
-        return MA_FALSE;
-    }
-    if (pFormat->container == ma_dr_wav_container_rf64) {
-        ma_uint32 initialds64ChunkSize = 28;
-        ma_uint64 initialRiffChunkSize = 8 + initialds64ChunkSize + initialDataChunkSize;
-        runningPos += ma_dr_wav__write(pWav, "ds64", 4);
-        runningPos += ma_dr_wav__write_u32ne_to_le(pWav, initialds64ChunkSize);
-        runningPos += ma_dr_wav__write_u64ne_to_le(pWav, initialRiffChunkSize);
-        runningPos += ma_dr_wav__write_u64ne_to_le(pWav, initialDataChunkSize);
-        runningPos += ma_dr_wav__write_u64ne_to_le(pWav, totalSampleCount);
-        runningPos += ma_dr_wav__write_u32ne_to_le(pWav, 0);
-    }
-    if (pFormat->container == ma_dr_wav_container_riff || pFormat->container == ma_dr_wav_container_rf64) {
-        chunkSizeFMT = 16;
-        runningPos += ma_dr_wav__write(pWav, "fmt ", 4);
-        runningPos += ma_dr_wav__write_u32ne_to_le(pWav, (ma_uint32)chunkSizeFMT);
-    } else if (pFormat->container == ma_dr_wav_container_w64) {
-        chunkSizeFMT = 40;
-        runningPos += ma_dr_wav__write(pWav, ma_dr_wavGUID_W64_FMT, 16);
-        runningPos += ma_dr_wav__write_u64ne_to_le(pWav, chunkSizeFMT);
-    }
-    runningPos += ma_dr_wav__write_u16ne_to_le(pWav, pWav->fmt.formatTag);
-    runningPos += ma_dr_wav__write_u16ne_to_le(pWav, pWav->fmt.channels);
-    runningPos += ma_dr_wav__write_u32ne_to_le(pWav, pWav->fmt.sampleRate);
-    runningPos += ma_dr_wav__write_u32ne_to_le(pWav, pWav->fmt.avgBytesPerSec);
-    runningPos += ma_dr_wav__write_u16ne_to_le(pWav, pWav->fmt.blockAlign);
-    runningPos += ma_dr_wav__write_u16ne_to_le(pWav, pWav->fmt.bitsPerSample);
-    if (!pWav->isSequentialWrite && pWav->pMetadata != NULL && pWav->metadataCount > 0 && (pFormat->container == ma_dr_wav_container_riff || pFormat->container == ma_dr_wav_container_rf64)) {
-        runningPos += ma_dr_wav__write_or_count_metadata(pWav, pWav->pMetadata, pWav->metadataCount);
-    }
-    pWav->dataChunkDataPos = runningPos;
-    if (pFormat->container == ma_dr_wav_container_riff) {
-        ma_uint32 chunkSizeDATA = (ma_uint32)initialDataChunkSize;
-        runningPos += ma_dr_wav__write(pWav, "data", 4);
-        runningPos += ma_dr_wav__write_u32ne_to_le(pWav, chunkSizeDATA);
-    } else if (pFormat->container == ma_dr_wav_container_w64) {
-        ma_uint64 chunkSizeDATA = 24 + initialDataChunkSize;
-        runningPos += ma_dr_wav__write(pWav, ma_dr_wavGUID_W64_DATA, 16);
-        runningPos += ma_dr_wav__write_u64ne_to_le(pWav, chunkSizeDATA);
-    } else if (pFormat->container == ma_dr_wav_container_rf64) {
-        runningPos += ma_dr_wav__write(pWav, "data", 4);
-        runningPos += ma_dr_wav__write_u32ne_to_le(pWav, 0xFFFFFFFF);
-    }
-    pWav->container = pFormat->container;
-    pWav->channels = (ma_uint16)pFormat->channels;
-    pWav->sampleRate = pFormat->sampleRate;
-    pWav->bitsPerSample = (ma_uint16)pFormat->bitsPerSample;
-    pWav->translatedFormatTag = (ma_uint16)pFormat->format;
-    pWav->dataChunkDataPos = runningPos;
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_wav_init_write(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_dr_wav_write_proc onWrite, ma_dr_wav_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (!ma_dr_wav_preinit_write(pWav, pFormat, MA_FALSE, onWrite, onSeek, pUserData, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_write__internal(pWav, pFormat, 0);
-}
-MA_API ma_bool32 ma_dr_wav_init_write_sequential(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, ma_dr_wav_write_proc onWrite, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (!ma_dr_wav_preinit_write(pWav, pFormat, MA_TRUE, onWrite, NULL, pUserData, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_write__internal(pWav, pFormat, totalSampleCount);
-}
-MA_API ma_bool32 ma_dr_wav_init_write_sequential_pcm_frames(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, ma_dr_wav_write_proc onWrite, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_write_sequential(pWav, pFormat, totalPCMFrameCount*pFormat->channels, onWrite, pUserData, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_write_with_metadata(ma_dr_wav* pWav, const ma_dr_wav_data_format* pFormat, ma_dr_wav_write_proc onWrite, ma_dr_wav_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks, ma_dr_wav_metadata* pMetadata, ma_uint32 metadataCount)
-{
-    if (!ma_dr_wav_preinit_write(pWav, pFormat, MA_FALSE, onWrite, onSeek, pUserData, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    pWav->pMetadata     = pMetadata;
-    pWav->metadataCount = metadataCount;
-    return ma_dr_wav_init_write__internal(pWav, pFormat, 0);
-}
-MA_API ma_uint64 ma_dr_wav_target_write_size_bytes(const ma_dr_wav_data_format* pFormat, ma_uint64 totalFrameCount, ma_dr_wav_metadata* pMetadata, ma_uint32 metadataCount)
-{
-    ma_uint64 targetDataSizeBytes = (ma_uint64)((ma_int64)totalFrameCount * pFormat->channels * pFormat->bitsPerSample/8.0);
-    ma_uint64 riffChunkSizeBytes;
-    ma_uint64 fileSizeBytes = 0;
-    if (pFormat->container == ma_dr_wav_container_riff) {
-        riffChunkSizeBytes = ma_dr_wav__riff_chunk_size_riff(targetDataSizeBytes, pMetadata, metadataCount);
-        fileSizeBytes = (8 + riffChunkSizeBytes);
-    } else if (pFormat->container == ma_dr_wav_container_w64) {
-        riffChunkSizeBytes = ma_dr_wav__riff_chunk_size_w64(targetDataSizeBytes);
-        fileSizeBytes = riffChunkSizeBytes;
-    } else if (pFormat->container == ma_dr_wav_container_rf64) {
-        riffChunkSizeBytes = ma_dr_wav__riff_chunk_size_rf64(targetDataSizeBytes, pMetadata, metadataCount);
-        fileSizeBytes = (8 + riffChunkSizeBytes);
-    }
-    return fileSizeBytes;
-}
-#ifndef MA_DR_WAV_NO_STDIO
-MA_PRIVATE size_t ma_dr_wav__on_read_stdio(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    return fread(pBufferOut, 1, bytesToRead, (FILE*)pUserData);
-}
-MA_PRIVATE size_t ma_dr_wav__on_write_stdio(void* pUserData, const void* pData, size_t bytesToWrite)
-{
-    return fwrite(pData, 1, bytesToWrite, (FILE*)pUserData);
-}
-MA_PRIVATE ma_bool32 ma_dr_wav__on_seek_stdio(void* pUserData, int offset, ma_dr_wav_seek_origin origin)
-{
-    return fseek((FILE*)pUserData, offset, (origin == ma_dr_wav_seek_origin_current) ? SEEK_CUR : SEEK_SET) == 0;
-}
-MA_API ma_bool32 ma_dr_wav_init_file(ma_dr_wav* pWav, const char* filename, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_file_ex(pWav, filename, NULL, NULL, 0, pAllocationCallbacks);
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_init_file__internal_FILE(ma_dr_wav* pWav, FILE* pFile, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_bool32 result;
-    result = ma_dr_wav_preinit(pWav, ma_dr_wav__on_read_stdio, ma_dr_wav__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (result != MA_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-    result = ma_dr_wav_init__internal(pWav, onChunk, pChunkUserData, flags);
-    if (result != MA_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_wav_init_file_ex(ma_dr_wav* pWav, const char* filename, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (ma_fopen(&pFile, filename, "rb") != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks);
-}
-#ifndef MA_DR_WAV_NO_WCHAR
-MA_API ma_bool32 ma_dr_wav_init_file_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_file_ex_w(pWav, filename, NULL, NULL, 0, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_file_ex_w(ma_dr_wav* pWav, const wchar_t* filename, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (ma_wfopen(&pFile, filename, L"rb", pAllocationCallbacks) != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks);
-}
-#endif
-MA_API ma_bool32 ma_dr_wav_init_file_with_metadata(ma_dr_wav* pWav, const char* filename, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (ma_fopen(&pFile, filename, "rb") != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file__internal_FILE(pWav, pFile, NULL, NULL, flags | MA_DR_WAV_WITH_METADATA, pAllocationCallbacks);
-}
-#ifndef MA_DR_WAV_NO_WCHAR
-MA_API ma_bool32 ma_dr_wav_init_file_with_metadata_w(ma_dr_wav* pWav, const wchar_t* filename, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (ma_wfopen(&pFile, filename, L"rb", pAllocationCallbacks) != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file__internal_FILE(pWav, pFile, NULL, NULL, flags | MA_DR_WAV_WITH_METADATA, pAllocationCallbacks);
-}
-#endif
-MA_PRIVATE ma_bool32 ma_dr_wav_init_file_write__internal_FILE(ma_dr_wav* pWav, FILE* pFile, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, ma_bool32 isSequential, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_bool32 result;
-    result = ma_dr_wav_preinit_write(pWav, pFormat, isSequential, ma_dr_wav__on_write_stdio, ma_dr_wav__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (result != MA_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-    result = ma_dr_wav_init_write__internal(pWav, pFormat, totalSampleCount);
-    if (result != MA_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-    return MA_TRUE;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_init_file_write__internal(ma_dr_wav* pWav, const char* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, ma_bool32 isSequential, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (ma_fopen(&pFile, filename, "wb") != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
-}
-#ifndef MA_DR_WAV_NO_WCHAR
-MA_PRIVATE ma_bool32 ma_dr_wav_init_file_write_w__internal(ma_dr_wav* pWav, const wchar_t* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, ma_bool32 isSequential, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (ma_wfopen(&pFile, filename, L"wb", pAllocationCallbacks) != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
-}
-#endif
-MA_API ma_bool32 ma_dr_wav_init_file_write(ma_dr_wav* pWav, const char* filename, const ma_dr_wav_data_format* pFormat, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_file_write__internal(pWav, filename, pFormat, 0, MA_FALSE, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential(ma_dr_wav* pWav, const char* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_file_write__internal(pWav, filename, pFormat, totalSampleCount, MA_TRUE, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential_pcm_frames(ma_dr_wav* pWav, const char* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file_write_sequential(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks);
-}
-#ifndef MA_DR_WAV_NO_WCHAR
-MA_API ma_bool32 ma_dr_wav_init_file_write_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_dr_wav_data_format* pFormat, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_file_write_w__internal(pWav, filename, pFormat, 0, MA_FALSE, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_file_write_w__internal(pWav, filename, pFormat, totalSampleCount, MA_TRUE, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_file_write_sequential_pcm_frames_w(ma_dr_wav* pWav, const wchar_t* filename, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_file_write_sequential_w(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks);
-}
-#endif
-#endif
-MA_PRIVATE size_t ma_dr_wav__on_read_memory(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    ma_dr_wav* pWav = (ma_dr_wav*)pUserData;
-    size_t bytesRemaining;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    MA_DR_WAV_ASSERT(pWav->memoryStream.dataSize >= pWav->memoryStream.currentReadPos);
-    bytesRemaining = pWav->memoryStream.dataSize - pWav->memoryStream.currentReadPos;
-    if (bytesToRead > bytesRemaining) {
-        bytesToRead = bytesRemaining;
-    }
-    if (bytesToRead > 0) {
-        MA_DR_WAV_COPY_MEMORY(pBufferOut, pWav->memoryStream.data + pWav->memoryStream.currentReadPos, bytesToRead);
-        pWav->memoryStream.currentReadPos += bytesToRead;
-    }
-    return bytesToRead;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav__on_seek_memory(void* pUserData, int offset, ma_dr_wav_seek_origin origin)
-{
-    ma_dr_wav* pWav = (ma_dr_wav*)pUserData;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    if (origin == ma_dr_wav_seek_origin_current) {
-        if (offset > 0) {
-            if (pWav->memoryStream.currentReadPos + offset > pWav->memoryStream.dataSize) {
-                return MA_FALSE;
-            }
-        } else {
-            if (pWav->memoryStream.currentReadPos < (size_t)-offset) {
-                return MA_FALSE;
-            }
-        }
-        pWav->memoryStream.currentReadPos += offset;
-    } else {
-        if ((ma_uint32)offset <= pWav->memoryStream.dataSize) {
-            pWav->memoryStream.currentReadPos = offset;
-        } else {
-            return MA_FALSE;
-        }
-    }
-    return MA_TRUE;
-}
-MA_PRIVATE size_t ma_dr_wav__on_write_memory(void* pUserData, const void* pDataIn, size_t bytesToWrite)
-{
-    ma_dr_wav* pWav = (ma_dr_wav*)pUserData;
-    size_t bytesRemaining;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    MA_DR_WAV_ASSERT(pWav->memoryStreamWrite.dataCapacity >= pWav->memoryStreamWrite.currentWritePos);
-    bytesRemaining = pWav->memoryStreamWrite.dataCapacity - pWav->memoryStreamWrite.currentWritePos;
-    if (bytesRemaining < bytesToWrite) {
-        void* pNewData;
-        size_t newDataCapacity = (pWav->memoryStreamWrite.dataCapacity == 0) ? 256 : pWav->memoryStreamWrite.dataCapacity * 2;
-        if ((newDataCapacity - pWav->memoryStreamWrite.currentWritePos) < bytesToWrite) {
-            newDataCapacity = pWav->memoryStreamWrite.currentWritePos + bytesToWrite;
-        }
-        pNewData = ma_dr_wav__realloc_from_callbacks(*pWav->memoryStreamWrite.ppData, newDataCapacity, pWav->memoryStreamWrite.dataCapacity, &pWav->allocationCallbacks);
-        if (pNewData == NULL) {
-            return 0;
-        }
-        *pWav->memoryStreamWrite.ppData = pNewData;
-        pWav->memoryStreamWrite.dataCapacity = newDataCapacity;
-    }
-    MA_DR_WAV_COPY_MEMORY(((ma_uint8*)(*pWav->memoryStreamWrite.ppData)) + pWav->memoryStreamWrite.currentWritePos, pDataIn, bytesToWrite);
-    pWav->memoryStreamWrite.currentWritePos += bytesToWrite;
-    if (pWav->memoryStreamWrite.dataSize < pWav->memoryStreamWrite.currentWritePos) {
-        pWav->memoryStreamWrite.dataSize = pWav->memoryStreamWrite.currentWritePos;
-    }
-    *pWav->memoryStreamWrite.pDataSize = pWav->memoryStreamWrite.dataSize;
-    return bytesToWrite;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav__on_seek_memory_write(void* pUserData, int offset, ma_dr_wav_seek_origin origin)
-{
-    ma_dr_wav* pWav = (ma_dr_wav*)pUserData;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    if (origin == ma_dr_wav_seek_origin_current) {
-        if (offset > 0) {
-            if (pWav->memoryStreamWrite.currentWritePos + offset > pWav->memoryStreamWrite.dataSize) {
-                offset = (int)(pWav->memoryStreamWrite.dataSize - pWav->memoryStreamWrite.currentWritePos);
-            }
-        } else {
-            if (pWav->memoryStreamWrite.currentWritePos < (size_t)-offset) {
-                offset = -(int)pWav->memoryStreamWrite.currentWritePos;
-            }
-        }
-        pWav->memoryStreamWrite.currentWritePos += offset;
-    } else {
-        if ((ma_uint32)offset <= pWav->memoryStreamWrite.dataSize) {
-            pWav->memoryStreamWrite.currentWritePos = offset;
-        } else {
-            pWav->memoryStreamWrite.currentWritePos = pWav->memoryStreamWrite.dataSize;
-        }
-    }
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_wav_init_memory(ma_dr_wav* pWav, const void* data, size_t dataSize, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_memory_ex(pWav, data, dataSize, NULL, NULL, 0, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_memory_ex(ma_dr_wav* pWav, const void* data, size_t dataSize, ma_dr_wav_chunk_proc onChunk, void* pChunkUserData, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (data == NULL || dataSize == 0) {
-        return MA_FALSE;
-    }
-    if (!ma_dr_wav_preinit(pWav, ma_dr_wav__on_read_memory, ma_dr_wav__on_seek_memory, pWav, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    pWav->memoryStream.data = (const ma_uint8*)data;
-    pWav->memoryStream.dataSize = dataSize;
-    pWav->memoryStream.currentReadPos = 0;
-    return ma_dr_wav_init__internal(pWav, onChunk, pChunkUserData, flags);
-}
-MA_API ma_bool32 ma_dr_wav_init_memory_with_metadata(ma_dr_wav* pWav, const void* data, size_t dataSize, ma_uint32 flags, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (data == NULL || dataSize == 0) {
-        return MA_FALSE;
-    }
-    if (!ma_dr_wav_preinit(pWav, ma_dr_wav__on_read_memory, ma_dr_wav__on_seek_memory, pWav, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    pWav->memoryStream.data = (const ma_uint8*)data;
-    pWav->memoryStream.dataSize = dataSize;
-    pWav->memoryStream.currentReadPos = 0;
-    return ma_dr_wav_init__internal(pWav, NULL, NULL, flags | MA_DR_WAV_WITH_METADATA);
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_init_memory_write__internal(ma_dr_wav* pWav, void** ppData, size_t* pDataSize, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, ma_bool32 isSequential, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (ppData == NULL || pDataSize == NULL) {
-        return MA_FALSE;
-    }
-    *ppData = NULL;
-    *pDataSize = 0;
-    if (!ma_dr_wav_preinit_write(pWav, pFormat, isSequential, ma_dr_wav__on_write_memory, ma_dr_wav__on_seek_memory_write, pWav, pAllocationCallbacks)) {
-        return MA_FALSE;
-    }
-    pWav->memoryStreamWrite.ppData = ppData;
-    pWav->memoryStreamWrite.pDataSize = pDataSize;
-    pWav->memoryStreamWrite.dataSize = 0;
-    pWav->memoryStreamWrite.dataCapacity = 0;
-    pWav->memoryStreamWrite.currentWritePos = 0;
-    return ma_dr_wav_init_write__internal(pWav, pFormat, totalSampleCount);
-}
-MA_API ma_bool32 ma_dr_wav_init_memory_write(ma_dr_wav* pWav, void** ppData, size_t* pDataSize, const ma_dr_wav_data_format* pFormat, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, 0, MA_FALSE, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_memory_write_sequential(ma_dr_wav* pWav, void** ppData, size_t* pDataSize, const ma_dr_wav_data_format* pFormat, ma_uint64 totalSampleCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_wav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, totalSampleCount, MA_TRUE, pAllocationCallbacks);
-}
-MA_API ma_bool32 ma_dr_wav_init_memory_write_sequential_pcm_frames(ma_dr_wav* pWav, void** ppData, size_t* pDataSize, const ma_dr_wav_data_format* pFormat, ma_uint64 totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return MA_FALSE;
-    }
-    return ma_dr_wav_init_memory_write_sequential(pWav, ppData, pDataSize, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks);
-}
-MA_API ma_result ma_dr_wav_uninit(ma_dr_wav* pWav)
-{
-    ma_result result = MA_SUCCESS;
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-    if (pWav->onWrite != NULL) {
-        ma_uint32 paddingSize = 0;
-        if (pWav->container == ma_dr_wav_container_riff || pWav->container == ma_dr_wav_container_rf64) {
-            paddingSize = ma_dr_wav__chunk_padding_size_riff(pWav->dataChunkDataSize);
-        } else {
-            paddingSize = ma_dr_wav__chunk_padding_size_w64(pWav->dataChunkDataSize);
-        }
-        if (paddingSize > 0) {
-            ma_uint64 paddingData = 0;
-            ma_dr_wav__write(pWav, &paddingData, paddingSize);
-        }
-        if (pWav->onSeek && !pWav->isSequentialWrite) {
-            if (pWav->container == ma_dr_wav_container_riff) {
-                if (pWav->onSeek(pWav->pUserData, 4, ma_dr_wav_seek_origin_start)) {
-                    ma_uint32 riffChunkSize = ma_dr_wav__riff_chunk_size_riff(pWav->dataChunkDataSize, pWav->pMetadata, pWav->metadataCount);
-                    ma_dr_wav__write_u32ne_to_le(pWav, riffChunkSize);
-                }
-                if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos - 4, ma_dr_wav_seek_origin_start)) {
-                    ma_uint32 dataChunkSize = ma_dr_wav__data_chunk_size_riff(pWav->dataChunkDataSize);
-                    ma_dr_wav__write_u32ne_to_le(pWav, dataChunkSize);
-                }
-            } else if (pWav->container == ma_dr_wav_container_w64) {
-                if (pWav->onSeek(pWav->pUserData, 16, ma_dr_wav_seek_origin_start)) {
-                    ma_uint64 riffChunkSize = ma_dr_wav__riff_chunk_size_w64(pWav->dataChunkDataSize);
-                    ma_dr_wav__write_u64ne_to_le(pWav, riffChunkSize);
-                }
-                if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos - 8, ma_dr_wav_seek_origin_start)) {
-                    ma_uint64 dataChunkSize = ma_dr_wav__data_chunk_size_w64(pWav->dataChunkDataSize);
-                    ma_dr_wav__write_u64ne_to_le(pWav, dataChunkSize);
-                }
-            } else if (pWav->container == ma_dr_wav_container_rf64) {
-                int ds64BodyPos = 12 + 8;
-                if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 0, ma_dr_wav_seek_origin_start)) {
-                    ma_uint64 riffChunkSize = ma_dr_wav__riff_chunk_size_rf64(pWav->dataChunkDataSize, pWav->pMetadata, pWav->metadataCount);
-                    ma_dr_wav__write_u64ne_to_le(pWav, riffChunkSize);
-                }
-                if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 8, ma_dr_wav_seek_origin_start)) {
-                    ma_uint64 dataChunkSize = ma_dr_wav__data_chunk_size_rf64(pWav->dataChunkDataSize);
-                    ma_dr_wav__write_u64ne_to_le(pWav, dataChunkSize);
-                }
-            }
-        }
-        if (pWav->isSequentialWrite) {
-            if (pWav->dataChunkDataSize != pWav->dataChunkDataSizeTargetWrite) {
-                result = MA_INVALID_FILE;
-            }
-        }
-    } else {
-        ma_dr_wav_free(pWav->pMetadata, &pWav->allocationCallbacks);
-    }
-#ifndef MA_DR_WAV_NO_STDIO
-    if (pWav->onRead == ma_dr_wav__on_read_stdio || pWav->onWrite == ma_dr_wav__on_write_stdio) {
-        fclose((FILE*)pWav->pUserData);
-    }
-#endif
-    return result;
-}
-MA_API size_t ma_dr_wav_read_raw(ma_dr_wav* pWav, size_t bytesToRead, void* pBufferOut)
-{
-    size_t bytesRead;
-    ma_uint32 bytesPerFrame;
-    if (pWav == NULL || bytesToRead == 0) {
-        return 0;
-    }
-    if (bytesToRead > pWav->bytesRemaining) {
-        bytesToRead = (size_t)pWav->bytesRemaining;
-    }
-    if (bytesToRead == 0) {
-        return 0;
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    if (pBufferOut != NULL) {
-        bytesRead = pWav->onRead(pWav->pUserData, pBufferOut, bytesToRead);
-    } else {
-        bytesRead = 0;
-        while (bytesRead < bytesToRead) {
-            size_t bytesToSeek = (bytesToRead - bytesRead);
-            if (bytesToSeek > 0x7FFFFFFF) {
-                bytesToSeek = 0x7FFFFFFF;
-            }
-            if (pWav->onSeek(pWav->pUserData, (int)bytesToSeek, ma_dr_wav_seek_origin_current) == MA_FALSE) {
-                break;
-            }
-            bytesRead += bytesToSeek;
-        }
-        while (bytesRead < bytesToRead) {
-            ma_uint8 buffer[4096];
-            size_t bytesSeeked;
-            size_t bytesToSeek = (bytesToRead - bytesRead);
-            if (bytesToSeek > sizeof(buffer)) {
-                bytesToSeek = sizeof(buffer);
-            }
-            bytesSeeked = pWav->onRead(pWav->pUserData, buffer, bytesToSeek);
-            bytesRead += bytesSeeked;
-            if (bytesSeeked < bytesToSeek) {
-                break;
-            }
-        }
-    }
-    pWav->readCursorInPCMFrames += bytesRead / bytesPerFrame;
-    pWav->bytesRemaining -= bytesRead;
-    return bytesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_le(ma_dr_wav* pWav, ma_uint64 framesToRead, void* pBufferOut)
-{
-    ma_uint32 bytesPerFrame;
-    ma_uint64 bytesToRead;
-    ma_uint64 framesRemainingInFile;
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-    if (ma_dr_wav__is_compressed_format_tag(pWav->translatedFormatTag)) {
-        return 0;
-    }
-    framesRemainingInFile = pWav->totalPCMFrameCount - pWav->readCursorInPCMFrames;
-    if (framesToRead > framesRemainingInFile) {
-        framesToRead = framesRemainingInFile;
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesToRead = framesToRead * bytesPerFrame;
-    if (bytesToRead > MA_SIZE_MAX) {
-        bytesToRead = (MA_SIZE_MAX / bytesPerFrame) * bytesPerFrame;
-    }
-    if (bytesToRead == 0) {
-        return 0;
-    }
-    return ma_dr_wav_read_raw(pWav, (size_t)bytesToRead, pBufferOut) / bytesPerFrame;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_be(ma_dr_wav* pWav, ma_uint64 framesToRead, void* pBufferOut)
-{
-    ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL) {
-        ma_uint32 bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-        if (bytesPerFrame == 0) {
-            return 0;
-        }
-        ma_dr_wav__bswap_samples(pBufferOut, framesRead*pWav->channels, bytesPerFrame/pWav->channels);
-    }
-    return framesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames(ma_dr_wav* pWav, ma_uint64 framesToRead, void* pBufferOut)
-{
-    ma_uint64 framesRead = 0;
-    if (ma_dr_wav_is_container_be(pWav->container)) {
-        if (pWav->container != ma_dr_wav_container_aiff || pWav->aiff.isLE == MA_FALSE) {
-            if (ma_dr_wav__is_little_endian()) {
-                framesRead = ma_dr_wav_read_pcm_frames_be(pWav, framesToRead, pBufferOut);
-            } else {
-                framesRead = ma_dr_wav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
-            }
-            goto post_process;
-        }
-    }
-    if (ma_dr_wav__is_little_endian()) {
-        framesRead = ma_dr_wav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
-    } else {
-        framesRead = ma_dr_wav_read_pcm_frames_be(pWav, framesToRead, pBufferOut);
-    }
-    post_process:
-    {
-        if (pWav->container == ma_dr_wav_container_aiff && pWav->bitsPerSample == 8 && pWav->aiff.isUnsigned == MA_FALSE) {
-            if (pBufferOut != NULL) {
-                ma_uint64 iSample;
-                for (iSample = 0; iSample < framesRead * pWav->channels; iSample += 1) {
-                    ((ma_uint8*)pBufferOut)[iSample] += 128;
-                }
-            }
-        }
-    }
-    return framesRead;
-}
-MA_PRIVATE ma_bool32 ma_dr_wav_seek_to_first_pcm_frame(ma_dr_wav* pWav)
-{
-    if (pWav->onWrite != NULL) {
-        return MA_FALSE;
-    }
-    if (!pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos, ma_dr_wav_seek_origin_start)) {
-        return MA_FALSE;
-    }
-    if (ma_dr_wav__is_compressed_format_tag(pWav->translatedFormatTag)) {
-        if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM) {
-            MA_DR_WAV_ZERO_OBJECT(&pWav->msadpcm);
-        } else if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-            MA_DR_WAV_ZERO_OBJECT(&pWav->ima);
-        } else {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-        }
-    }
-    pWav->readCursorInPCMFrames = 0;
-    pWav->bytesRemaining = pWav->dataChunkDataSize;
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_wav_seek_to_pcm_frame(ma_dr_wav* pWav, ma_uint64 targetFrameIndex)
-{
-    if (pWav == NULL || pWav->onSeek == NULL) {
-        return MA_FALSE;
-    }
-    if (pWav->onWrite != NULL) {
-        return MA_FALSE;
-    }
-    if (pWav->totalPCMFrameCount == 0) {
-        return MA_TRUE;
-    }
-    if (targetFrameIndex > pWav->totalPCMFrameCount) {
-        targetFrameIndex = pWav->totalPCMFrameCount;
-    }
-    if (ma_dr_wav__is_compressed_format_tag(pWav->translatedFormatTag)) {
-        if (targetFrameIndex < pWav->readCursorInPCMFrames) {
-            if (!ma_dr_wav_seek_to_first_pcm_frame(pWav)) {
-                return MA_FALSE;
-            }
-        }
-        if (targetFrameIndex > pWav->readCursorInPCMFrames) {
-            ma_uint64 offsetInFrames = targetFrameIndex - pWav->readCursorInPCMFrames;
-            ma_int16 devnull[2048];
-            while (offsetInFrames > 0) {
-                ma_uint64 framesRead = 0;
-                ma_uint64 framesToRead = offsetInFrames;
-                if (framesToRead > ma_dr_wav_countof(devnull)/pWav->channels) {
-                    framesToRead = ma_dr_wav_countof(devnull)/pWav->channels;
-                }
-                if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM) {
-                    framesRead = ma_dr_wav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, devnull);
-                } else if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-                    framesRead = ma_dr_wav_read_pcm_frames_s16__ima(pWav, framesToRead, devnull);
-                } else {
-                    MA_DR_WAV_ASSERT(MA_FALSE);
-                }
-                if (framesRead != framesToRead) {
-                    return MA_FALSE;
-                }
-                offsetInFrames -= framesRead;
-            }
-        }
-    } else {
-        ma_uint64 totalSizeInBytes;
-        ma_uint64 currentBytePos;
-        ma_uint64 targetBytePos;
-        ma_uint64 offset;
-        ma_uint32 bytesPerFrame;
-        bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-        if (bytesPerFrame == 0) {
-            return MA_FALSE;
-        }
-        totalSizeInBytes = pWav->totalPCMFrameCount * bytesPerFrame;
-        currentBytePos = totalSizeInBytes - pWav->bytesRemaining;
-        targetBytePos  = targetFrameIndex * bytesPerFrame;
-        if (currentBytePos < targetBytePos) {
-            offset = (targetBytePos - currentBytePos);
-        } else {
-            if (!ma_dr_wav_seek_to_first_pcm_frame(pWav)) {
-                return MA_FALSE;
-            }
-            offset = targetBytePos;
-        }
-        while (offset > 0) {
-            int offset32 = ((offset > INT_MAX) ? INT_MAX : (int)offset);
-            if (!pWav->onSeek(pWav->pUserData, offset32, ma_dr_wav_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            pWav->readCursorInPCMFrames += offset32 / bytesPerFrame;
-            pWav->bytesRemaining        -= offset32;
-            offset                      -= offset32;
-        }
-    }
-    return MA_TRUE;
-}
-MA_API ma_result ma_dr_wav_get_cursor_in_pcm_frames(ma_dr_wav* pWav, ma_uint64* pCursor)
-{
-    if (pCursor == NULL) {
-        return MA_INVALID_ARGS;
-    }
-    *pCursor = 0;
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-    *pCursor = pWav->readCursorInPCMFrames;
-    return MA_SUCCESS;
-}
-MA_API ma_result ma_dr_wav_get_length_in_pcm_frames(ma_dr_wav* pWav, ma_uint64* pLength)
-{
-    if (pLength == NULL) {
-        return MA_INVALID_ARGS;
-    }
-    *pLength = 0;
-    if (pWav == NULL) {
-        return MA_INVALID_ARGS;
-    }
-    *pLength = pWav->totalPCMFrameCount;
-    return MA_SUCCESS;
-}
-MA_API size_t ma_dr_wav_write_raw(ma_dr_wav* pWav, size_t bytesToWrite, const void* pData)
-{
-    size_t bytesWritten;
-    if (pWav == NULL || bytesToWrite == 0 || pData == NULL) {
-        return 0;
-    }
-    bytesWritten = pWav->onWrite(pWav->pUserData, pData, bytesToWrite);
-    pWav->dataChunkDataSize += bytesWritten;
-    return bytesWritten;
-}
-MA_API ma_uint64 ma_dr_wav_write_pcm_frames_le(ma_dr_wav* pWav, ma_uint64 framesToWrite, const void* pData)
-{
-    ma_uint64 bytesToWrite;
-    ma_uint64 bytesWritten;
-    const ma_uint8* pRunningData;
-    if (pWav == NULL || framesToWrite == 0 || pData == NULL) {
-        return 0;
-    }
-    bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8);
-    if (bytesToWrite > MA_SIZE_MAX) {
-        return 0;
-    }
-    bytesWritten = 0;
-    pRunningData = (const ma_uint8*)pData;
-    while (bytesToWrite > 0) {
-        size_t bytesJustWritten;
-        ma_uint64 bytesToWriteThisIteration;
-        bytesToWriteThisIteration = bytesToWrite;
-        MA_DR_WAV_ASSERT(bytesToWriteThisIteration <= MA_SIZE_MAX);
-        bytesJustWritten = ma_dr_wav_write_raw(pWav, (size_t)bytesToWriteThisIteration, pRunningData);
-        if (bytesJustWritten == 0) {
-            break;
-        }
-        bytesToWrite -= bytesJustWritten;
-        bytesWritten += bytesJustWritten;
-        pRunningData += bytesJustWritten;
-    }
-    return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels;
-}
-MA_API ma_uint64 ma_dr_wav_write_pcm_frames_be(ma_dr_wav* pWav, ma_uint64 framesToWrite, const void* pData)
-{
-    ma_uint64 bytesToWrite;
-    ma_uint64 bytesWritten;
-    ma_uint32 bytesPerSample;
-    const ma_uint8* pRunningData;
-    if (pWav == NULL || framesToWrite == 0 || pData == NULL) {
-        return 0;
-    }
-    bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8);
-    if (bytesToWrite > MA_SIZE_MAX) {
-        return 0;
-    }
-    bytesWritten = 0;
-    pRunningData = (const ma_uint8*)pData;
-    bytesPerSample = ma_dr_wav_get_bytes_per_pcm_frame(pWav) / pWav->channels;
-    if (bytesPerSample == 0) {
-        return 0;
-    }
-    while (bytesToWrite > 0) {
-        ma_uint8 temp[4096];
-        ma_uint32 sampleCount;
-        size_t bytesJustWritten;
-        ma_uint64 bytesToWriteThisIteration;
-        bytesToWriteThisIteration = bytesToWrite;
-        MA_DR_WAV_ASSERT(bytesToWriteThisIteration <= MA_SIZE_MAX);
-        sampleCount = sizeof(temp)/bytesPerSample;
-        if (bytesToWriteThisIteration > ((ma_uint64)sampleCount)*bytesPerSample) {
-            bytesToWriteThisIteration = ((ma_uint64)sampleCount)*bytesPerSample;
-        }
-        MA_DR_WAV_COPY_MEMORY(temp, pRunningData, (size_t)bytesToWriteThisIteration);
-        ma_dr_wav__bswap_samples(temp, sampleCount, bytesPerSample);
-        bytesJustWritten = ma_dr_wav_write_raw(pWav, (size_t)bytesToWriteThisIteration, temp);
-        if (bytesJustWritten == 0) {
-            break;
-        }
-        bytesToWrite -= bytesJustWritten;
-        bytesWritten += bytesJustWritten;
-        pRunningData += bytesJustWritten;
-    }
-    return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels;
-}
-MA_API ma_uint64 ma_dr_wav_write_pcm_frames(ma_dr_wav* pWav, ma_uint64 framesToWrite, const void* pData)
-{
-    if (ma_dr_wav__is_little_endian()) {
-        return ma_dr_wav_write_pcm_frames_le(pWav, framesToWrite, pData);
-    } else {
-        return ma_dr_wav_write_pcm_frames_be(pWav, framesToWrite, pData);
-    }
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__msadpcm(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 totalFramesRead = 0;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    MA_DR_WAV_ASSERT(framesToRead > 0);
-    while (pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
-        MA_DR_WAV_ASSERT(framesToRead > 0);
-        if (pWav->msadpcm.cachedFrameCount == 0 && pWav->msadpcm.bytesRemainingInBlock == 0) {
-            if (pWav->channels == 1) {
-                ma_uint8 header[7];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-                pWav->msadpcm.predictor[0]     = header[0];
-                pWav->msadpcm.delta[0]         = ma_dr_wav_bytes_to_s16(header + 1);
-                pWav->msadpcm.prevFrames[0][1] = (ma_int32)ma_dr_wav_bytes_to_s16(header + 3);
-                pWav->msadpcm.prevFrames[0][0] = (ma_int32)ma_dr_wav_bytes_to_s16(header + 5);
-                pWav->msadpcm.cachedFrames[2]  = pWav->msadpcm.prevFrames[0][0];
-                pWav->msadpcm.cachedFrames[3]  = pWav->msadpcm.prevFrames[0][1];
-                pWav->msadpcm.cachedFrameCount = 2;
-            } else {
-                ma_uint8 header[14];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-                pWav->msadpcm.predictor[0] = header[0];
-                pWav->msadpcm.predictor[1] = header[1];
-                pWav->msadpcm.delta[0] = ma_dr_wav_bytes_to_s16(header + 2);
-                pWav->msadpcm.delta[1] = ma_dr_wav_bytes_to_s16(header + 4);
-                pWav->msadpcm.prevFrames[0][1] = (ma_int32)ma_dr_wav_bytes_to_s16(header + 6);
-                pWav->msadpcm.prevFrames[1][1] = (ma_int32)ma_dr_wav_bytes_to_s16(header + 8);
-                pWav->msadpcm.prevFrames[0][0] = (ma_int32)ma_dr_wav_bytes_to_s16(header + 10);
-                pWav->msadpcm.prevFrames[1][0] = (ma_int32)ma_dr_wav_bytes_to_s16(header + 12);
-                pWav->msadpcm.cachedFrames[0] = pWav->msadpcm.prevFrames[0][0];
-                pWav->msadpcm.cachedFrames[1] = pWav->msadpcm.prevFrames[1][0];
-                pWav->msadpcm.cachedFrames[2] = pWav->msadpcm.prevFrames[0][1];
-                pWav->msadpcm.cachedFrames[3] = pWav->msadpcm.prevFrames[1][1];
-                pWav->msadpcm.cachedFrameCount = 2;
-            }
-        }
-        while (framesToRead > 0 && pWav->msadpcm.cachedFrameCount > 0 && pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
-            if (pBufferOut != NULL) {
-                ma_uint32 iSample = 0;
-                for (iSample = 0; iSample < pWav->channels; iSample += 1) {
-                    pBufferOut[iSample] = (ma_int16)pWav->msadpcm.cachedFrames[(ma_dr_wav_countof(pWav->msadpcm.cachedFrames) - (pWav->msadpcm.cachedFrameCount*pWav->channels)) + iSample];
-                }
-                pBufferOut += pWav->channels;
-            }
-            framesToRead    -= 1;
-            totalFramesRead += 1;
-            pWav->readCursorInPCMFrames += 1;
-            pWav->msadpcm.cachedFrameCount -= 1;
-        }
-        if (framesToRead == 0) {
-            break;
-        }
-        if (pWav->msadpcm.cachedFrameCount == 0) {
-            if (pWav->msadpcm.bytesRemainingInBlock == 0) {
-                continue;
-            } else {
-                static ma_int32 adaptationTable[] = {
-                    230, 230, 230, 230, 307, 409, 512, 614,
-                    768, 614, 512, 409, 307, 230, 230, 230
-                };
-                static ma_int32 coeff1Table[] = { 256, 512, 0, 192, 240, 460,  392 };
-                static ma_int32 coeff2Table[] = { 0,  -256, 0, 64,  0,  -208, -232 };
-                ma_uint8 nibbles;
-                ma_int32 nibble0;
-                ma_int32 nibble1;
-                if (pWav->onRead(pWav->pUserData, &nibbles, 1) != 1) {
-                    return totalFramesRead;
-                }
-                pWav->msadpcm.bytesRemainingInBlock -= 1;
-                nibble0 = ((nibbles & 0xF0) >> 4); if ((nibbles & 0x80)) { nibble0 |= 0xFFFFFFF0UL; }
-                nibble1 = ((nibbles & 0x0F) >> 0); if ((nibbles & 0x08)) { nibble1 |= 0xFFFFFFF0UL; }
-                if (pWav->channels == 1) {
-                    ma_int32 newSample0;
-                    ma_int32 newSample1;
-                    newSample0  = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
-                    newSample0 += nibble0 * pWav->msadpcm.delta[0];
-                    newSample0  = ma_dr_wav_clamp(newSample0, -32768, 32767);
-                    pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8;
-                    if (pWav->msadpcm.delta[0] < 16) {
-                        pWav->msadpcm.delta[0] = 16;
-                    }
-                    pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
-                    pWav->msadpcm.prevFrames[0][1] = newSample0;
-                    newSample1  = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
-                    newSample1 += nibble1 * pWav->msadpcm.delta[0];
-                    newSample1  = ma_dr_wav_clamp(newSample1, -32768, 32767);
-                    pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[0]) >> 8;
-                    if (pWav->msadpcm.delta[0] < 16) {
-                        pWav->msadpcm.delta[0] = 16;
-                    }
-                    pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
-                    pWav->msadpcm.prevFrames[0][1] = newSample1;
-                    pWav->msadpcm.cachedFrames[2] = newSample0;
-                    pWav->msadpcm.cachedFrames[3] = newSample1;
-                    pWav->msadpcm.cachedFrameCount = 2;
-                } else {
-                    ma_int32 newSample0;
-                    ma_int32 newSample1;
-                    newSample0  = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
-                    newSample0 += nibble0 * pWav->msadpcm.delta[0];
-                    newSample0  = ma_dr_wav_clamp(newSample0, -32768, 32767);
-                    pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8;
-                    if (pWav->msadpcm.delta[0] < 16) {
-                        pWav->msadpcm.delta[0] = 16;
-                    }
-                    pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
-                    pWav->msadpcm.prevFrames[0][1] = newSample0;
-                    newSample1  = ((pWav->msadpcm.prevFrames[1][1] * coeff1Table[pWav->msadpcm.predictor[1]]) + (pWav->msadpcm.prevFrames[1][0] * coeff2Table[pWav->msadpcm.predictor[1]])) >> 8;
-                    newSample1 += nibble1 * pWav->msadpcm.delta[1];
-                    newSample1  = ma_dr_wav_clamp(newSample1, -32768, 32767);
-                    pWav->msadpcm.delta[1] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[1]) >> 8;
-                    if (pWav->msadpcm.delta[1] < 16) {
-                        pWav->msadpcm.delta[1] = 16;
-                    }
-                    pWav->msadpcm.prevFrames[1][0] = pWav->msadpcm.prevFrames[1][1];
-                    pWav->msadpcm.prevFrames[1][1] = newSample1;
-                    pWav->msadpcm.cachedFrames[2] = newSample0;
-                    pWav->msadpcm.cachedFrames[3] = newSample1;
-                    pWav->msadpcm.cachedFrameCount = 1;
-                }
-            }
-        }
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__ima(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 totalFramesRead = 0;
-    ma_uint32 iChannel;
-    static ma_int32 indexTable[16] = {
-        -1, -1, -1, -1, 2, 4, 6, 8,
-        -1, -1, -1, -1, 2, 4, 6, 8
-    };
-    static ma_int32 stepTable[89] = {
-        7,     8,     9,     10,    11,    12,    13,    14,    16,    17,
-        19,    21,    23,    25,    28,    31,    34,    37,    41,    45,
-        50,    55,    60,    66,    73,    80,    88,    97,    107,   118,
-        130,   143,   157,   173,   190,   209,   230,   253,   279,   307,
-        337,   371,   408,   449,   494,   544,   598,   658,   724,   796,
-        876,   963,   1060,  1166,  1282,  1411,  1552,  1707,  1878,  2066,
-        2272,  2499,  2749,  3024,  3327,  3660,  4026,  4428,  4871,  5358,
-        5894,  6484,  7132,  7845,  8630,  9493,  10442, 11487, 12635, 13899,
-        15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767
-    };
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    MA_DR_WAV_ASSERT(framesToRead > 0);
-    while (pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
-        MA_DR_WAV_ASSERT(framesToRead > 0);
-        if (pWav->ima.cachedFrameCount == 0 && pWav->ima.bytesRemainingInBlock == 0) {
-            if (pWav->channels == 1) {
-                ma_uint8 header[4];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-                if (header[2] >= ma_dr_wav_countof(stepTable)) {
-                    pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, ma_dr_wav_seek_origin_current);
-                    pWav->ima.bytesRemainingInBlock = 0;
-                    return totalFramesRead;
-                }
-                pWav->ima.predictor[0] = (ma_int16)ma_dr_wav_bytes_to_u16(header + 0);
-                pWav->ima.stepIndex[0] = ma_dr_wav_clamp(header[2], 0, (ma_int32)ma_dr_wav_countof(stepTable)-1);
-                pWav->ima.cachedFrames[ma_dr_wav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[0];
-                pWav->ima.cachedFrameCount = 1;
-            } else {
-                ma_uint8 header[8];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-                if (header[2] >= ma_dr_wav_countof(stepTable) || header[6] >= ma_dr_wav_countof(stepTable)) {
-                    pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, ma_dr_wav_seek_origin_current);
-                    pWav->ima.bytesRemainingInBlock = 0;
-                    return totalFramesRead;
-                }
-                pWav->ima.predictor[0] = ma_dr_wav_bytes_to_s16(header + 0);
-                pWav->ima.stepIndex[0] = ma_dr_wav_clamp(header[2], 0, (ma_int32)ma_dr_wav_countof(stepTable)-1);
-                pWav->ima.predictor[1] = ma_dr_wav_bytes_to_s16(header + 4);
-                pWav->ima.stepIndex[1] = ma_dr_wav_clamp(header[6], 0, (ma_int32)ma_dr_wav_countof(stepTable)-1);
-                pWav->ima.cachedFrames[ma_dr_wav_countof(pWav->ima.cachedFrames) - 2] = pWav->ima.predictor[0];
-                pWav->ima.cachedFrames[ma_dr_wav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[1];
-                pWav->ima.cachedFrameCount = 1;
-            }
-        }
-        while (framesToRead > 0 && pWav->ima.cachedFrameCount > 0 && pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
-            if (pBufferOut != NULL) {
-                ma_uint32 iSample;
-                for (iSample = 0; iSample < pWav->channels; iSample += 1) {
-                    pBufferOut[iSample] = (ma_int16)pWav->ima.cachedFrames[(ma_dr_wav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + iSample];
-                }
-                pBufferOut += pWav->channels;
-            }
-            framesToRead    -= 1;
-            totalFramesRead += 1;
-            pWav->readCursorInPCMFrames += 1;
-            pWav->ima.cachedFrameCount -= 1;
-        }
-        if (framesToRead == 0) {
-            break;
-        }
-        if (pWav->ima.cachedFrameCount == 0) {
-            if (pWav->ima.bytesRemainingInBlock == 0) {
-                continue;
-            } else {
-                pWav->ima.cachedFrameCount = 8;
-                for (iChannel = 0; iChannel < pWav->channels; ++iChannel) {
-                    ma_uint32 iByte;
-                    ma_uint8 nibbles[4];
-                    if (pWav->onRead(pWav->pUserData, &nibbles, 4) != 4) {
-                        pWav->ima.cachedFrameCount = 0;
-                        return totalFramesRead;
-                    }
-                    pWav->ima.bytesRemainingInBlock -= 4;
-                    for (iByte = 0; iByte < 4; ++iByte) {
-                        ma_uint8 nibble0 = ((nibbles[iByte] & 0x0F) >> 0);
-                        ma_uint8 nibble1 = ((nibbles[iByte] & 0xF0) >> 4);
-                        ma_int32 step      = stepTable[pWav->ima.stepIndex[iChannel]];
-                        ma_int32 predictor = pWav->ima.predictor[iChannel];
-                        ma_int32      diff  = step >> 3;
-                        if (nibble0 & 1) diff += step >> 2;
-                        if (nibble0 & 2) diff += step >> 1;
-                        if (nibble0 & 4) diff += step;
-                        if (nibble0 & 8) diff  = -diff;
-                        predictor = ma_dr_wav_clamp(predictor + diff, -32768, 32767);
-                        pWav->ima.predictor[iChannel] = predictor;
-                        pWav->ima.stepIndex[iChannel] = ma_dr_wav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble0], 0, (ma_int32)ma_dr_wav_countof(stepTable)-1);
-                        pWav->ima.cachedFrames[(ma_dr_wav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+0)*pWav->channels + iChannel] = predictor;
-                        step      = stepTable[pWav->ima.stepIndex[iChannel]];
-                        predictor = pWav->ima.predictor[iChannel];
-                                         diff  = step >> 3;
-                        if (nibble1 & 1) diff += step >> 2;
-                        if (nibble1 & 2) diff += step >> 1;
-                        if (nibble1 & 4) diff += step;
-                        if (nibble1 & 8) diff  = -diff;
-                        predictor = ma_dr_wav_clamp(predictor + diff, -32768, 32767);
-                        pWav->ima.predictor[iChannel] = predictor;
-                        pWav->ima.stepIndex[iChannel] = ma_dr_wav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble1], 0, (ma_int32)ma_dr_wav_countof(stepTable)-1);
-                        pWav->ima.cachedFrames[(ma_dr_wav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+1)*pWav->channels + iChannel] = predictor;
-                    }
-                }
-            }
-        }
-    }
-    return totalFramesRead;
-}
-#ifndef MA_DR_WAV_NO_CONVERSION_API
-static unsigned short g_ma_dr_wavAlawTable[256] = {
-    0xEA80, 0xEB80, 0xE880, 0xE980, 0xEE80, 0xEF80, 0xEC80, 0xED80, 0xE280, 0xE380, 0xE080, 0xE180, 0xE680, 0xE780, 0xE480, 0xE580,
-    0xF540, 0xF5C0, 0xF440, 0xF4C0, 0xF740, 0xF7C0, 0xF640, 0xF6C0, 0xF140, 0xF1C0, 0xF040, 0xF0C0, 0xF340, 0xF3C0, 0xF240, 0xF2C0,
-    0xAA00, 0xAE00, 0xA200, 0xA600, 0xBA00, 0xBE00, 0xB200, 0xB600, 0x8A00, 0x8E00, 0x8200, 0x8600, 0x9A00, 0x9E00, 0x9200, 0x9600,
-    0xD500, 0xD700, 0xD100, 0xD300, 0xDD00, 0xDF00, 0xD900, 0xDB00, 0xC500, 0xC700, 0xC100, 0xC300, 0xCD00, 0xCF00, 0xC900, 0xCB00,
-    0xFEA8, 0xFEB8, 0xFE88, 0xFE98, 0xFEE8, 0xFEF8, 0xFEC8, 0xFED8, 0xFE28, 0xFE38, 0xFE08, 0xFE18, 0xFE68, 0xFE78, 0xFE48, 0xFE58,
-    0xFFA8, 0xFFB8, 0xFF88, 0xFF98, 0xFFE8, 0xFFF8, 0xFFC8, 0xFFD8, 0xFF28, 0xFF38, 0xFF08, 0xFF18, 0xFF68, 0xFF78, 0xFF48, 0xFF58,
-    0xFAA0, 0xFAE0, 0xFA20, 0xFA60, 0xFBA0, 0xFBE0, 0xFB20, 0xFB60, 0xF8A0, 0xF8E0, 0xF820, 0xF860, 0xF9A0, 0xF9E0, 0xF920, 0xF960,
-    0xFD50, 0xFD70, 0xFD10, 0xFD30, 0xFDD0, 0xFDF0, 0xFD90, 0xFDB0, 0xFC50, 0xFC70, 0xFC10, 0xFC30, 0xFCD0, 0xFCF0, 0xFC90, 0xFCB0,
-    0x1580, 0x1480, 0x1780, 0x1680, 0x1180, 0x1080, 0x1380, 0x1280, 0x1D80, 0x1C80, 0x1F80, 0x1E80, 0x1980, 0x1880, 0x1B80, 0x1A80,
-    0x0AC0, 0x0A40, 0x0BC0, 0x0B40, 0x08C0, 0x0840, 0x09C0, 0x0940, 0x0EC0, 0x0E40, 0x0FC0, 0x0F40, 0x0CC0, 0x0C40, 0x0DC0, 0x0D40,
-    0x5600, 0x5200, 0x5E00, 0x5A00, 0x4600, 0x4200, 0x4E00, 0x4A00, 0x7600, 0x7200, 0x7E00, 0x7A00, 0x6600, 0x6200, 0x6E00, 0x6A00,
-    0x2B00, 0x2900, 0x2F00, 0x2D00, 0x2300, 0x2100, 0x2700, 0x2500, 0x3B00, 0x3900, 0x3F00, 0x3D00, 0x3300, 0x3100, 0x3700, 0x3500,
-    0x0158, 0x0148, 0x0178, 0x0168, 0x0118, 0x0108, 0x0138, 0x0128, 0x01D8, 0x01C8, 0x01F8, 0x01E8, 0x0198, 0x0188, 0x01B8, 0x01A8,
-    0x0058, 0x0048, 0x0078, 0x0068, 0x0018, 0x0008, 0x0038, 0x0028, 0x00D8, 0x00C8, 0x00F8, 0x00E8, 0x0098, 0x0088, 0x00B8, 0x00A8,
-    0x0560, 0x0520, 0x05E0, 0x05A0, 0x0460, 0x0420, 0x04E0, 0x04A0, 0x0760, 0x0720, 0x07E0, 0x07A0, 0x0660, 0x0620, 0x06E0, 0x06A0,
-    0x02B0, 0x0290, 0x02F0, 0x02D0, 0x0230, 0x0210, 0x0270, 0x0250, 0x03B0, 0x0390, 0x03F0, 0x03D0, 0x0330, 0x0310, 0x0370, 0x0350
-};
-static unsigned short g_ma_dr_wavMulawTable[256] = {
-    0x8284, 0x8684, 0x8A84, 0x8E84, 0x9284, 0x9684, 0x9A84, 0x9E84, 0xA284, 0xA684, 0xAA84, 0xAE84, 0xB284, 0xB684, 0xBA84, 0xBE84,
-    0xC184, 0xC384, 0xC584, 0xC784, 0xC984, 0xCB84, 0xCD84, 0xCF84, 0xD184, 0xD384, 0xD584, 0xD784, 0xD984, 0xDB84, 0xDD84, 0xDF84,
-    0xE104, 0xE204, 0xE304, 0xE404, 0xE504, 0xE604, 0xE704, 0xE804, 0xE904, 0xEA04, 0xEB04, 0xEC04, 0xED04, 0xEE04, 0xEF04, 0xF004,
-    0xF0C4, 0xF144, 0xF1C4, 0xF244, 0xF2C4, 0xF344, 0xF3C4, 0xF444, 0xF4C4, 0xF544, 0xF5C4, 0xF644, 0xF6C4, 0xF744, 0xF7C4, 0xF844,
-    0xF8A4, 0xF8E4, 0xF924, 0xF964, 0xF9A4, 0xF9E4, 0xFA24, 0xFA64, 0xFAA4, 0xFAE4, 0xFB24, 0xFB64, 0xFBA4, 0xFBE4, 0xFC24, 0xFC64,
-    0xFC94, 0xFCB4, 0xFCD4, 0xFCF4, 0xFD14, 0xFD34, 0xFD54, 0xFD74, 0xFD94, 0xFDB4, 0xFDD4, 0xFDF4, 0xFE14, 0xFE34, 0xFE54, 0xFE74,
-    0xFE8C, 0xFE9C, 0xFEAC, 0xFEBC, 0xFECC, 0xFEDC, 0xFEEC, 0xFEFC, 0xFF0C, 0xFF1C, 0xFF2C, 0xFF3C, 0xFF4C, 0xFF5C, 0xFF6C, 0xFF7C,
-    0xFF88, 0xFF90, 0xFF98, 0xFFA0, 0xFFA8, 0xFFB0, 0xFFB8, 0xFFC0, 0xFFC8, 0xFFD0, 0xFFD8, 0xFFE0, 0xFFE8, 0xFFF0, 0xFFF8, 0x0000,
-    0x7D7C, 0x797C, 0x757C, 0x717C, 0x6D7C, 0x697C, 0x657C, 0x617C, 0x5D7C, 0x597C, 0x557C, 0x517C, 0x4D7C, 0x497C, 0x457C, 0x417C,
-    0x3E7C, 0x3C7C, 0x3A7C, 0x387C, 0x367C, 0x347C, 0x327C, 0x307C, 0x2E7C, 0x2C7C, 0x2A7C, 0x287C, 0x267C, 0x247C, 0x227C, 0x207C,
-    0x1EFC, 0x1DFC, 0x1CFC, 0x1BFC, 0x1AFC, 0x19FC, 0x18FC, 0x17FC, 0x16FC, 0x15FC, 0x14FC, 0x13FC, 0x12FC, 0x11FC, 0x10FC, 0x0FFC,
-    0x0F3C, 0x0EBC, 0x0E3C, 0x0DBC, 0x0D3C, 0x0CBC, 0x0C3C, 0x0BBC, 0x0B3C, 0x0ABC, 0x0A3C, 0x09BC, 0x093C, 0x08BC, 0x083C, 0x07BC,
-    0x075C, 0x071C, 0x06DC, 0x069C, 0x065C, 0x061C, 0x05DC, 0x059C, 0x055C, 0x051C, 0x04DC, 0x049C, 0x045C, 0x041C, 0x03DC, 0x039C,
-    0x036C, 0x034C, 0x032C, 0x030C, 0x02EC, 0x02CC, 0x02AC, 0x028C, 0x026C, 0x024C, 0x022C, 0x020C, 0x01EC, 0x01CC, 0x01AC, 0x018C,
-    0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104, 0x00F4, 0x00E4, 0x00D4, 0x00C4, 0x00B4, 0x00A4, 0x0094, 0x0084,
-    0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040, 0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000
-};
-static MA_INLINE ma_int16 ma_dr_wav__alaw_to_s16(ma_uint8 sampleIn)
-{
-    return (short)g_ma_dr_wavAlawTable[sampleIn];
-}
-static MA_INLINE ma_int16 ma_dr_wav__mulaw_to_s16(ma_uint8 sampleIn)
-{
-    return (short)g_ma_dr_wavMulawTable[sampleIn];
-}
-MA_PRIVATE void ma_dr_wav__pcm_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    size_t i;
-    if (bytesPerSample == 1) {
-        ma_dr_wav_u8_to_s16(pOut, pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 2) {
-        for (i = 0; i < totalSampleCount; ++i) {
-           *pOut++ = ((const ma_int16*)pIn)[i];
-        }
-        return;
-    }
-    if (bytesPerSample == 3) {
-        ma_dr_wav_s24_to_s16(pOut, pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 4) {
-        ma_dr_wav_s32_to_s16(pOut, (const ma_int32*)pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample > 8) {
-        MA_DR_WAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-    for (i = 0; i < totalSampleCount; ++i) {
-        ma_uint64 sample = 0;
-        unsigned int shift  = (8 - bytesPerSample) * 8;
-        unsigned int j;
-        for (j = 0; j < bytesPerSample; j += 1) {
-            MA_DR_WAV_ASSERT(j < 8);
-            sample |= (ma_uint64)(pIn[j]) << shift;
-            shift  += 8;
-        }
-        pIn += j;
-        *pOut++ = (ma_int16)((ma_int64)sample >> 48);
-    }
-}
-MA_PRIVATE void ma_dr_wav__ieee_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    if (bytesPerSample == 4) {
-        ma_dr_wav_f32_to_s16(pOut, (const float*)pIn, totalSampleCount);
-        return;
-    } else if (bytesPerSample == 8) {
-        ma_dr_wav_f64_to_s16(pOut, (const double*)pIn, totalSampleCount);
-        return;
-    } else {
-        MA_DR_WAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__pcm(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    if ((pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 16) || pBufferOut == NULL) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, pBufferOut);
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav__pcm_to_s16(pBufferOut, sampleData, (size_t)samplesRead, bytesPerSample);
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__ieee(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    if (pBufferOut == NULL) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav__ieee_to_s16(pBufferOut, sampleData, (size_t)samplesRead, bytesPerSample);
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__alaw(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    if (pBufferOut == NULL) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav_alaw_to_s16(pBufferOut, sampleData, (size_t)samplesRead);
-        #ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-        {
-            if (pWav->container == ma_dr_wav_container_aiff) {
-                ma_uint64 iSample;
-                for (iSample = 0; iSample < samplesRead; iSample += 1) {
-                    pBufferOut[iSample] = -pBufferOut[iSample];
-                }
-            }
-        }
-        #endif
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s16__mulaw(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    if (pBufferOut == NULL) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav_mulaw_to_s16(pBufferOut, sampleData, (size_t)samplesRead);
-        #ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-        {
-            if (pWav->container == ma_dr_wav_container_aiff) {
-                ma_uint64 iSample;
-                for (iSample = 0; iSample < samplesRead; iSample += 1) {
-                    pBufferOut[iSample] = -pBufferOut[iSample];
-                }
-            }
-        }
-        #endif
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s16(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-    if (pBufferOut == NULL) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-    if (framesToRead * pWav->channels * sizeof(ma_int16) > MA_SIZE_MAX) {
-        framesToRead = MA_SIZE_MAX / sizeof(ma_int16) / pWav->channels;
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_PCM) {
-        return ma_dr_wav_read_pcm_frames_s16__pcm(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_IEEE_FLOAT) {
-        return ma_dr_wav_read_pcm_frames_s16__ieee(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ALAW) {
-        return ma_dr_wav_read_pcm_frames_s16__alaw(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_MULAW) {
-        return ma_dr_wav_read_pcm_frames_s16__mulaw(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM) {
-        return ma_dr_wav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-        return ma_dr_wav_read_pcm_frames_s16__ima(pWav, framesToRead, pBufferOut);
-    }
-    return 0;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s16le(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && ma_dr_wav__is_little_endian() == MA_FALSE) {
-        ma_dr_wav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels);
-    }
-    return framesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s16be(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && ma_dr_wav__is_little_endian() == MA_TRUE) {
-        ma_dr_wav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels);
-    }
-    return framesRead;
-}
-MA_API void ma_dr_wav_u8_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        int x = pIn[i];
-        r = x << 8;
-        r = r - 32768;
-        pOut[i] = (short)r;
-    }
-}
-MA_API void ma_dr_wav_s24_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        int x = ((int)(((unsigned int)(((const ma_uint8*)pIn)[i*3+0]) << 8) | ((unsigned int)(((const ma_uint8*)pIn)[i*3+1]) << 16) | ((unsigned int)(((const ma_uint8*)pIn)[i*3+2])) << 24)) >> 8;
-        r = x >> 8;
-        pOut[i] = (short)r;
-    }
-}
-MA_API void ma_dr_wav_s32_to_s16(ma_int16* pOut, const ma_int32* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        int x = pIn[i];
-        r = x >> 16;
-        pOut[i] = (short)r;
-    }
-}
-MA_API void ma_dr_wav_f32_to_s16(ma_int16* pOut, const float* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        float x = pIn[i];
-        float c;
-        c = ((x < -1) ? -1 : ((x > 1) ? 1 : x));
-        c = c + 1;
-        r = (int)(c * 32767.5f);
-        r = r - 32768;
-        pOut[i] = (short)r;
-    }
-}
-MA_API void ma_dr_wav_f64_to_s16(ma_int16* pOut, const double* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        double x = pIn[i];
-        double c;
-        c = ((x < -1) ? -1 : ((x > 1) ? 1 : x));
-        c = c + 1;
-        r = (int)(c * 32767.5);
-        r = r - 32768;
-        pOut[i] = (short)r;
-    }
-}
-MA_API void ma_dr_wav_alaw_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        pOut[i] = ma_dr_wav__alaw_to_s16(pIn[i]);
-    }
-}
-MA_API void ma_dr_wav_mulaw_to_s16(ma_int16* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        pOut[i] = ma_dr_wav__mulaw_to_s16(pIn[i]);
-    }
-}
-MA_PRIVATE void ma_dr_wav__pcm_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample)
-{
-    unsigned int i;
-    if (bytesPerSample == 1) {
-        ma_dr_wav_u8_to_f32(pOut, pIn, sampleCount);
-        return;
-    }
-    if (bytesPerSample == 2) {
-        ma_dr_wav_s16_to_f32(pOut, (const ma_int16*)pIn, sampleCount);
-        return;
-    }
-    if (bytesPerSample == 3) {
-        ma_dr_wav_s24_to_f32(pOut, pIn, sampleCount);
-        return;
-    }
-    if (bytesPerSample == 4) {
-        ma_dr_wav_s32_to_f32(pOut, (const ma_int32*)pIn, sampleCount);
-        return;
-    }
-    if (bytesPerSample > 8) {
-        MA_DR_WAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut));
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        ma_uint64 sample = 0;
-        unsigned int shift  = (8 - bytesPerSample) * 8;
-        unsigned int j;
-        for (j = 0; j < bytesPerSample; j += 1) {
-            MA_DR_WAV_ASSERT(j < 8);
-            sample |= (ma_uint64)(pIn[j]) << shift;
-            shift  += 8;
-        }
-        pIn += j;
-        *pOut++ = (float)((ma_int64)sample / 9223372036854775807.0);
-    }
-}
-MA_PRIVATE void ma_dr_wav__ieee_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample)
-{
-    if (bytesPerSample == 4) {
-        unsigned int i;
-        for (i = 0; i < sampleCount; ++i) {
-            *pOut++ = ((const float*)pIn)[i];
-        }
-        return;
-    } else if (bytesPerSample == 8) {
-        ma_dr_wav_f64_to_f32(pOut, (const double*)pIn, sampleCount);
-        return;
-    } else {
-        MA_DR_WAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut));
-        return;
-    }
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_f32__pcm(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav__pcm_to_f32(pBufferOut, sampleData, (size_t)samplesRead, bytesPerSample);
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_f32__msadpcm_ima(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_int16 samples16[2048];
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, ma_dr_wav_countof(samples16)/pWav->channels);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_s16(pWav, framesToReadThisIteration, samples16);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        ma_dr_wav_s16_to_f32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels));
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_f32__ieee(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_IEEE_FLOAT && pWav->bitsPerSample == 32) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, pBufferOut);
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav__ieee_to_f32(pBufferOut, sampleData, (size_t)samplesRead, bytesPerSample);
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_f32__alaw(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav_alaw_to_f32(pBufferOut, sampleData, (size_t)samplesRead);
-        #ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-        {
-            if (pWav->container == ma_dr_wav_container_aiff) {
-                ma_uint64 iSample;
-                for (iSample = 0; iSample < samplesRead; iSample += 1) {
-                    pBufferOut[iSample] = -pBufferOut[iSample];
-                }
-            }
-        }
-        #endif
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_f32__mulaw(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav_mulaw_to_f32(pBufferOut, sampleData, (size_t)samplesRead);
-        #ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-        {
-            if (pWav->container == ma_dr_wav_container_aiff) {
-                ma_uint64 iSample;
-                for (iSample = 0; iSample < samplesRead; iSample += 1) {
-                    pBufferOut[iSample] = -pBufferOut[iSample];
-                }
-            }
-        }
-        #endif
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_f32(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-    if (pBufferOut == NULL) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-    if (framesToRead * pWav->channels * sizeof(float) > MA_SIZE_MAX) {
-        framesToRead = MA_SIZE_MAX / sizeof(float) / pWav->channels;
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_PCM) {
-        return ma_dr_wav_read_pcm_frames_f32__pcm(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-        return ma_dr_wav_read_pcm_frames_f32__msadpcm_ima(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_IEEE_FLOAT) {
-        return ma_dr_wav_read_pcm_frames_f32__ieee(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ALAW) {
-        return ma_dr_wav_read_pcm_frames_f32__alaw(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_MULAW) {
-        return ma_dr_wav_read_pcm_frames_f32__mulaw(pWav, framesToRead, pBufferOut);
-    }
-    return 0;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_f32le(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && ma_dr_wav__is_little_endian() == MA_FALSE) {
-        ma_dr_wav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels);
-    }
-    return framesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_f32be(ma_dr_wav* pWav, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && ma_dr_wav__is_little_endian() == MA_TRUE) {
-        ma_dr_wav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels);
-    }
-    return framesRead;
-}
-MA_API void ma_dr_wav_u8_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-#ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (pIn[i] / 256.0f) * 2 - 1;
-    }
-#else
-    for (i = 0; i < sampleCount; ++i) {
-        float x = pIn[i];
-        x = x * 0.00784313725490196078f;
-        x = x - 1;
-        *pOut++ = x;
-    }
-#endif
-}
-MA_API void ma_dr_wav_s16_to_f32(float* pOut, const ma_int16* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = pIn[i] * 0.000030517578125f;
-    }
-}
-MA_API void ma_dr_wav_s24_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        double x;
-        ma_uint32 a = ((ma_uint32)(pIn[i*3+0]) <<  8);
-        ma_uint32 b = ((ma_uint32)(pIn[i*3+1]) << 16);
-        ma_uint32 c = ((ma_uint32)(pIn[i*3+2]) << 24);
-        x = (double)((ma_int32)(a | b | c) >> 8);
-        *pOut++ = (float)(x * 0.00000011920928955078125);
-    }
-}
-MA_API void ma_dr_wav_s32_to_f32(float* pOut, const ma_int32* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (float)(pIn[i] / 2147483648.0);
-    }
-}
-MA_API void ma_dr_wav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (float)pIn[i];
-    }
-}
-MA_API void ma_dr_wav_alaw_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = ma_dr_wav__alaw_to_s16(pIn[i]) / 32768.0f;
-    }
-}
-MA_API void ma_dr_wav_mulaw_to_f32(float* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = ma_dr_wav__mulaw_to_s16(pIn[i]) / 32768.0f;
-    }
-}
-MA_PRIVATE void ma_dr_wav__pcm_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    unsigned int i;
-    if (bytesPerSample == 1) {
-        ma_dr_wav_u8_to_s32(pOut, pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 2) {
-        ma_dr_wav_s16_to_s32(pOut, (const ma_int16*)pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 3) {
-        ma_dr_wav_s24_to_s32(pOut, pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 4) {
-        for (i = 0; i < totalSampleCount; ++i) {
-           *pOut++ = ((const ma_int32*)pIn)[i];
-        }
-        return;
-    }
-    if (bytesPerSample > 8) {
-        MA_DR_WAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-    for (i = 0; i < totalSampleCount; ++i) {
-        ma_uint64 sample = 0;
-        unsigned int shift  = (8 - bytesPerSample) * 8;
-        unsigned int j;
-        for (j = 0; j < bytesPerSample; j += 1) {
-            MA_DR_WAV_ASSERT(j < 8);
-            sample |= (ma_uint64)(pIn[j]) << shift;
-            shift  += 8;
-        }
-        pIn += j;
-        *pOut++ = (ma_int32)((ma_int64)sample >> 32);
-    }
-}
-MA_PRIVATE void ma_dr_wav__ieee_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    if (bytesPerSample == 4) {
-        ma_dr_wav_f32_to_s32(pOut, (const float*)pIn, totalSampleCount);
-        return;
-    } else if (bytesPerSample == 8) {
-        ma_dr_wav_f64_to_s32(pOut, (const double*)pIn, totalSampleCount);
-        return;
-    } else {
-        MA_DR_WAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s32__pcm(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 32) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, pBufferOut);
-    }
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav__pcm_to_s32(pBufferOut, sampleData, (size_t)samplesRead, bytesPerSample);
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s32__msadpcm_ima(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 totalFramesRead = 0;
-    ma_int16 samples16[2048];
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, ma_dr_wav_countof(samples16)/pWav->channels);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_s16(pWav, framesToReadThisIteration, samples16);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        ma_dr_wav_s16_to_s32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels));
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s32__ieee(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav__ieee_to_s32(pBufferOut, sampleData, (size_t)samplesRead, bytesPerSample);
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s32__alaw(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav_alaw_to_s32(pBufferOut, sampleData, (size_t)samplesRead);
-        #ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-        {
-            if (pWav->container == ma_dr_wav_container_aiff) {
-                ma_uint64 iSample;
-                for (iSample = 0; iSample < samplesRead; iSample += 1) {
-                    pBufferOut[iSample] = -pBufferOut[iSample];
-                }
-            }
-        }
-        #endif
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_PRIVATE ma_uint64 ma_dr_wav_read_pcm_frames_s32__mulaw(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 totalFramesRead;
-    ma_uint8 sampleData[4096] = {0};
-    ma_uint32 bytesPerFrame;
-    ma_uint32 bytesPerSample;
-    ma_uint64 samplesRead;
-    bytesPerFrame = ma_dr_wav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-    bytesPerSample = bytesPerFrame / pWav->channels;
-    if (bytesPerSample == 0 || (bytesPerFrame % pWav->channels) != 0) {
-        return 0;
-    }
-    totalFramesRead = 0;
-    while (framesToRead > 0) {
-        ma_uint64 framesToReadThisIteration = ma_dr_wav_min(framesToRead, sizeof(sampleData)/bytesPerFrame);
-        ma_uint64 framesRead = ma_dr_wav_read_pcm_frames(pWav, framesToReadThisIteration, sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-        MA_DR_WAV_ASSERT(framesRead <= framesToReadThisIteration);
-        samplesRead = framesRead * pWav->channels;
-        if ((samplesRead * bytesPerSample) > sizeof(sampleData)) {
-            MA_DR_WAV_ASSERT(MA_FALSE);
-            break;
-        }
-        ma_dr_wav_mulaw_to_s32(pBufferOut, sampleData, (size_t)samplesRead);
-        #ifdef MA_DR_WAV_LIBSNDFILE_COMPAT
-        {
-            if (pWav->container == ma_dr_wav_container_aiff) {
-                ma_uint64 iSample;
-                for (iSample = 0; iSample < samplesRead; iSample += 1) {
-                    pBufferOut[iSample] = -pBufferOut[iSample];
-                }
-            }
-        }
-        #endif
-        pBufferOut      += samplesRead;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-    return totalFramesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s32(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-    if (pBufferOut == NULL) {
-        return ma_dr_wav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-    if (framesToRead * pWav->channels * sizeof(ma_int32) > MA_SIZE_MAX) {
-        framesToRead = MA_SIZE_MAX / sizeof(ma_int32) / pWav->channels;
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_PCM) {
-        return ma_dr_wav_read_pcm_frames_s32__pcm(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_DVI_ADPCM) {
-        return ma_dr_wav_read_pcm_frames_s32__msadpcm_ima(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_IEEE_FLOAT) {
-        return ma_dr_wav_read_pcm_frames_s32__ieee(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_ALAW) {
-        return ma_dr_wav_read_pcm_frames_s32__alaw(pWav, framesToRead, pBufferOut);
-    }
-    if (pWav->translatedFormatTag == MA_DR_WAVE_FORMAT_MULAW) {
-        return ma_dr_wav_read_pcm_frames_s32__mulaw(pWav, framesToRead, pBufferOut);
-    }
-    return 0;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s32le(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && ma_dr_wav__is_little_endian() == MA_FALSE) {
-        ma_dr_wav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels);
-    }
-    return framesRead;
-}
-MA_API ma_uint64 ma_dr_wav_read_pcm_frames_s32be(ma_dr_wav* pWav, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 framesRead = ma_dr_wav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && ma_dr_wav__is_little_endian() == MA_TRUE) {
-        ma_dr_wav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels);
-    }
-    return framesRead;
-}
-MA_API void ma_dr_wav_u8_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = ((int)pIn[i] - 128) << 24;
-    }
-}
-MA_API void ma_dr_wav_s16_to_s32(ma_int32* pOut, const ma_int16* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = pIn[i] << 16;
-    }
-}
-MA_API void ma_dr_wav_s24_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        unsigned int s0 = pIn[i*3 + 0];
-        unsigned int s1 = pIn[i*3 + 1];
-        unsigned int s2 = pIn[i*3 + 2];
-        ma_int32 sample32 = (ma_int32)((s0 << 8) | (s1 << 16) | (s2 << 24));
-        *pOut++ = sample32;
-    }
-}
-MA_API void ma_dr_wav_f32_to_s32(ma_int32* pOut, const float* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (ma_int32)(2147483648.0 * pIn[i]);
-    }
-}
-MA_API void ma_dr_wav_f64_to_s32(ma_int32* pOut, const double* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (ma_int32)(2147483648.0 * pIn[i]);
-    }
-}
-MA_API void ma_dr_wav_alaw_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = ((ma_int32)ma_dr_wav__alaw_to_s16(pIn[i])) << 16;
-    }
-}
-MA_API void ma_dr_wav_mulaw_to_s32(ma_int32* pOut, const ma_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-    for (i= 0; i < sampleCount; ++i) {
-        *pOut++ = ((ma_int32)ma_dr_wav__mulaw_to_s16(pIn[i])) << 16;
-    }
-}
-MA_PRIVATE ma_int16* ma_dr_wav__read_pcm_frames_and_close_s16(ma_dr_wav* pWav, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalFrameCount)
-{
-    ma_uint64 sampleDataSize;
-    ma_int16* pSampleData;
-    ma_uint64 framesRead;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(ma_int16);
-    if (sampleDataSize > MA_SIZE_MAX) {
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    pSampleData = (ma_int16*)ma_dr_wav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks);
-    if (pSampleData == NULL) {
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    framesRead = ma_dr_wav_read_pcm_frames_s16(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData);
-    if (framesRead != pWav->totalPCMFrameCount) {
-        ma_dr_wav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks);
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    ma_dr_wav_uninit(pWav);
-    if (sampleRate) {
-        *sampleRate = pWav->sampleRate;
-    }
-    if (channels) {
-        *channels = pWav->channels;
-    }
-    if (totalFrameCount) {
-        *totalFrameCount = pWav->totalPCMFrameCount;
-    }
-    return pSampleData;
-}
-MA_PRIVATE float* ma_dr_wav__read_pcm_frames_and_close_f32(ma_dr_wav* pWav, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalFrameCount)
-{
-    ma_uint64 sampleDataSize;
-    float* pSampleData;
-    ma_uint64 framesRead;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(float);
-    if (sampleDataSize > MA_SIZE_MAX) {
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    pSampleData = (float*)ma_dr_wav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks);
-    if (pSampleData == NULL) {
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    framesRead = ma_dr_wav_read_pcm_frames_f32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData);
-    if (framesRead != pWav->totalPCMFrameCount) {
-        ma_dr_wav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks);
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    ma_dr_wav_uninit(pWav);
-    if (sampleRate) {
-        *sampleRate = pWav->sampleRate;
-    }
-    if (channels) {
-        *channels = pWav->channels;
-    }
-    if (totalFrameCount) {
-        *totalFrameCount = pWav->totalPCMFrameCount;
-    }
-    return pSampleData;
-}
-MA_PRIVATE ma_int32* ma_dr_wav__read_pcm_frames_and_close_s32(ma_dr_wav* pWav, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalFrameCount)
-{
-    ma_uint64 sampleDataSize;
-    ma_int32* pSampleData;
-    ma_uint64 framesRead;
-    MA_DR_WAV_ASSERT(pWav != NULL);
-    sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(ma_int32);
-    if (sampleDataSize > MA_SIZE_MAX) {
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    pSampleData = (ma_int32*)ma_dr_wav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks);
-    if (pSampleData == NULL) {
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    framesRead = ma_dr_wav_read_pcm_frames_s32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData);
-    if (framesRead != pWav->totalPCMFrameCount) {
-        ma_dr_wav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks);
-        ma_dr_wav_uninit(pWav);
-        return NULL;
-    }
-    ma_dr_wav_uninit(pWav);
-    if (sampleRate) {
-        *sampleRate = pWav->sampleRate;
-    }
-    if (channels) {
-        *channels = pWav->channels;
-    }
-    if (totalFrameCount) {
-        *totalFrameCount = pWav->totalPCMFrameCount;
-    }
-    return pSampleData;
-}
-MA_API ma_int16* ma_dr_wav_open_and_read_pcm_frames_s16(ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API float* ma_dr_wav_open_and_read_pcm_frames_f32(ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API ma_int32* ma_dr_wav_open_and_read_pcm_frames_s32(ma_dr_wav_read_proc onRead, ma_dr_wav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-#ifndef MA_DR_WAV_NO_STDIO
-MA_API ma_int16* ma_dr_wav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_file(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API float* ma_dr_wav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_file(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API ma_int32* ma_dr_wav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_file(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-#ifndef MA_DR_WAV_NO_WCHAR
-MA_API ma_int16* ma_dr_wav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_file_w(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API float* ma_dr_wav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_file_w(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API ma_int32* ma_dr_wav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_file_w(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-#endif
-#endif
-MA_API ma_int16* ma_dr_wav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API float* ma_dr_wav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-MA_API ma_int32* ma_dr_wav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_wav wav;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-    if (!ma_dr_wav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_wav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-#endif
-MA_API void ma_dr_wav_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        ma_dr_wav__free_from_callbacks(p, pAllocationCallbacks);
-    } else {
-        ma_dr_wav__free_default(p, NULL);
-    }
-}
-MA_API ma_uint16 ma_dr_wav_bytes_to_u16(const ma_uint8* data)
-{
-    return ((ma_uint16)data[0] << 0) | ((ma_uint16)data[1] << 8);
-}
-MA_API ma_int16 ma_dr_wav_bytes_to_s16(const ma_uint8* data)
-{
-    return (ma_int16)ma_dr_wav_bytes_to_u16(data);
-}
-MA_API ma_uint32 ma_dr_wav_bytes_to_u32(const ma_uint8* data)
-{
-    return ma_dr_wav_bytes_to_u32_le(data);
-}
-MA_API float ma_dr_wav_bytes_to_f32(const ma_uint8* data)
-{
-    union {
-        ma_uint32 u32;
-        float f32;
-    } value;
-    value.u32 = ma_dr_wav_bytes_to_u32(data);
-    return value.f32;
-}
-MA_API ma_int32 ma_dr_wav_bytes_to_s32(const ma_uint8* data)
-{
-    return (ma_int32)ma_dr_wav_bytes_to_u32(data);
-}
-MA_API ma_uint64 ma_dr_wav_bytes_to_u64(const ma_uint8* data)
-{
-    return
-        ((ma_uint64)data[0] <<  0) | ((ma_uint64)data[1] <<  8) | ((ma_uint64)data[2] << 16) | ((ma_uint64)data[3] << 24) |
-        ((ma_uint64)data[4] << 32) | ((ma_uint64)data[5] << 40) | ((ma_uint64)data[6] << 48) | ((ma_uint64)data[7] << 56);
-}
-MA_API ma_int64 ma_dr_wav_bytes_to_s64(const ma_uint8* data)
-{
-    return (ma_int64)ma_dr_wav_bytes_to_u64(data);
-}
-MA_API ma_bool32 ma_dr_wav_guid_equal(const ma_uint8 a[16], const ma_uint8 b[16])
-{
-    int i;
-    for (i = 0; i < 16; i += 1) {
-        if (a[i] != b[i]) {
-            return MA_FALSE;
-        }
-    }
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_wav_fourcc_equal(const ma_uint8* a, const char* b)
-{
-    return
-        a[0] == b[0] &&
-        a[1] == b[1] &&
-        a[2] == b[2] &&
-        a[3] == b[3];
-}
-#ifdef __MRC__
-#pragma options opt reset
-#endif
-#endif
-/* dr_wav_c end */
-#endif  /* MA_DR_WAV_IMPLEMENTATION */
-#endif  /* MA_NO_WAV */
-
-#if !defined(MA_NO_FLAC) && !defined(MA_NO_DECODING)
-#if !defined(MA_DR_FLAC_IMPLEMENTATION) && !defined(MA_DR_FLAC_IMPLEMENTATION) /* For backwards compatibility. Will be removed in version 0.11 for cleanliness. */
-/* dr_flac_c begin */
-#ifndef ma_dr_flac_c
-#define ma_dr_flac_c
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic push
-    #if __GNUC__ >= 7
-    #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
-    #endif
-#endif
-#ifdef __linux__
-    #ifndef _BSD_SOURCE
-        #define _BSD_SOURCE
-    #endif
-    #ifndef _DEFAULT_SOURCE
-        #define _DEFAULT_SOURCE
-    #endif
-    #ifndef __USE_BSD
-        #define __USE_BSD
-    #endif
-    #include <endian.h>
-#endif
-#include <stdlib.h>
-#include <string.h>
-#if !defined(MA_DR_FLAC_NO_SIMD)
-    #if defined(MA_X64) || defined(MA_X86)
-        #if defined(_MSC_VER) && !defined(__clang__)
-            #if _MSC_VER >= 1400 && !defined(MA_DR_FLAC_NO_SSE2)
-                #define MA_DR_FLAC_SUPPORT_SSE2
-            #endif
-            #if _MSC_VER >= 1600 && !defined(MA_DR_FLAC_NO_SSE41)
-                #define MA_DR_FLAC_SUPPORT_SSE41
-            #endif
-        #elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
-            #if defined(__SSE2__) && !defined(MA_DR_FLAC_NO_SSE2)
-                #define MA_DR_FLAC_SUPPORT_SSE2
-            #endif
-            #if defined(__SSE4_1__) && !defined(MA_DR_FLAC_NO_SSE41)
-                #define MA_DR_FLAC_SUPPORT_SSE41
-            #endif
-        #endif
-        #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
-            #if !defined(MA_DR_FLAC_SUPPORT_SSE2) && !defined(MA_DR_FLAC_NO_SSE2) && __has_include(<emmintrin.h>)
-                #define MA_DR_FLAC_SUPPORT_SSE2
-            #endif
-            #if !defined(MA_DR_FLAC_SUPPORT_SSE41) && !defined(MA_DR_FLAC_NO_SSE41) && __has_include(<smmintrin.h>)
-                #define MA_DR_FLAC_SUPPORT_SSE41
-            #endif
-        #endif
-        #if defined(MA_DR_FLAC_SUPPORT_SSE41)
-            #include <smmintrin.h>
-        #elif defined(MA_DR_FLAC_SUPPORT_SSE2)
-            #include <emmintrin.h>
-        #endif
-    #endif
-    #if defined(MA_ARM)
-        #if !defined(MA_DR_FLAC_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
-            #define MA_DR_FLAC_SUPPORT_NEON
-            #include <arm_neon.h>
-        #endif
-    #endif
-#endif
-#if !defined(MA_DR_FLAC_NO_SIMD) && (defined(MA_X86) || defined(MA_X64))
-    #if defined(_MSC_VER) && !defined(__clang__)
-        #if _MSC_VER >= 1400
-            #include <intrin.h>
-            static void ma_dr_flac__cpuid(int info[4], int fid)
-            {
-                __cpuid(info, fid);
-            }
-        #else
-            #define MA_DR_FLAC_NO_CPUID
-        #endif
-    #else
-        #if defined(__GNUC__) || defined(__clang__)
-            static void ma_dr_flac__cpuid(int info[4], int fid)
-            {
-                #if defined(MA_X86) && defined(__PIC__)
-                    __asm__ __volatile__ (
-                        "xchg{l} {%%}ebx, %k1;"
-                        "cpuid;"
-                        "xchg{l} {%%}ebx, %k1;"
-                        : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
-                    );
-                #else
-                    __asm__ __volatile__ (
-                        "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
-                    );
-                #endif
-            }
-        #else
-            #define MA_DR_FLAC_NO_CPUID
-        #endif
-    #endif
-#else
-    #define MA_DR_FLAC_NO_CPUID
-#endif
-static MA_INLINE ma_bool32 ma_dr_flac_has_sse2(void)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    #if (defined(MA_X64) || defined(MA_X86)) && !defined(MA_DR_FLAC_NO_SSE2)
-        #if defined(MA_X64)
-            return MA_TRUE;
-        #elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
-            return MA_TRUE;
-        #else
-            #if defined(MA_DR_FLAC_NO_CPUID)
-                return MA_FALSE;
-            #else
-                int info[4];
-                ma_dr_flac__cpuid(info, 1);
-                return (info[3] & (1 << 26)) != 0;
-            #endif
-        #endif
-    #else
-        return MA_FALSE;
-    #endif
-#else
-    return MA_FALSE;
-#endif
-}
-static MA_INLINE ma_bool32 ma_dr_flac_has_sse41(void)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE41)
-    #if (defined(MA_X64) || defined(MA_X86)) && !defined(MA_DR_FLAC_NO_SSE41)
-        #if defined(__SSE4_1__) || defined(__AVX__)
-            return MA_TRUE;
-        #else
-            #if defined(MA_DR_FLAC_NO_CPUID)
-                return MA_FALSE;
-            #else
-                int info[4];
-                ma_dr_flac__cpuid(info, 1);
-                return (info[2] & (1 << 19)) != 0;
-            #endif
-        #endif
-    #else
-        return MA_FALSE;
-    #endif
-#else
-    return MA_FALSE;
-#endif
-}
-#if defined(_MSC_VER) && _MSC_VER >= 1500 && (defined(MA_X86) || defined(MA_X64)) && !defined(__clang__)
-    #define MA_DR_FLAC_HAS_LZCNT_INTRINSIC
-#elif (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
-    #define MA_DR_FLAC_HAS_LZCNT_INTRINSIC
-#elif defined(__clang__)
-    #if defined(__has_builtin)
-        #if __has_builtin(__builtin_clzll) || __has_builtin(__builtin_clzl)
-            #define MA_DR_FLAC_HAS_LZCNT_INTRINSIC
-        #endif
-    #endif
-#endif
-#if defined(_MSC_VER) && _MSC_VER >= 1400 && !defined(__clang__)
-    #define MA_DR_FLAC_HAS_BYTESWAP16_INTRINSIC
-    #define MA_DR_FLAC_HAS_BYTESWAP32_INTRINSIC
-    #define MA_DR_FLAC_HAS_BYTESWAP64_INTRINSIC
-#elif defined(__clang__)
-    #if defined(__has_builtin)
-        #if __has_builtin(__builtin_bswap16)
-            #define MA_DR_FLAC_HAS_BYTESWAP16_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap32)
-            #define MA_DR_FLAC_HAS_BYTESWAP32_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap64)
-            #define MA_DR_FLAC_HAS_BYTESWAP64_INTRINSIC
-        #endif
-    #endif
-#elif defined(__GNUC__)
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-        #define MA_DR_FLAC_HAS_BYTESWAP32_INTRINSIC
-        #define MA_DR_FLAC_HAS_BYTESWAP64_INTRINSIC
-    #endif
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
-        #define MA_DR_FLAC_HAS_BYTESWAP16_INTRINSIC
-    #endif
-#elif defined(__WATCOMC__) && defined(__386__)
-    #define MA_DR_FLAC_HAS_BYTESWAP16_INTRINSIC
-    #define MA_DR_FLAC_HAS_BYTESWAP32_INTRINSIC
-    #define MA_DR_FLAC_HAS_BYTESWAP64_INTRINSIC
-    extern __inline ma_uint16 _watcom_bswap16(ma_uint16);
-    extern __inline ma_uint32 _watcom_bswap32(ma_uint32);
-    extern __inline ma_uint64 _watcom_bswap64(ma_uint64);
-#pragma aux _watcom_bswap16 = \
-    "xchg al, ah" \
-    parm  [ax]    \
-    value [ax]    \
-    modify nomemory;
-#pragma aux _watcom_bswap32 = \
-    "bswap eax" \
-    parm  [eax] \
-    value [eax] \
-    modify nomemory;
-#pragma aux _watcom_bswap64 = \
-    "bswap eax"     \
-    "bswap edx"     \
-    "xchg eax,edx"  \
-    parm [eax edx]  \
-    value [eax edx] \
-    modify nomemory;
-#endif
-#ifndef MA_DR_FLAC_ASSERT
-#include <assert.h>
-#define MA_DR_FLAC_ASSERT(expression)           assert(expression)
-#endif
-#ifndef MA_DR_FLAC_MALLOC
-#define MA_DR_FLAC_MALLOC(sz)                   malloc((sz))
-#endif
-#ifndef MA_DR_FLAC_REALLOC
-#define MA_DR_FLAC_REALLOC(p, sz)               realloc((p), (sz))
-#endif
-#ifndef MA_DR_FLAC_FREE
-#define MA_DR_FLAC_FREE(p)                      free((p))
-#endif
-#ifndef MA_DR_FLAC_COPY_MEMORY
-#define MA_DR_FLAC_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
-#endif
-#ifndef MA_DR_FLAC_ZERO_MEMORY
-#define MA_DR_FLAC_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
-#endif
-#ifndef MA_DR_FLAC_ZERO_OBJECT
-#define MA_DR_FLAC_ZERO_OBJECT(p)               MA_DR_FLAC_ZERO_MEMORY((p), sizeof(*(p)))
-#endif
-#define MA_DR_FLAC_MAX_SIMD_VECTOR_SIZE                     64
-#define MA_DR_FLAC_SUBFRAME_CONSTANT                        0
-#define MA_DR_FLAC_SUBFRAME_VERBATIM                        1
-#define MA_DR_FLAC_SUBFRAME_FIXED                           8
-#define MA_DR_FLAC_SUBFRAME_LPC                             32
-#define MA_DR_FLAC_SUBFRAME_RESERVED                        255
-#define MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
-#define MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1
-#define MA_DR_FLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
-#define MA_DR_FLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
-#define MA_DR_FLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
-#define MA_DR_FLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
-#define MA_DR_FLAC_SEEKPOINT_SIZE_IN_BYTES                  18
-#define MA_DR_FLAC_CUESHEET_TRACK_SIZE_IN_BYTES             36
-#define MA_DR_FLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES       12
-#define ma_dr_flac_align(x, a)                              ((((x) + (a) - 1) / (a)) * (a))
-MA_API void ma_dr_flac_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision)
-{
-    if (pMajor) {
-        *pMajor = MA_DR_FLAC_VERSION_MAJOR;
-    }
-    if (pMinor) {
-        *pMinor = MA_DR_FLAC_VERSION_MINOR;
-    }
-    if (pRevision) {
-        *pRevision = MA_DR_FLAC_VERSION_REVISION;
-    }
-}
-MA_API const char* ma_dr_flac_version_string(void)
-{
-    return MA_DR_FLAC_VERSION_STRING;
-}
-#if defined(__has_feature)
-    #if __has_feature(thread_sanitizer)
-        #define MA_DR_FLAC_NO_THREAD_SANITIZE __attribute__((no_sanitize("thread")))
-    #else
-        #define MA_DR_FLAC_NO_THREAD_SANITIZE
-    #endif
-#else
-    #define MA_DR_FLAC_NO_THREAD_SANITIZE
-#endif
-#if defined(MA_DR_FLAC_HAS_LZCNT_INTRINSIC)
-static ma_bool32 ma_dr_flac__gIsLZCNTSupported = MA_FALSE;
-#endif
-#ifndef MA_DR_FLAC_NO_CPUID
-static ma_bool32 ma_dr_flac__gIsSSE2Supported  = MA_FALSE;
-static ma_bool32 ma_dr_flac__gIsSSE41Supported = MA_FALSE;
-MA_DR_FLAC_NO_THREAD_SANITIZE static void ma_dr_flac__init_cpu_caps(void)
-{
-    static ma_bool32 isCPUCapsInitialized = MA_FALSE;
-    if (!isCPUCapsInitialized) {
-#if defined(MA_DR_FLAC_HAS_LZCNT_INTRINSIC)
-        int info[4] = {0};
-        ma_dr_flac__cpuid(info, 0x80000001);
-        ma_dr_flac__gIsLZCNTSupported = (info[2] & (1 << 5)) != 0;
-#endif
-        ma_dr_flac__gIsSSE2Supported = ma_dr_flac_has_sse2();
-        ma_dr_flac__gIsSSE41Supported = ma_dr_flac_has_sse41();
-        isCPUCapsInitialized = MA_TRUE;
-    }
-}
-#else
-static ma_bool32 ma_dr_flac__gIsNEONSupported  = MA_FALSE;
-static MA_INLINE ma_bool32 ma_dr_flac__has_neon(void)
-{
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-    #if defined(MA_ARM) && !defined(MA_DR_FLAC_NO_NEON)
-        #if (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
-            return MA_TRUE;
-        #else
-            return MA_FALSE;
-        #endif
-    #else
-        return MA_FALSE;
-    #endif
-#else
-    return MA_FALSE;
-#endif
-}
-MA_DR_FLAC_NO_THREAD_SANITIZE static void ma_dr_flac__init_cpu_caps(void)
-{
-    ma_dr_flac__gIsNEONSupported = ma_dr_flac__has_neon();
-#if defined(MA_DR_FLAC_HAS_LZCNT_INTRINSIC) && defined(MA_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
-    ma_dr_flac__gIsLZCNTSupported = MA_TRUE;
-#endif
-}
-#endif
-static MA_INLINE ma_bool32 ma_dr_flac__is_little_endian(void)
-{
-#if defined(MA_X86) || defined(MA_X64)
-    return MA_TRUE;
-#elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
-    return MA_TRUE;
-#else
-    int n = 1;
-    return (*(char*)&n) == 1;
-#endif
-}
-static MA_INLINE ma_uint16 ma_dr_flac__swap_endian_uint16(ma_uint16 n)
-{
-#ifdef MA_DR_FLAC_HAS_BYTESWAP16_INTRINSIC
-    #if defined(_MSC_VER) && !defined(__clang__)
-        return _byteswap_ushort(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        return __builtin_bswap16(n);
-    #elif defined(__WATCOMC__) && defined(__386__)
-        return _watcom_bswap16(n);
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & 0xFF00) >> 8) |
-           ((n & 0x00FF) << 8);
-#endif
-}
-static MA_INLINE ma_uint32 ma_dr_flac__swap_endian_uint32(ma_uint32 n)
-{
-#ifdef MA_DR_FLAC_HAS_BYTESWAP32_INTRINSIC
-    #if defined(_MSC_VER) && !defined(__clang__)
-        return _byteswap_ulong(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        #if defined(MA_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(__ARM_ARCH_6M__) && !defined(MA_64BIT)
-            ma_uint32 r;
-            __asm__ __volatile__ (
-            #if defined(MA_64BIT)
-                "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)
-            #else
-                "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
-            #endif
-            );
-            return r;
-        #else
-            return __builtin_bswap32(n);
-        #endif
-    #elif defined(__WATCOMC__) && defined(__386__)
-        return _watcom_bswap32(n);
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & 0xFF000000) >> 24) |
-           ((n & 0x00FF0000) >>  8) |
-           ((n & 0x0000FF00) <<  8) |
-           ((n & 0x000000FF) << 24);
-#endif
-}
-static MA_INLINE ma_uint64 ma_dr_flac__swap_endian_uint64(ma_uint64 n)
-{
-#ifdef MA_DR_FLAC_HAS_BYTESWAP64_INTRINSIC
-    #if defined(_MSC_VER) && !defined(__clang__)
-        return _byteswap_uint64(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        return __builtin_bswap64(n);
-    #elif defined(__WATCOMC__) && defined(__386__)
-        return _watcom_bswap64(n);
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & ((ma_uint64)0xFF000000 << 32)) >> 56) |
-           ((n & ((ma_uint64)0x00FF0000 << 32)) >> 40) |
-           ((n & ((ma_uint64)0x0000FF00 << 32)) >> 24) |
-           ((n & ((ma_uint64)0x000000FF << 32)) >>  8) |
-           ((n & ((ma_uint64)0xFF000000      )) <<  8) |
-           ((n & ((ma_uint64)0x00FF0000      )) << 24) |
-           ((n & ((ma_uint64)0x0000FF00      )) << 40) |
-           ((n & ((ma_uint64)0x000000FF      )) << 56);
-#endif
-}
-static MA_INLINE ma_uint16 ma_dr_flac__be2host_16(ma_uint16 n)
-{
-    if (ma_dr_flac__is_little_endian()) {
-        return ma_dr_flac__swap_endian_uint16(n);
-    }
-    return n;
-}
-static MA_INLINE ma_uint32 ma_dr_flac__be2host_32(ma_uint32 n)
-{
-    if (ma_dr_flac__is_little_endian()) {
-        return ma_dr_flac__swap_endian_uint32(n);
-    }
-    return n;
-}
-static MA_INLINE ma_uint32 ma_dr_flac__be2host_32_ptr_unaligned(const void* pData)
-{
-    const ma_uint8* pNum = (ma_uint8*)pData;
-    return *(pNum) << 24 | *(pNum+1) << 16 | *(pNum+2) << 8 | *(pNum+3);
-}
-static MA_INLINE ma_uint64 ma_dr_flac__be2host_64(ma_uint64 n)
-{
-    if (ma_dr_flac__is_little_endian()) {
-        return ma_dr_flac__swap_endian_uint64(n);
-    }
-    return n;
-}
-static MA_INLINE ma_uint32 ma_dr_flac__le2host_32(ma_uint32 n)
-{
-    if (!ma_dr_flac__is_little_endian()) {
-        return ma_dr_flac__swap_endian_uint32(n);
-    }
-    return n;
-}
-static MA_INLINE ma_uint32 ma_dr_flac__le2host_32_ptr_unaligned(const void* pData)
-{
-    const ma_uint8* pNum = (ma_uint8*)pData;
-    return *pNum | *(pNum+1) << 8 |  *(pNum+2) << 16 | *(pNum+3) << 24;
-}
-static MA_INLINE ma_uint32 ma_dr_flac__unsynchsafe_32(ma_uint32 n)
-{
-    ma_uint32 result = 0;
-    result |= (n & 0x7F000000) >> 3;
-    result |= (n & 0x007F0000) >> 2;
-    result |= (n & 0x00007F00) >> 1;
-    result |= (n & 0x0000007F) >> 0;
-    return result;
-}
-static ma_uint8 ma_dr_flac__crc8_table[] = {
-    0x00, 0x07, 0x0E, 0x09, 0x1C, 0x1B, 0x12, 0x15, 0x38, 0x3F, 0x36, 0x31, 0x24, 0x23, 0x2A, 0x2D,
-    0x70, 0x77, 0x7E, 0x79, 0x6C, 0x6B, 0x62, 0x65, 0x48, 0x4F, 0x46, 0x41, 0x54, 0x53, 0x5A, 0x5D,
-    0xE0, 0xE7, 0xEE, 0xE9, 0xFC, 0xFB, 0xF2, 0xF5, 0xD8, 0xDF, 0xD6, 0xD1, 0xC4, 0xC3, 0xCA, 0xCD,
-    0x90, 0x97, 0x9E, 0x99, 0x8C, 0x8B, 0x82, 0x85, 0xA8, 0xAF, 0xA6, 0xA1, 0xB4, 0xB3, 0xBA, 0xBD,
-    0xC7, 0xC0, 0xC9, 0xCE, 0xDB, 0xDC, 0xD5, 0xD2, 0xFF, 0xF8, 0xF1, 0xF6, 0xE3, 0xE4, 0xED, 0xEA,
-    0xB7, 0xB0, 0xB9, 0xBE, 0xAB, 0xAC, 0xA5, 0xA2, 0x8F, 0x88, 0x81, 0x86, 0x93, 0x94, 0x9D, 0x9A,
-    0x27, 0x20, 0x29, 0x2E, 0x3B, 0x3C, 0x35, 0x32, 0x1F, 0x18, 0x11, 0x16, 0x03, 0x04, 0x0D, 0x0A,
-    0x57, 0x50, 0x59, 0x5E, 0x4B, 0x4C, 0x45, 0x42, 0x6F, 0x68, 0x61, 0x66, 0x73, 0x74, 0x7D, 0x7A,
-    0x89, 0x8E, 0x87, 0x80, 0x95, 0x92, 0x9B, 0x9C, 0xB1, 0xB6, 0xBF, 0xB8, 0xAD, 0xAA, 0xA3, 0xA4,
-    0xF9, 0xFE, 0xF7, 0xF0, 0xE5, 0xE2, 0xEB, 0xEC, 0xC1, 0xC6, 0xCF, 0xC8, 0xDD, 0xDA, 0xD3, 0xD4,
-    0x69, 0x6E, 0x67, 0x60, 0x75, 0x72, 0x7B, 0x7C, 0x51, 0x56, 0x5F, 0x58, 0x4D, 0x4A, 0x43, 0x44,
-    0x19, 0x1E, 0x17, 0x10, 0x05, 0x02, 0x0B, 0x0C, 0x21, 0x26, 0x2F, 0x28, 0x3D, 0x3A, 0x33, 0x34,
-    0x4E, 0x49, 0x40, 0x47, 0x52, 0x55, 0x5C, 0x5B, 0x76, 0x71, 0x78, 0x7F, 0x6A, 0x6D, 0x64, 0x63,
-    0x3E, 0x39, 0x30, 0x37, 0x22, 0x25, 0x2C, 0x2B, 0x06, 0x01, 0x08, 0x0F, 0x1A, 0x1D, 0x14, 0x13,
-    0xAE, 0xA9, 0xA0, 0xA7, 0xB2, 0xB5, 0xBC, 0xBB, 0x96, 0x91, 0x98, 0x9F, 0x8A, 0x8D, 0x84, 0x83,
-    0xDE, 0xD9, 0xD0, 0xD7, 0xC2, 0xC5, 0xCC, 0xCB, 0xE6, 0xE1, 0xE8, 0xEF, 0xFA, 0xFD, 0xF4, 0xF3
-};
-static ma_uint16 ma_dr_flac__crc16_table[] = {
-    0x0000, 0x8005, 0x800F, 0x000A, 0x801B, 0x001E, 0x0014, 0x8011,
-    0x8033, 0x0036, 0x003C, 0x8039, 0x0028, 0x802D, 0x8027, 0x0022,
-    0x8063, 0x0066, 0x006C, 0x8069, 0x0078, 0x807D, 0x8077, 0x0072,
-    0x0050, 0x8055, 0x805F, 0x005A, 0x804B, 0x004E, 0x0044, 0x8041,
-    0x80C3, 0x00C6, 0x00CC, 0x80C9, 0x00D8, 0x80DD, 0x80D7, 0x00D2,
-    0x00F0, 0x80F5, 0x80FF, 0x00FA, 0x80EB, 0x00EE, 0x00E4, 0x80E1,
-    0x00A0, 0x80A5, 0x80AF, 0x00AA, 0x80BB, 0x00BE, 0x00B4, 0x80B1,
-    0x8093, 0x0096, 0x009C, 0x8099, 0x0088, 0x808D, 0x8087, 0x0082,
-    0x8183, 0x0186, 0x018C, 0x8189, 0x0198, 0x819D, 0x8197, 0x0192,
-    0x01B0, 0x81B5, 0x81BF, 0x01BA, 0x81AB, 0x01AE, 0x01A4, 0x81A1,
-    0x01E0, 0x81E5, 0x81EF, 0x01EA, 0x81FB, 0x01FE, 0x01F4, 0x81F1,
-    0x81D3, 0x01D6, 0x01DC, 0x81D9, 0x01C8, 0x81CD, 0x81C7, 0x01C2,
-    0x0140, 0x8145, 0x814F, 0x014A, 0x815B, 0x015E, 0x0154, 0x8151,
-    0x8173, 0x0176, 0x017C, 0x8179, 0x0168, 0x816D, 0x8167, 0x0162,
-    0x8123, 0x0126, 0x012C, 0x8129, 0x0138, 0x813D, 0x8137, 0x0132,
-    0x0110, 0x8115, 0x811F, 0x011A, 0x810B, 0x010E, 0x0104, 0x8101,
-    0x8303, 0x0306, 0x030C, 0x8309, 0x0318, 0x831D, 0x8317, 0x0312,
-    0x0330, 0x8335, 0x833F, 0x033A, 0x832B, 0x032E, 0x0324, 0x8321,
-    0x0360, 0x8365, 0x836F, 0x036A, 0x837B, 0x037E, 0x0374, 0x8371,
-    0x8353, 0x0356, 0x035C, 0x8359, 0x0348, 0x834D, 0x8347, 0x0342,
-    0x03C0, 0x83C5, 0x83CF, 0x03CA, 0x83DB, 0x03DE, 0x03D4, 0x83D1,
-    0x83F3, 0x03F6, 0x03FC, 0x83F9, 0x03E8, 0x83ED, 0x83E7, 0x03E2,
-    0x83A3, 0x03A6, 0x03AC, 0x83A9, 0x03B8, 0x83BD, 0x83B7, 0x03B2,
-    0x0390, 0x8395, 0x839F, 0x039A, 0x838B, 0x038E, 0x0384, 0x8381,
-    0x0280, 0x8285, 0x828F, 0x028A, 0x829B, 0x029E, 0x0294, 0x8291,
-    0x82B3, 0x02B6, 0x02BC, 0x82B9, 0x02A8, 0x82AD, 0x82A7, 0x02A2,
-    0x82E3, 0x02E6, 0x02EC, 0x82E9, 0x02F8, 0x82FD, 0x82F7, 0x02F2,
-    0x02D0, 0x82D5, 0x82DF, 0x02DA, 0x82CB, 0x02CE, 0x02C4, 0x82C1,
-    0x8243, 0x0246, 0x024C, 0x8249, 0x0258, 0x825D, 0x8257, 0x0252,
-    0x0270, 0x8275, 0x827F, 0x027A, 0x826B, 0x026E, 0x0264, 0x8261,
-    0x0220, 0x8225, 0x822F, 0x022A, 0x823B, 0x023E, 0x0234, 0x8231,
-    0x8213, 0x0216, 0x021C, 0x8219, 0x0208, 0x820D, 0x8207, 0x0202
-};
-static MA_INLINE ma_uint8 ma_dr_flac_crc8_byte(ma_uint8 crc, ma_uint8 data)
-{
-    return ma_dr_flac__crc8_table[crc ^ data];
-}
-static MA_INLINE ma_uint8 ma_dr_flac_crc8(ma_uint8 crc, ma_uint32 data, ma_uint32 count)
-{
-#ifdef MA_DR_FLAC_NO_CRC
-    (void)crc;
-    (void)data;
-    (void)count;
-    return 0;
-#else
-#if 0
-    ma_uint8 p = 0x07;
-    for (int i = count-1; i >= 0; --i) {
-        ma_uint8 bit = (data & (1 << i)) >> i;
-        if (crc & 0x80) {
-            crc = ((crc << 1) | bit) ^ p;
-        } else {
-            crc = ((crc << 1) | bit);
-        }
-    }
-    return crc;
-#else
-    ma_uint32 wholeBytes;
-    ma_uint32 leftoverBits;
-    ma_uint64 leftoverDataMask;
-    static ma_uint64 leftoverDataMaskTable[8] = {
-        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
-    };
-    MA_DR_FLAC_ASSERT(count <= 32);
-    wholeBytes = count >> 3;
-    leftoverBits = count - (wholeBytes*8);
-    leftoverDataMask = leftoverDataMaskTable[leftoverBits];
-    switch (wholeBytes) {
-        case 4: crc = ma_dr_flac_crc8_byte(crc, (ma_uint8)((data & (0xFF000000UL << leftoverBits)) >> (24 + leftoverBits)));
-        case 3: crc = ma_dr_flac_crc8_byte(crc, (ma_uint8)((data & (0x00FF0000UL << leftoverBits)) >> (16 + leftoverBits)));
-        case 2: crc = ma_dr_flac_crc8_byte(crc, (ma_uint8)((data & (0x0000FF00UL << leftoverBits)) >> ( 8 + leftoverBits)));
-        case 1: crc = ma_dr_flac_crc8_byte(crc, (ma_uint8)((data & (0x000000FFUL << leftoverBits)) >> ( 0 + leftoverBits)));
-        case 0: if (leftoverBits > 0) crc = (ma_uint8)((crc << leftoverBits) ^ ma_dr_flac__crc8_table[(crc >> (8 - leftoverBits)) ^ (data & leftoverDataMask)]);
-    }
-    return crc;
-#endif
-#endif
-}
-static MA_INLINE ma_uint16 ma_dr_flac_crc16_byte(ma_uint16 crc, ma_uint8 data)
-{
-    return (crc << 8) ^ ma_dr_flac__crc16_table[(ma_uint8)(crc >> 8) ^ data];
-}
-static MA_INLINE ma_uint16 ma_dr_flac_crc16_cache(ma_uint16 crc, ma_dr_flac_cache_t data)
-{
-#ifdef MA_64BIT
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 56) & 0xFF));
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 48) & 0xFF));
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 40) & 0xFF));
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 32) & 0xFF));
-#endif
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 24) & 0xFF));
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 16) & 0xFF));
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >>  8) & 0xFF));
-    crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >>  0) & 0xFF));
-    return crc;
-}
-static MA_INLINE ma_uint16 ma_dr_flac_crc16_bytes(ma_uint16 crc, ma_dr_flac_cache_t data, ma_uint32 byteCount)
-{
-    switch (byteCount)
-    {
-#ifdef MA_64BIT
-    case 8: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 56) & 0xFF));
-    case 7: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 48) & 0xFF));
-    case 6: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 40) & 0xFF));
-    case 5: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 32) & 0xFF));
-#endif
-    case 4: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 24) & 0xFF));
-    case 3: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >> 16) & 0xFF));
-    case 2: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >>  8) & 0xFF));
-    case 1: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data >>  0) & 0xFF));
-    }
-    return crc;
-}
-#if 0
-static MA_INLINE ma_uint16 ma_dr_flac_crc16__32bit(ma_uint16 crc, ma_uint32 data, ma_uint32 count)
-{
-#ifdef MA_DR_FLAC_NO_CRC
-    (void)crc;
-    (void)data;
-    (void)count;
-    return 0;
-#else
-#if 0
-    ma_uint16 p = 0x8005;
-    for (int i = count-1; i >= 0; --i) {
-        ma_uint16 bit = (data & (1ULL << i)) >> i;
-        if (r & 0x8000) {
-            r = ((r << 1) | bit) ^ p;
-        } else {
-            r = ((r << 1) | bit);
-        }
-    }
-    return crc;
-#else
-    ma_uint32 wholeBytes;
-    ma_uint32 leftoverBits;
-    ma_uint64 leftoverDataMask;
-    static ma_uint64 leftoverDataMaskTable[8] = {
-        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
-    };
-    MA_DR_FLAC_ASSERT(count <= 64);
-    wholeBytes = count >> 3;
-    leftoverBits = count & 7;
-    leftoverDataMask = leftoverDataMaskTable[leftoverBits];
-    switch (wholeBytes) {
-        default:
-        case 4: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (0xFF000000UL << leftoverBits)) >> (24 + leftoverBits)));
-        case 3: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (0x00FF0000UL << leftoverBits)) >> (16 + leftoverBits)));
-        case 2: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (0x0000FF00UL << leftoverBits)) >> ( 8 + leftoverBits)));
-        case 1: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (0x000000FFUL << leftoverBits)) >> ( 0 + leftoverBits)));
-        case 0: if (leftoverBits > 0) crc = (crc << leftoverBits) ^ ma_dr_flac__crc16_table[(crc >> (16 - leftoverBits)) ^ (data & leftoverDataMask)];
-    }
-    return crc;
-#endif
-#endif
-}
-static MA_INLINE ma_uint16 ma_dr_flac_crc16__64bit(ma_uint16 crc, ma_uint64 data, ma_uint32 count)
-{
-#ifdef MA_DR_FLAC_NO_CRC
-    (void)crc;
-    (void)data;
-    (void)count;
-    return 0;
-#else
-    ma_uint32 wholeBytes;
-    ma_uint32 leftoverBits;
-    ma_uint64 leftoverDataMask;
-    static ma_uint64 leftoverDataMaskTable[8] = {
-        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
-    };
-    MA_DR_FLAC_ASSERT(count <= 64);
-    wholeBytes = count >> 3;
-    leftoverBits = count & 7;
-    leftoverDataMask = leftoverDataMaskTable[leftoverBits];
-    switch (wholeBytes) {
-        default:
-        case 8: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0xFF000000 << 32) << leftoverBits)) >> (56 + leftoverBits)));
-        case 7: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0x00FF0000 << 32) << leftoverBits)) >> (48 + leftoverBits)));
-        case 6: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0x0000FF00 << 32) << leftoverBits)) >> (40 + leftoverBits)));
-        case 5: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0x000000FF << 32) << leftoverBits)) >> (32 + leftoverBits)));
-        case 4: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0xFF000000      ) << leftoverBits)) >> (24 + leftoverBits)));
-        case 3: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0x00FF0000      ) << leftoverBits)) >> (16 + leftoverBits)));
-        case 2: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0x0000FF00      ) << leftoverBits)) >> ( 8 + leftoverBits)));
-        case 1: crc = ma_dr_flac_crc16_byte(crc, (ma_uint8)((data & (((ma_uint64)0x000000FF      ) << leftoverBits)) >> ( 0 + leftoverBits)));
-        case 0: if (leftoverBits > 0) crc = (crc << leftoverBits) ^ ma_dr_flac__crc16_table[(crc >> (16 - leftoverBits)) ^ (data & leftoverDataMask)];
-    }
-    return crc;
-#endif
-}
-static MA_INLINE ma_uint16 ma_dr_flac_crc16(ma_uint16 crc, ma_dr_flac_cache_t data, ma_uint32 count)
-{
-#ifdef MA_64BIT
-    return ma_dr_flac_crc16__64bit(crc, data, count);
-#else
-    return ma_dr_flac_crc16__32bit(crc, data, count);
-#endif
-}
-#endif
-#ifdef MA_64BIT
-#define ma_dr_flac__be2host__cache_line ma_dr_flac__be2host_64
-#else
-#define ma_dr_flac__be2host__cache_line ma_dr_flac__be2host_32
-#endif
-#define MA_DR_FLAC_CACHE_L1_SIZE_BYTES(bs)                      (sizeof((bs)->cache))
-#define MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs)                       (sizeof((bs)->cache)*8)
-#define MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)                  (MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs) - (bs)->consumedBits)
-#define MA_DR_FLAC_CACHE_L1_SELECTION_MASK(_bitCount)           (~((~(ma_dr_flac_cache_t)0) >> (_bitCount)))
-#define MA_DR_FLAC_CACHE_L1_SELECTION_SHIFT(bs, _bitCount)      (MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs) - (_bitCount))
-#define MA_DR_FLAC_CACHE_L1_SELECT(bs, _bitCount)               (((bs)->cache) & MA_DR_FLAC_CACHE_L1_SELECTION_MASK(_bitCount))
-#define MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT(bs, _bitCount)     (MA_DR_FLAC_CACHE_L1_SELECT((bs), (_bitCount)) >>  MA_DR_FLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)))
-#define MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, _bitCount)(MA_DR_FLAC_CACHE_L1_SELECT((bs), (_bitCount)) >> (MA_DR_FLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)) & (MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs)-1)))
-#define MA_DR_FLAC_CACHE_L2_SIZE_BYTES(bs)                      (sizeof((bs)->cacheL2))
-#define MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs)                      (MA_DR_FLAC_CACHE_L2_SIZE_BYTES(bs) / sizeof((bs)->cacheL2[0]))
-#define MA_DR_FLAC_CACHE_L2_LINES_REMAINING(bs)                 (MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs) - (bs)->nextL2Line)
-#ifndef MA_DR_FLAC_NO_CRC
-static MA_INLINE void ma_dr_flac__reset_crc16(ma_dr_flac_bs* bs)
-{
-    bs->crc16 = 0;
-    bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
-}
-static MA_INLINE void ma_dr_flac__update_crc16(ma_dr_flac_bs* bs)
-{
-    if (bs->crc16CacheIgnoredBytes == 0) {
-        bs->crc16 = ma_dr_flac_crc16_cache(bs->crc16, bs->crc16Cache);
-    } else {
-        bs->crc16 = ma_dr_flac_crc16_bytes(bs->crc16, bs->crc16Cache, MA_DR_FLAC_CACHE_L1_SIZE_BYTES(bs) - bs->crc16CacheIgnoredBytes);
-        bs->crc16CacheIgnoredBytes = 0;
-    }
-}
-static MA_INLINE ma_uint16 ma_dr_flac__flush_crc16(ma_dr_flac_bs* bs)
-{
-    MA_DR_FLAC_ASSERT((MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs) & 7) == 0);
-    if (MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs) == 0) {
-        ma_dr_flac__update_crc16(bs);
-    } else {
-        bs->crc16 = ma_dr_flac_crc16_bytes(bs->crc16, bs->crc16Cache >> MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs), (bs->consumedBits >> 3) - bs->crc16CacheIgnoredBytes);
-        bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
-    }
-    return bs->crc16;
-}
-#endif
-static MA_INLINE ma_bool32 ma_dr_flac__reload_l1_cache_from_l2(ma_dr_flac_bs* bs)
-{
-    size_t bytesRead;
-    size_t alignedL1LineCount;
-    if (bs->nextL2Line < MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs)) {
-        bs->cache = bs->cacheL2[bs->nextL2Line++];
-        return MA_TRUE;
-    }
-    if (bs->unalignedByteCount > 0) {
-        return MA_FALSE;
-    }
-    bytesRead = bs->onRead(bs->pUserData, bs->cacheL2, MA_DR_FLAC_CACHE_L2_SIZE_BYTES(bs));
-    bs->nextL2Line = 0;
-    if (bytesRead == MA_DR_FLAC_CACHE_L2_SIZE_BYTES(bs)) {
-        bs->cache = bs->cacheL2[bs->nextL2Line++];
-        return MA_TRUE;
-    }
-    alignedL1LineCount = bytesRead / MA_DR_FLAC_CACHE_L1_SIZE_BYTES(bs);
-    bs->unalignedByteCount = bytesRead - (alignedL1LineCount * MA_DR_FLAC_CACHE_L1_SIZE_BYTES(bs));
-    if (bs->unalignedByteCount > 0) {
-        bs->unalignedCache = bs->cacheL2[alignedL1LineCount];
-    }
-    if (alignedL1LineCount > 0) {
-        size_t offset = MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs) - alignedL1LineCount;
-        size_t i;
-        for (i = alignedL1LineCount; i > 0; --i) {
-            bs->cacheL2[i-1 + offset] = bs->cacheL2[i-1];
-        }
-        bs->nextL2Line = (ma_uint32)offset;
-        bs->cache = bs->cacheL2[bs->nextL2Line++];
-        return MA_TRUE;
-    } else {
-        bs->nextL2Line = MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs);
-        return MA_FALSE;
-    }
-}
-static ma_bool32 ma_dr_flac__reload_cache(ma_dr_flac_bs* bs)
-{
-    size_t bytesRead;
-#ifndef MA_DR_FLAC_NO_CRC
-    ma_dr_flac__update_crc16(bs);
-#endif
-    if (ma_dr_flac__reload_l1_cache_from_l2(bs)) {
-        bs->cache = ma_dr_flac__be2host__cache_line(bs->cache);
-        bs->consumedBits = 0;
-#ifndef MA_DR_FLAC_NO_CRC
-        bs->crc16Cache = bs->cache;
-#endif
-        return MA_TRUE;
-    }
-    bytesRead = bs->unalignedByteCount;
-    if (bytesRead == 0) {
-        bs->consumedBits = MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs);
-        return MA_FALSE;
-    }
-    MA_DR_FLAC_ASSERT(bytesRead < MA_DR_FLAC_CACHE_L1_SIZE_BYTES(bs));
-    bs->consumedBits = (ma_uint32)(MA_DR_FLAC_CACHE_L1_SIZE_BYTES(bs) - bytesRead) * 8;
-    bs->cache = ma_dr_flac__be2host__cache_line(bs->unalignedCache);
-    bs->cache &= MA_DR_FLAC_CACHE_L1_SELECTION_MASK(MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs));
-    bs->unalignedByteCount = 0;
-#ifndef MA_DR_FLAC_NO_CRC
-    bs->crc16Cache = bs->cache >> bs->consumedBits;
-    bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
-#endif
-    return MA_TRUE;
-}
-static void ma_dr_flac__reset_cache(ma_dr_flac_bs* bs)
-{
-    bs->nextL2Line   = MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs);
-    bs->consumedBits = MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs);
-    bs->cache = 0;
-    bs->unalignedByteCount = 0;
-    bs->unalignedCache = 0;
-#ifndef MA_DR_FLAC_NO_CRC
-    bs->crc16Cache = 0;
-    bs->crc16CacheIgnoredBytes = 0;
-#endif
-}
-static MA_INLINE ma_bool32 ma_dr_flac__read_uint32(ma_dr_flac_bs* bs, unsigned int bitCount, ma_uint32* pResultOut)
-{
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pResultOut != NULL);
-    MA_DR_FLAC_ASSERT(bitCount > 0);
-    MA_DR_FLAC_ASSERT(bitCount <= 32);
-    if (bs->consumedBits == MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs)) {
-        if (!ma_dr_flac__reload_cache(bs)) {
-            return MA_FALSE;
-        }
-    }
-    if (bitCount <= MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-#ifdef MA_64BIT
-        *pResultOut = (ma_uint32)MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
-        bs->consumedBits += bitCount;
-        bs->cache <<= bitCount;
-#else
-        if (bitCount < MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs)) {
-            *pResultOut = (ma_uint32)MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
-            bs->consumedBits += bitCount;
-            bs->cache <<= bitCount;
-        } else {
-            *pResultOut = (ma_uint32)bs->cache;
-            bs->consumedBits = MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs);
-            bs->cache = 0;
-        }
-#endif
-        return MA_TRUE;
-    } else {
-        ma_uint32 bitCountHi = MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs);
-        ma_uint32 bitCountLo = bitCount - bitCountHi;
-        ma_uint32 resultHi;
-        MA_DR_FLAC_ASSERT(bitCountHi > 0);
-        MA_DR_FLAC_ASSERT(bitCountHi < 32);
-        resultHi = (ma_uint32)MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountHi);
-        if (!ma_dr_flac__reload_cache(bs)) {
-            return MA_FALSE;
-        }
-        if (bitCountLo > MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-            return MA_FALSE;
-        }
-        *pResultOut = (resultHi << bitCountLo) | (ma_uint32)MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountLo);
-        bs->consumedBits += bitCountLo;
-        bs->cache <<= bitCountLo;
-        return MA_TRUE;
-    }
-}
-static ma_bool32 ma_dr_flac__read_int32(ma_dr_flac_bs* bs, unsigned int bitCount, ma_int32* pResult)
-{
-    ma_uint32 result;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pResult != NULL);
-    MA_DR_FLAC_ASSERT(bitCount > 0);
-    MA_DR_FLAC_ASSERT(bitCount <= 32);
-    if (!ma_dr_flac__read_uint32(bs, bitCount, &result)) {
-        return MA_FALSE;
-    }
-    if (bitCount < 32) {
-        ma_uint32 signbit;
-        signbit = ((result >> (bitCount-1)) & 0x01);
-        result |= (~signbit + 1) << bitCount;
-    }
-    *pResult = (ma_int32)result;
-    return MA_TRUE;
-}
-#ifdef MA_64BIT
-static ma_bool32 ma_dr_flac__read_uint64(ma_dr_flac_bs* bs, unsigned int bitCount, ma_uint64* pResultOut)
-{
-    ma_uint32 resultHi;
-    ma_uint32 resultLo;
-    MA_DR_FLAC_ASSERT(bitCount <= 64);
-    MA_DR_FLAC_ASSERT(bitCount >  32);
-    if (!ma_dr_flac__read_uint32(bs, bitCount - 32, &resultHi)) {
-        return MA_FALSE;
-    }
-    if (!ma_dr_flac__read_uint32(bs, 32, &resultLo)) {
-        return MA_FALSE;
-    }
-    *pResultOut = (((ma_uint64)resultHi) << 32) | ((ma_uint64)resultLo);
-    return MA_TRUE;
-}
-#endif
-#if 0
-static ma_bool32 ma_dr_flac__read_int64(ma_dr_flac_bs* bs, unsigned int bitCount, ma_int64* pResultOut)
-{
-    ma_uint64 result;
-    ma_uint64 signbit;
-    MA_DR_FLAC_ASSERT(bitCount <= 64);
-    if (!ma_dr_flac__read_uint64(bs, bitCount, &result)) {
-        return MA_FALSE;
-    }
-    signbit = ((result >> (bitCount-1)) & 0x01);
-    result |= (~signbit + 1) << bitCount;
-    *pResultOut = (ma_int64)result;
-    return MA_TRUE;
-}
-#endif
-static ma_bool32 ma_dr_flac__read_uint16(ma_dr_flac_bs* bs, unsigned int bitCount, ma_uint16* pResult)
-{
-    ma_uint32 result;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pResult != NULL);
-    MA_DR_FLAC_ASSERT(bitCount > 0);
-    MA_DR_FLAC_ASSERT(bitCount <= 16);
-    if (!ma_dr_flac__read_uint32(bs, bitCount, &result)) {
-        return MA_FALSE;
-    }
-    *pResult = (ma_uint16)result;
-    return MA_TRUE;
-}
-#if 0
-static ma_bool32 ma_dr_flac__read_int16(ma_dr_flac_bs* bs, unsigned int bitCount, ma_int16* pResult)
-{
-    ma_int32 result;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pResult != NULL);
-    MA_DR_FLAC_ASSERT(bitCount > 0);
-    MA_DR_FLAC_ASSERT(bitCount <= 16);
-    if (!ma_dr_flac__read_int32(bs, bitCount, &result)) {
-        return MA_FALSE;
-    }
-    *pResult = (ma_int16)result;
-    return MA_TRUE;
-}
-#endif
-static ma_bool32 ma_dr_flac__read_uint8(ma_dr_flac_bs* bs, unsigned int bitCount, ma_uint8* pResult)
-{
-    ma_uint32 result;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pResult != NULL);
-    MA_DR_FLAC_ASSERT(bitCount > 0);
-    MA_DR_FLAC_ASSERT(bitCount <= 8);
-    if (!ma_dr_flac__read_uint32(bs, bitCount, &result)) {
-        return MA_FALSE;
-    }
-    *pResult = (ma_uint8)result;
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__read_int8(ma_dr_flac_bs* bs, unsigned int bitCount, ma_int8* pResult)
-{
-    ma_int32 result;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pResult != NULL);
-    MA_DR_FLAC_ASSERT(bitCount > 0);
-    MA_DR_FLAC_ASSERT(bitCount <= 8);
-    if (!ma_dr_flac__read_int32(bs, bitCount, &result)) {
-        return MA_FALSE;
-    }
-    *pResult = (ma_int8)result;
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__seek_bits(ma_dr_flac_bs* bs, size_t bitsToSeek)
-{
-    if (bitsToSeek <= MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-        bs->consumedBits += (ma_uint32)bitsToSeek;
-        bs->cache <<= bitsToSeek;
-        return MA_TRUE;
-    } else {
-        bitsToSeek       -= MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs);
-        bs->consumedBits += MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs);
-        bs->cache         = 0;
-#ifdef MA_64BIT
-        while (bitsToSeek >= MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs)) {
-            ma_uint64 bin;
-            if (!ma_dr_flac__read_uint64(bs, MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
-                return MA_FALSE;
-            }
-            bitsToSeek -= MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs);
-        }
-#else
-        while (bitsToSeek >= MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs)) {
-            ma_uint32 bin;
-            if (!ma_dr_flac__read_uint32(bs, MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
-                return MA_FALSE;
-            }
-            bitsToSeek -= MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs);
-        }
-#endif
-        while (bitsToSeek >= 8) {
-            ma_uint8 bin;
-            if (!ma_dr_flac__read_uint8(bs, 8, &bin)) {
-                return MA_FALSE;
-            }
-            bitsToSeek -= 8;
-        }
-        if (bitsToSeek > 0) {
-            ma_uint8 bin;
-            if (!ma_dr_flac__read_uint8(bs, (ma_uint32)bitsToSeek, &bin)) {
-                return MA_FALSE;
-            }
-            bitsToSeek = 0;
-        }
-        MA_DR_FLAC_ASSERT(bitsToSeek == 0);
-        return MA_TRUE;
-    }
-}
-static ma_bool32 ma_dr_flac__find_and_seek_to_next_sync_code(ma_dr_flac_bs* bs)
-{
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    if (!ma_dr_flac__seek_bits(bs, MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
-        return MA_FALSE;
-    }
-    for (;;) {
-        ma_uint8 hi;
-#ifndef MA_DR_FLAC_NO_CRC
-        ma_dr_flac__reset_crc16(bs);
-#endif
-        if (!ma_dr_flac__read_uint8(bs, 8, &hi)) {
-            return MA_FALSE;
-        }
-        if (hi == 0xFF) {
-            ma_uint8 lo;
-            if (!ma_dr_flac__read_uint8(bs, 6, &lo)) {
-                return MA_FALSE;
-            }
-            if (lo == 0x3E) {
-                return MA_TRUE;
-            } else {
-                if (!ma_dr_flac__seek_bits(bs, MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
-                    return MA_FALSE;
-                }
-            }
-        }
-    }
-}
-#if defined(MA_DR_FLAC_HAS_LZCNT_INTRINSIC)
-#define MA_DR_FLAC_IMPLEMENT_CLZ_LZCNT
-#endif
-#if  defined(_MSC_VER) && _MSC_VER >= 1400 && (defined(MA_X64) || defined(MA_X86)) && !defined(__clang__)
-#define MA_DR_FLAC_IMPLEMENT_CLZ_MSVC
-#endif
-#if  defined(__WATCOMC__) && defined(__386__)
-#define MA_DR_FLAC_IMPLEMENT_CLZ_WATCOM
-#endif
-#ifdef __MRC__
-#include <intrinsics.h>
-#define MA_DR_FLAC_IMPLEMENT_CLZ_MRC
-#endif
-static MA_INLINE ma_uint32 ma_dr_flac__clz_software(ma_dr_flac_cache_t x)
-{
-    ma_uint32 n;
-    static ma_uint32 clz_table_4[] = {
-        0,
-        4,
-        3, 3,
-        2, 2, 2, 2,
-        1, 1, 1, 1, 1, 1, 1, 1
-    };
-    if (x == 0) {
-        return sizeof(x)*8;
-    }
-    n = clz_table_4[x >> (sizeof(x)*8 - 4)];
-    if (n == 0) {
-#ifdef MA_64BIT
-        if ((x & ((ma_uint64)0xFFFFFFFF << 32)) == 0) { n  = 32; x <<= 32; }
-        if ((x & ((ma_uint64)0xFFFF0000 << 32)) == 0) { n += 16; x <<= 16; }
-        if ((x & ((ma_uint64)0xFF000000 << 32)) == 0) { n += 8;  x <<= 8;  }
-        if ((x & ((ma_uint64)0xF0000000 << 32)) == 0) { n += 4;  x <<= 4;  }
-#else
-        if ((x & 0xFFFF0000) == 0) { n  = 16; x <<= 16; }
-        if ((x & 0xFF000000) == 0) { n += 8;  x <<= 8;  }
-        if ((x & 0xF0000000) == 0) { n += 4;  x <<= 4;  }
-#endif
-        n += clz_table_4[x >> (sizeof(x)*8 - 4)];
-    }
-    return n - 1;
-}
-#ifdef MA_DR_FLAC_IMPLEMENT_CLZ_LZCNT
-static MA_INLINE ma_bool32 ma_dr_flac__is_lzcnt_supported(void)
-{
-#if defined(MA_DR_FLAC_HAS_LZCNT_INTRINSIC) && defined(MA_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
-    return MA_TRUE;
-#elif defined(__MRC__)
-    return MA_TRUE;
-#else
-    #ifdef MA_DR_FLAC_HAS_LZCNT_INTRINSIC
-        return ma_dr_flac__gIsLZCNTSupported;
-    #else
-        return MA_FALSE;
-    #endif
-#endif
-}
-static MA_INLINE ma_uint32 ma_dr_flac__clz_lzcnt(ma_dr_flac_cache_t x)
-{
-#if defined(_MSC_VER)
-    #ifdef MA_64BIT
-        return (ma_uint32)__lzcnt64(x);
-    #else
-        return (ma_uint32)__lzcnt(x);
-    #endif
-#else
-    #if defined(__GNUC__) || defined(__clang__)
-        #if defined(MA_X64)
-            {
-                ma_uint64 r;
-                __asm__ __volatile__ (
-                    "lzcnt{ %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
-                );
-                return (ma_uint32)r;
-            }
-        #elif defined(MA_X86)
-            {
-                ma_uint32 r;
-                __asm__ __volatile__ (
-                    "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
-                );
-                return r;
-            }
-        #elif defined(MA_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5) && !defined(__ARM_ARCH_6M__) && !defined(MA_64BIT)
-            {
-                unsigned int r;
-                __asm__ __volatile__ (
-                #if defined(MA_64BIT)
-                    "clz %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(x)
-                #else
-                    "clz %[out], %[in]" : [out]"=r"(r) : [in]"r"(x)
-                #endif
-                );
-                return r;
-            }
-        #else
-            if (x == 0) {
-                return sizeof(x)*8;
-            }
-            #ifdef MA_64BIT
-                return (ma_uint32)__builtin_clzll((ma_uint64)x);
-            #else
-                return (ma_uint32)__builtin_clzl((ma_uint32)x);
-            #endif
-        #endif
-    #else
-        #error "This compiler does not support the lzcnt intrinsic."
-    #endif
-#endif
-}
-#endif
-#ifdef MA_DR_FLAC_IMPLEMENT_CLZ_MSVC
-#include <intrin.h>
-static MA_INLINE ma_uint32 ma_dr_flac__clz_msvc(ma_dr_flac_cache_t x)
-{
-    ma_uint32 n;
-    if (x == 0) {
-        return sizeof(x)*8;
-    }
-#ifdef MA_64BIT
-    _BitScanReverse64((unsigned long*)&n, x);
-#else
-    _BitScanReverse((unsigned long*)&n, x);
-#endif
-    return sizeof(x)*8 - n - 1;
-}
-#endif
-#ifdef MA_DR_FLAC_IMPLEMENT_CLZ_WATCOM
-static __inline ma_uint32 ma_dr_flac__clz_watcom (ma_uint32);
-#ifdef MA_DR_FLAC_IMPLEMENT_CLZ_WATCOM_LZCNT
-#pragma aux ma_dr_flac__clz_watcom_lzcnt = \
-    "db 0F3h, 0Fh, 0BDh, 0C0h"  \
-    parm [eax] \
-    value [eax] \
-    modify nomemory;
-#else
-#pragma aux ma_dr_flac__clz_watcom = \
-    "bsr eax, eax" \
-    "xor eax, 31" \
-    parm [eax] nomemory \
-    value [eax] \
-    modify exact [eax] nomemory;
-#endif
-#endif
-static MA_INLINE ma_uint32 ma_dr_flac__clz(ma_dr_flac_cache_t x)
-{
-#ifdef MA_DR_FLAC_IMPLEMENT_CLZ_LZCNT
-    if (ma_dr_flac__is_lzcnt_supported()) {
-        return ma_dr_flac__clz_lzcnt(x);
-    } else
-#endif
-    {
-#ifdef MA_DR_FLAC_IMPLEMENT_CLZ_MSVC
-        return ma_dr_flac__clz_msvc(x);
-#elif defined(MA_DR_FLAC_IMPLEMENT_CLZ_WATCOM_LZCNT)
-        return ma_dr_flac__clz_watcom_lzcnt(x);
-#elif defined(MA_DR_FLAC_IMPLEMENT_CLZ_WATCOM)
-        return (x == 0) ? sizeof(x)*8 : ma_dr_flac__clz_watcom(x);
-#elif defined(__MRC__)
-        return __cntlzw(x);
-#else
-        return ma_dr_flac__clz_software(x);
-#endif
-    }
-}
-static MA_INLINE ma_bool32 ma_dr_flac__seek_past_next_set_bit(ma_dr_flac_bs* bs, unsigned int* pOffsetOut)
-{
-    ma_uint32 zeroCounter = 0;
-    ma_uint32 setBitOffsetPlus1;
-    while (bs->cache == 0) {
-        zeroCounter += (ma_uint32)MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs);
-        if (!ma_dr_flac__reload_cache(bs)) {
-            return MA_FALSE;
-        }
-    }
-    if (bs->cache == 1) {
-        *pOffsetOut = zeroCounter + (ma_uint32)MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs) - 1;
-        if (!ma_dr_flac__reload_cache(bs)) {
-            return MA_FALSE;
-        }
-        return MA_TRUE;
-    }
-    setBitOffsetPlus1 = ma_dr_flac__clz(bs->cache);
-    setBitOffsetPlus1 += 1;
-    if (setBitOffsetPlus1 > MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-        return MA_FALSE;
-    }
-    bs->consumedBits += setBitOffsetPlus1;
-    bs->cache <<= setBitOffsetPlus1;
-    *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__seek_to_byte(ma_dr_flac_bs* bs, ma_uint64 offsetFromStart)
-{
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(offsetFromStart > 0);
-    if (offsetFromStart > 0x7FFFFFFF) {
-        ma_uint64 bytesRemaining = offsetFromStart;
-        if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, ma_dr_flac_seek_origin_start)) {
-            return MA_FALSE;
-        }
-        bytesRemaining -= 0x7FFFFFFF;
-        while (bytesRemaining > 0x7FFFFFFF) {
-            if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, ma_dr_flac_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            bytesRemaining -= 0x7FFFFFFF;
-        }
-        if (bytesRemaining > 0) {
-            if (!bs->onSeek(bs->pUserData, (int)bytesRemaining, ma_dr_flac_seek_origin_current)) {
-                return MA_FALSE;
-            }
-        }
-    } else {
-        if (!bs->onSeek(bs->pUserData, (int)offsetFromStart, ma_dr_flac_seek_origin_start)) {
-            return MA_FALSE;
-        }
-    }
-    ma_dr_flac__reset_cache(bs);
-    return MA_TRUE;
-}
-static ma_result ma_dr_flac__read_utf8_coded_number(ma_dr_flac_bs* bs, ma_uint64* pNumberOut, ma_uint8* pCRCOut)
-{
-    ma_uint8 crc;
-    ma_uint64 result;
-    ma_uint8 utf8[7] = {0};
-    int byteCount;
-    int i;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pNumberOut != NULL);
-    MA_DR_FLAC_ASSERT(pCRCOut != NULL);
-    crc = *pCRCOut;
-    if (!ma_dr_flac__read_uint8(bs, 8, utf8)) {
-        *pNumberOut = 0;
-        return MA_AT_END;
-    }
-    crc = ma_dr_flac_crc8(crc, utf8[0], 8);
-    if ((utf8[0] & 0x80) == 0) {
-        *pNumberOut = utf8[0];
-        *pCRCOut = crc;
-        return MA_SUCCESS;
-    }
-    if ((utf8[0] & 0xE0) == 0xC0) {
-        byteCount = 2;
-    } else if ((utf8[0] & 0xF0) == 0xE0) {
-        byteCount = 3;
-    } else if ((utf8[0] & 0xF8) == 0xF0) {
-        byteCount = 4;
-    } else if ((utf8[0] & 0xFC) == 0xF8) {
-        byteCount = 5;
-    } else if ((utf8[0] & 0xFE) == 0xFC) {
-        byteCount = 6;
-    } else if ((utf8[0] & 0xFF) == 0xFE) {
-        byteCount = 7;
-    } else {
-        *pNumberOut = 0;
-        return MA_CRC_MISMATCH;
-    }
-    MA_DR_FLAC_ASSERT(byteCount > 1);
-    result = (ma_uint64)(utf8[0] & (0xFF >> (byteCount + 1)));
-    for (i = 1; i < byteCount; ++i) {
-        if (!ma_dr_flac__read_uint8(bs, 8, utf8 + i)) {
-            *pNumberOut = 0;
-            return MA_AT_END;
-        }
-        crc = ma_dr_flac_crc8(crc, utf8[i], 8);
-        result = (result << 6) | (utf8[i] & 0x3F);
-    }
-    *pNumberOut = result;
-    *pCRCOut = crc;
-    return MA_SUCCESS;
-}
-static MA_INLINE ma_uint32 ma_dr_flac__ilog2_u32(ma_uint32 x)
-{
-#if 1
-    ma_uint32 result = 0;
-    while (x > 0) {
-        result += 1;
-        x >>= 1;
-    }
-    return result;
-#endif
-}
-static MA_INLINE ma_bool32 ma_dr_flac__use_64_bit_prediction(ma_uint32 bitsPerSample, ma_uint32 order, ma_uint32 precision)
-{
-    return bitsPerSample + precision + ma_dr_flac__ilog2_u32(order) > 32;
-}
-#if defined(__clang__)
-__attribute__((no_sanitize("signed-integer-overflow")))
-#endif
-static MA_INLINE ma_int32 ma_dr_flac__calculate_prediction_32(ma_uint32 order, ma_int32 shift, const ma_int32* coefficients, ma_int32* pDecodedSamples)
-{
-    ma_int32 prediction = 0;
-    MA_DR_FLAC_ASSERT(order <= 32);
-    switch (order)
-    {
-    case 32: prediction += coefficients[31] * pDecodedSamples[-32];
-    case 31: prediction += coefficients[30] * pDecodedSamples[-31];
-    case 30: prediction += coefficients[29] * pDecodedSamples[-30];
-    case 29: prediction += coefficients[28] * pDecodedSamples[-29];
-    case 28: prediction += coefficients[27] * pDecodedSamples[-28];
-    case 27: prediction += coefficients[26] * pDecodedSamples[-27];
-    case 26: prediction += coefficients[25] * pDecodedSamples[-26];
-    case 25: prediction += coefficients[24] * pDecodedSamples[-25];
-    case 24: prediction += coefficients[23] * pDecodedSamples[-24];
-    case 23: prediction += coefficients[22] * pDecodedSamples[-23];
-    case 22: prediction += coefficients[21] * pDecodedSamples[-22];
-    case 21: prediction += coefficients[20] * pDecodedSamples[-21];
-    case 20: prediction += coefficients[19] * pDecodedSamples[-20];
-    case 19: prediction += coefficients[18] * pDecodedSamples[-19];
-    case 18: prediction += coefficients[17] * pDecodedSamples[-18];
-    case 17: prediction += coefficients[16] * pDecodedSamples[-17];
-    case 16: prediction += coefficients[15] * pDecodedSamples[-16];
-    case 15: prediction += coefficients[14] * pDecodedSamples[-15];
-    case 14: prediction += coefficients[13] * pDecodedSamples[-14];
-    case 13: prediction += coefficients[12] * pDecodedSamples[-13];
-    case 12: prediction += coefficients[11] * pDecodedSamples[-12];
-    case 11: prediction += coefficients[10] * pDecodedSamples[-11];
-    case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
-    case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
-    case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
-    case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
-    case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
-    case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
-    case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
-    case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
-    case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
-    case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
-    }
-    return (ma_int32)(prediction >> shift);
-}
-static MA_INLINE ma_int32 ma_dr_flac__calculate_prediction_64(ma_uint32 order, ma_int32 shift, const ma_int32* coefficients, ma_int32* pDecodedSamples)
-{
-    ma_int64 prediction;
-    MA_DR_FLAC_ASSERT(order <= 32);
-#ifndef MA_64BIT
-    if (order == 8)
-    {
-        prediction  = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1] * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2] * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3] * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4] * (ma_int64)pDecodedSamples[-5];
-        prediction += coefficients[5] * (ma_int64)pDecodedSamples[-6];
-        prediction += coefficients[6] * (ma_int64)pDecodedSamples[-7];
-        prediction += coefficients[7] * (ma_int64)pDecodedSamples[-8];
-    }
-    else if (order == 7)
-    {
-        prediction  = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1] * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2] * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3] * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4] * (ma_int64)pDecodedSamples[-5];
-        prediction += coefficients[5] * (ma_int64)pDecodedSamples[-6];
-        prediction += coefficients[6] * (ma_int64)pDecodedSamples[-7];
-    }
-    else if (order == 3)
-    {
-        prediction  = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1] * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2] * (ma_int64)pDecodedSamples[-3];
-    }
-    else if (order == 6)
-    {
-        prediction  = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1] * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2] * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3] * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4] * (ma_int64)pDecodedSamples[-5];
-        prediction += coefficients[5] * (ma_int64)pDecodedSamples[-6];
-    }
-    else if (order == 5)
-    {
-        prediction  = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1] * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2] * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3] * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4] * (ma_int64)pDecodedSamples[-5];
-    }
-    else if (order == 4)
-    {
-        prediction  = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1] * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2] * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3] * (ma_int64)pDecodedSamples[-4];
-    }
-    else if (order == 12)
-    {
-        prediction  = coefficients[0]  * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1]  * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2]  * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3]  * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4]  * (ma_int64)pDecodedSamples[-5];
-        prediction += coefficients[5]  * (ma_int64)pDecodedSamples[-6];
-        prediction += coefficients[6]  * (ma_int64)pDecodedSamples[-7];
-        prediction += coefficients[7]  * (ma_int64)pDecodedSamples[-8];
-        prediction += coefficients[8]  * (ma_int64)pDecodedSamples[-9];
-        prediction += coefficients[9]  * (ma_int64)pDecodedSamples[-10];
-        prediction += coefficients[10] * (ma_int64)pDecodedSamples[-11];
-        prediction += coefficients[11] * (ma_int64)pDecodedSamples[-12];
-    }
-    else if (order == 2)
-    {
-        prediction  = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1] * (ma_int64)pDecodedSamples[-2];
-    }
-    else if (order == 1)
-    {
-        prediction = coefficients[0] * (ma_int64)pDecodedSamples[-1];
-    }
-    else if (order == 10)
-    {
-        prediction  = coefficients[0]  * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1]  * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2]  * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3]  * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4]  * (ma_int64)pDecodedSamples[-5];
-        prediction += coefficients[5]  * (ma_int64)pDecodedSamples[-6];
-        prediction += coefficients[6]  * (ma_int64)pDecodedSamples[-7];
-        prediction += coefficients[7]  * (ma_int64)pDecodedSamples[-8];
-        prediction += coefficients[8]  * (ma_int64)pDecodedSamples[-9];
-        prediction += coefficients[9]  * (ma_int64)pDecodedSamples[-10];
-    }
-    else if (order == 9)
-    {
-        prediction  = coefficients[0]  * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1]  * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2]  * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3]  * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4]  * (ma_int64)pDecodedSamples[-5];
-        prediction += coefficients[5]  * (ma_int64)pDecodedSamples[-6];
-        prediction += coefficients[6]  * (ma_int64)pDecodedSamples[-7];
-        prediction += coefficients[7]  * (ma_int64)pDecodedSamples[-8];
-        prediction += coefficients[8]  * (ma_int64)pDecodedSamples[-9];
-    }
-    else if (order == 11)
-    {
-        prediction  = coefficients[0]  * (ma_int64)pDecodedSamples[-1];
-        prediction += coefficients[1]  * (ma_int64)pDecodedSamples[-2];
-        prediction += coefficients[2]  * (ma_int64)pDecodedSamples[-3];
-        prediction += coefficients[3]  * (ma_int64)pDecodedSamples[-4];
-        prediction += coefficients[4]  * (ma_int64)pDecodedSamples[-5];
-        prediction += coefficients[5]  * (ma_int64)pDecodedSamples[-6];
-        prediction += coefficients[6]  * (ma_int64)pDecodedSamples[-7];
-        prediction += coefficients[7]  * (ma_int64)pDecodedSamples[-8];
-        prediction += coefficients[8]  * (ma_int64)pDecodedSamples[-9];
-        prediction += coefficients[9]  * (ma_int64)pDecodedSamples[-10];
-        prediction += coefficients[10] * (ma_int64)pDecodedSamples[-11];
-    }
-    else
-    {
-        int j;
-        prediction = 0;
-        for (j = 0; j < (int)order; ++j) {
-            prediction += coefficients[j] * (ma_int64)pDecodedSamples[-j-1];
-        }
-    }
-#endif
-#ifdef MA_64BIT
-    prediction = 0;
-    switch (order)
-    {
-    case 32: prediction += coefficients[31] * (ma_int64)pDecodedSamples[-32];
-    case 31: prediction += coefficients[30] * (ma_int64)pDecodedSamples[-31];
-    case 30: prediction += coefficients[29] * (ma_int64)pDecodedSamples[-30];
-    case 29: prediction += coefficients[28] * (ma_int64)pDecodedSamples[-29];
-    case 28: prediction += coefficients[27] * (ma_int64)pDecodedSamples[-28];
-    case 27: prediction += coefficients[26] * (ma_int64)pDecodedSamples[-27];
-    case 26: prediction += coefficients[25] * (ma_int64)pDecodedSamples[-26];
-    case 25: prediction += coefficients[24] * (ma_int64)pDecodedSamples[-25];
-    case 24: prediction += coefficients[23] * (ma_int64)pDecodedSamples[-24];
-    case 23: prediction += coefficients[22] * (ma_int64)pDecodedSamples[-23];
-    case 22: prediction += coefficients[21] * (ma_int64)pDecodedSamples[-22];
-    case 21: prediction += coefficients[20] * (ma_int64)pDecodedSamples[-21];
-    case 20: prediction += coefficients[19] * (ma_int64)pDecodedSamples[-20];
-    case 19: prediction += coefficients[18] * (ma_int64)pDecodedSamples[-19];
-    case 18: prediction += coefficients[17] * (ma_int64)pDecodedSamples[-18];
-    case 17: prediction += coefficients[16] * (ma_int64)pDecodedSamples[-17];
-    case 16: prediction += coefficients[15] * (ma_int64)pDecodedSamples[-16];
-    case 15: prediction += coefficients[14] * (ma_int64)pDecodedSamples[-15];
-    case 14: prediction += coefficients[13] * (ma_int64)pDecodedSamples[-14];
-    case 13: prediction += coefficients[12] * (ma_int64)pDecodedSamples[-13];
-    case 12: prediction += coefficients[11] * (ma_int64)pDecodedSamples[-12];
-    case 11: prediction += coefficients[10] * (ma_int64)pDecodedSamples[-11];
-    case 10: prediction += coefficients[ 9] * (ma_int64)pDecodedSamples[-10];
-    case  9: prediction += coefficients[ 8] * (ma_int64)pDecodedSamples[- 9];
-    case  8: prediction += coefficients[ 7] * (ma_int64)pDecodedSamples[- 8];
-    case  7: prediction += coefficients[ 6] * (ma_int64)pDecodedSamples[- 7];
-    case  6: prediction += coefficients[ 5] * (ma_int64)pDecodedSamples[- 6];
-    case  5: prediction += coefficients[ 4] * (ma_int64)pDecodedSamples[- 5];
-    case  4: prediction += coefficients[ 3] * (ma_int64)pDecodedSamples[- 4];
-    case  3: prediction += coefficients[ 2] * (ma_int64)pDecodedSamples[- 3];
-    case  2: prediction += coefficients[ 1] * (ma_int64)pDecodedSamples[- 2];
-    case  1: prediction += coefficients[ 0] * (ma_int64)pDecodedSamples[- 1];
-    }
-#endif
-    return (ma_int32)(prediction >> shift);
-}
-#if 0
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__reference(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 count, ma_uint8 riceParam, ma_uint32 lpcOrder, ma_int32 lpcShift, ma_uint32 lpcPrecision, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    ma_uint32 i;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pSamplesOut != NULL);
-    for (i = 0; i < count; ++i) {
-        ma_uint32 zeroCounter = 0;
-        for (;;) {
-            ma_uint8 bit;
-            if (!ma_dr_flac__read_uint8(bs, 1, &bit)) {
-                return MA_FALSE;
-            }
-            if (bit == 0) {
-                zeroCounter += 1;
-            } else {
-                break;
-            }
-        }
-        ma_uint32 decodedRice;
-        if (riceParam > 0) {
-            if (!ma_dr_flac__read_uint32(bs, riceParam, &decodedRice)) {
-                return MA_FALSE;
-            }
-        } else {
-            decodedRice = 0;
-        }
-        decodedRice |= (zeroCounter << riceParam);
-        if ((decodedRice & 0x01)) {
-            decodedRice = ~(decodedRice >> 1);
-        } else {
-            decodedRice =  (decodedRice >> 1);
-        }
-        if (ma_dr_flac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
-            pSamplesOut[i] = decodedRice + ma_dr_flac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
-        } else {
-            pSamplesOut[i] = decodedRice + ma_dr_flac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
-        }
-    }
-    return MA_TRUE;
-}
-#endif
-#if 0
-static ma_bool32 ma_dr_flac__read_rice_parts__reference(ma_dr_flac_bs* bs, ma_uint8 riceParam, ma_uint32* pZeroCounterOut, ma_uint32* pRiceParamPartOut)
-{
-    ma_uint32 zeroCounter = 0;
-    ma_uint32 decodedRice;
-    for (;;) {
-        ma_uint8 bit;
-        if (!ma_dr_flac__read_uint8(bs, 1, &bit)) {
-            return MA_FALSE;
-        }
-        if (bit == 0) {
-            zeroCounter += 1;
-        } else {
-            break;
-        }
-    }
-    if (riceParam > 0) {
-        if (!ma_dr_flac__read_uint32(bs, riceParam, &decodedRice)) {
-            return MA_FALSE;
-        }
-    } else {
-        decodedRice = 0;
-    }
-    *pZeroCounterOut = zeroCounter;
-    *pRiceParamPartOut = decodedRice;
-    return MA_TRUE;
-}
-#endif
-#if 0
-static MA_INLINE ma_bool32 ma_dr_flac__read_rice_parts(ma_dr_flac_bs* bs, ma_uint8 riceParam, ma_uint32* pZeroCounterOut, ma_uint32* pRiceParamPartOut)
-{
-    ma_dr_flac_cache_t riceParamMask;
-    ma_uint32 zeroCounter;
-    ma_uint32 setBitOffsetPlus1;
-    ma_uint32 riceParamPart;
-    ma_uint32 riceLength;
-    MA_DR_FLAC_ASSERT(riceParam > 0);
-    riceParamMask = MA_DR_FLAC_CACHE_L1_SELECTION_MASK(riceParam);
-    zeroCounter = 0;
-    while (bs->cache == 0) {
-        zeroCounter += (ma_uint32)MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs);
-        if (!ma_dr_flac__reload_cache(bs)) {
-            return MA_FALSE;
-        }
-    }
-    setBitOffsetPlus1 = ma_dr_flac__clz(bs->cache);
-    zeroCounter += setBitOffsetPlus1;
-    setBitOffsetPlus1 += 1;
-    riceLength = setBitOffsetPlus1 + riceParam;
-    if (riceLength < MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-        riceParamPart = (ma_uint32)((bs->cache & (riceParamMask >> setBitOffsetPlus1)) >> MA_DR_FLAC_CACHE_L1_SELECTION_SHIFT(bs, riceLength));
-        bs->consumedBits += riceLength;
-        bs->cache <<= riceLength;
-    } else {
-        ma_uint32 bitCountLo;
-        ma_dr_flac_cache_t resultHi;
-        bs->consumedBits += riceLength;
-        bs->cache <<= setBitOffsetPlus1 & (MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs)-1);
-        bitCountLo = bs->consumedBits - MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs);
-        resultHi = MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT(bs, riceParam);
-        if (bs->nextL2Line < MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs)) {
-#ifndef MA_DR_FLAC_NO_CRC
-            ma_dr_flac__update_crc16(bs);
-#endif
-            bs->cache = ma_dr_flac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
-            bs->consumedBits = 0;
-#ifndef MA_DR_FLAC_NO_CRC
-            bs->crc16Cache = bs->cache;
-#endif
-        } else {
-            if (!ma_dr_flac__reload_cache(bs)) {
-                return MA_FALSE;
-            }
-            if (bitCountLo > MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-                return MA_FALSE;
-            }
-        }
-        riceParamPart = (ma_uint32)(resultHi | MA_DR_FLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, bitCountLo));
-        bs->consumedBits += bitCountLo;
-        bs->cache <<= bitCountLo;
-    }
-    pZeroCounterOut[0] = zeroCounter;
-    pRiceParamPartOut[0] = riceParamPart;
-    return MA_TRUE;
-}
-#endif
-static MA_INLINE ma_bool32 ma_dr_flac__read_rice_parts_x1(ma_dr_flac_bs* bs, ma_uint8 riceParam, ma_uint32* pZeroCounterOut, ma_uint32* pRiceParamPartOut)
-{
-    ma_uint32  riceParamPlus1 = riceParam + 1;
-    ma_uint32  riceParamPlus1Shift = MA_DR_FLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPlus1);
-    ma_uint32  riceParamPlus1MaxConsumedBits = MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
-    ma_dr_flac_cache_t bs_cache = bs->cache;
-    ma_uint32  bs_consumedBits = bs->consumedBits;
-    ma_uint32  lzcount = ma_dr_flac__clz(bs_cache);
-    if (lzcount < sizeof(bs_cache)*8) {
-        pZeroCounterOut[0] = lzcount;
-    extract_rice_param_part:
-        bs_cache       <<= lzcount;
-        bs_consumedBits += lzcount;
-        if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
-            pRiceParamPartOut[0] = (ma_uint32)(bs_cache >> riceParamPlus1Shift);
-            bs_cache       <<= riceParamPlus1;
-            bs_consumedBits += riceParamPlus1;
-        } else {
-            ma_uint32 riceParamPartHi;
-            ma_uint32 riceParamPartLo;
-            ma_uint32 riceParamPartLoBitCount;
-            riceParamPartHi = (ma_uint32)(bs_cache >> riceParamPlus1Shift);
-            riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
-            MA_DR_FLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
-            if (bs->nextL2Line < MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs)) {
-            #ifndef MA_DR_FLAC_NO_CRC
-                ma_dr_flac__update_crc16(bs);
-            #endif
-                bs_cache = ma_dr_flac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
-                bs_consumedBits = riceParamPartLoBitCount;
-            #ifndef MA_DR_FLAC_NO_CRC
-                bs->crc16Cache = bs_cache;
-            #endif
-            } else {
-                if (!ma_dr_flac__reload_cache(bs)) {
-                    return MA_FALSE;
-                }
-                if (riceParamPartLoBitCount > MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-                    return MA_FALSE;
-                }
-                bs_cache = bs->cache;
-                bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
-            }
-            riceParamPartLo = (ma_uint32)(bs_cache >> (MA_DR_FLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPartLoBitCount)));
-            pRiceParamPartOut[0] = riceParamPartHi | riceParamPartLo;
-            bs_cache <<= riceParamPartLoBitCount;
-        }
-    } else {
-        ma_uint32 zeroCounter = (ma_uint32)(MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs) - bs_consumedBits);
-        for (;;) {
-            if (bs->nextL2Line < MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs)) {
-            #ifndef MA_DR_FLAC_NO_CRC
-                ma_dr_flac__update_crc16(bs);
-            #endif
-                bs_cache = ma_dr_flac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
-                bs_consumedBits = 0;
-            #ifndef MA_DR_FLAC_NO_CRC
-                bs->crc16Cache = bs_cache;
-            #endif
-            } else {
-                if (!ma_dr_flac__reload_cache(bs)) {
-                    return MA_FALSE;
-                }
-                bs_cache = bs->cache;
-                bs_consumedBits = bs->consumedBits;
-            }
-            lzcount = ma_dr_flac__clz(bs_cache);
-            zeroCounter += lzcount;
-            if (lzcount < sizeof(bs_cache)*8) {
-                break;
-            }
-        }
-        pZeroCounterOut[0] = zeroCounter;
-        goto extract_rice_param_part;
-    }
-    bs->cache = bs_cache;
-    bs->consumedBits = bs_consumedBits;
-    return MA_TRUE;
-}
-static MA_INLINE ma_bool32 ma_dr_flac__seek_rice_parts(ma_dr_flac_bs* bs, ma_uint8 riceParam)
-{
-    ma_uint32  riceParamPlus1 = riceParam + 1;
-    ma_uint32  riceParamPlus1MaxConsumedBits = MA_DR_FLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
-    ma_dr_flac_cache_t bs_cache = bs->cache;
-    ma_uint32  bs_consumedBits = bs->consumedBits;
-    ma_uint32  lzcount = ma_dr_flac__clz(bs_cache);
-    if (lzcount < sizeof(bs_cache)*8) {
-    extract_rice_param_part:
-        bs_cache       <<= lzcount;
-        bs_consumedBits += lzcount;
-        if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
-            bs_cache       <<= riceParamPlus1;
-            bs_consumedBits += riceParamPlus1;
-        } else {
-            ma_uint32 riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
-            MA_DR_FLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
-            if (bs->nextL2Line < MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs)) {
-            #ifndef MA_DR_FLAC_NO_CRC
-                ma_dr_flac__update_crc16(bs);
-            #endif
-                bs_cache = ma_dr_flac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
-                bs_consumedBits = riceParamPartLoBitCount;
-            #ifndef MA_DR_FLAC_NO_CRC
-                bs->crc16Cache = bs_cache;
-            #endif
-            } else {
-                if (!ma_dr_flac__reload_cache(bs)) {
-                    return MA_FALSE;
-                }
-                if (riceParamPartLoBitCount > MA_DR_FLAC_CACHE_L1_BITS_REMAINING(bs)) {
-                    return MA_FALSE;
-                }
-                bs_cache = bs->cache;
-                bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
-            }
-            bs_cache <<= riceParamPartLoBitCount;
-        }
-    } else {
-        for (;;) {
-            if (bs->nextL2Line < MA_DR_FLAC_CACHE_L2_LINE_COUNT(bs)) {
-            #ifndef MA_DR_FLAC_NO_CRC
-                ma_dr_flac__update_crc16(bs);
-            #endif
-                bs_cache = ma_dr_flac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
-                bs_consumedBits = 0;
-            #ifndef MA_DR_FLAC_NO_CRC
-                bs->crc16Cache = bs_cache;
-            #endif
-            } else {
-                if (!ma_dr_flac__reload_cache(bs)) {
-                    return MA_FALSE;
-                }
-                bs_cache = bs->cache;
-                bs_consumedBits = bs->consumedBits;
-            }
-            lzcount = ma_dr_flac__clz(bs_cache);
-            if (lzcount < sizeof(bs_cache)*8) {
-                break;
-            }
-        }
-        goto extract_rice_param_part;
-    }
-    bs->cache = bs_cache;
-    bs->consumedBits = bs_consumedBits;
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__scalar_zeroorder(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 count, ma_uint8 riceParam, ma_uint32 order, ma_int32 shift, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    ma_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
-    ma_uint32 zeroCountPart0;
-    ma_uint32 riceParamPart0;
-    ma_uint32 riceParamMask;
-    ma_uint32 i;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pSamplesOut != NULL);
-    (void)bitsPerSample;
-    (void)order;
-    (void)shift;
-    (void)coefficients;
-    riceParamMask  = (ma_uint32)~((~0UL) << riceParam);
-    i = 0;
-    while (i < count) {
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
-            return MA_FALSE;
-        }
-        riceParamPart0 &= riceParamMask;
-        riceParamPart0 |= (zeroCountPart0 << riceParam);
-        riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
-        pSamplesOut[i] = riceParamPart0;
-        i += 1;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__scalar(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 count, ma_uint8 riceParam, ma_uint32 lpcOrder, ma_int32 lpcShift, ma_uint32 lpcPrecision, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    ma_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
-    ma_uint32 zeroCountPart0 = 0;
-    ma_uint32 zeroCountPart1 = 0;
-    ma_uint32 zeroCountPart2 = 0;
-    ma_uint32 zeroCountPart3 = 0;
-    ma_uint32 riceParamPart0 = 0;
-    ma_uint32 riceParamPart1 = 0;
-    ma_uint32 riceParamPart2 = 0;
-    ma_uint32 riceParamPart3 = 0;
-    ma_uint32 riceParamMask;
-    const ma_int32* pSamplesOutEnd;
-    ma_uint32 i;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pSamplesOut != NULL);
-    if (lpcOrder == 0) {
-        return ma_dr_flac__decode_samples_with_residual__rice__scalar_zeroorder(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
-    }
-    riceParamMask  = (ma_uint32)~((~0UL) << riceParam);
-    pSamplesOutEnd = pSamplesOut + (count & ~3);
-    if (ma_dr_flac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
-        while (pSamplesOut < pSamplesOutEnd) {
-            if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
-                !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
-                !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
-                !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
-                return MA_FALSE;
-            }
-            riceParamPart0 &= riceParamMask;
-            riceParamPart1 &= riceParamMask;
-            riceParamPart2 &= riceParamMask;
-            riceParamPart3 &= riceParamMask;
-            riceParamPart0 |= (zeroCountPart0 << riceParam);
-            riceParamPart1 |= (zeroCountPart1 << riceParam);
-            riceParamPart2 |= (zeroCountPart2 << riceParam);
-            riceParamPart3 |= (zeroCountPart3 << riceParam);
-            riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
-            riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
-            riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
-            riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
-            pSamplesOut[0] = riceParamPart0 + ma_dr_flac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
-            pSamplesOut[1] = riceParamPart1 + ma_dr_flac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 1);
-            pSamplesOut[2] = riceParamPart2 + ma_dr_flac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 2);
-            pSamplesOut[3] = riceParamPart3 + ma_dr_flac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 3);
-            pSamplesOut += 4;
-        }
-    } else {
-        while (pSamplesOut < pSamplesOutEnd) {
-            if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
-                !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
-                !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
-                !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
-                return MA_FALSE;
-            }
-            riceParamPart0 &= riceParamMask;
-            riceParamPart1 &= riceParamMask;
-            riceParamPart2 &= riceParamMask;
-            riceParamPart3 &= riceParamMask;
-            riceParamPart0 |= (zeroCountPart0 << riceParam);
-            riceParamPart1 |= (zeroCountPart1 << riceParam);
-            riceParamPart2 |= (zeroCountPart2 << riceParam);
-            riceParamPart3 |= (zeroCountPart3 << riceParam);
-            riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
-            riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
-            riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
-            riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
-            pSamplesOut[0] = riceParamPart0 + ma_dr_flac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
-            pSamplesOut[1] = riceParamPart1 + ma_dr_flac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 1);
-            pSamplesOut[2] = riceParamPart2 + ma_dr_flac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 2);
-            pSamplesOut[3] = riceParamPart3 + ma_dr_flac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 3);
-            pSamplesOut += 4;
-        }
-    }
-    i = (count & ~3);
-    while (i < count) {
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
-            return MA_FALSE;
-        }
-        riceParamPart0 &= riceParamMask;
-        riceParamPart0 |= (zeroCountPart0 << riceParam);
-        riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
-        if (ma_dr_flac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
-            pSamplesOut[0] = riceParamPart0 + ma_dr_flac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
-        } else {
-            pSamplesOut[0] = riceParamPart0 + ma_dr_flac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + 0);
-        }
-        i += 1;
-        pSamplesOut += 1;
-    }
-    return MA_TRUE;
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE __m128i ma_dr_flac__mm_packs_interleaved_epi32(__m128i a, __m128i b)
-{
-    __m128i r;
-    r = _mm_packs_epi32(a, b);
-    r = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 1, 2, 0));
-    r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
-    r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
-    return r;
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_SSE41)
-static MA_INLINE __m128i ma_dr_flac__mm_not_si128(__m128i a)
-{
-    return _mm_xor_si128(a, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
-}
-static MA_INLINE __m128i ma_dr_flac__mm_hadd_epi32(__m128i x)
-{
-    __m128i x64 = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
-    __m128i x32 = _mm_shufflelo_epi16(x64, _MM_SHUFFLE(1, 0, 3, 2));
-    return _mm_add_epi32(x64, x32);
-}
-static MA_INLINE __m128i ma_dr_flac__mm_hadd_epi64(__m128i x)
-{
-    return _mm_add_epi64(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
-}
-static MA_INLINE __m128i ma_dr_flac__mm_srai_epi64(__m128i x, int count)
-{
-    __m128i lo = _mm_srli_epi64(x, count);
-    __m128i hi = _mm_srai_epi32(x, count);
-    hi = _mm_and_si128(hi, _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0));
-    return _mm_or_si128(lo, hi);
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__sse41_32(ma_dr_flac_bs* bs, ma_uint32 count, ma_uint8 riceParam, ma_uint32 order, ma_int32 shift, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    int i;
-    ma_uint32 riceParamMask;
-    ma_int32* pDecodedSamples    = pSamplesOut;
-    ma_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
-    ma_uint32 zeroCountParts0 = 0;
-    ma_uint32 zeroCountParts1 = 0;
-    ma_uint32 zeroCountParts2 = 0;
-    ma_uint32 zeroCountParts3 = 0;
-    ma_uint32 riceParamParts0 = 0;
-    ma_uint32 riceParamParts1 = 0;
-    ma_uint32 riceParamParts2 = 0;
-    ma_uint32 riceParamParts3 = 0;
-    __m128i coefficients128_0;
-    __m128i coefficients128_4;
-    __m128i coefficients128_8;
-    __m128i samples128_0;
-    __m128i samples128_4;
-    __m128i samples128_8;
-    __m128i riceParamMask128;
-    const ma_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
-    riceParamMask    = (ma_uint32)~((~0UL) << riceParam);
-    riceParamMask128 = _mm_set1_epi32(riceParamMask);
-    coefficients128_0 = _mm_setzero_si128();
-    coefficients128_4 = _mm_setzero_si128();
-    coefficients128_8 = _mm_setzero_si128();
-    samples128_0 = _mm_setzero_si128();
-    samples128_4 = _mm_setzero_si128();
-    samples128_8 = _mm_setzero_si128();
-#if 1
-    {
-        int runningOrder = order;
-        if (runningOrder >= 4) {
-            coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
-            samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
-                case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
-                case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
-            }
-            runningOrder = 0;
-        }
-        if (runningOrder >= 4) {
-            coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
-            samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
-                case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
-                case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
-            }
-            runningOrder = 0;
-        }
-        if (runningOrder == 4) {
-            coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
-            samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
-                case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
-                case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
-            }
-            runningOrder = 0;
-        }
-        coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
-        coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
-        coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
-    }
-#else
-    switch (order)
-    {
-    case 12: ((ma_int32*)&coefficients128_8)[0] = coefficients[11]; ((ma_int32*)&samples128_8)[0] = pDecodedSamples[-12];
-    case 11: ((ma_int32*)&coefficients128_8)[1] = coefficients[10]; ((ma_int32*)&samples128_8)[1] = pDecodedSamples[-11];
-    case 10: ((ma_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((ma_int32*)&samples128_8)[2] = pDecodedSamples[-10];
-    case 9:  ((ma_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((ma_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
-    case 8:  ((ma_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((ma_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
-    case 7:  ((ma_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((ma_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
-    case 6:  ((ma_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((ma_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
-    case 5:  ((ma_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((ma_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
-    case 4:  ((ma_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((ma_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
-    case 3:  ((ma_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((ma_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
-    case 2:  ((ma_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((ma_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
-    case 1:  ((ma_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((ma_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
-    }
-#endif
-    while (pDecodedSamples < pDecodedSamplesEnd) {
-        __m128i prediction128;
-        __m128i zeroCountPart128;
-        __m128i riceParamPart128;
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
-            return MA_FALSE;
-        }
-        zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
-        riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
-        riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
-        riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
-        riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(ma_dr_flac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(0x01))), _mm_set1_epi32(0x01)));
-        if (order <= 4) {
-            for (i = 0; i < 4; i += 1) {
-                prediction128 = _mm_mullo_epi32(coefficients128_0, samples128_0);
-                prediction128 = ma_dr_flac__mm_hadd_epi32(prediction128);
-                prediction128 = _mm_srai_epi32(prediction128, shift);
-                prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
-                samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
-                riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
-            }
-        } else if (order <= 8) {
-            for (i = 0; i < 4; i += 1) {
-                prediction128 =                              _mm_mullo_epi32(coefficients128_4, samples128_4);
-                prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
-                prediction128 = ma_dr_flac__mm_hadd_epi32(prediction128);
-                prediction128 = _mm_srai_epi32(prediction128, shift);
-                prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
-                samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
-                samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
-                riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
-            }
-        } else {
-            for (i = 0; i < 4; i += 1) {
-                prediction128 =                              _mm_mullo_epi32(coefficients128_8, samples128_8);
-                prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_4, samples128_4));
-                prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
-                prediction128 = ma_dr_flac__mm_hadd_epi32(prediction128);
-                prediction128 = _mm_srai_epi32(prediction128, shift);
-                prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
-                samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
-                samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
-                samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
-                riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
-            }
-        }
-        _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
-        pDecodedSamples += 4;
-    }
-    i = (count & ~3);
-    while (i < (int)count) {
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
-            return MA_FALSE;
-        }
-        riceParamParts0 &= riceParamMask;
-        riceParamParts0 |= (zeroCountParts0 << riceParam);
-        riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
-        pDecodedSamples[0] = riceParamParts0 + ma_dr_flac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
-        i += 1;
-        pDecodedSamples += 1;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__sse41_64(ma_dr_flac_bs* bs, ma_uint32 count, ma_uint8 riceParam, ma_uint32 order, ma_int32 shift, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    int i;
-    ma_uint32 riceParamMask;
-    ma_int32* pDecodedSamples    = pSamplesOut;
-    ma_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
-    ma_uint32 zeroCountParts0 = 0;
-    ma_uint32 zeroCountParts1 = 0;
-    ma_uint32 zeroCountParts2 = 0;
-    ma_uint32 zeroCountParts3 = 0;
-    ma_uint32 riceParamParts0 = 0;
-    ma_uint32 riceParamParts1 = 0;
-    ma_uint32 riceParamParts2 = 0;
-    ma_uint32 riceParamParts3 = 0;
-    __m128i coefficients128_0;
-    __m128i coefficients128_4;
-    __m128i coefficients128_8;
-    __m128i samples128_0;
-    __m128i samples128_4;
-    __m128i samples128_8;
-    __m128i prediction128;
-    __m128i riceParamMask128;
-    const ma_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
-    MA_DR_FLAC_ASSERT(order <= 12);
-    riceParamMask    = (ma_uint32)~((~0UL) << riceParam);
-    riceParamMask128 = _mm_set1_epi32(riceParamMask);
-    prediction128 = _mm_setzero_si128();
-    coefficients128_0  = _mm_setzero_si128();
-    coefficients128_4  = _mm_setzero_si128();
-    coefficients128_8  = _mm_setzero_si128();
-    samples128_0  = _mm_setzero_si128();
-    samples128_4  = _mm_setzero_si128();
-    samples128_8  = _mm_setzero_si128();
-#if 1
-    {
-        int runningOrder = order;
-        if (runningOrder >= 4) {
-            coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
-            samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
-                case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
-                case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
-            }
-            runningOrder = 0;
-        }
-        if (runningOrder >= 4) {
-            coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
-            samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
-                case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
-                case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
-            }
-            runningOrder = 0;
-        }
-        if (runningOrder == 4) {
-            coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
-            samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
-                case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
-                case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
-            }
-            runningOrder = 0;
-        }
-        coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
-        coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
-        coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
-    }
-#else
-    switch (order)
-    {
-    case 12: ((ma_int32*)&coefficients128_8)[0] = coefficients[11]; ((ma_int32*)&samples128_8)[0] = pDecodedSamples[-12];
-    case 11: ((ma_int32*)&coefficients128_8)[1] = coefficients[10]; ((ma_int32*)&samples128_8)[1] = pDecodedSamples[-11];
-    case 10: ((ma_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((ma_int32*)&samples128_8)[2] = pDecodedSamples[-10];
-    case 9:  ((ma_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((ma_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
-    case 8:  ((ma_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((ma_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
-    case 7:  ((ma_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((ma_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
-    case 6:  ((ma_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((ma_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
-    case 5:  ((ma_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((ma_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
-    case 4:  ((ma_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((ma_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
-    case 3:  ((ma_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((ma_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
-    case 2:  ((ma_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((ma_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
-    case 1:  ((ma_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((ma_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
-    }
-#endif
-    while (pDecodedSamples < pDecodedSamplesEnd) {
-        __m128i zeroCountPart128;
-        __m128i riceParamPart128;
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
-            return MA_FALSE;
-        }
-        zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
-        riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
-        riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
-        riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
-        riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(ma_dr_flac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(1))), _mm_set1_epi32(1)));
-        for (i = 0; i < 4; i += 1) {
-            prediction128 = _mm_xor_si128(prediction128, prediction128);
-            switch (order)
-            {
-            case 12:
-            case 11: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(1, 1, 0, 0))));
-            case 10:
-            case  9: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(3, 3, 2, 2))));
-            case  8:
-            case  7: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(1, 1, 0, 0))));
-            case  6:
-            case  5: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(3, 3, 2, 2))));
-            case  4:
-            case  3: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(1, 1, 0, 0))));
-            case  2:
-            case  1: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(3, 3, 2, 2))));
-            }
-            prediction128 = ma_dr_flac__mm_hadd_epi64(prediction128);
-            prediction128 = ma_dr_flac__mm_srai_epi64(prediction128, shift);
-            prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
-            samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
-            samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
-            samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
-            riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
-        }
-        _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
-        pDecodedSamples += 4;
-    }
-    i = (count & ~3);
-    while (i < (int)count) {
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
-            return MA_FALSE;
-        }
-        riceParamParts0 &= riceParamMask;
-        riceParamParts0 |= (zeroCountParts0 << riceParam);
-        riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
-        pDecodedSamples[0] = riceParamParts0 + ma_dr_flac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
-        i += 1;
-        pDecodedSamples += 1;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__sse41(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 count, ma_uint8 riceParam, ma_uint32 lpcOrder, ma_int32 lpcShift, ma_uint32 lpcPrecision, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pSamplesOut != NULL);
-    if (lpcOrder > 0 && lpcOrder <= 12) {
-        if (ma_dr_flac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
-            return ma_dr_flac__decode_samples_with_residual__rice__sse41_64(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
-        } else {
-            return ma_dr_flac__decode_samples_with_residual__rice__sse41_32(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
-        }
-    } else {
-        return ma_dr_flac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac__vst2q_s32(ma_int32* p, int32x4x2_t x)
-{
-    vst1q_s32(p+0, x.val[0]);
-    vst1q_s32(p+4, x.val[1]);
-}
-static MA_INLINE void ma_dr_flac__vst2q_u32(ma_uint32* p, uint32x4x2_t x)
-{
-    vst1q_u32(p+0, x.val[0]);
-    vst1q_u32(p+4, x.val[1]);
-}
-static MA_INLINE void ma_dr_flac__vst2q_f32(float* p, float32x4x2_t x)
-{
-    vst1q_f32(p+0, x.val[0]);
-    vst1q_f32(p+4, x.val[1]);
-}
-static MA_INLINE void ma_dr_flac__vst2q_s16(ma_int16* p, int16x4x2_t x)
-{
-    vst1q_s16(p, vcombine_s16(x.val[0], x.val[1]));
-}
-static MA_INLINE void ma_dr_flac__vst2q_u16(ma_uint16* p, uint16x4x2_t x)
-{
-    vst1q_u16(p, vcombine_u16(x.val[0], x.val[1]));
-}
-static MA_INLINE int32x4_t ma_dr_flac__vdupq_n_s32x4(ma_int32 x3, ma_int32 x2, ma_int32 x1, ma_int32 x0)
-{
-    ma_int32 x[4];
-    x[3] = x3;
-    x[2] = x2;
-    x[1] = x1;
-    x[0] = x0;
-    return vld1q_s32(x);
-}
-static MA_INLINE int32x4_t ma_dr_flac__valignrq_s32_1(int32x4_t a, int32x4_t b)
-{
-    return vextq_s32(b, a, 1);
-}
-static MA_INLINE uint32x4_t ma_dr_flac__valignrq_u32_1(uint32x4_t a, uint32x4_t b)
-{
-    return vextq_u32(b, a, 1);
-}
-static MA_INLINE int32x2_t ma_dr_flac__vhaddq_s32(int32x4_t x)
-{
-    int32x2_t r = vadd_s32(vget_high_s32(x), vget_low_s32(x));
-    return vpadd_s32(r, r);
-}
-static MA_INLINE int64x1_t ma_dr_flac__vhaddq_s64(int64x2_t x)
-{
-    return vadd_s64(vget_high_s64(x), vget_low_s64(x));
-}
-static MA_INLINE int32x4_t ma_dr_flac__vrevq_s32(int32x4_t x)
-{
-    return vrev64q_s32(vcombine_s32(vget_high_s32(x), vget_low_s32(x)));
-}
-static MA_INLINE int32x4_t ma_dr_flac__vnotq_s32(int32x4_t x)
-{
-    return veorq_s32(x, vdupq_n_s32(0xFFFFFFFF));
-}
-static MA_INLINE uint32x4_t ma_dr_flac__vnotq_u32(uint32x4_t x)
-{
-    return veorq_u32(x, vdupq_n_u32(0xFFFFFFFF));
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__neon_32(ma_dr_flac_bs* bs, ma_uint32 count, ma_uint8 riceParam, ma_uint32 order, ma_int32 shift, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    int i;
-    ma_uint32 riceParamMask;
-    ma_int32* pDecodedSamples    = pSamplesOut;
-    ma_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
-    ma_uint32 zeroCountParts[4];
-    ma_uint32 riceParamParts[4];
-    int32x4_t coefficients128_0;
-    int32x4_t coefficients128_4;
-    int32x4_t coefficients128_8;
-    int32x4_t samples128_0;
-    int32x4_t samples128_4;
-    int32x4_t samples128_8;
-    uint32x4_t riceParamMask128;
-    int32x4_t riceParam128;
-    int32x2_t shift64;
-    uint32x4_t one128;
-    const ma_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
-    riceParamMask    = (ma_uint32)~((~0UL) << riceParam);
-    riceParamMask128 = vdupq_n_u32(riceParamMask);
-    riceParam128 = vdupq_n_s32(riceParam);
-    shift64 = vdup_n_s32(-shift);
-    one128 = vdupq_n_u32(1);
-    {
-        int runningOrder = order;
-        ma_int32 tempC[4] = {0, 0, 0, 0};
-        ma_int32 tempS[4] = {0, 0, 0, 0};
-        if (runningOrder >= 4) {
-            coefficients128_0 = vld1q_s32(coefficients + 0);
-            samples128_0      = vld1q_s32(pSamplesOut  - 4);
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3];
-                case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2];
-                case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1];
-            }
-            coefficients128_0 = vld1q_s32(tempC);
-            samples128_0      = vld1q_s32(tempS);
-            runningOrder = 0;
-        }
-        if (runningOrder >= 4) {
-            coefficients128_4 = vld1q_s32(coefficients + 4);
-            samples128_4      = vld1q_s32(pSamplesOut  - 8);
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7];
-                case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6];
-                case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5];
-            }
-            coefficients128_4 = vld1q_s32(tempC);
-            samples128_4      = vld1q_s32(tempS);
-            runningOrder = 0;
-        }
-        if (runningOrder == 4) {
-            coefficients128_8 = vld1q_s32(coefficients + 8);
-            samples128_8      = vld1q_s32(pSamplesOut  - 12);
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11];
-                case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10];
-                case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9];
-            }
-            coefficients128_8 = vld1q_s32(tempC);
-            samples128_8      = vld1q_s32(tempS);
-            runningOrder = 0;
-        }
-        coefficients128_0 = ma_dr_flac__vrevq_s32(coefficients128_0);
-        coefficients128_4 = ma_dr_flac__vrevq_s32(coefficients128_4);
-        coefficients128_8 = ma_dr_flac__vrevq_s32(coefficients128_8);
-    }
-    while (pDecodedSamples < pDecodedSamplesEnd) {
-        int32x4_t prediction128;
-        int32x2_t prediction64;
-        uint32x4_t zeroCountPart128;
-        uint32x4_t riceParamPart128;
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
-            return MA_FALSE;
-        }
-        zeroCountPart128 = vld1q_u32(zeroCountParts);
-        riceParamPart128 = vld1q_u32(riceParamParts);
-        riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
-        riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
-        riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(ma_dr_flac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
-        if (order <= 4) {
-            for (i = 0; i < 4; i += 1) {
-                prediction128 = vmulq_s32(coefficients128_0, samples128_0);
-                prediction64 = ma_dr_flac__vhaddq_s32(prediction128);
-                prediction64 = vshl_s32(prediction64, shift64);
-                prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
-                samples128_0 = ma_dr_flac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
-                riceParamPart128 = ma_dr_flac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
-            }
-        } else if (order <= 8) {
-            for (i = 0; i < 4; i += 1) {
-                prediction128 =                vmulq_s32(coefficients128_4, samples128_4);
-                prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
-                prediction64 = ma_dr_flac__vhaddq_s32(prediction128);
-                prediction64 = vshl_s32(prediction64, shift64);
-                prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
-                samples128_4 = ma_dr_flac__valignrq_s32_1(samples128_0, samples128_4);
-                samples128_0 = ma_dr_flac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
-                riceParamPart128 = ma_dr_flac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
-            }
-        } else {
-            for (i = 0; i < 4; i += 1) {
-                prediction128 =                vmulq_s32(coefficients128_8, samples128_8);
-                prediction128 = vmlaq_s32(prediction128, coefficients128_4, samples128_4);
-                prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
-                prediction64 = ma_dr_flac__vhaddq_s32(prediction128);
-                prediction64 = vshl_s32(prediction64, shift64);
-                prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
-                samples128_8 = ma_dr_flac__valignrq_s32_1(samples128_4, samples128_8);
-                samples128_4 = ma_dr_flac__valignrq_s32_1(samples128_0, samples128_4);
-                samples128_0 = ma_dr_flac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
-                riceParamPart128 = ma_dr_flac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
-            }
-        }
-        vst1q_s32(pDecodedSamples, samples128_0);
-        pDecodedSamples += 4;
-    }
-    i = (count & ~3);
-    while (i < (int)count) {
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
-            return MA_FALSE;
-        }
-        riceParamParts[0] &= riceParamMask;
-        riceParamParts[0] |= (zeroCountParts[0] << riceParam);
-        riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
-        pDecodedSamples[0] = riceParamParts[0] + ma_dr_flac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
-        i += 1;
-        pDecodedSamples += 1;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__neon_64(ma_dr_flac_bs* bs, ma_uint32 count, ma_uint8 riceParam, ma_uint32 order, ma_int32 shift, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    int i;
-    ma_uint32 riceParamMask;
-    ma_int32* pDecodedSamples    = pSamplesOut;
-    ma_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
-    ma_uint32 zeroCountParts[4];
-    ma_uint32 riceParamParts[4];
-    int32x4_t coefficients128_0;
-    int32x4_t coefficients128_4;
-    int32x4_t coefficients128_8;
-    int32x4_t samples128_0;
-    int32x4_t samples128_4;
-    int32x4_t samples128_8;
-    uint32x4_t riceParamMask128;
-    int32x4_t riceParam128;
-    int64x1_t shift64;
-    uint32x4_t one128;
-    int64x2_t prediction128 = { 0 };
-    uint32x4_t zeroCountPart128;
-    uint32x4_t riceParamPart128;
-    const ma_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
-    riceParamMask    = (ma_uint32)~((~0UL) << riceParam);
-    riceParamMask128 = vdupq_n_u32(riceParamMask);
-    riceParam128 = vdupq_n_s32(riceParam);
-    shift64 = vdup_n_s64(-shift);
-    one128 = vdupq_n_u32(1);
-    {
-        int runningOrder = order;
-        ma_int32 tempC[4] = {0, 0, 0, 0};
-        ma_int32 tempS[4] = {0, 0, 0, 0};
-        if (runningOrder >= 4) {
-            coefficients128_0 = vld1q_s32(coefficients + 0);
-            samples128_0      = vld1q_s32(pSamplesOut  - 4);
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3];
-                case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2];
-                case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1];
-            }
-            coefficients128_0 = vld1q_s32(tempC);
-            samples128_0      = vld1q_s32(tempS);
-            runningOrder = 0;
-        }
-        if (runningOrder >= 4) {
-            coefficients128_4 = vld1q_s32(coefficients + 4);
-            samples128_4      = vld1q_s32(pSamplesOut  - 8);
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7];
-                case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6];
-                case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5];
-            }
-            coefficients128_4 = vld1q_s32(tempC);
-            samples128_4      = vld1q_s32(tempS);
-            runningOrder = 0;
-        }
-        if (runningOrder == 4) {
-            coefficients128_8 = vld1q_s32(coefficients + 8);
-            samples128_8      = vld1q_s32(pSamplesOut  - 12);
-            runningOrder -= 4;
-        } else {
-            switch (runningOrder) {
-                case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11];
-                case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10];
-                case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9];
-            }
-            coefficients128_8 = vld1q_s32(tempC);
-            samples128_8      = vld1q_s32(tempS);
-            runningOrder = 0;
-        }
-        coefficients128_0 = ma_dr_flac__vrevq_s32(coefficients128_0);
-        coefficients128_4 = ma_dr_flac__vrevq_s32(coefficients128_4);
-        coefficients128_8 = ma_dr_flac__vrevq_s32(coefficients128_8);
-    }
-    while (pDecodedSamples < pDecodedSamplesEnd) {
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
-            !ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
-            return MA_FALSE;
-        }
-        zeroCountPart128 = vld1q_u32(zeroCountParts);
-        riceParamPart128 = vld1q_u32(riceParamParts);
-        riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
-        riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
-        riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(ma_dr_flac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
-        for (i = 0; i < 4; i += 1) {
-            int64x1_t prediction64;
-            prediction128 = veorq_s64(prediction128, prediction128);
-            switch (order)
-            {
-            case 12:
-            case 11: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_8), vget_low_s32(samples128_8)));
-            case 10:
-            case  9: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_8), vget_high_s32(samples128_8)));
-            case  8:
-            case  7: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_4), vget_low_s32(samples128_4)));
-            case  6:
-            case  5: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_4), vget_high_s32(samples128_4)));
-            case  4:
-            case  3: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_0), vget_low_s32(samples128_0)));
-            case  2:
-            case  1: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_0), vget_high_s32(samples128_0)));
-            }
-            prediction64 = ma_dr_flac__vhaddq_s64(prediction128);
-            prediction64 = vshl_s64(prediction64, shift64);
-            prediction64 = vadd_s64(prediction64, vdup_n_s64(vgetq_lane_u32(riceParamPart128, 0)));
-            samples128_8 = ma_dr_flac__valignrq_s32_1(samples128_4, samples128_8);
-            samples128_4 = ma_dr_flac__valignrq_s32_1(samples128_0, samples128_4);
-            samples128_0 = ma_dr_flac__valignrq_s32_1(vcombine_s32(vreinterpret_s32_s64(prediction64), vdup_n_s32(0)), samples128_0);
-            riceParamPart128 = ma_dr_flac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
-        }
-        vst1q_s32(pDecodedSamples, samples128_0);
-        pDecodedSamples += 4;
-    }
-    i = (count & ~3);
-    while (i < (int)count) {
-        if (!ma_dr_flac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
-            return MA_FALSE;
-        }
-        riceParamParts[0] &= riceParamMask;
-        riceParamParts[0] |= (zeroCountParts[0] << riceParam);
-        riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
-        pDecodedSamples[0] = riceParamParts[0] + ma_dr_flac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
-        i += 1;
-        pDecodedSamples += 1;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice__neon(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 count, ma_uint8 riceParam, ma_uint32 lpcOrder, ma_int32 lpcShift, ma_uint32 lpcPrecision, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(pSamplesOut != NULL);
-    if (lpcOrder > 0 && lpcOrder <= 12) {
-        if (ma_dr_flac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
-            return ma_dr_flac__decode_samples_with_residual__rice__neon_64(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
-        } else {
-            return ma_dr_flac__decode_samples_with_residual__rice__neon_32(bs, count, riceParam, lpcOrder, lpcShift, coefficients, pSamplesOut);
-        }
-    } else {
-        return ma_dr_flac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
-    }
-}
-#endif
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__rice(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 count, ma_uint8 riceParam, ma_uint32 lpcOrder, ma_int32 lpcShift, ma_uint32 lpcPrecision, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE41)
-    if (ma_dr_flac__gIsSSE41Supported) {
-        return ma_dr_flac__decode_samples_with_residual__rice__sse41(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported) {
-        return ma_dr_flac__decode_samples_with_residual__rice__neon(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
-    } else
-#endif
-    {
-    #if 0
-        return ma_dr_flac__decode_samples_with_residual__rice__reference(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
-    #else
-        return ma_dr_flac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pSamplesOut);
-    #endif
-    }
-}
-static ma_bool32 ma_dr_flac__read_and_seek_residual__rice(ma_dr_flac_bs* bs, ma_uint32 count, ma_uint8 riceParam)
-{
-    ma_uint32 i;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    for (i = 0; i < count; ++i) {
-        if (!ma_dr_flac__seek_rice_parts(bs, riceParam)) {
-            return MA_FALSE;
-        }
-    }
-    return MA_TRUE;
-}
-#if defined(__clang__)
-__attribute__((no_sanitize("signed-integer-overflow")))
-#endif
-static ma_bool32 ma_dr_flac__decode_samples_with_residual__unencoded(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 count, ma_uint8 unencodedBitsPerSample, ma_uint32 lpcOrder, ma_int32 lpcShift, ma_uint32 lpcPrecision, const ma_int32* coefficients, ma_int32* pSamplesOut)
-{
-    ma_uint32 i;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(unencodedBitsPerSample <= 31);
-    MA_DR_FLAC_ASSERT(pSamplesOut != NULL);
-    for (i = 0; i < count; ++i) {
-        if (unencodedBitsPerSample > 0) {
-            if (!ma_dr_flac__read_int32(bs, unencodedBitsPerSample, pSamplesOut + i)) {
-                return MA_FALSE;
-            }
-        } else {
-            pSamplesOut[i] = 0;
-        }
-        if (ma_dr_flac__use_64_bit_prediction(bitsPerSample, lpcOrder, lpcPrecision)) {
-            pSamplesOut[i] += ma_dr_flac__calculate_prediction_64(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
-        } else {
-            pSamplesOut[i] += ma_dr_flac__calculate_prediction_32(lpcOrder, lpcShift, coefficients, pSamplesOut + i);
-        }
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples_with_residual(ma_dr_flac_bs* bs, ma_uint32 bitsPerSample, ma_uint32 blockSize, ma_uint32 lpcOrder, ma_int32 lpcShift, ma_uint32 lpcPrecision, const ma_int32* coefficients, ma_int32* pDecodedSamples)
-{
-    ma_uint8 residualMethod;
-    ma_uint8 partitionOrder;
-    ma_uint32 samplesInPartition;
-    ma_uint32 partitionsRemaining;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(blockSize != 0);
-    MA_DR_FLAC_ASSERT(pDecodedSamples != NULL);
-    if (!ma_dr_flac__read_uint8(bs, 2, &residualMethod)) {
-        return MA_FALSE;
-    }
-    if (residualMethod != MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-        return MA_FALSE;
-    }
-    pDecodedSamples += lpcOrder;
-    if (!ma_dr_flac__read_uint8(bs, 4, &partitionOrder)) {
-        return MA_FALSE;
-    }
-    if (partitionOrder > 8) {
-        return MA_FALSE;
-    }
-    if ((blockSize / (1 << partitionOrder)) < lpcOrder) {
-        return MA_FALSE;
-    }
-    samplesInPartition = (blockSize / (1 << partitionOrder)) - lpcOrder;
-    partitionsRemaining = (1 << partitionOrder);
-    for (;;) {
-        ma_uint8 riceParam = 0;
-        if (residualMethod == MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
-            if (!ma_dr_flac__read_uint8(bs, 4, &riceParam)) {
-                return MA_FALSE;
-            }
-            if (riceParam == 15) {
-                riceParam = 0xFF;
-            }
-        } else if (residualMethod == MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-            if (!ma_dr_flac__read_uint8(bs, 5, &riceParam)) {
-                return MA_FALSE;
-            }
-            if (riceParam == 31) {
-                riceParam = 0xFF;
-            }
-        }
-        if (riceParam != 0xFF) {
-            if (!ma_dr_flac__decode_samples_with_residual__rice(bs, bitsPerSample, samplesInPartition, riceParam, lpcOrder, lpcShift, lpcPrecision, coefficients, pDecodedSamples)) {
-                return MA_FALSE;
-            }
-        } else {
-            ma_uint8 unencodedBitsPerSample = 0;
-            if (!ma_dr_flac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
-                return MA_FALSE;
-            }
-            if (!ma_dr_flac__decode_samples_with_residual__unencoded(bs, bitsPerSample, samplesInPartition, unencodedBitsPerSample, lpcOrder, lpcShift, lpcPrecision, coefficients, pDecodedSamples)) {
-                return MA_FALSE;
-            }
-        }
-        pDecodedSamples += samplesInPartition;
-        if (partitionsRemaining == 1) {
-            break;
-        }
-        partitionsRemaining -= 1;
-        if (partitionOrder != 0) {
-            samplesInPartition = blockSize / (1 << partitionOrder);
-        }
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__read_and_seek_residual(ma_dr_flac_bs* bs, ma_uint32 blockSize, ma_uint32 order)
-{
-    ma_uint8 residualMethod;
-    ma_uint8 partitionOrder;
-    ma_uint32 samplesInPartition;
-    ma_uint32 partitionsRemaining;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(blockSize != 0);
-    if (!ma_dr_flac__read_uint8(bs, 2, &residualMethod)) {
-        return MA_FALSE;
-    }
-    if (residualMethod != MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-        return MA_FALSE;
-    }
-    if (!ma_dr_flac__read_uint8(bs, 4, &partitionOrder)) {
-        return MA_FALSE;
-    }
-    if (partitionOrder > 8) {
-        return MA_FALSE;
-    }
-    if ((blockSize / (1 << partitionOrder)) <= order) {
-        return MA_FALSE;
-    }
-    samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
-    partitionsRemaining = (1 << partitionOrder);
-    for (;;)
-    {
-        ma_uint8 riceParam = 0;
-        if (residualMethod == MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
-            if (!ma_dr_flac__read_uint8(bs, 4, &riceParam)) {
-                return MA_FALSE;
-            }
-            if (riceParam == 15) {
-                riceParam = 0xFF;
-            }
-        } else if (residualMethod == MA_DR_FLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
-            if (!ma_dr_flac__read_uint8(bs, 5, &riceParam)) {
-                return MA_FALSE;
-            }
-            if (riceParam == 31) {
-                riceParam = 0xFF;
-            }
-        }
-        if (riceParam != 0xFF) {
-            if (!ma_dr_flac__read_and_seek_residual__rice(bs, samplesInPartition, riceParam)) {
-                return MA_FALSE;
-            }
-        } else {
-            ma_uint8 unencodedBitsPerSample = 0;
-            if (!ma_dr_flac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
-                return MA_FALSE;
-            }
-            if (!ma_dr_flac__seek_bits(bs, unencodedBitsPerSample * samplesInPartition)) {
-                return MA_FALSE;
-            }
-        }
-        if (partitionsRemaining == 1) {
-            break;
-        }
-        partitionsRemaining -= 1;
-        samplesInPartition = blockSize / (1 << partitionOrder);
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples__constant(ma_dr_flac_bs* bs, ma_uint32 blockSize, ma_uint32 subframeBitsPerSample, ma_int32* pDecodedSamples)
-{
-    ma_uint32 i;
-    ma_int32 sample;
-    if (!ma_dr_flac__read_int32(bs, subframeBitsPerSample, &sample)) {
-        return MA_FALSE;
-    }
-    for (i = 0; i < blockSize; ++i) {
-        pDecodedSamples[i] = sample;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples__verbatim(ma_dr_flac_bs* bs, ma_uint32 blockSize, ma_uint32 subframeBitsPerSample, ma_int32* pDecodedSamples)
-{
-    ma_uint32 i;
-    for (i = 0; i < blockSize; ++i) {
-        ma_int32 sample;
-        if (!ma_dr_flac__read_int32(bs, subframeBitsPerSample, &sample)) {
-            return MA_FALSE;
-        }
-        pDecodedSamples[i] = sample;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples__fixed(ma_dr_flac_bs* bs, ma_uint32 blockSize, ma_uint32 subframeBitsPerSample, ma_uint8 lpcOrder, ma_int32* pDecodedSamples)
-{
-    ma_uint32 i;
-    static ma_int32 lpcCoefficientsTable[5][4] = {
-        {0,  0, 0,  0},
-        {1,  0, 0,  0},
-        {2, -1, 0,  0},
-        {3, -3, 1,  0},
-        {4, -6, 4, -1}
-    };
-    for (i = 0; i < lpcOrder; ++i) {
-        ma_int32 sample;
-        if (!ma_dr_flac__read_int32(bs, subframeBitsPerSample, &sample)) {
-            return MA_FALSE;
-        }
-        pDecodedSamples[i] = sample;
-    }
-    if (!ma_dr_flac__decode_samples_with_residual(bs, subframeBitsPerSample, blockSize, lpcOrder, 0, 4, lpcCoefficientsTable[lpcOrder], pDecodedSamples)) {
-        return MA_FALSE;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_samples__lpc(ma_dr_flac_bs* bs, ma_uint32 blockSize, ma_uint32 bitsPerSample, ma_uint8 lpcOrder, ma_int32* pDecodedSamples)
-{
-    ma_uint8 i;
-    ma_uint8 lpcPrecision;
-    ma_int8 lpcShift;
-    ma_int32 coefficients[32];
-    for (i = 0; i < lpcOrder; ++i) {
-        ma_int32 sample;
-        if (!ma_dr_flac__read_int32(bs, bitsPerSample, &sample)) {
-            return MA_FALSE;
-        }
-        pDecodedSamples[i] = sample;
-    }
-    if (!ma_dr_flac__read_uint8(bs, 4, &lpcPrecision)) {
-        return MA_FALSE;
-    }
-    if (lpcPrecision == 15) {
-        return MA_FALSE;
-    }
-    lpcPrecision += 1;
-    if (!ma_dr_flac__read_int8(bs, 5, &lpcShift)) {
-        return MA_FALSE;
-    }
-    if (lpcShift < 0) {
-        return MA_FALSE;
-    }
-    MA_DR_FLAC_ZERO_MEMORY(coefficients, sizeof(coefficients));
-    for (i = 0; i < lpcOrder; ++i) {
-        if (!ma_dr_flac__read_int32(bs, lpcPrecision, coefficients + i)) {
-            return MA_FALSE;
-        }
-    }
-    if (!ma_dr_flac__decode_samples_with_residual(bs, bitsPerSample, blockSize, lpcOrder, lpcShift, lpcPrecision, coefficients, pDecodedSamples)) {
-        return MA_FALSE;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__read_next_flac_frame_header(ma_dr_flac_bs* bs, ma_uint8 streaminfoBitsPerSample, ma_dr_flac_frame_header* header)
-{
-    const ma_uint32 sampleRateTable[12]  = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
-    const ma_uint8 bitsPerSampleTable[8] = {0, 8, 12, (ma_uint8)-1, 16, 20, 24, (ma_uint8)-1};
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(header != NULL);
-    for (;;) {
-        ma_uint8 crc8 = 0xCE;
-        ma_uint8 reserved = 0;
-        ma_uint8 blockingStrategy = 0;
-        ma_uint8 blockSize = 0;
-        ma_uint8 sampleRate = 0;
-        ma_uint8 channelAssignment = 0;
-        ma_uint8 bitsPerSample = 0;
-        ma_bool32 isVariableBlockSize;
-        if (!ma_dr_flac__find_and_seek_to_next_sync_code(bs)) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_flac__read_uint8(bs, 1, &reserved)) {
-            return MA_FALSE;
-        }
-        if (reserved == 1) {
-            continue;
-        }
-        crc8 = ma_dr_flac_crc8(crc8, reserved, 1);
-        if (!ma_dr_flac__read_uint8(bs, 1, &blockingStrategy)) {
-            return MA_FALSE;
-        }
-        crc8 = ma_dr_flac_crc8(crc8, blockingStrategy, 1);
-        if (!ma_dr_flac__read_uint8(bs, 4, &blockSize)) {
-            return MA_FALSE;
-        }
-        if (blockSize == 0) {
-            continue;
-        }
-        crc8 = ma_dr_flac_crc8(crc8, blockSize, 4);
-        if (!ma_dr_flac__read_uint8(bs, 4, &sampleRate)) {
-            return MA_FALSE;
-        }
-        crc8 = ma_dr_flac_crc8(crc8, sampleRate, 4);
-        if (!ma_dr_flac__read_uint8(bs, 4, &channelAssignment)) {
-            return MA_FALSE;
-        }
-        if (channelAssignment > 10) {
-            continue;
-        }
-        crc8 = ma_dr_flac_crc8(crc8, channelAssignment, 4);
-        if (!ma_dr_flac__read_uint8(bs, 3, &bitsPerSample)) {
-            return MA_FALSE;
-        }
-        if (bitsPerSample == 3 || bitsPerSample == 7) {
-            continue;
-        }
-        crc8 = ma_dr_flac_crc8(crc8, bitsPerSample, 3);
-        if (!ma_dr_flac__read_uint8(bs, 1, &reserved)) {
-            return MA_FALSE;
-        }
-        if (reserved == 1) {
-            continue;
-        }
-        crc8 = ma_dr_flac_crc8(crc8, reserved, 1);
-        isVariableBlockSize = blockingStrategy == 1;
-        if (isVariableBlockSize) {
-            ma_uint64 pcmFrameNumber;
-            ma_result result = ma_dr_flac__read_utf8_coded_number(bs, &pcmFrameNumber, &crc8);
-            if (result != MA_SUCCESS) {
-                if (result == MA_AT_END) {
-                    return MA_FALSE;
-                } else {
-                    continue;
-                }
-            }
-            header->flacFrameNumber  = 0;
-            header->pcmFrameNumber = pcmFrameNumber;
-        } else {
-            ma_uint64 flacFrameNumber = 0;
-            ma_result result = ma_dr_flac__read_utf8_coded_number(bs, &flacFrameNumber, &crc8);
-            if (result != MA_SUCCESS) {
-                if (result == MA_AT_END) {
-                    return MA_FALSE;
-                } else {
-                    continue;
-                }
-            }
-            header->flacFrameNumber  = (ma_uint32)flacFrameNumber;
-            header->pcmFrameNumber = 0;
-        }
-        MA_DR_FLAC_ASSERT(blockSize > 0);
-        if (blockSize == 1) {
-            header->blockSizeInPCMFrames = 192;
-        } else if (blockSize <= 5) {
-            MA_DR_FLAC_ASSERT(blockSize >= 2);
-            header->blockSizeInPCMFrames = 576 * (1 << (blockSize - 2));
-        } else if (blockSize == 6) {
-            if (!ma_dr_flac__read_uint16(bs, 8, &header->blockSizeInPCMFrames)) {
-                return MA_FALSE;
-            }
-            crc8 = ma_dr_flac_crc8(crc8, header->blockSizeInPCMFrames, 8);
-            header->blockSizeInPCMFrames += 1;
-        } else if (blockSize == 7) {
-            if (!ma_dr_flac__read_uint16(bs, 16, &header->blockSizeInPCMFrames)) {
-                return MA_FALSE;
-            }
-            crc8 = ma_dr_flac_crc8(crc8, header->blockSizeInPCMFrames, 16);
-            if (header->blockSizeInPCMFrames == 0xFFFF) {
-                return MA_FALSE;
-            }
-            header->blockSizeInPCMFrames += 1;
-        } else {
-            MA_DR_FLAC_ASSERT(blockSize >= 8);
-            header->blockSizeInPCMFrames = 256 * (1 << (blockSize - 8));
-        }
-        if (sampleRate <= 11) {
-            header->sampleRate = sampleRateTable[sampleRate];
-        } else if (sampleRate == 12) {
-            if (!ma_dr_flac__read_uint32(bs, 8, &header->sampleRate)) {
-                return MA_FALSE;
-            }
-            crc8 = ma_dr_flac_crc8(crc8, header->sampleRate, 8);
-            header->sampleRate *= 1000;
-        } else if (sampleRate == 13) {
-            if (!ma_dr_flac__read_uint32(bs, 16, &header->sampleRate)) {
-                return MA_FALSE;
-            }
-            crc8 = ma_dr_flac_crc8(crc8, header->sampleRate, 16);
-        } else if (sampleRate == 14) {
-            if (!ma_dr_flac__read_uint32(bs, 16, &header->sampleRate)) {
-                return MA_FALSE;
-            }
-            crc8 = ma_dr_flac_crc8(crc8, header->sampleRate, 16);
-            header->sampleRate *= 10;
-        } else {
-            continue;
-        }
-        header->channelAssignment = channelAssignment;
-        header->bitsPerSample = bitsPerSampleTable[bitsPerSample];
-        if (header->bitsPerSample == 0) {
-            header->bitsPerSample = streaminfoBitsPerSample;
-        }
-        if (header->bitsPerSample != streaminfoBitsPerSample) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_flac__read_uint8(bs, 8, &header->crc8)) {
-            return MA_FALSE;
-        }
-#ifndef MA_DR_FLAC_NO_CRC
-        if (header->crc8 != crc8) {
-            continue;
-        }
-#endif
-        return MA_TRUE;
-    }
-}
-static ma_bool32 ma_dr_flac__read_subframe_header(ma_dr_flac_bs* bs, ma_dr_flac_subframe* pSubframe)
-{
-    ma_uint8 header;
-    int type;
-    if (!ma_dr_flac__read_uint8(bs, 8, &header)) {
-        return MA_FALSE;
-    }
-    if ((header & 0x80) != 0) {
-        return MA_FALSE;
-    }
-    type = (header & 0x7E) >> 1;
-    if (type == 0) {
-        pSubframe->subframeType = MA_DR_FLAC_SUBFRAME_CONSTANT;
-    } else if (type == 1) {
-        pSubframe->subframeType = MA_DR_FLAC_SUBFRAME_VERBATIM;
-    } else {
-        if ((type & 0x20) != 0) {
-            pSubframe->subframeType = MA_DR_FLAC_SUBFRAME_LPC;
-            pSubframe->lpcOrder = (ma_uint8)(type & 0x1F) + 1;
-        } else if ((type & 0x08) != 0) {
-            pSubframe->subframeType = MA_DR_FLAC_SUBFRAME_FIXED;
-            pSubframe->lpcOrder = (ma_uint8)(type & 0x07);
-            if (pSubframe->lpcOrder > 4) {
-                pSubframe->subframeType = MA_DR_FLAC_SUBFRAME_RESERVED;
-                pSubframe->lpcOrder = 0;
-            }
-        } else {
-            pSubframe->subframeType = MA_DR_FLAC_SUBFRAME_RESERVED;
-        }
-    }
-    if (pSubframe->subframeType == MA_DR_FLAC_SUBFRAME_RESERVED) {
-        return MA_FALSE;
-    }
-    pSubframe->wastedBitsPerSample = 0;
-    if ((header & 0x01) == 1) {
-        unsigned int wastedBitsPerSample;
-        if (!ma_dr_flac__seek_past_next_set_bit(bs, &wastedBitsPerSample)) {
-            return MA_FALSE;
-        }
-        pSubframe->wastedBitsPerSample = (ma_uint8)wastedBitsPerSample + 1;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_subframe(ma_dr_flac_bs* bs, ma_dr_flac_frame* frame, int subframeIndex, ma_int32* pDecodedSamplesOut)
-{
-    ma_dr_flac_subframe* pSubframe;
-    ma_uint32 subframeBitsPerSample;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(frame != NULL);
-    pSubframe = frame->subframes + subframeIndex;
-    if (!ma_dr_flac__read_subframe_header(bs, pSubframe)) {
-        return MA_FALSE;
-    }
-    subframeBitsPerSample = frame->header.bitsPerSample;
-    if ((frame->header.channelAssignment == MA_DR_FLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == MA_DR_FLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
-        subframeBitsPerSample += 1;
-    } else if (frame->header.channelAssignment == MA_DR_FLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
-        subframeBitsPerSample += 1;
-    }
-    if (subframeBitsPerSample > 32) {
-        return MA_FALSE;
-    }
-    if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
-        return MA_FALSE;
-    }
-    subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
-    pSubframe->pSamplesS32 = pDecodedSamplesOut;
-    switch (pSubframe->subframeType)
-    {
-        case MA_DR_FLAC_SUBFRAME_CONSTANT:
-        {
-            ma_dr_flac__decode_samples__constant(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
-        } break;
-        case MA_DR_FLAC_SUBFRAME_VERBATIM:
-        {
-            ma_dr_flac__decode_samples__verbatim(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
-        } break;
-        case MA_DR_FLAC_SUBFRAME_FIXED:
-        {
-            ma_dr_flac__decode_samples__fixed(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
-        } break;
-        case MA_DR_FLAC_SUBFRAME_LPC:
-        {
-            ma_dr_flac__decode_samples__lpc(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
-        } break;
-        default: return MA_FALSE;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__seek_subframe(ma_dr_flac_bs* bs, ma_dr_flac_frame* frame, int subframeIndex)
-{
-    ma_dr_flac_subframe* pSubframe;
-    ma_uint32 subframeBitsPerSample;
-    MA_DR_FLAC_ASSERT(bs != NULL);
-    MA_DR_FLAC_ASSERT(frame != NULL);
-    pSubframe = frame->subframes + subframeIndex;
-    if (!ma_dr_flac__read_subframe_header(bs, pSubframe)) {
-        return MA_FALSE;
-    }
-    subframeBitsPerSample = frame->header.bitsPerSample;
-    if ((frame->header.channelAssignment == MA_DR_FLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == MA_DR_FLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
-        subframeBitsPerSample += 1;
-    } else if (frame->header.channelAssignment == MA_DR_FLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
-        subframeBitsPerSample += 1;
-    }
-    if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
-        return MA_FALSE;
-    }
-    subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
-    pSubframe->pSamplesS32 = NULL;
-    switch (pSubframe->subframeType)
-    {
-        case MA_DR_FLAC_SUBFRAME_CONSTANT:
-        {
-            if (!ma_dr_flac__seek_bits(bs, subframeBitsPerSample)) {
-                return MA_FALSE;
-            }
-        } break;
-        case MA_DR_FLAC_SUBFRAME_VERBATIM:
-        {
-            unsigned int bitsToSeek = frame->header.blockSizeInPCMFrames * subframeBitsPerSample;
-            if (!ma_dr_flac__seek_bits(bs, bitsToSeek)) {
-                return MA_FALSE;
-            }
-        } break;
-        case MA_DR_FLAC_SUBFRAME_FIXED:
-        {
-            unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
-            if (!ma_dr_flac__seek_bits(bs, bitsToSeek)) {
-                return MA_FALSE;
-            }
-            if (!ma_dr_flac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
-                return MA_FALSE;
-            }
-        } break;
-        case MA_DR_FLAC_SUBFRAME_LPC:
-        {
-            ma_uint8 lpcPrecision;
-            unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
-            if (!ma_dr_flac__seek_bits(bs, bitsToSeek)) {
-                return MA_FALSE;
-            }
-            if (!ma_dr_flac__read_uint8(bs, 4, &lpcPrecision)) {
-                return MA_FALSE;
-            }
-            if (lpcPrecision == 15) {
-                return MA_FALSE;
-            }
-            lpcPrecision += 1;
-            bitsToSeek = (pSubframe->lpcOrder * lpcPrecision) + 5;
-            if (!ma_dr_flac__seek_bits(bs, bitsToSeek)) {
-                return MA_FALSE;
-            }
-            if (!ma_dr_flac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
-                return MA_FALSE;
-            }
-        } break;
-        default: return MA_FALSE;
-    }
-    return MA_TRUE;
-}
-static MA_INLINE ma_uint8 ma_dr_flac__get_channel_count_from_channel_assignment(ma_int8 channelAssignment)
-{
-    ma_uint8 lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
-    MA_DR_FLAC_ASSERT(channelAssignment <= 10);
-    return lookup[channelAssignment];
-}
-static ma_result ma_dr_flac__decode_flac_frame(ma_dr_flac* pFlac)
-{
-    int channelCount;
-    int i;
-    ma_uint8 paddingSizeInBits;
-    ma_uint16 desiredCRC16;
-#ifndef MA_DR_FLAC_NO_CRC
-    ma_uint16 actualCRC16;
-#endif
-    MA_DR_FLAC_ZERO_MEMORY(pFlac->currentFLACFrame.subframes, sizeof(pFlac->currentFLACFrame.subframes));
-    if (pFlac->currentFLACFrame.header.blockSizeInPCMFrames > pFlac->maxBlockSizeInPCMFrames) {
-        return MA_ERROR;
-    }
-    channelCount = ma_dr_flac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
-    if (channelCount != (int)pFlac->channels) {
-        return MA_ERROR;
-    }
-    for (i = 0; i < channelCount; ++i) {
-        if (!ma_dr_flac__decode_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i, pFlac->pDecodedSamples + (pFlac->currentFLACFrame.header.blockSizeInPCMFrames * i))) {
-            return MA_ERROR;
-        }
-    }
-    paddingSizeInBits = (ma_uint8)(MA_DR_FLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7);
-    if (paddingSizeInBits > 0) {
-        ma_uint8 padding = 0;
-        if (!ma_dr_flac__read_uint8(&pFlac->bs, paddingSizeInBits, &padding)) {
-            return MA_AT_END;
-        }
-    }
-#ifndef MA_DR_FLAC_NO_CRC
-    actualCRC16 = ma_dr_flac__flush_crc16(&pFlac->bs);
-#endif
-    if (!ma_dr_flac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
-        return MA_AT_END;
-    }
-#ifndef MA_DR_FLAC_NO_CRC
-    if (actualCRC16 != desiredCRC16) {
-        return MA_CRC_MISMATCH;
-    }
-#endif
-    pFlac->currentFLACFrame.pcmFramesRemaining = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
-    return MA_SUCCESS;
-}
-static ma_result ma_dr_flac__seek_flac_frame(ma_dr_flac* pFlac)
-{
-    int channelCount;
-    int i;
-    ma_uint16 desiredCRC16;
-#ifndef MA_DR_FLAC_NO_CRC
-    ma_uint16 actualCRC16;
-#endif
-    channelCount = ma_dr_flac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
-    for (i = 0; i < channelCount; ++i) {
-        if (!ma_dr_flac__seek_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i)) {
-            return MA_ERROR;
-        }
-    }
-    if (!ma_dr_flac__seek_bits(&pFlac->bs, MA_DR_FLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7)) {
-        return MA_ERROR;
-    }
-#ifndef MA_DR_FLAC_NO_CRC
-    actualCRC16 = ma_dr_flac__flush_crc16(&pFlac->bs);
-#endif
-    if (!ma_dr_flac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
-        return MA_AT_END;
-    }
-#ifndef MA_DR_FLAC_NO_CRC
-    if (actualCRC16 != desiredCRC16) {
-        return MA_CRC_MISMATCH;
-    }
-#endif
-    return MA_SUCCESS;
-}
-static ma_bool32 ma_dr_flac__read_and_decode_next_flac_frame(ma_dr_flac* pFlac)
-{
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    for (;;) {
-        ma_result result;
-        if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-            return MA_FALSE;
-        }
-        result = ma_dr_flac__decode_flac_frame(pFlac);
-        if (result != MA_SUCCESS) {
-            if (result == MA_CRC_MISMATCH) {
-                continue;
-            } else {
-                return MA_FALSE;
-            }
-        }
-        return MA_TRUE;
-    }
-}
-static void ma_dr_flac__get_pcm_frame_range_of_current_flac_frame(ma_dr_flac* pFlac, ma_uint64* pFirstPCMFrame, ma_uint64* pLastPCMFrame)
-{
-    ma_uint64 firstPCMFrame;
-    ma_uint64 lastPCMFrame;
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    firstPCMFrame = pFlac->currentFLACFrame.header.pcmFrameNumber;
-    if (firstPCMFrame == 0) {
-        firstPCMFrame = ((ma_uint64)pFlac->currentFLACFrame.header.flacFrameNumber) * pFlac->maxBlockSizeInPCMFrames;
-    }
-    lastPCMFrame = firstPCMFrame + pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
-    if (lastPCMFrame > 0) {
-        lastPCMFrame -= 1;
-    }
-    if (pFirstPCMFrame) {
-        *pFirstPCMFrame = firstPCMFrame;
-    }
-    if (pLastPCMFrame) {
-        *pLastPCMFrame = lastPCMFrame;
-    }
-}
-static ma_bool32 ma_dr_flac__seek_to_first_frame(ma_dr_flac* pFlac)
-{
-    ma_bool32 result;
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    result = ma_dr_flac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes);
-    MA_DR_FLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
-    pFlac->currentPCMFrame = 0;
-    return result;
-}
-static MA_INLINE ma_result ma_dr_flac__seek_to_next_flac_frame(ma_dr_flac* pFlac)
-{
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    return ma_dr_flac__seek_flac_frame(pFlac);
-}
-static ma_uint64 ma_dr_flac__seek_forward_by_pcm_frames(ma_dr_flac* pFlac, ma_uint64 pcmFramesToSeek)
-{
-    ma_uint64 pcmFramesRead = 0;
-    while (pcmFramesToSeek > 0) {
-        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
-            if (!ma_dr_flac__read_and_decode_next_flac_frame(pFlac)) {
-                break;
-            }
-        } else {
-            if (pFlac->currentFLACFrame.pcmFramesRemaining > pcmFramesToSeek) {
-                pcmFramesRead   += pcmFramesToSeek;
-                pFlac->currentFLACFrame.pcmFramesRemaining -= (ma_uint32)pcmFramesToSeek;
-                pcmFramesToSeek  = 0;
-            } else {
-                pcmFramesRead   += pFlac->currentFLACFrame.pcmFramesRemaining;
-                pcmFramesToSeek -= pFlac->currentFLACFrame.pcmFramesRemaining;
-                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
-            }
-        }
-    }
-    pFlac->currentPCMFrame += pcmFramesRead;
-    return pcmFramesRead;
-}
-static ma_bool32 ma_dr_flac__seek_to_pcm_frame__brute_force(ma_dr_flac* pFlac, ma_uint64 pcmFrameIndex)
-{
-    ma_bool32 isMidFrame = MA_FALSE;
-    ma_uint64 runningPCMFrameCount;
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    if (pcmFrameIndex >= pFlac->currentPCMFrame) {
-        runningPCMFrameCount = pFlac->currentPCMFrame;
-        if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
-            if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-                return MA_FALSE;
-            }
-        } else {
-            isMidFrame = MA_TRUE;
-        }
-    } else {
-        runningPCMFrameCount = 0;
-        if (!ma_dr_flac__seek_to_first_frame(pFlac)) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-            return MA_FALSE;
-        }
-    }
-    for (;;) {
-        ma_uint64 pcmFrameCountInThisFLACFrame;
-        ma_uint64 firstPCMFrameInFLACFrame = 0;
-        ma_uint64 lastPCMFrameInFLACFrame = 0;
-        ma_dr_flac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
-        pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
-        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
-            ma_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;
-            if (!isMidFrame) {
-                ma_result result = ma_dr_flac__decode_flac_frame(pFlac);
-                if (result == MA_SUCCESS) {
-                    return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
-                } else {
-                    if (result == MA_CRC_MISMATCH) {
-                        goto next_iteration;
-                    } else {
-                        return MA_FALSE;
-                    }
-                }
-            } else {
-                return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
-            }
-        } else {
-            if (!isMidFrame) {
-                ma_result result = ma_dr_flac__seek_to_next_flac_frame(pFlac);
-                if (result == MA_SUCCESS) {
-                    runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
-                } else {
-                    if (result == MA_CRC_MISMATCH) {
-                        goto next_iteration;
-                    } else {
-                        return MA_FALSE;
-                    }
-                }
-            } else {
-                runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
-                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
-                isMidFrame = MA_FALSE;
-            }
-            if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
-                return MA_TRUE;
-            }
-        }
-    next_iteration:
-        if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-            return MA_FALSE;
-        }
-    }
-}
-#if !defined(MA_DR_FLAC_NO_CRC)
-#define MA_DR_FLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO 0.6f
-static ma_bool32 ma_dr_flac__seek_to_approximate_flac_frame_to_byte(ma_dr_flac* pFlac, ma_uint64 targetByte, ma_uint64 rangeLo, ma_uint64 rangeHi, ma_uint64* pLastSuccessfulSeekOffset)
-{
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    MA_DR_FLAC_ASSERT(pLastSuccessfulSeekOffset != NULL);
-    MA_DR_FLAC_ASSERT(targetByte >= rangeLo);
-    MA_DR_FLAC_ASSERT(targetByte <= rangeHi);
-    *pLastSuccessfulSeekOffset = pFlac->firstFLACFramePosInBytes;
-    for (;;) {
-        ma_uint64 lastTargetByte = targetByte;
-        if (!ma_dr_flac__seek_to_byte(&pFlac->bs, targetByte)) {
-            if (targetByte == 0) {
-                ma_dr_flac__seek_to_first_frame(pFlac);
-                return MA_FALSE;
-            }
-            targetByte = rangeLo + ((rangeHi - rangeLo)/2);
-            rangeHi = targetByte;
-        } else {
-            MA_DR_FLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
-#if 1
-            if (!ma_dr_flac__read_and_decode_next_flac_frame(pFlac)) {
-                targetByte = rangeLo + ((rangeHi - rangeLo)/2);
-                rangeHi = targetByte;
-            } else {
-                break;
-            }
-#else
-            if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-                targetByte = rangeLo + ((rangeHi - rangeLo)/2);
-                rangeHi = targetByte;
-            } else {
-                break;
-            }
-#endif
-        }
-        if(targetByte == lastTargetByte) {
-            return MA_FALSE;
-        }
-    }
-    ma_dr_flac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
-    MA_DR_FLAC_ASSERT(targetByte <= rangeHi);
-    *pLastSuccessfulSeekOffset = targetByte;
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__decode_flac_frame_and_seek_forward_by_pcm_frames(ma_dr_flac* pFlac, ma_uint64 offset)
-{
-#if 0
-    if (ma_dr_flac__decode_flac_frame(pFlac) != MA_SUCCESS) {
-        if (ma_dr_flac__read_and_decode_next_flac_frame(pFlac) == MA_FALSE) {
-            return MA_FALSE;
-        }
-    }
-#endif
-    return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, offset) == offset;
-}
-static ma_bool32 ma_dr_flac__seek_to_pcm_frame__binary_search_internal(ma_dr_flac* pFlac, ma_uint64 pcmFrameIndex, ma_uint64 byteRangeLo, ma_uint64 byteRangeHi)
-{
-    ma_uint64 targetByte;
-    ma_uint64 pcmRangeLo = pFlac->totalPCMFrameCount;
-    ma_uint64 pcmRangeHi = 0;
-    ma_uint64 lastSuccessfulSeekOffset = (ma_uint64)-1;
-    ma_uint64 closestSeekOffsetBeforeTargetPCMFrame = byteRangeLo;
-    ma_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
-    targetByte = byteRangeLo + (ma_uint64)(((ma_int64)((pcmFrameIndex - pFlac->currentPCMFrame) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * MA_DR_FLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO);
-    if (targetByte > byteRangeHi) {
-        targetByte = byteRangeHi;
-    }
-    for (;;) {
-        if (ma_dr_flac__seek_to_approximate_flac_frame_to_byte(pFlac, targetByte, byteRangeLo, byteRangeHi, &lastSuccessfulSeekOffset)) {
-            ma_uint64 newPCMRangeLo;
-            ma_uint64 newPCMRangeHi;
-            ma_dr_flac__get_pcm_frame_range_of_current_flac_frame(pFlac, &newPCMRangeLo, &newPCMRangeHi);
-            if (pcmRangeLo == newPCMRangeLo) {
-                if (!ma_dr_flac__seek_to_approximate_flac_frame_to_byte(pFlac, closestSeekOffsetBeforeTargetPCMFrame, closestSeekOffsetBeforeTargetPCMFrame, byteRangeHi, &lastSuccessfulSeekOffset)) {
-                    break;
-                }
-                if (ma_dr_flac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
-                    return MA_TRUE;
-                } else {
-                    break;
-                }
-            }
-            pcmRangeLo = newPCMRangeLo;
-            pcmRangeHi = newPCMRangeHi;
-            if (pcmRangeLo <= pcmFrameIndex && pcmRangeHi >= pcmFrameIndex) {
-                if (ma_dr_flac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame) ) {
-                    return MA_TRUE;
-                } else {
-                    break;
-                }
-            } else {
-                const float approxCompressionRatio = (ma_int64)(lastSuccessfulSeekOffset - pFlac->firstFLACFramePosInBytes) / ((ma_int64)(pcmRangeLo * pFlac->channels * pFlac->bitsPerSample)/8.0f);
-                if (pcmRangeLo > pcmFrameIndex) {
-                    byteRangeHi = lastSuccessfulSeekOffset;
-                    if (byteRangeLo > byteRangeHi) {
-                        byteRangeLo = byteRangeHi;
-                    }
-                    targetByte = byteRangeLo + ((byteRangeHi - byteRangeLo) / 2);
-                    if (targetByte < byteRangeLo) {
-                        targetByte = byteRangeLo;
-                    }
-                } else  {
-                    if ((pcmFrameIndex - pcmRangeLo) < seekForwardThreshold) {
-                        if (ma_dr_flac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
-                            return MA_TRUE;
-                        } else {
-                            break;
-                        }
-                    } else {
-                        byteRangeLo = lastSuccessfulSeekOffset;
-                        if (byteRangeHi < byteRangeLo) {
-                            byteRangeHi = byteRangeLo;
-                        }
-                        targetByte = lastSuccessfulSeekOffset + (ma_uint64)(((ma_int64)((pcmFrameIndex-pcmRangeLo) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * approxCompressionRatio);
-                        if (targetByte > byteRangeHi) {
-                            targetByte = byteRangeHi;
-                        }
-                        if (closestSeekOffsetBeforeTargetPCMFrame < lastSuccessfulSeekOffset) {
-                            closestSeekOffsetBeforeTargetPCMFrame = lastSuccessfulSeekOffset;
-                        }
-                    }
-                }
-            }
-        } else {
-            break;
-        }
-    }
-    ma_dr_flac__seek_to_first_frame(pFlac);
-    return MA_FALSE;
-}
-static ma_bool32 ma_dr_flac__seek_to_pcm_frame__binary_search(ma_dr_flac* pFlac, ma_uint64 pcmFrameIndex)
-{
-    ma_uint64 byteRangeLo;
-    ma_uint64 byteRangeHi;
-    ma_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
-    if (ma_dr_flac__seek_to_first_frame(pFlac) == MA_FALSE) {
-        return MA_FALSE;
-    }
-    if (pcmFrameIndex < seekForwardThreshold) {
-        return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, pcmFrameIndex) == pcmFrameIndex;
-    }
-    byteRangeLo = pFlac->firstFLACFramePosInBytes;
-    byteRangeHi = pFlac->firstFLACFramePosInBytes + (ma_uint64)((ma_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
-    return ma_dr_flac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi);
-}
-#endif
-static ma_bool32 ma_dr_flac__seek_to_pcm_frame__seek_table(ma_dr_flac* pFlac, ma_uint64 pcmFrameIndex)
-{
-    ma_uint32 iClosestSeekpoint = 0;
-    ma_bool32 isMidFrame = MA_FALSE;
-    ma_uint64 runningPCMFrameCount;
-    ma_uint32 iSeekpoint;
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    if (pFlac->pSeekpoints == NULL || pFlac->seekpointCount == 0) {
-        return MA_FALSE;
-    }
-    if (pFlac->pSeekpoints[0].firstPCMFrame > pcmFrameIndex) {
-        return MA_FALSE;
-    }
-    for (iSeekpoint = 0; iSeekpoint < pFlac->seekpointCount; ++iSeekpoint) {
-        if (pFlac->pSeekpoints[iSeekpoint].firstPCMFrame >= pcmFrameIndex) {
-            break;
-        }
-        iClosestSeekpoint = iSeekpoint;
-    }
-    if (pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount == 0 || pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount > pFlac->maxBlockSizeInPCMFrames) {
-        return MA_FALSE;
-    }
-    if (pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame > pFlac->totalPCMFrameCount && pFlac->totalPCMFrameCount > 0) {
-        return MA_FALSE;
-    }
-#if !defined(MA_DR_FLAC_NO_CRC)
-    if (pFlac->totalPCMFrameCount > 0) {
-        ma_uint64 byteRangeLo;
-        ma_uint64 byteRangeHi;
-        byteRangeHi = pFlac->firstFLACFramePosInBytes + (ma_uint64)((ma_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
-        byteRangeLo = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset;
-        if (iClosestSeekpoint < pFlac->seekpointCount-1) {
-            ma_uint32 iNextSeekpoint = iClosestSeekpoint + 1;
-            if (pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset >= pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset || pFlac->pSeekpoints[iNextSeekpoint].pcmFrameCount == 0) {
-                return MA_FALSE;
-            }
-            if (pFlac->pSeekpoints[iNextSeekpoint].firstPCMFrame != (((ma_uint64)0xFFFFFFFF << 32) | 0xFFFFFFFF)) {
-                byteRangeHi = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset - 1;
-            }
-        }
-        if (ma_dr_flac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
-            if (ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-                ma_dr_flac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
-                if (ma_dr_flac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi)) {
-                    return MA_TRUE;
-                }
-            }
-        }
-    }
-#endif
-    if (pcmFrameIndex >= pFlac->currentPCMFrame && pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame <= pFlac->currentPCMFrame) {
-        runningPCMFrameCount = pFlac->currentPCMFrame;
-        if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
-            if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-                return MA_FALSE;
-            }
-        } else {
-            isMidFrame = MA_TRUE;
-        }
-    } else {
-        runningPCMFrameCount = pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame;
-        if (!ma_dr_flac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-            return MA_FALSE;
-        }
-    }
-    for (;;) {
-        ma_uint64 pcmFrameCountInThisFLACFrame;
-        ma_uint64 firstPCMFrameInFLACFrame = 0;
-        ma_uint64 lastPCMFrameInFLACFrame = 0;
-        ma_dr_flac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
-        pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
-        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
-            ma_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;
-            if (!isMidFrame) {
-                ma_result result = ma_dr_flac__decode_flac_frame(pFlac);
-                if (result == MA_SUCCESS) {
-                    return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
-                } else {
-                    if (result == MA_CRC_MISMATCH) {
-                        goto next_iteration;
-                    } else {
-                        return MA_FALSE;
-                    }
-                }
-            } else {
-                return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
-            }
-        } else {
-            if (!isMidFrame) {
-                ma_result result = ma_dr_flac__seek_to_next_flac_frame(pFlac);
-                if (result == MA_SUCCESS) {
-                    runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
-                } else {
-                    if (result == MA_CRC_MISMATCH) {
-                        goto next_iteration;
-                    } else {
-                        return MA_FALSE;
-                    }
-                }
-            } else {
-                runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
-                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
-                isMidFrame = MA_FALSE;
-            }
-            if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
-                return MA_TRUE;
-            }
-        }
-    next_iteration:
-        if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-            return MA_FALSE;
-        }
-    }
-}
-#ifndef MA_DR_FLAC_NO_OGG
-typedef struct
-{
-    ma_uint8 capturePattern[4];
-    ma_uint8 structureVersion;
-    ma_uint8 headerType;
-    ma_uint64 granulePosition;
-    ma_uint32 serialNumber;
-    ma_uint32 sequenceNumber;
-    ma_uint32 checksum;
-    ma_uint8 segmentCount;
-    ma_uint8 segmentTable[255];
-} ma_dr_flac_ogg_page_header;
-#endif
-typedef struct
-{
-    ma_dr_flac_read_proc onRead;
-    ma_dr_flac_seek_proc onSeek;
-    ma_dr_flac_meta_proc onMeta;
-    ma_dr_flac_container container;
-    void* pUserData;
-    void* pUserDataMD;
-    ma_uint32 sampleRate;
-    ma_uint8  channels;
-    ma_uint8  bitsPerSample;
-    ma_uint64 totalPCMFrameCount;
-    ma_uint16 maxBlockSizeInPCMFrames;
-    ma_uint64 runningFilePos;
-    ma_bool32 hasStreamInfoBlock;
-    ma_bool32 hasMetadataBlocks;
-    ma_dr_flac_bs bs;
-    ma_dr_flac_frame_header firstFrameHeader;
-#ifndef MA_DR_FLAC_NO_OGG
-    ma_uint32 oggSerial;
-    ma_uint64 oggFirstBytePos;
-    ma_dr_flac_ogg_page_header oggBosHeader;
-#endif
-} ma_dr_flac_init_info;
-static MA_INLINE void ma_dr_flac__decode_block_header(ma_uint32 blockHeader, ma_uint8* isLastBlock, ma_uint8* blockType, ma_uint32* blockSize)
-{
-    blockHeader = ma_dr_flac__be2host_32(blockHeader);
-    *isLastBlock = (ma_uint8)((blockHeader & 0x80000000UL) >> 31);
-    *blockType   = (ma_uint8)((blockHeader & 0x7F000000UL) >> 24);
-    *blockSize   =                (blockHeader & 0x00FFFFFFUL);
-}
-static MA_INLINE ma_bool32 ma_dr_flac__read_and_decode_block_header(ma_dr_flac_read_proc onRead, void* pUserData, ma_uint8* isLastBlock, ma_uint8* blockType, ma_uint32* blockSize)
-{
-    ma_uint32 blockHeader;
-    *blockSize = 0;
-    if (onRead(pUserData, &blockHeader, 4) != 4) {
-        return MA_FALSE;
-    }
-    ma_dr_flac__decode_block_header(blockHeader, isLastBlock, blockType, blockSize);
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__read_streaminfo(ma_dr_flac_read_proc onRead, void* pUserData, ma_dr_flac_streaminfo* pStreamInfo)
-{
-    ma_uint32 blockSizes;
-    ma_uint64 frameSizes = 0;
-    ma_uint64 importantProps;
-    ma_uint8 md5[16];
-    if (onRead(pUserData, &blockSizes, 4) != 4) {
-        return MA_FALSE;
-    }
-    if (onRead(pUserData, &frameSizes, 6) != 6) {
-        return MA_FALSE;
-    }
-    if (onRead(pUserData, &importantProps, 8) != 8) {
-        return MA_FALSE;
-    }
-    if (onRead(pUserData, md5, sizeof(md5)) != sizeof(md5)) {
-        return MA_FALSE;
-    }
-    blockSizes     = ma_dr_flac__be2host_32(blockSizes);
-    frameSizes     = ma_dr_flac__be2host_64(frameSizes);
-    importantProps = ma_dr_flac__be2host_64(importantProps);
-    pStreamInfo->minBlockSizeInPCMFrames = (ma_uint16)((blockSizes & 0xFFFF0000) >> 16);
-    pStreamInfo->maxBlockSizeInPCMFrames = (ma_uint16) (blockSizes & 0x0000FFFF);
-    pStreamInfo->minFrameSizeInPCMFrames = (ma_uint32)((frameSizes     &  (((ma_uint64)0x00FFFFFF << 16) << 24)) >> 40);
-    pStreamInfo->maxFrameSizeInPCMFrames = (ma_uint32)((frameSizes     &  (((ma_uint64)0x00FFFFFF << 16) <<  0)) >> 16);
-    pStreamInfo->sampleRate              = (ma_uint32)((importantProps &  (((ma_uint64)0x000FFFFF << 16) << 28)) >> 44);
-    pStreamInfo->channels                = (ma_uint8 )((importantProps &  (((ma_uint64)0x0000000E << 16) << 24)) >> 41) + 1;
-    pStreamInfo->bitsPerSample           = (ma_uint8 )((importantProps &  (((ma_uint64)0x0000001F << 16) << 20)) >> 36) + 1;
-    pStreamInfo->totalPCMFrameCount      =                ((importantProps & ((((ma_uint64)0x0000000F << 16) << 16) | 0xFFFFFFFF)));
-    MA_DR_FLAC_COPY_MEMORY(pStreamInfo->md5, md5, sizeof(md5));
-    return MA_TRUE;
-}
-static void* ma_dr_flac__malloc_default(size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_DR_FLAC_MALLOC(sz);
-}
-static void* ma_dr_flac__realloc_default(void* p, size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_DR_FLAC_REALLOC(p, sz);
-}
-static void ma_dr_flac__free_default(void* p, void* pUserData)
-{
-    (void)pUserData;
-    MA_DR_FLAC_FREE(p);
-}
-static void* ma_dr_flac__malloc_from_callbacks(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-    if (pAllocationCallbacks->onMalloc != NULL) {
-        return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
-    }
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
-    }
-    return NULL;
-}
-static void* ma_dr_flac__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
-    }
-    if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
-        void* p2;
-        p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
-        if (p2 == NULL) {
-            return NULL;
-        }
-        if (p != NULL) {
-            MA_DR_FLAC_COPY_MEMORY(p2, p, szOld);
-            pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-        }
-        return p2;
-    }
-    return NULL;
-}
-static void ma_dr_flac__free_from_callbacks(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (p == NULL || pAllocationCallbacks == NULL) {
-        return;
-    }
-    if (pAllocationCallbacks->onFree != NULL) {
-        pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-    }
-}
-static ma_bool32 ma_dr_flac__read_and_decode_metadata(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, void* pUserData, void* pUserDataMD, ma_uint64* pFirstFramePos, ma_uint64* pSeektablePos, ma_uint32* pSeekpointCount, ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_uint64 runningFilePos = 42;
-    ma_uint64 seektablePos   = 0;
-    ma_uint32 seektableSize  = 0;
-    for (;;) {
-        ma_dr_flac_metadata metadata;
-        ma_uint8 isLastBlock = 0;
-        ma_uint8 blockType = 0;
-        ma_uint32 blockSize;
-        if (ma_dr_flac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize) == MA_FALSE) {
-            return MA_FALSE;
-        }
-        runningFilePos += 4;
-        metadata.type = blockType;
-        metadata.pRawData = NULL;
-        metadata.rawDataSize = 0;
-        switch (blockType)
-        {
-            case MA_DR_FLAC_METADATA_BLOCK_TYPE_APPLICATION:
-            {
-                if (blockSize < 4) {
-                    return MA_FALSE;
-                }
-                if (onMeta) {
-                    void* pRawData = ma_dr_flac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
-                    if (pRawData == NULL) {
-                        return MA_FALSE;
-                    }
-                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.pRawData = pRawData;
-                    metadata.rawDataSize = blockSize;
-                    metadata.data.application.id       = ma_dr_flac__be2host_32(*(ma_uint32*)pRawData);
-                    metadata.data.application.pData    = (const void*)((ma_uint8*)pRawData + sizeof(ma_uint32));
-                    metadata.data.application.dataSize = blockSize - sizeof(ma_uint32);
-                    onMeta(pUserDataMD, &metadata);
-                    ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                }
-            } break;
-            case MA_DR_FLAC_METADATA_BLOCK_TYPE_SEEKTABLE:
-            {
-                seektablePos  = runningFilePos;
-                seektableSize = blockSize;
-                if (onMeta) {
-                    ma_uint32 seekpointCount;
-                    ma_uint32 iSeekpoint;
-                    void* pRawData;
-                    seekpointCount = blockSize/MA_DR_FLAC_SEEKPOINT_SIZE_IN_BYTES;
-                    pRawData = ma_dr_flac__malloc_from_callbacks(seekpointCount * sizeof(ma_dr_flac_seekpoint), pAllocationCallbacks);
-                    if (pRawData == NULL) {
-                        return MA_FALSE;
-                    }
-                    for (iSeekpoint = 0; iSeekpoint < seekpointCount; ++iSeekpoint) {
-                        ma_dr_flac_seekpoint* pSeekpoint = (ma_dr_flac_seekpoint*)pRawData + iSeekpoint;
-                        if (onRead(pUserData, pSeekpoint, MA_DR_FLAC_SEEKPOINT_SIZE_IN_BYTES) != MA_DR_FLAC_SEEKPOINT_SIZE_IN_BYTES) {
-                            ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                            return MA_FALSE;
-                        }
-                        pSeekpoint->firstPCMFrame   = ma_dr_flac__be2host_64(pSeekpoint->firstPCMFrame);
-                        pSeekpoint->flacFrameOffset = ma_dr_flac__be2host_64(pSeekpoint->flacFrameOffset);
-                        pSeekpoint->pcmFrameCount   = ma_dr_flac__be2host_16(pSeekpoint->pcmFrameCount);
-                    }
-                    metadata.pRawData = pRawData;
-                    metadata.rawDataSize = blockSize;
-                    metadata.data.seektable.seekpointCount = seekpointCount;
-                    metadata.data.seektable.pSeekpoints = (const ma_dr_flac_seekpoint*)pRawData;
-                    onMeta(pUserDataMD, &metadata);
-                    ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                }
-            } break;
-            case MA_DR_FLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT:
-            {
-                if (blockSize < 8) {
-                    return MA_FALSE;
-                }
-                if (onMeta) {
-                    void* pRawData;
-                    const char* pRunningData;
-                    const char* pRunningDataEnd;
-                    ma_uint32 i;
-                    pRawData = ma_dr_flac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
-                    if (pRawData == NULL) {
-                        return MA_FALSE;
-                    }
-                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.pRawData = pRawData;
-                    metadata.rawDataSize = blockSize;
-                    pRunningData    = (const char*)pRawData;
-                    pRunningDataEnd = (const char*)pRawData + blockSize;
-                    metadata.data.vorbis_comment.vendorLength = ma_dr_flac__le2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    if ((pRunningDataEnd - pRunningData) - 4 < (ma_int64)metadata.data.vorbis_comment.vendorLength) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.data.vorbis_comment.vendor       = pRunningData;                                            pRunningData += metadata.data.vorbis_comment.vendorLength;
-                    metadata.data.vorbis_comment.commentCount = ma_dr_flac__le2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    if ((pRunningDataEnd - pRunningData) / sizeof(ma_uint32) < metadata.data.vorbis_comment.commentCount) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.data.vorbis_comment.pComments    = pRunningData;
-                    for (i = 0; i < metadata.data.vorbis_comment.commentCount; ++i) {
-                        ma_uint32 commentLength;
-                        if (pRunningDataEnd - pRunningData < 4) {
-                            ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                            return MA_FALSE;
-                        }
-                        commentLength = ma_dr_flac__le2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                        if (pRunningDataEnd - pRunningData < (ma_int64)commentLength) {
-                            ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                            return MA_FALSE;
-                        }
-                        pRunningData += commentLength;
-                    }
-                    onMeta(pUserDataMD, &metadata);
-                    ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                }
-            } break;
-            case MA_DR_FLAC_METADATA_BLOCK_TYPE_CUESHEET:
-            {
-                if (blockSize < 396) {
-                    return MA_FALSE;
-                }
-                if (onMeta) {
-                    void* pRawData;
-                    const char* pRunningData;
-                    const char* pRunningDataEnd;
-                    size_t bufferSize;
-                    ma_uint8 iTrack;
-                    ma_uint8 iIndex;
-                    void* pTrackData;
-                    pRawData = ma_dr_flac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
-                    if (pRawData == NULL) {
-                        return MA_FALSE;
-                    }
-                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.pRawData = pRawData;
-                    metadata.rawDataSize = blockSize;
-                    pRunningData    = (const char*)pRawData;
-                    pRunningDataEnd = (const char*)pRawData + blockSize;
-                    MA_DR_FLAC_COPY_MEMORY(metadata.data.cuesheet.catalog, pRunningData, 128);                              pRunningData += 128;
-                    metadata.data.cuesheet.leadInSampleCount = ma_dr_flac__be2host_64(*(const ma_uint64*)pRunningData); pRunningData += 8;
-                    metadata.data.cuesheet.isCD              = (pRunningData[0] & 0x80) != 0;                           pRunningData += 259;
-                    metadata.data.cuesheet.trackCount        = pRunningData[0];                                         pRunningData += 1;
-                    metadata.data.cuesheet.pTrackData        = NULL;
-                    {
-                        const char* pRunningDataSaved = pRunningData;
-                        bufferSize = metadata.data.cuesheet.trackCount * MA_DR_FLAC_CUESHEET_TRACK_SIZE_IN_BYTES;
-                        for (iTrack = 0; iTrack < metadata.data.cuesheet.trackCount; ++iTrack) {
-                            ma_uint8 indexCount;
-                            ma_uint32 indexPointSize;
-                            if (pRunningDataEnd - pRunningData < MA_DR_FLAC_CUESHEET_TRACK_SIZE_IN_BYTES) {
-                                ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                                return MA_FALSE;
-                            }
-                            pRunningData += 35;
-                            indexCount = pRunningData[0];
-                            pRunningData += 1;
-                            bufferSize += indexCount * sizeof(ma_dr_flac_cuesheet_track_index);
-                            indexPointSize = indexCount * MA_DR_FLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES;
-                            if (pRunningDataEnd - pRunningData < (ma_int64)indexPointSize) {
-                                ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                                return MA_FALSE;
-                            }
-                            pRunningData += indexPointSize;
-                        }
-                        pRunningData = pRunningDataSaved;
-                    }
-                    {
-                        char* pRunningTrackData;
-                        pTrackData = ma_dr_flac__malloc_from_callbacks(bufferSize, pAllocationCallbacks);
-                        if (pTrackData == NULL) {
-                            ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                            return MA_FALSE;
-                        }
-                        pRunningTrackData = (char*)pTrackData;
-                        for (iTrack = 0; iTrack < metadata.data.cuesheet.trackCount; ++iTrack) {
-                            ma_uint8 indexCount;
-                            MA_DR_FLAC_COPY_MEMORY(pRunningTrackData, pRunningData, MA_DR_FLAC_CUESHEET_TRACK_SIZE_IN_BYTES);
-                            pRunningData      += MA_DR_FLAC_CUESHEET_TRACK_SIZE_IN_BYTES-1;
-                            pRunningTrackData += MA_DR_FLAC_CUESHEET_TRACK_SIZE_IN_BYTES-1;
-                            indexCount = pRunningData[0];
-                            pRunningData      += 1;
-                            pRunningTrackData += 1;
-                            for (iIndex = 0; iIndex < indexCount; ++iIndex) {
-                                ma_dr_flac_cuesheet_track_index* pTrackIndex = (ma_dr_flac_cuesheet_track_index*)pRunningTrackData;
-                                MA_DR_FLAC_COPY_MEMORY(pRunningTrackData, pRunningData, MA_DR_FLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES);
-                                pRunningData      += MA_DR_FLAC_CUESHEET_TRACK_INDEX_SIZE_IN_BYTES;
-                                pRunningTrackData += sizeof(ma_dr_flac_cuesheet_track_index);
-                                pTrackIndex->offset = ma_dr_flac__be2host_64(pTrackIndex->offset);
-                            }
-                        }
-                        metadata.data.cuesheet.pTrackData = pTrackData;
-                    }
-                    ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                    pRawData = NULL;
-                    onMeta(pUserDataMD, &metadata);
-                    ma_dr_flac__free_from_callbacks(pTrackData, pAllocationCallbacks);
-                    pTrackData = NULL;
-                }
-            } break;
-            case MA_DR_FLAC_METADATA_BLOCK_TYPE_PICTURE:
-            {
-                if (blockSize < 32) {
-                    return MA_FALSE;
-                }
-                if (onMeta) {
-                    void* pRawData;
-                    const char* pRunningData;
-                    const char* pRunningDataEnd;
-                    pRawData = ma_dr_flac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
-                    if (pRawData == NULL) {
-                        return MA_FALSE;
-                    }
-                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.pRawData = pRawData;
-                    metadata.rawDataSize = blockSize;
-                    pRunningData    = (const char*)pRawData;
-                    pRunningDataEnd = (const char*)pRawData + blockSize;
-                    metadata.data.picture.type       = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    metadata.data.picture.mimeLength = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    if ((pRunningDataEnd - pRunningData) - 24 < (ma_int64)metadata.data.picture.mimeLength) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.data.picture.mime              = pRunningData;                                   pRunningData += metadata.data.picture.mimeLength;
-                    metadata.data.picture.descriptionLength = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    if ((pRunningDataEnd - pRunningData) - 20 < (ma_int64)metadata.data.picture.descriptionLength) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.data.picture.description     = pRunningData;                                   pRunningData += metadata.data.picture.descriptionLength;
-                    metadata.data.picture.width           = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    metadata.data.picture.height          = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    metadata.data.picture.colorDepth      = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    metadata.data.picture.indexColorCount = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    metadata.data.picture.pictureDataSize = ma_dr_flac__be2host_32_ptr_unaligned(pRunningData); pRunningData += 4;
-                    metadata.data.picture.pPictureData    = (const ma_uint8*)pRunningData;
-                    if (pRunningDataEnd - pRunningData < (ma_int64)metadata.data.picture.pictureDataSize) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    onMeta(pUserDataMD, &metadata);
-                    ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                }
-            } break;
-            case MA_DR_FLAC_METADATA_BLOCK_TYPE_PADDING:
-            {
-                if (onMeta) {
-                    metadata.data.padding.unused = 0;
-                    if (!onSeek(pUserData, blockSize, ma_dr_flac_seek_origin_current)) {
-                        isLastBlock = MA_TRUE;
-                    } else {
-                        onMeta(pUserDataMD, &metadata);
-                    }
-                }
-            } break;
-            case MA_DR_FLAC_METADATA_BLOCK_TYPE_INVALID:
-            {
-                if (onMeta) {
-                    if (!onSeek(pUserData, blockSize, ma_dr_flac_seek_origin_current)) {
-                        isLastBlock = MA_TRUE;
-                    }
-                }
-            } break;
-            default:
-            {
-                if (onMeta) {
-                    void* pRawData = ma_dr_flac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
-                    if (pRawData == NULL) {
-                        return MA_FALSE;
-                    }
-                    if (onRead(pUserData, pRawData, blockSize) != blockSize) {
-                        ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                        return MA_FALSE;
-                    }
-                    metadata.pRawData = pRawData;
-                    metadata.rawDataSize = blockSize;
-                    onMeta(pUserDataMD, &metadata);
-                    ma_dr_flac__free_from_callbacks(pRawData, pAllocationCallbacks);
-                }
-            } break;
-        }
-        if (onMeta == NULL && blockSize > 0) {
-            if (!onSeek(pUserData, blockSize, ma_dr_flac_seek_origin_current)) {
-                isLastBlock = MA_TRUE;
-            }
-        }
-        runningFilePos += blockSize;
-        if (isLastBlock) {
-            break;
-        }
-    }
-    *pSeektablePos   = seektablePos;
-    *pSeekpointCount = seektableSize / MA_DR_FLAC_SEEKPOINT_SIZE_IN_BYTES;
-    *pFirstFramePos  = runningFilePos;
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac__init_private__native(ma_dr_flac_init_info* pInit, ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, void* pUserData, void* pUserDataMD, ma_bool32 relaxed)
-{
-    ma_uint8 isLastBlock;
-    ma_uint8 blockType;
-    ma_uint32 blockSize;
-    (void)onSeek;
-    pInit->container = ma_dr_flac_container_native;
-    if (!ma_dr_flac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
-        return MA_FALSE;
-    }
-    if (blockType != MA_DR_FLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
-        if (!relaxed) {
-            return MA_FALSE;
-        } else {
-            pInit->hasStreamInfoBlock = MA_FALSE;
-            pInit->hasMetadataBlocks  = MA_FALSE;
-            if (!ma_dr_flac__read_next_flac_frame_header(&pInit->bs, 0, &pInit->firstFrameHeader)) {
-                return MA_FALSE;
-            }
-            if (pInit->firstFrameHeader.bitsPerSample == 0) {
-                return MA_FALSE;
-            }
-            pInit->sampleRate              = pInit->firstFrameHeader.sampleRate;
-            pInit->channels                = ma_dr_flac__get_channel_count_from_channel_assignment(pInit->firstFrameHeader.channelAssignment);
-            pInit->bitsPerSample           = pInit->firstFrameHeader.bitsPerSample;
-            pInit->maxBlockSizeInPCMFrames = 65535;
-            return MA_TRUE;
-        }
-    } else {
-        ma_dr_flac_streaminfo streaminfo;
-        if (!ma_dr_flac__read_streaminfo(onRead, pUserData, &streaminfo)) {
-            return MA_FALSE;
-        }
-        pInit->hasStreamInfoBlock      = MA_TRUE;
-        pInit->sampleRate              = streaminfo.sampleRate;
-        pInit->channels                = streaminfo.channels;
-        pInit->bitsPerSample           = streaminfo.bitsPerSample;
-        pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
-        pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;
-        pInit->hasMetadataBlocks       = !isLastBlock;
-        if (onMeta) {
-            ma_dr_flac_metadata metadata;
-            metadata.type = MA_DR_FLAC_METADATA_BLOCK_TYPE_STREAMINFO;
-            metadata.pRawData = NULL;
-            metadata.rawDataSize = 0;
-            metadata.data.streaminfo = streaminfo;
-            onMeta(pUserDataMD, &metadata);
-        }
-        return MA_TRUE;
-    }
-}
-#ifndef MA_DR_FLAC_NO_OGG
-#define MA_DR_FLAC_OGG_MAX_PAGE_SIZE            65307
-#define MA_DR_FLAC_OGG_CAPTURE_PATTERN_CRC32    1605413199
-typedef enum
-{
-    ma_dr_flac_ogg_recover_on_crc_mismatch,
-    ma_dr_flac_ogg_fail_on_crc_mismatch
-} ma_dr_flac_ogg_crc_mismatch_recovery;
-#ifndef MA_DR_FLAC_NO_CRC
-static ma_uint32 ma_dr_flac__crc32_table[] = {
-    0x00000000L, 0x04C11DB7L, 0x09823B6EL, 0x0D4326D9L,
-    0x130476DCL, 0x17C56B6BL, 0x1A864DB2L, 0x1E475005L,
-    0x2608EDB8L, 0x22C9F00FL, 0x2F8AD6D6L, 0x2B4BCB61L,
-    0x350C9B64L, 0x31CD86D3L, 0x3C8EA00AL, 0x384FBDBDL,
-    0x4C11DB70L, 0x48D0C6C7L, 0x4593E01EL, 0x4152FDA9L,
-    0x5F15ADACL, 0x5BD4B01BL, 0x569796C2L, 0x52568B75L,
-    0x6A1936C8L, 0x6ED82B7FL, 0x639B0DA6L, 0x675A1011L,
-    0x791D4014L, 0x7DDC5DA3L, 0x709F7B7AL, 0x745E66CDL,
-    0x9823B6E0L, 0x9CE2AB57L, 0x91A18D8EL, 0x95609039L,
-    0x8B27C03CL, 0x8FE6DD8BL, 0x82A5FB52L, 0x8664E6E5L,
-    0xBE2B5B58L, 0xBAEA46EFL, 0xB7A96036L, 0xB3687D81L,
-    0xAD2F2D84L, 0xA9EE3033L, 0xA4AD16EAL, 0xA06C0B5DL,
-    0xD4326D90L, 0xD0F37027L, 0xDDB056FEL, 0xD9714B49L,
-    0xC7361B4CL, 0xC3F706FBL, 0xCEB42022L, 0xCA753D95L,
-    0xF23A8028L, 0xF6FB9D9FL, 0xFBB8BB46L, 0xFF79A6F1L,
-    0xE13EF6F4L, 0xE5FFEB43L, 0xE8BCCD9AL, 0xEC7DD02DL,
-    0x34867077L, 0x30476DC0L, 0x3D044B19L, 0x39C556AEL,
-    0x278206ABL, 0x23431B1CL, 0x2E003DC5L, 0x2AC12072L,
-    0x128E9DCFL, 0x164F8078L, 0x1B0CA6A1L, 0x1FCDBB16L,
-    0x018AEB13L, 0x054BF6A4L, 0x0808D07DL, 0x0CC9CDCAL,
-    0x7897AB07L, 0x7C56B6B0L, 0x71159069L, 0x75D48DDEL,
-    0x6B93DDDBL, 0x6F52C06CL, 0x6211E6B5L, 0x66D0FB02L,
-    0x5E9F46BFL, 0x5A5E5B08L, 0x571D7DD1L, 0x53DC6066L,
-    0x4D9B3063L, 0x495A2DD4L, 0x44190B0DL, 0x40D816BAL,
-    0xACA5C697L, 0xA864DB20L, 0xA527FDF9L, 0xA1E6E04EL,
-    0xBFA1B04BL, 0xBB60ADFCL, 0xB6238B25L, 0xB2E29692L,
-    0x8AAD2B2FL, 0x8E6C3698L, 0x832F1041L, 0x87EE0DF6L,
-    0x99A95DF3L, 0x9D684044L, 0x902B669DL, 0x94EA7B2AL,
-    0xE0B41DE7L, 0xE4750050L, 0xE9362689L, 0xEDF73B3EL,
-    0xF3B06B3BL, 0xF771768CL, 0xFA325055L, 0xFEF34DE2L,
-    0xC6BCF05FL, 0xC27DEDE8L, 0xCF3ECB31L, 0xCBFFD686L,
-    0xD5B88683L, 0xD1799B34L, 0xDC3ABDEDL, 0xD8FBA05AL,
-    0x690CE0EEL, 0x6DCDFD59L, 0x608EDB80L, 0x644FC637L,
-    0x7A089632L, 0x7EC98B85L, 0x738AAD5CL, 0x774BB0EBL,
-    0x4F040D56L, 0x4BC510E1L, 0x46863638L, 0x42472B8FL,
-    0x5C007B8AL, 0x58C1663DL, 0x558240E4L, 0x51435D53L,
-    0x251D3B9EL, 0x21DC2629L, 0x2C9F00F0L, 0x285E1D47L,
-    0x36194D42L, 0x32D850F5L, 0x3F9B762CL, 0x3B5A6B9BL,
-    0x0315D626L, 0x07D4CB91L, 0x0A97ED48L, 0x0E56F0FFL,
-    0x1011A0FAL, 0x14D0BD4DL, 0x19939B94L, 0x1D528623L,
-    0xF12F560EL, 0xF5EE4BB9L, 0xF8AD6D60L, 0xFC6C70D7L,
-    0xE22B20D2L, 0xE6EA3D65L, 0xEBA91BBCL, 0xEF68060BL,
-    0xD727BBB6L, 0xD3E6A601L, 0xDEA580D8L, 0xDA649D6FL,
-    0xC423CD6AL, 0xC0E2D0DDL, 0xCDA1F604L, 0xC960EBB3L,
-    0xBD3E8D7EL, 0xB9FF90C9L, 0xB4BCB610L, 0xB07DABA7L,
-    0xAE3AFBA2L, 0xAAFBE615L, 0xA7B8C0CCL, 0xA379DD7BL,
-    0x9B3660C6L, 0x9FF77D71L, 0x92B45BA8L, 0x9675461FL,
-    0x8832161AL, 0x8CF30BADL, 0x81B02D74L, 0x857130C3L,
-    0x5D8A9099L, 0x594B8D2EL, 0x5408ABF7L, 0x50C9B640L,
-    0x4E8EE645L, 0x4A4FFBF2L, 0x470CDD2BL, 0x43CDC09CL,
-    0x7B827D21L, 0x7F436096L, 0x7200464FL, 0x76C15BF8L,
-    0x68860BFDL, 0x6C47164AL, 0x61043093L, 0x65C52D24L,
-    0x119B4BE9L, 0x155A565EL, 0x18197087L, 0x1CD86D30L,
-    0x029F3D35L, 0x065E2082L, 0x0B1D065BL, 0x0FDC1BECL,
-    0x3793A651L, 0x3352BBE6L, 0x3E119D3FL, 0x3AD08088L,
-    0x2497D08DL, 0x2056CD3AL, 0x2D15EBE3L, 0x29D4F654L,
-    0xC5A92679L, 0xC1683BCEL, 0xCC2B1D17L, 0xC8EA00A0L,
-    0xD6AD50A5L, 0xD26C4D12L, 0xDF2F6BCBL, 0xDBEE767CL,
-    0xE3A1CBC1L, 0xE760D676L, 0xEA23F0AFL, 0xEEE2ED18L,
-    0xF0A5BD1DL, 0xF464A0AAL, 0xF9278673L, 0xFDE69BC4L,
-    0x89B8FD09L, 0x8D79E0BEL, 0x803AC667L, 0x84FBDBD0L,
-    0x9ABC8BD5L, 0x9E7D9662L, 0x933EB0BBL, 0x97FFAD0CL,
-    0xAFB010B1L, 0xAB710D06L, 0xA6322BDFL, 0xA2F33668L,
-    0xBCB4666DL, 0xB8757BDAL, 0xB5365D03L, 0xB1F740B4L
-};
-#endif
-static MA_INLINE ma_uint32 ma_dr_flac_crc32_byte(ma_uint32 crc32, ma_uint8 data)
-{
-#ifndef MA_DR_FLAC_NO_CRC
-    return (crc32 << 8) ^ ma_dr_flac__crc32_table[(ma_uint8)((crc32 >> 24) & 0xFF) ^ data];
-#else
-    (void)data;
-    return crc32;
-#endif
-}
-#if 0
-static MA_INLINE ma_uint32 ma_dr_flac_crc32_uint32(ma_uint32 crc32, ma_uint32 data)
-{
-    crc32 = ma_dr_flac_crc32_byte(crc32, (ma_uint8)((data >> 24) & 0xFF));
-    crc32 = ma_dr_flac_crc32_byte(crc32, (ma_uint8)((data >> 16) & 0xFF));
-    crc32 = ma_dr_flac_crc32_byte(crc32, (ma_uint8)((data >>  8) & 0xFF));
-    crc32 = ma_dr_flac_crc32_byte(crc32, (ma_uint8)((data >>  0) & 0xFF));
-    return crc32;
-}
-static MA_INLINE ma_uint32 ma_dr_flac_crc32_uint64(ma_uint32 crc32, ma_uint64 data)
-{
-    crc32 = ma_dr_flac_crc32_uint32(crc32, (ma_uint32)((data >> 32) & 0xFFFFFFFF));
-    crc32 = ma_dr_flac_crc32_uint32(crc32, (ma_uint32)((data >>  0) & 0xFFFFFFFF));
-    return crc32;
-}
-#endif
-static MA_INLINE ma_uint32 ma_dr_flac_crc32_buffer(ma_uint32 crc32, ma_uint8* pData, ma_uint32 dataSize)
-{
-    ma_uint32 i;
-    for (i = 0; i < dataSize; ++i) {
-        crc32 = ma_dr_flac_crc32_byte(crc32, pData[i]);
-    }
-    return crc32;
-}
-static MA_INLINE ma_bool32 ma_dr_flac_ogg__is_capture_pattern(ma_uint8 pattern[4])
-{
-    return pattern[0] == 'O' && pattern[1] == 'g' && pattern[2] == 'g' && pattern[3] == 'S';
-}
-static MA_INLINE ma_uint32 ma_dr_flac_ogg__get_page_header_size(ma_dr_flac_ogg_page_header* pHeader)
-{
-    return 27 + pHeader->segmentCount;
-}
-static MA_INLINE ma_uint32 ma_dr_flac_ogg__get_page_body_size(ma_dr_flac_ogg_page_header* pHeader)
-{
-    ma_uint32 pageBodySize = 0;
-    int i;
-    for (i = 0; i < pHeader->segmentCount; ++i) {
-        pageBodySize += pHeader->segmentTable[i];
-    }
-    return pageBodySize;
-}
-static ma_result ma_dr_flac_ogg__read_page_header_after_capture_pattern(ma_dr_flac_read_proc onRead, void* pUserData, ma_dr_flac_ogg_page_header* pHeader, ma_uint32* pBytesRead, ma_uint32* pCRC32)
-{
-    ma_uint8 data[23];
-    ma_uint32 i;
-    MA_DR_FLAC_ASSERT(*pCRC32 == MA_DR_FLAC_OGG_CAPTURE_PATTERN_CRC32);
-    if (onRead(pUserData, data, 23) != 23) {
-        return MA_AT_END;
-    }
-    *pBytesRead += 23;
-    pHeader->capturePattern[0] = 'O';
-    pHeader->capturePattern[1] = 'g';
-    pHeader->capturePattern[2] = 'g';
-    pHeader->capturePattern[3] = 'S';
-    pHeader->structureVersion = data[0];
-    pHeader->headerType       = data[1];
-    MA_DR_FLAC_COPY_MEMORY(&pHeader->granulePosition, &data[ 2], 8);
-    MA_DR_FLAC_COPY_MEMORY(&pHeader->serialNumber,    &data[10], 4);
-    MA_DR_FLAC_COPY_MEMORY(&pHeader->sequenceNumber,  &data[14], 4);
-    MA_DR_FLAC_COPY_MEMORY(&pHeader->checksum,        &data[18], 4);
-    pHeader->segmentCount     = data[22];
-    data[18] = 0;
-    data[19] = 0;
-    data[20] = 0;
-    data[21] = 0;
-    for (i = 0; i < 23; ++i) {
-        *pCRC32 = ma_dr_flac_crc32_byte(*pCRC32, data[i]);
-    }
-    if (onRead(pUserData, pHeader->segmentTable, pHeader->segmentCount) != pHeader->segmentCount) {
-        return MA_AT_END;
-    }
-    *pBytesRead += pHeader->segmentCount;
-    for (i = 0; i < pHeader->segmentCount; ++i) {
-        *pCRC32 = ma_dr_flac_crc32_byte(*pCRC32, pHeader->segmentTable[i]);
-    }
-    return MA_SUCCESS;
-}
-static ma_result ma_dr_flac_ogg__read_page_header(ma_dr_flac_read_proc onRead, void* pUserData, ma_dr_flac_ogg_page_header* pHeader, ma_uint32* pBytesRead, ma_uint32* pCRC32)
-{
-    ma_uint8 id[4];
-    *pBytesRead = 0;
-    if (onRead(pUserData, id, 4) != 4) {
-        return MA_AT_END;
-    }
-    *pBytesRead += 4;
-    for (;;) {
-        if (ma_dr_flac_ogg__is_capture_pattern(id)) {
-            ma_result result;
-            *pCRC32 = MA_DR_FLAC_OGG_CAPTURE_PATTERN_CRC32;
-            result = ma_dr_flac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, pHeader, pBytesRead, pCRC32);
-            if (result == MA_SUCCESS) {
-                return MA_SUCCESS;
-            } else {
-                if (result == MA_CRC_MISMATCH) {
-                    continue;
-                } else {
-                    return result;
-                }
-            }
-        } else {
-            id[0] = id[1];
-            id[1] = id[2];
-            id[2] = id[3];
-            if (onRead(pUserData, &id[3], 1) != 1) {
-                return MA_AT_END;
-            }
-            *pBytesRead += 1;
-        }
-    }
-}
-typedef struct
-{
-    ma_dr_flac_read_proc onRead;
-    ma_dr_flac_seek_proc onSeek;
-    void* pUserData;
-    ma_uint64 currentBytePos;
-    ma_uint64 firstBytePos;
-    ma_uint32 serialNumber;
-    ma_dr_flac_ogg_page_header bosPageHeader;
-    ma_dr_flac_ogg_page_header currentPageHeader;
-    ma_uint32 bytesRemainingInPage;
-    ma_uint32 pageDataSize;
-    ma_uint8 pageData[MA_DR_FLAC_OGG_MAX_PAGE_SIZE];
-} ma_dr_flac_oggbs;
-static size_t ma_dr_flac_oggbs__read_physical(ma_dr_flac_oggbs* oggbs, void* bufferOut, size_t bytesToRead)
-{
-    size_t bytesActuallyRead = oggbs->onRead(oggbs->pUserData, bufferOut, bytesToRead);
-    oggbs->currentBytePos += bytesActuallyRead;
-    return bytesActuallyRead;
-}
-static ma_bool32 ma_dr_flac_oggbs__seek_physical(ma_dr_flac_oggbs* oggbs, ma_uint64 offset, ma_dr_flac_seek_origin origin)
-{
-    if (origin == ma_dr_flac_seek_origin_start) {
-        if (offset <= 0x7FFFFFFF) {
-            if (!oggbs->onSeek(oggbs->pUserData, (int)offset, ma_dr_flac_seek_origin_start)) {
-                return MA_FALSE;
-            }
-            oggbs->currentBytePos = offset;
-            return MA_TRUE;
-        } else {
-            if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, ma_dr_flac_seek_origin_start)) {
-                return MA_FALSE;
-            }
-            oggbs->currentBytePos = offset;
-            return ma_dr_flac_oggbs__seek_physical(oggbs, offset - 0x7FFFFFFF, ma_dr_flac_seek_origin_current);
-        }
-    } else {
-        while (offset > 0x7FFFFFFF) {
-            if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, ma_dr_flac_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            oggbs->currentBytePos += 0x7FFFFFFF;
-            offset -= 0x7FFFFFFF;
-        }
-        if (!oggbs->onSeek(oggbs->pUserData, (int)offset, ma_dr_flac_seek_origin_current)) {
-            return MA_FALSE;
-        }
-        oggbs->currentBytePos += offset;
-        return MA_TRUE;
-    }
-}
-static ma_bool32 ma_dr_flac_oggbs__goto_next_page(ma_dr_flac_oggbs* oggbs, ma_dr_flac_ogg_crc_mismatch_recovery recoveryMethod)
-{
-    ma_dr_flac_ogg_page_header header;
-    for (;;) {
-        ma_uint32 crc32 = 0;
-        ma_uint32 bytesRead;
-        ma_uint32 pageBodySize;
-#ifndef MA_DR_FLAC_NO_CRC
-        ma_uint32 actualCRC32;
-#endif
-        if (ma_dr_flac_ogg__read_page_header(oggbs->onRead, oggbs->pUserData, &header, &bytesRead, &crc32) != MA_SUCCESS) {
-            return MA_FALSE;
-        }
-        oggbs->currentBytePos += bytesRead;
-        pageBodySize = ma_dr_flac_ogg__get_page_body_size(&header);
-        if (pageBodySize > MA_DR_FLAC_OGG_MAX_PAGE_SIZE) {
-            continue;
-        }
-        if (header.serialNumber != oggbs->serialNumber) {
-            if (pageBodySize > 0 && !ma_dr_flac_oggbs__seek_physical(oggbs, pageBodySize, ma_dr_flac_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            continue;
-        }
-        if (ma_dr_flac_oggbs__read_physical(oggbs, oggbs->pageData, pageBodySize) != pageBodySize) {
-            return MA_FALSE;
-        }
-        oggbs->pageDataSize = pageBodySize;
-#ifndef MA_DR_FLAC_NO_CRC
-        actualCRC32 = ma_dr_flac_crc32_buffer(crc32, oggbs->pageData, oggbs->pageDataSize);
-        if (actualCRC32 != header.checksum) {
-            if (recoveryMethod == ma_dr_flac_ogg_recover_on_crc_mismatch) {
-                continue;
-            } else {
-                ma_dr_flac_oggbs__goto_next_page(oggbs, ma_dr_flac_ogg_recover_on_crc_mismatch);
-                return MA_FALSE;
-            }
-        }
-#else
-        (void)recoveryMethod;
-#endif
-        oggbs->currentPageHeader = header;
-        oggbs->bytesRemainingInPage = pageBodySize;
-        return MA_TRUE;
-    }
-}
-#if 0
-static ma_uint8 ma_dr_flac_oggbs__get_current_segment_index(ma_dr_flac_oggbs* oggbs, ma_uint8* pBytesRemainingInSeg)
-{
-    ma_uint32 bytesConsumedInPage = ma_dr_flac_ogg__get_page_body_size(&oggbs->currentPageHeader) - oggbs->bytesRemainingInPage;
-    ma_uint8 iSeg = 0;
-    ma_uint32 iByte = 0;
-    while (iByte < bytesConsumedInPage) {
-        ma_uint8 segmentSize = oggbs->currentPageHeader.segmentTable[iSeg];
-        if (iByte + segmentSize > bytesConsumedInPage) {
-            break;
-        } else {
-            iSeg += 1;
-            iByte += segmentSize;
-        }
-    }
-    *pBytesRemainingInSeg = oggbs->currentPageHeader.segmentTable[iSeg] - (ma_uint8)(bytesConsumedInPage - iByte);
-    return iSeg;
-}
-static ma_bool32 ma_dr_flac_oggbs__seek_to_next_packet(ma_dr_flac_oggbs* oggbs)
-{
-    for (;;) {
-        ma_bool32 atEndOfPage = MA_FALSE;
-        ma_uint8 bytesRemainingInSeg;
-        ma_uint8 iFirstSeg = ma_dr_flac_oggbs__get_current_segment_index(oggbs, &bytesRemainingInSeg);
-        ma_uint32 bytesToEndOfPacketOrPage = bytesRemainingInSeg;
-        for (ma_uint8 iSeg = iFirstSeg; iSeg < oggbs->currentPageHeader.segmentCount; ++iSeg) {
-            ma_uint8 segmentSize = oggbs->currentPageHeader.segmentTable[iSeg];
-            if (segmentSize < 255) {
-                if (iSeg == oggbs->currentPageHeader.segmentCount-1) {
-                    atEndOfPage = MA_TRUE;
-                }
-                break;
-            }
-            bytesToEndOfPacketOrPage += segmentSize;
-        }
-        ma_dr_flac_oggbs__seek_physical(oggbs, bytesToEndOfPacketOrPage, ma_dr_flac_seek_origin_current);
-        oggbs->bytesRemainingInPage -= bytesToEndOfPacketOrPage;
-        if (atEndOfPage) {
-            if (!ma_dr_flac_oggbs__goto_next_page(oggbs)) {
-                return MA_FALSE;
-            }
-            if ((oggbs->currentPageHeader.headerType & 0x01) == 0) {
-                return MA_TRUE;
-            }
-        } else {
-            return MA_TRUE;
-        }
-    }
-}
-static ma_bool32 ma_dr_flac_oggbs__seek_to_next_frame(ma_dr_flac_oggbs* oggbs)
-{
-    return ma_dr_flac_oggbs__seek_to_next_packet(oggbs);
-}
-#endif
-static size_t ma_dr_flac__on_read_ogg(void* pUserData, void* bufferOut, size_t bytesToRead)
-{
-    ma_dr_flac_oggbs* oggbs = (ma_dr_flac_oggbs*)pUserData;
-    ma_uint8* pRunningBufferOut = (ma_uint8*)bufferOut;
-    size_t bytesRead = 0;
-    MA_DR_FLAC_ASSERT(oggbs != NULL);
-    MA_DR_FLAC_ASSERT(pRunningBufferOut != NULL);
-    while (bytesRead < bytesToRead) {
-        size_t bytesRemainingToRead = bytesToRead - bytesRead;
-        if (oggbs->bytesRemainingInPage >= bytesRemainingToRead) {
-            MA_DR_FLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), bytesRemainingToRead);
-            bytesRead += bytesRemainingToRead;
-            oggbs->bytesRemainingInPage -= (ma_uint32)bytesRemainingToRead;
-            break;
-        }
-        if (oggbs->bytesRemainingInPage > 0) {
-            MA_DR_FLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), oggbs->bytesRemainingInPage);
-            bytesRead += oggbs->bytesRemainingInPage;
-            pRunningBufferOut += oggbs->bytesRemainingInPage;
-            oggbs->bytesRemainingInPage = 0;
-        }
-        MA_DR_FLAC_ASSERT(bytesRemainingToRead > 0);
-        if (!ma_dr_flac_oggbs__goto_next_page(oggbs, ma_dr_flac_ogg_recover_on_crc_mismatch)) {
-            break;
-        }
-    }
-    return bytesRead;
-}
-static ma_bool32 ma_dr_flac__on_seek_ogg(void* pUserData, int offset, ma_dr_flac_seek_origin origin)
-{
-    ma_dr_flac_oggbs* oggbs = (ma_dr_flac_oggbs*)pUserData;
-    int bytesSeeked = 0;
-    MA_DR_FLAC_ASSERT(oggbs != NULL);
-    MA_DR_FLAC_ASSERT(offset >= 0);
-    if (origin == ma_dr_flac_seek_origin_start) {
-        if (!ma_dr_flac_oggbs__seek_physical(oggbs, (int)oggbs->firstBytePos, ma_dr_flac_seek_origin_start)) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_flac_oggbs__goto_next_page(oggbs, ma_dr_flac_ogg_fail_on_crc_mismatch)) {
-            return MA_FALSE;
-        }
-        return ma_dr_flac__on_seek_ogg(pUserData, offset, ma_dr_flac_seek_origin_current);
-    }
-    MA_DR_FLAC_ASSERT(origin == ma_dr_flac_seek_origin_current);
-    while (bytesSeeked < offset) {
-        int bytesRemainingToSeek = offset - bytesSeeked;
-        MA_DR_FLAC_ASSERT(bytesRemainingToSeek >= 0);
-        if (oggbs->bytesRemainingInPage >= (size_t)bytesRemainingToSeek) {
-            bytesSeeked += bytesRemainingToSeek;
-            (void)bytesSeeked;
-            oggbs->bytesRemainingInPage -= bytesRemainingToSeek;
-            break;
-        }
-        if (oggbs->bytesRemainingInPage > 0) {
-            bytesSeeked += (int)oggbs->bytesRemainingInPage;
-            oggbs->bytesRemainingInPage = 0;
-        }
-        MA_DR_FLAC_ASSERT(bytesRemainingToSeek > 0);
-        if (!ma_dr_flac_oggbs__goto_next_page(oggbs, ma_dr_flac_ogg_fail_on_crc_mismatch)) {
-            return MA_FALSE;
-        }
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_flac_ogg__seek_to_pcm_frame(ma_dr_flac* pFlac, ma_uint64 pcmFrameIndex)
-{
-    ma_dr_flac_oggbs* oggbs = (ma_dr_flac_oggbs*)pFlac->_oggbs;
-    ma_uint64 originalBytePos;
-    ma_uint64 runningGranulePosition;
-    ma_uint64 runningFrameBytePos;
-    ma_uint64 runningPCMFrameCount;
-    MA_DR_FLAC_ASSERT(oggbs != NULL);
-    originalBytePos = oggbs->currentBytePos;
-    if (!ma_dr_flac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes)) {
-        return MA_FALSE;
-    }
-    oggbs->bytesRemainingInPage = 0;
-    runningGranulePosition = 0;
-    for (;;) {
-        if (!ma_dr_flac_oggbs__goto_next_page(oggbs, ma_dr_flac_ogg_recover_on_crc_mismatch)) {
-            ma_dr_flac_oggbs__seek_physical(oggbs, originalBytePos, ma_dr_flac_seek_origin_start);
-            return MA_FALSE;
-        }
-        runningFrameBytePos = oggbs->currentBytePos - ma_dr_flac_ogg__get_page_header_size(&oggbs->currentPageHeader) - oggbs->pageDataSize;
-        if (oggbs->currentPageHeader.granulePosition >= pcmFrameIndex) {
-            break;
-        }
-        if ((oggbs->currentPageHeader.headerType & 0x01) == 0) {
-            if (oggbs->currentPageHeader.segmentTable[0] >= 2) {
-                ma_uint8 firstBytesInPage[2];
-                firstBytesInPage[0] = oggbs->pageData[0];
-                firstBytesInPage[1] = oggbs->pageData[1];
-                if ((firstBytesInPage[0] == 0xFF) && (firstBytesInPage[1] & 0xFC) == 0xF8) {
-                    runningGranulePosition = oggbs->currentPageHeader.granulePosition;
-                }
-                continue;
-            }
-        }
-    }
-    if (!ma_dr_flac_oggbs__seek_physical(oggbs, runningFrameBytePos, ma_dr_flac_seek_origin_start)) {
-        return MA_FALSE;
-    }
-    if (!ma_dr_flac_oggbs__goto_next_page(oggbs, ma_dr_flac_ogg_recover_on_crc_mismatch)) {
-        return MA_FALSE;
-    }
-    runningPCMFrameCount = runningGranulePosition;
-    for (;;) {
-        ma_uint64 firstPCMFrameInFLACFrame = 0;
-        ma_uint64 lastPCMFrameInFLACFrame = 0;
-        ma_uint64 pcmFrameCountInThisFrame;
-        if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-            return MA_FALSE;
-        }
-        ma_dr_flac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
-        pcmFrameCountInThisFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
-        if (pcmFrameIndex == pFlac->totalPCMFrameCount && (runningPCMFrameCount + pcmFrameCountInThisFrame) == pFlac->totalPCMFrameCount) {
-            ma_result result = ma_dr_flac__decode_flac_frame(pFlac);
-            if (result == MA_SUCCESS) {
-                pFlac->currentPCMFrame = pcmFrameIndex;
-                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
-                return MA_TRUE;
-            } else {
-                return MA_FALSE;
-            }
-        }
-        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFrame)) {
-            ma_result result = ma_dr_flac__decode_flac_frame(pFlac);
-            if (result == MA_SUCCESS) {
-                ma_uint64 pcmFramesToDecode = (size_t)(pcmFrameIndex - runningPCMFrameCount);
-                if (pcmFramesToDecode == 0) {
-                    return MA_TRUE;
-                }
-                pFlac->currentPCMFrame = runningPCMFrameCount;
-                return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
-            } else {
-                if (result == MA_CRC_MISMATCH) {
-                    continue;
-                } else {
-                    return MA_FALSE;
-                }
-            }
-        } else {
-            ma_result result = ma_dr_flac__seek_to_next_flac_frame(pFlac);
-            if (result == MA_SUCCESS) {
-                runningPCMFrameCount += pcmFrameCountInThisFrame;
-            } else {
-                if (result == MA_CRC_MISMATCH) {
-                    continue;
-                } else {
-                    return MA_FALSE;
-                }
-            }
-        }
-    }
-}
-static ma_bool32 ma_dr_flac__init_private__ogg(ma_dr_flac_init_info* pInit, ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, void* pUserData, void* pUserDataMD, ma_bool32 relaxed)
-{
-    ma_dr_flac_ogg_page_header header;
-    ma_uint32 crc32 = MA_DR_FLAC_OGG_CAPTURE_PATTERN_CRC32;
-    ma_uint32 bytesRead = 0;
-    (void)relaxed;
-    pInit->container = ma_dr_flac_container_ogg;
-    pInit->oggFirstBytePos = 0;
-    if (ma_dr_flac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, &header, &bytesRead, &crc32) != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    pInit->runningFilePos += bytesRead;
-    for (;;) {
-        int pageBodySize;
-        if ((header.headerType & 0x02) == 0) {
-            return MA_FALSE;
-        }
-        pageBodySize = ma_dr_flac_ogg__get_page_body_size(&header);
-        if (pageBodySize == 51) {
-            ma_uint32 bytesRemainingInPage = pageBodySize;
-            ma_uint8 packetType;
-            if (onRead(pUserData, &packetType, 1) != 1) {
-                return MA_FALSE;
-            }
-            bytesRemainingInPage -= 1;
-            if (packetType == 0x7F) {
-                ma_uint8 sig[4];
-                if (onRead(pUserData, sig, 4) != 4) {
-                    return MA_FALSE;
-                }
-                bytesRemainingInPage -= 4;
-                if (sig[0] == 'F' && sig[1] == 'L' && sig[2] == 'A' && sig[3] == 'C') {
-                    ma_uint8 mappingVersion[2];
-                    if (onRead(pUserData, mappingVersion, 2) != 2) {
-                        return MA_FALSE;
-                    }
-                    if (mappingVersion[0] != 1) {
-                        return MA_FALSE;
-                    }
-                    if (!onSeek(pUserData, 2, ma_dr_flac_seek_origin_current)) {
-                        return MA_FALSE;
-                    }
-                    if (onRead(pUserData, sig, 4) != 4) {
-                        return MA_FALSE;
-                    }
-                    if (sig[0] == 'f' && sig[1] == 'L' && sig[2] == 'a' && sig[3] == 'C') {
-                        ma_dr_flac_streaminfo streaminfo;
-                        ma_uint8 isLastBlock;
-                        ma_uint8 blockType;
-                        ma_uint32 blockSize;
-                        if (!ma_dr_flac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
-                            return MA_FALSE;
-                        }
-                        if (blockType != MA_DR_FLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
-                            return MA_FALSE;
-                        }
-                        if (ma_dr_flac__read_streaminfo(onRead, pUserData, &streaminfo)) {
-                            pInit->hasStreamInfoBlock      = MA_TRUE;
-                            pInit->sampleRate              = streaminfo.sampleRate;
-                            pInit->channels                = streaminfo.channels;
-                            pInit->bitsPerSample           = streaminfo.bitsPerSample;
-                            pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
-                            pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;
-                            pInit->hasMetadataBlocks       = !isLastBlock;
-                            if (onMeta) {
-                                ma_dr_flac_metadata metadata;
-                                metadata.type = MA_DR_FLAC_METADATA_BLOCK_TYPE_STREAMINFO;
-                                metadata.pRawData = NULL;
-                                metadata.rawDataSize = 0;
-                                metadata.data.streaminfo = streaminfo;
-                                onMeta(pUserDataMD, &metadata);
-                            }
-                            pInit->runningFilePos  += pageBodySize;
-                            pInit->oggFirstBytePos  = pInit->runningFilePos - 79;
-                            pInit->oggSerial        = header.serialNumber;
-                            pInit->oggBosHeader     = header;
-                            break;
-                        } else {
-                            return MA_FALSE;
-                        }
-                    } else {
-                        return MA_FALSE;
-                    }
-                } else {
-                    if (!onSeek(pUserData, bytesRemainingInPage, ma_dr_flac_seek_origin_current)) {
-                        return MA_FALSE;
-                    }
-                }
-            } else {
-                if (!onSeek(pUserData, bytesRemainingInPage, ma_dr_flac_seek_origin_current)) {
-                    return MA_FALSE;
-                }
-            }
-        } else {
-            if (!onSeek(pUserData, pageBodySize, ma_dr_flac_seek_origin_current)) {
-                return MA_FALSE;
-            }
-        }
-        pInit->runningFilePos += pageBodySize;
-        if (ma_dr_flac_ogg__read_page_header(onRead, pUserData, &header, &bytesRead, &crc32) != MA_SUCCESS) {
-            return MA_FALSE;
-        }
-        pInit->runningFilePos += bytesRead;
-    }
-    pInit->hasMetadataBlocks = MA_TRUE;
-    return MA_TRUE;
-}
-#endif
-static ma_bool32 ma_dr_flac__init_private(ma_dr_flac_init_info* pInit, ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, ma_dr_flac_container container, void* pUserData, void* pUserDataMD)
-{
-    ma_bool32 relaxed;
-    ma_uint8 id[4];
-    if (pInit == NULL || onRead == NULL || onSeek == NULL) {
-        return MA_FALSE;
-    }
-    MA_DR_FLAC_ZERO_MEMORY(pInit, sizeof(*pInit));
-    pInit->onRead       = onRead;
-    pInit->onSeek       = onSeek;
-    pInit->onMeta       = onMeta;
-    pInit->container    = container;
-    pInit->pUserData    = pUserData;
-    pInit->pUserDataMD  = pUserDataMD;
-    pInit->bs.onRead    = onRead;
-    pInit->bs.onSeek    = onSeek;
-    pInit->bs.pUserData = pUserData;
-    ma_dr_flac__reset_cache(&pInit->bs);
-    relaxed = container != ma_dr_flac_container_unknown;
-    for (;;) {
-        if (onRead(pUserData, id, 4) != 4) {
-            return MA_FALSE;
-        }
-        pInit->runningFilePos += 4;
-        if (id[0] == 'I' && id[1] == 'D' && id[2] == '3') {
-            ma_uint8 header[6];
-            ma_uint8 flags;
-            ma_uint32 headerSize;
-            if (onRead(pUserData, header, 6) != 6) {
-                return MA_FALSE;
-            }
-            pInit->runningFilePos += 6;
-            flags = header[1];
-            MA_DR_FLAC_COPY_MEMORY(&headerSize, header+2, 4);
-            headerSize = ma_dr_flac__unsynchsafe_32(ma_dr_flac__be2host_32(headerSize));
-            if (flags & 0x10) {
-                headerSize += 10;
-            }
-            if (!onSeek(pUserData, headerSize, ma_dr_flac_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            pInit->runningFilePos += headerSize;
-        } else {
-            break;
-        }
-    }
-    if (id[0] == 'f' && id[1] == 'L' && id[2] == 'a' && id[3] == 'C') {
-        return ma_dr_flac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
-    }
-#ifndef MA_DR_FLAC_NO_OGG
-    if (id[0] == 'O' && id[1] == 'g' && id[2] == 'g' && id[3] == 'S') {
-        return ma_dr_flac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
-    }
-#endif
-    if (relaxed) {
-        if (container == ma_dr_flac_container_native) {
-            return ma_dr_flac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
-        }
-#ifndef MA_DR_FLAC_NO_OGG
-        if (container == ma_dr_flac_container_ogg) {
-            return ma_dr_flac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
-        }
-#endif
-    }
-    return MA_FALSE;
-}
-static void ma_dr_flac__init_from_info(ma_dr_flac* pFlac, const ma_dr_flac_init_info* pInit)
-{
-    MA_DR_FLAC_ASSERT(pFlac != NULL);
-    MA_DR_FLAC_ASSERT(pInit != NULL);
-    MA_DR_FLAC_ZERO_MEMORY(pFlac, sizeof(*pFlac));
-    pFlac->bs                      = pInit->bs;
-    pFlac->onMeta                  = pInit->onMeta;
-    pFlac->pUserDataMD             = pInit->pUserDataMD;
-    pFlac->maxBlockSizeInPCMFrames = pInit->maxBlockSizeInPCMFrames;
-    pFlac->sampleRate              = pInit->sampleRate;
-    pFlac->channels                = (ma_uint8)pInit->channels;
-    pFlac->bitsPerSample           = (ma_uint8)pInit->bitsPerSample;
-    pFlac->totalPCMFrameCount      = pInit->totalPCMFrameCount;
-    pFlac->container               = pInit->container;
-}
-static ma_dr_flac* ma_dr_flac_open_with_metadata_private(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, ma_dr_flac_container container, void* pUserData, void* pUserDataMD, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac_init_info init;
-    ma_uint32 allocationSize;
-    ma_uint32 wholeSIMDVectorCountPerChannel;
-    ma_uint32 decodedSamplesAllocationSize;
-#ifndef MA_DR_FLAC_NO_OGG
-    ma_dr_flac_oggbs* pOggbs = NULL;
-#endif
-    ma_uint64 firstFramePos;
-    ma_uint64 seektablePos;
-    ma_uint32 seekpointCount;
-    ma_allocation_callbacks allocationCallbacks;
-    ma_dr_flac* pFlac;
-    ma_dr_flac__init_cpu_caps();
-    if (!ma_dr_flac__init_private(&init, onRead, onSeek, onMeta, container, pUserData, pUserDataMD)) {
-        return NULL;
-    }
-    if (pAllocationCallbacks != NULL) {
-        allocationCallbacks = *pAllocationCallbacks;
-        if (allocationCallbacks.onFree == NULL || (allocationCallbacks.onMalloc == NULL && allocationCallbacks.onRealloc == NULL)) {
-            return NULL;
-        }
-    } else {
-        allocationCallbacks.pUserData = NULL;
-        allocationCallbacks.onMalloc  = ma_dr_flac__malloc_default;
-        allocationCallbacks.onRealloc = ma_dr_flac__realloc_default;
-        allocationCallbacks.onFree    = ma_dr_flac__free_default;
-    }
-    allocationSize = sizeof(ma_dr_flac);
-    if ((init.maxBlockSizeInPCMFrames % (MA_DR_FLAC_MAX_SIMD_VECTOR_SIZE / sizeof(ma_int32))) == 0) {
-        wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (MA_DR_FLAC_MAX_SIMD_VECTOR_SIZE / sizeof(ma_int32)));
-    } else {
-        wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (MA_DR_FLAC_MAX_SIMD_VECTOR_SIZE / sizeof(ma_int32))) + 1;
-    }
-    decodedSamplesAllocationSize = wholeSIMDVectorCountPerChannel * MA_DR_FLAC_MAX_SIMD_VECTOR_SIZE * init.channels;
-    allocationSize += decodedSamplesAllocationSize;
-    allocationSize += MA_DR_FLAC_MAX_SIMD_VECTOR_SIZE;
-#ifndef MA_DR_FLAC_NO_OGG
-    if (init.container == ma_dr_flac_container_ogg) {
-        allocationSize += sizeof(ma_dr_flac_oggbs);
-        pOggbs = (ma_dr_flac_oggbs*)ma_dr_flac__malloc_from_callbacks(sizeof(*pOggbs), &allocationCallbacks);
-        if (pOggbs == NULL) {
-            return NULL;
-        }
-        MA_DR_FLAC_ZERO_MEMORY(pOggbs, sizeof(*pOggbs));
-        pOggbs->onRead = onRead;
-        pOggbs->onSeek = onSeek;
-        pOggbs->pUserData = pUserData;
-        pOggbs->currentBytePos = init.oggFirstBytePos;
-        pOggbs->firstBytePos = init.oggFirstBytePos;
-        pOggbs->serialNumber = init.oggSerial;
-        pOggbs->bosPageHeader = init.oggBosHeader;
-        pOggbs->bytesRemainingInPage = 0;
-    }
-#endif
-    firstFramePos  = 42;
-    seektablePos   = 0;
-    seekpointCount = 0;
-    if (init.hasMetadataBlocks) {
-        ma_dr_flac_read_proc onReadOverride = onRead;
-        ma_dr_flac_seek_proc onSeekOverride = onSeek;
-        void* pUserDataOverride = pUserData;
-#ifndef MA_DR_FLAC_NO_OGG
-        if (init.container == ma_dr_flac_container_ogg) {
-            onReadOverride = ma_dr_flac__on_read_ogg;
-            onSeekOverride = ma_dr_flac__on_seek_ogg;
-            pUserDataOverride = (void*)pOggbs;
-        }
-#endif
-        if (!ma_dr_flac__read_and_decode_metadata(onReadOverride, onSeekOverride, onMeta, pUserDataOverride, pUserDataMD, &firstFramePos, &seektablePos, &seekpointCount, &allocationCallbacks)) {
-        #ifndef MA_DR_FLAC_NO_OGG
-            ma_dr_flac__free_from_callbacks(pOggbs, &allocationCallbacks);
-        #endif
-            return NULL;
-        }
-        allocationSize += seekpointCount * sizeof(ma_dr_flac_seekpoint);
-    }
-    pFlac = (ma_dr_flac*)ma_dr_flac__malloc_from_callbacks(allocationSize, &allocationCallbacks);
-    if (pFlac == NULL) {
-    #ifndef MA_DR_FLAC_NO_OGG
-        ma_dr_flac__free_from_callbacks(pOggbs, &allocationCallbacks);
-    #endif
-        return NULL;
-    }
-    ma_dr_flac__init_from_info(pFlac, &init);
-    pFlac->allocationCallbacks = allocationCallbacks;
-    pFlac->pDecodedSamples = (ma_int32*)ma_dr_flac_align((size_t)pFlac->pExtraData, MA_DR_FLAC_MAX_SIMD_VECTOR_SIZE);
-#ifndef MA_DR_FLAC_NO_OGG
-    if (init.container == ma_dr_flac_container_ogg) {
-        ma_dr_flac_oggbs* pInternalOggbs = (ma_dr_flac_oggbs*)((ma_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize + (seekpointCount * sizeof(ma_dr_flac_seekpoint)));
-        MA_DR_FLAC_COPY_MEMORY(pInternalOggbs, pOggbs, sizeof(*pOggbs));
-        ma_dr_flac__free_from_callbacks(pOggbs, &allocationCallbacks);
-        pOggbs = NULL;
-        pFlac->bs.onRead = ma_dr_flac__on_read_ogg;
-        pFlac->bs.onSeek = ma_dr_flac__on_seek_ogg;
-        pFlac->bs.pUserData = (void*)pInternalOggbs;
-        pFlac->_oggbs = (void*)pInternalOggbs;
-    }
-#endif
-    pFlac->firstFLACFramePosInBytes = firstFramePos;
-#ifndef MA_DR_FLAC_NO_OGG
-    if (init.container == ma_dr_flac_container_ogg)
-    {
-        pFlac->pSeekpoints = NULL;
-        pFlac->seekpointCount = 0;
-    }
-    else
-#endif
-    {
-        if (seektablePos != 0) {
-            pFlac->seekpointCount = seekpointCount;
-            pFlac->pSeekpoints = (ma_dr_flac_seekpoint*)((ma_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize);
-            MA_DR_FLAC_ASSERT(pFlac->bs.onSeek != NULL);
-            MA_DR_FLAC_ASSERT(pFlac->bs.onRead != NULL);
-            if (pFlac->bs.onSeek(pFlac->bs.pUserData, (int)seektablePos, ma_dr_flac_seek_origin_start)) {
-                ma_uint32 iSeekpoint;
-                for (iSeekpoint = 0; iSeekpoint < seekpointCount; iSeekpoint += 1) {
-                    if (pFlac->bs.onRead(pFlac->bs.pUserData, pFlac->pSeekpoints + iSeekpoint, MA_DR_FLAC_SEEKPOINT_SIZE_IN_BYTES) == MA_DR_FLAC_SEEKPOINT_SIZE_IN_BYTES) {
-                        pFlac->pSeekpoints[iSeekpoint].firstPCMFrame   = ma_dr_flac__be2host_64(pFlac->pSeekpoints[iSeekpoint].firstPCMFrame);
-                        pFlac->pSeekpoints[iSeekpoint].flacFrameOffset = ma_dr_flac__be2host_64(pFlac->pSeekpoints[iSeekpoint].flacFrameOffset);
-                        pFlac->pSeekpoints[iSeekpoint].pcmFrameCount   = ma_dr_flac__be2host_16(pFlac->pSeekpoints[iSeekpoint].pcmFrameCount);
-                    } else {
-                        pFlac->pSeekpoints = NULL;
-                        pFlac->seekpointCount = 0;
-                        break;
-                    }
-                }
-                if (!pFlac->bs.onSeek(pFlac->bs.pUserData, (int)pFlac->firstFLACFramePosInBytes, ma_dr_flac_seek_origin_start)) {
-                    ma_dr_flac__free_from_callbacks(pFlac, &allocationCallbacks);
-                    return NULL;
-                }
-            } else {
-                pFlac->pSeekpoints = NULL;
-                pFlac->seekpointCount = 0;
-            }
-        }
-    }
-    if (!init.hasStreamInfoBlock) {
-        pFlac->currentFLACFrame.header = init.firstFrameHeader;
-        for (;;) {
-            ma_result result = ma_dr_flac__decode_flac_frame(pFlac);
-            if (result == MA_SUCCESS) {
-                break;
-            } else {
-                if (result == MA_CRC_MISMATCH) {
-                    if (!ma_dr_flac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
-                        ma_dr_flac__free_from_callbacks(pFlac, &allocationCallbacks);
-                        return NULL;
-                    }
-                    continue;
-                } else {
-                    ma_dr_flac__free_from_callbacks(pFlac, &allocationCallbacks);
-                    return NULL;
-                }
-            }
-        }
-    }
-    return pFlac;
-}
-#ifndef MA_DR_FLAC_NO_STDIO
-#include <stdio.h>
-#ifndef MA_DR_FLAC_NO_WCHAR
-#include <wchar.h>
-#endif
-static size_t ma_dr_flac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
-{
-    return fread(bufferOut, 1, bytesToRead, (FILE*)pUserData);
-}
-static ma_bool32 ma_dr_flac__on_seek_stdio(void* pUserData, int offset, ma_dr_flac_seek_origin origin)
-{
-    MA_DR_FLAC_ASSERT(offset >= 0);
-    return fseek((FILE*)pUserData, offset, (origin == ma_dr_flac_seek_origin_current) ? SEEK_CUR : SEEK_SET) == 0;
-}
-MA_API ma_dr_flac* ma_dr_flac_open_file(const char* pFileName, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    FILE* pFile;
-    if (ma_fopen(&pFile, pFileName, "rb") != MA_SUCCESS) {
-        return NULL;
-    }
-    pFlac = ma_dr_flac_open(ma_dr_flac__on_read_stdio, ma_dr_flac__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        fclose(pFile);
-        return NULL;
-    }
-    return pFlac;
-}
-#ifndef MA_DR_FLAC_NO_WCHAR
-MA_API ma_dr_flac* ma_dr_flac_open_file_w(const wchar_t* pFileName, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    FILE* pFile;
-    if (ma_wfopen(&pFile, pFileName, L"rb", pAllocationCallbacks) != MA_SUCCESS) {
-        return NULL;
-    }
-    pFlac = ma_dr_flac_open(ma_dr_flac__on_read_stdio, ma_dr_flac__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        fclose(pFile);
-        return NULL;
-    }
-    return pFlac;
-}
-#endif
-MA_API ma_dr_flac* ma_dr_flac_open_file_with_metadata(const char* pFileName, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    FILE* pFile;
-    if (ma_fopen(&pFile, pFileName, "rb") != MA_SUCCESS) {
-        return NULL;
-    }
-    pFlac = ma_dr_flac_open_with_metadata_private(ma_dr_flac__on_read_stdio, ma_dr_flac__on_seek_stdio, onMeta, ma_dr_flac_container_unknown, (void*)pFile, pUserData, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        fclose(pFile);
-        return pFlac;
-    }
-    return pFlac;
-}
-#ifndef MA_DR_FLAC_NO_WCHAR
-MA_API ma_dr_flac* ma_dr_flac_open_file_with_metadata_w(const wchar_t* pFileName, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    FILE* pFile;
-    if (ma_wfopen(&pFile, pFileName, L"rb", pAllocationCallbacks) != MA_SUCCESS) {
-        return NULL;
-    }
-    pFlac = ma_dr_flac_open_with_metadata_private(ma_dr_flac__on_read_stdio, ma_dr_flac__on_seek_stdio, onMeta, ma_dr_flac_container_unknown, (void*)pFile, pUserData, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        fclose(pFile);
-        return pFlac;
-    }
-    return pFlac;
-}
-#endif
-#endif
-static size_t ma_dr_flac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
-{
-    ma_dr_flac__memory_stream* memoryStream = (ma_dr_flac__memory_stream*)pUserData;
-    size_t bytesRemaining;
-    MA_DR_FLAC_ASSERT(memoryStream != NULL);
-    MA_DR_FLAC_ASSERT(memoryStream->dataSize >= memoryStream->currentReadPos);
-    bytesRemaining = memoryStream->dataSize - memoryStream->currentReadPos;
-    if (bytesToRead > bytesRemaining) {
-        bytesToRead = bytesRemaining;
-    }
-    if (bytesToRead > 0) {
-        MA_DR_FLAC_COPY_MEMORY(bufferOut, memoryStream->data + memoryStream->currentReadPos, bytesToRead);
-        memoryStream->currentReadPos += bytesToRead;
-    }
-    return bytesToRead;
-}
-static ma_bool32 ma_dr_flac__on_seek_memory(void* pUserData, int offset, ma_dr_flac_seek_origin origin)
-{
-    ma_dr_flac__memory_stream* memoryStream = (ma_dr_flac__memory_stream*)pUserData;
-    MA_DR_FLAC_ASSERT(memoryStream != NULL);
-    MA_DR_FLAC_ASSERT(offset >= 0);
-    if (offset > (ma_int64)memoryStream->dataSize) {
-        return MA_FALSE;
-    }
-    if (origin == ma_dr_flac_seek_origin_current) {
-        if (memoryStream->currentReadPos + offset <= memoryStream->dataSize) {
-            memoryStream->currentReadPos += offset;
-        } else {
-            return MA_FALSE;
-        }
-    } else {
-        if ((ma_uint32)offset <= memoryStream->dataSize) {
-            memoryStream->currentReadPos = offset;
-        } else {
-            return MA_FALSE;
-        }
-    }
-    return MA_TRUE;
-}
-MA_API ma_dr_flac* ma_dr_flac_open_memory(const void* pData, size_t dataSize, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac__memory_stream memoryStream;
-    ma_dr_flac* pFlac;
-    memoryStream.data = (const ma_uint8*)pData;
-    memoryStream.dataSize = dataSize;
-    memoryStream.currentReadPos = 0;
-    pFlac = ma_dr_flac_open(ma_dr_flac__on_read_memory, ma_dr_flac__on_seek_memory, &memoryStream, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    pFlac->memoryStream = memoryStream;
-#ifndef MA_DR_FLAC_NO_OGG
-    if (pFlac->container == ma_dr_flac_container_ogg)
-    {
-        ma_dr_flac_oggbs* oggbs = (ma_dr_flac_oggbs*)pFlac->_oggbs;
-        oggbs->pUserData = &pFlac->memoryStream;
-    }
-    else
-#endif
-    {
-        pFlac->bs.pUserData = &pFlac->memoryStream;
-    }
-    return pFlac;
-}
-MA_API ma_dr_flac* ma_dr_flac_open_memory_with_metadata(const void* pData, size_t dataSize, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac__memory_stream memoryStream;
-    ma_dr_flac* pFlac;
-    memoryStream.data = (const ma_uint8*)pData;
-    memoryStream.dataSize = dataSize;
-    memoryStream.currentReadPos = 0;
-    pFlac = ma_dr_flac_open_with_metadata_private(ma_dr_flac__on_read_memory, ma_dr_flac__on_seek_memory, onMeta, ma_dr_flac_container_unknown, &memoryStream, pUserData, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    pFlac->memoryStream = memoryStream;
-#ifndef MA_DR_FLAC_NO_OGG
-    if (pFlac->container == ma_dr_flac_container_ogg)
-    {
-        ma_dr_flac_oggbs* oggbs = (ma_dr_flac_oggbs*)pFlac->_oggbs;
-        oggbs->pUserData = &pFlac->memoryStream;
-    }
-    else
-#endif
-    {
-        pFlac->bs.pUserData = &pFlac->memoryStream;
-    }
-    return pFlac;
-}
-MA_API ma_dr_flac* ma_dr_flac_open(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_flac_open_with_metadata_private(onRead, onSeek, NULL, ma_dr_flac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
-}
-MA_API ma_dr_flac* ma_dr_flac_open_relaxed(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_container container, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_flac_open_with_metadata_private(onRead, onSeek, NULL, container, pUserData, pUserData, pAllocationCallbacks);
-}
-MA_API ma_dr_flac* ma_dr_flac_open_with_metadata(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_flac_open_with_metadata_private(onRead, onSeek, onMeta, ma_dr_flac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
-}
-MA_API ma_dr_flac* ma_dr_flac_open_with_metadata_relaxed(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, ma_dr_flac_meta_proc onMeta, ma_dr_flac_container container, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    return ma_dr_flac_open_with_metadata_private(onRead, onSeek, onMeta, container, pUserData, pUserData, pAllocationCallbacks);
-}
-MA_API void ma_dr_flac_close(ma_dr_flac* pFlac)
-{
-    if (pFlac == NULL) {
-        return;
-    }
-#ifndef MA_DR_FLAC_NO_STDIO
-    if (pFlac->bs.onRead == ma_dr_flac__on_read_stdio) {
-        fclose((FILE*)pFlac->bs.pUserData);
-    }
-#ifndef MA_DR_FLAC_NO_OGG
-    if (pFlac->container == ma_dr_flac_container_ogg) {
-        ma_dr_flac_oggbs* oggbs = (ma_dr_flac_oggbs*)pFlac->_oggbs;
-        MA_DR_FLAC_ASSERT(pFlac->bs.onRead == ma_dr_flac__on_read_ogg);
-        if (oggbs->onRead == ma_dr_flac__on_read_stdio) {
-            fclose((FILE*)oggbs->pUserData);
-        }
-    }
-#endif
-#endif
-    ma_dr_flac__free_from_callbacks(pFlac, &pFlac->allocationCallbacks);
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_left_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    for (i = 0; i < frameCount; ++i) {
-        ma_uint32 left  = (ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-        ma_uint32 side  = (ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_left_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
-        ma_uint32 right0 = left0 - side0;
-        ma_uint32 right1 = left1 - side1;
-        ma_uint32 right2 = left2 - side2;
-        ma_uint32 right3 = left3 - side3;
-        pOutputSamples[i*8+0] = (ma_int32)left0;
-        pOutputSamples[i*8+1] = (ma_int32)right0;
-        pOutputSamples[i*8+2] = (ma_int32)left1;
-        pOutputSamples[i*8+3] = (ma_int32)right1;
-        pOutputSamples[i*8+4] = (ma_int32)left2;
-        pOutputSamples[i*8+5] = (ma_int32)right2;
-        pOutputSamples[i*8+6] = (ma_int32)left3;
-        pOutputSamples[i*8+7] = (ma_int32)right3;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_left_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        __m128i right = _mm_sub_epi32(left, side);
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_left_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    int32x4_t shift0_4;
-    int32x4_t shift1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    shift0_4 = vdupq_n_s32(shift0);
-    shift1_4 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        uint32x4_t left;
-        uint32x4_t side;
-        uint32x4_t right;
-        left  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
-        side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
-        right = vsubq_u32(left, side);
-        ma_dr_flac__vst2q_u32((ma_uint32*)pOutputSamples + i*8, vzipq_u32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_left_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s32__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_right_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    for (i = 0; i < frameCount; ++i) {
-        ma_uint32 side  = (ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-        ma_uint32 right = (ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_right_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
-        ma_uint32 left0 = right0 + side0;
-        ma_uint32 left1 = right1 + side1;
-        ma_uint32 left2 = right2 + side2;
-        ma_uint32 left3 = right3 + side3;
-        pOutputSamples[i*8+0] = (ma_int32)left0;
-        pOutputSamples[i*8+1] = (ma_int32)right0;
-        pOutputSamples[i*8+2] = (ma_int32)left1;
-        pOutputSamples[i*8+3] = (ma_int32)right1;
-        pOutputSamples[i*8+4] = (ma_int32)left2;
-        pOutputSamples[i*8+5] = (ma_int32)right2;
-        pOutputSamples[i*8+6] = (ma_int32)left3;
-        pOutputSamples[i*8+7] = (ma_int32)right3;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_right_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        __m128i left  = _mm_add_epi32(right, side);
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_right_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    int32x4_t shift0_4;
-    int32x4_t shift1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    shift0_4 = vdupq_n_s32(shift0);
-    shift1_4 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        uint32x4_t side;
-        uint32x4_t right;
-        uint32x4_t left;
-        side  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
-        right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
-        left  = vaddq_u32(right, side);
-        ma_dr_flac__vst2q_u32((ma_uint32*)pOutputSamples + i*8, vzipq_u32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (ma_int32)left;
-        pOutputSamples[i*2+1] = (ma_int32)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_right_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s32__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_mid_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    for (ma_uint64 i = 0; i < frameCount; ++i) {
-        ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-        ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-        mid = (mid << 1) | (side & 0x01);
-        pOutputSamples[i*2+0] = (ma_int32)((ma_uint32)((ma_int32)(mid + side) >> 1) << unusedBitsPerSample);
-        pOutputSamples[i*2+1] = (ma_int32)((ma_uint32)((ma_int32)(mid - side) >> 1) << unusedBitsPerSample);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_mid_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_int32 shift = unusedBitsPerSample;
-    if (shift > 0) {
-        shift -= 1;
-        for (i = 0; i < frameCount4; ++i) {
-            ma_uint32 temp0L;
-            ma_uint32 temp1L;
-            ma_uint32 temp2L;
-            ma_uint32 temp3L;
-            ma_uint32 temp0R;
-            ma_uint32 temp1R;
-            ma_uint32 temp2R;
-            ma_uint32 temp3R;
-            ma_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid0 = (mid0 << 1) | (side0 & 0x01);
-            mid1 = (mid1 << 1) | (side1 & 0x01);
-            mid2 = (mid2 << 1) | (side2 & 0x01);
-            mid3 = (mid3 << 1) | (side3 & 0x01);
-            temp0L = (mid0 + side0) << shift;
-            temp1L = (mid1 + side1) << shift;
-            temp2L = (mid2 + side2) << shift;
-            temp3L = (mid3 + side3) << shift;
-            temp0R = (mid0 - side0) << shift;
-            temp1R = (mid1 - side1) << shift;
-            temp2R = (mid2 - side2) << shift;
-            temp3R = (mid3 - side3) << shift;
-            pOutputSamples[i*8+0] = (ma_int32)temp0L;
-            pOutputSamples[i*8+1] = (ma_int32)temp0R;
-            pOutputSamples[i*8+2] = (ma_int32)temp1L;
-            pOutputSamples[i*8+3] = (ma_int32)temp1R;
-            pOutputSamples[i*8+4] = (ma_int32)temp2L;
-            pOutputSamples[i*8+5] = (ma_int32)temp2R;
-            pOutputSamples[i*8+6] = (ma_int32)temp3L;
-            pOutputSamples[i*8+7] = (ma_int32)temp3R;
-        }
-    } else {
-        for (i = 0; i < frameCount4; ++i) {
-            ma_uint32 temp0L;
-            ma_uint32 temp1L;
-            ma_uint32 temp2L;
-            ma_uint32 temp3L;
-            ma_uint32 temp0R;
-            ma_uint32 temp1R;
-            ma_uint32 temp2R;
-            ma_uint32 temp3R;
-            ma_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid0 = (mid0 << 1) | (side0 & 0x01);
-            mid1 = (mid1 << 1) | (side1 & 0x01);
-            mid2 = (mid2 << 1) | (side2 & 0x01);
-            mid3 = (mid3 << 1) | (side3 & 0x01);
-            temp0L = (ma_uint32)((ma_int32)(mid0 + side0) >> 1);
-            temp1L = (ma_uint32)((ma_int32)(mid1 + side1) >> 1);
-            temp2L = (ma_uint32)((ma_int32)(mid2 + side2) >> 1);
-            temp3L = (ma_uint32)((ma_int32)(mid3 + side3) >> 1);
-            temp0R = (ma_uint32)((ma_int32)(mid0 - side0) >> 1);
-            temp1R = (ma_uint32)((ma_int32)(mid1 - side1) >> 1);
-            temp2R = (ma_uint32)((ma_int32)(mid2 - side2) >> 1);
-            temp3R = (ma_uint32)((ma_int32)(mid3 - side3) >> 1);
-            pOutputSamples[i*8+0] = (ma_int32)temp0L;
-            pOutputSamples[i*8+1] = (ma_int32)temp0R;
-            pOutputSamples[i*8+2] = (ma_int32)temp1L;
-            pOutputSamples[i*8+3] = (ma_int32)temp1R;
-            pOutputSamples[i*8+4] = (ma_int32)temp2L;
-            pOutputSamples[i*8+5] = (ma_int32)temp2R;
-            pOutputSamples[i*8+6] = (ma_int32)temp3L;
-            pOutputSamples[i*8+7] = (ma_int32)temp3R;
-        }
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-        ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-        mid = (mid << 1) | (side & 0x01);
-        pOutputSamples[i*2+0] = (ma_int32)((ma_uint32)((ma_int32)(mid + side) >> 1) << unusedBitsPerSample);
-        pOutputSamples[i*2+1] = (ma_int32)((ma_uint32)((ma_int32)(mid - side) >> 1) << unusedBitsPerSample);
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_mid_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_int32 shift = unusedBitsPerSample;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    if (shift == 0) {
-        for (i = 0; i < frameCount4; ++i) {
-            __m128i mid;
-            __m128i side;
-            __m128i left;
-            __m128i right;
-            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
-            left  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
-            right = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
-            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
-            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int32)(mid + side) >> 1;
-            pOutputSamples[i*2+1] = (ma_int32)(mid - side) >> 1;
-        }
-    } else {
-        shift -= 1;
-        for (i = 0; i < frameCount4; ++i) {
-            __m128i mid;
-            __m128i side;
-            __m128i left;
-            __m128i right;
-            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
-            left  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
-            right = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
-            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
-            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int32)((mid + side) << shift);
-            pOutputSamples[i*2+1] = (ma_int32)((mid - side) << shift);
-        }
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_mid_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_int32 shift = unusedBitsPerSample;
-    int32x4_t  wbpsShift0_4;
-    int32x4_t  wbpsShift1_4;
-    uint32x4_t one4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-    wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-    one4         = vdupq_n_u32(1);
-    if (shift == 0) {
-        for (i = 0; i < frameCount4; ++i) {
-            uint32x4_t mid;
-            uint32x4_t side;
-            int32x4_t left;
-            int32x4_t right;
-            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
-            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
-            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4));
-            left  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
-            right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
-            ma_dr_flac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int32)(mid + side) >> 1;
-            pOutputSamples[i*2+1] = (ma_int32)(mid - side) >> 1;
-        }
-    } else {
-        int32x4_t shift4;
-        shift -= 1;
-        shift4 = vdupq_n_s32(shift);
-        for (i = 0; i < frameCount4; ++i) {
-            uint32x4_t mid;
-            uint32x4_t side;
-            int32x4_t left;
-            int32x4_t right;
-            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
-            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
-            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4));
-            left  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
-            right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
-            ma_dr_flac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int32)((mid + side) << shift);
-            pOutputSamples[i*2+1] = (ma_int32)((mid - side) << shift);
-        }
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_mid_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s32__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    for (ma_uint64 i = 0; i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int32)((ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample));
-        pOutputSamples[i*2+1] = (ma_int32)((ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample));
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
-        pOutputSamples[i*8+0] = (ma_int32)tempL0;
-        pOutputSamples[i*8+1] = (ma_int32)tempR0;
-        pOutputSamples[i*8+2] = (ma_int32)tempL1;
-        pOutputSamples[i*8+3] = (ma_int32)tempR1;
-        pOutputSamples[i*8+4] = (ma_int32)tempL2;
-        pOutputSamples[i*8+5] = (ma_int32)tempR2;
-        pOutputSamples[i*8+6] = (ma_int32)tempL3;
-        pOutputSamples[i*8+7] = (ma_int32)tempR3;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int32)(pInputSamples0U32[i] << shift0);
-        pOutputSamples[i*2+1] = (ma_int32)(pInputSamples1U32[i] << shift1);
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int32)(pInputSamples0U32[i] << shift0);
-        pOutputSamples[i*2+1] = (ma_int32)(pInputSamples1U32[i] << shift1);
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    int32x4_t shift4_0 = vdupq_n_s32(shift0);
-    int32x4_t shift4_1 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        int32x4_t left;
-        int32x4_t right;
-        left  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift4_0));
-        right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift4_1));
-        ma_dr_flac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int32)(pInputSamples0U32[i] << shift0);
-        pOutputSamples[i*2+1] = (ma_int32)(pInputSamples1U32[i] << shift1);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int32* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-MA_API ma_uint64 ma_dr_flac_read_pcm_frames_s32(ma_dr_flac* pFlac, ma_uint64 framesToRead, ma_int32* pBufferOut)
-{
-    ma_uint64 framesRead;
-    ma_uint32 unusedBitsPerSample;
-    if (pFlac == NULL || framesToRead == 0) {
-        return 0;
-    }
-    if (pBufferOut == NULL) {
-        return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, framesToRead);
-    }
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 32);
-    unusedBitsPerSample = 32 - pFlac->bitsPerSample;
-    framesRead = 0;
-    while (framesToRead > 0) {
-        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
-            if (!ma_dr_flac__read_and_decode_next_flac_frame(pFlac)) {
-                break;
-            }
-        } else {
-            unsigned int channelCount = ma_dr_flac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
-            ma_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
-            ma_uint64 frameCountThisIteration = framesToRead;
-            if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
-                frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
-            }
-            if (channelCount == 2) {
-                const ma_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
-                const ma_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
-                switch (pFlac->currentFLACFrame.header.channelAssignment)
-                {
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_s32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_s32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_s32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
-                    default:
-                    {
-                        ma_dr_flac_read_pcm_frames_s32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                }
-            } else {
-                ma_uint64 i;
-                for (i = 0; i < frameCountThisIteration; ++i) {
-                    unsigned int j;
-                    for (j = 0; j < channelCount; ++j) {
-                        pBufferOut[(i*channelCount)+j] = (ma_int32)((ma_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
-                    }
-                }
-            }
-            framesRead                += frameCountThisIteration;
-            pBufferOut                += frameCountThisIteration * channelCount;
-            framesToRead              -= frameCountThisIteration;
-            pFlac->currentPCMFrame    += frameCountThisIteration;
-            pFlac->currentFLACFrame.pcmFramesRemaining -= (ma_uint32)frameCountThisIteration;
-        }
-    }
-    return framesRead;
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_left_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    for (i = 0; i < frameCount; ++i) {
-        ma_uint32 left  = (ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-        ma_uint32 side  = (ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-        ma_uint32 right = left - side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_left_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
-        ma_uint32 right0 = left0 - side0;
-        ma_uint32 right1 = left1 - side1;
-        ma_uint32 right2 = left2 - side2;
-        ma_uint32 right3 = left3 - side3;
-        left0  >>= 16;
-        left1  >>= 16;
-        left2  >>= 16;
-        left3  >>= 16;
-        right0 >>= 16;
-        right1 >>= 16;
-        right2 >>= 16;
-        right3 >>= 16;
-        pOutputSamples[i*8+0] = (ma_int16)left0;
-        pOutputSamples[i*8+1] = (ma_int16)right0;
-        pOutputSamples[i*8+2] = (ma_int16)left1;
-        pOutputSamples[i*8+3] = (ma_int16)right1;
-        pOutputSamples[i*8+4] = (ma_int16)left2;
-        pOutputSamples[i*8+5] = (ma_int16)right2;
-        pOutputSamples[i*8+6] = (ma_int16)left3;
-        pOutputSamples[i*8+7] = (ma_int16)right3;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_left_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        __m128i right = _mm_sub_epi32(left, side);
-        left  = _mm_srai_epi32(left,  16);
-        right = _mm_srai_epi32(right, 16);
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), ma_dr_flac__mm_packs_interleaved_epi32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_left_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    int32x4_t shift0_4;
-    int32x4_t shift1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    shift0_4 = vdupq_n_s32(shift0);
-    shift1_4 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        uint32x4_t left;
-        uint32x4_t side;
-        uint32x4_t right;
-        left  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
-        side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
-        right = vsubq_u32(left, side);
-        left  = vshrq_n_u32(left,  16);
-        right = vshrq_n_u32(right, 16);
-        ma_dr_flac__vst2q_u16((ma_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_left_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s16__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s16__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_right_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    for (i = 0; i < frameCount; ++i) {
-        ma_uint32 side  = (ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-        ma_uint32 right = (ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-        ma_uint32 left  = right + side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_right_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
-        ma_uint32 left0 = right0 + side0;
-        ma_uint32 left1 = right1 + side1;
-        ma_uint32 left2 = right2 + side2;
-        ma_uint32 left3 = right3 + side3;
-        left0  >>= 16;
-        left1  >>= 16;
-        left2  >>= 16;
-        left3  >>= 16;
-        right0 >>= 16;
-        right1 >>= 16;
-        right2 >>= 16;
-        right3 >>= 16;
-        pOutputSamples[i*8+0] = (ma_int16)left0;
-        pOutputSamples[i*8+1] = (ma_int16)right0;
-        pOutputSamples[i*8+2] = (ma_int16)left1;
-        pOutputSamples[i*8+3] = (ma_int16)right1;
-        pOutputSamples[i*8+4] = (ma_int16)left2;
-        pOutputSamples[i*8+5] = (ma_int16)right2;
-        pOutputSamples[i*8+6] = (ma_int16)left3;
-        pOutputSamples[i*8+7] = (ma_int16)right3;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_right_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        __m128i left  = _mm_add_epi32(right, side);
-        left  = _mm_srai_epi32(left,  16);
-        right = _mm_srai_epi32(right, 16);
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), ma_dr_flac__mm_packs_interleaved_epi32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_right_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    int32x4_t shift0_4;
-    int32x4_t shift1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    shift0_4 = vdupq_n_s32(shift0);
-    shift1_4 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        uint32x4_t side;
-        uint32x4_t right;
-        uint32x4_t left;
-        side  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
-        right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
-        left  = vaddq_u32(right, side);
-        left  = vshrq_n_u32(left,  16);
-        right = vshrq_n_u32(right, 16);
-        ma_dr_flac__vst2q_u16((ma_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        left  >>= 16;
-        right >>= 16;
-        pOutputSamples[i*2+0] = (ma_int16)left;
-        pOutputSamples[i*2+1] = (ma_int16)right;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_right_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s16__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s16__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_mid_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    for (ma_uint64 i = 0; i < frameCount; ++i) {
-        ma_uint32 mid  = (ma_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-        ma_uint32 side = (ma_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-        mid = (mid << 1) | (side & 0x01);
-        pOutputSamples[i*2+0] = (ma_int16)(((ma_uint32)((ma_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16);
-        pOutputSamples[i*2+1] = (ma_int16)(((ma_uint32)((ma_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_mid_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift = unusedBitsPerSample;
-    if (shift > 0) {
-        shift -= 1;
-        for (i = 0; i < frameCount4; ++i) {
-            ma_uint32 temp0L;
-            ma_uint32 temp1L;
-            ma_uint32 temp2L;
-            ma_uint32 temp3L;
-            ma_uint32 temp0R;
-            ma_uint32 temp1R;
-            ma_uint32 temp2R;
-            ma_uint32 temp3R;
-            ma_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid0 = (mid0 << 1) | (side0 & 0x01);
-            mid1 = (mid1 << 1) | (side1 & 0x01);
-            mid2 = (mid2 << 1) | (side2 & 0x01);
-            mid3 = (mid3 << 1) | (side3 & 0x01);
-            temp0L = (mid0 + side0) << shift;
-            temp1L = (mid1 + side1) << shift;
-            temp2L = (mid2 + side2) << shift;
-            temp3L = (mid3 + side3) << shift;
-            temp0R = (mid0 - side0) << shift;
-            temp1R = (mid1 - side1) << shift;
-            temp2R = (mid2 - side2) << shift;
-            temp3R = (mid3 - side3) << shift;
-            temp0L >>= 16;
-            temp1L >>= 16;
-            temp2L >>= 16;
-            temp3L >>= 16;
-            temp0R >>= 16;
-            temp1R >>= 16;
-            temp2R >>= 16;
-            temp3R >>= 16;
-            pOutputSamples[i*8+0] = (ma_int16)temp0L;
-            pOutputSamples[i*8+1] = (ma_int16)temp0R;
-            pOutputSamples[i*8+2] = (ma_int16)temp1L;
-            pOutputSamples[i*8+3] = (ma_int16)temp1R;
-            pOutputSamples[i*8+4] = (ma_int16)temp2L;
-            pOutputSamples[i*8+5] = (ma_int16)temp2R;
-            pOutputSamples[i*8+6] = (ma_int16)temp3L;
-            pOutputSamples[i*8+7] = (ma_int16)temp3R;
-        }
-    } else {
-        for (i = 0; i < frameCount4; ++i) {
-            ma_uint32 temp0L;
-            ma_uint32 temp1L;
-            ma_uint32 temp2L;
-            ma_uint32 temp3L;
-            ma_uint32 temp0R;
-            ma_uint32 temp1R;
-            ma_uint32 temp2R;
-            ma_uint32 temp3R;
-            ma_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid0 = (mid0 << 1) | (side0 & 0x01);
-            mid1 = (mid1 << 1) | (side1 & 0x01);
-            mid2 = (mid2 << 1) | (side2 & 0x01);
-            mid3 = (mid3 << 1) | (side3 & 0x01);
-            temp0L = ((ma_int32)(mid0 + side0) >> 1);
-            temp1L = ((ma_int32)(mid1 + side1) >> 1);
-            temp2L = ((ma_int32)(mid2 + side2) >> 1);
-            temp3L = ((ma_int32)(mid3 + side3) >> 1);
-            temp0R = ((ma_int32)(mid0 - side0) >> 1);
-            temp1R = ((ma_int32)(mid1 - side1) >> 1);
-            temp2R = ((ma_int32)(mid2 - side2) >> 1);
-            temp3R = ((ma_int32)(mid3 - side3) >> 1);
-            temp0L >>= 16;
-            temp1L >>= 16;
-            temp2L >>= 16;
-            temp3L >>= 16;
-            temp0R >>= 16;
-            temp1R >>= 16;
-            temp2R >>= 16;
-            temp3R >>= 16;
-            pOutputSamples[i*8+0] = (ma_int16)temp0L;
-            pOutputSamples[i*8+1] = (ma_int16)temp0R;
-            pOutputSamples[i*8+2] = (ma_int16)temp1L;
-            pOutputSamples[i*8+3] = (ma_int16)temp1R;
-            pOutputSamples[i*8+4] = (ma_int16)temp2L;
-            pOutputSamples[i*8+5] = (ma_int16)temp2R;
-            pOutputSamples[i*8+6] = (ma_int16)temp3L;
-            pOutputSamples[i*8+7] = (ma_int16)temp3R;
-        }
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-        ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-        mid = (mid << 1) | (side & 0x01);
-        pOutputSamples[i*2+0] = (ma_int16)(((ma_uint32)((ma_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16);
-        pOutputSamples[i*2+1] = (ma_int16)(((ma_uint32)((ma_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16);
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_mid_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift = unusedBitsPerSample;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    if (shift == 0) {
-        for (i = 0; i < frameCount4; ++i) {
-            __m128i mid;
-            __m128i side;
-            __m128i left;
-            __m128i right;
-            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
-            left  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
-            right = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
-            left  = _mm_srai_epi32(left,  16);
-            right = _mm_srai_epi32(right, 16);
-            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), ma_dr_flac__mm_packs_interleaved_epi32(left, right));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int16)(((ma_int32)(mid + side) >> 1) >> 16);
-            pOutputSamples[i*2+1] = (ma_int16)(((ma_int32)(mid - side) >> 1) >> 16);
-        }
-    } else {
-        shift -= 1;
-        for (i = 0; i < frameCount4; ++i) {
-            __m128i mid;
-            __m128i side;
-            __m128i left;
-            __m128i right;
-            mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-            side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-            mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
-            left  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
-            right = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
-            left  = _mm_srai_epi32(left,  16);
-            right = _mm_srai_epi32(right, 16);
-            _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), ma_dr_flac__mm_packs_interleaved_epi32(left, right));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int16)(((mid + side) << shift) >> 16);
-            pOutputSamples[i*2+1] = (ma_int16)(((mid - side) << shift) >> 16);
-        }
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_mid_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift = unusedBitsPerSample;
-    int32x4_t wbpsShift0_4;
-    int32x4_t wbpsShift1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-    wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-    if (shift == 0) {
-        for (i = 0; i < frameCount4; ++i) {
-            uint32x4_t mid;
-            uint32x4_t side;
-            int32x4_t left;
-            int32x4_t right;
-            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
-            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
-            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
-            left  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
-            right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
-            left  = vshrq_n_s32(left,  16);
-            right = vshrq_n_s32(right, 16);
-            ma_dr_flac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int16)(((ma_int32)(mid + side) >> 1) >> 16);
-            pOutputSamples[i*2+1] = (ma_int16)(((ma_int32)(mid - side) >> 1) >> 16);
-        }
-    } else {
-        int32x4_t shift4;
-        shift -= 1;
-        shift4 = vdupq_n_s32(shift);
-        for (i = 0; i < frameCount4; ++i) {
-            uint32x4_t mid;
-            uint32x4_t side;
-            int32x4_t left;
-            int32x4_t right;
-            mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
-            side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
-            mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
-            left  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
-            right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
-            left  = vshrq_n_s32(left,  16);
-            right = vshrq_n_s32(right, 16);
-            ma_dr_flac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int16)(((mid + side) << shift) >> 16);
-            pOutputSamples[i*2+1] = (ma_int16)(((mid - side) << shift) >> 16);
-        }
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_mid_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s16__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s16__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    for (ma_uint64 i = 0; i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int16)((ma_int32)((ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) >> 16);
-        pOutputSamples[i*2+1] = (ma_int16)((ma_int32)((ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) >> 16);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
-        tempL0 >>= 16;
-        tempL1 >>= 16;
-        tempL2 >>= 16;
-        tempL3 >>= 16;
-        tempR0 >>= 16;
-        tempR1 >>= 16;
-        tempR2 >>= 16;
-        tempR3 >>= 16;
-        pOutputSamples[i*8+0] = (ma_int16)tempL0;
-        pOutputSamples[i*8+1] = (ma_int16)tempR0;
-        pOutputSamples[i*8+2] = (ma_int16)tempL1;
-        pOutputSamples[i*8+3] = (ma_int16)tempR1;
-        pOutputSamples[i*8+4] = (ma_int16)tempL2;
-        pOutputSamples[i*8+5] = (ma_int16)tempR2;
-        pOutputSamples[i*8+6] = (ma_int16)tempL3;
-        pOutputSamples[i*8+7] = (ma_int16)tempR3;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int16)((pInputSamples0U32[i] << shift0) >> 16);
-        pOutputSamples[i*2+1] = (ma_int16)((pInputSamples1U32[i] << shift1) >> 16);
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        left  = _mm_srai_epi32(left,  16);
-        right = _mm_srai_epi32(right, 16);
-        _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), ma_dr_flac__mm_packs_interleaved_epi32(left, right));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int16)((pInputSamples0U32[i] << shift0) >> 16);
-        pOutputSamples[i*2+1] = (ma_int16)((pInputSamples1U32[i] << shift1) >> 16);
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    int32x4_t shift0_4 = vdupq_n_s32(shift0);
-    int32x4_t shift1_4 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        int32x4_t left;
-        int32x4_t right;
-        left  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4));
-        right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4));
-        left  = vshrq_n_s32(left,  16);
-        right = vshrq_n_s32(right, 16);
-        ma_dr_flac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int16)((pInputSamples0U32[i] << shift0) >> 16);
-        pOutputSamples[i*2+1] = (ma_int16)((pInputSamples1U32[i] << shift1) >> 16);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, ma_int16* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-MA_API ma_uint64 ma_dr_flac_read_pcm_frames_s16(ma_dr_flac* pFlac, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    ma_uint64 framesRead;
-    ma_uint32 unusedBitsPerSample;
-    if (pFlac == NULL || framesToRead == 0) {
-        return 0;
-    }
-    if (pBufferOut == NULL) {
-        return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, framesToRead);
-    }
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 32);
-    unusedBitsPerSample = 32 - pFlac->bitsPerSample;
-    framesRead = 0;
-    while (framesToRead > 0) {
-        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
-            if (!ma_dr_flac__read_and_decode_next_flac_frame(pFlac)) {
-                break;
-            }
-        } else {
-            unsigned int channelCount = ma_dr_flac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
-            ma_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
-            ma_uint64 frameCountThisIteration = framesToRead;
-            if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
-                frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
-            }
-            if (channelCount == 2) {
-                const ma_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
-                const ma_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
-                switch (pFlac->currentFLACFrame.header.channelAssignment)
-                {
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_s16__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_s16__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_s16__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
-                    default:
-                    {
-                        ma_dr_flac_read_pcm_frames_s16__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                }
-            } else {
-                ma_uint64 i;
-                for (i = 0; i < frameCountThisIteration; ++i) {
-                    unsigned int j;
-                    for (j = 0; j < channelCount; ++j) {
-                        ma_int32 sampleS32 = (ma_int32)((ma_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
-                        pBufferOut[(i*channelCount)+j] = (ma_int16)(sampleS32 >> 16);
-                    }
-                }
-            }
-            framesRead                += frameCountThisIteration;
-            pBufferOut                += frameCountThisIteration * channelCount;
-            framesToRead              -= frameCountThisIteration;
-            pFlac->currentPCMFrame    += frameCountThisIteration;
-            pFlac->currentFLACFrame.pcmFramesRemaining -= (ma_uint32)frameCountThisIteration;
-        }
-    }
-    return framesRead;
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_left_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    for (i = 0; i < frameCount; ++i) {
-        ma_uint32 left  = (ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-        ma_uint32 side  = (ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (float)((ma_int32)left  / 2147483648.0);
-        pOutputSamples[i*2+1] = (float)((ma_int32)right / 2147483648.0);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_left_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    float factor = 1 / 2147483648.0;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
-        ma_uint32 right0 = left0 - side0;
-        ma_uint32 right1 = left1 - side1;
-        ma_uint32 right2 = left2 - side2;
-        ma_uint32 right3 = left3 - side3;
-        pOutputSamples[i*8+0] = (ma_int32)left0  * factor;
-        pOutputSamples[i*8+1] = (ma_int32)right0 * factor;
-        pOutputSamples[i*8+2] = (ma_int32)left1  * factor;
-        pOutputSamples[i*8+3] = (ma_int32)right1 * factor;
-        pOutputSamples[i*8+4] = (ma_int32)left2  * factor;
-        pOutputSamples[i*8+5] = (ma_int32)right2 * factor;
-        pOutputSamples[i*8+6] = (ma_int32)left3  * factor;
-        pOutputSamples[i*8+7] = (ma_int32)right3 * factor;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (ma_int32)left  * factor;
-        pOutputSamples[i*2+1] = (ma_int32)right * factor;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_left_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
-    ma_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
-    __m128 factor;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    factor = _mm_set1_ps(1.0f / 8388608.0f);
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        __m128i right = _mm_sub_epi32(left, side);
-        __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(left),  factor);
-        __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(right), factor);
-        _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
-        _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (ma_int32)left  / 8388608.0f;
-        pOutputSamples[i*2+1] = (ma_int32)right / 8388608.0f;
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_left_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
-    ma_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
-    float32x4_t factor4;
-    int32x4_t shift0_4;
-    int32x4_t shift1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    factor4  = vdupq_n_f32(1.0f / 8388608.0f);
-    shift0_4 = vdupq_n_s32(shift0);
-    shift1_4 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        uint32x4_t left;
-        uint32x4_t side;
-        uint32x4_t right;
-        float32x4_t leftf;
-        float32x4_t rightf;
-        left   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
-        side   = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
-        right  = vsubq_u32(left, side);
-        leftf  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  factor4);
-        rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4);
-        ma_dr_flac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 left  = pInputSamples0U32[i] << shift0;
-        ma_uint32 side  = pInputSamples1U32[i] << shift1;
-        ma_uint32 right = left - side;
-        pOutputSamples[i*2+0] = (ma_int32)left  / 8388608.0f;
-        pOutputSamples[i*2+1] = (ma_int32)right / 8388608.0f;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_left_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_f32__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_f32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_right_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    for (i = 0; i < frameCount; ++i) {
-        ma_uint32 side  = (ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-        ma_uint32 right = (ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (float)((ma_int32)left  / 2147483648.0);
-        pOutputSamples[i*2+1] = (float)((ma_int32)right / 2147483648.0);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_right_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    float factor = 1 / 2147483648.0;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
-        ma_uint32 left0 = right0 + side0;
-        ma_uint32 left1 = right1 + side1;
-        ma_uint32 left2 = right2 + side2;
-        ma_uint32 left3 = right3 + side3;
-        pOutputSamples[i*8+0] = (ma_int32)left0  * factor;
-        pOutputSamples[i*8+1] = (ma_int32)right0 * factor;
-        pOutputSamples[i*8+2] = (ma_int32)left1  * factor;
-        pOutputSamples[i*8+3] = (ma_int32)right1 * factor;
-        pOutputSamples[i*8+4] = (ma_int32)left2  * factor;
-        pOutputSamples[i*8+5] = (ma_int32)right2 * factor;
-        pOutputSamples[i*8+6] = (ma_int32)left3  * factor;
-        pOutputSamples[i*8+7] = (ma_int32)right3 * factor;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (ma_int32)left  * factor;
-        pOutputSamples[i*2+1] = (ma_int32)right * factor;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_right_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
-    ma_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
-    __m128 factor;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    factor = _mm_set1_ps(1.0f / 8388608.0f);
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        __m128i left  = _mm_add_epi32(right, side);
-        __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(left),  factor);
-        __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(right), factor);
-        _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
-        _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (ma_int32)left  / 8388608.0f;
-        pOutputSamples[i*2+1] = (ma_int32)right / 8388608.0f;
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_right_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
-    ma_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
-    float32x4_t factor4;
-    int32x4_t shift0_4;
-    int32x4_t shift1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    factor4  = vdupq_n_f32(1.0f / 8388608.0f);
-    shift0_4 = vdupq_n_s32(shift0);
-    shift1_4 = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        uint32x4_t side;
-        uint32x4_t right;
-        uint32x4_t left;
-        float32x4_t leftf;
-        float32x4_t rightf;
-        side   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
-        right  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
-        left   = vaddq_u32(right, side);
-        leftf  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  factor4);
-        rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4);
-        ma_dr_flac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 side  = pInputSamples0U32[i] << shift0;
-        ma_uint32 right = pInputSamples1U32[i] << shift1;
-        ma_uint32 left  = right + side;
-        pOutputSamples[i*2+0] = (ma_int32)left  / 8388608.0f;
-        pOutputSamples[i*2+1] = (ma_int32)right / 8388608.0f;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_right_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_f32__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_f32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_mid_side__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    for (ma_uint64 i = 0; i < frameCount; ++i) {
-        ma_uint32 mid  = (ma_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-        ma_uint32 side = (ma_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-        mid = (mid << 1) | (side & 0x01);
-        pOutputSamples[i*2+0] = (float)((((ma_int32)(mid + side) >> 1) << (unusedBitsPerSample)) / 2147483648.0);
-        pOutputSamples[i*2+1] = (float)((((ma_int32)(mid - side) >> 1) << (unusedBitsPerSample)) / 2147483648.0);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_mid_side__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift = unusedBitsPerSample;
-    float factor = 1 / 2147483648.0;
-    if (shift > 0) {
-        shift -= 1;
-        for (i = 0; i < frameCount4; ++i) {
-            ma_uint32 temp0L;
-            ma_uint32 temp1L;
-            ma_uint32 temp2L;
-            ma_uint32 temp3L;
-            ma_uint32 temp0R;
-            ma_uint32 temp1R;
-            ma_uint32 temp2R;
-            ma_uint32 temp3R;
-            ma_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid0 = (mid0 << 1) | (side0 & 0x01);
-            mid1 = (mid1 << 1) | (side1 & 0x01);
-            mid2 = (mid2 << 1) | (side2 & 0x01);
-            mid3 = (mid3 << 1) | (side3 & 0x01);
-            temp0L = (mid0 + side0) << shift;
-            temp1L = (mid1 + side1) << shift;
-            temp2L = (mid2 + side2) << shift;
-            temp3L = (mid3 + side3) << shift;
-            temp0R = (mid0 - side0) << shift;
-            temp1R = (mid1 - side1) << shift;
-            temp2R = (mid2 - side2) << shift;
-            temp3R = (mid3 - side3) << shift;
-            pOutputSamples[i*8+0] = (ma_int32)temp0L * factor;
-            pOutputSamples[i*8+1] = (ma_int32)temp0R * factor;
-            pOutputSamples[i*8+2] = (ma_int32)temp1L * factor;
-            pOutputSamples[i*8+3] = (ma_int32)temp1R * factor;
-            pOutputSamples[i*8+4] = (ma_int32)temp2L * factor;
-            pOutputSamples[i*8+5] = (ma_int32)temp2R * factor;
-            pOutputSamples[i*8+6] = (ma_int32)temp3L * factor;
-            pOutputSamples[i*8+7] = (ma_int32)temp3R * factor;
-        }
-    } else {
-        for (i = 0; i < frameCount4; ++i) {
-            ma_uint32 temp0L;
-            ma_uint32 temp1L;
-            ma_uint32 temp2L;
-            ma_uint32 temp3L;
-            ma_uint32 temp0R;
-            ma_uint32 temp1R;
-            ma_uint32 temp2R;
-            ma_uint32 temp3R;
-            ma_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            ma_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid0 = (mid0 << 1) | (side0 & 0x01);
-            mid1 = (mid1 << 1) | (side1 & 0x01);
-            mid2 = (mid2 << 1) | (side2 & 0x01);
-            mid3 = (mid3 << 1) | (side3 & 0x01);
-            temp0L = (ma_uint32)((ma_int32)(mid0 + side0) >> 1);
-            temp1L = (ma_uint32)((ma_int32)(mid1 + side1) >> 1);
-            temp2L = (ma_uint32)((ma_int32)(mid2 + side2) >> 1);
-            temp3L = (ma_uint32)((ma_int32)(mid3 + side3) >> 1);
-            temp0R = (ma_uint32)((ma_int32)(mid0 - side0) >> 1);
-            temp1R = (ma_uint32)((ma_int32)(mid1 - side1) >> 1);
-            temp2R = (ma_uint32)((ma_int32)(mid2 - side2) >> 1);
-            temp3R = (ma_uint32)((ma_int32)(mid3 - side3) >> 1);
-            pOutputSamples[i*8+0] = (ma_int32)temp0L * factor;
-            pOutputSamples[i*8+1] = (ma_int32)temp0R * factor;
-            pOutputSamples[i*8+2] = (ma_int32)temp1L * factor;
-            pOutputSamples[i*8+3] = (ma_int32)temp1R * factor;
-            pOutputSamples[i*8+4] = (ma_int32)temp2L * factor;
-            pOutputSamples[i*8+5] = (ma_int32)temp2R * factor;
-            pOutputSamples[i*8+6] = (ma_int32)temp3L * factor;
-            pOutputSamples[i*8+7] = (ma_int32)temp3R * factor;
-        }
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-        ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-        mid = (mid << 1) | (side & 0x01);
-        pOutputSamples[i*2+0] = (ma_int32)((ma_uint32)((ma_int32)(mid + side) >> 1) << unusedBitsPerSample) * factor;
-        pOutputSamples[i*2+1] = (ma_int32)((ma_uint32)((ma_int32)(mid - side) >> 1) << unusedBitsPerSample) * factor;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_mid_side__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift = unusedBitsPerSample - 8;
-    float factor;
-    __m128 factor128;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    factor = 1.0f / 8388608.0f;
-    factor128 = _mm_set1_ps(factor);
-    if (shift == 0) {
-        for (i = 0; i < frameCount4; ++i) {
-            __m128i mid;
-            __m128i side;
-            __m128i tempL;
-            __m128i tempR;
-            __m128  leftf;
-            __m128  rightf;
-            mid    = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-            side   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-            mid    = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
-            tempL  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
-            tempR  = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
-            leftf  = _mm_mul_ps(_mm_cvtepi32_ps(tempL), factor128);
-            rightf = _mm_mul_ps(_mm_cvtepi32_ps(tempR), factor128);
-            _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
-            _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = ((ma_int32)(mid + side) >> 1) * factor;
-            pOutputSamples[i*2+1] = ((ma_int32)(mid - side) >> 1) * factor;
-        }
-    } else {
-        shift -= 1;
-        for (i = 0; i < frameCount4; ++i) {
-            __m128i mid;
-            __m128i side;
-            __m128i tempL;
-            __m128i tempR;
-            __m128 leftf;
-            __m128 rightf;
-            mid    = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-            side   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-            mid    = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
-            tempL  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
-            tempR  = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
-            leftf  = _mm_mul_ps(_mm_cvtepi32_ps(tempL), factor128);
-            rightf = _mm_mul_ps(_mm_cvtepi32_ps(tempR), factor128);
-            _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
-            _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int32)((mid + side) << shift) * factor;
-            pOutputSamples[i*2+1] = (ma_int32)((mid - side) << shift) * factor;
-        }
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_mid_side__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift = unusedBitsPerSample - 8;
-    float factor;
-    float32x4_t factor4;
-    int32x4_t shift4;
-    int32x4_t wbps0_4;
-    int32x4_t wbps1_4;
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 24);
-    factor  = 1.0f / 8388608.0f;
-    factor4 = vdupq_n_f32(factor);
-    wbps0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
-    wbps1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
-    if (shift == 0) {
-        for (i = 0; i < frameCount4; ++i) {
-            int32x4_t lefti;
-            int32x4_t righti;
-            float32x4_t leftf;
-            float32x4_t rightf;
-            uint32x4_t mid  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4);
-            uint32x4_t side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4);
-            mid    = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
-            lefti  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
-            righti = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
-            leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
-            rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
-            ma_dr_flac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = ((ma_int32)(mid + side) >> 1) * factor;
-            pOutputSamples[i*2+1] = ((ma_int32)(mid - side) >> 1) * factor;
-        }
-    } else {
-        shift -= 1;
-        shift4 = vdupq_n_s32(shift);
-        for (i = 0; i < frameCount4; ++i) {
-            uint32x4_t mid;
-            uint32x4_t side;
-            int32x4_t lefti;
-            int32x4_t righti;
-            float32x4_t leftf;
-            float32x4_t rightf;
-            mid    = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4);
-            side   = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4);
-            mid    = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
-            lefti  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
-            righti = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
-            leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
-            rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
-            ma_dr_flac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
-        }
-        for (i = (frameCount4 << 2); i < frameCount; ++i) {
-            ma_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-            ma_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-            mid = (mid << 1) | (side & 0x01);
-            pOutputSamples[i*2+0] = (ma_int32)((mid + side) << shift) * factor;
-            pOutputSamples[i*2+1] = (ma_int32)((mid - side) << shift) * factor;
-        }
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_mid_side(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_f32__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_f32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-#if 0
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__reference(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    for (ma_uint64 i = 0; i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (float)((ma_int32)((ma_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) / 2147483648.0);
-        pOutputSamples[i*2+1] = (float)((ma_int32)((ma_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) / 2147483648.0);
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__scalar(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
-    ma_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
-    float factor = 1 / 2147483648.0;
-    for (i = 0; i < frameCount4; ++i) {
-        ma_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
-        ma_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
-        ma_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
-        ma_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
-        ma_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
-        ma_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
-        ma_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
-        ma_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
-        pOutputSamples[i*8+0] = (ma_int32)tempL0 * factor;
-        pOutputSamples[i*8+1] = (ma_int32)tempR0 * factor;
-        pOutputSamples[i*8+2] = (ma_int32)tempL1 * factor;
-        pOutputSamples[i*8+3] = (ma_int32)tempR1 * factor;
-        pOutputSamples[i*8+4] = (ma_int32)tempL2 * factor;
-        pOutputSamples[i*8+5] = (ma_int32)tempR2 * factor;
-        pOutputSamples[i*8+6] = (ma_int32)tempL3 * factor;
-        pOutputSamples[i*8+7] = (ma_int32)tempR3 * factor;
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int32)(pInputSamples0U32[i] << shift0) * factor;
-        pOutputSamples[i*2+1] = (ma_int32)(pInputSamples1U32[i] << shift1) * factor;
-    }
-}
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__sse2(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
-    ma_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
-    float factor = 1.0f / 8388608.0f;
-    __m128 factor128 = _mm_set1_ps(factor);
-    for (i = 0; i < frameCount4; ++i) {
-        __m128i lefti;
-        __m128i righti;
-        __m128 leftf;
-        __m128 rightf;
-        lefti  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
-        righti = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
-        leftf  = _mm_mul_ps(_mm_cvtepi32_ps(lefti),  factor128);
-        rightf = _mm_mul_ps(_mm_cvtepi32_ps(righti), factor128);
-        _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
-        _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int32)(pInputSamples0U32[i] << shift0) * factor;
-        pOutputSamples[i*2+1] = (ma_int32)(pInputSamples1U32[i] << shift1) * factor;
-    }
-}
-#endif
-#if defined(MA_DR_FLAC_SUPPORT_NEON)
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__neon(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-    ma_uint64 i;
-    ma_uint64 frameCount4 = frameCount >> 2;
-    const ma_uint32* pInputSamples0U32 = (const ma_uint32*)pInputSamples0;
-    const ma_uint32* pInputSamples1U32 = (const ma_uint32*)pInputSamples1;
-    ma_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
-    ma_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
-    float factor = 1.0f / 8388608.0f;
-    float32x4_t factor4 = vdupq_n_f32(factor);
-    int32x4_t shift0_4  = vdupq_n_s32(shift0);
-    int32x4_t shift1_4  = vdupq_n_s32(shift1);
-    for (i = 0; i < frameCount4; ++i) {
-        int32x4_t lefti;
-        int32x4_t righti;
-        float32x4_t leftf;
-        float32x4_t rightf;
-        lefti  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4));
-        righti = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4));
-        leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
-        rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
-        ma_dr_flac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
-    }
-    for (i = (frameCount4 << 2); i < frameCount; ++i) {
-        pOutputSamples[i*2+0] = (ma_int32)(pInputSamples0U32[i] << shift0) * factor;
-        pOutputSamples[i*2+1] = (ma_int32)(pInputSamples1U32[i] << shift1) * factor;
-    }
-}
-#endif
-static MA_INLINE void ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo(ma_dr_flac* pFlac, ma_uint64 frameCount, ma_uint32 unusedBitsPerSample, const ma_int32* pInputSamples0, const ma_int32* pInputSamples1, float* pOutputSamples)
-{
-#if defined(MA_DR_FLAC_SUPPORT_SSE2)
-    if (ma_dr_flac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#elif defined(MA_DR_FLAC_SUPPORT_NEON)
-    if (ma_dr_flac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
-        ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-    } else
-#endif
-    {
-#if 0
-        ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#else
-        ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
-#endif
-    }
-}
-MA_API ma_uint64 ma_dr_flac_read_pcm_frames_f32(ma_dr_flac* pFlac, ma_uint64 framesToRead, float* pBufferOut)
-{
-    ma_uint64 framesRead;
-    ma_uint32 unusedBitsPerSample;
-    if (pFlac == NULL || framesToRead == 0) {
-        return 0;
-    }
-    if (pBufferOut == NULL) {
-        return ma_dr_flac__seek_forward_by_pcm_frames(pFlac, framesToRead);
-    }
-    MA_DR_FLAC_ASSERT(pFlac->bitsPerSample <= 32);
-    unusedBitsPerSample = 32 - pFlac->bitsPerSample;
-    framesRead = 0;
-    while (framesToRead > 0) {
-        if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
-            if (!ma_dr_flac__read_and_decode_next_flac_frame(pFlac)) {
-                break;
-            }
-        } else {
-            unsigned int channelCount = ma_dr_flac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
-            ma_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
-            ma_uint64 frameCountThisIteration = framesToRead;
-            if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
-                frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
-            }
-            if (channelCount == 2) {
-                const ma_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
-                const ma_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
-                switch (pFlac->currentFLACFrame.header.channelAssignment)
-                {
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_f32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_f32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
-                    {
-                        ma_dr_flac_read_pcm_frames_f32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                    case MA_DR_FLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
-                    default:
-                    {
-                        ma_dr_flac_read_pcm_frames_f32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
-                    } break;
-                }
-            } else {
-                ma_uint64 i;
-                for (i = 0; i < frameCountThisIteration; ++i) {
-                    unsigned int j;
-                    for (j = 0; j < channelCount; ++j) {
-                        ma_int32 sampleS32 = (ma_int32)((ma_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
-                        pBufferOut[(i*channelCount)+j] = (float)(sampleS32 / 2147483648.0);
-                    }
-                }
-            }
-            framesRead                += frameCountThisIteration;
-            pBufferOut                += frameCountThisIteration * channelCount;
-            framesToRead              -= frameCountThisIteration;
-            pFlac->currentPCMFrame    += frameCountThisIteration;
-            pFlac->currentFLACFrame.pcmFramesRemaining -= (unsigned int)frameCountThisIteration;
-        }
-    }
-    return framesRead;
-}
-MA_API ma_bool32 ma_dr_flac_seek_to_pcm_frame(ma_dr_flac* pFlac, ma_uint64 pcmFrameIndex)
-{
-    if (pFlac == NULL) {
-        return MA_FALSE;
-    }
-    if (pFlac->currentPCMFrame == pcmFrameIndex) {
-        return MA_TRUE;
-    }
-    if (pFlac->firstFLACFramePosInBytes == 0) {
-        return MA_FALSE;
-    }
-    if (pcmFrameIndex == 0) {
-        pFlac->currentPCMFrame = 0;
-        return ma_dr_flac__seek_to_first_frame(pFlac);
-    } else {
-        ma_bool32 wasSuccessful = MA_FALSE;
-        ma_uint64 originalPCMFrame = pFlac->currentPCMFrame;
-        if (pcmFrameIndex > pFlac->totalPCMFrameCount) {
-            pcmFrameIndex = pFlac->totalPCMFrameCount;
-        }
-        if (pcmFrameIndex > pFlac->currentPCMFrame) {
-            ma_uint32 offset = (ma_uint32)(pcmFrameIndex - pFlac->currentPCMFrame);
-            if (pFlac->currentFLACFrame.pcmFramesRemaining >  offset) {
-                pFlac->currentFLACFrame.pcmFramesRemaining -= offset;
-                pFlac->currentPCMFrame = pcmFrameIndex;
-                return MA_TRUE;
-            }
-        } else {
-            ma_uint32 offsetAbs = (ma_uint32)(pFlac->currentPCMFrame - pcmFrameIndex);
-            ma_uint32 currentFLACFramePCMFrameCount = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
-            ma_uint32 currentFLACFramePCMFramesConsumed = currentFLACFramePCMFrameCount - pFlac->currentFLACFrame.pcmFramesRemaining;
-            if (currentFLACFramePCMFramesConsumed > offsetAbs) {
-                pFlac->currentFLACFrame.pcmFramesRemaining += offsetAbs;
-                pFlac->currentPCMFrame = pcmFrameIndex;
-                return MA_TRUE;
-            }
-        }
-#ifndef MA_DR_FLAC_NO_OGG
-        if (pFlac->container == ma_dr_flac_container_ogg)
-        {
-            wasSuccessful = ma_dr_flac_ogg__seek_to_pcm_frame(pFlac, pcmFrameIndex);
-        }
-        else
-#endif
-        {
-            if (!pFlac->_noSeekTableSeek) {
-                wasSuccessful = ma_dr_flac__seek_to_pcm_frame__seek_table(pFlac, pcmFrameIndex);
-            }
-#if !defined(MA_DR_FLAC_NO_CRC)
-            if (!wasSuccessful && !pFlac->_noBinarySearchSeek && pFlac->totalPCMFrameCount > 0) {
-                wasSuccessful = ma_dr_flac__seek_to_pcm_frame__binary_search(pFlac, pcmFrameIndex);
-            }
-#endif
-            if (!wasSuccessful && !pFlac->_noBruteForceSeek) {
-                wasSuccessful = ma_dr_flac__seek_to_pcm_frame__brute_force(pFlac, pcmFrameIndex);
-            }
-        }
-        if (wasSuccessful) {
-            pFlac->currentPCMFrame = pcmFrameIndex;
-        } else {
-            if (ma_dr_flac_seek_to_pcm_frame(pFlac, originalPCMFrame) == MA_FALSE) {
-                ma_dr_flac_seek_to_pcm_frame(pFlac, 0);
-            }
-        }
-        return wasSuccessful;
-    }
-}
-#define MA_DR_FLAC_DEFINE_FULL_READ_AND_CLOSE(extension, type) \
-static type* ma_dr_flac__full_read_and_close_ ## extension (ma_dr_flac* pFlac, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalPCMFrameCountOut)\
-{                                                                                                                                                                   \
-    type* pSampleData = NULL;                                                                                                                                       \
-    ma_uint64 totalPCMFrameCount;                                                                                                                               \
-                                                                                                                                                                    \
-    MA_DR_FLAC_ASSERT(pFlac != NULL);                                                                                                                                   \
-                                                                                                                                                                    \
-    totalPCMFrameCount = pFlac->totalPCMFrameCount;                                                                                                                 \
-                                                                                                                                                                    \
-    if (totalPCMFrameCount == 0) {                                                                                                                                  \
-        type buffer[4096];                                                                                                                                          \
-        ma_uint64 pcmFramesRead;                                                                                                                                \
-        size_t sampleDataBufferSize = sizeof(buffer);                                                                                                               \
-                                                                                                                                                                    \
-        pSampleData = (type*)ma_dr_flac__malloc_from_callbacks(sampleDataBufferSize, &pFlac->allocationCallbacks);                                                      \
-        if (pSampleData == NULL) {                                                                                                                                  \
-            goto on_error;                                                                                                                                          \
-        }                                                                                                                                                           \
-                                                                                                                                                                    \
-        while ((pcmFramesRead = (ma_uint64)ma_dr_flac_read_pcm_frames_##extension(pFlac, sizeof(buffer)/sizeof(buffer[0])/pFlac->channels, buffer)) > 0) {          \
-            if (((totalPCMFrameCount + pcmFramesRead) * pFlac->channels * sizeof(type)) > sampleDataBufferSize) {                                                   \
-                type* pNewSampleData;                                                                                                                               \
-                size_t newSampleDataBufferSize;                                                                                                                     \
-                                                                                                                                                                    \
-                newSampleDataBufferSize = sampleDataBufferSize * 2;                                                                                                 \
-                pNewSampleData = (type*)ma_dr_flac__realloc_from_callbacks(pSampleData, newSampleDataBufferSize, sampleDataBufferSize, &pFlac->allocationCallbacks);    \
-                if (pNewSampleData == NULL) {                                                                                                                       \
-                    ma_dr_flac__free_from_callbacks(pSampleData, &pFlac->allocationCallbacks);                                                                          \
-                    goto on_error;                                                                                                                                  \
-                }                                                                                                                                                   \
-                                                                                                                                                                    \
-                sampleDataBufferSize = newSampleDataBufferSize;                                                                                                     \
-                pSampleData = pNewSampleData;                                                                                                                       \
-            }                                                                                                                                                       \
-                                                                                                                                                                    \
-            MA_DR_FLAC_COPY_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), buffer, (size_t)(pcmFramesRead*pFlac->channels*sizeof(type)));                   \
-            totalPCMFrameCount += pcmFramesRead;                                                                                                                    \
-        }                                                                                                                                                           \
-                                                                                                                                                                    \
-                                                                                                                         \
-        MA_DR_FLAC_ZERO_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), (size_t)(sampleDataBufferSize - totalPCMFrameCount*pFlac->channels*sizeof(type)));   \
-    } else {                                                                                                                                                        \
-        ma_uint64 dataSize = totalPCMFrameCount*pFlac->channels*sizeof(type);                                                                                   \
-        if (dataSize > (ma_uint64)MA_SIZE_MAX) {                                                                                                            \
-            goto on_error;                                                                                                        \
-        }                                                                                                                                                           \
-                                                                                                                                                                    \
-        pSampleData = (type*)ma_dr_flac__malloc_from_callbacks((size_t)dataSize, &pFlac->allocationCallbacks);               \
-        if (pSampleData == NULL) {                                                                                                                                  \
-            goto on_error;                                                                                                                                          \
-        }                                                                                                                                                           \
-                                                                                                                                                                    \
-        totalPCMFrameCount = ma_dr_flac_read_pcm_frames_##extension(pFlac, pFlac->totalPCMFrameCount, pSampleData);                                                     \
-    }                                                                                                                                                               \
-                                                                                                                                                                    \
-    if (sampleRateOut) *sampleRateOut = pFlac->sampleRate;                                                                                                          \
-    if (channelsOut) *channelsOut = pFlac->channels;                                                                                                                \
-    if (totalPCMFrameCountOut) *totalPCMFrameCountOut = totalPCMFrameCount;                                                                                         \
-                                                                                                                                                                    \
-    ma_dr_flac_close(pFlac);                                                                                                                                            \
-    return pSampleData;                                                                                                                                             \
-                                                                                                                                                                    \
-on_error:                                                                                                                                                           \
-    ma_dr_flac_close(pFlac);                                                                                                                                            \
-    return NULL;                                                                                                                                                    \
-}
-MA_DR_FLAC_DEFINE_FULL_READ_AND_CLOSE(s32, ma_int32)
-MA_DR_FLAC_DEFINE_FULL_READ_AND_CLOSE(s16, ma_int16)
-MA_DR_FLAC_DEFINE_FULL_READ_AND_CLOSE(f32, float)
-MA_API ma_int32* ma_dr_flac_open_and_read_pcm_frames_s32(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalPCMFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalPCMFrameCountOut) {
-        *totalPCMFrameCountOut = 0;
-    }
-    pFlac = ma_dr_flac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_s32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
-}
-MA_API ma_int16* ma_dr_flac_open_and_read_pcm_frames_s16(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalPCMFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalPCMFrameCountOut) {
-        *totalPCMFrameCountOut = 0;
-    }
-    pFlac = ma_dr_flac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_s16(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
-}
-MA_API float* ma_dr_flac_open_and_read_pcm_frames_f32(ma_dr_flac_read_proc onRead, ma_dr_flac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, ma_uint64* totalPCMFrameCountOut, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalPCMFrameCountOut) {
-        *totalPCMFrameCountOut = 0;
-    }
-    pFlac = ma_dr_flac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_f32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
-}
-#ifndef MA_DR_FLAC_NO_STDIO
-MA_API ma_int32* ma_dr_flac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (sampleRate) {
-        *sampleRate = 0;
-    }
-    if (channels) {
-        *channels = 0;
-    }
-    if (totalPCMFrameCount) {
-        *totalPCMFrameCount = 0;
-    }
-    pFlac = ma_dr_flac_open_file(filename, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
-}
-MA_API ma_int16* ma_dr_flac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (sampleRate) {
-        *sampleRate = 0;
-    }
-    if (channels) {
-        *channels = 0;
-    }
-    if (totalPCMFrameCount) {
-        *totalPCMFrameCount = 0;
-    }
-    pFlac = ma_dr_flac_open_file(filename, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
-}
-MA_API float* ma_dr_flac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (sampleRate) {
-        *sampleRate = 0;
-    }
-    if (channels) {
-        *channels = 0;
-    }
-    if (totalPCMFrameCount) {
-        *totalPCMFrameCount = 0;
-    }
-    pFlac = ma_dr_flac_open_file(filename, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
-}
-#endif
-MA_API ma_int32* ma_dr_flac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (sampleRate) {
-        *sampleRate = 0;
-    }
-    if (channels) {
-        *channels = 0;
-    }
-    if (totalPCMFrameCount) {
-        *totalPCMFrameCount = 0;
-    }
-    pFlac = ma_dr_flac_open_memory(data, dataSize, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
-}
-MA_API ma_int16* ma_dr_flac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (sampleRate) {
-        *sampleRate = 0;
-    }
-    if (channels) {
-        *channels = 0;
-    }
-    if (totalPCMFrameCount) {
-        *totalPCMFrameCount = 0;
-    }
-    pFlac = ma_dr_flac_open_memory(data, dataSize, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
-}
-MA_API float* ma_dr_flac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, ma_uint64* totalPCMFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_flac* pFlac;
-    if (sampleRate) {
-        *sampleRate = 0;
-    }
-    if (channels) {
-        *channels = 0;
-    }
-    if (totalPCMFrameCount) {
-        *totalPCMFrameCount = 0;
-    }
-    pFlac = ma_dr_flac_open_memory(data, dataSize, pAllocationCallbacks);
-    if (pFlac == NULL) {
-        return NULL;
-    }
-    return ma_dr_flac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
-}
-MA_API void ma_dr_flac_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        ma_dr_flac__free_from_callbacks(p, pAllocationCallbacks);
-    } else {
-        ma_dr_flac__free_default(p, NULL);
-    }
-}
-MA_API void ma_dr_flac_init_vorbis_comment_iterator(ma_dr_flac_vorbis_comment_iterator* pIter, ma_uint32 commentCount, const void* pComments)
-{
-    if (pIter == NULL) {
-        return;
-    }
-    pIter->countRemaining = commentCount;
-    pIter->pRunningData   = (const char*)pComments;
-}
-MA_API const char* ma_dr_flac_next_vorbis_comment(ma_dr_flac_vorbis_comment_iterator* pIter, ma_uint32* pCommentLengthOut)
-{
-    ma_int32 length;
-    const char* pComment;
-    if (pCommentLengthOut) {
-        *pCommentLengthOut = 0;
-    }
-    if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
-        return NULL;
-    }
-    length = ma_dr_flac__le2host_32_ptr_unaligned(pIter->pRunningData);
-    pIter->pRunningData += 4;
-    pComment = pIter->pRunningData;
-    pIter->pRunningData += length;
-    pIter->countRemaining -= 1;
-    if (pCommentLengthOut) {
-        *pCommentLengthOut = length;
-    }
-    return pComment;
-}
-MA_API void ma_dr_flac_init_cuesheet_track_iterator(ma_dr_flac_cuesheet_track_iterator* pIter, ma_uint32 trackCount, const void* pTrackData)
-{
-    if (pIter == NULL) {
-        return;
-    }
-    pIter->countRemaining = trackCount;
-    pIter->pRunningData   = (const char*)pTrackData;
-}
-MA_API ma_bool32 ma_dr_flac_next_cuesheet_track(ma_dr_flac_cuesheet_track_iterator* pIter, ma_dr_flac_cuesheet_track* pCuesheetTrack)
-{
-    ma_dr_flac_cuesheet_track cuesheetTrack;
-    const char* pRunningData;
-    ma_uint64 offsetHi;
-    ma_uint64 offsetLo;
-    if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
-        return MA_FALSE;
-    }
-    pRunningData = pIter->pRunningData;
-    offsetHi                   = ma_dr_flac__be2host_32(*(const ma_uint32*)pRunningData); pRunningData += 4;
-    offsetLo                   = ma_dr_flac__be2host_32(*(const ma_uint32*)pRunningData); pRunningData += 4;
-    cuesheetTrack.offset       = offsetLo | (offsetHi << 32);
-    cuesheetTrack.trackNumber  = pRunningData[0];                                         pRunningData += 1;
-    MA_DR_FLAC_COPY_MEMORY(cuesheetTrack.ISRC, pRunningData, sizeof(cuesheetTrack.ISRC));     pRunningData += 12;
-    cuesheetTrack.isAudio      = (pRunningData[0] & 0x80) != 0;
-    cuesheetTrack.preEmphasis  = (pRunningData[0] & 0x40) != 0;                           pRunningData += 14;
-    cuesheetTrack.indexCount   = pRunningData[0];                                         pRunningData += 1;
-    cuesheetTrack.pIndexPoints = (const ma_dr_flac_cuesheet_track_index*)pRunningData;        pRunningData += cuesheetTrack.indexCount * sizeof(ma_dr_flac_cuesheet_track_index);
-    pIter->pRunningData = pRunningData;
-    pIter->countRemaining -= 1;
-    if (pCuesheetTrack) {
-        *pCuesheetTrack = cuesheetTrack;
-    }
-    return MA_TRUE;
-}
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-    #pragma GCC diagnostic pop
-#endif
-#endif
-/* dr_flac_c end */
-#endif  /* MA_DR_FLAC_IMPLEMENTATION */
-#endif  /* MA_NO_FLAC */
-
-#if !defined(MA_NO_MP3) && !defined(MA_NO_DECODING)
-#if !defined(MA_DR_MP3_IMPLEMENTATION) && !defined(MA_DR_MP3_IMPLEMENTATION) /* For backwards compatibility. Will be removed in version 0.11 for cleanliness. */
-/* dr_mp3_c begin */
-#ifndef ma_dr_mp3_c
-#define ma_dr_mp3_c
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-MA_API void ma_dr_mp3_version(ma_uint32* pMajor, ma_uint32* pMinor, ma_uint32* pRevision)
-{
-    if (pMajor) {
-        *pMajor = MA_DR_MP3_VERSION_MAJOR;
-    }
-    if (pMinor) {
-        *pMinor = MA_DR_MP3_VERSION_MINOR;
-    }
-    if (pRevision) {
-        *pRevision = MA_DR_MP3_VERSION_REVISION;
-    }
-}
-MA_API const char* ma_dr_mp3_version_string(void)
-{
-    return MA_DR_MP3_VERSION_STRING;
-}
-#if defined(__TINYC__)
-#define MA_DR_MP3_NO_SIMD
-#endif
-#define MA_DR_MP3_OFFSET_PTR(p, offset) ((void*)((ma_uint8*)(p) + (offset)))
-#define MA_DR_MP3_MAX_FREE_FORMAT_FRAME_SIZE  2304
-#ifndef MA_DR_MP3_MAX_FRAME_SYNC_MATCHES
-#define MA_DR_MP3_MAX_FRAME_SYNC_MATCHES      10
-#endif
-#define MA_DR_MP3_MAX_L3_FRAME_PAYLOAD_BYTES  MA_DR_MP3_MAX_FREE_FORMAT_FRAME_SIZE
-#define MA_DR_MP3_MAX_BITRESERVOIR_BYTES      511
-#define MA_DR_MP3_SHORT_BLOCK_TYPE            2
-#define MA_DR_MP3_STOP_BLOCK_TYPE             3
-#define MA_DR_MP3_MODE_MONO                   3
-#define MA_DR_MP3_MODE_JOINT_STEREO           1
-#define MA_DR_MP3_HDR_SIZE                    4
-#define MA_DR_MP3_HDR_IS_MONO(h)              (((h[3]) & 0xC0) == 0xC0)
-#define MA_DR_MP3_HDR_IS_MS_STEREO(h)         (((h[3]) & 0xE0) == 0x60)
-#define MA_DR_MP3_HDR_IS_FREE_FORMAT(h)       (((h[2]) & 0xF0) == 0)
-#define MA_DR_MP3_HDR_IS_CRC(h)               (!((h[1]) & 1))
-#define MA_DR_MP3_HDR_TEST_PADDING(h)         ((h[2]) & 0x2)
-#define MA_DR_MP3_HDR_TEST_MPEG1(h)           ((h[1]) & 0x8)
-#define MA_DR_MP3_HDR_TEST_NOT_MPEG25(h)      ((h[1]) & 0x10)
-#define MA_DR_MP3_HDR_TEST_I_STEREO(h)        ((h[3]) & 0x10)
-#define MA_DR_MP3_HDR_TEST_MS_STEREO(h)       ((h[3]) & 0x20)
-#define MA_DR_MP3_HDR_GET_STEREO_MODE(h)      (((h[3]) >> 6) & 3)
-#define MA_DR_MP3_HDR_GET_STEREO_MODE_EXT(h)  (((h[3]) >> 4) & 3)
-#define MA_DR_MP3_HDR_GET_LAYER(h)            (((h[1]) >> 1) & 3)
-#define MA_DR_MP3_HDR_GET_BITRATE(h)          ((h[2]) >> 4)
-#define MA_DR_MP3_HDR_GET_SAMPLE_RATE(h)      (((h[2]) >> 2) & 3)
-#define MA_DR_MP3_HDR_GET_MY_SAMPLE_RATE(h)   (MA_DR_MP3_HDR_GET_SAMPLE_RATE(h) + (((h[1] >> 3) & 1) + ((h[1] >> 4) & 1))*3)
-#define MA_DR_MP3_HDR_IS_FRAME_576(h)         ((h[1] & 14) == 2)
-#define MA_DR_MP3_HDR_IS_LAYER_1(h)           ((h[1] & 6) == 6)
-#define MA_DR_MP3_BITS_DEQUANTIZER_OUT        -1
-#define MA_DR_MP3_MAX_SCF                     (255 + MA_DR_MP3_BITS_DEQUANTIZER_OUT*4 - 210)
-#define MA_DR_MP3_MAX_SCFI                    ((MA_DR_MP3_MAX_SCF + 3) & ~3)
-#define MA_DR_MP3_MIN(a, b)           ((a) > (b) ? (b) : (a))
-#define MA_DR_MP3_MAX(a, b)           ((a) < (b) ? (b) : (a))
-#if !defined(MA_DR_MP3_NO_SIMD)
-#if !defined(MA_DR_MP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64))
-#define MA_DR_MP3_ONLY_SIMD
-#endif
-#if ((defined(_MSC_VER) && _MSC_VER >= 1400) && defined(_M_X64)) || ((defined(__i386) || defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)))
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-#include <emmintrin.h>
-#define MA_DR_MP3_HAVE_SSE 1
-#define MA_DR_MP3_HAVE_SIMD 1
-#define MA_DR_MP3_VSTORE _mm_storeu_ps
-#define MA_DR_MP3_VLD _mm_loadu_ps
-#define MA_DR_MP3_VSET _mm_set1_ps
-#define MA_DR_MP3_VADD _mm_add_ps
-#define MA_DR_MP3_VSUB _mm_sub_ps
-#define MA_DR_MP3_VMUL _mm_mul_ps
-#define MA_DR_MP3_VMAC(a, x, y) _mm_add_ps(a, _mm_mul_ps(x, y))
-#define MA_DR_MP3_VMSB(a, x, y) _mm_sub_ps(a, _mm_mul_ps(x, y))
-#define MA_DR_MP3_VMUL_S(x, s)  _mm_mul_ps(x, _mm_set1_ps(s))
-#define MA_DR_MP3_VREV(x) _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3))
-typedef __m128 ma_dr_mp3_f4;
-#if defined(_MSC_VER) || defined(MA_DR_MP3_ONLY_SIMD)
-#define ma_dr_mp3_cpuid __cpuid
-#else
-static __inline__ __attribute__((always_inline)) void ma_dr_mp3_cpuid(int CPUInfo[], const int InfoType)
-{
-#if defined(__PIC__)
-    __asm__ __volatile__(
-#if defined(__x86_64__)
-        "push %%rbx\n"
-        "cpuid\n"
-        "xchgl %%ebx, %1\n"
-        "pop  %%rbx\n"
-#else
-        "xchgl %%ebx, %1\n"
-        "cpuid\n"
-        "xchgl %%ebx, %1\n"
-#endif
-        : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
-        : "a" (InfoType));
-#else
-    __asm__ __volatile__(
-        "cpuid"
-        : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
-        : "a" (InfoType));
-#endif
-}
-#endif
-static int ma_dr_mp3_have_simd(void)
-{
-#ifdef MA_DR_MP3_ONLY_SIMD
-    return 1;
-#else
-    static int g_have_simd;
-    int CPUInfo[4];
-#ifdef MINIMP3_TEST
-    static int g_counter;
-    if (g_counter++ > 100)
-        return 0;
-#endif
-    if (g_have_simd)
-        goto end;
-    ma_dr_mp3_cpuid(CPUInfo, 0);
-    if (CPUInfo[0] > 0)
-    {
-        ma_dr_mp3_cpuid(CPUInfo, 1);
-        g_have_simd = (CPUInfo[3] & (1 << 26)) + 1;
-        return g_have_simd - 1;
-    }
-end:
-    return g_have_simd - 1;
-#endif
-}
-#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)
-#include <arm_neon.h>
-#define MA_DR_MP3_HAVE_SSE 0
-#define MA_DR_MP3_HAVE_SIMD 1
-#define MA_DR_MP3_VSTORE vst1q_f32
-#define MA_DR_MP3_VLD vld1q_f32
-#define MA_DR_MP3_VSET vmovq_n_f32
-#define MA_DR_MP3_VADD vaddq_f32
-#define MA_DR_MP3_VSUB vsubq_f32
-#define MA_DR_MP3_VMUL vmulq_f32
-#define MA_DR_MP3_VMAC(a, x, y) vmlaq_f32(a, x, y)
-#define MA_DR_MP3_VMSB(a, x, y) vmlsq_f32(a, x, y)
-#define MA_DR_MP3_VMUL_S(x, s)  vmulq_f32(x, vmovq_n_f32(s))
-#define MA_DR_MP3_VREV(x) vcombine_f32(vget_high_f32(vrev64q_f32(x)), vget_low_f32(vrev64q_f32(x)))
-typedef float32x4_t ma_dr_mp3_f4;
-static int ma_dr_mp3_have_simd(void)
-{
-    return 1;
-}
-#else
-#define MA_DR_MP3_HAVE_SSE 0
-#define MA_DR_MP3_HAVE_SIMD 0
-#ifdef MA_DR_MP3_ONLY_SIMD
-#error MA_DR_MP3_ONLY_SIMD used, but SSE/NEON not enabled
-#endif
-#endif
-#else
-#define MA_DR_MP3_HAVE_SIMD 0
-#endif
-#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64) && !defined(__ARM_ARCH_6M__)
-#define MA_DR_MP3_HAVE_ARMV6 1
-static __inline__ __attribute__((always_inline)) ma_int32 ma_dr_mp3_clip_int16_arm(ma_int32 a)
-{
-    ma_int32 x = 0;
-    __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a));
-    return x;
-}
-#else
-#define MA_DR_MP3_HAVE_ARMV6 0
-#endif
-#ifndef MA_DR_MP3_ASSERT
-#include <assert.h>
-#define MA_DR_MP3_ASSERT(expression) assert(expression)
-#endif
-#ifndef MA_DR_MP3_COPY_MEMORY
-#define MA_DR_MP3_COPY_MEMORY(dst, src, sz) memcpy((dst), (src), (sz))
-#endif
-#ifndef MA_DR_MP3_MOVE_MEMORY
-#define MA_DR_MP3_MOVE_MEMORY(dst, src, sz) memmove((dst), (src), (sz))
-#endif
-#ifndef MA_DR_MP3_ZERO_MEMORY
-#define MA_DR_MP3_ZERO_MEMORY(p, sz) memset((p), 0, (sz))
-#endif
-#define MA_DR_MP3_ZERO_OBJECT(p) MA_DR_MP3_ZERO_MEMORY((p), sizeof(*(p)))
-#ifndef MA_DR_MP3_MALLOC
-#define MA_DR_MP3_MALLOC(sz) malloc((sz))
-#endif
-#ifndef MA_DR_MP3_REALLOC
-#define MA_DR_MP3_REALLOC(p, sz) realloc((p), (sz))
-#endif
-#ifndef MA_DR_MP3_FREE
-#define MA_DR_MP3_FREE(p) free((p))
-#endif
-typedef struct
-{
-    const ma_uint8 *buf;
-    int pos, limit;
-} ma_dr_mp3_bs;
-typedef struct
-{
-    float scf[3*64];
-    ma_uint8 total_bands, stereo_bands, bitalloc[64], scfcod[64];
-} ma_dr_mp3_L12_scale_info;
-typedef struct
-{
-    ma_uint8 tab_offset, code_tab_width, band_count;
-} ma_dr_mp3_L12_subband_alloc;
-typedef struct
-{
-    const ma_uint8 *sfbtab;
-    ma_uint16 part_23_length, big_values, scalefac_compress;
-    ma_uint8 global_gain, block_type, mixed_block_flag, n_long_sfb, n_short_sfb;
-    ma_uint8 table_select[3], region_count[3], subblock_gain[3];
-    ma_uint8 preflag, scalefac_scale, count1_table, scfsi;
-} ma_dr_mp3_L3_gr_info;
-typedef struct
-{
-    ma_dr_mp3_bs bs;
-    ma_uint8 maindata[MA_DR_MP3_MAX_BITRESERVOIR_BYTES + MA_DR_MP3_MAX_L3_FRAME_PAYLOAD_BYTES];
-    ma_dr_mp3_L3_gr_info gr_info[4];
-    float grbuf[2][576], scf[40], syn[18 + 15][2*32];
-    ma_uint8 ist_pos[2][39];
-} ma_dr_mp3dec_scratch;
-static void ma_dr_mp3_bs_init(ma_dr_mp3_bs *bs, const ma_uint8 *data, int bytes)
-{
-    bs->buf   = data;
-    bs->pos   = 0;
-    bs->limit = bytes*8;
-}
-static ma_uint32 ma_dr_mp3_bs_get_bits(ma_dr_mp3_bs *bs, int n)
-{
-    ma_uint32 next, cache = 0, s = bs->pos & 7;
-    int shl = n + s;
-    const ma_uint8 *p = bs->buf + (bs->pos >> 3);
-    if ((bs->pos += n) > bs->limit)
-        return 0;
-    next = *p++ & (255 >> s);
-    while ((shl -= 8) > 0)
-    {
-        cache |= next << shl;
-        next = *p++;
-    }
-    return cache | (next >> -shl);
-}
-static int ma_dr_mp3_hdr_valid(const ma_uint8 *h)
-{
-    return h[0] == 0xff &&
-        ((h[1] & 0xF0) == 0xf0 || (h[1] & 0xFE) == 0xe2) &&
-        (MA_DR_MP3_HDR_GET_LAYER(h) != 0) &&
-        (MA_DR_MP3_HDR_GET_BITRATE(h) != 15) &&
-        (MA_DR_MP3_HDR_GET_SAMPLE_RATE(h) != 3);
-}
-static int ma_dr_mp3_hdr_compare(const ma_uint8 *h1, const ma_uint8 *h2)
-{
-    return ma_dr_mp3_hdr_valid(h2) &&
-        ((h1[1] ^ h2[1]) & 0xFE) == 0 &&
-        ((h1[2] ^ h2[2]) & 0x0C) == 0 &&
-        !(MA_DR_MP3_HDR_IS_FREE_FORMAT(h1) ^ MA_DR_MP3_HDR_IS_FREE_FORMAT(h2));
-}
-static unsigned ma_dr_mp3_hdr_bitrate_kbps(const ma_uint8 *h)
-{
-    static const ma_uint8 halfrate[2][3][15] = {
-        { { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,16,24,28,32,40,48,56,64,72,80,88,96,112,128 } },
-        { { 0,16,20,24,28,32,40,48,56,64,80,96,112,128,160 }, { 0,16,24,28,32,40,48,56,64,80,96,112,128,160,192 }, { 0,16,32,48,64,80,96,112,128,144,160,176,192,208,224 } },
-    };
-    return 2*halfrate[!!MA_DR_MP3_HDR_TEST_MPEG1(h)][MA_DR_MP3_HDR_GET_LAYER(h) - 1][MA_DR_MP3_HDR_GET_BITRATE(h)];
-}
-static unsigned ma_dr_mp3_hdr_sample_rate_hz(const ma_uint8 *h)
-{
-    static const unsigned g_hz[3] = { 44100, 48000, 32000 };
-    return g_hz[MA_DR_MP3_HDR_GET_SAMPLE_RATE(h)] >> (int)!MA_DR_MP3_HDR_TEST_MPEG1(h) >> (int)!MA_DR_MP3_HDR_TEST_NOT_MPEG25(h);
-}
-static unsigned ma_dr_mp3_hdr_frame_samples(const ma_uint8 *h)
-{
-    return MA_DR_MP3_HDR_IS_LAYER_1(h) ? 384 : (1152 >> (int)MA_DR_MP3_HDR_IS_FRAME_576(h));
-}
-static int ma_dr_mp3_hdr_frame_bytes(const ma_uint8 *h, int free_format_size)
-{
-    int frame_bytes = ma_dr_mp3_hdr_frame_samples(h)*ma_dr_mp3_hdr_bitrate_kbps(h)*125/ma_dr_mp3_hdr_sample_rate_hz(h);
-    if (MA_DR_MP3_HDR_IS_LAYER_1(h))
-    {
-        frame_bytes &= ~3;
-    }
-    return frame_bytes ? frame_bytes : free_format_size;
-}
-static int ma_dr_mp3_hdr_padding(const ma_uint8 *h)
-{
-    return MA_DR_MP3_HDR_TEST_PADDING(h) ? (MA_DR_MP3_HDR_IS_LAYER_1(h) ? 4 : 1) : 0;
-}
-#ifndef MA_DR_MP3_ONLY_MP3
-static const ma_dr_mp3_L12_subband_alloc *ma_dr_mp3_L12_subband_alloc_table(const ma_uint8 *hdr, ma_dr_mp3_L12_scale_info *sci)
-{
-    const ma_dr_mp3_L12_subband_alloc *alloc;
-    int mode = MA_DR_MP3_HDR_GET_STEREO_MODE(hdr);
-    int nbands, stereo_bands = (mode == MA_DR_MP3_MODE_MONO) ? 0 : (mode == MA_DR_MP3_MODE_JOINT_STEREO) ? (MA_DR_MP3_HDR_GET_STEREO_MODE_EXT(hdr) << 2) + 4 : 32;
-    if (MA_DR_MP3_HDR_IS_LAYER_1(hdr))
-    {
-        static const ma_dr_mp3_L12_subband_alloc g_alloc_L1[] = { { 76, 4, 32 } };
-        alloc = g_alloc_L1;
-        nbands = 32;
-    } else if (!MA_DR_MP3_HDR_TEST_MPEG1(hdr))
-    {
-        static const ma_dr_mp3_L12_subband_alloc g_alloc_L2M2[] = { { 60, 4, 4 }, { 44, 3, 7 }, { 44, 2, 19 } };
-        alloc = g_alloc_L2M2;
-        nbands = 30;
-    } else
-    {
-        static const ma_dr_mp3_L12_subband_alloc g_alloc_L2M1[] = { { 0, 4, 3 }, { 16, 4, 8 }, { 32, 3, 12 }, { 40, 2, 7 } };
-        int sample_rate_idx = MA_DR_MP3_HDR_GET_SAMPLE_RATE(hdr);
-        unsigned kbps = ma_dr_mp3_hdr_bitrate_kbps(hdr) >> (int)(mode != MA_DR_MP3_MODE_MONO);
-        if (!kbps)
-        {
-            kbps = 192;
-        }
-        alloc = g_alloc_L2M1;
-        nbands = 27;
-        if (kbps < 56)
-        {
-            static const ma_dr_mp3_L12_subband_alloc g_alloc_L2M1_lowrate[] = { { 44, 4, 2 }, { 44, 3, 10 } };
-            alloc = g_alloc_L2M1_lowrate;
-            nbands = sample_rate_idx == 2 ? 12 : 8;
-        } else if (kbps >= 96 && sample_rate_idx != 1)
-        {
-            nbands = 30;
-        }
-    }
-    sci->total_bands = (ma_uint8)nbands;
-    sci->stereo_bands = (ma_uint8)MA_DR_MP3_MIN(stereo_bands, nbands);
-    return alloc;
-}
-static void ma_dr_mp3_L12_read_scalefactors(ma_dr_mp3_bs *bs, ma_uint8 *pba, ma_uint8 *scfcod, int bands, float *scf)
-{
-    static const float g_deq_L12[18*3] = {
-#define MA_DR_MP3_DQ(x) 9.53674316e-07f/x, 7.56931807e-07f/x, 6.00777173e-07f/x
-        MA_DR_MP3_DQ(3),MA_DR_MP3_DQ(7),MA_DR_MP3_DQ(15),MA_DR_MP3_DQ(31),MA_DR_MP3_DQ(63),MA_DR_MP3_DQ(127),MA_DR_MP3_DQ(255),MA_DR_MP3_DQ(511),MA_DR_MP3_DQ(1023),MA_DR_MP3_DQ(2047),MA_DR_MP3_DQ(4095),MA_DR_MP3_DQ(8191),MA_DR_MP3_DQ(16383),MA_DR_MP3_DQ(32767),MA_DR_MP3_DQ(65535),MA_DR_MP3_DQ(3),MA_DR_MP3_DQ(5),MA_DR_MP3_DQ(9)
-    };
-    int i, m;
-    for (i = 0; i < bands; i++)
-    {
-        float s = 0;
-        int ba = *pba++;
-        int mask = ba ? 4 + ((19 >> scfcod[i]) & 3) : 0;
-        for (m = 4; m; m >>= 1)
-        {
-            if (mask & m)
-            {
-                int b = ma_dr_mp3_bs_get_bits(bs, 6);
-                s = g_deq_L12[ba*3 - 6 + b % 3]*(int)(1 << 21 >> b/3);
-            }
-            *scf++ = s;
-        }
-    }
-}
-static void ma_dr_mp3_L12_read_scale_info(const ma_uint8 *hdr, ma_dr_mp3_bs *bs, ma_dr_mp3_L12_scale_info *sci)
-{
-    static const ma_uint8 g_bitalloc_code_tab[] = {
-        0,17, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16,
-        0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,16,
-        0,17,18, 3,19,4,5,16,
-        0,17,18,16,
-        0,17,18,19, 4,5,6, 7,8, 9,10,11,12,13,14,15,
-        0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,14,
-        0, 2, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16
-    };
-    const ma_dr_mp3_L12_subband_alloc *subband_alloc = ma_dr_mp3_L12_subband_alloc_table(hdr, sci);
-    int i, k = 0, ba_bits = 0;
-    const ma_uint8 *ba_code_tab = g_bitalloc_code_tab;
-    for (i = 0; i < sci->total_bands; i++)
-    {
-        ma_uint8 ba;
-        if (i == k)
-        {
-            k += subband_alloc->band_count;
-            ba_bits = subband_alloc->code_tab_width;
-            ba_code_tab = g_bitalloc_code_tab + subband_alloc->tab_offset;
-            subband_alloc++;
-        }
-        ba = ba_code_tab[ma_dr_mp3_bs_get_bits(bs, ba_bits)];
-        sci->bitalloc[2*i] = ba;
-        if (i < sci->stereo_bands)
-        {
-            ba = ba_code_tab[ma_dr_mp3_bs_get_bits(bs, ba_bits)];
-        }
-        sci->bitalloc[2*i + 1] = sci->stereo_bands ? ba : 0;
-    }
-    for (i = 0; i < 2*sci->total_bands; i++)
-    {
-        sci->scfcod[i] = (ma_uint8)(sci->bitalloc[i] ? MA_DR_MP3_HDR_IS_LAYER_1(hdr) ? 2 : ma_dr_mp3_bs_get_bits(bs, 2) : 6);
-    }
-    ma_dr_mp3_L12_read_scalefactors(bs, sci->bitalloc, sci->scfcod, sci->total_bands*2, sci->scf);
-    for (i = sci->stereo_bands; i < sci->total_bands; i++)
-    {
-        sci->bitalloc[2*i + 1] = 0;
-    }
-}
-static int ma_dr_mp3_L12_dequantize_granule(float *grbuf, ma_dr_mp3_bs *bs, ma_dr_mp3_L12_scale_info *sci, int group_size)
-{
-    int i, j, k, choff = 576;
-    for (j = 0; j < 4; j++)
-    {
-        float *dst = grbuf + group_size*j;
-        for (i = 0; i < 2*sci->total_bands; i++)
-        {
-            int ba = sci->bitalloc[i];
-            if (ba != 0)
-            {
-                if (ba < 17)
-                {
-                    int half = (1 << (ba - 1)) - 1;
-                    for (k = 0; k < group_size; k++)
-                    {
-                        dst[k] = (float)((int)ma_dr_mp3_bs_get_bits(bs, ba) - half);
-                    }
-                } else
-                {
-                    unsigned mod = (2 << (ba - 17)) + 1;
-                    unsigned code = ma_dr_mp3_bs_get_bits(bs, mod + 2 - (mod >> 3));
-                    for (k = 0; k < group_size; k++, code /= mod)
-                    {
-                        dst[k] = (float)((int)(code % mod - mod/2));
-                    }
-                }
-            }
-            dst += choff;
-            choff = 18 - choff;
-        }
-    }
-    return group_size*4;
-}
-static void ma_dr_mp3_L12_apply_scf_384(ma_dr_mp3_L12_scale_info *sci, const float *scf, float *dst)
-{
-    int i, k;
-    MA_DR_MP3_COPY_MEMORY(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float));
-    for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6)
-    {
-        for (k = 0; k < 12; k++)
-        {
-            dst[k + 0]   *= scf[0];
-            dst[k + 576] *= scf[3];
-        }
-    }
-}
-#endif
-static int ma_dr_mp3_L3_read_side_info(ma_dr_mp3_bs *bs, ma_dr_mp3_L3_gr_info *gr, const ma_uint8 *hdr)
-{
-    static const ma_uint8 g_scf_long[8][23] = {
-        { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
-        { 12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2,0 },
-        { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
-        { 6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36,0 },
-        { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
-        { 4,4,4,4,4,4,6,6,8,8,10,12,16,20,24,28,34,42,50,54,76,158,0 },
-        { 4,4,4,4,4,4,6,6,6,8,10,12,16,18,22,28,34,40,46,54,54,192,0 },
-        { 4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102,26,0 }
-    };
-    static const ma_uint8 g_scf_short[8][40] = {
-        { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
-        { 8,8,8,8,8,8,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 },
-        { 4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 },
-        { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 },
-        { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
-        { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 },
-        { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 },
-        { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 }
-    };
-    static const ma_uint8 g_scf_mixed[8][40] = {
-        { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
-        { 12,12,12,4,4,4,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 },
-        { 6,6,6,6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 },
-        { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 },
-        { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
-        { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 },
-        { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 },
-        { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 }
-    };
-    unsigned tables, scfsi = 0;
-    int main_data_begin, part_23_sum = 0;
-    int gr_count = MA_DR_MP3_HDR_IS_MONO(hdr) ? 1 : 2;
-    int sr_idx = MA_DR_MP3_HDR_GET_MY_SAMPLE_RATE(hdr); sr_idx -= (sr_idx != 0);
-    if (MA_DR_MP3_HDR_TEST_MPEG1(hdr))
-    {
-        gr_count *= 2;
-        main_data_begin = ma_dr_mp3_bs_get_bits(bs, 9);
-        scfsi = ma_dr_mp3_bs_get_bits(bs, 7 + gr_count);
-    } else
-    {
-        main_data_begin = ma_dr_mp3_bs_get_bits(bs, 8 + gr_count) >> gr_count;
-    }
-    do
-    {
-        if (MA_DR_MP3_HDR_IS_MONO(hdr))
-        {
-            scfsi <<= 4;
-        }
-        gr->part_23_length = (ma_uint16)ma_dr_mp3_bs_get_bits(bs, 12);
-        part_23_sum += gr->part_23_length;
-        gr->big_values = (ma_uint16)ma_dr_mp3_bs_get_bits(bs,  9);
-        if (gr->big_values > 288)
-        {
-            return -1;
-        }
-        gr->global_gain = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 8);
-        gr->scalefac_compress = (ma_uint16)ma_dr_mp3_bs_get_bits(bs, MA_DR_MP3_HDR_TEST_MPEG1(hdr) ? 4 : 9);
-        gr->sfbtab = g_scf_long[sr_idx];
-        gr->n_long_sfb  = 22;
-        gr->n_short_sfb = 0;
-        if (ma_dr_mp3_bs_get_bits(bs, 1))
-        {
-            gr->block_type = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 2);
-            if (!gr->block_type)
-            {
-                return -1;
-            }
-            gr->mixed_block_flag = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 1);
-            gr->region_count[0] = 7;
-            gr->region_count[1] = 255;
-            if (gr->block_type == MA_DR_MP3_SHORT_BLOCK_TYPE)
-            {
-                scfsi &= 0x0F0F;
-                if (!gr->mixed_block_flag)
-                {
-                    gr->region_count[0] = 8;
-                    gr->sfbtab = g_scf_short[sr_idx];
-                    gr->n_long_sfb = 0;
-                    gr->n_short_sfb = 39;
-                } else
-                {
-                    gr->sfbtab = g_scf_mixed[sr_idx];
-                    gr->n_long_sfb = MA_DR_MP3_HDR_TEST_MPEG1(hdr) ? 8 : 6;
-                    gr->n_short_sfb = 30;
-                }
-            }
-            tables = ma_dr_mp3_bs_get_bits(bs, 10);
-            tables <<= 5;
-            gr->subblock_gain[0] = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 3);
-            gr->subblock_gain[1] = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 3);
-            gr->subblock_gain[2] = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 3);
-        } else
-        {
-            gr->block_type = 0;
-            gr->mixed_block_flag = 0;
-            tables = ma_dr_mp3_bs_get_bits(bs, 15);
-            gr->region_count[0] = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 4);
-            gr->region_count[1] = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 3);
-            gr->region_count[2] = 255;
-        }
-        gr->table_select[0] = (ma_uint8)(tables >> 10);
-        gr->table_select[1] = (ma_uint8)((tables >> 5) & 31);
-        gr->table_select[2] = (ma_uint8)((tables) & 31);
-        gr->preflag = (ma_uint8)(MA_DR_MP3_HDR_TEST_MPEG1(hdr) ? ma_dr_mp3_bs_get_bits(bs, 1) : (gr->scalefac_compress >= 500));
-        gr->scalefac_scale = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 1);
-        gr->count1_table = (ma_uint8)ma_dr_mp3_bs_get_bits(bs, 1);
-        gr->scfsi = (ma_uint8)((scfsi >> 12) & 15);
-        scfsi <<= 4;
-        gr++;
-    } while(--gr_count);
-    if (part_23_sum + bs->pos > bs->limit + main_data_begin*8)
-    {
-        return -1;
-    }
-    return main_data_begin;
-}
-static void ma_dr_mp3_L3_read_scalefactors(ma_uint8 *scf, ma_uint8 *ist_pos, const ma_uint8 *scf_size, const ma_uint8 *scf_count, ma_dr_mp3_bs *bitbuf, int scfsi)
-{
-    int i, k;
-    for (i = 0; i < 4 && scf_count[i]; i++, scfsi *= 2)
-    {
-        int cnt = scf_count[i];
-        if (scfsi & 8)
-        {
-            MA_DR_MP3_COPY_MEMORY(scf, ist_pos, cnt);
-        } else
-        {
-            int bits = scf_size[i];
-            if (!bits)
-            {
-                MA_DR_MP3_ZERO_MEMORY(scf, cnt);
-                MA_DR_MP3_ZERO_MEMORY(ist_pos, cnt);
-            } else
-            {
-                int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1;
-                for (k = 0; k < cnt; k++)
-                {
-                    int s = ma_dr_mp3_bs_get_bits(bitbuf, bits);
-                    ist_pos[k] = (ma_uint8)(s == max_scf ? -1 : s);
-                    scf[k] = (ma_uint8)s;
-                }
-            }
-        }
-        ist_pos += cnt;
-        scf += cnt;
-    }
-    scf[0] = scf[1] = scf[2] = 0;
-}
-static float ma_dr_mp3_L3_ldexp_q2(float y, int exp_q2)
-{
-    static const float g_expfrac[4] = { 9.31322575e-10f,7.83145814e-10f,6.58544508e-10f,5.53767716e-10f };
-    int e;
-    do
-    {
-        e = MA_DR_MP3_MIN(30*4, exp_q2);
-        y *= g_expfrac[e & 3]*(1 << 30 >> (e >> 2));
-    } while ((exp_q2 -= e) > 0);
-    return y;
-}
-static void ma_dr_mp3_L3_decode_scalefactors(const ma_uint8 *hdr, ma_uint8 *ist_pos, ma_dr_mp3_bs *bs, const ma_dr_mp3_L3_gr_info *gr, float *scf, int ch)
-{
-    static const ma_uint8 g_scf_partitions[3][28] = {
-        { 6,5,5, 5,6,5,5,5,6,5, 7,3,11,10,0,0, 7, 7, 7,0, 6, 6,6,3, 8, 8,5,0 },
-        { 8,9,6,12,6,9,9,9,6,9,12,6,15,18,0,0, 6,15,12,0, 6,12,9,6, 6,18,9,0 },
-        { 9,9,6,12,9,9,9,9,9,9,12,6,18,18,0,0,12,12,12,0,12, 9,9,6,15,12,9,0 }
-    };
-    const ma_uint8 *scf_partition = g_scf_partitions[!!gr->n_short_sfb + !gr->n_long_sfb];
-    ma_uint8 scf_size[4], iscf[40];
-    int i, scf_shift = gr->scalefac_scale + 1, gain_exp, scfsi = gr->scfsi;
-    float gain;
-    if (MA_DR_MP3_HDR_TEST_MPEG1(hdr))
-    {
-        static const ma_uint8 g_scfc_decode[16] = { 0,1,2,3, 12,5,6,7, 9,10,11,13, 14,15,18,19 };
-        int part = g_scfc_decode[gr->scalefac_compress];
-        scf_size[1] = scf_size[0] = (ma_uint8)(part >> 2);
-        scf_size[3] = scf_size[2] = (ma_uint8)(part & 3);
-    } else
-    {
-        static const ma_uint8 g_mod[6*4] = { 5,5,4,4,5,5,4,1,4,3,1,1,5,6,6,1,4,4,4,1,4,3,1,1 };
-        int k, modprod, sfc, ist = MA_DR_MP3_HDR_TEST_I_STEREO(hdr) && ch;
-        sfc = gr->scalefac_compress >> ist;
-        for (k = ist*3*4; sfc >= 0; sfc -= modprod, k += 4)
-        {
-            for (modprod = 1, i = 3; i >= 0; i--)
-            {
-                scf_size[i] = (ma_uint8)(sfc / modprod % g_mod[k + i]);
-                modprod *= g_mod[k + i];
-            }
-        }
-        scf_partition += k;
-        scfsi = -16;
-    }
-    ma_dr_mp3_L3_read_scalefactors(iscf, ist_pos, scf_size, scf_partition, bs, scfsi);
-    if (gr->n_short_sfb)
-    {
-        int sh = 3 - scf_shift;
-        for (i = 0; i < gr->n_short_sfb; i += 3)
-        {
-            iscf[gr->n_long_sfb + i + 0] = (ma_uint8)(iscf[gr->n_long_sfb + i + 0] + (gr->subblock_gain[0] << sh));
-            iscf[gr->n_long_sfb + i + 1] = (ma_uint8)(iscf[gr->n_long_sfb + i + 1] + (gr->subblock_gain[1] << sh));
-            iscf[gr->n_long_sfb + i + 2] = (ma_uint8)(iscf[gr->n_long_sfb + i + 2] + (gr->subblock_gain[2] << sh));
-        }
-    } else if (gr->preflag)
-    {
-        static const ma_uint8 g_preamp[10] = { 1,1,1,1,2,2,3,3,3,2 };
-        for (i = 0; i < 10; i++)
-        {
-            iscf[11 + i] = (ma_uint8)(iscf[11 + i] + g_preamp[i]);
-        }
-    }
-    gain_exp = gr->global_gain + MA_DR_MP3_BITS_DEQUANTIZER_OUT*4 - 210 - (MA_DR_MP3_HDR_IS_MS_STEREO(hdr) ? 2 : 0);
-    gain = ma_dr_mp3_L3_ldexp_q2(1 << (MA_DR_MP3_MAX_SCFI/4),  MA_DR_MP3_MAX_SCFI - gain_exp);
-    for (i = 0; i < (int)(gr->n_long_sfb + gr->n_short_sfb); i++)
-    {
-        scf[i] = ma_dr_mp3_L3_ldexp_q2(gain, iscf[i] << scf_shift);
-    }
-}
-static const float g_ma_dr_mp3_pow43[129 + 16] = {
-    0,-1,-2.519842f,-4.326749f,-6.349604f,-8.549880f,-10.902724f,-13.390518f,-16.000000f,-18.720754f,-21.544347f,-24.463781f,-27.473142f,-30.567351f,-33.741992f,-36.993181f,
-    0,1,2.519842f,4.326749f,6.349604f,8.549880f,10.902724f,13.390518f,16.000000f,18.720754f,21.544347f,24.463781f,27.473142f,30.567351f,33.741992f,36.993181f,40.317474f,43.711787f,47.173345f,50.699631f,54.288352f,57.937408f,61.644865f,65.408941f,69.227979f,73.100443f,77.024898f,81.000000f,85.024491f,89.097188f,93.216975f,97.382800f,101.593667f,105.848633f,110.146801f,114.487321f,118.869381f,123.292209f,127.755065f,132.257246f,136.798076f,141.376907f,145.993119f,150.646117f,155.335327f,160.060199f,164.820202f,169.614826f,174.443577f,179.305980f,184.201575f,189.129918f,194.090580f,199.083145f,204.107210f,209.162385f,214.248292f,219.364564f,224.510845f,229.686789f,234.892058f,240.126328f,245.389280f,250.680604f,256.000000f,261.347174f,266.721841f,272.123723f,277.552547f,283.008049f,288.489971f,293.998060f,299.532071f,305.091761f,310.676898f,316.287249f,321.922592f,327.582707f,333.267377f,338.976394f,344.709550f,350.466646f,356.247482f,362.051866f,367.879608f,373.730522f,379.604427f,385.501143f,391.420496f,397.362314f,403.326427f,409.312672f,415.320884f,421.350905f,427.402579f,433.475750f,439.570269f,445.685987f,451.822757f,457.980436f,464.158883f,470.357960f,476.577530f,482.817459f,489.077615f,495.357868f,501.658090f,507.978156f,514.317941f,520.677324f,527.056184f,533.454404f,539.871867f,546.308458f,552.764065f,559.238575f,565.731879f,572.243870f,578.774440f,585.323483f,591.890898f,598.476581f,605.080431f,611.702349f,618.342238f,625.000000f,631.675540f,638.368763f,645.079578f
-};
-static float ma_dr_mp3_L3_pow_43(int x)
-{
-    float frac;
-    int sign, mult = 256;
-    if (x < 129)
-    {
-        return g_ma_dr_mp3_pow43[16 + x];
-    }
-    if (x < 1024)
-    {
-        mult = 16;
-        x <<= 3;
-    }
-    sign = 2*x & 64;
-    frac = (float)((x & 63) - sign) / ((x & ~63) + sign);
-    return g_ma_dr_mp3_pow43[16 + ((x + sign) >> 6)]*(1.f + frac*((4.f/3) + frac*(2.f/9)))*mult;
-}
-static void ma_dr_mp3_L3_huffman(float *dst, ma_dr_mp3_bs *bs, const ma_dr_mp3_L3_gr_info *gr_info, const float *scf, int layer3gr_limit)
-{
-    static const ma_int16 tabs[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        785,785,785,785,784,784,784,784,513,513,513,513,513,513,513,513,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,
-        -255,1313,1298,1282,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,290,288,
-        -255,1313,1298,1282,769,769,769,769,529,529,529,529,529,529,529,529,528,528,528,528,528,528,528,528,512,512,512,512,512,512,512,512,290,288,
-        -253,-318,-351,-367,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,819,818,547,547,275,275,275,275,561,560,515,546,289,274,288,258,
-        -254,-287,1329,1299,1314,1312,1057,1057,1042,1042,1026,1026,784,784,784,784,529,529,529,529,529,529,529,529,769,769,769,769,768,768,768,768,563,560,306,306,291,259,
-        -252,-413,-477,-542,1298,-575,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-383,-399,1107,1092,1106,1061,849,849,789,789,1104,1091,773,773,1076,1075,341,340,325,309,834,804,577,577,532,532,516,516,832,818,803,816,561,561,531,531,515,546,289,289,288,258,
-        -252,-429,-493,-559,1057,1057,1042,1042,529,529,529,529,529,529,529,529,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,-382,1077,-415,1106,1061,1104,849,849,789,789,1091,1076,1029,1075,834,834,597,581,340,340,339,324,804,833,532,532,832,772,818,803,817,787,816,771,290,290,290,290,288,258,
-        -253,-349,-414,-447,-463,1329,1299,-479,1314,1312,1057,1057,1042,1042,1026,1026,785,785,785,785,784,784,784,784,769,769,769,769,768,768,768,768,-319,851,821,-335,836,850,805,849,341,340,325,336,533,533,579,579,564,564,773,832,578,548,563,516,321,276,306,291,304,259,
-        -251,-572,-733,-830,-863,-879,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,1396,1351,1381,1366,1395,1335,1380,-559,1334,1138,1138,1063,1063,1350,1392,1031,1031,1062,1062,1364,1363,1120,1120,1333,1348,881,881,881,881,375,374,359,373,343,358,341,325,791,791,1123,1122,-703,1105,1045,-719,865,865,790,790,774,774,1104,1029,338,293,323,308,-799,-815,833,788,772,818,803,816,322,292,307,320,561,531,515,546,289,274,288,258,
-        -251,-525,-605,-685,-765,-831,-846,1298,1057,1057,1312,1282,785,785,785,785,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,1399,1398,1383,1367,1382,1396,1351,-511,1381,1366,1139,1139,1079,1079,1124,1124,1364,1349,1363,1333,882,882,882,882,807,807,807,807,1094,1094,1136,1136,373,341,535,535,881,775,867,822,774,-591,324,338,-671,849,550,550,866,864,609,609,293,336,534,534,789,835,773,-751,834,804,308,307,833,788,832,772,562,562,547,547,305,275,560,515,290,290,
-        -252,-397,-477,-557,-622,-653,-719,-735,-750,1329,1299,1314,1057,1057,1042,1042,1312,1282,1024,1024,785,785,785,785,784,784,784,784,769,769,769,769,-383,1127,1141,1111,1126,1140,1095,1110,869,869,883,883,1079,1109,882,882,375,374,807,868,838,881,791,-463,867,822,368,263,852,837,836,-543,610,610,550,550,352,336,534,534,865,774,851,821,850,805,593,533,579,564,773,832,578,578,548,548,577,577,307,276,306,291,516,560,259,259,
-        -250,-2107,-2507,-2764,-2909,-2974,-3007,-3023,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-767,-1052,-1213,-1277,-1358,-1405,-1469,-1535,-1550,-1582,-1614,-1647,-1662,-1694,-1726,-1759,-1774,-1807,-1822,-1854,-1886,1565,-1919,-1935,-1951,-1967,1731,1730,1580,1717,-1983,1729,1564,-1999,1548,-2015,-2031,1715,1595,-2047,1714,-2063,1610,-2079,1609,-2095,1323,1323,1457,1457,1307,1307,1712,1547,1641,1700,1699,1594,1685,1625,1442,1442,1322,1322,-780,-973,-910,1279,1278,1277,1262,1276,1261,1275,1215,1260,1229,-959,974,974,989,989,-943,735,478,478,495,463,506,414,-1039,1003,958,1017,927,942,987,957,431,476,1272,1167,1228,-1183,1256,-1199,895,895,941,941,1242,1227,1212,1135,1014,1014,490,489,503,487,910,1013,985,925,863,894,970,955,1012,847,-1343,831,755,755,984,909,428,366,754,559,-1391,752,486,457,924,997,698,698,983,893,740,740,908,877,739,739,667,667,953,938,497,287,271,271,683,606,590,712,726,574,302,302,738,736,481,286,526,725,605,711,636,724,696,651,589,681,666,710,364,467,573,695,466,466,301,465,379,379,709,604,665,679,316,316,634,633,436,436,464,269,424,394,452,332,438,363,347,408,393,448,331,422,362,407,392,421,346,406,391,376,375,359,1441,1306,-2367,1290,-2383,1337,-2399,-2415,1426,1321,-2431,1411,1336,-2447,-2463,-2479,1169,1169,1049,1049,1424,1289,1412,1352,1319,-2495,1154,1154,1064,1064,1153,1153,416,390,360,404,403,389,344,374,373,343,358,372,327,357,342,311,356,326,1395,1394,1137,1137,1047,1047,1365,1392,1287,1379,1334,1364,1349,1378,1318,1363,792,792,792,792,1152,1152,1032,1032,1121,1121,1046,1046,1120,1120,1030,1030,-2895,1106,1061,1104,849,849,789,789,1091,1076,1029,1090,1060,1075,833,833,309,324,532,532,832,772,818,803,561,561,531,560,515,546,289,274,288,258,
-        -250,-1179,-1579,-1836,-1996,-2124,-2253,-2333,-2413,-2477,-2542,-2574,-2607,-2622,-2655,1314,1313,1298,1312,1282,785,785,785,785,1040,1040,1025,1025,768,768,768,768,-766,-798,-830,-862,-895,-911,-927,-943,-959,-975,-991,-1007,-1023,-1039,-1055,-1070,1724,1647,-1103,-1119,1631,1767,1662,1738,1708,1723,-1135,1780,1615,1779,1599,1677,1646,1778,1583,-1151,1777,1567,1737,1692,1765,1722,1707,1630,1751,1661,1764,1614,1736,1676,1763,1750,1645,1598,1721,1691,1762,1706,1582,1761,1566,-1167,1749,1629,767,766,751,765,494,494,735,764,719,749,734,763,447,447,748,718,477,506,431,491,446,476,461,505,415,430,475,445,504,399,460,489,414,503,383,474,429,459,502,502,746,752,488,398,501,473,413,472,486,271,480,270,-1439,-1455,1357,-1471,-1487,-1503,1341,1325,-1519,1489,1463,1403,1309,-1535,1372,1448,1418,1476,1356,1462,1387,-1551,1475,1340,1447,1402,1386,-1567,1068,1068,1474,1461,455,380,468,440,395,425,410,454,364,467,466,464,453,269,409,448,268,432,1371,1473,1432,1417,1308,1460,1355,1446,1459,1431,1083,1083,1401,1416,1458,1445,1067,1067,1370,1457,1051,1051,1291,1430,1385,1444,1354,1415,1400,1443,1082,1082,1173,1113,1186,1066,1185,1050,-1967,1158,1128,1172,1097,1171,1081,-1983,1157,1112,416,266,375,400,1170,1142,1127,1065,793,793,1169,1033,1156,1096,1141,1111,1155,1080,1126,1140,898,898,808,808,897,897,792,792,1095,1152,1032,1125,1110,1139,1079,1124,882,807,838,881,853,791,-2319,867,368,263,822,852,837,866,806,865,-2399,851,352,262,534,534,821,836,594,594,549,549,593,593,533,533,848,773,579,579,564,578,548,563,276,276,577,576,306,291,516,560,305,305,275,259,
-        -251,-892,-2058,-2620,-2828,-2957,-3023,-3039,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,-559,1530,-575,-591,1528,1527,1407,1526,1391,1023,1023,1023,1023,1525,1375,1268,1268,1103,1103,1087,1087,1039,1039,1523,-604,815,815,815,815,510,495,509,479,508,463,507,447,431,505,415,399,-734,-782,1262,-815,1259,1244,-831,1258,1228,-847,-863,1196,-879,1253,987,987,748,-767,493,493,462,477,414,414,686,669,478,446,461,445,474,429,487,458,412,471,1266,1264,1009,1009,799,799,-1019,-1276,-1452,-1581,-1677,-1757,-1821,-1886,-1933,-1997,1257,1257,1483,1468,1512,1422,1497,1406,1467,1496,1421,1510,1134,1134,1225,1225,1466,1451,1374,1405,1252,1252,1358,1480,1164,1164,1251,1251,1238,1238,1389,1465,-1407,1054,1101,-1423,1207,-1439,830,830,1248,1038,1237,1117,1223,1148,1236,1208,411,426,395,410,379,269,1193,1222,1132,1235,1221,1116,976,976,1192,1162,1177,1220,1131,1191,963,963,-1647,961,780,-1663,558,558,994,993,437,408,393,407,829,978,813,797,947,-1743,721,721,377,392,844,950,828,890,706,706,812,859,796,960,948,843,934,874,571,571,-1919,690,555,689,421,346,539,539,944,779,918,873,932,842,903,888,570,570,931,917,674,674,-2575,1562,-2591,1609,-2607,1654,1322,1322,1441,1441,1696,1546,1683,1593,1669,1624,1426,1426,1321,1321,1639,1680,1425,1425,1305,1305,1545,1668,1608,1623,1667,1592,1638,1666,1320,1320,1652,1607,1409,1409,1304,1304,1288,1288,1664,1637,1395,1395,1335,1335,1622,1636,1394,1394,1319,1319,1606,1621,1392,1392,1137,1137,1137,1137,345,390,360,375,404,373,1047,-2751,-2767,-2783,1062,1121,1046,-2799,1077,-2815,1106,1061,789,789,1105,1104,263,355,310,340,325,354,352,262,339,324,1091,1076,1029,1090,1060,1075,833,833,788,788,1088,1028,818,818,803,803,561,561,531,531,816,771,546,546,289,274,288,258,
-        -253,-317,-381,-446,-478,-509,1279,1279,-811,-1179,-1451,-1756,-1900,-2028,-2189,-2253,-2333,-2414,-2445,-2511,-2526,1313,1298,-2559,1041,1041,1040,1040,1025,1025,1024,1024,1022,1007,1021,991,1020,975,1019,959,687,687,1018,1017,671,671,655,655,1016,1015,639,639,758,758,623,623,757,607,756,591,755,575,754,559,543,543,1009,783,-575,-621,-685,-749,496,-590,750,749,734,748,974,989,1003,958,988,973,1002,942,987,957,972,1001,926,986,941,971,956,1000,910,985,925,999,894,970,-1071,-1087,-1102,1390,-1135,1436,1509,1451,1374,-1151,1405,1358,1480,1420,-1167,1507,1494,1389,1342,1465,1435,1450,1326,1505,1310,1493,1373,1479,1404,1492,1464,1419,428,443,472,397,736,526,464,464,486,457,442,471,484,482,1357,1449,1434,1478,1388,1491,1341,1490,1325,1489,1463,1403,1309,1477,1372,1448,1418,1433,1476,1356,1462,1387,-1439,1475,1340,1447,1402,1474,1324,1461,1371,1473,269,448,1432,1417,1308,1460,-1711,1459,-1727,1441,1099,1099,1446,1386,1431,1401,-1743,1289,1083,1083,1160,1160,1458,1445,1067,1067,1370,1457,1307,1430,1129,1129,1098,1098,268,432,267,416,266,400,-1887,1144,1187,1082,1173,1113,1186,1066,1050,1158,1128,1143,1172,1097,1171,1081,420,391,1157,1112,1170,1142,1127,1065,1169,1049,1156,1096,1141,1111,1155,1080,1126,1154,1064,1153,1140,1095,1048,-2159,1125,1110,1137,-2175,823,823,1139,1138,807,807,384,264,368,263,868,838,853,791,867,822,852,837,866,806,865,790,-2319,851,821,836,352,262,850,805,849,-2399,533,533,835,820,336,261,578,548,563,577,532,532,832,772,562,562,547,547,305,275,560,515,290,290,288,258 };
-    static const ma_uint8 tab32[] = { 130,162,193,209,44,28,76,140,9,9,9,9,9,9,9,9,190,254,222,238,126,94,157,157,109,61,173,205};
-    static const ma_uint8 tab33[] = { 252,236,220,204,188,172,156,140,124,108,92,76,60,44,28,12 };
-    static const ma_int16 tabindex[2*16] = { 0,32,64,98,0,132,180,218,292,364,426,538,648,746,0,1126,1460,1460,1460,1460,1460,1460,1460,1460,1842,1842,1842,1842,1842,1842,1842,1842 };
-    static const ma_uint8 g_linbits[] =  { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,6,8,10,13,4,5,6,7,8,9,11,13 };
-#define MA_DR_MP3_PEEK_BITS(n)    (bs_cache >> (32 - (n)))
-#define MA_DR_MP3_FLUSH_BITS(n)   { bs_cache <<= (n); bs_sh += (n); }
-#define MA_DR_MP3_CHECK_BITS      while (bs_sh >= 0) { bs_cache |= (ma_uint32)*bs_next_ptr++ << bs_sh; bs_sh -= 8; }
-#define MA_DR_MP3_BSPOS           ((bs_next_ptr - bs->buf)*8 - 24 + bs_sh)
-    float one = 0.0f;
-    int ireg = 0, big_val_cnt = gr_info->big_values;
-    const ma_uint8 *sfb = gr_info->sfbtab;
-    const ma_uint8 *bs_next_ptr = bs->buf + bs->pos/8;
-    ma_uint32 bs_cache = (((bs_next_ptr[0]*256u + bs_next_ptr[1])*256u + bs_next_ptr[2])*256u + bs_next_ptr[3]) << (bs->pos & 7);
-    int pairs_to_decode, np, bs_sh = (bs->pos & 7) - 8;
-    bs_next_ptr += 4;
-    while (big_val_cnt > 0)
-    {
-        int tab_num = gr_info->table_select[ireg];
-        int sfb_cnt = gr_info->region_count[ireg++];
-        const ma_int16 *codebook = tabs + tabindex[tab_num];
-        int linbits = g_linbits[tab_num];
-        if (linbits)
-        {
-            do
-            {
-                np = *sfb++ / 2;
-                pairs_to_decode = MA_DR_MP3_MIN(big_val_cnt, np);
-                one = *scf++;
-                do
-                {
-                    int j, w = 5;
-                    int leaf = codebook[MA_DR_MP3_PEEK_BITS(w)];
-                    while (leaf < 0)
-                    {
-                        MA_DR_MP3_FLUSH_BITS(w);
-                        w = leaf & 7;
-                        leaf = codebook[MA_DR_MP3_PEEK_BITS(w) - (leaf >> 3)];
-                    }
-                    MA_DR_MP3_FLUSH_BITS(leaf >> 8);
-                    for (j = 0; j < 2; j++, dst++, leaf >>= 4)
-                    {
-                        int lsb = leaf & 0x0F;
-                        if (lsb == 15)
-                        {
-                            lsb += MA_DR_MP3_PEEK_BITS(linbits);
-                            MA_DR_MP3_FLUSH_BITS(linbits);
-                            MA_DR_MP3_CHECK_BITS;
-                            *dst = one*ma_dr_mp3_L3_pow_43(lsb)*((ma_int32)bs_cache < 0 ? -1: 1);
-                        } else
-                        {
-                            *dst = g_ma_dr_mp3_pow43[16 + lsb - 16*(bs_cache >> 31)]*one;
-                        }
-                        MA_DR_MP3_FLUSH_BITS(lsb ? 1 : 0);
-                    }
-                    MA_DR_MP3_CHECK_BITS;
-                } while (--pairs_to_decode);
-            } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0);
-        } else
-        {
-            do
-            {
-                np = *sfb++ / 2;
-                pairs_to_decode = MA_DR_MP3_MIN(big_val_cnt, np);
-                one = *scf++;
-                do
-                {
-                    int j, w = 5;
-                    int leaf = codebook[MA_DR_MP3_PEEK_BITS(w)];
-                    while (leaf < 0)
-                    {
-                        MA_DR_MP3_FLUSH_BITS(w);
-                        w = leaf & 7;
-                        leaf = codebook[MA_DR_MP3_PEEK_BITS(w) - (leaf >> 3)];
-                    }
-                    MA_DR_MP3_FLUSH_BITS(leaf >> 8);
-                    for (j = 0; j < 2; j++, dst++, leaf >>= 4)
-                    {
-                        int lsb = leaf & 0x0F;
-                        *dst = g_ma_dr_mp3_pow43[16 + lsb - 16*(bs_cache >> 31)]*one;
-                        MA_DR_MP3_FLUSH_BITS(lsb ? 1 : 0);
-                    }
-                    MA_DR_MP3_CHECK_BITS;
-                } while (--pairs_to_decode);
-            } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0);
-        }
-    }
-    for (np = 1 - big_val_cnt;; dst += 4)
-    {
-        const ma_uint8 *codebook_count1 = (gr_info->count1_table) ? tab33 : tab32;
-        int leaf = codebook_count1[MA_DR_MP3_PEEK_BITS(4)];
-        if (!(leaf & 8))
-        {
-            leaf = codebook_count1[(leaf >> 3) + (bs_cache << 4 >> (32 - (leaf & 3)))];
-        }
-        MA_DR_MP3_FLUSH_BITS(leaf & 7);
-        if (MA_DR_MP3_BSPOS > layer3gr_limit)
-        {
-            break;
-        }
-#define MA_DR_MP3_RELOAD_SCALEFACTOR  if (!--np) { np = *sfb++/2; if (!np) break; one = *scf++; }
-#define MA_DR_MP3_DEQ_COUNT1(s) if (leaf & (128 >> s)) { dst[s] = ((ma_int32)bs_cache < 0) ? -one : one; MA_DR_MP3_FLUSH_BITS(1) }
-        MA_DR_MP3_RELOAD_SCALEFACTOR;
-        MA_DR_MP3_DEQ_COUNT1(0);
-        MA_DR_MP3_DEQ_COUNT1(1);
-        MA_DR_MP3_RELOAD_SCALEFACTOR;
-        MA_DR_MP3_DEQ_COUNT1(2);
-        MA_DR_MP3_DEQ_COUNT1(3);
-        MA_DR_MP3_CHECK_BITS;
-    }
-    bs->pos = layer3gr_limit;
-}
-static void ma_dr_mp3_L3_midside_stereo(float *left, int n)
-{
-    int i = 0;
-    float *right = left + 576;
-#if MA_DR_MP3_HAVE_SIMD
-    if (ma_dr_mp3_have_simd())
-    {
-        for (; i < n - 3; i += 4)
-        {
-            ma_dr_mp3_f4 vl = MA_DR_MP3_VLD(left + i);
-            ma_dr_mp3_f4 vr = MA_DR_MP3_VLD(right + i);
-            MA_DR_MP3_VSTORE(left + i, MA_DR_MP3_VADD(vl, vr));
-            MA_DR_MP3_VSTORE(right + i, MA_DR_MP3_VSUB(vl, vr));
-        }
-#ifdef __GNUC__
-        if (__builtin_constant_p(n % 4 == 0) && n % 4 == 0)
-            return;
-#endif
-    }
-#endif
-    for (; i < n; i++)
-    {
-        float a = left[i];
-        float b = right[i];
-        left[i] = a + b;
-        right[i] = a - b;
-    }
-}
-static void ma_dr_mp3_L3_intensity_stereo_band(float *left, int n, float kl, float kr)
-{
-    int i;
-    for (i = 0; i < n; i++)
-    {
-        left[i + 576] = left[i]*kr;
-        left[i] = left[i]*kl;
-    }
-}
-static void ma_dr_mp3_L3_stereo_top_band(const float *right, const ma_uint8 *sfb, int nbands, int max_band[3])
-{
-    int i, k;
-    max_band[0] = max_band[1] = max_band[2] = -1;
-    for (i = 0; i < nbands; i++)
-    {
-        for (k = 0; k < sfb[i]; k += 2)
-        {
-            if (right[k] != 0 || right[k + 1] != 0)
-            {
-                max_band[i % 3] = i;
-                break;
-            }
-        }
-        right += sfb[i];
-    }
-}
-static void ma_dr_mp3_L3_stereo_process(float *left, const ma_uint8 *ist_pos, const ma_uint8 *sfb, const ma_uint8 *hdr, int max_band[3], int mpeg2_sh)
-{
-    static const float g_pan[7*2] = { 0,1,0.21132487f,0.78867513f,0.36602540f,0.63397460f,0.5f,0.5f,0.63397460f,0.36602540f,0.78867513f,0.21132487f,1,0 };
-    unsigned i, max_pos = MA_DR_MP3_HDR_TEST_MPEG1(hdr) ? 7 : 64;
-    for (i = 0; sfb[i]; i++)
-    {
-        unsigned ipos = ist_pos[i];
-        if ((int)i > max_band[i % 3] && ipos < max_pos)
-        {
-            float kl, kr, s = MA_DR_MP3_HDR_TEST_MS_STEREO(hdr) ? 1.41421356f : 1;
-            if (MA_DR_MP3_HDR_TEST_MPEG1(hdr))
-            {
-                kl = g_pan[2*ipos];
-                kr = g_pan[2*ipos + 1];
-            } else
-            {
-                kl = 1;
-                kr = ma_dr_mp3_L3_ldexp_q2(1, (ipos + 1) >> 1 << mpeg2_sh);
-                if (ipos & 1)
-                {
-                    kl = kr;
-                    kr = 1;
-                }
-            }
-            ma_dr_mp3_L3_intensity_stereo_band(left, sfb[i], kl*s, kr*s);
-        } else if (MA_DR_MP3_HDR_TEST_MS_STEREO(hdr))
-        {
-            ma_dr_mp3_L3_midside_stereo(left, sfb[i]);
-        }
-        left += sfb[i];
-    }
-}
-static void ma_dr_mp3_L3_intensity_stereo(float *left, ma_uint8 *ist_pos, const ma_dr_mp3_L3_gr_info *gr, const ma_uint8 *hdr)
-{
-    int max_band[3], n_sfb = gr->n_long_sfb + gr->n_short_sfb;
-    int i, max_blocks = gr->n_short_sfb ? 3 : 1;
-    ma_dr_mp3_L3_stereo_top_band(left + 576, gr->sfbtab, n_sfb, max_band);
-    if (gr->n_long_sfb)
-    {
-        max_band[0] = max_band[1] = max_band[2] = MA_DR_MP3_MAX(MA_DR_MP3_MAX(max_band[0], max_band[1]), max_band[2]);
-    }
-    for (i = 0; i < max_blocks; i++)
-    {
-        int default_pos = MA_DR_MP3_HDR_TEST_MPEG1(hdr) ? 3 : 0;
-        int itop = n_sfb - max_blocks + i;
-        int prev = itop - max_blocks;
-        ist_pos[itop] = (ma_uint8)(max_band[i] >= prev ? default_pos : ist_pos[prev]);
-    }
-    ma_dr_mp3_L3_stereo_process(left, ist_pos, gr->sfbtab, hdr, max_band, gr[1].scalefac_compress & 1);
-}
-static void ma_dr_mp3_L3_reorder(float *grbuf, float *scratch, const ma_uint8 *sfb)
-{
-    int i, len;
-    float *src = grbuf, *dst = scratch;
-    for (;0 != (len = *sfb); sfb += 3, src += 2*len)
-    {
-        for (i = 0; i < len; i++, src++)
-        {
-            *dst++ = src[0*len];
-            *dst++ = src[1*len];
-            *dst++ = src[2*len];
-        }
-    }
-    MA_DR_MP3_COPY_MEMORY(grbuf, scratch, (dst - scratch)*sizeof(float));
-}
-static void ma_dr_mp3_L3_antialias(float *grbuf, int nbands)
-{
-    static const float g_aa[2][8] = {
-        {0.85749293f,0.88174200f,0.94962865f,0.98331459f,0.99551782f,0.99916056f,0.99989920f,0.99999316f},
-        {0.51449576f,0.47173197f,0.31337745f,0.18191320f,0.09457419f,0.04096558f,0.01419856f,0.00369997f}
-    };
-    for (; nbands > 0; nbands--, grbuf += 18)
-    {
-        int i = 0;
-#if MA_DR_MP3_HAVE_SIMD
-        if (ma_dr_mp3_have_simd()) for (; i < 8; i += 4)
-        {
-            ma_dr_mp3_f4 vu = MA_DR_MP3_VLD(grbuf + 18 + i);
-            ma_dr_mp3_f4 vd = MA_DR_MP3_VLD(grbuf + 14 - i);
-            ma_dr_mp3_f4 vc0 = MA_DR_MP3_VLD(g_aa[0] + i);
-            ma_dr_mp3_f4 vc1 = MA_DR_MP3_VLD(g_aa[1] + i);
-            vd = MA_DR_MP3_VREV(vd);
-            MA_DR_MP3_VSTORE(grbuf + 18 + i, MA_DR_MP3_VSUB(MA_DR_MP3_VMUL(vu, vc0), MA_DR_MP3_VMUL(vd, vc1)));
-            vd = MA_DR_MP3_VADD(MA_DR_MP3_VMUL(vu, vc1), MA_DR_MP3_VMUL(vd, vc0));
-            MA_DR_MP3_VSTORE(grbuf + 14 - i, MA_DR_MP3_VREV(vd));
-        }
-#endif
-#ifndef MA_DR_MP3_ONLY_SIMD
-        for(; i < 8; i++)
-        {
-            float u = grbuf[18 + i];
-            float d = grbuf[17 - i];
-            grbuf[18 + i] = u*g_aa[0][i] - d*g_aa[1][i];
-            grbuf[17 - i] = u*g_aa[1][i] + d*g_aa[0][i];
-        }
-#endif
-    }
-}
-static void ma_dr_mp3_L3_dct3_9(float *y)
-{
-    float s0, s1, s2, s3, s4, s5, s6, s7, s8, t0, t2, t4;
-    s0 = y[0]; s2 = y[2]; s4 = y[4]; s6 = y[6]; s8 = y[8];
-    t0 = s0 + s6*0.5f;
-    s0 -= s6;
-    t4 = (s4 + s2)*0.93969262f;
-    t2 = (s8 + s2)*0.76604444f;
-    s6 = (s4 - s8)*0.17364818f;
-    s4 += s8 - s2;
-    s2 = s0 - s4*0.5f;
-    y[4] = s4 + s0;
-    s8 = t0 - t2 + s6;
-    s0 = t0 - t4 + t2;
-    s4 = t0 + t4 - s6;
-    s1 = y[1]; s3 = y[3]; s5 = y[5]; s7 = y[7];
-    s3 *= 0.86602540f;
-    t0 = (s5 + s1)*0.98480775f;
-    t4 = (s5 - s7)*0.34202014f;
-    t2 = (s1 + s7)*0.64278761f;
-    s1 = (s1 - s5 - s7)*0.86602540f;
-    s5 = t0 - s3 - t2;
-    s7 = t4 - s3 - t0;
-    s3 = t4 + s3 - t2;
-    y[0] = s4 - s7;
-    y[1] = s2 + s1;
-    y[2] = s0 - s3;
-    y[3] = s8 + s5;
-    y[5] = s8 - s5;
-    y[6] = s0 + s3;
-    y[7] = s2 - s1;
-    y[8] = s4 + s7;
-}
-static void ma_dr_mp3_L3_imdct36(float *grbuf, float *overlap, const float *window, int nbands)
-{
-    int i, j;
-    static const float g_twid9[18] = {
-        0.73727734f,0.79335334f,0.84339145f,0.88701083f,0.92387953f,0.95371695f,0.97629601f,0.99144486f,0.99904822f,0.67559021f,0.60876143f,0.53729961f,0.46174861f,0.38268343f,0.30070580f,0.21643961f,0.13052619f,0.04361938f
-    };
-    for (j = 0; j < nbands; j++, grbuf += 18, overlap += 9)
-    {
-        float co[9], si[9];
-        co[0] = -grbuf[0];
-        si[0] = grbuf[17];
-        for (i = 0; i < 4; i++)
-        {
-            si[8 - 2*i] =   grbuf[4*i + 1] - grbuf[4*i + 2];
-            co[1 + 2*i] =   grbuf[4*i + 1] + grbuf[4*i + 2];
-            si[7 - 2*i] =   grbuf[4*i + 4] - grbuf[4*i + 3];
-            co[2 + 2*i] = -(grbuf[4*i + 3] + grbuf[4*i + 4]);
-        }
-        ma_dr_mp3_L3_dct3_9(co);
-        ma_dr_mp3_L3_dct3_9(si);
-        si[1] = -si[1];
-        si[3] = -si[3];
-        si[5] = -si[5];
-        si[7] = -si[7];
-        i = 0;
-#if MA_DR_MP3_HAVE_SIMD
-        if (ma_dr_mp3_have_simd()) for (; i < 8; i += 4)
-        {
-            ma_dr_mp3_f4 vovl = MA_DR_MP3_VLD(overlap + i);
-            ma_dr_mp3_f4 vc = MA_DR_MP3_VLD(co + i);
-            ma_dr_mp3_f4 vs = MA_DR_MP3_VLD(si + i);
-            ma_dr_mp3_f4 vr0 = MA_DR_MP3_VLD(g_twid9 + i);
-            ma_dr_mp3_f4 vr1 = MA_DR_MP3_VLD(g_twid9 + 9 + i);
-            ma_dr_mp3_f4 vw0 = MA_DR_MP3_VLD(window + i);
-            ma_dr_mp3_f4 vw1 = MA_DR_MP3_VLD(window + 9 + i);
-            ma_dr_mp3_f4 vsum = MA_DR_MP3_VADD(MA_DR_MP3_VMUL(vc, vr1), MA_DR_MP3_VMUL(vs, vr0));
-            MA_DR_MP3_VSTORE(overlap + i, MA_DR_MP3_VSUB(MA_DR_MP3_VMUL(vc, vr0), MA_DR_MP3_VMUL(vs, vr1)));
-            MA_DR_MP3_VSTORE(grbuf + i, MA_DR_MP3_VSUB(MA_DR_MP3_VMUL(vovl, vw0), MA_DR_MP3_VMUL(vsum, vw1)));
-            vsum = MA_DR_MP3_VADD(MA_DR_MP3_VMUL(vovl, vw1), MA_DR_MP3_VMUL(vsum, vw0));
-            MA_DR_MP3_VSTORE(grbuf + 14 - i, MA_DR_MP3_VREV(vsum));
-        }
-#endif
-        for (; i < 9; i++)
-        {
-            float ovl  = overlap[i];
-            float sum  = co[i]*g_twid9[9 + i] + si[i]*g_twid9[0 + i];
-            overlap[i] = co[i]*g_twid9[0 + i] - si[i]*g_twid9[9 + i];
-            grbuf[i]      = ovl*window[0 + i] - sum*window[9 + i];
-            grbuf[17 - i] = ovl*window[9 + i] + sum*window[0 + i];
-        }
-    }
-}
-static void ma_dr_mp3_L3_idct3(float x0, float x1, float x2, float *dst)
-{
-    float m1 = x1*0.86602540f;
-    float a1 = x0 - x2*0.5f;
-    dst[1] = x0 + x2;
-    dst[0] = a1 + m1;
-    dst[2] = a1 - m1;
-}
-static void ma_dr_mp3_L3_imdct12(float *x, float *dst, float *overlap)
-{
-    static const float g_twid3[6] = { 0.79335334f,0.92387953f,0.99144486f, 0.60876143f,0.38268343f,0.13052619f };
-    float co[3], si[3];
-    int i;
-    ma_dr_mp3_L3_idct3(-x[0], x[6] + x[3], x[12] + x[9], co);
-    ma_dr_mp3_L3_idct3(x[15], x[12] - x[9], x[6] - x[3], si);
-    si[1] = -si[1];
-    for (i = 0; i < 3; i++)
-    {
-        float ovl  = overlap[i];
-        float sum  = co[i]*g_twid3[3 + i] + si[i]*g_twid3[0 + i];
-        overlap[i] = co[i]*g_twid3[0 + i] - si[i]*g_twid3[3 + i];
-        dst[i]     = ovl*g_twid3[2 - i] - sum*g_twid3[5 - i];
-        dst[5 - i] = ovl*g_twid3[5 - i] + sum*g_twid3[2 - i];
-    }
-}
-static void ma_dr_mp3_L3_imdct_short(float *grbuf, float *overlap, int nbands)
-{
-    for (;nbands > 0; nbands--, overlap += 9, grbuf += 18)
-    {
-        float tmp[18];
-        MA_DR_MP3_COPY_MEMORY(tmp, grbuf, sizeof(tmp));
-        MA_DR_MP3_COPY_MEMORY(grbuf, overlap, 6*sizeof(float));
-        ma_dr_mp3_L3_imdct12(tmp, grbuf + 6, overlap + 6);
-        ma_dr_mp3_L3_imdct12(tmp + 1, grbuf + 12, overlap + 6);
-        ma_dr_mp3_L3_imdct12(tmp + 2, overlap, overlap + 6);
-    }
-}
-static void ma_dr_mp3_L3_change_sign(float *grbuf)
-{
-    int b, i;
-    for (b = 0, grbuf += 18; b < 32; b += 2, grbuf += 36)
-        for (i = 1; i < 18; i += 2)
-            grbuf[i] = -grbuf[i];
-}
-static void ma_dr_mp3_L3_imdct_gr(float *grbuf, float *overlap, unsigned block_type, unsigned n_long_bands)
-{
-    static const float g_mdct_window[2][18] = {
-        { 0.99904822f,0.99144486f,0.97629601f,0.95371695f,0.92387953f,0.88701083f,0.84339145f,0.79335334f,0.73727734f,0.04361938f,0.13052619f,0.21643961f,0.30070580f,0.38268343f,0.46174861f,0.53729961f,0.60876143f,0.67559021f },
-        { 1,1,1,1,1,1,0.99144486f,0.92387953f,0.79335334f,0,0,0,0,0,0,0.13052619f,0.38268343f,0.60876143f }
-    };
-    if (n_long_bands)
-    {
-        ma_dr_mp3_L3_imdct36(grbuf, overlap, g_mdct_window[0], n_long_bands);
-        grbuf += 18*n_long_bands;
-        overlap += 9*n_long_bands;
-    }
-    if (block_type == MA_DR_MP3_SHORT_BLOCK_TYPE)
-        ma_dr_mp3_L3_imdct_short(grbuf, overlap, 32 - n_long_bands);
-    else
-        ma_dr_mp3_L3_imdct36(grbuf, overlap, g_mdct_window[block_type == MA_DR_MP3_STOP_BLOCK_TYPE], 32 - n_long_bands);
-}
-static void ma_dr_mp3_L3_save_reservoir(ma_dr_mp3dec *h, ma_dr_mp3dec_scratch *s)
-{
-    int pos = (s->bs.pos + 7)/8u;
-    int remains = s->bs.limit/8u - pos;
-    if (remains > MA_DR_MP3_MAX_BITRESERVOIR_BYTES)
-    {
-        pos += remains - MA_DR_MP3_MAX_BITRESERVOIR_BYTES;
-        remains = MA_DR_MP3_MAX_BITRESERVOIR_BYTES;
-    }
-    if (remains > 0)
-    {
-        MA_DR_MP3_MOVE_MEMORY(h->reserv_buf, s->maindata + pos, remains);
-    }
-    h->reserv = remains;
-}
-static int ma_dr_mp3_L3_restore_reservoir(ma_dr_mp3dec *h, ma_dr_mp3_bs *bs, ma_dr_mp3dec_scratch *s, int main_data_begin)
-{
-    int frame_bytes = (bs->limit - bs->pos)/8;
-    int bytes_have = MA_DR_MP3_MIN(h->reserv, main_data_begin);
-    MA_DR_MP3_COPY_MEMORY(s->maindata, h->reserv_buf + MA_DR_MP3_MAX(0, h->reserv - main_data_begin), MA_DR_MP3_MIN(h->reserv, main_data_begin));
-    MA_DR_MP3_COPY_MEMORY(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes);
-    ma_dr_mp3_bs_init(&s->bs, s->maindata, bytes_have + frame_bytes);
-    return h->reserv >= main_data_begin;
-}
-static void ma_dr_mp3_L3_decode(ma_dr_mp3dec *h, ma_dr_mp3dec_scratch *s, ma_dr_mp3_L3_gr_info *gr_info, int nch)
-{
-    int ch;
-    for (ch = 0; ch < nch; ch++)
-    {
-        int layer3gr_limit = s->bs.pos + gr_info[ch].part_23_length;
-        ma_dr_mp3_L3_decode_scalefactors(h->header, s->ist_pos[ch], &s->bs, gr_info + ch, s->scf, ch);
-        ma_dr_mp3_L3_huffman(s->grbuf[ch], &s->bs, gr_info + ch, s->scf, layer3gr_limit);
-    }
-    if (MA_DR_MP3_HDR_TEST_I_STEREO(h->header))
-    {
-        ma_dr_mp3_L3_intensity_stereo(s->grbuf[0], s->ist_pos[1], gr_info, h->header);
-    } else if (MA_DR_MP3_HDR_IS_MS_STEREO(h->header))
-    {
-        ma_dr_mp3_L3_midside_stereo(s->grbuf[0], 576);
-    }
-    for (ch = 0; ch < nch; ch++, gr_info++)
-    {
-        int aa_bands = 31;
-        int n_long_bands = (gr_info->mixed_block_flag ? 2 : 0) << (int)(MA_DR_MP3_HDR_GET_MY_SAMPLE_RATE(h->header) == 2);
-        if (gr_info->n_short_sfb)
-        {
-            aa_bands = n_long_bands - 1;
-            ma_dr_mp3_L3_reorder(s->grbuf[ch] + n_long_bands*18, s->syn[0], gr_info->sfbtab + gr_info->n_long_sfb);
-        }
-        ma_dr_mp3_L3_antialias(s->grbuf[ch], aa_bands);
-        ma_dr_mp3_L3_imdct_gr(s->grbuf[ch], h->mdct_overlap[ch], gr_info->block_type, n_long_bands);
-        ma_dr_mp3_L3_change_sign(s->grbuf[ch]);
-    }
-}
-static void ma_dr_mp3d_DCT_II(float *grbuf, int n)
-{
-    static const float g_sec[24] = {
-        10.19000816f,0.50060302f,0.50241929f,3.40760851f,0.50547093f,0.52249861f,2.05778098f,0.51544732f,0.56694406f,1.48416460f,0.53104258f,0.64682180f,1.16943991f,0.55310392f,0.78815460f,0.97256821f,0.58293498f,1.06067765f,0.83934963f,0.62250412f,1.72244716f,0.74453628f,0.67480832f,5.10114861f
-    };
-    int i, k = 0;
-#if MA_DR_MP3_HAVE_SIMD
-    if (ma_dr_mp3_have_simd()) for (; k < n; k += 4)
-    {
-        ma_dr_mp3_f4 t[4][8], *x;
-        float *y = grbuf + k;
-        for (x = t[0], i = 0; i < 8; i++, x++)
-        {
-            ma_dr_mp3_f4 x0 = MA_DR_MP3_VLD(&y[i*18]);
-            ma_dr_mp3_f4 x1 = MA_DR_MP3_VLD(&y[(15 - i)*18]);
-            ma_dr_mp3_f4 x2 = MA_DR_MP3_VLD(&y[(16 + i)*18]);
-            ma_dr_mp3_f4 x3 = MA_DR_MP3_VLD(&y[(31 - i)*18]);
-            ma_dr_mp3_f4 t0 = MA_DR_MP3_VADD(x0, x3);
-            ma_dr_mp3_f4 t1 = MA_DR_MP3_VADD(x1, x2);
-            ma_dr_mp3_f4 t2 = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(x1, x2), g_sec[3*i + 0]);
-            ma_dr_mp3_f4 t3 = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(x0, x3), g_sec[3*i + 1]);
-            x[0] = MA_DR_MP3_VADD(t0, t1);
-            x[8] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(t0, t1), g_sec[3*i + 2]);
-            x[16] = MA_DR_MP3_VADD(t3, t2);
-            x[24] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(t3, t2), g_sec[3*i + 2]);
-        }
-        for (x = t[0], i = 0; i < 4; i++, x += 8)
-        {
-            ma_dr_mp3_f4 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt;
-            xt = MA_DR_MP3_VSUB(x0, x7); x0 = MA_DR_MP3_VADD(x0, x7);
-            x7 = MA_DR_MP3_VSUB(x1, x6); x1 = MA_DR_MP3_VADD(x1, x6);
-            x6 = MA_DR_MP3_VSUB(x2, x5); x2 = MA_DR_MP3_VADD(x2, x5);
-            x5 = MA_DR_MP3_VSUB(x3, x4); x3 = MA_DR_MP3_VADD(x3, x4);
-            x4 = MA_DR_MP3_VSUB(x0, x3); x0 = MA_DR_MP3_VADD(x0, x3);
-            x3 = MA_DR_MP3_VSUB(x1, x2); x1 = MA_DR_MP3_VADD(x1, x2);
-            x[0] = MA_DR_MP3_VADD(x0, x1);
-            x[4] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(x0, x1), 0.70710677f);
-            x5 = MA_DR_MP3_VADD(x5, x6);
-            x6 = MA_DR_MP3_VMUL_S(MA_DR_MP3_VADD(x6, x7), 0.70710677f);
-            x7 = MA_DR_MP3_VADD(x7, xt);
-            x3 = MA_DR_MP3_VMUL_S(MA_DR_MP3_VADD(x3, x4), 0.70710677f);
-            x5 = MA_DR_MP3_VSUB(x5, MA_DR_MP3_VMUL_S(x7, 0.198912367f));
-            x7 = MA_DR_MP3_VADD(x7, MA_DR_MP3_VMUL_S(x5, 0.382683432f));
-            x5 = MA_DR_MP3_VSUB(x5, MA_DR_MP3_VMUL_S(x7, 0.198912367f));
-            x0 = MA_DR_MP3_VSUB(xt, x6); xt = MA_DR_MP3_VADD(xt, x6);
-            x[1] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VADD(xt, x7), 0.50979561f);
-            x[2] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VADD(x4, x3), 0.54119611f);
-            x[3] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(x0, x5), 0.60134488f);
-            x[5] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VADD(x0, x5), 0.89997619f);
-            x[6] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(x4, x3), 1.30656302f);
-            x[7] = MA_DR_MP3_VMUL_S(MA_DR_MP3_VSUB(xt, x7), 2.56291556f);
-        }
-        if (k > n - 3)
-        {
-#if MA_DR_MP3_HAVE_SSE
-#define MA_DR_MP3_VSAVE2(i, v) _mm_storel_pi((__m64 *)(void*)&y[i*18], v)
-#else
-#define MA_DR_MP3_VSAVE2(i, v) vst1_f32((float32_t *)&y[(i)*18],  vget_low_f32(v))
-#endif
-            for (i = 0; i < 7; i++, y += 4*18)
-            {
-                ma_dr_mp3_f4 s = MA_DR_MP3_VADD(t[3][i], t[3][i + 1]);
-                MA_DR_MP3_VSAVE2(0, t[0][i]);
-                MA_DR_MP3_VSAVE2(1, MA_DR_MP3_VADD(t[2][i], s));
-                MA_DR_MP3_VSAVE2(2, MA_DR_MP3_VADD(t[1][i], t[1][i + 1]));
-                MA_DR_MP3_VSAVE2(3, MA_DR_MP3_VADD(t[2][1 + i], s));
-            }
-            MA_DR_MP3_VSAVE2(0, t[0][7]);
-            MA_DR_MP3_VSAVE2(1, MA_DR_MP3_VADD(t[2][7], t[3][7]));
-            MA_DR_MP3_VSAVE2(2, t[1][7]);
-            MA_DR_MP3_VSAVE2(3, t[3][7]);
-        } else
-        {
-#define MA_DR_MP3_VSAVE4(i, v) MA_DR_MP3_VSTORE(&y[(i)*18], v)
-            for (i = 0; i < 7; i++, y += 4*18)
-            {
-                ma_dr_mp3_f4 s = MA_DR_MP3_VADD(t[3][i], t[3][i + 1]);
-                MA_DR_MP3_VSAVE4(0, t[0][i]);
-                MA_DR_MP3_VSAVE4(1, MA_DR_MP3_VADD(t[2][i], s));
-                MA_DR_MP3_VSAVE4(2, MA_DR_MP3_VADD(t[1][i], t[1][i + 1]));
-                MA_DR_MP3_VSAVE4(3, MA_DR_MP3_VADD(t[2][1 + i], s));
-            }
-            MA_DR_MP3_VSAVE4(0, t[0][7]);
-            MA_DR_MP3_VSAVE4(1, MA_DR_MP3_VADD(t[2][7], t[3][7]));
-            MA_DR_MP3_VSAVE4(2, t[1][7]);
-            MA_DR_MP3_VSAVE4(3, t[3][7]);
-        }
-    } else
-#endif
-#ifdef MA_DR_MP3_ONLY_SIMD
-    {}
-#else
-    for (; k < n; k++)
-    {
-        float t[4][8], *x, *y = grbuf + k;
-        for (x = t[0], i = 0; i < 8; i++, x++)
-        {
-            float x0 = y[i*18];
-            float x1 = y[(15 - i)*18];
-            float x2 = y[(16 + i)*18];
-            float x3 = y[(31 - i)*18];
-            float t0 = x0 + x3;
-            float t1 = x1 + x2;
-            float t2 = (x1 - x2)*g_sec[3*i + 0];
-            float t3 = (x0 - x3)*g_sec[3*i + 1];
-            x[0] = t0 + t1;
-            x[8] = (t0 - t1)*g_sec[3*i + 2];
-            x[16] = t3 + t2;
-            x[24] = (t3 - t2)*g_sec[3*i + 2];
-        }
-        for (x = t[0], i = 0; i < 4; i++, x += 8)
-        {
-            float x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt;
-            xt = x0 - x7; x0 += x7;
-            x7 = x1 - x6; x1 += x6;
-            x6 = x2 - x5; x2 += x5;
-            x5 = x3 - x4; x3 += x4;
-            x4 = x0 - x3; x0 += x3;
-            x3 = x1 - x2; x1 += x2;
-            x[0] = x0 + x1;
-            x[4] = (x0 - x1)*0.70710677f;
-            x5 =  x5 + x6;
-            x6 = (x6 + x7)*0.70710677f;
-            x7 =  x7 + xt;
-            x3 = (x3 + x4)*0.70710677f;
-            x5 -= x7*0.198912367f;
-            x7 += x5*0.382683432f;
-            x5 -= x7*0.198912367f;
-            x0 = xt - x6; xt += x6;
-            x[1] = (xt + x7)*0.50979561f;
-            x[2] = (x4 + x3)*0.54119611f;
-            x[3] = (x0 - x5)*0.60134488f;
-            x[5] = (x0 + x5)*0.89997619f;
-            x[6] = (x4 - x3)*1.30656302f;
-            x[7] = (xt - x7)*2.56291556f;
-        }
-        for (i = 0; i < 7; i++, y += 4*18)
-        {
-            y[0*18] = t[0][i];
-            y[1*18] = t[2][i] + t[3][i] + t[3][i + 1];
-            y[2*18] = t[1][i] + t[1][i + 1];
-            y[3*18] = t[2][i + 1] + t[3][i] + t[3][i + 1];
-        }
-        y[0*18] = t[0][7];
-        y[1*18] = t[2][7] + t[3][7];
-        y[2*18] = t[1][7];
-        y[3*18] = t[3][7];
-    }
-#endif
-}
-#ifndef MA_DR_MP3_FLOAT_OUTPUT
-typedef ma_int16 ma_dr_mp3d_sample_t;
-static ma_int16 ma_dr_mp3d_scale_pcm(float sample)
-{
-    ma_int16 s;
-#if MA_DR_MP3_HAVE_ARMV6
-    ma_int32 s32 = (ma_int32)(sample + .5f);
-    s32 -= (s32 < 0);
-    s = (ma_int16)ma_dr_mp3_clip_int16_arm(s32);
-#else
-    if (sample >=  32766.5) return (ma_int16) 32767;
-    if (sample <= -32767.5) return (ma_int16)-32768;
-    s = (ma_int16)(sample + .5f);
-    s -= (s < 0);
-#endif
-    return s;
-}
-#else
-typedef float ma_dr_mp3d_sample_t;
-static float ma_dr_mp3d_scale_pcm(float sample)
-{
-    return sample*(1.f/32768.f);
-}
-#endif
-static void ma_dr_mp3d_synth_pair(ma_dr_mp3d_sample_t *pcm, int nch, const float *z)
-{
-    float a;
-    a  = (z[14*64] - z[    0]) * 29;
-    a += (z[ 1*64] + z[13*64]) * 213;
-    a += (z[12*64] - z[ 2*64]) * 459;
-    a += (z[ 3*64] + z[11*64]) * 2037;
-    a += (z[10*64] - z[ 4*64]) * 5153;
-    a += (z[ 5*64] + z[ 9*64]) * 6574;
-    a += (z[ 8*64] - z[ 6*64]) * 37489;
-    a +=  z[ 7*64]             * 75038;
-    pcm[0] = ma_dr_mp3d_scale_pcm(a);
-    z += 2;
-    a  = z[14*64] * 104;
-    a += z[12*64] * 1567;
-    a += z[10*64] * 9727;
-    a += z[ 8*64] * 64019;
-    a += z[ 6*64] * -9975;
-    a += z[ 4*64] * -45;
-    a += z[ 2*64] * 146;
-    a += z[ 0*64] * -5;
-    pcm[16*nch] = ma_dr_mp3d_scale_pcm(a);
-}
-static void ma_dr_mp3d_synth(float *xl, ma_dr_mp3d_sample_t *dstl, int nch, float *lins)
-{
-    int i;
-    float *xr = xl + 576*(nch - 1);
-    ma_dr_mp3d_sample_t *dstr = dstl + (nch - 1);
-    static const float g_win[] = {
-        -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992,
-        -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856,
-        -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630,
-        -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313,
-        -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908,
-        -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415,
-        -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835,
-        -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169,
-        -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420,
-        -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590,
-        -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679,
-        -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692,
-        -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629,
-        -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494,
-        -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290
-    };
-    float *zlin = lins + 15*64;
-    const float *w = g_win;
-    zlin[4*15]     = xl[18*16];
-    zlin[4*15 + 1] = xr[18*16];
-    zlin[4*15 + 2] = xl[0];
-    zlin[4*15 + 3] = xr[0];
-    zlin[4*31]     = xl[1 + 18*16];
-    zlin[4*31 + 1] = xr[1 + 18*16];
-    zlin[4*31 + 2] = xl[1];
-    zlin[4*31 + 3] = xr[1];
-    ma_dr_mp3d_synth_pair(dstr, nch, lins + 4*15 + 1);
-    ma_dr_mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1);
-    ma_dr_mp3d_synth_pair(dstl, nch, lins + 4*15);
-    ma_dr_mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64);
-#if MA_DR_MP3_HAVE_SIMD
-    if (ma_dr_mp3_have_simd()) for (i = 14; i >= 0; i--)
-    {
-#define MA_DR_MP3_VLOAD(k) ma_dr_mp3_f4 w0 = MA_DR_MP3_VSET(*w++); ma_dr_mp3_f4 w1 = MA_DR_MP3_VSET(*w++); ma_dr_mp3_f4 vz = MA_DR_MP3_VLD(&zlin[4*i - 64*k]); ma_dr_mp3_f4 vy = MA_DR_MP3_VLD(&zlin[4*i - 64*(15 - k)]);
-#define MA_DR_MP3_V0(k) { MA_DR_MP3_VLOAD(k) b =               MA_DR_MP3_VADD(MA_DR_MP3_VMUL(vz, w1), MA_DR_MP3_VMUL(vy, w0)) ; a =               MA_DR_MP3_VSUB(MA_DR_MP3_VMUL(vz, w0), MA_DR_MP3_VMUL(vy, w1));  }
-#define MA_DR_MP3_V1(k) { MA_DR_MP3_VLOAD(k) b = MA_DR_MP3_VADD(b, MA_DR_MP3_VADD(MA_DR_MP3_VMUL(vz, w1), MA_DR_MP3_VMUL(vy, w0))); a = MA_DR_MP3_VADD(a, MA_DR_MP3_VSUB(MA_DR_MP3_VMUL(vz, w0), MA_DR_MP3_VMUL(vy, w1))); }
-#define MA_DR_MP3_V2(k) { MA_DR_MP3_VLOAD(k) b = MA_DR_MP3_VADD(b, MA_DR_MP3_VADD(MA_DR_MP3_VMUL(vz, w1), MA_DR_MP3_VMUL(vy, w0))); a = MA_DR_MP3_VADD(a, MA_DR_MP3_VSUB(MA_DR_MP3_VMUL(vy, w1), MA_DR_MP3_VMUL(vz, w0))); }
-        ma_dr_mp3_f4 a, b;
-        zlin[4*i]     = xl[18*(31 - i)];
-        zlin[4*i + 1] = xr[18*(31 - i)];
-        zlin[4*i + 2] = xl[1 + 18*(31 - i)];
-        zlin[4*i + 3] = xr[1 + 18*(31 - i)];
-        zlin[4*i + 64] = xl[1 + 18*(1 + i)];
-        zlin[4*i + 64 + 1] = xr[1 + 18*(1 + i)];
-        zlin[4*i - 64 + 2] = xl[18*(1 + i)];
-        zlin[4*i - 64 + 3] = xr[18*(1 + i)];
-        MA_DR_MP3_V0(0) MA_DR_MP3_V2(1) MA_DR_MP3_V1(2) MA_DR_MP3_V2(3) MA_DR_MP3_V1(4) MA_DR_MP3_V2(5) MA_DR_MP3_V1(6) MA_DR_MP3_V2(7)
-        {
-#ifndef MA_DR_MP3_FLOAT_OUTPUT
-#if MA_DR_MP3_HAVE_SSE
-            static const ma_dr_mp3_f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
-            static const ma_dr_mp3_f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
-            __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)),
-                                           _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min)));
-            dstr[(15 - i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 1);
-            dstr[(17 + i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 5);
-            dstl[(15 - i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 0);
-            dstl[(17 + i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 4);
-            dstr[(47 - i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 3);
-            dstr[(49 + i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 7);
-            dstl[(47 - i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 2);
-            dstl[(49 + i)*nch] = (ma_int16)_mm_extract_epi16(pcm8, 6);
-#else
-            int16x4_t pcma, pcmb;
-            a = MA_DR_MP3_VADD(a, MA_DR_MP3_VSET(0.5f));
-            b = MA_DR_MP3_VADD(b, MA_DR_MP3_VSET(0.5f));
-            pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, MA_DR_MP3_VSET(0)))));
-            pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, MA_DR_MP3_VSET(0)))));
-            vst1_lane_s16(dstr + (15 - i)*nch, pcma, 1);
-            vst1_lane_s16(dstr + (17 + i)*nch, pcmb, 1);
-            vst1_lane_s16(dstl + (15 - i)*nch, pcma, 0);
-            vst1_lane_s16(dstl + (17 + i)*nch, pcmb, 0);
-            vst1_lane_s16(dstr + (47 - i)*nch, pcma, 3);
-            vst1_lane_s16(dstr + (49 + i)*nch, pcmb, 3);
-            vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2);
-            vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2);
-#endif
-#else
-        #if MA_DR_MP3_HAVE_SSE
-            static const ma_dr_mp3_f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f };
-        #else
-            const ma_dr_mp3_f4 g_scale = vdupq_n_f32(1.0f/32768.0f);
-        #endif
-            a = MA_DR_MP3_VMUL(a, g_scale);
-            b = MA_DR_MP3_VMUL(b, g_scale);
-#if MA_DR_MP3_HAVE_SSE
-            _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
-            _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
-            _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)));
-            _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0)));
-            _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
-            _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3)));
-            _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
-            _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2)));
-#else
-            vst1q_lane_f32(dstr + (15 - i)*nch, a, 1);
-            vst1q_lane_f32(dstr + (17 + i)*nch, b, 1);
-            vst1q_lane_f32(dstl + (15 - i)*nch, a, 0);
-            vst1q_lane_f32(dstl + (17 + i)*nch, b, 0);
-            vst1q_lane_f32(dstr + (47 - i)*nch, a, 3);
-            vst1q_lane_f32(dstr + (49 + i)*nch, b, 3);
-            vst1q_lane_f32(dstl + (47 - i)*nch, a, 2);
-            vst1q_lane_f32(dstl + (49 + i)*nch, b, 2);
-#endif
-#endif
-        }
-    } else
-#endif
-#ifdef MA_DR_MP3_ONLY_SIMD
-    {}
-#else
-    for (i = 14; i >= 0; i--)
-    {
-#define MA_DR_MP3_LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64];
-#define MA_DR_MP3_S0(k) { int j; MA_DR_MP3_LOAD(k); for (j = 0; j < 4; j++) b[j]  = vz[j]*w1 + vy[j]*w0, a[j]  = vz[j]*w0 - vy[j]*w1; }
-#define MA_DR_MP3_S1(k) { int j; MA_DR_MP3_LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; }
-#define MA_DR_MP3_S2(k) { int j; MA_DR_MP3_LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; }
-        float a[4], b[4];
-        zlin[4*i]     = xl[18*(31 - i)];
-        zlin[4*i + 1] = xr[18*(31 - i)];
-        zlin[4*i + 2] = xl[1 + 18*(31 - i)];
-        zlin[4*i + 3] = xr[1 + 18*(31 - i)];
-        zlin[4*(i + 16)]   = xl[1 + 18*(1 + i)];
-        zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)];
-        zlin[4*(i - 16) + 2] = xl[18*(1 + i)];
-        zlin[4*(i - 16) + 3] = xr[18*(1 + i)];
-        MA_DR_MP3_S0(0) MA_DR_MP3_S2(1) MA_DR_MP3_S1(2) MA_DR_MP3_S2(3) MA_DR_MP3_S1(4) MA_DR_MP3_S2(5) MA_DR_MP3_S1(6) MA_DR_MP3_S2(7)
-        dstr[(15 - i)*nch] = ma_dr_mp3d_scale_pcm(a[1]);
-        dstr[(17 + i)*nch] = ma_dr_mp3d_scale_pcm(b[1]);
-        dstl[(15 - i)*nch] = ma_dr_mp3d_scale_pcm(a[0]);
-        dstl[(17 + i)*nch] = ma_dr_mp3d_scale_pcm(b[0]);
-        dstr[(47 - i)*nch] = ma_dr_mp3d_scale_pcm(a[3]);
-        dstr[(49 + i)*nch] = ma_dr_mp3d_scale_pcm(b[3]);
-        dstl[(47 - i)*nch] = ma_dr_mp3d_scale_pcm(a[2]);
-        dstl[(49 + i)*nch] = ma_dr_mp3d_scale_pcm(b[2]);
-    }
-#endif
-}
-static void ma_dr_mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, ma_dr_mp3d_sample_t *pcm, float *lins)
-{
-    int i;
-    for (i = 0; i < nch; i++)
-    {
-        ma_dr_mp3d_DCT_II(grbuf + 576*i, nbands);
-    }
-    MA_DR_MP3_COPY_MEMORY(lins, qmf_state, sizeof(float)*15*64);
-    for (i = 0; i < nbands; i += 2)
-    {
-        ma_dr_mp3d_synth(grbuf + i, pcm + 32*nch*i, nch, lins + i*64);
-    }
-#ifndef MA_DR_MP3_NONSTANDARD_BUT_LOGICAL
-    if (nch == 1)
-    {
-        for (i = 0; i < 15*64; i += 2)
-        {
-            qmf_state[i] = lins[nbands*64 + i];
-        }
-    } else
-#endif
-    {
-        MA_DR_MP3_COPY_MEMORY(qmf_state, lins + nbands*64, sizeof(float)*15*64);
-    }
-}
-static int ma_dr_mp3d_match_frame(const ma_uint8 *hdr, int mp3_bytes, int frame_bytes)
-{
-    int i, nmatch;
-    for (i = 0, nmatch = 0; nmatch < MA_DR_MP3_MAX_FRAME_SYNC_MATCHES; nmatch++)
-    {
-        i += ma_dr_mp3_hdr_frame_bytes(hdr + i, frame_bytes) + ma_dr_mp3_hdr_padding(hdr + i);
-        if (i + MA_DR_MP3_HDR_SIZE > mp3_bytes)
-            return nmatch > 0;
-        if (!ma_dr_mp3_hdr_compare(hdr, hdr + i))
-            return 0;
-    }
-    return 1;
-}
-static int ma_dr_mp3d_find_frame(const ma_uint8 *mp3, int mp3_bytes, int *free_format_bytes, int *ptr_frame_bytes)
-{
-    int i, k;
-    for (i = 0; i < mp3_bytes - MA_DR_MP3_HDR_SIZE; i++, mp3++)
-    {
-        if (ma_dr_mp3_hdr_valid(mp3))
-        {
-            int frame_bytes = ma_dr_mp3_hdr_frame_bytes(mp3, *free_format_bytes);
-            int frame_and_padding = frame_bytes + ma_dr_mp3_hdr_padding(mp3);
-            for (k = MA_DR_MP3_HDR_SIZE; !frame_bytes && k < MA_DR_MP3_MAX_FREE_FORMAT_FRAME_SIZE && i + 2*k < mp3_bytes - MA_DR_MP3_HDR_SIZE; k++)
-            {
-                if (ma_dr_mp3_hdr_compare(mp3, mp3 + k))
-                {
-                    int fb = k - ma_dr_mp3_hdr_padding(mp3);
-                    int nextfb = fb + ma_dr_mp3_hdr_padding(mp3 + k);
-                    if (i + k + nextfb + MA_DR_MP3_HDR_SIZE > mp3_bytes || !ma_dr_mp3_hdr_compare(mp3, mp3 + k + nextfb))
-                        continue;
-                    frame_and_padding = k;
-                    frame_bytes = fb;
-                    *free_format_bytes = fb;
-                }
-            }
-            if ((frame_bytes && i + frame_and_padding <= mp3_bytes &&
-                ma_dr_mp3d_match_frame(mp3, mp3_bytes - i, frame_bytes)) ||
-                (!i && frame_and_padding == mp3_bytes))
-            {
-                *ptr_frame_bytes = frame_and_padding;
-                return i;
-            }
-            *free_format_bytes = 0;
-        }
-    }
-    *ptr_frame_bytes = 0;
-    return mp3_bytes;
-}
-MA_API void ma_dr_mp3dec_init(ma_dr_mp3dec *dec)
-{
-    dec->header[0] = 0;
-}
-MA_API int ma_dr_mp3dec_decode_frame(ma_dr_mp3dec *dec, const ma_uint8 *mp3, int mp3_bytes, void *pcm, ma_dr_mp3dec_frame_info *info)
-{
-    int i = 0, igr, frame_size = 0, success = 1;
-    const ma_uint8 *hdr;
-    ma_dr_mp3_bs bs_frame[1];
-    ma_dr_mp3dec_scratch scratch;
-    if (mp3_bytes > 4 && dec->header[0] == 0xff && ma_dr_mp3_hdr_compare(dec->header, mp3))
-    {
-        frame_size = ma_dr_mp3_hdr_frame_bytes(mp3, dec->free_format_bytes) + ma_dr_mp3_hdr_padding(mp3);
-        if (frame_size != mp3_bytes && (frame_size + MA_DR_MP3_HDR_SIZE > mp3_bytes || !ma_dr_mp3_hdr_compare(mp3, mp3 + frame_size)))
-        {
-            frame_size = 0;
-        }
-    }
-    if (!frame_size)
-    {
-        MA_DR_MP3_ZERO_MEMORY(dec, sizeof(ma_dr_mp3dec));
-        i = ma_dr_mp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size);
-        if (!frame_size || i + frame_size > mp3_bytes)
-        {
-            info->frame_bytes = i;
-            return 0;
-        }
-    }
-    hdr = mp3 + i;
-    MA_DR_MP3_COPY_MEMORY(dec->header, hdr, MA_DR_MP3_HDR_SIZE);
-    info->frame_bytes = i + frame_size;
-    info->channels = MA_DR_MP3_HDR_IS_MONO(hdr) ? 1 : 2;
-    info->hz = ma_dr_mp3_hdr_sample_rate_hz(hdr);
-    info->layer = 4 - MA_DR_MP3_HDR_GET_LAYER(hdr);
-    info->bitrate_kbps = ma_dr_mp3_hdr_bitrate_kbps(hdr);
-    ma_dr_mp3_bs_init(bs_frame, hdr + MA_DR_MP3_HDR_SIZE, frame_size - MA_DR_MP3_HDR_SIZE);
-    if (MA_DR_MP3_HDR_IS_CRC(hdr))
-    {
-        ma_dr_mp3_bs_get_bits(bs_frame, 16);
-    }
-    if (info->layer == 3)
-    {
-        int main_data_begin = ma_dr_mp3_L3_read_side_info(bs_frame, scratch.gr_info, hdr);
-        if (main_data_begin < 0 || bs_frame->pos > bs_frame->limit)
-        {
-            ma_dr_mp3dec_init(dec);
-            return 0;
-        }
-        success = ma_dr_mp3_L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin);
-        if (success && pcm != NULL)
-        {
-            for (igr = 0; igr < (MA_DR_MP3_HDR_TEST_MPEG1(hdr) ? 2 : 1); igr++, pcm = MA_DR_MP3_OFFSET_PTR(pcm, sizeof(ma_dr_mp3d_sample_t)*576*info->channels))
-            {
-                MA_DR_MP3_ZERO_MEMORY(scratch.grbuf[0], 576*2*sizeof(float));
-                ma_dr_mp3_L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels);
-                ma_dr_mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, (ma_dr_mp3d_sample_t*)pcm, scratch.syn[0]);
-            }
-        }
-        ma_dr_mp3_L3_save_reservoir(dec, &scratch);
-    } else
-    {
-#ifdef MA_DR_MP3_ONLY_MP3
-        return 0;
-#else
-        ma_dr_mp3_L12_scale_info sci[1];
-        if (pcm == NULL) {
-            return ma_dr_mp3_hdr_frame_samples(hdr);
-        }
-        ma_dr_mp3_L12_read_scale_info(hdr, bs_frame, sci);
-        MA_DR_MP3_ZERO_MEMORY(scratch.grbuf[0], 576*2*sizeof(float));
-        for (i = 0, igr = 0; igr < 3; igr++)
-        {
-            if (12 == (i += ma_dr_mp3_L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1)))
-            {
-                i = 0;
-                ma_dr_mp3_L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]);
-                ma_dr_mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, (ma_dr_mp3d_sample_t*)pcm, scratch.syn[0]);
-                MA_DR_MP3_ZERO_MEMORY(scratch.grbuf[0], 576*2*sizeof(float));
-                pcm = MA_DR_MP3_OFFSET_PTR(pcm, sizeof(ma_dr_mp3d_sample_t)*384*info->channels);
-            }
-            if (bs_frame->pos > bs_frame->limit)
-            {
-                ma_dr_mp3dec_init(dec);
-                return 0;
-            }
-        }
-#endif
-    }
-    return success*ma_dr_mp3_hdr_frame_samples(dec->header);
-}
-MA_API void ma_dr_mp3dec_f32_to_s16(const float *in, ma_int16 *out, size_t num_samples)
-{
-    size_t i = 0;
-#if MA_DR_MP3_HAVE_SIMD
-    size_t aligned_count = num_samples & ~7;
-    for(; i < aligned_count; i+=8)
-    {
-        ma_dr_mp3_f4 scale = MA_DR_MP3_VSET(32768.0f);
-        ma_dr_mp3_f4 a = MA_DR_MP3_VMUL(MA_DR_MP3_VLD(&in[i  ]), scale);
-        ma_dr_mp3_f4 b = MA_DR_MP3_VMUL(MA_DR_MP3_VLD(&in[i+4]), scale);
-#if MA_DR_MP3_HAVE_SSE
-        ma_dr_mp3_f4 s16max = MA_DR_MP3_VSET( 32767.0f);
-        ma_dr_mp3_f4 s16min = MA_DR_MP3_VSET(-32768.0f);
-        __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, s16max), s16min)),
-                                        _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, s16max), s16min)));
-        out[i  ] = (ma_int16)_mm_extract_epi16(pcm8, 0);
-        out[i+1] = (ma_int16)_mm_extract_epi16(pcm8, 1);
-        out[i+2] = (ma_int16)_mm_extract_epi16(pcm8, 2);
-        out[i+3] = (ma_int16)_mm_extract_epi16(pcm8, 3);
-        out[i+4] = (ma_int16)_mm_extract_epi16(pcm8, 4);
-        out[i+5] = (ma_int16)_mm_extract_epi16(pcm8, 5);
-        out[i+6] = (ma_int16)_mm_extract_epi16(pcm8, 6);
-        out[i+7] = (ma_int16)_mm_extract_epi16(pcm8, 7);
-#else
-        int16x4_t pcma, pcmb;
-        a = MA_DR_MP3_VADD(a, MA_DR_MP3_VSET(0.5f));
-        b = MA_DR_MP3_VADD(b, MA_DR_MP3_VSET(0.5f));
-        pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, MA_DR_MP3_VSET(0)))));
-        pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, MA_DR_MP3_VSET(0)))));
-        vst1_lane_s16(out+i  , pcma, 0);
-        vst1_lane_s16(out+i+1, pcma, 1);
-        vst1_lane_s16(out+i+2, pcma, 2);
-        vst1_lane_s16(out+i+3, pcma, 3);
-        vst1_lane_s16(out+i+4, pcmb, 0);
-        vst1_lane_s16(out+i+5, pcmb, 1);
-        vst1_lane_s16(out+i+6, pcmb, 2);
-        vst1_lane_s16(out+i+7, pcmb, 3);
-#endif
-    }
-#endif
-    for(; i < num_samples; i++)
-    {
-        float sample = in[i] * 32768.0f;
-        if (sample >=  32766.5)
-            out[i] = (ma_int16) 32767;
-        else if (sample <= -32767.5)
-            out[i] = (ma_int16)-32768;
-        else
-        {
-            short s = (ma_int16)(sample + .5f);
-            s -= (s < 0);
-            out[i] = s;
-        }
-    }
-}
-#ifndef MA_DR_MP3_SEEK_LEADING_MP3_FRAMES
-#define MA_DR_MP3_SEEK_LEADING_MP3_FRAMES   2
-#endif
-#define MA_DR_MP3_MIN_DATA_CHUNK_SIZE   16384
-#ifndef MA_DR_MP3_DATA_CHUNK_SIZE
-#define MA_DR_MP3_DATA_CHUNK_SIZE  (MA_DR_MP3_MIN_DATA_CHUNK_SIZE*4)
-#endif
-#define MA_DR_MP3_COUNTOF(x)        (sizeof(x) / sizeof(x[0]))
-#define MA_DR_MP3_CLAMP(x, lo, hi)  (MA_DR_MP3_MAX(lo, MA_DR_MP3_MIN(x, hi)))
-#ifndef MA_DR_MP3_PI_D
-#define MA_DR_MP3_PI_D    3.14159265358979323846264
-#endif
-#define MA_DR_MP3_DEFAULT_RESAMPLER_LPF_ORDER   2
-static MA_INLINE float ma_dr_mp3_mix_f32(float x, float y, float a)
-{
-    return x*(1-a) + y*a;
-}
-static MA_INLINE float ma_dr_mp3_mix_f32_fast(float x, float y, float a)
-{
-    float r0 = (y - x);
-    float r1 = r0*a;
-    return x + r1;
-}
-static MA_INLINE ma_uint32 ma_dr_mp3_gcf_u32(ma_uint32 a, ma_uint32 b)
-{
-    for (;;) {
-        if (b == 0) {
-            break;
-        } else {
-            ma_uint32 t = a;
-            a = b;
-            b = t % a;
-        }
-    }
-    return a;
-}
-static void* ma_dr_mp3__malloc_default(size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_DR_MP3_MALLOC(sz);
-}
-static void* ma_dr_mp3__realloc_default(void* p, size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return MA_DR_MP3_REALLOC(p, sz);
-}
-static void ma_dr_mp3__free_default(void* p, void* pUserData)
-{
-    (void)pUserData;
-    MA_DR_MP3_FREE(p);
-}
-static void* ma_dr_mp3__malloc_from_callbacks(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-    if (pAllocationCallbacks->onMalloc != NULL) {
-        return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
-    }
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
-    }
-    return NULL;
-}
-static void* ma_dr_mp3__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
-    }
-    if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
-        void* p2;
-        p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
-        if (p2 == NULL) {
-            return NULL;
-        }
-        if (p != NULL) {
-            MA_DR_MP3_COPY_MEMORY(p2, p, szOld);
-            pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-        }
-        return p2;
-    }
-    return NULL;
-}
-static void ma_dr_mp3__free_from_callbacks(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (p == NULL || pAllocationCallbacks == NULL) {
-        return;
-    }
-    if (pAllocationCallbacks->onFree != NULL) {
-        pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-    }
-}
-static ma_allocation_callbacks ma_dr_mp3_copy_allocation_callbacks_or_defaults(const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        return *pAllocationCallbacks;
-    } else {
-        ma_allocation_callbacks allocationCallbacks;
-        allocationCallbacks.pUserData = NULL;
-        allocationCallbacks.onMalloc  = ma_dr_mp3__malloc_default;
-        allocationCallbacks.onRealloc = ma_dr_mp3__realloc_default;
-        allocationCallbacks.onFree    = ma_dr_mp3__free_default;
-        return allocationCallbacks;
-    }
-}
-static size_t ma_dr_mp3__on_read(ma_dr_mp3* pMP3, void* pBufferOut, size_t bytesToRead)
-{
-    size_t bytesRead = pMP3->onRead(pMP3->pUserData, pBufferOut, bytesToRead);
-    pMP3->streamCursor += bytesRead;
-    return bytesRead;
-}
-static ma_bool32 ma_dr_mp3__on_seek(ma_dr_mp3* pMP3, int offset, ma_dr_mp3_seek_origin origin)
-{
-    MA_DR_MP3_ASSERT(offset >= 0);
-    if (!pMP3->onSeek(pMP3->pUserData, offset, origin)) {
-        return MA_FALSE;
-    }
-    if (origin == ma_dr_mp3_seek_origin_start) {
-        pMP3->streamCursor = (ma_uint64)offset;
-    } else {
-        pMP3->streamCursor += offset;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_mp3__on_seek_64(ma_dr_mp3* pMP3, ma_uint64 offset, ma_dr_mp3_seek_origin origin)
-{
-    if (offset <= 0x7FFFFFFF) {
-        return ma_dr_mp3__on_seek(pMP3, (int)offset, origin);
-    }
-    if (!ma_dr_mp3__on_seek(pMP3, 0x7FFFFFFF, ma_dr_mp3_seek_origin_start)) {
-        return MA_FALSE;
-    }
-    offset -= 0x7FFFFFFF;
-    while (offset > 0) {
-        if (offset <= 0x7FFFFFFF) {
-            if (!ma_dr_mp3__on_seek(pMP3, (int)offset, ma_dr_mp3_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            offset = 0;
-        } else {
-            if (!ma_dr_mp3__on_seek(pMP3, 0x7FFFFFFF, ma_dr_mp3_seek_origin_current)) {
-                return MA_FALSE;
-            }
-            offset -= 0x7FFFFFFF;
-        }
-    }
-    return MA_TRUE;
-}
-static ma_uint32 ma_dr_mp3_decode_next_frame_ex__callbacks(ma_dr_mp3* pMP3, ma_dr_mp3d_sample_t* pPCMFrames)
-{
-    ma_uint32 pcmFramesRead = 0;
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    MA_DR_MP3_ASSERT(pMP3->onRead != NULL);
-    if (pMP3->atEnd) {
-        return 0;
-    }
-    for (;;) {
-        ma_dr_mp3dec_frame_info info;
-        if (pMP3->dataSize < MA_DR_MP3_MIN_DATA_CHUNK_SIZE) {
-            size_t bytesRead;
-            if (pMP3->pData != NULL) {
-                MA_DR_MP3_MOVE_MEMORY(pMP3->pData, pMP3->pData + pMP3->dataConsumed, pMP3->dataSize);
-            }
-            pMP3->dataConsumed = 0;
-            if (pMP3->dataCapacity < MA_DR_MP3_DATA_CHUNK_SIZE) {
-                ma_uint8* pNewData;
-                size_t newDataCap;
-                newDataCap = MA_DR_MP3_DATA_CHUNK_SIZE;
-                pNewData = (ma_uint8*)ma_dr_mp3__realloc_from_callbacks(pMP3->pData, newDataCap, pMP3->dataCapacity, &pMP3->allocationCallbacks);
-                if (pNewData == NULL) {
-                    return 0;
-                }
-                pMP3->pData = pNewData;
-                pMP3->dataCapacity = newDataCap;
-            }
-            bytesRead = ma_dr_mp3__on_read(pMP3, pMP3->pData + pMP3->dataSize, (pMP3->dataCapacity - pMP3->dataSize));
-            if (bytesRead == 0) {
-                if (pMP3->dataSize == 0) {
-                    pMP3->atEnd = MA_TRUE;
-                    return 0;
-                }
-            }
-            pMP3->dataSize += bytesRead;
-        }
-        if (pMP3->dataSize > INT_MAX) {
-            pMP3->atEnd = MA_TRUE;
-            return 0;
-        }
-        MA_DR_MP3_ASSERT(pMP3->pData != NULL);
-        MA_DR_MP3_ASSERT(pMP3->dataCapacity > 0);
-        if (pMP3->pData == NULL) {
-            return 0;
-        }
-        pcmFramesRead = ma_dr_mp3dec_decode_frame(&pMP3->decoder, pMP3->pData + pMP3->dataConsumed, (int)pMP3->dataSize, pPCMFrames, &info);
-        if (info.frame_bytes > 0) {
-            pMP3->dataConsumed += (size_t)info.frame_bytes;
-            pMP3->dataSize     -= (size_t)info.frame_bytes;
-        }
-        if (pcmFramesRead > 0) {
-            pcmFramesRead = ma_dr_mp3_hdr_frame_samples(pMP3->decoder.header);
-            pMP3->pcmFramesConsumedInMP3Frame = 0;
-            pMP3->pcmFramesRemainingInMP3Frame = pcmFramesRead;
-            pMP3->mp3FrameChannels = info.channels;
-            pMP3->mp3FrameSampleRate = info.hz;
-            break;
-        } else if (info.frame_bytes == 0) {
-            size_t bytesRead;
-            MA_DR_MP3_MOVE_MEMORY(pMP3->pData, pMP3->pData + pMP3->dataConsumed, pMP3->dataSize);
-            pMP3->dataConsumed = 0;
-            if (pMP3->dataCapacity == pMP3->dataSize) {
-                ma_uint8* pNewData;
-                size_t newDataCap;
-                newDataCap = pMP3->dataCapacity + MA_DR_MP3_DATA_CHUNK_SIZE;
-                pNewData = (ma_uint8*)ma_dr_mp3__realloc_from_callbacks(pMP3->pData, newDataCap, pMP3->dataCapacity, &pMP3->allocationCallbacks);
-                if (pNewData == NULL) {
-                    return 0;
-                }
-                pMP3->pData = pNewData;
-                pMP3->dataCapacity = newDataCap;
-            }
-            bytesRead = ma_dr_mp3__on_read(pMP3, pMP3->pData + pMP3->dataSize, (pMP3->dataCapacity - pMP3->dataSize));
-            if (bytesRead == 0) {
-                pMP3->atEnd = MA_TRUE;
-                return 0;
-            }
-            pMP3->dataSize += bytesRead;
-        }
-    };
-    return pcmFramesRead;
-}
-static ma_uint32 ma_dr_mp3_decode_next_frame_ex__memory(ma_dr_mp3* pMP3, ma_dr_mp3d_sample_t* pPCMFrames)
-{
-    ma_uint32 pcmFramesRead = 0;
-    ma_dr_mp3dec_frame_info info;
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    MA_DR_MP3_ASSERT(pMP3->memory.pData != NULL);
-    if (pMP3->atEnd) {
-        return 0;
-    }
-    for (;;) {
-        pcmFramesRead = ma_dr_mp3dec_decode_frame(&pMP3->decoder, pMP3->memory.pData + pMP3->memory.currentReadPos, (int)(pMP3->memory.dataSize - pMP3->memory.currentReadPos), pPCMFrames, &info);
-        if (pcmFramesRead > 0) {
-            pcmFramesRead = ma_dr_mp3_hdr_frame_samples(pMP3->decoder.header);
-            pMP3->pcmFramesConsumedInMP3Frame  = 0;
-            pMP3->pcmFramesRemainingInMP3Frame = pcmFramesRead;
-            pMP3->mp3FrameChannels             = info.channels;
-            pMP3->mp3FrameSampleRate           = info.hz;
-            break;
-        } else if (info.frame_bytes > 0) {
-            pMP3->memory.currentReadPos += (size_t)info.frame_bytes;
-        } else {
-            break;
-        }
-    }
-    pMP3->memory.currentReadPos += (size_t)info.frame_bytes;
-    return pcmFramesRead;
-}
-static ma_uint32 ma_dr_mp3_decode_next_frame_ex(ma_dr_mp3* pMP3, ma_dr_mp3d_sample_t* pPCMFrames)
-{
-    if (pMP3->memory.pData != NULL && pMP3->memory.dataSize > 0) {
-        return ma_dr_mp3_decode_next_frame_ex__memory(pMP3, pPCMFrames);
-    } else {
-        return ma_dr_mp3_decode_next_frame_ex__callbacks(pMP3, pPCMFrames);
-    }
-}
-static ma_uint32 ma_dr_mp3_decode_next_frame(ma_dr_mp3* pMP3)
-{
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    return ma_dr_mp3_decode_next_frame_ex(pMP3, (ma_dr_mp3d_sample_t*)pMP3->pcmFrames);
-}
-#if 0
-static ma_uint32 ma_dr_mp3_seek_next_frame(ma_dr_mp3* pMP3)
-{
-    ma_uint32 pcmFrameCount;
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    pcmFrameCount = ma_dr_mp3_decode_next_frame_ex(pMP3, NULL);
-    if (pcmFrameCount == 0) {
-        return 0;
-    }
-    pMP3->currentPCMFrame             += pcmFrameCount;
-    pMP3->pcmFramesConsumedInMP3Frame  = pcmFrameCount;
-    pMP3->pcmFramesRemainingInMP3Frame = 0;
-    return pcmFrameCount;
-}
-#endif
-static ma_bool32 ma_dr_mp3_init_internal(ma_dr_mp3* pMP3, ma_dr_mp3_read_proc onRead, ma_dr_mp3_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    MA_DR_MP3_ASSERT(onRead != NULL);
-    ma_dr_mp3dec_init(&pMP3->decoder);
-    pMP3->onRead = onRead;
-    pMP3->onSeek = onSeek;
-    pMP3->pUserData = pUserData;
-    pMP3->allocationCallbacks = ma_dr_mp3_copy_allocation_callbacks_or_defaults(pAllocationCallbacks);
-    if (pMP3->allocationCallbacks.onFree == NULL || (pMP3->allocationCallbacks.onMalloc == NULL && pMP3->allocationCallbacks.onRealloc == NULL)) {
-        return MA_FALSE;
-    }
-    if (ma_dr_mp3_decode_next_frame(pMP3) == 0) {
-        ma_dr_mp3__free_from_callbacks(pMP3->pData, &pMP3->allocationCallbacks);
-        return MA_FALSE;
-    }
-    pMP3->channels   = pMP3->mp3FrameChannels;
-    pMP3->sampleRate = pMP3->mp3FrameSampleRate;
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_mp3_init(ma_dr_mp3* pMP3, ma_dr_mp3_read_proc onRead, ma_dr_mp3_seek_proc onSeek, void* pUserData, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pMP3 == NULL || onRead == NULL) {
-        return MA_FALSE;
-    }
-    MA_DR_MP3_ZERO_OBJECT(pMP3);
-    return ma_dr_mp3_init_internal(pMP3, onRead, onSeek, pUserData, pAllocationCallbacks);
-}
-static size_t ma_dr_mp3__on_read_memory(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    ma_dr_mp3* pMP3 = (ma_dr_mp3*)pUserData;
-    size_t bytesRemaining;
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    MA_DR_MP3_ASSERT(pMP3->memory.dataSize >= pMP3->memory.currentReadPos);
-    bytesRemaining = pMP3->memory.dataSize - pMP3->memory.currentReadPos;
-    if (bytesToRead > bytesRemaining) {
-        bytesToRead = bytesRemaining;
-    }
-    if (bytesToRead > 0) {
-        MA_DR_MP3_COPY_MEMORY(pBufferOut, pMP3->memory.pData + pMP3->memory.currentReadPos, bytesToRead);
-        pMP3->memory.currentReadPos += bytesToRead;
-    }
-    return bytesToRead;
-}
-static ma_bool32 ma_dr_mp3__on_seek_memory(void* pUserData, int byteOffset, ma_dr_mp3_seek_origin origin)
-{
-    ma_dr_mp3* pMP3 = (ma_dr_mp3*)pUserData;
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    if (origin == ma_dr_mp3_seek_origin_current) {
-        if (byteOffset > 0) {
-            if (pMP3->memory.currentReadPos + byteOffset > pMP3->memory.dataSize) {
-                byteOffset = (int)(pMP3->memory.dataSize - pMP3->memory.currentReadPos);
-            }
-        } else {
-            if (pMP3->memory.currentReadPos < (size_t)-byteOffset) {
-                byteOffset = -(int)pMP3->memory.currentReadPos;
-            }
-        }
-        pMP3->memory.currentReadPos += byteOffset;
-    } else {
-        if ((ma_uint32)byteOffset <= pMP3->memory.dataSize) {
-            pMP3->memory.currentReadPos = byteOffset;
-        } else {
-            pMP3->memory.currentReadPos = pMP3->memory.dataSize;
-        }
-    }
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_mp3_init_memory(ma_dr_mp3* pMP3, const void* pData, size_t dataSize, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pMP3 == NULL) {
-        return MA_FALSE;
-    }
-    MA_DR_MP3_ZERO_OBJECT(pMP3);
-    if (pData == NULL || dataSize == 0) {
-        return MA_FALSE;
-    }
-    pMP3->memory.pData = (const ma_uint8*)pData;
-    pMP3->memory.dataSize = dataSize;
-    pMP3->memory.currentReadPos = 0;
-    return ma_dr_mp3_init_internal(pMP3, ma_dr_mp3__on_read_memory, ma_dr_mp3__on_seek_memory, pMP3, pAllocationCallbacks);
-}
-#ifndef MA_DR_MP3_NO_STDIO
-#include <stdio.h>
-#include <wchar.h>
-static size_t ma_dr_mp3__on_read_stdio(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    return fread(pBufferOut, 1, bytesToRead, (FILE*)pUserData);
-}
-static ma_bool32 ma_dr_mp3__on_seek_stdio(void* pUserData, int offset, ma_dr_mp3_seek_origin origin)
-{
-    return fseek((FILE*)pUserData, offset, (origin == ma_dr_mp3_seek_origin_current) ? SEEK_CUR : SEEK_SET) == 0;
-}
-MA_API ma_bool32 ma_dr_mp3_init_file(ma_dr_mp3* pMP3, const char* pFilePath, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_bool32 result;
-    FILE* pFile;
-    if (ma_fopen(&pFile, pFilePath, "rb") != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    result = ma_dr_mp3_init(pMP3, ma_dr_mp3__on_read_stdio, ma_dr_mp3__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (result != MA_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_mp3_init_file_w(ma_dr_mp3* pMP3, const wchar_t* pFilePath, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_bool32 result;
-    FILE* pFile;
-    if (ma_wfopen(&pFile, pFilePath, L"rb", pAllocationCallbacks) != MA_SUCCESS) {
-        return MA_FALSE;
-    }
-    result = ma_dr_mp3_init(pMP3, ma_dr_mp3__on_read_stdio, ma_dr_mp3__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (result != MA_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-    return MA_TRUE;
-}
-#endif
-MA_API void ma_dr_mp3_uninit(ma_dr_mp3* pMP3)
-{
-    if (pMP3 == NULL) {
-        return;
-    }
-#ifndef MA_DR_MP3_NO_STDIO
-    if (pMP3->onRead == ma_dr_mp3__on_read_stdio) {
-        FILE* pFile = (FILE*)pMP3->pUserData;
-        if (pFile != NULL) {
-            fclose(pFile);
-            pMP3->pUserData = NULL;
-        }
-    }
-#endif
-    ma_dr_mp3__free_from_callbacks(pMP3->pData, &pMP3->allocationCallbacks);
-}
-#if defined(MA_DR_MP3_FLOAT_OUTPUT)
-static void ma_dr_mp3_f32_to_s16(ma_int16* dst, const float* src, ma_uint64 sampleCount)
-{
-    ma_uint64 i;
-    ma_uint64 i4;
-    ma_uint64 sampleCount4;
-    i = 0;
-    sampleCount4 = sampleCount >> 2;
-    for (i4 = 0; i4 < sampleCount4; i4 += 1) {
-        float x0 = src[i+0];
-        float x1 = src[i+1];
-        float x2 = src[i+2];
-        float x3 = src[i+3];
-        x0 = ((x0 < -1) ? -1 : ((x0 > 1) ? 1 : x0));
-        x1 = ((x1 < -1) ? -1 : ((x1 > 1) ? 1 : x1));
-        x2 = ((x2 < -1) ? -1 : ((x2 > 1) ? 1 : x2));
-        x3 = ((x3 < -1) ? -1 : ((x3 > 1) ? 1 : x3));
-        x0 = x0 * 32767.0f;
-        x1 = x1 * 32767.0f;
-        x2 = x2 * 32767.0f;
-        x3 = x3 * 32767.0f;
-        dst[i+0] = (ma_int16)x0;
-        dst[i+1] = (ma_int16)x1;
-        dst[i+2] = (ma_int16)x2;
-        dst[i+3] = (ma_int16)x3;
-        i += 4;
-    }
-    for (; i < sampleCount; i += 1) {
-        float x = src[i];
-        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));
-        x = x * 32767.0f;
-        dst[i] = (ma_int16)x;
-    }
-}
-#endif
-#if !defined(MA_DR_MP3_FLOAT_OUTPUT)
-static void ma_dr_mp3_s16_to_f32(float* dst, const ma_int16* src, ma_uint64 sampleCount)
-{
-    ma_uint64 i;
-    for (i = 0; i < sampleCount; i += 1) {
-        float x = (float)src[i];
-        x = x * 0.000030517578125f;
-        dst[i] = x;
-    }
-}
-#endif
-static ma_uint64 ma_dr_mp3_read_pcm_frames_raw(ma_dr_mp3* pMP3, ma_uint64 framesToRead, void* pBufferOut)
-{
-    ma_uint64 totalFramesRead = 0;
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    MA_DR_MP3_ASSERT(pMP3->onRead != NULL);
-    while (framesToRead > 0) {
-        ma_uint32 framesToConsume = (ma_uint32)MA_DR_MP3_MIN(pMP3->pcmFramesRemainingInMP3Frame, framesToRead);
-        if (pBufferOut != NULL) {
-        #if defined(MA_DR_MP3_FLOAT_OUTPUT)
-            float* pFramesOutF32 = (float*)MA_DR_MP3_OFFSET_PTR(pBufferOut,          sizeof(float) * totalFramesRead                   * pMP3->channels);
-            float* pFramesInF32  = (float*)MA_DR_MP3_OFFSET_PTR(&pMP3->pcmFrames[0], sizeof(float) * pMP3->pcmFramesConsumedInMP3Frame * pMP3->mp3FrameChannels);
-            MA_DR_MP3_COPY_MEMORY(pFramesOutF32, pFramesInF32, sizeof(float) * framesToConsume * pMP3->channels);
-        #else
-            ma_int16* pFramesOutS16 = (ma_int16*)MA_DR_MP3_OFFSET_PTR(pBufferOut,          sizeof(ma_int16) * totalFramesRead                   * pMP3->channels);
-            ma_int16* pFramesInS16  = (ma_int16*)MA_DR_MP3_OFFSET_PTR(&pMP3->pcmFrames[0], sizeof(ma_int16) * pMP3->pcmFramesConsumedInMP3Frame * pMP3->mp3FrameChannels);
-            MA_DR_MP3_COPY_MEMORY(pFramesOutS16, pFramesInS16, sizeof(ma_int16) * framesToConsume * pMP3->channels);
-        #endif
-        }
-        pMP3->currentPCMFrame              += framesToConsume;
-        pMP3->pcmFramesConsumedInMP3Frame  += framesToConsume;
-        pMP3->pcmFramesRemainingInMP3Frame -= framesToConsume;
-        totalFramesRead                    += framesToConsume;
-        framesToRead                       -= framesToConsume;
-        if (framesToRead == 0) {
-            break;
-        }
-        MA_DR_MP3_ASSERT(pMP3->pcmFramesRemainingInMP3Frame == 0);
-        if (ma_dr_mp3_decode_next_frame(pMP3) == 0) {
-            break;
-        }
-    }
-    return totalFramesRead;
-}
-MA_API ma_uint64 ma_dr_mp3_read_pcm_frames_f32(ma_dr_mp3* pMP3, ma_uint64 framesToRead, float* pBufferOut)
-{
-    if (pMP3 == NULL || pMP3->onRead == NULL) {
-        return 0;
-    }
-#if defined(MA_DR_MP3_FLOAT_OUTPUT)
-    return ma_dr_mp3_read_pcm_frames_raw(pMP3, framesToRead, pBufferOut);
-#else
-    {
-        ma_int16 pTempS16[8192];
-        ma_uint64 totalPCMFramesRead = 0;
-        while (totalPCMFramesRead < framesToRead) {
-            ma_uint64 framesJustRead;
-            ma_uint64 framesRemaining = framesToRead - totalPCMFramesRead;
-            ma_uint64 framesToReadNow = MA_DR_MP3_COUNTOF(pTempS16) / pMP3->channels;
-            if (framesToReadNow > framesRemaining) {
-                framesToReadNow = framesRemaining;
-            }
-            framesJustRead = ma_dr_mp3_read_pcm_frames_raw(pMP3, framesToReadNow, pTempS16);
-            if (framesJustRead == 0) {
-                break;
-            }
-            ma_dr_mp3_s16_to_f32((float*)MA_DR_MP3_OFFSET_PTR(pBufferOut, sizeof(float) * totalPCMFramesRead * pMP3->channels), pTempS16, framesJustRead * pMP3->channels);
-            totalPCMFramesRead += framesJustRead;
-        }
-        return totalPCMFramesRead;
-    }
-#endif
-}
-MA_API ma_uint64 ma_dr_mp3_read_pcm_frames_s16(ma_dr_mp3* pMP3, ma_uint64 framesToRead, ma_int16* pBufferOut)
-{
-    if (pMP3 == NULL || pMP3->onRead == NULL) {
-        return 0;
-    }
-#if !defined(MA_DR_MP3_FLOAT_OUTPUT)
-    return ma_dr_mp3_read_pcm_frames_raw(pMP3, framesToRead, pBufferOut);
-#else
-    {
-        float pTempF32[4096];
-        ma_uint64 totalPCMFramesRead = 0;
-        while (totalPCMFramesRead < framesToRead) {
-            ma_uint64 framesJustRead;
-            ma_uint64 framesRemaining = framesToRead - totalPCMFramesRead;
-            ma_uint64 framesToReadNow = MA_DR_MP3_COUNTOF(pTempF32) / pMP3->channels;
-            if (framesToReadNow > framesRemaining) {
-                framesToReadNow = framesRemaining;
-            }
-            framesJustRead = ma_dr_mp3_read_pcm_frames_raw(pMP3, framesToReadNow, pTempF32);
-            if (framesJustRead == 0) {
-                break;
-            }
-            ma_dr_mp3_f32_to_s16((ma_int16*)MA_DR_MP3_OFFSET_PTR(pBufferOut, sizeof(ma_int16) * totalPCMFramesRead * pMP3->channels), pTempF32, framesJustRead * pMP3->channels);
-            totalPCMFramesRead += framesJustRead;
-        }
-        return totalPCMFramesRead;
-    }
-#endif
-}
-static void ma_dr_mp3_reset(ma_dr_mp3* pMP3)
-{
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    pMP3->pcmFramesConsumedInMP3Frame = 0;
-    pMP3->pcmFramesRemainingInMP3Frame = 0;
-    pMP3->currentPCMFrame = 0;
-    pMP3->dataSize = 0;
-    pMP3->atEnd = MA_FALSE;
-    ma_dr_mp3dec_init(&pMP3->decoder);
-}
-static ma_bool32 ma_dr_mp3_seek_to_start_of_stream(ma_dr_mp3* pMP3)
-{
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    MA_DR_MP3_ASSERT(pMP3->onSeek != NULL);
-    if (!ma_dr_mp3__on_seek(pMP3, 0, ma_dr_mp3_seek_origin_start)) {
-        return MA_FALSE;
-    }
-    ma_dr_mp3_reset(pMP3);
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_mp3_seek_forward_by_pcm_frames__brute_force(ma_dr_mp3* pMP3, ma_uint64 frameOffset)
-{
-    ma_uint64 framesRead;
-#if defined(MA_DR_MP3_FLOAT_OUTPUT)
-    framesRead = ma_dr_mp3_read_pcm_frames_f32(pMP3, frameOffset, NULL);
-#else
-    framesRead = ma_dr_mp3_read_pcm_frames_s16(pMP3, frameOffset, NULL);
-#endif
-    if (framesRead != frameOffset) {
-        return MA_FALSE;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_mp3_seek_to_pcm_frame__brute_force(ma_dr_mp3* pMP3, ma_uint64 frameIndex)
-{
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    if (frameIndex == pMP3->currentPCMFrame) {
-        return MA_TRUE;
-    }
-    if (frameIndex < pMP3->currentPCMFrame) {
-        if (!ma_dr_mp3_seek_to_start_of_stream(pMP3)) {
-            return MA_FALSE;
-        }
-    }
-    MA_DR_MP3_ASSERT(frameIndex >= pMP3->currentPCMFrame);
-    return ma_dr_mp3_seek_forward_by_pcm_frames__brute_force(pMP3, (frameIndex - pMP3->currentPCMFrame));
-}
-static ma_bool32 ma_dr_mp3_find_closest_seek_point(ma_dr_mp3* pMP3, ma_uint64 frameIndex, ma_uint32* pSeekPointIndex)
-{
-    ma_uint32 iSeekPoint;
-    MA_DR_MP3_ASSERT(pSeekPointIndex != NULL);
-    *pSeekPointIndex = 0;
-    if (frameIndex < pMP3->pSeekPoints[0].pcmFrameIndex) {
-        return MA_FALSE;
-    }
-    for (iSeekPoint = 0; iSeekPoint < pMP3->seekPointCount; ++iSeekPoint) {
-        if (pMP3->pSeekPoints[iSeekPoint].pcmFrameIndex > frameIndex) {
-            break;
-        }
-        *pSeekPointIndex = iSeekPoint;
-    }
-    return MA_TRUE;
-}
-static ma_bool32 ma_dr_mp3_seek_to_pcm_frame__seek_table(ma_dr_mp3* pMP3, ma_uint64 frameIndex)
-{
-    ma_dr_mp3_seek_point seekPoint;
-    ma_uint32 priorSeekPointIndex;
-    ma_uint16 iMP3Frame;
-    ma_uint64 leftoverFrames;
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    MA_DR_MP3_ASSERT(pMP3->pSeekPoints != NULL);
-    MA_DR_MP3_ASSERT(pMP3->seekPointCount > 0);
-    if (ma_dr_mp3_find_closest_seek_point(pMP3, frameIndex, &priorSeekPointIndex)) {
-        seekPoint = pMP3->pSeekPoints[priorSeekPointIndex];
-    } else {
-        seekPoint.seekPosInBytes     = 0;
-        seekPoint.pcmFrameIndex      = 0;
-        seekPoint.mp3FramesToDiscard = 0;
-        seekPoint.pcmFramesToDiscard = 0;
-    }
-    if (!ma_dr_mp3__on_seek_64(pMP3, seekPoint.seekPosInBytes, ma_dr_mp3_seek_origin_start)) {
-        return MA_FALSE;
-    }
-    ma_dr_mp3_reset(pMP3);
-    for (iMP3Frame = 0; iMP3Frame < seekPoint.mp3FramesToDiscard; ++iMP3Frame) {
-        ma_uint32 pcmFramesRead;
-        ma_dr_mp3d_sample_t* pPCMFrames;
-        pPCMFrames = NULL;
-        if (iMP3Frame == seekPoint.mp3FramesToDiscard-1) {
-            pPCMFrames = (ma_dr_mp3d_sample_t*)pMP3->pcmFrames;
-        }
-        pcmFramesRead = ma_dr_mp3_decode_next_frame_ex(pMP3, pPCMFrames);
-        if (pcmFramesRead == 0) {
-            return MA_FALSE;
-        }
-    }
-    pMP3->currentPCMFrame = seekPoint.pcmFrameIndex - seekPoint.pcmFramesToDiscard;
-    leftoverFrames = frameIndex - pMP3->currentPCMFrame;
-    return ma_dr_mp3_seek_forward_by_pcm_frames__brute_force(pMP3, leftoverFrames);
-}
-MA_API ma_bool32 ma_dr_mp3_seek_to_pcm_frame(ma_dr_mp3* pMP3, ma_uint64 frameIndex)
-{
-    if (pMP3 == NULL || pMP3->onSeek == NULL) {
-        return MA_FALSE;
-    }
-    if (frameIndex == 0) {
-        return ma_dr_mp3_seek_to_start_of_stream(pMP3);
-    }
-    if (pMP3->pSeekPoints != NULL && pMP3->seekPointCount > 0) {
-        return ma_dr_mp3_seek_to_pcm_frame__seek_table(pMP3, frameIndex);
-    } else {
-        return ma_dr_mp3_seek_to_pcm_frame__brute_force(pMP3, frameIndex);
-    }
-}
-MA_API ma_bool32 ma_dr_mp3_get_mp3_and_pcm_frame_count(ma_dr_mp3* pMP3, ma_uint64* pMP3FrameCount, ma_uint64* pPCMFrameCount)
-{
-    ma_uint64 currentPCMFrame;
-    ma_uint64 totalPCMFrameCount;
-    ma_uint64 totalMP3FrameCount;
-    if (pMP3 == NULL) {
-        return MA_FALSE;
-    }
-    if (pMP3->onSeek == NULL) {
-        return MA_FALSE;
-    }
-    currentPCMFrame = pMP3->currentPCMFrame;
-    if (!ma_dr_mp3_seek_to_start_of_stream(pMP3)) {
-        return MA_FALSE;
-    }
-    totalPCMFrameCount = 0;
-    totalMP3FrameCount = 0;
-    for (;;) {
-        ma_uint32 pcmFramesInCurrentMP3Frame;
-        pcmFramesInCurrentMP3Frame = ma_dr_mp3_decode_next_frame_ex(pMP3, NULL);
-        if (pcmFramesInCurrentMP3Frame == 0) {
-            break;
-        }
-        totalPCMFrameCount += pcmFramesInCurrentMP3Frame;
-        totalMP3FrameCount += 1;
-    }
-    if (!ma_dr_mp3_seek_to_start_of_stream(pMP3)) {
-        return MA_FALSE;
-    }
-    if (!ma_dr_mp3_seek_to_pcm_frame(pMP3, currentPCMFrame)) {
-        return MA_FALSE;
-    }
-    if (pMP3FrameCount != NULL) {
-        *pMP3FrameCount = totalMP3FrameCount;
-    }
-    if (pPCMFrameCount != NULL) {
-        *pPCMFrameCount = totalPCMFrameCount;
-    }
-    return MA_TRUE;
-}
-MA_API ma_uint64 ma_dr_mp3_get_pcm_frame_count(ma_dr_mp3* pMP3)
-{
-    ma_uint64 totalPCMFrameCount;
-    if (!ma_dr_mp3_get_mp3_and_pcm_frame_count(pMP3, NULL, &totalPCMFrameCount)) {
-        return 0;
-    }
-    return totalPCMFrameCount;
-}
-MA_API ma_uint64 ma_dr_mp3_get_mp3_frame_count(ma_dr_mp3* pMP3)
-{
-    ma_uint64 totalMP3FrameCount;
-    if (!ma_dr_mp3_get_mp3_and_pcm_frame_count(pMP3, &totalMP3FrameCount, NULL)) {
-        return 0;
-    }
-    return totalMP3FrameCount;
-}
-static void ma_dr_mp3__accumulate_running_pcm_frame_count(ma_dr_mp3* pMP3, ma_uint32 pcmFrameCountIn, ma_uint64* pRunningPCMFrameCount, float* pRunningPCMFrameCountFractionalPart)
-{
-    float srcRatio;
-    float pcmFrameCountOutF;
-    ma_uint32 pcmFrameCountOut;
-    srcRatio = (float)pMP3->mp3FrameSampleRate / (float)pMP3->sampleRate;
-    MA_DR_MP3_ASSERT(srcRatio > 0);
-    pcmFrameCountOutF = *pRunningPCMFrameCountFractionalPart + (pcmFrameCountIn / srcRatio);
-    pcmFrameCountOut  = (ma_uint32)pcmFrameCountOutF;
-    *pRunningPCMFrameCountFractionalPart = pcmFrameCountOutF - pcmFrameCountOut;
-    *pRunningPCMFrameCount += pcmFrameCountOut;
-}
-typedef struct
-{
-    ma_uint64 bytePos;
-    ma_uint64 pcmFrameIndex;
-} ma_dr_mp3__seeking_mp3_frame_info;
-MA_API ma_bool32 ma_dr_mp3_calculate_seek_points(ma_dr_mp3* pMP3, ma_uint32* pSeekPointCount, ma_dr_mp3_seek_point* pSeekPoints)
-{
-    ma_uint32 seekPointCount;
-    ma_uint64 currentPCMFrame;
-    ma_uint64 totalMP3FrameCount;
-    ma_uint64 totalPCMFrameCount;
-    if (pMP3 == NULL || pSeekPointCount == NULL || pSeekPoints == NULL) {
-        return MA_FALSE;
-    }
-    seekPointCount = *pSeekPointCount;
-    if (seekPointCount == 0) {
-        return MA_FALSE;
-    }
-    currentPCMFrame = pMP3->currentPCMFrame;
-    if (!ma_dr_mp3_get_mp3_and_pcm_frame_count(pMP3, &totalMP3FrameCount, &totalPCMFrameCount)) {
-        return MA_FALSE;
-    }
-    if (totalMP3FrameCount < MA_DR_MP3_SEEK_LEADING_MP3_FRAMES+1) {
-        seekPointCount = 1;
-        pSeekPoints[0].seekPosInBytes     = 0;
-        pSeekPoints[0].pcmFrameIndex      = 0;
-        pSeekPoints[0].mp3FramesToDiscard = 0;
-        pSeekPoints[0].pcmFramesToDiscard = 0;
-    } else {
-        ma_uint64 pcmFramesBetweenSeekPoints;
-        ma_dr_mp3__seeking_mp3_frame_info mp3FrameInfo[MA_DR_MP3_SEEK_LEADING_MP3_FRAMES+1];
-        ma_uint64 runningPCMFrameCount = 0;
-        float runningPCMFrameCountFractionalPart = 0;
-        ma_uint64 nextTargetPCMFrame;
-        ma_uint32 iMP3Frame;
-        ma_uint32 iSeekPoint;
-        if (seekPointCount > totalMP3FrameCount-1) {
-            seekPointCount = (ma_uint32)totalMP3FrameCount-1;
-        }
-        pcmFramesBetweenSeekPoints = totalPCMFrameCount / (seekPointCount+1);
-        if (!ma_dr_mp3_seek_to_start_of_stream(pMP3)) {
-            return MA_FALSE;
-        }
-        for (iMP3Frame = 0; iMP3Frame < MA_DR_MP3_SEEK_LEADING_MP3_FRAMES+1; ++iMP3Frame) {
-            ma_uint32 pcmFramesInCurrentMP3FrameIn;
-            MA_DR_MP3_ASSERT(pMP3->streamCursor >= pMP3->dataSize);
-            mp3FrameInfo[iMP3Frame].bytePos       = pMP3->streamCursor - pMP3->dataSize;
-            mp3FrameInfo[iMP3Frame].pcmFrameIndex = runningPCMFrameCount;
-            pcmFramesInCurrentMP3FrameIn = ma_dr_mp3_decode_next_frame_ex(pMP3, NULL);
-            if (pcmFramesInCurrentMP3FrameIn == 0) {
-                return MA_FALSE;
-            }
-            ma_dr_mp3__accumulate_running_pcm_frame_count(pMP3, pcmFramesInCurrentMP3FrameIn, &runningPCMFrameCount, &runningPCMFrameCountFractionalPart);
-        }
-        nextTargetPCMFrame = 0;
-        for (iSeekPoint = 0; iSeekPoint < seekPointCount; ++iSeekPoint) {
-            nextTargetPCMFrame += pcmFramesBetweenSeekPoints;
-            for (;;) {
-                if (nextTargetPCMFrame < runningPCMFrameCount) {
-                    pSeekPoints[iSeekPoint].seekPosInBytes     = mp3FrameInfo[0].bytePos;
-                    pSeekPoints[iSeekPoint].pcmFrameIndex      = nextTargetPCMFrame;
-                    pSeekPoints[iSeekPoint].mp3FramesToDiscard = MA_DR_MP3_SEEK_LEADING_MP3_FRAMES;
-                    pSeekPoints[iSeekPoint].pcmFramesToDiscard = (ma_uint16)(nextTargetPCMFrame - mp3FrameInfo[MA_DR_MP3_SEEK_LEADING_MP3_FRAMES-1].pcmFrameIndex);
-                    break;
-                } else {
-                    size_t i;
-                    ma_uint32 pcmFramesInCurrentMP3FrameIn;
-                    for (i = 0; i < MA_DR_MP3_COUNTOF(mp3FrameInfo)-1; ++i) {
-                        mp3FrameInfo[i] = mp3FrameInfo[i+1];
-                    }
-                    mp3FrameInfo[MA_DR_MP3_COUNTOF(mp3FrameInfo)-1].bytePos       = pMP3->streamCursor - pMP3->dataSize;
-                    mp3FrameInfo[MA_DR_MP3_COUNTOF(mp3FrameInfo)-1].pcmFrameIndex = runningPCMFrameCount;
-                    pcmFramesInCurrentMP3FrameIn = ma_dr_mp3_decode_next_frame_ex(pMP3, NULL);
-                    if (pcmFramesInCurrentMP3FrameIn == 0) {
-                        pSeekPoints[iSeekPoint].seekPosInBytes     = mp3FrameInfo[0].bytePos;
-                        pSeekPoints[iSeekPoint].pcmFrameIndex      = nextTargetPCMFrame;
-                        pSeekPoints[iSeekPoint].mp3FramesToDiscard = MA_DR_MP3_SEEK_LEADING_MP3_FRAMES;
-                        pSeekPoints[iSeekPoint].pcmFramesToDiscard = (ma_uint16)(nextTargetPCMFrame - mp3FrameInfo[MA_DR_MP3_SEEK_LEADING_MP3_FRAMES-1].pcmFrameIndex);
-                        break;
-                    }
-                    ma_dr_mp3__accumulate_running_pcm_frame_count(pMP3, pcmFramesInCurrentMP3FrameIn, &runningPCMFrameCount, &runningPCMFrameCountFractionalPart);
-                }
-            }
-        }
-        if (!ma_dr_mp3_seek_to_start_of_stream(pMP3)) {
-            return MA_FALSE;
-        }
-        if (!ma_dr_mp3_seek_to_pcm_frame(pMP3, currentPCMFrame)) {
-            return MA_FALSE;
-        }
-    }
-    *pSeekPointCount = seekPointCount;
-    return MA_TRUE;
-}
-MA_API ma_bool32 ma_dr_mp3_bind_seek_table(ma_dr_mp3* pMP3, ma_uint32 seekPointCount, ma_dr_mp3_seek_point* pSeekPoints)
-{
-    if (pMP3 == NULL) {
-        return MA_FALSE;
-    }
-    if (seekPointCount == 0 || pSeekPoints == NULL) {
-        pMP3->seekPointCount = 0;
-        pMP3->pSeekPoints = NULL;
-    } else {
-        pMP3->seekPointCount = seekPointCount;
-        pMP3->pSeekPoints = pSeekPoints;
-    }
-    return MA_TRUE;
-}
-static float* ma_dr_mp3__full_read_and_close_f32(ma_dr_mp3* pMP3, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount)
-{
-    ma_uint64 totalFramesRead = 0;
-    ma_uint64 framesCapacity = 0;
-    float* pFrames = NULL;
-    float temp[4096];
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    for (;;) {
-        ma_uint64 framesToReadRightNow = MA_DR_MP3_COUNTOF(temp) / pMP3->channels;
-        ma_uint64 framesJustRead = ma_dr_mp3_read_pcm_frames_f32(pMP3, framesToReadRightNow, temp);
-        if (framesJustRead == 0) {
-            break;
-        }
-        if (framesCapacity < totalFramesRead + framesJustRead) {
-            ma_uint64 oldFramesBufferSize;
-            ma_uint64 newFramesBufferSize;
-            ma_uint64 newFramesCap;
-            float* pNewFrames;
-            newFramesCap = framesCapacity * 2;
-            if (newFramesCap < totalFramesRead + framesJustRead) {
-                newFramesCap = totalFramesRead + framesJustRead;
-            }
-            oldFramesBufferSize = framesCapacity * pMP3->channels * sizeof(float);
-            newFramesBufferSize = newFramesCap   * pMP3->channels * sizeof(float);
-            if (newFramesBufferSize > (ma_uint64)MA_SIZE_MAX) {
-                break;
-            }
-            pNewFrames = (float*)ma_dr_mp3__realloc_from_callbacks(pFrames, (size_t)newFramesBufferSize, (size_t)oldFramesBufferSize, &pMP3->allocationCallbacks);
-            if (pNewFrames == NULL) {
-                ma_dr_mp3__free_from_callbacks(pFrames, &pMP3->allocationCallbacks);
-                break;
-            }
-            pFrames = pNewFrames;
-            framesCapacity = newFramesCap;
-        }
-        MA_DR_MP3_COPY_MEMORY(pFrames + totalFramesRead*pMP3->channels, temp, (size_t)(framesJustRead*pMP3->channels*sizeof(float)));
-        totalFramesRead += framesJustRead;
-        if (framesJustRead != framesToReadRightNow) {
-            break;
-        }
-    }
-    if (pConfig != NULL) {
-        pConfig->channels   = pMP3->channels;
-        pConfig->sampleRate = pMP3->sampleRate;
-    }
-    ma_dr_mp3_uninit(pMP3);
-    if (pTotalFrameCount) {
-        *pTotalFrameCount = totalFramesRead;
-    }
-    return pFrames;
-}
-static ma_int16* ma_dr_mp3__full_read_and_close_s16(ma_dr_mp3* pMP3, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount)
-{
-    ma_uint64 totalFramesRead = 0;
-    ma_uint64 framesCapacity = 0;
-    ma_int16* pFrames = NULL;
-    ma_int16 temp[4096];
-    MA_DR_MP3_ASSERT(pMP3 != NULL);
-    for (;;) {
-        ma_uint64 framesToReadRightNow = MA_DR_MP3_COUNTOF(temp) / pMP3->channels;
-        ma_uint64 framesJustRead = ma_dr_mp3_read_pcm_frames_s16(pMP3, framesToReadRightNow, temp);
-        if (framesJustRead == 0) {
-            break;
-        }
-        if (framesCapacity < totalFramesRead + framesJustRead) {
-            ma_uint64 newFramesBufferSize;
-            ma_uint64 oldFramesBufferSize;
-            ma_uint64 newFramesCap;
-            ma_int16* pNewFrames;
-            newFramesCap = framesCapacity * 2;
-            if (newFramesCap < totalFramesRead + framesJustRead) {
-                newFramesCap = totalFramesRead + framesJustRead;
-            }
-            oldFramesBufferSize = framesCapacity * pMP3->channels * sizeof(ma_int16);
-            newFramesBufferSize = newFramesCap   * pMP3->channels * sizeof(ma_int16);
-            if (newFramesBufferSize > (ma_uint64)MA_SIZE_MAX) {
-                break;
-            }
-            pNewFrames = (ma_int16*)ma_dr_mp3__realloc_from_callbacks(pFrames, (size_t)newFramesBufferSize, (size_t)oldFramesBufferSize, &pMP3->allocationCallbacks);
-            if (pNewFrames == NULL) {
-                ma_dr_mp3__free_from_callbacks(pFrames, &pMP3->allocationCallbacks);
-                break;
-            }
-            pFrames = pNewFrames;
-            framesCapacity = newFramesCap;
-        }
-        MA_DR_MP3_COPY_MEMORY(pFrames + totalFramesRead*pMP3->channels, temp, (size_t)(framesJustRead*pMP3->channels*sizeof(ma_int16)));
-        totalFramesRead += framesJustRead;
-        if (framesJustRead != framesToReadRightNow) {
-            break;
-        }
-    }
-    if (pConfig != NULL) {
-        pConfig->channels   = pMP3->channels;
-        pConfig->sampleRate = pMP3->sampleRate;
-    }
-    ma_dr_mp3_uninit(pMP3);
-    if (pTotalFrameCount) {
-        *pTotalFrameCount = totalFramesRead;
-    }
-    return pFrames;
-}
-MA_API float* ma_dr_mp3_open_and_read_pcm_frames_f32(ma_dr_mp3_read_proc onRead, ma_dr_mp3_seek_proc onSeek, void* pUserData, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_mp3 mp3;
-    if (!ma_dr_mp3_init(&mp3, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_mp3__full_read_and_close_f32(&mp3, pConfig, pTotalFrameCount);
-}
-MA_API ma_int16* ma_dr_mp3_open_and_read_pcm_frames_s16(ma_dr_mp3_read_proc onRead, ma_dr_mp3_seek_proc onSeek, void* pUserData, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_mp3 mp3;
-    if (!ma_dr_mp3_init(&mp3, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_mp3__full_read_and_close_s16(&mp3, pConfig, pTotalFrameCount);
-}
-MA_API float* ma_dr_mp3_open_memory_and_read_pcm_frames_f32(const void* pData, size_t dataSize, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_mp3 mp3;
-    if (!ma_dr_mp3_init_memory(&mp3, pData, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_mp3__full_read_and_close_f32(&mp3, pConfig, pTotalFrameCount);
-}
-MA_API ma_int16* ma_dr_mp3_open_memory_and_read_pcm_frames_s16(const void* pData, size_t dataSize, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_mp3 mp3;
-    if (!ma_dr_mp3_init_memory(&mp3, pData, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_mp3__full_read_and_close_s16(&mp3, pConfig, pTotalFrameCount);
-}
-#ifndef MA_DR_MP3_NO_STDIO
-MA_API float* ma_dr_mp3_open_file_and_read_pcm_frames_f32(const char* filePath, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_mp3 mp3;
-    if (!ma_dr_mp3_init_file(&mp3, filePath, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_mp3__full_read_and_close_f32(&mp3, pConfig, pTotalFrameCount);
-}
-MA_API ma_int16* ma_dr_mp3_open_file_and_read_pcm_frames_s16(const char* filePath, ma_dr_mp3_config* pConfig, ma_uint64* pTotalFrameCount, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    ma_dr_mp3 mp3;
-    if (!ma_dr_mp3_init_file(&mp3, filePath, pAllocationCallbacks)) {
-        return NULL;
-    }
-    return ma_dr_mp3__full_read_and_close_s16(&mp3, pConfig, pTotalFrameCount);
-}
-#endif
-MA_API void* ma_dr_mp3_malloc(size_t sz, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        return ma_dr_mp3__malloc_from_callbacks(sz, pAllocationCallbacks);
-    } else {
-        return ma_dr_mp3__malloc_default(sz, NULL);
-    }
-}
-MA_API void ma_dr_mp3_free(void* p, const ma_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        ma_dr_mp3__free_from_callbacks(p, pAllocationCallbacks);
-    } else {
-        ma_dr_mp3__free_default(p, NULL);
-    }
-}
-#endif
-/* dr_mp3_c end */
-#endif  /* MA_DR_MP3_IMPLEMENTATION */
-#endif  /* MA_NO_MP3 */
-
-
-/* End globally disabled warnings. */
-#if defined(_MSC_VER)
-    #pragma warning(pop)
-#endif
-
-#endif  /* miniaudio_c */
-#endif  /* MINIAUDIO_IMPLEMENTATION */
-
-
-/*
-This software is available as a choice of the following licenses. Choose
-whichever you prefer.
-
-===============================================================================
-ALTERNATIVE 1 - Public Domain (www.unlicense.org)
-===============================================================================
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <http://unlicense.org/>
-
-===============================================================================
-ALTERNATIVE 2 - MIT No Attribution
-===============================================================================
-Copyright 2023 David Reid
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-*/
diff --git a/dsp/audio/cosmoaudio/test.c b/dsp/audio/cosmoaudio/test.c
deleted file mode 100644
index a6c7e2375..000000000
--- a/dsp/audio/cosmoaudio/test.c
+++ /dev/null
@@ -1,76 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include "cosmoaudio.h"
-
-#define SAMPLING_RATE 44100
-#define WAVE_INTERVAL 440
-#define CHANNELS      2
-
-#ifndef M_PIf
-#define M_PIf 3.14159265358979323846f
-#endif
-
-int main() {
-
-  struct CosmoAudioOpenOptions cao = {0};
-  cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
-  cao.deviceType = kCosmoAudioDeviceTypePlayback;
-  cao.sampleRate = SAMPLING_RATE;
-  cao.channels = CHANNELS;
-
-  int status;
-  struct CosmoAudio *ca;
-  status = cosmoaudio_open(&ca, &cao);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to open audio: %d\n", status);
-    return 1;
-  }
-
-  float buf[256 * CHANNELS];
-  for (int g = 0; g < SAMPLING_RATE;) {
-    int frames = 1;
-    status = cosmoaudio_poll(ca, NULL, &frames);
-    if (status != COSMOAUDIO_SUCCESS) {
-      fprintf(stderr, "failed to poll output: %d\n", status);
-      return 2;
-    }
-    if (frames > 256)
-      frames = 256;
-    if (frames > SAMPLING_RATE - g)
-      frames = SAMPLING_RATE - g;
-    for (int f = 0; f < frames; ++f) {
-      float t = (float)g++ / SAMPLING_RATE;
-      float s = sinf(2 * M_PIf * WAVE_INTERVAL * t);
-      for (int c = 0; c < CHANNELS; c++)
-        buf[f * CHANNELS + c] = s * .3f;
-    }
-    status = cosmoaudio_write(ca, buf, frames);
-    if (status != frames) {
-      fprintf(stderr, "failed to write output: %d\n", status);
-      return 3;
-    }
-  }
-
-  status = cosmoaudio_flush(ca);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to flush output: %d\n", status);
-    return 4;
-  }
-
-  status = cosmoaudio_close(ca);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to close audio: %d\n", status);
-    return 5;
-  }
-}
diff --git a/dsp/audio/describe.c b/dsp/audio/describe.c
deleted file mode 100644
index 71d6eb91d..000000000
--- a/dsp/audio/describe.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "dsp/audio/describe.h"
-#include "dsp/audio/cosmoaudio/cosmoaudio.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
-
-#define append(...) o += ksnprintf(buf + o, n - o, __VA_ARGS__)
-
-const char *cosmoaudio_describe_status(char *buf, int n, int status) {
-  switch (status) {
-    case COSMOAUDIO_SUCCESS:
-      return "COSMOAUDIO_SUCCESS";
-    case COSMOAUDIO_ERROR:
-      return "COSMOAUDIO_ERROR";
-    case COSMOAUDIO_EINVAL:
-      return "COSMOAUDIO_EINVAL";
-    case COSMOAUDIO_ELINK:
-      return "COSMOAUDIO_ELINK";
-    case COSMOAUDIO_ENOBUF:
-      return "COSMOAUDIO_ENOBUF";
-    default:
-      ksnprintf(buf, n, "%d", status);
-      return buf;
-  }
-}
-
-const char *cosmoaudio_describe_open_options(
-    char *buf, int n, const struct CosmoAudioOpenOptions *options) {
-  int o = 0;
-  char b128[128];
-  bool gotsome = false;
-  if (!options)
-    return "NULL";
-  if (kisdangerous(options)) {
-    ksnprintf(buf, n, "%p", options);
-    return buf;
-  }
-  append("{");
-
-  if (options->sampleRate) {
-    if (gotsome)
-      append(", ");
-    append(".sampleRate=%d", options->sampleRate);
-    gotsome = true;
-  }
-
-  if (options->channels) {
-    if (gotsome)
-      append(", ");
-    append(".channels=%d", options->channels);
-    gotsome = true;
-  }
-
-  if (options->deviceType) {
-    if (gotsome)
-      append(", ");
-    static struct DescribeFlags kDeviceType[] = {
-        {kCosmoAudioDeviceTypeDuplex, "Duplex"},      //
-        {kCosmoAudioDeviceTypeCapture, "Capture"},    //
-        {kCosmoAudioDeviceTypePlayback, "Playback"},  //
-    };
-    append(".deviceType=%s",
-           _DescribeFlags(b128, 128, kDeviceType, ARRAYLEN(kDeviceType),
-                          "kCosmoAudioDeviceType", options->deviceType));
-    gotsome = true;
-  }
-
-  if (options->bufferFrames) {
-    if (gotsome)
-      append(", ");
-    append(".bufferFrames=%d", options->bufferFrames);
-    gotsome = true;
-  }
-
-  if (options->debugLog) {
-    if (gotsome)
-      append(", ");
-    append(".debugLog=%d", options->debugLog);
-    gotsome = true;
-  }
-
-  if (options->sizeofThis) {
-    if (gotsome)
-      append(", ");
-    append(".sizeofThis=%d", options->sizeofThis);
-    gotsome = true;
-  }
-
-  append("}");
-  return buf;
-}
-
-const char *cosmoaudio_describe_poll_frames(char *buf, int n,
-                                            int *in_out_frames) {
-  if (!in_out_frames)
-    return "NULL";
-  if (kisdangerous(in_out_frames)) {
-    ksnprintf(buf, n, "%p", in_out_frames);
-    return buf;
-  }
-  ksnprintf(buf, n, "[%d]", *in_out_frames);
-  return buf;
-}
diff --git a/dsp/audio/describe.h b/dsp/audio/describe.h
deleted file mode 100644
index 28f614574..000000000
--- a/dsp/audio/describe.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef COSMOPOLITAN_DSP_AUDIO_DESCRIBE_H_
-#define COSMOPOLITAN_DSP_AUDIO_DESCRIBE_H_
-#include "dsp/audio/cosmoaudio/cosmoaudio.h"
-COSMOPOLITAN_C_START_
-
-const char *cosmoaudio_describe_status(char *, int, int);
-const char *cosmoaudio_describe_open_options(
-    char *, int, const struct CosmoAudioOpenOptions *);
-const char *cosmoaudio_describe_poll_frames(char *, int, int *);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_DSP_AUDIO_DESCRIBE_H_ */
diff --git a/dsp/core/c161.h b/dsp/core/c161.h
index ddadcaa7f..40753eefa 100644
--- a/dsp/core/c161.h
+++ b/dsp/core/c161.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_DSP_CORE_C161_H_
 #define COSMOPOLITAN_DSP_CORE_C161_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #define EXTRA_SHARP 2
 
diff --git a/dsp/core/c161s.h b/dsp/core/c161s.h
index cd7018727..325278ee2 100644
--- a/dsp/core/c161s.h
+++ b/dsp/core/c161s.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_DSP_CORE_C161S_H_
 #define COSMOPOLITAN_DSP_CORE_C161S_H_
 #include "dsp/core/c161.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 __funline signed char C161S(signed char al, signed char bl, signed char cl) {
   short ax, bx, cx;
diff --git a/dsp/core/double2byte.c b/dsp/core/double2byte.c
index 95bcad14c..1ac894e9b 100644
--- a/dsp/core/double2byte.c
+++ b/dsp/core/double2byte.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/core/core.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 
diff --git a/dsp/core/float2short.c b/dsp/core/float2short.c
index 5efbcb4ee..2c35fb17e 100644
--- a/dsp/core/float2short.c
+++ b/dsp/core/float2short.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/str/str.h"
 
diff --git a/dsp/core/getintegercoefficients.c b/dsp/core/getintegercoefficients.c
index fc8cd77bb..b8b377d3d 100644
--- a/dsp/core/getintegercoefficients.c
+++ b/dsp/core/getintegercoefficients.c
@@ -20,7 +20,7 @@
 #include "libc/assert.h"
 #include "libc/dce.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/str/str.h"
 
diff --git a/dsp/core/getintegercoefficients8.c b/dsp/core/getintegercoefficients8.c
index defafc058..1ba9e5081 100644
--- a/dsp/core/getintegercoefficients8.c
+++ b/dsp/core/getintegercoefficients8.c
@@ -19,7 +19,7 @@
 #include "dsp/core/core.h"
 #include "dsp/core/q.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/str/str.h"
 
diff --git a/dsp/core/half.h b/dsp/core/half.h
index 0165ad76b..0955ddfba 100644
--- a/dsp/core/half.h
+++ b/dsp/core/half.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_DSP_CORE_HALF_H_
 #define COSMOPOLITAN_DSP_CORE_HALF_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /**
  * Divides integer in half w/ rounding.
diff --git a/dsp/core/ks8.h b/dsp/core/ks8.h
index 4a0c81f72..100b3decc 100644
--- a/dsp/core/ks8.h
+++ b/dsp/core/ks8.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_DSP_CORE_KS8_H_
 #define COSMOPOLITAN_DSP_CORE_KS8_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /**
  * Performs 16-bit scaled rounded madd w/ eight coefficients or fewer.
diff --git a/dsp/core/kss8.h b/dsp/core/kss8.h
index c86ae1a85..54bff129c 100644
--- a/dsp/core/kss8.h
+++ b/dsp/core/kss8.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_DSP_CORE_KSS8_H_
 #define COSMOPOLITAN_DSP_CORE_KSS8_H_
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /**
  * Performs 16-bit scaled rounded saturated madd w/ eight coefficients or fewer.
diff --git a/dsp/core/q.h b/dsp/core/q.h
index c23fa5f30..3d122bf49 100644
--- a/dsp/core/q.h
+++ b/dsp/core/q.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_DSP_CORE_Q_H_
 #define COSMOPOLITAN_DSP_CORE_Q_H_
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 
 /**
diff --git a/dsp/core/sad16x8n.c b/dsp/core/sad16x8n.c
index c8e6f4fff..802836164 100644
--- a/dsp/core/sad16x8n.c
+++ b/dsp/core/sad16x8n.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/core/core.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/intel/emmintrin.internal.h"
 
diff --git a/dsp/mpeg/.clang-format b/dsp/mpeg/.clang-format
deleted file mode 100644
index 47a38a93f..000000000
--- a/dsp/mpeg/.clang-format
+++ /dev/null
@@ -1,2 +0,0 @@
-DisableFormat: true
-SortIncludes: Never
diff --git a/dsp/mpeg/BUILD.mk b/dsp/mpeg/BUILD.mk
index be219ca23..a16089eaa 100644
--- a/dsp/mpeg/BUILD.mk
+++ b/dsp/mpeg/BUILD.mk
@@ -25,13 +25,18 @@ DSP_MPEG_A_CHECKS =				\
 
 DSP_MPEG_A_DIRECTDEPS =				\
 	LIBC_CALLS				\
+	LIBC_FMT				\
 	LIBC_INTRIN				\
+	LIBC_LOG				\
+	LIBC_LOG				\
 	LIBC_MEM				\
 	LIBC_NEXGEN32E				\
+	LIBC_RUNTIME				\
 	LIBC_STDIO				\
 	LIBC_STR				\
+	LIBC_SYSV				\
 	LIBC_TINYMATH				\
-	THIRD_PARTY_COMPILER_RT			\
+	THIRD_PARTY_COMPILER_RT
 
 DSP_MPEG_A_DEPS :=				\
 	$(call uniq,$(foreach x,$(DSP_MPEG_A_DIRECTDEPS),$($(x))))
@@ -44,10 +49,9 @@ $(DSP_MPEG_A).pkg:				\
 		$(DSP_MPEG_A_OBJS)		\
 		$(foreach x,$(DSP_MPEG_A_DIRECTDEPS),$($(x)_A).pkg)
 
-o/$(MODE)/dsp/mpeg/pl_mpeg.o: private		\
+o/$(MODE)/dsp/mpeg/clamp4int256-k8.o: private	\
 		CFLAGS +=			\
-			-ffunction-sections	\
-			-fdata-sections
+			-Os
 
 DSP_MPEG_LIBS = $(foreach x,$(DSP_MPEG_ARTIFACTS),$($(x)))
 DSP_MPEG_SRCS = $(foreach x,$(DSP_MPEG_ARTIFACTS),$($(x)_SRCS))
diff --git a/dsp/mpeg/README.cosmo b/dsp/mpeg/README.cosmo
deleted file mode 100644
index 1c1991af4..000000000
--- a/dsp/mpeg/README.cosmo
+++ /dev/null
@@ -1,17 +0,0 @@
-DESCRIPTION
-
-  pl_mpeg lets you decode .mpg files
-
-ORIGIN
-
-  https://github.com/phoboslab/pl_mpeg/
-  9e40dd6536269d788728e32c39bfacf2ab7a0866
-
-LICENSE
-
-  MIT
-
-LOCAL CHANGES
-
-  - Added API for extracting pixel aspect ratio
-    https://github.com/phoboslab/pl_mpeg/pull/42
diff --git a/dsp/mpeg/README.md b/dsp/mpeg/README.md
deleted file mode 100755
index 4892e7026..000000000
--- a/dsp/mpeg/README.md
+++ /dev/null
@@ -1,68 +0,0 @@
-# PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer
-
-Single-file MIT licensed library for C/C++
-
-See [pl_mpeg.h](https://github.com/phoboslab/pl_mpeg/blob/master/pl_mpeg.h) for
-the documentation.
-
-
-## Why?
-
-This is meant as a simple way to get video playback into your app or game. Other
-solutions, such as ffmpeg require huge libraries and a lot of glue code.
-
-MPEG1 is an old and inefficient codec, but it's still good enough for many use
-cases. All patents related to MPEG1 and MP2 have expired, so it's completely
-free now.
-
-This library does not make use of any SIMD instructions, but because of
-the relative simplicity of the codec it still manages to decode 4k60fps video
-on a single CPU core (on my i7-6700k at least).
-
-## Compilation on Linux
-
-Use a GCC invocation like the following to build the example `pl_mpeg_player`
-program:
-
-```shell
-gcc -o pl_mpeg_player pl_mpeg_player.c $(pkg-config --cflags --libs sdl2 glew)
-```
-
-## Example Usage
-
-- [pl_mpeg_extract_frames.c](https://github.com/phoboslab/pl_mpeg/blob/master/pl_mpeg_extract_frames.c)
-extracts all frames from a video and saves them as PNG.
- - [pl_mpeg_player.c](https://github.com/phoboslab/pl_mpeg/blob/master/pl_mpeg_player.c)
-implements a video player using SDL2 and OpenGL for rendering.
-
-
-
-## Encoding for PL_MPEG
-
-Most [MPEG-PS](https://en.wikipedia.org/wiki/MPEG_program_stream) (`.mpg`) files
-containing MPEG1 Video ("mpeg1") and MPEG1 Audio Layer II ("mp2") streams should
-work with PL_MPEG. Note that `.mpg` files can also contain MPEG2 Video, which is
-not supported by this library.
-
-You can encode video in a suitable format using ffmpeg:
-
-```
-ffmpeg -i input.mp4 -c:v mpeg1video -q:v 0 -c:a mp2 -format mpeg output.mpg
-```
-
-`-q:v` sets a fixed video quality with a variable bitrate, where `0` is the
-highest. You may use `-b:v` to set a fixed bitrate instead; e.g.
-`-b:v 2000k` for 2000 kbit/s. Please refer to the
-[ffmpeg documentation](http://ffmpeg.org/ffmpeg.html#Options) for more details.
-
-If you just want to quickly test the library, try this file:
-
-https://phoboslab.org/files/bjork-all-is-full-of-love.mpg
-
-
-## Limitations
-
-- no error reporting. PL_MPEG will silently ignore any invalid data.
-- the pts (presentation time stamp) for packets in the MPEG-PS container is
-ignored. This may cause sync issues with some files.
-- bugs, probably.
diff --git a/dsp/mpeg/README.txt b/dsp/mpeg/README.txt
new file mode 100644
index 000000000..b8263c03c
--- /dev/null
+++ b/dsp/mpeg/README.txt
@@ -0,0 +1,92 @@
+PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer
+Dominic Szablewski - https://phoboslab.org
+
+-- Synopsis
+
+// This function gets called for each decoded video frame
+void my_video_callback(plm_t *plm, plm_frame_t *frame, void *user) {
+	// Do something with frame->y.data, frame->cr.data, frame->cb.data
+}
+
+// This function gets called for each decoded audio frame
+void my_audio_callback(plm_t *plm, plm_samples_t *frame, void *user) {
+	// Do something with samples->interleaved
+}
+
+// Load a .mpg (MPEG Program Stream) file
+plm_t *plm = plm_create_with_filename("some-file.mpg");
+
+// Install the video & audio decode callbacks
+plm_set_video_decode_callback(plm, my_video_callback, my_data);
+plm_set_audio_decode_callback(plm, my_audio_callback, my_data);
+
+
+// Decode
+do {
+	plm_decode(plm, time_since_last_call);
+} while (!plm_has_ended(plm));
+
+// All done
+plm_destroy(plm);
+
+
+
+-- Documentation
+
+This library provides several interfaces to load, demux and decode MPEG video
+and audio data. A high-level API combines the demuxer, video & audio decoders
+in an easy to use wrapper.
+
+Lower-level APIs for accessing the demuxer, video decoder and audio decoder,
+as well as providing different data sources are also available.
+
+Interfaces are written in an object orientet style, meaning you create object
+instances via various different constructor functions (plm_*create()),
+do some work on them and later dispose them via plm_*destroy().
+
+plm_*		-- the high-level interface, combining demuxer and decoders
+plm_buffer_* -- the data source used by all interfaces
+plm_demux_*  -- the MPEG-PS demuxer
+plm_video_*  -- the MPEG1 Video ("mpeg1") decoder
+plm_audio_*  -- the MPEG1 Audio Layer II ("mp2") decoder
+
+
+This library uses malloc(), realloc() and free() to manage memory. Typically
+all allocation happens up-front when creating the interface. However, the
+default buffer size may be too small for certain inputs. In these cases plmpeg
+will realloc() the buffer with a larger size whenever needed. You can configure
+the default buffer size by defining PLM_BUFFER_DEFAULT_SIZE *before*
+including this library.
+
+With the high-level interface you have two options to decode video & audio:
+
+1) Use plm_decode() and just hand over the delta time since the last call.
+It will decode everything needed and call your callbacks (specified through
+plm_set_{video|audio}_decode_callback()) any number of times.
+
+2) Use plm_decode_video() and plm_decode_audio() to decode exactly one
+frame of video or audio data at a time. How you handle the synchronization of
+both streams is up to you.
+
+If you only want to decode video *or* audio through these functions, you should
+disable the other stream (plm_set_{video|audio}_enabled(false))
+
+
+Video data is decoded into a struct with all 3 planes (Y, Cr, Cb) stored in
+separate buffers. You can either convert this to RGB on the CPU (slow) via the
+plm_frame_to_rgb() function or do it on the GPU with the following matrix:
+
+mat4 rec601 = mat4(
+	1.16438,  0.00000,  1.59603, -0.87079,
+	1.16438, -0.39176, -0.81297,  0.52959,
+	1.16438,  2.01723,  0.00000, -1.08139,
+	0, 0, 0, 1
+);
+gl_FragColor = vec4(y, cb, cr, 1.0) * rec601;
+
+Audio data is decoded into a struct with either one single float array with the
+samples for the left and right channel interleaved, or if the
+PLM_AUDIO_SEPARATE_CHANNELS is defined *before* including this library, into
+two separate float arrays - one for each channel.
+
+See below for detailed the API documentation.
diff --git a/dsp/mpeg/blockset.h b/dsp/mpeg/blockset.h
new file mode 100644
index 000000000..14d0f36b2
--- /dev/null
+++ b/dsp/mpeg/blockset.h
@@ -0,0 +1,20 @@
+#ifndef COSMOPOLITAN_DSP_MPEG_BLOCKSET_H_
+#define COSMOPOLITAN_DSP_MPEG_BLOCKSET_H_
+
+#define PLM_BLOCK_SET(DEST, DEST_INDEX, DEST_WIDTH, SOURCE_INDEX, \
+                      SOURCE_WIDTH, BLOCK_SIZE, OP)               \
+  do {                                                            \
+    int dest_scan = DEST_WIDTH - BLOCK_SIZE;                      \
+    int source_scan = SOURCE_WIDTH - BLOCK_SIZE;                  \
+    for (int y = 0; y < BLOCK_SIZE; y++) {                        \
+      for (int x = 0; x < BLOCK_SIZE; x++) {                      \
+        DEST[DEST_INDEX] = OP;                                    \
+        SOURCE_INDEX++;                                           \
+        DEST_INDEX++;                                             \
+      }                                                           \
+      SOURCE_INDEX += source_scan;                                \
+      DEST_INDEX += dest_scan;                                    \
+    }                                                             \
+  } while (false)
+
+#endif /* COSMOPOLITAN_DSP_MPEG_BLOCKSET_H_ */
diff --git a/dsp/mpeg/buffer.c b/dsp/mpeg/buffer.c
new file mode 100644
index 000000000..f0cb0de38
--- /dev/null
+++ b/dsp/mpeg/buffer.c
@@ -0,0 +1,153 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set noet ft=c ts=4 sw=4 fenc=utf-8                                   :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/mpeg/buffer.h"
+#include "dsp/mpeg/mpeg.h"
+#include "libc/calls/calls.h"
+#include "libc/log/check.h"
+#include "libc/mem/mem.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/madv.h"
+__static_yoink("pl_mpeg_notice");
+
+/* clang-format off */
+// -----------------------------------------------------------------------------
+// plm_buffer implementation
+
+plm_buffer_t *plm_buffer_create_with_filename(const char *filename) {
+	FILE *fh = fopen(filename, "rb");
+	if (!fh) {
+		return NULL;
+	}
+	fadvise(fileno(fh), 0, 0, MADV_SEQUENTIAL);
+	return plm_buffer_create_with_file(fh, true);
+}
+
+plm_buffer_t *plm_buffer_create_with_file(FILE *fh, int close_when_done) {
+	plm_buffer_t *b;
+	b = plm_buffer_create_with_capacity(PLM_BUFFER_DEFAULT_SIZE);
+	b->fh = fh;
+	b->close_when_done = close_when_done;
+	b->mode = PLM_BUFFER_MODE_FILE;
+	plm_buffer_set_load_callback(b, plm_buffer_load_file_callback, NULL);
+	return b;
+}
+
+plm_buffer_t *plm_buffer_create_with_memory(unsigned char *bytes, size_t length, int free_when_done) {
+	plm_buffer_t *b;
+	b = memalign(_Alignof(plm_buffer_t), sizeof(plm_buffer_t));
+	memset(b, 0, sizeof(plm_buffer_t));
+	b->capacity = length;
+	b->length = length;
+	b->free_when_done = free_when_done;
+	b->bytes = bytes;
+	b->mode = PLM_BUFFER_MODE_FIXED_MEM;
+	return b;
+}
+
+plm_buffer_t * plm_buffer_create_with_capacity(size_t capacity) {
+	plm_buffer_t *b;
+	b = memalign(_Alignof(plm_buffer_t), sizeof(plm_buffer_t));
+	memset(b, 0, sizeof(plm_buffer_t));
+	b->capacity = capacity;
+	b->free_when_done = true;
+	b->bytes = (unsigned char *)malloc(capacity);
+	b->mode = PLM_BUFFER_MODE_DYNAMIC_MEM;
+	return b;
+}
+
+void plm_buffer_destroy(plm_buffer_t *self) {
+	if (self->fh && self->close_when_done) {
+		fclose(self->fh);
+	}
+	if (self->free_when_done) {
+		free(self->bytes);
+	}
+	free(self);
+}
+
+size_t plm_buffer_write(plm_buffer_t *self, unsigned char *bytes, size_t length) {
+	if (self->mode == PLM_BUFFER_MODE_FIXED_MEM) {
+		return 0;
+	}
+	// This should be a ring buffer, but instead it just shifts all unread data
+	// to the beginning of the buffer and appends new data at the end. Seems
+	// to be good enough.
+	plm_buffer_discard_read_bytes(self);
+	// Do we have to resize to fit the new data?
+	size_t bytes_available = self->capacity - self->length;
+	if (bytes_available < length) {
+		size_t new_size = self->capacity;
+		do {
+			new_size *= 2;
+		} while (new_size - self->length < length);
+		self->bytes = (unsigned char *)realloc(self->bytes, new_size);
+		self->capacity = new_size;
+	}
+	memcpy(self->bytes + self->length, bytes, length);
+	self->length += length;
+	return length;
+}
+
+void plm_buffer_set_load_callback(plm_buffer_t *self, plm_buffer_load_callback fp, void *user) {
+	self->load_callback = fp;
+	self->load_callback_user_data = user;
+}
+
+void plm_buffer_rewind(plm_buffer_t *self) {
+	if (self->fh) {
+		fseek(self->fh, 0, SEEK_SET);
+		self->length = 0;
+	}
+	if (self->mode != PLM_BUFFER_MODE_FIXED_MEM) {
+		self->length = 0;
+	}
+	self->bit_index = 0;
+}
+
+void plm_buffer_discard_read_bytes(plm_buffer_t *self) {
+	size_t byte_pos = self->bit_index >> 3;
+	if (byte_pos == self->length) {
+		self->bit_index = 0;
+		self->length = 0;
+	}
+	else if (byte_pos > 0) {
+		memmove(self->bytes, self->bytes + byte_pos, self->length - byte_pos);
+		self->bit_index -= byte_pos << 3;
+		self->length -= byte_pos;
+	}
+}
+
+void plm_buffer_load_file_callback(plm_buffer_t *self, void *user) {
+	plm_buffer_discard_read_bytes(self);
+	unsigned bytes_available = self->capacity - self->length;
+	unsigned bytes_read = fread(self->bytes + self->length, 1, bytes_available, self->fh);
+	self->length += bytes_read;
+}
diff --git a/dsp/mpeg/buffer.h b/dsp/mpeg/buffer.h
new file mode 100644
index 000000000..e841535fb
--- /dev/null
+++ b/dsp/mpeg/buffer.h
@@ -0,0 +1,160 @@
+#ifndef COSMOPOLITAN_DSP_MPEG_BUFFER_H_
+#define COSMOPOLITAN_DSP_MPEG_BUFFER_H_
+#include "dsp/mpeg/mpeg.h"
+COSMOPOLITAN_C_START_
+
+enum plm_buffer_mode {
+  PLM_BUFFER_MODE_FILE,
+  PLM_BUFFER_MODE_FIXED_MEM,
+  PLM_BUFFER_MODE_DYNAMIC_MEM
+};
+
+typedef struct plm_buffer_t {
+  unsigned bit_index;
+  unsigned capacity;
+  unsigned length;
+  int free_when_done;
+  int close_when_done;
+  FILE *fh;
+  plm_buffer_load_callback load_callback;
+  void *load_callback_user_data;
+  unsigned char *bytes;
+  enum plm_buffer_mode mode;
+} plm_buffer_t;
+
+typedef struct {
+  int16_t index;
+  int16_t value;
+} plm_vlc_t;
+
+typedef struct {
+  int16_t index;
+  uint16_t value;
+} plm_vlc_uint_t;
+
+/* bool plm_buffer_has(plm_buffer_t *, size_t); */
+/* int plm_buffer_read(plm_buffer_t *, int); */
+/* void plm_buffer_align(plm_buffer_t *); */
+/* void plm_buffer_skip(plm_buffer_t *, size_t); */
+/* int plm_buffer_skip_bytes(plm_buffer_t *, unsigned char); */
+/* int plm_buffer_next_start_code(plm_buffer_t *); */
+/* int plm_buffer_find_start_code(plm_buffer_t *, int); */
+/* int plm_buffer_no_start_code(plm_buffer_t *); */
+/* int16_t plm_buffer_read_vlc(plm_buffer_t *, const plm_vlc_t *); */
+/* uint16_t plm_buffer_read_vlc_uint(plm_buffer_t *, const plm_vlc_uint_t *); */
+
+void plm_buffer_discard_read_bytes(plm_buffer_t *);
+relegated void plm_buffer_load_file_callback(plm_buffer_t *, void *);
+
+forceinline bool plm_buffer_has(plm_buffer_t *b, size_t bits) {
+  unsigned have;
+  have = b->length;
+  have <<= 3;
+  have -= b->bit_index;
+  if (bits <= have) {
+    return true;
+  } else {
+    if (b->load_callback) {
+      b->load_callback(b, b->load_callback_user_data);
+      return ((b->length << 3) - b->bit_index) >= bits;
+    } else {
+      return false;
+    }
+  }
+}
+
+forceinline int plm_buffer_read(plm_buffer_t *self, int count) {
+  if (!plm_buffer_has(self, count))
+    return 0;
+  int value = 0;
+  while (count) {
+    int current_byte = self->bytes[self->bit_index >> 3];
+    int remaining = 8 - (self->bit_index & 7);         // Remaining bits in byte
+    int read = remaining < count ? remaining : count;  // Bits in self run
+    int shift = remaining - read;
+    int mask = (0xff >> (8 - read));
+    value = (value << read) | ((current_byte & (mask << shift)) >> shift);
+    self->bit_index += read;
+    count -= read;
+  }
+  return value;
+}
+
+forceinline void plm_buffer_align(plm_buffer_t *self) {
+  self->bit_index = ((self->bit_index + 7) >> 3) << 3;
+}
+
+forceinline void plm_buffer_skip(plm_buffer_t *self, size_t count) {
+  if (plm_buffer_has(self, count)) {
+    self->bit_index += count;
+  }
+}
+
+forceinline int plm_buffer_skip_bytes(plm_buffer_t *self, unsigned char v) {
+  unsigned skipped;
+  plm_buffer_align(self);
+  skipped = 0;
+  while (plm_buffer_has(self, 8)) {
+    if (v == self->bytes[self->bit_index >> 3]) {
+      self->bit_index += 8;
+      ++skipped;
+    } else {
+      break;
+    }
+  }
+  return skipped;
+}
+
+forceinline int plm_buffer_next_start_code(plm_buffer_t *self) {
+  plm_buffer_align(self);
+  while (plm_buffer_has(self, (5 << 3))) {
+    size_t byte_index = (self->bit_index) >> 3;
+    if (self->bytes[byte_index] == 0x00 &&
+        self->bytes[byte_index + 1] == 0x00 &&
+        self->bytes[byte_index + 2] == 0x01) {
+      self->bit_index = (byte_index + 4) << 3;
+      return self->bytes[byte_index + 3];
+    }
+    self->bit_index += 8;
+  }
+  self->bit_index = (self->length << 3);
+  return -1;
+}
+
+forceinline int plm_buffer_find_start_code(plm_buffer_t *self, int code) {
+  int current = 0;
+  while (true) {
+    current = plm_buffer_next_start_code(self);
+    if (current == code || current == -1) {
+      return current;
+    }
+  }
+  return -1;
+}
+
+forceinline int plm_buffer_no_start_code(plm_buffer_t *self) {
+  if (!plm_buffer_has(self, (5 << 3))) {
+    return false;
+  }
+  size_t byte_index = ((self->bit_index + 7) >> 3);
+  return !(self->bytes[byte_index] == 0x00 &&
+           self->bytes[byte_index + 1] == 0x00 &&
+           self->bytes[byte_index + 2] == 0x01);
+}
+
+forceinline int16_t plm_buffer_read_vlc(plm_buffer_t *self,
+                                        const plm_vlc_t *table) {
+  plm_vlc_t state = {0, 0};
+  do {
+    state = table[state.index + plm_buffer_read(self, 1)];
+  } while (state.index > 0);
+  return state.value;
+}
+
+forceinline uint16_t plm_buffer_read_vlc_uint(plm_buffer_t *self,
+                                              const plm_vlc_uint_t *table) {
+  return (uint16_t)plm_buffer_read_vlc(self, (plm_vlc_t *)table);
+}
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_DSP_MPEG_BUFFER_H_ */
diff --git a/dsp/mpeg/clamp4int256-core.S b/dsp/mpeg/clamp4int256-core.S
new file mode 100644
index 000000000..3cd6797b1
--- /dev/null
+++ b/dsp/mpeg/clamp4int256-core.S
@@ -0,0 +1,30 @@
+/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
+│ vi: set noet ft=asm ts=8 sw=8 fenc=utf-8                                 :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/macros.internal.h"
+
+clamp4int256$core:
+	.leafprologue
+	pxor	%xmm1,%xmm1
+	pmaxsd	%xmm1,%xmm0
+	pminsd	0f(%rip),%xmm0
+	.leafepilogue
+	.endfn	clamp4int256$core,globl
+
+	.rodata.cst16
+0:	.long	255,255,255,255
diff --git a/dsp/mpeg/demux.c b/dsp/mpeg/demux.c
new file mode 100644
index 000000000..66eff844a
--- /dev/null
+++ b/dsp/mpeg/demux.c
@@ -0,0 +1,203 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set noet ft=c ts=4 sw=4 fenc=utf-8                                   :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/mpeg/demux.h"
+#include "dsp/mpeg/buffer.h"
+#include "dsp/mpeg/mpeg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"
+__static_yoink("pl_mpeg_notice");
+
+/* clang-format off */
+// ----------------------------------------------------------------------------
+// plm_demux implementation
+
+plm_demux_t *plm_demux_create(plm_buffer_t *buffer, int destroy_when_done) {
+	plm_demux_t *self = (plm_demux_t *)malloc(sizeof(plm_demux_t));
+	memset(self, 0, sizeof(plm_demux_t));
+
+	self->buffer = buffer;
+	self->destroy_buffer_when_done = destroy_when_done;
+
+	if (plm_buffer_find_start_code(self->buffer, START_PACK) != -1) {
+		plm_demux_decode_pack_header(self);
+	}
+	if (plm_buffer_find_start_code(self->buffer, START_SYSTEM) != -1) {
+		plm_demux_decode_system_header(self);
+	}
+	return self;
+}
+
+void plm_demux_destroy(plm_demux_t *self) {
+	if (self->destroy_buffer_when_done) {
+		plm_buffer_destroy(self->buffer);
+	}
+	free(self);
+}
+
+int plm_demux_get_num_video_streams(plm_demux_t *self) {
+	return self->num_video_streams;
+}
+
+int plm_demux_get_num_audio_streams(plm_demux_t *self) {
+	return self->num_audio_streams;
+}
+
+void plm_demux_rewind(plm_demux_t *self) {
+	plm_buffer_rewind(self->buffer);
+}
+
+plm_packet_t *plm_demux_decode(plm_demux_t *self) {
+	if (self->current_packet.length) {
+		size_t bits_till_next_packet = self->current_packet.length << 3;
+		if (!plm_buffer_has(self->buffer, bits_till_next_packet)) {
+			return NULL;
+		}
+		plm_buffer_skip(self->buffer, bits_till_next_packet);
+		self->current_packet.length = 0;
+	}
+
+	if (!self->has_pack_header) {
+		if (plm_buffer_find_start_code(self->buffer, START_PACK) != -1) {
+			plm_demux_decode_pack_header(self);
+		}
+		else {
+			return NULL;
+		}
+	}
+
+	if (!self->has_system_header) {
+		if (plm_buffer_find_start_code(self->buffer, START_SYSTEM) != -1) {
+			plm_demux_decode_system_header(self);
+		}
+		else {
+			return NULL;
+		}
+	}
+
+	// pending packet just waiting for data?
+	if (self->next_packet.length) {
+		return plm_demux_get_packet(self);
+	}
+
+	int code;
+	do {
+		code = plm_buffer_next_start_code(self->buffer);
+		if (
+			code == PLM_DEMUX_PACKET_VIDEO_1 ||
+			code == PLM_DEMUX_PACKET_PRIVATE ||
+			(code >= PLM_DEMUX_PACKET_AUDIO_1 && code <= PLM_DEMUX_PACKET_AUDIO_4)
+		) {
+			return plm_demux_decode_packet(self, code);
+		}
+	} while (code != -1);
+
+	return NULL;
+}
+
+double plm_demux_read_time(plm_demux_t *self) {
+	int64_t clock = plm_buffer_read(self->buffer, 3) << 30;
+	plm_buffer_skip(self->buffer, 1);
+	clock |= plm_buffer_read(self->buffer, 15) << 15;
+	plm_buffer_skip(self->buffer, 1);
+	clock |= plm_buffer_read(self->buffer, 15);
+	plm_buffer_skip(self->buffer, 1);
+	return (double)clock / 90000.0;
+}
+
+void plm_demux_decode_pack_header(plm_demux_t *self) {
+	if (plm_buffer_read(self->buffer, 4) != 0x02) {
+		return; // invalid
+	}
+	self->system_clock_ref = plm_demux_read_time(self);
+	plm_buffer_skip(self->buffer, 1);
+	plm_buffer_skip(self->buffer, 22); // mux_rate * 50
+	plm_buffer_skip(self->buffer, 1);
+
+	self->has_pack_header = true;
+}
+
+void plm_demux_decode_system_header(plm_demux_t *self) {
+	plm_buffer_skip(self->buffer, 16); // header_length
+	plm_buffer_skip(self->buffer, 24); // rate bound
+	self->num_audio_streams = plm_buffer_read(self->buffer, 6);
+	plm_buffer_skip(self->buffer, 5); // misc flags
+	self->num_video_streams = plm_buffer_read(self->buffer, 5);
+
+	self->has_system_header = true;
+}
+
+plm_packet_t *plm_demux_decode_packet(plm_demux_t *self, int start_code) {
+	if (!plm_buffer_has(self->buffer, 8 << 3)) {
+		return NULL;
+	}
+
+	self->next_packet.type = start_code;
+	self->next_packet.length = plm_buffer_read(self->buffer, 16);
+	self->next_packet.length -= plm_buffer_skip_bytes(self->buffer, 0xff); // stuffing
+
+	// skip P-STD
+	if (plm_buffer_read(self->buffer, 2) == 0x01) {
+		plm_buffer_skip(self->buffer, 16);
+		self->next_packet.length -= 2;
+	}
+
+	int pts_dts_marker = plm_buffer_read(self->buffer, 2);
+	if (pts_dts_marker == 0x03) {
+		self->next_packet.pts = plm_demux_read_time(self);
+		plm_buffer_skip(self->buffer, 40); // skip dts
+		self->next_packet.length -= 10;
+	}
+	else if (pts_dts_marker == 0x02) {
+		self->next_packet.pts = plm_demux_read_time(self);
+		self->next_packet.length -= 5;
+	}
+	else if (pts_dts_marker == 0x00) {
+		self->next_packet.pts = 0;
+		plm_buffer_skip(self->buffer, 4);
+		self->next_packet.length -= 1;
+	}
+	else {
+		return NULL; // invalid
+	}
+
+	return plm_demux_get_packet(self);
+}
+
+plm_packet_t *plm_demux_get_packet(plm_demux_t *self) {
+	if (!plm_buffer_has(self->buffer, self->next_packet.length << 3)) {
+		return NULL;
+	}
+	self->current_packet.data = self->buffer->bytes + (self->buffer->bit_index >> 3);
+	self->current_packet.length = self->next_packet.length;
+	self->current_packet.type = self->next_packet.type;
+	self->current_packet.pts = self->next_packet.pts;
+	self->next_packet.length = 0;
+	return &self->current_packet;
+}
diff --git a/dsp/mpeg/demux.h b/dsp/mpeg/demux.h
new file mode 100644
index 000000000..f36de4d3a
--- /dev/null
+++ b/dsp/mpeg/demux.h
@@ -0,0 +1,29 @@
+#ifndef COSMOPOLITAN_DSP_MPEG_DEMUX_H_
+#define COSMOPOLITAN_DSP_MPEG_DEMUX_H_
+#include "dsp/mpeg/mpeg.h"
+COSMOPOLITAN_C_START_
+
+#define START_PACK   0xBA
+#define START_END    0xB9
+#define START_SYSTEM 0xBB
+
+typedef struct plm_demux_t {
+  plm_buffer_t *buffer;
+  int destroy_buffer_when_done;
+  double system_clock_ref;
+  int has_pack_header;
+  int has_system_header;
+  int num_audio_streams;
+  int num_video_streams;
+  plm_packet_t current_packet;
+  plm_packet_t next_packet;
+} plm_demux_t;
+
+double plm_demux_read_time(plm_demux_t *self);
+void plm_demux_decode_pack_header(plm_demux_t *self);
+void plm_demux_decode_system_header(plm_demux_t *self);
+plm_packet_t *plm_demux_decode_packet(plm_demux_t *self, int start_code);
+plm_packet_t *plm_demux_get_packet(plm_demux_t *self);
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_DSP_MPEG_DEMUX_H_ */
diff --git a/dsp/mpeg/idct.c b/dsp/mpeg/idct.c
new file mode 100644
index 000000000..11312607e
--- /dev/null
+++ b/dsp/mpeg/idct.c
@@ -0,0 +1,101 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set et ft=c ts=4 sw=4 fenc=utf-8                                     :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/core/half.h"
+__static_yoink("pl_mpeg_notice");
+
+/**
+ * Computes Fixed-Point 8x8 Inverse Discrete Cosine Transform.
+ *
+ * @note discovered by Nasir Ahmed
+ */
+void plm_video_idct(int block[8][8]) {
+  int i, t1, t2, m0;
+  int b1, b3, b4, b6, b7;
+  int y3, y4, y5, y6, y7;
+  int x0, x1, x2, x3, x4;
+
+  for (i = 0; i < 8; ++i) {
+    b1 = block[4][i];
+    b3 = block[2][i] + block[6][i];
+    b4 = block[5][i] - block[3][i];
+    t1 = block[1][i] + block[7][i];
+    t2 = block[3][i] + block[5][i];
+    b6 = block[1][i] - block[7][i];
+    b7 = t1 + t2;
+    m0 = block[0][i];
+    x4 = ((b6 * 473 - b4 * 196 + 128) >> 8) - b7;
+    x0 = x4 - (((t1 - t2) * 362 + 128) >> 8);
+    x1 = m0 - b1;
+    x2 = (((block[2][i] - block[6][i]) * 362 + 128) >> 8) - b3;
+    x3 = m0 + b1;
+    y3 = x1 + x2;
+    y4 = x3 + b3;
+    y5 = x1 - x2;
+    y6 = x3 - b3;
+    y7 = -x0 - ((b4 * 473 + b6 * 196 + 128) >> 8);
+    block[0][i] = b7 + y4;
+    block[1][i] = x4 + y3;
+    block[2][i] = y5 - x0;
+    block[3][i] = y6 - y7;
+    block[4][i] = y6 + y7;
+    block[5][i] = x0 + y5;
+    block[6][i] = y3 - x4;
+    block[7][i] = y4 - b7;
+  }
+
+  for (i = 0; i < 8; ++i) {
+    b1 = block[i][4];
+    b3 = block[i][2] + block[i][6];
+    b4 = block[i][5] - block[i][3];
+    t1 = block[i][1] + block[i][7];
+    t2 = block[i][3] + block[i][5];
+    b6 = block[i][1] - block[i][7];
+    b7 = t1 + t2;
+    m0 = block[i][0];
+    x4 = ((b6 * 473 - b4 * 196 + 128) >> 8) - b7;
+    x0 = x4 - (((t1 - t2) * 362 + 128) >> 8);
+    x1 = m0 - b1;
+    x2 = (((block[i][2] - block[i][6]) * 362 + 128) >> 8) - b3;
+    x3 = m0 + b1;
+    y3 = x1 + x2;
+    y4 = x3 + b3;
+    y5 = x1 - x2;
+    y6 = x3 - b3;
+    y7 = -x0 - ((b4 * 473 + b6 * 196 + 128) >> 8);
+    block[i][0] = (b7 + y4 + 128) >> 8;
+    block[i][1] = (x4 + y3 + 128) >> 8;
+    block[i][2] = (y5 - x0 + 128) >> 8;
+    block[i][3] = (y6 - y7 + 128) >> 8;
+    block[i][4] = (y6 + y7 + 128) >> 8;
+    block[i][5] = (x0 + y5 + 128) >> 8;
+    block[i][6] = (y3 - x4 + 128) >> 8;
+    block[i][7] = (y4 - b7 + 128) >> 8;
+  }
+}
diff --git a/dsp/mpeg/idct.h b/dsp/mpeg/idct.h
new file mode 100644
index 000000000..1d16f8e38
--- /dev/null
+++ b/dsp/mpeg/idct.h
@@ -0,0 +1,8 @@
+#ifndef COSMOPOLITAN_DSP_MPEG_IDCT_H_
+#define COSMOPOLITAN_DSP_MPEG_IDCT_H_
+COSMOPOLITAN_C_START_
+
+void plm_video_idct(int *);
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_DSP_MPEG_IDCT_H_ */
diff --git a/dsp/mpeg/macroblock.c b/dsp/mpeg/macroblock.c
new file mode 100644
index 000000000..783f963bc
--- /dev/null
+++ b/dsp/mpeg/macroblock.c
@@ -0,0 +1,171 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set et ft=c ts=4 sw=4 fenc=utf-8                                     :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/mpeg/mpeg.h"
+#include "dsp/mpeg/video.h"
+#include "libc/log/check.h"
+
+forceinline void plm_video_process_macroblock(plm_video_t *self, uint8_t *d,
+                                              uint8_t *s, int motion_h,
+                                              int motion_v, bool interpolate,
+                                              unsigned BW) {
+  unsigned si, di, max_address;
+  int y, x, dest_scan, source_scan, dw, hp, vp, odd_h, odd_v;
+  dw = self->mb_width * BW;
+  hp = motion_h >> 1;
+  vp = motion_v >> 1;
+  odd_h = (motion_h & 1) == 1;
+  odd_v = (motion_v & 1) == 1;
+  si = ((self->mb_row * BW) + vp) * dw + (self->mb_col * BW) + hp;
+  di = (self->mb_row * dw + self->mb_col) * BW;
+  max_address = (dw * (self->mb_height * BW - BW + 1) - BW);
+  if (si > max_address || di > max_address)
+    return;
+  d += di;
+  s += si;
+  switch (((interpolate << 2) | (odd_h << 1) | (odd_v)) & 7) {
+    case 0:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          *d++ = *s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    case 1:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          *d++ = (s[0] + s[dw] + 1) >> 1;
+          s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    case 2:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          *d++ = (s[0] + s[1] + 1) >> 1;
+          s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    case 3:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          *d++ = (s[0] + s[1] + s[dw] + s[dw + 1] + 2) >> 2;
+          s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    case 4:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          d[0] = (d[0] + (s[0]) + 1) >> 1;
+          d++;
+          s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    case 5:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          d[0] = (d[0] + ((s[0] + s[dw] + 1) >> 1) + 1) >> 1;
+          d++;
+          s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    case 6:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          d[0] = (d[0] + ((s[0] + s[1] + 1) >> 1) + 1) >> 1;
+          d++;
+          s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    case 7:
+      dest_scan = dw - BW;
+      source_scan = dw - BW;
+      for (y = 0; y < BW; y++) {
+        for (x = 0; x < BW; x++) {
+          d[0] = (d[0] + ((s[0] + s[1] + s[dw] + s[dw + 1] + 2) >> 2) + 1) >> 1;
+          d++;
+          s++;
+        }
+        s += source_scan;
+        d += dest_scan;
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+void plm_video_process_macroblock_8(plm_video_t *self, uint8_t *d, uint8_t *s,
+                                    int motion_h, int motion_v,
+                                    bool interpolate) {
+  DCHECK_ALIGNED(8, d);
+  DCHECK_ALIGNED(8, s);
+  plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 8);
+}
+
+void plm_video_process_macroblock_16(plm_video_t *self, uint8_t *d, uint8_t *s,
+                                     int motion_h, int motion_v,
+                                     bool interpolate) {
+  DCHECK_ALIGNED(16, d);
+  DCHECK_ALIGNED(16, s);
+  plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 16);
+}
diff --git a/dsp/mpeg/mp2.c b/dsp/mpeg/mp2.c
new file mode 100644
index 000000000..53fc91a23
--- /dev/null
+++ b/dsp/mpeg/mp2.c
@@ -0,0 +1,769 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set noet ft=c ts=4 sw=4 fenc=utf-8                                   :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/mpeg/buffer.h"
+#include "dsp/mpeg/mpeg.h"
+#include "libc/log/log.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"
+
+/* clang-format off */
+// -----------------------------------------------------------------------------
+// plm_audio implementation
+
+// Based on kjmp2 by Martin J. Fiedler
+// http://keyj.emphy.de/kjmp2/
+
+#define PLM_AUDIO_FRAME_SYNC 0x7ff
+
+#define PLM_AUDIO_MPEG_2_5 0x0
+#define PLM_AUDIO_MPEG_2 0x2
+#define PLM_AUDIO_MPEG_1 0x3
+
+#define PLM_AUDIO_LAYER_III 0x1
+#define PLM_AUDIO_LAYER_II 0x2
+#define PLM_AUDIO_LAYER_I 0x3
+
+#define PLM_AUDIO_MODE_STEREO 0x0
+#define PLM_AUDIO_MODE_JOINT_STEREO 0x1
+#define PLM_AUDIO_MODE_DUAL_CHANNEL 0x2
+#define PLM_AUDIO_MODE_MONO 0x3
+
+static const unsigned short PLM_AUDIO_SAMPLE_RATE[] = {
+	44100, 48000, 32000, 0, // MPEG-1
+	22050, 24000, 16000, 0  // MPEG-2
+};
+
+static const short PLM_AUDIO_BIT_RATE[] = {
+	32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384, // MPEG-1
+	 8, 16, 24, 32, 40, 48,  56,  64,  80,  96, 112, 128, 144, 160  // MPEG-2
+};
+
+static const int PLM_AUDIO_SCALEFACTOR_BASE[] = {
+	0x02000000, 0x01965FEA, 0x01428A30
+};
+
+static const float PLM_AUDIO_SYNTHESIS_WINDOW[] = {
+	     0.0,     -0.5,     -0.5,     -0.5,     -0.5,     -0.5,
+	    -0.5,     -1.0,     -1.0,     -1.0,     -1.0,     -1.5,
+	    -1.5,     -2.0,     -2.0,     -2.5,     -2.5,     -3.0,
+	    -3.5,     -3.5,     -4.0,     -4.5,     -5.0,     -5.5,
+	    -6.5,     -7.0,     -8.0,     -8.5,     -9.5,    -10.5,
+	   -12.0,    -13.0,    -14.5,    -15.5,    -17.5,    -19.0,
+	   -20.5,    -22.5,    -24.5,    -26.5,    -29.0,    -31.5,
+	   -34.0,    -36.5,    -39.5,    -42.5,    -45.5,    -48.5,
+	   -52.0,    -55.5,    -58.5,    -62.5,    -66.0,    -69.5,
+	   -73.5,    -77.0,    -80.5,    -84.5,    -88.0,    -91.5,
+	   -95.0,    -98.0,   -101.0,   -104.0,    106.5,    109.0,
+	   111.0,    112.5,    113.5,    114.0,    114.0,    113.5,
+	   112.0,    110.5,    107.5,    104.0,    100.0,     94.5,
+	    88.5,     81.5,     73.0,     63.5,     53.0,     41.5,
+	    28.5,     14.5,     -1.0,    -18.0,    -36.0,    -55.5,
+	   -76.5,    -98.5,   -122.0,   -147.0,   -173.5,   -200.5,
+	  -229.5,   -259.5,   -290.5,   -322.5,   -355.5,   -389.5,
+	  -424.0,   -459.5,   -495.5,   -532.0,   -568.5,   -605.0,
+	  -641.5,   -678.0,   -714.0,   -749.0,   -783.5,   -817.0,
+	  -849.0,   -879.5,   -908.5,   -935.0,   -959.5,   -981.0,
+	 -1000.5,  -1016.0,  -1028.5,  -1037.5,  -1042.5,  -1043.5,
+	 -1040.0,  -1031.5,   1018.5,   1000.0,    976.0,    946.5,
+	   911.0,    869.5,    822.0,    767.5,    707.0,    640.0,
+	   565.5,    485.0,    397.0,    302.5,    201.0,     92.5,
+	   -22.5,   -144.0,   -272.5,   -407.0,   -547.5,   -694.0,
+	  -846.0,  -1003.0,  -1165.0,  -1331.5,  -1502.0,  -1675.5,
+	 -1852.5,  -2031.5,  -2212.5,  -2394.0,  -2576.5,  -2758.5,
+	 -2939.5,  -3118.5,  -3294.5,  -3467.5,  -3635.5,  -3798.5,
+	 -3955.0,  -4104.5,  -4245.5,  -4377.5,  -4499.0,  -4609.5,
+	 -4708.0,  -4792.5,  -4863.5,  -4919.0,  -4958.0,  -4979.5,
+	 -4983.0,  -4967.5,  -4931.5,  -4875.0,  -4796.0,  -4694.5,
+	 -4569.5,  -4420.0,  -4246.0,  -4046.0,  -3820.0,  -3567.0,
+	  3287.0,   2979.5,   2644.0,   2280.5,   1888.0,   1467.5,
+	  1018.5,    541.0,     35.0,   -499.0,  -1061.0,  -1650.0,
+	 -2266.5,  -2909.0,  -3577.0,  -4270.0,  -4987.5,  -5727.5,
+	 -6490.0,  -7274.0,  -8077.5,  -8899.5,  -9739.0, -10594.5,
+	-11464.5, -12347.0, -13241.0, -14144.5, -15056.0, -15973.5,
+	-16895.5, -17820.0, -18744.5, -19668.0, -20588.0, -21503.0,
+	-22410.5, -23308.5, -24195.0, -25068.5, -25926.5, -26767.0,
+	-27589.0, -28389.0, -29166.5, -29919.0, -30644.5, -31342.0,
+	-32009.5, -32645.0, -33247.0, -33814.5, -34346.0, -34839.5,
+	-35295.0, -35710.0, -36084.5, -36417.5, -36707.5, -36954.0,
+	-37156.5, -37315.0, -37428.0, -37496.0,  37519.0,  37496.0,
+	 37428.0,  37315.0,  37156.5,  36954.0,  36707.5,  36417.5,
+	 36084.5,  35710.0,  35295.0,  34839.5,  34346.0,  33814.5,
+	 33247.0,  32645.0,  32009.5,  31342.0,  30644.5,  29919.0,
+	 29166.5,  28389.0,  27589.0,  26767.0,  25926.5,  25068.5,
+	 24195.0,  23308.5,  22410.5,  21503.0,  20588.0,  19668.0,
+	 18744.5,  17820.0,  16895.5,  15973.5,  15056.0,  14144.5,
+	 13241.0,  12347.0,  11464.5,  10594.5,   9739.0,   8899.5,
+	  8077.5,   7274.0,   6490.0,   5727.5,   4987.5,   4270.0,
+	  3577.0,   2909.0,   2266.5,   1650.0,   1061.0,    499.0,
+	   -35.0,   -541.0,  -1018.5,  -1467.5,  -1888.0,  -2280.5,
+	 -2644.0,  -2979.5,   3287.0,   3567.0,   3820.0,   4046.0,
+	  4246.0,   4420.0,   4569.5,   4694.5,   4796.0,   4875.0,
+	  4931.5,   4967.5,   4983.0,   4979.5,   4958.0,   4919.0,
+	  4863.5,   4792.5,   4708.0,   4609.5,   4499.0,   4377.5,
+	  4245.5,   4104.5,   3955.0,   3798.5,   3635.5,   3467.5,
+	  3294.5,   3118.5,   2939.5,   2758.5,   2576.5,   2394.0,
+	  2212.5,   2031.5,   1852.5,   1675.5,   1502.0,   1331.5,
+	  1165.0,   1003.0,    846.0,    694.0,    547.5,    407.0,
+	   272.5,    144.0,     22.5,    -92.5,   -201.0,   -302.5,
+	  -397.0,   -485.0,   -565.5,   -640.0,   -707.0,   -767.5,
+	  -822.0,   -869.5,   -911.0,   -946.5,   -976.0,  -1000.0,
+	  1018.5,   1031.5,   1040.0,   1043.5,   1042.5,   1037.5,
+	  1028.5,   1016.0,   1000.5,    981.0,    959.5,    935.0,
+	   908.5,    879.5,    849.0,    817.0,    783.5,    749.0,
+	   714.0,    678.0,    641.5,    605.0,    568.5,    532.0,
+	   495.5,    459.5,    424.0,    389.5,    355.5,    322.5,
+	   290.5,    259.5,    229.5,    200.5,    173.5,    147.0,
+	   122.0,     98.5,     76.5,     55.5,     36.0,     18.0,
+		1.0,    -14.5,    -28.5,    -41.5,    -53.0,    -63.5,
+	   -73.0,    -81.5,    -88.5,    -94.5,   -100.0,   -104.0,
+	  -107.5,   -110.5,   -112.0,   -113.5,   -114.0,   -114.0,
+	  -113.5,   -112.5,   -111.0,   -109.0,    106.5,    104.0,
+	   101.0,     98.0,     95.0,     91.5,     88.0,     84.5,
+	    80.5,     77.0,     73.5,     69.5,     66.0,     62.5,
+	    58.5,     55.5,     52.0,     48.5,     45.5,     42.5,
+	    39.5,     36.5,     34.0,     31.5,     29.0,     26.5,
+	    24.5,     22.5,     20.5,     19.0,     17.5,     15.5,
+	    14.5,     13.0,     12.0,     10.5,      9.5,      8.5,
+	     8.0,      7.0,      6.5,      5.5,      5.0,      4.5,
+	     4.0,      3.5,      3.5,      3.0,      2.5,      2.5,
+	     2.0,      2.0,      1.5,      1.5,      1.0,      1.0,
+	     1.0,      1.0,      0.5,      0.5,      0.5,      0.5,
+	     0.5,      0.5
+};
+
+// Quantizer lookup, step 1: bitrate classes
+static const uint8_t PLM_AUDIO_QUANT_LUT_STEP_1[2][16] = {
+	// 32, 48, 56, 64, 80, 96,112,128,160,192,224,256,320,384 <- bitrate
+	{ 0,  0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2 }, // mono
+	// 16, 24, 28, 32, 40, 48, 56, 64, 80, 96,112,128,160,192 <- bitrate / chan
+	{ 0,  0,  0,  0,  0,  0,  1,  1,  1,  2,  2,  2,  2,  2 } // stereo
+};
+
+// Quantizer lookup, step 2: bitrate class, sample rate -> B2 table idx, sblimit
+static const uint8_t PLM_AUDIO_QUANT_TAB_A = (27 | 64);   // Table 3-B.2a: high-rate, sblimit = 27
+static const uint8_t PLM_AUDIO_QUANT_TAB_B = (30 | 64);   // Table 3-B.2b: high-rate, sblimit = 30
+static const uint8_t PLM_AUDIO_QUANT_TAB_C = 8;           // Table 3-B.2c:  low-rate, sblimit =  8
+static const uint8_t PLM_AUDIO_QUANT_TAB_D = 12;          // Table 3-B.2d:  low-rate, sblimit = 12
+
+static const uint8_t QUANT_LUT_STEP_2[3][3] = {
+	//             44.1 kHz,               48 kHz,                 32 kHz
+	{ PLM_AUDIO_QUANT_TAB_C, PLM_AUDIO_QUANT_TAB_C, PLM_AUDIO_QUANT_TAB_D }, // 32 - 48 kbit/sec/ch
+	{ PLM_AUDIO_QUANT_TAB_A, PLM_AUDIO_QUANT_TAB_A, PLM_AUDIO_QUANT_TAB_A }, // 56 - 80 kbit/sec/ch
+	{ PLM_AUDIO_QUANT_TAB_B, PLM_AUDIO_QUANT_TAB_A, PLM_AUDIO_QUANT_TAB_B }  // 96+	 kbit/sec/ch
+};
+
+// Quantizer lookup, step 3: B2 table, subband -> nbal, row index
+// (upper 4 bits: nbal, lower 4 bits: row index)
+static const uint8_t PLM_AUDIO_QUANT_LUT_STEP_3[3][32] = {
+	// Low-rate table (3-B.2c and 3-B.2d)
+	{
+		0x44,0x44,
+		0x34,0x34,0x34,0x34,0x34,0x34,0x34,0x34,0x34,0x34
+	},
+	// High-rate table (3-B.2a and 3-B.2b)
+	{
+		0x43,0x43,0x43,
+		0x42,0x42,0x42,0x42,0x42,0x42,0x42,0x42,
+		0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,
+		0x20,0x20,0x20,0x20,0x20,0x20,0x20
+	},
+	// MPEG-2 LSR table (B.2 in ISO 13818-3)
+	{
+		0x45,0x45,0x45,0x45,
+		0x34,0x34,0x34,0x34,0x34,0x34,0x34,
+		0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,
+		0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24
+	}
+};
+
+// Quantizer lookup, step 4: table row, allocation[] value -> quant table index
+static const uint8_t PLM_AUDIO_QUANT_LUT_STEP4[6][16] = {
+	{ 0, 1, 2, 17 },
+	{ 0, 1, 2,  3, 4, 5, 6, 17 },
+	{ 0, 1, 2,  3, 4, 5, 6,  7,  8,  9, 10, 11, 12, 13, 14, 17 },
+	{ 0, 1, 3,  5, 6, 7, 8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
+	{ 0, 1, 2,  4, 5, 6, 7,  8,  9, 10, 11, 12, 13, 14, 15, 17 },
+	{ 0, 1, 2,  3, 4, 5, 6,  7,  8,  9, 10, 11, 12, 13, 14, 15 }
+};
+
+typedef struct plm_quantizer_spec_t {
+	unsigned short levels;
+	unsigned char group;
+	unsigned char bits;
+} plm_quantizer_spec_t;
+
+static const plm_quantizer_spec_t PLM_AUDIO_QUANT_TAB[] = {
+	{     3, 1,  5 },  //  1
+	{     5, 1,  7 },  //  2
+	{     7, 0,  3 },  //  3
+	{     9, 1, 10 },  //  4
+	{    15, 0,  4 },  //  5
+	{    31, 0,  5 },  //  6
+	{    63, 0,  6 },  //  7
+	{   127, 0,  7 },  //  8
+	{   255, 0,  8 },  //  9
+	{   511, 0,  9 },  // 10
+	{  1023, 0, 10 },  // 11
+	{  2047, 0, 11 },  // 12
+	{  4095, 0, 12 },  // 13
+	{  8191, 0, 13 },  // 14
+	{ 16383, 0, 14 },  // 15
+	{ 32767, 0, 15 },  // 16
+	{ 65535, 0, 16 }   // 17
+};
+
+struct plm_audio_t {
+	double time;
+	int samples_decoded;
+	int samplerate_index;
+	int bitrate_index;
+	int version;
+	int layer;
+	int mode;
+	int bound;
+	int v_pos;
+	int next_frame_data_size;
+	plm_buffer_t *buffer;
+	int destroy_buffer_when_done;
+	const plm_quantizer_spec_t *allocation[2][32];
+	uint8_t scale_factor_info[2][32];
+	int scale_factor[2][32][3];
+	int sample[2][32][3];
+	plm_samples_t samples;
+	float D[1024];
+	float V[1024];
+	float U[32];
+} forcealign(64);
+
+typedef plm_audio_t plm_audio_t;
+
+int plm_audio_decode_header(plm_audio_t *self);
+void plm_audio_decode_frame(plm_audio_t *self);
+const plm_quantizer_spec_t *plm_audio_read_allocation(plm_audio_t *self, int sb, int tab3);
+void plm_audio_read_samples(plm_audio_t *self, int ch, int sb, int part);
+void plm_audio_matrix_transform(int s[32][3], int ss, float *d, int dp);
+
+plm_audio_t *plm_audio_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done) {
+	plm_audio_t *self = (plm_audio_t *)memalign(_Alignof(plm_audio_t), sizeof(plm_audio_t));
+	memset(self, 0, sizeof(plm_audio_t));
+
+	self->samples.count = PLM_AUDIO_SAMPLES_PER_FRAME;
+	self->buffer = buffer;
+	self->destroy_buffer_when_done = destroy_when_done;
+	self->samplerate_index = 3; // indicates 0 samplerate
+
+	memcpy(self->D, PLM_AUDIO_SYNTHESIS_WINDOW, 512 * sizeof(float));
+	memcpy(self->D + 512, PLM_AUDIO_SYNTHESIS_WINDOW, 512 * sizeof(float));
+
+	// Decode first header
+	if (plm_buffer_has(self->buffer, 48)) {
+		self->next_frame_data_size = plm_audio_decode_header(self);
+	}
+
+	return self;
+}
+
+void plm_audio_destroy(plm_audio_t *self) {
+	if (self->destroy_buffer_when_done) {
+		plm_buffer_destroy(self->buffer);
+	}
+	free(self);
+}
+
+int plm_audio_get_samplerate(plm_audio_t *self) {
+	return PLM_AUDIO_SAMPLE_RATE[self->samplerate_index];
+}
+
+double plm_audio_get_time(plm_audio_t *self) {
+	return self->time;
+}
+
+void plm_audio_rewind(plm_audio_t *self) {
+	plm_buffer_rewind(self->buffer);
+	self->time = 0;
+	self->samples_decoded = 0;
+	self->next_frame_data_size = 0;
+
+	// TODO: needed?
+	memset(self->V, 0, sizeof(self->V));
+	memset(self->U, 0, sizeof(self->U));
+}
+
+plm_samples_t *plm_audio_decode(plm_audio_t *self) {
+	DEBUGF("%s", "plm_audio_decode");
+	// Do we have at least enough information to decode the frame header?
+	if (!self->next_frame_data_size) {
+		if (!plm_buffer_has(self->buffer, 48)) {
+			return NULL;
+		}
+		self->next_frame_data_size = plm_audio_decode_header(self);
+	}
+
+	if (
+		self->next_frame_data_size == 0 ||
+		!plm_buffer_has(self->buffer, self->next_frame_data_size << 3)
+	) {
+		return NULL;
+	}
+
+	plm_audio_decode_frame(self);
+	self->next_frame_data_size = 0;
+
+	self->samples.time = self->time;
+
+	self->samples_decoded += PLM_AUDIO_SAMPLES_PER_FRAME;
+	self->time = (double)self->samples_decoded /
+		(double)PLM_AUDIO_SAMPLE_RATE[self->samplerate_index];
+
+	return &self->samples;
+}
+
+int plm_audio_decode_header(plm_audio_t *self) {
+	// Check for valid header: syncword OK, MPEG-Audio Layer 2
+	plm_buffer_skip_bytes(self->buffer, 0x00);
+
+	int sync = plm_buffer_read(self->buffer, 11);
+	self->version = plm_buffer_read(self->buffer, 2);
+	self->layer = plm_buffer_read(self->buffer, 2);
+	int hasCRC = !plm_buffer_read(self->buffer, 1);
+
+	if (
+		sync != PLM_AUDIO_FRAME_SYNC ||
+		self->version != PLM_AUDIO_MPEG_1 ||
+		self->layer != PLM_AUDIO_LAYER_II
+	) {
+		return false; // Invalid header or unsupported version
+	}
+
+	self->bitrate_index = plm_buffer_read(self->buffer, 4) - 1;
+	if (self->bitrate_index > 13) {
+		return false;  // Invalid bit rate or 'free format'
+	}
+
+	self->samplerate_index = plm_buffer_read(self->buffer, 2);
+	if (self->samplerate_index == 3) {
+		return false; // Invalid sample rate
+	}
+
+	if (self->version == PLM_AUDIO_MPEG_2) {
+		self->samplerate_index += 4;
+		self->bitrate_index += 14;
+	}
+	int padding = plm_buffer_read(self->buffer, 1);
+	plm_buffer_skip(self->buffer, 1); // f_private
+	self->mode = plm_buffer_read(self->buffer, 2);
+
+	// Parse the mode_extension, set up the stereo bound
+	self->bound = 0;
+	if (self->mode == PLM_AUDIO_MODE_JOINT_STEREO) {
+		self->bound = (plm_buffer_read(self->buffer, 2) + 1) << 2;
+	}
+	else {
+		plm_buffer_skip(self->buffer, 2);
+		self->bound = (self->mode == PLM_AUDIO_MODE_MONO) ? 0 : 32;
+	}
+
+	// Discard the last 4 bits of the header and the CRC value, if present
+	plm_buffer_skip(self->buffer, 4);
+	if (hasCRC) {
+		plm_buffer_skip(self->buffer, 16);
+	}
+
+	// Compute frame size, check if we have enough data to decode the whole
+	// frame.
+	int bitrate = PLM_AUDIO_BIT_RATE[self->bitrate_index];
+	int samplerate = PLM_AUDIO_SAMPLE_RATE[self->samplerate_index];
+	int frame_size = (144000 * bitrate / samplerate) + padding;
+	return frame_size - (hasCRC ? 6 : 4);
+}
+
+void plm_audio_decode_frame(plm_audio_t *self) {
+	// Prepare the quantizer table lookups
+	int tab3 = 0;
+	int sblimit = 0;
+	if (self->version == PLM_AUDIO_MPEG_2) {
+		// MPEG-2 (LSR)
+		tab3 = 2;
+		sblimit = 30;
+	}
+	else {
+		// MPEG-1
+		int tab1 = (self->mode == PLM_AUDIO_MODE_MONO) ? 0 : 1;
+		int tab2 = PLM_AUDIO_QUANT_LUT_STEP_1[tab1][self->bitrate_index];
+		tab3 = QUANT_LUT_STEP_2[tab2][self->samplerate_index];
+		sblimit = tab3 & 63;
+		tab3 >>= 6;
+	}
+
+	if (self->bound > sblimit) {
+		self->bound = sblimit;
+	}
+
+	// Read the allocation information
+	for (int sb = 0; sb < self->bound; sb++) {
+		self->allocation[0][sb] = plm_audio_read_allocation(self, sb, tab3);
+		self->allocation[1][sb] = plm_audio_read_allocation(self, sb, tab3);
+	}
+
+	for (int sb = self->bound; sb < sblimit; sb++) {
+		self->allocation[0][sb] =
+			self->allocation[1][sb] =
+			plm_audio_read_allocation(self, sb, tab3);
+	}
+
+	// Read scale factor selector information
+	int channels = (self->mode == PLM_AUDIO_MODE_MONO) ? 1 : 2;
+	for (int sb = 0; sb < sblimit; sb++) {
+		for (int ch = 0; ch < channels; ch++) {
+			if (self->allocation[ch][sb]) {
+				self->scale_factor_info[ch][sb] = plm_buffer_read(self->buffer, 2);
+			}
+		}
+		if (self->mode == PLM_AUDIO_MODE_MONO) {
+			self->scale_factor_info[1][sb] = self->scale_factor_info[0][sb];
+		}
+	}
+
+	// Read scale factors
+	for (int sb = 0; sb < sblimit; sb++) {
+		for (int ch = 0; ch < channels; ch++) {
+			if (self->allocation[ch][sb]) {
+				int *sf = self->scale_factor[ch][sb];
+				switch (self->scale_factor_info[ch][sb]) {
+				case 0:
+					sf[0] = plm_buffer_read(self->buffer, 6);
+					sf[1] = plm_buffer_read(self->buffer, 6);
+					sf[2] = plm_buffer_read(self->buffer, 6);
+					break;
+				case 1:
+					sf[0] =
+						sf[1] = plm_buffer_read(self->buffer, 6);
+					sf[2] = plm_buffer_read(self->buffer, 6);
+					break;
+				case 2:
+					sf[0] =
+						sf[1] =
+						sf[2] = plm_buffer_read(self->buffer, 6);
+					break;
+				case 3:
+					sf[0] = plm_buffer_read(self->buffer, 6);
+					sf[1] =
+						sf[2] = plm_buffer_read(self->buffer, 6);
+					break;
+				}
+			}
+		}
+		if (self->mode == PLM_AUDIO_MODE_MONO) {
+			self->scale_factor[1][sb][0] = self->scale_factor[0][sb][0];
+			self->scale_factor[1][sb][1] = self->scale_factor[0][sb][1];
+			self->scale_factor[1][sb][2] = self->scale_factor[0][sb][2];
+		}
+	}
+
+	// Coefficient input and reconstruction
+	int out_pos = 0;
+	for (int part = 0; part < 3; part++) {
+		for (int granule = 0; granule < 4; granule++) {
+
+			// Read the samples
+			for (int sb = 0; sb < self->bound; sb++) {
+				plm_audio_read_samples(self, 0, sb, part);
+				plm_audio_read_samples(self, 1, sb, part);
+			}
+			for (int sb = self->bound; sb < sblimit; sb++) {
+				plm_audio_read_samples(self, 0, sb, part);
+				self->sample[1][sb][0] = self->sample[0][sb][0];
+				self->sample[1][sb][1] = self->sample[0][sb][1];
+				self->sample[1][sb][2] = self->sample[0][sb][2];
+			}
+			for (int sb = sblimit; sb < 32; sb++) {
+				self->sample[0][sb][0] = 0;
+				self->sample[0][sb][1] = 0;
+				self->sample[0][sb][2] = 0;
+				self->sample[1][sb][0] = 0;
+				self->sample[1][sb][1] = 0;
+				self->sample[1][sb][2] = 0;
+			}
+
+			// Synthesis loop
+			for (int p = 0; p < 3; p++) {
+				// Shifting step
+				self->v_pos = (self->v_pos - 64) & 1023;
+
+				for (int ch = 0; ch < 2; ch++) {
+					plm_audio_matrix_transform(self->sample[ch], p, self->V, self->v_pos);
+
+					// Build U, windowing, calculate output
+					memset(self->U, 0, sizeof(self->U));
+
+					int d_index = 512 - (self->v_pos >> 1);
+					int v_index = (self->v_pos % 128) >> 1;
+					while (v_index < 1024) {
+						for (int i = 0; i < 32; ++i) {
+							self->U[i] += self->D[d_index++] * self->V[v_index++];
+						}
+
+						v_index += 128 - 32;
+						d_index += 64 - 32;
+					}
+
+					d_index -= (512 - 32);
+					v_index = (128 - 32 + 1024) - v_index;
+					while (v_index < 1024) {
+						for (int i = 0; i < 32; ++i) {
+							self->U[i] += self->D[d_index++] * self->V[v_index++];
+						}
+
+						v_index += 128 - 32;
+						d_index += 64 - 32;
+					}
+
+					// Output samples
+					#ifdef PLM_AUDIO_SEPARATE_CHANNELS
+						float *out_channel = ch == 0
+							? self->samples.left
+							: self->samples.right;
+						for (int j = 0; j < 32; j++) {
+							out_channel[out_pos + j] = self->U[j] / 2147418112.0f;
+						}
+					#else
+						for (int j = 0; j < 32; j++) {
+							self->samples.interleaved[((out_pos + j) << 1) + ch] =
+								self->U[j] / 2147418112.0f;
+						}
+					#endif
+				} // End of synthesis channel loop
+				out_pos += 32;
+			} // End of synthesis sub-block loop
+
+		} // Decoding of the granule finished
+	}
+
+	plm_buffer_align(self->buffer);
+}
+
+const plm_quantizer_spec_t *plm_audio_read_allocation(plm_audio_t *self, int sb, int tab3) {
+	int tab4 = PLM_AUDIO_QUANT_LUT_STEP_3[tab3][sb];
+	int qtab = PLM_AUDIO_QUANT_LUT_STEP4[tab4 & 15][plm_buffer_read(self->buffer, tab4 >> 4)];
+	return qtab ? (&PLM_AUDIO_QUANT_TAB[qtab - 1]) : 0;
+}
+
+void plm_audio_read_samples(plm_audio_t *self, int ch, int sb, int part) {
+	const plm_quantizer_spec_t *q = self->allocation[ch][sb];
+	int sf = self->scale_factor[ch][sb][part];
+	int *sample = self->sample[ch][sb];
+	int val = 0;
+
+	if (!q) {
+		// No bits allocated for this subband
+		sample[0] = sample[1] = sample[2] = 0;
+		return;
+	}
+
+	// Resolve scalefactor
+	if (sf == 63) {
+		sf = 0;
+	}
+	else {
+		int shift = (sf / 3) | 0;
+		sf = (PLM_AUDIO_SCALEFACTOR_BASE[sf % 3] + ((1u << shift) >> 1)) >> shift;
+	}
+
+	// Decode samples
+	int adj = q->levels;
+	if (q->group) {
+		// Decode grouped samples
+		val = plm_buffer_read(self->buffer, q->bits);
+		sample[0] = val % adj;
+		val /= adj;
+		sample[1] = val % adj;
+		sample[2] = val / adj;
+	}
+	else {
+		// Decode direct samples
+		sample[0] = plm_buffer_read(self->buffer, q->bits);
+		sample[1] = plm_buffer_read(self->buffer, q->bits);
+		sample[2] = plm_buffer_read(self->buffer, q->bits);
+	}
+
+	// Postmultiply samples
+	int scale = 65536 / (adj + 1);
+	adj = ((adj + 1) >> 1) - 1;
+
+	val = (adj - sample[0]) * scale;
+	sample[0] = (val * (sf >> 12) + ((val * (sf & 4095) + 2048) >> 12)) >> 12;
+
+	val = (adj - sample[1]) * scale;
+	sample[1] = (val * (sf >> 12) + ((val * (sf & 4095) + 2048) >> 12)) >> 12;
+
+	val = (adj - sample[2]) * scale;
+	sample[2] = (val * (sf >> 12) + ((val * (sf & 4095) + 2048) >> 12)) >> 12;
+}
+
+void plm_audio_matrix_transform(int s[32][3], int ss, float *d, int dp) {
+	float t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12,
+		t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23, t24,
+		t25, t26, t27, t28, t29, t30, t31, t32, t33;
+
+	t01 = (float)(s[0][ss] + s[31][ss]); t02 = (float)(s[0][ss] - s[31][ss]) * 0.500602998235f;
+	t03 = (float)(s[1][ss] + s[30][ss]); t04 = (float)(s[1][ss] - s[30][ss]) * 0.505470959898f;
+	t05 = (float)(s[2][ss] + s[29][ss]); t06 = (float)(s[2][ss] - s[29][ss]) * 0.515447309923f;
+	t07 = (float)(s[3][ss] + s[28][ss]); t08 = (float)(s[3][ss] - s[28][ss]) * 0.53104259109f;
+	t09 = (float)(s[4][ss] + s[27][ss]); t10 = (float)(s[4][ss] - s[27][ss]) * 0.553103896034f;
+	t11 = (float)(s[5][ss] + s[26][ss]); t12 = (float)(s[5][ss] - s[26][ss]) * 0.582934968206f;
+	t13 = (float)(s[6][ss] + s[25][ss]); t14 = (float)(s[6][ss] - s[25][ss]) * 0.622504123036f;
+	t15 = (float)(s[7][ss] + s[24][ss]); t16 = (float)(s[7][ss] - s[24][ss]) * 0.674808341455f;
+	t17 = (float)(s[8][ss] + s[23][ss]); t18 = (float)(s[8][ss] - s[23][ss]) * 0.744536271002f;
+	t19 = (float)(s[9][ss] + s[22][ss]); t20 = (float)(s[9][ss] - s[22][ss]) * 0.839349645416f;
+	t21 = (float)(s[10][ss] + s[21][ss]); t22 = (float)(s[10][ss] - s[21][ss]) * 0.972568237862f;
+	t23 = (float)(s[11][ss] + s[20][ss]); t24 = (float)(s[11][ss] - s[20][ss]) * 1.16943993343f;
+	t25 = (float)(s[12][ss] + s[19][ss]); t26 = (float)(s[12][ss] - s[19][ss]) * 1.48416461631f;
+	t27 = (float)(s[13][ss] + s[18][ss]); t28 = (float)(s[13][ss] - s[18][ss]) * 2.05778100995f;
+	t29 = (float)(s[14][ss] + s[17][ss]); t30 = (float)(s[14][ss] - s[17][ss]) * 3.40760841847f;
+	t31 = (float)(s[15][ss] + s[16][ss]); t32 = (float)(s[15][ss] - s[16][ss]) * 10.1900081235f;
+
+	t33 = t01 + t31; t31 = (t01 - t31) * 0.502419286188f;
+	t01 = t03 + t29; t29 = (t03 - t29) * 0.52249861494f;
+	t03 = t05 + t27; t27 = (t05 - t27) * 0.566944034816f;
+	t05 = t07 + t25; t25 = (t07 - t25) * 0.64682178336f;
+	t07 = t09 + t23; t23 = (t09 - t23) * 0.788154623451f;
+	t09 = t11 + t21; t21 = (t11 - t21) * 1.06067768599f;
+	t11 = t13 + t19; t19 = (t13 - t19) * 1.72244709824f;
+	t13 = t15 + t17; t17 = (t15 - t17) * 5.10114861869f;
+	t15 = t33 + t13; t13 = (t33 - t13) * 0.509795579104f;
+	t33 = t01 + t11; t01 = (t01 - t11) * 0.601344886935f;
+	t11 = t03 + t09; t09 = (t03 - t09) * 0.899976223136f;
+	t03 = t05 + t07; t07 = (t05 - t07) * 2.56291544774f;
+	t05 = t15 + t03; t15 = (t15 - t03) * 0.541196100146f;
+	t03 = t33 + t11; t11 = (t33 - t11) * 1.30656296488f;
+	t33 = t05 + t03; t05 = (t05 - t03) * 0.707106781187f;
+	t03 = t15 + t11; t15 = (t15 - t11) * 0.707106781187f;
+	t03 += t15;
+	t11 = t13 + t07; t13 = (t13 - t07) * 0.541196100146f;
+	t07 = t01 + t09; t09 = (t01 - t09) * 1.30656296488f;
+	t01 = t11 + t07; t07 = (t11 - t07) * 0.707106781187f;
+	t11 = t13 + t09; t13 = (t13 - t09) * 0.707106781187f;
+	t11 += t13; t01 += t11;
+	t11 += t07; t07 += t13;
+	t09 = t31 + t17; t31 = (t31 - t17) * 0.509795579104f;
+	t17 = t29 + t19; t29 = (t29 - t19) * 0.601344886935f;
+	t19 = t27 + t21; t21 = (t27 - t21) * 0.899976223136f;
+	t27 = t25 + t23; t23 = (t25 - t23) * 2.56291544774f;
+	t25 = t09 + t27; t09 = (t09 - t27) * 0.541196100146f;
+	t27 = t17 + t19; t19 = (t17 - t19) * 1.30656296488f;
+	t17 = t25 + t27; t27 = (t25 - t27) * 0.707106781187f;
+	t25 = t09 + t19; t19 = (t09 - t19) * 0.707106781187f;
+	t25 += t19;
+	t09 = t31 + t23; t31 = (t31 - t23) * 0.541196100146f;
+	t23 = t29 + t21; t21 = (t29 - t21) * 1.30656296488f;
+	t29 = t09 + t23; t23 = (t09 - t23) * 0.707106781187f;
+	t09 = t31 + t21; t31 = (t31 - t21) * 0.707106781187f;
+	t09 += t31;	t29 += t09;	t09 += t23;	t23 += t31;
+	t17 += t29;	t29 += t25;	t25 += t09;	t09 += t27;
+	t27 += t23;	t23 += t19; t19 += t31;
+	t21 = t02 + t32; t02 = (t02 - t32) * 0.502419286188f;
+	t32 = t04 + t30; t04 = (t04 - t30) * 0.52249861494f;
+	t30 = t06 + t28; t28 = (t06 - t28) * 0.566944034816f;
+	t06 = t08 + t26; t08 = (t08 - t26) * 0.64682178336f;
+	t26 = t10 + t24; t10 = (t10 - t24) * 0.788154623451f;
+	t24 = t12 + t22; t22 = (t12 - t22) * 1.06067768599f;
+	t12 = t14 + t20; t20 = (t14 - t20) * 1.72244709824f;
+	t14 = t16 + t18; t16 = (t16 - t18) * 5.10114861869f;
+	t18 = t21 + t14; t14 = (t21 - t14) * 0.509795579104f;
+	t21 = t32 + t12; t32 = (t32 - t12) * 0.601344886935f;
+	t12 = t30 + t24; t24 = (t30 - t24) * 0.899976223136f;
+	t30 = t06 + t26; t26 = (t06 - t26) * 2.56291544774f;
+	t06 = t18 + t30; t18 = (t18 - t30) * 0.541196100146f;
+	t30 = t21 + t12; t12 = (t21 - t12) * 1.30656296488f;
+	t21 = t06 + t30; t30 = (t06 - t30) * 0.707106781187f;
+	t06 = t18 + t12; t12 = (t18 - t12) * 0.707106781187f;
+	t06 += t12;
+	t18 = t14 + t26; t26 = (t14 - t26) * 0.541196100146f;
+	t14 = t32 + t24; t24 = (t32 - t24) * 1.30656296488f;
+	t32 = t18 + t14; t14 = (t18 - t14) * 0.707106781187f;
+	t18 = t26 + t24; t24 = (t26 - t24) * 0.707106781187f;
+	t18 += t24; t32 += t18;
+	t18 += t14; t26 = t14 + t24;
+	t14 = t02 + t16; t02 = (t02 - t16) * 0.509795579104f;
+	t16 = t04 + t20; t04 = (t04 - t20) * 0.601344886935f;
+	t20 = t28 + t22; t22 = (t28 - t22) * 0.899976223136f;
+	t28 = t08 + t10; t10 = (t08 - t10) * 2.56291544774f;
+	t08 = t14 + t28; t14 = (t14 - t28) * 0.541196100146f;
+	t28 = t16 + t20; t20 = (t16 - t20) * 1.30656296488f;
+	t16 = t08 + t28; t28 = (t08 - t28) * 0.707106781187f;
+	t08 = t14 + t20; t20 = (t14 - t20) * 0.707106781187f;
+	t08 += t20;
+	t14 = t02 + t10; t02 = (t02 - t10) * 0.541196100146f;
+	t10 = t04 + t22; t22 = (t04 - t22) * 1.30656296488f;
+	t04 = t14 + t10; t10 = (t14 - t10) * 0.707106781187f;
+	t14 = t02 + t22; t02 = (t02 - t22) * 0.707106781187f;
+	t14 += t02;	t04 += t14;	t14 += t10;	t10 += t02;
+	t16 += t04;	t04 += t08;	t08 += t14;	t14 += t28;
+	t28 += t10;	t10 += t20;	t20 += t02;	t21 += t16;
+	t16 += t32;	t32 += t04;	t04 += t06;	t06 += t08;
+	t08 += t18;	t18 += t14;	t14 += t30;	t30 += t28;
+	t28 += t26;	t26 += t10;	t10 += t12;	t12 += t20;
+	t20 += t24;	t24 += t02;
+
+	d[dp + 48] = -t33;
+	d[dp + 49] = d[dp + 47] = -t21;
+	d[dp + 50] = d[dp + 46] = -t17;
+	d[dp + 51] = d[dp + 45] = -t16;
+	d[dp + 52] = d[dp + 44] = -t01;
+	d[dp + 53] = d[dp + 43] = -t32;
+	d[dp + 54] = d[dp + 42] = -t29;
+	d[dp + 55] = d[dp + 41] = -t04;
+	d[dp + 56] = d[dp + 40] = -t03;
+	d[dp + 57] = d[dp + 39] = -t06;
+	d[dp + 58] = d[dp + 38] = -t25;
+	d[dp + 59] = d[dp + 37] = -t08;
+	d[dp + 60] = d[dp + 36] = -t11;
+	d[dp + 61] = d[dp + 35] = -t18;
+	d[dp + 62] = d[dp + 34] = -t09;
+	d[dp + 63] = d[dp + 33] = -t14;
+	d[dp + 32] = -t05;
+	d[dp + 0] = t05; d[dp + 31] = -t30;
+	d[dp + 1] = t30; d[dp + 30] = -t27;
+	d[dp + 2] = t27; d[dp + 29] = -t28;
+	d[dp + 3] = t28; d[dp + 28] = -t07;
+	d[dp + 4] = t07; d[dp + 27] = -t26;
+	d[dp + 5] = t26; d[dp + 26] = -t23;
+	d[dp + 6] = t23; d[dp + 25] = -t10;
+	d[dp + 7] = t10; d[dp + 24] = -t15;
+	d[dp + 8] = t15; d[dp + 23] = -t12;
+	d[dp + 9] = t12; d[dp + 22] = -t19;
+	d[dp + 10] = t19; d[dp + 21] = -t20;
+	d[dp + 11] = t20; d[dp + 20] = -t13;
+	d[dp + 12] = t13; d[dp + 19] = -t24;
+	d[dp + 13] = t24; d[dp + 18] = -t31;
+	d[dp + 14] = t31; d[dp + 17] = -t02;
+	d[dp + 15] = t02; d[dp + 16] = 0.0;
+};
+
diff --git a/dsp/mpeg/mpeg.h b/dsp/mpeg/mpeg.h
new file mode 100644
index 000000000..f49ed953b
--- /dev/null
+++ b/dsp/mpeg/mpeg.h
@@ -0,0 +1,447 @@
+#ifndef COSMOPOLITAN_DSP_MPEG_MPEG_H_
+#define COSMOPOLITAN_DSP_MPEG_MPEG_H_
+#include "libc/stdio/stdio.h"
+COSMOPOLITAN_C_START_
+
+typedef struct plm_t plm_t;
+typedef struct plm_buffer_t plm_buffer_t;
+typedef struct plm_demux_t plm_demux_t;
+typedef struct plm_video_t plm_video_t;
+typedef struct plm_audio_t plm_audio_t;
+
+/**
+ * Demuxed MPEG PS packet
+ *
+ * The type maps directly to the various MPEG-PES start codes. pts is
+ * the presentation time stamp of the packet in seconds. Not all packets
+ * have a pts value.
+ */
+typedef struct plm_packet_t {
+  int type;
+  double pts;
+  size_t length;
+  uint8_t *data;
+} plm_packet_t;
+
+/**
+ * Decoded Video Plane
+ *
+ * The byte length of the data is width * height. Note that different
+ * planes have different sizes: the Luma plane (Y) is double the size of
+ * each of the two Chroma planes (Cr, Cb) - i.e. 4 times the byte
+ * length. Also note that the size of the plane does *not* denote the
+ * size of the displayed frame. The sizes of planes are always rounded
+ * up to the nearest macroblock (16px).
+ */
+typedef struct plm_plane_t {
+  unsigned int width;
+  unsigned int height;
+  uint8_t *data;
+} plm_plane_t;
+
+/**
+ * Decoded Video Frame
+ *
+ * Width and height denote the desired display size of the frame. This
+ * may be different from the internal size of the 3 planes.
+ */
+typedef struct plm_frame_t {
+  double time;
+  unsigned int width;
+  unsigned int height;
+  plm_plane_t y;
+  plm_plane_t cr;
+  plm_plane_t cb;
+} plm_frame_t;
+
+/**
+ * Callback function type for decoded video frames used by the high-level
+ * plm_* interface
+ */
+typedef void (*plm_video_decode_callback)(plm_t *self, plm_frame_t *frame,
+                                          void *user);
+
+/**
+ * Decoded Audio Samples
+ *
+ * Samples are stored as normalized (-1, 1) float either interleaved, or if
+ * PLM_AUDIO_SEPARATE_CHANNELS is defined, in two separate arrays.
+ * The `count` is always PLM_AUDIO_SAMPLES_PER_FRAME and just there for
+ * convenience.
+ */
+#define PLM_AUDIO_SAMPLES_PER_FRAME 1152
+
+struct plm_samples_t {
+  double time;
+  unsigned int count;
+#ifdef PLM_AUDIO_SEPARATE_CHANNELS
+  float left[PLM_AUDIO_SAMPLES_PER_FRAME] forcealign(32);
+  float right[PLM_AUDIO_SAMPLES_PER_FRAME] forcealign(32);
+#else
+  float interleaved[PLM_AUDIO_SAMPLES_PER_FRAME * 2] forcealign(32);
+#endif
+} forcealign(32);
+
+typedef struct plm_samples_t plm_samples_t;
+
+/**
+ * Callback function type for decoded audio samples used by the high-level
+ * plm_* interface
+ */
+typedef void (*plm_audio_decode_callback)(plm_t *self, plm_samples_t *samples,
+                                          void *user);
+
+/**
+ * Callback function for plm_buffer when it needs more data
+ */
+typedef void (*plm_buffer_load_callback)(plm_buffer_t *self, void *user);
+
+/**
+ * -----------------------------------------------------------------------------
+ * plm_* public API
+ * High-Level API for loading/demuxing/decoding MPEG-PS data
+ *
+ * Create a plmpeg instance with a filename. Returns NULL if the file could not
+ * be opened.
+ */
+plm_t *plm_create_with_filename(const char *filename);
+
+/**
+ * Create a plmpeg instance with file handle. Pass true to close_when_done
+ * to let plmpeg call fclose() on the handle when plm_destroy() is
+ * called.
+ */
+plm_t *plm_create_with_file(FILE *fh, int close_when_done);
+
+/**
+ * Create a plmpeg instance with pointer to memory as source. This assumes the
+ * whole file is in memory. Pass true to free_when_done to let plmpeg call
+ * free() on the pointer when plm_destroy() is called.
+ */
+plm_t *plm_create_with_memory(uint8_t *bytes, size_t length,
+                              int free_when_done);
+
+/**
+ * Create a plmpeg instance with a plm_buffer as source. This is also
+ * called internally by all the above constructor functions.
+ */
+plm_t *plm_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done);
+
+/**
+ * Destroy a plmpeg instance and free all data
+ */
+void plm_destroy(plm_t *self);
+
+/**
+ * Get or set whether video decoding is enabled.
+ */
+int plm_get_video_enabled(plm_t *self);
+void plm_set_video_enabled(plm_t *self, int enabled);
+
+/**
+ * Get or set whether audio decoding is enabled. When enabling, you can set the
+ * desired audio stream (0-3) to decode.
+ */
+int plm_get_audio_enabled(plm_t *self);
+void plm_set_audio_enabled(plm_t *self, int enabled, int stream_index);
+
+/**
+ * Get the display width/height of the video stream
+ */
+int plm_get_width(plm_t *self);
+int plm_get_height(plm_t *self);
+
+double plm_get_pixel_aspect_ratio(plm_t *);
+
+/**
+ * Get the framerate of the video stream in frames per second
+ */
+double plm_get_framerate(plm_t *self);
+
+/**
+ * Get the number of available audio streams in the file
+ */
+int plm_get_num_audio_streams(plm_t *self);
+
+/**
+ * Get the samplerate of the audio stream in samples per second
+ */
+int plm_get_samplerate(plm_t *self);
+
+/**
+ * Get or set the audio lead time in seconds - the time in which audio samples
+ * are decoded in advance (or behind) the video decode time. Default 0.
+ */
+double plm_get_audio_lead_time(plm_t *self);
+void plm_set_audio_lead_time(plm_t *self, double lead_time);
+
+/**
+ * Get the current internal time in seconds
+ */
+double plm_get_time(plm_t *self);
+
+/**
+ * Rewind all buffers back to the beginning.
+ */
+void plm_rewind(plm_t *self);
+
+/**
+ * Get or set looping. Default false.
+ */
+int plm_get_loop(plm_t *self);
+void plm_set_loop(plm_t *self, int loop);
+
+/**
+ * Get whether the file has ended. If looping is enabled, this will always
+ * return false.
+ */
+int plm_has_ended(plm_t *self);
+
+/**
+ * Set the callback for decoded video frames used with plm_decode(). If no
+ * callback is set, video data will be ignored and not be decoded.
+ */
+void plm_set_video_decode_callback(plm_t *self, plm_video_decode_callback fp,
+                                   void *user);
+
+/**
+ * Set the callback for decoded audio samples used with plm_decode(). If no
+ * callback is set, audio data will be ignored and not be decoded.
+ */
+void plm_set_audio_decode_callback(plm_t *self, plm_audio_decode_callback fp,
+                                   void *user);
+
+/**
+ * Advance the internal timer by seconds and decode video/audio up to
+ * this time. Returns true/false whether anything was decoded.
+ */
+int plm_decode(plm_t *self, double seconds);
+
+/**
+ * Decode and return one video frame. Returns NULL if no frame could be decoded
+ * (either because the source ended or data is corrupt). If you only want to
+ * decode video, you should disable audio via plm_set_audio_enabled().
+ * The returned plm_frame_t is valid until the next call to
+ * plm_decode_video call or until the plm_destroy is called.
+ */
+plm_frame_t *plm_decode_video(plm_t *self);
+
+/**
+ * Decode and return one audio frame. Returns NULL if no frame could be decoded
+ * (either because the source ended or data is corrupt). If you only want to
+ * decode audio, you should disable video via plm_set_video_enabled().
+ * The returned plm_samples_t is valid until the next call to
+ * plm_decode_video or until the plm_destroy is called.
+ */
+plm_samples_t *plm_decode_audio(plm_t *self);
+
+/* -----------------------------------------------------------------------------
+ * plm_buffer public API
+ * Provides the data source for all other plm_* interfaces
+ *
+ * The default size for buffers created from files or by the high-level API
+ */
+#ifndef PLM_BUFFER_DEFAULT_SIZE
+#define PLM_BUFFER_DEFAULT_SIZE (128 * 1024)
+#endif
+
+/**
+ * Create a buffer instance with a filename. Returns NULL if the file could not
+ * be opened.
+ */
+plm_buffer_t *plm_buffer_create_with_filename(const char *filename);
+
+/**
+ * Create a buffer instance with file handle. Pass true to close_when_done
+ * to let plmpeg call fclose() on the handle when plm_destroy() is
+ * called.
+ */
+plm_buffer_t *plm_buffer_create_with_file(FILE *fh, int close_when_done);
+
+/**
+ * Create a buffer instance with a pointer to memory as source. This assumes
+ * the whole file is in memory. Pass 1 to free_when_done to let plmpeg call
+ * free() on the pointer when plm_destroy() is called.
+ */
+plm_buffer_t *plm_buffer_create_with_memory(uint8_t *bytes, size_t length,
+                                            int free_when_done);
+
+/**
+ * Create an empty buffer with an initial capacity. The buffer will grow
+ * as needed.
+ */
+plm_buffer_t *plm_buffer_create_with_capacity(size_t capacity);
+
+/**
+ * Destroy a buffer instance and free all data
+ */
+void plm_buffer_destroy(plm_buffer_t *self);
+
+/**
+ * Copy data into the buffer. If the data to be written is larger than the
+ * available space, the buffer will realloc() with a larger capacity.
+ * Returns the number of bytes written. This will always be the same as the
+ * passed in length, except when the buffer was created _with_memory() for
+ * which _write() is forbidden.
+ */
+size_t plm_buffer_write(plm_buffer_t *self, uint8_t *bytes, size_t length);
+
+/**
+ * Set a callback that is called whenever the buffer needs more data
+ */
+void plm_buffer_set_load_callback(plm_buffer_t *self,
+                                  plm_buffer_load_callback fp, void *user);
+
+/**
+ * Rewind the buffer back to the beginning. When loading from a file handle,
+ * this also seeks to the beginning of the file.
+ */
+void plm_buffer_rewind(plm_buffer_t *self);
+
+/**
+ * -----------------------------------------------------------------------------
+ * plm_demux public API
+ * Demux an MPEG Program Stream (PS) data into separate packages
+ *
+ * Various Packet Types
+ */
+#define PLM_DEMUX_PACKET_PRIVATE 0xBD
+#define PLM_DEMUX_PACKET_AUDIO_1 0xC0
+#define PLM_DEMUX_PACKET_AUDIO_2 0xC1
+#define PLM_DEMUX_PACKET_AUDIO_3 0xC2
+#define PLM_DEMUX_PACKET_AUDIO_4 0xC2
+#define PLM_DEMUX_PACKET_VIDEO_1 0xE0
+
+/**
+ * Create a demuxer with a plm_buffer as source
+ */
+plm_demux_t *plm_demux_create(plm_buffer_t *buffer, int destroy_when_done);
+
+/**
+ * Destroy a demuxer and free all data
+ */
+void plm_demux_destroy(plm_demux_t *self);
+
+/**
+ * Returns the number of video streams found in the system header.
+ */
+int plm_demux_get_num_video_streams(plm_demux_t *self);
+
+/**
+ * Returns the number of audio streams found in the system header.
+ */
+int plm_demux_get_num_audio_streams(plm_demux_t *self);
+
+/**
+ * Rewinds the internal buffer. See plm_buffer_rewind().
+ */
+void plm_demux_rewind(plm_demux_t *self);
+
+/**
+ * Decode and return the next packet. The returned packet_t is valid until
+ * the next call to plm_demux_decode() or until the demuxer is destroyed.
+ */
+plm_packet_t *plm_demux_decode(plm_demux_t *self);
+
+/* -----------------------------------------------------------------------------
+ * plm_video public API
+ * Decode MPEG1 Video ("mpeg1") data into raw YCrCb frames
+ */
+
+/**
+ * Create a video decoder with a plm_buffer as source
+ */
+plm_video_t *plm_video_create_with_buffer(plm_buffer_t *buffer,
+                                          int destroy_when_done);
+
+/**
+ * Destroy a video decoder and free all data
+ */
+void plm_video_destroy(plm_video_t *self);
+
+/**
+ * Get the framerate in frames per second
+ */
+double plm_video_get_framerate(plm_video_t *);
+
+double plm_video_get_pixel_aspect_ratio(plm_video_t *);
+
+/**
+ * Get the display width/height
+ */
+int plm_video_get_width(plm_video_t *);
+int plm_video_get_height(plm_video_t *);
+
+/**
+ * Set "no delay" mode. When enabled, the decoder assumes that the video does
+ * *not* contain any B-Frames. This is useful for reducing lag when streaming.
+ */
+void plm_video_set_no_delay(plm_video_t *self, int no_delay);
+
+/**
+ * Get the current internal time in seconds
+ */
+double plm_video_get_time(plm_video_t *self);
+
+/**
+ * Rewinds the internal buffer. See plm_buffer_rewind().
+ */
+void plm_video_rewind(plm_video_t *self);
+
+/**
+ * Decode and return one frame of video and advance the internal time by
+ * 1/framerate seconds. The returned frame_t is valid until the next call of
+ * plm_video_decode() or until the video decoder is destroyed.
+ */
+plm_frame_t *plm_video_decode(plm_video_t *self);
+
+/**
+ * Convert the YCrCb data of a frame into an interleaved RGB buffer. The buffer
+ * pointed to by *rgb must have a size of (frame->width * frame->height * 3)
+ * bytes.
+ */
+void plm_frame_to_rgb(plm_frame_t *frame, uint8_t *rgb);
+
+/* -----------------------------------------------------------------------------
+ * plm_audio public API
+ * Decode MPEG-1 Audio Layer II ("mp2") data into raw samples
+ */
+
+/**
+ * Create an audio decoder with a plm_buffer as source.
+ */
+plm_audio_t *plm_audio_create_with_buffer(plm_buffer_t *buffer,
+                                          int destroy_when_done);
+
+/**
+ * Destroy an audio decoder and free all data
+ */
+void plm_audio_destroy(plm_audio_t *self);
+
+/**
+ * Get the samplerate in samples per second
+ */
+int plm_audio_get_samplerate(plm_audio_t *self);
+
+/**
+ * Get the current internal time in seconds
+ */
+double plm_audio_get_time(plm_audio_t *self);
+
+/**
+ * Rewinds the internal buffer. See plm_buffer_rewind().
+ */
+void plm_audio_rewind(plm_audio_t *self);
+
+/**
+ * Decode and return one "frame" of audio and advance the internal time by
+ * (PLM_AUDIO_SAMPLES_PER_FRAME/samplerate) seconds. The returned samples_t
+ * is valid until the next call of plm_audio_decode() or until the audio
+ * decoder is destroyed.
+ */
+plm_samples_t *plm_audio_decode(plm_audio_t *self);
+
+extern long plmpegdecode_latency_;
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_DSP_MPEG_MPEG_H_ */
diff --git a/dsp/mpeg/mpeg1.c b/dsp/mpeg/mpeg1.c
new file mode 100644
index 000000000..5b9eb0b82
--- /dev/null
+++ b/dsp/mpeg/mpeg1.c
@@ -0,0 +1,1110 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set et ft=c ts=4 sw=4 fenc=utf-8                                     :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/mpeg/blockset.h"
+#include "dsp/mpeg/buffer.h"
+#include "dsp/mpeg/idct.h"
+#include "dsp/mpeg/mpeg.h"
+#include "dsp/mpeg/video.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/fmt/conv.h"
+#include "libc/log/log.h"
+#include "libc/macros.internal.h"
+#include "libc/math.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"
+#include "libc/time.h"
+#include "libc/x/x.h"
+__static_yoink("pl_mpeg_notice");
+
+// -----------------------------------------------------------------------------
+// plm_video implementation
+
+// Inspired by Java MPEG-1 Video Decoder and Player by Zoltan Korandi
+// https://sourceforge.net/projects/javampeg1video/
+
+#define GETCONST(ARRAY, DEFAULT)
+
+static const int PLM_VIDEO_PICTURE_TYPE_INTRA = 1;
+static const int PLM_VIDEO_PICTURE_TYPE_PREDICTIVE = 2;
+static const int PLM_VIDEO_PICTURE_TYPE_B = 3;
+
+static const int PLM_START_SEQUENCE = 0xB3;
+static const int PLM_START_SLICE_FIRST = 0x01;
+static const int PLM_START_SLICE_LAST = 0xAF;
+static const int PLM_START_PICTURE = 0x00;
+static const int PLM_START_EXTENSION = 0xB5;
+static const int PLM_START_USER_DATA = 0xB2;
+
+static const float PLM_VIDEO_PIXEL_ASPECT_RATIO[] = {
+    1.0000, /* square pixels */
+    0.6735, /* 3:4? */
+    0.7031, /* MPEG-1 / MPEG-2 video encoding divergence? */
+    0.7615, 0.8055, 0.8437, 0.8935, 0.9157, 0.9815,
+    1.0255, 1.0695, 1.0950, 1.1575, 1.2051,
+};
+
+static const float PLM_VIDEO_PICTURE_RATE[] = {
+    23.976, /* NTSC-Film */
+    24.000, /* NTSC-Film (enriched for foreign nations) */
+    25.000, /* PAL (Britain, Africa, China, etc.) */
+    29.970, /* NTSC */
+    30.000, /* NTSC (enriched for foreign nations) */
+    50.000, /* PAL? */
+    59.940, /* NTSC-Wow */
+    60.000  /* NTSC-Wow (enriched for foreign nations) */
+};
+
+static const uint8_t PLM_VIDEO_ZIG_ZAG[] = /* clang-format off */ {
+     0,  1,  8, 16,  9,  2,  3, 10,
+	17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+	27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+	29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+	53, 60, 61, 54, 47, 55, 62, 63,
+} /* clang-format on */;
+
+static const uint8_t PLM_VIDEO_INTRAQUANT_MATRIX[] = /* clang-format off */ {
+     8, 16, 19, 22, 26, 27, 29, 34,
+	16, 16, 22, 24, 27, 29, 34, 37,
+    19, 22, 26, 27, 29, 34, 34, 38,
+	22, 22, 26, 27, 29, 34, 37, 40,
+    22, 26, 27, 29, 32, 35, 40, 48,
+	26, 27, 29, 32, 35, 40, 48, 58,
+    26, 27, 29, 34, 38, 46, 56, 69,
+	27, 29, 35, 38, 46, 56, 69, 83,
+} /* clang-format on */;
+
+static const uint8_t PLM_VIDEO_NONINTRAQUANT_MATRIX[] = /* clang-format off */ {
+    16, 16, 16, 16, 16, 16, 16, 16,
+	16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16,
+	16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16,
+	16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16,
+	16, 16, 16, 16, 16, 16, 16, 16,
+} /* clang-format on */;
+
+static const uint8_t PLM_VIDEO_PREMULTIPLIER_MATRIX[] = /* clang-format off */ {
+    32, 44, 42, 38, 32, 25, 17,  9,
+	44, 62, 58, 52, 44, 35, 24, 12,
+    42, 58, 55, 49, 42, 33, 23, 12,
+	38, 52, 49, 44, 38, 30, 20, 10,
+    32, 44, 42, 38, 32, 25, 17,  9,
+	25, 35, 33, 30, 25, 20, 14,  7,
+    17, 24, 23, 20, 17, 14,  9,  5,
+	 9, 12, 12, 10,  9,  7,  5,  2,
+} /* clang-format on */;
+
+static const plm_vlc_t PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT[] = {
+    {1 << 1, 0},  {0, 1},        //   0: x
+    {2 << 1, 0},  {3 << 1, 0},   //   1: 0x
+    {4 << 1, 0},  {5 << 1, 0},   //   2: 00x
+    {0, 3},       {0, 2},        //   3: 01x
+    {6 << 1, 0},  {7 << 1, 0},   //   4: 000x
+    {0, 5},       {0, 4},        //   5: 001x
+    {8 << 1, 0},  {9 << 1, 0},   //   6: 0000x
+    {0, 7},       {0, 6},        //   7: 0001x
+    {10 << 1, 0}, {11 << 1, 0},  //   8: 0000 0x
+    {12 << 1, 0}, {13 << 1, 0},  //   9: 0000 1x
+    {14 << 1, 0}, {15 << 1, 0},  //  10: 0000 00x
+    {16 << 1, 0}, {17 << 1, 0},  //  11: 0000 01x
+    {18 << 1, 0}, {19 << 1, 0},  //  12: 0000 10x
+    {0, 9},       {0, 8},        //  13: 0000 11x
+    {-1, 0},      {20 << 1, 0},  //  14: 0000 000x
+    {-1, 0},      {21 << 1, 0},  //  15: 0000 001x
+    {22 << 1, 0}, {23 << 1, 0},  //  16: 0000 010x
+    {0, 15},      {0, 14},       //  17: 0000 011x
+    {0, 13},      {0, 12},       //  18: 0000 100x
+    {0, 11},      {0, 10},       //  19: 0000 101x
+    {24 << 1, 0}, {25 << 1, 0},  //  20: 0000 0001x
+    {26 << 1, 0}, {27 << 1, 0},  //  21: 0000 0011x
+    {28 << 1, 0}, {29 << 1, 0},  //  22: 0000 0100x
+    {30 << 1, 0}, {31 << 1, 0},  //  23: 0000 0101x
+    {32 << 1, 0}, {-1, 0},       //  24: 0000 0001 0x
+    {-1, 0},      {33 << 1, 0},  //  25: 0000 0001 1x
+    {34 << 1, 0}, {35 << 1, 0},  //  26: 0000 0011 0x
+    {36 << 1, 0}, {37 << 1, 0},  //  27: 0000 0011 1x
+    {38 << 1, 0}, {39 << 1, 0},  //  28: 0000 0100 0x
+    {0, 21},      {0, 20},       //  29: 0000 0100 1x
+    {0, 19},      {0, 18},       //  30: 0000 0101 0x
+    {0, 17},      {0, 16},       //  31: 0000 0101 1x
+    {0, 35},      {-1, 0},       //  32: 0000 0001 00x
+    {-1, 0},      {0, 34},       //  33: 0000 0001 11x
+    {0, 33},      {0, 32},       //  34: 0000 0011 00x
+    {0, 31},      {0, 30},       //  35: 0000 0011 01x
+    {0, 29},      {0, 28},       //  36: 0000 0011 10x
+    {0, 27},      {0, 26},       //  37: 0000 0011 11x
+    {0, 25},      {0, 24},       //  38: 0000 0100 00x
+    {0, 23},      {0, 22},       //  39: 0000 0100 01x
+};
+
+static const plm_vlc_t PLM_VIDEO_MACROBLOCK_TYPE_INTRA[] = {
+    {1 << 1, 0},
+    {0, 0x01},  //   0: x
+    {-1, 0},
+    {0, 0x11},  //   1: 0x
+};
+
+static const plm_vlc_t PLM_VIDEO_MACROBLOCK_TYPE_PREDICTIVE[] = {
+    {1 << 1, 0}, {0, 0x0a},    //   0: x
+    {2 << 1, 0}, {0, 0x02},    //   1: 0x
+    {3 << 1, 0}, {0, 0x08},    //   2: 00x
+    {4 << 1, 0}, {5 << 1, 0},  //   3: 000x
+    {6 << 1, 0}, {0, 0x12},    //   4: 0000x
+    {0, 0x1a},   {0, 0x01},    //   5: 0001x
+    {-1, 0},     {0, 0x11},    //   6: 0000 0x
+};
+
+static const plm_vlc_t PLM_VIDEO_MACROBLOCK_TYPE_B[] = {
+    {1 << 1, 0}, {2 << 1, 0},   //   0: x
+    {3 << 1, 0}, {4 << 1, 0},   //   1: 0x
+    {0, 0x0c},   {0, 0x0e},     //   2: 1x
+    {5 << 1, 0}, {6 << 1, 0},   //   3: 00x
+    {0, 0x04},   {0, 0x06},     //   4: 01x
+    {7 << 1, 0}, {8 << 1, 0},   //   5: 000x
+    {0, 0x08},   {0, 0x0a},     //   6: 001x
+    {9 << 1, 0}, {10 << 1, 0},  //   7: 0000x
+    {0, 0x1e},   {0, 0x01},     //   8: 0001x
+    {-1, 0},     {0, 0x11},     //   9: 0000 0x
+    {0, 0x16},   {0, 0x1a},     //  10: 0000 1x
+};
+
+static const plm_vlc_t PLM_VIDEO_CODE_BLOCK_PATTERN[] = {
+    {1 << 1, 0},  {2 << 1, 0},   //   0: x
+    {3 << 1, 0},  {4 << 1, 0},   //   1: 0x
+    {5 << 1, 0},  {6 << 1, 0},   //   2: 1x
+    {7 << 1, 0},  {8 << 1, 0},   //   3: 00x
+    {9 << 1, 0},  {10 << 1, 0},  //   4: 01x
+    {11 << 1, 0}, {12 << 1, 0},  //   5: 10x
+    {13 << 1, 0}, {0, 60},       //   6: 11x
+    {14 << 1, 0}, {15 << 1, 0},  //   7: 000x
+    {16 << 1, 0}, {17 << 1, 0},  //   8: 001x
+    {18 << 1, 0}, {19 << 1, 0},  //   9: 010x
+    {20 << 1, 0}, {21 << 1, 0},  //  10: 011x
+    {22 << 1, 0}, {23 << 1, 0},  //  11: 100x
+    {0, 32},      {0, 16},       //  12: 101x
+    {0, 8},       {0, 4},        //  13: 110x
+    {24 << 1, 0}, {25 << 1, 0},  //  14: 0000x
+    {26 << 1, 0}, {27 << 1, 0},  //  15: 0001x
+    {28 << 1, 0}, {29 << 1, 0},  //  16: 0010x
+    {30 << 1, 0}, {31 << 1, 0},  //  17: 0011x
+    {0, 62},      {0, 2},        //  18: 0100x
+    {0, 61},      {0, 1},        //  19: 0101x
+    {0, 56},      {0, 52},       //  20: 0110x
+    {0, 44},      {0, 28},       //  21: 0111x
+    {0, 40},      {0, 20},       //  22: 1000x
+    {0, 48},      {0, 12},       //  23: 1001x
+    {32 << 1, 0}, {33 << 1, 0},  //  24: 0000 0x
+    {34 << 1, 0}, {35 << 1, 0},  //  25: 0000 1x
+    {36 << 1, 0}, {37 << 1, 0},  //  26: 0001 0x
+    {38 << 1, 0}, {39 << 1, 0},  //  27: 0001 1x
+    {40 << 1, 0}, {41 << 1, 0},  //  28: 0010 0x
+    {42 << 1, 0}, {43 << 1, 0},  //  29: 0010 1x
+    {0, 63},      {0, 3},        //  30: 0011 0x
+    {0, 36},      {0, 24},       //  31: 0011 1x
+    {44 << 1, 0}, {45 << 1, 0},  //  32: 0000 00x
+    {46 << 1, 0}, {47 << 1, 0},  //  33: 0000 01x
+    {48 << 1, 0}, {49 << 1, 0},  //  34: 0000 10x
+    {50 << 1, 0}, {51 << 1, 0},  //  35: 0000 11x
+    {52 << 1, 0}, {53 << 1, 0},  //  36: 0001 00x
+    {54 << 1, 0}, {55 << 1, 0},  //  37: 0001 01x
+    {56 << 1, 0}, {57 << 1, 0},  //  38: 0001 10x
+    {58 << 1, 0}, {59 << 1, 0},  //  39: 0001 11x
+    {0, 34},      {0, 18},       //  40: 0010 00x
+    {0, 10},      {0, 6},        //  41: 0010 01x
+    {0, 33},      {0, 17},       //  42: 0010 10x
+    {0, 9},       {0, 5},        //  43: 0010 11x
+    {-1, 0},      {60 << 1, 0},  //  44: 0000 000x
+    {61 << 1, 0}, {62 << 1, 0},  //  45: 0000 001x
+    {0, 58},      {0, 54},       //  46: 0000 010x
+    {0, 46},      {0, 30},       //  47: 0000 011x
+    {0, 57},      {0, 53},       //  48: 0000 100x
+    {0, 45},      {0, 29},       //  49: 0000 101x
+    {0, 38},      {0, 26},       //  50: 0000 110x
+    {0, 37},      {0, 25},       //  51: 0000 111x
+    {0, 43},      {0, 23},       //  52: 0001 000x
+    {0, 51},      {0, 15},       //  53: 0001 001x
+    {0, 42},      {0, 22},       //  54: 0001 010x
+    {0, 50},      {0, 14},       //  55: 0001 011x
+    {0, 41},      {0, 21},       //  56: 0001 100x
+    {0, 49},      {0, 13},       //  57: 0001 101x
+    {0, 35},      {0, 19},       //  58: 0001 110x
+    {0, 11},      {0, 7},        //  59: 0001 111x
+    {0, 39},      {0, 27},       //  60: 0000 0001x
+    {0, 59},      {0, 55},       //  61: 0000 0010x
+    {0, 47},      {0, 31},       //  62: 0000 0011x
+};
+
+static const plm_vlc_t PLM_VIDEO_MOTION[] = {
+    {1 << 1, 0},  {0, 0},        //   0: x
+    {2 << 1, 0},  {3 << 1, 0},   //   1: 0x
+    {4 << 1, 0},  {5 << 1, 0},   //   2: 00x
+    {0, 1},       {0, -1},       //   3: 01x
+    {6 << 1, 0},  {7 << 1, 0},   //   4: 000x
+    {0, 2},       {0, -2},       //   5: 001x
+    {8 << 1, 0},  {9 << 1, 0},   //   6: 0000x
+    {0, 3},       {0, -3},       //   7: 0001x
+    {10 << 1, 0}, {11 << 1, 0},  //   8: 0000 0x
+    {12 << 1, 0}, {13 << 1, 0},  //   9: 0000 1x
+    {-1, 0},      {14 << 1, 0},  //  10: 0000 00x
+    {15 << 1, 0}, {16 << 1, 0},  //  11: 0000 01x
+    {17 << 1, 0}, {18 << 1, 0},  //  12: 0000 10x
+    {0, 4},       {0, -4},       //  13: 0000 11x
+    {-1, 0},      {19 << 1, 0},  //  14: 0000 001x
+    {20 << 1, 0}, {21 << 1, 0},  //  15: 0000 010x
+    {0, 7},       {0, -7},       //  16: 0000 011x
+    {0, 6},       {0, -6},       //  17: 0000 100x
+    {0, 5},       {0, -5},       //  18: 0000 101x
+    {22 << 1, 0}, {23 << 1, 0},  //  19: 0000 0011x
+    {24 << 1, 0}, {25 << 1, 0},  //  20: 0000 0100x
+    {26 << 1, 0}, {27 << 1, 0},  //  21: 0000 0101x
+    {28 << 1, 0}, {29 << 1, 0},  //  22: 0000 0011 0x
+    {30 << 1, 0}, {31 << 1, 0},  //  23: 0000 0011 1x
+    {32 << 1, 0}, {33 << 1, 0},  //  24: 0000 0100 0x
+    {0, 10},      {0, -10},      //  25: 0000 0100 1x
+    {0, 9},       {0, -9},       //  26: 0000 0101 0x
+    {0, 8},       {0, -8},       //  27: 0000 0101 1x
+    {0, 16},      {0, -16},      //  28: 0000 0011 00x
+    {0, 15},      {0, -15},      //  29: 0000 0011 01x
+    {0, 14},      {0, -14},      //  30: 0000 0011 10x
+    {0, 13},      {0, -13},      //  31: 0000 0011 11x
+    {0, 12},      {0, -12},      //  32: 0000 0100 00x
+    {0, 11},      {0, -11},      //  33: 0000 0100 01x
+};
+
+static const plm_vlc_t PLM_VIDEO_DCT_SIZE_LUMINANCE[] = {
+    {1 << 1, 0}, {2 << 1, 0},  //   0: x
+    {0, 1},      {0, 2},       //   1: 0x
+    {3 << 1, 0}, {4 << 1, 0},  //   2: 1x
+    {0, 0},      {0, 3},       //   3: 10x
+    {0, 4},      {5 << 1, 0},  //   4: 11x
+    {0, 5},      {6 << 1, 0},  //   5: 111x
+    {0, 6},      {7 << 1, 0},  //   6: 1111x
+    {0, 7},      {8 << 1, 0},  //   7: 1111 1x
+    {0, 8},      {-1, 0},      //   8: 1111 11x
+};
+
+static const plm_vlc_t PLM_VIDEO_DCT_SIZE_CHROMINANCE[] = {
+    {1 << 1, 0}, {2 << 1, 0},  //   0: x
+    {0, 0},      {0, 1},       //   1: 0x
+    {0, 2},      {3 << 1, 0},  //   2: 1x
+    {0, 3},      {4 << 1, 0},  //   3: 11x
+    {0, 4},      {5 << 1, 0},  //   4: 111x
+    {0, 5},      {6 << 1, 0},  //   5: 1111x
+    {0, 6},      {7 << 1, 0},  //   6: 1111 1x
+    {0, 7},      {8 << 1, 0},  //   7: 1111 11x
+    {0, 8},      {-1, 0},      //   8: 1111 111x
+};
+
+//  dct_coeff bitmap:
+//    0xff00  run
+//    0x00ff  level
+
+//  Decoded values are unsigned. Sign bit follows in the stream.
+
+static const plm_vlc_uint_t PLM_VIDEO_DCT_COEFF[] = {
+    {1 << 1, 0},   {0, 0x0001},    //   0: x
+    {2 << 1, 0},   {3 << 1, 0},    //   1: 0x
+    {4 << 1, 0},   {5 << 1, 0},    //   2: 00x
+    {6 << 1, 0},   {0, 0x0101},    //   3: 01x
+    {7 << 1, 0},   {8 << 1, 0},    //   4: 000x
+    {9 << 1, 0},   {10 << 1, 0},   //   5: 001x
+    {0, 0x0002},   {0, 0x0201},    //   6: 010x
+    {11 << 1, 0},  {12 << 1, 0},   //   7: 0000x
+    {13 << 1, 0},  {14 << 1, 0},   //   8: 0001x
+    {15 << 1, 0},  {0, 0x0003},    //   9: 0010x
+    {0, 0x0401},   {0, 0x0301},    //  10: 0011x
+    {16 << 1, 0},  {0, 0xffff},    //  11: 0000 0x
+    {17 << 1, 0},  {18 << 1, 0},   //  12: 0000 1x
+    {0, 0x0701},   {0, 0x0601},    //  13: 0001 0x
+    {0, 0x0102},   {0, 0x0501},    //  14: 0001 1x
+    {19 << 1, 0},  {20 << 1, 0},   //  15: 0010 0x
+    {21 << 1, 0},  {22 << 1, 0},   //  16: 0000 00x
+    {0, 0x0202},   {0, 0x0901},    //  17: 0000 10x
+    {0, 0x0004},   {0, 0x0801},    //  18: 0000 11x
+    {23 << 1, 0},  {24 << 1, 0},   //  19: 0010 00x
+    {25 << 1, 0},  {26 << 1, 0},   //  20: 0010 01x
+    {27 << 1, 0},  {28 << 1, 0},   //  21: 0000 000x
+    {29 << 1, 0},  {30 << 1, 0},   //  22: 0000 001x
+    {0, 0x0d01},   {0, 0x0006},    //  23: 0010 000x
+    {0, 0x0c01},   {0, 0x0b01},    //  24: 0010 001x
+    {0, 0x0302},   {0, 0x0103},    //  25: 0010 010x
+    {0, 0x0005},   {0, 0x0a01},    //  26: 0010 011x
+    {31 << 1, 0},  {32 << 1, 0},   //  27: 0000 0000x
+    {33 << 1, 0},  {34 << 1, 0},   //  28: 0000 0001x
+    {35 << 1, 0},  {36 << 1, 0},   //  29: 0000 0010x
+    {37 << 1, 0},  {38 << 1, 0},   //  30: 0000 0011x
+    {39 << 1, 0},  {40 << 1, 0},   //  31: 0000 0000 0x
+    {41 << 1, 0},  {42 << 1, 0},   //  32: 0000 0000 1x
+    {43 << 1, 0},  {44 << 1, 0},   //  33: 0000 0001 0x
+    {45 << 1, 0},  {46 << 1, 0},   //  34: 0000 0001 1x
+    {0, 0x1001},   {0, 0x0502},    //  35: 0000 0010 0x
+    {0, 0x0007},   {0, 0x0203},    //  36: 0000 0010 1x
+    {0, 0x0104},   {0, 0x0f01},    //  37: 0000 0011 0x
+    {0, 0x0e01},   {0, 0x0402},    //  38: 0000 0011 1x
+    {47 << 1, 0},  {48 << 1, 0},   //  39: 0000 0000 00x
+    {49 << 1, 0},  {50 << 1, 0},   //  40: 0000 0000 01x
+    {51 << 1, 0},  {52 << 1, 0},   //  41: 0000 0000 10x
+    {53 << 1, 0},  {54 << 1, 0},   //  42: 0000 0000 11x
+    {55 << 1, 0},  {56 << 1, 0},   //  43: 0000 0001 00x
+    {57 << 1, 0},  {58 << 1, 0},   //  44: 0000 0001 01x
+    {59 << 1, 0},  {60 << 1, 0},   //  45: 0000 0001 10x
+    {61 << 1, 0},  {62 << 1, 0},   //  46: 0000 0001 11x
+    {-1, 0},       {63 << 1, 0},   //  47: 0000 0000 000x
+    {64 << 1, 0},  {65 << 1, 0},   //  48: 0000 0000 001x
+    {66 << 1, 0},  {67 << 1, 0},   //  49: 0000 0000 010x
+    {68 << 1, 0},  {69 << 1, 0},   //  50: 0000 0000 011x
+    {70 << 1, 0},  {71 << 1, 0},   //  51: 0000 0000 100x
+    {72 << 1, 0},  {73 << 1, 0},   //  52: 0000 0000 101x
+    {74 << 1, 0},  {75 << 1, 0},   //  53: 0000 0000 110x
+    {76 << 1, 0},  {77 << 1, 0},   //  54: 0000 0000 111x
+    {0, 0x000b},   {0, 0x0802},    //  55: 0000 0001 000x
+    {0, 0x0403},   {0, 0x000a},    //  56: 0000 0001 001x
+    {0, 0x0204},   {0, 0x0702},    //  57: 0000 0001 010x
+    {0, 0x1501},   {0, 0x1401},    //  58: 0000 0001 011x
+    {0, 0x0009},   {0, 0x1301},    //  59: 0000 0001 100x
+    {0, 0x1201},   {0, 0x0105},    //  60: 0000 0001 101x
+    {0, 0x0303},   {0, 0x0008},    //  61: 0000 0001 110x
+    {0, 0x0602},   {0, 0x1101},    //  62: 0000 0001 111x
+    {78 << 1, 0},  {79 << 1, 0},   //  63: 0000 0000 0001x
+    {80 << 1, 0},  {81 << 1, 0},   //  64: 0000 0000 0010x
+    {82 << 1, 0},  {83 << 1, 0},   //  65: 0000 0000 0011x
+    {84 << 1, 0},  {85 << 1, 0},   //  66: 0000 0000 0100x
+    {86 << 1, 0},  {87 << 1, 0},   //  67: 0000 0000 0101x
+    {88 << 1, 0},  {89 << 1, 0},   //  68: 0000 0000 0110x
+    {90 << 1, 0},  {91 << 1, 0},   //  69: 0000 0000 0111x
+    {0, 0x0a02},   {0, 0x0902},    //  70: 0000 0000 1000x
+    {0, 0x0503},   {0, 0x0304},    //  71: 0000 0000 1001x
+    {0, 0x0205},   {0, 0x0107},    //  72: 0000 0000 1010x
+    {0, 0x0106},   {0, 0x000f},    //  73: 0000 0000 1011x
+    {0, 0x000e},   {0, 0x000d},    //  74: 0000 0000 1100x
+    {0, 0x000c},   {0, 0x1a01},    //  75: 0000 0000 1101x
+    {0, 0x1901},   {0, 0x1801},    //  76: 0000 0000 1110x
+    {0, 0x1701},   {0, 0x1601},    //  77: 0000 0000 1111x
+    {92 << 1, 0},  {93 << 1, 0},   //  78: 0000 0000 0001 0x
+    {94 << 1, 0},  {95 << 1, 0},   //  79: 0000 0000 0001 1x
+    {96 << 1, 0},  {97 << 1, 0},   //  80: 0000 0000 0010 0x
+    {98 << 1, 0},  {99 << 1, 0},   //  81: 0000 0000 0010 1x
+    {100 << 1, 0}, {101 << 1, 0},  //  82: 0000 0000 0011 0x
+    {102 << 1, 0}, {103 << 1, 0},  //  83: 0000 0000 0011 1x
+    {0, 0x001f},   {0, 0x001e},    //  84: 0000 0000 0100 0x
+    {0, 0x001d},   {0, 0x001c},    //  85: 0000 0000 0100 1x
+    {0, 0x001b},   {0, 0x001a},    //  86: 0000 0000 0101 0x
+    {0, 0x0019},   {0, 0x0018},    //  87: 0000 0000 0101 1x
+    {0, 0x0017},   {0, 0x0016},    //  88: 0000 0000 0110 0x
+    {0, 0x0015},   {0, 0x0014},    //  89: 0000 0000 0110 1x
+    {0, 0x0013},   {0, 0x0012},    //  90: 0000 0000 0111 0x
+    {0, 0x0011},   {0, 0x0010},    //  91: 0000 0000 0111 1x
+    {104 << 1, 0}, {105 << 1, 0},  //  92: 0000 0000 0001 00x
+    {106 << 1, 0}, {107 << 1, 0},  //  93: 0000 0000 0001 01x
+    {108 << 1, 0}, {109 << 1, 0},  //  94: 0000 0000 0001 10x
+    {110 << 1, 0}, {111 << 1, 0},  //  95: 0000 0000 0001 11x
+    {0, 0x0028},   {0, 0x0027},    //  96: 0000 0000 0010 00x
+    {0, 0x0026},   {0, 0x0025},    //  97: 0000 0000 0010 01x
+    {0, 0x0024},   {0, 0x0023},    //  98: 0000 0000 0010 10x
+    {0, 0x0022},   {0, 0x0021},    //  99: 0000 0000 0010 11x
+    {0, 0x0020},   {0, 0x010e},    // 100: 0000 0000 0011 00x
+    {0, 0x010d},   {0, 0x010c},    // 101: 0000 0000 0011 01x
+    {0, 0x010b},   {0, 0x010a},    // 102: 0000 0000 0011 10x
+    {0, 0x0109},   {0, 0x0108},    // 103: 0000 0000 0011 11x
+    {0, 0x0112},   {0, 0x0111},    // 104: 0000 0000 0001 000x
+    {0, 0x0110},   {0, 0x010f},    // 105: 0000 0000 0001 001x
+    {0, 0x0603},   {0, 0x1002},    // 106: 0000 0000 0001 010x
+    {0, 0x0f02},   {0, 0x0e02},    // 107: 0000 0000 0001 011x
+    {0, 0x0d02},   {0, 0x0c02},    // 108: 0000 0000 0001 100x
+    {0, 0x0b02},   {0, 0x1f01},    // 109: 0000 0000 0001 101x
+    {0, 0x1e01},   {0, 0x1d01},    // 110: 0000 0000 0001 110x
+    {0, 0x1c01},   {0, 0x1b01},    // 111: 0000 0000 0001 111x
+};
+
+long plmpegdecode_latency_;
+
+static plm_vlc_t *PLM_VIDEO_MACROBLOCK_TYPE[4];
+static plm_vlc_t *PLM_VIDEO_DCT_SIZE[3];
+
+#define plm_clamp(n) MIN(255, MAX(0, n))
+
+void plm_video_destroy(plm_video_t *self) {
+  if (self->destroy_buffer_when_done) {
+    plm_buffer_destroy(self->buffer);
+  }
+  if (self->has_sequence_header) {
+    free(self->frames_data);
+  }
+  free(self);
+}
+
+double plm_video_get_pixel_aspect_ratio(plm_video_t *self) {
+  return self->pixel_aspect_ratio;
+}
+
+double plm_video_get_framerate(plm_video_t *self) {
+  return self->framerate;
+}
+
+int plm_video_get_width(plm_video_t *self) {
+  return self->width;
+}
+
+int plm_video_get_height(plm_video_t *self) {
+  return self->height;
+}
+
+void plm_video_set_no_delay(plm_video_t *self, int no_delay) {
+  self->assume_no_b_frames = no_delay;
+}
+
+double plm_video_get_time(plm_video_t *self) {
+  return self->time;
+}
+
+void plm_video_rewind(plm_video_t *self) {
+  plm_buffer_rewind(self->buffer);
+  self->time = 0;
+  self->frames_decoded = 0;
+  self->has_reference_frame = false;
+}
+
+void plm_video_init_frame(plm_video_t *self, plm_frame_t *frame,
+                          uint8_t *base) {
+  size_t plane_size = self->luma_width * self->luma_height;
+  frame->width = self->width;
+  frame->height = self->height;
+  frame->y.width = self->luma_width;
+  frame->y.height = self->luma_height;
+  frame->y.data = base;
+  frame->cr.width = self->chroma_width;
+  frame->cr.height = self->chroma_height;
+  frame->cr.data = base + plane_size;
+  frame->cb.width = self->chroma_width;
+  frame->cb.height = self->chroma_height;
+  frame->cb.data = base + plane_size * 2;
+}
+
+void plm_video_decode_sequence_header(plm_video_t *self) {
+  int previous_width = self->width;
+  int previous_height = self->height;
+
+  self->width = plm_buffer_read(self->buffer, 12);
+  self->height = plm_buffer_read(self->buffer, 12);
+
+  int pixel_aspect_ratio_code;
+  pixel_aspect_ratio_code = plm_buffer_read(self->buffer, 4);
+  pixel_aspect_ratio_code -= 1;
+  pixel_aspect_ratio_code = MAX(pixel_aspect_ratio_code, 0);
+  pixel_aspect_ratio_code =
+      MIN(pixel_aspect_ratio_code, ARRAYLEN(PLM_VIDEO_PIXEL_ASPECT_RATIO) - 1);
+  self->pixel_aspect_ratio =
+      PLM_VIDEO_PIXEL_ASPECT_RATIO[pixel_aspect_ratio_code];
+
+  int framerate_code;
+  framerate_code = plm_buffer_read(self->buffer, 4);
+  framerate_code -= 1;
+  framerate_code = MAX(framerate_code, 0);
+  framerate_code = MIN(framerate_code, ARRAYLEN(PLM_VIDEO_PICTURE_RATE) - 1);
+  self->framerate = PLM_VIDEO_PICTURE_RATE[framerate_code];
+
+  // skip bitRate, marker, bufferSize and constrained bit
+  plm_buffer_skip(self->buffer, 18 + 1 + 10 + 1);
+
+  if (plm_buffer_read(self->buffer, 1)) {  // load custom intra quant matrix?
+    for (int i = 0; i < 64; i++) {
+      int idx = PLM_VIDEO_ZIG_ZAG[i];
+      self->intra_quant_matrix[idx] = plm_buffer_read(self->buffer, 8);
+    }
+  } else {
+    memcpy(self->intra_quant_matrix, PLM_VIDEO_INTRAQUANT_MATRIX, 64);
+  }
+
+  if (plm_buffer_read(self->buffer,
+                      1)) {  // load custom non intra quant matrix?
+    for (int i = 0; i < 64; i++) {
+      int idx = PLM_VIDEO_ZIG_ZAG[i];
+      self->non_intra_quant_matrix[idx] = plm_buffer_read(self->buffer, 8);
+    }
+  } else {
+    memcpy(self->non_intra_quant_matrix, PLM_VIDEO_NONINTRAQUANT_MATRIX, 64);
+  }
+
+  if (self->has_sequence_header) {
+    if (self->width == previous_width && self->height == previous_height) {
+      // We already had a sequence header with the same width/height;
+      // nothing else to do here.
+      return;
+    }
+
+    // We had a sequence header but with different dimensions;
+    // delete the previous planes and allocate new.
+    free(self->frames_data);
+  }
+
+  self->mb_width = (self->width + 15) >> 4;
+  self->mb_height = (self->height + 15) >> 4;
+  self->mb_size = self->mb_width * self->mb_height;
+  self->luma_width = self->mb_width << 4;
+  self->luma_height = self->mb_height << 4;
+  self->chroma_width = self->mb_width << 3;
+  self->chroma_height = self->mb_height << 3;
+
+  size_t plane_size = self->luma_width * self->luma_height;
+  self->frames_data = memalign(64, plane_size * 9);
+  plm_video_init_frame(self, &self->frame_current,
+                       self->frames_data + plane_size * 0);
+  plm_video_init_frame(self, &self->frame_forward,
+                       self->frames_data + plane_size * 3);
+  plm_video_init_frame(self, &self->frame_backward,
+                       self->frames_data + plane_size * 6);
+
+  self->has_sequence_header = true;
+
+  INFOF("%s:\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15f;\n"
+        "\t%-20s = %15f;\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15d;\n"
+        "\t%-20s = %15d;",
+        "New MPEG Sequence", "width", self->width, "height", self->height,
+        "framerate", self->framerate, "pixel_aspect_ratio",
+        self->pixel_aspect_ratio, "mb_size", self->mb_size, "mb_width",
+        self->mb_width, "mb_height", self->mb_height, "luma_width",
+        self->luma_width, "luma_height", self->luma_height, "chroma_width",
+        self->chroma_width, "chroma_height", self->chroma_height);
+}
+
+static void plm_video_copy_macroblock(plm_video_t *self, int motion_h,
+                                      int motion_v, plm_frame_t *d) {
+  plm_frame_t *s = &self->frame_current;
+  plm_video_process_macroblock_16(self, s->y.data, d->y.data, motion_h,
+                                  motion_v, false);
+  plm_video_process_macroblock_8(self, s->cr.data, d->cr.data, motion_h / 2,
+                                 motion_v / 2, false);
+  plm_video_process_macroblock_8(self, s->cb.data, d->cb.data, motion_h / 2,
+                                 motion_v / 2, false);
+}
+
+static void plm_video_interpolate_macroblock(plm_video_t *self, int motion_h,
+                                             int motion_v, plm_frame_t *d) {
+  plm_frame_t *s = &self->frame_current;
+  plm_video_process_macroblock_16(self, s->y.data, d->y.data, motion_h,
+                                  motion_v, true);
+  plm_video_process_macroblock_8(self, s->cr.data, d->cr.data, motion_h / 2,
+                                 motion_v / 2, true);
+  plm_video_process_macroblock_8(self, s->cb.data, d->cb.data, motion_h / 2,
+                                 motion_v / 2, true);
+}
+
+static int plm_video_decode_motion_vector(plm_video_t *self, int r_size,
+                                          int motion) {
+  int fscale = 1u << r_size;
+  int m_code = plm_buffer_read_vlc(self->buffer, PLM_VIDEO_MOTION);
+  int r = 0;
+  int d;
+  if ((m_code != 0) && (fscale != 1)) {
+    r = plm_buffer_read(self->buffer, r_size);
+    d = ((abs(m_code) - 1) << r_size) + r + 1;
+    if (m_code < 0) {
+      d = -d;
+    }
+  } else {
+    d = m_code;
+  }
+  motion += d;
+  if (motion > (fscale << 4) - 1) {
+    motion -= fscale << 5;
+  } else if (motion < (int)(((unsigned)-fscale) << 4)) {
+    motion += fscale << 5;
+  }
+  return motion;
+}
+
+static void plm_video_decode_motion_vectors(plm_video_t *self) {
+  // Forward
+  if (self->motion_forward.is_set) {
+    int r_size = self->motion_forward.r_size;
+    self->motion_forward.h =
+        plm_video_decode_motion_vector(self, r_size, self->motion_forward.h);
+    self->motion_forward.v =
+        plm_video_decode_motion_vector(self, r_size, self->motion_forward.v);
+  } else if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE) {
+    // No motion information in P-picture, reset vectors
+    self->motion_forward.h = 0;
+    self->motion_forward.v = 0;
+  }
+  if (self->motion_backward.is_set) {
+    int r_size = self->motion_backward.r_size;
+    self->motion_backward.h =
+        plm_video_decode_motion_vector(self, r_size, self->motion_backward.h);
+    self->motion_backward.v =
+        plm_video_decode_motion_vector(self, r_size, self->motion_backward.v);
+  }
+}
+
+static void plm_video_predict_macroblock(plm_video_t *self) {
+  int fw_h = self->motion_forward.h;
+  int fw_v = self->motion_forward.v;
+  if (self->motion_forward.full_px) {
+    fw_h <<= 1;
+    fw_v <<= 1;
+  }
+  if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_B) {
+    int bw_h = self->motion_backward.h;
+    int bw_v = self->motion_backward.v;
+    if (self->motion_backward.full_px) {
+      bw_h <<= 1;
+      bw_v <<= 1;
+    }
+    if (self->motion_forward.is_set) {
+      plm_video_copy_macroblock(self, fw_h, fw_v, &self->frame_forward);
+      if (self->motion_backward.is_set) {
+        plm_video_interpolate_macroblock(self, bw_h, bw_v,
+                                         &self->frame_backward);
+      }
+    } else {
+      plm_video_copy_macroblock(self, bw_h, bw_v, &self->frame_backward);
+    }
+  } else {
+    plm_video_copy_macroblock(self, fw_h, fw_v, &self->frame_forward);
+  }
+}
+
+static void plm_video_decode_block(plm_video_t *self, int block) {
+  int n = 0;
+  uint8_t *quant_matrix;
+
+  // Decode DC coefficient of intra-coded blocks
+  if (self->macroblock_intra) {
+    int predictor;
+    int dct_size;
+
+    // DC prediction
+    int plane_index = block > 3 ? block - 3 : 0;
+    predictor = self->dc_predictor[plane_index];
+    dct_size =
+        plm_buffer_read_vlc(self->buffer, PLM_VIDEO_DCT_SIZE[plane_index]);
+
+    // Read DC coeff
+    if (dct_size > 0) {
+      int differential = plm_buffer_read(self->buffer, dct_size);
+      if ((differential & (1 << (dct_size - 1))) != 0) {
+        self->block_data[0] = predictor + differential;
+      } else {
+        self->block_data[0] =
+            predictor + ((-1u << dct_size) | (differential + 1));
+      }
+    } else {
+      self->block_data[0] = predictor;
+    }
+
+    // Save predictor value
+    self->dc_predictor[plane_index] = self->block_data[0];
+
+    // Dequantize + premultiply
+    self->block_data[0] <<= (3 + 5);
+
+    quant_matrix = self->intra_quant_matrix;
+    n = 1;
+  } else {
+    quant_matrix = self->non_intra_quant_matrix;
+  }
+
+  // Decode AC coefficients (+DC for non-intra)
+  int level = 0;
+  while (true) {
+    int run = 0;
+    uint16_t coeff =
+        plm_buffer_read_vlc_uint(self->buffer, PLM_VIDEO_DCT_COEFF);
+
+    if ((coeff == 0x0001) && (n > 0) &&
+        (plm_buffer_read(self->buffer, 1) == 0)) {
+      // end_of_block
+      break;
+    }
+    if (coeff == 0xffff) {
+      // escape
+      run = plm_buffer_read(self->buffer, 6);
+      level = plm_buffer_read(self->buffer, 8);
+      if (level == 0) {
+        level = plm_buffer_read(self->buffer, 8);
+      } else if (level == 128) {
+        level = plm_buffer_read(self->buffer, 8) - 256;
+      } else if (level > 128) {
+        level = level - 256;
+      }
+    } else {
+      run = coeff >> 8;
+      level = coeff & 0xff;
+      if (plm_buffer_read(self->buffer, 1)) {
+        level = -level;
+      }
+    }
+
+    n += run;
+    if (n < 0 || n >= 64) {
+      return;  // invalid
+    }
+
+    int de_zig_zagged = PLM_VIDEO_ZIG_ZAG[n];
+    n++;
+
+    // Dequantize, oddify, clip
+    level = (unsigned)level << 1;
+    if (!self->macroblock_intra) {
+      level += (level < 0 ? -1 : 1);
+    }
+    level = (level * self->quantizer_scale * quant_matrix[de_zig_zagged]) >> 4;
+    if ((level & 1) == 0) {
+      level -= level > 0 ? 1 : -1;
+    }
+    if (level > 2047) {
+      level = 2047;
+    } else if (level < -2048) {
+      level = -2048;
+    }
+
+    // Save premultiplied coefficient
+    self->block_data[de_zig_zagged] =
+        level * PLM_VIDEO_PREMULTIPLIER_MATRIX[de_zig_zagged];
+  }
+
+  // Move block to its place
+  uint8_t *d;
+  int dw;
+  int di;
+
+  if (block < 4) {
+    d = self->frame_current.y.data;
+    dw = self->luma_width;
+    di = (self->mb_row * self->luma_width + self->mb_col) << 4;
+    if ((block & 1) != 0) {
+      di += 8;
+    }
+    if ((block & 2) != 0) {
+      di += self->luma_width << 3;
+    }
+  } else {
+    d = (block == 4) ? self->frame_current.cb.data
+                     : self->frame_current.cr.data;
+    dw = self->chroma_width;
+    di = ((self->mb_row * self->luma_width) << 2) + (self->mb_col << 3);
+  }
+
+  int *s = self->block_data;
+  int si = 0;
+  if (self->macroblock_intra) {
+    // Overwrite (no prediction)
+    if (n == 1) {
+      int clamped = plm_clamp((s[0] + 128) >> 8);
+      PLM_BLOCK_SET(d, di, dw, si, 8, 8, clamped);
+      s[0] = 0;
+    } else {
+      plm_video_idct(s);
+      PLM_BLOCK_SET(d, di, dw, si, 8, 8, plm_clamp(s[si]));
+      memset(self->block_data, 0, sizeof(self->block_data));
+    }
+  } else {
+    // Add data to the predicted macroblock
+    if (n == 1) {
+      int value = (s[0] + 128) >> 8;
+      PLM_BLOCK_SET(d, di, dw, si, 8, 8, plm_clamp(d[di] + value));
+      s[0] = 0;
+    } else {
+      plm_video_idct(s);
+      PLM_BLOCK_SET(d, di, dw, si, 8, 8, plm_clamp(d[di] + s[si]));
+      memset(self->block_data, 0, sizeof(self->block_data));
+    }
+  }
+}
+
+static void plm_video_decode_macroblock(plm_video_t *self) {
+  // Decode self->macroblock_address_increment
+  int increment = 0;
+  int t =
+      plm_buffer_read_vlc(self->buffer, PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT);
+
+  while (t == 34) {
+    // macroblock_stuffing
+    t = plm_buffer_read_vlc(self->buffer,
+                            PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT);
+  }
+  while (t == 35) {
+    // macroblock_escape
+    increment += 33;
+    t = plm_buffer_read_vlc(self->buffer,
+                            PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT);
+  }
+  increment += t;
+
+  // Process any skipped macroblocks
+  if (self->slice_begin) {
+    // The first self->macroblock_address_increment of each slice is relative
+    // to beginning of the preverious row, not the preverious macroblock
+    self->slice_begin = false;
+    self->macroblock_address += increment;
+  } else {
+    if (self->macroblock_address + increment >= self->mb_size) {
+      return;  // invalid
+    }
+    if (increment > 1) {
+      // Skipped macroblocks reset DC predictors
+      self->dc_predictor[0] = 128;
+      self->dc_predictor[1] = 128;
+      self->dc_predictor[2] = 128;
+
+      // Skipped macroblocks in P-pictures reset motion vectors
+      if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE) {
+        self->motion_forward.h = 0;
+        self->motion_forward.v = 0;
+      }
+    }
+
+    // Predict skipped macroblocks
+    while (increment > 1) {
+      self->macroblock_address++;
+      self->mb_row = self->macroblock_address / self->mb_width;
+      self->mb_col = self->macroblock_address % self->mb_width;
+
+      plm_video_predict_macroblock(self);
+      increment--;
+    }
+    self->macroblock_address++;
+  }
+
+  self->mb_row = self->macroblock_address / self->mb_width;
+  self->mb_col = self->macroblock_address % self->mb_width;
+
+  if (self->mb_col >= self->mb_width || self->mb_row >= self->mb_height) {
+    return;  // corrupt stream;
+  }
+
+  // Process the current macroblock
+  // static const s16 *mbTable = MACROBLOCK_TYPE[self->picture_type];
+  // macroblock_type = read_huffman(self->bits, mbTable);
+
+  const plm_vlc_t *table = PLM_VIDEO_MACROBLOCK_TYPE[self->picture_type];
+  self->macroblock_type = plm_buffer_read_vlc(self->buffer, table);
+
+  self->macroblock_intra = (self->macroblock_type & 0x01);
+  self->motion_forward.is_set = (self->macroblock_type & 0x08);
+  self->motion_backward.is_set = (self->macroblock_type & 0x04);
+
+  // Quantizer scale
+  if ((self->macroblock_type & 0x10) != 0) {
+    self->quantizer_scale = plm_buffer_read(self->buffer, 5);
+  }
+
+  if (self->macroblock_intra) {
+    // Intra-coded macroblocks reset motion vectors
+    self->motion_backward.h = self->motion_forward.h = 0;
+    self->motion_backward.v = self->motion_forward.v = 0;
+  } else {
+    // Non-intra macroblocks reset DC predictors
+    self->dc_predictor[0] = 128;
+    self->dc_predictor[1] = 128;
+    self->dc_predictor[2] = 128;
+
+    plm_video_decode_motion_vectors(self);
+    plm_video_predict_macroblock(self);
+  }
+
+  // Decode blocks
+  int cbp =
+      ((self->macroblock_type & 0x02) != 0)
+          ? plm_buffer_read_vlc(self->buffer, PLM_VIDEO_CODE_BLOCK_PATTERN)
+          : (self->macroblock_intra ? 0x3f : 0);
+
+  for (int block = 0, mask = 0x20; block < 6; block++) {
+    if ((cbp & mask) != 0) {
+      plm_video_decode_block(self, block);
+    }
+    mask >>= 1;
+  }
+}
+
+static void plm_video_decode_slice(plm_video_t *self, int slice) {
+  self->slice_begin = true;
+  self->macroblock_address = (slice - 1) * self->mb_width - 1;
+  // Reset motion vectors and DC predictors
+  self->motion_backward.h = self->motion_forward.h = 0;
+  self->motion_backward.v = self->motion_forward.v = 0;
+  self->dc_predictor[0] = 128;
+  self->dc_predictor[1] = 128;
+  self->dc_predictor[2] = 128;
+  self->quantizer_scale = plm_buffer_read(self->buffer, 5);
+  // Skip extra
+  while (plm_buffer_read(self->buffer, 1)) {
+    plm_buffer_skip(self->buffer, 8);
+  }
+  do {
+    plm_video_decode_macroblock(self);
+  } while (self->macroblock_address < self->mb_size - 1 &&
+           plm_buffer_no_start_code(self->buffer));
+}
+
+static void plm_video_decode_picture(plm_video_t *self) {
+  plm_buffer_skip(self->buffer, 10);  // skip temporalReference
+  self->picture_type = plm_buffer_read(self->buffer, 3);
+  plm_buffer_skip(self->buffer, 16);  // skip vbv_delay
+
+  // D frames or unknown coding type
+  if (self->picture_type <= 0 ||
+      self->picture_type > PLM_VIDEO_PICTURE_TYPE_B) {
+    return;
+  }
+
+  // forward full_px, f_code
+  if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE ||
+      self->picture_type == PLM_VIDEO_PICTURE_TYPE_B) {
+    self->motion_forward.full_px = plm_buffer_read(self->buffer, 1);
+    int f_code = plm_buffer_read(self->buffer, 3);
+    if (f_code == 0) {
+      // Ignore picture with zero f_code
+      return;
+    }
+    self->motion_forward.r_size = f_code - 1;
+  }
+
+  // backward full_px, f_code
+  if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_B) {
+    self->motion_backward.full_px = plm_buffer_read(self->buffer, 1);
+    int f_code = plm_buffer_read(self->buffer, 3);
+    if (f_code == 0) {
+      // Ignore picture with zero f_code
+      return;
+    }
+    self->motion_backward.r_size = f_code - 1;
+  }
+
+  plm_frame_t frame_temp = self->frame_forward;
+
+  if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_INTRA ||
+      self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE) {
+    self->frame_forward = self->frame_backward;
+  }
+
+  // Skip extensions, user data
+  do {
+    self->start_code = plm_buffer_next_start_code(self->buffer);
+  } while (self->start_code == PLM_START_EXTENSION ||
+           self->start_code == PLM_START_USER_DATA);
+
+  while (self->start_code >= PLM_START_SLICE_FIRST &&
+         self->start_code <= PLM_START_SLICE_LAST) {
+    plm_video_decode_slice(self, self->start_code & 0x000000FF);
+    if (self->macroblock_address == self->mb_size - 1) {
+      break;
+    }
+    self->start_code = plm_buffer_next_start_code(self->buffer);
+  }
+
+  // If this is a reference picutre rotate the prediction pointers
+  if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_INTRA ||
+      self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE) {
+    self->frame_backward = self->frame_current;
+    self->frame_current = frame_temp;
+  }
+}
+
+static plm_frame_t *plm_video_decode_impl(plm_video_t *self) {
+  plm_frame_t *frame = NULL;
+  if (!self->has_sequence_header) {
+    self->start_code =
+        plm_buffer_find_start_code(self->buffer, PLM_START_SEQUENCE);
+    if (self->start_code == -1) {
+      return NULL;
+    }
+    plm_video_decode_sequence_header(self);
+  }
+  do {
+    if (self->start_code != PLM_START_PICTURE) {
+      self->start_code =
+          plm_buffer_find_start_code(self->buffer, PLM_START_PICTURE);
+    }
+    if (self->start_code == -1) {
+      return NULL;
+    }
+    plm_video_decode_picture(self);
+    if (self->assume_no_b_frames) {
+      frame = &self->frame_backward;
+    } else if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_B) {
+      frame = &self->frame_current;
+    } else if (self->has_reference_frame) {
+      frame = &self->frame_forward;
+    } else {
+      self->has_reference_frame = true;
+    }
+  } while (!frame);
+  frame->time = self->time;
+  self->frames_decoded++;
+  self->time = (double)self->frames_decoded / self->framerate;
+  return frame;
+}
+
+plm_frame_t *plm_video_decode(plm_video_t *self) {
+  plm_frame_t *res;
+  struct timespec tsc;
+  INFOF("plm_video_decode");
+  tsc = timespec_real();
+  res = plm_video_decode_impl(self);
+  plmpegdecode_latency_ = timespec_tomicros(timespec_sub(timespec_real(), tsc));
+  return res;
+}
+
+plm_video_t *plm_video_create_with_buffer(plm_buffer_t *buffer,
+                                          int destroy_when_done) {
+  plm_video_t *self = (plm_video_t *)memalign(64, sizeof(plm_video_t));
+  memset(self, 0, sizeof(plm_video_t));
+  self->buffer = buffer;
+  self->destroy_buffer_when_done = destroy_when_done;
+  self->start_code =
+      plm_buffer_find_start_code(self->buffer, PLM_START_SEQUENCE);
+  if (self->start_code != -1) {
+    plm_video_decode_sequence_header(self);
+  }
+  return self;
+}
+
+__attribute__((__constructor__)) static textstartup void plm_video_init(void) {
+  PLM_VIDEO_MACROBLOCK_TYPE[0] = NULL;
+  PLM_VIDEO_MACROBLOCK_TYPE[1] = (void *)PLM_VIDEO_MACROBLOCK_TYPE_INTRA;
+  PLM_VIDEO_MACROBLOCK_TYPE[2] = (void *)PLM_VIDEO_MACROBLOCK_TYPE_PREDICTIVE;
+  PLM_VIDEO_MACROBLOCK_TYPE[3] = (void *)PLM_VIDEO_MACROBLOCK_TYPE_B;
+  PLM_VIDEO_DCT_SIZE[0] = (void *)PLM_VIDEO_DCT_SIZE_LUMINANCE;
+  PLM_VIDEO_DCT_SIZE[1] = (void *)PLM_VIDEO_DCT_SIZE_CHROMINANCE;
+  PLM_VIDEO_DCT_SIZE[2] = (void *)PLM_VIDEO_DCT_SIZE_CHROMINANCE;
+}
diff --git a/dsp/mpeg/pl_mpeg.c b/dsp/mpeg/notice.c
similarity index 59%
rename from dsp/mpeg/pl_mpeg.c
rename to dsp/mpeg/notice.c
index e2f4a75d9..264a7549b 100644
--- a/dsp/mpeg/pl_mpeg.c
+++ b/dsp/mpeg/notice.c
@@ -2,8 +2,3 @@ __notice(pl_mpeg_notice, "\
 PL_MPEG (MIT License)\n\
 Copyright(c) 2019 Dominic Szablewski\n\
 https://phoboslab.org");
-
-long plmpegdecode_latency_;
-
-#define PL_MPEG_IMPLEMENTATION
-#include "pl_mpeg.h"
diff --git a/dsp/mpeg/pl_mpeg.h b/dsp/mpeg/pl_mpeg.h
deleted file mode 100755
index f81a8463a..000000000
--- a/dsp/mpeg/pl_mpeg.h
+++ /dev/null
@@ -1,4379 +0,0 @@
-/*
-PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer
-
-Dominic Szablewski - https://phoboslab.org
-
-
--- LICENSE: The MIT License(MIT)
-
-Copyright(c) 2019 Dominic Szablewski
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files(the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions :
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-
-
-
--- Synopsis
-
-// Define `PL_MPEG_IMPLEMENTATION` in *one* C/C++ file before including this
-// library to create the implementation.
-
-#define PL_MPEG_IMPLEMENTATION
-#𝑖𝑛𝑐𝑙𝑢𝑑𝑒 "plmpeg.h"
-
-// This function gets called for each decoded video frame
-void my_video_callback(plm_t *plm, plm_frame_t *frame, void *user) {
-	// Do something with frame->y.data, frame->cr.data, frame->cb.data
-}
-
-// This function gets called for each decoded audio frame
-void my_audio_callback(plm_t *plm, plm_samples_t *frame, void *user) {
-	// Do something with samples->interleaved
-}
-
-// Load a .mpg (MPEG Program Stream) file
-plm_t *plm = plm_create_with_filename("some-file.mpg");
-
-// Install the video & audio decode callbacks
-plm_set_video_decode_callback(plm, my_video_callback, my_data);
-plm_set_audio_decode_callback(plm, my_audio_callback, my_data);
-
-
-// Decode
-do {
-	plm_decode(plm, time_since_last_call);
-} while (!plm_has_ended(plm));
-
-// All done
-plm_destroy(plm);
-
-
-
--- Documentation
-
-This library provides several interfaces to load, demux and decode MPEG video
-and audio data. A high-level API combines the demuxer, video & audio decoders
-in an easy to use wrapper.
-
-Lower-level APIs for accessing the demuxer, video decoder and audio decoder, 
-as well as providing different data sources are also available.
-
-Interfaces are written in an object oriented style, meaning you create object 
-instances via various different constructor functions (plm_*create()),
-do some work on them and later dispose them via plm_*destroy().
-
-plm_* ......... the high-level interface, combining demuxer and decoders
-plm_buffer_* .. the data source used by all interfaces
-plm_demux_* ... the MPEG-PS demuxer
-plm_video_* ... the MPEG1 Video ("mpeg1") decoder
-plm_audio_* ... the MPEG1 Audio Layer II ("mp2") decoder
-
-
-With the high-level interface you have two options to decode video & audio:
-
- 1. Use plm_decode() and just hand over the delta time since the last call.
-    It will decode everything needed and call your callbacks (specified through
-    plm_set_{video|audio}_decode_callback()) any number of times.
-
- 2. Use plm_decode_video() and plm_decode_audio() to decode exactly one
-    frame of video or audio data at a time. How you handle the synchronization 
-    of both streams is up to you.
-
-If you only want to decode video *or* audio through these functions, you should
-disable the other stream (plm_set_{video|audio}_enabled(FALSE))
-
-Video data is decoded into a struct with all 3 planes (Y, Cr, Cb) stored in
-separate buffers. You can either convert this to RGB on the CPU (slow) via the
-plm_frame_to_rgb() function or do it on the GPU with the following matrix:
-
-mat4 bt601 = mat4(
-	1.16438,  0.00000,  1.59603, -0.87079,
-	1.16438, -0.39176, -0.81297,  0.52959,
-	1.16438,  2.01723,  0.00000, -1.08139,
-	0, 0, 0, 1
-);
-gl_FragColor = vec4(y, cb, cr, 1.0) * bt601;
-
-Audio data is decoded into a struct with either one single float array with the
-samples for the left and right channel interleaved, or if the 
-PLM_AUDIO_SEPARATE_CHANNELS is defined *before* including this library, into
-two separate float arrays - one for each channel.
-
-
-Data can be supplied to the high level interface, the demuxer and the decoders
-in three different ways:
-
- 1. Using plm_create_from_filename() or with a file handle with 
-    plm_create_from_file().
-
- 2. Using plm_create_with_memory() and supplying a pointer to memory that
-    contains the whole file.
-
- 3. Using plm_create_with_buffer(), supplying your own plm_buffer_t instance and
-    periodically writing to this buffer.
-
-When using your own plm_buffer_t instance, you can fill this buffer using 
-plm_buffer_write(). You can either monitor plm_buffer_get_remaining() and push 
-data when appropriate, or install a callback on the buffer with 
-plm_buffer_set_load_callback() that gets called whenever the buffer needs more 
-data.
-
-A buffer created with plm_buffer_create_with_capacity() is treated as a ring
-buffer, meaning that data that has already been read, will be discarded. In
-contrast, a buffer created with plm_buffer_create_for_appending() will keep all
-data written to it in memory. This enables seeking in the already loaded data.
-
-
-There should be no need to use the lower level plm_demux_*, plm_video_* and 
-plm_audio_* functions, if all you want to do is read/decode an MPEG-PS file.
-However, if you get raw mpeg1video data or raw mp2 audio data from a different
-source, these functions can be used to decode the raw data directly. Similarly, 
-if you only want to analyze an MPEG-PS file or extract raw video or audio
-packets from it, you can use the plm_demux_* functions.
-
-
-This library uses malloc(), realloc() and free() to manage memory. Typically 
-all allocation happens up-front when creating the interface. However, the
-default buffer size may be too small for certain inputs. In these cases plmpeg
-will realloc() the buffer with a larger size whenever needed. You can configure
-the default buffer size by defining PLM_BUFFER_DEFAULT_SIZE *before* 
-including this library.
-
-You can also define PLM_MALLOC, PLM_REALLOC and PLM_FREE to provide your own
-memory management functions.
-
-
-See below for detailed the API documentation.
-
-*/
-
-
-#ifndef PL_MPEG_H
-#define PL_MPEG_H
-
-#include <stdint.h>
-#include <stdio.h>
-#include <time.h>
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern long plmpegdecode_latency_; // [jart]
-
-// -----------------------------------------------------------------------------
-// Public Data Types
-
-
-// Object types for the various interfaces
-
-typedef struct plm_t plm_t;
-typedef struct plm_buffer_t plm_buffer_t;
-typedef struct plm_demux_t plm_demux_t;
-typedef struct plm_video_t plm_video_t;
-typedef struct plm_audio_t plm_audio_t;
-
-
-// Demuxed MPEG PS packet
-// The type maps directly to the various MPEG-PES start codes. PTS is the
-// presentation time stamp of the packet in seconds. Note that not all packets
-// have a PTS value, indicated by PLM_PACKET_INVALID_TS.
-
-#define PLM_PACKET_INVALID_TS -1
-
-typedef struct {
-	int type;
-	double pts;
-	size_t length;
-	uint8_t *data;
-} plm_packet_t;
-
-
-// Decoded Video Plane 
-// The byte length of the data is width * height. Note that different planes
-// have different sizes: the Luma plane (Y) is double the size of each of 
-// the two Chroma planes (Cr, Cb) - i.e. 4 times the byte length.
-// Also note that the size of the plane does *not* denote the size of the 
-// displayed frame. The sizes of planes are always rounded up to the nearest
-// macroblock (16px).
-
-typedef struct {
-	unsigned int width;
-	unsigned int height;
-	uint8_t *data;
-} plm_plane_t;
-
-
-// Decoded Video Frame
-// width and height denote the desired display size of the frame. This may be
-// different from the internal size of the 3 planes.
-
-typedef struct {
-	double time;
-	unsigned int width;
-	unsigned int height;
-	plm_plane_t y;
-	plm_plane_t cr;
-	plm_plane_t cb;
-} plm_frame_t;
-
-
-// Callback function type for decoded video frames used by the high-level
-// plm_* interface
-
-typedef void(*plm_video_decode_callback)
-	(plm_t *self, plm_frame_t *frame, void *user);
-
-
-// Decoded Audio Samples
-// Samples are stored as normalized (-1, 1) float either interleaved, or if
-// PLM_AUDIO_SEPARATE_CHANNELS is defined, in two separate arrays.
-// The `count` is always PLM_AUDIO_SAMPLES_PER_FRAME and just there for
-// convenience.
-
-#define PLM_AUDIO_SAMPLES_PER_FRAME 1152
-
-typedef struct {
-	double time;
-	unsigned int count;
-	#ifdef PLM_AUDIO_SEPARATE_CHANNELS
-		float left[PLM_AUDIO_SAMPLES_PER_FRAME];
-		float right[PLM_AUDIO_SAMPLES_PER_FRAME];
-	#else
-		float interleaved[PLM_AUDIO_SAMPLES_PER_FRAME * 2];
-	#endif
-} plm_samples_t;
-
-
-// Callback function type for decoded audio samples used by the high-level
-// plm_* interface
-
-typedef void(*plm_audio_decode_callback)
-	(plm_t *self, plm_samples_t *samples, void *user);
-
-
-// Callback function for plm_buffer when it needs more data
-
-typedef void(*plm_buffer_load_callback)(plm_buffer_t *self, void *user);
-
-
-
-// -----------------------------------------------------------------------------
-// plm_* public API
-// High-Level API for loading/demuxing/decoding MPEG-PS data
-
-
-// Create a plmpeg instance with a filename. Returns NULL if the file could not
-// be opened.
-
-plm_t *plm_create_with_filename(const char *filename);
-
-
-// Create a plmpeg instance with a file handle. Pass TRUE to close_when_done to
-// let plmpeg call fclose() on the handle when plm_destroy() is called.
-
-plm_t *plm_create_with_file(FILE *fh, int close_when_done);
-
-
-// Create a plmpeg instance with a pointer to memory as source. This assumes the
-// whole file is in memory. The memory is not copied. Pass TRUE to 
-// free_when_done to let plmpeg call free() on the pointer when plm_destroy() 
-// is called.
-
-plm_t *plm_create_with_memory(uint8_t *bytes, size_t length, int free_when_done);
-
-
-// Create a plmpeg instance with a plm_buffer as source. Pass TRUE to
-// destroy_when_done to let plmpeg call plm_buffer_destroy() on the buffer when
-// plm_destroy() is called.
-
-plm_t *plm_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done);
-
-
-// Destroy a plmpeg instance and free all data.
-
-void plm_destroy(plm_t *self);
-
-
-// Get whether we have headers on all available streams and we can report the 
-// number of video/audio streams, video dimensions, framerate and audio 
-// samplerate.
-// This returns FALSE if the file is not an MPEG-PS file or - when not using a
-// file as source - when not enough data is available yet.
-
-int plm_has_headers(plm_t *self);
-
-
-// Probe the MPEG-PS data to find the actual number of video and audio streams
-// within the buffer. For certain files (e.g. VideoCD) this can be more accurate
-// than just reading the number of streams from the headers.
-// This should only be used when the underlying plm_buffer is seekable, i.e. for 
-// files, fixed memory buffers or _for_appending buffers. If used with dynamic
-// memory buffers it will skip decoding the probesize!
-// The necessary probesize is dependent on the files you expect to read. Usually
-// a few hundred KB should be enough to find all streams.
-// Use plm_get_num_{audio|video}_streams() afterwards to get the number of 
-// streams in the file.
-// Returns TRUE if any streams were found within the probesize.
-
-int plm_probe(plm_t *self, size_t probesize);
-
-
-// Get or set whether video decoding is enabled. Default TRUE.
-
-int plm_get_video_enabled(plm_t *self);
-void plm_set_video_enabled(plm_t *self, int enabled);
-
-
-// Get the number of video streams (0--1) reported in the system header.
-
-int plm_get_num_video_streams(plm_t *self);
-
-
-// Get the display width/height of the video stream.
-
-int plm_get_width(plm_t *self);
-int plm_get_height(plm_t *self);
-double plm_get_pixel_aspect_ratio(plm_t *self); // [jart]
-
-
-// Get the framerate of the video stream in frames per second.
-
-double plm_get_framerate(plm_t *self);
-
-
-// Get or set whether audio decoding is enabled. Default TRUE.
-
-int plm_get_audio_enabled(plm_t *self);
-void plm_set_audio_enabled(plm_t *self, int enabled);
-
-
-// Get the number of audio streams (0--4) reported in the system header.
-
-int plm_get_num_audio_streams(plm_t *self);
-
-
-// Set the desired audio stream (0--3). Default 0.
-
-void plm_set_audio_stream(plm_t *self, int stream_index);
-
-
-// Get the samplerate of the audio stream in samples per second.
-
-int plm_get_samplerate(plm_t *self);
-
-
-// Get or set the audio lead time in seconds - the time in which audio samples
-// are decoded in advance (or behind) the video decode time. Typically this
-// should be set to the duration of the buffer of the audio API that you use
-// for output. E.g. for SDL2: (SDL_AudioSpec.samples / samplerate)
-
-double plm_get_audio_lead_time(plm_t *self);
-void plm_set_audio_lead_time(plm_t *self, double lead_time);
-
-
-// Get the current internal time in seconds.
-
-double plm_get_time(plm_t *self);
-
-
-// Get the video duration of the underlying source in seconds.
-
-double plm_get_duration(plm_t *self);
-
-
-// Rewind all buffers back to the beginning.
-
-void plm_rewind(plm_t *self);
-
-
-// Get or set looping. Default FALSE.
-
-int plm_get_loop(plm_t *self);
-void plm_set_loop(plm_t *self, int loop);
-
-
-// Get whether the file has ended. If looping is enabled, this will always
-// return FALSE.
-
-int plm_has_ended(plm_t *self);
-
-
-// Set the callback for decoded video frames used with plm_decode(). If no 
-// callback is set, video data will be ignored and not be decoded. The *user
-// Parameter will be passed to your callback.
-
-void plm_set_video_decode_callback(plm_t *self, plm_video_decode_callback fp, void *user);
-
-
-// Set the callback for decoded audio samples used with plm_decode(). If no 
-// callback is set, audio data will be ignored and not be decoded. The *user
-// Parameter will be passed to your callback.
-
-void plm_set_audio_decode_callback(plm_t *self, plm_audio_decode_callback fp, void *user);
-
-
-// Advance the internal timer by seconds and decode video/audio up to this time.
-// This will call the video_decode_callback and audio_decode_callback any number
-// of times. A frame-skip is not implemented, i.e. everything up to current time
-// will be decoded.
-
-void plm_decode(plm_t *self, double seconds);
-
-
-// Decode and return one video frame. Returns NULL if no frame could be decoded
-// (either because the source ended or data is corrupt). If you only want to 
-// decode video, you should disable audio via plm_set_audio_enabled().
-// The returned plm_frame_t is valid until the next call to plm_decode_video() 
-// or until plm_destroy() is called.
-
-plm_frame_t *plm_decode_video(plm_t *self);
-
-
-// Decode and return one audio frame. Returns NULL if no frame could be decoded
-// (either because the source ended or data is corrupt). If you only want to 
-// decode audio, you should disable video via plm_set_video_enabled().
-// The returned plm_samples_t is valid until the next call to plm_decode_audio()
-// or until plm_destroy() is called.
-
-plm_samples_t *plm_decode_audio(plm_t *self);
-
-
-// Seek to the specified time, clamped between 0 -- duration. This can only be 
-// used when the underlying plm_buffer is seekable, i.e. for files, fixed 
-// memory buffers or _for_appending buffers. 
-// If seek_exact is TRUE this will seek to the exact time, otherwise it will 
-// seek to the last intra frame just before the desired time. Exact seeking can 
-// be slow, because all frames up to the seeked one have to be decoded on top of
-// the previous intra frame.
-// If seeking succeeds, this function will call the video_decode_callback 
-// exactly once with the target frame. If audio is enabled, it will also call
-// the audio_decode_callback any number of times, until the audio_lead_time is
-// satisfied.
-// Returns TRUE if seeking succeeded or FALSE if no frame could be found.
-
-int plm_seek(plm_t *self, double time, int seek_exact);
-
-
-// Similar to plm_seek(), but will not call the video_decode_callback,
-// audio_decode_callback or make any attempts to sync audio.
-// Returns the found frame or NULL if no frame could be found.
-
-plm_frame_t *plm_seek_frame(plm_t *self, double time, int seek_exact);
-
-
-
-// -----------------------------------------------------------------------------
-// plm_buffer public API
-// Provides the data source for all other plm_* interfaces
-
-
-// The default size for buffers created from files or by the high-level API
-
-#ifndef PLM_BUFFER_DEFAULT_SIZE
-#define PLM_BUFFER_DEFAULT_SIZE (128 * 1024)
-#endif
-
-
-// Create a buffer instance with a filename. Returns NULL if the file could not
-// be opened.
-
-plm_buffer_t *plm_buffer_create_with_filename(const char *filename);
-
-
-// Create a buffer instance with a file handle. Pass TRUE to close_when_done
-// to let plmpeg call fclose() on the handle when plm_destroy() is called.
-
-plm_buffer_t *plm_buffer_create_with_file(FILE *fh, int close_when_done);
-
-
-// Create a buffer instance with a pointer to memory as source. This assumes
-// the whole file is in memory. The bytes are not copied. Pass 1 to 
-// free_when_done to let plmpeg call free() on the pointer when plm_destroy() 
-// is called.
-
-plm_buffer_t *plm_buffer_create_with_memory(uint8_t *bytes, size_t length, int free_when_done);
-
-
-// Create an empty buffer with an initial capacity. The buffer will grow
-// as needed. Data that has already been read, will be discarded.
-
-plm_buffer_t *plm_buffer_create_with_capacity(size_t capacity);
-
-
-// Create an empty buffer with an initial capacity. The buffer will grow
-// as needed. Decoded data will *not* be discarded. This can be used when
-// loading a file over the network, without needing to throttle the download. 
-// It also allows for seeking in the already loaded data.
-
-plm_buffer_t *plm_buffer_create_for_appending(size_t initial_capacity);
-
-
-// Destroy a buffer instance and free all data
-
-void plm_buffer_destroy(plm_buffer_t *self);
-
-
-// Copy data into the buffer. If the data to be written is larger than the 
-// available space, the buffer will realloc() with a larger capacity. 
-// Returns the number of bytes written. This will always be the same as the
-// passed in length, except when the buffer was created _with_memory() for
-// which _write() is forbidden.
-
-size_t plm_buffer_write(plm_buffer_t *self, uint8_t *bytes, size_t length);
-
-
-// Mark the current byte length as the end of this buffer and signal that no 
-// more data is expected to be written to it. This function should be called
-// just after the last plm_buffer_write().
-// For _with_capacity buffers, this is cleared on a plm_buffer_rewind().
-
-void plm_buffer_signal_end(plm_buffer_t *self);
-
-
-// Set a callback that is called whenever the buffer needs more data
-
-void plm_buffer_set_load_callback(plm_buffer_t *self, plm_buffer_load_callback fp, void *user);
-
-
-// Rewind the buffer back to the beginning. When loading from a file handle,
-// this also seeks to the beginning of the file.
-
-void plm_buffer_rewind(plm_buffer_t *self);
-
-
-// Get the total size. For files, this returns the file size. For all other 
-// types it returns the number of bytes currently in the buffer.
-
-size_t plm_buffer_get_size(plm_buffer_t *self);
-
-
-// Get the number of remaining (yet unread) bytes in the buffer. This can be
-// useful to throttle writing.
-
-size_t plm_buffer_get_remaining(plm_buffer_t *self);
-
-
-// Get whether the read position of the buffer is at the end and no more data 
-// is expected.
-
-int plm_buffer_has_ended(plm_buffer_t *self);
-
-
-
-// -----------------------------------------------------------------------------
-// plm_demux public API
-// Demux an MPEG Program Stream (PS) data into separate packages
-
-
-// Various Packet Types
-
-static const int PLM_DEMUX_PACKET_PRIVATE = 0xBD;
-static const int PLM_DEMUX_PACKET_AUDIO_1 = 0xC0;
-static const int PLM_DEMUX_PACKET_AUDIO_2 = 0xC1;
-static const int PLM_DEMUX_PACKET_AUDIO_3 = 0xC2;
-static const int PLM_DEMUX_PACKET_AUDIO_4 = 0xC3;
-static const int PLM_DEMUX_PACKET_VIDEO_1 = 0xE0;
-
-
-// Create a demuxer with a plm_buffer as source. This will also attempt to read
-// the pack and system headers from the buffer.
-
-plm_demux_t *plm_demux_create(plm_buffer_t *buffer, int destroy_when_done);
-
-
-// Destroy a demuxer and free all data.
-
-void plm_demux_destroy(plm_demux_t *self);
-
-
-// Returns TRUE/FALSE whether pack and system headers have been found. This will
-// attempt to read the headers if non are present yet.
-
-int plm_demux_has_headers(plm_demux_t *self);
-
-
-// Probe the file for the actual number of video/audio streams. See
-// plm_probe() for the details.
-
-int plm_demux_probe(plm_demux_t *self, size_t probesize);
-
-
-// Returns the number of video streams found in the system header. This will
-// attempt to read the system header if non is present yet.
-
-int plm_demux_get_num_video_streams(plm_demux_t *self);
-
-
-// Returns the number of audio streams found in the system header. This will
-// attempt to read the system header if non is present yet.
-
-int plm_demux_get_num_audio_streams(plm_demux_t *self);
-
-
-// Rewind the internal buffer. See plm_buffer_rewind().
-
-void plm_demux_rewind(plm_demux_t *self);
-
-
-// Get whether the file has ended. This will be cleared on seeking or rewind.
-
-int plm_demux_has_ended(plm_demux_t *self);
-
-
-// Seek to a packet of the specified type with a PTS just before specified time.
-// If force_intra is TRUE, only packets containing an intra frame will be 
-// considered - this only makes sense when the type is PLM_DEMUX_PACKET_VIDEO_1.
-// Note that the specified time is considered 0-based, regardless of the first 
-// PTS in the data source.
-
-plm_packet_t *plm_demux_seek(plm_demux_t *self, double time, int type, int force_intra);
-
-
-// Get the PTS of the first packet of this type. Returns PLM_PACKET_INVALID_TS
-// if not packet of this packet type can be found.
-
-double plm_demux_get_start_time(plm_demux_t *self, int type);
-
-
-// Get the duration for the specified packet type - i.e. the span between the
-// the first PTS and the last PTS in the data source. This only makes sense when
-// the underlying data source is a file or fixed memory.
-
-double plm_demux_get_duration(plm_demux_t *self, int type);
-
-
-// Decode and return the next packet. The returned packet_t is valid until
-// the next call to plm_demux_decode() or until the demuxer is destroyed.
-
-plm_packet_t *plm_demux_decode(plm_demux_t *self);
-
-
-
-// -----------------------------------------------------------------------------
-// plm_video public API
-// Decode MPEG1 Video ("mpeg1") data into raw YCrCb frames
-
-
-// Create a video decoder with a plm_buffer as source.
-
-plm_video_t *plm_video_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done);
-
-
-// Destroy a video decoder and free all data.
-
-void plm_video_destroy(plm_video_t *self);
-
-
-// Get whether a sequence header was found and we can accurately report on
-// dimensions and framerate.
-
-int plm_video_has_header(plm_video_t *self);
-
-
-// Get the framerate in frames per second.
-
-double plm_video_get_framerate(plm_video_t *self);
-double plm_video_get_pixel_aspect_ratio(plm_video_t *self); // [jart]
-
-
-// Get the display width/height.
-
-int plm_video_get_width(plm_video_t *self);
-int plm_video_get_height(plm_video_t *self);
-
-
-// Set "no delay" mode. When enabled, the decoder assumes that the video does
-// *not* contain any B-Frames. This is useful for reducing lag when streaming.
-// The default is FALSE.
-
-void plm_video_set_no_delay(plm_video_t *self, int no_delay);
-
-
-// Get the current internal time in seconds.
-
-double plm_video_get_time(plm_video_t *self);
-
-
-// Set the current internal time in seconds. This is only useful when you
-// manipulate the underlying video buffer and want to enforce a correct
-// timestamps.
-
-void plm_video_set_time(plm_video_t *self, double time);
-
-
-// Rewind the internal buffer. See plm_buffer_rewind().
-
-void plm_video_rewind(plm_video_t *self);
-
-
-// Get whether the file has ended. This will be cleared on rewind.
-
-int plm_video_has_ended(plm_video_t *self);
-
-
-// Decode and return one frame of video and advance the internal time by 
-// 1/framerate seconds. The returned frame_t is valid until the next call of
-// plm_video_decode() or until the video decoder is destroyed.
-
-plm_frame_t *plm_video_decode(plm_video_t *self);
-
-
-// Convert the YCrCb data of a frame into interleaved R G B data. The stride
-// specifies the width in bytes of the destination buffer. I.e. the number of
-// bytes from one line to the next. The stride must be at least 
-// (frame->width * bytes_per_pixel). The buffer pointed to by *dest must have a
-// size of at least (stride * frame->height).
-// Note that the alpha component of the dest buffer is always left untouched.
-
-void plm_frame_to_rgb(plm_frame_t *frame, uint8_t *dest, int stride);
-void plm_frame_to_bgr(plm_frame_t *frame, uint8_t *dest, int stride);
-void plm_frame_to_rgba(plm_frame_t *frame, uint8_t *dest, int stride);
-void plm_frame_to_bgra(plm_frame_t *frame, uint8_t *dest, int stride);
-void plm_frame_to_argb(plm_frame_t *frame, uint8_t *dest, int stride);
-void plm_frame_to_abgr(plm_frame_t *frame, uint8_t *dest, int stride);
-
-
-// -----------------------------------------------------------------------------
-// plm_audio public API
-// Decode MPEG-1 Audio Layer II ("mp2") data into raw samples
-
-
-// Create an audio decoder with a plm_buffer as source.
-
-plm_audio_t *plm_audio_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done);
-
-
-// Destroy an audio decoder and free all data.
-
-void plm_audio_destroy(plm_audio_t *self);
-
-
-// Get whether a frame header was found and we can accurately report on
-// samplerate.
-
-int plm_audio_has_header(plm_audio_t *self);
-
-
-// Get the samplerate in samples per second.
-
-int plm_audio_get_samplerate(plm_audio_t *self);
-
-
-// Get the current internal time in seconds.
-
-double plm_audio_get_time(plm_audio_t *self);
-
-
-// Set the current internal time in seconds. This is only useful when you
-// manipulate the underlying video buffer and want to enforce a correct
-// timestamps.
-
-void plm_audio_set_time(plm_audio_t *self, double time);
-
-
-// Rewind the internal buffer. See plm_buffer_rewind().
-
-void plm_audio_rewind(plm_audio_t *self);
-
-
-// Get whether the file has ended. This will be cleared on rewind.
-
-int plm_audio_has_ended(plm_audio_t *self);
-
-
-// Decode and return one "frame" of audio and advance the internal time by 
-// (PLM_AUDIO_SAMPLES_PER_FRAME/samplerate) seconds. The returned samples_t 
-// is valid until the next call of plm_audio_decode() or until the audio
-// decoder is destroyed.
-
-plm_samples_t *plm_audio_decode(plm_audio_t *self);
-
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // PL_MPEG_H
-
-
-
-
-
-// -----------------------------------------------------------------------------
-// -----------------------------------------------------------------------------
-// IMPLEMENTATION
-
-#ifdef PL_MPEG_IMPLEMENTATION
-
-#include <string.h>
-#include <stdlib.h>
-
-#ifndef TRUE
-#define TRUE 1
-#define FALSE 0
-#endif
-
-#ifndef PLM_MALLOC
-	#define PLM_MALLOC(sz) malloc(sz)
-	#define PLM_FREE(p) free(p)
-	#define PLM_REALLOC(p, sz) realloc(p, sz)
-#endif
-
-#define PLM_UNUSED(expr) (void)(expr)
-
-
-// -----------------------------------------------------------------------------
-// plm (high-level interface) implementation
-
-struct plm_t {
-	plm_demux_t *demux;
-	double time;
-	int has_ended;
-	int loop;
-	int has_decoders;
-
-	int video_enabled;
-	int video_packet_type;
-	plm_buffer_t *video_buffer;
-	plm_video_t *video_decoder;
-
-	int audio_enabled;
-	int audio_stream_index;
-	int audio_packet_type;
-	double audio_lead_time;
-	plm_buffer_t *audio_buffer;
-	plm_audio_t *audio_decoder;
-
-	plm_video_decode_callback video_decode_callback;
-	void *video_decode_callback_user_data;
-
-	plm_audio_decode_callback audio_decode_callback;
-	void *audio_decode_callback_user_data;
-};
-
-int plm_init_decoders(plm_t *self);
-void plm_handle_end(plm_t *self);
-void plm_read_video_packet(plm_buffer_t *buffer, void *user);
-void plm_read_audio_packet(plm_buffer_t *buffer, void *user);
-void plm_read_packets(plm_t *self, int requested_type);
-
-plm_t *plm_create_with_filename(const char *filename) {
-	plm_buffer_t *buffer = plm_buffer_create_with_filename(filename);
-	if (!buffer) {
-		return NULL;
-	}
-	return plm_create_with_buffer(buffer, TRUE);
-}
-
-plm_t *plm_create_with_file(FILE *fh, int close_when_done) {
-	plm_buffer_t *buffer = plm_buffer_create_with_file(fh, close_when_done);
-	return plm_create_with_buffer(buffer, TRUE);
-}
-
-plm_t *plm_create_with_memory(uint8_t *bytes, size_t length, int free_when_done) {
-	plm_buffer_t *buffer = plm_buffer_create_with_memory(bytes, length, free_when_done);
-	return plm_create_with_buffer(buffer, TRUE);
-}
-
-plm_t *plm_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done) {
-	plm_t *self = (plm_t *)PLM_MALLOC(sizeof(plm_t));
-	memset(self, 0, sizeof(plm_t));
-
-	self->demux = plm_demux_create(buffer, destroy_when_done);
-	self->video_enabled = TRUE;
-	self->audio_enabled = TRUE;
-	plm_init_decoders(self);
-
-	return self;
-}
-
-int plm_init_decoders(plm_t *self) {
-	if (self->has_decoders) {
-		return TRUE;
-	}
-
-	if (!plm_demux_has_headers(self->demux)) {
-		return FALSE;
-	}
-
-	if (plm_demux_get_num_video_streams(self->demux) > 0) {
-		if (self->video_enabled) {
-			self->video_packet_type = PLM_DEMUX_PACKET_VIDEO_1;
-		}
-		if (!self->video_decoder) {
-			self->video_buffer = plm_buffer_create_with_capacity(PLM_BUFFER_DEFAULT_SIZE);
-			plm_buffer_set_load_callback(self->video_buffer, plm_read_video_packet, self);
-			self->video_decoder = plm_video_create_with_buffer(self->video_buffer, TRUE);
-		}
-	}
-
-	if (plm_demux_get_num_audio_streams(self->demux) > 0) {
-		if (self->audio_enabled) {
-			self->audio_packet_type = PLM_DEMUX_PACKET_AUDIO_1 + self->audio_stream_index;
-		}
-		if (!self->audio_decoder) {
-			self->audio_buffer = plm_buffer_create_with_capacity(PLM_BUFFER_DEFAULT_SIZE);
-			plm_buffer_set_load_callback(self->audio_buffer, plm_read_audio_packet, self);
-			self->audio_decoder = plm_audio_create_with_buffer(self->audio_buffer, TRUE);
-		}
-	}
-
-	self->has_decoders = TRUE;
-	return TRUE;
-}
-
-void plm_destroy(plm_t *self) {
-	if (self->video_decoder) {
-		plm_video_destroy(self->video_decoder);
-	}
-	if (self->audio_decoder) {
-		plm_audio_destroy(self->audio_decoder);
-	}
-
-	plm_demux_destroy(self->demux);
-	PLM_FREE(self);
-}
-
-int plm_get_audio_enabled(plm_t *self) {
-	return self->audio_enabled;
-}
-
-int plm_has_headers(plm_t *self) {
-	if (!plm_demux_has_headers(self->demux)) {
-		return FALSE;
-	}
-	
-	if (!plm_init_decoders(self)) {
-		return FALSE;
-	}
-
-	if (
-		(self->video_decoder && !plm_video_has_header(self->video_decoder)) ||
-		(self->audio_decoder && !plm_audio_has_header(self->audio_decoder))
-	) {
-		return FALSE;
-	}
-
-	return TRUE;
-}
-
-int plm_probe(plm_t *self, size_t probesize) {
-	int found_streams = plm_demux_probe(self->demux, probesize);
-	if (!found_streams) {
-		return FALSE;
-	}
-
-	// Re-init decoders
-	self->has_decoders = FALSE;
-	self->video_packet_type = 0;
-	self->audio_packet_type = 0;
-	return plm_init_decoders(self);
-}
-
-void plm_set_audio_enabled(plm_t *self, int enabled) {
-	self->audio_enabled = enabled;
-
-	if (!enabled) {
-		self->audio_packet_type = 0;
-		return;
-	}
-
-	self->audio_packet_type = (plm_init_decoders(self) && self->audio_decoder)
-		? PLM_DEMUX_PACKET_AUDIO_1 + self->audio_stream_index
-		: 0;
-}
-
-void plm_set_audio_stream(plm_t *self, int stream_index) {
-	if (stream_index < 0 || stream_index > 3) {
-		return;
-	}
-	self->audio_stream_index = stream_index;
-
-	// Set the correct audio_packet_type
-	plm_set_audio_enabled(self, self->audio_enabled);
-}
-
-int plm_get_video_enabled(plm_t *self) {
-	return self->video_enabled;
-}
-
-void plm_set_video_enabled(plm_t *self, int enabled) {
-	self->video_enabled = enabled;
-
-	if (!enabled) {
-		self->video_packet_type = 0;
-		return;
-	}
-
-	self->video_packet_type = (plm_init_decoders(self) && self->video_decoder)
-		? PLM_DEMUX_PACKET_VIDEO_1
-		: 0;
-}
-
-int plm_get_num_video_streams(plm_t *self) {
-	return plm_demux_get_num_video_streams(self->demux);
-}
-
-int plm_get_width(plm_t *self) {
-	return (plm_init_decoders(self) && self->video_decoder)
-		? plm_video_get_width(self->video_decoder)
-		: 0;
-}
-
-int plm_get_height(plm_t *self) {
-	return (plm_init_decoders(self) && self->video_decoder)
-		? plm_video_get_height(self->video_decoder)
-		: 0;
-}
-
-double plm_get_framerate(plm_t *self) {
-	return (plm_init_decoders(self) && self->video_decoder)
-		? plm_video_get_framerate(self->video_decoder)
-		: 0;
-}
-
-double plm_get_pixel_aspect_ratio(plm_t *self) { // [jart]
-	return (plm_init_decoders(self) && self->video_decoder)
-		? plm_video_get_pixel_aspect_ratio(self->video_decoder)
-		: 0;
-}
-
-int plm_get_num_audio_streams(plm_t *self) {
-	return plm_demux_get_num_audio_streams(self->demux);
-}
-
-int plm_get_samplerate(plm_t *self) {
-	return (plm_init_decoders(self) && self->audio_decoder)
-		? plm_audio_get_samplerate(self->audio_decoder)
-		: 0;
-}
-
-double plm_get_audio_lead_time(plm_t *self) {
-	return self->audio_lead_time;
-}
-
-void plm_set_audio_lead_time(plm_t *self, double lead_time) {
-	self->audio_lead_time = lead_time;
-}
-
-double plm_get_time(plm_t *self) {
-	return self->time;
-}
-
-double plm_get_duration(plm_t *self) {
-	return plm_demux_get_duration(self->demux, PLM_DEMUX_PACKET_VIDEO_1);
-}
-
-void plm_rewind(plm_t *self) {
-	if (self->video_decoder) {
-		plm_video_rewind(self->video_decoder);
-	}
-
-	if (self->audio_decoder) {
-		plm_audio_rewind(self->audio_decoder);
-	}
-
-	plm_demux_rewind(self->demux);
-	self->time = 0;
-}
-
-int plm_get_loop(plm_t *self) {
-	return self->loop;
-}
-
-void plm_set_loop(plm_t *self, int loop) {
-	self->loop = loop;
-}
-
-int plm_has_ended(plm_t *self) {
-	return self->has_ended;
-}
-
-void plm_set_video_decode_callback(plm_t *self, plm_video_decode_callback fp, void *user) {
-	self->video_decode_callback = fp;
-	self->video_decode_callback_user_data = user;
-}
-
-void plm_set_audio_decode_callback(plm_t *self, plm_audio_decode_callback fp, void *user) {
-	self->audio_decode_callback = fp;
-	self->audio_decode_callback_user_data = user;
-}
-
-void plm_decode(plm_t *self, double tick) {
-	if (!plm_init_decoders(self)) {
-		return;
-	}
-
-	int decode_video = (self->video_decode_callback && self->video_packet_type);
-	int decode_audio = (self->audio_decode_callback && self->audio_packet_type);
-
-	if (!decode_video && !decode_audio) {
-		// Nothing to do here
-		return;
-	}
-
-	int did_decode = FALSE;
-	int decode_video_failed = FALSE;
-	int decode_audio_failed = FALSE;
-
-	double video_target_time = self->time + tick;
-	double audio_target_time = self->time + tick + self->audio_lead_time;
-
-	do {
-		did_decode = FALSE;
-		
-		if (decode_video && plm_video_get_time(self->video_decoder) < video_target_time) {
-			plm_frame_t *frame = plm_video_decode(self->video_decoder);
-			if (frame) {
-				self->video_decode_callback(self, frame, self->video_decode_callback_user_data);
-				did_decode = TRUE;
-			}
-			else {
-				decode_video_failed = TRUE;
-			}
-		}
-
-		if (decode_audio && plm_audio_get_time(self->audio_decoder) < audio_target_time) {
-			plm_samples_t *samples = plm_audio_decode(self->audio_decoder);
-			if (samples) {
-				self->audio_decode_callback(self, samples, self->audio_decode_callback_user_data);
-				did_decode = TRUE;
-			}
-			else {
-				decode_audio_failed = TRUE;
-			}
-		}
-	} while (did_decode);
-	
-	// Did all sources we wanted to decode fail and the demuxer is at the end?
-	if (
-		(!decode_video || decode_video_failed) && 
-		(!decode_audio || decode_audio_failed) &&
-		plm_demux_has_ended(self->demux)
-	) {
-		plm_handle_end(self);
-		return;
-	}
-
-	self->time += tick;
-}
-
-plm_frame_t *plm_decode_video(plm_t *self) {
-	if (!plm_init_decoders(self)) {
-		return NULL;
-	}
-
-	if (!self->video_packet_type) {
-		return NULL;
-	}
-
-	plm_frame_t *frame = plm_video_decode(self->video_decoder);
-	if (frame) {
-		self->time = frame->time;
-	}
-	else if (plm_demux_has_ended(self->demux)) {
-		plm_handle_end(self);
-	}
-	return frame;
-}
-
-plm_samples_t *plm_decode_audio(plm_t *self) {
-	if (!plm_init_decoders(self)) {
-		return NULL;
-	}
-
-	if (!self->audio_packet_type) {
-		return NULL;
-	}
-
-	plm_samples_t *samples = plm_audio_decode(self->audio_decoder);
-	if (samples) {
-		self->time = samples->time;
-	}
-	else if (plm_demux_has_ended(self->demux)) {
-		plm_handle_end(self);
-	}
-	return samples;
-}
-
-void plm_handle_end(plm_t *self) {
-	if (self->loop) {
-		plm_rewind(self);
-	}
-	else {
-		self->has_ended = TRUE;
-	}
-}
-
-void plm_read_video_packet(plm_buffer_t *buffer, void *user) {
-	PLM_UNUSED(buffer);
-	plm_t *self = (plm_t *)user;
-	plm_read_packets(self, self->video_packet_type);
-}
-
-void plm_read_audio_packet(plm_buffer_t *buffer, void *user) {
-	PLM_UNUSED(buffer);
-	plm_t *self = (plm_t *)user;
-	plm_read_packets(self, self->audio_packet_type);
-}
-
-void plm_read_packets(plm_t *self, int requested_type) {
-	plm_packet_t *packet;
-	while ((packet = plm_demux_decode(self->demux))) {
-		if (packet->type == self->video_packet_type) {
-			plm_buffer_write(self->video_buffer, packet->data, packet->length);
-		}
-		else if (packet->type == self->audio_packet_type) {
-			plm_buffer_write(self->audio_buffer, packet->data, packet->length);
-		}
-
-		if (packet->type == requested_type) {
-			return;
-		}
-	}
-
-	if (plm_demux_has_ended(self->demux)) {
-		if (self->video_buffer) {
-			plm_buffer_signal_end(self->video_buffer);
-		}
-		if (self->audio_buffer) {
-			plm_buffer_signal_end(self->audio_buffer);
-		}
-	}
-}
-
-plm_frame_t *plm_seek_frame(plm_t *self, double time, int seek_exact) {
-	if (!plm_init_decoders(self)) {
-		return NULL;
-	}
-
-	if (!self->video_packet_type) {
-		return NULL;
-	}
-
-	int type = self->video_packet_type;
-
-	double start_time = plm_demux_get_start_time(self->demux, type);
-	double duration = plm_demux_get_duration(self->demux, type);
-
-	if (time < 0) {
-		time = 0;
-	}
-	else if (time > duration) {
-		time = duration;
-	}
-	
-	plm_packet_t *packet = plm_demux_seek(self->demux, time, type, TRUE);
-	if (!packet) {
-		return NULL;
-	}
-
-	// Disable writing to the audio buffer while decoding video
-	int previous_audio_packet_type = self->audio_packet_type;
-	self->audio_packet_type = 0;
-
-	// Clear video buffer and decode the found packet
-	plm_video_rewind(self->video_decoder);
-	plm_video_set_time(self->video_decoder, packet->pts - start_time);
-	plm_buffer_write(self->video_buffer, packet->data, packet->length);
-	plm_frame_t *frame = plm_video_decode(self->video_decoder);	
-
-	// If we want to seek to an exact frame, we have to decode all frames
-	// on top of the intra frame we just jumped to.
-	if (seek_exact) {
-		while (frame && frame->time < time) {
-			frame = plm_video_decode(self->video_decoder);
-		}
-	}
-
-	// Enable writing to the audio buffer again?
-	self->audio_packet_type = previous_audio_packet_type;
-
-	if (frame) {
-		self->time = frame->time;
-	}
-
-	self->has_ended = FALSE;
-	return frame;
-}
-
-int plm_seek(plm_t *self, double time, int seek_exact) {
-	plm_frame_t *frame = plm_seek_frame(self, time, seek_exact);
-	
-	if (!frame) {
-		return FALSE;
-	}
-
-	if (self->video_decode_callback) {
-		self->video_decode_callback(self, frame, self->video_decode_callback_user_data);	
-	}
-
-	// If audio is not enabled we are done here.
-	if (!self->audio_packet_type) {
-		return TRUE;
-	}
-
-	// Sync up Audio. This demuxes more packets until the first audio packet
-	// with a PTS greater than the current time is found. plm_decode() is then
-	// called to decode enough audio data to satisfy the audio_lead_time.
-
-	double start_time = plm_demux_get_start_time(self->demux, self->video_packet_type);
-	plm_audio_rewind(self->audio_decoder);
-
-	plm_packet_t *packet = NULL;
-	while ((packet = plm_demux_decode(self->demux))) {
-		if (packet->type == self->video_packet_type) {
-			plm_buffer_write(self->video_buffer, packet->data, packet->length);
-		}
-		else if (
-			packet->type == self->audio_packet_type &&
-			packet->pts - start_time > self->time
-		) {
-			plm_audio_set_time(self->audio_decoder, packet->pts - start_time);
-			plm_buffer_write(self->audio_buffer, packet->data, packet->length);
-			plm_decode(self, 0);
-			break;
-		}
-	}	
-	
-	return TRUE;
-}
-
-
-
-// -----------------------------------------------------------------------------
-// plm_buffer implementation
-
-enum plm_buffer_mode {
-	PLM_BUFFER_MODE_FILE,
-	PLM_BUFFER_MODE_FIXED_MEM,
-	PLM_BUFFER_MODE_RING,
-	PLM_BUFFER_MODE_APPEND
-};
-
-struct plm_buffer_t {
-	size_t bit_index;
-	size_t capacity;
-	size_t length;
-	size_t total_size;
-	int discard_read_bytes;
-	int has_ended;
-	int free_when_done;
-	int close_when_done;
-	FILE *fh;
-	plm_buffer_load_callback load_callback;
-	void *load_callback_user_data;
-	uint8_t *bytes;
-	enum plm_buffer_mode mode;
-};
-
-typedef struct {
-	int16_t index;
-	int16_t value;
-} plm_vlc_t;
-
-typedef struct {
-	int16_t index;
-	uint16_t value;
-} plm_vlc_uint_t;
-
-
-void plm_buffer_seek(plm_buffer_t *self, size_t pos);
-size_t plm_buffer_tell(plm_buffer_t *self);
-void plm_buffer_discard_read_bytes(plm_buffer_t *self);
-void plm_buffer_load_file_callback(plm_buffer_t *self, void *user);
-
-int plm_buffer_has(plm_buffer_t *self, size_t count);
-int plm_buffer_read(plm_buffer_t *self, int count);
-void plm_buffer_align(plm_buffer_t *self);
-void plm_buffer_skip(plm_buffer_t *self, size_t count);
-int plm_buffer_skip_bytes(plm_buffer_t *self, uint8_t v);
-int plm_buffer_next_start_code(plm_buffer_t *self);
-int plm_buffer_find_start_code(plm_buffer_t *self, int code);
-int plm_buffer_no_start_code(plm_buffer_t *self);
-int16_t plm_buffer_read_vlc(plm_buffer_t *self, const plm_vlc_t *table);
-uint16_t plm_buffer_read_vlc_uint(plm_buffer_t *self, const plm_vlc_uint_t *table);
-
-plm_buffer_t *plm_buffer_create_with_filename(const char *filename) {
-	FILE *fh = fopen(filename, "rb");
-	if (!fh) {
-		return NULL;
-	}
-	return plm_buffer_create_with_file(fh, TRUE);
-}
-
-plm_buffer_t *plm_buffer_create_with_file(FILE *fh, int close_when_done) {
-	plm_buffer_t *self = plm_buffer_create_with_capacity(PLM_BUFFER_DEFAULT_SIZE);
-	self->fh = fh;
-	self->close_when_done = close_when_done;
-	self->mode = PLM_BUFFER_MODE_FILE;
-	self->discard_read_bytes = TRUE;
-	
-	fseek(self->fh, 0, SEEK_END);
-	self->total_size = ftell(self->fh);
-	fseek(self->fh, 0, SEEK_SET);
-
-	plm_buffer_set_load_callback(self, plm_buffer_load_file_callback, NULL);
-	return self;
-}
-
-plm_buffer_t *plm_buffer_create_with_memory(uint8_t *bytes, size_t length, int free_when_done) {
-	plm_buffer_t *self = (plm_buffer_t *)PLM_MALLOC(sizeof(plm_buffer_t));
-	memset(self, 0, sizeof(plm_buffer_t));
-	self->capacity = length;
-	self->length = length;
-	self->total_size = length;
-	self->free_when_done = free_when_done;
-	self->bytes = bytes;
-	self->mode = PLM_BUFFER_MODE_FIXED_MEM;
-	self->discard_read_bytes = FALSE;
-	return self;
-}
-
-plm_buffer_t *plm_buffer_create_with_capacity(size_t capacity) {
-	plm_buffer_t *self = (plm_buffer_t *)PLM_MALLOC(sizeof(plm_buffer_t));
-	memset(self, 0, sizeof(plm_buffer_t));
-	self->capacity = capacity;
-	self->free_when_done = TRUE;
-	self->bytes = (uint8_t *)PLM_MALLOC(capacity);
-	self->mode = PLM_BUFFER_MODE_RING;
-	self->discard_read_bytes = TRUE;
-	return self;
-}
-
-plm_buffer_t *plm_buffer_create_for_appending(size_t initial_capacity) {
-	plm_buffer_t *self = plm_buffer_create_with_capacity(initial_capacity);
-	self->mode = PLM_BUFFER_MODE_APPEND;
-	self->discard_read_bytes = FALSE;
-	return self;
-}
-
-void plm_buffer_destroy(plm_buffer_t *self) {
-	if (self->fh && self->close_when_done) {
-		fclose(self->fh);
-	}
-	if (self->free_when_done) {
-		PLM_FREE(self->bytes);
-	}
-	PLM_FREE(self);
-}
-
-size_t plm_buffer_get_size(plm_buffer_t *self) {
-	return (self->mode == PLM_BUFFER_MODE_FILE)
-		? self->total_size
-		: self->length;
-}
-
-size_t plm_buffer_get_remaining(plm_buffer_t *self) {
-	return self->length - (self->bit_index >> 3);
-}
-
-size_t plm_buffer_write(plm_buffer_t *self, uint8_t *bytes, size_t length) {
-	if (self->mode == PLM_BUFFER_MODE_FIXED_MEM) {
-		return 0;
-	}
-
-	if (self->discard_read_bytes) {
-		// This should be a ring buffer, but instead it just shifts all unread 
-		// data to the beginning of the buffer and appends new data at the end. 
-		// Seems to be good enough.
-
-		plm_buffer_discard_read_bytes(self);
-		if (self->mode == PLM_BUFFER_MODE_RING) {
-			self->total_size = 0;
-		}
-	}
-
-	// Do we have to resize to fit the new data?
-	size_t bytes_available = self->capacity - self->length;
-	if (bytes_available < length) {
-		size_t new_size = self->capacity;
-		do {
-			new_size *= 2;
-		} while (new_size - self->length < length);
-		self->bytes = (uint8_t *)PLM_REALLOC(self->bytes, new_size);
-		self->capacity = new_size;
-	}
-
-	memcpy(self->bytes + self->length, bytes, length);
-	self->length += length;
-	self->has_ended = FALSE;
-	return length;
-}
-
-void plm_buffer_signal_end(plm_buffer_t *self) {
-	self->total_size = self->length;
-}
-
-void plm_buffer_set_load_callback(plm_buffer_t *self, plm_buffer_load_callback fp, void *user) {
-	self->load_callback = fp;
-	self->load_callback_user_data = user;
-}
-
-void plm_buffer_rewind(plm_buffer_t *self) {
-	plm_buffer_seek(self, 0);
-}
-
-void plm_buffer_seek(plm_buffer_t *self, size_t pos) {
-	self->has_ended = FALSE;
-
-	if (self->mode == PLM_BUFFER_MODE_FILE) {
-		fseek(self->fh, pos, SEEK_SET);
-		self->bit_index = 0;
-		self->length = 0;
-	}
-	else if (self->mode == PLM_BUFFER_MODE_RING) {
-		if (pos != 0) {
-			// Seeking to non-0 is forbidden for dynamic-mem buffers
-			return; 
-		}
-		self->bit_index = 0;
-		self->length = 0;
-		self->total_size = 0;
-	}
-	else if (pos < self->length) {
-		self->bit_index = pos << 3;
-	}
-}
-
-size_t plm_buffer_tell(plm_buffer_t *self) {
-	return self->mode == PLM_BUFFER_MODE_FILE
-		? ftell(self->fh) + (self->bit_index >> 3) - self->length
-		: self->bit_index >> 3;
-}
-
-void plm_buffer_discard_read_bytes(plm_buffer_t *self) {
-	size_t byte_pos = self->bit_index >> 3;
-	if (byte_pos == self->length) {
-		self->bit_index = 0;
-		self->length = 0;
-	}
-	else if (byte_pos > 0) {
-		memmove(self->bytes, self->bytes + byte_pos, self->length - byte_pos);
-		self->bit_index -= byte_pos << 3;
-		self->length -= byte_pos;
-	}
-}
-
-void plm_buffer_load_file_callback(plm_buffer_t *self, void *user) {
-	PLM_UNUSED(user);
-	
-	if (self->discard_read_bytes) {
-		plm_buffer_discard_read_bytes(self);
-	}
-
-	size_t bytes_available = self->capacity - self->length;
-	size_t bytes_read = fread(self->bytes + self->length, 1, bytes_available, self->fh);
-	self->length += bytes_read;
-
-	if (bytes_read == 0) {
-		self->has_ended = TRUE;
-	}
-}
-
-int plm_buffer_has_ended(plm_buffer_t *self) {
-	return self->has_ended;
-}
-
-int plm_buffer_has(plm_buffer_t *self, size_t count) {
-	if (((self->length << 3) - self->bit_index) >= count) {
-		return TRUE;
-	}
-
-	if (self->load_callback) {
-		self->load_callback(self, self->load_callback_user_data);
-		
-		if (((self->length << 3) - self->bit_index) >= count) {
-			return TRUE;
-		}
-	}	
-	
-	if (self->total_size != 0 && self->length == self->total_size) {
-		self->has_ended = TRUE;
-	}
-	return FALSE;
-}
-
-int plm_buffer_read(plm_buffer_t *self, int count) {
-	if (!plm_buffer_has(self, count)) {
-		return 0;
-	}
-
-	int value = 0;
-	while (count) {
-		int current_byte = self->bytes[self->bit_index >> 3];
-
-		int remaining = 8 - (self->bit_index & 7); // Remaining bits in byte
-		int read = remaining < count ? remaining : count; // Bits in self run
-		int shift = remaining - read;
-		int mask = (0xff >> (8 - read));
-
-		value = (value << read) | ((current_byte & (mask << shift)) >> shift);
-
-		self->bit_index += read;
-		count -= read;
-	}
-
-	return value;
-}
-
-void plm_buffer_align(plm_buffer_t *self) {
-	self->bit_index = ((self->bit_index + 7) >> 3) << 3; // Align to next byte
-}
-
-void plm_buffer_skip(plm_buffer_t *self, size_t count) {
-	if (plm_buffer_has(self, count)) {
-		self->bit_index += count;
-	}
-}
-
-int plm_buffer_skip_bytes(plm_buffer_t *self, uint8_t v) {
-	plm_buffer_align(self);
-	int skipped = 0;
-	while (plm_buffer_has(self, 8) && self->bytes[self->bit_index >> 3] == v) {
-		self->bit_index += 8;
-		skipped++;
-	}
-	return skipped;
-}
-
-int plm_buffer_next_start_code(plm_buffer_t *self) {
-	plm_buffer_align(self);
-
-	while (plm_buffer_has(self, (5 << 3))) {
-		size_t byte_index = (self->bit_index) >> 3;
-		if (
-			self->bytes[byte_index] == 0x00 &&
-			self->bytes[byte_index + 1] == 0x00 &&
-			self->bytes[byte_index + 2] == 0x01
-		) {
-			self->bit_index = (byte_index + 4) << 3;
-			return self->bytes[byte_index + 3];
-		}
-		self->bit_index += 8;
-	}
-	return -1;
-}
-
-int plm_buffer_find_start_code(plm_buffer_t *self, int code) {
-	int current = 0;
-	while (TRUE) {
-		current = plm_buffer_next_start_code(self);
-		if (current == code || current == -1) {
-			return current;
-		}
-	}
-	return -1;
-}
-
-int plm_buffer_has_start_code(plm_buffer_t *self, int code) {
-	size_t previous_bit_index = self->bit_index;
-	int previous_discard_read_bytes = self->discard_read_bytes;
-	
-	self->discard_read_bytes = FALSE;
-	int current = plm_buffer_find_start_code(self, code);
-
-	self->bit_index = previous_bit_index;
-	self->discard_read_bytes = previous_discard_read_bytes;
-	return current;
-}
-
-int plm_buffer_peek_non_zero(plm_buffer_t *self, int bit_count) {
-	if (!plm_buffer_has(self, bit_count)) {
-		return FALSE;
-	}
-
-	int val = plm_buffer_read(self, bit_count);
-	self->bit_index -= bit_count;
-	return val != 0;
-}
-
-int16_t plm_buffer_read_vlc(plm_buffer_t *self, const plm_vlc_t *table) {
-	plm_vlc_t state = {0, 0};
-	do {
-		state = table[state.index + plm_buffer_read(self, 1)];
-	} while (state.index > 0);
-	return state.value;
-}
-
-uint16_t plm_buffer_read_vlc_uint(plm_buffer_t *self, const plm_vlc_uint_t *table) {
-	return (uint16_t)plm_buffer_read_vlc(self, (const plm_vlc_t *)table);
-}
-
-
-
-// ----------------------------------------------------------------------------
-// plm_demux implementation
-
-static const int PLM_START_PACK = 0xBA;
-static const int PLM_START_END = 0xB9;
-static const int PLM_START_SYSTEM = 0xBB;
-
-struct plm_demux_t {
-	plm_buffer_t *buffer;
-	int destroy_buffer_when_done;
-	double system_clock_ref;
-
-	size_t last_file_size;
-	double last_decoded_pts;
-	double start_time;
-	double duration;
-
-	int start_code;
-	int has_pack_header;
-	int has_system_header;
-	int has_headers;
-
-	int num_audio_streams;
-	int num_video_streams;
-	plm_packet_t current_packet;
-	plm_packet_t next_packet;
-};
-
-
-void plm_demux_buffer_seek(plm_demux_t *self, size_t pos);
-double plm_demux_decode_time(plm_demux_t *self);
-plm_packet_t *plm_demux_decode_packet(plm_demux_t *self, int type);
-plm_packet_t *plm_demux_get_packet(plm_demux_t *self);
-
-plm_demux_t *plm_demux_create(plm_buffer_t *buffer, int destroy_when_done) {
-	plm_demux_t *self = (plm_demux_t *)PLM_MALLOC(sizeof(plm_demux_t));
-	memset(self, 0, sizeof(plm_demux_t));
-
-	self->buffer = buffer;
-	self->destroy_buffer_when_done = destroy_when_done;
-
-	self->start_time = PLM_PACKET_INVALID_TS;
-	self->duration = PLM_PACKET_INVALID_TS;
-	self->start_code = -1;
-
-	plm_demux_has_headers(self);
-	return self;
-}
-
-void plm_demux_destroy(plm_demux_t *self) {
-	if (self->destroy_buffer_when_done) {
-		plm_buffer_destroy(self->buffer);
-	}
-	PLM_FREE(self);
-}
-
-int plm_demux_has_headers(plm_demux_t *self) {
-	if (self->has_headers) {
-		return TRUE;
-	}
-
-	// Decode pack header
-	if (!self->has_pack_header) {
-		if (
-			self->start_code != PLM_START_PACK &&
-			plm_buffer_find_start_code(self->buffer, PLM_START_PACK) == -1
-		) {
-			return FALSE;
-		}
-
-		self->start_code = PLM_START_PACK;
-		if (!plm_buffer_has(self->buffer, 64)) {
-			return FALSE;
-		}
-		self->start_code = -1;
-
-		if (plm_buffer_read(self->buffer, 4) != 0x02) {
-			return FALSE;
-		}
-
-		self->system_clock_ref = plm_demux_decode_time(self);
-		plm_buffer_skip(self->buffer, 1);
-		plm_buffer_skip(self->buffer, 22); // mux_rate * 50
-		plm_buffer_skip(self->buffer, 1);
-
-		self->has_pack_header = TRUE;
-	}
-
-	// Decode system header
-	if (!self->has_system_header) {
-		if (
-			self->start_code != PLM_START_SYSTEM &&
-			plm_buffer_find_start_code(self->buffer, PLM_START_SYSTEM) == -1
-		) {
-			return FALSE;
-		}
-
-		self->start_code = PLM_START_SYSTEM;
-		if (!plm_buffer_has(self->buffer, 56)) {
-			return FALSE;
-		}
-		self->start_code = -1;
-
-		plm_buffer_skip(self->buffer, 16); // header_length
-		plm_buffer_skip(self->buffer, 24); // rate bound
-		self->num_audio_streams = plm_buffer_read(self->buffer, 6);
-		plm_buffer_skip(self->buffer, 5); // misc flags
-		self->num_video_streams = plm_buffer_read(self->buffer, 5);
-
-		self->has_system_header = TRUE;
-	}
-
-	self->has_headers = TRUE;
-	return TRUE;
-}
-
-int plm_demux_probe(plm_demux_t *self, size_t probesize) {
-	int previous_pos = plm_buffer_tell(self->buffer);
-
-	int video_stream = FALSE;
-	int audio_streams[4] = {FALSE, FALSE, FALSE, FALSE};
-	do {
-		self->start_code = plm_buffer_next_start_code(self->buffer);
-		if (self->start_code == PLM_DEMUX_PACKET_VIDEO_1) {
-			video_stream = TRUE;
-		}
-		else if (
-			self->start_code >= PLM_DEMUX_PACKET_AUDIO_1 && 
-			self->start_code <= PLM_DEMUX_PACKET_AUDIO_4
-		) {
-			audio_streams[self->start_code - PLM_DEMUX_PACKET_AUDIO_1] = TRUE;
-		}
-	} while (
-		self->start_code != -1 && 
-		plm_buffer_tell(self->buffer) - previous_pos < probesize
-	);
-
-	self->num_video_streams = video_stream ? 1 : 0;
-	self->num_audio_streams = 0;
-	for (int i = 0; i < 4; i++) {
-		if (audio_streams[i]) {
-			self->num_audio_streams++;
-		}
-	}
-
-	plm_demux_buffer_seek(self, previous_pos);
-	return (self->num_video_streams || self->num_audio_streams);
-}
-
-int plm_demux_get_num_video_streams(plm_demux_t *self) {
-	return plm_demux_has_headers(self)
-		? self->num_video_streams
-		: 0;
-}
-
-int plm_demux_get_num_audio_streams(plm_demux_t *self) {
-	return plm_demux_has_headers(self)
-		? self->num_audio_streams
-		: 0;
-}
-
-void plm_demux_rewind(plm_demux_t *self) {
-	plm_buffer_rewind(self->buffer);
-	self->current_packet.length = 0;
-	self->next_packet.length = 0;
-	self->start_code = -1;
-}
-
-int plm_demux_has_ended(plm_demux_t *self) {
-	return plm_buffer_has_ended(self->buffer);
-}
-
-void plm_demux_buffer_seek(plm_demux_t *self, size_t pos) {
-	plm_buffer_seek(self->buffer, pos);
-	self->current_packet.length = 0;
-	self->next_packet.length = 0;
-	self->start_code = -1;
-}
-
-double plm_demux_get_start_time(plm_demux_t *self, int type) {
-	if (self->start_time != PLM_PACKET_INVALID_TS) {
-		return self->start_time;
-	}
-
-	int previous_pos = plm_buffer_tell(self->buffer);
-	int previous_start_code = self->start_code;
-	
-	// Find first video PTS
-	plm_demux_rewind(self);
-	do {
-		plm_packet_t *packet = plm_demux_decode(self);
-		if (!packet) {
-			break;
-		}
-		if (packet->type == type) {
-			self->start_time = packet->pts;
-		}
-	} while (self->start_time == PLM_PACKET_INVALID_TS);
-
-	plm_demux_buffer_seek(self, previous_pos);
-	self->start_code = previous_start_code;
-	return self->start_time;
-}
-
-double plm_demux_get_duration(plm_demux_t *self, int type) {
-	size_t file_size = plm_buffer_get_size(self->buffer);
-
-	if (
-		self->duration != PLM_PACKET_INVALID_TS &&
-		self->last_file_size == file_size
-	) {
-		return self->duration;
-	}
-
-	size_t previous_pos = plm_buffer_tell(self->buffer);
-	int previous_start_code = self->start_code;
-	
-	// Find last video PTS. Start searching 64kb from the end and go further 
-	// back if needed.
-	long start_range = 64 * 1024;
-	long max_range = 4096 * 1024;
-	for (long range = start_range; range <= max_range; range *= 2) {
-		long seek_pos = file_size - range;
-		if (seek_pos < 0) {
-			seek_pos = 0;
-			range = max_range; // Make sure to bail after this round
-		}
-		plm_demux_buffer_seek(self, seek_pos);
-		self->current_packet.length = 0;
-
-		double last_pts = PLM_PACKET_INVALID_TS;
-		plm_packet_t *packet = NULL;
-		while ((packet = plm_demux_decode(self))) {
-			if (packet->pts != PLM_PACKET_INVALID_TS && packet->type == type) {
-				last_pts = packet->pts;
-			}
-		}
-		if (last_pts != PLM_PACKET_INVALID_TS) {
-			self->duration = last_pts - plm_demux_get_start_time(self, type);
-			break;
-		}
-	}
-
-	plm_demux_buffer_seek(self, previous_pos);
-	self->start_code = previous_start_code;
-	self->last_file_size = file_size;
-	return self->duration;
-}
-
-plm_packet_t *plm_demux_seek(plm_demux_t *self, double seek_time, int type, int force_intra) {
-	if (!plm_demux_has_headers(self)) {
-		return NULL;
-	}
-
-	// Using the current time, current byte position and the average bytes per
-	// second for this file, try to jump to a byte position that hopefully has
-	// packets containing timestamps within one second before to the desired 
-	// seek_time.
-
-	// If we hit close to the seek_time scan through all packets to find the
-	// last one (just before the seek_time) containing an intra frame.
-	// Otherwise we should at least be closer than before. Calculate the bytes
-	// per second for the jumped range and jump again.
-
-	// The number of retries here is hard-limited to a generous amount. Usually
-	// the correct range is found after 1--5 jumps, even for files with very 
-	// variable bitrates. If significantly more jumps are needed, there's
-	// probably something wrong with the file and we just avoid getting into an
-	// infinite loop. 32 retries should be enough for anybody.
-
-	double duration = plm_demux_get_duration(self, type);
-	long file_size = plm_buffer_get_size(self->buffer);
-	long byterate = file_size / duration;
-
-	double cur_time = self->last_decoded_pts;
-	double scan_span = 1;
-
-	if (seek_time > duration) {
-		seek_time = duration;
-	}
-	else if (seek_time < 0) {
-		seek_time = 0;
-	}
-	seek_time += self->start_time;
-
-	for (int retry = 0; retry < 32; retry++) {
-		int found_packet_with_pts = FALSE;
-		int found_packet_in_range = FALSE;
-		long last_valid_packet_start = -1;
-		double first_packet_time = PLM_PACKET_INVALID_TS;
-
-		long cur_pos = plm_buffer_tell(self->buffer);
-
-		// Estimate byte offset and jump to it.
-		long offset = (seek_time - cur_time - scan_span) * byterate;
-		long seek_pos = cur_pos + offset;
-		if (seek_pos < 0) {
-			seek_pos = 0;
-		}
-		else if (seek_pos > file_size - 256) {
-			seek_pos = file_size - 256;
-		}
-
-		plm_demux_buffer_seek(self, seek_pos);
-
-		// Scan through all packets up to the seek_time to find the last packet
-		// containing an intra frame.
-		while (plm_buffer_find_start_code(self->buffer, type) != -1) {
-			long packet_start = plm_buffer_tell(self->buffer);
-			plm_packet_t *packet = plm_demux_decode_packet(self, type);
-
-			// Skip packet if it has no PTS
-			if (!packet || packet->pts == PLM_PACKET_INVALID_TS) {
-				continue;
-			}
-
-			// Bail scanning through packets if we hit one that is outside
-			// seek_time - scan_span.
-			// We also adjust the cur_time and byterate values here so the next 
-			// iteration can be a bit more precise.
-			if (packet->pts > seek_time || packet->pts < seek_time - scan_span) {
-				found_packet_with_pts = TRUE;
-				byterate = (seek_pos - cur_pos) / (packet->pts - cur_time);
-				cur_time = packet->pts;
-				break;
-			}
-
-			// If we are still here, it means this packet is in close range to
-			// the seek_time. If this is the first packet for this jump position
-			// record the PTS. If we later have to back off, when there was no
-			// intra frame in this range, we can lower the seek_time to not scan
-			// this range again.
-			if (!found_packet_in_range) {
-				found_packet_in_range = TRUE;
-				first_packet_time = packet->pts;
-			}
-
-			// Check if this is an intra frame packet. If so, record the buffer
-			// position of the start of this packet. We want to jump back to it 
-			// later, when we know it's the last intra frame before desired
-			// seek time.
-			if (force_intra) {
-				for (size_t i = 0; i < packet->length - 6; i++) {
-					// Find the START_PICTURE code
-					if (
-						packet->data[i] == 0x00 &&
-						packet->data[i + 1] == 0x00 &&
-						packet->data[i + 2] == 0x01 &&
-						packet->data[i + 3] == 0x00
-					) {
-						// Bits 11--13 in the picture header contain the frame 
-						// type, where 1=Intra
-						if ((packet->data[i + 5] & 0x38) == 8) {
-							last_valid_packet_start = packet_start;
-						}
-						break;
-					}
-				}
-			}
-
-			// If we don't want intra frames, just use the last PTS found.
-			else {
-				last_valid_packet_start = packet_start;
-			}
-		}
-
-		// If there was at least one intra frame in the range scanned above,
-		// our search is over. Jump back to the packet and decode it again.
-		if (last_valid_packet_start != -1) {
-			plm_demux_buffer_seek(self, last_valid_packet_start);
-			return plm_demux_decode_packet(self, type);
-		}
-
-		// If we hit the right range, but still found no intra frame, we have
-		// to increases the scan_span. This is done exponentially to also handle
-		// video files with very few intra frames.
-		else if (found_packet_in_range) {
-			scan_span *= 2;
-			seek_time = first_packet_time;
-		}
-
-		// If we didn't find any packet with a PTS, it probably means we reached
-		// the end of the file. Estimate byterate and cur_time accordingly.
-		else if (!found_packet_with_pts) {
-			byterate = (seek_pos - cur_pos) / (duration - cur_time);
-			cur_time = duration;
-		}
-	}
-
-	return NULL;
-}
-
-plm_packet_t *plm_demux_decode(plm_demux_t *self) {
-	if (!plm_demux_has_headers(self)) {
-		return NULL;
-	}
-
-	if (self->current_packet.length) {
-		size_t bits_till_next_packet = self->current_packet.length << 3;
-		if (!plm_buffer_has(self->buffer, bits_till_next_packet)) {
-			return NULL;
-		}
-		plm_buffer_skip(self->buffer, bits_till_next_packet);
-		self->current_packet.length = 0;
-	}
-
-	// Pending packet waiting for data?
-	if (self->next_packet.length) {
-		return plm_demux_get_packet(self);
-	}
-
-	// Pending packet waiting for header?
-	if (self->start_code != -1) {
-		return plm_demux_decode_packet(self, self->start_code);
-	}
-
-	do {
-		self->start_code = plm_buffer_next_start_code(self->buffer);
-		if (
-			self->start_code == PLM_DEMUX_PACKET_VIDEO_1 || 
-			self->start_code == PLM_DEMUX_PACKET_PRIVATE || (
-				self->start_code >= PLM_DEMUX_PACKET_AUDIO_1 && 
-				self->start_code <= PLM_DEMUX_PACKET_AUDIO_4
-			)
-		) {
-			return plm_demux_decode_packet(self, self->start_code);
-		}
-	} while (self->start_code != -1);
-
-	return NULL;
-}
-
-double plm_demux_decode_time(plm_demux_t *self) {
-	int64_t clock = plm_buffer_read(self->buffer, 3) << 30;
-	plm_buffer_skip(self->buffer, 1);
-	clock |= plm_buffer_read(self->buffer, 15) << 15;
-	plm_buffer_skip(self->buffer, 1);
-	clock |= plm_buffer_read(self->buffer, 15);
-	plm_buffer_skip(self->buffer, 1);
-	return (double)clock / 90000.0;
-}
-
-plm_packet_t *plm_demux_decode_packet(plm_demux_t *self, int type) {
-	if (!plm_buffer_has(self->buffer, 16 << 3)) {
-		return NULL;
-	}
-
-	self->start_code = -1;
-
-	self->next_packet.type = type;
-	self->next_packet.length = plm_buffer_read(self->buffer, 16);
-	self->next_packet.length -= plm_buffer_skip_bytes(self->buffer, 0xff); // stuffing
-
-	// skip P-STD
-	if (plm_buffer_read(self->buffer, 2) == 0x01) {
-		plm_buffer_skip(self->buffer, 16);
-		self->next_packet.length -= 2;
-	}
-
-	int pts_dts_marker = plm_buffer_read(self->buffer, 2);
-	if (pts_dts_marker == 0x03) {
-		self->next_packet.pts = plm_demux_decode_time(self);
-		self->last_decoded_pts = self->next_packet.pts;
-		plm_buffer_skip(self->buffer, 40); // skip dts
-		self->next_packet.length -= 10;
-	}
-	else if (pts_dts_marker == 0x02) {
-		self->next_packet.pts = plm_demux_decode_time(self);
-		self->last_decoded_pts = self->next_packet.pts;
-		self->next_packet.length -= 5;
-	}
-	else if (pts_dts_marker == 0x00) {
-		self->next_packet.pts = PLM_PACKET_INVALID_TS;
-		plm_buffer_skip(self->buffer, 4);
-		self->next_packet.length -= 1;
-	}
-	else {
-		return NULL; // invalid
-	}
-	
-	return plm_demux_get_packet(self);
-}
-
-plm_packet_t *plm_demux_get_packet(plm_demux_t *self) {
-	if (!plm_buffer_has(self->buffer, self->next_packet.length << 3)) {
-		return NULL;
-	}
-
-	self->current_packet.data = self->buffer->bytes + (self->buffer->bit_index >> 3);
-	self->current_packet.length = self->next_packet.length;
-	self->current_packet.type = self->next_packet.type;
-	self->current_packet.pts = self->next_packet.pts;
-
-	self->next_packet.length = 0;
-	return &self->current_packet;
-}
-
-
-
-// -----------------------------------------------------------------------------
-// plm_video implementation
-
-// Inspired by Java MPEG-1 Video Decoder and Player by Zoltan Korandi 
-// https://sourceforge.net/projects/javampeg1video/
-
-static const int PLM_VIDEO_PICTURE_TYPE_INTRA = 1;
-static const int PLM_VIDEO_PICTURE_TYPE_PREDICTIVE = 2;
-static const int PLM_VIDEO_PICTURE_TYPE_B = 3;
-
-static const int PLM_START_SEQUENCE = 0xB3;
-static const int PLM_START_SLICE_FIRST = 0x01;
-static const int PLM_START_SLICE_LAST = 0xAF;
-static const int PLM_START_PICTURE = 0x00;
-static const int PLM_START_EXTENSION = 0xB5;
-static const int PLM_START_USER_DATA = 0xB2;
-
-#define PLM_START_IS_SLICE(c) \
-	(c >= PLM_START_SLICE_FIRST && c <= PLM_START_SLICE_LAST)
-
-static const float PLM_VIDEO_PIXEL_ASPECT_RATIO[] = { // [jart]
-	1.0000, /* square pixels */
-	0.6735, /* 3:4? */
-	0.7031, /* MPEG-1 / MPEG-2 video encoding divergence? */
-	0.7615, 0.8055, 0.8437, 0.8935, 0.9157, 0.9815,
-	1.0255, 1.0695, 1.0950, 1.1575, 1.2051,
-};
-
-static const double PLM_VIDEO_PICTURE_RATE[] = {
-	0.000, 23.976, 24.000, 25.000, 29.970, 30.000, 50.000, 59.940,
-	60.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-};
-
-static const uint8_t PLM_VIDEO_ZIG_ZAG[] = {
-	 0,  1,  8, 16,  9,  2,  3, 10,
-	17, 24, 32, 25, 18, 11,  4,  5,
-	12, 19, 26, 33, 40, 48, 41, 34,
-	27, 20, 13,  6,  7, 14, 21, 28,
-	35, 42, 49, 56, 57, 50, 43, 36,
-	29, 22, 15, 23, 30, 37, 44, 51,
-	58, 59, 52, 45, 38, 31, 39, 46,
-	53, 60, 61, 54, 47, 55, 62, 63
-};
-
-static const uint8_t PLM_VIDEO_INTRA_QUANT_MATRIX[] = {
-	 8, 16, 19, 22, 26, 27, 29, 34,
-	16, 16, 22, 24, 27, 29, 34, 37,
-	19, 22, 26, 27, 29, 34, 34, 38,
-	22, 22, 26, 27, 29, 34, 37, 40,
-	22, 26, 27, 29, 32, 35, 40, 48,
-	26, 27, 29, 32, 35, 40, 48, 58,
-	26, 27, 29, 34, 38, 46, 56, 69,
-	27, 29, 35, 38, 46, 56, 69, 83
-};
-
-static const uint8_t PLM_VIDEO_NON_INTRA_QUANT_MATRIX[] = {
-	16, 16, 16, 16, 16, 16, 16, 16,
-	16, 16, 16, 16, 16, 16, 16, 16,
-	16, 16, 16, 16, 16, 16, 16, 16,
-	16, 16, 16, 16, 16, 16, 16, 16,
-	16, 16, 16, 16, 16, 16, 16, 16,
-	16, 16, 16, 16, 16, 16, 16, 16,
-	16, 16, 16, 16, 16, 16, 16, 16,
-	16, 16, 16, 16, 16, 16, 16, 16
-};
-
-static const uint8_t PLM_VIDEO_PREMULTIPLIER_MATRIX[] = {
-	32, 44, 42, 38, 32, 25, 17,  9,
-	44, 62, 58, 52, 44, 35, 24, 12,
-	42, 58, 55, 49, 42, 33, 23, 12,
-	38, 52, 49, 44, 38, 30, 20, 10,
-	32, 44, 42, 38, 32, 25, 17,  9,
-	25, 35, 33, 30, 25, 20, 14,  7,
-	17, 24, 23, 20, 17, 14,  9,  5,
-	 9, 12, 12, 10,  9,  7,  5,  2
-};
-
-static const plm_vlc_t PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT[] = {
-	{  1 << 1,    0}, {       0,    1},  //   0: x
-	{  2 << 1,    0}, {  3 << 1,    0},  //   1: 0x
-	{  4 << 1,    0}, {  5 << 1,    0},  //   2: 00x
-	{       0,    3}, {       0,    2},  //   3: 01x
-	{  6 << 1,    0}, {  7 << 1,    0},  //   4: 000x
-	{       0,    5}, {       0,    4},  //   5: 001x
-	{  8 << 1,    0}, {  9 << 1,    0},  //   6: 0000x
-	{       0,    7}, {       0,    6},  //   7: 0001x
-	{ 10 << 1,    0}, { 11 << 1,    0},  //   8: 0000 0x
-	{ 12 << 1,    0}, { 13 << 1,    0},  //   9: 0000 1x
-	{ 14 << 1,    0}, { 15 << 1,    0},  //  10: 0000 00x
-	{ 16 << 1,    0}, { 17 << 1,    0},  //  11: 0000 01x
-	{ 18 << 1,    0}, { 19 << 1,    0},  //  12: 0000 10x
-	{       0,    9}, {       0,    8},  //  13: 0000 11x
-	{      -1,    0}, { 20 << 1,    0},  //  14: 0000 000x
-	{      -1,    0}, { 21 << 1,    0},  //  15: 0000 001x
-	{ 22 << 1,    0}, { 23 << 1,    0},  //  16: 0000 010x
-	{       0,   15}, {       0,   14},  //  17: 0000 011x
-	{       0,   13}, {       0,   12},  //  18: 0000 100x
-	{       0,   11}, {       0,   10},  //  19: 0000 101x
-	{ 24 << 1,    0}, { 25 << 1,    0},  //  20: 0000 0001x
-	{ 26 << 1,    0}, { 27 << 1,    0},  //  21: 0000 0011x
-	{ 28 << 1,    0}, { 29 << 1,    0},  //  22: 0000 0100x
-	{ 30 << 1,    0}, { 31 << 1,    0},  //  23: 0000 0101x
-	{ 32 << 1,    0}, {      -1,    0},  //  24: 0000 0001 0x
-	{      -1,    0}, { 33 << 1,    0},  //  25: 0000 0001 1x
-	{ 34 << 1,    0}, { 35 << 1,    0},  //  26: 0000 0011 0x
-	{ 36 << 1,    0}, { 37 << 1,    0},  //  27: 0000 0011 1x
-	{ 38 << 1,    0}, { 39 << 1,    0},  //  28: 0000 0100 0x
-	{       0,   21}, {       0,   20},  //  29: 0000 0100 1x
-	{       0,   19}, {       0,   18},  //  30: 0000 0101 0x
-	{       0,   17}, {       0,   16},  //  31: 0000 0101 1x
-	{       0,   35}, {      -1,    0},  //  32: 0000 0001 00x
-	{      -1,    0}, {       0,   34},  //  33: 0000 0001 11x
-	{       0,   33}, {       0,   32},  //  34: 0000 0011 00x
-	{       0,   31}, {       0,   30},  //  35: 0000 0011 01x
-	{       0,   29}, {       0,   28},  //  36: 0000 0011 10x
-	{       0,   27}, {       0,   26},  //  37: 0000 0011 11x
-	{       0,   25}, {       0,   24},  //  38: 0000 0100 00x
-	{       0,   23}, {       0,   22},  //  39: 0000 0100 01x
-};
-
-static const plm_vlc_t PLM_VIDEO_MACROBLOCK_TYPE_INTRA[] = {
-	{  1 << 1,    0}, {       0,  0x01},  //   0: x
-	{      -1,    0}, {       0,  0x11},  //   1: 0x
-};
-
-static const plm_vlc_t PLM_VIDEO_MACROBLOCK_TYPE_PREDICTIVE[] = {
-	{  1 << 1,    0}, {       0, 0x0a},  //   0: x
-	{  2 << 1,    0}, {       0, 0x02},  //   1: 0x
-	{  3 << 1,    0}, {       0, 0x08},  //   2: 00x
-	{  4 << 1,    0}, {  5 << 1,    0},  //   3: 000x
-	{  6 << 1,    0}, {       0, 0x12},  //   4: 0000x
-	{       0, 0x1a}, {       0, 0x01},  //   5: 0001x
-	{      -1,    0}, {       0, 0x11},  //   6: 0000 0x
-};
-
-static const plm_vlc_t PLM_VIDEO_MACROBLOCK_TYPE_B[] = {
-	{  1 << 1,    0}, {  2 << 1,    0},  //   0: x
-	{  3 << 1,    0}, {  4 << 1,    0},  //   1: 0x
-	{       0, 0x0c}, {       0, 0x0e},  //   2: 1x
-	{  5 << 1,    0}, {  6 << 1,    0},  //   3: 00x
-	{       0, 0x04}, {       0, 0x06},  //   4: 01x
-	{  7 << 1,    0}, {  8 << 1,    0},  //   5: 000x
-	{       0, 0x08}, {       0, 0x0a},  //   6: 001x
-	{  9 << 1,    0}, { 10 << 1,    0},  //   7: 0000x
-	{       0, 0x1e}, {       0, 0x01},  //   8: 0001x
-	{      -1,    0}, {       0, 0x11},  //   9: 0000 0x
-	{       0, 0x16}, {       0, 0x1a},  //  10: 0000 1x
-};
-
-static const plm_vlc_t *PLM_VIDEO_MACROBLOCK_TYPE[] = {
-	NULL,
-	PLM_VIDEO_MACROBLOCK_TYPE_INTRA,
-	PLM_VIDEO_MACROBLOCK_TYPE_PREDICTIVE,
-	PLM_VIDEO_MACROBLOCK_TYPE_B
-};
-
-static const plm_vlc_t PLM_VIDEO_CODE_BLOCK_PATTERN[] = {
-	{  1 << 1,    0}, {  2 << 1,    0},  //   0: x
-	{  3 << 1,    0}, {  4 << 1,    0},  //   1: 0x
-	{  5 << 1,    0}, {  6 << 1,    0},  //   2: 1x
-	{  7 << 1,    0}, {  8 << 1,    0},  //   3: 00x
-	{  9 << 1,    0}, { 10 << 1,    0},  //   4: 01x
-	{ 11 << 1,    0}, { 12 << 1,    0},  //   5: 10x
-	{ 13 << 1,    0}, {       0,   60},  //   6: 11x
-	{ 14 << 1,    0}, { 15 << 1,    0},  //   7: 000x
-	{ 16 << 1,    0}, { 17 << 1,    0},  //   8: 001x
-	{ 18 << 1,    0}, { 19 << 1,    0},  //   9: 010x
-	{ 20 << 1,    0}, { 21 << 1,    0},  //  10: 011x
-	{ 22 << 1,    0}, { 23 << 1,    0},  //  11: 100x
-	{       0,   32}, {       0,   16},  //  12: 101x
-	{       0,    8}, {       0,    4},  //  13: 110x
-	{ 24 << 1,    0}, { 25 << 1,    0},  //  14: 0000x
-	{ 26 << 1,    0}, { 27 << 1,    0},  //  15: 0001x
-	{ 28 << 1,    0}, { 29 << 1,    0},  //  16: 0010x
-	{ 30 << 1,    0}, { 31 << 1,    0},  //  17: 0011x
-	{       0,   62}, {       0,    2},  //  18: 0100x
-	{       0,   61}, {       0,    1},  //  19: 0101x
-	{       0,   56}, {       0,   52},  //  20: 0110x
-	{       0,   44}, {       0,   28},  //  21: 0111x
-	{       0,   40}, {       0,   20},  //  22: 1000x
-	{       0,   48}, {       0,   12},  //  23: 1001x
-	{ 32 << 1,    0}, { 33 << 1,    0},  //  24: 0000 0x
-	{ 34 << 1,    0}, { 35 << 1,    0},  //  25: 0000 1x
-	{ 36 << 1,    0}, { 37 << 1,    0},  //  26: 0001 0x
-	{ 38 << 1,    0}, { 39 << 1,    0},  //  27: 0001 1x
-	{ 40 << 1,    0}, { 41 << 1,    0},  //  28: 0010 0x
-	{ 42 << 1,    0}, { 43 << 1,    0},  //  29: 0010 1x
-	{       0,   63}, {       0,    3},  //  30: 0011 0x
-	{       0,   36}, {       0,   24},  //  31: 0011 1x
-	{ 44 << 1,    0}, { 45 << 1,    0},  //  32: 0000 00x
-	{ 46 << 1,    0}, { 47 << 1,    0},  //  33: 0000 01x
-	{ 48 << 1,    0}, { 49 << 1,    0},  //  34: 0000 10x
-	{ 50 << 1,    0}, { 51 << 1,    0},  //  35: 0000 11x
-	{ 52 << 1,    0}, { 53 << 1,    0},  //  36: 0001 00x
-	{ 54 << 1,    0}, { 55 << 1,    0},  //  37: 0001 01x
-	{ 56 << 1,    0}, { 57 << 1,    0},  //  38: 0001 10x
-	{ 58 << 1,    0}, { 59 << 1,    0},  //  39: 0001 11x
-	{       0,   34}, {       0,   18},  //  40: 0010 00x
-	{       0,   10}, {       0,    6},  //  41: 0010 01x
-	{       0,   33}, {       0,   17},  //  42: 0010 10x
-	{       0,    9}, {       0,    5},  //  43: 0010 11x
-	{      -1,    0}, { 60 << 1,    0},  //  44: 0000 000x
-	{ 61 << 1,    0}, { 62 << 1,    0},  //  45: 0000 001x
-	{       0,   58}, {       0,   54},  //  46: 0000 010x
-	{       0,   46}, {       0,   30},  //  47: 0000 011x
-	{       0,   57}, {       0,   53},  //  48: 0000 100x
-	{       0,   45}, {       0,   29},  //  49: 0000 101x
-	{       0,   38}, {       0,   26},  //  50: 0000 110x
-	{       0,   37}, {       0,   25},  //  51: 0000 111x
-	{       0,   43}, {       0,   23},  //  52: 0001 000x
-	{       0,   51}, {       0,   15},  //  53: 0001 001x
-	{       0,   42}, {       0,   22},  //  54: 0001 010x
-	{       0,   50}, {       0,   14},  //  55: 0001 011x
-	{       0,   41}, {       0,   21},  //  56: 0001 100x
-	{       0,   49}, {       0,   13},  //  57: 0001 101x
-	{       0,   35}, {       0,   19},  //  58: 0001 110x
-	{       0,   11}, {       0,    7},  //  59: 0001 111x
-	{       0,   39}, {       0,   27},  //  60: 0000 0001x
-	{       0,   59}, {       0,   55},  //  61: 0000 0010x
-	{       0,   47}, {       0,   31},  //  62: 0000 0011x
-};
-
-static const plm_vlc_t PLM_VIDEO_MOTION[] = {
-	{  1 << 1,    0}, {       0,    0},  //   0: x
-	{  2 << 1,    0}, {  3 << 1,    0},  //   1: 0x
-	{  4 << 1,    0}, {  5 << 1,    0},  //   2: 00x
-	{       0,    1}, {       0,   -1},  //   3: 01x
-	{  6 << 1,    0}, {  7 << 1,    0},  //   4: 000x
-	{       0,    2}, {       0,   -2},  //   5: 001x
-	{  8 << 1,    0}, {  9 << 1,    0},  //   6: 0000x
-	{       0,    3}, {       0,   -3},  //   7: 0001x
-	{ 10 << 1,    0}, { 11 << 1,    0},  //   8: 0000 0x
-	{ 12 << 1,    0}, { 13 << 1,    0},  //   9: 0000 1x
-	{      -1,    0}, { 14 << 1,    0},  //  10: 0000 00x
-	{ 15 << 1,    0}, { 16 << 1,    0},  //  11: 0000 01x
-	{ 17 << 1,    0}, { 18 << 1,    0},  //  12: 0000 10x
-	{       0,    4}, {       0,   -4},  //  13: 0000 11x
-	{      -1,    0}, { 19 << 1,    0},  //  14: 0000 001x
-	{ 20 << 1,    0}, { 21 << 1,    0},  //  15: 0000 010x
-	{       0,    7}, {       0,   -7},  //  16: 0000 011x
-	{       0,    6}, {       0,   -6},  //  17: 0000 100x
-	{       0,    5}, {       0,   -5},  //  18: 0000 101x
-	{ 22 << 1,    0}, { 23 << 1,    0},  //  19: 0000 0011x
-	{ 24 << 1,    0}, { 25 << 1,    0},  //  20: 0000 0100x
-	{ 26 << 1,    0}, { 27 << 1,    0},  //  21: 0000 0101x
-	{ 28 << 1,    0}, { 29 << 1,    0},  //  22: 0000 0011 0x
-	{ 30 << 1,    0}, { 31 << 1,    0},  //  23: 0000 0011 1x
-	{ 32 << 1,    0}, { 33 << 1,    0},  //  24: 0000 0100 0x
-	{       0,   10}, {       0,  -10},  //  25: 0000 0100 1x
-	{       0,    9}, {       0,   -9},  //  26: 0000 0101 0x
-	{       0,    8}, {       0,   -8},  //  27: 0000 0101 1x
-	{       0,   16}, {       0,  -16},  //  28: 0000 0011 00x
-	{       0,   15}, {       0,  -15},  //  29: 0000 0011 01x
-	{       0,   14}, {       0,  -14},  //  30: 0000 0011 10x
-	{       0,   13}, {       0,  -13},  //  31: 0000 0011 11x
-	{       0,   12}, {       0,  -12},  //  32: 0000 0100 00x
-	{       0,   11}, {       0,  -11},  //  33: 0000 0100 01x
-};
-
-static const plm_vlc_t PLM_VIDEO_DCT_SIZE_LUMINANCE[] = {
-	{  1 << 1,    0}, {  2 << 1,    0},  //   0: x
-	{       0,    1}, {       0,    2},  //   1: 0x
-	{  3 << 1,    0}, {  4 << 1,    0},  //   2: 1x
-	{       0,    0}, {       0,    3},  //   3: 10x
-	{       0,    4}, {  5 << 1,    0},  //   4: 11x
-	{       0,    5}, {  6 << 1,    0},  //   5: 111x
-	{       0,    6}, {  7 << 1,    0},  //   6: 1111x
-	{       0,    7}, {  8 << 1,    0},  //   7: 1111 1x
-	{       0,    8}, {      -1,    0},  //   8: 1111 11x
-};
-
-static const plm_vlc_t PLM_VIDEO_DCT_SIZE_CHROMINANCE[] = {
-	{  1 << 1,    0}, {  2 << 1,    0},  //   0: x
-	{       0,    0}, {       0,    1},  //   1: 0x
-	{       0,    2}, {  3 << 1,    0},  //   2: 1x
-	{       0,    3}, {  4 << 1,    0},  //   3: 11x
-	{       0,    4}, {  5 << 1,    0},  //   4: 111x
-	{       0,    5}, {  6 << 1,    0},  //   5: 1111x
-	{       0,    6}, {  7 << 1,    0},  //   6: 1111 1x
-	{       0,    7}, {  8 << 1,    0},  //   7: 1111 11x
-	{       0,    8}, {      -1,    0},  //   8: 1111 111x
-};
-
-static const plm_vlc_t *PLM_VIDEO_DCT_SIZE[] = {
-	PLM_VIDEO_DCT_SIZE_LUMINANCE,
-	PLM_VIDEO_DCT_SIZE_CHROMINANCE,
-	PLM_VIDEO_DCT_SIZE_CHROMINANCE
-};
-
-
-//  dct_coeff bitmap:
-//    0xff00  run
-//    0x00ff  level
-
-//  Decoded values are unsigned. Sign bit follows in the stream.
-
-static const plm_vlc_uint_t PLM_VIDEO_DCT_COEFF[] = {
-	{  1 << 1,        0}, {       0,   0x0001},  //   0: x
-	{  2 << 1,        0}, {  3 << 1,        0},  //   1: 0x
-	{  4 << 1,        0}, {  5 << 1,        0},  //   2: 00x
-	{  6 << 1,        0}, {       0,   0x0101},  //   3: 01x
-	{  7 << 1,        0}, {  8 << 1,        0},  //   4: 000x
-	{  9 << 1,        0}, { 10 << 1,        0},  //   5: 001x
-	{       0,   0x0002}, {       0,   0x0201},  //   6: 010x
-	{ 11 << 1,        0}, { 12 << 1,        0},  //   7: 0000x
-	{ 13 << 1,        0}, { 14 << 1,        0},  //   8: 0001x
-	{ 15 << 1,        0}, {       0,   0x0003},  //   9: 0010x
-	{       0,   0x0401}, {       0,   0x0301},  //  10: 0011x
-	{ 16 << 1,        0}, {       0,   0xffff},  //  11: 0000 0x
-	{ 17 << 1,        0}, { 18 << 1,        0},  //  12: 0000 1x
-	{       0,   0x0701}, {       0,   0x0601},  //  13: 0001 0x
-	{       0,   0x0102}, {       0,   0x0501},  //  14: 0001 1x
-	{ 19 << 1,        0}, { 20 << 1,        0},  //  15: 0010 0x
-	{ 21 << 1,        0}, { 22 << 1,        0},  //  16: 0000 00x
-	{       0,   0x0202}, {       0,   0x0901},  //  17: 0000 10x
-	{       0,   0x0004}, {       0,   0x0801},  //  18: 0000 11x
-	{ 23 << 1,        0}, { 24 << 1,        0},  //  19: 0010 00x
-	{ 25 << 1,        0}, { 26 << 1,        0},  //  20: 0010 01x
-	{ 27 << 1,        0}, { 28 << 1,        0},  //  21: 0000 000x
-	{ 29 << 1,        0}, { 30 << 1,        0},  //  22: 0000 001x
-	{       0,   0x0d01}, {       0,   0x0006},  //  23: 0010 000x
-	{       0,   0x0c01}, {       0,   0x0b01},  //  24: 0010 001x
-	{       0,   0x0302}, {       0,   0x0103},  //  25: 0010 010x
-	{       0,   0x0005}, {       0,   0x0a01},  //  26: 0010 011x
-	{ 31 << 1,        0}, { 32 << 1,        0},  //  27: 0000 0000x
-	{ 33 << 1,        0}, { 34 << 1,        0},  //  28: 0000 0001x
-	{ 35 << 1,        0}, { 36 << 1,        0},  //  29: 0000 0010x
-	{ 37 << 1,        0}, { 38 << 1,        0},  //  30: 0000 0011x
-	{ 39 << 1,        0}, { 40 << 1,        0},  //  31: 0000 0000 0x
-	{ 41 << 1,        0}, { 42 << 1,        0},  //  32: 0000 0000 1x
-	{ 43 << 1,        0}, { 44 << 1,        0},  //  33: 0000 0001 0x
-	{ 45 << 1,        0}, { 46 << 1,        0},  //  34: 0000 0001 1x
-	{       0,   0x1001}, {       0,   0x0502},  //  35: 0000 0010 0x
-	{       0,   0x0007}, {       0,   0x0203},  //  36: 0000 0010 1x
-	{       0,   0x0104}, {       0,   0x0f01},  //  37: 0000 0011 0x
-	{       0,   0x0e01}, {       0,   0x0402},  //  38: 0000 0011 1x
-	{ 47 << 1,        0}, { 48 << 1,        0},  //  39: 0000 0000 00x
-	{ 49 << 1,        0}, { 50 << 1,        0},  //  40: 0000 0000 01x
-	{ 51 << 1,        0}, { 52 << 1,        0},  //  41: 0000 0000 10x
-	{ 53 << 1,        0}, { 54 << 1,        0},  //  42: 0000 0000 11x
-	{ 55 << 1,        0}, { 56 << 1,        0},  //  43: 0000 0001 00x
-	{ 57 << 1,        0}, { 58 << 1,        0},  //  44: 0000 0001 01x
-	{ 59 << 1,        0}, { 60 << 1,        0},  //  45: 0000 0001 10x
-	{ 61 << 1,        0}, { 62 << 1,        0},  //  46: 0000 0001 11x
-	{      -1,        0}, { 63 << 1,        0},  //  47: 0000 0000 000x
-	{ 64 << 1,        0}, { 65 << 1,        0},  //  48: 0000 0000 001x
-	{ 66 << 1,        0}, { 67 << 1,        0},  //  49: 0000 0000 010x
-	{ 68 << 1,        0}, { 69 << 1,        0},  //  50: 0000 0000 011x
-	{ 70 << 1,        0}, { 71 << 1,        0},  //  51: 0000 0000 100x
-	{ 72 << 1,        0}, { 73 << 1,        0},  //  52: 0000 0000 101x
-	{ 74 << 1,        0}, { 75 << 1,        0},  //  53: 0000 0000 110x
-	{ 76 << 1,        0}, { 77 << 1,        0},  //  54: 0000 0000 111x
-	{       0,   0x000b}, {       0,   0x0802},  //  55: 0000 0001 000x
-	{       0,   0x0403}, {       0,   0x000a},  //  56: 0000 0001 001x
-	{       0,   0x0204}, {       0,   0x0702},  //  57: 0000 0001 010x
-	{       0,   0x1501}, {       0,   0x1401},  //  58: 0000 0001 011x
-	{       0,   0x0009}, {       0,   0x1301},  //  59: 0000 0001 100x
-	{       0,   0x1201}, {       0,   0x0105},  //  60: 0000 0001 101x
-	{       0,   0x0303}, {       0,   0x0008},  //  61: 0000 0001 110x
-	{       0,   0x0602}, {       0,   0x1101},  //  62: 0000 0001 111x
-	{ 78 << 1,        0}, { 79 << 1,        0},  //  63: 0000 0000 0001x
-	{ 80 << 1,        0}, { 81 << 1,        0},  //  64: 0000 0000 0010x
-	{ 82 << 1,        0}, { 83 << 1,        0},  //  65: 0000 0000 0011x
-	{ 84 << 1,        0}, { 85 << 1,        0},  //  66: 0000 0000 0100x
-	{ 86 << 1,        0}, { 87 << 1,        0},  //  67: 0000 0000 0101x
-	{ 88 << 1,        0}, { 89 << 1,        0},  //  68: 0000 0000 0110x
-	{ 90 << 1,        0}, { 91 << 1,        0},  //  69: 0000 0000 0111x
-	{       0,   0x0a02}, {       0,   0x0902},  //  70: 0000 0000 1000x
-	{       0,   0x0503}, {       0,   0x0304},  //  71: 0000 0000 1001x
-	{       0,   0x0205}, {       0,   0x0107},  //  72: 0000 0000 1010x
-	{       0,   0x0106}, {       0,   0x000f},  //  73: 0000 0000 1011x
-	{       0,   0x000e}, {       0,   0x000d},  //  74: 0000 0000 1100x
-	{       0,   0x000c}, {       0,   0x1a01},  //  75: 0000 0000 1101x
-	{       0,   0x1901}, {       0,   0x1801},  //  76: 0000 0000 1110x
-	{       0,   0x1701}, {       0,   0x1601},  //  77: 0000 0000 1111x
-	{ 92 << 1,        0}, { 93 << 1,        0},  //  78: 0000 0000 0001 0x
-	{ 94 << 1,        0}, { 95 << 1,        0},  //  79: 0000 0000 0001 1x
-	{ 96 << 1,        0}, { 97 << 1,        0},  //  80: 0000 0000 0010 0x
-	{ 98 << 1,        0}, { 99 << 1,        0},  //  81: 0000 0000 0010 1x
-	{100 << 1,        0}, {101 << 1,        0},  //  82: 0000 0000 0011 0x
-	{102 << 1,        0}, {103 << 1,        0},  //  83: 0000 0000 0011 1x
-	{       0,   0x001f}, {       0,   0x001e},  //  84: 0000 0000 0100 0x
-	{       0,   0x001d}, {       0,   0x001c},  //  85: 0000 0000 0100 1x
-	{       0,   0x001b}, {       0,   0x001a},  //  86: 0000 0000 0101 0x
-	{       0,   0x0019}, {       0,   0x0018},  //  87: 0000 0000 0101 1x
-	{       0,   0x0017}, {       0,   0x0016},  //  88: 0000 0000 0110 0x
-	{       0,   0x0015}, {       0,   0x0014},  //  89: 0000 0000 0110 1x
-	{       0,   0x0013}, {       0,   0x0012},  //  90: 0000 0000 0111 0x
-	{       0,   0x0011}, {       0,   0x0010},  //  91: 0000 0000 0111 1x
-	{104 << 1,        0}, {105 << 1,        0},  //  92: 0000 0000 0001 00x
-	{106 << 1,        0}, {107 << 1,        0},  //  93: 0000 0000 0001 01x
-	{108 << 1,        0}, {109 << 1,        0},  //  94: 0000 0000 0001 10x
-	{110 << 1,        0}, {111 << 1,        0},  //  95: 0000 0000 0001 11x
-	{       0,   0x0028}, {       0,   0x0027},  //  96: 0000 0000 0010 00x
-	{       0,   0x0026}, {       0,   0x0025},  //  97: 0000 0000 0010 01x
-	{       0,   0x0024}, {       0,   0x0023},  //  98: 0000 0000 0010 10x
-	{       0,   0x0022}, {       0,   0x0021},  //  99: 0000 0000 0010 11x
-	{       0,   0x0020}, {       0,   0x010e},  // 100: 0000 0000 0011 00x
-	{       0,   0x010d}, {       0,   0x010c},  // 101: 0000 0000 0011 01x
-	{       0,   0x010b}, {       0,   0x010a},  // 102: 0000 0000 0011 10x
-	{       0,   0x0109}, {       0,   0x0108},  // 103: 0000 0000 0011 11x
-	{       0,   0x0112}, {       0,   0x0111},  // 104: 0000 0000 0001 000x
-	{       0,   0x0110}, {       0,   0x010f},  // 105: 0000 0000 0001 001x
-	{       0,   0x0603}, {       0,   0x1002},  // 106: 0000 0000 0001 010x
-	{       0,   0x0f02}, {       0,   0x0e02},  // 107: 0000 0000 0001 011x
-	{       0,   0x0d02}, {       0,   0x0c02},  // 108: 0000 0000 0001 100x
-	{       0,   0x0b02}, {       0,   0x1f01},  // 109: 0000 0000 0001 101x
-	{       0,   0x1e01}, {       0,   0x1d01},  // 110: 0000 0000 0001 110x
-	{       0,   0x1c01}, {       0,   0x1b01},  // 111: 0000 0000 0001 111x
-};
-
-typedef struct {
-	int full_px;
-	int is_set;
-	int r_size;
-	int h;
-	int v;
-} plm_video_motion_t;
-
-struct plm_video_t {
-	double framerate;
-	double pixel_aspect_ratio; // [jart]
-	double time;
-	int frames_decoded;
-	int width;
-	int height;
-	int mb_width;
-	int mb_height;
-	int mb_size;
-
-	int luma_width;
-	int luma_height;
-
-	int chroma_width;
-	int chroma_height;
-
-	int start_code;
-	int picture_type;
-
-	plm_video_motion_t motion_forward;
-	plm_video_motion_t motion_backward;
-
-	int has_sequence_header;
-
-	int quantizer_scale;
-	int slice_begin;
-	int macroblock_address;
-
-	int mb_row;
-	int mb_col;
-
-	int macroblock_type;
-	int macroblock_intra;
-
-	int dc_predictor[3];
-
-	plm_buffer_t *buffer;
-	int destroy_buffer_when_done;
-
-	plm_frame_t frame_current;
-	plm_frame_t frame_forward;
-	plm_frame_t frame_backward;
-
-	uint8_t *frames_data;
-
-	int block_data[64];
-	uint8_t intra_quant_matrix[64];
-	uint8_t non_intra_quant_matrix[64];
-
-	int has_reference_frame;
-	int assume_no_b_frames;
-};
-
-static inline uint8_t plm_clamp(int n) {
-	if (n > 255) {
-		n = 255;
-	}
-	else if (n < 0) {
-		n = 0;
-	}
-	return n;
-}
-
-int plm_video_decode_sequence_header(plm_video_t *self);
-void plm_video_init_frame(plm_video_t *self, plm_frame_t *frame, uint8_t *base);
-void plm_video_decode_picture(plm_video_t *self);
-void plm_video_decode_slice(plm_video_t *self, int slice);
-void plm_video_decode_macroblock(plm_video_t *self);
-void plm_video_decode_motion_vectors(plm_video_t *self);
-int plm_video_decode_motion_vector(plm_video_t *self, int r_size, int motion);
-void plm_video_predict_macroblock(plm_video_t *self);
-void plm_video_copy_macroblock(plm_video_t *self, plm_frame_t *s, int motion_h, int motion_v);
-void plm_video_interpolate_macroblock(plm_video_t *self, plm_frame_t *s, int motion_h, int motion_v);
-void plm_video_process_macroblock(plm_video_t *self, uint8_t *s, uint8_t *d, int mh, int mb, int bs, int interp);
-void plm_video_decode_block(plm_video_t *self, int block);
-void plm_video_idct(int *block);
-
-plm_video_t * plm_video_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done) {
-	plm_video_t *self = (plm_video_t *)PLM_MALLOC(sizeof(plm_video_t));
-	memset(self, 0, sizeof(plm_video_t));
-	
-	self->buffer = buffer;
-	self->destroy_buffer_when_done = destroy_when_done;
-
-	// Attempt to decode the sequence header
-	self->start_code = plm_buffer_find_start_code(self->buffer, PLM_START_SEQUENCE);
-	if (self->start_code != -1) {
-		plm_video_decode_sequence_header(self);
-	}
-	return self;
-}
-
-void plm_video_destroy(plm_video_t *self) {
-	if (self->destroy_buffer_when_done) {
-		plm_buffer_destroy(self->buffer);
-	}
-
-	if (self->has_sequence_header) {
-		PLM_FREE(self->frames_data);
-	}
-
-	PLM_FREE(self);
-}
-
-double plm_video_get_framerate(plm_video_t *self) {
-	return plm_video_has_header(self)
-		? self->framerate
-		: 0;
-}
-
-double plm_video_get_pixel_aspect_ratio(plm_video_t *self) { // [jart]
-	return plm_video_has_header(self)
-		? self->pixel_aspect_ratio
-		: 0;
-}
-
-int plm_video_get_width(plm_video_t *self) {
-	return plm_video_has_header(self)
-		? self->width
-		: 0;
-}
-
-int plm_video_get_height(plm_video_t *self) {
-	return plm_video_has_header(self)
-		? self->height
-		: 0;
-}
-
-void plm_video_set_no_delay(plm_video_t *self, int no_delay) {
-	self->assume_no_b_frames = no_delay;
-}
-
-double plm_video_get_time(plm_video_t *self) {
-	return self->time;
-}
-
-void plm_video_set_time(plm_video_t *self, double time) {
-	self->frames_decoded = self->framerate * time;
-	self->time = time;
-}
-
-void plm_video_rewind(plm_video_t *self) {
-	plm_buffer_rewind(self->buffer);
-	self->time = 0;
-	self->frames_decoded = 0;
-	self->has_reference_frame = FALSE;
-	self->start_code = -1;
-}
-
-int plm_video_has_ended(plm_video_t *self) {
-	return plm_buffer_has_ended(self->buffer);
-}
-
-plm_frame_t *plm_video_decode(plm_video_t *self) {
-	if (!plm_video_has_header(self)) {
-		return NULL;
-	}
-	
-	struct timespec tsc; // [jart]
-	tsc = timespec_real(); // [jart]
-
-	plm_frame_t *frame = NULL;
-	do {
-		if (self->start_code != PLM_START_PICTURE) {
-			self->start_code = plm_buffer_find_start_code(self->buffer, PLM_START_PICTURE);
-			
-			if (self->start_code == -1) {
-				// If we reached the end of the file and the previously decoded
-				// frame was a reference frame, we still have to return it.
-				if (
-					self->has_reference_frame &&
-					!self->assume_no_b_frames &&
-					plm_buffer_has_ended(self->buffer) && (
-						self->picture_type == PLM_VIDEO_PICTURE_TYPE_INTRA ||
-						self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE
-					)
-				) {
-					self->has_reference_frame = FALSE;
-					frame = &self->frame_backward;
-					break;
-				}
-
-				return NULL;
-			}
-		}
-
-		// Make sure we have a full picture in the buffer before attempting to
-		// decode it. Sadly, this can only be done by seeking for the start code
-		// of the next picture. Also, if we didn't find the start code for the
-		// next picture, but the source has ended, we assume that this last
-		// picture is in the buffer.
-		if (
-			plm_buffer_has_start_code(self->buffer, PLM_START_PICTURE) == -1 &&
-			!plm_buffer_has_ended(self->buffer)
-		) {
-			return NULL;
-		}
-		plm_buffer_discard_read_bytes(self->buffer);
-		
-		plm_video_decode_picture(self);
-
-		if (self->assume_no_b_frames) {
-			frame = &self->frame_backward;
-		}
-		else if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_B) {
-			frame = &self->frame_current;
-		}
-		else if (self->has_reference_frame) {
-			frame = &self->frame_forward;
-		}
-		else {
-			self->has_reference_frame = TRUE;
-		}
-	} while (!frame);
-	
-	frame->time = self->time;
-	self->frames_decoded++;
-	self->time = (double)self->frames_decoded / self->framerate;
-	
-	plmpegdecode_latency_ = timespec_tomicros(timespec_sub(timespec_real(), tsc));
-	return frame;
-}
-
-int plm_video_has_header(plm_video_t *self) {
-	if (self->has_sequence_header) {
-		return TRUE;
-	}
-
-	if (self->start_code != PLM_START_SEQUENCE) {
-		self->start_code = plm_buffer_find_start_code(self->buffer, PLM_START_SEQUENCE);
-	}
-	if (self->start_code == -1) {
-		return FALSE;
-	}
-	
-	if (!plm_video_decode_sequence_header(self)) {
-		return FALSE;
-	}
-
-	return TRUE;
-}
-
-int plm_video_decode_sequence_header(plm_video_t *self) {
-	int max_header_size = 64 + 2 * 64 * 8; // 64 bit header + 2x 64 byte matrix
-	if (!plm_buffer_has(self->buffer, max_header_size)) {
-		return FALSE;
-	}
-
-	self->width = plm_buffer_read(self->buffer, 12);
-	self->height = plm_buffer_read(self->buffer, 12);
-
-	if (self->width <= 0 || self->height <= 0) {
-		return FALSE;
-	}
-
-	// [jart] get pixel aspect ratio
-	int pixel_aspect_ratio_code;
-	pixel_aspect_ratio_code = plm_buffer_read(self->buffer, 4);
-	pixel_aspect_ratio_code -= 1;
-	if (pixel_aspect_ratio_code < 0)
-		pixel_aspect_ratio_code = 0;
-	int par_last = (sizeof(PLM_VIDEO_PIXEL_ASPECT_RATIO) /
-			sizeof(PLM_VIDEO_PIXEL_ASPECT_RATIO[0]) - 1);
-	if (pixel_aspect_ratio_code > par_last)
-		pixel_aspect_ratio_code = par_last;
-	self->pixel_aspect_ratio =
-		PLM_VIDEO_PIXEL_ASPECT_RATIO[pixel_aspect_ratio_code];
-
-	self->framerate = PLM_VIDEO_PICTURE_RATE[plm_buffer_read(self->buffer, 4)];
-
-	// Skip bit_rate, marker, buffer_size and constrained bit
-	plm_buffer_skip(self->buffer, 18 + 1 + 10 + 1);
-
-	// Load custom intra quant matrix?
-	if (plm_buffer_read(self->buffer, 1)) { 
-		for (int i = 0; i < 64; i++) {
-			int idx = PLM_VIDEO_ZIG_ZAG[i];
-			self->intra_quant_matrix[idx] = plm_buffer_read(self->buffer, 8);
-		}
-	}
-	else {
-		memcpy(self->intra_quant_matrix, PLM_VIDEO_INTRA_QUANT_MATRIX, 64);
-	}
-
-	// Load custom non intra quant matrix?
-	if (plm_buffer_read(self->buffer, 1)) { 
-		for (int i = 0; i < 64; i++) {
-			int idx = PLM_VIDEO_ZIG_ZAG[i];
-			self->non_intra_quant_matrix[idx] = plm_buffer_read(self->buffer, 8);
-		}
-	}
-	else {
-		memcpy(self->non_intra_quant_matrix, PLM_VIDEO_NON_INTRA_QUANT_MATRIX, 64);
-	}
-
-	self->mb_width = (self->width + 15) >> 4;
-	self->mb_height = (self->height + 15) >> 4;
-	self->mb_size = self->mb_width * self->mb_height;
-
-	self->luma_width = self->mb_width << 4;
-	self->luma_height = self->mb_height << 4;
-
-	self->chroma_width = self->mb_width << 3;
-	self->chroma_height = self->mb_height << 3;
-
-
-	// Allocate one big chunk of data for all 3 frames = 9 planes
-	size_t luma_plane_size = self->luma_width * self->luma_height;
-	size_t chroma_plane_size = self->chroma_width * self->chroma_height;
-	size_t frame_data_size = (luma_plane_size + 2 * chroma_plane_size);
-
-	self->frames_data = (uint8_t*)PLM_MALLOC(frame_data_size * 3);
-	plm_video_init_frame(self, &self->frame_current, self->frames_data + frame_data_size * 0);
-	plm_video_init_frame(self, &self->frame_forward, self->frames_data + frame_data_size * 1);
-	plm_video_init_frame(self, &self->frame_backward, self->frames_data + frame_data_size * 2);
-
-	self->has_sequence_header = TRUE;
-	return TRUE;
-}
-
-void plm_video_init_frame(plm_video_t *self, plm_frame_t *frame, uint8_t *base) {
-	size_t luma_plane_size = self->luma_width * self->luma_height;
-	size_t chroma_plane_size = self->chroma_width * self->chroma_height;
-
-	frame->width = self->width;
-	frame->height = self->height;
-	frame->y.width = self->luma_width;
-	frame->y.height = self->luma_height;
-	frame->y.data = base;
-
-	frame->cr.width = self->chroma_width;
-	frame->cr.height = self->chroma_height;
-	frame->cr.data = base + luma_plane_size;
-
-	frame->cb.width = self->chroma_width;
-	frame->cb.height = self->chroma_height;
-	frame->cb.data = base + luma_plane_size + chroma_plane_size;
-}
-
-void plm_video_decode_picture(plm_video_t *self) {
-	plm_buffer_skip(self->buffer, 10); // skip temporalReference
-	self->picture_type = plm_buffer_read(self->buffer, 3);
-	plm_buffer_skip(self->buffer, 16); // skip vbv_delay
-
-	// D frames or unknown coding type
-	if (self->picture_type <= 0 || self->picture_type > PLM_VIDEO_PICTURE_TYPE_B) {
-		return;
-	}
-
-	// Forward full_px, f_code
-	if (
-		self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE ||
-		self->picture_type == PLM_VIDEO_PICTURE_TYPE_B
-	) {
-		self->motion_forward.full_px = plm_buffer_read(self->buffer, 1);
-		int f_code = plm_buffer_read(self->buffer, 3);
-		if (f_code == 0) {
-			// Ignore picture with zero f_code
-			return;
-		}
-		self->motion_forward.r_size = f_code - 1;
-	}
-
-	// Backward full_px, f_code
-	if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_B) {
-		self->motion_backward.full_px = plm_buffer_read(self->buffer, 1);
-		int f_code = plm_buffer_read(self->buffer, 3);
-		if (f_code == 0) {
-			// Ignore picture with zero f_code
-			return;
-		}
-		self->motion_backward.r_size = f_code - 1;
-	}
-
-	plm_frame_t frame_temp = self->frame_forward;
-	if (
-		self->picture_type == PLM_VIDEO_PICTURE_TYPE_INTRA ||
-		self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE
-	) {
-		self->frame_forward = self->frame_backward;
-	}
-
-
-	// Find first slice start code; skip extension and user data
-	do {
-		self->start_code = plm_buffer_next_start_code(self->buffer);
-	} while (
-		self->start_code == PLM_START_EXTENSION || 
-		self->start_code == PLM_START_USER_DATA
-	);
-
-	// Decode all slices
-	while (PLM_START_IS_SLICE(self->start_code)) {
-		plm_video_decode_slice(self, self->start_code & 0x000000FF);
-		if (self->macroblock_address >= self->mb_size - 2) {
-			break;
-		}
-		self->start_code = plm_buffer_next_start_code(self->buffer);
-	}
-
-	// If this is a reference picture rotate the prediction pointers
-	if (
-		self->picture_type == PLM_VIDEO_PICTURE_TYPE_INTRA ||
-		self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE
-	) {
-		self->frame_backward = self->frame_current;
-		self->frame_current = frame_temp;
-	}
-}
-
-void plm_video_decode_slice(plm_video_t *self, int slice) {
-	self->slice_begin = TRUE;
-	self->macroblock_address = (slice - 1) * self->mb_width - 1;
-
-	// Reset motion vectors and DC predictors
-	self->motion_backward.h = self->motion_forward.h = 0;
-	self->motion_backward.v = self->motion_forward.v = 0;
-	self->dc_predictor[0] = 128;
-	self->dc_predictor[1] = 128;
-	self->dc_predictor[2] = 128;
-
-	self->quantizer_scale = plm_buffer_read(self->buffer, 5);
-
-	// Skip extra
-	while (plm_buffer_read(self->buffer, 1)) {
-		plm_buffer_skip(self->buffer, 8);
-	}
-
-	do {
-		plm_video_decode_macroblock(self);
-	} while (
-		self->macroblock_address < self->mb_size - 1 &&
-		plm_buffer_peek_non_zero(self->buffer, 23)
-	);
-}
-
-void plm_video_decode_macroblock(plm_video_t *self) {
-	// Decode increment
-	int increment = 0;
-	int t = plm_buffer_read_vlc(self->buffer, PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT);
-
-	while (t == 34) {
-		// macroblock_stuffing
-		t = plm_buffer_read_vlc(self->buffer, PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT);
-	}
-	while (t == 35) {
-		// macroblock_escape
-		increment += 33;
-		t = plm_buffer_read_vlc(self->buffer, PLM_VIDEO_MACROBLOCK_ADDRESS_INCREMENT);
-	}
-	increment += t;
-
-	// Process any skipped macroblocks
-	if (self->slice_begin) {
-		// The first increment of each slice is relative to beginning of the
-		// previous row, not the previous macroblock
-		self->slice_begin = FALSE;
-		self->macroblock_address += increment;
-	}
-	else {
-		if (self->macroblock_address + increment >= self->mb_size) {
-			return; // invalid
-		}
-		if (increment > 1) {
-			// Skipped macroblocks reset DC predictors
-			self->dc_predictor[0] = 128;
-			self->dc_predictor[1] = 128;
-			self->dc_predictor[2] = 128;
-
-			// Skipped macroblocks in P-pictures reset motion vectors
-			if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE) {
-				self->motion_forward.h = 0;
-				self->motion_forward.v = 0;
-			}
-		}
-
-		// Predict skipped macroblocks
-		while (increment > 1) {
-			self->macroblock_address++;
-			self->mb_row = self->macroblock_address / self->mb_width;
-			self->mb_col = self->macroblock_address % self->mb_width;
-
-			plm_video_predict_macroblock(self);
-			increment--;
-		}
-		self->macroblock_address++;
-	}
-
-	self->mb_row = self->macroblock_address / self->mb_width;
-	self->mb_col = self->macroblock_address % self->mb_width;
-
-	if (self->mb_col >= self->mb_width || self->mb_row >= self->mb_height) {
-		return; // corrupt stream;
-	}
-
-	// Process the current macroblock
-	const plm_vlc_t *table = PLM_VIDEO_MACROBLOCK_TYPE[self->picture_type];
-	self->macroblock_type = plm_buffer_read_vlc(self->buffer, table);
-
-	self->macroblock_intra = (self->macroblock_type & 0x01);
-	self->motion_forward.is_set = (self->macroblock_type & 0x08);
-	self->motion_backward.is_set = (self->macroblock_type & 0x04);
-
-	// Quantizer scale
-	if ((self->macroblock_type & 0x10) != 0) {
-		self->quantizer_scale = plm_buffer_read(self->buffer, 5);
-	}
-
-	if (self->macroblock_intra) {
-		// Intra-coded macroblocks reset motion vectors
-		self->motion_backward.h = self->motion_forward.h = 0;
-		self->motion_backward.v = self->motion_forward.v = 0;
-	}
-	else {
-		// Non-intra macroblocks reset DC predictors
-		self->dc_predictor[0] = 128;
-		self->dc_predictor[1] = 128;
-		self->dc_predictor[2] = 128;
-
-		plm_video_decode_motion_vectors(self);
-		plm_video_predict_macroblock(self);
-	}
-
-	// Decode blocks
-	int cbp = ((self->macroblock_type & 0x02) != 0)
-		? plm_buffer_read_vlc(self->buffer, PLM_VIDEO_CODE_BLOCK_PATTERN)
-		: (self->macroblock_intra ? 0x3f : 0);
-
-	for (int block = 0, mask = 0x20; block < 6; block++) {
-		if ((cbp & mask) != 0) {
-			plm_video_decode_block(self, block);
-		}
-		mask >>= 1;
-	}
-}
-
-void plm_video_decode_motion_vectors(plm_video_t *self) {
-
-	// Forward
-	if (self->motion_forward.is_set) {
-		int r_size = self->motion_forward.r_size;
-		self->motion_forward.h = plm_video_decode_motion_vector(self, r_size, self->motion_forward.h);
-		self->motion_forward.v = plm_video_decode_motion_vector(self, r_size, self->motion_forward.v);
-	}
-	else if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_PREDICTIVE) {
-		// No motion information in P-picture, reset vectors
-		self->motion_forward.h = 0;
-		self->motion_forward.v = 0;
-	}
-
-	if (self->motion_backward.is_set) {
-		int r_size = self->motion_backward.r_size;
-		self->motion_backward.h = plm_video_decode_motion_vector(self, r_size, self->motion_backward.h);
-		self->motion_backward.v = plm_video_decode_motion_vector(self, r_size, self->motion_backward.v);
-	}
-}
-
-int plm_video_decode_motion_vector(plm_video_t *self, int r_size, int motion) {
-	int fscale = 1 << r_size;
-	int m_code = plm_buffer_read_vlc(self->buffer, PLM_VIDEO_MOTION);
-	int r = 0;
-	int d;
-
-	if ((m_code != 0) && (fscale != 1)) {
-		r = plm_buffer_read(self->buffer, r_size);
-		d = ((abs(m_code) - 1) << r_size) + r + 1;
-		if (m_code < 0) {
-			d = -d;
-		}
-	}
-	else {
-		d = m_code;
-	}
-
-	motion += d;
-	if (motion > (fscale << 4) - 1) {
-		motion -= fscale << 5;
-	}
-	else if (motion < (int)((unsigned)(-fscale) << 4)) {  // [jart]
-		motion += fscale << 5;
-	}
-
-	return motion;
-}
-
-void plm_video_predict_macroblock(plm_video_t *self) {
-	int fw_h = self->motion_forward.h;
-	int fw_v = self->motion_forward.v;
-
-	if (self->motion_forward.full_px) {
-		fw_h <<= 1;
-		fw_v <<= 1;
-	}
-
-	if (self->picture_type == PLM_VIDEO_PICTURE_TYPE_B) {
-		int bw_h = self->motion_backward.h;
-		int bw_v = self->motion_backward.v;
-
-		if (self->motion_backward.full_px) {
-			bw_h <<= 1;
-			bw_v <<= 1;
-		}
-
-		if (self->motion_forward.is_set) {
-			plm_video_copy_macroblock(self, &self->frame_forward, fw_h, fw_v);
-			if (self->motion_backward.is_set) {
-				plm_video_interpolate_macroblock(self, &self->frame_backward, bw_h, bw_v);
-			}
-		}
-		else {
-			plm_video_copy_macroblock(self, &self->frame_backward, bw_h, bw_v);
-		}
-	}
-	else {
-		plm_video_copy_macroblock(self, &self->frame_forward, fw_h, fw_v);
-	}
-}
-
-void plm_video_copy_macroblock(plm_video_t *self, plm_frame_t *s, int motion_h, int motion_v) {
-	plm_frame_t *d = &self->frame_current;
-	plm_video_process_macroblock(self, s->y.data, d->y.data, motion_h, motion_v, 16, FALSE);
-	plm_video_process_macroblock(self, s->cr.data, d->cr.data, motion_h / 2, motion_v / 2, 8, FALSE);
-	plm_video_process_macroblock(self, s->cb.data, d->cb.data, motion_h / 2, motion_v / 2, 8, FALSE);
-}
-
-void plm_video_interpolate_macroblock(plm_video_t *self, plm_frame_t *s, int motion_h, int motion_v) {
-	plm_frame_t *d = &self->frame_current;
-	plm_video_process_macroblock(self, s->y.data, d->y.data, motion_h, motion_v, 16, TRUE);
-	plm_video_process_macroblock(self, s->cr.data, d->cr.data, motion_h / 2, motion_v / 2, 8, TRUE);
-	plm_video_process_macroblock(self, s->cb.data, d->cb.data, motion_h / 2, motion_v / 2, 8, TRUE);
-}
-
-#define PLM_BLOCK_SET(DEST, DEST_INDEX, DEST_WIDTH, SOURCE_INDEX, SOURCE_WIDTH, BLOCK_SIZE, OP) do { \
-	int dest_scan = DEST_WIDTH - BLOCK_SIZE; \
-	int source_scan = SOURCE_WIDTH - BLOCK_SIZE; \
-	for (int y = 0; y < BLOCK_SIZE; y++) { \
-		for (int x = 0; x < BLOCK_SIZE; x++) { \
-			DEST[DEST_INDEX] = OP; \
-			SOURCE_INDEX++; DEST_INDEX++; \
-		} \
-		SOURCE_INDEX += source_scan; \
-		DEST_INDEX += dest_scan; \
-	}} while(FALSE)
-
-void plm_video_process_macroblock(
-	plm_video_t *self, uint8_t *s, uint8_t *d,
-	int motion_h, int motion_v, int block_size, int interpolate
-) {
-	int dw = self->mb_width * block_size;
-
-	int hp = motion_h >> 1;
-	int vp = motion_v >> 1;
-	int odd_h = (motion_h & 1) == 1;
-	int odd_v = (motion_v & 1) == 1;
-
-	unsigned int si = ((self->mb_row * block_size) + vp) * dw + (self->mb_col * block_size) + hp;
-	unsigned int di = (self->mb_row * dw + self->mb_col) * block_size;
-	
-	unsigned int max_address = (dw * (self->mb_height * block_size - block_size + 1) - block_size);
-	if (si > max_address || di > max_address) {
-		return; // corrupt video
-	}
-
-	#define PLM_MB_CASE(INTERPOLATE, ODD_H, ODD_V, OP) \
-		case ((INTERPOLATE << 2) | (ODD_H << 1) | (ODD_V)): \
-			PLM_BLOCK_SET(d, di, dw, si, dw, block_size, OP); \
-			break
-
-	switch ((interpolate << 2) | (odd_h << 1) | (odd_v)) {
-		PLM_MB_CASE(0, 0, 0, (s[si]));
-		PLM_MB_CASE(0, 0, 1, (s[si] + s[si + dw] + 1) >> 1);
-		PLM_MB_CASE(0, 1, 0, (s[si] + s[si + 1] + 1) >> 1);
-		PLM_MB_CASE(0, 1, 1, (s[si] + s[si + 1] + s[si + dw] + s[si + dw + 1] + 2) >> 2);
-
-		PLM_MB_CASE(1, 0, 0, (d[di] + (s[si]) + 1) >> 1);
-		PLM_MB_CASE(1, 0, 1, (d[di] + ((s[si] + s[si + dw] + 1) >> 1) + 1) >> 1);
-		PLM_MB_CASE(1, 1, 0, (d[di] + ((s[si] + s[si + 1] + 1) >> 1) + 1) >> 1);
-		PLM_MB_CASE(1, 1, 1, (d[di] + ((s[si] + s[si + 1] + s[si + dw] + s[si + dw + 1] + 2) >> 2) + 1) >> 1);
-	}
-
-	#undef PLM_MB_CASE
-}
-
-void plm_video_decode_block(plm_video_t *self, int block) {
-
-	int n = 0;
-	uint8_t *quant_matrix;
-
-	// Decode DC coefficient of intra-coded blocks
-	if (self->macroblock_intra) {
-		int predictor;
-		int dct_size;
-
-		// DC prediction
-		int plane_index = block > 3 ? block - 3 : 0;
-		predictor = self->dc_predictor[plane_index];
-		dct_size = plm_buffer_read_vlc(self->buffer, PLM_VIDEO_DCT_SIZE[plane_index]);
-
-		// Read DC coeff
-		if (dct_size > 0) {
-			int differential = plm_buffer_read(self->buffer, dct_size);
-			if ((differential & (1 << (dct_size - 1))) != 0) {
-				self->block_data[0] = predictor + differential;
-			}
-			else {
-				self->block_data[0] = predictor + (-(1 << dct_size) | (differential + 1));
-			}
-		}
-		else {
-			self->block_data[0] = predictor;
-		}
-
-		// Save predictor value
-		self->dc_predictor[plane_index] = self->block_data[0];
-
-		// Dequantize + premultiply
-		self->block_data[0] <<= (3 + 5);
-
-		quant_matrix = self->intra_quant_matrix;
-		n = 1;
-	}
-	else {
-		quant_matrix = self->non_intra_quant_matrix;
-	}
-
-	// Decode AC coefficients (+DC for non-intra)
-	int level = 0;
-	while (TRUE) {
-		int run = 0;
-		uint16_t coeff = plm_buffer_read_vlc_uint(self->buffer, PLM_VIDEO_DCT_COEFF);
-
-		if ((coeff == 0x0001) && (n > 0) && (plm_buffer_read(self->buffer, 1) == 0)) {
-			// end_of_block
-			break;
-		}
-		if (coeff == 0xffff) {
-			// escape
-			run = plm_buffer_read(self->buffer, 6);
-			level = plm_buffer_read(self->buffer, 8);
-			if (level == 0) {
-				level = plm_buffer_read(self->buffer, 8);
-			}
-			else if (level == 128) {
-				level = plm_buffer_read(self->buffer, 8) - 256;
-			}
-			else if (level > 128) {
-				level = level - 256;
-			}
-		}
-		else {
-			run = coeff >> 8;
-			level = coeff & 0xff;
-			if (plm_buffer_read(self->buffer, 1)) {
-				level = -level;
-			}
-		}
-
-		n += run;
-		if (n < 0 || n >= 64) {
-			return; // invalid
-		}
-
-		int de_zig_zagged = PLM_VIDEO_ZIG_ZAG[n];
-		n++;
-
-		// Dequantize, oddify, clip
-		level = (unsigned)level << 1;  // [jart]
-		if (!self->macroblock_intra) {
-			level += (level < 0 ? -1 : 1);
-		}
-		level = (level * self->quantizer_scale * quant_matrix[de_zig_zagged]) >> 4;
-		if ((level & 1) == 0) {
-			level -= level > 0 ? 1 : -1;
-		}
-		if (level > 2047) {
-			level = 2047;
-		}
-		else if (level < -2048) {
-			level = -2048;
-		}
-
-		// Save premultiplied coefficient
-		self->block_data[de_zig_zagged] = level * PLM_VIDEO_PREMULTIPLIER_MATRIX[de_zig_zagged];
-	}
-
-	// Move block to its place
-	uint8_t *d;
-	int dw;
-	int di;
-
-	if (block < 4) {
-		d = self->frame_current.y.data;
-		dw = self->luma_width;
-		di = (self->mb_row * self->luma_width + self->mb_col) << 4;
-		if ((block & 1) != 0) {
-			di += 8;
-		}
-		if ((block & 2) != 0) {
-			di += self->luma_width << 3;
-		}
-	}
-	else {
-		d = (block == 4) ? self->frame_current.cb.data : self->frame_current.cr.data;
-		dw = self->chroma_width;
-		di = ((self->mb_row * self->luma_width) << 2) + (self->mb_col << 3);
-	}
-
-	int *s = self->block_data;
-	int si = 0;
-	if (self->macroblock_intra) {
-		// Overwrite (no prediction)
-		if (n == 1) {
-			int clamped = plm_clamp((s[0] + 128) >> 8);
-			PLM_BLOCK_SET(d, di, dw, si, 8, 8, clamped);
-			s[0] = 0;
-		}
-		else {
-			plm_video_idct(s);
-			PLM_BLOCK_SET(d, di, dw, si, 8, 8, plm_clamp(s[si]));
-			memset(self->block_data, 0, sizeof(self->block_data));
-		}
-	}
-	else {
-		// Add data to the predicted macroblock
-		if (n == 1) {
-			int value = (s[0] + 128) >> 8;
-			PLM_BLOCK_SET(d, di, dw, si, 8, 8, plm_clamp(d[di] + value));
-			s[0] = 0;
-		}
-		else {
-			plm_video_idct(s);
-			PLM_BLOCK_SET(d, di, dw, si, 8, 8, plm_clamp(d[di] + s[si]));
-			memset(self->block_data, 0, sizeof(self->block_data));
-		}
-	}
-}
-
-void plm_video_idct(int *block) {
-	int
-		b1, b3, b4, b6, b7, tmp1, tmp2, m0,
-		x0, x1, x2, x3, x4, y3, y4, y5, y6, y7;
-
-	// Transform columns
-	for (int i = 0; i < 8; ++i) {
-		b1 = block[4 * 8 + i];
-		b3 = block[2 * 8 + i] + block[6 * 8 + i];
-		b4 = block[5 * 8 + i] - block[3 * 8 + i];
-		tmp1 = block[1 * 8 + i] + block[7 * 8 + i];
-		tmp2 = block[3 * 8 + i] + block[5 * 8 + i];
-		b6 = block[1 * 8 + i] - block[7 * 8 + i];
-		b7 = tmp1 + tmp2;
-		m0 = block[0 * 8 + i];
-		x4 = ((b6 * 473 - b4 * 196 + 128) >> 8) - b7;
-		x0 = x4 - (((tmp1 - tmp2) * 362 + 128) >> 8);
-		x1 = m0 - b1;
-		x2 = (((block[2 * 8 + i] - block[6 * 8 + i]) * 362 + 128) >> 8) - b3;
-		x3 = m0 + b1;
-		y3 = x1 + x2;
-		y4 = x3 + b3;
-		y5 = x1 - x2;
-		y6 = x3 - b3;
-		y7 = -x0 - ((b4 * 473 + b6 * 196 + 128) >> 8);
-		block[0 * 8 + i] = b7 + y4;
-		block[1 * 8 + i] = x4 + y3;
-		block[2 * 8 + i] = y5 - x0;
-		block[3 * 8 + i] = y6 - y7;
-		block[4 * 8 + i] = y6 + y7;
-		block[5 * 8 + i] = x0 + y5;
-		block[6 * 8 + i] = y3 - x4;
-		block[7 * 8 + i] = y4 - b7;
-	}
-
-	// Transform rows
-	for (int i = 0; i < 64; i += 8) {
-		b1 = block[4 + i];
-		b3 = block[2 + i] + block[6 + i];
-		b4 = block[5 + i] - block[3 + i];
-		tmp1 = block[1 + i] + block[7 + i];
-		tmp2 = block[3 + i] + block[5 + i];
-		b6 = block[1 + i] - block[7 + i];
-		b7 = tmp1 + tmp2;
-		m0 = block[0 + i];
-		x4 = ((b6 * 473 - b4 * 196 + 128) >> 8) - b7;
-		x0 = x4 - (((tmp1 - tmp2) * 362 + 128) >> 8);
-		x1 = m0 - b1;
-		x2 = (((block[2 + i] - block[6 + i]) * 362 + 128) >> 8) - b3;
-		x3 = m0 + b1;
-		y3 = x1 + x2;
-		y4 = x3 + b3;
-		y5 = x1 - x2;
-		y6 = x3 - b3;
-		y7 = -x0 - ((b4 * 473 + b6 * 196 + 128) >> 8);
-		block[0 + i] = (b7 + y4 + 128) >> 8;
-		block[1 + i] = (x4 + y3 + 128) >> 8;
-		block[2 + i] = (y5 - x0 + 128) >> 8;
-		block[3 + i] = (y6 - y7 + 128) >> 8;
-		block[4 + i] = (y6 + y7 + 128) >> 8;
-		block[5 + i] = (x0 + y5 + 128) >> 8;
-		block[6 + i] = (y3 - x4 + 128) >> 8;
-		block[7 + i] = (y4 - b7 + 128) >> 8;
-	}
-}
-
-// YCbCr conversion following the BT.601 standard:
-// https://infogalactic.com/info/YCbCr#ITU-R_BT.601_conversion
-
-#define PLM_PUT_PIXEL(RI, GI, BI, Y_OFFSET, DEST_OFFSET) \
-	y = ((frame->y.data[y_index + Y_OFFSET]-16) * 76309) >> 16; \
-	dest[d_index + DEST_OFFSET + RI] = plm_clamp(y + r); \
-	dest[d_index + DEST_OFFSET + GI] = plm_clamp(y - g); \
-	dest[d_index + DEST_OFFSET + BI] = plm_clamp(y + b);
-
-#define PLM_DEFINE_FRAME_CONVERT_FUNCTION(NAME, BYTES_PER_PIXEL, RI, GI, BI) \
-	void NAME(plm_frame_t *frame, uint8_t *dest, int stride) { \
-		int cols = frame->width >> 1; \
-		int rows = frame->height >> 1; \
-		int yw = frame->y.width; \
-		int cw = frame->cb.width; \
-		for (int row = 0; row < rows; row++) { \
-			int c_index = row * cw; \
-			int y_index = row * 2 * yw; \
-			int d_index = row * 2 * stride; \
-			for (int col = 0; col < cols; col++) { \
-				int y; \
-				int cr = frame->cr.data[c_index] - 128; \
-				int cb = frame->cb.data[c_index] - 128; \
-				int r = (cr * 104597) >> 16; \
-				int g = (cb * 25674 + cr * 53278) >> 16; \
-				int b = (cb * 132201) >> 16; \
-				PLM_PUT_PIXEL(RI, GI, BI, 0,      0); \
-				PLM_PUT_PIXEL(RI, GI, BI, 1,      BYTES_PER_PIXEL); \
-				PLM_PUT_PIXEL(RI, GI, BI, yw,     stride); \
-				PLM_PUT_PIXEL(RI, GI, BI, yw + 1, stride + BYTES_PER_PIXEL); \
-				c_index += 1; \
-				y_index += 2; \
-				d_index += 2 * BYTES_PER_PIXEL; \
-			} \
-		} \
-	}
-
-PLM_DEFINE_FRAME_CONVERT_FUNCTION(plm_frame_to_rgb,  3, 0, 1, 2)
-PLM_DEFINE_FRAME_CONVERT_FUNCTION(plm_frame_to_bgr,  3, 2, 1, 0)
-PLM_DEFINE_FRAME_CONVERT_FUNCTION(plm_frame_to_rgba, 4, 0, 1, 2)
-PLM_DEFINE_FRAME_CONVERT_FUNCTION(plm_frame_to_bgra, 4, 2, 1, 0)
-PLM_DEFINE_FRAME_CONVERT_FUNCTION(plm_frame_to_argb, 4, 1, 2, 3)
-PLM_DEFINE_FRAME_CONVERT_FUNCTION(plm_frame_to_abgr, 4, 3, 2, 1)
-
-
-#undef PLM_PUT_PIXEL
-#undef PLM_DEFINE_FRAME_CONVERT_FUNCTION
-
-
-
-// -----------------------------------------------------------------------------
-// plm_audio implementation
-
-// Based on kjmp2 by Martin J. Fiedler
-// http://keyj.emphy.de/kjmp2/
-
-static const int PLM_AUDIO_FRAME_SYNC = 0x7ff;
-
-static const int PLM_AUDIO_MPEG_2_5 = 0x0;
-static const int PLM_AUDIO_MPEG_2 = 0x2;
-static const int PLM_AUDIO_MPEG_1 = 0x3;
-
-static const int PLM_AUDIO_LAYER_III = 0x1;
-static const int PLM_AUDIO_LAYER_II = 0x2;
-static const int PLM_AUDIO_LAYER_I = 0x3;
-
-static const int PLM_AUDIO_MODE_STEREO = 0x0;
-static const int PLM_AUDIO_MODE_JOINT_STEREO = 0x1;
-static const int PLM_AUDIO_MODE_DUAL_CHANNEL = 0x2;
-static const int PLM_AUDIO_MODE_MONO = 0x3;
-
-static const unsigned short PLM_AUDIO_SAMPLE_RATE[] = {
-	44100, 48000, 32000, 0, // MPEG-1
-	22050, 24000, 16000, 0  // MPEG-2
-};
-
-static const short PLM_AUDIO_BIT_RATE[] = {
-	32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384, // MPEG-1
-	 8, 16, 24, 32, 40, 48,  56,  64,  80,  96, 112, 128, 144, 160  // MPEG-2
-};
-
-static const int PLM_AUDIO_SCALEFACTOR_BASE[] = {
-	0x02000000, 0x01965FEA, 0x01428A30
-};
-
-static const float PLM_AUDIO_SYNTHESIS_WINDOW[] = {
-	     0.0,     -0.5,     -0.5,     -0.5,     -0.5,     -0.5,
-	    -0.5,     -1.0,     -1.0,     -1.0,     -1.0,     -1.5,
-	    -1.5,     -2.0,     -2.0,     -2.5,     -2.5,     -3.0,
-	    -3.5,     -3.5,     -4.0,     -4.5,     -5.0,     -5.5,
-	    -6.5,     -7.0,     -8.0,     -8.5,     -9.5,    -10.5,
-	   -12.0,    -13.0,    -14.5,    -15.5,    -17.5,    -19.0,
-	   -20.5,    -22.5,    -24.5,    -26.5,    -29.0,    -31.5,
-	   -34.0,    -36.5,    -39.5,    -42.5,    -45.5,    -48.5,
-	   -52.0,    -55.5,    -58.5,    -62.5,    -66.0,    -69.5,
-	   -73.5,    -77.0,    -80.5,    -84.5,    -88.0,    -91.5,
-	   -95.0,    -98.0,   -101.0,   -104.0,    106.5,    109.0,
-	   111.0,    112.5,    113.5,    114.0,    114.0,    113.5,
-	   112.0,    110.5,    107.5,    104.0,    100.0,     94.5,
-	    88.5,     81.5,     73.0,     63.5,     53.0,     41.5,
-	    28.5,     14.5,     -1.0,    -18.0,    -36.0,    -55.5,
-	   -76.5,    -98.5,   -122.0,   -147.0,   -173.5,   -200.5,
-	  -229.5,   -259.5,   -290.5,   -322.5,   -355.5,   -389.5,
-	  -424.0,   -459.5,   -495.5,   -532.0,   -568.5,   -605.0,
-	  -641.5,   -678.0,   -714.0,   -749.0,   -783.5,   -817.0,
-	  -849.0,   -879.5,   -908.5,   -935.0,   -959.5,   -981.0,
-	 -1000.5,  -1016.0,  -1028.5,  -1037.5,  -1042.5,  -1043.5,
-	 -1040.0,  -1031.5,   1018.5,   1000.0,    976.0,    946.5,
-	   911.0,    869.5,    822.0,    767.5,    707.0,    640.0,
-	   565.5,    485.0,    397.0,    302.5,    201.0,     92.5,
-	   -22.5,   -144.0,   -272.5,   -407.0,   -547.5,   -694.0,
-	  -846.0,  -1003.0,  -1165.0,  -1331.5,  -1502.0,  -1675.5,
-	 -1852.5,  -2031.5,  -2212.5,  -2394.0,  -2576.5,  -2758.5,
-	 -2939.5,  -3118.5,  -3294.5,  -3467.5,  -3635.5,  -3798.5,
-	 -3955.0,  -4104.5,  -4245.5,  -4377.5,  -4499.0,  -4609.5,
-	 -4708.0,  -4792.5,  -4863.5,  -4919.0,  -4958.0,  -4979.5,
-	 -4983.0,  -4967.5,  -4931.5,  -4875.0,  -4796.0,  -4694.5,
-	 -4569.5,  -4420.0,  -4246.0,  -4046.0,  -3820.0,  -3567.0,
-	  3287.0,   2979.5,   2644.0,   2280.5,   1888.0,   1467.5,
-	  1018.5,    541.0,     35.0,   -499.0,  -1061.0,  -1650.0,
-	 -2266.5,  -2909.0,  -3577.0,  -4270.0,  -4987.5,  -5727.5,
-	 -6490.0,  -7274.0,  -8077.5,  -8899.5,  -9739.0, -10594.5,
-	-11464.5, -12347.0, -13241.0, -14144.5, -15056.0, -15973.5,
-	-16895.5, -17820.0, -18744.5, -19668.0, -20588.0, -21503.0,
-	-22410.5, -23308.5, -24195.0, -25068.5, -25926.5, -26767.0,
-	-27589.0, -28389.0, -29166.5, -29919.0, -30644.5, -31342.0,
-	-32009.5, -32645.0, -33247.0, -33814.5, -34346.0, -34839.5,
-	-35295.0, -35710.0, -36084.5, -36417.5, -36707.5, -36954.0,
-	-37156.5, -37315.0, -37428.0, -37496.0,  37519.0,  37496.0,
-	 37428.0,  37315.0,  37156.5,  36954.0,  36707.5,  36417.5,
-	 36084.5,  35710.0,  35295.0,  34839.5,  34346.0,  33814.5,
-	 33247.0,  32645.0,  32009.5,  31342.0,  30644.5,  29919.0,
-	 29166.5,  28389.0,  27589.0,  26767.0,  25926.5,  25068.5,
-	 24195.0,  23308.5,  22410.5,  21503.0,  20588.0,  19668.0,
-	 18744.5,  17820.0,  16895.5,  15973.5,  15056.0,  14144.5,
-	 13241.0,  12347.0,  11464.5,  10594.5,   9739.0,   8899.5,
-	  8077.5,   7274.0,   6490.0,   5727.5,   4987.5,   4270.0,
-	  3577.0,   2909.0,   2266.5,   1650.0,   1061.0,    499.0,
-	   -35.0,   -541.0,  -1018.5,  -1467.5,  -1888.0,  -2280.5,
-	 -2644.0,  -2979.5,   3287.0,   3567.0,   3820.0,   4046.0,
-	  4246.0,   4420.0,   4569.5,   4694.5,   4796.0,   4875.0,
-	  4931.5,   4967.5,   4983.0,   4979.5,   4958.0,   4919.0,
-	  4863.5,   4792.5,   4708.0,   4609.5,   4499.0,   4377.5,
-	  4245.5,   4104.5,   3955.0,   3798.5,   3635.5,   3467.5,
-	  3294.5,   3118.5,   2939.5,   2758.5,   2576.5,   2394.0,
-	  2212.5,   2031.5,   1852.5,   1675.5,   1502.0,   1331.5,
-	  1165.0,   1003.0,    846.0,    694.0,    547.5,    407.0,
-	   272.5,    144.0,     22.5,    -92.5,   -201.0,   -302.5,
-	  -397.0,   -485.0,   -565.5,   -640.0,   -707.0,   -767.5,
-	  -822.0,   -869.5,   -911.0,   -946.5,   -976.0,  -1000.0,
-	  1018.5,   1031.5,   1040.0,   1043.5,   1042.5,   1037.5,
-	  1028.5,   1016.0,   1000.5,    981.0,    959.5,    935.0,
-	   908.5,    879.5,    849.0,    817.0,    783.5,    749.0,
-	   714.0,    678.0,    641.5,    605.0,    568.5,    532.0,
-	   495.5,    459.5,    424.0,    389.5,    355.5,    322.5,
-	   290.5,    259.5,    229.5,    200.5,    173.5,    147.0,
-	   122.0,     98.5,     76.5,     55.5,     36.0,     18.0,
-	     1.0,    -14.5,    -28.5,    -41.5,    -53.0,    -63.5,
-	   -73.0,    -81.5,    -88.5,    -94.5,   -100.0,   -104.0,
-	  -107.5,   -110.5,   -112.0,   -113.5,   -114.0,   -114.0,
-	  -113.5,   -112.5,   -111.0,   -109.0,    106.5,    104.0,
-	   101.0,     98.0,     95.0,     91.5,     88.0,     84.5,
-	    80.5,     77.0,     73.5,     69.5,     66.0,     62.5,
-	    58.5,     55.5,     52.0,     48.5,     45.5,     42.5,
-	    39.5,     36.5,     34.0,     31.5,     29.0,     26.5,
-	    24.5,     22.5,     20.5,     19.0,     17.5,     15.5,
-	    14.5,     13.0,     12.0,     10.5,      9.5,      8.5,
-	     8.0,      7.0,      6.5,      5.5,      5.0,      4.5,
-	     4.0,      3.5,      3.5,      3.0,      2.5,      2.5,
-	     2.0,      2.0,      1.5,      1.5,      1.0,      1.0,
-	     1.0,      1.0,      0.5,      0.5,      0.5,      0.5,
-	     0.5,      0.5
-};
-
-// Quantizer lookup, step 1: bitrate classes
-static const uint8_t PLM_AUDIO_QUANT_LUT_STEP_1[2][16] = {
-	// 32, 48, 56, 64, 80, 96,112,128,160,192,224,256,320,384 <- bitrate
-	{ 0,  0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2 }, // mono
-	// 16, 24, 28, 32, 40, 48, 56, 64, 80, 96,112,128,160,192 <- bitrate / chan
-	{ 0,  0,  0,  0,  0,  0,  1,  1,  1,  2,  2,  2,  2,  2 } // stereo
-};
-
-// Quantizer lookup, step 2: bitrate class, sample rate -> B2 table idx, sblimit
-#define PLM_AUDIO_QUANT_TAB_A (27 | 64)   // Table 3-B.2a: high-rate, sblimit = 27
-#define PLM_AUDIO_QUANT_TAB_B (30 | 64)   // Table 3-B.2b: high-rate, sblimit = 30
-#define PLM_AUDIO_QUANT_TAB_C 8           // Table 3-B.2c:  low-rate, sblimit =  8
-#define PLM_AUDIO_QUANT_TAB_D 12          // Table 3-B.2d:  low-rate, sblimit = 12
-
-static const uint8_t QUANT_LUT_STEP_2[3][3] = {
-	//44.1 kHz,              48 kHz,                32 kHz
-	{ PLM_AUDIO_QUANT_TAB_C, PLM_AUDIO_QUANT_TAB_C, PLM_AUDIO_QUANT_TAB_D }, // 32 - 48 kbit/sec/ch
-	{ PLM_AUDIO_QUANT_TAB_A, PLM_AUDIO_QUANT_TAB_A, PLM_AUDIO_QUANT_TAB_A }, // 56 - 80 kbit/sec/ch
-	{ PLM_AUDIO_QUANT_TAB_B, PLM_AUDIO_QUANT_TAB_A, PLM_AUDIO_QUANT_TAB_B }  // 96+	 kbit/sec/ch
-};
-
-// Quantizer lookup, step 3: B2 table, subband -> nbal, row index
-// (upper 4 bits: nbal, lower 4 bits: row index)
-static const uint8_t PLM_AUDIO_QUANT_LUT_STEP_3[3][32] = {
-	// Low-rate table (3-B.2c and 3-B.2d)
-	{
-		0x44,0x44,
-		0x34,0x34,0x34,0x34,0x34,0x34,0x34,0x34,0x34,0x34
-	},
-	// High-rate table (3-B.2a and 3-B.2b)
-	{
-		0x43,0x43,0x43,
-		0x42,0x42,0x42,0x42,0x42,0x42,0x42,0x42,
-		0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,0x31,
-		0x20,0x20,0x20,0x20,0x20,0x20,0x20
-	},
-	// MPEG-2 LSR table (B.2 in ISO 13818-3)
-	{
-		0x45,0x45,0x45,0x45,
-		0x34,0x34,0x34,0x34,0x34,0x34,0x34,
-		0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,
-		0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24,0x24
-	}
-};
-
-// Quantizer lookup, step 4: table row, allocation[] value -> quant table index
-static const uint8_t PLM_AUDIO_QUANT_LUT_STEP_4[6][16] = {
-	{ 0, 1, 2, 17 },
-	{ 0, 1, 2,  3, 4, 5, 6, 17 },
-	{ 0, 1, 2,  3, 4, 5, 6,  7,  8,  9, 10, 11, 12, 13, 14, 17 },
-	{ 0, 1, 3,  5, 6, 7, 8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
-	{ 0, 1, 2,  4, 5, 6, 7,  8,  9, 10, 11, 12, 13, 14, 15, 17 },
-	{ 0, 1, 2,  3, 4, 5, 6,  7,  8,  9, 10, 11, 12, 13, 14, 15 }
-};
-
-typedef struct plm_quantizer_spec_t {
-	unsigned short levels;
-	unsigned char group;
-	unsigned char bits;
-} plm_quantizer_spec_t;
-
-static const plm_quantizer_spec_t PLM_AUDIO_QUANT_TAB[] = {
-	{     3, 1,  5 },  //  1
-	{     5, 1,  7 },  //  2
-	{     7, 0,  3 },  //  3
-	{     9, 1, 10 },  //  4
-	{    15, 0,  4 },  //  5
-	{    31, 0,  5 },  //  6
-	{    63, 0,  6 },  //  7
-	{   127, 0,  7 },  //  8
-	{   255, 0,  8 },  //  9
-	{   511, 0,  9 },  // 10
-	{  1023, 0, 10 },  // 11
-	{  2047, 0, 11 },  // 12
-	{  4095, 0, 12 },  // 13
-	{  8191, 0, 13 },  // 14
-	{ 16383, 0, 14 },  // 15
-	{ 32767, 0, 15 },  // 16
-	{ 65535, 0, 16 }   // 17
-};
-
-struct plm_audio_t {
-	double time;
-	int samples_decoded;
-	int samplerate_index;
-	int bitrate_index;
-	int version;
-	int layer;
-	int mode;
-	int bound;
-	int v_pos;
-	int next_frame_data_size;
-	int has_header;
-	
-	plm_buffer_t *buffer;
-	int destroy_buffer_when_done;
-
-	const plm_quantizer_spec_t *allocation[2][32];
-	uint8_t scale_factor_info[2][32];
-	int scale_factor[2][32][3];
-	int sample[2][32][3];
-
-	plm_samples_t samples;
-	float D[1024];
-	float V[2][1024];
-	float U[32];
-};
-
-int plm_audio_find_frame_sync(plm_audio_t *self);
-int plm_audio_decode_header(plm_audio_t *self);
-void plm_audio_decode_frame(plm_audio_t *self);
-const plm_quantizer_spec_t *plm_audio_read_allocation(plm_audio_t *self, int sb, int tab3);
-void plm_audio_read_samples(plm_audio_t *self, int ch, int sb, int part); 
-void plm_audio_idct36(int s[32][3], int ss, float *d, int dp);
-
-plm_audio_t *plm_audio_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done) {
-	plm_audio_t *self = (plm_audio_t *)PLM_MALLOC(sizeof(plm_audio_t));
-	memset(self, 0, sizeof(plm_audio_t));
-
-	self->samples.count = PLM_AUDIO_SAMPLES_PER_FRAME;
-	self->buffer = buffer;
-	self->destroy_buffer_when_done = destroy_when_done;
-	self->samplerate_index = 3; // Indicates 0
-
-	memcpy(self->D, PLM_AUDIO_SYNTHESIS_WINDOW, 512 * sizeof(float));
-	memcpy(self->D + 512, PLM_AUDIO_SYNTHESIS_WINDOW, 512 * sizeof(float));
-
-	// Attempt to decode first header
-	self->next_frame_data_size = plm_audio_decode_header(self);
-
-	return self;
-}
-
-void plm_audio_destroy(plm_audio_t *self) {
-	if (self->destroy_buffer_when_done) {
-		plm_buffer_destroy(self->buffer);
-	}
-	PLM_FREE(self);
-}
-
-int plm_audio_has_header(plm_audio_t *self) {
-	if (self->has_header) {
-		return TRUE;
-	}
-	
-	self->next_frame_data_size = plm_audio_decode_header(self);
-	return self->has_header;
-}
-
-int plm_audio_get_samplerate(plm_audio_t *self) {
-	return plm_audio_has_header(self)
-		? PLM_AUDIO_SAMPLE_RATE[self->samplerate_index]
-		: 0;
-}
-
-double plm_audio_get_time(plm_audio_t *self) {
-	return self->time;
-}
-
-void plm_audio_set_time(plm_audio_t *self, double time) {
-	self->samples_decoded = time * 
-		(double)PLM_AUDIO_SAMPLE_RATE[self->samplerate_index];
-	self->time = time;
-}
-
-void plm_audio_rewind(plm_audio_t *self) {
-	plm_buffer_rewind(self->buffer);
-	self->time = 0;
-	self->samples_decoded = 0;
-	self->next_frame_data_size = 0;
-}
-
-int plm_audio_has_ended(plm_audio_t *self) {
-	return plm_buffer_has_ended(self->buffer);
-}
-
-plm_samples_t *plm_audio_decode(plm_audio_t *self) {
-	// Do we have at least enough information to decode the frame header?
-	if (!self->next_frame_data_size) {
-		if (!plm_buffer_has(self->buffer, 48)) {
-			return NULL;
-		}
-		self->next_frame_data_size = plm_audio_decode_header(self);
-	}
-
-	if (
-		self->next_frame_data_size == 0 ||
-		!plm_buffer_has(self->buffer, self->next_frame_data_size << 3)
-	) {
-		return NULL;
-	}
-
-	plm_audio_decode_frame(self);
-	self->next_frame_data_size = 0;
-	
-	self->samples.time = self->time;
-
-	self->samples_decoded += PLM_AUDIO_SAMPLES_PER_FRAME;
-	self->time = (double)self->samples_decoded / 
-		(double)PLM_AUDIO_SAMPLE_RATE[self->samplerate_index];
-	
-	return &self->samples;
-}
-
-int plm_audio_find_frame_sync(plm_audio_t *self) {
-	size_t i;
-	for (i = self->buffer->bit_index >> 3; i < self->buffer->length-1; i++) {
-		if (
-			self->buffer->bytes[i] == 0xFF &&
-			(self->buffer->bytes[i+1] & 0xFE) == 0xFC
-		) {
-			self->buffer->bit_index = ((i+1) << 3) + 3;
-			return TRUE;
-		}
-	}
-	self->buffer->bit_index = (i + 1) << 3;
-	return FALSE;
-}
-
-int plm_audio_decode_header(plm_audio_t *self) {
-	if (!plm_buffer_has(self->buffer, 48)) {
-		return 0;
-	}
-
-	plm_buffer_skip_bytes(self->buffer, 0x00);
-	int sync = plm_buffer_read(self->buffer, 11);
-
-
-	// Attempt to resync if no syncword was found. This sucks balls. The MP2 
-	// stream contains a syncword just before every frame (11 bits set to 1).
-	// However, this syncword is not guaranteed to not occur elsewhere in the
-	// stream. So, if we have to resync, we also have to check if the header 
-	// (samplerate, bitrate) differs from the one we had before. This all
-	// may still lead to garbage data being decoded :/
-
-	if (sync != PLM_AUDIO_FRAME_SYNC && !plm_audio_find_frame_sync(self)) {
-		return 0;
-	}
-
-	self->version = plm_buffer_read(self->buffer, 2);
-	self->layer = plm_buffer_read(self->buffer, 2);
-	int hasCRC = !plm_buffer_read(self->buffer, 1);
-
-	if (
-		self->version != PLM_AUDIO_MPEG_1 ||
-		self->layer != PLM_AUDIO_LAYER_II
-	) {
-		return 0;
-	}
-
-	int bitrate_index = plm_buffer_read(self->buffer, 4) - 1;
-	if (bitrate_index > 13) {
-		return 0;
-	}
-
-	int samplerate_index = plm_buffer_read(self->buffer, 2);
-	if (samplerate_index == 3) {
-		return 0;
-	}
-
-	int padding = plm_buffer_read(self->buffer, 1);
-	plm_buffer_skip(self->buffer, 1); // f_private
-	int mode = plm_buffer_read(self->buffer, 2);
-
-	// If we already have a header, make sure the samplerate, bitrate and mode
-	// are still the same, otherwise we might have missed sync.
-	if (
-		self->has_header && (
-			self->bitrate_index != bitrate_index ||
-			self->samplerate_index != samplerate_index ||
-			self->mode != mode
-		)
-	) {
-		return 0;
-	}
-
-	self->bitrate_index = bitrate_index;
-	self->samplerate_index = samplerate_index;
-	self->mode = mode;
-	self->has_header = TRUE;
-
-	// Parse the mode_extension, set up the stereo bound
-	if (mode == PLM_AUDIO_MODE_JOINT_STEREO) {
-		self->bound = (plm_buffer_read(self->buffer, 2) + 1) << 2;
-	}
-	else {
-		plm_buffer_skip(self->buffer, 2);
-		self->bound = (mode == PLM_AUDIO_MODE_MONO) ? 0 : 32;
-	}
-
-	// Discard the last 4 bits of the header and the CRC value, if present
-	plm_buffer_skip(self->buffer, 4); // copyright(1), original(1), emphasis(2)
-	if (hasCRC) {
-		plm_buffer_skip(self->buffer, 16);
-	}
-
-	// Compute frame size, check if we have enough data to decode the whole
-	// frame.
-	int bitrate = PLM_AUDIO_BIT_RATE[self->bitrate_index];
-	int samplerate = PLM_AUDIO_SAMPLE_RATE[self->samplerate_index];
-	int frame_size = (144000 * bitrate / samplerate) + padding;
-	return frame_size - (hasCRC ? 6 : 4);
-}
-
-void plm_audio_decode_frame(plm_audio_t *self) {
-	// Prepare the quantizer table lookups
-	int tab3 = 0;
-	int sblimit = 0;
-	
-	int tab1 = (self->mode == PLM_AUDIO_MODE_MONO) ? 0 : 1;
-	int tab2 = PLM_AUDIO_QUANT_LUT_STEP_1[tab1][self->bitrate_index];
-	tab3 = QUANT_LUT_STEP_2[tab2][self->samplerate_index];
-	sblimit = tab3 & 63;
-	tab3 >>= 6;
-
-	if (self->bound > sblimit) {
-		self->bound = sblimit;
-	}
-
-	// Read the allocation information
-	for (int sb = 0; sb < self->bound; sb++) {
-		self->allocation[0][sb] = plm_audio_read_allocation(self, sb, tab3);
-		self->allocation[1][sb] = plm_audio_read_allocation(self, sb, tab3);
-	}
-
-	for (int sb = self->bound; sb < sblimit; sb++) {
-		self->allocation[0][sb] =
-			self->allocation[1][sb] =
-			plm_audio_read_allocation(self, sb, tab3);
-	}
-
-	// Read scale factor selector information
-	int channels = (self->mode == PLM_AUDIO_MODE_MONO) ? 1 : 2;
-	for (int sb = 0; sb < sblimit; sb++) {
-		for (int ch = 0; ch < channels; ch++) {
-			if (self->allocation[ch][sb]) {
-				self->scale_factor_info[ch][sb] = plm_buffer_read(self->buffer, 2);
-			}
-		}
-		if (self->mode == PLM_AUDIO_MODE_MONO) {
-			self->scale_factor_info[1][sb] = self->scale_factor_info[0][sb];
-		}
-	}
-
-	// Read scale factors
-	for (int sb = 0; sb < sblimit; sb++) {
-		for (int ch = 0; ch < channels; ch++) {
-			if (self->allocation[ch][sb]) {
-				int *sf = self->scale_factor[ch][sb];
-				switch (self->scale_factor_info[ch][sb]) {
-					case 0:
-						sf[0] = plm_buffer_read(self->buffer, 6);
-						sf[1] = plm_buffer_read(self->buffer, 6);
-						sf[2] = plm_buffer_read(self->buffer, 6);
-						break;
-					case 1:
-						sf[0] = 
-						sf[1] = plm_buffer_read(self->buffer, 6);
-						sf[2] = plm_buffer_read(self->buffer, 6);
-						break;
-					case 2:
-						sf[0] = 
-						sf[1] = 
-						sf[2] = plm_buffer_read(self->buffer, 6);
-						break;
-					case 3:
-						sf[0] = plm_buffer_read(self->buffer, 6);
-						sf[1] = 
-						sf[2] = plm_buffer_read(self->buffer, 6);
-						break;
-				}
-			}
-		}
-		if (self->mode == PLM_AUDIO_MODE_MONO) {
-			self->scale_factor[1][sb][0] = self->scale_factor[0][sb][0];
-			self->scale_factor[1][sb][1] = self->scale_factor[0][sb][1];
-			self->scale_factor[1][sb][2] = self->scale_factor[0][sb][2];
-		}
-	}
-
-	// Coefficient input and reconstruction
-	int out_pos = 0;
-	for (int part = 0; part < 3; part++) {
-		for (int granule = 0; granule < 4; granule++) {
-
-			// Read the samples
-			for (int sb = 0; sb < self->bound; sb++) {
-				plm_audio_read_samples(self, 0, sb, part);
-				plm_audio_read_samples(self, 1, sb, part);
-			}
-			for (int sb = self->bound; sb < sblimit; sb++) {
-				plm_audio_read_samples(self, 0, sb, part);
-				self->sample[1][sb][0] = self->sample[0][sb][0];
-				self->sample[1][sb][1] = self->sample[0][sb][1];
-				self->sample[1][sb][2] = self->sample[0][sb][2];
-			}
-			for (int sb = sblimit; sb < 32; sb++) {
-				self->sample[0][sb][0] = 0;
-				self->sample[0][sb][1] = 0;
-				self->sample[0][sb][2] = 0;
-				self->sample[1][sb][0] = 0;
-				self->sample[1][sb][1] = 0;
-				self->sample[1][sb][2] = 0;
-			}
-
-			// Synthesis loop
-			for (int p = 0; p < 3; p++) {
-				// Shifting step
-				self->v_pos = (self->v_pos - 64) & 1023;
-
-				for (int ch = 0; ch < 2; ch++) {
-					plm_audio_idct36(self->sample[ch], p, self->V[ch], self->v_pos);
-
-					// Build U, windowing, calculate output
-					memset(self->U, 0, sizeof(self->U));
-
-					int d_index = 512 - (self->v_pos >> 1);
-					int v_index = (self->v_pos % 128) >> 1;
-					while (v_index < 1024) {
-						for (int i = 0; i < 32; ++i) {
-							self->U[i] += self->D[d_index++] * self->V[ch][v_index++];
-						}
-
-						v_index += 128 - 32;
-						d_index += 64 - 32;
-					}
-
-					d_index -= (512 - 32);
-					v_index = (128 - 32 + 1024) - v_index;
-					while (v_index < 1024) {
-						for (int i = 0; i < 32; ++i) {
-							self->U[i] += self->D[d_index++] * self->V[ch][v_index++];
-						}
-
-						v_index += 128 - 32;
-						d_index += 64 - 32;
-					}
-
-					// Output samples
-					#ifdef PLM_AUDIO_SEPARATE_CHANNELS
-						float *out_channel = ch == 0
-							? self->samples.left
-							: self->samples.right;
-						for (int j = 0; j < 32; j++) {
-							out_channel[out_pos + j] = self->U[j] / 2147418112.0f;
-						}
-					#else
-						for (int j = 0; j < 32; j++) {
-							self->samples.interleaved[((out_pos + j) << 1) + ch] = 
-								self->U[j] / 2147418112.0f;
-						}
-					#endif
-				} // End of synthesis channel loop
-				out_pos += 32;
-			} // End of synthesis sub-block loop
-
-		} // Decoding of the granule finished
-	}
-
-	plm_buffer_align(self->buffer);
-}
-
-const plm_quantizer_spec_t *plm_audio_read_allocation(plm_audio_t *self, int sb, int tab3) {
-	int tab4 = PLM_AUDIO_QUANT_LUT_STEP_3[tab3][sb];
-	int qtab = PLM_AUDIO_QUANT_LUT_STEP_4[tab4 & 15][plm_buffer_read(self->buffer, tab4 >> 4)];
-	return qtab ? (&PLM_AUDIO_QUANT_TAB[qtab - 1]) : 0;
-}
-
-void plm_audio_read_samples(plm_audio_t *self, int ch, int sb, int part) {
-	const plm_quantizer_spec_t *q = self->allocation[ch][sb];
-	int sf = self->scale_factor[ch][sb][part];
-	int *sample = self->sample[ch][sb];
-	int val = 0;
-
-	if (!q) {
-		// No bits allocated for this subband
-		sample[0] = sample[1] = sample[2] = 0;
-		return;
-	}
-
-	// Resolve scalefactor
-	if (sf == 63) {
-		sf = 0;
-	}
-	else {
-		int shift = (sf / 3) | 0;
-		sf = (PLM_AUDIO_SCALEFACTOR_BASE[sf % 3] + ((1 << shift) >> 1)) >> shift;
-	}
-
-	// Decode samples
-	int adj = q->levels;
-	if (q->group) {
-		// Decode grouped samples
-		val = plm_buffer_read(self->buffer, q->bits);
-		sample[0] = val % adj;
-		val /= adj;
-		sample[1] = val % adj;
-		sample[2] = val / adj;
-	}
-	else {
-		// Decode direct samples
-		sample[0] = plm_buffer_read(self->buffer, q->bits);
-		sample[1] = plm_buffer_read(self->buffer, q->bits);
-		sample[2] = plm_buffer_read(self->buffer, q->bits);
-	}
-
-	// Postmultiply samples
-	int scale = 65536 / (adj + 1);
-	adj = ((adj + 1) >> 1) - 1;
-
-	val = (adj - sample[0]) * scale;
-	sample[0] = (val * (sf >> 12) + ((val * (sf & 4095) + 2048) >> 12)) >> 12;
-
-	val = (adj - sample[1]) * scale;
-	sample[1] = (val * (sf >> 12) + ((val * (sf & 4095) + 2048) >> 12)) >> 12;
-
-	val = (adj - sample[2]) * scale;
-	sample[2] = (val * (sf >> 12) + ((val * (sf & 4095) + 2048) >> 12)) >> 12;
-}
-
-void plm_audio_idct36(int s[32][3], int ss, float *d, int dp) {
-	float t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12,
-		t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23, t24,
-		t25, t26, t27, t28, t29, t30, t31, t32, t33;
-
-	t01 = (float)(s[0][ss] + s[31][ss]); t02 = (float)(s[0][ss] - s[31][ss]) * 0.500602998235f;
-	t03 = (float)(s[1][ss] + s[30][ss]); t04 = (float)(s[1][ss] - s[30][ss]) * 0.505470959898f;
-	t05 = (float)(s[2][ss] + s[29][ss]); t06 = (float)(s[2][ss] - s[29][ss]) * 0.515447309923f;
-	t07 = (float)(s[3][ss] + s[28][ss]); t08 = (float)(s[3][ss] - s[28][ss]) * 0.53104259109f;
-	t09 = (float)(s[4][ss] + s[27][ss]); t10 = (float)(s[4][ss] - s[27][ss]) * 0.553103896034f;
-	t11 = (float)(s[5][ss] + s[26][ss]); t12 = (float)(s[5][ss] - s[26][ss]) * 0.582934968206f;
-	t13 = (float)(s[6][ss] + s[25][ss]); t14 = (float)(s[6][ss] - s[25][ss]) * 0.622504123036f;
-	t15 = (float)(s[7][ss] + s[24][ss]); t16 = (float)(s[7][ss] - s[24][ss]) * 0.674808341455f;
-	t17 = (float)(s[8][ss] + s[23][ss]); t18 = (float)(s[8][ss] - s[23][ss]) * 0.744536271002f;
-	t19 = (float)(s[9][ss] + s[22][ss]); t20 = (float)(s[9][ss] - s[22][ss]) * 0.839349645416f;
-	t21 = (float)(s[10][ss] + s[21][ss]); t22 = (float)(s[10][ss] - s[21][ss]) * 0.972568237862f;
-	t23 = (float)(s[11][ss] + s[20][ss]); t24 = (float)(s[11][ss] - s[20][ss]) * 1.16943993343f;
-	t25 = (float)(s[12][ss] + s[19][ss]); t26 = (float)(s[12][ss] - s[19][ss]) * 1.48416461631f;
-	t27 = (float)(s[13][ss] + s[18][ss]); t28 = (float)(s[13][ss] - s[18][ss]) * 2.05778100995f;
-	t29 = (float)(s[14][ss] + s[17][ss]); t30 = (float)(s[14][ss] - s[17][ss]) * 3.40760841847f;
-	t31 = (float)(s[15][ss] + s[16][ss]); t32 = (float)(s[15][ss] - s[16][ss]) * 10.1900081235f;
-
-	t33 = t01 + t31; t31 = (t01 - t31) * 0.502419286188f;
-	t01 = t03 + t29; t29 = (t03 - t29) * 0.52249861494f;
-	t03 = t05 + t27; t27 = (t05 - t27) * 0.566944034816f;
-	t05 = t07 + t25; t25 = (t07 - t25) * 0.64682178336f;
-	t07 = t09 + t23; t23 = (t09 - t23) * 0.788154623451f;
-	t09 = t11 + t21; t21 = (t11 - t21) * 1.06067768599f;
-	t11 = t13 + t19; t19 = (t13 - t19) * 1.72244709824f;
-	t13 = t15 + t17; t17 = (t15 - t17) * 5.10114861869f;
-	t15 = t33 + t13; t13 = (t33 - t13) * 0.509795579104f;
-	t33 = t01 + t11; t01 = (t01 - t11) * 0.601344886935f;
-	t11 = t03 + t09; t09 = (t03 - t09) * 0.899976223136f;
-	t03 = t05 + t07; t07 = (t05 - t07) * 2.56291544774f;
-	t05 = t15 + t03; t15 = (t15 - t03) * 0.541196100146f;
-	t03 = t33 + t11; t11 = (t33 - t11) * 1.30656296488f;
-	t33 = t05 + t03; t05 = (t05 - t03) * 0.707106781187f;
-	t03 = t15 + t11; t15 = (t15 - t11) * 0.707106781187f;
-	t03 += t15;
-	t11 = t13 + t07; t13 = (t13 - t07) * 0.541196100146f;
-	t07 = t01 + t09; t09 = (t01 - t09) * 1.30656296488f;
-	t01 = t11 + t07; t07 = (t11 - t07) * 0.707106781187f;
-	t11 = t13 + t09; t13 = (t13 - t09) * 0.707106781187f;
-	t11 += t13; t01 += t11;
-	t11 += t07; t07 += t13;
-	t09 = t31 + t17; t31 = (t31 - t17) * 0.509795579104f;
-	t17 = t29 + t19; t29 = (t29 - t19) * 0.601344886935f;
-	t19 = t27 + t21; t21 = (t27 - t21) * 0.899976223136f;
-	t27 = t25 + t23; t23 = (t25 - t23) * 2.56291544774f;
-	t25 = t09 + t27; t09 = (t09 - t27) * 0.541196100146f;
-	t27 = t17 + t19; t19 = (t17 - t19) * 1.30656296488f;
-	t17 = t25 + t27; t27 = (t25 - t27) * 0.707106781187f;
-	t25 = t09 + t19; t19 = (t09 - t19) * 0.707106781187f;
-	t25 += t19;
-	t09 = t31 + t23; t31 = (t31 - t23) * 0.541196100146f;
-	t23 = t29 + t21; t21 = (t29 - t21) * 1.30656296488f;
-	t29 = t09 + t23; t23 = (t09 - t23) * 0.707106781187f;
-	t09 = t31 + t21; t31 = (t31 - t21) * 0.707106781187f;
-	t09 += t31;	t29 += t09;	t09 += t23;	t23 += t31;
-	t17 += t29;	t29 += t25;	t25 += t09;	t09 += t27;
-	t27 += t23;	t23 += t19; t19 += t31;
-	t21 = t02 + t32; t02 = (t02 - t32) * 0.502419286188f;
-	t32 = t04 + t30; t04 = (t04 - t30) * 0.52249861494f;
-	t30 = t06 + t28; t28 = (t06 - t28) * 0.566944034816f;
-	t06 = t08 + t26; t08 = (t08 - t26) * 0.64682178336f;
-	t26 = t10 + t24; t10 = (t10 - t24) * 0.788154623451f;
-	t24 = t12 + t22; t22 = (t12 - t22) * 1.06067768599f;
-	t12 = t14 + t20; t20 = (t14 - t20) * 1.72244709824f;
-	t14 = t16 + t18; t16 = (t16 - t18) * 5.10114861869f;
-	t18 = t21 + t14; t14 = (t21 - t14) * 0.509795579104f;
-	t21 = t32 + t12; t32 = (t32 - t12) * 0.601344886935f;
-	t12 = t30 + t24; t24 = (t30 - t24) * 0.899976223136f;
-	t30 = t06 + t26; t26 = (t06 - t26) * 2.56291544774f;
-	t06 = t18 + t30; t18 = (t18 - t30) * 0.541196100146f;
-	t30 = t21 + t12; t12 = (t21 - t12) * 1.30656296488f;
-	t21 = t06 + t30; t30 = (t06 - t30) * 0.707106781187f;
-	t06 = t18 + t12; t12 = (t18 - t12) * 0.707106781187f;
-	t06 += t12;
-	t18 = t14 + t26; t26 = (t14 - t26) * 0.541196100146f;
-	t14 = t32 + t24; t24 = (t32 - t24) * 1.30656296488f;
-	t32 = t18 + t14; t14 = (t18 - t14) * 0.707106781187f;
-	t18 = t26 + t24; t24 = (t26 - t24) * 0.707106781187f;
-	t18 += t24; t32 += t18;
-	t18 += t14; t26 = t14 + t24;
-	t14 = t02 + t16; t02 = (t02 - t16) * 0.509795579104f;
-	t16 = t04 + t20; t04 = (t04 - t20) * 0.601344886935f;
-	t20 = t28 + t22; t22 = (t28 - t22) * 0.899976223136f;
-	t28 = t08 + t10; t10 = (t08 - t10) * 2.56291544774f;
-	t08 = t14 + t28; t14 = (t14 - t28) * 0.541196100146f;
-	t28 = t16 + t20; t20 = (t16 - t20) * 1.30656296488f;
-	t16 = t08 + t28; t28 = (t08 - t28) * 0.707106781187f;
-	t08 = t14 + t20; t20 = (t14 - t20) * 0.707106781187f;
-	t08 += t20;
-	t14 = t02 + t10; t02 = (t02 - t10) * 0.541196100146f;
-	t10 = t04 + t22; t22 = (t04 - t22) * 1.30656296488f;
-	t04 = t14 + t10; t10 = (t14 - t10) * 0.707106781187f;
-	t14 = t02 + t22; t02 = (t02 - t22) * 0.707106781187f;
-	t14 += t02;	t04 += t14;	t14 += t10;	t10 += t02;
-	t16 += t04;	t04 += t08;	t08 += t14;	t14 += t28;
-	t28 += t10;	t10 += t20;	t20 += t02;	t21 += t16;
-	t16 += t32;	t32 += t04;	t04 += t06;	t06 += t08;
-	t08 += t18;	t18 += t14;	t14 += t30;	t30 += t28;
-	t28 += t26;	t26 += t10;	t10 += t12;	t12 += t20;
-	t20 += t24;	t24 += t02;
-
-	d[dp + 48] = -t33;
-	d[dp + 49] = d[dp + 47] = -t21;
-	d[dp + 50] = d[dp + 46] = -t17;
-	d[dp + 51] = d[dp + 45] = -t16;
-	d[dp + 52] = d[dp + 44] = -t01;
-	d[dp + 53] = d[dp + 43] = -t32;
-	d[dp + 54] = d[dp + 42] = -t29;
-	d[dp + 55] = d[dp + 41] = -t04;
-	d[dp + 56] = d[dp + 40] = -t03;
-	d[dp + 57] = d[dp + 39] = -t06;
-	d[dp + 58] = d[dp + 38] = -t25;
-	d[dp + 59] = d[dp + 37] = -t08;
-	d[dp + 60] = d[dp + 36] = -t11;
-	d[dp + 61] = d[dp + 35] = -t18;
-	d[dp + 62] = d[dp + 34] = -t09;
-	d[dp + 63] = d[dp + 33] = -t14;
-	d[dp + 32] = -t05;
-	d[dp + 0] = t05; d[dp + 31] = -t30;
-	d[dp + 1] = t30; d[dp + 30] = -t27;
-	d[dp + 2] = t27; d[dp + 29] = -t28;
-	d[dp + 3] = t28; d[dp + 28] = -t07;
-	d[dp + 4] = t07; d[dp + 27] = -t26;
-	d[dp + 5] = t26; d[dp + 26] = -t23;
-	d[dp + 6] = t23; d[dp + 25] = -t10;
-	d[dp + 7] = t10; d[dp + 24] = -t15;
-	d[dp + 8] = t15; d[dp + 23] = -t12;
-	d[dp + 9] = t12; d[dp + 22] = -t19;
-	d[dp + 10] = t19; d[dp + 21] = -t20;
-	d[dp + 11] = t20; d[dp + 20] = -t13;
-	d[dp + 12] = t13; d[dp + 19] = -t24;
-	d[dp + 13] = t24; d[dp + 18] = -t31;
-	d[dp + 14] = t31; d[dp + 17] = -t02;
-	d[dp + 15] = t02; d[dp + 16] = 0.0;
-}
-
-
-#endif // PL_MPEG_IMPLEMENTATION
diff --git a/dsp/mpeg/plm.c b/dsp/mpeg/plm.c
new file mode 100644
index 000000000..7704643ff
--- /dev/null
+++ b/dsp/mpeg/plm.c
@@ -0,0 +1,332 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set noet ft=c ts=4 sw=4 fenc=utf-8                                   :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/mpeg/mpeg.h"
+#include "libc/log/log.h"
+#include "libc/mem/mem.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+__static_yoink("pl_mpeg_notice");
+
+/* clang-format off */
+// -----------------------------------------------------------------------------
+// plm (high-level interface) implementation
+
+typedef struct plm_t {
+	plm_demux_t *demux;
+	double time;
+	int has_ended;
+	int loop;
+
+	int video_packet_type;
+	plm_buffer_t *video_buffer;
+	plm_video_t *video_decoder;
+
+	int audio_packet_type;
+	double audio_lead_time;
+	plm_buffer_t *audio_buffer;
+	plm_audio_t *audio_decoder;
+
+	plm_video_decode_callback video_decode_callback;
+	void *video_decode_callback_user_data;
+
+	plm_audio_decode_callback audio_decode_callback;
+	void *audio_decode_callback_user_data;
+} plm_t;
+
+void plm_handle_end(plm_t *self);
+void plm_read_video_packet(plm_buffer_t *buffer, void *user);
+void plm_read_audio_packet(plm_buffer_t *buffer, void *user);
+void plm_read_packets(plm_t *self, int requested_type);
+
+plm_t *plm_create_with_filename(const char *filename) {
+	plm_buffer_t *buffer = plm_buffer_create_with_filename(filename);
+	if (!buffer) {
+		return NULL;
+	}
+	return plm_create_with_buffer(buffer, true);
+}
+
+plm_t *plm_create_with_file(FILE *fh, int close_when_done) {
+	plm_buffer_t *buffer = plm_buffer_create_with_file(fh, close_when_done);
+	return plm_create_with_buffer(buffer, true);
+}
+
+plm_t *plm_create_with_memory(uint8_t *bytes, size_t length, int free_when_done) {
+	plm_buffer_t *buffer = plm_buffer_create_with_memory(bytes, length, free_when_done);
+	return plm_create_with_buffer(buffer, true);
+}
+
+plm_t *plm_create_with_buffer(plm_buffer_t *buffer, int destroy_when_done) {
+	plm_t *self = (plm_t *)malloc(sizeof(plm_t));
+	memset(self, 0, sizeof(plm_t));
+
+	self->demux = plm_demux_create(buffer, destroy_when_done);
+
+	// In theory we should check plm_demux_get_num_video_streams() and
+	// plm_demux_get_num_audio_streams() here, but older files typically
+	// do not specify these correctly. So we just assume we have a video and
+	// audio stream and create the decoders.
+
+	self->video_packet_type = PLM_DEMUX_PACKET_VIDEO_1;
+	self->video_buffer = plm_buffer_create_with_capacity(PLM_BUFFER_DEFAULT_SIZE);
+	plm_buffer_set_load_callback(self->video_buffer, plm_read_video_packet, self);
+
+	self->audio_packet_type = PLM_DEMUX_PACKET_AUDIO_1;
+	self->audio_buffer = plm_buffer_create_with_capacity(PLM_BUFFER_DEFAULT_SIZE);
+	plm_buffer_set_load_callback(self->audio_buffer, plm_read_audio_packet, self);
+
+	self->video_decoder = plm_video_create_with_buffer(self->video_buffer, true);
+	self->audio_decoder = plm_audio_create_with_buffer(self->audio_buffer, true);
+
+	return self;
+}
+
+void plm_destroy(plm_t *self) {
+	plm_video_destroy(self->video_decoder);
+	plm_audio_destroy(self->audio_decoder);
+	plm_demux_destroy(self->demux);
+	free(self);
+}
+
+int plm_get_audio_enabled(plm_t *self) {
+	return (self->audio_packet_type != 0);
+}
+
+void plm_set_audio_enabled(plm_t *self, int enabled, int stream_index) {
+	/* int num_streams = plm_demux_get_num_audio_streams(self->demux); */
+	self->audio_packet_type = (enabled && stream_index >= 0 && stream_index < 4)
+		? PLM_DEMUX_PACKET_AUDIO_1 + stream_index
+		: 0;
+}
+
+int plm_get_video_enabled(plm_t *self) {
+	return (self->video_packet_type != 0);
+}
+
+void plm_set_video_enabled(plm_t *self, int enabled) {
+	self->video_packet_type = (enabled)
+		? PLM_DEMUX_PACKET_VIDEO_1
+		: 0;
+}
+
+int plm_get_width(plm_t *self) {
+	return plm_video_get_width(self->video_decoder);
+}
+
+double plm_get_pixel_aspect_ratio(plm_t *self) {
+	return plm_video_get_pixel_aspect_ratio(self->video_decoder);
+}
+
+int plm_get_height(plm_t *self) {
+	return plm_video_get_height(self->video_decoder);
+}
+
+double plm_get_framerate(plm_t *self) {
+	return plm_video_get_framerate(self->video_decoder);
+}
+
+int plm_get_num_audio_streams(plm_t *self) {
+	// Some files do not specify the number of audio streams in the system header.
+	// If the reported number of streams is 0, we check if we have a samplerate,
+	// indicating at least one audio stream.
+	int num_streams = plm_demux_get_num_audio_streams(self->demux);
+	return num_streams == 0 && plm_get_samplerate(self) ? 1 : num_streams;
+}
+
+int plm_get_samplerate(plm_t *self) {
+	return plm_audio_get_samplerate(self->audio_decoder);
+}
+
+double plm_get_audio_lead_time(plm_t *self) {
+	return self->audio_lead_time;
+}
+
+void plm_set_audio_lead_time(plm_t *self, double lead_time) {
+	self->audio_lead_time = lead_time;
+}
+
+double plm_get_time(plm_t *self) {
+	return self->time;
+}
+
+void plm_rewind(plm_t *self) {
+	plm_video_rewind(self->video_decoder);
+	plm_audio_rewind(self->audio_decoder);
+	plm_demux_rewind(self->demux);
+	self->time = 0;
+}
+
+int plm_get_loop(plm_t *self) {
+	return self->loop;
+}
+
+void plm_set_loop(plm_t *self, int loop) {
+	self->loop = loop;
+}
+
+int plm_has_ended(plm_t *self) {
+	return self->has_ended;
+}
+
+void plm_set_video_decode_callback(plm_t *self, plm_video_decode_callback fp, void *user) {
+	self->video_decode_callback = fp;
+	self->video_decode_callback_user_data = user;
+}
+
+void plm_set_audio_decode_callback(plm_t *self, plm_audio_decode_callback fp, void *user) {
+	self->audio_decode_callback = fp;
+	self->audio_decode_callback_user_data = user;
+}
+
+int plm_decode(plm_t *self, double tick) {
+	DEBUGF("%s", "plm_decode");
+
+	int decode_video = (self->video_decode_callback && self->video_packet_type);
+	int decode_audio = (self->audio_decode_callback && self->audio_packet_type);
+
+	if (!decode_video && !decode_audio) {
+		// Nothing to do here
+		return false;
+	}
+
+	int did_decode = false;
+	int video_ended = false;
+	int audio_ended = false;
+
+	double video_target_time = self->time + tick;
+	double audio_target_time = self->time + tick;
+
+	if (self->audio_lead_time > 0 && decode_audio) {
+		video_target_time -= self->audio_lead_time;
+	}
+	else {
+		audio_target_time -= self->audio_lead_time;
+	}
+
+	do {
+		did_decode = false;
+
+		if (decode_video &&	plm_video_get_time(self->video_decoder) < video_target_time) {
+			plm_frame_t *frame = plm_video_decode(self->video_decoder);
+			if (frame) {
+				self->video_decode_callback(self, frame, self->video_decode_callback_user_data);
+				did_decode = true;
+			}
+			else {
+				video_ended = true;
+			}
+		}
+
+		if (decode_audio && plm_audio_get_time(self->audio_decoder) < audio_target_time) {
+			plm_samples_t *samples = plm_audio_decode(self->audio_decoder);
+			if (samples) {
+				self->audio_decode_callback(self, samples, self->audio_decode_callback_user_data);
+				did_decode = true;
+			}
+			else {
+				audio_ended = true;
+			}
+		}
+	} while (did_decode);
+
+	// We wanted to decode something but failed -> the source must have ended
+	if ((!decode_video || video_ended) && (!decode_audio || audio_ended)) {
+		plm_handle_end(self);
+	}
+	else {
+		self->time += tick;
+	}
+
+	return did_decode ? true : false;
+}
+
+plm_frame_t *plm_decode_video(plm_t *self) {
+	if (!self->video_packet_type) {
+		return NULL;
+	}
+
+	plm_frame_t *frame = plm_video_decode(self->video_decoder);
+	if (frame) {
+		self->time = frame->time;
+	}
+	else {
+		plm_handle_end(self);
+	}
+	return frame;
+}
+
+plm_samples_t *plm_decode_audio(plm_t *self) {
+	if (!self->audio_packet_type) {
+		return NULL;
+	}
+
+	plm_samples_t *samples = plm_audio_decode(self->audio_decoder);
+	if (samples) {
+		self->time = samples->time;
+	}
+	else {
+		plm_handle_end(self);
+	}
+	return samples;
+}
+
+void plm_handle_end(plm_t *self) {
+	if (self->loop) {
+		plm_rewind(self);
+	}
+	else {
+		self->has_ended = true;
+	}
+}
+
+void plm_read_video_packet(plm_buffer_t *buffer, void *user) {
+	plm_t *self = (plm_t *)user;
+	plm_read_packets(self, self->video_packet_type);
+}
+
+void plm_read_audio_packet(plm_buffer_t *buffer, void *user) {
+	plm_t *self = (plm_t *)user;
+	plm_read_packets(self, self->audio_packet_type);
+}
+
+void plm_read_packets(plm_t *self, int requested_type) {
+	plm_packet_t *packet;
+	while ((packet = plm_demux_decode(self->demux))) {
+		if (packet->type == self->video_packet_type) {
+			plm_buffer_write(self->video_buffer, packet->data, packet->length);
+		}
+		else if (packet->type == self->audio_packet_type) {
+			plm_buffer_write(self->audio_buffer, packet->data, packet->length);
+		}
+		if (packet->type == requested_type) {
+			return;
+		}
+	}
+}
diff --git a/dsp/mpeg/slowrgb.c b/dsp/mpeg/slowrgb.c
new file mode 100644
index 000000000..7472d82f3
--- /dev/null
+++ b/dsp/mpeg/slowrgb.c
@@ -0,0 +1,83 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:4;tab-width:4;coding:utf-8   -*-│
+│ vi: set et ft=c ts=4 sw=4 fenc=utf-8                                     :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│  PL_MPEG - MPEG1 Video decoder, MP2 Audio decoder, MPEG-PS demuxer           │
+│  Dominic Szablewski - https://phoboslab.org                                  │
+│                                                                              │
+│  The MIT License(MIT)                                                        │
+│  Copyright(c) 2019 Dominic Szablewski                                        │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files(the              │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and / or sell copies of the Software, and to        │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│    The above copyright notice and this permission notice shall be            │
+│    included in all copies or substantial portions of the Software.           │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                       │
+│  NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE       │
+│  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN             │
+│  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN           │
+│  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE            │
+│  SOFTWARE.                                                                   │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/mpeg/mpeg.h"
+#include "libc/macros.internal.h"
+__static_yoink("pl_mpeg_notice");
+
+/**
+ * @see YCbCr2RGB() in tool/viz/lib/ycbcr2rgb.c
+ */
+void plm_frame_to_rgb(plm_frame_t *frame, uint8_t *rgb) {
+  // Chroma values are the same for each block of 4 pixels, so we process
+  // 2 lines at a time, 2 neighboring pixels each.
+  int w = frame->y.width, w2 = w >> 1;
+  int y_index1 = 0, y_index2 = w, y_next_2_lines = w + (w - frame->width);
+  int c_index = 0, c_next_line = w2 - (frame->width >> 1);
+  int rgb_index1 = 0, rgb_index2 = frame->width * 3,
+      rgb_next_2_lines = frame->width * 3;
+  int cols = frame->width >> 1, rows = frame->height >> 1;
+  int ccb, ccr, r, g, b;
+  uint8_t *y = frame->y.data, *cb = frame->cb.data, *cr = frame->cr.data;
+  for (int row = 0; row < rows; row++) {
+    for (int col = 0; col < cols; col++) {
+      ccb = cb[c_index];
+      ccr = cr[c_index];
+      c_index++;
+      r = (ccr + ((ccr * 103) >> 8)) - 179;
+      g = ((ccb * 88) >> 8) - 44 + ((ccr * 183) >> 8) - 91;
+      b = (ccb + ((ccb * 198) >> 8)) - 227;
+      // Line 1
+      int y1 = y[y_index1++];
+      int y2 = y[y_index1++];
+      rgb[rgb_index1 + 0] = MAX(0, MIN(255, y1 + r));
+      rgb[rgb_index1 + 1] = MAX(0, MIN(255, y1 - g));
+      rgb[rgb_index1 + 2] = MAX(0, MIN(255, y1 + b));
+      rgb[rgb_index1 + 3] = MAX(0, MIN(255, y2 + r));
+      rgb[rgb_index1 + 4] = MAX(0, MIN(255, y2 - g));
+      rgb[rgb_index1 + 5] = MAX(0, MIN(255, y2 + b));
+      rgb_index1 += 6;
+      // Line 2
+      int y3 = y[y_index2++];
+      int y4 = y[y_index2++];
+      rgb[rgb_index2 + 0] = MAX(0, MIN(255, y3 + r));
+      rgb[rgb_index2 + 1] = MAX(0, MIN(255, y3 - g));
+      rgb[rgb_index2 + 2] = MAX(0, MIN(255, y3 + b));
+      rgb[rgb_index2 + 3] = MAX(0, MIN(255, y4 + r));
+      rgb[rgb_index2 + 4] = MAX(0, MIN(255, y4 - g));
+      rgb[rgb_index2 + 5] = MAX(0, MIN(255, y4 + b));
+      rgb_index2 += 6;
+    }
+    y_index1 += y_next_2_lines;
+    y_index2 += y_next_2_lines;
+    rgb_index1 += rgb_next_2_lines;
+    rgb_index2 += rgb_next_2_lines;
+    c_index += c_next_line;
+  }
+}
diff --git a/dsp/mpeg/video.h b/dsp/mpeg/video.h
new file mode 100644
index 000000000..d2ed05181
--- /dev/null
+++ b/dsp/mpeg/video.h
@@ -0,0 +1,60 @@
+#ifndef COSMOPOLITAN_DSP_MPEG_VIDEO_H_
+#define COSMOPOLITAN_DSP_MPEG_VIDEO_H_
+#include "dsp/mpeg/mpeg.h"
+COSMOPOLITAN_C_START_
+
+typedef struct {
+  int full_px;
+  int is_set;
+  int r_size;
+  int h;
+  int v;
+} plm_video_motion_t;
+
+typedef struct plm_video_t {
+  double framerate;
+  double time;
+  double pixel_aspect_ratio;
+  int frames_decoded;
+  int width;
+  int height;
+  int mb_width;
+  int mb_height;
+  int mb_size;
+  int luma_width;
+  int luma_height;
+  int chroma_width;
+  int chroma_height;
+  int start_code;
+  int picture_type;
+  plm_video_motion_t motion_forward;
+  plm_video_motion_t motion_backward;
+  int has_sequence_header;
+  int quantizer_scale;
+  int slice_begin;
+  int macroblock_address;
+  int mb_row;
+  int mb_col;
+  int macroblock_type;
+  int macroblock_intra;
+  int dc_predictor[3];
+  plm_buffer_t *buffer;
+  int destroy_buffer_when_done;
+  plm_frame_t frame_current;
+  plm_frame_t frame_forward;
+  plm_frame_t frame_backward;
+  uint8_t *frames_data;
+  int block_data[64];
+  uint8_t intra_quant_matrix[64];
+  uint8_t non_intra_quant_matrix[64];
+  int has_reference_frame;
+  int assume_no_b_frames;
+} plm_video_t;
+
+void plm_video_process_macroblock_8(plm_video_t *, uint8_t *, uint8_t *, int,
+                                    int, bool);
+void plm_video_process_macroblock_16(plm_video_t *, uint8_t *, uint8_t *, int,
+                                     int, bool);
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_DSP_MPEG_VIDEO_H_ */
diff --git a/dsp/prog/BUILD.mk b/dsp/prog/BUILD.mk
deleted file mode 100644
index adc0668ee..000000000
--- a/dsp/prog/BUILD.mk
+++ /dev/null
@@ -1,43 +0,0 @@
-#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
-#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
-
-PKGS += DSP_PROG
-
-DSP_PROG_FILES := $(wildcard dsp/prog/*)
-DSP_PROG_HDRS = $(filter %.h,$(DSP_PROG_FILES))
-DSP_PROG_SRCS = $(filter %.c,$(DSP_PROG_FILES))
-DSP_PROG_OBJS = $(DSP_PROG_SRCS:%.c=o/$(MODE)/%.o)
-DSP_PROG_COMS = $(DSP_PROG_SRCS:%.c=o/$(MODE)/%)
-DSP_PROG_BINS = $(DSP_PROG_COMS) $(DSP_PROG_COMS:%=%.dbg)
-
-DSP_PROG_DIRECTDEPS =				\
-	DSP_AUDIO				\
-	LIBC_CALLS				\
-	LIBC_INTRIN				\
-	LIBC_NEXGEN32E				\
-	LIBC_RUNTIME				\
-	LIBC_SOCK				\
-	LIBC_STDIO				\
-	LIBC_SYSV				\
-	LIBC_TINYMATH				\
-	THIRD_PARTY_MUSL			\
-
-DSP_PROG_DEPS :=				\
-	$(call uniq,$(foreach x,$(DSP_PROG_DIRECTDEPS),$($(x))))
-
-o/$(MODE)/dsp/prog/prog.pkg:			\
-		$(DSP_PROG_OBJS)		\
-		$(foreach x,$(DSP_PROG_DIRECTDEPS),$($(x)_A).pkg)
-
-o/$(MODE)/dsp/prog/%.dbg:			\
-		$(DSP_PROG_DEPS)		\
-		o/$(MODE)/dsp/prog/prog.pkg	\
-		o/$(MODE)/dsp/prog/%.o		\
-		$(CRT)				\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-$(DSP_PROG_OBJS): dsp/prog/BUILD.mk
-
-.PHONY: o/$(MODE)/dsp/prog
-o/$(MODE)/dsp/prog: $(DSP_PROG_BINS)
diff --git a/dsp/prog/loudness.h b/dsp/prog/loudness.h
deleted file mode 100644
index 75a1e9518..000000000
--- a/dsp/prog/loudness.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef COSMOPOLITAN_DSP_PROG_LOUDNESS_H_
-#define COSMOPOLITAN_DSP_PROG_LOUDNESS_H_
-#include <math.h>
-#include <stdio.h>
-
-#define MIN_DECIBEL -60
-#define MAX_DECIBEL 0
-
-// computes root of mean squares
-static double rms(float *p, int n) {
-  double s = 0;
-  for (int i = 0; i < n; ++i)
-    s += p[i] * p[i];
-  return sqrt(s / n);
-}
-
-// converts rms to decibel
-static double rms_to_db(double rms) {
-  double db = 20 * log10(rms);
-  db = fmin(db, MAX_DECIBEL);
-  db = fmax(db, MIN_DECIBEL);
-  return db;
-}
-
-// char meter[21];
-// format_decibel_meter(meter, 20, rms_to_db(rms(samps, count)))
-static char *format_decibel_meter(char *meter, int width, double db) {
-  double range = MAX_DECIBEL - MIN_DECIBEL;
-  int filled = (db - MIN_DECIBEL) / range * width;
-  for (int i = 0; i < width; ++i) {
-    if (i < filled) {
-      meter[i] = '=';
-    } else {
-      meter[i] = ' ';
-    }
-  }
-  meter[width] = 0;
-  return meter;
-}
-
-#endif /* COSMOPOLITAN_DSP_PROG_LOUDNESS_H_ */
diff --git a/dsp/prog/recvaudio.c b/dsp/prog/recvaudio.c
deleted file mode 100644
index 85ef98ed8..000000000
--- a/dsp/prog/recvaudio.c
+++ /dev/null
@@ -1,127 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <arpa/inet.h>
-#include <assert.h>
-#include <cosmoaudio.h>
-#include <errno.h>
-#include <math.h>
-#include <netinet/in.h>
-#include <signal.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <time.h>
-#include "loudness.h"
-
-/**
- * @fileoverview plays audio from remote computer on speaker
- * @see dsp/prog/sendaudio.c
- */
-
-#define SAMPLING_RATE     44100
-#define FRAMES_PER_SECOND 60
-#define DEBUG_LOG         0
-#define PORT              9834
-
-#define CHUNK_FRAMES (SAMPLING_RATE / FRAMES_PER_SECOND)
-
-static_assert(CHUNK_FRAMES * sizeof(short) < 1472,
-              "audio chunks won't fit in udp ethernet packet");
-
-sig_atomic_t g_done;
-
-void onsig(int sig) {
-  g_done = 1;
-}
-
-short toshort(float x) {
-  return fmaxf(-1, fminf(1, x)) * 32767;
-}
-
-float tofloat(short x) {
-  return x / 32768.f;
-}
-
-int main(int argc, char* argv[]) {
-
-  // listen on udp port for audio
-  int server;
-  if ((server = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
-    perror("socket");
-    return 3;
-  }
-  struct sockaddr_in addr = {.sin_family = AF_INET, .sin_port = htons(PORT)};
-  if (bind(server, (struct sockaddr*)&addr, sizeof(addr))) {
-    perror("bind");
-    return 4;
-  }
-
-  // setup signals
-  struct sigaction sa;
-  sa.sa_flags = 0;
-  sa.sa_handler = onsig;
-  sigemptyset(&sa.sa_mask);
-  sigaction(SIGINT, &sa, 0);
-
-  // configure cosmo audio
-  struct CosmoAudioOpenOptions cao = {0};
-  cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
-  cao.deviceType = kCosmoAudioDeviceTypePlayback;
-  cao.sampleRate = SAMPLING_RATE;
-  cao.bufferFrames = CHUNK_FRAMES * 2;
-  cao.debugLog = DEBUG_LOG;
-  cao.channels = 1;
-
-  // connect to microphone and speaker
-  int status;
-  struct CosmoAudio* ca;
-  status = cosmoaudio_open(&ca, &cao);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to open audio: %d\n", status);
-    return 5;
-  }
-
-  while (!g_done) {
-    // read from network
-    ssize_t got;
-    short buf16[CHUNK_FRAMES];
-    if ((got = read(server, buf16, CHUNK_FRAMES * sizeof(short))) == -1) {
-      if (errno == EINTR)
-        continue;
-      perror("read");
-      return 7;
-    }
-    if (got != CHUNK_FRAMES * sizeof(short)) {
-      fprintf(stderr, "warning: got partial audio frame\n");
-      continue;
-    }
-
-    // write to speaker
-    float buf32[CHUNK_FRAMES];
-    for (int i = 0; i < CHUNK_FRAMES; ++i)
-      buf32[i] = tofloat(buf16[i]);
-    cosmoaudio_poll(ca, 0, (int[]){CHUNK_FRAMES});
-    cosmoaudio_write(ca, buf32, CHUNK_FRAMES);
-
-    // print loudness in ascii
-    char meter[21];
-    double db = rms_to_db(rms(buf32, CHUNK_FRAMES));
-    format_decibel_meter(meter, 20, db);
-    printf("\r%s| %+6.2f dB", meter, db);
-    fflush(stdout);
-  }
-
-  // clean up resources
-  cosmoaudio_flush(ca);
-  cosmoaudio_close(ca);
-  close(server);
-}
diff --git a/dsp/prog/sendaudio.c b/dsp/prog/sendaudio.c
deleted file mode 100644
index 436bbfcdb..000000000
--- a/dsp/prog/sendaudio.c
+++ /dev/null
@@ -1,149 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <arpa/inet.h>
-#include <assert.h>
-#include <cosmoaudio.h>
-#include <errno.h>
-#include <math.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <signal.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <time.h>
-#include "loudness.h"
-
-/**
- * @fileoverview sends audio from microphone to remote computer
- * @see dsp/prog/recvaudio.c
- */
-
-#define SAMPLING_RATE     44100
-#define FRAMES_PER_SECOND 60
-#define DEBUG_LOG         0
-#define PORT              9834
-
-#define CHUNK_FRAMES (SAMPLING_RATE / FRAMES_PER_SECOND)
-
-static_assert(CHUNK_FRAMES * sizeof(short) < 1472,
-              "audio chunks won't fit in udp ethernet packet");
-
-sig_atomic_t g_done;
-
-void onsig(int sig) {
-  g_done = 1;
-}
-
-short toshort(float x) {
-  return fmaxf(-1, fminf(1, x)) * 32767;
-}
-
-float tofloat(short x) {
-  return x / 32768.f;
-}
-
-uint32_t host2ip(const char* host) {
-  uint32_t ip;
-  if ((ip = inet_addr(host)) != -1u)
-    return ip;
-  int rc;
-  struct addrinfo* ai = NULL;
-  struct addrinfo hint = {AI_NUMERICSERV, AF_INET, SOCK_STREAM, IPPROTO_TCP};
-  if ((rc = getaddrinfo(host, "0", &hint, &ai))) {
-    fprintf(stderr, "%s: %s\n", host, gai_strerror(rc));
-    exit(50 + rc);
-  }
-  ip = ntohl(((struct sockaddr_in*)ai->ai_addr)->sin_addr.s_addr);
-  freeaddrinfo(ai);
-  return ip;
-}
-
-int main(int argc, char* argv[]) {
-
-  if (argc != 2) {
-    fprintf(stderr, "%s: missing host argument\n", argv[0]);
-    return 1;
-  }
-
-  // get host argument
-  const char* remote_host = argv[1];
-  uint32_t ip = host2ip(remote_host);
-
-  // connect to server
-  int client;
-  if ((client = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
-    perror(remote_host);
-    return 3;
-  }
-  struct sockaddr_in addr = {.sin_family = AF_INET,
-                             .sin_port = htons(PORT),
-                             .sin_addr.s_addr = htonl(ip)};
-  if (connect(client, (struct sockaddr*)&addr, sizeof(addr))) {
-    perror(remote_host);
-    return 4;
-  }
-
-  // setup signals
-  struct sigaction sa;
-  sa.sa_flags = 0;
-  sa.sa_handler = onsig;
-  sigemptyset(&sa.sa_mask);
-  sigaction(SIGINT, &sa, 0);
-
-  // configure cosmo audio
-  struct CosmoAudioOpenOptions cao = {0};
-  cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
-  cao.deviceType = kCosmoAudioDeviceTypeCapture;
-  cao.sampleRate = SAMPLING_RATE;
-  cao.bufferFrames = CHUNK_FRAMES * 2;
-  cao.debugLog = DEBUG_LOG;
-  cao.channels = 1;
-
-  // connect to microphone and speaker
-  int status;
-  struct CosmoAudio* ca;
-  status = cosmoaudio_open(&ca, &cao);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to open audio: %d\n", status);
-    return 5;
-  }
-
-  while (!g_done) {
-    // read from microphone
-    float buf32[CHUNK_FRAMES];
-    cosmoaudio_poll(ca, (int[]){CHUNK_FRAMES}, 0);
-    cosmoaudio_read(ca, buf32, CHUNK_FRAMES);
-    short buf16[CHUNK_FRAMES];
-    for (int i = 0; i < CHUNK_FRAMES; ++i)
-      buf16[i] = toshort(buf32[i]);
-
-    // send to server
-    if (write(client, buf16, CHUNK_FRAMES * sizeof(short)) == -1) {
-      if (errno == EINTR && g_done)
-        break;
-      perror(remote_host);
-      return 7;
-    }
-
-    // print loudness in ascii
-    char meter[21];
-    double db = rms_to_db(rms(buf32, CHUNK_FRAMES));
-    format_decibel_meter(meter, 20, db);
-    printf("\r%s| %+6.2f dB", meter, db);
-    fflush(stdout);
-  }
-
-  // clean up resources
-  cosmoaudio_close(ca);
-  close(client);
-}
diff --git a/dsp/scale/BUILD.mk b/dsp/scale/BUILD.mk
index 80397cd97..bd4f6df7e 100644
--- a/dsp/scale/BUILD.mk
+++ b/dsp/scale/BUILD.mk
@@ -45,12 +45,6 @@ $(DSP_SCALE_A).pkg:				\
 		$(DSP_SCALE_A_OBJS)		\
 		$(foreach x,$(DSP_SCALE_A_DIRECTDEPS),$($(x)_A).pkg)
 
-ifeq ($(ARCH),x86_64)
-o/$(MODE)/dsp/scale/cdecimate2xuint8x8.o: private \
-		CFLAGS +=			\
-			-mssse3
-endif
-
 o/$(MODE)/dsp/scale/cdecimate2xuint8x8.o	\
 o/$(MODE)/dsp/scale/gyarados.o			\
 o/$(MODE)/dsp/scale/magikarp.o			\
diff --git a/dsp/scale/cdecimate2xuint8x8.c b/dsp/scale/cdecimate2xuint8x8.c
index 2a96284d9..4a7e8a084 100644
--- a/dsp/scale/cdecimate2xuint8x8.c
+++ b/dsp/scale/cdecimate2xuint8x8.c
@@ -16,20 +16,17 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/assert.h"
+#include "libc/intrin/packuswb.h"
+#include "libc/intrin/paddw.h"
+#include "libc/intrin/palignr.h"
+#include "libc/intrin/pmaddubsw.h"
+#include "libc/intrin/psraw.h"
+#include "libc/log/check.h"
+#include "libc/log/log.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
-#include "third_party/intel/immintrin.internal.h"
 
-/**
- * Performs 2D Motion Picture Convolution Acceleration by Leveraging SSSE3.
- *
- * @note H/T John Costella, Jean-Baptiste Joseph Fourier
- * @note RIP Huixiang Chen
- */
-void *cDecimate2xUint8x8(unsigned long n, unsigned char A[n],
-                         const signed char K[8]) {
-#ifdef __x86_64__
 #define TAPS       8
 #define RATIO      2
 #define OFFSET     3
@@ -40,107 +37,62 @@ void *cDecimate2xUint8x8(unsigned long n, unsigned char A[n],
 #define LOOKAHEAD  (SPREAD - LOOKBEHIND)
 #define SCALE      5
 #define ROUND      (1 << (SCALE - 1))
-  __m128i kRound = _mm_set1_epi16(ROUND);
-  __m128i kMadd1 = _mm_set_epi8(K[1], K[0], K[1], K[0], K[1], K[0], K[1], K[0],
-                                K[1], K[0], K[1], K[0], K[1], K[0], K[1], K[0]);
-  __m128i kMadd2 = _mm_set_epi8(K[3], K[2], K[3], K[2], K[3], K[2], K[3], K[2],
-                                K[3], K[2], K[3], K[2], K[3], K[2], K[3], K[2]);
-  __m128i kMadd3 = _mm_set_epi8(K[5], K[4], K[5], K[4], K[5], K[4], K[5], K[4],
-                                K[5], K[4], K[5], K[4], K[5], K[4], K[5], K[4]);
-  __m128i kMadd4 = _mm_set_epi8(K[7], K[6], K[7], K[6], K[7], K[6], K[7], K[6],
-                                K[7], K[6], K[7], K[6], K[7], K[6], K[7], K[6]);
-  __m128i bv0, bv1, bv2, bv3;
-  __m128i in1, in2, in3;
-  __m128i wv0, wv1, wv2, wv3;
+
+/**
+ * Performs 2D Motion Picture Convolution Acceleration by Leveraging SSSE3.
+ *
+ * @note H/T John Costella, Jean-Baptiste Joseph Fourier
+ * @note RIP Huixiang Chen
+ */
+void *cDecimate2xUint8x8(unsigned long n, unsigned char A[n],
+                         const signed char K[8]) {
+  short kRound[8] = {ROUND, ROUND, ROUND, ROUND, ROUND, ROUND, ROUND, ROUND};
+  signed char kMadd1[16] = {K[0], K[1], K[0], K[1], K[0], K[1], K[0], K[1],
+                            K[0], K[1], K[0], K[1], K[0], K[1], K[0], K[1]};
+  signed char kMadd2[16] = {K[2], K[3], K[2], K[3], K[2], K[3], K[2], K[3],
+                            K[2], K[3], K[2], K[3], K[2], K[3], K[2], K[3]};
+  signed char kMadd3[16] = {K[4], K[5], K[4], K[5], K[4], K[5], K[4], K[5],
+                            K[4], K[5], K[4], K[5], K[4], K[5], K[4], K[5]};
+  signed char kMadd4[16] = {K[6], K[7], K[6], K[7], K[6], K[7], K[6], K[7],
+                            K[6], K[7], K[6], K[7], K[6], K[7], K[6], K[7]};
+  unsigned char bv0[16], bv1[16], bv2[16], bv3[16];
+  unsigned char in1[16], in2[16], in3[16];
+  short wv0[8], wv1[8], wv2[8], wv3[8];
   unsigned long i, j, w;
   if (n >= STRIDE) {
     i = 0;
     w = (n + RATIO / 2) / RATIO;
-    in1 = _mm_set1_epi8(A[0]);
-    in2 = _mm_set1_epi8(A[n - 1]);
-    _mm_storeu_si128((__m128i *)&in2, _mm_loadu_si128((__m128i *)A));
+    memset(in1, A[0], sizeof(in1));
+    memset(in2, A[n - 1], 16);
+    memcpy(in2, A, MIN(16, n));
     for (; i < w; i += STRIDE) {
       j = i * RATIO + 16;
       if (j + 16 <= n) {
-        in3 = _mm_loadu_si128((__m128i *)&A[j]);
+        memcpy(in3, &A[j], 16);
       } else {
-        in3 = _mm_set1_epi8(A[n - 1]);
+        memset(in3, A[n - 1], 16);
         if (j < n) {
-          // SSSE3-compatible way to handle partial loads
-          __m128i mask = _mm_loadu_si128((__m128i *)&A[j]);
-          __m128i shuffle_mask = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7,
-                                              6, 5, 4, 3, 2, 1, 0);
-          __m128i index = _mm_set1_epi8(n - j);
-          __m128i cmp = _mm_cmplt_epi8(shuffle_mask, index);
-          in3 = _mm_or_si128(_mm_and_si128(cmp, mask),
-                             _mm_andnot_si128(cmp, in3));
+          memcpy(in3, &A[j], n - j);
         }
       }
-      bv0 = _mm_alignr_epi8(in2, in1, 13);
-      bv1 = _mm_alignr_epi8(in2, in1, 15);
-      bv2 = _mm_alignr_epi8(in3, in2, 1);
-      bv3 = _mm_alignr_epi8(in3, in2, 3);
-      wv0 = _mm_maddubs_epi16(bv0, kMadd1);
-      wv1 = _mm_maddubs_epi16(bv1, kMadd2);
-      wv2 = _mm_maddubs_epi16(bv2, kMadd3);
-      wv3 = _mm_maddubs_epi16(bv3, kMadd4);
-      wv0 = _mm_add_epi16(wv0, kRound);
-      wv0 = _mm_add_epi16(wv0, wv1);
-      wv0 = _mm_add_epi16(wv0, wv2);
-      wv0 = _mm_add_epi16(wv0, wv3);
-      wv0 = _mm_srai_epi16(wv0, SCALE);
-      bv2 = _mm_packus_epi16(wv0, wv0);
-      _mm_storel_epi64((__m128i *)&A[i], bv2);
-      in1 = in2;
-      in2 = in3;
+      palignr(bv0, in2, in1, 13);
+      palignr(bv1, in2, in1, 15);
+      palignr(bv2, in3, in2, 1);
+      palignr(bv3, in3, in2, 3);
+      pmaddubsw(wv0, bv0, kMadd1);
+      pmaddubsw(wv1, bv1, kMadd2);
+      pmaddubsw(wv2, bv2, kMadd3);
+      pmaddubsw(wv3, bv3, kMadd4);
+      paddw(wv0, wv0, kRound);
+      paddw(wv0, wv0, wv1);
+      paddw(wv0, wv0, wv2);
+      paddw(wv0, wv0, wv3);
+      psraw(wv0, wv0, SCALE);
+      packuswb(bv2, wv0, wv0);
+      memcpy(&A[i], bv2, STRIDE);
+      memcpy(in1, in2, 16);
+      memcpy(in2, in3, 16);
     }
   }
   return A;
-#else
-  long h, i;
-  if (n < 2)
-    return A;
-  unsigned char M[3 + n + 4];
-  unsigned char *q = M;
-  q[0] = A[0];
-  q[1] = A[0];
-  q[2] = A[0];
-  memcpy(q + 3, A, n);
-  q[3 + n + 0] = A[n - 1];
-  q[3 + n + 1] = A[n - 1];
-  q[3 + n + 2] = A[n - 1];
-  q[3 + n + 3] = A[n - 1];
-  q += 3;
-  h = (n + 1) >> 1;
-  for (i = 0; i < h; ++i) {
-    short x0, x1, x2, x3, x4, x5, x6, x7;
-    x0 = q[i * 2 - 3];
-    x1 = q[i * 2 - 2];
-    x2 = q[i * 2 - 1];
-    x3 = q[i * 2 + 0];
-    x4 = q[i * 2 + 1];
-    x5 = q[i * 2 + 2];
-    x6 = q[i * 2 + 3];
-    x7 = q[i * 2 + 4];
-    x0 *= K[0];
-    x1 *= K[1];
-    x2 *= K[2];
-    x3 *= K[3];
-    x4 *= K[4];
-    x5 *= K[5];
-    x6 *= K[6];
-    x7 *= K[7];
-    x0 += x1;
-    x2 += x3;
-    x4 += x5;
-    x6 += x7;
-    x0 += x2;
-    x4 += x6;
-    x0 += x4;
-    x0 += 1 << 4;
-    x0 >>= 5;
-    A[i] = MIN(255, MAX(0, x0));
-  }
-  return A;
-#endif
 }
diff --git a/dsp/scale/gyarados.c b/dsp/scale/gyarados.c
index 0785bbb0e..61beace01 100644
--- a/dsp/scale/gyarados.c
+++ b/dsp/scale/gyarados.c
@@ -25,7 +25,7 @@
 #include "libc/limits.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
@@ -53,11 +53,11 @@ struct SamplingSolution {
 static double ComputeWeight(double x) {
   if (-1.5 < x && x < 1.5) {
     if (-.5 < x && x < .5) {
-      return .75 - SQR(x);
+      return.75 - SQR(x);
     } else if (x < 0) {
-      return .5 * SQR(x + 1.5);
+      return.5 * SQR(x + 1.5);
     } else {
-      return .5 * SQR(x - 1.5);
+      return.5 * SQR(x - 1.5);
     }
   } else {
     return 0;
@@ -164,19 +164,12 @@ static void GyaradosImpl(long dyw, long dxw, int dst[dyw][dxw], long syw,
       tmp0[dy][sx] = QRS(M, eax);
     }
   }
-  if (sharpen) {
-    for (dy = 0; dy < dyn; ++dy) {
-      for (sx = 0; sx < sxn; ++sx) {
-        tmp1[dy][sx] =
-            Sharpen(tmp0[MIN(dyn - 1, MAX(0, dy - 1))][sx], tmp0[dy][sx],
-                    tmp0[MIN(dyn - 1, MAX(0, dy + 1))][sx]);
-      }
-    }
-  } else {
-    for (dy = 0; dy < dyn; ++dy) {
-      for (sx = 0; sx < sxn; ++sx) {
-        tmp1[dy][sx] = tmp0[dy][sx];
-      }
+  for (dy = 0; dy < dyn; ++dy) {
+    for (sx = 0; sx < sxn; ++sx) {
+      tmp1[dy][sx] = sharpen ? Sharpen(tmp0[MIN(dyn - 1, MAX(0, dy - 1))][sx],
+                                       tmp0[dy][sx],
+                                       tmp0[MIN(dyn - 1, MAX(0, dy + 1))][sx])
+                             : tmp0[dy][sx];
     }
   }
   for (dx = 0; dx < dxn; ++dx) {
@@ -187,19 +180,12 @@ static void GyaradosImpl(long dyw, long dxw, int dst[dyw][dxw], long syw,
       tmp2[dy][dx] = QRS(M, eax);
     }
   }
-  if (sharpen) {
-    for (dx = 0; dx < dxn; ++dx) {
-      for (dy = 0; dy < dyn; ++dy) {
-        dst[dy][dx] =
-            Sharpen(tmp2[dy][MIN(dxn - 1, MAX(0, dx - 1))], tmp2[dy][dx],
-                    tmp2[dy][MIN(dxn - 1, MAX(0, dx + 1))]);
-      }
-    }
-  } else {
-    for (dx = 0; dx < dxn; ++dx) {
-      for (dy = 0; dy < dyn; ++dy) {
-        dst[dy][dx] = tmp2[dy][dx];
-      }
+  for (dx = 0; dx < dxn; ++dx) {
+    for (dy = 0; dy < dyn; ++dy) {
+      dst[dy][dx] = sharpen ? Sharpen(tmp2[dy][MIN(dxn - 1, MAX(0, dx - 1))],
+                                      tmp2[dy][dx],
+                                      tmp2[dy][MIN(dxn - 1, MAX(0, dx + 1))])
+                            : tmp2[dy][dx];
     }
   }
 }
diff --git a/dsp/scale/magikarp.c b/dsp/scale/magikarp.c
index f1f84b2ed..ca5d3cf8c 100644
--- a/dsp/scale/magikarp.c
+++ b/dsp/scale/magikarp.c
@@ -20,7 +20,7 @@
 #include "dsp/core/ks8.h"
 #include "dsp/core/kss8.h"
 #include "dsp/scale/cdecimate2xuint8x8.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
 #include "libc/x/x.h"
diff --git a/dsp/tty/mpsadbw.S b/dsp/tty/mpsadbw.S
index 165e17edd..833824ea4 100644
--- a/dsp/tty/mpsadbw.S
+++ b/dsp/tty/mpsadbw.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	TODO(jart): write me
 
diff --git a/dsp/tty/rgb2ansi.c b/dsp/tty/rgb2ansi.c
index 4ae5c54fa..baece9e8a 100644
--- a/dsp/tty/rgb2ansi.c
+++ b/dsp/tty/rgb2ansi.c
@@ -21,7 +21,7 @@
 #include "libc/assert.h"
 #include "libc/limits.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/str/str.h"
 
diff --git a/dsp/tty/rgb2ttyi2f.c b/dsp/tty/rgb2ttyi2f.c
index 55472a40a..e323c57f2 100644
--- a/dsp/tty/rgb2ttyi2f.c
+++ b/dsp/tty/rgb2ttyi2f.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/tty/quant.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 struct TtyRgb rgb2ttyi2f_(int r, int g, int b) {
   return rgb2ttyf((ttyrgb_m128){r, g, b} / 255);
diff --git a/dsp/tty/rgb2xterm24.c b/dsp/tty/rgb2xterm24.c
index 166412a32..afbb6e9f5 100644
--- a/dsp/tty/rgb2xterm24.c
+++ b/dsp/tty/rgb2xterm24.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/tty/quant.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 struct TtyRgb rgb2xterm24_(int r, int g, int b) {
   return (struct TtyRgb){MAX(MIN(r, 255), 0), MAX(MIN(g, 255), 0),
diff --git a/dsp/tty/rgb2xterm24f.c b/dsp/tty/rgb2xterm24f.c
index 6cad0f3cc..5a3b59594 100644
--- a/dsp/tty/rgb2xterm24f.c
+++ b/dsp/tty/rgb2xterm24f.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/tty/quant.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "third_party/intel/xmmintrin.internal.h"
 
diff --git a/dsp/tty/ttyraster.c b/dsp/tty/ttyraster.c
index 3bdb5ae97..a4e5e98fd 100644
--- a/dsp/tty/ttyraster.c
+++ b/dsp/tty/ttyraster.c
@@ -26,7 +26,7 @@
 #include "libc/limits.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
diff --git a/dsp/tty/ttyraw.c b/dsp/tty/ttyraw.c
index bfb03f84f..333c641f0 100644
--- a/dsp/tty/ttyraw.c
+++ b/dsp/tty/ttyraw.c
@@ -24,7 +24,7 @@
 #include "libc/calls/termios.h"
 #include "libc/calls/ucontext.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
diff --git a/dsp/tty/windex-avx2.S b/dsp/tty/windex-avx2.S
index f106fab29..4ebf2931e 100644
--- a/dsp/tty/windex-avx2.S
+++ b/dsp/tty/windex-avx2.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Returns index of minimum uint16 in array.
 //
diff --git a/dsp/tty/windex-sse4.S b/dsp/tty/windex-sse4.S
index 40ef0d856..0347cb763 100644
--- a/dsp/tty/windex-sse4.S
+++ b/dsp/tty/windex-sse4.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Returns index of minimum positive int16 in array.
 //
diff --git a/examples/BUILD.mk b/examples/BUILD.mk
index 3d50b5429..ff8e97a79 100644
--- a/examples/BUILD.mk
+++ b/examples/BUILD.mk
@@ -40,7 +40,6 @@ EXAMPLES_BINS =							\
 
 EXAMPLES_DIRECTDEPS =						\
 	CTL							\
-	DSP_AUDIO						\
 	DSP_CORE						\
 	DSP_SCALE						\
 	DSP_TTY							\
@@ -55,7 +54,6 @@ EXAMPLES_DIRECTDEPS =						\
 	LIBC_NT_ADVAPI32					\
 	LIBC_NT_IPHLPAPI					\
 	LIBC_NT_KERNEL32					\
-	LIBC_NT_MEMORY						\
 	LIBC_NT_NTDLL						\
 	LIBC_NT_USER32						\
 	LIBC_NT_WS2_32						\
@@ -64,7 +62,6 @@ EXAMPLES_DIRECTDEPS =						\
 	LIBC_SOCK						\
 	LIBC_STDIO						\
 	LIBC_STR						\
-	LIBC_SYSTEM						\
 	LIBC_SYSV						\
 	LIBC_SYSV_CALLS						\
 	LIBC_TESTLIB						\
@@ -82,8 +79,6 @@ EXAMPLES_DIRECTDEPS =						\
 	THIRD_PARTY_GETOPT					\
 	THIRD_PARTY_HIREDIS					\
 	THIRD_PARTY_LIBCXX					\
-	THIRD_PARTY_LIBCXXABI					\
-	THIRD_PARTY_LIBUNWIND					\
 	THIRD_PARTY_LINENOISE					\
 	THIRD_PARTY_LUA						\
 	THIRD_PARTY_MBEDTLS					\
@@ -97,10 +92,11 @@ EXAMPLES_DIRECTDEPS =						\
 	THIRD_PARTY_TZ						\
 	THIRD_PARTY_VQSORT					\
 	THIRD_PARTY_XED						\
+	THIRD_PARTY_LIBCXXABI					\
 	THIRD_PARTY_ZLIB					\
 	TOOL_ARGS						\
 	TOOL_BUILD_LIB						\
-	TOOL_VIZ_LIB						\
+	TOOL_VIZ_LIB
 
 EXAMPLES_DEPS :=						\
 	$(call uniq,$(foreach x,$(EXAMPLES_DIRECTDEPS),$($(x))))
@@ -151,10 +147,6 @@ o/$(MODE)/examples/picol.o: private				\
 		CPPFLAGS +=					\
 			-DSTACK_FRAME_UNLIMITED
 
-o/$(MODE)/examples/nesemu1.o: private				\
-		CPPFLAGS +=					\
-			-O3
-
 o/$(MODE)/examples/picol.dbg:					\
 		$(EXAMPLES_DEPS)				\
 		o/$(MODE)/examples/picol.o			\
diff --git a/examples/a440.c b/examples/a440.c
deleted file mode 100644
index 3b927da7d..000000000
--- a/examples/a440.c
+++ /dev/null
@@ -1,80 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <cosmoaudio.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-
-/**
- * @fileoverview plays pure A.440 tone on speakers for 1 second
- * @see https://en.wikipedia.org/wiki/A440_%28pitch_standard%29
- */
-
-#define SAMPLING_RATE 44100
-#define WAVE_INTERVAL 440
-#define CHANNELS      2
-#define LOUDNESS      .3
-#define DEBUG_LOG     1
-
-int main() {
-
-  struct CosmoAudioOpenOptions cao = {0};
-  cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
-  cao.deviceType = kCosmoAudioDeviceTypePlayback;
-  cao.sampleRate = SAMPLING_RATE;
-  cao.debugLog = DEBUG_LOG;
-  cao.channels = CHANNELS;
-
-  int status;
-  struct CosmoAudio *ca;
-  status = cosmoaudio_open(&ca, &cao);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to open audio: %d\n", status);
-    return 1;
-  }
-
-  float buf[256 * CHANNELS];
-  for (int g = 0; g < SAMPLING_RATE;) {
-    int frames = 1;
-    status = cosmoaudio_poll(ca, NULL, &frames);
-    if (status != COSMOAUDIO_SUCCESS) {
-      fprintf(stderr, "failed to poll output: %d\n", status);
-      return 2;
-    }
-    if (frames > 256)
-      frames = 256;
-    if (frames > SAMPLING_RATE - g)
-      frames = SAMPLING_RATE - g;
-    for (int f = 0; f < frames; ++f) {
-      float t = (float)g++ / SAMPLING_RATE;
-      float s = sinf(2 * M_PIf * WAVE_INTERVAL * t);
-      for (int c = 0; c < CHANNELS; c++)
-        buf[f * CHANNELS + c] = s * LOUDNESS;
-    }
-    status = cosmoaudio_write(ca, buf, frames);
-    if (status != frames) {
-      fprintf(stderr, "failed to write output: %d\n", status);
-      return 3;
-    }
-  }
-
-  status = cosmoaudio_flush(ca);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to flush output: %d\n", status);
-    return 4;
-  }
-
-  status = cosmoaudio_close(ca);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to close audio: %d\n", status);
-    return 5;
-  }
-}
diff --git a/examples/aba.c b/examples/aba.c
deleted file mode 100644
index b38b26028..000000000
--- a/examples/aba.c
+++ /dev/null
@@ -1,125 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <assert.h>
-#include <cosmo.h>
-#include <pthread.h>
-#include <stdatomic.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-// lockless push / pop tutorial
-//
-// this file demonstrates how to create a singly linked list that can be
-// pushed and popped across multiple threads, using only atomics. atomic
-// operations (rather than using a mutex) make push/pop go faster and it
-// ensures asynchronous signal safety too. therefore it will be safe for
-// use in a variety of contexts, such as signal handlers.
-
-#define THREADS    128
-#define ITERATIONS 10000
-
-// adjust mask based on alignment of list struct
-//
-// - 0x00fffffffffffff0 may be used if List* is always 16-byte aligned.
-//   We know that's the case here, because we call malloc() to create
-//   every List* object, and malloc() results are always max aligned.
-//
-// - 0x00fffffffffffff8 may be used if List* is always 8-byte aligned.
-//   This might be the case if you're pushing and popping stuff that was
-//   allocated from an array, to avoid malloc() calls. This has one
-//   fewer byte of safeguards against the ABA problem though.
-//
-// - 0x00fffffffffff000 may be used if List* is always page aligned.
-//   This is a good choice if you use mmap() to allocate each List*
-//   element, since it offers maximum protection against ABA.
-//
-// - only the highest byte of a 64-bit pointer is safe to use on our
-//   supported platforms. on most x86 and arm systems, it's possible to
-//   use the top sixteen bits. however that's not the case on more
-//   recent high end x86-64 systems that have pml5t.
-//
-#define MASQUE 0x00fffffffffffff0
-
-#define PTR(x)    ((uintptr_t)(x) & MASQUE)
-#define TAG(x)    ROL((uintptr_t)(x) & ~MASQUE, 8)
-#define ABA(p, t) ((uintptr_t)(p) | (ROR((uintptr_t)(t), 8) & ~MASQUE))
-#define ROL(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
-#define ROR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
-
-struct List {
-  struct List* next;
-  int count;
-};
-
-atomic_uintptr_t list;
-
-void push(struct List* elem) {
-  uintptr_t tip;
-  assert(!TAG(elem));
-  for (tip = atomic_load_explicit(&list, memory_order_relaxed);;) {
-    elem->next = (struct List*)PTR(tip);
-    if (atomic_compare_exchange_weak_explicit(
-            &list, &tip, ABA(elem, TAG(tip) + 1), memory_order_release,
-            memory_order_relaxed))
-      break;
-    pthread_pause_np();
-  }
-}
-
-struct List* pop(void) {
-  uintptr_t tip;
-  struct List* elem;
-  tip = atomic_load_explicit(&list, memory_order_relaxed);
-  while ((elem = (struct List*)PTR(tip))) {
-    if (atomic_compare_exchange_weak_explicit(
-            &list, &tip, ABA(elem->next, TAG(tip) + 1), memory_order_acquire,
-            memory_order_relaxed))
-      break;
-    pthread_pause_np();
-  }
-  return elem;
-}
-
-void* tester(void* arg) {
-  struct List* elem;
-  for (int i = 0; i < ITERATIONS; ++i) {
-    while (!(elem = pop())) {
-      elem = malloc(sizeof(*elem));
-      elem->count = 0;
-      push(elem);
-    }
-    elem->count++;
-    push(elem);
-  }
-  return 0;
-}
-
-int main() {
-  printf("testing aba problem...");
-  fflush(stdout);
-  pthread_t th[THREADS];
-  for (int i = 0; i < THREADS; ++i)
-    pthread_create(&th[i], 0, tester, 0);
-  for (int i = 0; i < THREADS; ++i)
-    pthread_join(th[i], 0);
-  int sum = 0;
-  struct List* elem;
-  while ((elem = pop())) {
-    printf(" %d", elem->count);
-    sum += elem->count;
-    free(elem);
-  }
-  printf("\n");
-  assert(sum == ITERATIONS * THREADS);
-  printf("you are the dancing queen\n");
-  CheckForMemoryLeaks();
-}
diff --git a/examples/art.c b/examples/art.c
deleted file mode 100644
index 3802afa9b..000000000
--- a/examples/art.c
+++ /dev/null
@@ -1,353 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <errno.h>
-#include <fcntl.h>
-#include <getopt.h>
-#include <iconv.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <termios.h>
-#include <time.h>
-#include <unistd.h>
-
-/**
- * @fileoverview program for viewing bbs art files
- * @see https://github.com/blocktronics/artpacks
- * @see http://www.textfiles.com/art/
- */
-
-#define HELP \
-  "Usage:\n\
-    art [-b %d] [-f %s] [-t %s] FILE...\n\
-\n\
-Flags:\n\
-    -b NUMBER   specifies simulated modem baud rate, which defaults to\n\
-                2400 since that was the most common modem speed in the\n\
-                later half of the 1980s during the BBS golden age; you\n\
-                could also say 300 for the slowest experience possible\n\
-                or you could say 14.4k to get more of a 90's feel, and\n\
-                there's also the infamous 56k to bring you back to y2k\n\
-    -f CHARSET  specifies charset of input bytes, where the default is\n\
-                cp347 which means IBM Code Page 347 a.k.a. DOS\n\
-    -t CHARSET  specifies output charset used by your terminal, and it\n\
-                defaults to utf8 a.k.a. thompson-pike encoding\n\
-\n\
-Supported charsets:\n\
-    utf8, ascii, wchar_t, ucs2be, ucs2le, utf16be, utf16le, ucs4be,\n\
-    ucs4le, utf16, ucs4, ucs2, eucjp, shiftjis, iso2022jp, gb18030, gbk,\n\
-    gb2312, big5, euckr, iso88591, latin1, iso88592, iso88593, iso88594,\n\
-    iso88595, iso88596, iso88597, iso88598, iso88599, iso885910,\n\
-    iso885911, iso885913, iso885914, iso885915, iso885916, cp1250,\n\
-    windows1250, cp1251, windows1251, cp1252, windows1252, cp1253,\n\
-    windows1253, cp1254, windows1254, cp1255, windows1255, cp1256,\n\
-    windows1256, cp1257, windows1257, cp1258, windows1258, koi8r, koi8u,\n\
-    cp437, cp850, cp866, ibm1047, cp1047.\n\
-\n\
-See also:\n\
-    http://www.textfiles.com/art/\n\
-    https://github.com/blocktronics/artpacks\n\
-\n"
-
-#define INBUFSZ 256
-#define OUBUFSZ (INBUFSZ * 6)
-#define SLIT(s) ((unsigned)s[3] << 24 | s[2] << 16 | s[1] << 8 | s[0])
-
-// "When new technology comes out, people don't all buy it right away.
-//  If what they have works, some will wait until it doesn't. A few
-//  people do get the latest though. In 1984 2400 baud modems became
-//  available, so some people had them, but many didn't. A BBS list
-//  from 1986 shows operators were mostly 300 and 1200, but some were
-//  using 2400. The next 5 years were the hayday of the 2400."
-//
-// https://forum.vcfed.org/index.php?threads/the-2400-baud-modem.44241/
-
-int baud_rate = 2400;                // -b 2400
-const char* from_charset = "CP437";  // -f CP437
-const char* to_charset = "UTF-8";    // -t UTF-8
-
-volatile sig_atomic_t done;
-
-void on_signal(int sig) {
-  done = 1;
-  (void)sig;
-}
-
-void print(const char* s) {
-  (void)!write(STDOUT_FILENO, s, strlen(s));
-}
-
-int encode_character(char output[8], const char* codec, wchar_t character) {
-  size_t inbytesleft = sizeof(wchar_t);
-  size_t outbytesleft = 7;
-  char* inbuf = (char*)&character;
-  char* outbuf = output;
-  iconv_t cd = iconv_open(codec, "wchar_t");
-  if (cd == (iconv_t)-1)
-    return -1;
-  size_t result = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
-  iconv_close(cd);
-  if (result == (size_t)-1)
-    return -1;
-  *outbuf = '\0';
-  return 7 - outbytesleft;
-}
-
-void append_replacement_character(char** b) {
-  int n = encode_character(*b, to_charset, 0xFFFD);
-  if (n == -1)
-    n = encode_character(*b, to_charset, '?');
-  if (n != -1)
-    *b += n;
-}
-
-int compare_time(struct timespec a, struct timespec b) {
-  int cmp;
-  if (!(cmp = (a.tv_sec > b.tv_sec) - (a.tv_sec < b.tv_sec)))
-    cmp = (a.tv_nsec > b.tv_nsec) - (a.tv_nsec < b.tv_nsec);
-  return cmp;
-}
-
-struct timespec add_time(struct timespec x, struct timespec y) {
-  x.tv_sec += y.tv_sec;
-  x.tv_nsec += y.tv_nsec;
-  if (x.tv_nsec >= 1000000000) {
-    x.tv_nsec -= 1000000000;
-    x.tv_sec += 1;
-  }
-  return x;
-}
-
-struct timespec subtract_time(struct timespec a, struct timespec b) {
-  a.tv_sec -= b.tv_sec;
-  if (a.tv_nsec < b.tv_nsec) {
-    a.tv_nsec += 1000000000;
-    a.tv_sec--;
-  }
-  a.tv_nsec -= b.tv_nsec;
-  return a;
-}
-
-struct timespec fromnanos(long long x) {
-  struct timespec ts;
-  ts.tv_sec = x / 1000000000;
-  ts.tv_nsec = x % 1000000000;
-  return ts;
-}
-
-void process_file(const char* path, int fd, iconv_t cd) {
-  size_t carry = 0;
-  struct timespec next;
-  char input_buffer[INBUFSZ];
-
-  clock_gettime(CLOCK_MONOTONIC, &next);
-
-  for (;;) {
-
-    // read from file
-    ssize_t bytes_read = read(fd, input_buffer + carry, INBUFSZ - carry);
-    if (!bytes_read)
-      return;
-    if (bytes_read == -1) {
-      perror(path);
-      done = 1;
-      return;
-    }
-
-    // modernize character set
-    char* input_ptr = input_buffer;
-    size_t input_left = carry + bytes_read;
-    char output_buffer[OUBUFSZ];
-    char* output_ptr = output_buffer;
-    size_t output_left = OUBUFSZ;
-    size_t ir = iconv(cd, &input_ptr, &input_left, &output_ptr, &output_left);
-    carry = 0;
-    if (ir == (size_t)-1) {
-      if (errno == EINVAL) {
-        // incomplete multibyte sequence encountered
-        memmove(input_buffer, input_ptr, input_left);
-        carry = input_left;
-      } else if (errno == EILSEQ && input_left) {
-        // EILSEQ means either
-        // 1. illegal input sequence encountered
-        // 2. code not encodable in output codec
-        //
-        // so we skip one byte of input, and insert � or ? in the output
-        // this isn't the most desirable behavior, but it is the best we
-        // can do, since we don't know specifics about the codecs in use
-        //
-        // unlike glibc cosmo's iconv implementation may handle case (2)
-        // automatically by inserting an asterisk in place of a sequence
-        ++input_ptr;
-        --input_left;
-        memmove(input_buffer, input_ptr, input_left);
-        carry = input_left;
-        if (output_left >= 8)
-          append_replacement_character(&output_ptr);
-      } else {
-        perror(path);
-        done = 1;
-        return;
-      }
-    }
-
-    // write to terminal
-    for (char* p = output_buffer; p < output_ptr; p++) {
-      if (done)
-        return;
-
-      (void)!write(STDOUT_FILENO, p, 1);
-
-      // allow arrow keys to change baud rate
-      int have;
-      if (ioctl(STDIN_FILENO, FIONREAD, &have)) {
-        perror("ioctl");
-        done = 1;
-        return;
-      }
-      if (have > 0) {
-        char key[4] = {0};
-        if (read(STDIN_FILENO, key, sizeof(key)) > 0) {
-          if (SLIT(key) == SLIT("\33[A") ||  // up
-              SLIT(key) == SLIT("\33[C")) {  // right
-            baud_rate *= 1.4;
-          } else if (SLIT(key) == SLIT("\33[B") ||  // down
-                     SLIT(key) == SLIT("\33[D")) {  // left
-            baud_rate *= 0.6;
-          }
-          if (baud_rate < 3)
-            baud_rate = 3;
-          if (baud_rate > 1000000000)
-            baud_rate = 1000000000;
-        }
-      }
-
-      // insert artificial delay for one byte. we divide by 10 to convert
-      // bits to bytes, because that is how many bits 8-N-1 encoding used
-      struct timespec now;
-      clock_gettime(CLOCK_MONOTONIC, &now);
-      next = add_time(next, fromnanos(1e9 / (baud_rate / 10.)));
-      if (compare_time(next, now) > 0) {
-        struct timespec sleep = subtract_time(next, now);
-        nanosleep(&sleep, 0);
-      }
-    }
-  }
-}
-
-int main(int argc, char* argv[]) {
-  int opt;
-  while ((opt = getopt(argc, argv, "hb:f:t:")) != -1) {
-    switch (opt) {
-      case 'b': {
-        char* endptr;
-        double rate = strtod(optarg, &endptr);
-        if (*endptr == 'k') {
-          rate *= 1e3;
-          ++endptr;
-        } else if (*endptr == 'm') {
-          rate *= 1e6;
-          ++endptr;
-        }
-        if (*endptr || baud_rate <= 0) {
-          fprintf(stderr, "%s: invalid baud rate: %s\n", argv[0], optarg);
-          exit(1);
-        }
-        baud_rate = rate;
-        break;
-      }
-      case 'f':
-        from_charset = optarg;
-        break;
-      case 't':
-        to_charset = optarg;
-        break;
-      case 'h':
-        fprintf(stderr, HELP, baud_rate, from_charset, to_charset);
-        exit(0);
-      default:
-        fprintf(stderr, "protip: pass the -h flag for help\n");
-        exit(1);
-    }
-  }
-  if (optind == argc) {
-    fprintf(stderr, "%s: missing operand\n", argv[0]);
-    exit(1);
-  }
-
-  // create character transcoder
-  iconv_t cd = iconv_open(to_charset, from_charset);
-  if (cd == (iconv_t)-1) {
-    fprintf(stderr, "error: conversion from %s to %s not supported\n",
-            from_charset, to_charset);
-    exit(1);
-  }
-
-  // catch ctrl-c
-  signal(SIGINT, on_signal);
-
-  // don't wait until newline to read() keystrokes
-  struct termios t;
-  if (!tcgetattr(STDIN_FILENO, &t)) {
-    struct termios t2 = t;
-    t2.c_lflag &= ~(ICANON | ECHO);
-    tcsetattr(STDIN_FILENO, TCSANOW, &t2);
-  }
-
-  // Process each file specified on the command line
-  for (int i = optind; i < argc && !done; i++) {
-
-    // open file
-    int fd = open(argv[i], O_RDONLY);
-    if (fd == -1) {
-      perror(argv[i]);
-      break;
-    }
-
-    // wait between files
-    if (i > optind)
-      sleep(1);
-
-    print("\33[?25l");   // hide cursor
-    print("\33[H");      // move cursor to top-left
-    print("\33[J");      // erase display forward
-    print("\33[1;24r");  // set scrolling region to first 24 lines
-    print("\33[?7h");    // enable auto-wrap mode
-    print("\33[?3l");    // 80 column mode (deccolm) vt100
-    print("\33[H");      // move cursor to top-left, again
-
-    // get busy
-    process_file(argv[i], fd, cd);
-    close(fd);
-  }
-
-  // cleanup
-  iconv_close(cd);
-
-  print("\33[s");        // save cursor position
-  print("\33[?25h");     // show cursor
-  print("\33[0m");       // reset text attributes (color, bold, etc.)
-  print("\33[?1049l");   // exit alternate screen mode
-  print("\33(B");        // exit line drawing and other alt charset modes
-  print("\33[r");        // reset scrolling region
-  print("\33[?2004l");   // turn off bracketed paste mode
-  print("\33[4l");       // exit insert mode
-  print("\33[?1l\33>");  // exit application keypad mode
-  print("\33[?7h");      // reset text wrapping mode
-  print("\33[?12l");     // reset cursor blinking mode
-  print("\33[?6l");      // reset origin mode
-  print("\33[20l");      // reset auto newline mode
-  print("\33[u");        // restore cursor position
-
-  // restore terminal
-  tcsetattr(STDIN_FILENO, TCSANOW, &t);
-}
diff --git a/examples/asteroids.c b/examples/asteroids.c
deleted file mode 100644
index d9537936c..000000000
--- a/examples/asteroids.c
+++ /dev/null
@@ -1,353 +0,0 @@
-// -*- mode:c; indent-tabs-mode:nil; c-basic-offset:4 -*-
-// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8
-
-// asteroids by tsotchke
-// https://github.com/tsotchke/asteroids
-
-// clang-format off
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <time.h>
-#include <unistd.h>
-#include <signal.h>
-#include <termios.h>
-#include <sys/select.h>
-
-#define SCREEN_WIDTH 80
-#define SCREEN_HEIGHT 24
-#define MAX_ASTEROIDS 5
-#define MAX_BULLETS 5
-
-typedef struct {
-    float x, y;
-} Vector2;
-
-typedef struct {
-    Vector2 position;
-    Vector2 velocity;
-    float angle;
-    float radius;
-} GameObject;
-
-GameObject spaceship;
-GameObject asteroids[MAX_ASTEROIDS];
-GameObject bullets[MAX_BULLETS];
-
-int score = 0;
-time_t startTime;
-int isGameOver = 0;
-int shouldExit = 0;
-int finalTime = 0; // To store final time at game over
-char display[SCREEN_HEIGHT][SCREEN_WIDTH];
-
-// Function to clear the screen buffer
-void clearDisplay() {
-    memset(display, ' ', sizeof(display));
-}
-
-// Function to draw a pixel on the screen
-void drawPixel(int x, int y) {
-    if (x >= 0 && x < SCREEN_WIDTH && y >= 0 && y < SCREEN_HEIGHT) {
-        display[y][x] = '*';
-    }
-}
-
-// Function to draw a line using Bresenham's algorithm
-void drawLine(int x1, int y1, int x2, int y2) {
-    int dx = abs(x2 - x1), sx = (x1 < x2) ? 1 : -1;
-    int dy = -abs(y2 - y1), sy = (y1 < y2) ? 1 : -1;
-    int error = dx + dy, e2;
-
-    while (1) {
-        drawPixel(x1, y1);
-        if (x1 == x2 && y1 == y2) break;
-        e2 = 2 * error;
-        if (e2 >= dy) { error += dy; x1 += sx; }
-        if (e2 <= dx) { error += dx; y1 += sy; }
-    }
-}
-
-// Function to draw a circle
-void drawCircle(int centerX, int centerY, int radius) {
-    int x = radius - 1, y = 0, dx = 1, dy = 1, err = dx - (radius << 1);
-    while (x >= y) {
-        drawPixel(centerX + x, centerY + y);
-        drawPixel(centerX + y, centerY + x);
-        drawPixel(centerX - y, centerY + x);
-        drawPixel(centerX - x, centerY + y);
-        drawPixel(centerX - x, centerY - y);
-        drawPixel(centerX - y, centerY - x);
-        drawPixel(centerX + y, centerY - x);
-        drawPixel(centerX + x, centerY - y);
-
-        if (err <= 0) {
-            y++;
-            err += dy;
-            dy += 2;
-        }
-        if (err > 0) {
-            x--;
-            dx += 2;
-            err += dx - (radius << 1);
-        }
-    }
-}
-
-// Initialize a game object
-void initializeGameObject(GameObject *obj, float x, float y, float angle, float radius) {
-    obj->position = (Vector2){x, y};
-    obj->velocity = (Vector2){0, 0};
-    obj->angle = angle;
-    obj->radius = radius;
-}
-
-// Wrap position of the spaceship and asteroids within screen bounds
-void wrapPosition(Vector2 *pos) {
-    if (pos->x < 0) pos->x = SCREEN_WIDTH - 1;
-    if (pos->x >= SCREEN_WIDTH) pos->x = 0;
-    if (pos->y < 0) pos->y = SCREEN_HEIGHT - 1;
-    if (pos->y >= SCREEN_HEIGHT) pos->y = 0;
-}
-
-// Check if two game objects are colliding
-int checkCollision(GameObject *a, GameObject *b) {
-    float deltaX = a->position.x - b->position.x;
-    float deltaY = a->position.y - b->position.y;
-    return sqrt(deltaX * deltaX + deltaY * deltaY) < (a->radius + b->radius);
-}
-
-// Initialize game state
-void initGame() {
-    score = 0; // Reset the score
-    initializeGameObject(&spaceship, SCREEN_WIDTH / 2, SCREEN_HEIGHT / 2, 0, 2);
-
-    for (int i = 0; i < MAX_ASTEROIDS; i++) {
-        initializeGameObject(&asteroids[i],
-            rand() % SCREEN_WIDTH,
-            rand() % SCREEN_HEIGHT,
-            0,
-            2 + rand() % 3);
-        asteroids[i].velocity.x = ((float)rand() / RAND_MAX) * 2 - 1;
-        asteroids[i].velocity.y = ((float)rand() / RAND_MAX) * 2 - 1;
-    }
-
-    for (int i = 0; i < MAX_BULLETS; i++) {
-        bullets[i].position.x = -1; // Mark bullet as inactive
-        bullets[i].position.y = -1;
-    }
-
-    startTime = time(NULL);
-    isGameOver = 0;
-    finalTime = 0; // Reset final time
-}
-
-// Draw the spaceship on the screen
-void drawSpaceship() {
-    int x = (int)spaceship.position.x;
-    int y = (int)spaceship.position.y;
-    int size = 3;
-
-    float cosAngle = cos(spaceship.angle);
-    float sinAngle = sin(spaceship.angle);
-
-    int x1 = x + size * cosAngle;
-    int y1 = y + size * sinAngle;
-    int x2 = x + size * cos(spaceship.angle + 2.5);
-    int y2 = y + size * sin(spaceship.angle + 2.5);
-    int x3 = x + size * cos(spaceship.angle - 2.5);
-    int y3 = y + size * sin(spaceship.angle - 2.5);
-
-    drawLine(x1, y1, x2, y2);
-    drawLine(x2, y2, x3, y3);
-    drawLine(x3, y3, x1, y1);
-}
-
-// Draw all entities on the screen
-void drawEntities(GameObject *entities, int count, void (*drawFunc)(GameObject *)) {
-    for (int i = 0; i < count; i++) {
-        drawFunc(&entities[i]);
-    }
-}
-
-// Draw a bullet on the screen
-void drawBullet(GameObject *bullet) { // Changed to non-const
-    if (bullet->position.x >= 0) {
-        drawPixel((int)bullet->position.x, (int)bullet->position.y);
-    }
-}
-
-// Draw an asteroid on the screen
-void drawAsteroid(GameObject *asteroid) { // Changed to non-const
-    drawCircle((int)asteroid->position.x, (int)asteroid->position.y, (int)asteroid->radius);
-}
-
-// Refresh the display
-void updateDisplay() {
-    clearDisplay();
-    if (!isGameOver) {
-        drawSpaceship();
-        drawEntities(asteroids, MAX_ASTEROIDS, drawAsteroid);
-        drawEntities(bullets, MAX_BULLETS, drawBullet);
-    }
-
-    // Print the screen buffer
-    printf("\033[H");
-    for (int y = 0; y < SCREEN_HEIGHT; y++) {
-        for (int x = 0; x < SCREEN_WIDTH; x++) {
-            putchar(display[y][x]);
-        }
-        putchar('\n');
-    }
-    
-    // Display score and elapsed time
-    time_t currentTime = time(NULL);
-    int elapsedTime = isGameOver ? finalTime : (currentTime - startTime);
-    printf("Score: %d | Time: %02d:%02d | %s\n", score, elapsedTime / 60, elapsedTime % 60, isGameOver ? "Game Over!" : "           ");
-}
-
-// Update the position of game objects
-void updateGameObject(GameObject *obj, int isBullet) {
-    obj->position.x += obj->velocity.x;
-    obj->position.y += obj->velocity.y;
-
-    // If it's a bullet, check if it's out of bounds
-    if (isBullet) {
-        if (obj->position.x < 0 || obj->position.x >= SCREEN_WIDTH || obj->position.y < 0 || obj->position.y >= SCREEN_HEIGHT) {
-            obj->position.x = -1; // Deactivate bullet
-            obj->position.y = -1;
-        }
-    } else {
-        wrapPosition(&obj->position);
-    }
-}
-
-// Update the game state
-void updateGame() {
-    if (isGameOver) return; 
-
-    // Update spaceship and apply friction
-    updateGameObject(&spaceship, 0); // 0 indicates it's not a bullet
-    spaceship.velocity.x *= 0.98;
-    spaceship.velocity.y *= 0.98;
-
-    // Move asteroids and check for collisions
-    for (int i = 0; i < MAX_ASTEROIDS; i++) {
-        updateGameObject(&asteroids[i], 0);
-        if (checkCollision(&spaceship, &asteroids[i])) {
-            isGameOver = 1;
-            finalTime = time(NULL) - startTime;
-            return;
-        }
-    }
-
-    // Update bullet positions
-    for (int i = 0; i < MAX_BULLETS; i++) {
-        if (bullets[i].position.x >= 0) {
-            updateGameObject(&bullets[i], 1); // 1 indicates it's a bullet
-        }
-    }
-
-    // Check for bullet collisions with asteroids
-    for (int i = 0; i < MAX_BULLETS; i++) {
-        if (bullets[i].position.x >= 0) {
-            for (int j = 0; j < MAX_ASTEROIDS; j++) {
-                if (checkCollision(&bullets[i], &asteroids[j])) {
-                    bullets[i].position.x = -1; // Deactivate bullet
-                    bullets[i].position.y = -1;
-                    asteroids[j].position.x = rand() % SCREEN_WIDTH;
-                    asteroids[j].position.y = rand() % SCREEN_HEIGHT;
-                    score += 100;
-                }
-            }
-        }
-    }
-}
-
-// Fire a bullet
-void shootBullet() {
-    for (int i = 0; i < MAX_BULLETS; i++) {
-        if (bullets[i].position.x < 0) {
-            bullets[i].position = spaceship.position;
-            bullets[i].velocity.x = cos(spaceship.angle) * 2;
-            bullets[i].velocity.y = sin(spaceship.angle) * 2;
-            break;
-        }
-    }
-}
-
-// Check if a key was hit
-int isKeyHit() {
-    struct timeval tv = { 0L, 0L };
-    fd_set fds;
-    FD_ZERO(&fds);
-    FD_SET(0, &fds);
-    return select(1, &fds, NULL, NULL, &tv);
-}
-
-// Configure terminal settings
-void configureTerminal(struct termios *old_tio, struct termios *new_tio) {
-    tcgetattr(STDIN_FILENO, old_tio);
-    *new_tio = *old_tio;
-    new_tio->c_lflag &= (~ICANON & ~ECHO);
-    tcsetattr(STDIN_FILENO, TCSANOW, new_tio);
-}
-
-// Restore terminal settings
-void restoreTerminal(struct termios *old_tio) {
-    tcsetattr(STDIN_FILENO, TCSANOW, old_tio);
-}
-
-void onSignal(int sig) {
-    shouldExit = 1;
-}
-
-// Main game loop
-int main() {
-    signal(SIGINT, onSignal); // Capture ^C
-    srand(time(NULL)); // Seed the random number generator
-    initGame(); // Initialize the game state
-
-    struct termios old_tio, new_tio;
-    configureTerminal(&old_tio, &new_tio);
-
-    printf("\033[?25l");  // Hide the cursor
-
-    while (!shouldExit) {
-        if (isKeyHit()) {
-            char input = getchar();
-            if (input == 27) {  // ESC key
-                if (getchar() == '[') {  // Handle arrow keys
-                    switch (getchar()) {
-                        case 'A':  // Up arrow
-                            spaceship.velocity.x += cos(spaceship.angle) * 0.2;
-                            spaceship.velocity.y += sin(spaceship.angle) * 0.2;
-                            break;
-                        case 'B':  // Down arrow
-                            spaceship.velocity.x -= cos(spaceship.angle) * 0.2;
-                            spaceship.velocity.y -= sin(spaceship.angle) * 0.2;
-                            break;
-                        case 'D': spaceship.angle -= 0.2; break;  // Left arrow
-                        case 'C': spaceship.angle += 0.2; break;  // Right arrow
-                    }
-                }
-            } else if (input == ' ') {
-                shootBullet(); // Fire a bullet
-            } else if (input == 'q') {
-                break; // Quit the game
-            } else if (input == 'r' && isGameOver) {
-                initGame(); // Restart the game
-            }
-        }
-
-        updateGame(); // Update game state
-        updateDisplay(); // Refresh the display
-        usleep(50000);  // Wait for 50ms (20 FPS)
-    }
-
-    printf("\033[?25h");  // Show the cursor
-    restoreTerminal(&old_tio); // Restore terminal settings
-    return 0;
-}
diff --git a/examples/blas.cc b/examples/blas.cc
index 1dbdec7cc..70c32c451 100644
--- a/examples/blas.cc
+++ b/examples/blas.cc
@@ -14,16 +14,16 @@
 // PERFORMANCE OF THIS SOFTWARE.
 
 #include <unistd.h>
-#include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
+#include "libc/assert.h"
 
 // high performance high accuracy matrix multiplication in ansi c
 
-#define MATH __target_clones("avx512f,fma,avx")
+#define MATH __target_clones("avx512f,fma")
 
 namespace {
 namespace ansiBLAS {
diff --git a/examples/clear.c b/examples/clear.c
index c03453e0e..f008c5845 100644
--- a/examples/clear.c
+++ b/examples/clear.c
@@ -7,7 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <unistd.h>
+#include "libc/calls/calls.h"
 
 // clears teletypewriter display
 //
diff --git a/examples/crashreport.c b/examples/crashreport.c
index b8219d83c..4e41ebf7f 100644
--- a/examples/crashreport.c
+++ b/examples/crashreport.c
@@ -7,7 +7,12 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <cosmo.h>
+#include "libc/calls/calls.h"
+#include "libc/intrin/kprintf.h"
+#include "libc/math.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/symbols.internal.h"
+#include "libc/stdio/stdio.h"
 
 /**
  * @fileoverview How to print backtraces and cpu state on crash.
diff --git a/examples/ctrlc.c b/examples/ctrlc.c
index f291add80..ee1ca37fa 100644
--- a/examples/ctrlc.c
+++ b/examples/ctrlc.c
@@ -7,43 +7,20 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <ctype.h>
-#include <errno.h>
-#include <limits.h>
-#include <signal.h>
-#include <stdio.h>
-#include <string.h>
-#include <termios.h>
-#include <unistd.h>
-
-// this program is used by jart for manually testing teletype interrupts
-// and canonical mode line editing. this file documents the hidden depth
-// of 1960's era computer usage, that's entrenched in primitive i/o apis
-//
-// manual testing checklist:
-//
-// - "hello" enter echos "got: hello^J"
-//
-// - "hello" ctrl-d echos "got: hello"
-//
-// - "hello" ctrl-r echos "^R\nhello"
-//
-// - "hello" ctrl-u enter echos "got: ^J"
-//
-// - ctrl-d during i/o task prints "got eof" and exits
-//
-// - ctrl-d during cpu task gets delayed until read() is called
-//
-// - ctrl-c during cpu task echos ^C, then calls SignalHandler()
-//   asynchronously, and program exits
-//
-// - ctrl-c during i/o task echos ^C, then calls SignalHandler()
-//   asynchronously, read() raises EINTR, and program exits
-//
-// - ctrl-v ctrl-c should echo "^\b" then echo "^C" and insert "\3"
-//
-// - ctrl-v ctrl-d should echo "^\b" then echo "^D" and insert "\4"
-//
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sock/struct/pollfd.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/limits.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/poll.h"
+#include "libc/sysv/consts/sig.h"
 
 volatile bool gotsig;
 
@@ -57,41 +34,23 @@ void SignalHandler(int sig) {
   gotsig = true;
 }
 
-// this is the easiest way to write a string literal to standard output,
-// without formatting. printf() has an enormous binary footprint so it's
-// nice to avoid linking that when it is not needed.
-#define WRITE(sliteral) write(1, sliteral, sizeof(sliteral) - 1)
-
 int main(int argc, char *argv[]) {
 
-  WRITE("echoing stdin until ctrl+c is pressed\n");
+  printf("echoing stdin until ctrl+c is pressed\n");
 
-  // when you type ctrl-c, by default it'll kill the process, unless you
-  // define a SIGINT handler. there's multiple ways to do it. the common
-  // way is to say signal(SIGINT, func) which is normally defined to put
-  // the signal handler in Berkeley-style SA_RESTART mode. that means if
-  // a signal handler is called while inside a function like read() then
-  // the read operation will keep going afterwards like nothing happened
-  // which can make it difficult to break your event loop. to avoid this
-  // we can use sigaction() without specifying SA_RESTART in sa_flag and
-  // that'll put the signal in system v mode. this means that whenever a
-  // signal handler function in your program is called during an i/o op,
-  // that i/o op will return an EINTR error, so you can churn your loop.
-  // don't take that error too seriously though since SIGINT can also be
-  // delivered asynchronously, during the times you're crunching numbers
-  // rather than performing i/o which means you get no EINTR to warn you
+  // you need to set your signal handler using sigaction() rather than
+  // signal(), since the latter uses .sa_flags=SA_RESTART, which means
+  // read will restart itself after signals, rather than raising EINTR
   sigaction(SIGINT, &(struct sigaction){.sa_handler = SignalHandler}, 0);
 
   for (;;) {
 
-    // asynchronous signals are needed to interrupt math, which we shall
-    // simulate here. signals can happen any time any place. that's only
-    // not the case when you use sigprocmask() to block signals which is
-    // useful for kicking the can down the road.
-    WRITE("doing cpu task...\n");
-    for (volatile int i = 0; i < INT_MAX / 3; ++i) {
+    // some programs are blocked on cpu rather than i/o
+    // such programs shall rely on asynchronous signals
+    printf("doing cpu task...\n");
+    for (volatile int i = 0; i < INT_MAX / 5; ++i) {
       if (gotsig) {
-        WRITE("\rgot ctrl+c asynchronously\n");
+        printf("\rgot ctrl+c asynchronously\n");
         exit(0);
       }
     }
@@ -112,18 +71,14 @@ int main(int argc, char *argv[]) {
 
     // read data from standard input
     //
-    // assuming you started this program in your terminal standard input
-    // will be plugged into your termios driver, which cosmpolitan codes
-    // in libc/calls/read-nt.c on windows. your read() function includes
-    // a primitive version of readline/linenoise called "canonical mode"
-    // which lets you edit the data that'll be returned by read() before
-    // it's actually returned. for example, if you type hello and enter,
-    // then "hello\n" will be returned. if you type hello and then ^D or
-    // ctrl-d, then "hello" will be returned. the ctrl-d keystroke is in
-    // fact an ascii control code whose special behavior can be bypassed
-    // if you type ctrl-v ctrl-d and then enter, in which case "\3\n" is
-    // returned, also known as ^D^J.
-    WRITE("doing read i/o task...\n");
+    // since this is a blocking operation and we're not performing a
+    // cpu-bound operation it is almost with absolute certainty that
+    // when the ctrl-c signal gets delivered, it'll happen in read()
+    //
+    // it's possible to be more precise if we were building library
+    // code. for example, you can block signals using sigprocmask()
+    // and then use pselect() to do the waiting.
+    printf("doing read i/o task...\n");
     int got = read(0, buf, sizeof(buf));
 
     // check if the read operation failed
@@ -139,10 +94,10 @@ int main(int argc, char *argv[]) {
         // the \r character is needed so when the line is printed
         // it'll overwrite the ^C that got echo'd with the ctrl-c
         if (gotsig) {
-          WRITE("\rgot ctrl+c via i/o eintr\n");
+          printf("\rgot ctrl+c via i/o eintr\n");
           exit(0);
         } else {
-          WRITE("\rgot spurious eintr\n");
+          printf("\rgot spurious eintr\n");
           continue;
         }
       } else {
@@ -154,34 +109,16 @@ int main(int argc, char *argv[]) {
 
     // check if the user typed ctrl-d which closes the input handle
     if (!got) {
-      WRITE("got eof\n");
+      printf("got eof\n");
       exit(0);
     }
 
-    // visualize line data returned by canonical mode to standard output
+    // relay read data to standard output
     //
-    // it's usually safe to ignore the return code of write; your system
-    // will send SIGPIPE if there's any problem, which kills by default.
-    //
-    // it's possible to use keyboard shortcuts to embed control codes in
-    // the line. so we visualize them using the classic tty notation. it
-    // is also possible to type the ascii representation, so we use bold
-    // to visually distinguish ascii codes. see also o//examples/ttyinfo
+    // it's usually safe to ignore the return code of write. the
+    // operating system will send SIGPIPE if there's any problem
+    // which kills the process by default
     write(1, "got: ", 5);
-    for (int i = 0; i < got; ++i) {
-      if (isascii(buf[i])) {
-        if (iscntrl(buf[i])) {
-          char ctl[2];
-          ctl[0] = '^';
-          ctl[1] = buf[i] ^ 0100;
-          WRITE("\033[1m");
-          write(1, ctl, 2);
-          WRITE("\033[0m");
-        } else {
-          write(1, &buf[i], 1);
-        }
-      }
-    }
-    WRITE("\n");
+    write(1, buf, got);
   }
 }
diff --git a/examples/date.c b/examples/date.c
index a0d3c6656..fbee50f5d 100644
--- a/examples/date.c
+++ b/examples/date.c
@@ -7,11 +7,18 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <stdlib.h>
-#include <string.h>
-#include <threads.h>
-#include <time.h>
-#include <unistd.h>
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/intrin/kprintf.h"
+#include "libc/macros.internal.h"
+#include "libc/nt/enum/timezoneid.h"
+#include "libc/nt/struct/timezoneinformation.h"
+#include "libc/nt/time.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/thread/threads.h"
+#include "libc/time.h"
 
 /**
  * @fileoverview High performance ISO-8601 timestamp formatter.
@@ -20,8 +27,6 @@
  * Consider using something like this instead for your loggers.
  */
 
-#define ABS(X) ((X) >= 0 ? (X) : -(X))
-
 char *GetTimestamp(void) {
   int x;
   struct timespec ts;
diff --git a/examples/dlopen.c b/examples/dlopen.c
index 3198e5361..545513918 100644
--- a/examples/dlopen.c
+++ b/examples/dlopen.c
@@ -7,9 +7,11 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <cosmo.h>
-#include <dlfcn.h>
-#include <stdlib.h>
+#include "libc/calls/calls.h"
+#include "libc/dlopen/dlfcn.h"
+#include "libc/fmt/itoa.h"
+#include "libc/nt/thunk/msabi.h"
+#include "libc/runtime/runtime.h"
 
 /**
  * @fileoverview cosmopolitan dynamic runtime linking demo
diff --git a/examples/env.c b/examples/env.c
index 83973ddee..5e607ddad 100644
--- a/examples/env.c
+++ b/examples/env.c
@@ -1,21 +1,10 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-
-/**
- * @fileoverview prints environment variables
- */
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
 
 int main(int argc, char* argv[]) {
-  for (char** p = environ; *p; ++p)
-    puts(*p);
+  fprintf(stderr, "%s (%s)\n", argv[0], GetProgramExecutableName());
+  for (char** p = environ; *p; ++p) {
+    printf("%s\n", *p);
+  }
   return 0;
 }
diff --git a/examples/greenbean.c b/examples/greenbean.c
index eca939a7b..8ffc51622 100644
--- a/examples/greenbean.c
+++ b/examples/greenbean.c
@@ -23,6 +23,8 @@
 #include <sys/auxv.h>
 #include <sys/socket.h>
 #include <time.h>
+#include "libc/mem/leaks.h"
+#include "libc/runtime/runtime.h"
 
 /**
  * @fileoverview greenbean lightweight threaded web server
@@ -337,7 +339,7 @@ int main(int argc, char *argv[]) {
   sigaddset(&block, SIGQUIT);
   pthread_attr_t attr;
   unassert(!pthread_attr_init(&attr));
-  unassert(!pthread_attr_setstacksize(&attr, 65536 - getpagesize()));
+  unassert(!pthread_attr_setstacksize(&attr, 65536));
   unassert(!pthread_attr_setguardsize(&attr, getpagesize()));
   unassert(!pthread_attr_setsigmask_np(&attr, &block));
   unassert(!pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0));
diff --git a/examples/hangman.c b/examples/hangman.c
index a739af289..4aa736490 100644
--- a/examples/hangman.c
+++ b/examples/hangman.c
@@ -36,10 +36,14 @@
  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <time.h>
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/stat.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/time.h"
+#include "third_party/zlib/zlib.h"
 // clang-format off
 
 #define DICT "usr/share/dict/hangman"
diff --git a/examples/hello.c b/examples/hello.c
index f3cc59316..f56cbae1e 100644
--- a/examples/hello.c
+++ b/examples/hello.c
@@ -7,8 +7,9 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <stdio.h>
+#include "libc/stdio/stdio.h"
 
 int main() {
   printf("hello world\n");
+  return 0;
 }
diff --git a/examples/hello2.c b/examples/hello2.c
index 25cbd9a07..ecf749dee 100644
--- a/examples/hello2.c
+++ b/examples/hello2.c
@@ -7,7 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <unistd.h>
+#include "libc/calls/calls.h"
 
 int main() {
   write(1, "hello world\n", 12);
diff --git a/examples/nproc.c b/examples/localtime.c
similarity index 91%
rename from examples/nproc.c
rename to examples/localtime.c
index 73ad91934..70d67c1c2 100644
--- a/examples/nproc.c
+++ b/examples/localtime.c
@@ -7,9 +7,9 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <cosmo.h>
-#include <stdio.h>
+#include "libc/time.h"
 
 int main(int argc, char *argv[]) {
-  printf("%d\n", __get_cpu_count());
+  int64_t t = 0;
+  localtime(&t);
 }
diff --git a/examples/loudness.c b/examples/loudness.c
deleted file mode 100644
index 194600f08..000000000
--- a/examples/loudness.c
+++ /dev/null
@@ -1,133 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include <cosmoaudio.h>
-#include <math.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-
-/**
- * @fileoverview prints ascii meter of microphone loudness
- *
- * 0. -60 dB is nearly silent, barely audible, even in a quiet room
- * 1. -50 dB is very quiet background sounds
- * 2. -40 dB is quiet ambient noise
- * 3. -30 dB is clear but soft sounds
- * 4. -20 dB is moderate volume, comfortable for extended listening
- * 5. -10 dB is fairly loud, but not uncomfortable
- * 6.  -6 dB is loud, but not at full volume
- * 7.  -3 dB is very loud, approaching system limits
- * 8.  -1 dB is extremely loud, just below maximum
- * 9.  -0 dB is maximum volume without distortion
- */
-
-#define SAMPLING_RATE     44100
-#define ASCII_METER_WIDTH 20
-#define FRAMES_PER_SECOND 30
-#define MIN_DECIBEL       -60
-#define MAX_DECIBEL       0
-#define DEBUG_LOG         1
-
-sig_atomic_t g_done;
-
-void on_signal(int sig) {
-  g_done = 1;
-}
-
-// computes root of mean squares
-double rms(float* p, int n) {
-  double s = 0;
-  for (int i = 0; i < n; ++i)
-    s += p[i] * p[i];
-  return sqrt(s / n);
-}
-
-// converts rms to decibel
-double rms_to_db(double rms) {
-  double db = 20 * log10(rms);
-  db = fmin(db, MAX_DECIBEL);
-  db = fmax(db, MIN_DECIBEL);
-  return db;
-}
-
-int main() {
-  signal(SIGINT, on_signal);
-
-  // how many samples should we process at once
-  int chunkFrames = SAMPLING_RATE / FRAMES_PER_SECOND;
-
-  // configure cosmo audio
-  struct CosmoAudioOpenOptions cao = {0};
-  cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
-  cao.deviceType = kCosmoAudioDeviceTypeCapture;
-  cao.sampleRate = SAMPLING_RATE;
-  cao.bufferFrames = chunkFrames * 2;
-  cao.debugLog = DEBUG_LOG;
-  cao.channels = 1;
-
-  // connect to microphone
-  int status;
-  struct CosmoAudio* ca;
-  status = cosmoaudio_open(&ca, &cao);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to open microphone: %d\n", status);
-    return 1;
-  }
-
-  // allocate memory for audio work area
-  float* chunk = malloc(chunkFrames * sizeof(float));
-  if (!chunk) {
-    fprintf(stderr, "out of memory\n");
-    return 1;
-  }
-
-  while (!g_done) {
-
-    // wait for full chunk of audio to become available
-    int need_in_frames = chunkFrames;
-    status = cosmoaudio_poll(ca, &need_in_frames, NULL);
-    if (status != COSMOAUDIO_SUCCESS) {
-      fprintf(stderr, "failed to poll microphone: %d\n", status);
-      return 2;
-    }
-
-    // read audio frames from microphone ring buffer
-    status = cosmoaudio_read(ca, chunk, chunkFrames);
-    if (status != chunkFrames) {
-      fprintf(stderr, "failed to read microphone: %d\n", status);
-      return 3;
-    }
-
-    // convert audio chunk to to ascii meter
-    char s[ASCII_METER_WIDTH + 1] = {0};
-    double db = rms_to_db(rms(chunk, chunkFrames));
-    double db_range = MAX_DECIBEL - MIN_DECIBEL;
-    int filled_length = (db - MIN_DECIBEL) / db_range * ASCII_METER_WIDTH;
-    for (int i = 0; i < ASCII_METER_WIDTH; ++i) {
-      if (i < filled_length) {
-        s[i] = '=';
-      } else {
-        s[i] = ' ';
-      }
-    }
-    printf("\r%s| %+6.2f dB", s, db);
-    fflush(stdout);
-  }
-  printf("\n");
-
-  // clean up resources
-  status = cosmoaudio_close(ca);
-  if (status != COSMOAUDIO_SUCCESS) {
-    fprintf(stderr, "failed to close microphone: %d\n", status);
-    return 5;
-  }
-  free(chunk);
-}
diff --git a/examples/ls.c b/examples/ls.c
new file mode 100644
index 000000000..7d8e509f1
--- /dev/null
+++ b/examples/ls.c
@@ -0,0 +1,83 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to this file,    │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/dirent.h"
+#include "libc/calls/struct/stat.h"
+#include "libc/log/check.h"
+#include "libc/mem/gc.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/dt.h"
+#include "libc/sysv/consts/s.h"
+#include "libc/x/xasprintf.h"
+
+struct stat st;
+
+const char *TypeToString(uint8_t type) {
+  switch (type) {
+    case DT_UNKNOWN:
+      return "DT_UNKNOWN";
+    case DT_FIFO:
+      return "DT_FIFO";
+    case DT_CHR:
+      return "DT_CHR";
+    case DT_DIR:
+      return "DT_DIR";
+    case DT_BLK:
+      return "DT_BLK";
+    case DT_REG:
+      return "DT_REG";
+    case DT_LNK:
+      return "DT_LNK";
+    case DT_SOCK:
+      return "DT_SOCK";
+    default:
+      return "UNKNOWN";
+  }
+}
+
+void List(const char *path) {
+  DIR *d;
+  struct dirent *e;
+  const char *vpath;
+  if (strcmp(path, ".") == 0) {
+    vpath = "";
+  } else if (!endswith(path, "/")) {
+    vpath = gc(xasprintf("%s/", path));
+  } else {
+    vpath = path;
+  }
+  if (stat(path, &st) != -1) {
+    if (S_ISDIR(st.st_mode)) {
+      CHECK((d = opendir(path)));
+      while ((e = readdir(d))) {
+        printf("0x%016x 0x%016x %-10s %s%s\n", e->d_ino, e->d_off,
+               TypeToString(e->d_type), vpath, e->d_name);
+      }
+      closedir(d);
+    } else {
+      printf("%s\n", path);
+    }
+  } else {
+    fprintf(stderr, "not found: %s\n", path);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  int i;
+  if (argc == 1) {
+    List(".");
+  } else {
+    for (i = 1; i < argc; ++i) {
+      List(argv[i]);
+    }
+  }
+  return 0;
+}
diff --git a/examples/nc.c b/examples/nc.c
index 1e4f9945b..3e29a51d2 100644
--- a/examples/nc.c
+++ b/examples/nc.c
@@ -7,16 +7,25 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <cosmo.h>
-#include <getopt.h>
-#include <netdb.h>
-#include <poll.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-// clang-format off
+#include "libc/calls/calls.h"
+#include "libc/fmt/conv.h"
+#include "libc/log/log.h"
+#include "libc/macros.internal.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sock/sock.h"
+#include "libc/sock/struct/linger.h"
+#include "libc/sock/struct/pollfd.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/af.h"
+#include "libc/sysv/consts/ipproto.h"
+#include "libc/sysv/consts/poll.h"
+#include "libc/sysv/consts/shut.h"
+#include "libc/sysv/consts/so.h"
+#include "libc/sysv/consts/sock.h"
+#include "libc/sysv/consts/sol.h"
+#include "third_party/getopt/getopt.internal.h"
+#include "third_party/musl/netdb.h"
 
 /**
  * @fileoverview netcat clone
@@ -27,14 +36,12 @@
  * Here's an example usage:
  *
  *     make -j8 o//examples/nc.com
- *     printf 'GET /\r\nHost: justine.lol\r\n\r\n' | o//examples/nc.com justine.lol 80
+ *     printf 'GET /\r\nHost: justine.lol\r\n\r\n' | o//examples/nc.com
+ * justine.lol 80
  *
- * Once upon time we called this command basically "telnet"
+ * Once upon time we called this command "telnet"
  */
 
-#define ARRAYLEN(A) \
-  ((sizeof(A) / sizeof(*(A))) / ((unsigned)!(sizeof(A) % sizeof(*(A)))))
-
 int main(int argc, char *argv[]) {
   ssize_t rc;
   size_t i, got;
diff --git a/examples/nesemu1.cc b/examples/nesemu1.cc
index a7597c265..a7799bff0 100644
--- a/examples/nesemu1.cc
+++ b/examples/nesemu1.cc
@@ -3,7 +3,6 @@
 /* PORTED TO TELETYPEWRITERS IN YEAR 2020 BY JUSTINE ALEXANDRA ROBERTS TUNNEY */
 /* TRADEMARKS ARE OWNED BY THEIR RESPECTIVE OWNERS LAWYERCATS LUV TAUTOLOGIES */
 /* https://bisqwit.iki.fi/jutut/kuvat/programming_examples/nesemu1/nesemu1.cc */
-#include "dsp/audio/cosmoaudio/cosmoaudio.h"
 #include "dsp/core/core.h"
 #include "dsp/core/half.h"
 #include "dsp/core/illumination.h"
@@ -13,6 +12,7 @@
 #include "dsp/tty/tty.h"
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
+#include "libc/calls/struct/itimerval.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/winsize.h"
 #include "libc/calls/termios.h"
@@ -23,7 +23,7 @@
 #include "libc/inttypes.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/arraylist2.internal.h"
 #include "libc/mem/mem.h"
@@ -35,17 +35,20 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/ex.h"
 #include "libc/sysv/consts/exit.h"
+#include "libc/sysv/consts/f.h"
 #include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/itimer.h"
+#include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/poll.h"
 #include "libc/sysv/consts/prio.h"
 #include "libc/sysv/consts/sig.h"
+#include "libc/sysv/consts/w.h"
 #include "libc/thread/thread.h"
 #include "libc/time.h"
 #include "libc/x/xasprintf.h"
 #include "libc/x/xsigaction.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "third_party/getopt/getopt.internal.h"
-#include "third_party/libcxx/__atomic/atomic.h"
 #include "third_party/libcxx/vector"
 #include "tool/viz/lib/knobs.h"
 
@@ -108,9 +111,7 @@ AUTHORS\n\
 #define DYN     240
 #define DXN     256
 #define FPS     60.0988
-#define CPUHZ   1789773
-#define SRATE   44100
-#define ABUFZ   ((int)(SRATE / FPS) + 1)
+#define HZ      1789773
 #define GAMMA   2.2
 #define CTRL(C) ((C) ^ 0100)
 #define ALT(C)  ((033 << 010) | (C))
@@ -120,11 +121,25 @@ typedef uint8_t u8;
 typedef uint16_t u16;
 typedef uint32_t u32;
 
+static const struct itimerval kNesFps = {
+    {0, 1. / FPS * 1e6},
+    {0, 1. / FPS * 1e6},
+};
+
+struct Frame {
+  char *p, *w, *mem;
+};
+
 struct Action {
   int code;
   int wait;
 };
 
+struct Audio {
+  size_t i;
+  int16_t p[65536];
+};
+
 struct Status {
   int wait;
   char text[80];
@@ -135,33 +150,32 @@ struct ZipGames {
   char** p;
 };
 
-static const struct timespec kNesFps = {0, 1. / FPS * 1e9};
-
+static int frame_;
+static int playfd_;
+static int playpid_;
+static size_t vtsize_;
 static bool artifacts_;
 static long tyn_, txn_;
+static struct Frame vf_[2];
+static struct Audio audio_;
 static const char* inputfn_;
 static struct Status status_;
 static volatile bool exited_;
+static volatile bool timeout_;
 static volatile bool resized_;
-static struct CosmoAudio* ca_;
 static struct TtyRgb* ttyrgb_;
 static unsigned char *R, *G, *B;
 static struct ZipGames zipgames_;
 static struct Action arrow_, button_;
 static struct SamplingSolution* ssy_;
 static struct SamplingSolution* ssx_;
-static unsigned char (*pixels_)[3][DYN][DXN];
+static unsigned char pixels_[3][DYN][DXN];
 static unsigned char palette_[3][64][512][3];
 static int joy_current_[2], joy_next_[2], joypos_[2];
 
 static int keyframes_ = 10;
 static enum TtyBlocksSelection blocks_ = kTtyBlocksUnicode;
-static enum TtyQuantizationAlgorithm quant_ = kTtyQuantXterm256;
-
-static struct timespec deadline_;
-static std::atomic<void*> pixels_ready_;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static enum TtyQuantizationAlgorithm quant_ = kTtyQuantTrue;
 
 static int Clamp(int v) {
   return MAX(0, MIN(255, v));
@@ -215,39 +229,73 @@ void InitPalette(void) {
           rgbc[u] = FixGamma(y / 1980. + i * A[u] / 9e6 + q * B[u] / 9e6);
         }
         matvmul3(rgbd65, lightbulb, rgbc);
-        for (u = 0; u < 3; ++u)
+        for (u = 0; u < 3; ++u) {
           palette_[o][p1][p0][u] = Clamp(rgbd65[u] * 255);
+        }
       }
     }
   }
 }
 
+static ssize_t Write(int fd, const void* p, size_t n) {
+  int rc;
+  sigset_t ss, oldss;
+  sigfillset(&ss);
+  sigprocmask(SIG_SETMASK, &ss, &oldss);
+  rc = write(fd, p, n);
+  sigprocmask(SIG_SETMASK, &oldss, 0);
+  return rc;
+}
+
 static void WriteString(const char* s) {
-  write(STDOUT_FILENO, s, strlen(s));
+  Write(STDOUT_FILENO, s, strlen(s));
 }
 
 void Exit(int rc) {
   WriteString("\r\n\e[0m\e[J");
-  if (rc && errno)
+  if (rc && errno) {
     fprintf(stderr, "%s%s\r\n", "error: ", strerror(errno));
+  }
   exit(rc);
 }
 
 void Cleanup(void) {
   ttyraw((enum TtyRawFlags)(-1u));
   ttyshowcursor(STDOUT_FILENO);
-  cosmoaudio_close(ca_);
-  ca_ = 0;
+  if (playpid_) {
+    kill(playpid_, SIGKILL);
+    close(playfd_);
+    playfd_ = -1;
+  }
 }
 
 void OnCtrlC(void) {
   exited_ = true;
 }
 
+void OnTimer(void) {
+  timeout_ = true;
+}
+
 void OnResize(void) {
   resized_ = true;
 }
 
+void OnPiped(void) {
+  exited_ = true;
+}
+
+void OnSigChld(void) {
+  waitpid(-1, 0, WNOHANG);
+  close(playfd_);
+  playpid_ = 0;
+  playfd_ = -1;
+}
+
+void InitFrame(struct Frame* f) {
+  f->p = f->w = f->mem = (char*)realloc(f->mem, vtsize_);
+}
+
 long ChopAxis(long dn, long sn) {
   while (HALF(sn) > dn) {
     sn = HALF(sn);
@@ -270,6 +318,11 @@ void GetTermSize(void) {
   G = (unsigned char*)realloc(G, tyn_ * txn_);
   B = (unsigned char*)realloc(B, tyn_ * txn_);
   ttyrgb_ = (struct TtyRgb*)realloc(ttyrgb_, tyn_ * txn_ * 4);
+  vtsize_ = ((tyn_ * txn_ * strlen("\e[48;2;255;48;2;255m▄")) +
+             (tyn_ * strlen("\e[0m\r\n")) + 128);
+  frame_ = 0;
+  InitFrame(&vf_[0]);
+  InitFrame(&vf_[1]);
   WriteString("\e[0m\e[H\e[J");
   resized_ = false;
 }
@@ -277,7 +330,11 @@ void GetTermSize(void) {
 void IoInit(void) {
   GetTermSize();
   xsigaction(SIGINT, (void*)OnCtrlC, 0, 0, NULL);
+  xsigaction(SIGPIPE, (void*)OnPiped, 0, 0, NULL);
   xsigaction(SIGWINCH, (void*)OnResize, 0, 0, NULL);
+  xsigaction(SIGALRM, (void*)OnTimer, 0, 0, NULL);
+  xsigaction(SIGCHLD, (void*)OnSigChld, 0, 0, NULL);
+  setitimer(ITIMER_REAL, &kNesFps, NULL);
   ttyhidecursor(STDOUT_FILENO);
   ttyraw(kTtySigs);
   ttyquantsetup(quant_, kTtyQuantRgb, blocks_);
@@ -414,29 +471,81 @@ ssize_t ReadKeyboard(void) {
   return rc;
 }
 
-void ScaleVideoFrameToTeletypewriter(unsigned char (*pixels)[3][DYN][DXN]) {
+bool HasVideo(struct Frame* f) {
+  return f->w < f->p;
+}
+
+bool HasPendingVideo(void) {
+  return HasVideo(&vf_[0]) || HasVideo(&vf_[1]);
+}
+
+bool HasPendingAudio(void) {
+  return playpid_ && audio_.i;
+}
+
+struct Frame* FlipFrameBuffer(void) {
+  frame_ = !frame_;
+  return &vf_[frame_];
+}
+
+void TransmitVideo(void) {
+  ssize_t rc;
+  struct Frame* f;
+  f = &vf_[frame_];
+  if (!HasVideo(f))
+    f = FlipFrameBuffer();
+  if ((rc = Write(STDOUT_FILENO, f->w, f->p - f->w)) != -1) {
+    f->w += rc;
+  } else if (errno == EAGAIN) {
+    // slow teletypewriter
+  } else if (errno == EPIPE) {
+    Exit(0);
+  }
+}
+
+void TransmitAudio(void) {
+  ssize_t rc;
+  if (!playpid_)
+    return;
+  if (!audio_.i)
+    return;
+  if (playfd_ == -1)
+    return;
+  if ((rc = Write(playfd_, audio_.p, audio_.i * sizeof(short))) != -1) {
+    rc /= sizeof(short);
+    memmove(audio_.p, audio_.p + rc, (audio_.i - rc) * sizeof(short));
+    audio_.i -= rc;
+  } else if (errno == EPIPE) {
+    kill(playpid_, SIGKILL);
+    close(playfd_);
+    playfd_ = -1;
+    Exit(0);
+  }
+}
+
+void ScaleVideoFrameToTeletypewriter(void) {
   long y, x, yn, xn;
   yn = DYN, xn = DXN;
   while (HALF(yn) > tyn_ || HALF(xn) > txn_) {
     if (HALF(xn) > txn_) {
-      Magikarp2xX(DYN, DXN, (*pixels)[0], yn, xn);
-      Magikarp2xX(DYN, DXN, (*pixels)[1], yn, xn);
-      Magikarp2xX(DYN, DXN, (*pixels)[2], yn, xn);
+      Magikarp2xX(DYN, DXN, pixels_[0], yn, xn);
+      Magikarp2xX(DYN, DXN, pixels_[1], yn, xn);
+      Magikarp2xX(DYN, DXN, pixels_[2], yn, xn);
       xn = HALF(xn);
     }
     if (HALF(yn) > tyn_) {
-      Magikarp2xY(DYN, DXN, (*pixels)[0], yn, xn);
-      Magikarp2xY(DYN, DXN, (*pixels)[1], yn, xn);
-      Magikarp2xY(DYN, DXN, (*pixels)[2], yn, xn);
+      Magikarp2xY(DYN, DXN, pixels_[0], yn, xn);
+      Magikarp2xY(DYN, DXN, pixels_[1], yn, xn);
+      Magikarp2xY(DYN, DXN, pixels_[2], yn, xn);
       yn = HALF(yn);
     }
   }
-  GyaradosUint8(tyn_, txn_, R, DYN, DXN, (*pixels)[0], tyn_, txn_, yn, xn, 0,
-                255, ssy_, ssx_, true);
-  GyaradosUint8(tyn_, txn_, G, DYN, DXN, (*pixels)[1], tyn_, txn_, yn, xn, 0,
-                255, ssy_, ssx_, true);
-  GyaradosUint8(tyn_, txn_, B, DYN, DXN, (*pixels)[2], tyn_, txn_, yn, xn, 0,
-                255, ssy_, ssx_, true);
+  GyaradosUint8(tyn_, txn_, R, DYN, DXN, pixels_[0], tyn_, txn_, yn, xn, 0, 255,
+                ssy_, ssx_, true);
+  GyaradosUint8(tyn_, txn_, G, DYN, DXN, pixels_[1], tyn_, txn_, yn, xn, 0, 255,
+                ssy_, ssx_, true);
+  GyaradosUint8(tyn_, txn_, B, DYN, DXN, pixels_[2], tyn_, txn_, yn, xn, 0, 255,
+                ssy_, ssx_, true);
   for (y = 0; y < tyn_; ++y) {
     for (x = 0; x < txn_; ++x) {
       ttyrgb_[y * txn_ + x] =
@@ -453,104 +562,57 @@ void KeyCountdown(struct Action* a) {
   }
 }
 
-void Raster(unsigned char (*pixels)[3][DYN][DXN]) {
-  struct TtyRgb bg = {0x12, 0x34, 0x56, 0};
-  struct TtyRgb fg = {0x12, 0x34, 0x56, 0};
-  ScaleVideoFrameToTeletypewriter(pixels);
-  char* ansi = (char*)malloc((tyn_ * txn_ * strlen("\e[48;2;255;48;2;255m▄")) +
-                             (tyn_ * strlen("\e[0m\r\n")) + 128);
-  char* p = ansi;
-  p = stpcpy(p, "\e[0m\e[H");
-  p = ttyraster(p, ttyrgb_, tyn_, txn_, bg, fg);
-  free(pixels);
-  if (status_.wait) {
-    status_.wait--;
-    p = stpcpy(p, "\e[0m\e[H");
-    p = stpcpy(p, status_.text);
-  }
-  size_t n = p - ansi;
-  ssize_t wrote;
-  for (size_t i = 0; i < n; i += wrote) {
-    if ((wrote = write(STDOUT_FILENO, ansi + i, n - i)) == -1) {
-      exited_ = true;
-      break;
+void PollAndSynchronize(void) {
+  do {
+    if (ReadKeyboard() == -1) {
+      if (errno != EINTR)
+        Exit(1);
+      if (exited_)
+        Exit(0);
+      if (resized_)
+        GetTermSize();
     }
-  }
-  free(ansi);
+  } while (!timeout_);
+  TransmitVideo();
+  TransmitAudio();
+  timeout_ = false;
+  KeyCountdown(&arrow_);
+  KeyCountdown(&button_);
+  joy_next_[0] = arrow_.code | button_.code;
+  joy_next_[1] = arrow_.code | button_.code;
 }
 
-void* RasterThread(void* arg) {
-  sigset_t ss;
-  sigemptyset(&ss);
-  sigaddset(&ss, SIGINT);
-  sigaddset(&ss, SIGHUP);
-  sigaddset(&ss, SIGQUIT);
-  sigaddset(&ss, SIGTERM);
-  sigaddset(&ss, SIGPIPE);
-  sigprocmask(SIG_SETMASK, &ss, 0);
-  for (;;) {
-    unsigned char(*pixels)[3][DYN][DXN];
-    pthread_mutex_lock(&lock);
-    while (!(pixels = (unsigned char(*)[3][DYN][DXN])pixels_ready_.load()))
-      pthread_cond_wait(&cond, &lock);
-    pixels_ready_.store(0);
-    pthread_mutex_unlock(&lock);
-    if (resized_)
-      GetTermSize();
-    Raster(pixels);
+void Raster(void) {
+  struct Frame* f;
+  struct TtyRgb bg = {0x12, 0x34, 0x56, 0};
+  struct TtyRgb fg = {0x12, 0x34, 0x56, 0};
+  ScaleVideoFrameToTeletypewriter();
+  f = &vf_[!frame_];
+  f->p = f->w = f->mem;
+  f->p = stpcpy(f->p, "\e[0m\e[H");
+  f->p = ttyraster(f->p, ttyrgb_, tyn_, txn_, bg, fg);
+  if (status_.wait) {
+    status_.wait--;
+    f->p = stpcpy(f->p, "\e[0m\e[H");
+    f->p = stpcpy(f->p, status_.text);
   }
+  PollAndSynchronize();
 }
 
 void FlushScanline(unsigned py) {
-  if (py != DYN - 1)
-    return;
-  pthread_mutex_lock(&lock);
-  if (!pixels_ready_) {
-    pixels_ready_.store(pixels_);
-    pixels_ = 0;
-    pthread_cond_signal(&cond);
+  if (py == DYN - 1) {
+    if (!timeout_) {
+      Raster();
+    }
+    timeout_ = false;
   }
-  pthread_mutex_unlock(&lock);
-  if (!pixels_)
-    pixels_ = (unsigned char(*)[3][DYN][DXN])malloc(3 * DYN * DXN);
-  if (exited_)
-    Exit(0);
-  do {
-    struct timespec now = timespec_mono();
-    struct timespec remain = timespec_subz(deadline_, now);
-    int remain_ms = timespec_tomillis(remain);
-    struct pollfd fds[] = {{STDIN_FILENO, POLLIN}};
-    int got = poll(fds, 1, remain_ms);
-    if (got == -1) {
-      if (errno == EINTR)
-        continue;
-      Exit(1);
-    }
-    if (got == 1) {
-      do {
-        if (ReadKeyboard() == -1) {
-          if (errno == EINTR)
-            continue;
-          Exit(1);
-        }
-      } while (0);
-    }
-    KeyCountdown(&arrow_);
-    KeyCountdown(&button_);
-    joy_next_[0] = arrow_.code | button_.code;
-    joy_next_[1] = arrow_.code | button_.code;
-    now = timespec_mono();
-    do
-      deadline_ = timespec_add(deadline_, kNesFps);
-    while (timespec_cmp(deadline_, now) <= 0);
-  } while (0);
 }
 
 static void PutPixel(unsigned px, unsigned py, unsigned pixel, int offset) {
   static unsigned prev;
-  (*pixels_)[0][py][px] = palette_[offset][prev % 64][pixel][2];
-  (*pixels_)[1][py][px] = palette_[offset][prev % 64][pixel][1];
-  (*pixels_)[2][py][px] = palette_[offset][prev % 64][pixel][0];
+  pixels_[0][py][px] = palette_[offset][prev % 64][pixel][2];
+  pixels_[1][py][px] = palette_[offset][prev % 64][pixel][1];
+  pixels_[2][py][px] = palette_[offset][prev % 64][pixel][0];
   prev = pixel;
 }
 
@@ -1432,7 +1494,8 @@ void Tick() {  // Invoked at CPU's rate.
 // Mix the audio: Get the momentary sample from each channel and mix them.
 #define s(c) channels[c].Tick<c == 1 ? 0 : c>()
   auto v = [](float m, float n, float d) { return n != 0.f ? m / n : d; };
-  float sample =
+  short sample =
+      30000 *
       (v(95.88f, (100.f + v(8128.f, s(0) + s(1), -100.f)), 0.f) +
        v(159.79f,
          (100.f +
@@ -1441,19 +1504,7 @@ void Tick() {  // Invoked at CPU's rate.
        0.5f);
 #undef s
 
-  // Relay audio to speaker.
-  static int buffer_position = 0;
-  static float audio_buffer[ABUFZ];
-  static double sample_counter = 0.0;
-  sample_counter += (double)SRATE / CPUHZ;
-  while (sample_counter >= 1.0) {
-    audio_buffer[buffer_position++] = sample;
-    sample_counter -= 1.0;
-    if (buffer_position == ABUFZ) {
-      cosmoaudio_write(ca_, audio_buffer, buffer_position);
-      buffer_position = 0;
-    }
-  }
+  audio_.p[audio_.i = (audio_.i + 1) & (ARRAYLEN(audio_.p) - 1)] = sample;
 }
 
 }  // namespace APU
@@ -1665,8 +1716,8 @@ void Op() {
   if (!nmi_now)
     nmi_edge_detected = false;
 
-  // Define function pointers for each opcode (00..FF) and each interrupt
-  // (100,101,102)
+    // Define function pointers for each opcode (00..FF) and each interrupt
+    // (100,101,102)
 #define c(n) Ins<0x##n>, Ins<0x##n + 1>,
 #define o(n) c(n) c(n + 2) c(n + 4) c(n + 6)
   static void (*const i[0x108])() = {
@@ -1694,6 +1745,9 @@ char* GetLine(void) {
 
 int PlayGame(const char* romfile, const char* opt_tasfile) {
   FILE* fp;
+  int devnull;
+  int pipefds[2];
+  const char* ffplay;
   inputfn_ = opt_tasfile;
 
   if (!(fp = fopen(romfile, "rb"))) {
@@ -1706,28 +1760,46 @@ int PlayGame(const char* romfile, const char* opt_tasfile) {
     return 3;
   }
 
-  // initialize screen
-  pixels_ = (unsigned char(*)[3][DYN][DXN])malloc(3 * DYN * DXN);
   InitPalette();
 
-  // start raster thread
-  errno_t err;
-  pthread_t th;
-  if ((err = pthread_create(&th, 0, RasterThread, 0))) {
-    fprintf(stderr, "pthread_create: %s\n", strerror(err));
-    exit(1);
-  }
-
   // open speaker
-  struct CosmoAudioOpenOptions cao = {};
-  cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
-  cao.deviceType = kCosmoAudioDeviceTypePlayback;
-  cao.sampleRate = SRATE;
-  cao.channels = 1;
-  cosmoaudio_open(&ca_, &cao);
-
-  // initialize time
-  deadline_ = timespec_add(timespec_mono(), kNesFps);
+  // todo: this needs plenty of work
+  if (!IsWindows()) {
+    if ((ffplay = commandvenv("FFPLAY", "ffplay"))) {
+      devnull = open("/dev/null", O_WRONLY | O_CLOEXEC);
+      pipe2(pipefds, O_CLOEXEC);
+      if (!(playpid_ = fork())) {
+        const char* const args[] = {
+            ffplay,                  //
+            "-nodisp",               //
+            "-loglevel", "quiet",    //
+            "-ac",       "1",        //
+            "-ar",       "1789773",  //
+            "-f",        "s16le",    //
+            "pipe:",                 //
+            NULL,
+        };
+        dup2(pipefds[0], 0);
+        dup2(devnull, 1);
+        dup2(devnull, 2);
+        execv(ffplay, (char* const*)args);
+        abort();
+      }
+      close(pipefds[0]);
+      playfd_ = pipefds[1];
+    } else {
+      fputs("\nWARNING\n\
+\n\
+  Need `ffplay` command to play audio\n\
+  Try `sudo apt install ffmpeg` on Linux\n\
+  You can specify it on `PATH` or in `FFPLAY`\n\
+\n\
+Press enter to continue without sound: ",
+            stdout);
+      fflush(stdout);
+      GetLine();
+    }
+  }
 
   // Read the ROM file header
   u8 rom16count = fgetc(fp);
@@ -1835,8 +1907,9 @@ int SelectGameFromZip(void) {
   int i, rc;
   char *line, *uri;
   fputs("\nCOSMOPOLITAN NESEMU1\n\n", stdout);
-  for (i = 0; i < (int)zipgames_.i; ++i)
+  for (i = 0; i < (int)zipgames_.i; ++i) {
     printf("  [%d] %s\n", i, zipgames_.p[i]);
+  }
   fputs("\nPlease choose a game (or CTRL-C to quit) [default 0]: ", stdout);
   fflush(stdout);
   rc = 0;
@@ -1859,8 +1932,9 @@ int main(int argc, char** argv) {
   } else if (optind < argc) {
     rc = PlayGame(argv[optind], NULL);
   } else {
-    if (!FindZipGames())
+    if (!FindZipGames()) {
       PrintUsage(0, stderr);
+    }
     rc = SelectGameFromZip();
   }
   return rc;
diff --git a/examples/package/lib/myasm.S b/examples/package/lib/myasm.S
index f0e0cad66..acb21b98e 100644
--- a/examples/package/lib/myasm.S
+++ b/examples/package/lib/myasm.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Example assembly function.
 //
diff --git a/examples/parsefloat.c b/examples/parsefloat.c
index eb8e21aaa..c9f049aef 100644
--- a/examples/parsefloat.c
+++ b/examples/parsefloat.c
@@ -1,12 +1,3 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
 #include <stdio.h>
 
 #define PARSE_AND_PRINT(type, scan_fmt, print_fmt, str)      \
diff --git a/examples/pause.c b/examples/pause.c
index a36ccdec1..be13f5f45 100644
--- a/examples/pause.c
+++ b/examples/pause.c
@@ -7,10 +7,10 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <signal.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/fmt/itoa.h"
+#include "libc/str/str.h"
 
 volatile int g_sig;
 
@@ -21,13 +21,16 @@ void OnSig(int sig) {
 int main(int argc, char *argv[]) {
 
   // listen for all signals
-  for (int sig = 1; sig <= NSIG; ++sig)
+  for (int sig = 1; sig <= NSIG; ++sig) {
     signal(sig, OnSig);
+  }
 
   // wait for a signal
-  printf("waiting for signal to be sent to my pid %d\n", getpid());
+  char ibuf[12];
+  FormatInt32(ibuf, getpid());
+  tinyprint(2, "waiting for signal to be sent to ", ibuf, "\n", NULL);
   pause();
 
   // report the signal
-  printf("got %s\n", strsignal(g_sig));
+  tinyprint(1, "got ", strsignal(g_sig), "\n", NULL);
 }
diff --git a/examples/picol.c b/examples/picol.c
index 449e98ed4..fd54cca53 100644
--- a/examples/picol.c
+++ b/examples/picol.c
@@ -32,9 +32,12 @@
  * . Formatted as per Cosmopolitan's standards.
  */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "libc/fmt/conv.h"
+#include "libc/log/log.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
 
 enum { PICOL_OK, PICOL_ERR, PICOL_RETURN, PICOL_BREAK, PICOL_CONTINUE };
 enum { PT_ESC, PT_STR, PT_CMD, PT_VAR, PT_SEP, PT_EOL, PT_EOF };
diff --git a/examples/printargs.c b/examples/printargs.c
index d65954fcb..60ce148ed 100644
--- a/examples/printargs.c
+++ b/examples/printargs.c
@@ -7,7 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <cosmo.h>
+#include "libc/runtime/runtime.h"
 
 int main() {
   __printargs("");
diff --git a/examples/romanize.c b/examples/romanize.c
deleted file mode 100644
index 2b7df6561..000000000
--- a/examples/romanize.c
+++ /dev/null
@@ -1,1022 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <ctype.h>
-#include <stdio.h>
-#include <string.h>
-#include <wchar.h>
-#include <wctype.h>
-
-/**
- * @fileoverview Roman Transliteration, e.g.
- *
- *     $ echo 'gaius julius cæsar' | o//examples/romanize
- *     CAIVS IVLIVS CAESAR
- *     $ echo 'гаиус юлиус цаесар' | o//examples/romanize
- *     CAIVS IVLIVS TSAESAR
- *     $ echo 'عودة أبو تايه' | o//examples/romanize
- *     EVVDTA AEBVV TAIH
- *
- */
-
-#define PASSPORT 0
-
-enum Mode {
-  kArchaic,
-  kOld,
-  kClassical,
-  kMedieval,
-  kModern,
-} mode = kModern;
-
-bool IsHindiConsonant(wint_t c) {
-  switch (c) {
-    case 0x915:  // क
-    case 0x916:  // ख
-    case 0x917:  // ग
-    case 0x918:  // घ
-    case 0x91a:  // च
-    case 0x91b:  // छ
-    case L'ज':
-    case L'झ':
-    case L'ट':
-    case L'ठ':
-    case L'ड':
-    case L'ढ':
-    case L'ण':
-    case 0x924:  // त
-    case L'थ':
-    case L'द':
-    case L'ध':
-    case L'न':
-    case L'प':
-    case L'फ':
-    case L'ब':
-    case 0x92d:  // भ
-    case L'म':
-    case L'य':
-    case 0x930:  // र
-    case L'ल':
-    case L'व':
-    case L'श':
-    case L'ष':
-    case L'स':
-    case L'ह':
-      return true;
-    default:
-      return false;
-  }
-}
-
-bool IsHindiMagicConsonant(wint_t c) {
-  switch (c) {
-    case 0x902:  // ं
-      return true;
-    default:
-      return false;
-  }
-}
-
-int main(int argc, char* argv[]) {
-  wint_t c1, c2;
-  while ((c1 = towupper(fgetwc(stdin))) != -1) {
-    if (!iswcntrl(c1)) {
-      c2 = fgetwc(stdin);
-      if (mode < kMedieval || !isascii(c2))
-        c2 = towupper(c2);
-    } else {
-      c2 = 0;
-    }
-    switch (c1) {
-      case '.':
-        if (c2 == ' ') {
-          fputwc(L'·', stdout);
-          continue;
-        }
-        break;
-      case '"':
-      case '\'':
-      case ',':
-      case ';':
-      case 0xFEFF:  // ZERO WIDTH NO-BREAK SPACE (UTF-8 BOM)
-      case 0x200E:  // LEFT-TO-RIGHT MARK
-      case 0x200F:  // RIGHT-TO-LEFT MARK
-        fputwc(c1, stdout);
-        break;
-      case L'Ĵ':
-      case L'Ј':
-      case 'J':
-      J:
-        if (mode >= kModern) {
-          fputc('J', stdout);
-        } else {
-          fputc('I', stdout);
-        }
-        break;
-      case L'Ũ':
-      case L'Ū':
-      case L'Ŭ':
-      case L'Ů':
-      case L'Ű':
-      case L'Ų':
-      case L'Ù':
-      case L'Ú':
-      case L'Û':
-      case L'ў':
-      case L'У':
-      case 0x046A:
-      case 'U':
-      U:
-        if (mode >= kMedieval) {
-          fputc('U', stdout);
-        } else {
-          fputc('V', stdout);
-        }
-        break;
-      case L'Ŵ':
-      case L'Ƿ':
-      case 'W':
-      W:
-        if (mode >= kMedieval) {
-          fputc('W', stdout);
-        } else {
-          fputc('V', stdout);
-          fputc('V', stdout);
-        }
-        break;
-      case 'Y':
-      case L'Ý':
-      case L'Ŷ':
-      case L'Ÿ':
-      case L'Ы':
-      Y:
-        if (mode == kClassical) {
-          fputc('Y', stdout);
-        } else {
-          fputc('I', stdout);
-        }
-        break;
-      case L'Ẍ':
-      case L'ẍ':
-      case L'Ẋ':
-      case L'ẋ':
-      case 'X':
-        fputc('X', stdout);
-        break;
-      case L'Ź':
-      case L'Ż':
-      case L'Ž':
-      case L'З':
-      case 'Z':
-      Z:
-        if (mode == kOld) {
-          fputc('G', stdout);
-        } else {
-          fputc('Z', stdout);
-        }
-        break;
-      case L'Ĝ':
-      case L'Ğ':
-      case L'Ġ':
-      case L'Ģ':
-      case L'Ґ':
-      case L'Г':
-      case 0x0492:
-      case 'G':
-        if (mode >= kOld) {
-          fputc('G', stdout);
-        } else if (c2 == 'U' || c2 == 'O') {
-          fputc('Q', stdout);
-        } else {
-          fputc('C', stdout);
-        }
-        break;
-      case L'Ķ':
-      case L'К':
-      case 'K':
-        if (mode >= kMedieval) {
-          fputc('K', stdout);
-          break;
-        }
-        if (c2 == 'O') {
-          fputc('Q', stdout);
-          break;
-        }
-        if (c2 == 'N') {
-          break;
-        }
-        /* fallthrough */
-      case 'C':
-      case 0x04BA:
-        switch (c2) {
-          case 'A':
-            if (mode >= kOld) {
-              fputc('C', stdout);
-            } else {
-              fputc('K', stdout);
-            }
-            break;
-          /* case 'O': */
-          case 'U':
-          case 'V':
-            fputc('Q', stdout);
-            break;
-          default:
-            fputc('C', stdout);
-            break;
-        }
-        break;
-      case L'Æ':
-      case L'Ä':
-        fputc('A', stdout);
-        fputc('E', stdout);
-        break;
-      case L'Ĳ':
-        fputc('I', stdout);
-        goto J;
-      case L'Þ':
-        fputc('T', stdout);
-        fputc('H', stdout);
-        break;
-      case L'Œ':
-      case L'Ö':
-      case L'Ø':
-        fputc('O', stdout);
-        fputc('E', stdout);
-        break;
-      case L'Ü':
-        if (mode >= kMedieval) {
-          fputc('U', stdout);
-        } else {
-          fputc('V', stdout);
-        }
-        fputc('E', stdout);
-        break;
-      case L'ẞ':
-        fputc('S', stdout);
-        fputc('S', stdout);
-        break;
-      case L'À':
-      case L'Á':
-      case L'Â':
-      case L'Ã':
-      case L'Ā':
-      case L'Ă':
-      case L'Ą':
-      case L'А':
-        fputc('A', stdout);
-        break;
-      case L'Ç':
-      case L'Ć':
-      case L'Ĉ':
-      case L'Ċ':
-      case L'Č':
-        fputc('C', stdout);
-        break;
-      case L'È':
-      case L'É':
-      case L'Ê':
-      case L'Ë':
-      case L'Ē':
-      case L'Ĕ':
-      case L'Ė':
-      case L'Ę':
-      case L'Ě':
-        fputc('E', stdout);
-        break;
-      case L'Ì':
-      case L'Í':
-      case L'Î':
-      case L'Ï':
-        fputc('I', stdout);
-        break;
-      case L'Ð':
-      case L'Ď':
-        fputc('D', stdout);
-        break;
-      case L'Ñ':
-      case L'Ń':
-      case L'Ņ':
-      case L'Ň':
-      case L'Ŋ':
-        fputc('N', stdout);
-        break;
-      case L'Ò':
-      case L'Ó':
-      case L'Ô':
-      case L'Õ':
-      case L'Ō':
-      case L'Ŏ':
-      case L'Ő':
-        fputc('O', stdout);
-        break;
-      default:
-        fputwc(c1, stdout);
-        break;
-      case L'Ĥ':
-      case L'Ħ':
-        fputc('H', stdout);
-        break;
-      case L'Ĩ':
-      case L'Ī':
-      case L'Ĭ':
-      case L'Į':
-      case L'İ':
-      case L'I':
-      case L'И':
-      case L'Й':
-        fputc('I', stdout);
-        break;
-      case L'Ĺ':
-      case L'Ļ':
-      case L'Ľ':
-      case L'Ŀ':
-      case L'Ł':
-        fputc('L', stdout);
-        break;
-      case L'Ŕ':
-      case L'Ŗ':
-      case L'Ř':
-        fputc('R', stdout);
-        break;
-      case L'Ś':
-      case L'Ŝ':
-      case L'Ş':
-      case L'Š':
-        fputc('S', stdout);
-        break;
-      case L'Ţ':
-      case L'Ť':
-      case L'Ŧ':
-        fputc('T', stdout);
-        break;
-      case L'Ё':
-        fputc('E', stdout);
-        break;
-      case L'Ћ':
-        fputc('D', stdout);
-        break;
-      case L'Є':
-        fputc('I', stdout);
-        fputc('E', stdout);
-        break;
-      case L'Ѕ':
-        fputc('D', stdout);
-        fputc('Z', stdout);
-        break;
-      case L'І':
-        fputc('I', stdout);
-        break;
-      case L'Ї':
-        fputc('I', stdout);
-        break;
-      case L'Љ':
-        fputc('L', stdout);
-        if (mode >= kMedieval) {
-          fputc('J', stdout);
-        } else {
-          fputc('I', stdout);
-        }
-        break;
-      case L'Њ':
-        fputc('N', stdout);
-        goto J;
-      case L'Ќ':
-        fputc('K', stdout);
-        break;
-      case L'Џ':
-        fputc('D', stdout);
-        goto Z;
-      case L'Б':
-        fputc('B', stdout);
-        break;
-      case L'В':
-        fputc('V', stdout);
-        break;
-      case L'Д':
-        fputc('D', stdout);
-        break;
-      case L'Е':
-        fputc('E', stdout);
-        break;
-      case L'Ж':
-        if (mode == kOld) {
-          fputc('G', stdout);
-        } else {
-          fputc('Z', stdout);
-        }
-        fputc('H', stdout);
-        break;
-      case L'Л':
-        fputc('L', stdout);
-        break;
-      case L'М':
-        fputc('M', stdout);
-        break;
-      case L'Н':
-        fputc('N', stdout);
-        break;
-      case L'О':
-        fputc('O', stdout);
-        break;
-      case L'П':
-        fputc('P', stdout);
-        break;
-      case L'Р':
-        fputc('R', stdout);
-        break;
-      case L'С':
-        fputc('S', stdout);
-        break;
-      case L'Т':
-        fputc('T', stdout);
-        break;
-      case L'Ф':
-        fputc('F', stdout);
-        break;
-      case L'Х':
-        /* fputc('K', stdout); */
-        fputc('H', stdout);
-        break;
-      case L'Ц':
-        fputc('T', stdout);
-        fputc('S', stdout);
-        break;
-      case L'Ч':
-        fputc('C', stdout);
-        fputc('H', stdout);
-        break;
-      case L'Ш':
-        fputc('S', stdout);
-        fputc('H', stdout);
-        break;
-      case L'Щ':
-        fputc('S', stdout);
-        fputc('H', stdout);
-        fputc('C', stdout);
-        fputc('H', stdout);
-        break;
-      case L'Ъ':
-        fputc('I', stdout);
-        fputc('E', stdout);
-        break;
-      case L'Э':
-        fputc('E', stdout);
-        break;
-      case L'Ю':
-        fputc('I', stdout);
-        goto U;
-      case L'Я':
-        fputc('I', stdout);
-        fputc('A', stdout);
-        break;
-      case L'Ȝ':
-        if (mode >= kOld) {
-          fputc('G', stdout);
-        } else if (mode == kArchaic) {
-          fputc('C', stdout);
-        }
-        fputc('H', stdout);
-        break;
-      case L'ſ':
-        fputc('S', stdout);
-        break;
-      case 0x0621:  // hamza
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('E', stdout);
-        break;
-      case 0x0622:  // alef with madda above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('A', stdout);
-        fputc('A', stdout);
-        break;
-      case 0x0623:  // alef with hamza above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('A', stdout);
-        fputc('E', stdout);
-        break;
-      case 0x0624:  // waw with hamza above
-        goto U;
-      case 0x0625:  // alef with hamza below
-        fputc('I', stdout);
-        break;
-      case 0x0626:  // yeh with hamza above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('I', stdout);
-        break;
-      case 0x0627:  // alef
-        fputc('A', stdout);
-        break;
-      case 0x0628:  // beh
-        fputc('B', stdout);
-        break;
-      case 0x0629:  // teh marbuta
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('T', stdout);
-        fputc('A', stdout);
-        break;
-      case 0x062A:  // teh
-        fputc('T', stdout);
-        break;
-      case 0x062B:  // theh
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('T', stdout);
-        fputc('H', stdout);
-        break;
-      case 0x062C:  // jeem
-        goto J;
-      case 0x062D:  // hah
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('H', stdout);
-        break;
-      case 0x062E:  // khah
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('K', stdout);
-        fputc('H', stdout);
-        break;
-      case 0x062F:  // dal
-        fputc('D', stdout);
-        break;
-      case 0x0630:  // thal
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('D', stdout);
-        fputc('H', stdout);
-        break;
-      case 0x0631:  // reh
-        fputc('R', stdout);
-        break;
-      case 0x0632:  // zain
-        fputc('Z', stdout);
-        break;
-      case 0x0633:  // seen
-        fputc('S', stdout);
-        break;
-      case 0x0634:  // sheen
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('S', stdout);
-        fputc('H', stdout);
-        break;
-      case 0x0635:  // sad
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('S', stdout);
-        fputc('S', stdout);
-        break;
-      case 0x0636:  // dad
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('D', stdout);
-        fputc('Z', stdout);
-        break;
-      case 0x0637:  // tah
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('T', stdout);
-        fputc('T', stdout);
-        break;
-      case 0x0638:  // zah
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('Z', stdout);
-        fputc('Z', stdout);
-        break;
-      case 0x0639:  // ain
-        fputc('E', stdout);
-        break;
-      case 0x063A:  // ghain
-        fputc('G', stdout);
-        break;
-      case 0x0641:  // feh
-        fputc('F', stdout);
-        break;
-      case 0x0642:  // qaf
-        fputc('Q', stdout);
-        break;
-      case 0x0643:  // kaf
-        fputc('K', stdout);
-        break;
-      case 0x0644:  // lam
-        fputc('L', stdout);
-        break;
-      case 0x0645:  // meem
-        fputc('M', stdout);
-        break;
-      case 0x0646:  // noon
-        fputc('N', stdout);
-        break;
-      case 0x0647:  // heh
-        fputc('H', stdout);
-        break;
-      case 0x0648:  // waw
-        goto W;
-      case 0x0649:  // alef maksura
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('A', stdout);
-        goto Y;
-      case 0x064A:  // yeh
-        goto Y;
-      case 0x0671:  // alef wasla
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('A', stdout);
-        break;
-      case 0x0679:  // tteh
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('T', stdout);
-        break;
-      case 0x067C:  // teh with ring
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('R', stdout);
-        fputc('T', stdout);
-        break;
-      case 0x067E:  // peh
-        fputc('P', stdout);
-        break;
-      case 0x0681:  // hah with hamza above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('K', stdout);
-        fputc('E', stdout);
-        break;
-      case 0x0685:  // hah with 3 dots above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('X', stdout);
-        fputc('H', stdout);
-        break;
-      case 0x0686:  // tcheh
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('C', stdout);
-        break;
-      case 0x0688:  // ddal
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('D', stdout);
-        break;
-      case 0x0689:  // dal with ring
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('D', stdout);
-        fputc('R', stdout);
-        break;
-      case 0x0691:  // rreh
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('X', stdout);
-        fputc('R', stdout);
-        break;
-      case 0x0693:  // reh with ring
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('R', stdout);
-        fputc('R', stdout);
-        break;
-      case 0x0696:  // reh with dot below and dot above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('R', stdout);
-        fputc('X', stdout);
-        break;
-      case 0x0698:  // jeh
-        if (PASSPORT)
-          fputc('X', stdout);
-        goto J;
-      case 0x069A:  // seen with dot below and dot above
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('S', stdout);
-        break;
-      case 0x06A9:  // keheh
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('K', stdout);
-        fputc('K', stdout);
-        break;
-      case 0x06AB:  // kaf with ring
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('K', stdout);
-        break;
-      case 0x06AD:  // ng
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('N', stdout);
-        fputc('G', stdout);
-        break;
-      case 0x06AF:  // gaf
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('G', stdout);
-        fputc('G', stdout);
-        break;
-      case 0x06BA:  // noon ghunna
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('N', stdout);
-        fputc('N', stdout);
-        break;
-      case 0x06BC:  // noon with ring
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('N', stdout);
-        break;
-      case 0x06BE:  // heh doachashmee
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('D', stdout);
-        fputc('O', stdout);
-        break;
-      case 0x06C0:  // heh with yeh above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('Y', stdout);
-        fputc('H', stdout);
-        break;
-      case 0x06C1:  // heh goal
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('G', stdout);
-        break;
-      case 0x06C2:  // heh goal with hamza above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('G', stdout);
-        fputc('E', stdout);
-        break;
-      case 0x06C3:  // teh marbuta goal
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('T', stdout);
-        fputc('G', stdout);
-        break;
-      case 0x06CC:  // farsi yeh
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('Y', stdout);
-        fputc('A', stdout);
-        break;
-      case 0x06CD:  // yeh with tail
-        if (PASSPORT)
-          fputc('X', stdout);
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('Y', stdout);
-        break;
-      case 0x06D0:  // yeh
-        goto Y;
-      case 0x06D2:  // yeh barree
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('Y', stdout);
-        fputc('B', stdout);
-        break;
-      case 0x06D3:  // yeh barree with hamza above
-        if (PASSPORT)
-          fputc('X', stdout);
-        fputc('B', stdout);
-        fputc('E', stdout);
-        break;
-      case 0x069C:  // seen with 3 dots below and 3 dots above
-      case 0x06A2:  // feh with dot moved below
-      case 0x06A7:  // qaf with dot above
-      case 0x06A8:  // qaf with 3 dots above
-      case 0x0651:  // shadda
-      case 0x0652:  // sukun
-      case 0x0670:  // superscript alef
-      case 0x064B:  // fathatan
-      case 0x064C:  // dammatan
-      case 0x064D:  // kasratan
-      case 0x064E:  // fatha
-      case 0x064F:  // damma
-      case 0x0650:  // kasra
-      case 0x0640:  // tatwheel
-        break;
-
-      //
-      // HINDI
-      //
-      // The following C code for the romanization of Hindi was designed
-      // and written by vasant and jart on 2024-08-20.
-      //
-      // भारत देश का नाम है,
-      // तिरंगा झंडा इसकी शान है।
-      // अलग-अलग हैं बोली-भाषा,
-      // कहीं पहाड़, तो कहीं मैदान हैं।
-      // बहुत बड़ा है देश हमारा,
-      // परम्पराओं पर हमको अभिमान है।
-      // अनेकता में एकता,
-      // यही हमारा संविधान है।
-      //
-      // BHARAT DESH KA NAM HAI,
-      // TIRANGA JHANDA ISAKII SHAN HAI.
-      // ALAG-ALAG HAIN BOLII-BHASSA,
-      // KAHIIN PAHAD, TO KAHIIN MAIDAN HAIN.
-      // BAHUT BADA HAI DESH HAMARA,
-      // PARAMPARAON PAR HAMAKO ABHIMAN HAI.
-      // ANEKATA MEN EKATA,
-      // YAHII HAMARA SANVIDHAN HAI.
-      //
-
-      // Hindi Consonants
-      case 0x915:  // क
-        fputs("K", stdout);
-        break;
-      case 0x916:  // ख
-        fputs("KH", stdout);
-        break;
-      case 0x917:  // ग
-        fputs("G", stdout);
-        break;
-      case 0x918:  // घ
-        fputs("GH", stdout);
-        break;
-      case 0x91a:  // च
-        fputs("CH", stdout);
-        break;
-      case 0x91b:  // छ
-        fputs("CHH", stdout);
-        break;
-      case L'ज':
-        fputs("J", stdout);
-        break;
-      case L'झ':
-        fputs("JH", stdout);
-        break;
-      case L'ट':
-        fputs("T", stdout);
-        break;
-      case L'ठ':
-        fputs("TH", stdout);
-        break;
-      case L'ड':
-        fputs("D", stdout);
-        break;
-      case L'ढ':
-        fputs("DH", stdout);
-        break;
-      case L'ण':
-        fputs("N", stdout);
-        break;
-      case 0x924:  // त
-        fputs("T", stdout);
-        break;
-      case L'थ':
-        fputs("TH", stdout);
-        break;
-      case L'द':
-        fputs("D", stdout);
-        break;
-      case L'ध':
-        fputs("DH", stdout);
-        break;
-      case L'न':
-        fputs("N", stdout);
-        break;
-      case L'प':
-        fputs("P", stdout);
-        break;
-      case L'फ':
-        fputs("PH", stdout);
-        break;
-      case L'ब':
-        fputs("B", stdout);
-        break;
-      case 0x92d:  // भ
-        fputs("BH", stdout);
-        break;
-      case L'म':
-        fputs("M", stdout);
-        break;
-      case L'य':
-        fputs("Y", stdout);
-        break;
-      case 0x930:  // र
-        fputs("R", stdout);
-        break;
-      case L'ल':
-        fputs("L", stdout);
-        break;
-      case L'व':
-        fputs("V", stdout);
-        break;
-      case L'श':
-        fputs("SH", stdout);
-        break;
-      case L'ष':
-        fputs("SS", stdout);
-        break;
-      case L'स':
-        fputs("S", stdout);
-        break;
-      case L'ह':
-        fputs("H", stdout);
-        break;
-
-      // Hindi Vowels
-      case 0x905:  // अ
-      case 0x93e:  // ा
-        fputs("A", stdout);
-        break;
-      case 0x906:  // आ
-        fputs("AA", stdout);
-        break;
-      case 0x907:  // इ
-      case 0x93f:  // ि
-        fputs("I", stdout);
-        break;
-      case 0x940:  // ी
-      case 0x908:  // ई
-        fputs("II", stdout);
-        break;
-      case 0x942:  // ू
-      case 0x90A:  // ऊ
-        fputs("UU", stdout);
-        break;
-      case 0x947:  // े
-      case 0x90F:  // ए
-        fputs("E", stdout);
-        break;
-      case 0x948:  // ै
-      case 0x910:  // ऐ
-        fputs("AI", stdout);
-        break;
-      case 0x94b:  // ो
-      case 0x913:  // ओ
-        fputs("O", stdout);
-        break;
-      case 0x941:  // ु
-      case 0x909:  // उ
-        fputs("U", stdout);
-        break;
-      case 0x94c:  // ौ
-      case 0x914:  // औ
-        fputs("AU", stdout);
-        break;
-
-      // Hindi Magic Consonants
-      case 0x902:  // ं
-        fputs("N", stdout);
-        break;
-
-      // Hindi Miscellaneous
-      case 0x93c:  // ़ Devanagari Sign Nukta
-        break;
-      case 0x94d:  // ् Devanagari Sign Virama
-        break;
-
-      // Hindi Punctuation
-      case L'।':
-        fputc('.', stdout);
-        break;
-    }
-
-    if ((IsHindiConsonant(c1) && IsHindiConsonant(c2)) ||
-        (IsHindiConsonant(c1) && IsHindiMagicConsonant(c2)))
-      fputs("A", stdout);
-
-    if (c2) {
-      ungetwc(c2, stdin);
-    }
-  }
-  return 0;
-}
diff --git a/examples/rote.c b/examples/rote.c
deleted file mode 100644
index 3819ce331..000000000
--- a/examples/rote.c
+++ /dev/null
@@ -1,322 +0,0 @@
-#/*────────────────────────────────────────────────────────────────╗
-┌┘ To the extent possible under law, Justine Tunney has waived     │
-│  all copyright and related or neighboring rights to this file,   │
-│  as it is written in the following disclaimers:                  │
-│    • http://unlicense.org/                                       │
-│    • http://creativecommons.org/publicdomain/zero/1.0/           │
-╚─────────────────────────────────────────────────────────────────*/
-#include <ctype.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <termios.h>
-
-/**
- * @fileoverview cosmopolitan flash cards viewer
- */
-
-struct Card {
-  char* qa[2];
-};
-
-atomic_int g_done;
-
-void onsig(int sig) {
-  g_done = 1;
-}
-
-void* xmalloc(int n) {
-  void* p;
-  if ((p = malloc(n)))
-    return p;
-  perror("malloc");
-  exit(1);
-}
-
-void* xrealloc(void* p, int n) {
-  if ((p = realloc(p, n)))
-    return p;
-  perror("realloc");
-  exit(1);
-}
-
-char* xstrcat(const char* a, const char* b) {
-  char* p;
-  size_t n, m;
-  n = strlen(a);
-  m = strlen(b);
-  p = xmalloc(n + m + 1);
-  memcpy(p, a, n);
-  memcpy(p + n, b, m + 1);
-  return p;
-}
-
-void shuffle(struct Card* a, int n) {
-  while (n > 1) {
-    int i = rand() % n--;
-    struct Card t = a[i];
-    a[i] = a[n];
-    a[n] = t;
-  }
-}
-
-char* trim(char* s) {
-  int i;
-  if (s) {
-    while (isspace(*s))
-      ++s;
-    for (i = strlen(s); i--;) {
-      if (isspace(s[i])) {
-        s[i] = 0;
-      } else {
-        break;
-      }
-    }
-  }
-  return s;
-}
-
-char* readline(FILE* f) {
-  for (;;) {
-    char* line = trim(fgetln(f, 0));
-    if (!line)
-      return 0;
-    if (*line != '#')
-      if (*line)
-        return line;
-  }
-}
-
-char* fill(const char* text, int max_line_width, int* out_line_count) {
-  int text_len = strlen(text);
-  char* result = xmalloc(text_len * 2 + 1);
-  int result_pos = 0;
-  int line_start = 0;
-  int line_count = 1;
-  int i = 0;
-  while (i < text_len && isspace(text[i]))
-    i++;
-  while (i < text_len) {
-    int word_end = i;
-    while (word_end < text_len && !isspace(text[word_end]))
-      word_end++;
-    int word_length = word_end - i;
-    if ((result_pos - line_start) + (result_pos > line_start ? 1 : 0) +
-            word_length >
-        max_line_width) {
-      if (result_pos > line_start) {
-        ++line_count;
-        result[result_pos++] = '\n';
-        line_start = result_pos;
-      }
-    } else if (result_pos > line_start) {
-      result[result_pos++] = ' ';
-    }
-    memcpy(result + result_pos, text + i, word_length);
-    result_pos += word_length;
-    i = word_end;
-    while (i < text_len && isspace(text[i]))
-      i++;
-  }
-  result[result_pos] = '\0';
-  result = xrealloc(result, result_pos + 1);
-  if (out_line_count)
-    *out_line_count = line_count;
-  return result;
-}
-
-void show(const char* text, int i, int n) {
-
-  // get pseudoteletypewriter dimensions
-  struct winsize ws = {80, 25};
-  tcgetwinsize(1, &ws);
-  int width = ws.ws_col;
-  if (width > (int)(ws.ws_col * .9))
-    width = ws.ws_col * .9;
-  if (width > 80)
-    width = 80;
-  width &= -2;
-
-  // clear display
-  printf("\033[H\033[J");
-
-  // display flash card text in middle of display
-  char buf[32];
-  int line_count;
-  char* lines = fill(text, width, &line_count);
-  sprintf(buf, "%d/%d\r\n\r\n", i + 1, n);
-  line_count += 2;
-  char* extra = xstrcat(buf, lines);
-  free(lines);
-  char* tokens = extra;
-  for (int j = 0;; ++j) {
-    char* line = strtok(tokens, "\n");
-    tokens = 0;
-    if (!line)
-      break;
-    printf("\033[%d;%dH%s", ws.ws_row / 2 - line_count / 2 + j + 1,
-           ws.ws_col / 2 - strlen(line) / 2 + 1, line);
-  }
-  free(extra);
-  fflush(stdout);
-}
-
-void usage(FILE* f, const char* prog) {
-  fprintf(f,
-          "usage: %s FILE\n"
-          "\n"
-          "here's an example of what your file should look like:\n"
-          "\n"
-          "  # cosmopolitan flash cards\n"
-          "  # california dmv drivers test\n"
-          "  \n"
-          "  which of the following point totals could result in "
-          "your license being suspended by the dmv?\n"
-          "  4 points in 12 months (middle)\n"
-          "  \n"
-          "  at 55 mph under good conditions a passenger vehicle can stop "
-          "within\n"
-          "  300 feet (not 200, not 400, middle)\n"
-          "  \n"
-          "  two sets of solid double yellow lines spaced two or more feet "
-          "apart indicate\n"
-          "  a BARRIER (do not cross unless there's an opening)\n"
-          "\n"
-          "more specifically, empty lines are ignored, lines starting with\n"
-          "a hash are ignored, then an even number of lines must remain,\n"
-          "where each two lines is a card, holding question and answer.\n",
-          prog);
-}
-
-int main(int argc, char* argv[]) {
-
-  // show help
-  if (argc != 2) {
-    usage(stderr, argv[0]);
-    return 1;
-  }
-  if (!strcmp(argv[1], "-?") ||  //
-      !strcmp(argv[1], "-h") ||  //
-      !strcmp(argv[1], "--help")) {
-    usage(stdout, argv[0]);
-    return 0;
-  }
-
-  // teletypewriter is required
-  if (!isatty(0) || !isatty(1)) {
-    perror("isatty");
-    return 2;
-  }
-
-  // load cards
-  FILE* f = fopen(argv[1], "r");
-  if (!f) {
-    perror(argv[1]);
-    return 3;
-  }
-  int count = 0;
-  struct Card* cards = 0;
-  for (;;) {
-    struct Card card;
-    if (!(card.qa[0] = readline(f)))
-      break;
-    card.qa[0] = strdup(card.qa[0]);
-    if (!(card.qa[1] = readline(f))) {
-      fprintf(stderr, "%s: flash card file has odd number of lines\n", argv[1]);
-      exit(1);
-    }
-    card.qa[1] = strdup(card.qa[1]);
-    cards = xrealloc(cards, (count + 1) * sizeof(struct Card));
-    cards[count++] = card;
-  }
-  fclose(f);
-
-  // randomize
-  srand(time(0));
-  shuffle(cards, count);
-
-  // catch ctrl-c
-  struct sigaction sa;
-  sa.sa_flags = 0;
-  sa.sa_handler = onsig;
-  sigemptyset(&sa.sa_mask);
-  sigaction(SIGINT, &sa, 0);
-
-  // enter raw mode
-  struct termios ot;
-  tcgetattr(1, &ot);
-  struct termios nt = ot;
-  cfmakeraw(&nt);
-  nt.c_lflag |= ISIG;
-  tcsetattr(1, TCSANOW, &nt);
-  printf("\033[?25l");
-
-  // show flash cards
-  int i = 0;
-  while (!g_done) {
-    show(cards[i / 2].qa[i % 2], i / 2, count);
-
-    // press any key
-    char b[8] = {0};
-    read(0, b, sizeof(b));
-
-    // q quits
-    if (b[0] == 'q')
-      break;
-
-    // b or ctrl-b goes backward
-    if (b[0] == 'b' ||  //
-        b[0] == ('B' ^ 0100)) {
-      if (--i < 0)
-        i = count * 2 - 1;
-      i &= -2;
-      continue;
-    }
-
-    // p or ctrl-p goes backward
-    if (b[0] == 'p' ||  //
-        b[0] == ('P' ^ 0100)) {
-      if (--i < 0)
-        i = count * 2 - 1;
-      i &= -2;
-      continue;
-    }
-
-    // up arrow goes backward
-    if (b[0] == 033 &&  //
-        b[1] == '[' &&  //
-        b[2] == 'A') {
-      if (--i < 0)
-        i = count * 2 - 1;
-      i &= -2;
-      continue;
-    }
-
-    // left arrow goes backward
-    if (b[0] == 033 &&  //
-        b[1] == '[' &&  //
-        b[2] == 'D') {
-      if (--i < 0)
-        i = count * 2 - 1;
-      i &= -2;
-      continue;
-    }
-
-    // only advance
-    if (++i == count * 2)
-      i = 0;
-  }
-
-  // free memory
-  for (int i = 0; i < count; ++i)
-    for (int j = 0; j < 2; ++j)
-      free(cards[i].qa[j]);
-  free(cards);
-
-  // cleanup terminal and show cursor
-  tcsetattr(1, TCSANOW, &ot);
-  printf("\033[?25h");
-  printf("\n");
-}
diff --git a/examples/script.c b/examples/script.c
index 414f07e8c..e6559e626 100644
--- a/examples/script.c
+++ b/examples/script.c
@@ -29,30 +29,39 @@
 │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │
 │ SUCH DAMAGE.                                                                 │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <err.h>
-#include <errno.h>
-#include <paths.h>
-#include <pty.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/param.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/uio.h>
-#include <termios.h>
-#include <time.h>
-#include <unistd.h>
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/iovec.h"
+#include "libc/calls/struct/stat.h"
+#include "libc/calls/struct/termios.h"
+#include "libc/calls/struct/timeval.h"
+#include "libc/calls/struct/winsize.h"
+#include "libc/calls/termios.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/errno.h"
+#include "libc/fmt/conv.h"
+#include "libc/intrin/bswap.h"
+#include "libc/log/bsd.h"
+#include "libc/macros.internal.h"
+#include "libc/mem/mem.h"
+#include "libc/paths.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sock/select.h"
+#include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/s.h"
+#include "libc/sysv/consts/termios.h"
+#include "libc/time.h"
+#include "third_party/getopt/getopt.internal.h"
 // clang-format off
 
 /**
  * @fileoverview Terminal Screencast Recorder / Player, e.g.
  *
  *     make o//examples/script.com
- *     o//examples/script.com -w80 -h24 -r recording.tty
+ *     o//examples/script.com -r
  *     # type stuff..
  *     # CTRL-D
- *     o//examples/script.com -p recording.tty
+ *     o//examples/script.com -p typescript
  *
  * @note works on Linux, OpenBSD, NetBSD, FreeBSD, MacOS
  * @see https://asciinema.org/
@@ -103,9 +112,9 @@ main(int argc, char *argv[])
 	fd_set rfd;
 	int fm_fd;
 	int aflg, Fflg, kflg, pflg, ch, k, n;
-	int flushtime, readstdin, width, height;
+	int flushtime, readstdin;
 
-	aflg = Fflg = kflg = pflg = height = width = 0;
+	aflg = Fflg = kflg = pflg = 0;
 	usesleep = 1;
 	rawout = 0;
 	flushtime = 30;
@@ -115,7 +124,7 @@ main(int argc, char *argv[])
 
 	(void)fm_fd;
 
-	while ((ch = getopt(argc, argv, "adeFfkpqrt:w:h:")) != -1)
+	while ((ch = getopt(argc, argv, "adeFfkpqrt:")) != -1)
 		switch(ch) {
 		case 'a':
 			aflg = 1;
@@ -145,12 +154,6 @@ main(int argc, char *argv[])
 			if (flushtime < 0)
 				err(1, "invalid flush time %d", flushtime);
 			break;
-		case 'w':
-			width = atoi(optarg);
-			break;
-		case 'h':
-			height = atoi(optarg);
-			break;
 		case '?':
 		default:
 			usage();
@@ -178,10 +181,6 @@ main(int argc, char *argv[])
 		if (openpty(&master, &slave, NULL, NULL, NULL) == -1)
 			err(1, "openpty");
 	} else {
-		if (width)
-			win.ws_col = width;
-		if (height)
-			win.ws_row = height;
 		if (openpty(&master, &slave, NULL, &tt, &win) == -1)
 			err(1, "openpty");
 		ttyflg = 1;
diff --git a/examples/seq.c b/examples/seq.c
index f8403f60f..4d401133b 100644
--- a/examples/seq.c
+++ b/examples/seq.c
@@ -7,8 +7,9 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <cosmo.h>
-#include <stdlib.h>
+#include "libc/calls/calls.h"
+#include "libc/fmt/conv.h"
+#include "libc/fmt/itoa.h"
 
 /**
  * @fileoverview Prints sequence of numbers.
diff --git a/examples/setcontext.c b/examples/setcontext.c
index ff20aee7e..07afabe30 100644
--- a/examples/setcontext.c
+++ b/examples/setcontext.c
@@ -7,9 +7,12 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <ucontext.h>
+#include "libc/calls/calls.h"
+#include "libc/calls/ucontext.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
 
 /**
  * @fileoverview swapcontext() and makecontext() example
@@ -30,16 +33,18 @@ static ucontext_t uctx_func2;
 static void func1(void) {
   say("func1: started\n");
   say("func1: swapcontext(&uctx_func1, &uctx_func2)\n");
-  if (swapcontext(&uctx_func1, &uctx_func2) == -1)
+  if (swapcontext(&uctx_func1, &uctx_func2) == -1) {
     handle_error("swapcontext");
+  }
   say("func1: returning\n");
 }
 
 static void func2(void) {
   say("func2: started\n");
   say("func2: swapcontext(&uctx_func2, &uctx_func1)\n");
-  if (swapcontext(&uctx_func2, &uctx_func1) == -1)
+  if (swapcontext(&uctx_func2, &uctx_func1) == -1) {
     handle_error("swapcontext");
+  }
   say("func2: returning\n");
 }
 
@@ -47,15 +52,17 @@ int main(int argc, char *argv[]) {
   char func1_stack[8192];
   char func2_stack[8192];
 
-  if (getcontext(&uctx_func1) == -1)
+  if (getcontext(&uctx_func1) == -1) {
     handle_error("getcontext");
+  }
   uctx_func1.uc_stack.ss_sp = func1_stack;
   uctx_func1.uc_stack.ss_size = sizeof(func1_stack);
   uctx_func1.uc_link = &uctx_main;
   makecontext(&uctx_func1, func1, 0);
 
-  if (getcontext(&uctx_func2) == -1)
+  if (getcontext(&uctx_func2) == -1) {
     handle_error("getcontext");
+  }
   uctx_func2.uc_stack.ss_sp = func2_stack;
   uctx_func2.uc_stack.ss_size = sizeof(func2_stack);
   /* Successor context is f1(), unless argc > 1 */
@@ -63,8 +70,9 @@ int main(int argc, char *argv[]) {
   makecontext(&uctx_func2, func2, 0);
 
   say("main: swapcontext(&uctx_main, &uctx_func2)\n");
-  if (swapcontext(&uctx_main, &uctx_func2) == -1)
+  if (swapcontext(&uctx_main, &uctx_func2) == -1) {
     handle_error("swapcontext");
+  }
 
   say("main: exiting\n");
   exit(EXIT_SUCCESS);
diff --git a/examples/setitimer.c b/examples/setitimer.c
index 6d926bca8..89b291941 100644
--- a/examples/setitimer.c
+++ b/examples/setitimer.c
@@ -7,15 +7,17 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <assert.h>
-#include <signal.h>
-#include <stdio.h>
-#include <sys/time.h>
-#include <unistd.h>
-
-/**
- * @fileoverview interval timer tutorial
- */
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/itimerval.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/calls/struct/siginfo.h"
+#include "libc/calls/ucontext.h"
+#include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/itimer.h"
+#include "libc/sysv/consts/sa.h"
+#include "libc/sysv/consts/sig.h"
+#include "libc/time.h"
 
 volatile bool gotalrm;
 
diff --git a/examples/spawn.c b/examples/spawn.c
deleted file mode 100644
index c118fc7a1..000000000
--- a/examples/spawn.c
+++ /dev/null
@@ -1,366 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-
-// posix_spawn() example
-//
-// This program demonstrates the use of posix_spawn() to run the command
-// `ls --dired` and capture its output. It teaches several key features:
-//
-// - Changing the working directory for the child process
-// - Redirecting stdout and stderr to pipes
-// - Handling the output from the child process
-//
-// The primary advantage of using posix_spawn() instead of the
-// traditional fork()/execve() combination for launching processes is
-// safety, efficiency, and cross-platform compatibility.
-//
-// 1. On Linux, FreeBSD, and NetBSD:
-//
-//    Cosmopolitan Libc's posix_spawn() uses vfork() under the hood on
-//    these platforms automatically, since it's faster than fork(). It's
-//    because vfork() creates a child process without needing to copy
-//    the parent's page tables, making it more efficient, especially for
-//    large processes. Furthermore, vfork() avoids the need to acquire
-//    every single mutex (see pthread_atfork() for more details) which
-//    makes it scalable in multi-threaded apps, since the other threads
-//    in your app can keep going while the spawning thread waits for the
-//    subprocess to call execve(). Normally vfork() is error-prone since
-//    there exists few functions that are @vforksafe. the posix_spawn()
-//    API is designed to offer maximum assurance that you can't shoot
-//    yourself in the foot. If you do, then file a bug with Cosmo.
-//
-// 2. On Windows:
-//
-//    posix_spawn() avoids fork() entirely. Windows doesn't natively
-//    support fork(), and emulating it can be slow and memory-intensive.
-//    By using posix_spawn(), we get a much faster process creation on
-//    Windows systems, because it only needs to call CreateProcess().
-//    Your file actions are replayed beforehand in a simulated way. Only
-//    Cosmopolitan Libc offers this level of quality. With Cygwin you'd
-//    have to use its proprietary APIs to achieve the same performance.
-//
-// 3. Simplified error handling:
-//
-//    posix_spawn() combines process creation and program execution in a
-//    single call, reducing the points of failure and simplifying error
-//    handling. One important thing that happens with Cosmopolitan's
-//    posix_spawn() implementation is that the error code of execve()
-//    inside your subprocess, should it fail, will be propagated to your
-//    parent process. This will happen efficiently via vfork() shared
-//    memory in the event your Linux environment supports this. If it
-//    doesn't, then Cosmopolitan will fall back to a throwaway pipe().
-//    The pipe is needed on platforms like XNU and OpenBSD which do not
-//    support vfork(). It's also needed under QEMU User.
-//
-// 4. Signal safety:
-//
-//    posix_spawn() guarantees your signal handler callback functions
-//    won't be executed in the child process. By default, it'll remove
-//    sigaction() callbacks atomically. This ensures that if something
-//    like a SIGTERM or SIGHUP is sent to the child process before it's
-//    had a chance to call execve(), then the child process will simply
-//    be terminated (like the spawned process would) instead of running
-//    whatever signal handlers the spawning process has installed. If
-//    you've set some signals to SIG_IGN, then that'll be preserved for
-//    the child process by posix_spawn(), unless you explicitly call
-//    posix_spawnattr_setsigdefault() to reset them.
-//
-// 5. Portability:
-//
-//    posix_spawn() is part of the POSIX standard, making it more
-//    portable across different UNIX-like systems and Windows (with
-//    appropriate libraries). Even the non-POSIX APIs we use here are
-//    portable; e.g. posix_spawn_file_actions_addchdir_np() is supported
-//    by glibc, musl, freebsd, and apple too.
-//
-// These benefits make posix_spawn() a preferred choice for efficient
-// and portable process creation in many scenarios, especially when
-// launching many processes or on systems where process creation
-// performance is critical.
-
-#define _GNU_SOURCE
-#include <errno.h>
-#include <fcntl.h>
-#include <poll.h>
-#include <signal.h>
-#include <spawn.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/select.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-#define max(X, Y) ((Y) < (X) ? (X) : (Y))
-
-#define USE_SELECT 0  // want poll() or select()? they both work great
-
-#define PIPE_READ  0
-#define PIPE_WRITE 1
-
-int main() {
-  errno_t err;
-
-  // Create spawn attributes object.
-  posix_spawnattr_t attr;
-  err = posix_spawnattr_init(&attr);
-  if (err != 0) {
-    fprintf(stderr, "posix_spawnattr_init failed: %s\n", strerror(err));
-    exit(1);
-  }
-
-  // Explicitly request vfork() from posix_spawn() implementation.
-  //
-  // This is currently the default for Cosmopolitan Libc, however you
-  // may want to set this anyway, for portability with other platforms.
-  // Please note that vfork() isn't officially specified by POSIX, so
-  // portable code may want to omit this and just use the default.
-  err = posix_spawnattr_setflags(&attr, POSIX_SPAWN_USEVFORK);
-  if (err != 0) {
-    fprintf(stderr, "posix_spawnattr_setflags: %s\n", strerror(err));
-    exit(2);
-  }
-
-  // Create file actions object.
-  posix_spawn_file_actions_t actions;
-  err = posix_spawn_file_actions_init(&actions);
-  if (err != 0) {
-    fprintf(stderr, "posix_spawn_file_actions_init: %s\n", strerror(err));
-    exit(3);
-  }
-
-  // Change directory to root directory in child process.
-  err = posix_spawn_file_actions_addchdir_np(&actions, "/");
-  if (err != 0) {
-    fprintf(stderr, "posix_spawn_file_actions_addchdir_np: %s\n",
-            strerror(err));
-    exit(4);
-  }
-
-  // Disable stdin in child process.
-  //
-  // By default, if you launch this example in your terminal, then child
-  // processes can read from your teletypewriter's keyboard too. You can
-  // avoid this by assigning /dev/null to standard input so if the child
-  // tries to read input, read() will return zero, indicating eof.
-  if ((err = posix_spawn_file_actions_addopen(&actions, STDIN_FILENO,
-                                              "/dev/null", O_RDONLY, 0644))) {
-    fprintf(stderr, "posix_spawn_file_actions_addopen: %s\n", strerror(err));
-    exit(5);
-  }
-
-  // Create pipes for stdout and stderr.
-  //
-  // Using O_DIRECT puts the pipe in message mode. This way we have some
-  // visibility into how the child process is using write(). It can also
-  // help ensure that logged lines won't be chopped up here, which could
-  // happen more frequently on platforms like Windows, which is somewhat
-  // less sophisticated than Linux with how it performs buffering.
-  //
-  // You can also specify O_CLOEXEC, which is a nice touch that lets you
-  // avoid needing to call posix_spawn_file_actions_addclose() later on.
-  // That's because all file descriptors are inherited by child programs
-  // by default. This is even the case with Cosmopolitan Libc on Windows
-  //
-  // XXX: We assume that stdin/stdout/stderr exist in this process. It's
-  //      possible for a rogue parent process to launch this example, in
-  //      a way where the following spawn logic will break.
-  int pipe_stdout[2];
-  int pipe_stderr[2];
-  if (pipe2(pipe_stdout, O_DIRECT) == -1 ||
-      pipe2(pipe_stderr, O_DIRECT) == -1) {
-    perror("pipe");
-    exit(6);
-  }
-
-  // Redirect child's stdout/stderr to pipes
-  if ((err = posix_spawn_file_actions_adddup2(&actions, pipe_stdout[PIPE_WRITE],
-                                              STDOUT_FILENO)) ||
-      (err = posix_spawn_file_actions_adddup2(&actions, pipe_stderr[PIPE_WRITE],
-                                              STDERR_FILENO))) {
-    fprintf(stderr, "posix_spawn_file_actions_adddup2: %s\n", strerror(err));
-    exit(7);
-  }
-
-  // Close unwanted write ends of pipes in the child process
-  if ((err = posix_spawn_file_actions_addclose(&actions,
-                                               pipe_stdout[PIPE_READ])) ||
-      (err = posix_spawn_file_actions_addclose(&actions,
-                                               pipe_stderr[PIPE_READ]))) {
-    fprintf(stderr, "posix_spawn_file_actions_addclose: %s\n", strerror(err));
-    exit(8);
-  };
-
-  // Asynchronously launch the child process.
-  pid_t pid;
-  char *const argv[] = {"ls", "--dired", NULL};
-  printf("** Launching `ls --dired` in root directory\n");
-  err = posix_spawnp(&pid, argv[0], &actions, NULL, argv, NULL);
-  if (err) {
-    fprintf(stderr, "posix_spawn: %s\n", strerror(err));
-    exit(9);
-  }
-
-  // Close unused write ends of pipes in the parent process
-  close(pipe_stdout[PIPE_WRITE]);
-  close(pipe_stderr[PIPE_WRITE]);
-
-  // we need poll() or select() because we're multiplexing output
-  // both poll() and select() work across all supported platforms
-#if USE_SELECT
-  // Relay output from child process using select()
-  char buffer[512];
-  ssize_t got_stdout = 1;
-  ssize_t got_stderr = 1;
-  while (got_stdout > 0 || got_stderr > 0) {
-    fd_set rfds;
-    FD_ZERO(&rfds);
-    if (got_stdout > 0)
-      FD_SET(pipe_stdout[PIPE_READ], &rfds);
-    if (got_stderr > 0)
-      FD_SET(pipe_stderr[PIPE_READ], &rfds);
-    int nfds = max(pipe_stdout[PIPE_READ], pipe_stderr[PIPE_READ]) + 1;
-    if (select(nfds, &rfds, 0, 0, 0) == -1) {
-      perror("select");
-      exit(10);
-    }
-    if (FD_ISSET(pipe_stdout[PIPE_READ], &rfds)) {
-      got_stdout = read(pipe_stdout[PIPE_READ], buffer, sizeof(buffer));
-      printf("\n");
-      if (got_stdout > 0) {
-        printf("** Got stdout from child process:\n");
-        fflush(stdout);
-        write(STDOUT_FILENO, buffer, got_stdout);
-      } else if (!got_stdout) {
-        printf("** Got stdout EOF from child process\n");
-      } else {
-        printf("** Got stdout read() error from child process: %s\n",
-               strerror(errno));
-      }
-    }
-    if (FD_ISSET(pipe_stderr[PIPE_READ], &rfds)) {
-      got_stderr = read(pipe_stderr[PIPE_READ], buffer, sizeof(buffer));
-      printf("\n");
-      if (got_stderr > 0) {
-        printf("** Got stderr from child process:\n");
-        fflush(stdout);
-        write(STDOUT_FILENO, buffer, got_stderr);
-      } else if (!got_stderr) {
-        printf("** Got stderr EOF from child process\n");
-      } else {
-        printf("** Got stderr read() error from child process: %s\n",
-               strerror(errno));
-      }
-    }
-  }
-
-#else
-  // Relay output from child process using poll()
-  char buffer[512];
-  ssize_t got_stdout = 1;
-  ssize_t got_stderr = 1;
-  while (got_stdout > 0 || got_stderr > 0) {
-    struct pollfd fds[2];
-    fds[0].fd = got_stdout > 0 ? pipe_stdout[PIPE_READ] : -1;
-    fds[0].events = POLLIN;  // POLLHUP, POLLNVAL, and POLLERR are implied
-    fds[1].fd = got_stderr > 0 ? pipe_stderr[PIPE_READ] : -1;
-    fds[1].events = POLLIN;  // POLLHUP, POLLNVAL, and POLLERR are implied
-    if (poll(fds, 2, -1) == -1) {
-      perror("select");
-      exit(10);
-    }
-    if (fds[0].revents) {
-      printf("\n");
-      if (fds[0].revents & POLLIN)
-        printf("** Got POLLIN on stdout from child process\n");
-      if (fds[0].revents & POLLHUP)
-        printf("** Got POLLHUP on stdout from child process\n");
-      if (fds[0].revents & POLLERR)
-        printf("** Got POLLERR on stdout from child process\n");
-      if (fds[0].revents & POLLNVAL)
-        printf("** Got POLLNVAL on stdout from child process\n");
-      got_stdout = read(pipe_stdout[PIPE_READ], buffer, sizeof(buffer));
-      if (got_stdout > 0) {
-        printf("** Got stdout from child process:\n");
-        fflush(stdout);
-        write(STDOUT_FILENO, buffer, got_stdout);
-      } else if (!got_stdout) {
-        printf("** Got stdout EOF from child process\n");
-      } else {
-        printf("** Got stdout read() error from child process: %s\n",
-               strerror(errno));
-      }
-    }
-    if (fds[1].revents) {
-      printf("\n");
-      if (fds[1].revents & POLLIN)
-        printf("** Got POLLIN on stderr from child process\n");
-      if (fds[1].revents & POLLHUP)
-        printf("** Got POLLHUP on stderr from child process\n");
-      if (fds[1].revents & POLLERR)
-        printf("** Got POLLERR on stderr from child process\n");
-      if (fds[1].revents & POLLNVAL)
-        printf("** Got POLLNVAL on stderr from child process\n");
-      got_stderr = read(pipe_stderr[PIPE_READ], buffer, sizeof(buffer));
-      if (got_stderr > 0) {
-        printf("** Got stderr from child process:\n");
-        fflush(stdout);
-        write(STDOUT_FILENO, buffer, got_stderr);
-      } else if (!got_stderr) {
-        printf("** Got stderr EOF from child process\n");
-      } else {
-        printf("** Got stderr read() error from child process: %s\n",
-               strerror(errno));
-      }
-    }
-  }
-#endif
-
-  // Wait for child process to die.
-  int wait_status;
-  if (waitpid(pid, &wait_status, 0) == -1) {
-    perror("waitpid");
-    exit(11);
-  }
-
-  // Clean up resources.
-  posix_spawn_file_actions_destroy(&actions);
-  posix_spawnattr_destroy(&attr);
-  close(pipe_stdout[PIPE_READ]);
-  close(pipe_stderr[PIPE_READ]);
-
-  // Report wait status.
-  //
-  // When a process dies, it's almost always due to calling _Exit() or
-  // being killed due to an unhandled signal. On both UNIX and Windows
-  // this information will be propagated to the parent. That status is
-  // able to be propagated to the parent of this process too.
-  printf("\n");
-  if (WIFEXITED(wait_status)) {
-    printf("** Child process exited with exit code %d\n",
-           WEXITSTATUS(wait_status));
-    exit(WEXITSTATUS(wait_status));
-  } else if (WIFSIGNALED(wait_status)) {
-    printf("** Child process terminated with signal %s\n",
-           strsignal(WTERMSIG(wait_status)));
-    fflush(stdout);
-    sigset_t sm;
-    sigemptyset(&sm);
-    sigaddset(&sm, WTERMSIG(wait_status));
-    sigprocmask(SIG_UNBLOCK, &sm, 0);
-    signal(SIGABRT, SIG_DFL);
-    raise(WTERMSIG(wait_status));
-    exit(128 + WTERMSIG(wait_status));
-  } else {
-    printf("** Child process exited weirdly with wait status 0x%08x\n",
-           wait_status);
-    exit(12);
-  }
-}
diff --git a/examples/spawn_bench.c b/examples/spawn_bench.c
index 66ccdbc6a..38423389d 100644
--- a/examples/spawn_bench.c
+++ b/examples/spawn_bench.c
@@ -7,20 +7,24 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <spawn.h>
-#include <stdalign.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <time.h>
+#include "libc/atomic.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/mem/mem.h"
+#include "libc/proc/posix_spawn.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/clock.h"
+#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/prot.h"
 
 #define ITERATIONS 10
 
-alignas(128) int a;
-alignas(128) int b;
-alignas(128) atomic_int lock;
+_Alignas(128) int a;
+_Alignas(128) int b;
+_Alignas(128) atomic_int lock;
 
 static struct timespec SubtractTime(struct timespec a, struct timespec b) {
   a.tv_sec -= b.tv_sec;
@@ -113,11 +117,6 @@ int main(int argc, char *argv[]) {
   void *p;
   const char *prog;
 
-  // if you need the tiny64 program for windows:
-  //
-  //     make -j o//tool/hello/life-pe.ape
-  //     scp o//tool/hello/life-pe.ape windows:tiny64
-  //
   if (argc <= 1) {
     prog = "tiny64";
   } else {
diff --git a/examples/stackexplorer.c b/examples/stackexplorer.c
index 96c34114c..5b5d9add9 100644
--- a/examples/stackexplorer.c
+++ b/examples/stackexplorer.c
@@ -7,13 +7,9 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include "libc/dce.h"
-#include "libc/intrin/maps.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
-#include "libc/runtime/stack.h"
-#include "libc/runtime/winargs.internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/x/xasprintf.h"
 
@@ -71,18 +67,8 @@ int main(int argc, char *argv[]) {
     Append((uintptr_t)&__auxv[i + 1],
            xasprintf("&auxv[%d] = %#lx", i + 1, __auxv[i + 1]));
   }
-  if (!IsWindows()) {
-    struct AddrSize stak = __get_main_stack();
-    Append((intptr_t)stak.addr + stak.size, "top of stack");
-    Append((intptr_t)stak.addr, "bottom of stack");
-  } else {
-#ifdef __x86_64__
-    Append(GetStaticStackAddr(0) + GetStaticStackSize(), "top of stack");
-    Append(GetStaticStackAddr(0) + GetGuardSize(), "bottom of stack");
-    Append(GetStaticStackAddr(0), "bottom of guard region");
-#endif
-  }
   qsort(things.p, things.n, sizeof(*things.p), Compare);
-  for (int i = 0; i < things.n; ++i)
+  for (int i = 0; i < things.n; ++i) {
     printf("%012lx %s\n", things.p[i].i, things.p[i].s);
+  }
 }
diff --git a/examples/stat.c b/examples/stat.c
index 45e17e354..dce122cbb 100644
--- a/examples/stat.c
+++ b/examples/stat.c
@@ -7,12 +7,19 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <assert.h>
-#include <cosmo.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <time.h>
+#include "libc/calls/struct/stat.h"
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
+#include "libc/errno.h"
+#include "libc/fmt/conv.h"
+#include "libc/log/check.h"
+#include "libc/log/log.h"
+#include "libc/mem/gc.h"
+#include "libc/mem/mem.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/s.h"
+#include "libc/time.h"
 
 /**
  * @fileoverview File metadata viewer.
@@ -65,15 +72,9 @@ void PrintFileMetadata(const char *pathname, struct stat *st) {
   printf("\n%s:", pathname);
   if (numeric) {
     fd = atoi(pathname);
-    if (fstat(fd, st)) {
-      perror(pathname);
-      exit(1);
-    }
+    CHECK_NE(-1, fstat(fd, st), "fd=%d", fd);
   } else {
-    if (stat(pathname, st)) {
-      perror(pathname);
-      exit(1);
-    }
+    CHECK_NE(-1, stat(pathname, st), "pathname=%s", pathname);
   }
   printf("\n"
          "%-32s%,ld\n"
diff --git a/examples/statfs.c b/examples/statfs.c
index 817017d25..ce6367794 100644
--- a/examples/statfs.c
+++ b/examples/statfs.c
@@ -7,19 +7,18 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
+#include "libc/calls/struct/statfs.h"
+#include "libc/dce.h"
+#include "libc/fmt/conv.h"
+#include "libc/log/check.h"
 #include "libc/nt/enum/statfs.h"
-#include <cosmo.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/vfs.h>
+#include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/st.h"
 
-void ShowIt(const char *path) {
+dontinline void ShowIt(const char *path) {
   char ibuf[21];
   struct statfs sf = {0};
-  if (statfs(path, &sf)) {
-    perror(path);
-    exit(1);
-  }
+  CHECK_NE(-1, statfs(path, &sf));
 
   printf("filesystem %s\n", path);
   printf("f_type    = %#x (%s)\n", sf.f_type, sf.f_fstypename);
diff --git a/examples/stringbuffer.c b/examples/stringbuffer.c
new file mode 100644
index 000000000..4f965be2f
--- /dev/null
+++ b/examples/stringbuffer.c
@@ -0,0 +1,36 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to this file,    │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include "libc/calls/calls.h"
+#include "libc/log/check.h"
+#include "libc/mem/mem.h"
+#include "libc/stdio/append.h"
+#include "libc/str/str.h"
+
+/**
+ * @fileoverview Fast Growable Strings Tutorial
+ */
+
+int main(int argc, char *argv[]) {
+  char *b = 0;
+  appendf(&b, "hello ");  // guarantees nul terminator
+  CHECK_EQ(6, strlen(b));
+  CHECK_EQ(6, appendz(b).i);
+  appendf(&b, " world\n");
+  CHECK_EQ(13, strlen(b));
+  CHECK_EQ(13, appendz(b).i);
+  appendd(&b, "\0", 1);  // supports binary
+  CHECK_EQ(13, strlen(b));
+  CHECK_EQ(14, appendz(b).i);
+  appendf(&b, "%d arg%s\n", argc, argc == 1 ? "" : "s");
+  appendf(&b, "%s\n", "have a nice day");
+  write(1, b, appendz(b).i);
+  free(b);
+  return 0;
+}
diff --git a/examples/sysconf.c b/examples/sysconf.c
index 553b3a245..19a91e6b6 100644
--- a/examples/sysconf.c
+++ b/examples/sysconf.c
@@ -7,8 +7,8 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <stdio.h>
-#include <unistd.h>
+#include "libc/runtime/sysconf.h"
+#include "libc/stdio/stdio.h"
 
 #define SYSCONF(NAME) printf("%-24s %,ld\n", #NAME, sysconf(NAME))
 
diff --git a/examples/sysinfo.c b/examples/sysinfo.c
index afd2c5bca..4892a8183 100644
--- a/examples/sysinfo.c
+++ b/examples/sysinfo.c
@@ -7,19 +7,19 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <cosmo.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/sysinfo.h>
+#include "libc/calls/struct/sysinfo.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/fmt/conv.h"
+#include "libc/fmt/itoa.h"
+#include "libc/log/check.h"
+#include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/clock.h"
 
 int main(int argc, char *argv[]) {
   int64_t x;
   char ibuf[21];
   struct sysinfo si;
-  if (sysinfo(&si)) {
-    perror("sysinfo");
-    exit(1);
-  }
+  CHECK_NE(-1, sysinfo(&si));
 
   printf("%-16s", "uptime");
   x = si.uptime / (24 * 60 * 60);
diff --git a/examples/system.c b/examples/system.c
index dcb2be248..070665191 100644
--- a/examples/system.c
+++ b/examples/system.c
@@ -7,7 +7,9 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include <stdlib.h>
+#include "libc/calls/calls.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
 
 /**
  * @fileoverview Cosmopolitan Command Interpreter Demo
diff --git a/examples/thread.c b/examples/thread.c
deleted file mode 100644
index 283c2f8b0..000000000
--- a/examples/thread.c
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <pthread.h>
-#include <stdio.h>
-
-// how to spawn a thread
-
-void *my_thread(void *arg) {
-  printf("my_thread(%p) is running\n", arg);
-  return (void *)0x456L;
-}
-
-int main(int argc, char *argv[]) {
-  void *res;
-  pthread_t th;
-  pthread_create(&th, 0, my_thread, (void *)0x123L);
-  pthread_join(th, &res);
-  printf("my_thread() returned %p\n", res);
-}
diff --git a/examples/ttyinfo.c b/examples/ttyinfo.c
index 82a009fcd..16e3c23a8 100644
--- a/examples/ttyinfo.c
+++ b/examples/ttyinfo.c
@@ -161,10 +161,6 @@ void OnSignalThatWillEintrRead(int sig) {
 }
 
 int main(int argc, char *argv[]) {
-
-  // // emacs sends this to enable decckm mode
-  // WRITE(1, "\e[?1049h\e[22;0;0t\e[?12;25h\e[?1h\e=");
-
   int e, c, y, x, n, yn, xn;
   infd = 0;
   outfd = 1;
diff --git a/libc/BUILD.mk b/libc/BUILD.mk
index 8e13bbc7c..acc7f7739 100644
--- a/libc/BUILD.mk
+++ b/libc/BUILD.mk
@@ -18,7 +18,6 @@ libc/isystem/byteswap.h \
 libc/isystem/clzerointrin.h \
 libc/isystem/complex.h \
 libc/isystem/cosmo.h \
-libc/isystem/cosmoaudio.h \
 libc/isystem/cpio.h \
 libc/isystem/cpuid.h \
 libc/isystem/crypt.h \
diff --git a/libc/calls/BUILD.mk b/libc/calls/BUILD.mk
index 75ac5a00a..442dab18e 100644
--- a/libc/calls/BUILD.mk
+++ b/libc/calls/BUILD.mk
@@ -48,13 +48,12 @@ LIBC_CALLS_A_DIRECTDEPS =				\
 	LIBC_NT_PDH					\
 	LIBC_NT_POWRPROF				\
 	LIBC_NT_PSAPI					\
-	LIBC_NT_REALTIME				\
 	LIBC_NT_SYNCHRONIZATION				\
 	LIBC_NT_WS2_32					\
 	LIBC_STR					\
 	LIBC_SYSV					\
 	LIBC_SYSV_CALLS					\
-	THIRD_PARTY_COMPILER_RT				\
+	THIRD_PARTY_COMPILER_RT
 
 LIBC_CALLS_A_DEPS :=					\
 	$(call uniq,$(foreach x,$(LIBC_CALLS_A_DIRECTDEPS),$($(x))))
@@ -155,67 +154,13 @@ o/$(MODE)/libc/calls/sigcrashsig.o: private		\
 		CFLAGS +=				\
 			-Os
 
-# avoid legacy sse decoding penalty on avx systems
-o//libc/calls/cfmakeraw.o				\
-o//libc/calls/clock_gettime-xnu.o			\
-o//libc/calls/CPU_AND.o					\
-o//libc/calls/CPU_OR.o					\
-o//libc/calls/CPU_XOR.o					\
-o//libc/calls/dl_iterate_phdr.o				\
-o//libc/calls/dup-nt.o					\
-o//libc/calls/fcntl-nt.o				\
-o//libc/calls/flock-nt.o				\
-o//libc/calls/fstatfs-nt.o				\
-o//libc/calls/fstat-nt.o				\
-o//libc/calls/futimesat.o				\
-o//libc/calls/futimes.o					\
-o//libc/calls/getrlimit.o				\
-o//libc/calls/gettimeofday.o				\
-o//libc/calls/ioctl.o					\
-o//libc/calls/lutimes.o					\
-o//libc/calls/metaflock.o				\
-o//libc/calls/ntaccesscheck.o				\
-o//libc/calls/ntspawn.o					\
-o//libc/calls/open-nt.o					\
-o//libc/calls/pledge-linux.o				\
-o//libc/calls/ppoll.o					\
-o//libc/calls/preadv.o					\
-o//libc/calls/pselect.o					\
-o//libc/calls/pwritev.o					\
-o//libc/calls/read-nt.o					\
-o//libc/calls/readv.o					\
-o//libc/calls/readwrite-nt.o				\
-o//libc/calls/releasefd.o				\
-o//libc/calls/select.o					\
-o//libc/calls/sigaction.o				\
-o//libc/calls/sigenter-freebsd.o			\
-o//libc/calls/sigenter-netbsd.o				\
-o//libc/calls/sigenter-openbsd.o			\
-o//libc/calls/sigenter-xnu.o				\
-o//libc/calls/sigignore.o				\
-o//libc/calls/siginfo2cosmo.o				\
-o//libc/calls/signal.o					\
-o//libc/calls/sig.o					\
-o//libc/calls/sigtimedwait.o				\
-o//libc/calls/stat2cosmo.o				\
-o//libc/calls/statfs2cosmo.o				\
-o//libc/calls/statfs2statvfs.o				\
-o//libc/calls/tcgetattr-nt.o				\
-o//libc/calls/tcgetattr.o				\
-o//libc/calls/tcgetwinsize-nt.o				\
-o//libc/calls/tcsetattr-nt.o				\
-o//libc/calls/tcsetwinsize-nt.o				\
-o//libc/calls/termios2host.o				\
-o//libc/calls/timespec_sleep.o				\
-o//libc/calls/uname.o					\
-o//libc/calls/utimensat-old.o				\
-o//libc/calls/utimes.o					\
-o//libc/calls/winexec.o					\
-o//libc/calls/writev.o: private				\
-		COPTS +=				\
-			-mgeneral-regs-only
-
 # these assembly files are safe to build on aarch64
+o/$(MODE)/libc/calls/getcontext.o: libc/calls/getcontext.S
+	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
+o/$(MODE)/libc/calls/swapcontext.o: libc/calls/swapcontext.S
+	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
+o/$(MODE)/libc/calls/tailcontext.o: libc/calls/tailcontext.S
+	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/calls/stackjump.o: libc/calls/stackjump.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 
diff --git a/libc/calls/CPU_AND.c b/libc/calls/CPU_AND.c
index bfe7ef2cc..b90a0d1e4 100644
--- a/libc/calls/CPU_AND.c
+++ b/libc/calls/CPU_AND.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/cpuset.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 void CPU_AND(cpu_set_t *d, cpu_set_t *x, cpu_set_t *y) {
   int i;
diff --git a/libc/calls/CPU_COUNT.c b/libc/calls/CPU_COUNT.c
index 792cbfc7d..0e2348cb7 100644
--- a/libc/calls/CPU_COUNT.c
+++ b/libc/calls/CPU_COUNT.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/cpuset.h"
 #include "libc/intrin/popcnt.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 int CPU_COUNT(cpu_set_t *set) {
   int i, c;
diff --git a/libc/calls/CPU_OR.c b/libc/calls/CPU_OR.c
index 11fcaf20e..8218b9158 100644
--- a/libc/calls/CPU_OR.c
+++ b/libc/calls/CPU_OR.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/cpuset.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 void CPU_OR(cpu_set_t *d, cpu_set_t *x, cpu_set_t *y) {
   int i;
diff --git a/libc/calls/CPU_XOR.c b/libc/calls/CPU_XOR.c
index 08277e43f..db5ced87a 100644
--- a/libc/calls/CPU_XOR.c
+++ b/libc/calls/CPU_XOR.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/cpuset.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 void CPU_XOR(cpu_set_t *d, cpu_set_t *x, cpu_set_t *y) {
   int i;
diff --git a/libc/calls/calls.h b/libc/calls/calls.h
index cd90cce73..b5fd6d824 100644
--- a/libc/calls/calls.h
+++ b/libc/calls/calls.h
@@ -69,49 +69,48 @@ COSMOPOLITAN_C_START_
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § system calls                                              ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
-/* clang-format off */
 
 typedef int sig_atomic_t;
 
 bool32 isatty(int) libcesque;
-char *getcwd(char *, size_t) dontthrow __write_only(1, 2);
-char *realpath(const char *, char *) libcesque __wur __read_only(1) __write_only(2);
+char *getcwd(char *, size_t) dontthrow;
+char *realpath(const char *, char *) libcesque __wur;
 char *ttyname(int) libcesque;
-int access(const char *, int) libcesque __read_only(1);
-int chdir(const char *) libcesque __read_only(1);
-int chmod(const char *, unsigned) libcesque __read_only(1);
-int chown(const char *, unsigned, unsigned) libcesque __read_only(1);
-int chroot(const char *) libcesque __read_only(1);
+int access(const char *, int) libcesque;
+int chdir(const char *) libcesque;
+int chmod(const char *, unsigned) libcesque;
+int chown(const char *, unsigned, unsigned) libcesque;
+int chroot(const char *) libcesque;
 int close(int) libcesque;
 int close_range(unsigned, unsigned, unsigned) libcesque;
 int closefrom(int) libcesque;
-int creat(const char *, unsigned) libcesque __read_only(1);
+int creat(const char *, unsigned) libcesque;
 int dup(int) libcesque;
 int dup2(int, int) libcesque;
 int dup3(int, int, int) libcesque;
-int execl(const char *, const char *, ...) nullterminated() libcesque __read_only(1) __read_only(2);
-int execle(const char *, const char *, ...) nullterminated((1)) libcesque __read_only(1) __read_only(2);
-int execlp(const char *, const char *, ...) nullterminated() libcesque __read_only(1) __read_only(2);
-int execv(const char *, char *const[]) libcesque __read_only(1) __read_only(2);
-int execve(const char *, char *const[], char *const[]) libcesque __read_only(1) __read_only(2) __read_only(3);
-int execvp(const char *, char *const[]) libcesque __read_only(1) __read_only(2);
-int faccessat(int, const char *, int, int) libcesque __read_only(2);
+int execl(const char *, const char *, ...) nullterminated() libcesque;
+int execle(const char *, const char *, ...) nullterminated((1)) libcesque;
+int execlp(const char *, const char *, ...) nullterminated() libcesque;
+int execv(const char *, char *const[]) libcesque;
+int execve(const char *, char *const[], char *const[]) libcesque;
+int execvp(const char *, char *const[]) libcesque;
+int faccessat(int, const char *, int, int) libcesque;
 int fchdir(int) libcesque;
 int fchmod(int, unsigned) libcesque;
-int fchmodat(int, const char *, unsigned, int) libcesque __read_only(2);
+int fchmodat(int, const char *, unsigned, int) libcesque;
 int fchown(int, unsigned, unsigned) libcesque;
-int fchownat(int, const char *, unsigned, unsigned, int) libcesque __read_only(2);
+int fchownat(int, const char *, unsigned, unsigned, int) libcesque;
 int fcntl(int, int, ...) libcesque;
 int fdatasync(int) libcesque;
-int fexecve(int, char *const[], char *const[]) libcesque __read_only(2) __read_only(3);
+int fexecve(int, char *const[], char *const[]) libcesque;
 int flock(int, int) libcesque;
 int fork(void) libcesque;
 int fsync(int) libcesque;
 int ftruncate(int, int64_t) libcesque;
-int getdomainname(char *, size_t) libcesque __write_only(1, 2);
-int getgroups(int, unsigned[]) libcesque __write_only(2, 1);
-int gethostname(char *, size_t) libcesque __write_only(1, 2);
-int getloadavg(double *, int) libcesque __write_only(1, 2);
+int getdomainname(char *, size_t) libcesque;
+int getgroups(int, unsigned[]) libcesque;
+int gethostname(char *, size_t) libcesque;
+int getloadavg(double *, int) libcesque;
 int getpgid(int) libcesque;
 int getpgrp(void) libcesque nosideeffect;
 int getpid(void) libcesque nosideeffect;
@@ -122,35 +121,35 @@ int ioctl(int, unsigned long, ...) libcesque;
 int issetugid(void) libcesque;
 int kill(int, int) libcesque;
 int killpg(int, int) libcesque;
-int lchmod(const char *, unsigned) libcesque __read_only(1);
-int lchown(const char *, unsigned, unsigned) libcesque __read_only(1);
-int link(const char *, const char *) libcesque __read_only(1) __read_only(2);
-int linkat(int, const char *, int, const char *, int) libcesque __read_only(2) __read_only(4);
-int mincore(void *, size_t, unsigned char *) libcesque __read_only(1) __write_only(3);
-int mkdir(const char *, unsigned) libcesque __read_only(1);
-int mkdirat(int, const char *, unsigned) libcesque __read_only(2);
-int mknod(const char *, unsigned, uint64_t) libcesque __read_only(1);
+int lchmod(const char *, unsigned) libcesque;
+int lchown(const char *, unsigned, unsigned) libcesque;
+int link(const char *, const char *) libcesque;
+int linkat(int, const char *, int, const char *, int) libcesque;
+int mincore(void *, size_t, unsigned char *) libcesque;
+int mkdir(const char *, unsigned) libcesque;
+int mkdirat(int, const char *, unsigned) libcesque;
+int mknod(const char *, unsigned, uint64_t) libcesque;
 int nice(int) libcesque;
-int open(const char *, int, ...) libcesque __read_only(1);
-int openat(int, const char *, int, ...) libcesque __read_only(2);
+int open(const char *, int, ...) libcesque;
+int openat(int, const char *, int, ...) libcesque;
 int pause(void) libcesque;
-int pipe(int[2]) libcesque __write_only(1);
-int pipe2(int[2], int) libcesque __write_only(1);
+int pipe(int[2]) libcesque;
+int pipe2(int[2], int) libcesque;
 int posix_fadvise(int, int64_t, int64_t, int) libcesque;
-int posix_madvise(void *, uint64_t, int) libcesque __read_write(1);
+int posix_madvise(void *, uint64_t, int) libcesque;
 int raise(int) libcesque;
 int reboot(int) libcesque;
-int remove(const char *) libcesque __read_only(1);
-int rename(const char *, const char *) libcesque __read_only(1) __read_only(2);
-int renameat(int, const char *, int, const char *) libcesque __read_only(2) __read_only(4);
-int rmdir(const char *) libcesque __read_only(1);
+int remove(const char *) libcesque;
+int rename(const char *, const char *) libcesque;
+int renameat(int, const char *, int, const char *) libcesque;
+int rmdir(const char *) libcesque;
 int sched_yield(void) libcesque;
 int setegid(unsigned) libcesque;
 int seteuid(unsigned) libcesque;
 int setfsgid(unsigned) libcesque;
 int setfsuid(unsigned) libcesque;
 int setgid(unsigned) libcesque;
-int setgroups(size_t, const unsigned[]) libcesque __read_only(2);
+int setgroups(size_t, const unsigned[]) libcesque;
 int setpgid(int, int) libcesque;
 int setpgrp(void) libcesque;
 int setpriority(int, unsigned, int) libcesque;
@@ -158,32 +157,32 @@ int setregid(unsigned, unsigned) libcesque;
 int setreuid(unsigned, unsigned) libcesque;
 int setsid(void) libcesque;
 int setuid(unsigned) libcesque;
-int shm_open(const char *, int, unsigned) libcesque __read_only(1);
-int shm_unlink(const char *) libcesque __read_only(1);
+int shm_open(const char *, int, unsigned) libcesque;
+int shm_unlink(const char *) libcesque;
 int sigignore(int) libcesque;
 int siginterrupt(int, int) libcesque;
-int symlink(const char *, const char *) libcesque __read_only(1) __read_only(2);
-int symlinkat(const char *, int, const char *) libcesque __read_only(1) __read_only(3);
+int symlink(const char *, const char *) libcesque;
+int symlinkat(const char *, int, const char *) libcesque;
 int tcgetpgrp(int) libcesque;
 int tcsetpgrp(int, int) libcesque;
-int truncate(const char *, int64_t) libcesque __read_only(1);
-int ttyname_r(int, char *, size_t) libcesque __write_only(2, 3);
-int unlink(const char *) libcesque __read_only(1);
-int unlinkat(int, const char *, int) libcesque __read_only(2);
+int truncate(const char *, int64_t) libcesque;
+int ttyname_r(int, char *, size_t) libcesque;
+int unlink(const char *) libcesque;
+int unlinkat(int, const char *, int) libcesque;
 int usleep(uint64_t) libcesque;
 int vfork(void) libcesque returnstwice;
-int wait(int *) libcesque __write_only(1);
-int waitpid(int, int *, int) libcesque __write_only(2);
+int wait(int *) libcesque;
+int waitpid(int, int *, int) libcesque;
 int64_t clock(void) libcesque;
-int64_t time(int64_t *) libcesque __write_only(1);
-ssize_t copy_file_range(int, long *, int, long *, size_t, unsigned) libcesque __read_write(2) __read_write(4);
+int64_t time(int64_t *) libcesque;
+ssize_t copy_file_range(int, long *, int, long *, size_t, unsigned) libcesque;
 ssize_t lseek(int, int64_t, int) libcesque;
-ssize_t pread(int, void *, size_t, int64_t) libcesque __write_only(2, 3);
-ssize_t pwrite(int, const void *, size_t, int64_t) libcesque __read_only(2);
-ssize_t read(int, void *, size_t) libcesque __write_only(2, 3);
-ssize_t readlink(const char *, char *, size_t) libcesque __read_only(1) __write_only(2, 3);
-ssize_t readlinkat(int, const char *, char *, size_t) libcesque __read_only(2) __write_only(3, 4);
-ssize_t write(int, const void *, size_t) libcesque __read_only(2);
+ssize_t pread(int, void *, size_t, int64_t) libcesque;
+ssize_t pwrite(int, const void *, size_t, int64_t) libcesque;
+ssize_t read(int, void *, size_t) libcesque;
+ssize_t readlink(const char *, char *, size_t) libcesque;
+ssize_t readlinkat(int, const char *, char *, size_t) libcesque;
+ssize_t write(int, const void *, size_t) libcesque;
 unsigned alarm(unsigned) libcesque;
 unsigned getegid(void) libcesque nosideeffect;
 unsigned geteuid(void) libcesque nosideeffect;
@@ -200,50 +199,50 @@ int prctl(int, ...) libcesque;
 int gettid(void) libcesque;
 int setresgid(unsigned, unsigned, unsigned) libcesque;
 int setresuid(unsigned, unsigned, unsigned) libcesque;
-int getresgid(unsigned *, unsigned *, unsigned *) libcesque __write_only(1) __write_only(2) __write_only(3);
-int getresuid(unsigned *, unsigned *, unsigned *) libcesque __write_only(1) __write_only(2) __write_only(3);
+int getresgid(unsigned *, unsigned *, unsigned *) libcesque;
+int getresuid(unsigned *, unsigned *, unsigned *) libcesque;
 char *get_current_dir_name(void) libcesque __wur;
-ssize_t splice(int, int64_t *, int, int64_t *, size_t, unsigned) libcesque __read_write(2) __read_write(4);
-int memfd_create(const char *, unsigned int) libcesque __read_only(1);
-int execvpe(const char *, char *const[], char *const[]) libcesque __read_only(1) __read_only(2) __read_only(3);
-int euidaccess(const char *, int) libcesque __read_only(1);
-int eaccess(const char *, int) libcesque __read_only(1);
-int madvise(void *, uint64_t, int) libcesque __read_write(1);
-int getcpu(unsigned *, unsigned *) libcesque __write_only(1) __write_only(2);
+ssize_t splice(int, int64_t *, int, int64_t *, size_t, unsigned) libcesque;
+int memfd_create(const char *, unsigned int) libcesque;
+int execvpe(const char *, char *const[], char *const[]) libcesque;
+int euidaccess(const char *, int) libcesque;
+int eaccess(const char *, int) libcesque;
+int madvise(void *, uint64_t, int) libcesque;
+int getcpu(unsigned *, unsigned *) libcesque;
 #endif
 
 #ifdef _COSMO_SOURCE
 bool32 fdexists(int) libcesque;
-bool32 fileexists(const char *) libcesque __read_only(1);
+bool32 fileexists(const char *) libcesque;
 bool32 ischardev(int) libcesque;
-bool32 isdirectory(const char *) libcesque __read_only(1);
-bool32 isexecutable(const char *) libcesque __read_only(1);
-bool32 isregularfile(const char *) libcesque __read_only(1);
-bool32 issymlink(const char *) libcesque __read_only(1);
-char *commandv(const char *, char *, size_t) libcesque __read_only(1) __write_only(2, 3);
-int __getcwd(char *, size_t) libcesque __write_only(1, 2);
+bool32 isdirectory(const char *) libcesque;
+bool32 isexecutable(const char *) libcesque;
+bool32 isregularfile(const char *) libcesque;
+bool32 issymlink(const char *) libcesque;
+char *commandv(const char *, char *, size_t) libcesque;
+int __getcwd(char *, size_t) libcesque;
 int clone(void *, void *, size_t, int, void *, void *, void *, void *);
 int fadvise(int, uint64_t, uint64_t, int) libcesque;
-int makedirs(const char *, unsigned) libcesque __read_only(1);
-int pivot_root(const char *, const char *) libcesque __read_only(1) __read_only(2);
-int pledge(const char *, const char *) libcesque __read_only(1) __read_only(2);
-int seccomp(unsigned, unsigned, void *) libcesque __read_only(3);
+int makedirs(const char *, unsigned) libcesque;
+int pivot_root(const char *, const char *) libcesque;
+int pledge(const char *, const char *) libcesque;
+int seccomp(unsigned, unsigned, void *) libcesque;
 int sys_iopl(int) libcesque;
 int sys_ioprio_get(int, int) libcesque;
 int sys_ioprio_set(int, int, int) libcesque;
-int sys_mlock(const void *, size_t) libcesque __read_only(1);
-int sys_mlock2(const void *, size_t, int) libcesque __read_only(1);
+int sys_mlock(const void *, size_t) libcesque;
+int sys_mlock2(const void *, size_t, int) libcesque;
 int sys_mlockall(int) libcesque;
-int sys_munlock(const void *, size_t) libcesque __read_only(1);
+int sys_munlock(const void *, size_t) libcesque;
 int sys_munlockall(void) libcesque;
 int sys_personality(uint64_t) libcesque;
 int sys_ptrace(int, ...) libcesque;
-int sysctl(int *, unsigned, void *, size_t *, void *, size_t) libcesque __read_write(1) __read_write(4) __read_write(5);
-int sysctlbyname(const char *, void *, size_t *, void *, size_t) libcesque __read_only(1) __write_only(2) __read_write(3) __read_only(4);
-int sysctlnametomib(const char *, int *, size_t *) libcesque __read_only(1) __write_only(2) __read_write(3);
+int sysctl(int *, unsigned, void *, size_t *, void *, size_t) libcesque;
+int sysctlbyname(const char *, void *, size_t *, void *, size_t) libcesque;
+int sysctlnametomib(const char *, int *, size_t *) libcesque;
 int tmpfd(void) libcesque;
-int touch(const char *, unsigned) libcesque __read_only(1);
-int unveil(const char *, const char *) libcesque __read_only(1);
+int touch(const char *, unsigned) libcesque;
+int unveil(const char *, const char *) libcesque;
 long ptrace(int, ...) libcesque;
 ssize_t copyfd(int, int, size_t) libcesque;
 ssize_t readansi(int, char *, size_t) libcesque;
diff --git a/libc/calls/chdir-nt.c b/libc/calls/chdir-nt.c
index 86e104bc8..ebc052c5a 100644
--- a/libc/calls/chdir-nt.c
+++ b/libc/calls/chdir-nt.c
@@ -16,48 +16,70 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/limits.h"
+#include "libc/errno.h"
+#include "libc/macros.internal.h"
+#include "libc/nt/errors.h"
 #include "libc/nt/files.h"
 #include "libc/nt/process.h"
+#include "libc/nt/runtime.h"
+#include "libc/nt/synchronization.h"
 #include "libc/sysv/errfuns.h"
 
 textwindows int sys_chdir_nt_impl(char16_t path[hasatleast PATH_MAX],
                                   uint32_t len) {
   uint32_t n;
+  int e, ms, err;
   char16_t var[4];
+
   if (len && path[len - 1] != u'\\') {
     if (len + 2 > PATH_MAX)
       return enametoolong();
     path[len + 0] = u'\\';
     path[len + 1] = u'\0';
   }
-  if (SetCurrentDirectory(path)) {
-    /*
-     * Now we need to set a magic environment variable.
-     */
-    if ((n = GetCurrentDirectory(PATH_MAX, path))) {
-      if (n < PATH_MAX) {
-        if (!((path[0] == '/' && path[1] == '/') ||
-              (path[0] == '\\' && path[1] == '\\'))) {
-          var[0] = '=';
-          var[1] = path[0];
-          var[2] = ':';
-          var[3] = 0;
-          if (!SetEnvironmentVariable(var, path))
-            return __winerr();
+
+  /*
+   * chdir() seems flaky on windows 7
+   * in a similar way to rmdir() sigh
+   */
+  for (err = errno, ms = 1;; ms *= 2) {
+    if (SetCurrentDirectory(path)) {
+      /*
+       * Now we need to set a magic environment variable.
+       */
+      if ((n = GetCurrentDirectory(PATH_MAX, path))) {
+        if (n < PATH_MAX) {
+          if (!((path[0] == '/' && path[1] == '/') ||
+                (path[0] == '\\' && path[1] == '\\'))) {
+            var[0] = '=';
+            var[1] = path[0];
+            var[2] = ':';
+            var[3] = 0;
+            if (!SetEnvironmentVariable(var, path)) {
+              return __winerr();
+            }
+          }
+          return 0;
+        } else {
+          return enametoolong();
         }
-        return 0;
       } else {
-        return enametoolong();
+        return __winerr();
       }
     } else {
-      return __winerr();
+      e = GetLastError();
+      if (ms <= 512 &&
+          (e == kNtErrorFileNotFound || e == kNtErrorAccessDenied)) {
+        Sleep(ms);
+        errno = err;
+        continue;
+      } else {
+        break;
+      }
     }
-  } else {
-    return __fix_enotdir(__winerr(), path);
   }
+  return __fix_enotdir(-1, path);
 }
 
 textwindows int sys_chdir_nt(const char *path) {
diff --git a/libc/intrin/checkcancel.c b/libc/calls/checkcancel.c
similarity index 84%
rename from libc/intrin/checkcancel.c
rename to libc/calls/checkcancel.c
index 51e1bfee7..8b95bf3cd 100644
--- a/libc/intrin/checkcancel.c
+++ b/libc/calls/checkcancel.c
@@ -21,16 +21,12 @@
 #include "libc/intrin/weaken.h"
 #include "libc/thread/posixthread.internal.h"
 
-textwindows bool _is_canceled(void) {
-  struct PosixThread *pt;
-  return _weaken(_pthread_cancel_ack) && (pt = _pthread_self()) &&
-         atomic_load_explicit(&pt->pt_canceled, memory_order_acquire) &&
-         !(pt->pt_flags & PT_NOCANCEL);
-}
-
-textwindows int _check_cancel(void) {
-  if (_is_canceled())
-    // once acknowledged _is_canceled() will return false
+int _check_cancel(void) {
+  if (_weaken(_pthread_cancel_ack) &&  //
+      _pthread_self() && !(_pthread_self()->pt_flags & PT_NOCANCEL) &&
+      atomic_load_explicit(&_pthread_self()->pt_canceled,
+                           memory_order_acquire)) {
     return _weaken(_pthread_cancel_ack)();
+  }
   return 0;
 }
diff --git a/libc/calls/clock_getres.c b/libc/calls/clock_getres.c
index 39ba39fc0..fae7959c4 100644
--- a/libc/calls/clock_getres.c
+++ b/libc/calls/clock_getres.c
@@ -20,37 +20,24 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
-#include "libc/runtime/clktck.h"
 #include "libc/sysv/consts/clock.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/time.h"
 
-static uint64_t hz_to_nanos(uint64_t frequency) {
-  if (!frequency)
+static int sys_clock_getres_poly(int clock, struct timespec *ts, int64_t real,
+                                 int64_t real_coarse, int64_t boot) {
+  ts->tv_sec = 0;
+  if (clock == CLOCK_REALTIME) {
+    ts->tv_nsec = real;
     return 0;
-  uint64_t quotient = 1000000000 / frequency;
-  uint64_t remainder = 1000000000 % frequency;
-  if (remainder > 0)
-    quotient += 1;
-  return quotient;
-}
-
-static int sys_clock_getres_poly(int clock, struct timespec *ts, int64_t prec) {
-  if (ts)
-    ts->tv_sec = 0;
-  if (clock == CLOCK_REALTIME ||   //
-      clock == CLOCK_BOOTTIME ||   //
-      clock == CLOCK_MONOTONIC ||  //
-      clock == CLOCK_MONOTONIC_RAW) {
-    if (ts)
-      ts->tv_nsec = prec;
+  } else if (clock == CLOCK_REALTIME_COARSE) {
+    ts->tv_nsec = real_coarse;
     return 0;
-  } else if (clock == CLOCK_REALTIME_COARSE ||
-             clock == CLOCK_MONOTONIC_COARSE ||
-             clock == CLOCK_THREAD_CPUTIME_ID ||
-             clock == CLOCK_PROCESS_CPUTIME_ID) {
-    if (ts)
-      *ts = timespec_fromnanos(hz_to_nanos(CLK_TCK));
+  } else if (clock == CLOCK_MONOTONIC) {
+    ts->tv_nsec = 10;
+    return 0;
+  } else if (clock == CLOCK_BOOTTIME) {
+    ts->tv_nsec = boot;
     return 0;
   } else {
     return einval();
@@ -58,11 +45,11 @@ static int sys_clock_getres_poly(int clock, struct timespec *ts, int64_t prec) {
 }
 
 static int sys_clock_getres_nt(int clock, struct timespec *ts) {
-  return sys_clock_getres_poly(clock, ts, 100);
+  return sys_clock_getres_poly(clock, ts, 100, 1000000, 1000000);
 }
 
 static int sys_clock_getres_xnu(int clock, struct timespec *ts) {
-  return sys_clock_getres_poly(clock, ts, 1000);
+  return sys_clock_getres_poly(clock, ts, 1000, 1000, 1000);
 }
 
 /**
diff --git a/libc/intrin/clock_gettime-mono.c b/libc/calls/clock_gettime-mono.c
similarity index 64%
rename from libc/intrin/clock_gettime-mono.c
rename to libc/calls/clock_gettime-mono.c
index 937967f57..3bc38f37f 100644
--- a/libc/intrin/clock_gettime-mono.c
+++ b/libc/calls/clock_gettime-mono.c
@@ -18,63 +18,42 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/atomic.h"
 #include "libc/calls/struct/timespec.h"
-#include "libc/calls/struct/timespec.internal.h"
-#include "libc/calls/struct/timeval.h"
 #include "libc/cosmo.h"
-#include "libc/dce.h"
+#include "libc/errno.h"
 #include "libc/nexgen32e/rdtsc.h"
+#include "libc/nexgen32e/x86feature.h"
 
 /**
- * @fileoverview Monotonic clock polyfill.
- *
- * This isn't quite `CLOCK_MONOTONIC` and isn't quite `CLOCK_BOOTTIME`
- * either; however it is fast and almost always goes in one direction.
- *
- * Intel architecture guarantees that a mapping exists between rdtsc &
- * nanoseconds only if the cpu advertises invariant timestamps support
- * however this shouldn't matter for a monotonic clock since we really
- * don't want to have it tick while suspended. Sadly that shall happen
- * since nearly all x86 microprocessors support invariant tsc which is
- * why we try to avoid this fallback when possible.
+ * @fileoverview Fast Monotonic Clock Polyfill for XNU/NT.
  */
 
-int sys_sysctl(int *, unsigned, void *, size_t *, void *, size_t) libcesque;
-
 static struct {
   atomic_uint once;
-  unsigned long base;
-  struct timespec boot;
+  struct timespec base_wall;
+  uint64_t base_tick;
 } g_mono;
 
-static struct timespec get_boot_time_xnu(void) {
-  struct timeval t;
-  size_t n = sizeof(t);
-  int mib[] = {1 /* CTL_KERN */, 21 /* KERN_BOOTTIME */};
-  if (sys_sysctl(mib, 2, &t, &n, 0, 0) == -1)
-    __builtin_trap();
-  return timeval_totimespec(t);
-}
-
 static void sys_clock_gettime_mono_init(void) {
-  g_mono.base = rdtsc();
-  if (IsXnu()) {
-    g_mono.boot = get_boot_time_xnu();
-  } else {
-    __builtin_trap();
-  }
+  g_mono.base_wall = timespec_real();
+  g_mono.base_tick = rdtsc();
 }
 
 int sys_clock_gettime_mono(struct timespec *time) {
   uint64_t nanos;
   uint64_t cycles;
+#ifdef __x86_64__
+  // intel architecture guarantees that a mapping exists between rdtsc &
+  // nanoseconds only if the cpu advertises invariant timestamps support
+  if (!X86_HAVE(INVTSC))
+    return -EINVAL;
+#endif
   cosmo_once(&g_mono.once, sys_clock_gettime_mono_init);
-  // ensure we get the full 64 bits of counting, which avoids wraparound
-  cycles = rdtsc() - g_mono.base;
+  cycles = rdtsc() - g_mono.base_tick;
   // this is a crude approximation, that's worked reasonably well so far
   // only the kernel knows the actual mapping between rdtsc and nanosecs
   // which we could attempt to measure ourselves using clock_gettime but
   // we'd need to impose 100 ms of startup latency for a guess this good
   nanos = cycles / 3;
-  *time = timespec_add(g_mono.boot, timespec_fromnanos(nanos));
+  *time = timespec_add(g_mono.base_wall, timespec_fromnanos(nanos));
   return 0;
 }
diff --git a/libc/calls/clock_gettime-nt.c b/libc/calls/clock_gettime-nt.c
new file mode 100644
index 000000000..5a6464e42
--- /dev/null
+++ b/libc/calls/clock_gettime-nt.c
@@ -0,0 +1,95 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/struct/timespec.internal.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
+#include "libc/fmt/wintime.internal.h"
+#include "libc/nt/accounting.h"
+#include "libc/nt/runtime.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+
+#define _CLOCK_REALTIME           0
+#define _CLOCK_MONOTONIC          1
+#define _CLOCK_REALTIME_COARSE    2
+#define _CLOCK_BOOTTIME           3
+#define _CLOCK_PROCESS_CPUTIME_ID 4
+#define _CLOCK_THREAD_CPUTIME_ID  5
+
+static struct {
+  uint64_t base;
+  uint64_t freq;
+} g_winclock;
+
+textwindows int sys_clock_gettime_nt(int clock, struct timespec *ts) {
+  uint64_t t;
+  struct NtFileTime ft, ftExit, ftUser, ftKernel, ftCreation;
+  switch (clock) {
+    case _CLOCK_REALTIME:
+      if (ts) {
+        GetSystemTimePreciseAsFileTime(&ft);
+        *ts = FileTimeToTimeSpec(ft);
+      }
+      return 0;
+    case _CLOCK_REALTIME_COARSE:
+      if (ts) {
+        GetSystemTimeAsFileTime(&ft);
+        *ts = FileTimeToTimeSpec(ft);
+      }
+      return 0;
+    case _CLOCK_MONOTONIC:
+      if (ts) {
+        QueryPerformanceCounter(&t);
+        t = ((t - g_winclock.base) * 1000000000) / g_winclock.freq;
+        *ts = timespec_fromnanos(t);
+      }
+      return 0;
+    case _CLOCK_BOOTTIME:
+      if (ts) {
+        *ts = timespec_frommillis(GetTickCount64());
+      }
+      return 0;
+    case _CLOCK_PROCESS_CPUTIME_ID:
+      if (ts) {
+        GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit, &ftKernel,
+                        &ftUser);
+        *ts = WindowsDurationToTimeSpec(ReadFileTime(ftUser) +
+                                        ReadFileTime(ftKernel));
+      }
+      return 0;
+    case _CLOCK_THREAD_CPUTIME_ID:
+      if (ts) {
+        GetThreadTimes(GetCurrentThread(), &ftCreation, &ftExit, &ftKernel,
+                       &ftUser);
+        *ts = WindowsDurationToTimeSpec(ReadFileTime(ftUser) +
+                                        ReadFileTime(ftKernel));
+      }
+      return 0;
+    default:
+      return -EINVAL;
+  }
+}
+
+__attribute__((__constructor__(40))) static textstartup void winclock_init() {
+  if (IsWindows()) {
+    QueryPerformanceCounter(&g_winclock.base);
+    QueryPerformanceFrequency(&g_winclock.freq);
+  }
+}
diff --git a/libc/intrin/clock_gettime-sysv.c b/libc/calls/clock_gettime-sysv.c
similarity index 100%
rename from libc/intrin/clock_gettime-sysv.c
rename to libc/calls/clock_gettime-sysv.c
diff --git a/libc/intrin/clock_gettime-xnu.c b/libc/calls/clock_gettime-xnu.c
similarity index 78%
rename from libc/intrin/clock_gettime-xnu.c
rename to libc/calls/clock_gettime-xnu.c
index a0eb5b9e2..e9548884e 100644
--- a/libc/intrin/clock_gettime-xnu.c
+++ b/libc/calls/clock_gettime-xnu.c
@@ -21,10 +21,13 @@
 #include "libc/calls/struct/timeval.h"
 #include "libc/calls/struct/timeval.internal.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/clock.h"
 #ifdef __x86_64__
 
+#define CTL_KERN      1
+#define KERN_BOOTTIME 21
+
 int sys_clock_gettime_xnu(int clock, struct timespec *ts) {
   long ax, dx;
   if (clock == CLOCK_REALTIME) {
@@ -44,20 +47,31 @@ int sys_clock_gettime_xnu(int clock, struct timespec *ts) {
     //   2. old xnu returns *ts in rax:rdx regs
     //
     // we assume this system call always succeeds
-    asm volatile("syscall"
-                 : "=a"(ax), "=d"(dx)
-                 : "0"(0x2000000 | 116), "D"(ts), "S"(0), "1"(0)
-                 : "rcx", "r8", "r9", "r10", "r11", "memory");
-    if (ax) {
-      ts->tv_sec = ax;
-      ts->tv_nsec = dx;
+    if (ts) {
+      asm volatile("syscall"
+                   : "=a"(ax), "=d"(dx)
+                   : "0"(0x2000000 | 116), "D"(ts), "S"(0), "1"(0)
+                   : "rcx", "r8", "r9", "r10", "r11", "memory");
+      if (ax) {
+        ts->tv_sec = ax;
+        ts->tv_nsec = dx;
+      }
+      ts->tv_nsec *= 1000;
     }
-    ts->tv_nsec *= 1000;
     return 0;
-  } else if (clock == CLOCK_BOOTTIME ||   //
-             clock == CLOCK_MONOTONIC ||  //
-             clock == CLOCK_MONOTONIC_COARSE) {
+  } else if (clock == CLOCK_MONOTONIC) {
+    if (!ts)
+      return 0;
     return sys_clock_gettime_mono(ts);
+  } else if (clock == CLOCK_BOOTTIME) {
+    struct timeval x;
+    size_t n = sizeof(x);
+    int mib[] = {CTL_KERN, KERN_BOOTTIME};
+    if (sysctl(mib, ARRAYLEN(mib), &x, &n, 0, 0) == -1)
+      return -1;
+    if (ts)
+      *ts = timeval_totimespec(timeval_sub(timeval_real(), x));
+    return 0;
   } else {
     return -EINVAL;
   }
diff --git a/libc/calls/clock_gettime.c b/libc/calls/clock_gettime.c
new file mode 100644
index 000000000..2aac56c98
--- /dev/null
+++ b/libc/calls/clock_gettime.c
@@ -0,0 +1,91 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=8 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/struct/timespec.internal.h"
+#include "libc/calls/syscall_support-sysv.internal.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
+#include "libc/intrin/describeflags.h"
+#include "libc/intrin/strace.h"
+#include "libc/runtime/syslib.internal.h"
+
+#ifdef __aarch64__
+#define CGT_VDSO __vdsosym("LINUX_2.6.39", "__kernel_clock_gettime")
+#else
+#define CGT_VDSO __vdsosym("LINUX_2.6", "__vdso_clock_gettime")
+#endif
+
+typedef int clock_gettime_f(int, struct timespec *);
+
+static clock_gettime_f *__clock_gettime_get(void) {
+  clock_gettime_f *cgt;
+  if (IsLinux() && (cgt = CGT_VDSO)) {
+    return cgt;
+  } else if (__syslib) {
+    return (void *)__syslib->__clock_gettime;
+  } else if (IsWindows()) {
+    return sys_clock_gettime_nt;
+#ifdef __x86_64__
+  } else if (IsXnu()) {
+    return sys_clock_gettime_xnu;
+#endif
+  } else {
+    return sys_clock_gettime;
+  }
+}
+
+static int __clock_gettime_init(int, struct timespec *);
+static clock_gettime_f *__clock_gettime = __clock_gettime_init;
+static int __clock_gettime_init(int clockid, struct timespec *ts) {
+  clock_gettime_f *cgt;
+  __clock_gettime = cgt = __clock_gettime_get();
+  return cgt(clockid, ts);
+}
+
+/**
+ * Returns nanosecond time.
+ *
+ * @param clock supports the following values across OSes:
+ *    - `CLOCK_REALTIME`
+ *    - `CLOCK_MONOTONIC`
+ *    - `CLOCK_REALTIME_COARSE`
+ *    - `CLOCK_MONOTONIC_COARSE`
+ *    - `CLOCK_THREAD_CPUTIME_ID`
+ *    - `CLOCK_PROCESS_CPUTIME_ID`
+ * @param ts is where the result is stored (or null to do clock check)
+ * @return 0 on success, or -1 w/ errno
+ * @raise EFAULT if `ts` points to invalid memory
+ * @error EINVAL if `clock` isn't supported on this system
+ * @error EPERM if pledge() is in play without stdio promise
+ * @error ESRCH on NetBSD if PID/TID OR'd into `clock` wasn't found
+ * @see strftime(), gettimeofday()
+ * @asyncsignalsafe
+ * @vforksafe
+ */
+int clock_gettime(int clock, struct timespec *ts) {
+  // threads on win32 stacks call this so we can't asan check *ts
+  int rc = __clock_gettime(clock, ts);
+  if (rc) {
+    errno = -rc;
+    rc = -1;
+  }
+  TIMETRACE("clock_gettime(%s, [%s]) → %d% m", DescribeClockName(clock),
+            DescribeTimespec(rc, ts), rc);
+  return rc;
+}
diff --git a/libc/calls/clock_nanosleep-cosmo.c b/libc/calls/clock_nanosleep-cosmo.c
new file mode 100644
index 000000000..f49c4c50f
--- /dev/null
+++ b/libc/calls/clock_nanosleep-cosmo.c
@@ -0,0 +1,85 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/calls/internal.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/errno.h"
+#include "libc/runtime/clktck.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sysv/consts/clock.h"
+#include "libc/sysv/consts/timer.h"
+
+/**
+ * Sleeps with higher accuracy at the cost of cpu.
+ */
+int cosmo_clock_nanosleep(int clock, int flags, const struct timespec *req,
+                          struct timespec *rem) {
+
+  // pick clocks
+  int time_clock;
+  int sleep_clock;
+  if (clock == CLOCK_REALTIME ||  //
+      clock == CLOCK_REALTIME_PRECISE) {
+    time_clock = clock;
+    sleep_clock = CLOCK_REALTIME;
+  } else if (clock == CLOCK_MONOTONIC ||  //
+             clock == CLOCK_MONOTONIC_PRECISE) {
+    time_clock = clock;
+    sleep_clock = CLOCK_MONOTONIC;
+  } else if (clock == CLOCK_REALTIME_COARSE ||  //
+             clock == CLOCK_REALTIME_FAST) {
+    return sys_clock_nanosleep(CLOCK_REALTIME, flags, req, rem);
+  } else if (clock == CLOCK_MONOTONIC_COARSE ||  //
+             clock == CLOCK_MONOTONIC_FAST) {
+    return sys_clock_nanosleep(CLOCK_MONOTONIC, flags, req, rem);
+  } else {
+    return sys_clock_nanosleep(clock, flags, req, rem);
+  }
+
+  // sleep bulk of time in kernel
+  struct timespec start, deadline, remain, waitfor, now;
+  struct timespec quantum = timespec_fromnanos(1000000000 / CLK_TCK);
+  clock_gettime(time_clock, &start);
+  deadline = flags & TIMER_ABSTIME ? *req : timespec_add(start, *req);
+  if (timespec_cmp(start, deadline) >= 0)
+    return 0;
+  remain = timespec_sub(deadline, start);
+  if (timespec_cmp(remain, quantum) > 0) {
+    waitfor = timespec_sub(remain, quantum);
+    if (sys_clock_nanosleep(sleep_clock, 0, &waitfor, rem) == -1) {
+      if (!flags && rem && errno == EINTR) {
+        *rem = timespec_add(*rem, quantum);
+      }
+      return -1;
+    }
+  }
+
+  // spin through final scheduling quantum
+  int rc = 0;
+  ftrace_enabled(-1);
+  do {
+    if (_check_cancel()) {
+      rc = -1;
+      break;
+    }
+    clock_gettime(time_clock, &now);
+  } while (timespec_cmp(now, deadline) < 0);
+  ftrace_enabled(+1);
+  return rc;
+}
diff --git a/libc/calls/clock_nanosleep-nt.c b/libc/calls/clock_nanosleep-nt.c
index 1e1a09cbd..6d7adc5be 100644
--- a/libc/calls/clock_nanosleep-nt.c
+++ b/libc/calls/clock_nanosleep-nt.c
@@ -16,45 +16,30 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/calls/struct/timespec.internal.h"
-#include "libc/calls/syscall-sysv.internal.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
-#include "libc/nt/enum/status.h"
-#include "libc/nt/ntdll.h"
-#include "libc/stdio/sysparam.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/sysv/consts/timer.h"
 #include "libc/thread/tls.h"
 #ifdef __x86_64__
 
-static atomic_int usingRes;
-static atomic_bool changedRes;
-
 static textwindows int sys_clock_nanosleep_nt_impl(int clock,
                                                    struct timespec abs,
                                                    sigset_t waitmask) {
-  struct timespec now, wall;
-  uint32_t minRes, maxRes, oldRes;
-  sys_clock_gettime_nt(0, &wall);
-  if (sys_clock_gettime_nt(clock, &now))
-    return -1;
-  bool wantRes = clock == CLOCK_REALTIME ||   //
-                 clock == CLOCK_MONOTONIC ||  //
-                 clock == CLOCK_BOOTTIME;
-  if (wantRes && !atomic_fetch_add(&usingRes, 1))
-    changedRes = NtSuccess(NtQueryTimerResolution(&minRes, &maxRes, &oldRes)) &&
-                 NtSuccess(NtSetTimerResolution(maxRes, true, &oldRes));
-  if (timespec_cmp(abs, now) > 0)
-    wall = timespec_add(wall, timespec_sub(abs, now));
-  int rc = _park_norestart(wall, waitmask);
-  if (wantRes && atomic_fetch_sub(&usingRes, 1) == 1 && changedRes)
-    NtSetTimerResolution(0, false, &minRes);
-  return rc;
+  uint32_t msdelay;
+  struct timespec now;
+  for (;;) {
+    if (sys_clock_gettime_nt(clock, &now))
+      return -1;
+    if (timespec_cmp(now, abs) >= 0)
+      return 0;
+    msdelay = timespec_tomillis(timespec_sub(abs, now));
+    if (_park_norestart(msdelay, waitmask))
+      return -1;
+  }
 }
 
 textwindows int sys_clock_nanosleep_nt(int clock, int flags,
@@ -63,17 +48,15 @@ textwindows int sys_clock_nanosleep_nt(int clock, int flags,
   int rc;
   struct timespec abs, now;
   sigset_t m = __sig_block();
-  if (flags) {
+  if (flags & TIMER_ABSTIME) {
     abs = *req;
   } else {
-    if ((rc = sys_clock_gettime_nt(clock, &now))) {
-      rc = _sysret(rc);
+    if ((rc = sys_clock_gettime_nt(clock, &now)))
       goto BailOut;
-    }
     abs = timespec_add(now, *req);
   }
   rc = sys_clock_nanosleep_nt_impl(clock, abs, m);
-  if (rc == -1 && !flags && rem && errno == EINTR) {
+  if (rc == -1 && rem && errno == EINTR) {
     sys_clock_gettime_nt(clock, &now);
     *rem = timespec_subz(abs, now);
   }
diff --git a/libc/calls/clock_nanosleep-openbsd.c b/libc/calls/clock_nanosleep-openbsd.c
index dec285314..e1d67bcef 100644
--- a/libc/calls/clock_nanosleep-openbsd.c
+++ b/libc/calls/clock_nanosleep-openbsd.c
@@ -18,27 +18,28 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/timespec.h"
 #include "libc/calls/struct/timespec.internal.h"
-#include "libc/calls/syscall-sysv.internal.h"
-#include "libc/errno.h"
 #include "libc/sysv/consts/clock.h"
 #include "libc/sysv/errfuns.h"
 
-relegated int sys_clock_nanosleep_openbsd(int clock, int flags,
-                                          const struct timespec *req,
-                                          struct timespec *rem) {
+int sys_clock_nanosleep_openbsd(int clock, int flags,
+                                const struct timespec *req,
+                                struct timespec *rem) {
   int res;
-  struct timespec start, relative, remainder;
-  if (!flags) {
-    relative = *req;
+  struct timespec now, rel;
+  if (clock == CLOCK_REALTIME) {
+    if (!flags) {
+      res = sys_nanosleep(req, rem);
+    } else {
+      sys_clock_gettime(clock, &now);
+      if (timespec_cmp(*req, now) > 0) {
+        rel = timespec_sub(*req, now);
+        res = sys_nanosleep(&rel, 0);
+      } else {
+        res = 0;
+      }
+    }
   } else {
-    if ((res = sys_clock_gettime(clock, &start)))
-      return _sysret(res);
-    if (timespec_cmp(start, *req) >= 0)
-      return 0;
-    relative = timespec_sub(*req, start);
+    res = enotsup();
   }
-  res = sys_nanosleep(&relative, &remainder);
-  if (res == -1 && errno == EINTR && rem && !flags)
-    *rem = remainder;
   return res;
 }
diff --git a/libc/calls/clock_nanosleep-xnu.c b/libc/calls/clock_nanosleep-xnu.c
index 83a358f8e..23d0f2125 100644
--- a/libc/calls/clock_nanosleep-xnu.c
+++ b/libc/calls/clock_nanosleep-xnu.c
@@ -35,10 +35,8 @@ int sys_clock_nanosleep_xnu(int clock, int flags, const struct timespec *req,
                             struct timespec *rem) {
 #ifdef __x86_64__
   if (flags & TIMER_ABSTIME) {
-    int nerr;
     struct timespec now;
-    if ((nerr = sys_clock_gettime_xnu(clock, &now)))
-      return _sysret(nerr);
+    sys_clock_gettime_xnu(clock, &now);
     if (timespec_cmp(*req, now) > 0) {
       struct timeval rel = timespec_totimeval(timespec_sub(*req, now));
       return sys_select(0, 0, 0, 0, &rel);
@@ -49,13 +47,12 @@ int sys_clock_nanosleep_xnu(int clock, int flags, const struct timespec *req,
     int rc;
     struct timespec beg;
     if (rem)
-      if ((rc = sys_clock_gettime_xnu(clock, &beg)))
-        return _sysret(rc);
+      sys_clock_gettime_xnu(CLOCK_REALTIME, &beg);
     struct timeval rel = timespec_totimeval(*req);  // rounds up
     rc = sys_select(0, 0, 0, 0, &rel);
     if (rc == -1 && rem && errno == EINTR) {
       struct timespec end;
-      sys_clock_gettime_xnu(clock, &end);
+      sys_clock_gettime_xnu(CLOCK_REALTIME, &end);
       *rem = timespec_subz(*req, timespec_sub(end, beg));
     }
     return rc;
@@ -64,8 +61,9 @@ int sys_clock_nanosleep_xnu(int clock, int flags, const struct timespec *req,
   long res;
   struct timespec abs, now, rel;
   if (_weaken(pthread_testcancel_np) &&  //
-      _weaken(pthread_testcancel_np)())
+      _weaken(pthread_testcancel_np)()) {
     return ecanceled();
+  }
   if (flags & TIMER_ABSTIME) {
     abs = *req;
     if (!(res = __syslib->__clock_gettime(clock, &now))) {
@@ -75,10 +73,7 @@ int sys_clock_nanosleep_xnu(int clock, int flags, const struct timespec *req,
       }
     }
   } else {
-    struct timespec remainder;
-    res = __syslib->__nanosleep(req, &remainder);
-    if (res == -EINTR && rem)
-      *rem = remainder;
+    res = __syslib->__nanosleep(req, rem);
   }
   if (res == -EINTR &&                    //
       (_weaken(pthread_testcancel_np) &&  //
diff --git a/libc/calls/clock_nanosleep.c b/libc/calls/clock_nanosleep.c
index 459e50328..20a6b03ee 100644
--- a/libc/calls/clock_nanosleep.c
+++ b/libc/calls/clock_nanosleep.c
@@ -19,7 +19,6 @@
 #include "libc/calls/struct/timespec.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/sysv/consts/timer.h"
 
 /**
@@ -57,10 +56,7 @@
  *
  * @param clock may be
  *     - `CLOCK_REALTIME`
- *     - `CLOCK_BOOTTIME`
  *     - `CLOCK_MONOTONIC`
- *     - `CLOCK_REALTIME_COARSE` but is likely to sleep negative time
- *     - `CLOCK_MONTONIC_COARSE` but is likely to sleep negative time
  * @param flags can be 0 for relative and `TIMER_ABSTIME` for absolute
  * @param req can be a relative or absolute time, depending on `flags`
  * @param rem shall be updated with the remainder of unslept time when
@@ -83,21 +79,18 @@
 errno_t clock_nanosleep(int clock, int flags,        //
                         const struct timespec *req,  //
                         struct timespec *rem) {
-  if (IsMetal())
+  if (IsMetal()) {
     return ENOSYS;
-  if (IsLinux() && clock == CLOCK_REALTIME_COARSE)
-    clock = CLOCK_REALTIME;
-  if (IsLinux() && clock == CLOCK_MONOTONIC_COARSE)
-    clock = CLOCK_MONOTONIC;
+  }
   if (clock == 127 ||              //
       (flags & ~TIMER_ABSTIME) ||  //
       req->tv_sec < 0 ||           //
-      !(0 <= req->tv_nsec && req->tv_nsec <= 999999999))
+      !(0 <= req->tv_nsec && req->tv_nsec <= 999999999)) {
     return EINVAL;
-  int rc;
-  errno_t err, old = errno;
-  rc = sys_clock_nanosleep(clock, flags, req, rem);
-  err = !rc ? 0 : errno;
+  }
+  errno_t old = errno;
+  int rc = sys_clock_nanosleep(clock, flags, req, rem);
+  errno_t err = !rc ? 0 : errno;
   errno = old;
   return err;
 }
diff --git a/libc/calls/close-nt.c b/libc/calls/close-nt.c
index 16557582f..74475c8d3 100644
--- a/libc/calls/close-nt.c
+++ b/libc/calls/close-nt.c
@@ -17,14 +17,13 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/weaken.h"
 #include "libc/nt/enum/filetype.h"
 #include "libc/nt/files.h"
 #include "libc/nt/runtime.h"
-#include "libc/runtime/runtime.h"
 #include "libc/runtime/zipos.internal.h"
 #include "libc/sock/syscall_fd.internal.h"
 #include "libc/sysv/consts/o.h"
@@ -52,6 +51,11 @@ textwindows int sys_close_nt(int fd, int fildes) {
         FlushFileBuffers(f->handle);
       }
       break;
+    case kFdEpoll:
+      if (_weaken(sys_close_epoll_nt)) {
+        return _weaken(sys_close_epoll_nt)(fd);
+      }
+      break;
     case kFdSocket:
       if (_weaken(sys_closesocket_nt)) {
         return _weaken(sys_closesocket_nt)(g_fds.p + fd);
@@ -60,7 +64,5 @@ textwindows int sys_close_nt(int fd, int fildes) {
     default:
       break;
   }
-  if (f->cursor)
-    __cursor_unref(f->cursor);
   return CloseHandle(f->handle) ? 0 : __winerr();
 }
diff --git a/libc/calls/close.c b/libc/calls/close.c
index 2de6a55ef..979d24937 100644
--- a/libc/calls/close.c
+++ b/libc/calls/close.c
@@ -20,12 +20,12 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
@@ -74,6 +74,7 @@ static int close_impl(int fd) {
  * - openat()
  * - socket()
  * - accept()
+ * - epoll_create()
  * - landlock_create_ruleset()
  *
  * This function should never be reattempted if an error is returned;
diff --git a/libc/calls/copy.c b/libc/calls/copy.c
index 9140d40e9..8be9d5c36 100644
--- a/libc/calls/copy.c
+++ b/libc/calls/copy.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 
 /**
diff --git a/libc/calls/copy_file_range.c b/libc/calls/copy_file_range.c
index 1a340710e..af90ab52d 100644
--- a/libc/calls/copy_file_range.c
+++ b/libc/calls/copy_file_range.c
@@ -83,11 +83,9 @@ static void copy_file_range_init(void) {
  * @return number of bytes transferred, or -1 w/ errno
  * @raise EXDEV if source and destination are on different filesystems
  * @raise EBADF if `infd` or `outfd` aren't open files or append-only
- * @raise EOPNOTSUPP if filesystem doesn't support this operation
  * @raise EPERM if `fdout` refers to an immutable file on Linux
  * @raise ECANCELED if thread was cancelled in masked mode
  * @raise EINVAL if ranges overlap or `flags` is non-zero
- * @raise EINVAL on eCryptFs filesystems that have a bug
  * @raise EFBIG if `setrlimit(RLIMIT_FSIZE)` is exceeded
  * @raise EFAULT if one of the pointers memory is bad
  * @raise ERANGE if overflow happens computing ranges
diff --git a/libc/calls/createpipename.c b/libc/calls/createpipename.c
index f8eed1531..a5d518522 100644
--- a/libc/calls/createpipename.c
+++ b/libc/calls/createpipename.c
@@ -17,10 +17,26 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/atomic.h"
-#include "libc/fmt/internal.h"
 #include "libc/intrin/atomic.h"
 #include "libc/runtime/internal.h"
 
+static textwindows char16_t *itoa16(char16_t p[21], uint64_t x) {
+  char t;
+  size_t a, b, i = 0;
+  do {
+    p[i++] = x % 10 + '0';
+    x = x / 10;
+  } while (x > 0);
+  if (i) {
+    for (a = 0, b = i - 1; a < b; ++a, --b) {
+      t = p[a];
+      p[a] = p[b];
+      p[b] = t;
+    }
+  }
+  return p + i;
+}
+
 // This function is called very early by WinMain().
 textwindows char16_t *__create_pipe_name(char16_t *a) {
   char16_t *p = a;
@@ -28,9 +44,9 @@ textwindows char16_t *__create_pipe_name(char16_t *a) {
   static atomic_uint x;
   while (*q)
     *p++ = *q++;
-  p = __itoa16(p, __pid);
+  p = itoa16(p, __pid);
   *p++ = '-';
-  p = __itoa16(p, atomic_fetch_add(&x, 1));
+  p = itoa16(p, atomic_fetch_add(&x, 1));
   *p = 0;
   return a;
 }
diff --git a/libc/calls/dup-nt.c b/libc/calls/dup-nt.c
index 4bf3f68fc..35a07a223 100644
--- a/libc/calls/dup-nt.c
+++ b/libc/calls/dup-nt.c
@@ -24,7 +24,6 @@
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/errno.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/weaken.h"
 #include "libc/nt/files.h"
@@ -83,7 +82,6 @@ static textwindows int sys_dup_nt_impl(int oldfd, int newfd, int flags,
 
   g_fds.p[newfd] = g_fds.p[oldfd];
   g_fds.p[newfd].handle = handle;
-  __cursor_ref(g_fds.p[newfd].cursor);
   if (flags & _O_CLOEXEC) {
     g_fds.p[newfd].flags |= _O_CLOEXEC;
   } else {
diff --git a/libc/calls/fcntl-nt.c b/libc/calls/fcntl-nt.c
index 77b8331a1..a94130412 100644
--- a/libc/calls/fcntl-nt.c
+++ b/libc/calls/fcntl-nt.c
@@ -20,18 +20,18 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/createfileflags.internal.h"
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/flock.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/calls/wincrash.internal.h"
 #include "libc/errno.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
 #include "libc/log/backtrace.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/leaks.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/createfile.h"
@@ -51,7 +51,6 @@
 #include "libc/sysv/consts/fio.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
 struct FileLock {
@@ -68,9 +67,7 @@ struct FileLocks {
   struct FileLock *free;
 };
 
-static struct FileLocks g_locks = {
-    .mu = PTHREAD_MUTEX_INITIALIZER,
-};
+static struct FileLocks g_locks;
 
 static textwindows struct FileLock *NewFileLock(void) {
   struct FileLock *fl;
@@ -113,7 +110,7 @@ static textwindows bool EqualsFileLock(struct FileLock *fl, int64_t off,
 
 textwindows void sys_fcntl_nt_lock_cleanup(int fd) {
   struct FileLock *fl, *ft, **flp;
-  _pthread_mutex_lock(&g_locks.mu);
+  pthread_mutex_lock(&g_locks.mu);
   for (flp = &g_locks.list, fl = *flp; fl;) {
     if (fl->fd == fd) {
       *flp = fl->next;
@@ -125,7 +122,7 @@ textwindows void sys_fcntl_nt_lock_cleanup(int fd) {
       fl = *flp;
     }
   }
-  _pthread_mutex_unlock(&g_locks.mu);
+  pthread_mutex_unlock(&g_locks.mu);
 }
 
 static textwindows int64_t GetfileSize(int64_t handle) {
@@ -154,7 +151,7 @@ static textwindows int sys_fcntl_nt_lock(struct Fd *f, int fd, int cmd,
     case SEEK_SET:
       break;
     case SEEK_CUR:
-      off = f->cursor->shared->pointer + off;
+      off = f->pointer + off;
       break;
     case SEEK_END: {
       int64_t size;
@@ -354,14 +351,9 @@ textwindows int sys_fcntl_nt(int fd, int cmd, uintptr_t arg) {
       }
       rc = 0;
     } else if (cmd == F_SETLK || cmd == F_SETLKW || cmd == F_GETLK) {
-      struct Fd *f = g_fds.p + fd;
-      if (f->cursor) {
-        _pthread_mutex_lock(&g_locks.mu);
-        rc = sys_fcntl_nt_lock(f, fd, cmd, arg);
-        _pthread_mutex_unlock(&g_locks.mu);
-      } else {
-        rc = ebadf();
-      }
+      pthread_mutex_lock(&g_locks.mu);
+      rc = sys_fcntl_nt_lock(g_fds.p + fd, fd, cmd, arg);
+      pthread_mutex_unlock(&g_locks.mu);
     } else if (cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC) {
       rc = sys_fcntl_nt_dupfd(fd, cmd, arg);
     } else {
diff --git a/libc/calls/finddebugbinary.c b/libc/calls/finddebugbinary.c
index 8b0efed7d..e8afd3935 100644
--- a/libc/calls/finddebugbinary.c
+++ b/libc/calls/finddebugbinary.c
@@ -27,7 +27,6 @@
 #include "libc/elf/tinyelf.internal.h"
 #include "libc/errno.h"
 #include "libc/intrin/directmap.h"
-#include "libc/intrin/promises.h"
 #include "libc/nt/memory.h"
 #include "libc/nt/runtime.h"
 #include "libc/runtime/runtime.h"
@@ -70,35 +69,42 @@ static int GetElfMachine(void) {
 }
 
 static bool IsMyDebugBinary(const char *path) {
-  void *addr;
   int64_t size;
   uintptr_t value;
   bool res = false;
   int fd, e = errno;
+  struct DirectMap dm;
+  BLOCK_CANCELATION;
   if ((fd = open(path, O_RDONLY | O_CLOEXEC, 0)) != -1) {
     // sanity test that this .com.dbg file (1) is an elf image, and (2)
     // contains the same number of bytes of code as our .com executable
     // which is currently running in memory.
     if ((size = lseek(fd, 0, SEEK_END)) != -1 &&
-        (addr = mmap(0, size, PROT_READ, MAP_SHARED, fd, 0)) != MAP_FAILED) {
-      if (READ32LE((char *)addr) == READ32LE("\177ELF") &&
-          ((Elf64_Ehdr *)addr)->e_machine == GetElfMachine() &&
-          GetElfSymbolValue(addr, "_etext", &value)) {
+        (dm = sys_mmap((void *)0x12345000000, size, PROT_READ, MAP_SHARED, fd,
+                       0))
+                .addr != MAP_FAILED) {
+      if (READ32LE((char *)dm.addr) == READ32LE("\177ELF") &&
+          ((Elf64_Ehdr *)dm.addr)->e_machine == GetElfMachine() &&
+          GetElfSymbolValue(dm.addr, "_etext", &value)) {
         res = !_etext || value == (uintptr_t)_etext;
       }
-      munmap(addr, size);
+      if (!IsWindows()) {
+        sys_munmap(dm.addr, size);
+      } else {
+        CloseHandle(dm.maphandle);
+        UnmapViewOfFile(dm.addr);
+      }
     }
     close(fd);
   }
+  ALLOW_CANCELATION;
   errno = e;
   return res;
 }
 
 static void FindDebugBinaryInit(void) {
   const char *comdbg;
-  if (issetugid())
-    return;
-  if ((comdbg = getenv("COMDBG"))) {
+  if ((comdbg = getenv("COMDBG")) && IsMyDebugBinary(comdbg)) {
     g_comdbg.res = comdbg;
     return;
   }
@@ -117,19 +123,9 @@ static void FindDebugBinaryInit(void) {
 /**
  * Returns path of binary with the debug information, or null.
  *
- * You can specify the COMDBG environment variable, with the path of the
- * debug binary, in case the automatic heuristics fail. What we look for
- * is GetProgramExecutableName() with ".dbg", ".com.dbg", etc. appended.
- *
- * @return path to debug binary, or NULL if we couldn't find it
- * @asyncsignalsafe
+ * @return path to debug binary, or NULL
  */
 const char *FindDebugBinary(void) {
   cosmo_once(&g_comdbg.once, FindDebugBinaryInit);
   return g_comdbg.res;
 }
-
-// pay startup cost to make this signal safe from the user's perspective
-__attribute__((__constructor__(10))) static void FindDebugBinaryCtor(void) {
-  cosmo_once(&g_comdbg.once, FindDebugBinaryInit);
-}
diff --git a/libc/calls/fixenotdir.c b/libc/calls/fixenotdir.c
index 1537ea69c..8787578a6 100644
--- a/libc/calls/fixenotdir.c
+++ b/libc/calls/fixenotdir.c
@@ -23,20 +23,10 @@
 #include "libc/nt/files.h"
 #include "libc/str/str.h"
 
-static int IsAlpha(int c) {
-  return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
-}
-
 static textwindows bool SubpathExistsThatsNotDirectory(char16_t *path) {
   char16_t *p;
   uint32_t attrs;
   while ((p = strrchr16(path, '\\'))) {
-    if (p == path)
-      // don't bother checking GetFileAttributes(u"\\")
-      break;
-    if (p == path + 2 && IsAlpha(path[0]) && path[1] == ':')
-      // don't bother checking GetFileAttributes(u"C:\\")
-      break;
     *p = u'\0';
     if ((attrs = GetFileAttributes(path)) != -1u &&
         !(attrs & kNtFileAttributeDirectory)) {
diff --git a/libc/calls/fstat-nt.c b/libc/calls/fstat-nt.c
index b3edea321..3c6b8c1cd 100644
--- a/libc/calls/fstat-nt.c
+++ b/libc/calls/fstat-nt.c
@@ -19,15 +19,15 @@
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/calls/struct/stat.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/fmt/wintime.internal.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/bsr.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alloca.h"
 #include "libc/nt/enum/fileflagandattributes.h"
 #include "libc/nt/enum/fileinfobyhandleclass.h"
@@ -119,83 +119,82 @@ textwindows int sys_fstat_nt_handle(int64_t handle, const char16_t *path,
 
   // Always set st_blksize to avoid divide by zero issues.
   // The Linux kernel sets this for /dev/tty and similar too.
+  // TODO(jart): GetVolumeInformationByHandle?
   st.st_blksize = 4096;
   st.st_gid = st.st_uid = sys_getuid_nt();
 
+  // We'll use the "umask" to fake out the mode bits.
+  uint32_t umask = atomic_load_explicit(&__umask, memory_order_acquire);
+
   switch (GetFileType(handle)) {
     case kNtFileTypeUnknown:
       break;
     case kNtFileTypeChar:
-      st.st_mode = S_IFCHR | 0664;
+      st.st_mode = S_IFCHR | (0666 & ~umask);
       st.st_dev = 0x66666666;
       st.st_ino = handle;
       break;
     case kNtFileTypePipe:
-      st.st_mode = S_IFIFO | 0664;
+      st.st_mode = S_IFIFO | (0666 & ~umask);
       st.st_dev = 0x55555555;
       st.st_ino = handle;
       break;
     case kNtFileTypeDisk: {
       struct NtByHandleFileInformation wst;
-      if (GetFileInformationByHandle(handle, &wst)) {
-        st.st_mode = 0444;
-        if ((wst.dwFileAttributes & kNtFileAttributeDirectory) ||
-            IsWindowsExecutable(handle, path))
-          st.st_mode |= 0111;
-        st.st_flags = wst.dwFileAttributes;
-        if (!(wst.dwFileAttributes & kNtFileAttributeReadonly))
-          st.st_mode |= 0220;
-        if (wst.dwFileAttributes & kNtFileAttributeReparsePoint) {
-          st.st_mode |= S_IFLNK;
-        } else if (wst.dwFileAttributes & kNtFileAttributeDirectory) {
-          st.st_mode |= S_IFDIR;
-        } else {
-          st.st_mode |= S_IFREG;
-        }
-        st.st_atim = FileTimeToTimeSpec(wst.ftLastAccessFileTime);
-        st.st_mtim = FileTimeToTimeSpec(wst.ftLastWriteFileTime);
-        st.st_birthtim = FileTimeToTimeSpec(wst.ftCreationFileTime);
-        // compute time of last status change
-        if (timespec_cmp(st.st_atim, st.st_mtim) > 0) {
-          st.st_ctim = st.st_atim;
-        } else {
-          st.st_ctim = st.st_mtim;
-        }
-        st.st_size = (wst.nFileSizeHigh + 0ull) << 32 | wst.nFileSizeLow;
-        st.st_dev = wst.dwVolumeSerialNumber;
-        st.st_ino = (wst.nFileIndexHigh + 0ull) << 32 | wst.nFileIndexLow;
-        st.st_nlink = wst.nNumberOfLinks;
-        if (S_ISLNK(st.st_mode)) {
-          if (!st.st_size) {
-            long size = GetSizeOfReparsePoint(handle);
-            if (size == -1)
-              return -1;
-            st.st_size = size;
-          }
-        } else {
-          // st_size       = uncompressed size
-          // st_blocks*512 = physical size
-          uint64_t physicalsize;
-          struct NtFileCompressionInfo fci;
-          if (!(wst.dwFileAttributes &
-                (kNtFileAttributeDirectory | kNtFileAttributeReparsePoint)) &&
-              GetFileInformationByHandleEx(handle, kNtFileCompressionInfo, &fci,
-                                           sizeof(fci))) {
-            physicalsize = fci.CompressedFileSize;
-          } else {
-            physicalsize = st.st_size;
-          }
-          st.st_blocks = ROUNDUP(physicalsize, st.st_blksize) / 512;
-        }
-      } else if (GetVolumeInformationByHandle(
-                     handle, 0, 0, &wst.dwVolumeSerialNumber, 0, 0, 0, 0)) {
-        st.st_dev = wst.dwVolumeSerialNumber;
-        st.st_mode = S_IFDIR | 0555;
-      } else {
-        // both GetFileInformationByHandle and
-        // GetVolumeInformationByHandle failed
+      if (!GetFileInformationByHandle(handle, &wst)) {
         return __winerr();
       }
+      st.st_mode = 0444 & ~umask;
+      if ((wst.dwFileAttributes & kNtFileAttributeDirectory) ||
+          IsWindowsExecutable(handle, path)) {
+        st.st_mode |= 0111 & ~umask;
+      }
+      st.st_flags = wst.dwFileAttributes;
+      if (!(wst.dwFileAttributes & kNtFileAttributeReadonly)) {
+        st.st_mode |= 0222 & ~umask;
+      }
+      if (wst.dwFileAttributes & kNtFileAttributeReparsePoint) {
+        st.st_mode |= S_IFLNK;
+      } else if (wst.dwFileAttributes & kNtFileAttributeDirectory) {
+        st.st_mode |= S_IFDIR;
+      } else {
+        st.st_mode |= S_IFREG;
+      }
+      st.st_atim = FileTimeToTimeSpec(wst.ftLastAccessFileTime);
+      st.st_mtim = FileTimeToTimeSpec(wst.ftLastWriteFileTime);
+      st.st_birthtim = FileTimeToTimeSpec(wst.ftCreationFileTime);
+      // compute time of last status change
+      if (timespec_cmp(st.st_atim, st.st_mtim) > 0) {
+        st.st_ctim = st.st_atim;
+      } else {
+        st.st_ctim = st.st_mtim;
+      }
+      st.st_size = (wst.nFileSizeHigh + 0ull) << 32 | wst.nFileSizeLow;
+      st.st_dev = wst.dwVolumeSerialNumber;
+      st.st_ino = (wst.nFileIndexHigh + 0ull) << 32 | wst.nFileIndexLow;
+      st.st_nlink = wst.nNumberOfLinks;
+      if (S_ISLNK(st.st_mode)) {
+        if (!st.st_size) {
+          long size = GetSizeOfReparsePoint(handle);
+          if (size == -1)
+            return -1;
+          st.st_size = size;
+        }
+      } else {
+        // st_size       = uncompressed size
+        // st_blocks*512 = physical size
+        uint64_t physicalsize;
+        struct NtFileCompressionInfo fci;
+        if (!(wst.dwFileAttributes &
+              (kNtFileAttributeDirectory | kNtFileAttributeReparsePoint)) &&
+            GetFileInformationByHandleEx(handle, kNtFileCompressionInfo, &fci,
+                                         sizeof(fci))) {
+          physicalsize = fci.CompressedFileSize;
+        } else {
+          physicalsize = st.st_size;
+        }
+        st.st_blocks = ROUNDUP(physicalsize, st.st_blksize) / 512;
+      }
       break;
     }
     default:
diff --git a/libc/calls/fstatat-nt.c b/libc/calls/fstatat-nt.c
index cfca355cc..1415cb84c 100644
--- a/libc/calls/fstatat-nt.c
+++ b/libc/calls/fstatat-nt.c
@@ -16,11 +16,11 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/struct/stat.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/errno.h"
-#include "libc/intrin/fds.h"
 #include "libc/nt/createfile.h"
 #include "libc/nt/enum/accessmask.h"
 #include "libc/nt/enum/creationdisposition.h"
@@ -97,27 +97,14 @@ TryAgain:
            0)) != -1) {
     rc = st ? sys_fstat_nt_handle(fh, path16, st) : 0;
     CloseHandle(fh);
+  } else if (dwDesiredAccess == kNtFileGenericRead &&
+             (GetLastError() == kNtErrorAccessDenied ||
+              GetLastError() == kNtErrorSharingViolation)) {
+    dwDesiredAccess = kNtFileReadAttributes;
+    errno = e;
+    goto TryAgain;
   } else {
-    uint32_t dwErrorCode = GetLastError();
-    if (dwDesiredAccess == kNtFileGenericRead &&
-        (dwErrorCode == kNtErrorAccessDenied ||
-         dwErrorCode == kNtErrorSharingViolation)) {
-      dwDesiredAccess = kNtFileReadAttributes;
-      errno = e;
-      goto TryAgain;
-    } else if (!(flags & AT_SYMLINK_NOFOLLOW) &&
-               dwErrorCode == kNtErrorCantAccessFile) {
-      // ERROR_CANT_ACCESS_FILE (1920) usually means that the I/O system
-      // a WSL symlink is accessed from WIN32 API. Falling back with the
-      // failed to traverse a filesystem reparse point. For example when
-      // details of the link itself is better than providing nothing. It
-      // should never be like this on UNIX but Windows gets a bit screwy
-      flags |= AT_SYMLINK_NOFOLLOW;
-      errno = e;
-      goto TryAgain;
-    } else {
-      rc = __winerr();
-    }
+    rc = __winerr();
   }
   ALLOW_SIGNALS;
 
diff --git a/libc/calls/fstatfs-nt.c b/libc/calls/fstatfs-nt.c
index f7d0229bc..06c0ce515 100644
--- a/libc/calls/fstatfs-nt.c
+++ b/libc/calls/fstatfs-nt.c
@@ -23,7 +23,7 @@
 #include "libc/calls/struct/statfs.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/fsinformationclass.h"
 #include "libc/nt/enum/status.h"
 #include "libc/nt/files.h"
diff --git a/libc/intrin/getcontext.S b/libc/calls/getcontext.S
similarity index 96%
rename from libc/intrin/getcontext.S
rename to libc/calls/getcontext.S
index bdfaded97..bf3400f43 100644
--- a/libc/intrin/getcontext.S
+++ b/libc/calls/getcontext.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Gets machine state.
 //
@@ -26,9 +26,7 @@
 //	@see	setcontext()
 	.ftrace1
 getcontext:
-	beg
 	.ftrace2
-#include "libc/intrin/getcontext.inc"
+#include "libc/calls/getcontext.inc"
 	jmp	__getcontextsig
-	end
 	.endfn	getcontext,globl
diff --git a/libc/intrin/getcontext.inc b/libc/calls/getcontext.inc
similarity index 99%
rename from libc/intrin/getcontext.inc
rename to libc/calls/getcontext.inc
index bbc452658..ea0a8b4a8 100644
--- a/libc/intrin/getcontext.inc
+++ b/libc/calls/getcontext.inc
@@ -34,7 +34,6 @@
 	mov	%rbp,120(%rdi)
 	mov	%rbx,128(%rdi)
 	mov	%rdx,136(%rdi)
-	mov	%rax,144(%rdi)
 	mov	%rcx,152(%rdi)
 	lea	8(%rsp),%rax
 	mov	%rax,160(%rdi)		// rsp = caller's rsp
diff --git a/libc/calls/getcpu.c b/libc/calls/getcpu.c
index b689f43fc..bdc97089e 100644
--- a/libc/calls/getcpu.c
+++ b/libc/calls/getcpu.c
@@ -30,63 +30,39 @@
 
 int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
 
-/**
- * Determines ID of CPU on which thread is currently scheduled.
- *
- * This is the same as sched_getcpu(), except it also supports returning
- * the ID of the current NUMA node. On some platforms this functionality
- * isn't available, in which case `out_opt_node` is always be set to 0.
- */
 int getcpu(unsigned *out_opt_cpu, unsigned *out_opt_node) {
-
-  if (IsWindows()) {
-    struct NtProcessorNumber pn;
-    if (out_opt_cpu) {
-      GetCurrentProcessorNumberEx(&pn);
-      *out_opt_cpu = 64 * pn.Group + pn.Number;
-    }
-    if (out_opt_node) {
-      unsigned short node16;
-      if (GetNumaProcessorNodeEx(&pn, &node16)) {
-        *out_opt_node = node16;
-      } else {
-        return __winerr();
-      }
-    }
-    return 0;
-  }
-
-#ifdef __x86_64__
-  if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
+  unsigned cpu;
+  unsigned node;
+  if (X86_HAVE(RDTSCP)) {
     unsigned tsc_aux;
     rdtscp(&tsc_aux);
-    if (out_opt_cpu)
-      *out_opt_cpu = TSC_AUX_CORE(tsc_aux);
-    if (out_opt_node)
-      *out_opt_node = TSC_AUX_NODE(tsc_aux);
-    return 0;
-  }
-#endif
-
-  if (IsXnu() || IsOpenbsd() || IsNetbsd() || IsFreebsd()) {
-    if (out_opt_cpu) {
-      int rc = sched_getcpu();
-      if (rc == -1)
-        return -1;
-      *out_opt_cpu = rc;
+    cpu = TSC_AUX_CORE(tsc_aux);
+    node = TSC_AUX_NODE(tsc_aux);
+  } else if (IsWindows()) {
+    struct NtProcessorNumber pn;
+    GetCurrentProcessorNumberEx(&pn);
+    cpu = 64 * pn.Group + pn.Number;
+    unsigned short node16;
+    if (GetNumaProcessorNodeEx(&pn, &node16)) {
+      node = node16;
+    } else {
+      return __winerr();
     }
-    if (out_opt_node)
-      *out_opt_node = 0;
-    return 0;
+  } else if (IsAarch64()) {
+    long tpidr_el0;
+    asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
+    cpu = tpidr_el0 & 255;
+    node = 0;
+  } else {
+    int rc = sys_getcpu(&cpu, &node, 0);
+    if (rc == -1)
+      return -1;
   }
-
-  unsigned cpu, node;
-  int rc = sys_getcpu(&cpu, &node, 0);
-  if (rc == -1)
-    return -1;
-  if (out_opt_cpu)
+  if (out_opt_cpu) {
     *out_opt_cpu = cpu;
-  if (out_opt_node)
+  }
+  if (out_opt_node) {
     *out_opt_node = node;
+  }
   return 0;
 }
diff --git a/libc/calls/getdomainname.c b/libc/calls/getdomainname.c
index 988cbdd2e..9cfb722f6 100644
--- a/libc/calls/getdomainname.c
+++ b/libc/calls/getdomainname.c
@@ -20,7 +20,7 @@
 #include "libc/calls/syscall_support-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/computernameformat.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
diff --git a/libc/calls/getdtablesize.c b/libc/calls/getdtablesize.c
index cc5a7460e..fdfd06ac5 100644
--- a/libc/calls/getdtablesize.c
+++ b/libc/calls/getdtablesize.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/rlimit.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/rlimit.h"
 
diff --git a/libc/calls/getgroups.c b/libc/calls/getgroups.c
index 25251453b..d4c8fe1a9 100644
--- a/libc/calls/getgroups.c
+++ b/libc/calls/getgroups.c
@@ -21,6 +21,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
+#include "libc/stdckdint.h"
 #include "libc/sysv/errfuns.h"
 
 /**
diff --git a/libc/calls/gethostname-nt.c b/libc/calls/gethostname-nt.c
index a39b0d013..e1c9e1f0b 100644
--- a/libc/calls/gethostname-nt.c
+++ b/libc/calls/gethostname-nt.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/computernameformat.h"
 #include "libc/nt/systeminfo.h"
 #include "libc/str/str.h"
diff --git a/libc/calls/getloadavg-nt.c b/libc/calls/getloadavg-nt.c
index 77e0a83ed..4e8d6d847 100644
--- a/libc/calls/getloadavg-nt.c
+++ b/libc/calls/getloadavg-nt.c
@@ -21,23 +21,24 @@
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/dce.h"
 #include "libc/fmt/conv.h"
-#include "libc/intrin/cxaatexit.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/accounting.h"
 #include "libc/runtime/runtime.h"
+#include "libc/thread/thread.h"
 
-#define CTOR  __attribute__((__constructor__(99)))
 #define FT(x) (x.dwLowDateTime | (uint64_t)x.dwHighDateTime << 32)
 
 static int cpus;
 static double load;
+static pthread_spinlock_t lock;
 static struct NtFileTime idle1, kern1, user1;
 
 textwindows int sys_getloadavg_nt(double *a, int n) {
   int i, rc;
   uint64_t elapsed, used;
   struct NtFileTime idle, kern, user;
-  __cxa_lock();
+  BLOCK_SIGNALS;
+  pthread_spin_lock(&lock);
   if (GetSystemTimes(&idle, &kern, &user)) {
     elapsed = (FT(kern) - FT(kern1)) + (FT(user) - FT(user1));
     if (elapsed) {
@@ -53,11 +54,12 @@ textwindows int sys_getloadavg_nt(double *a, int n) {
   } else {
     rc = __winerr();
   }
-  __cxa_unlock();
+  pthread_spin_unlock(&lock);
+  ALLOW_SIGNALS;
   return rc;
 }
 
-CTOR static textstartup void sys_getloadavg_nt_init(void) {
+__attribute__((__constructor__(40))) static textstartup void ntinitload(void) {
   if (IsWindows()) {
     load = 1;
     cpus = __get_cpu_count() / 2;
diff --git a/libc/calls/getntsyspath.S b/libc/calls/getntsyspath.S
index bd8178bd3..62bd818f0 100644
--- a/libc/calls/getntsyspath.S
+++ b/libc/calls/getntsyspath.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Obtains WIN32 magic path, e.g. GetTempPathA.
 //
@@ -28,8 +28,8 @@
 //	@return	rdi is rdi+edx
 	.text.startup
 __getntsyspath:
-	beg
-	pro
+	push	%rbp
+	mov	%rsp,%rbp
 	push	%rdx
 	movpp	%rdi,%rcx		# call f=%rax(p1=%rcx,p2=%rdx)
 	sub	$40,%rsp
@@ -55,7 +55,6 @@ __getntsyspath:
 	jne	2f
 	movb	$'/',-1(%rdi)
 2:	.loop	1b
-	epi
+	leave
 	ret
-	end
 	.endfn	__getntsyspath,globl,hidden
diff --git a/libc/calls/sigcheck.c b/libc/calls/getppid-nt.c
similarity index 69%
rename from libc/calls/sigcheck.c
rename to libc/calls/getppid-nt.c
index e8cad756d..438cafc61 100644
--- a/libc/calls/sigcheck.c
+++ b/libc/calls/getppid-nt.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,25 +16,22 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/internal.h"
-#include "libc/calls/sig.internal.h"
-#include "libc/intrin/weaken.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/errfuns.h"
+#include "libc/calls/syscall-nt.internal.h"
+#include "libc/nt/enum/status.h"
+#include "libc/nt/nt/process.h"
+#include "libc/nt/process.h"
+#include "libc/nt/runtime.h"
+#include "libc/nt/struct/processbasicinformation.h"
 
-textwindows int __sigcheck(sigset_t waitmask, bool restartable) {
-  int sig, handler_was_called = 0;
-  if (_check_cancel() == -1)
-    return -1;
-  while (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
-    handler_was_called |= _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
-    if (_check_cancel() == -1)
-      return -1;
+textwindows int sys_getppid_nt(void) {
+  struct NtProcessBasicInformation ProcessInformation;
+  uint32_t gotsize = 0;
+  if (!NtError(
+          NtQueryInformationProcess(GetCurrentProcess(), 0, &ProcessInformation,
+                                    sizeof(ProcessInformation), &gotsize)) &&
+      gotsize >= sizeof(ProcessInformation) &&
+      ProcessInformation.InheritedFromUniqueProcessId) {
+    return ProcessInformation.InheritedFromUniqueProcessId;
   }
-  if (handler_was_called & SIG_HANDLED_NO_RESTART)
-    return eintr();
-  if (handler_was_called & SIG_HANDLED_SA_RESTART)
-    if (!restartable)
-      return eintr();
-  return 0;
+  return GetCurrentProcessId();
 }
diff --git a/libc/proc/getppid.c b/libc/calls/getppid.c
similarity index 100%
rename from libc/proc/getppid.c
rename to libc/calls/getppid.c
diff --git a/libc/calls/getprogramexecutablename.greg.c b/libc/calls/getprogramexecutablename.greg.c
index 8589bb099..12f02933c 100644
--- a/libc/calls/getprogramexecutablename.greg.c
+++ b/libc/calls/getprogramexecutablename.greg.c
@@ -27,7 +27,7 @@
 #include "libc/intrin/getenv.h"
 #include "libc/intrin/strace.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/runtime.h"
 #include "libc/runtime/runtime.h"
 #include "libc/serialize.h"
@@ -96,8 +96,9 @@ static int OldApeLoader(char *s) {
 static int CopyWithCwd(const char *q, char *p, char *e) {
   char c;
   if (*q != '/') {
-    if (q[0] == '.' && q[1] == '/')
+    if (q[0] == '.' && q[1] == '/') {
       q += 2;
+    }
     int got = __getcwd(p, e - p - 1 /* '/' */);
     if (got != -1) {
       p += got - 1;
@@ -117,10 +118,9 @@ static int CopyWithCwd(const char *q, char *p, char *e) {
 
 // if q exists then turn it into an absolute path.
 static int TryPath(const char *q) {
-  if (!q)
-    return 0;
-  if (!CopyWithCwd(q, g_prog.u.buf, g_prog.u.buf + sizeof(g_prog.u.buf)))
+  if (!CopyWithCwd(q, g_prog.u.buf, g_prog.u.buf + sizeof(g_prog.u.buf))) {
     return 0;
+  }
   return !sys_faccessat(AT_FDCWD, g_prog.u.buf, F_OK, 0);
 }
 
@@ -129,8 +129,9 @@ static int TryPath(const char *q) {
 void __init_program_executable_name(void) {
   if (__program_executable_name && *__program_executable_name != '/' &&
       CopyWithCwd(__program_executable_name, g_prog.u.buf,
-                  g_prog.u.buf + sizeof(g_prog.u.buf)))
+                  g_prog.u.buf + sizeof(g_prog.u.buf))) {
     __program_executable_name = g_prog.u.buf;
+  }
 }
 
 static inline void InitProgramExecutableNameImpl(void) {
@@ -211,12 +212,14 @@ static inline void InitProgramExecutableNameImpl(void) {
   }
 
   // don't trust argv or envp if set-id.
-  if (issetugid())
+  if (issetugid()) {
     goto UseEmpty;
+  }
 
   // try argv[0], then then $_.
-  if (TryPath(__argv[0]) || TryPath(__getenv(__envp, "_").s))
+  if (TryPath(__argv[0]) || TryPath(__getenv(__envp, "_").s)) {
     goto UseBuf;
+  }
 
   // give up and just copy argv[0] into it
   if ((q = __argv[0])) {
diff --git a/libc/calls/getrandom.c b/libc/calls/getrandom.c
index a0c53383b..957c7bc18 100644
--- a/libc/calls/getrandom.c
+++ b/libc/calls/getrandom.c
@@ -32,7 +32,7 @@
 #include "libc/intrin/asmflag.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/kcpuids.h"
 #include "libc/nexgen32e/rdtsc.h"
 #include "libc/nexgen32e/vendor.internal.h"
@@ -86,36 +86,26 @@ static ssize_t GetRandomBsd(char *p, size_t n, void impl(char *, size_t)) {
   }
 }
 
-static ssize_t GetDevUrandom(char *p, size_t n, unsigned f) {
+static ssize_t GetDevUrandom(char *p, size_t n) {
   int fd;
-  int oflags;
   ssize_t rc;
-  const char *dev;
-  BEGIN_CANCELATION_POINT;
-  if (f & GRND_RANDOM) {
-    dev = "/dev/random";
-  } else {
-    dev = "/dev/urandom";
-  }
-  oflags = O_RDONLY | O_CLOEXEC;
-  if (f & GRND_NONBLOCK)
-    oflags |= O_NONBLOCK;
-  fd = sys_openat(AT_FDCWD, dev, oflags, 0);
+  BLOCK_SIGNALS;
+  BLOCK_CANCELATION;
+  fd = sys_openat(AT_FDCWD, "/dev/urandom", O_RDONLY | O_CLOEXEC, 0);
   if (fd != -1) {
     rc = sys_read(fd, p, n);
-    sys_close(fd);
   } else {
     rc = -1;
   }
-  END_CANCELATION_POINT;
+  ALLOW_CANCELATION;
+  ALLOW_SIGNALS;
   return rc;
 }
 
 ssize_t __getrandom(void *p, size_t n, unsigned f) {
   ssize_t rc;
   if (IsWindows()) {
-    ProcessPrng(p, n);  // never fails
-    rc = n;
+    rc = ProcessPrng(p, n) ? n : __winerr();
   } else if (have_getrandom) {
     if (IsXnu() || IsOpenbsd()) {
       rc = GetRandomBsd(p, n, GetRandomEntropy);
@@ -132,7 +122,7 @@ ssize_t __getrandom(void *p, size_t n, unsigned f) {
 #endif
   } else {
     BEGIN_CANCELATION_POINT;
-    rc = GetDevUrandom(p, n, f);
+    rc = GetDevUrandom(p, n);
     END_CANCELATION_POINT;
   }
   return rc;
@@ -185,7 +175,9 @@ ssize_t __getrandom(void *p, size_t n, unsigned f) {
  * @raise EFAULT if the `n` bytes at `p` aren't valid memory
  * @raise EINTR if we needed to block and a signal was delivered instead
  * @cancelationpoint
+ * @asyncsignalsafe
  * @restartable
+ * @vforksafe
  */
 ssize_t getrandom(void *p, size_t n, unsigned f) {
   ssize_t rc;
diff --git a/libc/calls/getrlimit.c b/libc/calls/getrlimit.c
index d2a826eda..de7df079e 100644
--- a/libc/calls/getrlimit.c
+++ b/libc/calls/getrlimit.c
@@ -21,7 +21,6 @@
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/intrin/rlimit.h"
 #include "libc/intrin/strace.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
@@ -48,7 +47,8 @@ int getrlimit(int resource, struct rlimit *rlim) {
   } else if (!IsWindows()) {
     rc = sys_getrlimit(resource, rlim);
   } else if (resource == RLIMIT_STACK) {
-    *rlim = __rlimit_stack_get();
+    rlim->rlim_cur = GetStaticStackSize();
+    rlim->rlim_max = GetStaticStackSize();
     rc = 0;
   } else if (resource == RLIMIT_AS) {
     rlim->rlim_cur = __virtualmax;
diff --git a/libc/calls/getuid-nt.c b/libc/calls/getuid-nt.c
index c6acd2a91..7f191db4e 100644
--- a/libc/calls/getuid-nt.c
+++ b/libc/calls/getuid-nt.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/intrin/atomic.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/accounting.h"
 #include "libc/str/str.h"
 
diff --git a/libc/calls/getuid.c b/libc/calls/getuid.c
index 3be6e8245..483be9c15 100644
--- a/libc/calls/getuid.c
+++ b/libc/calls/getuid.c
@@ -25,7 +25,7 @@
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/strace.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 
diff --git a/libc/calls/groups.internal.h b/libc/calls/groups.internal.h
index 2fca2ca10..d7d0c80b7 100644
--- a/libc/calls/groups.internal.h
+++ b/libc/calls/groups.internal.h
@@ -5,9 +5,9 @@ COSMOPOLITAN_C_START_
 int sys_getgroups(int size, uint32_t list[]);
 int sys_setgroups(size_t size, const uint32_t list[]);
 
-const char *_DescribeGidList(char[128], int, int, const uint32_t list[]);
+const char *DescribeGidList(char[128], int, int, const uint32_t list[]);
 #define DescribeGidList(rc, length, gidlist) \
-  _DescribeGidList(alloca(128), rc, length, gidlist)
+  DescribeGidList(alloca(128), rc, length, gidlist)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_GROUPS_INTERNAL_H_ */
diff --git a/libc/calls/internal.h b/libc/calls/internal.h
index 80ffd0c58..8a9b54819 100644
--- a/libc/calls/internal.h
+++ b/libc/calls/internal.h
@@ -1,12 +1,10 @@
 #ifndef COSMOPOLITAN_LIBC_CALLS_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_CALLS_INTERNAL_H_
 #include "libc/atomic.h"
-#include "libc/calls/struct/sigset.h"
-#include "libc/calls/struct/sigval.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/dce.h"
 #include "libc/intrin/fds.h"
-#include "libc/macros.h"
+#include "libc/calls/struct/sigval.h"
+#include "libc/dce.h"
+#include "libc/macros.internal.h"
 #include "libc/stdbool.h"
 
 #define kSigactionMinRva 8 /* >SIG_{ERR,DFL,IGN,...} */
@@ -26,15 +24,12 @@ int __ensurefds(int);
 uint32_t sys_getuid_nt(void);
 int __ensurefds_unlocked(int);
 void __printfds(struct Fd *, size_t);
-int __sigcheck(sigset_t, bool);
 int CountConsoleInputBytes(void);
 int FlushConsoleInputBytes(void);
 int64_t GetConsoleInputHandle(void);
 int64_t GetConsoleOutputHandle(void);
-void EchoConsoleNt(const char *, size_t, bool);
 int IsWindowsExecutable(int64_t, const char16_t *);
 void InterceptTerminalCommands(const char *, size_t);
-void sys_read_nt_wipe_keystrokes(void);
 
 forceinline bool __isfdopen(int fd) {
   return 0 <= fd && fd < g_fds.n && g_fds.p[fd].kind != kFdEmpty;
@@ -46,10 +41,9 @@ forceinline bool __isfdkind(int fd, int kind) {
 
 int _check_signal(bool);
 int _check_cancel(void);
-bool _is_canceled(void);
 int sys_close_nt(int, int);
-int _park_norestart(struct timespec, uint64_t);
-int _park_restartable(struct timespec, uint64_t);
+int _park_norestart(uint32_t, uint64_t);
+int _park_restartable(uint32_t, uint64_t);
 int sys_openat_metal(int, const char *, int, unsigned);
 
 #ifdef __x86_64__
diff --git a/libc/calls/ioctl.c b/libc/calls/ioctl.c
index 5151c8524..bd0c0642d 100644
--- a/libc/calls/ioctl.c
+++ b/libc/calls/ioctl.c
@@ -18,16 +18,16 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/calls/termios.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/cmpxchg.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alloca.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/console.h"
@@ -38,7 +38,6 @@
 #include "libc/nt/iphlpapi.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/ipadapteraddresses.h"
-#include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
@@ -60,8 +59,6 @@
 #define MAX_UNICAST_ADDR 32
 #define MAX_NAME_CLASH   ((int)('z' - 'a')) /* Allow a..z */
 
-__msabi extern typeof(__sys_ioctlsocket_nt) *const __imp_ioctlsocket;
-
 static struct HostAdapterInfoNode {
   struct HostAdapterInfoNode *next;
   char name[IFNAMSIZ]; /* Obtained from FriendlyName */
@@ -79,7 +76,7 @@ static int ioctl_default(int fd, unsigned long request, void *arg) {
   } else if (__isfdopen(fd)) {
     if (g_fds.p[fd].kind == kFdSocket) {
       handle = g_fds.p[fd].handle;
-      if ((rc = __imp_ioctlsocket(handle, request, arg)) != -1) {
+      if ((rc = _weaken(__sys_ioctlsocket_nt)(handle, request, arg)) != -1) {
         return rc;
       } else {
         return _weaken(__winsockerr)();
@@ -100,7 +97,7 @@ static int ioctl_fionread(int fd, uint32_t *arg) {
   } else if (__isfdopen(fd)) {
     handle = g_fds.p[fd].handle;
     if (g_fds.p[fd].kind == kFdSocket) {
-      if ((rc = __imp_ioctlsocket(handle, FIONREAD, arg)) != -1) {
+      if ((rc = _weaken(__sys_ioctlsocket_nt)(handle, FIONREAD, arg)) != -1) {
         return rc;
       } else {
         return _weaken(__winsockerr)();
@@ -261,10 +258,7 @@ static textwindows struct HostAdapterInfoNode *appendHostInfo(
     node->flags = flags;
   } else {
     /* Copy from previous node */
-    if (parentInfoNode)
-      node->flags = parentInfoNode->flags;
-    else
-      node->flags = 0;
+    node->flags = parentInfoNode->flags;
   }
 
   ip = ntohl(
@@ -514,7 +508,6 @@ static int ioctl_siocgifconf_sysv(int fd, struct ifconf *ifc) {
   }
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
   bufMax = 15000; /* conservative guesstimate */
   b = alloca(bufMax);
   CheckLargeStackAllocation(b, bufMax);
diff --git a/libc/calls/isapemagic.c b/libc/calls/isapemagic.c
index e387880cc..a1ca56460 100644
--- a/libc/calls/isapemagic.c
+++ b/libc/calls/isapemagic.c
@@ -25,5 +25,6 @@
 bool IsApeLoadable(char buf[8]) {
   return READ32LE(buf) == READ32LE("\177ELF") ||
          READ64LE(buf) == READ64LE("MZqFpD='") ||
-         READ64LE(buf) == READ64LE("jartsr='");
+         READ64LE(buf) == READ64LE("jartsr='") ||
+         READ64LE(buf) == READ64LE("APEDBG='");
 }
diff --git a/libc/intrin/isqemu.c b/libc/calls/isqemu.c
similarity index 100%
rename from libc/intrin/isqemu.c
rename to libc/calls/isqemu.c
diff --git a/libc/calls/kntsystemdirectory.S b/libc/calls/kntsystemdirectory.S
index e50cc3485..85338d555 100644
--- a/libc/calls/kntsystemdirectory.S
+++ b/libc/calls/kntsystemdirectory.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #define BYTES 64
 
diff --git a/libc/calls/kntwindowsdirectory.S b/libc/calls/kntwindowsdirectory.S
index 0a20e3183..de7418a62 100644
--- a/libc/calls/kntwindowsdirectory.S
+++ b/libc/calls/kntwindowsdirectory.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #define BYTES 64
 
diff --git a/libc/calls/linkat-nt.c b/libc/calls/linkat-nt.c
index 4f55b6707..9364b62be 100644
--- a/libc/calls/linkat-nt.c
+++ b/libc/calls/linkat-nt.c
@@ -18,15 +18,14 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/strace.h"
 #include "libc/limits.h"
 #include "libc/nt/files.h"
 #include "libc/nt/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/str/str.h"
 
-textwindows int sys_linkat_nt(int olddirfd, const char *oldpath,  //
-                              int newdirfd, const char *newpath) {
+textwindows int sys_linkat_nt(int olddirfd, const char *oldpath, int newdirfd,
+                              const char *newpath) {
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Wframe-larger-than="
   struct {
@@ -37,10 +36,7 @@ textwindows int sys_linkat_nt(int olddirfd, const char *oldpath,  //
 #pragma GCC pop_options
   if (__mkntpathat(olddirfd, oldpath, 0, M.oldpath16) != -1 &&
       __mkntpathat(newdirfd, newpath, 0, M.newpath16) != -1) {
-    bool32 ok = CreateHardLink(M.newpath16, M.oldpath16, NULL);
-    NTTRACE("CreateHardLink(%#hs, %#hs, NULL) → {%hhhd, %d}", M.newpath16,
-            M.oldpath16, ok, GetLastError());
-    if (ok) {
+    if (CreateHardLink(M.newpath16, M.oldpath16, NULL)) {
       return 0;
     } else {
       return __fix_enotdir3(__winerr(), M.newpath16, M.oldpath16);
diff --git a/libc/calls/linkat.c b/libc/calls/linkat.c
index 84237baf3..09c2153b7 100644
--- a/libc/calls/linkat.c
+++ b/libc/calls/linkat.c
@@ -34,7 +34,6 @@
  *
  * @param flags can have AT_EMPTY_PATH or AT_SYMLINK_NOFOLLOW
  * @return 0 on success, or -1 w/ errno
- * @raise EROFS if either path is under /zip/...
  * @asyncsignalsafe
  */
 int linkat(int olddirfd, const char *oldpath, int newdirfd, const char *newpath,
@@ -43,7 +42,7 @@ int linkat(int olddirfd, const char *oldpath, int newdirfd, const char *newpath,
   if (_weaken(__zipos_notat) &&
       ((rc = __zipos_notat(olddirfd, oldpath)) == -1 ||
        (rc = __zipos_notat(newdirfd, newpath)) == -1)) {
-    rc = erofs();
+    STRACE("zipos fchownat not supported yet");
   } else if (!IsWindows()) {
     rc = sys_linkat(olddirfd, oldpath, newdirfd, newpath, flags);
   } else {
diff --git a/libc/calls/lseek-nt.c b/libc/calls/lseek-nt.c
index 9e073b8c4..11ef3471f 100644
--- a/libc/calls/lseek-nt.c
+++ b/libc/calls/lseek-nt.c
@@ -19,7 +19,6 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/fds.h"
 #include "libc/nt/enum/filetype.h"
 #include "libc/nt/files.h"
 #include "libc/nt/struct/byhandlefileinformation.h"
@@ -32,7 +31,7 @@ static textwindows int64_t GetPosition(struct Fd *f, int whence) {
     case SEEK_SET:
       return 0;
     case SEEK_CUR:
-      return f->cursor->shared->pointer;
+      return f->pointer;
     case SEEK_END: {
       struct NtByHandleFileInformation wst;
       if (!GetFileInformationByHandle(f->handle, &wst)) {
@@ -68,14 +67,11 @@ textwindows int64_t sys_lseek_nt(int fd, int64_t offset, int whence) {
   } else if (__isfdkind(fd, kFdFile)) {
     struct Fd *f = g_fds.p + fd;
     int filetype = GetFileType(f->handle);
-    if (filetype != kNtFileTypePipe &&  //
-        filetype != kNtFileTypeChar &&  //
-        f->cursor->shared) {
+    if (filetype != kNtFileTypePipe && filetype != kNtFileTypeChar) {
       int64_t res;
-      __cursor_lock(f->cursor);
-      if ((res = Seek(f, offset, whence)) != -1)
-        f->cursor->shared->pointer = res;
-      __cursor_unlock(f->cursor);
+      if ((res = Seek(f, offset, whence)) != -1) {
+        f->pointer = res;
+      }
       return res;
     } else {
       return espipe();
diff --git a/libc/calls/makedev.h b/libc/calls/makedev.h
index 21479c795..dcd221026 100644
--- a/libc/calls/makedev.h
+++ b/libc/calls/makedev.h
@@ -1,6 +1,5 @@
 #ifndef COSMOPOLITAN_LIBC_CALLS_MAKEDEV_H_
 #define COSMOPOLITAN_LIBC_CALLS_MAKEDEV_H_
-COSMOPOLITAN_C_START_
 
 uint64_t makedev(uint32_t, uint32_t) libcesque;
 uint32_t major(uint64_t) libcesque;
@@ -10,5 +9,4 @@ uint32_t minor(uint64_t) libcesque;
 #define minor(x)      minor(x)
 #define makedev(x, y) makedev(x, y)
 
-COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_MAKEDEV_H_ */
diff --git a/libc/calls/metalfile.c b/libc/calls/metalfile.c
index 5d2c57540..cdfb6bc5f 100644
--- a/libc/calls/metalfile.c
+++ b/libc/calls/metalfile.c
@@ -32,7 +32,7 @@
 #include "libc/intrin/directmap.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/pc.internal.h"
 #include "libc/runtime/runtime.h"
@@ -67,9 +67,10 @@ textstartup void InitializeMetalFile(void) {
     size_t size = ROUNDUP(_ezip - __executable_start, 4096);
     // TODO(jart): Restore support for ZIPOS on metal.
     void *copied_base;
-    void *addr = sys_mmap_metal(NULL, size, PROT_READ | PROT_WRITE,
-                                MAP_SHARED_linux | MAP_ANONYMOUS_linux, -1, 0);
-    copied_base = addr;
+    struct DirectMap dm;
+    dm = sys_mmap_metal(NULL, size, PROT_READ | PROT_WRITE,
+                        MAP_SHARED_linux | MAP_ANONYMOUS_linux, -1, 0);
+    copied_base = dm.addr;
     npassert(copied_base != (void *)-1);
     memcpy(copied_base, (void *)(BANE + IMAGE_BASE_PHYSICAL), size);
     __ape_com_base = copied_base;
diff --git a/libc/calls/metalfile_init.S b/libc/calls/metalfile_init.S
index 0f5466fc5..72e7f8972 100644
--- a/libc/calls/metalfile_init.S
+++ b/libc/calls/metalfile_init.S
@@ -24,7 +24,7 @@
 │ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR        │
 │ OTHER DEALINGS IN THE SOFTWARE.                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/calls/metalfile.internal.h"
 
 	.init.start 102,_init_metalfile
diff --git a/libc/calls/mkdirat.c b/libc/calls/mkdirat.c
index b4a2cb1b4..64ad9ea2d 100644
--- a/libc/calls/mkdirat.c
+++ b/libc/calls/mkdirat.c
@@ -53,7 +53,7 @@
 int mkdirat(int dirfd, const char *path, unsigned mode) {
   int rc;
   if (_weaken(__zipos_notat) && (rc = __zipos_notat(dirfd, path)) == -1) {
-    rc = erofs();
+    STRACE("zipos mkdirat not supported yet");
   } else if (!IsWindows()) {
     rc = sys_mkdirat(dirfd, path, mode);
   } else {
diff --git a/libc/calls/mkntenvblock.c b/libc/calls/mkntenvblock.c
index a6738fa7a..ad47b28d2 100644
--- a/libc/calls/mkntenvblock.c
+++ b/libc/calls/mkntenvblock.c
@@ -144,7 +144,6 @@ textwindows int mkntenvblock(char16_t envblock[32767], char *const envp[],
   n = (CountStrings(envp) + CountStrings(extravars) + 1) * sizeof(char *);
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
   env.var = alloca(n);
   CheckLargeStackAllocation(env.var, n);
 #pragma GCC pop_options
diff --git a/libc/calls/mkntpath.c b/libc/calls/mkntpath.c
index 7c535de1a..f1ca7d153 100644
--- a/libc/calls/mkntpath.c
+++ b/libc/calls/mkntpath.c
@@ -20,7 +20,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/systeminfo.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/o.h"
@@ -55,19 +55,6 @@ textwindows size_t __normntpath(char16_t *p, size_t n) {
       // matched "/../" or "/..$"
       while (j && p[j - 1] == '\\')
         --j;
-      if (j && p[j - 1] == '.') {
-        // matched "." before
-        if (j >= 2 && p[j - 2] == '.' &&  //
-            (j == 2 || p[j - 3] == '\\')) {
-          // matched "^.." or "/.." before
-          p[++j] = '.';
-          ++j;
-          continue;
-        } else if (j == 1 || p[j - 2] == '\\') {
-          // matched "^." or "/." before
-          continue;
-        }
-      }
       while (j && p[j - 1] != '\\')
         --j;
     } else {
diff --git a/libc/calls/mkntpathat.c b/libc/calls/mkntpathat.c
index a263cd838..e1e845b55 100644
--- a/libc/calls/mkntpathat.c
+++ b/libc/calls/mkntpathat.c
@@ -18,9 +18,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/fileflagandattributes.h"
 #include "libc/nt/files.h"
 #include "libc/nt/thunk/msabi.h"
@@ -28,18 +27,6 @@
 #include "libc/sysv/consts/at.h"
 #include "libc/sysv/errfuns.h"
 
-static int IsAlpha(int c) {
-  return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
-}
-
-static bool IsAbsolutePathWin32(char16_t *path) {
-  if (path[0] == '\\')
-    return true;
-  if (IsAlpha(path[0]) && path[1] == ':')
-    return true;
-  return false;
-}
-
 static textwindows int __mkntpathath_impl(int64_t dirhand, const char *path,
                                           int flags,
                                           char16_t file[hasatleast PATH_MAX]) {
@@ -52,7 +39,7 @@ static textwindows int __mkntpathath_impl(int64_t dirhand, const char *path,
     return -1;
   if (!filelen)
     return enoent();
-  if (dirhand != AT_FDCWD && !IsAbsolutePathWin32(file)) {
+  if (file[0] != u'\\' && dirhand != AT_FDCWD) {  // ProTip: \\?\C:\foo
     dirlen = GetFinalPathNameByHandle(dirhand, dir, ARRAYLEN(dir),
                                       kNtFileNameNormalized | kNtVolumeNameDos);
     if (!dirlen)
@@ -62,20 +49,8 @@ static textwindows int __mkntpathath_impl(int64_t dirhand, const char *path,
     dir[dirlen] = u'\\';
     memcpy(dir + dirlen + 1, file, (filelen + 1) * sizeof(char16_t));
     memcpy(file, dir, ((n = dirlen + 1 + filelen) + 1) * sizeof(char16_t));
-    n = __normntpath(file, n);
-
-    // UNC paths break some things when they are not needed.
-    if (n > 4 && n < 260 &&  //
-        file[0] == '\\' &&   //
-        file[1] == '\\' &&   //
-        file[2] == '?' &&    //
-        file[3] == '\\') {
-      memmove(file, file + 4, (n - 4 + 1) * sizeof(char16_t));
-    }
-
-    return n;
+    return __normntpath(file, n);
   } else {
-    filelen = __normntpath(file, filelen);
     return filelen;
   }
 }
@@ -85,20 +60,23 @@ textwindows int __mkntpathath(int64_t dirhand, const char *path, int flags,
 
   // convert the path.
   int len;
-  if ((len = __mkntpathath_impl(dirhand, path, flags, file)) == -1)
+  if ((len = __mkntpathath_impl(dirhand, path, flags, file)) == -1) {
     return -1;
+  }
 
   // if path ends with a slash, then we need to manually do what linux
   // does and check to make sure it's a directory, and return ENOTDIR,
   // since WIN32 will reject the path with EINVAL if we don't do this.
   if (len && file[len - 1] == '\\') {
     uint32_t fattr;
-    if (len > 1 && !(len == 3 && file[1] == ':'))
+    if (len > 1 && !(len == 3 && file[1] == ':')) {
       file[--len] = 0;
+    }
     if ((fattr = GetFileAttributes(file)) != -1u &&
         !(fattr & kNtFileAttributeReparsePoint) &&
-        !(fattr & kNtFileAttributeDirectory))
+        !(fattr & kNtFileAttributeDirectory)) {
       return enotdir();
+    }
   }
 
   return len;
diff --git a/libc/calls/netbsdtramp.S b/libc/calls/netbsdtramp.S
index dddd3536b..01fdca769 100644
--- a/libc/calls/netbsdtramp.S
+++ b/libc/calls/netbsdtramp.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .privileged
 
 __restore_rt_netbsd:
diff --git a/libc/calls/ntspawn.c b/libc/calls/ntspawn.c
index cd7531148..392531915 100644
--- a/libc/calls/ntspawn.c
+++ b/libc/calls/ntspawn.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/proc/ntspawn.h"
-#include "libc/calls/state.internal.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/intrin/strace.h"
@@ -39,15 +38,12 @@
 #include "libc/nt/struct/procthreadattributelist.h"
 #include "libc/nt/struct/startupinfo.h"
 #include "libc/nt/struct/startupinfoex.h"
-#include "libc/nt/thunk/msabi.h"
 #include "libc/proc/ntspawn.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
 #ifdef __x86_64__
 
-__msabi extern typeof(CloseHandle) *const __imp_CloseHandle;
-
 struct SpawnBlock {
   char16_t path[PATH_MAX];
   char16_t cmdline[32767];
@@ -67,12 +63,10 @@ static textwindows ssize_t ntspawn_read(intptr_t fh, char *buf, size_t len) {
   bool ok;
   uint32_t got;
   struct NtOverlapped overlap = {.hEvent = CreateEvent(0, 0, 0, 0)};
-  ok = overlap.hEvent &&
-       (ReadFile(fh, buf, len, 0, &overlap) ||
+  ok = (ReadFile(fh, buf, len, 0, &overlap) ||
         GetLastError() == kNtErrorIoPending) &&
        GetOverlappedResult(fh, &overlap, &got, true);
-  if (overlap.hEvent)
-    __imp_CloseHandle(overlap.hEvent);
+  CloseHandle(overlap.hEvent);
   return ok ? got : -1;
 }
 
@@ -92,7 +86,7 @@ static textwindows int ntspawn2(struct NtSpawnArgs *a, struct SpawnBlock *sb) {
   if (fh == -1)
     return -1;
   ssize_t got = ntspawn_read(fh, p, pe - p);
-  __imp_CloseHandle(fh);
+  CloseHandle(fh);
   if (got < 3)
     return enoexec();
   pe = p + got;
@@ -159,7 +153,7 @@ static textwindows int ntspawn2(struct NtSpawnArgs *a, struct SpawnBlock *sb) {
   alignas(16) char memory[128];
   size_t size = sizeof(memory);
   struct NtProcThreadAttributeList *alist = (void *)memory;
-  uint32_t items = !!a->opt_hParentProcess + !!a->dwExplicitHandleCount;
+  uint32_t items = !!a->opt_hParentProcess + !!a->opt_lpExplicitHandleList;
   ok = InitializeProcThreadAttributeList(alist, items, 0, &size);
   if (!ok && GetLastError() == kNtErrorInsufficientBuffer) {
     ok = !!(alist = freeme = ntspawn_malloc(size));
@@ -172,7 +166,7 @@ static textwindows int ntspawn2(struct NtSpawnArgs *a, struct SpawnBlock *sb) {
         alist, 0, kNtProcThreadAttributeParentProcess, &a->opt_hParentProcess,
         sizeof(a->opt_hParentProcess), 0, 0);
   }
-  if (ok && a->dwExplicitHandleCount) {
+  if (ok && a->opt_lpExplicitHandleList) {
     ok = UpdateProcThreadAttribute(
         alist, 0, kNtProcThreadAttributeHandleList, a->opt_lpExplicitHandleList,
         a->dwExplicitHandleCount * sizeof(*a->opt_lpExplicitHandleList), 0, 0);
@@ -244,7 +238,6 @@ textwindows int ntspawn(struct NtSpawnArgs *args) {
   BLOCK_SIGNALS;
   if ((sb = ntspawn_malloc(sizeof(*sb)))) {
     rc = ntspawn2(args, sb);
-    ntspawn_free(sb);
   } else {
     rc = -1;
   }
diff --git a/libc/calls/open-nt.c b/libc/calls/open-nt.c
index 09061eb8a..c7339171b 100644
--- a/libc/calls/open-nt.c
+++ b/libc/calls/open-nt.c
@@ -24,8 +24,7 @@
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/errno.h"
-#include "libc/intrin/fds.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/console.h"
 #include "libc/nt/createfile.h"
 #include "libc/nt/enum/accessmask.h"
@@ -139,7 +138,6 @@ static textwindows int sys_open_nt_file(int dirfd, const char *file,
   int64_t handle;
   if ((handle = sys_open_nt_impl(dirfd, file, flags, mode,
                                  kNtFileFlagOverlapped)) != -1) {
-    g_fds.p[fd].cursor = __cursor_new();
     g_fds.p[fd].handle = handle;
     g_fds.p[fd].kind = kFdFile;
     g_fds.p[fd].flags = flags;
@@ -172,15 +170,15 @@ static textwindows int sys_open_nt_no_handle(int fd, int flags, int mode,
 
 static textwindows int sys_open_nt_dup(int fd, int flags, int mode, int oldfd) {
   int64_t handle;
-  if (!__isfdopen(oldfd))
+  if (!__isfdopen(oldfd)) {
     return enoent();
+  }
   if (DuplicateHandle(GetCurrentProcess(), g_fds.p[oldfd].handle,
                       GetCurrentProcess(), &handle, 0, true,
                       kNtDuplicateSameAccess)) {
     g_fds.p[fd] = g_fds.p[oldfd];
     g_fds.p[fd].handle = handle;
     g_fds.p[fd].mode = mode;
-    __cursor_ref(g_fds.p[fd].cursor);
     if (!sys_fcntl_nt_setfl(fd, flags)) {
       return fd;
     } else {
diff --git a/libc/calls/openat-metal.c b/libc/calls/openat-metal.c
index 647af1360..ec958f4c0 100644
--- a/libc/calls/openat-metal.c
+++ b/libc/calls/openat-metal.c
@@ -23,7 +23,7 @@
 #include "libc/calls/metalfile.internal.h"
 #include "libc/intrin/directmap.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/pc.internal.h"
 #include "libc/runtime/runtime.h"
@@ -49,9 +49,11 @@ int sys_openat_metal(int dirfd, const char *file, int flags, unsigned mode) {
   if ((fd = __reservefd(-1)) == -1)
     return -1;
   if (!_weaken(calloc) || !_weaken(free)) {
-    state = sys_mmap_metal(NULL, ROUNDUP(sizeof(struct MetalFile), 4096),
-                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
-                           -1, 0);
+    struct DirectMap dm;
+    dm = sys_mmap_metal(NULL, ROUNDUP(sizeof(struct MetalFile), 4096),
+                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1,
+                        0);
+    state = dm.addr;
     if (state == (void *)-1)
       return -1;
   } else {
diff --git a/libc/calls/park.c b/libc/calls/park.c
index 103a6cbdf..286c77555 100644
--- a/libc/calls/park.c
+++ b/libc/calls/park.c
@@ -18,97 +18,50 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
-#include "libc/calls/struct/sigset.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/fmt/wintime.internal.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/weaken.h"
-#include "libc/nt/events.h"
-#include "libc/nt/runtime.h"
 #include "libc/nt/synchronization.h"
-#include "libc/str/str.h"
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/posixthread.internal.h"
-
 #ifdef __x86_64__
 
-// returns 0 if deadline is reached
+// returns 0 on timeout or spurious wakeup
 // raises EINTR if a signal delivery interrupted wait operation
 // raises ECANCELED if this POSIX thread was canceled in masked mode
-textwindows static int _park_thread(struct timespec deadline, sigset_t waitmask,
+static textwindows int _park_thread(uint32_t msdelay, sigset_t waitmask,
                                     bool restartable) {
-  for (;;) {
-    uint32_t handl = 0;
-    intptr_t hands[2];
-
-    // create event object
-    intptr_t sigev;
-    if (!(sigev = CreateEvent(0, 0, 0, 0)))
-      return __winerr();
-    hands[handl++] = sigev;
-
-    // create high precision timer if needed
-    if (memcmp(&deadline, &timespec_max, sizeof(struct timespec))) {
-      intptr_t hTimer;
-      if ((hTimer = CreateWaitableTimer(NULL, true, NULL))) {
-        int64_t due = TimeSpecToWindowsTime(deadline);
-        if (SetWaitableTimer(hTimer, &due, 0, NULL, NULL, false)) {
-          hands[handl++] = hTimer;
-        } else {
-          CloseHandle(hTimer);
-        }
-      }
-    }
-
-    // perform wait operation
-    struct PosixThread *pt = _pthread_self();
-    pt->pt_event = sigev;
-    pt->pt_blkmask = waitmask;
-    atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_EVENT,
-                          memory_order_release);
-    //!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!//
-    int sig = 0;
-    uint32_t wi = 0;
-    if (!_is_canceled() &&
-        !(_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))))
-      wi = WaitForMultipleObjects(handl, hands, false, -1u);
-    //!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!//
-    atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-    for (int i = 0; i < handl; ++i)
-      CloseHandle(hands[i]);
-
-    // recursion is now safe
-    if (wi == 1)
-      return 0;
-    if (wi == -1u)
-      return __winerr();
-    int handler_was_called = 0;
-    if (!sig) {
-      if (_check_cancel())
-        return -1;
-      if (_weaken(__sig_get))
-        sig = _weaken(__sig_get)(waitmask);
-    }
-    if (sig)
-      handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
-    if (_check_cancel())
-      return -1;
-    if (handler_was_called & SIG_HANDLED_NO_RESTART)
-      return eintr();
-    if (handler_was_called & SIG_HANDLED_SA_RESTART)
-      if (!restartable)
-        return eintr();
+  int sig, handler_was_called;
+  if (_check_cancel() == -1)
+    return -1;
+  if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
+    goto HandleSignal;
   }
+  int expect = 0;
+  atomic_int futex = 0;
+  struct PosixThread *pt = _pthread_self();
+  pt->pt_blkmask = waitmask;
+  atomic_store_explicit(&pt->pt_blocker, &futex, memory_order_release);
+  bool32 ok = WaitOnAddress(&futex, &expect, sizeof(int), msdelay);
+  atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
+  if (ok && _weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
+  HandleSignal:
+    handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
+    if (_check_cancel() == -1)
+      return -1;
+    if (!restartable || (handler_was_called & SIG_HANDLED_NO_RESTART)) {
+      return eintr();
+    }
+  }
+  return 0;
 }
 
-textwindows int _park_norestart(struct timespec deadline, sigset_t waitmask) {
-  return _park_thread(deadline, waitmask, false);
+textwindows int _park_norestart(uint32_t msdelay, sigset_t waitmask) {
+  return _park_thread(msdelay, waitmask, false);
 }
 
-textwindows int _park_restartable(struct timespec deadline, sigset_t waitmask) {
-  return _park_thread(deadline, waitmask, true);
+textwindows int _park_restartable(uint32_t msdelay, sigset_t waitmask) {
+  return _park_thread(msdelay, waitmask, true);
 }
 
 #endif /* __x86_64__ */
diff --git a/libc/calls/parsepromises.c b/libc/calls/parsepromises.c
index c12b2ee33..af4770e06 100644
--- a/libc/calls/parsepromises.c
+++ b/libc/calls/parsepromises.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/pledge.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 
 static int FindPromise(const char *name) {
diff --git a/libc/calls/pause-nt.c b/libc/calls/pause-nt.c
index 28e5e4184..0a43e5089 100644
--- a/libc/calls/pause-nt.c
+++ b/libc/calls/pause-nt.c
@@ -17,21 +17,14 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
-#include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/struct/timespec.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #ifdef __x86_64__
 
 textwindows int sys_pause_nt(void) {
-  // we don't strictly need to block signals, but it reduces signal
-  // delivery latency, by preventing other threads from delivering a
-  // signal asynchronously. it takes about ~5us to deliver a signal
-  // using SetEvent() whereas it takes ~30us to use SuspendThread(),
-  // GetThreadContext(), SetThreadContext(), and ResumeThread().
-  BLOCK_SIGNALS;
-  _park_norestart(timespec_max, 0);
-  ALLOW_SIGNALS;
-  return -1;
+  int rc;
+  while (!(rc = _park_norestart(-1u, 0)))
+    donothing;
+  return rc;
 }
 
 #endif /* __x86_64__ */
diff --git a/libc/calls/pipe-nt.c b/libc/calls/pipe-nt.c
index 40a16c375..d2ed971e0 100644
--- a/libc/calls/pipe-nt.c
+++ b/libc/calls/pipe-nt.c
@@ -56,6 +56,7 @@ static textwindows int sys_pipe_nt_impl(int pipefd[2], unsigned flags) {
   __fds_unlock();
   hin = CreateNamedPipe(pipename, kNtPipeAccessInbound | kNtFileFlagOverlapped,
                         mode, 1, PIPE_BUF, PIPE_BUF, 0, &kNtIsInheritable);
+  __fds_lock();
   if (hin != -1) {
     if ((hout = CreateFile(
              pipename, kNtGenericWrite,
@@ -72,6 +73,7 @@ static textwindows int sys_pipe_nt_impl(int pipefd[2], unsigned flags) {
       g_fds.p[writer].handle = hout;
       pipefd[0] = reader;
       pipefd[1] = writer;
+      __fds_unlock();
       return 0;
     } else {
       CloseHandle(hin);
@@ -79,6 +81,7 @@ static textwindows int sys_pipe_nt_impl(int pipefd[2], unsigned flags) {
   }
   __releasefd(writer);
   __releasefd(reader);
+  __fds_unlock();
   return -1;
 }
 
diff --git a/libc/calls/pledge-linux.c b/libc/calls/pledge-linux.c
index 3ea63946c..fbafd3d1e 100644
--- a/libc/calls/pledge-linux.c
+++ b/libc/calls/pledge-linux.c
@@ -29,7 +29,7 @@
 #include "libc/intrin/bsr.h"
 #include "libc/intrin/likely.h"
 #include "libc/intrin/promises.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/sysv/consts/audit.h"
@@ -694,7 +694,6 @@ static const uint16_t kPledgeStdio[] = {
     __NR_linux_sched_getaffinity,  //
     __NR_linux_sched_setaffinity,  //
     __NR_linux_sigtimedwait,       //
-    __NR_linux_getcpu,             //
 };
 
 static const uint16_t kPledgeFlock[] = {
@@ -713,7 +712,6 @@ static const uint16_t kPledgeRpath[] = {
 #endif                             //
     __NR_linux_fstat,              //
     __NR_linux_fstatat,            //
-    __NR_linux_statx,              //
 #ifdef __NR_linux_access           //
     __NR_linux_access,             //
 #endif                             //
@@ -741,7 +739,6 @@ static const uint16_t kPledgeWpath[] = {
     __NR_linux_lstat,               //
 #endif                              //
     __NR_linux_fstatat,             //
-    __NR_linux_statx,               //
 #ifdef __NR_linux_access            //
     __NR_linux_access,              //
 #endif                              //
@@ -1008,15 +1005,16 @@ static const struct sock_filter kPledgeStart[] = {
     BPF_STMT(BPF_LD | BPF_W | BPF_ABS, OFF(nr)),
 #ifdef __NR_linux_memfd_secret
     // forbid some system calls with ENOSYS (rather than EPERM)
-    BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, __NR_linux_memfd_secret, 4, 0),
+    BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, __NR_linux_memfd_secret, 5, 0),
 #else
     BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, __NR_linux_landlock_restrict_self + 1,
-             4, 0),
+             5, 0),
 #endif
-    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_rseq, 3, 0),
-    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_memfd_create, 2, 0),
-    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_openat2, 1, 0),
-    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_clone3, 0, 1),
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_rseq, 4, 0),
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_memfd_create, 3, 0),
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_openat2, 2, 0),
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_clone3, 1, 0),
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_linux_statx, 0, 1),
     BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (Enosys & SECCOMP_RET_DATA)),
 };
 
diff --git a/libc/calls/pledge.c b/libc/calls/pledge.c
index 812844502..b5a6dbb63 100644
--- a/libc/calls/pledge.c
+++ b/libc/calls/pledge.c
@@ -169,7 +169,8 @@
  *   turn APE binaries into static native binaries.
  *
  * - "prot_exec" allows mmap(PROT_EXEC) and mprotect(PROT_EXEC). This is
- *   needed to launch non-static or non-native executables, e.g.
+ *   needed to (1) code morph mutexes in __enable_threads(), and it's
+ *   needed to (2) launch non-static or non-native executables, e.g.
  *   non-assimilated APE binaries, or dynamic-linked executables.
  *
  * - "unveil" allows unveil() to be called, as well as the underlying
@@ -232,21 +233,6 @@
  *   option might not be a good idea if you're pledging `exec` because
  *   subprocesses can't inherit the `SIGSYS` handler this installs.
  *
- * If you experience crashes during startup when execve'ing a cosmo
- * binary that's had permissions like rpath pledged away, then try doing
- * this before calling execve. This prevents special startup checks.
- *
- *     putenv("COMDBG=program.dbg");
- *
- * If having pledge() security is mission critical, then add this code
- * to the start of your main() function to ensure your program fails
- * with an error if it isn't available.
- *
- *     if (pledge(0, 0)) {
- *       fprintf(stderr, "error: OS doesn't support pledge() security\n");
- *       exit(1);
- *     }
- *
  * @return 0 on success, or -1 w/ errno
  * @raise ENOSYS if `pledge(0, 0)` was used and security is not possible
  * @raise EINVAL if `execpromises` on Linux isn't a subset of `promises`
diff --git a/libc/calls/poll-nt.c b/libc/calls/poll-nt.c
index cc015f045..ac1e64c7e 100644
--- a/libc/calls/poll-nt.c
+++ b/libc/calls/poll-nt.c
@@ -16,358 +16,219 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/state.internal.h"
-#include "libc/calls/struct/sigset.h"
+#include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/struct/timespec.h"
-#include "libc/calls/struct/timespec.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/fds.h"
-#include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/intrin/strace.h"
+#include "libc/macros.internal.h"
+#include "libc/mem/mem.h"
 #include "libc/nt/console.h"
 #include "libc/nt/enum/filetype.h"
-#include "libc/nt/enum/wait.h"
 #include "libc/nt/errors.h"
-#include "libc/nt/events.h"
 #include "libc/nt/files.h"
 #include "libc/nt/ipc.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/pollfd.h"
 #include "libc/nt/synchronization.h"
-#include "libc/nt/time.h"
+#include "libc/nt/thread.h"
+#include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
+#include "libc/runtime/runtime.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/struct/pollfd.h"
+#include "libc/sock/struct/pollfd.internal.h"
+#include "libc/stdio/sysparam.h"
 #include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/sicode.h"
+#include "libc/sysv/consts/poll.h"
+#include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/posixthread.internal.h"
+#include "libc/thread/tls.h"
 #ifdef __x86_64__
 
-// <sync libc/sysv/consts.sh>
-#define POLLERR_    0x0001  // implied in events
-#define POLLHUP_    0x0002  // implied in events
-#define POLLNVAL_   0x0004  // implied in events
-#define POLLIN_     0x0300
-#define POLLRDNORM_ 0x0100
-#define POLLRDBAND_ 0x0200
-#define POLLOUT_    0x0010
-#define POLLWRNORM_ 0x0010
-#define POLLWRBAND_ 0x0020  // MSDN undocumented
-#define POLLPRI_    0x0400  // MSDN unsupported
-// </sync libc/sysv/consts.sh>
-
-textwindows static uint32_t sys_poll_nt_waitms(struct timespec deadline) {
-  struct timespec now = sys_clock_gettime_monotonic_nt();
-  if (timespec_cmp(now, deadline) < 0) {
-    struct timespec remain = timespec_sub(deadline, now);
-    int64_t millis = timespec_tomillis(remain);
-    uint32_t waitfor = MIN(millis, 0xffffffffu);
-    return MIN(waitfor, POLL_INTERVAL_MS);
-  } else {
-    return 0;  // we timed out
-  }
-}
+#define POLL_INTERVAL_MS 10
 
 // Polls on the New Technology.
 //
 // This function is used to implement poll() and select(). You may poll
 // on sockets, files and the console at the same time. We also poll for
 // both signals and posix thread cancelation, while the poll is polling
-textwindows static int sys_poll_nt_actual(struct pollfd *fds, uint64_t nfds,
-                                          struct timespec deadline,
-                                          sigset_t waitmask) {
-  int fileindices[64];
-  int sockindices[64];
-  int64_t filehands[64];
-  struct PosixThread *pt;
-  int i, rc, ev, kind, gotsocks;
+static textwindows int sys_poll_nt_impl(struct pollfd *fds, uint64_t nfds,
+                                        uint32_t *ms, sigset_t sigmask) {
+  bool ok;
+  uint64_t millis;
+  uint32_t cm, avail, waitfor;
+  struct sys_pollfd_nt pipefds[8];
   struct sys_pollfd_nt sockfds[64];
-  uint32_t cm, fi, sn, pn, avail, waitfor, already_slept;
+  int pipeindices[ARRAYLEN(pipefds)];
+  int sockindices[ARRAYLEN(sockfds)];
+  struct timespec started, deadline, remain, now;
+  int i, rc, sn, pn, gotinvals, gotpipes, gotsocks;
 
-  // ensure revents is cleared
-  for (i = 0; i < nfds; ++i)
-    fds[i].revents = 0;
+  started = timespec_real();
+  deadline = timespec_add(started, timespec_frommillis(ms ? *ms : -1u));
 
-  // divide files from sockets
-  // check for invalid file descriptors
+  // do the planning
+  // we need to read static variables
+  // we might need to spawn threads and open pipes
   __fds_lock();
-  for (rc = sn = pn = i = 0; i < nfds; ++i) {
+  for (gotinvals = rc = sn = pn = i = 0; i < nfds; ++i) {
     if (fds[i].fd < 0)
       continue;
     if (__isfdopen(fds[i].fd)) {
-      kind = g_fds.p[fds[i].fd].kind;
-      if (kind == kFdSocket) {
-        // we can use WSAPoll() for these fds
+      if (__isfdkind(fds[i].fd, kFdSocket)) {
         if (sn < ARRAYLEN(sockfds)) {
-          // WSAPoll whines if we pass POLLNVAL, POLLHUP, or POLLERR.
+          // the magnums for POLLIN/OUT/PRI on NT include the other ones too
+          // we need to clear ones like POLLNVAL or else WSAPoll shall whine
           sockindices[sn] = i;
           sockfds[sn].handle = g_fds.p[fds[i].fd].handle;
-          sockfds[sn].events =
-              fds[i].events & (POLLRDNORM_ | POLLRDBAND_ | POLLWRNORM_);
+          sockfds[sn].events = fds[i].events & (POLLPRI | POLLIN | POLLOUT);
           sockfds[sn].revents = 0;
           ++sn;
         } else {
-          // too many sockets
-          rc = einval();
+          // too many socket fds
+          rc = enomem();
           break;
         }
-      } else if (kind == kFdFile || kind == kFdConsole) {
-        // we can use WaitForMultipleObjects() for these fds
-        if (pn < ARRAYLEN(fileindices) - 1) {  // last slot for signal event
-          fileindices[pn] = i;
-          filehands[pn] = g_fds.p[fds[i].fd].handle;
-          ++pn;
-        } else {
-          // too many files
-          rc = einval();
-          break;
-        }
-      } else if (kind == kFdDevNull || kind == kFdDevRandom || kind == kFdZip) {
-        // we can't wait on these kinds via win32
-        if (fds[i].events & (POLLRDNORM_ | POLLWRNORM_)) {
-          // the linux kernel does this irrespective of oflags
-          fds[i].revents = fds[i].events & (POLLRDNORM_ | POLLWRNORM_);
-        }
-      } else {
-        // unsupported file type
-        fds[i].revents = POLLNVAL_;
-      }
-    } else {
-      // file not open
-      fds[i].revents = POLLNVAL_;
-    }
-    rc += !!fds[i].revents;
-  }
-  __fds_unlock();
-  if (rc == -1)
-    return rc;
-
-  // perform poll operation
-  for (;;) {
-
-    // check input status of pipes / consoles without blocking
-    // this ensures any socket fds won't starve them of events
-    // we can't poll file handles, so we just mark those ready
-    for (i = 0; i < pn; ++i) {
-      fi = fileindices[i];
-      ev = fds[fi].events;
-      ev &= POLLRDNORM_ | POLLWRNORM_;
-      if ((g_fds.p[fds[fi].fd].flags & O_ACCMODE) == O_RDONLY)
-        ev &= ~POLLWRNORM_;
-      if ((g_fds.p[fds[fi].fd].flags & O_ACCMODE) == O_WRONLY)
-        ev &= ~POLLRDNORM_;
-      if ((ev & POLLWRNORM_) && !(ev & POLLRDNORM_)) {
-        fds[fi].revents = fds[fi].events & (POLLRDNORM_ | POLLWRNORM_);
-      } else if (GetFileType(filehands[i]) == kNtFileTypePipe) {
-        if (PeekNamedPipe(filehands[i], 0, 0, 0, &avail, 0)) {
-          if (avail)
-            fds[fi].revents = POLLRDNORM_;
-        } else if (GetLastError() == kNtErrorHandleEof ||
-                   GetLastError() == kNtErrorBrokenPipe) {
-          fds[fi].revents = POLLHUP_;
-        } else {
-          fds[fi].revents = POLLERR_;
-        }
-      } else if (GetConsoleMode(filehands[i], &cm)) {
-        switch (CountConsoleInputBytes()) {
-          case 0:
-            fds[fi].revents = fds[fi].events & POLLWRNORM_;
+      } else if (pn < ARRAYLEN(pipefds)) {
+        pipeindices[pn] = i;
+        pipefds[pn].handle = g_fds.p[fds[i].fd].handle;
+        pipefds[pn].events = 0;
+        pipefds[pn].revents = 0;
+        switch (g_fds.p[fds[i].fd].flags & O_ACCMODE) {
+          case O_RDONLY:
+            pipefds[pn].events = fds[i].events & POLLIN;
             break;
-          case -1:
-            fds[fi].revents = POLLHUP_;
+          case O_WRONLY:
+            pipefds[pn].events = fds[i].events & POLLOUT;
+            break;
+          case O_RDWR:
+            pipefds[pn].events = fds[i].events & (POLLIN | POLLOUT);
             break;
           default:
-            fds[fi].revents = fds[fi].events & (POLLRDNORM_ | POLLWRNORM_);
             break;
         }
+        ++pn;
       } else {
-        fds[fi].revents = fds[fi].events & (POLLRDNORM_ | POLLWRNORM_);
-      }
-      rc += !!fds[fi].revents;
-    }
-
-    // determine how long to wait
-    waitfor = sys_poll_nt_waitms(deadline);
-
-    // check for events and/or readiness on sockets
-    // we always do this due to issues with POLLOUT
-    if (sn) {
-      // if we need to wait, then we prefer to wait inside WSAPoll()
-      // this ensures network events are received in ~10µs not ~10ms
-      if (!rc && waitfor) {
-        if (__sigcheck(waitmask, false))
-          return -1;
-        already_slept = waitfor;
-      } else {
-        already_slept = 0;
-      }
-      if ((gotsocks = WSAPoll(sockfds, sn, already_slept)) == -1)
-        return __winsockerr();
-      if (gotsocks) {
-        for (i = 0; i < sn; ++i)
-          if (sockfds[i].revents) {
-            fds[sockindices[i]].revents = sockfds[i].revents;
-            ++rc;
-          }
-      } else if (already_slept) {
-        if (__sigcheck(waitmask, false))
-          return -1;
+        // too many non-socket fds
+        rc = enomem();
+        break;
       }
     } else {
-      already_slept = 0;
+      ++gotinvals;
     }
+  }
+  __fds_unlock();
+  if (rc) {
+    // failed to create a polling solution
+    return rc;
+  }
 
-    // return if we observed events
-    if (rc || !waitfor)
-      break;
-
-    // if nothing has happened and we haven't already waited in poll()
-    // then we can wait on consoles, pipes, and signals simultaneously
-    // this ensures low latency for apps like emacs which with no sock
-    // here we shall actually report that something can be written too
-    if (!already_slept) {
-      intptr_t sigev;
-      if (!(sigev = CreateEvent(0, 0, 0, 0)))
-        return __winerr();
-      filehands[pn] = sigev;
-      pt = _pthread_self();
-      pt->pt_event = sigev;
-      pt->pt_blkmask = waitmask;
-      atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_EVENT,
-                            memory_order_release);
-      //!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!//
-      int sig = 0;
-      uint32_t wi = pn;
-      if (!_is_canceled() &&
-          !(_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))))
-        wi = WaitForMultipleObjects(pn + 1, filehands, 0, waitfor);
-      //!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!/!//
-      atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-      CloseHandle(sigev);
-      if (wi == -1u)
-        // win32 wait failure
-        return __winerr();
-      if (wi == pn) {
-        // our signal event was signalled
-        int handler_was_called = 0;
-        if (sig)
-          handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
-        if (_check_cancel() == -1)
-          return -1;
-        if (handler_was_called)
-          return eintr();
-      } else if ((wi ^ kNtWaitAbandoned) < pn) {
-        // this is possibly because a process or thread was killed
-        fds[fileindices[wi ^ kNtWaitAbandoned]].revents = POLLERR_;
-        ++rc;
-      } else if (wi < pn) {
-        fi = fileindices[wi];
-        // one of the handles we polled is ready for fi/o
-        if (GetConsoleMode(filehands[wi], &cm)) {
-          switch (CountConsoleInputBytes()) {
-            case 0:
-              // it's possible there was input and it was handled by the
-              // ICANON reader, and therefore should not be reported yet
-              if (fds[fi].events & POLLWRNORM_)
-                fds[fi].revents = POLLWRNORM_;
-              break;
-            case -1:
-              fds[fi].revents = POLLHUP_;
-              break;
-            default:
-              fds[fi].revents = fds[fi].events & (POLLRDNORM_ | POLLWRNORM_);
-              break;
-          }
-        } else if (GetFileType(filehands[wi]) == kNtFileTypePipe) {
-          if ((fds[fi].events & POLLRDNORM_) &&
-              (g_fds.p[fds[fi].fd].flags & O_ACCMODE) != O_WRONLY) {
-            if (PeekNamedPipe(filehands[wi], 0, 0, 0, &avail, 0)) {
-              fds[fi].revents = fds[fi].events & (POLLRDNORM_ | POLLWRNORM_);
-            } else if (GetLastError() == kNtErrorHandleEof ||
-                       GetLastError() == kNtErrorBrokenPipe) {
-              fds[fi].revents = POLLHUP_;
-            } else {
-              fds[fi].revents = POLLERR_;
+  // perform the i/o and sleeping and looping
+  for (;;) {
+    // see if input is available on non-sockets
+    for (gotpipes = i = 0; i < pn; ++i) {
+      if (pipefds[i].events & POLLOUT) {
+        // we have no way of polling if a non-socket is writeable yet
+        // therefore we assume that if it can happen, it shall happen
+        pipefds[i].revents |= POLLOUT;
+      }
+      if (pipefds[i].events & POLLIN) {
+        if (GetFileType(pipefds[i].handle) == kNtFileTypePipe) {
+          ok = PeekNamedPipe(pipefds[i].handle, 0, 0, 0, &avail, 0);
+          POLLTRACE("PeekNamedPipe(%ld, 0, 0, 0, [%'u], 0) → %hhhd% m",
+                    pipefds[i].handle, avail, ok);
+          if (ok) {
+            if (avail) {
+              pipefds[i].revents |= POLLIN;
             }
           } else {
-            fds[fi].revents = fds[fi].events & (POLLRDNORM_ | POLLWRNORM_);
+            pipefds[i].revents |= POLLERR;
+          }
+        } else if (GetConsoleMode(pipefds[i].handle, &cm)) {
+          if (CountConsoleInputBytes()) {
+            pipefds[i].revents |= POLLIN;  // both >0 and -1 (eof) are pollin
           }
         } else {
-          fds[fi].revents = fds[fi].events & (POLLRDNORM_ | POLLWRNORM_);
+          // we have no way of polling if a non-socket is readable yet
+          // therefore we assume that if it can happen it shall happen
+          pipefds[i].revents |= POLLIN;
+        }
+      }
+      if (pipefds[i].revents) {
+        ++gotpipes;
+      }
+    }
+    // if we haven't found any good results yet then here we
+    // compute a small time slice we don't mind sleeping for
+    if (sn) {
+      if ((gotsocks = WSAPoll(sockfds, sn, 0)) == -1) {
+        return __winsockerr();
+      }
+    } else {
+      gotsocks = 0;
+    }
+
+    // add some artificial delay, which we use as an opportunity to also
+    // check for pending signals, thread cancelation, etc.
+    waitfor = 0;
+    if (!gotinvals && !gotsocks && !gotpipes) {
+      now = timespec_real();
+      if (timespec_cmp(now, deadline) < 0) {
+        remain = timespec_sub(deadline, now);
+        millis = timespec_tomillis(remain);
+        waitfor = MIN(millis, 0xffffffffu);
+        waitfor = MIN(waitfor, POLL_INTERVAL_MS);
+        if (waitfor) {
+          POLLTRACE("poll() sleeping for %'d out of %'lu ms", waitfor,
+                    timespec_tomillis(remain));
+          if ((rc = _park_norestart(waitfor, sigmask)) == -1) {
+            return -1;  // eintr, ecanceled, etc.
+          }
         }
-        rc += !!fds[fi].revents;
-      } else {
-        // should only be possible on kNtWaitTimeout or semaphore abandoned
-        // keep looping for events and we'll catch timeout when appropriate
       }
     }
 
-    // once again, return if we observed events
-    if (rc)
+    // we gave all the sockets and all the named pipes a shot
+    // if we found anything at all then it's time to end work
+    if (gotinvals || gotpipes || gotsocks || !waitfor) {
       break;
+    }
   }
 
-  return rc;
-}
-
-textwindows static int sys_poll_nt_impl(struct pollfd *fds, uint64_t nfds,
-                                        struct timespec deadline,
-                                        const sigset_t waitmask) {
-  int i, n, rc, got = 0;
-  struct timespec now, next, target;
-
-  // we normally don't check for signals until we decide to wait, since
-  // it's nice to have functions like write() be unlikely to EINTR, but
-  // ppoll is a function where users are surely thinking about signals,
-  // since ppoll actually allows them to block signals everywhere else.
-  if (__sigcheck(waitmask, false))
-    return -1;
-
-  // fast path
-  if (nfds <= 63)
-    return sys_poll_nt_actual(fds, nfds, deadline, waitmask);
-
-  // clumsy path
-  for (;;) {
-    for (i = 0; i < nfds; i += 64) {
-      n = nfds - i;
-      n = n > 64 ? 64 : n;
-      rc = sys_poll_nt_actual(fds + i, n, timespec_zero, waitmask);
-      if (rc == -1)
-        return -1;
-      got += rc;
-    }
-    if (got)
-      return got;
-    now = sys_clock_gettime_monotonic_nt();
-    if (timespec_cmp(now, deadline) >= 0)
-      return 0;
-    next = timespec_add(now, timespec_frommillis(POLL_INTERVAL_MS));
-    if (timespec_cmp(next, deadline) >= 0) {
-      target = deadline;
+  // the system call is going to succeed
+  // it's now ok to start setting the output memory
+  for (i = 0; i < nfds; ++i) {
+    if (fds[i].fd < 0 || __isfdopen(fds[i].fd)) {
+      fds[i].revents = 0;
     } else {
-      target = next;
+      fds[i].revents = POLLNVAL;
     }
-    if (_park_norestart(target, waitmask) == -1)
-      return -1;
   }
+  for (i = 0; i < pn; ++i) {
+    fds[pipeindices[i]].revents = pipefds[i].revents;
+  }
+  for (i = 0; i < sn; ++i) {
+    fds[sockindices[i]].revents = sockfds[i].revents;
+  }
+
+  // and finally return
+  return gotinvals + gotpipes + gotsocks;
 }
 
-textwindows int sys_poll_nt(struct pollfd *fds, uint64_t nfds,
-                            const struct timespec *relative,
+textwindows int sys_poll_nt(struct pollfd *fds, uint64_t nfds, uint32_t *ms,
                             const sigset_t *sigmask) {
   int rc;
-  struct timespec now, timeout, deadline;
   BLOCK_SIGNALS;
-  now = relative ? sys_clock_gettime_monotonic_nt() : timespec_zero;
-  timeout = relative ? *relative : timespec_max;
-  deadline = timespec_add(now, timeout);
-  rc = sys_poll_nt_impl(fds, nfds, deadline, sigmask ? *sigmask : _SigMask);
+  rc = sys_poll_nt_impl(fds, nfds, ms, sigmask ? *sigmask : 0);
   ALLOW_SIGNALS;
   return rc;
 }
diff --git a/libc/calls/poll.c b/libc/calls/poll.c
index e547a41cf..5e13677a5 100644
--- a/libc/calls/poll.c
+++ b/libc/calls/poll.c
@@ -16,65 +16,69 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timespec.h"
+#include "libc/calls/cp.internal.h"
+#include "libc/dce.h"
+#include "libc/intrin/strace.h"
 #include "libc/sock/struct/pollfd.h"
+#include "libc/sock/struct/pollfd.internal.h"
+#include "libc/stdckdint.h"
+#include "libc/sysv/errfuns.h"
 
 /**
- * Checks status on multiple file descriptors at once.
+ * Waits for something to happen on multiple file descriptors at once.
  *
- * Servers that need to handle an unbounded number of client connections
- * should just create a separate thread for each client. poll() isn't a
- * scalable i/o solution on any platform.
+ * Warning: XNU has an inconsistency with other platforms. If you have
+ * pollfds with fd≥0 and none of the meaningful events flags are added
+ * e.g. POLLIN then XNU won't check for POLLNVAL. This matters because
+ * one of the use-cases for poll() is quickly checking for open files.
  *
- * One of the use cases for poll() is to quickly check if a number of
- * file descriptors are valid. The canonical way to do this is to set
- * events to 0 which prevents blocking and causes only the invalid,
- * hangup, and error statuses to be checked.
+ * Note: Polling works best on Windows for sockets. We're able to poll
+ * input on named pipes. But for anything that isn't a socket, or pipe
+ * with POLLIN, (e.g. regular file) then POLLIN/POLLOUT are always set
+ * into revents if they're requested, provided they were opened with a
+ * mode that permits reading and/or writing.
  *
- * On XNU, the POLLHUP and POLLERR statuses aren't checked unless either
- * POLLIN, POLLOUT, or POLLPRI are specified in the events field. Cosmo
- * will however polyfill the checking of POLLNVAL on XNU with the events
- * doesn't specify any of the above i/o events.
- *
- * When XNU and BSD OSes report POLLHUP, they will always set POLLIN too
- * when POLLIN is requested, even in cases when there isn't unread data.
- *
- * Your poll() function will check the status of all file descriptors
- * before returning. This function won't block unless none of the fds
- * had had any reportable status.
- *
- * The impact shutdown() will have on poll() is a dice roll across OSes.
+ * Note: Windows has a limit of 64 file descriptors and ENOMEM with -1
+ * is returned if that limit is exceeded. In practice the limit is not
+ * this low. For example, pollfds with fd<0 don't count. So the caller
+ * could flip the sign bit with a short timeout, to poll a larger set.
  *
  * @param fds[𝑖].fd should be a socket, input pipe, or conosle input
- *     and if it's a negative number then the entry is ignored, plus
- *     revents will be set to zero
+ *     and if it's a negative number then the entry is ignored
  * @param fds[𝑖].events flags can have POLLIN, POLLOUT, POLLPRI,
  *     POLLRDNORM, POLLWRNORM, POLLRDBAND, POLLWRBAND as well as
  *     POLLERR, POLLHUP, and POLLNVAL although the latter are
  *     always implied (assuming fd≥0) so they're ignored here
- * @param timeout_ms if 0 means don't wait and negative waits forever
- * @return number of `fds` whose revents field has been set to a nonzero
- *     number, 0 if the timeout elapsed without events, or -1 w/ errno
+ * @param timeout_ms if 0 means don't wait and -1 means wait forever
+ * @return number of items fds whose revents field has been set to
+ *     nonzero to describe its events, or 0 if the timeout elapsed,
+ *     or -1 w/ errno
  * @return fds[𝑖].revents is always zero initializaed and then will
  *     be populated with POLL{IN,OUT,PRI,HUP,ERR,NVAL} if something
  *     was determined about the file descriptor
  * @raise ECANCELED if thread was cancelled in masked mode
- * @raise EINVAL if `nfds` exceeded `RLIMIT_NOFILE`
- * @raise ENOMEM on failure to allocate memory
  * @raise EINTR if signal was delivered
  * @cancelationpoint
  * @asyncsignalsafe
  * @norestart
  */
 int poll(struct pollfd *fds, size_t nfds, int timeout_ms) {
-  struct timespec ts;
-  struct timespec *tsp;
-  if (timeout_ms >= 0) {
-    ts.tv_sec = timeout_ms / 1000;
-    ts.tv_nsec = timeout_ms % 1000 * 1000000;
-    tsp = &ts;
+  int rc;
+  BEGIN_CANCELATION_POINT;
+
+  if (!IsWindows()) {
+    if (!IsMetal()) {
+      rc = sys_poll(fds, nfds, timeout_ms);
+    } else {
+      rc = sys_poll_metal(fds, nfds, timeout_ms);
+    }
   } else {
-    tsp = 0;
+    uint32_t ms = timeout_ms >= 0 ? timeout_ms : -1u;
+    rc = sys_poll_nt(fds, nfds, &ms, 0);
   }
-  return ppoll(fds, nfds, tsp, 0);
+
+  END_CANCELATION_POINT;
+  STRACE("poll(%s, %'zu, %'d) → %d% lm", DescribePollFds(rc, fds, nfds), nfds,
+         timeout_ms, rc);
+  return rc;
 }
diff --git a/libc/calls/ppoll.c b/libc/calls/ppoll.c
index d961a27f4..a50cfc5e0 100644
--- a/libc/calls/ppoll.c
+++ b/libc/calls/ppoll.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
 #include "libc/calls/cp.internal.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/sigset.internal.h"
@@ -25,108 +24,14 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/strace.h"
-#include "libc/limits.h"
-#include "libc/runtime/stack.h"
 #include "libc/sock/struct/pollfd.h"
 #include "libc/sock/struct/pollfd.internal.h"
 #include "libc/stdckdint.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/f.h"
-#include "libc/sysv/consts/poll.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
 
-static int ppoll_impl(struct pollfd *fds, size_t nfds,
-                      const struct timespec *timeout, const sigset_t *sigmask) {
-  int e, fdcount;
-  sigset_t oldmask;
-  struct timespec ts, *tsp;
-
-  // validate timeout
-  if (timeout && timeout->tv_nsec >= 1000000000ull)
-    return einval();
-
-  // The OpenBSD poll() man pages claims it'll ignore POLLERR, POLLHUP,
-  // and POLLNVAL in pollfd::events except it doesn't actually do this.
-  size_t bytes = 0;
-  struct pollfd *fds2 = 0;
-  if (IsOpenbsd()) {
-    if (ckd_mul(&bytes, nfds, sizeof(struct pollfd)))
-      return einval();
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
-    fds2 = alloca(bytes);
-#pragma GCC pop_options
-    CheckLargeStackAllocation(fds2, bytes);
-    memcpy(fds2, fds, bytes);
-    for (size_t i = 0; i < nfds; ++i)
-      fds2[i].events &= ~(POLLERR | POLLHUP | POLLNVAL);
-    struct pollfd *swap = fds;
-    fds = fds2;
-    fds2 = swap;
-  }
-
-  if (!IsWindows()) {
-    e = errno;
-    if (timeout) {
-      ts = *timeout;
-      tsp = &ts;
-    } else {
-      tsp = 0;
-    }
-    fdcount = sys_ppoll(fds, nfds, tsp, sigmask, 8);
-    if (fdcount == -1 && errno == ENOSYS) {
-      int64_t ms;
-      errno = e;
-      if (timeout) {
-        ms = timespec_tomillis(*timeout);
-        if (ms > INT_MAX)
-          ms = -1;
-      } else {
-        ms = -1;
-      }
-      if (sigmask)
-        sys_sigprocmask(SIG_SETMASK, sigmask, &oldmask);
-      fdcount = sys_poll(fds, nfds, ms);
-      if (sigmask)
-        sys_sigprocmask(SIG_SETMASK, &oldmask, 0);
-    }
-  } else {
-    fdcount = sys_poll_nt(fds, nfds, timeout, sigmask);
-  }
-
-  if (IsOpenbsd() && fdcount != -1) {
-    struct pollfd *swap = fds;
-    fds = fds2;
-    fds2 = swap;
-    memcpy(fds, fds2, bytes);
-  }
-
-  // One of the use cases for poll() is checking if a large number of
-  // file descriptors exist. However on XNU if none of the meaningful
-  // event flags are specified (e.g. POLLIN, POLLOUT) then it doesn't
-  // perform the POLLNVAL check that's implied on all other platforms
-  if (IsXnu() && fdcount != -1) {
-    for (size_t i = 0; i < nfds; ++i) {
-      if (fds[i].fd >= 0 &&   //
-          !fds[i].revents &&  //
-          !(fds[i].events & (POLLIN | POLLOUT | POLLPRI))) {
-        int err = errno;
-        if (fcntl(fds[i].fd, F_GETFL) == -1) {
-          errno = err;
-          fds[i].revents = POLLNVAL;
-          ++fdcount;
-        }
-      }
-    }
-  }
-
-  return fdcount;
-}
-
 /**
- * Checks status on multiple file descriptors at once.
+ * Waits for something to happen on multiple file descriptors at once.
  *
  * This function is the same as saying:
  *
@@ -136,54 +41,17 @@ static int ppoll_impl(struct pollfd *fds, size_t nfds,
  *     sigprocmask(SIG_SETMASK, old, 0);
  *
  * Except it happens atomically when the kernel supports doing that. On
- * kernels such as XNU and NetBSD which don't, this wrapper will fall
- * back to using the example above. If you need ironclad assurances of
- * signal mask atomicity, then consider using pselect() which Cosmo Libc
- * guarantees to be atomic on all supported platforms.
+ * kernel such as XNU and NetBSD which don't, this wrapper will fall
+ * back to using the example above. Consider using pselect() which is
+ * atomic on all supported platforms.
  *
- * Servers that need to handle an unbounded number of client connections
- * should just create a separate thread for each client. poll(), ppoll()
- * and select() aren't scalable i/o solutions on any platform.
+ * The Linux Kernel modifies the timeout parameter. This wrapper gives
+ * it a local variable due to POSIX requiring that `timeout` be const.
+ * If you need that information from the Linux Kernel use sys_ppoll().
  *
- * On Windows it's only possible to poll 64 file descriptors at a time;
- * it's a limitation imposed by WSAPoll(). Cosmopolitan Libc's ppoll()
- * polyfill can go higher in some cases; for example, It's possible to
- * poll 64 sockets and 64 pipes/terminals at the same time. Furthermore,
- * elements whose fd field is set to a negative number are ignored and
- * will not count against this limit.
- *
- * One of the use cases for poll() is to quickly check if a number of
- * file descriptors are valid. The canonical way to do this is to set
- * events to 0 which prevents blocking and causes only the invalid,
- * hangup, and error statuses to be checked.
- *
- * On XNU, the POLLHUP and POLLERR statuses aren't checked unless either
- * POLLIN, POLLOUT, or POLLPRI are specified in the events field. Cosmo
- * will however polyfill the checking of POLLNVAL on XNU with the events
- * doesn't specify any of the above i/o events.
- *
- * When XNU and BSD OSes report POLLHUP, they will always set POLLIN too
- * when POLLIN is requested, even in cases when there isn't unread data.
- *
- * @param fds[𝑖].fd should be a socket, input pipe, or conosle input
- *     and if it's a negative number then the entry is ignored, plus
- *     revents will be set to zero
- * @param fds[𝑖].events flags can have POLLIN, POLLOUT, POLLPRI,
- *     POLLRDNORM, POLLWRNORM, POLLRDBAND, POLLWRBAND as well as
- *     POLLERR, POLLHUP, and POLLNVAL although the latter are
- *     always implied (assuming fd≥0) so they're ignored here
- * @param timeout_ms if 0 means don't wait and negative waits forever
- * @return number of `fds` whose revents field has been set to a nonzero
- *     number, 0 if the timeout elapsed without events, or -1 w/ errno
- * @return fds[𝑖].revents is always zero initializaed and then will
- *     be populated with POLL{IN,OUT,PRI,HUP,ERR,NVAL} if something
- *     was determined about the file descriptor
  * @param timeout if null will block indefinitely
  * @param sigmask may be null in which case no mask change happens
  * @raise ECANCELED if thread was cancelled in masked mode
- * @raise EINVAL if `nfds` exceeded `RLIMIT_NOFILE`
- * @raise ENOMEM on failure to allocate memory
- * @raise EINVAL if `*timeout` is invalid
  * @raise EINTR if signal was delivered
  * @cancelationpoint
  * @asyncsignalsafe
@@ -191,12 +59,44 @@ static int ppoll_impl(struct pollfd *fds, size_t nfds,
  */
 int ppoll(struct pollfd *fds, size_t nfds, const struct timespec *timeout,
           const sigset_t *sigmask) {
-  int fdcount;
+  int e, rc;
+  sigset_t oldmask;
+  struct timespec ts, *tsp;
   BEGIN_CANCELATION_POINT;
-  fdcount = ppoll_impl(fds, nfds, timeout, sigmask);
+
+  if (!IsWindows()) {
+    e = errno;
+    if (timeout) {
+      ts = *timeout;
+      tsp = &ts;
+    } else {
+      tsp = 0;
+    }
+    rc = sys_ppoll(fds, nfds, tsp, sigmask, 8);
+    if (rc == -1 && errno == ENOSYS) {
+      int ms;
+      errno = e;
+      if (!timeout || ckd_add(&ms, timeout->tv_sec,
+                              (timeout->tv_nsec + 999999) / 1000000)) {
+        ms = -1;
+      }
+      if (sigmask)
+        sys_sigprocmask(SIG_SETMASK, sigmask, &oldmask);
+      rc = poll(fds, nfds, ms);
+      if (sigmask)
+        sys_sigprocmask(SIG_SETMASK, &oldmask, 0);
+    }
+  } else {
+    uint32_t ms;
+    if (!timeout ||
+        ckd_add(&ms, timeout->tv_sec, (timeout->tv_nsec + 999999) / 1000000)) {
+      ms = -1u;
+    }
+    rc = sys_poll_nt(fds, nfds, &ms, sigmask);
+  }
+
   END_CANCELATION_POINT;
-  STRACE("ppoll(%s, %'zu, %s, %s) → %d% lm",
-         DescribePollFds(fdcount, fds, nfds), nfds,
-         DescribeTimespec(0, timeout), DescribeSigset(0, sigmask), fdcount);
-  return fdcount;
+  STRACE("ppoll(%s, %'zu, %s, %s) → %d% lm", DescribePollFds(rc, fds, nfds),
+         nfds, DescribeTimespec(0, timeout), DescribeSigset(0, sigmask), rc);
+  return rc;
 }
diff --git a/libc/calls/pread.c b/libc/calls/pread.c
index ee42b55bd..0064397cb 100644
--- a/libc/calls/pread.c
+++ b/libc/calls/pread.c
@@ -26,7 +26,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/zipos.internal.h"
 #include "libc/stdio/sysparam.h"
diff --git a/libc/calls/preadv.c b/libc/calls/preadv.c
index 21ac113e0..83a6f2a86 100644
--- a/libc/calls/preadv.c
+++ b/libc/calls/preadv.c
@@ -60,7 +60,6 @@ static ssize_t Preadv(int fd, struct iovec *iov, int iovlen, int64_t off) {
       struct iovec *iov2;
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
       iov2 = alloca(iovlen * sizeof(struct iovec));
       CheckLargeStackAllocation(iov2, iovlen * sizeof(struct iovec));
 #pragma GCC pop_options
diff --git a/libc/calls/printfds.c b/libc/calls/printfds.c
index 4786357ef..36cb548d7 100644
--- a/libc/calls/printfds.c
+++ b/libc/calls/printfds.c
@@ -18,8 +18,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
-#include "libc/intrin/describeflags.h"
 #include "libc/intrin/fds.h"
+#include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
 
 static const char *__fdkind2str(int x) {
@@ -55,7 +55,7 @@ void __printfds(struct Fd *fds, size_t fdslen) {
       continue;
     kprintf("%3d %s", i, __fdkind2str(fds[i].kind));
     if (fds[i].flags) {
-      kprintf(" flags=%s", _DescribeOpenFlags(buf, fds[i].flags));
+      kprintf(" flags=%s", (DescribeOpenFlags)(buf, fds[i].flags));
     }
     if (fds[i].mode)
       kprintf(" mode=%#o", fds[i].mode);
diff --git a/libc/calls/program_executable_name_init.S b/libc/calls/program_executable_name_init.S
index 262018e25..99ed4db3e 100644
--- a/libc/calls/program_executable_name_init.S
+++ b/libc/calls/program_executable_name_init.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.init.start 305,_init_program_executable_name
 	push	%rdi
diff --git a/libc/calls/pselect.c b/libc/calls/pselect.c
index 93c7f495f..9d3036a4c 100644
--- a/libc/calls/pselect.c
+++ b/libc/calls/pselect.c
@@ -32,7 +32,7 @@
 #include "libc/sysv/errfuns.h"
 
 /**
- * Checks status on multiple file descriptors at once.
+ * Does what poll() does except with bitset API.
  *
  * This function is the same as saying:
  *
@@ -41,23 +41,15 @@
  *     select(nfds, readfds, writefds, exceptfds, timeout);
  *     sigprocmask(SIG_SETMASK, old, 0);
  *
- * Except it happens atomically. Unlike ppoll() Cosmo guarantees this is
- * atomic on all supported platforms.
+ * Except it happens atomically.
+ *
+ * The Linux Kernel modifies the timeout parameter. This wrapper gives
+ * it a local variable due to POSIX requiring that `timeout` be const.
+ * If you need that information from the Linux Kernel use sys_pselect.
+ *
+ * This system call is supported on all platforms. It's like select()
+ * except that it atomically changes the sigprocmask() during the op.
  *
- * @param nfds is the number of the highest file descriptor set in these
- *     bitsets by the caller, plus one; this value can't be greater than
- *     `FD_SETSIZE` which Cosmopolitan currently defines as 1024 because
- *     `fd_set` has a static size
- * @param readfds may be used to be notified when you can call read() on
- *     a file descriptor without it blocking; this includes when data is
- *     is available to be read as well as eof and error conditions
- * @param writefds may be used to be notified when write() may be called
- *     on a file descriptor without it blocking
- * @param exceptfds may be used to be notified of exceptional conditions
- *     such as out-of-band data on a socket; it is equivalent to POLLPRI
- *     in the revents of poll()
- * @param timeout if null will block indefinitely
- * @param sigmask may be null in which case no mask change happens
  * @raise ECANCELED if thread was cancelled in masked mode
  * @raise EINTR if signal was delivered
  * @cancelationpoint
@@ -67,6 +59,7 @@
 int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
             const struct timespec *timeout, const sigset_t *sigmask) {
   int rc;
+  struct timeval tv, *tvp;
   struct timespec ts, *tsp;
   struct {
     const sigset_t *s;
@@ -81,7 +74,7 @@ int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
   fd_set *old_exceptfds_ptr = 0;
 
   BEGIN_CANCELATION_POINT;
-  if (nfds < 0 || nfds > FD_SETSIZE) {
+  if (nfds < 0) {
     rc = einval();
   } else {
     if (readfds) {
@@ -110,17 +103,24 @@ int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
       rc = sys_pselect(nfds, readfds, writefds, exceptfds,
                        (struct timespec *)timeout, sigmask);
     } else {
-      rc = sys_select_nt(nfds, readfds, writefds, exceptfds, timeout, sigmask);
+      if (timeout) {
+        tv.tv_sec = timeout->tv_sec;
+        tv.tv_usec = timeout->tv_nsec / 1000;
+        tvp = &tv;
+      } else {
+        tvp = 0;
+      }
+      rc = sys_select_nt(nfds, readfds, writefds, exceptfds, tvp, sigmask);
     }
   }
   END_CANCELATION_POINT;
 
   STRACE("pselect(%d, %s → [%s], %s → [%s], %s → [%s], %s, %s) → %d% m", nfds,
-         DescribeFdSet(0, nfds, old_readfds_ptr),
+         DescribeFdSet(rc, nfds, old_readfds_ptr),
          DescribeFdSet(rc, nfds, readfds),
-         DescribeFdSet(0, nfds, old_writefds_ptr),
+         DescribeFdSet(rc, nfds, old_writefds_ptr),
          DescribeFdSet(rc, nfds, writefds),
-         DescribeFdSet(0, nfds, old_exceptfds_ptr),
+         DescribeFdSet(rc, nfds, old_exceptfds_ptr),
          DescribeFdSet(rc, nfds, exceptfds),  //
          DescribeTimespec(0, timeout),        //
          DescribeSigset(0, sigmask), rc);
diff --git a/libc/calls/pwrite.c b/libc/calls/pwrite.c
index c76c1d01a..e1f030def 100644
--- a/libc/calls/pwrite.c
+++ b/libc/calls/pwrite.c
@@ -26,7 +26,7 @@
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/sysparam.h"
 #include "libc/sysv/errfuns.h"
 
diff --git a/libc/calls/pwritev.c b/libc/calls/pwritev.c
index 3373a8f16..d2d8cd043 100644
--- a/libc/calls/pwritev.c
+++ b/libc/calls/pwritev.c
@@ -62,7 +62,6 @@ static ssize_t Pwritev(int fd, const struct iovec *iov, int iovlen,
       struct iovec *iov2;
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
       iov2 = alloca(iovlen * sizeof(struct iovec));
       CheckLargeStackAllocation(iov2, iovlen * sizeof(struct iovec));
 #pragma GCC pop_options
diff --git a/libc/calls/read-nt.c b/libc/calls/read-nt.c
index b50f428e2..bd460a4f3 100644
--- a/libc/calls/read-nt.c
+++ b/libc/calls/read-nt.c
@@ -16,15 +16,12 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/createfileflags.internal.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/state.internal.h"
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/calls/struct/timespec.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/cosmo.h"
 #include "libc/ctype.h"
@@ -33,11 +30,10 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/dll.h"
 #include "libc/intrin/fds.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/intrin/nomultics.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/console.h"
 #include "libc/nt/createfile.h"
 #include "libc/nt/enum/accessmask.h"
@@ -47,8 +43,6 @@
 #include "libc/nt/enum/vk.h"
 #include "libc/nt/enum/wait.h"
 #include "libc/nt/errors.h"
-#include "libc/nt/events.h"
-#include "libc/nt/memory.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/inputrecord.h"
 #include "libc/nt/synchronization.h"
@@ -88,15 +82,15 @@ struct VirtualKey {
 #define S(s) W(s "\0\0")
 #define W(s) (s[3] << 24 | s[2] << 16 | s[1] << 8 | s[0])
 
-static struct VirtualKey kVirtualKey[] = {
-    {kNtVkUp, S("A"), S("1;2A"), S("1;5A"), S("1;6A")},     // order matters
-    {kNtVkDown, S("B"), S("1;2B"), S("1;5B"), S("1;6B")},   // order matters
-    {kNtVkRight, S("C"), S("1;2C"), S("1;5C"), S("1;6C")},  // order matters
-    {kNtVkLeft, S("D"), S("1;2D"), S("1;5D"), S("1;6D")},   // order matters
-    {kNtVkEnd, S("F"), S("1;2F"), S("1;5F"), S("1;6F")},    // order matters
-    {kNtVkHome, S("H"), S("1;2H"), S("1;5H"), S("1;6H")},   // order matters
+static const struct VirtualKey kVirtualKey[] = {
+    {kNtVkUp, S("A"), S("1;2A"), S("1;5A"), S("1;6A")},
+    {kNtVkDown, S("B"), S("1;2B"), S("1;5B"), S("1;6B")},
+    {kNtVkRight, S("C"), S("1;2C"), S("1;5C"), S("1;6C")},
+    {kNtVkLeft, S("D"), S("1;2D"), S("1;5D"), S("1;6D")},
     {kNtVkInsert, S("2~"), S("2;2~"), S("2;5~"), S("2;6~")},
     {kNtVkDelete, S("3~"), S("3;2~"), S("3;5~"), S("3;6~")},
+    {kNtVkHome, S("H"), S("1;2H"), S("1;5H"), S("1;6H")},
+    {kNtVkEnd, S("F"), S("1;2F"), S("1;5F"), S("1;6F")},
     {kNtVkPrior, S("5~"), S("5;2~"), S("5;5~"), S("5;6~")},
     {kNtVkNext, S("6~"), S("6;2~"), S("6;5~"), S("6;6~")},
     {kNtVkF1, -S("OP"), S("1;2P"), S("11^"), S("1;6P")},
@@ -114,6 +108,17 @@ static struct VirtualKey kVirtualKey[] = {
     {0},
 };
 
+// TODO: How can we configure `less` to not need this bloat?
+static const struct VirtualKey kDecckm[] = {
+    {kNtVkUp, -S("OA"), -S("OA"), S("A"), S("A")},
+    {kNtVkDown, -S("OB"), -S("OB"), S("B"), S("B")},
+    {kNtVkRight, -S("OC"), -S("OC"), S("C"), S("C")},
+    {kNtVkLeft, -S("OD"), -S("OD"), S("D"), S("D")},
+    {kNtVkPrior, S("5~"), S("5;2~"), S("5;5~"), S("5;6~")},
+    {kNtVkNext, S("6~"), S("6;2~"), S("6;5~"), S("6;6~")},
+    {0},
+};
+
 #define KEYSTROKE_CONTAINER(e) DLL_CONTAINER(struct Keystroke, elem, e)
 
 struct Keystroke {
@@ -126,97 +131,79 @@ struct Keystrokes {
   atomic_uint once;
   bool end_of_file;
   bool ohno_decckm;
-  bool bypass_mode;
   uint16_t utf16hs;
-  size_t free_keys;
+  int16_t freekeys;
   int64_t cin, cot;
   struct Dll *list;
   struct Dll *line;
   struct Dll *free;
+  pthread_mutex_t lock;
+  const struct VirtualKey *vkt;
+  struct Keystroke pool[512];
 };
 
 static struct Keystrokes __keystroke;
-static pthread_mutex_t __keystroke_lock = PTHREAD_MUTEX_INITIALIZER;
 
-textwindows void sys_read_nt_wipe_keystrokes(void) {
+textwindows void WipeKeystrokes(void) {
   bzero(&__keystroke, sizeof(__keystroke));
-  _pthread_mutex_wipe_np(&__keystroke_lock);
 }
 
-textwindows static void FreeKeystrokeImpl(struct Dll *key) {
+static textwindows void FreeKeystrokeImpl(struct Dll *key) {
   dll_make_first(&__keystroke.free, key);
-  ++__keystroke.free_keys;
+  ++__keystroke.freekeys;
 }
 
-textwindows static struct Keystroke *AllocKeystroke(void) {
-  struct Keystroke *k;
-  if (!(k = HeapAlloc(GetProcessHeap(), 0, sizeof(struct Keystroke))))
-    return 0;
-  dll_init(&k->elem);
-  return k;
-}
-
-textwindows static struct Keystroke *NewKeystroke(void) {
-  struct Dll *e;
-  struct Keystroke *k;
-  if ((e = dll_first(__keystroke.free))) {
-    dll_remove(&__keystroke.free, e);
-    k = KEYSTROKE_CONTAINER(e);
-    --__keystroke.free_keys;
-  } else {
-    // PopulateKeystrokes() should make this branch impossible
-    if (!(k = AllocKeystroke()))
-      return 0;
-  }
+static textwindows struct Keystroke *NewKeystroke(void) {
+  struct Dll *e = dll_first(__keystroke.free);
+  struct Keystroke *k = KEYSTROKE_CONTAINER(e);
+  dll_remove(&__keystroke.free, &k->elem);
+  --__keystroke.freekeys;
+  // TODO(jart): What's wrong with GCC 12.3?
+  asm("" : "+r"(k));
   k->buflen = 0;
   return k;
 }
 
-textwindows static void FreeKeystroke(struct Dll **list, struct Dll *key) {
+static textwindows void FreeKeystroke(struct Dll **list, struct Dll *key) {
   dll_remove(list, key);
   FreeKeystrokeImpl(key);
 }
 
-textwindows static void FreeKeystrokes(struct Dll **list) {
+static textwindows void FreeKeystrokes(struct Dll **list) {
   struct Dll *key;
-  while ((key = dll_first(*list)))
+  while ((key = dll_first(*list))) {
     FreeKeystroke(list, key);
-}
-
-textwindows static void PopulateKeystrokes(size_t want) {
-  struct Keystroke *k;
-  while (__keystroke.free_keys < want) {
-    if ((k = AllocKeystroke())) {
-      FreeKeystrokeImpl(&k->elem);
-    } else {
-      break;
-    }
   }
 }
 
-textwindows static void OpenConsole(void) {
+static textwindows void OpenConsole(void) {
+  __keystroke.vkt = kVirtualKey;
   __keystroke.cin = CreateFile(u"CONIN$", kNtGenericRead | kNtGenericWrite,
                                kNtFileShareRead, 0, kNtOpenExisting, 0, 0);
   __keystroke.cot = CreateFile(u"CONOUT$", kNtGenericRead | kNtGenericWrite,
                                kNtFileShareWrite, 0, kNtOpenExisting, 0, 0);
+  for (int i = 0; i < ARRAYLEN(__keystroke.pool); ++i) {
+    dll_init(&__keystroke.pool[i].elem);
+    FreeKeystrokeImpl(&__keystroke.pool[i].elem);
+  }
 }
 
-textwindows static int AddSignal(int sig) {
+static textwindows int AddSignal(int sig) {
   atomic_fetch_or_explicit(&__get_tls()->tib_sigpending, 1ull << (sig - 1),
                            memory_order_relaxed);
   return 0;
 }
 
-textwindows static void InitConsole(void) {
+static textwindows void InitConsole(void) {
   cosmo_once(&__keystroke.once, OpenConsole);
 }
 
-textwindows static void LockKeystrokes(void) {
-  _pthread_mutex_lock(&__keystroke_lock);
+static textwindows void LockKeystrokes(void) {
+  pthread_mutex_lock(&__keystroke.lock);
 }
 
-textwindows static void UnlockKeystrokes(void) {
-  _pthread_mutex_unlock(&__keystroke_lock);
+static textwindows void UnlockKeystrokes(void) {
+  pthread_mutex_unlock(&__keystroke.lock);
 }
 
 textwindows int64_t GetConsoleInputHandle(void) {
@@ -229,39 +216,40 @@ textwindows int64_t GetConsoleOutputHandle(void) {
   return __keystroke.cot;
 }
 
-textwindows static bool IsMouseModeCommand(int x) {
+static textwindows bool IsMouseModeCommand(int x) {
   return x == 1000 ||  // SET_VT200_MOUSE
          x == 1002 ||  // SET_BTN_EVENT_MOUSE
          x == 1006 ||  // SET_SGR_EXT_MODE_MOUSE
          x == 1015;    // SET_URXVT_EXT_MODE_MOUSE
 }
 
-textwindows static int GetVirtualKey(uint16_t vk, bool shift, bool ctrl) {
-  for (int i = 0; kVirtualKey[i].vk; ++i) {
-    if (kVirtualKey[i].vk == vk) {
+static textwindows int GetVirtualKey(uint16_t vk, bool shift, bool ctrl) {
+  for (int i = 0; __keystroke.vkt[i].vk; ++i) {
+    if (__keystroke.vkt[i].vk == vk) {
       if (shift && ctrl) {
-        return kVirtualKey[i].shift_ctrl_str;
+        return __keystroke.vkt[i].shift_ctrl_str;
       } else if (shift) {
-        return kVirtualKey[i].shift_str;
+        return __keystroke.vkt[i].shift_str;
       } else if (ctrl) {
-        return kVirtualKey[i].ctrl_str;
+        return __keystroke.vkt[i].ctrl_str;
       } else {
-        return kVirtualKey[i].normal_str;
+        return __keystroke.vkt[i].normal_str;
       }
     }
   }
   return 0;
 }
 
-textwindows static int ProcessKeyEvent(const struct NtInputRecord *r, char *p) {
+static textwindows int ProcessKeyEvent(const struct NtInputRecord *r, char *p) {
 
   uint32_t c = r->Event.KeyEvent.uChar.UnicodeChar;
   uint16_t vk = r->Event.KeyEvent.wVirtualKeyCode;
   uint16_t cks = r->Event.KeyEvent.dwControlKeyState;
 
   // ignore keyup events
-  if (!r->Event.KeyEvent.bKeyDown && (!c || vk != kNtVkMenu))
+  if (!r->Event.KeyEvent.bKeyDown && (!c || vk != kNtVkMenu)) {
     return 0;
+  }
 
 #if 0
   // this code is useful for troubleshooting why keys don't work
@@ -321,41 +309,40 @@ textwindows static int ProcessKeyEvent(const struct NtInputRecord *r, char *p) {
     __keystroke.utf16hs = c;
     return 0;
   }
-  if (IsLowSurrogate(c))
+  if (IsLowSurrogate(c)) {
     c = MergeUtf16(__keystroke.utf16hs, c);
+  }
 
   // enter sends \r with raw terminals
   // make it a multics newline instead
-  if (c == '\r' && !(__ttyconf.magic & kTtyNoCr2Nl))
+  if (c == '\r' && !(__ttyconf.magic & kTtyNoCr2Nl)) {
     c = '\n';
+  }
 
   // ctrl-space (^@) is literally zero
-  if (c == ' ' && (cks & (kNtLeftCtrlPressed | kNtRightCtrlPressed)))
+  if (c == ' ' && (cks & (kNtLeftCtrlPressed | kNtRightCtrlPressed))) {
     c = '\0';
+  }
 
   // make backspace (^?) distinguishable from ctrl-h (^H)
-  if (c == kNtVkBack && !(cks & (kNtLeftCtrlPressed | kNtRightCtrlPressed)))
+  if (c == kNtVkBack && !(cks & (kNtLeftCtrlPressed | kNtRightCtrlPressed))) {
     c = 0177;
+  }
 
   // handle ctrl-\ and ctrl-c
   // note we define _POSIX_VDISABLE as zero
   // tcsetattr() lets anyone reconfigure these keybindings
-  if (c && !(__ttyconf.magic & kTtyNoIsigs) && !__keystroke.bypass_mode) {
-    char b[] = {c};
+  if (c && !(__ttyconf.magic & kTtyNoIsigs)) {
     if (c == __ttyconf.vintr) {
-      EchoConsoleNt(b, 1, false);
       return AddSignal(SIGINT);
     } else if (c == __ttyconf.vquit) {
-      EchoConsoleNt(b, 1, false);
       return AddSignal(SIGQUIT);
     }
   }
 
   // handle ctrl-d which generates end-of-file, unless pending line data
   // is present, in which case we flush that without the newline instead
-  if (c && c == __ttyconf.veof &&  //
-      !__keystroke.bypass_mode &&  //
-      !(__ttyconf.magic & kTtyUncanon)) {
+  if (c && c == __ttyconf.veof && !(__ttyconf.magic & kTtyUncanon)) {
     if (dll_is_empty(__keystroke.line)) {
       __keystroke.end_of_file = true;
     } else {
@@ -385,7 +372,7 @@ textwindows static int ProcessKeyEvent(const struct NtInputRecord *r, char *p) {
 //   - write(1, "\e[?1000;1002;1015;1006h") to enable
 //   - write(1, "\e[?1000;1002;1015;1006l") to disable
 // See o//examples/ttyinfo and o//tool/viz/life
-textwindows static int ProcessMouseEvent(const struct NtInputRecord *r,
+static textwindows int ProcessMouseEvent(const struct NtInputRecord *r,
                                          char *b) {
   char *p = b;
   unsigned char e = 0;
@@ -410,21 +397,21 @@ textwindows static int ProcessMouseEvent(const struct NtInputRecord *r,
                   kNtLeftAltPressed | kNtRightAltPressed))) {
       // we disable mouse highlighting when the tty is put in raw mode
       // to mouse wheel events with widely understood vt100 arrow keys
-      for (int i = 0; i < 3; ++i) {
-        *p++ = 033;
-        *p++ = !__keystroke.ohno_decckm ? '[' : 'O';
-        if (isup) {
-          *p++ = 'A';
-        } else {
-          *p++ = 'B';
-        }
+      *p++ = 033;
+      *p++ = !__keystroke.ohno_decckm ? '[' : 'O';
+      if (isup) {
+        *p++ = 'A';
+      } else {
+        *p++ = 'B';
       }
     }
   } else if ((bs || currentbs) && (__ttyconf.magic & kTtyXtMouse)) {
-    if (bs && (ev & kNtMouseMoved) && currentbs)
+    if (bs && (ev & kNtMouseMoved) && currentbs) {
       e |= 32;  // dragging
-    if ((bs | currentbs) & kNtRightmostButtonPressed)
+    }
+    if ((bs | currentbs) & kNtRightmostButtonPressed) {
       e |= 2;  // right
+    }
   OutputXtermMouseEvent:
     *p++ = 033;
     *p++ = '[';
@@ -444,7 +431,7 @@ textwindows static int ProcessMouseEvent(const struct NtInputRecord *r,
   return p - b;
 }
 
-textwindows static int ConvertConsoleInputToAnsi(const struct NtInputRecord *r,
+static textwindows int ConvertConsoleInputToAnsi(const struct NtInputRecord *r,
                                                  char p[hasatleast 23]) {
   switch (r->EventType) {
     case kNtKeyEvent:
@@ -458,19 +445,18 @@ textwindows static int ConvertConsoleInputToAnsi(const struct NtInputRecord *r,
   }
 }
 
-textwindows static void WriteTty(const char *p, size_t n) {
+static textwindows void WriteTty(const char *p, size_t n) {
   WriteFile(__keystroke.cot, p, n, 0, 0);
 }
 
-textwindows static bool IsCtl(int c, bool escape_harder) {
-  return isascii(c) && iscntrl(c) &&
-         (escape_harder || (c != '\n' && c != '\t'));
+static textwindows bool IsCtl(int c) {
+  return isascii(c) && iscntrl(c) && c != '\n' && c != '\t';
 }
 
-textwindows static void WriteCtl(const char *p, size_t n, bool escape_harder) {
+static textwindows void WriteCtl(const char *p, size_t n) {
   size_t i;
   for (i = 0; i < n; ++i) {
-    if (IsCtl(p[i], escape_harder)) {
+    if (IsCtl(p[i])) {
       char ctl[2];
       ctl[0] = '^';
       ctl[1] = p[i] ^ 0100;
@@ -481,23 +467,19 @@ textwindows static void WriteCtl(const char *p, size_t n, bool escape_harder) {
   }
 }
 
-textwindows void EchoConsoleNt(const char *p, size_t n, bool escape_harder) {
-  InitConsole();
-  if (!(__ttyconf.magic & kTtySilence)) {
-    if (__ttyconf.magic & kTtyEchoRaw) {
-      WriteTty(p, n);
-    } else {
-      WriteCtl(p, n, escape_harder);
-    }
+static textwindows void EchoTty(const char *p, size_t n) {
+  if (__ttyconf.magic & kTtyEchoRaw) {
+    WriteTty(p, n);
+  } else {
+    WriteCtl(p, n);
   }
 }
 
-textwindows static void EraseCharacter(bool should_echo) {
-  if (should_echo)
-    WriteTty("\b \b", 3);
+static textwindows void EraseCharacter(void) {
+  WriteTty("\b \b", 3);
 }
 
-textwindows static bool EraseKeystroke(bool should_echo) {
+static textwindows bool EraseKeystroke(void) {
   struct Dll *e;
   if ((e = dll_last(__keystroke.line))) {
     struct Keystroke *k = KEYSTROKE_CONTAINER(e);
@@ -505,9 +487,10 @@ textwindows static bool EraseKeystroke(bool should_echo) {
     for (int i = k->buflen; i--;) {
       if ((k->buf[i] & 0300) == 0200)
         continue;  // utf-8 cont
-      EraseCharacter(should_echo);
-      if (!(__ttyconf.magic & kTtyEchoRaw) && IsCtl(k->buf[i], true))
-        EraseCharacter(should_echo);
+      EraseCharacter();
+      if (!(__ttyconf.magic & kTtyEchoRaw) && IsCtl(k->buf[i])) {
+        EraseCharacter();
+      }
     }
     return true;
   } else {
@@ -515,133 +498,42 @@ textwindows static bool EraseKeystroke(bool should_echo) {
   }
 }
 
-textwindows static int IsLookingAtSpace(void) {
-  struct Dll *e;
-  if ((e = dll_last(__keystroke.line))) {
-    struct Keystroke *k = KEYSTROKE_CONTAINER(e);
-    return k->buflen == 1 && isascii(k->buf[0]) && isspace(k->buf[0]);
-  } else {
-    return -1;
-  }
-}
-
-textwindows static void IngestConsoleInputRecord(struct NtInputRecord *r) {
+static textwindows void IngestConsoleInputRecord(struct NtInputRecord *r) {
 
   // convert win32 console event into ansi
   int len;
   char buf[23];
-  if (!(len = ConvertConsoleInputToAnsi(r, buf)))
+  if (!(len = ConvertConsoleInputToAnsi(r, buf))) {
     return;
-
-  // handle ctrl-v in canonical mode
-  // the next keystroke will bypass input processing
-  if (!(__ttyconf.magic & kTtyUncanon) &&   // ICANON
-      !(__ttyconf.magic & kTtyNoIexten)) {  // IEXTEN
-    if (__keystroke.bypass_mode) {
-      struct Keystroke *k = NewKeystroke();
-      if (!k)
-        return;
-      memcpy(k->buf, buf, sizeof(k->buf));
-      k->buflen = len;
-      dll_make_last(&__keystroke.line, &k->elem);
-      EchoConsoleNt(buf, len, true);
-      __keystroke.bypass_mode = false;
-      return;
-    } else if (len == 1 && buf[0] &&  //
-               (buf[0] & 255) == __ttyconf.vlnext) {
-      __keystroke.bypass_mode = true;
-      if (!(__ttyconf.magic & kTtySilence) &&  // ECHO
-          !(__ttyconf.magic & kTtyEchoRaw))    // ECHOCTL
-        WriteTty("^\b", 2);
-      return;
-    }
   }
 
   // handle backspace in canonical mode
   if (len == 1 && buf[0] &&                  //
       (buf[0] & 255) == __ttyconf.verase &&  //
-      !(__ttyconf.magic & kTtyUncanon) &&    //
-      !(__ttyconf.magic & kTtyNoIexten)) {
-    bool should_visually_erase =             //
-        !(__ttyconf.magic & kTtySilence) &&  // ECHO
-        !(__ttyconf.magic & kTtyNoEchoe);    // ECHOE
-    EraseKeystroke(should_visually_erase);
-    if (!(__ttyconf.magic & kTtySilence) &&  // ECHO
-        (__ttyconf.magic & kTtyNoEchoe) &&   // !ECHOE
-        !(__ttyconf.magic & kTtyEchoRaw))    // ECHOCTL
-      WriteCtl(buf, len, true);
-    return;
-  }
-
-  // handle ctrl-w in canonical mode
-  // this lets you erase the last word
-  if (len == 1 && buf[0] &&                   //
-      (buf[0] & 255) == __ttyconf.vwerase &&  //
-      !(__ttyconf.magic & kTtyUncanon) &&     //
-      !(__ttyconf.magic & kTtyNoIexten)) {
-    bool should_visually_erase =             //
-        !(__ttyconf.magic & kTtySilence) &&  // ECHO
-        !(__ttyconf.magic & kTtyNoEchoe);    // ECHOE
-    while (IsLookingAtSpace() == 1)
-      EraseKeystroke(should_visually_erase);
-    while (IsLookingAtSpace() == 0)
-      EraseKeystroke(should_visually_erase);
-    if (!(__ttyconf.magic & kTtySilence) &&  // ECHO
-        (__ttyconf.magic & kTtyNoEchoe) &&   // !ECHOE
-        !(__ttyconf.magic & kTtyEchoRaw))    // ECHOCTL
-      WriteCtl(buf, len, true);
+      !(__ttyconf.magic & kTtyUncanon)) {
+    EraseKeystroke();
     return;
   }
 
   // handle kill in canonical mode
-  // this clears the line you're editing
   if (len == 1 && buf[0] &&                 //
       (buf[0] & 255) == __ttyconf.vkill &&  //
-      !(__ttyconf.magic & kTtyUncanon) &&   //
-      !(__ttyconf.magic & kTtyNoIexten)) {
-    bool should_visually_kill =              //
-        !(__ttyconf.magic & kTtySilence) &&  // ECHO
-        !(__ttyconf.magic & kTtyNoEchok) &&  // ECHOK
-        !(__ttyconf.magic & kTtyNoEchoke);   // ECHOKE
-    while (EraseKeystroke(should_visually_kill)) {
-    }
-    if (!(__ttyconf.magic & kTtySilence) &&  // ECHO
-        !(__ttyconf.magic & kTtyNoEchok) &&  // ECHOK
-        (__ttyconf.magic & kTtyNoEchoke) &&  // !ECHOKE
-        !(__ttyconf.magic & kTtyEchoRaw))    // ECHOCTL
-      WriteCtl(buf, len, true);
-    return;
-  }
-
-  // handle ctrl-r in canonical mode
-  // this reprints the line you're editing
-  if (len == 1 && buf[0] &&                    //
-      (buf[0] & 255) == __ttyconf.vreprint &&  //
-      !(__ttyconf.magic & kTtyUncanon) &&      // ICANON
-      !(__ttyconf.magic & kTtyNoIexten) &&     // IEXTEN
-      !(__ttyconf.magic & kTtySilence)) {      // ECHO
-    struct Dll *e;
-    if (!(__ttyconf.magic & kTtyEchoRaw))
-      WriteCtl(buf, len, true);
-    WriteTty("\r\n", 2);
-    for (e = dll_first(__keystroke.line); e;
-         e = dll_next(__keystroke.line, e)) {
-      struct Keystroke *k = KEYSTROKE_CONTAINER(e);
-      WriteCtl(k->buf, k->buflen, true);
+      !(__ttyconf.magic & kTtyUncanon)) {
+    while (EraseKeystroke()) {
     }
     return;
   }
 
   // allocate object to hold keystroke
   struct Keystroke *k = NewKeystroke();
-  if (!k)
-    return;
   memcpy(k->buf, buf, sizeof(k->buf));
   k->buflen = len;
 
   // echo input if it was successfully recorded
   // assuming the win32 console isn't doing it already
-  EchoConsoleNt(buf, len, false);
+  if (!(__ttyconf.magic & kTtySilence)) {
+    EchoTty(buf, len);
+  }
 
   // save keystroke to appropriate list
   if (__ttyconf.magic & kTtyUncanon) {
@@ -649,37 +541,37 @@ textwindows static void IngestConsoleInputRecord(struct NtInputRecord *r) {
   } else {
     dll_make_last(&__keystroke.line, &k->elem);
 
-    // flush canonical mode line on enter
-    if (len == 1 && buf[0] &&
-        ((buf[0] & 255) == '\n' ||            //
-         (buf[0] & 255) == __ttyconf.veol ||  //
-         ((buf[0] & 255) == __ttyconf.veol2 &&
-          !(__ttyconf.magic & kTtyNoIexten)))) {
+    // flush canonical mode line if oom or enter
+    if (!__keystroke.freekeys || (len == 1 && buf[0] &&
+                                  ((buf[0] & 255) == '\n' ||            //
+                                   (buf[0] & 255) == __ttyconf.veol ||  //
+                                   (buf[0] & 255) == __ttyconf.veol2))) {
       dll_make_last(&__keystroke.list, __keystroke.line);
       __keystroke.line = 0;
     }
   }
 }
 
-textwindows static void IngestConsoleInput(void) {
+static textwindows void IngestConsoleInput(void) {
   uint32_t i, n;
   struct NtInputRecord records[16];
   for (;;) {
+    if (!__keystroke.freekeys)
+      return;
     if (__keystroke.end_of_file)
       return;
-    if (!GetNumberOfConsoleInputEvents(__keystroke.cin, &n))
+    if (!GetNumberOfConsoleInputEvents(__keystroke.cin, &n)) {
       goto UnexpectedEof;
-    if (n > ARRAYLEN(records))
-      n = ARRAYLEN(records);
-    PopulateKeystrokes(n + 1);
-    if (n > __keystroke.free_keys)
-      n = __keystroke.free_keys;
+    }
     if (!n)
       return;
-    if (!ReadConsoleInput(__keystroke.cin, records, n, &n))
+    n = MIN(__keystroke.freekeys, MIN(ARRAYLEN(records), n));
+    if (!ReadConsoleInput(__keystroke.cin, records, n, &n)) {
       goto UnexpectedEof;
-    for (i = 0; i < n && !__keystroke.end_of_file; ++i)
+    }
+    for (i = 0; i < n && !__keystroke.end_of_file; ++i) {
       IngestConsoleInputRecord(records + i);
+    }
   }
 UnexpectedEof:
   STRACE("console read error %d", GetLastError());
@@ -707,10 +599,12 @@ textwindows int CountConsoleInputBytes(void) {
   InitConsole();
   LockKeystrokes();
   IngestConsoleInput();
-  for (e = dll_first(__keystroke.list); e; e = dll_next(__keystroke.list, e))
+  for (e = dll_first(__keystroke.list); e; e = dll_next(__keystroke.list, e)) {
     count += KEYSTROKE_CONTAINER(e)->buflen;
-  if (!count && __keystroke.end_of_file)
+  }
+  if (!count && __keystroke.end_of_file) {
     count = -1;
+  }
   UnlockKeystrokes();
   ALLOW_SIGNALS;
   return count;
@@ -755,14 +649,8 @@ textwindows void InterceptTerminalCommands(const char *data, size_t size) {
           x = 0;
         } else if (data[i] == 'h') {
           if (x == 1) {
-            // \e[?1h decckm on
+            __keystroke.vkt = kDecckm;  // \e[?1h decckm on
             __keystroke.ohno_decckm = true;
-            kVirtualKey[0].normal_str = -S("OA");  // kNtVkUp
-            kVirtualKey[1].normal_str = -S("OB");  // kNtVkDown
-            kVirtualKey[2].normal_str = -S("OC");  // kNtVkRight
-            kVirtualKey[3].normal_str = -S("OD");  // kNtVkLeft
-            kVirtualKey[4].normal_str = -S("OF");  // kNtVkEnd
-            kVirtualKey[5].normal_str = -S("OH");  // kNtVkHome
           } else if ((ismouse |= IsMouseModeCommand(x))) {
             __ttyconf.magic |= kTtyXtMouse;
             cm2 |= kNtEnableMouseInput;
@@ -771,14 +659,8 @@ textwindows void InterceptTerminalCommands(const char *data, size_t size) {
           t = ASC;
         } else if (data[i] == 'l') {
           if (x == 1) {
-            // \e[?1l decckm off
+            __keystroke.vkt = kVirtualKey;  // \e[?1l decckm off
             __keystroke.ohno_decckm = false;
-            kVirtualKey[0].normal_str = S("A");  // kNtVkUp
-            kVirtualKey[1].normal_str = S("B");  // kNtVkDown
-            kVirtualKey[2].normal_str = S("C");  // kNtVkRight
-            kVirtualKey[3].normal_str = S("D");  // kNtVkLeft
-            kVirtualKey[4].normal_str = S("F");  // kNtVkEnd
-            kVirtualKey[5].normal_str = S("H");  // kNtVkHome
           } else if ((ismouse |= IsMouseModeCommand(x))) {
             __ttyconf.magic &= ~kTtyXtMouse;
             cm2 |= kNtEnableQuickEditMode;  // release mouse
@@ -790,11 +672,12 @@ textwindows void InterceptTerminalCommands(const char *data, size_t size) {
         __builtin_unreachable();
     }
   }
-  if (cm2 != cm)
+  if (cm2 != cm) {
     SetConsoleMode(GetConsoleInputHandle(), cm2);
+  }
 }
 
-textwindows static bool DigestConsoleInput(char *data, size_t size, int *rc) {
+static textwindows bool DigestConsoleInput(char *data, size_t size, int *rc) {
 
   // handle eof once available input is consumed
   if (dll_is_empty(__keystroke.list) && __keystroke.end_of_file) {
@@ -821,8 +704,9 @@ textwindows static bool DigestConsoleInput(char *data, size_t size, int *rc) {
     } else {
       FreeKeystroke(&__keystroke.list, e);
     }
-    if ((__ttyconf.magic & kTtyUncanon) && toto >= __ttyconf.vmin)
+    if ((__ttyconf.magic & kTtyUncanon) && toto >= __ttyconf.vmin) {
       break;
+    }
   }
 
   // return result
@@ -834,124 +718,10 @@ textwindows static bool DigestConsoleInput(char *data, size_t size, int *rc) {
   }
 }
 
-textwindows static uint32_t DisableProcessedInput(void) {
-  // the time has come to ensure that ctrl-v ctrl-c works in icanon mode
-  // we're perfectly capable of generating a SIGINT or SIGQUIT ourselves
-  // while the cosmo termios driver is in control; so we disable windows
-  // console input processing for now; we'll turn it back on when we are
-  // done, since it's useful for ensuring asynchronous signal deliveries
-  uint32_t inmode = 0;
-  if (GetConsoleMode(__keystroke.cin, &inmode))
-    if (inmode & kNtEnableProcessedInput)
-      SetConsoleMode(__keystroke.cin, inmode & ~kNtEnableProcessedInput);
-  return inmode;
-}
-
-textwindows static void RestoreProcessedInput(uint32_t inmode) {
-  // re-enable win32 console input processing, if it was enabled when we
-  // started, and no signal handler callbacks changed things in-between.
-  if (inmode & kNtEnableProcessedInput) {
-    uint32_t inmode2;
-    if (GetConsoleMode(__keystroke.cin, &inmode2))
-      if (inmode2 == (inmode & ~kNtEnableProcessedInput))
-        SetConsoleMode(__keystroke.cin, inmode);
-  }
-}
-
-textwindows static int CountConsoleInputBytesBlockingImpl(uint32_t ms,
-                                                          sigset_t waitmask,
-                                                          bool restartable) {
-  InitConsole();
-  struct timespec deadline =
-      timespec_add(sys_clock_gettime_monotonic_nt(), timespec_frommillis(ms));
-  for (;;) {
-    int sig = 0;
-    intptr_t sev;
-    if (!(sev = CreateEvent(0, 0, 0, 0)))
-      return __winerr();
-    struct PosixThread *pt = _pthread_self();
-    pt->pt_event = sev;
-    pt->pt_blkmask = waitmask;
-    atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_EVENT,
-                          memory_order_release);
-    if (_check_cancel() == -1) {
-      atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-      CloseHandle(sev);
-      return -1;
-    }
-    if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
-      atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-      CloseHandle(sev);
-      goto DeliverSignal;
-    }
-    struct timespec now = sys_clock_gettime_monotonic_nt();
-    struct timespec remain = timespec_subz(deadline, now);
-    int64_t millis = timespec_tomillis(remain);
-    uint32_t waitms = MIN(millis, 0xffffffffu);
-    intptr_t hands[] = {__keystroke.cin, sev};
-    uint32_t wi = WaitForMultipleObjects(2, hands, 0, waitms);
-    atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-    CloseHandle(sev);
-    if (wi == -1u)
-      return __winerr();
-
-    // check for wait timeout
-    if (wi == kNtWaitTimeout)
-      return etimedout();
-
-    // handle event on console handle. this means we can now read from the
-    // conosle without blocking. so the first thing we do is slurp up your
-    // keystroke data. some of those keystrokes might cause a signal to be
-    // raised. so we need to check for pending signals again and handle it
-    if (wi == 0) {
-      int got = CountConsoleInputBytes();
-      // we might have read a keystroke that generated a signal
-      if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask)))
-        goto DeliverSignal;
-      if (got == -1)
-        // this is a bona fide eof and console errors are logged to strace
-        return 0;
-      if (got == 0)
-        // this can happen for multiple reasons. first our driver controls
-        // user interactions in canonical mode. secondly we could lose the
-        // race with another thread that's reading input.
-        continue;
-      return got;
-    }
-
-    if (wi == 1 && _weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
-      // handle event on throwaway semaphore, it is poked by signal delivery
-    DeliverSignal:;
-      int handler_was_called = 0;
-      do {
-        handler_was_called |= _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
-      } while ((sig = _weaken(__sig_get)(waitmask)));
-      if (_check_cancel() == -1)
-        return -1;
-      if (handler_was_called & SIG_HANDLED_NO_RESTART)
-        return eintr();
-      if (handler_was_called & SIG_HANDLED_SA_RESTART)
-        if (!restartable)
-          return eintr();
-    }
-  }
-}
-
-textwindows static int CountConsoleInputBytesBlocking(uint32_t ms,
-                                                      sigset_t waitmask) {
-  int got = CountConsoleInputBytes();
-  if (got == -1)
-    return 0;
-  if (got > 0)
-    return got;
-  uint32_t inmode = DisableProcessedInput();
-  int rc = CountConsoleInputBytesBlockingImpl(ms, waitmask, true);
-  RestoreProcessedInput(inmode);
-  return rc;
-}
-
-textwindows static int WaitToReadFromConsole(struct Fd *f, sigset_t waitmask) {
-  uint32_t ms = -1;
+static textwindows int WaitForConsole(struct Fd *f, sigset_t waitmask) {
+  int sig;
+  int64_t sem;
+  uint32_t wi, ms = -1;
   if (!__ttyconf.vmin) {
     if (!__ttyconf.vtime) {
       return 0;  // non-blocking w/o raising eagain
@@ -959,32 +729,51 @@ textwindows static int WaitToReadFromConsole(struct Fd *f, sigset_t waitmask) {
       ms = __ttyconf.vtime * 100;
     }
   }
+  if (_check_cancel() == -1)
+    return -1;
   if (f->flags & _O_NONBLOCK)
     return eagain();
-  int olderr = errno;
-  int rc = CountConsoleInputBytesBlockingImpl(ms, waitmask, true);
-  if (rc == -1 && errno == ETIMEDOUT) {
-    // read() never raises ETIMEDOUT so if vtime elapses we raise an EOF
-    errno = olderr;
-    rc = 0;
+  if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
+    goto DeliverSignal;
   }
-  return rc;
+  struct PosixThread *pt = _pthread_self();
+  pt->pt_blkmask = waitmask;
+  pt->pt_semaphore = sem = CreateSemaphore(0, 0, 1, 0);
+  atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_SEM, memory_order_release);
+  wi = WaitForMultipleObjects(2, (int64_t[2]){__keystroke.cin, sem}, 0, ms);
+  atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
+  CloseHandle(sem);
+  if (wi == kNtWaitTimeout)
+    return 0;  // vtime elapsed
+  if (wi == 0)
+    return -2;  // console data
+  if (wi != 1)
+    return __winerr();  // wait failed
+  if (_weaken(__sig_get)) {
+    if (!(sig = _weaken(__sig_get)(waitmask)))
+      return eintr();
+  DeliverSignal:
+    int handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
+    if (_check_cancel() == -1)
+      return -1;
+    if (!(handler_was_called & SIG_HANDLED_NO_RESTART))
+      return -2;
+  }
+  return eintr();
 }
 
-textwindows static ssize_t ReadFromConsole(struct Fd *f, void *data,
+static textwindows ssize_t ReadFromConsole(struct Fd *f, void *data,
                                            size_t size, sigset_t waitmask) {
   int rc;
   InitConsole();
-  uint32_t inmode = DisableProcessedInput();
   do {
     LockKeystrokes();
     IngestConsoleInput();
     bool done = DigestConsoleInput(data, size, &rc);
     UnlockKeystrokes();
     if (done)
-      break;
-  } while ((rc = WaitToReadFromConsole(f, waitmask)) > 0);
-  RestoreProcessedInput(inmode);
+      return rc;
+  } while ((rc = WaitForConsole(f, waitmask)) == -2);
   return rc;
 }
 
@@ -994,16 +783,17 @@ textwindows ssize_t ReadBuffer(int fd, void *data, size_t size, int64_t offset,
   // switch to terminal polyfill if reading from win32 console
   struct Fd *f = g_fds.p + fd;
 
-  if (f->kind == kFdDevNull)
+  if (f->kind == kFdDevNull) {
     return 0;
-
-  if (f->kind == kFdDevRandom) {
-    ProcessPrng(data, size);
-    return size;
   }
 
-  if (f->kind == kFdConsole)
+  if (f->kind == kFdDevRandom) {
+    return ProcessPrng(data, size) ? size : __winerr();
+  }
+
+  if (f->kind == kFdConsole) {
     return ReadFromConsole(f, data, size, waitmask);
+  }
 
   // perform heavy lifting
   ssize_t rc;
@@ -1024,7 +814,7 @@ textwindows ssize_t ReadBuffer(int fd, void *data, size_t size, int64_t offset,
   }
 }
 
-textwindows static ssize_t ReadIovecs(int fd, const struct iovec *iov,
+static textwindows ssize_t ReadIovecs(int fd, const struct iovec *iov,
                                       size_t iovlen, int64_t opt_offset,
                                       sigset_t waitmask) {
   ssize_t rc;
diff --git a/libc/calls/readlinkat-nt.c b/libc/calls/readlinkat-nt.c
index d70cf5583..bd2423272 100644
--- a/libc/calls/readlinkat-nt.c
+++ b/libc/calls/readlinkat-nt.c
@@ -52,7 +52,6 @@ static textwindows ssize_t sys_readlinkat_nt_impl(int dirfd, const char *path,
   ssize_t rc;
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
   uint32_t mem = 6000;
   volatile char *memory = alloca(mem);
   CheckLargeStackAllocation((char *)memory, mem);
@@ -120,17 +119,7 @@ static textwindows ssize_t sys_readlinkat_nt_impl(int dirfd, const char *path,
         }
         rc = j;
       } else {
-        // e.g. 0xA000001D means IO_REPARSE_TAG_LX_SYMLINK
-        //
-        //     "WSL symlinks can't be opened from Windows, only from
-        //      within WSL, so if we identify them as fs.ModeSymlink,
-        //      then functions like filepath.Walk would fail when trying
-        //      to follow the link."
-        //
-        //            —Quoth Quim Muntal (dev on Go team at Microsoft)
-        //
-        // See also MSDN Learn § 2.1.2.1 Reparse Tags
-        NTTRACE("reparse tag %#x != kNtIoReparseTagSymlink", rdb->ReparseTag);
+        NTTRACE("sys_readlinkat_nt() should have kNtIoReparseTagSymlink");
         rc = einval();
       }
     } else {
diff --git a/libc/calls/readlinkat.c b/libc/calls/readlinkat.c
index a9b539cfb..dc5041c21 100644
--- a/libc/calls/readlinkat.c
+++ b/libc/calls/readlinkat.c
@@ -57,7 +57,6 @@ ssize_t readlinkat(int dirfd, const char *path, char *buf, size_t bufsiz) {
   } else if (_weaken(__zipos_notat) &&
              (bytes = __zipos_notat(dirfd, path)) == -1) {
     STRACE("TODO: zipos support for readlinkat");
-    bytes = einval();
   } else if (!IsWindows()) {
     bytes = sys_readlinkat(dirfd, path, buf, bufsiz);
   } else {
diff --git a/libc/calls/readv-metal.c b/libc/calls/readv-metal.c
index 926d6fbc8..5de8ace91 100644
--- a/libc/calls/readv-metal.c
+++ b/libc/calls/readv-metal.c
@@ -22,7 +22,7 @@
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/iovec.internal.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/vga/vga.internal.h"
diff --git a/libc/calls/readv.c b/libc/calls/readv.c
index 26d05690f..b48ca2ff6 100644
--- a/libc/calls/readv.c
+++ b/libc/calls/readv.c
@@ -57,7 +57,6 @@ static ssize_t readv_impl(int fd, const struct iovec *iov, int iovlen) {
       struct iovec *iov2;
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
       iov2 = alloca(iovlen * sizeof(struct iovec));
       CheckLargeStackAllocation(iov2, iovlen * sizeof(struct iovec));
 #pragma GCC pop_options
diff --git a/libc/calls/readwrite-nt.c b/libc/calls/readwrite-nt.c
index 1c983ca80..6fbfc1075 100644
--- a/libc/calls/readwrite-nt.c
+++ b/libc/calls/readwrite-nt.c
@@ -19,21 +19,18 @@
 #include "libc/calls/createfileflags.internal.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/errno.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/weaken.h"
 #include "libc/nt/enum/filetype.h"
 #include "libc/nt/errors.h"
 #include "libc/nt/events.h"
 #include "libc/nt/files.h"
-#include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/overlapped.h"
 #include "libc/nt/synchronization.h"
 #include "libc/nt/thread.h"
-#include "libc/sock/internal.h"
 #include "libc/stdio/sysparam.h"
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/errfuns.h"
@@ -50,161 +47,108 @@ sys_readwrite_nt(int fd, void *data, size_t size, ssize_t offset,
                  int64_t handle, sigset_t waitmask,
                  bool32 ReadOrWriteFile(int64_t, void *, uint32_t, uint32_t *,
                                         struct NtOverlapped *)) {
+  int sig;
+  uint32_t exchanged;
   struct Fd *f = g_fds.p + fd;
 
+  // win32 i/o apis generally take 32-bit values thus we implicitly
+  // truncate outrageously large sizes. linux actually does it too!
+  size = MIN(size, 0x7ffff000);
+
   // pread() and pwrite() perform an implicit lseek() operation, so
   // similar to the lseek() system call, they too raise ESPIPE when
   // operating on a non-seekable file.
   bool pwriting = offset != -1;
-  bool isdisk = f->kind == kFdFile && GetFileType(handle) == kNtFileTypeDisk;
-  bool seekable = isdisk || f->kind == kFdDevNull || f->kind == kFdDevRandom;
-  if (pwriting && !seekable)
+  bool seekable =
+      (f->kind == kFdFile && GetFileType(handle) == kNtFileTypeDisk) ||
+      f->kind == kFdDevNull || f->kind == kFdDevRandom;
+  if (pwriting && !seekable) {
     return espipe();
-
-  // determine if we need to lock file descriptor
-  bool locked = isdisk && !pwriting && f->cursor;
-
-  for (;;) {
-    int got_sig = 0;
-    bool got_eagain = false;
-    uint32_t other_error = 0;
-
-    // create event handle for overlapped i/o
-    intptr_t event;
-    if (!(event = CreateEvent(0, 1, 0, 0)))
-      return __winerr();
-
-    // ensure iops are ordered across threads and processes if seeking
-    if (locked)
-      __cursor_lock(f->cursor);
-
-    // when a file is opened in overlapped mode win32 requires that we
-    // take over full responsibility for managing our own file pointer
-    // which is fine, because the one win32 has was never very good in
-    // the sense that it behaves so differently from linux, that using
-    // win32 i/o required more compatibilty toil than doing it by hand
-    if (!pwriting) {
-      if (seekable && f->cursor) {
-        offset = f->cursor->shared->pointer;
-      } else {
-        offset = 0;
-      }
-    }
-
-    // initiate asynchronous i/o operation with win32
-    struct NtOverlapped overlap = {.hEvent = event, .Pointer = offset};
-    bool32 ok = ReadOrWriteFile(handle, data, size, 0, &overlap);
-    if (!ok && GetLastError() == kNtErrorIoPending) {
-      if (f->flags & _O_NONBLOCK) {
-        // immediately back out of blocking i/o if non-blocking
-        CancelIoEx(handle, &overlap);
-        got_eagain = true;
-      } else {
-        // atomic block on i/o completion, signal, or cancel
-        // it's not safe to acknowledge cancelation from here
-        // it's not safe to call any signal handlers from here
-        intptr_t sigev;
-        if ((sigev = CreateEvent(0, 0, 0, 0))) {
-          // installing semaphore before sig get makes wait atomic
-          struct PosixThread *pt = _pthread_self();
-          pt->pt_event = sigev;
-          pt->pt_blkmask = waitmask;
-          atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_EVENT,
-                                memory_order_release);
-          if (_is_canceled()) {
-            CancelIoEx(handle, &overlap);
-          } else if (_weaken(__sig_get) &&
-                     (got_sig = _weaken(__sig_get)(waitmask))) {
-            CancelIoEx(handle, &overlap);
-          } else {
-            intptr_t hands[] = {event, sigev};
-            uint32_t wi = WaitForMultipleObjects(2, hands, 0, -1u);
-            if (wi == 1) {  // event was signaled by signal enqueue
-              CancelIoEx(handle, &overlap);
-              if (_weaken(__sig_get))
-                got_sig = _weaken(__sig_get)(waitmask);
-            } else if (wi == -1u) {
-              other_error = GetLastError();
-              CancelIoEx(handle, &overlap);
-            }
-          }
-          atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-          CloseHandle(sigev);
-        } else {
-          other_error = GetLastError();
-          CancelIoEx(handle, &overlap);
-        }
-      }
-      ok = true;
-    }
-    uint32_t exchanged = 0;
-    if (ok)
-      ok = GetOverlappedResult(handle, &overlap, &exchanged, true);
-    uint32_t io_error = GetLastError();
-    CloseHandle(event);
-
-    // check if i/o completed
-    // this could forseeably happen even if CancelIoEx was called
-    if (ok) {
-      if (!pwriting && seekable && f->cursor)
-        f->cursor->shared->pointer = offset + exchanged;
-      if (locked)
-        __cursor_unlock(f->cursor);
-      if (got_sig)  // swallow dequeued signal
-        _weaken(__sig_relay)(got_sig, SI_KERNEL, waitmask);
-      return exchanged;
-    }
-
-    // it's now safe to unlock cursor
-    if (locked)
-      __cursor_unlock(f->cursor);
-
-    // check if i/o failed
-    if (io_error != kNtErrorOperationAborted) {
-      if (got_sig)  // swallow dequeued signal
-        _weaken(__sig_relay)(got_sig, SI_KERNEL, waitmask);
-      // read() and write() have different error paths
-      SetLastError(io_error);
-      return -2;
-    }
-
-    // the i/o operation was successfully canceled
-    if (got_eagain)
-      return eagain();
-
-    // it's now reasonable to report semaphore creation error
-    if (other_error) {
-      errno = __dos2errno(other_error);
-      return -1;
-    }
-
-    // check for thread cancelation and acknowledge
-    if (_check_cancel() == -1)
-      return -1;
-
-    // if signal module has been linked, then
-    if (_weaken(__sig_get)) {
-
-      // gobble up all unmasked pending signals
-      // it's now safe to recurse into signal handlers
-      int handler_was_called = 0;
-      do {
-        if (got_sig)
-          handler_was_called |=
-              _weaken(__sig_relay)(got_sig, SI_KERNEL, waitmask);
-      } while ((got_sig = _weaken(__sig_get)(waitmask)));
-
-      // check if SIGTHR handler was called
-      if (_check_cancel() == -1)
-        return -1;
-
-      // check if signal handler without SA_RESTART was called
-      if (handler_was_called & SIG_HANDLED_NO_RESTART)
-        return eintr();
-    }
-
-    // otherwise try the i/o operation again
   }
+
+  // when a file is opened in overlapped mode win32 requires that we
+  // take over full responsibility for managing our own file pointer
+  // which is fine, because the one win32 has was never very good in
+  // the sense that it behaves so differently from linux, that using
+  // win32 i/o required more compatibilty toil than doing it by hand
+  if (!pwriting) {
+    if (seekable) {
+      offset = f->pointer;
+    } else {
+      offset = 0;
+    }
+  }
+
+RestartOperation:
+  bool eagained = false;
+  // check for signals and cancelation
+  if (_check_cancel() == -1)
+    return -1;  // ECANCELED
+  if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
+    goto HandleInterrupt;
+  }
+
+  // signals have already been fully blocked by caller
+  // perform i/o operation with atomic signal/cancel checking
+  struct NtOverlapped overlap = {.hEvent = CreateEvent(0, 1, 0, 0),
+                                 .Pointer = offset};
+  bool32 ok = ReadOrWriteFile(handle, data, size, 0, &overlap);
+  if (!ok && GetLastError() == kNtErrorIoPending) {
+    // win32 says this i/o operation needs to block
+    if (f->flags & _O_NONBLOCK) {
+      // abort the i/o operation if file descriptor is in non-blocking mode
+      CancelIoEx(handle, &overlap);
+      eagained = true;
+    } else {
+      // wait until i/o either completes or is canceled by another thread
+      // we avoid a race condition by having a second mask for unblocking
+      struct PosixThread *pt;
+      pt = _pthread_self();
+      pt->pt_blkmask = waitmask;
+      pt->pt_iohandle = handle;
+      pt->pt_ioverlap = &overlap;
+      atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_IO,
+                            memory_order_release);
+      WaitForSingleObject(overlap.hEvent, -1u);
+      atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
+    }
+    ok = true;
+  }
+  if (ok) {
+    ok = GetOverlappedResult(handle, &overlap, &exchanged, true);
+  }
+  CloseHandle(overlap.hEvent);
+
+  // if i/o succeeded then return its result
+  if (ok) {
+    if (!pwriting && seekable) {
+      f->pointer = offset + exchanged;
+    }
+    return exchanged;
+  }
+
+  // only raise EINTR or EAGAIN if I/O got canceled
+  if (GetLastError() == kNtErrorOperationAborted) {
+    // raise EAGAIN if it's due to O_NONBLOCK mmode
+    if (eagained) {
+      return eagain();
+    }
+    // otherwise it must be due to a kill() via __sig_cancel()
+    if (_weaken(__sig_relay) && (sig = _weaken(__sig_get)(waitmask))) {
+    HandleInterrupt:
+      int handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
+      if (_check_cancel() == -1)
+        return -1;  // possible if we SIGTHR'd
+      // read() is @restartable unless non-SA_RESTART hands were called
+      if (!(handler_was_called & SIG_HANDLED_NO_RESTART)) {
+        goto RestartOperation;
+      }
+    }
+    return eintr();
+  }
+
+  // read() and write() have generally different error-handling paths
+  return -2;
 }
 
 #endif /* __x86_64__ */
diff --git a/libc/calls/releasefd.c b/libc/calls/releasefd.c
index ccf2664bd..f6947d22d 100644
--- a/libc/calls/releasefd.c
+++ b/libc/calls/releasefd.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/intrin/atomic.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 
 // really want to avoid locking here so close() needn't block signals
diff --git a/libc/calls/renameat.c b/libc/calls/renameat.c
index 821cdba4b..80f2a230c 100644
--- a/libc/calls/renameat.c
+++ b/libc/calls/renameat.c
@@ -41,7 +41,6 @@
  * @param newdirfd is normally AT_FDCWD but if it's an open directory
  *     and newpath is relative, then newpath become relative to dirfd
  * @return 0 on success, or -1 w/ errno
- * @raise EROFS if either path is under /zip/...
  */
 int renameat(int olddirfd, const char *oldpath, int newdirfd,
              const char *newpath) {
@@ -49,7 +48,7 @@ int renameat(int olddirfd, const char *oldpath, int newdirfd,
   if (_weaken(__zipos_notat) &&
       ((rc = __zipos_notat(olddirfd, oldpath)) == -1 ||
        (rc = __zipos_notat(newdirfd, newpath)) == -1)) {
-    rc = erofs();
+    STRACE("zipos renameat not supported yet");
   } else if (!IsWindows()) {
     rc = sys_renameat(olddirfd, oldpath, newdirfd, newpath);
   } else {
diff --git a/libc/intrin/restore.S b/libc/calls/restore.S
similarity index 98%
rename from libc/intrin/restore.S
rename to libc/calls/restore.S
index 596b01f7c..6ce347160 100644
--- a/libc/intrin/restore.S
+++ b/libc/calls/restore.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.windows
 
 //	Restores thread to state before signal.
diff --git a/libc/calls/rusage_add.c b/libc/calls/rusage_add.c
index 254b04169..38a831aac 100644
--- a/libc/calls/rusage_add.c
+++ b/libc/calls/rusage_add.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/rusage.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /**
  * Accumulates resource statistics in `y` to `x`.
diff --git a/libc/calls/sched_getcpu.c b/libc/calls/sched_getcpu.c
index e671e80ca..12a0a832b 100644
--- a/libc/calls/sched_getcpu.c
+++ b/libc/calls/sched_getcpu.c
@@ -23,82 +23,32 @@
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/nt/struct/processornumber.h"
 #include "libc/nt/synchronization.h"
-#include "libc/runtime/syslib.internal.h"
 #include "libc/sysv/errfuns.h"
 
 int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
 
 /**
  * Returns ID of CPU on which thread is currently scheduled.
- *
- * This function is supported on the following platforms:
- *
- * - x86-64
- *
- *   - Linux: rdtsc
- *   - FreeBSD: rdtsc
- *   - Windows: win32
- *   - OpenBSD: unsupported
- *   - NetBSD: unsupported
- *   - MacOS: unsupported
- *
- * - aarch64
- *
- *   - Linux: syscall
- *   - FreeBSD: syscall
- *   - MacOS: supported
- *
  * @return cpu number on success, or -1 w/ errno
  */
 int sched_getcpu(void) {
-
-  if (IsWindows()) {
-    struct NtProcessorNumber pn;
-    GetCurrentProcessorNumberEx(&pn);
-    return 64 * pn.Group + pn.Number;
-  }
-
-#ifdef __x86_64__
-  if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
-    // Only the Linux, FreeBSD, and Windows kernels can be counted upon
-    // to populate the TSC_AUX register with the current thread number.
+  if (X86_HAVE(RDTSCP)) {
     unsigned tsc_aux;
     rdtscp(&tsc_aux);
     return TSC_AUX_CORE(tsc_aux);
+  } else if (IsAarch64()) {
+    long tpidr_el0;
+    asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
+    return tpidr_el0 & 255;
+  } else if (IsWindows()) {
+    struct NtProcessorNumber pn;
+    GetCurrentProcessorNumberEx(&pn);
+    return 64 * pn.Group + pn.Number;
+  } else {
+    unsigned cpu = 0;
+    int rc = sys_getcpu(&cpu, 0, 0);
+    if (rc == -1)
+      return -1;
+    return cpu;
   }
-#endif
-
-#ifdef __aarch64__
-  if (IsXnu()) {
-    // pthread_cpu_number_np() is defined by MacOS 11.0+ (Big Sur) in
-    // the SDK pthread.h header file, even though there's no man page
-    if (__syslib && __syslib->__version >= 9) {
-      errno_t err;
-      size_t out = 0;
-      if ((err = __syslib->__pthread_cpu_number_np(&out))) {
-        errno = err;
-        return -1;
-      }
-      return out;
-    } else {
-      errno = ENOSYS;  // upgrade your ape loader
-      return -1;       // cc -o /usr/local/bin/ape ape/ape-m1.c
-    }
-  }
-#endif
-
-#ifdef __aarch64__
-  if (IsFreebsd()) {
-    register int x0 asm("x0");
-    register int x8 asm("x8") = 581;  // sched_getcpu
-    asm volatile("svc\t0" : "=r"(x0) : "r"(x8) : "memory");
-    return x0;
-  }
-#endif
-
-  unsigned cpu = 0;
-  int rc = sys_getcpu(&cpu, 0, 0);
-  if (rc == -1)
-    return -1;
-  return cpu;
 }
diff --git a/libc/calls/sched_getscheduler.c b/libc/calls/sched_getscheduler.c
index fafe9fdd3..d7a15554f 100644
--- a/libc/calls/sched_getscheduler.c
+++ b/libc/calls/sched_getscheduler.c
@@ -31,6 +31,7 @@
  *     special; the kernel treats this as a thread id (noting that
  *     `getpid() == gettid()` is always the case on Linux for the main
  *     thread) and will only take effect for the specified tid.
+ *     Therefore this function is POSIX-compliant iif `!__threaded`.
  * @return scheduler policy, or -1 w/ errno
  * @error ESRCH if `pid` not found
  * @error EPERM if not permitted
diff --git a/libc/calls/sched_setscheduler.c b/libc/calls/sched_setscheduler.c
index 5f4839846..576ecadfa 100644
--- a/libc/calls/sched_setscheduler.c
+++ b/libc/calls/sched_setscheduler.c
@@ -41,6 +41,7 @@
  *     special; the kernel treats this as a thread id (noting that
  *     `getpid() == gettid()` is always the case on Linux for the main
  *     thread) and will only take effect for the specified tid.
+ *     Therefore this function is POSIX-compliant iif `!__threaded`.
  *
  * @param policy specifies the kernel's timesharing strategy.
  *
diff --git a/libc/calls/seccomp.c b/libc/calls/seccomp.c
index 1d004fe2d..3048745d0 100644
--- a/libc/calls/seccomp.c
+++ b/libc/calls/seccomp.c
@@ -82,7 +82,7 @@ int seccomp(unsigned operation, unsigned flags, void *args) {
   } else {
     rc = enosys();
   }
-  STRACE("seccomp(%s, %#x, %p) → %d% m", _DescribeSeccompOperation(operation),
+  STRACE("seccomp(%s, %#x, %p) → %d% m", DescribeSeccompOperation(operation),
          flags, args, rc);
   return rc;
 }
diff --git a/libc/calls/select-nt.c b/libc/calls/select-nt.c
index 0d9e2f2e0..8245f9f32 100644
--- a/libc/calls/select-nt.c
+++ b/libc/calls/select-nt.c
@@ -16,60 +16,65 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
 #include "libc/calls/struct/timeval.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/select.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/pollfd.h"
 #include "libc/sock/struct/pollfd.internal.h"
+#include "libc/stdckdint.h"
 #include "libc/sysv/consts/poll.h"
 #include "libc/sysv/errfuns.h"
 #ifdef __x86_64__
 
-// <sync libc/sysv/consts.sh>
-#define POLLERR_    0x0001  // implied in events
-#define POLLHUP_    0x0002  // implied in events
-#define POLLNVAL_   0x0004  // implied in events
-#define POLLIN_     0x0300
-#define POLLRDNORM_ 0x0100
-#define POLLRDBAND_ 0x0200
-#define POLLOUT_    0x0010
-#define POLLWRNORM_ 0x0010
-#define POLLWRBAND_ 0x0020  // MSDN undocumented
-#define POLLPRI_    0x0400  // MSDN unsupported
-// </sync libc/sysv/consts.sh>
-
 int sys_select_nt(int nfds, fd_set *readfds, fd_set *writefds,
-                  fd_set *exceptfds, const struct timespec *timeout,
+                  fd_set *exceptfds, struct timeval *timeout,
                   const sigset_t *sigmask) {
-  int pfds = 0;
+  int i, pfds, events, fdcount;
 
   // convert bitsets to pollfd
-  struct pollfd fds[128];
-  for (int fd = 0; fd < nfds; ++fd) {
-    int events = 0;
-    if (readfds && FD_ISSET(fd, readfds))
-      events |= POLLIN_;
-    if (writefds && FD_ISSET(fd, writefds))
-      events |= POLLOUT_;
-    if (exceptfds && FD_ISSET(fd, exceptfds))
-      events |= POLLPRI_;
+  struct pollfd fds[64];
+  for (pfds = i = 0; i < nfds; ++i) {
+    events = 0;
+    if (readfds && FD_ISSET(i, readfds))
+      events |= POLLIN;
+    if (writefds && FD_ISSET(i, writefds))
+      events |= POLLOUT;
+    if (exceptfds && FD_ISSET(i, exceptfds))
+      events |= POLLERR;
     if (events) {
-      if (pfds == ARRAYLEN(fds))
-        return e2big();
-      fds[pfds].fd = fd;
-      fds[pfds].events = events;
-      fds[pfds].revents = 0;
-      ++pfds;
+      if (pfds < ARRAYLEN(fds)) {
+        fds[pfds].fd = i;
+        fds[pfds].events = events;
+        fds[pfds].revents = 0;
+        pfds += 1;
+      } else {
+        return enomem();
+      }
+    }
+  }
+
+  // convert the wait time to a word
+  uint32_t millis;
+  if (!timeout) {
+    millis = -1;
+  } else {
+    int64_t ms = timeval_tomillis(*timeout);
+    if (ms < 0 || ms > UINT32_MAX) {
+      millis = -1u;
+    } else {
+      millis = ms;
     }
   }
 
   // call our nt poll implementation
-  int fdcount = sys_poll_nt(fds, pfds, timeout, sigmask);
-  if (fdcount == -1)
+  fdcount = sys_poll_nt(fds, pfds, &millis, sigmask);
+  unassert(fdcount < 64);
+  if (fdcount < 0)
     return -1;
 
   // convert pollfd back to bitsets
@@ -80,20 +85,20 @@ int sys_select_nt(int nfds, fd_set *readfds, fd_set *writefds,
   if (exceptfds)
     FD_ZERO(exceptfds);
   int bits = 0;
-  for (int i = 0; i < pfds; ++i) {
-    if (fds[i].revents & (POLLIN_ | POLLHUP_ | POLLERR_ | POLLNVAL_)) {
+  for (i = 0; i < pfds; ++i) {
+    if (fds[i].revents & POLLIN) {
       if (readfds) {
         FD_SET(fds[i].fd, readfds);
         ++bits;
       }
     }
-    if (fds[i].revents & POLLOUT_) {
+    if (fds[i].revents & POLLOUT) {
       if (writefds) {
         FD_SET(fds[i].fd, writefds);
         ++bits;
       }
     }
-    if (fds[i].revents & POLLPRI_) {
+    if (fds[i].revents & (POLLERR | POLLNVAL)) {
       if (exceptfds) {
         FD_SET(fds[i].fd, exceptfds);
         ++bits;
@@ -101,6 +106,11 @@ int sys_select_nt(int nfds, fd_set *readfds, fd_set *writefds,
     }
   }
 
+  // store remaining time back in caller's timeval
+  if (timeout) {
+    *timeout = timeval_frommillis(millis);
+  }
+
   return bits;
 }
 
diff --git a/libc/calls/select.c b/libc/calls/select.c
index 93704b269..9c234f601 100644
--- a/libc/calls/select.c
+++ b/libc/calls/select.c
@@ -17,22 +17,26 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/sock/select.h"
+#include "libc/calls/cp.internal.h"
+#include "libc/calls/struct/itimerval.internal.h"
+#include "libc/calls/struct/timespec.h"
 #include "libc/calls/struct/timeval.h"
+#include "libc/calls/struct/timeval.internal.h"
+#include "libc/dce.h"
+#include "libc/intrin/describeflags.h"
+#include "libc/intrin/strace.h"
+#include "libc/sock/internal.h"
+#include "libc/sock/select.h"
+#include "libc/sock/select.internal.h"
+#include "libc/sysv/errfuns.h"
 
 /**
- * Checks status on multiple file descriptors at once.
+ * Does what poll() does except with bitset API.
+ *
+ * This system call is supported on all platforms. However, on Windows,
+ * this is polyfilled to translate into poll(). So it's recommended that
+ * poll() be used instead.
  *
- * @param readfds may be used to be notified when you can call read() on
- *     a file descriptor without it blocking; this includes when data is
- *     is available to be read as well as eof and error conditions
- * @param writefds may be used to be notified when write() may be called
- *     on a file descriptor without it blocking
- * @param exceptfds may be used to be notified of exceptional conditions
- *     such as out-of-band data on a socket; it is equivalent to POLLPRI
- *     in the revents of poll()
- * @param timeout may be null which means to block indefinitely; cosmo's
- *     implementation of select() never modifies this parameter
- * @raise E2BIG if we exceeded the 64 socket limit on Windows
  * @raise ECANCELED if thread was cancelled in masked mode
  * @raise EINTR if signal was delivered
  * @cancelationpoint
@@ -41,13 +45,70 @@
  */
 int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
            struct timeval *timeout) {
-  struct timespec ts;
-  struct timespec *tsp;
-  if (timeout) {
-    ts = timeval_totimespec(*timeout);
-    tsp = &ts;
+
+  int rc;
+  fd_set old_readfds;
+  fd_set *old_readfds_ptr = 0;
+  fd_set old_writefds;
+  fd_set *old_writefds_ptr = 0;
+  fd_set old_exceptfds;
+  fd_set *old_exceptfds_ptr = 0;
+  struct timeval old_timeout;
+  struct timeval *old_timeout_ptr = 0;
+
+  POLLTRACE("select(%d, %p, %p, %p, %s) → ...", nfds, readfds, writefds,
+            exceptfds, DescribeTimeval(0, timeout));
+
+  BEGIN_CANCELATION_POINT;
+  if (nfds < 0) {
+    rc = einval();
   } else {
-    tsp = 0;
+    if (readfds) {
+      old_readfds = *readfds;
+      old_readfds_ptr = &old_readfds;
+    }
+    if (writefds) {
+      old_writefds = *writefds;
+      old_writefds_ptr = &old_writefds;
+    }
+    if (exceptfds) {
+      old_exceptfds = *exceptfds;
+      old_exceptfds_ptr = &old_exceptfds;
+    }
+    if (timeout) {
+      old_timeout = *timeout;
+      old_timeout_ptr = &old_timeout;
+    }
+    if (!IsWindows()) {
+#ifdef __aarch64__
+      struct timespec ts, *tsp;
+      if (timeout) {
+        ts = timeval_totimespec(*timeout);
+        tsp = &ts;
+      } else {
+        tsp = 0;
+      }
+      rc = sys_pselect(nfds, readfds, writefds, exceptfds, tsp, 0);
+      if (timeout) {
+        *timeout = timespec_totimeval(ts);
+      }
+#else
+      rc = sys_select(nfds, readfds, writefds, exceptfds, timeout);
+#endif
+    } else {
+      rc = sys_select_nt(nfds, readfds, writefds, exceptfds, timeout, 0);
+    }
   }
-  return pselect(nfds, readfds, writefds, exceptfds, tsp, 0);
+  END_CANCELATION_POINT;
+
+  STRACE("select(%d, %s → [%s], %s → [%s], %s → [%s], %s → [%s]) → %d% m", nfds,
+         DescribeFdSet(rc, nfds, old_readfds_ptr),
+         DescribeFdSet(rc, nfds, readfds),
+         DescribeFdSet(rc, nfds, old_writefds_ptr),
+         DescribeFdSet(rc, nfds, writefds),
+         DescribeFdSet(rc, nfds, old_exceptfds_ptr),
+         DescribeFdSet(rc, nfds, exceptfds),    //
+         DescribeTimeval(rc, old_timeout_ptr),  //
+         DescribeTimeval(rc, timeout), rc);
+  return rc;
 }
diff --git a/libc/calls/setgroups.c b/libc/calls/setgroups.c
index 534f5962a..d030239bb 100644
--- a/libc/calls/setgroups.c
+++ b/libc/calls/setgroups.c
@@ -21,6 +21,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
+#include "libc/stdckdint.h"
 #include "libc/sysv/errfuns.h"
 
 /**
diff --git a/libc/calls/setrlimit.c b/libc/calls/setrlimit.c
index 0a2b12ffa..7cfaeccc6 100644
--- a/libc/calls/setrlimit.c
+++ b/libc/calls/setrlimit.c
@@ -23,9 +23,8 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/intrin/rlimit.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/syslib.internal.h"
 #include "libc/sysv/consts/rlimit.h"
@@ -89,12 +88,10 @@ int setrlimit(int resource, const struct rlimit *rlim) {
   } else if (!IsWindows() && !(IsNetbsd() && resource == RLIMIT_AS)) {
     rc = sys_setrlimit(resource, rlim);
   } else if (resource == RLIMIT_STACK) {
-    rc = 0;
+    rc = enotsup();
   } else {
     rc = einval();
   }
-  if (!rc && resource == RLIMIT_STACK)
-    __rlimit_stack_set(*rlim);  // so __rlimit_stack_get() works on all OSes
   if (resource == RLIMIT_AS) {
     __virtualmax = rlim->rlim_cur;
     errno = olde;
diff --git a/libc/calls/shm_path_np.c b/libc/calls/shm_path_np.c
index dc5813b8a..42df957c4 100644
--- a/libc/calls/shm_path_np.c
+++ b/libc/calls/shm_path_np.c
@@ -35,8 +35,9 @@ void shm_path_np(const char *name, char buf[hasatleast 78]) {
   const char *a;
   uint8_t digest[BLAKE2B256_DIGEST_LENGTH];
   a = "/tmp/", n = 5;
-  if (IsLinux() && isdirectory("/dev/shm"))
+  if (IsLinux() && isdirectory("/dev/shm")) {
     a = "/dev/shm/", n = 9;
+  }
   BLAKE2B256(name, strlen(name), digest);
   p = mempcpy(buf, a, n);
   p = hexpcpy(p, digest, BLAKE2B256_DIGEST_LENGTH);
diff --git a/libc/calls/sig.c b/libc/calls/sig.c
new file mode 100644
index 000000000..247c56746
--- /dev/null
+++ b/libc/calls/sig.c
@@ -0,0 +1,617 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/sysv/consts/sig.h"
+#include "ape/sections.internal.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/sig.internal.h"
+#include "libc/calls/state.internal.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/calls/struct/siginfo.h"
+#include "libc/calls/struct/sigset.internal.h"
+#include "libc/calls/struct/ucontext.internal.h"
+#include "libc/calls/ucontext.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/intrin/bsf.h"
+#include "libc/intrin/describebacktrace.h"
+#include "libc/intrin/dll.h"
+#include "libc/intrin/kprintf.h"
+#include "libc/intrin/strace.h"
+#include "libc/intrin/weaken.h"
+#include "libc/nt/console.h"
+#include "libc/nt/enum/context.h"
+#include "libc/nt/enum/exceptionhandleractions.h"
+#include "libc/nt/enum/signal.h"
+#include "libc/nt/enum/status.h"
+#include "libc/nt/runtime.h"
+#include "libc/nt/signals.h"
+#include "libc/nt/struct/ntexceptionpointers.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+#include "libc/runtime/symbols.internal.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/sa.h"
+#include "libc/sysv/consts/sicode.h"
+#include "libc/sysv/consts/ss.h"
+#include "libc/thread/posixthread.internal.h"
+#ifdef __x86_64__
+
+/**
+ * @fileoverview Cosmopolitan Signals for Windows.
+ */
+
+struct SignalFrame {
+  unsigned rva;
+  unsigned flags;
+  siginfo_t si;
+  ucontext_t ctx;
+};
+
+static textwindows bool __sig_ignored_by_default(int sig) {
+  return sig == SIGURG ||   //
+         sig == SIGCONT ||  //
+         sig == SIGCHLD ||  //
+         sig == SIGWINCH;
+}
+
+textwindows bool __sig_ignored(int sig) {
+  return __sighandrvas[sig] == (intptr_t)SIG_IGN ||
+         (__sighandrvas[sig] == (intptr_t)SIG_DFL &&
+          __sig_ignored_by_default(sig));
+}
+
+textwindows void __sig_delete(int sig) {
+  struct Dll *e;
+  atomic_fetch_and_explicit(&__sig.pending, ~(1ull << (sig - 1)),
+                            memory_order_relaxed);
+  _pthread_lock();
+  for (e = dll_last(_pthread_list); e; e = dll_prev(_pthread_list, e))
+    atomic_fetch_and_explicit(&POSIXTHREAD_CONTAINER(e)->tib->tib_sigpending,
+                              ~(1ull << (sig - 1)), memory_order_relaxed);
+  _pthread_unlock();
+}
+
+static textwindows int __sig_getter(atomic_ulong *sigs, sigset_t masked) {
+  int sig;
+  sigset_t bit, pending, deliverable;
+  for (;;) {
+    pending = atomic_load_explicit(sigs, memory_order_acquire);
+    if ((deliverable = pending & ~masked)) {
+      sig = bsfl(deliverable) + 1;
+      bit = 1ull << (sig - 1);
+      if (atomic_fetch_and_explicit(sigs, ~bit, memory_order_acq_rel) & bit)
+        return sig;
+    } else {
+      return 0;
+    }
+  }
+}
+
+textwindows int __sig_get(sigset_t masked) {
+  int sig;
+  if (!(sig = __sig_getter(&__get_tls()->tib_sigpending, masked)))
+    sig = __sig_getter(&__sig.pending, masked);
+  return sig;
+}
+
+static textwindows bool __sig_should_use_altstack(unsigned flags,
+                                                  struct CosmoTib *tib) {
+  if (!(flags & SA_ONSTACK))
+    return false;  // signal handler didn't enable it
+  if (!tib->tib_sigstack_size)
+    return false;  // sigaltstack() wasn't installed on this thread
+  if (tib->tib_sigstack_flags & SS_DISABLE)
+    return false;  // sigaltstack() on this thread was disabled by user
+  char *bp = __builtin_frame_address(0);
+  if (tib->tib_sigstack_addr <= bp &&
+      bp <= tib->tib_sigstack_addr + tib->tib_sigstack_size)
+    return false;  // we're already on the alternate stack
+  return true;
+}
+
+static textwindows wontreturn void __sig_terminate(int sig) {
+  TerminateThisProcess(sig);
+}
+
+static textwindows bool __sig_start(struct PosixThread *pt, int sig,
+                                    unsigned *rva, unsigned *flags) {
+  *rva = __sighandrvas[sig];
+  *flags = __sighandflags[sig];
+  if (*rva == (intptr_t)SIG_IGN ||
+      (*rva == (intptr_t)SIG_DFL && __sig_ignored_by_default(sig))) {
+    STRACE("ignoring %G", sig);
+    return false;
+  }
+  if (atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire) &
+      (1ull << (sig - 1))) {
+    STRACE("enqueing %G on %d", sig, _pthread_tid(pt));
+    atomic_fetch_or_explicit(&pt->tib->tib_sigpending, 1ull << (sig - 1),
+                             memory_order_relaxed);
+    return false;
+  }
+  if (*rva == (intptr_t)SIG_DFL) {
+    STRACE("terminating on %G due to no handler", sig);
+    __sig_terminate(sig);
+  }
+  return true;
+}
+
+static textwindows sigaction_f __sig_handler(unsigned rva) {
+  atomic_fetch_add_explicit(&__sig.count, 1, memory_order_relaxed);
+  return (sigaction_f)(__executable_start + rva);
+}
+
+textwindows int __sig_raise(volatile int sig, int sic) {
+
+  // bitset of kinds of handlers called
+  volatile int handler_was_called = 0;
+
+  // loop over pending signals
+  ucontext_t ctx;
+  getcontext(&ctx);
+  if (!sig) {
+    if ((sig = __sig_get(ctx.uc_sigmask))) {
+      sic = SI_KERNEL;
+    } else {
+      return handler_was_called;
+    }
+  }
+
+  // process signal(s)
+  unsigned rva, flags;
+  struct PosixThread *pt = _pthread_self();
+  if (__sig_start(pt, sig, &rva, &flags)) {
+    if (flags & SA_RESETHAND) {
+      STRACE("resetting %G handler", sig);
+      __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
+    }
+
+    // update the signal mask in preparation for signal handller
+    sigset_t blocksigs = __sighandmask[sig];
+    if (!(flags & SA_NODEFER))
+      blocksigs |= 1ull << (sig - 1);
+    ctx.uc_sigmask = atomic_fetch_or_explicit(&pt->tib->tib_sigmask, blocksigs,
+                                              memory_order_acquire);
+
+    // call the user's signal handler
+    char ssbuf[128];
+    siginfo_t si = {.si_signo = sig, .si_code = sic};
+    STRACE("__sig_raise(%G, %t) mask %s", sig, __sig_handler(rva),
+           (DescribeSigset)(ssbuf, 0, (sigset_t *)&pt->tib->tib_sigmask));
+    __sig_handler(rva)(sig, &si, &ctx);
+
+    // record this handler
+    if (flags & SA_RESTART) {
+      handler_was_called |= SIG_HANDLED_SA_RESTART;
+    } else {
+      handler_was_called |= SIG_HANDLED_NO_RESTART;
+    }
+  }
+
+  // restore sigmask
+  // loop back to top
+  // jump where handler says
+  sig = 0;
+  return setcontext(&ctx);
+}
+
+textwindows int __sig_relay(int sig, int sic, sigset_t waitmask) {
+  sigset_t m;
+  int handler_was_called;
+  m = atomic_exchange_explicit(&__get_tls()->tib_sigmask, waitmask,
+                               memory_order_acquire);
+  handler_was_called = __sig_raise(sig, SI_KERNEL);
+  atomic_store_explicit(&__get_tls()->tib_sigmask, m, memory_order_release);
+  return handler_was_called;
+}
+
+// cancels blocking operations being performed by signaled thread
+textwindows void __sig_cancel(struct PosixThread *pt, int sig, unsigned flags) {
+  atomic_int *blocker;
+  blocker = atomic_load_explicit(&pt->pt_blocker, memory_order_acquire);
+  if (!blocker) {
+    STRACE("%G sent to %d asynchronously", sig, _pthread_tid(pt));
+    return;
+  }
+  // we can cancel another thread's overlapped i/o op after the freeze
+  if (blocker == PT_BLOCKER_IO) {
+    STRACE("%G canceling %d's i/o", sig, _pthread_tid(pt));
+    CancelIoEx(pt->pt_iohandle, pt->pt_ioverlap);
+    return;
+  }
+  // threads can create semaphores on an as-needed basis
+  if (blocker == PT_BLOCKER_SEM) {
+    STRACE("%G releasing %d's semaphore", sig, _pthread_tid(pt));
+    ReleaseSemaphore(pt->pt_semaphore, 1, 0);
+    return;
+  }
+  // all other blocking ops that aren't overlap should use futexes
+  // we force restartable futexes to churn by waking w/o releasing
+  STRACE("%G waking %d's futex", sig, _pthread_tid(pt));
+  WakeByAddressSingle(blocker);
+}
+
+// the user's signal handler callback is wrapped with this trampoline
+static textwindows wontreturn void __sig_tramp(struct SignalFrame *sf) {
+  int sig = sf->si.si_signo;
+  struct CosmoTib *tib = __get_tls();
+  struct PosixThread *pt = (struct PosixThread *)tib->tib_pthread;
+  for (;;) {
+
+    // update the signal mask in preparation for signal handller
+    sigset_t blocksigs = __sighandmask[sig];
+    if (!(sf->flags & SA_NODEFER))
+      blocksigs |= 1ull << (sig - 1);
+    sf->ctx.uc_sigmask = atomic_fetch_or_explicit(&tib->tib_sigmask, blocksigs,
+                                                  memory_order_acquire);
+
+    // call the user's signal handler
+    char ssbuf[2][128];
+    STRACE("__sig_tramp(%G, %t) mask %s → %s", sig, __sig_handler(sf->rva),
+           (DescribeSigset)(ssbuf[0], 0, &sf->ctx.uc_sigmask),
+           (DescribeSigset)(ssbuf[1], 0, (sigset_t *)&tib->tib_sigmask));
+    __sig_handler(sf->rva)(sig, &sf->si, &sf->ctx);
+
+    // restore the signal mask that was used by the interrupted code
+    // this may have been modified by the signal handler in the callback
+    atomic_store_explicit(&tib->tib_sigmask, sf->ctx.uc_sigmask,
+                          memory_order_release);
+
+    // jump back into original code if there aren't any pending signals
+    do {
+      if (!(sig = __sig_get(sf->ctx.uc_sigmask)))
+        __sig_restore(&sf->ctx);
+    } while (!__sig_start(pt, sig, &sf->rva, &sf->flags));
+
+    // tail recurse into another signal handler
+    sf->si.si_signo = sig;
+    sf->si.si_code = SI_KERNEL;
+    if (sf->flags & SA_RESETHAND) {
+      STRACE("resetting %G handler", sig);
+      __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
+    }
+  }
+}
+
+// sends signal to another specific thread which is ref'd
+static textwindows int __sig_killer(struct PosixThread *pt, int sig, int sic) {
+  unsigned rva = __sighandrvas[sig];
+  unsigned flags = __sighandflags[sig];
+
+  // do nothing if signal is ignored
+  if (rva == (intptr_t)SIG_IGN ||
+      (rva == (intptr_t)SIG_DFL && __sig_ignored_by_default(sig))) {
+    STRACE("ignoring %G", sig);
+    return 0;
+  }
+
+  // if there's no handler then killing a thread kills the process
+  if (rva == (intptr_t)SIG_DFL) {
+    STRACE("terminating on %G due to no handler", sig);
+    __sig_terminate(sig);
+  }
+
+  // ignore signals already pending
+  uintptr_t th = _pthread_syshand(pt);
+  if (atomic_load_explicit(&pt->tib->tib_sigpending, memory_order_acquire) &
+      (1ull << (sig - 1))) {
+    return 0;
+  }
+
+  // take control of thread
+  // suspending the thread happens asynchronously
+  // however getting the context blocks until it's frozen
+  static pthread_spinlock_t killer_lock;
+  pthread_spin_lock(&killer_lock);
+  if (SuspendThread(th) == -1u) {
+    STRACE("SuspendThread failed w/ %d", GetLastError());
+    pthread_spin_unlock(&killer_lock);
+    return ESRCH;
+  }
+  struct NtContext nc;
+  nc.ContextFlags = kNtContextFull;
+  if (!GetThreadContext(th, &nc)) {
+    STRACE("GetThreadContext failed w/ %d", GetLastError());
+    ResumeThread(th);
+    pthread_spin_unlock(&killer_lock);
+    return ESRCH;
+  }
+  pthread_spin_unlock(&killer_lock);
+
+  // we can't preempt threads that masked sig or are blocked
+  // we can't preempt threads that are running in win32 code
+  // so we shall unblock the thread and let it signal itself
+  if ((atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire) &
+       (1ull << (sig - 1))) ||
+      !((uintptr_t)__executable_start <= nc.Rip &&
+        nc.Rip < (uintptr_t)__privileged_start)) {
+    atomic_fetch_or_explicit(&pt->tib->tib_sigpending, 1ull << (sig - 1),
+                             memory_order_relaxed);
+    ResumeThread(th);
+    __sig_cancel(pt, sig, flags);
+    return 0;
+  }
+
+  // preferring to live dangerously
+  // the thread will be signaled asynchronously
+  if (flags & SA_RESETHAND) {
+    STRACE("resetting %G handler", sig);
+    __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
+  }
+
+  // inject call to trampoline function into thread
+  uintptr_t sp;
+  if (__sig_should_use_altstack(flags, pt->tib)) {
+    sp = (uintptr_t)pt->tib->tib_sigstack_addr + pt->tib->tib_sigstack_size;
+  } else {
+    sp = nc.Rsp;
+  }
+  sp -= sizeof(struct SignalFrame);
+  sp &= -16;
+  struct SignalFrame *sf = (struct SignalFrame *)sp;
+  _ntcontext2linux(&sf->ctx, &nc);
+  bzero(&sf->si, sizeof(sf->si));
+  sf->rva = rva;
+  sf->flags = flags;
+  sf->si.si_code = sic;
+  sf->si.si_signo = sig;
+  *(uintptr_t *)(sp -= sizeof(uintptr_t)) = nc.Rip;
+  nc.Rip = (intptr_t)__sig_tramp;
+  nc.Rdi = (intptr_t)sf;
+  nc.Rsp = sp;
+  if (!SetThreadContext(th, &nc)) {
+    STRACE("SetThreadContext failed w/ %d", GetLastError());
+    return ESRCH;
+  }
+  ResumeThread(th);
+  __sig_cancel(pt, sig, flags);
+  return 0;
+}
+
+// sends signal to another specific thread
+textwindows int __sig_kill(struct PosixThread *pt, int sig, int sic) {
+  int rc;
+  BLOCK_SIGNALS;
+  rc = __sig_killer(pt, sig, sic);
+  ALLOW_SIGNALS;
+  return rc;
+}
+
+// sends signal to any other thread
+textwindows void __sig_generate(int sig, int sic) {
+  struct Dll *e;
+  struct PosixThread *pt, *mark = 0;
+  if (__sig_ignored(sig)) {
+    STRACE("ignoring %G", sig);
+    return;
+  }
+  if (__sighandrvas[sig] == (intptr_t)SIG_DFL) {
+    STRACE("terminating on %G due to no handler", sig);
+    __sig_terminate(sig);
+  }
+  if (atomic_load_explicit(&__sig.pending, memory_order_acquire) &
+      (1ull << (sig - 1))) {
+    return;
+  }
+  BLOCK_SIGNALS;
+  _pthread_lock();
+  for (e = dll_first(_pthread_list); e; e = dll_next(_pthread_list, e)) {
+    pt = POSIXTHREAD_CONTAINER(e);
+    // we don't want to signal ourself
+    if (pt == _pthread_self())
+      continue;
+    // we don't want to signal a thread that isn't running
+    if (atomic_load_explicit(&pt->pt_status, memory_order_acquire) >=
+        kPosixThreadTerminated) {
+      continue;
+    }
+    // choose this thread if it isn't masking sig
+    if (!(atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire) &
+          (1ull << (sig - 1)))) {
+      _pthread_ref(pt);
+      mark = pt;
+      break;
+    }
+    // if a thread is blocking then we check to see if it's planning
+    // to unblock our sig once the wait operation is completed; when
+    // that's the case we can cancel the thread's i/o to deliver sig
+    if (atomic_load_explicit(&pt->pt_blocker, memory_order_acquire) &&
+        !(pt->pt_blkmask & (1ull << (sig - 1)))) {
+      _pthread_ref(pt);
+      mark = pt;
+      break;
+    }
+  }
+  _pthread_unlock();
+  if (mark) {
+    __sig_killer(mark, sig, sic);
+    _pthread_unref(mark);
+  } else {
+    atomic_fetch_or_explicit(&__sig.pending, 1ull << (sig - 1),
+                             memory_order_relaxed);
+  }
+  ALLOW_SIGNALS;
+}
+
+static textwindows char *__sig_stpcpy(char *d, const char *s) {
+  size_t i;
+  for (i = 0;; ++i)
+    if (!(d[i] = s[i]))
+      return d + i;
+}
+
+static textwindows wontreturn void __sig_death(int sig, const char *thing) {
+#ifndef TINY
+  intptr_t hStderr;
+  char sigbuf[21], s[128], *p;
+  hStderr = GetStdHandle(kNtStdErrorHandle);
+  p = __sig_stpcpy(s, "Terminating on ");
+  p = __sig_stpcpy(p, thing);
+  p = __sig_stpcpy(p, strsignal_r(sig, sigbuf));
+  p = __sig_stpcpy(p,
+                   ". Pass --strace and/or ShowCrashReports() for details.\n");
+  WriteFile(hStderr, s, p - s, 0, 0);
+#endif
+  __sig_terminate(sig);
+}
+
+static textwindows void __sig_unmaskable(struct NtExceptionPointers *ep,
+                                         int code, int sig,
+                                         struct CosmoTib *tib) {
+
+  // log vital crash information reliably for --strace before doing much
+  // we don't print this without the flag since raw numbers scare people
+  // this needs at least one page of stack memory in order to get logged
+  // otherwise it'll print a warning message about the lack of stack mem
+  STRACE("win32 vectored exception 0x%08Xu raising %G "
+         "cosmoaddr2line %s %lx %s",
+         ep->ExceptionRecord->ExceptionCode, sig,
+         _weaken(FindDebugBinary) ? _weaken(FindDebugBinary)()
+                                  : program_invocation_name,
+         ep->ContextRecord->Rip,
+         DescribeBacktrace((struct StackFrame *)ep->ContextRecord->Rbp));
+
+  // if the user didn't install a signal handler for this unmaskable
+  // exception, then print a friendly helpful hint message to stderr
+  unsigned rva = __sighandrvas[sig];
+  if (rva == (intptr_t)SIG_DFL || rva == (intptr_t)SIG_IGN)
+    __sig_death(sig, "uncaught ");
+
+  // if this signal handler is configured to auto-reset to the default
+  // then that reset needs to happen before the user handler is called
+  unsigned flags = __sighandflags[sig];
+  if (flags & SA_RESETHAND) {
+    STRACE("resetting %G handler", sig);
+    __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
+  }
+
+  // determine the true memory address at which fault occurred
+  // if this is a stack overflow then reapply guard protection
+  void *si_addr;
+  if (ep->ExceptionRecord->ExceptionCode == kNtSignalGuardPage) {
+    si_addr = (void *)ep->ExceptionRecord->ExceptionInformation[1];
+  } else {
+    si_addr = ep->ExceptionRecord->ExceptionAddress;
+  }
+
+  // call the user signal handler
+  // and a modifiable view of the faulting code's cpu state
+  // temporarily replace signal mask while calling crash handler
+  // abort process if sig is already blocked to avoid crash loop
+  // note ucontext_t is a hefty data structures on top of NtContext
+  ucontext_t ctx = {0};
+  siginfo_t si = {.si_signo = sig, .si_code = code, .si_addr = si_addr};
+  _ntcontext2linux(&ctx, ep->ContextRecord);
+  sigset_t blocksigs = __sighandmask[sig];
+  if (!(flags & SA_NODEFER))
+    blocksigs |= 1ull << (sig - 1);
+  ctx.uc_sigmask = atomic_fetch_or_explicit(&tib->tib_sigmask, blocksigs,
+                                            memory_order_acquire);
+  if (ctx.uc_sigmask & (1ull << (sig - 1))) {
+    __sig_death(sig, "masked ");
+    __sig_terminate(sig);
+  }
+  __sig_handler(rva)(sig, &si, &ctx);
+  atomic_store_explicit(&tib->tib_sigmask, ctx.uc_sigmask,
+                        memory_order_release);
+  _ntlinux2context(ep->ContextRecord, &ctx);
+}
+
+void __stack_call(struct NtExceptionPointers *, int, int, struct CosmoTib *,
+                  void (*)(struct NtExceptionPointers *, int, int,
+                           struct CosmoTib *),
+                  void *);
+
+//                         abashed the devil stood
+//                      and felt how awful goodness is
+__msabi dontinstrument unsigned __sig_crash(struct NtExceptionPointers *ep) {
+
+  // translate win32 to unix si_signo and si_code
+  int code, sig = __sig_crash_sig(ep->ExceptionRecord->ExceptionCode, &code);
+
+  // advance the instruction pointer to skip over debugger breakpoints
+  // this behavior is consistent with how unix kernels are implemented
+  if (sig == SIGTRAP) {
+    ep->ContextRecord->Rip++;
+    if (__sig_ignored(sig))
+      return kNtExceptionContinueExecution;
+  }
+
+  // win32 stack overflow detection executes INSIDE the guard page
+  // thus switch to the alternate signal stack as soon as possible
+  struct CosmoTib *tib = __get_tls();
+  unsigned flags = __sighandflags[sig];
+  if (__sig_should_use_altstack(flags, tib)) {
+    __stack_call(ep, code, sig, tib, __sig_unmaskable,
+                 tib->tib_sigstack_addr + tib->tib_sigstack_size);
+  } else {
+    __sig_unmaskable(ep, code, sig, tib);
+  }
+
+  // resume running user program
+  // hopefully the user fixed the cpu state
+  // otherwise the crash will keep happening
+  return kNtExceptionContinueExecution;
+}
+
+static textwindows int __sig_console_sig(uint32_t dwCtrlType) {
+  switch (dwCtrlType) {
+    case kNtCtrlCEvent:
+      return SIGINT;
+    case kNtCtrlBreakEvent:
+      return SIGQUIT;
+    case kNtCtrlCloseEvent:
+    case kNtCtrlLogoffEvent:    // only received by services
+    case kNtCtrlShutdownEvent:  // only received by services
+      return SIGHUP;
+    default:
+      return SIGSTKFLT;
+  }
+}
+
+__msabi textwindows dontinstrument bool32 __sig_console(uint32_t dwCtrlType) {
+  struct CosmoTib tls;
+  __bootstrap_tls(&tls, __builtin_frame_address(0));
+  __sig_generate(__sig_console_sig(dwCtrlType), SI_KERNEL);
+  return true;
+}
+
+// returns 0 if no signal handlers were called, otherwise a bitmask
+// consisting of `1` which means a signal handler was invoked which
+// didn't have the SA_RESTART flag, and `2`, which means SA_RESTART
+// handlers were called (or `3` if both were the case).
+textwindows int __sig_check(void) {
+  int sig;
+  if ((sig = __sig_get(atomic_load_explicit(&__get_tls()->tib_sigmask,
+                                            memory_order_acquire)))) {
+    return __sig_raise(sig, SI_KERNEL);
+  } else {
+    return 0;
+  }
+}
+
+__attribute__((__constructor__(10))) textstartup void __sig_init(void) {
+  if (!IsWindows())
+    return;
+  AddVectoredExceptionHandler(true, (void *)__sig_crash);
+  SetConsoleCtrlHandler((void *)__sig_console, true);
+}
+
+#endif /* __x86_64__ */
diff --git a/libc/calls/sig.internal.h b/libc/calls/sig.internal.h
index 34b2f1d7c..48566f303 100644
--- a/libc/calls/sig.internal.h
+++ b/libc/calls/sig.internal.h
@@ -1,8 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_CALLS_SIGNALS_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_CALLS_SIGNALS_INTERNAL_H_
-#include "libc/atomic.h"
 #include "libc/calls/struct/sigset.h"
-#include "libc/nt/thunk/msabi.h"
 #include "libc/thread/posixthread.internal.h"
 
 #define SIG_HANDLED_NO_RESTART 1
@@ -11,8 +9,8 @@
 COSMOPOLITAN_C_START_
 
 struct Signals {
-  atomic_ulong *process;
-  atomic_ulong count;
+  _Atomic(uint64_t) pending;
+  _Atomic(uint64_t) count;
 };
 
 extern struct Signals __sig;
@@ -29,8 +27,5 @@ void __sig_delete(int);
 void __sig_generate(int, int);
 void __sig_init(void);
 
-__msabi char16_t *__sig_process_path(char16_t *, uint32_t, int);
-__msabi atomic_ulong *__sig_map_process(int, int);
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_SIGNALS_INTERNAL_H_ */
diff --git a/libc/calls/sigaction.c b/libc/calls/sigaction.c
index 7c28b6851..d3f46e71b 100644
--- a/libc/calls/sigaction.c
+++ b/libc/calls/sigaction.c
@@ -36,7 +36,7 @@
 #include "libc/limits.h"
 #include "libc/log/backtrace.internal.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/syslib.internal.h"
@@ -423,7 +423,7 @@ static int __sigaction(int sig, const struct sigaction *act,
  *     }
  *
  *     void ContinueOnCrash(void) {
- *       struct sigaction sa = {.sa_sigaction = OnCrash,
+ *       struct sigaction sa = {.sa_handler = OnSigSegv,
  *                              .sa_flags = SA_SIGINFO | SA_RESETHAND};
  *       sigaction(SIGSEGV, &sa, 0);
  *       sigaction(SIGFPE, &sa, 0);
diff --git a/libc/calls/sigaltstack.c b/libc/calls/sigaltstack.c
index a580a0fec..0e246d749 100644
--- a/libc/calls/sigaltstack.c
+++ b/libc/calls/sigaltstack.c
@@ -23,7 +23,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/syslib.internal.h"
 #include "libc/sysv/consts/ss.h"
@@ -113,7 +113,7 @@ static int sigaltstack_bsd(const struct sigaltstack *neu,
  *     struct sigaction sa;
  *     struct sigaltstack ss;
  *     ss.ss_flags = 0;
- *     ss.ss_size = sysconf(_SC_SIGSTKSZ);
+ *     ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 8192;
  *     ss.ss_sp = malloc(ss.ss_size);
  *     sigaltstack(&ss, 0);
  *     sigemptyset(&sa.ss_mask);
@@ -121,16 +121,11 @@ static int sigaltstack_bsd(const struct sigaltstack *neu,
  *     sa.sa_handler = OnStackOverflow;
  *     sigaction(SIGSEGV, &sa, 0);
  *
- * Your stack size should be `sysconf(_SC_SIGSTKSZ)` which should be
- * somewhere in the ballpark of 32kb to 64kb. You should go no lower
- * than `sysconf(_SC_MINSIGSTKSZ) + 2048` which could be 4kb - 34kb.
- * Cosmo also defines `SIGSTKSZ` as 32kb, which should also be safe.
- *
  * @param neu if non-null will install new signal alt stack
  * @param old if non-null will receive current signal alt stack
  * @return 0 on success, or -1 w/ errno
  * @raise EFAULT if bad memory was supplied
- * @raise ENOMEM if `neu->ss_size` is beneath `sysconf(_SC_MINSIGSTKSZ)`
+ * @raise ENOMEM if `neu->ss_size` is less than `MINSIGSTKSZ`
  */
 int sigaltstack(const struct sigaltstack *neu, struct sigaltstack *old) {
   int rc;
diff --git a/libc/intrin/sigcrashsig.c b/libc/calls/sigcrashsig.c
similarity index 99%
rename from libc/intrin/sigcrashsig.c
rename to libc/calls/sigcrashsig.c
index 060c2b310..21e0d0203 100644
--- a/libc/intrin/sigcrashsig.c
+++ b/libc/calls/sigcrashsig.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/sig.internal.h"
 #include "libc/intrin/pushpop.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/signal.h"
 #include "libc/nt/enum/status.h"
 #include "libc/nt/struct/ntexceptionpointers.h"
diff --git a/libc/calls/sigenter-freebsd.c b/libc/calls/sigenter-freebsd.c
index d895f630c..0f29ad547 100644
--- a/libc/calls/sigenter-freebsd.c
+++ b/libc/calls/sigenter-freebsd.c
@@ -28,7 +28,7 @@
 #include "libc/calls/struct/ucontext-freebsd.internal.h"
 #include "libc/calls/ucontext.h"
 #include "libc/log/libfatal.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/str/str.h"
diff --git a/libc/calls/sigenter-netbsd.c b/libc/calls/sigenter-netbsd.c
index 09f8ff90c..9e20817df 100644
--- a/libc/calls/sigenter-netbsd.c
+++ b/libc/calls/sigenter-netbsd.c
@@ -27,7 +27,7 @@
 #include "libc/calls/struct/ucontext-netbsd.internal.h"
 #include "libc/calls/ucontext.h"
 #include "libc/log/libfatal.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/str/str.h"
diff --git a/libc/calls/sigenter-openbsd.c b/libc/calls/sigenter-openbsd.c
index ac3819740..5be46f32a 100644
--- a/libc/calls/sigenter-openbsd.c
+++ b/libc/calls/sigenter-openbsd.c
@@ -27,7 +27,7 @@
 #include "libc/calls/struct/ucontext-openbsd.internal.h"
 #include "libc/calls/ucontext.h"
 #include "libc/log/libfatal.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/str/str.h"
diff --git a/libc/calls/sigenter-xnu.c b/libc/calls/sigenter-xnu.c
index 9d546ff28..c68a9c7c5 100644
--- a/libc/calls/sigenter-xnu.c
+++ b/libc/calls/sigenter-xnu.c
@@ -33,7 +33,6 @@
 #include "libc/runtime/syslib.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/sa.h"
-#include "libc/sysv/consts/sig.h"
 
 /**
  * @fileoverview XNU kernel callback normalization.
@@ -514,7 +513,6 @@ privileged void __sigenter_xnu(int sig, struct siginfo_xnu *xnuinfo,
     flags = __sighandflags[sig];
 
 #ifdef __aarch64__
-
     // xnu silicon claims to support sa_resethand but it does nothing
     // this can be tested, since it clears the bit from flags as well
     if (flags & SA_RESETHAND) {
@@ -523,13 +521,6 @@ privileged void __sigenter_xnu(int sig, struct siginfo_xnu *xnuinfo,
       __sighandflags[sig] = 0;
       __sighandrvas[sig] = 0;
     }
-
-    // unlike amd64, the instruction pointer on arm64 isn't advanced
-    // past the debugger breakpoint instruction automatically. we need
-    // this so execution can resume after __builtin_trap().
-    if (xnuctx && sig == SIGTRAP)
-      xnuctx->uc_mcontext->__ss.__pc += 4;
-
 #endif
 
     if (~flags & SA_SIGINFO) {
diff --git a/libc/calls/sigpending.c b/libc/calls/sigpending.c
index 0fceafa98..e11e194c0 100644
--- a/libc/calls/sigpending.c
+++ b/libc/calls/sigpending.c
@@ -53,7 +53,7 @@ int sigpending(sigset_t *pending) {
     }
     rc = 0;
   } else if (IsWindows()) {
-    *pending = atomic_load_explicit(__sig.process, memory_order_acquire) |
+    *pending = atomic_load_explicit(&__sig.pending, memory_order_acquire) |
                atomic_load_explicit(&__get_tls()->tib_sigpending,
                                     memory_order_acquire);
     rc = 0;
diff --git a/libc/calls/sigsuspend.c b/libc/calls/sigsuspend.c
index fc7187f57..134eda44c 100644
--- a/libc/calls/sigsuspend.c
+++ b/libc/calls/sigsuspend.c
@@ -21,7 +21,6 @@
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/struct/timespec.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
@@ -54,14 +53,8 @@ int sigsuspend(const sigset_t *ignore) {
   } else {
     sigset_t waitmask = ignore ? *ignore : 0;
     if (IsWindows() || IsMetal()) {
-      // we don't strictly need to block signals, but it reduces signal
-      // delivery latency, by preventing other threads from delivering a
-      // signal asynchronously. it takes about ~5us to deliver a signal
-      // using SetEvent() whereas it takes ~30us to use SuspendThread(),
-      // GetThreadContext(), SetThreadContext(), and ResumeThread().
-      BLOCK_SIGNALS;
-      rc = _park_norestart(timespec_max, waitmask);
-      ALLOW_SIGNALS;
+      while (!(rc = _park_norestart(-1u, waitmask)))
+        donothing;
     } else {
       rc = sys_sigsuspend((uint64_t[2]){waitmask}, 8);
     }
diff --git a/libc/calls/sigtimedwait-nt.c b/libc/calls/sigtimedwait-nt.c
deleted file mode 100644
index 9deaa9d33..000000000
--- a/libc/calls/sigtimedwait-nt.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/calls/internal.h"
-#include "libc/calls/sig.internal.h"
-#include "libc/calls/struct/siginfo.h"
-#include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/struct/timespec.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/atomic.h"
-#include "libc/macros.h"
-#include "libc/nt/events.h"
-#include "libc/nt/runtime.h"
-#include "libc/nt/synchronization.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/sig.h"
-#include "libc/sysv/errfuns.h"
-#include "libc/thread/posixthread.internal.h"
-#ifdef __x86_64__
-
-textwindows static int sys_sigtimedwait_nt_check(sigset_t syncsigs,
-                                                 siginfo_t *opt_info,
-                                                 sigset_t waitmask) {
-  int sig;
-  if (_check_cancel() == -1)
-    return -1;
-  if ((sig = __sig_get(waitmask))) {
-    if ((1ull << (sig - 1)) & syncsigs) {
-      if (opt_info) {
-        memset(opt_info, 0, sizeof(*opt_info));
-        opt_info->si_signo = sig;
-        opt_info->si_code = SI_TKILL;
-        opt_info->si_uid = sys_getuid_nt();
-      }
-      return sig;
-    }
-    int handler_was_called = __sig_relay(sig, SI_TKILL, waitmask);
-    if (_check_cancel() == -1)
-      return -1;
-    if (handler_was_called)
-      return eintr();
-  }
-  return 0;
-}
-
-textwindows static int sys_sigtimedwait_nt_impl(sigset_t syncsigs,
-                                                siginfo_t *opt_info,
-                                                struct timespec deadline,
-                                                sigset_t waitmask,
-                                                intptr_t semaphore) {
-  for (;;) {
-    int sig;
-    if ((sig = sys_sigtimedwait_nt_check(syncsigs, opt_info, waitmask)))
-      return sig;
-    struct timespec now = sys_clock_gettime_monotonic_nt();
-    if (timespec_cmp(now, deadline) >= 0)
-      return eagain();
-    struct timespec remain = timespec_sub(deadline, now);
-    int64_t millis = timespec_tomillis(remain);
-    uint32_t waitms = MIN(millis, 0xffffffffu);
-    uint32_t wi = WaitForSingleObject(semaphore, waitms);
-    if (wi == -1u)
-      return __winerr();
-    if (wi)
-      return eagain();
-  }
-}
-
-textwindows int sys_sigtimedwait_nt(const sigset_t *set, siginfo_t *opt_info,
-                                    const struct timespec *opt_timeout) {
-  int rc;
-  intptr_t sev;
-  struct PosixThread *pt;
-  struct timespec deadline;
-  sigset_t syncsigs, waitmask;
-  BLOCK_SIGNALS;
-  if (opt_timeout) {
-    deadline = timespec_add(sys_clock_gettime_monotonic_nt(), *opt_timeout);
-  } else {
-    deadline = timespec_max;
-  }
-  if ((sev = CreateEvent(0, 0, 0, 0))) {
-    syncsigs = *set & ~(1ull << (SIGTHR - 1));  // internal to pthreads
-    waitmask = ~syncsigs & _SigMask;
-    pt = _pthread_self();
-    pt->pt_event = sev;
-    pt->pt_blkmask = waitmask;
-    atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_EVENT,
-                          memory_order_release);
-    rc = sys_sigtimedwait_nt_impl(syncsigs, opt_info, deadline, waitmask, sev);
-    atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-    CloseHandle(sev);
-  } else {
-    rc = __winerr();
-  }
-  ALLOW_SIGNALS;
-  return rc;
-}
-
-#endif /* __x86_64__ */
diff --git a/libc/calls/sigtimedwait.c b/libc/calls/sigtimedwait.c
index 65528d1e1..20579e199 100644
--- a/libc/calls/sigtimedwait.c
+++ b/libc/calls/sigtimedwait.c
@@ -27,62 +27,48 @@
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
 
-int sys_sigtimedwait_nt(const sigset_t *, siginfo_t *, const struct timespec *);
-
 /**
  * Waits for signal synchronously, w/ timeout.
  *
- * This function does not change the thread signal mask. Signals that
- * aren't masked, which aren't in `set`, will be handled normally, in
- * which case this function will raise `EINTR`.
- *
- * This function silently ignores attempts to synchronously wait for
- * SIGTHR which is used internally by the POSIX threads implementation.
- *
  * @param set is signals for which we'll be waiting
- * @param opt_info if not null shall receive info about signal
- * @param opt_timeout is relative deadline and null means wait forever
+ * @param info if not null shall receive info about signal
+ * @param timeout is relative deadline and null means wait forever
  * @return signal number on success, or -1 w/ errno
  * @raise EINTR if an asynchronous signal was delivered instead
  * @raise ECANCELED if thread was cancelled in masked mode
  * @raise EINVAL if nanoseconds parameter was out of range
- * @raise EAGAIN if timeout elapsed
- * @raise ENOSYS on XNU, OpenBSD, and Metal
+ * @raise EAGAIN if deadline expired
+ * @raise ENOSYS on Windows, XNU, OpenBSD, Metal
  * @raise EFAULT if invalid memory was supplied
  * @cancelationpoint
- * @norestart
  */
-int sigtimedwait(const sigset_t *set, siginfo_t *opt_info,
-                 const struct timespec *opt_timeout) {
+int sigtimedwait(const sigset_t *set, siginfo_t *info,
+                 const struct timespec *timeout) {
   int rc;
   char strsig[21];
   struct timespec ts;
   union siginfo_meta si = {0};
   BEGIN_CANCELATION_POINT;
 
-  // validate timeout
-  if (opt_timeout && opt_timeout->tv_nsec >= 1000000000ull) {
-    rc = einval();
-  } else if (IsLinux() || IsFreebsd() || IsNetbsd()) {
-    if (opt_timeout) {
+  if (IsLinux() || IsFreebsd() || IsNetbsd()) {
+    if (timeout) {
       // 1. Linux needs its size parameter
       // 2. NetBSD modifies timeout argument
-      ts = *opt_timeout;
+      ts = *timeout;
       rc = sys_sigtimedwait(set, &si, &ts, 8);
     } else {
       rc = sys_sigtimedwait(set, &si, 0, 8);
     }
-    if (rc != -1 && opt_info)
-      __siginfo2cosmo(opt_info, &si);
-  } else if (IsWindows()) {
-    rc = sys_sigtimedwait_nt(set, opt_info, opt_timeout);
+    if (rc != -1 && info) {
+      __siginfo2cosmo(info, &si);
+    }
   } else {
     rc = enosys();
   }
 
   END_CANCELATION_POINT;
   STRACE("sigtimedwait(%s, [%s], %s) → %s% m", DescribeSigset(0, set),
-         DescribeSiginfo(rc, opt_info), DescribeTimespec(0, opt_timeout),
+         DescribeSiginfo(rc, info), DescribeTimespec(0, timeout),
          strsignal_r(rc, strsig));
   return rc;
 }
diff --git a/libc/calls/sigwait.c b/libc/calls/sigwait.c
index a76b0dfd0..77dc014fa 100644
--- a/libc/calls/sigwait.c
+++ b/libc/calls/sigwait.c
@@ -18,26 +18,10 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/sigtimedwait.h"
 
-/**
- * Waits for signal synchronously.
- *
- * See sigtimedwait() for further details.
- *
- * @param set is signals for which we'll be waiting
- * @param out_sig shall receive signal number
- * @return 0 on success, or -1 w/ errno
- * @raise EINTR if an asynchronous signal was delivered instead
- * @raise ECANCELED if thread was cancelled in masked mode
- * @raise ENOSYS on OpenBSD, XNU, and Metal
- * @see sigtimedwait()
- * @cancelationpoint
- * @norestart
- */
-int sigwait(const sigset_t *mask, int *out_sig) {
-  int sig;
-  if ((sig = sigtimedwait(mask, 0, 0)) == -1)
+int sigwait(const sigset_t *mask, int *sig) {
+  siginfo_t si;
+  if (sigtimedwait(mask, &si, 0) < 0)
     return -1;
-  if (out_sig)
-    *out_sig = sig;
+  *sig = si.si_signo;
   return 0;
 }
diff --git a/libc/calls/sigwaitinfo.c b/libc/calls/sigwaitinfo.c
index ef5876698..368ae1777 100644
--- a/libc/calls/sigwaitinfo.c
+++ b/libc/calls/sigwaitinfo.c
@@ -21,17 +21,14 @@
 /**
  * Waits for signal synchronously.
  *
- * See sigtimedwait() for further details.
- *
  * @param set is signals for which we'll be waiting
  * @param info if not null shall receive info about signal
  * @return signal number on success, or -1 w/ errno
  * @raise EINTR if an asynchronous signal was delivered instead
  * @raise ECANCELED if thread was cancelled in masked mode
- * @raise ENOSYS on OpenBSD, XNU, and Metal
+ * @raise ENOSYS on OpenBSD, XNU, and Windows
  * @see sigtimedwait()
  * @cancelationpoint
- * @norestart
  */
 int sigwaitinfo(const sigset_t *mask, siginfo_t *si) {
   return sigtimedwait(mask, si, 0);
diff --git a/libc/calls/state.internal.h b/libc/calls/state.internal.h
index 3d4d2a2d9..003265867 100644
--- a/libc/calls/state.internal.h
+++ b/libc/calls/state.internal.h
@@ -13,6 +13,7 @@ extern unsigned __sighandflags[NSIG + 1];
 extern uint64_t __sighandmask[NSIG + 1];
 extern const struct NtSecurityAttributes kNtIsInheritable;
 
+void __fds_wipe(void);
 void __fds_lock(void);
 void __fds_unlock(void);
 
diff --git a/libc/calls/statfs.c b/libc/calls/statfs.c
index ef71c8bc4..6068760d7 100644
--- a/libc/calls/statfs.c
+++ b/libc/calls/statfs.c
@@ -34,63 +34,6 @@
 /**
  * Returns information about filesystem.
  *
- * The `struct statfs` returned has the following fields:
- *
- * - `f_fstypename` holds a NUL-terminated string identifying the file
- *   system type. On Linux, this will usually be "nfs". On FreeBSD, it
- *   will usually be "zfs". On OpenBSD and NetBSD, it's usually "ffs".
- *   On MacOS it's usually "apfs", and on Windows it's usually "NTFS".
- *
- * - `f_bsize` is the optimal transfer block size. This may be used to
- *   appropriately chunk your i/o operations. On local file systems it
- *   will usually be somewhere between 4096 and 131072 bytes. With NFS
- *   it may be as high as 512kb.
- *
- * - `f_frsize` is the fragment size of the file system. This could be
- *   anywhere between 512 and 4096 bytes for local filesystems usually
- *   although it could go higher. It should less than, or equal to the
- *   `f_bsize`. This fragment size is what you want to use to multiply
- *   other fields that count blocks into a byte count.
- *
- * - `f_bfree` is the number of free blocks in the filesystem. You can
- *   multiply this number by `f_frsize` to obtain the free byte count.
- *
- * - `f_bavail` is the number of free blocks in the filesystem you can
- *   access from userspace. It's less than or equal to `f_bfree` which
- *   generally has some blocks reserved for root in a pinch. You could
- *   multiply this by `f_frsize` to convert this number to bytes.
- *
- * - `f_files` is the total number of file nodes. Not every OS has it.
- *   On Windows for instance it's currently always `INT64_MAX`. It has
- *   an unspecified meaning. It should be seen as informative.
- *
- * - `f_fsid` is an opaque data structure that uniquely identifies the
- *   filesystem. We're not yet certain how reliable this is across the
- *   various OSes and filesystem types.
- *
- * - `f_namelen` is basically the same as `NAME_MAX` which seems to be
- *   255 on all the OSes we've evaluated. It's the maximum length when
- *   it comes to individual components in a filesystem path.
- *
- * - `f_type` is an OS-specific file system type ID. This is just some
- *   magic number. No defines are provided by Cosmopolitan Libc for it
- *
- * - `f_flags` specifies the options used when a filesystem is mounted
- *   and the numbers vary across OSes. Cosmopolitan Libc polyfills the
- *   magic numbers somewhat consistently. If `IsWindows()` is set then
- *   the constants defined by Microsoft (e.g. `FILE_READ_ONLY_VOLUME`)
- *   should be used. Otherwise on any other UNIX system, the following
- *   constants are provided. You should check each constant at runtime
- *   before using them, to determine if they're non-zero, for support.
- *
- *   - `ST_RDONLY` if mounted in read-only mode (works UNIX + Windows)
- *   - `ST_NOSUID` if setuid binaries are forbidden (all UNIX support)
- *   - `ST_NODEV` when device file access forbidden (all UNIX support)
- *   - `ST_NOEXEC` when a file executions forbidden (all UNIX support)
- *   - `ST_SYNCHRONOUS`, if `O_SYNC` always happens (all UNIX support)
- *   - `ST_NOATIME` if access timestamps aren't set (all UNIX support)
- *   - `ST_RELATIME` if relative acces time is used (all UNIX support)
- *
  * @return 0 on success, or -1 w/ errno
  * @raise ECANCELED if thread was cancelled in masked mode
  * @raise EINTR if signal was delivered
diff --git a/libc/calls/struct/flock.internal.h b/libc/calls/struct/flock.internal.h
index 52b14ad66..aea17b09e 100644
--- a/libc/calls/struct/flock.internal.h
+++ b/libc/calls/struct/flock.internal.h
@@ -4,8 +4,8 @@
 #include "libc/mem/alloca.h"
 COSMOPOLITAN_C_START_
 
-const char *_DescribeFlock(char[300], int, const struct flock *);
-#define DescribeFlock(c, l) _DescribeFlock(alloca(300), c, l)
+const char *DescribeFlock(char[300], int, const struct flock *);
+#define DescribeFlock(c, l) DescribeFlock(alloca(300), c, l)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_FLOCK_INTERNAL_H_ */
diff --git a/libc/calls/struct/iovec.internal.h b/libc/calls/struct/iovec.internal.h
index 6c6a3661a..cfa58b479 100644
--- a/libc/calls/struct/iovec.internal.h
+++ b/libc/calls/struct/iovec.internal.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_LIBC_CALLS_STRUCT_IOVEC_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_CALLS_STRUCT_IOVEC_INTERNAL_H_
-#include "libc/calls/struct/iovec.h"
 #include "libc/intrin/fds.h"
+#include "libc/calls/struct/iovec.h"
 #include "libc/mem/alloca.h"
 COSMOPOLITAN_C_START_
 
@@ -22,8 +22,8 @@ ssize_t sys_send_nt(int, const struct iovec *, size_t, uint32_t);
 ssize_t sys_sendto_nt(int, const struct iovec *, size_t, uint32_t, const void *,
                       uint32_t);
 
-const char *_DescribeIovec(char[300], ssize_t, const struct iovec *, int);
-#define DescribeIovec(x, y, z) _DescribeIovec(alloca(300), x, y, z)
+const char *DescribeIovec(char[300], ssize_t, const struct iovec *, int);
+#define DescribeIovec(x, y, z) DescribeIovec(alloca(300), x, y, z)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_IOVEC_INTERNAL_H_ */
diff --git a/libc/calls/struct/itimerval.internal.h b/libc/calls/struct/itimerval.internal.h
index 9bad9b7a0..ababa1bee 100644
--- a/libc/calls/struct/itimerval.internal.h
+++ b/libc/calls/struct/itimerval.internal.h
@@ -8,8 +8,8 @@ int sys_getitimer(int, struct itimerval *);
 int sys_setitimer(int, const struct itimerval *, struct itimerval *);
 int sys_setitimer_nt(int, const struct itimerval *, struct itimerval *);
 
-const char *_DescribeItimerval(char[90], int, const struct itimerval *);
-#define DescribeItimerval(rc, ts) _DescribeItimerval(alloca(90), rc, ts)
+const char *DescribeItimerval(char[90], int, const struct itimerval *);
+#define DescribeItimerval(rc, ts) DescribeItimerval(alloca(90), rc, ts)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_ITIMERVAL_INTERNAL_H_ */
diff --git a/libc/calls/struct/rlimit.internal.h b/libc/calls/struct/rlimit.internal.h
index 5818c6d5a..fa8ce9200 100644
--- a/libc/calls/struct/rlimit.internal.h
+++ b/libc/calls/struct/rlimit.internal.h
@@ -8,8 +8,8 @@ int sys_getrlimit(int, struct rlimit *);
 int sys_setrlimit(int, const struct rlimit *);
 int sys_setrlimit_nt(int, const struct rlimit *);
 
-const char *_DescribeRlimit(char[64], int, const struct rlimit *);
-#define DescribeRlimit(rc, rl) _DescribeRlimit(alloca(64), rc, rl)
+const char *DescribeRlimit(char[64], int, const struct rlimit *);
+#define DescribeRlimit(rc, rl) DescribeRlimit(alloca(64), rc, rl)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_RLIMIT_INTERNAL_H_ */
diff --git a/libc/calls/struct/sched_param.internal.h b/libc/calls/struct/sched_param.internal.h
index 565d661b3..9df42312e 100644
--- a/libc/calls/struct/sched_param.internal.h
+++ b/libc/calls/struct/sched_param.internal.h
@@ -4,8 +4,8 @@
 #include "libc/mem/alloca.h"
 COSMOPOLITAN_C_START_
 
-const char *_DescribeSchedParam(char[32], const struct sched_param *);
-#define DescribeSchedParam(x) _DescribeSchedParam(alloca(32), x)
+const char *DescribeSchedParam(char[32], const struct sched_param *);
+#define DescribeSchedParam(x) DescribeSchedParam(alloca(32), x)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SCHED_PARAM_INTERNAL_H_ */
diff --git a/libc/calls/struct/sigaction.internal.h b/libc/calls/struct/sigaction.internal.h
index 19b1177a5..bf69a81db 100644
--- a/libc/calls/struct/sigaction.internal.h
+++ b/libc/calls/struct/sigaction.internal.h
@@ -66,8 +66,8 @@ void __sigenter_netbsd(int, siginfo_t *, void *);
 void __sigenter_freebsd(int, siginfo_t *, void *);
 void __sigenter_openbsd(int, siginfo_t *, void *);
 
-const char *_DescribeSigaction(char[256], int, const struct sigaction *);
-#define DescribeSigaction(rc, sa) _DescribeSigaction(alloca(256), rc, sa)
+const char *DescribeSigaction(char[256], int, const struct sigaction *);
+#define DescribeSigaction(rc, sa) DescribeSigaction(alloca(256), rc, sa)
 
 void _init_onntconsoleevent(void);
 
diff --git a/libc/calls/struct/sigaltstack.internal.h b/libc/calls/struct/sigaltstack.internal.h
index b2416b560..c95eea696 100644
--- a/libc/calls/struct/sigaltstack.internal.h
+++ b/libc/calls/struct/sigaltstack.internal.h
@@ -4,8 +4,8 @@
 #include "libc/mem/alloca.h"
 COSMOPOLITAN_C_START_
 
-const char *_DescribeSigaltstack(char[128], int, const struct sigaltstack *);
-#define DescribeSigaltstack(rc, ss) _DescribeSigaltstack(alloca(128), rc, ss)
+const char *DescribeSigaltstack(char[128], int, const struct sigaltstack *);
+#define DescribeSigaltstack(rc, ss) DescribeSigaltstack(alloca(128), rc, ss)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SIGALTSTACK_INTERNAL_H_ */
diff --git a/libc/calls/struct/siginfo.internal.h b/libc/calls/struct/siginfo.internal.h
index d5479c464..99b2e4eda 100644
--- a/libc/calls/struct/siginfo.internal.h
+++ b/libc/calls/struct/siginfo.internal.h
@@ -6,8 +6,8 @@ COSMOPOLITAN_C_START_
 
 int sys_sigqueueinfo(int, const siginfo_t *);
 
-const char *_DescribeSiginfo(char[300], int, const siginfo_t *);
-#define DescribeSiginfo(x, y) _DescribeSiginfo(alloca(300), x, y)
+const char *DescribeSiginfo(char[300], int, const siginfo_t *);
+#define DescribeSiginfo(x, y) DescribeSiginfo(alloca(300), x, y)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SIGINFO_INTERNAL_H_ */
diff --git a/libc/calls/struct/sigset.h b/libc/calls/struct/sigset.h
index 09faa3118..3d783c47a 100644
--- a/libc/calls/struct/sigset.h
+++ b/libc/calls/struct/sigset.h
@@ -4,21 +4,19 @@ COSMOPOLITAN_C_START_
 
 typedef uint64_t sigset_t;
 
-/* clang-format off */
-int sigaddset(sigset_t *, int) libcesque paramsnonnull();
-int sigdelset(sigset_t *, int) libcesque paramsnonnull();
-int sigemptyset(sigset_t *) libcesque paramsnonnull();
-int sigfillset(sigset_t *) libcesque paramsnonnull();
-int sigandset(sigset_t *, const sigset_t *, const sigset_t *) libcesque paramsnonnull();
-int sigorset(sigset_t *, const sigset_t *, const sigset_t *) libcesque paramsnonnull();
-int sigisemptyset(const sigset_t *) libcesque paramsnonnull() nosideeffect;
-int sigismember(const sigset_t *, int) libcesque paramsnonnull() nosideeffect;
-int sigcountset(const sigset_t *) libcesque paramsnonnull() nosideeffect;
-int sigprocmask(int, const sigset_t *, sigset_t *) dontthrow;
-int sigsuspend(const sigset_t *) dontthrow;
-int sigpending(sigset_t *) libcesque;
-int pthread_sigmask(int, const sigset_t *, sigset_t *) dontthrow;
-/* clang-format on */
+int sigaddset(sigset_t *, int) paramsnonnull();
+int sigdelset(sigset_t *, int) paramsnonnull();
+int sigemptyset(sigset_t *) paramsnonnull();
+int sigfillset(sigset_t *) paramsnonnull();
+int sigandset(sigset_t *, const sigset_t *, const sigset_t *) paramsnonnull();
+int sigorset(sigset_t *, const sigset_t *, const sigset_t *) paramsnonnull();
+int sigisemptyset(const sigset_t *) paramsnonnull() nosideeffect;
+int sigismember(const sigset_t *, int) paramsnonnull() nosideeffect;
+int sigcountset(const sigset_t *) paramsnonnull() nosideeffect;
+int sigprocmask(int, const sigset_t *, sigset_t *);
+int sigsuspend(const sigset_t *);
+int sigpending(sigset_t *);
+int pthread_sigmask(int, const sigset_t *, sigset_t *);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SIGSET_H_ */
diff --git a/libc/calls/struct/sigset.internal.h b/libc/calls/struct/sigset.internal.h
index 77af35704..ad4fe0a78 100644
--- a/libc/calls/struct/sigset.internal.h
+++ b/libc/calls/struct/sigset.internal.h
@@ -5,15 +5,27 @@
 #include "libc/sysv/consts/sig.h"
 COSMOPOLITAN_C_START_
 
+#ifndef MODE_DBG
+/* block sigs because theoretical edge cases */
 #define BLOCK_SIGNALS  \
   do {                 \
     sigset_t _SigMask; \
   _SigMask = __sig_block()
-
 #define ALLOW_SIGNALS      \
   __sig_unblock(_SigMask); \
   }                        \
   while (0)
+#else
+/* doesn't block signals so we can get a crash
+   report, when a core runtime library crashes */
+#define BLOCK_SIGNALS  \
+  do {                 \
+    sigset_t _SigMask; \
+  sigprocmask(SIG_SETMASK, 0, &_SigMask)
+#define ALLOW_SIGNALS \
+  }                   \
+  while (0)
+#endif
 
 sigset_t __sig_block(void);
 void __sig_unblock(sigset_t);
@@ -22,8 +34,8 @@ int sys_sigprocmask(int, const sigset_t *, sigset_t *);
 int sys_sigsuspend(const uint64_t *, uint64_t);
 int sys_sigpending(uint64_t *, size_t);
 
-const char *_DescribeSigset(char[128], int, const sigset_t *);
-#define DescribeSigset(rc, ss) _DescribeSigset(alloca(128), rc, ss)
+const char *DescribeSigset(char[128], int, const sigset_t *);
+#define DescribeSigset(rc, ss) DescribeSigset(alloca(128), rc, ss)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SIGSET_INTERNAL_H_ */
diff --git a/libc/calls/struct/stat.internal.h b/libc/calls/struct/stat.internal.h
index eb9aa1ca6..5d1c9fed6 100644
--- a/libc/calls/struct/stat.internal.h
+++ b/libc/calls/struct/stat.internal.h
@@ -13,8 +13,8 @@ int sys_fstatat_nt(int, const char *, struct stat *, int);
 int sys_lstat_nt(const char *, struct stat *);
 int sys_fstat_metal(int, struct stat *);
 
-const char *_DescribeStat(char[300], int, const struct stat *);
-#define DescribeStat(rc, st) _DescribeStat(alloca(300), rc, st)
+const char *DescribeStat(char[300], int, const struct stat *);
+#define DescribeStat(rc, st) DescribeStat(alloca(300), rc, st)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_STAT_INTERNAL_H_ */
diff --git a/libc/calls/struct/statfs.internal.h b/libc/calls/struct/statfs.internal.h
index b98073dc4..ab3919628 100644
--- a/libc/calls/struct/statfs.internal.h
+++ b/libc/calls/struct/statfs.internal.h
@@ -12,8 +12,8 @@ int sys_fstatfs_nt(int64_t, struct statfs *);
 int sys_statfs_nt(const char *, struct statfs *);
 void statfs2statvfs(struct statvfs *, const struct statfs *);
 
-const char *_DescribeStatfs(char[300], int, const struct statfs *);
-#define DescribeStatfs(rc, sf) _DescribeStatfs(alloca(300), rc, sf)
+const char *DescribeStatfs(char[300], int, const struct statfs *);
+#define DescribeStatfs(rc, sf) DescribeStatfs(alloca(300), rc, sf)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_STATFS_INTERNAL_H_ */
diff --git a/libc/calls/struct/termios.internal.h b/libc/calls/struct/termios.internal.h
index ac85545dc..c116d4d04 100644
--- a/libc/calls/struct/termios.internal.h
+++ b/libc/calls/struct/termios.internal.h
@@ -4,9 +4,9 @@
 #include "libc/mem/alloca.h"
 COSMOPOLITAN_C_START_
 
-const char *_DescribeTermios(char[1024], ssize_t, const struct termios *);
+const char *DescribeTermios(char[1024], ssize_t, const struct termios *);
 
-#define DescribeTermios(rc, tio) _DescribeTermios(alloca(1024), rc, tio)
+#define DescribeTermios(rc, tio) DescribeTermios(alloca(1024), rc, tio)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_TERMIOS_INTERNAL_H_ */
diff --git a/libc/calls/struct/timespec.h b/libc/calls/struct/timespec.h
index 9a8c3e8db..7dbcb5b28 100644
--- a/libc/calls/struct/timespec.h
+++ b/libc/calls/struct/timespec.h
@@ -20,6 +20,7 @@ int timespec_get(struct timespec *, int) libcesque;
 
 #ifdef _COSMO_SOURCE
 int sys_clock_nanosleep(int, int, const struct timespec *, struct timespec *);
+int cosmo_clock_nanosleep(int, int, const struct timespec *, struct timespec *);
 #define timespec_zero ((struct timespec){0})
 #define timespec_max  ((struct timespec){0x7fffffffffffffff, 999999999})
 libcesque int timespec_cmp(struct timespec, struct timespec) pureconst;
@@ -33,8 +34,8 @@ libcesque struct timespec timespec_frommicros(int64_t) pureconst;
 libcesque struct timespec timespec_frommillis(int64_t) pureconst;
 libcesque struct timespec timespec_real(void) libcesque;
 libcesque struct timespec timespec_mono(void) libcesque;
-libcesque struct timespec timespec_sleep(int, struct timespec) libcesque;
-libcesque int timespec_sleep_until(int, struct timespec) libcesque;
+libcesque struct timespec timespec_sleep(struct timespec) libcesque;
+libcesque int timespec_sleep_until(struct timespec) libcesque;
 libcesque struct timespec timespec_sub(struct timespec,
                                        struct timespec) pureconst;
 libcesque struct timespec timespec_subz(struct timespec,
diff --git a/libc/calls/struct/timespec.internal.h b/libc/calls/struct/timespec.internal.h
index 4eaa08004..fc15a2061 100644
--- a/libc/calls/struct/timespec.internal.h
+++ b/libc/calls/struct/timespec.internal.h
@@ -25,10 +25,9 @@ int sys_sem_timedwait(int64_t, const struct timespec *);
 int sys_utimensat(int, const char *, const struct timespec[2], int);
 int sys_utimensat_nt(int, const char *, const struct timespec[2], int);
 int sys_utimensat_old(int, const char *, const struct timespec[2], int);
-struct timespec sys_clock_gettime_monotonic_nt(void);
 
-const char *_DescribeTimespec(char[45], int, const struct timespec *);
-#define DescribeTimespec(rc, ts) _DescribeTimespec(alloca(45), rc, ts)
+const char *DescribeTimespec(char[45], int, const struct timespec *);
+#define DescribeTimespec(rc, ts) DescribeTimespec(alloca(45), rc, ts)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_TIMESPEC_INTERNAL_H_ */
diff --git a/libc/calls/struct/timeval.internal.h b/libc/calls/struct/timeval.internal.h
index a3cf06847..ceaf8f73e 100644
--- a/libc/calls/struct/timeval.internal.h
+++ b/libc/calls/struct/timeval.internal.h
@@ -11,8 +11,8 @@ int sys_lutimes(const char *, const struct timeval *);
 int sys_utimes(const char *, const struct timeval *);
 int sys_utimes_nt(const char *, const struct timeval[2]);
 
-const char *_DescribeTimeval(char[45], int, const struct timeval *);
-#define DescribeTimeval(rc, ts) _DescribeTimeval(alloca(45), rc, ts)
+const char *DescribeTimeval(char[45], int, const struct timeval *);
+#define DescribeTimeval(rc, ts) DescribeTimeval(alloca(45), rc, ts)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_TIMEVAL_INTERNAL_H_ */
diff --git a/libc/calls/struct/ucontext.internal.h b/libc/calls/struct/ucontext.internal.h
index 18a271f10..9122af24a 100644
--- a/libc/calls/struct/ucontext.internal.h
+++ b/libc/calls/struct/ucontext.internal.h
@@ -1,14 +1,13 @@
 #ifndef COSMOPOLITAN_LIBC_CALLS_STRUCT_UCONTEXT_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_CALLS_STRUCT_UCONTEXT_INTERNAL_H_
 #include "libc/calls/ucontext.h"
+#include "libc/nt/struct/context.h"
 COSMOPOLITAN_C_START_
 
 #ifdef __x86_64__
 #define PC   rip
 #define SP   rsp
 #define BP   rbp
-#define RES0 rax
-#define RES1 rdx
 #define ARG0 rdi
 #define ARG1 rsi
 #define ARG2 rdx
@@ -19,8 +18,6 @@ COSMOPOLITAN_C_START_
 #define PC   pc
 #define SP   sp
 #define BP   regs[29]
-#define RES0 regs[0]
-#define RES1 regs[1]
 #define ARG0 regs[0]
 #define ARG1 regs[1]
 #define ARG2 regs[2]
@@ -31,5 +28,8 @@ COSMOPOLITAN_C_START_
 #error "unsupported architecture"
 #endif
 
+void _ntcontext2linux(struct ucontext *, const struct NtContext *);
+void _ntlinux2context(struct NtContext *, const ucontext_t *);
+
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_UCONTEXT_INTERNAL_H_ */
diff --git a/libc/calls/struct/user_regs_struct.h b/libc/calls/struct/user_regs_struct.h
new file mode 100644
index 000000000..5cbe63f5c
--- /dev/null
+++ b/libc/calls/struct/user_regs_struct.h
@@ -0,0 +1,71 @@
+#ifndef COSMOPOLITAN_LIBC_CALLS_STRUCT_USER_REGS_STRUCT_H_
+#define COSMOPOLITAN_LIBC_CALLS_STRUCT_USER_REGS_STRUCT_H_
+COSMOPOLITAN_C_START_
+
+/**
+ * Linux Kernel user registers.
+ *
+ * @note superset of struct pt_regs
+ * @see ptrace() w/ PTRACE_SYSCALL
+ */
+struct user_regs_struct {
+  uint64_t r15;
+  uint64_t r14;
+  uint64_t r13;
+  uint64_t r12;
+  uint64_t rbp;
+  uint64_t rbx;
+  uint64_t r11;
+  uint64_t r10;
+  uint64_t r9;
+  uint64_t r8;
+  uint64_t rax;
+  uint64_t rcx;
+  uint64_t rdx;
+  uint64_t rsi;
+  uint64_t rdi;
+  uint64_t orig_rax;
+  uint64_t rip;
+  uint64_t cs;
+  uint64_t eflags;
+  uint64_t rsp;
+  uint64_t ss;
+  uint64_t fs_base;
+  uint64_t gs_base;
+  uint64_t ds;
+  uint64_t es;
+  uint64_t fs;
+  uint64_t gs;
+};
+
+struct useregs_struct_freebsd {
+  int64_t r15;
+  int64_t r14;
+  int64_t r13;
+  int64_t r12;
+  int64_t r11;
+  int64_t r10;
+  int64_t r9;
+  int64_t r8;
+  int64_t rdi;
+  int64_t rsi;
+  int64_t rbp;
+  int64_t rbx;
+  int64_t rdx;
+  int64_t rcx;
+  int64_t rax;
+  uint32_t trapno;
+  uint16_t fs;
+  uint16_t gs;
+  uint32_t err;
+  uint16_t es;
+  uint16_t ds;
+  int64_t rip;
+  int64_t cs;
+  int64_t rflags;
+  int64_t rsp;
+  int64_t ss;
+};
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_USER_REGS_STRUCT_H_ */
diff --git a/libc/calls/struct/winsize.internal.h b/libc/calls/struct/winsize.internal.h
index 8c5e07fad..642b995d8 100644
--- a/libc/calls/struct/winsize.internal.h
+++ b/libc/calls/struct/winsize.internal.h
@@ -1,13 +1,13 @@
 #ifndef COSMOPOLITAN_LIBC_CALLS_STRUCT_WINSIZE_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_CALLS_STRUCT_WINSIZE_INTERNAL_H_
-#include "libc/calls/struct/winsize.h"
 #include "libc/intrin/fds.h"
+#include "libc/calls/struct/winsize.h"
 #include "libc/mem/alloca.h"
 COSMOPOLITAN_C_START_
 
 int tcgetwinsize_nt(int, struct winsize *);
-const char *_DescribeWinsize(char[64], int, const struct winsize *);
-#define DescribeWinsize(rc, ws) _DescribeWinsize(alloca(64), rc, ws)
+const char *DescribeWinsize(char[64], int, const struct winsize *);
+#define DescribeWinsize(rc, ws) DescribeWinsize(alloca(64), rc, ws)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_WINSIZE_INTERNAL_H_ */
diff --git a/libc/intrin/swapcontext.S b/libc/calls/swapcontext.S
similarity index 94%
rename from libc/intrin/swapcontext.S
rename to libc/calls/swapcontext.S
index b40b86777..6dd78947f 100644
--- a/libc/intrin/swapcontext.S
+++ b/libc/calls/swapcontext.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Saves machine to 𝑥 and activates 𝑦, i.e.
 //
@@ -31,17 +31,17 @@
 //	@returnstwice
 	.ftrace1
 swapcontext:
-	beg
 	.ftrace2
-#include "libc/intrin/getcontext.inc"
+#include "libc/calls/getcontext.inc"
 #ifdef __x86_64__
-	pro
-	cpush	%rsi
-	cpush	%rsi
+	push	%rbp
+	mov	%rsp,%rbp
+	push	%rsi
+	push	%rsi
 	call	__swapcontextsig
-	cpop	%rdi
-	cpop	%rdi
-	epi
+	pop	%rdi
+	pop	%rdi
+	pop	%rbp
 	test	%eax,%eax
 	jnz	1f
 #elif defined(__aarch64__)
@@ -56,5 +56,4 @@ swapcontext:
 #endif
 	jmp	__tailcontext
 1:	ret
-	end
 	.endfn	swapcontext,globl
diff --git a/libc/calls/syscall-nt.internal.h b/libc/calls/syscall-nt.internal.h
index dafbf18ea..70fa3b41d 100644
--- a/libc/calls/syscall-nt.internal.h
+++ b/libc/calls/syscall-nt.internal.h
@@ -2,11 +2,9 @@
 #define COSMOPOLITAN_LIBC_CALLS_SYSCALL_NT_INTERNAL_H_
 COSMOPOLITAN_C_START_
 
-extern int sys_getppid_nt_cosmo;
-extern int sys_getppid_nt_win32;
-
 bool32 sys_isatty(int);
 int sys_chdir_nt(const char *);
+int sys_close_epoll_nt(int);
 int sys_dup_nt(int, int, int, int);
 int sys_execve_nt(const char *, char *const[], char *const[]);
 int sys_faccessat_nt(int, const char *, int, uint32_t);
@@ -40,7 +38,6 @@ int sys_unlinkat_nt(int, const char *, int);
 int64_t sys_lseek_nt(int, int64_t, int);
 ssize_t sys_read_nt_impl(int, void *, size_t, int64_t);
 ssize_t sys_readlinkat_nt(int, const char *, char *, size_t);
-void sys_getppid_nt_wipe(int, int);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_SYSCALL_NT_INTERNAL_H_ */
diff --git a/libc/calls/syscall_support-nt.internal.h b/libc/calls/syscall_support-nt.internal.h
index 7b0ced2d3..a002ef9e3 100644
--- a/libc/calls/syscall_support-nt.internal.h
+++ b/libc/calls/syscall_support-nt.internal.h
@@ -4,8 +4,6 @@
 #include "libc/nt/struct/overlapped.h"
 COSMOPOLITAN_C_START_
 
-#define POLL_INTERVAL_MS 10
-
 bool isdirectory_nt(const char *);
 bool isregularfile_nt(const char *);
 bool issymlink_nt(const char *);
diff --git a/libc/calls/sysinfo.c b/libc/calls/sysinfo.c
index e67de5921..cf7ce29d3 100644
--- a/libc/calls/sysinfo.c
+++ b/libc/calls/sysinfo.c
@@ -23,7 +23,7 @@
 #include "libc/calls/struct/timeval.h"
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
 
diff --git a/libc/intrin/tailcontext.S b/libc/calls/tailcontext.S
similarity index 98%
rename from libc/intrin/tailcontext.S
rename to libc/calls/tailcontext.S
index 8ed1b17c9..a55163dce 100644
--- a/libc/intrin/tailcontext.S
+++ b/libc/calls/tailcontext.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	tailed called by setcontext() implementation
 __tailcontext:
@@ -57,7 +57,8 @@ __tailcontext:
 	mov	80(%rax),%rsp
 	push	88(%rax)
 	mov	24(%rax),%rdi
-	mov	64(%rax),%rax
+
+	xor	%eax,%eax
 	ret
 
 #elif defined(__aarch64__)
diff --git a/libc/calls/tcflush.c b/libc/calls/tcflush.c
index e9a171b20..3802897b6 100644
--- a/libc/calls/tcflush.c
+++ b/libc/calls/tcflush.c
@@ -17,6 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
@@ -24,7 +25,6 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/strace.h"
 #include "libc/mem/alloca.h"
 #include "libc/nt/comms.h"
@@ -52,7 +52,6 @@ static const char *DescribeFlush(char buf[12], int action) {
 }
 
 static dontinline textwindows int sys_tcflush_nt(int fd, int queue) {
-#ifdef __x86_64__
   if (!sys_isatty(fd)) {
     return -1;  // ebadf, enotty
   }
@@ -60,9 +59,6 @@ static dontinline textwindows int sys_tcflush_nt(int fd, int queue) {
     return 0;  // windows console output is never buffered
   }
   return FlushConsoleInputBytes();
-#else
-  return enosys();
-#endif
 }
 
 /**
diff --git a/libc/calls/tcgetattr-nt.c b/libc/calls/tcgetattr-nt.c
index 8e4c72945..00950d3c0 100644
--- a/libc/calls/tcgetattr-nt.c
+++ b/libc/calls/tcgetattr-nt.c
@@ -18,9 +18,9 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/termios.h"
 #include "libc/calls/syscall-nt.internal.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/nomultics.h"
 #include "libc/nt/console.h"
 #include "libc/nt/enum/consolemodeflags.h"
@@ -58,34 +58,32 @@ textwindows int tcgetattr_nt(int fd, struct termios *tio) {
   // kNtEnableLineInput and kNtEnableEchoInput only apply to programs
   // that call ReadFile() or ReadConsole(). since we do not use them,
   // the flags could serve the purpose of inter-process communication
-  if ((inmode & kNtEnableLineInput) || !(__ttyconf.magic & kTtyUncanon))
+  if ((inmode & kNtEnableLineInput) || !(__ttyconf.magic & kTtyUncanon)) {
     tio->c_lflag |= ICANON;
-
+  }
   // kNtEnableEchoInput only works with kNtEnableLineInput enabled.
-  if ((inmode & kNtEnableEchoInput) || !(__ttyconf.magic & kTtySilence))
+  if ((inmode & kNtEnableEchoInput) || !(__ttyconf.magic & kTtySilence)) {
     tio->c_lflag |= ECHO;
-
+  }
   // The Windows console itself always echos control codes as ASCII.
-  if (!(__ttyconf.magic & kTtyEchoRaw))
+  if ((inmode & kNtEnableEchoInput) || !(__ttyconf.magic & kTtyEchoRaw)) {
     tio->c_lflag |= ECHOCTL;
-
-  if (!(__ttyconf.magic & kTtyNoEchoe))
-    tio->c_lflag |= ECHOE;
-  if (!(__ttyconf.magic & kTtyNoEchok))
-    tio->c_lflag |= ECHOK;
-  if (!(__ttyconf.magic & kTtyNoEchoke))
-    tio->c_lflag |= ECHOKE;
-
-  if (!(__ttyconf.magic & kTtyNoCr2Nl))
+  }
+  if (!(__ttyconf.magic & kTtyNoCr2Nl)) {
     tio->c_iflag |= ICRNL;
-  if (!(__ttyconf.magic & kTtyNoIsigs))
+  }
+  if (!(__ttyconf.magic & kTtyNoIsigs)) {
     tio->c_lflag |= ISIG;
-  if (!(__ttyconf.magic & kTtyNoIexten))
+  }
+  if (inmode & kNtEnableProcessedInput) {
     tio->c_lflag |= IEXTEN;
-  if (outmode & kNtEnableProcessedOutput)
+  }
+  if (outmode & kNtEnableProcessedOutput) {
     tio->c_oflag |= OPOST;
-  if (!(outmode & kNtDisableNewlineAutoReturn))
+  }
+  if (!(outmode & kNtDisableNewlineAutoReturn)) {
     tio->c_oflag |= OPOST | ONLCR;
+  }
 
   return 0;
 }
diff --git a/libc/calls/tcsetattr-nt.c b/libc/calls/tcsetattr-nt.c
index 984ff48ec..4a5484ad3 100644
--- a/libc/calls/tcsetattr-nt.c
+++ b/libc/calls/tcsetattr-nt.c
@@ -18,10 +18,10 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/termios.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/ttydefaults.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/nomultics.h"
 #include "libc/nt/console.h"
 #include "libc/nt/enum/consolemodeflags.h"
@@ -61,50 +61,40 @@ textwindows int tcsetattr_nt(int fd, int opt, const struct termios *tio) {
     inmode &= ~kNtEnableQuickEditMode;
     __ttyconf.magic |= kTtyUncanon;
   }
-  if (!(tio->c_iflag & ICRNL))
+  if (!(tio->c_iflag & ICRNL)) {
     __ttyconf.magic |= kTtyNoCr2Nl;
-
-  if (!(tio->c_lflag & ECHOE))
-    __ttyconf.magic |= kTtyNoEchoe;
-  if (!(tio->c_lflag & ECHOK))
-    __ttyconf.magic |= kTtyNoEchok;
-  if (!(tio->c_lflag & ECHOKE))
-    __ttyconf.magic |= kTtyNoEchoke;
-  if (!(tio->c_lflag & ECHOCTL))
+  }
+  if (!(tio->c_lflag & ECHOCTL)) {
     __ttyconf.magic |= kTtyEchoRaw;
-
+  }
   if (tio->c_lflag & ECHO) {
     // "kNtEnableEchoInput can be used only if the
     //  kNtEnableLineInput mode is also enabled." -MSDN
-    if (tio->c_lflag & ICANON)
+    if (tio->c_lflag & ICANON) {
       inmode |= kNtEnableEchoInput;
+    }
   } else {
     __ttyconf.magic |= kTtySilence;
   }
-
-  if (!(tio->c_lflag & ISIG))
+  if (!(tio->c_lflag & ISIG)) {
     __ttyconf.magic |= kTtyNoIsigs;
-
-  // IEXTEN enables implementation-defined input processing. This flag,
-  // as well as ICANON must be enabled for the special characters EOL2,
-  // LNEXT, REPRINT, WERASE to be interpreted.
-  if (!(tio->c_lflag & IEXTEN))
-    __ttyconf.magic |= kTtyNoIexten;
-
+  }
   memcpy(__ttyconf.c_cc, tio->c_cc, NCCS);
-
-  if ((tio->c_lflag & ISIG) && __ttyconf.vintr == CTRL('C'))
+  if ((tio->c_lflag & ISIG) &&     //
+      !(tio->c_lflag & ICANON) &&  //
+      __ttyconf.vintr == CTRL('C')) {
     // allows ctrl-c to be delivered asynchronously via win32
     // we normally don't want win32 doing this 24/7 in the bg
     // because we don't have job control, tcsetpgrp, etc. yet
     // it's normally much better to let read-nt.c raise a sig
-    // because read-nt only manages your tty while it is used
+    // because read-nt only manages your tty whilst it's used
     inmode |= kNtEnableProcessedInput;
-
+  }
   outmode &= ~kNtDisableNewlineAutoReturn;
   outmode |= kNtEnableProcessedOutput;
-  if (!(tio->c_oflag & ONLCR))
+  if (!(tio->c_oflag & ONLCR)) {
     outmode |= kNtDisableNewlineAutoReturn;
+  }
   outmode |= kNtEnableVirtualTerminalProcessing;
 
   // tune the win32 configuration
diff --git a/libc/calls/time.c b/libc/calls/time.c
index f0455d2b5..d592bc256 100644
--- a/libc/calls/time.c
+++ b/libc/calls/time.c
@@ -16,9 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/sysv/consts/clock.h"
+#include "libc/time.h"
+#include "libc/calls/struct/timeval.h"
+#include "libc/dce.h"
+#include "libc/sysv/errfuns.h"
 
 /**
  * Returns time as seconds from UNIX epoch.
@@ -28,11 +29,15 @@
  * @asyncsignalsafe
  */
 int64_t time(int64_t *opt_out_ret) {
-  int64_t secs = -1;
-  struct timespec ts;
-  if (!clock_gettime(CLOCK_REALTIME, &ts))
-    secs = ts.tv_sec;
-  if (opt_out_ret)
-    *opt_out_ret = secs;
+  int64_t secs;
+  struct timeval tv;
+  if (gettimeofday(&tv, 0) != -1) {
+    secs = tv.tv_sec;
+    if (opt_out_ret) {
+      *opt_out_ret = secs;
+    }
+  } else {
+    secs = -1;
+  }
   return secs;
 }
diff --git a/libc/intrin/timespec_add.c b/libc/calls/timespec_add.c
similarity index 100%
rename from libc/intrin/timespec_add.c
rename to libc/calls/timespec_add.c
diff --git a/libc/intrin/timespec_cmp.c b/libc/calls/timespec_cmp.c
similarity index 100%
rename from libc/intrin/timespec_cmp.c
rename to libc/calls/timespec_cmp.c
diff --git a/libc/intrin/timespec_frommicros.c b/libc/calls/timespec_frommicros.c
similarity index 100%
rename from libc/intrin/timespec_frommicros.c
rename to libc/calls/timespec_frommicros.c
diff --git a/libc/intrin/timespec_frommillis.c b/libc/calls/timespec_frommillis.c
similarity index 100%
rename from libc/intrin/timespec_frommillis.c
rename to libc/calls/timespec_frommillis.c
diff --git a/libc/intrin/timespec_fromnanos.c b/libc/calls/timespec_fromnanos.c
similarity index 100%
rename from libc/intrin/timespec_fromnanos.c
rename to libc/calls/timespec_fromnanos.c
diff --git a/libc/calls/timespec_mono.c b/libc/calls/timespec_mono.c
index 4ca4fd2e7..044a3edfd 100644
--- a/libc/calls/timespec_mono.c
+++ b/libc/calls/timespec_mono.c
@@ -29,6 +29,6 @@
  */
 struct timespec timespec_mono(void) {
   struct timespec ts;
-  unassert(!clock_gettime(CLOCK_MONOTONIC, &ts));
+  npassert(!clock_gettime(CLOCK_MONOTONIC, &ts));
   return ts;
 }
diff --git a/libc/calls/timespec_real.c b/libc/calls/timespec_real.c
index 81f6ed84b..59a5c8a80 100644
--- a/libc/calls/timespec_real.c
+++ b/libc/calls/timespec_real.c
@@ -16,6 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/sysv/consts/clock.h"
 
@@ -30,6 +31,6 @@
  */
 struct timespec timespec_real(void) {
   struct timespec ts;
-  clock_gettime(CLOCK_REALTIME, &ts);
+  unassert(!clock_gettime(CLOCK_REALTIME, &ts));
   return ts;
 }
diff --git a/libc/calls/timespec_sleep.c b/libc/calls/timespec_sleep.c
index 41d56a3fb..30cb9c52f 100644
--- a/libc/calls/timespec_sleep.c
+++ b/libc/calls/timespec_sleep.c
@@ -34,16 +34,19 @@
  * @return unslept time which may be non-zero if the call was interrupted
  * @cancelationpoint
  */
-struct timespec timespec_sleep(int clock, struct timespec delay) {
+struct timespec timespec_sleep(struct timespec delay) {
   int cs = -1;
   errno_t err;
   struct timespec remain;
   remain = timespec_zero;
-  if (_pthread_self()->pt_flags & PT_MASKED)
+  if (_pthread_self()->pt_flags & PT_MASKED) {
     cs = _pthread_block_cancelation();
-  if ((err = clock_nanosleep(clock, 0, &delay, &remain)))
+  }
+  if ((err = clock_nanosleep(CLOCK_REALTIME, 0, &delay, &remain))) {
     unassert(err == EINTR);
-  if (cs != -1)
+  }
+  if (cs != -1) {
     _pthread_allow_cancelation(cs);
+  }
   return remain;
 }
diff --git a/libc/calls/timespec_sleep_until.c b/libc/calls/timespec_sleep_until.c
index 5749d39f8..867268b96 100644
--- a/libc/calls/timespec_sleep_until.c
+++ b/libc/calls/timespec_sleep_until.c
@@ -30,9 +30,9 @@
  * @raise EINTR if signal was delivered
  * @cancelationpoint
  */
-errno_t timespec_sleep_until(int clock, struct timespec abs_deadline) {
+errno_t timespec_sleep_until(struct timespec abs_deadline) {
   errno_t rc;
-  rc = clock_nanosleep(clock, TIMER_ABSTIME, &abs_deadline, 0);
+  rc = clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &abs_deadline, 0);
   unassert(!rc || rc == EINTR || rc == ECANCELED);
   return rc;
 }
diff --git a/libc/intrin/timespec_sub.c b/libc/calls/timespec_sub.c
similarity index 100%
rename from libc/intrin/timespec_sub.c
rename to libc/calls/timespec_sub.c
diff --git a/libc/intrin/timespec_subz.c b/libc/calls/timespec_subz.c
similarity index 100%
rename from libc/intrin/timespec_subz.c
rename to libc/calls/timespec_subz.c
diff --git a/libc/intrin/timespec_tomicros.c b/libc/calls/timespec_tomicros.c
similarity index 100%
rename from libc/intrin/timespec_tomicros.c
rename to libc/calls/timespec_tomicros.c
diff --git a/libc/intrin/timespec_tomillis.c b/libc/calls/timespec_tomillis.c
similarity index 100%
rename from libc/intrin/timespec_tomillis.c
rename to libc/calls/timespec_tomillis.c
diff --git a/libc/intrin/timespec_tonanos.c b/libc/calls/timespec_tonanos.c
similarity index 100%
rename from libc/intrin/timespec_tonanos.c
rename to libc/calls/timespec_tonanos.c
diff --git a/libc/intrin/timespec_totimeval.c b/libc/calls/timespec_totimeval.c
similarity index 100%
rename from libc/intrin/timespec_totimeval.c
rename to libc/calls/timespec_totimeval.c
diff --git a/libc/intrin/timeval_add.c b/libc/calls/timeval_add.c
similarity index 100%
rename from libc/intrin/timeval_add.c
rename to libc/calls/timeval_add.c
diff --git a/libc/intrin/timeval_cmp.c b/libc/calls/timeval_cmp.c
similarity index 100%
rename from libc/intrin/timeval_cmp.c
rename to libc/calls/timeval_cmp.c
diff --git a/libc/intrin/timeval_frommicros.c b/libc/calls/timeval_frommicros.c
similarity index 100%
rename from libc/intrin/timeval_frommicros.c
rename to libc/calls/timeval_frommicros.c
diff --git a/libc/intrin/timeval_frommillis.c b/libc/calls/timeval_frommillis.c
similarity index 100%
rename from libc/intrin/timeval_frommillis.c
rename to libc/calls/timeval_frommillis.c
diff --git a/libc/intrin/timeval_sub.c b/libc/calls/timeval_sub.c
similarity index 100%
rename from libc/intrin/timeval_sub.c
rename to libc/calls/timeval_sub.c
diff --git a/libc/intrin/timeval_subz.c b/libc/calls/timeval_subz.c
similarity index 100%
rename from libc/intrin/timeval_subz.c
rename to libc/calls/timeval_subz.c
diff --git a/libc/intrin/timeval_tomicros.c b/libc/calls/timeval_tomicros.c
similarity index 100%
rename from libc/intrin/timeval_tomicros.c
rename to libc/calls/timeval_tomicros.c
diff --git a/libc/intrin/timeval_tomillis.c b/libc/calls/timeval_tomillis.c
similarity index 100%
rename from libc/intrin/timeval_tomillis.c
rename to libc/calls/timeval_tomillis.c
diff --git a/libc/intrin/timeval_toseconds.c b/libc/calls/timeval_toseconds.c
similarity index 100%
rename from libc/intrin/timeval_toseconds.c
rename to libc/calls/timeval_toseconds.c
diff --git a/libc/calls/tinyprint.c b/libc/calls/tinyprint.c
index 2411d9b20..55aec26ef 100644
--- a/libc/calls/tinyprint.c
+++ b/libc/calls/tinyprint.c
@@ -62,7 +62,6 @@ ssize_t tinyprint(int fd, const char *s, ...) {
       buf[n++] = c;
       if (n == sizeof(buf)) {
         if (tinyflush(fd, buf, n, &toto)) {
-          va_end(va);
           return toto;
         }
         n = 0;
diff --git a/libc/calls/tmpdir.c b/libc/calls/tmpdir.c
index 946bf2bbf..552467167 100644
--- a/libc/calls/tmpdir.c
+++ b/libc/calls/tmpdir.c
@@ -21,7 +21,7 @@
 #include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/systeminfo.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
diff --git a/libc/intrin/ucontext.c b/libc/calls/ucontext.c
similarity index 97%
rename from libc/intrin/ucontext.c
rename to libc/calls/ucontext.c
index f7472c127..d5ba75a94 100644
--- a/libc/intrin/ucontext.c
+++ b/libc/calls/ucontext.c
@@ -23,7 +23,7 @@
 #include "libc/sysv/consts/sig.h"
 #include "libc/thread/tls.h"
 
-int __tailcontext(const ucontext_t *) wontreturn;
+int __tailcontext(const ucontext_t *);
 
 /**
  * Sets machine context.
@@ -40,7 +40,7 @@ int setcontext(const ucontext_t *uc) {
   } else {
     sys_sigprocmask(SIG_SETMASK, &uc->uc_sigmask, 0);
   }
-  __tailcontext(uc);
+  return __tailcontext(uc);
 }
 
 int __getcontextsig(ucontext_t *uc) {
diff --git a/libc/calls/ucontext.h b/libc/calls/ucontext.h
index b17527c93..b820686bc 100644
--- a/libc/calls/ucontext.h
+++ b/libc/calls/ucontext.h
@@ -168,8 +168,8 @@ struct ucontext {
 
 typedef struct ucontext ucontext_t;
 
-int getcontext(ucontext_t *) dontthrow __read_write(1);
-int setcontext(const ucontext_t *) dontthrow __read_only(1);
+int getcontext(ucontext_t *) dontthrow;
+int setcontext(const ucontext_t *) dontthrow;
 int swapcontext(ucontext_t *, const ucontext_t *) dontthrow returnstwice;
 void makecontext(ucontext_t *, void *, int, ...) dontthrow dontcallback;
 void __sig_restore(const ucontext_t *) wontreturn;
diff --git a/libc/calls/uname.c b/libc/calls/uname.c
index 0f3f0e2d4..8569c7839 100644
--- a/libc/calls/uname.c
+++ b/libc/calls/uname.c
@@ -27,7 +27,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/strace.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/computernameformat.h"
 #include "libc/nt/systeminfo.h"
 #include "libc/runtime/runtime.h"
diff --git a/libc/calls/unlinkat-nt.c b/libc/calls/unlinkat-nt.c
index a0209ed1c..536694bd3 100644
--- a/libc/calls/unlinkat-nt.c
+++ b/libc/calls/unlinkat-nt.c
@@ -57,14 +57,15 @@ static textwindows bool IsDirectorySymlink(const char16_t *path) {
 static textwindows int sys_rmdir_nt(const char16_t *path) {
   int ms;
   for (ms = 1;; ms *= 2) {
-    if (RemoveDirectory(path))
+    if (RemoveDirectory(path)) {
       return 0;
+    }
     // Files can linger, for absolutely no reason.
     // Possibly some Windows Defender bug on Win7.
     // Sleep for up to one second w/ expo backoff.
     // Alternative is use Microsoft internal APIs.
     // Never could have imagined it'd be this bad.
-    if (GetLastError() == kNtErrorDirNotEmpty && ms <= 1024) {
+    if (GetLastError() == kNtErrorDirNotEmpty && ms <= 2048) {
       Sleep(ms);
       continue;
     } else {
diff --git a/libc/calls/unlinkat.c b/libc/calls/unlinkat.c
index b23830b59..33bd2f572 100644
--- a/libc/calls/unlinkat.c
+++ b/libc/calls/unlinkat.c
@@ -39,13 +39,12 @@
  * @param path is the thing to delete
  * @param flags can have AT_REMOVEDIR
  * @return 0 on success, or -1 w/ errno
- * @raise EROFS if either path is under /zip/...
  */
 int unlinkat(int dirfd, const char *path, int flags) {
   int rc;
 
   if (_weaken(__zipos_notat) && (rc = __zipos_notat(dirfd, path)) == -1) {
-    rc = erofs();
+    STRACE("zipos unlinkat not supported yet");
   } else if (!IsWindows()) {
     rc = sys_unlinkat(dirfd, path, flags);
   } else {
@@ -54,13 +53,12 @@ int unlinkat(int dirfd, const char *path, int flags) {
 
   // POSIX.1 says unlink(directory) raises EPERM but on Linux
   // it always raises EISDIR, which is so much less ambiguous
-  int e = errno;
-  if (!IsLinux() && rc == -1 && !flags && (e == EPERM || e == EACCES)) {
+  if (!IsLinux() && rc == -1 && !flags && errno == EPERM) {
     struct stat st;
     if (!fstatat(dirfd, path, &st, 0) && S_ISDIR(st.st_mode)) {
       errno = EISDIR;
     } else {
-      errno = e;
+      errno = EPERM;
     }
   }
 
diff --git a/libc/calls/unveil.c b/libc/calls/unveil.c
index 971c7b2b0..8112fc721 100644
--- a/libc/calls/unveil.c
+++ b/libc/calls/unveil.c
@@ -33,7 +33,7 @@
 #include "libc/fmt/libgen.h"
 #include "libc/intrin/strace.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/vendor.internal.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
@@ -405,15 +405,6 @@ int sys_unveil_linux(const char *path, const char *permissions) {
  *     - `c` allows `path` to be created and removed, corresponding to
  *       the pledge promise "cpath".
  *
- * If having unveil() security is mission critical, then add this code
- * to the start of your main() function to ensure your program fails
- * with an error if it isn't available.
- *
- *     if (unveil("", 0) >= 0) {
- *       fprintf(stderr, "error: OS doesn't support unveil() security\n");
- *       exit(1);
- *     }
- *
  * @return 0 on success, or -1 w/ errno; note: if `unveil("",0)` is used
  *     to perform a feature check, then on Linux a value greater than 0
  *     shall be returned which is the supported Landlock ABI version
diff --git a/libc/calls/usleep.c b/libc/calls/usleep.c
index b137bfdd1..82dd7b55f 100644
--- a/libc/calls/usleep.c
+++ b/libc/calls/usleep.c
@@ -34,14 +34,10 @@
  * @norestart
  */
 int usleep(uint64_t micros) {
-  // All OSes except OpenBSD return instantly on usleep(0). So we might
-  // as well avoid system call overhead and helping OpenBSD work better
-  if (micros) {
-    errno_t err;
-    struct timespec ts = timespec_frommicros(micros);
-    err = clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, 0);
-    if (err)
-      return errno = err, -1;
-  }
+  errno_t err;
+  struct timespec ts = timespec_frommicros(micros);
+  err = clock_nanosleep(CLOCK_REALTIME, 0, &ts, 0);
+  if (err)
+    return errno = err, -1;
   return 0;
 }
diff --git a/libc/intrin/vdsofunc.c b/libc/calls/vdsofunc.greg.c
similarity index 100%
rename from libc/intrin/vdsofunc.c
rename to libc/calls/vdsofunc.greg.c
diff --git a/libc/calls/winexec.c b/libc/calls/winexec.c
index 429589c10..cdb41dd72 100644
--- a/libc/calls/winexec.c
+++ b/libc/calls/winexec.c
@@ -24,7 +24,7 @@
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/overlapped.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "third_party/linenoise/linenoise.h"
 
 #define Read32(s) (s[3] << 24 | s[2] << 16 | s[1] << 8 | s[0])
@@ -80,8 +80,7 @@ textwindows int IsWindowsExecutable(int64_t handle, const char16_t *path) {
   uint32_t got;
   BLOCK_SIGNALS;
   struct NtOverlapped overlap = {.hEvent = CreateEvent(0, 0, 0, 0)};
-  ok = overlap.hEvent &&
-       (ReadFile(handle, buf, 2, 0, &overlap) ||
+  ok = (ReadFile(handle, buf, 2, 0, &overlap) ||
         GetLastError() == kNtErrorIoPending) &&
        GetOverlappedResult(handle, &overlap, &got, true);
   CloseHandle(overlap.hEvent);
diff --git a/libc/calls/write-nt.c b/libc/calls/write-nt.c
index cbd721483..3e5eb0163 100644
--- a/libc/calls/write-nt.c
+++ b/libc/calls/write-nt.c
@@ -18,6 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/sigset.internal.h"
@@ -25,7 +26,6 @@
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/nomultics.h"
 #include "libc/intrin/weaken.h"
 #include "libc/nt/console.h"
@@ -53,17 +53,20 @@ static textwindows ssize_t sys_write_nt_impl(int fd, void *data, size_t size,
   bool isconsole = f->kind == kFdConsole;
 
   // not implemented, XNU returns eperm();
-  if (f->kind == kFdDevRandom)
+  if (f->kind == kFdDevRandom) {
     return eperm();
+  }
 
   // determine win32 handle for writing
   int64_t handle = f->handle;
-  if (isconsole && _weaken(GetConsoleOutputHandle))
+  if (isconsole && _weaken(GetConsoleOutputHandle)) {
     handle = _weaken(GetConsoleOutputHandle)();
+  }
 
   // intercept ansi tty configuration sequences
-  if (isconsole && _weaken(GetConsoleOutputHandle))
+  if (isconsole && _weaken(GetConsoleOutputHandle)) {
     _weaken(InterceptTerminalCommands)(data, size);
+  }
 
   // perform heavy lifting
   ssize_t rc;
diff --git a/libc/calls/writev.c b/libc/calls/writev.c
index 5e8899e14..7c476d3e5 100644
--- a/libc/calls/writev.c
+++ b/libc/calls/writev.c
@@ -57,7 +57,6 @@ static ssize_t writev_impl(int fd, const struct iovec *iov, int iovlen) {
       struct iovec *iov2;
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
       iov2 = alloca(iovlen * sizeof(struct iovec));
       CheckLargeStackAllocation(iov2, iovlen * sizeof(struct iovec));
 #pragma GCC pop_options
diff --git a/libc/cosmo.h b/libc/cosmo.h
index e91621e48..af2dc289c 100644
--- a/libc/cosmo.h
+++ b/libc/cosmo.h
@@ -1,6 +1,5 @@
 #ifndef COSMOPOLITAN_LIBC_COSMO_H_
 #define COSMOPOLITAN_LIBC_COSMO_H_
-#include "libc/calls/struct/timespec.h"
 COSMOPOLITAN_C_START_
 
 #ifndef __cplusplus
@@ -9,33 +8,12 @@ COSMOPOLITAN_C_START_
 #define _COSMO_ATOMIC(x) x
 #endif
 
-errno_t cosmo_once(_COSMO_ATOMIC(unsigned) *, void (*)(void));
+errno_t cosmo_once(_COSMO_ATOMIC(unsigned) *, void (*)(void)) libcesque;
 int systemvpe(const char *, char *const[], char *const[]) libcesque;
 char *GetProgramExecutableName(void) libcesque;
 void unleaf(void) libcesque;
-bool32 IsLinuxModern(void) libcesque;
-
 int __demangle(char *, const char *, size_t) libcesque;
 int __is_mangled(const char *) libcesque;
 
-int cosmo_args(const char *, char ***) libcesque;
-int LoadZipArgs(int *, char ***) libcesque;
-
-int cosmo_futex_wake(_COSMO_ATOMIC(int) *, int, char);
-int cosmo_futex_wait(_COSMO_ATOMIC(int) *, int, char, int,
-                     const struct timespec *);
-
-errno_t cosmo_stack_alloc(size_t *, size_t *, void **) libcesque;
-errno_t cosmo_stack_free(void *, size_t, size_t) libcesque;
-void cosmo_stack_clear(void) libcesque;
-void cosmo_stack_setmaxstacks(int) libcesque;
-int cosmo_stack_getmaxstacks(void) libcesque;
-
-int __deadlock_check(void *, int) libcesque;
-int __deadlock_tracked(void *) libcesque;
-void __deadlock_record(void *, int) libcesque;
-void __deadlock_track(void *, int) libcesque;
-void __deadlock_untrack(void *) libcesque;
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_COSMO_H_ */
diff --git a/libc/crt/crt.S b/libc/crt/crt.S
index 74226c641..b5ba61a59 100644
--- a/libc/crt/crt.S
+++ b/libc/crt/crt.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .section .start,"ax",@progbits
 
 #if SupportsXnu() && defined(__x86_64__)
@@ -47,14 +47,7 @@ __oops_win32:
 //	@note	ape.S and ape-loader both set RCX to XNU on Darwin
 //	@noreturn
 _start:
-	.cfi_startproc
-#if defined(__x86_64__)
-	.cfi_undefined rip
-#elif defined(__aarch64__)
-	.cfi_undefined x30
-#endif /* __x86_64__ */
-
-#if defined(__x86_64__)
+#ifdef __x86_64__
 
 #if SupportsFreebsd()
 //	detect free besiyata dishmaya
@@ -166,5 +159,4 @@ _start:
 #else
 #error "architecture unsupported"
 #endif /* __x86_64__ */
-	.cfi_endproc
 	.endfn	_start,weak,hidden
diff --git a/libc/dlopen/dlopen.c b/libc/dlopen/dlopen.c
index 2a47aa99e..99c648776 100644
--- a/libc/dlopen/dlopen.c
+++ b/libc/dlopen/dlopen.c
@@ -57,7 +57,6 @@
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/temp.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 
@@ -132,23 +131,24 @@ struct {
 
 long __sysv2nt14();
 long foreign_tramp();
-void __dlopen_lock(void);
-void __dlopen_unlock(void);
 
 static _Thread_local char dlerror_buf[128];
 
 static const char *get_tmp_dir(void) {
   const char *tmpdir;
-  if (!(tmpdir = getenv("TMPDIR")) || !*tmpdir)
-    if (!(tmpdir = getenv("HOME")) || !*tmpdir)
+  if (!(tmpdir = getenv("TMPDIR")) || !*tmpdir) {
+    if (!(tmpdir = getenv("HOME")) || !*tmpdir) {
       tmpdir = ".";
+    }
+  }
   return tmpdir;
 }
 
 static int is_file_newer_than(const char *path, const char *other) {
   struct stat st1, st2;
-  if (stat(path, &st1))
+  if (stat(path, &st1)) {
     return -1;
+  }
   if (stat(other, &st2)) {
     if (errno == ENOENT) {
       return 2;
@@ -191,24 +191,29 @@ static char *elf_map(int fd, Elf64_Ehdr *ehdr, Elf64_Phdr *phdr, long pagesz,
   Elf64_Addr maxva = 0;
   Elf64_Addr minva = -1;
   for (Elf64_Phdr *p = phdr; p < phdr + ehdr->e_phnum; p++) {
-    if (p->p_type != PT_LOAD)
+    if (p->p_type != PT_LOAD) {
       continue;
-    if (p->p_vaddr < minva)
+    }
+    if (p->p_vaddr < minva) {
       minva = p->p_vaddr & -pagesz;
-    if (p->p_vaddr + p->p_memsz > maxva)
+    }
+    if (p->p_vaddr + p->p_memsz > maxva) {
       maxva = p->p_vaddr + p->p_memsz;
+    }
   }
   uint8_t *base =
       __sys_mmap(0, maxva - minva, PROT_NONE,
                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0, 0);
-  if (base == MAP_FAILED)
+  if (base == MAP_FAILED) {
     return MAP_FAILED;
+  }
   for (Elf64_Phdr *p = phdr; p < phdr + ehdr->e_phnum; p++) {
     if (p->p_type != PT_LOAD) {
       if (p->p_type == PT_INTERP && interp_size &&
           (p->p_filesz >= interp_size - 1 ||
-           pread(fd, interp_path, p->p_filesz, p->p_offset) != p->p_filesz))
+           pread(fd, interp_path, p->p_filesz, p->p_offset) != p->p_filesz)) {
         return MAP_FAILED;
+      }
       continue;
     }
     Elf64_Addr skew = p->p_vaddr & (pagesz - 1);
@@ -223,24 +228,29 @@ static char *elf_map(int fd, Elf64_Ehdr *ehdr, Elf64_Phdr *phdr, long pagesz,
       prot1 &= ~PROT_EXEC;
     }
     if (__sys_mmap(base + p->p_vaddr - skew, skew + p->p_filesz, prot1,
-                   MAP_FIXED | MAP_PRIVATE, fd, off, off) == MAP_FAILED)
+                   MAP_FIXED | MAP_PRIVATE, fd, off, off) == MAP_FAILED) {
       return MAP_FAILED;
-    if (b > a)
+    }
+    if (b > a) {
       bzero(base + a, b - a);
+    }
     if (c > b && __sys_mmap(base + b, c - b, prot2,
                             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0,
-                            0) == MAP_FAILED)
+                            0) == MAP_FAILED) {
       return MAP_FAILED;
+    }
     if (prot1 != prot2 &&
-        sys_mprotect(base + p->p_vaddr - skew, skew + p->p_filesz, prot2))
+        sys_mprotect(base + p->p_vaddr - skew, skew + p->p_filesz, prot2)) {
       return MAP_FAILED;
+    }
   }
   return (void *)base;
 }
 
 static bool elf_slurp(struct Loaded *l, int fd, const char *file) {
-  if (pread(fd, &l->eh, 64, 0) != 64)
+  if (pread(fd, &l->eh, 64, 0) != 64) {
     return false;
+  }
   if (!IsElf64Binary(&l->eh, 64) ||                      //
       l->eh.e_phnum > sizeof(l->ph) / sizeof(*l->ph) ||  //
       l->eh.e_machine != get_host_elf_machine()) {
@@ -248,17 +258,19 @@ static bool elf_slurp(struct Loaded *l, int fd, const char *file) {
     return false;
   }
   int bytes = l->eh.e_phnum * sizeof(l->ph[0]);
-  if (pread(fd, l->ph, bytes, l->eh.e_phoff) != bytes)
+  if (pread(fd, l->ph, bytes, l->eh.e_phoff) != bytes) {
     return false;
+  }
   l->entry = (char *)l->eh.e_entry;
   return true;
 }
 
-dontinline static bool elf_load(struct Loaded *l, const char *file, long pagesz,
+static dontinline bool elf_load(struct Loaded *l, const char *file, long pagesz,
                                 char *interp_path, size_t interp_size) {
   int fd;
-  if ((fd = open(file, O_RDONLY | O_CLOEXEC)) == -1)
+  if ((fd = open(file, O_RDONLY | O_CLOEXEC)) == -1) {
     return false;
+  }
   if (!elf_slurp(l, fd, file)) {
     close(fd);
     return false;
@@ -280,7 +292,7 @@ static long *push_strs(long *sp, char **list, int count) {
   return sp;
 }
 
-wontreturn dontinstrument static void foreign_helper(void **p) {
+static wontreturn dontinstrument void foreign_helper(void **p) {
   __foreign.dlopen = p[0];
   __foreign.dlsym = p[1];
   __foreign.dlclose = p[2];
@@ -288,7 +300,7 @@ wontreturn dontinstrument static void foreign_helper(void **p) {
   _longjmp(__foreign.jb, 1);
 }
 
-dontinline static void elf_exec(const char *file, char **envp) {
+static dontinline void elf_exec(const char *file, char **envp) {
 
   // get microprocessor page size
   long pagesz = __pagesize;
@@ -296,13 +308,15 @@ dontinline static void elf_exec(const char *file, char **envp) {
   // load helper executable into address space
   struct Loaded prog;
   char interp_path[256] = {0};
-  if (!elf_load(&prog, file, pagesz, interp_path, sizeof(interp_path)))
+  if (!elf_load(&prog, file, pagesz, interp_path, sizeof(interp_path))) {
     return;
+  }
 
   // load platform c library into address space
   struct Loaded interp;
-  if (!elf_load(&interp, interp_path, pagesz, 0, 0))
+  if (!elf_load(&interp, interp_path, pagesz, 0, 0)) {
     return;
+  }
 
   // count environment variables
   int envc = 0;
@@ -412,14 +426,15 @@ static char *dlerror_set(const char *str) {
   return dlerror_buf;
 }
 
-dontinline static char *foreign_alloc_block(void) {
+static dontinline char *foreign_alloc_block(void) {
   char *p = 0;
   size_t sz = 65536;
   if (!IsWindows()) {
     p = __sys_mmap(0, sz, PROT_READ | PROT_WRITE | PROT_EXEC,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0, 0);
-    if (p == MAP_FAILED)
+    if (p == MAP_FAILED) {
       p = 0;
+    }
   } else {
     uintptr_t h;
     if ((h = CreateFileMapping(-1, 0, kNtPageExecuteReadwrite, 0, sz, 0))) {
@@ -435,16 +450,19 @@ dontinline static char *foreign_alloc_block(void) {
   return p;
 }
 
-dontinline static void *foreign_alloc(size_t n) {
+static dontinline void *foreign_alloc(size_t n) {
   void *res;
   static char *block;
-  __dlopen_lock();
-  if (!block || READ32LE(block) + n > 65536)
-    if (!(block = foreign_alloc_block()))
+  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+  pthread_mutex_lock(&lock);
+  if (!block || READ32LE(block) + n > 65536) {
+    if (!(block = foreign_alloc_block())) {
       return 0;
+    }
+  }
   res = block + READ32LE(block);
   WRITE32LE(block, READ32LE(block) + n);
-  __dlopen_unlock();
+  pthread_mutex_unlock(&lock);
   return res;
 }
 
@@ -452,8 +470,9 @@ static uint8_t *movimm(uint8_t p[static 16], int reg, uint64_t val) {
 #ifdef __x86_64__
   int rex;
   rex = AMD_REXW;
-  if (reg & 8)
+  if (reg & 8) {
     rex |= AMD_REXB;
+  }
   *p++ = rex;
   *p++ = AMD_MOV_IMM | (reg & 7);
   p = WRITE64LE(p, val);
@@ -538,9 +557,7 @@ static void *foreign_thunk_nt(void *func) {
   // movabs $tramp,%r10
   code[14] = 0x49;
   code[15] = 0xba;
-#ifdef __x86_64__
   WRITE64LE(code + 16, (uintptr_t)__sysv2nt14);
-#endif
   // jmp *%r10
   code[24] = 0x41;
   code[25] = 0xff;
@@ -548,13 +565,14 @@ static void *foreign_thunk_nt(void *func) {
   return code;
 }
 
-dontinline static bool foreign_compile(char exe[hasatleast PATH_MAX]) {
+static dontinline bool foreign_compile(char exe[hasatleast PATH_MAX]) {
 
   // construct path
   strlcpy(exe, get_tmp_dir(), PATH_MAX);
   strlcat(exe, "/.cosmo/", PATH_MAX);
-  if (mkdir(exe, 0755) && errno != EEXIST)
+  if (mkdir(exe, 0755) && errno != EEXIST) {
     return false;
+  }
   strlcat(exe, "dlopen-helper", PATH_MAX);
 
   // skip build if helper exists and this program is older
@@ -585,8 +603,9 @@ dontinline static bool foreign_compile(char exe[hasatleast PATH_MAX]) {
       ssize_t got = pread(fd, sauce, sizeof(HELPER), 0);
       close(fd);
       if (got == sizeof(HELPER) - 1 &&
-          !memcmp(sauce, HELPER, sizeof(HELPER) - 1))
+          !memcmp(sauce, HELPER, sizeof(HELPER) - 1)) {
         return true;
+      }
     }
   }
 
@@ -594,8 +613,9 @@ dontinline static bool foreign_compile(char exe[hasatleast PATH_MAX]) {
   char tmp[PATH_MAX];
   strlcpy(tmp, src, PATH_MAX);
   strlcat(tmp, ".XXXXXX", PATH_MAX);
-  if ((fd = mkostemp(tmp, O_CLOEXEC)) == -1)
+  if ((fd = mkostemp(tmp, O_CLOEXEC)) == -1) {
     return false;
+  }
   if (write(fd, HELPER, sizeof(HELPER) - 1) != sizeof(HELPER) - 1) {
     close(fd);
     unlink(tmp);
@@ -613,8 +633,9 @@ dontinline static bool foreign_compile(char exe[hasatleast PATH_MAX]) {
   // create executable
   strlcpy(tmp, exe, PATH_MAX);
   strlcat(tmp, ".XXXXXX", PATH_MAX);
-  if ((fd = mkostemp(tmp, O_CLOEXEC)) == -1)
+  if ((fd = mkostemp(tmp, O_CLOEXEC)) == -1) {
     return false;
+  }
   int pid, ws;
   char *args[] = {
       "cc",
@@ -633,11 +654,11 @@ dontinline static bool foreign_compile(char exe[hasatleast PATH_MAX]) {
     errno = err;
     return false;
   }
-  if (waitpid(pid, &ws, 0) == -1) {
-    // signals and cancelation are blocked
-    // therefore this must be a real error
-    unlink(tmp);
-    return false;
+  while (waitpid(pid, &ws, 0) == -1) {
+    if (errno != EINTR) {
+      unlink(tmp);
+      return false;
+    }
   }
   if (ws) {
     unlink(tmp);
@@ -683,8 +704,9 @@ static void foreign_once(void) {
 static bool foreign_init(void) {
   bool res;
   cosmo_once(&__foreign.once, foreign_once);
-  if (!(res = __foreign.is_supported))
+  if (!(res = __foreign.is_supported)) {
     dlerror_set("dlopen() isn't supported on this platform");
+  }
   return res;
 }
 
@@ -720,8 +742,9 @@ static void *dlopen_nt(const char *path, int mode) {
     path16[n + 0] = 'l';
     path16[n + 1] = 0;
   }
-  if (!(handle = LoadLibrary(path16)))
+  if (!(handle = LoadLibrary(path16))) {
     dlerror_set("library not found");
+  }
   return (void *)handle;
 }
 
@@ -740,14 +763,18 @@ static void *dlopen_silicon(const char *path, int mode) {
   int n;
   int xnu_mode = 0;
   char path2[PATH_MAX + 5];
-  if (mode & ~(RTLD_LOCAL | RTLD_LAZY | RTLD_NOW | RTLD_GLOBAL))
+  if (mode & ~(RTLD_LOCAL | RTLD_LAZY | RTLD_NOW | RTLD_GLOBAL)) {
     xnu_mode = -1;  // punt error to system dlerror() impl
-  if (!(mode & RTLD_GLOBAL))
+  }
+  if (!(mode & RTLD_GLOBAL)) {
     xnu_mode |= XNU_RTLD_LOCAL;  // unlike Linux, XNU defaults to RTLD_GLOBAL
-  if (mode & RTLD_NOW)
+  }
+  if (mode & RTLD_NOW) {
     xnu_mode |= XNU_RTLD_NOW;
-  if (mode & RTLD_LAZY)
+  }
+  if (mode & RTLD_LAZY) {
     xnu_mode |= XNU_RTLD_LAZY;
+  }
   if ((n = strlen(path)) < PATH_MAX && n > 3 &&  //
       path[n - 3] == '.' &&                      //
       path[n - 2] == 's' &&                      //
@@ -810,7 +837,7 @@ void *cosmo_dlopen(const char *path, int mode) {
   }
   ALLOW_CANCELATION;
   ALLOW_SIGNALS;
-  STRACE("cosmo_dlopen(%#s, %d) → %p% m", path, mode, res);
+  STRACE("dlopen(%#s, %d) → %p% m", path, mode, res);
   return res;
 }
 
@@ -855,7 +882,7 @@ void *cosmo_dlsym(void *handle, const char *name) {
   } else {
     func = 0;
   }
-  STRACE("cosmo_dlsym(%p, %#s) → %p", handle, name, func);
+  STRACE("dlsym(%p, %#s) → %p", handle, name, func);
   return func;
 }
 
@@ -890,7 +917,7 @@ int cosmo_dlclose(void *handle) {
   } else {
     res = -1;
   }
-  STRACE("cosmo_dlclose(%p) → %d", handle, res);
+  STRACE("dlclose(%p) → %d", handle, res);
   return res;
 }
 
@@ -909,6 +936,6 @@ char *cosmo_dlerror(void) {
   } else {
     res = dlerror_buf;
   }
-  STRACE("cosmo_dlerror() → %#s", res);
+  STRACE("dlerror() → %#s", res);
   return res;
 }
diff --git a/libc/dlopen/foreign_tramp.S b/libc/dlopen/foreign_tramp.S
index 38dc914f1..dbd036306 100644
--- a/libc/dlopen/foreign_tramp.S
+++ b/libc/dlopen/foreign_tramp.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #define SIZE 0x0200
 #define SKEW 0x10
diff --git a/libc/dlopen/stubs.c b/libc/dlopen/stubs.c
index 357f864f3..9a94e891b 100644
--- a/libc/dlopen/stubs.c
+++ b/libc/dlopen/stubs.c
@@ -17,10 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dlopen/dlfcn.h"
-#include "libc/intrin/strace.h"
-
-#define DLOPEN_ERROR \
-  "dlopen() isn't supported; consider using cosmo_dlopen() and read its docs"
 
 /**
  * Opens dynamic shared object using host platform libc.
@@ -31,13 +27,12 @@
  *
  * @return null always
  */
-void *dlopen(const char *path, int mode) {
-  STRACE("dlopen(%#s, %d) → 0 [%s]", path, mode, DLOPEN_ERROR);
+void *dlopen(const char *, int) {
   return 0;
 }
 
 char *dlerror(void) {
-  return DLOPEN_ERROR;
+  return "dlopen() isn't supported by cosmo; try using cosmo_dlopen()";
 }
 
 void *dlsym(void *, const char *) {
@@ -47,3 +42,7 @@ void *dlsym(void *, const char *) {
 int dlclose(void *) {
   return -1;
 }
+
+int dl_iterate_phdr(int (*)(void *, size_t, void *), void *) {
+  return -1;
+}
diff --git a/libc/dos.h b/libc/dos.internal.h
similarity index 100%
rename from libc/dos.h
rename to libc/dos.internal.h
diff --git a/libc/elf/def.h b/libc/elf/def.h
index 04d69985e..913e9c930 100644
--- a/libc/elf/def.h
+++ b/libc/elf/def.h
@@ -68,7 +68,6 @@
 #define EM_NONE      0
 #define EM_M32       1
 #define EM_386       3
-#define EM_MIPS      8
 #define EM_PPC64     21
 #define EM_S390      22
 #define EM_ARM       40
diff --git a/libc/errno.h b/libc/errno.h
index f8963ed98..8a3a04f30 100644
--- a/libc/errno.h
+++ b/libc/errno.h
@@ -26,11 +26,11 @@ COSMOPOLITAN_C_START_
 /* this header is included by 700+ files; therefore we */
 /* hand-roll &__get_tls()->tib_errno to avoid #include */
 /* cosmopolitan uses x28 as the tls register b/c apple */
-#define errno                                       \
-  (*__extension__({                                 \
-    errno_t *__ep;                                  \
-    __asm__("sub\t%0,x28,#1024-0x3c" : "=r"(__ep)); \
-    __ep;                                           \
+#define errno                                      \
+  (*__extension__({                                \
+    errno_t *__ep;                                 \
+    __asm__("sub\t%0,x28,#512-0x3c" : "=r"(__ep)); \
+    __ep;                                          \
   }))
 #else
 #define errno (*__errno_location())
diff --git a/libc/fmt/BUILD.mk b/libc/fmt/BUILD.mk
index 8fdbfeb14..4114c6ba7 100644
--- a/libc/fmt/BUILD.mk
+++ b/libc/fmt/BUILD.mk
@@ -40,7 +40,7 @@ LIBC_FMT_A_DIRECTDEPS =				\
 	LIBC_STR				\
 	LIBC_SYSV				\
 	LIBC_TINYMATH				\
-	THIRD_PARTY_COMPILER_RT			\
+	THIRD_PARTY_COMPILER_RT
 
 LIBC_FMT_A_DEPS :=				\
 	$(call uniq,$(foreach x,$(LIBC_FMT_A_DIRECTDEPS),$($(x))))
diff --git a/libc/fmt/bing.c b/libc/fmt/bing.c
index 8280685cb..3ccfb87c6 100644
--- a/libc/fmt/bing.c
+++ b/libc/fmt/bing.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/fmt/bing.internal.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Turns binary octet into unicode glyph representation.
diff --git a/libc/fmt/internal.h b/libc/fmt/internal.h
index b82c4b382..65e80281a 100644
--- a/libc/fmt/internal.h
+++ b/libc/fmt/internal.h
@@ -2,7 +2,6 @@
 #define COSMOPOLITAN_LIBC_FMT_STRTOL_H_
 #include "libc/ctype.h"
 #include "libc/errno.h"
-#include "libc/nt/thunk/msabi.h"
 #include "libc/str/str.h"
 
 #define CONSUME_SPACES(t, s, c) \
@@ -48,7 +47,6 @@
 
 int __vcscanf(int (*)(void *), int (*)(int, void *), void *, const char *,
               va_list);
-int __fmt(void *, void *, const char *, va_list, int *);
-char16_t *__itoa16(char16_t[21], uint64_t) __msabi;
+int __fmt(void *, void *, const char *, va_list);
 
 #endif /* COSMOPOLITAN_LIBC_FMT_STRTOL_H_ */
diff --git a/libc/fmt/itoa64radix16.greg.c b/libc/fmt/itoa64radix16.greg.c
index 28555685d..25c5e55a9 100644
--- a/libc/fmt/itoa64radix16.greg.c
+++ b/libc/fmt/itoa64radix16.greg.c
@@ -19,7 +19,7 @@
 #include "libc/fmt/conv.h"
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/bsr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 size_t uint64toarray_radix16(uint64_t x, char b[hasatleast 17]) {
   return uint64toarray_fixed16(x, b, ROUNDUP(x ? bsrl(x) + 1 : 1, 4));
diff --git a/libc/fmt/magnumstrs.internal.h b/libc/fmt/magnumstrs.internal.h
index af1daba81..77833499c 100644
--- a/libc/fmt/magnumstrs.internal.h
+++ b/libc/fmt/magnumstrs.internal.h
@@ -21,7 +21,6 @@ extern const struct MagnumStr kErrnoDocs[];
 extern const struct MagnumStr kErrnoNames[];
 extern const struct MagnumStr kFcntlCmds[];
 extern const struct MagnumStr kIpOptnames[];
-extern const struct MagnumStr kIpv6Optnames[];
 extern const struct MagnumStr kOpenFlags[];
 extern const struct MagnumStr kRlimitNames[];
 extern const struct MagnumStr kSignalNames[];
@@ -29,8 +28,7 @@ extern const struct MagnumStr kSockOptnames[];
 extern const struct MagnumStr kTcpOptnames[];
 extern const struct MagnumStr kPollNames[];
 
-const char *_DescribeMagnum(char *, const struct MagnumStr *, const char *,
-                            int);
+const char *DescribeMagnum(char *, const struct MagnumStr *, const char *, int);
 
 __funline const char *GetMagnumStr(const struct MagnumStr *ms, int x) {
   int i;
diff --git a/libc/fmt/unbing.c b/libc/fmt/unbing.c
index 66ae63d7f..ddee8e828 100644
--- a/libc/fmt/unbing.c
+++ b/libc/fmt/unbing.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/bing.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 static const int kCp437i[] = {
     0x000a << 8 | 10,   // \n NEWLINE
diff --git a/libc/fmt/unzleb64.c b/libc/fmt/unzleb64.c
index 4627da678..edc7c71c7 100644
--- a/libc/fmt/unzleb64.c
+++ b/libc/fmt/unzleb64.c
@@ -28,7 +28,7 @@
        ░███▓▀                                                    ▀▓▓██▀▀░
         ░▀░                                                         */
 #include "libc/fmt/leb128.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /**
  * Decodes array to signed integer w/ zig-zag encoding.
diff --git a/libc/fmt/wcstol.c b/libc/fmt/wcstol.c
index ac3037325..f95000e1c 100644
--- a/libc/fmt/wcstol.c
+++ b/libc/fmt/wcstol.c
@@ -22,7 +22,7 @@
 #include "libc/limits.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Decodes signed long integer from wide string.
diff --git a/libc/fmt/wcstoul.c b/libc/fmt/wcstoul.c
index b953c1366..9085a8000 100644
--- a/libc/fmt/wcstoul.c
+++ b/libc/fmt/wcstoul.c
@@ -22,7 +22,7 @@
 #include "libc/limits.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Decodes unsigned integer from wide string.
diff --git a/libc/imag.h b/libc/imag.internal.h
similarity index 100%
rename from libc/imag.h
rename to libc/imag.internal.h
diff --git a/libc/integral/c.inc b/libc/integral/c.inc
index 7a00cd8da..c3638a08b 100644
--- a/libc/integral/c.inc
+++ b/libc/integral/c.inc
@@ -65,26 +65,6 @@ typedef __UINT64_TYPE__ uint64_t;
 typedef __INTMAX_TYPE__ intmax_t;
 typedef __UINTMAX_TYPE__ uintmax_t;
 
-/* TODO(jart): re-import compiler-rt once they have it */
-#if defined(__x86_64__) && defined(__FLT128_MAX_10_EXP__)
-#undef __FLT128_MAX_10_EXP__
-#undef __FLT128_DENORM_MIN__
-#undef __FLT128_MIN_EXP__
-#undef __FLT128_MIN_10_EXP__
-#undef __FLT128_MANT_DIG__
-#undef __FLT128_HAS_INFINITY__
-#undef __FLT128_EPSILON__
-#undef __FLT128_MAX_EXP__
-#undef __FLT128_HAS_DENORM__
-#undef __FLT128_DIG__
-#undef __FLT128_MIN__
-#undef __FLT128_MAX__
-#undef __FLT128_NORM_MAX__
-#undef __FLT128_HAS_QUIET_NAN__
-#undef __FLT128_IS_IEC_60559__
-#undef __FLT128_DECIMAL_DIG__
-#endif
-
 #define __DEFINED_max_align_t
 typedef long double max_align_t;
 
@@ -135,7 +115,7 @@ typedef struct {
 #define strftimeesque(n) __attribute__((__format__(__strftime__, n, 0)))
 
 #ifndef privileged
-#define privileged _Section(".privileged") dontinstrument dontubsan
+#define privileged _Section(".privileged") dontinline dontinstrument dontubsan
 #endif
 
 #ifndef wontreturn
@@ -387,24 +367,6 @@ typedef struct {
 
 #define offsetof(type, member) __builtin_offsetof(type, member)
 
-#if defined(__GNUC__) && __GNUC__ >= 10
-#define __read_only(...) __attribute__((__access__(__read_only__, __VA_ARGS__)))
-#define __write_only(...) \
-  __attribute__((__access__(__write_only__, __VA_ARGS__)))
-#define __read_write(...) \
-  __attribute__((__access__(__read_write__, __VA_ARGS__)))
-#else
-#define __read_only(...)
-#define __write_only(...)
-#define __read_write(...)
-#endif
-
-#if defined(__GNUC__) && __GNUC__ >= 13
-#define __fd_arg(N) __attribute__((__fd_arg__(N)))
-#else
-#define __fd_arg(N)
-#endif
-
 #ifdef _COSMO_SOURCE
 
 #ifndef dontinstrument
diff --git a/libc/integral/normalize.inc b/libc/integral/normalize.inc
index 97bd665cb..41f5933c8 100644
--- a/libc/integral/normalize.inc
+++ b/libc/integral/normalize.inc
@@ -2,9 +2,9 @@
 #undef __COSMOPOLITAN__
 #endif
 
-#define __COSMOPOLITAN_MAJOR__ 4
-#define __COSMOPOLITAN_MINOR__ 0
-#define __COSMOPOLITAN_PATCH__ 2
+#define __COSMOPOLITAN_MAJOR__ 3
+#define __COSMOPOLITAN_MINOR__ 6
+#define __COSMOPOLITAN_PATCH__ 1
 #define __COSMOPOLITAN__                                                   \
   (100000000 * __COSMOPOLITAN_MAJOR__ + 1000000 * __COSMOPOLITAN_MINOR__ + \
    __COSMOPOLITAN_PATCH__)
@@ -79,10 +79,6 @@
 #undef __linux__
 #endif
 
-#ifdef __gnu_linux__
-#undef __gnu_linux__
-#endif
-
 #ifndef __BIGGEST_ALIGNMENT__
 #define __BIGGEST_ALIGNMENT__ 16
 #endif
@@ -93,30 +89,6 @@
 #include "libc/integral/llp64.inc"
 #endif
 
-#undef __INT_FAST16_MAX__
-#undef __INT_FAST16_TYPE__
-#undef __UINT_FAST16_MAX__
-#undef __INT_FAST16_WIDTH__
-#undef __UINT_FAST16_TYPE__
-
-#define __INT_FAST16_MAX__   2147483647
-#define __INT_FAST16_TYPE__  int
-#define __UINT_FAST16_MAX__  4294967295U
-#define __INT_FAST16_WIDTH__ 32
-#define __UINT_FAST16_TYPE__ unsigned int
-
-#undef __INT_FAST32_MAX__
-#undef __INT_FAST32_TYPE__
-#undef __UINT_FAST32_MAX__
-#undef __INT_FAST32_WIDTH__
-#undef __UINT_FAST32_TYPE__
-
-#define __INT_FAST32_MAX__   2147483647
-#define __INT_FAST32_TYPE__  int
-#define __UINT_FAST32_MAX__  4294967295U
-#define __INT_FAST32_WIDTH__ 32
-#define __UINT_FAST32_TYPE__ unsigned int
-
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifdef __STDC__
 #include "libc/integral/c.inc"
diff --git a/libc/intrin/BUILD.mk b/libc/intrin/BUILD.mk
index c84aeebaf..fa18d9b46 100644
--- a/libc/intrin/BUILD.mk
+++ b/libc/intrin/BUILD.mk
@@ -30,11 +30,9 @@ LIBC_INTRIN_A_CHECKS =					\
 LIBC_INTRIN_A_DIRECTDEPS =				\
 	LIBC_NEXGEN32E					\
 	LIBC_NT_KERNEL32				\
-	LIBC_NT_REALTIME				\
-	LIBC_NT_SYNCHRONIZATION				\
 	LIBC_NT_WS2_32					\
 	LIBC_SYSV					\
-	LIBC_SYSV_CALLS					\
+	LIBC_SYSV_CALLS
 
 LIBC_INTRIN_A_DEPS :=					\
 	$(call uniq,$(foreach x,$(LIBC_INTRIN_A_DIRECTDEPS),$($(x))))
@@ -64,7 +62,6 @@ o/$(MODE)/libc/intrin/kprintf.o: private		\
 			-Wframe-larger-than=128		\
 			-Walloca-larger-than=128
 
-o/$(MODE)/libc/intrin/cursor.o				\
 o/$(MODE)/libc/intrin/mmap.o				\
 o/$(MODE)/libc/intrin/tree.o: private			\
 		CFLAGS +=				\
@@ -100,38 +97,14 @@ o/$(MODE)/libc/intrin/x86.o: private			\
 			-fpatchable-function-entry=0	\
 			-Os
 
-# avoid the legacy sse decoding penalty on avx systems
-o//libc/intrin/dll.o					\
-o//libc/intrin/fds.o					\
-o//libc/intrin/mmap.o					\
-o//libc/intrin/demangle.o: private			\
-		CFLAGS +=				\
-			-mgeneral-regs-only
-
-# ensure that division is optimized
-o/$(MODE)/libc/intrin/windowsdurationtotimeval.o	\
-o/$(MODE)/libc/intrin/windowsdurationtotimespec.o	\
-o/$(MODE)/libc/intrin/timevaltowindowstime.o		\
-o/$(MODE)/libc/intrin/timespectowindowstime.o		\
-o/$(MODE)/libc/intrin/windowstimetotimeval.o		\
-o/$(MODE)/libc/intrin/windowstimetotimespec.o: private	\
-		CFLAGS +=				\
-			-O2
-
 # these assembly files are safe to build on aarch64
-o/$(MODE)/libc/intrin/getcontext.o: libc/intrin/getcontext.S
-	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
-o/$(MODE)/libc/intrin/swapcontext.o: libc/intrin/swapcontext.S
-	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
-o/$(MODE)/libc/intrin/tailcontext.o: libc/intrin/tailcontext.S
-	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/intrin/aarch64/%.o: libc/intrin/aarch64/%.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/intrin/fenv.o: libc/intrin/fenv.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/intrin/gcov.o: libc/intrin/gcov.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
-o/$(MODE)/libc/intrin/cosmo_futex_thunk.o: libc/intrin/cosmo_futex_thunk.S
+o/$(MODE)/libc/intrin/futex.o: libc/intrin/futex.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/intrin/typeinfo.o: libc/intrin/typeinfo.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
@@ -143,8 +116,6 @@ o/$(MODE)/libc/intrin/kerrnodocs.o: libc/intrin/kerrnodocs.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/intrin/kipoptnames.o: libc/intrin/kipoptnames.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
-o/$(MODE)/libc/intrin/kipv6optnames.o: libc/intrin/kipv6optnames.S
-	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/intrin/kerrnonames.o: libc/intrin/kerrnonames.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/intrin/kfcntlcmds.o: libc/intrin/kfcntlcmds.S
diff --git a/libc/intrin/__getenv.c b/libc/intrin/__getenv.c
index 6d40aa91d..b387b458d 100644
--- a/libc/intrin/__getenv.c
+++ b/libc/intrin/__getenv.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/getenv.h"
 #include "libc/intrin/kprintf.h"
 
-privileged optimizesize struct Env __getenv(char **p, const char *k) {
+privileged struct Env __getenv(char **p, const char *k) {
   char *t;
   int i, j;
   for (i = 0; (t = p[i]); ++i) {
diff --git a/libc/intrin/aarch64/asmdefs.h b/libc/intrin/aarch64/asmdefs.h
index e8d677849..f18eb2bc1 100644
--- a/libc/intrin/aarch64/asmdefs.h
+++ b/libc/intrin/aarch64/asmdefs.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
 #define COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #ifdef __ASSEMBLER__
 // clang-format off
 
diff --git a/libc/intrin/aarch64/atomics.S b/libc/intrin/aarch64/atomics.S
deleted file mode 100644
index 17bc04fc3..000000000
--- a/libc/intrin/aarch64/atomics.S
+++ /dev/null
@@ -1,1919 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include "libc/macros.h"
-
-// aarch64 atomics compiler runtime
-//
-// armv8.1 introduced atomic instructions that go considerably faster.
-// you can pass the -mno-outline-atomics flag to the compiler to avoid
-// this runtime, however that'll go slower.
-
-.arch armv8-a+lse
-
-.macro .prvfn name
-	.privileged
-	.balign	16
-\name:
-.endm
-
-.macro .begfn name
-	.section .text.\name,"ax",%progbits
-	.balign	16
-	.ftrace1
-\name:
-	.ftrace2
-.endm
-
-.macro	jnatom	label
-	adrp	x16,__aarch64_have_lse_atomics
-	ldrb	w16,[x16,:lo12:__aarch64_have_lse_atomics]
-	cbz	w16,\label
-.endm
-
-
-.begfn __aarch64_swp1_relax
-	jnatom	1f
-	swpb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	stxrb	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp1_relax,globl
-
-.begfn __aarch64_swp1_acq
-	jnatom	1f
-	swpab	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	stxrb	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp1_acq,globl
-
-.begfn __aarch64_swp1_rel
-	jnatom	1f
-	swplb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	stlxrb	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp1_rel,globl
-
-.begfn __aarch64_swp1_acq_rel
-	jnatom	1f
-	swpalb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	stlxrb	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp1_acq_rel,globl
-
-.begfn __aarch64_swp1_sync
-	jnatom	1f
-	swpab	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	stxrb	w17,w16,[x1]
-	cbnz	w17,0b
-	dmb	ish
-	ret
-.endfn __aarch64_swp1_sync,globl
-
-
-.begfn __aarch64_cas1_relax
-	jnatom	1f
-	casb	w0,w1,[x2]
-	ret
-1:	uxtb	w16,w0
-0:	ldxrb	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stxrb	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas1_relax,globl
-
-.begfn __aarch64_cas1_acq
-	jnatom	1f
-	casab	w0,w1,[x2]
-	ret
-1:	uxtb	w16,w0
-0:	ldaxrb	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stxrb	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas1_acq,globl
-
-.begfn __aarch64_cas1_rel
-	jnatom	1f
-	caslb	w0,w1,[x2]
-	ret
-1:	uxtb	w16,w0
-0:	ldxrb	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxrb	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas1_rel,globl
-
-.begfn __aarch64_cas1_acq_rel
-	jnatom	1f
-	casalb	w0,w1,[x2]
-	ret
-1:	uxtb	w16,w0
-0:	ldaxrb	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxrb	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas1_acq_rel,globl
-
-.begfn __aarch64_cas1_sync
-	jnatom	1f
-	casalb	w0,w1,[x2]
-	ret
-1:	uxtb	w16,w0
-0:	ldxrb	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxrb	w17,w1,[x2]
-	cbnz	w17,0b
-1:	dmb	ish
-	ret
-.endfn __aarch64_cas1_sync,globl
-
-
-.begfn __aarch64_ldadd1_relax
-	jnatom	1f
-	ldaddb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	add	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd1_relax,globl
-
-.begfn __aarch64_ldadd1_acq
-	jnatom	1f
-	ldaddab	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	add	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd1_acq,globl
-
-.begfn __aarch64_ldadd1_rel
-	jnatom	1f
-	ldaddlb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	add	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd1_rel,globl
-
-.begfn __aarch64_ldadd1_acq_rel
-	jnatom	1f
-	ldaddalb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	add	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd1_acq_rel,globl
-
-.begfn __aarch64_ldadd1_sync
-	jnatom	1f
-	ldaddalb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	add	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldadd1_sync,globl
-
-
-.begfn __aarch64_ldset1_relax
-	jnatom	1f
-	ldsetb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	orr	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset1_relax,globl
-
-.begfn __aarch64_ldset1_acq
-	jnatom	1f
-	ldsetab	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	orr	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset1_acq,globl
-
-.begfn __aarch64_ldset1_rel
-	jnatom	1f
-	ldsetlb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	orr	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset1_rel,globl
-
-.begfn __aarch64_ldset1_acq_rel
-	jnatom	1f
-	ldsetalb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	orr	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset1_acq_rel,globl
-
-.begfn __aarch64_ldset1_sync
-	jnatom	1f
-	ldsetalb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	orr	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldset1_sync,globl
-
-
-.begfn __aarch64_ldclr1_relax
-	jnatom	1f
-	ldclrb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	bic	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr1_relax,globl
-
-.begfn __aarch64_ldclr1_acq
-	jnatom	1f
-	ldclrab	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	bic	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr1_acq,globl
-
-.begfn __aarch64_ldclr1_rel
-	jnatom	1f
-	ldclrlb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	bic	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr1_rel,globl
-
-.begfn __aarch64_ldclr1_acq_rel
-	jnatom	1f
-	ldclralb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	bic	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr1_acq_rel,globl
-
-.begfn __aarch64_ldclr1_sync
-	jnatom	1f
-	ldclralb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	bic	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldclr1_sync,globl
-
-
-.begfn __aarch64_ldeor1_relax
-	jnatom	1f
-	ldeorb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	eor	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor1_relax,globl
-
-.begfn __aarch64_ldeor1_acq
-	jnatom	1f
-	ldeorab	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	eor	w17,w0,w16
-	stxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor1_acq,globl
-
-.begfn __aarch64_ldeor1_rel
-	jnatom	1f
-	ldeorlb	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	eor	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor1_rel,globl
-
-.begfn __aarch64_ldeor1_acq_rel
-	jnatom	1f
-	ldeoralb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrb	w0,[x1]
-	eor	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor1_acq_rel,globl
-
-.begfn __aarch64_ldeor1_sync
-	jnatom	1f
-	ldeoralb w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrb	w0,[x1]
-	eor	w17,w0,w16
-	stlxrb	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldeor1_sync,globl
-
-
-.begfn __aarch64_swp2_relax
-	jnatom	1f
-	swph	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	stxrh	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp2_relax,globl
-
-.begfn __aarch64_swp2_acq
-	jnatom	1f
-	swpah	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	stxrh	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp2_acq,globl
-
-.begfn __aarch64_swp2_rel
-	jnatom	1f
-	swplh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	stlxrh	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp2_rel,globl
-
-.begfn __aarch64_swp2_acq_rel
-	jnatom	1f
-	swpalh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	stlxrh	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp2_acq_rel,globl
-
-.begfn __aarch64_swp2_sync
-	jnatom	1f
-	swpah	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	stxrh	w17,w16,[x1]
-	cbnz	w17,0b
-	dmb	ish
-	ret
-.endfn __aarch64_swp2_sync,globl
-
-
-.begfn __aarch64_cas2_relax
-	jnatom	1f
-	cash	w0,w1,[x2]
-	ret
-1:	uxth	w16,w0
-0:	ldxrh	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stxrh	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas2_relax,globl
-
-.begfn __aarch64_cas2_acq
-	jnatom	1f
-	casah	w0,w1,[x2]
-	ret
-1:	uxth	w16,w0
-0:	ldaxrh	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stxrh	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas2_acq,globl
-
-.begfn __aarch64_cas2_rel
-	jnatom	1f
-	caslh	w0,w1,[x2]
-	ret
-1:	uxth	w16,w0
-0:	ldxrh	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxrh	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas2_rel,globl
-
-.begfn __aarch64_cas2_acq_rel
-	jnatom	1f
-	casalh	w0,w1,[x2]
-	ret
-1:	uxth	w16,w0
-0:	ldaxrh	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxrh	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas2_acq_rel,globl
-
-.begfn __aarch64_cas2_sync
-	jnatom	1f
-	casalh	w0,w1,[x2]
-	ret
-1:	uxth	w16,w0
-0:	ldxrh	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxrh	w17,w1,[x2]
-	cbnz	w17,0b
-1:	dmb	ish
-	ret
-.endfn __aarch64_cas2_sync,globl
-
-
-.begfn __aarch64_ldadd2_relax
-	jnatom	1f
-	ldaddh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	add	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd2_relax,globl
-
-.begfn __aarch64_ldadd2_acq
-	jnatom	1f
-	ldaddah	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	add	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd2_acq,globl
-
-.begfn __aarch64_ldadd2_rel
-	jnatom	1f
-	ldaddlh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	add	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd2_rel,globl
-
-.begfn __aarch64_ldadd2_acq_rel
-	jnatom	1f
-	ldaddalh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	add	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd2_acq_rel,globl
-
-.begfn __aarch64_ldadd2_sync
-	jnatom	1f
-	ldaddalh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	add	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldadd2_sync,globl
-
-
-.begfn __aarch64_ldset2_relax
-	jnatom	1f
-	ldseth	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	orr	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset2_relax,globl
-
-.begfn __aarch64_ldset2_acq
-	jnatom	1f
-	ldsetah	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	orr	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset2_acq,globl
-
-.begfn __aarch64_ldset2_rel
-	jnatom	1f
-	ldsetlh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	orr	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset2_rel,globl
-
-.begfn __aarch64_ldset2_acq_rel
-	jnatom	1f
-	ldsetalh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	orr	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset2_acq_rel,globl
-
-.begfn __aarch64_ldset2_sync
-	jnatom	1f
-	ldsetalh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	orr	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldset2_sync,globl
-
-
-.begfn __aarch64_ldclr2_relax
-	jnatom	1f
-	ldclrh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	bic	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr2_relax,globl
-
-.begfn __aarch64_ldclr2_acq
-	jnatom	1f
-	ldclrah	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	bic	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr2_acq,globl
-
-.begfn __aarch64_ldclr2_rel
-	jnatom	1f
-	ldclrlh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	bic	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr2_rel,globl
-
-.begfn __aarch64_ldclr2_acq_rel
-	jnatom	1f
-	ldclralh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	bic	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr2_acq_rel,globl
-
-.begfn __aarch64_ldclr2_sync
-	jnatom	1f
-	ldclralh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	bic	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldclr2_sync,globl
-
-
-.begfn __aarch64_ldeor2_relax
-	jnatom	1f
-	ldeorh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	eor	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor2_relax,globl
-
-.begfn __aarch64_ldeor2_acq
-	jnatom	1f
-	ldeorah	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	eor	w17,w0,w16
-	stxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor2_acq,globl
-
-.begfn __aarch64_ldeor2_rel
-	jnatom	1f
-	ldeorlh	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	eor	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor2_rel,globl
-
-.begfn __aarch64_ldeor2_acq_rel
-	jnatom	1f
-	ldeoralh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxrh	w0,[x1]
-	eor	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor2_acq_rel,globl
-
-.begfn __aarch64_ldeor2_sync
-	jnatom	1f
-	ldeoralh w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxrh	w0,[x1]
-	eor	w17,w0,w16
-	stlxrh	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldeor2_sync,globl
-
-
-.begfn __aarch64_swp4_relax
-	jnatom	1f
-	swp	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	stxr	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp4_relax,globl
-
-.begfn __aarch64_swp4_acq
-	jnatom	1f
-	swpa	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	stxr	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp4_acq,globl
-
-.begfn __aarch64_swp4_rel
-	jnatom	1f
-	swpl	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	stlxr	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp4_rel,globl
-
-.begfn __aarch64_swp4_acq_rel
-	jnatom	1f
-	swpal	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	stlxr	w17,w16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp4_acq_rel,globl
-
-.begfn __aarch64_swp4_sync
-	jnatom	1f
-	swpa	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	stxr	w17,w16,[x1]
-	cbnz	w17,0b
-	dmb	ish
-	ret
-.endfn __aarch64_swp4_sync,globl
-
-
-.begfn __aarch64_cas4_relax
-	jnatom	1f
-	cas	w0,w1,[x2]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stxr	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas4_relax,globl
-
-.begfn __aarch64_cas4_acq
-	jnatom	1f
-	casa	w0,w1,[x2]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stxr	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas4_acq,globl
-
-.begfn __aarch64_cas4_rel
-	jnatom	1f
-	casl	w0,w1,[x2]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxr	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas4_rel,globl
-
-.begfn __aarch64_cas4_acq_rel
-	jnatom	1f
-	casal	w0,w1,[x2]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxr	w17,w1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas4_acq_rel,globl
-
-.begfn __aarch64_cas4_sync
-	jnatom	1f
-	casal	w0,w1,[x2]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x2]
-	cmp	w0,w16
-	bne	1f
-	stlxr	w17,w1,[x2]
-	cbnz	w17,0b
-1:	dmb	ish
-	ret
-.endfn __aarch64_cas4_sync,globl
-
-
-.begfn __aarch64_ldadd4_relax
-	jnatom	1f
-	ldadd	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	add	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd4_relax,globl
-
-.begfn __aarch64_ldadd4_acq
-	jnatom	1f
-	ldadda	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	add	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd4_acq,globl
-
-.begfn __aarch64_ldadd4_rel
-	jnatom	1f
-	ldaddl	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	add	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd4_rel,globl
-
-.begfn __aarch64_ldadd4_acq_rel
-	jnatom	1f
-	ldaddal	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	add	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd4_acq_rel,globl
-
-.begfn __aarch64_ldadd4_sync
-	jnatom	1f
-	ldaddal	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	add	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldadd4_sync,globl
-
-
-.begfn __aarch64_ldset4_relax
-	jnatom	1f
-	ldset	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	orr	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset4_relax,globl
-
-.begfn __aarch64_ldset4_acq
-	jnatom	1f
-	ldseta	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	orr	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset4_acq,globl
-
-.begfn __aarch64_ldset4_rel
-	jnatom	1f
-	ldsetl	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	orr	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset4_rel,globl
-
-.begfn __aarch64_ldset4_acq_rel
-	jnatom	1f
-	ldsetal	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	orr	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset4_acq_rel,globl
-
-.begfn __aarch64_ldset4_sync
-	jnatom	1f
-	ldsetal	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	orr	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldset4_sync,globl
-
-
-.begfn __aarch64_ldclr4_relax
-	jnatom	1f
-	ldclr	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	bic	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr4_relax,globl
-
-.begfn __aarch64_ldclr4_acq
-	jnatom	1f
-	ldclra	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	bic	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr4_acq,globl
-
-.begfn __aarch64_ldclr4_rel
-	jnatom	1f
-	ldclrl	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	bic	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr4_rel,globl
-
-.begfn __aarch64_ldclr4_acq_rel
-	jnatom	1f
-	ldclral	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	bic	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr4_acq_rel,globl
-
-.begfn __aarch64_ldclr4_sync
-	jnatom	1f
-	ldclral	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	bic	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldclr4_sync,globl
-
-
-.begfn __aarch64_ldeor4_relax
-	jnatom	1f
-	ldeor	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	eor	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor4_relax,globl
-
-.begfn __aarch64_ldeor4_acq
-	jnatom	1f
-	ldeora	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	eor	w17,w0,w16
-	stxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor4_acq,globl
-
-.begfn __aarch64_ldeor4_rel
-	jnatom	1f
-	ldeorl	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	eor	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor4_rel,globl
-
-.begfn __aarch64_ldeor4_acq_rel
-	jnatom	1f
-	ldeoral	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldaxr	w0,[x1]
-	eor	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor4_acq_rel,globl
-
-.begfn __aarch64_ldeor4_sync
-	jnatom	1f
-	ldeoral	w0,w0,[x1]
-	ret
-1:	mov	w16,w0
-0:	ldxr	w0,[x1]
-	eor	w17,w0,w16
-	stlxr	w15,w17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldeor4_sync,globl
-
-
-.begfn __aarch64_swp8_relax
-	jnatom	1f
-	swp	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	stxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp8_relax,globl
-
-.begfn __aarch64_swp8_acq
-	jnatom	1f
-	swpa	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	stxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp8_acq,globl
-
-.begfn __aarch64_swp8_rel
-	jnatom	1f
-	swpl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	stlxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp8_rel,globl
-
-.begfn __aarch64_swp8_acq_rel
-	jnatom	1f
-	swpal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	stlxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp8_acq_rel,globl
-
-.begfn __aarch64_swp8_sync
-	jnatom	1f
-	swpa	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	stxr	w17,x16,[x1]
-	cbnz	w17,0b
-	dmb	ish
-	ret
-.endfn __aarch64_swp8_sync,globl
-
-
-.prvfn __aarch64_cas8_relax
-	jnatom	1f
-	cas	x0,x1,[x2]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x2]
-	cmp	x0,x16
-	bne	1f
-	stxr	w17,x1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas8_relax,globl
-
-.prvfn __aarch64_cas8_acq
-	jnatom	1f
-	casa	x0,x1,[x2]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x2]
-	cmp	x0,x16
-	bne	1f
-	stxr	w17,x1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas8_acq,globl
-
-.prvfn __aarch64_cas8_rel
-	jnatom	1f
-	casl	x0,x1,[x2]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x2]
-	cmp	x0,x16
-	bne	1f
-	stlxr	w17,x1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas8_rel,globl
-
-.begfn __aarch64_cas8_acq_rel
-	jnatom	1f
-	casal	x0,x1,[x2]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x2]
-	cmp	x0,x16
-	bne	1f
-	stlxr	w17,x1,[x2]
-	cbnz	w17,0b
-1:	ret
-.endfn __aarch64_cas8_acq_rel,globl
-
-.begfn __aarch64_cas8_sync
-	jnatom	1f
-	casal	x0,x1,[x2]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x2]
-	cmp	x0,x16
-	bne	1f
-	stlxr	w17,x1,[x2]
-	cbnz	w17,0b
-1:	dmb	ish
-	ret
-.endfn __aarch64_cas8_sync,globl
-
-
-.begfn __aarch64_ldadd8_relax
-	jnatom	1f
-	ldadd	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	add	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd8_relax,globl
-
-.begfn __aarch64_ldadd8_acq
-	jnatom	1f
-	ldadda	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	add	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd8_acq,globl
-
-.begfn __aarch64_ldadd8_rel
-	jnatom	1f
-	ldaddl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	add	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd8_rel,globl
-
-.begfn __aarch64_ldadd8_acq_rel
-	jnatom	1f
-	ldaddal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	add	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd8_acq_rel,globl
-
-.begfn __aarch64_ldadd8_sync
-	jnatom	1f
-	ldaddal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	add	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldadd8_sync,globl
-
-
-.begfn __aarch64_ldset8_relax
-	jnatom	1f
-	ldset	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	orr	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset8_relax,globl
-
-.begfn __aarch64_ldset8_acq
-	jnatom	1f
-	ldseta	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	orr	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset8_acq,globl
-
-.begfn __aarch64_ldset8_rel
-	jnatom	1f
-	ldsetl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	orr	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset8_rel,globl
-
-.begfn __aarch64_ldset8_acq_rel
-	jnatom	1f
-	ldsetal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	orr	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset8_acq_rel,globl
-
-.begfn __aarch64_ldset8_sync
-	jnatom	1f
-	ldsetal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	orr	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldset8_sync,globl
-
-
-.begfn __aarch64_ldclr8_relax
-	jnatom	1f
-	ldclr	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	bic	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr8_relax,globl
-
-.begfn __aarch64_ldclr8_acq
-	jnatom	1f
-	ldclra	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	bic	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr8_acq,globl
-
-.begfn __aarch64_ldclr8_rel
-	jnatom	1f
-	ldclrl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	bic	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr8_rel,globl
-
-.begfn __aarch64_ldclr8_acq_rel
-	jnatom	1f
-	ldclral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	bic	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr8_acq_rel,globl
-
-.begfn __aarch64_ldclr8_sync
-	jnatom	1f
-	ldclral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	bic	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldclr8_sync,globl
-
-
-.begfn __aarch64_ldeor8_relax
-	jnatom	1f
-	ldeor	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	eor	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor8_relax,globl
-
-.begfn __aarch64_ldeor8_acq
-	jnatom	1f
-	ldeora	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	eor	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor8_acq,globl
-
-.begfn __aarch64_ldeor8_rel
-	jnatom	1f
-	ldeorl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	eor	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor8_rel,globl
-
-.begfn __aarch64_ldeor8_acq_rel
-	jnatom	1f
-	ldeoral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	eor	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor8_acq_rel,globl
-
-.begfn __aarch64_ldeor8_sync
-	jnatom	1f
-	ldeoral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	eor	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldeor8_sync,globl
-
-
-.begfn __aarch64_swp16_relax
-	jnatom	1f
-	swp	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	stxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp16_relax,globl
-
-.begfn __aarch64_swp16_acq
-	jnatom	1f
-	swpa	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	stxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp16_acq,globl
-
-.begfn __aarch64_swp16_rel
-	jnatom	1f
-	swpl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	stlxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp16_rel,globl
-
-.begfn __aarch64_swp16_acq_rel
-	jnatom	1f
-	swpal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	stlxr	w17,x16,[x1]
-	cbnz	w17,0b
-	ret
-.endfn __aarch64_swp16_acq_rel,globl
-
-.begfn __aarch64_swp16_sync
-	jnatom	1f
-	swpa	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	stxr	w17,x16,[x1]
-	cbnz	w17,0b
-	dmb	ish
-	ret
-.endfn __aarch64_swp16_sync,globl
-
-
-.begfn __aarch64_cas16_relax
-	jnatom	1f
-	casp	x0,x1,x2,x3,[x4]
-	ret
-1:	mov	x16,x0
-	mov	x17,x1
-0:	ldxp	x0,x1,[x4]
-	cmp	x0,x16
-	ccmp	x1,x17,#0,eq
-	csel	x15,x2,x0,eq
-	csel	x14,x3,x1,eq
-	stxp	w13,x15,x14,[x4]
-	cbnz	w13,0b
-	ret
-.endfn __aarch64_cas16_relax,globl
-
-.begfn __aarch64_cas16_acq
-	jnatom	1f
-	caspa	x0,x1,x2,x3,[x4]
-	ret
-1:	mov	x16,x0
-	mov	x17,x1
-0:	ldaxp	x0,x1,[x4]
-	cmp	x0,x16
-	ccmp	x1,x17,#0,eq
-	csel	x15,x2,x0,eq
-	csel	x14,x3,x1,eq
-	stxp	w13,x15,x14,[x4]
-	cbnz	w13,0b
-	ret
-.endfn __aarch64_cas16_acq,globl
-
-.begfn __aarch64_cas16_rel
-	jnatom	1f
-	caspl	x0,x1,x2,x3,[x4]
-	ret
-1:	mov	x16,x0
-	mov	x17,x1
-0:	ldxp	x0,x1,[x4]
-	cmp	x0,x16
-	ccmp	x1,x17,#0,eq
-	csel	x15,x2,x0,eq
-	csel	x14,x3,x1,eq
-	stlxp	w13,x15,x14,[x4]
-	cbnz	w13,0b
-	ret
-.endfn __aarch64_cas16_rel,globl
-
-.begfn __aarch64_cas16_acq_rel
-	jnatom	1f
-	caspal	x0,x1,x2,x3,[x4]
-	ret
-1:	mov	x16,x0
-	mov	x17,x1
-0:	ldaxp	x0,x1,[x4]
-	cmp	x0,x16
-	ccmp	x1,x17,#0,eq
-	csel	x15,x2,x0,eq
-	csel	x14,x3,x1,eq
-	stlxp	w13,x15,x14,[x4]
-	cbnz	w13,0b
-	ret
-.endfn __aarch64_cas16_acq_rel,globl
-
-.begfn __aarch64_cas16_sync
-	jnatom	1f
-	caspal	x0,x1,x2,x3,[x4]
-	ret
-1:	mov	x16,x0
-	mov	x17,x1
-0:	ldxp	x0,x1,[x4]
-	cmp	x0,x16
-	ccmp	x1,x17,#0,eq
-	csel	x15,x2,x0,eq
-	csel	x14,x3,x1,eq
-	stlxp	w13,x15,x14,[x4]
-	cbnz	w13,0b
-	dmb	ish
-	ret
-.endfn __aarch64_cas16_sync,globl
-
-
-.begfn __aarch64_ldadd16_relax
-	jnatom	1f
-	ldadd	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	add	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd16_relax,globl
-
-.begfn __aarch64_ldadd16_acq
-	jnatom	1f
-	ldadda	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	add	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd16_acq,globl
-
-.begfn __aarch64_ldadd16_rel
-	jnatom	1f
-	ldaddl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	add	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd16_rel,globl
-
-.begfn __aarch64_ldadd16_acq_rel
-	jnatom	1f
-	ldaddal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	add	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldadd16_acq_rel,globl
-
-.begfn __aarch64_ldadd16_sync
-	jnatom	1f
-	ldaddal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	add	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldadd16_sync,globl
-
-
-.begfn __aarch64_ldset16_relax
-	jnatom	1f
-	ldset	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	orr	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset16_relax,globl
-
-.begfn __aarch64_ldset16_acq
-	jnatom	1f
-	ldseta	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	orr	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset16_acq,globl
-
-.begfn __aarch64_ldset16_rel
-	jnatom	1f
-	ldsetl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	orr	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset16_rel,globl
-
-.begfn __aarch64_ldset16_acq_rel
-	jnatom	1f
-	ldsetal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	orr	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldset16_acq_rel,globl
-
-.begfn __aarch64_ldset16_sync
-	jnatom	1f
-	ldsetal	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	orr	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldset16_sync,globl
-
-
-.begfn __aarch64_ldclr16_relax
-	jnatom	1f
-	ldclr	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	bic	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr16_relax,globl
-
-.begfn __aarch64_ldclr16_acq
-	jnatom	1f
-	ldclra	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	bic	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr16_acq,globl
-
-.begfn __aarch64_ldclr16_rel
-	jnatom	1f
-	ldclrl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	bic	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr16_rel,globl
-
-.begfn __aarch64_ldclr16_acq_rel
-	jnatom	1f
-	ldclral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	bic	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldclr16_acq_rel,globl
-
-.begfn __aarch64_ldclr16_sync
-	jnatom	1f
-	ldclral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	bic	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldclr16_sync,globl
-
-
-.begfn __aarch64_ldeor16_relax
-	jnatom	1f
-	ldeor	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	eor	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor16_relax,globl
-
-.begfn __aarch64_ldeor16_acq
-	jnatom	1f
-	ldeora	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	eor	x17,x0,x16
-	stxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor16_acq,globl
-
-.begfn __aarch64_ldeor16_rel
-	jnatom	1f
-	ldeorl	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	eor	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor16_rel,globl
-
-.begfn __aarch64_ldeor16_acq_rel
-	jnatom	1f
-	ldeoral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldaxr	x0,[x1]
-	eor	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	ret
-.endfn __aarch64_ldeor16_acq_rel,globl
-
-.begfn __aarch64_ldeor16_sync
-	jnatom	1f
-	ldeoral	x0,x0,[x1]
-	ret
-1:	mov	x16,x0
-0:	ldxr	x0,[x1]
-	eor	x17,x0,x16
-	stlxr	w15,x17,[x1]
-	cbnz	w15,0b
-	dmb	ish
-	ret
-.endfn __aarch64_ldeor16_sync,globl
diff --git a/libc/intrin/atomic.h b/libc/intrin/atomic.h
index a2d93df8a..3d503d37f 100644
--- a/libc/intrin/atomic.h
+++ b/libc/intrin/atomic.h
@@ -13,26 +13,48 @@
  */
 
 typedef enum {
-  memory_order_relaxed = __ATOMIC_RELAXED,
-  memory_order_consume = __ATOMIC_CONSUME,
-  memory_order_acquire = __ATOMIC_ACQUIRE,
-  memory_order_release = __ATOMIC_RELEASE,
-  memory_order_acq_rel = __ATOMIC_ACQ_REL,
-  memory_order_seq_cst = __ATOMIC_SEQ_CST
+  memory_order_relaxed,
+  memory_order_consume,
+  memory_order_acquire,
+  memory_order_release,
+  memory_order_acq_rel,
+  memory_order_seq_cst,
 } memory_order;
 
-#if !(defined __STDC_VERSION__ && __STDC_VERSION__ > 201710L)
-#define ATOMIC_VAR_INIT(...) __VA_ARGS__
-#endif
-
+#define ATOMIC_VAR_INIT(...)     __VA_ARGS__
 #define atomic_is_lock_free(obj) ((void)(obj), sizeof(obj) <= sizeof(void *))
 
 #define atomic_flag      atomic_bool
-#define ATOMIC_FLAG_INIT false
+#define ATOMIC_FLAG_INIT ATOMIC_VAR_INIT(0)
 #define atomic_flag_test_and_set_explicit(x, order) \
   atomic_exchange_explicit(x, 1, order)
 #define atomic_flag_clear_explicit(x, order) atomic_store_explicit(x, 0, order)
 
+#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
+  atomic_compare_exchange_strong_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
+  atomic_compare_exchange_weak_explicit(                          \
+      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
+#define atomic_exchange(pObject, desired) \
+  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_fetch_add(pObject, operand) \
+  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_and(pObject, operand) \
+  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_or(pObject, operand) \
+  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_sub(pObject, operand) \
+  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_fetch_xor(pObject, operand) \
+  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
+#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
+#define atomic_store(pObject, desired) \
+  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
+#define atomic_flag_test_and_set(x) \
+  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
+#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
+
 #if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE)
 
 #define atomic_init(obj, value)    __c11_atomic_init(obj, value)
@@ -62,35 +84,9 @@ typedef enum {
 #define atomic_store_explicit(object, desired, order) \
   __c11_atomic_store(object, desired, order)
 
-#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
-  atomic_compare_exchange_strong_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
-  atomic_compare_exchange_weak_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_exchange(pObject, desired) \
-  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_fetch_add(pObject, operand) \
-  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_and(pObject, operand) \
-  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_or(pObject, operand) \
-  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_sub(pObject, operand) \
-  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_xor(pObject, operand) \
-  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
-#define atomic_store(pObject, desired) \
-  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_flag_test_and_set(x) \
-  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
-#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
-
 #elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 407
 
-#define atomic_init(obj, value) \
-  atomic_store_explicit(obj, value, __ATOMIC_RELAXED)
+#define atomic_init(obj, value)    ((void)(*(obj) = (value)))
 #define atomic_thread_fence(order) __atomic_thread_fence(order)
 #define atomic_signal_fence(order) __atomic_signal_fence(order)
 #define atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
@@ -115,31 +111,6 @@ typedef enum {
 #define atomic_store_explicit(pObject, desired, order) \
   __atomic_store_n(pObject, desired, order)
 
-#define atomic_compare_exchange_strong(pObject, pExpected, desired)    \
-  atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
-                                          __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-#define atomic_compare_exchange_weak(pObject, pExpected, desired)    \
-  atomic_compare_exchange_weak_explicit(pObject, pExpected, desired, \
-                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-#define atomic_exchange(pObject, desired) \
-  atomic_exchange_explicit(pObject, desired, __ATOMIC_SEQ_CST)
-#define atomic_fetch_add(pObject, operand) \
-  atomic_fetch_add_explicit(pObject, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_and(pObject, operand) \
-  atomic_fetch_and_explicit(pObject, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_or(pObject, operand) \
-  atomic_fetch_or_explicit(pObject, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_sub(pObject, operand) \
-  atomic_fetch_sub_explicit(pObject, operand, __ATOMIC_SEQ_CST)
-#define atomic_fetch_xor(pObject, operand) \
-  atomic_fetch_xor_explicit(pObject, operand, __ATOMIC_SEQ_CST)
-#define atomic_load(pObject) atomic_load_explicit(pObject, __ATOMIC_SEQ_CST)
-#define atomic_store(pObject, desired) \
-  atomic_store_explicit(pObject, desired, __ATOMIC_SEQ_CST)
-#define atomic_flag_test_and_set(x) \
-  atomic_flag_test_and_set_explicit(x, __ATOMIC_SEQ_CST)
-#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, __ATOMIC_SEQ_CST)
-
 #elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 401
 
 #define atomic_init(obj, value)    ((void)(*(obj) = (value)))
@@ -239,31 +210,6 @@ typedef enum {
 #define atomic_store_explicit(object, desired, order) \
   ((void)atomic_exchange_explicit(object, desired, order))
 
-#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
-  atomic_compare_exchange_strong_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
-  atomic_compare_exchange_weak_explicit(                          \
-      pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
-#define atomic_exchange(pObject, desired) \
-  atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_fetch_add(pObject, operand) \
-  atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_and(pObject, operand) \
-  atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_or(pObject, operand) \
-  atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_sub(pObject, operand) \
-  atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_fetch_xor(pObject, operand) \
-  atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
-#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
-#define atomic_store(pObject, desired) \
-  atomic_store_explicit(pObject, desired, memory_order_seq_cst)
-#define atomic_flag_test_and_set(x) \
-  atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
-#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
-
 #else /* non-gcc or old gcc w/o x86 */
 #error "atomic operations not supported with this compiler and/or architecture"
 #endif
diff --git a/libc/intrin/brain16.c b/libc/intrin/brain16.c
deleted file mode 100644
index 95b0050b8..000000000
--- a/libc/intrin/brain16.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-
-/**
- * @fileoverview bf16 compiler runtime
- */
-
-_Float32 __extendbfsf2(__bf16 f) {
-  union {
-    __bf16 f;
-    uint16_t i;
-  } ub = {f};
-
-  // convert brain16 to binary32
-  uint32_t x = (uint32_t)ub.i << 16;
-
-  // force nan to quiet
-  if ((x & 0x7fffffff) > 0x7f800000)
-    x |= 0x00400000;
-
-  // pun to _Float32
-  union {
-    uint32_t i;
-    _Float32 f;
-  } uf = {x};
-  return uf.f;
-}
-
-_Float64 __extendbfdf2(__bf16 f) {
-  return __extendbfsf2(f);
-}
-
-#ifdef __x86_64__
-__float80 __extendbfxf2(__bf16 f) {
-  return __extendbfsf2(f);
-}
-#endif
-
-#ifdef __aarch64__
-_Float128 __extendbftf2(__bf16 f) {
-  return __extendbfsf2(f);
-}
-#endif
-
-__bf16 __truncsfbf2(_Float32 f) {
-  union {
-    _Float32 f;
-    uint32_t i;
-  } uf = {f};
-  uint32_t x = uf.i;
-
-  if ((x & 0x7fffffff) > 0x7f800000)
-    // force nan to quiet
-    x = (x | 0x00400000) >> 16;
-  else
-    // convert binary32 to brain16 with nearest rounding
-    x = (x + (0x7fff + ((x >> 16) & 1))) >> 16;
-
-  // pun to bf16
-  union {
-    uint16_t i;
-    __bf16 f;
-  } ub = {x};
-  return ub.f;
-}
-
-__bf16 __truncdfbf2(_Float64 f) {
-  return __truncsfbf2(f);
-}
-
-#ifdef __x86_64__
-__bf16 __truncxfbf2(__float80 f) {
-  return __truncsfbf2(f);
-}
-#endif
-
-#ifdef __aarch64__
-__bf16 __trunctfbf2(_Float128 f) {
-  return __truncsfbf2(f);
-}
-#endif
diff --git a/libc/intrin/bzero.c b/libc/intrin/bzero.c
index 2d51a9314..8f5087109 100644
--- a/libc/intrin/bzero.c
+++ b/libc/intrin/bzero.c
@@ -16,18 +16,155 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/dce.h"
+#include "libc/nexgen32e/nexgen32e.h"
+#include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
 
+typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
+typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
+
+static void bzero128(char *p, size_t n) {
+  xmm_t v = {0};
+  if (n <= 32) {
+    *(xmm_t *)(p + n - 16) = v;
+    *(xmm_t *)p = v;
+  } else {
+    do {
+      n -= 32;
+      *(xmm_t *)(p + n) = v;
+      *(xmm_t *)(p + n + 16) = v;
+    } while (n > 32);
+    *(xmm_t *)(p + 16) = v;
+    *(xmm_t *)p = v;
+  }
+}
+
+#if defined(__x86_64__) && !defined(__chibicc__)
+_Microarchitecture("avx") static void bzero_avx(char *p, size_t n) {
+  xmm_t v = {0};
+  if (n <= 32) {
+    *(xmm_t *)(p + n - 16) = v;
+    *(xmm_t *)p = v;
+  } else if (n >= 1024 && X86_HAVE(ERMS)) {
+    asm("rep stosb" : "+D"(p), "+c"(n), "=m"(*(char(*)[n])p) : "a"(0));
+  } else {
+    if (n < kHalfCache3 || !kHalfCache3) {
+      do {
+        n -= 32;
+        *(xmm_t *)(p + n) = v;
+        *(xmm_t *)(p + n + 16) = v;
+      } while (n > 32);
+    } else {
+      while ((uintptr_t)(p + n) & 15) {
+        p[--n] = 0;
+      }
+      do {
+        n -= 32;
+        __builtin_ia32_movntdq((xmm_a *)(p + n), (xmm_a)v);
+        __builtin_ia32_movntdq((xmm_a *)(p + n + 16), (xmm_a)v);
+      } while (n > 32);
+      asm("sfence");
+    }
+    *(xmm_t *)(p + 16) = v;
+    *(xmm_t *)p = v;
+  }
+}
+#endif
+
 /**
  * Sets memory to zero.
  *
+ *     bzero n=0                          661 picoseconds
+ *     bzero n=1                          661 ps/byte          1,476 mb/s
+ *     bzero n=2                          330 ps/byte          2,952 mb/s
+ *     bzero n=3                          220 ps/byte          4,428 mb/s
+ *     bzero n=4                          165 ps/byte          5,904 mb/s
+ *     bzero n=7                           94 ps/byte         10,333 mb/s
+ *     bzero n=8                           41 ps/byte         23,618 mb/s
+ *     bzero n=15                          44 ps/byte         22,142 mb/s
+ *     bzero n=16                          20 ps/byte         47,236 mb/s
+ *     bzero n=31                          21 ps/byte         45,760 mb/s
+ *     bzero n=32                          20 ps/byte         47,236 mb/s
+ *     bzero n=63                          10 ps/byte         92,997 mb/s
+ *     bzero n=64                          15 ps/byte         62,982 mb/s
+ *     bzero n=127                         15 ps/byte         62,490 mb/s
+ *     bzero n=128                         10 ps/byte         94,473 mb/s
+ *     bzero n=255                         14 ps/byte         68,439 mb/s
+ *     bzero n=256                          9 ps/byte            105 gb/s
+ *     bzero n=511                         15 ps/byte         62,859 mb/s
+ *     bzero n=512                         11 ps/byte         83,976 mb/s
+ *     bzero n=1023                        15 ps/byte         61,636 mb/s
+ *     bzero n=1024                        10 ps/byte         88,916 mb/s
+ *     bzero n=2047                         9 ps/byte            105 gb/s
+ *     bzero n=2048                         8 ps/byte            109 gb/s
+ *     bzero n=4095                         8 ps/byte            115 gb/s
+ *     bzero n=4096                         8 ps/byte            118 gb/s
+ *     bzero n=8191                         7 ps/byte            129 gb/s
+ *     bzero n=8192                         7 ps/byte            130 gb/s
+ *     bzero n=16383                        6 ps/byte            136 gb/s
+ *     bzero n=16384                        6 ps/byte            137 gb/s
+ *     bzero n=32767                        6 ps/byte            140 gb/s
+ *     bzero n=32768                        6 ps/byte            141 gb/s
+ *     bzero n=65535                       15 ps/byte         64,257 mb/s
+ *     bzero n=65536                       15 ps/byte         64,279 mb/s
+ *     bzero n=131071                      15 ps/byte         63,166 mb/s
+ *     bzero n=131072                      15 ps/byte         63,115 mb/s
+ *     bzero n=262143                      15 ps/byte         62,052 mb/s
+ *     bzero n=262144                      15 ps/byte         62,097 mb/s
+ *     bzero n=524287                      15 ps/byte         61,699 mb/s
+ *     bzero n=524288                      15 ps/byte         61,674 mb/s
+ *     bzero n=1048575                     16 ps/byte         60,179 mb/s
+ *     bzero n=1048576                     15 ps/byte         61,330 mb/s
+ *     bzero n=2097151                     15 ps/byte         61,071 mb/s
+ *     bzero n=2097152                     15 ps/byte         61,065 mb/s
+ *     bzero n=4194303                     16 ps/byte         60,942 mb/s
+ *     bzero n=4194304                     16 ps/byte         60,947 mb/s
+ *     bzero n=8388607                     16 ps/byte         60,872 mb/s
+ *     bzero n=8388608                     16 ps/byte         60,879 mb/s
+ *
  * @param p is memory address
  * @param n is byte length
  * @return p
  * @asyncsignalsafe
  */
 void bzero(void *p, size_t n) {
-  memset(p, 0, n);
+  char *b;
+  uint64_t x;
+  b = p;
+#ifdef __x86_64__
+  asm("xorl\t%k0,%k0" : "=r"(x));
+#else
+  if (1) {
+    memset(p, 0, n);
+    return;
+  }
+  x = 0;
+#endif
+  if (n <= 16) {
+    if (n >= 8) {
+      __builtin_memcpy(b, &x, 8);
+      __builtin_memcpy(b + n - 8, &x, 8);
+    } else if (n >= 4) {
+      __builtin_memcpy(b, &x, 4);
+      __builtin_memcpy(b + n - 4, &x, 4);
+    } else if (n) {
+      do {
+        asm volatile("" ::: "memory");
+        b[--n] = x;
+      } while (n);
+    }
+#if defined(__x86_64__) && !defined(__chibicc__)
+  } else if (IsTiny()) {
+    asm("rep stosb" : "+D"(b), "+c"(n), "=m"(*(char(*)[n])b) : "a"(0));
+    return;
+  } else if (X86_HAVE(AVX)) {
+    bzero_avx(b, n);
+#endif
+  } else {
+    bzero128(b, n);
+  }
 }
 
 __weak_reference(bzero, explicit_bzero);
diff --git a/libc/intrin/clock_gettime-nt.c b/libc/intrin/clock_gettime-nt.c
deleted file mode 100644
index 9020e9cfd..000000000
--- a/libc/intrin/clock_gettime-nt.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timespec.h"
-#include "libc/calls/struct/timespec.internal.h"
-#include "libc/dce.h"
-#include "libc/errno.h"
-#include "libc/fmt/wintime.internal.h"
-#include "libc/nt/accounting.h"
-#include "libc/nt/runtime.h"
-#include "libc/nt/synchronization.h"
-#include "libc/nt/thread.h"
-#include "libc/nt/time.h"
-#ifdef __x86_64__
-
-#define _CLOCK_REALTIME           0
-#define _CLOCK_MONOTONIC          1
-#define _CLOCK_REALTIME_COARSE    2
-#define _CLOCK_BOOTTIME           3
-#define _CLOCK_PROCESS_CPUTIME_ID 4
-#define _CLOCK_THREAD_CPUTIME_ID  5
-#define _CLOCK_MONOTONIC_COARSE   6
-
-textwindows int sys_clock_gettime_nt(int clock, struct timespec *ts) {
-  uint64_t hectons;
-  struct NtFileTime ft, ftExit, ftUser, ftKernel, ftCreation;
-  switch (clock) {
-    case _CLOCK_REALTIME:
-      GetSystemTimePreciseAsFileTime(&ft);
-      *ts = FileTimeToTimeSpec(ft);
-      return 0;
-    case _CLOCK_REALTIME_COARSE:
-      GetSystemTimeAsFileTime(&ft);
-      *ts = FileTimeToTimeSpec(ft);
-      return 0;
-    case _CLOCK_MONOTONIC:
-      //
-      // "If you need a higher resolution timer, use the
-      //  QueryUnbiasedInterruptTime function, a multimedia timer, or a
-      //  high-resolution timer. The elapsed time retrieved by the
-      //  QueryUnbiasedInterruptTime function includes only time that
-      //  the system spends in the working state."
-      //
-      //                     —Quoth MSDN § Windows Time
-      //
-      QueryUnbiasedInterruptTimePrecise(&hectons);
-      *ts = WindowsDurationToTimeSpec(hectons);
-      return 0;
-    case _CLOCK_MONOTONIC_COARSE:
-      //
-      // "QueryUnbiasedInterruptTimePrecise is similar to the
-      //  QueryUnbiasedInterruptTime routine, but is more precise. The
-      //  interrupt time reported by QueryUnbiasedInterruptTime is based
-      //  on the latest tick of the system clock timer. The system clock
-      //  timer is the hardware timer that periodically generates
-      //  interrupts for the system clock. The uniform period between
-      //  system clock timer interrupts is referred to as a system clock
-      //  tick, and is typically in the range of 0.5 milliseconds to
-      //  15.625 milliseconds, depending on the hardware platform. The
-      //  interrupt time value retrieved by QueryUnbiasedInterruptTime
-      //  is accurate within a system clock tick. ¶To provide a system
-      //  time value that is more precise than that of
-      //  QueryUnbiasedInterruptTime, QueryUnbiasedInterruptTimePrecise
-      //  reads the timer hardware directly, therefore a
-      //  QueryUnbiasedInterruptTimePrecise call can be slower than a
-      //  QueryUnbiasedInterruptTime call."
-      //
-      //                     —Quoth MSDN § QueryUnbiasedInterruptTimePrecise
-      //
-      QueryUnbiasedInterruptTime(&hectons);
-      *ts = WindowsDurationToTimeSpec(hectons);
-      return 0;
-    case _CLOCK_BOOTTIME:
-      //
-      // "Unbiased interrupt-time means that only time that the system
-      //  is in the working state is counted; therefore, the interrupt
-      //  time count is not "biased" by time the system spends in sleep
-      //  or hibernation."
-      //
-      //                     —Quoth MSDN § Interrupt Time
-      //
-      QueryInterruptTimePrecise(&hectons);
-      *ts = WindowsDurationToTimeSpec(hectons);
-      return 0;
-    case _CLOCK_PROCESS_CPUTIME_ID:
-      GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit, &ftKernel,
-                      &ftUser);
-      *ts = WindowsDurationToTimeSpec(ReadFileTime(ftUser) +
-                                      ReadFileTime(ftKernel));
-      return 0;
-    case _CLOCK_THREAD_CPUTIME_ID:
-      GetThreadTimes(GetCurrentThread(), &ftCreation, &ftExit, &ftKernel,
-                     &ftUser);
-      *ts = WindowsDurationToTimeSpec(ReadFileTime(ftUser) +
-                                      ReadFileTime(ftKernel));
-      return 0;
-    default:
-      return -EINVAL;
-  }
-}
-
-#endif  // __x86_64__
diff --git a/libc/intrin/clock_gettime.c b/libc/intrin/clock_gettime.c
deleted file mode 100644
index c087d8145..000000000
--- a/libc/intrin/clock_gettime.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=8 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timespec.h"
-#include "libc/calls/struct/timespec.internal.h"
-#include "libc/calls/syscall_support-sysv.internal.h"
-#include "libc/dce.h"
-#include "libc/errno.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/strace.h"
-#include "libc/runtime/syslib.internal.h"
-#include "libc/sysv/consts/clock.h"
-
-#ifdef __aarch64__
-#define CGT_VDSO __vdsosym("LINUX_2.6.39", "__kernel_clock_gettime")
-#else
-#define CGT_VDSO __vdsosym("LINUX_2.6", "__vdso_clock_gettime")
-#endif
-
-typedef int clock_gettime_f(int, struct timespec *);
-
-static clock_gettime_f *__clock_gettime_get(void) {
-  clock_gettime_f *cgt;
-  if (IsLinux() && (cgt = CGT_VDSO)) {
-    return cgt;
-  } else if (__syslib) {
-    return (void *)__syslib->__clock_gettime;
-#ifdef __x86_64__
-  } else if (IsWindows()) {
-    return sys_clock_gettime_nt;
-  } else if (IsXnu()) {
-    return sys_clock_gettime_xnu;
-#endif
-  } else {
-    return sys_clock_gettime;
-  }
-}
-
-static int __clock_gettime_init(int, struct timespec *);
-static clock_gettime_f *__clock_gettime = __clock_gettime_init;
-static int __clock_gettime_init(int clockid, struct timespec *ts) {
-  clock_gettime_f *cgt;
-  __clock_gettime = cgt = __clock_gettime_get();
-  return cgt(clockid, ts);
-}
-
-static int clock_gettime_impl(int clock, struct timespec *ts) {
-  // BSDs and sometimes Linux too will crash when `ts` is NULL
-  // it's also nice to not have to check for null in polyfills
-  struct timespec memory;
-  if (!ts)
-    ts = &memory;
-  return __clock_gettime(clock, ts);
-}
-
-/**
- * Returns nanosecond time.
- *
- * The `clock` parameter may bo set to:
- *
- * - `CLOCK_REALTIME` returns a wall clock timestamp represented in
- *   nanoseconds since the UNIX epoch (~1970). It'll count time in the
- *   suspend state. This clock is subject to being smeared by various
- *   adjustments made by NTP. These timestamps can have unpredictable
- *   discontinuous jumps when clock_settime() is used. Therefore this
- *   clock is the default clock for everything, even pthread condition
- *   variables. Cosmopoiltan guarantees this clock will never raise
- *   `EINVAL` and also guarantees `CLOCK_REALTIME == 0` will always be
- *   the case. On Windows this maps to GetSystemTimePreciseAsFileTime().
- *   On platforms with vDSOs like Linux, Windows, and MacOS ARM64 this
- *   should take about 20 nanoseconds.
- *
- * - `CLOCK_MONOTONIC` returns a timestamp with an unspecified epoch,
- *   that should be when the system was powered on. These timestamps
- *   shouldn't go backwards. Timestamps shouldn't count time spent in
- *   the sleep, suspend, and hibernation states. These timestamps won't
- *   be impacted by clock_settime(). These timestamps may be impacted by
- *   frequency adjustments made by NTP. Cosmopoiltan guarantees this
- *   clock will never raise `EINVAL`. MacOS and BSDs use the word
- *   "uptime" to describe this clock. On Windows this maps to
- *   QueryUnbiasedInterruptTimePrecise().
- *
- * - `CLOCK_BOOTTIME` is a monotonic clock returning a timestamp with an
- *   unspecified epoch, that should be relative to when the host system
- *   was powered on. These timestamps shouldn't go backwards. Timestamps
- *   should also include time spent in a sleep, suspend, or hibernation
- *   state. These timestamps aren't impacted by clock_settime(), but
- *   they may be impacted by frequency adjustments made by NTP. This
- *   clock will raise an `EINVAL` error on extremely old Linux distros
- *   like RHEL5. MacOS and BSDs use the word "monotonic" to describe
- *   this clock. On Windows this maps to QueryInterruptTimePrecise().
- *
- * - `CLOCK_MONOTONIC_RAW` returns a timestamp from an unspecified
- *   epoch. These timestamps don't count time spent in the sleep,
- *   suspend, and hibernation states. This clock is not impacted by
- *   clock_settime(). Unlike `CLOCK_MONOTONIC` this clock is guaranteed
- *   to not be impacted by frequency adjustments. Providing this level
- *   of assurances may make this clock 10x slower than the monotonic
- *   clock. Furthermore this clock may cause `EINVAL` to be raised if
- *   running on a host system that doesn't provide those guarantees,
- *   e.g. OpenBSD and MacOS on AMD64.
- *
- * - `CLOCK_REALTIME_COARSE` is the same as `CLOCK_REALTIME` except
- *   it'll go faster if the host OS provides a cheaper way to read the
- *   wall time. Please be warned that coarse can be really coarse.
- *   Rather than nano precision, you're looking at `CLK_TCK` precision,
- *   which can lag as far as 30 milliseconds behind or possibly more.
- *   Cosmopolitan may fallback to `CLOCK_REALTIME` if a faster less
- *   accurate clock isn't provided by the system. This clock will raise
- *   an `EINVAL` error on extremely old Linux distros like RHEL5. On
- *   platforms with vDSOs like Linux, Windows, and MacOS ARM64 this
- *   should take about 5 nanoseconds.
- *
- * - `CLOCK_MONOTONIC_COARSE` is the same as `CLOCK_MONOTONIC` except
- *   it'll go faster if the host OS provides a cheaper way to read the
- *   unbiased time. Please be warned that coarse can be really coarse.
- *   Rather than nano precision, you're looking at `CLK_TCK` precision,
- *   which can lag as far as 30 milliseconds behind or possibly more.
- *   Cosmopolitan may fallback to `CLOCK_REALTIME` if a faster less
- *   accurate clock isn't provided by the system. This clock will raise
- *   an `EINVAL` error on extremely old Linux distros like RHEL5. On
- *   platforms with vDSOs like Linux, Windows, and MacOS ARM64 this
- *   should take about 5 nanoseconds.
- *
- * - `CLOCK_PROCESS_CPUTIME_ID` returns the amount of time this process
- *   was actively scheduled. This is similar to getrusage() and clock().
- *
- * - `CLOCK_THREAD_CPUTIME_ID` returns the amount of time this thread
- *   was actively scheduled. This is similar to getrusage() and clock().
- *
- * @param ts is where the result is stored (or null to do clock check)
- * @return 0 on success, or -1 w/ errno
- * @raise EFAULT if `ts` points to invalid memory
- * @error EINVAL if `clock` isn't supported on this system
- * @error EPERM if pledge() is in play without stdio promise
- * @error ESRCH on NetBSD if PID/TID OR'd into `clock` wasn't found
- * @see strftime(), gettimeofday()
- * @asyncsignalsafe
- * @vforksafe
- */
-int clock_gettime(int clock, struct timespec *ts) {
-  int rc = clock_gettime_impl(clock, ts);
-  if (rc) {
-    errno = -rc;
-    rc = -1;
-  }
-  TIMETRACE("clock_gettime(%s, [%s]) → %d% m", DescribeClockName(clock),
-            DescribeTimespec(rc, ts), rc);
-  return rc;
-}
diff --git a/libc/intrin/count.c b/libc/intrin/count.c
deleted file mode 100644
index d4f4365bb..000000000
--- a/libc/intrin/count.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
-#include "libc/stdalign.h"
-#include "libc/thread/thread.h"
-
-// this counter is important because pthread_exit() needs to know if
-// it's an orphan thread, without needing to acquire _pthread_lock()
-// which causes contention and a file descriptor explosion on netbsd
-alignas(64) atomic_uint _pthread_count = 1;
diff --git a/libc/intrin/cp.c b/libc/intrin/cp.c
index d98c36e66..5f4061033 100644
--- a/libc/intrin/cp.c
+++ b/libc/intrin/cp.c
@@ -18,9 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/blockcancel.internal.h"
 #include "libc/calls/cp.internal.h"
-#include "libc/intrin/describebacktrace.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/nexgen32e/stackframe.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/thread/posixthread.internal.h"
@@ -49,11 +46,7 @@ void end_cancelation_point(int state) {
   }
 }
 
-void report_cancelation_point(int sysv_ordinal, int xnu_ordinal) {
-  char bt[160];
-  struct StackFrame *bp = __builtin_frame_address(0);
-  kprintf("error: report_cancelation_point(%#x, %#x) %s\n", sysv_ordinal,
-          xnu_ordinal, _DescribeBacktrace(bt, bp));
+void report_cancelation_point(void) {
   __builtin_trap();
 }
 
diff --git a/libc/intrin/createfile.c b/libc/intrin/createfile.c
index 3063379b1..265675a1b 100644
--- a/libc/intrin/createfile.c
+++ b/libc/intrin/createfile.c
@@ -56,12 +56,12 @@ TryAgain:
   hHandle = __imp_CreateFileW(lpFileName, dwDesiredAccess, dwShareMode,
                               opt_lpSecurity, dwCreationDisposition,
                               dwFlagsAndAttributes, opt_hTemplateFile);
-  NTTRACE("CreateFile(%#!hs, %s, %s, %s, %s, %s, %ld) → {%ld, %d}", lpFileName,
-          _DescribeNtFileAccessFlags(buf_accessflags, dwDesiredAccess),
-          _DescribeNtFileShareFlags(buf_shareflags, dwShareMode),
-          _DescribeNtSecurityAttributes(buf_secattr, opt_lpSecurity),
-          _DescribeNtCreationDisposition(dwCreationDisposition),
-          _DescribeNtFileFlagAttr(buf_flagattr, dwFlagsAndAttributes),
+  NTTRACE("CreateFile(%#hs, %s, %s, %s, %s, %s, %ld) → {%ld, %d}", lpFileName,
+          (DescribeNtFileAccessFlags)(buf_accessflags, dwDesiredAccess),
+          (DescribeNtFileShareFlags)(buf_shareflags, dwShareMode),
+          (DescribeNtSecurityAttributes)(buf_secattr, opt_lpSecurity),
+          DescribeNtCreationDisposition(dwCreationDisposition),
+          (DescribeNtFileFlagAttr)(buf_flagattr, dwFlagsAndAttributes),
           opt_hTemplateFile, hHandle, __imp_GetLastError());
   if (hHandle == -1) {
     switch (__imp_GetLastError()) {
diff --git a/libc/intrin/cxaatexit.c b/libc/intrin/cxaatexit.c
index eca0952ce..7f13261bf 100644
--- a/libc/intrin/cxaatexit.c
+++ b/libc/intrin/cxaatexit.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/cxaatexit.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/errfuns.h"
diff --git a/libc/intrin/cxaatexit.h b/libc/intrin/cxaatexit.h
index ac89d7614..45b566b70 100644
--- a/libc/intrin/cxaatexit.h
+++ b/libc/intrin/cxaatexit.h
@@ -18,9 +18,9 @@ struct CxaAtexitBlocks {
 
 extern struct CxaAtexitBlocks __cxa_blocks;
 
-void __cxa_lock(void) dontthrow;
-void __cxa_unlock(void) dontthrow;
-void __cxa_thread_finalize(void) dontthrow;
+void __cxa_lock(void) libcesque;
+void __cxa_unlock(void) libcesque;
+void __cxa_thread_finalize(void) libcesque;
 void __cxa_printexits(FILE *, void *) libcesque;
 int __cxa_thread_atexit_impl(void *, void *, void *);
 
diff --git a/libc/intrin/cxalock.c b/libc/intrin/cxalock.c
index cb5256757..e0d43f534 100644
--- a/libc/intrin/cxalock.c
+++ b/libc/intrin/cxalock.c
@@ -17,15 +17,22 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/cxaatexit.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
-pthread_mutex_t __cxa_lock_obj = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t __cxa_lock_obj;
+
+void __cxa_wipe(void) {
+  pthread_mutex_init(&__cxa_lock_obj, 0);
+}
 
 void __cxa_lock(void) {
-  _pthread_mutex_lock(&__cxa_lock_obj);
+  pthread_mutex_lock(&__cxa_lock_obj);
 }
 
 void __cxa_unlock(void) {
-  _pthread_mutex_unlock(&__cxa_lock_obj);
+  pthread_mutex_unlock(&__cxa_lock_obj);
+}
+
+__attribute__((__constructor__(60))) static textstartup void __cxa_init() {
+  pthread_atfork(__cxa_lock, __cxa_unlock, __cxa_wipe);
 }
diff --git a/libc/intrin/deadlock.c b/libc/intrin/deadlock.c
deleted file mode 100644
index 57da577a4..000000000
--- a/libc/intrin/deadlock.c
+++ /dev/null
@@ -1,277 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "ape/sections.internal.h"
-#include "libc/assert.h"
-#include "libc/atomic.h"
-#include "libc/cosmo.h"
-#include "libc/dce.h"
-#include "libc/errno.h"
-#include "libc/intrin/atomic.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/maps.h"
-#include "libc/macros.h"
-#include "libc/str/str.h"
-#include "libc/thread/lock.h"
-#include "libc/thread/thread.h"
-#include "libc/thread/tls.h"
-
-/**
- * @fileoverview deadlock detector for statically allocated locks
- *
- * This module helps you spot multi-threading bugs in your program.
- * High-level abstractions like mutexes are much easier to use than
- * atomics, but they still carry their own non-obvious dangers. For
- * example, nesting locks need to be nested in a consistent way and
- * normal mutexes can't be required recursively. Normally this will
- * cause your program to deadlock, i.e. hang indefinitely, but this
- * module can detect such conditions and return errors instead, and
- * better yet print helpful information when using `cosmocc -mdbg`.
- */
-
-#define ABI privileged optimizesize
-
-// building our visitor function using this optimizesize keyword shrinks
-// the stack memory requirement from 7168 to 2048 bytes. totally amazing
-// although please note this maximum isn't a hard limit. for normal mode
-// builds your posix mandated mutex error checking will be less accurate
-// but still helpful and reliable, although your cosmocc -mdbg will trap
-// and report that you've run into the limit, so you can talk to justine
-#define MAX_LOCKS 64
-
-// cosmo's tib reserves space for 64 nested locks before things degrade.
-// the cosmopolitan c runtime defines 16 locks, which are all registered
-// with pthread_atfork(). it means you get to have 48 mutexes right now,
-// and if you register all of them, then calling fork() will cause there
-// to be 2080 edges in your lock graph. talk to justine if you need more
-// because we're obviously going to need to find a way to make this grow
-#define LOCK_EDGES_MAX 2080
-
-// supported lock objects must define `void *_edges`
-#define LOCK_EDGES_OFFSET 0
-static_assert(offsetof(struct MapLock, edges) == LOCK_EDGES_OFFSET);
-static_assert(offsetof(pthread_mutex_t, _edges) == LOCK_EDGES_OFFSET);
-
-struct LockEdge {
-  struct LockEdge *next;
-  void *dest;
-};
-
-struct VisitedLock {
-  struct VisitedLock *next;
-  void *lock;
-};
-
-typedef _Atomic(struct LockEdge *) LockEdges;
-
-static struct DeadlockDetector {
-  atomic_size_t edges_allocated;
-  struct LockEdge edges_memory[LOCK_EDGES_MAX];
-} __deadlock;
-
-forceinline struct CosmoTib *__deadlock_tls(void) {
-  return __get_tls_privileged();
-}
-
-forceinline LockEdges *get_lock_edges(void *lock) {
-  return (LockEdges *)((char *)lock + LOCK_EDGES_OFFSET);
-}
-
-forceinline struct LockEdge *load_lock_edges(LockEdges *edges) {
-  return atomic_load_explicit(edges, memory_order_relaxed);
-}
-
-ABI static int is_static_memory(void *lock) {
-  return _etext <= (unsigned char *)lock && (unsigned char *)lock < _end;
-}
-
-ABI static struct LockEdge *__deadlock_alloc(void) {
-  size_t edges_allocated =
-      atomic_load_explicit(&__deadlock.edges_allocated, memory_order_relaxed);
-  for (;;) {
-    if (edges_allocated == LOCK_EDGES_MAX) {
-      if (IsModeDbg()) {
-        kprintf("error: cosmo LOCK_EDGES_MAX needs to be increased\n");
-        DebugBreak();
-      }
-      return 0;
-    }
-    if (atomic_compare_exchange_weak_explicit(
-            &__deadlock.edges_allocated, &edges_allocated, edges_allocated + 1,
-            memory_order_relaxed, memory_order_relaxed))
-      return &__deadlock.edges_memory[edges_allocated];
-  }
-}
-
-ABI static void __deadlock_add_edge(void *from, void *dest) {
-  LockEdges *edges = get_lock_edges(from);
-  for (struct LockEdge *e = load_lock_edges(edges); e; e = e->next)
-    if (e->dest == dest)
-      return;
-  struct LockEdge *edge;
-  if ((edge = __deadlock_alloc())) {
-    edge->next = load_lock_edges(edges);
-    edge->dest = dest;
-    // we tolerate duplicate elements in the interest of performance.
-    // once an element is inserted, it's never removed. that's why we
-    // don't need need to worry about the aba problem. the cas itself
-    // is very important since it ensures inserted edges aren't lost.
-    for (;;)
-      if (atomic_compare_exchange_weak_explicit(edges, &edge->next, edge,
-                                                memory_order_relaxed,
-                                                memory_order_relaxed))
-        break;
-  }
-}
-
-ABI static bool __deadlock_visit(void *lock, struct VisitedLock *visited,
-                                 int notrap, int depth) {
-  if (++depth == MAX_LOCKS) {
-    if (IsModeDbg()) {
-      kprintf("error: too much recursion in deadlock detector\n");
-      DebugBreak();
-    }
-    return false;
-  }
-  for (struct VisitedLock *v = visited; v; v = v->next) {
-    if (v->lock == lock) {
-      if (IsModeDbg() && !notrap) {
-        // lock hierarchy violated!
-        //
-        // when you lock mutexes in a nested way, your locks must be
-        // nested in the same order globally. otherwise deadlocks might
-        // occur. for example, if you say in your first thread
-        //
-        //     pthread_mutex_lock(&x);
-        //     pthread_mutex_lock(&y);
-        //     pthread_mutex_unlock(&y);
-        //     pthread_mutex_unlock(&x);
-        //
-        // then in your second thread you say
-        //
-        //     pthread_mutex_lock(&y);
-        //     pthread_mutex_lock(&x);
-        //     pthread_mutex_unlock(&x);
-        //     pthread_mutex_unlock(&y);
-        //
-        // then a deadlock might happen, because {x→y, y→x} is cyclic!
-        // they don't happen often, but this is the kind of thing that
-        // matters if you want to build carrier grade production stuff
-        kprintf("error: cycle detected in directed graph of nested locks\n");
-        for (struct VisitedLock *v = visited; v; v = v->next)
-          kprintf("\t- %t\n", v->lock);  // strongly connected component
-        DebugBreak();
-      }
-      return true;
-    }
-  }
-  LockEdges *edges = get_lock_edges(lock);
-  struct VisitedLock visit = {visited, lock};
-  for (struct LockEdge *e = load_lock_edges(edges); e; e = e->next)
-    if (__deadlock_visit(e->dest, &visit, notrap, depth))
-      return true;
-  return false;
-}
-
-/**
- * Returns true if lock is already locked by calling thread.
- *
- * This function may return false negatives if we run out of TLS memory.
- * That suboptimal condition will be reported in debug mode.
- *
- * @return 1 if lock is certainly owned by calling thread, 0 if lock is
- *     certainly not owned by calling thread, and -1 if we're uncertain
- */
-ABI int __deadlock_tracked(void *lock) {
-  int full = 1;
-  int owned = 0;
-  struct CosmoTib *tib = __deadlock_tls();
-  for (int i = 0; i < ARRAYLEN(tib->tib_locks); ++i) {
-    full &= tib->tib_locks[i] != NULL;
-    owned |= tib->tib_locks[i] == lock;
-  }
-  if (full)
-    return -1;
-  if (!owned && !is_static_memory(lock))
-    return -1;
-  return owned;
-}
-
-/**
- * Records that lock is held by thread.
- * @param notrap can prevent error printing and debug breaking
- * @asyncsignalsafe
- */
-ABI void __deadlock_track(void *lock, int notrap) {
-  if (!notrap && !is_static_memory(lock))
-    return;
-  struct CosmoTib *tib = __deadlock_tls();
-  for (int i = 0; i < ARRAYLEN(tib->tib_locks); ++i) {
-    if (!tib->tib_locks[i]) {
-      tib->tib_locks[i] = lock;
-      return;
-    }
-  }
-  if (IsModeDbg()) {
-    kprintf("error: cosmo tls max lock depth needs to be increased!\n");
-    DebugBreak();
-  }
-}
-
-/**
- * Records relationship for all held locks to `lock`.
- * @param notrap can prevent error printing and debug breaking
- * @asyncsignalsafe
- */
-ABI void __deadlock_record(void *lock, int notrap) {
-  if (!notrap && !is_static_memory(lock))
-    return;
-  struct CosmoTib *tib = __deadlock_tls();
-  for (int i = 0; i < ARRAYLEN(tib->tib_locks); ++i)
-    if (tib->tib_locks[i] && tib->tib_locks[i] != lock)
-      __deadlock_add_edge(tib->tib_locks[i], lock);
-}
-
-/**
- * Returns EDEADLK if locking `lock` could cause a deadlock.
- * @param notrap can prevent error printing and debug breaking
- * @asyncsignalsafe
- */
-ABI int __deadlock_check(void *lock, int notrap) {
-  struct CosmoTib *tib = __deadlock_tls();
-  for (int i = 0; i < ARRAYLEN(tib->tib_locks); ++i) {
-    if (tib->tib_locks[i] == lock)
-      return 0;
-    if (tib->tib_locks[i]) {
-      struct VisitedLock visit = {0, tib->tib_locks[i]};
-      if (__deadlock_visit(lock, &visit, notrap, 0))
-        return EDEADLK;
-    }
-  }
-  return 0;
-}
-
-/**
- * Records that lock isn't held by thread.
- * @asyncsignalsafe
- */
-ABI void __deadlock_untrack(void *lock) {
-  struct CosmoTib *tib = __deadlock_tls();
-  for (int i = 0; i < ARRAYLEN(tib->tib_locks); ++i)
-    tib->tib_locks[i] = tib->tib_locks[i] != lock ? tib->tib_locks[i] : 0;
-}
diff --git a/libc/intrin/demangle.c b/libc/intrin/demangle.c
index c44803f12..ad21c1eb1 100644
--- a/libc/intrin/demangle.c
+++ b/libc/intrin/demangle.c
@@ -91,8 +91,6 @@ Copyright (c) 2024 Justine Tunney <jtunney@gmail.com>");
  *
  */
 
-#define ABI privileged optimizesize
-
 #define DEMANGLE_NO_FLOATING_POINT
 
 #define ASSERT(x)	    (void)0
@@ -105,7 +103,6 @@ Copyright (c) 2024 Justine Tunney <jtunney@gmail.com>");
 #define ELFTC_SUCCESS	    1
 
 #define VECTOR_DEF_CAPACITY 1
-#define MAX_DEPTH           20
 
 typedef unsigned short index_t;
 
@@ -191,7 +188,6 @@ struct demangle_data {
 	enum type_qualifier ref_qualifier_type; /* ref qualifier type */
 	enum push_qualifier push_qualifier;	/* which qualifiers to push */
 	int func_type;
-	int depth;
 	const char *cur;	/* current mangled name ptr */
 	const char *last_sname; /* last source name */
 	intptr_t jmpbuf[5];
@@ -224,18 +220,16 @@ static int demangle_read_sname(struct demangle_data *);
 static int demangle_read_subst(struct demangle_data *);
 static int demangle_read_type(struct demangle_data *, struct type_delimit *);
 
-ABI static size_t
+static privileged size_t
 demangle_strlen(const char *s)
 {
 	size_t n = 0;
-	while (*s++) {
-		asm volatile("" ::: "memory");
+	while (*s++)
 		++n;
-	}
 	return n;
 }
 
-ABI static char *
+static privileged char *
 demangle_stpcpy(char *d, const char *s)
 {
 	size_t i = 0;
@@ -246,7 +240,7 @@ demangle_stpcpy(char *d, const char *s)
 	}
 }
 
-ABI static void *
+static privileged void *
 demangle_mempcpy(void *a, const void *b, size_t n)
 {
 	char *d = a;
@@ -256,14 +250,14 @@ demangle_mempcpy(void *a, const void *b, size_t n)
 	return d;
 }
 
-ABI static void *
+static privileged void *
 demangle_memcpy(void *a, const void *b, size_t n)
 {
 	demangle_mempcpy(a, b, n);
 	return a;
 }
 
-ABI static int
+static privileged int
 demangle_strncmp(const char *a, const char *b, size_t n)
 {
 	size_t i = 0;
@@ -274,7 +268,7 @@ demangle_strncmp(const char *a, const char *b, size_t n)
 	return (a[i] & 0xff) - (b[i] & 0xff);
 }
 
-ABI static int
+static privileged int
 demangle_memcmp(const void *a, const void *b, size_t n)
 {
 	int c;
@@ -289,7 +283,7 @@ demangle_memcmp(const void *a, const void *b, size_t n)
 	return 0;
 }
 
-ABI static void
+static privileged void
 demangle_strlcpy(char *dst, const char *src, size_t dsize)
 {
 	size_t remain;
@@ -301,7 +295,7 @@ demangle_strlcpy(char *dst, const char *src, size_t dsize)
 		*dst = 0;
 }
 
-ABI static long
+static privileged long
 demangle_strtol(const char *s, int base)
 {
 	static const uint8_t demangle_base36[80] = { 1, 2, 3, 4, 5, 6, 7, 8, 9,
@@ -318,7 +312,7 @@ demangle_strtol(const char *s, int base)
 	return x;
 }
 
-ABI static char *
+static privileged char *
 demangle_strstr(const char *haystack, const char *needle)
 {
 	size_t i;
@@ -339,7 +333,7 @@ demangle_strstr(const char *haystack, const char *needle)
 	return 0;
 }
 
-ABI static char *
+static privileged char *
 demangle_utoa(char *p, unsigned long long x)
 {
 	char t;
@@ -360,7 +354,7 @@ demangle_utoa(char *p, unsigned long long x)
 	return p + i;
 }
 
-ABI static char *
+static privileged char *
 demangle_itoa(char *p, long long x)
 {
 	if (x < 0)
@@ -368,7 +362,7 @@ demangle_itoa(char *p, long long x)
 	return demangle_utoa(p, x);
 }
 
-ABI static void
+static privileged void
 demangle_free(struct demangle_data *h, void *ptr)
 {
 	index_t base;
@@ -385,17 +379,14 @@ demangle_free(struct demangle_data *h, void *ptr)
 	}
 }
 
-ABI static returnspointerwithnoaliases returnsnonnull void *
-demangle_malloc(struct demangle_data *h, long a, long n)
+static privileged returnspointerwithnoaliases returnsnonnull void *
+demangle_malloc(struct demangle_data *h, int a, int n)
 {
-	long rem;
+	int rem;
 	uintptr_t ptr;
 	index_t next, next2;
 	index_t *link, *link2;
-	long b = sizeof(index_t);
-
-	if (n < 0 || n >= 32768)
-		__builtin_longjmp(h->jmpbuf, 1);
+	int b = sizeof(index_t);
 
 	/* Roundup size. */
 	n += a - 1;
@@ -442,7 +433,7 @@ demangle_malloc(struct demangle_data *h, long a, long n)
 	}
 }
 
-ABI static returnspointerwithnoaliases char *
+static privileged returnspointerwithnoaliases char *
 demangle_strdup(struct demangle_data *h, const char *s)
 {
 	char *d = 0;
@@ -454,7 +445,7 @@ demangle_strdup(struct demangle_data *h, const char *s)
 	return d;
 }
 
-ABI static void
+static privileged void
 demangle_vector_str_dest(struct demangle_data *h, struct vector_str *v)
 {
 	int i;
@@ -463,7 +454,7 @@ demangle_vector_str_dest(struct demangle_data *h, struct vector_str *v)
 	demangle_free(h, v->container);
 }
 
-ABI static void
+static privileged void
 demangle_vector_type_qualifier_dest(struct demangle_data *d,
     struct vector_type_qualifier *v)
 {
@@ -471,7 +462,7 @@ demangle_vector_type_qualifier_dest(struct demangle_data *d,
 	demangle_vector_str_dest(d, &v->ext_name);
 }
 
-ABI static void
+static privileged void
 demangle_stack_str_init(struct stack_str *ss)
 {
 	ss->str = ss->buf;
@@ -480,7 +471,7 @@ demangle_stack_str_init(struct stack_str *ss)
 	ss->cap = sizeof(ss->buf);
 }
 
-ABI static void
+static privileged void
 demangle_stack_str_append(struct demangle_data *h, struct stack_str *ss,
     const char *str, size_t len)
 {
@@ -503,7 +494,7 @@ demangle_stack_str_append(struct demangle_data *h, struct stack_str *ss,
 #define demangle_stack_str_append_str(h, ss, s) \
 	demangle_stack_str_append(h, ss, s, demangle_strlen(s))
 
-ABI static size_t
+static privileged size_t
 demangle_get_strlen_sum(struct demangle_data *h, const struct vector_str *v)
 {
 	size_t i, len = 0;
@@ -513,7 +504,7 @@ demangle_get_strlen_sum(struct demangle_data *h, const struct vector_str *v)
 	return len;
 }
 
-ABI static int
+static privileged int
 demangle_demangle_strncmp(const char *a, const char *b, size_t n)
 {
 	size_t i = 0;
@@ -531,7 +522,7 @@ demangle_demangle_strncmp(const char *a, const char *b, size_t n)
  * @param l Length of the string.
  * @return -1 at failed, 0 at not found, 1 at found.
  */
-ABI static int
+static privileged int
 demangle_vector_str_find(struct demangle_data *h, const struct vector_str *v,
     const char *o, size_t l)
 {
@@ -555,7 +546,7 @@ demangle_vector_str_find(struct demangle_data *h, const struct vector_str *v,
  * @param l Length of the string.
  * @return NULL at failed or NUL terminated new allocated string.
  */
-ABI static char *
+static privileged char *
 demangle_vector_str_get_flat(struct demangle_data *ddata,
     const struct vector_str *v, size_t *l)
 {
@@ -581,7 +572,7 @@ demangle_vector_str_get_flat(struct demangle_data *ddata,
 	return rtn;
 }
 
-ABI static void
+static privileged void
 demangle_vector_str_grow(struct demangle_data *ddata, struct vector_str *v)
 {
 	size_t i, tmp_cap;
@@ -609,7 +600,7 @@ demangle_vector_str_grow(struct demangle_data *ddata, struct vector_str *v)
  * @brief Initialize vector_str.
  * @return false at failed, true at success.
  */
-ABI static void
+static privileged void
 demangle_vector_str_init(struct demangle_data *ddata, struct vector_str *v)
 {
 	v->size = 0;
@@ -625,7 +616,7 @@ demangle_vector_str_init(struct demangle_data *ddata, struct vector_str *v)
  * @brief Remove last element in vector_str.
  * @return false at failed, true at success.
  */
-ABI static bool
+static privileged bool
 demangle_vector_str_pop(struct vector_str *v)
 {
 	if (!v)
@@ -645,7 +636,7 @@ demangle_vector_str_pop(struct vector_str *v)
  * @brief Push back string to vector.
  * @return false at failed, true at success.
  */
-ABI static bool
+static privileged bool
 demangle_vector_str_push(struct demangle_data *ddata, struct vector_str *v,
     const char *str, size_t len)
 {
@@ -669,7 +660,7 @@ demangle_vector_str_push(struct demangle_data *ddata, struct vector_str *v,
  * @brief Push front org vector to det vector.
  * @return false at failed, true at success.
  */
-ABI static bool
+static privileged bool
 demangle_vector_str_push_vector_head(struct demangle_data *ddata,
     struct vector_str *dst, struct vector_str *org)
 {
@@ -702,7 +693,7 @@ demangle_vector_str_push_vector_head(struct demangle_data *ddata,
  * @brief Push org vector to the tail of det vector.
  * @return false at failed, true at success.
  */
-ABI static bool
+static privileged bool
 demangle_vector_str_push_vector(struct demangle_data *ddata,
     struct vector_str *dst, struct vector_str *org)
 {
@@ -740,7 +731,7 @@ demangle_vector_str_push_vector(struct demangle_data *ddata,
  * If r_len is not NULL, string length will be returned.
  * @return NULL at failed or NUL terminated new allocated string.
  */
-ABI static returnspointerwithnoaliases char *
+static privileged returnspointerwithnoaliases char *
 demangle_vector_str_substr(struct demangle_data *ddata,
     const struct vector_str *v, size_t begin, size_t end, size_t *r_len)
 {
@@ -766,7 +757,7 @@ demangle_vector_str_substr(struct demangle_data *ddata,
 	return rtn;
 }
 
-ABI static int
+static privileged int
 demangle_vector_read_cmd_pop(struct vector_read_cmd *v)
 {
 	if (!v->size)
@@ -779,7 +770,7 @@ demangle_vector_read_cmd_pop(struct vector_read_cmd *v)
 	return 1;
 }
 
-ABI static void
+static privileged void
 demangle_vector_read_cmd_init(struct demangle_data *ddata,
     struct vector_read_cmd *v)
 {
@@ -790,7 +781,7 @@ demangle_vector_read_cmd_init(struct demangle_data *ddata,
 	    alignof(*v->r_container), sizeof(*v->r_container) * v->capacity);
 }
 
-ABI static void
+static privileged void
 demangle_data_init(struct demangle_data *d, const char *cur)
 {
 	demangle_vector_str_init(d, &d->output);
@@ -820,7 +811,7 @@ demangle_data_init(struct demangle_data *d, const char *cur)
 	d->last_sname = NULL;
 }
 
-ABI static int
+static privileged int
 demangle_push_str(struct demangle_data *ddata, const char *str, size_t len)
 {
 	if (!str || !len)
@@ -837,7 +828,7 @@ demangle_push_str(struct demangle_data *ddata, const char *str, size_t len)
 }
 
 #ifndef DEMANGLE_NO_FLOATING_POINT
-ABI static int
+static privileged int
 demangle_push_fp(struct demangle_data *ddata,
     char *decoder(struct demangle_data *, const char *, size_t))
 {
@@ -866,13 +857,13 @@ demangle_push_fp(struct demangle_data *ddata,
 }
 #endif // DEMANGLE_NO_FLOATING_POINT
 
-ABI static int
+static privileged int
 demangle_pop_str(struct demangle_data *ddata)
 {
 	return demangle_vector_str_pop(ddata->cur_output);
 }
 
-ABI static int
+static privileged int
 demangle_push_subst(struct demangle_data *ddata, const char *str, size_t len)
 {
 	if (!str || !len)
@@ -884,7 +875,7 @@ demangle_push_subst(struct demangle_data *ddata, const char *str, size_t len)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_push_subst_v(struct demangle_data *ddata, struct vector_str *v)
 {
 	int rtn;
@@ -904,7 +895,7 @@ demangle_push_subst_v(struct demangle_data *ddata, struct vector_str *v)
 	return rtn;
 }
 
-ABI static int
+static privileged int
 demangle_push_type_qualifier(struct demangle_data *ddata,
     struct vector_type_qualifier *v, const char *type_str)
 {
@@ -1137,7 +1128,7 @@ demangle_push_type_qualifier(struct demangle_data *ddata,
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_get_subst(struct demangle_data *ddata, size_t idx)
 {
 	size_t len;
@@ -1155,7 +1146,7 @@ demangle_get_subst(struct demangle_data *ddata, size_t idx)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_get_tmpl_param(struct demangle_data *ddata, size_t idx)
 {
 	size_t len;
@@ -1172,7 +1163,7 @@ demangle_get_tmpl_param(struct demangle_data *ddata, size_t idx)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_array(struct demangle_data *ddata)
 {
 	size_t i, num_len, exp_len, p_idx, idx;
@@ -1244,7 +1235,7 @@ demangle_read_array(struct demangle_data *ddata)
 #ifndef DEMANGLE_NO_FLOATING_POINT
 
 /* Simple hex to integer function used by decode_to_* function. */
-ABI static int
+static privileged int
 hex_to_dec(char c)
 {
 	switch (c) {
@@ -1292,7 +1283,7 @@ hex_to_dec(char c)
  * Todo
  * Replace these functions to macro.
  */
-ABI static returnspointerwithnoaliases char *
+static privileged returnspointerwithnoaliases char *
 decode_fp_to_double(struct demangle_data *ddata, const char *p, size_t len)
 {
 	double f;
@@ -1336,7 +1327,7 @@ again:
 	return rtn;
 }
 
-ABI static returnspointerwithnoaliases char *
+static privileged returnspointerwithnoaliases char *
 decode_fp_to_float(struct demangle_data *ddata, const char *p, size_t len)
 {
 	size_t i, rtn_len, limit;
@@ -1378,7 +1369,7 @@ again:
 	return rtn;
 }
 
-ABI static returnspointerwithnoaliases char *
+static privileged returnspointerwithnoaliases char *
 decode_fp_to_long_double(struct demangle_data *ddata, const char *p, size_t len)
 {
 	long double f;
@@ -1422,7 +1413,7 @@ again:
 	return rtn;
 }
 
-ABI static returnspointerwithnoaliases char *
+static privileged returnspointerwithnoaliases char *
 decode_fp_to_float128(struct demangle_data *ddata, const char *p, size_t len)
 {
 	long double f;
@@ -1479,7 +1470,7 @@ decode_fp_to_float128(struct demangle_data *ddata, const char *p, size_t len)
 	}
 }
 
-ABI static returnspointerwithnoaliases char *
+static privileged returnspointerwithnoaliases char *
 decode_fp_to_float80(struct demangle_data *ddata, const char *p, size_t len)
 {
 	long double f;
@@ -1542,7 +1533,7 @@ decode_fp_to_float80(struct demangle_data *ddata, const char *p, size_t len)
 
 #endif // DEMANGLE_NO_FLOATING_POINT
 
-ABI static int
+static privileged int
 demangle_read_expr_primary(struct demangle_data *ddata)
 {
 	const char *num;
@@ -1634,7 +1625,7 @@ demangle_read_expr_primary(struct demangle_data *ddata)
  *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31775
  *   http://gcc.gnu.org/viewcvs?view=rev&revision=124467
  */
-ABI static int
+static privileged int
 demangle_local_source_name(struct demangle_data *ddata)
 {
 	/* L */
@@ -1660,7 +1651,7 @@ demangle_local_source_name(struct demangle_data *ddata)
  * read unqualified-name, unqualified name are operator-name, ctor-dtor-name,
  * source-name
  */
-ABI static int
+static privileged int
 demangle_read_uqname(struct demangle_data *ddata)
 {
 	size_t len;
@@ -2089,7 +2080,7 @@ demangle_read_uqname(struct demangle_data *ddata)
  * Read template parameter that forms in 'T[number]_'.
  * This function much like to read_subst but only for types.
  */
-ABI static int
+static privileged int
 demangle_read_tmpl_param(struct demangle_data *ddata)
 {
 	long nth;
@@ -2107,11 +2098,10 @@ demangle_read_tmpl_param(struct demangle_data *ddata)
 		/* T_ is first */
 		++nth;
 
-		while (*ddata->cur && *ddata->cur != '_')
+		while (*ddata->cur != '_')
 			++ddata->cur;
 
-		if (nth <= 0)
-			return 0;
+		ASSERT(nth > 0);
 
 		return demangle_get_tmpl_param(ddata, nth);
 	}
@@ -2120,7 +2110,7 @@ demangle_read_tmpl_param(struct demangle_data *ddata)
 	return 0;
 }
 
-ABI static int
+static privileged int
 demangle_vector_read_cmd_push(struct demangle_data *ddata,
     struct vector_read_cmd *v, enum read_cmd cmd, void *data)
 {
@@ -2149,7 +2139,7 @@ demangle_vector_read_cmd_push(struct demangle_data *ddata,
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_tmpl_arg(struct demangle_data *ddata)
 {
 	if (*ddata->cur == '\0')
@@ -2168,7 +2158,7 @@ demangle_read_tmpl_arg(struct demangle_data *ddata)
 	return demangle_read_type(ddata, NULL);
 }
 
-ABI static int
+static privileged int
 demangle_read_tmpl_args(struct demangle_data *ddata)
 {
 	struct vector_str *v;
@@ -2221,7 +2211,7 @@ demangle_read_tmpl_args(struct demangle_data *ddata)
 	return demangle_vector_read_cmd_pop(&ddata->cmd);
 }
 
-ABI static int
+static privileged int
 demangle_read_expression_trinary(struct demangle_data *ddata, const char *name1,
     size_t len1, const char *name2, size_t len2)
 {
@@ -2240,7 +2230,7 @@ demangle_read_expression_trinary(struct demangle_data *ddata, const char *name1,
 	return demangle_read_expression(ddata);
 }
 
-ABI static int
+static privileged int
 demangle_read_expression_unary(struct demangle_data *ddata, const char *name,
     size_t len)
 {
@@ -2252,7 +2242,7 @@ demangle_read_expression_unary(struct demangle_data *ddata, const char *name,
 	return demangle_push_str(ddata, name, len);
 }
 
-ABI static int
+static privileged int
 demangle_read_expression_binary(struct demangle_data *ddata, const char *name,
     size_t len)
 {
@@ -2266,8 +2256,8 @@ demangle_read_expression_binary(struct demangle_data *ddata, const char *name,
 	return demangle_read_expression(ddata);
 }
 
-ABI static int
-demangle_read_expression_impl(struct demangle_data *ddata)
+static privileged int
+demangle_read_expression(struct demangle_data *ddata)
 {
 	if (*ddata->cur == '\0')
 		return 0;
@@ -2548,18 +2538,7 @@ demangle_read_expression_impl(struct demangle_data *ddata)
 	return 0;
 }
 
-ABI static int
-demangle_read_expression(struct demangle_data *ddata)
-{
-	if (ddata->depth == MAX_DEPTH)
-		__builtin_longjmp(ddata->jmpbuf, 1);
-	++ddata->depth;
-	int res = demangle_read_expression_impl(ddata);
-	--ddata->depth;
-	return res;
-}
-
-ABI static int
+static privileged int
 demangle_read_expression_flat(struct demangle_data *ddata, char **str)
 {
 	struct vector_str *output;
@@ -2588,7 +2567,7 @@ demangle_read_expression_flat(struct demangle_data *ddata, char **str)
 }
 
 /* size, capacity, ext_name */
-ABI static void
+static privileged void
 demangle_vector_type_qualifier_init(struct demangle_data *ddata,
     struct vector_type_qualifier *v)
 {
@@ -2604,7 +2583,7 @@ demangle_vector_type_qualifier_init(struct demangle_data *ddata,
 	demangle_vector_str_init(ddata, &v->ext_name);
 }
 
-ABI static struct read_cmd_item *
+static privileged struct read_cmd_item *
 demangle_vector_read_cmd_find(struct vector_read_cmd *v, enum read_cmd dst)
 {
 	int i;
@@ -2619,7 +2598,7 @@ demangle_vector_read_cmd_find(struct vector_read_cmd *v, enum read_cmd dst)
 	return 0;
 }
 
-ABI static int
+static privileged int
 demangle_read_function(struct demangle_data *ddata, int *ext_c,
     struct vector_type_qualifier *v)
 {
@@ -2755,7 +2734,7 @@ demangle_read_function(struct demangle_data *ddata, int *ext_c,
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_offset_number(struct demangle_data *ddata)
 {
 	bool negative;
@@ -2773,7 +2752,7 @@ demangle_read_offset_number(struct demangle_data *ddata)
 		start = ddata->cur;
 	}
 
-	while (*ddata->cur && *ddata->cur != '_')
+	while (*ddata->cur != '_')
 		++ddata->cur;
 
 	if (negative && !DEM_PUSH_STR(ddata, "-"))
@@ -2791,7 +2770,7 @@ demangle_read_offset_number(struct demangle_data *ddata)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_nv_offset(struct demangle_data *ddata)
 {
 	if (!DEM_PUSH_STR(ddata, "offset : "))
@@ -2800,7 +2779,7 @@ demangle_read_nv_offset(struct demangle_data *ddata)
 	return demangle_read_offset_number(ddata);
 }
 
-ABI static int
+static privileged int
 demangle_read_v_offset(struct demangle_data *ddata)
 {
 	if (!DEM_PUSH_STR(ddata, "offset : "))
@@ -2816,7 +2795,7 @@ demangle_read_v_offset(struct demangle_data *ddata)
 }
 
 /* read offset, offset are nv-offset, v-offset */
-ABI static int
+static privileged int
 demangle_read_offset(struct demangle_data *ddata)
 {
 	if (*ddata->cur == 'h') {
@@ -2830,7 +2809,7 @@ demangle_read_offset(struct demangle_data *ddata)
 	return 0;
 }
 
-ABI static int
+static privileged int
 demangle_read_type_flat(struct demangle_data *ddata, char **str)
 {
 	struct vector_str *output;
@@ -2862,7 +2841,7 @@ demangle_read_type_flat(struct demangle_data *ddata, char **str)
  * read number
  * number ::= [n] <decimal>
  */
-ABI static int
+static privileged int
 demangle_read_number(struct demangle_data *ddata, long *rtn)
 {
 	long len, negative_factor;
@@ -2880,18 +2859,19 @@ demangle_read_number(struct demangle_data *ddata, long *rtn)
 		return 0;
 
 	len = demangle_strtol(ddata->cur, 10);
-	if (len < 0)
-		__builtin_longjmp(ddata->jmpbuf, 1);
 
 	while (ELFTC_ISDIGIT(*ddata->cur))
 		++ddata->cur;
 
+	ASSERT(len >= 0);
+	ASSERT(negative_factor == 1 || negative_factor == -1);
+
 	*rtn = len * negative_factor;
 
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_number_as_string(struct demangle_data *ddata, char **str)
 {
 	long n;
@@ -2908,8 +2888,9 @@ demangle_read_number_as_string(struct demangle_data *ddata, char **str)
 	return 1;
 }
 
-ABI static int
-demangle_read_encoding_impl(struct demangle_data *ddata)
+/* read encoding, encoding are function name, data name, special-name */
+static privileged int
+demangle_read_encoding(struct demangle_data *ddata)
 {
 	char *name, *type, *num_str;
 	long offset;
@@ -3116,19 +3097,7 @@ demangle_read_encoding_impl(struct demangle_data *ddata)
 	return demangle_read_name(ddata);
 }
 
-/* read encoding, encoding are function name, data name, special-name */
-ABI static int
-demangle_read_encoding(struct demangle_data *ddata)
-{
-	if (ddata->depth == MAX_DEPTH)
-		__builtin_longjmp(ddata->jmpbuf, 1);
-	++ddata->depth;
-	int res = demangle_read_encoding_impl(ddata);
-	--ddata->depth;
-	return res;
-}
-
-ABI static int
+static privileged int
 demangle_read_local_name(struct demangle_data *ddata)
 {
 	struct vector_str local_name;
@@ -3209,7 +3178,7 @@ demangle_read_local_name(struct demangle_data *ddata)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_nested_name(struct demangle_data *ddata)
 {
 	struct stack_str v;
@@ -3297,8 +3266,8 @@ next:
 	return 1;
 }
 
-ABI static int
-demangle_read_name_impl(struct demangle_data *ddata)
+static privileged int
+demangle_read_name(struct demangle_data *ddata)
 {
 	struct stack_str v;
 	struct vector_str *output;
@@ -3359,18 +3328,7 @@ clean:
 	return rtn;
 }
 
-ABI static int
-demangle_read_name(struct demangle_data *ddata)
-{
-	if (ddata->depth == MAX_DEPTH)
-		__builtin_longjmp(ddata->jmpbuf, 1);
-	++ddata->depth;
-	int res = demangle_read_name_impl(ddata);
-	--ddata->depth;
-	return res;
-}
-
-ABI static int
+static privileged int
 demangle_read_name_flat(struct demangle_data *ddata, char **str)
 {
 	struct vector_str *output;
@@ -3398,7 +3356,7 @@ demangle_read_name_flat(struct demangle_data *ddata, char **str)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_pointer_to_member(struct demangle_data *ddata,
     struct vector_type_qualifier *v)
 {
@@ -3458,10 +3416,9 @@ clean1:
 }
 
 /* read source-name, source-name is <len> <ID> */
-ABI static int
+static privileged int
 demangle_read_sname(struct demangle_data *ddata)
 {
-	size_t lim;
 	long len;
 	int err;
 
@@ -3481,15 +3438,12 @@ demangle_read_sname(struct demangle_data *ddata)
 		ddata->last_sname = VEC_STR(ddata, ddata->cur_output,
 		    ddata->cur_output->size - 1);
 
-	lim = demangle_strlen(ddata->cur);
-	if (len > lim)
-		len = lim;
 	ddata->cur += len;
 
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_subst_stdtmpl(struct demangle_data *ddata, const char *str)
 {
 	struct vector_str *output;
@@ -3527,7 +3481,7 @@ demangle_read_subst_stdtmpl(struct demangle_data *ddata, const char *str)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_subst_std(struct demangle_data *ddata)
 {
 	struct vector_str *output, v;
@@ -3578,7 +3532,7 @@ demangle_read_subst_std(struct demangle_data *ddata)
 	return 1;
 }
 
-ABI static int
+static privileged int
 demangle_read_subst(struct demangle_data *ddata)
 {
 	long nth;
@@ -3693,11 +3647,10 @@ demangle_read_subst(struct demangle_data *ddata)
 		/* first was '_', so increase one */
 		++nth;
 
-		while (*ddata->cur && *ddata->cur != '_')
+		while (*ddata->cur != '_')
 			++ddata->cur;
 
-		if (nth <= 0)
-			return 0;
+		ASSERT(nth > 0);
 
 		return demangle_get_subst(ddata, nth);
 	}
@@ -3706,7 +3659,7 @@ demangle_read_subst(struct demangle_data *ddata)
 	return 0;
 }
 
-ABI static int
+static privileged int
 demangle_vector_type_qualifier_push(struct demangle_data *ddata,
     struct vector_type_qualifier *v, enum type_qualifier t)
 {
@@ -3735,8 +3688,8 @@ demangle_vector_type_qualifier_push(struct demangle_data *ddata,
 	return 1;
 }
 
-ABI static int
-demangle_read_type_impl(struct demangle_data *ddata, struct type_delimit *td)
+static privileged int
+demangle_read_type(struct demangle_data *ddata, struct type_delimit *td)
 {
 	struct vector_type_qualifier v;
 	struct vector_str *output, sv;
@@ -3928,7 +3881,7 @@ again:
 
 	case 'E':
 		/* unexpected end (except some things) */
-		if (td && ddata->is_guard_variable)
+		if (ddata->is_guard_variable)
 			td->paren = false;
 		if (ddata->is_guard_variable ||
 		    (ddata->ref_qualifier && ddata->is_functype)) {
@@ -4149,8 +4102,6 @@ again:
 		if (!demangle_vector_str_push(ddata, &v.ext_name, ddata->cur,
 			len))
 			return 0;
-		if (len > demangle_strlen(ddata->cur))
-			len = demangle_strlen(ddata->cur);
 		ddata->cur += len;
 		if (!demangle_vector_type_qualifier_push(ddata, &v, TYPE_EXT))
 			return 0;
@@ -4258,18 +4209,7 @@ clean:
 	return 0;
 }
 
-ABI static int
-demangle_read_type(struct demangle_data *ddata, struct type_delimit *td)
-{
-	if (ddata->depth == MAX_DEPTH)
-		__builtin_longjmp(ddata->jmpbuf, 1);
-	++ddata->depth;
-	int res = demangle_read_type_impl(ddata, td);
-	--ddata->depth;
-	return res;
-}
-
-ABI static int
+static privileged int
 demangle_copy_output(struct demangle_data *ddata, char *buf,
     const struct vector_str *v, size_t buflen)
 {
@@ -4292,14 +4232,14 @@ demangle_copy_output(struct demangle_data *ddata, char *buf,
 		return -1;
 }
 
-ABI static int
+static privileged int
 demangle_failure(char *buf, const char *org, size_t buflen)
 {
 	demangle_strlcpy(buf, org, buflen);
 	return -1;
 }
 
-ABI static int
+static privileged int
 demangle(struct demangle_data *ddata, char *buf, const char *org, size_t buflen)
 {
 	struct vector_str ret_type;
@@ -4451,7 +4391,7 @@ demangle(struct demangle_data *ddata, char *buf, const char *org, size_t buflen)
  * @return bytes of output name or -1 upon error or truncation
  * @asyncsignalsafe
  */
-ABI int
+privileged int
 __demangle(char *buf, const char *org, size_t buflen)
 {
 	struct demangle_data ddata[1];
@@ -4465,7 +4405,7 @@ __demangle(char *buf, const char *org, size_t buflen)
  *
  * This means it starts with either "_Z" or "_GLOBAL__I_".
  */
-ABI int
+privileged int
 __is_mangled(const char *org)
 {
 	if (!org)
diff --git a/libc/intrin/describeallocationtype.c b/libc/intrin/describeallocationtype.c
deleted file mode 100644
index 4dd69e733..000000000
--- a/libc/intrin/describeallocationtype.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
-#include "libc/nt/enum/memflags.h"
-
-static const struct DescribeFlags kNtAllocationTypeFlags[] = {
-    {kNtMemCommit, "Commit"},    //
-    {kNtMemReserve, "Reserve"},  //
-    {kNtMemReset, "Reset"},      //
-};
-
-const char *_DescribeNtAllocationType(char buf[48], uint32_t x) {
-  return _DescribeFlags(buf, 48, kNtAllocationTypeFlags,
-                        ARRAYLEN(kNtAllocationTypeFlags), "kNtMem", x);
-}
diff --git a/libc/intrin/describearchprctlcode.c b/libc/intrin/describearchprctlcode.c
index b260b73ba..9c1fea1f9 100644
--- a/libc/intrin/describearchprctlcode.c
+++ b/libc/intrin/describearchprctlcode.c
@@ -21,7 +21,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/arch.h"
 
-const char *_DescribeArchPrctlCode(char buf[12], int x) {
+const char *(DescribeArchPrctlCode)(char buf[12], int x) {
   if (x == ARCH_SET_FS)
     return "ARCH_SET_FS";
   if (x == ARCH_GET_FS)
diff --git a/libc/intrin/describebacktrace.c b/libc/intrin/describebacktrace.c
index 7d61f5bc9..113854ff6 100644
--- a/libc/intrin/describebacktrace.c
+++ b/libc/intrin/describebacktrace.c
@@ -24,15 +24,13 @@
 
 #define N 160
 
-#define ABI privileged optimizesize
-
-ABI static bool IsDangerous(const void *ptr) {
+static bool IsDangerous(const void *ptr) {
   if (_weaken(kisdangerous))
     return _weaken(kisdangerous)(ptr);
   return false;
 }
 
-ABI static char *FormatHex(char *p, unsigned long x) {
+static char *FormatHex(char *p, unsigned long x) {
   int k = x ? (__builtin_clzl(x) ^ 63) + 1 : 1;
   k = (k + 3) & -4;
   while (k > 0)
@@ -41,7 +39,8 @@ ABI static char *FormatHex(char *p, unsigned long x) {
   return p;
 }
 
-ABI const char *_DescribeBacktrace(char buf[N], const struct StackFrame *fr) {
+dontinstrument const char *(DescribeBacktrace)(char buf[N],
+                                               const struct StackFrame *fr) {
   char *p = buf;
   char *pe = p + N;
   bool gotsome = false;
diff --git a/libc/intrin/describebacktrace.h b/libc/intrin/describebacktrace.h
index ee8614317..c9e600d66 100644
--- a/libc/intrin/describebacktrace.h
+++ b/libc/intrin/describebacktrace.h
@@ -4,8 +4,8 @@
 #include "libc/nexgen32e/stackframe.h"
 COSMOPOLITAN_C_START_
 
-const char *_DescribeBacktrace(char[160], const struct StackFrame *) libcesque;
-#define DescribeBacktrace(x) _DescribeBacktrace(alloca(160), x)
+const char *DescribeBacktrace(char[160], const struct StackFrame *) libcesque;
+#define DescribeBacktrace(x) DescribeBacktrace(alloca(160), x)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_INTRIN_DESCRIBEBACKTRACE_H_ */
diff --git a/libc/intrin/describecancelstate.c b/libc/intrin/describecancelstate.c
index ccaa1d139..4b5856666 100644
--- a/libc/intrin/describecancelstate.c
+++ b/libc/intrin/describecancelstate.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/thread/thread.h"
 
-const char *_DescribeCancelState(char buf[12], int err, int *state) {
+const char *(DescribeCancelState)(char buf[12], int err, int *state) {
   if (err)
     return "n/a";
   if (!state)
diff --git a/libc/intrin/describecapability.c b/libc/intrin/describecapability.c
index 67ddcf829..6e072f4dd 100644
--- a/libc/intrin/describecapability.c
+++ b/libc/intrin/describecapability.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/cap.h"
 
@@ -69,7 +69,7 @@ static const struct thatispacked {
     {CAP_CHECKPOINT_RESTORE, "CHECKPOINT_RESTORE"},  //
 };
 
-const char *_DescribeCapability(char buf[32], int x) {
+const char *(DescribeCapability)(char buf[32], int x) {
   int i;
   for (i = 0; i < ARRAYLEN(kCapabilityName); ++i) {
     if (kCapabilityName[i].x == x) {
diff --git a/libc/intrin/describeclockname.c b/libc/intrin/describeclockname.c
index 58d73b86f..c8e704ca5 100644
--- a/libc/intrin/describeclockname.c
+++ b/libc/intrin/describeclockname.c
@@ -22,6 +22,6 @@
 /**
  * Describes clock_gettime() clock argument.
  */
-const char *_DescribeClockName(char buf[32], int x) {
-  return _DescribeMagnum(buf, kClockNames, "CLOCK_", x);
+const char *(DescribeClockName)(char buf[32], int x) {
+  return DescribeMagnum(buf, kClockNames, "CLOCK_", x);
 }
diff --git a/libc/intrin/describecontrolkeystate.c b/libc/intrin/describecontrolkeystate.c
index f2e75e300..c46d859a2 100644
--- a/libc/intrin/describecontrolkeystate.c
+++ b/libc/intrin/describecontrolkeystate.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/struct/inputrecord.h"
 
 static const struct DescribeFlags kControlKeyState[] = {
@@ -32,7 +32,7 @@ static const struct DescribeFlags kControlKeyState[] = {
     {kNtEnhancedKey, "EnhancedKey"},            //
 };
 
-const char *_DescribeControlKeyState(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kControlKeyState, ARRAYLEN(kControlKeyState),
-                        "kNt", x);
+const char *(DescribeControlKeyState)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kControlKeyState, ARRAYLEN(kControlKeyState),
+                       "kNt", x);
 }
diff --git a/libc/intrin/describedirfd.c b/libc/intrin/describedirfd.c
index 36f729296..6b33d8ebf 100644
--- a/libc/intrin/describedirfd.c
+++ b/libc/intrin/describedirfd.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/at.h"
 
-const char *_DescribeDirfd(char buf[12], int dirfd) {
+const char *(DescribeDirfd)(char buf[12], int dirfd) {
   if (dirfd == AT_FDCWD)
     return "AT_FDCWD";
   FormatInt32(buf, dirfd);
diff --git a/libc/intrin/describednotify.c b/libc/intrin/describednotify.c
index 2a56e1557..b4b719940 100644
--- a/libc/intrin/describednotify.c
+++ b/libc/intrin/describednotify.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/processaccess.h"
 #include "libc/sysv/consts/dn.h"
 
@@ -31,7 +31,7 @@ static const struct DescribeFlags kDnotifyFlags[] = {
     {DN_MULTISHOT, "MULTISHOT"},  //
 };
 
-const char *_DescribeDnotifyFlags(char buf[80], int x) {
-  return _DescribeFlags(buf, 80, kDnotifyFlags, ARRAYLEN(kDnotifyFlags), "DN_",
-                        x);
+const char *(DescribeDnotifyFlags)(char buf[80], int x) {
+  return DescribeFlags(buf, 80, kDnotifyFlags, ARRAYLEN(kDnotifyFlags), "DN_",
+                       x);
 }
diff --git a/libc/intrin/describeerrnoresult.c b/libc/intrin/describeerrnoresult.c
index 9f3817e02..0deed696e 100644
--- a/libc/intrin/describeerrnoresult.c
+++ b/libc/intrin/describeerrnoresult.c
@@ -22,7 +22,7 @@
 #include "libc/log/libfatal.internal.h"
 #include "libc/str/str.h"
 
-const char *_DescribeErrno(char buf[30], int ax) {
+const char *(DescribeErrno)(char buf[30], int ax) {
   char *p = buf;
   const char *s;
   if (ax < 0) {
diff --git a/libc/intrin/describefcntlcmd.c b/libc/intrin/describefcntlcmd.c
index dbdef8513..bcebd1b19 100644
--- a/libc/intrin/describefcntlcmd.c
+++ b/libc/intrin/describefcntlcmd.c
@@ -21,7 +21,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/str/str.h"
 
-const char *_DescribeFcntlCmd(char buf[20], int x) {
+const char *(DescribeFcntlCmd)(char buf[20], int x) {
   const char *s;
   if (x >= 0 && (s = GetMagnumStr(kFcntlCmds, x))) {
     buf[0] = 'F';
diff --git a/libc/intrin/describefdset.c b/libc/intrin/describefdset.c
index 7241b00db..1ef26444d 100644
--- a/libc/intrin/describefdset.c
+++ b/libc/intrin/describefdset.c
@@ -26,13 +26,11 @@
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeFdSet(char buf[N], ssize_t rc, int nfds, fd_set *fds) {
+const char *(DescribeFdSet)(char buf[N], ssize_t rc, int nfds, fd_set *fds) {
   int o = 0;
 
   if (!fds)
     return "NULL";
-  if (rc == -1)
-    return "n/a";
   if (kisdangerous(fds)) {
     ksnprintf(buf, N, "%p", fds);
     return buf;
diff --git a/libc/intrin/describeflags.c b/libc/intrin/describeflags.c
index b4ea4ed98..10aa0ea1c 100644
--- a/libc/intrin/describeflags.c
+++ b/libc/intrin/describeflags.c
@@ -18,8 +18,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
 
-const char *_DescribeFlags(char *p, size_t n, const struct DescribeFlags *d,
-                           size_t m, const char *prefix, unsigned x) {
+const char *DescribeFlags(char *p, size_t n, const struct DescribeFlags *d,
+                          size_t m, const char *prefix, unsigned x) {
   bool t;
   char b[21];
   size_t i, j, k;
diff --git a/libc/intrin/describeflags.h b/libc/intrin/describeflags.h
index e63059f0e..9bcf96218 100644
--- a/libc/intrin/describeflags.h
+++ b/libc/intrin/describeflags.h
@@ -8,125 +8,121 @@ struct thatispacked DescribeFlags {
   const char *name;
 };
 
-const char *_DescribeFlags(char *, size_t, const struct DescribeFlags *, size_t,
-                           const char *, unsigned) libcesque;
+const char *DescribeFlags(char *, size_t, const struct DescribeFlags *, size_t,
+                          const char *, unsigned) libcesque;
 
-const char *_DescribeArchPrctlCode(char[12], int) libcesque;
-const char *_DescribeCancelState(char[12], int, int *) libcesque;
-const char *_DescribeClockName(char[32], int) libcesque;
-const char *_DescribeControlKeyState(char[64], uint32_t) libcesque;
-const char *_DescribeDirfd(char[12], int) libcesque;
-const char *_DescribeDnotifyFlags(char[80], int) libcesque;
-const char *_DescribeErrno(char[30], int) libcesque;
-const char *_DescribeFcntlCmd(char[20], int) libcesque;
-const char *_DescribeFlockType(char[12], int) libcesque;
-const char *_DescribeFutexOp(char[64], int) libcesque;
-const char *_DescribeHow(char[12], int) libcesque;
-const char *_DescribeInOutInt64(char[23], ssize_t, int64_t *) libcesque;
-const char *_DescribeItimer(char[12], int) libcesque;
-const char *_DescribeMapFlags(char[64], int) libcesque;
-const char *_DescribeMapping(char[8], int, int) libcesque;
-const char *_DescribeMremapFlags(char[30], int) libcesque;
-const char *_DescribeMsg(char[16], int) libcesque;
-const char *_DescribeMsyncFlags(char[48], int) libcesque;
-const char *_DescribeNtAllocationType(char[48], uint32_t);
-const char *_DescribeNtConsoleInFlags(char[256], uint32_t) libcesque;
-const char *_DescribeNtConsoleOutFlags(char[128], uint32_t) libcesque;
-const char *_DescribeNtCreationDisposition(uint32_t) libcesque;
-const char *_DescribeNtFileAccessFlags(char[512], uint32_t) libcesque;
-const char *_DescribeNtFileFlagAttr(char[256], uint32_t) libcesque;
-const char *_DescribeNtFileMapFlags(char[64], uint32_t) libcesque;
-const char *_DescribeNtFileShareFlags(char[64], uint32_t) libcesque;
-const char *_DescribeNtFiletypeFlags(char[64], uint32_t) libcesque;
-const char *_DescribeNtLockFileFlags(char[64], uint32_t) libcesque;
-const char *_DescribeNtMovFileInpFlags(char[256], uint32_t) libcesque;
-const char *_DescribeNtPageFlags(char[64], uint32_t) libcesque;
-const char *_DescribeNtPipeModeFlags(char[64], uint32_t) libcesque;
-const char *_DescribeNtPipeOpenFlags(char[64], uint32_t) libcesque;
-const char *_DescribeNtProcAccessFlags(char[256], uint32_t) libcesque;
-const char *_DescribeNtStartFlags(char[128], uint32_t) libcesque;
-const char *_DescribeNtSymlinkFlags(char[64], uint32_t) libcesque;
-const char *_DescribeOpenFlags(char[128], int) libcesque;
-const char *_DescribeOpenMode(char[15], int, int) libcesque;
-const char *_DescribePersonalityFlags(char[128], int) libcesque;
-const char *_DescribePollFlags(char[64], int) libcesque;
-const char *_DescribeProtFlags(char[48], int) libcesque;
-const char *_DescribePtrace(char[12], int) libcesque;
-const char *_DescribePtraceEvent(char[32], int) libcesque;
-const char *_DescribeRlimitName(char[20], int) libcesque;
-const char *_DescribeSchedPolicy(char[48], int) libcesque;
-const char *_DescribeSeccompOperation(int) libcesque;
-const char *_DescribeSiCode(char[20], int, int) libcesque;
-const char *_DescribeSigaltstackFlags(char[22], int) libcesque;
-const char *_DescribeSleepFlags(char[16], int) libcesque;
-const char *_DescribeSockLevel(char[12], int) libcesque;
-const char *_DescribeSockOptname(char[32], int, int) libcesque;
-const char *_DescribeSocketFamily(char[12], int) libcesque;
-const char *_DescribeSocketProtocol(char[12], int) libcesque;
-const char *_DescribeSocketType(char[64], int) libcesque;
-const char *_DescribeStdioState(char[12], int) libcesque;
-const char *_DescribeStringList(char[300], char *const[]) libcesque;
-const char *_DescribeThreadCreateFlags(char[64], uint32_t) libcesque;
-const char *_DescribeVirtualKeyCode(char[32], uint32_t) libcesque;
-const char *_DescribeWhence(char[12], int) libcesque;
-const char *_DescribeWhichPrio(char[12], int) libcesque;
+const char *DescribeArchPrctlCode(char[12], int) libcesque;
+const char *DescribeCancelState(char[12], int, int *) libcesque;
+const char *DescribeClockName(char[32], int) libcesque;
+const char *DescribeControlKeyState(char[64], uint32_t) libcesque;
+const char *DescribeDirfd(char[12], int) libcesque;
+const char *DescribeDnotifyFlags(char[80], int) libcesque;
+const char *DescribeErrno(char[30], int) libcesque;
+const char *DescribeFcntlCmd(char[20], int) libcesque;
+const char *DescribeFlockType(char[12], int) libcesque;
+const char *DescribeFutexOp(char[64], int) libcesque;
+const char *DescribeHow(char[12], int) libcesque;
+const char *DescribeInOutInt64(char[23], ssize_t, int64_t *) libcesque;
+const char *DescribeItimer(char[12], int) libcesque;
+const char *DescribeMapFlags(char[64], int) libcesque;
+const char *DescribeMapping(char[8], int, int) libcesque;
+const char *DescribeMremapFlags(char[30], int) libcesque;
+const char *DescribeMsyncFlags(char[48], int) libcesque;
+const char *DescribeNtConsoleInFlags(char[256], uint32_t) libcesque;
+const char *DescribeNtConsoleOutFlags(char[128], uint32_t) libcesque;
+const char *DescribeNtCreationDisposition(uint32_t) libcesque;
+const char *DescribeNtFileAccessFlags(char[512], uint32_t) libcesque;
+const char *DescribeNtFileFlagAttr(char[256], uint32_t) libcesque;
+const char *DescribeNtFileMapFlags(char[64], uint32_t) libcesque;
+const char *DescribeNtFileShareFlags(char[64], uint32_t) libcesque;
+const char *DescribeNtFiletypeFlags(char[64], uint32_t) libcesque;
+const char *DescribeNtLockFileFlags(char[64], uint32_t) libcesque;
+const char *DescribeNtMovFileInpFlags(char[256], uint32_t) libcesque;
+const char *DescribeNtPageFlags(char[64], uint32_t) libcesque;
+const char *DescribeNtPipeModeFlags(char[64], uint32_t) libcesque;
+const char *DescribeNtPipeOpenFlags(char[64], uint32_t) libcesque;
+const char *DescribeNtProcAccessFlags(char[256], uint32_t) libcesque;
+const char *DescribeNtStartFlags(char[128], uint32_t) libcesque;
+const char *DescribeNtSymlinkFlags(char[64], uint32_t) libcesque;
+const char *DescribeOpenFlags(char[128], int) libcesque;
+const char *DescribeOpenMode(char[15], int, int) libcesque;
+const char *DescribePersonalityFlags(char[128], int) libcesque;
+const char *DescribePollFlags(char[64], int) libcesque;
+const char *DescribeProtFlags(char[48], int) libcesque;
+const char *DescribePtrace(char[12], int) libcesque;
+const char *DescribePtraceEvent(char[32], int) libcesque;
+const char *DescribeRlimitName(char[20], int) libcesque;
+const char *DescribeSchedPolicy(char[48], int) libcesque;
+const char *DescribeSeccompOperation(int) libcesque;
+const char *DescribeSiCode(char[20], int, int) libcesque;
+const char *DescribeSigaltstackFlags(char[22], int) libcesque;
+const char *DescribeSleepFlags(char[16], int) libcesque;
+const char *DescribeSockLevel(char[12], int) libcesque;
+const char *DescribeSockOptname(char[32], int, int) libcesque;
+const char *DescribeSocketFamily(char[12], int) libcesque;
+const char *DescribeSocketProtocol(char[12], int) libcesque;
+const char *DescribeSocketType(char[64], int) libcesque;
+const char *DescribeStdioState(char[12], int) libcesque;
+const char *DescribeStringList(char[300], char *const[]) libcesque;
+const char *DescribeThreadCreateFlags(char[64], uint32_t) libcesque;
+const char *DescribeVirtualKeyCode(char[32], uint32_t) libcesque;
+const char *DescribeWhence(char[12], int) libcesque;
+const char *DescribeWhichPrio(char[12], int) libcesque;
 
-#define DescribeCancelState(x, y)    _DescribeCancelState(alloca(12), x, y)
-#define DescribeClockName(x)         _DescribeClockName(alloca(32), x)
-#define DescribeControlKeyState(x)   _DescribeControlKeyState(alloca(64), x)
-#define DescribeDirfd(x)             _DescribeDirfd(alloca(12), x)
-#define DescribeDnotifyFlags(x)      _DescribeDnotifyFlags(alloca(80), x)
-#define DescribeErrno(x)             _DescribeErrno(alloca(30), x)
-#define DescribeFcntlCmd(x)          _DescribeFcntlCmd(alloca(20), x)
-#define DescribeFlockType(x)         _DescribeFlockType(alloca(12), x)
-#define DescribeFutexOp(x)           _DescribeFutexOp(alloca(64), x)
-#define DescribeHow(x)               _DescribeHow(alloca(12), x)
-#define DescribeInOutInt64(rc, x)    _DescribeInOutInt64(alloca(23), rc, x)
-#define DescribeItimer(x)            _DescribeItimer(alloca(12), x)
-#define DescribeMapFlags(x)          _DescribeMapFlags(alloca(64), x)
-#define DescribeMapping(x, y)        _DescribeMapping(alloca(8), x, y)
-#define DescribeMremapFlags(x)       _DescribeMremapFlags(alloca(30), x)
-#define DescribeMsg(x)               _DescribeMsg(alloca(16), x)
-#define DescribeMsyncFlags(x)        _DescribeMsyncFlags(alloca(48), x)
-#define DescribeNtAllocationType(x)  _DescribeNtAllocationType(alloca(48), x)
-#define DescribeNtConsoleInFlags(x)  _DescribeNtConsoleInFlags(alloca(256), x)
-#define DescribeNtConsoleOutFlags(x) _DescribeNtConsoleOutFlags(alloca(128), x)
-#define DescribeNtFileAccessFlags(x) _DescribeNtFileAccessFlags(alloca(512), x)
-#define DescribeNtFileFlagAttr(x)    _DescribeNtFileFlagAttr(alloca(256), x)
-#define DescribeNtFileMapFlags(x)    _DescribeNtFileMapFlags(alloca(64), x)
-#define DescribeNtFileShareFlags(x)  _DescribeNtFileShareFlags(alloca(64), x)
-#define DescribeNtFiletypeFlags(x)   _DescribeNtFiletypeFlags(alloca(64), x)
-#define DescribeNtLockFileFlags(x)   _DescribeNtLockFileFlags(alloca(64), x)
-#define DescribeNtMovFileInpFlags(x) _DescribeNtMovFileInpFlags(alloca(256), x)
-#define DescribeNtPageFlags(x)       _DescribeNtPageFlags(alloca(64), x)
-#define DescribeNtPipeModeFlags(x)   _DescribeNtPipeModeFlags(alloca(64), x)
-#define DescribeNtPipeOpenFlags(x)   _DescribeNtPipeOpenFlags(alloca(64), x)
-#define DescribeNtProcAccessFlags(x) _DescribeNtProcAccessFlags(alloca(256), x)
-#define DescribeNtStartFlags(x)      _DescribeNtStartFlags(alloca(128), x)
-#define DescribeNtSymlinkFlags(x)    _DescribeNtSymlinkFlags(alloca(64), x)
-#define DescribeOpenFlags(x)         _DescribeOpenFlags(alloca(128), x)
-#define DescribeOpenMode(x, y)       _DescribeOpenMode(alloca(15), x, y)
-#define DescribePersonalityFlags(p)  _DescribePersonalityFlags(alloca(128), p)
-#define DescribePollFlags(p)         _DescribePollFlags(alloca(64), p)
-#define DescribeProtFlags(x)         _DescribeProtFlags(alloca(48), x)
-#define DescribePtrace(i)            _DescribePtrace(alloca(12), i)
-#define DescribePtraceEvent(x)       _DescribePtraceEvent(alloca(32), x)
-#define DescribeRlimitName(rl)       _DescribeRlimitName(alloca(20), rl)
-#define DescribeSchedPolicy(x)       _DescribeSchedPolicy(alloca(48), x)
-#define DescribeSiCode(x, y)         _DescribeSiCode(alloca(20), x, y)
-#define DescribeSigaltstackFlags(x)  _DescribeSigaltstackFlags(alloca(22), x)
-#define DescribeSleepFlags(x)        _DescribeSleepFlags(alloca(16), x)
-#define DescribeSockLevel(x)         _DescribeSockLevel(alloca(12), x)
-#define DescribeSockOptname(x, y)    _DescribeSockOptname(alloca(32), x, y)
-#define DescribeSocketFamily(x)      _DescribeSocketFamily(alloca(12), x)
-#define DescribeSocketProtocol(x)    _DescribeSocketProtocol(alloca(12), x)
-#define DescribeSocketType(x)        _DescribeSocketType(alloca(64), x)
-#define DescribeStdioState(x)        _DescribeStdioState(alloca(12), x)
-#define DescribeStringList(x)        _DescribeStringList(alloca(300), x)
-#define DescribeThreadCreateFlags(x) _DescribeThreadCreateFlags(alloca(64), x)
-#define DescribeVirtualKeyCode(x)    _DescribeVirtualKeyCode(alloca(32), x)
-#define DescribeWhence(x)            _DescribeWhence(alloca(12), x)
-#define DescribeWhichPrio(x)         _DescribeWhichPrio(alloca(12), x)
+#define DescribeCancelState(x, y)    DescribeCancelState(alloca(12), x, y)
+#define DescribeClockName(x)         DescribeClockName(alloca(32), x)
+#define DescribeControlKeyState(x)   DescribeControlKeyState(alloca(64), x)
+#define DescribeDirfd(x)             DescribeDirfd(alloca(12), x)
+#define DescribeDnotifyFlags(x)      DescribeDnotifyFlags(alloca(80), x)
+#define DescribeErrno(x)             DescribeErrno(alloca(30), x)
+#define DescribeFcntlCmd(x)          DescribeFcntlCmd(alloca(20), x)
+#define DescribeFlockType(x)         DescribeFlockType(alloca(12), x)
+#define DescribeFutexOp(x)           DescribeFutexOp(alloca(64), x)
+#define DescribeHow(x)               DescribeHow(alloca(12), x)
+#define DescribeInOutInt64(rc, x)    DescribeInOutInt64(alloca(23), rc, x)
+#define DescribeItimer(x)            DescribeItimer(alloca(12), x)
+#define DescribeMapFlags(x)          DescribeMapFlags(alloca(64), x)
+#define DescribeMapping(x, y)        DescribeMapping(alloca(8), x, y)
+#define DescribeMremapFlags(x)       DescribeMremapFlags(alloca(30), x)
+#define DescribeMsyncFlags(x)        DescribeMsyncFlags(alloca(48), x)
+#define DescribeNtConsoleInFlags(x)  DescribeNtConsoleInFlags(alloca(256), x)
+#define DescribeNtConsoleOutFlags(x) DescribeNtConsoleOutFlags(alloca(128), x)
+#define DescribeNtFileAccessFlags(x) DescribeNtFileAccessFlags(alloca(512), x)
+#define DescribeNtFileFlagAttr(x)    DescribeNtFileFlagAttr(alloca(256), x)
+#define DescribeNtFileMapFlags(x)    DescribeNtFileMapFlags(alloca(64), x)
+#define DescribeNtFileShareFlags(x)  DescribeNtFileShareFlags(alloca(64), x)
+#define DescribeNtFiletypeFlags(x)   DescribeNtFiletypeFlags(alloca(64), x)
+#define DescribeNtLockFileFlags(x)   DescribeNtLockFileFlags(alloca(64), x)
+#define DescribeNtMovFileInpFlags(x) DescribeNtMovFileInpFlags(alloca(256), x)
+#define DescribeNtPageFlags(x)       DescribeNtPageFlags(alloca(64), x)
+#define DescribeNtPipeModeFlags(x)   DescribeNtPipeModeFlags(alloca(64), x)
+#define DescribeNtPipeOpenFlags(x)   DescribeNtPipeOpenFlags(alloca(64), x)
+#define DescribeNtProcAccessFlags(x) DescribeNtProcAccessFlags(alloca(256), x)
+#define DescribeNtStartFlags(x)      DescribeNtStartFlags(alloca(128), x)
+#define DescribeNtSymlinkFlags(x)    DescribeNtSymlinkFlags(alloca(64), x)
+#define DescribeOpenFlags(x)         DescribeOpenFlags(alloca(128), x)
+#define DescribeOpenMode(x, y)       DescribeOpenMode(alloca(15), x, y)
+#define DescribePersonalityFlags(p)  DescribePersonalityFlags(alloca(128), p)
+#define DescribePollFlags(p)         DescribePollFlags(alloca(64), p)
+#define DescribeProtFlags(x)         DescribeProtFlags(alloca(48), x)
+#define DescribePtrace(i)            DescribePtrace(alloca(12), i)
+#define DescribePtraceEvent(x)       DescribePtraceEvent(alloca(32), x)
+#define DescribeRlimitName(rl)       DescribeRlimitName(alloca(20), rl)
+#define DescribeSchedPolicy(x)       DescribeSchedPolicy(alloca(48), x)
+#define DescribeSiCode(x, y)         DescribeSiCode(alloca(20), x, y)
+#define DescribeSigaltstackFlags(x)  DescribeSigaltstackFlags(alloca(22), x)
+#define DescribeSleepFlags(x)        DescribeSleepFlags(alloca(16), x)
+#define DescribeSockLevel(x)         DescribeSockLevel(alloca(12), x)
+#define DescribeSockOptname(x, y)    DescribeSockOptname(alloca(32), x, y)
+#define DescribeSocketFamily(x)      DescribeSocketFamily(alloca(12), x)
+#define DescribeSocketProtocol(x)    DescribeSocketProtocol(alloca(12), x)
+#define DescribeSocketType(x)        DescribeSocketType(alloca(64), x)
+#define DescribeStdioState(x)        DescribeStdioState(alloca(12), x)
+#define DescribeStringList(x)        DescribeStringList(alloca(300), x)
+#define DescribeThreadCreateFlags(x) DescribeThreadCreateFlags(alloca(64), x)
+#define DescribeVirtualKeyCode(x)    DescribeVirtualKeyCode(alloca(32), x)
+#define DescribeWhence(x)            DescribeWhence(alloca(12), x)
+#define DescribeWhichPrio(x)         DescribeWhichPrio(alloca(12), x)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_INTRIN_DESCRIBEFLAGS_INTERNAL_H_ */
diff --git a/libc/intrin/describeflock.c b/libc/intrin/describeflock.c
index 7f95445c4..ea7b744bd 100644
--- a/libc/intrin/describeflock.c
+++ b/libc/intrin/describeflock.c
@@ -27,7 +27,7 @@
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeFlock(char buf[N], int cmd, const struct flock *l) {
+const char *(DescribeFlock)(char buf[N], int cmd, const struct flock *l) {
   int o = 0;
 
   if (!l)
diff --git a/libc/intrin/describeflocktype.c b/libc/intrin/describeflocktype.c
index ffb13879f..67c13a024 100644
--- a/libc/intrin/describeflocktype.c
+++ b/libc/intrin/describeflocktype.c
@@ -19,7 +19,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/sysv/consts/f.h"
 
-const char *_DescribeFlockType(char buf[12], int x) {
+const char *(DescribeFlockType)(char buf[12], int x) {
   if (x == F_RDLCK)
     return "F_RDLCK";
   if (x == F_WRLCK)
diff --git a/libc/intrin/describefutexop.c b/libc/intrin/describefutexop.c
index 7777a4755..7a4b0c783 100644
--- a/libc/intrin/describefutexop.c
+++ b/libc/intrin/describefutexop.c
@@ -21,7 +21,7 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/futex.h"
 
-const char *_DescribeFutexOp(char buf[64], int x) {
+const char *(DescribeFutexOp)(char buf[64], int x) {
 
   bool priv = false;
   if (x & FUTEX_PRIVATE_FLAG) {
diff --git a/libc/intrin/describegidlist.c b/libc/intrin/describegidlist.c
index f915da80b..d35e9db87 100644
--- a/libc/intrin/describegidlist.c
+++ b/libc/intrin/describegidlist.c
@@ -20,13 +20,13 @@
 #include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/popcnt.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 
 #define N 128
 
-const char *_DescribeGidList(char buf[N], int rc, int size,
-                             const uint32_t list[]) {
+const char *(DescribeGidList)(char buf[N], int rc, int size,
+                              const uint32_t list[]) {
   if ((rc == -1) || (size < 0))
     return "n/a";
   if (!size)
diff --git a/libc/intrin/describehow.c b/libc/intrin/describehow.c
index 9a92abd7a..f4fc6798d 100644
--- a/libc/intrin/describehow.c
+++ b/libc/intrin/describehow.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/sig.h"
 
-const char *_DescribeHow(char buf[12], int how) {
+const char *(DescribeHow)(char buf[12], int how) {
   if (how == SIG_BLOCK)
     return "SIG_BLOCK";
   if (how == SIG_UNBLOCK)
diff --git a/libc/intrin/describeinoutint64.c b/libc/intrin/describeinoutint64.c
index 19a8f31ad..49fe1015b 100644
--- a/libc/intrin/describeinoutint64.c
+++ b/libc/intrin/describeinoutint64.c
@@ -20,7 +20,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/describeflags.h"
 
-const char *_DescribeInOutInt64(char buf[23], ssize_t rc, int64_t *x) {
+const char *(DescribeInOutInt64)(char buf[23], ssize_t rc, int64_t *x) {
   if (!x)
     return "NULL";
   char *p = buf;
diff --git a/libc/intrin/describeiovec.c b/libc/intrin/describeiovec.c
index 30a2f3afb..2f1e97350 100644
--- a/libc/intrin/describeiovec.c
+++ b/libc/intrin/describeiovec.c
@@ -21,14 +21,14 @@
 #include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #define N 300
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeIovec(char buf[N], ssize_t rc, const struct iovec *iov,
-                           int iovlen) {
+const char *(DescribeIovec)(char buf[N], ssize_t rc, const struct iovec *iov,
+                            int iovlen) {
   const char *d;
   int i, j, o = 0;
 
diff --git a/libc/intrin/describeiovnt.c b/libc/intrin/describeiovnt.c
index e33b493fb..8229301d2 100644
--- a/libc/intrin/describeiovnt.c
+++ b/libc/intrin/describeiovnt.c
@@ -19,10 +19,10 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/winsock.h"
 
-void _DescribeIovNt(const struct NtIovec *iov, uint32_t iovlen, ssize_t rem) {
+void DescribeIovNt(const struct NtIovec *iov, uint32_t iovlen, ssize_t rem) {
   int i;
   if (kisdangerous(iov)) {
     kprintf("%p", iov);
diff --git a/libc/intrin/describeitimer.c b/libc/intrin/describeitimer.c
index aa1f96c75..98c4e2a8c 100644
--- a/libc/intrin/describeitimer.c
+++ b/libc/intrin/describeitimer.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/itimer.h"
 
-const char *_DescribeItimer(char buf[12], int which) {
+const char *(DescribeItimer)(char buf[12], int which) {
   if (which == ITIMER_REAL)
     return "ITIMER_REAL";
   if (which == ITIMER_VIRTUAL)
diff --git a/libc/intrin/describeitimerval.c b/libc/intrin/describeitimerval.c
index 94af2a008..1e5661f50 100644
--- a/libc/intrin/describeitimerval.c
+++ b/libc/intrin/describeitimerval.c
@@ -25,8 +25,8 @@
 
 #define N 90
 
-const char *_DescribeItimerval(char buf[N], int rc,
-                               const struct itimerval *it) {
+const char *(DescribeItimerval)(char buf[N], int rc,
+                                const struct itimerval *it) {
   if (!it)
     return "NULL";
   if (rc == -1)
diff --git a/libc/intrin/describemagnums.c b/libc/intrin/describemagnums.c
index c5540cfc1..fe76de780 100644
--- a/libc/intrin/describemagnums.c
+++ b/libc/intrin/describemagnums.c
@@ -20,8 +20,8 @@
 #include "libc/fmt/magnumstrs.internal.h"
 #include "libc/str/str.h"
 
-const char *_DescribeMagnum(char *b, const struct MagnumStr *m, const char *p,
-                            int x) {
+const char *DescribeMagnum(char *b, const struct MagnumStr *m, const char *p,
+                           int x) {
   const char *s;
   if (x == 127)
     return "CLOCK_INVALID";
diff --git a/libc/intrin/describemapflags.c b/libc/intrin/describemapflags.c
index 7d6461b19..770798ac0 100644
--- a/libc/intrin/describemapflags.c
+++ b/libc/intrin/describemapflags.c
@@ -16,29 +16,25 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/consolemodeflags.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
 
-#define MAP_GROWSDOWN_LINUX 0x00000100
-
-const char *_DescribeMapFlags(char buf[64], int x) {
+const char *(DescribeMapFlags)(char buf[64], int x) {
   const struct DescribeFlags kMapFlags[] = {
-      {MAP_PRIVATE, "PRIVATE"},                            //
-      {MAP_ANONYMOUS, "ANONYMOUS"},                        //
-      {MAP_SHARED, "SHARED"},                              //
-      {MAP_FIXED, "FIXED"},                                //
-      {MAP_FIXED_NOREPLACE, "FIXED_NOREPLACE"},            //
-      {MAP_HUGETLB, "HUGETLB"},                            //
-      {MAP_CONCEAL, "CONCEAL"},                            //
-      {MAP_LOCKED, "LOCKED"},                              //
-      {MAP_NORESERVE, "NORESERVE"},                        //
-      {MAP_NONBLOCK, "NONBLOCK"},                          //
-      {MAP_POPULATE, "POPULATE"},                          //
-      {IsLinux() ? MAP_GROWSDOWN_LINUX : 0, "GROWSDOWN"},  //
+      {MAP_PRIVATE, "PRIVATE"},                  //
+      {MAP_ANONYMOUS, "ANONYMOUS"},              //
+      {MAP_SHARED, "SHARED"},                    //
+      {MAP_FIXED, "FIXED"},                      //
+      {MAP_FIXED_NOREPLACE, "FIXED_NOREPLACE"},  //
+      {MAP_HUGETLB, "HUGETLB"},                  //
+      {MAP_CONCEAL, "CONCEAL"},                  //
+      {MAP_LOCKED, "LOCKED"},                    //
+      {MAP_NORESERVE, "NORESERVE"},              //
+      {MAP_NONBLOCK, "NONBLOCK"},                //
+      {MAP_POPULATE, "POPULATE"},                //
   };
-  return _DescribeFlags(buf, 64, kMapFlags, ARRAYLEN(kMapFlags), "MAP_", x);
+  return DescribeFlags(buf, 64, kMapFlags, ARRAYLEN(kMapFlags), "MAP_", x);
 }
diff --git a/libc/intrin/describemapping.c b/libc/intrin/describemapping.c
index 9371028b8..79f1cf706 100644
--- a/libc/intrin/describemapping.c
+++ b/libc/intrin/describemapping.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/intrin/maps.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
@@ -25,13 +24,12 @@
 static char DescribeMapType(int flags) {
   switch (flags & MAP_TYPE) {
     case MAP_FILE:
-      if (flags & MAP_NOFORK)
-        return 'i';  // executable image
       return '-';
     case MAP_PRIVATE:
       if (flags & MAP_NOFORK)
-        return 'w';  // windows memory
-      return 'p';
+        return 'P';
+      else
+        return 'p';
     case MAP_SHARED:
       return 's';
     default:
@@ -47,7 +45,7 @@ char *DescribeProt(char p[4], int prot) {
   return p;
 }
 
-const char *_DescribeMapping(char p[8], int prot, int flags) {
+const char *(DescribeMapping)(char p[8], int prot, int flags) {
   /* asan runtime depends on this function */
   DescribeProt(p, prot);
   p[3] = DescribeMapType(flags);
diff --git a/libc/intrin/describemremapflags.c b/libc/intrin/describemremapflags.c
index 152f8a4ce..185206e3c 100644
--- a/libc/intrin/describemremapflags.c
+++ b/libc/intrin/describemremapflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/mremap.h"
 
 static const struct DescribeFlags kMremapFlags[] = {
@@ -25,7 +25,7 @@ static const struct DescribeFlags kMremapFlags[] = {
     {MREMAP_FIXED, "FIXED"},      //
 };
 
-const char *_DescribeMremapFlags(char buf[30], int x) {
-  return _DescribeFlags(buf, 30, kMremapFlags, ARRAYLEN(kMremapFlags),
-                        "MREMAP_", x);
+const char *(DescribeMremapFlags)(char buf[30], int x) {
+  return DescribeFlags(buf, 30, kMremapFlags, ARRAYLEN(kMremapFlags), "MREMAP_",
+                       x);
 }
diff --git a/libc/intrin/describemsg.c b/libc/intrin/describemsg.c
deleted file mode 100644
index 9cfc5372e..000000000
--- a/libc/intrin/describemsg.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
-#include "libc/sysv/consts/msg.h"
-
-const char *_DescribeMsg(char buf[16], int x) {
-  const struct DescribeFlags kMsgFlags[] = {
-      {MSG_FASTOPEN, "FASTOPEN"},    // order matters
-      {MSG_OOB, "OOB"},              //
-      {MSG_PEEK, "PEEK"},            //
-      {MSG_DONTROUTE, "DONTROUTE"},  //
-      {MSG_DONTWAIT, "DONTWAIT"},    //
-      {MSG_WAITALL, "WAITALL"},      //
-      {MSG_NOSIGNAL, "NOSIGNAL"},    //
-      {MSG_TRUNC, "TRUNC"},          //
-      {MSG_CTRUNC, "CTRUNC"},        //
-  };
-  return _DescribeFlags(buf, 16, kMsgFlags, ARRAYLEN(kMsgFlags), "MSG_", x);
-}
diff --git a/libc/intrin/describemsyncflags.c b/libc/intrin/describemsyncflags.c
index 56a838e97..481493489 100644
--- a/libc/intrin/describemsyncflags.c
+++ b/libc/intrin/describemsyncflags.c
@@ -17,14 +17,14 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/msync.h"
 
-const char *_DescribeMsyncFlags(char buf[48], int x) {
+const char *(DescribeMsyncFlags)(char buf[48], int x) {
   const struct DescribeFlags kMsyncFlags[] = {
       {MS_SYNC, "SYNC"},              //
       {MS_ASYNC, "ASYNC"},            //
       {MS_INVALIDATE, "INVALIDATE"},  //
   };
-  return _DescribeFlags(buf, 48, kMsyncFlags, ARRAYLEN(kMsyncFlags), "MS_", x);
+  return DescribeFlags(buf, 48, kMsyncFlags, ARRAYLEN(kMsyncFlags), "MS_", x);
 }
diff --git a/libc/intrin/describentconsolemodeinputflags.c b/libc/intrin/describentconsolemodeinputflags.c
index a53575481..caeeae037 100644
--- a/libc/intrin/describentconsolemodeinputflags.c
+++ b/libc/intrin/describentconsolemodeinputflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/consolemodeflags.h"
 
 static const struct DescribeFlags kConsoleModeInputFlags[] = {
@@ -33,7 +33,7 @@ static const struct DescribeFlags kConsoleModeInputFlags[] = {
     {kNtEnableVirtualTerminalInput, "VirtualTerminalInput"},  //
 };
 
-const char *_DescribeNtConsoleInFlags(char buf[256], uint32_t x) {
-  return _DescribeFlags(buf, 256, kConsoleModeInputFlags,
-                        ARRAYLEN(kConsoleModeInputFlags), "kNtEnable", x);
+const char *(DescribeNtConsoleInFlags)(char buf[256], uint32_t x) {
+  return DescribeFlags(buf, 256, kConsoleModeInputFlags,
+                       ARRAYLEN(kConsoleModeInputFlags), "kNtEnable", x);
 }
diff --git a/libc/intrin/describentconsolemodeoutputflags.c b/libc/intrin/describentconsolemodeoutputflags.c
index 2686e765a..68ab4c2c1 100644
--- a/libc/intrin/describentconsolemodeoutputflags.c
+++ b/libc/intrin/describentconsolemodeoutputflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/consolemodeflags.h"
 
 static const struct DescribeFlags kConsoleModeOutputFlags[] = {
@@ -28,7 +28,7 @@ static const struct DescribeFlags kConsoleModeOutputFlags[] = {
     {kNtEnableLvbGridWorldwide, "EnableLvbGridWorldwide"},                    //
 };
 
-const char *_DescribeNtConsoleOutFlags(char buf[128], uint32_t x) {
-  return _DescribeFlags(buf, 128, kConsoleModeOutputFlags,
-                        ARRAYLEN(kConsoleModeOutputFlags), "kNt", x);
+const char *(DescribeNtConsoleOutFlags)(char buf[128], uint32_t x) {
+  return DescribeFlags(buf, 128, kConsoleModeOutputFlags,
+                       ARRAYLEN(kConsoleModeOutputFlags), "kNt", x);
 }
diff --git a/libc/intrin/describentcreationdisposition.c b/libc/intrin/describentcreationdisposition.c
index 00636b1f7..136a8119f 100644
--- a/libc/intrin/describentcreationdisposition.c
+++ b/libc/intrin/describentcreationdisposition.c
@@ -19,7 +19,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/nt/enum/creationdisposition.h"
 
-const char *_DescribeNtCreationDisposition(uint32_t x) {
+const char *DescribeNtCreationDisposition(uint32_t x) {
   switch (x) {
     case kNtCreateNew:
       return "kNtCreateNew";
diff --git a/libc/intrin/describentfileaccessflags.c b/libc/intrin/describentfileaccessflags.c
index 582f519e4..996c9f36b 100644
--- a/libc/intrin/describentfileaccessflags.c
+++ b/libc/intrin/describentfileaccessflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/accessmask.h"
 #include "libc/nt/enum/filesharemode.h"
 // clang-format off
@@ -72,7 +72,7 @@ static const struct DescribeFlags kFileAccessflags[] = {
     {kNtTokenAdjustSessionid, "kNtTokenAdjustSessionid"},
 };
 
-const char *_DescribeNtFileAccessFlags(char buf[512], uint32_t x) {
-  return _DescribeFlags(buf, 512, kFileAccessflags, ARRAYLEN(kFileAccessflags),
+const char *(DescribeNtFileAccessFlags)(char buf[512], uint32_t x) {
+  return DescribeFlags(buf, 512, kFileAccessflags, ARRAYLEN(kFileAccessflags),
                        "", x);
 }
diff --git a/libc/intrin/describentfileflagattr.c b/libc/intrin/describentfileflagattr.c
index 4dc9528df..a27024fbd 100644
--- a/libc/intrin/describentfileflagattr.c
+++ b/libc/intrin/describentfileflagattr.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/fileflagandattributes.h"
 #include "libc/runtime/runtime.h"
 
@@ -50,9 +50,9 @@ static const struct DescribeFlags kFileFlags[] = {
     {kNtFileFlagFirstPipeInstance, "FlagFirstPipeInstance"},            //
 };
 
-const char *_DescribeNtFileFlagAttr(char buf[256], uint32_t x) {
+const char *(DescribeNtFileFlagAttr)(char buf[256], uint32_t x) {
   if (x == -1u)
     return "-1u";
-  return _DescribeFlags(buf, 256, kFileFlags, ARRAYLEN(kFileFlags), "kNtFile",
-                        x);
+  return DescribeFlags(buf, 256, kFileFlags, ARRAYLEN(kFileFlags), "kNtFile",
+                       x);
 }
diff --git a/libc/intrin/describentfilemapflags.c b/libc/intrin/describentfilemapflags.c
index 08ba98242..11d28d561 100644
--- a/libc/intrin/describentfilemapflags.c
+++ b/libc/intrin/describentfilemapflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/filemapflags.h"
 
 static const struct DescribeFlags kFileMapFlags[] = {
@@ -30,7 +30,7 @@ static const struct DescribeFlags kFileMapFlags[] = {
     {kNtFileMapLargePages, "LargePages"},          //
 };
 
-const char *_DescribeNtFileMapFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kFileMapFlags, ARRAYLEN(kFileMapFlags),
-                        "kNtFileMap", x);
+const char *(DescribeNtFileMapFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kFileMapFlags, ARRAYLEN(kFileMapFlags),
+                       "kNtFileMap", x);
 }
diff --git a/libc/intrin/describentfileshareflags.c b/libc/intrin/describentfileshareflags.c
index 52f3aeee4..865cda1d7 100644
--- a/libc/intrin/describentfileshareflags.c
+++ b/libc/intrin/describentfileshareflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/filesharemode.h"
 
 static const struct DescribeFlags kFileShareflags[] = {
@@ -26,7 +26,7 @@ static const struct DescribeFlags kFileShareflags[] = {
     {kNtFileShareDelete, "Delete"},  //
 };
 
-const char *_DescribeNtFileShareFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kFileShareflags, ARRAYLEN(kFileShareflags),
-                        "kNtFileShare", x);
+const char *(DescribeNtFileShareFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kFileShareflags, ARRAYLEN(kFileShareflags),
+                       "kNtFileShare", x);
 }
diff --git a/libc/intrin/describentfiletypeflags.c b/libc/intrin/describentfiletypeflags.c
index 7cecebc04..8e720f302 100644
--- a/libc/intrin/describentfiletypeflags.c
+++ b/libc/intrin/describentfiletypeflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/filetype.h"
 #include "libc/sysv/consts/mremap.h"
 
@@ -28,7 +28,7 @@ static const struct DescribeFlags kFiletypeFlags[] = {
     {kNtFileTypeChar, "Char"},      //
 };
 
-const char *_DescribeNtFiletypeFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kFiletypeFlags, ARRAYLEN(kFiletypeFlags),
-                        "kNtFileType", x);
+const char *(DescribeNtFiletypeFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kFiletypeFlags, ARRAYLEN(kFiletypeFlags),
+                       "kNtFileType", x);
 }
diff --git a/libc/intrin/describentlockfileflags.c b/libc/intrin/describentlockfileflags.c
index 007b5c74d..69f0875bc 100644
--- a/libc/intrin/describentlockfileflags.c
+++ b/libc/intrin/describentlockfileflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/filelockflags.h"
 
 static const struct DescribeFlags kNtLockFileFlags[] = {
@@ -25,7 +25,7 @@ static const struct DescribeFlags kNtLockFileFlags[] = {
     {kNtLockfileExclusiveLock, "ExclusiveLock"},      //
 };
 
-const char *_DescribeNtLockFileFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kNtLockFileFlags, ARRAYLEN(kNtLockFileFlags),
-                        "kNtLockfile", x);
+const char *(DescribeNtLockFileFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kNtLockFileFlags, ARRAYLEN(kNtLockFileFlags),
+                       "kNtLockfile", x);
 }
diff --git a/libc/intrin/describentmovfileinpflags.c b/libc/intrin/describentmovfileinpflags.c
index 311a3992b..1b301cd6a 100644
--- a/libc/intrin/describentmovfileinpflags.c
+++ b/libc/intrin/describentmovfileinpflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/movefileexflags.h"
 
 static const struct DescribeFlags kMoveFileInputFlags[] = {
@@ -29,7 +29,7 @@ static const struct DescribeFlags kMoveFileInputFlags[] = {
     {kNtMovefileFailIfNotTrackable, "FailIfNotTrackable"},  //
 };
 
-const char *_DescribeNtMovFileInpFlags(char buf[256], uint32_t x) {
-  return _DescribeFlags(buf, 256, kMoveFileInputFlags,
-                        ARRAYLEN(kMoveFileInputFlags), "kNtMovefile", x);
+const char *(DescribeNtMovFileInpFlags)(char buf[256], uint32_t x) {
+  return DescribeFlags(buf, 256, kMoveFileInputFlags,
+                       ARRAYLEN(kMoveFileInputFlags), "kNtMovefile", x);
 }
diff --git a/libc/intrin/describentoverlapped.c b/libc/intrin/describentoverlapped.c
index c21e231e5..d6727424d 100644
--- a/libc/intrin/describentoverlapped.c
+++ b/libc/intrin/describentoverlapped.c
@@ -18,9 +18,9 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describentoverlapped.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
-const char *_DescribeNtOverlapped(char b[128], const struct NtOverlapped *o) {
+const char *(DescribeNtOverlapped)(char b[128], const struct NtOverlapped *o) {
   int i = 0, n = 128;
   bool gotsome = false;
   if (!o)
diff --git a/libc/intrin/describentoverlapped.h b/libc/intrin/describentoverlapped.h
index b7d97f6d3..009dad0c1 100644
--- a/libc/intrin/describentoverlapped.h
+++ b/libc/intrin/describentoverlapped.h
@@ -4,8 +4,8 @@
 #include "libc/nt/struct/overlapped.h"
 COSMOPOLITAN_C_START_
 
-const char *_DescribeNtOverlapped(char[128], const struct NtOverlapped *);
-#define DescribeNtOverlapped(x) _DescribeNtOverlapped(alloca(128), x)
+const char *DescribeNtOverlapped(char[128], const struct NtOverlapped *);
+#define DescribeNtOverlapped(x) DescribeNtOverlapped(alloca(128), x)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_INTRIN_DESCRIBENTOVERLAPPED_INTERNAL_H_ */
diff --git a/libc/intrin/describentpageflags.c b/libc/intrin/describentpageflags.c
index aac644adb..30acb62bc 100644
--- a/libc/intrin/describentpageflags.c
+++ b/libc/intrin/describentpageflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/pageflags.h"
 
 static const struct DescribeFlags kPageFlags[] = {
@@ -41,6 +41,6 @@ static const struct DescribeFlags kPageFlags[] = {
     {kNtSecWritecombine, "SecWritecombine"},            //
 };
 
-const char *_DescribeNtPageFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kPageFlags, ARRAYLEN(kPageFlags), "kNt", x);
+const char *(DescribeNtPageFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kPageFlags, ARRAYLEN(kPageFlags), "kNt", x);
 }
diff --git a/libc/intrin/describentpipemodeflags.c b/libc/intrin/describentpipemodeflags.c
index 0c8f58bfc..4bab699f4 100644
--- a/libc/intrin/describentpipemodeflags.c
+++ b/libc/intrin/describentpipemodeflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/filemapflags.h"
 #include "libc/nt/ipc.h"
 
@@ -32,7 +32,7 @@ static const struct DescribeFlags kPipeModeFlags[] = {
     //{kNtPipeTypeByte, "TypeByte"},                        // 0x00000000
 };
 
-const char *_DescribeNtPipeModeFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kPipeModeFlags, ARRAYLEN(kPipeModeFlags),
-                        "kNtPipe", x);
+const char *(DescribeNtPipeModeFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kPipeModeFlags, ARRAYLEN(kPipeModeFlags),
+                       "kNtPipe", x);
 }
diff --git a/libc/intrin/describentpipeopenflags.c b/libc/intrin/describentpipeopenflags.c
index 623075468..bc8134229 100644
--- a/libc/intrin/describentpipeopenflags.c
+++ b/libc/intrin/describentpipeopenflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/accessmask.h"
 #include "libc/nt/enum/fileflagandattributes.h"
 #include "libc/nt/enum/filemapflags.h"
@@ -35,7 +35,7 @@ static const struct DescribeFlags kPipeOpenFlags[] = {
     {kNtAccessSystemSecurity, "kNtAccessSystemSecurity"},
 };
 
-const char *_DescribeNtPipeOpenFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kPipeOpenFlags, ARRAYLEN(kPipeOpenFlags), "",
-                        x);
+const char *(DescribeNtPipeOpenFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kPipeOpenFlags, ARRAYLEN(kPipeOpenFlags), "",
+                       x);
 }
diff --git a/libc/intrin/describentprocaccessflags.c b/libc/intrin/describentprocaccessflags.c
index 63b94754b..a7f5db917 100644
--- a/libc/intrin/describentprocaccessflags.c
+++ b/libc/intrin/describentprocaccessflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/processaccess.h"
 
 static const struct DescribeFlags kProcessAccessflags[] = {
@@ -37,7 +37,7 @@ static const struct DescribeFlags kProcessAccessflags[] = {
     {kNtProcessSynchronize, "Synchronize"},                          //
 };
 
-const char *_DescribeNtProcAccessFlags(char buf[256], uint32_t x) {
-  return _DescribeFlags(buf, 256, kProcessAccessflags,
-                        ARRAYLEN(kProcessAccessflags), "kNtProcess", x);
+const char *(DescribeNtProcAccessFlags)(char buf[256], uint32_t x) {
+  return DescribeFlags(buf, 256, kProcessAccessflags,
+                       ARRAYLEN(kProcessAccessflags), "kNtProcess", x);
 }
diff --git a/libc/intrin/describentsecurityattributes.c b/libc/intrin/describentsecurityattributes.c
index 674fe2f28..058c2a49f 100644
--- a/libc/intrin/describentsecurityattributes.c
+++ b/libc/intrin/describentsecurityattributes.c
@@ -21,8 +21,9 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/nt/struct/securityattributes.h"
 
-const char *_DescribeNtSecurityAttributes(
-    char buf[32], const struct NtSecurityAttributes *p) {
+const char *(
+    DescribeNtSecurityAttributes)(char buf[32],
+                                  const struct NtSecurityAttributes *p) {
   FormatInt64(buf, (uintptr_t)p);
   return buf;
 }
diff --git a/libc/intrin/describentstartflags.c b/libc/intrin/describentstartflags.c
index 6af9c8744..6a46b736c 100644
--- a/libc/intrin/describentstartflags.c
+++ b/libc/intrin/describentstartflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/startf.h"
 #include "libc/sysv/consts/prot.h"
 
@@ -38,7 +38,7 @@ static const struct DescribeFlags kNtStartFlags[] = {
     {kNtStartfUntrustedsource, "Untrustedsource"},    //
 };
 
-const char *_DescribeNtStartFlags(char buf[128], uint32_t x) {
-  return _DescribeFlags(buf, 128, kNtStartFlags, ARRAYLEN(kNtStartFlags),
-                        "kNtStartf", x);
+const char *(DescribeNtStartFlags)(char buf[128], uint32_t x) {
+  return DescribeFlags(buf, 128, kNtStartFlags, ARRAYLEN(kNtStartFlags),
+                       "kNtStartf", x);
 }
diff --git a/libc/intrin/describentsymlinkflags.c b/libc/intrin/describentsymlinkflags.c
index c9924fba5..85e5d5896 100644
--- a/libc/intrin/describentsymlinkflags.c
+++ b/libc/intrin/describentsymlinkflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/symboliclink.h"
 
 static const struct DescribeFlags kSymbolicLinkflags[] = {
@@ -25,7 +25,7 @@ static const struct DescribeFlags kSymbolicLinkflags[] = {
     {kNtSymbolicLinkFlagAllowUnprivilegedCreate, "AllowUnprivilegedCreate"},  //
 };
 
-const char *_DescribeNtSymlinkFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kSymbolicLinkflags,
-                        ARRAYLEN(kSymbolicLinkflags), "kNtSymbolicLinkFlag", x);
+const char *(DescribeNtSymlinkFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kSymbolicLinkflags,
+                       ARRAYLEN(kSymbolicLinkflags), "kNtSymbolicLinkFlag", x);
 }
diff --git a/libc/intrin/describeopenflags.c b/libc/intrin/describeopenflags.c
index 0f640288f..984ecd76e 100644
--- a/libc/intrin/describeopenflags.c
+++ b/libc/intrin/describeopenflags.c
@@ -20,7 +20,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/fmt/magnumstrs.internal.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/sol.h"
@@ -30,7 +30,7 @@
 /**
  * Describes clock_gettime() clock argument.
  */
-const char *_DescribeOpenFlags(char buf[128], int x) {
+const char *(DescribeOpenFlags)(char buf[128], int x) {
   char *p;
   int i, n;
   const char *pipe;
@@ -68,7 +68,7 @@ const char *_DescribeOpenFlags(char buf[128], int x) {
       d[i].flag = MAGNUM_NUMBER(kOpenFlags, i);
       d[i].name = MAGNUM_STRING(kOpenFlags, i);
     }
-    _DescribeFlags(p, 128 - (p - buf), d, n, "O_", x);
+    DescribeFlags(p, 128 - (p - buf), d, n, "O_", x);
   }
   return buf;
 }
diff --git a/libc/intrin/describeopenmode.c b/libc/intrin/describeopenmode.c
index bfcc3397e..bbfa86e8e 100644
--- a/libc/intrin/describeopenmode.c
+++ b/libc/intrin/describeopenmode.c
@@ -28,7 +28,7 @@ static bool IsCreatingFile(int flags) {
          (IsLinux() && (flags & O_TMPFILE_LINUX) == O_TMPFILE_LINUX);
 }
 
-const char *_DescribeOpenMode(char buf[15], int flags, int mode) {
+const char *(DescribeOpenMode)(char buf[15], int flags, int mode) {
   if (!IsCreatingFile(flags)) {
     return "";
   }
diff --git a/libc/intrin/describepersonalityflags.c b/libc/intrin/describepersonalityflags.c
index 88fab3203..86d5563cf 100644
--- a/libc/intrin/describepersonalityflags.c
+++ b/libc/intrin/describepersonalityflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/accessmask.h"
 #include "libc/nt/enum/filesharemode.h"
 #include "libc/sysv/consts/personality.h"
@@ -36,7 +36,7 @@ static const struct DescribeFlags kPersonalityFlags[] = {
     {UNAME26, "UNAME26"},                        //
 };
 
-const char *_DescribePersonalityFlags(char buf[128], int x) {
-  return _DescribeFlags(buf, 128, kPersonalityFlags,
-                        ARRAYLEN(kPersonalityFlags), "", x);
+const char *(DescribePersonalityFlags)(char buf[128], int x) {
+  return DescribeFlags(buf, 128, kPersonalityFlags, ARRAYLEN(kPersonalityFlags),
+                       "", x);
 }
diff --git a/libc/intrin/describepollfds.c b/libc/intrin/describepollfds.c
index 94ebd2da5..dd1b8a19a 100644
--- a/libc/intrin/describepollfds.c
+++ b/libc/intrin/describepollfds.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/struct/pollfd.h"
 #include "libc/sock/struct/pollfd.internal.h"
 
@@ -28,8 +28,8 @@
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribePollFds(char buf[N], ssize_t rc, struct pollfd *fds,
-                             size_t nfds) {
+const char *(DescribePollFds)(char buf[N], ssize_t rc, struct pollfd *fds,
+                              size_t nfds) {
   char b64[64];
   int i, o = 0;
 
@@ -45,9 +45,9 @@ const char *_DescribePollFds(char buf[N], ssize_t rc, struct pollfd *fds,
   for (i = 0; i < nfds; ++i) {
     if (i)
       append(", ");
-    append("{%d, %s", fds[i].fd, _DescribePollFlags(b64, fds[i].events));
+    append("{%d, %s", fds[i].fd, (DescribePollFlags)(b64, fds[i].events));
     if (rc >= 0) {
-      append(", [%s]", _DescribePollFlags(b64, fds[i].revents));
+      append(", [%s]", (DescribePollFlags)(b64, fds[i].revents));
     }
     append("}");
   }
diff --git a/libc/intrin/describepollflags.c b/libc/intrin/describepollflags.c
index 6445b4e54..e902914a3 100644
--- a/libc/intrin/describepollflags.c
+++ b/libc/intrin/describepollflags.c
@@ -17,11 +17,11 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/filemapflags.h"
 #include "libc/sysv/consts/poll.h"
 
-const char *_DescribePollFlags(char buf[64], int x) {
+const char *(DescribePollFlags)(char buf[64], int x) {
   const struct DescribeFlags kPollFlags[] = {
       {POLLIN, "IN"},          // order matters
       {POLLOUT, "OUT"},        // order matters
@@ -35,5 +35,5 @@ const char *_DescribePollFlags(char buf[64], int x) {
       {POLLWRBAND, "WRBAND"},  //
       {POLLWRNORM, "WRNORM"},  //
   };
-  return _DescribeFlags(buf, 64, kPollFlags, ARRAYLEN(kPollFlags), "POLL", x);
+  return DescribeFlags(buf, 64, kPollFlags, ARRAYLEN(kPollFlags), "POLL", x);
 }
diff --git a/libc/intrin/describeprotflags.c b/libc/intrin/describeprotflags.c
index 9fad2bd32..33baf5fcf 100644
--- a/libc/intrin/describeprotflags.c
+++ b/libc/intrin/describeprotflags.c
@@ -17,17 +17,15 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/prot.h"
 
-const char *_DescribeProtFlags(char buf[48], int x) {
-  if (!x)
-    return "PROT_NONE";
+const char *(DescribeProtFlags)(char buf[48], int x) {
   const struct DescribeFlags kProtFlags[] = {
       {PROT_READ, "READ"},    //
       {PROT_WRITE, "WRITE"},  //
       {PROT_EXEC, "EXEC"},    //
       {PROT_GUARD, "GUARD"},  //
   };
-  return _DescribeFlags(buf, 48, kProtFlags, ARRAYLEN(kProtFlags), "PROT_", x);
+  return DescribeFlags(buf, 48, kProtFlags, ARRAYLEN(kProtFlags), "PROT_", x);
 }
diff --git a/libc/intrin/describeptrace.c b/libc/intrin/describeptrace.c
index 84baa0e5d..d4c6f4fec 100644
--- a/libc/intrin/describeptrace.c
+++ b/libc/intrin/describeptrace.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/ptrace.h"
 
-const char *_DescribePtrace(char buf[12], int x) {
+const char *(DescribePtrace)(char buf[12], int x) {
   if (x == -1)
     return "-1";
   if (x == PTRACE_TRACEME)
diff --git a/libc/intrin/describeptraceevent.c b/libc/intrin/describeptraceevent.c
index 11c0c4699..b3ba1ee18 100644
--- a/libc/intrin/describeptraceevent.c
+++ b/libc/intrin/describeptraceevent.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/ptrace.h"
 
-const char *_DescribePtraceEvent(char buf[32], int x) {
+const char *(DescribePtraceEvent)(char buf[32], int x) {
   if (x == PTRACE_EVENT_FORK)
     return "PTRACE_EVENT_FORK";
   if (x == PTRACE_EVENT_VFORK)
diff --git a/libc/intrin/describerlimit.c b/libc/intrin/describerlimit.c
index 7c58a965b..feb7574ee 100644
--- a/libc/intrin/describerlimit.c
+++ b/libc/intrin/describerlimit.c
@@ -24,7 +24,7 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/rlim.h"
 
-const char *_DescribeRlimit(char buf[64], int rc, const struct rlimit *rlim) {
+const char *DescribeRlimit(char buf[64], int rc, const struct rlimit *rlim) {
   if (rc == -1)
     return "n/a";
   if (!rlim)
diff --git a/libc/intrin/describerlimitname.c b/libc/intrin/describerlimitname.c
index 1872e7792..15ee5a7b9 100644
--- a/libc/intrin/describerlimitname.c
+++ b/libc/intrin/describerlimitname.c
@@ -22,8 +22,8 @@
 /**
  * Describes setrlimit() / getrlimit() argument.
  */
-const char *_DescribeRlimitName(char buf[20], int x) {
+const char *(DescribeRlimitName)(char buf[20], int x) {
   if (x == 127)
     return "n/a";
-  return _DescribeMagnum(buf, kRlimitNames, "RLIMIT_", x);
+  return DescribeMagnum(buf, kRlimitNames, "RLIMIT_", x);
 }
diff --git a/libc/intrin/describeschedparam.c b/libc/intrin/describeschedparam.c
index f559ffad5..369f52d93 100644
--- a/libc/intrin/describeschedparam.c
+++ b/libc/intrin/describeschedparam.c
@@ -24,7 +24,7 @@
 /**
  * Describes clock_gettime() clock argument.
  */
-const char *_DescribeSchedParam(char buf[32], const struct sched_param *x) {
+const char *(DescribeSchedParam)(char buf[32], const struct sched_param *x) {
   char *p;
   if (!x)
     return "0";
diff --git a/libc/intrin/describeschedpolicy.c b/libc/intrin/describeschedpolicy.c
index 24c7eb67c..687636ae3 100644
--- a/libc/intrin/describeschedpolicy.c
+++ b/libc/intrin/describeschedpolicy.c
@@ -19,14 +19,14 @@
 #include "libc/fmt/itoa.h"
 #include "libc/fmt/magnumstrs.internal.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/sched.h"
 
 /**
  * Describes clock_gettime() clock argument.
  */
-const char *_DescribeSchedPolicy(char buf[48], int x) {
+const char *(DescribeSchedPolicy)(char buf[48], int x) {
   char *p = buf;
   if (x == -1) {
     goto DoNumber;
diff --git a/libc/intrin/describeseccompoperation.c b/libc/intrin/describeseccompoperation.c
index 824f10841..a18b18d6f 100644
--- a/libc/intrin/describeseccompoperation.c
+++ b/libc/intrin/describeseccompoperation.c
@@ -19,7 +19,7 @@
 #include "libc/calls/struct/seccomp.internal.h"
 #include "libc/intrin/describeflags.h"
 
-const char *_DescribeSeccompOperation(int x) {
+const char *DescribeSeccompOperation(int x) {
   switch (x) {
     case SECCOMP_SET_MODE_STRICT:
       return "SECCOMP_SET_MODE_STRICT";
diff --git a/libc/intrin/describesicode.c b/libc/intrin/describesicode.c
index a9e33ca4a..56dcf898d 100644
--- a/libc/intrin/describesicode.c
+++ b/libc/intrin/describesicode.c
@@ -38,7 +38,7 @@ static void NameIt(char p[20], const char *s, int si_code) {
 /**
  * Returns symbolic name for siginfo::si_code value.
  */
-const char *_DescribeSiCode(char b[20], int sig, int si_code) {
+const char *(DescribeSiCode)(char b[20], int sig, int si_code) {
   NameIt(b, "SI_", si_code);
   if (si_code == SI_QUEUE) {
     strcpy(b + 3, "QUEUE"); /* sent by sigqueue(2) */
diff --git a/libc/intrin/describesigaction.c b/libc/intrin/describesigaction.c
index 5a7a417a6..a9b1fb8d6 100644
--- a/libc/intrin/describesigaction.c
+++ b/libc/intrin/describesigaction.c
@@ -23,7 +23,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alloca.h"
 #include "libc/sysv/consts/sa.h"
 
@@ -51,15 +51,15 @@ static const char *DescribeSigFlags(char buf[64], int x) {
       {SA_ONESHOT, "ONESHOT"},      //
       {0x04000000, "RESTORER"},     //
   };
-  return _DescribeFlags(buf, 64, kSigFlags, ARRAYLEN(kSigFlags), "SA_", x);
+  return DescribeFlags(buf, 64, kSigFlags, ARRAYLEN(kSigFlags), "SA_", x);
 }
 
 #define N 256
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeSigaction(char buf[N], int rc,
-                               const struct sigaction *sa) {
+const char *(DescribeSigaction)(char buf[N], int rc,
+                                const struct sigaction *sa) {
   int o = 0;
   char b64[64];
 
diff --git a/libc/intrin/describesigaltstack.c b/libc/intrin/describesigaltstack.c
index 71ed50335..32cdb3bc2 100644
--- a/libc/intrin/describesigaltstack.c
+++ b/libc/intrin/describesigaltstack.c
@@ -21,8 +21,8 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
 
-const char *_DescribeSigaltstack(char buf[128], int rc,
-                                 const struct sigaltstack *ss) {
+const char *(DescribeSigaltstack)(char buf[128], int rc,
+                                  const struct sigaltstack *ss) {
   if (rc == -1)
     return "n/a";
   if (!ss)
diff --git a/libc/intrin/describesigaltstackflags.c b/libc/intrin/describesigaltstackflags.c
index 33354d07e..e9c7c6e8b 100644
--- a/libc/intrin/describesigaltstackflags.c
+++ b/libc/intrin/describesigaltstackflags.c
@@ -17,14 +17,14 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/ss.h"
 
-const char *_DescribeSigaltstackFlags(char buf[22], int x) {
+const char *(DescribeSigaltstackFlags)(char buf[22], int x) {
   const struct DescribeFlags kSigaltstackFlags[] = {
       {SS_ONSTACK, "ONSTACK"},  //
       {SS_DISABLE, "DISABLE"},  //
   };
-  return _DescribeFlags(buf, 48, kSigaltstackFlags, ARRAYLEN(kSigaltstackFlags),
-                        "SS_", x);
+  return DescribeFlags(buf, 48, kSigaltstackFlags, ARRAYLEN(kSigaltstackFlags),
+                       "SS_", x);
 }
diff --git a/libc/intrin/describesiginfo.c b/libc/intrin/describesiginfo.c
index 074e09442..8235b078b 100644
--- a/libc/intrin/describesiginfo.c
+++ b/libc/intrin/describesiginfo.c
@@ -29,7 +29,7 @@
 
 #define append(...) i += ksnprintf(buf + i, N - i, __VA_ARGS__)
 
-const char *_DescribeSiginfo(char buf[N], int rc, const siginfo_t *si) {
+const char *(DescribeSiginfo)(char buf[N], int rc, const siginfo_t *si) {
   int i = 0;
 
   if (rc == -1)
diff --git a/libc/intrin/describesigset.c b/libc/intrin/describesigset.c
index ee1529155..e05758d49 100644
--- a/libc/intrin/describesigset.c
+++ b/libc/intrin/describesigset.c
@@ -31,7 +31,7 @@
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeSigset(char buf[N], int rc, const sigset_t *ss) {
+const char *(DescribeSigset)(char buf[N], int rc, const sigset_t *ss) {
   int olderr;
   bool gotsome;
   const char *s;
diff --git a/libc/intrin/describesleepflags.c b/libc/intrin/describesleepflags.c
index 2bcfaf4d7..858a254f6 100644
--- a/libc/intrin/describesleepflags.c
+++ b/libc/intrin/describesleepflags.c
@@ -24,7 +24,7 @@
 /**
  * Describes clock_nanosleep() flags argument.
  */
-const char *_DescribeSleepFlags(char buf[16], int x) {
+const char *(DescribeSleepFlags)(char buf[16], int x) {
   switch (x) {
     case 0:
       return "0";
diff --git a/libc/intrin/describesocketfamily.c b/libc/intrin/describesocketfamily.c
index 1144ec8da..e4bacb527 100644
--- a/libc/intrin/describesocketfamily.c
+++ b/libc/intrin/describesocketfamily.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/af.h"
 
-const char *_DescribeSocketFamily(char buf[12], int family) {
+const char *(DescribeSocketFamily)(char buf[12], int family) {
   if (family == AF_UNIX)
     return "AF_UNIX";
   if (family == AF_INET)
diff --git a/libc/intrin/describesocketprotocol.c b/libc/intrin/describesocketprotocol.c
index 16059fa1f..39086245f 100644
--- a/libc/intrin/describesocketprotocol.c
+++ b/libc/intrin/describesocketprotocol.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/ipproto.h"
 
-const char *_DescribeSocketProtocol(char buf[12], int family) {
+const char *(DescribeSocketProtocol)(char buf[12], int family) {
   if (family == IPPROTO_IP)
     return "IPPROTO_IP";
   if (family == IPPROTO_ICMP)
diff --git a/libc/intrin/describesockettype.c b/libc/intrin/describesockettype.c
index c8084b573..f28ffc5b8 100644
--- a/libc/intrin/describesockettype.c
+++ b/libc/intrin/describesockettype.c
@@ -21,7 +21,7 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/sock.h"
 
-const char *_DescribeSocketType(char buf[64], int type) {
+const char *(DescribeSocketType)(char buf[64], int type) {
   int x;
   char *p;
   p = buf;
diff --git a/libc/intrin/describesocklevel.c b/libc/intrin/describesocklevel.c
index d2b981a36..8edadadc5 100644
--- a/libc/intrin/describesocklevel.c
+++ b/libc/intrin/describesocklevel.c
@@ -23,7 +23,7 @@
 /**
  * Describes setsockopt() level arguments.
  */
-const char *_DescribeSockLevel(char buf[12], int x) {
+const char *(DescribeSockLevel)(char buf[12], int x) {
   if (x == SOL_SOCKET)
     return "SOL_SOCKET";
   if (x == SOL_IP)
diff --git a/libc/intrin/describesockoptname.c b/libc/intrin/describesockoptname.c
index acfe5b25d..baf37f81b 100644
--- a/libc/intrin/describesockoptname.c
+++ b/libc/intrin/describesockoptname.c
@@ -25,7 +25,7 @@
 /**
  * Describes setsockopt() optname arguments.
  */
-const char *_DescribeSockOptname(char buf[32], int l, int x) {
+const char *(DescribeSockOptname)(char buf[32], int l, int x) {
   char *p;
   const char *s;
   const struct MagnumStr *ms;
@@ -49,14 +49,6 @@ const char *_DescribeSockOptname(char buf[32], int l, int x) {
       *p++ = '_';
       *p = 0;
       ms = kIpOptnames;
-    } else if (l == SOL_IPV6) {
-      *p++ = 'I';
-      *p++ = 'P';
-      *p++ = 'V';
-      *p++ = '6';
-      *p++ = '_';
-      *p = 0;
-      ms = kIpv6Optnames;
     } else {
       ms = 0;
     }
diff --git a/libc/intrin/describestat.c b/libc/intrin/describestat.c
index 530708568..102e58ef7 100644
--- a/libc/intrin/describestat.c
+++ b/libc/intrin/describestat.c
@@ -25,7 +25,7 @@
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeStat(char buf[N], int rc, const struct stat *st) {
+const char *(DescribeStat)(char buf[N], int rc, const struct stat *st) {
   int o = 0;
 
   if (rc == -1)
diff --git a/libc/intrin/describestatfs.c b/libc/intrin/describestatfs.c
index 439fb925f..787062edc 100644
--- a/libc/intrin/describestatfs.c
+++ b/libc/intrin/describestatfs.c
@@ -27,7 +27,7 @@
 
 #define append(...) i += ksnprintf(buf + i, N - i, __VA_ARGS__)
 
-const char *_DescribeStatfs(char buf[N], int rc, const struct statfs *f) {
+const char *(DescribeStatfs)(char buf[N], int rc, const struct statfs *f) {
   int i = 0;
   char ibuf[21];
   int64_t flags;
diff --git a/libc/intrin/describestdiostate.c b/libc/intrin/describestdiostate.c
index ec7fdba3e..822bd2ed9 100644
--- a/libc/intrin/describestdiostate.c
+++ b/libc/intrin/describestdiostate.c
@@ -21,7 +21,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/str/str.h"
 
-const char *_DescribeStdioState(char buf[12], int x) {
+const char *(DescribeStdioState)(char buf[12], int x) {
   if (!x)
     return "";
   if (x == -1)
diff --git a/libc/intrin/describestringlist.c b/libc/intrin/describestringlist.c
index 67baea91e..9f0e5949f 100644
--- a/libc/intrin/describestringlist.c
+++ b/libc/intrin/describestringlist.c
@@ -24,7 +24,7 @@
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeStringList(char buf[N], char *const list[]) {
+const char *(DescribeStringList)(char buf[N], char *const list[]) {
   int i, o = 0;
 
   if (!list)
diff --git a/libc/intrin/describetermios.c b/libc/intrin/describetermios.c
index 1cbb40694..6a76a2234 100644
--- a/libc/intrin/describetermios.c
+++ b/libc/intrin/describetermios.c
@@ -22,15 +22,15 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/termios.h"
 
 #define N 1024
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeTermios(char buf[N], ssize_t rc,
-                             const struct termios *tio) {
+const char *(DescribeTermios)(char buf[N], ssize_t rc,
+                              const struct termios *tio) {
   int o = 0;
   char b128[128];
 
@@ -61,7 +61,7 @@ const char *_DescribeTermios(char buf[N], ssize_t rc,
       {IUTF8, "IUTF8"},      //
   };
   append(".c_iflag=%s",
-         _DescribeFlags(b128, 128, kInput, ARRAYLEN(kInput), "", tio->c_iflag));
+         DescribeFlags(b128, 128, kInput, ARRAYLEN(kInput), "", tio->c_iflag));
 
   struct DescribeFlags kOutput[] = {
       {OPOST, "OPOST"},    //
@@ -83,8 +83,8 @@ const char *_DescribeTermios(char buf[N], ssize_t rc,
       {VT1, "VT1"},        //
       {FF1, "FF1"},        //
   };
-  append(", .c_oflag=%s", _DescribeFlags(b128, 128, kOutput, ARRAYLEN(kOutput),
-                                         "", tio->c_oflag));
+  append(", .c_oflag=%s", DescribeFlags(b128, 128, kOutput, ARRAYLEN(kOutput),
+                                        "", tio->c_oflag));
 
   struct DescribeFlags kControl[] = {
       {CS8, "CS8"},          //
@@ -98,8 +98,8 @@ const char *_DescribeTermios(char buf[N], ssize_t rc,
       {CLOCAL, "CLOCAL"},    //
       {CRTSCTS, "CRTSCTS"},  //
   };
-  append(", .c_cflag=%s", _DescribeFlags(b128, 128, kControl,
-                                         ARRAYLEN(kControl), "", tio->c_cflag));
+  append(", .c_cflag=%s", DescribeFlags(b128, 128, kControl, ARRAYLEN(kControl),
+                                        "", tio->c_cflag));
 
   struct DescribeFlags kLocal[] = {
       {ISIG, "ISIG"},        //
@@ -125,7 +125,7 @@ const char *_DescribeTermios(char buf[N], ssize_t rc,
          ".c_cc[VTIME]=%d, "
          ".c_cc[VINTR]=CTRL(%#c), "
          ".c_cc[VQUIT]=CTRL(%#c)",
-         _DescribeFlags(b128, 128, kLocal, ARRAYLEN(kLocal), "", tio->c_lflag),
+         DescribeFlags(b128, 128, kLocal, ARRAYLEN(kLocal), "", tio->c_lflag),
          tio->c_cc[VMIN], tio->c_cc[VTIME], CTRL(tio->c_cc[VINTR]),
          CTRL(tio->c_cc[VQUIT]));
 
diff --git a/libc/intrin/describethreadcreationflags.c b/libc/intrin/describethreadcreationflags.c
index d77faee10..1b0e75b62 100644
--- a/libc/intrin/describethreadcreationflags.c
+++ b/libc/intrin/describethreadcreationflags.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/processcreationflags.h"
 
 static const struct DescribeFlags kThreadCreationFlags[] = {
@@ -25,7 +25,7 @@ static const struct DescribeFlags kThreadCreationFlags[] = {
     {kNtStackSizeParamIsAReservation, "kNtStackSizeParamIsAReservation"},  //
 };
 
-const char *_DescribeThreadCreateFlags(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kThreadCreationFlags,
-                        ARRAYLEN(kThreadCreationFlags), "", x);
+const char *(DescribeThreadCreateFlags)(char buf[64], uint32_t x) {
+  return DescribeFlags(buf, 64, kThreadCreationFlags,
+                       ARRAYLEN(kThreadCreationFlags), "", x);
 }
diff --git a/libc/intrin/describetimespec.c b/libc/intrin/describetimespec.c
index 121c7f479..a07f0c992 100644
--- a/libc/intrin/describetimespec.c
+++ b/libc/intrin/describetimespec.c
@@ -22,7 +22,8 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/str/str.h"
 
-const char *_DescribeTimespec(char buf[45], int rc, const struct timespec *ts) {
+const char *(DescribeTimespec)(char buf[45], int rc,
+                               const struct timespec *ts) {
   if (rc == -1)
     return "n/a";
   if (!ts)
diff --git a/libc/intrin/describetimeval.c b/libc/intrin/describetimeval.c
index 10d0abb3d..896b10c3d 100644
--- a/libc/intrin/describetimeval.c
+++ b/libc/intrin/describetimeval.c
@@ -21,7 +21,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
 
-const char *_DescribeTimeval(char buf[45], int rc, const struct timeval *tv) {
+const char *(DescribeTimeval)(char buf[45], int rc, const struct timeval *tv) {
   if (!tv)
     return "NULL";
   if (rc == -1)
diff --git a/libc/intrin/describevirtualkeycode.c b/libc/intrin/describevirtualkeycode.c
index 8655ca081..2e4bc1b05 100644
--- a/libc/intrin/describevirtualkeycode.c
+++ b/libc/intrin/describevirtualkeycode.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/vk.h"
 
 // clang-format off
@@ -205,7 +205,7 @@ static const struct VirtualKeyCodeName {
 };
 // clang-format on
 
-const char *_DescribeVirtualKeyCode(char buf[32], uint32_t x) {
+const char *(DescribeVirtualKeyCode)(char buf[32], uint32_t x) {
   for (int i = 0; i < ARRAYLEN(kVirtualKeyCodeNames); ++i) {
     if (x == kVirtualKeyCodeNames[i].code) {
       return kVirtualKeyCodeNames[i].name;
diff --git a/libc/intrin/describewhence.c b/libc/intrin/describewhence.c
index 3c0820b7c..3d166fcbd 100644
--- a/libc/intrin/describewhence.c
+++ b/libc/intrin/describewhence.c
@@ -20,7 +20,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/describeflags.h"
 
-const char *_DescribeWhence(char buf[12], int whence) {
+const char *(DescribeWhence)(char buf[12], int whence) {
   if (whence == SEEK_SET)
     return "SEEK_SET";
   if (whence == SEEK_CUR)
diff --git a/libc/intrin/describewhichprio.c b/libc/intrin/describewhichprio.c
index 121e026f9..c72bf5eca 100644
--- a/libc/intrin/describewhichprio.c
+++ b/libc/intrin/describewhichprio.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/sysv/consts/prio.h"
 
-const char *_DescribeWhichPrio(char buf[12], int x) {
+const char *(DescribeWhichPrio)(char buf[12], int x) {
   if (x == PRIO_PROCESS)
     return "PRIO_PROCESS";
   if (x == PRIO_PGRP)
diff --git a/libc/intrin/describewinsize.c b/libc/intrin/describewinsize.c
index a84c15907..994ade424 100644
--- a/libc/intrin/describewinsize.c
+++ b/libc/intrin/describewinsize.c
@@ -22,13 +22,13 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #define N 64
 
 #define append(...) o += ksnprintf(buf + o, N - o, __VA_ARGS__)
 
-const char *_DescribeWinsize(char buf[N], int rc, const struct winsize *ws) {
+const char *(DescribeWinsize)(char buf[N], int rc, const struct winsize *ws) {
   int o = 0;
   if (!ws)
     return "NULL";
diff --git a/libc/intrin/directmap-metal.c b/libc/intrin/directmap-metal.c
index 30e377da9..77c0ee8b9 100644
--- a/libc/intrin/directmap-metal.c
+++ b/libc/intrin/directmap-metal.c
@@ -19,7 +19,8 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/metalfile.internal.h"
-#include "libc/macros.h"
+#include "libc/intrin/directmap.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/pc.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/prot.h"
@@ -31,11 +32,19 @@
 
 static uint64_t sys_mmap_metal_break;
 
-void *sys_mmap_metal(void *vaddr, size_t size, int prot, int flags, int fd,
-                     int64_t off) {
+static struct DirectMap bad_mmap(void) {
+  struct DirectMap res;
+  res.addr = (void *)-1;
+  res.maphandle = -1;
+  return res;
+}
+
+struct DirectMap sys_mmap_metal(void *vaddr, size_t size, int prot, int flags,
+                                int fd, int64_t off) {
   /* asan runtime depends on this function */
   size_t i;
   struct mman *mm;
+  struct DirectMap res;
   uint64_t addr, faddr = 0, page, e, *pte, *fdpte, *pml4t;
   mm = __get_mm();
   pml4t = __get_pml4t();
@@ -45,18 +54,18 @@ void *sys_mmap_metal(void *vaddr, size_t size, int prot, int flags, int fd,
     struct Fd *sfd;
     struct MetalFile *file;
     if (off < 0 || fd < 0 || fd >= g_fds.n)
-      return MAP_FAILED;
+      return bad_mmap();
     sfd = &g_fds.p[fd];
     if (sfd->kind != kFdFile)
-      return MAP_FAILED;
+      return bad_mmap();
     file = (struct MetalFile *)sfd->handle;
     /* TODO: allow mapping partial page at end of file, if file size not
      * multiple of page size */
     if (off > file->size || size > file->size - off)
-      return MAP_FAILED;
+      return bad_mmap();
     faddr = (uint64_t)file->base + off;
     if (faddr % 4096 != 0)
-      return MAP_FAILED;
+      return bad_mmap();
   }
   if (!(flags & MAP_FIXED_linux)) {
     if (!addr) {
@@ -79,7 +88,7 @@ void *sys_mmap_metal(void *vaddr, size_t size, int prot, int flags, int fd,
       if ((flags & MAP_ANONYMOUS_linux)) {
         page = __new_page(mm);
         if (!page)
-          return MAP_FAILED;
+          return bad_mmap();
         __clear_page(BANE + page);
         e = page | PAGE_RSRV | PAGE_U;
         if ((prot & PROT_WRITE))
@@ -105,7 +114,9 @@ void *sys_mmap_metal(void *vaddr, size_t size, int prot, int flags, int fd,
       break;
     }
   }
-  return (void *)addr;
+  res.addr = (void *)addr;
+  res.maphandle = -1;
+  return res;
 }
 
 #endif /* __x86_64__ */
diff --git a/libc/intrin/directmap-nt.c b/libc/intrin/directmap-nt.c
new file mode 100644
index 000000000..3cd19da78
--- /dev/null
+++ b/libc/intrin/directmap-nt.c
@@ -0,0 +1,122 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/calls/internal.h"
+#include "libc/calls/state.internal.h"
+#include "libc/errno.h"
+#include "libc/intrin/directmap.h"
+#include "libc/nt/enum/filemapflags.h"
+#include "libc/nt/enum/pageflags.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/runtime.h"
+#include "libc/nt/struct/processmemorycounters.h"
+#include "libc/nt/struct/securityattributes.h"
+#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/prot.h"
+
+textwindows struct DirectMap sys_mmap_nt(void *addr, size_t size, int prot,
+                                         int flags, int fd, int64_t off) {
+
+  int64_t handle;
+  if (flags & MAP_ANONYMOUS) {
+    handle = kNtInvalidHandleValue;
+  } else {
+    handle = g_fds.p[fd].handle;
+  }
+
+  // mark map handle as inheritable if fork might need it
+  const struct NtSecurityAttributes *mapsec;
+  if ((flags & MAP_TYPE) == MAP_SHARED) {
+    mapsec = &kNtIsInheritable;
+  } else {
+    mapsec = 0;
+  }
+
+  // nt will whine under many circumstances if we change the execute bit
+  // later using mprotect(). the workaround is to always request execute
+  // and then virtualprotect() it away until we actually need it. please
+  // note that open-nt.c always requests an kNtGenericExecute accessmask
+  int iscow = false;
+  struct ProtectNt fl;
+  if (handle != -1) {
+    if ((flags & MAP_TYPE) != MAP_SHARED) {
+      // windows has cow pages but they can't propagate across fork()
+      // that means we only get copy-on-write for the root process :(
+      fl = (struct ProtectNt){kNtPageExecuteWritecopy,
+                              kNtFileMapCopy | kNtFileMapExecute};
+      iscow = true;
+    } else {
+      if ((g_fds.p[fd].flags & O_ACCMODE) == O_RDONLY) {
+        fl = (struct ProtectNt){kNtPageExecuteRead,
+                                kNtFileMapRead | kNtFileMapExecute};
+      } else {
+        fl = (struct ProtectNt){kNtPageExecuteReadwrite,
+                                kNtFileMapWrite | kNtFileMapExecute};
+      }
+    }
+  } else {
+    unassert(flags & MAP_ANONYMOUS);
+    fl = (struct ProtectNt){kNtPageExecuteReadwrite,
+                            kNtFileMapWrite | kNtFileMapExecute};
+  }
+
+  int e = errno;
+  struct DirectMap dm;
+TryAgain:
+  if ((dm.maphandle = CreateFileMapping(handle, mapsec, fl.flags1,
+                                        (size + off) >> 32, (size + off), 0))) {
+    if ((dm.addr = MapViewOfFileEx(dm.maphandle, fl.flags2, off >> 32, off,
+                                   size, addr))) {
+      uint32_t oldprot;
+      if (VirtualProtect(dm.addr, size, __prot2nt(prot, iscow), &oldprot))
+        return dm;
+      UnmapViewOfFile(dm.addr);
+    }
+    CloseHandle(dm.maphandle);
+  } else if (!(prot & PROT_EXEC) &&              //
+             (fl.flags2 & kNtFileMapExecute) &&  //
+             GetLastError() == kNtErrorAccessDenied) {
+    // your file needs to have been O_CREAT'd with exec `mode` bits in
+    // order to be mapped with executable permission. we always try to
+    // get execute permission if the kernel will give it to us because
+    // win32 would otherwise forbid mprotect() from elevating later on
+    fl.flags2 &= ~kNtFileMapExecute;
+    switch (fl.flags1) {
+      case kNtPageExecuteWritecopy:
+        fl.flags1 = kNtPageWritecopy;
+        break;
+      case kNtPageExecuteReadwrite:
+        fl.flags1 = kNtPageReadwrite;
+        break;
+      case kNtPageExecuteRead:
+        fl.flags1 = kNtPageReadonly;
+        break;
+      default:
+        __builtin_unreachable();
+    }
+    errno = e;
+    goto TryAgain;
+  }
+
+  dm.maphandle = kNtInvalidHandleValue;
+  dm.addr = (void *)(intptr_t)-1;
+  return dm;
+}
diff --git a/test/libc/intrin/stack_test.c b/libc/intrin/directmap.c
similarity index 52%
rename from test/libc/intrin/stack_test.c
rename to libc/intrin/directmap.c
index e07a2d7fc..b0a40ff59 100644
--- a/test/libc/intrin/stack_test.c
+++ b/libc/intrin/directmap.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,60 +16,51 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/syscall-sysv.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
+#include "libc/intrin/describeflags.h"
+#include "libc/intrin/directmap.h"
+#include "libc/intrin/strace.h"
+#include "libc/nt/runtime.h"
+#include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/testlib/testlib.h"
+#include "libc/runtime/syslib.internal.h"
+#include "libc/sysv/errfuns.h"
 
-// returns true if byte at memory address is readable
-bool readable(void *addr) {
-  return testlib_pokememory(addr);
-}
-
-// returns true if page is reserved by linux memory manager
-// it can be true for addresses that aren't listed in /proc/PID/maps
-bool occupied(void *addr) {
-  int olde = errno;
-  char *want = (char *)((uintptr_t)addr & -__pagesize);
-  char *got =
-      __sys_mmap(want, __pagesize, PROT_READ | PROT_WRITE,
-                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0, 0);
-  if (got == MAP_FAILED) {
-    unassert(errno == IsFreebsd() ? EINVAL : EEXIST);
-    errno = olde;
-    return true;
+/**
+ * Obtains memory mapping directly from system.
+ *
+ * The mmap() function needs to track memory mappings in order to
+ * support Windows NT and Address Sanitizer. That memory tracking can be
+ * bypassed by calling this function. However the caller is responsible
+ * for passing the magic memory handle on Windows NT to CloseHandle().
+ *
+ * @asyncsignalsafe
+ */
+struct DirectMap sys_mmap(void *addr, size_t size, int prot, int flags, int fd,
+                          int64_t off) {
+  struct DirectMap d;
+  if ((__virtualsize += size) >= __virtualmax) {
+    d.maphandle = kNtInvalidHandleValue;
+    d.addr = (void *)enomem();
+  } else if (IsXnuSilicon()) {
+    long p = _sysret(__syslib->__mmap(addr, size, prot, flags, fd, off));
+    d.maphandle = kNtInvalidHandleValue;
+    d.addr = (void *)p;
+  } else if (!IsWindows() && !IsMetal()) {
+    d.addr = __sys_mmap(addr, size, prot, flags, fd, off, off);
+    d.maphandle = kNtInvalidHandleValue;
+  } else if (IsMetal()) {
+    d = sys_mmap_metal(addr, size, prot, flags, fd, off);
+  } else {
+    d = sys_mmap_nt(addr, size, prot, flags, fd, off);
   }
-  sys_munmap(got, __pagesize);
-  return got != want;
-}
-
-TEST(stack, test) {
-  if (IsWindows())
-    return;
-
-  void *vstackaddr;
-  size_t stacksize = 65536;
-  size_t guardsize = 4096;
-  unassert(!cosmo_stack_alloc(&stacksize, &guardsize, &vstackaddr));
-  char *stackaddr = vstackaddr;
-
-  /* check memory reservation */
-  unassert(occupied(stackaddr + stacksize - 1));  // top stack
-  unassert(occupied(stackaddr));                  // bot stack
-  unassert(occupied(stackaddr - 1));              // top guard
-  unassert(occupied(stackaddr - guardsize));      // bot guard
-
-  /* check memory accessibility */
-  unassert(readable(stackaddr + stacksize - 1));  // top stack
-  unassert(readable(stackaddr));                  // bot stack
-  unassert(!readable(stackaddr - 1));             // top guard
-  unassert(!readable(stackaddr - guardsize));     // bot guard
-
-  unassert(!cosmo_stack_free(stackaddr, stacksize, guardsize));
+  if (d.addr == MAP_FAILED)
+    __virtualsize -= size;
+  KERNTRACE("sys_mmap(%.12p, %'zu, %s, %s, %d, %'ld) → {%.12p, %p}% m", addr,
+            size, DescribeProtFlags(prot), DescribeMapFlags(flags), fd, off,
+            d.addr, d.maphandle);
+  return d;
 }
diff --git a/libc/intrin/directmap.h b/libc/intrin/directmap.h
index 389336a91..a3eefc30a 100644
--- a/libc/intrin/directmap.h
+++ b/libc/intrin/directmap.h
@@ -2,7 +2,19 @@
 #define COSMOPOLITAN_LIBC_INTRIN_DIRECTMAP_H_
 COSMOPOLITAN_C_START_
 
-void *sys_mmap_metal(void *, size_t, int, int, int, int64_t) libcesque;
+struct ProtectNt {
+  uint32_t flags1;
+  uint32_t flags2;
+};
+
+struct DirectMap {
+  void *addr;
+  int64_t maphandle;
+};
+
+struct DirectMap sys_mmap(void *, size_t, int, int, int, int64_t);
+struct DirectMap sys_mmap_nt(void *, size_t, int, int, int, int64_t);
+struct DirectMap sys_mmap_metal(void *, size_t, int, int, int, int64_t);
 int sys_munmap_metal(void *, size_t) libcesque;
 int __prot2nt(int, int) libcesque;
 
diff --git a/libc/intrin/dlopen.c b/libc/intrin/dlopen.c
deleted file mode 100644
index 3e93f8be3..000000000
--- a/libc/intrin/dlopen.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/thread/posixthread.internal.h"
-#include "libc/thread/thread.h"
-
-static pthread_mutex_t __dlopen_lock_obj = PTHREAD_MUTEX_INITIALIZER;
-
-void __dlopen_lock(void) {
-  _pthread_mutex_lock(&__dlopen_lock_obj);
-}
-
-void __dlopen_unlock(void) {
-  _pthread_mutex_unlock(&__dlopen_lock_obj);
-}
-
-void __dlopen_wipe(void) {
-  _pthread_mutex_wipe_np(&__dlopen_lock_obj);
-}
diff --git a/libc/intrin/dsohandle.S b/libc/intrin/dsohandle.S
index 37108e8a3..39cc3e989 100644
--- a/libc/intrin/dsohandle.S
+++ b/libc/intrin/dsohandle.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.underrun
 //	Uniquely identifies each artifact linked in an address space.
diff --git a/libc/intrin/clock_gettime_monotonic_nt.c b/libc/intrin/enable_threads.c
similarity index 84%
rename from libc/intrin/clock_gettime_monotonic_nt.c
rename to libc/intrin/enable_threads.c
index f1371d27d..18c27364f 100644
--- a/libc/intrin/clock_gettime_monotonic_nt.c
+++ b/libc/intrin/enable_threads.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,11 +16,9 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timespec.internal.h"
-#include "libc/nt/time.h"
+#include "libc/runtime/runtime.h"
+#include "libc/thread/tls.h"
 
-textwindows struct timespec sys_clock_gettime_monotonic_nt(void) {
-  uint64_t hectons;
-  QueryUnbiasedInterruptTimePrecise(&hectons);
-  return timespec_fromnanos(hectons * 100);
+void __enable_threads(void) {
+  __threaded = 1;
 }
diff --git a/libc/intrin/fds.c b/libc/intrin/fds.c
index 7fcfe983d..f70abfb92 100644
--- a/libc/intrin/fds.c
+++ b/libc/intrin/fds.c
@@ -16,14 +16,14 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/fds.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
 #include "libc/calls/ttydefaults.h"
 #include "libc/dce.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/extend.h"
-#include "libc/intrin/maps.h"
+#include "libc/intrin/fds.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/intrin/nomultics.h"
 #include "libc/intrin/pushpop.h"
 #include "libc/intrin/weaken.h"
@@ -32,9 +32,7 @@
 #include "libc/nt/enum/accessmask.h"
 #include "libc/nt/enum/creationdisposition.h"
 #include "libc/nt/enum/fileflagandattributes.h"
-#include "libc/nt/enum/filemapflags.h"
 #include "libc/nt/enum/filesharemode.h"
-#include "libc/nt/memory.h"
 #include "libc/nt/runtime.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/memtrack.internal.h"
@@ -42,24 +40,21 @@
 #include "libc/sock/sock.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/prot.h"
 #include "libc/thread/thread.h"
-#include "libc/thread/tls.h"
+
+#define OPEN_MAX 16
 
 #ifdef __x86_64__
 __static_yoink("_init_fds");
 #endif
 
 struct Fds g_fds;
+static struct Fd g_fds_static[OPEN_MAX];
 
 static bool TokAtoi(const char **str, long *res) {
   int c, d;
   unsigned long x = 0;
-  d = 1;
-  if (**str == '-') {
-    (*str)++;
-    d = -1;
-  }
+  d = **str == '-' ? -1 : 1;
   while ((c = *(*str)++)) {
     if (('0' <= c && c <= '9')) {
       x *= 10;
@@ -84,14 +79,19 @@ static textwindows void SetupWinStd(struct Fds *fds, int i, uint32_t x) {
 }
 
 textstartup void __init_fds(int argc, char **argv, char **envp) {
-
   struct Fds *fds;
   fds = &g_fds;
   fds->n = 4;
   atomic_store_explicit(&fds->f, 3, memory_order_relaxed);
-  fds->p = fds->e = (void *)kMemtrackFdsStart;
-  fds->e = _extend(fds->p, fds->n * sizeof(*fds->p), fds->e, MAP_PRIVATE,
-                   kMemtrackFdsStart + kMemtrackFdsSize);
+  if (_weaken(_extend)) {
+    fds->p = fds->e = (void *)kMemtrackFdsStart;
+    fds->e =
+        _weaken(_extend)(fds->p, fds->n * sizeof(*fds->p), fds->e, MAP_PRIVATE,
+                         kMemtrackFdsStart + kMemtrackFdsSize);
+  } else {
+    fds->p = g_fds_static;
+    fds->e = g_fds_static + OPEN_MAX;
+  }
 
   // inherit standard i/o file descriptors
   if (IsMetal()) {
@@ -121,12 +121,10 @@ textstartup void __init_fds(int argc, char **argv, char **envp) {
   // inherit file descriptors from cosmo parent process
   if (IsWindows()) {
     const char *fdspec;
-    if ((fdspec = getenv("_COSMO_FDS_V2"))) {
-      char *smaddr = 0;
+    if ((fdspec = getenv("_COSMO_FDS"))) {
       unsetenv("_COSMO_FDS");
-      unsetenv("_COSMO_FDS_V2");
       for (;;) {
-        long fd, kind, flags, mode, handle, shand, type, family, protocol;
+        long fd, kind, flags, mode, handle, pointer, type, family, protocol;
         if (!TokAtoi(&fdspec, &fd))
           break;
         if (!TokAtoi(&fdspec, &handle))
@@ -137,7 +135,7 @@ textstartup void __init_fds(int argc, char **argv, char **envp) {
           break;
         if (!TokAtoi(&fdspec, &mode))
           break;
-        if (!TokAtoi(&fdspec, &shand))
+        if (!TokAtoi(&fdspec, &pointer))
           break;
         if (!TokAtoi(&fdspec, &type))
           break;
@@ -145,54 +143,25 @@ textstartup void __init_fds(int argc, char **argv, char **envp) {
           break;
         if (!TokAtoi(&fdspec, &protocol))
           break;
-        __ensurefds_unlocked(fd);
+        if (_weaken(__ensurefds_unlocked))
+          _weaken(__ensurefds_unlocked)(fd);
         struct Fd *f = fds->p + fd;
         if (f->handle && f->handle != -1 && f->handle != handle) {
           CloseHandle(f->handle);
-          if (fd < 3)
+          if (fd < 3) {
             SetStdHandle(kNtStdio[fd], handle);
+          }
         }
         f->handle = handle;
         f->kind = kind;
         f->flags = flags;
         f->mode = mode;
+        f->pointer = pointer;
         f->type = type;
         f->family = family;
         f->protocol = protocol;
         atomic_store_explicit(&fds->f, fd + 1, memory_order_relaxed);
-
-        if (shand) {
-          struct Map *map;
-          struct CursorShared *shared;
-          if (!smaddr) {
-            smaddr = __maps_randaddr();
-          } else {
-            smaddr += 65536;
-          }
-          if ((shared = MapViewOfFileEx(shand, kNtFileMapWrite, 0, 0,
-                                        sizeof(struct CursorShared), smaddr))) {
-            if ((f->cursor = _mapanon(sizeof(struct Cursor)))) {
-              f->cursor->shared = shared;
-              if ((map = __maps_alloc())) {
-                map->addr = (char *)shared;
-                map->size = sizeof(struct CursorShared);
-                map->off = 0;
-                map->prot = PROT_READ | PROT_WRITE;
-                map->flags = MAP_SHARED | MAP_ANONYMOUS;
-                map->hand = shand;
-                __maps_lock();
-                __maps_insert(map);
-                __maps_unlock();
-              }
-            }
-          }
-        }
       }
     }
-    for (int i = 0; i < 3; ++i) {
-      struct Fd *f = fds->p + i;
-      if (f->kind == kFdFile && !f->cursor)
-        f->cursor = __cursor_new();
-    }
   }
 }
diff --git a/libc/intrin/fds.h b/libc/intrin/fds.h
index 2cfccc771..e3ff36fc5 100644
--- a/libc/intrin/fds.h
+++ b/libc/intrin/fds.h
@@ -1,7 +1,5 @@
 #ifndef COSMOPOLITAN_LIBC_CALLS_STRUCT_FD_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_CALLS_STRUCT_FD_INTERNAL_H_
-#include "libc/sock/struct/sockaddr.h"
-#include "libc/thread/thread.h"
 COSMOPOLITAN_C_START_
 
 #define kFdEmpty     0
@@ -10,35 +8,24 @@ COSMOPOLITAN_C_START_
 #define kFdConsole   4
 #define kFdSerial    5
 #define kFdZip       6
-#define kFdEpoll     7 /* epoll() deleted on 2024-09-01 */
+#define kFdEpoll     7
 #define kFdReserved  8
 #define kFdDevNull   9
 #define kFdDevRandom 10
 
-struct CursorShared {
-  pthread_mutex_t lock;
-  long pointer;
-};
-
-struct Cursor {
-  struct CursorShared *shared;
-  _Atomic(int) refs;
-};
-
 struct Fd {
   char kind;
   bool isbound;
-  char connecting;
   unsigned flags;
   unsigned mode;
   long handle;
+  long pointer;
   int family;
   int type;
   int protocol;
   unsigned rcvtimeo; /* millis; 0 means wait forever */
   unsigned sndtimeo; /* millis; 0 means wait forever */
   void *connect_op;
-  struct Cursor *cursor;
 };
 
 struct Fds {
@@ -47,11 +34,5 @@ struct Fds {
   struct Fd *p, *e;
 };
 
-struct Cursor *__cursor_new(void);
-void __cursor_ref(struct Cursor *);
-int __cursor_unref(struct Cursor *);
-void __cursor_lock(struct Cursor *);
-void __cursor_unlock(struct Cursor *);
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_FD_INTERNAL_H_ */
diff --git a/libc/intrin/fds_init.S b/libc/intrin/fds_init.S
index f86569b6f..d0fa0b96d 100644
--- a/libc/intrin/fds_init.S
+++ b/libc/intrin/fds_init.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.init.start 305,_init_fds
 	push	%rdi
diff --git a/libc/intrin/fds_lock.c b/libc/intrin/fds_lock.c
index 1e1ddcc32..c32367d85 100644
--- a/libc/intrin/fds_lock.c
+++ b/libc/intrin/fds_lock.c
@@ -17,13 +17,12 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/state.internal.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
 void __fds_lock(void) {
-  _pthread_mutex_lock(&__fds_lock_obj);
+  pthread_mutex_lock(&__fds_lock_obj);
 }
 
 void __fds_unlock(void) {
-  _pthread_mutex_unlock(&__fds_lock_obj);
+  pthread_mutex_unlock(&__fds_lock_obj);
 }
diff --git a/libc/intrin/fenv.S b/libc/intrin/fenv.S
index 697d19999..ae00d8684 100644
--- a/libc/intrin/fenv.S
+++ b/libc/intrin/fenv.S
@@ -25,7 +25,7 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Clears floating point exception status, e.g.
 //
diff --git a/libc/intrin/float16.c b/libc/intrin/float16.c
index 434f0cafd..476a2f6c9 100644
--- a/libc/intrin/float16.c
+++ b/libc/intrin/float16.c
@@ -21,135 +21,22 @@
  * @fileoverview fp16 compiler runtime
  */
 
-#define isnan16(x) (((x) & 0x7fff) > 0x7c00)
+#define asint(x) ((union pun){x}).i
+#define isnan(x) (((x) & 0x7fff) > 0x7c00)
 
-static inline _Float16 tofloat16(int x) {
-  union {
-    uint16_t i;
-    _Float16 f;
-  } u = {x};
-  return u.f;
-}
-
-static inline int fromfloat16(_Float16 x) {
-  union {
-    _Float16 f;
-    uint16_t i;
-  } u = {x};
-  return u.i;
-}
-
-static inline _Float32 tofloat32(uint32_t w) {
-  union {
-    uint32_t as_bits;
-    _Float32 as_value;
-  } fp32;
-  fp32.as_bits = w;
-  return fp32.as_value;
-}
-
-static inline uint32_t fromfloat32(_Float32 f) {
-  union {
-    _Float32 as_value;
-    uint32_t as_bits;
-  } fp32;
-  fp32.as_value = f;
-  return fp32.as_bits;
-}
-
-static inline _Float32 fabs32(_Float32 x) {
-  return tofloat32(fromfloat32(x) & 0x7fffffffu);
-}
+union pun {
+  _Float16 f;
+  unsigned short i;
+};
 
 int __eqhf2(_Float16 fx, _Float16 fy) {
-  int x = fromfloat16(fx);
-  int y = fromfloat16(fy);
-  return (x == y) & !isnan16(x) & !isnan16(y);
+  int x = asint(fx);
+  int y = asint(fy);
+  return (x == y) & !isnan(x) & !isnan(y);
 }
 
 int __nehf2(_Float16 fx, _Float16 fy) {
-  int x = fromfloat16(fx);
-  int y = fromfloat16(fy);
-  return (x != y) & !isnan16(x) & !isnan16(y);
+  int x = asint(fx);
+  int y = asint(fy);
+  return (x != y) & !isnan(x) & !isnan(y);
 }
-
-_Float32 __extendhfsf2(_Float16 f) {
-  uint16_t h = fromfloat16(f);
-  const uint32_t w = (uint32_t)h << 16;
-  const uint32_t sign = w & 0x80000000u;
-  const uint32_t two_w = w + w;
-  const uint32_t exp_offset = 0xE0u << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || \
-    defined(__GNUC__) && !defined(__STRICT_ANSI__)
-  const _Float32 exp_scale = 0x1.0p-112f;
-#else
-  const _Float32 exp_scale = tofloat32(0x7800000u);
-#endif
-  const _Float32 normalized_value =
-      tofloat32((two_w >> 4) + exp_offset) * exp_scale;
-  const uint32_t magic_mask = 126u << 23;
-  const _Float32 magic_bias = 0.5f;
-  const _Float32 denormalized_value =
-      tofloat32((two_w >> 17) | magic_mask) - magic_bias;
-  const uint32_t denormalized_cutoff = 1u << 27;
-  const uint32_t result =
-      sign | (two_w < denormalized_cutoff ? fromfloat32(denormalized_value)
-                                          : fromfloat32(normalized_value));
-  return tofloat32(result);
-}
-
-_Float64 __extendhfdf2(_Float16 f) {
-  return __extendhfsf2(f);
-}
-
-#ifdef __x86_64__
-__float80 __extendhfxf2(_Float16 f) {
-  return __extendhfsf2(f);
-}
-#endif
-
-#ifdef __aarch64__
-_Float128 __extendhftf2(_Float16 f) {
-  return __extendhfsf2(f);
-}
-#endif
-
-_Float16 __truncsfhf2(_Float32 f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || \
-    defined(__GNUC__) && !defined(__STRICT_ANSI__)
-  const _Float32 scale_to_inf = 0x1.0p+112f;
-  const _Float32 scale_to_zero = 0x1.0p-110f;
-#else
-  const _Float32 scale_to_inf = tofloat32(0x77800000u);
-  const _Float32 scale_to_zero = tofloat32(0x08800000u);
-#endif
-  _Float32 base = (fabs32(f) * scale_to_inf) * scale_to_zero;
-  const uint32_t w = fromfloat32(f);
-  const uint32_t shl1_w = w + w;
-  const uint32_t sign = w & 0x80000000u;
-  uint32_t bias = shl1_w & 0xFF000000u;
-  if (bias < 0x71000000u)
-    bias = 0x71000000u;
-  base = tofloat32((bias >> 1) + 0x07800000u) + base;
-  const uint32_t bits = fromfloat32(base);
-  const uint32_t exp_bits = (bits >> 13) & 0x00007C00u;
-  const uint32_t mantissa_bits = bits & 0x00000FFFu;
-  const uint32_t nonsign = exp_bits + mantissa_bits;
-  return tofloat16((sign >> 16) | (shl1_w > 0xFF000000u ? 0x7E00u : nonsign));
-}
-
-_Float16 __truncdfhf2(_Float64 f) {
-  return __truncsfhf2(f);
-}
-
-#ifdef __x86_64__
-_Float16 __truncxfhf2(__float80 f) {
-  return __truncsfhf2(f);
-}
-#endif
-
-#ifdef __aarch64__
-_Float16 __trunctfhf2(_Float128 f) {
-  return __truncsfhf2(f);
-}
-#endif
diff --git a/libc/intrin/flushers.c b/libc/intrin/flushers.c
new file mode 100644
index 000000000..9ef0e0576
--- /dev/null
+++ b/libc/intrin/flushers.c
@@ -0,0 +1,22 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/stdio/fflush.internal.h"
+
+pthread_mutex_t __fflush_lock_obj;
+struct StdioFlush __fflush;
diff --git a/libc/intrin/formathex64.c b/libc/intrin/formathex64.c
index e54a1b2c7..33ba78f8e 100644
--- a/libc/intrin/formathex64.c
+++ b/libc/intrin/formathex64.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/bsr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 static inline int PickGoodWidth(unsigned x, char z) {
   if (z) {
diff --git a/libc/intrin/cosmo_futex_thunk.S b/libc/intrin/futex.S
similarity index 95%
rename from libc/intrin/cosmo_futex_thunk.S
rename to libc/intrin/futex.S
index ad65cc106..73971e959 100644
--- a/libc/intrin/cosmo_futex_thunk.S
+++ b/libc/intrin/futex.S
@@ -17,19 +17,20 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/sysv/consts/nr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .privileged
 
-cosmo_futex_thunk:
-	beg
-	pro
+_futex:
 #ifdef __x86_64__
+	push	%rbp
+	mov	%rsp,%rbp
 	mov	%rcx,%r10
 	mov	__NR_futex,%eax
 	clc
 	syscall
 	jnc	1f
 	neg	%eax
+1:	pop	%rbp
 #elif defined(__aarch64__)
 	ldr	x7,=__hostos
 	ldr	w7,[x7]
@@ -45,7 +46,5 @@ cosmo_futex_thunk:
 #else
 #error "unsupported architecture"
 #endif /* __x86_64__ */
-1:	epi
-	ret
-	end
-	.endfn	cosmo_futex_thunk,globl,hidden
+1:	ret
+	.endfn	_futex,globl,hidden
diff --git a/libc/intrin/gcov.S b/libc/intrin/gcov.S
index 410e30da2..c32da3b85 100644
--- a/libc/intrin/gcov.S
+++ b/libc/intrin/gcov.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Magic words to unbreak build if GCOV flags are passed.
 
diff --git a/libc/intrin/getcpuidbrand.S b/libc/intrin/getcpuidbrand.S
index 255d34a48..0f4c397f4 100644
--- a/libc/intrin/getcpuidbrand.S
+++ b/libc/intrin/getcpuidbrand.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 GetCpuidBrand:
 	mov	%esi,%eax
diff --git a/libc/intrin/getfileattributes.c b/libc/intrin/getfileattributes.c
index 67cb6808e..976d0a2e3 100644
--- a/libc/intrin/getfileattributes.c
+++ b/libc/intrin/getfileattributes.c
@@ -19,7 +19,6 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
 #include "libc/nt/files.h"
-#include "libc/nt/runtime.h"
 #include "libc/nt/thunk/msabi.h"
 
 __msabi extern typeof(GetFileAttributes) *const __imp_GetFileAttributesW;
@@ -31,7 +30,7 @@ __msabi extern typeof(GetFileAttributes) *const __imp_GetFileAttributesW;
 textwindows uint32_t GetFileAttributes(const char16_t *lpPathName) {
   uint32_t flags;
   flags = __imp_GetFileAttributesW(lpPathName);
-  NTTRACE("GetFileAttributes(%#hs) → {%s, %d}", lpPathName,
-          DescribeNtFileFlagAttr(flags), GetLastError());
+  NTTRACE("GetFileAttributes(%#hs) → %s", lpPathName,
+          DescribeNtFileFlagAttr(flags));
   return flags;
 }
diff --git a/libc/intrin/getmainstack.c b/libc/intrin/getmainstack.c
index afcf18e5a..af6e901ba 100644
--- a/libc/intrin/getmainstack.c
+++ b/libc/intrin/getmainstack.c
@@ -17,13 +17,16 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/rlimit.h"
+#include "libc/calls/struct/rlimit.internal.h"
+#include "libc/dce.h"
 #include "libc/intrin/getauxval.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/intrin/maps.h"
-#include "libc/intrin/rlimit.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
-#include "libc/stdio/sysparam.h"
 #include "libc/sysv/consts/auxv.h"
+#include "libc/sysv/consts/rlim.h"
+#include "libc/sysv/consts/rlimit.h"
 
 // Hack for guessing boundaries of _start()'s stack
 //
@@ -88,9 +91,12 @@ static uintptr_t __get_main_top(int pagesz) {
 }
 
 static size_t __get_stack_size(int pagesz, uintptr_t start, uintptr_t top) {
-  size_t stacksz = __rlimit_stack_get().rlim_cur;
-  stacksz = MIN(stacksz, 1024ul * 1024 * 1024 * 1024);
-  return MAX(ROUNDDOWN(stacksz, pagesz), ROUNDUP(top - start, pagesz));
+  size_t size, max = 8 * 1024 * 1024;
+  struct rlimit rlim = {RLIM_INFINITY};
+  sys_getrlimit(RLIMIT_STACK, &rlim);
+  if ((size = rlim.rlim_cur) > max)
+    size = max;
+  return MAX(ROUNDUP(size, pagesz), ROUNDUP(top - start, pagesz));
 }
 
 /**
diff --git a/libc/intrin/getminsigstksz.c b/libc/intrin/getminsigstksz.c
index 9b746e279..cb87e441c 100644
--- a/libc/intrin/getminsigstksz.c
+++ b/libc/intrin/getminsigstksz.c
@@ -16,47 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/siginfo.h"
-#include "libc/calls/ucontext.h"
-#include "libc/dce.h"
 #include "libc/intrin/getauxval.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/auxv.h"
 #include "libc/sysv/consts/ss.h"
 
 long __get_minsigstksz(void) {
-  struct AuxiliaryValue av;
-  av = __getauxval(AT_MINSIGSTKSZ);
-  if (av.isfound) {
-    long res = av.value;
-    if (!IsLinux())
-      res += sizeof(struct ucontext) + sizeof(struct siginfo) + 128;
-    if (res < _MINSIGSTKSZ)
-      res = _MINSIGSTKSZ;
-    return res;
+  struct AuxiliaryValue x;
+  x = __getauxval(AT_MINSIGSTKSZ);
+  if (x.isfound) {
+    return MAX(_MINSIGSTKSZ, x.value);
   } else {
-    // _MINSIGSTKSZ takes these things into consideration:
-    //
-    // 1. The platform definition of MINSIGSTKSZ. This will probably be
-    //    enforced by the kernel when calling sys_sigaltstack(). On ARM
-    //    platforms this might be several kilobytes larger than x86. On
-    //    Linux they really want you to use AT_MINSIGSTKSZ instead. The
-    //    kernel should ideally set this to be the number of bytes that
-    //    get subtracted from the stack pointer when delivering signals
-    //    meaning that if you use this for a stack size your handler is
-    //    called successfully but if it uses the stack then it'll crash
-    //
-    // 2. Cosmo sigenter overhead. On non-Linux OSes the kernel calls a
-    //    trampoline in the libc runtime, which translates the platform
-    //    specific signal frame to the Linux memory layout. It means we
-    //    need to push ~1024 extra bytes on the stack to call a handler
-    //
-    // 3. Sanity testing. Assume we use sysconf(_SC_MINSIGSTKSZ) + 2048
-    //    as our stack size (see stackoverflow1_test.c). Then we should
-    //    have enough room to use kprintf() from our signal handler. If
-    //    that isn't the case, then this should be increased a bit more
-    //    noting that if 1024 is used then kprintf should print refusal
-    //
     return _MINSIGSTKSZ;
   }
 }
diff --git a/libc/intrin/getsafesize.greg.c b/libc/intrin/getsafesize.greg.c
index 5a6d9123b..83a772e8a 100644
--- a/libc/intrin/getsafesize.greg.c
+++ b/libc/intrin/getsafesize.greg.c
@@ -17,11 +17,12 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "ape/sections.internal.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/runtime/memtrack.internal.h"
-#include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 
 /**
  * Computes safer buffer size for alloca().
@@ -31,19 +32,18 @@
  * @return number of bytes to use for your buffer, or negative if the
  *     allocation would likely cause a stack overflow
  */
-privileged optimizesize long __get_safe_size(long want, long extraspace) {
+privileged long __get_safe_size(long want, long extraspace) {
   if (!__tls_enabled)
     return want;
   struct PosixThread *pt;
   struct CosmoTib *tib = __get_tls_privileged();
   long bottom, sp = GetStackPointer();
-  if (sp >= (long)tib->tib_sigstack_addr &&
-      sp < (long)tib->tib_sigstack_addr + tib->tib_sigstack_size) {
+  if ((char *)sp >= tib->tib_sigstack_addr &&
+      (char *)sp <= tib->tib_sigstack_addr + tib->tib_sigstack_size) {
     bottom = (long)tib->tib_sigstack_addr;
   } else if ((pt = (struct PosixThread *)tib->tib_pthread) &&
-             sp >= (long)pt->pt_attr.__stackaddr &&
-             sp < (long)pt->pt_attr.__stackaddr + pt->pt_attr.__stacksize) {
-    bottom = (long)pt->pt_attr.__stackaddr;
+             pt->pt_attr.__stacksize) {
+    bottom = (long)pt->pt_attr.__stackaddr + pt->pt_attr.__guardsize;
   } else {
     return want;
   }
diff --git a/libc/intrin/gettid.c b/libc/intrin/gettid.c
index 48c7c9e42..6c5b0c9de 100644
--- a/libc/intrin/gettid.c
+++ b/libc/intrin/gettid.c
@@ -39,7 +39,7 @@
 int gettid(void) {
   int tid;
   if (VERY_LIKELY(__tls_enabled && !__vforked)) {
-    tid = atomic_load_explicit(&__get_tls()->tib_ptid, memory_order_relaxed);
+    tid = atomic_load_explicit(&__get_tls()->tib_tid, memory_order_acquire);
     if (VERY_LIKELY(tid > 0))
       return tid;
   }
diff --git a/libc/intrin/interrupts.S b/libc/intrin/interrupts.S
index f1b4298c5..837973a18 100644
--- a/libc/intrin/interrupts.S
+++ b/libc/intrin/interrupts.S
@@ -26,7 +26,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/runtime/pc.internal.h"
 
diff --git a/libc/intrin/iscall.c b/libc/intrin/iscall.c
index 2cdd23de0..d97d446c1 100644
--- a/libc/intrin/iscall.c
+++ b/libc/intrin/iscall.c
@@ -20,7 +20,7 @@
 
 // returns true if `p` is preceded by x86 call instruction
 // this is actually impossible to do but we'll do our best
-privileged dontinstrument int __is_call(const unsigned char *p) {
+dontinstrument int __is_call(const unsigned char *p) {
   if (p[-5] == 0xe8)
     return 5;  // call Jvds
   if (p[-2] == 0xff && (p[-1] & 070) == 020)
diff --git a/libc/intrin/kclocknames.S b/libc/intrin/kclocknames.S
index fec200aca..9a4ba9d6a 100644
--- a/libc/intrin/kclocknames.S
+++ b/libc/intrin/kclocknames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kClockNames
@@ -32,13 +32,25 @@
 	.underrun
 kClockNames:
 	.e	CLOCK_REALTIME,"REALTIME"
+	.e	CLOCK_REALTIME_FAST,"REALTIME_FAST"		// order matters
+	.e	CLOCK_REALTIME_PRECISE,"REALTIME_PRECISE"	// order matters
 	.e	CLOCK_REALTIME_COARSE,"REALTIME_COARSE"		// order matters
 	.e	CLOCK_MONOTONIC,"MONOTONIC"
+	.e	CLOCK_MONOTONIC_FAST,"MONOTONIC_FAST"		// order matters
 	.e	CLOCK_MONOTONIC_RAW,"MONOTONIC_RAW"		// order matters
+	.e	CLOCK_MONOTONIC_PRECISE,"MONOTONIC_PRECISE"	// order matters
 	.e	CLOCK_MONOTONIC_COARSE,"MONOTONIC_COARSE"	// order matters
 	.e	CLOCK_PROCESS_CPUTIME_ID,"PROCESS_CPUTIME_ID"
 	.e	CLOCK_THREAD_CPUTIME_ID,"THREAD_CPUTIME_ID"
+	.e	CLOCK_TAI,"TAI"
+	.e	CLOCK_PROF,"PROF"
 	.e	CLOCK_BOOTTIME,"BOOTTIME"
+	.e	CLOCK_REALTIME_ALARM,"REALTIME_ALARM"
+	.e	CLOCK_BOOTTIME_ALARM,"BOOTTIME_ALARM"
+	.e	CLOCK_UPTIME,"UPTIME"
+	.e	CLOCK_UPTIME_FAST,"UPTIME_FAST"
+	.e	CLOCK_UPTIME_PRECISE,"UPTIME_PRECISE"
+	.e	CLOCK_SECOND,"SECOND"
 	.long	MAGNUM_TERMINATOR
 	.endobj	kClockNames,globl,hidden
 	.overrun
diff --git a/libc/intrin/kdos2errno.S b/libc/intrin/kdos2errno.S
index e753485c4..8d3e824a7 100644
--- a/libc/intrin/kdos2errno.S
+++ b/libc/intrin/kdos2errno.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	@fileoverview data structure for __dos2errno()
 //	@see	libc/sysv/dos2errno.sh for the numbers
diff --git a/libc/intrin/kerrnodocs.S b/libc/intrin/kerrnodocs.S
index e2947b79a..5bae4dd56 100644
--- a/libc/intrin/kerrnodocs.S
+++ b/libc/intrin/kerrnodocs.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kErrnoDocs
diff --git a/libc/intrin/kerrnonames.S b/libc/intrin/kerrnonames.S
index 078a60306..a79a52a13 100644
--- a/libc/intrin/kerrnonames.S
+++ b/libc/intrin/kerrnonames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e
 	.long	\e - kErrnoNames
diff --git a/libc/intrin/kfcntlcmds.S b/libc/intrin/kfcntlcmds.S
index 6de1f427e..6eb1db6e1 100644
--- a/libc/intrin/kfcntlcmds.S
+++ b/libc/intrin/kfcntlcmds.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kFcntlCmds
diff --git a/libc/intrin/kipoptnames.S b/libc/intrin/kipoptnames.S
index 1980cfac3..f2a2ede1f 100644
--- a/libc/intrin/kipoptnames.S
+++ b/libc/intrin/kipoptnames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kIpOptnames
@@ -32,18 +32,9 @@
 	.underrun
 kIpOptnames:
 	.e	IP_TOS,"TOS"			// int
-	.e	IP_TTL,"TTL"			// int
 	.e	IP_MTU,"MTU"			// int
+	.e	IP_TTL,"TTL"			// int
 	.e	IP_HDRINCL,"HDRINCL"		// bool32
-	.e	IP_OPTIONS,"OPTIONS"
-	.e	IP_RECVTTL,"RECVTTL"
-	.e	IP_ADD_MEMBERSHIP,"ADD_MEMBERSHIP"
-	.e	IP_DROP_MEMBERSHIP,"DROP_MEMBERSHIP"
-	.e	IP_MULTICAST_IF,"MULTICAST_IF"
-	.e	IP_MULTICAST_LOOP,"MULTICAST_LOOP"
-	.e	IP_MULTICAST_TTL,"MULTICAST_TTL"
-	.e	IP_PKTINFO,"PKTINFO"
-	.e	IP_RECVTOS,"RECVTOS"
 	.long	MAGNUM_TERMINATOR
 	.endobj	kIpOptnames,globl,hidden
 	.overrun
diff --git a/libc/intrin/kisdangerous.c b/libc/intrin/kisdangerous.c
deleted file mode 100644
index 2672eae0d..000000000
--- a/libc/intrin/kisdangerous.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/maps.h"
-#include "libc/runtime/runtime.h"
-
-privileged optimizesize bool32 kisdangerous(const void *addr) {
-  bool32 res = true;
-  __maps_lock();
-  if (__maps.maps) {
-    struct Map *map;
-    if ((map = __maps_floor(addr)))
-      if ((const char *)addr >= map->addr &&
-          (const char *)addr <
-              map->addr + ((map->size + __pagesize - 1) & -__pagesize))
-        res = false;
-  } else {
-    res = false;
-  }
-  __maps_unlock();
-  return res;
-}
diff --git a/libc/intrin/kmonthname.S b/libc/intrin/kmonthname.S
index bf47c7622..e9a4984d0 100644
--- a/libc/intrin/kmonthname.S
+++ b/libc/intrin/kmonthname.S
@@ -7,7 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	extern const char kMonthName[12][10];
 	.section .rodata,"a",@progbits
diff --git a/libc/intrin/kmonthnameshort.S b/libc/intrin/kmonthnameshort.S
index 573f1bc6a..4f1874086 100644
--- a/libc/intrin/kmonthnameshort.S
+++ b/libc/intrin/kmonthnameshort.S
@@ -7,7 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Type #1:
 //	  - Indexable C-String Array
diff --git a/libc/intrin/kopenflags.S b/libc/intrin/kopenflags.S
index 7927fbacd..bc6691990 100644
--- a/libc/intrin/kopenflags.S
+++ b/libc/intrin/kopenflags.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kOpenFlags
diff --git a/libc/intrin/kpollnames.S b/libc/intrin/kpollnames.S
index 76fbc4b00..21e0d4038 100644
--- a/libc/intrin/kpollnames.S
+++ b/libc/intrin/kpollnames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kPollNames
diff --git a/libc/intrin/kprintf.greg.c b/libc/intrin/kprintf.greg.c
index a303723c5..21eac26a6 100644
--- a/libc/intrin/kprintf.greg.c
+++ b/libc/intrin/kprintf.greg.c
@@ -40,11 +40,9 @@
 #include "libc/nt/enum/fileflagandattributes.h"
 #include "libc/nt/enum/filesharemode.h"
 #include "libc/nt/errors.h"
-#include "libc/nt/events.h"
 #include "libc/nt/files.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
-#include "libc/nt/struct/overlapped.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/memtrack.internal.h"
@@ -55,7 +53,7 @@
 #include "libc/stdckdint.h"
 #include "libc/stdio/sysparam.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/str/utf16.h"
 #include "libc/sysv/consts/at.h"
 #include "libc/sysv/consts/f.h"
@@ -65,11 +63,10 @@
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 #include "libc/vga/vga.internal.h"
 #include "libc/wctype.h"
 
-#define ABI privileged optimizesize
-
 #define STACK_ERROR "kprintf error: stack is about to overflow\n"
 
 #define KGETINT(x, va, t, s)                 \
@@ -116,13 +113,10 @@
   }
 
 // clang-format off
-__msabi extern typeof(CloseHandle) *const __imp_CloseHandle;
-__msabi extern typeof(CreateEvent) *const __imp_CreateEventW;
 __msabi extern typeof(CreateFile) *const __imp_CreateFileW;
 __msabi extern typeof(DuplicateHandle) *const __imp_DuplicateHandle;
 __msabi extern typeof(GetEnvironmentVariable) *const __imp_GetEnvironmentVariableW;
 __msabi extern typeof(GetLastError) *const __imp_GetLastError;
-__msabi extern typeof(GetOverlappedResult) *const __imp_GetOverlappedResult;
 __msabi extern typeof(GetStdHandle) *const __imp_GetStdHandle;
 __msabi extern typeof(SetLastError) *const __imp_SetLastError;
 __msabi extern typeof(WriteFile) *const __imp_WriteFile;
@@ -160,7 +154,23 @@ __funline bool kischarmisaligned(const char *p, signed char t) {
   return false;
 }
 
-ABI static void klogclose(long fd) {
+privileged bool32 kisdangerous(const void *addr) {
+  bool32 res = true;
+  __maps_lock();
+  if (__maps.maps) {
+    struct Map *map;
+    if ((map = __maps_floor(addr)))
+      if ((const char *)addr >= map->addr &&
+          (const char *)addr < map->addr + map->size)
+        res = false;
+  } else {
+    res = false;
+  }
+  __maps_unlock();
+  return res;
+}
+
+privileged static void klogclose(long fd) {
 #ifdef __x86_64__
   long ax = __NR_close;
   asm volatile("syscall"
@@ -177,7 +187,7 @@ ABI static void klogclose(long fd) {
 #endif
 }
 
-ABI static long klogfcntl(long fd, long cmd, long arg) {
+privileged static long klogfcntl(long fd, long cmd, long arg) {
 #ifdef __x86_64__
   char cf;
   long ax = __NR_fcntl;
@@ -209,7 +219,7 @@ ABI static long klogfcntl(long fd, long cmd, long arg) {
 #endif
 }
 
-ABI static long klogopen(const char *path) {
+privileged static long klogopen(const char *path) {
   long dirfd = AT_FDCWD;
   long flags = O_WRONLY | O_CREAT | O_APPEND;
   long mode = 0600;
@@ -248,7 +258,7 @@ ABI static long klogopen(const char *path) {
 }
 
 // returns log handle or -1 if logging shouldn't happen
-ABI long kloghandle(void) {
+privileged long kloghandle(void) {
   // kprintf() needs to own a file descriptor in case apps closes stderr
   // our close() and dup() implementations will trigger this initializer
   // to minimize a chance that the user accidentally closes their logger
@@ -273,7 +283,7 @@ ABI long kloghandle(void) {
         hand = __imp_CreateFileW(
             path, kNtFileAppendData,
             kNtFileShareRead | kNtFileShareWrite | kNtFileShareDelete, 0,
-            kNtOpenAlways, kNtFileAttributeNormal | kNtFileFlagOverlapped, 0);
+            kNtOpenAlways, kNtFileAttributeNormal, 0);
       } else {
         hand = -1;  // KPRINTF_LOG was empty string or too long
       }
@@ -327,7 +337,7 @@ ABI long kloghandle(void) {
 }
 
 #ifdef __x86_64__
-ABI void _klog_serial(const char *b, size_t n) {
+privileged void _klog_serial(const char *b, size_t n) {
   size_t i;
   uint16_t dx;
   unsigned char al;
@@ -347,28 +357,21 @@ ABI void _klog_serial(const char *b, size_t n) {
 }
 #endif /* __x86_64__ */
 
-ABI void klog(const char *b, size_t n) {
+privileged void klog(const char *b, size_t n) {
 #ifdef __x86_64__
+  int e;
   long h;
   uint32_t wrote;
   long rax, rdi, rsi, rdx;
-  if ((h = kloghandle()) == -1)
+  if ((h = kloghandle()) == -1) {
     return;
+  }
   if (IsWindows()) {
-    bool32 ok;
-    intptr_t ev;
-    int e = __imp_GetLastError();
-    if ((ev = __imp_CreateEventW(0, 0, 0, 0))) {
-      struct NtOverlapped overlap = {.hEvent = ev};
-      ok = !!__imp_WriteFile(h, b, n, 0, &overlap);
-      if (!ok && __imp_GetLastError() == kNtErrorIoPending)
-        ok = true;
-      ok &= !!__imp_GetOverlappedResult(h, &overlap, &wrote, true);
-      if (!ok)
-        __klog_handle = 0;
-      __imp_CloseHandle(ev);
+    e = __imp_GetLastError();
+    if (!__imp_WriteFile(h, b, n, &wrote, 0)) {
+      __imp_SetLastError(e);
+      __klog_handle = 0;
     }
-    __imp_SetLastError(e);
   } else if (IsMetal()) {
     if (_weaken(_klog_vga)) {
       _weaken(_klog_vga)(b, n);
@@ -404,14 +407,14 @@ ABI void klog(const char *b, size_t n) {
 #endif
 }
 
-ABI static size_t kformat(char *b, size_t n, const char *fmt, va_list va) {
+privileged static size_t kformat(char *b, size_t n, const char *fmt,
+                                 va_list va) {
   int si;
   wint_t t, u;
-  char *cxxbuf;
   const char *abet;
   signed char type;
   const char *s, *f;
-  int cxxbufsize = 0;
+  char cxxbuf[3000];
   struct CosmoTib *tib;
   unsigned long long x;
   unsigned i, j, m, rem, sign, hash, cols, prec;
@@ -561,7 +564,7 @@ ABI static size_t kformat(char *b, size_t n, const char *fmt, va_list va) {
           tib = __tls_enabled ? __get_tls_privileged() : 0;
           if (!(tib && (tib->tib_flags & TIB_FLAG_VFORKED))) {
             if (tib) {
-              x = atomic_load_explicit(&tib->tib_ptid, memory_order_relaxed);
+              x = atomic_load_explicit(&tib->tib_tid, memory_order_relaxed);
             } else {
               x = __pid;
             }
@@ -755,25 +758,13 @@ ABI static size_t kformat(char *b, size_t n, const char *fmt, va_list va) {
           x = va_arg(va, intptr_t);
           if (_weaken(__symtab) && *_weaken(__symtab) &&
               (idx = _weaken(__get_symbol)(0, x)) != -1) {
+            /* if (p + 1 <= e) */
+            /*   *p++ = '&'; */
             s = (*_weaken(__symtab))->name_base +
                 (*_weaken(__symtab))->names[idx];
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Walloca-larger-than="
-            // decipher c++ symbols if there's enough stack memory
-            // stack size requirement assumes max_depth's still 20
-            if (_weaken(__demangle) &&    //
-                _weaken(__is_mangled) &&  //
-                _weaken(__is_mangled)(s)) {
-              if (!cxxbufsize)
-                if ((cxxbufsize = __get_safe_size(8192, 8192)) >= 512) {
-                  cxxbuf = alloca(cxxbufsize);
-                  CheckLargeStackAllocation(cxxbuf, sizeof(cxxbufsize));
-                }
-              if (cxxbufsize >= 512)
-                if (_weaken(__demangle)(cxxbuf, s, cxxbufsize) != -1)
-                  s = cxxbuf;
-            }
-#pragma GCC pop_options
+            if (_weaken(__is_mangled) && _weaken(__is_mangled)(s) &&
+                _weaken(__demangle)(cxxbuf, s, sizeof(cxxbuf)) != -1)
+              s = cxxbuf;
             goto FormatString;
           }
           base = 4;
@@ -1029,7 +1020,7 @@ ABI static size_t kformat(char *b, size_t n, const char *fmt, va_list va) {
  * @asyncsignalsafe
  * @vforksafe
  */
-ABI size_t ksnprintf(char *b, size_t n, const char *fmt, ...) {
+privileged size_t ksnprintf(char *b, size_t n, const char *fmt, ...) {
   size_t m;
   va_list v;
   va_start(v, fmt);
@@ -1048,7 +1039,7 @@ ABI size_t ksnprintf(char *b, size_t n, const char *fmt, ...) {
  * @asyncsignalsafe
  * @vforksafe
  */
-ABI size_t kvsnprintf(char *b, size_t n, const char *fmt, va_list v) {
+privileged size_t kvsnprintf(char *b, size_t n, const char *fmt, va_list v) {
   return kformat(b, n, fmt, v);
 }
 
@@ -1059,10 +1050,10 @@ ABI size_t kvsnprintf(char *b, size_t n, const char *fmt, va_list v) {
  * @asyncsignalsafe
  * @vforksafe
  */
-ABI void kvprintf(const char *fmt, va_list v) {
+privileged void kvprintf(const char *fmt, va_list v) {
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-  long size = __get_safe_size(8192, 2048);
+  long size = __get_safe_size(8000, 8000);
   if (size < 80) {
     klog(STACK_ERROR, sizeof(STACK_ERROR) - 1);
     return;
@@ -1145,7 +1136,7 @@ ABI void kvprintf(const char *fmt, va_list v) {
  * @asyncsignalsafe
  * @vforksafe
  */
-ABI void kprintf(const char *fmt, ...) {
+privileged void kprintf(const char *fmt, ...) {
   // system call support runtime depends on this function
   // function tracing runtime depends on this function
   // asan runtime depends on this function
diff --git a/libc/intrin/krlimitnames.S b/libc/intrin/krlimitnames.S
index e7f0e788b..46c6f5dd1 100644
--- a/libc/intrin/krlimitnames.S
+++ b/libc/intrin/krlimitnames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kRlimitNames
diff --git a/libc/intrin/ksignalnames.S b/libc/intrin/ksignalnames.S
index 6dbc09c01..19ad59e9c 100644
--- a/libc/intrin/ksignalnames.S
+++ b/libc/intrin/ksignalnames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kSignalNames
diff --git a/libc/intrin/ksockoptnames.S b/libc/intrin/ksockoptnames.S
index 28c62f276..90d592788 100644
--- a/libc/intrin/ksockoptnames.S
+++ b/libc/intrin/ksockoptnames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kSockOptnames
@@ -47,7 +47,6 @@ kSockOptnames:
 	.e	SO_RCVLOWAT,"RCVLOWAT"			// int
 	.e	SO_SNDLOWAT,"SNDLOWAT"			// int
 	.e	SO_ERROR,"ERROR"			// int
-	.e	SO_OOBINLINE,"OOBINLINE"		// int
 	.long	MAGNUM_TERMINATOR
 	.endobj	kSockOptnames,globl,hidden
 	.overrun
diff --git a/libc/intrin/ktcpoptnames.S b/libc/intrin/ktcpoptnames.S
index 314c6b16b..d4ab2fdbe 100644
--- a/libc/intrin/ktcpoptnames.S
+++ b/libc/intrin/ktcpoptnames.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.macro	.e e s
 	.long	\e - kTcpOptnames
diff --git a/libc/intrin/kweekdayname.S b/libc/intrin/kweekdayname.S
index 835b8bfe9..0fa2d967c 100644
--- a/libc/intrin/kweekdayname.S
+++ b/libc/intrin/kweekdayname.S
@@ -7,7 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	extern const char kWeekdayName[7][10];
 	.section .rodata,"a",@progbits
diff --git a/libc/intrin/kweekdaynameshort.S b/libc/intrin/kweekdaynameshort.S
index 05886838f..14a37b1bd 100644
--- a/libc/intrin/kweekdaynameshort.S
+++ b/libc/intrin/kweekdaynameshort.S
@@ -7,7 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Type #1:
 //	  - Indexable C-String Array
diff --git a/libc/intrin/leaky.S b/libc/intrin/leaky.S
index 45b034ffd..b9ba43ed2 100644
--- a/libc/intrin/leaky.S
+++ b/libc/intrin/leaky.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Decentralized section for leaky functions.
 	.section .piro.relo.sort.leaky.1,"aw",@progbits
diff --git a/libc/intrin/localtime_lock.c b/libc/intrin/localtime_lock.c
deleted file mode 100644
index bbc0a04d1..000000000
--- a/libc/intrin/localtime_lock.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/thread/posixthread.internal.h"
-#include "third_party/tz/lock.h"
-
-static pthread_mutex_t __localtime_lock_obj = PTHREAD_MUTEX_INITIALIZER;
-
-void __localtime_lock(void) {
-  _pthread_mutex_lock(&__localtime_lock_obj);
-}
-
-void __localtime_unlock(void) {
-  _pthread_mutex_unlock(&__localtime_lock_obj);
-}
-
-void __localtime_wipe(void) {
-  _pthread_mutex_wipe_np(&__localtime_lock_obj);
-}
diff --git a/libc/intrin/lockless.h b/libc/intrin/lockless.h
deleted file mode 100644
index 7855f16c2..000000000
--- a/libc/intrin/lockless.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_INTRIN_LOCKLESS_H_
-#define COSMOPOLITAN_LIBC_INTRIN_LOCKLESS_H_
-#include "libc/atomic.h"
-#include "libc/intrin/atomic.h"
-COSMOPOLITAN_C_START_
-
-// lockless memory transactions
-//
-// - one writer
-// - many readers
-// - generation is monotonic
-// - even numbers mean memory is ready
-// - odd numbers mean memory is actively being changed
-// - always use acquire semantics inside your read transaction
-//
-// let's say you want to be able to atomically read and write to 128-bit
-// values, but you've only got a 64-bit system. if you expect that it'll
-// frequently written, then you should use a mutex. but if you expect it
-// to be frequently read and rarely written, then it's possible to do it
-// without a mutex; in fact you don't even need the x86 lock instruction
-// prefix; all that is required is a series of carefully ordered mov ops
-// which are designed to exploit the strong ordering of the architecture
-
-static inline unsigned lockless_write_begin(atomic_uint* genptr) {
-  unsigned gen = atomic_load_explicit(genptr, memory_order_acquire);
-  atomic_store_explicit(genptr, gen + 1, memory_order_release);
-  return gen;
-}
-
-static inline void lockless_write_end(atomic_uint* genptr, unsigned gen) {
-  atomic_store_explicit(genptr, gen + 2, memory_order_release);
-}
-
-static inline unsigned lockless_read_begin(atomic_uint* genptr) {
-  return atomic_load_explicit(genptr, memory_order_acquire);
-}
-
-static inline bool lockless_read_end(atomic_uint* genptr, unsigned* want) {
-  unsigned gen1 = *want;
-  unsigned gen2 = atomic_load_explicit(genptr, memory_order_acquire);
-  unsigned is_being_actively_changed = gen1 & 1;
-  unsigned we_lost_race_with_writers = gen1 ^ gen2;
-  if (!(is_being_actively_changed | we_lost_race_with_writers))
-    return true;
-  *want = gen2;
-  return false;
-}
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_INTRIN_LOCKLESS_H_ */
diff --git a/libc/intrin/macros.h b/libc/intrin/macros.h
new file mode 100644
index 000000000..38d6324bd
--- /dev/null
+++ b/libc/intrin/macros.h
@@ -0,0 +1,82 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_MACROS_H_
+#define COSMOPOLITAN_LIBC_INTRIN_MACROS_H_
+#include "libc/dce.h"
+#include "libc/nexgen32e/x86feature.h"
+
+#define INTRIN_COMMUTATIVE "%"
+#define INTRIN_NONCOMMUTATIVE
+
+#if defined(__x86_64__) && !defined(__STRICT_ANSI__)
+
+typedef char __intrin_xmm_t
+    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
+
+#define INTRIN_SSEVEX_X_X_X_(PURE, ISA, OP, FLAGS, A, B, C)                    \
+  do {                                                                         \
+    if (X86_HAVE(ISA)) {                                                       \
+      __intrin_xmm_t *Xmm0 = (void *)(A);                                      \
+      const __intrin_xmm_t *Xmm1 = (const __intrin_xmm_t *)(B);                \
+      const __intrin_xmm_t *Xmm2 = (const __intrin_xmm_t *)(C);                \
+      if (!X86_NEED(AVX)) {                                                    \
+        asm(OP "\t%1,%0" : "=x"(*Xmm0) : FLAGS "x"(*Xmm2), "0"(*Xmm1));        \
+      } else {                                                                 \
+        asm("v" OP "\t%2,%1,%0" : "=x"(*Xmm0) : FLAGS "x"(*Xmm1), "x"(*Xmm2)); \
+      }                                                                        \
+    } else {                                                                   \
+      PURE(A, B, C);                                                           \
+    }                                                                          \
+  } while (0)
+
+#define INTRIN_SSEVEX_X_X_I_(PURE, ISA, OP, A, B, I)                 \
+  do {                                                               \
+    if (X86_HAVE(ISA)) {                                             \
+      __intrin_xmm_t *Xmm0 = (void *)(A);                            \
+      const __intrin_xmm_t *Xmm1 = (const __intrin_xmm_t *)(B);      \
+      if (!X86_NEED(AVX)) {                                          \
+        asm(OP "\t%2,%1,%0" : "=x"(*Xmm0) : "x"(*Xmm1), "i"(I));     \
+      } else {                                                       \
+        asm("v" OP "\t%2,%1,%0" : "=x"(*Xmm0) : "x"(*Xmm1), "i"(I)); \
+      }                                                              \
+    } else {                                                         \
+      PURE(A, B, I);                                                 \
+    }                                                                \
+  } while (0)
+
+#define INTRIN_SSEVEX_X_X_(PURE, ISA, OP, A, B)                 \
+  do {                                                          \
+    if (X86_HAVE(ISA)) {                                        \
+      __intrin_xmm_t *Xmm0 = (void *)(A);                       \
+      const __intrin_xmm_t *Xmm1 = (const __intrin_xmm_t *)(B); \
+      if (!X86_NEED(AVX)) {                                     \
+        asm(OP "\t%1,%0" : "=x"(*Xmm0) : "0"(*Xmm1));           \
+      } else {                                                  \
+        asm("v" OP "\t%1,%0" : "=x"(*Xmm0) : "x"(*Xmm1));       \
+      }                                                         \
+    } else {                                                    \
+      PURE(A, B);                                               \
+    }                                                           \
+  } while (0)
+
+#define INTRIN_SSEVEX_X_I_(PURE, ISA, OP, A, B, I)                   \
+  do {                                                               \
+    if (!IsModeDbg() && X86_HAVE(ISA)) {                             \
+      __intrin_xmm_t *Xmm0 = (void *)(A);                            \
+      const __intrin_xmm_t *Xmm1 = (const __intrin_xmm_t *)(B);      \
+      if (!X86_NEED(AVX)) {                                          \
+        asm(OP "\t%1,%0" : "=x"(*Xmm0) : "i"(I), "0"(*Xmm1));        \
+      } else {                                                       \
+        asm("v" OP "\t%2,%1,%0" : "=x"(*Xmm0) : "x"(*Xmm1), "i"(I)); \
+      }                                                              \
+    } else {                                                         \
+      PURE(A, B, I);                                                 \
+    }                                                                \
+  } while (0)
+
+#else
+#define INTRIN_SSEVEX_X_X_X_(PURE, ISA, OP, FLAGS, A, B, C) PURE(A, B, C)
+#define INTRIN_SSEVEX_X_X_I_(PURE, ISA, OP, A, B, I)        PURE(A, B, I)
+#define INTRIN_SSEVEX_X_I_(PURE, ISA, OP, A, B, I)          PURE(A, B, I)
+#define INTRIN_SSEVEX_X_X_(PURE, ISA, OP, A, B)             PURE(A, B)
+#endif /* X86 && !ANSI */
+
+#endif /* COSMOPOLITAN_LIBC_INTRIN_MACROS_H_ */
diff --git a/libc/intrin/maps.c b/libc/intrin/maps.c
index 7f74960e1..3d042e5d2 100644
--- a/libc/intrin/maps.c
+++ b/libc/intrin/maps.c
@@ -18,28 +18,18 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/maps.h"
 #include "ape/sections.internal.h"
-#include "libc/calls/state.internal.h"
-#include "libc/calls/syscall-sysv.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
-#include "libc/intrin/describebacktrace.h"
 #include "libc/intrin/dll.h"
 #include "libc/intrin/maps.h"
-#include "libc/macros.h"
-#include "libc/nexgen32e/rdtsc.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
-#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/auxv.h"
 #include "libc/sysv/consts/prot.h"
-#include "libc/thread/lock.h"
-#include "libc/thread/tls.h"
 
 #ifdef __x86_64__
 __static_yoink("_init_maps");
 #endif
 
-#define ABI privileged optimizespeed
-
 struct Maps __maps;
 
 void __maps_add(struct Map *map) {
@@ -57,136 +47,75 @@ void __maps_stack(char *stackaddr, int pagesz, int guardsize, size_t stacksize,
   __maps.stack.addr = stackaddr + guardsize;
   __maps.stack.size = stacksize - guardsize;
   __maps.stack.prot = stackprot;
-  __maps.stack.hand = MAPS_SUBREGION;
-  __maps.stack.flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  __maps.stack.hand = -1;
   __maps_adder(&__maps.stack, pagesz);
   if (guardsize) {
     __maps.guard.addr = stackaddr;
     __maps.guard.size = guardsize;
-    __maps.guard.prot = PROT_NONE | PROT_GUARD;
+    __maps.guard.prot = PROT_NONE;
     __maps.guard.hand = stackhand;
-    __maps.guard.flags = MAP_PRIVATE | MAP_ANONYMOUS;
     __maps_adder(&__maps.guard, pagesz);
-  } else {
-    __maps.stack.hand = stackhand;
   }
 }
 
 void __maps_init(void) {
   int pagesz = __pagesize;
 
-  // initialize lemur64
-  __maps.rand = 2131259787901769494;
-  __maps.rand ^= kStartTsc;
-
-  // these static map objects avoid mandatory mmap() in __maps_alloc()
-  // they aren't actually needed for bootstrapping this memory manager
-  for (int i = 0; i < ARRAYLEN(__maps.spool); ++i)
-    __maps_free(&__maps.spool[i]);
-
   // record _start() stack mapping
   if (!IsWindows()) {
-
-    // linux v4.12+ reserves 1mb of guard space beneath rlimit_stack
-    // https://lwn.net/Articles/725832/. if we guess too small, then
-    // slackmap will create a bunch of zombie stacks in __print_maps
-    // to coverup the undisclosed memory but no cost if we guess big
-    size_t guardsize = 1024 * 1024;
-    guardsize += __pagesize - 1;
-    guardsize &= -__pagesize;
-
-    // track the main stack region that the os gave to start earlier
-    struct AddrSize stack = __get_main_stack();
-    __maps_stack(stack.addr - guardsize, pagesz, guardsize,
-                 guardsize + stack.size, (uintptr_t)ape_stack_prot, 0);
+    struct AddrSize stack;
+    stack = __get_main_stack();
+    __maps_stack(stack.addr, pagesz, 0, stack.size, (uintptr_t)ape_stack_prot,
+                 0);
   }
 
   // record .text and .data mappings
-  __maps_track((char *)__executable_start, _etext - __executable_start,
-               PROT_READ | PROT_EXEC, MAP_NOFORK);
+  static struct Map text, data;
+  text.addr = (char *)__executable_start;
+  text.size = _etext - __executable_start;
+  text.prot = PROT_READ | PROT_EXEC;
   uintptr_t ds = ((uintptr_t)_etext + pagesz - 1) & -pagesz;
-  if (ds < (uintptr_t)_end)
-    __maps_track((char *)ds, (uintptr_t)_end - ds, PROT_READ | PROT_WRITE,
-                 MAP_NOFORK);
+  if (ds < (uintptr_t)_end) {
+    data.addr = (char *)ds;
+    data.size = (uintptr_t)_end - ds;
+    data.prot = PROT_READ | PROT_WRITE;
+    __maps_adder(&data, pagesz);
+  }
+  __maps_adder(&text, pagesz);
 }
 
-bool __maps_held(void) {
-  return !__tls_enabled || (__get_tls()->tib_flags & TIB_FLAG_VFORKED) ||
-         MUTEX_OWNER(
-             atomic_load_explicit(&__maps.lock.word, memory_order_relaxed)) ==
-             atomic_load_explicit(&__get_tls()->tib_ptid, memory_order_relaxed);
+privileged bool __maps_lock(void) {
+  struct CosmoTib *tib;
+  if (!__tls_enabled)
+    return false;
+  tib = __get_tls_privileged();
+  if (atomic_fetch_add_explicit(&tib->tib_relock_maps, 1, memory_order_relaxed))
+    return true;
+  int backoff = 0;
+  while (atomic_exchange_explicit(&__maps.lock, 1, memory_order_acquire)) {
+    if (backoff < 7) {
+      volatile int i;
+      for (i = 0; i != 1 << backoff; i++) {
+      }
+      backoff++;
+    } else {
+      // STRACE("pthread_delay_np(__maps)");
+#if defined(__GNUC__) && defined(__aarch64__)
+      __asm__ volatile("yield");
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+      __asm__ volatile("pause");
+#endif
+    }
+  }
+  return false;
 }
 
-bool __maps_reentrant(void) {
-  return __tls_enabled && !(__get_tls()->tib_flags & TIB_FLAG_VFORKED) &&
-         MUTEX_OWNER(
-             atomic_load_explicit(&__maps.lock.word, memory_order_relaxed)) ==
-             atomic_load_explicit(&__get_tls()->tib_ptid, memory_order_relaxed);
-}
-
-ABI void __maps_lock(void) {
-  int me;
-  uint64_t word, lock;
+privileged void __maps_unlock(void) {
   struct CosmoTib *tib;
   if (!__tls_enabled)
     return;
-  if (!(tib = __get_tls_privileged()))
-    return;
-  if (tib->tib_flags & TIB_FLAG_VFORKED)
-    return;
-  me = atomic_load_explicit(&tib->tib_ptid, memory_order_relaxed);
-  word = 0;
-  lock = MUTEX_LOCK(word);
-  lock = MUTEX_SET_OWNER(lock, me);
-  if (atomic_compare_exchange_strong_explicit(&__maps.lock.word, &word, lock,
-                                              memory_order_acquire,
-                                              memory_order_relaxed))
-    return;
-  word = atomic_load_explicit(&__maps.lock.word, memory_order_relaxed);
-  for (;;) {
-    if (MUTEX_OWNER(word) == me) {
-      if (atomic_compare_exchange_weak_explicit(
-              &__maps.lock.word, &word, MUTEX_INC_DEPTH(word),
-              memory_order_relaxed, memory_order_relaxed))
-        return;
-      continue;
-    }
-    word = 0;
-    lock = MUTEX_LOCK(word);
-    lock = MUTEX_SET_OWNER(lock, me);
-    if (atomic_compare_exchange_weak_explicit(&__maps.lock.word, &word, lock,
-                                              memory_order_acquire,
-                                              memory_order_relaxed))
-      return;
-    for (;;) {
-      word = atomic_load_explicit(&__maps.lock.word, memory_order_relaxed);
-      if (MUTEX_OWNER(word) == me)
-        break;
-      if (!word)
-        break;
-    }
-  }
-}
-
-ABI void __maps_unlock(void) {
-  uint64_t word;
-  struct CosmoTib *tib;
-  if (!__tls_enabled)
-    return;
-  if (!(tib = __get_tls_privileged()))
-    return;
-  if (tib->tib_flags & TIB_FLAG_VFORKED)
-    return;
-  word = atomic_load_explicit(&__maps.lock.word, memory_order_relaxed);
-  for (;;) {
-    if (MUTEX_DEPTH(word))
-      if (atomic_compare_exchange_weak_explicit(
-              &__maps.lock.word, &word, MUTEX_DEC_DEPTH(word),
-              memory_order_relaxed, memory_order_relaxed))
-        break;
-    if (atomic_compare_exchange_weak_explicit(&__maps.lock.word, &word, 0,
-                                              memory_order_release,
-                                              memory_order_relaxed))
-      break;
-  }
+  tib = __get_tls_privileged();
+  if (atomic_fetch_sub_explicit(&tib->tib_relock_maps, 1,
+                                memory_order_relaxed) == 1)
+    atomic_store_explicit(&__maps.lock, 0, memory_order_release);
 }
diff --git a/libc/intrin/maps.h b/libc/intrin/maps.h
index 86b1f2f55..3a30c752a 100644
--- a/libc/intrin/maps.h
+++ b/libc/intrin/maps.h
@@ -3,29 +3,10 @@
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/tree.h"
 #include "libc/runtime/runtime.h"
+#include "libc/thread/tls2.internal.h"
 COSMOPOLITAN_C_START_
 
-/* size of dynamic memory that is used internally by your memory manager */
-#define MAPS_SIZE 65536
-
-/* when map->hand is MAPS_RESERVATION it means mmap() is transactionally
-   reserving address space it is in the process of requesting from win32 */
-#define MAPS_RESERVATION -2
-
-/* when map->hand is MAPS_SUBREGION it means that an allocation has been
-   broken into multiple fragments by mprotect(). the first fragment must
-   be set to MAPS_VIRTUAL or your CreateFileMapping() handle. your frags
-   must be perfectly contiguous in memory and should have the same flags */
-#define MAPS_SUBREGION -3
-
-/* indicates an allocation was created by VirtualAlloc() and so munmap()
-   must call VirtualFree() when destroying it. use it on the hand field. */
-#define MAPS_VIRTUAL -4
-
-/* if this is used on MAP_PRIVATE memory, then it's assumed to be memory
-   that win32 allocated, e.g. a CreateThread() stack. if this is used on
-   MAP_FILE memory, then it's assumed to be part of the executable image */
-#define MAP_NOFORK 0x10000000
+#define MAPS_RETRY ((void *)-1)
 
 #define MAP_TREE_CONTAINER(e) TREE_CONTAINER(struct Map, tree, e)
 
@@ -33,8 +14,8 @@ struct Map {
   char *addr;        /* granule aligned */
   size_t size;       /* must be nonzero */
   int64_t off;       /* ignore for anon */
+  int prot;          /* memory protects */
   int flags;         /* memory map flag */
-  short prot;        /* memory protects */
   bool iscow;        /* windows nt only */
   bool readonlyfile; /* windows nt only */
   unsigned visited;  /* checks and fork */
@@ -45,31 +26,15 @@ struct Map {
   };
 };
 
-struct MapLock {
-  void *edges;
-  _Atomic(uint64_t) word;
-};
-
-struct MapSlab {
-  struct MapSlab *next;
-  struct Map maps[(MAPS_SIZE - sizeof(struct MapSlab *)) / sizeof(struct Map)];
-};
-
 struct Maps {
-  uint128_t rand;
+  atomic_int lock;
   struct Tree *maps;
-  struct MapLock lock;
-  _Atomic(uintptr_t) freed;
-  _Atomic(struct MapSlab *) slabs;
+  _Atomic(struct Map *) freed;
   size_t count;
   size_t pages;
+  _Atomic(char *) pick;
   struct Map stack;
   struct Map guard;
-#ifdef MODE_DBG
-  struct Map spool[1];
-#else
-  struct Map spool[20];
-#endif
 };
 
 struct AddrSize {
@@ -79,18 +44,14 @@ struct AddrSize {
 
 extern struct Maps __maps;
 
-bool __maps_held(void);
 void __maps_init(void);
-void __maps_lock(void);
+bool __maps_lock(void);
 void __maps_check(void);
 void __maps_unlock(void);
-bool __maps_reentrant(void);
 void *__maps_randaddr(void);
+void *__maps_pickaddr(size_t);
 void __maps_add(struct Map *);
 void __maps_free(struct Map *);
-void __maps_insert(struct Map *);
-int __maps_untrack(char *, size_t);
-bool __maps_track(char *, size_t, int, int);
 struct Map *__maps_alloc(void);
 struct Map *__maps_floor(const char *);
 void __maps_stack(char *, int, int, size_t, int, intptr_t);
@@ -111,13 +72,6 @@ static inline struct Map *__maps_next(struct Map *map) {
   return 0;
 }
 
-static inline struct Map *__maps_prev(struct Map *map) {
-  struct Tree *node;
-  if ((node = tree_prev(&map->tree)))
-    return MAP_TREE_CONTAINER(node);
-  return 0;
-}
-
 static inline struct Map *__maps_first(void) {
   struct Tree *node;
   if ((node = tree_first(__maps.maps)))
@@ -125,16 +79,5 @@ static inline struct Map *__maps_first(void) {
   return 0;
 }
 
-static inline struct Map *__maps_last(void) {
-  struct Tree *node;
-  if ((node = tree_last(__maps.maps)))
-    return MAP_TREE_CONTAINER(node);
-  return 0;
-}
-
-static inline bool __maps_isalloc(struct Map *map) {
-  return map->hand != MAPS_SUBREGION;
-}
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_MAPS_H_ */
diff --git a/libc/intrin/maps_init.S b/libc/intrin/maps_init.S
index 134ed77f7..56f2c7f63 100644
--- a/libc/intrin/maps_init.S
+++ b/libc/intrin/maps_init.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.init.start 301,_init_maps
 	push	%rdi
diff --git a/libc/intrin/memchr.c b/libc/intrin/memchr.c
index fbe1ad409..6680c5292 100644
--- a/libc/intrin/memchr.c
+++ b/libc/intrin/memchr.c
@@ -19,10 +19,10 @@
 #include "libc/dce.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/immintrin.internal.h"
 #ifndef __aarch64__
 
+typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
+
 static inline const unsigned char *memchr_pure(const unsigned char *s,
                                                unsigned char c, size_t n) {
   size_t i;
@@ -35,27 +35,22 @@ static inline const unsigned char *memchr_pure(const unsigned char *s,
 }
 
 #if defined(__x86_64__) && !defined(__chibicc__)
-static const char *memchr_sse(const char *s, char c, size_t n) {
-  const char *e = s + n;
-  __m128i t = _mm_set1_epi8(c);
-  unsigned m, k = (uintptr_t)s & 15;
-  m = _mm_movemask_epi8(
-      _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)((uintptr_t)s & -16)), t));
-  m >>= k;
-  if (m) {
-    s += __builtin_ctz(m);
-    if (s < e)
-      return s;
-    return 0;
-  }
-  for (s += 16 - k; s < e; s += 16) {
-    m = _mm_movemask_epi8(
-        _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)s), t));
+static __vex const unsigned char *memchr_sse(const unsigned char *s,
+                                             unsigned char c, size_t n) {
+  size_t i;
+  unsigned m;
+  xmm_t v, t = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
+  for (; n >= 16; n -= 16, s += 16) {
+    v = *(const xmm_t *)s;
+    m = __builtin_ia32_pmovmskb128(v == t);
     if (m) {
-      s += __builtin_ctz(m);
-      if (s < e)
-        return s;
-      return 0;
+      m = __builtin_ctzll(m);
+      return s + m;
+    }
+  }
+  for (i = 0; i < n; ++i) {
+    if (s[i] == c) {
+      return s + i;
     }
   }
   return 0;
diff --git a/libc/intrin/mman.greg.c b/libc/intrin/mman.greg.c
index 07059f919..8d7f17449 100644
--- a/libc/intrin/mman.greg.c
+++ b/libc/intrin/mman.greg.c
@@ -36,7 +36,7 @@
 #include "libc/assert.h"
 #include "libc/elf/def.h"
 #include "libc/elf/struct/phdr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/uart.internal.h"
 #include "libc/runtime/e820.internal.h"
 #include "libc/runtime/metalprintf.internal.h"
diff --git a/libc/intrin/mmap.c b/libc/intrin/mmap.c
index de3b5571a..65cccc769 100644
--- a/libc/intrin/mmap.c
+++ b/libc/intrin/mmap.c
@@ -18,9 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
-#include "libc/calls/state.internal.h"
 #include "libc/calls/syscall-sysv.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
@@ -32,42 +30,25 @@
 #include "libc/intrin/strace.h"
 #include "libc/intrin/tree.h"
 #include "libc/intrin/weaken.h"
-#include "libc/limits.h"
-#include "libc/macros.h"
-#include "libc/nt/enum/filemapflags.h"
-#include "libc/nt/enum/memflags.h"
-#include "libc/nt/enum/pageflags.h"
-#include "libc/nt/errors.h"
 #include "libc/nt/memory.h"
 #include "libc/nt/runtime.h"
 #include "libc/runtime/runtime.h"
-#include "libc/runtime/syslib.internal.h"
 #include "libc/runtime/zipos.internal.h"
-#include "libc/stdckdint.h"
+#include "libc/stdio/rand.h"
 #include "libc/stdio/sysparam.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/mremap.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/lock.h"
-#include "libc/thread/thread.h"
-#include "libc/thread/tls.h"
 
-#define MMDEBUG  0
-#define MAX_SIZE 0x0ff800000000ul
+#define MMDEBUG   IsModeDbg()
+#define MAX_SIZE  0x0ff800000000ul
+#define MAX_TRIES 10
 
 #define MAP_FIXED_NOREPLACE_linux 0x100000
 
-#define PGUP(x) (((x) + __pagesize - 1) & -__pagesize)
-#define GRUP(x) (((x) + __gransize - 1) & -__gransize)
-
-#define MASQUE    0x00fffffffffffff8
-#define PTR(x)    ((uintptr_t)(x) & MASQUE)
-#define TAG(x)    ROL((uintptr_t)(x) & ~MASQUE, 8)
-#define ABA(p, t) ((uintptr_t)(p) | (ROR((uintptr_t)(t), 8) & ~MASQUE))
-#define ROL(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
-#define ROR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
+#define PGUP(x) (((x) + pagesz - 1) & -pagesz)
 
 #if !MMDEBUG
 #define ASSERT(x) (void)0
@@ -78,18 +59,13 @@
       char bt[160];                                                       \
       struct StackFrame *bp = __builtin_frame_address(0);                 \
       kprintf("%!s:%d: assertion failed: %!s\n", __FILE__, __LINE__, #x); \
-      kprintf("bt %!s\n", _DescribeBacktrace(bt, bp));                    \
+      kprintf("bt %!s\n", (DescribeBacktrace)(bt, bp));                   \
       __print_maps(0);                                                    \
       __builtin_trap();                                                   \
     }                                                                     \
   } while (0)
 #endif
 
-struct DirectMap {
-  void *addr;
-  int64_t hand;
-};
-
 int __maps_compare(const struct Tree *ra, const struct Tree *rb) {
   const struct Map *a = (const struct Map *)MAP_TREE_CONTAINER(ra);
   const struct Map *b = (const struct Map *)MAP_TREE_CONTAINER(rb);
@@ -103,50 +79,20 @@ privileged optimizespeed struct Map *__maps_floor(const char *addr) {
   return 0;
 }
 
-static bool __maps_overlaps(const char *addr, size_t size) {
-  struct Map *map;
-  ASSERT(__maps_held());
-  if (!(map = __maps_floor(addr)))
-    map = __maps_first();
-  for (; map && map->addr <= addr + size; map = __maps_next(map))
+static bool __maps_overlaps(const char *addr, size_t size, int pagesz) {
+  struct Map *map, *floor = __maps_floor(addr);
+  for (map = floor; map && map->addr <= addr + size; map = __maps_next(map))
     if (MAX(addr, map->addr) <
         MIN(addr + PGUP(size), map->addr + PGUP(map->size)))
       return true;
   return false;
 }
 
-// returns true if all fragments of all allocations which overlap
-// [addr,addr+size) are completely contained by [addr,addr+size).
-textwindows static bool __maps_envelops(const char *addr, size_t size) {
-  struct Map *map;
-  size = PGUP(size);
-  ASSERT(__maps_held());
-  if (!(map = __maps_floor(addr)))
-    map = __maps_first();
-  while (map && map->addr <= addr + size) {
-    if (MAX(addr, map->addr) < MIN(addr + size, map->addr + map->size)) {
-      if (!__maps_isalloc(map))
-        return false;  // didn't include first fragment of alloc
-      if (addr > map->addr)
-        return false;    // excluded leading pages of first fragment
-      struct Map *next;  // set map to last fragment in allocation
-      for (; (next = __maps_next(map)) && !__maps_isalloc(next); map = next)
-        ASSERT(map->addr + map->size == next->addr);  // contiguous
-      if (addr + size < map->addr + PGUP(map->size))
-        return false;  // excluded trailing pages of allocation
-      map = next;
-    } else {
-      map = __maps_next(map);
-    }
-  }
-  return true;
-}
-
 void __maps_check(void) {
 #if MMDEBUG
-  ASSERT(__maps_held());
   size_t maps = 0;
   size_t pages = 0;
+  int pagesz = __pagesize;
   static unsigned mono;
   unsigned id = ++mono;
   for (struct Map *map = __maps_first(); map; map = __maps_next(map)) {
@@ -154,7 +100,7 @@ void __maps_check(void) {
     ASSERT(map->visited != id);
     ASSERT(map->size);
     map->visited = id;
-    pages += (map->size + __pagesize - 1) / __pagesize;
+    pages += (map->size + pagesz - 1) / pagesz;
     maps += 1;
     struct Map *next;
     if ((next = __maps_next(map))) {
@@ -168,112 +114,110 @@ void __maps_check(void) {
 #endif
 }
 
-#if MMDEBUG
-static void __maps_ok(void) {
-  ASSERT(!__maps_reentrant());
-  __maps_lock();
-  __maps_check();
-  __maps_unlock();
-}
-__attribute__((__constructor__)) static void __maps_ctor(void) {
-  atexit(__maps_ok);
-  __maps_ok();
-}
-__attribute__((__destructor__)) static void __maps_dtor(void) {
-  __maps_ok();
-}
-#endif
-
-static int __muntrack(char *addr, size_t size, struct Map **deleted,
-                      struct Map **untracked, struct Map temp[2]) {
+static int __muntrack(char *addr, size_t size, int pagesz,
+                      struct Map **deleted) {
   int rc = 0;
-  size_t ti = 0;
   struct Map *map;
   struct Map *next;
-  size = PGUP(size);
-  ASSERT(__maps_held());
-  if (!(map = __maps_floor(addr)))
-    map = __maps_first();
-  for (; map && map->addr <= addr + size; map = next) {
+  struct Map *floor;
+StartOver:
+  floor = __maps_floor(addr);
+  for (map = floor; map && map->addr <= addr + size; map = next) {
     next = __maps_next(map);
     char *map_addr = map->addr;
     size_t map_size = map->size;
-    if (MAX(addr, map_addr) >= MIN(addr + size, map_addr + PGUP(map_size)))
+    if (!(MAX(addr, map_addr) <
+          MIN(addr + PGUP(size), map_addr + PGUP(map_size))))
       continue;
-    if (addr <= map_addr && addr + size >= map_addr + PGUP(map_size)) {
-      if (map->hand == MAPS_RESERVATION)
-        continue;
+    if (addr <= map_addr && addr + PGUP(size) >= map_addr + PGUP(map_size)) {
       // remove mapping completely
       tree_remove(&__maps.maps, &map->tree);
       map->freed = *deleted;
       *deleted = map;
-      __maps.pages -= (map_size + __pagesize - 1) / __pagesize;
+      __maps.pages -= (map_size + pagesz - 1) / pagesz;
       __maps.count -= 1;
       __maps_check();
+    } else if (IsWindows()) {
+      // you can't carve up memory maps on windows ;_;
+      rc = einval();
     } else if (addr <= map_addr) {
       // shave off lefthand side of mapping
-      ASSERT(addr + size < map_addr + PGUP(map_size));
-      size_t left = addr + size - map_addr;
+      ASSERT(addr + PGUP(size) < map_addr + PGUP(map_size));
+      size_t left = addr + PGUP(size) - map_addr;
       size_t right = map_size - left;
       ASSERT(right > 0);
       ASSERT(left > 0);
-      map->addr += left;
-      map->size = right;
-      if (!(map->flags & MAP_ANONYMOUS))
-        map->off += left;
-      __maps.pages -= (left + __pagesize - 1) / __pagesize;
-      if (untracked) {
-        ASSERT(ti < 2);
-        temp[ti].addr = map_addr;
-        temp[ti].size = left;
-        temp[ti].freed = *untracked;
-        *untracked = temp;
-        ++ti;
+      struct Map *leftmap;
+      if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
+        map->addr += left;
+        map->size = right;
+        if (!(map->flags & MAP_ANONYMOUS))
+          map->off += left;
+        __maps.pages -= (left + pagesz - 1) / pagesz;
+        leftmap->addr = map_addr;
+        leftmap->size = left;
+        leftmap->freed = *deleted;
+        *deleted = leftmap;
+        __maps_check();
+      } else {
+        rc = -1;
       }
-      __maps_check();
-    } else if (addr + size >= map_addr + PGUP(map_size)) {
+    } else if (addr + PGUP(size) >= map_addr + PGUP(map_size)) {
       // shave off righthand side of mapping
       size_t left = addr - map_addr;
       size_t right = map_addr + map_size - addr;
-      map->size = left;
-      __maps.pages -= (right + __pagesize - 1) / __pagesize;
-      if (untracked) {
-        ASSERT(ti < 2);
-        temp[ti].addr = addr;
-        temp[ti].size = right;
-        temp[ti].freed = *untracked;
-        *untracked = temp;
-        ++ti;
+      struct Map *rightmap;
+      if ((rightmap = __maps_alloc())) {
+        if (rightmap == MAPS_RETRY)
+          goto StartOver;
+        map->size = left;
+        __maps.pages -= (right + pagesz - 1) / pagesz;
+        rightmap->addr = addr;
+        rightmap->size = right;
+        rightmap->freed = *deleted;
+        *deleted = rightmap;
+        __maps_check();
+      } else {
+        rc = -1;
       }
-      __maps_check();
     } else {
       // punch hole in mapping
       size_t left = addr - map_addr;
-      size_t middle = size;
+      size_t middle = PGUP(size);
       size_t right = map_size - middle - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
-        leftmap->addr = map_addr;
-        leftmap->size = left;
-        leftmap->off = map->off;
-        leftmap->prot = map->prot;
-        leftmap->flags = map->flags;
-        map->addr += left + middle;
-        map->size = right;
-        if (!(map->flags & MAP_ANONYMOUS))
-          map->off += left + middle;
-        tree_insert(&__maps.maps, &leftmap->tree, __maps_compare);
-        __maps.pages -= (middle + __pagesize - 1) / __pagesize;
-        __maps.count += 1;
-        if (untracked) {
-          ASSERT(ti < 2);
-          temp[ti].addr = addr;
-          temp[ti].size = size;
-          temp[ti].freed = *untracked;
-          *untracked = temp;
-          ++ti;
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
+        struct Map *middlemap;
+        if ((middlemap = __maps_alloc())) {
+          if (middlemap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
+          leftmap->addr = map_addr;
+          leftmap->size = left;
+          leftmap->off = map->off;
+          leftmap->prot = map->prot;
+          leftmap->flags = map->flags;
+          map->addr += left + middle;
+          map->size = right;
+          if (!(map->flags & MAP_ANONYMOUS))
+            map->off += left + middle;
+          tree_insert(&__maps.maps, &leftmap->tree, __maps_compare);
+          __maps.pages -= (middle + pagesz - 1) / pagesz;
+          __maps.count += 1;
+          middlemap->addr = addr;
+          middlemap->size = size;
+          middlemap->freed = *deleted;
+          *deleted = middlemap;
+          __maps_check();
+        } else {
+          __maps_free(leftmap);
+          rc = -1;
         }
-        __maps_check();
       } else {
         rc = -1;
       }
@@ -283,16 +227,13 @@ static int __muntrack(char *addr, size_t size, struct Map **deleted,
 }
 
 void __maps_free(struct Map *map) {
-  uintptr_t tip;
-  ASSERT(!TAG(map));
   map->size = 0;
   map->addr = MAP_FAILED;
-  map->hand = kNtInvalidHandleValue;
-  for (tip = atomic_load_explicit(&__maps.freed, memory_order_relaxed);;) {
-    map->freed = (struct Map *)PTR(tip);
-    if (atomic_compare_exchange_weak_explicit(
-            &__maps.freed, &tip, ABA(map, TAG(tip) + 1), memory_order_release,
-            memory_order_relaxed))
+  map->freed = atomic_load_explicit(&__maps.freed, memory_order_relaxed);
+  for (;;) {
+    if (atomic_compare_exchange_weak_explicit(&__maps.freed, &map->freed, map,
+                                              memory_order_release,
+                                              memory_order_relaxed))
       break;
   }
 }
@@ -305,333 +246,131 @@ static void __maps_free_all(struct Map *list) {
   }
 }
 
-static void __maps_insert_all(struct Map *list) {
-  struct Map *next;
-  for (struct Map *map = list; map; map = next) {
-    next = map->freed;
-    __maps_insert(map);
-  }
-}
+static void __maps_insert(struct Map *map) {
+  map->flags &= MAP_TYPE | MAP_ANONYMOUS | MAP_NOFORK;
 
-static int __maps_destroy_all(struct Map *list) {
-  int rc = 0;
-  for (struct Map *map = list; map; map = map->freed) {
-    if (!IsWindows()) {
-      if (sys_munmap(map->addr, map->size))
-        rc = -1;
-    } else {
-      switch (map->hand) {
-        case MAPS_SUBREGION:
-        case MAPS_RESERVATION:
-          break;
-        case MAPS_VIRTUAL:
-          if (!VirtualFree(map->addr, 0, kNtMemRelease))
-            rc = __winerr();
-          break;
-        default:
-          ASSERT(map->hand > 0);
-          if (!UnmapViewOfFile(map->addr))
-            rc = -1;
-          if (!CloseHandle(map->hand))
-            rc = -1;
-          break;
+  // coalesce adjacent mappings
+  if (!IsWindows() && (map->flags & MAP_ANONYMOUS)) {
+    int prot = map->prot & ~(MAP_FIXED | MAP_FIXED_NOREPLACE);
+    int flags = map->flags;
+    bool coalesced = false;
+    struct Map *floor, *other, *last = 0;
+    for (other = floor = __maps_floor(map->addr);
+         other && other->addr <= map->addr + map->size;
+         last = other, other = __maps_next(other)) {
+      if (prot == other->prot && flags == other->flags) {
+        if (!coalesced) {
+          if (map->addr == other->addr + other->size) {
+            __maps.pages += (map->size + __pagesize - 1) / __pagesize;
+            other->size += map->size;
+            __maps_free(map);
+            __maps_check();
+            coalesced = true;
+          } else if (map->addr + map->size == other->addr) {
+            __maps.pages += (map->size + __pagesize - 1) / __pagesize;
+            other->addr -= map->size;
+            other->size += map->size;
+            __maps_free(map);
+            __maps_check();
+            coalesced = true;
+          }
+        }
+        if (last && other->addr == last->addr + last->size) {
+          other->addr -= last->size;
+          other->size += last->size;
+          tree_remove(&__maps.maps, &last->tree);
+          __maps.count -= 1;
+          __maps_free(last);
+          __maps_check();
+        }
       }
     }
+    if (coalesced)
+      return;
   }
-  return rc;
-}
 
-static int __maps_funge_flags(int flags) {
-  flags &= ~MAP_FIXED;
-  flags &= ~MAP_FIXED_NOREPLACE;
-  if ((flags & MAP_TYPE) == MAP_SHARED_VALIDATE) {
-    flags &= ~MAP_TYPE;
-    flags |= MAP_SHARED;
-  }
-  return flags;
-}
-
-static bool __maps_fungible(const struct Map *map) {
-  // anonymous memory is fungible on unix, so we may coalesce such
-  // mappings in the rbtree to have fewer objects. on windows even
-  // anonymous memory has unique win32 handles we need to preserve
-  return !IsWindows() && (map->flags & MAP_ANONYMOUS);
-}
-
-static bool __maps_adjacent(const struct Map *x, const struct Map *y) {
-  char *a = x->addr + PGUP(x->size);
-  char *b = y->addr;
-  ASSERT(a <= b);
-  return a == b;
-}
-
-static bool __maps_mergeable(const struct Map *x, const struct Map *y) {
-  if (!__maps_adjacent(x, y))
-    return false;
-  if (!__maps_fungible(x))
-    return false;
-  if (!__maps_fungible(y))
-    return false;
-  if (x->prot != y->prot)
-    return false;
-  if (__maps_funge_flags(x->flags) != __maps_funge_flags(y->flags))
-    return false;
-  return true;
-}
-
-void __maps_insert(struct Map *map) {
-  struct Map *left, *right;
-  ASSERT(map->size);
-  ASSERT(__maps_held());
-  ASSERT(!__maps_overlaps(map->addr, map->size));
+  // otherwise insert new mapping
   __maps.pages += (map->size + __pagesize - 1) / __pagesize;
-
-  // find adjacent mappings
-  if ((left = __maps_floor(map->addr))) {
-    right = __maps_next(left);
-  } else {
-    right = __maps_first();
-  }
-
-  // avoid insert by making mapping on left bigger
-  if (left)
-    if (__maps_mergeable(left, map)) {
-      left->size = PGUP(left->size);
-      left->size += map->size;
-      __maps_free(map);
-      map = 0;
-    }
-
-  // avoid insert by making mapping on right bigger
-  if (map && right)
-    if (__maps_mergeable(map, right)) {
-      map->size = PGUP(map->size);
-      right->addr -= map->size;
-      right->size += map->size;
-      __maps_free(map);
-      map = 0;
-    }
-
-  // check if we filled a hole
-  if (!map && left && right)
-    if (__maps_mergeable(left, right)) {
-      left->size = PGUP(left->size);
-      left->size += right->size;
-      tree_remove(&__maps.maps, &right->tree);
-      __maps_free(right);
-      __maps.count -= 1;
-    }
-
-  // otherwise just insert
-  if (map)
-    __maps_add(map);
-
-  // sanity check
+  __maps_add(map);
   __maps_check();
 }
 
-// adds interval to rbtree
-bool __maps_track(char *addr, size_t size, int prot, int flags) {
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return false;
-  map->addr = addr;
-  map->size = size;
-  map->prot = prot;
-  map->flags = flags;
-  map->hand = MAPS_VIRTUAL;
-  __maps_lock();
-  __maps_insert(map);
-  __maps_unlock();
-  return true;
-}
-
-// removes interval from rbtree (no sys_munmap)
-int __maps_untrack(char *addr, size_t size) {
-  struct Map *deleted = 0;
-  __maps_lock();
-  int rc = __muntrack(addr, size, &deleted, 0, 0);
-  __maps_unlock();
-  __maps_free_all(deleted);
-  return rc;
-}
-
-textwindows dontinline static struct DirectMap sys_mmap_nt(
-    void *addr, size_t size, int prot, int flags, int fd, int64_t off) {
-  struct DirectMap dm;
-
-  // it's 5x faster
-  if ((flags & MAP_ANONYMOUS) && (flags & MAP_TYPE) != MAP_SHARED) {
-    if (!(dm.addr = VirtualAlloc(addr, size, kNtMemReserve | kNtMemCommit,
-                                 __prot2nt(prot, false)))) {
-      dm.addr = MAP_FAILED;
-    }
-    dm.hand = MAPS_VIRTUAL;
-    return dm;
-  }
-
-  int64_t file_handle;
-  if (flags & MAP_ANONYMOUS) {
-    file_handle = kNtInvalidHandleValue;
-  } else {
-    file_handle = g_fds.p[fd].handle;
-  }
-
-  // mark map handle as inheritable if fork might need it
-  const struct NtSecurityAttributes *mapsec;
-  if ((flags & MAP_TYPE) == MAP_SHARED) {
-    mapsec = &kNtIsInheritable;
-  } else {
-    mapsec = 0;
-  }
-
-  // nt will whine under many circumstances if we change the execute bit
-  // later using mprotect(). the workaround is to always request execute
-  // and then virtualprotect() it away until we actually need it. please
-  // note that open-nt.c always requests an kNtGenericExecute accessmask
-  int iscow = 0;
-  int page_flags;
-  int file_flags;
-  if (file_handle != -1) {
-    if ((flags & MAP_TYPE) != MAP_SHARED) {
-      // windows has cow pages but they can't propagate across fork()
-      // that means we only get copy-on-write for the root process :(
-      page_flags = kNtPageExecuteWritecopy;
-      file_flags = kNtFileMapCopy | kNtFileMapExecute;
-      iscow = 1;
-    } else {
-      if ((g_fds.p[fd].flags & O_ACCMODE) == O_RDONLY) {
-        page_flags = kNtPageExecuteRead;
-        file_flags = kNtFileMapRead | kNtFileMapExecute;
-      } else {
-        page_flags = kNtPageExecuteReadwrite;
-        file_flags = kNtFileMapWrite | kNtFileMapExecute;
-      }
-    }
-  } else {
-    page_flags = kNtPageExecuteReadwrite;
-    file_flags = kNtFileMapWrite | kNtFileMapExecute;
-  }
-
-  int e = errno;
-TryAgain:
-  if ((dm.hand = CreateFileMapping(file_handle, mapsec, page_flags,
-                                   (size + off) >> 32, (size + off), 0))) {
-    if ((dm.addr = MapViewOfFileEx(dm.hand, file_flags, off >> 32, off, size,
-                                   addr))) {
-      uint32_t oldprot;
-      if (VirtualProtect(dm.addr, size, __prot2nt(prot, iscow), &oldprot))
-        return dm;
-      UnmapViewOfFile(dm.addr);
-    }
-    CloseHandle(dm.hand);
-  } else if (!(prot & PROT_EXEC) &&               //
-             (file_flags & kNtFileMapExecute) &&  //
-             GetLastError() == kNtErrorAccessDenied) {
-    // your file needs to have been O_CREAT'd with exec `mode` bits in
-    // order to be mapped with executable permission. we always try to
-    // get execute permission if the kernel will give it to us because
-    // win32 would otherwise forbid mprotect() from elevating later on
-    file_flags &= ~kNtFileMapExecute;
-    switch (page_flags) {
-      case kNtPageExecuteWritecopy:
-        page_flags = kNtPageWritecopy;
-        break;
-      case kNtPageExecuteReadwrite:
-        page_flags = kNtPageReadwrite;
-        break;
-      case kNtPageExecuteRead:
-        page_flags = kNtPageReadonly;
-        break;
-      default:
-        __builtin_unreachable();
-    }
-    errno = e;
-    goto TryAgain;
-  }
-
-  dm.hand = kNtInvalidHandleValue;
-  dm.addr = (void *)(intptr_t)-1;
-  return dm;
-}
-
-static struct DirectMap sys_mmap(void *addr, size_t size, int prot, int flags,
-                                 int fd, int64_t off) {
-  struct DirectMap d;
-  if (IsXnuSilicon()) {
-    long p = _sysret(__syslib->__mmap(addr, size, prot, flags, fd, off));
-    d.hand = kNtInvalidHandleValue;
-    d.addr = (void *)p;
-  } else if (IsWindows()) {
-    d = sys_mmap_nt(addr, size, prot, flags, fd, off);
-  } else if (IsMetal()) {
-    d.addr = sys_mmap_metal(addr, size, prot, flags, fd, off);
-    d.hand = kNtInvalidHandleValue;
-  } else {
-    d.addr = __sys_mmap(addr, size, prot, flags, fd, off, off);
-    d.hand = kNtInvalidHandleValue;
-  }
-  return d;
-}
-
 struct Map *__maps_alloc(void) {
   struct Map *map;
-  uintptr_t tip = atomic_load_explicit(&__maps.freed, memory_order_relaxed);
-  while ((map = (struct Map *)PTR(tip)))
-    if (atomic_compare_exchange_weak_explicit(
-            &__maps.freed, &tip, ABA(map->freed, TAG(tip) + 1),
-            memory_order_acquire, memory_order_relaxed))
+  map = atomic_load_explicit(&__maps.freed, memory_order_relaxed);
+  while (map) {
+    if (atomic_compare_exchange_weak_explicit(&__maps.freed, &map, map->freed,
+                                              memory_order_acquire,
+                                              memory_order_relaxed))
       return map;
-  // we're creating sudden surprise memory. the user might be in the
-  // middle of carefully planning a fixed memory structure. we don't
-  // want the system allocator to put our surprise memory inside it,
-  // and we also want to avoid the chances of accidentally unmapping
-  struct DirectMap sys =
-      sys_mmap(__maps_randaddr(), MAPS_SIZE, PROT_READ | PROT_WRITE,
-               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  }
+  int gransz = __gransize;
+  struct DirectMap sys = sys_mmap(0, gransz, PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   if (sys.addr == MAP_FAILED)
     return 0;
-  struct MapSlab *slab = sys.addr;
-  while (!atomic_compare_exchange_weak(&__maps.slabs, &slab->next, slab)) {
-  }
-  for (size_t i = 1; i < ARRAYLEN(slab->maps); ++i)
-    __maps_free(&slab->maps[i]);
-  return &slab->maps[0];
+  map = sys.addr;
+  map->addr = sys.addr;
+  map->size = gransz;
+  map->prot = PROT_READ | PROT_WRITE;
+  map->flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK;
+  map->hand = sys.maphandle;
+  __maps_lock();
+  __maps_insert(map);
+  __maps_unlock();
+  for (int i = 1; i < gransz / sizeof(struct Map); ++i)
+    __maps_free(map + i);
+  return MAPS_RETRY;
 }
 
 static int __munmap(char *addr, size_t size) {
 
   // validate arguments
-  if (((uintptr_t)addr & (__gransize - 1)) ||  //
+  int pagesz = __pagesize;
+  int gransz = __gransize;
+  if (((uintptr_t)addr & (gransz - 1)) ||  //
       !size || (uintptr_t)addr + size < size)
     return einval();
 
-  // test for signal handler tragedy
-  if (__maps_reentrant())
-    return edeadlk();
-
   // lock the memory manager
-  __maps_lock();
+  // abort on reentry due to signal handler
+  if (__maps_lock()) {
+    __maps_unlock();
+    return edeadlk();
+  }
   __maps_check();
 
-  // on windows we can only unmap whole allocations
-  if (IsWindows())
-    if (!__maps_envelops(addr, size)) {
+  // normalize size
+  // abort if size doesn't include all pages in granule
+  size_t pgup_size = (size + pagesz - 1) & -pagesz;
+  size_t grup_size = (size + gransz - 1) & -gransz;
+  if (grup_size > pgup_size)
+    if (__maps_overlaps(addr + pgup_size, grup_size - pgup_size, pagesz)) {
       __maps_unlock();
-      return enotsup();
+      return einval();
     }
 
   // untrack mappings
-  int rc;
-  struct Map temp[2];
   struct Map *deleted = 0;
-  struct Map *untracked = 0;
-  rc = __muntrack(addr, size, &deleted, &untracked, temp);
+  __muntrack(addr, pgup_size, pagesz, &deleted);
   __maps_unlock();
 
-  // ask operating system to remove mappings
-  rc |= __maps_destroy_all(untracked);
-  rc |= __maps_destroy_all(deleted);
+  // delete mappings
+  int rc = 0;
+  for (struct Map *map = deleted; map; map = map->freed) {
+    if (!IsWindows()) {
+      if (sys_munmap(map->addr, map->size))
+        rc = -1;
+    } else if (map->hand != -1) {
+      ASSERT(!((uintptr_t)map->addr & (gransz - 1)));
+      if (!UnmapViewOfFile(map->addr))
+        rc = -1;
+      if (!CloseHandle(map->hand))
+        rc = -1;
+    }
+  }
+
+  // freed mappings
   __maps_free_all(deleted);
 
   return rc;
@@ -639,76 +378,46 @@ static int __munmap(char *addr, size_t size) {
 
 void *__maps_randaddr(void) {
   uintptr_t addr;
-  __maps_lock();
-  addr = (__maps.rand *= 15750249268501108917ull) >> 64;
-  __maps_unlock();
-  addr &= 0x3fffffffffff;
+  addr = _rand64();
+  addr &= 0x007fffffffff;
   addr |= 0x004000000000;
   addr &= -__gransize;
   return (void *)addr;
 }
 
-static void *__maps_pickaddr(size_t size) {
-  ASSERT(__maps_held());
-  char *addr = 0;
-  struct Map *map, *prev;
-  size = GRUP(size);
-  if ((map = __maps_last())) {
-    // choose address beneath higher mapping
-    for (; map; map = prev) {
-      char *min = (char *)(intptr_t)__gransize;
-      if ((prev = __maps_prev(map)))
-        min = prev->addr + GRUP(prev->size);
-      if (map->addr > min &&  //
-          map->addr - min >= size) {
-        addr = map->addr - size;
-        break;
-      }
+void *__maps_pickaddr(size_t size) {
+  char *addr;
+  for (int try = 0; try < MAX_TRIES; ++try) {
+    addr = atomic_exchange_explicit(&__maps.pick, 0, memory_order_acq_rel);
+    if (!addr)
+      addr = __maps_randaddr();
+    __maps_lock();
+    bool overlaps = __maps_overlaps(addr, size, __pagesize);
+    __maps_unlock();
+    if (!overlaps) {
+      atomic_store_explicit(&__maps.pick,
+                            addr + ((size + __gransize - 1) & __gransize),
+                            memory_order_release);
+      return addr;
     }
-    // append if existing maps are too dense
-    if (!addr) {
-      map = __maps_last();
-      addr = map->addr + GRUP(map->size);
-      intptr_t end = (intptr_t)addr;
-      if (ckd_add(&end, end, size))
-        return 0;
-    }
-  } else {
-    // roll the dice if rbtree is empty
-    addr = __maps_randaddr();
   }
-  return addr;
+  return 0;
 }
 
-static void *__mmap_impl(char *addr, size_t size, int prot, int flags, int fd,
-                         int64_t off) {
-
-  // validate file map args
-  if (flags & MAP_ANONYMOUS) {
-    // some operating systems will complain unless we do this
-    fd = -1;
-    off = 0;
-  } else {
-    // validate arguments for file mapping
-    if (off & (__gransize - 1))
-      return (void *)einval();
-    if (IsWindows()) {
-      if (!__isfdkind(fd, kFdFile))
-        return (void *)eacces();
-      if ((g_fds.p[fd].flags & O_ACCMODE) == O_WRONLY)
-        return (void *)eacces();
-    }
-  }
+static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
+                          int64_t off, int pagesz, int gransz) {
 
   // allocate Map object
   struct Map *map;
-  if (!(map = __maps_alloc()))
-    return MAP_FAILED;
+  do {
+    if (!(map = __maps_alloc()))
+      return MAP_FAILED;
+  } while (map == MAPS_RETRY);
 
   // polyfill nuances of fixed mappings
   int sysflags = flags;
   bool noreplace = false;
-  bool fixedmode = false;
+  bool should_untrack = false;
   if (flags & MAP_FIXED_NOREPLACE) {
     if (flags & MAP_FIXED) {
       __maps_free(map);
@@ -719,107 +428,39 @@ static void *__mmap_impl(char *addr, size_t size, int prot, int flags, int fd,
       noreplace = true;
       sysflags |= MAP_FIXED_NOREPLACE_linux;
     } else if (IsFreebsd() || IsNetbsd()) {
-      // todo: insert a reservation like windows
       sysflags |= MAP_FIXED;
-      __maps_lock();
-      if (__maps_overlaps(addr, size)) {
-        __maps_unlock();
+      if (__maps_overlaps(addr, size, pagesz)) {
         __maps_free(map);
         return (void *)eexist();
       }
-      __maps_unlock();
     } else {
       noreplace = true;
     }
   } else if (flags & MAP_FIXED) {
-    fixedmode = true;
+    should_untrack = true;
   }
 
-  // loop for memory
+  // remove mapping we blew away
+  if (IsWindows() && should_untrack)
+    __munmap(addr, size);
+
+  // obtain mapping from operating system
   int olderr = errno;
+  int tries = MAX_TRIES;
   struct DirectMap res;
-  for (;;) {
-
-    // transactionally find the mark on windows
-    if (IsWindows()) {
-      __maps_lock();
-      if (!fixedmode) {
-        // give user desired address if possible
-        if (addr && __maps_overlaps(addr, size)) {
-          if (noreplace) {
-            __maps_unlock();
-            __maps_free(map);
-            return (void *)eexist();
-          }
-          addr = 0;
-        }
-        // choose suitable address then claim it in our rbtree
-        if (!addr && !(addr = __maps_pickaddr(size))) {
-          __maps_unlock();
-          __maps_free(map);
-          return (void *)enomem();
-        }
+TryAgain:
+  res = sys_mmap(addr, size, prot, sysflags, fd, off);
+  if (res.addr == MAP_FAILED) {
+    if (IsWindows() && errno == EADDRNOTAVAIL) {
+      if (noreplace) {
+        errno = EEXIST;
+      } else if (should_untrack) {
+        errno = ENOMEM;
+      } else if (--tries && (addr = __maps_pickaddr(size))) {
+        errno = olderr;
+        goto TryAgain;
       } else {
-        // remove existing mappings and their tracking objects
-        if (!__maps_envelops(addr, size)) {
-          __maps_unlock();
-          __maps_free(map);
-          return (void *)enotsup();
-        }
-        struct Map *deleted = 0;
-        if (__muntrack(addr, size, &deleted, 0, 0)) {
-          __maps_insert_all(deleted);
-          __maps_unlock();
-          __maps_free(map);
-          return MAP_FAILED;
-        }
-        int rc = __maps_destroy_all(deleted);
-        __maps_free_all(deleted);
-        if (rc) {
-          __maps_unlock();
-          __maps_free(map);
-          return (void *)eperm();
-        }
-      }
-      // claims intended interval while still holding the lock
-      map->addr = addr;
-      map->size = size;
-      map->prot = 0;
-      map->flags = 0;
-      map->hand = MAPS_RESERVATION;
-      __maps_insert(map);
-      __maps_unlock();
-    }
-
-    // ask operating system for our memory
-    // notice how we're not holding the lock
-    res = sys_mmap(addr, size, prot, sysflags, fd, off);
-    if (res.addr != MAP_FAILED)
-      break;
-
-    // handle failure
-    if (IsWindows()) {
-      // untrack reservation
-      __maps_lock();
-      tree_remove(&__maps.maps, &map->tree);
-      __maps.pages -= (map->size + __pagesize - 1) / __pagesize;
-      __maps_unlock();
-      if (errno == EADDRNOTAVAIL) {
-        // we've encountered mystery memory
-        if (fixedmode) {
-          // TODO(jart): Use VirtualQuery() to destroy mystery memory.
-          errno = ENOMEM;
-        } else if (noreplace) {
-          // we can't try again with a different address in this case
-          errno = EEXIST;
-        } else {
-          // we shall leak the tracking object since it should at least
-          // partially cover the mystery mapping. so if we loop forever
-          // the system should eventually recover and find fresh spaces
-          errno = olderr;
-          addr = 0;
-          continue;
-        }
+        errno = ENOMEM;
       }
     }
     __maps_free(map);
@@ -827,49 +468,79 @@ static void *__mmap_impl(char *addr, size_t size, int prot, int flags, int fd,
   }
 
   // polyfill map fixed noreplace
+  // we assume non-linux gives us addr if it's freed
+  // that's what linux (e.g. rhel7) did before noreplace
   if (noreplace && res.addr != addr) {
-    ASSERT(!IsWindows());
-    sys_munmap(res.addr, size);
+    if (!IsWindows()) {
+      sys_munmap(res.addr, size);
+    } else {
+      UnmapViewOfFile(res.addr);
+      CloseHandle(res.maphandle);
+    }
     __maps_free(map);
     return (void *)eexist();
   }
 
-  // setup map object
+  // untrack mapping we blew away
+  if (!IsWindows() && should_untrack) {
+    struct Map *deleted = 0;
+    __muntrack(res.addr, size, pagesz, &deleted);
+    __maps_free_all(deleted);
+  }
+
+  // track map object
   map->addr = res.addr;
   map->size = size;
   map->off = off;
   map->prot = prot;
   map->flags = flags;
-  map->hand = res.hand;
+  map->hand = res.maphandle;
   if (IsWindows()) {
     map->iscow = (flags & MAP_TYPE) != MAP_SHARED && fd != -1;
     map->readonlyfile = (flags & MAP_TYPE) == MAP_SHARED && fd != -1 &&
                         (g_fds.p[fd].flags & O_ACCMODE) == O_RDONLY;
   }
-
-  // track map object
-  if (!IsWindows()) {
-    struct Map *deleted = 0;
-    __maps_lock();
-    if (fixedmode)
-      if (__muntrack(res.addr, size, &deleted, 0, 0))
-        STRACE("memtrack compromised by hole punch oom");
-    __maps_insert(map);
-    __maps_unlock();
-    __maps_free_all(deleted);
-  } else {
-    atomic_thread_fence(memory_order_release);
-  }
+  __maps_lock();
+  __maps_insert(map);
+  __maps_unlock();
 
   return res.addr;
 }
 
+static void *__mmap_impl(char *addr, size_t size, int prot, int flags, int fd,
+                         int64_t off, int pagesz, int gransz) {
+
+  // validate file map args
+  if (!(flags & MAP_ANONYMOUS)) {
+    if (off & (gransz - 1))
+      return (void *)einval();
+    if (IsWindows()) {
+      if (!__isfdkind(fd, kFdFile))
+        return (void *)eacces();
+      if ((g_fds.p[fd].flags & O_ACCMODE) == O_WRONLY)
+        return (void *)eacces();
+    }
+  }
+
+  // try to pick our own addresses on windows which are higher up in the
+  // vaspace. this is important so that conflicts are less likely, after
+  // forking when resurrecting mappings, because win32 has a strong pref
+  // with lower memory addresses which may get assigned to who knows wut
+  if (IsWindows() && !addr)
+    if (!(addr = __maps_pickaddr(size)))
+      return (void *)enomem();
+
+  return __mmap_chunk(addr, size, prot, flags, fd, off, pagesz, gransz);
+}
+
 static void *__mmap(char *addr, size_t size, int prot, int flags, int fd,
                     int64_t off) {
   char *res;
+  int pagesz = __pagesize;
+  int gransz = __gransize;
 
   // validate arguments
-  if ((uintptr_t)addr & (__gransize - 1))
+  if ((uintptr_t)addr & (gransz - 1))
     addr = NULL;
   if (!addr && (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)))
     return (void *)eperm();
@@ -879,16 +550,12 @@ static void *__mmap(char *addr, size_t size, int prot, int flags, int fd,
     return (void *)einval();
   if (size > MAX_SIZE)
     return (void *)enomem();
-  if (__maps.count * __pagesize + size > __virtualmax)
+  if (__maps.count * pagesz + size > __virtualmax)
     return (void *)enomem();
 
-  // test for signal handler reentry
-  if (__maps_reentrant())
-    return (void *)edeadlk();
-
   // create memory mappping
   if (!__isfdkind(fd, kFdZip)) {
-    res = __mmap_impl(addr, size, prot, flags, fd, off);
+    res = __mmap_impl(addr, size, prot, flags, fd, off, pagesz, gransz);
   } else {
     res = _weaken(__zipos_mmap)(
         addr, size, prot, flags,
@@ -899,17 +566,40 @@ static void *__mmap(char *addr, size_t size, int prot, int flags, int fd,
 }
 
 static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
-                           int flags, char *new_addr) {
+                           int flags, char *new_addr, int pagesz, int gransz) {
+
+  // normalize and validate old size
+  // abort if size doesn't include all pages in granule
+  size_t pgup_old_size = (old_size + pagesz - 1) & -pagesz;
+  size_t grup_old_size = (old_size + gransz - 1) & -gransz;
+  if (grup_old_size > pgup_old_size)
+    if (__maps_overlaps(old_addr + pgup_old_size, grup_old_size - pgup_old_size,
+                        pagesz))
+      return (void *)einval();
+  old_size = pgup_old_size;
+
+  // validate new size
+  // abort if size doesn't include all pages in granule
+  if (flags & MREMAP_FIXED) {
+    size_t pgup_new_size = (new_size + pagesz - 1) & -pagesz;
+    size_t grup_new_size = (new_size + gransz - 1) & -gransz;
+    if (grup_new_size > pgup_new_size)
+      if (__maps_overlaps(new_addr + pgup_new_size,
+                          grup_new_size - pgup_new_size, pagesz))
+        return (void *)einval();
+  }
 
   // allocate object for tracking new mapping
   struct Map *map;
-  if (!(map = __maps_alloc()))
-    return (void *)enomem();
+  do {
+    if (!(map = __maps_alloc()))
+      return (void *)enomem();
+  } while (map == MAPS_RETRY);
 
   // check old interval is fully contained within one mapping
   struct Map *old_map;
   if (!(old_map = __maps_floor(old_addr)) ||
-      old_addr + PGUP(old_size) > old_map->addr + PGUP(old_map->size) ||
+      old_addr + old_size > old_map->addr + PGUP(old_map->size) ||
       old_addr < old_map->addr) {
     __maps_free(map);
     return (void *)efault();
@@ -954,7 +644,7 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
 
   // untrack old mapping
   struct Map *deleted = 0;
-  __muntrack(old_addr, old_size, &deleted, 0, 0);
+  __muntrack(old_addr, old_size, pagesz, &deleted);
   __maps_free_all(deleted);
 
   // track map object
@@ -963,7 +653,6 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
   map->off = old_off;
   map->prot = old_prot;
   map->flags = old_flags;
-  map->hand = kNtInvalidHandleValue;
   __maps_insert(map);
 
   return res;
@@ -972,6 +661,9 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
 static void *__mremap(char *old_addr, size_t old_size, size_t new_size,
                       int flags, char *new_addr) {
 
+  int pagesz = __pagesize;
+  int gransz = __gransize;
+
   // kernel support
   if (!IsLinux() && !IsNetbsd())
     return (void *)enosys();
@@ -985,16 +677,17 @@ static void *__mremap(char *old_addr, size_t old_size, size_t new_size,
   // we support these flags
   if (flags & ~(MREMAP_MAYMOVE | MREMAP_FIXED))
     return (void *)einval();
-  if (IsNetbsd() && !(flags & MREMAP_MAYMOVE) && PGUP(new_size) > old_size)
+  if (IsNetbsd() && !(flags & MREMAP_MAYMOVE) &&
+      ((new_size + pagesz - 1) & -pagesz) > old_size)
     return (void *)enotsup();
   if ((flags & MREMAP_FIXED) && !(flags & MREMAP_MAYMOVE))
     return (void *)einval();
 
   // addresses must be granularity aligned
-  if ((uintptr_t)old_addr & (__gransize - 1))
+  if ((uintptr_t)old_addr & (gransz - 1))
     return (void *)einval();
   if (flags & MREMAP_FIXED)
-    if ((uintptr_t)new_addr & (__gransize - 1))
+    if ((uintptr_t)new_addr & (gransz - 1))
       return (void *)einval();
 
   // sizes must not be zero
@@ -1024,19 +717,20 @@ static void *__mremap(char *old_addr, size_t old_size, size_t new_size,
 
   // memory increase must not exceed RLIMIT_AS
   if (PGUP(new_size) > old_size)
-    if (__maps.count * __pagesize - old_size + PGUP(new_size) > __virtualmax)
+    if (__maps.count * pagesz - old_size + PGUP(new_size) > __virtualmax)
       return (void *)enomem();
 
-  // test for signal handler reentry
-  if (__maps_reentrant())
-    return (void *)edeadlk();
-
   // lock the memory manager
-  __maps_lock();
+  // abort on reentry due to signal handler
+  if (__maps_lock()) {
+    __maps_unlock();
+    return (void *)edeadlk();
+  }
   __maps_check();
 
   // perform operation
-  char *res = __mremap_impl(old_addr, old_size, new_size, flags, new_addr);
+  char *res = __mremap_impl(old_addr, old_size, new_size, flags, new_addr,
+                            pagesz, gransz);
 
   // return result
   __maps_unlock();
@@ -1113,24 +807,6 @@ static void *__mremap(char *old_addr, size_t old_size, size_t new_size,
  * The `MAP_CONCEAL` flag may be passed to prevent a memory mapping from
  * appearing in core dumps. This is currently supported on BSD OSes, and
  * is ignored on everything else.
- *
- * POSIX does not require mmap() to be asynchronous signal safe. But you
- * should be able to call this from a signal handler safely, if you know
- * that your signal will never interrupt the cosmopolitan memory manager
- * and the only way you can ensure that, is by blocking signals whenever
- * you call mmap(), munmap(), mprotect(), etc.
- *
- * @raise ENOMEM if `RUSAGE_AS` or similar limits are exceeded
- * @raise EEXIST if `flags` has `MAP_FIXED_NOREPLACE` and `addr` is used
- * @raise ENOTSUP if interval overlapped without enveloping win32 alloc
- * @raise EPERM if `addr` is null and `flags` has `MAP_FIXED`
- * @raise EINVAL if `addr` isn't granularity aligned with `MAP_FIXED`
- * @raise EINVAL if `size` is zero
- * @raise EINVAL if `flags` or `prot` hold invalid values
- * @raise EACCESS if `fd` isn't a regular file
- * @raise EACCESS if `fd` was opened in write-only mode
- * @raise EACCESS if `off` isn't getgransize() aligned
- * @raise EDEADLK if called from signal handler interrupting mmap()
  */
 void *mmap(void *addr, size_t size, int prot, int flags, int fd, int64_t off) {
   void *res = __mmap(addr, size, prot, flags, fd, off);
@@ -1176,10 +852,6 @@ void *mremap(void *old_addr, size_t old_size, size_t new_size, int flags, ...) {
  * The `size` parameter is implicitly rounded up to the page size.
  *
  * @return 0 on success, or -1 w/ errno.
- * @raise ENOMEM if OOM happened when punching hole in existing mapping
- * @raise ENOTSUP if interval overlapped without enveloping win32 alloc
- * @raise EDEADLK if called from signal handler interrupting mmap()
- * @raise EINVAL if `addr` isn't granularity aligned
  */
 int munmap(void *addr, size_t size) {
   int rc = __munmap(addr, size);
diff --git a/libc/intrin/mprotect.c b/libc/intrin/mprotect.c
index 393bc641c..784906acc 100644
--- a/libc/intrin/mprotect.c
+++ b/libc/intrin/mprotect.c
@@ -22,6 +22,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/directmap.h"
 #include "libc/intrin/dll.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/intrin/maps.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/tree.h"
@@ -66,18 +67,17 @@ int __mprotect(char *addr, size_t size, int prot) {
   // normalize size
   size = (size + pagesz - 1) & -pagesz;
 
-  // test for signal handler reentry
-  if (__maps_reentrant())
-    return edeadlk();
-
   // change mappings
   int rc = 0;
   bool found = false;
-  __maps_lock();
-  struct Map *map;
-  if (!(map = __maps_floor(addr)))
-    map = __maps_first();
-  for (; map && map->addr <= addr + size; map = __maps_next(map)) {
+  if (__maps_lock()) {
+    __maps_unlock();
+    return edeadlk();
+  }
+  struct Map *map, *floor;
+StartOver:
+  floor = __maps_floor(addr);
+  for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
     char *map_addr = map->addr;
     size_t map_size = map->size;
     char *beg = MAX(addr, map_addr);
@@ -86,7 +86,7 @@ int __mprotect(char *addr, size_t size, int prot) {
       continue;
     found = true;
     if (addr <= map_addr && addr + size >= map_addr + PGUP(map_size)) {
-      // change protection status of pages
+      // change protection of entire mapping
       if (!__mprotect_chunk(map_addr, map_size, prot, map->iscow)) {
         map->prot = prot;
       } else {
@@ -98,6 +98,8 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_size - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr, left, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;
@@ -109,7 +111,7 @@ int __mprotect(char *addr, size_t size, int prot) {
           leftmap->hand = map->hand;
           map->addr += left;
           map->size = right;
-          map->hand = MAPS_SUBREGION;
+          map->hand = -1;
           if (!(map->flags & MAP_ANONYMOUS))
             map->off += left;
           tree_insert(&__maps.maps, &leftmap->tree, __maps_compare);
@@ -128,6 +130,8 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_addr + map_size - addr;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr + left, right, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;
@@ -140,7 +144,7 @@ int __mprotect(char *addr, size_t size, int prot) {
           map->addr += left;
           map->size = right;
           map->prot = prot;
-          map->hand = MAPS_SUBREGION;
+          map->hand = -1;
           if (!(map->flags & MAP_ANONYMOUS))
             map->off += left;
           tree_insert(&__maps.maps, &leftmap->tree, __maps_compare);
@@ -160,8 +164,14 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_size - middle - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         struct Map *midlmap;
         if ((midlmap = __maps_alloc())) {
+          if (midlmap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
           if (!__mprotect_chunk(map_addr + left, middle, prot, false)) {
             leftmap->addr = map_addr;
             leftmap->size = left;
@@ -176,10 +186,10 @@ int __mprotect(char *addr, size_t size, int prot) {
             midlmap->off = (map->flags & MAP_ANONYMOUS) ? 0 : map->off + left;
             midlmap->prot = prot;
             midlmap->flags = map->flags;
-            midlmap->hand = MAPS_SUBREGION;
+            midlmap->hand = -1;
             map->addr += left + middle;
             map->size = right;
-            map->hand = MAPS_SUBREGION;
+            map->hand = -1;
             if (!(map->flags & MAP_ANONYMOUS))
               map->off += left + middle;
             tree_insert(&__maps.maps, &leftmap->tree, __maps_compare);
@@ -212,20 +222,11 @@ int __mprotect(char *addr, size_t size, int prot) {
 /**
  * Modifies restrictions on virtual memory address range.
  *
- * POSIX doesn't require mprotect() to be async signal safe. However you
- * should be able to call this from a signal handler safely, if you know
- * that your signal will never interrupt the cosmopolitan memory manager
- * and the only way you can ensure that, is by blocking signals whenever
- * you call mmap(), munmap(), mprotect(), etc.
- *
- * @param addr needs to be page size aligned
- * @param size is rounded up to the page size
- * @param prot can be PROT_NONE or a combination of PROT_READ,
- *     PROT_WRITE, and PROT_EXEC
+ * @param addr needs to be 4kb aligned
+ * @param prot can have PROT_{NONE,READ,WRITE,EXEC}
  * @return 0 on success, or -1 w/ errno
- * @raise EINVAL if `size` is zero
  * @raise ENOMEM on tracking memory oom
- * @raise EDEADLK if called from signal handler interrupting mmap()
+ * @see mmap()
  */
 int mprotect(void *addr, size_t size, int prot) {
   int rc;
diff --git a/libc/intrin/msync-nt.c b/libc/intrin/msync-nt.c
index 4e737678b..73f6ed95a 100644
--- a/libc/intrin/msync-nt.c
+++ b/libc/intrin/msync-nt.c
@@ -19,34 +19,34 @@
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/intrin/maps.h"
 #include "libc/nt/memory.h"
+#include "libc/nt/runtime.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/sysparam.h"
-#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/auxv.h"
 #include "libc/sysv/errfuns.h"
 
 textwindows int sys_msync_nt(char *addr, size_t size, int flags) {
-  size = (size + __pagesize - 1) & -__pagesize;
 
-  if ((uintptr_t)addr & (__pagesize - 1))
+  int pagesz = __pagesize;
+  size = (size + pagesz - 1) & -pagesz;
+
+  if ((uintptr_t)addr & (pagesz - 1))
     return einval();
-  if (__maps_reentrant())
-    return edeadlk();
 
   int rc = 0;
-  __maps_lock();
-  struct Map *map;
-  if (!(map = __maps_floor(addr)))
-    map = __maps_first();
-  for (; map && map->addr <= addr + size; map = __maps_next(map)) {
-    if (map->flags & MAP_ANONYMOUS)
-      continue;  // msync() is about coherency between file and memory
-    char *beg = MAX(addr, map->addr);
-    char *end = MIN(addr + size, map->addr + map->size);
-    if (beg >= end)
-      continue;  // didn't overlap mapping
-    if (!FlushViewOfFile(beg, end - beg))
-      rc = -1;
-    // TODO(jart): FlushFileBuffers too on g_fds handle if MS_SYNC?
+  if (__maps_lock()) {
+    rc = edeadlk();
+  } else {
+    struct Map *map, *floor;
+    floor = __maps_floor(addr);
+    for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
+      char *beg = MAX(addr, map->addr);
+      char *end = MIN(addr + size, map->addr + map->size);
+      if (beg < end)
+        if (!FlushViewOfFile(beg, end - beg))
+          rc = -1;
+      // TODO(jart): FlushFileBuffers too on g_fds handle if MS_SYNC?
+    }
   }
   __maps_unlock();
 
diff --git a/libc/intrin/msync.c b/libc/intrin/msync.c
index e9be44863..d3e43e26d 100644
--- a/libc/intrin/msync.c
+++ b/libc/intrin/msync.c
@@ -25,7 +25,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/errfuns.h"
 
 /**
@@ -38,7 +38,6 @@
  * @param flags needs MS_ASYNC or MS_SYNC and can have MS_INVALIDATE
  * @return 0 on success or -1 w/ errno
  * @raise ECANCELED if thread was cancelled in masked mode
- * @raise EDEADLK if called from signal handler interrupting mmap()
  * @raise EINTR if we needed to block and a signal was delivered instead
  * @raise EINVAL if `MS_SYNC` and `MS_ASYNC` were both specified
  * @raise EINVAL if unknown `flags` were passed
@@ -68,19 +67,23 @@ int msync(void *addr, size_t size, int flags) {
   } else {
     sysflags = MS_ASYNC;
   }
-  if (flags & MS_INVALIDATE)
+  if (flags & MS_INVALIDATE) {
     sysflags |= MS_INVALIDATE;
+  }
 
   // FreeBSD's manual says "The flags argument was both MS_ASYNC and
   // MS_INVALIDATE. Only one of these flags is allowed." which makes
   // following the POSIX recommendation somewhat difficult.
-  if (IsFreebsd())
-    if (sysflags == (MS_ASYNC | MS_INVALIDATE))
+  if (IsFreebsd()) {
+    if (sysflags == (MS_ASYNC | MS_INVALIDATE)) {
       sysflags = MS_INVALIDATE;
+    }
+  }
 
   // FreeBSD specifies MS_SYNC as 0 so we shift the Cosmo constants
-  if (IsFreebsd())
+  if (IsFreebsd()) {
     sysflags >>= 1;
+  }
 
   BEGIN_CANCELATION_POINT;
   if (!IsWindows()) {
diff --git a/libc/intrin/munmap-sysv.c b/libc/intrin/munmap-sysv.c
index 3d4b0c6ae..0f00ddc5c 100644
--- a/libc/intrin/munmap-sysv.c
+++ b/libc/intrin/munmap-sysv.c
@@ -41,6 +41,8 @@ int sys_munmap(void *p, size_t n) {
   } else {
     rc = __sys_munmap(p, n);
   }
+  if (!rc)
+    __virtualsize -= n;
   KERNTRACE("sys_munmap(%p, %'zu) → %d", p, n, rc);
   return rc;
 }
diff --git a/libc/intrin/nomultics.h b/libc/intrin/nomultics.h
index b2aca3ecf..833bc7e28 100644
--- a/libc/intrin/nomultics.h
+++ b/libc/intrin/nomultics.h
@@ -1,21 +1,17 @@
 #ifndef COSMOPOLITAN_NOMULTICS_H_
 #define COSMOPOLITAN_NOMULTICS_H_
 
-#define kTtySilence  1  /* do not relay read() into write() */
-#define kTtyEchoRaw  2  /* don't ^X visualize control codes */
-#define kTtyUncanon  4  /* enables non-canonical (raw) mode */
-#define kTtyNoCr2Nl  8  /* don't map \r → \n (a.k.a !ICRNL) */
-#define kTtyNoIsigs  16 /* don't auto-raise signals on keys */
-#define kTtyXtMouse  32 /* enables eXtreme Xterm mouse mode */
-#define kTtyNoIexten 64 /* disable various canon keystrokes */
-#define kTtyNoEchoe  128
-#define kTtyNoEchok  256
-#define kTtyNoEchoke 512
+#define kTtySilence 1  /* do not relay read() into write() */
+#define kTtyEchoRaw 2  /* don't ^X visualize control codes */
+#define kTtyUncanon 4  /* enables non-canonical (raw) mode */
+#define kTtyNoCr2Nl 8  /* don't map \r → \n (a.k.a !ICRNL) */
+#define kTtyNoIsigs 16 /* don't auto-raise signals on keys */
+#define kTtyXtMouse 32 /* enables eXtreme Xterm mouse mode */
 
 COSMOPOLITAN_C_START_
 
 struct TtyConf {
-  unsigned magic;
+  unsigned char magic;
   unsigned char mousebs;
   unsigned char replmode;
   unsigned char replstderr;
diff --git a/libc/intrin/ntcontext2linux.c b/libc/intrin/ntcontext2linux.c
new file mode 100644
index 000000000..bf9d3df15
--- /dev/null
+++ b/libc/intrin/ntcontext2linux.c
@@ -0,0 +1,82 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/ucontext.h"
+#include "libc/log/libfatal.internal.h"
+#include "libc/nt/struct/context.h"
+#include "libc/str/str.h"
+#ifdef __x86_64__
+
+textwindows void _ntcontext2linux(ucontext_t *ctx, const struct NtContext *cr) {
+  if (!cr)
+    return;
+  ctx->uc_mcontext.eflags = cr->EFlags;
+  ctx->uc_mcontext.rax = cr->Rax;
+  ctx->uc_mcontext.rbx = cr->Rbx;
+  ctx->uc_mcontext.rcx = cr->Rcx;
+  ctx->uc_mcontext.rdx = cr->Rdx;
+  ctx->uc_mcontext.rdi = cr->Rdi;
+  ctx->uc_mcontext.rsi = cr->Rsi;
+  ctx->uc_mcontext.rbp = cr->Rbp;
+  ctx->uc_mcontext.rsp = cr->Rsp;
+  ctx->uc_mcontext.rip = cr->Rip;
+  ctx->uc_mcontext.r8 = cr->R8;
+  ctx->uc_mcontext.r9 = cr->R9;
+  ctx->uc_mcontext.r10 = cr->R10;
+  ctx->uc_mcontext.r11 = cr->R11;
+  ctx->uc_mcontext.r12 = cr->R12;
+  ctx->uc_mcontext.r13 = cr->R13;
+  ctx->uc_mcontext.r14 = cr->R14;
+  ctx->uc_mcontext.r15 = cr->R15;
+  ctx->uc_mcontext.cs = cr->SegCs;
+  ctx->uc_mcontext.gs = cr->SegGs;
+  ctx->uc_mcontext.fs = cr->SegFs;
+  ctx->uc_mcontext.fpregs = &ctx->__fpustate;
+  __repmovsb(&ctx->__fpustate, &cr->FltSave, sizeof(ctx->__fpustate));
+  ctx->__fpustate.mxcsr = cr->MxCsr;
+}
+
+textwindows void _ntlinux2context(struct NtContext *cr, const ucontext_t *ctx) {
+  if (!cr)
+    return;
+  cr->EFlags = ctx->uc_mcontext.eflags;
+  cr->Rax = ctx->uc_mcontext.rax;
+  cr->Rbx = ctx->uc_mcontext.rbx;
+  cr->Rcx = ctx->uc_mcontext.rcx;
+  cr->Rdx = ctx->uc_mcontext.rdx;
+  cr->Rdi = ctx->uc_mcontext.rdi;
+  cr->Rsi = ctx->uc_mcontext.rsi;
+  cr->Rbp = ctx->uc_mcontext.rbp;
+  cr->Rsp = ctx->uc_mcontext.rsp;
+  cr->Rip = ctx->uc_mcontext.rip;
+  cr->R8 = ctx->uc_mcontext.r8;
+  cr->R9 = ctx->uc_mcontext.r9;
+  cr->R10 = ctx->uc_mcontext.r10;
+  cr->R11 = ctx->uc_mcontext.r11;
+  cr->R12 = ctx->uc_mcontext.r12;
+  cr->R13 = ctx->uc_mcontext.r13;
+  cr->R14 = ctx->uc_mcontext.r14;
+  cr->R15 = ctx->uc_mcontext.r15;
+  cr->SegCs = ctx->uc_mcontext.cs;
+  cr->SegGs = ctx->uc_mcontext.gs;
+  cr->SegFs = ctx->uc_mcontext.fs;
+  cr->MxCsr = ctx->__fpustate.mxcsr;
+  __repmovsb(&cr->FltSave, &ctx->__fpustate, sizeof(ctx->__fpustate));
+}
+
+#endif /* __x86_64__ */
diff --git a/libc/intrin/packsswb.c b/libc/intrin/packsswb.c
new file mode 100644
index 000000000..da3fa67b9
--- /dev/null
+++ b/libc/intrin/packsswb.c
@@ -0,0 +1,40 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/packsswb.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/str/str.h"
+
+/**
+ * Casts shorts to signed chars w/ saturation.
+ *
+ *   𝑎 ← {CLAMP[𝑏ᵢ]|𝑖∈[0,4)} ║ {CLAMP[𝑐ᵢ]|𝑖∈[4,8)}
+ *
+ * @see packuswb()
+ * @mayalias
+ */
+void(packsswb)(int8_t a[16], const int16_t b[8], const int16_t c[8]) {
+  unsigned i;
+  int8_t r[16];
+  for (i = 0; i < 8; ++i)
+    r[i + 0] = MIN(INT8_MAX, MAX(INT8_MIN, b[i]));
+  for (i = 0; i < 8; ++i)
+    r[i + 8] = MIN(INT8_MAX, MAX(INT8_MIN, c[i]));
+  __builtin_memcpy(a, r, 16);
+}
diff --git a/libc/intrin/packsswb.h b/libc/intrin/packsswb.h
new file mode 100644
index 000000000..fae659b0b
--- /dev/null
+++ b/libc/intrin/packsswb.h
@@ -0,0 +1,13 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PACKSSWB_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PACKSSWB_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void packsswb(int8_t[16], const int16_t[8], const int16_t[8]);
+
+#define packsswb(A, B, C)                                                    \
+  INTRIN_SSEVEX_X_X_X_(packsswb, SSE2, "packsswb", INTRIN_NONCOMMUTATIVE, A, \
+                       B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PACKSSWB_H_ */
diff --git a/libc/intrin/packuswb.c b/libc/intrin/packuswb.c
new file mode 100644
index 000000000..66d9c766f
--- /dev/null
+++ b/libc/intrin/packuswb.c
@@ -0,0 +1,40 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/packuswb.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/str/str.h"
+
+/**
+ * Casts shorts to unsigned chars w/ saturation.
+ *
+ *   𝑎 ← {CLAMP[𝑏ᵢ]|𝑖∈[0,4)} ║ {CLAMP[𝑐ᵢ]|𝑖∈[4,8)}
+ *
+ * @see packsswb()
+ * @mayalias
+ */
+void(packuswb)(uint8_t a[16], const int16_t b[8], const int16_t c[8]) {
+  unsigned i;
+  uint8_t r[16];
+  for (i = 0; i < 8; ++i)
+    r[i + 0] = MIN(UINT8_MAX, MAX(UINT8_MIN, b[i]));
+  for (i = 0; i < 8; ++i)
+    r[i + 8] = MIN(UINT8_MAX, MAX(UINT8_MIN, c[i]));
+  __builtin_memcpy(a, r, 16);
+}
diff --git a/libc/intrin/packuswb.h b/libc/intrin/packuswb.h
new file mode 100644
index 000000000..3c8ddf7da
--- /dev/null
+++ b/libc/intrin/packuswb.h
@@ -0,0 +1,13 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PACKUSWB_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PACKUSWB_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void packuswb(uint8_t[16], const int16_t[8], const int16_t[8]);
+
+#define packuswb(A, B, C)                                                    \
+  INTRIN_SSEVEX_X_X_X_(packuswb, SSE2, "packuswb", INTRIN_NONCOMMUTATIVE, A, \
+                       B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PACKUSWB_H_ */
diff --git a/libc/intrin/itoa16.c b/libc/intrin/paddw.c
similarity index 75%
rename from libc/intrin/itoa16.c
rename to libc/intrin/paddw.c
index 003aba59c..ea3351e4f 100644
--- a/libc/intrin/itoa16.c
+++ b/libc/intrin/paddw.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,22 +16,24 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/fmt/internal.h"
+#include "libc/intrin/paddw.h"
+#include "libc/str/str.h"
 
-__msabi textwindows dontinstrument char16_t *__itoa16(char16_t p[21],
-                                                      uint64_t x) {
-  char t;
-  size_t a, b, i = 0;
-  do {
-    p[i++] = x % 10 + '0';
-    x = x / 10;
-  } while (x > 0);
-  if (i) {
-    for (a = 0, b = i - 1; a < b; ++a, --b) {
-      t = p[a];
-      p[a] = p[b];
-      p[b] = t;
-    }
+/**
+ * Adds 16-bit integers.
+ *
+ * @param 𝑎 [w/o] receives result
+ * @param 𝑏 [r/o] supplies first input vector
+ * @param 𝑐 [r/o] supplies second input vector
+ * @note shorts can't overflow so ubsan won't report it when it happens
+ * @see paddsw()
+ * @mayalias
+ */
+void(paddw)(int16_t a[8], const int16_t b[8], const int16_t c[8]) {
+  unsigned i;
+  int16_t r[8];
+  for (i = 0; i < 8; ++i) {
+    r[i] = b[i] + c[i];
   }
-  return p + i;
+  __builtin_memcpy(a, r, 16);
 }
diff --git a/libc/intrin/paddw.h b/libc/intrin/paddw.h
new file mode 100644
index 000000000..bdad518d9
--- /dev/null
+++ b/libc/intrin/paddw.h
@@ -0,0 +1,12 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PADDW_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PADDW_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void paddw(int16_t[8], const int16_t[8], const int16_t[8]);
+
+#define paddw(A, B, C) \
+  INTRIN_SSEVEX_X_X_X_(paddw, SSE2, "paddw", INTRIN_COMMUTATIVE, A, B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PADDW_H_ */
diff --git a/libc/intrin/pagesize_init.S b/libc/intrin/pagesize_init.S
index bb9a8188e..5c1cda3fa 100644
--- a/libc/intrin/pagesize_init.S
+++ b/libc/intrin/pagesize_init.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.init.start 251,_init_pagesize
 	push	%rdi
diff --git a/libc/intrin/palignr.c b/libc/intrin/palignr.c
new file mode 100644
index 000000000..2f4474076
--- /dev/null
+++ b/libc/intrin/palignr.c
@@ -0,0 +1,43 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/palignr.h"
+#include "libc/assert.h"
+#include "libc/macros.internal.h"
+
+/**
+ * Overlaps vectors.
+ *
+ *     𝑖= 0 means 𝑐←𝑎
+ *   0<𝑖<16 means 𝑐←𝑎║𝑏
+ *     𝑖=16 means 𝑐←𝑏
+ *  16<𝑖<32 means 𝑐←𝑏║0
+ *     𝑖≥32 means 𝑐←0
+ *
+ * @param 𝑖 goes faster as constexpr
+ * @note not compatible with mmx
+ * @see pvalignr()
+ * @mayalias
+ */
+void(palignr)(void *c, const void *b, const void *a, unsigned long i) {
+  char t[48];
+  __builtin_memcpy(t, a, 16);
+  __builtin_memcpy(t + 16, b, 16);
+  __builtin_memset(t + 32, 0, 16);
+  __builtin_memcpy(c, t + MIN(i, 32), 16);
+}
diff --git a/libc/intrin/palignr.h b/libc/intrin/palignr.h
new file mode 100644
index 000000000..3995bd4a2
--- /dev/null
+++ b/libc/intrin/palignr.h
@@ -0,0 +1,45 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PALIGNR_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PALIGNR_H_
+#include "libc/intrin/macros.h"
+#include "libc/str/str.h"
+COSMOPOLITAN_C_START_
+
+void palignr(void *, const void *, const void *, unsigned long);
+
+#if !defined(__STRICT_ANSI__) && !defined(__chibicc__) && defined(__x86_64__)
+__intrin_xmm_t __palignrs(__intrin_xmm_t, __intrin_xmm_t);
+#define palignr(C, B, A, I)                                                 \
+  do {                                                                      \
+    if (__builtin_expect(!IsModeDbg() && X86_NEED(SSE) && X86_HAVE(SSSE3),  \
+                         1)) {                                              \
+      __intrin_xmm_t *Xmm0 = (void *)(C);                                   \
+      const __intrin_xmm_t *Xmm1 = (const __intrin_xmm_t *)(B);             \
+      const __intrin_xmm_t *Xmm2 = (const __intrin_xmm_t *)(A);             \
+      if (__builtin_constant_p(I)) {                                        \
+        if (!X86_NEED(AVX)) {                                               \
+          asm("palignr\t%2,%1,%0"                                           \
+              : "=x"(*Xmm0)                                                 \
+              : "x"(*Xmm2), "i"(I), "0"(*Xmm1));                            \
+        } else {                                                            \
+          asm("vpalignr\t%3,%2,%1,%0"                                       \
+              : "=x"(*Xmm0)                                                 \
+              : "x"(*Xmm1), "x"(*Xmm2), "i"(I));                            \
+        }                                                                   \
+      } else {                                                              \
+        unsigned long Vimm = (I);                                           \
+        typeof(__palignrs) *Fn;                                             \
+        if (__builtin_expect(Vimm < 32, 1)) {                               \
+          Fn = (typeof(__palignrs) *)((uintptr_t) & __palignrs + Vimm * 8); \
+          *Xmm0 = Fn(*Xmm1, *Xmm2);                                         \
+        } else {                                                            \
+          memset(Xmm0, 0, 16);                                              \
+        }                                                                   \
+      }                                                                     \
+    } else {                                                                \
+      palignr(C, B, A, I);                                                  \
+    }                                                                       \
+  } while (0)
+#endif
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PALIGNR_H_ */
diff --git a/libc/intrin/kipv6optnames.S b/libc/intrin/palignrs.S
similarity index 53%
rename from libc/intrin/kipv6optnames.S
rename to libc/intrin/palignrs.S
index a3bb86a37..9eeee072f 100644
--- a/libc/intrin/kipv6optnames.S
+++ b/libc/intrin/palignrs.S
@@ -1,7 +1,7 @@
 /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 │ vi: set noet ft=asm ts=8 sw=8 fenc=utf-8                                 :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,37 +16,110 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
-	.macro	.e e s
-	.long	\e - kIpv6Optnames
-	.long	.L\@ - kIpv6Optnames
-	.rodata.str1.1
-.L\@:	.string	"\s"
-	.previous
-	.endm
-
-	.section .rodata
-	.balign	4
-	.underrun
-kIpv6Optnames:
-	.e	IPV6_V6ONLY,"V6ONLY"
-	.e	IPV6_CHECKSUM,"CHECKSUM"
-	.e	IPV6_JOIN_GROUP,"JOIN_GROUP"
-	.e	IPV6_LEAVE_GROUP,"LEAVE_GROUP"
-	.e	IPV6_MULTICAST_HOPS,"MULTICAST_HOPS"
-	.e	IPV6_MULTICAST_IF,"MULTICAST_IF"
-	.e	IPV6_MULTICAST_LOOP,"MULTICAST_LOOP"
-	.e	IPV6_UNICAST_HOPS,"UNICAST_HOPS"
-	.e	IPV6_RECVTCLASS,"RECVTCLASS"
-	.e	IPV6_TCLASS,"TCLASS"
-	.e	IPV6_DONTFRAG,"DONTFRAG"
-	.e	IPV6_HOPLIMIT,"HOPLIMIT"
-	.e	IPV6_HOPOPTS,"HOPOPTS"
-	.e	IPV6_PKTINFO,"PKTINFO"
-	.e	IPV6_RECVRTHDR,"RECVRTHDR"
-	.e	IPV6_RTHDR,"RTHDR"
-	.long	MAGNUM_TERMINATOR
-	.endobj	kIpv6Optnames,globl,hidden
-	.overrun
+//	Jump table for palignr() with non-constexpr immediate parameter.
+//
+//	@note	needs ssse3 cf. prescott c. 2004 cf. bulldozer c. 2011
+//	@see	palignr()
+	.balign	8
+__palignrs:
+  	palignr	$0,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$1,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$2,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$3,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$4,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$5,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$6,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$7,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$8,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$9,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$10,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$11,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$12,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$13,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$14,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$15,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$16,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$17,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$18,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$19,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$20,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$21,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$22,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$23,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$24,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$25,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$26,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$27,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$28,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$29,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$30,%xmm1,%xmm0
+	ret
+	nop
+  	palignr	$31,%xmm1,%xmm0
+	ret
+	.if	. - __palignrs != 8 * 32 - 1
+	.error	"bad assemblage"
+	.endif
+	.endfn	__palignrs,globl
diff --git a/libc/intrin/pandn.c b/libc/intrin/pandn.c
new file mode 100644
index 000000000..10d91c52b
--- /dev/null
+++ b/libc/intrin/pandn.c
@@ -0,0 +1,34 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/pandn.h"
+
+/**
+ * Nands 128-bit integers.
+ *
+ * @param 𝑎 [w/o] receives result
+ * @param 𝑏 [r/o] supplies first input vector
+ * @param 𝑐 [r/o] supplies second input vector
+ * @mayalias
+ */
+void(pandn)(uint64_t a[2], const uint64_t b[2], const uint64_t c[2]) {
+  unsigned i;
+  for (i = 0; i < 2; ++i) {
+    a[i] = ~b[i] & c[i];
+  }
+}
diff --git a/libc/intrin/pandn.h b/libc/intrin/pandn.h
new file mode 100644
index 000000000..bb4687614
--- /dev/null
+++ b/libc/intrin/pandn.h
@@ -0,0 +1,12 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PANDN_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PANDN_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void pandn(uint64_t[2], const uint64_t[2], const uint64_t[2]);
+
+#define pandn(A, B, C) \
+  INTRIN_SSEVEX_X_X_X_(pandn, SSE2, "pandn", INTRIN_NONCOMMUTATIVE, A, B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PANDN_H_ */
diff --git a/libc/intrin/pcmpgtb.c b/libc/intrin/pcmpgtb.c
new file mode 100644
index 000000000..f1c895d72
--- /dev/null
+++ b/libc/intrin/pcmpgtb.c
@@ -0,0 +1,38 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/str/str.h"
+
+/**
+ * Compares signed 8-bit integers w/ greater than predicate.
+ *
+ * Note that operands can be xor'd with 0x80 for unsigned compares.
+ *
+ * @param 𝑎 [w/o] receives result
+ * @param 𝑏 [r/o] supplies first input vector
+ * @param 𝑐 [r/o] supplies second input vector
+ * @mayalias
+ */
+void(pcmpgtb)(int8_t a[16], const int8_t b[16], const int8_t c[16]) {
+  unsigned i;
+  int8_t r[16];
+  for (i = 0; i < 16; ++i)
+    r[i] = -(b[i] > c[i]);
+  __builtin_memcpy(a, r, 16);
+}
diff --git a/libc/intrin/pcmpgtb.h b/libc/intrin/pcmpgtb.h
new file mode 100644
index 000000000..043cedf4f
--- /dev/null
+++ b/libc/intrin/pcmpgtb.h
@@ -0,0 +1,12 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PCMPGTB_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PCMPGTB_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void pcmpgtb(int8_t[16], const int8_t[16], const int8_t[16]);
+
+#define pcmpgtb(A, B, C) \
+  INTRIN_SSEVEX_X_X_X_(pcmpgtb, SSE2, "pcmpgtb", INTRIN_NONCOMMUTATIVE, A, B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PCMPGTB_H_ */
diff --git a/libc/intrin/armlse.c b/libc/intrin/pcmpgtw.c
similarity index 77%
rename from libc/intrin/armlse.c
rename to libc/intrin/pcmpgtw.c
index b05bf0709..7bf94ef49 100644
--- a/libc/intrin/armlse.c
+++ b/libc/intrin/pcmpgtw.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,17 +16,21 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/getauxval.h"
-#include "libc/runtime/runtime.h"
-#include "libc/sysv/consts/auxv.h"
-#include "libc/sysv/consts/hwcap.h"
-#ifdef __aarch64__
+#include "libc/intrin/pcmpgtw.h"
+#include "libc/str/str.h"
 
-bool __aarch64_have_lse_atomics;
-
-static __attribute__((__constructor__(1))) void __aarch64_atomics_init(void) {
-  struct AuxiliaryValue x = __getauxval(AT_HWCAP);
-  __aarch64_have_lse_atomics = !!(x.value & HWCAP_ATOMICS);
+/**
+ * Compares signed 16-bit integers w/ greater than predicate.
+ *
+ * @param 𝑎 [w/o] receives result
+ * @param 𝑏 [r/o] supplies first input vector
+ * @param 𝑐 [r/o] supplies second input vector
+ * @mayalias
+ */
+void(pcmpgtw)(int16_t a[8], const int16_t b[8], const int16_t c[8]) {
+  unsigned i;
+  int16_t r[8];
+  for (i = 0; i < 8; ++i)
+    r[i] = -(b[i] > c[i]);
+  __builtin_memcpy(a, r, 16);
 }
-
-#endif /* __aarch64__ */
diff --git a/libc/intrin/pcmpgtw.h b/libc/intrin/pcmpgtw.h
new file mode 100644
index 000000000..bb9707d19
--- /dev/null
+++ b/libc/intrin/pcmpgtw.h
@@ -0,0 +1,12 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PCMPGTW_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PCMPGTW_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void pcmpgtw(int16_t[8], const int16_t[8], const int16_t[8]);
+
+#define pcmpgtw(A, B, C) \
+  INTRIN_SSEVEX_X_X_X_(pcmpgtw, SSE2, "pcmpgtw", INTRIN_NONCOMMUTATIVE, A, B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PCMPGTW_H_ */
diff --git a/libc/intrin/itimer.c b/libc/intrin/pmaddubsw.c
similarity index 68%
rename from libc/intrin/itimer.c
rename to libc/intrin/pmaddubsw.c
index 4d1825396..f2bdc9b58 100644
--- a/libc/intrin/itimer.c
+++ b/libc/intrin/pmaddubsw.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,28 +16,27 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/thread/itimer.h"
+#include "libc/intrin/pmaddubsw.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
-#include "libc/thread/posixthread.internal.h"
 
-struct IntervalTimer __itimer = {
-    .lock = PTHREAD_MUTEX_INITIALIZER,
-    .cond = PTHREAD_COND_INITIALIZER,
-};
-
-textwindows void __itimer_lock(void) {
-  _pthread_mutex_lock(&__itimer.lock);
-}
-
-textwindows void __itimer_unlock(void) {
-  _pthread_mutex_unlock(&__itimer.lock);
-}
-
-textwindows void __itimer_wipe_and_reset(void) {
-  // timers aren't inherited by forked subprocesses
-  bzero(&__itimer.it, sizeof(__itimer.it));
-  _pthread_mutex_wipe_np(&__itimer.lock);
-  bzero(&__itimer.cond, sizeof(__itimer.cond));
-  __itimer.thread = 0;
-  __itimer.once = 0;
+/**
+ * Multiplies bytes and adds adjacent results w/ short saturation.
+ *
+ *     𝑤ᵢ ← CLAMP[ 𝑏₂ᵢ𝑐₂ᵢ + 𝑏₍₂ᵢ₊₁₎𝑐₍₂ᵢ₊₁₎ ]
+ *
+ * @param 𝑤 [w/o] receives shorts
+ * @param 𝑏 [r/o] is your byte data
+ * @param 𝑐 [r/o] are your int8 coefficients
+ * @note SSSE3 w/ Prescott c. 2004, Bulldozer c. 2011
+ * @note greatest simd op, like, ever
+ * @mayalias
+ */
+void(pmaddubsw)(int16_t w[8], const uint8_t b[16], const int8_t c[16]) {
+  unsigned i;
+  for (i = 0; i < 8; ++i) {
+    w[i] = MIN(SHRT_MAX, MAX(SHRT_MIN, (c[i * 2 + 0] * b[i * 2 + 0] +
+                                        c[i * 2 + 1] * b[i * 2 + 1])));
+  }
 }
diff --git a/libc/intrin/pmaddubsw.h b/libc/intrin/pmaddubsw.h
new file mode 100644
index 000000000..5e503c56c
--- /dev/null
+++ b/libc/intrin/pmaddubsw.h
@@ -0,0 +1,13 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PMADDUBSW_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PMADDUBSW_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void pmaddubsw(int16_t[8], const uint8_t[16], const int8_t[16]);
+
+#define pmaddubsw(W, B, C)                                                   \
+  INTRIN_SSEVEX_X_X_X_(pmaddubsw, SSSE3, "pmaddubsw", INTRIN_NONCOMMUTATIVE, \
+                       W, B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PMADDUBSW_H_ */
diff --git a/libc/calls/islinuxmodern.c b/libc/intrin/pmovmskb.c
similarity index 82%
rename from libc/calls/islinuxmodern.c
rename to libc/intrin/pmovmskb.c
index 565bd8fab..0ff024d1d 100644
--- a/libc/calls/islinuxmodern.c
+++ b/libc/intrin/pmovmskb.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,12 +16,19 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/calls/syscall-sysv.internal.h"
-#include "libc/calls/syscall_support-sysv.internal.h"
-#include "libc/dce.h"
-#include "libc/errno.h"
+#include "libc/intrin/pmovmskb.h"
 
-bool32 IsLinuxModern(void) {
-  return IsLinux() && sys_close_range(-1, -2, 0) == -1 && errno == EINVAL;
+/**
+ * Turns result of byte comparison into bitmask.
+ *
+ * @param 𝑝 is byte vector to crunch
+ * @see pcmpeqb(), bsf(), etc.
+ */
+uint32_t(pmovmskb)(const uint8_t p[16]) {
+  uint32_t i, m;
+  for (m = i = 0; i < 16; ++i) {
+    if (p[i] & 0x80)
+      m |= 1 << i;
+  }
+  return m;
 }
diff --git a/libc/intrin/pmovmskb.h b/libc/intrin/pmovmskb.h
new file mode 100644
index 000000000..e17e1fb16
--- /dev/null
+++ b/libc/intrin/pmovmskb.h
@@ -0,0 +1,27 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PMOVMSKB_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PMOVMSKB_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+uint32_t pmovmskb(const uint8_t[16]);
+
+#if defined(__x86_64__) && defined(__GNUC__)
+#define pmovmskb(A)                                            \
+  ({                                                           \
+    uint32_t Mask;                                             \
+    if (!IsModeDbg() && X86_HAVE(SSE2)) {                      \
+      const __intrin_xmm_t *Xmm = (const __intrin_xmm_t *)(A); \
+      if (!X86_NEED(AVX)) {                                    \
+        asm("pmovmskb\t%1,%0" : "=r"(Mask) : "x"(*Xmm));       \
+      } else {                                                 \
+        asm("vpmovmskb\t%1,%0" : "=r"(Mask) : "x"(*Xmm));      \
+      }                                                        \
+    } else {                                                   \
+      Mask = pmovmskb(A);                                      \
+    }                                                          \
+    Mask;                                                      \
+  })
+#endif
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PMOVMSKB_H_ */
diff --git a/libc/thread/pthread_condattr_getclock.c b/libc/intrin/pmulhrsw.c
similarity index 77%
rename from libc/thread/pthread_condattr_getclock.c
rename to libc/intrin/pmulhrsw.c
index 3cb9b22d5..4326542e0 100644
--- a/libc/thread/pthread_condattr_getclock.c
+++ b/libc/intrin/pmulhrsw.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,19 +16,21 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/thread/thread.h"
+#include "libc/intrin/pmulhrsw.h"
+#include "libc/str/str.h"
 
 /**
- * Gets clock on condition variable attributes.
+ * Multiplies Q15 numbers.
  *
- * @param clock will be set to one of
- *     - `CLOCK_REALTIME` (default)
- *     - `CLOCK_MONOTONIC`
- *     - `CLOCK_REALTIME_COARSE`
- *     - `CLOCK_MONOTONIC_COARSE`
- * @return 0 on success, or error on failure
+ * @note goes fast w/ ssse3 (intel c. 2004, amd c. 2011)
+ * @note a.k.a. packed multiply high w/ round & scale
+ * @see Q2F(15,𝑥), F2Q(15,𝑥)
+ * @mayalias
  */
-int pthread_condattr_getclock(const pthread_condattr_t *attr, int *clock) {
-  *clock = attr->_clock;
-  return 0;
+void(pmulhrsw)(int16_t a[8], const int16_t b[8], const int16_t c[8]) {
+  unsigned i;
+  int16_t r[8];
+  for (i = 0; i < 8; ++i)
+    r[i] = (((b[i] * c[i]) >> 14) + 1) >> 1;
+  __builtin_memcpy(a, r, 16);
 }
diff --git a/libc/intrin/pmulhrsw.h b/libc/intrin/pmulhrsw.h
new file mode 100644
index 000000000..2182c3404
--- /dev/null
+++ b/libc/intrin/pmulhrsw.h
@@ -0,0 +1,12 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PMULHRSW_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PMULHRSW_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void pmulhrsw(int16_t a[8], const int16_t b[8], const int16_t c[8]);
+
+#define pmulhrsw(A, B, C) \
+  INTRIN_SSEVEX_X_X_X_(pmulhrsw, SSSE3, "pmulhrsw", INTRIN_COMMUTATIVE, A, B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PMULHRSW_H_ */
diff --git a/libc/intrin/printmaps.c b/libc/intrin/printmaps.c
index 7503876ed..d9eaa32af 100644
--- a/libc/intrin/printmaps.c
+++ b/libc/intrin/printmaps.c
@@ -16,92 +16,39 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/dce.h"
 #include "libc/fmt/conv.h"
 #include "libc/fmt/itoa.h"
-#include "libc/intrin/bsr.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/maps.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/auxv.h"
 
-// this will usually return 12 since x86 pml4t uses a 47 bit address
-// space in userspace, and decent arm machines uses a 48 bit address
-// space. however it could go lower on embedded devices. it can also
-// rise higher on expensive x86 machines with pml5t, if user uses it
-static int get_address_digits(int pagesz) {
-  int max_bits = 0;
-  for (struct Tree *e = tree_first(__maps.maps); e; e = tree_next(e)) {
-    struct Map *map = MAP_TREE_CONTAINER(e);
-    char *end = map->addr + ((map->size + pagesz - 1) & -pagesz);
-    int bits = bsrll((uintptr_t)end) + 1;
-    if (bits > max_bits)
-      max_bits = bits;
-  }
-  return ((max_bits + 3) & -4) / 4;
-}
-
 /**
- * Prints memory mappings known to cosmo.
+ * Prints memory mappings.
  */
 void __print_maps(size_t limit) {
+  char mappingbuf[8], sb[16];
   __maps_lock();
-  char sb[16];
-  char mappingbuf[8];
-  struct Map *last = 0;
-  int pagesz = __pagesize;
-  int gransz = __gransize;
-  int digs = get_address_digits(pagesz);
   for (struct Tree *e = tree_first(__maps.maps); e; e = tree_next(e)) {
     struct Map *map = MAP_TREE_CONTAINER(e);
-
-    // show gaps between maps
-    if (last) {
-      char *beg = last->addr + ((last->size + gransz - 1) & -gransz);
-      char *end = map->addr;
-      if (end > beg) {
-        size_t gap = end - beg;
-        sizefmt(sb, gap, 1024);
-        kprintf("%0*lx-%0*lx       %sb\n", digs, beg, digs, end, sb);
-      }
-    }
-    last = map;
-
-    // show mapping
-    kprintf("%0*lx-%0*lx %!s", digs, map->addr, digs, map->addr + map->size,
-            _DescribeMapping(mappingbuf, map->prot, map->flags));
+    kprintf("%012lx-%012lx %!s", map->addr, map->addr + map->size,
+            (DescribeMapping)(mappingbuf, map->prot, map->flags));
     sizefmt(sb, map->size, 1024);
     kprintf(" %!sb", sb);
-    if (IsWindows()) {
-      switch (map->hand) {
-        case MAPS_RESERVATION:
-          kprintf(" reservation");
-          break;
-        case MAPS_SUBREGION:
-          break;
-        case MAPS_VIRTUAL:
-          kprintf(" virtual");
-          break;
-        default:
-          kprintf(" hand=%ld", map->hand);
-          break;
-      }
-    }
+    if (map->hand && map->hand != -1)
+      kprintf(" hand=%ld", map->hand);
     if (map->iscow)
       kprintf(" cow");
     if (map->readonlyfile)
       kprintf(" readonlyfile");
     kprintf("\n");
-
-    // stay beneath our limit
     if (!--limit)
       break;
   }
-
-  // print summary
-  kprintf("# %'zu bytes in %'zu mappings\n", __maps.pages * pagesz,
+  kprintf("# %'zu bytes in %'zu mappings\n", __maps.pages * __pagesize,
           __maps.count);
   __maps_unlock();
 }
diff --git a/libc/intrin/printmapswin32.c b/libc/intrin/printmapswin32.c
deleted file mode 100644
index 8f03b7db0..000000000
--- a/libc/intrin/printmapswin32.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/fmt/conv.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
-#include "libc/nt/enum/memflags.h"
-#include "libc/nt/memory.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/sysparam.h"
-#include "libc/str/str.h"
-
-static const struct DescribeFlags kNtMemState[] = {
-    {kNtMemCommit, "Commit"},    //
-    {kNtMemFree, "Free"},        //
-    {kNtMemReserve, "Reserve"},  //
-};
-
-const char *DescribeNtMemState(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kNtMemState, ARRAYLEN(kNtMemState), "kNtMem",
-                        x);
-}
-
-static const struct DescribeFlags kNtMemType[] = {
-    {kNtMemImage, "Image"},      //
-    {kNtMemMapped, "Mapped"},    //
-    {kNtMemPrivate, "Private"},  //
-};
-
-const char *DescribeNtMemType(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kNtMemType, ARRAYLEN(kNtMemType), "kNtMem", x);
-}
-
-void __print_maps_win32(int64_t hProcess, const char *addr, size_t size) {
-  char *p, b[5][64];
-  struct NtMemoryBasicInformation mi;
-  kprintf("%-12s %-12s %10s %16s %16s %32s %32s\n", "Allocation", "BaseAddress",
-          "RegionSize", "State", "Type", "AllocationProtect", "Protect");
-  for (p = 0;; p = (char *)mi.BaseAddress + mi.RegionSize) {
-    bzero(&mi, sizeof(mi));
-    if (!VirtualQueryEx(hProcess, p, &mi, sizeof(mi)))
-      break;
-    sizefmt(b[0], mi.RegionSize, 1024);
-    kprintf("%.12lx %.12lx %10s %16s %16s %32s %32s%s\n", mi.AllocationBase,
-            mi.BaseAddress, b[0], DescribeNtMemState(b[1], mi.State),
-            DescribeNtMemType(b[2], mi.Type),
-            _DescribeNtPageFlags(b[3], mi.AllocationProtect),
-            _DescribeNtPageFlags(b[4], mi.Protect),
-            (mi.State != kNtMemFree &&
-             MAX(addr, (const char *)mi.BaseAddress) <
-                 MIN(addr + size, (const char *)mi.BaseAddress + mi.RegionSize))
-                ? " [OVERLAPS]"
-                : "");
-  }
-}
diff --git a/libc/intrin/virtualalloc.c b/libc/intrin/psraw.c
similarity index 80%
rename from libc/intrin/virtualalloc.c
rename to libc/intrin/psraw.c
index 6993d8154..0ea62c129 100644
--- a/libc/intrin/virtualalloc.c
+++ b/libc/intrin/psraw.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,14 +16,20 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/nt/memory.h"
-#include "libc/nt/runtime.h"
+#include "libc/intrin/psraw.h"
 
 /**
- * Allocates memory on The New Technology.
+ * Divides shorts by two power.
+ *
+ * @note c needs to be a literal, asmconstexpr, or linkconstsym
+ * @note arithmetic shift right will sign extend negatives
+ * @mayalias
  */
-textwindows void *VirtualAlloc(void *lpAddress, uint64_t dwSize,
-                               uint32_t flAllocationType, uint32_t flProtect) {
-  return VirtualAllocEx(GetCurrentProcess(), lpAddress, dwSize,
-                        flAllocationType, flProtect);
+void(psraw)(int16_t a[8], const int16_t b[8], unsigned char k) {
+  unsigned i;
+  if (k > 15)
+    k = 15;
+  for (i = 0; i < 8; ++i) {
+    a[i] = b[i] >> k;
+  }
 }
diff --git a/libc/intrin/psraw.h b/libc/intrin/psraw.h
new file mode 100644
index 000000000..4814b073c
--- /dev/null
+++ b/libc/intrin/psraw.h
@@ -0,0 +1,14 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PSRAW_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PSRAW_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void psraw(int16_t[8], const int16_t[8], unsigned char) libcesque;
+void psrawv(int16_t[8], const int16_t[8], const uint64_t[2]) libcesque;
+
+#define psraw(A, B, I) INTRIN_SSEVEX_X_I_(psraw, SSE2, "psraw", A, B, I)
+#define psrawv(A, B, C) \
+  INTRIN_SSEVEX_X_X_X_(psrawv, SSE2, "psraw", INTRIN_NONCOMMUTATIVE, A, B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PSRAW_H_ */
diff --git a/libc/intrin/psrawv.c b/libc/intrin/psrawv.c
new file mode 100644
index 000000000..5409db233
--- /dev/null
+++ b/libc/intrin/psrawv.c
@@ -0,0 +1,34 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/psraw.h"
+
+/**
+ * Divides shorts by two power.
+ *
+ * @note arithmetic shift right will sign extend negatives
+ * @mayalias
+ */
+void(psrawv)(int16_t a[8], const int16_t b[8], const uint64_t c[2]) {
+  unsigned i;
+  unsigned char k;
+  k = c[0] > 15 ? 15 : c[0];
+  for (i = 0; i < 8; ++i) {
+    a[i] = b[i] >> k;
+  }
+}
diff --git a/libc/thread/pthread_spin_lock.c b/libc/intrin/pthread_atfork.c
similarity index 55%
rename from libc/thread/pthread_spin_lock.c
rename to libc/intrin/pthread_atfork.c
index ff7175a0a..5093ed594 100644
--- a/libc/thread/pthread_spin_lock.c
+++ b/libc/intrin/pthread_atfork.c
@@ -16,48 +16,62 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/atomic.h"
-#include "libc/intrin/strace.h"
+#include "libc/intrin/weaken.h"
+#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
 /**
- * Acquires spin lock, e.g.
+ * Registers fork() handlers.
  *
- *     pthread_spinlock_t lock;
- *     pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
- *     pthread_spin_lock(&lock);
- *     // do work...
- *     pthread_spin_unlock(&lock);
- *     pthread_spin_destroy(&lock);
+ * Parent and child functions are called in the same order they're
+ * registered. Prepare functions are called in reverse order.
  *
- * This function has undefined behavior when `spin` wasn't intialized or
- * was destroyed, and if the lock is already held by the calling thread.
+ * Here's an example of how pthread_atfork() can be used:
  *
- * You can debug locks the acquisition of locks by building your program
- * with `cosmocc -mdbg` and passing the `--strace` flag to your program.
- * This will cause a line to be logged each time a mutex or spin lock is
- * locked or unlocked. When locking, this is printed after the lock gets
- * acquired. The entry to the lock operation will be logged too but only
- * if the lock couldn't be immediately acquired. Lock logging works best
- * when `mutex` refers to a static variable, in which case its name will
- * be printed in the log.
+ *     static struct {
+ *       pthread_once_t once;
+ *       pthread_mutex_t lock;
+ *       // data structures...
+ *     } g_lib;
  *
+ *     static void lib_wipe(void) {
+ *       pthread_mutex_init(&g_lib.lock, 0);
+ *     }
+ *
+ *     static void lib_lock(void) {
+ *       pthread_mutex_lock(&g_lib.lock);
+ *     }
+ *
+ *     static void lib_unlock(void) {
+ *       pthread_mutex_unlock(&g_lib.lock);
+ *     }
+ *
+ *     static void lib_setup(void) {
+ *       lib_wipe();
+ *       pthread_atfork(lib_lock, lib_unlock, lib_wipe);
+ *     }
+ *
+ *     static void lib_init(void) {
+ *       pthread_once(&g_lib.once, lib_setup);
+ *     }
+ *
+ *     void lib(void) {
+ *       lib_init();
+ *       lib_lock();
+ *       // do stuff...
+ *       lib_unlock();
+ *     }
+ *
+ * @param prepare is run by fork() before forking happens
+ * @param parent is run by fork() after forking happens in parent process
+ * @param child is run by fork() after forking happens in childe process
  * @return 0 on success, or errno on error
- * @see pthread_spin_trylock
- * @see pthread_spin_unlock
- * @see pthread_spin_init
+ * @raise ENOMEM if we require more vespene gas
  */
-errno_t pthread_spin_lock(pthread_spinlock_t *spin) {
-  if (atomic_exchange_explicit(&spin->_lock, 1, memory_order_acquire)) {
-    LOCKTRACE("acquiring pthread_spin_lock(%t)...", spin);
-    for (;;) {
-      for (;;)
-        if (!atomic_load_explicit(&spin->_lock, memory_order_relaxed))
-          break;
-      if (!atomic_exchange_explicit(&spin->_lock, 1, memory_order_acquire))
-        break;
-    }
+int pthread_atfork(atfork_f prepare, atfork_f parent, atfork_f child) {
+  if (_weaken(_pthread_atfork)) {
+    return _weaken(_pthread_atfork)(prepare, parent, child);
+  } else {
+    return 0;
   }
-  LOCKTRACE("pthread_spin_lock(%t)", spin);
-  return 0;
 }
diff --git a/libc/intrin/cursor.c b/libc/intrin/pthread_atfork_actual.c
similarity index 51%
rename from libc/intrin/cursor.c
rename to libc/intrin/pthread_atfork_actual.c
index b89b1be27..815517206 100644
--- a/libc/intrin/cursor.c
+++ b/libc/intrin/pthread_atfork_actual.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,46 +16,86 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
+#include "libc/atomic.h"
+#include "libc/calls/state.internal.h"
+#include "libc/cosmo.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/fds.h"
+#include "libc/intrin/dll.h"
+#include "libc/intrin/strace.h"
+#include "libc/macros.internal.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/runtime/runtime.h"
+#include "libc/str/str.h"
 #include "libc/thread/posixthread.internal.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/tls.h"
 
-struct Cursor *__cursor_new(void) {
-  struct Cursor *c;
-  if ((c = _mapanon(sizeof(struct Cursor)))) {
-    if ((c->shared = _mapshared(sizeof(struct CursorShared)))) {
-      c->shared->lock = (pthread_mutex_t)PTHREAD_SHARED_MUTEX_INITIALIZER_NP;
-    } else {
-      munmap(c, sizeof(struct Cursor));
-      c = 0;
+struct AtFork {
+  struct AtFork *p[2];
+  atfork_f f[3];
+};
+
+static struct AtForks {
+  pthread_spinlock_t lock;
+  struct AtFork *list;
+  struct AtFork pool[64];
+  atomic_int allocated;
+} _atforks;
+
+static void _pthread_onfork(int i, const char *op) {
+  struct AtFork *a;
+  if (!i)
+    pthread_spin_lock(&_atforks.lock);
+  for (a = _atforks.list; a; a = a->p[!i]) {
+    if (a->f[i]) {
+      STRACE("pthread_atfork(%s, %t)", op, a->f[i]);
+      a->f[i]();
     }
+    _atforks.list = a;
   }
-  return c;
+  if (i)
+    pthread_spin_unlock(&_atforks.lock);
 }
 
-void __cursor_ref(struct Cursor *c) {
-  if (!c)
-    return;
-  unassert(atomic_fetch_add_explicit(&c->refs, 1, memory_order_relaxed) >= 0);
+void _pthread_onfork_prepare(void) {
+  _pthread_onfork(0, "prepare");
 }
 
-int __cursor_unref(struct Cursor *c) {
-  if (!c)
+void _pthread_onfork_parent(void) {
+  _pthread_onfork(1, "parent");
+}
+
+void _pthread_onfork_child(void) {
+  _pthread_onfork(2, "child");
+}
+
+static struct AtFork *_pthread_atfork_alloc(void) {
+  int i, n = ARRAYLEN(_atforks.pool);
+  if (atomic_load_explicit(&_atforks.allocated, memory_order_relaxed) < n &&
+      (i = atomic_fetch_add(&_atforks.allocated, 1)) < n) {
+    return _atforks.pool + i;
+  } else {
     return 0;
-  if (atomic_fetch_sub_explicit(&c->refs, 1, memory_order_release))
-    return 0;
-  atomic_thread_fence(memory_order_acquire);
-  int rc = munmap(c->shared, sizeof(struct CursorShared));
-  rc |= munmap(c, sizeof(struct Cursor));
+  }
+}
+
+int _pthread_atfork(atfork_f prepare, atfork_f parent, atfork_f child) {
+  int rc;
+  struct AtFork *a;
+  if (!(a = _pthread_atfork_alloc()))
+    return ENOMEM;
+  a->f[0] = prepare;
+  a->f[1] = parent;
+  a->f[2] = child;
+  pthread_spin_lock(&_atforks.lock);
+  a->p[0] = 0;
+  a->p[1] = _atforks.list;
+  if (_atforks.list)
+    _atforks.list->p[0] = a;
+  _atforks.list = a;
+  pthread_spin_unlock(&_atforks.lock);
+  rc = 0;
   return rc;
 }
-
-void __cursor_lock(struct Cursor *c) {
-  _pthread_mutex_lock(&c->shared->lock);
-}
-
-void __cursor_unlock(struct Cursor *c) {
-  _pthread_mutex_unlock(&c->shared->lock);
-}
diff --git a/libc/thread/pthread_mutex_destroy.c b/libc/intrin/pthread_mutex_destroy.c
similarity index 100%
rename from libc/thread/pthread_mutex_destroy.c
rename to libc/intrin/pthread_mutex_destroy.c
diff --git a/libc/thread/pthread_mutex_init.c b/libc/intrin/pthread_mutex_init.c
similarity index 97%
rename from libc/thread/pthread_mutex_init.c
rename to libc/intrin/pthread_mutex_init.c
index 1ce34716b..8801f2372 100644
--- a/libc/thread/pthread_mutex_init.c
+++ b/libc/intrin/pthread_mutex_init.c
@@ -24,7 +24,7 @@
  *     pthread_mutex_t lock;
  *     pthread_mutexattr_t attr;
  *     pthread_mutexattr_init(&attr);
- *     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
+ *     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL);
  *     pthread_mutex_init(&lock, &attr);
  *     pthread_mutexattr_destroy(&attr);
  *     // ...
diff --git a/libc/intrin/pthread_mutex_lock.c b/libc/intrin/pthread_mutex_lock.c
index 8ee1daa12..cfde8a623 100644
--- a/libc/intrin/pthread_mutex_lock.c
+++ b/libc/intrin/pthread_mutex_lock.c
@@ -17,219 +17,143 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/blockcancel.internal.h"
+#include "libc/calls/calls.h"
 #include "libc/calls/state.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
-#include "libc/thread/tls.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
-static errno_t pthread_mutex_lock_normal_success(pthread_mutex_t *mutex,
-                                                 uint64_t word) {
-  if (IsModeDbg() || MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK) {
-    __deadlock_track(mutex, MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK);
-    __deadlock_record(mutex, MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK);
+static void pthread_mutex_lock_naive(pthread_mutex_t *mutex, uint64_t word) {
+  int backoff = 0;
+  uint64_t lock;
+  for (;;) {
+    word = MUTEX_UNLOCK(word);
+    lock = MUTEX_LOCK(word);
+    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                              memory_order_acquire,
+                                              memory_order_relaxed))
+      return;
+    backoff = pthread_delay_np(mutex, backoff);
   }
-  return 0;
 }
 
 // see "take 3" algorithm in "futexes are tricky" by ulrich drepper
 // slightly improved to attempt acquiring multiple times b4 syscall
-static int pthread_mutex_lock_drepper(pthread_mutex_t *mutex, uint64_t word,
-                                      bool is_trylock) {
-  int val = 0;
-  if (atomic_compare_exchange_strong_explicit(
-          &mutex->_futex, &val, 1, memory_order_acquire, memory_order_acquire))
-    return pthread_mutex_lock_normal_success(mutex, word);
-  if (is_trylock)
-    return EBUSY;
-  LOCKTRACE("acquiring pthread_mutex_lock_drepper(%t)...", mutex);
-  if (val == 1)
-    val = atomic_exchange_explicit(&mutex->_futex, 2, memory_order_acquire);
-  BLOCK_CANCELATION;
-  while (val > 0) {
-    cosmo_futex_wait(&mutex->_futex, 2, MUTEX_PSHARED(word), 0, 0);
-    val = atomic_exchange_explicit(&mutex->_futex, 2, memory_order_acquire);
+static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
+  int word;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
+  }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    BLOCK_CANCELATION;
+    _weaken(nsync_futex_wait_)(futex, 2, pshare, 0);
+    ALLOW_CANCELATION;
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
   }
-  ALLOW_CANCELATION;
-  return pthread_mutex_lock_normal_success(mutex, word);
 }
 
 static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
-                                            uint64_t word, bool is_trylock) {
+                                            uint64_t word) {
   uint64_t lock;
   int backoff = 0;
-  int me = atomic_load_explicit(&__get_tls()->tib_ptid, memory_order_relaxed);
-  bool once = false;
+  int me = gettid();
   for (;;) {
     if (MUTEX_OWNER(word) == me) {
-      if (MUTEX_DEPTH(word) < MUTEX_DEPTH_MAX) {
-        if (atomic_compare_exchange_weak_explicit(
-                &mutex->_word, &word, MUTEX_INC_DEPTH(word),
-                memory_order_relaxed, memory_order_relaxed))
-          return 0;
-        continue;
+      if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
+        if (MUTEX_DEPTH(word) < MUTEX_DEPTH_MAX) {
+          if (atomic_compare_exchange_weak_explicit(
+                  &mutex->_word, &word, MUTEX_INC_DEPTH(word),
+                  memory_order_relaxed, memory_order_relaxed))
+            return 0;
+          continue;
+        } else {
+          return EAGAIN;
+        }
       } else {
-        return EAGAIN;
+        return EDEADLK;
       }
     }
-    if (IsModeDbg())
-      __deadlock_check(mutex, 0);
     word = MUTEX_UNLOCK(word);
     lock = MUTEX_LOCK(word);
     lock = MUTEX_SET_OWNER(lock, me);
     if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
                                               memory_order_acquire,
                                               memory_order_relaxed)) {
-      if (IsModeDbg()) {
-        __deadlock_track(mutex, 0);
-        __deadlock_record(mutex, 0);
-      }
       mutex->_pid = __pid;
       return 0;
     }
-    if (is_trylock)
-      return EBUSY;
-    if (!once) {
-      LOCKTRACE("acquiring pthread_mutex_lock_recursive(%t)...", mutex);
-      once = true;
-    }
-    for (;;) {
-      word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-      if (MUTEX_OWNER(word) == me)
-        break;
-      if (word == MUTEX_UNLOCK(word))
-        break;
-      backoff = pthread_delay_np(mutex, backoff);
-    }
+    backoff = pthread_delay_np(mutex, backoff);
   }
 }
 
-#if PTHREAD_USE_NSYNC
-static errno_t pthread_mutex_lock_recursive_nsync(pthread_mutex_t *mutex,
-                                                  uint64_t word,
-                                                  bool is_trylock) {
-  int me = atomic_load_explicit(&__get_tls()->tib_ptid, memory_order_relaxed);
-  for (;;) {
-    if (MUTEX_OWNER(word) == me) {
-      if (MUTEX_DEPTH(word) < MUTEX_DEPTH_MAX) {
-        if (atomic_compare_exchange_weak_explicit(
-                &mutex->_word, &word, MUTEX_INC_DEPTH(word),
-                memory_order_relaxed, memory_order_relaxed))
-          return 0;
-        continue;
-      } else {
-        return EAGAIN;
-      }
-    }
-    if (IsModeDbg())
-      __deadlock_check(mutex, 0);
-    if (!is_trylock) {
-      _weaken(nsync_mu_lock)((nsync_mu *)mutex->_nsync);
-    } else {
-      if (!_weaken(nsync_mu_trylock)((nsync_mu *)mutex->_nsync))
-        return EBUSY;
-    }
-    if (IsModeDbg()) {
-      __deadlock_track(mutex, 0);
-      __deadlock_record(mutex, 0);
-    }
-    word = MUTEX_UNLOCK(word);
-    word = MUTEX_LOCK(word);
-    word = MUTEX_SET_OWNER(word, me);
-    mutex->_word = word;
-    mutex->_pid = __pid;
-    return 0;
-  }
-}
-#endif
+static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
+  uint64_t word;
 
-static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex,
-                                       bool is_trylock) {
-  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-  // handle recursive mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_RECURSIVE) {
-#if PTHREAD_USE_NSYNC
-    if (_weaken(nsync_mu_lock) &&
-        MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE) {
-      return pthread_mutex_lock_recursive_nsync(mutex, word, is_trylock);
-    } else {
-      return pthread_mutex_lock_recursive(mutex, word, is_trylock);
-    }
-#else
-    return pthread_mutex_lock_recursive(mutex, word, is_trylock);
-#endif
-  }
-
-  // check if normal mutex is already owned by calling thread
-  if (!is_trylock &&
-      (MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK ||
-       (IsModeDbg() && MUTEX_TYPE(word) == PTHREAD_MUTEX_DEFAULT))) {
-    if (__deadlock_tracked(mutex) == 1) {
-      if (IsModeDbg() && MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
-        kprintf("error: attempted to lock non-recursive mutex that's already "
-                "held by the calling thread: %t\n",
-                mutex);
-        DebugBreak();
-      }
-      return EDEADLK;
-    }
-  }
-
-  // check if locking will create cycle in lock graph
-  if (IsModeDbg() || MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK)
-    if (__deadlock_check(mutex, MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK))
-      return EDEADLK;
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
 
 #if PTHREAD_USE_NSYNC
   // use superior mutexes if possible
-  if (MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
       _weaken(nsync_mu_lock)) {
-    // on apple silicon we should just put our faith in ulock
-    // otherwise *nsync gets struck down by the eye of sauron
-    if (!IsXnuSilicon()) {
-      if (!is_trylock) {
-        _weaken(nsync_mu_lock)((nsync_mu *)mutex->_nsync);
-        return pthread_mutex_lock_normal_success(mutex, word);
-      } else {
-        if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex->_nsync))
-          return pthread_mutex_lock_normal_success(mutex, word);
-        return EBUSY;
-      }
-    }
+    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
+    return 0;
   }
 #endif
 
-  // isc licensed non-recursive mutex implementation
-  return pthread_mutex_lock_drepper(mutex, word, is_trylock);
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      pthread_mutex_lock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_lock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_lock_recursive(mutex, word);
 }
 
 /**
- * Locks mutex, e.g.
+ * Locks mutex.
+ *
+ * Here's an example of using a normal mutex:
  *
  *     pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  *     pthread_mutex_lock(&lock);
  *     // do work...
  *     pthread_mutex_unlock(&lock);
+ *     pthread_mutex_destroy(&lock);
  *
- * The long way to do that is:
+ * Cosmopolitan permits succinct notation for normal mutexes:
+ *
+ *     pthread_mutex_t lock = {0};
+ *     pthread_mutex_lock(&lock);
+ *     // do work...
+ *     pthread_mutex_unlock(&lock);
+ *
+ * Here's an example of the proper way to do recursive mutexes:
  *
  *     pthread_mutex_t lock;
  *     pthread_mutexattr_t attr;
  *     pthread_mutexattr_init(&attr);
- *     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
- *     pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_PRIVATE);
+ *     pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
  *     pthread_mutex_init(&lock, &attr);
  *     pthread_mutexattr_destroy(&attr);
  *     pthread_mutex_lock(&lock);
@@ -237,101 +161,17 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex,
  *     pthread_mutex_unlock(&lock);
  *     pthread_mutex_destroy(&lock);
  *
- * The following non-POSIX initializers are also provided by cosmo libc:
- *
- * - `PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP`
- * - `PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP`
- * - `PTHREAD_NORMAL_MUTEX_INITIALIZER_NP`
- *
- * Locking a mutex that's already locked by the calling thread will make
- * the thread hang indefinitely, i.e. it's a deadlock condition. You can
- * use `PTHREAD_MUTEX_RECURSIVE` to allow recursive locking, which could
- * result in somewhat less performance. An alternative solution is using
- * the `PTHREAD_MUTEX_ERRORCHECK` mode, which raises `EDEADLK` for that.
- *
- * If a thread locks a mutex while other mutexes are already locked then
- * you need to observe a consistent global ordering, otherwise deadlocks
- * might occur. The Cosmopolitan runtime can detect these cycles quickly
- * so you can fix your code before it becomes an issue. With error check
- * mode, an EPERM will be returned. If your app is using `cosmocc -mdbg`
- * then an error message will be printed including the demangled symbols
- * of the mutexes in the strongly connected component that was detected.
- * Please note that, even for debug builds mutexes set to explicitly use
- * the `PTHREAD_MUTEX_ERRORCHECK` mode will return an error code instead
- * which means the cosmo debug mode only influences undefined behaviors.
- *
- * Cosmopolitan only supports error checking on mutexes stored in static
- * memory, i.e. your `mutex` pointer must point inside the .data or .bss
- * sections of your executable. When compiling your programs using -mdbg
- * all your locks will gain error checking automatically. When deadlocks
- * are detected an error message will be printed and a SIGTRAP signal is
- * raised, which may be ignored to force EDEADLK and EPERM to be raised.
- *
- * Using `cosmocc -mdbg` also enhances `--strace` with information about
- * mutexes. First, locks and unlocks will be logged. Since the lock line
- * only appears after the lock is acquired, that might mean you'll never
- * get an indication about a lock that takes a very long time to acquire
- * so, whenever a lock can't immediately be acquired, a second line gets
- * printed *before* the lock is acquired to let you know that the thread
- * is waiting for a particular lock. If your mutex object resides within
- * static memory, then its demangled symbol name will be printed. If you
- * call ShowCrashReports() at the beginning of your main() function then
- * you'll also see a backtrace when a locking violation occurs. When the
- * symbols in the violation error messages show up as numbers, and it is
- * desirable to see demangled symbols without enabling full crash report
- * functionality the GetSymbolTable() function may be called for effect.
- *
- * If you use `PTHREAD_MUTEX_NORMAL`, instead of `PTHREAD_MUTEX_DEFAULT`
- * then deadlocking is actually defined behavior according to POSIX.1 so
- * the helpfulness of `cosmocc -mdbg` will be somewhat weakened.
- *
- * If your `mutex` object resides in `MAP_SHARED` memory, then undefined
- * behavior will happen unless you use `PTHREAD_PROCESS_SHARED` mode, if
- * the lock is used by multiple processes.
- *
- * This function does nothing when the process is in vfork() mode.
+ * This function does nothing in vfork() children.
  *
  * @return 0 on success, or error number on failure
- * @raise EDEADLK if mutex is recursive and locked by another thread
- * @raise EDEADLK if mutex is non-recursive and locked by current thread
- * @raise EDEADLK if cycle is detected in global nested lock graph
- * @raise EAGAIN if maximum recursive locks is exceeded
  * @see pthread_spin_lock()
  * @vforksafe
  */
-errno_t _pthread_mutex_lock(pthread_mutex_t *mutex) {
-  if (__tls_enabled && !__vforked) {
-    errno_t err = pthread_mutex_lock_impl(mutex, false);
-    LOCKTRACE("pthread_mutex_lock(%t) → %s", mutex, DescribeErrno(err));
-    return err;
-  } else {
-    LOCKTRACE("skipping pthread_mutex_lock(%t) due to runtime state", mutex);
+errno_t pthread_mutex_lock(pthread_mutex_t *mutex) {
+  if (__vforked)
     return 0;
-  }
+  LOCKTRACE("acquiring %t...", mutex);
+  errno_t err = pthread_mutex_lock_impl(mutex);
+  LOCKTRACE("pthread_mutex_lock(%t) → %s", mutex, DescribeErrno(err));
+  return err;
 }
-
-/**
- * Attempts acquiring lock.
- *
- * Unlike pthread_mutex_lock() this function won't block and instead
- * returns an error immediately if the lock couldn't be acquired.
- *
- * @return 0 if lock was acquired, otherwise an errno
- * @raise EBUSY if lock is currently held by another thread
- * @raise EAGAIN if maximum number of recursive locks is held
- * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
- *     current thread already holds this mutex
- */
-errno_t _pthread_mutex_trylock(pthread_mutex_t *mutex) {
-  if (__tls_enabled && !__vforked) {
-    errno_t err = pthread_mutex_lock_impl(mutex, true);
-    LOCKTRACE("pthread_mutex_trylock(%t) → %s", mutex, DescribeErrno(err));
-    return err;
-  } else {
-    LOCKTRACE("skipping pthread_mutex_trylock(%t) due to runtime state", mutex);
-    return 0;
-  }
-}
-
-__weak_reference(_pthread_mutex_lock, pthread_mutex_lock);
-__weak_reference(_pthread_mutex_trylock, pthread_mutex_trylock);
diff --git a/libc/intrin/pthread_mutex_trylock.c b/libc/intrin/pthread_mutex_trylock.c
new file mode 100644
index 000000000..5fd06a078
--- /dev/null
+++ b/libc/intrin/pthread_mutex_trylock.c
@@ -0,0 +1,125 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/intrin/weaken.h"
+#include "libc/runtime/internal.h"
+#include "libc/thread/lock.h"
+#include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
+#include "third_party/nsync/mu.h"
+
+static errno_t pthread_mutex_trylock_naive(pthread_mutex_t *mutex,
+                                           uint64_t word) {
+  uint64_t lock;
+  word = MUTEX_UNLOCK(word);
+  lock = MUTEX_LOCK(word);
+  if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                            memory_order_acquire,
+                                            memory_order_relaxed))
+    return 0;
+  return EBUSY;
+}
+
+static errno_t pthread_mutex_trylock_drepper(atomic_int *futex) {
+  int word = 0;
+  if (atomic_compare_exchange_strong_explicit(
+          futex, &word, 1, memory_order_acquire, memory_order_acquire))
+    return 0;
+  return EBUSY;
+}
+
+static errno_t pthread_mutex_trylock_recursive(pthread_mutex_t *mutex,
+                                               uint64_t word) {
+  uint64_t lock;
+  int me = gettid();
+  for (;;) {
+    if (MUTEX_OWNER(word) == me) {
+      if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
+        if (MUTEX_DEPTH(word) < MUTEX_DEPTH_MAX) {
+          if (atomic_compare_exchange_weak_explicit(
+                  &mutex->_word, &word, MUTEX_INC_DEPTH(word),
+                  memory_order_relaxed, memory_order_relaxed))
+            return 0;
+          continue;
+        } else {
+          return EAGAIN;
+        }
+      } else {
+        return EDEADLK;
+      }
+    }
+    word = MUTEX_UNLOCK(word);
+    lock = MUTEX_LOCK(word);
+    lock = MUTEX_SET_OWNER(lock, me);
+    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                              memory_order_acquire,
+                                              memory_order_relaxed)) {
+      mutex->_pid = __pid;
+      return 0;
+    }
+    return EBUSY;
+  }
+}
+
+/**
+ * Attempts acquiring lock.
+ *
+ * Unlike pthread_mutex_lock() this function won't block and instead
+ * returns an error immediately if the lock couldn't be acquired.
+ *
+ * @return 0 if lock was acquired, otherwise an errno
+ * @raise EAGAIN if maximum number of recursive locks is held
+ * @raise EBUSY if lock is currently held in read or write mode
+ * @raise EINVAL if `mutex` doesn't refer to an initialized lock
+ * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
+ *     current thread already holds this mutex
+ */
+errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
+
+  // get current state of lock
+  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_trylock)) {
+    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
+      return 0;
+    } else {
+      return EBUSY;
+    }
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      return pthread_mutex_trylock_drepper(&mutex->_futex);
+    } else {
+      return pthread_mutex_trylock_naive(mutex, word);
+    }
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_trylock_recursive(mutex, word);
+}
diff --git a/libc/intrin/pthread_mutex_unlock.c b/libc/intrin/pthread_mutex_unlock.c
index 25525dccb..fcb549dcb 100644
--- a/libc/intrin/pthread_mutex_unlock.c
+++ b/libc/intrin/pthread_mutex_unlock.c
@@ -17,34 +17,34 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/calls/state.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
-#include "libc/thread/tls.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
+static void pthread_mutex_unlock_naive(pthread_mutex_t *mutex, uint64_t word) {
+  uint64_t lock = MUTEX_UNLOCK(word);
+  atomic_store_explicit(&mutex->_word, lock, memory_order_release);
+}
+
 // see "take 3" algorithm in "futexes are tricky" by ulrich drepper
 static void pthread_mutex_unlock_drepper(atomic_int *futex, char pshare) {
   int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
   if (word == 2) {
     atomic_store_explicit(futex, 0, memory_order_release);
-    cosmo_futex_wake(futex, 1, pshare);
+    _weaken(nsync_futex_wake_)(futex, 1, pshare);
   }
 }
 
 static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
                                               uint64_t word) {
-  int me = atomic_load_explicit(&__get_tls()->tib_ptid, memory_order_relaxed);
+  int me = gettid();
   for (;;) {
 
     // we allow unlocking an initialized lock that wasn't locked, but we
@@ -65,118 +65,48 @@ static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
     // actually unlock the mutex
     if (atomic_compare_exchange_weak_explicit(
             &mutex->_word, &word, MUTEX_UNLOCK(word), memory_order_release,
-            memory_order_relaxed)) {
-      if (IsModeDbg())
-        __deadlock_untrack(mutex);
+            memory_order_relaxed))
       return 0;
-    }
   }
 }
 
-#if PTHREAD_USE_NSYNC
-static errno_t pthread_mutex_unlock_recursive_nsync(pthread_mutex_t *mutex,
-                                                    uint64_t word) {
-  int me = atomic_load_explicit(&__get_tls()->tib_ptid, memory_order_relaxed);
-  for (;;) {
-
-    // we allow unlocking an initialized lock that wasn't locked, but we
-    // don't allow unlocking a lock held by another thread, or unlocking
-    // recursive locks from a forked child, since it should be re-init'd
-    if (MUTEX_OWNER(word) && (MUTEX_OWNER(word) != me || mutex->_pid != __pid))
-      return EPERM;
-
-    // check if this is a nested lock with signal safety
-    if (MUTEX_DEPTH(word)) {
-      if (atomic_compare_exchange_strong_explicit(
-              &mutex->_word, &word, MUTEX_DEC_DEPTH(word), memory_order_relaxed,
-              memory_order_relaxed))
-        return 0;
-      continue;
-    }
-
-    // actually unlock the mutex
-    mutex->_word = MUTEX_UNLOCK(word);
-    _weaken(nsync_mu_unlock)((nsync_mu *)mutex->_nsync);
-    if (IsModeDbg())
-      __deadlock_untrack(mutex);
-    return 0;
-  }
-}
-#endif
-
-static errno_t pthread_mutex_unlock_impl(pthread_mutex_t *mutex) {
-  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-  // check if mutex isn't held by calling thread
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK || IsModeDbg()) {
-    if (__deadlock_tracked(mutex) == 0) {
-      if (IsModeDbg() && MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
-        kprintf("error: unlock mutex not owned by calling thread: %t\n", mutex);
-        DebugBreak();
-      }
-      return EPERM;
-    }
-  }
-
-  // handle recursive mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_RECURSIVE) {
-#if PTHREAD_USE_NSYNC
-    if (_weaken(nsync_mu_unlock) &&
-        MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE) {
-      return pthread_mutex_unlock_recursive_nsync(mutex, word);
-    } else {
-      return pthread_mutex_unlock_recursive(mutex, word);
-    }
-#else
-    return pthread_mutex_unlock_recursive(mutex, word);
-#endif
-  }
-
-#if PTHREAD_USE_NSYNC
-  // use superior mutexes if possible
-  if (MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_unlock)) {
-    // on apple silicon we should just put our faith in ulock
-    // otherwise *nsync gets struck down by the eye of sauron
-    if (!IsXnuSilicon()) {
-      _weaken(nsync_mu_unlock)((nsync_mu *)mutex->_nsync);
-      if (MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK || IsModeDbg())
-        __deadlock_untrack(mutex);
-      return 0;
-    }
-  }
-#endif
-
-  // implement barebones normal mutexes
-  pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK || IsModeDbg())
-    __deadlock_untrack(mutex);
-  return 0;
-}
-
 /**
  * Releases mutex.
  *
- * POSIX.1 says it's undefined behavior to unlock a mutex that wasn't
- * locked by the calling thread. Therefore, if `mutex` isn't locked, or
- * it is locked and the thing that locked it was a different thread or
- * process, then you should expect your program to deadlock or crash.
- *
  * This function does nothing in vfork() children.
  *
  * @return 0 on success or error number on failure
- * @raises EPERM if mutex ownership isn't acceptable
+ * @raises EPERM if in error check mode and not owned by caller
  * @vforksafe
  */
-errno_t _pthread_mutex_unlock(pthread_mutex_t *mutex) {
-  if (__tls_enabled && !__vforked) {
-    errno_t err = pthread_mutex_unlock_impl(mutex);
-    LOCKTRACE("pthread_mutex_unlock(%t) → %s", mutex, DescribeErrno(err));
-    return err;
-  } else {
-    LOCKTRACE("skipping pthread_mutex_lock(%t) due to runtime state", mutex);
+errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_unlock)) {
+    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
     return 0;
   }
-}
+#endif
 
-__weak_reference(_pthread_mutex_unlock, pthread_mutex_unlock);
+  // implement barebones normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wake_)) {
+      pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_unlock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_unlock_recursive(mutex, word);
+}
diff --git a/libc/thread/pthread_mutexattr_destroy.c b/libc/intrin/pthread_mutexattr_destroy.c
similarity index 100%
rename from libc/thread/pthread_mutexattr_destroy.c
rename to libc/intrin/pthread_mutexattr_destroy.c
diff --git a/libc/thread/pthread_mutexattr_getpshared.c b/libc/intrin/pthread_mutexattr_getpshared.c
similarity index 100%
rename from libc/thread/pthread_mutexattr_getpshared.c
rename to libc/intrin/pthread_mutexattr_getpshared.c
diff --git a/libc/thread/pthread_mutexattr_gettype.c b/libc/intrin/pthread_mutexattr_gettype.c
similarity index 98%
rename from libc/thread/pthread_mutexattr_gettype.c
rename to libc/intrin/pthread_mutexattr_gettype.c
index 6e4caa149..9b85dca0d 100644
--- a/libc/thread/pthread_mutexattr_gettype.c
+++ b/libc/intrin/pthread_mutexattr_gettype.c
@@ -23,7 +23,6 @@
  * Gets mutex type.
  *
  * @param type will be set to one of these on success
- *     - `PTHREAD_MUTEX_DEFAULT`
  *     - `PTHREAD_MUTEX_NORMAL`
  *     - `PTHREAD_MUTEX_RECURSIVE`
  *     - `PTHREAD_MUTEX_ERRORCHECK`
diff --git a/libc/thread/pthread_mutexattr_init.c b/libc/intrin/pthread_mutexattr_init.c
similarity index 100%
rename from libc/thread/pthread_mutexattr_init.c
rename to libc/intrin/pthread_mutexattr_init.c
diff --git a/libc/thread/pthread_mutexattr_setpshared.c b/libc/intrin/pthread_mutexattr_setpshared.c
similarity index 100%
rename from libc/thread/pthread_mutexattr_setpshared.c
rename to libc/intrin/pthread_mutexattr_setpshared.c
diff --git a/libc/thread/pthread_mutexattr_settype.c b/libc/intrin/pthread_mutexattr_settype.c
similarity index 98%
rename from libc/thread/pthread_mutexattr_settype.c
rename to libc/intrin/pthread_mutexattr_settype.c
index aefe262f4..96dc080de 100644
--- a/libc/thread/pthread_mutexattr_settype.c
+++ b/libc/intrin/pthread_mutexattr_settype.c
@@ -24,8 +24,8 @@
  * Sets mutex type.
  *
  * @param type can be one of
- *     - `PTHREAD_MUTEX_DEFAULT`
  *     - `PTHREAD_MUTEX_NORMAL`
+ *     - `PTHREAD_MUTEX_DEFAULT`
  *     - `PTHREAD_MUTEX_RECURSIVE`
  *     - `PTHREAD_MUTEX_ERRORCHECK`
  * @return 0 on success, or error on failure
@@ -33,7 +33,6 @@
  */
 errno_t pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type) {
   switch (type) {
-    case PTHREAD_MUTEX_DEFAULT:
     case PTHREAD_MUTEX_NORMAL:
     case PTHREAD_MUTEX_RECURSIVE:
     case PTHREAD_MUTEX_ERRORCHECK:
diff --git a/libc/intrin/pthread_orphan_np.c b/libc/intrin/pthread_orphan_np.c
index 1575502f1..68e2a9f5f 100644
--- a/libc/intrin/pthread_orphan_np.c
+++ b/libc/intrin/pthread_orphan_np.c
@@ -16,8 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/intrin/atomic.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
@@ -30,6 +28,5 @@ int pthread_orphan_np(void) {
   res = _pthread_list == _pthread_list->prev &&
         _pthread_list == _pthread_list->next;
   _pthread_unlock();
-  unassert(!res || atomic_load(&_pthread_count) <= 1);
   return res;
 }
diff --git a/libc/intrin/pthread_pause_np.c b/libc/intrin/pthread_pause_np.c
index 8f5c399c1..ceb85d242 100644
--- a/libc/intrin/pthread_pause_np.c
+++ b/libc/intrin/pthread_pause_np.c
@@ -16,23 +16,15 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#ifdef _MSC_VER
-#include <intrin.h>
-#else
-#include <xmmintrin.h>
-#endif
+#include "libc/thread/thread.h"
 
 /**
  * Yields hyperthread.
  */
 void pthread_pause_np(void) {
 #if defined(__GNUC__) && defined(__aarch64__)
-  __asm__("yield");
-#elif defined(__x86_64__) || defined(__i386__)
-  _mm_pause();
-#elif defined(__GNUC__) && (defined(__PPC__) || defined(__PPC64__))
-  __asm__("or 27,27,27");
-#else
-  // do nothing
+  __asm__ volatile("yield");
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+  __asm__ volatile("pause");
 #endif
 }
diff --git a/libc/intrin/pthread_setcancelstate.c b/libc/intrin/pthread_setcancelstate.c
index 6e2a35f35..9ce15824d 100644
--- a/libc/intrin/pthread_setcancelstate.c
+++ b/libc/intrin/pthread_setcancelstate.c
@@ -47,30 +47,28 @@
  * @asyncsignalsafe
  */
 errno_t pthread_setcancelstate(int state, int *oldstate) {
-  int old;
   errno_t err;
   struct PosixThread *pt;
   if (__tls_enabled && (pt = _pthread_self())) {
-    if (pt->pt_flags & PT_NOCANCEL) {
-      old = PTHREAD_CANCEL_DISABLE;
-    } else if (pt->pt_flags & PT_MASKED) {
-      old = PTHREAD_CANCEL_MASKED;
-    } else {
-      old = PTHREAD_CANCEL_ENABLE;
-    }
     switch (state) {
       case PTHREAD_CANCEL_ENABLE:
-        pt->pt_flags &= ~(PT_NOCANCEL | PT_MASKED);
-        err = 0;
-        break;
       case PTHREAD_CANCEL_DISABLE:
-        pt->pt_flags &= ~(PT_NOCANCEL | PT_MASKED);
-        pt->pt_flags |= PT_NOCANCEL;
-        err = 0;
-        break;
       case PTHREAD_CANCEL_MASKED:
+        if (oldstate) {
+          if (pt->pt_flags & PT_NOCANCEL) {
+            *oldstate = PTHREAD_CANCEL_DISABLE;
+          } else if (pt->pt_flags & PT_MASKED) {
+            *oldstate = PTHREAD_CANCEL_MASKED;
+          } else {
+            *oldstate = PTHREAD_CANCEL_ENABLE;
+          }
+        }
         pt->pt_flags &= ~(PT_NOCANCEL | PT_MASKED);
-        pt->pt_flags |= PT_MASKED;
+        if (state == PTHREAD_CANCEL_MASKED) {
+          pt->pt_flags |= PT_MASKED;
+        } else if (state == PTHREAD_CANCEL_DISABLE) {
+          pt->pt_flags |= PT_NOCANCEL;
+        }
         err = 0;
         break;
       default:
@@ -78,13 +76,12 @@ errno_t pthread_setcancelstate(int state, int *oldstate) {
         break;
     }
   } else {
-    old = 0;
+    if (oldstate) {
+      *oldstate = 0;
+    }
     err = 0;
   }
-  if (!err)
-    if (oldstate)
-      *oldstate = old;
-#if IsModeDbg() && 0
+#if IsModeDbg()
   STRACE("pthread_setcancelstate(%s, [%s]) → %s",
          DescribeCancelState(0, &state), DescribeCancelState(err, oldstate),
          DescribeErrno(err));
diff --git a/libc/thread/pthread_spin_destroy.c b/libc/intrin/pthread_spin_destroy.c
similarity index 100%
rename from libc/thread/pthread_spin_destroy.c
rename to libc/intrin/pthread_spin_destroy.c
diff --git a/libc/thread/pthread_spin_init.c b/libc/intrin/pthread_spin_init.c
similarity index 100%
rename from libc/thread/pthread_spin_init.c
rename to libc/intrin/pthread_spin_init.c
diff --git a/libc/thread/pthread_mutex_consistent.c b/libc/intrin/pthread_spin_lock.c
similarity index 73%
rename from libc/thread/pthread_mutex_consistent.c
rename to libc/intrin/pthread_spin_lock.c
index 44a5fd5f6..4ce73139a 100644
--- a/libc/thread/pthread_mutex_consistent.c
+++ b/libc/intrin/pthread_spin_lock.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,26 +16,30 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cosmo.h"
-#include "libc/dce.h"
 #include "libc/intrin/atomic.h"
-#include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
 
 /**
- * Recovers mutex whose owner died.
+ * Acquires spin lock, e.g.
+ *
+ *     pthread_spinlock_t lock;
+ *     pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
+ *     pthread_spin_lock(&lock);
+ *     // do work...
+ *     pthread_spin_unlock(&lock);
+ *     pthread_spin_destroy(&lock);
+ *
+ * This function has undefined behavior when `spin` wasn't intialized,
+ * was destroyed, or if the lock's already held by the calling thread.
  *
  * @return 0 on success, or errno on error
+ * @see pthread_spin_trylock
+ * @see pthread_spin_unlock
+ * @see pthread_spin_init
  */
-int pthread_mutex_consistent(pthread_mutex_t *mutex) {
-
-  // The POSIX concept of robust mutexes is a bit cray. So let's change
-  // things up a bit. Rather than implementing all those goofy behaviors
-  // we shall simply use this function to weasel around the ownership
-  // check in pthread_mutex_unlock().
-  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-  if (IsModeDbg() || MUTEX_TYPE(word) == PTHREAD_MUTEX_ERRORCHECK)
-    __deadlock_track(mutex, 0);
-
+errno_t pthread_spin_lock(pthread_spinlock_t *spin) {
+  while (atomic_exchange_explicit(&spin->_lock, 1, memory_order_acquire)) {
+    pthread_pause_np();
+  }
   return 0;
 }
diff --git a/libc/thread/pthread_spin_trylock.c b/libc/intrin/pthread_spin_trylock.c
similarity index 100%
rename from libc/thread/pthread_spin_trylock.c
rename to libc/intrin/pthread_spin_trylock.c
diff --git a/libc/thread/pthread_spin_unlock.c b/libc/intrin/pthread_spin_unlock.c
similarity index 96%
rename from libc/thread/pthread_spin_unlock.c
rename to libc/intrin/pthread_spin_unlock.c
index fb881ce33..927de65c4 100644
--- a/libc/thread/pthread_spin_unlock.c
+++ b/libc/intrin/pthread_spin_unlock.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/strace.h"
 #include "libc/thread/thread.h"
 
 /**
@@ -30,7 +29,6 @@
  * @see pthread_spin_lock
  */
 errno_t pthread_spin_unlock(pthread_spinlock_t *spin) {
-  LOCKTRACE("pthread_spin_unlock(%t)", spin);
   atomic_store_explicit(&spin->_lock, 0, memory_order_release);
   return 0;
 }
diff --git a/libc/intrin/pthread_tid.c b/libc/intrin/pthread_tid.c
index fb9d22f44..4f7553e9a 100644
--- a/libc/intrin/pthread_tid.c
+++ b/libc/intrin/pthread_tid.c
@@ -21,25 +21,9 @@
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
-//
-// - tib_ptid: always guaranteed to be non-zero in thread itself. on
-//             some platforms (e.g. xnu) the parent thread and other
-//             threads may need to wait for this value to be set. this
-//             is generally the value you want to read to get the tid.
-//
-// - tib_ctid: starts off as -1. once thread starts, it's set to the
-//             thread's tid before calling the thread callback. when
-//             thread is done executing, this is set to zero, and then
-//             this address is futex woken, in case the parent thread or
-//             any other thread is waiting on its completion. when a
-//             thread wants to read its own tid, it shouldn't use this,
-//             because the thread might need to do things after clearing
-//             its own tib_ctid (see pthread_exit() for static thread).
-//
 int _pthread_tid(struct PosixThread *pt) {
   int tid = 0;
-  while (pt && !(tid = atomic_load_explicit(&pt->tib->tib_ptid,
-                                            memory_order_acquire)))
+  while (pt && !(tid = atomic_load_explicit(&pt->ptid, memory_order_acquire)))
     pthread_yield_np();
   return tid;
 }
diff --git a/libc/intrin/pthreadlock.c b/libc/intrin/pthreadlock.c
index 085f5bba0..dccad6479 100644
--- a/libc/intrin/pthreadlock.c
+++ b/libc/intrin/pthreadlock.c
@@ -16,15 +16,14 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdalign.h"
 #include "libc/thread/posixthread.internal.h"
 
-alignas(64) pthread_mutex_t __pthread_lock_obj = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t _pthread_lock_obj = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
 
 void _pthread_lock(void) {
-  _pthread_mutex_lock(&__pthread_lock_obj);
+  pthread_mutex_lock(&_pthread_lock_obj);
 }
 
 void _pthread_unlock(void) {
-  _pthread_mutex_unlock(&__pthread_lock_obj);
+  pthread_mutex_unlock(&_pthread_lock_obj);
 }
diff --git a/libc/intrin/punpckhbw.c b/libc/intrin/punpckhbw.c
new file mode 100644
index 000000000..151530c77
--- /dev/null
+++ b/libc/intrin/punpckhbw.c
@@ -0,0 +1,46 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/punpckhbw.h"
+
+/**
+ * Interleaves high bytes.
+ *
+ * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved
+ * @param 𝑏 [r/o] supplies eight words
+ * @param 𝑐 [r/o] supplies eight words
+ * @mayalias
+ */
+void(punpckhbw)(uint8_t a[16], const uint8_t b[16], const uint8_t c[16]) {
+  a[0x0] = b[0x8];
+  a[0x1] = c[0x8];
+  a[0x2] = b[0x9];
+  a[0x3] = c[0x9];
+  a[0x4] = b[0xa];
+  a[0x5] = c[0xa];
+  a[0x6] = b[0xb];
+  a[0x7] = c[0xb];
+  a[0x8] = b[0xc];
+  a[0x9] = c[0xc];
+  a[0xa] = b[0xd];
+  a[0xb] = c[0xd];
+  a[0xc] = b[0xe];
+  a[0xd] = c[0xe];
+  a[0xe] = b[0xf];
+  a[0xf] = c[0xf];
+}
diff --git a/libc/intrin/punpckhbw.h b/libc/intrin/punpckhbw.h
new file mode 100644
index 000000000..306cb1597
--- /dev/null
+++ b/libc/intrin/punpckhbw.h
@@ -0,0 +1,13 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKHBW_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKHBW_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void punpckhbw(uint8_t[16], const uint8_t[16], const uint8_t[16]);
+
+#define punpckhbw(A, B, C)                                                     \
+  INTRIN_SSEVEX_X_X_X_(punpckhbw, SSE2, "punpckhbw", INTRIN_NONCOMMUTATIVE, A, \
+                       B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PUNPCKHBW_H_ */
diff --git a/libc/mem/levenshtein.c b/libc/intrin/punpckhwd.c
similarity index 66%
rename from libc/mem/levenshtein.c
rename to libc/intrin/punpckhwd.c
index f1e1cc131..5aad8b10b 100644
--- a/libc/mem/levenshtein.c
+++ b/libc/intrin/punpckhwd.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,34 +16,34 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/mem/alg.h"
-#include "libc/mem/mem.h"
+#include "libc/intrin/punpckhwd.h"
 #include "libc/str/str.h"
 
-#define MIN3(a, b, c) \
-  ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
-
 /**
- * Computes similarity between two strings.
+ * Interleaves high words.
+ *
+ *          0  1  2  3  4  5  6  7
+ *       B  aa bb cc dd EE FF GG HH
+ *       C  ii jj kk ll MM NN OO PP
+ *                      └┤ └┤ └┤ └┤
+ *              ┌────────┘  │  │  │
+ *              │     ┌─────┘  │  │
+ *              │     │     ┌──┘  │
+ *          ┌───┤ ┌───┤ ┌───┤ ┌───┤
+ *     → A  EE MM FF NN GG OO HH PP
+ *
+ * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved
+ * @param 𝑏 [r/o] supplies eight words
+ * @param 𝑐 [r/o] supplies eight words
+ * @mayalias
  */
-double levenshtein(const char *s0, const char *s1) {
-  int n0 = strlen(s0) + 1;
-  int n1 = strlen(s1) + 1;
-  int *col = (int *)malloc(n1 * sizeof(int));
-  int *pol = (int *)malloc(n1 * sizeof(int));
-  for (int i = 0; i < n1; i++)
-    pol[i] = i;
-  for (int i = 0; i < n0; i++) {
-    col[0] = i;
-    for (int j = 1; j < n1; j++)
-      col[j] = MIN3(1 + col[j - 1], 1 + pol[j],
-                    pol[j - 1] + !(i > 0 && s0[i - 1] == s1[j - 1]));
-    int *t = col;
-    col = pol;
-    pol = t;
-  }
-  int dist = pol[n1 - 1];
-  free(pol);
-  free(col);
-  return 1 - dist / ((n0 > n1 ? n0 : n1) - 1.);
+void(punpckhwd)(uint16_t a[8], const uint16_t b[8], const uint16_t c[8]) {
+  a[0] = b[4];
+  a[1] = c[4];
+  a[2] = b[5];
+  a[3] = c[5];
+  a[4] = b[6];
+  a[5] = c[6];
+  a[6] = b[7];
+  a[7] = c[7];
 }
diff --git a/libc/intrin/punpckhwd.h b/libc/intrin/punpckhwd.h
new file mode 100644
index 000000000..548e6ee92
--- /dev/null
+++ b/libc/intrin/punpckhwd.h
@@ -0,0 +1,13 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKHWD_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKHWD_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void punpckhwd(uint16_t[8], const uint16_t[8], const uint16_t[8]);
+
+#define punpckhwd(A, B, C)                                                     \
+  INTRIN_SSEVEX_X_X_X_(punpckhwd, SSE2, "punpckhwd", INTRIN_NONCOMMUTATIVE, A, \
+                       B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PUNPCKHWD_H_ */
diff --git a/libc/intrin/punpcklbw.c b/libc/intrin/punpcklbw.c
new file mode 100644
index 000000000..559d8a553
--- /dev/null
+++ b/libc/intrin/punpcklbw.c
@@ -0,0 +1,56 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/punpcklbw.h"
+
+/**
+ * Interleaves low bytes.
+ *
+ *          0 1 2 3 4 5 6 7 8 9 A B C D E F
+ *       B  A B C D E F G H i j k l m n o p
+ *       C  Q R S T U V W X y z α σ π μ τ ε
+ *          │ │ │ │ │ │ │ │
+ *          │ │ │ └─────┐
+ *          │ │ └───┐   │  etc...
+ *          │ └─┐   │   │
+ *          ├─┐ ├─┐ ├─┐ ├─┐
+ *     → A  A Q B R C S D T E U F V G W H X
+ *
+ * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved
+ * @param 𝑏 [r/o] supplies eight words
+ * @param 𝑐 [r/o] supplies eight words
+ * @mayalias
+ */
+void(punpcklbw)(uint8_t a[16], const uint8_t b[16], const uint8_t c[16]) {
+  a[0xf] = c[7];
+  a[0xe] = b[7];
+  a[0xd] = c[6];
+  a[0xc] = b[6];
+  a[0xb] = c[5];
+  a[0xa] = b[5];
+  a[0x9] = c[4];
+  a[0x8] = b[4];
+  a[0x7] = c[3];
+  a[0x6] = b[3];
+  a[0x5] = c[2];
+  a[0x4] = b[2];
+  a[0x3] = c[1];
+  a[0x2] = b[1];
+  a[0x1] = c[0];
+  a[0x0] = b[0];
+}
diff --git a/libc/intrin/punpcklbw.h b/libc/intrin/punpcklbw.h
new file mode 100644
index 000000000..40c9cef89
--- /dev/null
+++ b/libc/intrin/punpcklbw.h
@@ -0,0 +1,13 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKLBW_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKLBW_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void punpcklbw(uint8_t[16], const uint8_t[16], const uint8_t[16]);
+
+#define punpcklbw(A, B, C)                                                     \
+  INTRIN_SSEVEX_X_X_X_(punpcklbw, SSE2, "punpcklbw", INTRIN_NONCOMMUTATIVE, A, \
+                       B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PUNPCKLBW_H_ */
diff --git a/libc/thread/pthread_condattr_setclock.c b/libc/intrin/punpcklwd.c
similarity index 66%
rename from libc/thread/pthread_condattr_setclock.c
rename to libc/intrin/punpcklwd.c
index 7d5176b02..11936c456 100644
--- a/libc/thread/pthread_condattr_setclock.c
+++ b/libc/intrin/punpcklwd.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,27 +16,33 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/errno.h"
-#include "libc/sysv/consts/clock.h"
-#include "libc/thread/thread.h"
+#include "libc/intrin/punpcklwd.h"
 
 /**
- * Sets clock for condition variable.
+ * Interleaves low words.
  *
- * @param clock can be one of
- *     - `CLOCK_REALTIME` (default)
- *     - `CLOCK_MONOTONIC`
- *     - `CLOCK_REALTIME_COARSE`
- *     - `CLOCK_MONOTONIC_COARSE`
- * @return 0 on success, or error on failure
- * @raises EINVAL if `clock` is invalid
+ *          0  1  2  3  4  5  6  7
+ *       B  AA BB CC DD ee ff gg hh
+ *       C  II JJ KK LL mm nn oo pp
+ *          ├┘ ├┘ ├┘ ├┘
+ *          │  │  │  └────────┐
+ *          │  │  └─────┐     │
+ *          │  └──┐     │     │
+ *          ├───┐ ├───┐ ├───┐ ├───┐
+ *     → A  AA II BB JJ CC KK DD LL
+ *
+ * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved
+ * @param 𝑏 [r/o] supplies eight words
+ * @param 𝑐 [r/o] supplies eight words
+ * @mayalias
  */
-int pthread_condattr_setclock(pthread_condattr_t *attr, int clock) {
-  if (clock != CLOCK_REALTIME &&         //
-      clock != CLOCK_REALTIME_COARSE &&  //
-      clock != CLOCK_MONOTONIC &&        //
-      clock != CLOCK_MONOTONIC_COARSE)
-    return EINVAL;
-  attr->_clock = clock;
-  return 0;
+void(punpcklwd)(uint16_t a[8], const uint16_t b[8], const uint16_t c[8]) {
+  a[7] = c[3];
+  a[6] = b[3];
+  a[5] = c[2];
+  a[4] = b[2];
+  a[3] = c[1];
+  a[2] = b[1];
+  a[1] = c[0];
+  a[0] = b[0];
 }
diff --git a/libc/intrin/punpcklwd.h b/libc/intrin/punpcklwd.h
new file mode 100644
index 000000000..e286ba9c2
--- /dev/null
+++ b/libc/intrin/punpcklwd.h
@@ -0,0 +1,13 @@
+#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKLWD_H_
+#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKLWD_H_
+#include "libc/intrin/macros.h"
+COSMOPOLITAN_C_START_
+
+void punpcklwd(uint16_t[8], const uint16_t[8], const uint16_t[8]);
+
+#define punpcklwd(A, B, C)                                                     \
+  INTRIN_SSEVEX_X_X_X_(punpcklwd, SSE2, "punpcklwd", INTRIN_NONCOMMUTATIVE, A, \
+                       B, C)
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_INTRIN_PUNPCKLWD_H_ */
diff --git a/libc/intrin/pushpop.h b/libc/intrin/pushpop.h
index 17f551ac4..2f693e542 100644
--- a/libc/intrin/pushpop.h
+++ b/libc/intrin/pushpop.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_LIBC_BITS_PUSHPOP_H_
 #define COSMOPOLITAN_LIBC_BITS_PUSHPOP_H_
 #ifdef _COSMO_SOURCE
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #if !defined(__GNUC__) || defined(__STRICT_ANSI__) || !defined(__x86_64__) || \
     !defined(__MNO_RED_ZONE__)
diff --git a/libc/intrin/rand64.c b/libc/intrin/rand64.c
index 53252327e..73308daa2 100644
--- a/libc/intrin/rand64.c
+++ b/libc/intrin/rand64.c
@@ -22,25 +22,12 @@
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/auxv.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 
 static int _rand64_pid;
 static unsigned __int128 _rand64_pool;
-static pthread_mutex_t __rand64_lock_obj = PTHREAD_MUTEX_INITIALIZER;
-
-void __rand64_lock(void) {
-  _pthread_mutex_lock(&__rand64_lock_obj);
-}
-
-void __rand64_unlock(void) {
-  _pthread_mutex_unlock(&__rand64_lock_obj);
-}
-
-void __rand64_wipe(void) {
-  _pthread_mutex_wipe_np(&__rand64_lock_obj);
-}
+pthread_mutex_t _rand64_lock_obj = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
 
 /**
  * Returns nondeterministic random data.
@@ -51,11 +38,12 @@ void __rand64_wipe(void) {
  *
  * @see rdseed(), rdrand(), rand(), random(), rngset()
  * @note this function passes bigcrush and practrand
+ * @asyncsignalsafe
  */
 uint64_t _rand64(void) {
   void *p;
   uint128_t s;
-  __rand64_lock();
+  pthread_mutex_lock(&_rand64_lock_obj);
   if (__pid == _rand64_pid) {
     s = _rand64_pool;  // normal path
   } else {
@@ -76,6 +64,6 @@ uint64_t _rand64(void) {
     _rand64_pid = __pid;
   }
   _rand64_pool = (s *= 15750249268501108917ull);  // lemur64
-  __rand64_unlock();
+  pthread_mutex_unlock(&_rand64_lock_obj);
   return s >> 64;
 }
diff --git a/libc/intrin/reservefd.c b/libc/intrin/reservefd.c
index 272590122..e751c1dd6 100644
--- a/libc/intrin/reservefd.c
+++ b/libc/intrin/reservefd.c
@@ -18,11 +18,11 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/cmpxchg.h"
 #include "libc/intrin/extend.h"
-#include "libc/intrin/fds.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/map.h"
@@ -47,7 +47,7 @@ int __ensurefds_unlocked(int fd) {
 
 /**
  * Grows file descriptor array memory if needed.
- * @asyncsignalsafe if signals are blocked
+ * @asyncsignalsafe
  */
 int __ensurefds(int fd) {
   __fds_lock();
@@ -72,7 +72,7 @@ int __reservefd_unlocked(int start) {
     if (_cmpxchg(&g_fds.p[fd].kind, kFdEmpty, kFdReserved)) {
       // g_fds.f isn't guarded by our mutex
       do {
-        f2 = MIN(fd + 1, f1);
+        f2 = MAX(fd + 1, f1);
       } while (!atomic_compare_exchange_weak_explicit(
           &g_fds.f, &f1, f2, memory_order_release, memory_order_relaxed));
       return fd;
@@ -82,7 +82,7 @@ int __reservefd_unlocked(int start) {
 
 /**
  * Finds open file descriptor slot.
- * @asyncsignalsafe if signals are blocked
+ * @asyncsignalsafe
  */
 int __reservefd(int start) {
   int fd;
diff --git a/libc/intrin/rlimit.h b/libc/intrin/rlimit.h
deleted file mode 100644
index 05d0fb96e..000000000
--- a/libc/intrin/rlimit.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_INTRIN_RLIMIT_H_
-#define COSMOPOLITAN_LIBC_INTRIN_RLIMIT_H_
-#include "libc/calls/struct/rlimit.h"
-COSMOPOLITAN_C_START_
-
-void __rlimit_stack_set(struct rlimit);
-struct rlimit __rlimit_stack_get(void);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_INTRIN_RLIMIT_H_ */
diff --git a/libc/intrin/rlimitstack.c b/libc/intrin/rlimitstack.c
deleted file mode 100644
index 66f47c64a..000000000
--- a/libc/intrin/rlimitstack.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
-#include "libc/calls/struct/rlimit.h"
-#include "libc/calls/struct/rlimit.internal.h"
-#include "libc/cosmo.h"
-#include "libc/dce.h"
-#include "libc/intrin/cxaatexit.h"
-#include "libc/intrin/lockless.h"
-#include "libc/intrin/rlimit.h"
-#include "libc/runtime/stack.h"
-#include "libc/sysv/consts/rlim.h"
-#include "libc/sysv/consts/rlimit.h"
-
-struct atomic_rlimit {
-  atomic_ulong cur;
-  atomic_ulong max;
-  atomic_uint once;
-  atomic_uint gen;
-};
-
-static struct atomic_rlimit __rlimit_stack;
-
-static void __rlimit_stack_init(void) {
-  struct rlimit rlim;
-  if (IsWindows()) {
-    rlim.rlim_cur = GetStaticStackSize();
-    rlim.rlim_max = -1;  // RLIM_INFINITY in consts.sh
-  } else {
-    sys_getrlimit(RLIMIT_STACK, &rlim);
-  }
-  atomic_init(&__rlimit_stack.cur, rlim.rlim_cur);
-  atomic_init(&__rlimit_stack.max, rlim.rlim_max);
-}
-
-struct rlimit __rlimit_stack_get(void) {
-  unsigned gen;
-  unsigned long cur, max;
-  cosmo_once(&__rlimit_stack.once, __rlimit_stack_init);
-  gen = lockless_read_begin(&__rlimit_stack.gen);
-  do {
-    cur = atomic_load_explicit(&__rlimit_stack.cur, memory_order_acquire);
-    max = atomic_load_explicit(&__rlimit_stack.max, memory_order_acquire);
-  } while (!lockless_read_end(&__rlimit_stack.gen, &gen));
-  return (struct rlimit){cur, max};
-}
-
-void __rlimit_stack_set(struct rlimit rlim) {
-  unsigned gen;
-  unsigned long cur, max;
-  cosmo_once(&__rlimit_stack.once, __rlimit_stack_init);
-  __cxa_lock();
-  cur = rlim.rlim_cur;
-  max = rlim.rlim_max;
-  gen = lockless_write_begin(&__rlimit_stack.gen);
-  atomic_store_explicit(&__rlimit_stack.cur, cur, memory_order_release);
-  atomic_store_explicit(&__rlimit_stack.max, max, memory_order_release);
-  lockless_write_end(&__rlimit_stack.gen, gen);
-  __cxa_unlock();
-}
diff --git a/libc/intrin/safemacros.h b/libc/intrin/safemacros.h
index 8fe6613f4..443843f37 100644
--- a/libc/intrin/safemacros.h
+++ b/libc/intrin/safemacros.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_BITS_SAFEMACROS_H_
 #define COSMOPOLITAN_LIBC_BITS_SAFEMACROS_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 COSMOPOLITAN_C_START_
 
diff --git a/libc/intrin/sig.c b/libc/intrin/sig.c
index 0cf56902d..8679b811d 100644
--- a/libc/intrin/sig.c
+++ b/libc/intrin/sig.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -17,812 +17,38 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/sysv/consts/sig.h"
-#include "ape/sections.internal.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
-#include "libc/calls/state.internal.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/siginfo.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/struct/ucontext.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/calls/ucontext.h"
 #include "libc/dce.h"
-#include "libc/errno.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/bsf.h"
-#include "libc/intrin/describebacktrace.h"
-#include "libc/intrin/dll.h"
-#include "libc/intrin/maps.h"
-#include "libc/intrin/nomultics.h"
-#include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/log/libfatal.internal.h"
-#include "libc/mem/alloca.h"
-#include "libc/nt/console.h"
-#include "libc/nt/enum/context.h"
-#include "libc/nt/enum/exceptionhandleractions.h"
-#include "libc/nt/enum/pageflags.h"
-#include "libc/nt/enum/processcreationflags.h"
-#include "libc/nt/enum/signal.h"
-#include "libc/nt/enum/status.h"
-#include "libc/nt/events.h"
-#include "libc/nt/memory.h"
-#include "libc/nt/runtime.h"
-#include "libc/nt/signals.h"
-#include "libc/nt/struct/memorybasicinformation.h"
-#include "libc/nt/struct/ntexceptionpointers.h"
-#include "libc/nt/synchronization.h"
-#include "libc/nt/thread.h"
-#include "libc/runtime/internal.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/symbols.internal.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/sysv/consts/sa.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/ss.h"
-#include "libc/sysv/consts/termios.h"
-#include "libc/thread/posixthread.internal.h"
-#ifdef __x86_64__
+#include "libc/thread/tls.h"
 
-/**
- * @fileoverview Cosmopolitan Signals for Windows.
- */
+struct Signals __sig;
 
-#define STKSZ 65536
-#define HAIRY textwindows dontinstrument dontinline
-
-struct SignalFrame {
-  unsigned rva;
-  unsigned flags;
-  siginfo_t si;
-  ucontext_t ctx;
-};
-
-__msabi extern typeof(GetStdHandle) *const __imp_GetStdHandle;
-__msabi extern typeof(VirtualProtectEx) *const __imp_VirtualProtectEx;
-__msabi extern typeof(VirtualQuery) *const __imp_VirtualQuery;
-__msabi extern typeof(WriteFile) *const __imp_WriteFile;
-
-atomic_int __sig_worker_state;
-
-textwindows static bool __sig_ignored_by_default(int sig) {
-  return sig == SIGURG ||   //
-         sig == SIGCONT ||  //
-         sig == SIGCHLD ||  //
-         sig == SIGWINCH;
-}
-
-textwindows bool __sig_ignored(int sig) {
-  return __sighandrvas[sig] == (intptr_t)SIG_IGN ||
-         (__sighandrvas[sig] == (intptr_t)SIG_DFL &&
-          __sig_ignored_by_default(sig));
-}
-
-textwindows void __sig_delete(int sig) {
-  struct Dll *e;
-  atomic_fetch_and_explicit(__sig.process, ~(1ull << (sig - 1)),
-                            memory_order_relaxed);
-  _pthread_lock();
-  for (e = dll_last(_pthread_list); e; e = dll_prev(_pthread_list, e))
-    atomic_fetch_and_explicit(&POSIXTHREAD_CONTAINER(e)->tib->tib_sigpending,
-                              ~(1ull << (sig - 1)), memory_order_relaxed);
-  _pthread_unlock();
-}
-
-textwindows static int __sig_getter(atomic_ulong *sigs, sigset_t masked) {
-  int sig;
-  sigset_t bit, pending, deliverable;
-  for (;;) {
-    pending = atomic_load_explicit(sigs, memory_order_acquire);
-    if ((deliverable = pending & ~masked)) {
-      sig = bsfl(deliverable) + 1;
-      bit = 1ull << (sig - 1);
-      if (atomic_fetch_and_explicit(sigs, ~bit, memory_order_acq_rel) & bit)
-        return sig;
-    } else {
+sigset_t __sig_block(void) {
+  if (IsWindows() || IsMetal()) {
+    if (__tls_enabled)
+      return atomic_exchange_explicit(&__get_tls()->tib_sigmask, -1,
+                                      memory_order_acquire);
+    else
       return 0;
-    }
-  }
-}
-
-textwindows int __sig_get(sigset_t masked) {
-  int sig;
-  if (!(sig = __sig_getter(&__get_tls()->tib_sigpending, masked)))
-    sig = __sig_getter(__sig.process, masked);
-  return sig;
-}
-
-HAIRY static bool __sig_should_use_altstack(unsigned flags,
-                                            struct CosmoTib *tib) {
-  if (!(flags & SA_ONSTACK))
-    return false;  // signal handler didn't enable it
-  if (!tib->tib_sigstack_size)
-    return false;  // sigaltstack() wasn't installed on this thread
-  if (tib->tib_sigstack_flags & SS_DISABLE)
-    return false;  // sigaltstack() on this thread was disabled by user
-  char *bp = __builtin_frame_address(0);
-  if (tib->tib_sigstack_addr <= bp &&
-      bp <= tib->tib_sigstack_addr + tib->tib_sigstack_size)
-    return false;  // we're already on the alternate stack
-  return true;
-}
-
-forceinline wontreturn void __sig_terminate(int sig) {
-  TerminateThisProcess(sig);
-}
-
-textwindows static bool __sig_wake(struct PosixThread *pt, int sig) {
-  atomic_int *blocker;
-  blocker = atomic_load_explicit(&pt->pt_blocker, memory_order_acquire);
-  if (!blocker)
-    return false;
-  // threads can create semaphores on an as-needed basis
-  if (blocker == PT_BLOCKER_EVENT) {
-    STRACE("%G set %d's event object", sig, _pthread_tid(pt));
-    SetEvent(pt->pt_event);
-    return !!atomic_load_explicit(&pt->pt_blocker, memory_order_acquire);
-  }
-  // all other blocking ops that aren't overlap should use futexes
-  // we force restartable futexes to churn by waking w/o releasing
-  STRACE("%G waking %d's futex", sig, _pthread_tid(pt));
-  WakeByAddressSingle(blocker);
-  return !!atomic_load_explicit(&pt->pt_blocker, memory_order_acquire);
-}
-
-textwindows static bool __sig_start(struct PosixThread *pt, int sig,
-                                    unsigned *rva, unsigned *flags) {
-  *rva = __sighandrvas[sig];
-  *flags = __sighandflags[sig];
-  if (*rva == (intptr_t)SIG_IGN ||
-      (*rva == (intptr_t)SIG_DFL && __sig_ignored_by_default(sig))) {
-    STRACE("ignoring %G", sig);
-    return false;
-  }
-  if (atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire) &
-      (1ull << (sig - 1))) {
-    STRACE("enqueing %G on %d", sig, _pthread_tid(pt));
-    atomic_fetch_or_explicit(&pt->tib->tib_sigpending, 1ull << (sig - 1),
-                             memory_order_relaxed);
-    __sig_wake(pt, sig);
-    return false;
-  }
-  if (*rva == (intptr_t)SIG_DFL) {
-    STRACE("terminating on %G due to no handler", sig);
-    __sig_terminate(sig);
-  }
-  return true;
-}
-
-textwindows static sigaction_f __sig_handler(unsigned rva) {
-  atomic_fetch_add_explicit(&__sig.count, 1, memory_order_relaxed);
-  return (sigaction_f)(__executable_start + rva);
-}
-
-textwindows int __sig_raise(volatile int sig, int sic) {
-
-  // bitset of kinds of handlers called
-  volatile int handler_was_called = 0;
-
-  // loop over pending signals
-  ucontext_t ctx;
-  getcontext(&ctx);
-  if (!sig) {
-    if ((sig = __sig_get(ctx.uc_sigmask))) {
-      sic = SI_KERNEL;
-    } else {
-      return handler_was_called;
-    }
-  }
-
-  // process signal(s)
-  unsigned rva, flags;
-  struct PosixThread *pt = _pthread_self();
-  if (__sig_start(pt, sig, &rva, &flags)) {
-
-    if (flags & SA_RESETHAND) {
-      STRACE("resetting %G handler", sig);
-      __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
-    }
-
-    // update the signal mask in preparation for signal handller
-    sigset_t blocksigs = __sighandmask[sig];
-    if (!(flags & SA_NODEFER))
-      blocksigs |= 1ull << (sig - 1);
-    ctx.uc_sigmask = atomic_fetch_or_explicit(&pt->tib->tib_sigmask, blocksigs,
-                                              memory_order_acquire);
-
-    // call the user's signal handler
-    char ssbuf[128];
-    siginfo_t si = {.si_signo = sig, .si_code = sic};
-    STRACE("__sig_raise(%G, %t) mask %s", sig, __sig_handler(rva),
-           _DescribeSigset(ssbuf, 0, (sigset_t *)&pt->tib->tib_sigmask));
-    __sig_handler(rva)(sig, &si, &ctx);
-
-    // record this handler
-    if (flags & SA_RESTART) {
-      handler_was_called |= SIG_HANDLED_SA_RESTART;
-    } else {
-      handler_was_called |= SIG_HANDLED_NO_RESTART;
-    }
-  }
-
-  // restore sigmask
-  // loop back to top
-  // jump where handler says
-  sig = 0;
-  setcontext(&ctx);
-  __builtin_unreachable();
-}
-
-textwindows int __sig_relay(int sig, int sic, sigset_t waitmask) {
-  sigset_t m;
-  int handler_was_called;
-  m = atomic_exchange_explicit(&__get_tls()->tib_sigmask, waitmask,
-                               memory_order_acquire);
-  handler_was_called = __sig_raise(sig, SI_KERNEL);
-  atomic_store_explicit(&__get_tls()->tib_sigmask, m, memory_order_release);
-  return handler_was_called;
-}
-
-// the user's signal handler callback is wrapped with this trampoline
-textwindows wontreturn static void __sig_tramp(struct SignalFrame *sf) {
-  int sig = sf->si.si_signo;
-  struct CosmoTib *tib = __get_tls();
-  struct PosixThread *pt = (struct PosixThread *)tib->tib_pthread;
-  atomic_store_explicit(&pt->pt_intoff, 0, memory_order_release);
-  for (;;) {
-
-    // update the signal mask in preparation for signal handler
-    sigset_t blocksigs = __sighandmask[sig];
-    if (!(sf->flags & SA_NODEFER))
-      blocksigs |= 1ull << (sig - 1);
-    sf->ctx.uc_sigmask = atomic_fetch_or_explicit(&tib->tib_sigmask, blocksigs,
-                                                  memory_order_acquire);
-
-    // call the user's signal handler
-    char ssbuf[2][128];
-    STRACE("__sig_tramp(%G, %t) mask %s → %s", sig, __sig_handler(sf->rva),
-           _DescribeSigset(ssbuf[0], 0, &sf->ctx.uc_sigmask),
-           _DescribeSigset(ssbuf[1], 0, (sigset_t *)&tib->tib_sigmask));
-    __sig_handler(sf->rva)(sig, &sf->si, &sf->ctx);
-
-    // restore the signal mask that was used by the interrupted code
-    // this may have been modified by the signal handler in the callback
-    atomic_store_explicit(&tib->tib_sigmask, sf->ctx.uc_sigmask,
-                          memory_order_release);
-
-    // jump back into original code if there aren't any pending signals
-    do {
-      if (!(sig = __sig_get(sf->ctx.uc_sigmask)))
-        __sig_restore(&sf->ctx);
-    } while (!__sig_start(pt, sig, &sf->rva, &sf->flags));
-
-    // tail recurse into another signal handler
-    sf->si.si_signo = sig;
-    sf->si.si_code = SI_KERNEL;
-    if (sf->flags & SA_RESETHAND) {
-      STRACE("resetting %G handler", sig);
-      __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
-    }
-  }
-}
-
-HAIRY optimizespeed void __sig_translate(ucontext_t *ctx,
-                                         const struct NtContext *cr) {
-  ctx->uc_mcontext.eflags = cr->EFlags;
-  ctx->uc_mcontext.rax = cr->Rax;
-  ctx->uc_mcontext.rbx = cr->Rbx;
-  ctx->uc_mcontext.rcx = cr->Rcx;
-  ctx->uc_mcontext.rdx = cr->Rdx;
-  ctx->uc_mcontext.rdi = cr->Rdi;
-  ctx->uc_mcontext.rsi = cr->Rsi;
-  ctx->uc_mcontext.rbp = cr->Rbp;
-  ctx->uc_mcontext.rsp = cr->Rsp;
-  ctx->uc_mcontext.rip = cr->Rip;
-  ctx->uc_mcontext.r8 = cr->R8;
-  ctx->uc_mcontext.r9 = cr->R9;
-  ctx->uc_mcontext.r10 = cr->R10;
-  ctx->uc_mcontext.r11 = cr->R11;
-  ctx->uc_mcontext.r12 = cr->R12;
-  ctx->uc_mcontext.r13 = cr->R13;
-  ctx->uc_mcontext.r14 = cr->R14;
-  ctx->uc_mcontext.r15 = cr->R15;
-  ctx->uc_mcontext.cs = cr->SegCs;
-  ctx->uc_mcontext.gs = cr->SegGs;
-  ctx->uc_mcontext.fs = cr->SegFs;
-  ctx->uc_mcontext.fpregs = &ctx->__fpustate;
-  __repmovsb(&ctx->__fpustate, &cr->FltSave, sizeof(ctx->__fpustate));
-  ctx->__fpustate.mxcsr = cr->MxCsr;
-}
-
-// sends signal to another specific thread which is ref'd
-textwindows static int __sig_killer(struct PosixThread *pt, int sig, int sic) {
-  unsigned rva = __sighandrvas[sig];
-  unsigned flags = __sighandflags[sig];
-
-  // do nothing if signal is ignored
-  if (rva == (intptr_t)SIG_IGN ||
-      (rva == (intptr_t)SIG_DFL && __sig_ignored_by_default(sig))) {
-    STRACE("ignoring %G", sig);
-    return 0;
-  }
-
-  // we can't preempt threads that masked sigs or are blocked on i/o
-  while ((atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire) &
-          (1ull << (sig - 1)))) {
-    if (atomic_fetch_or_explicit(&pt->tib->tib_sigpending, 1ull << (sig - 1),
-                                 memory_order_acq_rel) &
-        (1ull << (sig - 1)))
-      // we believe signal was already enqueued
-      return 0;
-    if (__sig_wake(pt, sig))
-      // we believe i/o routine will handle signal
-      return 0;
-    if (atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire) &
-        (1ull << (sig - 1)))
-      // we believe ALLOW_SIGNALS will handle signal
-      return 0;
-    if (!(atomic_fetch_and_explicit(&pt->tib->tib_sigpending,
-                                    ~(1ull << (sig - 1)),
-                                    memory_order_acq_rel) &
-          (1ull << (sig - 1))))
-      // we believe another thread sniped our signal
-      return 0;
-    break;
-  }
-
-  // avoid race conditions and deadlocks with thread suspend process
-  if (atomic_exchange_explicit(&pt->pt_intoff, 1, memory_order_acquire)) {
-    // we believe another thread is asynchronously waking the mark
-    if (atomic_fetch_or_explicit(&pt->tib->tib_sigpending, 1ull << (sig - 1),
-                                 memory_order_acq_rel) &
-        (1ull << (sig - 1)))
-      // we believe our signal is already being delivered
-      return 0;
-    if (atomic_load_explicit(&pt->pt_intoff, memory_order_acquire) ||
-        atomic_exchange_explicit(&pt->pt_intoff, 1, memory_order_acquire))
-      // we believe __sig_tramp will deliver our signal
-      return 0;
-    if (!(atomic_fetch_and_explicit(&pt->tib->tib_sigpending,
-                                    ~(1ull << (sig - 1)),
-                                    memory_order_acq_rel) &
-          (1ull << (sig - 1))))
-      // we believe another thread sniped our signal
-      return 0;
-  }
-
-  // if there's no handler then killing a thread kills the process
-  if (rva == (intptr_t)SIG_DFL) {
-    STRACE("terminating on %G due to no handler", sig);
-    __sig_terminate(sig);
-  }
-
-  // take control of thread
-  // suspending the thread happens asynchronously
-  // however getting the context blocks until it's frozen
-  uintptr_t th = _pthread_syshand(pt);
-  if (SuspendThread(th) == -1u) {
-    STRACE("SuspendThread failed w/ %d", GetLastError());
-    atomic_store_explicit(&pt->pt_intoff, 0, memory_order_release);
-    return ESRCH;
-  }
-  struct NtContext nc;
-  nc.ContextFlags = kNtContextFull;
-  if (!GetThreadContext(th, &nc)) {
-    STRACE("GetThreadContext failed w/ %d", GetLastError());
-    ResumeThread(th);
-    atomic_store_explicit(&pt->pt_intoff, 0, memory_order_release);
-    return ESRCH;
-  }
-
-  // we can't preempt threads that masked sig or are blocked
-  // we can't preempt threads that are running in win32 code
-  // so we shall unblock the thread and let it signal itself
-  if (!((uintptr_t)__executable_start <= nc.Rip &&
-        nc.Rip < (uintptr_t)__privileged_start)) {
-    atomic_fetch_or_explicit(&pt->tib->tib_sigpending, 1ull << (sig - 1),
-                             memory_order_relaxed);
-    ResumeThread(th);
-    atomic_store_explicit(&pt->pt_intoff, 0, memory_order_release);
-    __sig_wake(pt, sig);
-    return 0;
-  }
-
-  // preferring to live dangerously
-  // the thread will be signaled asynchronously
-  if (flags & SA_RESETHAND) {
-    STRACE("resetting %G handler", sig);
-    __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
-  }
-
-  // inject call to trampoline function into thread
-  uintptr_t sp;
-  if (__sig_should_use_altstack(flags, pt->tib)) {
-    sp = (uintptr_t)pt->tib->tib_sigstack_addr + pt->tib->tib_sigstack_size;
   } else {
-    sp = nc.Rsp;
-  }
-  sp -= sizeof(struct SignalFrame);
-  sp &= -16;
-  struct SignalFrame *sf = (struct SignalFrame *)sp;
-  __repstosb(sf, 0, sizeof(*sf));
-  __sig_translate(&sf->ctx, &nc);
-  sf->rva = rva;
-  sf->flags = flags;
-  sf->si.si_code = sic;
-  sf->si.si_signo = sig;
-  *(uintptr_t *)(sp -= sizeof(uintptr_t)) = nc.Rip;
-  nc.Rip = (intptr_t)__sig_tramp;
-  nc.Rdi = (intptr_t)sf;
-  nc.Rsp = sp;
-  if (!SetThreadContext(th, &nc)) {
-    STRACE("SetThreadContext failed w/ %d", GetLastError());
-    atomic_store_explicit(&pt->pt_intoff, 0, memory_order_release);
-    return ESRCH;
-  }
-  ResumeThread(th);
-  __sig_wake(pt, sig);
-  return 0;
-}
-
-// sends signal to another specific thread
-textwindows int __sig_kill(struct PosixThread *pt, int sig, int sic) {
-  int rc;
-  BLOCK_SIGNALS;
-  rc = __sig_killer(pt, sig, sic);
-  ALLOW_SIGNALS;
-  return rc;
-}
-
-// sends signal to any other thread
-// this should only be called by non-posix threads
-textwindows void __sig_generate(int sig, int sic) {
-  struct Dll *e;
-  struct PosixThread *pt, *mark = 0;
-  if (__sig_ignored(sig)) {
-    STRACE("ignoring %G", sig);
-    return;
-  }
-  if (__sighandrvas[sig] == (intptr_t)SIG_DFL) {
-    STRACE("terminating on %G due to no handler", sig);
-    __sig_terminate(sig);
-  }
-  if (atomic_load_explicit(__sig.process, memory_order_acquire) &
-      (1ull << (sig - 1)))
-    return;
-  _pthread_lock();
-  for (e = dll_first(_pthread_list); e; e = dll_next(_pthread_list, e)) {
-    pt = POSIXTHREAD_CONTAINER(e);
-    // we don't want to signal ourself
-    if (pt == _pthread_self())
-      continue;
-    // we don't want to signal a thread that isn't running
-    if (atomic_load_explicit(&pt->pt_status, memory_order_acquire) >=
-        kPosixThreadTerminated)
-      continue;
-    // choose this thread if it isn't masking sig
-    if (!(atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire) &
-          (1ull << (sig - 1)))) {
-      _pthread_ref(pt);
-      mark = pt;
-      break;
-    }
-    // if a thread is blocking then we check to see if it's planning
-    // to unblock our sig once the wait operation is completed; when
-    // that's the case we can cancel the thread's i/o to deliver sig
-    if (atomic_load_explicit(&pt->pt_blocker, memory_order_acquire) &&
-        !(pt->pt_blkmask & (1ull << (sig - 1)))) {
-      _pthread_ref(pt);
-      mark = pt;
-      break;
-    }
-  }
-  _pthread_unlock();
-  if (mark) {
-    // no lock needed since current thread is nameless and formless
-    __sig_killer(mark, sig, sic);
-    _pthread_unref(mark);
-  } else {
-    atomic_fetch_or_explicit(__sig.process, 1ull << (sig - 1),
-                             memory_order_relaxed);
+    sigset_t res, neu = -1;
+    sys_sigprocmask(SIG_SETMASK, &neu, &res);
+    return res;
   }
 }
 
-textwindows static char *__sig_stpcpy(char *d, const char *s) {
-  size_t i;
-  for (i = 0;; ++i)
-    if (!(d[i] = s[i]))
-      return d + i;
-}
-
-textwindows wontreturn static void __sig_death(int sig, const char *thing) {
-#ifndef TINY
-  intptr_t hStderr;
-  char sigbuf[21], s[128], *p;
-  hStderr = __imp_GetStdHandle(kNtStdErrorHandle);
-  p = __sig_stpcpy(s, "Terminating on ");
-  p = __sig_stpcpy(p, thing);
-  p = __sig_stpcpy(p, strsignal_r(sig, sigbuf));
-  p = __sig_stpcpy(p,
-                   ". Pass --strace and/or ShowCrashReports() for details.\n");
-  __imp_WriteFile(hStderr, s, p - s, 0, 0);
-#endif
-  __sig_terminate(sig);
-}
-
-//
-//   "If a program attempts to access an address within a guard page,
-//    the system raises a kNtStatusGuardPageViolation (0x80000001)
-//    exception. The system also clears the kNtPageGuard modifier,
-//    removing the memory page's guard page status. The system will not
-//    stop the next attempt to access the memory page with a
-//    kNtStatusGuardPageViolation exception."
-//
-//                              —Quoth MSDN § Creating Guard Pages
-//
-forceinline void __sig_reguard(void *page) {
-  uint32_t old_protect;
-  __imp_VirtualProtectEx(GetCurrentProcess(),
-                         (void *)((uintptr_t)page & -__pagesize), __pagesize,
-                         kNtPageReadwrite | kNtPageGuard, &old_protect);
-}
-
-// trampoline for calling signal handler when system reports crash
-textwindows static void __sig_unmaskable(struct SignalFrame *sf) {
-
-  // log vital crash information reliably for --strace before doing much
-  // we don't print this without the flag since raw numbers scare people
-  // this needs at least one page of stack memory in order to get logged
-  // otherwise it'll print a warning message about the lack of stack mem
-  STRACE("win32 vectored exception 0x%08Xu raising %G "
-         "cosmoaddr2line %s %lx %s",
-         sf->si.si_errno, sf->si.si_signo,
-         _weaken(FindDebugBinary) ? _weaken(FindDebugBinary)()
-                                  : program_invocation_name,
-         sf->ctx.uc_mcontext.gregs[REG_RIP],
-         DescribeBacktrace(
-             (struct StackFrame *)sf->ctx.uc_mcontext.gregs[REG_RBP]));
-
-  // kills process if the user did not specify a handler for this signal
-  // we also don't allow unmaskable signals to be ignored by the program
-  if (sf->rva == (intptr_t)SIG_DFL ||  //
-      sf->rva == (intptr_t)SIG_IGN)
-    __sig_death(sf->si.si_signo, "uncaught ");
-
-  // we kill the process if this thread's signal mask blocks this signal
-  // then we block some extra signals while executing the signal handler
-  struct CosmoTib *tib = __get_tls();
-  sigset_t blocksigs = __sighandmask[sf->si.si_signo];
-  if (!(sf->flags & SA_NODEFER))
-    blocksigs |= 1ull << (sf->si.si_signo - 1);
-  sf->ctx.uc_sigmask = atomic_fetch_or(&tib->tib_sigmask, blocksigs);
-  if (sf->ctx.uc_sigmask & (1ull << (sf->si.si_signo - 1)))
-    __sig_death(sf->si.si_signo, "masked ");
-
-  // this will restore the guard page if the user is using a sigaltstack
-  if (sf->si.si_errno == kNtStatusGuardPageViolation)
-    __sig_reguard(sf->si.si_addr);
-
-  // call the user signal handler
-  // and a modifiable view of the faulting code's cpu state
-  // then finally restore signal mask and return control to program
-  __sig_handler(sf->rva)(sf->si.si_signo, &sf->si, &sf->ctx);
-  atomic_store_explicit(&__get_tls()->tib_sigmask, sf->ctx.uc_sigmask,
-                        memory_order_release);
-  setcontext(&sf->ctx);
-  __builtin_unreachable();
-}
-
-//                         abashed the devil stood
-//                      and felt how awful goodness is
-__msabi HAIRY static unsigned __sig_crash(struct NtExceptionPointers *ep) {
-
-  // translate the win32 exception code into unix's si_signo and si_code
-  int sic, sig = __sig_crash_sig(ep->ExceptionRecord->ExceptionCode, &sic);
-
-  // advances the instruction pointer, to skip over debugger breakpoints
-  // this makes windows consistent with how unix kernels are implemented
-  if (sig == SIGTRAP)
-    ep->ContextRecord->Rip++;
-
-  // clears signal handler if user asked sigaction for one-shot behavior
-  unsigned rva = __sighandrvas[sig];
-  unsigned flags = __sighandflags[sig];
-  if (flags & SA_RESETHAND)
-    __sighandrvas[sig] = (int32_t)(intptr_t)SIG_DFL;
-
-  // we don't know if it is safe for signal handlers to longjmp() out of
-  // win32 vectored exception handlers so let's copy the machine context
-  // and tell win32 to restore control to __sig_unmaskable() which shall
-  // call the user signal handler safely. please note that if this crash
-  // was caused by stack overflow, then we're literally executing inside
-  // the guard page so this code can't use more than 4096 bytes of stack
-  uintptr_t sp;
-  struct CosmoTib *tib = __get_tls();
-  if (__sig_should_use_altstack(flags, tib)) {
-    sp = (uintptr_t)tib->tib_sigstack_addr + tib->tib_sigstack_size;
-  } else {
-    size_t n = sizeof(struct SignalFrame) + 32;
-    sp = (uintptr_t)alloca(n) + n;
-  }
-  sp -= sizeof(struct SignalFrame);
-  sp &= -16;
-  struct SignalFrame *sf = (struct SignalFrame *)sp;
-  __repstosb(sf, 0, sizeof(*sf));
-  __sig_translate(&sf->ctx, ep->ContextRecord);
-  sf->rva = rva;
-  sf->flags = flags;
-  sf->si.si_code = sic;
-  sf->si.si_signo = sig;
-  sf->si.si_errno = ep->ExceptionRecord->ExceptionCode;
-  if (sf->si.si_errno == kNtStatusGuardPageViolation) {
-    sf->si.si_addr = (void *)ep->ExceptionRecord->ExceptionInformation[1];
-  } else {
-    sf->si.si_addr = ep->ExceptionRecord->ExceptionAddress;
-  }
-  *(uintptr_t *)(sp -= sizeof(uintptr_t)) = ep->ContextRecord->Rip;
-  ep->ContextRecord->Rip = (intptr_t)__sig_unmaskable;
-  ep->ContextRecord->Rdi = (intptr_t)sf;
-  ep->ContextRecord->Rsp = sp;
-  return kNtExceptionContinueExecution;
-}
-
-textwindows static int __sig_console_sig(uint32_t dwCtrlType) {
-  switch (dwCtrlType) {
-    case kNtCtrlCEvent:
-      return SIGINT;
-    case kNtCtrlBreakEvent:
-      return SIGQUIT;
-    case kNtCtrlCloseEvent:
-    case kNtCtrlLogoffEvent:    // only received by services
-    case kNtCtrlShutdownEvent:  // only received by services
-      return SIGHUP;
-    default:
-      return SIGSTKFLT;
-  }
-}
-
-textwindows static int __sig_console_char(uint32_t dwCtrlType) {
-  switch (dwCtrlType) {
-    case kNtCtrlCEvent:
-      return __ttyconf.vintr;
-    case kNtCtrlBreakEvent:
-      return __ttyconf.vquit;
-    default:
-      return _POSIX_VDISABLE;
-  }
-}
-
-__msabi HAIRY bool32 __sig_console(uint32_t dwCtrlType) {
-  // win32 launches a thread to deliver ctrl-c and ctrl-break when typed
-  // it only happens when kNtEnableProcessedInput is in play on console.
-  // otherwise we need to wait until read-nt.c discovers that keystroke.
-  struct CosmoTib tls;
-  __bootstrap_tls(&tls, __builtin_frame_address(0));
-
-  // ensure that ^C or ^\ gets printed to console appropriately
-  if (_weaken(EchoConsoleNt)) {
-    char c;
-    if ((c = __sig_console_char(dwCtrlType)) != _POSIX_VDISABLE)
-      _weaken(EchoConsoleNt)(&c, sizeof(c), false);
-  }
-
-  // take control of random thread and inject call to signal handler
-  __sig_generate(__sig_console_sig(dwCtrlType), SI_KERNEL);
-  return true;
-}
-
-// returns 0 if no signal handlers were called, otherwise a bitmask
-// consisting of `1` which means a signal handler was invoked which
-// didn't have the SA_RESTART flag, and `2`, which means SA_RESTART
-// handlers were called (or `3` if both were the case).
-textwindows int __sig_check(void) {
-  int sig, res = 0;
-  while ((sig = __sig_get(atomic_load_explicit(&__get_tls()->tib_sigmask,
-                                               memory_order_acquire))))
-    res |= __sig_raise(sig, SI_KERNEL);
-  return res;
-}
-
-// background thread for delivering inter-process signals asynchronously
-// this checks for undelivered process-wide signals, once per scheduling
-// quantum, which on windows should be every ~15ms or so, unless somehow
-// the process was tuned to have more fine-grained event timing. we want
-// signals to happen faster when possible; that happens when cancelation
-// points, e.g. read need to wait on i/o; they too check for new signals
-HAIRY static uint32_t __sig_worker(void *arg) {
-  struct CosmoTib tls;
-  __bootstrap_tls(&tls, __builtin_frame_address(0));
-  char *sp = __builtin_frame_address(0);
-  __maps_track((char *)(((uintptr_t)sp + __pagesize - 1) & -__pagesize) - STKSZ,
-               STKSZ, PROT_READ | PROT_WRITE,
-               MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK);
-  for (;;) {
-    // ok sys_execve_nt() might disable this worker
-    if (~__sig_worker_state & 2) {
-
-      // dequeue all pending signals and fire them off. if there's no
-      // thread that can handle them then __sig_generate will requeue
-      // those signals back to __sig.process; hence the need for xchg
-      unsigned long sigs =
-          atomic_exchange_explicit(__sig.process, 0, memory_order_acq_rel);
-      while (sigs) {
-        int sig = bsfl(sigs) + 1;
-        sigs &= ~(1ull << (sig - 1));
-        __sig_generate(sig, SI_KERNEL);
-      }
-
-      // unblock stalled i/o signals in threads
-      _pthread_lock();
-      for (struct Dll *e = dll_first(_pthread_list); e;
-           e = dll_next(_pthread_list, e)) {
-        struct PosixThread *pt = POSIXTHREAD_CONTAINER(e);
-        if (atomic_load_explicit(&pt->pt_status, memory_order_acquire) >=
-            kPosixThreadTerminated)
-          break;
-        if (atomic_load_explicit(&pt->pt_blocker, memory_order_acquire) &&
-            (atomic_load_explicit(&pt->tib->tib_sigpending,
-                                  memory_order_acquire) &
-             ~atomic_load_explicit(&pt->pt_blkmask, memory_order_acquire)))
-          __sig_wake(pt, 0);
-      }
-      _pthread_unlock();
-
-      // unblock stalled asynchronous signals in threads
-      for (;;) {
-        sigset_t pending, mask;
-        struct PosixThread *mark = 0;
-        _pthread_lock();
-        for (struct Dll *e = dll_first(_pthread_list); e;
-             e = dll_next(_pthread_list, e)) {
-          struct PosixThread *pt = POSIXTHREAD_CONTAINER(e);
-          if (atomic_load_explicit(&pt->pt_status, memory_order_acquire) >=
-              kPosixThreadTerminated)
-            break;
-          pending = atomic_load_explicit(&pt->tib->tib_sigpending,
-                                         memory_order_acquire);
-          mask =
-              atomic_load_explicit(&pt->tib->tib_sigmask, memory_order_acquire);
-          if (pending & ~mask) {
-            _pthread_ref(pt);
-            mark = pt;
-            break;
-          }
-        }
-        _pthread_unlock();
-        if (!mark)
-          break;
-        while (!atomic_compare_exchange_weak_explicit(
-            &mark->tib->tib_sigpending, &pending, pending & ~mask,
-            memory_order_acq_rel, memory_order_relaxed)) {
-        }
-        while ((pending = pending & ~mask)) {
-          int sig = bsfl(pending) + 1;
-          pending &= ~(1ull << (sig - 1));
-          __sig_killer(mark, sig, SI_KERNEL);
-        }
-        _pthread_unref(mark);
+void __sig_unblock(sigset_t m) {
+  if (IsWindows() || IsMetal()) {
+    if (__tls_enabled) {
+      atomic_store_explicit(&__get_tls()->tib_sigmask, m, memory_order_release);
+      if (_weaken(__sig_check)) {
+        _weaken(__sig_check)();
       }
     }
-
-    // wait until next scheduler quantum
-    __sig_worker_state |= 1;
-    Sleep(POLL_INTERVAL_MS);
-    __sig_worker_state &= ~1;
+  } else {
+    sys_sigprocmask(SIG_SETMASK, &m, 0);
   }
-  __builtin_unreachable();
 }
-
-__attribute__((__constructor__(10))) textstartup void __sig_init(void) {
-  if (!IsWindows())
-    return;
-  AddVectoredExceptionHandler(true, (void *)__sig_crash);
-  SetConsoleCtrlHandler((void *)__sig_console, true);
-  CreateThread(0, STKSZ, __sig_worker, 0, kNtStackSizeParamIsAReservation, 0);
-}
-
-#endif /* __x86_64__ */
diff --git a/libc/intrin/sigblock.c b/libc/intrin/sigblock.c
deleted file mode 100644
index 919dced56..000000000
--- a/libc/intrin/sigblock.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/sig.internal.h"
-#include "libc/calls/struct/sigset.internal.h"
-#include "libc/dce.h"
-#include "libc/intrin/atomic.h"
-#include "libc/intrin/weaken.h"
-#include "libc/sysv/consts/sig.h"
-#include "libc/thread/tls.h"
-
-// since there's so many c library interfaces and system call wrappers
-// that always need to block signals we avoid the distraction of their
-// ftrace and strace output being muddied with sigprocmask lines. it's
-// usually better that sigprocmask only strace the user is calling it.
-// plus, since we have a very specific use case, this code goes faster
-
-sigset_t __sig_block(void) {
-  if (IsWindows() || IsMetal()) {
-    if (__tls_enabled)
-      return atomic_exchange_explicit(&__get_tls()->tib_sigmask, -1,
-                                      memory_order_acquire);
-    else
-      return 0;
-  } else {
-    sigset_t res, neu = -1;
-    sys_sigprocmask(SIG_SETMASK, &neu, &res);
-    return res;
-  }
-}
-
-void __sig_unblock(sigset_t m) {
-  if (IsWindows() || IsMetal()) {
-    if (__tls_enabled) {
-      atomic_store_explicit(&__get_tls()->tib_sigmask, m, memory_order_release);
-      if (_weaken(__sig_check))
-        _weaken(__sig_check)();
-    }
-  } else {
-    sys_sigprocmask(SIG_SETMASK, &m, 0);
-  }
-}
diff --git a/libc/intrin/sigproc.c b/libc/intrin/sigproc.c
deleted file mode 100644
index e3f6d0673..000000000
--- a/libc/intrin/sigproc.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
-#include "libc/calls/sig.internal.h"
-#include "libc/fmt/internal.h"
-#include "libc/nt/createfile.h"
-#include "libc/nt/enum/accessmask.h"
-#include "libc/nt/enum/creationdisposition.h"
-#include "libc/nt/enum/fileflagandattributes.h"
-#include "libc/nt/enum/filemapflags.h"
-#include "libc/nt/enum/filemovemethod.h"
-#include "libc/nt/enum/filesharemode.h"
-#include "libc/nt/enum/pageflags.h"
-#include "libc/nt/files.h"
-#include "libc/nt/memory.h"
-#include "libc/nt/process.h"
-#include "libc/nt/runtime.h"
-#include "libc/nt/thunk/msabi.h"
-#ifdef __x86_64__
-
-#define ABI __msabi textwindows dontinstrument
-
-// cut back on code size and avoid setting errno
-// this code is a mandatory dependency of winmain
-__msabi extern typeof(CloseHandle) *const __imp_CloseHandle;
-__msabi extern typeof(CreateDirectory) *const __imp_CreateDirectoryW;
-__msabi extern typeof(CreateFile) *const __imp_CreateFileW;
-__msabi extern typeof(CreateFileMapping) *const __imp_CreateFileMappingW;
-__msabi extern typeof(MapViewOfFileEx) *const __imp_MapViewOfFileEx;
-__msabi extern typeof(SetEndOfFile) *const __imp_SetEndOfFile;
-__msabi extern typeof(SetFilePointer) *const __imp_SetFilePointer;
-__msabi extern typeof(GetEnvironmentVariable)
-    *const __imp_GetEnvironmentVariableW;
-
-// Generates C:\ProgramData\cosmo\sig\x\y.pid like path
-ABI char16_t *__sig_process_path(char16_t *path, uint32_t pid,
-                                 int create_directories) {
-  char16_t buf[3];
-  char16_t *p = path;
-  uint32_t vlen = __imp_GetEnvironmentVariableW(u"SYSTEMDRIVE", buf, 3);
-  *p++ = vlen == 2 ? buf[0] : 'C';
-  *p++ = ':';
-  *p++ = '\\';
-  *p++ = 'P';
-  *p++ = 'r';
-  *p++ = 'o';
-  *p++ = 'g';
-  *p++ = 'r';
-  *p++ = 'a';
-  *p++ = 'm';
-  *p++ = 'D';
-  *p++ = 'a';
-  *p++ = 't';
-  *p++ = 'a';
-  *p = 0;
-  if (create_directories)
-    __imp_CreateDirectoryW(path, 0);
-  *p++ = '\\';
-  *p++ = 'c';
-  *p++ = 'o';
-  *p++ = 's';
-  *p++ = 'm';
-  *p++ = 'o';
-  *p = 0;
-  if (create_directories)
-    __imp_CreateDirectoryW(path, 0);
-  *p++ = '\\';
-  *p++ = 's';
-  *p++ = 'i';
-  *p++ = 'g';
-  *p = 0;
-  if (create_directories)
-    __imp_CreateDirectoryW(path, 0);
-  *p++ = '\\';
-  p = __itoa16(p, (pid & 0x000ff800) >> 11);
-  *p = 0;
-  if (create_directories)
-    __imp_CreateDirectoryW(path, 0);
-  *p++ = '\\';
-  p = __itoa16(p, pid);
-  *p++ = '.';
-  *p++ = 'p';
-  *p++ = 'i';
-  *p++ = 'd';
-  *p = 0;
-  return path;
-}
-
-ABI atomic_ulong *__sig_map_process(int pid, int disposition) {
-  char16_t path[128];
-  __sig_process_path(path, pid, disposition == kNtOpenAlways);
-  intptr_t hand = __imp_CreateFileW(path, kNtGenericRead | kNtGenericWrite,
-                                    kNtFileShareRead | kNtFileShareWrite, 0,
-                                    disposition, kNtFileAttributeNormal, 0);
-  if (hand == -1)
-    return 0;
-  __imp_SetFilePointer(hand, 8, 0, kNtFileBegin);
-  __imp_SetEndOfFile(hand);
-  intptr_t map = __imp_CreateFileMappingW(hand, 0, kNtPageReadwrite, 0, 8, 0);
-  if (!map) {
-    __imp_CloseHandle(hand);
-    return 0;
-  }
-  atomic_ulong *sigs = __imp_MapViewOfFileEx(map, kNtFileMapWrite, 0, 0, 8, 0);
-  __imp_CloseHandle(map);
-  __imp_CloseHandle(hand);
-  return sigs;
-}
-
-#endif /* __x86_64__ */
diff --git a/libc/intrin/sigprocmask-nt.c b/libc/intrin/sigprocmask-nt.c
index 38246b430..0d31b61af 100644
--- a/libc/intrin/sigprocmask-nt.c
+++ b/libc/intrin/sigprocmask-nt.c
@@ -16,6 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/intrin/atomic.h"
@@ -26,25 +27,38 @@
 #ifdef __x86_64__
 
 textwindows int __sig_mask(int how, const sigset_t *neu, sigset_t *old) {
-  if (how != SIG_BLOCK && how != SIG_UNBLOCK && how != SIG_SETMASK)
+
+  // validate api usage
+  if (how != SIG_BLOCK && how != SIG_UNBLOCK && how != SIG_SETMASK) {
     return einval();
+  }
+
+  // get address of sigset to modify
+  _Atomic(uint64_t) *mask = &__get_tls()->tib_sigmask;
+
+  // handle read-only case
   sigset_t oldmask;
-  atomic_ulong *mask = &__get_tls()->tib_sigmask;
   if (neu) {
     if (how == SIG_BLOCK) {
-      oldmask = atomic_fetch_or(mask, *neu);
+      oldmask = atomic_fetch_or_explicit(mask, *neu, memory_order_acq_rel);
     } else if (how == SIG_UNBLOCK) {
-      oldmask = atomic_fetch_and(mask, ~*neu);
-    } else {
-      oldmask = atomic_exchange(mask, *neu);
+      oldmask = atomic_fetch_and_explicit(mask, ~*neu, memory_order_acq_rel);
+    } else {  // SIG_SETMASK
+      oldmask = atomic_exchange_explicit(mask, *neu, memory_order_acq_rel);
     }
-    if (_weaken(__sig_check))
-      _weaken(__sig_check)();
   } else {
-    oldmask = atomic_load(mask);
+    oldmask = atomic_load_explicit(mask, memory_order_acquire);
   }
-  if (old)
+
+  // return old signal mask to caller
+  if (old) {
     *old = oldmask;
+  }
+
+  if (_weaken(__sig_check)) {
+    _weaken(__sig_check)();
+  }
+
   return 0;
 }
 
diff --git a/libc/intrin/sigprocmask-sysv.c b/libc/intrin/sigprocmask-sysv.c
index c7a43aaeb..685f36c15 100644
--- a/libc/intrin/sigprocmask-sysv.c
+++ b/libc/intrin/sigprocmask-sysv.c
@@ -32,7 +32,8 @@ int sys_sigprocmask(int how, const sigset_t *opt_set,
         how, opt_set ? (sigset_t *)(intptr_t)(uint32_t)*opt_set : 0, 0, 0);
     rc = 0;
   }
-  if (rc != -1 && opt_out_oldset)
+  if (rc != -1 && opt_out_oldset) {
     *opt_out_oldset = old[0];
+  }
   return rc;
 }
diff --git a/libc/intrin/sigprocmask.c b/libc/intrin/sigprocmask.c
index bb1406624..aa76966ab 100644
--- a/libc/intrin/sigprocmask.c
+++ b/libc/intrin/sigprocmask.c
@@ -16,12 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/dce.h"
+#include "libc/fmt/itoa.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/sig.h"
+#include "libc/sysv/errfuns.h"
 
 /**
  * Changes signal blocking state of calling thread, e.g.:
@@ -49,8 +55,9 @@ int sigprocmask(int how, const sigset_t *opt_set, sigset_t *opt_out_oldset) {
   } else {
     rc = sys_sigprocmask(how, opt_set, opt_out_oldset ? &old : 0);
   }
-  if (rc != -1 && opt_out_oldset)
+  if (rc != -1 && opt_out_oldset) {
     *opt_out_oldset = old;
+  }
   STRACE("sigprocmask(%s, %s, [%s]) → %d% m", DescribeHow(how),
          DescribeSigset(0, opt_set), DescribeSigset(rc, opt_out_oldset), rc);
   return rc;
diff --git a/libc/intrin/sizefmt.c b/libc/intrin/sizefmt.c
index 14e52c9f9..a3cb8ea6b 100644
--- a/libc/intrin/sizefmt.c
+++ b/libc/intrin/sizefmt.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/itoa.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /**
  * Represents size as readable string.
diff --git a/libc/intrin/stack.c b/libc/intrin/stack.c
deleted file mode 100644
index 27a20a06c..000000000
--- a/libc/intrin/stack.c
+++ /dev/null
@@ -1,513 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/stack.h"
-#include "libc/assert.h"
-#include "libc/atomic.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/syscall-sysv.internal.h"
-#include "libc/cosmo.h"
-#include "libc/dce.h"
-#include "libc/dlopen/dlfcn.h"
-#include "libc/errno.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/dll.h"
-#include "libc/intrin/maps.h"
-#include "libc/intrin/rlimit.h"
-#include "libc/intrin/strace.h"
-#include "libc/intrin/weaken.h"
-#include "libc/limits.h"
-#include "libc/macros.h"
-#include "libc/runtime/runtime.h"
-#include "libc/sock/internal.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/thread/posixthread.internal.h"
-#include "libc/thread/thread.h"
-
-/**
- * @fileoverview cosmo stack memory manager
- */
-
-#define MAP_GROWSDOWN_LINUX 0x00000100
-#define MAP_ANONYMOUS_LINUX 0x00000020
-#define MAP_NOREPLACE_LINUX 0x08000000
-#define MAP_NORESERVE_LINUX 0x00004000
-
-#define MAP_ANON_OPENBSD  0x1000
-#define MAP_STACK_OPENBSD 0x4000
-
-#define THREADSTACK_CONTAINER(e) DLL_CONTAINER(struct CosmoStack, elem, e)
-
-struct CosmoStack {
-  struct Dll elem;
-  void *stackaddr;
-  size_t stacksize;
-  size_t guardsize;
-};
-
-struct CosmoStacks {
-  atomic_uint once;
-  pthread_mutex_t lock;
-  struct Dll *stacks;
-  struct Dll *objects;
-  unsigned count;
-};
-
-struct CosmoStacksConfig {
-  unsigned maxstacks;
-};
-
-static struct CosmoStacks cosmo_stacks = {
-    .lock = PTHREAD_MUTEX_INITIALIZER,
-};
-
-static struct CosmoStacksConfig cosmo_stacks_config = {
-    .maxstacks = 3,
-};
-
-void cosmo_stack_lock(void) {
-  _pthread_mutex_lock(&cosmo_stacks.lock);
-}
-
-void cosmo_stack_unlock(void) {
-  _pthread_mutex_unlock(&cosmo_stacks.lock);
-}
-
-void cosmo_stack_wipe(void) {
-  _pthread_mutex_wipe_np(&cosmo_stacks.lock);
-}
-
-// map_growsdown will not grow more than rlimit_stack
-static size_t cosmo_stack_maxgrow(void) {
-  return __rlimit_stack_get().rlim_cur & -__pagesize;
-}
-
-// allocates private anonymous fixed noreplace memory on linux
-static void *flixmap(void *addr, size_t size, int prot, int flags) {
-  flags |= MAP_PRIVATE | MAP_ANONYMOUS_LINUX | MAP_NOREPLACE_LINUX;
-  void *res = __sys_mmap(addr, size, prot, flags, -1, 0, 0);
-  if (res != MAP_FAILED) {
-    if (res != addr) {
-      sys_munmap(addr, size);
-      errno = EEXIST;  // polyfill linux 4.17+ behavior
-      res = 0;
-    }
-  } else {
-    res = 0;
-  }
-  STRACE("mmap(%p, %'zu, %s, %s) → %p% m", addr, size, DescribeProtFlags(prot),
-         DescribeMapFlags(flags), res);
-  return res;
-}
-
-// maps stack on linux
-static void *slackmap(size_t stacksize, size_t guardsize) {
-  int olde = errno;
-  struct Map *prev, *map;
-  char *max = (char *)PTRDIFF_MAX;
-  size_t need = guardsize + stacksize;
-  __maps_lock();
-  for (;;) {
-
-    // look for empty space beneath higher mappings
-    char *region = 0;
-    for (map = __maps_floor(max); map; map = prev) {
-      char *min = (char *)(intptr_t)__gransize;
-      if ((prev = __maps_prev(map)))
-        min = prev->addr + ROUNDUP(prev->size, __gransize);
-      if (map->addr - min >= need) {
-        region = map->addr - need;
-        max = region - 1;
-        break;
-      }
-    }
-    if (!region)
-      break;
-
-    // track intended memory in rbtree
-    if (!__maps_track(region, guardsize, PROT_NONE,
-                      MAP_PRIVATE | MAP_ANONYMOUS_LINUX))
-      break;
-    if (!__maps_track(region + guardsize, stacksize, PROT_READ | PROT_WRITE,
-                      MAP_PRIVATE | MAP_ANONYMOUS_LINUX)) {
-      __maps_untrack(region, need);
-      break;
-    }
-    __maps_unlock();
-
-    // ask kernel to create guard region
-    // taking special care to not clobber untracked mappings
-    //
-    // it's important that this call happen first, since it limits how
-    // much memory map_growsdown will secretly consume. if there's
-    // nothing beneath a map_growsdown mapping, then the kernel reserves
-    // (and this isn't listed /proc/PID/maps so don't bother looking)
-    // `rlimit_stack.rlim_cur & -__pagesize` bytes of memory including
-    // this top-most page, and another 1mb of guard pages beneath that.
-    // but by mapping our guard pages manually, we ensure the guard
-    // region and the stack itself will be exactly as big as we want.
-    //
-    // you'd think we could mmap(0, pagesz, growsdown) to let the kernel
-    // pick an address and then we could just upscale the user's stack
-    // size request to whatever rlimit_stack is if it's bigger. but the
-    // linux kernel will actually choose addresses between existing maps
-    // where the hole is smaller than rlimit_stack.
-    //
-    // to use map_growsdown, we must use map_fixed. normally when we use
-    // map_fixed, we reserve an entire kernel-assigned region beforehand
-    // to ensure there isn't any overlap with existing mappings. however
-    // since growsdown stops growing when it encounters another mapping,
-    // you can't map it on top of a reservation mapping. so we must take
-    // a leap of faith there aren't any mystery mappings twixt the guard
-    // region and growsdown page below.
-    char *guard_region =
-        flixmap(region, guardsize, PROT_NONE, MAP_NORESERVE_LINUX);
-    if (!guard_region) {
-    RecoverFromMmapFailure:
-      if (errno != EEXIST) {
-        // mmap() probably raised enomem due to rlimit_as etc.
-        __maps_untrack(region, need);
-        return 0;
-      } else {
-        // we've encountered a mystery mapping. it's hard to imagine
-        // this happening, since we don't use map_growsdown when
-        // cosmo_dlopen() is linked in the binary. in that case, the
-        // tracker we created covers at least some of the rogue map,
-        // therefore this issue should fix itself if we keep going
-        errno = olde;
-        __maps_lock();
-        ++max;
-        continue;
-      }
-    }
-
-    // ask kernel to create stack pages
-    // taking special care to not clobber untracked mappings
-    char *top_page = flixmap(region + need - __pagesize, __pagesize,
-                             PROT_READ | PROT_WRITE, MAP_GROWSDOWN_LINUX);
-    if (!top_page) {
-      sys_munmap(region, guardsize);
-      goto RecoverFromMmapFailure;
-    }
-
-    // return address to bottom of stack
-    return region + guardsize;
-  }
-  __maps_unlock();
-  errno = ENOMEM;
-  return 0;
-}
-
-static errno_t cosmo_stack_munmap(char *stackaddr, size_t stacksize,
-                                  size_t guardsize) {
-  errno_t r = 0;
-  errno_t e = errno;
-  if (!munmap(stackaddr - guardsize,  //
-              guardsize + stacksize)) {
-    r = errno;
-    errno = e;
-  }
-  return r;
-}
-
-static void cosmo_stack_populate(void) {
-  errno_t e = errno;
-  void *map = mmap(0, __pagesize, PROT_READ | PROT_WRITE,
-                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-  errno = e;
-  if (map != MAP_FAILED) {
-    struct CosmoStack *ts = map;
-    int n = __pagesize / sizeof(struct CosmoStack);
-    for (int i = 0; i < n; ++i) {
-      dll_init(&ts[i].elem);
-      dll_make_first(&cosmo_stacks.objects, &ts[i].elem);
-    }
-  }
-}
-
-static struct Dll *cosmo_stack_decimate(unsigned maxstacks) {
-  struct Dll *surplus = 0;
-  while (cosmo_stacks.count > maxstacks) {
-    struct Dll *e = dll_last(cosmo_stacks.stacks);
-    dll_remove(&cosmo_stacks.stacks, e);
-    dll_make_first(&surplus, e);
-    --cosmo_stacks.count;
-  }
-  return surplus;
-}
-
-static void cosmo_stack_rehabilitate(struct Dll *stacks) {
-  struct Dll *e;
-  for (e = dll_first(stacks); e; e = dll_next(stacks, e))
-    cosmo_stack_munmap(THREADSTACK_CONTAINER(e)->stackaddr,
-                       THREADSTACK_CONTAINER(e)->stacksize,
-                       THREADSTACK_CONTAINER(e)->guardsize);
-  cosmo_stack_lock();
-  dll_make_first(&cosmo_stacks.objects, stacks);
-  cosmo_stack_unlock();
-}
-
-/**
- * Empties unused stack cache.
- *
- * To make POSIX threads as cheap as possible to spawn, we recycle their
- * stacks without zeroing their memory. On Linux for an 80kb stack size,
- * that makes launching a thread take 40µs rather than 80µs. However the
- * stack cache needs to be cleared in certain cases. This is called upon
- * exit() automatically but anyone can clear this at any other time too.
- *
- * @see pthread_decimate_np()
- */
-void cosmo_stack_clear(void) {
-  cosmo_stack_lock();
-  struct Dll *stacks = cosmo_stacks.stacks;
-  cosmo_stacks.stacks = 0;
-  cosmo_stacks.count = 0;
-  cosmo_stack_unlock();
-  cosmo_stack_rehabilitate(stacks);
-}
-
-/**
- * Gets maximum number of unused stacks cosmo should cache.
- * @see cosmo_stack_setmaxstacks()
- */
-int cosmo_stack_getmaxstacks(void) {
-  return cosmo_stacks_config.maxstacks;
-}
-
-/**
- * Sets maximum number of unused stacks cosmo should cache.
- *
- * This lets you place some limitations on how much stack memory the
- * cosmo runtime will cache. This number is a count of stacks rather
- * than the number of bytes they contain. Old stacks are freed in a
- * least recently used fashion once the cache exceeds this limit.
- *
- * If this is set to zero, then the cosmo stack allocator enters a
- * highly secure hardening mode where cosmo_stack_alloc() zeroes all
- * stack memory that's returned.
- *
- * Setting this to a negative number makes the cache size unlimited.
- *
- * Please note this limit only applies to stacks that aren't in use.
- *
- * Your default is three stacks may be cached at any given moment.
- *
- * If `maxstacks` is less than the current cache size, then surplus
- * entries will be evicted and freed before this function returns.
- */
-void cosmo_stack_setmaxstacks(int maxstacks) {
-  cosmo_stack_lock();
-  cosmo_stacks_config.maxstacks = maxstacks;
-  struct Dll *stacks = cosmo_stack_decimate(maxstacks);
-  cosmo_stack_unlock();
-  cosmo_stack_rehabilitate(stacks);
-}
-
-/**
- * Allocates stack memory.
- *
- * This is a caching stack allocator that's used by the POSIX threads
- * runtime but you may also find it useful for setcontext() coroutines
- * or sigaltstack(). Normally you can get away with using malloc() for
- * creating stacks. However some OSes (e.g. OpenBSD) forbid you from
- * doing that for anything except sigaltstack(). This API serves to
- * abstract all the gory details of gaining authorized memory, and
- * additionally implements caching for lightning fast performance.
- *
- * The stack size must be nonzero. It specifies the minimum amount of
- * stack space that will be available for use. The provided value is
- * rounded up to the system page size. It may be increased further for
- * various reasons. Your stack size parameter will be updated with the
- * chosen value upon success.
- *
- * The guard size specifies the minimum amount of memory that should be
- * protected beneath your stack. This helps ensure stack overflows cause
- * a segfault rather than corrupting memory silently. This may be set to
- * zero in which case no guard pages will be made. This value is rounded
- * up to the system page size. The corrected value will be returned upon
- * success. Your guard size needs to be small enough to leave room for
- * at least one memory page in your stack size i.e. `guardsize +
- * pagesize <= stacksize` must be the case. Otherwise this function will
- * return an `EINVAL` error.
- *
- * When you're done using your stack, pass it to cosmo_stack_free() so
- * it can be recycled. Stacks are only recycled when the `stacksize` and
- * `guardsize` parameters match the constraints described above. Stacks
- * that don't end up getting reused will be freed eventually, in a least
- * recently used way based upon your cosmo_stack_setmaxstacks() setting.
- *
- * This function returns 0 on success, or an errno on error. See the
- * documentation of mmap() for a list possible errors that may occur.
- */
-errno_t cosmo_stack_alloc(size_t *inout_stacksize,  //
-                          size_t *inout_guardsize,  //
-                          void **out_stackaddr) {
-
-  // validate arguments
-  size_t stacksize = *inout_stacksize;
-  size_t guardsize = *inout_guardsize;
-  stacksize = (stacksize + __pagesize - 1) & -__pagesize;
-  guardsize = (guardsize + __pagesize - 1) & -__pagesize;
-  if (!stacksize)
-    return EINVAL;
-
-  // recycle stack
-  void *stackaddr = 0;
-  cosmo_stack_lock();
-  for (struct Dll *e = dll_first(cosmo_stacks.stacks); e;
-       e = dll_next(cosmo_stacks.stacks, e)) {
-    struct CosmoStack *ts = THREADSTACK_CONTAINER(e);
-    if (ts->stacksize == stacksize &&  //
-        ts->guardsize == guardsize) {
-      stackaddr = ts->stackaddr;
-      stacksize = ts->stacksize;
-      guardsize = ts->guardsize;
-      dll_remove(&cosmo_stacks.stacks, e);
-      dll_make_first(&cosmo_stacks.objects, e);
-      --cosmo_stacks.count;
-      break;
-    }
-  }
-  cosmo_stack_unlock();
-
-  // create stack
-  if (!stackaddr) {
-    errno_t olde = errno;
-    if (!IsTiny() && IsLinux() && guardsize && !_weaken(cosmo_dlopen) &&
-        stacksize <= cosmo_stack_maxgrow() && !IsQemuUser()) {
-      // this special linux-only stack allocator significantly reduces
-      // the consumption of virtual memory.
-      if (!(stackaddr = slackmap(stacksize, guardsize))) {
-        errno_t err = errno;
-        errno = olde;
-        return err;
-      }
-    } else {
-      char *map = mmap(0, guardsize + stacksize, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-      if (map == MAP_FAILED) {
-        errno_t err = errno;
-        errno = olde;
-        return err;
-      }
-      stackaddr = map + guardsize;
-      if (IsOpenbsd())
-        if (!TellOpenbsdThisIsStackMemory(stackaddr, stacksize))
-          notpossible;
-      if (guardsize) {
-        if (mprotect(map, guardsize, PROT_NONE | PROT_GUARD)) {
-          errno_t err = errno;
-          munmap(map, guardsize + stacksize);
-          errno = olde;
-          return err;
-        }
-      }
-    }
-  }
-
-  // return stack
-  *inout_stacksize = stacksize;
-  *inout_guardsize = guardsize;
-  *out_stackaddr = stackaddr;
-  return 0;
-}
-
-static void cosmo_stack_setup(void) {
-  atexit(cosmo_stack_clear);
-}
-
-/**
- * Frees stack memory.
- *
- * While not strictly required, it's assumed the three parameters are
- * those returned by an earlier call to cosmo_stack_alloc(). If they
- * aren't page aligned and rounded, this function will return EINVAL.
- *
- * This function returns 0 on success, or an errno on error. The `errno`
- * variable is never clobbered. You can only dependably count on this to
- * return an error on failure when you say `cosmo_stack_setmaxstacks(0)`
- */
-errno_t cosmo_stack_free(void *stackaddr, size_t stacksize, size_t guardsize) {
-  if (!stacksize)
-    return EINVAL;
-  if (stacksize & (__pagesize - 1))
-    return EINVAL;
-  if (guardsize & (__pagesize - 1))
-    return EINVAL;
-  if ((uintptr_t)stackaddr & (__pagesize - 1))
-    return EINVAL;
-  cosmo_stack_lock();
-  struct Dll *surplus = 0;
-  if (cosmo_stacks_config.maxstacks) {
-    cosmo_once(&cosmo_stacks.once, cosmo_stack_setup);
-    surplus = cosmo_stack_decimate(cosmo_stacks_config.maxstacks - 1);
-    struct CosmoStack *ts = 0;
-    if (dll_is_empty(cosmo_stacks.objects))
-      cosmo_stack_populate();
-    struct Dll *e;
-    if ((e = dll_first(cosmo_stacks.objects))) {
-      dll_remove(&cosmo_stacks.objects, e);
-      ts = THREADSTACK_CONTAINER(e);
-    }
-    if (ts) {
-      ts->stackaddr = stackaddr;
-      ts->stacksize = stacksize;
-      ts->guardsize = guardsize;
-      dll_make_first(&cosmo_stacks.stacks, &ts->elem);
-      ++cosmo_stacks.count;
-      stackaddr = 0;
-    }
-  }
-  cosmo_stack_unlock();
-  cosmo_stack_rehabilitate(surplus);
-  errno_t err = 0;
-  if (stackaddr)
-    err = cosmo_stack_munmap(stackaddr, stacksize, guardsize);
-  return err;
-}
-
-relegated bool TellOpenbsdThisIsStackMemory(void *addr, size_t size) {
-  return __sys_mmap(
-             addr, size, PROT_READ | PROT_WRITE,
-             MAP_PRIVATE | MAP_FIXED | MAP_ANON_OPENBSD | MAP_STACK_OPENBSD, -1,
-             0, 0) == addr;
-}
-
-// OpenBSD only permits RSP to occupy memory that's been explicitly
-// defined as stack memory, i.e. `lo <= %rsp < hi` must be the case
-relegated bool FixupCustomStackOnOpenbsd(pthread_attr_t *attr) {
-
-  // get interval
-  uintptr_t lo = (uintptr_t)attr->__stackaddr;
-  uintptr_t hi = lo + attr->__stacksize;
-
-  // squeeze interval
-  lo = (lo + __pagesize - 1) & -__pagesize;
-  hi = hi & -__pagesize;
-
-  // tell os it's stack memory
-  if (!TellOpenbsdThisIsStackMemory((void *)lo, hi - lo))
-    return false;
-
-  // update attributes with usable stack address
-  attr->__stackaddr = (void *)lo;
-  attr->__stacksize = hi - lo;
-  return true;
-}
diff --git a/libc/intrin/stack.h b/libc/intrin/stack.h
deleted file mode 100644
index 282244547..000000000
--- a/libc/intrin/stack.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_STACK_H_
-#define COSMOPOLITAN_LIBC_STACK_H_
-#include "libc/thread/thread.h"
-COSMOPOLITAN_C_START_
-
-void cosmo_stack_lock(void);
-void cosmo_stack_unlock(void);
-void cosmo_stack_wipe(void);
-
-bool TellOpenbsdThisIsStackMemory(void *, size_t);
-bool FixupCustomStackOnOpenbsd(pthread_attr_t *);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_STACK_H_ */
diff --git a/libc/intrin/stackcall.S b/libc/intrin/stackcall.S
index 6e3658a47..6ad9bc8ec 100644
--- a/libc/intrin/stackcall.S
+++ b/libc/intrin/stackcall.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Calls function on different stack.
 //
diff --git a/libc/intrin/stackchkguard.S b/libc/intrin/stackchkguard.S
index f35484a8e..b78117a5d 100644
--- a/libc/intrin/stackchkguard.S
+++ b/libc/intrin/stackchkguard.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Canary for -fstack-protector.
 //
diff --git a/libc/intrin/stdio.c b/libc/intrin/stdio.c
deleted file mode 100644
index f487b0867..000000000
--- a/libc/intrin/stdio.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/intrin/atomic.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/weaken.h"
-#include "libc/mem/mem.h"
-#include "libc/stdio/internal.h"
-#include "libc/thread/posixthread.internal.h"
-
-#define STDIO_FILE_USE_AFTER_FREE 1
-#define CORRUPT_STDIO_FILE_OBJECT 1
-
-struct Stdio __stdio = {
-    .lock = PTHREAD_MUTEX_INITIALIZER,
-};
-
-void __stdio_lock(void) {
-  _pthread_mutex_lock(&__stdio.lock);
-}
-
-void __stdio_unlock(void) {
-  _pthread_mutex_unlock(&__stdio.lock);
-}
-
-static int refchk(int refs) {
-  unassert(refs != STDIO_FILE_USE_AFTER_FREE);
-  unassert(refs < CORRUPT_STDIO_FILE_OBJECT);
-  return refs;
-}
-
-void __stdio_ref(FILE *f) {
-  refchk(atomic_fetch_sub_explicit(&f->refs, 1, memory_order_relaxed));
-}
-
-static void __stdio_unref_impl(FILE *f, bool should_lock) {
-  int refs = atomic_load_explicit(&f->refs, memory_order_relaxed);
-  for (;;) {
-    refchk(refs);
-    if (refs) {
-      if (atomic_compare_exchange_strong_explicit(&f->refs, &refs, refs + 1,
-                                                  memory_order_acq_rel,
-                                                  memory_order_relaxed))
-        return;
-      continue;
-    }
-    if (should_lock) {
-      __stdio_lock();
-      if ((refs = atomic_load_explicit(&f->refs, memory_order_relaxed))) {
-        __stdio_unlock();
-        continue;
-      }
-    }
-    if (!atomic_compare_exchange_strong_explicit(
-            &f->refs, &refs, 1, memory_order_acq_rel, memory_order_relaxed)) {
-      if (should_lock)
-        __stdio_unlock();
-      continue;
-    }
-    dll_remove(&__stdio.files, &f->elem);
-    if (should_lock)
-      __stdio_unlock();
-    break;
-  }
-  if (_weaken(free)) {
-    _weaken(free)(f->getln);
-    if (f->freebuf)
-      _weaken(free)(f->buf);
-    if (f->freethis)
-      _weaken(free)(f);
-  }
-}
-
-void __stdio_unref(FILE *f) {
-  __stdio_unref_impl(f, true);
-}
-
-void __stdio_unref_unlocked(FILE *f) {
-  __stdio_unref_impl(f, false);
-}
diff --git a/libc/intrin/strace.h b/libc/intrin/strace.h
index b39a50709..3c521857f 100644
--- a/libc/intrin/strace.h
+++ b/libc/intrin/strace.h
@@ -5,19 +5,13 @@
 #define SYSDEBUG 0
 #endif
 
-#ifdef MODE_DBG
-#define _STRACE_VERBOSE 1
-#else
-#define _STRACE_VERBOSE 0
-#endif
-
-#define _NTTRACE    _STRACE_VERBOSE /* not configurable w/ flag yet */
-#define _KERNTRACE  _STRACE_VERBOSE /* not configurable w/ flag yet */
-#define _POLLTRACE  _STRACE_VERBOSE /* not configurable w/ flag yet */
-#define _LOCKTRACE  _STRACE_VERBOSE /* not configurable w/ flag yet */
-#define _DATATRACE  1               /* not configurable w/ flag yet */
-#define _STDIOTRACE 0               /* not configurable w/ flag yet */
-#define _TIMETRACE  0               /* not configurable w/ flag yet */
+#define _NTTRACE    0 /* not configurable w/ flag yet */
+#define _POLLTRACE  0 /* not configurable w/ flag yet */
+#define _DATATRACE  1 /* not configurable w/ flag yet */
+#define _LOCKTRACE  0 /* not configurable w/ flag yet */
+#define _STDIOTRACE 0 /* not configurable w/ flag yet */
+#define _KERNTRACE  0 /* not configurable w/ flag yet */
+#define _TIMETRACE  0 /* not configurable w/ flag yet */
 
 #define STRACE_PROLOGUE "%rSYS %6P %6H %'18T "
 
@@ -36,10 +30,9 @@ COSMOPOLITAN_C_START_
   ((void)(SYSDEBUG && _POLLTRACE && strace_enabled(0) > 0 && \
           (__stracef(STRACE_PROLOGUE FMT "\n", ##__VA_ARGS__), 0)))
 
-#define KERNTRACE(FMT, ...)                                                 \
-  ((void)(SYSDEBUG && _KERNTRACE && strace_enabled(0) > 0 &&                \
-          (__stracef(STRACE_PROLOGUE "\e[2m" FMT "\e[0m\n", ##__VA_ARGS__), \
-           0)))
+#define KERNTRACE(FMT, ...)                                  \
+  ((void)(SYSDEBUG && _KERNTRACE && strace_enabled(0) > 0 && \
+          (__stracef(STRACE_PROLOGUE FMT "\n", ##__VA_ARGS__), 0)))
 
 #define STDIOTRACE(FMT, ...)                                  \
   ((void)(SYSDEBUG && _STDIOTRACE && strace_enabled(0) > 0 && \
diff --git a/libc/intrin/strerror.c b/libc/intrin/strerror.c
index 89110783e..a465c5c7d 100644
--- a/libc/intrin/strerror.c
+++ b/libc/intrin/strerror.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/str/str.h"
 
 alignas(1) static char strerror_buf[128];
diff --git a/libc/intrin/strsignal_r.c b/libc/intrin/strsignal_r.c
index 24417b36a..325838bf8 100644
--- a/libc/intrin/strsignal_r.c
+++ b/libc/intrin/strsignal_r.c
@@ -36,10 +36,12 @@
 privileged const char *strsignal_r(int sig, char buf[21]) {
   char *p;
   const char *s;
-  if (!sig)
+  if (!sig) {
     return "0";
-  if ((s = GetMagnumStr(kSignalNames, sig)))
+  }
+  if ((s = GetMagnumStr(kSignalNames, sig))) {
     return s;
+  }
   if (SIGRTMIN <= sig && sig <= SIGRTMAX) {
     sig -= SIGRTMIN;
     buf[0] = 'S';
diff --git a/libc/intrin/sys_gettid.greg.c b/libc/intrin/sys_gettid.greg.c
index fbc4dadd0..408025bc0 100644
--- a/libc/intrin/sys_gettid.greg.c
+++ b/libc/intrin/sys_gettid.greg.c
@@ -25,10 +25,7 @@
 
 __msabi extern typeof(GetCurrentThreadId) *const __imp_GetCurrentThreadId;
 
-// it's important that this be noinstrument because the child process
-// created by fork() needs to update this value quickly, since ftrace
-// will deadlock __maps_lock() if the wrong tid is accidentally used.
-dontinstrument int sys_gettid(void) {
+int sys_gettid(void) {
   int64_t wut;
 #ifdef __x86_64__
   int tid;
diff --git a/libc/intrin/sys_sched_yield.S b/libc/intrin/sys_sched_yield.S
index f78f48712..eab709511 100644
--- a/libc/intrin/sys_sched_yield.S
+++ b/libc/intrin/sys_sched_yield.S
@@ -18,15 +18,15 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/sysv/consts/nr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Relinquishes scheduled quantum.
 //
 //	@return	0 on success, or -1 w/ errno
 sys_sched_yield:
-	beg
 #ifdef __x86_64__
-	pro
+	push	%rbp
+	mov	%rsp,%rbp
 	xor	%eax,%eax
 	mov	__hostos(%rip),%dl
 
@@ -84,16 +84,13 @@ sys_sched_yield:
 //	fails a positive or negative errno might get returned.
 #endif
 
-9:	epi
+9:	leave
 	ret
 
 #elif defined(__aarch64__)
 
 	stp	x29,x30,[sp,-32]!
 	mov	x29,sp
-	.cfi_adjust_cfa_offset 32
-	.cfi_rel_offset x29,16
-	.cfi_rel_offset x30,24
 	mov	x3,0
 	mov	x2,0
 	add	x4,sp,16
@@ -104,14 +101,10 @@ sys_sched_yield:
 	mov	x16,#0x5d			// select(0,0,0,0,&blah) for xnu
 	svc	0
 	ldp	x29,x30,[sp],32
-	.cfi_adjust_cfa_offset -32
-	.cfi_restore x30
-	.cfi_restore x29
 	ret
 
 #else
 #error "arch unsupported"
 #endif
-	end
 	.endfn	sys_sched_yield,globl
 	.previous
diff --git a/libc/intrin/sys_set_tls.S b/libc/intrin/sys_set_tls.S
index 4f8130521..e013014b5 100644
--- a/libc/intrin/sys_set_tls.S
+++ b/libc/intrin/sys_set_tls.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	we can't allow ftrace here since ftrace needs tls
 sys_set_tls:
diff --git a/libc/intrin/sys_umtx_timedwait_uint.c b/libc/intrin/sys_umtx_timedwait_uint.c
index f82c2898c..9bcc7c595 100644
--- a/libc/intrin/sys_umtx_timedwait_uint.c
+++ b/libc/intrin/sys_umtx_timedwait_uint.c
@@ -23,7 +23,7 @@
 int sys_umtx_timedwait_uint_cp(atomic_int *, int, int, size_t,
                                struct _umtx_time *) asm("sys_futex_cp");
 
-int sys_umtx_timedwait_uint(atomic_int *p, int expect, bool pshare, int clock,
+int sys_umtx_timedwait_uint(atomic_int *p, int expect, bool pshare,
                             const struct timespec *abstime) {
   int op;
   size_t size;
@@ -32,7 +32,7 @@ int sys_umtx_timedwait_uint(atomic_int *p, int expect, bool pshare, int clock,
     tm_p = 0;
     size = 0;
   } else {
-    timo._clockid = clock;
+    timo._clockid = CLOCK_REALTIME;
     timo._flags = UMTX_ABSTIME;
     timo._timeout = *abstime;
     tm_p = &timo;
diff --git a/libc/intrin/terminatethisprocess.c b/libc/intrin/terminatethisprocess.c
index 2f61cdb27..cff6d6e79 100644
--- a/libc/intrin/terminatethisprocess.c
+++ b/libc/intrin/terminatethisprocess.c
@@ -16,38 +16,17 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
-#include "libc/calls/sig.internal.h"
-#include "libc/limits.h"
-#include "libc/nt/files.h"
-#include "libc/nt/memory.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/thunk/msabi.h"
-#include "libc/runtime/internal.h"
-#ifdef __x86_64__
 
-__msabi extern typeof(DeleteFile) *const __imp_DeleteFileW;
 __msabi extern typeof(TerminateProcess) *const __imp_TerminateProcess;
-__msabi extern typeof(UnmapViewOfFile) *const __imp_UnmapViewOfFile;
 
 /**
  * Terminates the calling process and all of its threads.
  */
 textwindows dontinstrument void TerminateThisProcess(uint32_t dwWaitStatus) {
-
-  // delete sig file
-  char16_t path[128];
-  atomic_ulong *real;
-  atomic_ulong fake = 0;
-  real = __sig.process;
-  __sig.process = &fake;
-  __imp_UnmapViewOfFile(real);
-  __imp_DeleteFileW(__sig_process_path(path, __pid, false));
-
   // "When a process terminates itself, TerminateProcess stops execution
   // of the calling thread and does not return." -Quoth MSDN
   __imp_TerminateProcess(-1, dwWaitStatus);
   __builtin_unreachable();
 }
-
-#endif /* __x86_64__ */
diff --git a/libc/intrin/tls.c b/libc/intrin/tls.c
deleted file mode 100644
index 3a6d82db2..000000000
--- a/libc/intrin/tls.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/thread/tls.h"
-#include "libc/dce.h"
-
-/**
- * Returns location of thread information block.
- *
- * This should be favored over __get_tls() for .privileged code that
- * can't be self-modified by __enable_tls().
- */
-privileged optimizespeed struct CosmoTib *__get_tls_privileged(void) {
-#if defined(__x86_64__)
-  char *tib, *lin = (char *)0x30;
-  if (IsNetbsd() || IsOpenbsd()) {
-    asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
-  } else {
-    asm("mov\t%%gs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
-    if (IsWindows())
-      tib = *(char **)(tib + 0x1480 + __tls_index * 8);
-  }
-  return (struct CosmoTib *)tib;
-#elif defined(__aarch64__)
-  return __get_tls();
-#endif
-}
-
-#if defined(__x86_64__)
-privileged optimizespeed struct CosmoTib *__get_tls_win32(void) {
-  char *tib, *lin = (char *)0x30;
-  asm("mov\t%%gs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
-  tib = *(char **)(tib + 0x1480 + __tls_index * 8);
-  return (struct CosmoTib *)tib;
-}
-privileged void __set_tls_win32(void *tls) {
-  asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tls));
-}
-#endif
diff --git a/libc/intrin/tree.c b/libc/intrin/tree.c
index 2c3e3fecc..23e25f7f5 100644
--- a/libc/intrin/tree.c
+++ b/libc/intrin/tree.c
@@ -54,8 +54,7 @@ struct Tree *tree_prev(struct Tree *node) {
   return parent;
 }
 
-dontinstrument static void tree_rotate_left(struct Tree **root,
-                                            struct Tree *x) {
+static void tree_rotate_left(struct Tree **root, struct Tree *x) {
   struct Tree *y = x->right;
   x->right = tree_get_left(y);
   if (tree_get_left(y))
@@ -72,8 +71,7 @@ dontinstrument static void tree_rotate_left(struct Tree **root,
   x->parent = y;
 }
 
-dontinstrument static void tree_rotate_right(struct Tree **root,
-                                             struct Tree *y) {
+static void tree_rotate_right(struct Tree **root, struct Tree *y) {
   struct Tree *x = tree_get_left(y);
   tree_set_left(y, x->right);
   if (x->right)
@@ -90,8 +88,7 @@ dontinstrument static void tree_rotate_right(struct Tree **root,
   x->right = y;
 }
 
-dontinstrument static void tree_rebalance_insert(struct Tree **root,
-                                                 struct Tree *node) {
+static void tree_rebalance_insert(struct Tree **root, struct Tree *node) {
   struct Tree *uncle;
   tree_set_red(node, 1);
   while (node != *root && tree_get_red(node->parent)) {
@@ -160,8 +157,8 @@ void tree_insert(struct Tree **root, struct Tree *node, tree_cmp_f *cmp) {
   }
 }
 
-dontinstrument static void tree_transplant(struct Tree **root, struct Tree *u,
-                                           struct Tree *v) {
+static void tree_transplant(struct Tree **root, struct Tree *u,
+                            struct Tree *v) {
   if (!u->parent) {
     *root = v;
   } else if (u == tree_get_left(u->parent)) {
@@ -173,9 +170,8 @@ dontinstrument static void tree_transplant(struct Tree **root, struct Tree *u,
     v->parent = u->parent;
 }
 
-dontinstrument static void tree_rebalance_remove(struct Tree **root,
-                                                 struct Tree *node,
-                                                 struct Tree *parent) {
+static void tree_rebalance_remove(struct Tree **root, struct Tree *node,
+                                  struct Tree *parent) {
   struct Tree *sibling;
   while (node != *root && (!node || !tree_get_red(node))) {
     if (node == tree_get_left(parent)) {
diff --git a/libc/intrin/typeinfo.S b/libc/intrin/typeinfo.S
new file mode 100644
index 000000000..e195c73c3
--- /dev/null
+++ b/libc/intrin/typeinfo.S
@@ -0,0 +1,26 @@
+/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
+│ vi: set noet ft=asm ts=8 sw=8 fenc=utf-8                                 :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/macros.internal.h"
+
+//	__cxxabiv1::__function_type_info (?)
+//	Because Clang in MODE=dbg doesn't respect -fno-rtti
+	.balign	8
+_ZTVN10__cxxabiv120__function_type_infoE:
+	.quad	0
+	.endobj	_ZTVN10__cxxabiv120__function_type_infoE,globl
diff --git a/libc/intrin/ubsan.c b/libc/intrin/ubsan.c
index bef84f828..e6107594a 100644
--- a/libc/intrin/ubsan.c
+++ b/libc/intrin/ubsan.c
@@ -18,7 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/ubsan.h"
 #include "libc/calls/calls.h"
-#include "libc/intrin/describebacktrace.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/pushpop.h"
 #include "libc/intrin/strace.h"
@@ -32,7 +31,6 @@
 #include "libc/nt/runtime.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
-#include "libc/runtime/symbols.internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/fileno.h"
@@ -242,20 +240,16 @@ __wur static __ubsan_die_f *__ubsan_die(void) {
 
 static void __ubsan_warning(const struct UbsanSourceLocation *loc,
                             const char *description) {
-  kprintf("%s:%d: %subsan warning: %s is undefined behavior%s\n"
-          "cosmoaddr2line %s %s\n",
-          loc->file, loc->line, SUBTLE, description, RESET, __argv[0],
-          DescribeBacktrace(__builtin_frame_address(0)));
+  kprintf("%s:%d: %subsan warning: %s is undefined behavior%s\n", loc->file,
+          loc->line, SUBTLE, description, RESET);
   if (__ubsan_strict)
     __ubsan_die()();
 }
 
 __wur __ubsan_die_f *__ubsan_abort(const struct UbsanSourceLocation *loc,
                                    const char *description) {
-  kprintf("\n%s:%d: %subsan error%s: %s (tid %d)\n"
-          "cosmoaddr2line %s %s\n",
-          loc->file, loc->line, RED2, RESET, description, gettid(), __argv[0],
-          DescribeBacktrace(__builtin_frame_address(0)));
+  kprintf("\n%s:%d: %subsan error%s: %s (tid %d)\n", loc->file, loc->line, RED2,
+          RESET, description, gettid());
   return __ubsan_die();
 }
 
diff --git a/libc/intrin/ulock.c b/libc/intrin/ulock.c
index f4da16d18..906f96ecc 100644
--- a/libc/intrin/ulock.c
+++ b/libc/intrin/ulock.c
@@ -17,12 +17,12 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/ulock.h"
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
 #include "libc/calls/syscall_support-sysv.internal.h"
-#include "libc/errno.h"
+#include "libc/dce.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
-#include "libc/intrin/ulock.h"
 
 // XNU futexes
 // https://opensource.apple.com/source/xnu/xnu-7195.50.7.100.1/bsd/sys/ulock.h.auto.html
@@ -32,26 +32,6 @@ int sys_ulock_wait(uint32_t operation, void *addr, uint64_t value,
                    uint32_t timeout_micros) asm("sys_futex_cp");
 
 // returns number of other waiters, or -1 w/ errno
-//
-// - EINTR means a signal handler was called. This is how we support
-//   things like POSIX thread cancelation.
-//
-// - EFAULT if XNU couldn't read `addr`. This is normally considered a
-//   programming error, but with ulock it can actually be a transient
-//   error due to low memory conditions. Apple recommends retrying.
-//
-// - ENOMEM means XNU wasn't able to allocate memory for kernel internal
-//   data structures. Apple doesn't provide any advice on what to do. We
-//   simply turn this into EAGAIN.
-//
-// - EAGAIN if XNU told us EFAULT but cosmo believes the address exists.
-//   This value is also used as a substitute for ENOMEM.
-//
-// - EINVAL could mean operation is invalid, addr is null or misaligned;
-//   it could also mean another thread calling ulock on this address was
-//   configured (via operation) in an inconsistent way.
-//
-// see also os_sync_wait_on_address.h from xcode sdk
 int ulock_wait(uint32_t operation, void *addr, uint64_t value,
                uint32_t timeout_micros) {
   int rc;
@@ -59,31 +39,13 @@ int ulock_wait(uint32_t operation, void *addr, uint64_t value,
   LOCKTRACE("ulock_wait(%#x, %p, %lx, %u) → ...", operation, addr, value,
             timeout_micros);
   rc = sys_ulock_wait(operation, addr, value, timeout_micros);
-  if (rc == -1) {
-    if (errno == ENOMEM)
-      errno = EAGAIN;
-    if (errno == EFAULT)
-      if (!kisdangerous(addr))
-        errno = EAGAIN;
-  }
   LOCKTRACE("ulock_wait(%#x, %p, %lx, %u) → %d% m", operation, addr, value,
             timeout_micros, rc);
   return rc;
 }
 
 // returns -errno
-//
-// - ENOENT means there wasn't anyone to wake
-//
-// - EINVAL could mean operation is invalid, addr is null or misaligned;
-//   it could also mean another thread calling ulock on this address was
-//   configured (via operation) in an inconsistent way.
-//
-// should be dontinstrument because SiliconThreadMain() calls this from
-// a stack managed by apple libc.
-//
-dontinstrument int ulock_wake(uint32_t operation, void *addr,
-                              uint64_t wake_value) {
+int ulock_wake(uint32_t operation, void *addr, uint64_t wake_value) {
   int rc;
   rc = __syscall3i(operation, (long)addr, wake_value, 0x2000000 | 516);
   LOCKTRACE("ulock_wake(%#x, %p, %lx) → %s", operation, addr, wake_value,
diff --git a/libc/intrin/umask.c b/libc/intrin/umask.c
index 2bec074a8..c3af0d52c 100644
--- a/libc/intrin/umask.c
+++ b/libc/intrin/umask.c
@@ -19,4 +19,4 @@
 #include "libc/atomic.h"
 #include "libc/calls/internal.h"
 
-atomic_int __umask = 0777;
+atomic_int __umask;
diff --git a/libc/intrin/virtualallocex.c b/libc/intrin/virtualallocex.c
deleted file mode 100644
index 77e938819..000000000
--- a/libc/intrin/virtualallocex.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/strace.h"
-#include "libc/nt/memory.h"
-#include "libc/nt/thunk/msabi.h"
-
-__msabi extern typeof(VirtualAllocEx) *const __imp_VirtualAllocEx;
-
-/**
- * Allocates memory on The New Technology.
- */
-textwindows void *VirtualAllocEx(int64_t hProcess, void *lpAddress,
-                                 uint64_t dwSize, uint32_t flAllocationType,
-                                 uint32_t flProtect) {
-  void *res = __imp_VirtualAllocEx(hProcess, lpAddress, dwSize,
-                                   flAllocationType, flProtect);
-  if (!res)
-    __winerr();
-  NTTRACE("VirtualAllocEx(%ld, %p, %'lu, %s, %s) → %p% m", hProcess, lpAddress,
-          dwSize, DescribeNtAllocationType(flAllocationType),
-          DescribeNtPageFlags(flProtect), res);
-  return res;
-}
diff --git a/libc/intrin/virtualmax.c b/libc/intrin/virtualmax.c
index e6b5b1888..4f24070e2 100644
--- a/libc/intrin/virtualmax.c
+++ b/libc/intrin/virtualmax.c
@@ -19,3 +19,4 @@
 #include "libc/runtime/runtime.h"
 
 size_t __virtualmax = -1;
+size_t __virtualsize = 0;
diff --git a/libc/intrin/virtualprotect.c b/libc/intrin/virtualprotect.c
index 5f653afff..4b1aaa1a0 100644
--- a/libc/intrin/virtualprotect.c
+++ b/libc/intrin/virtualprotect.c
@@ -16,8 +16,13 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/syscall_support-nt.internal.h"
+#include "libc/intrin/describeflags.h"
+#include "libc/intrin/strace.h"
+#include "libc/log/libfatal.internal.h"
 #include "libc/nt/memory.h"
-#include "libc/nt/runtime.h"
+
+__msabi extern typeof(VirtualProtect) *const __imp_VirtualProtect;
 
 /**
  * Protects memory on the New Technology.
@@ -26,6 +31,12 @@
 textwindows bool32 VirtualProtect(void *lpAddress, uint64_t dwSize,
                                   uint32_t flNewProtect,
                                   uint32_t *lpflOldProtect) {
-  return VirtualProtectEx(GetCurrentProcess(), lpAddress, dwSize, flNewProtect,
-                          lpflOldProtect);
+  bool32 bOk;
+  bOk = __imp_VirtualProtect(lpAddress, dwSize, flNewProtect, lpflOldProtect);
+  if (!bOk)
+    __winerr();
+  NTTRACE("VirtualProtect(%p, %'zu, %s, [%s]) → %hhhd% m", lpAddress, dwSize,
+          DescribeNtPageFlags(flNewProtect),
+          DescribeNtPageFlags(*lpflOldProtect), bOk);
+  return bOk;
 }
diff --git a/libc/intrin/virtualprotectex.c b/libc/intrin/virtualprotectex.c
deleted file mode 100644
index 44615c730..000000000
--- a/libc/intrin/virtualprotectex.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/describeflags.h"
-#include "libc/intrin/strace.h"
-#include "libc/log/libfatal.internal.h"
-#include "libc/nt/memory.h"
-
-__msabi extern typeof(VirtualProtectEx) *const __imp_VirtualProtectEx;
-
-/**
- * Protects memory on the New Technology.
- * @note this wrapper takes care of ABI, STRACE(), and __winerr()
- */
-textwindows bool32 VirtualProtectEx(int64_t hProcess, void *lpAddress,
-                                    uint64_t dwSize, uint32_t flNewProtect,
-                                    uint32_t *lpflOldProtect) {
-  bool32 bOk;
-  bOk = __imp_VirtualProtectEx(hProcess, lpAddress, dwSize, flNewProtect,
-                               lpflOldProtect);
-  if (!bOk)
-    __winerr();
-  NTTRACE("VirtualProtectEx(%ld, %p, %'zu, %s, [%s]) → %hhhd% m", hProcess,
-          lpAddress, dwSize, DescribeNtPageFlags(flNewProtect),
-          DescribeNtPageFlags(*lpflOldProtect), bOk);
-  return bOk;
-}
diff --git a/libc/intrin/winerr.greg.c b/libc/intrin/winerr.greg.c
index 68abab78e..b960296a1 100644
--- a/libc/intrin/winerr.greg.c
+++ b/libc/intrin/winerr.greg.c
@@ -24,7 +24,7 @@
 #include "libc/nt/runtime.h"
 #include "libc/sock/internal.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 
 /**
  * Return path for failed Win32 API calls.
@@ -32,7 +32,7 @@
  * @return -1 w/ few exceptions
  * @note this is a code-size saving device
  */
-privileged optimizesize int64_t __winerr(void) {
+privileged int64_t __winerr(void) {
   errno_t e;
   if (IsWindows()) {
     e = __dos2errno(__imp_GetLastError());
diff --git a/libc/intrin/wintlsinit.c b/libc/intrin/wintlsinit.c
index eb19331ff..599bffb13 100644
--- a/libc/intrin/wintlsinit.c
+++ b/libc/intrin/wintlsinit.c
@@ -16,13 +16,12 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/atomic.h"
 #include "libc/log/libfatal.internal.h"
 #include "libc/nt/thread.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/runtime/runtime.h"
 #include "libc/thread/tls.h"
-#ifdef __x86_64__
+#include "libc/thread/tls2.internal.h"
 
 __msabi extern typeof(GetCurrentThreadId) *const __imp_GetCurrentThreadId;
 
@@ -35,14 +34,10 @@ textwindows dontinstrument void __bootstrap_tls(struct CosmoTib *tib,
   tib->tib_self = tib;
   tib->tib_self2 = tib;
   tib->tib_sigmask = -1;
-  tib->tib_strace = -100;
-  tib->tib_ftrace = -100;
+  tib->tib_strace = __strace;
+  tib->tib_ftrace = __ftrace;
   tib->tib_sigstack_size = 57344;
   tib->tib_sigstack_addr = bp - 57344;
-  int tid = __imp_GetCurrentThreadId();
-  atomic_init(&tib->tib_ptid, tid);
-  atomic_init(&tib->tib_ctid, tid);
+  tib->tib_tid = __imp_GetCurrentThreadId();
   __set_tls_win32(tib);
 }
-
-#endif /* __x86_64__ */
diff --git a/libc/intrin/writeprocessmemory.c b/libc/intrin/writeprocessmemory.c
deleted file mode 100644
index ec99b583b..000000000
--- a/libc/intrin/writeprocessmemory.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/intrin/strace.h"
-#include "libc/nt/memory.h"
-#include "libc/nt/thunk/msabi.h"
-
-__msabi extern typeof(WriteProcessMemory) *const __imp_WriteProcessMemory;
-
-bool32 WriteProcessMemory(int64_t hProcess, void *lpBaseAddress,
-                          const void *lpBuffer, uint64_t nSize,
-                          uint64_t *opt_out_lpNumberOfBytesWritten) {
-  bool32 ok = __imp_WriteProcessMemory(hProcess, lpBaseAddress, lpBuffer, nSize,
-                                       opt_out_lpNumberOfBytesWritten);
-  if (!ok)
-    __winerr();
-  NTTRACE("WriteProcessMemory(%ld, %p, %p, %'lu, %p) → %hhhd% m", hProcess,
-          lpBaseAddress, lpBuffer, nSize, opt_out_lpNumberOfBytesWritten, ok);
-  return ok;
-}
diff --git a/libc/intrin/wsarecv.c b/libc/intrin/wsarecv.c
index 62c489e0f..e4fe65f11 100644
--- a/libc/intrin/wsarecv.c
+++ b/libc/intrin/wsarecv.c
@@ -59,8 +59,8 @@ textwindows int WSARecv(
   }
   if (UNLIKELY(__strace > 0) && strace_enabled(0) > 0) {
     kprintf(STRACE_PROLOGUE "WSARecv(%lu, [", s);
-    _DescribeIovNt(inout_lpBuffers, dwBufferCount,
-                   rc != -1 ? NumberOfBytesRecvd : 0);
+    DescribeIovNt(inout_lpBuffers, dwBufferCount,
+                  rc != -1 ? NumberOfBytesRecvd : 0);
     kprintf("], %u, [%'u], %p, %s, %p) → %d% lm\n", dwBufferCount,
             NumberOfBytesRecvd, inout_lpFlags,
             DescribeNtOverlapped(opt_inout_lpOverlapped),
diff --git a/libc/intrin/wsarecvfrom.c b/libc/intrin/wsarecvfrom.c
index 0885c0eec..170eb9977 100644
--- a/libc/intrin/wsarecvfrom.c
+++ b/libc/intrin/wsarecvfrom.c
@@ -23,7 +23,6 @@
 #include "libc/intrin/likely.h"
 #include "libc/intrin/strace.h"
 #include "libc/nt/runtime.h"
-#include "libc/nt/struct/iovec.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
 #include "libc/runtime/runtime.h"
@@ -55,8 +54,8 @@ textwindows int WSARecvFrom(
   }
   if (UNLIKELY(__strace > 0) && strace_enabled(0) > 0) {
     kprintf(STRACE_PROLOGUE "WSARecvFrom(%lu, [", s);
-    _DescribeIovNt(inout_lpBuffers, dwBufferCount,
-                   rc != -1 ? NumberOfBytesRecvd : 0);
+    DescribeIovNt(inout_lpBuffers, dwBufferCount,
+                  rc != -1 ? NumberOfBytesRecvd : 0);
     kprintf("], %u, [%'u], %p, %p, %p, %s, %p) → %d %d\n", dwBufferCount,
             NumberOfBytesRecvd, opt_out_fromsockaddr, opt_inout_fromsockaddrlen,
             inout_lpFlags, DescribeNtOverlapped(opt_inout_lpOverlapped),
diff --git a/libc/intrin/x86.c b/libc/intrin/x86.c
index a86531533..0ee34f136 100644
--- a/libc/intrin/x86.c
+++ b/libc/intrin/x86.c
@@ -381,13 +381,6 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family,
   const char *CPU = 0;
 
   switch (Family) {
-    case 15:
-      if (testFeature(FEATURE_SSE3)) {
-        CPU = "k8-sse3";
-        break;
-      }
-      CPU = "k8";
-      break;
     case 16:
       CPU = "amdfam10";
       *Type = AMDFAM10H;
@@ -542,15 +535,13 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
     setFeature(FEATURE_AES);
   if ((ECX >> 29) & 1)
     setFeature(FEATURE_F16C);
-  if ((ECX >> 30) & 1)
-    setFeature(FEATURE_RDRND);
 
   // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
   // indicates that the AVX registers will be saved and restored on context
   // switch, then we have full AVX support.
   const unsigned AVXBits = (1 << 27) | (1 << 28);
-  bool HasAVXSave = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) &&
-                    ((EAX & 0x6) == 0x6);
+  bool HasAVX = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) &&
+                ((EAX & 0x6) == 0x6);
 #if defined(__APPLE__)
   // Darwin lazily saves the AVX512 context on first use: trust that the OS will
   // save the AVX512 context if we use AVX512 instructions, even the bit is not
@@ -558,174 +549,71 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   bool HasAVX512Save = true;
 #else
   // AVX512 requires additional context to be saved by the OS.
-  bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0);
+  bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0);
 #endif
-  // AMX requires additional context to be saved by the OS.
-  const unsigned AMXBits = (1 << 17) | (1 << 18);
-  bool HasXSave = ((ECX >> 27) & 1) && !getX86XCR0(&EAX, &EDX);
-  bool HasAMXSave = HasXSave && ((EAX & AMXBits) == AMXBits);
 
-  if (HasAVXSave)
+  if (HasAVX)
     setFeature(FEATURE_AVX);
 
   bool HasLeaf7 =
       MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
 
-  if (HasLeaf7 && ((EBX >> 0) & 1))
-    setFeature(FEATURE_FSGSBASE);
-  if (HasLeaf7 && ((EBX >> 2) & 1))
-    setFeature(FEATURE_SGX);
-  if (HasLeaf7 && ((EBX >> 3) & 1))
-    setFeature(FEATURE_BMI);
-  if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVXSave)
-    setFeature(FEATURE_AVX2);
-  if (HasLeaf7 && ((EBX >> 8) & 1))
-    setFeature(FEATURE_BMI2);
-  if (HasLeaf7 && ((EBX >> 11) & 1))
-    setFeature(FEATURE_RTM);
-  if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512F);
-  if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512DQ);
-  if (HasLeaf7 && ((EBX >> 18) & 1))
-    setFeature(FEATURE_RDSEED);
-  if (HasLeaf7 && ((EBX >> 19) & 1))
-    setFeature(FEATURE_ADX);
-  if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512IFMA);
-  if (HasLeaf7 && ((EBX >> 24) & 1))
-    setFeature(FEATURE_CLWB);
-  if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512PF);
-  if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512ER);
-  if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512CD);
-  if (HasLeaf7 && ((EBX >> 29) & 1))
-    setFeature(FEATURE_SHA);
-  if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512BW);
-  if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512VL);
-
-  if (HasLeaf7 && ((ECX >> 0) & 1))
-    setFeature(FEATURE_PREFETCHWT1);
-  if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512VBMI);
-  if (HasLeaf7 && ((ECX >> 4) & 1))
-    setFeature(FEATURE_PKU);
-  if (HasLeaf7 && ((ECX >> 5) & 1))
-    setFeature(FEATURE_WAITPKG);
-  if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512VBMI2);
-  if (HasLeaf7 && ((ECX >> 7) & 1))
-    setFeature(FEATURE_SHSTK);
-  if (HasLeaf7 && ((ECX >> 8) & 1))
-    setFeature(FEATURE_GFNI);
-  if (HasLeaf7 && ((ECX >> 9) & 1) && HasAVXSave)
-    setFeature(FEATURE_VAES);
-  if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVXSave)
-    setFeature(FEATURE_VPCLMULQDQ);
-  if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512VNNI);
-  if (HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512BITALG);
-  if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512VPOPCNTDQ);
-  if (HasLeaf7 && ((ECX >> 22) & 1))
-    setFeature(FEATURE_RDPID);
-  if (HasLeaf7 && ((ECX >> 23) & 1))
-    setFeature(FEATURE_KL);
-  if (HasLeaf7 && ((ECX >> 25) & 1))
-    setFeature(FEATURE_CLDEMOTE);
-  if (HasLeaf7 && ((ECX >> 27) & 1))
-    setFeature(FEATURE_MOVDIRI);
-  if (HasLeaf7 && ((ECX >> 28) & 1))
-    setFeature(FEATURE_MOVDIR64B);
-  if (HasLeaf7 && ((ECX >> 29) & 1))
-    setFeature(FEATURE_ENQCMD);
-
-  if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX5124VNNIW);
-  if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX5124FMAPS);
-  if (HasLeaf7 && ((EDX >> 5) & 1))
-    setFeature(FEATURE_UINTR);
-  if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512VP2INTERSECT);
-  if (HasLeaf7 && ((EDX >> 14) & 1))
-    setFeature(FEATURE_SERIALIZE);
-  if (HasLeaf7 && ((EDX >> 16) & 1))
-    setFeature(FEATURE_TSXLDTRK);
-  if (HasLeaf7 && ((EDX >> 18) & 1))
-    setFeature(FEATURE_PCONFIG);
-  if (HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave)
-    setFeature(FEATURE_AMX_BF16);
-  if (HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save)
-    setFeature(FEATURE_AVX512FP16);
-  if (HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave)
-    setFeature(FEATURE_AMX_TILE);
-  if (HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave)
-    setFeature(FEATURE_AMX_INT8);
+  if (HasLeaf7) {
+    if ((EBX >> 3) & 1)
+      setFeature(FEATURE_BMI);
+    if (((EBX >> 5) & 1) && HasAVX)
+      setFeature(FEATURE_AVX2);
+    if ((EBX >> 8) & 1)
+      setFeature(FEATURE_BMI2);
+    if (HasAVX512Save) {
+      if ((EBX >> 16) & 1)
+        setFeature(FEATURE_AVX512F);
+      if ((EBX >> 17) & 1)
+        setFeature(FEATURE_AVX512DQ);
+      if ((EBX >> 21) & 1)
+        setFeature(FEATURE_AVX512IFMA);
+      if ((EBX >> 26) & 1)
+        setFeature(FEATURE_AVX512PF);
+      if ((EBX >> 27) & 1)
+        setFeature(FEATURE_AVX512ER);
+      if ((EBX >> 28) & 1)
+        setFeature(FEATURE_AVX512CD);
+      if ((EBX >> 30) & 1)
+        setFeature(FEATURE_AVX512BW);
+      if ((EBX >> 31) & 1)
+        setFeature(FEATURE_AVX512VL);
+      if ((ECX >> 1) & 1)
+        setFeature(FEATURE_AVX512VBMI);
+      if ((ECX >> 6) & 1)
+        setFeature(FEATURE_AVX512VBMI2);
+      if ((ECX >> 11) & 1)
+        setFeature(FEATURE_AVX512VNNI);
+      if ((ECX >> 12) & 1)
+        setFeature(FEATURE_AVX512BITALG);
+      if ((ECX >> 14) & 1)
+        setFeature(FEATURE_AVX512VPOPCNTDQ);
+      if ((EDX >> 2) & 1)
+        setFeature(FEATURE_AVX5124VNNIW);
+      if ((EDX >> 3) & 1)
+        setFeature(FEATURE_AVX5124FMAPS);
+      if ((EDX >> 8) & 1)
+        setFeature(FEATURE_AVX512VP2INTERSECT);
+      if ((EDX >> 23) & 1)
+        setFeature(FEATURE_AVX512FP16);
+    }
+    if ((ECX >> 8) & 1)
+      setFeature(FEATURE_GFNI);
+    if (((ECX >> 10) & 1) && HasAVX)
+      setFeature(FEATURE_VPCLMULQDQ);
+  }
 
   // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't
   // return all 0s for invalid subleaves so check the limit.
   bool HasLeaf7Subleaf1 =
       HasLeaf7 && EAX >= 1 &&
       !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
-  if (HasLeaf7Subleaf1 && ((EAX >> 0) & 1))
-    setFeature(FEATURE_SHA512);
-  if (HasLeaf7Subleaf1 && ((EAX >> 1) & 1))
-    setFeature(FEATURE_SM3);
-  if (HasLeaf7Subleaf1 && ((EAX >> 2) & 1))
-    setFeature(FEATURE_SM4);
-  if (HasLeaf7Subleaf1 && ((EAX >> 3) & 1))
-    setFeature(FEATURE_RAOINT);
-  if (HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave)
-    setFeature(FEATURE_AVXVNNI);
   if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX512BF16);
-  if (HasLeaf7Subleaf1 && ((EAX >> 7) & 1))
-    setFeature(FEATURE_CMPCCXADD);
-  if (HasLeaf7Subleaf1 && ((EAX >> 21) & 1) && HasAMXSave)
-    setFeature(FEATURE_AMX_FP16);
-  if (HasLeaf7Subleaf1 && ((EAX >> 22) & 1))
-    setFeature(FEATURE_HRESET);
-  if (HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave)
-    setFeature(FEATURE_AVXIFMA);
-
-  if (HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave)
-    setFeature(FEATURE_AVXVNNIINT8);
-  if (HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave)
-    setFeature(FEATURE_AVXNECONVERT);
-  if (HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave)
-    setFeature(FEATURE_AMX_COMPLEX);
-  if (HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave)
-    setFeature(FEATURE_AVXVNNIINT16);
-  if (HasLeaf7Subleaf1 && ((EDX >> 14) & 1))
-    setFeature(FEATURE_PREFETCHI);
-  if (HasLeaf7Subleaf1 && ((EDX >> 15) & 1))
-    setFeature(FEATURE_USERMSR);
-  if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1))
-    setFeature(FEATURE_AVX10_1_256);
-  if (HasLeaf7Subleaf1 && ((EDX >> 21) & 1))
-    setFeature(FEATURE_APXF);
-
-  unsigned MaxLevel;
-  getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX);
-  bool HasLeafD = MaxLevel >= 0xd &&
-                  !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
-  if (HasLeafD && ((EAX >> 0) & 1) && HasAVXSave)
-    setFeature(FEATURE_XSAVEOPT);
-  if (HasLeafD && ((EAX >> 1) & 1) && HasAVXSave)
-    setFeature(FEATURE_XSAVEC);
-  if (HasLeafD && ((EAX >> 3) & 1) && HasAVXSave)
-    setFeature(FEATURE_XSAVES);
-
-  bool HasLeaf24 =
-      MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
-  if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1) && HasLeaf24 && ((EBX >> 18) & 1))
-    setFeature(FEATURE_AVX10_1_512);
 
   unsigned MaxExtLevel;
   getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -739,40 +627,14 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
       setFeature(FEATURE_LZCNT);
     if (((ECX >> 6) & 1))
       setFeature(FEATURE_SSE4_A);
-    if (((ECX >> 8) & 1))
-      setFeature(FEATURE_PRFCHW);
     if (((ECX >> 11) & 1))
       setFeature(FEATURE_XOP);
-    if (((ECX >> 15) & 1))
-      setFeature(FEATURE_LWP);
     if (((ECX >> 16) & 1))
       setFeature(FEATURE_FMA4);
-    if (((ECX >> 21) & 1))
-      setFeature(FEATURE_TBM);
-    if (((ECX >> 29) & 1))
-      setFeature(FEATURE_MWAITX);
-
     if (((EDX >> 29) & 1))
       setFeature(FEATURE_LM);
   }
 
-  bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 &&
-                     !getX86CpuIDAndInfo(0x80000008, &EAX, &EBX, &ECX, &EDX);
-  if (HasExtLeaf8 && ((EBX >> 0) & 1))
-    setFeature(FEATURE_CLZERO);
-  if (HasExtLeaf8 && ((EBX >> 9) & 1))
-    setFeature(FEATURE_WBNOINVD);
-
-  bool HasLeaf14 = MaxLevel >= 0x14 &&
-                   !getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX);
-  if (HasLeaf14 && ((EBX >> 4) & 1))
-    setFeature(FEATURE_PTWRITE);
-
-  bool HasLeaf19 =
-      MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX);
-  if (HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1))
-    setFeature(FEATURE_WIDEKL);
-
   if (hasFeature(FEATURE_LM) && hasFeature(FEATURE_SSE2)) {
     setFeature(FEATURE_X86_64_BASELINE);
     if (hasFeature(FEATURE_CMPXCHG16B) && hasFeature(FEATURE_POPCNT) &&
diff --git a/libc/intrin/x86.h b/libc/intrin/x86.h
index 9624fb496..8608454f2 100644
--- a/libc/intrin/x86.h
+++ b/libc/intrin/x86.h
@@ -114,88 +114,20 @@ enum ProcessorFeatures {
   FEATURE_AVX512BITALG,
   FEATURE_AVX512BF16,
   FEATURE_AVX512VP2INTERSECT,
-  // FIXME: Below Features has some missings comparing to gcc, it's because gcc
-  // has some not one-to-one mapped in llvm.
-  // FEATURE_3DNOW,
-  // FEATURE_3DNOWP,
-  FEATURE_ADX = 40,
-  // FEATURE_ABM,
-  FEATURE_CLDEMOTE = 42,
-  FEATURE_CLFLUSHOPT,
-  FEATURE_CLWB,
-  FEATURE_CLZERO,
-  FEATURE_CMPXCHG16B,
-  // FIXME: Not adding FEATURE_CMPXCHG8B is a workaround to make 'generic' as
-  // a cpu string with no X86_FEATURE_COMPAT features, which is required in
-  // current implementantion of cpu_specific/cpu_dispatch FMV feature.
-  // FEATURE_CMPXCHG8B,
-  FEATURE_ENQCMD = 48,
-  FEATURE_F16C,
-  FEATURE_FSGSBASE,
-  // FEATURE_FXSAVE,
-  // FEATURE_HLE,
-  // FEATURE_IBT,
+
+  FEATURE_CMPXCHG16B = 46,
+  FEATURE_F16C = 49,
   FEATURE_LAHF_LM = 54,
   FEATURE_LM,
-  FEATURE_LWP,
+  FEATURE_WP,
   FEATURE_LZCNT,
   FEATURE_MOVBE,
-  FEATURE_MOVDIR64B,
-  FEATURE_MOVDIRI,
-  FEATURE_MWAITX,
-  // FEATURE_OSXSAVE,
-  FEATURE_PCONFIG = 63,
-  FEATURE_PKU,
-  FEATURE_PREFETCHWT1,
-  FEATURE_PRFCHW,
-  FEATURE_PTWRITE,
-  FEATURE_RDPID,
-  FEATURE_RDRND,
-  FEATURE_RDSEED,
-  FEATURE_RTM,
-  FEATURE_SERIALIZE,
-  FEATURE_SGX,
-  FEATURE_SHA,
-  FEATURE_SHSTK,
-  FEATURE_TBM,
-  FEATURE_TSXLDTRK,
-  FEATURE_VAES,
-  FEATURE_WAITPKG,
-  FEATURE_WBNOINVD,
-  FEATURE_XSAVE,
-  FEATURE_XSAVEC,
-  FEATURE_XSAVEOPT,
-  FEATURE_XSAVES,
-  FEATURE_AMX_TILE,
-  FEATURE_AMX_INT8,
-  FEATURE_AMX_BF16,
-  FEATURE_UINTR,
-  FEATURE_HRESET,
-  FEATURE_KL,
-  // FEATURE_AESKLE,
-  FEATURE_WIDEKL = 92,
-  FEATURE_AVXVNNI,
-  FEATURE_AVX512FP16,
+
+  FEATURE_AVX512FP16 = 94,
   FEATURE_X86_64_BASELINE,
   FEATURE_X86_64_V2,
   FEATURE_X86_64_V3,
   FEATURE_X86_64_V4,
-  FEATURE_AVXIFMA,
-  FEATURE_AVXVNNIINT8,
-  FEATURE_AVXNECONVERT,
-  FEATURE_CMPCCXADD,
-  FEATURE_AMX_FP16,
-  FEATURE_PREFETCHI,
-  FEATURE_RAOINT,
-  FEATURE_AMX_COMPLEX,
-  FEATURE_AVXVNNIINT16,
-  FEATURE_SM3,
-  FEATURE_SHA512,
-  FEATURE_SM4,
-  FEATURE_APXF,
-  FEATURE_USERMSR,
-  FEATURE_AVX10_1_256,
-  FEATURE_AVX10_1_512,
   CPU_FEATURE_MAX
 };
 
diff --git a/libc/irq/acpi-fadt-init.S b/libc/irq/acpi-fadt-init.S
index de1038be3..e58410cc3 100644
--- a/libc/irq/acpi-fadt-init.S
+++ b/libc/irq/acpi-fadt-init.S
@@ -26,7 +26,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/irq/acpi.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/pc.internal.h"
 
 	.init.start 312,_init_acpi_fadt
diff --git a/libc/irq/acpi-madt-init.S b/libc/irq/acpi-madt-init.S
index 6bfbddbf5..3a3b473d1 100644
--- a/libc/irq/acpi-madt-init.S
+++ b/libc/irq/acpi-madt-init.S
@@ -26,7 +26,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/irq/acpi.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/pc.internal.h"
 
 	.init.start 311,_init_acpi_madt
diff --git a/libc/irq/acpi-xsdt-init.S b/libc/irq/acpi-xsdt-init.S
index 98f7db6a3..2275dc8d3 100644
--- a/libc/irq/acpi-xsdt-init.S
+++ b/libc/irq/acpi-xsdt-init.S
@@ -26,7 +26,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/irq/acpi.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/pc.internal.h"
 
 	.init.start 310,_init_acpi_xsdt
diff --git a/libc/irq/acpi-xsdt.c b/libc/irq/acpi-xsdt.c
index 83b71ffd1..94dc50a82 100644
--- a/libc/irq/acpi-xsdt.c
+++ b/libc/irq/acpi-xsdt.c
@@ -30,7 +30,7 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/irq/acpi.internal.h"
 #include "libc/log/color.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/efi.h"
 #include "libc/runtime/pc.internal.h"
 #include "libc/serialize.h"
@@ -58,8 +58,9 @@ textstartup void *_AcpiOsMapUncachedMemory(uintptr_t phy, size_t n) {
 }
 
 textstartup static void *_AcpiOsAllocatePages(size_t n) {
-  void *addr = sys_mmap_metal(NULL, n, PROT_READ | PROT_WRITE,
-                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  struct DirectMap dm = sys_mmap_metal(NULL, n, PROT_READ | PROT_WRITE,
+                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  void *addr = dm.addr;
   if (addr == (void *)-1)
     addr = NULL;
   return addr;
diff --git a/libc/iso646.h b/libc/iso646.internal.h
similarity index 100%
rename from libc/iso646.h
rename to libc/iso646.internal.h
diff --git a/libc/isystem/__algorithm/adjacent_find.h b/libc/isystem/__algorithm/adjacent_find.h
new file mode 100644
index 000000000..5e1d2ad07
--- /dev/null
+++ b/libc/isystem/__algorithm/adjacent_find.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/adjacent_find.h"
diff --git a/libc/isystem/__algorithm/all_of.h b/libc/isystem/__algorithm/all_of.h
new file mode 100644
index 000000000..4652ac854
--- /dev/null
+++ b/libc/isystem/__algorithm/all_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/all_of.h"
diff --git a/libc/isystem/__algorithm/any_of.h b/libc/isystem/__algorithm/any_of.h
new file mode 100644
index 000000000..6f273d1d9
--- /dev/null
+++ b/libc/isystem/__algorithm/any_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/any_of.h"
diff --git a/libc/isystem/__algorithm/binary_search.h b/libc/isystem/__algorithm/binary_search.h
new file mode 100644
index 000000000..980063c6d
--- /dev/null
+++ b/libc/isystem/__algorithm/binary_search.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/binary_search.h"
diff --git a/libc/isystem/__algorithm/clamp.h b/libc/isystem/__algorithm/clamp.h
new file mode 100644
index 000000000..7b2774011
--- /dev/null
+++ b/libc/isystem/__algorithm/clamp.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/clamp.h"
diff --git a/libc/isystem/__algorithm/comp.h b/libc/isystem/__algorithm/comp.h
new file mode 100644
index 000000000..eefb8bf55
--- /dev/null
+++ b/libc/isystem/__algorithm/comp.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/comp.h"
diff --git a/libc/isystem/__algorithm/comp_ref_type.h b/libc/isystem/__algorithm/comp_ref_type.h
new file mode 100644
index 000000000..a99ce32cf
--- /dev/null
+++ b/libc/isystem/__algorithm/comp_ref_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/comp_ref_type.h"
diff --git a/libc/isystem/__algorithm/copy.h b/libc/isystem/__algorithm/copy.h
new file mode 100644
index 000000000..f0b135cda
--- /dev/null
+++ b/libc/isystem/__algorithm/copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/copy.h"
diff --git a/libc/isystem/__algorithm/copy_backward.h b/libc/isystem/__algorithm/copy_backward.h
new file mode 100644
index 000000000..f1f982802
--- /dev/null
+++ b/libc/isystem/__algorithm/copy_backward.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/copy_backward.h"
diff --git a/libc/isystem/__algorithm/copy_if.h b/libc/isystem/__algorithm/copy_if.h
new file mode 100644
index 000000000..78b1e991d
--- /dev/null
+++ b/libc/isystem/__algorithm/copy_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/copy_if.h"
diff --git a/libc/isystem/__algorithm/copy_move_common.h b/libc/isystem/__algorithm/copy_move_common.h
new file mode 100644
index 000000000..dbd46eca0
--- /dev/null
+++ b/libc/isystem/__algorithm/copy_move_common.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/copy_move_common.h"
diff --git a/libc/isystem/__algorithm/copy_n.h b/libc/isystem/__algorithm/copy_n.h
new file mode 100644
index 000000000..e1678d6b0
--- /dev/null
+++ b/libc/isystem/__algorithm/copy_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/copy_n.h"
diff --git a/libc/isystem/__algorithm/count.h b/libc/isystem/__algorithm/count.h
new file mode 100644
index 000000000..29503d08c
--- /dev/null
+++ b/libc/isystem/__algorithm/count.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/count.h"
diff --git a/libc/isystem/__algorithm/count_if.h b/libc/isystem/__algorithm/count_if.h
new file mode 100644
index 000000000..1e92f0c16
--- /dev/null
+++ b/libc/isystem/__algorithm/count_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/count_if.h"
diff --git a/libc/isystem/__algorithm/equal.h b/libc/isystem/__algorithm/equal.h
new file mode 100644
index 000000000..c59e36840
--- /dev/null
+++ b/libc/isystem/__algorithm/equal.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/equal.h"
diff --git a/libc/isystem/__algorithm/equal_range.h b/libc/isystem/__algorithm/equal_range.h
new file mode 100644
index 000000000..69b5941b3
--- /dev/null
+++ b/libc/isystem/__algorithm/equal_range.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/equal_range.h"
diff --git a/libc/isystem/__algorithm/fill.h b/libc/isystem/__algorithm/fill.h
new file mode 100644
index 000000000..b0e93bc45
--- /dev/null
+++ b/libc/isystem/__algorithm/fill.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/fill.h"
diff --git a/libc/isystem/__algorithm/fill_n.h b/libc/isystem/__algorithm/fill_n.h
new file mode 100644
index 000000000..d23b90a51
--- /dev/null
+++ b/libc/isystem/__algorithm/fill_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/fill_n.h"
diff --git a/libc/isystem/__algorithm/find.h b/libc/isystem/__algorithm/find.h
new file mode 100644
index 000000000..3a409b265
--- /dev/null
+++ b/libc/isystem/__algorithm/find.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/find.h"
diff --git a/libc/isystem/__algorithm/find_end.h b/libc/isystem/__algorithm/find_end.h
new file mode 100644
index 000000000..01b432a3d
--- /dev/null
+++ b/libc/isystem/__algorithm/find_end.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/find_end.h"
diff --git a/libc/isystem/__algorithm/find_first_of.h b/libc/isystem/__algorithm/find_first_of.h
new file mode 100644
index 000000000..389dfab08
--- /dev/null
+++ b/libc/isystem/__algorithm/find_first_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/find_first_of.h"
diff --git a/libc/isystem/__algorithm/find_if.h b/libc/isystem/__algorithm/find_if.h
new file mode 100644
index 000000000..617f7b0b6
--- /dev/null
+++ b/libc/isystem/__algorithm/find_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/find_if.h"
diff --git a/libc/isystem/__algorithm/find_if_not.h b/libc/isystem/__algorithm/find_if_not.h
new file mode 100644
index 000000000..62423636e
--- /dev/null
+++ b/libc/isystem/__algorithm/find_if_not.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/find_if_not.h"
diff --git a/libc/isystem/__algorithm/for_each.h b/libc/isystem/__algorithm/for_each.h
new file mode 100644
index 000000000..3dbb80023
--- /dev/null
+++ b/libc/isystem/__algorithm/for_each.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/for_each.h"
diff --git a/libc/isystem/__algorithm/for_each_n.h b/libc/isystem/__algorithm/for_each_n.h
new file mode 100644
index 000000000..f1d6bb9ca
--- /dev/null
+++ b/libc/isystem/__algorithm/for_each_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/for_each_n.h"
diff --git a/libc/isystem/__algorithm/for_each_segment.h b/libc/isystem/__algorithm/for_each_segment.h
new file mode 100644
index 000000000..a48155365
--- /dev/null
+++ b/libc/isystem/__algorithm/for_each_segment.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/for_each_segment.h"
diff --git a/libc/isystem/__algorithm/generate.h b/libc/isystem/__algorithm/generate.h
new file mode 100644
index 000000000..cdeb4f740
--- /dev/null
+++ b/libc/isystem/__algorithm/generate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/generate.h"
diff --git a/libc/isystem/__algorithm/generate_n.h b/libc/isystem/__algorithm/generate_n.h
new file mode 100644
index 000000000..bc63ac495
--- /dev/null
+++ b/libc/isystem/__algorithm/generate_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/generate_n.h"
diff --git a/libc/isystem/__algorithm/half_positive.h b/libc/isystem/__algorithm/half_positive.h
new file mode 100644
index 000000000..07dbde6d8
--- /dev/null
+++ b/libc/isystem/__algorithm/half_positive.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/half_positive.h"
diff --git a/libc/isystem/__algorithm/in_found_result.h b/libc/isystem/__algorithm/in_found_result.h
new file mode 100644
index 000000000..b1ba3443b
--- /dev/null
+++ b/libc/isystem/__algorithm/in_found_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/in_found_result.h"
diff --git a/libc/isystem/__algorithm/in_fun_result.h b/libc/isystem/__algorithm/in_fun_result.h
new file mode 100644
index 000000000..8ddaa0703
--- /dev/null
+++ b/libc/isystem/__algorithm/in_fun_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/in_fun_result.h"
diff --git a/libc/isystem/__algorithm/in_in_out_result.h b/libc/isystem/__algorithm/in_in_out_result.h
new file mode 100644
index 000000000..f60a94127
--- /dev/null
+++ b/libc/isystem/__algorithm/in_in_out_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/in_in_out_result.h"
diff --git a/libc/isystem/__algorithm/in_in_result.h b/libc/isystem/__algorithm/in_in_result.h
new file mode 100644
index 000000000..935c3d9f2
--- /dev/null
+++ b/libc/isystem/__algorithm/in_in_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/in_in_result.h"
diff --git a/libc/isystem/__algorithm/in_out_out_result.h b/libc/isystem/__algorithm/in_out_out_result.h
new file mode 100644
index 000000000..6aca65796
--- /dev/null
+++ b/libc/isystem/__algorithm/in_out_out_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/in_out_out_result.h"
diff --git a/libc/isystem/__algorithm/in_out_result.h b/libc/isystem/__algorithm/in_out_result.h
new file mode 100644
index 000000000..fae41433a
--- /dev/null
+++ b/libc/isystem/__algorithm/in_out_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/in_out_result.h"
diff --git a/libc/isystem/__algorithm/includes.h b/libc/isystem/__algorithm/includes.h
new file mode 100644
index 000000000..fe1dbcefc
--- /dev/null
+++ b/libc/isystem/__algorithm/includes.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/includes.h"
diff --git a/libc/isystem/__algorithm/inplace_merge.h b/libc/isystem/__algorithm/inplace_merge.h
new file mode 100644
index 000000000..3a52f879d
--- /dev/null
+++ b/libc/isystem/__algorithm/inplace_merge.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/inplace_merge.h"
diff --git a/libc/isystem/__algorithm/is_heap.h b/libc/isystem/__algorithm/is_heap.h
new file mode 100644
index 000000000..c8fb92ecb
--- /dev/null
+++ b/libc/isystem/__algorithm/is_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/is_heap.h"
diff --git a/libc/isystem/__algorithm/is_heap_until.h b/libc/isystem/__algorithm/is_heap_until.h
new file mode 100644
index 000000000..b10a5340a
--- /dev/null
+++ b/libc/isystem/__algorithm/is_heap_until.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/is_heap_until.h"
diff --git a/libc/isystem/__algorithm/is_partitioned.h b/libc/isystem/__algorithm/is_partitioned.h
new file mode 100644
index 000000000..c749a1ad6
--- /dev/null
+++ b/libc/isystem/__algorithm/is_partitioned.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/is_partitioned.h"
diff --git a/libc/isystem/__algorithm/is_permutation.h b/libc/isystem/__algorithm/is_permutation.h
new file mode 100644
index 000000000..04023edcf
--- /dev/null
+++ b/libc/isystem/__algorithm/is_permutation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/is_permutation.h"
diff --git a/libc/isystem/__algorithm/is_sorted.h b/libc/isystem/__algorithm/is_sorted.h
new file mode 100644
index 000000000..327f6f2f6
--- /dev/null
+++ b/libc/isystem/__algorithm/is_sorted.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/is_sorted.h"
diff --git a/libc/isystem/__algorithm/is_sorted_until.h b/libc/isystem/__algorithm/is_sorted_until.h
new file mode 100644
index 000000000..3aaa25a47
--- /dev/null
+++ b/libc/isystem/__algorithm/is_sorted_until.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/is_sorted_until.h"
diff --git a/libc/isystem/__algorithm/iter_swap.h b/libc/isystem/__algorithm/iter_swap.h
new file mode 100644
index 000000000..453fe0d58
--- /dev/null
+++ b/libc/isystem/__algorithm/iter_swap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/iter_swap.h"
diff --git a/libc/isystem/__algorithm/iterator_operations.h b/libc/isystem/__algorithm/iterator_operations.h
new file mode 100644
index 000000000..40fb82c6f
--- /dev/null
+++ b/libc/isystem/__algorithm/iterator_operations.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/iterator_operations.h"
diff --git a/libc/isystem/__algorithm/lexicographical_compare.h b/libc/isystem/__algorithm/lexicographical_compare.h
new file mode 100644
index 000000000..dfb6994dd
--- /dev/null
+++ b/libc/isystem/__algorithm/lexicographical_compare.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/lexicographical_compare.h"
diff --git a/libc/isystem/__algorithm/lexicographical_compare_three_way.h b/libc/isystem/__algorithm/lexicographical_compare_three_way.h
new file mode 100644
index 000000000..0ad819b94
--- /dev/null
+++ b/libc/isystem/__algorithm/lexicographical_compare_three_way.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/lexicographical_compare_three_way.h"
diff --git a/libc/isystem/__algorithm/lower_bound.h b/libc/isystem/__algorithm/lower_bound.h
new file mode 100644
index 000000000..94b2db647
--- /dev/null
+++ b/libc/isystem/__algorithm/lower_bound.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/lower_bound.h"
diff --git a/libc/isystem/__algorithm/make_heap.h b/libc/isystem/__algorithm/make_heap.h
new file mode 100644
index 000000000..bcf103889
--- /dev/null
+++ b/libc/isystem/__algorithm/make_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/make_heap.h"
diff --git a/libc/isystem/__algorithm/make_projected.h b/libc/isystem/__algorithm/make_projected.h
new file mode 100644
index 000000000..6d729c4b5
--- /dev/null
+++ b/libc/isystem/__algorithm/make_projected.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/make_projected.h"
diff --git a/libc/isystem/__algorithm/max.h b/libc/isystem/__algorithm/max.h
new file mode 100644
index 000000000..484addcad
--- /dev/null
+++ b/libc/isystem/__algorithm/max.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/max.h"
diff --git a/libc/isystem/__algorithm/max_element.h b/libc/isystem/__algorithm/max_element.h
new file mode 100644
index 000000000..cfb1f66c6
--- /dev/null
+++ b/libc/isystem/__algorithm/max_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/max_element.h"
diff --git a/libc/isystem/__algorithm/merge.h b/libc/isystem/__algorithm/merge.h
new file mode 100644
index 000000000..25dd21f3d
--- /dev/null
+++ b/libc/isystem/__algorithm/merge.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/merge.h"
diff --git a/libc/isystem/__algorithm/min.h b/libc/isystem/__algorithm/min.h
new file mode 100644
index 000000000..93a1be51a
--- /dev/null
+++ b/libc/isystem/__algorithm/min.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/min.h"
diff --git a/libc/isystem/__algorithm/min_element.h b/libc/isystem/__algorithm/min_element.h
new file mode 100644
index 000000000..e6745293d
--- /dev/null
+++ b/libc/isystem/__algorithm/min_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/min_element.h"
diff --git a/libc/isystem/__algorithm/min_max_result.h b/libc/isystem/__algorithm/min_max_result.h
new file mode 100644
index 000000000..f944c2265
--- /dev/null
+++ b/libc/isystem/__algorithm/min_max_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/min_max_result.h"
diff --git a/libc/isystem/__algorithm/minmax.h b/libc/isystem/__algorithm/minmax.h
new file mode 100644
index 000000000..17ef2ddf9
--- /dev/null
+++ b/libc/isystem/__algorithm/minmax.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/minmax.h"
diff --git a/libc/isystem/__algorithm/minmax_element.h b/libc/isystem/__algorithm/minmax_element.h
new file mode 100644
index 000000000..405b168a9
--- /dev/null
+++ b/libc/isystem/__algorithm/minmax_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/minmax_element.h"
diff --git a/libc/isystem/__algorithm/mismatch.h b/libc/isystem/__algorithm/mismatch.h
new file mode 100644
index 000000000..54b9c8896
--- /dev/null
+++ b/libc/isystem/__algorithm/mismatch.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/mismatch.h"
diff --git a/libc/isystem/__algorithm/move.h b/libc/isystem/__algorithm/move.h
new file mode 100644
index 000000000..1f7b306ba
--- /dev/null
+++ b/libc/isystem/__algorithm/move.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/move.h"
diff --git a/libc/isystem/__algorithm/move_backward.h b/libc/isystem/__algorithm/move_backward.h
new file mode 100644
index 000000000..430f91c33
--- /dev/null
+++ b/libc/isystem/__algorithm/move_backward.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/move_backward.h"
diff --git a/libc/isystem/__algorithm/next_permutation.h b/libc/isystem/__algorithm/next_permutation.h
new file mode 100644
index 000000000..fbbee7f89
--- /dev/null
+++ b/libc/isystem/__algorithm/next_permutation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/next_permutation.h"
diff --git a/libc/isystem/__algorithm/none_of.h b/libc/isystem/__algorithm/none_of.h
new file mode 100644
index 000000000..c708d865e
--- /dev/null
+++ b/libc/isystem/__algorithm/none_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/none_of.h"
diff --git a/libc/isystem/__algorithm/nth_element.h b/libc/isystem/__algorithm/nth_element.h
new file mode 100644
index 000000000..361b51b08
--- /dev/null
+++ b/libc/isystem/__algorithm/nth_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/nth_element.h"
diff --git a/libc/isystem/__algorithm/partial_sort.h b/libc/isystem/__algorithm/partial_sort.h
new file mode 100644
index 000000000..826252843
--- /dev/null
+++ b/libc/isystem/__algorithm/partial_sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/partial_sort.h"
diff --git a/libc/isystem/__algorithm/partial_sort_copy.h b/libc/isystem/__algorithm/partial_sort_copy.h
new file mode 100644
index 000000000..c3304c0c0
--- /dev/null
+++ b/libc/isystem/__algorithm/partial_sort_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/partial_sort_copy.h"
diff --git a/libc/isystem/__algorithm/partition.h b/libc/isystem/__algorithm/partition.h
new file mode 100644
index 000000000..d746009c2
--- /dev/null
+++ b/libc/isystem/__algorithm/partition.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/partition.h"
diff --git a/libc/isystem/__algorithm/partition_copy.h b/libc/isystem/__algorithm/partition_copy.h
new file mode 100644
index 000000000..c53139f01
--- /dev/null
+++ b/libc/isystem/__algorithm/partition_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/partition_copy.h"
diff --git a/libc/isystem/__algorithm/partition_point.h b/libc/isystem/__algorithm/partition_point.h
new file mode 100644
index 000000000..af904ffe3
--- /dev/null
+++ b/libc/isystem/__algorithm/partition_point.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/partition_point.h"
diff --git a/libc/isystem/__algorithm/pop_heap.h b/libc/isystem/__algorithm/pop_heap.h
new file mode 100644
index 000000000..29efc6977
--- /dev/null
+++ b/libc/isystem/__algorithm/pop_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pop_heap.h"
diff --git a/libc/isystem/__algorithm/prev_permutation.h b/libc/isystem/__algorithm/prev_permutation.h
new file mode 100644
index 000000000..d2b0e6729
--- /dev/null
+++ b/libc/isystem/__algorithm/prev_permutation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/prev_permutation.h"
diff --git a/libc/isystem/__algorithm/pstl_any_all_none_of.h b/libc/isystem/__algorithm/pstl_any_all_none_of.h
new file mode 100644
index 000000000..1383d20ea
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_any_all_none_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_any_all_none_of.h"
diff --git a/libc/isystem/__algorithm/pstl_backend.h b/libc/isystem/__algorithm/pstl_backend.h
new file mode 100644
index 000000000..f46ff3449
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backend.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backend.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backend.h b/libc/isystem/__algorithm/pstl_backends/cpu_backend.h
new file mode 100644
index 000000000..b85042f11
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backend.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backend.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/any_of.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/any_of.h
new file mode 100644
index 000000000..ee3e81079
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/any_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/any_of.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/backend.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/backend.h
new file mode 100644
index 000000000..84df5fe9f
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/backend.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/fill.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/fill.h
new file mode 100644
index 000000000..0f5b7b478
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/fill.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/find_if.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/find_if.h
new file mode 100644
index 000000000..7adf76049
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/find_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/find_if.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/for_each.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/for_each.h
new file mode 100644
index 000000000..aaa45c6c8
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/for_each.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/merge.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/merge.h
new file mode 100644
index 000000000..7676a5da3
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/merge.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/merge.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/serial.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/serial.h
new file mode 100644
index 000000000..4b25ed3b5
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/serial.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/serial.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/thread.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/thread.h
new file mode 100644
index 000000000..6487ec38c
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/thread.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/thread.h"
diff --git a/libc/isystem/__algorithm/pstl_backends/cpu_backends/transform.h b/libc/isystem/__algorithm/pstl_backends/cpu_backends/transform.h
new file mode 100644
index 000000000..1217711a0
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_backends/cpu_backends/transform.h"
diff --git a/libc/isystem/__algorithm/pstl_copy.h b/libc/isystem/__algorithm/pstl_copy.h
new file mode 100644
index 000000000..be0f6c2d3
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_copy.h"
diff --git a/libc/isystem/__algorithm/pstl_fill.h b/libc/isystem/__algorithm/pstl_fill.h
new file mode 100644
index 000000000..0740e0139
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_fill.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_fill.h"
diff --git a/libc/isystem/__algorithm/pstl_find.h b/libc/isystem/__algorithm/pstl_find.h
new file mode 100644
index 000000000..cbc557e5d
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_find.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_find.h"
diff --git a/libc/isystem/__algorithm/pstl_for_each.h b/libc/isystem/__algorithm/pstl_for_each.h
new file mode 100644
index 000000000..438931f61
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_for_each.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_for_each.h"
diff --git a/libc/isystem/__algorithm/pstl_frontend_dispatch.h b/libc/isystem/__algorithm/pstl_frontend_dispatch.h
new file mode 100644
index 000000000..a36a3e4af
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_frontend_dispatch.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_frontend_dispatch.h"
diff --git a/libc/isystem/__algorithm/pstl_merge.h b/libc/isystem/__algorithm/pstl_merge.h
new file mode 100644
index 000000000..0121cf6b7
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_merge.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_merge.h"
diff --git a/libc/isystem/__algorithm/pstl_transform.h b/libc/isystem/__algorithm/pstl_transform.h
new file mode 100644
index 000000000..d4b998947
--- /dev/null
+++ b/libc/isystem/__algorithm/pstl_transform.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/pstl_transform.h"
diff --git a/libc/isystem/__algorithm/push_heap.h b/libc/isystem/__algorithm/push_heap.h
new file mode 100644
index 000000000..c02a0d194
--- /dev/null
+++ b/libc/isystem/__algorithm/push_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/push_heap.h"
diff --git a/libc/isystem/__algorithm/ranges_adjacent_find.h b/libc/isystem/__algorithm/ranges_adjacent_find.h
new file mode 100644
index 000000000..1f2376204
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_adjacent_find.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_adjacent_find.h"
diff --git a/libc/isystem/__algorithm/ranges_all_of.h b/libc/isystem/__algorithm/ranges_all_of.h
new file mode 100644
index 000000000..2363a515f
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_all_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_all_of.h"
diff --git a/libc/isystem/__algorithm/ranges_any_of.h b/libc/isystem/__algorithm/ranges_any_of.h
new file mode 100644
index 000000000..b87580f0b
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_any_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_any_of.h"
diff --git a/libc/isystem/__algorithm/ranges_binary_search.h b/libc/isystem/__algorithm/ranges_binary_search.h
new file mode 100644
index 000000000..c6a2f3b12
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_binary_search.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_binary_search.h"
diff --git a/libc/isystem/__algorithm/ranges_clamp.h b/libc/isystem/__algorithm/ranges_clamp.h
new file mode 100644
index 000000000..00a415218
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_clamp.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_clamp.h"
diff --git a/libc/isystem/__algorithm/ranges_copy.h b/libc/isystem/__algorithm/ranges_copy.h
new file mode 100644
index 000000000..614e85eaf
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_copy_backward.h b/libc/isystem/__algorithm/ranges_copy_backward.h
new file mode 100644
index 000000000..3918baef3
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_copy_backward.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_copy_backward.h"
diff --git a/libc/isystem/__algorithm/ranges_copy_if.h b/libc/isystem/__algorithm/ranges_copy_if.h
new file mode 100644
index 000000000..d38f65586
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_copy_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_copy_if.h"
diff --git a/libc/isystem/__algorithm/ranges_copy_n.h b/libc/isystem/__algorithm/ranges_copy_n.h
new file mode 100644
index 000000000..e420c638c
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_copy_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_copy_n.h"
diff --git a/libc/isystem/__algorithm/ranges_count.h b/libc/isystem/__algorithm/ranges_count.h
new file mode 100644
index 000000000..2367c1f69
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_count.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_count.h"
diff --git a/libc/isystem/__algorithm/ranges_count_if.h b/libc/isystem/__algorithm/ranges_count_if.h
new file mode 100644
index 000000000..23f801f37
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_count_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_count_if.h"
diff --git a/libc/isystem/__algorithm/ranges_equal.h b/libc/isystem/__algorithm/ranges_equal.h
new file mode 100644
index 000000000..6d754d9dc
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_equal.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_equal.h"
diff --git a/libc/isystem/__algorithm/ranges_equal_range.h b/libc/isystem/__algorithm/ranges_equal_range.h
new file mode 100644
index 000000000..0bba88876
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_equal_range.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_equal_range.h"
diff --git a/libc/isystem/__algorithm/ranges_fill.h b/libc/isystem/__algorithm/ranges_fill.h
new file mode 100644
index 000000000..8a37d08a6
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_fill.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_fill.h"
diff --git a/libc/isystem/__algorithm/ranges_fill_n.h b/libc/isystem/__algorithm/ranges_fill_n.h
new file mode 100644
index 000000000..549344c61
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_fill_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_fill_n.h"
diff --git a/libc/isystem/__algorithm/ranges_find.h b/libc/isystem/__algorithm/ranges_find.h
new file mode 100644
index 000000000..bb84856c7
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_find.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_find.h"
diff --git a/libc/isystem/__algorithm/ranges_find_end.h b/libc/isystem/__algorithm/ranges_find_end.h
new file mode 100644
index 000000000..4ad4cecf6
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_find_end.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_find_end.h"
diff --git a/libc/isystem/__algorithm/ranges_find_first_of.h b/libc/isystem/__algorithm/ranges_find_first_of.h
new file mode 100644
index 000000000..aca73405f
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_find_first_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_find_first_of.h"
diff --git a/libc/isystem/__algorithm/ranges_find_if.h b/libc/isystem/__algorithm/ranges_find_if.h
new file mode 100644
index 000000000..2f36e6891
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_find_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_find_if.h"
diff --git a/libc/isystem/__algorithm/ranges_find_if_not.h b/libc/isystem/__algorithm/ranges_find_if_not.h
new file mode 100644
index 000000000..56af48b0c
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_find_if_not.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_find_if_not.h"
diff --git a/libc/isystem/__algorithm/ranges_for_each.h b/libc/isystem/__algorithm/ranges_for_each.h
new file mode 100644
index 000000000..df9984033
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_for_each.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_for_each.h"
diff --git a/libc/isystem/__algorithm/ranges_for_each_n.h b/libc/isystem/__algorithm/ranges_for_each_n.h
new file mode 100644
index 000000000..d14180101
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_for_each_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_for_each_n.h"
diff --git a/libc/isystem/__algorithm/ranges_generate.h b/libc/isystem/__algorithm/ranges_generate.h
new file mode 100644
index 000000000..835940bce
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_generate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_generate.h"
diff --git a/libc/isystem/__algorithm/ranges_generate_n.h b/libc/isystem/__algorithm/ranges_generate_n.h
new file mode 100644
index 000000000..87628794d
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_generate_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_generate_n.h"
diff --git a/libc/isystem/__algorithm/ranges_includes.h b/libc/isystem/__algorithm/ranges_includes.h
new file mode 100644
index 000000000..ae5763c6a
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_includes.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_includes.h"
diff --git a/libc/isystem/__algorithm/ranges_inplace_merge.h b/libc/isystem/__algorithm/ranges_inplace_merge.h
new file mode 100644
index 000000000..d93fc2f6a
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_inplace_merge.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_inplace_merge.h"
diff --git a/libc/isystem/__algorithm/ranges_is_heap.h b/libc/isystem/__algorithm/ranges_is_heap.h
new file mode 100644
index 000000000..2098e2926
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_is_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_is_heap.h"
diff --git a/libc/isystem/__algorithm/ranges_is_heap_until.h b/libc/isystem/__algorithm/ranges_is_heap_until.h
new file mode 100644
index 000000000..9211fed6a
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_is_heap_until.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_is_heap_until.h"
diff --git a/libc/isystem/__algorithm/ranges_is_partitioned.h b/libc/isystem/__algorithm/ranges_is_partitioned.h
new file mode 100644
index 000000000..d85d14103
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_is_partitioned.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_is_partitioned.h"
diff --git a/libc/isystem/__algorithm/ranges_is_permutation.h b/libc/isystem/__algorithm/ranges_is_permutation.h
new file mode 100644
index 000000000..16ca02f32
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_is_permutation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_is_permutation.h"
diff --git a/libc/isystem/__algorithm/ranges_is_sorted.h b/libc/isystem/__algorithm/ranges_is_sorted.h
new file mode 100644
index 000000000..5c284c3c9
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_is_sorted.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_is_sorted.h"
diff --git a/libc/isystem/__algorithm/ranges_is_sorted_until.h b/libc/isystem/__algorithm/ranges_is_sorted_until.h
new file mode 100644
index 000000000..0518f5cdf
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_is_sorted_until.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_is_sorted_until.h"
diff --git a/libc/isystem/__algorithm/ranges_iterator_concept.h b/libc/isystem/__algorithm/ranges_iterator_concept.h
new file mode 100644
index 000000000..45e6c1170
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_iterator_concept.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_iterator_concept.h"
diff --git a/libc/isystem/__algorithm/ranges_lexicographical_compare.h b/libc/isystem/__algorithm/ranges_lexicographical_compare.h
new file mode 100644
index 000000000..07f8d9f3b
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_lexicographical_compare.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_lexicographical_compare.h"
diff --git a/libc/isystem/__algorithm/ranges_lower_bound.h b/libc/isystem/__algorithm/ranges_lower_bound.h
new file mode 100644
index 000000000..2267d4ae4
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_lower_bound.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_lower_bound.h"
diff --git a/libc/isystem/__algorithm/ranges_make_heap.h b/libc/isystem/__algorithm/ranges_make_heap.h
new file mode 100644
index 000000000..b365948f4
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_make_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_make_heap.h"
diff --git a/libc/isystem/__algorithm/ranges_max.h b/libc/isystem/__algorithm/ranges_max.h
new file mode 100644
index 000000000..841f0c942
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_max.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_max.h"
diff --git a/libc/isystem/__algorithm/ranges_max_element.h b/libc/isystem/__algorithm/ranges_max_element.h
new file mode 100644
index 000000000..df3ac4b63
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_max_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_max_element.h"
diff --git a/libc/isystem/__algorithm/ranges_merge.h b/libc/isystem/__algorithm/ranges_merge.h
new file mode 100644
index 000000000..79595cd16
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_merge.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_merge.h"
diff --git a/libc/isystem/__algorithm/ranges_min.h b/libc/isystem/__algorithm/ranges_min.h
new file mode 100644
index 000000000..cdee59d2a
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_min.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_min.h"
diff --git a/libc/isystem/__algorithm/ranges_min_element.h b/libc/isystem/__algorithm/ranges_min_element.h
new file mode 100644
index 000000000..9d910deac
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_min_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_min_element.h"
diff --git a/libc/isystem/__algorithm/ranges_minmax.h b/libc/isystem/__algorithm/ranges_minmax.h
new file mode 100644
index 000000000..dde2e06a2
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_minmax.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_minmax.h"
diff --git a/libc/isystem/__algorithm/ranges_minmax_element.h b/libc/isystem/__algorithm/ranges_minmax_element.h
new file mode 100644
index 000000000..6e4492f16
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_minmax_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_minmax_element.h"
diff --git a/libc/isystem/__algorithm/ranges_mismatch.h b/libc/isystem/__algorithm/ranges_mismatch.h
new file mode 100644
index 000000000..792d15396
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_mismatch.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_mismatch.h"
diff --git a/libc/isystem/__algorithm/ranges_move.h b/libc/isystem/__algorithm/ranges_move.h
new file mode 100644
index 000000000..31679c803
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_move.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_move.h"
diff --git a/libc/isystem/__algorithm/ranges_move_backward.h b/libc/isystem/__algorithm/ranges_move_backward.h
new file mode 100644
index 000000000..bc818a177
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_move_backward.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_move_backward.h"
diff --git a/libc/isystem/__algorithm/ranges_next_permutation.h b/libc/isystem/__algorithm/ranges_next_permutation.h
new file mode 100644
index 000000000..6ad640a97
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_next_permutation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_next_permutation.h"
diff --git a/libc/isystem/__algorithm/ranges_none_of.h b/libc/isystem/__algorithm/ranges_none_of.h
new file mode 100644
index 000000000..1c646d68d
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_none_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_none_of.h"
diff --git a/libc/isystem/__algorithm/ranges_nth_element.h b/libc/isystem/__algorithm/ranges_nth_element.h
new file mode 100644
index 000000000..e1bf4c096
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_nth_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_nth_element.h"
diff --git a/libc/isystem/__algorithm/ranges_partial_sort.h b/libc/isystem/__algorithm/ranges_partial_sort.h
new file mode 100644
index 000000000..31ef088ce
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_partial_sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_partial_sort.h"
diff --git a/libc/isystem/__algorithm/ranges_partial_sort_copy.h b/libc/isystem/__algorithm/ranges_partial_sort_copy.h
new file mode 100644
index 000000000..a77684b58
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_partial_sort_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_partial_sort_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_partition.h b/libc/isystem/__algorithm/ranges_partition.h
new file mode 100644
index 000000000..5066131c7
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_partition.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_partition.h"
diff --git a/libc/isystem/__algorithm/ranges_partition_copy.h b/libc/isystem/__algorithm/ranges_partition_copy.h
new file mode 100644
index 000000000..f11101304
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_partition_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_partition_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_partition_point.h b/libc/isystem/__algorithm/ranges_partition_point.h
new file mode 100644
index 000000000..44c05b726
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_partition_point.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_partition_point.h"
diff --git a/libc/isystem/__algorithm/ranges_pop_heap.h b/libc/isystem/__algorithm/ranges_pop_heap.h
new file mode 100644
index 000000000..a39869c2b
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_pop_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_pop_heap.h"
diff --git a/libc/isystem/__algorithm/ranges_prev_permutation.h b/libc/isystem/__algorithm/ranges_prev_permutation.h
new file mode 100644
index 000000000..fe51f73d9
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_prev_permutation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_prev_permutation.h"
diff --git a/libc/isystem/__algorithm/ranges_push_heap.h b/libc/isystem/__algorithm/ranges_push_heap.h
new file mode 100644
index 000000000..4b42e1a96
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_push_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_push_heap.h"
diff --git a/libc/isystem/__algorithm/ranges_remove.h b/libc/isystem/__algorithm/ranges_remove.h
new file mode 100644
index 000000000..2daa6b198
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_remove.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_remove.h"
diff --git a/libc/isystem/__algorithm/ranges_remove_copy.h b/libc/isystem/__algorithm/ranges_remove_copy.h
new file mode 100644
index 000000000..56dfa76c8
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_remove_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_remove_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_remove_copy_if.h b/libc/isystem/__algorithm/ranges_remove_copy_if.h
new file mode 100644
index 000000000..50b979301
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_remove_copy_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_remove_copy_if.h"
diff --git a/libc/isystem/__algorithm/ranges_remove_if.h b/libc/isystem/__algorithm/ranges_remove_if.h
new file mode 100644
index 000000000..aa6d62fdb
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_remove_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_remove_if.h"
diff --git a/libc/isystem/__algorithm/ranges_replace.h b/libc/isystem/__algorithm/ranges_replace.h
new file mode 100644
index 000000000..c768b9880
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_replace.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_replace.h"
diff --git a/libc/isystem/__algorithm/ranges_replace_copy.h b/libc/isystem/__algorithm/ranges_replace_copy.h
new file mode 100644
index 000000000..d74d03164
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_replace_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_replace_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_replace_copy_if.h b/libc/isystem/__algorithm/ranges_replace_copy_if.h
new file mode 100644
index 000000000..52b6e66a5
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_replace_copy_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_replace_copy_if.h"
diff --git a/libc/isystem/__algorithm/ranges_replace_if.h b/libc/isystem/__algorithm/ranges_replace_if.h
new file mode 100644
index 000000000..7ba904e37
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_replace_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_replace_if.h"
diff --git a/libc/isystem/__algorithm/ranges_reverse.h b/libc/isystem/__algorithm/ranges_reverse.h
new file mode 100644
index 000000000..1d6511164
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_reverse.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_reverse.h"
diff --git a/libc/isystem/__algorithm/ranges_reverse_copy.h b/libc/isystem/__algorithm/ranges_reverse_copy.h
new file mode 100644
index 000000000..33326159a
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_reverse_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_reverse_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_rotate.h b/libc/isystem/__algorithm/ranges_rotate.h
new file mode 100644
index 000000000..1940bbc41
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_rotate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_rotate.h"
diff --git a/libc/isystem/__algorithm/ranges_rotate_copy.h b/libc/isystem/__algorithm/ranges_rotate_copy.h
new file mode 100644
index 000000000..43d13c4a9
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_rotate_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_rotate_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_sample.h b/libc/isystem/__algorithm/ranges_sample.h
new file mode 100644
index 000000000..52f55a29d
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_sample.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_sample.h"
diff --git a/libc/isystem/__algorithm/ranges_search.h b/libc/isystem/__algorithm/ranges_search.h
new file mode 100644
index 000000000..acfc90c94
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_search.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_search.h"
diff --git a/libc/isystem/__algorithm/ranges_search_n.h b/libc/isystem/__algorithm/ranges_search_n.h
new file mode 100644
index 000000000..12056e956
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_search_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_search_n.h"
diff --git a/libc/isystem/__algorithm/ranges_set_difference.h b/libc/isystem/__algorithm/ranges_set_difference.h
new file mode 100644
index 000000000..b4705f503
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_set_difference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_set_difference.h"
diff --git a/libc/isystem/__algorithm/ranges_set_intersection.h b/libc/isystem/__algorithm/ranges_set_intersection.h
new file mode 100644
index 000000000..592bf86a2
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_set_intersection.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_set_intersection.h"
diff --git a/libc/isystem/__algorithm/ranges_set_symmetric_difference.h b/libc/isystem/__algorithm/ranges_set_symmetric_difference.h
new file mode 100644
index 000000000..d0a8d3e4d
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_set_symmetric_difference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_set_symmetric_difference.h"
diff --git a/libc/isystem/__algorithm/ranges_set_union.h b/libc/isystem/__algorithm/ranges_set_union.h
new file mode 100644
index 000000000..be428aa5c
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_set_union.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_set_union.h"
diff --git a/libc/isystem/__algorithm/ranges_shuffle.h b/libc/isystem/__algorithm/ranges_shuffle.h
new file mode 100644
index 000000000..39554ef60
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_shuffle.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_shuffle.h"
diff --git a/libc/isystem/__algorithm/ranges_sort.h b/libc/isystem/__algorithm/ranges_sort.h
new file mode 100644
index 000000000..049c5288f
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_sort.h"
diff --git a/libc/isystem/__algorithm/ranges_sort_heap.h b/libc/isystem/__algorithm/ranges_sort_heap.h
new file mode 100644
index 000000000..88c46cbe5
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_sort_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_sort_heap.h"
diff --git a/libc/isystem/__algorithm/ranges_stable_partition.h b/libc/isystem/__algorithm/ranges_stable_partition.h
new file mode 100644
index 000000000..d82565db0
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_stable_partition.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_stable_partition.h"
diff --git a/libc/isystem/__algorithm/ranges_stable_sort.h b/libc/isystem/__algorithm/ranges_stable_sort.h
new file mode 100644
index 000000000..53b3bfd7e
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_stable_sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_stable_sort.h"
diff --git a/libc/isystem/__algorithm/ranges_starts_with.h b/libc/isystem/__algorithm/ranges_starts_with.h
new file mode 100644
index 000000000..9a2a22b8f
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_starts_with.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_starts_with.h"
diff --git a/libc/isystem/__algorithm/ranges_swap_ranges.h b/libc/isystem/__algorithm/ranges_swap_ranges.h
new file mode 100644
index 000000000..7ee898342
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_swap_ranges.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_swap_ranges.h"
diff --git a/libc/isystem/__algorithm/ranges_transform.h b/libc/isystem/__algorithm/ranges_transform.h
new file mode 100644
index 000000000..8dc2bd3a7
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_transform.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_transform.h"
diff --git a/libc/isystem/__algorithm/ranges_unique.h b/libc/isystem/__algorithm/ranges_unique.h
new file mode 100644
index 000000000..741c22c41
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_unique.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_unique.h"
diff --git a/libc/isystem/__algorithm/ranges_unique_copy.h b/libc/isystem/__algorithm/ranges_unique_copy.h
new file mode 100644
index 000000000..fa3ead46b
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_unique_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_unique_copy.h"
diff --git a/libc/isystem/__algorithm/ranges_upper_bound.h b/libc/isystem/__algorithm/ranges_upper_bound.h
new file mode 100644
index 000000000..658bb096b
--- /dev/null
+++ b/libc/isystem/__algorithm/ranges_upper_bound.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/ranges_upper_bound.h"
diff --git a/libc/isystem/__algorithm/remove.h b/libc/isystem/__algorithm/remove.h
new file mode 100644
index 000000000..1c6d33eb9
--- /dev/null
+++ b/libc/isystem/__algorithm/remove.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/remove.h"
diff --git a/libc/isystem/__algorithm/remove_copy.h b/libc/isystem/__algorithm/remove_copy.h
new file mode 100644
index 000000000..cc8f5347d
--- /dev/null
+++ b/libc/isystem/__algorithm/remove_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/remove_copy.h"
diff --git a/libc/isystem/__algorithm/remove_copy_if.h b/libc/isystem/__algorithm/remove_copy_if.h
new file mode 100644
index 000000000..fdf763b14
--- /dev/null
+++ b/libc/isystem/__algorithm/remove_copy_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/remove_copy_if.h"
diff --git a/libc/isystem/__algorithm/remove_if.h b/libc/isystem/__algorithm/remove_if.h
new file mode 100644
index 000000000..e9dc12444
--- /dev/null
+++ b/libc/isystem/__algorithm/remove_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/remove_if.h"
diff --git a/libc/isystem/__algorithm/replace.h b/libc/isystem/__algorithm/replace.h
new file mode 100644
index 000000000..29b8205d4
--- /dev/null
+++ b/libc/isystem/__algorithm/replace.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/replace.h"
diff --git a/libc/isystem/__algorithm/replace_copy.h b/libc/isystem/__algorithm/replace_copy.h
new file mode 100644
index 000000000..6b05b8502
--- /dev/null
+++ b/libc/isystem/__algorithm/replace_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/replace_copy.h"
diff --git a/libc/isystem/__algorithm/replace_copy_if.h b/libc/isystem/__algorithm/replace_copy_if.h
new file mode 100644
index 000000000..13ee607b1
--- /dev/null
+++ b/libc/isystem/__algorithm/replace_copy_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/replace_copy_if.h"
diff --git a/libc/isystem/__algorithm/replace_if.h b/libc/isystem/__algorithm/replace_if.h
new file mode 100644
index 000000000..42ac7b810
--- /dev/null
+++ b/libc/isystem/__algorithm/replace_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/replace_if.h"
diff --git a/libc/isystem/__algorithm/reverse.h b/libc/isystem/__algorithm/reverse.h
new file mode 100644
index 000000000..76a9096f7
--- /dev/null
+++ b/libc/isystem/__algorithm/reverse.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/reverse.h"
diff --git a/libc/isystem/__algorithm/reverse_copy.h b/libc/isystem/__algorithm/reverse_copy.h
new file mode 100644
index 000000000..a0b6debef
--- /dev/null
+++ b/libc/isystem/__algorithm/reverse_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/reverse_copy.h"
diff --git a/libc/isystem/__algorithm/rotate.h b/libc/isystem/__algorithm/rotate.h
new file mode 100644
index 000000000..84afe3294
--- /dev/null
+++ b/libc/isystem/__algorithm/rotate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/rotate.h"
diff --git a/libc/isystem/__algorithm/rotate_copy.h b/libc/isystem/__algorithm/rotate_copy.h
new file mode 100644
index 000000000..d0ad68ce0
--- /dev/null
+++ b/libc/isystem/__algorithm/rotate_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/rotate_copy.h"
diff --git a/libc/isystem/__algorithm/sample.h b/libc/isystem/__algorithm/sample.h
new file mode 100644
index 000000000..540d05e44
--- /dev/null
+++ b/libc/isystem/__algorithm/sample.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/sample.h"
diff --git a/libc/isystem/__algorithm/search.h b/libc/isystem/__algorithm/search.h
new file mode 100644
index 000000000..ba33010f4
--- /dev/null
+++ b/libc/isystem/__algorithm/search.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/search.h"
diff --git a/libc/isystem/__algorithm/search_n.h b/libc/isystem/__algorithm/search_n.h
new file mode 100644
index 000000000..7e4c2e984
--- /dev/null
+++ b/libc/isystem/__algorithm/search_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/search_n.h"
diff --git a/libc/isystem/__algorithm/set_difference.h b/libc/isystem/__algorithm/set_difference.h
new file mode 100644
index 000000000..1c171eabb
--- /dev/null
+++ b/libc/isystem/__algorithm/set_difference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/set_difference.h"
diff --git a/libc/isystem/__algorithm/set_intersection.h b/libc/isystem/__algorithm/set_intersection.h
new file mode 100644
index 000000000..f04fd24a8
--- /dev/null
+++ b/libc/isystem/__algorithm/set_intersection.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/set_intersection.h"
diff --git a/libc/isystem/__algorithm/set_symmetric_difference.h b/libc/isystem/__algorithm/set_symmetric_difference.h
new file mode 100644
index 000000000..8b00ca162
--- /dev/null
+++ b/libc/isystem/__algorithm/set_symmetric_difference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/set_symmetric_difference.h"
diff --git a/libc/isystem/__algorithm/set_union.h b/libc/isystem/__algorithm/set_union.h
new file mode 100644
index 000000000..0d6a276bc
--- /dev/null
+++ b/libc/isystem/__algorithm/set_union.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/set_union.h"
diff --git a/libc/isystem/__algorithm/shift_left.h b/libc/isystem/__algorithm/shift_left.h
new file mode 100644
index 000000000..775fa8a60
--- /dev/null
+++ b/libc/isystem/__algorithm/shift_left.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/shift_left.h"
diff --git a/libc/isystem/__algorithm/shift_right.h b/libc/isystem/__algorithm/shift_right.h
new file mode 100644
index 000000000..ab37a39b8
--- /dev/null
+++ b/libc/isystem/__algorithm/shift_right.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/shift_right.h"
diff --git a/libc/isystem/__algorithm/shuffle.h b/libc/isystem/__algorithm/shuffle.h
new file mode 100644
index 000000000..68818703b
--- /dev/null
+++ b/libc/isystem/__algorithm/shuffle.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/shuffle.h"
diff --git a/libc/isystem/__algorithm/sift_down.h b/libc/isystem/__algorithm/sift_down.h
new file mode 100644
index 000000000..385e7ddf9
--- /dev/null
+++ b/libc/isystem/__algorithm/sift_down.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/sift_down.h"
diff --git a/libc/isystem/__algorithm/sort.h b/libc/isystem/__algorithm/sort.h
new file mode 100644
index 000000000..9317e69a3
--- /dev/null
+++ b/libc/isystem/__algorithm/sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/sort.h"
diff --git a/libc/isystem/__algorithm/sort_heap.h b/libc/isystem/__algorithm/sort_heap.h
new file mode 100644
index 000000000..9114e3b1d
--- /dev/null
+++ b/libc/isystem/__algorithm/sort_heap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/sort_heap.h"
diff --git a/libc/isystem/__algorithm/stable_partition.h b/libc/isystem/__algorithm/stable_partition.h
new file mode 100644
index 000000000..68df678f4
--- /dev/null
+++ b/libc/isystem/__algorithm/stable_partition.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/stable_partition.h"
diff --git a/libc/isystem/__algorithm/stable_sort.h b/libc/isystem/__algorithm/stable_sort.h
new file mode 100644
index 000000000..0a4f0ab4f
--- /dev/null
+++ b/libc/isystem/__algorithm/stable_sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/stable_sort.h"
diff --git a/libc/isystem/__algorithm/swap_ranges.h b/libc/isystem/__algorithm/swap_ranges.h
new file mode 100644
index 000000000..2d10b25fa
--- /dev/null
+++ b/libc/isystem/__algorithm/swap_ranges.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/swap_ranges.h"
diff --git a/libc/isystem/__algorithm/three_way_comp_ref_type.h b/libc/isystem/__algorithm/three_way_comp_ref_type.h
new file mode 100644
index 000000000..e504e19c6
--- /dev/null
+++ b/libc/isystem/__algorithm/three_way_comp_ref_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/three_way_comp_ref_type.h"
diff --git a/libc/isystem/__algorithm/transform.h b/libc/isystem/__algorithm/transform.h
new file mode 100644
index 000000000..60614cb88
--- /dev/null
+++ b/libc/isystem/__algorithm/transform.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/transform.h"
diff --git a/libc/isystem/__algorithm/uniform_random_bit_generator_adaptor.h b/libc/isystem/__algorithm/uniform_random_bit_generator_adaptor.h
new file mode 100644
index 000000000..f3ca28284
--- /dev/null
+++ b/libc/isystem/__algorithm/uniform_random_bit_generator_adaptor.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/uniform_random_bit_generator_adaptor.h"
diff --git a/libc/isystem/__algorithm/unique.h b/libc/isystem/__algorithm/unique.h
new file mode 100644
index 000000000..a1ef228cf
--- /dev/null
+++ b/libc/isystem/__algorithm/unique.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/unique.h"
diff --git a/libc/isystem/__algorithm/unique_copy.h b/libc/isystem/__algorithm/unique_copy.h
new file mode 100644
index 000000000..3f8ddeb61
--- /dev/null
+++ b/libc/isystem/__algorithm/unique_copy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/unique_copy.h"
diff --git a/libc/isystem/__algorithm/unwrap_iter.h b/libc/isystem/__algorithm/unwrap_iter.h
new file mode 100644
index 000000000..277288f9a
--- /dev/null
+++ b/libc/isystem/__algorithm/unwrap_iter.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/unwrap_iter.h"
diff --git a/libc/isystem/__algorithm/unwrap_range.h b/libc/isystem/__algorithm/unwrap_range.h
new file mode 100644
index 000000000..9cb43bd44
--- /dev/null
+++ b/libc/isystem/__algorithm/unwrap_range.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/unwrap_range.h"
diff --git a/libc/isystem/__algorithm/upper_bound.h b/libc/isystem/__algorithm/upper_bound.h
new file mode 100644
index 000000000..c694ac61d
--- /dev/null
+++ b/libc/isystem/__algorithm/upper_bound.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__algorithm/upper_bound.h"
diff --git a/libc/isystem/__assert b/libc/isystem/__assert
new file mode 100644
index 000000000..29532acc8
--- /dev/null
+++ b/libc/isystem/__assert
@@ -0,0 +1 @@
+#include "third_party/libcxx/__assert"
diff --git a/libc/isystem/__atomic/aliases.h b/libc/isystem/__atomic/aliases.h
new file mode 100644
index 000000000..6849bcd1b
--- /dev/null
+++ b/libc/isystem/__atomic/aliases.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/aliases.h"
diff --git a/libc/isystem/__atomic/atomic.h b/libc/isystem/__atomic/atomic.h
new file mode 100644
index 000000000..b2324671e
--- /dev/null
+++ b/libc/isystem/__atomic/atomic.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/atomic.h"
diff --git a/libc/isystem/__atomic/atomic_base.h b/libc/isystem/__atomic/atomic_base.h
new file mode 100644
index 000000000..ec1733d30
--- /dev/null
+++ b/libc/isystem/__atomic/atomic_base.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/atomic_base.h"
diff --git a/libc/isystem/__atomic/atomic_flag.h b/libc/isystem/__atomic/atomic_flag.h
new file mode 100644
index 000000000..70e1268a5
--- /dev/null
+++ b/libc/isystem/__atomic/atomic_flag.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/atomic_flag.h"
diff --git a/libc/isystem/__atomic/atomic_init.h b/libc/isystem/__atomic/atomic_init.h
new file mode 100644
index 000000000..c04e99f38
--- /dev/null
+++ b/libc/isystem/__atomic/atomic_init.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/atomic_init.h"
diff --git a/libc/isystem/__atomic/atomic_lock_free.h b/libc/isystem/__atomic/atomic_lock_free.h
new file mode 100644
index 000000000..3a086be7f
--- /dev/null
+++ b/libc/isystem/__atomic/atomic_lock_free.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/atomic_lock_free.h"
diff --git a/libc/isystem/__atomic/atomic_sync.h b/libc/isystem/__atomic/atomic_sync.h
new file mode 100644
index 000000000..8fb086325
--- /dev/null
+++ b/libc/isystem/__atomic/atomic_sync.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/atomic_sync.h"
diff --git a/libc/isystem/__atomic/check_memory_order.h b/libc/isystem/__atomic/check_memory_order.h
new file mode 100644
index 000000000..71de721f7
--- /dev/null
+++ b/libc/isystem/__atomic/check_memory_order.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/check_memory_order.h"
diff --git a/libc/isystem/__atomic/contention_t.h b/libc/isystem/__atomic/contention_t.h
new file mode 100644
index 000000000..64c180391
--- /dev/null
+++ b/libc/isystem/__atomic/contention_t.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/contention_t.h"
diff --git a/libc/isystem/__atomic/cxx_atomic_impl.h b/libc/isystem/__atomic/cxx_atomic_impl.h
new file mode 100644
index 000000000..78aabe632
--- /dev/null
+++ b/libc/isystem/__atomic/cxx_atomic_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/cxx_atomic_impl.h"
diff --git a/libc/isystem/__atomic/fence.h b/libc/isystem/__atomic/fence.h
new file mode 100644
index 000000000..183ace12e
--- /dev/null
+++ b/libc/isystem/__atomic/fence.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/fence.h"
diff --git a/libc/isystem/__atomic/is_always_lock_free.h b/libc/isystem/__atomic/is_always_lock_free.h
new file mode 100644
index 000000000..9b6374f35
--- /dev/null
+++ b/libc/isystem/__atomic/is_always_lock_free.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/is_always_lock_free.h"
diff --git a/libc/isystem/__atomic/kill_dependency.h b/libc/isystem/__atomic/kill_dependency.h
new file mode 100644
index 000000000..8b89ed54e
--- /dev/null
+++ b/libc/isystem/__atomic/kill_dependency.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/kill_dependency.h"
diff --git a/libc/isystem/__atomic/memory_order.h b/libc/isystem/__atomic/memory_order.h
new file mode 100644
index 000000000..d29e57fa5
--- /dev/null
+++ b/libc/isystem/__atomic/memory_order.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__atomic/memory_order.h"
diff --git a/libc/isystem/__availability b/libc/isystem/__availability
new file mode 100644
index 000000000..479d7b7ac
--- /dev/null
+++ b/libc/isystem/__availability
@@ -0,0 +1 @@
+#include "third_party/libcxx/__availability"
diff --git a/libc/isystem/__bit/bit_cast.h b/libc/isystem/__bit/bit_cast.h
new file mode 100644
index 000000000..44318231a
--- /dev/null
+++ b/libc/isystem/__bit/bit_cast.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/bit_cast.h"
diff --git a/libc/isystem/__bit/bit_ceil.h b/libc/isystem/__bit/bit_ceil.h
new file mode 100644
index 000000000..2626419a0
--- /dev/null
+++ b/libc/isystem/__bit/bit_ceil.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/bit_ceil.h"
diff --git a/libc/isystem/__bit/bit_floor.h b/libc/isystem/__bit/bit_floor.h
new file mode 100644
index 000000000..bff96affd
--- /dev/null
+++ b/libc/isystem/__bit/bit_floor.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/bit_floor.h"
diff --git a/libc/isystem/__bit/bit_log2.h b/libc/isystem/__bit/bit_log2.h
new file mode 100644
index 000000000..6cd2cf734
--- /dev/null
+++ b/libc/isystem/__bit/bit_log2.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/bit_log2.h"
diff --git a/libc/isystem/__bit/bit_width.h b/libc/isystem/__bit/bit_width.h
new file mode 100644
index 000000000..ca0ffe6e7
--- /dev/null
+++ b/libc/isystem/__bit/bit_width.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/bit_width.h"
diff --git a/libc/isystem/__bit/blsr.h b/libc/isystem/__bit/blsr.h
new file mode 100644
index 000000000..785569624
--- /dev/null
+++ b/libc/isystem/__bit/blsr.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/blsr.h"
diff --git a/libc/isystem/__bit/byteswap.h b/libc/isystem/__bit/byteswap.h
new file mode 100644
index 000000000..698990aca
--- /dev/null
+++ b/libc/isystem/__bit/byteswap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/byteswap.h"
diff --git a/libc/isystem/__bit/countl.h b/libc/isystem/__bit/countl.h
new file mode 100644
index 000000000..a1bb62153
--- /dev/null
+++ b/libc/isystem/__bit/countl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/countl.h"
diff --git a/libc/isystem/__bit/countr.h b/libc/isystem/__bit/countr.h
new file mode 100644
index 000000000..ff1b0056b
--- /dev/null
+++ b/libc/isystem/__bit/countr.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/countr.h"
diff --git a/libc/isystem/__bit/endian.h b/libc/isystem/__bit/endian.h
new file mode 100644
index 000000000..ab076f21a
--- /dev/null
+++ b/libc/isystem/__bit/endian.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/endian.h"
diff --git a/libc/isystem/__bit/has_single_bit.h b/libc/isystem/__bit/has_single_bit.h
new file mode 100644
index 000000000..9a3f4bc10
--- /dev/null
+++ b/libc/isystem/__bit/has_single_bit.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/has_single_bit.h"
diff --git a/libc/isystem/__bit/popcount.h b/libc/isystem/__bit/popcount.h
new file mode 100644
index 000000000..b12a77a70
--- /dev/null
+++ b/libc/isystem/__bit/popcount.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/popcount.h"
diff --git a/libc/isystem/__bit/rotate.h b/libc/isystem/__bit/rotate.h
new file mode 100644
index 000000000..20f0ec244
--- /dev/null
+++ b/libc/isystem/__bit/rotate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit/rotate.h"
diff --git a/libc/isystem/__bit_reference b/libc/isystem/__bit_reference
new file mode 100644
index 000000000..89438011c
--- /dev/null
+++ b/libc/isystem/__bit_reference
@@ -0,0 +1 @@
+#include "third_party/libcxx/__bit_reference"
diff --git a/libc/isystem/__charconv/chars_format.h b/libc/isystem/__charconv/chars_format.h
new file mode 100644
index 000000000..7e15b94c4
--- /dev/null
+++ b/libc/isystem/__charconv/chars_format.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/chars_format.h"
diff --git a/libc/isystem/__charconv/from_chars_integral.h b/libc/isystem/__charconv/from_chars_integral.h
new file mode 100644
index 000000000..13af75de0
--- /dev/null
+++ b/libc/isystem/__charconv/from_chars_integral.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/from_chars_integral.h"
diff --git a/libc/isystem/__charconv/from_chars_result.h b/libc/isystem/__charconv/from_chars_result.h
new file mode 100644
index 000000000..c3b6d41ce
--- /dev/null
+++ b/libc/isystem/__charconv/from_chars_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/from_chars_result.h"
diff --git a/libc/isystem/__charconv/tables.h b/libc/isystem/__charconv/tables.h
new file mode 100644
index 000000000..4abd3d125
--- /dev/null
+++ b/libc/isystem/__charconv/tables.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/tables.h"
diff --git a/libc/isystem/__charconv/to_chars.h b/libc/isystem/__charconv/to_chars.h
new file mode 100644
index 000000000..ee2ee1db2
--- /dev/null
+++ b/libc/isystem/__charconv/to_chars.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/to_chars.h"
diff --git a/libc/isystem/__charconv/to_chars_base_10.h b/libc/isystem/__charconv/to_chars_base_10.h
new file mode 100644
index 000000000..850ffbf1f
--- /dev/null
+++ b/libc/isystem/__charconv/to_chars_base_10.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/to_chars_base_10.h"
diff --git a/libc/isystem/__charconv/to_chars_floating_point.h b/libc/isystem/__charconv/to_chars_floating_point.h
new file mode 100644
index 000000000..b88f5777b
--- /dev/null
+++ b/libc/isystem/__charconv/to_chars_floating_point.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/to_chars_floating_point.h"
diff --git a/libc/isystem/__charconv/to_chars_integral.h b/libc/isystem/__charconv/to_chars_integral.h
new file mode 100644
index 000000000..0180d2f2f
--- /dev/null
+++ b/libc/isystem/__charconv/to_chars_integral.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/to_chars_integral.h"
diff --git a/libc/isystem/__charconv/to_chars_result.h b/libc/isystem/__charconv/to_chars_result.h
new file mode 100644
index 000000000..c6256c3d3
--- /dev/null
+++ b/libc/isystem/__charconv/to_chars_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/to_chars_result.h"
diff --git a/libc/isystem/__charconv/traits.h b/libc/isystem/__charconv/traits.h
new file mode 100644
index 000000000..e2c5ac488
--- /dev/null
+++ b/libc/isystem/__charconv/traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__charconv/traits.h"
diff --git a/libc/isystem/__chrono/calendar.h b/libc/isystem/__chrono/calendar.h
new file mode 100644
index 000000000..4f83cab7b
--- /dev/null
+++ b/libc/isystem/__chrono/calendar.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/calendar.h"
diff --git a/libc/isystem/__chrono/concepts.h b/libc/isystem/__chrono/concepts.h
new file mode 100644
index 000000000..e3a11cdde
--- /dev/null
+++ b/libc/isystem/__chrono/concepts.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/concepts.h"
diff --git a/libc/isystem/__chrono/convert_to_timespec.h b/libc/isystem/__chrono/convert_to_timespec.h
new file mode 100644
index 000000000..628d0b0a8
--- /dev/null
+++ b/libc/isystem/__chrono/convert_to_timespec.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/convert_to_timespec.h"
diff --git a/libc/isystem/__chrono/convert_to_tm.h b/libc/isystem/__chrono/convert_to_tm.h
new file mode 100644
index 000000000..f007e86fe
--- /dev/null
+++ b/libc/isystem/__chrono/convert_to_tm.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/convert_to_tm.h"
diff --git a/libc/isystem/__chrono/day.h b/libc/isystem/__chrono/day.h
new file mode 100644
index 000000000..120234303
--- /dev/null
+++ b/libc/isystem/__chrono/day.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/day.h"
diff --git a/libc/isystem/__chrono/duration.h b/libc/isystem/__chrono/duration.h
new file mode 100644
index 000000000..c3ed6729d
--- /dev/null
+++ b/libc/isystem/__chrono/duration.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/duration.h"
diff --git a/libc/isystem/__chrono/file_clock.h b/libc/isystem/__chrono/file_clock.h
new file mode 100644
index 000000000..95c3844f7
--- /dev/null
+++ b/libc/isystem/__chrono/file_clock.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/file_clock.h"
diff --git a/libc/isystem/__chrono/formatter.h b/libc/isystem/__chrono/formatter.h
new file mode 100644
index 000000000..fa3aadbb9
--- /dev/null
+++ b/libc/isystem/__chrono/formatter.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/formatter.h"
diff --git a/libc/isystem/__chrono/hh_mm_ss.h b/libc/isystem/__chrono/hh_mm_ss.h
new file mode 100644
index 000000000..9ad5eb53c
--- /dev/null
+++ b/libc/isystem/__chrono/hh_mm_ss.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/hh_mm_ss.h"
diff --git a/libc/isystem/__chrono/high_resolution_clock.h b/libc/isystem/__chrono/high_resolution_clock.h
new file mode 100644
index 000000000..154105dfa
--- /dev/null
+++ b/libc/isystem/__chrono/high_resolution_clock.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/high_resolution_clock.h"
diff --git a/libc/isystem/__chrono/literals.h b/libc/isystem/__chrono/literals.h
new file mode 100644
index 000000000..a78197f4c
--- /dev/null
+++ b/libc/isystem/__chrono/literals.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/literals.h"
diff --git a/libc/isystem/__chrono/month.h b/libc/isystem/__chrono/month.h
new file mode 100644
index 000000000..1efb44743
--- /dev/null
+++ b/libc/isystem/__chrono/month.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/month.h"
diff --git a/libc/isystem/__chrono/month_weekday.h b/libc/isystem/__chrono/month_weekday.h
new file mode 100644
index 000000000..4933df3c4
--- /dev/null
+++ b/libc/isystem/__chrono/month_weekday.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/month_weekday.h"
diff --git a/libc/isystem/__chrono/monthday.h b/libc/isystem/__chrono/monthday.h
new file mode 100644
index 000000000..116a0667c
--- /dev/null
+++ b/libc/isystem/__chrono/monthday.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/monthday.h"
diff --git a/libc/isystem/__chrono/ostream.h b/libc/isystem/__chrono/ostream.h
new file mode 100644
index 000000000..03c1d617a
--- /dev/null
+++ b/libc/isystem/__chrono/ostream.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/ostream.h"
diff --git a/libc/isystem/__chrono/parser_std_format_spec.h b/libc/isystem/__chrono/parser_std_format_spec.h
new file mode 100644
index 000000000..a1f0a7a66
--- /dev/null
+++ b/libc/isystem/__chrono/parser_std_format_spec.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/parser_std_format_spec.h"
diff --git a/libc/isystem/__chrono/statically_widen.h b/libc/isystem/__chrono/statically_widen.h
new file mode 100644
index 000000000..dc6792601
--- /dev/null
+++ b/libc/isystem/__chrono/statically_widen.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/statically_widen.h"
diff --git a/libc/isystem/__chrono/steady_clock.h b/libc/isystem/__chrono/steady_clock.h
new file mode 100644
index 000000000..662b4ff44
--- /dev/null
+++ b/libc/isystem/__chrono/steady_clock.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/steady_clock.h"
diff --git a/libc/isystem/__chrono/system_clock.h b/libc/isystem/__chrono/system_clock.h
new file mode 100644
index 000000000..074c1a445
--- /dev/null
+++ b/libc/isystem/__chrono/system_clock.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/system_clock.h"
diff --git a/libc/isystem/__chrono/time_point.h b/libc/isystem/__chrono/time_point.h
new file mode 100644
index 000000000..12e91e5d0
--- /dev/null
+++ b/libc/isystem/__chrono/time_point.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/time_point.h"
diff --git a/libc/isystem/__chrono/weekday.h b/libc/isystem/__chrono/weekday.h
new file mode 100644
index 000000000..a0495b5f1
--- /dev/null
+++ b/libc/isystem/__chrono/weekday.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/weekday.h"
diff --git a/libc/isystem/__chrono/year.h b/libc/isystem/__chrono/year.h
new file mode 100644
index 000000000..0cc26795f
--- /dev/null
+++ b/libc/isystem/__chrono/year.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/year.h"
diff --git a/libc/isystem/__chrono/year_month.h b/libc/isystem/__chrono/year_month.h
new file mode 100644
index 000000000..017ce52d5
--- /dev/null
+++ b/libc/isystem/__chrono/year_month.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/year_month.h"
diff --git a/libc/isystem/__chrono/year_month_day.h b/libc/isystem/__chrono/year_month_day.h
new file mode 100644
index 000000000..3dabb5381
--- /dev/null
+++ b/libc/isystem/__chrono/year_month_day.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/year_month_day.h"
diff --git a/libc/isystem/__chrono/year_month_weekday.h b/libc/isystem/__chrono/year_month_weekday.h
new file mode 100644
index 000000000..fea61c011
--- /dev/null
+++ b/libc/isystem/__chrono/year_month_weekday.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__chrono/year_month_weekday.h"
diff --git a/libc/isystem/__compare/common_comparison_category.h b/libc/isystem/__compare/common_comparison_category.h
new file mode 100644
index 000000000..de98b2a88
--- /dev/null
+++ b/libc/isystem/__compare/common_comparison_category.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/common_comparison_category.h"
diff --git a/libc/isystem/__compare/compare_partial_order_fallback.h b/libc/isystem/__compare/compare_partial_order_fallback.h
new file mode 100644
index 000000000..f3f249923
--- /dev/null
+++ b/libc/isystem/__compare/compare_partial_order_fallback.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/compare_partial_order_fallback.h"
diff --git a/libc/isystem/__compare/compare_strong_order_fallback.h b/libc/isystem/__compare/compare_strong_order_fallback.h
new file mode 100644
index 000000000..688861e61
--- /dev/null
+++ b/libc/isystem/__compare/compare_strong_order_fallback.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/compare_strong_order_fallback.h"
diff --git a/libc/isystem/__compare/compare_three_way.h b/libc/isystem/__compare/compare_three_way.h
new file mode 100644
index 000000000..87a359eaa
--- /dev/null
+++ b/libc/isystem/__compare/compare_three_way.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/compare_three_way.h"
diff --git a/libc/isystem/__compare/compare_three_way_result.h b/libc/isystem/__compare/compare_three_way_result.h
new file mode 100644
index 000000000..b917b6682
--- /dev/null
+++ b/libc/isystem/__compare/compare_three_way_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/compare_three_way_result.h"
diff --git a/libc/isystem/__compare/compare_weak_order_fallback.h b/libc/isystem/__compare/compare_weak_order_fallback.h
new file mode 100644
index 000000000..9fad4a764
--- /dev/null
+++ b/libc/isystem/__compare/compare_weak_order_fallback.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/compare_weak_order_fallback.h"
diff --git a/libc/isystem/__compare/is_eq.h b/libc/isystem/__compare/is_eq.h
new file mode 100644
index 000000000..8cc06f947
--- /dev/null
+++ b/libc/isystem/__compare/is_eq.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/is_eq.h"
diff --git a/libc/isystem/__compare/ordering.h b/libc/isystem/__compare/ordering.h
new file mode 100644
index 000000000..f3c216b03
--- /dev/null
+++ b/libc/isystem/__compare/ordering.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/ordering.h"
diff --git a/libc/isystem/__compare/partial_order.h b/libc/isystem/__compare/partial_order.h
new file mode 100644
index 000000000..a6d273736
--- /dev/null
+++ b/libc/isystem/__compare/partial_order.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/partial_order.h"
diff --git a/libc/isystem/__compare/strong_order.h b/libc/isystem/__compare/strong_order.h
new file mode 100644
index 000000000..d95c48d0d
--- /dev/null
+++ b/libc/isystem/__compare/strong_order.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/strong_order.h"
diff --git a/libc/isystem/__compare/synth_three_way.h b/libc/isystem/__compare/synth_three_way.h
new file mode 100644
index 000000000..d3ce27c29
--- /dev/null
+++ b/libc/isystem/__compare/synth_three_way.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/synth_three_way.h"
diff --git a/libc/isystem/__compare/three_way_comparable.h b/libc/isystem/__compare/three_way_comparable.h
new file mode 100644
index 000000000..2af969ffd
--- /dev/null
+++ b/libc/isystem/__compare/three_way_comparable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/three_way_comparable.h"
diff --git a/libc/isystem/__compare/weak_order.h b/libc/isystem/__compare/weak_order.h
new file mode 100644
index 000000000..4005b2733
--- /dev/null
+++ b/libc/isystem/__compare/weak_order.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__compare/weak_order.h"
diff --git a/libc/isystem/__concepts/arithmetic.h b/libc/isystem/__concepts/arithmetic.h
new file mode 100644
index 000000000..b7b3a862b
--- /dev/null
+++ b/libc/isystem/__concepts/arithmetic.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/arithmetic.h"
diff --git a/libc/isystem/__concepts/assignable.h b/libc/isystem/__concepts/assignable.h
new file mode 100644
index 000000000..a5ba0abc6
--- /dev/null
+++ b/libc/isystem/__concepts/assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/assignable.h"
diff --git a/libc/isystem/__concepts/boolean_testable.h b/libc/isystem/__concepts/boolean_testable.h
new file mode 100644
index 000000000..d2090da3e
--- /dev/null
+++ b/libc/isystem/__concepts/boolean_testable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/boolean_testable.h"
diff --git a/libc/isystem/__concepts/class_or_enum.h b/libc/isystem/__concepts/class_or_enum.h
new file mode 100644
index 000000000..06f8b868b
--- /dev/null
+++ b/libc/isystem/__concepts/class_or_enum.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/class_or_enum.h"
diff --git a/libc/isystem/__concepts/common_reference_with.h b/libc/isystem/__concepts/common_reference_with.h
new file mode 100644
index 000000000..3fb55b0bf
--- /dev/null
+++ b/libc/isystem/__concepts/common_reference_with.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/common_reference_with.h"
diff --git a/libc/isystem/__concepts/common_with.h b/libc/isystem/__concepts/common_with.h
new file mode 100644
index 000000000..8312c9793
--- /dev/null
+++ b/libc/isystem/__concepts/common_with.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/common_with.h"
diff --git a/libc/isystem/__concepts/constructible.h b/libc/isystem/__concepts/constructible.h
new file mode 100644
index 000000000..bcc66912e
--- /dev/null
+++ b/libc/isystem/__concepts/constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/constructible.h"
diff --git a/libc/isystem/__concepts/convertible_to.h b/libc/isystem/__concepts/convertible_to.h
new file mode 100644
index 000000000..28d89bcea
--- /dev/null
+++ b/libc/isystem/__concepts/convertible_to.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/convertible_to.h"
diff --git a/libc/isystem/__concepts/copyable.h b/libc/isystem/__concepts/copyable.h
new file mode 100644
index 000000000..ee9df4ce8
--- /dev/null
+++ b/libc/isystem/__concepts/copyable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/copyable.h"
diff --git a/libc/isystem/__concepts/derived_from.h b/libc/isystem/__concepts/derived_from.h
new file mode 100644
index 000000000..4598169a0
--- /dev/null
+++ b/libc/isystem/__concepts/derived_from.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/derived_from.h"
diff --git a/libc/isystem/__concepts/destructible.h b/libc/isystem/__concepts/destructible.h
new file mode 100644
index 000000000..7b6bd65bb
--- /dev/null
+++ b/libc/isystem/__concepts/destructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/destructible.h"
diff --git a/libc/isystem/__concepts/different_from.h b/libc/isystem/__concepts/different_from.h
new file mode 100644
index 000000000..81c2114f3
--- /dev/null
+++ b/libc/isystem/__concepts/different_from.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/different_from.h"
diff --git a/libc/isystem/__concepts/equality_comparable.h b/libc/isystem/__concepts/equality_comparable.h
new file mode 100644
index 000000000..6c1abeafa
--- /dev/null
+++ b/libc/isystem/__concepts/equality_comparable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/equality_comparable.h"
diff --git a/libc/isystem/__concepts/invocable.h b/libc/isystem/__concepts/invocable.h
new file mode 100644
index 000000000..e5639ff74
--- /dev/null
+++ b/libc/isystem/__concepts/invocable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/invocable.h"
diff --git a/libc/isystem/__concepts/movable.h b/libc/isystem/__concepts/movable.h
new file mode 100644
index 000000000..d3939c357
--- /dev/null
+++ b/libc/isystem/__concepts/movable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/movable.h"
diff --git a/libc/isystem/__concepts/predicate.h b/libc/isystem/__concepts/predicate.h
new file mode 100644
index 000000000..bb5145da0
--- /dev/null
+++ b/libc/isystem/__concepts/predicate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/predicate.h"
diff --git a/libc/isystem/__concepts/regular.h b/libc/isystem/__concepts/regular.h
new file mode 100644
index 000000000..39772486a
--- /dev/null
+++ b/libc/isystem/__concepts/regular.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/regular.h"
diff --git a/libc/isystem/__concepts/relation.h b/libc/isystem/__concepts/relation.h
new file mode 100644
index 000000000..ca0f9603f
--- /dev/null
+++ b/libc/isystem/__concepts/relation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/relation.h"
diff --git a/libc/isystem/__concepts/same_as.h b/libc/isystem/__concepts/same_as.h
new file mode 100644
index 000000000..5aed22981
--- /dev/null
+++ b/libc/isystem/__concepts/same_as.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/same_as.h"
diff --git a/libc/isystem/__concepts/semiregular.h b/libc/isystem/__concepts/semiregular.h
new file mode 100644
index 000000000..419b73bed
--- /dev/null
+++ b/libc/isystem/__concepts/semiregular.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/semiregular.h"
diff --git a/libc/isystem/__concepts/swappable.h b/libc/isystem/__concepts/swappable.h
new file mode 100644
index 000000000..30f479c29
--- /dev/null
+++ b/libc/isystem/__concepts/swappable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/swappable.h"
diff --git a/libc/isystem/__concepts/totally_ordered.h b/libc/isystem/__concepts/totally_ordered.h
new file mode 100644
index 000000000..b7b4cd760
--- /dev/null
+++ b/libc/isystem/__concepts/totally_ordered.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__concepts/totally_ordered.h"
diff --git a/libc/isystem/__condition_variable/condition_variable.h b/libc/isystem/__condition_variable/condition_variable.h
new file mode 100644
index 000000000..27d82ac00
--- /dev/null
+++ b/libc/isystem/__condition_variable/condition_variable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__condition_variable/condition_variable.h"
diff --git a/libc/isystem/__config b/libc/isystem/__config
new file mode 100644
index 000000000..e0803675d
--- /dev/null
+++ b/libc/isystem/__config
@@ -0,0 +1 @@
+#include "third_party/libcxx/__config"
diff --git a/libc/isystem/__config_site b/libc/isystem/__config_site
new file mode 100644
index 000000000..281eff4b8
--- /dev/null
+++ b/libc/isystem/__config_site
@@ -0,0 +1 @@
+#include "third_party/libcxx/__config_site"
diff --git a/libc/isystem/__coroutine/coroutine_handle.h b/libc/isystem/__coroutine/coroutine_handle.h
new file mode 100644
index 000000000..edc585dcc
--- /dev/null
+++ b/libc/isystem/__coroutine/coroutine_handle.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__coroutine/coroutine_handle.h"
diff --git a/libc/isystem/__coroutine/coroutine_traits.h b/libc/isystem/__coroutine/coroutine_traits.h
new file mode 100644
index 000000000..bbff1ebdd
--- /dev/null
+++ b/libc/isystem/__coroutine/coroutine_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__coroutine/coroutine_traits.h"
diff --git a/libc/isystem/__coroutine/noop_coroutine_handle.h b/libc/isystem/__coroutine/noop_coroutine_handle.h
new file mode 100644
index 000000000..2f171894e
--- /dev/null
+++ b/libc/isystem/__coroutine/noop_coroutine_handle.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__coroutine/noop_coroutine_handle.h"
diff --git a/libc/isystem/__coroutine/trivial_awaitables.h b/libc/isystem/__coroutine/trivial_awaitables.h
new file mode 100644
index 000000000..3bdd68e0e
--- /dev/null
+++ b/libc/isystem/__coroutine/trivial_awaitables.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__coroutine/trivial_awaitables.h"
diff --git a/libc/isystem/__debug b/libc/isystem/__debug
new file mode 100644
index 000000000..04395b9bc
--- /dev/null
+++ b/libc/isystem/__debug
@@ -0,0 +1 @@
+#include "third_party/libcxx/__debug"
diff --git a/libc/isystem/__debug_utils/randomize_range.h b/libc/isystem/__debug_utils/randomize_range.h
new file mode 100644
index 000000000..a33bc6b98
--- /dev/null
+++ b/libc/isystem/__debug_utils/randomize_range.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__debug_utils/randomize_range.h"
diff --git a/libc/isystem/__exception/exception.h b/libc/isystem/__exception/exception.h
new file mode 100644
index 000000000..fc7c3312e
--- /dev/null
+++ b/libc/isystem/__exception/exception.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__exception/exception.h"
diff --git a/libc/isystem/__exception/exception_ptr.h b/libc/isystem/__exception/exception_ptr.h
new file mode 100644
index 000000000..10b134614
--- /dev/null
+++ b/libc/isystem/__exception/exception_ptr.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__exception/exception_ptr.h"
diff --git a/libc/isystem/__exception/nested_exception.h b/libc/isystem/__exception/nested_exception.h
new file mode 100644
index 000000000..e5900e7b3
--- /dev/null
+++ b/libc/isystem/__exception/nested_exception.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__exception/nested_exception.h"
diff --git a/libc/isystem/__exception/operations.h b/libc/isystem/__exception/operations.h
new file mode 100644
index 000000000..330230368
--- /dev/null
+++ b/libc/isystem/__exception/operations.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__exception/operations.h"
diff --git a/libc/isystem/__exception/terminate.h b/libc/isystem/__exception/terminate.h
new file mode 100644
index 000000000..bd9211ab7
--- /dev/null
+++ b/libc/isystem/__exception/terminate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__exception/terminate.h"
diff --git a/libc/isystem/__expected/bad_expected_access.h b/libc/isystem/__expected/bad_expected_access.h
new file mode 100644
index 000000000..42f6c73e4
--- /dev/null
+++ b/libc/isystem/__expected/bad_expected_access.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__expected/bad_expected_access.h"
diff --git a/libc/isystem/__expected/expected.h b/libc/isystem/__expected/expected.h
new file mode 100644
index 000000000..ba78e807d
--- /dev/null
+++ b/libc/isystem/__expected/expected.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__expected/expected.h"
diff --git a/libc/isystem/__expected/unexpect.h b/libc/isystem/__expected/unexpect.h
new file mode 100644
index 000000000..16d23ea4e
--- /dev/null
+++ b/libc/isystem/__expected/unexpect.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__expected/unexpect.h"
diff --git a/libc/isystem/__expected/unexpected.h b/libc/isystem/__expected/unexpected.h
new file mode 100644
index 000000000..51a2d1b6a
--- /dev/null
+++ b/libc/isystem/__expected/unexpected.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__expected/unexpected.h"
diff --git a/libc/isystem/__filesystem/copy_options.h b/libc/isystem/__filesystem/copy_options.h
new file mode 100644
index 000000000..6bb46afbf
--- /dev/null
+++ b/libc/isystem/__filesystem/copy_options.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/copy_options.h"
diff --git a/libc/isystem/__filesystem/directory_entry.h b/libc/isystem/__filesystem/directory_entry.h
new file mode 100644
index 000000000..888c4219c
--- /dev/null
+++ b/libc/isystem/__filesystem/directory_entry.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/directory_entry.h"
diff --git a/libc/isystem/__filesystem/directory_iterator.h b/libc/isystem/__filesystem/directory_iterator.h
new file mode 100644
index 000000000..c8a3ab02e
--- /dev/null
+++ b/libc/isystem/__filesystem/directory_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/directory_iterator.h"
diff --git a/libc/isystem/__filesystem/directory_options.h b/libc/isystem/__filesystem/directory_options.h
new file mode 100644
index 000000000..c6fd6d089
--- /dev/null
+++ b/libc/isystem/__filesystem/directory_options.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/directory_options.h"
diff --git a/libc/isystem/__filesystem/file_status.h b/libc/isystem/__filesystem/file_status.h
new file mode 100644
index 000000000..45626c212
--- /dev/null
+++ b/libc/isystem/__filesystem/file_status.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/file_status.h"
diff --git a/libc/isystem/__filesystem/file_time_type.h b/libc/isystem/__filesystem/file_time_type.h
new file mode 100644
index 000000000..69568dbcf
--- /dev/null
+++ b/libc/isystem/__filesystem/file_time_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/file_time_type.h"
diff --git a/libc/isystem/__filesystem/file_type.h b/libc/isystem/__filesystem/file_type.h
new file mode 100644
index 000000000..335a11c9e
--- /dev/null
+++ b/libc/isystem/__filesystem/file_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/file_type.h"
diff --git a/libc/isystem/__filesystem/filesystem_error.h b/libc/isystem/__filesystem/filesystem_error.h
new file mode 100644
index 000000000..b95126848
--- /dev/null
+++ b/libc/isystem/__filesystem/filesystem_error.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/filesystem_error.h"
diff --git a/libc/isystem/__filesystem/operations.h b/libc/isystem/__filesystem/operations.h
new file mode 100644
index 000000000..d0d79480d
--- /dev/null
+++ b/libc/isystem/__filesystem/operations.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/operations.h"
diff --git a/libc/isystem/__filesystem/path.h b/libc/isystem/__filesystem/path.h
new file mode 100644
index 000000000..c5a25621f
--- /dev/null
+++ b/libc/isystem/__filesystem/path.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/path.h"
diff --git a/libc/isystem/__filesystem/path_iterator.h b/libc/isystem/__filesystem/path_iterator.h
new file mode 100644
index 000000000..c730fd4b3
--- /dev/null
+++ b/libc/isystem/__filesystem/path_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/path_iterator.h"
diff --git a/libc/isystem/__filesystem/perm_options.h b/libc/isystem/__filesystem/perm_options.h
new file mode 100644
index 000000000..dbd32cbc7
--- /dev/null
+++ b/libc/isystem/__filesystem/perm_options.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/perm_options.h"
diff --git a/libc/isystem/__filesystem/perms.h b/libc/isystem/__filesystem/perms.h
new file mode 100644
index 000000000..b83178fc0
--- /dev/null
+++ b/libc/isystem/__filesystem/perms.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/perms.h"
diff --git a/libc/isystem/__filesystem/recursive_directory_iterator.h b/libc/isystem/__filesystem/recursive_directory_iterator.h
new file mode 100644
index 000000000..08bf7946c
--- /dev/null
+++ b/libc/isystem/__filesystem/recursive_directory_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/recursive_directory_iterator.h"
diff --git a/libc/isystem/__filesystem/space_info.h b/libc/isystem/__filesystem/space_info.h
new file mode 100644
index 000000000..81b17e6e7
--- /dev/null
+++ b/libc/isystem/__filesystem/space_info.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/space_info.h"
diff --git a/libc/isystem/__filesystem/u8path.h b/libc/isystem/__filesystem/u8path.h
new file mode 100644
index 000000000..fffbb8ce0
--- /dev/null
+++ b/libc/isystem/__filesystem/u8path.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__filesystem/u8path.h"
diff --git a/libc/isystem/__format/buffer.h b/libc/isystem/__format/buffer.h
new file mode 100644
index 000000000..818c3b38e
--- /dev/null
+++ b/libc/isystem/__format/buffer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/buffer.h"
diff --git a/libc/isystem/__format/concepts.h b/libc/isystem/__format/concepts.h
new file mode 100644
index 000000000..ecb0e8480
--- /dev/null
+++ b/libc/isystem/__format/concepts.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/concepts.h"
diff --git a/libc/isystem/__format/container_adaptor.h b/libc/isystem/__format/container_adaptor.h
new file mode 100644
index 000000000..4cd42bf05
--- /dev/null
+++ b/libc/isystem/__format/container_adaptor.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/container_adaptor.h"
diff --git a/libc/isystem/__format/enable_insertable.h b/libc/isystem/__format/enable_insertable.h
new file mode 100644
index 000000000..b10f358d7
--- /dev/null
+++ b/libc/isystem/__format/enable_insertable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/enable_insertable.h"
diff --git a/libc/isystem/__format/escaped_output_table.h b/libc/isystem/__format/escaped_output_table.h
new file mode 100644
index 000000000..0388bfd25
--- /dev/null
+++ b/libc/isystem/__format/escaped_output_table.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/escaped_output_table.h"
diff --git a/libc/isystem/__format/extended_grapheme_cluster_table.h b/libc/isystem/__format/extended_grapheme_cluster_table.h
new file mode 100644
index 000000000..9dcfd92a8
--- /dev/null
+++ b/libc/isystem/__format/extended_grapheme_cluster_table.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/extended_grapheme_cluster_table.h"
diff --git a/libc/isystem/__format/format_arg.h b/libc/isystem/__format/format_arg.h
new file mode 100644
index 000000000..8e13b8af3
--- /dev/null
+++ b/libc/isystem/__format/format_arg.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_arg.h"
diff --git a/libc/isystem/__format/format_arg_store.h b/libc/isystem/__format/format_arg_store.h
new file mode 100644
index 000000000..3f917bb4e
--- /dev/null
+++ b/libc/isystem/__format/format_arg_store.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_arg_store.h"
diff --git a/libc/isystem/__format/format_args.h b/libc/isystem/__format/format_args.h
new file mode 100644
index 000000000..55d301edf
--- /dev/null
+++ b/libc/isystem/__format/format_args.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_args.h"
diff --git a/libc/isystem/__format/format_context.h b/libc/isystem/__format/format_context.h
new file mode 100644
index 000000000..b29184b2d
--- /dev/null
+++ b/libc/isystem/__format/format_context.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_context.h"
diff --git a/libc/isystem/__format/format_error.h b/libc/isystem/__format/format_error.h
new file mode 100644
index 000000000..5f3f358b5
--- /dev/null
+++ b/libc/isystem/__format/format_error.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_error.h"
diff --git a/libc/isystem/__format/format_functions.h b/libc/isystem/__format/format_functions.h
new file mode 100644
index 000000000..14f8df3b1
--- /dev/null
+++ b/libc/isystem/__format/format_functions.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_functions.h"
diff --git a/libc/isystem/__format/format_fwd.h b/libc/isystem/__format/format_fwd.h
new file mode 100644
index 000000000..590990237
--- /dev/null
+++ b/libc/isystem/__format/format_fwd.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_fwd.h"
diff --git a/libc/isystem/__format/format_parse_context.h b/libc/isystem/__format/format_parse_context.h
new file mode 100644
index 000000000..c1cf595f1
--- /dev/null
+++ b/libc/isystem/__format/format_parse_context.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_parse_context.h"
diff --git a/libc/isystem/__format/format_string.h b/libc/isystem/__format/format_string.h
new file mode 100644
index 000000000..427a501bd
--- /dev/null
+++ b/libc/isystem/__format/format_string.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_string.h"
diff --git a/libc/isystem/__format/format_to_n_result.h b/libc/isystem/__format/format_to_n_result.h
new file mode 100644
index 000000000..099dee51a
--- /dev/null
+++ b/libc/isystem/__format/format_to_n_result.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/format_to_n_result.h"
diff --git a/libc/isystem/__format/formatter.h b/libc/isystem/__format/formatter.h
new file mode 100644
index 000000000..740c4cc54
--- /dev/null
+++ b/libc/isystem/__format/formatter.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter.h"
diff --git a/libc/isystem/__format/formatter_bool.h b/libc/isystem/__format/formatter_bool.h
new file mode 100644
index 000000000..bf532de5d
--- /dev/null
+++ b/libc/isystem/__format/formatter_bool.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_bool.h"
diff --git a/libc/isystem/__format/formatter_char.h b/libc/isystem/__format/formatter_char.h
new file mode 100644
index 000000000..6674430b2
--- /dev/null
+++ b/libc/isystem/__format/formatter_char.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_char.h"
diff --git a/libc/isystem/__format/formatter_floating_point.h b/libc/isystem/__format/formatter_floating_point.h
new file mode 100644
index 000000000..ec3237b4d
--- /dev/null
+++ b/libc/isystem/__format/formatter_floating_point.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_floating_point.h"
diff --git a/libc/isystem/__format/formatter_integer.h b/libc/isystem/__format/formatter_integer.h
new file mode 100644
index 000000000..5999edecd
--- /dev/null
+++ b/libc/isystem/__format/formatter_integer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_integer.h"
diff --git a/libc/isystem/__format/formatter_integral.h b/libc/isystem/__format/formatter_integral.h
new file mode 100644
index 000000000..4564f4087
--- /dev/null
+++ b/libc/isystem/__format/formatter_integral.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_integral.h"
diff --git a/libc/isystem/__format/formatter_output.h b/libc/isystem/__format/formatter_output.h
new file mode 100644
index 000000000..858d25ee8
--- /dev/null
+++ b/libc/isystem/__format/formatter_output.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_output.h"
diff --git a/libc/isystem/__format/formatter_pointer.h b/libc/isystem/__format/formatter_pointer.h
new file mode 100644
index 000000000..1299e68bf
--- /dev/null
+++ b/libc/isystem/__format/formatter_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_pointer.h"
diff --git a/libc/isystem/__format/formatter_string.h b/libc/isystem/__format/formatter_string.h
new file mode 100644
index 000000000..3223ce63b
--- /dev/null
+++ b/libc/isystem/__format/formatter_string.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_string.h"
diff --git a/libc/isystem/__format/formatter_tuple.h b/libc/isystem/__format/formatter_tuple.h
new file mode 100644
index 000000000..1cc3c466a
--- /dev/null
+++ b/libc/isystem/__format/formatter_tuple.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/formatter_tuple.h"
diff --git a/libc/isystem/__format/parser_std_format_spec.h b/libc/isystem/__format/parser_std_format_spec.h
new file mode 100644
index 000000000..e308602a4
--- /dev/null
+++ b/libc/isystem/__format/parser_std_format_spec.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/parser_std_format_spec.h"
diff --git a/libc/isystem/__format/range_default_formatter.h b/libc/isystem/__format/range_default_formatter.h
new file mode 100644
index 000000000..fd460f6f3
--- /dev/null
+++ b/libc/isystem/__format/range_default_formatter.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/range_default_formatter.h"
diff --git a/libc/isystem/__format/range_formatter.h b/libc/isystem/__format/range_formatter.h
new file mode 100644
index 000000000..c99e4af1b
--- /dev/null
+++ b/libc/isystem/__format/range_formatter.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/range_formatter.h"
diff --git a/libc/isystem/__format/unicode.h b/libc/isystem/__format/unicode.h
new file mode 100644
index 000000000..75cfb9873
--- /dev/null
+++ b/libc/isystem/__format/unicode.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/unicode.h"
diff --git a/libc/isystem/__format/width_estimation_table.h b/libc/isystem/__format/width_estimation_table.h
new file mode 100644
index 000000000..61e359e53
--- /dev/null
+++ b/libc/isystem/__format/width_estimation_table.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__format/width_estimation_table.h"
diff --git a/libc/isystem/__functional/binary_function.h b/libc/isystem/__functional/binary_function.h
new file mode 100644
index 000000000..dfa940f7c
--- /dev/null
+++ b/libc/isystem/__functional/binary_function.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/binary_function.h"
diff --git a/libc/isystem/__functional/binary_negate.h b/libc/isystem/__functional/binary_negate.h
new file mode 100644
index 000000000..74b7fb637
--- /dev/null
+++ b/libc/isystem/__functional/binary_negate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/binary_negate.h"
diff --git a/libc/isystem/__functional/bind.h b/libc/isystem/__functional/bind.h
new file mode 100644
index 000000000..495bb1111
--- /dev/null
+++ b/libc/isystem/__functional/bind.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/bind.h"
diff --git a/libc/isystem/__functional/bind_back.h b/libc/isystem/__functional/bind_back.h
new file mode 100644
index 000000000..d8b1b5704
--- /dev/null
+++ b/libc/isystem/__functional/bind_back.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/bind_back.h"
diff --git a/libc/isystem/__functional/bind_front.h b/libc/isystem/__functional/bind_front.h
new file mode 100644
index 000000000..8581fe048
--- /dev/null
+++ b/libc/isystem/__functional/bind_front.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/bind_front.h"
diff --git a/libc/isystem/__functional/binder1st.h b/libc/isystem/__functional/binder1st.h
new file mode 100644
index 000000000..548aa5cc6
--- /dev/null
+++ b/libc/isystem/__functional/binder1st.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/binder1st.h"
diff --git a/libc/isystem/__functional/binder2nd.h b/libc/isystem/__functional/binder2nd.h
new file mode 100644
index 000000000..f194a998d
--- /dev/null
+++ b/libc/isystem/__functional/binder2nd.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/binder2nd.h"
diff --git a/libc/isystem/__functional/boyer_moore_searcher.h b/libc/isystem/__functional/boyer_moore_searcher.h
new file mode 100644
index 000000000..028e35e6d
--- /dev/null
+++ b/libc/isystem/__functional/boyer_moore_searcher.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/boyer_moore_searcher.h"
diff --git a/libc/isystem/__functional/compose.h b/libc/isystem/__functional/compose.h
new file mode 100644
index 000000000..e190a0a2e
--- /dev/null
+++ b/libc/isystem/__functional/compose.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/compose.h"
diff --git a/libc/isystem/__functional/default_searcher.h b/libc/isystem/__functional/default_searcher.h
new file mode 100644
index 000000000..64ab66f8e
--- /dev/null
+++ b/libc/isystem/__functional/default_searcher.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/default_searcher.h"
diff --git a/libc/isystem/__functional/function.h b/libc/isystem/__functional/function.h
new file mode 100644
index 000000000..e70e5e883
--- /dev/null
+++ b/libc/isystem/__functional/function.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/function.h"
diff --git a/libc/isystem/__functional/hash.h b/libc/isystem/__functional/hash.h
new file mode 100644
index 000000000..269bca2e1
--- /dev/null
+++ b/libc/isystem/__functional/hash.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/hash.h"
diff --git a/libc/isystem/__functional/identity.h b/libc/isystem/__functional/identity.h
new file mode 100644
index 000000000..1174eb660
--- /dev/null
+++ b/libc/isystem/__functional/identity.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/identity.h"
diff --git a/libc/isystem/__functional/invoke.h b/libc/isystem/__functional/invoke.h
new file mode 100644
index 000000000..541cfff6e
--- /dev/null
+++ b/libc/isystem/__functional/invoke.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/invoke.h"
diff --git a/libc/isystem/__functional/is_transparent.h b/libc/isystem/__functional/is_transparent.h
new file mode 100644
index 000000000..cbfdcd080
--- /dev/null
+++ b/libc/isystem/__functional/is_transparent.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/is_transparent.h"
diff --git a/libc/isystem/__functional/mem_fn.h b/libc/isystem/__functional/mem_fn.h
new file mode 100644
index 000000000..a7d6c0309
--- /dev/null
+++ b/libc/isystem/__functional/mem_fn.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/mem_fn.h"
diff --git a/libc/isystem/__functional/mem_fun_ref.h b/libc/isystem/__functional/mem_fun_ref.h
new file mode 100644
index 000000000..49430c517
--- /dev/null
+++ b/libc/isystem/__functional/mem_fun_ref.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/mem_fun_ref.h"
diff --git a/libc/isystem/__functional/not_fn.h b/libc/isystem/__functional/not_fn.h
new file mode 100644
index 000000000..f1f98ada8
--- /dev/null
+++ b/libc/isystem/__functional/not_fn.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/not_fn.h"
diff --git a/libc/isystem/__functional/operations.h b/libc/isystem/__functional/operations.h
new file mode 100644
index 000000000..bd14a162a
--- /dev/null
+++ b/libc/isystem/__functional/operations.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/operations.h"
diff --git a/libc/isystem/__functional/perfect_forward.h b/libc/isystem/__functional/perfect_forward.h
new file mode 100644
index 000000000..8506b8297
--- /dev/null
+++ b/libc/isystem/__functional/perfect_forward.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/perfect_forward.h"
diff --git a/libc/isystem/__functional/pointer_to_binary_function.h b/libc/isystem/__functional/pointer_to_binary_function.h
new file mode 100644
index 000000000..648eb8c42
--- /dev/null
+++ b/libc/isystem/__functional/pointer_to_binary_function.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/pointer_to_binary_function.h"
diff --git a/libc/isystem/__functional/pointer_to_unary_function.h b/libc/isystem/__functional/pointer_to_unary_function.h
new file mode 100644
index 000000000..ef966d038
--- /dev/null
+++ b/libc/isystem/__functional/pointer_to_unary_function.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/pointer_to_unary_function.h"
diff --git a/libc/isystem/__functional/ranges_operations.h b/libc/isystem/__functional/ranges_operations.h
new file mode 100644
index 000000000..42d5ebb19
--- /dev/null
+++ b/libc/isystem/__functional/ranges_operations.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/ranges_operations.h"
diff --git a/libc/isystem/__functional/reference_wrapper.h b/libc/isystem/__functional/reference_wrapper.h
new file mode 100644
index 000000000..0dcebaf1d
--- /dev/null
+++ b/libc/isystem/__functional/reference_wrapper.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/reference_wrapper.h"
diff --git a/libc/isystem/__functional/unary_function.h b/libc/isystem/__functional/unary_function.h
new file mode 100644
index 000000000..31762250a
--- /dev/null
+++ b/libc/isystem/__functional/unary_function.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/unary_function.h"
diff --git a/libc/isystem/__functional/unary_negate.h b/libc/isystem/__functional/unary_negate.h
new file mode 100644
index 000000000..b411f1d1e
--- /dev/null
+++ b/libc/isystem/__functional/unary_negate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/unary_negate.h"
diff --git a/libc/isystem/__functional/weak_result_type.h b/libc/isystem/__functional/weak_result_type.h
new file mode 100644
index 000000000..931b520c8
--- /dev/null
+++ b/libc/isystem/__functional/weak_result_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__functional/weak_result_type.h"
diff --git a/libc/isystem/__fwd/array.h b/libc/isystem/__fwd/array.h
new file mode 100644
index 000000000..f35d9fa29
--- /dev/null
+++ b/libc/isystem/__fwd/array.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/array.h"
diff --git a/libc/isystem/__fwd/fstream.h b/libc/isystem/__fwd/fstream.h
new file mode 100644
index 000000000..320157622
--- /dev/null
+++ b/libc/isystem/__fwd/fstream.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/fstream.h"
diff --git a/libc/isystem/__fwd/get.h b/libc/isystem/__fwd/get.h
new file mode 100644
index 000000000..62f51e163
--- /dev/null
+++ b/libc/isystem/__fwd/get.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/get.h"
diff --git a/libc/isystem/__fwd/hash.h b/libc/isystem/__fwd/hash.h
new file mode 100644
index 000000000..efb447e83
--- /dev/null
+++ b/libc/isystem/__fwd/hash.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/hash.h"
diff --git a/libc/isystem/__fwd/ios.h b/libc/isystem/__fwd/ios.h
new file mode 100644
index 000000000..a121bea35
--- /dev/null
+++ b/libc/isystem/__fwd/ios.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/ios.h"
diff --git a/libc/isystem/__fwd/istream.h b/libc/isystem/__fwd/istream.h
new file mode 100644
index 000000000..f2f3a07ed
--- /dev/null
+++ b/libc/isystem/__fwd/istream.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/istream.h"
diff --git a/libc/isystem/__fwd/memory_resource.h b/libc/isystem/__fwd/memory_resource.h
new file mode 100644
index 000000000..9c3a22885
--- /dev/null
+++ b/libc/isystem/__fwd/memory_resource.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/memory_resource.h"
diff --git a/libc/isystem/__fwd/ostream.h b/libc/isystem/__fwd/ostream.h
new file mode 100644
index 000000000..a3ed81564
--- /dev/null
+++ b/libc/isystem/__fwd/ostream.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/ostream.h"
diff --git a/libc/isystem/__fwd/pair.h b/libc/isystem/__fwd/pair.h
new file mode 100644
index 000000000..8ae0ca4be
--- /dev/null
+++ b/libc/isystem/__fwd/pair.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/pair.h"
diff --git a/libc/isystem/__fwd/span.h b/libc/isystem/__fwd/span.h
new file mode 100644
index 000000000..bc2ac552e
--- /dev/null
+++ b/libc/isystem/__fwd/span.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/span.h"
diff --git a/libc/isystem/__fwd/sstream.h b/libc/isystem/__fwd/sstream.h
new file mode 100644
index 000000000..fccf7caa6
--- /dev/null
+++ b/libc/isystem/__fwd/sstream.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/sstream.h"
diff --git a/libc/isystem/__fwd/streambuf.h b/libc/isystem/__fwd/streambuf.h
new file mode 100644
index 000000000..cd3c6bc22
--- /dev/null
+++ b/libc/isystem/__fwd/streambuf.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/streambuf.h"
diff --git a/libc/isystem/__fwd/string.h b/libc/isystem/__fwd/string.h
new file mode 100644
index 000000000..403ba5d2e
--- /dev/null
+++ b/libc/isystem/__fwd/string.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/string.h"
diff --git a/libc/isystem/__fwd/string_view.h b/libc/isystem/__fwd/string_view.h
new file mode 100644
index 000000000..d094372be
--- /dev/null
+++ b/libc/isystem/__fwd/string_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/string_view.h"
diff --git a/libc/isystem/__fwd/subrange.h b/libc/isystem/__fwd/subrange.h
new file mode 100644
index 000000000..baecfb3b7
--- /dev/null
+++ b/libc/isystem/__fwd/subrange.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/subrange.h"
diff --git a/libc/isystem/__fwd/tuple.h b/libc/isystem/__fwd/tuple.h
new file mode 100644
index 000000000..ba1a2d888
--- /dev/null
+++ b/libc/isystem/__fwd/tuple.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__fwd/tuple.h"
diff --git a/libc/isystem/__hash_table b/libc/isystem/__hash_table
new file mode 100644
index 000000000..06dd032db
--- /dev/null
+++ b/libc/isystem/__hash_table
@@ -0,0 +1 @@
+#include "third_party/libcxx/__hash_table"
diff --git a/libc/isystem/__ios/fpos.h b/libc/isystem/__ios/fpos.h
new file mode 100644
index 000000000..8752622fd
--- /dev/null
+++ b/libc/isystem/__ios/fpos.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ios/fpos.h"
diff --git a/libc/isystem/__iterator/access.h b/libc/isystem/__iterator/access.h
new file mode 100644
index 000000000..42e8993ad
--- /dev/null
+++ b/libc/isystem/__iterator/access.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/access.h"
diff --git a/libc/isystem/__iterator/advance.h b/libc/isystem/__iterator/advance.h
new file mode 100644
index 000000000..b9d6ecb59
--- /dev/null
+++ b/libc/isystem/__iterator/advance.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/advance.h"
diff --git a/libc/isystem/__iterator/back_insert_iterator.h b/libc/isystem/__iterator/back_insert_iterator.h
new file mode 100644
index 000000000..8b6ea8655
--- /dev/null
+++ b/libc/isystem/__iterator/back_insert_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/back_insert_iterator.h"
diff --git a/libc/isystem/__iterator/bounded_iter.h b/libc/isystem/__iterator/bounded_iter.h
new file mode 100644
index 000000000..3acf996fc
--- /dev/null
+++ b/libc/isystem/__iterator/bounded_iter.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/bounded_iter.h"
diff --git a/libc/isystem/__iterator/common_iterator.h b/libc/isystem/__iterator/common_iterator.h
new file mode 100644
index 000000000..74b1bfeda
--- /dev/null
+++ b/libc/isystem/__iterator/common_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/common_iterator.h"
diff --git a/libc/isystem/__iterator/concepts.h b/libc/isystem/__iterator/concepts.h
new file mode 100644
index 000000000..b930c317d
--- /dev/null
+++ b/libc/isystem/__iterator/concepts.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/concepts.h"
diff --git a/libc/isystem/__iterator/counted_iterator.h b/libc/isystem/__iterator/counted_iterator.h
new file mode 100644
index 000000000..e8fea8179
--- /dev/null
+++ b/libc/isystem/__iterator/counted_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/counted_iterator.h"
diff --git a/libc/isystem/__iterator/data.h b/libc/isystem/__iterator/data.h
new file mode 100644
index 000000000..074df0122
--- /dev/null
+++ b/libc/isystem/__iterator/data.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/data.h"
diff --git a/libc/isystem/__iterator/default_sentinel.h b/libc/isystem/__iterator/default_sentinel.h
new file mode 100644
index 000000000..35f516299
--- /dev/null
+++ b/libc/isystem/__iterator/default_sentinel.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/default_sentinel.h"
diff --git a/libc/isystem/__iterator/distance.h b/libc/isystem/__iterator/distance.h
new file mode 100644
index 000000000..03427f348
--- /dev/null
+++ b/libc/isystem/__iterator/distance.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/distance.h"
diff --git a/libc/isystem/__iterator/empty.h b/libc/isystem/__iterator/empty.h
new file mode 100644
index 000000000..7dff787a6
--- /dev/null
+++ b/libc/isystem/__iterator/empty.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/empty.h"
diff --git a/libc/isystem/__iterator/erase_if_container.h b/libc/isystem/__iterator/erase_if_container.h
new file mode 100644
index 000000000..891c35d32
--- /dev/null
+++ b/libc/isystem/__iterator/erase_if_container.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/erase_if_container.h"
diff --git a/libc/isystem/__iterator/front_insert_iterator.h b/libc/isystem/__iterator/front_insert_iterator.h
new file mode 100644
index 000000000..803fa7c19
--- /dev/null
+++ b/libc/isystem/__iterator/front_insert_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/front_insert_iterator.h"
diff --git a/libc/isystem/__iterator/incrementable_traits.h b/libc/isystem/__iterator/incrementable_traits.h
new file mode 100644
index 000000000..016a7429d
--- /dev/null
+++ b/libc/isystem/__iterator/incrementable_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/incrementable_traits.h"
diff --git a/libc/isystem/__iterator/indirectly_comparable.h b/libc/isystem/__iterator/indirectly_comparable.h
new file mode 100644
index 000000000..e8dd61611
--- /dev/null
+++ b/libc/isystem/__iterator/indirectly_comparable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/indirectly_comparable.h"
diff --git a/libc/isystem/__iterator/insert_iterator.h b/libc/isystem/__iterator/insert_iterator.h
new file mode 100644
index 000000000..d7d51d5a2
--- /dev/null
+++ b/libc/isystem/__iterator/insert_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/insert_iterator.h"
diff --git a/libc/isystem/__iterator/istream_iterator.h b/libc/isystem/__iterator/istream_iterator.h
new file mode 100644
index 000000000..c4dca4cf7
--- /dev/null
+++ b/libc/isystem/__iterator/istream_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/istream_iterator.h"
diff --git a/libc/isystem/__iterator/istreambuf_iterator.h b/libc/isystem/__iterator/istreambuf_iterator.h
new file mode 100644
index 000000000..e1c6f5c0c
--- /dev/null
+++ b/libc/isystem/__iterator/istreambuf_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/istreambuf_iterator.h"
diff --git a/libc/isystem/__iterator/iter_move.h b/libc/isystem/__iterator/iter_move.h
new file mode 100644
index 000000000..d6ec40c76
--- /dev/null
+++ b/libc/isystem/__iterator/iter_move.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/iter_move.h"
diff --git a/libc/isystem/__iterator/iter_swap.h b/libc/isystem/__iterator/iter_swap.h
new file mode 100644
index 000000000..d417d7b99
--- /dev/null
+++ b/libc/isystem/__iterator/iter_swap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/iter_swap.h"
diff --git a/libc/isystem/__iterator/iterator.h b/libc/isystem/__iterator/iterator.h
new file mode 100644
index 000000000..e25632aaa
--- /dev/null
+++ b/libc/isystem/__iterator/iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/iterator.h"
diff --git a/libc/isystem/__iterator/iterator_traits.h b/libc/isystem/__iterator/iterator_traits.h
new file mode 100644
index 000000000..74329d233
--- /dev/null
+++ b/libc/isystem/__iterator/iterator_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/iterator_traits.h"
diff --git a/libc/isystem/__iterator/iterator_with_data.h b/libc/isystem/__iterator/iterator_with_data.h
new file mode 100644
index 000000000..a13a042d6
--- /dev/null
+++ b/libc/isystem/__iterator/iterator_with_data.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/iterator_with_data.h"
diff --git a/libc/isystem/__iterator/mergeable.h b/libc/isystem/__iterator/mergeable.h
new file mode 100644
index 000000000..9c05a7ec7
--- /dev/null
+++ b/libc/isystem/__iterator/mergeable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/mergeable.h"
diff --git a/libc/isystem/__iterator/move_iterator.h b/libc/isystem/__iterator/move_iterator.h
new file mode 100644
index 000000000..93556c2c4
--- /dev/null
+++ b/libc/isystem/__iterator/move_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/move_iterator.h"
diff --git a/libc/isystem/__iterator/move_sentinel.h b/libc/isystem/__iterator/move_sentinel.h
new file mode 100644
index 000000000..d92a398a2
--- /dev/null
+++ b/libc/isystem/__iterator/move_sentinel.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/move_sentinel.h"
diff --git a/libc/isystem/__iterator/next.h b/libc/isystem/__iterator/next.h
new file mode 100644
index 000000000..b7083bb64
--- /dev/null
+++ b/libc/isystem/__iterator/next.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/next.h"
diff --git a/libc/isystem/__iterator/ostream_iterator.h b/libc/isystem/__iterator/ostream_iterator.h
new file mode 100644
index 000000000..11cf36d9a
--- /dev/null
+++ b/libc/isystem/__iterator/ostream_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/ostream_iterator.h"
diff --git a/libc/isystem/__iterator/ostreambuf_iterator.h b/libc/isystem/__iterator/ostreambuf_iterator.h
new file mode 100644
index 000000000..d8e15b6b8
--- /dev/null
+++ b/libc/isystem/__iterator/ostreambuf_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/ostreambuf_iterator.h"
diff --git a/libc/isystem/__iterator/permutable.h b/libc/isystem/__iterator/permutable.h
new file mode 100644
index 000000000..e09e8078b
--- /dev/null
+++ b/libc/isystem/__iterator/permutable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/permutable.h"
diff --git a/libc/isystem/__iterator/prev.h b/libc/isystem/__iterator/prev.h
new file mode 100644
index 000000000..5b540a33a
--- /dev/null
+++ b/libc/isystem/__iterator/prev.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/prev.h"
diff --git a/libc/isystem/__iterator/projected.h b/libc/isystem/__iterator/projected.h
new file mode 100644
index 000000000..8014a4a53
--- /dev/null
+++ b/libc/isystem/__iterator/projected.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/projected.h"
diff --git a/libc/isystem/__iterator/readable_traits.h b/libc/isystem/__iterator/readable_traits.h
new file mode 100644
index 000000000..01e249bd7
--- /dev/null
+++ b/libc/isystem/__iterator/readable_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/readable_traits.h"
diff --git a/libc/isystem/__iterator/reverse_access.h b/libc/isystem/__iterator/reverse_access.h
new file mode 100644
index 000000000..6b46073f1
--- /dev/null
+++ b/libc/isystem/__iterator/reverse_access.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/reverse_access.h"
diff --git a/libc/isystem/__iterator/reverse_iterator.h b/libc/isystem/__iterator/reverse_iterator.h
new file mode 100644
index 000000000..261ca8fe4
--- /dev/null
+++ b/libc/isystem/__iterator/reverse_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/reverse_iterator.h"
diff --git a/libc/isystem/__iterator/segmented_iterator.h b/libc/isystem/__iterator/segmented_iterator.h
new file mode 100644
index 000000000..083dc0e6c
--- /dev/null
+++ b/libc/isystem/__iterator/segmented_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/segmented_iterator.h"
diff --git a/libc/isystem/__iterator/size.h b/libc/isystem/__iterator/size.h
new file mode 100644
index 000000000..fd8f8ea22
--- /dev/null
+++ b/libc/isystem/__iterator/size.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/size.h"
diff --git a/libc/isystem/__iterator/sortable.h b/libc/isystem/__iterator/sortable.h
new file mode 100644
index 000000000..aa714d95a
--- /dev/null
+++ b/libc/isystem/__iterator/sortable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/sortable.h"
diff --git a/libc/isystem/__iterator/unreachable_sentinel.h b/libc/isystem/__iterator/unreachable_sentinel.h
new file mode 100644
index 000000000..100669d16
--- /dev/null
+++ b/libc/isystem/__iterator/unreachable_sentinel.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/unreachable_sentinel.h"
diff --git a/libc/isystem/__iterator/wrap_iter.h b/libc/isystem/__iterator/wrap_iter.h
new file mode 100644
index 000000000..f9f9e17a8
--- /dev/null
+++ b/libc/isystem/__iterator/wrap_iter.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__iterator/wrap_iter.h"
diff --git a/libc/isystem/__locale b/libc/isystem/__locale
new file mode 100644
index 000000000..430f7d277
--- /dev/null
+++ b/libc/isystem/__locale
@@ -0,0 +1 @@
+#include "third_party/libcxx/__locale"
diff --git a/libc/isystem/__locale_dir/locale_base_api/bsd_locale_defaults.h b/libc/isystem/__locale_dir/locale_base_api/bsd_locale_defaults.h
new file mode 100644
index 000000000..9616b1307
--- /dev/null
+++ b/libc/isystem/__locale_dir/locale_base_api/bsd_locale_defaults.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__locale_dir/locale_base_api/bsd_locale_defaults.h"
diff --git a/libc/isystem/__locale_dir/locale_base_api/bsd_locale_fallbacks.h b/libc/isystem/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
new file mode 100644
index 000000000..aefdb8f3c
--- /dev/null
+++ b/libc/isystem/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__locale_dir/locale_base_api/bsd_locale_fallbacks.h"
diff --git a/libc/isystem/__locale_dir/locale_base_api/locale_guard.h b/libc/isystem/__locale_dir/locale_base_api/locale_guard.h
new file mode 100644
index 000000000..9c1a10090
--- /dev/null
+++ b/libc/isystem/__locale_dir/locale_base_api/locale_guard.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__locale_dir/locale_base_api/locale_guard.h"
diff --git a/libc/isystem/__mbstate_t.h b/libc/isystem/__mbstate_t.h
new file mode 100644
index 000000000..ce032b9ae
--- /dev/null
+++ b/libc/isystem/__mbstate_t.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__mbstate_t.h"
diff --git a/libc/isystem/__mdspan/extents.h b/libc/isystem/__mdspan/extents.h
new file mode 100644
index 000000000..d4060eeb0
--- /dev/null
+++ b/libc/isystem/__mdspan/extents.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__mdspan/extents.h"
diff --git a/libc/isystem/__memory/addressof.h b/libc/isystem/__memory/addressof.h
new file mode 100644
index 000000000..74479acbc
--- /dev/null
+++ b/libc/isystem/__memory/addressof.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/addressof.h"
diff --git a/libc/isystem/__memory/align.h b/libc/isystem/__memory/align.h
new file mode 100644
index 000000000..dac036328
--- /dev/null
+++ b/libc/isystem/__memory/align.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/align.h"
diff --git a/libc/isystem/__memory/aligned_alloc.h b/libc/isystem/__memory/aligned_alloc.h
new file mode 100644
index 000000000..8f5c4a6ca
--- /dev/null
+++ b/libc/isystem/__memory/aligned_alloc.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/aligned_alloc.h"
diff --git a/libc/isystem/__memory/allocate_at_least.h b/libc/isystem/__memory/allocate_at_least.h
new file mode 100644
index 000000000..b8806999b
--- /dev/null
+++ b/libc/isystem/__memory/allocate_at_least.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/allocate_at_least.h"
diff --git a/libc/isystem/__memory/allocation_guard.h b/libc/isystem/__memory/allocation_guard.h
new file mode 100644
index 000000000..a7f3f3020
--- /dev/null
+++ b/libc/isystem/__memory/allocation_guard.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/allocation_guard.h"
diff --git a/libc/isystem/__memory/allocator.h b/libc/isystem/__memory/allocator.h
new file mode 100644
index 000000000..4460c2cfe
--- /dev/null
+++ b/libc/isystem/__memory/allocator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/allocator.h"
diff --git a/libc/isystem/__memory/allocator_arg_t.h b/libc/isystem/__memory/allocator_arg_t.h
new file mode 100644
index 000000000..8606655c2
--- /dev/null
+++ b/libc/isystem/__memory/allocator_arg_t.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/allocator_arg_t.h"
diff --git a/libc/isystem/__memory/allocator_destructor.h b/libc/isystem/__memory/allocator_destructor.h
new file mode 100644
index 000000000..9922a6ad2
--- /dev/null
+++ b/libc/isystem/__memory/allocator_destructor.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/allocator_destructor.h"
diff --git a/libc/isystem/__memory/allocator_traits.h b/libc/isystem/__memory/allocator_traits.h
new file mode 100644
index 000000000..7c0ef7fe0
--- /dev/null
+++ b/libc/isystem/__memory/allocator_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/allocator_traits.h"
diff --git a/libc/isystem/__memory/assume_aligned.h b/libc/isystem/__memory/assume_aligned.h
new file mode 100644
index 000000000..816f2a513
--- /dev/null
+++ b/libc/isystem/__memory/assume_aligned.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/assume_aligned.h"
diff --git a/libc/isystem/__memory/auto_ptr.h b/libc/isystem/__memory/auto_ptr.h
new file mode 100644
index 000000000..31694389a
--- /dev/null
+++ b/libc/isystem/__memory/auto_ptr.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/auto_ptr.h"
diff --git a/libc/isystem/__memory/builtin_new_allocator.h b/libc/isystem/__memory/builtin_new_allocator.h
new file mode 100644
index 000000000..f23c8a06e
--- /dev/null
+++ b/libc/isystem/__memory/builtin_new_allocator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/builtin_new_allocator.h"
diff --git a/libc/isystem/__memory/compressed_pair.h b/libc/isystem/__memory/compressed_pair.h
new file mode 100644
index 000000000..2ce22d3af
--- /dev/null
+++ b/libc/isystem/__memory/compressed_pair.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/compressed_pair.h"
diff --git a/libc/isystem/__memory/concepts.h b/libc/isystem/__memory/concepts.h
new file mode 100644
index 000000000..3e8e04d19
--- /dev/null
+++ b/libc/isystem/__memory/concepts.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/concepts.h"
diff --git a/libc/isystem/__memory/construct_at.h b/libc/isystem/__memory/construct_at.h
new file mode 100644
index 000000000..612ad40b0
--- /dev/null
+++ b/libc/isystem/__memory/construct_at.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/construct_at.h"
diff --git a/libc/isystem/__memory/destruct_n.h b/libc/isystem/__memory/destruct_n.h
new file mode 100644
index 000000000..ce33b80cf
--- /dev/null
+++ b/libc/isystem/__memory/destruct_n.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/destruct_n.h"
diff --git a/libc/isystem/__memory/pointer_traits.h b/libc/isystem/__memory/pointer_traits.h
new file mode 100644
index 000000000..9f3e55118
--- /dev/null
+++ b/libc/isystem/__memory/pointer_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/pointer_traits.h"
diff --git a/libc/isystem/__memory/ranges_construct_at.h b/libc/isystem/__memory/ranges_construct_at.h
new file mode 100644
index 000000000..5facc37fe
--- /dev/null
+++ b/libc/isystem/__memory/ranges_construct_at.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/ranges_construct_at.h"
diff --git a/libc/isystem/__memory/ranges_uninitialized_algorithms.h b/libc/isystem/__memory/ranges_uninitialized_algorithms.h
new file mode 100644
index 000000000..0cebbbb26
--- /dev/null
+++ b/libc/isystem/__memory/ranges_uninitialized_algorithms.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/ranges_uninitialized_algorithms.h"
diff --git a/libc/isystem/__memory/raw_storage_iterator.h b/libc/isystem/__memory/raw_storage_iterator.h
new file mode 100644
index 000000000..c08589f0a
--- /dev/null
+++ b/libc/isystem/__memory/raw_storage_iterator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/raw_storage_iterator.h"
diff --git a/libc/isystem/__memory/shared_ptr.h b/libc/isystem/__memory/shared_ptr.h
new file mode 100644
index 000000000..0ef2be9a7
--- /dev/null
+++ b/libc/isystem/__memory/shared_ptr.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/shared_ptr.h"
diff --git a/libc/isystem/__memory/swap_allocator.h b/libc/isystem/__memory/swap_allocator.h
new file mode 100644
index 000000000..926309208
--- /dev/null
+++ b/libc/isystem/__memory/swap_allocator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/swap_allocator.h"
diff --git a/libc/isystem/__memory/temp_value.h b/libc/isystem/__memory/temp_value.h
new file mode 100644
index 000000000..3443ff2ce
--- /dev/null
+++ b/libc/isystem/__memory/temp_value.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/temp_value.h"
diff --git a/libc/isystem/__memory/temporary_buffer.h b/libc/isystem/__memory/temporary_buffer.h
new file mode 100644
index 000000000..8e3d01430
--- /dev/null
+++ b/libc/isystem/__memory/temporary_buffer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/temporary_buffer.h"
diff --git a/libc/isystem/__memory/uninitialized_algorithms.h b/libc/isystem/__memory/uninitialized_algorithms.h
new file mode 100644
index 000000000..ace3ba3b4
--- /dev/null
+++ b/libc/isystem/__memory/uninitialized_algorithms.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/uninitialized_algorithms.h"
diff --git a/libc/isystem/__memory/unique_ptr.h b/libc/isystem/__memory/unique_ptr.h
new file mode 100644
index 000000000..a2c62bf6e
--- /dev/null
+++ b/libc/isystem/__memory/unique_ptr.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/unique_ptr.h"
diff --git a/libc/isystem/__memory/uses_allocator.h b/libc/isystem/__memory/uses_allocator.h
new file mode 100644
index 000000000..fe24b8572
--- /dev/null
+++ b/libc/isystem/__memory/uses_allocator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/uses_allocator.h"
diff --git a/libc/isystem/__memory/uses_allocator_construction.h b/libc/isystem/__memory/uses_allocator_construction.h
new file mode 100644
index 000000000..52e3a3be8
--- /dev/null
+++ b/libc/isystem/__memory/uses_allocator_construction.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/uses_allocator_construction.h"
diff --git a/libc/isystem/__memory/voidify.h b/libc/isystem/__memory/voidify.h
new file mode 100644
index 000000000..5c7ebde6a
--- /dev/null
+++ b/libc/isystem/__memory/voidify.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory/voidify.h"
diff --git a/libc/isystem/__memory_resource/memory_resource.h b/libc/isystem/__memory_resource/memory_resource.h
new file mode 100644
index 000000000..b99854ba2
--- /dev/null
+++ b/libc/isystem/__memory_resource/memory_resource.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory_resource/memory_resource.h"
diff --git a/libc/isystem/__memory_resource/monotonic_buffer_resource.h b/libc/isystem/__memory_resource/monotonic_buffer_resource.h
new file mode 100644
index 000000000..86a7afce1
--- /dev/null
+++ b/libc/isystem/__memory_resource/monotonic_buffer_resource.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory_resource/monotonic_buffer_resource.h"
diff --git a/libc/isystem/__memory_resource/polymorphic_allocator.h b/libc/isystem/__memory_resource/polymorphic_allocator.h
new file mode 100644
index 000000000..75cb9ffc0
--- /dev/null
+++ b/libc/isystem/__memory_resource/polymorphic_allocator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory_resource/polymorphic_allocator.h"
diff --git a/libc/isystem/__memory_resource/pool_options.h b/libc/isystem/__memory_resource/pool_options.h
new file mode 100644
index 000000000..932881687
--- /dev/null
+++ b/libc/isystem/__memory_resource/pool_options.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory_resource/pool_options.h"
diff --git a/libc/isystem/__memory_resource/synchronized_pool_resource.h b/libc/isystem/__memory_resource/synchronized_pool_resource.h
new file mode 100644
index 000000000..d65c0ee3d
--- /dev/null
+++ b/libc/isystem/__memory_resource/synchronized_pool_resource.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory_resource/synchronized_pool_resource.h"
diff --git a/libc/isystem/__memory_resource/unsynchronized_pool_resource.h b/libc/isystem/__memory_resource/unsynchronized_pool_resource.h
new file mode 100644
index 000000000..70a08c647
--- /dev/null
+++ b/libc/isystem/__memory_resource/unsynchronized_pool_resource.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__memory_resource/unsynchronized_pool_resource.h"
diff --git a/libc/isystem/__mutex/lock_guard.h b/libc/isystem/__mutex/lock_guard.h
new file mode 100644
index 000000000..2f16fdf99
--- /dev/null
+++ b/libc/isystem/__mutex/lock_guard.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__mutex/lock_guard.h"
diff --git a/libc/isystem/__mutex/mutex.h b/libc/isystem/__mutex/mutex.h
new file mode 100644
index 000000000..2cfbd07d0
--- /dev/null
+++ b/libc/isystem/__mutex/mutex.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__mutex/mutex.h"
diff --git a/libc/isystem/__mutex/tag_types.h b/libc/isystem/__mutex/tag_types.h
new file mode 100644
index 000000000..ab92992ab
--- /dev/null
+++ b/libc/isystem/__mutex/tag_types.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__mutex/tag_types.h"
diff --git a/libc/isystem/__mutex/unique_lock.h b/libc/isystem/__mutex/unique_lock.h
new file mode 100644
index 000000000..beddbbc34
--- /dev/null
+++ b/libc/isystem/__mutex/unique_lock.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__mutex/unique_lock.h"
diff --git a/libc/isystem/__node_handle b/libc/isystem/__node_handle
new file mode 100644
index 000000000..46fb2a977
--- /dev/null
+++ b/libc/isystem/__node_handle
@@ -0,0 +1 @@
+#include "third_party/libcxx/__node_handle"
diff --git a/libc/isystem/__numeric/accumulate.h b/libc/isystem/__numeric/accumulate.h
new file mode 100644
index 000000000..c6b2b5d7b
--- /dev/null
+++ b/libc/isystem/__numeric/accumulate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/accumulate.h"
diff --git a/libc/isystem/__numeric/adjacent_difference.h b/libc/isystem/__numeric/adjacent_difference.h
new file mode 100644
index 000000000..15f6adefa
--- /dev/null
+++ b/libc/isystem/__numeric/adjacent_difference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/adjacent_difference.h"
diff --git a/libc/isystem/__numeric/exclusive_scan.h b/libc/isystem/__numeric/exclusive_scan.h
new file mode 100644
index 000000000..3c9ac4039
--- /dev/null
+++ b/libc/isystem/__numeric/exclusive_scan.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/exclusive_scan.h"
diff --git a/libc/isystem/__numeric/gcd_lcm.h b/libc/isystem/__numeric/gcd_lcm.h
new file mode 100644
index 000000000..da21251ac
--- /dev/null
+++ b/libc/isystem/__numeric/gcd_lcm.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/gcd_lcm.h"
diff --git a/libc/isystem/__numeric/inclusive_scan.h b/libc/isystem/__numeric/inclusive_scan.h
new file mode 100644
index 000000000..760af62db
--- /dev/null
+++ b/libc/isystem/__numeric/inclusive_scan.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/inclusive_scan.h"
diff --git a/libc/isystem/__numeric/inner_product.h b/libc/isystem/__numeric/inner_product.h
new file mode 100644
index 000000000..50ff22838
--- /dev/null
+++ b/libc/isystem/__numeric/inner_product.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/inner_product.h"
diff --git a/libc/isystem/__numeric/iota.h b/libc/isystem/__numeric/iota.h
new file mode 100644
index 000000000..7e790b8c3
--- /dev/null
+++ b/libc/isystem/__numeric/iota.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/iota.h"
diff --git a/libc/isystem/__numeric/midpoint.h b/libc/isystem/__numeric/midpoint.h
new file mode 100644
index 000000000..82b5eba42
--- /dev/null
+++ b/libc/isystem/__numeric/midpoint.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/midpoint.h"
diff --git a/libc/isystem/__numeric/partial_sum.h b/libc/isystem/__numeric/partial_sum.h
new file mode 100644
index 000000000..cab9525da
--- /dev/null
+++ b/libc/isystem/__numeric/partial_sum.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/partial_sum.h"
diff --git a/libc/isystem/__numeric/reduce.h b/libc/isystem/__numeric/reduce.h
new file mode 100644
index 000000000..15f0d16d8
--- /dev/null
+++ b/libc/isystem/__numeric/reduce.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/reduce.h"
diff --git a/libc/isystem/__numeric/transform_exclusive_scan.h b/libc/isystem/__numeric/transform_exclusive_scan.h
new file mode 100644
index 000000000..e21a234b5
--- /dev/null
+++ b/libc/isystem/__numeric/transform_exclusive_scan.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/transform_exclusive_scan.h"
diff --git a/libc/isystem/__numeric/transform_inclusive_scan.h b/libc/isystem/__numeric/transform_inclusive_scan.h
new file mode 100644
index 000000000..df792c263
--- /dev/null
+++ b/libc/isystem/__numeric/transform_inclusive_scan.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/transform_inclusive_scan.h"
diff --git a/libc/isystem/__numeric/transform_reduce.h b/libc/isystem/__numeric/transform_reduce.h
new file mode 100644
index 000000000..60857126d
--- /dev/null
+++ b/libc/isystem/__numeric/transform_reduce.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__numeric/transform_reduce.h"
diff --git a/libc/isystem/__pstl/internal/algorithm_fwd.h b/libc/isystem/__pstl/internal/algorithm_fwd.h
new file mode 100644
index 000000000..0ec6c87f2
--- /dev/null
+++ b/libc/isystem/__pstl/internal/algorithm_fwd.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/algorithm_fwd.h"
diff --git a/libc/isystem/__pstl/internal/algorithm_impl.h b/libc/isystem/__pstl/internal/algorithm_impl.h
new file mode 100644
index 000000000..8b95f0d63
--- /dev/null
+++ b/libc/isystem/__pstl/internal/algorithm_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/algorithm_impl.h"
diff --git a/libc/isystem/__pstl/internal/execution_defs.h b/libc/isystem/__pstl/internal/execution_defs.h
new file mode 100644
index 000000000..25e27ab00
--- /dev/null
+++ b/libc/isystem/__pstl/internal/execution_defs.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/execution_defs.h"
diff --git a/libc/isystem/__pstl/internal/execution_impl.h b/libc/isystem/__pstl/internal/execution_impl.h
new file mode 100644
index 000000000..3517735c2
--- /dev/null
+++ b/libc/isystem/__pstl/internal/execution_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/execution_impl.h"
diff --git a/libc/isystem/__pstl/internal/glue_algorithm_defs.h b/libc/isystem/__pstl/internal/glue_algorithm_defs.h
new file mode 100644
index 000000000..404b11ea9
--- /dev/null
+++ b/libc/isystem/__pstl/internal/glue_algorithm_defs.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/glue_algorithm_defs.h"
diff --git a/libc/isystem/__pstl/internal/glue_algorithm_impl.h b/libc/isystem/__pstl/internal/glue_algorithm_impl.h
new file mode 100644
index 000000000..6ba6ab16d
--- /dev/null
+++ b/libc/isystem/__pstl/internal/glue_algorithm_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/glue_algorithm_impl.h"
diff --git a/libc/isystem/__pstl/internal/glue_memory_defs.h b/libc/isystem/__pstl/internal/glue_memory_defs.h
new file mode 100644
index 000000000..3dd439cf8
--- /dev/null
+++ b/libc/isystem/__pstl/internal/glue_memory_defs.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/glue_memory_defs.h"
diff --git a/libc/isystem/__pstl/internal/glue_memory_impl.h b/libc/isystem/__pstl/internal/glue_memory_impl.h
new file mode 100644
index 000000000..6542bd695
--- /dev/null
+++ b/libc/isystem/__pstl/internal/glue_memory_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/glue_memory_impl.h"
diff --git a/libc/isystem/__pstl/internal/glue_numeric_defs.h b/libc/isystem/__pstl/internal/glue_numeric_defs.h
new file mode 100644
index 000000000..f7b8b77e6
--- /dev/null
+++ b/libc/isystem/__pstl/internal/glue_numeric_defs.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/glue_numeric_defs.h"
diff --git a/libc/isystem/__pstl/internal/glue_numeric_impl.h b/libc/isystem/__pstl/internal/glue_numeric_impl.h
new file mode 100644
index 000000000..f04d6c080
--- /dev/null
+++ b/libc/isystem/__pstl/internal/glue_numeric_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/glue_numeric_impl.h"
diff --git a/libc/isystem/__pstl/internal/memory_impl.h b/libc/isystem/__pstl/internal/memory_impl.h
new file mode 100644
index 000000000..d98f079b2
--- /dev/null
+++ b/libc/isystem/__pstl/internal/memory_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/memory_impl.h"
diff --git a/libc/isystem/__pstl/internal/numeric_fwd.h b/libc/isystem/__pstl/internal/numeric_fwd.h
new file mode 100644
index 000000000..2a973e3a6
--- /dev/null
+++ b/libc/isystem/__pstl/internal/numeric_fwd.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/numeric_fwd.h"
diff --git a/libc/isystem/__pstl/internal/numeric_impl.h b/libc/isystem/__pstl/internal/numeric_impl.h
new file mode 100644
index 000000000..a88a76d2e
--- /dev/null
+++ b/libc/isystem/__pstl/internal/numeric_impl.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/numeric_impl.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_for.h b/libc/isystem/__pstl/internal/omp/parallel_for.h
new file mode 100644
index 000000000..bbce2cda6
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_for.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_for.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_for_each.h b/libc/isystem/__pstl/internal/omp/parallel_for_each.h
new file mode 100644
index 000000000..4d09b34a7
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_for_each.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_for_each.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_invoke.h b/libc/isystem/__pstl/internal/omp/parallel_invoke.h
new file mode 100644
index 000000000..11448e477
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_invoke.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_invoke.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_merge.h b/libc/isystem/__pstl/internal/omp/parallel_merge.h
new file mode 100644
index 000000000..1d33dc8b7
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_merge.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_merge.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_reduce.h b/libc/isystem/__pstl/internal/omp/parallel_reduce.h
new file mode 100644
index 000000000..06a5ca107
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_reduce.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_reduce.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_scan.h b/libc/isystem/__pstl/internal/omp/parallel_scan.h
new file mode 100644
index 000000000..e7a1ee8d0
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_scan.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_scan.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_stable_partial_sort.h b/libc/isystem/__pstl/internal/omp/parallel_stable_partial_sort.h
new file mode 100644
index 000000000..57386b22e
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_stable_partial_sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_stable_partial_sort.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_stable_sort.h b/libc/isystem/__pstl/internal/omp/parallel_stable_sort.h
new file mode 100644
index 000000000..c8d84edbe
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_stable_sort.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_stable_sort.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_transform_reduce.h b/libc/isystem/__pstl/internal/omp/parallel_transform_reduce.h
new file mode 100644
index 000000000..089e909b5
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_transform_reduce.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_transform_reduce.h"
diff --git a/libc/isystem/__pstl/internal/omp/parallel_transform_scan.h b/libc/isystem/__pstl/internal/omp/parallel_transform_scan.h
new file mode 100644
index 000000000..627c15f1a
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/parallel_transform_scan.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/parallel_transform_scan.h"
diff --git a/libc/isystem/__pstl/internal/omp/util.h b/libc/isystem/__pstl/internal/omp/util.h
new file mode 100644
index 000000000..bd8355cf4
--- /dev/null
+++ b/libc/isystem/__pstl/internal/omp/util.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/omp/util.h"
diff --git a/libc/isystem/__pstl/internal/parallel_backend.h b/libc/isystem/__pstl/internal/parallel_backend.h
new file mode 100644
index 000000000..3231344c5
--- /dev/null
+++ b/libc/isystem/__pstl/internal/parallel_backend.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/parallel_backend.h"
diff --git a/libc/isystem/__pstl/internal/parallel_backend_omp.h b/libc/isystem/__pstl/internal/parallel_backend_omp.h
new file mode 100644
index 000000000..b5abe31ae
--- /dev/null
+++ b/libc/isystem/__pstl/internal/parallel_backend_omp.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/parallel_backend_omp.h"
diff --git a/libc/isystem/__pstl/internal/parallel_backend_serial.h b/libc/isystem/__pstl/internal/parallel_backend_serial.h
new file mode 100644
index 000000000..044d2de57
--- /dev/null
+++ b/libc/isystem/__pstl/internal/parallel_backend_serial.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/parallel_backend_serial.h"
diff --git a/libc/isystem/__pstl/internal/parallel_backend_tbb.h b/libc/isystem/__pstl/internal/parallel_backend_tbb.h
new file mode 100644
index 000000000..c9310efc5
--- /dev/null
+++ b/libc/isystem/__pstl/internal/parallel_backend_tbb.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/parallel_backend_tbb.h"
diff --git a/libc/isystem/__pstl/internal/parallel_backend_utils.h b/libc/isystem/__pstl/internal/parallel_backend_utils.h
new file mode 100644
index 000000000..89f833757
--- /dev/null
+++ b/libc/isystem/__pstl/internal/parallel_backend_utils.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/parallel_backend_utils.h"
diff --git a/libc/isystem/__pstl/internal/unseq_backend_simd.h b/libc/isystem/__pstl/internal/unseq_backend_simd.h
new file mode 100644
index 000000000..ae6d5645c
--- /dev/null
+++ b/libc/isystem/__pstl/internal/unseq_backend_simd.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/unseq_backend_simd.h"
diff --git a/libc/isystem/__pstl/internal/utils.h b/libc/isystem/__pstl/internal/utils.h
new file mode 100644
index 000000000..36bf3054d
--- /dev/null
+++ b/libc/isystem/__pstl/internal/utils.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl/internal/utils.h"
diff --git a/libc/isystem/__pstl_algorithm b/libc/isystem/__pstl_algorithm
new file mode 100644
index 000000000..8a0d459be
--- /dev/null
+++ b/libc/isystem/__pstl_algorithm
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl_algorithm"
diff --git a/libc/isystem/__pstl_config_site b/libc/isystem/__pstl_config_site
new file mode 100644
index 000000000..492945c9d
--- /dev/null
+++ b/libc/isystem/__pstl_config_site
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl_config_site"
diff --git a/libc/isystem/__pstl_memory b/libc/isystem/__pstl_memory
new file mode 100644
index 000000000..8412f9cf5
--- /dev/null
+++ b/libc/isystem/__pstl_memory
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl_memory"
diff --git a/libc/isystem/__pstl_numeric b/libc/isystem/__pstl_numeric
new file mode 100644
index 000000000..bf03f0a0a
--- /dev/null
+++ b/libc/isystem/__pstl_numeric
@@ -0,0 +1 @@
+#include "third_party/libcxx/__pstl_numeric"
diff --git a/libc/isystem/__random/bernoulli_distribution.h b/libc/isystem/__random/bernoulli_distribution.h
new file mode 100644
index 000000000..e240f5277
--- /dev/null
+++ b/libc/isystem/__random/bernoulli_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/bernoulli_distribution.h"
diff --git a/libc/isystem/__random/binomial_distribution.h b/libc/isystem/__random/binomial_distribution.h
new file mode 100644
index 000000000..ddacda8dc
--- /dev/null
+++ b/libc/isystem/__random/binomial_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/binomial_distribution.h"
diff --git a/libc/isystem/__random/cauchy_distribution.h b/libc/isystem/__random/cauchy_distribution.h
new file mode 100644
index 000000000..178ff7480
--- /dev/null
+++ b/libc/isystem/__random/cauchy_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/cauchy_distribution.h"
diff --git a/libc/isystem/__random/chi_squared_distribution.h b/libc/isystem/__random/chi_squared_distribution.h
new file mode 100644
index 000000000..f87e342bf
--- /dev/null
+++ b/libc/isystem/__random/chi_squared_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/chi_squared_distribution.h"
diff --git a/libc/isystem/__random/clamp_to_integral.h b/libc/isystem/__random/clamp_to_integral.h
new file mode 100644
index 000000000..cfe92b74b
--- /dev/null
+++ b/libc/isystem/__random/clamp_to_integral.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/clamp_to_integral.h"
diff --git a/libc/isystem/__random/default_random_engine.h b/libc/isystem/__random/default_random_engine.h
new file mode 100644
index 000000000..2134e8dbf
--- /dev/null
+++ b/libc/isystem/__random/default_random_engine.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/default_random_engine.h"
diff --git a/libc/isystem/__random/discard_block_engine.h b/libc/isystem/__random/discard_block_engine.h
new file mode 100644
index 000000000..9cf79e60b
--- /dev/null
+++ b/libc/isystem/__random/discard_block_engine.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/discard_block_engine.h"
diff --git a/libc/isystem/__random/discrete_distribution.h b/libc/isystem/__random/discrete_distribution.h
new file mode 100644
index 000000000..5478b7da2
--- /dev/null
+++ b/libc/isystem/__random/discrete_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/discrete_distribution.h"
diff --git a/libc/isystem/__random/exponential_distribution.h b/libc/isystem/__random/exponential_distribution.h
new file mode 100644
index 000000000..547b9198f
--- /dev/null
+++ b/libc/isystem/__random/exponential_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/exponential_distribution.h"
diff --git a/libc/isystem/__random/extreme_value_distribution.h b/libc/isystem/__random/extreme_value_distribution.h
new file mode 100644
index 000000000..df7e193de
--- /dev/null
+++ b/libc/isystem/__random/extreme_value_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/extreme_value_distribution.h"
diff --git a/libc/isystem/__random/fisher_f_distribution.h b/libc/isystem/__random/fisher_f_distribution.h
new file mode 100644
index 000000000..b1ac457f8
--- /dev/null
+++ b/libc/isystem/__random/fisher_f_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/fisher_f_distribution.h"
diff --git a/libc/isystem/__random/gamma_distribution.h b/libc/isystem/__random/gamma_distribution.h
new file mode 100644
index 000000000..0cdde5138
--- /dev/null
+++ b/libc/isystem/__random/gamma_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/gamma_distribution.h"
diff --git a/libc/isystem/__random/generate_canonical.h b/libc/isystem/__random/generate_canonical.h
new file mode 100644
index 000000000..e4737fb26
--- /dev/null
+++ b/libc/isystem/__random/generate_canonical.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/generate_canonical.h"
diff --git a/libc/isystem/__random/geometric_distribution.h b/libc/isystem/__random/geometric_distribution.h
new file mode 100644
index 000000000..86cce45e4
--- /dev/null
+++ b/libc/isystem/__random/geometric_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/geometric_distribution.h"
diff --git a/libc/isystem/__random/independent_bits_engine.h b/libc/isystem/__random/independent_bits_engine.h
new file mode 100644
index 000000000..76eb66d97
--- /dev/null
+++ b/libc/isystem/__random/independent_bits_engine.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/independent_bits_engine.h"
diff --git a/libc/isystem/__random/is_seed_sequence.h b/libc/isystem/__random/is_seed_sequence.h
new file mode 100644
index 000000000..037f36a94
--- /dev/null
+++ b/libc/isystem/__random/is_seed_sequence.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/is_seed_sequence.h"
diff --git a/libc/isystem/__random/is_valid.h b/libc/isystem/__random/is_valid.h
new file mode 100644
index 000000000..c1f871a6f
--- /dev/null
+++ b/libc/isystem/__random/is_valid.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/is_valid.h"
diff --git a/libc/isystem/__random/knuth_b.h b/libc/isystem/__random/knuth_b.h
new file mode 100644
index 000000000..425206b18
--- /dev/null
+++ b/libc/isystem/__random/knuth_b.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/knuth_b.h"
diff --git a/libc/isystem/__random/linear_congruential_engine.h b/libc/isystem/__random/linear_congruential_engine.h
new file mode 100644
index 000000000..37826ff22
--- /dev/null
+++ b/libc/isystem/__random/linear_congruential_engine.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/linear_congruential_engine.h"
diff --git a/libc/isystem/__random/log2.h b/libc/isystem/__random/log2.h
new file mode 100644
index 000000000..a6f96d734
--- /dev/null
+++ b/libc/isystem/__random/log2.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/log2.h"
diff --git a/libc/isystem/__random/lognormal_distribution.h b/libc/isystem/__random/lognormal_distribution.h
new file mode 100644
index 000000000..1bc2e2f11
--- /dev/null
+++ b/libc/isystem/__random/lognormal_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/lognormal_distribution.h"
diff --git a/libc/isystem/__random/mersenne_twister_engine.h b/libc/isystem/__random/mersenne_twister_engine.h
new file mode 100644
index 000000000..c1d04a247
--- /dev/null
+++ b/libc/isystem/__random/mersenne_twister_engine.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/mersenne_twister_engine.h"
diff --git a/libc/isystem/__random/negative_binomial_distribution.h b/libc/isystem/__random/negative_binomial_distribution.h
new file mode 100644
index 000000000..d0ee1d480
--- /dev/null
+++ b/libc/isystem/__random/negative_binomial_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/negative_binomial_distribution.h"
diff --git a/libc/isystem/__random/normal_distribution.h b/libc/isystem/__random/normal_distribution.h
new file mode 100644
index 000000000..6514d7e80
--- /dev/null
+++ b/libc/isystem/__random/normal_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/normal_distribution.h"
diff --git a/libc/isystem/__random/piecewise_constant_distribution.h b/libc/isystem/__random/piecewise_constant_distribution.h
new file mode 100644
index 000000000..100ba45a0
--- /dev/null
+++ b/libc/isystem/__random/piecewise_constant_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/piecewise_constant_distribution.h"
diff --git a/libc/isystem/__random/piecewise_linear_distribution.h b/libc/isystem/__random/piecewise_linear_distribution.h
new file mode 100644
index 000000000..666bee165
--- /dev/null
+++ b/libc/isystem/__random/piecewise_linear_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/piecewise_linear_distribution.h"
diff --git a/libc/isystem/__random/poisson_distribution.h b/libc/isystem/__random/poisson_distribution.h
new file mode 100644
index 000000000..0b8562668
--- /dev/null
+++ b/libc/isystem/__random/poisson_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/poisson_distribution.h"
diff --git a/libc/isystem/__random/random_device.h b/libc/isystem/__random/random_device.h
new file mode 100644
index 000000000..7c636db53
--- /dev/null
+++ b/libc/isystem/__random/random_device.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/random_device.h"
diff --git a/libc/isystem/__random/ranlux.h b/libc/isystem/__random/ranlux.h
new file mode 100644
index 000000000..09c20cfcb
--- /dev/null
+++ b/libc/isystem/__random/ranlux.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/ranlux.h"
diff --git a/libc/isystem/__random/seed_seq.h b/libc/isystem/__random/seed_seq.h
new file mode 100644
index 000000000..0aa3307f6
--- /dev/null
+++ b/libc/isystem/__random/seed_seq.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/seed_seq.h"
diff --git a/libc/isystem/__random/shuffle_order_engine.h b/libc/isystem/__random/shuffle_order_engine.h
new file mode 100644
index 000000000..61961bba3
--- /dev/null
+++ b/libc/isystem/__random/shuffle_order_engine.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/shuffle_order_engine.h"
diff --git a/libc/isystem/__random/student_t_distribution.h b/libc/isystem/__random/student_t_distribution.h
new file mode 100644
index 000000000..faa747d85
--- /dev/null
+++ b/libc/isystem/__random/student_t_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/student_t_distribution.h"
diff --git a/libc/isystem/__random/subtract_with_carry_engine.h b/libc/isystem/__random/subtract_with_carry_engine.h
new file mode 100644
index 000000000..df0e52cc8
--- /dev/null
+++ b/libc/isystem/__random/subtract_with_carry_engine.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/subtract_with_carry_engine.h"
diff --git a/libc/isystem/__random/uniform_int_distribution.h b/libc/isystem/__random/uniform_int_distribution.h
new file mode 100644
index 000000000..3d14ec164
--- /dev/null
+++ b/libc/isystem/__random/uniform_int_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/uniform_int_distribution.h"
diff --git a/libc/isystem/__random/uniform_random_bit_generator.h b/libc/isystem/__random/uniform_random_bit_generator.h
new file mode 100644
index 000000000..af2fd5f5c
--- /dev/null
+++ b/libc/isystem/__random/uniform_random_bit_generator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/uniform_random_bit_generator.h"
diff --git a/libc/isystem/__random/uniform_real_distribution.h b/libc/isystem/__random/uniform_real_distribution.h
new file mode 100644
index 000000000..d0ee5da8b
--- /dev/null
+++ b/libc/isystem/__random/uniform_real_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/uniform_real_distribution.h"
diff --git a/libc/isystem/__random/weibull_distribution.h b/libc/isystem/__random/weibull_distribution.h
new file mode 100644
index 000000000..81334a3b7
--- /dev/null
+++ b/libc/isystem/__random/weibull_distribution.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__random/weibull_distribution.h"
diff --git a/libc/isystem/__ranges/access.h b/libc/isystem/__ranges/access.h
new file mode 100644
index 000000000..6f811bdb6
--- /dev/null
+++ b/libc/isystem/__ranges/access.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/access.h"
diff --git a/libc/isystem/__ranges/all.h b/libc/isystem/__ranges/all.h
new file mode 100644
index 000000000..bbd3f32b0
--- /dev/null
+++ b/libc/isystem/__ranges/all.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/all.h"
diff --git a/libc/isystem/__ranges/as_rvalue_view.h b/libc/isystem/__ranges/as_rvalue_view.h
new file mode 100644
index 000000000..efda2bc13
--- /dev/null
+++ b/libc/isystem/__ranges/as_rvalue_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/as_rvalue_view.h"
diff --git a/libc/isystem/__ranges/common_view.h b/libc/isystem/__ranges/common_view.h
new file mode 100644
index 000000000..ecec8365e
--- /dev/null
+++ b/libc/isystem/__ranges/common_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/common_view.h"
diff --git a/libc/isystem/__ranges/concepts.h b/libc/isystem/__ranges/concepts.h
new file mode 100644
index 000000000..5e3917d5b
--- /dev/null
+++ b/libc/isystem/__ranges/concepts.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/concepts.h"
diff --git a/libc/isystem/__ranges/container_compatible_range.h b/libc/isystem/__ranges/container_compatible_range.h
new file mode 100644
index 000000000..0139d6769
--- /dev/null
+++ b/libc/isystem/__ranges/container_compatible_range.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/container_compatible_range.h"
diff --git a/libc/isystem/__ranges/copyable_box.h b/libc/isystem/__ranges/copyable_box.h
new file mode 100644
index 000000000..85dc87732
--- /dev/null
+++ b/libc/isystem/__ranges/copyable_box.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/copyable_box.h"
diff --git a/libc/isystem/__ranges/counted.h b/libc/isystem/__ranges/counted.h
new file mode 100644
index 000000000..b455c84df
--- /dev/null
+++ b/libc/isystem/__ranges/counted.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/counted.h"
diff --git a/libc/isystem/__ranges/dangling.h b/libc/isystem/__ranges/dangling.h
new file mode 100644
index 000000000..0d4a25638
--- /dev/null
+++ b/libc/isystem/__ranges/dangling.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/dangling.h"
diff --git a/libc/isystem/__ranges/data.h b/libc/isystem/__ranges/data.h
new file mode 100644
index 000000000..067c84437
--- /dev/null
+++ b/libc/isystem/__ranges/data.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/data.h"
diff --git a/libc/isystem/__ranges/drop_view.h b/libc/isystem/__ranges/drop_view.h
new file mode 100644
index 000000000..22ba897fb
--- /dev/null
+++ b/libc/isystem/__ranges/drop_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/drop_view.h"
diff --git a/libc/isystem/__ranges/drop_while_view.h b/libc/isystem/__ranges/drop_while_view.h
new file mode 100644
index 000000000..900e498e5
--- /dev/null
+++ b/libc/isystem/__ranges/drop_while_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/drop_while_view.h"
diff --git a/libc/isystem/__ranges/elements_view.h b/libc/isystem/__ranges/elements_view.h
new file mode 100644
index 000000000..1cba59aa8
--- /dev/null
+++ b/libc/isystem/__ranges/elements_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/elements_view.h"
diff --git a/libc/isystem/__ranges/empty.h b/libc/isystem/__ranges/empty.h
new file mode 100644
index 000000000..4e3547c5b
--- /dev/null
+++ b/libc/isystem/__ranges/empty.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/empty.h"
diff --git a/libc/isystem/__ranges/empty_view.h b/libc/isystem/__ranges/empty_view.h
new file mode 100644
index 000000000..77c23486e
--- /dev/null
+++ b/libc/isystem/__ranges/empty_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/empty_view.h"
diff --git a/libc/isystem/__ranges/enable_borrowed_range.h b/libc/isystem/__ranges/enable_borrowed_range.h
new file mode 100644
index 000000000..3e451211d
--- /dev/null
+++ b/libc/isystem/__ranges/enable_borrowed_range.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/enable_borrowed_range.h"
diff --git a/libc/isystem/__ranges/enable_view.h b/libc/isystem/__ranges/enable_view.h
new file mode 100644
index 000000000..1298361b5
--- /dev/null
+++ b/libc/isystem/__ranges/enable_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/enable_view.h"
diff --git a/libc/isystem/__ranges/filter_view.h b/libc/isystem/__ranges/filter_view.h
new file mode 100644
index 000000000..af4411d34
--- /dev/null
+++ b/libc/isystem/__ranges/filter_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/filter_view.h"
diff --git a/libc/isystem/__ranges/from_range.h b/libc/isystem/__ranges/from_range.h
new file mode 100644
index 000000000..68fd72b7e
--- /dev/null
+++ b/libc/isystem/__ranges/from_range.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/from_range.h"
diff --git a/libc/isystem/__ranges/iota_view.h b/libc/isystem/__ranges/iota_view.h
new file mode 100644
index 000000000..f5376c75c
--- /dev/null
+++ b/libc/isystem/__ranges/iota_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/iota_view.h"
diff --git a/libc/isystem/__ranges/istream_view.h b/libc/isystem/__ranges/istream_view.h
new file mode 100644
index 000000000..eda421941
--- /dev/null
+++ b/libc/isystem/__ranges/istream_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/istream_view.h"
diff --git a/libc/isystem/__ranges/join_view.h b/libc/isystem/__ranges/join_view.h
new file mode 100644
index 000000000..54897c651
--- /dev/null
+++ b/libc/isystem/__ranges/join_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/join_view.h"
diff --git a/libc/isystem/__ranges/lazy_split_view.h b/libc/isystem/__ranges/lazy_split_view.h
new file mode 100644
index 000000000..f06c4210a
--- /dev/null
+++ b/libc/isystem/__ranges/lazy_split_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/lazy_split_view.h"
diff --git a/libc/isystem/__ranges/non_propagating_cache.h b/libc/isystem/__ranges/non_propagating_cache.h
new file mode 100644
index 000000000..5ea20eb1b
--- /dev/null
+++ b/libc/isystem/__ranges/non_propagating_cache.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/non_propagating_cache.h"
diff --git a/libc/isystem/__ranges/owning_view.h b/libc/isystem/__ranges/owning_view.h
new file mode 100644
index 000000000..575ea2ce8
--- /dev/null
+++ b/libc/isystem/__ranges/owning_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/owning_view.h"
diff --git a/libc/isystem/__ranges/range_adaptor.h b/libc/isystem/__ranges/range_adaptor.h
new file mode 100644
index 000000000..6947a9511
--- /dev/null
+++ b/libc/isystem/__ranges/range_adaptor.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/range_adaptor.h"
diff --git a/libc/isystem/__ranges/rbegin.h b/libc/isystem/__ranges/rbegin.h
new file mode 100644
index 000000000..f20972e47
--- /dev/null
+++ b/libc/isystem/__ranges/rbegin.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/rbegin.h"
diff --git a/libc/isystem/__ranges/ref_view.h b/libc/isystem/__ranges/ref_view.h
new file mode 100644
index 000000000..d536ffe84
--- /dev/null
+++ b/libc/isystem/__ranges/ref_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/ref_view.h"
diff --git a/libc/isystem/__ranges/rend.h b/libc/isystem/__ranges/rend.h
new file mode 100644
index 000000000..5d075f6ae
--- /dev/null
+++ b/libc/isystem/__ranges/rend.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/rend.h"
diff --git a/libc/isystem/__ranges/reverse_view.h b/libc/isystem/__ranges/reverse_view.h
new file mode 100644
index 000000000..b12903231
--- /dev/null
+++ b/libc/isystem/__ranges/reverse_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/reverse_view.h"
diff --git a/libc/isystem/__ranges/single_view.h b/libc/isystem/__ranges/single_view.h
new file mode 100644
index 000000000..557ea13ff
--- /dev/null
+++ b/libc/isystem/__ranges/single_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/single_view.h"
diff --git a/libc/isystem/__ranges/size.h b/libc/isystem/__ranges/size.h
new file mode 100644
index 000000000..48956a294
--- /dev/null
+++ b/libc/isystem/__ranges/size.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/size.h"
diff --git a/libc/isystem/__ranges/split_view.h b/libc/isystem/__ranges/split_view.h
new file mode 100644
index 000000000..e0a7e3b98
--- /dev/null
+++ b/libc/isystem/__ranges/split_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/split_view.h"
diff --git a/libc/isystem/__ranges/subrange.h b/libc/isystem/__ranges/subrange.h
new file mode 100644
index 000000000..3f9f6267e
--- /dev/null
+++ b/libc/isystem/__ranges/subrange.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/subrange.h"
diff --git a/libc/isystem/__ranges/take_view.h b/libc/isystem/__ranges/take_view.h
new file mode 100644
index 000000000..321b84983
--- /dev/null
+++ b/libc/isystem/__ranges/take_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/take_view.h"
diff --git a/libc/isystem/__ranges/take_while_view.h b/libc/isystem/__ranges/take_while_view.h
new file mode 100644
index 000000000..50a752997
--- /dev/null
+++ b/libc/isystem/__ranges/take_while_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/take_while_view.h"
diff --git a/libc/isystem/__ranges/transform_view.h b/libc/isystem/__ranges/transform_view.h
new file mode 100644
index 000000000..fd58130fa
--- /dev/null
+++ b/libc/isystem/__ranges/transform_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/transform_view.h"
diff --git a/libc/isystem/__ranges/view_interface.h b/libc/isystem/__ranges/view_interface.h
new file mode 100644
index 000000000..d4d88d92e
--- /dev/null
+++ b/libc/isystem/__ranges/view_interface.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/view_interface.h"
diff --git a/libc/isystem/__ranges/views.h b/libc/isystem/__ranges/views.h
new file mode 100644
index 000000000..cabb6af44
--- /dev/null
+++ b/libc/isystem/__ranges/views.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/views.h"
diff --git a/libc/isystem/__ranges/zip_view.h b/libc/isystem/__ranges/zip_view.h
new file mode 100644
index 000000000..8be240c44
--- /dev/null
+++ b/libc/isystem/__ranges/zip_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__ranges/zip_view.h"
diff --git a/libc/isystem/__split_buffer b/libc/isystem/__split_buffer
new file mode 100644
index 000000000..902141724
--- /dev/null
+++ b/libc/isystem/__split_buffer
@@ -0,0 +1 @@
+#include "third_party/libcxx/__split_buffer"
diff --git a/libc/isystem/__std_mbstate_t.h b/libc/isystem/__std_mbstate_t.h
new file mode 100644
index 000000000..16e4afab1
--- /dev/null
+++ b/libc/isystem/__std_mbstate_t.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__std_mbstate_t.h"
diff --git a/libc/isystem/__stop_token/atomic_unique_lock.h b/libc/isystem/__stop_token/atomic_unique_lock.h
new file mode 100644
index 000000000..cf7fe650d
--- /dev/null
+++ b/libc/isystem/__stop_token/atomic_unique_lock.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__stop_token/atomic_unique_lock.h"
diff --git a/libc/isystem/__stop_token/intrusive_list_view.h b/libc/isystem/__stop_token/intrusive_list_view.h
new file mode 100644
index 000000000..8cf31de7b
--- /dev/null
+++ b/libc/isystem/__stop_token/intrusive_list_view.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__stop_token/intrusive_list_view.h"
diff --git a/libc/isystem/__stop_token/intrusive_shared_ptr.h b/libc/isystem/__stop_token/intrusive_shared_ptr.h
new file mode 100644
index 000000000..4d8690f30
--- /dev/null
+++ b/libc/isystem/__stop_token/intrusive_shared_ptr.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__stop_token/intrusive_shared_ptr.h"
diff --git a/libc/isystem/__string/char_traits.h b/libc/isystem/__string/char_traits.h
new file mode 100644
index 000000000..595ecc11d
--- /dev/null
+++ b/libc/isystem/__string/char_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__string/char_traits.h"
diff --git a/libc/isystem/__string/constexpr_c_functions.h b/libc/isystem/__string/constexpr_c_functions.h
new file mode 100644
index 000000000..43b738d8c
--- /dev/null
+++ b/libc/isystem/__string/constexpr_c_functions.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__string/constexpr_c_functions.h"
diff --git a/libc/isystem/__string/extern_template_lists.h b/libc/isystem/__string/extern_template_lists.h
new file mode 100644
index 000000000..21ed1bd1e
--- /dev/null
+++ b/libc/isystem/__string/extern_template_lists.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__string/extern_template_lists.h"
diff --git a/libc/isystem/__support/android/locale_bionic.h b/libc/isystem/__support/android/locale_bionic.h
new file mode 100644
index 000000000..fda130e29
--- /dev/null
+++ b/libc/isystem/__support/android/locale_bionic.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/android/locale_bionic.h"
diff --git a/libc/isystem/__support/fuchsia/xlocale.h b/libc/isystem/__support/fuchsia/xlocale.h
new file mode 100644
index 000000000..d1009593c
--- /dev/null
+++ b/libc/isystem/__support/fuchsia/xlocale.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/fuchsia/xlocale.h"
diff --git a/libc/isystem/__support/ibm/gettod_zos.h b/libc/isystem/__support/ibm/gettod_zos.h
new file mode 100644
index 000000000..c16419704
--- /dev/null
+++ b/libc/isystem/__support/ibm/gettod_zos.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/ibm/gettod_zos.h"
diff --git a/libc/isystem/__support/ibm/locale_mgmt_zos.h b/libc/isystem/__support/ibm/locale_mgmt_zos.h
new file mode 100644
index 000000000..fb69a30cf
--- /dev/null
+++ b/libc/isystem/__support/ibm/locale_mgmt_zos.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/ibm/locale_mgmt_zos.h"
diff --git a/libc/isystem/__support/ibm/nanosleep.h b/libc/isystem/__support/ibm/nanosleep.h
new file mode 100644
index 000000000..9a371298a
--- /dev/null
+++ b/libc/isystem/__support/ibm/nanosleep.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/ibm/nanosleep.h"
diff --git a/libc/isystem/__support/ibm/xlocale.h b/libc/isystem/__support/ibm/xlocale.h
new file mode 100644
index 000000000..91f0cfaae
--- /dev/null
+++ b/libc/isystem/__support/ibm/xlocale.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/ibm/xlocale.h"
diff --git a/libc/isystem/__support/musl/xlocale.h b/libc/isystem/__support/musl/xlocale.h
new file mode 100644
index 000000000..6adc3cfa5
--- /dev/null
+++ b/libc/isystem/__support/musl/xlocale.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/musl/xlocale.h"
diff --git a/libc/isystem/__support/newlib/xlocale.h b/libc/isystem/__support/newlib/xlocale.h
new file mode 100644
index 000000000..0e48c4211
--- /dev/null
+++ b/libc/isystem/__support/newlib/xlocale.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/newlib/xlocale.h"
diff --git a/libc/isystem/__support/openbsd/xlocale.h b/libc/isystem/__support/openbsd/xlocale.h
new file mode 100644
index 000000000..42661d821
--- /dev/null
+++ b/libc/isystem/__support/openbsd/xlocale.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/openbsd/xlocale.h"
diff --git a/libc/isystem/__support/win32/locale_win32.h b/libc/isystem/__support/win32/locale_win32.h
new file mode 100644
index 000000000..3a35381fa
--- /dev/null
+++ b/libc/isystem/__support/win32/locale_win32.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/win32/locale_win32.h"
diff --git a/libc/isystem/__support/xlocale/__nop_locale_mgmt.h b/libc/isystem/__support/xlocale/__nop_locale_mgmt.h
new file mode 100644
index 000000000..2fb6f4d0d
--- /dev/null
+++ b/libc/isystem/__support/xlocale/__nop_locale_mgmt.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/xlocale/__nop_locale_mgmt.h"
diff --git a/libc/isystem/__support/xlocale/__posix_l_fallback.h b/libc/isystem/__support/xlocale/__posix_l_fallback.h
new file mode 100644
index 000000000..3d36f53b6
--- /dev/null
+++ b/libc/isystem/__support/xlocale/__posix_l_fallback.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/xlocale/__posix_l_fallback.h"
diff --git a/libc/isystem/__support/xlocale/__strtonum_fallback.h b/libc/isystem/__support/xlocale/__strtonum_fallback.h
new file mode 100644
index 000000000..56345c0e3
--- /dev/null
+++ b/libc/isystem/__support/xlocale/__strtonum_fallback.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__support/xlocale/__strtonum_fallback.h"
diff --git a/libc/isystem/__system_error/errc.h b/libc/isystem/__system_error/errc.h
new file mode 100644
index 000000000..316ed06a1
--- /dev/null
+++ b/libc/isystem/__system_error/errc.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__system_error/errc.h"
diff --git a/libc/isystem/__system_error/error_category.h b/libc/isystem/__system_error/error_category.h
new file mode 100644
index 000000000..614dd4e08
--- /dev/null
+++ b/libc/isystem/__system_error/error_category.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__system_error/error_category.h"
diff --git a/libc/isystem/__system_error/error_code.h b/libc/isystem/__system_error/error_code.h
new file mode 100644
index 000000000..ac222e5de
--- /dev/null
+++ b/libc/isystem/__system_error/error_code.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__system_error/error_code.h"
diff --git a/libc/isystem/__system_error/error_condition.h b/libc/isystem/__system_error/error_condition.h
new file mode 100644
index 000000000..b401abeb4
--- /dev/null
+++ b/libc/isystem/__system_error/error_condition.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__system_error/error_condition.h"
diff --git a/libc/isystem/__system_error/system_error.h b/libc/isystem/__system_error/system_error.h
new file mode 100644
index 000000000..058736285
--- /dev/null
+++ b/libc/isystem/__system_error/system_error.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__system_error/system_error.h"
diff --git a/libc/isystem/__thread/poll_with_backoff.h b/libc/isystem/__thread/poll_with_backoff.h
new file mode 100644
index 000000000..d30bc37e3
--- /dev/null
+++ b/libc/isystem/__thread/poll_with_backoff.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__thread/poll_with_backoff.h"
diff --git a/libc/isystem/__thread/timed_backoff_policy.h b/libc/isystem/__thread/timed_backoff_policy.h
new file mode 100644
index 000000000..daaa7d605
--- /dev/null
+++ b/libc/isystem/__thread/timed_backoff_policy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__thread/timed_backoff_policy.h"
diff --git a/libc/isystem/__tree b/libc/isystem/__tree
new file mode 100644
index 000000000..58ced574a
--- /dev/null
+++ b/libc/isystem/__tree
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tree"
diff --git a/libc/isystem/__tuple/make_tuple_types.h b/libc/isystem/__tuple/make_tuple_types.h
new file mode 100644
index 000000000..b8cb61f50
--- /dev/null
+++ b/libc/isystem/__tuple/make_tuple_types.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/make_tuple_types.h"
diff --git a/libc/isystem/__tuple/pair_like.h b/libc/isystem/__tuple/pair_like.h
new file mode 100644
index 000000000..f0af1bb9f
--- /dev/null
+++ b/libc/isystem/__tuple/pair_like.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/pair_like.h"
diff --git a/libc/isystem/__tuple/sfinae_helpers.h b/libc/isystem/__tuple/sfinae_helpers.h
new file mode 100644
index 000000000..b303e7b90
--- /dev/null
+++ b/libc/isystem/__tuple/sfinae_helpers.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/sfinae_helpers.h"
diff --git a/libc/isystem/__tuple/tuple_element.h b/libc/isystem/__tuple/tuple_element.h
new file mode 100644
index 000000000..bad24fc7b
--- /dev/null
+++ b/libc/isystem/__tuple/tuple_element.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/tuple_element.h"
diff --git a/libc/isystem/__tuple/tuple_indices.h b/libc/isystem/__tuple/tuple_indices.h
new file mode 100644
index 000000000..8e95917fe
--- /dev/null
+++ b/libc/isystem/__tuple/tuple_indices.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/tuple_indices.h"
diff --git a/libc/isystem/__tuple/tuple_like.h b/libc/isystem/__tuple/tuple_like.h
new file mode 100644
index 000000000..12f6324f4
--- /dev/null
+++ b/libc/isystem/__tuple/tuple_like.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/tuple_like.h"
diff --git a/libc/isystem/__tuple/tuple_like_ext.h b/libc/isystem/__tuple/tuple_like_ext.h
new file mode 100644
index 000000000..48bda7deb
--- /dev/null
+++ b/libc/isystem/__tuple/tuple_like_ext.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/tuple_like_ext.h"
diff --git a/libc/isystem/__tuple/tuple_size.h b/libc/isystem/__tuple/tuple_size.h
new file mode 100644
index 000000000..7efcc527e
--- /dev/null
+++ b/libc/isystem/__tuple/tuple_size.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/tuple_size.h"
diff --git a/libc/isystem/__tuple/tuple_types.h b/libc/isystem/__tuple/tuple_types.h
new file mode 100644
index 000000000..9f27a4acc
--- /dev/null
+++ b/libc/isystem/__tuple/tuple_types.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__tuple/tuple_types.h"
diff --git a/libc/isystem/__type_traits/add_const.h b/libc/isystem/__type_traits/add_const.h
new file mode 100644
index 000000000..f9f9622df
--- /dev/null
+++ b/libc/isystem/__type_traits/add_const.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/add_const.h"
diff --git a/libc/isystem/__type_traits/add_cv.h b/libc/isystem/__type_traits/add_cv.h
new file mode 100644
index 000000000..9a012ffbb
--- /dev/null
+++ b/libc/isystem/__type_traits/add_cv.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/add_cv.h"
diff --git a/libc/isystem/__type_traits/add_lvalue_reference.h b/libc/isystem/__type_traits/add_lvalue_reference.h
new file mode 100644
index 000000000..af6e976da
--- /dev/null
+++ b/libc/isystem/__type_traits/add_lvalue_reference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/add_lvalue_reference.h"
diff --git a/libc/isystem/__type_traits/add_pointer.h b/libc/isystem/__type_traits/add_pointer.h
new file mode 100644
index 000000000..fd3665007
--- /dev/null
+++ b/libc/isystem/__type_traits/add_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/add_pointer.h"
diff --git a/libc/isystem/__type_traits/add_rvalue_reference.h b/libc/isystem/__type_traits/add_rvalue_reference.h
new file mode 100644
index 000000000..4cdf48028
--- /dev/null
+++ b/libc/isystem/__type_traits/add_rvalue_reference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/add_rvalue_reference.h"
diff --git a/libc/isystem/__type_traits/add_volatile.h b/libc/isystem/__type_traits/add_volatile.h
new file mode 100644
index 000000000..73082c9e6
--- /dev/null
+++ b/libc/isystem/__type_traits/add_volatile.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/add_volatile.h"
diff --git a/libc/isystem/__type_traits/aligned_storage.h b/libc/isystem/__type_traits/aligned_storage.h
new file mode 100644
index 000000000..3ee11fc58
--- /dev/null
+++ b/libc/isystem/__type_traits/aligned_storage.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/aligned_storage.h"
diff --git a/libc/isystem/__type_traits/aligned_union.h b/libc/isystem/__type_traits/aligned_union.h
new file mode 100644
index 000000000..c83dff35c
--- /dev/null
+++ b/libc/isystem/__type_traits/aligned_union.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/aligned_union.h"
diff --git a/libc/isystem/__type_traits/alignment_of.h b/libc/isystem/__type_traits/alignment_of.h
new file mode 100644
index 000000000..8cca54a65
--- /dev/null
+++ b/libc/isystem/__type_traits/alignment_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/alignment_of.h"
diff --git a/libc/isystem/__type_traits/apply_cv.h b/libc/isystem/__type_traits/apply_cv.h
new file mode 100644
index 000000000..c7465dd1a
--- /dev/null
+++ b/libc/isystem/__type_traits/apply_cv.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/apply_cv.h"
diff --git a/libc/isystem/__type_traits/can_extract_key.h b/libc/isystem/__type_traits/can_extract_key.h
new file mode 100644
index 000000000..bf477ffa5
--- /dev/null
+++ b/libc/isystem/__type_traits/can_extract_key.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/can_extract_key.h"
diff --git a/libc/isystem/__type_traits/common_reference.h b/libc/isystem/__type_traits/common_reference.h
new file mode 100644
index 000000000..abd2af421
--- /dev/null
+++ b/libc/isystem/__type_traits/common_reference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/common_reference.h"
diff --git a/libc/isystem/__type_traits/common_type.h b/libc/isystem/__type_traits/common_type.h
new file mode 100644
index 000000000..61fc9f099
--- /dev/null
+++ b/libc/isystem/__type_traits/common_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/common_type.h"
diff --git a/libc/isystem/__type_traits/conditional.h b/libc/isystem/__type_traits/conditional.h
new file mode 100644
index 000000000..d14de120c
--- /dev/null
+++ b/libc/isystem/__type_traits/conditional.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/conditional.h"
diff --git a/libc/isystem/__type_traits/conjunction.h b/libc/isystem/__type_traits/conjunction.h
new file mode 100644
index 000000000..cd73d37bb
--- /dev/null
+++ b/libc/isystem/__type_traits/conjunction.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/conjunction.h"
diff --git a/libc/isystem/__type_traits/copy_cv.h b/libc/isystem/__type_traits/copy_cv.h
new file mode 100644
index 000000000..aa6d3fd76
--- /dev/null
+++ b/libc/isystem/__type_traits/copy_cv.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/copy_cv.h"
diff --git a/libc/isystem/__type_traits/copy_cvref.h b/libc/isystem/__type_traits/copy_cvref.h
new file mode 100644
index 000000000..b3f60045d
--- /dev/null
+++ b/libc/isystem/__type_traits/copy_cvref.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/copy_cvref.h"
diff --git a/libc/isystem/__type_traits/decay.h b/libc/isystem/__type_traits/decay.h
new file mode 100644
index 000000000..5f073bd3d
--- /dev/null
+++ b/libc/isystem/__type_traits/decay.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/decay.h"
diff --git a/libc/isystem/__type_traits/dependent_type.h b/libc/isystem/__type_traits/dependent_type.h
new file mode 100644
index 000000000..fb4558646
--- /dev/null
+++ b/libc/isystem/__type_traits/dependent_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/dependent_type.h"
diff --git a/libc/isystem/__type_traits/disjunction.h b/libc/isystem/__type_traits/disjunction.h
new file mode 100644
index 000000000..9089736d6
--- /dev/null
+++ b/libc/isystem/__type_traits/disjunction.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/disjunction.h"
diff --git a/libc/isystem/__type_traits/enable_if.h b/libc/isystem/__type_traits/enable_if.h
new file mode 100644
index 000000000..834849bec
--- /dev/null
+++ b/libc/isystem/__type_traits/enable_if.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/enable_if.h"
diff --git a/libc/isystem/__type_traits/extent.h b/libc/isystem/__type_traits/extent.h
new file mode 100644
index 000000000..b796acdc0
--- /dev/null
+++ b/libc/isystem/__type_traits/extent.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/extent.h"
diff --git a/libc/isystem/__type_traits/has_unique_object_representation.h b/libc/isystem/__type_traits/has_unique_object_representation.h
new file mode 100644
index 000000000..81b1b6835
--- /dev/null
+++ b/libc/isystem/__type_traits/has_unique_object_representation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/has_unique_object_representation.h"
diff --git a/libc/isystem/__type_traits/has_virtual_destructor.h b/libc/isystem/__type_traits/has_virtual_destructor.h
new file mode 100644
index 000000000..2c55e7dd7
--- /dev/null
+++ b/libc/isystem/__type_traits/has_virtual_destructor.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/has_virtual_destructor.h"
diff --git a/libc/isystem/__type_traits/integral_constant.h b/libc/isystem/__type_traits/integral_constant.h
new file mode 100644
index 000000000..e2998650e
--- /dev/null
+++ b/libc/isystem/__type_traits/integral_constant.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/integral_constant.h"
diff --git a/libc/isystem/__type_traits/invoke.h b/libc/isystem/__type_traits/invoke.h
new file mode 100644
index 000000000..8bcb03c23
--- /dev/null
+++ b/libc/isystem/__type_traits/invoke.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/invoke.h"
diff --git a/libc/isystem/__type_traits/is_abstract.h b/libc/isystem/__type_traits/is_abstract.h
new file mode 100644
index 000000000..7af736c48
--- /dev/null
+++ b/libc/isystem/__type_traits/is_abstract.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_abstract.h"
diff --git a/libc/isystem/__type_traits/is_aggregate.h b/libc/isystem/__type_traits/is_aggregate.h
new file mode 100644
index 000000000..751dddd65
--- /dev/null
+++ b/libc/isystem/__type_traits/is_aggregate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_aggregate.h"
diff --git a/libc/isystem/__type_traits/is_allocator.h b/libc/isystem/__type_traits/is_allocator.h
new file mode 100644
index 000000000..1149f184c
--- /dev/null
+++ b/libc/isystem/__type_traits/is_allocator.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_allocator.h"
diff --git a/libc/isystem/__type_traits/is_always_bitcastable.h b/libc/isystem/__type_traits/is_always_bitcastable.h
new file mode 100644
index 000000000..167d6adff
--- /dev/null
+++ b/libc/isystem/__type_traits/is_always_bitcastable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_always_bitcastable.h"
diff --git a/libc/isystem/__type_traits/is_arithmetic.h b/libc/isystem/__type_traits/is_arithmetic.h
new file mode 100644
index 000000000..1daf36b39
--- /dev/null
+++ b/libc/isystem/__type_traits/is_arithmetic.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_arithmetic.h"
diff --git a/libc/isystem/__type_traits/is_array.h b/libc/isystem/__type_traits/is_array.h
new file mode 100644
index 000000000..510e0533b
--- /dev/null
+++ b/libc/isystem/__type_traits/is_array.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_array.h"
diff --git a/libc/isystem/__type_traits/is_assignable.h b/libc/isystem/__type_traits/is_assignable.h
new file mode 100644
index 000000000..feb481cfa
--- /dev/null
+++ b/libc/isystem/__type_traits/is_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_assignable.h"
diff --git a/libc/isystem/__type_traits/is_base_of.h b/libc/isystem/__type_traits/is_base_of.h
new file mode 100644
index 000000000..44c6370a7
--- /dev/null
+++ b/libc/isystem/__type_traits/is_base_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_base_of.h"
diff --git a/libc/isystem/__type_traits/is_bounded_array.h b/libc/isystem/__type_traits/is_bounded_array.h
new file mode 100644
index 000000000..779a4c3d2
--- /dev/null
+++ b/libc/isystem/__type_traits/is_bounded_array.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_bounded_array.h"
diff --git a/libc/isystem/__type_traits/is_callable.h b/libc/isystem/__type_traits/is_callable.h
new file mode 100644
index 000000000..55bb174ca
--- /dev/null
+++ b/libc/isystem/__type_traits/is_callable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_callable.h"
diff --git a/libc/isystem/__type_traits/is_char_like_type.h b/libc/isystem/__type_traits/is_char_like_type.h
new file mode 100644
index 000000000..fe739ebb0
--- /dev/null
+++ b/libc/isystem/__type_traits/is_char_like_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_char_like_type.h"
diff --git a/libc/isystem/__type_traits/is_class.h b/libc/isystem/__type_traits/is_class.h
new file mode 100644
index 000000000..765c0e65c
--- /dev/null
+++ b/libc/isystem/__type_traits/is_class.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_class.h"
diff --git a/libc/isystem/__type_traits/is_compound.h b/libc/isystem/__type_traits/is_compound.h
new file mode 100644
index 000000000..190ae4953
--- /dev/null
+++ b/libc/isystem/__type_traits/is_compound.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_compound.h"
diff --git a/libc/isystem/__type_traits/is_const.h b/libc/isystem/__type_traits/is_const.h
new file mode 100644
index 000000000..13a04ef03
--- /dev/null
+++ b/libc/isystem/__type_traits/is_const.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_const.h"
diff --git a/libc/isystem/__type_traits/is_constant_evaluated.h b/libc/isystem/__type_traits/is_constant_evaluated.h
new file mode 100644
index 000000000..f83dd12a7
--- /dev/null
+++ b/libc/isystem/__type_traits/is_constant_evaluated.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_constant_evaluated.h"
diff --git a/libc/isystem/__type_traits/is_constructible.h b/libc/isystem/__type_traits/is_constructible.h
new file mode 100644
index 000000000..ff75c90f2
--- /dev/null
+++ b/libc/isystem/__type_traits/is_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_constructible.h"
diff --git a/libc/isystem/__type_traits/is_convertible.h b/libc/isystem/__type_traits/is_convertible.h
new file mode 100644
index 000000000..47941c5eb
--- /dev/null
+++ b/libc/isystem/__type_traits/is_convertible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_convertible.h"
diff --git a/libc/isystem/__type_traits/is_copy_assignable.h b/libc/isystem/__type_traits/is_copy_assignable.h
new file mode 100644
index 000000000..55f36e99d
--- /dev/null
+++ b/libc/isystem/__type_traits/is_copy_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_copy_assignable.h"
diff --git a/libc/isystem/__type_traits/is_copy_constructible.h b/libc/isystem/__type_traits/is_copy_constructible.h
new file mode 100644
index 000000000..279a7fcee
--- /dev/null
+++ b/libc/isystem/__type_traits/is_copy_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_copy_constructible.h"
diff --git a/libc/isystem/__type_traits/is_core_convertible.h b/libc/isystem/__type_traits/is_core_convertible.h
new file mode 100644
index 000000000..4f62f122e
--- /dev/null
+++ b/libc/isystem/__type_traits/is_core_convertible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_core_convertible.h"
diff --git a/libc/isystem/__type_traits/is_default_constructible.h b/libc/isystem/__type_traits/is_default_constructible.h
new file mode 100644
index 000000000..550bd34e6
--- /dev/null
+++ b/libc/isystem/__type_traits/is_default_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_default_constructible.h"
diff --git a/libc/isystem/__type_traits/is_destructible.h b/libc/isystem/__type_traits/is_destructible.h
new file mode 100644
index 000000000..20e16dd9b
--- /dev/null
+++ b/libc/isystem/__type_traits/is_destructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_destructible.h"
diff --git a/libc/isystem/__type_traits/is_empty.h b/libc/isystem/__type_traits/is_empty.h
new file mode 100644
index 000000000..0b67f378d
--- /dev/null
+++ b/libc/isystem/__type_traits/is_empty.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_empty.h"
diff --git a/libc/isystem/__type_traits/is_enum.h b/libc/isystem/__type_traits/is_enum.h
new file mode 100644
index 000000000..db7f368e3
--- /dev/null
+++ b/libc/isystem/__type_traits/is_enum.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_enum.h"
diff --git a/libc/isystem/__type_traits/is_equality_comparable.h b/libc/isystem/__type_traits/is_equality_comparable.h
new file mode 100644
index 000000000..008522fa8
--- /dev/null
+++ b/libc/isystem/__type_traits/is_equality_comparable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_equality_comparable.h"
diff --git a/libc/isystem/__type_traits/is_execution_policy.h b/libc/isystem/__type_traits/is_execution_policy.h
new file mode 100644
index 000000000..dc634945f
--- /dev/null
+++ b/libc/isystem/__type_traits/is_execution_policy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_execution_policy.h"
diff --git a/libc/isystem/__type_traits/is_final.h b/libc/isystem/__type_traits/is_final.h
new file mode 100644
index 000000000..6876cbfeb
--- /dev/null
+++ b/libc/isystem/__type_traits/is_final.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_final.h"
diff --git a/libc/isystem/__type_traits/is_floating_point.h b/libc/isystem/__type_traits/is_floating_point.h
new file mode 100644
index 000000000..a9d8bd5b8
--- /dev/null
+++ b/libc/isystem/__type_traits/is_floating_point.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_floating_point.h"
diff --git a/libc/isystem/__type_traits/is_function.h b/libc/isystem/__type_traits/is_function.h
new file mode 100644
index 000000000..20126bba3
--- /dev/null
+++ b/libc/isystem/__type_traits/is_function.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_function.h"
diff --git a/libc/isystem/__type_traits/is_fundamental.h b/libc/isystem/__type_traits/is_fundamental.h
new file mode 100644
index 000000000..29d4b6e2d
--- /dev/null
+++ b/libc/isystem/__type_traits/is_fundamental.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_fundamental.h"
diff --git a/libc/isystem/__type_traits/is_implicitly_default_constructible.h b/libc/isystem/__type_traits/is_implicitly_default_constructible.h
new file mode 100644
index 000000000..6f51771da
--- /dev/null
+++ b/libc/isystem/__type_traits/is_implicitly_default_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_implicitly_default_constructible.h"
diff --git a/libc/isystem/__type_traits/is_integral.h b/libc/isystem/__type_traits/is_integral.h
new file mode 100644
index 000000000..9c206e3d6
--- /dev/null
+++ b/libc/isystem/__type_traits/is_integral.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_integral.h"
diff --git a/libc/isystem/__type_traits/is_literal_type.h b/libc/isystem/__type_traits/is_literal_type.h
new file mode 100644
index 000000000..45ca76824
--- /dev/null
+++ b/libc/isystem/__type_traits/is_literal_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_literal_type.h"
diff --git a/libc/isystem/__type_traits/is_member_function_pointer.h b/libc/isystem/__type_traits/is_member_function_pointer.h
new file mode 100644
index 000000000..35dd4ff62
--- /dev/null
+++ b/libc/isystem/__type_traits/is_member_function_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_member_function_pointer.h"
diff --git a/libc/isystem/__type_traits/is_member_object_pointer.h b/libc/isystem/__type_traits/is_member_object_pointer.h
new file mode 100644
index 000000000..b3416ddbc
--- /dev/null
+++ b/libc/isystem/__type_traits/is_member_object_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_member_object_pointer.h"
diff --git a/libc/isystem/__type_traits/is_member_pointer.h b/libc/isystem/__type_traits/is_member_pointer.h
new file mode 100644
index 000000000..bf0f9074a
--- /dev/null
+++ b/libc/isystem/__type_traits/is_member_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_member_pointer.h"
diff --git a/libc/isystem/__type_traits/is_move_assignable.h b/libc/isystem/__type_traits/is_move_assignable.h
new file mode 100644
index 000000000..ff502b025
--- /dev/null
+++ b/libc/isystem/__type_traits/is_move_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_move_assignable.h"
diff --git a/libc/isystem/__type_traits/is_move_constructible.h b/libc/isystem/__type_traits/is_move_constructible.h
new file mode 100644
index 000000000..86dc68397
--- /dev/null
+++ b/libc/isystem/__type_traits/is_move_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_move_constructible.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_assignable.h b/libc/isystem/__type_traits/is_nothrow_assignable.h
new file mode 100644
index 000000000..0ecb04386
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_assignable.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_constructible.h b/libc/isystem/__type_traits/is_nothrow_constructible.h
new file mode 100644
index 000000000..78062675e
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_constructible.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_convertible.h b/libc/isystem/__type_traits/is_nothrow_convertible.h
new file mode 100644
index 000000000..a9e7da15a
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_convertible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_convertible.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_copy_assignable.h b/libc/isystem/__type_traits/is_nothrow_copy_assignable.h
new file mode 100644
index 000000000..88d02b91d
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_copy_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_copy_assignable.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_copy_constructible.h b/libc/isystem/__type_traits/is_nothrow_copy_constructible.h
new file mode 100644
index 000000000..ca1961fb1
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_copy_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_copy_constructible.h"
diff --git a/libc/isystem/cosmoaudio.h b/libc/isystem/__type_traits/is_nothrow_default_constructible.h
similarity index 56%
rename from libc/isystem/cosmoaudio.h
rename to libc/isystem/__type_traits/is_nothrow_default_constructible.h
index 84eabad2e..ed62c54cb 100644
--- a/libc/isystem/cosmoaudio.h
+++ b/libc/isystem/__type_traits/is_nothrow_default_constructible.h
@@ -1 +1 @@
-#include "dsp/audio/cosmoaudio/cosmoaudio.h"
+#include "third_party/libcxx/__type_traits/is_nothrow_default_constructible.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_destructible.h b/libc/isystem/__type_traits/is_nothrow_destructible.h
new file mode 100644
index 000000000..07530bbc1
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_destructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_destructible.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_move_assignable.h b/libc/isystem/__type_traits/is_nothrow_move_assignable.h
new file mode 100644
index 000000000..34b36de9a
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_move_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_move_assignable.h"
diff --git a/libc/isystem/__type_traits/is_nothrow_move_constructible.h b/libc/isystem/__type_traits/is_nothrow_move_constructible.h
new file mode 100644
index 000000000..551fd299b
--- /dev/null
+++ b/libc/isystem/__type_traits/is_nothrow_move_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_nothrow_move_constructible.h"
diff --git a/libc/isystem/__type_traits/is_null_pointer.h b/libc/isystem/__type_traits/is_null_pointer.h
new file mode 100644
index 000000000..34126db4f
--- /dev/null
+++ b/libc/isystem/__type_traits/is_null_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_null_pointer.h"
diff --git a/libc/isystem/__type_traits/is_object.h b/libc/isystem/__type_traits/is_object.h
new file mode 100644
index 000000000..12726de25
--- /dev/null
+++ b/libc/isystem/__type_traits/is_object.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_object.h"
diff --git a/libc/isystem/__type_traits/is_pod.h b/libc/isystem/__type_traits/is_pod.h
new file mode 100644
index 000000000..564d5b017
--- /dev/null
+++ b/libc/isystem/__type_traits/is_pod.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_pod.h"
diff --git a/libc/isystem/__type_traits/is_pointer.h b/libc/isystem/__type_traits/is_pointer.h
new file mode 100644
index 000000000..5a790de23
--- /dev/null
+++ b/libc/isystem/__type_traits/is_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_pointer.h"
diff --git a/libc/isystem/__type_traits/is_polymorphic.h b/libc/isystem/__type_traits/is_polymorphic.h
new file mode 100644
index 000000000..31f16c24a
--- /dev/null
+++ b/libc/isystem/__type_traits/is_polymorphic.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_polymorphic.h"
diff --git a/libc/isystem/__type_traits/is_primary_template.h b/libc/isystem/__type_traits/is_primary_template.h
new file mode 100644
index 000000000..4efbc8287
--- /dev/null
+++ b/libc/isystem/__type_traits/is_primary_template.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_primary_template.h"
diff --git a/libc/isystem/__type_traits/is_reference.h b/libc/isystem/__type_traits/is_reference.h
new file mode 100644
index 000000000..5d612d1bc
--- /dev/null
+++ b/libc/isystem/__type_traits/is_reference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_reference.h"
diff --git a/libc/isystem/__type_traits/is_reference_wrapper.h b/libc/isystem/__type_traits/is_reference_wrapper.h
new file mode 100644
index 000000000..e5cde5867
--- /dev/null
+++ b/libc/isystem/__type_traits/is_reference_wrapper.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_reference_wrapper.h"
diff --git a/libc/isystem/__type_traits/is_referenceable.h b/libc/isystem/__type_traits/is_referenceable.h
new file mode 100644
index 000000000..355e8ba81
--- /dev/null
+++ b/libc/isystem/__type_traits/is_referenceable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_referenceable.h"
diff --git a/libc/isystem/__type_traits/is_same.h b/libc/isystem/__type_traits/is_same.h
new file mode 100644
index 000000000..3eb4654e9
--- /dev/null
+++ b/libc/isystem/__type_traits/is_same.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_same.h"
diff --git a/libc/isystem/__type_traits/is_scalar.h b/libc/isystem/__type_traits/is_scalar.h
new file mode 100644
index 000000000..e7b943bd1
--- /dev/null
+++ b/libc/isystem/__type_traits/is_scalar.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_scalar.h"
diff --git a/libc/isystem/__type_traits/is_scoped_enum.h b/libc/isystem/__type_traits/is_scoped_enum.h
new file mode 100644
index 000000000..6d10abec5
--- /dev/null
+++ b/libc/isystem/__type_traits/is_scoped_enum.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_scoped_enum.h"
diff --git a/libc/isystem/__type_traits/is_signed.h b/libc/isystem/__type_traits/is_signed.h
new file mode 100644
index 000000000..cb478ab69
--- /dev/null
+++ b/libc/isystem/__type_traits/is_signed.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_signed.h"
diff --git a/libc/isystem/__type_traits/is_signed_integer.h b/libc/isystem/__type_traits/is_signed_integer.h
new file mode 100644
index 000000000..bcbb428ba
--- /dev/null
+++ b/libc/isystem/__type_traits/is_signed_integer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_signed_integer.h"
diff --git a/libc/isystem/__type_traits/is_specialization.h b/libc/isystem/__type_traits/is_specialization.h
new file mode 100644
index 000000000..eb5d1b280
--- /dev/null
+++ b/libc/isystem/__type_traits/is_specialization.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_specialization.h"
diff --git a/libc/isystem/__type_traits/is_standard_layout.h b/libc/isystem/__type_traits/is_standard_layout.h
new file mode 100644
index 000000000..c930c71dc
--- /dev/null
+++ b/libc/isystem/__type_traits/is_standard_layout.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_standard_layout.h"
diff --git a/libc/isystem/__type_traits/is_swappable.h b/libc/isystem/__type_traits/is_swappable.h
new file mode 100644
index 000000000..749ac4eb4
--- /dev/null
+++ b/libc/isystem/__type_traits/is_swappable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_swappable.h"
diff --git a/libc/isystem/__type_traits/is_trivial.h b/libc/isystem/__type_traits/is_trivial.h
new file mode 100644
index 000000000..46a57f6f9
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivial.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivial.h"
diff --git a/libc/isystem/__type_traits/is_trivially_assignable.h b/libc/isystem/__type_traits/is_trivially_assignable.h
new file mode 100644
index 000000000..92ebe07f1
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_assignable.h"
diff --git a/libc/isystem/__type_traits/is_trivially_constructible.h b/libc/isystem/__type_traits/is_trivially_constructible.h
new file mode 100644
index 000000000..5987cb011
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_constructible.h"
diff --git a/libc/isystem/__type_traits/is_trivially_copy_assignable.h b/libc/isystem/__type_traits/is_trivially_copy_assignable.h
new file mode 100644
index 000000000..78e9db695
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_copy_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_copy_assignable.h"
diff --git a/libc/isystem/__type_traits/is_trivially_copy_constructible.h b/libc/isystem/__type_traits/is_trivially_copy_constructible.h
new file mode 100644
index 000000000..6333c20fc
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_copy_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_copy_constructible.h"
diff --git a/libc/isystem/__type_traits/is_trivially_copyable.h b/libc/isystem/__type_traits/is_trivially_copyable.h
new file mode 100644
index 000000000..818579907
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_copyable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_copyable.h"
diff --git a/libc/isystem/__type_traits/is_trivially_default_constructible.h b/libc/isystem/__type_traits/is_trivially_default_constructible.h
new file mode 100644
index 000000000..1c81c485d
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_default_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_default_constructible.h"
diff --git a/libc/isystem/__type_traits/is_trivially_destructible.h b/libc/isystem/__type_traits/is_trivially_destructible.h
new file mode 100644
index 000000000..f36f1c412
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_destructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_destructible.h"
diff --git a/libc/isystem/__type_traits/is_trivially_lexicographically_comparable.h b/libc/isystem/__type_traits/is_trivially_lexicographically_comparable.h
new file mode 100644
index 000000000..0be37bf25
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_lexicographically_comparable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_lexicographically_comparable.h"
diff --git a/libc/isystem/__type_traits/is_trivially_move_assignable.h b/libc/isystem/__type_traits/is_trivially_move_assignable.h
new file mode 100644
index 000000000..0fd012dbe
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_move_assignable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_move_assignable.h"
diff --git a/libc/isystem/__type_traits/is_trivially_move_constructible.h b/libc/isystem/__type_traits/is_trivially_move_constructible.h
new file mode 100644
index 000000000..12610f283
--- /dev/null
+++ b/libc/isystem/__type_traits/is_trivially_move_constructible.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_trivially_move_constructible.h"
diff --git a/libc/isystem/__type_traits/is_unbounded_array.h b/libc/isystem/__type_traits/is_unbounded_array.h
new file mode 100644
index 000000000..a40efc493
--- /dev/null
+++ b/libc/isystem/__type_traits/is_unbounded_array.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_unbounded_array.h"
diff --git a/libc/isystem/__type_traits/is_union.h b/libc/isystem/__type_traits/is_union.h
new file mode 100644
index 000000000..c70568c14
--- /dev/null
+++ b/libc/isystem/__type_traits/is_union.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_union.h"
diff --git a/libc/isystem/__type_traits/is_unsigned.h b/libc/isystem/__type_traits/is_unsigned.h
new file mode 100644
index 000000000..06cbecdb7
--- /dev/null
+++ b/libc/isystem/__type_traits/is_unsigned.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_unsigned.h"
diff --git a/libc/isystem/__type_traits/is_unsigned_integer.h b/libc/isystem/__type_traits/is_unsigned_integer.h
new file mode 100644
index 000000000..28a87ddbb
--- /dev/null
+++ b/libc/isystem/__type_traits/is_unsigned_integer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_unsigned_integer.h"
diff --git a/libc/isystem/__type_traits/is_valid_expansion.h b/libc/isystem/__type_traits/is_valid_expansion.h
new file mode 100644
index 000000000..54b3b40dd
--- /dev/null
+++ b/libc/isystem/__type_traits/is_valid_expansion.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_valid_expansion.h"
diff --git a/libc/isystem/__type_traits/is_void.h b/libc/isystem/__type_traits/is_void.h
new file mode 100644
index 000000000..ff6e5ab16
--- /dev/null
+++ b/libc/isystem/__type_traits/is_void.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_void.h"
diff --git a/libc/isystem/__type_traits/is_volatile.h b/libc/isystem/__type_traits/is_volatile.h
new file mode 100644
index 000000000..41e60d4fe
--- /dev/null
+++ b/libc/isystem/__type_traits/is_volatile.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/is_volatile.h"
diff --git a/libc/isystem/__type_traits/lazy.h b/libc/isystem/__type_traits/lazy.h
new file mode 100644
index 000000000..fafac0f65
--- /dev/null
+++ b/libc/isystem/__type_traits/lazy.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/lazy.h"
diff --git a/libc/isystem/__type_traits/make_32_64_or_128_bit.h b/libc/isystem/__type_traits/make_32_64_or_128_bit.h
new file mode 100644
index 000000000..ad6a8b84f
--- /dev/null
+++ b/libc/isystem/__type_traits/make_32_64_or_128_bit.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/make_32_64_or_128_bit.h"
diff --git a/libc/isystem/__type_traits/make_const_lvalue_ref.h b/libc/isystem/__type_traits/make_const_lvalue_ref.h
new file mode 100644
index 000000000..e68557857
--- /dev/null
+++ b/libc/isystem/__type_traits/make_const_lvalue_ref.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/make_const_lvalue_ref.h"
diff --git a/libc/isystem/__type_traits/make_signed.h b/libc/isystem/__type_traits/make_signed.h
new file mode 100644
index 000000000..872328dfc
--- /dev/null
+++ b/libc/isystem/__type_traits/make_signed.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/make_signed.h"
diff --git a/libc/isystem/__type_traits/make_unsigned.h b/libc/isystem/__type_traits/make_unsigned.h
new file mode 100644
index 000000000..a66b4f91d
--- /dev/null
+++ b/libc/isystem/__type_traits/make_unsigned.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/make_unsigned.h"
diff --git a/libc/isystem/__type_traits/maybe_const.h b/libc/isystem/__type_traits/maybe_const.h
new file mode 100644
index 000000000..96b7db155
--- /dev/null
+++ b/libc/isystem/__type_traits/maybe_const.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/maybe_const.h"
diff --git a/libc/isystem/__type_traits/nat.h b/libc/isystem/__type_traits/nat.h
new file mode 100644
index 000000000..e3e8f8e17
--- /dev/null
+++ b/libc/isystem/__type_traits/nat.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/nat.h"
diff --git a/libc/isystem/__type_traits/negation.h b/libc/isystem/__type_traits/negation.h
new file mode 100644
index 000000000..4a7526d64
--- /dev/null
+++ b/libc/isystem/__type_traits/negation.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/negation.h"
diff --git a/libc/isystem/__type_traits/noexcept_move_assign_container.h b/libc/isystem/__type_traits/noexcept_move_assign_container.h
new file mode 100644
index 000000000..daed1653c
--- /dev/null
+++ b/libc/isystem/__type_traits/noexcept_move_assign_container.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/noexcept_move_assign_container.h"
diff --git a/libc/isystem/__type_traits/predicate_traits.h b/libc/isystem/__type_traits/predicate_traits.h
new file mode 100644
index 000000000..35cd151b5
--- /dev/null
+++ b/libc/isystem/__type_traits/predicate_traits.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/predicate_traits.h"
diff --git a/libc/isystem/__type_traits/promote.h b/libc/isystem/__type_traits/promote.h
new file mode 100644
index 000000000..8922694bd
--- /dev/null
+++ b/libc/isystem/__type_traits/promote.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/promote.h"
diff --git a/libc/isystem/__type_traits/rank.h b/libc/isystem/__type_traits/rank.h
new file mode 100644
index 000000000..82d80b323
--- /dev/null
+++ b/libc/isystem/__type_traits/rank.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/rank.h"
diff --git a/libc/isystem/__type_traits/remove_all_extents.h b/libc/isystem/__type_traits/remove_all_extents.h
new file mode 100644
index 000000000..92f1effee
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_all_extents.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_all_extents.h"
diff --git a/libc/isystem/__type_traits/remove_const.h b/libc/isystem/__type_traits/remove_const.h
new file mode 100644
index 000000000..cecaff288
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_const.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_const.h"
diff --git a/libc/isystem/__type_traits/remove_const_ref.h b/libc/isystem/__type_traits/remove_const_ref.h
new file mode 100644
index 000000000..c9ffba8ca
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_const_ref.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_const_ref.h"
diff --git a/libc/isystem/__type_traits/remove_cv.h b/libc/isystem/__type_traits/remove_cv.h
new file mode 100644
index 000000000..aa0c33a6d
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_cv.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_cv.h"
diff --git a/libc/isystem/__type_traits/remove_cvref.h b/libc/isystem/__type_traits/remove_cvref.h
new file mode 100644
index 000000000..9783b669a
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_cvref.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_cvref.h"
diff --git a/libc/isystem/__type_traits/remove_extent.h b/libc/isystem/__type_traits/remove_extent.h
new file mode 100644
index 000000000..591435551
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_extent.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_extent.h"
diff --git a/libc/isystem/__type_traits/remove_pointer.h b/libc/isystem/__type_traits/remove_pointer.h
new file mode 100644
index 000000000..4debc55fc
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_pointer.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_pointer.h"
diff --git a/libc/isystem/__type_traits/remove_reference.h b/libc/isystem/__type_traits/remove_reference.h
new file mode 100644
index 000000000..94093ed97
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_reference.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_reference.h"
diff --git a/libc/isystem/__type_traits/remove_volatile.h b/libc/isystem/__type_traits/remove_volatile.h
new file mode 100644
index 000000000..1cc860aea
--- /dev/null
+++ b/libc/isystem/__type_traits/remove_volatile.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/remove_volatile.h"
diff --git a/libc/isystem/__type_traits/result_of.h b/libc/isystem/__type_traits/result_of.h
new file mode 100644
index 000000000..140c19a93
--- /dev/null
+++ b/libc/isystem/__type_traits/result_of.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/result_of.h"
diff --git a/libc/isystem/__type_traits/strip_signature.h b/libc/isystem/__type_traits/strip_signature.h
new file mode 100644
index 000000000..bcd68770e
--- /dev/null
+++ b/libc/isystem/__type_traits/strip_signature.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/strip_signature.h"
diff --git a/libc/isystem/__type_traits/type_identity.h b/libc/isystem/__type_traits/type_identity.h
new file mode 100644
index 000000000..e848c9ae8
--- /dev/null
+++ b/libc/isystem/__type_traits/type_identity.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/type_identity.h"
diff --git a/libc/isystem/__type_traits/type_list.h b/libc/isystem/__type_traits/type_list.h
new file mode 100644
index 000000000..0b09b0b71
--- /dev/null
+++ b/libc/isystem/__type_traits/type_list.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/type_list.h"
diff --git a/libc/isystem/__type_traits/underlying_type.h b/libc/isystem/__type_traits/underlying_type.h
new file mode 100644
index 000000000..b2ac822cb
--- /dev/null
+++ b/libc/isystem/__type_traits/underlying_type.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/underlying_type.h"
diff --git a/libc/isystem/__type_traits/unwrap_ref.h b/libc/isystem/__type_traits/unwrap_ref.h
new file mode 100644
index 000000000..c374c1159
--- /dev/null
+++ b/libc/isystem/__type_traits/unwrap_ref.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/unwrap_ref.h"
diff --git a/libc/isystem/__type_traits/void_t.h b/libc/isystem/__type_traits/void_t.h
new file mode 100644
index 000000000..7d0770fb2
--- /dev/null
+++ b/libc/isystem/__type_traits/void_t.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__type_traits/void_t.h"
diff --git a/libc/isystem/__undef_macros b/libc/isystem/__undef_macros
new file mode 100644
index 000000000..e7cf229e8
--- /dev/null
+++ b/libc/isystem/__undef_macros
@@ -0,0 +1 @@
+#include "third_party/libcxx/__undef_macros"
diff --git a/libc/isystem/__utility/as_const.h b/libc/isystem/__utility/as_const.h
new file mode 100644
index 000000000..27b4f0441
--- /dev/null
+++ b/libc/isystem/__utility/as_const.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/as_const.h"
diff --git a/libc/isystem/__utility/auto_cast.h b/libc/isystem/__utility/auto_cast.h
new file mode 100644
index 000000000..e1d2b1b21
--- /dev/null
+++ b/libc/isystem/__utility/auto_cast.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/auto_cast.h"
diff --git a/libc/isystem/__utility/cmp.h b/libc/isystem/__utility/cmp.h
new file mode 100644
index 000000000..e934c6834
--- /dev/null
+++ b/libc/isystem/__utility/cmp.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/cmp.h"
diff --git a/libc/isystem/__utility/convert_to_integral.h b/libc/isystem/__utility/convert_to_integral.h
new file mode 100644
index 000000000..8e425f3a1
--- /dev/null
+++ b/libc/isystem/__utility/convert_to_integral.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/convert_to_integral.h"
diff --git a/libc/isystem/__utility/declval.h b/libc/isystem/__utility/declval.h
new file mode 100644
index 000000000..29abde632
--- /dev/null
+++ b/libc/isystem/__utility/declval.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/declval.h"
diff --git a/libc/isystem/__utility/exception_guard.h b/libc/isystem/__utility/exception_guard.h
new file mode 100644
index 000000000..3d82c94ab
--- /dev/null
+++ b/libc/isystem/__utility/exception_guard.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/exception_guard.h"
diff --git a/libc/isystem/__utility/exchange.h b/libc/isystem/__utility/exchange.h
new file mode 100644
index 000000000..1829ac4c4
--- /dev/null
+++ b/libc/isystem/__utility/exchange.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/exchange.h"
diff --git a/libc/isystem/__utility/forward.h b/libc/isystem/__utility/forward.h
new file mode 100644
index 000000000..78304269e
--- /dev/null
+++ b/libc/isystem/__utility/forward.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/forward.h"
diff --git a/libc/isystem/__utility/forward_like.h b/libc/isystem/__utility/forward_like.h
new file mode 100644
index 000000000..c7704f6c4
--- /dev/null
+++ b/libc/isystem/__utility/forward_like.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/forward_like.h"
diff --git a/libc/isystem/__utility/in_place.h b/libc/isystem/__utility/in_place.h
new file mode 100644
index 000000000..b00e62b97
--- /dev/null
+++ b/libc/isystem/__utility/in_place.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/in_place.h"
diff --git a/libc/isystem/__utility/integer_sequence.h b/libc/isystem/__utility/integer_sequence.h
new file mode 100644
index 000000000..ff7f69662
--- /dev/null
+++ b/libc/isystem/__utility/integer_sequence.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/integer_sequence.h"
diff --git a/libc/isystem/__utility/move.h b/libc/isystem/__utility/move.h
new file mode 100644
index 000000000..73351f46e
--- /dev/null
+++ b/libc/isystem/__utility/move.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/move.h"
diff --git a/libc/isystem/__utility/pair.h b/libc/isystem/__utility/pair.h
new file mode 100644
index 000000000..cb75842e7
--- /dev/null
+++ b/libc/isystem/__utility/pair.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/pair.h"
diff --git a/libc/isystem/__utility/piecewise_construct.h b/libc/isystem/__utility/piecewise_construct.h
new file mode 100644
index 000000000..838a91798
--- /dev/null
+++ b/libc/isystem/__utility/piecewise_construct.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/piecewise_construct.h"
diff --git a/libc/isystem/__utility/priority_tag.h b/libc/isystem/__utility/priority_tag.h
new file mode 100644
index 000000000..03113256f
--- /dev/null
+++ b/libc/isystem/__utility/priority_tag.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/priority_tag.h"
diff --git a/libc/isystem/__utility/rel_ops.h b/libc/isystem/__utility/rel_ops.h
new file mode 100644
index 000000000..fb9995be0
--- /dev/null
+++ b/libc/isystem/__utility/rel_ops.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/rel_ops.h"
diff --git a/libc/isystem/__utility/swap.h b/libc/isystem/__utility/swap.h
new file mode 100644
index 000000000..6a8b815a9
--- /dev/null
+++ b/libc/isystem/__utility/swap.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/swap.h"
diff --git a/libc/isystem/__utility/terminate_on_exception.h b/libc/isystem/__utility/terminate_on_exception.h
new file mode 100644
index 000000000..7d5ce26b3
--- /dev/null
+++ b/libc/isystem/__utility/terminate_on_exception.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/terminate_on_exception.h"
diff --git a/libc/isystem/__utility/to_underlying.h b/libc/isystem/__utility/to_underlying.h
new file mode 100644
index 000000000..2d99d1f1d
--- /dev/null
+++ b/libc/isystem/__utility/to_underlying.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/to_underlying.h"
diff --git a/libc/isystem/__utility/unreachable.h b/libc/isystem/__utility/unreachable.h
new file mode 100644
index 000000000..a0876cd42
--- /dev/null
+++ b/libc/isystem/__utility/unreachable.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__utility/unreachable.h"
diff --git a/libc/isystem/__variant/monostate.h b/libc/isystem/__variant/monostate.h
new file mode 100644
index 000000000..b6a59228d
--- /dev/null
+++ b/libc/isystem/__variant/monostate.h
@@ -0,0 +1 @@
+#include "third_party/libcxx/__variant/monostate.h"
diff --git a/libc/isystem/__verbose_abort b/libc/isystem/__verbose_abort
new file mode 100644
index 000000000..7ce36bffb
--- /dev/null
+++ b/libc/isystem/__verbose_abort
@@ -0,0 +1 @@
+#include "third_party/libcxx/__verbose_abort"
diff --git a/libc/isystem/ammintrin.h b/libc/isystem/ammintrin.h
index c29f7c84a..028098a89 100644
--- a/libc/isystem/ammintrin.h
+++ b/libc/isystem/ammintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/ammintrin.h"
-#else
 #include "third_party/intel/ammintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/amxcomplexintrin.h b/libc/isystem/amxcomplexintrin.h
index be8122bd3..b6b9ea7d3 100644
--- a/libc/isystem/amxcomplexintrin.h
+++ b/libc/isystem/amxcomplexintrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/amxcomplexintrin.h"
-#else
 #include "third_party/intel/amxcomplexintrin.internal.h"
-#endif
diff --git a/libc/isystem/amxfp16intrin.h b/libc/isystem/amxfp16intrin.h
index eb25dfc70..6b4043496 100644
--- a/libc/isystem/amxfp16intrin.h
+++ b/libc/isystem/amxfp16intrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/amxfp16intrin.h"
-#else
 #include "third_party/intel/amxfp16intrin.internal.h"
-#endif
diff --git a/libc/isystem/arm_acle.h b/libc/isystem/arm_acle.h
index 70d0a1ed2..5e695146a 100644
--- a/libc/isystem/arm_acle.h
+++ b/libc/isystem/arm_acle.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
-#ifdef __clang__
-#include "third_party/aarch64/clang/arm_acle.h"
-#else
 #include "third_party/aarch64/arm_acle.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_ */
diff --git a/libc/isystem/arm_bf16.h b/libc/isystem/arm_bf16.h
index bbfdfe6ba..8177f26c0 100644
--- a/libc/isystem/arm_bf16.h
+++ b/libc/isystem/arm_bf16.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
-#ifdef __clang__
-#include "third_party/aarch64/clang/arm_bf16.h"
-#else
 #include "third_party/aarch64/arm_bf16.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_ */
diff --git a/libc/isystem/arm_fp16.h b/libc/isystem/arm_fp16.h
index 5c269e35d..2df9b91a2 100644
--- a/libc/isystem/arm_fp16.h
+++ b/libc/isystem/arm_fp16.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
-#ifdef __clang__
-#include "third_party/aarch64/clang/arm_fp16.h"
-#else
 #include "third_party/aarch64/arm_fp16.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_ */
diff --git a/libc/isystem/arm_neon.h b/libc/isystem/arm_neon.h
index 6beff8834..c59b01ae6 100644
--- a/libc/isystem/arm_neon.h
+++ b/libc/isystem/arm_neon.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
-#ifdef __clang__
-#include "third_party/aarch64/clang/arm_neon.h"
-#else
 #include "third_party/aarch64/arm_neon.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_ */
diff --git a/libc/isystem/arm_sve.h b/libc/isystem/arm_sve.h
deleted file mode 100644
index 2e8fc6d18..000000000
--- a/libc/isystem/arm_sve.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
-#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
-#ifdef __clang__
-#include "third_party/aarch64/clang/arm_sve.h"
-#else
-#include "third_party/aarch64/arm_sve.internal.h"
-#endif
-#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_ */
diff --git a/libc/isystem/arm_vector_types.h b/libc/isystem/arm_vector_types.h
deleted file mode 100644
index b2018e69d..000000000
--- a/libc/isystem/arm_vector_types.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
-#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
-#ifdef __clang__
-#include "third_party/aarch64/clang/arm_vector_types.h"
-#else
-#include "third_party/aarch64/arm_vector_types.internal.h"
-#endif
-#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_ */
diff --git a/libc/isystem/avxifmaintrin.h b/libc/isystem/avxifmaintrin.h
index 8b94c5d8e..a93835f7e 100644
--- a/libc/isystem/avxifmaintrin.h
+++ b/libc/isystem/avxifmaintrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/avxifmaintrin.h"
-#else
 #include "third_party/intel/avxifmaintrin.internal.h"
-#endif
diff --git a/libc/isystem/avxneconvertintrin.h b/libc/isystem/avxneconvertintrin.h
index fac905bc6..691504600 100644
--- a/libc/isystem/avxneconvertintrin.h
+++ b/libc/isystem/avxneconvertintrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/avxneconvertintrin.h"
-#else
 #include "third_party/intel/avxneconvertintrin.internal.h"
-#endif
diff --git a/libc/isystem/avxvnniint16intrin.h b/libc/isystem/avxvnniint16intrin.h
index b7cce37b7..fc8c6d0ab 100644
--- a/libc/isystem/avxvnniint16intrin.h
+++ b/libc/isystem/avxvnniint16intrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/avxvnniint16intrin.h"
-#else
 #include "third_party/intel/avxvnniint16intrin.internal.h"
-#endif
diff --git a/libc/isystem/avxvnniint8intrin.h b/libc/isystem/avxvnniint8intrin.h
index e760b646d..ccb746b56 100644
--- a/libc/isystem/avxvnniint8intrin.h
+++ b/libc/isystem/avxvnniint8intrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/avxvnniint8intrin.h"
-#else
 #include "third_party/intel/avxvnniint8intrin.internal.h"
-#endif
diff --git a/libc/isystem/barrier b/libc/isystem/barrier
new file mode 100644
index 000000000..9f7236ad3
--- /dev/null
+++ b/libc/isystem/barrier
@@ -0,0 +1 @@
+#include "third_party/libcxx/barrier"
diff --git a/libc/isystem/clzerointrin.h b/libc/isystem/clzerointrin.h
index 5e9b053d6..5c0be5400 100644
--- a/libc/isystem/clzerointrin.h
+++ b/libc/isystem/clzerointrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/clzerointrin.h"
-#else
 #include "third_party/intel/clzerointrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/cmpccxaddintrin.h b/libc/isystem/cmpccxaddintrin.h
index 10f4e9b7e..48fd2c5db 100644
--- a/libc/isystem/cmpccxaddintrin.h
+++ b/libc/isystem/cmpccxaddintrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/cmpccxaddintrin.h"
-#else
 #include "third_party/intel/cmpccxaddintrin.internal.h"
-#endif
diff --git a/libc/isystem/complex.h b/libc/isystem/complex.h
index 417707176..bd8c4569b 100644
--- a/libc/isystem/complex.h
+++ b/libc/isystem/complex.h
@@ -1,6 +1,6 @@
 #ifndef _COMPLEX_H
 #define _COMPLEX_H
-#include <math.h>
 #include "libc/complex.h"
-#include "libc/imag.h"
+#include "libc/imag.internal.h"
+#include "libc/math.h"
 #endif /* _COMPLEX_H */
diff --git a/libc/isystem/concepts b/libc/isystem/concepts
new file mode 100644
index 000000000..f7d134987
--- /dev/null
+++ b/libc/isystem/concepts
@@ -0,0 +1 @@
+#include "third_party/libcxx/concepts"
diff --git a/libc/isystem/coroutine b/libc/isystem/coroutine
new file mode 100644
index 000000000..e024e6a48
--- /dev/null
+++ b/libc/isystem/coroutine
@@ -0,0 +1 @@
+#include "third_party/libcxx/coroutine"
diff --git a/libc/isystem/cosmo.h b/libc/isystem/cosmo.h
index 5004c0a11..e8f15be72 100644
--- a/libc/isystem/cosmo.h
+++ b/libc/isystem/cosmo.h
@@ -60,6 +60,7 @@
 #include "libc/str/utf16.h"
 #include "libc/sysv/errfuns.h"
 #include "net/http/http.h"
+#include "tool/args/args.h"
 
 #ifdef COSMO_ALREADY_DEFINED
 #undef COSMO_ALREADY_DEFINED
diff --git a/libc/isystem/cuchar b/libc/isystem/cuchar
new file mode 100644
index 000000000..90ac28fd7
--- /dev/null
+++ b/libc/isystem/cuchar
@@ -0,0 +1 @@
+#include "third_party/libcxx/cuchar"
diff --git a/libc/isystem/emmintrin.h b/libc/isystem/emmintrin.h
index 5123aa712..1c670b16a 100644
--- a/libc/isystem/emmintrin.h
+++ b/libc/isystem/emmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/emmintrin.h"
-#else
 #include "third_party/intel/emmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/expected b/libc/isystem/expected
new file mode 100644
index 000000000..02ac281f6
--- /dev/null
+++ b/libc/isystem/expected
@@ -0,0 +1 @@
+#include "third_party/libcxx/expected"
diff --git a/libc/isystem/float.h b/libc/isystem/float.h
index a5cf995a2..c1effda08 100644
--- a/libc/isystem/float.h
+++ b/libc/isystem/float.h
@@ -1,5 +1,5 @@
 #ifndef _FLOAT_H
 #define _FLOAT_H
-#include <math.h>
+#include "libc/math.h"
 #include "libc/runtime/fenv.h"
 #endif /* _FLOAT_H */
diff --git a/libc/isystem/format b/libc/isystem/format
new file mode 100644
index 000000000..5e20f807c
--- /dev/null
+++ b/libc/isystem/format
@@ -0,0 +1 @@
+#include "third_party/libcxx/format"
diff --git a/libc/isystem/immintrin.h b/libc/isystem/immintrin.h
index 72cb67c80..683eb5a7a 100644
--- a/libc/isystem/immintrin.h
+++ b/libc/isystem/immintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/immintrin.h"
-#else
 #include "third_party/intel/immintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/iso646.h b/libc/isystem/iso646.h
index 11e3a77c9..5d203df5f 100644
--- a/libc/isystem/iso646.h
+++ b/libc/isystem/iso646.h
@@ -1,4 +1,4 @@
 #ifndef _ISO646_H
 #define _ISO646_H
-#include "libc/iso646.h"
+#include "libc/iso646.internal.h"
 #endif /* _ISO646_H */
diff --git a/libc/isystem/langinfo.h b/libc/isystem/langinfo.h
index fbc4c3d46..6be085243 100644
--- a/libc/isystem/langinfo.h
+++ b/libc/isystem/langinfo.h
@@ -1,4 +1,6 @@
 #ifndef _LANGINFO_H
 #define _LANGINFO_H
 #include "libc/str/langinfo.h"
+#include "libc/str/locale.h"
+#include "libc/str/nltypes.h"
 #endif /* _LANGINFO_H */
diff --git a/libc/isystem/latch b/libc/isystem/latch
new file mode 100644
index 000000000..07ec09a4c
--- /dev/null
+++ b/libc/isystem/latch
@@ -0,0 +1 @@
+#include "third_party/libcxx/latch"
diff --git a/libc/isystem/mdspan b/libc/isystem/mdspan
new file mode 100644
index 000000000..abfed35d9
--- /dev/null
+++ b/libc/isystem/mdspan
@@ -0,0 +1 @@
+#include "third_party/libcxx/mdspan"
diff --git a/libc/isystem/memory_resource b/libc/isystem/memory_resource
new file mode 100644
index 000000000..d875d7b72
--- /dev/null
+++ b/libc/isystem/memory_resource
@@ -0,0 +1 @@
+#include "third_party/libcxx/memory_resource"
diff --git a/libc/isystem/mm_malloc.h b/libc/isystem/mm_malloc.h
index a81913524..7634fa6de 100644
--- a/libc/isystem/mm_malloc.h
+++ b/libc/isystem/mm_malloc.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/mm_malloc.h"
-#else
 #include "third_party/intel/mm_malloc.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */
diff --git a/libc/isystem/mmintrin.h b/libc/isystem/mmintrin.h
index f4fbbe9d3..af089e7c6 100644
--- a/libc/isystem/mmintrin.h
+++ b/libc/isystem/mmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/mmintrin.h"
-#else
 #include "third_party/intel/mmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/mwaitxintrin.h b/libc/isystem/mwaitxintrin.h
index aa5d8ef88..42a5f3e72 100644
--- a/libc/isystem/mwaitxintrin.h
+++ b/libc/isystem/mwaitxintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/mwaitxintrin.h"
-#else
 #include "third_party/intel/mwaitxintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/nmmintrin.h b/libc/isystem/nmmintrin.h
index f2fcea020..0a5ef7c98 100644
--- a/libc/isystem/nmmintrin.h
+++ b/libc/isystem/nmmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/nmmintrin.h"
-#else
 #include "third_party/intel/nmmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/numbers b/libc/isystem/numbers
new file mode 100644
index 000000000..2e2b78a74
--- /dev/null
+++ b/libc/isystem/numbers
@@ -0,0 +1 @@
+#include "third_party/libcxx/numbers"
diff --git a/libc/isystem/pmmintrin.h b/libc/isystem/pmmintrin.h
index 5c557dc55..21e098b7c 100644
--- a/libc/isystem/pmmintrin.h
+++ b/libc/isystem/pmmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/pmmintrin.h"
-#else
 #include "third_party/intel/pmmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/popcntintrin.h b/libc/isystem/popcntintrin.h
index 9583b31e4..632667eb0 100644
--- a/libc/isystem/popcntintrin.h
+++ b/libc/isystem/popcntintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/popcntintrin.h"
-#else
 #include "third_party/intel/popcntintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/prfchiintrin.h b/libc/isystem/prfchiintrin.h
index 059e1e0db..f76698468 100644
--- a/libc/isystem/prfchiintrin.h
+++ b/libc/isystem/prfchiintrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/prfchiintrin.h"
-#else
 #include "third_party/intel/prfchiintrin.internal.h"
-#endif
diff --git a/libc/isystem/ranges b/libc/isystem/ranges
new file mode 100644
index 000000000..d2d911734
--- /dev/null
+++ b/libc/isystem/ranges
@@ -0,0 +1 @@
+#include "third_party/libcxx/ranges"
diff --git a/libc/isystem/raointintrin.h b/libc/isystem/raointintrin.h
index e9486ee29..4f41b106a 100644
--- a/libc/isystem/raointintrin.h
+++ b/libc/isystem/raointintrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/raointintrin.h"
-#else
 #include "third_party/intel/raointintrin.internal.h"
-#endif
diff --git a/libc/isystem/semaphore b/libc/isystem/semaphore
new file mode 100644
index 000000000..757c40616
--- /dev/null
+++ b/libc/isystem/semaphore
@@ -0,0 +1 @@
+#include "third_party/libcxx/semaphore"
diff --git a/libc/isystem/sgxintrin.h b/libc/isystem/sgxintrin.h
index e44b52753..0ba872436 100644
--- a/libc/isystem/sgxintrin.h
+++ b/libc/isystem/sgxintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/sgxintrin.h"
-#else
 #include "third_party/intel/sgxintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/sha512intrin.h b/libc/isystem/sha512intrin.h
index 1b1ed0d39..f364a7e5f 100644
--- a/libc/isystem/sha512intrin.h
+++ b/libc/isystem/sha512intrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/sha512intrin.h"
-#else
 #include "third_party/intel/sha512intrin.internal.h"
-#endif
diff --git a/libc/isystem/sm3intrin.h b/libc/isystem/sm3intrin.h
index 80271fb9b..2f35eeba6 100644
--- a/libc/isystem/sm3intrin.h
+++ b/libc/isystem/sm3intrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/sm3intrin.h"
-#else
 #include "third_party/intel/sm3intrin.internal.h"
-#endif
diff --git a/libc/isystem/sm4intrin.h b/libc/isystem/sm4intrin.h
index 2c9100603..91edac356 100644
--- a/libc/isystem/sm4intrin.h
+++ b/libc/isystem/sm4intrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/sm4intrin.h"
-#else
 #include "third_party/intel/sm4intrin.internal.h"
-#endif
diff --git a/libc/isystem/smmintrin.h b/libc/isystem/smmintrin.h
index 4fdb44f60..fd7d9b648 100644
--- a/libc/isystem/smmintrin.h
+++ b/libc/isystem/smmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/smmintrin.h"
-#else
 #include "third_party/intel/smmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/source_location b/libc/isystem/source_location
new file mode 100644
index 000000000..19b35c368
--- /dev/null
+++ b/libc/isystem/source_location
@@ -0,0 +1 @@
+#include "third_party/libcxx/source_location"
diff --git a/libc/isystem/stdalign.h b/libc/isystem/stdalign.h
index 9aeebe101..16874814a 100644
--- a/libc/isystem/stdalign.h
+++ b/libc/isystem/stdalign.h
@@ -1,4 +1,4 @@
 #ifndef _STDALIGN_H
 #define _STDALIGN_H
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #endif /* _STDALIGN_H */
diff --git a/libc/isystem/sys/poll.h b/libc/isystem/sys/poll.h
index df9126fb7..98177f98c 100644
--- a/libc/isystem/sys/poll.h
+++ b/libc/isystem/sys/poll.h
@@ -1,7 +1,5 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_SYS_POLL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_SYS_POLL_H_
-#include "libc/calls/weirdtypes.h"
 #include "libc/sock/sock.h"
-#include "libc/sock/struct/pollfd.h"
 #include "libc/sysv/consts/poll.h"
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_SYS_POLL_H_ */
diff --git a/libc/isystem/sys/vfs.h b/libc/isystem/sys/vfs.h
index 2904d0feb..9d8ac9412 100644
--- a/libc/isystem/sys/vfs.h
+++ b/libc/isystem/sys/vfs.h
@@ -1,5 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_SYS_VFS_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_SYS_VFS_H_
 #include "libc/calls/struct/statfs.h"
-#include "libc/sysv/consts/st.h"
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_SYS_VFS_H_ */
diff --git a/libc/isystem/tgmath.h b/libc/isystem/tgmath.h
index 8721bb944..28d124486 100644
--- a/libc/isystem/tgmath.h
+++ b/libc/isystem/tgmath.h
@@ -1,7 +1,7 @@
 #ifndef _TGMATH_H
 #define _TGMATH_H
 #include "libc/complex.h"
-#include "libc/imag.h"
+#include "libc/imag.internal.h"
 #include "libc/math.h"
 #if __STDC_VERSION__ + 0 >= 201112
 
diff --git a/libc/isystem/tmmintrin.h b/libc/isystem/tmmintrin.h
index 952e63841..d1279467e 100644
--- a/libc/isystem/tmmintrin.h
+++ b/libc/isystem/tmmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/tmmintrin.h"
-#else
 #include "third_party/intel/tmmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/usermsrintrin.h b/libc/isystem/usermsrintrin.h
index d996e157c..85a8d8130 100644
--- a/libc/isystem/usermsrintrin.h
+++ b/libc/isystem/usermsrintrin.h
@@ -1,5 +1 @@
-#ifdef __clang__
-#include "third_party/intel/clang/usermsrintrin.h"
-#else
 #include "third_party/intel/usermsrintrin.internal.h"
-#endif
diff --git a/libc/isystem/windowsesque.h b/libc/isystem/windowsesque.h
index 4b27c516c..f228173de 100644
--- a/libc/isystem/windowsesque.h
+++ b/libc/isystem/windowsesque.h
@@ -12,7 +12,6 @@
 #include "libc/nt/files.h"
 #include "libc/nt/ipc.h"
 #include "libc/nt/memory.h"
-#include "libc/nt/nls.h"
 #include "libc/nt/paint.h"
 #include "libc/nt/process.h"
 #include "libc/nt/registry.h"
@@ -1421,15 +1420,6 @@
 #define HKEY_CURRENT_CONFIG kNtHkeyCurrentConfig
 #define HKEY_DYN_DATA kNtHkeyDynData
 #define HKEY_CURRENT_USER_LOCAL_SETTINGS kNtHkeyCurrentUserLocalSettings
-#define KEY_QUERY_VALUE kNtKeyQueryValue
-#define KEY_SET_VALUE kNtKeySetValue
-#define KEY_CREATE_SUB_KEY kNtKeyCreateSubKey
-#define KEY_ENUMERATE_SUB_KEYS kNtKeyEnumerateSubKeys
-#define KEY_NOTIFY kNtKeyNotify
-#define KEY_CREATE_LINK kNtKeyCreateLink
-#define KEY_WOW64_32KEY kNtWow6432Key
-#define KEY_WOW64_64KEY kNtWow6464Key
-#define KEY_WOW64_RES kNtWow64Res
 #define KEY_READ kNtKeyRead
 #define KEY_WRITE kNtKeyWrite
 #define KEY_EXECUTE kNtKeyExecute
@@ -4301,13 +4291,6 @@
 #define MAKE_HRESULT(sev,fac,code) ((HRESULT) (((unsigned long)(sev)<<31) | ((unsigned long)(fac)<<16) | ((unsigned long)(code))) )
 #define MAKE_SCODE(sev,fac,code) ((SCODE) (((unsigned long)(sev)<<31) | ((unsigned long)(fac)<<16) | ((unsigned long)(code))) )
 
-#define CP_ACP        0
-#define CP_OEMCP      1
-#define CP_MACCP      2
-#define CP_THREAD_ACP 3
-#define CP_SYMBOL     42
-
-#define CP_UTF7 65000
 #define CP_UTF8 65001
 
 #endif /* COSMOPOLITAN_LIBC_COMPAT_INCLUDE_WINDOWS_H_ */
diff --git a/libc/isystem/wmmintrin.h b/libc/isystem/wmmintrin.h
index 15ed4a9fe..8c4f60e00 100644
--- a/libc/isystem/wmmintrin.h
+++ b/libc/isystem/wmmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/wmmintrin.h"
-#else
 #include "third_party/intel/wmmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/x86intrin.h b/libc/isystem/x86intrin.h
index da763450b..fb8c3f971 100644
--- a/libc/isystem/x86intrin.h
+++ b/libc/isystem/x86intrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/x86intrin.h"
-#else
 #include "third_party/intel/x86intrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */
diff --git a/libc/isystem/xmmintrin.h b/libc/isystem/xmmintrin.h
index 3f528bcb3..594e650fd 100644
--- a/libc/isystem/xmmintrin.h
+++ b/libc/isystem/xmmintrin.h
@@ -1,8 +1,4 @@
 #ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
-#ifdef __clang__
-#include "third_party/intel/clang/xmmintrin.h"
-#else
 #include "third_party/intel/xmmintrin.internal.h"
-#endif
 #endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */
diff --git a/libc/log/addr2linepath.c b/libc/log/addr2linepath.c
index 058b52d80..eb656175e 100644
--- a/libc/log/addr2linepath.c
+++ b/libc/log/addr2linepath.c
@@ -40,7 +40,7 @@ void GetAddr2linePathInit(void) {
   char *res;
   int e = errno;
   const char *env, *cmd, *path;
-  if ((env = secure_getenv("ADDR2LINE"))) {
+  if ((env = getenv("ADDR2LINE"))) {
     cmd = env;
     path = env;
   } else {
diff --git a/libc/log/backtrace3.c b/libc/log/backtrace3.c
index 2714e7d33..a49240bce 100644
--- a/libc/log/backtrace3.c
+++ b/libc/log/backtrace3.c
@@ -24,13 +24,11 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/weaken.h"
 #include "libc/log/backtrace.internal.h"
-#include "libc/macros.h"
-#include "libc/mem/alloca.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/gc.internal.h"
 #include "libc/nexgen32e/stackframe.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
-#include "libc/runtime/stack.h"
 #include "libc/runtime/symbols.internal.h"
 #include "libc/str/str.h"
 #include "libc/thread/thread.h"
@@ -48,13 +46,13 @@
  * @param st is open symbol table for current executable
  * @return -1 w/ errno if error happened
  */
-int PrintBacktraceUsingSymbols(int fd, const struct StackFrame *bp,
-                               struct SymbolTable *st) {
+dontinstrument int PrintBacktraceUsingSymbols(int fd,
+                                              const struct StackFrame *bp,
+                                              struct SymbolTable *st) {
   size_t gi;
-  char *cxxbuf;
   intptr_t addr;
   const char *name;
-  int cxxbufsize = 0;
+  char cxxbuf[3000];
   int i, symbol, addend;
   struct Garbages *garbage;
   const struct StackFrame *frame;
@@ -93,25 +91,14 @@ int PrintBacktraceUsingSymbols(int fd, const struct StackFrame *bp,
       symbol = 0;
       addend = 0;
     }
-    name = __get_symbol_name(st, symbol);
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Walloca-larger-than="
-    // decipher c++ symbols if there's enough stack memory
-    // stack size requirement assumes max_depth's still 20
-    if (_weaken(__demangle) &&    //
-        _weaken(__is_mangled) &&  //
-        _weaken(__is_mangled)(name)) {
-      if (!cxxbufsize)
-        if ((cxxbufsize = __get_safe_size(8192, 8192)) >= 512) {
-          cxxbuf = alloca(cxxbufsize);
-          CheckLargeStackAllocation(cxxbuf, sizeof(cxxbufsize));
-        }
-      if (cxxbufsize >= 512)
-        if (_weaken(__demangle)(cxxbuf, name, cxxbufsize) != -1)
-          name = cxxbuf;
+    if ((name = __get_symbol_name(st, symbol)) &&
+        (_weaken(__is_mangled) && _weaken(__is_mangled)(name))) {
+      _weaken(__demangle)(cxxbuf, name, sizeof(cxxbuf));
+      kprintf("%012lx %lx %s%+d\n", frame, addr, cxxbuf, addend);
+      name = cxxbuf;
+    } else {
+      kprintf("%012lx %lx %s%+d\n", frame, addr, name, addend);
     }
-#pragma GCC pop_options
-    kprintf("%012lx %lx %s%+d\n", frame, addr, name, addend);
   }
   return 0;
 }
diff --git a/libc/log/check.h b/libc/log/check.h
index fa482304b..f4fda0c01 100644
--- a/libc/log/check.h
+++ b/libc/log/check.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_LIBC_LOG_CHECK_H_
 #define COSMOPOLITAN_LIBC_LOG_CHECK_H_
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 COSMOPOLITAN_C_START_
 
 #define CHECK(X, ...)         __CHK(ne, !=, false, "false", !!(X), #X, "" __VA_ARGS__)
diff --git a/libc/log/countbranch.h b/libc/log/countbranch.h
index 7476e7b41..403a0c98e 100644
--- a/libc/log/countbranch.h
+++ b/libc/log/countbranch.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_LOG_COUNTBRANCH_H_
 #define COSMOPOLITAN_LIBC_LOG_COUNTBRANCH_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdbool.h"
 COSMOPOLITAN_C_START_
 
diff --git a/libc/log/countbranch_data.S b/libc/log/countbranch_data.S
index a0063f7a4..eaf3dc078 100644
--- a/libc/log/countbranch_data.S
+++ b/libc/log/countbranch_data.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.yoink	countbranch_report
 
diff --git a/libc/log/countbranch_report.c b/libc/log/countbranch_report.c
index 2e1c88158..a152dbc07 100644
--- a/libc/log/countbranch_report.c
+++ b/libc/log/countbranch_report.c
@@ -19,7 +19,7 @@
 #include "libc/calls/calls.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/log/countbranch.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/alg.h"
 #include "libc/runtime/runtime.h"
diff --git a/libc/log/countexpr.h b/libc/log/countexpr.h
index 84e0be7fc..51104bcbe 100644
--- a/libc/log/countexpr.h
+++ b/libc/log/countexpr.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_LOG_COUNTEXPR_H_
 #define COSMOPOLITAN_LIBC_LOG_COUNTEXPR_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/bench.h"
 COSMOPOLITAN_C_START_
 
diff --git a/libc/log/countexpr_data.S b/libc/log/countexpr_data.S
index 2546e06a1..72db2252f 100644
--- a/libc/log/countexpr_data.S
+++ b/libc/log/countexpr_data.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.yoink	countexpr_report
 
diff --git a/libc/log/countexpr_report.c b/libc/log/countexpr_report.c
index e8f151d7f..75be60857 100644
--- a/libc/log/countexpr_report.c
+++ b/libc/log/countexpr_report.c
@@ -21,7 +21,7 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
 #include "libc/log/countexpr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdckdint.h"
diff --git a/libc/log/gdb.h b/libc/log/gdb.h
index b7d29fc0c..26e252a7c 100644
--- a/libc/log/gdb.h
+++ b/libc/log/gdb.h
@@ -3,7 +3,7 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/rusage.h"
 #include "libc/dce.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/sysv/consts/nr.h"
 #include "libc/sysv/consts/w.h"
 COSMOPOLITAN_C_START_
diff --git a/libc/log/libfatal.internal.h b/libc/log/libfatal.internal.h
index bd10073a1..4c55269c4 100644
--- a/libc/log/libfatal.internal.h
+++ b/libc/log/libfatal.internal.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_LOG_LIBFATAL_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_LOG_LIBFATAL_INTERNAL_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 COSMOPOLITAN_C_START_
 
 forceinline unsigned long __strlen(const char *s) {
diff --git a/libc/log/log.h b/libc/log/log.h
index 7f2498cc4..d8e62f7ea 100644
--- a/libc/log/log.h
+++ b/libc/log/log.h
@@ -48,6 +48,8 @@ void PrintGarbage(void);
 void PrintGarbageNumeric(FILE *);
 void PrintWindowsMemory(const char *, size_t);
 
+#ifndef __STRICT_ANSI__
+
 #define _LOG_UNLIKELY(x) __builtin_expect(!!(x), 0)
 
 extern unsigned __log_level; /* log level for runtime check */
@@ -243,6 +245,7 @@ void vffatalf(ARGS, va_list) asm("vflogf") ATTRV relegated wontreturn libcesque;
 #undef ATTR
 #undef ATTRV
 
+#endif /* __STRICT_ANSI__ */
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_LOG_LOG_H_ */
 #endif /* _COSMO_SOURCE */
diff --git a/libc/log/oncrash_amd64.c b/libc/log/oncrash_amd64.c
index f2726bb19..e55cfa7e5 100644
--- a/libc/log/oncrash_amd64.c
+++ b/libc/log/oncrash_amd64.c
@@ -20,7 +20,6 @@
 #include "libc/atomic.h"
 #include "libc/calls/blockcancel.internal.h"
 #include "libc/calls/calls.h"
-#include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/siginfo.h"
@@ -42,7 +41,7 @@
 #include "libc/log/gdb.h"
 #include "libc/log/internal.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/alloca.h"
 #include "libc/nexgen32e/stackframe.h"
@@ -243,10 +242,6 @@ static relegated void ShowCrashReport(int err, int sig, siginfo_t *si,
   }
   kprintf("\n");
   __print_maps(15);
-  if (g_fds.n)
-    kprintf("\n");
-  __printfds(g_fds.p, g_fds.n);
-  kprintf("\n");
   if (__argv)
     for (i = 0; i < __argc; ++i)
       kprintf("%s ", __argv[i]);
diff --git a/libc/log/oncrash_arm64.c b/libc/log/oncrash_arm64.c
index c91f39a92..83a347a75 100644
--- a/libc/log/oncrash_arm64.c
+++ b/libc/log/oncrash_arm64.c
@@ -41,13 +41,12 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/log/internal.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/stackframe.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/runtime/symbols.internal.h"
-#include "libc/runtime/syslib.internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/auxv.h"
@@ -267,8 +266,7 @@ static relegated void __oncrash_impl(int sig, siginfo_t *si, ucontext_t *ctx) {
         if (j)
           Append(b, " ");
         Append(b, "%s%016lx%s x%d%s", ColorRegister(r),
-               ((uint64_t *)ctx->uc_mcontext.regs)[r], reset, r,
-               r == 8 || r == 9 ? " " : "");
+               ctx->uc_mcontext.regs[r], reset, r, r == 8 || r == 9 ? " " : "");
       }
       Append(b, "\n");
     }
@@ -397,6 +395,12 @@ relegated void __oncrash(int sig, siginfo_t *si, void *arg) {
   SpinLock(&lock);
   __oncrash_impl(sig, si, arg);
 
+  // unlike amd64, the instruction pointer on arm64 isn't advanced past
+  // the debugger breakpoint instruction automatically. we need this so
+  // execution can resume after __builtin_trap().
+  if (arg && sig == SIGTRAP)
+    ((ucontext_t *)arg)->uc_mcontext.PC += 4;
+
   // ensure execution doesn't resume for anything but SIGTRAP / SIGQUIT
   if (arg && sig != SIGTRAP && sig != SIGQUIT) {
     if (!IsXnu()) {
diff --git a/libc/log/printwindowsmemory.c b/libc/log/printwindowsmemory.c
index 9353b00ee..ed9a4a145 100644
--- a/libc/log/printwindowsmemory.c
+++ b/libc/log/printwindowsmemory.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/memflags.h"
 #include "libc/nt/memory.h"
 #include "libc/nt/struct/memorybasicinformation.h"
@@ -33,8 +33,8 @@ static const struct DescribeFlags kNtMemState[] = {
 };
 
 static const char *DescribeNtMemState(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kNtMemState, ARRAYLEN(kNtMemState), "kNtMem",
-                        x);
+  return DescribeFlags(buf, 64, kNtMemState, ARRAYLEN(kNtMemState), "kNtMem",
+                       x);
 }
 
 static const struct DescribeFlags kNtMemType[] = {
@@ -44,7 +44,7 @@ static const struct DescribeFlags kNtMemType[] = {
 };
 
 static const char *DescribeNtMemType(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kNtMemType, ARRAYLEN(kNtMemType), "kNtMem", x);
+  return DescribeFlags(buf, 64, kNtMemType, ARRAYLEN(kNtMemType), "kNtMem", x);
 }
 
 /**
@@ -77,7 +77,7 @@ void PrintWindowsMemory(const char *high, size_t size) {
             mi.AllocationBase, mi.BaseAddress, b[0],
             DescribeNtMemState(b[1], mi.State),
             DescribeNtMemType(b[2], mi.Type),
-            _DescribeNtPageFlags(b[3], mi.AllocationProtect),
-            _DescribeNtPageFlags(b[4], mi.Protect), stop);
+            (DescribeNtPageFlags)(b[3], mi.AllocationProtect),
+            (DescribeNtPageFlags)(b[4], mi.Protect), stop);
   }
 }
diff --git a/libc/log/showcrashreports.c b/libc/log/showcrashreports.c
index 7e3340e64..ff7ea1132 100644
--- a/libc/log/showcrashreports.c
+++ b/libc/log/showcrashreports.c
@@ -82,7 +82,11 @@ void ShowCrashReports(void) {
   ss.ss_sp = crashstack;
   unassert(!sigaltstack(&ss, 0));
   InstallCrashHandler(SIGQUIT, 0);
+#ifdef __x86_64__
   InstallCrashHandler(SIGTRAP, 0);
+#else
+  InstallCrashHandler(SIGTRAP, 0);
+#endif
   InstallCrashHandler(SIGFPE, 0);
   InstallCrashHandler(SIGILL, 0);
   InstallCrashHandler(SIGBUS, 0);
diff --git a/libc/log/showcrashreportsearly.S b/libc/log/showcrashreportsearly.S
index dfc41fe19..757414263 100644
--- a/libc/log/showcrashreportsearly.S
+++ b/libc/log/showcrashreportsearly.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Normally we call ShowCrashReports() from main, but if
 //	there's a crash in a constructor, this will help with
diff --git a/libc/log/watch-hook.S b/libc/log/watch-hook.S
index c8b99ba55..e73da1dc8 100644
--- a/libc/log/watch-hook.S
+++ b/libc/log/watch-hook.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 __watch_hook:
 	push	%rbp
diff --git a/libc/log/watch.c b/libc/log/watch.c
index 794242ca2..1cc96f767 100644
--- a/libc/log/watch.c
+++ b/libc/log/watch.c
@@ -33,14 +33,14 @@ static char __watch_last[4096];
 
 void __watch_hook(void);
 
-dontinstrument static inline void Copy(char *p, char *q, size_t n) {
+static dontinstrument inline void Copy(char *p, char *q, size_t n) {
   size_t i;
   for (i = 0; i < n; ++i) {
     p[i] = q[i];
   }
 }
 
-dontinstrument static inline int Cmp(char *p, char *q, size_t n) {
+static dontinstrument inline int Cmp(char *p, char *q, size_t n) {
   if (n == 8)
     return READ64LE(p) != READ64LE(q);
   if (n == 4)
diff --git a/libc/mach.h b/libc/mach.internal.h
similarity index 100%
rename from libc/mach.h
rename to libc/mach.internal.h
diff --git a/libc/macho.h b/libc/macho.internal.h
similarity index 100%
rename from libc/macho.h
rename to libc/macho.internal.h
diff --git a/libc/macros.h b/libc/macros.internal.h
similarity index 93%
rename from libc/macros.h
rename to libc/macros.internal.h
index 257007a84..cf00d0364 100644
--- a/libc/macros.h
+++ b/libc/macros.internal.h
@@ -158,60 +158,6 @@
 	.weak	\canonical
 .endm
 
-.macro	beg
-	.cfi_startproc
-.endm
-
-.macro	pro
-#if defined(__x86_64__)
-	push	%rbp
-	.cfi_adjust_cfa_offset 8
-	.cfi_rel_offset %rbp,0
-	mov	%rsp,%rbp
-	.cfi_def_cfa_register %rbp
-#elif defined(__aarch64__)
-	stp	x29,x30,[sp,-16]!
-	mov	x29,sp
-	.cfi_adjust_cfa_offset 16
-	.cfi_rel_offset x29,0
-	.cfi_rel_offset x30,8
-#else
-#error "unsupported architecture"
-#endif
-.endm
-
-.macro	epi
-#if defined(__x86_64__)
-	.cfi_def_cfa_register %rsp
-	leave
-	.cfi_adjust_cfa_offset -8
-	.cfi_restore %rbp
-#elif defined(__aarch64__)
-	ldp	x29,x30,[sp],#16
-	.cfi_adjust_cfa_offset -16
-	.cfi_restore x30
-	.cfi_restore x29
-#else
-#error "unsupported architecture"
-#endif
-.endm
-
-.macro	end
-	.cfi_endproc
-.endm
-
-.macro	cpush	reg:req
-	push	\reg
-	.cfi_adjust_cfa_offset 8
-	.cfi_rel_offset \reg,0
-.endm
-
-.macro	cpop	reg:req
-	pop	\reg
-	.cfi_adjust_cfa_offset -8
-	.cfi_restore \reg
-.endm
-
 #ifdef __aarch64__
 .macro	jmp	dest:req
 	b	\dest
@@ -345,6 +291,21 @@
 	.balign	4
 .endm
 
+//	Loads address of errno into %rcx
+.macro	.errno
+	call	__errno_location
+.endm
+
+//	Post-Initialization Read-Only (PIRO) BSS section.
+//	@param	ss is an optional string, for control image locality
+.macro	.piro	ss
+ .ifnb	\ss
+	.section .piro.sort.bss.\ss,"aw",@nobits
+ .else
+	.section .piro.bss,"aw",@nobits
+ .endif
+.endm
+
 //	Helpers for Cosmopolitan _init() amalgamation magic.
 //	@param	name should be consistent across macros for a module
 //	@see	libc/runtime/_init.S
diff --git a/libc/mem/BUILD.mk b/libc/mem/BUILD.mk
index 52f2ff9f4..438837a7a 100644
--- a/libc/mem/BUILD.mk
+++ b/libc/mem/BUILD.mk
@@ -42,8 +42,7 @@ $(LIBC_MEM_A_OBJS): private				\
 		COPTS +=				\
 			-fno-sanitize=all		\
 			-Wframe-larger-than=4096	\
-			-Walloca-larger-than=4096	\
-			-fexceptions
+			-Walloca-larger-than=4096
 
 o/$(MODE)/libc/mem/asan.o: private			\
 		CFLAGS +=				\
diff --git a/libc/mem/alg.h b/libc/mem/alg.h
index 8a887a524..ae519f76f 100644
--- a/libc/mem/alg.h
+++ b/libc/mem/alg.h
@@ -4,10 +4,14 @@ COSMOPOLITAN_C_START_
 
 void *bsearch(const void *, const void *, size_t, size_t,
               int (*)(const void *, const void *)) paramsnonnull() nosideeffect;
-void qsort3(void *, size_t, size_t, int (*)(const void *, const void *))
-    paramsnonnull();
-void qsort(void *, size_t, size_t, int (*)(const void *, const void *))
-    paramsnonnull();
+void *bsearch_r(const void *, const void *, size_t, size_t,
+                int (*)(const void *, const void *, void *), void *)
+    paramsnonnull((1, 2, 5)) nosideeffect;
+void djbsort(int32_t *, size_t) libcesque;
+void qsort3(void *, size_t, size_t,
+            int (*)(const void *, const void *)) libcesque paramsnonnull();
+void qsort(void *, size_t, size_t,
+           int (*)(const void *, const void *)) libcesque paramsnonnull();
 void qsort_r(void *, size_t, size_t,
              int (*)(const void *, const void *, void *), void *)
     paramsnonnull((1, 4));
@@ -21,12 +25,8 @@ int mergesort(void *, size_t, size_t, int (*)(const void *, const void *));
 int mergesort_r(void *, size_t, size_t,
                 int (*)(const void *, const void *, void *), void *);
 
-#ifdef _COSMO_SOURCE
-void djbsort(int32_t *, size_t) libcesque;
 int radix_sort_int32(int32_t *, size_t) libcesque;
 int radix_sort_int64(int64_t *, size_t) libcesque;
-double levenshtein(const char *, const char *) libcesque;
-#endif
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_ALG_ALG_H_ */
diff --git a/libc/mem/aligned_alloc.c b/libc/mem/aligned_alloc.c
index 27090f73e..3432bfbc7 100644
--- a/libc/mem/aligned_alloc.c
+++ b/libc/mem/aligned_alloc.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 
 /**
diff --git a/libc/mem/bisect.internal.h b/libc/mem/bisect.internal.h
new file mode 100644
index 000000000..2365f82cd
--- /dev/null
+++ b/libc/mem/bisect.internal.h
@@ -0,0 +1,31 @@
+#ifndef COSMOPOLITAN_LIBC_ALG_BISECT_H_
+#define COSMOPOLITAN_LIBC_ALG_BISECT_H_
+COSMOPOLITAN_C_START_
+
+forceinline void *bisect(const void *k, const void *data, size_t n, size_t size,
+                         int cmp(const void *a, const void *b, void *arg),
+                         void *arg) {
+  int c;
+  const char *p;
+  ssize_t m, l, r;
+  if (n) {
+    l = 0;
+    r = n - 1;
+    p = data;
+    while (l <= r) {
+      m = (l & r) + ((l ^ r) >> 1);
+      c = cmp(k, p + m * size, arg);
+      if (c > 0) {
+        l = m + 1;
+      } else if (c < 0) {
+        r = m - 1;
+      } else {
+        return (char *)p + m * size;
+      }
+    }
+  }
+  return NULL;
+}
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_ALG_BISECT_H_ */
diff --git a/libc/mem/bsearch.c b/libc/mem/bsearch.c
new file mode 100644
index 000000000..5bfc7cb82
--- /dev/null
+++ b/libc/mem/bsearch.c
@@ -0,0 +1,29 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/mem/alg.h"
+#include "libc/mem/bisect.internal.h"
+
+/**
+ * Searches sorted array for exact item in logarithmic time.
+ * @see bsearch_r()
+ */
+void *bsearch(const void *key, const void *base, size_t nmemb, size_t size,
+              int cmp(const void *a, const void *b)) {
+  return bisect(key, base, nmemb, size, (void *)cmp, NULL);
+}
diff --git a/libc/mem/bsearch_r.c b/libc/mem/bsearch_r.c
new file mode 100644
index 000000000..832d79edd
--- /dev/null
+++ b/libc/mem/bsearch_r.c
@@ -0,0 +1,29 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/mem/alg.h"
+#include "libc/mem/bisect.internal.h"
+
+/**
+ * Searches sorted array for exact item in logarithmic time.
+ * @see bsearch()
+ */
+void *bsearch_r(const void *key, const void *base, size_t nmemb, size_t size,
+                int cmp(const void *a, const void *b, void *arg), void *arg) {
+  return bisect(key, base, nmemb, size, cmp, arg);
+}
diff --git a/libc/mem/calloc.c b/libc/mem/calloc.c
index d70aefd3e..df578353c 100644
--- a/libc/mem/calloc.c
+++ b/libc/mem/calloc.c
@@ -19,8 +19,6 @@
 #include "libc/mem/mem.h"
 #include "third_party/dlmalloc/dlmalloc.h"
 
-__static_yoink("free");
-
 /**
  * Allocates n * itemsize bytes, initialized to zero.
  *
@@ -33,3 +31,4 @@ __static_yoink("free");
 void *calloc(size_t n, size_t itemsize) {
   return dlcalloc(n, itemsize);
 }
+
diff --git a/libc/mem/leaks.c b/libc/mem/leaks.c
index 97febe422..8c7ac7f9d 100644
--- a/libc/mem/leaks.c
+++ b/libc/mem/leaks.c
@@ -16,19 +16,16 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/mem/leaks.h"
 #include "libc/cxxabi.h"
 #include "libc/intrin/cxaatexit.h"
 #include "libc/intrin/dll.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/typedef/imagetlscallback.h"
 #include "libc/runtime/runtime.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
-#include "libc/thread/tls.h"
 
 #define LEAK_CONTAINER(e) DLL_CONTAINER(struct Leak, elem, e)
 
@@ -40,12 +37,12 @@ struct Leak {
 static int leak_count;
 static struct Dll *leaks;
 static struct Dll *freaks;
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t lock;
 
 void __may_leak(void *alloc) {
   if (!alloc)
     return;
-  _pthread_mutex_lock(&lock);
+  pthread_mutex_lock(&lock);
   if (dll_is_empty(freaks)) {
     int g = __gransize;
     struct Leak *p = _mapanon(g);
@@ -59,7 +56,7 @@ void __may_leak(void *alloc) {
   LEAK_CONTAINER(e)->alloc = alloc;
   dll_remove(&freaks, e);
   dll_make_first(&leaks, e);
-  _pthread_mutex_unlock(&lock);
+  pthread_mutex_unlock(&lock);
 }
 
 static void visitor(void *start, void *end, size_t used_bytes, void *arg) {
@@ -79,7 +76,7 @@ void CheckForMemoryLeaks(void) {
 
   // validate usage of this api
   if (_weaken(_pthread_decimate))
-    _weaken(_pthread_decimate)(kPosixThreadZombie);
+    _weaken(_pthread_decimate)(false);
   if (!pthread_orphan_np())
     kprintf("warning: called CheckForMemoryLeaks() from non-orphaned thread\n");
 
@@ -90,29 +87,8 @@ void CheckForMemoryLeaks(void) {
   // check for leaks
   malloc_inspect_all(visitor, 0);
   if (leak_count) {
-    kprintf("       you forgot to call free %'d time%s\n", leak_count,
+    kprintf("loser: you forgot to call free %'d time%s\n", leak_count,
             leak_count == 1 ? "" : "s");
     _exit(73);
   }
 }
-
-static bool IsHoldingLocks(struct CosmoTib *tib) {
-  for (int i = 0; i < ARRAYLEN(tib->tib_locks); ++i)
-    if (tib->tib_locks[i])
-      return true;
-  return false;
-}
-
-/**
- * Aborts if any locks are held by calling thread.
- */
-void AssertNoLocksAreHeld(void) {
-  struct CosmoTib *tib = __get_tls();
-  if (IsHoldingLocks(tib)) {
-    kprintf("error: the following locks are held by this thread:\n");
-    for (int i = 0; i < ARRAYLEN(tib->tib_locks); ++i)
-      if (tib->tib_locks[i])
-        kprintf("\t- %t\n", tib->tib_locks[i]);
-    _Exit(74);
-  }
-}
diff --git a/libc/mem/leaks.h b/libc/mem/leaks.h
index f77c609d2..dcf2ad464 100644
--- a/libc/mem/leaks.h
+++ b/libc/mem/leaks.h
@@ -4,7 +4,6 @@
 COSMOPOLITAN_C_START_
 
 void CheckForMemoryLeaks(void) libcesque;
-void AssertNoLocksAreHeld(void) libcesque;
 
 /**
  * Declares that allocation needn't be freed.
diff --git a/libc/mem/malloc.c b/libc/mem/malloc.c
index 0d3793cf9..043a41aac 100644
--- a/libc/mem/malloc.c
+++ b/libc/mem/malloc.c
@@ -19,8 +19,6 @@
 #include "libc/mem/mem.h"
 #include "third_party/dlmalloc/dlmalloc.h"
 
-__static_yoink("free");
-
 /**
  * Allocates uninitialized memory.
  *
diff --git a/libc/mem/memalign.c b/libc/mem/memalign.c
index 94129aaba..bdf8f9ff7 100644
--- a/libc/mem/memalign.c
+++ b/libc/mem/memalign.c
@@ -19,8 +19,6 @@
 #include "libc/mem/mem.h"
 #include "third_party/dlmalloc/dlmalloc.h"
 
-__static_yoink("free");
-
 /**
  * Allocates aligned memory.
  *
@@ -37,3 +35,4 @@ __static_yoink("free");
 void *memalign(size_t align, size_t bytes) {
   return dlmemalign(align, bytes);
 }
+
diff --git a/libc/mem/mergesort.c b/libc/mem/mergesort.c
index b79e67756..400b6dfe3 100644
--- a/libc/mem/mergesort.c
+++ b/libc/mem/mergesort.c
@@ -28,7 +28,7 @@
 │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │
 │ SUCH DAMAGE.                                                                 │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
diff --git a/libc/mem/posix_memalign.c b/libc/mem/posix_memalign.c
index 4cc5b65cb..32f9411aa 100644
--- a/libc/mem/posix_memalign.c
+++ b/libc/mem/posix_memalign.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 
 /**
diff --git a/libc/mem/putenv.c b/libc/mem/putenv.c
index 423b3eb69..9a096673c 100644
--- a/libc/mem/putenv.c
+++ b/libc/mem/putenv.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/getenv.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/internal.h"
 #include "libc/mem/leaks.h"
 #include "libc/mem/mem.h"
diff --git a/libc/mem/qsort.c b/libc/mem/qsort.c
index 0a2c1e550..361f26a86 100644
--- a/libc/mem/qsort.c
+++ b/libc/mem/qsort.c
@@ -28,7 +28,7 @@
 │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │
 │ SUCH DAMAGE.                                                                 │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/str/str.h"
 __static_yoink("openbsd_sorting_notice");
diff --git a/libc/mem/realloc.c b/libc/mem/realloc.c
index b9a4fe7b4..6d7451a8e 100644
--- a/libc/mem/realloc.c
+++ b/libc/mem/realloc.c
@@ -19,8 +19,6 @@
 #include "libc/mem/mem.h"
 #include "third_party/dlmalloc/dlmalloc.h"
 
-__static_yoink("free");
-
 /**
  * Allocates / resizes / frees memory, e.g.
  *
@@ -62,3 +60,4 @@ __static_yoink("free");
 void *realloc(void *p, size_t n) {
   return dlrealloc(p, n);
 }
+
diff --git a/libc/mem/shuffle.internal.h b/libc/mem/shuffle.internal.h
new file mode 100644
index 000000000..2b543a89d
--- /dev/null
+++ b/libc/mem/shuffle.internal.h
@@ -0,0 +1,21 @@
+#ifndef COSMOPOLITAN_LIBC_RAND_SHUFFLE_H_
+#define COSMOPOLITAN_LIBC_RAND_SHUFFLE_H_
+#include "libc/intrin/xchg.h"
+
+/**
+ * Fisher-Yates shuffle.
+ *
+ * @param R is a function like rand() → ≥0
+ * @param A is a typed array
+ * @param n is the number of items in A
+ * @see ARRAYLEN()
+ */
+#define shuffle(R, A, n)                      \
+  do {                                        \
+    autotype(A) Array = (A);                  \
+    for (size_t i = (n) - 1; i >= 1; --i) {   \
+      xchg(&Array[i], &Array[R() % (i + 1)]); \
+    }                                         \
+  } while (0)
+
+#endif /* COSMOPOLITAN_LIBC_RAND_SHUFFLE_H_ */
diff --git a/libc/mem/tinymalloc.inc b/libc/mem/tinymalloc.inc
index 6dd9b984f..1a4527c6b 100644
--- a/libc/mem/tinymalloc.inc
+++ b/libc/mem/tinymalloc.inc
@@ -16,9 +16,8 @@
 #include "libc/assert.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/mem/mem.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
 
@@ -30,10 +29,6 @@
 #define TINYMALLOC_MAX_ALIGN sizeof(max_align_t)
 #endif
 
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Wanalyzer-malloc-leak"
-#pragma GCC diagnostic ignored "-Wanalyzer-use-after-free"
-
 static struct {
   alignas(max_align_t) char bits[TINYMALLOC_MAX_BYTES];
   char *memory;
@@ -51,10 +46,6 @@ static void tinymalloc_init(void) {
   heap.once = 1;
 }
 
-__attribute__((__destructor__)) static void destroy(void) {
-  kprintf("used = %'zu\n", heap.used);
-}
-
 static inline int isheap(char *mem) {
   return heap.memory <= mem && mem < heap.memory + heap.used;
 }
@@ -187,5 +178,3 @@ OutOfMemory:
   errno = ENOMEM;
   return 0;
 }
-
-#pragma GCC pop_options
diff --git a/libc/nexgen32e/BUILD.mk b/libc/nexgen32e/BUILD.mk
index d84d8d853..cf50a81e7 100644
--- a/libc/nexgen32e/BUILD.mk
+++ b/libc/nexgen32e/BUILD.mk
@@ -71,6 +71,8 @@ o/$(MODE)/libc/nexgen32e/ksha512.o: libc/nexgen32e/ksha512.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/nexgen32e/kcp437.o: libc/nexgen32e/kcp437.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
+o/$(MODE)/libc/nexgen32e/kreversebits.o: libc/nexgen32e/kreversebits.S
+	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/nexgen32e/ktensindex.o: libc/nexgen32e/ktensindex.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
 o/$(MODE)/libc/nexgen32e/longjmp.o: libc/nexgen32e/longjmp.S
diff --git a/libc/nexgen32e/argc.S b/libc/nexgen32e/argc.S
index 165c05447..9e85f4409 100644
--- a/libc/nexgen32e/argc.S
+++ b/libc/nexgen32e/argc.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.initbss 300,_init_argc
 //	Global variable holding _start(argc) parameter.
diff --git a/libc/nexgen32e/argv.S b/libc/nexgen32e/argv.S
index 7963698c8..9ee093476 100644
--- a/libc/nexgen32e/argv.S
+++ b/libc/nexgen32e/argv.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.initbss 300,_init_argv
 //	Global variable holding _start(argv) parameter.
diff --git a/libc/nexgen32e/auxv.S b/libc/nexgen32e/auxv.S
index 52381e1c6..e750b74fe 100644
--- a/libc/nexgen32e/auxv.S
+++ b/libc/nexgen32e/auxv.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.initbss 250,_init_auxv
 //	Global variable holding _start(auxv) parameter.
diff --git a/libc/nexgen32e/checkstackalign.S b/libc/nexgen32e/checkstackalign.S
index abe8dae61..18d360507 100644
--- a/libc/nexgen32e/checkstackalign.S
+++ b/libc/nexgen32e/checkstackalign.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Checks that stack is 16-byte aligned.
 //
diff --git a/libc/nexgen32e/djbsort-avx2.S b/libc/nexgen32e/djbsort-avx2.S
index 70868472d..70e24cbdd 100644
--- a/libc/nexgen32e/djbsort-avx2.S
+++ b/libc/nexgen32e/djbsort-avx2.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	D.J. Bernstein's outrageously fast integer sorting algorithm.
 //
@@ -7,8 +7,8 @@
 //	@note	public domain
 //	@see	en.wikipedia.org/wiki/Sorting_network
 djbsort_avx2:
-	beg
-	pro
+	push	%rbp
+	mov	%rsp,%rbp
 	push	%r15
 	push	%r14
 	push	%r13
@@ -795,13 +795,11 @@ djbsort_avx2:
 	pop	%r13
 	pop	%r14
 	pop	%r15
-	epi
+	pop	%rbp
 	ret
-	end
 	.endfn	djbsort_avx2,globl,hidden
 
 minmax_vector:
-	beg
 	cmp	$7,%rdx
 	jg	.L13
 .L2:	test	%rdx,%rdx
@@ -840,11 +838,9 @@ minmax_vector:
 	sub	$8,%rdx
 	jne	.L7
 	ret
-	end
 	.endfn	minmax_vector
 
 int32_twostages_32:
-	beg
 	sub	$-128,%rdi
 .L17:	lea	-128(%rdi),%rax
 	test	%rsi,%rsi
@@ -870,14 +866,13 @@ int32_twostages_32:
 	add	$512,%rdi
 	jmp	.L17
 .L21:	ret
-	end
 	.endfn	int32_twostages_32
 
 int32_threestages:
-	beg
-	pro
+	push	%rbp
 	imul	$-24,%rdx,%r8
 	lea	0(,%rdx,8),%rax
+	mov	%rsp,%rbp
 	push	%r15
 	push	%r14
 	push	%r13
@@ -966,13 +961,11 @@ int32_threestages:
 	pop	%r13
 	pop	%r14
 	pop	%r15
-	epi
+	pop	%rbp
 	ret
-	end
 	.endfn	int32_threestages
 
 merge16_finish:
-	beg
 	vpminsd	%ymm1,%ymm0,%ymm3
 	vpmaxsd	%ymm1,%ymm0,%ymm0
 	vperm2i128 $32,%ymm0,%ymm3,%ymm2
@@ -1001,11 +994,9 @@ merge16_finish:
 .L31:	vmovdqu	%ymm2,(%rdi)
 	vmovdqu	%ymm0,32(%rdi)
 	ret
-	end
 	.endfn	merge16_finish
 
 int32_sort_2power:
-	beg
 	push	%r13
 	lea	16(%rsp),%r13
 	andq	$-32,%rsp
@@ -2084,7 +2075,6 @@ int32_sort_2power:
 	lea	-16(%r13),%rsp
 	pop	%r13
 	ret
-	end
 	.endfn	int32_sort_2power
 
 	.rodata.cst32
diff --git a/libc/nexgen32e/environ.S b/libc/nexgen32e/environ.S
index dc3cb2d34..d1419a52c 100644
--- a/libc/nexgen32e/environ.S
+++ b/libc/nexgen32e/environ.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Environment variable pointer list.
 	.bss
diff --git a/libc/nexgen32e/gc.S b/libc/nexgen32e/gc.S
index 1e6f30266..6b60ae240 100644
--- a/libc/nexgen32e/gc.S
+++ b/libc/nexgen32e/gc.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/dce.h"
 
 	nop
@@ -32,8 +32,7 @@
 //	@param	rax,rdx,xmm0,xmm1,st0,st1 is return value
 //	@see	test/libc/runtime/gc_test.c
 	.ftrace1
-__gc:	beg
-	.ftrace2
+__gc:	.ftrace2
 
 #ifdef __x86_64__
 
@@ -48,7 +47,8 @@ __gc:	beg
 	mov	8(%r8),%r9
 	mov	16(%r8),%rdi
 	push	24(%r8)
-	pro
+	push	%rbp
+	mov	%rsp,%rbp
 	sub	$32,%rsp
 	mov	%rax,-8(%rbp)
 	mov	%rdx,-16(%rbp)
@@ -57,7 +57,7 @@ __gc:	beg
 	movdqa	-32(%rbp),%xmm0
 	mov	-16(%rbp),%rdx
 	mov	-8(%rbp),%rax
-	epi
+	leave
 	ret
 9:	ud2
 	nop
@@ -66,7 +66,7 @@ __gc:	beg
 
 //	if this code fails
 //	check if CosmoTib's size changed
-	sub	x8,x28,#1024			// __get_tls()
+	sub	x8,x28,#512			// __get_tls()
 	ldr	x9,[x8,0x18]			// tib::garbages
 	ldr	x10,[x9]			// g->i
 	ldr	x8,[x9,8]			// g->p
@@ -102,5 +102,4 @@ __gc:	beg
 
 #endif /* __x86_64__ */
 
-	end
 	.endfn	__gc,globl,hidden
diff --git a/libc/nexgen32e/gclongjmp.S b/libc/nexgen32e/gclongjmp.S
index 51f93cb15..1fb68131b 100644
--- a/libc/nexgen32e/gclongjmp.S
+++ b/libc/nexgen32e/gclongjmp.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Jumps up stack to previous setjmp() invocation.
 //
@@ -31,9 +31,7 @@
 //	@noreturn
 	.ftrace1
 gclongjmp:
-	beg
 	.ftrace2
-	pro
 #ifdef __x86_64__
 	push	%rbp
 	mov	%rsp,%rbp
@@ -67,5 +65,4 @@ gclongjmp:
 #else
 #error "unsupported architecture"
 #endif /* __x86_64__ */
-	end
 	.endfn	gclongjmp,globl
diff --git a/libc/nexgen32e/identity.S b/libc/nexgen32e/identity.S
index 804b65bda..7fc23e4d8 100644
--- a/libc/nexgen32e/identity.S
+++ b/libc/nexgen32e/identity.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	The identity() function.
 //	@return	first argument
diff --git a/libc/nexgen32e/kbase36.c b/libc/nexgen32e/kbase36.c
index d490a966b..8a105da1c 100644
--- a/libc/nexgen32e/kbase36.c
+++ b/libc/nexgen32e/kbase36.c
@@ -16,8 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdalign.h"
-#include "libc/str/tab.h"
+#include "libc/stdalign.internal.h"
+#include "libc/str/tab.internal.h"
 
 alignas(uint8_t) const uint8_t kBase36[256] = {
     ['0'] = 1,   //
diff --git a/libc/nexgen32e/kcp437.S b/libc/nexgen32e/kcp437.S
index 36f5bf28d..084313d8e 100644
--- a/libc/nexgen32e/kcp437.S
+++ b/libc/nexgen32e/kcp437.S
@@ -16,8 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/tab.h"
-#include "libc/macros.h"
+#include "libc/str/tab.internal.h"
+#include "libc/macros.internal.h"
 .rodata
 .balign	2
 
diff --git a/libc/nexgen32e/kcpuids.S b/libc/nexgen32e/kcpuids.S
index 6cb2f1248..adc6ef5d1 100644
--- a/libc/nexgen32e/kcpuids.S
+++ b/libc/nexgen32e/kcpuids.S
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/runtime/pc.internal.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/kcpuids.h"
 #include "libc/nexgen32e/x86feature.h"
 
diff --git a/libc/nexgen32e/khalfcache3.S b/libc/nexgen32e/khalfcache3.S
index b6e6c7a3b..2d53f7167 100644
--- a/libc/nexgen32e/khalfcache3.S
+++ b/libc/nexgen32e/khalfcache3.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #ifdef __x86_64__
 
diff --git a/libc/nexgen32e/ksha256.S b/libc/nexgen32e/ksha256.S
index 58056ee47..0df26986e 100644
--- a/libc/nexgen32e/ksha256.S
+++ b/libc/nexgen32e/ksha256.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.rodata
 	.balign	64
diff --git a/libc/nexgen32e/ksha512.S b/libc/nexgen32e/ksha512.S
index f039aa0c2..1d9450a8c 100644
--- a/libc/nexgen32e/ksha512.S
+++ b/libc/nexgen32e/ksha512.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.rodata
 	.balign	64
diff --git a/libc/nexgen32e/ktensindex.S b/libc/nexgen32e/ktensindex.S
index 7d9081c0c..0840a17db 100644
--- a/libc/nexgen32e/ktensindex.S
+++ b/libc/nexgen32e/ktensindex.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.rodata
 kTensIndex:
diff --git a/libc/nexgen32e/ktolower.c b/libc/nexgen32e/ktolower.c
index 40bf0127c..db169897e 100644
--- a/libc/nexgen32e/ktolower.c
+++ b/libc/nexgen32e/ktolower.c
@@ -16,8 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdalign.h"
-#include "libc/str/tab.h"
+#include "libc/stdalign.internal.h"
+#include "libc/str/tab.internal.h"
 
 alignas(uint8_t) const uint8_t kToLower[256] = {
     0,   1,   2,   3,   4,   5,   6,    7,   8,    9,   10,  11,   12,  13,
diff --git a/libc/nexgen32e/ktoupper.c b/libc/nexgen32e/ktoupper.c
index 4c57cda9b..86e688a85 100644
--- a/libc/nexgen32e/ktoupper.c
+++ b/libc/nexgen32e/ktoupper.c
@@ -16,8 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdalign.h"
-#include "libc/str/tab.h"
+#include "libc/stdalign.internal.h"
+#include "libc/str/tab.internal.h"
 
 alignas(uint8_t) const uint8_t kToUpper[256] = {
     0,   1,   2,   3,   4,   5,   6,    7,   8,    9,   10,  11,   12,  13,
diff --git a/libc/nexgen32e/longjmp.S b/libc/nexgen32e/longjmp.S
index 5aefd029f..985a2d657 100644
--- a/libc/nexgen32e/longjmp.S
+++ b/libc/nexgen32e/longjmp.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Loads previously saved processor state.
 //
@@ -26,7 +26,7 @@
 //	@see	gclongjmp()
 //	@see	siglongjmp()
 	.ftrace1
-longjmp:beg
+longjmp:
 	.ftrace2
 _longjmp:
 #ifdef __x86_64__
@@ -61,7 +61,6 @@ _longjmp:
 #else
 #error "unsupported architecture"
 #endif
-	end
 	.endfn	longjmp,globl
 	.endfn	_longjmp,globl
 	.alias	longjmp,siglongjmp
diff --git a/libc/nexgen32e/mcount.S b/libc/nexgen32e/mcount.S
index f95b46cee..27076f1f6 100644
--- a/libc/nexgen32e/mcount.S
+++ b/libc/nexgen32e/mcount.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Function Profiling Hook.
 //	cc -pg adds this to the start of global functions.
diff --git a/libc/nexgen32e/mul4x4adx.S b/libc/nexgen32e/mul4x4adx.S
index 55643f75a..67d7f7216 100644
--- a/libc/nexgen32e/mul4x4adx.S
+++ b/libc/nexgen32e/mul4x4adx.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Computes 512-bit product of 256-bit and 256-bit numbers.
 //
diff --git a/libc/nexgen32e/mul6x6adx.S b/libc/nexgen32e/mul6x6adx.S
index 204bb4444..e0213a389 100644
--- a/libc/nexgen32e/mul6x6adx.S
+++ b/libc/nexgen32e/mul6x6adx.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Computes 768-bit product of 384-bit and 384-bit numbers.
 //
diff --git a/libc/nexgen32e/mul8x8adx.S b/libc/nexgen32e/mul8x8adx.S
index d4eeb0269..f83450d22 100644
--- a/libc/nexgen32e/mul8x8adx.S
+++ b/libc/nexgen32e/mul8x8adx.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Computes 1024-bit product of 512-bit and 512-bit numbers.
 //
diff --git a/libc/nexgen32e/nt2sysv.S b/libc/nexgen32e/nt2sysv.S
index 185687de6..ca9d87c19 100644
--- a/libc/nexgen32e/nt2sysv.S
+++ b/libc/nexgen32e/nt2sysv.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.windows
 
 //	Translates function call from code built w/ MS-style compiler.
@@ -30,8 +30,8 @@
 //	@note	slower than __sysv2nt
 //	@see	NT2SYSV() macro
 __nt2sysv:
-	beg
-	pro
+	push	%rbp
+	mov	%rsp,%rbp
 	sub	$256,%rsp
 	push	%rbx
 	push	%rdi
@@ -48,7 +48,6 @@ __nt2sysv:
 	pop	%rsi
 	pop	%rdi
 	pop	%rbx
-	epi
+	leave
 	ret
-	end
 	.endfn	__nt2sysv,globl,hidden
diff --git a/libc/nexgen32e/nt2sysv.h b/libc/nexgen32e/nt2sysv.h
index 6afb40234..4b9373325 100644
--- a/libc/nexgen32e/nt2sysv.h
+++ b/libc/nexgen32e/nt2sysv.h
@@ -7,10 +7,6 @@
  *
  * This macro should be used when specifying callbacks in the WIN32 API.
  */
-#ifdef __x86_64__
 #define NT2SYSV(FUNCTION) TRAMPOLINE(FUNCTION, __nt2sysv)
-#else
-#define NT2SYSV(FUNCTION) FUNCTION
-#endif
 
 #endif /* COSMOPOLITAN_LIBC_NEXGEN32E_NT2SYSV_H_ */
diff --git a/libc/nexgen32e/pid.c b/libc/nexgen32e/pid.c
index 235e575f1..1abfb511e 100644
--- a/libc/nexgen32e/pid.c
+++ b/libc/nexgen32e/pid.c
@@ -17,8 +17,5 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/runtime/internal.h"
-#ifndef __x86_64__
 
 int __pid;
-
-#endif
diff --git a/libc/nexgen32e/program_invocation_name.S b/libc/nexgen32e/program_invocation_name.S
index ab82db0b5..a5cd491d6 100644
--- a/libc/nexgen32e/program_invocation_name.S
+++ b/libc/nexgen32e/program_invocation_name.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.initbss 300,_init_program_invocation_name
 //	Supplies argv[0] the GNU way.
diff --git a/libc/nexgen32e/rldecode.S b/libc/nexgen32e/rldecode.S
index 74d150faf..06fcba57e 100644
--- a/libc/nexgen32e/rldecode.S
+++ b/libc/nexgen32e/rldecode.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.startup
 
 //	Thirteen byte decompressor.
diff --git a/libc/nexgen32e/setjmp.S b/libc/nexgen32e/setjmp.S
index 7f795ed4a..0dfac7df9 100644
--- a/libc/nexgen32e/setjmp.S
+++ b/libc/nexgen32e/setjmp.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Saves cpu state.
 //
diff --git a/libc/nexgen32e/sha1.S b/libc/nexgen32e/sha1.S
index 34ecfd2fe..1016c0498 100644
--- a/libc/nexgen32e/sha1.S
+++ b/libc/nexgen32e/sha1.S
@@ -31,7 +31,7 @@
 │  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.        │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.section .notice,"aR",@progbits
 	.asciz	"\n\n\
diff --git a/libc/nexgen32e/sha1ni.S b/libc/nexgen32e/sha1ni.S
index cfbc9f7bd..223f5f25d 100644
--- a/libc/nexgen32e/sha1ni.S
+++ b/libc/nexgen32e/sha1ni.S
@@ -31,7 +31,7 @@
 │  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.        │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 .section .notice,"aR",@progbits
 .asciz "\n\n\
diff --git a/libc/nexgen32e/sha256.S b/libc/nexgen32e/sha256.S
index fd5e42030..df175bf5b 100644
--- a/libc/nexgen32e/sha256.S
+++ b/libc/nexgen32e/sha256.S
@@ -47,7 +47,7 @@
 /////////////////////////////////////////////////////////////////////////
 // This code schedules 2 blocks at a time, with 4 lanes per block
 /////////////////////////////////////////////////////////////////////////
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 .section .notice,"aR",@progbits
 .asciz "\n\n\
diff --git a/libc/nexgen32e/sha256ni.S b/libc/nexgen32e/sha256ni.S
index 736524822..eb020d706 100644
--- a/libc/nexgen32e/sha256ni.S
+++ b/libc/nexgen32e/sha256ni.S
@@ -31,7 +31,7 @@
 │  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.        │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 .section .notice,"aR",@progbits
 .asciz "\n\n\
diff --git a/libc/nexgen32e/sha512.S b/libc/nexgen32e/sha512.S
index 34f7c5fbb..6e36d6d1b 100644
--- a/libc/nexgen32e/sha512.S
+++ b/libc/nexgen32e/sha512.S
@@ -48,7 +48,7 @@
 /////////////////////////////////////////////////////////////////////////
 // This code schedules 1 blocks at a time, with 4 lanes per block
 /////////////////////////////////////////////////////////////////////////
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 .section .notice,"aR",@progbits
 .asciz "\n\n\
diff --git a/libc/nexgen32e/threaded.c b/libc/nexgen32e/threaded.c
index b2c53384b..5524c4667 100644
--- a/libc/nexgen32e/threaded.c
+++ b/libc/nexgen32e/threaded.c
@@ -18,6 +18,13 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/thread/tls.h"
 
-#ifndef __x86_64__
-unsigned __tls_index;
+/**
+ * Contains TID of main thread or 0 if threading isn't enabled.
+ */
+int __threaded;
+
+#ifdef __x86_64__
+char __tls_enabled;
 #endif
+
+unsigned __tls_index;
diff --git a/libc/nexgen32e/x86info.h b/libc/nexgen32e/x86info.h
index 5e07d0e9e..14eed9fd3 100644
--- a/libc/nexgen32e/x86info.h
+++ b/libc/nexgen32e/x86info.h
@@ -65,6 +65,7 @@ struct X86ProcessorModel {
   unsigned char grade;
 };
 
+extern const size_t kX86ProcessorModelCount;
 extern const struct X86ProcessorModel kX86ProcessorModels[];
 
 const struct X86ProcessorModel *getx86processormodel(short) nosideeffect;
diff --git a/libc/nexgen32e/xmm.S b/libc/nexgen32e/xmm.S
index e944d6253..ef3658a2b 100644
--- a/libc/nexgen32e/xmm.S
+++ b/libc/nexgen32e/xmm.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .privileged
 
 __xmm_save:
diff --git a/libc/nexgen32e/zip.S b/libc/nexgen32e/zip.S
index b15142f06..313f84e2d 100644
--- a/libc/nexgen32e/zip.S
+++ b/libc/nexgen32e/zip.S
@@ -16,9 +16,9 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "ape/relocations.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 .section .zip.eocd,"",@progbits
 
 //	ZIP End Of Central Directory (EOCD) record.
diff --git a/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryInterruptTime.S b/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryInterruptTime.S
deleted file mode 100644
index ef2c8cd8a..000000000
--- a/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryInterruptTime.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	API-MS-Win-Core-Realtime-l1-1-1,__imp_QueryInterruptTime,QueryInterruptTime
-
-	.text.windows
-	.ftrace1
-QueryInterruptTime:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_QueryInterruptTime(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	QueryInterruptTime,globl
-	.previous
diff --git a/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryInterruptTimePrecise.S b/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryInterruptTimePrecise.S
deleted file mode 100644
index 0fb28d032..000000000
--- a/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryInterruptTimePrecise.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	API-MS-Win-Core-Realtime-l1-1-1,__imp_QueryInterruptTimePrecise,QueryInterruptTimePrecise
-
-	.text.windows
-	.ftrace1
-QueryInterruptTimePrecise:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_QueryInterruptTimePrecise(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	QueryInterruptTimePrecise,globl
-	.previous
diff --git a/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryUnbiasedInterruptTimePrecise.S b/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryUnbiasedInterruptTimePrecise.S
deleted file mode 100644
index 23fe67993..000000000
--- a/libc/nt/API-MS-Win-Core-Realtime-l1-1-1/QueryUnbiasedInterruptTimePrecise.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	API-MS-Win-Core-Realtime-l1-1-1,__imp_QueryUnbiasedInterruptTimePrecise,QueryUnbiasedInterruptTimePrecise
-
-	.text.windows
-	.ftrace1
-QueryUnbiasedInterruptTimePrecise:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_QueryUnbiasedInterruptTimePrecise(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	QueryUnbiasedInterruptTimePrecise,globl
-	.previous
diff --git a/libc/nt/BUILD.mk b/libc/nt/BUILD.mk
index b5660d1be..f6bd28be4 100644
--- a/libc/nt/BUILD.mk
+++ b/libc/nt/BUILD.mk
@@ -91,27 +91,6 @@ $(LIBC_NT_COMDLG32_A).pkg:				\
 
 #───────────────────────────────────────────────────────────────────────────────
 
-LIBC_NT_ARTIFACTS += LIBC_NT_SHELL32_A
-LIBC_NT_SHELL32 = $(LIBC_NT_SHELL32_A_DEPS) $(LIBC_NT_SHELL32_A)
-LIBC_NT_SHELL32_A = o/$(MODE)/libc/nt/shell32.a
-LIBC_NT_SHELL32_A_SRCS := $(wildcard libc/nt/shell32/*.S)
-LIBC_NT_SHELL32_A_OBJS = $(LIBC_NT_SHELL32_A_SRCS:%.S=o/$(MODE)/%.o)
-LIBC_NT_SHELL32_A_CHECKS = $(LIBC_NT_SHELL32_A).pkg
-LIBC_NT_SHELL32_A_DIRECTDEPS = LIBC_NT_KERNEL32
-LIBC_NT_SHELL32_A_DEPS :=					\
-	$(call uniq,$(foreach x,$(LIBC_NT_SHELL32_A_DIRECTDEPS),$($(x))))
-
-$(LIBC_NT_SHELL32_A):						\
-		libc/nt/shell32/				\
-		$(LIBC_NT_SHELL32_A).pkg			\
-		$(LIBC_NT_SHELL32_A_OBJS)
-
-$(LIBC_NT_SHELL32_A).pkg:					\
-		$(LIBC_NT_SHELL32_A_OBJS)			\
-		$(foreach x,$(LIBC_NT_SHELL32_A_DIRECTDEPS),$($(x)_A).pkg)
-
-#───────────────────────────────────────────────────────────────────────────────
-
 LIBC_NT_ARTIFACTS += LIBC_NT_GDI32_A
 LIBC_NT_GDI32 = $(LIBC_NT_GDI32_A_DEPS) $(LIBC_NT_GDI32_A)
 LIBC_NT_GDI32_A = o/$(MODE)/libc/nt/gdi32.a
@@ -200,27 +179,6 @@ $(LIBC_NT_MEMORY_A).pkg:				\
 
 #───────────────────────────────────────────────────────────────────────────────
 
-LIBC_NT_ARTIFACTS += LIBC_NT_REALTIME_A
-LIBC_NT_REALTIME = $(LIBC_NT_REALTIME_A_DEPS) $(LIBC_NT_REALTIME_A)
-LIBC_NT_REALTIME_A = o/$(MODE)/libc/nt/realtime.a
-LIBC_NT_REALTIME_A_SRCS := $(wildcard libc/nt/API-MS-Win-Core-Realtime-l1-1-1/*.S)
-LIBC_NT_REALTIME_A_OBJS = $(LIBC_NT_REALTIME_A_SRCS:%.S=o/$(MODE)/%.o)
-LIBC_NT_REALTIME_A_CHECKS = $(LIBC_NT_REALTIME_A).pkg
-LIBC_NT_REALTIME_A_DIRECTDEPS = LIBC_NT_KERNEL32
-LIBC_NT_REALTIME_A_DEPS :=					\
-	$(call uniq,$(foreach x,$(LIBC_NT_REALTIME_A_DIRECTDEPS),$($(x))))
-
-$(LIBC_NT_REALTIME_A):						\
-		libc/nt/API-MS-Win-Core-Realtime-l1-1-1/	\
-		$(LIBC_NT_REALTIME_A).pkg			\
-		$(LIBC_NT_REALTIME_A_OBJS)
-
-$(LIBC_NT_REALTIME_A).pkg:					\
-		$(LIBC_NT_REALTIME_A_OBJS)			\
-		$(foreach x,$(LIBC_NT_REALTIME_A_DIRECTDEPS),$($(x)_A).pkg)
-
-#───────────────────────────────────────────────────────────────────────────────
-
 LIBC_NT_ARTIFACTS += LIBC_NT_USER32_A
 LIBC_NT_USER32 = $(LIBC_NT_USER32_A_DEPS) $(LIBC_NT_USER32_A)
 LIBC_NT_USER32_A = o/$(MODE)/libc/nt/user32.a
diff --git a/libc/nt/advapi32/RegOpenKeyExA.S b/libc/nt/advapi32/RegOpenKeyExA.S
deleted file mode 100644
index 31ee26848..000000000
--- a/libc/nt/advapi32/RegOpenKeyExA.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	advapi32,__imp_RegOpenKeyExA,RegOpenKeyExA
-
-	.text.windows
-	.ftrace1
-RegOpenKeyExA:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_RegOpenKeyExA(%rip),%rax
-	jmp	__sysv2nt6
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	RegOpenKeyExA,globl
-	.previous
diff --git a/libc/nt/enum/keyaccess.h b/libc/nt/enum/keyaccess.h
index 1abb200a4..06709ad42 100644
--- a/libc/nt/enum/keyaccess.h
+++ b/libc/nt/enum/keyaccess.h
@@ -1,16 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_NT_ENUM_KEYACCESS_H_
 #define COSMOPOLITAN_LIBC_NT_ENUM_KEYACCESS_H_
 
-#define kNtKeyQueryValue        0x00000001
-#define kNtKeySetValue          0x00000002
-#define kNtKeyCreateSubKey      0x00000004
-#define kNtKeyEnumerateSubKeys  0x00000008
-#define kNtKeyNotify            0x00000010
-#define kNtKeyCreateLink        0x00000020
-#define kNtWow6432Key           0x00000200
-#define kNtWow6464Key           0x00000100
-#define kNtWow64Res             0x00000300
-
 #define kNtKeyRead      0x00020019
 #define kNtKeyWrite     0x00020006
 #define kNtKeyExecute   0x00020019
diff --git a/libc/nt/enum/pageflags.h b/libc/nt/enum/pageflags.h
index 40569decb..5cef3a2fa 100644
--- a/libc/nt/enum/pageflags.h
+++ b/libc/nt/enum/pageflags.h
@@ -23,7 +23,4 @@
 #define kNtSecLargePages     0x80000000
 #define kNtSecWritecombine   0x40000000
 
-#define kNtPageTargetsInvalid  0x40000000
-#define kNtPageTargetsNoUpdate 0x40000000
-
 #endif /* COSMOPOLITAN_LIBC_NT_ENUM_PAGEFLAGS_H_ */
diff --git a/libc/nt/enum/status.h b/libc/nt/enum/status.h
index cc11bc96b..ed3dc8ff3 100644
--- a/libc/nt/enum/status.h
+++ b/libc/nt/enum/status.h
@@ -2,68 +2,68 @@
 #define COSMOPOLITAN_LIBC_NT_STATUS_H_
 
 /* high two bits = {success,informational,warning,error} */
-#define kNtStatusSuccess                  0x00000000u /* success statuses */
-#define kNtStatusWait0                    0x00000000u
-#define kNtStatusAbandonedWait0           0x00000080u
-#define kNtStatusUserApc                  0x000000C0u
-#define kNtStatusTimeout                  0x00000102u
-#define kNtStatusPending                  0x00000103u
-#define kNtStatusGuardPageViolation       0x80000001u /* warning statuses */
-#define kNtStatusDatatypeMisalignment     0x80000002u
-#define kNtStatusBreakpoint               0x80000003u
-#define kNtStatusSingleStep               0x80000004u
-#define kNtStatusLongjump                 0x80000026u
-#define kNtStatusUnwindConsolidate        0x80000029u
-#define kNtStatusAccessViolation          0xC0000005u /* error statuses */
-#define kNtStatusInPageError              0xC0000006u
-#define kNtStatusInvalidHandle            0xC0000008u
-#define kNtStatusInvalidParameter         0xC000000Du
-#define kNtStatusNoMemory                 0xC0000017u
-#define kNtStatusIllegalInstruction       0xC000001Du
-#define kNtStatusNoncontinuableException  0xC0000025u
-#define kNtStatusInvalidDisposition       0xC0000026u
-#define kNtStatusArrayBoundsExceeded      0xC000008Cu
-#define kNtStatusFloatDenormalOperand     0xC000008Du
-#define kNtStatusFloatDivideByZero        0xC000008Eu
-#define kNtStatusFloatInexactResult       0xC000008Fu
-#define kNtStatusFloatInvalidOperation    0xC0000090u
-#define kNtStatusFloatOverflow            0xC0000091u
-#define kNtStatusFloatStackCheck          0xC0000092u
-#define kNtStatusFloatUnderflow           0xC0000093u
-#define kNtStatusIntegerDivideBYZero      0xC0000094u
-#define kNtStatusIntegerOverflow          0xC0000095u
-#define kNtStatusPrivilegedInstruction    0xC0000096u
-#define kNtStatusStackOverflow            0xC00000FDu
-#define kNtStatusDllNotFound              0xC0000135u
-#define kNtStatusOrdinalNotFound          0xC0000138u
-#define kNtStatusEntrypointNotFound       0xC0000139u
-#define kNtStatusControlCExit             0xC000013Au
-#define kNtStatusDllInitFailed            0xC0000142u
-#define kNtStatusFloatMultipleFaults      0xC00002B4u
-#define kNtStatusFloatMultipleTraps       0xC00002B5u
-#define kNtStatusRegNatConsumption        0xC00002C9u
-#define kNtStatusHeapCorruption           0xC0000374u
-#define kNtStatusStackBufferOverrun       0xC0000409u
-#define kNtStatusInvalidCruntimeParameter 0xC0000417u
-#define kNtStatusAssertionFailure         0xC0000420u
-#define kNtStatusEnclaveViolation         0xC00004A2u
-#define kNtStatusSegmentNotification      0x40000005u
-#define kNtStatusFatalAppExit             0x40000015u
-#define kNtStatusNotFound                 0xC0000225u
-#define kNtStatusCancelled                0xC0000120u
+#define kNtStatusSuccess                  0x00000000 /* success statuses */
+#define kNtStatusWait0                    0x00000000
+#define kNtStatusAbandonedWait0           0x00000080
+#define kNtStatusUserApc                  0x000000C0
+#define kNtStatusTimeout                  0x00000102
+#define kNtStatusPending                  0x00000103
+#define kNtStatusGuardPageViolation       0x80000001 /* warning statuses */
+#define kNtStatusDatatypeMisalignment     0x80000002
+#define kNtStatusBreakpoint               0x80000003
+#define kNtStatusSingleStep               0x80000004
+#define kNtStatusLongjump                 0x80000026
+#define kNtStatusUnwindConsolidate        0x80000029
+#define kNtStatusAccessViolation          0xC0000005 /* error statuses */
+#define kNtStatusInPageError              0xC0000006
+#define kNtStatusInvalidHandle            0xC0000008
+#define kNtStatusInvalidParameter         0xC000000D
+#define kNtStatusNoMemory                 0xC0000017
+#define kNtStatusIllegalInstruction       0xC000001D
+#define kNtStatusNoncontinuableException  0xC0000025
+#define kNtStatusInvalidDisposition       0xC0000026
+#define kNtStatusArrayBoundsExceeded      0xC000008C
+#define kNtStatusFloatDenormalOperand     0xC000008D
+#define kNtStatusFloatDivideByZero        0xC000008E
+#define kNtStatusFloatInexactResult       0xC000008F
+#define kNtStatusFloatInvalidOperation    0xC0000090
+#define kNtStatusFloatOverflow            0xC0000091
+#define kNtStatusFloatStackCheck          0xC0000092
+#define kNtStatusFloatUnderflow           0xC0000093
+#define kNtStatusIntegerDivideBYZero      0xC0000094
+#define kNtStatusIntegerOverflow          0xC0000095
+#define kNtStatusPrivilegedInstruction    0xC0000096
+#define kNtStatusStackOverflow            0xC00000FD
+#define kNtStatusDllNotFound              0xC0000135
+#define kNtStatusOrdinalNotFound          0xC0000138
+#define kNtStatusEntrypointNotFound       0xC0000139
+#define kNtStatusControlCExit             0xC000013A
+#define kNtStatusDllInitFailed            0xC0000142
+#define kNtStatusFloatMultipleFaults      0xC00002B4
+#define kNtStatusFloatMultipleTraps       0xC00002B5
+#define kNtStatusRegNatConsumption        0xC00002C9
+#define kNtStatusHeapCorruption           0xC0000374
+#define kNtStatusStackBufferOverrun       0xC0000409
+#define kNtStatusInvalidCruntimeParameter 0xC0000417
+#define kNtStatusAssertionFailure         0xC0000420
+#define kNtStatusEnclaveViolation         0xC00004A2
+#define kNtStatusSegmentNotification      0x40000005
+#define kNtStatusFatalAppExit             0x40000015
+#define kNtStatusNotFound                 0xC0000225
+#define kNtStatusCancelled                0xC0000120
 
-#define kNtDbgExceptionHandled    0x00010001u
-#define kNtDbgContinue            0x00010002u
-#define kNtDbgReplyLater          0x40010001u
-#define kNtDbgTerminateThread     0x40010003u
-#define kNtDbgTerminateProcess    0x40010004u
-#define kNtDbgControlC            0x40010005u
-#define kNtDbgPrintexceptionC     0x40010006u
-#define kNtDbgRipexception        0x40010007u
-#define kNtDbgControlBreak        0x40010008u
-#define kNtDbgCommandException    0x40010009u
-#define kNtDbgPrintexceptionWideC 0x4001000Au
-#define kNtDbgExceptionNotHandled 0x80010001u
+#define kNtDbgExceptionHandled    0x00010001
+#define kNtDbgContinue            0x00010002
+#define kNtDbgReplyLater          0x40010001
+#define kNtDbgTerminateThread     0x40010003
+#define kNtDbgTerminateProcess    0x40010004
+#define kNtDbgControlC            0x40010005
+#define kNtDbgPrintexceptionC     0x40010006
+#define kNtDbgRipexception        0x40010007
+#define kNtDbgControlBreak        0x40010008
+#define kNtDbgCommandException    0x40010009
+#define kNtDbgPrintexceptionWideC 0x4001000A
+#define kNtDbgExceptionNotHandled 0x80010001
 #define kNtStillActive            kNtStatusPending
 
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
diff --git a/libc/nt/enum/wt.h b/libc/nt/enum/wt.h
index 74910853f..a79955ea8 100644
--- a/libc/nt/enum/wt.h
+++ b/libc/nt/enum/wt.h
@@ -6,6 +6,6 @@
 #define kNtWtExecuteintimerthread      0x00000020u
 #define kNtWtExecuteinpersistentthread 0x00000080u
 #define kNtWtExecutelongfunction       0x00000010u
-#define kNtWtTransferImpersonation     0x00000100u
+#define kNtWtTransferImpersonation     0𝔵00000100𝔲
 
 #endif /* COSMOPOLITAN_LIBC_NT_ENUM_WT_H_ */
diff --git a/libc/nt/files.h b/libc/nt/files.h
index 2b844c32f..fcbc294cb 100644
--- a/libc/nt/files.h
+++ b/libc/nt/files.h
@@ -49,7 +49,6 @@ COSMOPOLITAN_C_START_
 intptr_t LoadResource(int64_t hModule, int64_t hResInfo);
 uint32_t SetHandleCount(uint32_t uNumber);
 uint32_t GetLogicalDrives(void);
-uint32_t GetLogicalDriveStringsA(uint32_t nBufferLength, char *lpBuffer);
 bool32 FlushFileBuffers(int64_t hFile);
 
 int64_t ReOpenFile(int64_t hOriginalFile, uint32_t dwDesiredAccess,
@@ -206,7 +205,6 @@ uint32_t GetFinalPathNameByHandle(int64_t hFile, char16_t *out_path,
 
 uint32_t GetFullPathName(const char16_t *lpFileName, uint32_t nBufferLength,
                          char16_t *lpBuffer, char16_t **lpFilePart);
-uint32_t GetShortPathName(const char16_t *lpszLongPath, char16_t *out_lpszShortPath, uint32_t cchBuffer);
 
 bool32 GetOverlappedResult(int64_t hFile, struct NtOverlapped *lpOverlapped,
                            uint32_t *lpNumberOfBytesTransferred, bool32 bWait);
@@ -227,10 +225,6 @@ bool32 GetVolumeInformationByHandle(int64_t hFile,
                                     char16_t *opt_out_lpFileSystemNameBuffer,
                                     uint32_t nFileSystemNameSize);
 
-uint32_t SetFilePointer(intptr_t hFile, int32_t lDistanceToMove,
-                        long *opt_inout_lpDistanceToMoveHigh,
-                        uint32_t dwMoveMethod);
-
 #if ShouldUseMsabiAttribute()
 #include "libc/nt/thunk/files.inc"
 #endif /* ShouldUseMsabiAttribute() */
diff --git a/libc/nt/gdi32/DescribePixelFormat.S b/libc/nt/gdi32/DescribePixelFormat.S
deleted file mode 100644
index 44b3dc746..000000000
--- a/libc/nt/gdi32/DescribePixelFormat.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	gdi32,__imp_DescribePixelFormat,DescribePixelFormat
-
-	.text.windows
-	.ftrace1
-DescribePixelFormat:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_DescribePixelFormat(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	DescribePixelFormat,globl
-	.previous
diff --git a/libc/nt/kernel32/GetACP.S b/libc/nt/kernel32/GetACP.S
deleted file mode 100644
index f0121f7e0..000000000
--- a/libc/nt/kernel32/GetACP.S
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_GetACP,GetACP
-
-	.text.windows
-	.ftrace1
-GetACP:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	sub	$32,%rsp
-	call	*__imp_GetACP(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	GetACP,globl
-	.previous
diff --git a/libc/nt/kernel32/GetCPInfoExW.S b/libc/nt/kernel32/GetCPInfoExW.S
deleted file mode 100644
index a58310911..000000000
--- a/libc/nt/kernel32/GetCPInfoExW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_GetCPInfoExW,GetCPInfoExW
-
-	.text.windows
-	.ftrace1
-GetCPInfoEx:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_GetCPInfoExW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	GetCPInfoEx,globl
-	.previous
diff --git a/libc/nt/kernel32/GetLogicalDriveStringsA.S b/libc/nt/kernel32/GetLogicalDriveStringsA.S
deleted file mode 100644
index de327c7fc..000000000
--- a/libc/nt/kernel32/GetLogicalDriveStringsA.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_GetLogicalDriveStringsA,GetLogicalDriveStringsA
-
-	.text.windows
-	.ftrace1
-GetLogicalDriveStringsA:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_GetLogicalDriveStringsA(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	GetLogicalDriveStringsA,globl
-	.previous
diff --git a/libc/nt/kernel32/GetOEMCP.S b/libc/nt/kernel32/GetOEMCP.S
deleted file mode 100644
index 18227546f..000000000
--- a/libc/nt/kernel32/GetOEMCP.S
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_GetOEMCP,GetOEMCP
-
-	.text.windows
-	.ftrace1
-GetOEMCP:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	sub	$32,%rsp
-	call	*__imp_GetOEMCP(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	GetOEMCP,globl
-	.previous
diff --git a/libc/nt/kernel32/GetShortPathNameW.S b/libc/nt/kernel32/GetShortPathNameW.S
deleted file mode 100644
index d0c28f2f6..000000000
--- a/libc/nt/kernel32/GetShortPathNameW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_GetShortPathNameW,GetShortPathNameW
-
-	.text.windows
-	.ftrace1
-GetShortPathName:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_GetShortPathNameW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	GetShortPathName,globl
-	.previous
diff --git a/libc/nt/kernel32/GlobalLock.S b/libc/nt/kernel32/GlobalLock.S
deleted file mode 100644
index 1407a4427..000000000
--- a/libc/nt/kernel32/GlobalLock.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_GlobalLock,GlobalLock
-
-	.text.windows
-	.ftrace1
-GlobalLock:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_GlobalLock(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	GlobalLock,globl
-	.previous
diff --git a/libc/nt/kernel32/GlobalUnlock.S b/libc/nt/kernel32/GlobalUnlock.S
deleted file mode 100644
index b9ba550f8..000000000
--- a/libc/nt/kernel32/GlobalUnlock.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_GlobalUnlock,GlobalUnlock
-
-	.text.windows
-	.ftrace1
-GlobalUnlock:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_GlobalUnlock(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	GlobalUnlock,globl
-	.previous
diff --git a/libc/nt/kernel32/IsWow64Process2.S b/libc/nt/kernel32/IsWow64Process2.S
deleted file mode 100644
index 4cb92ff17..000000000
--- a/libc/nt/kernel32/IsWow64Process2.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_IsWow64Process2,IsWow64Process2
-
-	.text.windows
-	.ftrace1
-IsWow64Process2:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_IsWow64Process2(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	IsWow64Process2,globl
-	.previous
diff --git a/libc/nt/kernel32/QueryUnbiasedInterruptTime.S b/libc/nt/kernel32/QueryUnbiasedInterruptTime.S
deleted file mode 100644
index ee296b5b6..000000000
--- a/libc/nt/kernel32/QueryUnbiasedInterruptTime.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_QueryUnbiasedInterruptTime,QueryUnbiasedInterruptTime
-
-	.text.windows
-	.ftrace1
-QueryUnbiasedInterruptTime:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_QueryUnbiasedInterruptTime(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	QueryUnbiasedInterruptTime,globl
-	.previous
diff --git a/libc/nt/kernel32/SetFilePointer.S b/libc/nt/kernel32/SetFilePointer.S
deleted file mode 100644
index 2e14b9e3e..000000000
--- a/libc/nt/kernel32/SetFilePointer.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_SetFilePointer,SetFilePointer
-
-	.text.windows
-	.ftrace1
-SetFilePointer:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_SetFilePointer(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	SetFilePointer,globl
-	.previous
diff --git a/libc/nt/user32/PtInRect.S b/libc/nt/kernel32/VirtualAlloc.S
similarity index 60%
rename from libc/nt/user32/PtInRect.S
rename to libc/nt/kernel32/VirtualAlloc.S
index 3a3ae8c38..f8e5f815a 100644
--- a/libc/nt/user32/PtInRect.S
+++ b/libc/nt/kernel32/VirtualAlloc.S
@@ -1,18 +1,18 @@
 #include "libc/nt/codegen.h"
-.imp	user32,__imp_PtInRect,PtInRect
+.imp	kernel32,__imp_VirtualAlloc,VirtualAlloc
 
 	.text.windows
 	.ftrace1
-PtInRect:
+VirtualAlloc:
 	.ftrace2
 #ifdef __x86_64__
 	push	%rbp
 	mov	%rsp,%rbp
-	mov	__imp_PtInRect(%rip),%rax
+	mov	__imp_VirtualAlloc(%rip),%rax
 	jmp	__sysv2nt
 #elif defined(__aarch64__)
 	mov	x0,#0
 	ret
 #endif
-	.endfn	PtInRect,globl
+	.endfn	VirtualAlloc,globl
 	.previous
diff --git a/libc/nt/kernel32/VirtualAllocEx.S b/libc/nt/kernel32/VirtualAllocEx.S
index 239913a84..bdf00950b 100644
--- a/libc/nt/kernel32/VirtualAllocEx.S
+++ b/libc/nt/kernel32/VirtualAllocEx.S
@@ -1,2 +1,18 @@
 #include "libc/nt/codegen.h"
 .imp	kernel32,__imp_VirtualAllocEx,VirtualAllocEx
+
+	.text.windows
+	.ftrace1
+VirtualAllocEx:
+	.ftrace2
+#ifdef __x86_64__
+	push	%rbp
+	mov	%rsp,%rbp
+	mov	__imp_VirtualAllocEx(%rip),%rax
+	jmp	__sysv2nt6
+#elif defined(__aarch64__)
+	mov	x0,#0
+	ret
+#endif
+	.endfn	VirtualAllocEx,globl
+	.previous
diff --git a/libc/nt/kernel32/VirtualProtectEx.S b/libc/nt/kernel32/VirtualProtectEx.S
deleted file mode 100644
index 8d22b1789..000000000
--- a/libc/nt/kernel32/VirtualProtectEx.S
+++ /dev/null
@@ -1,2 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_VirtualProtectEx,VirtualProtectEx
diff --git a/libc/nt/kernel32/VirtualQueryEx.S b/libc/nt/kernel32/VirtualQueryEx.S
deleted file mode 100644
index d810cf97a..000000000
--- a/libc/nt/kernel32/VirtualQueryEx.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_VirtualQueryEx,VirtualQueryEx
-
-	.text.windows
-	.ftrace1
-VirtualQueryEx:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_VirtualQueryEx(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	VirtualQueryEx,globl
-	.previous
diff --git a/libc/nt/kernel32/WriteProcessMemory.S b/libc/nt/kernel32/WriteProcessMemory.S
deleted file mode 100644
index 222dd5e72..000000000
--- a/libc/nt/kernel32/WriteProcessMemory.S
+++ /dev/null
@@ -1,2 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	kernel32,__imp_WriteProcessMemory,WriteProcessMemory
diff --git a/libc/nt/master.sh b/libc/nt/master.sh
index 570a77e72..4f44cc057 100755
--- a/libc/nt/master.sh
+++ b/libc/nt/master.sh
@@ -40,12 +40,9 @@ imp	''							SetCurrentDirectoryW					kernel32	1
 imp	''							TerminateProcess					kernel32	2
 imp	''							UnlockFileEx						kernel32	5
 imp	''							UnmapViewOfFile						kernel32	1
-imp	''							VirtualAllocEx						kernel32	5
 imp	''							VirtualProtect						kernel32	4
-imp	''							VirtualProtectEx					kernel32	5
 imp	''							WaitForMultipleObjects					kernel32	4
 imp	''							WaitForSingleObject					kernel32	2
-imp	''							WriteProcessMemory					kernel32	5
 imp	'AcquireSRWLockExclusive'				AcquireSRWLockExclusive					kernel32	1
 imp	'AcquireSRWLockShared'					AcquireSRWLockShared					kernel32	1
 imp	'AddDllDirectory'					AddDllDirectory						kernel32	1
@@ -116,7 +113,6 @@ imp	'GetCurrentProcessId'					GetCurrentProcessId					kernel32	0
 imp	'GetCurrentProcessorNumberEx'				GetCurrentProcessorNumberEx				kernel32	1
 imp	'GetCurrentThread'					GetCurrentThread					kernel32	0
 imp	'GetCurrentThreadId'					GetCurrentThreadId					kernel32	0
-imp	'GetDynamicTimeZoneInformation'				GetDynamicTimeZoneInformation				kernel32	1
 imp	'GetEnvironmentStrings'					GetEnvironmentStringsW					kernel32	1
 imp	'GetEnvironmentVariable'				GetEnvironmentVariableW					kernel32	3
 imp	'GetExitCodeThread'					GetExitCodeThread					kernel32	2
@@ -129,12 +125,10 @@ imp	'GetFileTime'						GetFileTime						kernel32	4
 imp	'GetFileType'						GetFileType						kernel32	1
 imp	'GetFinalPathNameByHandle'				GetFinalPathNameByHandleW				kernel32	4
 imp	'GetFullPathName'					GetFullPathNameW					kernel32	4
-imp	'GetShortPathName'					GetShortPathNameW					kernel32	3
 imp	'GetHandleInformation'					GetHandleInformation					kernel32	2
 imp	'GetLargestConsoleWindowSize'				GetLargestConsoleWindowSize				kernel32	1
 imp	'GetLastError'						GetLastError						kernel32	0
 imp	'GetLogicalDrives'					GetLogicalDrives					kernel32	0
-imp	'GetLogicalDriveStringsA'				GetLogicalDriveStringsA					kernel32	2
 imp	'GetMaximumProcessorCount'				GetMaximumProcessorCount				kernel32	1	# Windows 7+
 imp	'GetModuleFileName'					GetModuleFileNameW					kernel32	3
 imp	'GetModuleHandle'					GetModuleHandleA					kernel32	1
@@ -174,6 +168,8 @@ imp	'GetSystemTimePreciseAsFileTime'			GetSystemTimePreciseAsFileTime				kernel3
 imp	'GetSystemTimes'					GetSystemTimes						kernel32	3
 imp	'GetTempPath'						GetTempPathW						kernel32	2
 imp	'GetTempPathA'						GetTempPathA						kernel32	2
+imp	'GetDynamicTimeZoneInformation'				GetDynamicTimeZoneInformation				kernel32	1
+imp	'GetTimeZoneInformation'				GetTimeZoneInformation					kernel32	1
 imp	'GetThreadContext'					GetThreadContext					kernel32	2
 imp	'GetThreadDescription'					GetThreadDescription					kernel32	2
 imp	'GetThreadIOPendingFlag'				GetThreadIOPendingFlag					kernel32	2
@@ -182,20 +178,14 @@ imp	'GetThreadPriority'					GetThreadPriority					kernel32	1
 imp	'GetThreadPriorityBoost'				GetThreadPriorityBoost					kernel32	2
 imp	'GetThreadTimes'					GetThreadTimes						kernel32	5
 imp	'GetTickCount64'					GetTickCount64						kernel32	0
-imp	'GetTimeZoneInformation'				GetTimeZoneInformation					kernel32	1
 imp	'GetVersionEx'						GetVersionExW						kernel32	1
 imp	'GetVolumeInformationByHandle'				GetVolumeInformationByHandleW				kernel32	8
 imp	'GetVolumePathName'					GetVolumePathNameW					kernel32	3
 imp	'GetWindowsDirectory'					GetWindowsDirectoryW					kernel32	2
 imp	'GetWindowsDirectoryA'					GetWindowsDirectoryA					kernel32	2
-imp	'GetOEMCP'						GetOEMCP						kernel32	0
-imp	'GetACP'						GetACP							kernel32	0
-imp	'GetCPInfoEx'						GetCPInfoExW						kernel32	3
 imp	'GlobalAlloc'						GlobalAlloc						kernel32	2
 imp	'GlobalFree'						GlobalFree						kernel32	1
-imp	'GlobalLock'						GlobalLock						kernel32	1
 imp	'GlobalMemoryStatusEx'					GlobalMemoryStatusEx					kernel32	1
-imp	'GlobalUnlock'						GlobalUnlock						kernel32	1
 imp	'HeapAlloc'						HeapAlloc						kernel32	3
 imp	'HeapCompact'						HeapCompact						kernel32	2
 imp	'HeapCreate'						HeapCreate						kernel32	3
@@ -207,7 +197,6 @@ imp	'InitializeCriticalSection'				InitializeCriticalSection				kernel32	1
 imp	'InitializeCriticalSectionAndSpinCount'			InitializeCriticalSectionAndSpinCount			kernel32	2
 imp	'InitializeProcThreadAttributeList'			InitializeProcThreadAttributeList			kernel32	4
 imp	'InitializeSRWLock'					InitializeSRWLock					kernel32	1
-imp	'IsWow64Process2'					IsWow64Process2						kernel32	3
 imp	'LeaveCriticalSection'					LeaveCriticalSection					kernel32	1
 imp	'LoadLibrary'						LoadLibraryW						kernel32	1
 imp	'LoadLibraryA'						LoadLibraryA						kernel32	1
@@ -228,9 +217,8 @@ imp	'Process32First'					Process32FirstW						kernel32	2
 imp	'Process32Next'						Process32NextW						kernel32	2
 imp	'PulseEvent'						PulseEvent						kernel32	1
 imp	'PurgeComm'						PurgeComm						kernel32	2
-imp	'QueryPerformanceCounter'				QueryPerformanceCounter					kernel32	1	# Windows 7+
+imp	'QueryPerformanceCounter'				QueryPerformanceCounter					kernel32	1
 imp	'QueryPerformanceFrequency'				QueryPerformanceFrequency				kernel32	1
-imp	'QueryUnbiasedInterruptTime'				QueryUnbiasedInterruptTime				kernel32	1	# Windows 7+
 imp	'ReadConsole'						ReadConsoleW						kernel32	5
 imp	'ReadConsoleInput'					ReadConsoleInputW					kernel32	4
 imp	'ReadConsoleOutput'					ReadConsoleOutputW					kernel32	5
@@ -269,7 +257,6 @@ imp	'SetEvent'						SetEvent						kernel32	1
 imp	'SetFileAttributes'					SetFileAttributesW					kernel32	2
 imp	'SetFileCompletionNotificationModes'			SetFileCompletionNotificationModes			kernel32	2
 imp	'SetFileInformationByHandle'				SetFileInformationByHandle				kernel32	4
-imp	'SetFilePointer'					SetFilePointer						kernel32	4
 imp	'SetFileTime'						SetFileTime						kernel32	4
 imp	'SetFileValidData'					SetFileValidData					kernel32	2
 imp	'SetHandleCount'					SetHandleCount						kernel32	1
@@ -307,10 +294,11 @@ imp	'UnlockFile'						UnlockFile						kernel32	5
 imp	'UnmapViewOfFile2'					UnmapViewOfFile2					kernel32	2
 imp	'UnmapViewOfFileEx'					UnmapViewOfFileEx					kernel32	3
 imp	'UpdateProcThreadAttribute'				UpdateProcThreadAttribute				kernel32	7
+imp	'VirtualAlloc'						VirtualAlloc						kernel32	4
+imp	'VirtualAllocEx'					VirtualAllocEx						kernel32	5
 imp	'VirtualFree'						VirtualFree						kernel32	3
 imp	'VirtualLock'						VirtualLock						kernel32	2
 imp	'VirtualQuery'						VirtualQuery						kernel32	3
-imp	'VirtualQueryEx'					VirtualQueryEx						kernel32	4
 imp	'VirtualUnlock'						VirtualUnlock						kernel32	2
 imp	'WaitForMultipleObjectsEx'				WaitForMultipleObjectsEx				kernel32	5
 imp	'WaitForSingleObjectEx'					WaitForSingleObjectEx					kernel32	3
@@ -361,7 +349,6 @@ imp	'RegLoadKey'						RegLoadKeyW						advapi32	3
 imp	'RegNotifyChangeKeyValue'				RegNotifyChangeKeyValue					advapi32	5
 imp	'RegOpenCurrentUser'					RegOpenCurrentUser					advapi32	2
 imp	'RegOpenKeyEx'						RegOpenKeyExW						advapi32	5
-imp	'RegOpenKeyExA'						RegOpenKeyExA						advapi32	5
 imp	'RegOpenUserClassesRoot'				RegOpenUserClassesRoot					advapi32	4
 imp	'RegOverridePredefKey'					RegOverridePredefKey					advapi32	2
 imp	'RegQueryInfoKey'					RegQueryInfoKeyW					advapi32	12
@@ -386,7 +373,6 @@ imp	'TraceSetInformation'					TraceSetInformation					advapi32 # Windows 7+
 #
 #	Name							Actual							DLL		Arity
 imp	'AdjustWindowRect'					AdjustWindowRect					user32		3
-imp	'AdjustWindowRectEx'					AdjustWindowRectEx					user32		4
 imp	'AnimateWindow'						AnimateWindow						user32		3
 imp	'AppendMenuA'						AppendMenuA						user32		4
 imp	'AppendMenu'						AppendMenuW						user32		4
@@ -394,9 +380,6 @@ imp	'BeginPaint'						BeginPaint						user32		2
 imp	'BringWindowToTop'					BringWindowToTop					user32		1
 imp	'CallNextHookEx'					CallNextHookEx						user32		4
 imp	'CloseWindow'						CloseWindow						user32		1
-imp	'ClientToScreen'					ClientToScreen						user32		2
-imp	'ClipCursor'						ClipCursor						user32		1
-imp	'CloseClipboard'					CloseClipboard						user32		0
 imp	'CreateIconIndirect'					CreateIconIndirect					user32		1
 imp	'CreateMenu'						CreateMenu						user32		0
 imp	'CreatePopupMenu'					CreatePopupMenu						user32		0
@@ -409,15 +392,12 @@ imp	'DestroyWindow'						DestroyWindow						user32		1
 imp	'DispatchMessage'					DispatchMessageW					user32		1
 imp	'DrawText'						DrawTextW						user32		5
 imp	'DrawTextEx'						DrawTextExW						user32		6
-imp	'EmptyClipboard'					EmptyClipboard						user32		0
 imp	'EndPaint'						EndPaint						user32		2
 imp	'EnumChildWindows'					EnumChildWindows					user32		3
 imp	'FillRect'						FillRect						user32		3
 imp	'FindWindow'						FindWindowW						user32		2
 imp	'FindWindowEx'						FindWindowExW						user32		4
-imp	'GetAsyncKeyState'					GetAsyncKeyState					user32		1
 imp	'GetClientRect'						GetClientRect						user32		2
-imp	'GetClipboardData'					GetClipboardData					user32		1
 imp	'GetCursor'						GetCursor						user32		0
 imp	'GetCursorPos'						GetCursorPos						user32		1
 imp	'GetDC'							GetDC							user32		1
@@ -426,12 +406,9 @@ imp	'GetKeyState'						GetKeyState						user32		1
 imp	'GetKeyboardLayout'					GetKeyboardLayout					user32		1
 imp	'GetMenu'						GetMenu							user32		1
 imp	'GetMessage'						GetMessageW						user32		4
-imp	'GetMonitorInfo'					GetMonitorInfoW						user32		2
-imp	'GetRawInputData'					GetRawInputData						user32		5
 imp	'GetParent'						GetParent						user32		1
 imp	'GetShellWindow'					GetShellWindow						user32		0
 imp	'GetSystemMenu'						GetSystemMenu						user32		2
-imp	'GetSystemMetrics'					GetSystemMetrics					user32		1
 imp	'GetWindow'						GetWindow						user32		2
 imp	'GetWindowPlacement'					GetWindowPlacement					user32		2
 imp	'GetWindowRect'						GetWindowRect						user32		2
@@ -452,22 +429,13 @@ imp	'MapVirtualKeyEx'					MapVirtualKeyExW					user32		3
 imp	'MessageBox'						MessageBoxW						user32		4
 imp	'MessageBoxEx'						MessageBoxExW						user32		5
 imp	'MoveWindow'						MoveWindow						user32		6
-imp	'MonitorFromPoint'					MonitorFromPoint					user32		2
-imp	'MonitorFromWindow'					MonitorFromWindow					user32		2
-imp	'OpenClipboard'						OpenClipboard						user32		1
 imp	'PeekMessage'						PeekMessageW						user32		5
-imp	'PostMessage'						PostMessageW						user32		4
 imp	'PostQuitMessage'					PostQuitMessage						user32		1
-imp	'PtInRect'						PtInRect						user32		2
 imp	'RedrawWindow'						RedrawWindow						user32		4
 imp	'RegisterClass'						RegisterClassW						user32		1
 imp	'RegisterClassEx'					RegisterClassExW					user32		1
-imp	'RegisterRawInputDevices'				RegisterRawInputDevices					user32		3
 imp	'ReleaseCapture'					ReleaseCapture						user32		0
 imp	'ReleaseDC'						ReleaseDC						user32		2
-imp	'ScreenToClient'					ScreenToClient						user32		2
-imp	'SetClipboardData'					SetClipboardData					user32		2
-imp	'SetCursorPos'						SetCursorPos						user32		2
 imp	'SendMessage'						SendMessageW						user32		4
 imp	'SetCapture'						SetCapture						user32		1
 imp	'SetClassLong'						SetClassLongW						user32		3
@@ -475,7 +443,6 @@ imp	'SetCursor'						SetCursor						user32		1
 imp	'SetParent'						SetParent						user32		2
 imp	'SetTimer'						SetTimer						user32		4
 imp	'SetWindowLong'						SetWindowLongW						user32		3
-imp	'SetWindowLongPtr'					SetWindowLongPtrW					user32		3
 imp	'SetWindowPlacement'					SetWindowPlacement					user32		2
 imp	'SetWindowPos'						SetWindowPos						user32		7
 imp	'SetWindowText'						SetWindowTextW						user32		2
@@ -484,23 +451,12 @@ imp	'SetWindowsHookEx'					SetWindowsHookExW					user32		4
 imp	'ShowCaret'						ShowCaret						user32		1
 imp	'ShowCursor'						ShowCursor						user32		1
 imp	'ShowWindow'						ShowWindow						user32		2
-imp	'TrackMouseEvent'					TrackMouseEvent						user32		1
 imp	'TrackPopupMenu'					TrackPopupMenu						user32		7
 imp	'TranslateMessage'					TranslateMessage					user32		1
 imp	'UnhookWindowsHook'					UnhookWindowsHook					user32		2
 imp	'UnhookWindowsHookEx'					UnhookWindowsHookEx					user32		1
-imp	'UnregisterClass'					UnregisterClassW					user32		2
 imp	'UpdateWindow'						UpdateWindow						user32		1
 imp	'WaitForInputIdle'					WaitForInputIdle					user32		2
-imp	'WindowFromPoint'					WindowFromPoint						user32		1
-
-# SHELL32.DLL
-#
-#	Name							Actual							DLL		Arity
-imp	'CommandLineToArgv'					CommandLineToArgvW					shell32		2
-imp	'DragAcceptFiles'					DragAcceptFiles						shell32		2
-imp	'DragFinish'						DragFinish						shell32		1
-imp	'DragQueryFile'						DragQueryFileW						shell32		4
 
 # GDI32.DLL
 #
@@ -514,7 +470,6 @@ imp	'CreateDIBSection'					CreateDIBSection					gdi32		6
 imp	'CreateRectRgn'						CreateRectRgn						gdi32		4
 imp	'DeleteDC'						DeleteDC						gdi32		1
 imp	'DeleteObject'						DeleteObject						gdi32		1
-imp	'DescribePixelFormat'					DescribePixelFormat					gdi32		4
 imp	'GetPixel'						GetPixel						gdi32		3
 imp	'RestoreDC'						RestoreDC						gdi32		2
 imp	'SaveDC'						SaveDC							gdi32		1
@@ -678,13 +633,6 @@ imp	'WakeByAddressSingle'					WakeByAddressSingle					API-MS-Win-Core-Synch-l1-2
 imp	'MapViewOfFile3'					MapViewOfFile3						API-MS-Win-Core-Memory-l1-1-6		9
 imp	'VirtualAlloc2'						VirtualAlloc2						API-MS-Win-Core-Memory-l1-1-6		7
 
-# API-MS-Win-Core-Realtime-l1-1-1.dll (Windows 10+)
-#
-#	Name							Actual							DLL					Arity
-imp	'QueryInterruptTime'					QueryInterruptTime					API-MS-Win-Core-Realtime-l1-1-1		1
-imp	'QueryInterruptTimePrecise'				QueryInterruptTimePrecise				API-MS-Win-Core-Realtime-l1-1-1		1
-imp	'QueryUnbiasedInterruptTimePrecise'			QueryUnbiasedInterruptTimePrecise			API-MS-Win-Core-Realtime-l1-1-1		1
-
 # NTDLL.DLL
 # BEYOND THE PALE
 #
@@ -759,7 +707,6 @@ imp	'NtQuerySecurityObject'					NtQuerySecurityObject					ntdll		5
 imp	'NtQuerySymbolicLinkObject'				NtQuerySymbolicLinkObject				ntdll		3
 imp	'NtQuerySystemInformation'				NtQuerySystemInformation				ntdll		4
 imp	'NtQuerySystemTime'					NtQuerySystemTime					ntdll		1
-imp	'NtQueryTimerResolution'				NtQueryTimerResolution					ntdll		3
 imp	'NtQueryValueKey'					NtQueryValueKey						ntdll		6
 imp	'NtQueryVirtualMemory'					NtQueryVirtualMemory					ntdll		6
 imp	'NtQueryVolumeInformationFile'				NtQueryVolumeInformationFile				ntdll		5
@@ -776,7 +723,6 @@ imp	'NtSetInformationFile'					NtSetInformationFile					ntdll		5
 imp	'NtSetInformationThread'				NtSetInformationThread					ntdll		4
 imp	'NtSetIntervalProfile'					NtSetIntervalProfile					ntdll		2
 imp	'NtSetTimer'						NtSetTimer						ntdll		7
-imp	'NtSetTimerResolution'					NtSetTimerResolution					ntdll		3
 imp	'NtSetValueKey'						NtSetValueKey						ntdll		6
 imp	'NtSignalAndWaitForSingleObject'			NtSignalAndWaitForSingleObject				ntdll		4
 imp	'NtStartProfile'					NtStartProfile						ntdll		1
diff --git a/libc/nt/memory.h b/libc/nt/memory.h
index 9f6792657..376f0fb16 100644
--- a/libc/nt/memory.h
+++ b/libc/nt/memory.h
@@ -71,17 +71,8 @@ bool32 VirtualUnlock(const void *lpAddress, size_t dwSize);
 uint64_t VirtualQuery(const void *lpAddress,
                       struct NtMemoryBasicInformation *lpBuffer,
                       uint64_t dwLength);
-uint64_t VirtualQueryEx(int64_t hProcess, const void *lpAddress,
-                        struct NtMemoryBasicInformation *lpBuffer,
-                        uint64_t dwLength);
-
 void *VirtualAllocEx(int64_t hProcess, void *lpAddress, uint64_t dwSize,
                      uint32_t flAllocationType, uint32_t flProtect);
-bool32 VirtualProtectEx(int64_t hProcess, void *lpAddress, uint64_t dwSize,
-                        uint32_t flNewProtect, uint32_t *out_lpflOldProtect);
-bool32 WriteProcessMemory(int64_t hProcess, void *lpBaseAddress,
-                          const void *lpBuffer, uint64_t nSize,
-                          uint64_t *opt_out_lpNumberOfBytesWritten);
 
 int64_t GetProcessHeap(void);
 void *HeapAlloc(int64_t hHeap, uint32_t dwFlags, size_t dwBytes) __wur;
diff --git a/libc/nt/nls.h b/libc/nt/nls.h
deleted file mode 100644
index 4e2761519..000000000
--- a/libc/nt/nls.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_NT_NLS_H_
-#define COSMOPOLITAN_LIBC_NT_NLS_H_
-#include "libc/nt/struct/cpinfoex.h"
-/*                            ░░░░
-                       ▒▒▒░░░▒▒▒▒▒▒▒▓▓▓░
-                      ▒▒▒▒░░░▒▒▒▒▒▒▓▓▓▓▓▓░
-                     ▒▒▒▒░░░▒▒▒▒▒▒▒▓▓▓▓▓▓  ▒▓░
-                     ▒▒▒░░░░▒▒▒▒▒▒▓▓▓▓▓▓   ▓▓▓▓▓▓▒        ▒▒▒▓▓█
-                    ▒▒▒▒░░░▒▒▒▒▒▒▒▓▓▓▓▓▓  ▓▓▓▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓▓
-                   ░▒▒▒░░░░▒▒▒▒▒▒▓▓▓▓▓▓   █▓▓▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓█
-                   ▒▒▒▒░░░▒▒▒▒▒▒▒▓▓▓▓▓░  ▓▓▓▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓▓
-                  ▒▒▒▒░░░▒▒▒▒▒▒▒▓▓▓▓▓▓  ▒▓▓▓▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓▒
-                  ▒▒▒▒▓▓      ▓▒▒▓▓▓▓   ▓▓▓▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓█
-                                   ▒▓  ▓▓▓▓▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓
-                  ░░░░░░░░░░░▒▒▒▒      ▓▓▓▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓█
-                ▒▒░░░░░░░░░░▒▒▒▒▒▓▓▓     ▓▓▓▓▓▒▒▒▒▒▒▒▒▒▒▓▓▓
-               ░▒░░░░░░░░░░░▒▒▒▒▒▓▓   ▓░      ░▓███▓
-               ▒▒░░░░░░░░░░▒▒▒▒▒▓▓░  ▒▓▓▓▒▒▒         ░▒▒▒▓   ████████████
-              ▒▒░░░░░░░░░░░▒▒▒▒▒▓▓  ▒▓▓▓▓▒▒▒▒▒▒▒▒░░░▒▒▒▒▒░           ░███
-              ▒░░░░░░░░░░░▒▒▒▒▒▓▓   ▓▓▓▓▒▒▒▒▒▒▒▒░░░░▒▒▒▒▓            ███
-             ▒▒░░░░░░░░░░▒▒▒▒▒▒▓▓  ▒▓▓▓▒▒▒▒▒▒▒▒░░░░▒▒▒▒▒            ▓██
-             ▒░░░░░░░░░░░▒▒▒▒▒▓▓   ▓▓▓▓▒▒▒▒▒▒▒▒░░░▒▒▒▒▒▓           ▓██
-            ▒▒░░░▒▒▒░░░▒▒░▒▒▒▓▓▒  ▒▓▓▓▒▒▒▒▒▒▒▒░░░░▒▒▒▒▒           ███
-                            ░▒▓  ░▓▓▓▓▒▒▒▒▒▒▒▒░░░░▒▒▒▒▓          ▓██
-╔────────────────────────────────────────────────────────────────▀▀▀─────────│─╗
-│ cosmopolitan § new technology » internationalization                      ─╬─│┼
-╚────────────────────────────────────────────────────────────────────────────│*/
-COSMOPOLITAN_C_START_
-
-uint32_t GetOEMCP();
-uint32_t GetACP();
-bool32 GetCPInfoEx(uint32_t CodePage, uint32_t dwFlags, struct NtCpInfoEx *out_lpCPInfoEx) paramsnonnull((3));
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_NT_NLS_H_ */
\ No newline at end of file
diff --git a/libc/nt/ntdll.h b/libc/nt/ntdll.h
index f251b923a..04a8e60f3 100644
--- a/libc/nt/ntdll.h
+++ b/libc/nt/ntdll.h
@@ -224,16 +224,6 @@ NtStatus RtlUnlockHeap(int64_t heap);
 NtStatus RtlGetProcessHeaps(uint32_t count, void **out_Heaps);
 NtStatus RtlWalkHeap(int64_t heap, void *out_Info);
 
-/*───────────────────────────────────────────────────────────────────────────│─╗
-│ cosmopolitan § new technology » beyond the pale » i am the time lorde    ─╬─│┼
-╚────────────────────────────────────────────────────────────────────────────│*/
-
-NtStatus NtSetTimerResolution(uint32_t DesiredResolution, bool32 SetResolution,
-                              uint32_t *out_CurrentResolution);
-NtStatus NtQueryTimerResolution(uint32_t *out_MinimumResolution,
-                                uint32_t *out_MaximumResolution,
-                                uint32_t *out_CurrentResolution);
-
 #if ShouldUseMsabiAttribute()
 #include "libc/nt/thunk/ntdll.inc"
 #endif /* ShouldUseMsabiAttribute() */
diff --git a/libc/nt/ntdll/NtQueryTimerResolution.S b/libc/nt/ntdll/NtQueryTimerResolution.S
deleted file mode 100644
index 2bb696be7..000000000
--- a/libc/nt/ntdll/NtQueryTimerResolution.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/ntdllimport.h"
-.ntimp	NtQueryTimerResolution,NtQueryTimerResolution
-
-	.text.windows
-	.ftrace1
-NtQueryTimerResolution:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_NtQueryTimerResolution(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	NtQueryTimerResolution,globl
-	.previous
diff --git a/libc/nt/ntdll/NtSetTimerResolution.S b/libc/nt/ntdll/NtSetTimerResolution.S
deleted file mode 100644
index bbd707afe..000000000
--- a/libc/nt/ntdll/NtSetTimerResolution.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/ntdllimport.h"
-.ntimp	NtSetTimerResolution,NtSetTimerResolution
-
-	.text.windows
-	.ftrace1
-NtSetTimerResolution:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_NtSetTimerResolution(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	NtSetTimerResolution,globl
-	.previous
diff --git a/libc/nt/ntdllimport.S b/libc/nt/ntdllimport.S
index 438f340f9..b84411560 100644
--- a/libc/nt/ntdllimport.S
+++ b/libc/nt/ntdllimport.S
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/nt/enum/status.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #ifdef __x86_64__
 
 //	@fileoverview NTDLL.DLL Non-Mandatory Importer
diff --git a/libc/nt/ntdllimport.h b/libc/nt/ntdllimport.h
index 657ed05ea..6eb8e93f6 100644
--- a/libc/nt/ntdllimport.h
+++ b/libc/nt/ntdllimport.h
@@ -19,7 +19,7 @@
 #ifndef COSMOPOLITAN_LIBC_NT_NTDLLIMPORT_H_
 #define COSMOPOLITAN_LIBC_NT_NTDLLIMPORT_H_
 #include "ape/relocations.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #ifdef __ASSEMBLER__
 /* clang-format off */
 
diff --git a/libc/nt/registry.h b/libc/nt/registry.h
index a03abfc57..d7f8abb99 100644
--- a/libc/nt/registry.h
+++ b/libc/nt/registry.h
@@ -51,8 +51,6 @@ int RegOpenKey(int64_t hKey, const char16_t *opt_lpSubKey,
 int RegOpenKeyEx(int64_t hKey, const char16_t *opt_lpSubKey,
                  uint32_t opt_ulOptions, int samDesired, int64_t *out_phkResult)
     paramsnonnull((5));
-int RegOpenKeyExA(int64_t hKey, const char *opt_lpSubKey, uint32_t opt_ulOptions, 
-                  int samDesired, int64_t *out_phkResult) paramsnonnull((5));
 int RegCloseKey(int64_t hKey);
 
 int RegGetValue(int64_t hkey, const char16_t *opt_lpSubKey,
diff --git a/libc/nt/runtime.h b/libc/nt/runtime.h
index 5aa2df862..953e77692 100644
--- a/libc/nt/runtime.h
+++ b/libc/nt/runtime.h
@@ -43,8 +43,6 @@ bool32 SetDefaultDllDirectories(unsigned dirflags);
 bool32 ProcessPrng(void *RandomBuffer, uint32_t RandomBufferLength);
 uint32_t GetModuleFileName(int64_t hModule, char16_t *lpFilename,
                            uint32_t nSize);
-bool32 IsWow64Process2(intptr_t hProcess, uint16_t *out_pProcessMachine,
-                       uint16_t *out_opt_pNativeMachine);
 
 #if ShouldUseMsabiAttribute()
 #include "libc/nt/thunk/runtime.inc"
diff --git a/libc/nt/shell32/CommandLineToArgvW.S b/libc/nt/shell32/CommandLineToArgvW.S
deleted file mode 100644
index ef2f71100..000000000
--- a/libc/nt/shell32/CommandLineToArgvW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	shell32,__imp_CommandLineToArgvW,CommandLineToArgvW
-
-	.text.windows
-	.ftrace1
-CommandLineToArgv:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_CommandLineToArgvW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	CommandLineToArgv,globl
-	.previous
diff --git a/libc/nt/shell32/DragAcceptFiles.S b/libc/nt/shell32/DragAcceptFiles.S
deleted file mode 100644
index 2c3c78775..000000000
--- a/libc/nt/shell32/DragAcceptFiles.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	shell32,__imp_DragAcceptFiles,DragAcceptFiles
-
-	.text.windows
-	.ftrace1
-DragAcceptFiles:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_DragAcceptFiles(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	DragAcceptFiles,globl
-	.previous
diff --git a/libc/nt/shell32/DragFinish.S b/libc/nt/shell32/DragFinish.S
deleted file mode 100644
index 5bb00758e..000000000
--- a/libc/nt/shell32/DragFinish.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	shell32,__imp_DragFinish,DragFinish
-
-	.text.windows
-	.ftrace1
-DragFinish:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_DragFinish(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	DragFinish,globl
-	.previous
diff --git a/libc/nt/shell32/DragQueryFileW.S b/libc/nt/shell32/DragQueryFileW.S
deleted file mode 100644
index efec6118d..000000000
--- a/libc/nt/shell32/DragQueryFileW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	shell32,__imp_DragQueryFileW,DragQueryFileW
-
-	.text.windows
-	.ftrace1
-DragQueryFile:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_DragQueryFileW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	DragQueryFile,globl
-	.previous
diff --git a/libc/nt/struct/arm64.h b/libc/nt/struct/arm64.h
deleted file mode 100644
index 295da0fcd..000000000
--- a/libc/nt/struct/arm64.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_NT_STRUCT_ARM64_H_
-#define COSMOPOLITAN_LIBC_NT_STRUCT_ARM64_H_
-
-struct NtArm64RuntimeFunction {
-  uint32_t BeginAddress;
-  union {
-    uint32_t UnwindData;
-    struct {
-      uint32_t Flag : 2;
-      uint32_t FunctionLength : 11;
-      uint32_t RegF : 3;
-      uint32_t RegI : 4;
-      uint32_t H : 1;
-      uint32_t CR : 2;
-      uint32_t FrameSize : 9;
-    };
-  };
-};
-
-#endif /* COSMOPOLITAN_LIBC_NT_STRUCT_ARM64_H_ */
diff --git a/libc/nt/struct/cpinfoex.h b/libc/nt/struct/cpinfoex.h
deleted file mode 100644
index 754501bb5..000000000
--- a/libc/nt/struct/cpinfoex.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_NT_STRUCT_CPINFOEX_H_
-#define COSMOPOLITAN_LIBC_NT_STRUCT_CPINFOEX_H_
-
-struct NtCpInfoEx {
-  uint32_t MaxCharSize;
-  uint8_t DefaultChar[2];
-  uint8_t LeadByte[12];
-  char16_t UnicodeDefaultChar;
-  uint32_t CodePage;
-  char16_t CodePageName[260];
-};
-
-#endif /* COSMOPOLITAN_LIBC_NT_STRUCT_CPINFOEX_H_ */
diff --git a/libc/nt/struct/iovec.h b/libc/nt/struct/iovec.h
index e2898da96..b29f4bd8b 100644
--- a/libc/nt/struct/iovec.h
+++ b/libc/nt/struct/iovec.h
@@ -7,7 +7,7 @@ struct NtIovec {
   char *buf;
 };
 
-void _DescribeIovNt(const struct NtIovec *, uint32_t, ssize_t);
+void DescribeIovNt(const struct NtIovec *, uint32_t, ssize_t);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_NT_STRUCT_IOVEC_H_ */
diff --git a/libc/nt/struct/memextendedparameter.h b/libc/nt/struct/memextendedparameter.h
index 5fdd3985c..6cd4d0f5d 100644
--- a/libc/nt/struct/memextendedparameter.h
+++ b/libc/nt/struct/memextendedparameter.h
@@ -9,28 +9,26 @@
 #define kNtMemExtendedParameterPartitionHandle     3
 #define kNtMemExtendedParameterUserPhysicalHandle  4
 #define kNtMemExtendedParameterAttributeFlags      5
-#define kNtMemExtendedParameterImageMachine        6
-#define kNtMemExtendedParameterMax                 7
+#define kNtMemExtendedParameterMax                 6
 
 #define kNtMemExtendedParameterGraphics          0x00000001
 #define kNtMemExtendedParameterNonpaged          0x00000002
 #define kNtMemExtendedParameterZeroPagesOptional 0x00000004
 #define kNtMemExtendedParameterNonpagedLarge     0x00000008
 #define kNtMemExtendedParameterNonpagedHuge      0x00000010
-#define kNtMemExtendedParameterSoftFaultPages    0x00000020
-#define kNtMemExtendedParameterEcCode            0x00000040
-#define kNtMemExtendedParameterImageNoHpat       0x00000080
 
 struct NtMemExtendedParameter {
-  uint8_t Type;
-  uint8_t Reserved[7];
+  struct {
+    uint64_t Type : kNtMemExtendedParameterTypeBits;
+    uint64_t Reserved : 64 - kNtMemExtendedParameterTypeBits;
+  } DUMMYSTRUCTNAME;
   union {
     uint64_t ULong64;
     void *Pointer;
     size_t Size;
     intptr_t Handle;
     unsigned ULong;
-  };
+  } DUMMYUNIONNAME;
 };
 
 #endif /* COSMOPOLITAN_LIBC_NT_STRUCT_MEMEXTENDEDPARAMETER_H_ */
diff --git a/libc/nt/struct/securityattributes.h b/libc/nt/struct/securityattributes.h
index 05145944c..e481ede22 100644
--- a/libc/nt/struct/securityattributes.h
+++ b/libc/nt/struct/securityattributes.h
@@ -9,9 +9,9 @@ struct NtSecurityAttributes {
   bool32 bInheritHandle;
 };
 
-const char *_DescribeNtSecurityAttributes(char[32],
-                                          const struct NtSecurityAttributes *);
+const char *DescribeNtSecurityAttributes(char[32],
+                                         const struct NtSecurityAttributes *);
 #define DescribeNtSecurityAttributes(x) \
-  _DescribeNtSecurityAttributes(alloca(32), x)
+  DescribeNtSecurityAttributes(alloca(32), x)
 
 #endif /* COSMOPOLITAN_LIBC_NT_STRUCT_SECURITYATTRIBUTES_H_ */
diff --git a/libc/nt/time.h b/libc/nt/time.h
index a415075f9..59c359ca3 100644
--- a/libc/nt/time.h
+++ b/libc/nt/time.h
@@ -33,10 +33,5 @@ uint32_t GetTimeZoneInformation(
 uint32_t GetDynamicTimeZoneInformation(
     struct NtDynamicTimeZoneInformation *out_lpTimeZoneInformation);
 
-bool32 QueryInterruptTime(uint64_t *);                /* Windows 10+ */
-bool32 QueryInterruptTimePrecise(uint64_t *);         /* Windows 10+ */
-bool32 QueryUnbiasedInterruptTime(uint64_t *);        /* Windows  7+ */
-bool32 QueryUnbiasedInterruptTimePrecise(uint64_t *); /* Windows 10+ */
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_NT_TIME_H_ */
diff --git a/libc/nt/user32/AdjustWindowRectEx.S b/libc/nt/user32/AdjustWindowRectEx.S
deleted file mode 100644
index 04416b211..000000000
--- a/libc/nt/user32/AdjustWindowRectEx.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_AdjustWindowRectEx,AdjustWindowRectEx
-
-	.text.windows
-	.ftrace1
-AdjustWindowRectEx:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_AdjustWindowRectEx(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	AdjustWindowRectEx,globl
-	.previous
diff --git a/libc/nt/user32/ClientToScreen.S b/libc/nt/user32/ClientToScreen.S
deleted file mode 100644
index 9bc45dee8..000000000
--- a/libc/nt/user32/ClientToScreen.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_ClientToScreen,ClientToScreen
-
-	.text.windows
-	.ftrace1
-ClientToScreen:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_ClientToScreen(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	ClientToScreen,globl
-	.previous
diff --git a/libc/nt/user32/ClipCursor.S b/libc/nt/user32/ClipCursor.S
deleted file mode 100644
index f2c7f19ec..000000000
--- a/libc/nt/user32/ClipCursor.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_ClipCursor,ClipCursor
-
-	.text.windows
-	.ftrace1
-ClipCursor:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_ClipCursor(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	ClipCursor,globl
-	.previous
diff --git a/libc/nt/user32/CloseClipboard.S b/libc/nt/user32/CloseClipboard.S
deleted file mode 100644
index b57e82022..000000000
--- a/libc/nt/user32/CloseClipboard.S
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_CloseClipboard,CloseClipboard
-
-	.text.windows
-	.ftrace1
-CloseClipboard:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	sub	$32,%rsp
-	call	*__imp_CloseClipboard(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	CloseClipboard,globl
-	.previous
diff --git a/libc/nt/user32/EmptyClipboard.S b/libc/nt/user32/EmptyClipboard.S
deleted file mode 100644
index 6038d29da..000000000
--- a/libc/nt/user32/EmptyClipboard.S
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_EmptyClipboard,EmptyClipboard
-
-	.text.windows
-	.ftrace1
-EmptyClipboard:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	sub	$32,%rsp
-	call	*__imp_EmptyClipboard(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	EmptyClipboard,globl
-	.previous
diff --git a/libc/nt/user32/GetAsyncKeyState.S b/libc/nt/user32/GetAsyncKeyState.S
deleted file mode 100644
index 9b1d32e1d..000000000
--- a/libc/nt/user32/GetAsyncKeyState.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_GetAsyncKeyState,GetAsyncKeyState
-
-	.text.windows
-	.ftrace1
-GetAsyncKeyState:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_GetAsyncKeyState(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	GetAsyncKeyState,globl
-	.previous
diff --git a/libc/nt/user32/GetClipboardData.S b/libc/nt/user32/GetClipboardData.S
deleted file mode 100644
index b51af7d3a..000000000
--- a/libc/nt/user32/GetClipboardData.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_GetClipboardData,GetClipboardData
-
-	.text.windows
-	.ftrace1
-GetClipboardData:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_GetClipboardData(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	GetClipboardData,globl
-	.previous
diff --git a/libc/nt/user32/GetMonitorInfoW.S b/libc/nt/user32/GetMonitorInfoW.S
deleted file mode 100644
index 2ec6986fe..000000000
--- a/libc/nt/user32/GetMonitorInfoW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_GetMonitorInfoW,GetMonitorInfoW
-
-	.text.windows
-	.ftrace1
-GetMonitorInfo:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_GetMonitorInfoW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	GetMonitorInfo,globl
-	.previous
diff --git a/libc/nt/user32/GetRawInputData.S b/libc/nt/user32/GetRawInputData.S
deleted file mode 100644
index 6324125ea..000000000
--- a/libc/nt/user32/GetRawInputData.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_GetRawInputData,GetRawInputData
-
-	.text.windows
-	.ftrace1
-GetRawInputData:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_GetRawInputData(%rip),%rax
-	jmp	__sysv2nt6
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	GetRawInputData,globl
-	.previous
diff --git a/libc/nt/user32/GetSystemMetrics.S b/libc/nt/user32/GetSystemMetrics.S
deleted file mode 100644
index 4a92d3184..000000000
--- a/libc/nt/user32/GetSystemMetrics.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_GetSystemMetrics,GetSystemMetrics
-
-	.text.windows
-	.ftrace1
-GetSystemMetrics:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_GetSystemMetrics(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	GetSystemMetrics,globl
-	.previous
diff --git a/libc/nt/user32/MonitorFromPoint.S b/libc/nt/user32/MonitorFromPoint.S
deleted file mode 100644
index b27b7a8aa..000000000
--- a/libc/nt/user32/MonitorFromPoint.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_MonitorFromPoint,MonitorFromPoint
-
-	.text.windows
-	.ftrace1
-MonitorFromPoint:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_MonitorFromPoint(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	MonitorFromPoint,globl
-	.previous
diff --git a/libc/nt/user32/MonitorFromWindow.S b/libc/nt/user32/MonitorFromWindow.S
deleted file mode 100644
index ec49593a4..000000000
--- a/libc/nt/user32/MonitorFromWindow.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_MonitorFromWindow,MonitorFromWindow
-
-	.text.windows
-	.ftrace1
-MonitorFromWindow:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_MonitorFromWindow(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	MonitorFromWindow,globl
-	.previous
diff --git a/libc/nt/user32/OpenClipboard.S b/libc/nt/user32/OpenClipboard.S
deleted file mode 100644
index f6b8afb7e..000000000
--- a/libc/nt/user32/OpenClipboard.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_OpenClipboard,OpenClipboard
-
-	.text.windows
-	.ftrace1
-OpenClipboard:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_OpenClipboard(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	OpenClipboard,globl
-	.previous
diff --git a/libc/nt/user32/PostMessageW.S b/libc/nt/user32/PostMessageW.S
deleted file mode 100644
index 5da8cf132..000000000
--- a/libc/nt/user32/PostMessageW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_PostMessageW,PostMessageW
-
-	.text.windows
-	.ftrace1
-PostMessage:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_PostMessageW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	PostMessage,globl
-	.previous
diff --git a/libc/nt/user32/RegisterRawInputDevices.S b/libc/nt/user32/RegisterRawInputDevices.S
deleted file mode 100644
index 9f100d244..000000000
--- a/libc/nt/user32/RegisterRawInputDevices.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_RegisterRawInputDevices,RegisterRawInputDevices
-
-	.text.windows
-	.ftrace1
-RegisterRawInputDevices:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_RegisterRawInputDevices(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	RegisterRawInputDevices,globl
-	.previous
diff --git a/libc/nt/user32/ScreenToClient.S b/libc/nt/user32/ScreenToClient.S
deleted file mode 100644
index 3c1d2ca54..000000000
--- a/libc/nt/user32/ScreenToClient.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_ScreenToClient,ScreenToClient
-
-	.text.windows
-	.ftrace1
-ScreenToClient:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_ScreenToClient(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	ScreenToClient,globl
-	.previous
diff --git a/libc/nt/user32/SetClipboardData.S b/libc/nt/user32/SetClipboardData.S
deleted file mode 100644
index ca8e59591..000000000
--- a/libc/nt/user32/SetClipboardData.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_SetClipboardData,SetClipboardData
-
-	.text.windows
-	.ftrace1
-SetClipboardData:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_SetClipboardData(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	SetClipboardData,globl
-	.previous
diff --git a/libc/nt/user32/SetCursorPos.S b/libc/nt/user32/SetCursorPos.S
deleted file mode 100644
index f29847884..000000000
--- a/libc/nt/user32/SetCursorPos.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_SetCursorPos,SetCursorPos
-
-	.text.windows
-	.ftrace1
-SetCursorPos:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_SetCursorPos(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	SetCursorPos,globl
-	.previous
diff --git a/libc/nt/user32/SetWindowLongPtrW.S b/libc/nt/user32/SetWindowLongPtrW.S
deleted file mode 100644
index 3ff044c05..000000000
--- a/libc/nt/user32/SetWindowLongPtrW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_SetWindowLongPtrW,SetWindowLongPtrW
-
-	.text.windows
-	.ftrace1
-SetWindowLongPtr:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_SetWindowLongPtrW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	SetWindowLongPtr,globl
-	.previous
diff --git a/libc/nt/user32/TrackMouseEvent.S b/libc/nt/user32/TrackMouseEvent.S
deleted file mode 100644
index 7220d178a..000000000
--- a/libc/nt/user32/TrackMouseEvent.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_TrackMouseEvent,TrackMouseEvent
-
-	.text.windows
-	.ftrace1
-TrackMouseEvent:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_TrackMouseEvent(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	TrackMouseEvent,globl
-	.previous
diff --git a/libc/nt/user32/UnregisterClassW.S b/libc/nt/user32/UnregisterClassW.S
deleted file mode 100644
index 398af1b53..000000000
--- a/libc/nt/user32/UnregisterClassW.S
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_UnregisterClassW,UnregisterClassW
-
-	.text.windows
-	.ftrace1
-UnregisterClass:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	__imp_UnregisterClassW(%rip),%rax
-	jmp	__sysv2nt
-#elif defined(__aarch64__)
-	mov	x0,#0
-	ret
-#endif
-	.endfn	UnregisterClass,globl
-	.previous
diff --git a/libc/nt/user32/WindowFromPoint.S b/libc/nt/user32/WindowFromPoint.S
deleted file mode 100644
index eaf6c0410..000000000
--- a/libc/nt/user32/WindowFromPoint.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "libc/nt/codegen.h"
-.imp	user32,__imp_WindowFromPoint,WindowFromPoint
-
-	.text.windows
-	.ftrace1
-WindowFromPoint:
-	.ftrace2
-#ifdef __x86_64__
-	push	%rbp
-	mov	%rsp,%rbp
-	mov	%rdi,%rcx
-	sub	$32,%rsp
-	call	*__imp_WindowFromPoint(%rip)
-	leave
-#elif defined(__aarch64__)
-	mov	x0,#0
-#endif
-	ret
-	.endfn	WindowFromPoint,globl
-	.previous
diff --git a/libc/proc/BUILD.mk b/libc/proc/BUILD.mk
index 8491e5635..1ddefad2b 100644
--- a/libc/proc/BUILD.mk
+++ b/libc/proc/BUILD.mk
@@ -30,14 +30,11 @@ LIBC_PROC_A_DIRECTDEPS =				\
 	LIBC_MEM					\
 	LIBC_NEXGEN32E					\
 	LIBC_NT_KERNEL32				\
-	LIBC_NT_NTDLL					\
 	LIBC_NT_PSAPI					\
 	LIBC_RUNTIME					\
 	LIBC_STR					\
 	LIBC_SYSV					\
 	LIBC_SYSV_CALLS					\
-	THIRD_PARTY_DLMALLOC				\
-	THIRD_PARTY_GDTOA				\
 	THIRD_PARTY_NSYNC				\
 
 LIBC_PROC_A_DEPS :=					\
diff --git a/libc/system/cocmd.c b/libc/proc/cocmd.c
similarity index 96%
rename from libc/system/cocmd.c
rename to libc/proc/cocmd.c
index 36bbbf5ba..4632ab559 100644
--- a/libc/system/cocmd.c
+++ b/libc/proc/cocmd.c
@@ -29,7 +29,7 @@
 #include "libc/intrin/getenv.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/serialize.h"
 #include "libc/stdio/stdio.h"
@@ -743,12 +743,15 @@ static int TryBuiltin(bool wantexec) {
     return Usleep();
   if (!strcmp(args[0], "toupper"))
     return Toupper();
-  if (!strcmp(args[0], "tr"))
-    return Fake(_tr, wantexec);
-  if (!strcmp(args[0], "sed"))
-    return Fake(_sed, wantexec);
-  if (_weaken(_awk) && strcmp(args[0], "awk"))
+  if (_weaken(_tr) && !strcmp(args[0], "tr")) {
+    return Fake(_weaken(_tr), wantexec);
+  }
+  if (_weaken(_sed) && !strcmp(args[0], "sed")) {
+    return Fake(_weaken(_sed), wantexec);
+  }
+  if (_weaken(_awk) && !strcmp(args[0], "awk")) {
     return Fake(_weaken(_awk), wantexec);
+  }
   if (_weaken(_curl) && !strcmp(args[0], "curl")) {
     return Fake(_weaken(_curl), wantexec);
   }
@@ -1052,9 +1055,12 @@ int _cocmd(int argc, char **argv, char **envp) {
   unsupported['('] = true;
   unsupported[')'] = true;
   unsupported['{'] = true;
-  // Perl t/op/exec.t depends on unpaired } being
-  // passed from the shell to Perl
-  unsupported['}'] = false;
+  unsupported['}'] = false;  // Perl t/op/exec.t depends on unpaired } being
+                             // passed from the shell to Perl
+  if (!_weaken(glob)) {
+    unsupported['*'] = true;
+    unsupported['?'] = true;
+  }
 
   if (argc >= 3 && !strcmp(argv[1], "--")) {
     for (i = 2; i < argc; ++i) {
@@ -1115,16 +1121,18 @@ int _cocmd(int argc, char **argv, char **envp) {
         Open(GetRedirectArg(prog, arg, 1), 0, O_RDONLY);
       } else {
         int globrc = GLOB_NOMATCH;
-        globrc = glob(arg, globFlags, NULL, &globTheBuilder);
-        if (globrc == 0) {
-          for (; globCount < globTheBuilder.gl_pathc; globCount++) {
-            args[n++] = globTheBuilder.gl_pathv[globCount];
+        if (_weaken(glob)) {
+          globrc = _weaken(glob)(arg, globFlags, NULL, &globTheBuilder);
+          if (globrc == 0) {
+            for (; globCount < globTheBuilder.gl_pathc; globCount++) {
+              args[n++] = globTheBuilder.gl_pathv[globCount];
+            }
+          } else if (globrc != GLOB_NOMATCH) {
+            tinyprint(2, prog, ": error: with glob\n", NULL);
+            _Exit(16);
           }
-        } else if (globrc != GLOB_NOMATCH) {
-          tinyprint(2, prog, ": error: with glob\n", NULL);
-          _Exit(16);
+          globFlags |= GLOB_APPEND;
         }
-        globFlags |= GLOB_APPEND;
         if (globrc == GLOB_NOMATCH) {
           args[n++] = arg;
         }
diff --git a/libc/proc/describefds.c b/libc/proc/describefds.c
index 4bc203fe3..d2cee918c 100644
--- a/libc/proc/describefds.c
+++ b/libc/proc/describefds.c
@@ -17,18 +17,17 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
-#include "libc/calls/syscall_support-nt.internal.h"
-#include "libc/errno.h"
-#include "libc/fmt/itoa.h"
 #include "libc/intrin/fds.h"
-#include "libc/intrin/maps.h"
+#include "libc/calls/syscall_support-nt.internal.h"
+#include "libc/fmt/itoa.h"
+#include "libc/intrin/strace.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/files.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/startupinfo.h"
 #include "libc/sysv/consts/o.h"
 
-#define FDS_VAR "_COSMO_FDS_V2="
+#define FDS_VAR "_COSMO_FDS="
 
 #define MAX_ENTRY_BYTES 256
 
@@ -100,8 +99,6 @@ textwindows char *__describe_fds(const struct Fd *fds, size_t fdslen,
     if (__is_cloexec(f))
       continue;
     ++handlecount;
-    if (f->cursor)
-      ++handlecount;
   }
   if (!(handles = calloc(handlecount, sizeof(*handles)))) {
   OnFailure:
@@ -119,30 +116,16 @@ textwindows char *__describe_fds(const struct Fd *fds, size_t fdslen,
     // make inheritable version of handle exist in creator process
     if (!DuplicateHandle(GetCurrentProcess(), f->handle, hCreatorProcess,
                          &handle, 0, true, kNtDuplicateSameAccess)) {
+      STRACE("__describe_fds() DuplicateHandle() failed w/ %d", GetLastError());
       __winerr();
       goto OnFailure;
     }
-    for (uint32_t i = 0; i < 3; ++i)
-      if (lpStartupInfo->stdiofds[i] == f->handle)
+    for (uint32_t i = 0; i < 3; ++i) {
+      if (lpStartupInfo->stdiofds[i] == f->handle) {
         lpStartupInfo->stdiofds[i] = handle;
-    handles[hi++] = handle;
-
-    // get shared memory handle for the file offset pointer
-    intptr_t shand = 0;
-    if (f->cursor) {
-      struct Map *map;
-      if (!(map = __maps_floor((const char *)f->cursor->shared)) ||
-          map->addr != (const char *)f->cursor->shared) {
-        errno = EFAULT;
-        goto OnFailure;
       }
-      if (!DuplicateHandle(GetCurrentProcess(), map->hand, hCreatorProcess,
-                           &shand, 0, true, kNtDuplicateSameAccess)) {
-        __winerr();
-        goto OnFailure;
-      }
-      handles[hi++] = shand;
     }
+    handles[hi++] = handle;
 
     // ensure output string has enough space for new entry
     if (sb.i + MAX_ENTRY_BYTES > sb.n) {
@@ -168,7 +151,7 @@ textwindows char *__describe_fds(const struct Fd *fds, size_t fdslen,
     *p++ = '_';
     p = FormatInt64(p, f->mode);
     *p++ = '_';
-    p = FormatInt64(p, shand);
+    p = FormatInt64(p, f->pointer);
     *p++ = '_';
     p = FormatInt64(p, f->type);
     *p++ = '_';
diff --git a/libc/proc/describefds.internal.h b/libc/proc/describefds.internal.h
index 1cde5234b..dd192630a 100644
--- a/libc/proc/describefds.internal.h
+++ b/libc/proc/describefds.internal.h
@@ -4,8 +4,6 @@
 #include "libc/nt/struct/startupinfo.h"
 COSMOPOLITAN_C_START_
 
-#define CURSOR_ADDRESS_FLAG 0x4000000000000000
-
 bool __is_cloexec(const struct Fd *) libcesque;
 void __undescribe_fds(int64_t, int64_t *, uint32_t) libcesque;
 char *__describe_fds(const struct Fd *, size_t, struct NtStartupInfo *, int64_t,
diff --git a/libc/proc/execve-nt.greg.c b/libc/proc/execve-nt.greg.c
index 42cb01c67..226029e1b 100644
--- a/libc/proc/execve-nt.greg.c
+++ b/libc/proc/execve-nt.greg.c
@@ -17,89 +17,53 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
-#include "libc/atomic.h"
-#include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
-#include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
-#include "libc/intrin/dll.h"
 #include "libc/intrin/fds.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/intrin/strace.h"
 #include "libc/mem/mem.h"
-#include "libc/nt/accounting.h"
 #include "libc/nt/enum/processaccess.h"
 #include "libc/nt/enum/startf.h"
-#include "libc/nt/enum/status.h"
 #include "libc/nt/errors.h"
 #include "libc/nt/files.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/processinformation.h"
 #include "libc/nt/struct/startupinfo.h"
-#include "libc/nt/synchronization.h"
-#include "libc/nt/thread.h"
-#include "libc/nt/thunk/msabi.h"
 #include "libc/proc/describefds.internal.h"
 #include "libc/proc/ntspawn.h"
-#include "libc/runtime/internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/at.h"
 #include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #ifdef __x86_64__
 
-__msabi extern typeof(CloseHandle) *const __imp_CloseHandle;
-__msabi extern typeof(TerminateProcess) *const __imp_TerminateProcess;
-
-extern atomic_int __sig_worker_state;
-
-static void sys_execve_nt_abort(sigset_t sigmask) {
-  __sig_worker_state &= ~2;
-  __sig_unblock(sigmask);
-}
-
 textwindows int sys_execve_nt(const char *program, char *const argv[],
                               char *const envp[]) {
 
   // execve() needs to be @asyncsignalsafe
   sigset_t sigmask = __sig_block();
-  __sig_worker_state |= 2;
-  for (;;)
-    if (__sig_worker_state & 1)
-      break;
+  _pthread_lock();
 
   // new process should be a child of our parent
-  int64_t hParentProcess =
-      sys_getppid_nt_win32
-          ? OpenProcess(kNtProcessDupHandle | kNtProcessCreateProcess, false,
-                        sys_getppid_nt_win32)
-          : 0;
-
-  // inherit pid
-  char pidvar[11 + 21];
-  FormatUint64(stpcpy(pidvar, "_COSMO_PID="), __pid);
+  int64_t hParentProcess;
+  int ppid = sys_getppid_nt();
+  if (!(hParentProcess = OpenProcess(
+            kNtProcessDupHandle | kNtProcessCreateProcess, false, ppid))) {
+    _pthread_unlock();
+    __sig_unblock(sigmask);
+    return -1;
+  }
 
   // inherit signal mask
   char maskvar[6 + 21];
   FormatUint64(stpcpy(maskvar, "_MASK="), sigmask);
 
-  // inherit parent process id
-  char ppidvar[12 + 21 + 1 + 21 + 1], *p = ppidvar;
-  p = stpcpy(p, "_COSMO_PPID=");
-  if (hParentProcess) {
-    p = FormatUint64(p, sys_getppid_nt_win32);
-    *p++ = ':';
-    p = FormatUint64(p, __pid);
-    setenv("_COSMO_PPID", ppidvar, true);
-  }
-
   // define stdio handles for the spawned subprocess
   struct NtStartupInfo si = {
       .cb = sizeof(struct NtStartupInfo),
@@ -113,44 +77,29 @@ textwindows int sys_execve_nt(const char *program, char *const argv[],
     }
   }
 
-  // which process is responsible for spawning the child?
-  int64_t hCreatorProcess;
-  if (hParentProcess) {
-    hCreatorProcess = hParentProcess;
-  } else {
-    hCreatorProcess = GetCurrentProcess();
-  }
-
   // pass serialized file descriptor table in environment
   char *fdspec;
   int64_t *lpExplicitHandles;
   uint32_t dwExplicitHandleCount;
-  if (!(fdspec = __describe_fds(g_fds.p, g_fds.n, &si, hCreatorProcess,
+  if (!(fdspec = __describe_fds(g_fds.p, g_fds.n, &si, hParentProcess,
                                 &lpExplicitHandles, &dwExplicitHandleCount))) {
-    if (hParentProcess)
-      __imp_CloseHandle(hParentProcess);
-    sys_execve_nt_abort(sigmask);
+    CloseHandle(hParentProcess);
+    _pthread_unlock();
+    __sig_unblock(sigmask);
     return -1;
   }
 
-  // inherit pending signals
-  atomic_fetch_or_explicit(
-      __sig.process,
-      atomic_load_explicit(&__get_tls()->tib_sigpending, memory_order_acquire),
-      memory_order_release);
-
   // launch the process
   struct NtProcessInformation pi;
   int rc = ntspawn(&(struct NtSpawnArgs){
-      AT_FDCWD, program, argv, envp,
-      (char *[]){fdspec, maskvar, pidvar, ppidvar, 0}, 0, 0, hCreatorProcess,
-      lpExplicitHandles, dwExplicitHandleCount, &si, &pi});
-  __undescribe_fds(hCreatorProcess, lpExplicitHandles, dwExplicitHandleCount);
+      AT_FDCWD, program, argv, envp, (char *[]){fdspec, maskvar, 0}, 0, 0,
+      hParentProcess, lpExplicitHandles, dwExplicitHandleCount, &si, &pi});
+  __undescribe_fds(hParentProcess, lpExplicitHandles, dwExplicitHandleCount);
   if (rc == -1) {
     free(fdspec);
-    if (hParentProcess)
-      __imp_CloseHandle(hParentProcess);
-    sys_execve_nt_abort(sigmask);
+    CloseHandle(hParentProcess);
+    _pthread_unlock();
+    __sig_unblock(sigmask);
     if (GetLastError() == kNtErrorSharingViolation) {
       return etxtbsy();
     } else {
@@ -158,55 +107,15 @@ textwindows int sys_execve_nt(const char *program, char *const argv[],
     }
   }
 
-  // check if parent spoofing worked
-  if (hParentProcess) {
-    // give child to libc/proc/proc.c worker thread in parent
-    int64_t handle;
-    if (DuplicateHandle(GetCurrentProcess(), pi.hProcess, hParentProcess,
-                        &handle, 0, false, kNtDuplicateSameAccess)) {
-      unassert(!(handle & 0xFFFFFFFFFF000000));
-      __imp_TerminateProcess(-1, 0x23000000u | handle);
-    } else {
-      // TODO(jart): Why does `make loc` print this?
-      // kprintf("DuplicateHandle failed w/ %d\n", GetLastError());
-      __imp_TerminateProcess(-1, ECHILD);
-    }
-    __builtin_unreachable();
-  }
-
-  // we couldn't reparent the new process
-  STRACE("warning: execve() lingering due to non-cosmo parent process");
-
-  // terminate other threads
-  _pthread_lock();
-  struct Dll *e;
-  struct PosixThread *me = _pthread_self();
-  for (e = dll_first(_pthread_list); e; e = dll_next(_pthread_list, e)) {
-    struct PosixThread *pt = POSIXTHREAD_CONTAINER(e);
-    if (pt == me)
-      continue;
-    TerminateThread(
-        atomic_load_explicit(&pt->tib->tib_syshand, memory_order_relaxed),
-        SIGKILL);
-  }
-
-  // wait for child to terminate and propagate exit code
-  for (;;) {
-    uint32_t status;
-    WaitForSingleObject(pi.hProcess, -1u);
-    GetExitCodeProcess(pi.hProcess, &status);
-    if (status != kNtStillActive) {
-      if ((status & 0xFF000000u) == 0x23000000u) {
-        // handle child execve()
-        __imp_CloseHandle(pi.hProcess);
-        pi.hProcess = status & 0x00FFFFFF;
-      } else {
-        // handle child _Exit()
-        if (status == 0xc9af3d51u)
-          status = kNtStillActive;
-        TerminateThisProcess(status);
-      }
-    }
+  // give child to libc/proc/proc.c worker thread in parent
+  int64_t handle;
+  if (DuplicateHandle(GetCurrentProcess(), pi.hProcess, hParentProcess, &handle,
+                      0, false, kNtDuplicateSameAccess)) {
+    unassert(!(handle & 0xFFFFFFFFFF000000));
+    TerminateThisProcess(0x23000000u | handle);
+  } else {
+    kprintf("DuplicateHandle failed w/ %d\n", GetLastError());
+    TerminateThisProcess(ECHILD);
   }
 }
 
diff --git a/libc/proc/execve.c b/libc/proc/execve.c
index b610f8b29..781bd3f26 100644
--- a/libc/proc/execve.c
+++ b/libc/proc/execve.c
@@ -36,55 +36,14 @@
 /**
  * Replaces current process with program.
  *
- * Your `prog` may be an actually portable executable or a platform
- * native binary (e.g. ELF, Mach-O, PE). On UNIX systems, your execve
- * implementation will try to find where the `ape` interpreter program
- * is installed on your system. The preferred location is `/usr/bin/ape`
- * except on Apple Silicon where it's `/usr/local/bin/ape`. The $TMPDIR
- * and $HOME locations that the APE shell script extracts the versioned
- * ape binaries to will also be checked as a fallback path. Finally, if
- * `prog` isn't an executable in any recognizable format, cosmo assumes
- * it's a bourne shell script and launches it under /bin/sh.
- *
- * The signal mask and pending signals are inherited by the new process.
- * Note the NetBSD kernel has a bug where pending signals are cleared.
- *
- * File descriptors that haven't been marked `O_CLOEXEC` through various
- * devices such as open() and fcntl() will be inherited by the executed
- * subprocess. The current file position of the duplicated descriptors
- * is shared across processes. On Windows, `prog` needs to be built by
- * cosmocc in order to properly inherit file descriptors. If a program
- * compiled by MSVC or Cygwin is launched instead, then only the stdio
- * file descriptors can be passed along.
- *
  * On Windows, `argv` and `envp` can't contain binary strings. They need
  * to be valid UTF-8 in order to round-trip the WIN32 API, without being
  * corrupted.
  *
- * On Windows, cosmo execve uses parent spoofing to implement the UNIX
- * behavior of replacing the current process. Since POSIX.1 also needs
- * us to maintain the same PID number too, the _COSMO_PID environemnt
- * variable is passed to the child process which specifies a spoofed
- * PID. Whatever is in that variable will be reported by getpid() and
- * other cosmo processes will be able to send signals to the process
- * using that pid, via kill(). These synthetic PIDs which are only
- * created by execve could potentially overlap with OS assignments if
- * Windows recycles them. Cosmo avoids that by tracking handles of
- * subprocesses. Each process has its own process manager thread, to
- * associate pids with win32 handles, and execve will tell the parent
- * process its new handle when it changes. However it's not perfect.
- * There's still situations where processes created by execve() can
- * cause surprising things to happen. For an alternative, consider
- * posix_spawn() which is fastest and awesomest across all OSes.
- *
- * On Windows, support is currently not implemented for inheriting
- * setitimer() and alarm() into an executed process.
- *
- * On Windows, support is currently not implemented for inheriting
- * getrusage() statistics into an executed process.
- *
- * The executed process will share the same terminal and current
- * directory.
+ * On Windows, only file descriptors 0, 1 and 2 can be passed to a child
+ * process in such a way that allows them to be automatically discovered
+ * when the child process initializes. Cosmpolitan currently treats your
+ * other file descriptors as implicitly O_CLOEXEC.
  *
  * @param program will not be PATH searched, see commandv()
  * @param argv[0] is the name of the program to run
diff --git a/libc/proc/fork-nt.c b/libc/proc/fork-nt.c
index 7f0ddca2d..4ca7b725a 100644
--- a/libc/proc/fork-nt.c
+++ b/libc/proc/fork-nt.c
@@ -16,55 +16,63 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "ape/sections.internal.h"
+#include "libc/assert.h"
+#include "libc/atomic.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/state.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/errno.h"
+#include "libc/fmt/itoa.h"
+#include "libc/intrin/atomic.h"
 #include "libc/intrin/directmap.h"
-#include "libc/intrin/dll.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/maps.h"
 #include "libc/intrin/strace.h"
+#include "libc/intrin/tree.h"
 #include "libc/intrin/weaken.h"
-#include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
+#include "libc/nt/createfile.h"
+#include "libc/nt/enum/accessmask.h"
 #include "libc/nt/enum/creationdisposition.h"
 #include "libc/nt/enum/filemapflags.h"
-#include "libc/nt/enum/memflags.h"
 #include "libc/nt/enum/pageflags.h"
-#include "libc/nt/enum/processcreationflags.h"
 #include "libc/nt/enum/startf.h"
 #include "libc/nt/errors.h"
+#include "libc/nt/ipc.h"
 #include "libc/nt/memory.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
-#include "libc/nt/struct/processinformation.h"
-#include "libc/nt/struct/startupinfo.h"
+#include "libc/nt/signals.h"
+#include "libc/nt/struct/ntexceptionpointers.h"
 #include "libc/nt/thread.h"
 #include "libc/nt/thunk/msabi.h"
-#include "libc/nt/winsock.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/ntspawn.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/runtime/internal.h"
+#include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
+#include "libc/runtime/stack.h"
 #include "libc/runtime/symbols.internal.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/at.h"
+#include "libc/sysv/consts/limits.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
+#include "libc/thread/itimer.internal.h"
+#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 #ifdef __x86_64__
 
 extern long __klog_handle;
-extern bool __winmain_isfork;
-extern intptr_t __winmain_jmpbuf[5];
-extern struct CosmoTib *__winmain_tib;
+void WipeKeystrokes(void);
+__msabi extern typeof(GetCurrentProcessId) *const __imp_GetCurrentProcessId;
 
-__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
-__msabi extern typeof(MapViewOfFileEx) *const __imp_MapViewOfFileEx;
-__msabi extern typeof(VirtualProtectEx) *const __imp_VirtualProtectEx;
-
-textwindows wontreturn static void AbortFork(const char *func, void *addr) {
+static textwindows wontreturn void AbortFork(const char *func, void *addr) {
 #if SYSDEBUG
   kprintf("fork() %!s(%lx) failed with win32 error %u\n", func, addr,
           GetLastError());
@@ -72,10 +80,96 @@ textwindows wontreturn static void AbortFork(const char *func, void *addr) {
   TerminateThisProcess(SIGSTKFLT);
 }
 
-textwindows static void ViewOrDie(int64_t h, uint32_t access, size_t pos,
+static textwindows char16_t *ParseInt(char16_t *p, int64_t *x) {
+  *x = 0;
+  while (*p == ' ')
+    p++;
+  while ('0' <= *p && *p <= '9') {
+    *x *= 10;
+    *x += *p++ - '0';
+  }
+  return p;
+}
+
+static inline textwindows ssize_t ForkIo(int64_t h, char *p, size_t n,
+                                         bool32 (*f)(int64_t, void *, uint32_t,
+                                                     uint32_t *,
+                                                     struct NtOverlapped *)) {
+  size_t i;
+  uint32_t x;
+  for (i = 0; i < n; i += x) {
+    if (!f(h, p + i, n - i, &x, 0))
+      return __winerr();
+    if (!x)
+      break;
+  }
+  return i;
+}
+
+static dontinline textwindows ssize_t ForkIo2(
+    int64_t h, void *buf, size_t n,
+    bool32 (*fn)(int64_t, void *, uint32_t, uint32_t *, struct NtOverlapped *),
+    const char *sf, bool ischild) {
+  ssize_t rc = ForkIo(h, buf, n, fn);
+  if (ischild) {
+    // prevent crashes
+    __threaded = false;
+    __tls_enabled = false;
+    __pid = __imp_GetCurrentProcessId();
+    __klog_handle = 0;
+    __maps.maps = 0;
+  }
+  NTTRACE("%s(%ld, %p, %'zu) → %'zd% m", sf, h, buf, n, rc);
+  return rc;
+}
+
+static dontinline textwindows bool WriteAll(int64_t h, void *buf, size_t n) {
+  bool ok;
+  ok = ForkIo2(h, buf, n, (void *)WriteFile, "WriteFile", false) != -1;
+  if (!ok) {
+    STRACE("fork() failed in parent due to WriteAll(%ld, %p, %'zu) → %u", h,
+           buf, n, GetLastError());
+    __print_maps(0);
+  }
+  return ok;
+}
+
+static textwindows dontinline void ReadOrDie(int64_t h, void *buf, size_t n) {
+  ssize_t got;
+  if ((got = ForkIo2(h, buf, n, ReadFile, "ReadFile", true)) == -1)
+    AbortFork("ReadFile1", buf);
+  if (got != n)
+    AbortFork("ReadFile2", buf);
+}
+
+static textwindows int64_t MapOrDie(uint32_t prot, uint64_t size) {
+  int64_t h;
+  for (;;) {
+    if ((h = CreateFileMapping(-1, 0, prot, size >> 32, size, 0)))
+      return h;
+    if (GetLastError() == kNtErrorAccessDenied) {
+      switch (prot) {
+        case kNtPageExecuteWritecopy:
+          prot = kNtPageWritecopy;
+          continue;
+        case kNtPageExecuteReadwrite:
+          prot = kNtPageReadwrite;
+          continue;
+        case kNtPageExecuteRead:
+          prot = kNtPageReadonly;
+          continue;
+        default:
+          break;
+      }
+    }
+    AbortFork("MapOrDie", (void *)size);
+  }
+}
+
+static textwindows void ViewOrDie(int64_t h, uint32_t access, size_t pos,
                                   size_t size, void *base) {
 TryAgain:
-  if (!__imp_MapViewOfFileEx(h, access, pos >> 32, pos, size, base)) {
+  if (!MapViewOfFileEx(h, access, pos >> 32, pos, size, base)) {
     if ((access & kNtFileMapExecute) &&
         GetLastError() == kNtErrorAccessDenied) {
       access &= ~kNtFileMapExecute;
@@ -85,237 +179,310 @@ TryAgain:
   }
 }
 
-textwindows static void sys_fork_nt_child(void) {
+static __msabi textwindows int OnForkCrash(struct NtExceptionPointers *ep) {
+  kprintf("error: fork() child crashed!%n"
+          "\tExceptionCode = %#x%n"
+          "\tRip = %x%n",
+          ep->ExceptionRecord->ExceptionCode,
+          ep->ContextRecord ? ep->ContextRecord->Rip : -1);
+  TerminateThisProcess(SIGSTKFLT);
+}
 
-  // setup runtime
-  __klog_handle = 0;
-  __tls_index = __imp_TlsAlloc();
-  __morph_tls();
-  __set_tls_win32(__winmain_tib);
-  __tls_enabled = true;
+static textwindows void *Malloc(size_t size) {
+  return HeapAlloc(GetProcessHeap(), 0, size);
+}
 
-  // resurrect shared memory mappings
-  struct Map *next;
-  for (struct Map *map = __maps_first(); map; map = next) {
-    next = __maps_next(map);
+textwindows void WinMainForked(void) {
+  jmp_buf jb;
+  int64_t reader;
+  int64_t savetsc;
+  uint32_t varlen;
+  char16_t fvar[21 + 1 + 21 + 1];
+  struct Fds *fds = __veil("r", &g_fds);
 
-    // cleanup nofork mappings
-    if (map->flags & MAP_NOFORK) {
-      if ((map->flags & MAP_TYPE) != MAP_FILE) {
-        tree_remove(&__maps.maps, &map->tree);
-        __maps.pages -= (map->size + __pagesize - 1) / __pagesize;
-        __maps.count -= 1;
-        __maps_free(map);
-      }
-      continue;
-    }
+  // check to see if the process was actually forked
+  // this variable should have the pipe handle numba
+  varlen = GetEnvironmentVariable(u"_FORK", fvar, ARRAYLEN(fvar));
+  if (!varlen || varlen >= ARRAYLEN(fvar))
+    return;
+  /* STRACE("WinMainForked()"); */
+  SetEnvironmentVariable(u"_FORK", NULL);
+#if SYSDEBUG
+  int64_t oncrash = AddVectoredExceptionHandler(1, (void *)OnForkCrash);
+#endif
+  ParseInt(fvar, &reader);
 
-    // private maps already copied/protected to child by parent
-    if ((map->flags & MAP_TYPE) != MAP_SHARED) {
-      // it's not copy-on-write anymore
-      map->iscow = false;
-      // but it used VirtualAlloc() so munmap() must VirtualFree()
-      if (map->hand > 0) {
-        CloseHandle(map->hand);
-        map->hand = MAPS_VIRTUAL;
-      }
-      continue;
-    }
+  // read the cpu state from the parent process & plus
+  ReadOrDie(reader, jb, sizeof(jb));
 
-    // handle granularity aligned shared mapping
-    if (__maps_isalloc(map)) {
-
-      // get true size of win32 allocation
-      size_t allocsize = map->size;
-      for (struct Map *map2 = next; map2; map2 = __maps_next(map2)) {
-        if (!__maps_isalloc(map2) && map->addr + allocsize == map2->addr) {
-          allocsize += map2->size;
-        } else {
-          break;
-        }
-      }
-
-      // create allocation with most permissive access possible
-      // if we don't create as rwx then we can't mprotect(rwx) later
-      unsigned access;
-      if (map->readonlyfile) {
-        access = kNtFileMapRead | kNtFileMapExecute;
-      } else {
-        access = kNtFileMapWrite | kNtFileMapExecute;
-      }
-
-      // resurrect copyless memory via inherited win32 handle
-      ViewOrDie(map->hand, access, map->off, allocsize, map->addr);
-    }
-
-    // restore memory protection status on pages
-    unsigned old_protect;
-    if (!__imp_VirtualProtectEx(GetCurrentProcess(), map->addr, map->size,
-                                __prot2nt(map->prot, false), &old_protect))
-      AbortFork("VirtualProtectEx", map->addr);
+  // read memory mappings from parent process
+  struct Tree *maps = 0;
+  for (;;) {
+    struct Map *map = Malloc(sizeof(struct Map));
+    ReadOrDie(reader, map, sizeof(struct Map));
+    if (map->addr == MAP_FAILED)
+      break;
+    tree_insert(&maps, &map->tree, __maps_compare);
   }
 
-  // function tracing is now safe
-  ftrace_enabled(+1);
+  // map memory into process
+  int granularity = __gransize;
+  for (struct Tree *e = tree_first(maps); e; e = tree_next(e)) {
+    struct Map *map = MAP_TREE_CONTAINER(e);
+    if ((uintptr_t)map->addr & (granularity - 1))
+      continue;
+    // get true length in case mprotect() chopped up actual win32 map
+    size_t size = map->size;
+    for (struct Tree *e2 = tree_next(e); e2; e2 = tree_next(e2)) {
+      struct Map *map2 = MAP_TREE_CONTAINER(e2);
+      if (map2->hand == -1 && map->addr + size == map2->addr) {
+        size += map2->size;
+      } else {
+        break;
+      }
+    }
+    // obtain the most permissive access possible
+    unsigned prot, access;
+    if (map->readonlyfile) {
+      prot = kNtPageExecuteRead;
+      access = kNtFileMapRead | kNtFileMapExecute;
+    } else {
+      prot = kNtPageExecuteReadwrite;
+      access = kNtFileMapWrite | kNtFileMapExecute;
+    }
+    if ((map->flags & MAP_TYPE) != MAP_SHARED) {
+      // we don't need to close the map handle because sys_mmap_nt
+      // doesn't mark it inheritable across fork() for MAP_PRIVATE
+      map->hand = MapOrDie(prot, size);
+      ViewOrDie(map->hand, access, 0, size, map->addr);
+      ReadOrDie(reader, map->addr, size);
+    } else {
+      // we can however safely inherit MAP_SHARED with zero copy
+      ViewOrDie(map->hand, access, map->off, size, map->addr);
+    }
+  }
 
-  // initialize winsock
-  void WinSockFork(void);
-  if (_weaken(WinSockFork))
-    _weaken(WinSockFork)();
+  // read the .data and .bss program image sections
+  savetsc = kStartTsc;
+  ReadOrDie(reader, __data_start, __data_end - __data_start);
+  ReadOrDie(reader, __bss_start, __bss_end - __bss_start);
+  kStartTsc = savetsc;
+  __tls_enabled = false;
+  __threaded = false;
+
+  // fixup memory manager
+  __maps.maps = 0;
+  __maps.freed = 0;
+  __maps.count = 0;
+  __maps.pages = 0;
+  for (struct Tree *e = tree_first(maps); e; e = tree_next(e)) {
+    struct Map *map = MAP_TREE_CONTAINER(e);
+    __maps.count += 1;
+    __maps.pages += (map->size + __pagesize - 1) / __pagesize;
+    unsigned old_protect;
+    if (!VirtualProtect(map->addr, map->size, __prot2nt(map->prot, map->iscow),
+                        &old_protect))
+      AbortFork("VirtualProtect", map->addr);
+  }
+  __maps.maps = maps;
+  __maps_init();
+
+  // mitosis complete
+  if (!CloseHandle(reader))
+    AbortFork("CloseHandle", (void *)reader);
 
   // rewrap the stdin named pipe hack
   // since the handles closed on fork
-  g_fds.p[0].handle = GetStdHandle(kNtStdInputHandle);
-  g_fds.p[1].handle = GetStdHandle(kNtStdOutputHandle);
-  g_fds.p[2].handle = GetStdHandle(kNtStdErrorHandle);
-}
+  fds->p[0].handle = GetStdHandle(kNtStdInputHandle);
+  fds->p[1].handle = GetStdHandle(kNtStdOutputHandle);
+  fds->p[2].handle = GetStdHandle(kNtStdErrorHandle);
 
-textwindows static int sys_fork_nt_parent(uint32_t dwCreationFlags) {
+  // restore the crash reporting stuff
+#if SYSDEBUG
+  RemoveVectoredExceptionHandler(oncrash);
+#endif
+  if (_weaken(__sig_init))
+    _weaken(__sig_init)();
 
-  // allocate process object
-  struct Proc *proc;
-  if (!(proc = __proc_new()))
-    return -1;
-
-  // get path of this executable
-  char16_t prog[PATH_MAX];
-  unsigned got = GetModuleFileName(0, prog, ARRAYLEN(prog));
-  if (!got || got >= ARRAYLEN(prog)) {
-    dll_make_first(&__proc.free, &proc->elem);
-    enomem();
-    return -1;
-  }
-
-  // spawn new process in suspended state
-  struct NtProcessInformation procinfo;
-  struct NtStartupInfo startinfo = {
-      .cb = sizeof(struct NtStartupInfo),
-      .dwFlags = kNtStartfUsestdhandles,
-      .hStdInput = g_fds.p[0].handle,
-      .hStdOutput = g_fds.p[1].handle,
-      .hStdError = g_fds.p[2].handle,
-  };
-  if (!CreateProcess(prog, 0, 0, 0, true,
-                     dwCreationFlags | kNtCreateSuspended |
-                         kNtInheritParentAffinity |
-                         kNtCreateUnicodeEnvironment |
-                         GetPriorityClass(GetCurrentProcess()),
-                     0, 0, &startinfo, &procinfo)) {
-    STRACE("fork() %s() failed w/ %m %d", "CreateProcess", GetLastError());
-    dll_make_first(&__proc.free, &proc->elem);
-    if (errno != ENOMEM)
-      eagain();
-    return -1;
-  }
-
-  // ensure process can be signaled before returning
-  UnmapViewOfFile(__sig_map_process(procinfo.dwProcessId, kNtOpenAlways));
-
-  // let's go
-  bool ok = true;
-
-  // copy memory manager maps
-  for (struct MapSlab *slab =
-           atomic_load_explicit(&__maps.slabs, memory_order_acquire);
-       slab; slab = slab->next) {
-    ok = ok && !!VirtualAllocEx(procinfo.hProcess, slab, MAPS_SIZE,
-                                kNtMemReserve | kNtMemCommit, kNtPageReadwrite);
-    ok =
-        ok && !!WriteProcessMemory(procinfo.hProcess, slab, slab, MAPS_SIZE, 0);
-  }
-
-  // copy private memory maps
-  int alloc_prot = -1;
-  for (struct Map *map = __maps_first(); map; map = __maps_next(map)) {
-    if ((map->flags & MAP_TYPE) == MAP_SHARED)
-      continue;  // shared memory doesn't need to be copied to subprocess
-    if ((map->flags & MAP_NOFORK) && (map->flags & MAP_TYPE) != MAP_FILE)
-      continue;  // ignore things like signal worker stack memory
-    if (__maps_isalloc(map)) {
-      size_t allocsize = map->size;
-      for (struct Map *m2 = __maps_next(map); m2; m2 = __maps_next(m2)) {
-        if (!__maps_isalloc(m2) && map->addr + allocsize == m2->addr) {
-          allocsize += m2->size;
-        } else {
-          break;
-        }
-      }
-      if ((map->flags & MAP_NOFORK) && (map->flags & MAP_TYPE) == MAP_FILE) {
-        // portable executable segment
-        if (map->prot & PROT_EXEC)
-          // TODO(jart): write a __remorph_tls() function
-          continue;
-        if (!(map->prot & PROT_WRITE)) {
-          uint32_t child_old_protect;
-          ok = ok && !!VirtualProtectEx(procinfo.hProcess, map->addr, allocsize,
-                                        kNtPageReadwrite, &child_old_protect);
-          alloc_prot = PROT_READ | PROT_WRITE;
-        } else {
-          alloc_prot = map->prot;
-        }
-      } else {
-        // private mapping
-        uint32_t page_flags;
-        if (!(alloc_prot & PROT_WRITE)) {
-          page_flags = kNtPageReadwrite;
-          alloc_prot = PROT_READ | PROT_WRITE;
-        } else {
-          page_flags = __prot2nt(alloc_prot, false);
-        }
-        ok = ok && !!VirtualAllocEx(procinfo.hProcess, map->addr, allocsize,
-                                    kNtMemReserve | kNtMemCommit, page_flags);
-      }
-    }
-    uint32_t parent_old_protect;
-    if (!(map->prot & PROT_READ))
-      ok = ok && !!VirtualProtect(map->addr, map->size, kNtPageReadwrite,
-                                  &parent_old_protect);
-    ok = ok &&
-         !!WriteProcessMemory(procinfo.hProcess, map->addr, map->addr,
-                              (map->size + __pagesize - 1) & -__pagesize, 0);
-    if (map->prot != alloc_prot) {
-      uint32_t child_old_protect;
-      ok = ok &&
-           !!VirtualProtectEx(procinfo.hProcess, map->addr, map->size,
-                              __prot2nt(map->prot, false), &child_old_protect);
-    }
-    if (!(map->prot & PROT_READ))
-      ok = ok && !!VirtualProtect(map->addr, map->size, parent_old_protect,
-                                  &parent_old_protect);
-  }
-
-  // set process loose
-  ok = ok && ResumeThread(procinfo.hThread) != -1u;
-  ok &= !!CloseHandle(procinfo.hThread);
-
-  // return pid of new process
-  if (ok) {
-    proc->wasforked = true;
-    proc->handle = procinfo.hProcess;
-    proc->pid = procinfo.dwProcessId;
-    __proc_add(proc);
-    return procinfo.dwProcessId;
-  } else {
-    if (errno != ENOMEM)
-      eagain();  // posix fork() only specifies two errors
-    TerminateProcess(procinfo.hProcess, SIGKILL);
-    CloseHandle(procinfo.hProcess);
-    dll_make_first(&__proc.free, &proc->elem);
-    return -1;
-  }
+  // jump back into function below
+  longjmp(jb, 1);
 }
 
 textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
-  int rc;
-  __winmain_isfork = true;
-  __winmain_tib = __get_tls();
-  if (!__builtin_setjmp(__winmain_jmpbuf)) {
-    rc = sys_fork_nt_parent(dwCreationFlags);
+  char ok;
+  jmp_buf jb;
+  char **args;
+  int rc = -1;
+  struct Proc *proc;
+  struct CosmoTib *tib;
+  char16_t pipename[64];
+  int64_t reader, writer;
+  struct NtStartupInfo startinfo;
+  struct NtProcessInformation procinfo;
+  char *p, forkvar[6 + 21 + 1 + 21 + 1];
+  tib = __get_tls();
+  if (!(proc = __proc_new()))
+    return -1;
+  ftrace_enabled(-1);
+  strace_enabled(-1);
+  if (!setjmp(jb)) {
+    reader = CreateNamedPipe(__create_pipe_name(pipename), kNtPipeAccessInbound,
+                             kNtPipeTypeByte | kNtPipeReadmodeByte, 1, PIPE_BUF,
+                             PIPE_BUF, 0, &kNtIsInheritable);
+    writer = CreateFile(pipename, kNtGenericWrite, 0, 0, kNtOpenExisting, 0, 0);
+    if (reader != -1 && writer != -1) {
+      p = stpcpy(forkvar, "_FORK=");
+      p = FormatUint64(p, reader);
+      bzero(&startinfo, sizeof(startinfo));
+      startinfo.cb = sizeof(struct NtStartupInfo);
+      startinfo.dwFlags = kNtStartfUsestdhandles;
+      startinfo.hStdInput = g_fds.p[0].handle;
+      startinfo.hStdOutput = g_fds.p[1].handle;
+      startinfo.hStdError = g_fds.p[2].handle;
+      args = __argv;
+#if SYSDEBUG
+      int i;
+      // If --strace was passed to this program, then propagate it the
+      // forked process since the flag was removed by __intercept_flag
+      if (strace_enabled(0) > 0) {
+        int n;
+        for (n = 0; args[n];)
+          ++n;
+#pragma GCC push_options
+#pragma GCC diagnostic ignored "-Walloca-larger-than="
+        int nbytes = (n + 2) * sizeof(char *);
+        char **args2 = alloca(nbytes);
+        CheckLargeStackAllocation(args2, nbytes);
+#pragma GCC pop_options
+        for (i = 0; i < n; ++i)
+          args2[i] = args[i];
+        args2[i++] = "--strace";
+        args2[i] = 0;
+        args = args2;
+      }
+#endif
+      NTTRACE("STARTING SPAWN");
+      int spawnrc = ntspawn(&(struct NtSpawnArgs){
+          AT_FDCWD, GetProgramExecutableName(), args, environ,
+          (char *[]){forkvar, 0}, dwCreationFlags, 0, 0, 0, 0, &startinfo,
+          &procinfo});
+      if (spawnrc != -1) {
+        CloseHandle(procinfo.hThread);
+        ok = WriteAll(writer, jb, sizeof(jb));
+        // this list will be populated with the maps we're transferring
+        for (struct Map *map = __maps_first(); ok && map;
+             map = __maps_next(map)) {
+          if (map->flags & MAP_NOFORK)
+            continue;
+          if (MAX((char *)__executable_start, map->addr) <
+              MIN((char *)_end, map->addr + map->size))
+            continue;  // executable image is loaded by windows
+          ok = WriteAll(writer, map, sizeof(*map));
+        }
+        // send a terminating Map struct to child
+        if (ok) {
+          struct Map map;
+          map.addr = MAP_FAILED;
+          ok = WriteAll(writer, &map, sizeof(map));
+        }
+        // now write content of each map to child
+        int granularity = __gransize;
+        for (struct Map *map = __maps_first(); ok && map;
+             map = __maps_next(map)) {
+          if (map->flags & MAP_NOFORK)
+            continue;
+          // we only need to worry about the base mapping
+          if ((uintptr_t)map->addr & (granularity - 1))
+            continue;
+          if (MAX((char *)__executable_start, map->addr) <
+              MIN((char *)_end, map->addr + map->size))
+            continue;  // executable image is loaded by windows
+          // shared mappings don't need to be copied
+          if ((map->flags & MAP_TYPE) == MAP_SHARED)
+            continue;
+          // get true length in case mprotect() chopped up actual win32 map
+          size_t size = map->size;
+          for (struct Map *map2 = __maps_next(map); map2;
+               map2 = __maps_next(map2)) {
+            if (map2->hand == -1 && map->addr + size == map2->addr) {
+              size += map2->size;
+            } else {
+              break;
+            }
+          }
+          for (struct Map *map2 = map; ok && map2; map2 = __maps_next(map2)) {
+            if (!(map2->prot & PROT_READ))
+              if (map->addr >= map2->addr && map->addr < map->addr + size)
+                ok = VirtualProtect(
+                    map2->addr, map2->size,
+                    __prot2nt(map2->prot | PROT_READ, map2->iscow),
+                    &map2->visited);
+          }
+          if (ok)
+            ok = WriteAll(writer, map->addr, size);
+          for (struct Map *map2 = map; ok && map2; map2 = __maps_next(map2)) {
+            if (!(map2->prot & PROT_READ))
+              if (map->addr >= map2->addr && map->addr < map->addr + size)
+                ok = VirtualProtect(map2->addr, map2->size, map2->visited,
+                                    &map2->visited);
+          }
+        }
+        if (ok)
+          ok = WriteAll(writer, __data_start, __data_end - __data_start);
+        if (ok)
+          ok = WriteAll(writer, __bss_start, __bss_end - __bss_start);
+        if (ok) {
+          if (!CloseHandle(writer))
+            ok = false;
+          writer = -1;
+        }
+        if (ok) {
+          proc->wasforked = true;
+          proc->handle = procinfo.hProcess;
+          rc = proc->pid = procinfo.dwProcessId;
+          __proc_add(proc);
+        } else {
+          TerminateProcess(procinfo.hProcess, SIGKILL);
+          CloseHandle(procinfo.hProcess);
+          rc = -1;
+        }
+      }
+    }
+    if (reader != -1)
+      CloseHandle(reader);
+    if (writer != -1)
+      CloseHandle(writer);
+    if (rc == -1 && errno != ENOMEM)
+      eagain();  // posix fork() only specifies two errors
   } else {
-    sys_fork_nt_child();
     rc = 0;
+    // re-apply code morphing for thread-local storage
+    __tls_index = TlsAlloc();
+    __set_tls_win32(tib);
+    __morph_tls();
+    __tls_enabled = true;
+    // the child's pending signals is initially empty
+    atomic_store_explicit(&__sig.pending, 0, memory_order_relaxed);
+    atomic_store_explicit(&tib->tib_sigpending, 0, memory_order_relaxed);
+    // re-enable threads
+    __enable_threads();
+    // re-apply code morphing for function tracing
+    if (ftrace_stackdigs) {
+      _weaken(__hook)(_weaken(ftrace_hook), _weaken(GetSymbolTable)());
+    }
+    // reset core runtime services
+    __proc_wipe();
+    WipeKeystrokes();
+    if (_weaken(__itimer_wipe))
+      _weaken(__itimer_wipe)();
+    // notify pthread join
+    atomic_store_explicit(&_pthread_static.ptid, GetCurrentThreadId(),
+                          memory_order_release);
   }
-  __winmain_isfork = false;
+  if (rc == -1)
+    dll_make_first(&__proc.free, &proc->elem);
+  ftrace_enabled(+1);
+  strace_enabled(+1);
   return rc;
 }
 
diff --git a/libc/proc/fork.c b/libc/proc/fork.c
index fad92dc5a..35abbbe72 100644
--- a/libc/proc/fork.c
+++ b/libc/proc/fork.c
@@ -16,242 +16,120 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
 #include "libc/atomic.h"
 #include "libc/calls/calls.h"
-#include "libc/calls/internal.h"
-#include "libc/calls/sig.internal.h"
 #include "libc/calls/state.internal.h"
-#include "libc/calls/struct/metasigaltstack.h"
+#include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/cxaatexit.h"
 #include "libc/intrin/dll.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/intrin/maps.h"
-#include "libc/intrin/stack.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/nt/files.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
+#include "libc/nt/synchronization.h"
 #include "libc/nt/thread.h"
-#include "libc/nt/thunk/msabi.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/runtime/internal.h"
+#include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/syslib.internal.h"
-#include "libc/stdio/internal.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/ss.h"
-#include "libc/thread/itimer.h"
+#include "libc/sysv/consts/sig.h"
 #include "libc/thread/posixthread.internal.h"
-#include "libc/thread/thread.h"
-#include "third_party/dlmalloc/dlmalloc.h"
-#include "third_party/gdtoa/lock.h"
-#include "third_party/tz/lock.h"
+#include "libc/thread/tls.h"
 
-__msabi extern typeof(GetCurrentProcessId) *const __imp_GetCurrentProcessId;
+__static_yoink("_pthread_atfork");
 
-extern atomic_int __sig_worker_state;
-extern pthread_mutex_t __cxa_lock_obj;
-extern pthread_mutex_t __pthread_lock_obj;
+extern pthread_mutex_t _rand64_lock_obj;
+extern pthread_mutex_t _pthread_lock_obj;
 
-void __rand64_lock(void);
-void __rand64_unlock(void);
-void __rand64_wipe(void);
-
-void __dlopen_lock(void);
-void __dlopen_unlock(void);
-void __dlopen_wipe(void);
-
-// first and last and always
-// it is the lord of all locks
-// subordinate to no other lock
-static pthread_mutex_t supreme_lock = PTHREAD_MUTEX_INITIALIZER;
-
-static void fork_prepare_stdio(void) {
-  struct Dll *e;
-  // we acquire the following locks, in order
-  //
-  //   1. FILE objects created by the user
-  //   2. stdin, stdout, and stderr
-  //   3. __stdio.lock
-  //
-StartOver:
-  __stdio_lock();
-  for (e = dll_last(__stdio.files); e; e = dll_prev(__stdio.files, e)) {
-    FILE *f = FILE_CONTAINER(e);
-    if (f->forking)
-      continue;
-    f->forking = 1;
-    __stdio_ref(f);
-    __stdio_unlock();
-    _pthread_mutex_lock(&f->lock);
-    __stdio_unref(f);
-    goto StartOver;
-  }
-}
-
-static void fork_parent_stdio(void) {
-  struct Dll *e;
-  for (e = dll_first(__stdio.files); e; e = dll_next(__stdio.files, e)) {
-    FILE_CONTAINER(e)->forking = 0;
-    _pthread_mutex_unlock(&FILE_CONTAINER(e)->lock);
-  }
-  __stdio_unlock();
-}
-
-static void fork_child_stdio(void) {
-  struct Dll *e;
-  for (e = dll_first(__stdio.files); e; e = dll_next(__stdio.files, e)) {
-    _pthread_mutex_wipe_np(&FILE_CONTAINER(e)->lock);
-    FILE_CONTAINER(e)->forking = 0;
-  }
-  _pthread_mutex_wipe_np(&__stdio.lock);
-}
-
-static void fork_prepare(void) {
-  _pthread_mutex_lock(&supreme_lock);
+static void _onfork_prepare(void) {
   if (_weaken(_pthread_onfork_prepare))
     _weaken(_pthread_onfork_prepare)();
-  fork_prepare_stdio();
-  if (_weaken(__localtime_lock))
-    _weaken(__localtime_lock)();
-  if (_weaken(__dlopen_lock))
-    _weaken(__dlopen_lock)();
   if (IsWindows())
     __proc_lock();
-  if (_weaken(cosmo_stack_lock))
-    _weaken(cosmo_stack_lock)();
-  __cxa_lock();
-  if (_weaken(__gdtoa_lock)) {
-    _weaken(__gdtoa_lock1)();
-    _weaken(__gdtoa_lock)();
-  }
   _pthread_lock();
-  if (_weaken(dlmalloc_pre_fork))
-    _weaken(dlmalloc_pre_fork)();
-  __fds_lock();
-  if (_weaken(__rand64_lock))
-    _weaken(__rand64_lock)();
   __maps_lock();
-  LOCKTRACE("READY TO LOCK AND ROLL");
+  __fds_lock();
+  pthread_mutex_lock(&_rand64_lock_obj);
+  LOCKTRACE("READY TO ROCK AND ROLL");
 }
 
-static void fork_parent(void) {
-  __maps_unlock();
-  if (_weaken(__rand64_unlock))
-    _weaken(__rand64_unlock)();
+static void _onfork_parent(void) {
+  pthread_mutex_unlock(&_rand64_lock_obj);
   __fds_unlock();
-  if (_weaken(dlmalloc_post_fork_parent))
-    _weaken(dlmalloc_post_fork_parent)();
+  __maps_unlock();
   _pthread_unlock();
-  if (_weaken(__gdtoa_unlock)) {
-    _weaken(__gdtoa_unlock)();
-    _weaken(__gdtoa_unlock1)();
-  }
-  __cxa_unlock();
-  if (_weaken(cosmo_stack_unlock))
-    _weaken(cosmo_stack_unlock)();
   if (IsWindows())
     __proc_unlock();
-  if (_weaken(__dlopen_unlock))
-    _weaken(__dlopen_unlock)();
-  if (_weaken(__localtime_unlock))
-    _weaken(__localtime_unlock)();
-  fork_parent_stdio();
   if (_weaken(_pthread_onfork_parent))
     _weaken(_pthread_onfork_parent)();
-  _pthread_mutex_unlock(&supreme_lock);
 }
 
-static void fork_child(int ppid_win32, int ppid_cosmo) {
-  if (_weaken(__rand64_wipe))
-    _weaken(__rand64_wipe)();
-  _pthread_mutex_wipe_np(&__fds_lock_obj);
-  dlmalloc_post_fork_child();
-  if (_weaken(__gdtoa_wipe)) {
-    _weaken(__gdtoa_wipe)();
-    _weaken(__gdtoa_wipe1)();
-  }
-  fork_child_stdio();
-  _pthread_mutex_wipe_np(&__pthread_lock_obj);
-  _pthread_mutex_wipe_np(&__cxa_lock_obj);
-  if (_weaken(cosmo_stack_wipe))
-    _weaken(cosmo_stack_wipe)();
-  if (_weaken(__dlopen_wipe))
-    _weaken(__dlopen_wipe)();
-  if (_weaken(__localtime_wipe))
-    _weaken(__localtime_wipe)();
-  if (IsWindows()) {
-    // we don't bother locking the proc/itimer/sig locks above since
-    // their state is reset in the forked child. nothing to protect.
-    sys_read_nt_wipe_keystrokes();
-    __proc_wipe_and_reset();
-    __itimer_wipe_and_reset();
-    atomic_init(&__sig_worker_state, 0);
-    if (_weaken(__sig_init))
-      _weaken(__sig_init)();
-    if (_weaken(sys_getppid_nt_wipe))
-      _weaken(sys_getppid_nt_wipe)(ppid_win32, ppid_cosmo);
-  }
+static void _onfork_child(void) {
+  if (IsWindows())
+    __proc_wipe();
+  __fds_lock_obj = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+  _rand64_lock_obj = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+  _pthread_lock_obj = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+  atomic_store_explicit(&__maps.lock, 0, memory_order_relaxed);
+  atomic_store_explicit(&__get_tls()->tib_relock_maps, 0, memory_order_relaxed);
   if (_weaken(_pthread_onfork_child))
     _weaken(_pthread_onfork_child)();
-  _pthread_mutex_wipe_np(&supreme_lock);
 }
 
 int _fork(uint32_t dwCreationFlags) {
+  long micros;
   struct Dll *e;
-  int ax, dx, tid, ppid_win32, ppid_cosmo;
-  ppid_win32 = IsWindows() ? GetCurrentProcessId() : 0;
-  ppid_cosmo = __pid;
+  struct timespec started;
+  int ax, dx, tid, parent;
+  parent = __pid;
   BLOCK_SIGNALS;
-  fork_prepare();
+  if (__threaded)
+    _onfork_prepare();
+  started = timespec_real();
   if (!IsWindows()) {
     ax = sys_fork();
   } else {
     ax = sys_fork_nt(dwCreationFlags);
   }
+  micros = timespec_tomicros(timespec_sub(timespec_real(), started));
   if (!ax) {
 
     // get new process id
     if (!IsWindows()) {
       dx = sys_getpid().ax;
     } else {
-      dx = __imp_GetCurrentProcessId();
+      dx = GetCurrentProcessId();
     }
     __pid = dx;
 
-    // get new thread id
-    struct CosmoTib *tib = __get_tls();
-    struct PosixThread *me = (struct PosixThread *)tib->tib_pthread;
-    tid = IsLinux() || IsXnuSilicon() ? dx : sys_gettid();
-    atomic_init(&tib->tib_ctid, tid);
-    atomic_init(&tib->tib_ptid, tid);
-
-    // tracing and kisdangerous need this lock wiped a little earlier
-    atomic_init(&__maps.lock.word, 0);
-
-    /*
-     * it's now safe to call normal functions again
-     */
-
-    // this wipe must happen fast
-    void nsync_waiter_wipe_(void);
-    if (_weaken(nsync_waiter_wipe_))
-      _weaken(nsync_waiter_wipe_)();
-
     // turn other threads into zombies
     // we can't free() them since we're monopolizing all locks
     // we assume the operating system already reclaimed system handles
-    dll_remove(&_pthread_list, &me->list);
-    struct Dll *old_threads = _pthread_list;
-    _pthread_list = 0;
-    dll_make_first(&_pthread_list, &me->list);
-    atomic_init(&_pthread_count, 1);
+    struct CosmoTib *tib = __get_tls();
+    struct PosixThread *pt = (struct PosixThread *)tib->tib_pthread;
+    dll_remove(&_pthread_list, &pt->list);
+    for (e = dll_first(_pthread_list); e; e = dll_next(_pthread_list, e)) {
+      atomic_store_explicit(&POSIXTHREAD_CONTAINER(e)->pt_status,
+                            kPosixThreadZombie, memory_order_relaxed);
+      atomic_store_explicit(&POSIXTHREAD_CONTAINER(e)->tib->tib_syshand, 0,
+                            memory_order_relaxed);
+    }
+    dll_make_first(&_pthread_list, &pt->list);
+
+    // get new main thread id
+    tid = IsLinux() || IsXnuSilicon() ? dx : sys_gettid();
+    atomic_store_explicit(&tib->tib_tid, tid, memory_order_relaxed);
+    atomic_store_explicit(&pt->ptid, tid, memory_order_relaxed);
 
     // get new system thread handle
     intptr_t syshand = 0;
@@ -262,53 +140,20 @@ int _fork(uint32_t dwCreationFlags) {
                       GetCurrentProcess(), &syshand, 0, false,
                       kNtDuplicateSameAccess);
     }
-    atomic_init(&tib->tib_syshand, syshand);
-
-    // the child's pending signals is initially empty
-    atomic_init(&tib->tib_sigpending, 0);
+    atomic_store_explicit(&tib->tib_syshand, syshand, memory_order_relaxed);
 
     // we can't be canceled if the canceler no longer exists
-    atomic_init(&me->pt_canceled, false);
-
-    // forget locks
-    bzero(tib->tib_locks, sizeof(tib->tib_locks));
-
-    // xnu fork() doesn't preserve sigaltstack()
-    if (IsXnu() && me->tib->tib_sigstack_addr) {
-      struct sigaltstack_bsd ss;
-      ss.ss_sp = me->tib->tib_sigstack_addr;
-      ss.ss_size = me->tib->tib_sigstack_size;
-      ss.ss_flags = me->tib->tib_sigstack_flags;
-      if (IsXnuSilicon()) {
-        __syslib->__sigaltstack(&ss, 0);
-      } else {
-        sys_sigaltstack(&ss, 0);
-      }
-    }
+    atomic_store_explicit(&pt->pt_canceled, false, memory_order_relaxed);
 
     // run user fork callbacks
-    fork_child(ppid_win32, ppid_cosmo);
-
-    // free threads
-    if (_weaken(_pthread_free)) {
-      while ((e = dll_first(old_threads))) {
-        struct PosixThread *pt = POSIXTHREAD_CONTAINER(e);
-        atomic_init(&pt->tib->tib_syshand, 0);
-        dll_remove(&old_threads, e);
-        _weaken(_pthread_free)(pt);
-      }
-    }
-
-    // reactivate ftrace
-    /* if (ftrace_stackdigs) */
-    /*   if (_weaken(ftrace_install)) */
-    /*     _weaken(ftrace_install)(); */
-
-    STRACE("fork() → 0 (child of %d)", ppid_cosmo);
+    if (__threaded)
+      _onfork_child();
+    STRACE("fork() → 0 (child of %d; took %ld us)", parent, micros);
   } else {
     // this is the parent process
-    fork_parent();
-    STRACE("fork() → %d% m", ax);
+    if (__threaded)
+      _onfork_parent();
+    STRACE("fork() → %d% m (took %ld us)", ax, micros);
   }
   ALLOW_SIGNALS;
   return ax;
diff --git a/libc/proc/getppid-nt.c b/libc/proc/getppid-nt.c
deleted file mode 100644
index c602042e6..000000000
--- a/libc/proc/getppid-nt.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
-#include "libc/calls/syscall-nt.internal.h"
-#include "libc/cosmo.h"
-#include "libc/dce.h"
-#include "libc/fmt/itoa.h"
-#include "libc/nt/enum/status.h"
-#include "libc/nt/nt/process.h"
-#include "libc/nt/process.h"
-#include "libc/nt/runtime.h"
-#include "libc/nt/struct/processbasicinformation.h"
-#include "libc/runtime/internal.h"
-#include "libc/runtime/runtime.h"
-
-int sys_getppid_nt_win32;
-int sys_getppid_nt_cosmo;
-
-textwindows static int sys_getppid_nt_ntdll(void) {
-  struct NtProcessBasicInformation ProcessInformation;
-  uint32_t gotsize = 0;
-  if (!NtError(
-          NtQueryInformationProcess(GetCurrentProcess(), 0, &ProcessInformation,
-                                    sizeof(ProcessInformation), &gotsize)) &&
-      gotsize >= sizeof(ProcessInformation) &&
-      ProcessInformation.InheritedFromUniqueProcessId) {
-    return ProcessInformation.InheritedFromUniqueProcessId;
-  }
-  return 0;
-}
-
-static void sys_getppid_nt_extract(const char *str) {
-  int c;
-  int win32 = 0;
-  int cosmo = 0;
-  if (str) {
-    for (;;) {
-      c = *str;
-      if (!('0' <= c && c <= '9'))
-        break;
-      win32 *= 10;
-      win32 += c - '0';
-      ++str;
-    }
-    if (win32 && *str++ == ':') {
-      for (;;) {
-        c = *str;
-        if (!('0' <= c && c <= '9'))
-          break;
-        cosmo *= 10;
-        cosmo += c - '0';
-        ++str;
-      }
-      if (win32 == sys_getppid_nt_ntdll()) {
-        sys_getppid_nt_win32 = win32;
-        sys_getppid_nt_cosmo = cosmo;
-      }
-    }
-  }
-}
-
-__attribute__((__constructor__(90))) static void init(void) {
-  if (!IsWindows())
-    return;
-  sys_getppid_nt_extract(getenv("_COSMO_PPID"));
-}
-
-textwindows int sys_getppid_nt(void) {
-  if (sys_getppid_nt_cosmo)
-    return sys_getppid_nt_cosmo;
-  return sys_getppid_nt_ntdll();
-}
-
-textwindows void sys_getppid_nt_wipe(int win32, int cosmo) {
-  sys_getppid_nt_win32 = win32;
-  sys_getppid_nt_cosmo = cosmo;
-}
diff --git a/libc/proc/getpriority-nt.c b/libc/proc/getpriority-nt.c
index ff4fca305..67d84363c 100644
--- a/libc/proc/getpriority-nt.c
+++ b/libc/proc/getpriority-nt.c
@@ -22,7 +22,7 @@
 #include "libc/nt/errors.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/sysv/consts/prio.h"
 #include "libc/sysv/errfuns.h"
 
diff --git a/libc/proc/getrusage-nt.c b/libc/proc/getrusage-nt.c
index 07bde103a..2b0917843 100644
--- a/libc/proc/getrusage-nt.c
+++ b/libc/proc/getrusage-nt.c
@@ -29,7 +29,7 @@
 #include "libc/nt/struct/iocounters.h"
 #include "libc/nt/struct/processmemorycounters.h"
 #include "libc/nt/thread.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/rusage.h"
 #include "libc/sysv/errfuns.h"
@@ -58,8 +58,9 @@ textwindows int sys_getrusage_nt(int who, struct rusage *usage) {
     return einval();
   }
 
-  if (!usage)
+  if (!usage) {
     return 0;
+  }
 
   if (!(who == RUSAGE_THREAD ? GetThreadTimes : GetProcessTimes)(
           me, &ftCreation, &ftExit, &ftKernel, &ftUser) ||
diff --git a/libc/proc/handle.c b/libc/proc/handle.c
index 83c134e4a..10c220328 100644
--- a/libc/proc/handle.c
+++ b/libc/proc/handle.c
@@ -19,7 +19,7 @@
 #include "libc/calls/calls.h"
 #include "libc/intrin/weaken.h"
 #include "libc/nt/runtime.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 
 // retrieves handle of process
 // supports only current process and processes we created
diff --git a/libc/proc/kill-nt.c b/libc/proc/kill-nt.c
index a820bf932..45f9b740d 100644
--- a/libc/proc/kill-nt.c
+++ b/libc/proc/kill-nt.c
@@ -16,24 +16,19 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
 #include "libc/calls/calls.h"
-#include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall-nt.internal.h"
 #include "libc/errno.h"
-#include "libc/intrin/atomic.h"
 #include "libc/intrin/dll.h"
 #include "libc/intrin/strace.h"
 #include "libc/nt/console.h"
-#include "libc/nt/enum/creationdisposition.h"
 #include "libc/nt/enum/ctrlevent.h"
 #include "libc/nt/enum/processaccess.h"
 #include "libc/nt/errors.h"
-#include "libc/nt/memory.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
 #ifdef __x86_64__
@@ -41,39 +36,31 @@
 textwindows int sys_kill_nt(int pid, int sig) {
 
   // validate api usage
-  if (!(0 <= sig && sig <= 64))
+  if (!(0 <= sig && sig <= 64)) {
     return einval();
+  }
 
   // XXX: NT doesn't really have process groups. For instance the
   //      CreateProcess() flag for starting a process group actually
   //      just does an "ignore ctrl-c" internally.
-  if (pid < -1)
+  if (pid < -1) {
     pid = -pid;
+  }
 
   // no support for kill all yet
-  if (pid == -1)
+  if (pid == -1) {
     return einval();
+  }
 
   // just call raise() if we're targeting self
   if (pid <= 0 || pid == getpid()) {
     if (sig) {
       if (pid <= 0) {
-        // if pid is 0 or -1 then kill the processes beneath us too.
-        // this isn't entirely right but it's closer to being right.
-        // having this behavior is helpful for servers like redbean.
         struct Dll *e;
         BLOCK_SIGNALS;
         __proc_lock();
-        for (e = dll_first(__proc.list); e; e = dll_next(__proc.list, e)) {
-          atomic_ulong *sigproc;
-          struct Proc *pr = PROC_CONTAINER(e);
-          if (sig != 9 && (sigproc = __sig_map_process(pid, kNtOpenExisting))) {
-            atomic_fetch_or_explicit(sigproc, 1ull << (sig - 1),
-                                     memory_order_release);
-          } else {
-            TerminateProcess(pr->handle, sig);
-          }
-        }
+        for (e = dll_first(__proc.list); e; e = dll_next(__proc.list, e))
+          TerminateProcess(PROC_CONTAINER(e)->handle, sig);
         __proc_unlock();
         ALLOW_SIGNALS;
       }
@@ -83,45 +70,33 @@ textwindows int sys_kill_nt(int pid, int sig) {
     }
   }
 
-  // attempt to signal via shared memory file
-  //
-  // now that we know the process exists, if it has a shared memory file
-  // then we can be reasonably certain it's a cosmo process which should
-  // be trusted to deliver its signal, unless it's a nine exterminations
-  if (pid > 0 && sig != 9) {
-    atomic_ulong *sigproc;
-    if ((sigproc = __sig_map_process(pid, kNtOpenExisting))) {
-      if (sig > 0)
-        atomic_fetch_or_explicit(sigproc, 1ull << (sig - 1),
-                                 memory_order_release);
-      UnmapViewOfFile(sigproc);
-      if (sig != 9)
-        return 0;
-    }
-  }
-
   // find existing handle we own for process
-  //
-  // this step should come first to verify process existence. this is
-  // because there's no guarantee that just because the shared memory
-  // file exists, the process actually exists.
   int64_t handle, closeme = 0;
   if (!(handle = __proc_handle(pid))) {
-    if (!(handle = OpenProcess(kNtProcessTerminate, false, pid)))
-      return eperm();
-    closeme = handle;
+    if ((handle = OpenProcess(kNtProcessTerminate, false, pid))) {
+      closeme = handle;
+    } else {
+      goto OnError;
+    }
   }
 
   // perform actual kill
   // process will report WIFSIGNALED with WTERMSIG(sig)
-  if (sig != 9)
-    STRACE("warning: kill() sending %G via terminate", sig);
   bool32 ok = TerminateProcess(handle, sig);
   if (closeme)
     CloseHandle(closeme);
   if (ok)
     return 0;
-  return esrch();
+
+  // handle error
+OnError:
+  switch (GetLastError()) {
+    case kNtErrorInvalidHandle:
+    case kNtErrorInvalidParameter:
+      return esrch();
+    default:
+      return eperm();
+  }
 }
 
 #endif /* __x86_64__ */
diff --git a/libc/proc/kill.c b/libc/proc/kill.c
index 5de445fd3..9c0e99a6e 100644
--- a/libc/proc/kill.c
+++ b/libc/proc/kill.c
@@ -29,20 +29,6 @@
  * The impact of this action can be terminating the process, or
  * interrupting it to request something happen.
  *
- * On Windows, signals are delivered between processes using shared
- * memory files stored in C:\ProgramData\cosmo\sig\x\y.pid which hold
- * the process signal mask. Any process that can access these files can
- * signal a cosmo process. The targeting process will then notice that a
- * signal has been added and delivers to any thread as soon as possible.
- *
- * On Windows, the only signal that's guaranteed to work on non-cosmocc
- * processes is SIGKILL.
- *
- * On Windows, the concept of a process group isn't fully implemented.
- * Saying `kill(0, sig)` will deliver `sig` to all direct descendent
- * processes. Saying `kill(-pid, sig)` will be the same as saying
- * `kill(pid, sig)`.
- *
  * @param pid can be:
  *      >0 signals one process by id
  *      =0 signals all processes in current process group
diff --git a/libc/proc/nice.c b/libc/proc/nice.c
index de243b93c..783ed4936 100644
--- a/libc/proc/nice.c
+++ b/libc/proc/nice.c
@@ -19,7 +19,7 @@
 #include "libc/calls/calls.h"
 #include "libc/fmt/conv.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/prio.h"
 
 static int clamp(int p) {
diff --git a/libc/proc/posix_spawn.c b/libc/proc/posix_spawn.c
index 4dbbdcea9..a7209abac 100644
--- a/libc/proc/posix_spawn.c
+++ b/libc/proc/posix_spawn.c
@@ -22,6 +22,7 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/rlimit.h"
 #include "libc/calls/struct/rlimit.internal.h"
 #include "libc/calls/struct/rusage.internal.h"
@@ -38,7 +39,6 @@
 #include "libc/intrin/bsf.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/dll.h"
-#include "libc/intrin/fds.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/mem/alloca.h"
@@ -51,7 +51,6 @@
 #include "libc/nt/enum/processcreationflags.h"
 #include "libc/nt/enum/startf.h"
 #include "libc/nt/files.h"
-#include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/processinformation.h"
 #include "libc/nt/struct/startupinfo.h"
@@ -59,8 +58,7 @@
 #include "libc/proc/ntspawn.h"
 #include "libc/proc/posix_spawn.h"
 #include "libc/proc/posix_spawn.internal.h"
-#include "libc/proc/proc.h"
-#include "libc/runtime/internal.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sock/sock.h"
 #include "libc/stdio/stdio.h"
@@ -97,10 +95,6 @@
 
 #define CLOSER_CONTAINER(e) DLL_CONTAINER(struct Closer, elem, e)
 
-static atomic_bool has_vfork;  // i.e. not qemu/wsl/xnu/openbsd
-
-#ifdef __x86_64__
-
 struct Closer {
   int64_t handle;
   struct Dll elem;
@@ -112,6 +106,8 @@ struct SpawnFds {
   struct Dll *closers;
 };
 
+static atomic_bool has_vfork;  // i.e. not qemu/wsl/xnu/openbsd
+
 static textwindows int64_t spawnfds_handle(struct SpawnFds *fds, int fd) {
   if (__is_cloexec(fds->p + fd))
     return -1;
@@ -198,21 +194,10 @@ static textwindows errno_t spawnfds_open(struct SpawnFds *fds, int64_t dirhand,
   errno_t err;
   char16_t path16[PATH_MAX];
   uint32_t perm, share, disp, attr;
-  if (!strcmp(path, "/dev/null")) {
-    strcpy16(path16, u"NUL");
-  } else if (!strcmp(path, "/dev/stdin")) {
-    return spawnfds_dup2(fds, 0, fildes);
-  } else if (!strcmp(path, "/dev/stdout")) {
-    return spawnfds_dup2(fds, 1, fildes);
-  } else if (!strcmp(path, "/dev/stderr")) {
-    return spawnfds_dup2(fds, 2, fildes);
-  } else {
-    if (__mkntpathath(dirhand, path, 0, path16) == -1)
-      return errno;
-  }
   if ((err = spawnfds_ensure(fds, fildes)))
     return err;
-  if (GetNtOpenFlags(oflag, mode, &perm, &share, &disp, &attr) != -1 &&
+  if (__mkntpathath(dirhand, path, 0, path16) != -1 &&
+      GetNtOpenFlags(oflag, mode, &perm, &share, &disp, &attr) != -1 &&
       (h = CreateFile(path16, perm, share, &kNtIsInheritable, disp, attr, 0))) {
     spawnfds_closelater(fds, h);
     fds->p[fildes].kind = kFdFile;
@@ -315,30 +300,30 @@ static textwindows errno_t posix_spawn_nt_impl(
         case _POSIX_SPAWN_CLOSE:
           err = spawnfds_close(&fds, a->fildes);
           STRACE("spawnfds_close(%d) → %s", a->fildes,
-                 _DescribeErrno(errno_buf, err));
+                 (DescribeErrno)(errno_buf, err));
           break;
         case _POSIX_SPAWN_DUP2:
           err = spawnfds_dup2(&fds, a->fildes, a->newfildes);
           STRACE("spawnfds_dup2(%d, %d) → %s", a->fildes, a->newfildes,
-                 _DescribeErrno(errno_buf, err));
+                 (DescribeErrno)(errno_buf, err));
           break;
         case _POSIX_SPAWN_OPEN:
           err = spawnfds_open(&fds, dirhand, a->path, a->oflag, a->mode,
                               a->fildes);
           STRACE("spawnfds_open(%#s, %s, %s, %d) → %s", a->path,
-                 _DescribeOpenFlags(oflags_buf, a->oflag),
-                 _DescribeOpenMode(openmode_buf, a->oflag, a->mode), a->fildes,
-                 _DescribeErrno(errno_buf, err));
+                 (DescribeOpenFlags)(oflags_buf, a->oflag),
+                 (DescribeOpenMode)(openmode_buf, a->oflag, a->mode), a->fildes,
+                 (DescribeErrno)(errno_buf, err));
           break;
         case _POSIX_SPAWN_CHDIR:
           err = spawnfds_chdir(&fds, dirhand, a->path, &dirhand);
           STRACE("spawnfds_chdir(%#s) → %s", a->path,
-                 _DescribeErrno(errno_buf, err));
+                 (DescribeErrno)(errno_buf, err));
           break;
         case _POSIX_SPAWN_FCHDIR:
           err = spawnfds_fchdir(&fds, a->fildes, &dirhand);
           STRACE("spawnfds_fchdir(%d) → %s", a->fildes,
-                 _DescribeErrno(errno_buf, err));
+                 (DescribeErrno)(errno_buf, err));
           break;
         default:
           __builtin_unreachable();
@@ -352,8 +337,12 @@ static textwindows errno_t posix_spawn_nt_impl(
   // figure out flags
   uint32_t dwCreationFlags = 0;
   short flags = attrp && *attrp ? (*attrp)->flags : 0;
-  if (flags & (POSIX_SPAWN_SETPGROUP | POSIX_SPAWN_SETSID))
+  if (flags & POSIX_SPAWN_SETSID) {
+    dwCreationFlags |= kNtDetachedProcess;
+  }
+  if (flags & POSIX_SPAWN_SETPGROUP) {
     dwCreationFlags |= kNtCreateNewProcessGroup;
+  }
 
   // create process startinfo
   struct NtStartupInfo startinfo = {
@@ -375,19 +364,6 @@ static textwindows errno_t posix_spawn_nt_impl(
     }
   }
 
-  // UNC paths break some things when they are not needed.
-  if (lpCurrentDirectory) {
-    size_t n = strlen16(lpCurrentDirectory);
-    if (n > 4 && n < 260 &&               //
-        lpCurrentDirectory[0] == '\\' &&  //
-        lpCurrentDirectory[1] == '\\' &&  //
-        lpCurrentDirectory[2] == '?' &&   //
-        lpCurrentDirectory[3] == '\\') {
-      memmove(lpCurrentDirectory, lpCurrentDirectory + 4,
-              (n - 4 + 1) * sizeof(char16_t));
-    }
-  }
-
   // inherit signal mask
   sigset_t childmask;
   char maskvar[6 + 21];
@@ -398,14 +374,6 @@ static textwindows errno_t posix_spawn_nt_impl(
   }
   FormatUint64(stpcpy(maskvar, "_MASK="), childmask);
 
-  // inherit parent process id
-  char ppidvar[12 + 21 + 1 + 21 + 1], *p = ppidvar;
-  p = stpcpy(p, "_COSMO_PPID=");
-  p = FormatUint64(p, GetCurrentProcessId());
-  *p++ = ':';
-  p = FormatUint64(p, __pid);
-  setenv("_COSMO_PPID", ppidvar, true);
-
   // launch process
   int rc = -1;
   struct NtProcessInformation procinfo;
@@ -461,8 +429,6 @@ static textwindows dontinline errno_t posix_spawn_nt(
   return err;
 }
 
-#endif  // __x86_64__
-
 /**
  * Spawns process, the POSIX way, e.g.
  *
@@ -480,101 +446,23 @@ static textwindows dontinline errno_t posix_spawn_nt(
  *     posix_spawnattr_destroy(&sa);
  *     while (wait(&status) != -1);
  *
- * The posix_spawn() function may be used to launch subprocesses. The
- * primary advantage of using posix_spawn() instead of the traditional
- * fork() / execve() combination for launching processes is efficiency
- * and cross-platform compatibility.
+ * This provides superior process creation performance across systems
  *
- * 1. On Linux, FreeBSD, and NetBSD:
- *
- *    Cosmopolitan Libc's posix_spawn() uses vfork() under the hood on
- *    these platforms automatically, since it's faster than fork(). It's
- *    because vfork() creates a child process without needing to copy
- *    the parent's page tables, making it more efficient, especially for
- *    large processes. Furthermore, vfork() avoids the need to acquire
- *    every single mutex (see pthread_atfork() for more details) which
- *    makes it scalable in multi-threaded apps, since the other threads
- *    in your app can keep going while the spawning thread waits for the
- *    subprocess to call execve(). Normally vfork() is error-prone since
- *    there exists few functions that are @vforksafe. the posix_spawn()
- *    API is designed to offer maximum assurance that you can't shoot
- *    yourself in the foot. If you do, then file a bug with Cosmo.
- *
- * 2. On Windows:
- *
- *    posix_spawn() avoids fork() entirely. Windows doesn't natively
- *    support fork(), and emulating it can be slow and memory-intensive.
- *    By using posix_spawn(), we get a much faster process creation on
- *    Windows systems, because it only needs to call CreateProcess().
- *    Your file actions are replayed beforehand in a simulated way. Only
- *    Cosmopolitan Libc offers this level of quality. With Cygwin you'd
- *    have to use its proprietary APIs to achieve the same performance.
- *
- * 3. Simplified error handling:
- *
- *    posix_spawn() combines process creation and program execution in a
- *    single call, reducing the points of failure and simplifying error
- *    handling. One important thing that happens with Cosmopolitan's
- *    posix_spawn() implementation is that the error code of execve()
- *    inside your subprocess, should it fail, will be propagated to your
- *    parent process. This will happen efficiently via vfork() shared
- *    memory in the event your Linux environment supports this. If it
- *    doesn't, then Cosmopolitan will fall back to a throwaway pipe().
- *    The pipe is needed on platforms like XNU and OpenBSD which do not
- *    support vfork(). It's also needed under QEMU User.
- *
- * 4. Signal safety:
- *
- *    posix_spawn() guarantees your signal handler callback functions
- *    won't be executed in the child process. By default, it'll remove
- *    sigaction() callbacks atomically. This ensures that if something
- *    like a SIGTERM or SIGHUP is sent to the child process before it's
- *    had a chance to call execve(), then the child process will simply
- *    be terminated (like the spawned process would) instead of running
- *    whatever signal handlers the spawning process has installed. If
- *    you've set some signals to SIG_IGN, then that'll be preserved for
- *    the child process by posix_spawn(), unless you explicitly call
- *    posix_spawnattr_setsigdefault() to reset them.
- *
- * 5. Portability:
- *
- *    posix_spawn() is part of the POSIX standard, making it more
- *    portable across different UNIX-like systems and Windows (with
- *    appropriate libraries). Even the non-POSIX APIs we use here are
- *    portable; e.g. posix_spawn_file_actions_addchdir_np() is supported
- *    by glibc, musl libc, and apple libc too.
- *
- * When using posix_spawn() you have the option of passing an attributes
- * object that specifies how the child process should be created. These
- * functions are provided by Cosmopolitan Libc for setting attributes:
- *
- * - posix_spawnattr_init()
- * - posix_spawnattr_destroy()
- * - posix_spawnattr_setflags()
- * - posix_spawnattr_getflags()
- * - posix_spawnattr_setsigmask()
- * - posix_spawnattr_getsigmask()
- * - posix_spawnattr_setpgroup()
- * - posix_spawnattr_getpgroup()
- * - posix_spawnattr_setrlimit_np()
- * - posix_spawnattr_getrlimit_np()
- * - posix_spawnattr_setschedparam()
- * - posix_spawnattr_getschedparam()
- * - posix_spawnattr_setschedpolicy()
- * - posix_spawnattr_getschedpolicy()
- * - posix_spawnattr_setsigdefault()
- * - posix_spawnattr_getsigdefault()
- *
- * You can also pass an ordered list of file actions to perform. The
- * following APIs are provided by Cosmopolitan Libc for doing that:
- *
- * - posix_spawn_file_actions_init()
- * - posix_spawn_file_actions_destroy()
- * - posix_spawn_file_actions_adddup2()
- * - posix_spawn_file_actions_addopen()
- * - posix_spawn_file_actions_addclose()
- * - posix_spawn_file_actions_addchdir_np()
- * - posix_spawn_file_actions_addfchdir_np()
+ * Processes are normally spawned by calling fork() and execve(), but
+ * that goes slow on Windows if the caller has allocated a nontrivial
+ * number of memory mappings, all of which need to be copied into the
+ * forked child, only to be destroyed a moment later. On UNIX systems
+ * fork() bears a similar cost that's 100x less bad, which is copying
+ * the page tables. So what this implementation does is on Windows it
+ * calls CreateProcess() directly and on UNIX it uses vfork() if it's
+ * possible (XNU and OpenBSD don't have it). On UNIX this API has the
+ * benefit of avoiding the footguns of using vfork() directly because
+ * this implementation will ensure signal handlers can't be called in
+ * the child process since that'd likely corrupt the parent's memory.
+ * On systems with a real vfork() implementation, the execve() status
+ * code is returned by this function via shared memory; otherwise, it
+ * gets passed via a temporary pipe (on systems like QEmu, Blink, and
+ * XNU/OpenBSD) whose support is auto-detected at runtime.
  *
  * @param pid if non-null shall be set to child pid on success
  * @param path is resolved path of program which is not `$PATH` searched
@@ -594,40 +482,39 @@ errno_t posix_spawn(int *pid, const char *path,
                     const posix_spawn_file_actions_t *file_actions,
                     const posix_spawnattr_t *attrp, char *const argv[],
                     char *const envp[]) {
-#ifdef __x86_64__
   if (IsWindows())
     return posix_spawn_nt(pid, path, file_actions, attrp, argv, envp);
-#endif
   int pfds[2];
   bool use_pipe;
   volatile int status = 0;
   sigset_t blockall, oldmask;
   int child, res, cs, e = errno;
   volatile bool can_clobber = false;
-  short flags = attrp && *attrp ? (*attrp)->flags : 0;
   sigfillset(&blockall);
   sigprocmask(SIG_SETMASK, &blockall, &oldmask);
   pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
-  if ((use_pipe = (flags & POSIX_SPAWN_USEFORK) ||
-                  !atomic_load_explicit(&has_vfork, memory_order_acquire))) {
+  if ((use_pipe = !atomic_load_explicit(&has_vfork, memory_order_acquire))) {
     if (pipe2(pfds, O_CLOEXEC)) {
       res = errno;
       goto ParentFailed;
     }
   }
-  if (!(child = (flags & POSIX_SPAWN_USEFORK) ? fork() : vfork())) {
+  if (!(child = vfork())) {
     can_clobber = true;
     sigset_t childmask;
     bool lost_cloexec = 0;
     struct sigaction dfl = {0};
+    short flags = attrp && *attrp ? (*attrp)->flags : 0;
     if (use_pipe)
       close(pfds[0]);
-    for (int sig = 1; sig <= NSIG; sig++)
+    for (int sig = 1; sig < _NSIG; sig++) {
       if (__sighandrvas[sig] != (long)SIG_DFL &&
           (__sighandrvas[sig] != (long)SIG_IGN ||
            ((flags & POSIX_SPAWN_SETSIGDEF) &&
-            sigismember(&(*attrp)->sigdefault, sig) == 1)))
+            sigismember(&(*attrp)->sigdefault, sig) == 1))) {
         sigaction(sig, &dfl, 0);
+      }
+    }
     if (flags & POSIX_SPAWN_SETSID)
       setsid();
     if ((flags & POSIX_SPAWN_SETPGROUP) && setpgid(0, (*attrp)->pgroup))
@@ -692,7 +579,7 @@ errno_t posix_spawn(int *pid, const char *path,
         if (sched_setparam(0, &(*attrp)->schedparam))
           goto ChildFailed;
     }
-    if (flags & POSIX_SPAWN_SETRLIMIT_NP) {
+    if (flags & POSIX_SPAWN_SETRLIMIT) {
       int rlimset = (*attrp)->rlimset;
       while (rlimset) {
         int resource = bsf(rlimset);
@@ -725,8 +612,9 @@ errno_t posix_spawn(int *pid, const char *path,
     }
     _Exit(127);
   }
-  if (use_pipe)
+  if (use_pipe) {
     close(pfds[1]);
+  }
   if (child != -1) {
     if (!use_pipe) {
       res = status;
diff --git a/libc/proc/posix_spawn.h b/libc/proc/posix_spawn.h
index 2efa2258a..9da96f721 100644
--- a/libc/proc/posix_spawn.h
+++ b/libc/proc/posix_spawn.h
@@ -12,8 +12,7 @@
 #define POSIX_SPAWN_SETSCHEDPARAM 16
 #define POSIX_SPAWN_SETSCHEDULER  32
 #define POSIX_SPAWN_SETSID        128
-#define POSIX_SPAWN_SETRLIMIT_NP  256
-#define POSIX_SPAWN_USEFORK       512
+#define POSIX_SPAWN_SETRLIMIT     256
 
 COSMOPOLITAN_C_START_
 
@@ -56,10 +55,10 @@ int posix_spawnattr_getsigdefault(const posix_spawnattr_t *,
                                   sigset_t *) libcesque;
 int posix_spawnattr_setsigdefault(posix_spawnattr_t *,
                                   const sigset_t *) libcesque;
-int posix_spawnattr_getrlimit_np(const posix_spawnattr_t *, int,
-                                 struct rlimit *) libcesque;
-int posix_spawnattr_setrlimit_np(posix_spawnattr_t *, int,
-                                 const struct rlimit *) libcesque;
+int posix_spawnattr_getrlimit(const posix_spawnattr_t *, int,
+                              struct rlimit *) libcesque;
+int posix_spawnattr_setrlimit(posix_spawnattr_t *, int,
+                              const struct rlimit *) libcesque;
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_STDIO_SPAWN_H_ */
diff --git a/libc/proc/posix_spawnattr_getrlimit.c b/libc/proc/posix_spawnattr_getrlimit.c
index c45a3387c..941e40889 100644
--- a/libc/proc/posix_spawnattr_getrlimit.c
+++ b/libc/proc/posix_spawnattr_getrlimit.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/rlimit.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/proc/posix_spawn.h"
 #include "libc/proc/posix_spawn.internal.h"
 #include "libc/stdio/sysparam.h"
@@ -31,8 +31,8 @@
  * @raise EINVAL if `resource` is invalid or unsupported by host
  * @raise ENOENT if `resource` is absent
  */
-int posix_spawnattr_getrlimit_np(const posix_spawnattr_t *attr, int resource,
-                                 struct rlimit *rlim) {
+int posix_spawnattr_getrlimit(const posix_spawnattr_t *attr, int resource,
+                              struct rlimit *rlim) {
   if (0 <= resource && resource < MIN(RLIM_NLIMITS, ARRAYLEN((*attr)->rlim))) {
     if (((*attr)->rlimset & (1u << resource))) {
       *rlim = (*attr)->rlim[resource];
diff --git a/libc/proc/posix_spawnattr_setflags.c b/libc/proc/posix_spawnattr_setflags.c
index 057306dec..13cb82f28 100644
--- a/libc/proc/posix_spawnattr_setflags.c
+++ b/libc/proc/posix_spawnattr_setflags.c
@@ -25,8 +25,6 @@
  *
  * @param attr was initialized by posix_spawnattr_init()
  * @param flags may have any of the following
- *     - `POSIX_SPAWN_USEFORK`
- *     - `POSIX_SPAWN_USEVFORK`
  *     - `POSIX_SPAWN_RESETIDS`
  *     - `POSIX_SPAWN_SETPGROUP`
  *     - `POSIX_SPAWN_SETSIGDEF`
@@ -34,13 +32,12 @@
  *     - `POSIX_SPAWN_SETSCHEDPARAM`
  *     - `POSIX_SPAWN_SETSCHEDULER`
  *     - `POSIX_SPAWN_SETSID`
- *     - `POSIX_SPAWN_SETRLIMIT_NP`
+ *     - `POSIX_SPAWN_SETRLIMIT`
  * @return 0 on success, or errno on error
  * @raise EINVAL if `flags` has invalid bits
  */
 int posix_spawnattr_setflags(posix_spawnattr_t *attr, short flags) {
-  if (flags & ~(POSIX_SPAWN_USEFORK |        //
-                POSIX_SPAWN_USEVFORK |       //
+  if (flags & ~(POSIX_SPAWN_USEVFORK |       //
                 POSIX_SPAWN_RESETIDS |       //
                 POSIX_SPAWN_SETPGROUP |      //
                 POSIX_SPAWN_SETSIGDEF |      //
@@ -48,7 +45,7 @@ int posix_spawnattr_setflags(posix_spawnattr_t *attr, short flags) {
                 POSIX_SPAWN_SETSCHEDPARAM |  //
                 POSIX_SPAWN_SETSCHEDULER |   //
                 POSIX_SPAWN_SETSID |         //
-                POSIX_SPAWN_SETRLIMIT_NP)) {
+                POSIX_SPAWN_SETRLIMIT)) {
     return EINVAL;
   }
   (*attr)->flags = flags;
diff --git a/libc/proc/posix_spawnattr_setrlimit.c b/libc/proc/posix_spawnattr_setrlimit.c
index 60c68cdf2..0d0a7970b 100644
--- a/libc/proc/posix_spawnattr_setrlimit.c
+++ b/libc/proc/posix_spawnattr_setrlimit.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/rlimit.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/proc/posix_spawn.h"
 #include "libc/proc/posix_spawn.internal.h"
 #include "libc/sysv/consts/rlim.h"
@@ -26,14 +26,14 @@
 /**
  * Sets resource limit on spawned process.
  *
- * You also need to pass `POSIX_SPAWN_SETRLIMIT_NP` to
+ * You also need to pass `POSIX_SPAWN_SETRLIMIT` to
  * posix_spawnattr_setflags() for it to take effect.
  *
  * @return 0 on success, or errno on error
  * @raise EINVAL if resource is invalid
  */
-int posix_spawnattr_setrlimit_np(posix_spawnattr_t *attr, int resource,
-                                 const struct rlimit *rlim) {
+int posix_spawnattr_setrlimit(posix_spawnattr_t *attr, int resource,
+                              const struct rlimit *rlim) {
   if (0 <= resource && resource < MIN(RLIM_NLIMITS, ARRAYLEN((*attr)->rlim))) {
     (*attr)->rlimset |= 1u << resource;
     (*attr)->rlim[resource] = *rlim;
diff --git a/libc/proc/proc.c b/libc/proc/proc.c
index 0288075d1..324c08356 100644
--- a/libc/proc/proc.c
+++ b/libc/proc/proc.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/proc/proc.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
@@ -24,23 +23,19 @@
 #include "libc/calls/struct/rusage.h"
 #include "libc/calls/struct/siginfo.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/cosmo.h"
 #include "libc/errno.h"
 #include "libc/fmt/wintime.internal.h"
 #include "libc/intrin/dll.h"
-#include "libc/intrin/maps.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/mem/leaks.h"
 #include "libc/nt/accounting.h"
-#include "libc/nt/enum/heap.h"
 #include "libc/nt/enum/processaccess.h"
 #include "libc/nt/enum/processcreationflags.h"
 #include "libc/nt/enum/status.h"
 #include "libc/nt/enum/wait.h"
 #include "libc/nt/events.h"
-#include "libc/nt/memory.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/struct/filetime.h"
@@ -48,6 +43,7 @@
 #include "libc/nt/struct/processmemorycounters.h"
 #include "libc/nt/synchronization.h"
 #include "libc/nt/thread.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/map.h"
@@ -56,7 +52,6 @@
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 #include "third_party/nsync/mu.h"
 #ifdef __x86_64__
@@ -65,12 +60,9 @@
  * @fileoverview Windows Subprocess Management.
  */
 
-#define STACK_SIZE 65536
-
 struct Procs __proc;
-static pthread_mutex_t __proc_lock_obj = PTHREAD_MUTEX_INITIALIZER;
 
-textwindows static void __proc_stats(int64_t h, struct rusage *ru) {
+static textwindows void __proc_stats(int64_t h, struct rusage *ru) {
   bzero(ru, sizeof(*ru));
   struct NtProcessMemoryCountersEx memcount = {sizeof(memcount)};
   GetProcessMemoryInfo(h, &memcount, sizeof(memcount));
@@ -106,14 +98,14 @@ textwindows int __proc_harvest(struct Proc *pr, bool iswait4) {
     pr->handle = status & 0x00FFFFFF;
   } else {
     // handle child _Exit()
-    if (status == 0xc9af3d51u)
+    if (status == 0xc9af3d51u) {
       status = kNtStillActive;
+    }
     pr->wstatus = status;
     if (!iswait4 && !pr->waiters && !__proc.waiters &&
         (__sighandrvas[SIGCHLD] == (uintptr_t)SIG_IGN ||
          (__sighandflags[SIGCHLD] & SA_NOCLDWAIT))) {
       // perform automatic zombie reaping
-      STRACE("automatically reaping zombie");
       dll_remove(&__proc.list, &pr->elem);
       dll_make_first(&__proc.free, &pr->elem);
       CloseHandle(pr->handle);
@@ -136,14 +128,9 @@ textwindows int __proc_harvest(struct Proc *pr, bool iswait4) {
   return sic;
 }
 
-textwindows dontinstrument static uint32_t __proc_worker(void *arg) {
+static textwindows dontinstrument uint32_t __proc_worker(void *arg) {
   struct CosmoTib tls;
-  char *sp = __builtin_frame_address(0);
   __bootstrap_tls(&tls, __builtin_frame_address(0));
-  __maps_track(
-      (char *)(((uintptr_t)sp + __pagesize - 1) & -__pagesize) - STACK_SIZE,
-      STACK_SIZE, PROT_READ | PROT_WRITE,
-      MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK);
   for (;;) {
 
     // assemble a group of processes to wait on. if more than 64
@@ -175,7 +162,7 @@ textwindows dontinstrument static uint32_t __proc_worker(void *arg) {
 
     // wait for something to happen
     if (n == 64) {
-      millis = POLL_INTERVAL_MS;
+      millis = 5;
     } else {
       millis = -1u;
       handles[n++] = __proc.onbirth;
@@ -197,8 +184,9 @@ textwindows dontinstrument static uint32_t __proc_worker(void *arg) {
         continue;
       if (j == i)
         continue;
-      if (!--objects[j]->waiters && objects[j]->status == PROC_UNDEAD)
+      if (!--objects[j]->waiters && objects[j]->status == PROC_UNDEAD) {
         __proc_free(objects[j]);
+      }
     }
 
     // check if we need to churn due to >64 processes
@@ -223,8 +211,9 @@ textwindows dontinstrument static uint32_t __proc_worker(void *arg) {
       case PROC_ZOMBIE:
         break;
       case PROC_UNDEAD:
-        if (!objects[i]->waiters)
+        if (!objects[i]->waiters) {
           __proc_free(objects[i]);
+        }
         break;
       default:
         __builtin_unreachable();
@@ -236,8 +225,9 @@ textwindows dontinstrument static uint32_t __proc_worker(void *arg) {
     // 1. wait4() is being used
     // 2. SIGCHLD has SIG_IGN handler
     // 3. SIGCHLD has SA_NOCLDWAIT flag
-    if (sic)
+    if (sic) {
       __sig_generate(SIGCHLD, sic);
+    }
   }
   return 0;
 }
@@ -245,10 +235,11 @@ textwindows dontinstrument static uint32_t __proc_worker(void *arg) {
 /**
  * Lazy initializes process tracker data structures and worker.
  */
-textwindows static void __proc_setup(void) {
+static textwindows void __proc_setup(void) {
+  __enable_threads();
   __proc.onbirth = CreateEvent(0, 0, 0, 0);     // auto reset
   __proc.haszombies = CreateEvent(0, 1, 0, 0);  // manual reset
-  __proc.thread = CreateThread(0, STACK_SIZE, __proc_worker, 0,
+  __proc.thread = CreateThread(0, 65536, __proc_worker, 0,
                                kNtStackSizeParamIsAReservation, 0);
 }
 
@@ -257,22 +248,20 @@ textwindows static void __proc_setup(void) {
  */
 textwindows void __proc_lock(void) {
   cosmo_once(&__proc.once, __proc_setup);
-  _pthread_mutex_lock(&__proc_lock_obj);
+  nsync_mu_lock(&__proc.lock);
 }
 
 /**
  * Unlocks process tracker.
  */
 textwindows void __proc_unlock(void) {
-  _pthread_mutex_unlock(&__proc_lock_obj);
+  nsync_mu_unlock(&__proc.lock);
 }
 
 /**
  * Resets process tracker from forked child.
  */
-textwindows void __proc_wipe_and_reset(void) {
-  // TODO(jart): Should we preserve this state in forked children?
-  _pthread_mutex_wipe_np(&__proc_lock_obj);
+textwindows void __proc_wipe(void) {
   bzero(&__proc, sizeof(__proc));
 }
 
@@ -291,9 +280,16 @@ textwindows struct Proc *__proc_new(void) {
     proc = PROC_CONTAINER(e);
     dll_remove(&__proc.free, &proc->elem);
   }
-  if (!proc && !(proc = HeapAlloc(GetProcessHeap(), 0, sizeof(struct Proc))))
-    return 0;
-  bzero(proc, sizeof(*proc));
+  if (proc) {
+    bzero(proc, sizeof(*proc));
+  } else {
+    proc = mmap(0, sizeof(struct Proc), PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (proc == MAP_FAILED) {
+      enomem();
+      return 0;
+    }
+  }
   dll_init(&proc->elem);
   return proc;
 }
@@ -321,7 +317,6 @@ textwindows int64_t __proc_search(int pid) {
   int64_t handle = 0;
   BLOCK_SIGNALS;
   __proc_lock();
-  // TODO(jart): we should increment a reference count when returning
   for (e = dll_first(__proc.list); e; e = dll_next(__proc.list, e)) {
     if (pid == PROC_CONTAINER(e)->pid) {
       handle = PROC_CONTAINER(e)->handle;
diff --git a/libc/proc/proc.h b/libc/proc/proc.internal.h
similarity index 64%
rename from libc/proc/proc.h
rename to libc/proc/proc.internal.h
index 44b4ed5ad..fd59bc5f1 100644
--- a/libc/proc/proc.h
+++ b/libc/proc/proc.internal.h
@@ -5,6 +5,7 @@
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/dll.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/mu.h"
 COSMOPOLITAN_C_START_
 
 #define PROC_ALIVE  0
@@ -27,6 +28,7 @@ struct Proc {
 struct Procs {
   int waiters;
   atomic_uint once;
+  nsync_mu lock;
   intptr_t thread;
   intptr_t onbirth;
   intptr_t haszombies;
@@ -40,16 +42,16 @@ struct Procs {
 
 extern struct Procs __proc;
 
-void __proc_lock(void);
-void __proc_unlock(void);
-int64_t __proc_handle(int);
-int64_t __proc_search(int);
-struct Proc *__proc_new(void);
-void __proc_add(struct Proc *);
-void __proc_free(struct Proc *);
-void __proc_wipe_and_reset(void);
-int __proc_harvest(struct Proc *, bool);
-int sys_wait4_nt(int, int *, int, struct rusage *);
+void __proc_wipe(void) libcesque;
+void __proc_lock(void) libcesque;
+void __proc_unlock(void) libcesque;
+int64_t __proc_handle(int) libcesque;
+int64_t __proc_search(int) libcesque;
+struct Proc *__proc_new(void) libcesque;
+void __proc_add(struct Proc *) libcesque;
+void __proc_free(struct Proc *) libcesque;
+int __proc_harvest(struct Proc *, bool) libcesque;
+int sys_wait4_nt(int, int *, int, struct rusage *) libcesque;
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_PROC_H_ */
diff --git a/libc/proc/sched_getaffinity.c b/libc/proc/sched_getaffinity.c
index 752bc9c79..5bddf33bf 100644
--- a/libc/proc/sched_getaffinity.c
+++ b/libc/proc/sched_getaffinity.c
@@ -24,7 +24,7 @@
 #include "libc/nt/errors.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
 
diff --git a/libc/proc/sched_setaffinity.c b/libc/proc/sched_setaffinity.c
index 79ab9fcfe..5e494ee98 100644
--- a/libc/proc/sched_setaffinity.c
+++ b/libc/proc/sched_setaffinity.c
@@ -24,7 +24,7 @@
 #include "libc/nt/errors.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/sysv/errfuns.h"
 
 static dontinline textwindows int sys_sched_setaffinity_nt(
diff --git a/libc/proc/setpriority-nt.c b/libc/proc/setpriority-nt.c
index 3aeb56dfc..837bd8743 100644
--- a/libc/proc/setpriority-nt.c
+++ b/libc/proc/setpriority-nt.c
@@ -23,7 +23,7 @@
 #include "libc/nt/errors.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/sysv/consts/prio.h"
 #include "libc/sysv/errfuns.h"
 
diff --git a/libc/system/system.c b/libc/proc/system.c
similarity index 96%
rename from libc/system/system.c
rename to libc/proc/system.c
index 6755d1f06..fddb4a0dd 100644
--- a/libc/system/system.c
+++ b/libc/proc/system.c
@@ -38,7 +38,9 @@
  * provides Bourne-like syntax on all platforms, including Windows. Many
  * builtin commands are included, e.g. exit, cd, rm, [, cat, wait, exec,
  * env, echo, read, true, test, kill, touch, rmdir, mkdir, false, mktemp
- * sed, tr, and usleep.
+ * and usleep. It's also possible to __static_yoink() the symbols `_tr`,
+ * `_sed`, `_awk`, and `_curl` for the tr, sed, awk and curl commands if
+ * you're using the Cosmopolitan mono-repo.
  *
  * If you just have a program name and arguments, and you don't need the
  * full power of a UNIX-like shell, then consider using the Cosmopolitan
diff --git a/libc/system/systemvpe.c b/libc/proc/systemvpe.c
similarity index 97%
rename from libc/system/systemvpe.c
rename to libc/proc/systemvpe.c
index 1165d45c3..e44ed8d66 100644
--- a/libc/system/systemvpe.c
+++ b/libc/proc/systemvpe.c
@@ -52,8 +52,9 @@ int systemvpe(const char *prog, char *const argv[], char *const envp[]) {
   int pid, wstatus;
   char pathbuf[PATH_MAX + 1];
   sigset_t chldmask, savemask;
-  if (!(exe = commandv(prog, pathbuf, sizeof(pathbuf))))
+  if (!(exe = commandv(prog, pathbuf, sizeof(pathbuf)))) {
     return -1;
+  }
   sigemptyset(&chldmask);
   sigaddset(&chldmask, SIGINT);
   sigaddset(&chldmask, SIGQUIT);
@@ -61,7 +62,7 @@ int systemvpe(const char *prog, char *const argv[], char *const envp[]) {
   sigprocmask(SIG_BLOCK, &chldmask, &savemask);
   if (!(pid = vfork())) {
     sigprocmask(SIG_SETMASK, &savemask, 0);
-    execve(exe, argv, envp);
+    execve(prog, argv, envp);
     _Exit(127);
   } else if (pid == -1) {
     wstatus = -1;
diff --git a/libc/proc/times.c b/libc/proc/times.c
index 28973c964..1538e1a9b 100644
--- a/libc/proc/times.c
+++ b/libc/proc/times.c
@@ -16,38 +16,55 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
 #include "libc/calls/struct/rusage.h"
-#include "libc/calls/struct/timespec.h"
 #include "libc/calls/struct/timeval.h"
 #include "libc/calls/struct/tms.h"
+#include "libc/calls/syscall_support-nt.internal.h"
+#include "libc/dce.h"
+#include "libc/fmt/wintime.internal.h"
+#include "libc/nt/accounting.h"
+#include "libc/nt/runtime.h"
 #include "libc/runtime/clktck.h"
-#include "libc/sysv/consts/clock.h"
+#include "libc/runtime/sysconf.h"
 #include "libc/sysv/consts/rusage.h"
+#include "libc/time.h"
 
-static long MicrosToTicks(struct timeval tv) {
+static dontinline long ConvertMicros(struct timeval tv) {
   return tv.tv_sec * CLK_TCK + tv.tv_usec / (1000000 / CLK_TCK);
 }
 
-static long NanosToTicks(struct timespec ts) {
-  return ts.tv_sec * CLK_TCK + ts.tv_nsec / (1000000000 / CLK_TCK);
+static dontinline long times2(struct tms *out_times, struct rusage *ru) {
+  struct timeval tv;
+  struct NtFileTime CreationTime, ExitTime, KernelTime, UserTime;
+  if (!IsWindows()) {
+    if (getrusage(RUSAGE_SELF, ru) == -1)
+      return -1;
+    out_times->tms_utime = ConvertMicros(ru->ru_utime);
+    out_times->tms_stime = ConvertMicros(ru->ru_stime);
+    if (getrusage(RUSAGE_CHILDREN, ru) == -1)
+      return -1;
+    out_times->tms_cutime = ConvertMicros(ru->ru_utime);
+    out_times->tms_cstime = ConvertMicros(ru->ru_stime);
+  } else {
+    if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime,
+                         &KernelTime, &UserTime)) {
+      return __winerr();
+    }
+    out_times->tms_utime = ReadFileTime(UserTime);
+    out_times->tms_stime = ReadFileTime(KernelTime);
+    out_times->tms_cutime = 0;
+    out_times->tms_cstime = 0;
+  }
+  if (gettimeofday(&tv, NULL) == -1)
+    return -1;
+  return ConvertMicros(tv);
 }
 
 /**
  * Returns accounting data for process on time-sharing system.
- * @return number of `CLK_TCK` from `CLOCK_BOOTTIME` epoch
  */
 long times(struct tms *out_times) {
-  struct timespec bt;
-  struct rusage rus, ruc;
-  if (getrusage(RUSAGE_SELF, &rus))
-    return -1;
-  if (getrusage(RUSAGE_CHILDREN, &ruc))
-    return -1;
-  if (clock_gettime(CLOCK_BOOTTIME, &bt))
-    return -1;
-  out_times->tms_utime = MicrosToTicks(rus.ru_utime);
-  out_times->tms_stime = MicrosToTicks(rus.ru_stime);
-  out_times->tms_cutime = MicrosToTicks(ruc.ru_utime);
-  out_times->tms_cstime = MicrosToTicks(ruc.ru_stime);
-  return NanosToTicks(bt);
+  struct rusage ru;
+  return times2(out_times, &ru);
 }
diff --git a/libc/proc/vfork.S b/libc/proc/vfork.S
index 3f87d74e1..39d7ae6e2 100644
--- a/libc/proc/vfork.S
+++ b/libc/proc/vfork.S
@@ -19,7 +19,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
 #include "libc/thread/tls.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Forks process without copying page tables.
 //
@@ -121,7 +121,7 @@ vfork:
 //	} else {
 //	  __get_tls()->tib_flags &= ~TIB_FLAG_VFORKED;
 //	}
-	sub	x1,x28,#1024		// sizeof(CosmoTib)
+	sub	x1,x28,#512		// sizeof(CosmoTib)
 	ldr	x2,[x1,64]
 	cbnz	x0,2f
 	orr	x2,x2,#TIB_FLAG_VFORKED
diff --git a/libc/proc/wait4-nt.c b/libc/proc/wait4-nt.c
index fe8e0d85d..5f4a4f9d4 100644
--- a/libc/proc/wait4-nt.c
+++ b/libc/proc/wait4-nt.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigset.h"
@@ -28,22 +27,25 @@
 #include "libc/nt/events.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/synchronization.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/consts/w.h"
 #include "libc/sysv/errfuns.h"
 #ifdef __x86_64__
 
-textwindows static int __proc_reap(struct Proc *pr, int *wstatus,
+static textwindows int __proc_reap(struct Proc *pr, int *wstatus,
                                    struct rusage *opt_out_rusage) {
-  if (wstatus)
+  if (wstatus) {
     *wstatus = pr->wstatus;
-  if (opt_out_rusage)
+  }
+  if (opt_out_rusage) {
     *opt_out_rusage = pr->ru;
+  }
   dll_remove(&__proc.zombies, &pr->elem);
-  if (dll_is_empty(__proc.zombies))
+  if (dll_is_empty(__proc.zombies)) {
     ResetEvent(__proc.haszombies);
+  }
   if (pr->waiters) {
     pr->status = PROC_UNDEAD;
     dll_make_first(&__proc.undead, &pr->elem);
@@ -54,51 +56,53 @@ textwindows static int __proc_reap(struct Proc *pr, int *wstatus,
   return pr->pid;
 }
 
-textwindows static int __proc_check(int pid, int *wstatus,
+static textwindows int __proc_check(int pid, int *wstatus,
                                     struct rusage *opt_out_rusage) {
   struct Dll *e;
   for (e = dll_first(__proc.zombies); e; e = dll_next(__proc.zombies, e)) {
     struct Proc *pr = PROC_CONTAINER(e);
-    if (pid == -1 || pid == pr->pid)
+    if (pid == -1 || pid == pr->pid) {
       return __proc_reap(pr, wstatus, opt_out_rusage);
+    }
   }
   return 0;
 }
 
-textwindows static int __proc_wait(int pid, int *wstatus, int options,
+static textwindows int __proc_wait(int pid, int *wstatus, int options,
                                    struct rusage *rusage, sigset_t waitmask) {
   for (;;) {
 
     // check for signals and cancelation
     int sig, handler_was_called;
-    if (_check_cancel() == -1)
+    if (_check_cancel() == -1) {
       return -1;
+    }
     if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
       handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
-      if (_check_cancel() == -1)
+      if (_check_cancel() == -1) {
         return -1;  // ECANCELED because SIGTHR was just handled
-      if (handler_was_called & SIG_HANDLED_NO_RESTART)
+      }
+      if (handler_was_called & SIG_HANDLED_NO_RESTART) {
         return eintr();  // a non-SA_RESTART handler was called
+      }
     }
 
     // check for zombie to harvest
     __proc_lock();
   CheckForZombies:
     int rc = __proc_check(pid, wstatus, rusage);
-
-    // if there's no zombies left
-    // check if there's any living processes
-    if (!rc && dll_is_empty(__proc.list)) {
-      __proc_unlock();
-      return echild();
-    }
-
-    // otherwise return zombie or zero
     if (rc || (options & WNOHANG)) {
       __proc_unlock();
       return rc;
     }
 
+    // there's no zombies left
+    // check if there's any living processes
+    if (dll_is_empty(__proc.list)) {
+      __proc_unlock();
+      return echild();
+    }
+
     // get appropriate wait object
     // register ourself as waiting
     struct Proc *pr = 0;
@@ -130,19 +134,15 @@ textwindows static int __proc_wait(int pid, int *wstatus, int options,
 
     // perform blocking operation
     uint32_t wi;
-    uintptr_t event;
-    if ((event = CreateEvent(0, 0, 0, 0))) {
-      struct PosixThread *pt = _pthread_self();
-      pt->pt_event = event;
-      pt->pt_blkmask = waitmask;
-      atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_EVENT,
-                            memory_order_release);
-      wi = WaitForMultipleObjects(2, (intptr_t[2]){hWaitObject, event}, 0, -1u);
-      atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-      CloseHandle(event);
-    } else {
-      wi = -1u;
-    }
+    uintptr_t sem;
+    struct PosixThread *pt = _pthread_self();
+    pt->pt_blkmask = waitmask;
+    pt->pt_semaphore = sem = CreateSemaphore(0, 0, 1, 0);
+    atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_SEM,
+                          memory_order_release);
+    wi = WaitForMultipleObjects(2, (intptr_t[2]){hWaitObject, sem}, 0, -1u);
+    atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
+    CloseHandle(sem);
 
     // log warning if handle unexpectedly closed
     if (wi & kNtWaitAbandoned) {
@@ -160,14 +160,15 @@ textwindows static int __proc_wait(int pid, int *wstatus, int options,
     // check if killed or win32 error
     if (wi) {
       if (pr) {
-        if (!--pr->waiters && pr->status == PROC_UNDEAD)
+        if (!--pr->waiters && pr->status == PROC_UNDEAD) {
           __proc_free(pr);
+        }
       } else {
         --__proc.waiters;
       }
       __proc_unlock();
       if (wi == 1) {
-        // __sig_wake() woke our semaphore
+        // __sig_cancel() woke our semaphore
         continue;
       } else {
         // neither posix or win32 define i/o error conditions for
@@ -178,15 +179,17 @@ textwindows static int __proc_wait(int pid, int *wstatus, int options,
 
     // handle process exit notification
     --pr->waiters;
-    if (pr->status == PROC_ALIVE)
+    if (pr->status == PROC_ALIVE) {
       __proc_harvest(pr, true);
+    }
     switch (pr->status) {
       case PROC_ALIVE:
         // exit caused by execve() reparenting
-        if (!pr->waiters)
+        __proc_unlock();
+        if (!pr->waiters) {
           // avoid deadlock that could theoretically happen
           SetEvent(__proc.onbirth);
-        __proc_unlock();
+        }
         break;
       case PROC_ZOMBIE:
         // exit happened and we're the first to know
@@ -195,8 +198,9 @@ textwindows static int __proc_wait(int pid, int *wstatus, int options,
         return rc;
       case PROC_UNDEAD:
         // exit happened but another thread waited first
-        if (!pr->waiters)
+        if (!pr->waiters) {
           __proc_free(pr);
+        }
         __proc_unlock();
         return echild();
       default:
diff --git a/libc/proc/wait4.c b/libc/proc/wait4.c
index 0db1f4a81..056e9b371 100644
--- a/libc/proc/wait4.c
+++ b/libc/proc/wait4.c
@@ -21,7 +21,7 @@
 #include "libc/calls/struct/rusage.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/sysv/errfuns.h"
 
 /**
diff --git a/libc/runtime/at_quick_exit.c b/libc/runtime/at_quick_exit.c
index c9aa389b9..4786d801b 100644
--- a/libc/runtime/at_quick_exit.c
+++ b/libc/runtime/at_quick_exit.c
@@ -16,32 +16,35 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/cxaatexit.h"
-#include "libc/macros.h"
-#include "libc/stdlib.h"
+#include "libc/atomic.h"
+#include "libc/macros.internal.h"
+#include "libc/runtime/runtime.h"
+#include "libc/thread/thread.h"
 
 static void (*funcs[32])(void);
 static int count;
+static pthread_spinlock_t lock;
+pthread_spinlock_t *const __at_quick_exit_lockptr = &lock;
 
 void __funcs_on_quick_exit(void) {
   void (*func)(void);
-  __cxa_lock();
+  pthread_spin_lock(&lock);
   while (count) {
     func = funcs[--count];
-    __cxa_unlock();
+    pthread_spin_unlock(&lock);
     func();
-    __cxa_lock();
+    pthread_spin_lock(&lock);
   }
 }
 
 int at_quick_exit(void func(void)) {
   int res = 0;
-  __cxa_lock();
+  pthread_spin_lock(&lock);
   if (count == ARRAYLEN(funcs)) {
     res = -1;
   } else {
     funcs[count++] = func;
   }
-  __cxa_unlock();
+  pthread_spin_unlock(&lock);
   return res;
 }
diff --git a/libc/runtime/clone-linux.S b/libc/runtime/clone-linux.S
index 909d525fe..2863daac8 100644
--- a/libc/runtime/clone-linux.S
+++ b/libc/runtime/clone-linux.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .privileged
 
 //	Invokes clone() system call on GNU/Systemd.
@@ -26,46 +26,54 @@
 //	@param	rdx	x2	is ptid
 //	@param	rcx	x3	is ctid
 //	@param	r8	x4	is tls
-//	@param	r9	x5	is func(void*)→int
+//	@param	r9	x5	is func(void*,int)→int
 //	@param	8(rsp)	x6	is arg
 //	@return	tid of child on success, or -errno on error
 sys_clone_linux:
-	beg
-	pro
 #ifdef __x86_64__
-	cpush	%rbx
+	push	%rbp
+	mov	%rsp,%rbp
+	push	%rbx
 	mov	%rcx,%r10
 	mov	16(%rbp),%rbx
 	mov	$56,%eax		// __NR_clone
 	syscall
 	test	%rax,%rax
 	jz	2f
-0:	cpop	%rbx
-	epi
+0:	pop	%rbx
+	pop	%rbp
 	ret
 2:	xor	%ebp,%ebp		// child thread
 	mov	%rbx,%rdi		// arg
+	mov	%r10,%r15		// experiment
+	mov	(%r10),%esi		// tid
 	call	*%r9			// func(arg,tid)
 	xchg	%eax,%edi		// func(arg,tid) → exitcode
+	mov	(%r15),%eax		// experiment
+	test	%eax,%eax		// experiment
+	jz	1f			// experiment
 	mov	$60,%eax		// __NR_exit(exitcode)
 	syscall
+1:	hlt				// ctid was corrupted by program!
 #elif defined(__aarch64__)
+	stp	x29,x30,[sp,#-16]!
+	mov	x29,sp
 	mov	x8,x3			// swap x3 and x4
 	mov	x3,x4			// swap x3 and x4
 	mov	x4,x8			// swap x3 and x4
 	mov	x8,#220			// __NR_clone
 	svc	#0
 	cbz	x0,2f
-	epi
+	ldp	x29,x30,[sp],#16
 	ret
 2:	mov	x29,#0			// wipe backtrace
 	mov	x28,x3			// set cosmo tls
 	mov	x0,x6			// child thread
+	ldr	w1,[x4]			// arg2 = *ctid
 	blr	x5
 	mov	x8,#93			// __NR_exit
 	svc	#0
 #else
 #error "unsupported architecture"
 #endif
-	end
 	.endfn	sys_clone_linux,globl,hidden
diff --git a/libc/runtime/clone-openbsd.S b/libc/runtime/clone-openbsd.S
index daa3cde0c..a1a9feb78 100644
--- a/libc/runtime/clone-openbsd.S
+++ b/libc/runtime/clone-openbsd.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #define SYS___tfork 8
 
diff --git a/libc/runtime/clone-xnu.S b/libc/runtime/clone-xnu.S
index 66b1c4283..fea63d203 100644
--- a/libc/runtime/clone-xnu.S
+++ b/libc/runtime/clone-xnu.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 sys_clone_xnu:
 	mov	$0x02000168,%eax		# bsdthread_create
diff --git a/libc/runtime/clone.c b/libc/runtime/clone.c
index c2325896e..46347d47b 100644
--- a/libc/runtime/clone.c
+++ b/libc/runtime/clone.c
@@ -16,27 +16,49 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/sysv/consts/clone.h"
+#include "libc/assert.h"
 #include "libc/atomic.h"
+#include "libc/calls/calls.h"
 #include "libc/calls/state.internal.h"
+#include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/ucontext-netbsd.internal.h"
+#include "libc/calls/syscall-sysv.internal.h"
+#include "libc/calls/wincrash.internal.h"
 #include "libc/dce.h"
-#include "libc/intrin/asmflag.h"
+#include "libc/errno.h"
 #include "libc/intrin/atomic.h"
+#include "libc/intrin/describeflags.h"
 #include "libc/intrin/ulock.h"
+#include "libc/intrin/weaken.h"
 #include "libc/limits.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alloca.h"
 #include "libc/nt/enum/processcreationflags.h"
 #include "libc/nt/runtime.h"
+#include "libc/nt/signals.h"
 #include "libc/nt/synchronization.h"
 #include "libc/nt/thread.h"
 #include "libc/nt/thunk/msabi.h"
+#include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
+#include "libc/runtime/stack.h"
 #include "libc/runtime/syslib.internal.h"
 #include "libc/sock/internal.h"
+#include "libc/stdalign.internal.h"
+#include "libc/stdio/sysparam.h"
+#include "libc/str/str.h"
 #include "libc/sysv/consts/arch.h"
+#include "libc/sysv/consts/clone.h"
+#include "libc/sysv/consts/futex.h"
+#include "libc/sysv/consts/nr.h"
+#include "libc/sysv/consts/nrlinux.h"
+#include "libc/sysv/errfuns.h"
 #include "libc/thread/freebsd.internal.h"
 #include "libc/thread/openbsd.internal.h"
-#include "libc/thread/posixthread.internal.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 #include "libc/thread/xnu.internal.h"
 
 #define kMaxThreadIds 32768
@@ -56,73 +78,85 @@
 #define LWP_SUSPENDED                     0x00000080
 
 struct CloneArgs {
-  union {
-    long sp;
+  alignas(16) union {
+    struct {
+      atomic_int tid;
+      int this;
+    };
     int64_t tid64;
   };
   atomic_int *ptid;
   atomic_int *ctid;
+  atomic_int *ztid;
   char *tls;
-  int (*func)(void *);
+  int (*func)(void *, int);
   void *arg;
+  long sp;
 };
 
 int sys_set_tls(uintptr_t, void *);
-int __stack_call(void *, int, long, long, int (*)(void *), long);
+int __stack_call(void *, int, long, long, int (*)(void *, int), long);
+
+static long AlignStack(long sp, char *stk, long stksz, int mal) {
+  return sp & -mal;
+}
 
 #ifdef __x86_64__
 
 ////////////////////////////////////////////////////////////////////////////////
 // THE NEW TECHNOLOGY
 
+__msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue;
 __msabi extern typeof(ExitThread) *const __imp_ExitThread;
-__msabi extern typeof(GetCurrentThreadId) *const __imp_GetCurrentThreadId;
 __msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll;
 
-textwindows dontinstrument wontreturn static void  //
+static textwindows dontinstrument wontreturn void  //
 WinThreadEntry(int rdi,                            // rcx
                int rsi,                            // rdx
                int rdx,                            // r8
                struct CloneArgs *wt) {             // r9
-  __set_tls_win32(wt->tls);
-  int tid = __imp_GetCurrentThreadId();
-  atomic_int *ctid = wt->ctid;
-  atomic_init(ctid, tid);
-  atomic_init(wt->ptid, tid);
-  int rc = __stack_call(wt->arg, tid, 0, 0, wt->func, wt->sp);
+  int rc;
+  if (wt->tls)
+    __set_tls_win32(wt->tls);
+  *wt->ctid = GetCurrentThreadId();
+  rc = __stack_call(wt->arg, wt->tid, 0, 0, wt->func, wt->sp);
   // we can now clear ctid directly since we're no longer using our own
   // stack memory, which can now be safely free'd by the parent thread.
-  atomic_store_explicit(ctid, 0, memory_order_release);
-  __imp_WakeByAddressAll(ctid);
+  *wt->ztid = 0;
+  __imp_WakeByAddressAll(wt->ztid);
   // since we didn't indirect this function through NT2SYSV() it's not
   // safe to simply return, and as such, we need ExitThread().
   __imp_ExitThread(rc);
   __builtin_unreachable();
 }
 
-textwindows static errno_t CloneWindows(int (*func)(void *), char *stk,
-                                        size_t stksz, void *arg, void *tls,
-                                        atomic_int *ptid, atomic_int *ctid) {
+static textwindows errno_t CloneWindows(int (*func)(void *, int), char *stk,
+                                        size_t stksz, int flags, void *arg,
+                                        void *tls, atomic_int *ptid,
+                                        atomic_int *ctid) {
   long sp;
   int64_t h;
-  intptr_t tip;
   uint32_t utid;
   struct CloneArgs *wt;
-  sp = tip = (intptr_t)stk + stksz;
+  sp = (intptr_t)stk + stksz;
+  sp = AlignStack(sp, stk, stksz, 16);
   sp -= sizeof(struct CloneArgs);
   sp &= -alignof(struct CloneArgs);
   wt = (struct CloneArgs *)sp;
-  wt->ctid = ctid;
-  wt->ptid = ptid;
+  wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
+  wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
   wt->func = func;
   wt->arg = arg;
-  wt->tls = tls;
-  wt->sp = tip & -16;
-  if ((h = CreateThread(0, 65536, (void *)WinThreadEntry, wt,
+  wt->tls = flags & CLONE_SETTLS ? tls : 0;
+  wt->sp = sp;
+  if ((h = CreateThread(&kNtIsInheritable, 65536, (void *)WinThreadEntry, wt,
                         kNtStackSizeParamIsAReservation, &utid))) {
-    atomic_init(ptid, utid);
-    struct CosmoTib *tib = tls;
-    atomic_store_explicit(&tib->tib_syshand, h, memory_order_release);
+    if (flags & CLONE_PARENT_SETTID)
+      *ptid = utid;
+    if (flags & CLONE_SETTLS) {
+      struct CosmoTib *tib = tls;
+      atomic_store_explicit(&tib->tib_syshand, h, memory_order_release);
+    }
     return 0;
   } else {
     return __dos2errno(GetLastError());
@@ -146,33 +180,36 @@ asm("XnuThreadThunk:\n\t"
     ".size\tXnuThreadThunk,.-XnuThreadThunk");
 __attribute__((__used__))
 
-dontinstrument wontreturn static void
-XnuThreadMain(void *pthread,           // rdi
-              int tid,                 // rsi
-              int (*func)(void *arg),  // rdx
-              void *arg,               // rcx
-              struct CloneArgs *wt,    // r8
-              unsigned xnuflags) {     // r9
-  atomic_init(wt->ctid, tid);
-  atomic_init(wt->ptid, tid);
-
-  // XNU uses the same 0x30 offset as the WIN32 TIB x64. They told the
-  // Go team at Google that they Apply stands by our ability to use it
-  // https://github.com/golang/go/issues/23617#issuecomment-376662373
+static wontreturn void
+XnuThreadMain(void *pthread,                    // rdi
+              int tid,                          // rsi
+              int (*func)(void *arg, int tid),  // rdx
+              void *arg,                        // rcx
+              struct CloneArgs *wt,             // r8
+              unsigned xnuflags) {              // r9
   int ax;
-  asm volatile("syscall"
-               : "=a"(ax)
-               : "0"(__NR_thread_fast_set_cthread_self), "D"(wt->tls - 0x30)
-               : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc");
+  wt->tid = tid;
+  *wt->ctid = tid;
+  *wt->ptid = tid;
 
-  func(arg);
+  if (wt->tls) {
+    // XNU uses the same 0x30 offset as the WIN32 TIB x64. They told the
+    // Go team at Google that they Apply stands by our ability to use it
+    // https://github.com/golang/go/issues/23617#issuecomment-376662373
+    asm volatile("syscall"
+                 : "=a"(ax)
+                 : "0"(__NR_thread_fast_set_cthread_self), "D"(wt->tls - 0x30)
+                 : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc");
+  }
+
+  func(arg, tid);
 
   // we no longer use the stack after this point
   // %rax = int bsdthread_terminate(%rdi = void *stackaddr,
   //                                %rsi = size_t freesize,
   //                                %rdx = uint32_t port,
   //                                %r10 = uint32_t sem);
-  asm volatile("movl\t$0,(%%rsi)\n\t"        // *wt->ctid = 0
+  asm volatile("movl\t$0,(%%rsi)\n\t"        // *wt->ztid = 0
                "mov\t$0x101,%%edi\n\t"       // wake all
                "xor\t%%edx,%%edx\n\t"        // wake_value
                "mov\t$0x02000204,%%eax\n\t"  // ulock_wake()
@@ -184,18 +221,19 @@ XnuThreadMain(void *pthread,           // rdi
                "mov\t$0x02000169,%%eax\n\t"  // bsdthread_terminate()
                "syscall"
                : /* no outputs */
-               : "S"(wt->ctid)
+               : "S"(wt->ztid)
                : "rax", "rcx", "r10", "r11", "memory");
   __builtin_unreachable();
 }
 
-static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, void *arg,
-                        void *tls, atomic_int *ptid, atomic_int *ctid) {
+static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
+                        void *arg, void *tls, atomic_int *ptid,
+                        atomic_int *ctid) {
 
   // perform this weird mandatory system call once
   static bool once;
   if (!once) {
-    sys_bsdthread_register(XnuThreadThunk, 0, 0, 0, 0, 0, 0);
+    npassert(sys_bsdthread_register(XnuThreadThunk, 0, 0, 0, 0, 0, 0) != -1);
     once = true;
   }
 
@@ -203,15 +241,16 @@ static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, void *arg,
   long sp;
   struct CloneArgs *wt;
   sp = (intptr_t)stk + stksz;
+  sp = AlignStack(sp, stk, stksz, 16);
   sp -= sizeof(struct CloneArgs);
   sp &= -alignof(struct CloneArgs);
   wt = (struct CloneArgs *)sp;
-  sp &= -16;
 
   // pass parameters to new thread via xnu
-  wt->ctid = ctid;
-  wt->ptid = ptid;
-  wt->tls = tls;
+  wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
+  wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
+  wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
+  wt->tls = flags & CLONE_SETTLS ? tls : 0;
   return sys_clone_xnu(fn, arg, wt, 0, PTHREAD_START_CUSTOM_XNU);
 }
 
@@ -220,27 +259,26 @@ static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, void *arg,
 
 // we can't use address sanitizer because:
 //   1. __asan_handle_no_return wipes stack [todo?]
-relegated dontinstrument wontreturn static void OpenbsdThreadMain(void *p) {
+static wontreturn void OpenbsdThreadMain(void *p) {
   struct CloneArgs *wt = p;
-  int tid = atomic_load_explicit(wt->ctid, memory_order_relaxed);
-  atomic_init(wt->ptid, tid);
-  wt->func(wt->arg);
-  asm volatile("mov\t%1,%%rsp\n\t"     // so syscall can validate stack exists
-               "movl\t$0,(%2)\n\t"     // *wt->ctid = 0 (old stack now free'd)
+  *wt->ctid = wt->tid;
+  wt->func(wt->arg, wt->tid);
+  asm volatile("mov\t%2,%%rsp\n\t"     // so syscall can validate stack exists
+               "movl\t$0,(%%rdi)\n\t"  // *wt->ztid = 0 (old stack now free'd)
                "syscall\n\t"           // futex(int*, op, val) will wake wait0
                "xor\t%%edi,%%edi\n\t"  // so kernel doesn't write to old stack
                "mov\t$302,%%eax\n\t"   // __threxit(int *notdead) doesn't wake
                "syscall"
-               : /* no outputs */
-               : "a"(83), "m"(__oldstack), "D"(wt->ctid),
+               : "=m"(*wt->ztid)
+               : "a"(83), "m"(__oldstack), "D"(wt->ztid),
                  "S"(2 /* FUTEX_WAKE */), "d"(INT_MAX)
                : "rcx", "r11", "memory");
   __builtin_unreachable();
 }
 
-relegated static errno_t CloneOpenbsd(int (*func)(void *), char *stk,
-                                      size_t stksz, void *arg, void *tls,
-                                      atomic_int *ptid, atomic_int *ctid) {
+static errno_t CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
+                            int flags, void *arg, void *tls, atomic_int *ptid,
+                            atomic_int *ctid) {
   int rc;
   intptr_t sp;
   struct __tfork *tf;
@@ -252,18 +290,19 @@ relegated static errno_t CloneOpenbsd(int (*func)(void *), char *stk,
   sp -= sizeof(struct CloneArgs);
   sp &= -alignof(struct CloneArgs);
   wt = (struct CloneArgs *)sp;
-  sp &= -16;
-  sp -= 8;
-  *(intptr_t *)sp = (intptr_t)CloneOpenbsd + 1;
-  wt->ctid = ctid;
-  wt->ptid = ptid;
+  sp = AlignStack(sp, stk, stksz, 16);
+  wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
+  wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
   wt->arg = arg;
   wt->func = func;
-  tf->tf_stack = (char *)sp;
-  tf->tf_tcb = tls;
-  tf->tf_tid = ctid;
+  tf->tf_stack = (char *)sp - 8;
+  tf->tf_tcb = flags & CLONE_SETTLS ? tls : 0;
+  tf->tf_tid = &wt->tid;
   if ((rc = __tfork_thread(tf, sizeof(*tf), OpenbsdThreadMain, wt)) >= 0) {
-    atomic_init(ptid, rc);
+    npassert(rc);
+    if (flags & CLONE_PARENT_SETTID) {
+      *ptid = rc;
+    }
     return 0;
   } else {
     return -rc;
@@ -273,35 +312,35 @@ relegated static errno_t CloneOpenbsd(int (*func)(void *), char *stk,
 ////////////////////////////////////////////////////////////////////////////////
 // NET BESIYATA DISHMAYA
 
-wontreturn dontinstrument static void NetbsdThreadMain(
-    void *arg,            // rdi
-    int (*func)(void *),  // rsi
-    atomic_int *ctid,     // rdx
-    atomic_int *ptid) {   // rcx
-  int ax;
-  asm("syscall"
-      : "=a"(ax)  // man says always succeeds
-      : "0"(311)  // _lwp_self()
-      : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc");
-  atomic_init(ctid, ax);
-  atomic_init(ptid, ax);
-  func(arg);
+static wontreturn void NetbsdThreadMain(void *arg,                 // rdi
+                                        int (*func)(void *, int),  // rsi
+                                        int *tid,                  // rdx
+                                        atomic_int *ctid,          // rcx
+                                        int *ztid) {               // r9
+  int ax, dx;
+  // TODO(jart): Why are we seeing flakes where *tid is zero?
+  // ax = *tid;
+  ax = sys_gettid();
+  *ctid = ax;
+  func(arg, ax);
   // we no longer use the stack after this point
   // %eax = int __lwp_exit(void);
-  asm volatile("movl\t$0,(%2)\n\t"  // *ztid = 0
-               "syscall"            // __lwp_exit()
-               : "=a"(ax)
-               : "0"(310), "r"(ctid)
+  asm volatile("movl\t$0,%2\n\t"  // *wt->ztid = 0
+               "syscall"          // __lwp_exit()
+               : "=a"(ax), "=d"(dx), "=m"(*ztid)
+               : "0"(310)
                : "rcx", "r11", "memory");
   __builtin_unreachable();
 }
 
-static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, void *arg,
-                       void *tls, atomic_int *ptid, atomic_int *ctid) {
+static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
+                       int flags, void *arg, void *tls, atomic_int *ptid,
+                       atomic_int *ctid) {
   // NetBSD has its own clone() and it works, but it's technically a
   // second-class API, intended to help Linux folks migrate to this.
   int ax;
   bool failed;
+  atomic_int *tid;
   intptr_t dx, sp;
   static bool once;
   struct ucontext_netbsd *ctx;
@@ -313,12 +352,19 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, void *arg,
                  : CFLAG_CONSTRAINT(failed), "=a"(ax)
                  : "1"(__NR_getcontext_netbsd), "D"(&netbsd_clone_template)
                  : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory");
+    npassert(!failed);
     once = true;
   }
   sp = (intptr_t)stk + stksz;
 
+  // allocate memory for tid
+  sp -= sizeof(atomic_int);
+  sp = sp & -alignof(atomic_int);
+  tid = (atomic_int *)sp;
+  *tid = 0;
+
   // align the stack
-  sp &= -16;
+  sp = AlignStack(sp, stk, stksz, 16);
 
   // simulate call to misalign stack and ensure backtrace looks good
   sp -= 8;
@@ -326,7 +372,8 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, void *arg,
 
   // place the giant 784 byte ucontext structure in the red zone!
   // it only has to live long enough for the thread to come alive
-  ctx = (struct ucontext_netbsd *)((sp - sizeof(struct ucontext_netbsd)) & -64);
+  ctx = (struct ucontext_netbsd *)((sp - sizeof(struct ucontext_netbsd)) &
+                                   -alignof(struct ucontext_netbsd));
 
   // pass parameters in process state
   memcpy(ctx, &netbsd_clone_template, sizeof(*ctx));
@@ -336,23 +383,28 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, void *arg,
   ctx->uc_mcontext.rip = (intptr_t)NetbsdThreadMain;
   ctx->uc_mcontext.rdi = (intptr_t)arg;
   ctx->uc_mcontext.rsi = (intptr_t)func;
-  ctx->uc_mcontext.rdx = (intptr_t)ctid;
-  ctx->uc_mcontext.rcx = (intptr_t)ptid;
+  ctx->uc_mcontext.rdx = (intptr_t)tid;
+  ctx->uc_mcontext.rcx = (intptr_t)(flags & CLONE_CHILD_SETTID ? ctid : tid);
+  ctx->uc_mcontext.r8 = (intptr_t)(flags & CLONE_CHILD_CLEARTID ? ctid : tid);
   ctx->uc_flags |= _UC_STACK;
   ctx->uc_stack.ss_sp = stk;
   ctx->uc_stack.ss_size = stksz;
   ctx->uc_stack.ss_flags = 0;
-  ctx->uc_flags |= _UC_TLSBASE;
-  ctx->uc_mcontext._mc_tlsbase = (intptr_t)tls;
+  if (flags & CLONE_SETTLS) {
+    ctx->uc_flags |= _UC_TLSBASE;
+    ctx->uc_mcontext._mc_tlsbase = (intptr_t)tls;
+  }
 
   // perform the system call
-  int tid = 0;
   asm volatile(CFLAG_ASM("syscall")
                : CFLAG_CONSTRAINT(failed), "=a"(ax), "=d"(dx)
-               : "1"(__NR__lwp_create), "D"(ctx), "S"(LWP_DETACHED), "2"(&tid)
+               : "1"(__NR__lwp_create), "D"(ctx), "S"(LWP_DETACHED), "2"(tid)
                : "rcx", "r8", "r9", "r10", "r11", "memory");
   if (!failed) {
-    atomic_init(ptid, tid);
+    npassert(*tid);
+    if (flags & CLONE_PARENT_SETTID) {
+      *ptid = *tid;
+    }
     return 0;
   } else {
     return ax;
@@ -364,42 +416,41 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, void *arg,
 ////////////////////////////////////////////////////////////////////////////////
 // FREE BESIYATA DISHMAYA
 
-wontreturn dontinstrument static void FreebsdThreadMain(void *p) {
+static wontreturn void FreebsdThreadMain(void *p) {
   struct CloneArgs *wt = p;
 #ifdef __aarch64__
   asm volatile("mov\tx28,%0" : /* no outputs */ : "r"(wt->tls));
 #elif defined(__x86_64__)
   sys_set_tls(AMD64_SET_GSBASE, wt->tls);
 #endif
-  atomic_init(wt->ctid, wt->tid64);
-  atomic_init(wt->ptid, wt->tid64);
-  wt->func(wt->arg);
+  *wt->ctid = wt->tid;
+  wt->func(wt->arg, wt->tid);
   // we no longer use the stack after this point
   // void thr_exit(%rdi = long *state);
 #ifdef __x86_64__
-  asm volatile("movl\t$0,%0\n\t"       // *wt->ctid = 0
-               "syscall\n\t"           // _umtx_op(wt->ctid, WAKE, INT_MAX)
+  asm volatile("movl\t$0,%0\n\t"       // *wt->ztid = 0
+               "syscall\n\t"           // _umtx_op(wt->ztid, WAKE, INT_MAX)
                "movl\t$431,%%eax\n\t"  // thr_exit(long *nonzeroes_and_wake)
                "xor\t%%edi,%%edi\n\t"  // sad we can't use this free futex op
                "syscall\n\t"           // thr_exit() fails if thread is orphaned
                "movl\t$1,%%eax\n\t"    // _exit()
                "syscall"               //
-               : "=m"(*wt->ctid)
-               : "a"(454), "D"(wt->ctid), "S"(UMTX_OP_WAKE), "d"(INT_MAX)
+               : "=m"(*wt->ztid)
+               : "a"(454), "D"(wt->ztid), "S"(UMTX_OP_WAKE), "d"(INT_MAX)
                : "rcx", "r8", "r9", "r10", "r11", "memory");
 #elif defined(__aarch64__)
-  register long x0 asm("x0") = (long)wt->ctid;
+  register long x0 asm("x0") = (long)wt->ztid;
   register long x1 asm("x1") = UMTX_OP_WAKE;
   register long x2 asm("x2") = INT_MAX;
   register long x8 asm("x8") = 454;  // _umtx_op
-  asm volatile("str\twzr,%0\n\t"     // *wt->ctid = 0
-               "svc\t0\n\t"          // _umtx_op(wt->ctid, WAKE, INT_MAX)
+  asm volatile("str\twzr,%0\n\t"     // *wt->ztid = 0
+               "svc\t0\n\t"          // _umtx_op(wt->ztid, WAKE, INT_MAX)
                "mov\tx0,#0\n\t"      // arg0 = 0
                "mov\tx8,#431\n\t"    // thr_exit
                "svc\t0\n\t"          // thr_exit(long *nonzeroes_and_wake = 0)
                "mov\tx8,#1\n\t"      // _exit
                "svc\t0"              // _exit(long *nonzeroes_and_wake = 0)
-               : "=m"(*wt->ctid)
+               : "=m"(*wt->ztid)
                : "r"(x0), "r"(x1), "r"(x2), "r"(x8));
 #else
 #error "unsupported architecture"
@@ -407,19 +458,19 @@ wontreturn dontinstrument static void FreebsdThreadMain(void *p) {
   __builtin_unreachable();
 }
 
-static errno_t CloneFreebsd(int (*func)(void *), char *stk, size_t stksz,
-                            void *arg, void *tls, atomic_int *ptid,
+static errno_t CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
+                            int flags, void *arg, void *tls, atomic_int *ptid,
                             atomic_int *ctid) {
   long sp;
-  int64_t tid64;
+  int64_t tid;
   struct CloneArgs *wt;
   sp = (intptr_t)stk + stksz;
   sp -= sizeof(struct CloneArgs);
   sp &= -alignof(struct CloneArgs);
   wt = (struct CloneArgs *)sp;
-  sp &= -16;
-  wt->ctid = ctid;
-  wt->ptid = ptid;
+  sp = AlignStack(sp, stk, stksz, 16);
+  wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
+  wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
   wt->tls = tls;
   wt->func = func;
   wt->arg = arg;
@@ -428,10 +479,10 @@ static errno_t CloneFreebsd(int (*func)(void *), char *stk, size_t stksz,
       .arg = wt,
       .stack_base = stk,
       .stack_size = sp - (long)stk,
-      .tls_base = tls,
+      .tls_base = flags & CLONE_SETTLS ? tls : 0,
       .tls_size = 64,
       .child_tid = &wt->tid64,
-      .parent_tid = &tid64,
+      .parent_tid = &tid,
   };
 #ifdef __x86_64__
   int ax;
@@ -452,7 +503,8 @@ static errno_t CloneFreebsd(int (*func)(void *), char *stk, size_t stksz,
 #else
 #error "unsupported architecture"
 #endif
-  atomic_init(ptid, tid64);
+  if (flags & CLONE_PARENT_SETTID)
+    *ptid = tid;
   return 0;
 }
 
@@ -461,59 +513,57 @@ static errno_t CloneFreebsd(int (*func)(void *), char *stk, size_t stksz,
 ////////////////////////////////////////////////////////////////////////////////
 // APPLE SILICON
 
-dontinstrument static void *SiliconThreadMain(void *arg) {
+static void *SiliconThreadMain(void *arg) {
   struct CloneArgs *wt = arg;
-  atomic_int *ctid = wt->ctid;
-  int tid = atomic_load_explicit(ctid, memory_order_relaxed);
   asm volatile("mov\tx28,%0" : /* no outputs */ : "r"(wt->tls));
-  __stack_call(wt->arg, tid, 0, 0, wt->func, wt->sp);
-  atomic_store_explicit(ctid, 0, memory_order_release);
-  ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, ctid, 0);
+  *wt->ctid = wt->this;
+  __stack_call(wt->arg, wt->this, 0, 0, wt->func, wt->sp);
+  *wt->ztid = 0;
+  ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, wt->ztid, 0);
   return 0;
 }
 
-static errno_t CloneSilicon(int (*fn)(void *), char *stk, size_t stksz,
-                            void *arg, void *tls, atomic_int *ptid,
+static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz,
+                            int flags, void *arg, void *tls, atomic_int *ptid,
                             atomic_int *ctid) {
-
-  // assign tid to new thread
-  static atomic_uint tids;
-  unsigned tid = atomic_fetch_add_explicit(&tids, 1, memory_order_relaxed);
-  tid %= kMaxThreadIds;
-  tid += kMinThreadId;
-  atomic_init(ctid, tid);
-  atomic_init(ptid, tid);
-
-  // pass temp data on stack
-  intptr_t sp, tip;
+  long sp;
+  void *attr;
+  errno_t res;
+  unsigned tid;
+  pthread_t th;
+  size_t babystack;
   struct CloneArgs *wt;
-  sp = tip = (intptr_t)stk + stksz;
+  static atomic_uint tids;
+  sp = (intptr_t)stk + stksz;
   sp -= sizeof(struct CloneArgs);
   sp &= -alignof(struct CloneArgs);
   wt = (struct CloneArgs *)sp;
+  sp = AlignStack(sp, stk, stksz, 16);
+  tid = atomic_fetch_add_explicit(&tids, 1, memory_order_acq_rel);
+  wt->this = tid = (tid & (kMaxThreadIds - 1)) + kMinThreadId;
+  wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
+  wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
+  wt->tls = flags & CLONE_SETTLS ? tls : 0;
   wt->func = fn;
   wt->arg = arg;
-  wt->tls = tls;
-  wt->ctid = ctid;
-  wt->sp = tip & -16;
-
-  // ask apple libc to spawn thread
-  errno_t res;
-  pthread_t th;
-  size_t babystack = __syslib->__pthread_stack_min;
+  wt->sp = sp;
+  babystack = __syslib->__pthread_stack_min;
 #pragma GCC push_options
 #pragma GCC diagnostic ignored "-Walloca-larger-than="
-  void *attr = alloca(__syslib->__sizeof_pthread_attr_t);
+  attr = alloca(__syslib->__sizeof_pthread_attr_t);
 #pragma GCC pop_options
-  __syslib->__pthread_attr_init(attr);
-  __syslib->__pthread_attr_setguardsize(attr, 0);
-  __syslib->__pthread_attr_setstacksize(attr, babystack);
-  if (!(res = __syslib->__pthread_create(&th, attr, SiliconThreadMain, wt))) {
-    atomic_init(ptid, tid);
-    struct CosmoTib *tib = tls;
-    atomic_store_explicit(&tib[-1].tib_syshand, th, memory_order_release);
+  unassert(!__syslib->__pthread_attr_init(attr));
+  unassert(!__syslib->__pthread_attr_setguardsize(attr, 0));
+  unassert(!__syslib->__pthread_attr_setstacksize(attr, babystack));
+  if (!(res = __syslib->__pthread_create(&th, attr, SiliconThreadMain, wt)) &&
+      (flags & CLONE_PARENT_SETTID)) {
+    *ptid = tid;
+    if (flags & CLONE_SETTLS) {
+      struct CosmoTib *tib = tls;
+      atomic_store_explicit(&tib[-1].tib_syshand, th, memory_order_release);
+    }
   }
-  __syslib->__pthread_attr_destroy(attr);
+  unassert(!__syslib->__pthread_attr_destroy(attr));
   return res;
 }
 
@@ -523,9 +573,10 @@ static errno_t CloneSilicon(int (*fn)(void *), char *stk, size_t stksz,
 // GNU/SYSTEMD
 
 struct LinuxCloneArgs {
-  int (*func)(void *);
+  int (*func)(void *, int);
   void *arg;
   char *tls;
+  atomic_int ctid;
 };
 
 int sys_clone_linux(int flags,         // rdi
@@ -536,34 +587,42 @@ int sys_clone_linux(int flags,         // rdi
                     void *func,        // r9
                     void *arg);        // 8(rsp)
 
-dontinstrument static int AmdLinuxThreadEntry(void *arg) {
+static int LinuxThreadEntry(void *arg, int tid) {
   struct LinuxCloneArgs *wt = arg;
-#if defined(__x86_64__)
   sys_set_tls(ARCH_SET_GS, wt->tls);
-#endif
-  return wt->func(wt->arg);
+  return wt->func(wt->arg, tid);
 }
 
-static int CloneLinux(int (*func)(void *), char *stk, size_t stksz, int flags,
-                      void *arg, void *tls, atomic_int *ptid,
+static int CloneLinux(int (*func)(void *arg, int rc), char *stk, size_t stksz,
+                      int flags, void *arg, void *tls, atomic_int *ptid,
                       atomic_int *ctid) {
-  long sp = (intptr_t)stk + stksz;
-
-#if defined(__x86_64__)
+  int rc;
+  long sp;
+  struct LinuxCloneArgs *wt;
+  sp = (intptr_t)stk + stksz;
   sp -= sizeof(struct LinuxCloneArgs);
   sp &= -alignof(struct LinuxCloneArgs);
-  struct LinuxCloneArgs *wt = (struct LinuxCloneArgs *)sp;
-  sp &= -16;  // align the stack
-  wt->arg = arg;
-  wt->tls = tls;
-  wt->func = func;
-  func = AmdLinuxThreadEntry;
-  arg = wt;
-#elif defined(__aarch64__)
-  sp &= -128;  // for kernels <=4.6
+  wt = (struct LinuxCloneArgs *)sp;
+  // align the stack
+#ifdef __aarch64__
+  sp = AlignStack(sp, stk, stksz, 128);  // for kernel <=4.6
+#else
+  sp = AlignStack(sp, stk, stksz, 16);
 #endif
-
-  int rc;
+#ifdef __x86_64__
+  if (flags & CLONE_SETTLS) {
+    flags &= ~CLONE_SETTLS;
+    wt->arg = arg;
+    wt->tls = tls;
+    wt->func = func;
+    func = LinuxThreadEntry;
+    arg = wt;
+  }
+#endif
+  if (~flags & CLONE_CHILD_SETTID) {
+    flags |= CLONE_CHILD_SETTID;
+    ctid = &wt->ctid;
+  }
   if ((rc = sys_clone_linux(flags, sp, ptid, ctid, tls, func, arg)) >= 0) {
     // clone() is documented as setting ptid before return
     return 0;
@@ -576,45 +635,154 @@ static int CloneLinux(int (*func)(void *), char *stk, size_t stksz, int flags,
 // COSMOPOLITAN
 
 /**
- * Creates thread without malloc() being linked.
+ * Creates thread without malloc being linked.
  *
- * If you use clone() you're on your own.
+ * If you use clone() you're on your own. Example:
+ *
+ *     int worker(void *arg) { return 0; }
+ *     struct CosmoTib tib = {.tib_self = &tib, .tib_tid = -1};
+ *     atomic_int tid;
+ *     char *stk = NewCosmoStack();
+ *     clone(worker, stk, GetStackSize() - 16,
+ *           CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES |
+ *           CLONE_SYSVSEM | CLONE_SIGHAND | CLONE_PARENT_SETTID |
+ *           CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | CLONE_SETTLS,
+ *           arg, &tid, &tib, &tib.tib_tid);
+ *     while (atomic_load(&tid) == 0) sched_yield();
+ *     // thread is known
+ *     while (atomic_load(&tib.tib_tid) < 0) sched_yield();
+ *     // thread is running
+ *     while (atomic_load(&tib.tib_tid) > 0) sched_yield();
+ *     // thread has terminated
+ *     FreeCosmoStack(stk);
+ *
+ * Threads are created in a detached manner. They currently can't be
+ * synchronized using wait() or posix signals. Threads created by this
+ * function should be synchronized using shared memory operations.
+ *
+ * Any memory that's required by this system call wrapper is allocated
+ * to the top of your stack. This shouldn't be more than 128 bytes.
+ *
+ * Your function is called from within the stack you specify. A return
+ * address is pushed onto your stack, that causes returning to jump to
+ * _Exit1() which terminates the thread. Even though the callback says
+ * it supports a return code, that'll only work on Linux and Windows.
+ *
+ * This function follows the same ABI convention as the Linux userspace
+ * libraries, with a few small changes. The varargs has been removed to
+ * help prevent broken code, and the stack size and tls size parameters
+ * are introduced for compatibility with FreeBSD.
+ *
+ * To keep this system call lightweight, only the thread creation use
+ * case is polyfilled across platforms. For example, if you want fork
+ * that works on OpenBSD for example, don't do it with clone(SIGCHLD)
+ * and please just call fork(). Even if you do that on Linux, it will
+ * effectively work around libc features like atfork(), so that means
+ * other calls like getpid() may return incorrect values.
+ *
+ * @param func is your callback function, which this wrapper requires
+ *     not be null, otherwise EINVAL is raised. It is passed two args
+ *     within the child thread: (1) the caller-supplied `arg` and (2)
+ *     the new tid is always passed in the second arg for convenience
+ *
+ * @param stk points to the bottom of a caller allocated stack, which
+ *     must be allocated via mmap() using the MAP_STACK flag, or else
+ *     you won't get optimal performance and it won't work on OpenBSD
+ *
+ * @param stksz is the size of that stack in bytes, we recommend that
+ *     that this be set to GetStackSize() or else memory safety tools
+ *     like kprintf() can't do as good and quick of a job; this value
+ *     must be 16-aligned plus it must be at least 4192 bytes in size
+ *     and it's advised to have the bottom-most page, be a guard page
+ *
+ * @param flags which SHOULD always have all of these flags:
+ *
+ *     - `CLONE_THREAD`
+ *     - `CLONE_VM`
+ *     - `CLONE_FS`
+ *     - `CLONE_FILES`
+ *     - `CLONE_SIGHAND`
+ *     - `CLONE_SYSVSEM`
+ *
+ *     This system call wrapper is intended for threads, and as such, we
+ *     won't polyfill Linux's ability to simulate unrelated calls (e.g.
+ *     fork, vfork) via clone() on other platforms. Please just call
+ *     fork() and vfork() when that's what you want.
+ *
+ *     Your `flags` may also optionally also additionally bitwise-OR any
+ *     combination of the following additional flags:
+ *
+ *     - `CLONE_CHILD_SETTID` must be specified if you intend to set the
+ *       `ctid` argument, which will updated with the child tid once the
+ *       child has started.
+ *
+ *     - `CLONE_PARENT_SETTID` must be specified if you intend to set
+ *       the `ptid` argument, and it is updated at the most opportune
+ *       moment. On all platforms except XNU x86, this happens before
+ *       clone() returns. But since it might not be available yet you
+ *       need to use pthread_getunique_np() to obtain it.
+ *
+ *     - `CLONE_CHILD_CLEARTID` causes `*ctid = 0` upon child thread
+ *       termination. This is used to implement join so that the parent
+ *       may know when it's safe to free the child's stack memory, and
+ *       as such, is guaranteed to happen AFTER the child thread has
+ *       either terminated or has finished using its stack memory
+ *
+ *     - `CLONE_SETTLS` is needed if you intend to specify the `tls`
+ *       argument, which after thread creation may be accessed using
+ *       __get_tls(). Doing this means that `errno`, gettid(), etc.
+ *       correctly work. Caveat emptor if you choose not to do this.
+ *
+ * @param arg is passed as an argument to `func` in the child thread
+ * @param tls may be used to set the thread local storage segment;
+ *     this parameter is ignored if `CLONE_SETTLS` is not set
+ * @param ctid lets the child receive its thread id without having to
+ *     call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set
+ * @return 0 on success, or errno on errno
  */
 errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg,
               void *ptid, void *tls, void *ctid) {
-  errno_t err;
+  int rc;
 
-  atomic_fetch_add(&_pthread_count, 1);
+  if (flags & CLONE_THREAD) {
+    __enable_threads();
+  }
 
-  if (IsLinux()) {
-    err = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
+  if (!func) {
+    rc = EINVAL;
+  } else if (IsLinux()) {
+    rc = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
+  } else if (!IsTiny() &&
+             (flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
+                        CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
+                 (CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES |
+                  CLONE_SIGHAND | CLONE_SYSVSEM)) {
+    rc = EINVAL;
   } else if (IsXnu()) {
-#if defined(__x86_64__)
-    err = CloneXnu(func, stk, stksz, arg, tls, ptid, ctid);
+#ifdef __x86_64__
+    rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid);
 #elif defined(__aarch64__)
-    err = CloneSilicon(func, stk, stksz, arg, tls, ptid, ctid);
+    rc = CloneSilicon(func, stk, stksz, flags, arg, tls, ptid, ctid);
 #else
 #error "unsupported architecture"
 #endif
   } else if (IsFreebsd()) {
-    err = CloneFreebsd(func, stk, stksz, arg, tls, ptid, ctid);
-#if defined(__x86_64__)
-  } else if (IsWindows()) {
-    err = CloneWindows(func, stk, stksz, arg, tls, ptid, ctid);
+    rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
+#ifdef __x86_64__
   } else if (IsNetbsd()) {
-    err = CloneNetbsd(func, stk, stksz, arg, tls, ptid, ctid);
+    rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
   } else if (IsOpenbsd()) {
-    err = CloneOpenbsd(func, stk, stksz, arg, tls, ptid, ctid);
+    rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
+  } else if (IsWindows()) {
+    rc = CloneWindows(func, stk, stksz, flags, arg, tls, ptid, ctid);
 #endif /* __x86_64__ */
   } else {
-    err = ENOSYS;
+    rc = ENOSYS;
   }
 
-  if (SupportsBsd() && err == EPROCLIM)
-    err = EAGAIN;
+  if (SupportsBsd() && rc == EPROCLIM) {
+    rc = EAGAIN;
+  }
 
-  if (err)
-    atomic_fetch_sub(&_pthread_count, 1);
-
-  return err;
+  return rc;
 }
diff --git a/libc/runtime/cosmo.S b/libc/runtime/cosmo.S
index 59fe944c1..07b6c459f 100644
--- a/libc/runtime/cosmo.S
+++ b/libc/runtime/cosmo.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/intrin/strace.h"
@@ -32,8 +32,8 @@
 //	@param	rdx is environ
 //	@param	rcx is auxv
 //	@noreturn
-cosmo:	beg
-	pro
+cosmo:	push	%rbp
+	mov	%rsp,%rbp
 	mov	%edi,%r12d
 	mov	%rsi,%r13
 	mov	%rdx,%r14
@@ -104,10 +104,7 @@ cosmo:	beg
 	je	2f
 	push	%rax
 	push	%rax
-	mov	%r12d,%edi
-	mov	%r13,%rsi
-	mov	%r14,%rdx
-	mov	%r15,%rcx
+	call	.Largs
 	call	*(%rax)
 	pop	%rax
 	pop	%rax
@@ -115,15 +112,17 @@ cosmo:	beg
 	jmp	1b
 
 //	call main()
-2:	mov	%r12d,%edi
-	mov	%r13,%rsi
-	mov	%r14,%rdx
-	mov	%r15,%rcx
+2:	call	.Largs
 	.weak	main
 	call	main
 	xchg	%eax,%edi
 	call	exit
-	end
+
+.Largs:	mov	%r12d,%edi
+	mov	%r13,%rsi
+	mov	%r14,%rdx
+	mov	%r15,%rcx
+	ret
 	.endfn	cosmo,weak
 
 //	Enables Thread Local Storage.
diff --git a/libc/runtime/cosmo2.c b/libc/runtime/cosmo2.c
index a218af579..ccd926192 100644
--- a/libc/runtime/cosmo2.c
+++ b/libc/runtime/cosmo2.c
@@ -24,7 +24,7 @@
 #include "libc/intrin/maps.h"
 #include "libc/intrin/strace.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/rdtsc.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/memtrack.internal.h"
@@ -93,8 +93,7 @@ wontreturn textstartup void cosmo(long *sp, struct Syslib *m1, char *exename,
       .tib_sigmask = -1,
       .tib_sigstack_size = 57344,
       .tib_sigstack_addr = (char *)__builtin_frame_address(0) - 57344,
-      .tib_ptid = 1,
-      .tib_ctid = 1,
+      .tib_tid = 1,
   };
   __set_tls(&tib);
 
diff --git a/libc/runtime/cxa_thread_atexit.c b/libc/runtime/cxa_thread_atexit.c
index 57ce06849..76b89ec89 100644
--- a/libc/runtime/cxa_thread_atexit.c
+++ b/libc/runtime/cxa_thread_atexit.c
@@ -23,6 +23,7 @@
 #include "libc/nexgen32e/gc.internal.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/tls.h"
+#include "third_party/nsync/wait_s.internal.h"
 
 struct Dtor {
   void *fun;
@@ -88,7 +89,10 @@ void __cxa_thread_finalize(void) {
   //  thread has any thread-specific data, appropriate destructor
   //  functions shall be called in an unspecified order."
   //                              ──Quoth POSIX.1-2017
+  if (tib->tib_nsync)
+    _weaken(nsync_waiter_destroy)(tib->tib_nsync);
   _pthread_unkey(tib);
+
   _pthread_ungarbage(tib);
 
   while ((dtor = tib->tib_atexit)) {
diff --git a/libc/runtime/efimain.greg.c b/libc/runtime/efimain.greg.c
index 33aefcb21..4ecda0eca 100644
--- a/libc/runtime/efimain.greg.c
+++ b/libc/runtime/efimain.greg.c
@@ -21,7 +21,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/newbie.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/efi.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/runtime/e820.internal.h"
diff --git a/libc/runtime/efipostboot.S b/libc/runtime/efipostboot.S
index 6b3561562..4a2f715de 100644
--- a/libc/runtime/efipostboot.S
+++ b/libc/runtime/efipostboot.S
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "ape/relocations.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/pc.internal.h"
 .real
 
diff --git a/libc/runtime/enable_tls.c b/libc/runtime/enable_tls.c
index 0296e6fda..faaa704c3 100644
--- a/libc/runtime/enable_tls.c
+++ b/libc/runtime/enable_tls.c
@@ -26,7 +26,7 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/maps.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/files.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
@@ -35,9 +35,8 @@
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/syslib.internal.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/str/locale.h"
-#include "libc/str/locale.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
@@ -214,13 +213,14 @@ textstartup void __enable_tls(void) {
   tib->tib_errno = __errno;
   tib->tib_strace = __strace;
   tib->tib_ftrace = __ftrace;
+  tib->tib_locale = (intptr_t)&__c_dot_utf8_locale;
   tib->tib_pthread = (pthread_t)&_pthread_static;
   if (IsWindows()) {
     intptr_t hThread;
     DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),
                     GetCurrentProcess(), &hThread, 0, false,
                     kNtDuplicateSameAccess);
-    atomic_init(&tib->tib_syshand, hThread);
+    atomic_store_explicit(&tib->tib_syshand, hThread, memory_order_relaxed);
   } else if (IsXnuSilicon()) {
     tib->tib_syshand = __syslib->__pthread_self();
   }
@@ -233,22 +233,22 @@ textstartup void __enable_tls(void) {
   } else {
     tid = sys_gettid();
   }
-  atomic_init(&tib->tib_ptid, tid);
-  atomic_init(&tib->tib_ctid, tid);
+  atomic_store_explicit(&tib->tib_tid, tid, memory_order_relaxed);
   // TODO(jart): set_tid_address?
 
   // inherit signal mask
-  if (IsWindows())
-    atomic_init(&tib->tib_sigmask, ParseMask(__getenv(environ, "_MASK").s));
+  if (IsWindows()) {
+    atomic_store_explicit(&tib->tib_sigmask,
+                          ParseMask(__getenv(environ, "_MASK").s),
+                          memory_order_relaxed);
+  }
 
   // initialize posix threads
   _pthread_static.tib = tib;
   _pthread_static.pt_flags = PT_STATIC;
-  _pthread_static.pt_locale = &__global_locale;
-  _pthread_static.pt_attr.__stackaddr = __maps.stack.addr;
-  _pthread_static.pt_attr.__stacksize = __maps.stack.size;
   dll_init(&_pthread_static.list);
   _pthread_list = &_pthread_static.list;
+  atomic_store_explicit(&_pthread_static.ptid, tid, memory_order_release);
 
   // ask the operating system to change the x86 segment register
   if (IsWindows())
diff --git a/libc/runtime/findcombinary.c b/libc/runtime/findcombinary.c
index bc2214f8b..4c0084456 100644
--- a/libc/runtime/findcombinary.c
+++ b/libc/runtime/findcombinary.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/auxv.h"
diff --git a/libc/runtime/fpreset.S b/libc/runtime/fpreset.S
index ec817a942..1a130ae18 100644
--- a/libc/runtime/fpreset.S
+++ b/libc/runtime/fpreset.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Re-initializes FPU.
 	.ftrace1
diff --git a/libc/runtime/ftrace-hook.S b/libc/runtime/ftrace-hook.S
index cd25a18c4..9340d7f73 100644
--- a/libc/runtime/ftrace-hook.S
+++ b/libc/runtime/ftrace-hook.S
@@ -16,59 +16,44 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .privileged
 
 ftrace_hook:
 #ifdef __x86_64__
 
-//	save argument registers
-//	we save %rax because __gc() takes it as an argument.
-//	we save %r10 because it's used as a syscall argument.
+//	We need to save saved registers because we have some functions
+//	like __errno_location which can be called from an inline asm()
+//	statement. It's nice to have the flexibility anyway.
 
 	cmpl	$0,__ftrace(%rip)
 	jle	1f
-	.cfi_startproc
 	push	%rbp
-	.cfi_def_cfa_offset 16
-	.cfi_offset %rbp, -16
 	mov	%rsp,%rbp
-	.cfi_def_cfa_register %rbp
 	and	$-16,%rsp
-	sub	$128,%rsp
-	movdqu	%xmm0,-0x80(%rbp)
-	movdqu	%xmm1,-0x70(%rbp)
-	movdqu	%xmm2,-0x60(%rbp)
-	movdqu	%xmm3,-0x50(%rbp)
-	movdqu	%xmm4,-0x40(%rbp)
-	movdqu	%xmm5,-0x30(%rbp)
-	movdqu	%xmm6,-0x20(%rbp)
-	movdqu	%xmm7,-0x10(%rbp)
+	sub	$256,%rsp
 	push	%rax
-	.cfi_offset %rax, -24
+	push	%rbx
 	push	%rcx
-	.cfi_offset %rcx, -32
 	push	%rdx
-	.cfi_offset %rdx, -40
 	push	%rdi
-	.cfi_offset %rdi, -48
 	push	%rsi
-	.cfi_offset %rsi, -56
 	push	%r8
-	.cfi_offset %r8, -64
 	push	%r9
-	.cfi_offset %r9, -72
 	push	%r10
-	.cfi_offset %r10, -80
+	push	%r11
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	call	__xmm_save
 	call	ftracer
-	movdqu	-0x80(%rbp),%xmm0
-	movdqu	-0x70(%rbp),%xmm1
-	movdqu	-0x60(%rbp),%xmm2
-	movdqu	-0x50(%rbp),%xmm3
-	movdqu	-0x40(%rbp),%xmm4
-	movdqu	-0x30(%rbp),%xmm5
-	movdqu	-0x20(%rbp),%xmm6
-	movdqu	-0x10(%rbp),%xmm7
+	call	__xmm_load
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%r11
 	pop	%r10
 	pop	%r9
 	pop	%r8
@@ -76,22 +61,15 @@ ftrace_hook:
 	pop	%rdi
 	pop	%rdx
 	pop	%rcx
+	pop	%rbx
 	pop	%rax
 	leave
-	.cfi_restore %rbp
-	.cfi_def_cfa %rsp, 8
 1:	ret
-	.cfi_endproc
 
 #elif defined(__aarch64__)
 
 	stp	x29,x30,[sp,-384]!
-	.cfi_startproc
-	.cfi_def_cfa_offset 384
-	.cfi_offset 29, -384	// x29 (fp) is saved at [sp - 384]
-	.cfi_offset 30, -376	// x30 (lr) is saved at [sp - 376]
 	mov	x29,sp
-	.cfi_def_cfa_register 29
 	stp	x0,x1,[sp,16]
 
 	adrp	x0,__ftrace
@@ -100,45 +78,18 @@ ftrace_hook:
 	ble	1f
 
 	stp	x2,x3,[sp,32]
-	.cfi_offset 2, -352
-	.cfi_offset 3, -344
 	stp	x4,x5,[sp,48]
-	.cfi_offset 4, -336
-	.cfi_offset 5, -328
 	stp	x6,x7,[sp,64]
-	.cfi_offset 6, -320
-	.cfi_offset 7, -312
 	stp	x8,x9,[sp,80]
-	.cfi_offset 8, -304
-	.cfi_offset 9, -296
 	stp	x10,x11,[sp,96]
-	.cfi_offset 10, -288
-	.cfi_offset 11, -280
 	stp	x12,x13,[sp,112]
-	.cfi_offset 12, -272
-	.cfi_offset 13, -264
 	stp	x14,x15,[sp,128]
-	.cfi_offset 14, -256
-	.cfi_offset 15, -248
 	stp	x16,x19,[sp,160]
-	.cfi_offset 16, -224
-	.cfi_offset 19, -216
 	stp	x20,x21,[sp,176]
-	.cfi_offset 20, -208
-	.cfi_offset 21, -200
 	stp	x22,x23,[sp,192]
-	.cfi_offset 22, -192
-	.cfi_offset 23, -184
 	stp	x24,x25,[sp,208]
-	.cfi_offset 24, -176
-	.cfi_offset 25, -168
 	stp	x26,x27,[sp,224]
-	.cfi_offset 26, -160
-	.cfi_offset 27, -152
 	stp	x17,x28,[sp,240]
-	.cfi_offset 17, -144
-	.cfi_offset 28, -136
-	// No CFI directives needed for FP registers
 	stp	q0,q1,[sp,256]
 	stp	q2,q3,[sp,288]
 	stp	q4,q5,[sp,320]
@@ -166,12 +117,7 @@ ftrace_hook:
 
 1:	ldp	x0,x1,[sp,16]
 	ldp	x29,x30,[sp],384
-	.cfi_restore 29
-	.cfi_restore 30
-	.cfi_def_cfa 7, 0	// On some ARM systems the stack pointer is represented by register 7
-	.cfi_def_cfa_offset 0
 	ret
-	.cfi_endproc
 
 #endif /* __x86_64__ */
 	.endfn	ftrace_hook,globl,hidden
diff --git a/libc/runtime/ftraceinit.greg.c b/libc/runtime/ftraceinit.greg.c
index 0f18fcf68..f0f4a1e48 100644
--- a/libc/runtime/ftraceinit.greg.c
+++ b/libc/runtime/ftraceinit.greg.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/dce.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
@@ -38,7 +37,7 @@ __static_yoink("zipos");
  * @see libc/runtime/_init.S for documentation
  */
 textstartup int ftrace_init(void) {
-  if (IsModeDbg() || strace_enabled(0) > 0) {
+  if (strace_enabled(0) > 0) {
     GetSymbolTable();
   }
   if (__intercept_flag(&__argc, __argv, "--ftrace")) {
diff --git a/libc/runtime/ftracer.c b/libc/runtime/ftracer.c
index 56f4d53f9..d2e686d3b 100644
--- a/libc/runtime/ftracer.c
+++ b/libc/runtime/ftracer.c
@@ -21,7 +21,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/cmpxchg.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/stackframe.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
@@ -29,9 +29,14 @@
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 
 /**
- * @fileoverview plain-text function call logging
+ * @fileoverview Plain-text function call logging.
+ *
+ * Able to log ~2 million function calls per second, which is mostly
+ * bottlenecked by system call overhead. Log size is reasonable if piped
+ * into gzip.
  */
 
 #define MAX_NESTING 512
@@ -45,7 +50,7 @@
 static struct CosmoFtrace g_ftrace;
 
 __funline int GetNestingLevelImpl(struct StackFrame *frame) {
-  int nesting = -1;
+  int nesting = -2;
   while (frame && !kisdangerous(frame)) {
     ++nesting;
     frame = frame->next;
@@ -78,63 +83,38 @@ privileged void ftracer(void) {
   struct StackFrame *sf;
   struct CosmoFtrace *ft;
   struct PosixThread *pt;
-
-  // get interesting values
   sf = __builtin_frame_address(0);
   st = (uintptr_t)__argv - sizeof(uintptr_t);
   if (__ftrace <= 0)
     return;
-
-  // determine top of stack
-  // main thread won't consider kernel provided argblock
   if (__tls_enabled) {
     tib = __get_tls_privileged();
     if (tib->tib_ftrace <= 0)
       return;
     ft = &tib->tib_ftracer;
-    pt = (struct PosixThread *)tib->tib_pthread;
-    if (pt != &_pthread_static) {
-      if ((char *)sf >= tib->tib_sigstack_addr &&
-          (char *)sf <= tib->tib_sigstack_addr + tib->tib_sigstack_size) {
-        st = (uintptr_t)tib->tib_sigstack_addr + tib->tib_sigstack_size;
-      } else if (pt && pt->pt_attr.__stacksize) {
-        st = (uintptr_t)pt->pt_attr.__stackaddr + pt->pt_attr.__stacksize;
-      }
+    if ((char *)sf >= tib->tib_sigstack_addr &&
+        (char *)sf <= tib->tib_sigstack_addr + tib->tib_sigstack_size) {
+      st = (uintptr_t)tib->tib_sigstack_addr + tib->tib_sigstack_size;
+    } else if ((pt = (struct PosixThread *)tib->tib_pthread) &&
+               pt->pt_attr.__stacksize) {
+      st = (uintptr_t)pt->pt_attr.__stackaddr + pt->pt_attr.__stacksize;
     }
   } else {
     ft = &g_ftrace;
   }
-
-  // estimate stack pointer of hooked function
-  uintptr_t usp = (uintptr_t)sf;
-  usp += sizeof(struct StackFrame);  // overhead of this function
-#if defined(__x86_64__)
-  usp += 8;       // ftrace_hook() stack aligning
-  usp += 8 * 8;   // ftrace_hook() pushed 8x regs
-  usp += 8 * 16;  // ftrace_hook() pushed 8x xmms
-#elif defined(__aarch64__)
-  usp += 384;  // overhead of ftrace_hook()
-#else
-#error "unsupported architecture"
-#endif
-
-  // determine how much stack hooked function is using
-  stackuse = st - usp;
-
-  // log function call
-  //
-  //     FUN $PID $TID $STARTNANOS $STACKUSE $SYMBOL
-  //
-  if (!ft->ft_once) {
+  stackuse = st - (intptr_t)sf;
+  if (_cmpxchg(&ft->ft_once, false, true)) {
     ft->ft_lastaddr = -1;
     ft->ft_skew = GetNestingLevelImpl(sf);
-    ft->ft_once = true;
   }
-  sf = sf->next;
-  fn = sf->addr + DETOUR_SKEW;
-  if (fn != ft->ft_lastaddr) {
-    kprintf("%rFUN %6P %6H %'18T %'*ld %*s%t\n", ftrace_stackdigs, stackuse,
-            GetNestingLevel(ft, sf) * 2, "", fn);
-    ft->ft_lastaddr = fn;
+  if (_cmpxchg(&ft->ft_noreentry, false, true)) {
+    sf = sf->next;
+    fn = sf->addr + DETOUR_SKEW;
+    if (fn != ft->ft_lastaddr) {
+      kprintf("%rFUN %6P %6H %'18T %'*ld %*s%t\n", ftrace_stackdigs, stackuse,
+              GetNestingLevel(ft, sf) * 2, "", fn);
+      ft->ft_lastaddr = fn;
+    }
+    ft->ft_noreentry = false;
   }
 }
diff --git a/libc/runtime/getargmax.c b/libc/runtime/getargmax.c
index a94a3fa7c..37ce64c83 100644
--- a/libc/runtime/getargmax.c
+++ b/libc/runtime/getargmax.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/sysparam.h"
 #include "libc/sysv/consts/_posix.h"
diff --git a/libc/runtime/getinterpreterexecutablename.c b/libc/runtime/getinterpreterexecutablename.c
index ba4069ccb..6a93514d0 100644
--- a/libc/runtime/getinterpreterexecutablename.c
+++ b/libc/runtime/getinterpreterexecutablename.c
@@ -20,7 +20,7 @@
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/at.h"
diff --git a/libc/runtime/getlogin.c b/libc/runtime/getlogin.c
index 72f92bbb6..73b8dec59 100644
--- a/libc/runtime/getlogin.c
+++ b/libc/runtime/getlogin.c
@@ -19,7 +19,7 @@
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/accounting.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
diff --git a/libc/runtime/getlogin_r.c b/libc/runtime/getlogin_r.c
index 53950be9a..4b2311b07 100644
--- a/libc/runtime/getlogin_r.c
+++ b/libc/runtime/getlogin_r.c
@@ -19,7 +19,7 @@
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/accounting.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
diff --git a/libc/runtime/getresourcelimit.c b/libc/runtime/getresourcelimit.c
index 48e10be8e..d68b256b3 100644
--- a/libc/runtime/getresourcelimit.c
+++ b/libc/runtime/getresourcelimit.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/rlimit.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/rlim.h"
 
 long __get_rlimit(int resource) {
diff --git a/libc/runtime/getsymbol.c b/libc/runtime/getsymbol.c
index e7ced6883..855e13611 100644
--- a/libc/runtime/getsymbol.c
+++ b/libc/runtime/getsymbol.c
@@ -33,22 +33,24 @@ privileged int __get_symbol(struct SymbolTable *t, intptr_t a) {
   // we don't want function tracing because:
   //   function tracing depends on this function via kprintf
   unsigned l, m, r, n, k;
-  if (!t && __symtab)
+  if (!t && __symtab) {
     t = __symtab;
+  }
   if (t) {
     l = 0;
     r = n = t->count;
     k = a - t->addr_base;
     while (l < r) {
       m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
-      if (k < t->symbols[m].x) {
-        r = m;
-      } else if (k > t->symbols[m].y) {
+      if (t->symbols[m].y < k) {
         l = m + 1;
       } else {
-        return m;
+        r = m;
       }
     }
+    if (l < n && t->symbols[l].x <= k && k <= t->symbols[l].y) {
+      return l;
+    }
   }
   return -1;
 }
diff --git a/libc/runtime/getsymboltable.c b/libc/runtime/getsymboltable.c
index 3ee47d5a3..90dcb169f 100644
--- a/libc/runtime/getsymboltable.c
+++ b/libc/runtime/getsymboltable.c
@@ -17,24 +17,24 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
-#include "libc/atomic.h"
-#include "libc/cosmo.h"
 #include "libc/errno.h"
 #include "libc/intrin/promises.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
 #include "libc/runtime/zipos.internal.h"
 #include "libc/str/str.h"
+#include "libc/thread/thread.h"
 #include "libc/x/x.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "third_party/puff/puff.h"
 
 __static_yoink("__get_symbol");
 
+static pthread_spinlock_t g_lock;
 struct SymbolTable *__symtab;  // for kprintf
 
 static ssize_t GetZipFile(struct Zipos *zipos, const char *name) {
@@ -100,25 +100,6 @@ static struct SymbolTable *GetSymbolTableFromElf(void) {
   }
 }
 
-static void GetSymbolTableInit(void) {
-  struct Zipos *z;
-  int e = errno;
-  if (!__symtab && !__isworker) {
-    if (_weaken(__zipos_get) && (z = _weaken(__zipos_get)())) {
-      if ((__symtab = GetSymbolTableFromZip(z))) {
-        __symtab->names =
-            (uint32_t *)((char *)__symtab + __symtab->names_offset);
-        __symtab->name_base =
-            (char *)((char *)__symtab + __symtab->name_base_offset);
-      }
-    }
-    if (!__symtab) {
-      __symtab = GetSymbolTableFromElf();
-    }
-  }
-  errno = e;
-}
-
 /**
  * Returns symbol table singleton.
  *
@@ -140,7 +121,24 @@ static void GetSymbolTableInit(void) {
  * @return symbol table, or NULL if not found
  */
 struct SymbolTable *GetSymbolTable(void) {
-  static atomic_uint once;
-  cosmo_once(&once, GetSymbolTableInit);
+  struct Zipos *z;
+  if (pthread_spin_trylock(&g_lock))
+    return 0;
+  int e = errno;
+  if (!__symtab && !__isworker) {
+    if (_weaken(__zipos_get) && (z = _weaken(__zipos_get)())) {
+      if ((__symtab = GetSymbolTableFromZip(z))) {
+        __symtab->names =
+            (uint32_t *)((char *)__symtab + __symtab->names_offset);
+        __symtab->name_base =
+            (char *)((char *)__symtab + __symtab->name_base_offset);
+      }
+    }
+    if (!__symtab) {
+      __symtab = GetSymbolTableFromElf();
+    }
+  }
+  errno = e;
+  pthread_spin_unlock(&g_lock);
   return __symtab;
 }
diff --git a/libc/mem/grow.c b/libc/runtime/grow.c
similarity index 95%
rename from libc/mem/grow.c
rename to libc/runtime/grow.c
index 1c69653a7..1b865d499 100644
--- a/libc/mem/grow.c
+++ b/libc/runtime/grow.c
@@ -19,7 +19,8 @@
 #include "ape/sections.internal.h"
 #include "libc/assert.h"
 #include "libc/fmt/conv.h"
-#include "libc/macros.h"
+#include "libc/intrin/weaken.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdckdint.h"
@@ -52,7 +53,7 @@ bool __grow(void *pp, size_t *capacity, size_t itemsize, size_t extra) {
   n1 = *capacity;
   n2 = (*p ? n1 + (n1 >> 1) : MAX(4, INITIAL_CAPACITY / itemsize)) + extra;
   if (!ckd_mul(&t1, n1, itemsize) && !ckd_mul(&t2, n2, itemsize)) {
-    if ((p2 = realloc(p1, ROUNDUP(t2, 32)))) {
+    if (_weaken(realloc) && (p2 = _weaken(realloc)(p1, ROUNDUP(t2, 32)))) {
       if (!p1 && *p)
         memcpy(p2, *p, t1);
       bzero((char *)p2 + t1, t2 - t1);
diff --git a/libc/runtime/hook.greg.c b/libc/runtime/hook.greg.c
index 795512481..16596bf6a 100644
--- a/libc/runtime/hook.greg.c
+++ b/libc/runtime/hook.greg.c
@@ -19,7 +19,7 @@
 #include "ape/sections.internal.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
 
@@ -119,7 +119,6 @@ privileged int __hook(void *dest, struct SymbolTable *st) {
   if (!st)
     return -1;
   __morph_begin();
-  __jit_begin();
   lowest = MAX((intptr_t)__executable_start, (intptr_t)_ereal);
   for (i = 0; i < st->count; ++i) {
     if (st->symbols[i].x < 9)
@@ -139,9 +138,6 @@ privileged int __hook(void *dest, struct SymbolTable *st) {
       // kprintf("can't hook %t at %lx\n", p, p);
     }
   }
-  __clear_cache(MAX((char *)__executable_start, (char *)_ereal),
-                MIN((char *)__privileged_start, (char *)_etext));
-  __jit_end();
   __morph_end();
   return 0;
 }
diff --git a/libc/runtime/inflate.c b/libc/runtime/inflate.c
index f3264a3fe..c7a82fa0b 100644
--- a/libc/runtime/inflate.c
+++ b/libc/runtime/inflate.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
 #include "third_party/puff/puff.h"
diff --git a/libc/runtime/init.S b/libc/runtime/init.S
index 1cf5e035b..a3922476b 100644
--- a/libc/runtime/init.S
+++ b/libc/runtime/init.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/dce.h"
 
@@ -42,7 +42,7 @@
 //	@param	r15 is envp (still callee saved)
 //	@note	rdi is __init_bss_start (callee monotonic lockstep)
 //	@note	rsi is __init_rodata_start (callee monotonic lockstep)
-//	@see	.init.start & .init.end (libc/macros.h)
+//	@see	.init.start & .init.end (libc/macros.internal.h)
 //	@see	ape/ape.lds
 	.section .initprologue,"ax",@progbits
 	.type	_init,@function
@@ -86,7 +86,7 @@ _init_check_rdi_rsi:
 
 //	Decentralized section for packed data structures & initializers.
 //
-//	@see	.initro (libc/macros.h)
+//	@see	.initro (libc/macros.internal.h)
 //	@see	ape/ape.lds
 	.section .initroprologue,"a",@progbits
 	.type	__init_rodata_start,@object
@@ -110,7 +110,7 @@ __init_rodata_end:
 //
 //	Data in this section becomes read-only after initialization.
 //
-//	@see	.piro.bss.init (libc/macros.h)
+//	@see	.piro.bss.init (libc/macros.internal.h)
 //	@see	libc/runtime/piro.c
 //	@see	ape/ape.lds
 	.section .piro.bss.init.1,"aw",@nobits
diff --git a/libc/thread/isstackoverflow.c b/libc/runtime/isstackoverflow.c
similarity index 75%
rename from libc/thread/isstackoverflow.c
rename to libc/runtime/isstackoverflow.c
index 850eb5a60..cb1068a9c 100644
--- a/libc/thread/isstackoverflow.c
+++ b/libc/runtime/isstackoverflow.c
@@ -19,40 +19,21 @@
 #include "libc/calls/struct/siginfo.h"
 #include "libc/calls/struct/ucontext.internal.h"
 #include "libc/calls/ucontext.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/auxv.h"
 #include "libc/sysv/consts/sig.h"
-#include "libc/thread/thread.h"
 
 /**
- * Returns true if signal is caused by stack overflow.
+ * Returns true if signal is most likely a stack overflow.
  */
 char __is_stack_overflow(siginfo_t *si, void *arg) {
-
-  // sanity check
   ucontext_t *uc = arg;
   if (!si || !uc)
     return false;
-  if (si->si_signo != SIGSEGV &&  //
-      si->si_signo != SIGBUS)
+  if (si->si_signo != SIGSEGV && si->si_signo != SIGBUS)
     return false;
-
-  // get stack information
-  pthread_attr_t attr;
-  if (pthread_getattr_np(pthread_self(), &attr))
-    return false;
-  size_t guardsize;
-  if (pthread_attr_getguardsize(&attr, &guardsize))
-    return false;
-  void *stackaddr;
-  size_t stacksize;
-  if (pthread_attr_getstack(&attr, &stackaddr, &stacksize))
-    return false;
-
-  // determine if faulting address is inside guard region
-  char *x = (char *)si->si_addr;
-  char *lo = (char *)stackaddr - guardsize;
-  char *hi = (char *)stackaddr;
-  return lo <= x && x < hi;
+  intptr_t sp = uc->uc_mcontext.SP;
+  intptr_t fp = (intptr_t)si->si_addr;
+  return ABS(fp - sp) < __pagesize;
 }
diff --git a/libc/runtime/jit.c b/libc/runtime/jit.c
index a418f75dc..6ea45ecb5 100644
--- a/libc/runtime/jit.c
+++ b/libc/runtime/jit.c
@@ -20,7 +20,7 @@
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/syslib.internal.h"
 
-privileged void __jit_begin(void) {
+void __jit_begin(void) {
   if (IsXnuSilicon()) {
     if (__syslib->__pthread_jit_write_protect_supported_np()) {
       __syslib->__pthread_jit_write_protect_np(false);
@@ -28,7 +28,7 @@ privileged void __jit_begin(void) {
   }
 }
 
-privileged void __jit_end(void) {
+void __jit_end(void) {
   if (IsXnuSilicon()) {
     if (__syslib->__pthread_jit_write_protect_supported_np()) {
       __syslib->__pthread_jit_write_protect_np(true);
diff --git a/libc/intrin/mapanon.c b/libc/runtime/mapanon.c
similarity index 100%
rename from libc/intrin/mapanon.c
rename to libc/runtime/mapanon.c
diff --git a/libc/intrin/mapshared.c b/libc/runtime/mapshared.c
similarity index 100%
rename from libc/intrin/mapshared.c
rename to libc/runtime/mapshared.c
diff --git a/libc/thread/mapstack.c b/libc/runtime/mapstack.c
similarity index 70%
rename from libc/thread/mapstack.c
rename to libc/runtime/mapstack.c
index 470ab58a6..eccd5cefc 100644
--- a/libc/thread/mapstack.c
+++ b/libc/runtime/mapstack.c
@@ -16,9 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cosmo.h"
-#include "libc/errno.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/syscall-sysv.internal.h"
+#include "libc/dce.h"
+#include "libc/runtime/memtrack.internal.h"
+#include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
+#include "libc/sysv/consts/auxv.h"
+#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/prot.h"
+
+#define MAP_ANON_OPENBSD  0x1000
+#define MAP_STACK_OPENBSD 0x4000
 
 /**
  * Allocates stack.
@@ -34,23 +43,28 @@
  * @return stack bottom address on success, or null w/ errno
  */
 void *NewCosmoStack(void) {
-  void *stackaddr;
-  size_t stacksize = GetStackSize();
-  size_t guardsize = GetGuardSize();
-  errno_t err = cosmo_stack_alloc(&stacksize, &guardsize, &stackaddr);
-  if (!err)
-    return stackaddr;
-  errno = err;
-  return 0;
+  char *p;
+  size_t n = GetStackSize();
+  if ((p = mmap(0, n, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1,
+                0)) != MAP_FAILED) {
+    if (IsOpenbsd() && __sys_mmap(p, n, PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_FIXED | MAP_ANON_OPENBSD |
+                                      MAP_STACK_OPENBSD,
+                                  -1, 0, 0) != p)
+      notpossible;
+    if (mprotect(p, GetGuardSize(), PROT_NONE | PROT_GUARD))
+      notpossible;
+    return p;
+  } else {
+    return 0;
+  }
 }
 
 /**
  * Frees stack.
  *
- * @param stackaddr was allocated by NewCosmoStack()
- * @return 0 on success, or -1 w/ errno
+ * @param stk was allocated by NewCosmoStack()
  */
-int FreeCosmoStack(void *stackaddr) {
-  cosmo_stack_free(stackaddr, GetStackSize(), GetGuardSize());
-  return 0;
+int FreeCosmoStack(void *stk) {
+  return munmap(stk, GetStackSize());
 }
diff --git a/libc/runtime/morph.c b/libc/runtime/morph.c
index c3bcc4ae3..08abcc410 100644
--- a/libc/runtime/morph.c
+++ b/libc/runtime/morph.c
@@ -24,13 +24,12 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/nt/enum/pageflags.h"
 #include "libc/nt/memory.h"
-#include "libc/nt/runtime.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/nr.h"
 #include "libc/sysv/consts/prot.h"
 
-__msabi extern typeof(VirtualProtectEx) *const __imp_VirtualProtectEx;
+__msabi extern typeof(VirtualProtect) *const __imp_VirtualProtect;
 
 __funline void __morph_mprotect(void *addr, size_t size, int prot, int ntprot) {
 #ifdef __x86_64__
@@ -55,7 +54,7 @@ __funline void __morph_mprotect(void *addr, size_t size, int prot, int ntprot) {
     }
 #endif
   } else {
-    __imp_VirtualProtectEx(GetCurrentProcess(), addr, size, ntprot, &op);
+    __imp_VirtualProtect(addr, size, ntprot, &op);
   }
 #elif defined(__aarch64__)
   register long r0 asm("x0") = (long)addr;
diff --git a/libc/runtime/opensymboltable.greg.c b/libc/runtime/opensymboltable.greg.c
index 145c8be21..2a54eb7b6 100644
--- a/libc/runtime/opensymboltable.greg.c
+++ b/libc/runtime/opensymboltable.greg.c
@@ -25,7 +25,7 @@
 #include "libc/intrin/strace.h"
 #include "libc/limits.h"
 #include "libc/log/libfatal.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
@@ -130,8 +130,7 @@ static struct SymbolTable *OpenSymbolTableImpl(const char *filename) {
     ++j;
   }
   t->count = j;
-  if (!IsWindows())
-    munmap(stp, sizeof(const Elf64_Sym *) * n);
+  munmap(stp, sizeof(const Elf64_Sym *) * n);
   munmap(map, filesize);
   close(fd);
   return t;
@@ -145,8 +144,9 @@ RaiseEnoexec:
   errno = ENOEXEC;
 SystemError:
   STRACE("OpenSymbolTable()% m");
-  if (map != MAP_FAILED)
+  if (map != MAP_FAILED) {
     munmap(map, filesize);
+  }
   close(fd);
   return 0;
 }
diff --git a/libc/runtime/progname.S b/libc/runtime/progname.S
index e98f0fb3c..c2f807dcc 100644
--- a/libc/runtime/progname.S
+++ b/libc/runtime/progname.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Provides argv[0] The BSD Way.
 	.initbss 300,_init___progname
diff --git a/libc/runtime/runtime.h b/libc/runtime/runtime.h
index 4ea96a3cc..2492fc95a 100644
--- a/libc/runtime/runtime.h
+++ b/libc/runtime/runtime.h
@@ -4,7 +4,6 @@ COSMOPOLITAN_C_START_
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § runtime                                                   ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
-/* clang-format off */
 
 #ifdef __x86_64__
 typedef long jmp_buf[8];
@@ -23,9 +22,11 @@ typedef unsigned long jmp_buf[26];
 void mcount(void) libcesque;
 int daemon(int, int) libcesque;
 unsigned long getauxval(unsigned long) libcesque;
-int setjmp(jmp_buf) libcesque returnstwice paramsnonnull();
+int setjmp(jmp_buf)
+libcesque returnstwice paramsnonnull();
 void longjmp(jmp_buf, int) libcesque wontreturn paramsnonnull();
-int _setjmp(jmp_buf) libcesque returnstwice paramsnonnull();
+int _setjmp(jmp_buf)
+libcesque returnstwice paramsnonnull();
 int sigsetjmp(sigjmp_buf, int) libcesque returnstwice paramsnonnull();
 void siglongjmp(sigjmp_buf, int) libcesque wontreturn paramsnonnull();
 void _longjmp(jmp_buf, int) libcesque wontreturn paramsnonnull();
@@ -36,7 +37,7 @@ void quick_exit(int) wontreturn;
 void abort(void) wontreturn;
 int atexit(void (*)(void)) paramsnonnull() libcesque;
 char *getenv(const char *) paramsnonnull() __wur nosideeffect libcesque;
-int putenv(char *) libcesque __read_write(1);
+int putenv(char *) libcesque;
 int setenv(const char *, const char *, int) libcesque;
 int unsetenv(const char *) libcesque;
 int clearenv(void) libcesque;
@@ -51,8 +52,8 @@ int munlock(const void *, size_t) libcesque;
 long gethostid(void) libcesque;
 int sethostid(long) libcesque;
 char *getlogin(void) libcesque;
-int getlogin_r(char *, size_t) libcesque __write_only(1, 2);
-int login_tty(int) libcesque __fd_arg(1);
+int getlogin_r(char *, size_t) libcesque;
+int login_tty(int) libcesque;
 int getpagesize(void) pureconst libcesque;
 int getgransize(void) pureconst libcesque;
 int syncfs(int) dontthrow libcesque;
@@ -83,18 +84,18 @@ extern uint64_t kStartTsc;
 extern const char kNtSystemDirectory[];
 extern const char kNtWindowsDirectory[];
 extern size_t __virtualmax;
+extern size_t __virtualsize;
 extern size_t __stackmax;
 extern bool32 __isworker;
 /* utilities */
-void _intsort(int *, size_t) libcesque __read_write(1, 2);
-void _longsort(long *, size_t) libcesque __read_write(1, 2);
+void _intsort(int *, size_t) libcesque;
+void _longsort(long *, size_t) libcesque;
 /* diagnostics */
 void ShowCrashReports(void) libcesque;
 int ftrace_install(void) libcesque;
 int ftrace_enabled(int) libcesque;
 int strace_enabled(int) libcesque;
 void __print_maps(size_t) libcesque;
-void __print_maps_win32(int64_t, const char *, size_t) libcesque;
 void __printargs(const char *) libcesque;
 /* builtin sh-like system/popen dsl */
 int _cocmd(int, char **, char **) libcesque;
@@ -106,20 +107,21 @@ int __open_executable(void) libcesque;
 int verynice(void) libcesque;
 void __warn_if_powersave(void) libcesque;
 void _Exit1(int) libcesque wontreturn libcesque;
-void __paginate(int, const char *) libcesque __fd_arg(1);
-void __paginate_file(int, const char *) libcesque __fd_arg(1);
+void __paginate(int, const char *) libcesque;
+void __paginate_file(int, const char *) libcesque;
 /* memory management */
 void _weakfree(void *) libcesque;
 void *_mapanon(size_t) attributeallocsize((1)) mallocesque libcesque;
 void *_mapshared(size_t) attributeallocsize((1)) mallocesque libcesque;
 void CheckForFileLeaks(void) libcesque;
+void __enable_threads(void) libcesque;
 void __oom_hook(size_t) libcesque;
 /* code morphing */
 void __morph_begin(void) libcesque;
 void __morph_end(void) libcesque;
 void __jit_begin(void) libcesque;
 void __jit_end(void) libcesque;
-void __clear_cache(void *, void *);
+void __clear_cache(void *, void *) libcesque;
 /* portability */
 bool32 IsGenuineBlink(void) libcesque;
 bool32 IsCygwin(void) libcesque;
diff --git a/libc/runtime/set_tls.c b/libc/runtime/set_tls.c
index c8385bacc..0ed3609d0 100644
--- a/libc/runtime/set_tls.c
+++ b/libc/runtime/set_tls.c
@@ -24,6 +24,7 @@
 #include "libc/nt/thread.h"
 #include "libc/sysv/consts/arch.h"
 #include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 
 #define AMD64_SET_FSBASE 129
 #define AMD64_SET_GSBASE 131
diff --git a/libc/runtime/sigsetjmp.S b/libc/runtime/sigsetjmp.S
index 3187bd295..98598bd50 100644
--- a/libc/runtime/sigsetjmp.S
+++ b/libc/runtime/sigsetjmp.S
@@ -25,11 +25,11 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Saves caller CPU state and signal mask.
 //
-//	@param	rdi points to sigjmp_buf
+//	@param	rdi points to jmp_buf
 //	@param	esi if non-zero will cause mask to be saved
 //	@return	eax 0 when set and !0 when longjmp'd
 //	@returnstwice
diff --git a/libc/runtime/stack.h b/libc/runtime/stack.h
index d526bb3da..8a8c5d934 100644
--- a/libc/runtime/stack.h
+++ b/libc/runtime/stack.h
@@ -6,11 +6,7 @@
 /**
  * Returns preferred size and alignment of thread stack.
  */
-#ifndef MODE_DBG
 #define GetStackSize() 81920
-#else
-#define GetStackSize() 163840
-#endif
 
 /**
  * Returns preferred stack guard size.
@@ -69,8 +65,9 @@ uintptr_t GetStackBottom(void) pureconst;
  * will also trigger the stack to grow down safely.
  */
 forceinline void CheckLargeStackAllocation(void *p, ssize_t n) {
-  for (; n > 0; n -= 4096)
-    ((volatile char *)p)[n - 1] = 0;
+  for (; n > 0; n -= 4096) {
+    ((char *)p)[n - 1] = 0;
+  }
 }
 
 void *NewCosmoStack(void) vallocesque;
diff --git a/libc/runtime/straceinit.greg.c b/libc/runtime/straceinit.greg.c
index 92bf2ce18..151e3449a 100644
--- a/libc/runtime/straceinit.greg.c
+++ b/libc/runtime/straceinit.greg.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
 #include "libc/intrin/getenv.h"
 #include "libc/intrin/safemacros.h"
 #include "libc/log/libfatal.internal.h"
diff --git a/libc/runtime/sysconf.c b/libc/runtime/sysconf.c
index 4f3dafd75..f0c32dd90 100644
--- a/libc/runtime/sysconf.c
+++ b/libc/runtime/sysconf.c
@@ -24,7 +24,7 @@
 #include "libc/dce.h"
 #include "libc/intrin/maps.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/clktck.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/sysconf.h"
@@ -45,8 +45,8 @@
  * - `_SC_GRANSIZE` returns addr alignment for mmap()
  * - `_SC_CLK_TCK` returns number of clock ticks per second
  * - `_SC_ARG_MAX` will perform expensive rlimit calculations
- * - `_SC_SIGSTKSZ` returns recommended `SIGSTKSZ` for platform
- * - `_SC_MINSIGSTKSZ` returns size of kernel pushed signal frame
+ * - `_SC_SIGSTKSZ` returns host platform's preferred SIGSTKSZ
+ * - `_SC_MINSIGSTKSZ` returns host platform's required MINSIGSTKSZ
  * - `_SC_AVPHYS_PAGES` returns average physical memory pages
  * - `_SC_PHYS_PAGES` returns physical memory pages available
  * - `_SC_NPROCESSORS_ONLN` returns number of effective CPUs
@@ -67,7 +67,7 @@ long sysconf(int name) {
     case _SC_ARG_MAX:
       return __get_arg_max();
     case _SC_SIGSTKSZ:
-      return __get_minsigstksz() + SIGSTKSZ;
+      return _SIGSTKSZ;
     case _SC_MINSIGSTKSZ:
       return __get_minsigstksz();
     case _SC_CHILD_MAX:
diff --git a/libc/runtime/syslib.internal.h b/libc/runtime/syslib.internal.h
index 424034537..90ed2994f 100644
--- a/libc/runtime/syslib.internal.h
+++ b/libc/runtime/syslib.internal.h
@@ -82,7 +82,6 @@ struct Syslib {
   char *(*__dlerror)(void);
   /* v9 (2024-01-31) */
   int (*__pthread_cpu_number_np)(size_t *);
-  /* v10 (2024-05-02) */
   long (*__sysctl)(int *, unsigned, void *, size_t *, void *, size_t);
   long (*__sysctlbyname)(const char *, void *, size_t *, void *, size_t);
   long (*__sysctlnametomib)(const char *, int *, size_t *);
diff --git a/libc/runtime/valist.c b/libc/runtime/valist.c
index b996e2732..47294559b 100644
--- a/libc/runtime/valist.c
+++ b/libc/runtime/valist.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /* <sync libc/integral/lp64arg.inc> */
 struct __va_list {
diff --git a/libc/runtime/winmain.greg.c b/libc/runtime/winmain.greg.c
index 640314f93..77ebd63c2 100644
--- a/libc/runtime/winmain.greg.c
+++ b/libc/runtime/winmain.greg.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
-#include "libc/atomic.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
@@ -26,12 +25,11 @@
 #include "libc/intrin/nomultics.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/rdtsc.h"
 #include "libc/nt/accounting.h"
 #include "libc/nt/console.h"
 #include "libc/nt/enum/consolemodeflags.h"
-#include "libc/nt/enum/creationdisposition.h"
 #include "libc/nt/enum/filemapflags.h"
 #include "libc/nt/enum/pageflags.h"
 #include "libc/nt/files.h"
@@ -52,7 +50,6 @@
 #include "libc/sock/internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/prot.h"
-#include "libc/thread/tls.h"
 #ifdef __x86_64__
 
 #define abi __msabi textwindows dontinstrument
@@ -79,7 +76,7 @@ __msabi extern typeof(SetConsoleMode) *const __imp_SetConsoleMode;
 __msabi extern typeof(SetConsoleOutputCP) *const __imp_SetConsoleOutputCP;
 __msabi extern typeof(SetEnvironmentVariable) *const __imp_SetEnvironmentVariableW;
 __msabi extern typeof(SetStdHandle) *const __imp_SetStdHandle;
-__msabi extern typeof(VirtualProtectEx) *const __imp_VirtualProtectEx;
+__msabi extern typeof(VirtualProtect) *const __imp_VirtualProtect;
 __msabi extern typeof(WriteFile) *const __imp_WriteFile;
 // clang-format on
 
@@ -88,15 +85,11 @@ void __stack_call(int, char **, char **, long (*)[2],
                   void (*)(int, char **, char **, long (*)[2]),
                   intptr_t) wontreturn;
 
-bool __winmain_isfork;
-intptr_t __winmain_jmpbuf[5];
-struct CosmoTib *__winmain_tib;
-
 __funline int IsAlpha(int c) {
   return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
 }
 
-abi static char16_t *StrStr(const char16_t *haystack, const char16_t *needle) {
+static abi char16_t *StrStr(const char16_t *haystack, const char16_t *needle) {
   size_t i;
   for (;;) {
     for (i = 0;; ++i) {
@@ -113,13 +106,13 @@ abi static char16_t *StrStr(const char16_t *haystack, const char16_t *needle) {
   return 0;
 }
 
-abi static void PrintError(const char *s, size_t n) {
+static abi void PrintError(const char *s, size_t n) {
 #define PrintError(s) PrintError(s, sizeof(s) - 1)
   __imp_WriteFile(__imp_GetStdHandle(kNtStdErrorHandle), s, n, 0, 0);
 }
 
 // detect the unholiest of environments
-abi static bool32 IsWslChimera(void) {
+static abi bool32 IsWslChimera(void) {
   char16_t path[PATH_MAX];
   return __imp_GetCurrentDirectoryW(PATH_MAX, path) &&  //
          path[0] == '\\' &&                             //
@@ -130,7 +123,7 @@ abi static bool32 IsWslChimera(void) {
 }
 
 // returns true if utf-8 path is a win32-style path that exists
-abi static bool32 WinFileExists(const char *path) {
+static abi bool32 WinFileExists(const char *path) {
   uint16_t path16[PATH_MAX];
   size_t z = ARRAYLEN(path16);
   size_t n = tprecode8to16(path16, z, path).ax;
@@ -140,7 +133,7 @@ abi static bool32 WinFileExists(const char *path) {
 }
 
 // this ensures close(1) won't accidentally close(2) for example
-abi static void DeduplicateStdioHandles(void) {
+static abi void DeduplicateStdioHandles(void) {
   for (long i = 0; i < 3; ++i) {
     int64_t h1 = __imp_GetStdHandle(kNtStdio[i]);
     for (long j = i + 1; j < 3; ++j) {
@@ -155,19 +148,19 @@ abi static void DeduplicateStdioHandles(void) {
   }
 }
 
-abi static bool32 HasEnvironmentVariable(const char16_t *name) {
+static bool32 HasEnvironmentVariable(const char16_t *name) {
   char16_t buf[4];
   return __imp_GetEnvironmentVariableW(name, buf, ARRAYLEN(buf));
 }
 
-abi static unsigned OnWinCrash(struct NtExceptionPointers *ep) {
+static abi unsigned OnWinCrash(struct NtExceptionPointers *ep) {
   int code, sig = __sig_crash_sig(ep->ExceptionRecord->ExceptionCode, &code);
   TerminateThisProcess(sig);
 }
 
 // main function of windows init process
 // i.e. first process spawned that isn't forked
-abi wontreturn static void WinInit(const char16_t *cmdline) {
+static abi wontreturn void WinInit(const char16_t *cmdline) {
   __oldstack = (intptr_t)__builtin_frame_address(0);
 
   __imp_SetConsoleOutputCP(kNtCpUtf8);
@@ -206,12 +199,11 @@ abi wontreturn static void WinInit(const char16_t *cmdline) {
   int stackprot = (intptr_t)ape_stack_prot;
   if (~stackprot & PROT_EXEC) {
     uint32_t old;
-    __imp_VirtualProtectEx(GetCurrentProcess(), stackaddr, stacksize,
-                           kNtPageReadwrite, &old);
+    __imp_VirtualProtect(stackaddr, stacksize, kNtPageReadwrite, &old);
   }
   uint32_t oldattr;
-  __imp_VirtualProtectEx(GetCurrentProcess(), stackaddr, GetGuardSize(),
-                         kNtPageReadwrite | kNtPageGuard, &oldattr);
+  __imp_VirtualProtect(stackaddr, GetGuardSize(),
+                       kNtPageReadwrite | kNtPageGuard, &oldattr);
   if (_weaken(__maps_stack)) {
     struct NtSystemInfo si;
     __imp_GetSystemInfo(&si);
@@ -301,51 +293,18 @@ abi wontreturn static void WinInit(const char16_t *cmdline) {
                 ARRAYLEN(wa->envp) - 1);
   __imp_FreeEnvironmentStringsW(env16);
   __envp = &wa->envp[0];
+
   // handover control to cosmopolitan runtime
   __stack_call(count, wa->argv, wa->envp, wa->auxv, cosmo,
                (uintptr_t)(stackaddr + (stacksize - sizeof(struct WinArgs))));
 }
 
-static int Atoi(const char16_t *str) {
-  int c;
-  unsigned x = 0;
-  while ((c = *str++)) {
-    if ('0' <= c && c <= '9') {
-      x *= 10;
-      x += c - '0';
-    } else {
-      return -1;
-    }
-  }
-  return x;
-}
-
-abi static int WinGetPid(const char16_t *var, bool *out_is_inherited) {
-  uint32_t len;
-  char16_t val[12];
-  if ((len = __imp_GetEnvironmentVariableW(var, val, ARRAYLEN(val)))) {
-    int pid = -1;
-    if (len < ARRAYLEN(val))
-      pid = Atoi(val);
-    __imp_SetEnvironmentVariableW(var, NULL);
-    if (pid > 0) {
-      *out_is_inherited = true;
-      return pid;
-    }
-  }
-  *out_is_inherited = false;
-  return __imp_GetCurrentProcessId();
-}
-
 abi int64_t WinMain(int64_t hInstance, int64_t hPrevInstance,
                     const char *lpCmdLine, int64_t nCmdShow) {
-  static atomic_ulong fake_process_signals;
   const char16_t *cmdline;
   extern char os asm("__hostos");
   os = _HOSTWINDOWS;  // madness https://news.ycombinator.com/item?id=21019722
   kStartTsc = rdtsc();
-  __tls_enabled = false;
-  ftrace_enabled(-1);
   if (!IsTiny() && IsWslChimera()) {
     PrintError("error: APE is running on WIN32 inside WSL. You need to run: "
                "sudo sh -c 'echo -1 > /proc/sys/fs/binfmt_misc/WSLInterop'\n");
@@ -355,24 +314,21 @@ abi int64_t WinMain(int64_t hInstance, int64_t hPrevInstance,
   __imp_GetSystemInfo(&si);
   __pagesize = si.dwPageSize;
   __gransize = si.dwAllocationGranularity;
-  bool pid_is_inherited;
-  __pid = WinGetPid(u"_COSMO_PID", &pid_is_inherited);
-  if (!(__sig.process = __sig_map_process(__pid, kNtOpenAlways)))
-    __sig.process = &fake_process_signals;
-  if (__winmain_isfork)
-    __builtin_longjmp(__winmain_jmpbuf, 1);
-  if (!pid_is_inherited)
-    atomic_store_explicit(__sig.process, 0, memory_order_release);
+  __umask = 077;
+  __pid = __imp_GetCurrentProcessId();
   cmdline = __imp_GetCommandLineW();
 #if SYSDEBUG
   // sloppy flag-only check for early initialization
   if (StrStr(cmdline, u"--strace"))
     ++__strace;
 #endif
-  ftrace_enabled(+1);
-  if (_weaken(WinSockInit))
+  if (_weaken(WinSockInit)) {
     _weaken(WinSockInit)();
+  }
   DeduplicateStdioHandles();
+  if (_weaken(WinMainForked)) {
+    _weaken(WinMainForked)();
+  }
   WinInit(cmdline);
 }
 
diff --git a/libc/runtime/zipos-access.c b/libc/runtime/zipos-access.c
index eb3807b0f..291000d27 100644
--- a/libc/runtime/zipos-access.c
+++ b/libc/runtime/zipos-access.c
@@ -22,7 +22,7 @@
 #include "libc/sysv/consts/ok.h"
 #include "libc/sysv/consts/s.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 // TODO: this should check parent directory components
 
diff --git a/libc/runtime/zipos-find.c b/libc/runtime/zipos-find.c
index 7431c5e9a..2fc30d442 100644
--- a/libc/runtime/zipos-find.c
+++ b/libc/runtime/zipos-find.c
@@ -16,12 +16,13 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/intrin/kprintf.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/zipos.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/s.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 static ssize_t __zipos_match(struct Zipos *z, struct ZiposUri *name, int len,
                              int i) {
@@ -43,8 +44,9 @@ ssize_t __zipos_scan(struct Zipos *zipos, struct ZiposUri *name) {
 
   // strip trailing slash from search name
   int len = name->len;
-  if (len && name->path[len - 1] == '/')
+  if (len && name->path[len - 1] == '/') {
     --len;
+  }
 
   // empty string means the /zip root directory
   if (!len) {
@@ -89,8 +91,9 @@ ssize_t __zipos_scan(struct Zipos *zipos, struct ZiposUri *name) {
       dx = dx < -1 ? -1 : dx;
       for (l += dx; 0 <= l && l < zipos->records; l += dx) {
         ssize_t cf;
-        if ((cf = __zipos_match(zipos, name, len, l)) != -1)
+        if ((cf = __zipos_match(zipos, name, len, l)) != -1) {
           return cf;
+        }
         cfile = zipos->index[l];
         zname = ZIP_CFILE_NAME(zipos->map + cfile);
         zsize = ZIP_CFILE_NAMESIZE(zipos->map + cfile);
diff --git a/libc/runtime/zipos-get.c b/libc/runtime/zipos-get.c
index e98b6b363..e3615c5f4 100644
--- a/libc/runtime/zipos-get.c
+++ b/libc/runtime/zipos-get.c
@@ -21,12 +21,11 @@
 #include "libc/calls/metalfile.internal.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/cosmo.h"
-#include "libc/dce.h"
 #include "libc/fmt/conv.h"
 #include "libc/intrin/cmpxchg.h"
 #include "libc/intrin/promises.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/zipos.internal.h"
@@ -38,7 +37,7 @@
 #include "libc/sysv/consts/posix.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/thread/thread.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 #ifdef __x86_64__
 __static_yoink(APE_COM_NAME);
@@ -64,8 +63,14 @@ static void __zipos_dismiss(uint8_t *map, const uint8_t *cdir, long pg) {
 
   // unmap the executable portion beneath the local files
   mo = ROUNDDOWN(lo, __gransize);
-  if (mo && !IsWindows())
+  if (mo)
     munmap(map, mo);
+
+  // this is supposed to reduce our rss usage but does it really?
+  lo = ROUNDDOWN(lo, pg);
+  hi = MIN(ROUNDUP(hi, pg), ROUNDDOWN(c, pg));
+  if (hi > lo)
+    posix_madvise(map + lo, hi - lo, POSIX_MADV_DONTNEED);
 }
 
 static int __zipos_compare_names(const void *a, const void *b, void *c) {
@@ -107,7 +112,7 @@ static void __zipos_init(void) {
   const char *progpath;
   if (!(s = getenv("COSMOPOLITAN_DISABLE_ZIPOS"))) {
     // this environment variable may be a filename or file descriptor
-    if ((progpath = secure_getenv("COSMOPOLITAN_INIT_ZIPOS")) &&
+    if ((progpath = getenv("COSMOPOLITAN_INIT_ZIPOS")) &&
         (x = strtol(progpath, &endptr, 10)) >= 0 && !*endptr) {
       fd = x;
     } else {
diff --git a/libc/runtime/zipos-inode.c b/libc/runtime/zipos-inode.c
index 480d999c4..c4f26ae27 100644
--- a/libc/runtime/zipos-inode.c
+++ b/libc/runtime/zipos-inode.c
@@ -21,7 +21,7 @@
 #include "libc/runtime/zipos.internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 uint64_t __zipos_inode(struct Zipos *zipos, int64_t cfile,  //
                        const void *name, size_t namelen) {
diff --git a/libc/runtime/zipos-mmap.c b/libc/runtime/zipos-mmap.c
index d137d372d..d2551ed3d 100644
--- a/libc/runtime/zipos-mmap.c
+++ b/libc/runtime/zipos-mmap.c
@@ -29,7 +29,7 @@
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/s.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 #define IP(X)  (intptr_t)(X)
 #define VIP(X) (void *)IP(X)
diff --git a/libc/runtime/zipos-notat.c b/libc/runtime/zipos-notat.c
index cebe24757..809d622df 100644
--- a/libc/runtime/zipos-notat.c
+++ b/libc/runtime/zipos-notat.c
@@ -23,8 +23,9 @@
 int __zipos_notat(int dirfd, const char *path) {
   struct ZiposUri zipname;
   if (!path)
-    return 0;
-  if (__isfdkind(dirfd, kFdZip) || __zipos_parseuri(path, &zipname) != -1)
-    return -1;
+    return efault();
+  if (__isfdkind(dirfd, kFdZip) || __zipos_parseuri(path, &zipname) != -1) {
+    return einval();
+  }
   return 0;
 }
diff --git a/libc/runtime/zipos-open.c b/libc/runtime/zipos-open.c
index af01a567c..46707b179 100644
--- a/libc/runtime/zipos-open.c
+++ b/libc/runtime/zipos-open.c
@@ -37,7 +37,7 @@
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/s.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 struct ZiposHandle *__zipos_keep(struct ZiposHandle *h) {
   atomic_fetch_add_explicit(&h->refs, 1, memory_order_relaxed);
diff --git a/libc/runtime/zipos-read.c b/libc/runtime/zipos-read.c
index 9b90c987b..8fae44da4 100644
--- a/libc/runtime/zipos-read.c
+++ b/libc/runtime/zipos-read.c
@@ -27,7 +27,7 @@
 #include "libc/sysv/consts/s.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/tls.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 static ssize_t __zipos_read_impl(struct ZiposHandle *h, const struct iovec *iov,
                                  size_t iovlen, ssize_t opt_offset) {
diff --git a/libc/runtime/zipos-stat-impl.c b/libc/runtime/zipos-stat-impl.c
index ef47bb207..ff0a4b316 100644
--- a/libc/runtime/zipos-stat-impl.c
+++ b/libc/runtime/zipos-stat-impl.c
@@ -24,7 +24,7 @@
 #include "libc/runtime/zipos.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/s.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 int __zipos_stat_impl(struct Zipos *zipos, size_t cf, struct stat *st) {
   size_t lf;
diff --git a/libc/runtime/zipos.S b/libc/runtime/zipos.S
index df0ba39ff..507db9efe 100644
--- a/libc/runtime/zipos.S
+++ b/libc/runtime/zipos.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	static_yoink this symbol for open(/zip/...) support.
 	zipos = 0
diff --git a/libc/sock/BUILD.mk b/libc/sock/BUILD.mk
index 3b8982eae..bd74fe141 100644
--- a/libc/sock/BUILD.mk
+++ b/libc/sock/BUILD.mk
@@ -32,9 +32,9 @@ LIBC_SOCK_A_DIRECTDEPS =			\
 	LIBC_NEXGEN32E				\
 	LIBC_NT_ADVAPI32			\
 	LIBC_NT_IPHLPAPI			\
+	LIBC_NT_IPHLPAPI			\
 	LIBC_NT_KERNEL32			\
 	LIBC_NT_NTDLL				\
-	LIBC_NT_REALTIME			\
 	LIBC_NT_WS2_32				\
 	LIBC_RUNTIME				\
 	LIBC_STDIO				\
diff --git a/libc/sock/accept-nt.c b/libc/sock/accept-nt.c
index 1fef186d8..839553624 100644
--- a/libc/sock/accept-nt.c
+++ b/libc/sock/accept-nt.c
@@ -16,73 +16,115 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/atomic.h"
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
+#include "libc/cosmo.h"
 #include "libc/errno.h"
-#include "libc/nt/errors.h"
-#include "libc/nt/struct/pollfd.h"
+#include "libc/nt/enum/wsaid.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
-#include "libc/sock/syscall_fd.internal.h"
-#include "libc/sysv/consts/fio.h"
+#include "libc/sock/struct/sockaddr.h"
+#include "libc/sock/wsaid.internal.h"
+#include "libc/str/str.h"
 #include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/poll.h"
 #include "libc/sysv/consts/sock.h"
 #include "libc/sysv/consts/sol.h"
-#include "libc/sysv/errfuns.h"
+#include "libc/thread/thread.h"
 #ifdef __x86_64__
 
-__msabi extern typeof(__sys_ioctlsocket_nt) *const __imp_ioctlsocket;
+__msabi extern typeof(__sys_setsockopt_nt) *const __imp_setsockopt;
+__msabi extern typeof(__sys_closesocket_nt) *const __imp_closesocket;
 
-textwindows static int sys_accept_nt_impl(struct Fd *f,
-                                          struct sockaddr_storage *addr,
-                                          int accept4_flags,
-                                          sigset_t waitmask) {
+union AcceptExAddr {
+  struct sockaddr_storage addr;
+  char buf[sizeof(struct sockaddr_storage) + 16];
+};
+
+struct AcceptExBuffer {
+  union AcceptExAddr local;
+  union AcceptExAddr remote;
+};
+
+struct AcceptResources {
   int64_t handle;
-  int client = -1;
+};
 
-  // accepting sockets must always be non-blocking at the os level. this
-  // is because WSAAccept doesn't support overlapped i/o operations. the
-  // AcceptEx function claims to support overlapped i/o however it can't
-  // be canceled by CancelIoEx, which makes it quite useless to us sadly
-  // this can't be called in listen(), because then fork() will break it
-  uint32_t mode = 1;
-  if (__imp_ioctlsocket(f->handle, FIONBIO, &mode))
-    return __winsockerr();
+struct AcceptArgs {
+  int64_t listensock;
+  struct AcceptExBuffer *buffer;
+};
 
-  for (;;) {
+static struct {
+  atomic_uint once;
+  bool32 (*__msabi lpAcceptEx)(
+      int64_t sListenSocket, int64_t sAcceptSocket,
+      void *out_lpOutputBuffer /*[recvlen+local+remoteaddrlen]*/,
+      uint32_t dwReceiveDataLength, uint32_t dwLocalAddressLength,
+      uint32_t dwRemoteAddressLength, uint32_t *out_lpdwBytesReceived,
+      struct NtOverlapped *inout_lpOverlapped);
+} g_acceptex;
 
-    // perform non-blocking accept
-    int32_t addrsize = sizeof(*addr);
-    struct sockaddr *paddr = (struct sockaddr *)addr;
-    if ((handle = WSAAccept(f->handle, paddr, &addrsize, 0, 0)) != -1)
-      break;
+static void acceptex_init(void) {
+  static struct NtGuid AcceptExGuid = WSAID_ACCEPTEX;
+  g_acceptex.lpAcceptEx = __get_wsaid(&AcceptExGuid);
+}
 
-    // return on genuine errors
-    uint32_t err = WSAGetLastError();
-    if (err != WSAEWOULDBLOCK) {
-      errno = __dos2errno(err);
-      if (errno == ECONNRESET)
-        errno = ECONNABORTED;
-      return -1;
-    }
-
-    // check for non-blocking
-    if (f->flags & O_NONBLOCK)
-      return eagain();
-
-    // check for signals and thread cancelation
-    // accept() will restart if SA_RESTART is used
-    if (__sigcheck(waitmask, true) == -1)
-      return -1;
-
-    // time to block
-    struct sys_pollfd_nt fds[1] = {{f->handle, POLLIN}};
-    if (WSAPoll(fds, 1, POLL_INTERVAL_MS) == -1)
-      return __winsockerr();
+static void sys_accept_nt_unwind(void *arg) {
+  struct AcceptResources *resources = arg;
+  if (resources->handle != -1) {
+    __imp_closesocket(resources->handle);
   }
+}
+
+static int sys_accept_nt_start(int64_t handle, struct NtOverlapped *overlap,
+                               uint32_t *flags, void *arg) {
+  struct AcceptArgs *args = arg;
+  cosmo_once(&g_acceptex.once, acceptex_init);
+  if (g_acceptex.lpAcceptEx(args->listensock, handle, args->buffer, 0,
+                            sizeof(args->buffer->local),
+                            sizeof(args->buffer->remote), 0, overlap)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+textwindows int sys_accept_nt(struct Fd *f, struct sockaddr_storage *addr,
+                              int accept4_flags) {
+  int client = -1;
+  sigset_t m = __sig_block();
+  struct AcceptResources resources = {-1};
+  pthread_cleanup_push(sys_accept_nt_unwind, &resources);
+
+  // creates resources for child socket
+  // inherit the listener configuration
+  if ((resources.handle = WSASocket(f->family, f->type, f->protocol, 0, 0,
+                                    kNtWsaFlagOverlapped)) == -1) {
+    client = __winsockerr();
+    goto Finish;
+  }
+
+  // accept network connection
+  // this operation can re-enter, interrupt, cancel, block, timeout, etc.
+  struct AcceptExBuffer buffer;
+  ssize_t bytes_received = __winsock_block(
+      resources.handle, 0, !!(f->flags & O_NONBLOCK), f->rcvtimeo, m,
+      sys_accept_nt_start, &(struct AcceptArgs){f->handle, &buffer});
+  if (bytes_received == -1) {
+    __imp_closesocket(resources.handle);
+    goto Finish;
+  }
+
+  // inherit properties of listening socket
+  // errors ignored as if f->handle was created before forking
+  // this fails with WSAENOTSOCK, see
+  // https://github.com/jart/cosmopolitan/issues/1174
+  __imp_setsockopt(resources.handle, SOL_SOCKET, kNtSoUpdateAcceptContext,
+                   &f->handle, sizeof(f->handle));
 
   // create file descriptor for new socket
   // don't inherit the file open mode bits
@@ -99,18 +141,18 @@ textwindows static int sys_accept_nt_impl(struct Fd *f,
   g_fds.p[client].protocol = f->protocol;
   g_fds.p[client].sndtimeo = f->sndtimeo;
   g_fds.p[client].rcvtimeo = f->rcvtimeo;
-  g_fds.p[client].handle = handle;
+  g_fds.p[client].handle = resources.handle;
+  resources.handle = -1;
+  memcpy(addr, &buffer.remote.addr, sizeof(*addr));
   g_fds.p[client].kind = kFdSocket;
+
+Finish:
+  pthread_cleanup_pop(false);
+  __sig_unblock(m);
+  if (client == -1 && errno == ECONNRESET) {
+    errno = ECONNABORTED;
+  }
   return client;
 }
 
-textwindows int sys_accept_nt(struct Fd *f, struct sockaddr_storage *addr,
-                              int accept4_flags) {
-  int rc;
-  BLOCK_SIGNALS;
-  rc = sys_accept_nt_impl(f, addr, accept4_flags, _SigMask);
-  ALLOW_SIGNALS;
-  return rc;
-}
-
 #endif /* __x86_64__ */
diff --git a/libc/sock/accept.c b/libc/sock/accept.c
index 02f5fba31..882f38752 100644
--- a/libc/sock/accept.c
+++ b/libc/sock/accept.c
@@ -22,9 +22,6 @@
 /**
  * Creates client socket file descriptor for incoming connection.
  *
- * On Windows, when this function blocks, there may be a 10 millisecond
- * delay on the handling of signals or thread cancelation.
- *
  * @param fd is the server socket file descriptor
  * @param opt_out_addr will receive the remote address
  * @param opt_inout_addrsize provides and receives addr's byte length
diff --git a/libc/sock/accept4.c b/libc/sock/accept4.c
index ded323600..9b44071cc 100644
--- a/libc/sock/accept4.c
+++ b/libc/sock/accept4.c
@@ -30,23 +30,12 @@
 /**
  * Creates client socket file descriptor for incoming connection.
  *
- * When `fd` is in `O_NONBLOCK` mode, this function will raise `EAGAIN`
- * when no client is available to accept. To wait until a client exists
- * the poll() function may be called using `POLLIN`.
- *
- * On Linux, your `SO_RCVTIMEO` will timeout accept4(). Other OSes (i.e.
- * Windows, MacOS, and BSDs) do not support this and will block forever.
- *
- * On Windows, when this function blocks, there may be a 10 millisecond
- * delay on the handling of signals or thread cancelation.
- *
  * @param fd is the server socket file descriptor
  * @param opt_out_addr will receive the remote address
  * @param opt_inout_addrsize provides and receives out_addr's byte length
  * @param flags can have SOCK_{CLOEXEC,NONBLOCK}, which may apply to
  *     both the newly created socket and the server one
  * @return client fd which needs close(), or -1 w/ errno
- * @raise EAGAIN if `O_NONBLOCK` and no clients pending
  * @cancelationpoint
  * @asyncsignalsafe
  * @restartable (unless SO_RCVTIMEO)
diff --git a/libc/sock/closesocket-nt.c b/libc/sock/closesocket-nt.c
index 1084ac363..ed6d41e97 100644
--- a/libc/sock/closesocket-nt.c
+++ b/libc/sock/closesocket-nt.c
@@ -16,6 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/weaken.h"
+#include "libc/mem/mem.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
@@ -30,6 +32,9 @@ __msabi extern typeof(__sys_closesocket_nt) *const __imp_closesocket;
  * This function should only be called by close().
  */
 textwindows int sys_closesocket_nt(struct Fd *f) {
+  if (_weaken(sys_connect_nt_cleanup)) {
+    _weaken(sys_connect_nt_cleanup)(f, true);
+  }
   if (!__imp_closesocket(f->handle)) {
     return 0;
   } else {
diff --git a/libc/sock/connect-nt.c b/libc/sock/connect-nt.c
index 473066d0e..6fbd14937 100644
--- a/libc/sock/connect-nt.c
+++ b/libc/sock/connect-nt.c
@@ -16,46 +16,90 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/internal.h"
-#include "libc/calls/struct/sigset.h"
+#include "libc/assert.h"
+#include "libc/atomic.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
+#include "libc/cosmo.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/mem/mem.h"
+#include "libc/nt/enum/wsaid.h"
 #include "libc/nt/errors.h"
-#include "libc/nt/struct/fdset.h"
-#include "libc/nt/struct/pollfd.h"
-#include "libc/nt/struct/timeval.h"
+#include "libc/nt/struct/guid.h"
+#include "libc/nt/struct/overlapped.h"
+#include "libc/nt/thread.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/struct/sockaddr.h"
 #include "libc/sock/syscall_fd.internal.h"
-#include "libc/sysv/consts/fio.h"
+#include "libc/sock/wsaid.internal.h"
 #include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/poll.h"
-#include "libc/sysv/consts/so.h"
 #include "libc/sysv/consts/sol.h"
 #include "libc/sysv/errfuns.h"
+
 #ifdef __x86_64__
+#include "libc/sock/yoink.inc"
 
-#define UNCONNECTED 0
-#define CONNECTING  1
-#define CONNECTED   2
+__msabi extern typeof(__sys_setsockopt_nt) *const __imp_setsockopt;
 
-__msabi extern typeof(__sys_getsockopt_nt) *const __imp_getsockopt;
-__msabi extern typeof(__sys_ioctlsocket_nt) *const __imp_ioctlsocket;
-__msabi extern typeof(__sys_select_nt) *const __imp_select;
+struct ConnectArgs {
+  const void *addr;
+  uint32_t addrsize;
+};
 
-textwindows static int sys_connect_nt_impl(struct Fd *f, const void *addr,
-                                           uint32_t addrsize,
-                                           sigset_t waitmask) {
+static struct {
+  atomic_uint once;
+  bool32 (*__msabi lpConnectEx)(int64_t hSocket, const struct sockaddr *name,
+                                int namelen, const void *opt_lpSendBuffer,
+                                uint32_t dwSendDataLength,
+                                uint32_t *opt_out_lpdwBytesSent,
+                                struct NtOverlapped *lpOverlapped);
+} g_connectex;
 
-  // check if already connected
-  if (f->connecting == 2)
-    return eisconn();
+static void connectex_init(void) {
+  static struct NtGuid ConnectExGuid = WSAID_CONNECTEX;
+  g_connectex.lpConnectEx = __get_wsaid(&ConnectExGuid);
+}
 
-  // winsock requires bind() be called beforehand
+void sys_connect_nt_cleanup(struct Fd *f, bool cancel) {
+  struct NtOverlapped *overlap;
+  if ((overlap = f->connect_op)) {
+    uint32_t got, flags;
+    if (cancel)
+      CancelIoEx(f->handle, overlap);
+    if (WSAGetOverlappedResult(f->handle, overlap, &got, cancel, &flags) ||
+        WSAGetLastError() != kNtErrorIoIncomplete) {
+      WSACloseEvent(overlap->hEvent);
+      free(overlap);
+      f->connect_op = 0;
+    }
+  }
+}
+
+static int sys_connect_nt_start(int64_t hSocket,
+                                struct NtOverlapped *lpOverlapped,
+                                uint32_t *flags, void *arg) {
+  struct ConnectArgs *args = arg;
+  if (g_connectex.lpConnectEx(hSocket, args->addr, args->addrsize, 0, 0, 0,
+                              lpOverlapped)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static textwindows int sys_connect_nt_impl(struct Fd *f, const void *addr,
+                                           uint32_t addrsize, sigset_t mask) {
+
+  // get connect function from winsock api
+  cosmo_once(&g_connectex.once, connectex_init);
+
+  // fail if previous connect() is still in progress
+  if (f->connect_op)
+    return ealready();
+
+  // ConnectEx() requires bind() be called beforehand
   if (!f->isbound) {
     struct sockaddr_storage ss = {0};
     ss.ss_family = ((struct sockaddr *)addr)->sa_family;
@@ -63,121 +107,55 @@ textwindows static int sys_connect_nt_impl(struct Fd *f, const void *addr,
       return -1;
   }
 
-  if (f->connecting == UNCONNECTED) {
-
-    // make sure winsock is in non-blocking mode
-    uint32_t mode = 1;
-    if (__imp_ioctlsocket(f->handle, FIONBIO, &mode))
-      return __winsockerr();
-
-    // perform non-blocking connect
-    if (!WSAConnect(f->handle, addr, addrsize, 0, 0, 0, 0)) {
-      f->connecting = CONNECTED;
-      return 0;
-    }
-
-    // check for errors
-    switch (WSAGetLastError()) {
-      case WSAEISCONN:
-        f->connecting = CONNECTED;
-        return eisconn();
-      case WSAEALREADY:
-        f->connecting = CONNECTING;
-        break;
-      case WSAEWOULDBLOCK:
-        break;
-      default:
-        return __winsockerr();
-    }
-
-    // handle non-blocking
-    if (f->flags & O_NONBLOCK) {
-      if (f->connecting == UNCONNECTED) {
-        f->connecting = CONNECTING;
-        return einprogress();
-      } else {
-        return ealready();
-      }
-    } else {
-      f->connecting = CONNECTING;
+  // perform normal connect
+  if (!(f->flags & O_NONBLOCK)) {
+    ssize_t rc = __winsock_block(f->handle, 0, false, f->sndtimeo, mask,
+                                 sys_connect_nt_start,
+                                 &(struct ConnectArgs){addr, addrsize});
+    if (rc == -1 && errno == EAGAIN) {
+      // return ETIMEDOUT if SO_SNDTIMEO elapsed
+      // note that Linux will return EINPROGRESS
+      errno = etimedout();
+    } else if (!rc) {
+      __imp_setsockopt(f->handle, SOL_SOCKET, kNtSoUpdateConnectContext, 0, 0);
     }
+    return rc;
   }
 
-  for (;;) {
-
-    // check for signals and thread cancelation
-    // connect() will restart if SA_RESTART is used
-    if (!(f->flags & O_NONBLOCK))
-      if (__sigcheck(waitmask, true) == -1)
-        return -1;
-
-    //
-    // "Use select to determine the completion of the connection request
-    //  by checking if the socket is writable."
-    //
-    //                  —Quoth MSDN § WSAConnect function
-    //
-    // "If a socket is processing a connect call (nonblocking), failure
-    //  of the connect attempt is indicated in exceptfds (application
-    //  must then call getsockopt SO_ERROR to determine the error value
-    //  to describe why the failure occurred). This document does not
-    //  define which other errors will be included."
-    //
-    //                  —Quoth MSDN § select function
-    //
-    struct NtFdSet wrfds;
-    struct NtFdSet exfds;
-    struct NtTimeval timeout;
-    wrfds.fd_count = 1;
-    wrfds.fd_array[0] = f->handle;
-    exfds.fd_count = 1;
-    exfds.fd_array[0] = f->handle;
-    if (f->flags & O_NONBLOCK) {
-      timeout.tv_sec = 0;
-      timeout.tv_usec = 0;
-    } else {
-      timeout.tv_sec = POLL_INTERVAL_MS / 1000;
-      timeout.tv_usec = POLL_INTERVAL_MS % 1000 * 1000;
-    }
-    int ready = __imp_select(1, 0, &wrfds, &exfds, &timeout);
-    if (ready == -1)
+  // perform nonblocking connect(), i.e.
+  // 1. connect(O_NONBLOCK) → EINPROGRESS
+  // 2. poll(POLLOUT)
+  bool32 ok;
+  struct NtOverlapped *overlap = calloc(1, sizeof(struct NtOverlapped));
+  if (!overlap)
+    return -1;
+  overlap->hEvent = WSACreateEvent();
+  ok = g_connectex.lpConnectEx(f->handle, addr, addrsize, 0, 0, 0, overlap);
+  if (ok) {
+    uint32_t dwBytes, dwFlags;
+    ok = WSAGetOverlappedResult(f->handle, overlap, &dwBytes, false, &dwFlags);
+    WSACloseEvent(overlap->hEvent);
+    free(overlap);
+    if (!ok) {
       return __winsockerr();
-
-    // check if we still need more time
-    if (!ready) {
-      if (f->flags & O_NONBLOCK) {
-        return etimedout();
-      } else {
-        continue;
-      }
     }
-
-    // check if connect failed
-    if (exfds.fd_count) {
-      int err;
-      uint32_t len = sizeof(err);
-      if (__imp_getsockopt(f->handle, SOL_SOCKET, SO_ERROR, &err, &len) == -1)
-        return __winsockerr();
-      if (!err)
-        return eio();  // should be impossible
-      errno = __dos2errno(err);
-      return -1;
-    }
-
-    // handle successful connection
-    if (!wrfds.fd_count)
-      return eio();  // should be impossible
-    f->connecting = CONNECTED;
+    __imp_setsockopt(f->handle, SOL_SOCKET, kNtSoUpdateConnectContext, 0, 0);
     return 0;
+  } else if (WSAGetLastError() == kNtErrorIoPending) {
+    f->connect_op = overlap;
+    return einprogress();
+  } else {
+    WSACloseEvent(overlap->hEvent);
+    free(overlap);
+    return __winsockerr();
   }
 }
 
 textwindows int sys_connect_nt(struct Fd *f, const void *addr,
                                uint32_t addrsize) {
-  int rc;
-  BLOCK_SIGNALS;
-  rc = sys_connect_nt_impl(f, addr, addrsize, _SigMask);
-  ALLOW_SIGNALS;
+  sigset_t mask = __sig_block();
+  int rc = sys_connect_nt_impl(f, addr, addrsize, mask);
+  __sig_unblock(mask);
   return rc;
 }
 
diff --git a/libc/sock/connect.c b/libc/sock/connect.c
index 8426c2102..5f3fd2d7c 100644
--- a/libc/sock/connect.c
+++ b/libc/sock/connect.c
@@ -18,8 +18,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/cp.internal.h"
 #include "libc/calls/internal.h"
-#include "libc/dce.h"
 #include "libc/intrin/fds.h"
+#include "libc/dce.h"
 #include "libc/intrin/strace.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/sock.h"
@@ -31,28 +31,11 @@
 /**
  * Connects socket to remote end.
  *
- * When `fd` is in `O_NONBLOCK` mode, this raises `EINPROGRESS`. To wait
- * for establishment poll() function may be called using `POLLOUT`. Then
- * `SO_ERROR` may be used to check for errors.
- *
- * Connectionless sockets, e.g. UDP, can be connected too. The benefit
- * is not needing to specify the remote address on each send. It also
- * means getsockname() can be called to retrieve routing details.
- *
- * On Linux, your `SO_SNDTIMEO` will timeout connect(). Other OSes (i.e.
- * Windows, MacOS, and BSDs) do not support this and will block forever.
- *
- * On Windows, when this function blocks, there may be a 10 millisecond
- * delay on the handling of signals or thread cancelation.
+ * ProTip: Connectionless sockets, e.g. UDP, can be connected too. The
+ * benefit is not needing to specify the remote address on each send. It
+ * also means getsockname() can be called to retrieve routing details.
  *
  * @return 0 on success or -1 w/ errno
- * @raise EINPROGRESS if `O_NONBLOCK` and connecting process initiated
- * @raise EALREADY if a `O_NONBLOCK` connecting already in flight
- * @raise EADDRINUSE if local address is already in use
- * @raise EINTR if a signal handler was called instead
- * @raise ENETUNREACH if network is unreachable
- * @raise ETIMEDOUT if connection timed out
- * @raise EISCONN if already connected
  * @cancelationpoint
  * @asyncsignalsafe
  * @restartable (unless SO_RCVTIMEO)
diff --git a/libc/sock/epoll.c b/libc/sock/epoll.c
new file mode 100644
index 000000000..818ddf19e
--- /dev/null
+++ b/libc/sock/epoll.c
@@ -0,0 +1,1655 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  wepoll                                                                      │
+│  https://github.com/piscisaureus/wepoll                                      │
+│                                                                              │
+│  Copyright 2012-2020, Bert Belder <bertbelder@gmail.com>                     │
+│  All rights reserved.                                                        │
+│                                                                              │
+│  Redistribution and use in source and binary forms, with or without          │
+│  modification, are permitted provided that the following conditions are      │
+│  met:                                                                        │
+│                                                                              │
+│    * Redistributions of source code must retain the above copyright          │
+│      notice, this list of conditions and the following disclaimer.           │
+│                                                                              │
+│    * Redistributions in binary form must reproduce the above copyright       │
+│      notice, this list of conditions and the following disclaimer in the     │
+│      documentation and/or other materials provided with the distribution.    │
+│                                                                              │
+│  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS         │
+│  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT           │
+│  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR       │
+│  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT        │
+│  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,       │
+│  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT            │
+│  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,       │
+│  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY       │
+│  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         │
+│  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE       │
+│  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.        │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/sock/epoll.h"
+#include "libc/assert.h"
+#include "libc/calls/cp.internal.h"
+#include "libc/calls/internal.h"
+#include "libc/calls/state.internal.h"
+#include "libc/calls/struct/sigset.internal.h"
+#include "libc/calls/syscall_support-sysv.internal.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
+#include "libc/intrin/strace.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/mem/mem.h"
+#include "libc/nt/enum/accessmask.h"
+#include "libc/nt/enum/afd.h"
+#include "libc/nt/enum/filesharemode.h"
+#include "libc/nt/enum/ioctl.h"
+#include "libc/nt/enum/keyedevent.h"
+#include "libc/nt/enum/sio.h"
+#include "libc/nt/enum/status.h"
+#include "libc/nt/enum/wait.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/files.h"
+#include "libc/nt/iocp.h"
+#include "libc/nt/nt/file.h"
+#include "libc/nt/nt/key.h"
+#include "libc/nt/ntdll.h"
+#include "libc/nt/process.h"
+#include "libc/nt/runtime.h"
+#include "libc/nt/struct/afd.h"
+#include "libc/nt/struct/criticalsection.h"
+#include "libc/nt/struct/objectattributes.h"
+#include "libc/nt/struct/overlappedentry.h"
+#include "libc/nt/struct/unicodestring.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/winsock.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sock/internal.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/epoll.h"
+#include "libc/sysv/consts/sig.h"
+#include "libc/sysv/errfuns.h"
+
+/**
+ * @fileoverview epoll
+ *
+ * This is an alternative to poll() that's popular for event driven
+ * network servers that want >10,000 sockets per machine and don't do
+ * cpu bound computations that would otherwise block the event loop.
+ *
+ * This works on Linux and is polyfilled on Windows. It's worth noting
+ * that these polyfills depend on Microsoft's internal APIs. However
+ * these particular NTDLL APIs are also used by libuv, nodejs, etc. so
+ * we're reasonably certain Microsoft has compatibility policies in
+ * place where they've promised not to break them.
+ *
+ * TODO(jart): Polyfill kqueue for XNU/FreeBSD/OpenBSD.
+ */
+
+__notice(wepoll_notice, "\
+wepoll (BSD-2)\n\
+Copyright 2012-2020 Bert Belder\n\
+https://github.com/piscisaureus/wepoll");
+
+#define MAX_GROUP_SIZE 32
+
+#define REFLOCK__REF          0x00000001
+#define REFLOCK__REF_MASK     0x0fffffff
+#define REFLOCK__DESTROY      0x10000000
+#define REFLOCK__DESTROY_MASK 0xf0000000
+#define REFLOCK__POISON       0x300dead0
+
+#define KNOWN_EVENTS                                                   \
+  (EPOLLIN | EPOLLPRI | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | \
+   EPOLLRDBAND | EPOLLWRNORM | EPOLLWRBAND | EPOLLMSG | EPOLLRDHUP)
+
+#define RTL_CONSTANT_STRING(s) \
+  { sizeof(s) - sizeof((s)[0]), sizeof(s), s }
+
+#define RTL_CONSTANT_OBJECT_ATTRIBUTES(ObjectName, Attributes) \
+  { sizeof(struct NtObjectAttributes), 0, ObjectName, Attributes, NULL, NULL }
+
+#define RETURN_MAP_ERROR(value) \
+  do {                          \
+    err_map_win_error();        \
+    return value;               \
+  } while (0)
+
+#define RETURN_SET_ERROR(value, error) \
+  do {                                 \
+    err_set_win_error(error);          \
+    return value;                      \
+  } while (0)
+
+#define CONTAINOF(ptr, type, member) \
+  ((type *)((uintptr_t)(ptr) - offsetof(type, member)))
+
+#define TREE__ROTATE(cis, trans)       \
+  struct TreeNode *p = node;           \
+  struct TreeNode *q = node->trans;    \
+  struct TreeNode *parent = p->parent; \
+  if (parent) {                        \
+    if (parent->left == p)             \
+      parent->left = q;                \
+    else                               \
+      parent->right = q;               \
+  } else {                             \
+    tree->root = q;                    \
+  }                                    \
+  q->parent = parent;                  \
+  p->parent = q;                       \
+  p->trans = q->cis;                   \
+  if (p->trans)                        \
+    p->trans->parent = p;              \
+  q->cis = p;
+
+#define TREE__INSERT_OR_DESCEND(side) \
+  if (parent->side) {                 \
+    parent = parent->side;            \
+  } else {                            \
+    parent->side = node;              \
+    break;                            \
+  }
+
+#define TREE__REBALANCE_AFTER_INSERT(cis, trans) \
+  struct TreeNode *grandparent = parent->parent; \
+  struct TreeNode *uncle = grandparent->trans;   \
+  if (uncle && uncle->red) {                     \
+    parent->red = uncle->red = false;            \
+    grandparent->red = true;                     \
+    node = grandparent;                          \
+  } else {                                       \
+    if (node == parent->trans) {                 \
+      tree__rotate_##cis(tree, parent);          \
+      node = parent;                             \
+      parent = node->parent;                     \
+    }                                            \
+    parent->red = false;                         \
+    grandparent->red = true;                     \
+    tree__rotate_##trans(tree, grandparent);     \
+  }
+
+#define TREE__REBALANCE_AFTER_REMOVE(cis, trans)   \
+  struct TreeNode *sibling = parent->trans;        \
+  if (sibling->red) {                              \
+    sibling->red = false;                          \
+    parent->red = true;                            \
+    tree__rotate_##cis(tree, parent);              \
+    sibling = parent->trans;                       \
+  }                                                \
+  if ((sibling->left && sibling->left->red) ||     \
+      (sibling->right && sibling->right->red)) {   \
+    if (!sibling->trans || !sibling->trans->red) { \
+      sibling->cis->red = false;                   \
+      sibling->red = true;                         \
+      tree__rotate_##trans(tree, sibling);         \
+      sibling = parent->trans;                     \
+    }                                              \
+    sibling->red = parent->red;                    \
+    parent->red = sibling->trans->red = false;     \
+    tree__rotate_##cis(tree, parent);              \
+    node = tree->root;                             \
+    break;                                         \
+  }                                                \
+  sibling->red = true;
+
+#define tree_root(t)                         (t)->root
+#define port_state_to_handle_tree_node(p)    (&(p)->handle_tree_node)
+#define sock_state_from_queue_node(q)        CONTAINOF(q, struct SockState, queue_node)
+#define sock_state_to_queue_node(s)          (&(s)->queue_node)
+#define sock_state_from_tree_node(t)         CONTAINOF(t, struct SockState, tree_node)
+#define sock_state_to_tree_node(s)           (&(s)->tree_node)
+#define poll_group_from_queue_node(q)        CONTAINOF(q, struct PollGroup, queue_node)
+#define poll_group_get_afd_device_handle(pg) (pg)->afd_device_handle
+
+enum PollStatus {
+  kPollIdle,
+  kPollPending,
+  kPollCancelled,
+};
+
+struct RefLock {
+  int state;
+};
+
+struct TreeNode {
+  struct TreeNode *left;
+  struct TreeNode *right;
+  struct TreeNode *parent;
+  uintptr_t key;
+  bool red;
+};
+
+struct Tree {
+  struct TreeNode *root;
+};
+
+struct TsTree {
+  struct Tree tree;
+  intptr_t lock;
+};
+
+struct TsTreeNode {
+  struct TreeNode tree_node;
+  struct RefLock reflock;
+};
+
+struct QueueNode {
+  struct QueueNode *prev;
+  struct QueueNode *next;
+};
+
+struct Queue {
+  struct QueueNode head;
+};
+
+struct PortState {
+  int64_t iocp_handle;
+  struct Tree sock_tree;
+  struct Queue sock_update_queue;
+  struct Queue sock_deleted_queue;
+  struct Queue poll_group_queue;
+  struct TsTreeNode handle_tree_node;
+  struct NtCriticalSection lock;
+  size_t active_poll_count;
+};
+
+struct PollGroup {
+  struct PortState *port_state;
+  struct QueueNode queue_node;
+  int64_t afd_device_handle;
+  size_t group_size;
+};
+
+struct SockState {
+  struct NtIoStatusBlock io_status_block;
+  struct NtAfdPollInfo poll_info;
+  struct QueueNode queue_node;
+  struct TreeNode tree_node;
+  struct PollGroup *poll_group;
+  int64_t base_socket;
+  epoll_data_t user_data;
+  uint32_t user_events;
+  uint32_t pending_events;
+  enum PollStatus poll_status;
+  bool delete_pending;
+};
+
+static const struct NtUnicodeString afd__device_name =
+    RTL_CONSTANT_STRING(u"\\Device\\Afd\\Wepoll");
+
+static const struct NtObjectAttributes afd__device_attributes =
+    RTL_CONSTANT_OBJECT_ATTRIBUTES(&afd__device_name, 0);
+
+static int64_t reflock__keyed_event;
+static struct TsTree epoll__handle_tree;
+
+static textwindows void err_map_win_error(void) {
+  errno = __dos2errno(GetLastError());
+}
+
+static textwindows void err_set_win_error(uint32_t error) {
+  SetLastError(error);
+  errno = __dos2errno(error);
+}
+
+static textwindows int err_check_handle(int64_t handle) {
+  uint32_t flags;
+  /* GetHandleInformation() succeeds when passed INVALID_HANDLE_VALUE,
+     so check for this condition explicitly. */
+  if (handle == kNtInvalidHandleValue) {
+    RETURN_SET_ERROR(-1, kNtErrorInvalidHandle);
+  }
+  if (!GetHandleInformation(handle, &flags)) {
+    RETURN_MAP_ERROR(-1);
+  }
+  return 0;
+}
+
+static textwindows void tree_init(struct Tree *tree) {
+  bzero(tree, sizeof *tree);
+}
+
+static textwindows void ts_tree_init(struct TsTree *ts_tree) {
+  tree_init(&ts_tree->tree);
+  InitializeSRWLock(&ts_tree->lock);
+}
+
+static textwindows int reflock_global_init(void) {
+  NtStatus status;
+  if ((status = NtCreateKeyedEvent(&reflock__keyed_event,
+                                   kNtKeyedeventAllAccess, NULL, 0)) !=
+      kNtStatusSuccess) {
+    RETURN_SET_ERROR(-1, RtlNtStatusToDosError(status));
+  }
+  return 0;
+}
+
+static textwindows int epoll_global_init(void) {
+  ts_tree_init(&epoll__handle_tree);
+  return 0;
+}
+
+static textwindows int wepoll_init(void) {
+  static bool once;
+  static bool result;
+  if (!once) {
+    if (reflock_global_init() < 0 || epoll_global_init() < 0) {
+      result = false;
+    } else {
+      result = true;
+    }
+    once = true;
+  }
+  return result;
+}
+
+static textwindows int afd_create_device_handle(
+    int64_t iocp_handle, int64_t *afd_device_handle_out) {
+  NtStatus status;
+  int64_t afd_device_handle;
+  struct NtIoStatusBlock iosb;
+  /* By opening \Device\Afd without specifying any extended attributes,
+     we'll get a handle that lets us talk to the AFD driver, but that
+     doesn't have an *associated endpoint (so it's not a socket). */
+  status = NtCreateFile(&afd_device_handle, kNtSynchronize,
+                        &afd__device_attributes, &iosb, NULL, 0,
+                        kNtFileShareRead | kNtFileShareWrite, 1, 0, NULL, 0);
+  if (status != kNtStatusSuccess) {
+    RETURN_SET_ERROR(-1, RtlNtStatusToDosError(status));
+  }
+  if (!CreateIoCompletionPort(afd_device_handle, iocp_handle, 0, 0)) {
+    goto error;
+  }
+  if (!SetFileCompletionNotificationModes(afd_device_handle,
+                                          kNtFileSkipSetEventOnHandle)) {
+    goto error;
+  }
+  *afd_device_handle_out = afd_device_handle;
+  return 0;
+error:
+  CloseHandle(afd_device_handle);
+  RETURN_MAP_ERROR(-1);
+}
+
+static textwindows int afd_poll(int64_t afd_device_handle,
+                                struct NtAfdPollInfo *poll_info,
+                                struct NtIoStatusBlock *io_status_block) {
+  NtStatus status;
+  /* Blocking operation is not supported.*/
+  npassert(io_status_block);
+  io_status_block->Status = kNtStatusPending;
+  status =
+      NtDeviceIoControlFile(afd_device_handle, 0, NULL, io_status_block,
+                            io_status_block, kNtIoctlAfdPoll, poll_info,
+                            sizeof(*poll_info), poll_info, sizeof(*poll_info));
+  if (status == kNtStatusSuccess) {
+    return 0;
+  } else if (status == kNtStatusPending) {
+    RETURN_SET_ERROR(-1, kNtErrorIoPending);
+  } else {
+    RETURN_SET_ERROR(-1, RtlNtStatusToDosError(status));
+  }
+}
+
+static textwindows int afd_cancel_poll(
+    int64_t afd_device_handle, struct NtIoStatusBlock *io_status_block) {
+  NtStatus cancel_status;
+  struct NtIoStatusBlock cancel_iosb;
+  /* If the poll operation has already completed or has been cancelled
+     earlier, there's nothing left for us to do. */
+  if (io_status_block->Status != kNtStatusPending)
+    return 0;
+  cancel_status =
+      NtCancelIoFileEx(afd_device_handle, io_status_block, &cancel_iosb);
+  /* NtCancelIoFileEx() may return STATUS_NOT_FOUND if the operation completed
+     just before calling NtCancelIoFileEx(). This is not an error. */
+  if (cancel_status == kNtStatusSuccess || cancel_status == kNtStatusNotFound) {
+    return 0;
+  } else {
+    RETURN_SET_ERROR(-1, RtlNtStatusToDosError(cancel_status));
+  }
+}
+
+static textwindows void queue_node_init(struct QueueNode *node) {
+  node->prev = node;
+  node->next = node;
+}
+
+static textwindows void queue_init(struct Queue *queue) {
+  queue_node_init(&queue->head);
+}
+
+static textwindows void queue__detach_node(struct QueueNode *node) {
+  node->prev->next = node->next;
+  node->next->prev = node->prev;
+}
+
+forceinline bool queue_is_enqueued(const struct QueueNode *node) {
+  return node->prev != node;
+}
+
+forceinline bool queue_is_empty(const struct Queue *queue) {
+  return !queue_is_enqueued(&queue->head);
+}
+
+static textwindows struct QueueNode *queue_first(const struct Queue *queue) {
+  return !queue_is_empty(queue) ? queue->head.next : NULL;
+}
+
+static textwindows struct QueueNode *queue_last(const struct Queue *queue) {
+  return !queue_is_empty(queue) ? queue->head.prev : NULL;
+}
+
+static textwindows void queue_prepend(struct Queue *queue,
+                                      struct QueueNode *node) {
+  node->next = queue->head.next;
+  node->prev = &queue->head;
+  node->next->prev = node;
+  queue->head.next = node;
+}
+
+static textwindows void queue_append(struct Queue *queue,
+                                     struct QueueNode *node) {
+  node->next = &queue->head;
+  node->prev = queue->head.prev;
+  node->prev->next = node;
+  queue->head.prev = node;
+}
+
+static textwindows void queue_move_to_start(struct Queue *queue,
+                                            struct QueueNode *node) {
+  queue__detach_node(node);
+  queue_prepend(queue, node);
+}
+
+static textwindows void queue_move_to_end(struct Queue *queue,
+                                          struct QueueNode *node) {
+  queue__detach_node(node);
+  queue_append(queue, node);
+}
+
+static textwindows void queue_remove(struct QueueNode *node) {
+  queue__detach_node(node);
+  queue_node_init(node);
+}
+
+static textwindows struct PortState *port__alloc(void) {
+  struct PortState *port_state = malloc(sizeof *port_state);
+  if (!port_state)
+    RETURN_SET_ERROR(NULL, kNtErrorNotEnoughMemory);
+  return port_state;
+}
+
+static textwindows int64_t port__create_iocp(void) {
+  int64_t iocp_handle = CreateIoCompletionPort(kNtInvalidHandleValue, 0, 0, 0);
+  if (!iocp_handle)
+    RETURN_MAP_ERROR(0);
+  return iocp_handle;
+}
+
+static textwindows int port__close_iocp(struct PortState *port_state) {
+  int64_t iocp_handle = port_state->iocp_handle;
+  port_state->iocp_handle = 0;
+  if (!CloseHandle(iocp_handle))
+    RETURN_MAP_ERROR(-1);
+  return 0;
+}
+
+static textwindows void tree_node_init(struct TreeNode *node) {
+  bzero(node, sizeof *node);
+}
+
+static textwindows void reflock_init(struct RefLock *reflock) {
+  reflock->state = 0;
+}
+
+static textwindows void ts_tree_node_init(struct TsTreeNode *node) {
+  tree_node_init(&node->tree_node);
+  reflock_init(&node->reflock);
+}
+
+static textwindows void tree__rotate_left(struct Tree *tree,
+                                          struct TreeNode *node) {
+  TREE__ROTATE(left, right)
+}
+
+static textwindows void tree__rotate_right(struct Tree *tree,
+                                           struct TreeNode *node) {
+  TREE__ROTATE(right, left)
+}
+
+static textwindows int tree_add(struct Tree *tree, struct TreeNode *node,
+                                uintptr_t key) {
+  struct TreeNode *parent;
+  parent = tree->root;
+  if (parent) {
+    for (;;) {
+      if (key < parent->key) {
+        TREE__INSERT_OR_DESCEND(left)
+      } else if (key > parent->key) {
+        TREE__INSERT_OR_DESCEND(right)
+      } else {
+        return -1;
+      }
+    }
+  } else {
+    tree->root = node;
+  }
+  node->key = key;
+  node->left = node->right = NULL;
+  node->parent = parent;
+  node->red = true;
+  for (; parent && parent->red; parent = node->parent) {
+    if (parent == parent->parent->left) {
+      TREE__REBALANCE_AFTER_INSERT(left, right)
+    } else {
+      TREE__REBALANCE_AFTER_INSERT(right, left)
+    }
+  }
+  tree->root->red = false;
+  return 0;
+}
+
+static textwindows int ts_tree_add(struct TsTree *ts_tree,
+                                   struct TsTreeNode *node, uintptr_t key) {
+  int r;
+  AcquireSRWLockExclusive(&ts_tree->lock);
+  r = tree_add(&ts_tree->tree, &node->tree_node, key);
+  ReleaseSRWLockExclusive(&ts_tree->lock);
+  return r;
+}
+
+static textwindows void port__free(struct PortState *port) {
+  npassert(port);
+  free(port);
+}
+
+static textwindows struct PortState *port_new(int64_t *iocp_handle_out) {
+  struct PortState *port_state;
+  int64_t iocp_handle;
+  port_state = port__alloc();
+  if (!port_state)
+    goto err1;
+  iocp_handle = port__create_iocp();
+  if (!iocp_handle)
+    goto err2;
+  bzero(port_state, sizeof *port_state);
+  port_state->iocp_handle = iocp_handle;
+  tree_init(&port_state->sock_tree);
+  queue_init(&port_state->sock_update_queue);
+  queue_init(&port_state->sock_deleted_queue);
+  queue_init(&port_state->poll_group_queue);
+  ts_tree_node_init(&port_state->handle_tree_node);
+  InitializeCriticalSection(&port_state->lock);
+  *iocp_handle_out = iocp_handle;
+  return port_state;
+err2:
+  port__free(port_state);
+err1:
+  return NULL;
+}
+
+static textwindows int sock__cancel_poll(struct SockState *sock_state) {
+  npassert(sock_state->poll_status == kPollPending);
+  if (afd_cancel_poll(poll_group_get_afd_device_handle(sock_state->poll_group),
+                      &sock_state->io_status_block) < 0) {
+    return -1;
+  }
+  sock_state->poll_status = kPollCancelled;
+  sock_state->pending_events = 0;
+  return 0;
+}
+
+static textwindows void port_cancel_socket_update(
+    struct PortState *port_state, struct SockState *sock_state) {
+  if (!queue_is_enqueued(sock_state_to_queue_node(sock_state)))
+    return;
+  queue_remove(sock_state_to_queue_node(sock_state));
+}
+
+static textwindows struct TreeNode *tree_find(const struct Tree *tree,
+                                              uintptr_t key) {
+  struct TreeNode *node = tree->root;
+  while (node) {
+    if (key < node->key) {
+      node = node->left;
+    } else if (key > node->key) {
+      node = node->right;
+    } else {
+      return node;
+    }
+  }
+  return NULL;
+}
+
+static textwindows struct TsTreeNode *ts_tree__find_node(struct TsTree *ts_tree,
+                                                         uintptr_t key) {
+  struct TreeNode *tree_node = tree_find(&ts_tree->tree, key);
+  if (!tree_node)
+    return NULL;
+  return CONTAINOF(tree_node, struct TsTreeNode, tree_node);
+}
+
+static textwindows void tree_del(struct Tree *tree, struct TreeNode *node) {
+  bool red;
+  struct TreeNode *parent, *left, *right, *next;
+  parent = node->parent;
+  left = node->left;
+  right = node->right;
+  if (!left) {
+    next = right;
+  } else if (!right) {
+    next = left;
+  } else {
+    next = right;
+    while (next->left)
+      next = next->left;
+  }
+  if (parent) {
+    if (parent->left == node) {
+      parent->left = next;
+    } else {
+      parent->right = next;
+    }
+  } else {
+    tree->root = next;
+  }
+  if (left && right) {
+    red = next->red;
+    next->red = node->red;
+    next->left = left;
+    left->parent = next;
+    if (next != right) {
+      parent = next->parent;
+      next->parent = node->parent;
+      node = next->right;
+      parent->left = node;
+      next->right = right;
+      right->parent = next;
+    } else {
+      next->parent = parent;
+      parent = next;
+      node = next->right;
+    }
+  } else {
+    red = node->red;
+    node = next;
+  }
+  if (node)
+    node->parent = parent;
+  if (red)
+    return;
+  if (node && node->red) {
+    node->red = false;
+    return;
+  }
+  do {
+    if (node == tree->root)
+      break;
+    if (node == parent->left) {
+      TREE__REBALANCE_AFTER_REMOVE(left, right)
+    } else {
+      TREE__REBALANCE_AFTER_REMOVE(right, left)
+    }
+    node = parent;
+    parent = parent->parent;
+  } while (!node->red);
+  if (node)
+    node->red = false;
+}
+
+static textwindows void reflock__signal_event(void *address) {
+  NtStatus status =
+      NtReleaseKeyedEvent(reflock__keyed_event, address, false, NULL);
+  if (status != kNtStatusSuccess)
+    abort();
+}
+
+static textwindows void reflock__await_event(void *address) {
+  NtStatus status =
+      NtWaitForKeyedEvent(reflock__keyed_event, address, false, NULL);
+  if (status != kNtStatusSuccess)
+    abort();
+}
+
+static textwindows void reflock_ref(struct RefLock *reflock) {
+  long state = InterlockedAdd(&reflock->state, REFLOCK__REF);
+  /* Verify that the counter didn 't overflow and the lock isn' t destroyed.*/
+  npassert((state & REFLOCK__DESTROY_MASK) == 0);
+}
+
+static textwindows void reflock_unref(struct RefLock *reflock) {
+  long state = InterlockedAdd(&reflock->state, -REFLOCK__REF);
+  /* Verify that the lock was referenced and not already destroyed.*/
+  npassert((state & REFLOCK__DESTROY_MASK & ~REFLOCK__DESTROY) == 0);
+  if (state == REFLOCK__DESTROY)
+    reflock__signal_event(reflock);
+}
+
+static textwindows struct TsTreeNode *ts_tree_del_and_ref(
+    struct TsTree *ts_tree, uintptr_t key) {
+  struct TsTreeNode *ts_tree_node;
+  AcquireSRWLockExclusive(&ts_tree->lock);
+  ts_tree_node = ts_tree__find_node(ts_tree, key);
+  if (ts_tree_node != NULL) {
+    tree_del(&ts_tree->tree, &ts_tree_node->tree_node);
+    reflock_ref(&ts_tree_node->reflock);
+  }
+  ReleaseSRWLockExclusive(&ts_tree->lock);
+  return ts_tree_node;
+}
+
+static textwindows struct TsTreeNode *ts_tree_find_and_ref(
+    struct TsTree *ts_tree, uintptr_t key) {
+  struct TsTreeNode *ts_tree_node;
+  AcquireSRWLockShared(&ts_tree->lock);
+  ts_tree_node = ts_tree__find_node(ts_tree, key);
+  if (ts_tree_node != NULL)
+    reflock_ref(&ts_tree_node->reflock);
+  ReleaseSRWLockShared(&ts_tree->lock);
+  return ts_tree_node;
+}
+
+static textwindows void ts_tree_node_unref(struct TsTreeNode *node) {
+  reflock_unref(&node->reflock);
+}
+
+static textwindows void reflock_unref_and_destroy(struct RefLock *reflock) {
+  long state, ref_count;
+  state = InterlockedAdd(&reflock->state, REFLOCK__DESTROY - REFLOCK__REF);
+  ref_count = state & REFLOCK__REF_MASK;
+  /* Verify that the lock was referenced and not already destroyed. */
+  npassert((state & REFLOCK__DESTROY_MASK) == REFLOCK__DESTROY);
+  if (ref_count != 0)
+    reflock__await_event(reflock);
+  state = InterlockedExchange(&reflock->state, REFLOCK__POISON);
+  npassert(state == REFLOCK__DESTROY);
+}
+
+static textwindows void ts_tree_node_unref_and_destroy(
+    struct TsTreeNode *node) {
+  reflock_unref_and_destroy(&node->reflock);
+}
+
+static textwindows void port_unregister_socket(struct PortState *port_state,
+                                               struct SockState *sock_state) {
+  tree_del(&port_state->sock_tree, sock_state_to_tree_node(sock_state));
+}
+
+static textwindows void port_remove_deleted_socket(
+    struct PortState *port_state, struct SockState *sock_state) {
+  if (!queue_is_enqueued(sock_state_to_queue_node(sock_state)))
+    return;
+  queue_remove(sock_state_to_queue_node(sock_state));
+}
+
+static textwindows struct Queue *port_get_poll_group_queue(
+    struct PortState *port_state) {
+  return &port_state->poll_group_queue;
+}
+
+static textwindows void poll_group_release(struct PollGroup *poll_group) {
+  struct PortState *port_state = poll_group->port_state;
+  struct Queue *poll_group_queue = port_get_poll_group_queue(port_state);
+  poll_group->group_size--;
+  npassert(poll_group->group_size < MAX_GROUP_SIZE);
+  queue_move_to_end(poll_group_queue, &poll_group->queue_node);
+  /* Poll groups are currently only freed when the epoll port is closed. */
+}
+
+static textwindows void sock__free(struct SockState *sock_state) {
+  npassert(sock_state != NULL);
+  free(sock_state);
+}
+
+static textwindows void port_add_deleted_socket(struct PortState *port_state,
+                                                struct SockState *sock_state) {
+  if (queue_is_enqueued(sock_state_to_queue_node(sock_state)))
+    return;
+  queue_append(&port_state->sock_deleted_queue,
+               sock_state_to_queue_node(sock_state));
+}
+
+static textwindows int sock__delete(struct PortState *port_state,
+                                    struct SockState *sock_state, bool force) {
+  if (!sock_state->delete_pending) {
+    if (sock_state->poll_status == kPollPending) {
+      sock__cancel_poll(sock_state);
+    }
+    port_cancel_socket_update(port_state, sock_state);
+    port_unregister_socket(port_state, sock_state);
+    sock_state->delete_pending = true;
+  }
+  /* If the poll request still needs to complete, the sock_state object
+     can't be free'd yet. `sock_feed_event()` or `port_close()` will
+     take care of this later. */
+  if (force || sock_state->poll_status == kPollIdle) {
+    port_remove_deleted_socket(port_state, sock_state);
+    poll_group_release(sock_state->poll_group);
+    sock__free(sock_state);
+  } else {
+    /* Free the socket later.*/
+    port_add_deleted_socket(port_state, sock_state);
+  }
+  return 0;
+}
+
+static textwindows void sock_delete(struct PortState *port_state,
+                                    struct SockState *sock_state) {
+  sock__delete(port_state, sock_state, false);
+}
+
+static textwindows void sock_force_delete(struct PortState *port_state,
+                                          struct SockState *sock_state) {
+  sock__delete(port_state, sock_state, true);
+}
+
+static textwindows void poll_group_delete(struct PollGroup *poll_group) {
+  npassert(poll_group->group_size == 0);
+  CloseHandle(poll_group->afd_device_handle);
+  queue_remove(&poll_group->queue_node);
+  free(poll_group);
+}
+
+static textwindows int port_delete(struct PortState *port_state) {
+  struct TreeNode *tree_node;
+  struct QueueNode *queue_node;
+  struct SockState *sock_state;
+  struct PollGroup *poll_group;
+  /* At this point the IOCP port should have been closed.*/
+  npassert(!port_state->iocp_handle);
+  while ((tree_node = tree_root(&port_state->sock_tree)) != NULL) {
+    sock_state = sock_state_from_tree_node(tree_node);
+    sock_force_delete(port_state, sock_state);
+  }
+  while ((queue_node = queue_first(&port_state->sock_deleted_queue)) != NULL) {
+    sock_state = sock_state_from_queue_node(queue_node);
+    sock_force_delete(port_state, sock_state);
+  }
+  while ((queue_node = queue_first(&port_state->poll_group_queue)) != NULL) {
+    poll_group = poll_group_from_queue_node(queue_node);
+    poll_group_delete(poll_group);
+  }
+  npassert(queue_is_empty(&port_state->sock_update_queue));
+  DeleteCriticalSection(&port_state->lock);
+  port__free(port_state);
+  return 0;
+}
+
+static textwindows int64_t port_get_iocp_handle(struct PortState *port_state) {
+  npassert(port_state->iocp_handle);
+  return port_state->iocp_handle;
+}
+
+static textwindows struct PollGroup *poll_group__new(
+    struct PortState *port_state) {
+  int64_t iocp_handle = port_get_iocp_handle(port_state);
+  struct Queue *poll_group_queue = port_get_poll_group_queue(port_state);
+  struct PollGroup *poll_group = malloc(sizeof *poll_group);
+  if (!poll_group)
+    RETURN_SET_ERROR(NULL, kNtErrorNotEnoughMemory);
+  bzero(poll_group, sizeof *poll_group);
+  queue_node_init(&poll_group->queue_node);
+  poll_group->port_state = port_state;
+  if (afd_create_device_handle(iocp_handle, &poll_group->afd_device_handle) <
+      0) {
+    free(poll_group);
+    return NULL;
+  }
+  queue_append(poll_group_queue, &poll_group->queue_node);
+  return poll_group;
+}
+
+static textwindows struct PollGroup *poll_group_acquire(
+    struct PortState *port_state) {
+  struct Queue *poll_group_queue = port_get_poll_group_queue(port_state);
+  struct PollGroup *poll_group = !queue_is_empty(poll_group_queue)
+                                     ? CONTAINOF(queue_last(poll_group_queue),
+                                                 struct PollGroup, queue_node)
+                                     : NULL;
+  if (!poll_group || poll_group->group_size >= MAX_GROUP_SIZE)
+    poll_group = poll_group__new(port_state);
+  if (!poll_group)
+    return NULL;
+  if (++poll_group->group_size == MAX_GROUP_SIZE)
+    queue_move_to_start(poll_group_queue, &poll_group->queue_node);
+  return poll_group;
+}
+
+static textwindows int port_close(struct PortState *port_state) {
+  int result;
+  EnterCriticalSection(&port_state->lock);
+  result = port__close_iocp(port_state);
+  LeaveCriticalSection(&port_state->lock);
+  return result;
+}
+
+static textwindows uint32_t sock__epoll_events_to_afd_events(uint32_t e) {
+  /* Always monitor for kNtAfdPollLocalClose, which is triggered when
+     the socket is closed with closesocket() or CloseHandle(). */
+  uint32_t a = kNtAfdPollLocalClose;
+  if (e & (EPOLLIN | EPOLLRDNORM))
+    a |= kNtAfdPollReceive | kNtAfdPollAccept;
+  if (e & (EPOLLPRI | EPOLLRDBAND))
+    a |= kNtAfdPollReceiveExpedited;
+  if (e & (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND))
+    a |= kNtAfdPollSend;
+  if (e & (EPOLLIN | EPOLLRDNORM | EPOLLRDHUP))
+    a |= kNtAfdPollDisconnect;
+  if (e & EPOLLHUP)
+    a |= kNtAfdPollAbort;
+  if (e & EPOLLERR)
+    a |= kNtAfdPollConnectFail;
+  return a;
+}
+
+static textwindows uint32_t sock__afd_events_to_epoll_events(uint32_t a) {
+  uint32_t e = 0;
+  if (a & (kNtAfdPollReceive | kNtAfdPollAccept))
+    e |= EPOLLIN | EPOLLRDNORM;
+  if (a & kNtAfdPollReceiveExpedited)
+    e |= EPOLLPRI | EPOLLRDBAND;
+  if (a & kNtAfdPollSend)
+    e |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
+  if (a & kNtAfdPollDisconnect)
+    e |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+  if (a & kNtAfdPollAbort)
+    e |= EPOLLHUP;
+  if (a & kNtAfdPollConnectFail) {
+    /* Linux reports all these events after connect() has failed. */
+    e |= EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLRDNORM | EPOLLWRNORM | EPOLLRDHUP;
+  }
+  return e;
+}
+
+static textwindows int sock_update(struct PortState *port_state,
+                                   struct SockState *sock_state) {
+  npassert(!sock_state->delete_pending);
+  if ((sock_state->poll_status == kPollPending) &&
+      !(sock_state->user_events & KNOWN_EVENTS & ~sock_state->pending_events)) {
+    /* All the events the user is interested in are already being
+       monitored by the pending poll operation. It might spuriously
+       complete because of an event that we're no longer interested in;
+       when that happens we'll submit a new poll operation with the
+       updated event mask. */
+  } else if (sock_state->poll_status == kPollPending) {
+    /* A poll operation is already pending, but it's not monitoring for
+       all the *events that the user is interested in. Therefore, cancel
+       the pending *poll operation; when we receive it's completion
+       package, a new poll *operation will be submitted with the correct
+       event mask. */
+    if (sock__cancel_poll(sock_state) < 0)
+      return -1;
+  } else if (sock_state->poll_status == kPollCancelled) {
+    /* The poll operation has already been cancelled, we're still waiting for
+       it to return.For now, there' s nothing that needs to be done. */
+  } else if (sock_state->poll_status == kPollIdle) {
+    /* No poll operation is pending; start one. */
+    sock_state->poll_info.Exclusive = false;
+    sock_state->poll_info.NumberOfHandles = 1;
+    sock_state->poll_info.Timeout = INT64_MAX;
+    sock_state->poll_info.Handles[0].Handle = (int64_t)sock_state->base_socket;
+    sock_state->poll_info.Handles[0].Status = 0;
+    sock_state->poll_info.Handles[0].Events =
+        sock__epoll_events_to_afd_events(sock_state->user_events);
+    if (afd_poll(poll_group_get_afd_device_handle(sock_state->poll_group),
+                 &sock_state->poll_info, &sock_state->io_status_block) < 0) {
+      switch (GetLastError()) {
+        case kNtErrorIoPending:
+          /* Overlapped poll operation in progress; this is expected. */
+          break;
+        case kNtErrorInvalidHandle:
+          /* Socket closed; it'll be dropped from the epoll set. */
+          return sock__delete(port_state, sock_state, false);
+        default:
+          /* Other errors are propagated to the caller. */
+          RETURN_MAP_ERROR(-1);
+      }
+    }
+    /* The poll request was successfully submitted.*/
+    sock_state->poll_status = kPollPending;
+    sock_state->pending_events = sock_state->user_events;
+  } else {
+    __builtin_unreachable();
+  }
+  port_cancel_socket_update(port_state, sock_state);
+  return 0;
+}
+
+static textwindows int port__update_events(struct PortState *port_state) {
+  struct QueueNode *queue_node;
+  struct SockState *sock_state;
+  struct Queue *sock_update_queue = &port_state->sock_update_queue;
+  /* Walk queue, submitting new poll requests for sockets needing it */
+  while (!queue_is_empty(sock_update_queue)) {
+    queue_node = queue_first(sock_update_queue);
+    sock_state = sock_state_from_queue_node(queue_node);
+    if (sock_update(port_state, sock_state) < 0)
+      return -1;
+    /* sock_update() removes the socket from the update queue.*/
+  }
+  return 0;
+}
+
+static textwindows void port__update_events_if_polling(
+    struct PortState *port_state) {
+  if (port_state->active_poll_count > 0)
+    port__update_events(port_state);
+}
+
+static textwindows void port_request_socket_update(
+    struct PortState *port_state, struct SockState *sock_state) {
+  if (queue_is_enqueued(sock_state_to_queue_node(sock_state)))
+    return;
+  queue_append(&port_state->sock_update_queue,
+               sock_state_to_queue_node(sock_state));
+}
+
+static textwindows int sock_feed_event(struct PortState *port_state,
+                                       struct NtIoStatusBlock *io_status_block,
+                                       struct epoll_event *ev) {
+  uint32_t epoll_events;
+  struct SockState *sock_state;
+  struct NtAfdPollInfo *poll_info;
+  epoll_events = 0;
+  sock_state = CONTAINOF(io_status_block, struct SockState, io_status_block);
+  poll_info = &sock_state->poll_info;
+  sock_state->poll_status = kPollIdle;
+  sock_state->pending_events = 0;
+  if (sock_state->delete_pending) {
+    /* Socket has been deleted earlier and can now be freed.*/
+    return sock__delete(port_state, sock_state, false);
+  } else if (io_status_block->Status == kNtStatusCancelled) {
+    /* The poll request was cancelled by CancelIoEx.*/
+  } else if (!NtSuccess(io_status_block->Status)) {
+    /* The overlapped request itself failed in an unexpected way.*/
+    epoll_events = EPOLLERR;
+  } else if (poll_info->NumberOfHandles < 1) {
+    /* This poll operation succeeded but didn't report any socket events. */
+  } else if (poll_info->Handles[0].Events & kNtAfdPollLocalClose) {
+    /* The poll operation reported that the socket was closed.*/
+    return sock__delete(port_state, sock_state, false);
+  } else {
+    /* Events related to our socket were reported.*/
+    epoll_events =
+        sock__afd_events_to_epoll_events(poll_info->Handles[0].Events);
+  }
+  /* Requeue the socket so a new poll request will be submitted.*/
+  port_request_socket_update(port_state, sock_state);
+  /* Filter out events that the user didn't ask for. */
+  epoll_events &= sock_state->user_events;
+  /* Return if there are no epoll events to report.*/
+  if (epoll_events == 0)
+    return 0;
+  /* If the the socket has the EPOLLONESHOT flag set, unmonitor all
+     events, even EPOLLERR and EPOLLHUP. But always keep looking for
+     closed sockets. */
+  if (sock_state->user_events & EPOLLONESHOT) {
+    sock_state->user_events = 0;
+  }
+  ev->data = sock_state->user_data;
+  ev->events = epoll_events;
+  return 1;
+}
+
+static textwindows int port__feed_events(struct PortState *port_state,
+                                         struct epoll_event *epoll_events,
+                                         struct NtOverlappedEntry *iocp_events,
+                                         uint32_t iocp_event_count) {
+  uint32_t i;
+  int epoll_event_count;
+  struct epoll_event *ev;
+  struct NtIoStatusBlock *io_status_block;
+  epoll_event_count = 0;
+  for (i = 0; i < iocp_event_count; i++) {
+    io_status_block = (struct NtIoStatusBlock *)iocp_events[i].lpOverlapped;
+    ev = &epoll_events[epoll_event_count];
+    epoll_event_count += sock_feed_event(port_state, io_status_block, ev);
+  }
+  return epoll_event_count;
+}
+
+static textwindows int port__poll(struct PortState *port_state,
+                                  struct epoll_event *epoll_events,
+                                  struct NtOverlappedEntry *iocp_events,
+                                  uint32_t maxevents, uint32_t timeout) {
+  bool32 r;
+  uint32_t completion_count;
+  if (port__update_events(port_state) < 0)
+    return -1;
+  port_state->active_poll_count++;
+  LeaveCriticalSection(&port_state->lock);
+  r = GetQueuedCompletionStatusEx(port_state->iocp_handle, iocp_events,
+                                  maxevents, &completion_count, timeout, false);
+  EnterCriticalSection(&port_state->lock);
+  port_state->active_poll_count--;
+  if (!r)
+    RETURN_MAP_ERROR(-1);
+  return port__feed_events(port_state, epoll_events, iocp_events,
+                           completion_count);
+}
+
+static textwindows int port_wait(struct PortState *port_state,
+                                 struct epoll_event *events, int maxevents,
+                                 int timeout) {
+  int result;
+  uint64_t now, due = 0;
+  uint32_t gqcs_timeout;
+  struct NtOverlappedEntry *iocp_events;
+  struct NtOverlappedEntry stack_iocp_events[64];
+  /* Check whether `maxevents` is in range.*/
+  if (maxevents <= 0)
+    RETURN_SET_ERROR(-1, kNtErrorInvalidParameter);
+  /* Decide whether the IOCP completion list can live on the stack, or
+     allocate memory for it on the heap. */
+  if ((size_t)maxevents <= ARRAYLEN(stack_iocp_events)) {
+    iocp_events = stack_iocp_events;
+  } else if ((iocp_events = malloc((size_t)maxevents * sizeof(*iocp_events))) ==
+             NULL) {
+    iocp_events = stack_iocp_events;
+    maxevents = ARRAYLEN(stack_iocp_events);
+  }
+  /* Compute the timeout for GetQueuedCompletionStatus, and the wait end
+     time, if the user specified a timeout other than zero or infinite. */
+  if (timeout > 0) {
+    due = GetTickCount64() + (uint64_t)timeout;
+    gqcs_timeout = (uint32_t)timeout;
+  } else if (timeout == 0) {
+    gqcs_timeout = 0;
+  } else {
+    gqcs_timeout = -1;
+  }
+  EnterCriticalSection(&port_state->lock);
+  /* Dequeue completion packets until either at least one interesting
+     event has been discovered, or the timeout is reached. */
+  for (;;) {
+    result = port__poll(port_state, events, iocp_events, (uint32_t)maxevents,
+                        gqcs_timeout);
+    if (result < 0 || result > 0)
+      break;
+    /* Result, error, or time - out. */
+    if (timeout < 0)
+      continue;
+    /* When timeout is negative, never time out. */
+    /* Update time. */
+    now = GetTickCount64();
+    /* Do not allow the due time to be in the past. */
+    if (now >= due) {
+      SetLastError(kNtWaitTimeout);
+      break;
+    }
+    /* Recompute time-out argument for GetQueuedCompletionStatus. */
+    gqcs_timeout = (uint32_t)(due - now);
+  }
+  port__update_events_if_polling(port_state);
+  LeaveCriticalSection(&port_state->lock);
+  if (iocp_events != stack_iocp_events) {
+    free(iocp_events);
+  }
+  if (result >= 0) {
+    return result;
+  } else if (GetLastError() == kNtWaitTimeout) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static textwindows int64_t ws__ioctl_get_bsp_socket(int64_t socket,
+                                                    uint32_t ioctl) {
+  uint32_t bytes;
+  int64_t bsp_socket;
+  if (WSAIoctl(socket, ioctl, NULL, 0, &bsp_socket, sizeof(bsp_socket), &bytes,
+               NULL, NULL) != -1) {
+    return bsp_socket;
+  } else {
+    return -1;
+  }
+}
+
+static textwindows int64_t ws_get_base_socket(int64_t socket) {
+  uint32_t error;
+  int64_t base_socket;
+  for (;;) {
+    base_socket = ws__ioctl_get_bsp_socket(socket, kNtSioBaseHandle);
+    if (base_socket != -1) {
+      return base_socket;
+    }
+    error = GetLastError();
+    if (error == WSAENOTSOCK) {
+      RETURN_SET_ERROR(-1, error);
+    }
+    /*
+     * Even though Microsoft documentation clearly states that Layered
+     * Spyware Providers must never ever intercept the SIO_BASE_HANDLE
+     * ioctl, Komodia LSPs (that Lenovo got sued for preinstalling) do
+     * so anyway in order to redirect decrypted https requests through
+     * some foreign proxy and inject ads which breaks high-performance
+     * network event io. However it doesn't handle SIO_BSP_HANDLE_POLL
+     * which will at least let us obtain the socket associated with the
+     * next winsock protocol chain entry. If this succeeds, loop around
+     * and call SIO_BASE_HANDLE again with the returned BSP socket, to
+     * make sure we unwrap all layers and retrieve the real base socket.
+     */
+    base_socket = ws__ioctl_get_bsp_socket(socket, kNtSioBspHandlePoll);
+    if (base_socket != -1 && base_socket != socket) {
+      socket = base_socket;
+    } else {
+      RETURN_SET_ERROR(-1, error);
+    }
+  }
+}
+
+static textwindows struct SockState *sock__alloc(void) {
+  struct SockState *sock_state = malloc(sizeof *sock_state);
+  if (!sock_state)
+    RETURN_SET_ERROR(NULL, kNtErrorNotEnoughMemory);
+  return sock_state;
+}
+
+static textwindows int port_register_socket(struct PortState *port_state,
+                                            struct SockState *sock_state,
+                                            int64_t socket) {
+  if (tree_add(&port_state->sock_tree, sock_state_to_tree_node(sock_state),
+               socket) < 0) {
+    RETURN_SET_ERROR(-1, kNtErrorAlreadyExists);
+  }
+  return 0;
+}
+
+static textwindows struct SockState *sock_new(struct PortState *port_state,
+                                              int64_t socket) {
+  int64_t base_socket;
+  struct PollGroup *poll_group;
+  struct SockState *sock_state;
+  if (socket == 0 || socket == -1)
+    RETURN_SET_ERROR(0, kNtErrorInvalidHandle);
+  base_socket = ws_get_base_socket(socket);
+  if (base_socket == -1)
+    return NULL;
+  poll_group = poll_group_acquire(port_state);
+  if (!poll_group)
+    return NULL;
+  sock_state = sock__alloc();
+  if (!sock_state)
+    goto err1;
+  bzero(sock_state, sizeof *sock_state);
+  sock_state->base_socket = base_socket;
+  sock_state->poll_group = poll_group;
+  tree_node_init(&sock_state->tree_node);
+  queue_node_init(&sock_state->queue_node);
+  if (port_register_socket(port_state, sock_state, socket) < 0)
+    goto err2;
+  return sock_state;
+err2:
+  sock__free(sock_state);
+err1:
+  poll_group_release(poll_group);
+  return NULL;
+}
+
+static textwindows int sock_set_event(struct PortState *port_state,
+                                      struct SockState *sock_state,
+                                      const struct epoll_event *ev) {
+  /* EPOLLERR and EPOLLHUP are always reported, even when not requested
+     by the caller. However they are disabled after a event has been
+     reported for a socket for which the EPOLLONESHOT flag was set. */
+  uint32_t events = ev->events | EPOLLERR | EPOLLHUP;
+  sock_state->user_events = events;
+  sock_state->user_data = ev->data;
+  if ((events & KNOWN_EVENTS & ~sock_state->pending_events) != 0) {
+    port_request_socket_update(port_state, sock_state);
+  }
+  return 0;
+}
+
+static textwindows int port__ctl_add(struct PortState *port_state, int64_t sock,
+                                     struct epoll_event *ev) {
+  struct SockState *sock_state = sock_new(port_state, sock);
+  if (!sock_state)
+    return -1;
+  if (sock_set_event(port_state, sock_state, ev) < 0) {
+    sock_delete(port_state, sock_state);
+    return -1;
+  }
+  port__update_events_if_polling(port_state);
+  return 0;
+}
+
+static textwindows struct SockState *port_find_socket(
+    struct PortState *port_state, int64_t socket) {
+  struct TreeNode *tree_node = tree_find(&port_state->sock_tree, socket);
+  if (!tree_node)
+    RETURN_SET_ERROR(NULL, kNtErrorNotFound);
+  return sock_state_from_tree_node(tree_node);
+}
+
+static textwindows int port__ctl_mod(struct PortState *port_state, int64_t sock,
+                                     struct epoll_event *ev) {
+  struct SockState *sock_state = port_find_socket(port_state, sock);
+  if (!sock_state)
+    return -1;
+  if (sock_set_event(port_state, sock_state, ev) < 0)
+    return -1;
+  port__update_events_if_polling(port_state);
+  return 0;
+}
+
+static textwindows int port__ctl_del(struct PortState *port_state,
+                                     int64_t sock) {
+  struct SockState *sock_state = port_find_socket(port_state, sock);
+  if (!sock_state)
+    return -1;
+  sock_delete(port_state, sock_state);
+  return 0;
+}
+
+static textwindows int port__ctl_op(struct PortState *port_state, int op,
+                                    int64_t sock, struct epoll_event *ev) {
+  switch (op) {
+    case EPOLL_CTL_ADD:
+      return port__ctl_add(port_state, sock, ev);
+    case EPOLL_CTL_MOD:
+      return port__ctl_mod(port_state, sock, ev);
+    case EPOLL_CTL_DEL:
+      return port__ctl_del(port_state, sock);
+    default:
+      RETURN_SET_ERROR(-1, kNtErrorInvalidParameter);
+  }
+}
+
+static textwindows int port_ctl(struct PortState *port_state, int op,
+                                int64_t sock, struct epoll_event *ev) {
+  int result;
+  EnterCriticalSection(&port_state->lock);
+  result = port__ctl_op(port_state, op, sock, ev);
+  LeaveCriticalSection(&port_state->lock);
+  return result;
+}
+
+static textwindows struct PortState *port_state_from_handle_tree_node(
+    struct TsTreeNode *tree_node) {
+  return CONTAINOF(tree_node, struct PortState, handle_tree_node);
+}
+
+static textwindows dontinline int sys_epoll_create1_nt(uint32_t flags) {
+  int fd;
+  int64_t ephnd;
+  struct PortState *port_state;
+  struct TsTreeNode *tree_node;
+  if (wepoll_init() < 0)
+    return -1;
+  fd = __reservefd(-1);
+  if (fd == -1)
+    return -1;
+  port_state = port_new(&ephnd);
+  if (!port_state) {
+    __releasefd(fd);
+    return -1;
+  }
+  tree_node = port_state_to_handle_tree_node(port_state);
+  if (ts_tree_add(&epoll__handle_tree, tree_node, (uintptr_t)ephnd) < 0) {
+    /* This should never happen. */
+    port_delete(port_state);
+    err_set_win_error(kNtErrorAlreadyExists);
+    __releasefd(fd);
+    return -1;
+  }
+  __fds_lock();
+  g_fds.p[fd].kind = kFdEpoll;
+  g_fds.p[fd].handle = ephnd;
+  g_fds.p[fd].flags = flags;
+  g_fds.p[fd].mode = 0140666;
+  __fds_unlock();
+  return fd;
+}
+
+static textwindows dontinline int sys_epoll_ctl_nt(int epfd, int op, int fd,
+                                                   struct epoll_event *ev) {
+  int r;
+  struct PortState *port_state;
+  struct TsTreeNode *tree_node;
+  if (!IsWindows()) {
+    return sys_epoll_ctl(epfd, op, fd, ev);
+  } else {
+    if (wepoll_init() < 0)
+      return -1;
+    if (!__isfdopen(fd))
+      return ebadf();
+    if (!__isfdkind(epfd, kFdEpoll))
+      return ebadf();
+    tree_node = ts_tree_find_and_ref(&epoll__handle_tree, g_fds.p[epfd].handle);
+    if (!tree_node) {
+      err_set_win_error(kNtErrorInvalidParameter);
+      goto err;
+    }
+    port_state = port_state_from_handle_tree_node(tree_node);
+    r = port_ctl(port_state, op, g_fds.p[fd].handle, ev);
+    ts_tree_node_unref(tree_node);
+    if (r < 0)
+      goto err;
+    return 0;
+  err:
+    /* On Linux, in the case of epoll_ctl(), EBADF takes priority over
+       other *errors. Wepoll mimics this behavior. */
+    err_check_handle(g_fds.p[epfd].handle);
+    err_check_handle(g_fds.p[fd].handle);
+    return -1;
+  }
+}
+
+static textwindows dontinline int sys_epoll_wait_nt(int epfd,
+                                                    struct epoll_event *events,
+                                                    int maxevents,
+                                                    int timeoutms) {
+  int num_events;
+  struct PortState *port_state;
+  struct TsTreeNode *tree_node;
+  if (!__isfdkind(epfd, kFdEpoll))
+    return ebadf();
+  if (maxevents <= 0)
+    return einval();
+  if (wepoll_init() < 0)
+    return -1;
+  tree_node = ts_tree_find_and_ref(&epoll__handle_tree, g_fds.p[epfd].handle);
+  if (!tree_node) {
+    err_set_win_error(kNtErrorInvalidParameter);
+    goto err;
+  }
+  port_state = port_state_from_handle_tree_node(tree_node);
+  num_events = port_wait(port_state, events, maxevents, timeoutms);
+  ts_tree_node_unref(tree_node);
+  if (num_events < 0)
+    goto err;
+  return num_events;
+err:
+  err_check_handle(g_fds.p[epfd].handle);
+  return -1;
+}
+
+#if SupportsWindows()
+textwindows int sys_close_epoll_nt(int fd) {
+  struct PortState *port_state;
+  struct TsTreeNode *tree_node;
+  if (wepoll_init() < 0)
+    return -1;
+  tree_node = ts_tree_del_and_ref(&epoll__handle_tree, g_fds.p[fd].handle);
+  if (!tree_node) {
+    err_set_win_error(kNtErrorInvalidParameter);
+    goto err;
+  }
+  port_state = port_state_from_handle_tree_node(tree_node);
+  port_close(port_state);
+  ts_tree_node_unref_and_destroy(tree_node);
+  return port_delete(port_state);
+err:
+  err_check_handle(g_fds.p[fd].handle);
+  return -1;
+}
+#endif
+
+/**
+ * Creates new epoll instance.
+ *
+ * @param size is ignored but must be greater than zero
+ * @param flags must be zero as there are no supported flags
+ * @return epoll file descriptor, or -1 on failure
+ */
+int epoll_create(int size) {
+  int rc;
+  if (size <= 0) {
+    rc = einval();
+  } else {
+    BLOCK_SIGNALS;
+    rc = epoll_create1(0);
+    ALLOW_SIGNALS;
+  }
+  STRACE("epoll_create(%d) → %d% m", size, rc);
+  return rc;
+}
+
+/**
+ * Creates new epoll instance.
+ *
+ * @param size is ignored but must be greater than zero
+ * @param flags must be zero or can have O_CLOEXEC
+ * @return epoll file descriptor, or -1 on failure
+ */
+int epoll_create1(int flags) {
+  int rc;
+  if (flags & ~O_CLOEXEC) {
+    rc = einval();
+  } else if (!IsWindows()) {
+    rc = __fixupnewfd(sys_epoll_create(1337), flags);
+  } else {
+    BLOCK_SIGNALS;
+    rc = sys_epoll_create1_nt(flags);
+    ALLOW_SIGNALS;
+  }
+  STRACE("epoll_create1(%#x) → %d% m", flags, rc);
+  return rc;
+}
+
+/**
+ * Controls which socket events are monitored.
+ *
+ * It is recommended to always explicitly remove a socket from its epoll
+ * set using EPOLL_CTL_DEL before closing it. As on Linux, your closed
+ * sockets are automatically removed from the epoll set, but wepoll may
+ * not be able to detect that a socket was closed until the next call to
+ * epoll_wait().
+ *
+ * @param epfd is file descriptor created by epoll_create()
+ * @param op can be EPOLL_CTL_{ADD,MOD,DEL}
+ * @param fd is file descriptor to monitor
+ * @param ev is ignored if op is EPOLL_CTL_DEL
+ * @param ev->events can have these flags:
+ *     - `EPOLLIN`: trigger on fd readable
+ *     - `EPOLLOUT`: trigger on fd writeable
+ *     - `EPOLLERR`: trigger on fd error (superfluous: always reported)
+ *     - `EPOLLHUP`: trigger on fd remote hangup (superfluous: always reported)
+ *     - `EPOLLPRI`: trigger on fd exceptional conditions, e.g. oob
+ *     - `EPOLLONESHOT`: report event(s) only once
+ *     - `EPOLLEXCLUSIVE`: not supported on windows
+ *     - `EPOLLWAKEUP`: not supported on windows
+ *     - `EPOLLET`: edge triggered mode (not supported on windows)
+ *     - `EPOLLRDNORM`
+ *     - `EPOLLRDBAND`
+ *     - `EPOLLWRNORM`
+ *     - `EPOLLWRBAND`
+ *     - `EPOLLRDHUP`
+ *     - `EPOLLMSG`
+ * @error ENOTSOCK on Windows if fd isn't a socket :(
+ * @return 0 on success, or -1 w/ errno
+ */
+int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev) {
+  int rc;
+  if (!IsWindows()) {
+    rc = sys_epoll_ctl(epfd, op, fd, ev);
+  } else {
+    BLOCK_SIGNALS;
+    rc = sys_epoll_ctl_nt(epfd, op, fd, ev);
+    ALLOW_SIGNALS;
+  }
+  STRACE("epoll_ctl(%d, %d, %d, %p) → %d% m", epfd, op, fd, ev, rc);
+  return rc;
+}
+
+/**
+ * Receives socket events.
+ *
+ * @param events will receive information about what happened
+ * @param maxevents is array length of events
+ * @param timeoutms is milliseconds, 0 to not block, or -1 for forever
+ * @return number of events stored, 0 on timeout, or -1 w/ errno
+ * @cancelationpoint
+ * @norestart
+ */
+int epoll_wait(int epfd, struct epoll_event *events, int maxevents,
+               int timeoutms) {
+  int e, rc;
+  BEGIN_CANCELATION_POINT;
+  if (!IsWindows()) {
+    e = errno;
+    rc = sys_epoll_wait(epfd, events, maxevents, timeoutms);
+    if (rc == -1 && errno == ENOSYS) {
+      errno = e;
+      rc = sys_epoll_pwait(epfd, events, maxevents, timeoutms, 0, 0);
+    }
+  } else {
+    BLOCK_SIGNALS;
+    // eintr/ecanceled not implemented for epoll() on win32 yet
+    rc = sys_epoll_wait_nt(epfd, events, maxevents, timeoutms);
+    ALLOW_SIGNALS;
+  }
+  END_CANCELATION_POINT;
+  STRACE("epoll_wait(%d, %p, %d, %d) → %d% m", epfd, events, maxevents,
+         timeoutms, rc);
+  return rc;
+}
+
+/**
+ * Receives socket events.
+ *
+ * @param events will receive information about what happened
+ * @param maxevents is array length of events
+ * @param timeoutms is milliseconds, 0 to not block, or -1 for forever
+ * @param sigmask is an optional sigprocmask() to use during call
+ * @return number of events stored, 0 on timeout, or -1 w/ errno
+ * @cancelationpoint
+ * @norestart
+ */
+int epoll_pwait(int epfd, struct epoll_event *events, int maxevents,
+                int timeoutms, const sigset_t *sigmask) {
+  int e, rc;
+  sigset_t oldmask;
+  BEGIN_CANCELATION_POINT;
+  if (!IsWindows()) {
+    e = errno;
+    rc = sys_epoll_pwait(epfd, events, maxevents, timeoutms, sigmask,
+                         sizeof(*sigmask));
+    if (rc == -1 && errno == ENOSYS) {
+      errno = e;
+      if (sigmask)
+        sys_sigprocmask(SIG_SETMASK, sigmask, &oldmask);
+      rc = sys_epoll_wait(epfd, events, maxevents, timeoutms);
+      if (sigmask)
+        sys_sigprocmask(SIG_SETMASK, &oldmask, 0);
+    }
+  } else {
+    BLOCK_SIGNALS;
+    // eintr/ecanceled not implemented for epoll() on win32 yet
+    rc = sys_epoll_wait_nt(epfd, events, maxevents, timeoutms);
+    ALLOW_SIGNALS;
+  }
+  END_CANCELATION_POINT;
+  STRACE("epoll_pwait(%d, %p, %d, %d) → %d% m", epfd, events, maxevents,
+         timeoutms, DescribeSigset(0, sigmask), rc);
+  return rc;
+}
diff --git a/libc/sock/epoll.h b/libc/sock/epoll.h
new file mode 100644
index 000000000..ff858f09d
--- /dev/null
+++ b/libc/sock/epoll.h
@@ -0,0 +1,25 @@
+#ifndef COSMOPOLITAN_LIBC_SOCK_WEPOLL_H_
+#define COSMOPOLITAN_LIBC_SOCK_WEPOLL_H_
+COSMOPOLITAN_C_START_
+#include "libc/calls/struct/sigset.h"
+
+typedef union epoll_data {
+  void *ptr;
+  int fd;
+  uint32_t u32;
+  uint64_t u64;
+} epoll_data_t;
+
+struct thatispacked epoll_event {
+  uint32_t events;
+  epoll_data_t data;
+};
+
+int epoll_create(int) libcesque;
+int epoll_create1(int) libcesque;
+int epoll_ctl(int, int, int, struct epoll_event *) libcesque;
+int epoll_wait(int, struct epoll_event *, int, int) libcesque;
+int epoll_pwait(int, struct epoll_event *, int, int, const sigset_t *);
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_SOCK_WEPOLL_H_ */
diff --git a/libc/sock/gethostips.c b/libc/sock/gethostips.c
index 6a1234f0b..0e956c1b0 100644
--- a/libc/sock/gethostips.c
+++ b/libc/sock/gethostips.c
@@ -20,7 +20,7 @@
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/errors.h"
 #include "libc/nt/iphlpapi.h"
diff --git a/libc/sock/getsockname.c b/libc/sock/getsockname.c
index 58012d056..b09543f2f 100644
--- a/libc/sock/getsockname.c
+++ b/libc/sock/getsockname.c
@@ -17,8 +17,8 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
-#include "libc/dce.h"
 #include "libc/intrin/fds.h"
+#include "libc/dce.h"
 #include "libc/intrin/strace.h"
 #include "libc/nt/errors.h"
 #include "libc/nt/thunk/msabi.h"
@@ -28,7 +28,6 @@
 #include "libc/sock/struct/sockaddr.h"
 #include "libc/sock/struct/sockaddr.internal.h"
 #include "libc/sock/syscall_fd.internal.h"
-#include "libc/sysv/consts/af.h"
 #include "libc/sysv/errfuns.h"
 
 __msabi extern typeof(__sys_getsockname_nt) *const __imp_getsockname;
diff --git a/libc/sock/getsockopt-nt.c b/libc/sock/getsockopt-nt.c
index 631cd25e2..a5b3e3dd8 100644
--- a/libc/sock/getsockopt-nt.c
+++ b/libc/sock/getsockopt-nt.c
@@ -39,6 +39,7 @@ textwindows int sys_getsockopt_nt(struct Fd *fd, int level, int optname,
   uint64_t ms;
   uint32_t in_optlen;
   struct linger_nt linger;
+  npassert(fd->kind == kFdSocket);
 
   if (out_opt_optval && inout_optlen) {
     in_optlen = *inout_optlen;
@@ -46,35 +47,28 @@ textwindows int sys_getsockopt_nt(struct Fd *fd, int level, int optname,
     in_optlen = 0;
   }
 
-  if (level == SOL_SOCKET && optname == SO_ERROR) {
-    if (in_optlen < sizeof(int))
-      return einval();
-    int err;
-    uint32_t len = sizeof(err);
-    if (__imp_getsockopt(fd->handle, SOL_SOCKET, SO_ERROR, &err, &len) == -1)
-      return __winsockerr();
-    *(int *)out_opt_optval = __dos2errno(err);
-    *inout_optlen = sizeof(int);
-  }
-
   if (level == SOL_SOCKET &&
       (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)) {
-    if (in_optlen < sizeof(struct timeval))
-      return einval();
-    if (optname == SO_RCVTIMEO) {
-      ms = fd->rcvtimeo;
+    if (in_optlen >= sizeof(struct timeval)) {
+      if (optname == SO_RCVTIMEO) {
+        ms = fd->rcvtimeo;
+      } else {
+        ms = fd->sndtimeo;
+      }
+      ((struct timeval *)out_opt_optval)->tv_sec = ms / 1000;
+      ((struct timeval *)out_opt_optval)->tv_usec = ms % 1000 * 1000;
+      *inout_optlen = sizeof(struct timeval);
+      return 0;
     } else {
-      ms = fd->sndtimeo;
+      return einval();
     }
-    *(struct timeval *)out_opt_optval = timeval_frommillis(ms);
-    *inout_optlen = sizeof(struct timeval);
-    return 0;
   }
 
   // TODO(jart): Use WSAIoctl?
   if (__imp_getsockopt(fd->handle, level, optname, out_opt_optval,
-                       inout_optlen) == -1)
+                       inout_optlen) == -1) {
     return __winsockerr();
+  }
 
   if (level == SOL_SOCKET) {
     if (optname == SO_LINGER && in_optlen == sizeof(struct linger)) {
diff --git a/libc/sock/ifaddrs.c b/libc/sock/ifaddrs.c
index 9ea609e09..01c0d8e05 100644
--- a/libc/sock/ifaddrs.c
+++ b/libc/sock/ifaddrs.c
@@ -18,19 +18,13 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/sock/ifaddrs.h"
 #include "libc/calls/calls.h"
-#include "libc/calls/syscall-sysv.internal.h"
-#include "libc/dce.h"
-#include "libc/limits.h"
 #include "libc/mem/mem.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/ifconf.h"
 #include "libc/sock/struct/ifreq.h"
-#include "libc/sock/struct/sockaddr6.h"
-#include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/af.h"
 #include "libc/sysv/consts/iff.h"
-#include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/sio.h"
 #include "libc/sysv/consts/sock.h"
 
@@ -42,20 +36,6 @@ struct IfAddr {
   struct sockaddr_in bstaddr;
 };
 
-struct IfAddr6Info {
-  int addr_scope;
-  int addr_flags;
-};
-
-struct IfAddr6 {
-  struct ifaddrs ifaddrs;
-  char name[IFNAMSIZ];
-  struct sockaddr_in6 addr;
-  struct sockaddr_in6 netmask;
-  struct sockaddr_in6 bstaddr;  // unused
-  struct IfAddr6Info info;
-};
-
 /**
  * Frees network interface address list.
  */
@@ -68,73 +48,6 @@ void freeifaddrs(struct ifaddrs *ifp) {
   }
 }
 
-// hex repr to network order int
-static uint128_t hex2no(const char *str) {
-  uint128_t res = 0;
-  const int max_quads = sizeof(uint128_t) * 2;
-  int i = 0;
-  while ((i < max_quads) && str[i]) {
-    uint8_t acc = (((str[i] & 0xF) + (str[i] >> 6)) | ((str[i] >> 3) & 0x8));
-    acc = acc << 4;
-    i += 1;
-    if (str[i]) {
-      acc = acc | (((str[i] & 0xF) + (str[i] >> 6)) | ((str[i] >> 3) & 0x8));
-      i += 1;
-    }
-    res = (res >> 8) | (((uint128_t)acc) << ((sizeof(uint128_t) - 1) * 8));
-  }
-  res = res >> ((max_quads - i) * 4);
-  return res;
-}
-
-/**
- * Gets network interface IPv6 address list on linux.
- *
- * @return 0 on success, or -1 w/ errno
- */
-static int getifaddrs_linux_ip6(struct ifconf *conf) {
-  int fd;
-  int n = 0;
-  struct ifreq *ifreq = conf->ifc_req;
-  const int bufsz = 44 + IFNAMSIZ + 1;
-  char buf[bufsz + 1];  // one line max size
-  if ((fd = sys_openat(0, "/proc/net/if_inet6", O_RDONLY, 0)) == -1) {
-    return -1;
-  }
-
-  while ((n = sys_read(fd, &buf[n], bufsz - n)) &&
-         ((char *)ifreq < (conf->ifc_buf + conf->ifc_len))) {
-    // flags linux include/uapi/linux/if_addr.h:44
-    // scope linux include/net/ipv6.h:L99
-
-    //           *addr,   *index,   *plen,    *scope,   *flags,   *ifname
-    char *s[] = {&buf[0], &buf[33], &buf[36], &buf[39], &buf[42], &buf[45]};
-    int ifnamelen = 0;
-    while (*s[5] == ' ') {
-      ++s[5];
-    }
-    while (s[5][ifnamelen] > '\n') {
-      ++ifnamelen;
-    }
-    buf[32] = buf[35] = buf[38] = buf[41] = buf[44] = s[5][ifnamelen] = '\0';
-    bzero(ifreq, sizeof(*ifreq));
-    ifreq->ifr_addr.sa_family = AF_INET6;
-    memcpy(&ifreq->ifr_name, s[5], ifnamelen);
-    *((uint128_t *)&ifreq->ifr6_addr) = hex2no(s[0]);
-    ifreq->ifr6_ifindex = hex2no(s[1]);
-    ifreq->ifr6_prefixlen = hex2no(s[2]);
-    ifreq->ifr6_scope = hex2no(s[3]);
-    ifreq->ifr6_flags = hex2no(s[4]);
-    ++ifreq;
-    int tlen = &s[5][ifnamelen] - &buf[0] + 1;
-    n = bufsz - tlen;
-    memcpy(&buf, &buf[tlen], n);
-  }
-
-  conf->ifc_len = (char *)ifreq - conf->ifc_buf;
-  return sys_close(fd);
-}
-
 /**
  * Gets network interface address list.
  *
@@ -142,7 +55,6 @@ static int getifaddrs_linux_ip6(struct ifconf *conf) {
  * @see tool/viz/getifaddrs.c for example code
  */
 int getifaddrs(struct ifaddrs **out_ifpp) {
-  // printf("%d\n", sizeof(struct ifreq));
   int rc = -1;
   int fd;
   if ((fd = socket(AF_INET, SOCK_DGRAM | SOCK_CLOEXEC, 0)) != -1) {
@@ -153,88 +65,42 @@ int getifaddrs(struct ifaddrs **out_ifpp) {
       conf.ifc_buf = data;
       conf.ifc_len = size;
       if (!ioctl(fd, SIOCGIFCONF, &conf)) {
-        if (IsLinux()) {
-          struct ifconf confl6;
-          confl6.ifc_buf = data + conf.ifc_len;
-          confl6.ifc_len = size - conf.ifc_len;
-          if ((rc = getifaddrs_linux_ip6(&confl6)))
-            return rc;
-          conf.ifc_len += confl6.ifc_len;
-        }
-
         struct ifaddrs *res = 0;
         for (struct ifreq *ifr = (struct ifreq *)data;
              (char *)ifr < data + conf.ifc_len; ++ifr) {
-          uint16_t family = ifr->ifr_addr.sa_family;
-          if (family == AF_INET) {
-            struct IfAddr *addr;
-            if ((addr = calloc(1, sizeof(struct IfAddr)))) {
-              memcpy(addr->name, ifr->ifr_name, IFNAMSIZ);
-              addr->ifaddrs.ifa_name = addr->name;
-              memcpy(&addr->addr, &ifr->ifr_addr, sizeof(struct sockaddr_in));
-              addr->ifaddrs.ifa_addr = (struct sockaddr *)&addr->addr;
-              addr->ifaddrs.ifa_netmask = (struct sockaddr *)&addr->netmask;
-              if (!ioctl(fd, SIOCGIFFLAGS, ifr)) {
-                addr->ifaddrs.ifa_flags = ifr->ifr_flags;
-              }
-              if (!ioctl(fd, SIOCGIFNETMASK, ifr)) {
-                memcpy(&addr->netmask, &ifr->ifr_addr,
-                       sizeof(struct sockaddr_in));
-              }
-              unsigned long op;
-              if (addr->ifaddrs.ifa_flags & IFF_BROADCAST) {
-                op = SIOCGIFBRDADDR;
-              } else if (addr->ifaddrs.ifa_flags & IFF_POINTOPOINT) {
-                op = SIOCGIFDSTADDR;
-              } else {
-                op = 0;
-              }
-              if (op && !ioctl(fd, op, ifr)) {
-                memcpy(&addr->bstaddr, &ifr->ifr_addr,
-                       sizeof(struct sockaddr_in));
-                addr->ifaddrs.ifa_broadaddr =  // is union'd w/ ifu_dstaddr
-                    (struct sockaddr *)&addr->bstaddr;
-              }
-              addr->ifaddrs.ifa_next = res;
-              res = (struct ifaddrs *)addr;
+          if (ifr->ifr_addr.sa_family != AF_INET) {
+            continue;  // TODO(jart): IPv6 support
+          }
+          struct IfAddr *addr;
+          if ((addr = calloc(1, sizeof(struct IfAddr)))) {
+            memcpy(addr->name, ifr->ifr_name, IFNAMSIZ);
+            addr->ifaddrs.ifa_name = addr->name;
+            memcpy(&addr->addr, &ifr->ifr_addr, sizeof(struct sockaddr_in));
+            addr->ifaddrs.ifa_addr = (struct sockaddr *)&addr->addr;
+            addr->ifaddrs.ifa_netmask = (struct sockaddr *)&addr->netmask;
+            if (!ioctl(fd, SIOCGIFFLAGS, ifr)) {
+              addr->ifaddrs.ifa_flags = ifr->ifr_flags;
             }
-          } else if (family == AF_INET6) {
-            struct IfAddr6 *addr6;
-            if ((addr6 = calloc(1, sizeof(struct IfAddr6)))) {
-              addr6->ifaddrs.ifa_name = addr6->name;
-              addr6->ifaddrs.ifa_addr = (struct sockaddr *)&addr6->addr;
-              addr6->ifaddrs.ifa_netmask = (struct sockaddr *)&addr6->netmask;
-              addr6->ifaddrs.ifa_broadaddr = (struct sockaddr *)&addr6->bstaddr;
-              addr6->ifaddrs.ifa_data = (void *)&addr6->info;
-
-              memcpy(&addr6->name, &ifr->ifr_name, IFNAMSIZ);
-              addr6->info.addr_flags = ifr->ifr6_flags;
-              addr6->info.addr_scope = ifr->ifr6_scope;
-
-              addr6->addr.sin6_family = AF_INET6;
-              addr6->addr.sin6_port = 0;
-              addr6->addr.sin6_flowinfo = 0;
-              addr6->addr.sin6_scope_id = ifr->ifr6_ifindex;
-              memcpy(&addr6->addr.sin6_addr, &ifr->ifr6_addr,
-                     sizeof(struct in6_addr));
-
-              addr6->netmask.sin6_family = AF_INET6;
-              addr6->netmask.sin6_port = 0;
-              addr6->netmask.sin6_flowinfo = 0;
-              addr6->addr.sin6_scope_id = ifr->ifr6_ifindex;
-              memcpy(&addr6->netmask.sin6_addr, &ifr->ifr6_addr,
-                     sizeof(struct in6_addr));
-              *((uint128_t *)&(addr6->netmask.sin6_addr)) &=
-                  (UINT128_MAX >> ifr->ifr6_prefixlen);
-
-              if (!ioctl(fd, SIOCGIFFLAGS, ifr)) {
-                addr6->ifaddrs.ifa_flags = ifr->ifr_flags;
-              }
-
-              bzero(&addr6->bstaddr, sizeof(struct sockaddr_in6));
-              addr6->ifaddrs.ifa_next = res;
-              res = (struct ifaddrs *)addr6;
+            if (!ioctl(fd, SIOCGIFNETMASK, ifr)) {
+              memcpy(&addr->netmask, &ifr->ifr_addr,
+                     sizeof(struct sockaddr_in));
             }
+            unsigned long op;
+            if (addr->ifaddrs.ifa_flags & IFF_BROADCAST) {
+              op = SIOCGIFBRDADDR;
+            } else if (addr->ifaddrs.ifa_flags & IFF_POINTOPOINT) {
+              op = SIOCGIFDSTADDR;
+            } else {
+              op = 0;
+            }
+            if (op && !ioctl(fd, op, ifr)) {
+              memcpy(&addr->bstaddr, &ifr->ifr_addr,
+                     sizeof(struct sockaddr_in));
+              addr->ifaddrs.ifa_broadaddr =  // is union'd w/ ifu_dstaddr
+                  (struct sockaddr *)&addr->bstaddr;
+            }
+            addr->ifaddrs.ifa_next = res;
+            res = (struct ifaddrs *)addr;
           }
         }
         *out_ifpp = res;
diff --git a/libc/sock/inet_pton.c b/libc/sock/inet_pton.c
index 8ad1bbd24..dd226eaae 100644
--- a/libc/sock/inet_pton.c
+++ b/libc/sock/inet_pton.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/ctype.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/sock.h"
 #include "libc/sysv/consts/af.h"
diff --git a/libc/sock/internal.h b/libc/sock/internal.h
index 927b531fe..3cc13b061 100644
--- a/libc/sock/internal.h
+++ b/libc/sock/internal.h
@@ -2,7 +2,6 @@
 #define COSMOPOLITAN_LIBC_SOCK_INTERNAL_H_
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/sigset.h"
-#include "libc/calls/struct/timespec.h"
 #include "libc/nt/struct/overlapped.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
@@ -53,6 +52,11 @@ int32_t sys_select(int32_t, fd_set *, fd_set *, fd_set *, struct timeval *);
 int sys_pselect(int, fd_set *, fd_set *, fd_set *, struct timespec *,
                 const void *);
 int sys_setsockopt(int, int, int, const void *, uint32_t);
+int32_t sys_epoll_create(int32_t);
+int32_t sys_epoll_ctl(int32_t, int32_t, int32_t, void *);
+int32_t sys_epoll_wait(int32_t, void *, int32_t, int32_t);
+int32_t sys_epoll_pwait(int32_t, void *, int32_t, int32_t, const sigset_t *,
+                        size_t);
 
 int sys_socket_nt(int, int, int);
 
@@ -61,7 +65,7 @@ int sys_socketpair_nt_stream(int, int, int, int[2]) ;
 int sys_socketpair_nt_dgram(int, int, int, int[2]) ;
 */
 int sys_socketpair_nt(int, int, int, int[2]);
-int sys_select_nt(int, fd_set *, fd_set *, fd_set *, const struct timespec *,
+int sys_select_nt(int, fd_set *, fd_set *, fd_set *, struct timeval *,
                   const sigset_t *);
 
 size_t __iovec2nt(struct NtIovec[hasatleast 16], const struct iovec *, size_t);
diff --git a/libc/sock/iovec2nt.c b/libc/sock/iovec2nt.c
index 3f13b22a2..cba7dce6b 100644
--- a/libc/sock/iovec2nt.c
+++ b/libc/sock/iovec2nt.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/iovec.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/internal.h"
 #include "libc/sysv/consts/iov.h"
 
@@ -29,21 +29,15 @@
  */
 textwindows size_t __iovec2nt(struct NtIovec iovnt[hasatleast 16],
                               const struct iovec *iov, size_t iovlen) {
-  size_t i, j, limit = 0x7ffff000;
-  for (j = i = 0; i < iovlen; ++i) {
-    if (!iov[i].iov_len)
-      continue;
-    if (j == 16)
-      break;
-    iovnt[j].buf = iov[i].iov_base;
+  size_t i, limit;
+  for (limit = 0x7ffff000, i = 0; i < MIN(16, iovlen); ++i) {
+    iovnt[i].buf = iov[i].iov_base;
     if (iov[i].iov_len < limit) {
-      limit -= (iovnt[j].len = iov[i].iov_len);
-      ++j;
+      limit -= (iovnt[i].len = iov[i].iov_len);
     } else {
-      iovnt[j].len = limit;
-      ++j;
+      iovnt[i].len = limit;
       break;
     }
   }
-  return j;
+  return i;
 }
diff --git a/libc/sock/kntwsadata.c b/libc/sock/kntwsadata.c
index 6e03dc588..2c08015e1 100644
--- a/libc/sock/kntwsadata.c
+++ b/libc/sock/kntwsadata.c
@@ -51,7 +51,3 @@ textwindows void WinSockInit(void) {
     _Exit(1);
   }
 }
-
-textwindows dontinstrument void WinSockFork(void) {
-  WSAStartup(VERSION, &kNtWsaData);
-}
diff --git a/libc/sock/listen-nt.c b/libc/sock/listen-nt.c
index f39c9b313..b82bf4a5d 100644
--- a/libc/sock/listen-nt.c
+++ b/libc/sock/listen-nt.c
@@ -20,28 +20,18 @@
 #include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
-#include "libc/sock/struct/sockaddr.h"
 #include "libc/sock/syscall_fd.internal.h"
-#include "libc/sysv/consts/af.h"
-#include "libc/sysv/consts/fio.h"
 #ifdef __x86_64__
 
 __msabi extern typeof(__sys_listen_nt) *const __imp_listen;
 
-textwindows int sys_listen_nt(struct Fd *f, int backlog) {
-  unassert(f->kind == kFdSocket);
-
-  // winsock listen() requires bind() be called beforehand
-  if (!f->isbound) {
-    struct sockaddr_in sin = {AF_INET};
-    if (sys_bind_nt(f, (struct sockaddr *)&sin, sizeof(sin)) == -1)
-      return -1;
-  }
-
-  if (__imp_listen(f->handle, backlog) == -1)
+textwindows int sys_listen_nt(struct Fd *fd, int backlog) {
+  npassert(fd->kind == kFdSocket);
+  if (__imp_listen(fd->handle, backlog) != -1) {
+    return 0;
+  } else {
     return __winsockerr();
-
-  return 0;
+  }
 }
 
 #endif /* __x86_64__ */
diff --git a/libc/sock/recv-nt.c b/libc/sock/recv-nt.c
index 013fc930a..1652c113e 100644
--- a/libc/sock/recv-nt.c
+++ b/libc/sock/recv-nt.c
@@ -17,17 +17,15 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
+#include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/nt/struct/iovec.h"
-#include "libc/nt/struct/overlapped.h"
-#include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/syscall_fd.internal.h"
-#include "libc/sysv/consts/fio.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/vga/vga.internal.h"
 #ifdef __x86_64__
 
 #define _MSG_OOB      1
@@ -35,15 +33,13 @@
 #define _MSG_WAITALL  8
 #define _MSG_DONTWAIT 64
 
-__msabi extern typeof(__sys_ioctlsocket_nt) *const __imp_ioctlsocket;
-
 struct RecvArgs {
   const struct iovec *iov;
   size_t iovlen;
   struct NtIovec iovnt[16];
 };
 
-textwindows static int sys_recv_nt_start(int64_t handle,
+static textwindows int sys_recv_nt_start(int64_t handle,
                                          struct NtOverlapped *overlap,
                                          uint32_t *flags, void *arg) {
   struct RecvArgs *args = arg;
@@ -54,28 +50,18 @@ textwindows static int sys_recv_nt_start(int64_t handle,
 
 textwindows ssize_t sys_recv_nt(int fd, const struct iovec *iov, size_t iovlen,
                                 uint32_t flags) {
-  if (flags & ~(_MSG_DONTWAIT | _MSG_OOB | _MSG_PEEK | _MSG_WAITALL))
+  if (flags & ~(_MSG_DONTWAIT | _MSG_OOB | _MSG_PEEK | _MSG_WAITALL)) {
     return einval();
+  }
   ssize_t rc;
   struct Fd *f = g_fds.p + fd;
-  sigset_t waitmask = __sig_block();
-
-  // "Be aware that if the underlying transport provider does not
-  //  support MSG_WAITALL, or if the socket is in a non-blocking mode,
-  //  then this call will fail with WSAEOPNOTSUPP. Also, if MSG_WAITALL
-  //  is specified along with MSG_OOB, MSG_PEEK, or MSG_PARTIAL, then
-  //  this call will fail with WSAEOPNOTSUPP."
-  //                             —Quoth MSDN § WSARecv
-  if (flags & _MSG_WAITALL)
-    __imp_ioctlsocket(f->handle, FIONBIO, (uint32_t[]){0});
-
-  rc = __winsock_block(f->handle, flags & ~_MSG_DONTWAIT,
-                       (f->flags & O_NONBLOCK) || (flags & _MSG_DONTWAIT),
-                       f->rcvtimeo, waitmask, sys_recv_nt_start,
-                       &(struct RecvArgs){iov, iovlen});
-
-  __sig_unblock(waitmask);
-
+  sigset_t m = __sig_block();
+  bool nonblock = !(flags & _MSG_WAITALL) &&
+                  ((f->flags & O_NONBLOCK) || (flags & _MSG_DONTWAIT));
+  flags &= ~_MSG_DONTWAIT;
+  rc = __winsock_block(f->handle, flags, nonblock, f->rcvtimeo, m,
+                       sys_recv_nt_start, &(struct RecvArgs){iov, iovlen});
+  __sig_unblock(m);
   return rc;
 }
 
diff --git a/libc/sock/recv.c b/libc/sock/recv.c
index 31003e7fb..ec85a9b48 100644
--- a/libc/sock/recv.c
+++ b/libc/sock/recv.c
@@ -20,55 +20,31 @@
 #include "libc/calls/internal.h"
 #include "libc/calls/struct/iovec.internal.h"
 #include "libc/dce.h"
-#include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/syscall_fd.internal.h"
-#include "libc/sysv/consts/msg.h"
 #include "libc/sysv/errfuns.h"
 
 /**
  * Receives data from network socket.
  *
- * Calling `recv(fd, p, n, 0)` is equivalent to `read(fd, p, n)`.
- *
- * Unlike files where the OS tries very hard to fulfill the entire
- * requested `size` before returning, read operations on sockets aim to
- * return as quickly as possible. For example, if 10 bytes are requested
- * and a packet comes in with only 5 bytes, then recv() will most likely
- * return those 5 bytes before waiting longer. The `MSG_WAITALL` flag
- * may be passed when waiting longer is desired. In that case, short
- * reads should only be possible when the connection status changes or
- * the receive operation is interrupted by a signal.
- *
  * @param fd is the file descriptor returned by socket()
  * @param buf is where received network data gets copied
  * @param size is the byte capacity of buf
  * @param flags can have `MSG_OOB`, `MSG_PEEK`, `MSG_DONTWAIT`, `MSG_WAITALL`
  * @return number of bytes received, 0 on remote close, or -1 w/ errno
- * @raise EINTR if signal handler was called instead
- * @raise EINVAL if unknown bits were passed in `flags`
- * @raise EINVAL if flag isn't supported by host operating system
- * @raise EINVAL if `MSG_WAITALL` and `MSG_PEEK` were both passed
- * @raise EBADF if `fd` is an invalid file descriptor
- * @raise EAGAIN if `MSG_DONTWAIT` was passed and no data was available
- * @raise EAGAIN if `O_NONBLOCK` is in play and no data was available
  * @error EINTR, EHOSTUNREACH, ECONNRESET (UDP ICMP Port Unreachable),
  *     EPIPE (if MSG_NOSIGNAL), EMSGSIZE, ENOTSOCK, EFAULT, etc.
  * @cancelationpoint
  * @asyncsignalsafe
- * @restartable (unless SO_RCVTIMEO on Linux or Windows)
+ * @restartable (unless SO_RCVTIMEO)
  */
 ssize_t recv(int fd, void *buf, size_t size, int flags) {
   ssize_t rc;
   BEGIN_CANCELATION_POINT;
 
-  if ((flags & (MSG_WAITALL | MSG_PEEK)) == (MSG_WAITALL | MSG_PEEK)) {
-    // this is possible on some OSes like Linux but it breaks FreeBSD
-    // and Windows will raise EOPNOTSUPP when it gets passed together
-    return einval();
-  } else if (fd < g_fds.n && g_fds.p[fd].kind == kFdZip) {
+  if (fd < g_fds.n && g_fds.p[fd].kind == kFdZip) {
     rc = enotsock();
   } else if (!IsWindows()) {
     rc = sys_recvfrom(fd, buf, size, flags, 0, 0);
@@ -89,8 +65,7 @@ ssize_t recv(int fd, void *buf, size_t size, int flags) {
   }
 
   END_CANCELATION_POINT;
-  DATATRACE("recv(%d, [%#.*hhs%s], %'zu, %s) → %'ld% lm", fd,
-            MAX(0, MIN(40, rc)), buf, rc > 40 ? "..." : "", size,
-            DescribeMsg(flags), rc);
+  DATATRACE("recv(%d, [%#.*hhs%s], %'zu, %#x) → %'ld% lm", fd,
+            MAX(0, MIN(40, rc)), buf, rc > 40 ? "..." : "", size, flags);
   return rc;
 }
diff --git a/libc/sock/recvfrom-nt.c b/libc/sock/recvfrom-nt.c
index e40e6ed6a..69f436d07 100644
--- a/libc/sock/recvfrom-nt.c
+++ b/libc/sock/recvfrom-nt.c
@@ -17,9 +17,9 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/intrin/fds.h"
 #include "libc/nt/struct/iovec.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
@@ -42,7 +42,7 @@ struct RecvFromArgs {
   struct NtIovec iovnt[16];
 };
 
-textwindows static int sys_recvfrom_nt_start(int64_t handle,
+static textwindows int sys_recvfrom_nt_start(int64_t handle,
                                              struct NtOverlapped *overlap,
                                              uint32_t *flags, void *arg) {
   struct RecvFromArgs *args = arg;
@@ -59,13 +59,14 @@ textwindows ssize_t sys_recvfrom_nt(int fd, const struct iovec *iov,
     return einval();
   ssize_t rc;
   struct Fd *f = g_fds.p + fd;
-  sigset_t waitmask = __sig_block();
-  rc = __winsock_block(f->handle, flags & ~_MSG_DONTWAIT,
-                       (f->flags & O_NONBLOCK) || (flags & _MSG_DONTWAIT),
-                       f->rcvtimeo, waitmask, sys_recvfrom_nt_start,
+  sigset_t m = __sig_block();
+  bool nonblock = (f->flags & O_NONBLOCK) || (flags & _MSG_DONTWAIT);
+  flags &= ~_MSG_DONTWAIT;
+  rc = __winsock_block(f->handle, flags, nonblock, f->rcvtimeo, m,
+                       sys_recvfrom_nt_start,
                        &(struct RecvFromArgs){iov, iovlen, opt_out_srcaddr,
                                               opt_inout_srcaddrsize});
-  __sig_unblock(waitmask);
+  __sig_unblock(m);
   return rc;
 }
 
diff --git a/libc/sock/recvfrom.c b/libc/sock/recvfrom.c
index dbcda5b2c..d5e7565cf 100644
--- a/libc/sock/recvfrom.c
+++ b/libc/sock/recvfrom.c
@@ -21,7 +21,6 @@
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/iovec.internal.h"
 #include "libc/dce.h"
-#include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
@@ -50,7 +49,7 @@
  *     EPIPE (if MSG_NOSIGNAL), EMSGSIZE, ENOTSOCK, EFAULT, etc.
  * @cancelationpoint
  * @asyncsignalsafe
- * @restartable (unless SO_RCVTIMEO on Linux or Windows)
+ * @restartable (unless SO_RCVTIMEO)
  */
 ssize_t recvfrom(int fd, void *buf, size_t size, int flags,
                  struct sockaddr *opt_out_srcaddr,
@@ -96,11 +95,7 @@ ssize_t recvfrom(int fd, void *buf, size_t size, int flags,
   }
 
   END_CANCELATION_POINT;
-  DATATRACE(
-      "recvfrom(%d, [%#.*hhs%s], %'zu, %s, %s) → %'ld% lm", fd,
-      MAX(0, MIN(40, rc)), buf, rc > 40 ? "..." : "", size, DescribeMsg(flags),
-      DescribeSockaddr(opt_out_srcaddr,
-                       opt_inout_srcaddrsize ? *opt_inout_srcaddrsize : 0),
-      rc);
+  DATATRACE("recvfrom(%d, [%#.*hhs%s], %'zu, %#x) → %'ld% lm", fd,
+            MAX(0, MIN(40, rc)), buf, rc > 40 ? "..." : "", size, flags, rc);
   return rc;
 }
diff --git a/libc/sock/select.internal.h b/libc/sock/select.internal.h
index 5de28441a..d565635a6 100644
--- a/libc/sock/select.internal.h
+++ b/libc/sock/select.internal.h
@@ -4,8 +4,8 @@
 #include "libc/sock/select.h"
 COSMOPOLITAN_C_START_
 
-const char *_DescribeFdSet(char[100], ssize_t, int, fd_set *) libcesque;
-#define DescribeFdSet(x, y, z) _DescribeFdSet(alloca(100), x, y, z)
+const char *DescribeFdSet(char[100], ssize_t, int, fd_set *) libcesque;
+#define DescribeFdSet(x, y, z) DescribeFdSet(alloca(100), x, y, z)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_SOCK_SELECT_INTERNAL_H_ */
diff --git a/libc/sock/send-nt.c b/libc/sock/send-nt.c
index ba0570665..63cf646cf 100644
--- a/libc/sock/send-nt.c
+++ b/libc/sock/send-nt.c
@@ -17,25 +17,20 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
-#include "libc/calls/sig.internal.h"
-#include "libc/calls/struct/iovec.internal.h"
+#include "libc/intrin/fds.h"
+#include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/errno.h"
-#include "libc/nt/errors.h"
 #include "libc/nt/struct/iovec.h"
-#include "libc/nt/struct/overlapped.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/sig.h"
+#include "libc/sock/syscall_fd.internal.h"
+#include "libc/sysv/consts/o.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/vga/vga.internal.h"
 #ifdef __x86_64__
 
 #define _MSG_OOB       1
 #define _MSG_DONTROUTE 4
 #define _MSG_DONTWAIT  64
-#define _MSG_NOSIGNAL  0x10000000
 
 struct SendArgs {
   const struct iovec *iov;
@@ -43,7 +38,7 @@ struct SendArgs {
   struct NtIovec iovnt[16];
 };
 
-textwindows static int sys_send_nt_start(int64_t handle,
+static textwindows int sys_send_nt_start(int64_t handle,
                                          struct NtOverlapped *overlap,
                                          uint32_t *flags, void *arg) {
   struct SendArgs *args = arg;
@@ -54,25 +49,16 @@ textwindows static int sys_send_nt_start(int64_t handle,
 
 textwindows ssize_t sys_send_nt(int fd, const struct iovec *iov, size_t iovlen,
                                 uint32_t flags) {
-  if (flags & ~(_MSG_DONTWAIT | _MSG_OOB | _MSG_DONTROUTE | _MSG_NOSIGNAL))
+  if (flags & ~(_MSG_DONTWAIT | _MSG_OOB | _MSG_DONTROUTE))
     return einval();
   ssize_t rc;
   struct Fd *f = g_fds.p + fd;
-  sigset_t waitmask = __sig_block();
-
-  rc = __winsock_block(f->handle, flags & ~(_MSG_DONTWAIT | _MSG_NOSIGNAL),
-                       false, f->sndtimeo, waitmask, sys_send_nt_start,
-                       &(struct SendArgs){iov, iovlen});
-
-  __sig_unblock(waitmask);
-
-  if (rc == -1 && (errno == WSAESHUTDOWN ||      // ESHUTDOWN
-                   errno == WSAECONNABORTED)) {  // ECONNABORTED
-    errno = kNtErrorBrokenPipe;                  // EPIPE
-    if (!(flags & _MSG_NOSIGNAL))
-      __sig_raise(SIGPIPE, SI_KERNEL);
-  }
-
+  sigset_t m = __sig_block();
+  bool nonblock = (f->flags & O_NONBLOCK) || (flags & _MSG_DONTWAIT);
+  flags &= ~_MSG_DONTWAIT;
+  rc = __winsock_block(f->handle, flags, nonblock, f->sndtimeo, m,
+                       sys_send_nt_start, &(struct SendArgs){iov, iovlen});
+  __sig_unblock(m);
   return rc;
 }
 
diff --git a/libc/sock/send.c b/libc/sock/send.c
index 81831b27c..4d9df49ad 100644
--- a/libc/sock/send.c
+++ b/libc/sock/send.c
@@ -21,9 +21,8 @@
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/iovec.internal.h"
 #include "libc/dce.h"
-#include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/sock.h"
 #include "libc/sysv/errfuns.h"
@@ -31,18 +30,6 @@
 /**
  * Sends data to network socket.
  *
- * Calling `send(fd, p, n, 0)` is equivalent to `write(fd, p, n)`.
- *
- * On Windows, calling send() or write() on a socket in `O_NONBLOCK`
- * mode will block. This is done for many reasons. First, most UNIX OSes
- * have a similar behavior, due to how little code checks the return
- * status of write(). Secondly, WIN32 has bugs that prevent us from
- * canceling an overlapped WSASend() operation safely. Programs that
- * want to avoid send() blocking should call poll() beforehand with the
- * POLLOUT flag to test when the socket can safely be written without
- * blocking. It's also possible to pass `MSG_DONTWAIT` via `flags` in
- * which case send() will do this for you automatically.
- *
  * @param fd is the file descriptor returned by socket()
  * @param buf is the data to send, which we'll copy if necessary
  * @param size is the byte-length of buf
@@ -52,7 +39,7 @@
  *     EPIPE (if MSG_NOSIGNAL), EMSGSIZE, ENOTSOCK, EFAULT, etc.
  * @cancelationpoint
  * @asyncsignalsafe
- * @restartable (unless SO_SNDTIMEO on Linux or Windows)
+ * @restartable (unless SO_RCVTIMEO)
  */
 ssize_t send(int fd, const void *buf, size_t size, int flags) {
   ssize_t rc;
@@ -79,7 +66,7 @@ ssize_t send(int fd, const void *buf, size_t size, int flags) {
   }
 
   END_CANCELATION_POINT;
-  DATATRACE("send(%d, %#.*hhs%s, %'zu, %s) → %'ld% lm", fd, MAX(0, MIN(40, rc)),
-            buf, rc > 40 ? "..." : "", size, DescribeMsg(flags), rc);
+  DATATRACE("send(%d, %#.*hhs%s, %'zu, %#x) → %'ld% lm", fd,
+            MAX(0, MIN(40, rc)), buf, rc > 40 ? "..." : "", size, flags, rc);
   return rc;
 }
diff --git a/libc/sock/sendfile.c b/libc/sock/sendfile.c
index 15a5c03cf..26d7cef55 100644
--- a/libc/sock/sendfile.c
+++ b/libc/sock/sendfile.c
@@ -38,7 +38,6 @@
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/sendfile.internal.h"
-#include "libc/sock/syscall_fd.internal.h"
 #include "libc/sock/wsaid.internal.h"
 #include "libc/stdio/sysparam.h"
 #include "libc/sysv/errfuns.h"
@@ -59,14 +58,13 @@ static void transmitfile_init(void) {
   g_transmitfile.lpTransmitFile = __get_wsaid(&TransmitfileGuid);
 }
 
-textwindows dontinline static ssize_t sys_sendfile_nt(
+static dontinline textwindows ssize_t sys_sendfile_nt(
     int outfd, int infd, int64_t *opt_in_out_inoffset, uint32_t uptobytes) {
   ssize_t rc;
   uint32_t flags = 0;
-  bool locked = false;
   int64_t ih, oh, eof, offset;
   struct NtByHandleFileInformation wst;
-  if (!__isfdkind(infd, kFdFile) || !g_fds.p[infd].cursor)
+  if (!__isfdkind(infd, kFdFile))
     return ebadf();
   if (!__isfdkind(outfd, kFdSocket))
     return ebadf();
@@ -75,9 +73,7 @@ textwindows dontinline static ssize_t sys_sendfile_nt(
   if (opt_in_out_inoffset) {
     offset = *opt_in_out_inoffset;
   } else {
-    locked = true;
-    __cursor_lock(g_fds.p[infd].cursor);
-    offset = g_fds.p[infd].cursor->shared->pointer;
+    offset = g_fds.p[infd].pointer;
   }
   if (GetFileInformationByHandle(ih, &wst)) {
     // TransmitFile() returns EINVAL if `uptobytes` goes past EOF.
@@ -86,22 +82,20 @@ textwindows dontinline static ssize_t sys_sendfile_nt(
       uptobytes = eof - offset;
     }
   } else {
-    if (locked)
-      __cursor_unlock(g_fds.p[infd].cursor);
     return ebadf();
   }
+  BLOCK_SIGNALS;
   struct NtOverlapped ov = {.hEvent = WSACreateEvent(), .Pointer = offset};
   cosmo_once(&g_transmitfile.once, transmitfile_init);
-  if (ov.hEvent &&
-      (g_transmitfile.lpTransmitFile(oh, ih, uptobytes, 0, &ov, 0, 0) ||
-       WSAGetLastError() == kNtErrorIoPending ||
-       WSAGetLastError() == WSAEINPROGRESS)) {
+  if (g_transmitfile.lpTransmitFile(oh, ih, uptobytes, 0, &ov, 0, 0) ||
+      WSAGetLastError() == kNtErrorIoPending ||
+      WSAGetLastError() == WSAEINPROGRESS) {
     if (WSAGetOverlappedResult(oh, &ov, &uptobytes, true, &flags)) {
       rc = uptobytes;
       if (opt_in_out_inoffset) {
         *opt_in_out_inoffset = offset + rc;
       } else {
-        g_fds.p[infd].cursor->shared->pointer = offset + rc;
+        g_fds.p[infd].pointer = offset + rc;
       }
     } else {
       rc = __winsockerr();
@@ -109,9 +103,8 @@ textwindows dontinline static ssize_t sys_sendfile_nt(
   } else {
     rc = __winsockerr();
   }
-  if (locked)
-    __cursor_unlock(g_fds.p[infd].cursor);
   WSACloseEvent(ov.hEvent);
+  ALLOW_SIGNALS;
   return rc;
 }
 
@@ -193,9 +186,7 @@ ssize_t sendfile(int outfd, int infd, int64_t *opt_in_out_inoffset,
   } else if (IsFreebsd() || IsXnu()) {
     rc = sys_sendfile_bsd(outfd, infd, opt_in_out_inoffset, uptobytes);
   } else if (IsWindows()) {
-    BLOCK_SIGNALS;
     rc = sys_sendfile_nt(outfd, infd, opt_in_out_inoffset, uptobytes);
-    ALLOW_SIGNALS;
   } else {
     rc = enosys();
   }
diff --git a/libc/sock/sendto-nt.c b/libc/sock/sendto-nt.c
index f0be2f4c9..2daf9badc 100644
--- a/libc/sock/sendto-nt.c
+++ b/libc/sock/sendto-nt.c
@@ -17,26 +17,20 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
-#include "libc/calls/sig.internal.h"
+#include "libc/intrin/fds.h"
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/sigset.internal.h"
-#include "libc/errno.h"
-#include "libc/intrin/fds.h"
-#include "libc/nt/errors.h"
 #include "libc/nt/struct/iovec.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/syscall_fd.internal.h"
 #include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
 #ifdef __x86_64__
 
 #define _MSG_OOB       1
 #define _MSG_DONTROUTE 4
 #define _MSG_DONTWAIT  64
-#define _MSG_NOSIGNAL  0x10000000
 
 struct SendToArgs {
   const struct iovec *iov;
@@ -58,26 +52,17 @@ static textwindows int sys_sendto_nt_start(int64_t handle,
 textwindows ssize_t sys_sendto_nt(int fd, const struct iovec *iov,
                                   size_t iovlen, uint32_t flags,
                                   void *opt_in_addr, uint32_t in_addrsize) {
-  if (flags & ~(_MSG_DONTWAIT | _MSG_OOB | _MSG_DONTROUTE | _MSG_NOSIGNAL))
+  if (flags & ~(_MSG_DONTWAIT | _MSG_OOB | _MSG_DONTROUTE))
     return einval();
   ssize_t rc;
   struct Fd *f = g_fds.p + fd;
-  sigset_t waitmask = __sig_block();
-
-  rc = __winsock_block(f->handle, flags & ~(_MSG_DONTWAIT | _MSG_NOSIGNAL),
-                       false, f->sndtimeo, waitmask, sys_sendto_nt_start,
-                       &(struct SendToArgs){iov, iovlen,  //
-                                            opt_in_addr, in_addrsize});
-
-  __sig_unblock(waitmask);
-
-  if (rc == -1 && (errno == WSAESHUTDOWN ||      // ESHUTDOWN
-                   errno == WSAECONNABORTED)) {  // ECONNABORTED
-    errno = kNtErrorBrokenPipe;                  // EPIPE
-    if (!(flags & _MSG_NOSIGNAL))
-      __sig_raise(SIGPIPE, SI_KERNEL);
-  }
-
+  sigset_t m = __sig_block();
+  bool nonblock = (f->flags & O_NONBLOCK) || (flags & _MSG_DONTWAIT);
+  flags &= ~_MSG_DONTWAIT;
+  rc = __winsock_block(
+      f->handle, flags, nonblock, f->sndtimeo, m, sys_sendto_nt_start,
+      &(struct SendToArgs){iov, iovlen, opt_in_addr, in_addrsize});
+  __sig_unblock(m);
   return rc;
 }
 
diff --git a/libc/sock/sendto.c b/libc/sock/sendto.c
index f5a458aeb..803e48fa0 100644
--- a/libc/sock/sendto.c
+++ b/libc/sock/sendto.c
@@ -22,9 +22,8 @@
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/iovec.internal.h"
 #include "libc/dce.h"
-#include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/sockaddr.h"
@@ -52,7 +51,7 @@
  *     EPIPE (if MSG_NOSIGNAL), EMSGSIZE, ENOTSOCK, EFAULT, etc.
  * @cancelationpoint
  * @asyncsignalsafe
- * @restartable (unless SO_SNDTIMEO on Linux or Windows)
+ * @restartable (unless SO_RCVTIMEO)
  */
 ssize_t sendto(int fd, const void *buf, size_t size, int flags,
                const struct sockaddr *opt_addr, uint32_t addrsize) {
@@ -89,8 +88,8 @@ ssize_t sendto(int fd, const void *buf, size_t size, int flags,
   }
 
   END_CANCELATION_POINT;
-  DATATRACE("sendto(%d, %#.*hhs%s, %'zu, %s, %s) → %'ld% lm", fd,
-            MAX(0, MIN(40, rc)), buf, rc > 40 ? "..." : "", size,
-            DescribeMsg(flags), DescribeSockaddr(opt_addr, addrsize), rc);
+  DATATRACE("sendto(%d, %#.*hhs%s, %'zu, %#x, %p, %u) → %'ld% lm", fd,
+            MAX(0, MIN(40, rc)), buf, rc > 40 ? "..." : "", size, flags,
+            opt_addr, addrsize, rc);
   return rc;
 }
diff --git a/libc/sock/setsockopt-nt.c b/libc/sock/setsockopt-nt.c
index 670c9a150..f9e7a6a4d 100644
--- a/libc/sock/setsockopt-nt.c
+++ b/libc/sock/setsockopt-nt.c
@@ -16,8 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timeval.h"
 #include "libc/intrin/fds.h"
+#include "libc/calls/struct/timeval.h"
 #include "libc/nt/struct/linger.h"
 #include "libc/nt/thunk/msabi.h"
 #include "libc/nt/winsock.h"
@@ -36,17 +36,14 @@ textwindows int sys_setsockopt_nt(struct Fd *fd, int level, int optname,
                                   const void *optval, uint32_t optlen) {
 
   // socket read/write timeouts
-  // timeout of zero means wait forever (default)
   if (level == SOL_SOCKET &&
       (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)) {
-    if (!optval)
-      return einval();
-    if (optlen < sizeof(struct timeval))
+    if (!(optval && optlen == sizeof(struct timeval)))
       return einval();
     const struct timeval *tv = optval;
     int64_t ms = timeval_tomillis(*tv);
-    if (ms > -1u)
-      ms = -1u;
+    if (ms >= 0xffffffffu)
+      ms = 0;  // wait forever (default)
     if (optname == SO_RCVTIMEO)
       fd->rcvtimeo = ms;
     if (optname == SO_SNDTIMEO)
@@ -54,7 +51,7 @@ textwindows int sys_setsockopt_nt(struct Fd *fd, int level, int optname,
     return 0;  // we want to handle this on our own
   }
 
-  // how to make close() a blocking i/o call lool
+  // how to make close() a blocking i/o call
   union {
     uint32_t millis;
     struct linger_nt linger;
diff --git a/libc/sock/sock.h b/libc/sock/sock.h
index 64221af63..43bcf8fb2 100644
--- a/libc/sock/sock.h
+++ b/libc/sock/sock.h
@@ -17,22 +17,21 @@ libcesque uint32_t ntohl(uint32_t) pureconst;
 #define ntohl(x) __builtin_bswap32(x)
 #endif
 
-/* clang-format off */
 const char *inet_ntop(int, const void *, char *, uint32_t) libcesque;
 int inet_pton(int, const char *, void *) libcesque;
 uint32_t inet_addr(const char *) libcesque;
 libcesque uint32_t *GetHostIps(void) __wur;
 
 int socket(int, int, int) libcesque;
-int listen(int, int) libcesque __fd_arg(1);
-int shutdown(int, int) libcesque __fd_arg(1);
-ssize_t send(int, const void *, size_t, int) libcesque __fd_arg(1) __read_only(2, 3);
-ssize_t recv(int, void *, size_t, int) libcesque __fd_arg(1) __write_only(2, 3);
-ssize_t sendfile(int, int, int64_t *, size_t) libcesque __fd_arg(1) __fd_arg(2) __read_write(3);
-int getsockopt(int, int, int, void *, uint32_t *) libcesque __fd_arg(1);
-int setsockopt(int, int, int, const void *, uint32_t) libcesque __fd_arg(1);
+int listen(int, int) libcesque;
+int shutdown(int, int) libcesque;
+ssize_t send(int, const void *, size_t, int) libcesque;
+ssize_t recv(int, void *, size_t, int) libcesque;
+ssize_t sendfile(int, int, int64_t *, size_t) libcesque;
+int getsockopt(int, int, int, void *, uint32_t *) libcesque;
+int setsockopt(int, int, int, const void *, uint32_t) libcesque;
 int socketpair(int, int, int, int[2]) libcesque;
-int sockatmark(int) libcesque __fd_arg(1);
+int sockatmark(int) libcesque;
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_SOCK_SOCK_H_ */
diff --git a/libc/sock/sockaddr.c b/libc/sock/sockaddr.c
index 36973bde3..f72f664ac 100644
--- a/libc/sock/sockaddr.c
+++ b/libc/sock/sockaddr.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/sock/struct/sockaddr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/sockaddr.h"
 #include "libc/sock/struct/sockaddr.internal.h"
diff --git a/libc/sock/sockaddr2linux.c b/libc/sock/sockaddr2linux.c
index 26fd4e0f4..69efdb9f1 100644
--- a/libc/sock/sockaddr2linux.c
+++ b/libc/sock/sockaddr2linux.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/internal.h"
 #include "libc/sock/struct/sockaddr.internal.h"
 #include "libc/sock/struct/sockaddr6-bsd.internal.h"
diff --git a/libc/sock/sockdebug.c b/libc/sock/sockdebug.c
index dbdb3e094..fd98aeb30 100644
--- a/libc/sock/sockdebug.c
+++ b/libc/sock/sockdebug.c
@@ -19,7 +19,7 @@
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/sockaddr.h"
 #include "libc/sock/struct/sockaddr6.h"
@@ -28,8 +28,8 @@
 #include "libc/sysv/consts/ipproto.h"
 #include "libc/sysv/consts/sock.h"
 
-const char *_DescribeSockaddr(char buf[128], const struct sockaddr *sa,
-                              size_t sasize) {
+const char *(DescribeSockaddr)(char buf[128], const struct sockaddr *sa,
+                               size_t sasize) {
   int e;
   size_t n;
   char *p, ip[72];
@@ -56,7 +56,7 @@ const char *_DescribeSockaddr(char buf[128], const struct sockaddr *sa,
         p = stpcpy(p, ip);
         *p++ = ']';
         *p++ = ':';
-        p = FormatUint32(p, ntohs(in6->sin6_port));
+        p = FormatUint32(p, in6->sin6_port);
       }
     } else if (sa->sa_family == AF_UNIX &&
                sasize >= sizeof(struct sockaddr_un)) {
diff --git a/libc/sock/socketpair-nt.c b/libc/sock/socketpair-nt.c
index 833de4b82..8a6bdb625 100644
--- a/libc/sock/socketpair-nt.c
+++ b/libc/sock/socketpair-nt.c
@@ -18,7 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/internal.h"
 #include "libc/calls/state.internal.h"
-#include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/nt/createfile.h"
 #include "libc/nt/enum/accessmask.h"
@@ -34,8 +33,7 @@
 #include "libc/sysv/errfuns.h"
 #ifdef __x86_64__
 
-textwindows static int sys_socketpair_nt_impl(int family, int type, int proto,
-                                              int sv[2]) {
+textwindows int sys_socketpair_nt(int family, int type, int proto, int sv[2]) {
   uint32_t mode;
   int64_t hpipe, h1;
   char16_t pipename[64];
@@ -113,12 +111,4 @@ textwindows static int sys_socketpair_nt_impl(int family, int type, int proto,
   return rc;
 }
 
-textwindows int sys_socketpair_nt(int family, int type, int proto, int sv[2]) {
-  int rc;
-  BLOCK_SIGNALS;
-  rc = sys_socketpair_nt_impl(family, type, proto, sv);
-  ALLOW_SIGNALS;
-  return rc;
-}
-
 #endif /* __x86_64__ */
diff --git a/libc/sock/struct/ifreq.h b/libc/sock/struct/ifreq.h
index 1f6317cb6..0f7061f5f 100644
--- a/libc/sock/struct/ifreq.h
+++ b/libc/sock/struct/ifreq.h
@@ -1,7 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_SOCK_STRUCT_IFREQ_H_
 #define COSMOPOLITAN_LIBC_SOCK_STRUCT_IFREQ_H_
 #include "libc/sock/struct/sockaddr.h"
-#include "libc/sock/struct/sockaddr6.h"
 COSMOPOLITAN_C_START_
 
 #define IF_NAMESIZE 16
@@ -12,14 +11,6 @@ struct ifreq {
     char ifrn_name[IFNAMSIZ]; /* Interface name, e.g. "en0".  */
   } ifr_ifrn;
   union {
-    struct {
-      uint16_t sa_family;
-      uint16_t ifr6_ifindex;  /* Interface index */
-      uint16_t ifr6_flags;    /* Flags */
-      uint8_t ifr6_scope;     /* Addr scope */
-      uint8_t ifr6_prefixlen; /* Prefix length */
-      struct in6_addr ifr6_addr;
-    } in6;
     struct sockaddr ifru_addr;      /* SIOCGIFADDR */
     struct sockaddr ifru_dstaddr;   /* SIOCGIFDSTADDR */
     struct sockaddr ifru_netmask;   /* SIOCGIFNETMASK */
@@ -38,11 +29,5 @@ struct ifreq {
 #define ifr_flags     ifr_ifru.ifru_flags     /* flags */
 #define ifr_ifindex   ifr_ifru.ifru_ivalue
 
-#define ifr6_addr      ifr_ifru.in6.ifr6_addr      /* IP6 Addr */
-#define ifr6_scope     ifr_ifru.in6.ifr6_scope     /* IP6 Addr scope */
-#define ifr6_prefixlen ifr_ifru.in6.ifr6_prefixlen /* IP6 Prefix length */
-#define ifr6_ifindex   ifr_ifru.in6.ifr6_ifindex   /* IP6 If index */
-#define ifr6_flags     ifr_ifru.in6.ifr6_flags     /* IP6 If flags */
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_SOCK_STRUCT_IFREQ_H_ */
diff --git a/libc/sock/struct/pollfd.internal.h b/libc/sock/struct/pollfd.internal.h
index 69f58d30d..70b452258 100644
--- a/libc/sock/struct/pollfd.internal.h
+++ b/libc/sock/struct/pollfd.internal.h
@@ -11,11 +11,10 @@ int32_t __sys_poll(struct pollfd *, uint64_t, signed);
 int sys_ppoll(struct pollfd *, size_t, const struct timespec *,
               const sigset_t *, size_t);
 int sys_poll_metal(struct pollfd *, size_t, unsigned);
-int sys_poll_nt(struct pollfd *, uint64_t, const struct timespec *,
-                const sigset_t *);
+int sys_poll_nt(struct pollfd *, uint64_t, uint32_t *, const sigset_t *);
 
-const char *_DescribePollFds(char[300], ssize_t, struct pollfd *, size_t);
-#define DescribePollFds(x, y, z) _DescribePollFds(alloca(300), x, y, z)
+const char *DescribePollFds(char[300], ssize_t, struct pollfd *, size_t);
+#define DescribePollFds(x, y, z) DescribePollFds(alloca(300), x, y, z)
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_SOCK_STRUCT_POLLFD_INTERNAL_H_ */
diff --git a/libc/sock/struct/sockaddr.internal.h b/libc/sock/struct/sockaddr.internal.h
index 724478d2c..59d996d4a 100644
--- a/libc/sock/struct/sockaddr.internal.h
+++ b/libc/sock/struct/sockaddr.internal.h
@@ -40,8 +40,8 @@ union sockaddr_storage_linux {
   struct sockaddr_un sun;
 };
 
-const char *_DescribeSockaddr(char[128], const struct sockaddr *, size_t);
-#define DescribeSockaddr(sa, sz) _DescribeSockaddr(alloca(128), sa, sz)
+const char *DescribeSockaddr(char[128], const struct sockaddr *, size_t);
+#define DescribeSockaddr(sa, sz) DescribeSockaddr(alloca(128), sa, sz)
 
 void __convert_bsd_to_sockaddr(struct sockaddr_storage *);
 void __convert_sockaddr_to_bsd(struct sockaddr_storage *);
diff --git a/libc/sock/sys_sendfile_freebsd.S b/libc/sock/sys_sendfile_freebsd.S
index a8443b42b..6e7a97334 100644
--- a/libc/sock/sys_sendfile_freebsd.S
+++ b/libc/sock/sys_sendfile_freebsd.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 sys_sendfile_freebsd:
 	jmp	sys_sendfile
diff --git a/libc/sock/sys_sendfile_xnu.S b/libc/sock/sys_sendfile_xnu.S
index ca04c39a7..99d28b1e4 100644
--- a/libc/sock/sys_sendfile_xnu.S
+++ b/libc/sock/sys_sendfile_xnu.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 sys_sendfile_xnu:
 	jmp	sys_sendfile
diff --git a/libc/sock/syscall_fd.internal.h b/libc/sock/syscall_fd.internal.h
index 51f7f8082..1c51d00ff 100644
--- a/libc/sock/syscall_fd.internal.h
+++ b/libc/sock/syscall_fd.internal.h
@@ -1,15 +1,15 @@
 #ifndef COSMOPOLITAN_LIBC_SOCK_SYSCALL_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_SOCK_SYSCALL_INTERNAL_H_
-#include "libc/calls/struct/iovec.h"
 #include "libc/intrin/fds.h"
+#include "libc/calls/struct/iovec.h"
 #include "libc/nt/struct/overlapped.h"
 #include "libc/sock/struct/sockaddr.h"
 COSMOPOLITAN_C_START_
 
+void sys_connect_nt_cleanup(struct Fd *, bool);
 int sys_accept_nt(struct Fd *, struct sockaddr_storage *, int);
 int sys_bind_nt(struct Fd *, const void *, uint32_t);
 int sys_closesocket_nt(struct Fd *);
-int sys_ioctlsocket_nt(struct Fd *);
 int sys_connect_nt(struct Fd *, const void *, uint32_t);
 int sys_getpeername_nt(struct Fd *, void *, uint32_t *);
 int sys_getsockname_nt(struct Fd *, void *, uint32_t *);
diff --git a/libc/x/syslog.c b/libc/sock/syslog.c
similarity index 99%
rename from libc/x/syslog.c
rename to libc/sock/syslog.c
index 9d278ed84..8c5840016 100644
--- a/libc/x/syslog.c
+++ b/libc/sock/syslog.c
@@ -25,7 +25,7 @@
 #include "libc/errno.h"
 #include "libc/intrin/safemacros.h"
 #include "libc/log/internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/events.h"
 #include "libc/nt/runtime.h"
 #include "libc/sock/sock.h"
diff --git a/libc/sock/winsockblock.c b/libc/sock/winsockblock.c
index 32a7e0c82..e0d0b2848 100644
--- a/libc/sock/winsockblock.c
+++ b/libc/sock/winsockblock.c
@@ -16,26 +16,17 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigset.h"
-#include "libc/calls/struct/timespec.internal.h"
-#include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/errno.h"
-#include "libc/intrin/atomic.h"
 #include "libc/intrin/weaken.h"
 #include "libc/nt/enum/wait.h"
 #include "libc/nt/errors.h"
-#include "libc/nt/events.h"
-#include "libc/nt/runtime.h"
 #include "libc/nt/struct/overlapped.h"
-#include "libc/nt/struct/pollfd.h"
-#include "libc/nt/synchronization.h"
 #include "libc/nt/thread.h"
 #include "libc/nt/winsock.h"
 #include "libc/sock/internal.h"
-#include "libc/sysv/consts/poll.h"
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/posixthread.internal.h"
@@ -48,146 +39,69 @@ __winsock_block(int64_t handle, uint32_t flags, bool nonblock,
                                   uint32_t *flags, void *arg),
                 void *arg) {
 
-  // convert relative to absolute timeout
-  struct timespec deadline;
-  if (srwtimeout) {
-    deadline = timespec_add(sys_clock_gettime_monotonic_nt(),
-                            timespec_frommillis(srwtimeout));
-  } else {
-    deadline = timespec_max;
+RestartOperation:
+  int rc, sig, reason = 0;
+  uint32_t status, exchanged;
+  if (_check_cancel() == -1)
+    return -1;  // ECANCELED
+  if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
+    goto HandleInterrupt;
   }
 
-  for (;;) {
-    int got_sig = 0;
-    bool got_cancel = false;
-    bool got_eagain = false;
-    uint32_t other_error = 0;
-
-    // create event handle for overlapped i/o
-    intptr_t event;
-    if (!(event = WSACreateEvent()))
-      return __winsockerr();
-
-    struct NtOverlapped overlap = {.hEvent = event};
-    bool32 ok = !StartSocketOp(handle, &overlap, &flags, arg);
-    if (!ok && WSAGetLastError() == kNtErrorIoPending) {
-      if (nonblock) {
-        // send() and sendto() shall not pass O_NONBLOCK along to here
-        // because winsock has a bug that causes CancelIoEx() to cause
-        // WSAGetOverlappedResult() to report errors when it succeeded
-        CancelIoEx(handle, &overlap);
-        got_eagain = true;
-      } else {
-        // atomic block on i/o completion, signal, or cancel
-        // it's not safe to acknowledge cancelation from here
-        // it's not safe to call any signal handlers from here
-        intptr_t sev;
-        if ((sev = CreateEvent(0, 0, 0, 0))) {
-          // installing semaphore before sig get makes wait atomic
-          struct PosixThread *pt = _pthread_self();
-          pt->pt_event = sev;
-          pt->pt_blkmask = waitmask;
-          atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_EVENT,
-                                memory_order_release);
-          if (_is_canceled()) {
-            got_cancel = true;
-            CancelIoEx(handle, &overlap);
-          } else if (_weaken(__sig_get) &&
-                     (got_sig = _weaken(__sig_get)(waitmask))) {
-            CancelIoEx(handle, &overlap);
-          } else {
-            struct timespec now = sys_clock_gettime_monotonic_nt();
-            struct timespec remain = timespec_subz(deadline, now);
-            int64_t millis = timespec_tomillis(remain);
-            uint32_t waitms = MIN(millis, 0xffffffffu);
-            intptr_t hands[] = {event, sev};
-            uint32_t wi = WSAWaitForMultipleEvents(2, hands, 0, waitms, 0);
-            if (wi == 1) {  // semaphore was signaled by signal enqueue
-              CancelIoEx(handle, &overlap);
-              if (_weaken(__sig_get))
-                got_sig = _weaken(__sig_get)(waitmask);
-            } else if (wi == kNtWaitTimeout) {
-              CancelIoEx(handle, &overlap);
-              got_eagain = true;
-            } else if (wi == -1u) {
-              other_error = WSAGetLastError();
-              CancelIoEx(handle, &overlap);
-            }
-          }
-          atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
-          CloseHandle(sev);
+  struct NtOverlapped overlap = {.hEvent = WSACreateEvent()};
+  rc = StartSocketOp(handle, &overlap, &flags, arg);
+  if (rc && WSAGetLastError() == kNtErrorIoPending) {
+    if (nonblock) {
+      CancelIoEx(handle, &overlap);
+      reason = EAGAIN;
+    } else {
+      struct PosixThread *pt;
+      pt = _pthread_self();
+      pt->pt_blkmask = waitmask;
+      pt->pt_iohandle = handle;
+      pt->pt_ioverlap = &overlap;
+      atomic_store_explicit(&pt->pt_blocker, PT_BLOCKER_IO,
+                            memory_order_release);
+      status = WSAWaitForMultipleEvents(1, &overlap.hEvent, 0,
+                                        srwtimeout ? srwtimeout : -1u, 0);
+      atomic_store_explicit(&pt->pt_blocker, 0, memory_order_release);
+      if (status) {
+        if (status == kNtWaitTimeout) {
+          reason = EAGAIN;  // SO_RCVTIMEO or SO_SNDTIMEO elapsed
         } else {
-          other_error = GetLastError();
-          CancelIoEx(handle, &overlap);
+          reason = WSAGetLastError();  // ENETDOWN or ENOBUFS
         }
+        CancelIoEx(handle, &overlap);
       }
-      ok = true;
     }
-    uint32_t exchanged = 0;
-    if (ok)
-      ok = WSAGetOverlappedResult(handle, &overlap, &exchanged, true, &flags);
-    uint32_t io_error = WSAGetLastError();
-    WSACloseEvent(event);
+    rc = 0;
+  }
+  if (!rc) {
+    rc = WSAGetOverlappedResult(handle, &overlap, &exchanged, true, &flags)
+             ? 0
+             : -1;
+  }
+  WSACloseEvent(overlap.hEvent);
 
-    // check if i/o completed
-    // this could forseeably happen even if CancelIoEx was called
-    if (ok) {
-      if (got_sig)  // swallow dequeued signal
-        _weaken(__sig_relay)(got_sig, SI_KERNEL, waitmask);
-      return exchanged;
-    }
-
-    // check if i/o failed
-    if (io_error != kNtErrorOperationAborted) {
-      if (got_sig)  // swallow dequeued signal
-        _weaken(__sig_relay)(got_sig, SI_KERNEL, waitmask);
-      errno = __dos2errno(io_error);
+  if (!rc) {
+    return exchanged;
+  }
+  if (WSAGetLastError() == kNtErrorOperationAborted) {
+    if (reason) {
+      errno = reason;
       return -1;
     }
-
-    // it's now reasonable to report semaphore creation error
-    if (other_error) {
-      unassert(!got_sig);
-      errno = __dos2errno(other_error);
-      return -1;
-    }
-
-    // check for non-block cancel or timeout
-    if (got_eagain && !got_sig && !got_cancel)
-      return eagain();
-
-    // check for thread cancelation and acknowledge
-    if (_check_cancel() == -1)
-      return -1;
-
-    // if signal module has been linked, then
-    if (_weaken(__sig_get)) {
-
-      // gobble up all unmasked pending signals
-      // it's now safe to recurse into signal handlers
-      int handler_was_called = 0;
-      do {
-        if (got_sig)
-          handler_was_called |=
-              _weaken(__sig_relay)(got_sig, SI_KERNEL, waitmask);
-      } while ((got_sig = _weaken(__sig_get)(waitmask)));
-
-      // check if SIGTHR handler was called
+    if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
+    HandleInterrupt:
+      int handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
       if (_check_cancel() == -1)
         return -1;
-
-      // check if signal handler without SA_RESTART was called
-      if (handler_was_called & SIG_HANDLED_NO_RESTART)
-        return eintr();
-
-      // emulates linux behavior of having timeouts @norestart
-      if (handler_was_called & SIG_HANDLED_SA_RESTART)
-        if (srwtimeout)
-          return eintr();
+      if (handler_was_called != 1)
+        goto RestartOperation;
     }
-
-    // otherwise try the i/o operation again
+    return eintr();
   }
+  return __winsockerr();
 }
 
 #endif /* __x86_64__ */
diff --git a/libc/stdalign.h b/libc/stdalign.internal.h
similarity index 52%
rename from libc/stdalign.h
rename to libc/stdalign.internal.h
index 293b33799..9b4d39de4 100644
--- a/libc/stdalign.h
+++ b/libc/stdalign.internal.h
@@ -1,5 +1,5 @@
-#ifndef COSMOPOLITAN_LIBC_STDALIGN_H_
-#define COSMOPOLITAN_LIBC_STDALIGN_H_
+#ifndef COSMOPOLITAN_LIBC_STDALIGN_INTERNAL_H_
+#define COSMOPOLITAN_LIBC_STDALIGN_INTERNAL_H_
 
 #ifndef __cplusplus
 #define alignas _Alignas
@@ -9,4 +9,4 @@
 #define __alignas_is_defined 1
 #define __alignof_is_defined 1
 
-#endif /* COSMOPOLITAN_LIBC_STDALIGN_H_ */
+#endif /* COSMOPOLITAN_LIBC_STDALIGN_INTERNAL_H_ */
diff --git a/libc/stdckdint.h b/libc/stdckdint.h
index f7117b502..2f9afb785 100644
--- a/libc/stdckdint.h
+++ b/libc/stdckdint.h
@@ -38,13 +38,14 @@
  * Instead, you'll get a pretty good pure C11 and C++11 implementation.
  *
  * @see https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3096.pdf
- * @see https://github.com/jart/jtckdint
- * @version 1.0 (2024-12-07)
+ * @version 0.1 (2023-07-22)
  */
 
 #define __STDC_VERSION_STDCKDINT_H__ 202311L
 
-#if (!defined(__STRICT_ANSI__) && defined(__SIZEOF_INT128__))
+#if ((defined(__llvm__) ||                                              \
+      (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 406)) && \
+     !defined(__STRICT_ANSI__))
 #define __ckd_have_int128
 #define __ckd_intmax __int128
 #elif ((defined(__cplusplus) && __cplusplus >= 201103L) ||              \
@@ -57,22 +58,19 @@
 typedef signed __ckd_intmax __ckd_intmax_t;
 typedef unsigned __ckd_intmax __ckd_uintmax_t;
 
-#if (!defined(__STRICT_ANSI__) &&                                       \
-     ((defined(__GNUC__) && __GNUC__ >= 5 && !defined(__ICC)) ||        \
-      (__has_builtin(__builtin_add_overflow) &&                         \
-       __has_builtin(__builtin_sub_overflow) &&                         \
+#if (!defined(__STRICT_ANSI__) &&                       \
+     ((defined(__GNUC__) && __GNUC__ >= 5 &&            \
+       !defined(__chibicc__) && !defined(__ICC)) ||     \
+      (__has_builtin(__builtin_add_overflow) &&         \
+       __has_builtin(__builtin_sub_overflow) &&         \
        __has_builtin(__builtin_mul_overflow))))
 #define ckd_add(res, x, y) __builtin_add_overflow((x), (y), (res))
 #define ckd_sub(res, x, y) __builtin_sub_overflow((x), (y), (res))
 #define ckd_mul(res, x, y) __builtin_mul_overflow((x), (y), (res))
 
-#elif (defined(__cplusplus) &&                          \
-       (__cplusplus >= 201103L ||                       \
-        (defined(_MSC_VER) && __cplusplus >= 199711L && \
-         __ckd_has_include(<type_traits>) &&            \
-         __ckd_has_include(<limits>))))
-#include <type_traits>
-#include <limits>
+#elif defined(__cplusplus) && __cplusplus >= 201103L
+#include "third_party/libcxx/type_traits"
+#include "third_party/libcxx/limits"
 
 template <typename __T, typename __U, typename __V>
 inline bool ckd_add(__T *__res, __U __a, __V __b) {
@@ -160,6 +158,16 @@ inline bool ckd_sub(__T *__res, __U __a, __V __b) {
   __ckd_uintmax_t __y = __b;
   __ckd_uintmax_t __z = __x - __y;
   *__res = __z;
+  if (sizeof(__z) > sizeof(__U) && sizeof(__z) > sizeof(__V)) {
+    if (sizeof(__z) > sizeof(__T) || std::is_signed<__T>::value) {
+      return static_cast<__ckd_intmax_t>(__z) != static_cast<__T>(__z);
+    } else if (!std::is_same<__T, __ckd_uintmax_t>::value) {
+      return (__z != static_cast<__T>(__z) ||
+              ((std::is_signed<__U>::value ||
+                std::is_signed<__V>::value) &&
+               static_cast<__ckd_intmax_t>(__z) < 0));
+    }
+  }
   bool __truncated = false;
   if (sizeof(__T) < sizeof(__ckd_intmax_t)) {
     __truncated = __z != static_cast<__ckd_uintmax_t>(static_cast<__T>(__z));
@@ -258,8 +266,8 @@ inline bool ckd_mul(__T *__res, __U __a, __V __b) {
     case 3: { // u = s * s
       int __o = false;
       if (static_cast<__ckd_intmax_t>(__x & __y) < 0) {
-        __x = 0 - __x;
-        __y = 0 - __y;
+        __x = -__x;
+        __y = -__y;
       } else if (static_cast<__ckd_intmax_t>(__x ^ __y) < 0) {
         __o = __x && __y;
       }
@@ -278,12 +286,12 @@ inline bool ckd_mul(__T *__res, __U __a, __V __b) {
                __z != static_cast<__ckd_uintmax_t>(*__res)));
     }
     case 5: {  // s = u * s
-      __ckd_uintmax_t __t = 0 - __y;
+      __ckd_uintmax_t __t = -__y;
       __t = static_cast<__ckd_intmax_t>(__t) < 0 ? __y : __t;
       __ckd_uintmax_t __p = __t * __x;
       int __o = __t && __p / __t != __x;
       int __n = static_cast<__ckd_intmax_t>(__y) < 0;
-      __ckd_uintmax_t __z = __n ? 0 - __p : __p;
+      __ckd_uintmax_t __z = __n ? -__p : __p;
       *__res = __z;
       __ckd_uintmax_t __m = std::numeric_limits<__ckd_intmax_t>::max();
       return (__o | (__p > __m + __n) |
@@ -291,12 +299,12 @@ inline bool ckd_mul(__T *__res, __U __a, __V __b) {
                __z != static_cast<__ckd_uintmax_t>(*__res)));
     }
     case 6: {  // s = s * u
-      __ckd_uintmax_t __t = 0 - __x;
+      __ckd_uintmax_t __t = -__x;
       __t = static_cast<__ckd_intmax_t>(__t) < 0 ? __x : __t;
       __ckd_uintmax_t __p = __t * __y;
       int __o = __t && __p / __t != __y;
       int __n = static_cast<__ckd_intmax_t>(__x) < 0;
-      __ckd_uintmax_t __z = __n ? 0 - __p : __p;
+      __ckd_uintmax_t __z = __n ? -__p : __p;
       *__res = __z;
       __ckd_uintmax_t __m = std::numeric_limits<__ckd_intmax_t>::max();
       return (__o | (__p > __m + __n) |
@@ -532,8 +540,8 @@ __ckd_declare_sub(__ckd_sub_uint128, unsigned __int128)
       case 3: {  /* u = s * s */                                \
         int __o = 0;                                            \
         if ((__ckd_intmax_t)(__x & __y) < 0) {                  \
-          __x = 0 - __x;                                        \
-          __y = 0 - __y;                                        \
+          __x = -__x;                                           \
+          __y = -__y;                                           \
         } else if ((__ckd_intmax_t)(__x ^ __y) < 0) {           \
           __o = __x && __y;                                     \
         }                                                       \
@@ -552,12 +560,12 @@ __ckd_declare_sub(__ckd_sub_uint128, unsigned __int128)
                  __z != (__ckd_uintmax_t)*(T *)__res));         \
       }                                                         \
       case 5: {  /* s = u * s */                                \
-        __ckd_uintmax_t __t = 0 - __y;                          \
+        __ckd_uintmax_t __t = -__y;                             \
         __t = (__ckd_intmax_t)(__t) < 0 ? __y : __t;            \
         __ckd_uintmax_t __p = __t * __x;                        \
         int __o = __t && __p / __t != __x;                      \
         int __n = (__ckd_intmax_t)__y < 0;                      \
-        __ckd_uintmax_t __z = __n ? 0 - __p : __p;              \
+        __ckd_uintmax_t __z = __n ? -__p : __p;                 \
         *(T *)__res = __z;                                      \
         __ckd_uintmax_t __m = __ckd_sign(__ckd_uintmax_t) - 1;  \
         return (__o | (__p > __m + __n) |                       \
@@ -565,12 +573,12 @@ __ckd_declare_sub(__ckd_sub_uint128, unsigned __int128)
                  __z != (__ckd_uintmax_t)*(T *)__res));         \
       }                                                         \
       case 6: {  /* s = s * u */                                \
-        __ckd_uintmax_t __t = 0 - __x;                          \
+        __ckd_uintmax_t __t = -__x;                             \
         __t = (__ckd_intmax_t)(__t) < 0 ? __x : __t;            \
         __ckd_uintmax_t __p = __t * __y;                        \
         int __o = __t && __p / __t != __y;                      \
         int __n = (__ckd_intmax_t)__x < 0;                      \
-        __ckd_uintmax_t __z = __n ? 0 - __p : __p;              \
+        __ckd_uintmax_t __z = __n ? -__p : __p;                 \
         *(T *)__res = __z;                                      \
         __ckd_uintmax_t __m = __ckd_sign(__ckd_uintmax_t) - 1;  \
         return (__o | (__p > __m + __n) |                       \
diff --git a/libc/stdio/BUILD.mk b/libc/stdio/BUILD.mk
index c4d60fc7f..069e5cf08 100644
--- a/libc/stdio/BUILD.mk
+++ b/libc/stdio/BUILD.mk
@@ -32,13 +32,12 @@ LIBC_STDIO_A_DIRECTDEPS =				\
 	LIBC_NEXGEN32E					\
 	LIBC_NT_ADVAPI32				\
 	LIBC_NT_KERNEL32				\
-	LIBC_PROC					\
 	LIBC_RUNTIME					\
+	LIBC_PROC					\
 	LIBC_STR					\
 	LIBC_SYSV					\
 	LIBC_SYSV_CALLS					\
-	THIRD_PARTY_DLMALLOC				\
-	THIRD_PARTY_GDTOA				\
+	THIRD_PARTY_GDTOA
 
 LIBC_STDIO_A_DEPS :=					\
 	$(call uniq,$(foreach x,$(LIBC_STDIO_A_DIRECTDEPS),$($(x))))
diff --git a/libc/stdio/alloc.c b/libc/stdio/alloc.c
index ace00fa2f..ce348098d 100644
--- a/libc/stdio/alloc.c
+++ b/libc/stdio/alloc.c
@@ -22,14 +22,20 @@
 
 FILE *__stdio_alloc(void) {
   FILE *f;
-  __stdio_lock();
   if ((f = calloc(1, sizeof(FILE)))) {
-    f->freethis = 1;
-    f->fd = -1;
-    f->lock = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
-    dll_init(&f->elem);
-    dll_make_last(&__stdio.files, &f->elem);
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    pthread_mutex_init(&f->lock, &attr);
+    pthread_mutexattr_destroy(&attr);
+    f->dynamic = 1;
   }
-  __stdio_unlock();
   return f;
 }
+
+void __stdio_free(FILE *f) {
+  pthread_mutex_destroy(&f->lock);
+  if (f->dynamic) {
+    free(f);
+  }
+}
diff --git a/libc/stdio/appendd.c b/libc/stdio/appendd.c
index 025a67055..409f5d550 100644
--- a/libc/stdio/appendd.c
+++ b/libc/stdio/appendd.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/append.h"
 #include "libc/str/str.h"
diff --git a/libc/stdio/appendr.c b/libc/stdio/appendr.c
index fbe47be03..d5554531a 100644
--- a/libc/stdio/appendr.c
+++ b/libc/stdio/appendr.c
@@ -19,7 +19,7 @@
 #include "libc/assert.h"
 #include "libc/dce.h"
 #include "libc/intrin/bsr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/append.h"
 #include "libc/str/str.h"
diff --git a/libc/stdio/appendw.c b/libc/stdio/appendw.c
index ffff19be9..9c90ff6d0 100644
--- a/libc/stdio/appendw.c
+++ b/libc/stdio/appendw.c
@@ -19,7 +19,7 @@
 #include "libc/assert.h"
 #include "libc/dce.h"
 #include "libc/intrin/bsr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/serialize.h"
 #include "libc/stdio/append.h"
diff --git a/libc/stdio/dirstream.c b/libc/stdio/dirstream.c
index f77f4e06c..215eb9093 100644
--- a/libc/stdio/dirstream.c
+++ b/libc/stdio/dirstream.c
@@ -29,7 +29,7 @@
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/critbit0.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/createfile.h"
@@ -49,10 +49,9 @@
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/s.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * @fileoverview Directory Streams for Linux+Mac+Windows+FreeBSD+OpenBSD.
@@ -135,11 +134,11 @@ struct dirent_netbsd {
 };
 
 static void lockdir(DIR *dir) {
-  _pthread_mutex_lock(&dir->lock);
+  pthread_mutex_lock(&dir->lock);
 }
 
 static void unlockdir(DIR *dir) {
-  _pthread_mutex_unlock(&dir->lock);
+  pthread_mutex_unlock(&dir->lock);
 }
 
 static textwindows dontinline int fdopendir_nt(DIR *res, int fd) {
diff --git a/libc/stdio/dumphexc.c b/libc/stdio/dumphexc.c
index 90bab43eb..3bf900cb6 100644
--- a/libc/stdio/dumphexc.c
+++ b/libc/stdio/dumphexc.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/append.h"
 #include "libc/stdio/hex.internal.h"
diff --git a/libc/stdio/ecvt.c b/libc/stdio/ecvt.c
index 612a66335..ebb72075c 100644
--- a/libc/stdio/ecvt.c
+++ b/libc/stdio/ecvt.c
@@ -22,7 +22,6 @@
 │ Materiel Command, USAF, under agreement number F39502-99-1-0512.             │
 │ SUCH DAMAGE.                                                                 │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
@@ -54,11 +53,8 @@ __cvt(double value, int ndigit, int *decpt, int *sign, int fmode, int pad)
 	char *p, *rve, c;
 	size_t siz;
 
-	// Note that we exclude the case of fmode here, since for fcvt having
-	// `ndigit == 0` just means we have to output 0 digits *after* the radix
-	// character
-	if (ndigit == 0 && !fmode) {
-		*sign = signbit(value);
+	if (ndigit == 0) {
+		*sign = value < 0.0;
 		*decpt = 0;
 		return ("");
 	}
@@ -75,12 +71,10 @@ __cvt(double value, int ndigit, int *decpt, int *sign, int fmode, int pad)
 	/* __dtoa() doesn't allocate space for 0 so we do it by hand */
 	if (value == 0.0) {
 		*decpt = 1 - fmode;	/* 1 for 'e', 0 for 'f' */
-		*sign = signbit(value);
+		*sign = 0;
 		if ((rve = s = malloc(siz)) == NULL)
 			return(NULL);
-		// handle fcvt(0, 0, ...) by returning ""
-		if (siz > 1)
-			*rve++ = '0';
+		*rve++ = '0';
 		*rve = '\0';
 	} else {
 		p = dtoa(value, fmode + 2, ndigit, decpt, sign, &rve);
diff --git a/libc/stdio/fclose.c b/libc/stdio/fclose.c
index 2fcf0f790..b02c8bf20 100644
--- a/libc/stdio/fclose.c
+++ b/libc/stdio/fclose.c
@@ -16,26 +16,47 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
+#include "libc/intrin/weaken.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
 #include "libc/stdio/internal.h"
+#include "libc/stdio/stdio.h"
 
 /**
  * Closes standard i/o stream and its underlying thing.
- * @return 0 on success, or EOF w/ errno
+ *
+ * @param f is the file object
+ * @return 0 on success or -1 on error, which can be a trick for
+ *     differentiating between EOF and real errors during previous
+ *     i/o calls, without needing to call ferror()
  */
 int fclose(FILE *f) {
-  int rc = 0;
-  if (f) {
-    flockfile(f);
-    rc |= fflush(f);
-    int fd = f->fd;
-    f->fd = -1;
-    f->state = EOF;
-    if (fd != -1)
-      rc |= close(fd);
-    funlockfile(f);
-    __stdio_unref(f);
+  int rc;
+  if (!f)
+    return 0;
+  __fflush_unregister(f);
+  fflush(f);
+  if (_weaken(free)) {
+    _weaken(free)(f->getln);
+    if (!f->nofree && f->buf != f->mem) {
+      _weaken(free)(f->buf);
+    }
   }
+  f->state = EOF;
+  if (f->noclose) {
+    f->fd = -1;
+  } else if (f->fd != -1 && close(f->fd) == -1) {
+    f->state = errno;
+  }
+  if (f->state == EOF) {
+    rc = 0;
+  } else {
+    errno = f->state;
+    rc = EOF;
+  }
+  __stdio_free(f);
   return rc;
 }
diff --git a/libc/stdio/fdopen.c b/libc/stdio/fdopen.c
index 5f7191d07..eb4437a0a 100644
--- a/libc/stdio/fdopen.c
+++ b/libc/stdio/fdopen.c
@@ -16,12 +16,14 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
 #include "libc/calls/struct/stat.h"
-#include "libc/mem/mem.h"
 #include "libc/stdio/internal.h"
+#include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/s.h"
-
-__static_yoink("fflush");
+#include "libc/sysv/errfuns.h"
+#include "libc/thread/thread.h"
 
 /**
  * Allocates stream object for already-opened file descriptor.
@@ -36,16 +38,16 @@ FILE *fdopen(int fd, const char *mode) {
   struct stat st;
   if (fstat(fd, &st))
     return 0;
-  if (!(f = __stdio_alloc()))
-    return 0;
-  f->bufmode = S_ISCHR(st.st_mode) ? _IONBF : _IOFBF;
-  f->oflags = fopenflags(mode);
-  f->size = BUFSIZ;
-  if (!(f->buf = malloc(f->size))) {
-    __stdio_unref(f);
-    return 0;
+  if ((f = __stdio_alloc())) {
+    f->fd = fd;
+    f->bufmode = S_ISREG(st.st_mode) ? _IOFBF : _IONBF;
+    f->iomode = fopenflags(mode);
+    f->buf = f->mem;
+    f->size = BUFSIZ;
+    if ((f->iomode & O_ACCMODE) != O_RDONLY) {
+      __fflush_register(f);
+    }
+    return f;
   }
-  f->freebuf = 1;
-  f->fd = fd;
-  return f;
+  return NULL;
 }
diff --git a/libc/stdio/fflush.c b/libc/stdio/fflush.c
index 4a408d313..4a9ef6c8e 100644
--- a/libc/stdio/fflush.c
+++ b/libc/stdio/fflush.c
@@ -16,38 +16,20 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cxxabi.h"
-#include "libc/stdio/internal.h"
+#include "libc/stdio/stdio.h"
 
 /**
  * Blocks until data from stream buffer is written out.
  *
  * @param f is the stream handle, or 0 for all streams
- * @return is 0 on success or EOF on error
+ * @return is 0 on success or -1 on error
  */
 int fflush(FILE *f) {
   int rc;
-  if (f) {
+  if (f)
     flockfile(f);
-    rc = fflush_unlocked(f);
+  rc = fflush_unlocked(f);
+  if (f)
     funlockfile(f);
-  } else {
-    __stdio_lock();
-    struct Dll *e, *e2;
-    for (rc = 0, e = dll_last(__stdio.files); e; e = e2) {
-      f = FILE_CONTAINER(e);
-      __stdio_ref(f);
-      __stdio_unlock();
-      rc |= fflush(FILE_CONTAINER(e));
-      __stdio_lock();
-      e2 = dll_prev(__stdio.files, e);
-      __stdio_unref_unlocked(f);
-    }
-    __stdio_unlock();
-  }
   return rc;
 }
-
-__attribute__((__constructor__(60))) static textstartup void fflush_init(void) {
-  __cxa_atexit((void *)fflush, 0, 0);
-}
diff --git a/libc/stdio/fflush.internal.h b/libc/stdio/fflush.internal.h
new file mode 100644
index 000000000..75e3f3fc2
--- /dev/null
+++ b/libc/stdio/fflush.internal.h
@@ -0,0 +1,25 @@
+#ifndef COSMOPOLITAN_LIBC_STDIO_FFLUSH_H_
+#define COSMOPOLITAN_LIBC_STDIO_FFLUSH_H_
+#include "libc/stdio/stdio.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/tls.h"
+COSMOPOLITAN_C_START_
+
+struct StdioFlushHandles {
+  size_t i, n;
+  FILE **p;
+};
+
+struct StdioFlush {
+  struct StdioFlushHandles handles;
+  FILE *handles_initmem[8];
+};
+
+extern struct StdioFlush __fflush;
+extern pthread_mutex_t __fflush_lock_obj;
+
+void __fflush_lock(void);
+void __fflush_unlock(void);
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_STDIO_FFLUSH_H_ */
diff --git a/libc/stdio/fflush_unlocked.c b/libc/stdio/fflush_unlocked.c
index 9532bf9d5..49099d7e7 100644
--- a/libc/stdio/fflush_unlocked.c
+++ b/libc/stdio/fflush_unlocked.c
@@ -16,46 +16,75 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/errno.h"
-#include "libc/intrin/weaken.h"
-#include "libc/mem/mem.h"
+#include "libc/cxxabi.h"
+#include "libc/intrin/pushpop.h"
+#include "libc/mem/arraylist.internal.h"
+#include "libc/stdio/fflush.internal.h"
 #include "libc/stdio/internal.h"
-#include "libc/sysv/consts/o.h"
 
 /**
  * Blocks until data from stream buffer is written out.
  *
- * @param f is the stream handle, which must not be null
- * @return is 0 on success or EOF on error
+ * @param f is the stream handle, or 0 for all streams
+ * @return is 0 on success or -1 on error
  */
 int fflush_unlocked(FILE *f) {
+  int rc = 0;
   size_t i;
-  if (f->getln) {
-    if (_weaken(free))
-      _weaken(free)(f->getln);
-    f->getln = 0;
-  }
-  if (f->fd != -1) {
-    if (f->beg && !f->end && (f->oflags & O_ACCMODE) != O_RDONLY) {
-      ssize_t rc;
-      for (i = 0; i < f->beg; i += rc) {
-        if ((rc = write(f->fd, f->buf + i, f->beg - i)) == -1) {
-          f->state = errno;
-          return EOF;
+  if (!f) {
+    __fflush_lock();
+    for (i = __fflush.handles.i; i; --i) {
+      if ((f = __fflush.handles.p[i - 1])) {
+        if (fflush(f) == -1) {
+          rc = -1;
         }
       }
-      f->beg = 0;
     }
-    if (f->beg < f->end && (f->oflags & O_ACCMODE) != O_WRONLY) {
-      if (lseek(f->fd, -(int)(f->end - f->beg), SEEK_CUR) == -1) {
-        f->state = errno;
-        return EOF;
-      }
-      f->end = f->beg;
+    __fflush_unlock();
+  } else if (f->fd != -1) {
+    if (__fflush_impl(f) == -1) {
+      rc = -1;
+    }
+  } else if (f->beg && f->beg < f->size) {
+    f->buf[f->beg] = 0;
+  }
+  return rc;
+}
+
+textstartup int __fflush_register(FILE *f) {
+  int rc;
+  size_t i;
+  struct StdioFlush *sf;
+  __fflush_lock();
+  sf = &__fflush;
+  if (!sf->handles.p) {
+    sf->handles.p = sf->handles_initmem;
+    pushmov(&sf->handles.n, ARRAYLEN(sf->handles_initmem));
+    __cxa_atexit((void *)fflush_unlocked, 0, 0);
+  }
+  for (i = sf->handles.i; i; --i) {
+    if (!sf->handles.p[i - 1]) {
+      sf->handles.p[i - 1] = f;
+      __fflush_unlock();
+      return 0;
     }
   }
-  if (f->buf && f->beg && f->beg < f->size)
-    f->buf[f->beg] = 0;
-  return 0;
+  rc = append(&sf->handles, &f);
+  __fflush_unlock();
+  return rc;
+}
+
+void __fflush_unregister(FILE *f) {
+  size_t i;
+  struct StdioFlush *sf;
+  __fflush_lock();
+  sf = &__fflush;
+  sf = pushpop(sf);
+  for (i = sf->handles.i; i; --i) {
+    if (sf->handles.p[i - 1] == f) {
+      pushmov(&sf->handles.p[i - 1], 0);
+      break;
+    }
+  }
+  __fflush_unlock();
 }
diff --git a/tool/viz/malloc_scalability.c b/libc/stdio/fflushimpl.c
similarity index 66%
rename from tool/viz/malloc_scalability.c
rename to libc/stdio/fflushimpl.c
index 6f9c71fbf..41e047f01 100644
--- a/tool/viz/malloc_scalability.c
+++ b/libc/stdio/fflushimpl.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,41 +16,41 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timespec.h"
+#include "libc/calls/calls.h"
+#include "libc/errno.h"
+#include "libc/intrin/weaken.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
+#include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
-#include "libc/thread/thread.h"
+#include "libc/sysv/consts/o.h"
 
-#define ALLOCATIONS 1000
-
-void *worker(void *arg) {
-  void **ptrs = malloc(ALLOCATIONS * sizeof(void *));
-  for (int i = 0; i < ALLOCATIONS; ++i)
-    ptrs[i] = malloc(1);
-  for (int i = 0; i < ALLOCATIONS; ++i)
-    free(ptrs[i]);
-  free(ptrs);
+int __fflush_impl(FILE *f) {
+  size_t i;
+  ssize_t rc;
+  if (f->getln) {
+    if (_weaken(free)) {
+      _weaken(free)(f->getln);
+    }
+    f->getln = 0;
+  }
+  if (f->fd != -1) {
+    if (f->beg && !f->end && (f->iomode & O_ACCMODE) != O_RDONLY) {
+      for (i = 0; i < f->beg; i += rc) {
+        if ((rc = write(f->fd, f->buf + i, f->beg - i)) == -1) {
+          f->state = errno;
+          return -1;
+        }
+      }
+      f->beg = 0;
+    }
+    if (f->beg < f->end && (f->iomode & O_ACCMODE) != O_WRONLY) {
+      if (lseek(f->fd, -(int)(f->end - f->beg), SEEK_CUR) == -1) {
+        f->state = errno;
+        return -1;
+      }
+      f->end = f->beg;
+    }
+  }
   return 0;
 }
-
-void test(int n) {
-  struct timespec start = timespec_mono();
-  pthread_t *th = malloc(sizeof(pthread_t) * n);
-  for (int i = 0; i < n; ++i)
-    pthread_create(th + i, 0, worker, 0);
-  for (int i = 0; i < n; ++i)
-    pthread_join(th[i], 0);
-  free(th);
-  struct timespec end = timespec_mono();
-  printf("%2d threads * %d allocs = %ld us\n", n, ALLOCATIONS,
-         timespec_tomicros(timespec_sub(end, start)));
-}
-
-int main(int argc, char *argv[]) {
-  int n = __get_cpu_count();
-  if (n < 8)
-    n = 8;
-  for (int i = 1; i <= n; ++i)
-    test(i);
-}
diff --git a/libc/stdio/fgets_unlocked.c b/libc/stdio/fgets_unlocked.c
index 0210dc829..9fb38e00c 100644
--- a/libc/stdio/fgets_unlocked.c
+++ b/libc/stdio/fgets_unlocked.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
diff --git a/libc/system/fleaks.c b/libc/stdio/fleaks.c
similarity index 97%
rename from libc/system/fleaks.c
rename to libc/stdio/fleaks.c
index e4f0bfd90..20a1d4a7b 100644
--- a/libc/system/fleaks.c
+++ b/libc/stdio/fleaks.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
 #include "libc/runtime/runtime.h"
@@ -32,8 +31,6 @@ void CheckForFileLeaks(void) {
   char *p = msg;
   char *pe = msg + 256;
   bool gotsome = false;
-  if (IsQemuUser())
-    usleep(10000);  // weird qemu mt flake
   for (int fd = 3; fd < MIN_CLANDESTINE_FD; ++fd) {
     if (fcntl(fd, F_GETFL) != -1) {
       if (!gotsome) {
diff --git a/libc/stdio/flockfile.c b/libc/stdio/flockfile.c
index 4b16a0778..2c381295f 100644
--- a/libc/stdio/flockfile.c
+++ b/libc/stdio/flockfile.c
@@ -16,15 +16,51 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
+#include "libc/stdio/fflush.internal.h"
 #include "libc/stdio/internal.h"
-#include "libc/thread/posixthread.internal.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
 #include "libc/thread/thread.h"
 
 /**
  * Acquires reentrant lock on stdio object, blocking if needed.
  */
 void flockfile(FILE *f) {
-  unassert(f != NULL);
-  _pthread_mutex_lock(&f->lock);
+  pthread_mutex_lock(&f->lock);
+}
+
+void(__fflush_lock)(void) {
+  pthread_mutex_lock(&__fflush_lock_obj);
+}
+
+void(__fflush_unlock)(void) {
+  pthread_mutex_unlock(&__fflush_lock_obj);
+}
+
+static void __stdio_fork_prepare(void) {
+  FILE *f;
+  __fflush_lock();
+  for (int i = 0; i < __fflush.handles.i; ++i)
+    if ((f = __fflush.handles.p[i]))
+      pthread_mutex_lock(&f->lock);
+}
+
+static void __stdio_fork_parent(void) {
+  FILE *f;
+  for (int i = __fflush.handles.i; i--;)
+    if ((f = __fflush.handles.p[i]))
+      pthread_mutex_unlock(&f->lock);
+  __fflush_unlock();
+}
+
+static void __stdio_fork_child(void) {
+  FILE *f;
+  for (int i = __fflush.handles.i; i--;)
+    if ((f = __fflush.handles.p[i]))
+      f->lock = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+  pthread_mutex_init(&__fflush_lock_obj, 0);
+}
+
+__attribute__((__constructor__(60))) static textstartup void stdioinit(void) {
+  pthread_atfork(__stdio_fork_prepare, __stdio_fork_parent, __stdio_fork_child);
 }
diff --git a/libc/stdio/flushlbf.c b/libc/stdio/flushlbf.c
index 860e093b1..53a7d1a80 100644
--- a/libc/stdio/flushlbf.c
+++ b/libc/stdio/flushlbf.c
@@ -17,6 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
+#include "libc/stdio/fflush.internal.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/stdio/stdio_ext.h"
@@ -25,18 +26,17 @@
  * Flushes all line-buffered streams.
  */
 void _flushlbf(void) {
-  __stdio_lock();
-  struct Dll *e, *e2;
-  for (e = dll_last(__stdio.files); e; e = e2) {
-    FILE *f = FILE_CONTAINER(e);
-    if (f->bufmode == _IOLBF) {
-      __stdio_ref(f);
-      __stdio_unlock();
-      fflush(FILE_CONTAINER(e));
-      __stdio_lock();
-      e2 = dll_prev(__stdio.files, e);
-      __stdio_unref_unlocked(f);
+  int i;
+  FILE *f;
+  __fflush_lock();
+  for (i = 0; i < __fflush.handles.i; ++i) {
+    if ((f = __fflush.handles.p[i])) {
+      flockfile(f);
+      if (f->bufmode == _IOLBF) {
+        fflush_unlocked(f);
+      }
+      funlockfile(f);
     }
   }
-  __stdio_unlock();
+  __fflush_unlock();
 }
diff --git a/libc/stdio/fmemopen.c b/libc/stdio/fmemopen.c
index 3834a7d1e..21945de76 100644
--- a/libc/stdio/fmemopen.c
+++ b/libc/stdio/fmemopen.c
@@ -37,31 +37,36 @@
 FILE *fmemopen(void *buf, size_t size, const char *mode) {
   FILE *f;
   char *p;
-  int oflags;
-  oflags = fopenflags(mode);
+  int iomode;
+  iomode = fopenflags(mode);
   if ((size && size > 0x7ffff000) ||  //
-      (!buf && (oflags & O_ACCMODE) != O_RDWR)) {
+      (!buf && (iomode & O_ACCMODE) != O_RDWR)) {
     einval();
     return NULL;
   }
-  if (!(f = __stdio_alloc()))
+  if (!(f = __stdio_alloc())) {
     return NULL;
-  if (!buf) {
+  }
+  if (buf) {
+    f->nofree = true;
+  } else {
     if (!size)
       size = BUFSIZ;
-    if (!(buf = malloc(size))) {
-      __stdio_unref(f);
+    // TODO(jart): Why do we need calloc()?
+    if (!_weaken(calloc) || !(buf = _weaken(calloc)(1, size))) {
+      __stdio_free(f);
       enomem();
       return NULL;
     }
-    f->freebuf = 1;
   }
+  f->fd = -1;
   f->buf = buf;
-  if (!(oflags & O_TRUNC))
+  if (!(iomode & O_TRUNC)) {
     f->end = size;
+  }
   f->size = size;
-  f->oflags = oflags;
-  if (oflags & O_APPEND) {
+  f->iomode = iomode;
+  if (iomode & O_APPEND) {
     if ((p = memchr(buf, '\0', size))) {
       f->beg = p - (char *)buf;
     } else {
diff --git a/libc/stdio/fmt.c b/libc/stdio/fmt.c
index 5f476e1a0..7ba1ac506 100644
--- a/libc/stdio/fmt.c
+++ b/libc/stdio/fmt.c
@@ -43,22 +43,20 @@
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/fmt/divmod10.internal.h"
-#include "libc/fmt/internal.h"
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/bsr.h"
 #include "libc/intrin/nomultics.h"
 #include "libc/intrin/safemacros.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/mem/reverse.internal.h"
-#include "libc/runtime/fenv.h"
 #include "libc/runtime/internal.h"
 #include "libc/serialize.h"
 #include "libc/str/str.h"
 #include "libc/str/strwidth.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/str/thompike.h"
 #include "libc/str/unicode.h"
 #include "libc/str/utf16.h"
@@ -77,9 +75,9 @@
 #define FLAGS_PRECISION 0x20
 #define FLAGS_ISSIGNED  0x40
 #define FLAGS_NOQUOTE   0x80
-#define FLAGS_REPR      0x100
-#define FLAGS_QUOTE     0x200
+#define FLAGS_QUOTE     FLAGS_SPACE
 #define FLAGS_GROUPING  FLAGS_NOQUOTE
+#define FLAGS_REPR      FLAGS_PLUS
 
 #define __FMT_PUT(C)              \
   do {                            \
@@ -91,7 +89,7 @@
 
 struct FPBits {
   uint32_t bits[4];
-  FPI fpi;
+  const FPI *fpi;
   int sign;
   int ex;  // exponent
   int kind;
@@ -564,13 +562,7 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
 
 static void __fmt_dfpbits(union U *u, struct FPBits *b) {
   int ex, i;
-  b->fpi = kFpiDbl;
-
-  // dtoa doesn't need this, unlike gdtoa, but we use it for __fmt_bround
-  i = FLT_ROUNDS;
-  if (i != -1)
-    b->fpi.rounding = i;
-
+  b->fpi = &kFpiDbl;
   b->sign = u->ui[1] & 0x80000000L;
   b->bits[1] = u->ui[1] & 0xfffff;
   b->bits[0] = u->ui[0];
@@ -589,7 +581,6 @@ static void __fmt_dfpbits(union U *u, struct FPBits *b) {
   } else {
     i = STRTOG_Zero;
   }
-  i |= signbit(u->d) ? STRTOG_Neg : 0;
   b->kind = i;
   b->ex = ex - (0x3ff + 52);
 }
@@ -615,15 +606,7 @@ static void __fmt_ldfpbits(union U *u, struct FPBits *b) {
 #else
 #error "unsupported architecture"
 #endif
-  b->fpi = kFpiLdbl;
-
-  // gdtoa doesn't check for FLT_ROUNDS but for fpi.rounding (which has the
-  // same valid values as FLT_ROUNDS), so handle this here
-  // (we also use this in __fmt_bround now)
-  i = FLT_ROUNDS;
-  if (i != -1)
-    b->fpi.rounding = i;
-
+  b->fpi = &kFpiLdbl;
   b->sign = sex & 0x8000;
   if ((ex = sex & 0x7fff) != 0) {
     if (ex != 0x7fff) {
@@ -631,7 +614,7 @@ static void __fmt_ldfpbits(union U *u, struct FPBits *b) {
 #if LDBL_MANT_DIG == 113
       b->bits[3] |= 1 << (112 - 32 * 3);  // set lowest exponent bit
 #endif
-    } else if (isnan(u->ld)) {
+    } else if (b->bits[0] | b->bits[1] | b->bits[2] | b->bits[3]) {
       i = STRTOG_NaN;
     } else {
       i = STRTOG_Infinite;
@@ -642,7 +625,6 @@ static void __fmt_ldfpbits(union U *u, struct FPBits *b) {
   } else {
     i = STRTOG_Zero;
   }
-  i |= signbit(u->ld) ? STRTOG_Neg : 0;
   b->kind = i;
   b->ex = ex - (0x3fff + (LDBL_MANT_DIG - 1));
 #endif
@@ -653,9 +635,9 @@ static int __fmt_fpiprec(struct FPBits *b) {
   const FPI *fpi;
   int i, j, k, m;
   uint32_t *bits;
-  if ((b->kind & STRTOG_Retmask) == STRTOG_Zero)
+  if (b->kind == STRTOG_Zero)
     return (b->ex = 0);
-  fpi = &b->fpi;
+  fpi = b->fpi;
   bits = b->bits;
   for (k = (fpi->nbits - 1) >> 2; k > 0; --k) {
     if ((bits[k >> 3] >> 4 * (k & 7)) & 0xf) {
@@ -694,47 +676,26 @@ static int __fmt_fpiprec(struct FPBits *b) {
 // prec1 = incoming precision (after ".")
 static int __fmt_bround(struct FPBits *b, int prec, int prec1) {
   uint32_t *bits, t;
-  int i, j, k, m, n;
-  bool inc = false;
+  int i, inc, j, k, m, n;
   m = prec1 - prec;
   bits = b->bits;
+  inc = 0;
   k = m - 1;
-
-  // The first two ifs here handle cases where rounding is simple, i.e. where we
-  // always know in which direction we must round because of the current
-  // rounding mode (note that if the correct value for inc is `false` then it
-  // doesn't need to be set as we have already done so above)
-  // They use the FLT_ROUNDS value, which are the same as gdtoa's FPI_Round_*
-  // enum values
-  if (b->fpi.rounding == FPI_Round_zero ||
-      (b->fpi.rounding == FPI_Round_up && b->sign) ||
-      (b->fpi.rounding == FPI_Round_down && !b->sign))
-    goto have_inc;
-  if ((b->fpi.rounding == FPI_Round_up && !b->sign) ||
-      (b->fpi.rounding == FPI_Round_down && b->sign))
-    goto inc_true;
-
-  // Rounding to nearest, ties to even
   if ((t = bits[k >> 3] >> (j = (k & 7) * 4)) & 8) {
     if (t & 7)
-      goto inc_true;
-    // ((1 << (j * 4)) - 1) will mask appropriately for the lower bits
-    if ((bits[k >> 3] & ((1 << (j * 4)) - 1)) != 0)
-      goto inc_true;
-    // If exactly halfway and all lower bits are zero (tie), round to even
-    if ((bits[k >> 3] >> (j + 1) * 4) & 1)
-      goto inc_true;
+      goto inc1;
+    if (j && bits[k >> 3] << (32 - j))
+      goto inc1;
     while (k >= 8) {
       k -= 8;
       if (bits[k >> 3]) {
-      inc_true:
-        inc = true;
-        goto have_inc;
+      inc1:
+        inc = 1;
+        goto haveinc;
       }
     }
   }
-
-have_inc:
+haveinc:
   b->ex += m * 4;
   i = m >> 3;
   k = prec1 >> 3;
@@ -758,12 +719,7 @@ have_inc:
       donothing;
     if (j > k) {
     onebit:
-      // We use 0x10 instead of 1 here to ensure that the digit before the
-      // decimal-point is non-0 (the C standard mandates this, i.e. considers
-      // that printing 0x0.1p+5 is illegal where 0x1.0p+1 is even though both
-      // evaluate to the same value because the first has 0 as the digit before
-      // the decimal-point character)
-      bits[0] = 0x10;
+      bits[0] = 1;
       b->ex += 4 * prec;
       return 1;
     }
@@ -864,7 +820,7 @@ static int __fmt_noop(const char *, void *, size_t) {
  * @asyncsignalsafe if floating point isn't used
  * @vforksafe if floating point isn't used
  */
-int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
+int __fmt(void *fn, void *arg, const char *format, va_list va) {
   long ld;
   void *p;
   double x;
@@ -1090,10 +1046,9 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
       case 'x':
         log2base = 4;
         goto FormatNumber;
-      case 'B':
       case 'b':
         log2base = 1;
-        alphabet = (d == 'b' ? "0123456789abcdefpb" : "0123456789ABCDEFPB");
+        alphabet = "0123456789abcdefpb";
         goto FormatNumber;
       case 'o':
         log2base = 3;
@@ -1117,9 +1072,6 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
         }
         break;
       }
-      case 'C':
-        signbit = 63;
-        // fallthrough
       case 'c':
         if ((charbuf[0] = va_arg(va, int))) {
           p = charbuf;
@@ -1169,25 +1121,7 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
         }
         break;
       case 'n':
-        switch (signbit) {
-          case 7:
-            *va_arg(va, int8_t *) = *wrote;
-            break;
-          case 15:
-            *va_arg(va, int16_t *) = *wrote;
-            break;
-          case 31:
-            *va_arg(va, int32_t *) = *wrote;
-            break;
-          case 63:
-            *va_arg(va, int64_t *) = *wrote;
-            break;
-          case 127:
-            *va_arg(va, int128_t *) = *wrote;
-            break;
-          default:
-            npassert(false);
-        }
+        __FMT_PUT('\n');
         break;
 
       case 'F':
@@ -1207,13 +1141,11 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
         } else {
           un.ld = va_arg(va, long double);
           __fmt_ldfpbits(&un, &fpb);
-          s = s0 = gdtoa(&fpb.fpi, fpb.ex, fpb.bits, &fpb.kind, 3, prec, &decpt,
-                         &se);
+          s = s0 =
+              gdtoa(fpb.fpi, fpb.ex, fpb.bits, &fpb.kind, 3, prec, &decpt, &se);
         }
-        if (s0 == NULL)
-          return -1;
-        if (decpt == 9999 || decpt == -32768) {
-        FormatDecpt9999Or32768:
+        if (decpt == 9999) {
+        Format9999:
           if (s0)
             freedtoa(s0);
           bzero(special, sizeof(special));
@@ -1225,10 +1157,7 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
           } else if (flags & FLAGS_SPACE) {
             *q++ = ' ';
           }
-          memcpy(q,
-                 kSpecialFloats[(fpb.kind & STRTOG_Retmask) == STRTOG_NaN]
-                               [d >= 'a'],
-                 4);
+          memcpy(q, kSpecialFloats[fpb.kind == STRTOG_NaN][d >= 'a'], 4);
           flags &= ~(FLAGS_PRECISION | FLAGS_PLUS | FLAGS_HASH | FLAGS_SPACE);
           prec = 0;
           rc = __fmt_stoa(out, arg, s, flags, prec, width, signbit, qchar);
@@ -1325,13 +1254,11 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
         } else {
           un.ld = va_arg(va, long double);
           __fmt_ldfpbits(&un, &fpb);
-          s = s0 = gdtoa(&fpb.fpi, fpb.ex, fpb.bits, &fpb.kind, prec ? 2 : 0,
+          s = s0 = gdtoa(fpb.fpi, fpb.ex, fpb.bits, &fpb.kind, prec ? 2 : 0,
                          prec, &decpt, &se);
         }
-        if (s0 == NULL)
-          return -1;
-        if (decpt == 9999 || decpt == -32768)
-          goto FormatDecpt9999Or32768;
+        if (decpt == 9999)
+          goto Format9999;
         c = se - s;
         prec1 = prec;
         if (!prec) {
@@ -1373,13 +1300,11 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
         } else {
           un.ld = va_arg(va, long double);
           __fmt_ldfpbits(&un, &fpb);
-          s = s0 = gdtoa(&fpb.fpi, fpb.ex, fpb.bits, &fpb.kind, prec ? 2 : 0,
-                         prec + 1, &decpt, &se);
+          s = s0 = gdtoa(fpb.fpi, fpb.ex, fpb.bits, &fpb.kind, prec ? 2 : 0,
+                         prec, &decpt, &se);
         }
-        if (s0 == NULL)
-          return -1;
-        if (decpt == 9999 || decpt == -32768)
-          goto FormatDecpt9999Or32768;
+        if (decpt == 9999)
+          goto Format9999;
       FormatExpo:
         if (fpb.sign /* && (x || sign) */)
           sign = '-';
@@ -1458,10 +1383,9 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
           un.d = va_arg(va, double);
           __fmt_dfpbits(&un, &fpb);
         }
-        if ((fpb.kind & STRTOG_Retmask) == STRTOG_Infinite ||
-            (fpb.kind & STRTOG_Retmask) == STRTOG_NaN) {
+        if (fpb.kind == STRTOG_Infinite || fpb.kind == STRTOG_NaN) {
           s0 = 0;
-          goto FormatDecpt9999Or32768;
+          goto Format9999;
         }
         prec1 = __fmt_fpiprec(&fpb);
         if ((flags & FLAGS_PRECISION) && prec < prec1) {
@@ -1477,7 +1401,7 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
             i1 /= 10;
           }
         }
-        if (fpb.sign /* && (sign || (fpb.kind & STRTOG_Retmask) != STRTOG_Zero) */) {
+        if (fpb.sign /* && (sign || fpb.kind != STRTOG_Zero) */) {
           sign = '-';
         }
         if ((width -= bw + 5) > 0) {
@@ -1504,13 +1428,9 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
         i1 = prec1 & 7;
         k = prec1 >> 3;
         __FMT_PUT(alphabet[(fpb.bits[k] >> 4 * i1) & 0xf]);
-
-        // decimal-point character appears if the precision isn't 0
-        // or the # flag is specified
-        if (prec1 > 0 || prec > 0 || (flags & FLAGS_HASH)) {
+        if (prec1 > 0 || prec > 0) {
           __FMT_PUT('.');
         }
-
         while (prec1 > 0) {
           if (--i1 < 0) {
             if (--k < 0)
diff --git a/libc/stdio/fopen.c b/libc/stdio/fopen.c
index d077f3c01..0294c5c2d 100644
--- a/libc/stdio/fopen.c
+++ b/libc/stdio/fopen.c
@@ -17,9 +17,36 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
+#include "libc/mem/mem.h"
+#include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/errfuns.h"
 
-__static_yoink("fflush");
+static const char *fixpathname(const char *pathname, int flags) {
+  if ((flags & O_ACCMODE) == O_RDONLY && strcmp(pathname, "-") == 0) {
+    return "/dev/stdin";
+  } else if ((flags & O_ACCMODE) == O_WRONLY && strcmp(pathname, "-") == 0) {
+    return "/dev/stdout";
+  } else {
+    return pathname;
+  }
+}
+
+static int openpathname(const char *pathname, int flags, bool *out_noclose) {
+  if ((flags & O_ACCMODE) == O_RDONLY && strcmp(pathname, "/dev/stdin") == 0) {
+    *out_noclose = true;
+    return fileno(stdin);
+  } else if ((flags & O_ACCMODE) == O_WRONLY &&
+             strcmp(pathname, "/dev/stdout") == 0) {
+    *out_noclose = true;
+    return fileno(stdout);
+  } else {
+    *out_noclose = false;
+    return open(pathname, flags, 0666);
+  }
+}
 
 /**
  * Opens file as stream object.
@@ -30,13 +57,21 @@ __static_yoink("fflush");
  * @note microsoft unilaterally deprecated this function lool
  */
 FILE *fopen(const char *pathname, const char *mode) {
-  int fd;
-  if ((fd = open(pathname, fopenflags(mode), 0666)) == -1)
-    return 0;
-  FILE *f;
-  if (!(f = fdopen(fd, mode))) {
-    close(fd);
+  FILE *f = 0;
+  bool noclose;
+  int fd, flags;
+  if (!pathname) {
+    efault();
     return 0;
   }
+  flags = fopenflags(mode);
+  pathname = fixpathname(pathname, flags);
+  if ((fd = openpathname(pathname, flags, &noclose)) != -1) {
+    if ((f = fdopen(fd, mode)) != NULL) {
+      f->noclose = noclose;
+    } else if (!noclose) {
+      close(fd);
+    }
+  }
   return f;
 }
diff --git a/libc/stdio/fread_unlocked.c b/libc/stdio/fread_unlocked.c
index 98179bf52..d76bd3216 100644
--- a/libc/stdio/fread_unlocked.c
+++ b/libc/stdio/fread_unlocked.c
@@ -20,7 +20,7 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/iovec.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdckdint.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
@@ -28,27 +28,21 @@
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/errfuns.h"
 
-static ssize_t readvall(FILE *f, struct iovec *iov, int iovlen, size_t need) {
+static ssize_t readvall(int fd, struct iovec *iov, int iovlen) {
   ssize_t rc;
   size_t got, toto;
-  for (toto = 0;;) {
-
-    // perform i/o
-    if ((rc = readv(f->fd, iov, iovlen)) == -1) {
-      f->state = errno;
-      if (toto)
+  toto = 0;
+  do {
+    if ((rc = readv(fd, iov, iovlen)) == -1) {
+      if (toto) {
+        if (errno == EINTR)
+          continue;
         return toto;
+      }
       return -1;
     }
     got = rc;
     toto += got;
-    if (!got) {
-      f->state = EOF;
-      return toto;
-    }
-
-    // roll forward iov
-    // skip over empty elements
     for (;;) {
       if (!iov->iov_len) {
         --iovlen;
@@ -62,14 +56,9 @@ static ssize_t readvall(FILE *f, struct iovec *iov, int iovlen, size_t need) {
         iov->iov_len -= got;
         break;
       }
-      if (!iovlen)
-        return toto;
     }
-
-    // don't trigger eof condition if we're rolling greed to fill buffer
-    if (toto >= need)
-      return toto;
-  }
+  } while (got && iovlen);
+  return toto;
 }
 
 /**
@@ -86,7 +75,7 @@ size_t fread_unlocked(void *buf, size_t stride, size_t count, FILE *f) {
   size_t n, m, got, need;
 
   // check state and parameters
-  if ((f->oflags & O_ACCMODE) == O_WRONLY) {
+  if ((f->iomode & O_ACCMODE) == O_WRONLY) {
     f->state = errno = EBADF;
     return 0;
   }
@@ -145,16 +134,27 @@ size_t fread_unlocked(void *buf, size_t stride, size_t count, FILE *f) {
     iov[1].iov_base = NULL;
     iov[1].iov_len = 0;
   }
-  rc = readvall(f, iov, 2, need);
-  if (rc == -1)
+  if (f->bufmode == _IONBF) {
+    rc = readv(f->fd, iov, 2);
+  } else {
+    rc = readvall(f->fd, iov, 2);
+  }
+  if (rc == -1) {
+    f->state = errno;
     return 0;
+  }
   got = rc;
 
   // handle partial fulfillment
   if (got < need) {
     got += m;
+    if (got % stride) {
+      f->state = eio();
+      return 0;
+    }
     f->beg = 0;
     f->end = 0;
+    f->state = EOF;
     return got / stride;
   }
 
diff --git a/libc/stdio/freadable.c b/libc/stdio/freadable.c
index 8a623623a..ff78a7a84 100644
--- a/libc/stdio/freadable.c
+++ b/libc/stdio/freadable.c
@@ -24,6 +24,6 @@
  * Returns nonzero if stream allows reading.
  */
 int __freadable(FILE *f) {
-  return (f->oflags & O_ACCMODE) == O_RDONLY ||
-         (f->oflags & O_ACCMODE) == O_RDWR;
+  return (f->iomode & O_ACCMODE) == O_RDONLY ||
+         (f->iomode & O_ACCMODE) == O_RDWR;
 }
diff --git a/libc/stdio/freading.c b/libc/stdio/freading.c
index 0f447bf5b..2e3782b3f 100644
--- a/libc/stdio/freading.c
+++ b/libc/stdio/freading.c
@@ -24,5 +24,5 @@
  * Returns nonzero if stream is read only.
  */
 int __freading(FILE *f) {
-  return (f->oflags & O_ACCMODE) == O_RDONLY;
+  return (f->iomode & O_ACCMODE) == O_RDONLY;
 }
diff --git a/libc/stdio/freopen.c b/libc/stdio/freopen.c
index c2db89e60..aabcfa340 100644
--- a/libc/stdio/freopen.c
+++ b/libc/stdio/freopen.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
@@ -38,8 +37,8 @@
  * @return stream object if successful, or NULL w/ errno
  */
 FILE *freopen(const char *pathname, const char *mode, FILE *stream) {
+  int fd;
   FILE *res;
-  int fd, fd2;
   unsigned flags;
   flags = fopenflags(mode);
   flockfile(stream);
@@ -47,30 +46,19 @@ FILE *freopen(const char *pathname, const char *mode, FILE *stream) {
   if (pathname) {
     /* open new stream, overwriting existing alloc */
     if ((fd = open(pathname, flags, 0666)) != -1) {
-      fd2 = dup3(fd, stream->fd, flags & O_CLOEXEC);
+      dup3(fd, stream->fd, flags & O_CLOEXEC);
       close(fd);
-      if (fd2 != -1) {
-        stream->fd = fd2;
-        stream->oflags = flags;
-        stream->beg = 0;
-        stream->end = 0;
-        res = stream;
-      } else {
-        res = NULL;
-      }
-    } else {
-      res = NULL;
-    }
-  } else {
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wanalyzer-fd-use-without-check"
-    if (fcntl(stream->fd, F_SETFD, !!(flags & O_CLOEXEC)) != -1 &&
-        fcntl(stream->fd, F_SETFL, flags & ~O_CLOEXEC) != -1) {
-#pragma GCC diagnostic pop
+      stream->iomode = flags;
+      stream->beg = 0;
+      stream->end = 0;
       res = stream;
     } else {
       res = NULL;
     }
+  } else {
+    fcntl(stream->fd, F_SETFD, !!(flags & O_CLOEXEC));
+    fcntl(stream->fd, F_SETFL, flags & ~O_CLOEXEC);
+    res = stream;
   }
   funlockfile(stream);
   return res;
diff --git a/libc/stdio/fseek_unlocked.c b/libc/stdio/fseek_unlocked.c
index b3fd2bbbe..6703a59ec 100644
--- a/libc/stdio/fseek_unlocked.c
+++ b/libc/stdio/fseek_unlocked.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/sysv/consts/o.h"
@@ -34,13 +34,13 @@
  * @param f is a non-null stream handle
  * @param offset is the byte delta
  * @param whence can be SEET_SET, SEEK_CUR, or SEEK_END
- * @returns 0 on success or -1 w/ errno
+ * @returns 0 on success or -1 on error
  */
 int fseek_unlocked(FILE *f, int64_t offset, int whence) {
   int res;
   int64_t pos;
   if (f->fd != -1) {
-    if (fflush_unlocked(f) == EOF)
+    if (__fflush_impl(f) == -1)
       return -1;
     if (whence == SEEK_CUR && f->beg < f->end) {
       offset -= f->end - f->beg;
diff --git a/libc/stdio/ftell.c b/libc/stdio/ftell.c
index 103a7d217..7330e35d6 100644
--- a/libc/stdio/ftell.c
+++ b/libc/stdio/ftell.c
@@ -26,7 +26,7 @@
 static inline int64_t ftell_unlocked(FILE *f) {
   int64_t pos;
   if (f->fd != -1) {
-    if (fflush_unlocked(f) == EOF)
+    if (__fflush_impl(f) == -1)
       return -1;
     if ((pos = lseek(f->fd, 0, SEEK_CUR)) != -1) {
       if (f->beg < f->end)
diff --git a/libc/stdio/funlockfile.c b/libc/stdio/funlockfile.c
index cfeb7f534..b47f8ab9d 100644
--- a/libc/stdio/funlockfile.c
+++ b/libc/stdio/funlockfile.c
@@ -18,12 +18,11 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
 /**
  * Releases lock on stdio object.
  */
 void funlockfile(FILE *f) {
-  _pthread_mutex_unlock(&f->lock);
+  pthread_mutex_unlock(&f->lock);
 }
diff --git a/libc/stdio/fwritable.c b/libc/stdio/fwritable.c
index ef1205bb8..df10a0aea 100644
--- a/libc/stdio/fwritable.c
+++ b/libc/stdio/fwritable.c
@@ -24,6 +24,6 @@
  * Returns nonzero if stream allows reading.
  */
 int __fwritable(FILE *f) {
-  return (f->oflags & O_ACCMODE) == O_WRONLY ||
-         (f->oflags & O_ACCMODE) == O_RDWR;
+  return (f->iomode & O_ACCMODE) == O_WRONLY ||
+         (f->iomode & O_ACCMODE) == O_RDWR;
 }
diff --git a/libc/stdio/fwrite_unlocked.c b/libc/stdio/fwrite_unlocked.c
index ed1cc7c39..ef29022fe 100644
--- a/libc/stdio/fwrite_unlocked.c
+++ b/libc/stdio/fwrite_unlocked.c
@@ -20,7 +20,7 @@
 #include "libc/calls/struct/iovec.internal.h"
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdckdint.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
@@ -74,9 +74,10 @@ size_t fwrite_unlocked(const void *data, size_t stride, size_t count, FILE *f) {
   size_t n, m;
   const char *p;
   struct iovec iov[2];
-  if (!stride || !count)
+  if (!stride || !count) {
     return 0;
-  if ((f->oflags & O_ACCMODE) == O_RDONLY) {
+  }
+  if ((f->iomode & O_ACCMODE) == O_RDONLY) {
     f->state = errno = EBADF;
     return 0;
   }
diff --git a/libc/stdio/fwriting.c b/libc/stdio/fwriting.c
index 8a755bcb2..8a4f012a1 100644
--- a/libc/stdio/fwriting.c
+++ b/libc/stdio/fwriting.c
@@ -24,5 +24,5 @@
  * Returns nonzero if stream is write only.
  */
 int __fwriting(FILE *f) {
-  return (f->oflags & O_ACCMODE) == O_WRONLY;
+  return (f->iomode & O_ACCMODE) == O_WRONLY;
 }
diff --git a/libc/intrin/sigvar.c b/libc/stdio/g_rando.c
similarity index 93%
rename from libc/intrin/sigvar.c
rename to libc/stdio/g_rando.c
index 21c1d2945..c702b2fd8 100644
--- a/libc/intrin/sigvar.c
+++ b/libc/stdio/g_rando.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,6 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/sig.internal.h"
+#include "libc/stdio/stdio.h"
 
-struct Signals __sig;
+uint64_t g_rando = 1;
diff --git a/libc/stdio/getdelim_unlocked.c b/libc/stdio/getdelim_unlocked.c
index 569040836..036017097 100644
--- a/libc/stdio/getdelim_unlocked.c
+++ b/libc/stdio/getdelim_unlocked.c
@@ -32,7 +32,7 @@ ssize_t getdelim_unlocked(char **s, size_t *n, int delim, FILE *f) {
   ssize_t rc;
   char *p, *s2;
   size_t i, m, n2;
-  if ((f->oflags & O_ACCMODE) == O_WRONLY) {
+  if ((f->iomode & O_ACCMODE) == O_WRONLY) {
     f->state = errno = EBADF;
     return -1;
   }
@@ -44,8 +44,9 @@ ssize_t getdelim_unlocked(char **s, size_t *n, int delim, FILE *f) {
     *n = 0;
   for (i = 0;; i += m) {
     m = f->end - f->beg;
-    if ((p = memchr(f->buf + f->beg, delim, m)))
+    if ((p = memchr(f->buf + f->beg, delim, m))) {
       m = p + 1 - (f->buf + f->beg);
+    }
     if (i + m + 1 > *n) {
       n2 = i + m + 1;
       s2 = realloc(*s, n2);
@@ -58,9 +59,10 @@ ssize_t getdelim_unlocked(char **s, size_t *n, int delim, FILE *f) {
       }
     }
     memcpy(*s + i, f->buf + f->beg, m);
-    (*s)[i + m] = 0;
-    if ((f->beg += m) == f->end)
+    (*s)[i + m] = '\0';
+    if ((f->beg += m) == f->end) {
       f->beg = f->end = 0;
+    }
     if (p) {
       return i + m;
     } else if (f->fd == -1) {
@@ -69,7 +71,7 @@ ssize_t getdelim_unlocked(char **s, size_t *n, int delim, FILE *f) {
       if (!rc)
         break;
       f->end = rc;
-    } else {
+    } else if (errno != EINTR) {
       f->state = errno;
       return -1;
     }
diff --git a/libc/stdio/getentropy.c b/libc/stdio/getentropy.c
index ad8d357a8..45b5b918e 100644
--- a/libc/stdio/getentropy.c
+++ b/libc/stdio/getentropy.c
@@ -17,17 +17,17 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/blockcancel.internal.h"
+#include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall_support-sysv.internal.h"
 #include "libc/dce.h"
-#include "libc/errno.h"
 #include "libc/intrin/strace.h"
-#include "libc/runtime/syslib.internal.h"
+#include "libc/stdio/rand.h"
 #include "libc/sysv/errfuns.h"
 
 int sys_getentropy(void *, size_t) asm("sys_getrandom");
 
 /**
- * Returns random seeding bytes, the POSIX way.
+ * Returns random seeding bytes, the XNU/OpenBSD way.
  *
  * @return 0 on success, or -1 w/ errno
  * @raise EFAULT if the `n` bytes at `p` aren't valid memory
@@ -40,26 +40,18 @@ int getentropy(void *p, size_t n) {
     rc = eio();
   } else if ((!p && n)) {
     rc = efault();
-  } else if (IsXnuSilicon()) {
-    rc = __syslib->__getentropy(p, n);
   } else if (IsXnu() || IsOpenbsd()) {
-    rc = sys_getentropy(p, n);
-  } else {
-    ssize_t got;
-    BLOCK_CANCELATION;
+    if (sys_getentropy(p, n))
+      notpossible;
     rc = 0;
-    for (size_t i = 0; i < n; i += got) {
-      got = __getrandom(p + i, n - i, 0);
-      if (got == -1) {
-        if (errno == EAGAIN || errno == EINTR) {
-          got = 0;
-        } else {
-          rc = -1;
-          break;
-        }
-      }
-    }
+  } else {
+    BLOCK_SIGNALS;
+    BLOCK_CANCELATION;
+    if (__getrandom(p, n, 0) != n)
+      notpossible;
     ALLOW_CANCELATION;
+    ALLOW_SIGNALS;
+    rc = 0;
   }
   STRACE("getentropy(%p, %'zu) → %'ld% m", p, n, rc);
   return rc;
diff --git a/third_party/musl/iconv.c b/libc/stdio/iconv.c
similarity index 96%
rename from third_party/musl/iconv.c
rename to libc/stdio/iconv.c
index cea641664..3b04b6b8d 100644
--- a/third_party/musl/iconv.c
+++ b/libc/stdio/iconv.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set noet ft=c ts=2 sts=2 sw=2 fenc=utf-8                             :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -29,12 +29,12 @@
 #include "libc/errno.h"
 #include "libc/mem/mem.h"
 #include "libc/str/locale.h"
-#include "libc/str/locale.internal.h"
 #include "libc/str/str.h"
 #include "libc/thread/tls.h"
 // clang-format off
 __static_yoink("musl_libc_notice");
 
+
 #define UTF_32BE    0300
 #define UTF_16LE    0301
 #define UTF_16BE    0302
@@ -77,10 +77,10 @@ static const unsigned char charmaps[] =
 "ucs4\0utf32\0\0\313"
 "ucs2\0\0\314"
 "eucjp\0\0\320"
-"shiftjis\0sjis\0cp932\0\0\321"
+"shiftjis\0sjis\0\0\321"
 "iso2022jp\0\0\322"
 "gb18030\0\0\330"
-"gbk\0cp936\0windows936\0\0\331"
+"gbk\0\0\331"
 "gb2312\0\0\332"
 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
@@ -88,7 +88,6 @@ static const unsigned char charmaps[] =
 ;
 
 #pragma GCC diagnostic ignored "-Wmissing-braces"
-#pragma GCC diagnostic ignored "-Wparentheses"
 
 /* Table of characters that appear in legacy 8-bit codepages,
  * limited to 1024 slots (10 bit indices). The first 256 entries
@@ -238,7 +237,7 @@ static unsigned legacy_map(const unsigned char *map, unsigned c)
 {
 	if (c < 4*map[-1]) return c;
 	unsigned x = c - 4*map[-1];
-	x = map[x*5/4]>>2*x%8 | map[x*5/4+1]<<8-2*x%8 & 1023;
+	x = (map[x*5/4]>>(2*x%8)) | ((map[x*5/4+1]<<(8-(2*x%8))) & 1023);
 	return x < 256 ? x : legacy_chars[x-256];
 }
 
@@ -280,11 +279,12 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 	int err;
 	unsigned char type = map[-1];
 	unsigned char totype = tomap[-1];
-	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
+	locale_t *ploc = (locale_t *)&__get_tls()->tib_locale;
+        locale_t loc = *ploc;
 
 	if (!in || !*in || !*inb) return 0;
 
-	*ploc = UTF8_LOCALE;
+	*ploc = 0;  // TODO(jart): UTF8_LOCALE?
 
 	for (; *inb; *in+=l, *inb-=l) {
 		c = *(unsigned char *)*in;
@@ -377,7 +377,6 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				c++;
 				d -= 159;
 			}
-			if (c>=84) goto ilseq;
 			c = jis0208[c][d];
 			if (!c) goto ilseq;
 			break;
@@ -441,10 +440,6 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			if (c < 128) break;
 			if (c < 0xa1) goto ilseq;
 		case GBK:
-			if (c == 128) {
-				c = 0x20ac;
-				break;
-			}
 		case GB18030:
 			if (c < 128) break;
 			c -= 0x81;
@@ -537,7 +532,7 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			if (c >= 93 || d >= 94) {
 				c += (0xa1-0x81);
 				d += 0xa1;
-				if (c >= 93 || c>=0xc6-0x81 && d>0x52)
+				if (c >= 93 || ((c>=0xc6-0x81) && d>0x52))
 					goto ilseq;
 				if (d-'A'<26) d = d-'A';
 				else if (d-'a'<26) d = d-'a'+26;
diff --git a/libc/stdio/internal.h b/libc/stdio/internal.h
index 2f4857a71..e5f848f80 100644
--- a/libc/stdio/internal.h
+++ b/libc/stdio/internal.h
@@ -1,49 +1,39 @@
 #ifndef COSMOPOLITAN_LIBC_STDIO_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_STDIO_INTERNAL_H_
-#include "libc/atomic.h"
-#include "libc/intrin/dll.h"
 #include "libc/stdio/stdio.h"
 #include "libc/thread/thread.h"
 
 #define PUSHBACK 12
 
-#define FILE_CONTAINER(e) DLL_CONTAINER(struct FILE, elem, e)
-
 COSMOPOLITAN_C_START_
 
 struct FILE {
-  char bufmode;  /* _IOFBF, _IOLBF, or _IONBF */
-  char freethis; /* fclose() should free(this) */
-  char freebuf;  /* fclose() should free(this->buf) */
-  char forking;  /* used by fork() implementation */
-  int oflags;    /* O_RDONLY, etc. */
-  int state;     /* 0=OK, -1=EOF, >0=errno */
-  int fd;        /* ≥0=fd, -1=closed|buffer */
-  int pid;
-  atomic_int refs;
-  unsigned size;
-  unsigned beg;
-  unsigned end;
+  uint8_t bufmode; /* _IOFBF, etc. (ignored if fd=-1) */
+  char noclose;    /* for fake dup() todo delete! */
+  char dynamic;    /* did malloc() create this object? */
+  uint32_t iomode; /* O_RDONLY, etc. (ignored if fd=-1) */
+  int32_t state;   /* 0=OK, -1=EOF, >0=errno */
+  int fd;          /* ≥0=fd, -1=closed|buffer */
+  uint32_t beg;
+  uint32_t end;
   char *buf;
-  pthread_mutex_t lock;
-  struct Dll elem;
+  uint32_t size;
+  uint32_t nofree;
+  int pid;
   char *getln;
+  pthread_mutex_t lock;
+  struct FILE *next;
+  char mem[BUFSIZ];
 };
 
-struct Stdio {
-  pthread_mutex_t lock; /* Subordinate to FILE::lock */
-  struct Dll *files;
-};
+extern uint64_t g_rando;
 
-extern struct Stdio __stdio;
-
-void __stdio_lock(void);
-void __stdio_unlock(void);
-void __stdio_ref(FILE *);
-void __stdio_unref(FILE *);
-void __stdio_unref_unlocked(FILE *);
+int __fflush_impl(FILE *);
+int __fflush_register(FILE *);
+void __fflush_unregister(FILE *);
 bool __stdio_isok(FILE *);
 FILE *__stdio_alloc(void);
+void __stdio_free(FILE *);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_STDIO_INTERNAL_H_ */
diff --git a/libc/stdio/kvappendf.c b/libc/stdio/kvappendf.c
index cdbf85b9c..5c171a1c4 100644
--- a/libc/stdio/kvappendf.c
+++ b/libc/stdio/kvappendf.c
@@ -19,7 +19,7 @@
 #include "libc/assert.h"
 #include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/append.h"
 
diff --git a/libc/stdio/mt19937.c b/libc/stdio/mt19937.c
index 778cf2f1f..72fd5fce7 100644
--- a/libc/stdio/mt19937.c
+++ b/libc/stdio/mt19937.c
@@ -35,7 +35,7 @@
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/likely.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/rand.h"
 
 __notice(mt19937_notice, "mt19937 (BSD-3)\n\
diff --git a/libc/stdio/nftw.c b/libc/stdio/nftw.c
index 5bcd8dc06..53c5ca2da 100644
--- a/libc/stdio/nftw.c
+++ b/libc/stdio/nftw.c
@@ -102,24 +102,15 @@ static int do_nftw(char *path,
 		dfd = open(path, O_RDONLY | O_DIRECTORY);
 		err = errno;
 		if (dfd < 0 && err == EACCES) type = FTW_DNR;
-		if (!fd_limit) {
-			close(dfd);
-			dfd = -1;
-		}
+		if (!fd_limit) close(dfd);
         }
 
-	if (!(flags & FTW_DEPTH) && (r=fn(path, &st, type, &lev))) {
-		if (dfd != -1)
-			close(dfd);
+	if (!(flags & FTW_DEPTH) && (r=fn(path, &st, type, &lev)))
 		return r;
-	}
 
 	for (; h; h = h->chain)
-		if (h->dev == st.st_dev && h->ino == st.st_ino) {
-			if (dfd != -1)
-				close(dfd);
+		if (h->dev == st.st_dev && h->ino == st.st_ino)
 			return 0;
-		}
 
 	if ((type == FTW_D || type == FTW_DP) && fd_limit) {
 		if (dfd < 0) {
diff --git a/libc/system/pclose.c b/libc/stdio/pclose.c
similarity index 100%
rename from libc/system/pclose.c
rename to libc/stdio/pclose.c
diff --git a/libc/system/popen.c b/libc/stdio/popen.c
similarity index 93%
rename from libc/system/popen.c
rename to libc/stdio/popen.c
index 2636cc5ff..d53d5a426 100644
--- a/libc/system/popen.c
+++ b/libc/stdio/popen.c
@@ -22,6 +22,7 @@
 #include "libc/intrin/weaken.h"
 #include "libc/paths.h"
 #include "libc/runtime/runtime.h"
+#include "libc/stdio/fflush.internal.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/sysv/consts/f.h"
@@ -53,7 +54,7 @@
  * @cancelationpoint
  */
 FILE *popen(const char *cmdline, const char *mode) {
-  FILE *f;
+  FILE *f, *f2;
   int e, rc, pid, dir, flags, pipefds[2];
   flags = fopenflags(mode);
   if ((flags & O_ACCMODE) == O_RDONLY) {
@@ -74,28 +75,20 @@ FILE *popen(const char *cmdline, const char *mode) {
   if ((f = fdopen(pipefds[dir], mode))) {
     switch ((pid = fork())) {
       case 0:
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wanalyzer-fd-leak"
         unassert(dup2(pipefds[!dir], !dir) == !dir);
-#pragma GCC diagnostic pop
         // we can't rely on cloexec because cocmd builtins don't execve
         if (pipefds[0] != !dir)
           unassert(!close(pipefds[0]));
         if (pipefds[1] != !dir)
           unassert(!close(pipefds[1]));
-
         // "The popen() function shall ensure that any streams from
         //  previous popen() calls that remain open in the parent
         //  process are closed in the new child process." -POSIX
-        for (struct Dll *e = dll_first(__stdio.files); e;
-             e = dll_next(__stdio.files, e)) {
-          FILE *f2 = FILE_CONTAINER(e);
-          if (f != f2 && f2->pid && f2->fd != -1) {
+        for (int i = 0; i < __fflush.handles.i; ++i) {
+          if ((f2 = __fflush.handles.p[i]) && f2->pid) {
             close(f2->fd);
-            f2->fd = -1;
           }
         }
-
         _Exit(_cocmd(3,
                      (char *[]){
                          "popen",
diff --git a/libc/stdio/printargs.c b/libc/stdio/printargs.c
index b21008128..3e2ba0bb7 100644
--- a/libc/stdio/printargs.c
+++ b/libc/stdio/printargs.c
@@ -32,7 +32,7 @@
 #include "libc/intrin/promises.h"
 #include "libc/intrin/strace.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/cpuid4.internal.h"
 #include "libc/nexgen32e/kcpuids.h"
 #include "libc/nexgen32e/x86feature.h"
@@ -306,7 +306,7 @@ textstartup void __printargs(const char *prologue) {
       if (i && (u.pfds[i].revents & POLLNVAL))
         continue;
       PRINT(" ☼ %d (revents=%#hx fcntl(F_GETFL)=%s isatty()=%hhhd)", i,
-            u.pfds[i].revents, _DescribeOpenFlags(oflagbuf, fcntl(i, F_GETFL)),
+            u.pfds[i].revents, (DescribeOpenFlags)(oflagbuf, fcntl(i, F_GETFL)),
             isatty(i));
     }
   } else {
@@ -375,7 +375,7 @@ textstartup void __printargs(const char *prologue) {
         rlim.rlim_cur = -1;
       if (rlim.rlim_max == RLIM_INFINITY)
         rlim.rlim_max = -1;
-      PRINT(" ☼ %-20s %,16ld %,16ld", _DescribeRlimitName(buf, i),
+      PRINT(" ☼ %-20s %,16ld %,16ld", (DescribeRlimitName)(buf, i),
             rlim.rlim_cur, rlim.rlim_max);
       gotsome = true;
     }
diff --git a/libc/stdio/rand.c b/libc/stdio/rand.c
index 1a5aad654..1802d99b2 100644
--- a/libc/stdio/rand.c
+++ b/libc/stdio/rand.c
@@ -17,17 +17,9 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/stdio/rand.h"
+#include "libc/stdio/internal.h"
 #include "libc/stdio/lcg.internal.h"
 
-static uint64_t rando;
-
-/**
- * Seeds random number generator that's used by rand().
- */
-void srand(unsigned seed) {
-  rando = seed;
-}
-
 /**
  * Returns 31-bit linear congruential pseudorandom number, e.g.
  *
@@ -47,5 +39,5 @@ void srand(unsigned seed) {
  * @threadunsafe
  */
 int rand(void) {
-  return KnuthLinearCongruentialGenerator(&rando) >> 33;
+  return KnuthLinearCongruentialGenerator(&g_rando) >> 33;
 }
diff --git a/libc/stdio/scandir.c b/libc/stdio/scandir.c
index 2c1d9811d..e9be6e664 100644
--- a/libc/stdio/scandir.c
+++ b/libc/stdio/scandir.c
@@ -69,9 +69,7 @@ int scandir(const char *path, struct dirent ***res,
 	}
 	errno = old_errno;
 
-	if (cmp && names) {
-		qsort(names, cnt, sizeof *names, (int (*)(const void *, const void *))cmp);
-	}
+	if (cmp) qsort(names, cnt, sizeof *names, (int (*)(const void *, const void *))cmp);
 	*res = names;
 	return cnt;
 }
diff --git a/libc/stdio/setvbuf.c b/libc/stdio/setvbuf.c
index e3ef1d1fb..6be7ca74b 100644
--- a/libc/stdio/setvbuf.c
+++ b/libc/stdio/setvbuf.c
@@ -38,13 +38,15 @@ int setvbuf(FILE *f, char *buf, int mode, size_t size) {
   if (buf) {
     if (!size)
       size = BUFSIZ;
-    if (f->freebuf)
-      if (f->buf != buf)
-        if (_weaken(free))
-          _weaken(free)(f->buf);
+    if (!f->nofree &&        //
+        f->buf != buf &&     //
+        f->buf != f->mem &&  //
+        _weaken(free)) {
+      _weaken(free)(f->buf);
+    }
     f->buf = buf;
     f->size = size;
-    f->freebuf = 0;
+    f->nofree = true;
   }
   f->bufmode = mode;
   funlockfile(f);
diff --git a/libc/stdio/srand.c b/libc/stdio/srand.c
new file mode 100644
index 000000000..8b072163e
--- /dev/null
+++ b/libc/stdio/srand.c
@@ -0,0 +1,28 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/stdio/rand.h"
+
+extern uint64_t g_rando;
+
+/**
+ * Seeds random number generator that's used by rand().
+ */
+void srand(unsigned seed) {
+  g_rando = seed;
+}
diff --git a/libc/stdio/stderr.c b/libc/stdio/stderr.c
index 72af4c828..de694ed62 100644
--- a/libc/stdio/stderr.c
+++ b/libc/stdio/stderr.c
@@ -1,5 +1,5 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+│ vi: set et ft=c ts=8 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
 │ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
@@ -16,17 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/dll.h"
 #include "libc/stdio/internal.h"
 #include "libc/sysv/consts/fileno.h"
 #include "libc/sysv/consts/o.h"
+#include "libc/thread/thread.h"
 
 static FILE __stderr = {
     .fd = STDERR_FILENO,
     .bufmode = _IONBF,
-    .oflags = O_WRONLY,
+    .iomode = O_WRONLY,
+    .buf = __stderr.mem,
+    .size = sizeof(stderr->mem),
     .lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP,
-    .elem = {&__stderr.elem, &__stderr.elem},
 };
 
 /**
@@ -34,6 +35,6 @@ static FILE __stderr = {
  */
 FILE *stderr = &__stderr;
 
-__attribute__((__constructor__(60))) static textstartup void stderr_init(void) {
-  dll_make_last(&__stdio.files, &__stderr.elem);
+__attribute__((__constructor__(60))) static textstartup void errinit(void) {
+  __fflush_register(stderr);
 }
diff --git a/libc/stdio/stdin.c b/libc/stdio/stdin.c
index 8b1b44b9d..c5c3f6c2e 100644
--- a/libc/stdio/stdin.c
+++ b/libc/stdio/stdin.c
@@ -1,5 +1,5 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+│ vi: set et ft=c ts=8 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
 │ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
@@ -17,25 +17,19 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/stat.h"
-#include "libc/intrin/dll.h"
 #include "libc/stdio/internal.h"
 #include "libc/sysv/consts/fileno.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/s.h"
 #include "libc/thread/thread.h"
 
-__static_yoink("fflush");
-
-static char __stdin_buf[BUFSIZ];
-
 static FILE __stdin = {
     .fd = STDIN_FILENO,
-    .oflags = O_RDONLY,
+    .iomode = O_RDONLY,
     .bufmode = _IOFBF,
-    .buf = __stdin_buf,
-    .size = sizeof(__stdin_buf),
+    .buf = __stdin.mem,
+    .size = sizeof(stdin->mem),
     .lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP,
-    .elem = {&__stdin.elem, &__stdin.elem},
 };
 
 /**
@@ -43,9 +37,9 @@ static FILE __stdin = {
  */
 FILE *stdin = &__stdin;
 
-__attribute__((__constructor__(60))) static textstartup void stdin_init(void) {
+__attribute__((__constructor__(60))) static textstartup void initin(void) {
   struct stat st;
-  if (fstat(STDIN_FILENO, &st) || S_ISCHR(st.st_mode))
+  if (fstat(STDIN_FILENO, &st) || !S_ISREG(st.st_mode))
     stdin->bufmode = _IONBF;
-  dll_make_last(&__stdio.files, &__stdin.elem);
+  __fflush_register(stdin);
 }
diff --git a/libc/stdio/stdio.h b/libc/stdio/stdio.h
index 8944d949f..02773651b 100644
--- a/libc/stdio/stdio.h
+++ b/libc/stdio/stdio.h
@@ -19,7 +19,6 @@ COSMOPOLITAN_C_START_
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § standard i/o                                              ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
-/* clang-format off */
 
 struct FILE;
 typedef struct FILE FILE;
@@ -28,146 +27,147 @@ extern FILE *stdin;
 extern FILE *stdout;
 extern FILE *stderr;
 
-errno_t ferror(FILE *) libcesque paramsnonnull() __read_write(1);
-void clearerr(FILE *) libcesque paramsnonnull() __read_write(1);
-int feof(FILE *) libcesque paramsnonnull() __read_write(1);
-int getc(FILE *) libcesque paramsnonnull() __read_write(1);
-int putc(int, FILE *) libcesque paramsnonnull() __read_write(2);
-int fflush(FILE *) libcesque __read_write(1);
-int fpurge(FILE *) libcesque __read_write(1);
-int fgetc(FILE *) libcesque paramsnonnull() __read_write(1);
-char *fgetln(FILE *, size_t *) libcesque paramsnonnull((1)) __read_write(1) __write_only(2);
-int ungetc(int, FILE *) libcesque paramsnonnull() __write_only(2);
-int fileno(FILE *) libcesque paramsnonnull() nosideeffect __write_only(1);
-int fputc(int, FILE *) libcesque paramsnonnull() __write_only(2);
-int fputs(const char *, FILE *) libcesque paramsnonnull() __write_only(2);
-int fputws(const wchar_t *, FILE *) libcesque paramsnonnull() __write_only(2);
-void flockfile(FILE *) libcesque __write_only(1);
-void funlockfile(FILE *) libcesque paramsnonnull() __write_only(1);
-int ftrylockfile(FILE *) libcesque paramsnonnull() __write_only(1);
-char *fgets(char *, int, FILE *) libcesque paramsnonnull() __write_only(1, 2) __read_write(3);
-wchar_t *fgetws(wchar_t *, int, FILE *) libcesque paramsnonnull() __write_only(1, 2) __read_write(3);
-wint_t putwc(wchar_t, FILE *) libcesque paramsnonnull() __write_only(2);
-wint_t fputwc(wchar_t, FILE *) libcesque paramsnonnull() __write_only(2);
+errno_t ferror(FILE *) libcesque paramsnonnull();
+void clearerr(FILE *) libcesque paramsnonnull();
+int feof(FILE *) libcesque paramsnonnull();
+int getc(FILE *) libcesque paramsnonnull();
+int putc(int, FILE *) libcesque paramsnonnull();
+int fflush(FILE *) libcesque;
+int fpurge(FILE *) libcesque;
+int fgetc(FILE *) libcesque paramsnonnull();
+char *fgetln(FILE *, size_t *) libcesque paramsnonnull((1));
+int ungetc(int, FILE *) libcesque paramsnonnull();
+int fileno(FILE *) libcesque paramsnonnull() nosideeffect;
+int fputc(int, FILE *) libcesque paramsnonnull();
+int fputs(const char *, FILE *) libcesque paramsnonnull();
+int fputws(const wchar_t *, FILE *) libcesque paramsnonnull();
+void flockfile(FILE *) libcesque paramsnonnull();
+void funlockfile(FILE *) libcesque paramsnonnull();
+int ftrylockfile(FILE *) libcesque paramsnonnull();
+char *fgets(char *, int, FILE *) libcesque paramsnonnull();
+wchar_t *fgetws(wchar_t *, int, FILE *) libcesque paramsnonnull();
+wint_t putwc(wchar_t, FILE *) libcesque paramsnonnull();
+wint_t fputwc(wchar_t, FILE *) libcesque paramsnonnull();
 wint_t putwchar(wchar_t) libcesque;
 wint_t getwchar(void) libcesque;
-wint_t getwc(FILE *) libcesque paramsnonnull() __write_only(1);
-wint_t fgetwc(FILE *) libcesque paramsnonnull() __write_only(1);
-wint_t ungetwc(wint_t, FILE *) libcesque paramsnonnull() __write_only(2);
+wint_t getwc(FILE *) libcesque paramsnonnull();
+wint_t fgetwc(FILE *) libcesque paramsnonnull();
+wint_t ungetwc(wint_t, FILE *) libcesque paramsnonnull();
 int getchar(void) libcesque;
 int putchar(int) libcesque;
-int puts(const char *) libcesque __read_only(1);
-
-ssize_t getline(char **, size_t *, FILE *) libcesque paramsnonnull() __read_write(1) __read_write(2) __read_write(3);
-ssize_t getdelim(char **, size_t *, int, FILE *) libcesque paramsnonnull() __read_write(1) __read_write(2) __read_write(4);
-FILE *fopen(const char *, const char *) libcesque paramsnonnull((2)) __read_only(1) __read_only(2) __wur;
-FILE *fdopen(int, const char *) libcesque paramsnonnull() __read_only(2) __wur;
-FILE *fmemopen(void *, size_t, const char *) libcesque paramsnonnull((3)) __read_write(1) __read_only(3) __wur;
-FILE *freopen(const char *, const char *, FILE *) paramsnonnull((2, 3)) __read_only(1) __read_only(2) __read_write(3);
-size_t fread(void *, size_t, size_t, FILE *) libcesque paramsnonnull((4)) __write_only(1) __read_write(4);
-size_t fwrite(const void *, size_t, size_t, FILE *) paramsnonnull((4)) __read_only(1) __read_write(4);
-int fclose(FILE *) libcesque __read_write(1);
-int fseek(FILE *, long, int) libcesque paramsnonnull() __read_write(1);
-long ftell(FILE *) libcesque paramsnonnull() __read_write(1);
-int fseeko(FILE *, int64_t, int) libcesque paramsnonnull() __read_write(1);
-int64_t ftello(FILE *) libcesque paramsnonnull() __read_write(1);
-void rewind(FILE *) libcesque paramsnonnull() __read_write(1);
-int fopenflags(const char *) libcesque paramsnonnull() __read_only(1);
-void setlinebuf(FILE *) libcesque __read_write(1);
-void setbuf(FILE *, char *) libcesque __read_write(1) __write_only(2);
-void setbuffer(FILE *, char *, size_t) libcesque __read_write(1) __write_only(2);
-int setvbuf(FILE *, char *, int, size_t) libcesque __read_write(1);
-int pclose(FILE *) libcesque __read_write(1);
-char *ctermid(char *) libcesque __write_only(1);
-void perror(const char *) libcesque relegated __read_only(1);
+int puts(const char *) libcesque;
+ssize_t getline(char **, size_t *, FILE *) libcesque paramsnonnull();
+ssize_t getdelim(char **, size_t *, int, FILE *) libcesque paramsnonnull();
+FILE *fopen(const char *, const char *) libcesque paramsnonnull((2)) __wur;
+FILE *fdopen(int, const char *) libcesque paramsnonnull() __wur;
+FILE *fmemopen(void *, size_t, const char *) libcesque paramsnonnull((3)) __wur;
+FILE *freopen(const char *, const char *, FILE *) paramsnonnull((2, 3));
+size_t fread(void *, size_t, size_t, FILE *) libcesque paramsnonnull((4));
+size_t fwrite(const void *, size_t, size_t, FILE *) paramsnonnull((4));
+int fclose(FILE *) libcesque;
+int fseek(FILE *, long, int) libcesque paramsnonnull();
+long ftell(FILE *) libcesque paramsnonnull();
+int fseeko(FILE *, int64_t, int) libcesque paramsnonnull();
+int64_t ftello(FILE *) libcesque paramsnonnull();
+void rewind(FILE *) libcesque paramsnonnull();
+int fopenflags(const char *) libcesque paramsnonnull();
+void setlinebuf(FILE *) libcesque;
+void setbuf(FILE *, char *) libcesque;
+void setbuffer(FILE *, char *, size_t) libcesque;
+int setvbuf(FILE *, char *, int, size_t) libcesque;
+int pclose(FILE *) libcesque;
+char *ctermid(char *) libcesque;
+void perror(const char *) libcesque relegated;
 
 typedef uint64_t fpos_t;
-char *gets(char *) libcesque paramsnonnull() __write_only(1);
-int fgetpos(FILE *, fpos_t *) libcesque paramsnonnull() __read_write(1) __write_only(2);
-int fsetpos(FILE *, const fpos_t *) libcesque paramsnonnull() __read_write(1) __read_only(2);
+char *gets(char *) libcesque paramsnonnull();
+int fgetpos(FILE *, fpos_t *) libcesque paramsnonnull();
+int fsetpos(FILE *, const fpos_t *) libcesque paramsnonnull();
 
 FILE *tmpfile(void) libcesque __wur;
-char *tmpnam(char *) libcesque __write_only(1) __wur;
-char *tmpnam_r(char *) libcesque __write_only(1) __wur;
+char *tmpnam(char *) libcesque __wur;
+char *tmpnam_r(char *) libcesque __wur;
 
-FILE *popen(const char *, const char *) libcesque __read_only(1) __read_only(2);
+FILE *popen(const char *, const char *) libcesque;
 
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § standard i/o » formatting                                 ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
 
-int printf(const char *, ...) printfesque(1) paramsnonnull((1)) libcesque __read_only(1);
-int vprintf(const char *, va_list) paramsnonnull() libcesque __read_only(1);
-int fprintf(FILE *, const char *, ...) printfesque(2) paramsnonnull((1, 2)) libcesque __read_write(1) __read_only(2);
-int vfprintf(FILE *, const char *, va_list) paramsnonnull() libcesque __read_write(1) __read_only(2);
-int scanf(const char *, ...) libcesque scanfesque(1) __read_only(1);
-int vscanf(const char *, va_list) libcesque __read_only(1);
-int fscanf(FILE *, const char *, ...) libcesque scanfesque(2) __read_write(1) __read_only(2);
-int vfscanf(FILE *, const char *, va_list) libcesque __read_write(1) __read_only(2);
+int printf(const char *, ...) printfesque(1) paramsnonnull((1)) libcesque;
+int vprintf(const char *, va_list) paramsnonnull() libcesque;
+int fprintf(FILE *, const char *, ...) printfesque(2)
+    paramsnonnull((1, 2)) libcesque;
+int vfprintf(FILE *, const char *, va_list) paramsnonnull() libcesque;
+int scanf(const char *, ...) libcesque scanfesque(1);
+int vscanf(const char *, va_list) libcesque;
+int fscanf(FILE *, const char *, ...) libcesque scanfesque(2);
+int vfscanf(FILE *, const char *, va_list) libcesque;
 
-int snprintf(char *, size_t, const char *, ...) printfesque(3) libcesque __write_only(1) __read_only(3);
-int vsnprintf(char *, size_t, const char *, va_list) libcesque __write_only(1) __read_only(3);
-int sprintf(char *, const char *, ...) libcesque __write_only(1) __read_only(2);
-int vsprintf(char *, const char *, va_list) libcesque __write_only(1) __read_only(2);
+int snprintf(char *, size_t, const char *, ...) printfesque(3) libcesque;
+int vsnprintf(char *, size_t, const char *, va_list) libcesque;
+int sprintf(char *, const char *, ...) libcesque;
+int vsprintf(char *, const char *, va_list) libcesque;
 
-int fwprintf(FILE *, const wchar_t *, ...) libcesque __read_write(1) __read_only(2);
-int fwscanf(FILE *, const wchar_t *, ...) libcesque __read_write(1) __read_only(2);
-int swprintf(wchar_t *, size_t, const wchar_t *, ...) libcesque __write_only(1) __read_only(3);
-int swscanf(const wchar_t *, const wchar_t *, ...) libcesque __read_only(1) __read_only(2);
-int vfwprintf(FILE *, const wchar_t *, va_list) libcesque __read_write(1) __read_only(2);
-int vfwscanf(FILE *, const wchar_t *, va_list) libcesque __read_write(1) __read_only(2);
-int vswprintf(wchar_t *, size_t, const wchar_t *, va_list) libcesque __write_only(1) __read_only(3);
-int vswscanf(const wchar_t *, const wchar_t *, va_list) libcesque __read_only(1) __read_only(2);
-int vwprintf(const wchar_t *, va_list) libcesque __read_only(1);
-int vwscanf(const wchar_t *, va_list) libcesque __read_only(1);
-int wprintf(const wchar_t *, ...) libcesque __read_only(1);
-int wscanf(const wchar_t *, ...) libcesque __read_only(1);
-int fwide(FILE *, int) libcesque __read_write(1);
+int fwprintf(FILE *, const wchar_t *, ...) libcesque;
+int fwscanf(FILE *, const wchar_t *, ...) libcesque;
+int swprintf(wchar_t *, size_t, const wchar_t *, ...) libcesque;
+int swscanf(const wchar_t *, const wchar_t *, ...) libcesque;
+int vfwprintf(FILE *, const wchar_t *, va_list) libcesque;
+int vfwscanf(FILE *, const wchar_t *, va_list) libcesque;
+int vswprintf(wchar_t *, size_t, const wchar_t *, va_list) libcesque;
+int vswscanf(const wchar_t *, const wchar_t *, va_list) libcesque;
+int vwprintf(const wchar_t *, va_list) libcesque;
+int vwscanf(const wchar_t *, va_list) libcesque;
+int wprintf(const wchar_t *, ...) libcesque;
+int wscanf(const wchar_t *, ...) libcesque;
+int fwide(FILE *, int) libcesque;
 
-int sscanf(const char *, const char *, ...) libcesque scanfesque(2) __read_only(1) __read_only(2);
-int vsscanf(const char *, const char *, va_list) libcesque __read_only(1) __read_only(2);
+int sscanf(const char *, const char *, ...) libcesque scanfesque(2);
+int vsscanf(const char *, const char *, va_list) libcesque;
 
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § standard i/o » allocating                                 ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
 
-int asprintf(char **, const char *, ...) printfesque(2) paramsnonnull((1, 2)) libcesque __write_only(1);
-int vasprintf(char **, const char *, va_list) paramsnonnull() libcesque __write_only(1);
+int asprintf(char **, const char *, ...) printfesque(2)
+    paramsnonnull((1, 2)) libcesque;
+int vasprintf(char **, const char *, va_list) paramsnonnull() libcesque;
 
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § standard i/o » without mutexes                            ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
 
-int getc_unlocked(FILE *) libcesque paramsnonnull() __read_write(1);
-int puts_unlocked(const char *) libcesque __read_only(1);
+int getc_unlocked(FILE *) libcesque paramsnonnull();
+int puts_unlocked(const char *) libcesque;
 int getchar_unlocked(void) libcesque;
-int putc_unlocked(int, FILE *) libcesque paramsnonnull() __read_write(2);
+int putc_unlocked(int, FILE *) libcesque paramsnonnull();
 int putchar_unlocked(int) libcesque;
-void clearerr_unlocked(FILE *) libcesque __write_only(1);
-int feof_unlocked(FILE *) libcesque __read_only(1);
-int ferror_unlocked(FILE *) libcesque __read_only(1);
-int fileno_unlocked(FILE *) libcesque __read_only(1);
-int fflush_unlocked(FILE *) libcesque __read_write(1);
-int fgetc_unlocked(FILE *) libcesque __read_write(1);
-int fputc_unlocked(int, FILE *) libcesque __read_write(2);
-size_t fread_unlocked(void *, size_t, size_t, FILE *) libcesque __write_only(1) __read_write(4);
-size_t fwrite_unlocked(const void *, size_t, size_t, FILE *) libcesque __read_only(1) __read_write(4);
-char *fgets_unlocked(char *, int, FILE *) libcesque __write_only(1) __read_write(3);
-int fputs_unlocked(const char *, FILE *) libcesque __read_only(1) __read_write(2);
-wint_t getwc_unlocked(FILE *) libcesque __read_write(1);
+void clearerr_unlocked(FILE *) libcesque;
+int feof_unlocked(FILE *) libcesque;
+int ferror_unlocked(FILE *) libcesque;
+int fileno_unlocked(FILE *) libcesque;
+int fflush_unlocked(FILE *) libcesque;
+int fgetc_unlocked(FILE *) libcesque;
+int fputc_unlocked(int, FILE *) libcesque;
+size_t fread_unlocked(void *, size_t, size_t, FILE *) libcesque;
+size_t fwrite_unlocked(const void *, size_t, size_t, FILE *) libcesque;
+char *fgets_unlocked(char *, int, FILE *) libcesque;
+int fputs_unlocked(const char *, FILE *) libcesque;
+wint_t getwc_unlocked(FILE *) libcesque;
 wint_t getwchar_unlocked(void) libcesque;
-wint_t fgetwc_unlocked(FILE *) libcesque __read_write(1);
-wint_t fputwc_unlocked(wchar_t, FILE *) libcesque __read_write(2);
-wint_t putwc_unlocked(wchar_t, FILE *) libcesque __read_write(2);
+wint_t fgetwc_unlocked(FILE *) libcesque;
+wint_t fputwc_unlocked(wchar_t, FILE *) libcesque;
+wint_t putwc_unlocked(wchar_t, FILE *) libcesque;
 wint_t putwchar_unlocked(wchar_t) libcesque;
-wchar_t *fgetws_unlocked(wchar_t *, int, FILE *) libcesque __write_only(1, 2) __read_write(3);
-int fputws_unlocked(const wchar_t *, FILE *) libcesque __read_only(1) __read_write(2);
-wint_t ungetwc_unlocked(wint_t, FILE *) libcesque paramsnonnull() __read_write(2);
-int ungetc_unlocked(int, FILE *) libcesque paramsnonnull() __read_write(2);
-int fseek_unlocked(FILE *, int64_t, int) libcesque paramsnonnull() __read_write(1);
-ssize_t getdelim_unlocked(char **, size_t *, int, FILE *) paramsnonnull() __read_write(1) __read_write(2) __read_write(4);
-int fprintf_unlocked(FILE *, const char *, ...) printfesque(2) libcesque __read_write(1) __read_only(2);
-int vfprintf_unlocked(FILE *, const char *, va_list) paramsnonnull() libcesque __read_write(1) __read_only(2);
+wchar_t *fgetws_unlocked(wchar_t *, int, FILE *) libcesque;
+int fputws_unlocked(const wchar_t *, FILE *) libcesque;
+wint_t ungetwc_unlocked(wint_t, FILE *) libcesque paramsnonnull();
+int ungetc_unlocked(int, FILE *) libcesque paramsnonnull();
+int fseek_unlocked(FILE *, int64_t, int) libcesque paramsnonnull();
+ssize_t getdelim_unlocked(char **, size_t *, int, FILE *) paramsnonnull();
+int fprintf_unlocked(FILE *, const char *, ...) printfesque(2) libcesque;
+int vfprintf_unlocked(FILE *, const char *, va_list) paramsnonnull() libcesque;
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_STDIO_H_ */
diff --git a/libc/stdio/stdout.c b/libc/stdio/stdout.c
index 86a34f9f3..4c6b9b2d6 100644
--- a/libc/stdio/stdout.c
+++ b/libc/stdio/stdout.c
@@ -1,5 +1,5 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+│ vi: set et ft=c ts=8 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
 │ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
@@ -16,22 +16,17 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/dll.h"
 #include "libc/stdio/internal.h"
 #include "libc/sysv/consts/fileno.h"
 #include "libc/sysv/consts/o.h"
-
-__static_yoink("fflush");
-
-static char __stdout_buf[BUFSIZ];
+#include "libc/thread/thread.h"
 
 static FILE __stdout = {
     .fd = STDOUT_FILENO,
-    .oflags = O_WRONLY,
-    .buf = __stdout_buf,
-    .size = sizeof(__stdout_buf),
+    .iomode = O_WRONLY,
+    .buf = __stdout.mem,
+    .size = sizeof(stdout->mem),
     .lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP,
-    .elem = {&__stdout.elem, &__stdout.elem},
 
     // Unlike other C libraries we don't bother calling fstat() to check
     // if stdio is a character device and we instead choose to always
@@ -47,6 +42,6 @@ static FILE __stdout = {
  */
 FILE *stdout = &__stdout;
 
-__attribute__((__constructor__(60))) static textstartup void stdout_init(void) {
-  dll_make_last(&__stdio.files, &__stdout.elem);
+__attribute__((__constructor__(60))) static textstartup void outinit(void) {
+  __fflush_register(stdout);
 }
diff --git a/libc/stdio/strfry.c b/libc/stdio/strfry.c
index 56a703cbd..eac05107d 100644
--- a/libc/stdio/strfry.c
+++ b/libc/stdio/strfry.c
@@ -16,23 +16,14 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/mem/shuffle.internal.h"
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
 
 /**
- * Performs Fisher-Yates shuffle on string in-place to create anagram.
- *
- * This implementation uses rand() so `srand(time(0))` may be desired.
+ * Jumbles up string.
  */
 char *strfry(char *s) {
-  size_t i = strlen(s);
-  while (i > 1) {
-    size_t x = rand();
-    size_t y = rand();
-    size_t j = ((x << 31) ^ y) % i--;
-    char t = s[j];
-    s[j] = s[i];
-    s[i] = t;
-  }
+  shuffle(rand, s, strlen(s));
   return s;
 }
diff --git a/libc/stdio/vappendf.c b/libc/stdio/vappendf.c
index 5ed5ab10f..726e59450 100644
--- a/libc/stdio/vappendf.c
+++ b/libc/stdio/vappendf.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/append.h"
 #include "libc/stdio/stdio.h"
diff --git a/libc/stdio/vasprintf.c b/libc/stdio/vasprintf.c
index 907ca8ffa..3f05529d0 100644
--- a/libc/stdio/vasprintf.c
+++ b/libc/stdio/vasprintf.c
@@ -52,7 +52,6 @@ int vasprintf(char **strp, const char *fmt, va_list va) {
     *strp = p;
     return rc;
   } else {
-    free(p);
     return -1;
   }
 }
diff --git a/libc/stdio/vcscanf.c b/libc/stdio/vcscanf.c
index ddfa01097..ad91f88d8 100644
--- a/libc/stdio/vcscanf.c
+++ b/libc/stdio/vcscanf.c
@@ -23,7 +23,7 @@
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/str/tpdecodecb.internal.h"
 #include "libc/str/utf16.h"
 #include "libc/sysv/errfuns.h"
@@ -254,15 +254,11 @@ int __vcscanf(int callback(void *),    //
                 c = READ;
               }
               fpbufsize = FP_BUFFER_GROW;
-              if ((fpbuf = malloc(fpbufsize))) {
-                fpbufcur = 0;
-                fpbuf[fpbufcur++] = c;
-                fpbuf[fpbufcur] = '\0';
-                goto ConsumeFloatingPointNumber;
-              } else {
-                items = -1;
-                goto Done;
-              }
+              fpbuf = malloc(fpbufsize);
+              fpbufcur = 0;
+              fpbuf[fpbufcur++] = c;
+              fpbuf[fpbufcur] = '\0';
+              goto ConsumeFloatingPointNumber;
             default:
               items = einval();
               goto Done;
@@ -517,16 +513,12 @@ int __vcscanf(int callback(void *),    //
         if (discard) {
           buf = NULL;
         } else if (ismalloc) {
-          if ((buf = malloc(bufsize * charbytes))) {
-            struct FreeMe *entry;
-            if (buf && (entry = calloc(1, sizeof(struct FreeMe)))) {
-              entry->ptr = buf;
-              entry->next = freeme;
-              freeme = entry;
-            }
-          } else {
-            items = -1;
-            goto Done;
+          buf = malloc(bufsize * charbytes);
+          struct FreeMe *entry;
+          if (buf && (entry = calloc(1, sizeof(struct FreeMe)))) {
+            entry->ptr = buf;
+            entry->next = freeme;
+            freeme = entry;
           }
         } else {
           buf = va_arg(va, void *);
diff --git a/libc/stdio/vdprintf.c b/libc/stdio/vdprintf.c
index 15d2d7909..dc344804e 100644
--- a/libc/stdio/vdprintf.c
+++ b/libc/stdio/vdprintf.c
@@ -20,7 +20,7 @@
 #include "libc/dce.h"
 #include "libc/fmt/internal.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/files.h"
 #include "libc/sock/sock.h"
 #include "libc/str/str.h"
@@ -63,7 +63,7 @@ int vdprintf(int fd, const char *fmt, va_list va) {
   t.n = 0;
   t.t = 0;
   t.fd = fd;
-  if (__fmt(vdprintf_putc, &t, fmt, va, &t.t) == -1)
+  if (__fmt(vdprintf_putc, &t, fmt, va) == -1)
     return -1;
   if (t.n) {
     iov[0].iov_base = t.b;
diff --git a/libc/stdio/vfprintf_unlocked.c b/libc/stdio/vfprintf_unlocked.c
index d0285dcbd..a6ed81dbd 100644
--- a/libc/stdio/vfprintf_unlocked.c
+++ b/libc/stdio/vfprintf_unlocked.c
@@ -19,7 +19,6 @@
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/fmt/internal.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/stdckdint.h"
 #include "libc/stdio/internal.h"
 #include "libc/stdio/stdio.h"
@@ -47,8 +46,9 @@ static int __vfprintf_flbuf(const char *s, struct state *t, size_t n) {
     } else {
       rc = -1;
     }
-    if (ckd_add(&t->n, t->n, n))
+    if (ckd_add(&t->n, t->n, n)) {
       rc = eoverflow();
+    }
   } else {
     rc = 0;
   }
@@ -60,8 +60,9 @@ static int __vfprintf_nbuf(const char *s, struct state *t, size_t n) {
   for (i = 0; i < n; ++i) {
     t->b.p[t->b.n++] = s[i];
     if (t->b.n == sizeof(t->b.p)) {
-      if (!fwrite_unlocked(t->b.p, 1, t->b.n, t->f))
+      if (!fwrite_unlocked(t->b.p, 1, t->b.n, t->f)) {
         return -1;
+      }
       t->b.n = 0;
     } else if (ckd_add(&t->n, t->n, 1)) {
       return eoverflow();
@@ -86,11 +87,13 @@ int vfprintf_unlocked(FILE *f, const char *fmt, va_list va) {
   st.f = f;
   st.n = 0;
   st.b.n = 0;
-  if ((rc = __fmt(out, &st, fmt, va, &st.n)) != -1) {
+  if ((rc = __fmt(out, &st, fmt, va)) != -1) {
     if (!st.b.n) {
       rc = st.n;
     } else if (fwrite_unlocked(st.b.p, 1, st.b.n, st.f)) {
-      rc = st.n;
+      if (ckd_add(&rc, st.n, st.b.n)) {
+        rc = eoverflow();
+      }
     } else {
       rc = -1;
     }
diff --git a/libc/stdio/vsnprintf.c b/libc/stdio/vsnprintf.c
index a113335df..cdc7d3521 100644
--- a/libc/stdio/vsnprintf.c
+++ b/libc/stdio/vsnprintf.c
@@ -20,14 +20,14 @@
 #include "libc/dce.h"
 #include "libc/fmt/internal.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 
 struct SprintfStr {
   char *p;
-  int i;
-  int n;
+  size_t i;
+  size_t n;
 };
 
 static int vsnprintfputchar(const char *s, struct SprintfStr *t, size_t n) {
@@ -58,7 +58,7 @@ static int vsnprintfputchar(const char *s, struct SprintfStr *t, size_t n) {
  */
 int vsnprintf(char *buf, size_t size, const char *fmt, va_list va) {
   struct SprintfStr str = {buf, 0, size};
-  int rc = __fmt(vsnprintfputchar, &str, fmt, va, &str.i);
+  int rc = __fmt(vsnprintfputchar, &str, fmt, va);
   if (rc < 0)
     return rc;
   if (str.n)
diff --git a/libc/str/BUILD.mk b/libc/str/BUILD.mk
index b0b2a163a..ab0193593 100644
--- a/libc/str/BUILD.mk
+++ b/libc/str/BUILD.mk
@@ -12,19 +12,16 @@ LIBC_STR_A_INCS = $(filter %.inc,$(LIBC_STR_A_FILES))
 LIBC_STR_A_SRCS_A = $(filter %.s,$(LIBC_STR_A_FILES))
 LIBC_STR_A_SRCS_S = $(filter %.S,$(LIBC_STR_A_FILES))
 LIBC_STR_A_SRCS_C = $(filter %.c,$(LIBC_STR_A_FILES))
-LIBC_STR_A_SRCS_CC = $(filter %.cc,$(LIBC_STR_A_FILES))
 
 LIBC_STR_A_SRCS =						\
 	$(LIBC_STR_A_SRCS_A)					\
 	$(LIBC_STR_A_SRCS_S)					\
-	$(LIBC_STR_A_SRCS_C)					\
-	$(LIBC_STR_A_SRCS_CC)
+	$(LIBC_STR_A_SRCS_C)
 
 LIBC_STR_A_OBJS =						\
 	$(LIBC_STR_A_SRCS_A:%.s=o/$(MODE)/%.o)			\
 	$(LIBC_STR_A_SRCS_S:%.S=o/$(MODE)/%.o)			\
-	$(LIBC_STR_A_SRCS_C:%.c=o/$(MODE)/%.o)			\
-	$(LIBC_STR_A_SRCS_CC:%.cc=o/$(MODE)/%.o)
+	$(LIBC_STR_A_SRCS_C:%.c=o/$(MODE)/%.o)
 
 LIBC_STR_A_CHECKS =						\
 	$(LIBC_STR_A).pkg					\
@@ -47,6 +44,9 @@ $(LIBC_STR_A).pkg:						\
 		$(LIBC_STR_A_OBJS)				\
 		$(foreach x,$(LIBC_STR_A_DIRECTDEPS),$($(x)_A).pkg)
 
+o/$(MODE)/libc/str/wow.o: private				\
+		CC = gcc
+
 o/$(MODE)/libc/str/wmemset.o					\
 o/$(MODE)/libc/str/memset16.o					\
 o/$(MODE)/libc/str/dosdatetimetounix.o: private			\
@@ -77,7 +77,13 @@ o/$(MODE)/libc/str/iswseparator.o: private			\
 
 # ensure that division is optimized
 o/$(MODE)/libc/str/bcmp.o					\
-o/$(MODE)/libc/str/strcmp.o: private				\
+o/$(MODE)/libc/str/strcmp.o					\
+o/$(MODE)/libc/str/windowsdurationtotimeval.o			\
+o/$(MODE)/libc/str/windowsdurationtotimespec.o			\
+o/$(MODE)/libc/str/timevaltowindowstime.o			\
+o/$(MODE)/libc/str/timespectowindowstime.o			\
+o/$(MODE)/libc/str/windowstimetotimeval.o			\
+o/$(MODE)/libc/str/windowstimetotimespec.o: private		\
 		CFLAGS +=					\
 			-O2
 
diff --git a/libc/str/blake2.c b/libc/str/blake2.c
index f2e10416a..acbeb1b70 100644
--- a/libc/str/blake2.c
+++ b/libc/str/blake2.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/blake2.h"
 #include "libc/assert.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 
 #define ROR(v, n) (((v) >> (n)) | ((v) << (64 - (n))))
diff --git a/third_party/musl/btowc.c b/libc/str/btowc.c
similarity index 86%
rename from third_party/musl/btowc.c
rename to libc/str/btowc.c
index 557146e94..4e3cb74ab 100644
--- a/third_party/musl/btowc.c
+++ b/libc/str/btowc.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,14 +25,13 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdio.h>
-#include <wchar.h>
-#include <stdlib.h>
-#include "multibyte.h"
+#include "libc/limits.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/mb.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-wint_t btowc(int c)
-{
-	int b = (unsigned char)c;
-	return b<128U ? b : (MB_CUR_MAX==1 && c!=EOF) ? CODEUNIT(c) : WEOF;
+wint_t btowc(int c) {
+  int b = (unsigned char)c;
+  return b < 128U ? b : (MB_CUR_MAX == 1 && c != EOF) ? CODEUNIT(c) : WEOF;
 }
diff --git a/third_party/musl/c16rtomb.c b/libc/str/c16rtomb.c
similarity index 76%
rename from third_party/musl/c16rtomb.c
rename to libc/str/c16rtomb.c
index 10fbcfa89..546f40741 100644
--- a/third_party/musl/c16rtomb.c
+++ b/libc/str/c16rtomb.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,41 +25,40 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <uchar.h>
-#include <errno.h>
-#include <wchar.h>
+#include "libc/calls/calls.h"
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/str/mb.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-#pragma GCC diagnostic ignored "-Wparentheses"
-
-size_t c16rtomb(char *restrict s, char16_t c16, mbstate_t *restrict ps)
-{
-	static unsigned internal_state;
-	if (!ps) ps = (void *)&internal_state;
-	unsigned *x = (unsigned *)ps;
-	wchar_t wc;
-
-	if (!s) {
-		if (*x) goto ilseq;
-		return 1;
-	}
-
-	if (!*x && c16 - 0xd800u < 0x400) {
-		*x = c16 - 0xd7c0 << 10;
-		return 0;
-	}
-
-	if (*x) {
-		if (c16 - 0xdc00u >= 0x400) goto ilseq;
-		else wc = *x + c16 - 0xdc00;
-		*x = 0;
-	} else {
-		wc = c16;
-	}
-	return wcrtomb(s, wc, 0);
-
+size_t c16rtomb(char *restrict s, char16_t c16, mbstate_t *restrict ps) {
+  static unsigned internal_state;
+  if (!ps)
+    ps = (void *)&internal_state;
+  unsigned *x = (unsigned *)ps;
+  wchar_t wc;
+  if (!s) {
+    if (*x)
+      goto ilseq;
+    return 1;
+  }
+  if (!*x && c16 - 0xd800u < 0x400) {
+    *x = (c16 - 0xd7c0) << 10;
+    return 0;
+  }
+  if (*x) {
+    if (c16 - 0xdc00u >= 0x400)
+      goto ilseq;
+    else
+      wc = *x + c16 - 0xdc00;
+    *x = 0;
+  } else {
+    wc = c16;
+  }
+  return wcrtomb(s, wc, 0);
 ilseq:
-	*x = 0;
-	errno = EILSEQ;
-	return -1;
+  *x = 0;
+  errno = EILSEQ;
+  return -1;
 }
diff --git a/libc/str/c32rtomb.c b/libc/str/c32rtomb.c
new file mode 100644
index 000000000..4d258deda
--- /dev/null
+++ b/libc/str/c32rtomb.c
@@ -0,0 +1,23 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
+
+size_t c32rtomb(char *s, char32_t c, mbstate_t *t) {
+  return wcrtomb(s, c, t);
+}
diff --git a/libc/str/compareslices.c b/libc/str/compareslices.c
index 0bca09438..b57b902a6 100644
--- a/libc/str/compareslices.c
+++ b/libc/str/compareslices.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/slice.h"
 
 int CompareSlices(const char *a, size_t n, const char *b, size_t m) {
diff --git a/libc/str/compareslicescase.c b/libc/str/compareslicescase.c
index 129d9b22c..a4f881450 100644
--- a/libc/str/compareslicescase.c
+++ b/libc/str/compareslicescase.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/slice.h"
 
 int CompareSlicesCase(const char *a, size_t n, const char *b, size_t m) {
diff --git a/libc/str/dosdatetimetounix.c b/libc/str/dosdatetimetounix.c
index a4cd18109..7cc956fd3 100644
--- a/libc/str/dosdatetimetounix.c
+++ b/libc/str/dosdatetimetounix.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/conv.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/time.h"
 
 /**
diff --git a/libc/str/eastasianwidth.txt b/libc/str/eastasianwidth.txt
index 02df4df47..8e2a738fe 100644
--- a/libc/str/eastasianwidth.txt
+++ b/libc/str/eastasianwidth.txt
@@ -1,11 +1,11 @@
-# EastAsianWidth-15.1.0.txt
-# Date: 2023-07-28, 23:34:08 GMT
-# © 2023 Unicode®, Inc.
+# EastAsianWidth-15.0.0.txt
+# Date: 2022-01-28, 13:07:15 GMT [KW, LI]
+# © 2022 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see https://www.unicode.org/terms_of_use.html
 #
 # Unicode Character Database
-#   For documentation, see https://www.unicode.org/reports/tr44/
+# For documentation, see https://www.unicode.org/reports/tr44/
 #
 # East_Asian_Width Property
 #
@@ -30,2592 +30,2590 @@
 # Character ranges are specified as for other property files in the
 # Unicode Character Database.
 #
-# The comments following the number sign "#" list the General_Category
-# property value or the L& alias of the derived value LC, the Unicode
-# character name or names, and, in lines with ranges of code points,
-# the code point count in square brackets.
+# For legacy reasons, there are no spaces before or after the semicolon
+# which separates the two fields. The comments following the number sign
+# "#" list the General_Category property value or the L& alias of the
+# derived value LC, the Unicode character name or names, and, in lines
+# with ranges of code points, the code point count in square brackets.
 #
 # For more information, see UAX #11: East Asian Width,
 # at https://www.unicode.org/reports/tr11/
 #
 # @missing: 0000..10FFFF; N
-0000..001F     ; N  # Cc    [32] <control-0000>..<control-001F>
-0020           ; Na # Zs         SPACE
-0021..0023     ; Na # Po     [3] EXCLAMATION MARK..NUMBER SIGN
-0024           ; Na # Sc         DOLLAR SIGN
-0025..0027     ; Na # Po     [3] PERCENT SIGN..APOSTROPHE
-0028           ; Na # Ps         LEFT PARENTHESIS
-0029           ; Na # Pe         RIGHT PARENTHESIS
-002A           ; Na # Po         ASTERISK
-002B           ; Na # Sm         PLUS SIGN
-002C           ; Na # Po         COMMA
-002D           ; Na # Pd         HYPHEN-MINUS
-002E..002F     ; Na # Po     [2] FULL STOP..SOLIDUS
-0030..0039     ; Na # Nd    [10] DIGIT ZERO..DIGIT NINE
-003A..003B     ; Na # Po     [2] COLON..SEMICOLON
-003C..003E     ; Na # Sm     [3] LESS-THAN SIGN..GREATER-THAN SIGN
-003F..0040     ; Na # Po     [2] QUESTION MARK..COMMERCIAL AT
-0041..005A     ; Na # Lu    [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
-005B           ; Na # Ps         LEFT SQUARE BRACKET
-005C           ; Na # Po         REVERSE SOLIDUS
-005D           ; Na # Pe         RIGHT SQUARE BRACKET
-005E           ; Na # Sk         CIRCUMFLEX ACCENT
-005F           ; Na # Pc         LOW LINE
-0060           ; Na # Sk         GRAVE ACCENT
-0061..007A     ; Na # Ll    [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
-007B           ; Na # Ps         LEFT CURLY BRACKET
-007C           ; Na # Sm         VERTICAL LINE
-007D           ; Na # Pe         RIGHT CURLY BRACKET
-007E           ; Na # Sm         TILDE
-007F           ; N  # Cc         <control-007F>
-0080..009F     ; N  # Cc    [32] <control-0080>..<control-009F>
-00A0           ; N  # Zs         NO-BREAK SPACE
-00A1           ; A  # Po         INVERTED EXCLAMATION MARK
-00A2..00A3     ; Na # Sc     [2] CENT SIGN..POUND SIGN
-00A4           ; A  # Sc         CURRENCY SIGN
-00A5           ; Na # Sc         YEN SIGN
-00A6           ; Na # So         BROKEN BAR
-00A7           ; A  # Po         SECTION SIGN
-00A8           ; A  # Sk         DIAERESIS
-00A9           ; N  # So         COPYRIGHT SIGN
-00AA           ; A  # Lo         FEMININE ORDINAL INDICATOR
-00AB           ; N  # Pi         LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
-00AC           ; Na # Sm         NOT SIGN
-00AD           ; A  # Cf         SOFT HYPHEN
-00AE           ; A  # So         REGISTERED SIGN
-00AF           ; Na # Sk         MACRON
-00B0           ; A  # So         DEGREE SIGN
-00B1           ; A  # Sm         PLUS-MINUS SIGN
-00B2..00B3     ; A  # No     [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE
-00B4           ; A  # Sk         ACUTE ACCENT
-00B5           ; N  # Ll         MICRO SIGN
-00B6..00B7     ; A  # Po     [2] PILCROW SIGN..MIDDLE DOT
-00B8           ; A  # Sk         CEDILLA
-00B9           ; A  # No         SUPERSCRIPT ONE
-00BA           ; A  # Lo         MASCULINE ORDINAL INDICATOR
-00BB           ; N  # Pf         RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
-00BC..00BE     ; A  # No     [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS
-00BF           ; A  # Po         INVERTED QUESTION MARK
-00C0..00C5     ; N  # Lu     [6] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER A WITH RING ABOVE
-00C6           ; A  # Lu         LATIN CAPITAL LETTER AE
-00C7..00CF     ; N  # Lu     [9] LATIN CAPITAL LETTER C WITH CEDILLA..LATIN CAPITAL LETTER I WITH DIAERESIS
-00D0           ; A  # Lu         LATIN CAPITAL LETTER ETH
-00D1..00D6     ; N  # Lu     [6] LATIN CAPITAL LETTER N WITH TILDE..LATIN CAPITAL LETTER O WITH DIAERESIS
-00D7           ; A  # Sm         MULTIPLICATION SIGN
-00D8           ; A  # Lu         LATIN CAPITAL LETTER O WITH STROKE
-00D9..00DD     ; N  # Lu     [5] LATIN CAPITAL LETTER U WITH GRAVE..LATIN CAPITAL LETTER Y WITH ACUTE
-00DE..00E1     ; A  # L&     [4] LATIN CAPITAL LETTER THORN..LATIN SMALL LETTER A WITH ACUTE
-00E2..00E5     ; N  # Ll     [4] LATIN SMALL LETTER A WITH CIRCUMFLEX..LATIN SMALL LETTER A WITH RING ABOVE
-00E6           ; A  # Ll         LATIN SMALL LETTER AE
-00E7           ; N  # Ll         LATIN SMALL LETTER C WITH CEDILLA
-00E8..00EA     ; A  # Ll     [3] LATIN SMALL LETTER E WITH GRAVE..LATIN SMALL LETTER E WITH CIRCUMFLEX
-00EB           ; N  # Ll         LATIN SMALL LETTER E WITH DIAERESIS
-00EC..00ED     ; A  # Ll     [2] LATIN SMALL LETTER I WITH GRAVE..LATIN SMALL LETTER I WITH ACUTE
-00EE..00EF     ; N  # Ll     [2] LATIN SMALL LETTER I WITH CIRCUMFLEX..LATIN SMALL LETTER I WITH DIAERESIS
-00F0           ; A  # Ll         LATIN SMALL LETTER ETH
-00F1           ; N  # Ll         LATIN SMALL LETTER N WITH TILDE
-00F2..00F3     ; A  # Ll     [2] LATIN SMALL LETTER O WITH GRAVE..LATIN SMALL LETTER O WITH ACUTE
-00F4..00F6     ; N  # Ll     [3] LATIN SMALL LETTER O WITH CIRCUMFLEX..LATIN SMALL LETTER O WITH DIAERESIS
-00F7           ; A  # Sm         DIVISION SIGN
-00F8..00FA     ; A  # Ll     [3] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER U WITH ACUTE
-00FB           ; N  # Ll         LATIN SMALL LETTER U WITH CIRCUMFLEX
-00FC           ; A  # Ll         LATIN SMALL LETTER U WITH DIAERESIS
-00FD           ; N  # Ll         LATIN SMALL LETTER Y WITH ACUTE
-00FE           ; A  # Ll         LATIN SMALL LETTER THORN
-00FF           ; N  # Ll         LATIN SMALL LETTER Y WITH DIAERESIS
-0100           ; N  # Lu         LATIN CAPITAL LETTER A WITH MACRON
-0101           ; A  # Ll         LATIN SMALL LETTER A WITH MACRON
-0102..0110     ; N  # L&    [15] LATIN CAPITAL LETTER A WITH BREVE..LATIN CAPITAL LETTER D WITH STROKE
-0111           ; A  # Ll         LATIN SMALL LETTER D WITH STROKE
-0112           ; N  # Lu         LATIN CAPITAL LETTER E WITH MACRON
-0113           ; A  # Ll         LATIN SMALL LETTER E WITH MACRON
-0114..011A     ; N  # L&     [7] LATIN CAPITAL LETTER E WITH BREVE..LATIN CAPITAL LETTER E WITH CARON
-011B           ; A  # Ll         LATIN SMALL LETTER E WITH CARON
-011C..0125     ; N  # L&    [10] LATIN CAPITAL LETTER G WITH CIRCUMFLEX..LATIN SMALL LETTER H WITH CIRCUMFLEX
-0126..0127     ; A  # L&     [2] LATIN CAPITAL LETTER H WITH STROKE..LATIN SMALL LETTER H WITH STROKE
-0128..012A     ; N  # L&     [3] LATIN CAPITAL LETTER I WITH TILDE..LATIN CAPITAL LETTER I WITH MACRON
-012B           ; A  # Ll         LATIN SMALL LETTER I WITH MACRON
-012C..0130     ; N  # L&     [5] LATIN CAPITAL LETTER I WITH BREVE..LATIN CAPITAL LETTER I WITH DOT ABOVE
-0131..0133     ; A  # L&     [3] LATIN SMALL LETTER DOTLESS I..LATIN SMALL LIGATURE IJ
-0134..0137     ; N  # L&     [4] LATIN CAPITAL LETTER J WITH CIRCUMFLEX..LATIN SMALL LETTER K WITH CEDILLA
-0138           ; A  # Ll         LATIN SMALL LETTER KRA
-0139..013E     ; N  # L&     [6] LATIN CAPITAL LETTER L WITH ACUTE..LATIN SMALL LETTER L WITH CARON
-013F..0142     ; A  # L&     [4] LATIN CAPITAL LETTER L WITH MIDDLE DOT..LATIN SMALL LETTER L WITH STROKE
-0143           ; N  # Lu         LATIN CAPITAL LETTER N WITH ACUTE
-0144           ; A  # Ll         LATIN SMALL LETTER N WITH ACUTE
-0145..0147     ; N  # L&     [3] LATIN CAPITAL LETTER N WITH CEDILLA..LATIN CAPITAL LETTER N WITH CARON
-0148..014B     ; A  # L&     [4] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER ENG
-014C           ; N  # Lu         LATIN CAPITAL LETTER O WITH MACRON
-014D           ; A  # Ll         LATIN SMALL LETTER O WITH MACRON
-014E..0151     ; N  # L&     [4] LATIN CAPITAL LETTER O WITH BREVE..LATIN SMALL LETTER O WITH DOUBLE ACUTE
-0152..0153     ; A  # L&     [2] LATIN CAPITAL LIGATURE OE..LATIN SMALL LIGATURE OE
-0154..0165     ; N  # L&    [18] LATIN CAPITAL LETTER R WITH ACUTE..LATIN SMALL LETTER T WITH CARON
-0166..0167     ; A  # L&     [2] LATIN CAPITAL LETTER T WITH STROKE..LATIN SMALL LETTER T WITH STROKE
-0168..016A     ; N  # L&     [3] LATIN CAPITAL LETTER U WITH TILDE..LATIN CAPITAL LETTER U WITH MACRON
-016B           ; A  # Ll         LATIN SMALL LETTER U WITH MACRON
-016C..017F     ; N  # L&    [20] LATIN CAPITAL LETTER U WITH BREVE..LATIN SMALL LETTER LONG S
-0180..01BA     ; N  # L&    [59] LATIN SMALL LETTER B WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL
-01BB           ; N  # Lo         LATIN LETTER TWO WITH STROKE
-01BC..01BF     ; N  # L&     [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN
-01C0..01C3     ; N  # Lo     [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK
-01C4..01CD     ; N  # L&    [10] LATIN CAPITAL LETTER DZ WITH CARON..LATIN CAPITAL LETTER A WITH CARON
-01CE           ; A  # Ll         LATIN SMALL LETTER A WITH CARON
-01CF           ; N  # Lu         LATIN CAPITAL LETTER I WITH CARON
-01D0           ; A  # Ll         LATIN SMALL LETTER I WITH CARON
-01D1           ; N  # Lu         LATIN CAPITAL LETTER O WITH CARON
-01D2           ; A  # Ll         LATIN SMALL LETTER O WITH CARON
-01D3           ; N  # Lu         LATIN CAPITAL LETTER U WITH CARON
-01D4           ; A  # Ll         LATIN SMALL LETTER U WITH CARON
-01D5           ; N  # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
-01D6           ; A  # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
-01D7           ; N  # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
-01D8           ; A  # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE
-01D9           ; N  # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
-01DA           ; A  # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND CARON
-01DB           ; N  # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
-01DC           ; A  # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE
-01DD..024F     ; N  # L&   [115] LATIN SMALL LETTER TURNED E..LATIN SMALL LETTER Y WITH STROKE
-0250           ; N  # Ll         LATIN SMALL LETTER TURNED A
-0251           ; A  # Ll         LATIN SMALL LETTER ALPHA
-0252..0260     ; N  # Ll    [15] LATIN SMALL LETTER TURNED ALPHA..LATIN SMALL LETTER G WITH HOOK
-0261           ; A  # Ll         LATIN SMALL LETTER SCRIPT G
-0262..0293     ; N  # Ll    [50] LATIN LETTER SMALL CAPITAL G..LATIN SMALL LETTER EZH WITH CURL
-0294           ; N  # Lo         LATIN LETTER GLOTTAL STOP
-0295..02AF     ; N  # Ll    [27] LATIN LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
-02B0..02C1     ; N  # Lm    [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP
-02C2..02C3     ; N  # Sk     [2] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER RIGHT ARROWHEAD
-02C4           ; A  # Sk         MODIFIER LETTER UP ARROWHEAD
-02C5           ; N  # Sk         MODIFIER LETTER DOWN ARROWHEAD
-02C6           ; N  # Lm         MODIFIER LETTER CIRCUMFLEX ACCENT
-02C7           ; A  # Lm         CARON
-02C8           ; N  # Lm         MODIFIER LETTER VERTICAL LINE
-02C9..02CB     ; A  # Lm     [3] MODIFIER LETTER MACRON..MODIFIER LETTER GRAVE ACCENT
-02CC           ; N  # Lm         MODIFIER LETTER LOW VERTICAL LINE
-02CD           ; A  # Lm         MODIFIER LETTER LOW MACRON
-02CE..02CF     ; N  # Lm     [2] MODIFIER LETTER LOW GRAVE ACCENT..MODIFIER LETTER LOW ACUTE ACCENT
-02D0           ; A  # Lm         MODIFIER LETTER TRIANGULAR COLON
-02D1           ; N  # Lm         MODIFIER LETTER HALF TRIANGULAR COLON
-02D2..02D7     ; N  # Sk     [6] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER MINUS SIGN
-02D8..02DB     ; A  # Sk     [4] BREVE..OGONEK
-02DC           ; N  # Sk         SMALL TILDE
-02DD           ; A  # Sk         DOUBLE ACUTE ACCENT
-02DE           ; N  # Sk         MODIFIER LETTER RHOTIC HOOK
-02DF           ; A  # Sk         MODIFIER LETTER CROSS ACCENT
-02E0..02E4     ; N  # Lm     [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
-02E5..02EB     ; N  # Sk     [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK
-02EC           ; N  # Lm         MODIFIER LETTER VOICING
-02ED           ; N  # Sk         MODIFIER LETTER UNASPIRATED
-02EE           ; N  # Lm         MODIFIER LETTER DOUBLE APOSTROPHE
-02EF..02FF     ; N  # Sk    [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW
-0300..036F     ; A  # Mn   [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X
-0370..0373     ; N  # L&     [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI
-0374           ; N  # Lm         GREEK NUMERAL SIGN
-0375           ; N  # Sk         GREEK LOWER NUMERAL SIGN
-0376..0377     ; N  # L&     [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
-037A           ; N  # Lm         GREEK YPOGEGRAMMENI
-037B..037D     ; N  # Ll     [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
-037E           ; N  # Po         GREEK QUESTION MARK
-037F           ; N  # Lu         GREEK CAPITAL LETTER YOT
-0384..0385     ; N  # Sk     [2] GREEK TONOS..GREEK DIALYTIKA TONOS
-0386           ; N  # Lu         GREEK CAPITAL LETTER ALPHA WITH TONOS
-0387           ; N  # Po         GREEK ANO TELEIA
-0388..038A     ; N  # Lu     [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS
-038C           ; N  # Lu         GREEK CAPITAL LETTER OMICRON WITH TONOS
-038E..0390     ; N  # L&     [3] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
-0391..03A1     ; A  # Lu    [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO
-03A3..03A9     ; A  # Lu     [7] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER OMEGA
-03AA..03B0     ; N  # L&     [7] GREEK CAPITAL LETTER IOTA WITH DIALYTIKA..GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-03B1..03C1     ; A  # Ll    [17] GREEK SMALL LETTER ALPHA..GREEK SMALL LETTER RHO
-03C2           ; N  # Ll         GREEK SMALL LETTER FINAL SIGMA
-03C3..03C9     ; A  # Ll     [7] GREEK SMALL LETTER SIGMA..GREEK SMALL LETTER OMEGA
-03CA..03F5     ; N  # L&    [44] GREEK SMALL LETTER IOTA WITH DIALYTIKA..GREEK LUNATE EPSILON SYMBOL
-03F6           ; N  # Sm         GREEK REVERSED LUNATE EPSILON SYMBOL
-03F7..03FF     ; N  # L&     [9] GREEK CAPITAL LETTER SHO..GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
-0400           ; N  # Lu         CYRILLIC CAPITAL LETTER IE WITH GRAVE
-0401           ; A  # Lu         CYRILLIC CAPITAL LETTER IO
-0402..040F     ; N  # Lu    [14] CYRILLIC CAPITAL LETTER DJE..CYRILLIC CAPITAL LETTER DZHE
-0410..044F     ; A  # L&    [64] CYRILLIC CAPITAL LETTER A..CYRILLIC SMALL LETTER YA
-0450           ; N  # Ll         CYRILLIC SMALL LETTER IE WITH GRAVE
-0451           ; A  # Ll         CYRILLIC SMALL LETTER IO
-0452..0481     ; N  # L&    [48] CYRILLIC SMALL LETTER DJE..CYRILLIC SMALL LETTER KOPPA
-0482           ; N  # So         CYRILLIC THOUSANDS SIGN
-0483..0487     ; N  # Mn     [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE
-0488..0489     ; N  # Me     [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
-048A..04FF     ; N  # L&   [118] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER HA WITH STROKE
-0500..052F     ; N  # L&    [48] CYRILLIC CAPITAL LETTER KOMI DE..CYRILLIC SMALL LETTER EL WITH DESCENDER
-0531..0556     ; N  # Lu    [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH
-0559           ; N  # Lm         ARMENIAN MODIFIER LETTER LEFT HALF RING
-055A..055F     ; N  # Po     [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK
-0560..0588     ; N  # Ll    [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
-0589           ; N  # Po         ARMENIAN FULL STOP
-058A           ; N  # Pd         ARMENIAN HYPHEN
-058D..058E     ; N  # So     [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN
-058F           ; N  # Sc         ARMENIAN DRAM SIGN
-0591..05BD     ; N  # Mn    [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
-05BE           ; N  # Pd         HEBREW PUNCTUATION MAQAF
-05BF           ; N  # Mn         HEBREW POINT RAFE
-05C0           ; N  # Po         HEBREW PUNCTUATION PASEQ
-05C1..05C2     ; N  # Mn     [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT
-05C3           ; N  # Po         HEBREW PUNCTUATION SOF PASUQ
-05C4..05C5     ; N  # Mn     [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
-05C6           ; N  # Po         HEBREW PUNCTUATION NUN HAFUKHA
-05C7           ; N  # Mn         HEBREW POINT QAMATS QATAN
-05D0..05EA     ; N  # Lo    [27] HEBREW LETTER ALEF..HEBREW LETTER TAV
-05EF..05F2     ; N  # Lo     [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD
-05F3..05F4     ; N  # Po     [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM
-0600..0605     ; N  # Cf     [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
-0606..0608     ; N  # Sm     [3] ARABIC-INDIC CUBE ROOT..ARABIC RAY
-0609..060A     ; N  # Po     [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN
-060B           ; N  # Sc         AFGHANI SIGN
-060C..060D     ; N  # Po     [2] ARABIC COMMA..ARABIC DATE SEPARATOR
-060E..060F     ; N  # So     [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA
-0610..061A     ; N  # Mn    [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
-061B           ; N  # Po         ARABIC SEMICOLON
-061C           ; N  # Cf         ARABIC LETTER MARK
-061D..061F     ; N  # Po     [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK
-0620..063F     ; N  # Lo    [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
-0640           ; N  # Lm         ARABIC TATWEEL
-0641..064A     ; N  # Lo    [10] ARABIC LETTER FEH..ARABIC LETTER YEH
-064B..065F     ; N  # Mn    [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW
-0660..0669     ; N  # Nd    [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
-066A..066D     ; N  # Po     [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR
-066E..066F     ; N  # Lo     [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF
-0670           ; N  # Mn         ARABIC LETTER SUPERSCRIPT ALEF
-0671..06D3     ; N  # Lo    [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
-06D4           ; N  # Po         ARABIC FULL STOP
-06D5           ; N  # Lo         ARABIC LETTER AE
-06D6..06DC     ; N  # Mn     [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN
-06DD           ; N  # Cf         ARABIC END OF AYAH
-06DE           ; N  # So         ARABIC START OF RUB EL HIZB
-06DF..06E4     ; N  # Mn     [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA
-06E5..06E6     ; N  # Lm     [2] ARABIC SMALL WAW..ARABIC SMALL YEH
-06E7..06E8     ; N  # Mn     [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON
-06E9           ; N  # So         ARABIC PLACE OF SAJDAH
-06EA..06ED     ; N  # Mn     [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM
-06EE..06EF     ; N  # Lo     [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V
-06F0..06F9     ; N  # Nd    [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE
-06FA..06FC     ; N  # Lo     [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW
-06FD..06FE     ; N  # So     [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN
-06FF           ; N  # Lo         ARABIC LETTER HEH WITH INVERTED V
-0700..070D     ; N  # Po    [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS
-070F           ; N  # Cf         SYRIAC ABBREVIATION MARK
-0710           ; N  # Lo         SYRIAC LETTER ALAPH
-0711           ; N  # Mn         SYRIAC LETTER SUPERSCRIPT ALAPH
-0712..072F     ; N  # Lo    [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH
-0730..074A     ; N  # Mn    [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH
-074D..074F     ; N  # Lo     [3] SYRIAC LETTER SOGDIAN ZHAIN..SYRIAC LETTER SOGDIAN FE
-0750..077F     ; N  # Lo    [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
-0780..07A5     ; N  # Lo    [38] THAANA LETTER HAA..THAANA LETTER WAAVU
-07A6..07B0     ; N  # Mn    [11] THAANA ABAFILI..THAANA SUKUN
-07B1           ; N  # Lo         THAANA LETTER NAA
-07C0..07C9     ; N  # Nd    [10] NKO DIGIT ZERO..NKO DIGIT NINE
-07CA..07EA     ; N  # Lo    [33] NKO LETTER A..NKO LETTER JONA RA
-07EB..07F3     ; N  # Mn     [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE
-07F4..07F5     ; N  # Lm     [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE
-07F6           ; N  # So         NKO SYMBOL OO DENNEN
-07F7..07F9     ; N  # Po     [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK
-07FA           ; N  # Lm         NKO LAJANYALAN
-07FD           ; N  # Mn         NKO DANTAYALAN
-07FE..07FF     ; N  # Sc     [2] NKO DOROME SIGN..NKO TAMAN SIGN
-0800..0815     ; N  # Lo    [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
-0816..0819     ; N  # Mn     [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH
-081A           ; N  # Lm         SAMARITAN MODIFIER LETTER EPENTHETIC YUT
-081B..0823     ; N  # Mn     [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
-0824           ; N  # Lm         SAMARITAN MODIFIER LETTER SHORT A
-0825..0827     ; N  # Mn     [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
-0828           ; N  # Lm         SAMARITAN MODIFIER LETTER I
-0829..082D     ; N  # Mn     [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
-0830..083E     ; N  # Po    [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU
-0840..0858     ; N  # Lo    [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
-0859..085B     ; N  # Mn     [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
-085E           ; N  # Po         MANDAIC PUNCTUATION
-0860..086A     ; N  # Lo    [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
-0870..0887     ; N  # Lo    [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
-0888           ; N  # Sk         ARABIC RAISED ROUND DOT
-0889..088E     ; N  # Lo     [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
-0890..0891     ; N  # Cf     [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
-0898..089F     ; N  # Mn     [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
-08A0..08C8     ; N  # Lo    [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
-08C9           ; N  # Lm         ARABIC SMALL FARSI YEH
-08CA..08E1     ; N  # Mn    [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
-08E2           ; N  # Cf         ARABIC DISPUTED END OF AYAH
-08E3..08FF     ; N  # Mn    [29] ARABIC TURNED DAMMA BELOW..ARABIC MARK SIDEWAYS NOON GHUNNA
-0900..0902     ; N  # Mn     [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA
-0903           ; N  # Mc         DEVANAGARI SIGN VISARGA
-0904..0939     ; N  # Lo    [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
-093A           ; N  # Mn         DEVANAGARI VOWEL SIGN OE
-093B           ; N  # Mc         DEVANAGARI VOWEL SIGN OOE
-093C           ; N  # Mn         DEVANAGARI SIGN NUKTA
-093D           ; N  # Lo         DEVANAGARI SIGN AVAGRAHA
-093E..0940     ; N  # Mc     [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II
-0941..0948     ; N  # Mn     [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI
-0949..094C     ; N  # Mc     [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU
-094D           ; N  # Mn         DEVANAGARI SIGN VIRAMA
-094E..094F     ; N  # Mc     [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW
-0950           ; N  # Lo         DEVANAGARI OM
-0951..0957     ; N  # Mn     [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE
-0958..0961     ; N  # Lo    [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL
-0962..0963     ; N  # Mn     [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL
-0964..0965     ; N  # Po     [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
-0966..096F     ; N  # Nd    [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
-0970           ; N  # Po         DEVANAGARI ABBREVIATION SIGN
-0971           ; N  # Lm         DEVANAGARI SIGN HIGH SPACING DOT
-0972..097F     ; N  # Lo    [14] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER BBA
-0980           ; N  # Lo         BENGALI ANJI
-0981           ; N  # Mn         BENGALI SIGN CANDRABINDU
-0982..0983     ; N  # Mc     [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA
-0985..098C     ; N  # Lo     [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
-098F..0990     ; N  # Lo     [2] BENGALI LETTER E..BENGALI LETTER AI
-0993..09A8     ; N  # Lo    [22] BENGALI LETTER O..BENGALI LETTER NA
-09AA..09B0     ; N  # Lo     [7] BENGALI LETTER PA..BENGALI LETTER RA
-09B2           ; N  # Lo         BENGALI LETTER LA
-09B6..09B9     ; N  # Lo     [4] BENGALI LETTER SHA..BENGALI LETTER HA
-09BC           ; N  # Mn         BENGALI SIGN NUKTA
-09BD           ; N  # Lo         BENGALI SIGN AVAGRAHA
-09BE..09C0     ; N  # Mc     [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II
-09C1..09C4     ; N  # Mn     [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR
-09C7..09C8     ; N  # Mc     [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI
-09CB..09CC     ; N  # Mc     [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU
-09CD           ; N  # Mn         BENGALI SIGN VIRAMA
-09CE           ; N  # Lo         BENGALI LETTER KHANDA TA
-09D7           ; N  # Mc         BENGALI AU LENGTH MARK
-09DC..09DD     ; N  # Lo     [2] BENGALI LETTER RRA..BENGALI LETTER RHA
-09DF..09E1     ; N  # Lo     [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL
-09E2..09E3     ; N  # Mn     [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL
-09E6..09EF     ; N  # Nd    [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
-09F0..09F1     ; N  # Lo     [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL
-09F2..09F3     ; N  # Sc     [2] BENGALI RUPEE MARK..BENGALI RUPEE SIGN
-09F4..09F9     ; N  # No     [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN
-09FA           ; N  # So         BENGALI ISSHAR
-09FB           ; N  # Sc         BENGALI GANDA MARK
-09FC           ; N  # Lo         BENGALI LETTER VEDIC ANUSVARA
-09FD           ; N  # Po         BENGALI ABBREVIATION SIGN
-09FE           ; N  # Mn         BENGALI SANDHI MARK
-0A01..0A02     ; N  # Mn     [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI
-0A03           ; N  # Mc         GURMUKHI SIGN VISARGA
-0A05..0A0A     ; N  # Lo     [6] GURMUKHI LETTER A..GURMUKHI LETTER UU
-0A0F..0A10     ; N  # Lo     [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI
-0A13..0A28     ; N  # Lo    [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA
-0A2A..0A30     ; N  # Lo     [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA
-0A32..0A33     ; N  # Lo     [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA
-0A35..0A36     ; N  # Lo     [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA
-0A38..0A39     ; N  # Lo     [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA
-0A3C           ; N  # Mn         GURMUKHI SIGN NUKTA
-0A3E..0A40     ; N  # Mc     [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II
-0A41..0A42     ; N  # Mn     [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU
-0A47..0A48     ; N  # Mn     [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI
-0A4B..0A4D     ; N  # Mn     [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA
-0A51           ; N  # Mn         GURMUKHI SIGN UDAAT
-0A59..0A5C     ; N  # Lo     [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA
-0A5E           ; N  # Lo         GURMUKHI LETTER FA
-0A66..0A6F     ; N  # Nd    [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
-0A70..0A71     ; N  # Mn     [2] GURMUKHI TIPPI..GURMUKHI ADDAK
-0A72..0A74     ; N  # Lo     [3] GURMUKHI IRI..GURMUKHI EK ONKAR
-0A75           ; N  # Mn         GURMUKHI SIGN YAKASH
-0A76           ; N  # Po         GURMUKHI ABBREVIATION SIGN
-0A81..0A82     ; N  # Mn     [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA
-0A83           ; N  # Mc         GUJARATI SIGN VISARGA
-0A85..0A8D     ; N  # Lo     [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E
-0A8F..0A91     ; N  # Lo     [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O
-0A93..0AA8     ; N  # Lo    [22] GUJARATI LETTER O..GUJARATI LETTER NA
-0AAA..0AB0     ; N  # Lo     [7] GUJARATI LETTER PA..GUJARATI LETTER RA
-0AB2..0AB3     ; N  # Lo     [2] GUJARATI LETTER LA..GUJARATI LETTER LLA
-0AB5..0AB9     ; N  # Lo     [5] GUJARATI LETTER VA..GUJARATI LETTER HA
-0ABC           ; N  # Mn         GUJARATI SIGN NUKTA
-0ABD           ; N  # Lo         GUJARATI SIGN AVAGRAHA
-0ABE..0AC0     ; N  # Mc     [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II
-0AC1..0AC5     ; N  # Mn     [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E
-0AC7..0AC8     ; N  # Mn     [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI
-0AC9           ; N  # Mc         GUJARATI VOWEL SIGN CANDRA O
-0ACB..0ACC     ; N  # Mc     [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU
-0ACD           ; N  # Mn         GUJARATI SIGN VIRAMA
-0AD0           ; N  # Lo         GUJARATI OM
-0AE0..0AE1     ; N  # Lo     [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL
-0AE2..0AE3     ; N  # Mn     [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL
-0AE6..0AEF     ; N  # Nd    [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
-0AF0           ; N  # Po         GUJARATI ABBREVIATION SIGN
-0AF1           ; N  # Sc         GUJARATI RUPEE SIGN
-0AF9           ; N  # Lo         GUJARATI LETTER ZHA
-0AFA..0AFF     ; N  # Mn     [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
-0B01           ; N  # Mn         ORIYA SIGN CANDRABINDU
-0B02..0B03     ; N  # Mc     [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA
-0B05..0B0C     ; N  # Lo     [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L
-0B0F..0B10     ; N  # Lo     [2] ORIYA LETTER E..ORIYA LETTER AI
-0B13..0B28     ; N  # Lo    [22] ORIYA LETTER O..ORIYA LETTER NA
-0B2A..0B30     ; N  # Lo     [7] ORIYA LETTER PA..ORIYA LETTER RA
-0B32..0B33     ; N  # Lo     [2] ORIYA LETTER LA..ORIYA LETTER LLA
-0B35..0B39     ; N  # Lo     [5] ORIYA LETTER VA..ORIYA LETTER HA
-0B3C           ; N  # Mn         ORIYA SIGN NUKTA
-0B3D           ; N  # Lo         ORIYA SIGN AVAGRAHA
-0B3E           ; N  # Mc         ORIYA VOWEL SIGN AA
-0B3F           ; N  # Mn         ORIYA VOWEL SIGN I
-0B40           ; N  # Mc         ORIYA VOWEL SIGN II
-0B41..0B44     ; N  # Mn     [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR
-0B47..0B48     ; N  # Mc     [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI
-0B4B..0B4C     ; N  # Mc     [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU
-0B4D           ; N  # Mn         ORIYA SIGN VIRAMA
-0B55..0B56     ; N  # Mn     [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK
-0B57           ; N  # Mc         ORIYA AU LENGTH MARK
-0B5C..0B5D     ; N  # Lo     [2] ORIYA LETTER RRA..ORIYA LETTER RHA
-0B5F..0B61     ; N  # Lo     [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL
-0B62..0B63     ; N  # Mn     [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL
-0B66..0B6F     ; N  # Nd    [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE
-0B70           ; N  # So         ORIYA ISSHAR
-0B71           ; N  # Lo         ORIYA LETTER WA
-0B72..0B77     ; N  # No     [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS
-0B82           ; N  # Mn         TAMIL SIGN ANUSVARA
-0B83           ; N  # Lo         TAMIL SIGN VISARGA
-0B85..0B8A     ; N  # Lo     [6] TAMIL LETTER A..TAMIL LETTER UU
-0B8E..0B90     ; N  # Lo     [3] TAMIL LETTER E..TAMIL LETTER AI
-0B92..0B95     ; N  # Lo     [4] TAMIL LETTER O..TAMIL LETTER KA
-0B99..0B9A     ; N  # Lo     [2] TAMIL LETTER NGA..TAMIL LETTER CA
-0B9C           ; N  # Lo         TAMIL LETTER JA
-0B9E..0B9F     ; N  # Lo     [2] TAMIL LETTER NYA..TAMIL LETTER TTA
-0BA3..0BA4     ; N  # Lo     [2] TAMIL LETTER NNA..TAMIL LETTER TA
-0BA8..0BAA     ; N  # Lo     [3] TAMIL LETTER NA..TAMIL LETTER PA
-0BAE..0BB9     ; N  # Lo    [12] TAMIL LETTER MA..TAMIL LETTER HA
-0BBE..0BBF     ; N  # Mc     [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I
-0BC0           ; N  # Mn         TAMIL VOWEL SIGN II
-0BC1..0BC2     ; N  # Mc     [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU
-0BC6..0BC8     ; N  # Mc     [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI
-0BCA..0BCC     ; N  # Mc     [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU
-0BCD           ; N  # Mn         TAMIL SIGN VIRAMA
-0BD0           ; N  # Lo         TAMIL OM
-0BD7           ; N  # Mc         TAMIL AU LENGTH MARK
-0BE6..0BEF     ; N  # Nd    [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
-0BF0..0BF2     ; N  # No     [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
-0BF3..0BF8     ; N  # So     [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN
-0BF9           ; N  # Sc         TAMIL RUPEE SIGN
-0BFA           ; N  # So         TAMIL NUMBER SIGN
-0C00           ; N  # Mn         TELUGU SIGN COMBINING CANDRABINDU ABOVE
-0C01..0C03     ; N  # Mc     [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
-0C04           ; N  # Mn         TELUGU SIGN COMBINING ANUSVARA ABOVE
-0C05..0C0C     ; N  # Lo     [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
-0C0E..0C10     ; N  # Lo     [3] TELUGU LETTER E..TELUGU LETTER AI
-0C12..0C28     ; N  # Lo    [23] TELUGU LETTER O..TELUGU LETTER NA
-0C2A..0C39     ; N  # Lo    [16] TELUGU LETTER PA..TELUGU LETTER HA
-0C3C           ; N  # Mn         TELUGU SIGN NUKTA
-0C3D           ; N  # Lo         TELUGU SIGN AVAGRAHA
-0C3E..0C40     ; N  # Mn     [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
-0C41..0C44     ; N  # Mc     [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
-0C46..0C48     ; N  # Mn     [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
-0C4A..0C4D     ; N  # Mn     [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
-0C55..0C56     ; N  # Mn     [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
-0C58..0C5A     ; N  # Lo     [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
-0C5D           ; N  # Lo         TELUGU LETTER NAKAARA POLLU
-0C60..0C61     ; N  # Lo     [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
-0C62..0C63     ; N  # Mn     [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL
-0C66..0C6F     ; N  # Nd    [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE
-0C77           ; N  # Po         TELUGU SIGN SIDDHAM
-0C78..0C7E     ; N  # No     [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR
-0C7F           ; N  # So         TELUGU SIGN TUUMU
-0C80           ; N  # Lo         KANNADA SIGN SPACING CANDRABINDU
-0C81           ; N  # Mn         KANNADA SIGN CANDRABINDU
-0C82..0C83     ; N  # Mc     [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
-0C84           ; N  # Po         KANNADA SIGN SIDDHAM
-0C85..0C8C     ; N  # Lo     [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
-0C8E..0C90     ; N  # Lo     [3] KANNADA LETTER E..KANNADA LETTER AI
-0C92..0CA8     ; N  # Lo    [23] KANNADA LETTER O..KANNADA LETTER NA
-0CAA..0CB3     ; N  # Lo    [10] KANNADA LETTER PA..KANNADA LETTER LLA
-0CB5..0CB9     ; N  # Lo     [5] KANNADA LETTER VA..KANNADA LETTER HA
-0CBC           ; N  # Mn         KANNADA SIGN NUKTA
-0CBD           ; N  # Lo         KANNADA SIGN AVAGRAHA
-0CBE           ; N  # Mc         KANNADA VOWEL SIGN AA
-0CBF           ; N  # Mn         KANNADA VOWEL SIGN I
-0CC0..0CC4     ; N  # Mc     [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR
-0CC6           ; N  # Mn         KANNADA VOWEL SIGN E
-0CC7..0CC8     ; N  # Mc     [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI
-0CCA..0CCB     ; N  # Mc     [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO
-0CCC..0CCD     ; N  # Mn     [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
-0CD5..0CD6     ; N  # Mc     [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
-0CDD..0CDE     ; N  # Lo     [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA
-0CE0..0CE1     ; N  # Lo     [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
-0CE2..0CE3     ; N  # Mn     [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
-0CE6..0CEF     ; N  # Nd    [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
-0CF1..0CF2     ; N  # Lo     [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
-0CF3           ; N  # Mc         KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
-0D00..0D01     ; N  # Mn     [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
-0D02..0D03     ; N  # Mc     [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
-0D04..0D0C     ; N  # Lo     [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
-0D0E..0D10     ; N  # Lo     [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
-0D12..0D3A     ; N  # Lo    [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA
-0D3B..0D3C     ; N  # Mn     [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
-0D3D           ; N  # Lo         MALAYALAM SIGN AVAGRAHA
-0D3E..0D40     ; N  # Mc     [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
-0D41..0D44     ; N  # Mn     [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
-0D46..0D48     ; N  # Mc     [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI
-0D4A..0D4C     ; N  # Mc     [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU
-0D4D           ; N  # Mn         MALAYALAM SIGN VIRAMA
-0D4E           ; N  # Lo         MALAYALAM LETTER DOT REPH
-0D4F           ; N  # So         MALAYALAM SIGN PARA
-0D54..0D56     ; N  # Lo     [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL
-0D57           ; N  # Mc         MALAYALAM AU LENGTH MARK
-0D58..0D5E     ; N  # No     [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH
-0D5F..0D61     ; N  # Lo     [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL
-0D62..0D63     ; N  # Mn     [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL
-0D66..0D6F     ; N  # Nd    [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE
-0D70..0D78     ; N  # No     [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS
-0D79           ; N  # So         MALAYALAM DATE MARK
-0D7A..0D7F     ; N  # Lo     [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
-0D81           ; N  # Mn         SINHALA SIGN CANDRABINDU
-0D82..0D83     ; N  # Mc     [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA
-0D85..0D96     ; N  # Lo    [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA
-0D9A..0DB1     ; N  # Lo    [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA
-0DB3..0DBB     ; N  # Lo     [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA
-0DBD           ; N  # Lo         SINHALA LETTER DANTAJA LAYANNA
-0DC0..0DC6     ; N  # Lo     [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA
-0DCA           ; N  # Mn         SINHALA SIGN AL-LAKUNA
-0DCF..0DD1     ; N  # Mc     [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA
-0DD2..0DD4     ; N  # Mn     [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
-0DD6           ; N  # Mn         SINHALA VOWEL SIGN DIGA PAA-PILLA
-0DD8..0DDF     ; N  # Mc     [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA
-0DE6..0DEF     ; N  # Nd    [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE
-0DF2..0DF3     ; N  # Mc     [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA
-0DF4           ; N  # Po         SINHALA PUNCTUATION KUNDDALIYA
-0E01..0E30     ; N  # Lo    [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A
-0E31           ; N  # Mn         THAI CHARACTER MAI HAN-AKAT
-0E32..0E33     ; N  # Lo     [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM
-0E34..0E3A     ; N  # Mn     [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU
-0E3F           ; N  # Sc         THAI CURRENCY SYMBOL BAHT
-0E40..0E45     ; N  # Lo     [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO
-0E46           ; N  # Lm         THAI CHARACTER MAIYAMOK
-0E47..0E4E     ; N  # Mn     [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
-0E4F           ; N  # Po         THAI CHARACTER FONGMAN
-0E50..0E59     ; N  # Nd    [10] THAI DIGIT ZERO..THAI DIGIT NINE
-0E5A..0E5B     ; N  # Po     [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT
-0E81..0E82     ; N  # Lo     [2] LAO LETTER KO..LAO LETTER KHO SUNG
-0E84           ; N  # Lo         LAO LETTER KHO TAM
-0E86..0E8A     ; N  # Lo     [5] LAO LETTER PALI GHA..LAO LETTER SO TAM
-0E8C..0EA3     ; N  # Lo    [24] LAO LETTER PALI JHA..LAO LETTER LO LING
-0EA5           ; N  # Lo         LAO LETTER LO LOOT
-0EA7..0EB0     ; N  # Lo    [10] LAO LETTER WO..LAO VOWEL SIGN A
-0EB1           ; N  # Mn         LAO VOWEL SIGN MAI KAN
-0EB2..0EB3     ; N  # Lo     [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM
-0EB4..0EBC     ; N  # Mn     [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
-0EBD           ; N  # Lo         LAO SEMIVOWEL SIGN NYO
-0EC0..0EC4     ; N  # Lo     [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
-0EC6           ; N  # Lm         LAO KO LA
-0EC8..0ECE     ; N  # Mn     [7] LAO TONE MAI EK..LAO YAMAKKAN
-0ED0..0ED9     ; N  # Nd    [10] LAO DIGIT ZERO..LAO DIGIT NINE
-0EDC..0EDF     ; N  # Lo     [4] LAO HO NO..LAO LETTER KHMU NYO
-0F00           ; N  # Lo         TIBETAN SYLLABLE OM
-0F01..0F03     ; N  # So     [3] TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
-0F04..0F12     ; N  # Po    [15] TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK RGYA GRAM SHAD
-0F13           ; N  # So         TIBETAN MARK CARET -DZUD RTAGS ME LONG CAN
-0F14           ; N  # Po         TIBETAN MARK GTER TSHEG
-0F15..0F17     ; N  # So     [3] TIBETAN LOGOTYPE SIGN CHAD RTAGS..TIBETAN ASTROLOGICAL SIGN SGRA GCAN -CHAR RTAGS
-0F18..0F19     ; N  # Mn     [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
-0F1A..0F1F     ; N  # So     [6] TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RDEL DKAR RDEL NAG
-0F20..0F29     ; N  # Nd    [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE
-0F2A..0F33     ; N  # No    [10] TIBETAN DIGIT HALF ONE..TIBETAN DIGIT HALF ZERO
-0F34           ; N  # So         TIBETAN MARK BSDUS RTAGS
-0F35           ; N  # Mn         TIBETAN MARK NGAS BZUNG NYI ZLA
-0F36           ; N  # So         TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN
-0F37           ; N  # Mn         TIBETAN MARK NGAS BZUNG SGOR RTAGS
-0F38           ; N  # So         TIBETAN MARK CHE MGO
-0F39           ; N  # Mn         TIBETAN MARK TSA -PHRU
-0F3A           ; N  # Ps         TIBETAN MARK GUG RTAGS GYON
-0F3B           ; N  # Pe         TIBETAN MARK GUG RTAGS GYAS
-0F3C           ; N  # Ps         TIBETAN MARK ANG KHANG GYON
-0F3D           ; N  # Pe         TIBETAN MARK ANG KHANG GYAS
-0F3E..0F3F     ; N  # Mc     [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES
-0F40..0F47     ; N  # Lo     [8] TIBETAN LETTER KA..TIBETAN LETTER JA
-0F49..0F6C     ; N  # Lo    [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA
-0F71..0F7E     ; N  # Mn    [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
-0F7F           ; N  # Mc         TIBETAN SIGN RNAM BCAD
-0F80..0F84     ; N  # Mn     [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA
-0F85           ; N  # Po         TIBETAN MARK PALUTA
-0F86..0F87     ; N  # Mn     [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
-0F88..0F8C     ; N  # Lo     [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN
-0F8D..0F97     ; N  # Mn    [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
-0F99..0FBC     ; N  # Mn    [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
-0FBE..0FC5     ; N  # So     [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE
-0FC6           ; N  # Mn         TIBETAN SYMBOL PADMA GDAN
-0FC7..0FCC     ; N  # So     [6] TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL
-0FCE..0FCF     ; N  # So     [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM
-0FD0..0FD4     ; N  # Po     [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA
-0FD5..0FD8     ; N  # So     [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS
-0FD9..0FDA     ; N  # Po     [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS
-1000..102A     ; N  # Lo    [43] MYANMAR LETTER KA..MYANMAR LETTER AU
-102B..102C     ; N  # Mc     [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA
-102D..1030     ; N  # Mn     [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU
-1031           ; N  # Mc         MYANMAR VOWEL SIGN E
-1032..1037     ; N  # Mn     [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW
-1038           ; N  # Mc         MYANMAR SIGN VISARGA
-1039..103A     ; N  # Mn     [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT
-103B..103C     ; N  # Mc     [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA
-103D..103E     ; N  # Mn     [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA
-103F           ; N  # Lo         MYANMAR LETTER GREAT SA
-1040..1049     ; N  # Nd    [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
-104A..104F     ; N  # Po     [6] MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL GENITIVE
-1050..1055     ; N  # Lo     [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL
-1056..1057     ; N  # Mc     [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR
-1058..1059     ; N  # Mn     [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL
-105A..105D     ; N  # Lo     [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE
-105E..1060     ; N  # Mn     [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA
-1061           ; N  # Lo         MYANMAR LETTER SGAW KAREN SHA
-1062..1064     ; N  # Mc     [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO
-1065..1066     ; N  # Lo     [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA
-1067..106D     ; N  # Mc     [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5
-106E..1070     ; N  # Lo     [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA
-1071..1074     ; N  # Mn     [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE
-1075..1081     ; N  # Lo    [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA
-1082           ; N  # Mn         MYANMAR CONSONANT SIGN SHAN MEDIAL WA
-1083..1084     ; N  # Mc     [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E
-1085..1086     ; N  # Mn     [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y
-1087..108C     ; N  # Mc     [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3
-108D           ; N  # Mn         MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE
-108E           ; N  # Lo         MYANMAR LETTER RUMAI PALAUNG FA
-108F           ; N  # Mc         MYANMAR SIGN RUMAI PALAUNG TONE-5
-1090..1099     ; N  # Nd    [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE
-109A..109C     ; N  # Mc     [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
-109D           ; N  # Mn         MYANMAR VOWEL SIGN AITON AI
-109E..109F     ; N  # So     [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION
-10A0..10C5     ; N  # Lu    [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE
-10C7           ; N  # Lu         GEORGIAN CAPITAL LETTER YN
-10CD           ; N  # Lu         GEORGIAN CAPITAL LETTER AEN
-10D0..10FA     ; N  # Ll    [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
-10FB           ; N  # Po         GEORGIAN PARAGRAPH SEPARATOR
-10FC           ; N  # Lm         MODIFIER LETTER GEORGIAN NAR
-10FD..10FF     ; N  # Ll     [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
-1100..115F     ; W  # Lo    [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER
-1160..11FF     ; N  # Lo   [160] HANGUL JUNGSEONG FILLER..HANGUL JONGSEONG SSANGNIEUN
-1200..1248     ; N  # Lo    [73] ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE QWA
-124A..124D     ; N  # Lo     [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE
-1250..1256     ; N  # Lo     [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO
-1258           ; N  # Lo         ETHIOPIC SYLLABLE QHWA
-125A..125D     ; N  # Lo     [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE
-1260..1288     ; N  # Lo    [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA
-128A..128D     ; N  # Lo     [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE
-1290..12B0     ; N  # Lo    [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA
-12B2..12B5     ; N  # Lo     [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE
-12B8..12BE     ; N  # Lo     [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO
-12C0           ; N  # Lo         ETHIOPIC SYLLABLE KXWA
-12C2..12C5     ; N  # Lo     [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE
-12C8..12D6     ; N  # Lo    [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O
-12D8..1310     ; N  # Lo    [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA
-1312..1315     ; N  # Lo     [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE
-1318..135A     ; N  # Lo    [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA
-135D..135F     ; N  # Mn     [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
-1360..1368     ; N  # Po     [9] ETHIOPIC SECTION MARK..ETHIOPIC PARAGRAPH SEPARATOR
-1369..137C     ; N  # No    [20] ETHIOPIC DIGIT ONE..ETHIOPIC NUMBER TEN THOUSAND
-1380..138F     ; N  # Lo    [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE
-1390..1399     ; N  # So    [10] ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MARK KURT
-13A0..13F5     ; N  # Lu    [86] CHEROKEE LETTER A..CHEROKEE LETTER MV
-13F8..13FD     ; N  # Ll     [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
-1400           ; N  # Pd         CANADIAN SYLLABICS HYPHEN
-1401..166C     ; N  # Lo   [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA
-166D           ; N  # So         CANADIAN SYLLABICS CHI SIGN
-166E           ; N  # Po         CANADIAN SYLLABICS FULL STOP
-166F..167F     ; N  # Lo    [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W
-1680           ; N  # Zs         OGHAM SPACE MARK
-1681..169A     ; N  # Lo    [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH
-169B           ; N  # Ps         OGHAM FEATHER MARK
-169C           ; N  # Pe         OGHAM REVERSED FEATHER MARK
-16A0..16EA     ; N  # Lo    [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
-16EB..16ED     ; N  # Po     [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION
-16EE..16F0     ; N  # Nl     [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL
-16F1..16F8     ; N  # Lo     [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
-1700..1711     ; N  # Lo    [18] TAGALOG LETTER A..TAGALOG LETTER HA
-1712..1714     ; N  # Mn     [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
-1715           ; N  # Mc         TAGALOG SIGN PAMUDPOD
-171F           ; N  # Lo         TAGALOG LETTER ARCHAIC RA
-1720..1731     ; N  # Lo    [18] HANUNOO LETTER A..HANUNOO LETTER HA
-1732..1733     ; N  # Mn     [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
-1734           ; N  # Mc         HANUNOO SIGN PAMUDPOD
-1735..1736     ; N  # Po     [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
-1740..1751     ; N  # Lo    [18] BUHID LETTER A..BUHID LETTER HA
-1752..1753     ; N  # Mn     [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
-1760..176C     ; N  # Lo    [13] TAGBANWA LETTER A..TAGBANWA LETTER YA
-176E..1770     ; N  # Lo     [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA
-1772..1773     ; N  # Mn     [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
-1780..17B3     ; N  # Lo    [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU
-17B4..17B5     ; N  # Mn     [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
-17B6           ; N  # Mc         KHMER VOWEL SIGN AA
-17B7..17BD     ; N  # Mn     [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA
-17BE..17C5     ; N  # Mc     [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
-17C6           ; N  # Mn         KHMER SIGN NIKAHIT
-17C7..17C8     ; N  # Mc     [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
-17C9..17D3     ; N  # Mn    [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
-17D4..17D6     ; N  # Po     [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH
-17D7           ; N  # Lm         KHMER SIGN LEK TOO
-17D8..17DA     ; N  # Po     [3] KHMER SIGN BEYYAL..KHMER SIGN KOOMUUT
-17DB           ; N  # Sc         KHMER CURRENCY SYMBOL RIEL
-17DC           ; N  # Lo         KHMER SIGN AVAKRAHASANYA
-17DD           ; N  # Mn         KHMER SIGN ATTHACAN
-17E0..17E9     ; N  # Nd    [10] KHMER DIGIT ZERO..KHMER DIGIT NINE
-17F0..17F9     ; N  # No    [10] KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK ATTAK PRAM-BUON
-1800..1805     ; N  # Po     [6] MONGOLIAN BIRGA..MONGOLIAN FOUR DOTS
-1806           ; N  # Pd         MONGOLIAN TODO SOFT HYPHEN
-1807..180A     ; N  # Po     [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU
-180B..180D     ; N  # Mn     [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
-180E           ; N  # Cf         MONGOLIAN VOWEL SEPARATOR
-180F           ; N  # Mn         MONGOLIAN FREE VARIATION SELECTOR FOUR
-1810..1819     ; N  # Nd    [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE
-1820..1842     ; N  # Lo    [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
-1843           ; N  # Lm         MONGOLIAN LETTER TODO LONG VOWEL SIGN
-1844..1878     ; N  # Lo    [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS
-1880..1884     ; N  # Lo     [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA
-1885..1886     ; N  # Mn     [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
-1887..18A8     ; N  # Lo    [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA
-18A9           ; N  # Mn         MONGOLIAN LETTER ALI GALI DAGALGA
-18AA           ; N  # Lo         MONGOLIAN LETTER MANCHU ALI GALI LHA
-18B0..18F5     ; N  # Lo    [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S
-1900..191E     ; N  # Lo    [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA
-1920..1922     ; N  # Mn     [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
-1923..1926     ; N  # Mc     [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
-1927..1928     ; N  # Mn     [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
-1929..192B     ; N  # Mc     [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA
-1930..1931     ; N  # Mc     [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA
-1932           ; N  # Mn         LIMBU SMALL LETTER ANUSVARA
-1933..1938     ; N  # Mc     [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA
-1939..193B     ; N  # Mn     [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I
-1940           ; N  # So         LIMBU SIGN LOO
-1944..1945     ; N  # Po     [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
-1946..194F     ; N  # Nd    [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE
-1950..196D     ; N  # Lo    [30] TAI LE LETTER KA..TAI LE LETTER AI
-1970..1974     ; N  # Lo     [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6
-1980..19AB     ; N  # Lo    [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA
-19B0..19C9     ; N  # Lo    [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2
-19D0..19D9     ; N  # Nd    [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE
-19DA           ; N  # No         NEW TAI LUE THAM DIGIT ONE
-19DE..19DF     ; N  # So     [2] NEW TAI LUE SIGN LAE..NEW TAI LUE SIGN LAEV
-19E0..19FF     ; N  # So    [32] KHMER SYMBOL PATHAMASAT..KHMER SYMBOL DAP-PRAM ROC
-1A00..1A16     ; N  # Lo    [23] BUGINESE LETTER KA..BUGINESE LETTER HA
-1A17..1A18     ; N  # Mn     [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
-1A19..1A1A     ; N  # Mc     [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
-1A1B           ; N  # Mn         BUGINESE VOWEL SIGN AE
-1A1E..1A1F     ; N  # Po     [2] BUGINESE PALLAWA..BUGINESE END OF SECTION
-1A20..1A54     ; N  # Lo    [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA
-1A55           ; N  # Mc         TAI THAM CONSONANT SIGN MEDIAL RA
-1A56           ; N  # Mn         TAI THAM CONSONANT SIGN MEDIAL LA
-1A57           ; N  # Mc         TAI THAM CONSONANT SIGN LA TANG LAI
-1A58..1A5E     ; N  # Mn     [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA
-1A60           ; N  # Mn         TAI THAM SIGN SAKOT
-1A61           ; N  # Mc         TAI THAM VOWEL SIGN A
-1A62           ; N  # Mn         TAI THAM VOWEL SIGN MAI SAT
-1A63..1A64     ; N  # Mc     [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA
-1A65..1A6C     ; N  # Mn     [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW
-1A6D..1A72     ; N  # Mc     [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI
-1A73..1A7C     ; N  # Mn    [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN
-1A7F           ; N  # Mn         TAI THAM COMBINING CRYPTOGRAMMIC DOT
-1A80..1A89     ; N  # Nd    [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE
-1A90..1A99     ; N  # Nd    [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE
-1AA0..1AA6     ; N  # Po     [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA
-1AA7           ; N  # Lm         TAI THAM SIGN MAI YAMOK
-1AA8..1AAD     ; N  # Po     [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG
-1AB0..1ABD     ; N  # Mn    [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
-1ABE           ; N  # Me         COMBINING PARENTHESES OVERLAY
-1ABF..1ACE     ; N  # Mn    [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
-1B00..1B03     ; N  # Mn     [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
-1B04           ; N  # Mc         BALINESE SIGN BISAH
-1B05..1B33     ; N  # Lo    [47] BALINESE LETTER AKARA..BALINESE LETTER HA
-1B34           ; N  # Mn         BALINESE SIGN REREKAN
-1B35           ; N  # Mc         BALINESE VOWEL SIGN TEDUNG
-1B36..1B3A     ; N  # Mn     [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
-1B3B           ; N  # Mc         BALINESE VOWEL SIGN RA REPA TEDUNG
-1B3C           ; N  # Mn         BALINESE VOWEL SIGN LA LENGA
-1B3D..1B41     ; N  # Mc     [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG
-1B42           ; N  # Mn         BALINESE VOWEL SIGN PEPET
-1B43..1B44     ; N  # Mc     [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG
-1B45..1B4C     ; N  # Lo     [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
-1B50..1B59     ; N  # Nd    [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE
-1B5A..1B60     ; N  # Po     [7] BALINESE PANTI..BALINESE PAMENENG
-1B61..1B6A     ; N  # So    [10] BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE
-1B6B..1B73     ; N  # Mn     [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
-1B74..1B7C     ; N  # So     [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING
-1B7D..1B7E     ; N  # Po     [2] BALINESE PANTI LANTANG..BALINESE PAMADA LANTANG
-1B80..1B81     ; N  # Mn     [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR
-1B82           ; N  # Mc         SUNDANESE SIGN PANGWISAD
-1B83..1BA0     ; N  # Lo    [30] SUNDANESE LETTER A..SUNDANESE LETTER HA
-1BA1           ; N  # Mc         SUNDANESE CONSONANT SIGN PAMINGKAL
-1BA2..1BA5     ; N  # Mn     [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
-1BA6..1BA7     ; N  # Mc     [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
-1BA8..1BA9     ; N  # Mn     [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
-1BAA           ; N  # Mc         SUNDANESE SIGN PAMAAEH
-1BAB..1BAD     ; N  # Mn     [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA
-1BAE..1BAF     ; N  # Lo     [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
-1BB0..1BB9     ; N  # Nd    [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE
-1BBA..1BBF     ; N  # Lo     [6] SUNDANESE AVAGRAHA..SUNDANESE LETTER FINAL M
-1BC0..1BE5     ; N  # Lo    [38] BATAK LETTER A..BATAK LETTER U
-1BE6           ; N  # Mn         BATAK SIGN TOMPI
-1BE7           ; N  # Mc         BATAK VOWEL SIGN E
-1BE8..1BE9     ; N  # Mn     [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
-1BEA..1BEC     ; N  # Mc     [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O
-1BED           ; N  # Mn         BATAK VOWEL SIGN KARO O
-1BEE           ; N  # Mc         BATAK VOWEL SIGN U
-1BEF..1BF1     ; N  # Mn     [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H
-1BF2..1BF3     ; N  # Mc     [2] BATAK PANGOLAT..BATAK PANONGONAN
-1BFC..1BFF     ; N  # Po     [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT
-1C00..1C23     ; N  # Lo    [36] LEPCHA LETTER KA..LEPCHA LETTER A
-1C24..1C2B     ; N  # Mc     [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU
-1C2C..1C33     ; N  # Mn     [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
-1C34..1C35     ; N  # Mc     [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG
-1C36..1C37     ; N  # Mn     [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA
-1C3B..1C3F     ; N  # Po     [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK
-1C40..1C49     ; N  # Nd    [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE
-1C4D..1C4F     ; N  # Lo     [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA
-1C50..1C59     ; N  # Nd    [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE
-1C5A..1C77     ; N  # Lo    [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH
-1C78..1C7D     ; N  # Lm     [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD
-1C7E..1C7F     ; N  # Po     [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
-1C80..1C88     ; N  # Ll     [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
-1C90..1CBA     ; N  # Lu    [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN
-1CBD..1CBF     ; N  # Lu     [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN
-1CC0..1CC7     ; N  # Po     [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA
-1CD0..1CD2     ; N  # Mn     [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
-1CD3           ; N  # Po         VEDIC SIGN NIHSHVASA
-1CD4..1CE0     ; N  # Mn    [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
-1CE1           ; N  # Mc         VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
-1CE2..1CE8     ; N  # Mn     [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
-1CE9..1CEC     ; N  # Lo     [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
-1CED           ; N  # Mn         VEDIC SIGN TIRYAK
-1CEE..1CF3     ; N  # Lo     [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA
-1CF4           ; N  # Mn         VEDIC TONE CANDRA ABOVE
-1CF5..1CF6     ; N  # Lo     [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
-1CF7           ; N  # Mc         VEDIC SIGN ATIKRAMA
-1CF8..1CF9     ; N  # Mn     [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
-1CFA           ; N  # Lo         VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA
-1D00..1D2B     ; N  # Ll    [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL
-1D2C..1D6A     ; N  # Lm    [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
-1D6B..1D77     ; N  # Ll    [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G
-1D78           ; N  # Lm         MODIFIER LETTER CYRILLIC EN
-1D79..1D7F     ; N  # Ll     [7] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER UPSILON WITH STROKE
-1D80..1D9A     ; N  # Ll    [27] LATIN SMALL LETTER B WITH PALATAL HOOK..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK
-1D9B..1DBF     ; N  # Lm    [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA
-1DC0..1DFF     ; N  # Mn    [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
-1E00..1EFF     ; N  # L&   [256] LATIN CAPITAL LETTER A WITH RING BELOW..LATIN SMALL LETTER Y WITH LOOP
-1F00..1F15     ; N  # L&    [22] GREEK SMALL LETTER ALPHA WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
-1F18..1F1D     ; N  # Lu     [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
-1F20..1F45     ; N  # L&    [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA
-1F48..1F4D     ; N  # Lu     [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
-1F50..1F57     ; N  # Ll     [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI
-1F59           ; N  # Lu         GREEK CAPITAL LETTER UPSILON WITH DASIA
-1F5B           ; N  # Lu         GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
-1F5D           ; N  # Lu         GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
-1F5F..1F7D     ; N  # L&    [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA
-1F80..1FB4     ; N  # L&    [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
-1FB6..1FBC     ; N  # L&     [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
-1FBD           ; N  # Sk         GREEK KORONIS
-1FBE           ; N  # Ll         GREEK PROSGEGRAMMENI
-1FBF..1FC1     ; N  # Sk     [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
-1FC2..1FC4     ; N  # Ll     [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
-1FC6..1FCC     ; N  # L&     [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
-1FCD..1FCF     ; N  # Sk     [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI
-1FD0..1FD3     ; N  # Ll     [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
-1FD6..1FDB     ; N  # L&     [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA
-1FDD..1FDF     ; N  # Sk     [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI
-1FE0..1FEC     ; N  # L&    [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA
-1FED..1FEF     ; N  # Sk     [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA
-1FF2..1FF4     ; N  # Ll     [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
-1FF6..1FFC     ; N  # L&     [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
-1FFD..1FFE     ; N  # Sk     [2] GREEK OXIA..GREEK DASIA
-2000..200A     ; N  # Zs    [11] EN QUAD..HAIR SPACE
-200B..200F     ; N  # Cf     [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
-2010           ; A  # Pd         HYPHEN
-2011..2012     ; N  # Pd     [2] NON-BREAKING HYPHEN..FIGURE DASH
-2013..2015     ; A  # Pd     [3] EN DASH..HORIZONTAL BAR
-2016           ; A  # Po         DOUBLE VERTICAL LINE
-2017           ; N  # Po         DOUBLE LOW LINE
-2018           ; A  # Pi         LEFT SINGLE QUOTATION MARK
-2019           ; A  # Pf         RIGHT SINGLE QUOTATION MARK
-201A           ; N  # Ps         SINGLE LOW-9 QUOTATION MARK
-201B           ; N  # Pi         SINGLE HIGH-REVERSED-9 QUOTATION MARK
-201C           ; A  # Pi         LEFT DOUBLE QUOTATION MARK
-201D           ; A  # Pf         RIGHT DOUBLE QUOTATION MARK
-201E           ; N  # Ps         DOUBLE LOW-9 QUOTATION MARK
-201F           ; N  # Pi         DOUBLE HIGH-REVERSED-9 QUOTATION MARK
-2020..2022     ; A  # Po     [3] DAGGER..BULLET
-2023           ; N  # Po         TRIANGULAR BULLET
-2024..2027     ; A  # Po     [4] ONE DOT LEADER..HYPHENATION POINT
-2028           ; N  # Zl         LINE SEPARATOR
-2029           ; N  # Zp         PARAGRAPH SEPARATOR
-202A..202E     ; N  # Cf     [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
-202F           ; N  # Zs         NARROW NO-BREAK SPACE
-2030           ; A  # Po         PER MILLE SIGN
-2031           ; N  # Po         PER TEN THOUSAND SIGN
-2032..2033     ; A  # Po     [2] PRIME..DOUBLE PRIME
-2034           ; N  # Po         TRIPLE PRIME
-2035           ; A  # Po         REVERSED PRIME
-2036..2038     ; N  # Po     [3] REVERSED DOUBLE PRIME..CARET
-2039           ; N  # Pi         SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-203A           ; N  # Pf         SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-203B           ; A  # Po         REFERENCE MARK
-203C..203D     ; N  # Po     [2] DOUBLE EXCLAMATION MARK..INTERROBANG
-203E           ; A  # Po         OVERLINE
-203F..2040     ; N  # Pc     [2] UNDERTIE..CHARACTER TIE
-2041..2043     ; N  # Po     [3] CARET INSERTION POINT..HYPHEN BULLET
-2044           ; N  # Sm         FRACTION SLASH
-2045           ; N  # Ps         LEFT SQUARE BRACKET WITH QUILL
-2046           ; N  # Pe         RIGHT SQUARE BRACKET WITH QUILL
-2047..2051     ; N  # Po    [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY
-2052           ; N  # Sm         COMMERCIAL MINUS SIGN
-2053           ; N  # Po         SWUNG DASH
-2054           ; N  # Pc         INVERTED UNDERTIE
-2055..205E     ; N  # Po    [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS
-205F           ; N  # Zs         MEDIUM MATHEMATICAL SPACE
-2060..2064     ; N  # Cf     [5] WORD JOINER..INVISIBLE PLUS
-2066..206F     ; N  # Cf    [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
-2070           ; N  # No         SUPERSCRIPT ZERO
-2071           ; N  # Lm         SUPERSCRIPT LATIN SMALL LETTER I
-2074           ; A  # No         SUPERSCRIPT FOUR
-2075..2079     ; N  # No     [5] SUPERSCRIPT FIVE..SUPERSCRIPT NINE
-207A..207C     ; N  # Sm     [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN
-207D           ; N  # Ps         SUPERSCRIPT LEFT PARENTHESIS
-207E           ; N  # Pe         SUPERSCRIPT RIGHT PARENTHESIS
-207F           ; A  # Lm         SUPERSCRIPT LATIN SMALL LETTER N
-2080           ; N  # No         SUBSCRIPT ZERO
-2081..2084     ; A  # No     [4] SUBSCRIPT ONE..SUBSCRIPT FOUR
-2085..2089     ; N  # No     [5] SUBSCRIPT FIVE..SUBSCRIPT NINE
-208A..208C     ; N  # Sm     [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
-208D           ; N  # Ps         SUBSCRIPT LEFT PARENTHESIS
-208E           ; N  # Pe         SUBSCRIPT RIGHT PARENTHESIS
-2090..209C     ; N  # Lm    [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T
-20A0..20A8     ; N  # Sc     [9] EURO-CURRENCY SIGN..RUPEE SIGN
-20A9           ; H  # Sc         WON SIGN
-20AA..20AB     ; N  # Sc     [2] NEW SHEQEL SIGN..DONG SIGN
-20AC           ; A  # Sc         EURO SIGN
-20AD..20C0     ; N  # Sc    [20] KIP SIGN..SOM SIGN
-20D0..20DC     ; N  # Mn    [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
-20DD..20E0     ; N  # Me     [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
-20E1           ; N  # Mn         COMBINING LEFT RIGHT ARROW ABOVE
-20E2..20E4     ; N  # Me     [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE
-20E5..20F0     ; N  # Mn    [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
-2100..2101     ; N  # So     [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
-2102           ; N  # Lu         DOUBLE-STRUCK CAPITAL C
-2103           ; A  # So         DEGREE CELSIUS
-2104           ; N  # So         CENTRE LINE SYMBOL
-2105           ; A  # So         CARE OF
-2106           ; N  # So         CADA UNA
-2107           ; N  # Lu         EULER CONSTANT
-2108           ; N  # So         SCRUPLE
-2109           ; A  # So         DEGREE FAHRENHEIT
-210A..2112     ; N  # L&     [9] SCRIPT SMALL G..SCRIPT CAPITAL L
-2113           ; A  # Ll         SCRIPT SMALL L
-2114           ; N  # So         L B BAR SYMBOL
-2115           ; N  # Lu         DOUBLE-STRUCK CAPITAL N
-2116           ; A  # So         NUMERO SIGN
-2117           ; N  # So         SOUND RECORDING COPYRIGHT
-2118           ; N  # Sm         SCRIPT CAPITAL P
-2119..211D     ; N  # Lu     [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R
-211E..2120     ; N  # So     [3] PRESCRIPTION TAKE..SERVICE MARK
-2121..2122     ; A  # So     [2] TELEPHONE SIGN..TRADE MARK SIGN
-2123           ; N  # So         VERSICLE
-2124           ; N  # Lu         DOUBLE-STRUCK CAPITAL Z
-2125           ; N  # So         OUNCE SIGN
-2126           ; A  # Lu         OHM SIGN
-2127           ; N  # So         INVERTED OHM SIGN
-2128           ; N  # Lu         BLACK-LETTER CAPITAL Z
-2129           ; N  # So         TURNED GREEK SMALL LETTER IOTA
-212A           ; N  # Lu         KELVIN SIGN
-212B           ; A  # Lu         ANGSTROM SIGN
-212C..212D     ; N  # Lu     [2] SCRIPT CAPITAL B..BLACK-LETTER CAPITAL C
-212E           ; N  # So         ESTIMATED SYMBOL
-212F..2134     ; N  # L&     [6] SCRIPT SMALL E..SCRIPT SMALL O
-2135..2138     ; N  # Lo     [4] ALEF SYMBOL..DALET SYMBOL
-2139           ; N  # Ll         INFORMATION SOURCE
-213A..213B     ; N  # So     [2] ROTATED CAPITAL Q..FACSIMILE SIGN
-213C..213F     ; N  # L&     [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI
-2140..2144     ; N  # Sm     [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y
-2145..2149     ; N  # L&     [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J
-214A           ; N  # So         PROPERTY LINE
-214B           ; N  # Sm         TURNED AMPERSAND
-214C..214D     ; N  # So     [2] PER SIGN..AKTIESELSKAB
-214E           ; N  # Ll         TURNED SMALL F
-214F           ; N  # So         SYMBOL FOR SAMARITAN SOURCE
-2150..2152     ; N  # No     [3] VULGAR FRACTION ONE SEVENTH..VULGAR FRACTION ONE TENTH
-2153..2154     ; A  # No     [2] VULGAR FRACTION ONE THIRD..VULGAR FRACTION TWO THIRDS
-2155..215A     ; N  # No     [6] VULGAR FRACTION ONE FIFTH..VULGAR FRACTION FIVE SIXTHS
-215B..215E     ; A  # No     [4] VULGAR FRACTION ONE EIGHTH..VULGAR FRACTION SEVEN EIGHTHS
-215F           ; N  # No         FRACTION NUMERATOR ONE
-2160..216B     ; A  # Nl    [12] ROMAN NUMERAL ONE..ROMAN NUMERAL TWELVE
-216C..216F     ; N  # Nl     [4] ROMAN NUMERAL FIFTY..ROMAN NUMERAL ONE THOUSAND
-2170..2179     ; A  # Nl    [10] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL TEN
-217A..2182     ; N  # Nl     [9] SMALL ROMAN NUMERAL ELEVEN..ROMAN NUMERAL TEN THOUSAND
-2183..2184     ; N  # L&     [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C
-2185..2188     ; N  # Nl     [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND
-2189           ; A  # No         VULGAR FRACTION ZERO THIRDS
-218A..218B     ; N  # So     [2] TURNED DIGIT TWO..TURNED DIGIT THREE
-2190..2194     ; A  # Sm     [5] LEFTWARDS ARROW..LEFT RIGHT ARROW
-2195..2199     ; A  # So     [5] UP DOWN ARROW..SOUTH WEST ARROW
-219A..219B     ; N  # Sm     [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE
-219C..219F     ; N  # So     [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW
-21A0           ; N  # Sm         RIGHTWARDS TWO HEADED ARROW
-21A1..21A2     ; N  # So     [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL
-21A3           ; N  # Sm         RIGHTWARDS ARROW WITH TAIL
-21A4..21A5     ; N  # So     [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR
-21A6           ; N  # Sm         RIGHTWARDS ARROW FROM BAR
-21A7..21AD     ; N  # So     [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW
-21AE           ; N  # Sm         LEFT RIGHT ARROW WITH STROKE
-21AF..21B7     ; N  # So     [9] DOWNWARDS ZIGZAG ARROW..CLOCKWISE TOP SEMICIRCLE ARROW
-21B8..21B9     ; A  # So     [2] NORTH WEST ARROW TO LONG BAR..LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR
-21BA..21CD     ; N  # So    [20] ANTICLOCKWISE OPEN CIRCLE ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE
-21CE..21CF     ; N  # Sm     [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE
-21D0..21D1     ; N  # So     [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW
-21D2           ; A  # Sm         RIGHTWARDS DOUBLE ARROW
-21D3           ; N  # So         DOWNWARDS DOUBLE ARROW
-21D4           ; A  # Sm         LEFT RIGHT DOUBLE ARROW
-21D5..21E6     ; N  # So    [18] UP DOWN DOUBLE ARROW..LEFTWARDS WHITE ARROW
-21E7           ; A  # So         UPWARDS WHITE ARROW
-21E8..21F3     ; N  # So    [12] RIGHTWARDS WHITE ARROW..UP DOWN WHITE ARROW
-21F4..21FF     ; N  # Sm    [12] RIGHT ARROW WITH SMALL CIRCLE..LEFT RIGHT OPEN-HEADED ARROW
-2200           ; A  # Sm         FOR ALL
-2201           ; N  # Sm         COMPLEMENT
-2202..2203     ; A  # Sm     [2] PARTIAL DIFFERENTIAL..THERE EXISTS
-2204..2206     ; N  # Sm     [3] THERE DOES NOT EXIST..INCREMENT
-2207..2208     ; A  # Sm     [2] NABLA..ELEMENT OF
-2209..220A     ; N  # Sm     [2] NOT AN ELEMENT OF..SMALL ELEMENT OF
-220B           ; A  # Sm         CONTAINS AS MEMBER
-220C..220E     ; N  # Sm     [3] DOES NOT CONTAIN AS MEMBER..END OF PROOF
-220F           ; A  # Sm         N-ARY PRODUCT
-2210           ; N  # Sm         N-ARY COPRODUCT
-2211           ; A  # Sm         N-ARY SUMMATION
-2212..2214     ; N  # Sm     [3] MINUS SIGN..DOT PLUS
-2215           ; A  # Sm         DIVISION SLASH
-2216..2219     ; N  # Sm     [4] SET MINUS..BULLET OPERATOR
-221A           ; A  # Sm         SQUARE ROOT
-221B..221C     ; N  # Sm     [2] CUBE ROOT..FOURTH ROOT
-221D..2220     ; A  # Sm     [4] PROPORTIONAL TO..ANGLE
-2221..2222     ; N  # Sm     [2] MEASURED ANGLE..SPHERICAL ANGLE
-2223           ; A  # Sm         DIVIDES
-2224           ; N  # Sm         DOES NOT DIVIDE
-2225           ; A  # Sm         PARALLEL TO
-2226           ; N  # Sm         NOT PARALLEL TO
-2227..222C     ; A  # Sm     [6] LOGICAL AND..DOUBLE INTEGRAL
-222D           ; N  # Sm         TRIPLE INTEGRAL
-222E           ; A  # Sm         CONTOUR INTEGRAL
-222F..2233     ; N  # Sm     [5] SURFACE INTEGRAL..ANTICLOCKWISE CONTOUR INTEGRAL
-2234..2237     ; A  # Sm     [4] THEREFORE..PROPORTION
-2238..223B     ; N  # Sm     [4] DOT MINUS..HOMOTHETIC
-223C..223D     ; A  # Sm     [2] TILDE OPERATOR..REVERSED TILDE
-223E..2247     ; N  # Sm    [10] INVERTED LAZY S..NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
-2248           ; A  # Sm         ALMOST EQUAL TO
-2249..224B     ; N  # Sm     [3] NOT ALMOST EQUAL TO..TRIPLE TILDE
-224C           ; A  # Sm         ALL EQUAL TO
-224D..2251     ; N  # Sm     [5] EQUIVALENT TO..GEOMETRICALLY EQUAL TO
-2252           ; A  # Sm         APPROXIMATELY EQUAL TO OR THE IMAGE OF
-2253..225F     ; N  # Sm    [13] IMAGE OF OR APPROXIMATELY EQUAL TO..QUESTIONED EQUAL TO
-2260..2261     ; A  # Sm     [2] NOT EQUAL TO..IDENTICAL TO
-2262..2263     ; N  # Sm     [2] NOT IDENTICAL TO..STRICTLY EQUIVALENT TO
-2264..2267     ; A  # Sm     [4] LESS-THAN OR EQUAL TO..GREATER-THAN OVER EQUAL TO
-2268..2269     ; N  # Sm     [2] LESS-THAN BUT NOT EQUAL TO..GREATER-THAN BUT NOT EQUAL TO
-226A..226B     ; A  # Sm     [2] MUCH LESS-THAN..MUCH GREATER-THAN
-226C..226D     ; N  # Sm     [2] BETWEEN..NOT EQUIVALENT TO
-226E..226F     ; A  # Sm     [2] NOT LESS-THAN..NOT GREATER-THAN
-2270..2281     ; N  # Sm    [18] NEITHER LESS-THAN NOR EQUAL TO..DOES NOT SUCCEED
-2282..2283     ; A  # Sm     [2] SUBSET OF..SUPERSET OF
-2284..2285     ; N  # Sm     [2] NOT A SUBSET OF..NOT A SUPERSET OF
-2286..2287     ; A  # Sm     [2] SUBSET OF OR EQUAL TO..SUPERSET OF OR EQUAL TO
-2288..2294     ; N  # Sm    [13] NEITHER A SUBSET OF NOR EQUAL TO..SQUARE CUP
-2295           ; A  # Sm         CIRCLED PLUS
-2296..2298     ; N  # Sm     [3] CIRCLED MINUS..CIRCLED DIVISION SLASH
-2299           ; A  # Sm         CIRCLED DOT OPERATOR
-229A..22A4     ; N  # Sm    [11] CIRCLED RING OPERATOR..DOWN TACK
-22A5           ; A  # Sm         UP TACK
-22A6..22BE     ; N  # Sm    [25] ASSERTION..RIGHT ANGLE WITH ARC
-22BF           ; A  # Sm         RIGHT TRIANGLE
-22C0..22FF     ; N  # Sm    [64] N-ARY LOGICAL AND..Z NOTATION BAG MEMBERSHIP
-2300..2307     ; N  # So     [8] DIAMETER SIGN..WAVY LINE
-2308           ; N  # Ps         LEFT CEILING
-2309           ; N  # Pe         RIGHT CEILING
-230A           ; N  # Ps         LEFT FLOOR
-230B           ; N  # Pe         RIGHT FLOOR
-230C..2311     ; N  # So     [6] BOTTOM RIGHT CROP..SQUARE LOZENGE
-2312           ; A  # So         ARC
-2313..2319     ; N  # So     [7] SEGMENT..TURNED NOT SIGN
-231A..231B     ; W  # So     [2] WATCH..HOURGLASS
-231C..231F     ; N  # So     [4] TOP LEFT CORNER..BOTTOM RIGHT CORNER
-2320..2321     ; N  # Sm     [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL
-2322..2328     ; N  # So     [7] FROWN..KEYBOARD
-2329           ; W  # Ps         LEFT-POINTING ANGLE BRACKET
-232A           ; W  # Pe         RIGHT-POINTING ANGLE BRACKET
-232B..237B     ; N  # So    [81] ERASE TO THE LEFT..NOT CHECK MARK
-237C           ; N  # Sm         RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW
-237D..239A     ; N  # So    [30] SHOULDERED OPEN BOX..CLEAR SCREEN SYMBOL
-239B..23B3     ; N  # Sm    [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
-23B4..23DB     ; N  # So    [40] TOP SQUARE BRACKET..FUSE
-23DC..23E1     ; N  # Sm     [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
-23E2..23E8     ; N  # So     [7] WHITE TRAPEZIUM..DECIMAL EXPONENT SYMBOL
-23E9..23EC     ; W  # So     [4] BLACK RIGHT-POINTING DOUBLE TRIANGLE..BLACK DOWN-POINTING DOUBLE TRIANGLE
-23ED..23EF     ; N  # So     [3] BLACK RIGHT-POINTING DOUBLE TRIANGLE WITH VERTICAL BAR..BLACK RIGHT-POINTING TRIANGLE WITH DOUBLE VERTICAL BAR
-23F0           ; W  # So         ALARM CLOCK
-23F1..23F2     ; N  # So     [2] STOPWATCH..TIMER CLOCK
-23F3           ; W  # So         HOURGLASS WITH FLOWING SAND
-23F4..23FF     ; N  # So    [12] BLACK MEDIUM LEFT-POINTING TRIANGLE..OBSERVER EYE SYMBOL
-2400..2426     ; N  # So    [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
-2440..244A     ; N  # So    [11] OCR HOOK..OCR DOUBLE BACKSLASH
-2460..249B     ; A  # No    [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP
-249C..24E9     ; A  # So    [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
-24EA           ; N  # No         CIRCLED DIGIT ZERO
-24EB..24FF     ; A  # No    [21] NEGATIVE CIRCLED NUMBER ELEVEN..NEGATIVE CIRCLED DIGIT ZERO
-2500..254B     ; A  # So    [76] BOX DRAWINGS LIGHT HORIZONTAL..BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL
-254C..254F     ; N  # So     [4] BOX DRAWINGS LIGHT DOUBLE DASH HORIZONTAL..BOX DRAWINGS HEAVY DOUBLE DASH VERTICAL
-2550..2573     ; A  # So    [36] BOX DRAWINGS DOUBLE HORIZONTAL..BOX DRAWINGS LIGHT DIAGONAL CROSS
-2574..257F     ; N  # So    [12] BOX DRAWINGS LIGHT LEFT..BOX DRAWINGS HEAVY UP AND LIGHT DOWN
-2580..258F     ; A  # So    [16] UPPER HALF BLOCK..LEFT ONE EIGHTH BLOCK
-2590..2591     ; N  # So     [2] RIGHT HALF BLOCK..LIGHT SHADE
-2592..2595     ; A  # So     [4] MEDIUM SHADE..RIGHT ONE EIGHTH BLOCK
-2596..259F     ; N  # So    [10] QUADRANT LOWER LEFT..QUADRANT UPPER RIGHT AND LOWER LEFT AND LOWER RIGHT
-25A0..25A1     ; A  # So     [2] BLACK SQUARE..WHITE SQUARE
-25A2           ; N  # So         WHITE SQUARE WITH ROUNDED CORNERS
-25A3..25A9     ; A  # So     [7] WHITE SQUARE CONTAINING BLACK SMALL SQUARE..SQUARE WITH DIAGONAL CROSSHATCH FILL
-25AA..25B1     ; N  # So     [8] BLACK SMALL SQUARE..WHITE PARALLELOGRAM
-25B2..25B3     ; A  # So     [2] BLACK UP-POINTING TRIANGLE..WHITE UP-POINTING TRIANGLE
-25B4..25B5     ; N  # So     [2] BLACK UP-POINTING SMALL TRIANGLE..WHITE UP-POINTING SMALL TRIANGLE
-25B6           ; A  # So         BLACK RIGHT-POINTING TRIANGLE
-25B7           ; A  # Sm         WHITE RIGHT-POINTING TRIANGLE
-25B8..25BB     ; N  # So     [4] BLACK RIGHT-POINTING SMALL TRIANGLE..WHITE RIGHT-POINTING POINTER
-25BC..25BD     ; A  # So     [2] BLACK DOWN-POINTING TRIANGLE..WHITE DOWN-POINTING TRIANGLE
-25BE..25BF     ; N  # So     [2] BLACK DOWN-POINTING SMALL TRIANGLE..WHITE DOWN-POINTING SMALL TRIANGLE
-25C0           ; A  # So         BLACK LEFT-POINTING TRIANGLE
-25C1           ; A  # Sm         WHITE LEFT-POINTING TRIANGLE
-25C2..25C5     ; N  # So     [4] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE LEFT-POINTING POINTER
-25C6..25C8     ; A  # So     [3] BLACK DIAMOND..WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND
-25C9..25CA     ; N  # So     [2] FISHEYE..LOZENGE
-25CB           ; A  # So         WHITE CIRCLE
-25CC..25CD     ; N  # So     [2] DOTTED CIRCLE..CIRCLE WITH VERTICAL FILL
-25CE..25D1     ; A  # So     [4] BULLSEYE..CIRCLE WITH RIGHT HALF BLACK
-25D2..25E1     ; N  # So    [16] CIRCLE WITH LOWER HALF BLACK..LOWER HALF CIRCLE
-25E2..25E5     ; A  # So     [4] BLACK LOWER RIGHT TRIANGLE..BLACK UPPER RIGHT TRIANGLE
-25E6..25EE     ; N  # So     [9] WHITE BULLET..UP-POINTING TRIANGLE WITH RIGHT HALF BLACK
-25EF           ; A  # So         LARGE CIRCLE
-25F0..25F7     ; N  # So     [8] WHITE SQUARE WITH UPPER LEFT QUADRANT..WHITE CIRCLE WITH UPPER RIGHT QUADRANT
-25F8..25FC     ; N  # Sm     [5] UPPER LEFT TRIANGLE..BLACK MEDIUM SQUARE
-25FD..25FE     ; W  # Sm     [2] WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE
-25FF           ; N  # Sm         LOWER RIGHT TRIANGLE
-2600..2604     ; N  # So     [5] BLACK SUN WITH RAYS..COMET
-2605..2606     ; A  # So     [2] BLACK STAR..WHITE STAR
-2607..2608     ; N  # So     [2] LIGHTNING..THUNDERSTORM
-2609           ; A  # So         SUN
-260A..260D     ; N  # So     [4] ASCENDING NODE..OPPOSITION
-260E..260F     ; A  # So     [2] BLACK TELEPHONE..WHITE TELEPHONE
-2610..2613     ; N  # So     [4] BALLOT BOX..SALTIRE
-2614..2615     ; W  # So     [2] UMBRELLA WITH RAIN DROPS..HOT BEVERAGE
-2616..261B     ; N  # So     [6] WHITE SHOGI PIECE..BLACK RIGHT POINTING INDEX
-261C           ; A  # So         WHITE LEFT POINTING INDEX
-261D           ; N  # So         WHITE UP POINTING INDEX
-261E           ; A  # So         WHITE RIGHT POINTING INDEX
-261F..263F     ; N  # So    [33] WHITE DOWN POINTING INDEX..MERCURY
-2640           ; A  # So         FEMALE SIGN
-2641           ; N  # So         EARTH
-2642           ; A  # So         MALE SIGN
-2643..2647     ; N  # So     [5] JUPITER..PLUTO
-2648..2653     ; W  # So    [12] ARIES..PISCES
-2654..265F     ; N  # So    [12] WHITE CHESS KING..BLACK CHESS PAWN
-2660..2661     ; A  # So     [2] BLACK SPADE SUIT..WHITE HEART SUIT
-2662           ; N  # So         WHITE DIAMOND SUIT
-2663..2665     ; A  # So     [3] BLACK CLUB SUIT..BLACK HEART SUIT
-2666           ; N  # So         BLACK DIAMOND SUIT
-2667..266A     ; A  # So     [4] WHITE CLUB SUIT..EIGHTH NOTE
-266B           ; N  # So         BEAMED EIGHTH NOTES
-266C..266D     ; A  # So     [2] BEAMED SIXTEENTH NOTES..MUSIC FLAT SIGN
-266E           ; N  # So         MUSIC NATURAL SIGN
-266F           ; A  # Sm         MUSIC SHARP SIGN
-2670..267E     ; N  # So    [15] WEST SYRIAC CROSS..PERMANENT PAPER SIGN
-267F           ; W  # So         WHEELCHAIR SYMBOL
-2680..2692     ; N  # So    [19] DIE FACE-1..HAMMER AND PICK
-2693           ; W  # So         ANCHOR
-2694..269D     ; N  # So    [10] CROSSED SWORDS..OUTLINED WHITE STAR
-269E..269F     ; A  # So     [2] THREE LINES CONVERGING RIGHT..THREE LINES CONVERGING LEFT
-26A0           ; N  # So         WARNING SIGN
-26A1           ; W  # So         HIGH VOLTAGE SIGN
-26A2..26A9     ; N  # So     [8] DOUBLED FEMALE SIGN..HORIZONTAL MALE WITH STROKE SIGN
-26AA..26AB     ; W  # So     [2] MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE
-26AC..26BC     ; N  # So    [17] MEDIUM SMALL WHITE CIRCLE..SESQUIQUADRATE
-26BD..26BE     ; W  # So     [2] SOCCER BALL..BASEBALL
-26BF           ; A  # So         SQUARED KEY
-26C0..26C3     ; N  # So     [4] WHITE DRAUGHTS MAN..BLACK DRAUGHTS KING
-26C4..26C5     ; W  # So     [2] SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD
-26C6..26CD     ; A  # So     [8] RAIN..DISABLED CAR
-26CE           ; W  # So         OPHIUCHUS
-26CF..26D3     ; A  # So     [5] PICK..CHAINS
-26D4           ; W  # So         NO ENTRY
-26D5..26E1     ; A  # So    [13] ALTERNATE ONE-WAY LEFT WAY TRAFFIC..RESTRICTED LEFT ENTRY-2
-26E2           ; N  # So         ASTRONOMICAL SYMBOL FOR URANUS
-26E3           ; A  # So         HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE
-26E4..26E7     ; N  # So     [4] PENTAGRAM..INVERTED PENTAGRAM
-26E8..26E9     ; A  # So     [2] BLACK CROSS ON SHIELD..SHINTO SHRINE
-26EA           ; W  # So         CHURCH
-26EB..26F1     ; A  # So     [7] CASTLE..UMBRELLA ON GROUND
-26F2..26F3     ; W  # So     [2] FOUNTAIN..FLAG IN HOLE
-26F4           ; A  # So         FERRY
-26F5           ; W  # So         SAILBOAT
-26F6..26F9     ; A  # So     [4] SQUARE FOUR CORNERS..PERSON WITH BALL
-26FA           ; W  # So         TENT
-26FB..26FC     ; A  # So     [2] JAPANESE BANK SYMBOL..HEADSTONE GRAVEYARD SYMBOL
-26FD           ; W  # So         FUEL PUMP
-26FE..26FF     ; A  # So     [2] CUP ON BLACK SQUARE..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
-2700..2704     ; N  # So     [5] BLACK SAFETY SCISSORS..WHITE SCISSORS
-2705           ; W  # So         WHITE HEAVY CHECK MARK
-2706..2709     ; N  # So     [4] TELEPHONE LOCATION SIGN..ENVELOPE
-270A..270B     ; W  # So     [2] RAISED FIST..RAISED HAND
-270C..2727     ; N  # So    [28] VICTORY HAND..WHITE FOUR POINTED STAR
-2728           ; W  # So         SPARKLES
-2729..273C     ; N  # So    [20] STRESS OUTLINED WHITE STAR..OPEN CENTRE TEARDROP-SPOKED ASTERISK
-273D           ; A  # So         HEAVY TEARDROP-SPOKED ASTERISK
-273E..274B     ; N  # So    [14] SIX PETALLED BLACK AND WHITE FLORETTE..HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK
-274C           ; W  # So         CROSS MARK
-274D           ; N  # So         SHADOWED WHITE CIRCLE
-274E           ; W  # So         NEGATIVE SQUARED CROSS MARK
-274F..2752     ; N  # So     [4] LOWER RIGHT DROP-SHADOWED WHITE SQUARE..UPPER RIGHT SHADOWED WHITE SQUARE
-2753..2755     ; W  # So     [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT
-2756           ; N  # So         BLACK DIAMOND MINUS WHITE X
-2757           ; W  # So         HEAVY EXCLAMATION MARK SYMBOL
-2758..2767     ; N  # So    [16] LIGHT VERTICAL BAR..ROTATED FLORAL HEART BULLET
-2768           ; N  # Ps         MEDIUM LEFT PARENTHESIS ORNAMENT
-2769           ; N  # Pe         MEDIUM RIGHT PARENTHESIS ORNAMENT
-276A           ; N  # Ps         MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
-276B           ; N  # Pe         MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
-276C           ; N  # Ps         MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
-276D           ; N  # Pe         MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
-276E           ; N  # Ps         HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
-276F           ; N  # Pe         HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
-2770           ; N  # Ps         HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
-2771           ; N  # Pe         HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
-2772           ; N  # Ps         LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
-2773           ; N  # Pe         LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
-2774           ; N  # Ps         MEDIUM LEFT CURLY BRACKET ORNAMENT
-2775           ; N  # Pe         MEDIUM RIGHT CURLY BRACKET ORNAMENT
-2776..277F     ; A  # No    [10] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED NUMBER TEN
-2780..2793     ; N  # No    [20] DINGBAT CIRCLED SANS-SERIF DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN
-2794           ; N  # So         HEAVY WIDE-HEADED RIGHTWARDS ARROW
-2795..2797     ; W  # So     [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN
-2798..27AF     ; N  # So    [24] HEAVY SOUTH EAST ARROW..NOTCHED LOWER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW
-27B0           ; W  # So         CURLY LOOP
-27B1..27BE     ; N  # So    [14] NOTCHED UPPER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW..OPEN-OUTLINED RIGHTWARDS ARROW
-27BF           ; W  # So         DOUBLE CURLY LOOP
-27C0..27C4     ; N  # Sm     [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET
-27C5           ; N  # Ps         LEFT S-SHAPED BAG DELIMITER
-27C6           ; N  # Pe         RIGHT S-SHAPED BAG DELIMITER
-27C7..27E5     ; N  # Sm    [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK
-27E6           ; Na # Ps         MATHEMATICAL LEFT WHITE SQUARE BRACKET
-27E7           ; Na # Pe         MATHEMATICAL RIGHT WHITE SQUARE BRACKET
-27E8           ; Na # Ps         MATHEMATICAL LEFT ANGLE BRACKET
-27E9           ; Na # Pe         MATHEMATICAL RIGHT ANGLE BRACKET
-27EA           ; Na # Ps         MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
-27EB           ; Na # Pe         MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
-27EC           ; Na # Ps         MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
-27ED           ; Na # Pe         MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
-27EE           ; N  # Ps         MATHEMATICAL LEFT FLATTENED PARENTHESIS
-27EF           ; N  # Pe         MATHEMATICAL RIGHT FLATTENED PARENTHESIS
-27F0..27FF     ; N  # Sm    [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW
-2800..28FF     ; N  # So   [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678
-2900..297F     ; N  # Sm   [128] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..DOWN FISH TAIL
-2980..2982     ; N  # Sm     [3] TRIPLE VERTICAL BAR DELIMITER..Z NOTATION TYPE COLON
-2983           ; N  # Ps         LEFT WHITE CURLY BRACKET
-2984           ; N  # Pe         RIGHT WHITE CURLY BRACKET
-2985           ; Na # Ps         LEFT WHITE PARENTHESIS
-2986           ; Na # Pe         RIGHT WHITE PARENTHESIS
-2987           ; N  # Ps         Z NOTATION LEFT IMAGE BRACKET
-2988           ; N  # Pe         Z NOTATION RIGHT IMAGE BRACKET
-2989           ; N  # Ps         Z NOTATION LEFT BINDING BRACKET
-298A           ; N  # Pe         Z NOTATION RIGHT BINDING BRACKET
-298B           ; N  # Ps         LEFT SQUARE BRACKET WITH UNDERBAR
-298C           ; N  # Pe         RIGHT SQUARE BRACKET WITH UNDERBAR
-298D           ; N  # Ps         LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
-298E           ; N  # Pe         RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
-298F           ; N  # Ps         LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
-2990           ; N  # Pe         RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
-2991           ; N  # Ps         LEFT ANGLE BRACKET WITH DOT
-2992           ; N  # Pe         RIGHT ANGLE BRACKET WITH DOT
-2993           ; N  # Ps         LEFT ARC LESS-THAN BRACKET
-2994           ; N  # Pe         RIGHT ARC GREATER-THAN BRACKET
-2995           ; N  # Ps         DOUBLE LEFT ARC GREATER-THAN BRACKET
-2996           ; N  # Pe         DOUBLE RIGHT ARC LESS-THAN BRACKET
-2997           ; N  # Ps         LEFT BLACK TORTOISE SHELL BRACKET
-2998           ; N  # Pe         RIGHT BLACK TORTOISE SHELL BRACKET
-2999..29D7     ; N  # Sm    [63] DOTTED FENCE..BLACK HOURGLASS
-29D8           ; N  # Ps         LEFT WIGGLY FENCE
-29D9           ; N  # Pe         RIGHT WIGGLY FENCE
-29DA           ; N  # Ps         LEFT DOUBLE WIGGLY FENCE
-29DB           ; N  # Pe         RIGHT DOUBLE WIGGLY FENCE
-29DC..29FB     ; N  # Sm    [32] INCOMPLETE INFINITY..TRIPLE PLUS
-29FC           ; N  # Ps         LEFT-POINTING CURVED ANGLE BRACKET
-29FD           ; N  # Pe         RIGHT-POINTING CURVED ANGLE BRACKET
-29FE..29FF     ; N  # Sm     [2] TINY..MINY
-2A00..2AFF     ; N  # Sm   [256] N-ARY CIRCLED DOT OPERATOR..N-ARY WHITE VERTICAL BAR
-2B00..2B1A     ; N  # So    [27] NORTH EAST WHITE ARROW..DOTTED SQUARE
-2B1B..2B1C     ; W  # So     [2] BLACK LARGE SQUARE..WHITE LARGE SQUARE
-2B1D..2B2F     ; N  # So    [19] BLACK VERY SMALL SQUARE..WHITE VERTICAL ELLIPSE
-2B30..2B44     ; N  # Sm    [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET
-2B45..2B46     ; N  # So     [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW
-2B47..2B4C     ; N  # Sm     [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR
-2B4D..2B4F     ; N  # So     [3] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..SHORT BACKSLANTED SOUTH ARROW
-2B50           ; W  # So         WHITE MEDIUM STAR
-2B51..2B54     ; N  # So     [4] BLACK SMALL STAR..WHITE RIGHT-POINTING PENTAGON
-2B55           ; W  # So         HEAVY LARGE CIRCLE
-2B56..2B59     ; A  # So     [4] HEAVY OVAL WITH OVAL INSIDE..HEAVY CIRCLED SALTIRE
-2B5A..2B73     ; N  # So    [26] SLANTED NORTH ARROW WITH HOOKED HEAD..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR
-2B76..2B95     ; N  # So    [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
-2B97..2BFF     ; N  # So   [105] SYMBOL FOR TYPE A ELECTRONICS..HELLSCHREIBER PAUSE SYMBOL
-2C00..2C5F     ; N  # L&    [96] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
-2C60..2C7B     ; N  # L&    [28] LATIN CAPITAL LETTER L WITH DOUBLE BAR..LATIN LETTER SMALL CAPITAL TURNED E
-2C7C..2C7D     ; N  # Lm     [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V
-2C7E..2C7F     ; N  # Lu     [2] LATIN CAPITAL LETTER S WITH SWASH TAIL..LATIN CAPITAL LETTER Z WITH SWASH TAIL
-2C80..2CE4     ; N  # L&   [101] COPTIC CAPITAL LETTER ALFA..COPTIC SYMBOL KAI
-2CE5..2CEA     ; N  # So     [6] COPTIC SYMBOL MI RO..COPTIC SYMBOL SHIMA SIMA
-2CEB..2CEE     ; N  # L&     [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA
-2CEF..2CF1     ; N  # Mn     [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS
-2CF2..2CF3     ; N  # L&     [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI
-2CF9..2CFC     ; N  # Po     [4] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN VERSE DIVIDER
-2CFD           ; N  # No         COPTIC FRACTION ONE HALF
-2CFE..2CFF     ; N  # Po     [2] COPTIC FULL STOP..COPTIC MORPHOLOGICAL DIVIDER
-2D00..2D25     ; N  # Ll    [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE
-2D27           ; N  # Ll         GEORGIAN SMALL LETTER YN
-2D2D           ; N  # Ll         GEORGIAN SMALL LETTER AEN
-2D30..2D67     ; N  # Lo    [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO
-2D6F           ; N  # Lm         TIFINAGH MODIFIER LETTER LABIALIZATION MARK
-2D70           ; N  # Po         TIFINAGH SEPARATOR MARK
-2D7F           ; N  # Mn         TIFINAGH CONSONANT JOINER
-2D80..2D96     ; N  # Lo    [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE
-2DA0..2DA6     ; N  # Lo     [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO
-2DA8..2DAE     ; N  # Lo     [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO
-2DB0..2DB6     ; N  # Lo     [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO
-2DB8..2DBE     ; N  # Lo     [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO
-2DC0..2DC6     ; N  # Lo     [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO
-2DC8..2DCE     ; N  # Lo     [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO
-2DD0..2DD6     ; N  # Lo     [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO
-2DD8..2DDE     ; N  # Lo     [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO
-2DE0..2DFF     ; N  # Mn    [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
-2E00..2E01     ; N  # Po     [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
-2E02           ; N  # Pi         LEFT SUBSTITUTION BRACKET
-2E03           ; N  # Pf         RIGHT SUBSTITUTION BRACKET
-2E04           ; N  # Pi         LEFT DOTTED SUBSTITUTION BRACKET
-2E05           ; N  # Pf         RIGHT DOTTED SUBSTITUTION BRACKET
-2E06..2E08     ; N  # Po     [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER
-2E09           ; N  # Pi         LEFT TRANSPOSITION BRACKET
-2E0A           ; N  # Pf         RIGHT TRANSPOSITION BRACKET
-2E0B           ; N  # Po         RAISED SQUARE
-2E0C           ; N  # Pi         LEFT RAISED OMISSION BRACKET
-2E0D           ; N  # Pf         RIGHT RAISED OMISSION BRACKET
-2E0E..2E16     ; N  # Po     [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE
-2E17           ; N  # Pd         DOUBLE OBLIQUE HYPHEN
-2E18..2E19     ; N  # Po     [2] INVERTED INTERROBANG..PALM BRANCH
-2E1A           ; N  # Pd         HYPHEN WITH DIAERESIS
-2E1B           ; N  # Po         TILDE WITH RING ABOVE
-2E1C           ; N  # Pi         LEFT LOW PARAPHRASE BRACKET
-2E1D           ; N  # Pf         RIGHT LOW PARAPHRASE BRACKET
-2E1E..2E1F     ; N  # Po     [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW
-2E20           ; N  # Pi         LEFT VERTICAL BAR WITH QUILL
-2E21           ; N  # Pf         RIGHT VERTICAL BAR WITH QUILL
-2E22           ; N  # Ps         TOP LEFT HALF BRACKET
-2E23           ; N  # Pe         TOP RIGHT HALF BRACKET
-2E24           ; N  # Ps         BOTTOM LEFT HALF BRACKET
-2E25           ; N  # Pe         BOTTOM RIGHT HALF BRACKET
-2E26           ; N  # Ps         LEFT SIDEWAYS U BRACKET
-2E27           ; N  # Pe         RIGHT SIDEWAYS U BRACKET
-2E28           ; N  # Ps         LEFT DOUBLE PARENTHESIS
-2E29           ; N  # Pe         RIGHT DOUBLE PARENTHESIS
-2E2A..2E2E     ; N  # Po     [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK
-2E2F           ; N  # Lm         VERTICAL TILDE
-2E30..2E39     ; N  # Po    [10] RING POINT..TOP HALF SECTION SIGN
-2E3A..2E3B     ; N  # Pd     [2] TWO-EM DASH..THREE-EM DASH
-2E3C..2E3F     ; N  # Po     [4] STENOGRAPHIC FULL STOP..CAPITULUM
-2E40           ; N  # Pd         DOUBLE HYPHEN
-2E41           ; N  # Po         REVERSED COMMA
-2E42           ; N  # Ps         DOUBLE LOW-REVERSED-9 QUOTATION MARK
-2E43..2E4F     ; N  # Po    [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER
-2E50..2E51     ; N  # So     [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR
-2E52..2E54     ; N  # Po     [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK
-2E55           ; N  # Ps         LEFT SQUARE BRACKET WITH STROKE
-2E56           ; N  # Pe         RIGHT SQUARE BRACKET WITH STROKE
-2E57           ; N  # Ps         LEFT SQUARE BRACKET WITH DOUBLE STROKE
-2E58           ; N  # Pe         RIGHT SQUARE BRACKET WITH DOUBLE STROKE
-2E59           ; N  # Ps         TOP HALF LEFT PARENTHESIS
-2E5A           ; N  # Pe         TOP HALF RIGHT PARENTHESIS
-2E5B           ; N  # Ps         BOTTOM HALF LEFT PARENTHESIS
-2E5C           ; N  # Pe         BOTTOM HALF RIGHT PARENTHESIS
-2E5D           ; N  # Pd         OBLIQUE HYPHEN
-2E80..2E99     ; W  # So    [26] CJK RADICAL REPEAT..CJK RADICAL RAP
-2E9B..2EF3     ; W  # So    [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE
-2F00..2FD5     ; W  # So   [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE
-2FF0..2FFF     ; W  # So    [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION
-3000           ; F  # Zs         IDEOGRAPHIC SPACE
-3001..3003     ; W  # Po     [3] IDEOGRAPHIC COMMA..DITTO MARK
-3004           ; W  # So         JAPANESE INDUSTRIAL STANDARD SYMBOL
-3005           ; W  # Lm         IDEOGRAPHIC ITERATION MARK
-3006           ; W  # Lo         IDEOGRAPHIC CLOSING MARK
-3007           ; W  # Nl         IDEOGRAPHIC NUMBER ZERO
-3008           ; W  # Ps         LEFT ANGLE BRACKET
-3009           ; W  # Pe         RIGHT ANGLE BRACKET
-300A           ; W  # Ps         LEFT DOUBLE ANGLE BRACKET
-300B           ; W  # Pe         RIGHT DOUBLE ANGLE BRACKET
-300C           ; W  # Ps         LEFT CORNER BRACKET
-300D           ; W  # Pe         RIGHT CORNER BRACKET
-300E           ; W  # Ps         LEFT WHITE CORNER BRACKET
-300F           ; W  # Pe         RIGHT WHITE CORNER BRACKET
-3010           ; W  # Ps         LEFT BLACK LENTICULAR BRACKET
-3011           ; W  # Pe         RIGHT BLACK LENTICULAR BRACKET
-3012..3013     ; W  # So     [2] POSTAL MARK..GETA MARK
-3014           ; W  # Ps         LEFT TORTOISE SHELL BRACKET
-3015           ; W  # Pe         RIGHT TORTOISE SHELL BRACKET
-3016           ; W  # Ps         LEFT WHITE LENTICULAR BRACKET
-3017           ; W  # Pe         RIGHT WHITE LENTICULAR BRACKET
-3018           ; W  # Ps         LEFT WHITE TORTOISE SHELL BRACKET
-3019           ; W  # Pe         RIGHT WHITE TORTOISE SHELL BRACKET
-301A           ; W  # Ps         LEFT WHITE SQUARE BRACKET
-301B           ; W  # Pe         RIGHT WHITE SQUARE BRACKET
-301C           ; W  # Pd         WAVE DASH
-301D           ; W  # Ps         REVERSED DOUBLE PRIME QUOTATION MARK
-301E..301F     ; W  # Pe     [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
-3020           ; W  # So         POSTAL MARK FACE
-3021..3029     ; W  # Nl     [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
-302A..302D     ; W  # Mn     [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
-302E..302F     ; W  # Mc     [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
-3030           ; W  # Pd         WAVY DASH
-3031..3035     ; W  # Lm     [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
-3036..3037     ; W  # So     [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
-3038..303A     ; W  # Nl     [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
-303B           ; W  # Lm         VERTICAL IDEOGRAPHIC ITERATION MARK
-303C           ; W  # Lo         MASU MARK
-303D           ; W  # Po         PART ALTERNATION MARK
-303E           ; W  # So         IDEOGRAPHIC VARIATION INDICATOR
-303F           ; N  # So         IDEOGRAPHIC HALF FILL SPACE
-3041..3096     ; W  # Lo    [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
-3099..309A     ; W  # Mn     [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
-309B..309C     ; W  # Sk     [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
-309D..309E     ; W  # Lm     [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
-309F           ; W  # Lo         HIRAGANA DIGRAPH YORI
-30A0           ; W  # Pd         KATAKANA-HIRAGANA DOUBLE HYPHEN
-30A1..30FA     ; W  # Lo    [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO
-30FB           ; W  # Po         KATAKANA MIDDLE DOT
-30FC..30FE     ; W  # Lm     [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK
-30FF           ; W  # Lo         KATAKANA DIGRAPH KOTO
-3105..312F     ; W  # Lo    [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN
-3131..318E     ; W  # Lo    [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
-3190..3191     ; W  # So     [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
-3192..3195     ; W  # No     [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
-3196..319F     ; W  # So    [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
-31A0..31BF     ; W  # Lo    [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
-31C0..31E3     ; W  # So    [36] CJK STROKE T..CJK STROKE Q
-31EF           ; W  # So         IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION
-31F0..31FF     ; W  # Lo    [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
-3200..321E     ; W  # So    [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU
-3220..3229     ; W  # No    [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
-322A..3247     ; W  # So    [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
-3248..324F     ; A  # No     [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE
-3250           ; W  # So         PARTNERSHIP SIGN
-3251..325F     ; W  # No    [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE
-3260..327F     ; W  # So    [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL
-3280..3289     ; W  # No    [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
-328A..32B0     ; W  # So    [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
-32B1..32BF     ; W  # No    [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
-32C0..32FF     ; W  # So    [64] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA
-3300..33FF     ; W  # So   [256] SQUARE APAATO..SQUARE GAL
-3400..4DBF     ; W  # Lo  [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
-4DC0..4DFF     ; N  # So    [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION
-4E00..9FFF     ; W  # Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
-A000..A014     ; W  # Lo    [21] YI SYLLABLE IT..YI SYLLABLE E
-A015           ; W  # Lm         YI SYLLABLE WU
-A016..A48C     ; W  # Lo  [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
-A490..A4C6     ; W  # So    [55] YI RADICAL QOT..YI RADICAL KE
-A4D0..A4F7     ; N  # Lo    [40] LISU LETTER BA..LISU LETTER OE
-A4F8..A4FD     ; N  # Lm     [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU
-A4FE..A4FF     ; N  # Po     [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP
-A500..A60B     ; N  # Lo   [268] VAI SYLLABLE EE..VAI SYLLABLE NG
-A60C           ; N  # Lm         VAI SYLLABLE LENGTHENER
-A60D..A60F     ; N  # Po     [3] VAI COMMA..VAI QUESTION MARK
-A610..A61F     ; N  # Lo    [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG
-A620..A629     ; N  # Nd    [10] VAI DIGIT ZERO..VAI DIGIT NINE
-A62A..A62B     ; N  # Lo     [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO
-A640..A66D     ; N  # L&    [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O
-A66E           ; N  # Lo         CYRILLIC LETTER MULTIOCULAR O
-A66F           ; N  # Mn         COMBINING CYRILLIC VZMET
-A670..A672     ; N  # Me     [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN
-A673           ; N  # Po         SLAVONIC ASTERISK
-A674..A67D     ; N  # Mn    [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK
-A67E           ; N  # Po         CYRILLIC KAVYKA
-A67F           ; N  # Lm         CYRILLIC PAYEROK
-A680..A69B     ; N  # L&    [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O
-A69C..A69D     ; N  # Lm     [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
-A69E..A69F     ; N  # Mn     [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E
-A6A0..A6E5     ; N  # Lo    [70] BAMUM LETTER A..BAMUM LETTER KI
-A6E6..A6EF     ; N  # Nl    [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM
-A6F0..A6F1     ; N  # Mn     [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS
-A6F2..A6F7     ; N  # Po     [6] BAMUM NJAEMLI..BAMUM QUESTION MARK
-A700..A716     ; N  # Sk    [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR
-A717..A71F     ; N  # Lm     [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
-A720..A721     ; N  # Sk     [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE
-A722..A76F     ; N  # L&    [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON
-A770           ; N  # Lm         MODIFIER LETTER US
-A771..A787     ; N  # L&    [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
-A788           ; N  # Lm         MODIFIER LETTER LOW CIRCUMFLEX ACCENT
-A789..A78A     ; N  # Sk     [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN
-A78B..A78E     ; N  # L&     [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
-A78F           ; N  # Lo         LATIN LETTER SINOLOGICAL DOT
-A790..A7CA     ; N  # L&    [59] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
-A7D0..A7D1     ; N  # L&     [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G
-A7D3           ; N  # Ll         LATIN SMALL LETTER DOUBLE THORN
-A7D5..A7D9     ; N  # L&     [5] LATIN SMALL LETTER DOUBLE WYNN..LATIN SMALL LETTER SIGMOID S
-A7F2..A7F4     ; N  # Lm     [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
-A7F5..A7F6     ; N  # L&     [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H
-A7F7           ; N  # Lo         LATIN EPIGRAPHIC LETTER SIDEWAYS I
-A7F8..A7F9     ; N  # Lm     [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
-A7FA           ; N  # Ll         LATIN LETTER SMALL CAPITAL TURNED M
-A7FB..A7FF     ; N  # Lo     [5] LATIN EPIGRAPHIC LETTER REVERSED F..LATIN EPIGRAPHIC LETTER ARCHAIC M
-A800..A801     ; N  # Lo     [2] SYLOTI NAGRI LETTER A..SYLOTI NAGRI LETTER I
-A802           ; N  # Mn         SYLOTI NAGRI SIGN DVISVARA
-A803..A805     ; N  # Lo     [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O
-A806           ; N  # Mn         SYLOTI NAGRI SIGN HASANTA
-A807..A80A     ; N  # Lo     [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO
-A80B           ; N  # Mn         SYLOTI NAGRI SIGN ANUSVARA
-A80C..A822     ; N  # Lo    [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO
-A823..A824     ; N  # Mc     [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I
-A825..A826     ; N  # Mn     [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E
-A827           ; N  # Mc         SYLOTI NAGRI VOWEL SIGN OO
-A828..A82B     ; N  # So     [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-4
-A82C           ; N  # Mn         SYLOTI NAGRI SIGN ALTERNATE HASANTA
-A830..A835     ; N  # No     [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS
-A836..A837     ; N  # So     [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
-A838           ; N  # Sc         NORTH INDIC RUPEE MARK
-A839           ; N  # So         NORTH INDIC QUANTITY MARK
-A840..A873     ; N  # Lo    [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU
-A874..A877     ; N  # Po     [4] PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOUBLE SHAD
-A880..A881     ; N  # Mc     [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA
-A882..A8B3     ; N  # Lo    [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA
-A8B4..A8C3     ; N  # Mc    [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU
-A8C4..A8C5     ; N  # Mn     [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU
-A8CE..A8CF     ; N  # Po     [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
-A8D0..A8D9     ; N  # Nd    [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE
-A8E0..A8F1     ; N  # Mn    [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA
-A8F2..A8F7     ; N  # Lo     [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA
-A8F8..A8FA     ; N  # Po     [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET
-A8FB           ; N  # Lo         DEVANAGARI HEADSTROKE
-A8FC           ; N  # Po         DEVANAGARI SIGN SIDDHAM
-A8FD..A8FE     ; N  # Lo     [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY
-A8FF           ; N  # Mn         DEVANAGARI VOWEL SIGN AY
-A900..A909     ; N  # Nd    [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE
-A90A..A925     ; N  # Lo    [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO
-A926..A92D     ; N  # Mn     [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU
-A92E..A92F     ; N  # Po     [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA
-A930..A946     ; N  # Lo    [23] REJANG LETTER KA..REJANG LETTER A
-A947..A951     ; N  # Mn    [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R
-A952..A953     ; N  # Mc     [2] REJANG CONSONANT SIGN H..REJANG VIRAMA
-A95F           ; N  # Po         REJANG SECTION MARK
-A960..A97C     ; W  # Lo    [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH
-A980..A982     ; N  # Mn     [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR
-A983           ; N  # Mc         JAVANESE SIGN WIGNYAN
-A984..A9B2     ; N  # Lo    [47] JAVANESE LETTER A..JAVANESE LETTER HA
-A9B3           ; N  # Mn         JAVANESE SIGN CECAK TELU
-A9B4..A9B5     ; N  # Mc     [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG
-A9B6..A9B9     ; N  # Mn     [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT
-A9BA..A9BB     ; N  # Mc     [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE
-A9BC..A9BD     ; N  # Mn     [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET
-A9BE..A9C0     ; N  # Mc     [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON
-A9C1..A9CD     ; N  # Po    [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH
-A9CF           ; N  # Lm         JAVANESE PANGRANGKEP
-A9D0..A9D9     ; N  # Nd    [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE
-A9DE..A9DF     ; N  # Po     [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN
-A9E0..A9E4     ; N  # Lo     [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA
-A9E5           ; N  # Mn         MYANMAR SIGN SHAN SAW
-A9E6           ; N  # Lm         MYANMAR MODIFIER LETTER SHAN REDUPLICATION
-A9E7..A9EF     ; N  # Lo     [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA
-A9F0..A9F9     ; N  # Nd    [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE
-A9FA..A9FE     ; N  # Lo     [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA
-AA00..AA28     ; N  # Lo    [41] CHAM LETTER A..CHAM LETTER HA
-AA29..AA2E     ; N  # Mn     [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE
-AA2F..AA30     ; N  # Mc     [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI
-AA31..AA32     ; N  # Mn     [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE
-AA33..AA34     ; N  # Mc     [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA
-AA35..AA36     ; N  # Mn     [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA
-AA40..AA42     ; N  # Lo     [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG
-AA43           ; N  # Mn         CHAM CONSONANT SIGN FINAL NG
-AA44..AA4B     ; N  # Lo     [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS
-AA4C           ; N  # Mn         CHAM CONSONANT SIGN FINAL M
-AA4D           ; N  # Mc         CHAM CONSONANT SIGN FINAL H
-AA50..AA59     ; N  # Nd    [10] CHAM DIGIT ZERO..CHAM DIGIT NINE
-AA5C..AA5F     ; N  # Po     [4] CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TRIPLE DANDA
-AA60..AA6F     ; N  # Lo    [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA
-AA70           ; N  # Lm         MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
-AA71..AA76     ; N  # Lo     [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM
-AA77..AA79     ; N  # So     [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
-AA7A           ; N  # Lo         MYANMAR LETTER AITON RA
-AA7B           ; N  # Mc         MYANMAR SIGN PAO KAREN TONE
-AA7C           ; N  # Mn         MYANMAR SIGN TAI LAING TONE-2
-AA7D           ; N  # Mc         MYANMAR SIGN TAI LAING TONE-5
-AA7E..AA7F     ; N  # Lo     [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA
-AA80..AAAF     ; N  # Lo    [48] TAI VIET LETTER LOW KO..TAI VIET LETTER HIGH O
-AAB0           ; N  # Mn         TAI VIET MAI KANG
-AAB1           ; N  # Lo         TAI VIET VOWEL AA
-AAB2..AAB4     ; N  # Mn     [3] TAI VIET VOWEL I..TAI VIET VOWEL U
-AAB5..AAB6     ; N  # Lo     [2] TAI VIET VOWEL E..TAI VIET VOWEL O
-AAB7..AAB8     ; N  # Mn     [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA
-AAB9..AABD     ; N  # Lo     [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN
-AABE..AABF     ; N  # Mn     [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK
-AAC0           ; N  # Lo         TAI VIET TONE MAI NUENG
-AAC1           ; N  # Mn         TAI VIET TONE MAI THO
-AAC2           ; N  # Lo         TAI VIET TONE MAI SONG
-AADB..AADC     ; N  # Lo     [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG
-AADD           ; N  # Lm         TAI VIET SYMBOL SAM
-AADE..AADF     ; N  # Po     [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI
-AAE0..AAEA     ; N  # Lo    [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA
-AAEB           ; N  # Mc         MEETEI MAYEK VOWEL SIGN II
-AAEC..AAED     ; N  # Mn     [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI
-AAEE..AAEF     ; N  # Mc     [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU
-AAF0..AAF1     ; N  # Po     [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
-AAF2           ; N  # Lo         MEETEI MAYEK ANJI
-AAF3..AAF4     ; N  # Lm     [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK
-AAF5           ; N  # Mc         MEETEI MAYEK VOWEL SIGN VISARGA
-AAF6           ; N  # Mn         MEETEI MAYEK VIRAMA
-AB01..AB06     ; N  # Lo     [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO
-AB09..AB0E     ; N  # Lo     [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO
-AB11..AB16     ; N  # Lo     [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
-AB20..AB26     ; N  # Lo     [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
-AB28..AB2E     ; N  # Lo     [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
-AB30..AB5A     ; N  # Ll    [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
-AB5B           ; N  # Sk         MODIFIER BREVE WITH INVERTED BREVE
-AB5C..AB5F     ; N  # Lm     [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
-AB60..AB68     ; N  # Ll     [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
-AB69           ; N  # Lm         MODIFIER LETTER SMALL TURNED W
-AB6A..AB6B     ; N  # Sk     [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK
-AB70..ABBF     ; N  # Ll    [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
-ABC0..ABE2     ; N  # Lo    [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
-ABE3..ABE4     ; N  # Mc     [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP
-ABE5           ; N  # Mn         MEETEI MAYEK VOWEL SIGN ANAP
-ABE6..ABE7     ; N  # Mc     [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP
-ABE8           ; N  # Mn         MEETEI MAYEK VOWEL SIGN UNAP
-ABE9..ABEA     ; N  # Mc     [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG
-ABEB           ; N  # Po         MEETEI MAYEK CHEIKHEI
-ABEC           ; N  # Mc         MEETEI MAYEK LUM IYEK
-ABED           ; N  # Mn         MEETEI MAYEK APUN IYEK
-ABF0..ABF9     ; N  # Nd    [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE
-AC00..D7A3     ; W  # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
-D7B0..D7C6     ; N  # Lo    [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E
-D7CB..D7FB     ; N  # Lo    [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH
-D800..DB7F     ; N  # Cs   [896] <surrogate-D800>..<surrogate-DB7F>
-DB80..DBFF     ; N  # Cs   [128] <surrogate-DB80>..<surrogate-DBFF>
-DC00..DFFF     ; N  # Cs  [1024] <surrogate-DC00>..<surrogate-DFFF>
-E000..F8FF     ; A  # Co  [6400] <private-use-E000>..<private-use-F8FF>
-F900..FA6D     ; W  # Lo   [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
-FA6E..FA6F     ; W  # Cn     [2] <reserved-FA6E>..<reserved-FA6F>
-FA70..FAD9     ; W  # Lo   [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
-FADA..FAFF     ; W  # Cn    [38] <reserved-FADA>..<reserved-FAFF>
-FB00..FB06     ; N  # Ll     [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
-FB13..FB17     ; N  # Ll     [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
-FB1D           ; N  # Lo         HEBREW LETTER YOD WITH HIRIQ
-FB1E           ; N  # Mn         HEBREW POINT JUDEO-SPANISH VARIKA
-FB1F..FB28     ; N  # Lo    [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV
-FB29           ; N  # Sm         HEBREW LETTER ALTERNATIVE PLUS SIGN
-FB2A..FB36     ; N  # Lo    [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH
-FB38..FB3C     ; N  # Lo     [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH
-FB3E           ; N  # Lo         HEBREW LETTER MEM WITH DAGESH
-FB40..FB41     ; N  # Lo     [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH
-FB43..FB44     ; N  # Lo     [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH
-FB46..FB4F     ; N  # Lo    [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATURE ALEF LAMED
-FB50..FBB1     ; N  # Lo    [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
-FBB2..FBC2     ; N  # Sk    [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE
-FBD3..FD3D     ; N  # Lo   [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
-FD3E           ; N  # Pe         ORNATE LEFT PARENTHESIS
-FD3F           ; N  # Ps         ORNATE RIGHT PARENTHESIS
-FD40..FD4F     ; N  # So    [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH
-FD50..FD8F     ; N  # Lo    [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
-FD92..FDC7     ; N  # Lo    [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
-FDCF           ; N  # So         ARABIC LIGATURE SALAAMUHU ALAYNAA
-FDF0..FDFB     ; N  # Lo    [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
-FDFC           ; N  # Sc         RIAL SIGN
-FDFD..FDFF     ; N  # So     [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL
-FE00..FE0F     ; A  # Mn    [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
-FE10..FE16     ; W  # Po     [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
-FE17           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
-FE18           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
-FE19           ; W  # Po         PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
-FE20..FE2F     ; N  # Mn    [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF
-FE30           ; W  # Po         PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
-FE31..FE32     ; W  # Pd     [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH
-FE33..FE34     ; W  # Pc     [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
-FE35           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
-FE36           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
-FE37           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
-FE38           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
-FE39           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
-FE3A           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
-FE3B           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
-FE3C           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
-FE3D           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
-FE3E           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
-FE3F           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
-FE40           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
-FE41           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
-FE42           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
-FE43           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
-FE44           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
-FE45..FE46     ; W  # Po     [2] SESAME DOT..WHITE SESAME DOT
-FE47           ; W  # Ps         PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
-FE48           ; W  # Pe         PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
-FE49..FE4C     ; W  # Po     [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE
-FE4D..FE4F     ; W  # Pc     [3] DASHED LOW LINE..WAVY LOW LINE
-FE50..FE52     ; W  # Po     [3] SMALL COMMA..SMALL FULL STOP
-FE54..FE57     ; W  # Po     [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK
-FE58           ; W  # Pd         SMALL EM DASH
-FE59           ; W  # Ps         SMALL LEFT PARENTHESIS
-FE5A           ; W  # Pe         SMALL RIGHT PARENTHESIS
-FE5B           ; W  # Ps         SMALL LEFT CURLY BRACKET
-FE5C           ; W  # Pe         SMALL RIGHT CURLY BRACKET
-FE5D           ; W  # Ps         SMALL LEFT TORTOISE SHELL BRACKET
-FE5E           ; W  # Pe         SMALL RIGHT TORTOISE SHELL BRACKET
-FE5F..FE61     ; W  # Po     [3] SMALL NUMBER SIGN..SMALL ASTERISK
-FE62           ; W  # Sm         SMALL PLUS SIGN
-FE63           ; W  # Pd         SMALL HYPHEN-MINUS
-FE64..FE66     ; W  # Sm     [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN
-FE68           ; W  # Po         SMALL REVERSE SOLIDUS
-FE69           ; W  # Sc         SMALL DOLLAR SIGN
-FE6A..FE6B     ; W  # Po     [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT
-FE70..FE74     ; N  # Lo     [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
-FE76..FEFC     ; N  # Lo   [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
-FEFF           ; N  # Cf         ZERO WIDTH NO-BREAK SPACE
-FF01..FF03     ; F  # Po     [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN
-FF04           ; F  # Sc         FULLWIDTH DOLLAR SIGN
-FF05..FF07     ; F  # Po     [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE
-FF08           ; F  # Ps         FULLWIDTH LEFT PARENTHESIS
-FF09           ; F  # Pe         FULLWIDTH RIGHT PARENTHESIS
-FF0A           ; F  # Po         FULLWIDTH ASTERISK
-FF0B           ; F  # Sm         FULLWIDTH PLUS SIGN
-FF0C           ; F  # Po         FULLWIDTH COMMA
-FF0D           ; F  # Pd         FULLWIDTH HYPHEN-MINUS
-FF0E..FF0F     ; F  # Po     [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS
-FF10..FF19     ; F  # Nd    [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
-FF1A..FF1B     ; F  # Po     [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON
-FF1C..FF1E     ; F  # Sm     [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN
-FF1F..FF20     ; F  # Po     [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT
-FF21..FF3A     ; F  # Lu    [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
-FF3B           ; F  # Ps         FULLWIDTH LEFT SQUARE BRACKET
-FF3C           ; F  # Po         FULLWIDTH REVERSE SOLIDUS
-FF3D           ; F  # Pe         FULLWIDTH RIGHT SQUARE BRACKET
-FF3E           ; F  # Sk         FULLWIDTH CIRCUMFLEX ACCENT
-FF3F           ; F  # Pc         FULLWIDTH LOW LINE
-FF40           ; F  # Sk         FULLWIDTH GRAVE ACCENT
-FF41..FF5A     ; F  # Ll    [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
-FF5B           ; F  # Ps         FULLWIDTH LEFT CURLY BRACKET
-FF5C           ; F  # Sm         FULLWIDTH VERTICAL LINE
-FF5D           ; F  # Pe         FULLWIDTH RIGHT CURLY BRACKET
-FF5E           ; F  # Sm         FULLWIDTH TILDE
-FF5F           ; F  # Ps         FULLWIDTH LEFT WHITE PARENTHESIS
-FF60           ; F  # Pe         FULLWIDTH RIGHT WHITE PARENTHESIS
-FF61           ; H  # Po         HALFWIDTH IDEOGRAPHIC FULL STOP
-FF62           ; H  # Ps         HALFWIDTH LEFT CORNER BRACKET
-FF63           ; H  # Pe         HALFWIDTH RIGHT CORNER BRACKET
-FF64..FF65     ; H  # Po     [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
-FF66..FF6F     ; H  # Lo    [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU
-FF70           ; H  # Lm         HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
-FF71..FF9D     ; H  # Lo    [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N
-FF9E..FF9F     ; H  # Lm     [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
-FFA0..FFBE     ; H  # Lo    [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH
-FFC2..FFC7     ; H  # Lo     [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E
-FFCA..FFCF     ; H  # Lo     [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE
-FFD2..FFD7     ; H  # Lo     [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU
-FFDA..FFDC     ; H  # Lo     [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
-FFE0..FFE1     ; F  # Sc     [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN
-FFE2           ; F  # Sm         FULLWIDTH NOT SIGN
-FFE3           ; F  # Sk         FULLWIDTH MACRON
-FFE4           ; F  # So         FULLWIDTH BROKEN BAR
-FFE5..FFE6     ; F  # Sc     [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
-FFE8           ; H  # So         HALFWIDTH FORMS LIGHT VERTICAL
-FFE9..FFEC     ; H  # Sm     [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW
-FFED..FFEE     ; H  # So     [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE
-FFF9..FFFB     ; N  # Cf     [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
-FFFC           ; N  # So         OBJECT REPLACEMENT CHARACTER
-FFFD           ; A  # So         REPLACEMENT CHARACTER
-10000..1000B   ; N  # Lo    [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE
-1000D..10026   ; N  # Lo    [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO
-10028..1003A   ; N  # Lo    [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO
-1003C..1003D   ; N  # Lo     [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE
-1003F..1004D   ; N  # Lo    [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO
-10050..1005D   ; N  # Lo    [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089
-10080..100FA   ; N  # Lo   [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305
-10100..10102   ; N  # Po     [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
-10107..10133   ; N  # No    [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
-10137..1013F   ; N  # So     [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
-10140..10174   ; N  # Nl    [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS
-10175..10178   ; N  # No     [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN
-10179..10189   ; N  # So    [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN
-1018A..1018B   ; N  # No     [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN
-1018C..1018E   ; N  # So     [3] GREEK SINUSOID SIGN..NOMISMA SIGN
-10190..1019C   ; N  # So    [13] ROMAN SEXTANS SIGN..ASCIA SYMBOL
-101A0          ; N  # So         GREEK SYMBOL TAU RHO
-101D0..101FC   ; N  # So    [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND
-101FD          ; N  # Mn         PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
-10280..1029C   ; N  # Lo    [29] LYCIAN LETTER A..LYCIAN LETTER X
-102A0..102D0   ; N  # Lo    [49] CARIAN LETTER A..CARIAN LETTER UUU3
-102E0          ; N  # Mn         COPTIC EPACT THOUSANDS MARK
-102E1..102FB   ; N  # No    [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
-10300..1031F   ; N  # Lo    [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
-10320..10323   ; N  # No     [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY
-1032D..1032F   ; N  # Lo     [3] OLD ITALIC LETTER YE..OLD ITALIC LETTER SOUTHERN TSE
-10330..10340   ; N  # Lo    [17] GOTHIC LETTER AHSA..GOTHIC LETTER PAIRTHRA
-10341          ; N  # Nl         GOTHIC LETTER NINETY
-10342..10349   ; N  # Lo     [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL
-1034A          ; N  # Nl         GOTHIC LETTER NINE HUNDRED
-10350..10375   ; N  # Lo    [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA
-10376..1037A   ; N  # Mn     [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
-10380..1039D   ; N  # Lo    [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU
-1039F          ; N  # Po         UGARITIC WORD DIVIDER
-103A0..103C3   ; N  # Lo    [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA
-103C8..103CF   ; N  # Lo     [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH
-103D0          ; N  # Po         OLD PERSIAN WORD DIVIDER
-103D1..103D5   ; N  # Nl     [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED
-10400..1044F   ; N  # L&    [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW
-10450..1047F   ; N  # Lo    [48] SHAVIAN LETTER PEEP..SHAVIAN LETTER YEW
-10480..1049D   ; N  # Lo    [30] OSMANYA LETTER ALEF..OSMANYA LETTER OO
-104A0..104A9   ; N  # Nd    [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE
-104B0..104D3   ; N  # Lu    [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
-104D8..104FB   ; N  # Ll    [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
-10500..10527   ; N  # Lo    [40] ELBASAN LETTER A..ELBASAN LETTER KHE
-10530..10563   ; N  # Lo    [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW
-1056F          ; N  # Po         CAUCASIAN ALBANIAN CITATION MARK
-10570..1057A   ; N  # Lu    [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
-1057C..1058A   ; N  # Lu    [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
-1058C..10592   ; N  # Lu     [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
-10594..10595   ; N  # Lu     [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
-10597..105A1   ; N  # Ll    [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
-105A3..105B1   ; N  # Ll    [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
-105B3..105B9   ; N  # Ll     [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
-105BB..105BC   ; N  # Ll     [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
-10600..10736   ; N  # Lo   [311] LINEAR A SIGN AB001..LINEAR A SIGN A664
-10740..10755   ; N  # Lo    [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE
-10760..10767   ; N  # Lo     [8] LINEAR A SIGN A800..LINEAR A SIGN A807
-10780..10785   ; N  # Lm     [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK
-10787..107B0   ; N  # Lm    [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK
-107B2..107BA   ; N  # Lm     [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL
-10800..10805   ; N  # Lo     [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA
-10808          ; N  # Lo         CYPRIOT SYLLABLE JO
-1080A..10835   ; N  # Lo    [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO
-10837..10838   ; N  # Lo     [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE
-1083C          ; N  # Lo         CYPRIOT SYLLABLE ZA
-1083F          ; N  # Lo         CYPRIOT SYLLABLE ZO
-10840..10855   ; N  # Lo    [22] IMPERIAL ARAMAIC LETTER ALEPH..IMPERIAL ARAMAIC LETTER TAW
-10857          ; N  # Po         IMPERIAL ARAMAIC SECTION SIGN
-10858..1085F   ; N  # No     [8] IMPERIAL ARAMAIC NUMBER ONE..IMPERIAL ARAMAIC NUMBER TEN THOUSAND
-10860..10876   ; N  # Lo    [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW
-10877..10878   ; N  # So     [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON
-10879..1087F   ; N  # No     [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY
-10880..1089E   ; N  # Lo    [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW
-108A7..108AF   ; N  # No     [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED
-108E0..108F2   ; N  # Lo    [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH
-108F4..108F5   ; N  # Lo     [2] HATRAN LETTER SHIN..HATRAN LETTER TAW
-108FB..108FF   ; N  # No     [5] HATRAN NUMBER ONE..HATRAN NUMBER ONE HUNDRED
-10900..10915   ; N  # Lo    [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU
-10916..1091B   ; N  # No     [6] PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THREE
-1091F          ; N  # Po         PHOENICIAN WORD SEPARATOR
-10920..10939   ; N  # Lo    [26] LYDIAN LETTER A..LYDIAN LETTER C
-1093F          ; N  # Po         LYDIAN TRIANGULAR MARK
-10980..1099F   ; N  # Lo    [32] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC HIEROGLYPHIC SYMBOL VIDJ-2
-109A0..109B7   ; N  # Lo    [24] MEROITIC CURSIVE LETTER A..MEROITIC CURSIVE LETTER DA
-109BC..109BD   ; N  # No     [2] MEROITIC CURSIVE FRACTION ELEVEN TWELFTHS..MEROITIC CURSIVE FRACTION ONE HALF
-109BE..109BF   ; N  # Lo     [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN
-109C0..109CF   ; N  # No    [16] MEROITIC CURSIVE NUMBER ONE..MEROITIC CURSIVE NUMBER SEVENTY
-109D2..109FF   ; N  # No    [46] MEROITIC CURSIVE NUMBER ONE HUNDRED..MEROITIC CURSIVE FRACTION TEN TWELFTHS
-10A00          ; N  # Lo         KHAROSHTHI LETTER A
-10A01..10A03   ; N  # Mn     [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R
-10A05..10A06   ; N  # Mn     [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O
-10A0C..10A0F   ; N  # Mn     [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
-10A10..10A13   ; N  # Lo     [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA
-10A15..10A17   ; N  # Lo     [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA
-10A19..10A35   ; N  # Lo    [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA
-10A38..10A3A   ; N  # Mn     [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
-10A3F          ; N  # Mn         KHAROSHTHI VIRAMA
-10A40..10A48   ; N  # No     [9] KHAROSHTHI DIGIT ONE..KHAROSHTHI FRACTION ONE HALF
-10A50..10A58   ; N  # Po     [9] KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCTUATION LINES
-10A60..10A7C   ; N  # Lo    [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH
-10A7D..10A7E   ; N  # No     [2] OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARABIAN NUMBER FIFTY
-10A7F          ; N  # Po         OLD SOUTH ARABIAN NUMERIC INDICATOR
-10A80..10A9C   ; N  # Lo    [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH
-10A9D..10A9F   ; N  # No     [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY
-10AC0..10AC7   ; N  # Lo     [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW
-10AC8          ; N  # So         MANICHAEAN SIGN UD
-10AC9..10AE4   ; N  # Lo    [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW
-10AE5..10AE6   ; N  # Mn     [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
-10AEB..10AEF   ; N  # No     [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED
-10AF0..10AF6   ; N  # Po     [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER
-10B00..10B35   ; N  # Lo    [54] AVESTAN LETTER A..AVESTAN LETTER HE
-10B39..10B3F   ; N  # Po     [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION
-10B40..10B55   ; N  # Lo    [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW
-10B58..10B5F   ; N  # No     [8] INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND
-10B60..10B72   ; N  # Lo    [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW
-10B78..10B7F   ; N  # No     [8] INSCRIPTIONAL PAHLAVI NUMBER ONE..INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND
-10B80..10B91   ; N  # Lo    [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW
-10B99..10B9C   ; N  # Po     [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
-10BA9..10BAF   ; N  # No     [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED
-10C00..10C48   ; N  # Lo    [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH
-10C80..10CB2   ; N  # Lu    [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US
-10CC0..10CF2   ; N  # Ll    [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US
-10CFA..10CFF   ; N  # No     [6] OLD HUNGARIAN NUMBER ONE..OLD HUNGARIAN NUMBER ONE THOUSAND
-10D00..10D23   ; N  # Lo    [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
-10D24..10D27   ; N  # Mn     [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
-10D30..10D39   ; N  # Nd    [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE
-10E60..10E7E   ; N  # No    [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
-10E80..10EA9   ; N  # Lo    [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
-10EAB..10EAC   ; N  # Mn     [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
-10EAD          ; N  # Pd         YEZIDI HYPHENATION MARK
-10EB0..10EB1   ; N  # Lo     [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
-10EFD..10EFF   ; N  # Mn     [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
-10F00..10F1C   ; N  # Lo    [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
-10F1D..10F26   ; N  # No    [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
-10F27          ; N  # Lo         OLD SOGDIAN LIGATURE AYIN-DALETH
-10F30..10F45   ; N  # Lo    [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
-10F46..10F50   ; N  # Mn    [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
-10F51..10F54   ; N  # No     [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED
-10F55..10F59   ; N  # Po     [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
-10F70..10F81   ; N  # Lo    [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH
-10F82..10F85   ; N  # Mn     [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
-10F86..10F89   ; N  # Po     [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS
-10FB0..10FC4   ; N  # Lo    [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW
-10FC5..10FCB   ; N  # No     [7] CHORASMIAN NUMBER ONE..CHORASMIAN NUMBER ONE HUNDRED
-10FE0..10FF6   ; N  # Lo    [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH
-11000          ; N  # Mc         BRAHMI SIGN CANDRABINDU
-11001          ; N  # Mn         BRAHMI SIGN ANUSVARA
-11002          ; N  # Mc         BRAHMI SIGN VISARGA
-11003..11037   ; N  # Lo    [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA
-11038..11046   ; N  # Mn    [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
-11047..1104D   ; N  # Po     [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
-11052..11065   ; N  # No    [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND
-11066..1106F   ; N  # Nd    [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
-11070          ; N  # Mn         BRAHMI SIGN OLD TAMIL VIRAMA
-11071..11072   ; N  # Lo     [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
-11073..11074   ; N  # Mn     [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
-11075          ; N  # Lo         BRAHMI LETTER OLD TAMIL LLA
-1107F          ; N  # Mn         BRAHMI NUMBER JOINER
-11080..11081   ; N  # Mn     [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA
-11082          ; N  # Mc         KAITHI SIGN VISARGA
-11083..110AF   ; N  # Lo    [45] KAITHI LETTER A..KAITHI LETTER HA
-110B0..110B2   ; N  # Mc     [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II
-110B3..110B6   ; N  # Mn     [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
-110B7..110B8   ; N  # Mc     [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU
-110B9..110BA   ; N  # Mn     [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
-110BB..110BC   ; N  # Po     [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN
-110BD          ; N  # Cf         KAITHI NUMBER SIGN
-110BE..110C1   ; N  # Po     [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
-110C2          ; N  # Mn         KAITHI VOWEL SIGN VOCALIC R
-110CD          ; N  # Cf         KAITHI NUMBER SIGN ABOVE
-110D0..110E8   ; N  # Lo    [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE
-110F0..110F9   ; N  # Nd    [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE
-11100..11102   ; N  # Mn     [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
-11103..11126   ; N  # Lo    [36] CHAKMA LETTER AA..CHAKMA LETTER HAA
-11127..1112B   ; N  # Mn     [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
-1112C          ; N  # Mc         CHAKMA VOWEL SIGN E
-1112D..11134   ; N  # Mn     [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
-11136..1113F   ; N  # Nd    [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE
-11140..11143   ; N  # Po     [4] CHAKMA SECTION MARK..CHAKMA QUESTION MARK
-11144          ; N  # Lo         CHAKMA LETTER LHAA
-11145..11146   ; N  # Mc     [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI
-11147          ; N  # Lo         CHAKMA LETTER VAA
-11150..11172   ; N  # Lo    [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA
-11173          ; N  # Mn         MAHAJANI SIGN NUKTA
-11174..11175   ; N  # Po     [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK
-11176          ; N  # Lo         MAHAJANI LIGATURE SHRI
-11180..11181   ; N  # Mn     [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA
-11182          ; N  # Mc         SHARADA SIGN VISARGA
-11183..111B2   ; N  # Lo    [48] SHARADA LETTER A..SHARADA LETTER HA
-111B3..111B5   ; N  # Mc     [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II
-111B6..111BE   ; N  # Mn     [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O
-111BF..111C0   ; N  # Mc     [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA
-111C1..111C4   ; N  # Lo     [4] SHARADA SIGN AVAGRAHA..SHARADA OM
-111C5..111C8   ; N  # Po     [4] SHARADA DANDA..SHARADA SEPARATOR
-111C9..111CC   ; N  # Mn     [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK
-111CD          ; N  # Po         SHARADA SUTRA MARK
-111CE          ; N  # Mc         SHARADA VOWEL SIGN PRISHTHAMATRA E
-111CF          ; N  # Mn         SHARADA SIGN INVERTED CANDRABINDU
-111D0..111D9   ; N  # Nd    [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE
-111DA          ; N  # Lo         SHARADA EKAM
-111DB          ; N  # Po         SHARADA SIGN SIDDHAM
-111DC          ; N  # Lo         SHARADA HEADSTROKE
-111DD..111DF   ; N  # Po     [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2
-111E1..111F4   ; N  # No    [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND
-11200..11211   ; N  # Lo    [18] KHOJKI LETTER A..KHOJKI LETTER JJA
-11213..1122B   ; N  # Lo    [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
-1122C..1122E   ; N  # Mc     [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
-1122F..11231   ; N  # Mn     [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
-11232..11233   ; N  # Mc     [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
-11234          ; N  # Mn         KHOJKI SIGN ANUSVARA
-11235          ; N  # Mc         KHOJKI SIGN VIRAMA
-11236..11237   ; N  # Mn     [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
-11238..1123D   ; N  # Po     [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
-1123E          ; N  # Mn         KHOJKI SIGN SUKUN
-1123F..11240   ; N  # Lo     [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
-11241          ; N  # Mn         KHOJKI VOWEL SIGN VOCALIC R
-11280..11286   ; N  # Lo     [7] MULTANI LETTER A..MULTANI LETTER GA
-11288          ; N  # Lo         MULTANI LETTER GHA
-1128A..1128D   ; N  # Lo     [4] MULTANI LETTER CA..MULTANI LETTER JJA
-1128F..1129D   ; N  # Lo    [15] MULTANI LETTER NYA..MULTANI LETTER BA
-1129F..112A8   ; N  # Lo    [10] MULTANI LETTER BHA..MULTANI LETTER RHA
-112A9          ; N  # Po         MULTANI SECTION MARK
-112B0..112DE   ; N  # Lo    [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA
-112DF          ; N  # Mn         KHUDAWADI SIGN ANUSVARA
-112E0..112E2   ; N  # Mc     [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
-112E3..112EA   ; N  # Mn     [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
-112F0..112F9   ; N  # Nd    [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE
-11300..11301   ; N  # Mn     [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
-11302..11303   ; N  # Mc     [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
-11305..1130C   ; N  # Lo     [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L
-1130F..11310   ; N  # Lo     [2] GRANTHA LETTER EE..GRANTHA LETTER AI
-11313..11328   ; N  # Lo    [22] GRANTHA LETTER OO..GRANTHA LETTER NA
-1132A..11330   ; N  # Lo     [7] GRANTHA LETTER PA..GRANTHA LETTER RA
-11332..11333   ; N  # Lo     [2] GRANTHA LETTER LA..GRANTHA LETTER LLA
-11335..11339   ; N  # Lo     [5] GRANTHA LETTER VA..GRANTHA LETTER HA
-1133B..1133C   ; N  # Mn     [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
-1133D          ; N  # Lo         GRANTHA SIGN AVAGRAHA
-1133E..1133F   ; N  # Mc     [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
-11340          ; N  # Mn         GRANTHA VOWEL SIGN II
-11341..11344   ; N  # Mc     [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
-11347..11348   ; N  # Mc     [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
-1134B..1134D   ; N  # Mc     [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA
-11350          ; N  # Lo         GRANTHA OM
-11357          ; N  # Mc         GRANTHA AU LENGTH MARK
-1135D..11361   ; N  # Lo     [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL
-11362..11363   ; N  # Mc     [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
-11366..1136C   ; N  # Mn     [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
-11370..11374   ; N  # Mn     [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
-11400..11434   ; N  # Lo    [53] NEWA LETTER A..NEWA LETTER HA
-11435..11437   ; N  # Mc     [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II
-11438..1143F   ; N  # Mn     [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI
-11440..11441   ; N  # Mc     [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU
-11442..11444   ; N  # Mn     [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA
-11445          ; N  # Mc         NEWA SIGN VISARGA
-11446          ; N  # Mn         NEWA SIGN NUKTA
-11447..1144A   ; N  # Lo     [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI
-1144B..1144F   ; N  # Po     [5] NEWA DANDA..NEWA ABBREVIATION SIGN
-11450..11459   ; N  # Nd    [10] NEWA DIGIT ZERO..NEWA DIGIT NINE
-1145A..1145B   ; N  # Po     [2] NEWA DOUBLE COMMA..NEWA PLACEHOLDER MARK
-1145D          ; N  # Po         NEWA INSERTION SIGN
-1145E          ; N  # Mn         NEWA SANDHI MARK
-1145F..11461   ; N  # Lo     [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA
-11480..114AF   ; N  # Lo    [48] TIRHUTA ANJI..TIRHUTA LETTER HA
-114B0..114B2   ; N  # Mc     [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
-114B3..114B8   ; N  # Mn     [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
-114B9          ; N  # Mc         TIRHUTA VOWEL SIGN E
-114BA          ; N  # Mn         TIRHUTA VOWEL SIGN SHORT E
-114BB..114BE   ; N  # Mc     [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
-114BF..114C0   ; N  # Mn     [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
-114C1          ; N  # Mc         TIRHUTA SIGN VISARGA
-114C2..114C3   ; N  # Mn     [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
-114C4..114C5   ; N  # Lo     [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG
-114C6          ; N  # Po         TIRHUTA ABBREVIATION SIGN
-114C7          ; N  # Lo         TIRHUTA OM
-114D0..114D9   ; N  # Nd    [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE
-11580..115AE   ; N  # Lo    [47] SIDDHAM LETTER A..SIDDHAM LETTER HA
-115AF..115B1   ; N  # Mc     [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
-115B2..115B5   ; N  # Mn     [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
-115B8..115BB   ; N  # Mc     [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
-115BC..115BD   ; N  # Mn     [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
-115BE          ; N  # Mc         SIDDHAM SIGN VISARGA
-115BF..115C0   ; N  # Mn     [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
-115C1..115D7   ; N  # Po    [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
-115D8..115DB   ; N  # Lo     [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U
-115DC..115DD   ; N  # Mn     [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU
-11600..1162F   ; N  # Lo    [48] MODI LETTER A..MODI LETTER LLA
-11630..11632   ; N  # Mc     [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
-11633..1163A   ; N  # Mn     [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
-1163B..1163C   ; N  # Mc     [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
-1163D          ; N  # Mn         MODI SIGN ANUSVARA
-1163E          ; N  # Mc         MODI SIGN VISARGA
-1163F..11640   ; N  # Mn     [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA
-11641..11643   ; N  # Po     [3] MODI DANDA..MODI ABBREVIATION SIGN
-11644          ; N  # Lo         MODI SIGN HUVA
-11650..11659   ; N  # Nd    [10] MODI DIGIT ZERO..MODI DIGIT NINE
-11660..1166C   ; N  # Po    [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
-11680..116AA   ; N  # Lo    [43] TAKRI LETTER A..TAKRI LETTER RRA
-116AB          ; N  # Mn         TAKRI SIGN ANUSVARA
-116AC          ; N  # Mc         TAKRI SIGN VISARGA
-116AD          ; N  # Mn         TAKRI VOWEL SIGN AA
-116AE..116AF   ; N  # Mc     [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
-116B0..116B5   ; N  # Mn     [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU
-116B6          ; N  # Mc         TAKRI SIGN VIRAMA
-116B7          ; N  # Mn         TAKRI SIGN NUKTA
-116B8          ; N  # Lo         TAKRI LETTER ARCHAIC KHA
-116B9          ; N  # Po         TAKRI ABBREVIATION SIGN
-116C0..116C9   ; N  # Nd    [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE
-11700..1171A   ; N  # Lo    [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA
-1171D..1171F   ; N  # Mn     [3] AHOM CONSONANT SIGN MEDIAL LA..AHOM CONSONANT SIGN MEDIAL LIGATING RA
-11720..11721   ; N  # Mc     [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA
-11722..11725   ; N  # Mn     [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU
-11726          ; N  # Mc         AHOM VOWEL SIGN E
-11727..1172B   ; N  # Mn     [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER
-11730..11739   ; N  # Nd    [10] AHOM DIGIT ZERO..AHOM DIGIT NINE
-1173A..1173B   ; N  # No     [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY
-1173C..1173E   ; N  # Po     [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
-1173F          ; N  # So         AHOM SYMBOL VI
-11740..11746   ; N  # Lo     [7] AHOM LETTER CA..AHOM LETTER LLA
-11800..1182B   ; N  # Lo    [44] DOGRA LETTER A..DOGRA LETTER RRA
-1182C..1182E   ; N  # Mc     [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II
-1182F..11837   ; N  # Mn     [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA
-11838          ; N  # Mc         DOGRA SIGN VISARGA
-11839..1183A   ; N  # Mn     [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA
-1183B          ; N  # Po         DOGRA ABBREVIATION SIGN
-118A0..118DF   ; N  # L&    [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
-118E0..118E9   ; N  # Nd    [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE
-118EA..118F2   ; N  # No     [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY
-118FF          ; N  # Lo         WARANG CITI OM
-11900..11906   ; N  # Lo     [7] DIVES AKURU LETTER A..DIVES AKURU LETTER E
-11909          ; N  # Lo         DIVES AKURU LETTER O
-1190C..11913   ; N  # Lo     [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA
-11915..11916   ; N  # Lo     [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA
-11918..1192F   ; N  # Lo    [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA
-11930..11935   ; N  # Mc     [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E
-11937..11938   ; N  # Mc     [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O
-1193B..1193C   ; N  # Mn     [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU
-1193D          ; N  # Mc         DIVES AKURU SIGN HALANTA
-1193E          ; N  # Mn         DIVES AKURU VIRAMA
-1193F          ; N  # Lo         DIVES AKURU PREFIXED NASAL SIGN
-11940          ; N  # Mc         DIVES AKURU MEDIAL YA
-11941          ; N  # Lo         DIVES AKURU INITIAL RA
-11942          ; N  # Mc         DIVES AKURU MEDIAL RA
-11943          ; N  # Mn         DIVES AKURU SIGN NUKTA
-11944..11946   ; N  # Po     [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK
-11950..11959   ; N  # Nd    [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE
-119A0..119A7   ; N  # Lo     [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR
-119AA..119D0   ; N  # Lo    [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA
-119D1..119D3   ; N  # Mc     [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II
-119D4..119D7   ; N  # Mn     [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR
-119DA..119DB   ; N  # Mn     [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI
-119DC..119DF   ; N  # Mc     [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA
-119E0          ; N  # Mn         NANDINAGARI SIGN VIRAMA
-119E1          ; N  # Lo         NANDINAGARI SIGN AVAGRAHA
-119E2          ; N  # Po         NANDINAGARI SIGN SIDDHAM
-119E3          ; N  # Lo         NANDINAGARI HEADSTROKE
-119E4          ; N  # Mc         NANDINAGARI VOWEL SIGN PRISHTHAMATRA E
-11A00          ; N  # Lo         ZANABAZAR SQUARE LETTER A
-11A01..11A0A   ; N  # Mn    [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK
-11A0B..11A32   ; N  # Lo    [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA
-11A33..11A38   ; N  # Mn     [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA
-11A39          ; N  # Mc         ZANABAZAR SQUARE SIGN VISARGA
-11A3A          ; N  # Lo         ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
-11A3B..11A3E   ; N  # Mn     [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA
-11A3F..11A46   ; N  # Po     [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK
-11A47          ; N  # Mn         ZANABAZAR SQUARE SUBJOINER
-11A50          ; N  # Lo         SOYOMBO LETTER A
-11A51..11A56   ; N  # Mn     [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE
-11A57..11A58   ; N  # Mc     [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU
-11A59..11A5B   ; N  # Mn     [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK
-11A5C..11A89   ; N  # Lo    [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA
-11A8A..11A96   ; N  # Mn    [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA
-11A97          ; N  # Mc         SOYOMBO SIGN VISARGA
-11A98..11A99   ; N  # Mn     [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER
-11A9A..11A9C   ; N  # Po     [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD
-11A9D          ; N  # Lo         SOYOMBO MARK PLUTA
-11A9E..11AA2   ; N  # Po     [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
-11AB0..11ABF   ; N  # Lo    [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
-11AC0..11AF8   ; N  # Lo    [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
-11B00..11B09   ; N  # Po    [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
-11C00..11C08   ; N  # Lo     [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
-11C0A..11C2E   ; N  # Lo    [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
-11C2F          ; N  # Mc         BHAIKSUKI VOWEL SIGN AA
-11C30..11C36   ; N  # Mn     [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L
-11C38..11C3D   ; N  # Mn     [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA
-11C3E          ; N  # Mc         BHAIKSUKI SIGN VISARGA
-11C3F          ; N  # Mn         BHAIKSUKI SIGN VIRAMA
-11C40          ; N  # Lo         BHAIKSUKI SIGN AVAGRAHA
-11C41..11C45   ; N  # Po     [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2
-11C50..11C59   ; N  # Nd    [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
-11C5A..11C6C   ; N  # No    [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK
-11C70..11C71   ; N  # Po     [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD
-11C72..11C8F   ; N  # Lo    [30] MARCHEN LETTER KA..MARCHEN LETTER A
-11C92..11CA7   ; N  # Mn    [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA
-11CA9          ; N  # Mc         MARCHEN SUBJOINED LETTER YA
-11CAA..11CB0   ; N  # Mn     [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA
-11CB1          ; N  # Mc         MARCHEN VOWEL SIGN I
-11CB2..11CB3   ; N  # Mn     [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E
-11CB4          ; N  # Mc         MARCHEN VOWEL SIGN O
-11CB5..11CB6   ; N  # Mn     [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU
-11D00..11D06   ; N  # Lo     [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E
-11D08..11D09   ; N  # Lo     [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O
-11D0B..11D30   ; N  # Lo    [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA
-11D31..11D36   ; N  # Mn     [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R
-11D3A          ; N  # Mn         MASARAM GONDI VOWEL SIGN E
-11D3C..11D3D   ; N  # Mn     [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O
-11D3F..11D45   ; N  # Mn     [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA
-11D46          ; N  # Lo         MASARAM GONDI REPHA
-11D47          ; N  # Mn         MASARAM GONDI RA-KARA
-11D50..11D59   ; N  # Nd    [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
-11D60..11D65   ; N  # Lo     [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU
-11D67..11D68   ; N  # Lo     [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI
-11D6A..11D89   ; N  # Lo    [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
-11D8A..11D8E   ; N  # Mc     [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU
-11D90..11D91   ; N  # Mn     [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI
-11D93..11D94   ; N  # Mc     [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU
-11D95          ; N  # Mn         GUNJALA GONDI SIGN ANUSVARA
-11D96          ; N  # Mc         GUNJALA GONDI SIGN VISARGA
-11D97          ; N  # Mn         GUNJALA GONDI VIRAMA
-11D98          ; N  # Lo         GUNJALA GONDI OM
-11DA0..11DA9   ; N  # Nd    [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
-11EE0..11EF2   ; N  # Lo    [19] MAKASAR LETTER KA..MAKASAR ANGKA
-11EF3..11EF4   ; N  # Mn     [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
-11EF5..11EF6   ; N  # Mc     [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
-11EF7..11EF8   ; N  # Po     [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
-11F00..11F01   ; N  # Mn     [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
-11F02          ; N  # Lo         KAWI SIGN REPHA
-11F03          ; N  # Mc         KAWI SIGN VISARGA
-11F04..11F10   ; N  # Lo    [13] KAWI LETTER A..KAWI LETTER O
-11F12..11F33   ; N  # Lo    [34] KAWI LETTER KA..KAWI LETTER JNYA
-11F34..11F35   ; N  # Mc     [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
-11F36..11F3A   ; N  # Mn     [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
-11F3E..11F3F   ; N  # Mc     [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
-11F40          ; N  # Mn         KAWI VOWEL SIGN EU
-11F41          ; N  # Mc         KAWI SIGN KILLER
-11F42          ; N  # Mn         KAWI CONJOINER
-11F43..11F4F   ; N  # Po    [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
-11F50..11F59   ; N  # Nd    [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
-11FB0          ; N  # Lo         LISU LETTER YHA
-11FC0..11FD4   ; N  # No    [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
-11FD5..11FDC   ; N  # So     [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
-11FDD..11FE0   ; N  # Sc     [4] TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN
-11FE1..11FF1   ; N  # So    [17] TAMIL SIGN PAARAM..TAMIL SIGN VAKAIYARAA
-11FFF          ; N  # Po         TAMIL PUNCTUATION END OF TEXT
-12000..12399   ; N  # Lo   [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
-12400..1246E   ; N  # Nl   [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
-12470..12474   ; N  # Po     [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
-12480..12543   ; N  # Lo   [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
-12F90..12FF0   ; N  # Lo    [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
-12FF1..12FF2   ; N  # Po     [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
-13000..1342F   ; N  # Lo  [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
-13430..1343F   ; N  # Cf    [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
-13440          ; N  # Mn         EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
-13441..13446   ; N  # Lo     [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
-13447..13455   ; N  # Mn    [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
-14400..14646   ; N  # Lo   [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
-16800..16A38   ; N  # Lo   [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
-16A40..16A5E   ; N  # Lo    [31] MRO LETTER TA..MRO LETTER TEK
-16A60..16A69   ; N  # Nd    [10] MRO DIGIT ZERO..MRO DIGIT NINE
-16A6E..16A6F   ; N  # Po     [2] MRO DANDA..MRO DOUBLE DANDA
-16A70..16ABE   ; N  # Lo    [79] TANGSA LETTER OZ..TANGSA LETTER ZA
-16AC0..16AC9   ; N  # Nd    [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
-16AD0..16AED   ; N  # Lo    [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
-16AF0..16AF4   ; N  # Mn     [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
-16AF5          ; N  # Po         BASSA VAH FULL STOP
-16B00..16B2F   ; N  # Lo    [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
-16B30..16B36   ; N  # Mn     [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
-16B37..16B3B   ; N  # Po     [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
-16B3C..16B3F   ; N  # So     [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
-16B40..16B43   ; N  # Lm     [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
-16B44          ; N  # Po         PAHAWH HMONG SIGN XAUS
-16B45          ; N  # So         PAHAWH HMONG SIGN CIM TSOV ROG
-16B50..16B59   ; N  # Nd    [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
-16B5B..16B61   ; N  # No     [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS
-16B63..16B77   ; N  # Lo    [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
-16B7D..16B8F   ; N  # Lo    [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ
-16E40..16E7F   ; N  # L&    [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y
-16E80..16E96   ; N  # No    [23] MEDEFAIDRIN DIGIT ZERO..MEDEFAIDRIN DIGIT THREE ALTERNATE FORM
-16E97..16E9A   ; N  # Po     [4] MEDEFAIDRIN COMMA..MEDEFAIDRIN EXCLAMATION OH
-16F00..16F4A   ; N  # Lo    [75] MIAO LETTER PA..MIAO LETTER RTE
-16F4F          ; N  # Mn         MIAO SIGN CONSONANT MODIFIER BAR
-16F50          ; N  # Lo         MIAO LETTER NASALIZATION
-16F51..16F87   ; N  # Mc    [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
-16F8F..16F92   ; N  # Mn     [4] MIAO TONE RIGHT..MIAO TONE BELOW
-16F93..16F9F   ; N  # Lm    [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
-16FE0..16FE1   ; W  # Lm     [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
-16FE2          ; W  # Po         OLD CHINESE HOOK MARK
-16FE3          ; W  # Lm         OLD CHINESE ITERATION MARK
-16FE4          ; W  # Mn         KHITAN SMALL SCRIPT FILLER
-16FF0..16FF1   ; W  # Mc     [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
-17000..187F7   ; W  # Lo  [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
-18800..18AFF   ; W  # Lo   [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768
-18B00..18CD5   ; W  # Lo   [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5
-18D00..18D08   ; W  # Lo     [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
-1AFF0..1AFF3   ; W  # Lm     [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
-1AFF5..1AFFB   ; W  # Lm     [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
-1AFFD..1AFFE   ; W  # Lm     [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
-1B000..1B0FF   ; W  # Lo   [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
-1B100..1B122   ; W  # Lo    [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
-1B132          ; W  # Lo         HIRAGANA LETTER SMALL KO
-1B150..1B152   ; W  # Lo     [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
-1B155          ; W  # Lo         KATAKANA LETTER SMALL KO
-1B164..1B167   ; W  # Lo     [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
-1B170..1B2FB   ; W  # Lo   [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
-1BC00..1BC6A   ; N  # Lo   [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
-1BC70..1BC7C   ; N  # Lo    [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
-1BC80..1BC88   ; N  # Lo     [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
-1BC90..1BC99   ; N  # Lo    [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
-1BC9C          ; N  # So         DUPLOYAN SIGN O WITH CROSS
-1BC9D..1BC9E   ; N  # Mn     [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
-1BC9F          ; N  # Po         DUPLOYAN PUNCTUATION CHINOOK FULL STOP
-1BCA0..1BCA3   ; N  # Cf     [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
-1CF00..1CF2D   ; N  # Mn    [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
-1CF30..1CF46   ; N  # Mn    [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
-1CF50..1CFC3   ; N  # So   [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK
-1D000..1D0F5   ; N  # So   [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
-1D100..1D126   ; N  # So    [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
-1D129..1D164   ; N  # So    [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
-1D165..1D166   ; N  # Mc     [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
-1D167..1D169   ; N  # Mn     [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
-1D16A..1D16C   ; N  # So     [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3
-1D16D..1D172   ; N  # Mc     [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5
-1D173..1D17A   ; N  # Cf     [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
-1D17B..1D182   ; N  # Mn     [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
-1D183..1D184   ; N  # So     [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN
-1D185..1D18B   ; N  # Mn     [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
-1D18C..1D1A9   ; N  # So    [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH
-1D1AA..1D1AD   ; N  # Mn     [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
-1D1AE..1D1EA   ; N  # So    [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
-1D200..1D241   ; N  # So    [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
-1D242..1D244   ; N  # Mn     [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
-1D245          ; N  # So         GREEK MUSICAL LEIMMA
-1D2C0..1D2D3   ; N  # No    [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
-1D2E0..1D2F3   ; N  # No    [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
-1D300..1D356   ; N  # So    [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
-1D360..1D378   ; N  # No    [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
-1D400..1D454   ; N  # L&    [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G
-1D456..1D49C   ; N  # L&    [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A
-1D49E..1D49F   ; N  # Lu     [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D
-1D4A2          ; N  # Lu         MATHEMATICAL SCRIPT CAPITAL G
-1D4A5..1D4A6   ; N  # Lu     [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K
-1D4A9..1D4AC   ; N  # Lu     [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q
-1D4AE..1D4B9   ; N  # L&    [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D
-1D4BB          ; N  # Ll         MATHEMATICAL SCRIPT SMALL F
-1D4BD..1D4C3   ; N  # Ll     [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N
-1D4C5..1D505   ; N  # L&    [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B
-1D507..1D50A   ; N  # Lu     [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G
-1D50D..1D514   ; N  # Lu     [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q
-1D516..1D51C   ; N  # Lu     [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y
-1D51E..1D539   ; N  # L&    [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B
-1D53B..1D53E   ; N  # Lu     [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G
-1D540..1D544   ; N  # Lu     [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M
-1D546          ; N  # Lu         MATHEMATICAL DOUBLE-STRUCK CAPITAL O
-1D54A..1D550   ; N  # Lu     [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y
-1D552..1D6A5   ; N  # L&   [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J
-1D6A8..1D6C0   ; N  # Lu    [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA
-1D6C1          ; N  # Sm         MATHEMATICAL BOLD NABLA
-1D6C2..1D6DA   ; N  # Ll    [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA
-1D6DB          ; N  # Sm         MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
-1D6DC..1D6FA   ; N  # L&    [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA
-1D6FB          ; N  # Sm         MATHEMATICAL ITALIC NABLA
-1D6FC..1D714   ; N  # Ll    [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA
-1D715          ; N  # Sm         MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
-1D716..1D734   ; N  # L&    [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA
-1D735          ; N  # Sm         MATHEMATICAL BOLD ITALIC NABLA
-1D736..1D74E   ; N  # Ll    [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA
-1D74F          ; N  # Sm         MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
-1D750..1D76E   ; N  # L&    [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA
-1D76F          ; N  # Sm         MATHEMATICAL SANS-SERIF BOLD NABLA
-1D770..1D788   ; N  # Ll    [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA
-1D789          ; N  # Sm         MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
-1D78A..1D7A8   ; N  # L&    [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA
-1D7A9          ; N  # Sm         MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA
-1D7AA..1D7C2   ; N  # Ll    [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
-1D7C3          ; N  # Sm         MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
-1D7C4..1D7CB   ; N  # L&     [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
-1D7CE..1D7FF   ; N  # Nd    [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
-1D800..1D9FF   ; N  # So   [512] SIGNWRITING HAND-FIST INDEX..SIGNWRITING HEAD
-1DA00..1DA36   ; N  # Mn    [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN
-1DA37..1DA3A   ; N  # So     [4] SIGNWRITING AIR BLOW SMALL ROTATIONS..SIGNWRITING BREATH EXHALE
-1DA3B..1DA6C   ; N  # Mn    [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT
-1DA6D..1DA74   ; N  # So     [8] SIGNWRITING SHOULDER HIP SPINE..SIGNWRITING TORSO-FLOORPLANE TWISTING
-1DA75          ; N  # Mn         SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS
-1DA76..1DA83   ; N  # So    [14] SIGNWRITING LIMB COMBINATION..SIGNWRITING LOCATION DEPTH
-1DA84          ; N  # Mn         SIGNWRITING LOCATION HEAD NECK
-1DA85..1DA86   ; N  # So     [2] SIGNWRITING LOCATION TORSO..SIGNWRITING LOCATION LIMBS DIGITS
-1DA87..1DA8B   ; N  # Po     [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS
-1DA9B..1DA9F   ; N  # Mn     [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6
-1DAA1..1DAAF   ; N  # Mn    [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16
-1DF00..1DF09   ; N  # Ll    [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
-1DF0A          ; N  # Lo         LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
-1DF0B..1DF1E   ; N  # Ll    [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
-1DF25..1DF2A   ; N  # Ll     [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
-1E000..1E006   ; N  # Mn     [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
-1E008..1E018   ; N  # Mn    [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
-1E01B..1E021   ; N  # Mn     [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
-1E023..1E024   ; N  # Mn     [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
-1E026..1E02A   ; N  # Mn     [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
-1E030..1E06D   ; N  # Lm    [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
-1E08F          ; N  # Mn         COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
-1E100..1E12C   ; N  # Lo    [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
-1E130..1E136   ; N  # Mn     [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
-1E137..1E13D   ; N  # Lm     [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
-1E140..1E149   ; N  # Nd    [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
-1E14E          ; N  # Lo         NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
-1E14F          ; N  # So         NYIAKENG PUACHUE HMONG CIRCLED CA
-1E290..1E2AD   ; N  # Lo    [30] TOTO LETTER PA..TOTO LETTER A
-1E2AE          ; N  # Mn         TOTO SIGN RISING TONE
-1E2C0..1E2EB   ; N  # Lo    [44] WANCHO LETTER AA..WANCHO LETTER YIH
-1E2EC..1E2EF   ; N  # Mn     [4] WANCHO TONE TUP..WANCHO TONE KOINI
-1E2F0..1E2F9   ; N  # Nd    [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
-1E2FF          ; N  # Sc         WANCHO NGUN SIGN
-1E4D0..1E4EA   ; N  # Lo    [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
-1E4EB          ; N  # Lm         NAG MUNDARI SIGN OJOD
-1E4EC..1E4EF   ; N  # Mn     [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
-1E4F0..1E4F9   ; N  # Nd    [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
-1E7E0..1E7E6   ; N  # Lo     [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
-1E7E8..1E7EB   ; N  # Lo     [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
-1E7ED..1E7EE   ; N  # Lo     [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
-1E7F0..1E7FE   ; N  # Lo    [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE
-1E800..1E8C4   ; N  # Lo   [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
-1E8C7..1E8CF   ; N  # No     [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE
-1E8D0..1E8D6   ; N  # Mn     [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
-1E900..1E943   ; N  # L&    [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
-1E944..1E94A   ; N  # Mn     [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
-1E94B          ; N  # Lm         ADLAM NASALIZATION MARK
-1E950..1E959   ; N  # Nd    [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
-1E95E..1E95F   ; N  # Po     [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK
-1EC71..1ECAB   ; N  # No    [59] INDIC SIYAQ NUMBER ONE..INDIC SIYAQ NUMBER PREFIXED NINE
-1ECAC          ; N  # So         INDIC SIYAQ PLACEHOLDER
-1ECAD..1ECAF   ; N  # No     [3] INDIC SIYAQ FRACTION ONE QUARTER..INDIC SIYAQ FRACTION THREE QUARTERS
-1ECB0          ; N  # Sc         INDIC SIYAQ RUPEE MARK
-1ECB1..1ECB4   ; N  # No     [4] INDIC SIYAQ NUMBER ALTERNATE ONE..INDIC SIYAQ ALTERNATE LAKH MARK
-1ED01..1ED2D   ; N  # No    [45] OTTOMAN SIYAQ NUMBER ONE..OTTOMAN SIYAQ NUMBER NINETY THOUSAND
-1ED2E          ; N  # So         OTTOMAN SIYAQ MARRATAN
-1ED2F..1ED3D   ; N  # No    [15] OTTOMAN SIYAQ ALTERNATE NUMBER TWO..OTTOMAN SIYAQ FRACTION ONE SIXTH
-1EE00..1EE03   ; N  # Lo     [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL
-1EE05..1EE1F   ; N  # Lo    [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF
-1EE21..1EE22   ; N  # Lo     [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM
-1EE24          ; N  # Lo         ARABIC MATHEMATICAL INITIAL HEH
-1EE27          ; N  # Lo         ARABIC MATHEMATICAL INITIAL HAH
-1EE29..1EE32   ; N  # Lo    [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF
-1EE34..1EE37   ; N  # Lo     [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH
-1EE39          ; N  # Lo         ARABIC MATHEMATICAL INITIAL DAD
-1EE3B          ; N  # Lo         ARABIC MATHEMATICAL INITIAL GHAIN
-1EE42          ; N  # Lo         ARABIC MATHEMATICAL TAILED JEEM
-1EE47          ; N  # Lo         ARABIC MATHEMATICAL TAILED HAH
-1EE49          ; N  # Lo         ARABIC MATHEMATICAL TAILED YEH
-1EE4B          ; N  # Lo         ARABIC MATHEMATICAL TAILED LAM
-1EE4D..1EE4F   ; N  # Lo     [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN
-1EE51..1EE52   ; N  # Lo     [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF
-1EE54          ; N  # Lo         ARABIC MATHEMATICAL TAILED SHEEN
-1EE57          ; N  # Lo         ARABIC MATHEMATICAL TAILED KHAH
-1EE59          ; N  # Lo         ARABIC MATHEMATICAL TAILED DAD
-1EE5B          ; N  # Lo         ARABIC MATHEMATICAL TAILED GHAIN
-1EE5D          ; N  # Lo         ARABIC MATHEMATICAL TAILED DOTLESS NOON
-1EE5F          ; N  # Lo         ARABIC MATHEMATICAL TAILED DOTLESS QAF
-1EE61..1EE62   ; N  # Lo     [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM
-1EE64          ; N  # Lo         ARABIC MATHEMATICAL STRETCHED HEH
-1EE67..1EE6A   ; N  # Lo     [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF
-1EE6C..1EE72   ; N  # Lo     [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF
-1EE74..1EE77   ; N  # Lo     [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH
-1EE79..1EE7C   ; N  # Lo     [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH
-1EE7E          ; N  # Lo         ARABIC MATHEMATICAL STRETCHED DOTLESS FEH
-1EE80..1EE89   ; N  # Lo    [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH
-1EE8B..1EE9B   ; N  # Lo    [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN
-1EEA1..1EEA3   ; N  # Lo     [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
-1EEA5..1EEA9   ; N  # Lo     [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
-1EEAB..1EEBB   ; N  # Lo    [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
-1EEF0..1EEF1   ; N  # Sm     [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
-1F000..1F003   ; N  # So     [4] MAHJONG TILE EAST WIND..MAHJONG TILE NORTH WIND
-1F004          ; W  # So         MAHJONG TILE RED DRAGON
-1F005..1F02B   ; N  # So    [39] MAHJONG TILE GREEN DRAGON..MAHJONG TILE BACK
-1F030..1F093   ; N  # So   [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06
-1F0A0..1F0AE   ; N  # So    [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES
-1F0B1..1F0BF   ; N  # So    [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER
-1F0C1..1F0CE   ; N  # So    [14] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD KING OF DIAMONDS
-1F0CF          ; W  # So         PLAYING CARD BLACK JOKER
-1F0D1..1F0F5   ; N  # So    [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21
-1F100..1F10A   ; A  # No    [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA
-1F10B..1F10C   ; N  # No     [2] DINGBAT CIRCLED SANS-SERIF DIGIT ZERO..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
-1F10D..1F10F   ; N  # So     [3] CIRCLED ZERO WITH SLASH..CIRCLED DOLLAR SIGN WITH OVERLAID BACKSLASH
-1F110..1F12D   ; A  # So    [30] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED CD
-1F12E..1F12F   ; N  # So     [2] CIRCLED WZ..COPYLEFT SYMBOL
-1F130..1F169   ; A  # So    [58] SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
-1F16A..1F16F   ; N  # So     [6] RAISED MC SIGN..CIRCLED HUMAN FIGURE
-1F170..1F18D   ; A  # So    [30] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED SA
-1F18E          ; W  # So         NEGATIVE SQUARED AB
-1F18F..1F190   ; A  # So     [2] NEGATIVE SQUARED WC..SQUARE DJ
-1F191..1F19A   ; W  # So    [10] SQUARED CL..SQUARED VS
-1F19B..1F1AC   ; A  # So    [18] SQUARED THREE D..SQUARED VOD
-1F1AD          ; N  # So         MASK WORK SYMBOL
-1F1E6..1F1FF   ; N  # So    [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
-1F200..1F202   ; W  # So     [3] SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA
-1F210..1F23B   ; W  # So    [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D
-1F240..1F248   ; W  # So     [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
-1F250..1F251   ; W  # So     [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
-1F260..1F265   ; W  # So     [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
-1F300..1F320   ; W  # So    [33] CYCLONE..SHOOTING STAR
-1F321..1F32C   ; N  # So    [12] THERMOMETER..WIND BLOWING FACE
-1F32D..1F335   ; W  # So     [9] HOT DOG..CACTUS
-1F336          ; N  # So         HOT PEPPER
-1F337..1F37C   ; W  # So    [70] TULIP..BABY BOTTLE
-1F37D          ; N  # So         FORK AND KNIFE WITH PLATE
-1F37E..1F393   ; W  # So    [22] BOTTLE WITH POPPING CORK..GRADUATION CAP
-1F394..1F39F   ; N  # So    [12] HEART WITH TIP ON THE LEFT..ADMISSION TICKETS
-1F3A0..1F3CA   ; W  # So    [43] CAROUSEL HORSE..SWIMMER
-1F3CB..1F3CE   ; N  # So     [4] WEIGHT LIFTER..RACING CAR
-1F3CF..1F3D3   ; W  # So     [5] CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL
-1F3D4..1F3DF   ; N  # So    [12] SNOW CAPPED MOUNTAIN..STADIUM
-1F3E0..1F3F0   ; W  # So    [17] HOUSE BUILDING..EUROPEAN CASTLE
-1F3F1..1F3F3   ; N  # So     [3] WHITE PENNANT..WAVING WHITE FLAG
-1F3F4          ; W  # So         WAVING BLACK FLAG
-1F3F5..1F3F7   ; N  # So     [3] ROSETTE..LABEL
-1F3F8..1F3FA   ; W  # So     [3] BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA
-1F3FB..1F3FF   ; W  # Sk     [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
-1F400..1F43E   ; W  # So    [63] RAT..PAW PRINTS
-1F43F          ; N  # So         CHIPMUNK
-1F440          ; W  # So         EYES
-1F441          ; N  # So         EYE
-1F442..1F4FC   ; W  # So   [187] EAR..VIDEOCASSETTE
-1F4FD..1F4FE   ; N  # So     [2] FILM PROJECTOR..PORTABLE STEREO
-1F4FF..1F53D   ; W  # So    [63] PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE
-1F53E..1F54A   ; N  # So    [13] LOWER RIGHT SHADOWED WHITE CIRCLE..DOVE OF PEACE
-1F54B..1F54E   ; W  # So     [4] KAABA..MENORAH WITH NINE BRANCHES
-1F54F          ; N  # So         BOWL OF HYGIEIA
-1F550..1F567   ; W  # So    [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
-1F568..1F579   ; N  # So    [18] RIGHT SPEAKER..JOYSTICK
-1F57A          ; W  # So         MAN DANCING
-1F57B..1F594   ; N  # So    [26] LEFT HAND TELEPHONE RECEIVER..REVERSED VICTORY HAND
-1F595..1F596   ; W  # So     [2] REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS
-1F597..1F5A3   ; N  # So    [13] WHITE DOWN POINTING LEFT HAND INDEX..BLACK DOWN POINTING BACKHAND INDEX
-1F5A4          ; W  # So         BLACK HEART
-1F5A5..1F5FA   ; N  # So    [86] DESKTOP COMPUTER..WORLD MAP
-1F5FB..1F5FF   ; W  # So     [5] MOUNT FUJI..MOYAI
-1F600..1F64F   ; W  # So    [80] GRINNING FACE..PERSON WITH FOLDED HANDS
-1F650..1F67F   ; N  # So    [48] NORTH WEST POINTING LEAF..REVERSE CHECKER BOARD
-1F680..1F6C5   ; W  # So    [70] ROCKET..LEFT LUGGAGE
-1F6C6..1F6CB   ; N  # So     [6] TRIANGLE WITH ROUNDED CORNERS..COUCH AND LAMP
-1F6CC          ; W  # So         SLEEPING ACCOMMODATION
-1F6CD..1F6CF   ; N  # So     [3] SHOPPING BAGS..BED
-1F6D0..1F6D2   ; W  # So     [3] PLACE OF WORSHIP..SHOPPING TROLLEY
-1F6D3..1F6D4   ; N  # So     [2] STUPA..PAGODA
-1F6D5..1F6D7   ; W  # So     [3] HINDU TEMPLE..ELEVATOR
-1F6DC..1F6DF   ; W  # So     [4] WIRELESS..RING BUOY
-1F6E0..1F6EA   ; N  # So    [11] HAMMER AND WRENCH..NORTHEAST-POINTING AIRPLANE
-1F6EB..1F6EC   ; W  # So     [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
-1F6F0..1F6F3   ; N  # So     [4] SATELLITE..PASSENGER SHIP
-1F6F4..1F6FC   ; W  # So     [9] SCOOTER..ROLLER SKATE
-1F700..1F776   ; N  # So   [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE
-1F77B..1F77F   ; N  # So     [5] HAUMEA..ORCUS
-1F780..1F7D9   ; N  # So    [90] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NINE POINTED WHITE STAR
-1F7E0..1F7EB   ; W  # So    [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
-1F7F0          ; W  # So         HEAVY EQUALS SIGN
-1F800..1F80B   ; N  # So    [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
-1F810..1F847   ; N  # So    [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
-1F850..1F859   ; N  # So    [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
-1F860..1F887   ; N  # So    [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
-1F890..1F8AD   ; N  # So    [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
-1F8B0..1F8B1   ; N  # So     [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
-1F900..1F90B   ; N  # So    [12] CIRCLED CROSS FORMEE WITH FOUR DOTS..DOWNWARD FACING NOTCHED HOOK WITH DOT
-1F90C..1F93A   ; W  # So    [47] PINCHED FINGERS..FENCER
-1F93B          ; N  # So         MODERN PENTATHLON
-1F93C..1F945   ; W  # So    [10] WRESTLERS..GOAL NET
-1F946          ; N  # So         RIFLE
-1F947..1F9FF   ; W  # So   [185] FIRST PLACE MEDAL..NAZAR AMULET
-1FA00..1FA53   ; N  # So    [84] NEUTRAL CHESS KING..BLACK CHESS KNIGHT-BISHOP
-1FA60..1FA6D   ; N  # So    [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
-1FA70..1FA7C   ; W  # So    [13] BALLET SHOES..CRUTCH
-1FA80..1FA88   ; W  # So     [9] YO-YO..FLUTE
-1FA90..1FABD   ; W  # So    [46] RINGED PLANET..WING
-1FABF..1FAC5   ; W  # So     [7] GOOSE..PERSON WITH CROWN
-1FACE..1FADB   ; W  # So    [14] MOOSE..PEA POD
-1FAE0..1FAE8   ; W  # So     [9] MELTING FACE..SHAKING FACE
-1FAF0..1FAF8   ; W  # So     [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
-1FB00..1FB92   ; N  # So   [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
-1FB94..1FBCA   ; N  # So    [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
-1FBF0..1FBF9   ; N  # Nd    [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
-20000..2A6DF   ; W  # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
-2A6E0..2A6FF   ; W  # Cn    [32] <reserved-2A6E0>..<reserved-2A6FF>
-2A700..2B739   ; W  # Lo  [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
-2B73A..2B73F   ; W  # Cn     [6] <reserved-2B73A>..<reserved-2B73F>
-2B740..2B81D   ; W  # Lo   [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
-2B81E..2B81F   ; W  # Cn     [2] <reserved-2B81E>..<reserved-2B81F>
-2B820..2CEA1   ; W  # Lo  [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
-2CEA2..2CEAF   ; W  # Cn    [14] <reserved-2CEA2>..<reserved-2CEAF>
-2CEB0..2EBE0   ; W  # Lo  [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
-2EBE1..2EBEF   ; W  # Cn    [15] <reserved-2EBE1>..<reserved-2EBEF>
-2EBF0..2EE5D   ; W  # Lo   [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D
-2EE5E..2F7FF   ; W  # Cn  [2466] <reserved-2EE5E>..<reserved-2F7FF>
-2F800..2FA1D   ; W  # Lo   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
-2FA1E..2FA1F   ; W  # Cn     [2] <reserved-2FA1E>..<reserved-2FA1F>
-2FA20..2FFFD   ; W  # Cn  [1502] <reserved-2FA20>..<reserved-2FFFD>
-30000..3134A   ; W  # Lo  [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
-3134B..3134F   ; W  # Cn     [5] <reserved-3134B>..<reserved-3134F>
-31350..323AF   ; W  # Lo  [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-323B0..3FFFD   ; W  # Cn [56398] <reserved-323B0>..<reserved-3FFFD>
-E0001          ; N  # Cf         LANGUAGE TAG
-E0020..E007F   ; N  # Cf    [96] TAG SPACE..CANCEL TAG
-E0100..E01EF   ; A  # Mn   [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-F0000..FFFFD   ; A  # Co [65534] <private-use-F0000>..<private-use-FFFFD>
-100000..10FFFD ; A  # Co [65534] <private-use-100000>..<private-use-10FFFD>
+0000..001F;N     # Cc    [32] <control-0000>..<control-001F>
+0020;Na          # Zs         SPACE
+0021..0023;Na    # Po     [3] EXCLAMATION MARK..NUMBER SIGN
+0024;Na          # Sc         DOLLAR SIGN
+0025..0027;Na    # Po     [3] PERCENT SIGN..APOSTROPHE
+0028;Na          # Ps         LEFT PARENTHESIS
+0029;Na          # Pe         RIGHT PARENTHESIS
+002A;Na          # Po         ASTERISK
+002B;Na          # Sm         PLUS SIGN
+002C;Na          # Po         COMMA
+002D;Na          # Pd         HYPHEN-MINUS
+002E..002F;Na    # Po     [2] FULL STOP..SOLIDUS
+0030..0039;Na    # Nd    [10] DIGIT ZERO..DIGIT NINE
+003A..003B;Na    # Po     [2] COLON..SEMICOLON
+003C..003E;Na    # Sm     [3] LESS-THAN SIGN..GREATER-THAN SIGN
+003F..0040;Na    # Po     [2] QUESTION MARK..COMMERCIAL AT
+0041..005A;Na    # Lu    [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+005B;Na          # Ps         LEFT SQUARE BRACKET
+005C;Na          # Po         REVERSE SOLIDUS
+005D;Na          # Pe         RIGHT SQUARE BRACKET
+005E;Na          # Sk         CIRCUMFLEX ACCENT
+005F;Na          # Pc         LOW LINE
+0060;Na          # Sk         GRAVE ACCENT
+0061..007A;Na    # Ll    [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+007B;Na          # Ps         LEFT CURLY BRACKET
+007C;Na          # Sm         VERTICAL LINE
+007D;Na          # Pe         RIGHT CURLY BRACKET
+007E;Na          # Sm         TILDE
+007F;N           # Cc         <control-007F>
+0080..009F;N     # Cc    [32] <control-0080>..<control-009F>
+00A0;N           # Zs         NO-BREAK SPACE
+00A1;A           # Po         INVERTED EXCLAMATION MARK
+00A2..00A3;Na    # Sc     [2] CENT SIGN..POUND SIGN
+00A4;A           # Sc         CURRENCY SIGN
+00A5;Na          # Sc         YEN SIGN
+00A6;Na          # So         BROKEN BAR
+00A7;A           # Po         SECTION SIGN
+00A8;A           # Sk         DIAERESIS
+00A9;N           # So         COPYRIGHT SIGN
+00AA;A           # Lo         FEMININE ORDINAL INDICATOR
+00AB;N           # Pi         LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+00AC;Na          # Sm         NOT SIGN
+00AD;A           # Cf         SOFT HYPHEN
+00AE;A           # So         REGISTERED SIGN
+00AF;Na          # Sk         MACRON
+00B0;A           # So         DEGREE SIGN
+00B1;A           # Sm         PLUS-MINUS SIGN
+00B2..00B3;A     # No     [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE
+00B4;A           # Sk         ACUTE ACCENT
+00B5;N           # Ll         MICRO SIGN
+00B6..00B7;A     # Po     [2] PILCROW SIGN..MIDDLE DOT
+00B8;A           # Sk         CEDILLA
+00B9;A           # No         SUPERSCRIPT ONE
+00BA;A           # Lo         MASCULINE ORDINAL INDICATOR
+00BB;N           # Pf         RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+00BC..00BE;A     # No     [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS
+00BF;A           # Po         INVERTED QUESTION MARK
+00C0..00C5;N     # Lu     [6] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER A WITH RING ABOVE
+00C6;A           # Lu         LATIN CAPITAL LETTER AE
+00C7..00CF;N     # Lu     [9] LATIN CAPITAL LETTER C WITH CEDILLA..LATIN CAPITAL LETTER I WITH DIAERESIS
+00D0;A           # Lu         LATIN CAPITAL LETTER ETH
+00D1..00D6;N     # Lu     [6] LATIN CAPITAL LETTER N WITH TILDE..LATIN CAPITAL LETTER O WITH DIAERESIS
+00D7;A           # Sm         MULTIPLICATION SIGN
+00D8;A           # Lu         LATIN CAPITAL LETTER O WITH STROKE
+00D9..00DD;N     # Lu     [5] LATIN CAPITAL LETTER U WITH GRAVE..LATIN CAPITAL LETTER Y WITH ACUTE
+00DE..00E1;A     # L&     [4] LATIN CAPITAL LETTER THORN..LATIN SMALL LETTER A WITH ACUTE
+00E2..00E5;N     # Ll     [4] LATIN SMALL LETTER A WITH CIRCUMFLEX..LATIN SMALL LETTER A WITH RING ABOVE
+00E6;A           # Ll         LATIN SMALL LETTER AE
+00E7;N           # Ll         LATIN SMALL LETTER C WITH CEDILLA
+00E8..00EA;A     # Ll     [3] LATIN SMALL LETTER E WITH GRAVE..LATIN SMALL LETTER E WITH CIRCUMFLEX
+00EB;N           # Ll         LATIN SMALL LETTER E WITH DIAERESIS
+00EC..00ED;A     # Ll     [2] LATIN SMALL LETTER I WITH GRAVE..LATIN SMALL LETTER I WITH ACUTE
+00EE..00EF;N     # Ll     [2] LATIN SMALL LETTER I WITH CIRCUMFLEX..LATIN SMALL LETTER I WITH DIAERESIS
+00F0;A           # Ll         LATIN SMALL LETTER ETH
+00F1;N           # Ll         LATIN SMALL LETTER N WITH TILDE
+00F2..00F3;A     # Ll     [2] LATIN SMALL LETTER O WITH GRAVE..LATIN SMALL LETTER O WITH ACUTE
+00F4..00F6;N     # Ll     [3] LATIN SMALL LETTER O WITH CIRCUMFLEX..LATIN SMALL LETTER O WITH DIAERESIS
+00F7;A           # Sm         DIVISION SIGN
+00F8..00FA;A     # Ll     [3] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER U WITH ACUTE
+00FB;N           # Ll         LATIN SMALL LETTER U WITH CIRCUMFLEX
+00FC;A           # Ll         LATIN SMALL LETTER U WITH DIAERESIS
+00FD;N           # Ll         LATIN SMALL LETTER Y WITH ACUTE
+00FE;A           # Ll         LATIN SMALL LETTER THORN
+00FF;N           # Ll         LATIN SMALL LETTER Y WITH DIAERESIS
+0100;N           # Lu         LATIN CAPITAL LETTER A WITH MACRON
+0101;A           # Ll         LATIN SMALL LETTER A WITH MACRON
+0102..0110;N     # L&    [15] LATIN CAPITAL LETTER A WITH BREVE..LATIN CAPITAL LETTER D WITH STROKE
+0111;A           # Ll         LATIN SMALL LETTER D WITH STROKE
+0112;N           # Lu         LATIN CAPITAL LETTER E WITH MACRON
+0113;A           # Ll         LATIN SMALL LETTER E WITH MACRON
+0114..011A;N     # L&     [7] LATIN CAPITAL LETTER E WITH BREVE..LATIN CAPITAL LETTER E WITH CARON
+011B;A           # Ll         LATIN SMALL LETTER E WITH CARON
+011C..0125;N     # L&    [10] LATIN CAPITAL LETTER G WITH CIRCUMFLEX..LATIN SMALL LETTER H WITH CIRCUMFLEX
+0126..0127;A     # L&     [2] LATIN CAPITAL LETTER H WITH STROKE..LATIN SMALL LETTER H WITH STROKE
+0128..012A;N     # L&     [3] LATIN CAPITAL LETTER I WITH TILDE..LATIN CAPITAL LETTER I WITH MACRON
+012B;A           # Ll         LATIN SMALL LETTER I WITH MACRON
+012C..0130;N     # L&     [5] LATIN CAPITAL LETTER I WITH BREVE..LATIN CAPITAL LETTER I WITH DOT ABOVE
+0131..0133;A     # L&     [3] LATIN SMALL LETTER DOTLESS I..LATIN SMALL LIGATURE IJ
+0134..0137;N     # L&     [4] LATIN CAPITAL LETTER J WITH CIRCUMFLEX..LATIN SMALL LETTER K WITH CEDILLA
+0138;A           # Ll         LATIN SMALL LETTER KRA
+0139..013E;N     # L&     [6] LATIN CAPITAL LETTER L WITH ACUTE..LATIN SMALL LETTER L WITH CARON
+013F..0142;A     # L&     [4] LATIN CAPITAL LETTER L WITH MIDDLE DOT..LATIN SMALL LETTER L WITH STROKE
+0143;N           # Lu         LATIN CAPITAL LETTER N WITH ACUTE
+0144;A           # Ll         LATIN SMALL LETTER N WITH ACUTE
+0145..0147;N     # L&     [3] LATIN CAPITAL LETTER N WITH CEDILLA..LATIN CAPITAL LETTER N WITH CARON
+0148..014B;A     # L&     [4] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER ENG
+014C;N           # Lu         LATIN CAPITAL LETTER O WITH MACRON
+014D;A           # Ll         LATIN SMALL LETTER O WITH MACRON
+014E..0151;N     # L&     [4] LATIN CAPITAL LETTER O WITH BREVE..LATIN SMALL LETTER O WITH DOUBLE ACUTE
+0152..0153;A     # L&     [2] LATIN CAPITAL LIGATURE OE..LATIN SMALL LIGATURE OE
+0154..0165;N     # L&    [18] LATIN CAPITAL LETTER R WITH ACUTE..LATIN SMALL LETTER T WITH CARON
+0166..0167;A     # L&     [2] LATIN CAPITAL LETTER T WITH STROKE..LATIN SMALL LETTER T WITH STROKE
+0168..016A;N     # L&     [3] LATIN CAPITAL LETTER U WITH TILDE..LATIN CAPITAL LETTER U WITH MACRON
+016B;A           # Ll         LATIN SMALL LETTER U WITH MACRON
+016C..017F;N     # L&    [20] LATIN CAPITAL LETTER U WITH BREVE..LATIN SMALL LETTER LONG S
+0180..01BA;N     # L&    [59] LATIN SMALL LETTER B WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL
+01BB;N           # Lo         LATIN LETTER TWO WITH STROKE
+01BC..01BF;N     # L&     [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN
+01C0..01C3;N     # Lo     [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK
+01C4..01CD;N     # L&    [10] LATIN CAPITAL LETTER DZ WITH CARON..LATIN CAPITAL LETTER A WITH CARON
+01CE;A           # Ll         LATIN SMALL LETTER A WITH CARON
+01CF;N           # Lu         LATIN CAPITAL LETTER I WITH CARON
+01D0;A           # Ll         LATIN SMALL LETTER I WITH CARON
+01D1;N           # Lu         LATIN CAPITAL LETTER O WITH CARON
+01D2;A           # Ll         LATIN SMALL LETTER O WITH CARON
+01D3;N           # Lu         LATIN CAPITAL LETTER U WITH CARON
+01D4;A           # Ll         LATIN SMALL LETTER U WITH CARON
+01D5;N           # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
+01D6;A           # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
+01D7;N           # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
+01D8;A           # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE
+01D9;N           # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
+01DA;A           # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND CARON
+01DB;N           # Lu         LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
+01DC;A           # Ll         LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE
+01DD..024F;N     # L&   [115] LATIN SMALL LETTER TURNED E..LATIN SMALL LETTER Y WITH STROKE
+0250;N           # Ll         LATIN SMALL LETTER TURNED A
+0251;A           # Ll         LATIN SMALL LETTER ALPHA
+0252..0260;N     # Ll    [15] LATIN SMALL LETTER TURNED ALPHA..LATIN SMALL LETTER G WITH HOOK
+0261;A           # Ll         LATIN SMALL LETTER SCRIPT G
+0262..0293;N     # Ll    [50] LATIN LETTER SMALL CAPITAL G..LATIN SMALL LETTER EZH WITH CURL
+0294;N           # Lo         LATIN LETTER GLOTTAL STOP
+0295..02AF;N     # Ll    [27] LATIN LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
+02B0..02C1;N     # Lm    [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP
+02C2..02C3;N     # Sk     [2] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER RIGHT ARROWHEAD
+02C4;A           # Sk         MODIFIER LETTER UP ARROWHEAD
+02C5;N           # Sk         MODIFIER LETTER DOWN ARROWHEAD
+02C6;N           # Lm         MODIFIER LETTER CIRCUMFLEX ACCENT
+02C7;A           # Lm         CARON
+02C8;N           # Lm         MODIFIER LETTER VERTICAL LINE
+02C9..02CB;A     # Lm     [3] MODIFIER LETTER MACRON..MODIFIER LETTER GRAVE ACCENT
+02CC;N           # Lm         MODIFIER LETTER LOW VERTICAL LINE
+02CD;A           # Lm         MODIFIER LETTER LOW MACRON
+02CE..02CF;N     # Lm     [2] MODIFIER LETTER LOW GRAVE ACCENT..MODIFIER LETTER LOW ACUTE ACCENT
+02D0;A           # Lm         MODIFIER LETTER TRIANGULAR COLON
+02D1;N           # Lm         MODIFIER LETTER HALF TRIANGULAR COLON
+02D2..02D7;N     # Sk     [6] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER MINUS SIGN
+02D8..02DB;A     # Sk     [4] BREVE..OGONEK
+02DC;N           # Sk         SMALL TILDE
+02DD;A           # Sk         DOUBLE ACUTE ACCENT
+02DE;N           # Sk         MODIFIER LETTER RHOTIC HOOK
+02DF;A           # Sk         MODIFIER LETTER CROSS ACCENT
+02E0..02E4;N     # Lm     [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
+02E5..02EB;N     # Sk     [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK
+02EC;N           # Lm         MODIFIER LETTER VOICING
+02ED;N           # Sk         MODIFIER LETTER UNASPIRATED
+02EE;N           # Lm         MODIFIER LETTER DOUBLE APOSTROPHE
+02EF..02FF;N     # Sk    [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW
+0300..036F;A     # Mn   [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X
+0370..0373;N     # L&     [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI
+0374;N           # Lm         GREEK NUMERAL SIGN
+0375;N           # Sk         GREEK LOWER NUMERAL SIGN
+0376..0377;N     # L&     [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
+037A;N           # Lm         GREEK YPOGEGRAMMENI
+037B..037D;N     # Ll     [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
+037E;N           # Po         GREEK QUESTION MARK
+037F;N           # Lu         GREEK CAPITAL LETTER YOT
+0384..0385;N     # Sk     [2] GREEK TONOS..GREEK DIALYTIKA TONOS
+0386;N           # Lu         GREEK CAPITAL LETTER ALPHA WITH TONOS
+0387;N           # Po         GREEK ANO TELEIA
+0388..038A;N     # Lu     [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS
+038C;N           # Lu         GREEK CAPITAL LETTER OMICRON WITH TONOS
+038E..0390;N     # L&     [3] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+0391..03A1;A     # Lu    [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO
+03A3..03A9;A     # Lu     [7] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER OMEGA
+03AA..03B0;N     # L&     [7] GREEK CAPITAL LETTER IOTA WITH DIALYTIKA..GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+03B1..03C1;A     # Ll    [17] GREEK SMALL LETTER ALPHA..GREEK SMALL LETTER RHO
+03C2;N           # Ll         GREEK SMALL LETTER FINAL SIGMA
+03C3..03C9;A     # Ll     [7] GREEK SMALL LETTER SIGMA..GREEK SMALL LETTER OMEGA
+03CA..03F5;N     # L&    [44] GREEK SMALL LETTER IOTA WITH DIALYTIKA..GREEK LUNATE EPSILON SYMBOL
+03F6;N           # Sm         GREEK REVERSED LUNATE EPSILON SYMBOL
+03F7..03FF;N     # L&     [9] GREEK CAPITAL LETTER SHO..GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
+0400;N           # Lu         CYRILLIC CAPITAL LETTER IE WITH GRAVE
+0401;A           # Lu         CYRILLIC CAPITAL LETTER IO
+0402..040F;N     # Lu    [14] CYRILLIC CAPITAL LETTER DJE..CYRILLIC CAPITAL LETTER DZHE
+0410..044F;A     # L&    [64] CYRILLIC CAPITAL LETTER A..CYRILLIC SMALL LETTER YA
+0450;N           # Ll         CYRILLIC SMALL LETTER IE WITH GRAVE
+0451;A           # Ll         CYRILLIC SMALL LETTER IO
+0452..0481;N     # L&    [48] CYRILLIC SMALL LETTER DJE..CYRILLIC SMALL LETTER KOPPA
+0482;N           # So         CYRILLIC THOUSANDS SIGN
+0483..0487;N     # Mn     [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE
+0488..0489;N     # Me     [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
+048A..04FF;N     # L&   [118] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER HA WITH STROKE
+0500..052F;N     # L&    [48] CYRILLIC CAPITAL LETTER KOMI DE..CYRILLIC SMALL LETTER EL WITH DESCENDER
+0531..0556;N     # Lu    [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH
+0559;N           # Lm         ARMENIAN MODIFIER LETTER LEFT HALF RING
+055A..055F;N     # Po     [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK
+0560..0588;N     # Ll    [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
+0589;N           # Po         ARMENIAN FULL STOP
+058A;N           # Pd         ARMENIAN HYPHEN
+058D..058E;N     # So     [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN
+058F;N           # Sc         ARMENIAN DRAM SIGN
+0591..05BD;N     # Mn    [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
+05BE;N           # Pd         HEBREW PUNCTUATION MAQAF
+05BF;N           # Mn         HEBREW POINT RAFE
+05C0;N           # Po         HEBREW PUNCTUATION PASEQ
+05C1..05C2;N     # Mn     [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT
+05C3;N           # Po         HEBREW PUNCTUATION SOF PASUQ
+05C4..05C5;N     # Mn     [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
+05C6;N           # Po         HEBREW PUNCTUATION NUN HAFUKHA
+05C7;N           # Mn         HEBREW POINT QAMATS QATAN
+05D0..05EA;N     # Lo    [27] HEBREW LETTER ALEF..HEBREW LETTER TAV
+05EF..05F2;N     # Lo     [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD
+05F3..05F4;N     # Po     [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM
+0600..0605;N     # Cf     [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
+0606..0608;N     # Sm     [3] ARABIC-INDIC CUBE ROOT..ARABIC RAY
+0609..060A;N     # Po     [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN
+060B;N           # Sc         AFGHANI SIGN
+060C..060D;N     # Po     [2] ARABIC COMMA..ARABIC DATE SEPARATOR
+060E..060F;N     # So     [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA
+0610..061A;N     # Mn    [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
+061B;N           # Po         ARABIC SEMICOLON
+061C;N           # Cf         ARABIC LETTER MARK
+061D..061F;N     # Po     [3] ARABIC END OF TEXT MARK..ARABIC QUESTION MARK
+0620..063F;N     # Lo    [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
+0640;N           # Lm         ARABIC TATWEEL
+0641..064A;N     # Lo    [10] ARABIC LETTER FEH..ARABIC LETTER YEH
+064B..065F;N     # Mn    [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW
+0660..0669;N     # Nd    [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
+066A..066D;N     # Po     [4] ARABIC PERCENT SIGN..ARABIC FIVE POINTED STAR
+066E..066F;N     # Lo     [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF
+0670;N           # Mn         ARABIC LETTER SUPERSCRIPT ALEF
+0671..06D3;N     # Lo    [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
+06D4;N           # Po         ARABIC FULL STOP
+06D5;N           # Lo         ARABIC LETTER AE
+06D6..06DC;N     # Mn     [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN
+06DD;N           # Cf         ARABIC END OF AYAH
+06DE;N           # So         ARABIC START OF RUB EL HIZB
+06DF..06E4;N     # Mn     [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA
+06E5..06E6;N     # Lm     [2] ARABIC SMALL WAW..ARABIC SMALL YEH
+06E7..06E8;N     # Mn     [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON
+06E9;N           # So         ARABIC PLACE OF SAJDAH
+06EA..06ED;N     # Mn     [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM
+06EE..06EF;N     # Lo     [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V
+06F0..06F9;N     # Nd    [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE
+06FA..06FC;N     # Lo     [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW
+06FD..06FE;N     # So     [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN
+06FF;N           # Lo         ARABIC LETTER HEH WITH INVERTED V
+0700..070D;N     # Po    [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS
+070F;N           # Cf         SYRIAC ABBREVIATION MARK
+0710;N           # Lo         SYRIAC LETTER ALAPH
+0711;N           # Mn         SYRIAC LETTER SUPERSCRIPT ALAPH
+0712..072F;N     # Lo    [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH
+0730..074A;N     # Mn    [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH
+074D..074F;N     # Lo     [3] SYRIAC LETTER SOGDIAN ZHAIN..SYRIAC LETTER SOGDIAN FE
+0750..077F;N     # Lo    [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
+0780..07A5;N     # Lo    [38] THAANA LETTER HAA..THAANA LETTER WAAVU
+07A6..07B0;N     # Mn    [11] THAANA ABAFILI..THAANA SUKUN
+07B1;N           # Lo         THAANA LETTER NAA
+07C0..07C9;N     # Nd    [10] NKO DIGIT ZERO..NKO DIGIT NINE
+07CA..07EA;N     # Lo    [33] NKO LETTER A..NKO LETTER JONA RA
+07EB..07F3;N     # Mn     [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE
+07F4..07F5;N     # Lm     [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE
+07F6;N           # So         NKO SYMBOL OO DENNEN
+07F7..07F9;N     # Po     [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK
+07FA;N           # Lm         NKO LAJANYALAN
+07FD;N           # Mn         NKO DANTAYALAN
+07FE..07FF;N     # Sc     [2] NKO DOROME SIGN..NKO TAMAN SIGN
+0800..0815;N     # Lo    [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
+0816..0819;N     # Mn     [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH
+081A;N           # Lm         SAMARITAN MODIFIER LETTER EPENTHETIC YUT
+081B..0823;N     # Mn     [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
+0824;N           # Lm         SAMARITAN MODIFIER LETTER SHORT A
+0825..0827;N     # Mn     [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
+0828;N           # Lm         SAMARITAN MODIFIER LETTER I
+0829..082D;N     # Mn     [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
+0830..083E;N     # Po    [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU
+0840..0858;N     # Lo    [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
+0859..085B;N     # Mn     [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
+085E;N           # Po         MANDAIC PUNCTUATION
+0860..086A;N     # Lo    [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
+0870..0887;N     # Lo    [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
+0888;N           # Sk         ARABIC RAISED ROUND DOT
+0889..088E;N     # Lo     [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
+0890..0891;N     # Cf     [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE
+0898..089F;N     # Mn     [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA
+08A0..08C8;N     # Lo    [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
+08C9;N           # Lm         ARABIC SMALL FARSI YEH
+08CA..08E1;N     # Mn    [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA
+08E2;N           # Cf         ARABIC DISPUTED END OF AYAH
+08E3..08FF;N     # Mn    [29] ARABIC TURNED DAMMA BELOW..ARABIC MARK SIDEWAYS NOON GHUNNA
+0900..0902;N     # Mn     [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA
+0903;N           # Mc         DEVANAGARI SIGN VISARGA
+0904..0939;N     # Lo    [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
+093A;N           # Mn         DEVANAGARI VOWEL SIGN OE
+093B;N           # Mc         DEVANAGARI VOWEL SIGN OOE
+093C;N           # Mn         DEVANAGARI SIGN NUKTA
+093D;N           # Lo         DEVANAGARI SIGN AVAGRAHA
+093E..0940;N     # Mc     [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II
+0941..0948;N     # Mn     [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI
+0949..094C;N     # Mc     [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU
+094D;N           # Mn         DEVANAGARI SIGN VIRAMA
+094E..094F;N     # Mc     [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW
+0950;N           # Lo         DEVANAGARI OM
+0951..0957;N     # Mn     [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE
+0958..0961;N     # Lo    [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL
+0962..0963;N     # Mn     [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL
+0964..0965;N     # Po     [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
+0966..096F;N     # Nd    [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
+0970;N           # Po         DEVANAGARI ABBREVIATION SIGN
+0971;N           # Lm         DEVANAGARI SIGN HIGH SPACING DOT
+0972..097F;N     # Lo    [14] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER BBA
+0980;N           # Lo         BENGALI ANJI
+0981;N           # Mn         BENGALI SIGN CANDRABINDU
+0982..0983;N     # Mc     [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA
+0985..098C;N     # Lo     [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
+098F..0990;N     # Lo     [2] BENGALI LETTER E..BENGALI LETTER AI
+0993..09A8;N     # Lo    [22] BENGALI LETTER O..BENGALI LETTER NA
+09AA..09B0;N     # Lo     [7] BENGALI LETTER PA..BENGALI LETTER RA
+09B2;N           # Lo         BENGALI LETTER LA
+09B6..09B9;N     # Lo     [4] BENGALI LETTER SHA..BENGALI LETTER HA
+09BC;N           # Mn         BENGALI SIGN NUKTA
+09BD;N           # Lo         BENGALI SIGN AVAGRAHA
+09BE..09C0;N     # Mc     [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II
+09C1..09C4;N     # Mn     [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR
+09C7..09C8;N     # Mc     [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI
+09CB..09CC;N     # Mc     [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU
+09CD;N           # Mn         BENGALI SIGN VIRAMA
+09CE;N           # Lo         BENGALI LETTER KHANDA TA
+09D7;N           # Mc         BENGALI AU LENGTH MARK
+09DC..09DD;N     # Lo     [2] BENGALI LETTER RRA..BENGALI LETTER RHA
+09DF..09E1;N     # Lo     [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL
+09E2..09E3;N     # Mn     [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL
+09E6..09EF;N     # Nd    [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
+09F0..09F1;N     # Lo     [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL
+09F2..09F3;N     # Sc     [2] BENGALI RUPEE MARK..BENGALI RUPEE SIGN
+09F4..09F9;N     # No     [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN
+09FA;N           # So         BENGALI ISSHAR
+09FB;N           # Sc         BENGALI GANDA MARK
+09FC;N           # Lo         BENGALI LETTER VEDIC ANUSVARA
+09FD;N           # Po         BENGALI ABBREVIATION SIGN
+09FE;N           # Mn         BENGALI SANDHI MARK
+0A01..0A02;N     # Mn     [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI
+0A03;N           # Mc         GURMUKHI SIGN VISARGA
+0A05..0A0A;N     # Lo     [6] GURMUKHI LETTER A..GURMUKHI LETTER UU
+0A0F..0A10;N     # Lo     [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI
+0A13..0A28;N     # Lo    [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA
+0A2A..0A30;N     # Lo     [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA
+0A32..0A33;N     # Lo     [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA
+0A35..0A36;N     # Lo     [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA
+0A38..0A39;N     # Lo     [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA
+0A3C;N           # Mn         GURMUKHI SIGN NUKTA
+0A3E..0A40;N     # Mc     [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II
+0A41..0A42;N     # Mn     [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU
+0A47..0A48;N     # Mn     [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI
+0A4B..0A4D;N     # Mn     [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA
+0A51;N           # Mn         GURMUKHI SIGN UDAAT
+0A59..0A5C;N     # Lo     [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA
+0A5E;N           # Lo         GURMUKHI LETTER FA
+0A66..0A6F;N     # Nd    [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
+0A70..0A71;N     # Mn     [2] GURMUKHI TIPPI..GURMUKHI ADDAK
+0A72..0A74;N     # Lo     [3] GURMUKHI IRI..GURMUKHI EK ONKAR
+0A75;N           # Mn         GURMUKHI SIGN YAKASH
+0A76;N           # Po         GURMUKHI ABBREVIATION SIGN
+0A81..0A82;N     # Mn     [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA
+0A83;N           # Mc         GUJARATI SIGN VISARGA
+0A85..0A8D;N     # Lo     [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E
+0A8F..0A91;N     # Lo     [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O
+0A93..0AA8;N     # Lo    [22] GUJARATI LETTER O..GUJARATI LETTER NA
+0AAA..0AB0;N     # Lo     [7] GUJARATI LETTER PA..GUJARATI LETTER RA
+0AB2..0AB3;N     # Lo     [2] GUJARATI LETTER LA..GUJARATI LETTER LLA
+0AB5..0AB9;N     # Lo     [5] GUJARATI LETTER VA..GUJARATI LETTER HA
+0ABC;N           # Mn         GUJARATI SIGN NUKTA
+0ABD;N           # Lo         GUJARATI SIGN AVAGRAHA
+0ABE..0AC0;N     # Mc     [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II
+0AC1..0AC5;N     # Mn     [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E
+0AC7..0AC8;N     # Mn     [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI
+0AC9;N           # Mc         GUJARATI VOWEL SIGN CANDRA O
+0ACB..0ACC;N     # Mc     [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU
+0ACD;N           # Mn         GUJARATI SIGN VIRAMA
+0AD0;N           # Lo         GUJARATI OM
+0AE0..0AE1;N     # Lo     [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL
+0AE2..0AE3;N     # Mn     [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL
+0AE6..0AEF;N     # Nd    [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
+0AF0;N           # Po         GUJARATI ABBREVIATION SIGN
+0AF1;N           # Sc         GUJARATI RUPEE SIGN
+0AF9;N           # Lo         GUJARATI LETTER ZHA
+0AFA..0AFF;N     # Mn     [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
+0B01;N           # Mn         ORIYA SIGN CANDRABINDU
+0B02..0B03;N     # Mc     [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA
+0B05..0B0C;N     # Lo     [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L
+0B0F..0B10;N     # Lo     [2] ORIYA LETTER E..ORIYA LETTER AI
+0B13..0B28;N     # Lo    [22] ORIYA LETTER O..ORIYA LETTER NA
+0B2A..0B30;N     # Lo     [7] ORIYA LETTER PA..ORIYA LETTER RA
+0B32..0B33;N     # Lo     [2] ORIYA LETTER LA..ORIYA LETTER LLA
+0B35..0B39;N     # Lo     [5] ORIYA LETTER VA..ORIYA LETTER HA
+0B3C;N           # Mn         ORIYA SIGN NUKTA
+0B3D;N           # Lo         ORIYA SIGN AVAGRAHA
+0B3E;N           # Mc         ORIYA VOWEL SIGN AA
+0B3F;N           # Mn         ORIYA VOWEL SIGN I
+0B40;N           # Mc         ORIYA VOWEL SIGN II
+0B41..0B44;N     # Mn     [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR
+0B47..0B48;N     # Mc     [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI
+0B4B..0B4C;N     # Mc     [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU
+0B4D;N           # Mn         ORIYA SIGN VIRAMA
+0B55..0B56;N     # Mn     [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK
+0B57;N           # Mc         ORIYA AU LENGTH MARK
+0B5C..0B5D;N     # Lo     [2] ORIYA LETTER RRA..ORIYA LETTER RHA
+0B5F..0B61;N     # Lo     [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL
+0B62..0B63;N     # Mn     [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL
+0B66..0B6F;N     # Nd    [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE
+0B70;N           # So         ORIYA ISSHAR
+0B71;N           # Lo         ORIYA LETTER WA
+0B72..0B77;N     # No     [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS
+0B82;N           # Mn         TAMIL SIGN ANUSVARA
+0B83;N           # Lo         TAMIL SIGN VISARGA
+0B85..0B8A;N     # Lo     [6] TAMIL LETTER A..TAMIL LETTER UU
+0B8E..0B90;N     # Lo     [3] TAMIL LETTER E..TAMIL LETTER AI
+0B92..0B95;N     # Lo     [4] TAMIL LETTER O..TAMIL LETTER KA
+0B99..0B9A;N     # Lo     [2] TAMIL LETTER NGA..TAMIL LETTER CA
+0B9C;N           # Lo         TAMIL LETTER JA
+0B9E..0B9F;N     # Lo     [2] TAMIL LETTER NYA..TAMIL LETTER TTA
+0BA3..0BA4;N     # Lo     [2] TAMIL LETTER NNA..TAMIL LETTER TA
+0BA8..0BAA;N     # Lo     [3] TAMIL LETTER NA..TAMIL LETTER PA
+0BAE..0BB9;N     # Lo    [12] TAMIL LETTER MA..TAMIL LETTER HA
+0BBE..0BBF;N     # Mc     [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I
+0BC0;N           # Mn         TAMIL VOWEL SIGN II
+0BC1..0BC2;N     # Mc     [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU
+0BC6..0BC8;N     # Mc     [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI
+0BCA..0BCC;N     # Mc     [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU
+0BCD;N           # Mn         TAMIL SIGN VIRAMA
+0BD0;N           # Lo         TAMIL OM
+0BD7;N           # Mc         TAMIL AU LENGTH MARK
+0BE6..0BEF;N     # Nd    [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
+0BF0..0BF2;N     # No     [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
+0BF3..0BF8;N     # So     [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN
+0BF9;N           # Sc         TAMIL RUPEE SIGN
+0BFA;N           # So         TAMIL NUMBER SIGN
+0C00;N           # Mn         TELUGU SIGN COMBINING CANDRABINDU ABOVE
+0C01..0C03;N     # Mc     [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
+0C04;N           # Mn         TELUGU SIGN COMBINING ANUSVARA ABOVE
+0C05..0C0C;N     # Lo     [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
+0C0E..0C10;N     # Lo     [3] TELUGU LETTER E..TELUGU LETTER AI
+0C12..0C28;N     # Lo    [23] TELUGU LETTER O..TELUGU LETTER NA
+0C2A..0C39;N     # Lo    [16] TELUGU LETTER PA..TELUGU LETTER HA
+0C3C;N           # Mn         TELUGU SIGN NUKTA
+0C3D;N           # Lo         TELUGU SIGN AVAGRAHA
+0C3E..0C40;N     # Mn     [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
+0C41..0C44;N     # Mc     [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
+0C46..0C48;N     # Mn     [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
+0C4A..0C4D;N     # Mn     [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
+0C55..0C56;N     # Mn     [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
+0C58..0C5A;N     # Lo     [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
+0C5D;N           # Lo         TELUGU LETTER NAKAARA POLLU
+0C60..0C61;N     # Lo     [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
+0C62..0C63;N     # Mn     [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL
+0C66..0C6F;N     # Nd    [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE
+0C77;N           # Po         TELUGU SIGN SIDDHAM
+0C78..0C7E;N     # No     [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR
+0C7F;N           # So         TELUGU SIGN TUUMU
+0C80;N           # Lo         KANNADA SIGN SPACING CANDRABINDU
+0C81;N           # Mn         KANNADA SIGN CANDRABINDU
+0C82..0C83;N     # Mc     [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
+0C84;N           # Po         KANNADA SIGN SIDDHAM
+0C85..0C8C;N     # Lo     [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
+0C8E..0C90;N     # Lo     [3] KANNADA LETTER E..KANNADA LETTER AI
+0C92..0CA8;N     # Lo    [23] KANNADA LETTER O..KANNADA LETTER NA
+0CAA..0CB3;N     # Lo    [10] KANNADA LETTER PA..KANNADA LETTER LLA
+0CB5..0CB9;N     # Lo     [5] KANNADA LETTER VA..KANNADA LETTER HA
+0CBC;N           # Mn         KANNADA SIGN NUKTA
+0CBD;N           # Lo         KANNADA SIGN AVAGRAHA
+0CBE;N           # Mc         KANNADA VOWEL SIGN AA
+0CBF;N           # Mn         KANNADA VOWEL SIGN I
+0CC0..0CC4;N     # Mc     [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR
+0CC6;N           # Mn         KANNADA VOWEL SIGN E
+0CC7..0CC8;N     # Mc     [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI
+0CCA..0CCB;N     # Mc     [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO
+0CCC..0CCD;N     # Mn     [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
+0CD5..0CD6;N     # Mc     [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
+0CDD..0CDE;N     # Lo     [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA
+0CE0..0CE1;N     # Lo     [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
+0CE2..0CE3;N     # Mn     [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
+0CE6..0CEF;N     # Nd    [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
+0CF1..0CF2;N     # Lo     [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3;N           # Mc         KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
+0D00..0D01;N     # Mn     [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
+0D02..0D03;N     # Mc     [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
+0D04..0D0C;N     # Lo     [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
+0D0E..0D10;N     # Lo     [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
+0D12..0D3A;N     # Lo    [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA
+0D3B..0D3C;N     # Mn     [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
+0D3D;N           # Lo         MALAYALAM SIGN AVAGRAHA
+0D3E..0D40;N     # Mc     [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
+0D41..0D44;N     # Mn     [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
+0D46..0D48;N     # Mc     [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI
+0D4A..0D4C;N     # Mc     [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU
+0D4D;N           # Mn         MALAYALAM SIGN VIRAMA
+0D4E;N           # Lo         MALAYALAM LETTER DOT REPH
+0D4F;N           # So         MALAYALAM SIGN PARA
+0D54..0D56;N     # Lo     [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL
+0D57;N           # Mc         MALAYALAM AU LENGTH MARK
+0D58..0D5E;N     # No     [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH
+0D5F..0D61;N     # Lo     [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL
+0D62..0D63;N     # Mn     [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL
+0D66..0D6F;N     # Nd    [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE
+0D70..0D78;N     # No     [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS
+0D79;N           # So         MALAYALAM DATE MARK
+0D7A..0D7F;N     # Lo     [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
+0D81;N           # Mn         SINHALA SIGN CANDRABINDU
+0D82..0D83;N     # Mc     [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA
+0D85..0D96;N     # Lo    [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA
+0D9A..0DB1;N     # Lo    [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA
+0DB3..0DBB;N     # Lo     [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA
+0DBD;N           # Lo         SINHALA LETTER DANTAJA LAYANNA
+0DC0..0DC6;N     # Lo     [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA
+0DCA;N           # Mn         SINHALA SIGN AL-LAKUNA
+0DCF..0DD1;N     # Mc     [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA
+0DD2..0DD4;N     # Mn     [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
+0DD6;N           # Mn         SINHALA VOWEL SIGN DIGA PAA-PILLA
+0DD8..0DDF;N     # Mc     [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA
+0DE6..0DEF;N     # Nd    [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE
+0DF2..0DF3;N     # Mc     [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA
+0DF4;N           # Po         SINHALA PUNCTUATION KUNDDALIYA
+0E01..0E30;N     # Lo    [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A
+0E31;N           # Mn         THAI CHARACTER MAI HAN-AKAT
+0E32..0E33;N     # Lo     [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM
+0E34..0E3A;N     # Mn     [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU
+0E3F;N           # Sc         THAI CURRENCY SYMBOL BAHT
+0E40..0E45;N     # Lo     [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO
+0E46;N           # Lm         THAI CHARACTER MAIYAMOK
+0E47..0E4E;N     # Mn     [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
+0E4F;N           # Po         THAI CHARACTER FONGMAN
+0E50..0E59;N     # Nd    [10] THAI DIGIT ZERO..THAI DIGIT NINE
+0E5A..0E5B;N     # Po     [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT
+0E81..0E82;N     # Lo     [2] LAO LETTER KO..LAO LETTER KHO SUNG
+0E84;N           # Lo         LAO LETTER KHO TAM
+0E86..0E8A;N     # Lo     [5] LAO LETTER PALI GHA..LAO LETTER SO TAM
+0E8C..0EA3;N     # Lo    [24] LAO LETTER PALI JHA..LAO LETTER LO LING
+0EA5;N           # Lo         LAO LETTER LO LOOT
+0EA7..0EB0;N     # Lo    [10] LAO LETTER WO..LAO VOWEL SIGN A
+0EB1;N           # Mn         LAO VOWEL SIGN MAI KAN
+0EB2..0EB3;N     # Lo     [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM
+0EB4..0EBC;N     # Mn     [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
+0EBD;N           # Lo         LAO SEMIVOWEL SIGN NYO
+0EC0..0EC4;N     # Lo     [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
+0EC6;N           # Lm         LAO KO LA
+0EC8..0ECE;N     # Mn     [7] LAO TONE MAI EK..LAO YAMAKKAN
+0ED0..0ED9;N     # Nd    [10] LAO DIGIT ZERO..LAO DIGIT NINE
+0EDC..0EDF;N     # Lo     [4] LAO HO NO..LAO LETTER KHMU NYO
+0F00;N           # Lo         TIBETAN SYLLABLE OM
+0F01..0F03;N     # So     [3] TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
+0F04..0F12;N     # Po    [15] TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK RGYA GRAM SHAD
+0F13;N           # So         TIBETAN MARK CARET -DZUD RTAGS ME LONG CAN
+0F14;N           # Po         TIBETAN MARK GTER TSHEG
+0F15..0F17;N     # So     [3] TIBETAN LOGOTYPE SIGN CHAD RTAGS..TIBETAN ASTROLOGICAL SIGN SGRA GCAN -CHAR RTAGS
+0F18..0F19;N     # Mn     [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
+0F1A..0F1F;N     # So     [6] TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RDEL DKAR RDEL NAG
+0F20..0F29;N     # Nd    [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE
+0F2A..0F33;N     # No    [10] TIBETAN DIGIT HALF ONE..TIBETAN DIGIT HALF ZERO
+0F34;N           # So         TIBETAN MARK BSDUS RTAGS
+0F35;N           # Mn         TIBETAN MARK NGAS BZUNG NYI ZLA
+0F36;N           # So         TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN
+0F37;N           # Mn         TIBETAN MARK NGAS BZUNG SGOR RTAGS
+0F38;N           # So         TIBETAN MARK CHE MGO
+0F39;N           # Mn         TIBETAN MARK TSA -PHRU
+0F3A;N           # Ps         TIBETAN MARK GUG RTAGS GYON
+0F3B;N           # Pe         TIBETAN MARK GUG RTAGS GYAS
+0F3C;N           # Ps         TIBETAN MARK ANG KHANG GYON
+0F3D;N           # Pe         TIBETAN MARK ANG KHANG GYAS
+0F3E..0F3F;N     # Mc     [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES
+0F40..0F47;N     # Lo     [8] TIBETAN LETTER KA..TIBETAN LETTER JA
+0F49..0F6C;N     # Lo    [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA
+0F71..0F7E;N     # Mn    [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
+0F7F;N           # Mc         TIBETAN SIGN RNAM BCAD
+0F80..0F84;N     # Mn     [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA
+0F85;N           # Po         TIBETAN MARK PALUTA
+0F86..0F87;N     # Mn     [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
+0F88..0F8C;N     # Lo     [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN
+0F8D..0F97;N     # Mn    [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
+0F99..0FBC;N     # Mn    [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
+0FBE..0FC5;N     # So     [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE
+0FC6;N           # Mn         TIBETAN SYMBOL PADMA GDAN
+0FC7..0FCC;N     # So     [6] TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL
+0FCE..0FCF;N     # So     [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM
+0FD0..0FD4;N     # Po     [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA
+0FD5..0FD8;N     # So     [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS
+0FD9..0FDA;N     # Po     [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS
+1000..102A;N     # Lo    [43] MYANMAR LETTER KA..MYANMAR LETTER AU
+102B..102C;N     # Mc     [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA
+102D..1030;N     # Mn     [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU
+1031;N           # Mc         MYANMAR VOWEL SIGN E
+1032..1037;N     # Mn     [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW
+1038;N           # Mc         MYANMAR SIGN VISARGA
+1039..103A;N     # Mn     [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT
+103B..103C;N     # Mc     [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA
+103D..103E;N     # Mn     [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA
+103F;N           # Lo         MYANMAR LETTER GREAT SA
+1040..1049;N     # Nd    [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
+104A..104F;N     # Po     [6] MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL GENITIVE
+1050..1055;N     # Lo     [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL
+1056..1057;N     # Mc     [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR
+1058..1059;N     # Mn     [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL
+105A..105D;N     # Lo     [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE
+105E..1060;N     # Mn     [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA
+1061;N           # Lo         MYANMAR LETTER SGAW KAREN SHA
+1062..1064;N     # Mc     [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO
+1065..1066;N     # Lo     [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA
+1067..106D;N     # Mc     [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5
+106E..1070;N     # Lo     [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA
+1071..1074;N     # Mn     [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE
+1075..1081;N     # Lo    [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA
+1082;N           # Mn         MYANMAR CONSONANT SIGN SHAN MEDIAL WA
+1083..1084;N     # Mc     [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E
+1085..1086;N     # Mn     [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y
+1087..108C;N     # Mc     [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3
+108D;N           # Mn         MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE
+108E;N           # Lo         MYANMAR LETTER RUMAI PALAUNG FA
+108F;N           # Mc         MYANMAR SIGN RUMAI PALAUNG TONE-5
+1090..1099;N     # Nd    [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE
+109A..109C;N     # Mc     [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
+109D;N           # Mn         MYANMAR VOWEL SIGN AITON AI
+109E..109F;N     # So     [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION
+10A0..10C5;N     # Lu    [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE
+10C7;N           # Lu         GEORGIAN CAPITAL LETTER YN
+10CD;N           # Lu         GEORGIAN CAPITAL LETTER AEN
+10D0..10FA;N     # Ll    [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
+10FB;N           # Po         GEORGIAN PARAGRAPH SEPARATOR
+10FC;N           # Lm         MODIFIER LETTER GEORGIAN NAR
+10FD..10FF;N     # Ll     [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
+1100..115F;W     # Lo    [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER
+1160..11FF;N     # Lo   [160] HANGUL JUNGSEONG FILLER..HANGUL JONGSEONG SSANGNIEUN
+1200..1248;N     # Lo    [73] ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE QWA
+124A..124D;N     # Lo     [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE
+1250..1256;N     # Lo     [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO
+1258;N           # Lo         ETHIOPIC SYLLABLE QHWA
+125A..125D;N     # Lo     [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE
+1260..1288;N     # Lo    [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA
+128A..128D;N     # Lo     [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE
+1290..12B0;N     # Lo    [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA
+12B2..12B5;N     # Lo     [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE
+12B8..12BE;N     # Lo     [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO
+12C0;N           # Lo         ETHIOPIC SYLLABLE KXWA
+12C2..12C5;N     # Lo     [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE
+12C8..12D6;N     # Lo    [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O
+12D8..1310;N     # Lo    [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA
+1312..1315;N     # Lo     [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE
+1318..135A;N     # Lo    [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA
+135D..135F;N     # Mn     [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
+1360..1368;N     # Po     [9] ETHIOPIC SECTION MARK..ETHIOPIC PARAGRAPH SEPARATOR
+1369..137C;N     # No    [20] ETHIOPIC DIGIT ONE..ETHIOPIC NUMBER TEN THOUSAND
+1380..138F;N     # Lo    [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE
+1390..1399;N     # So    [10] ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MARK KURT
+13A0..13F5;N     # Lu    [86] CHEROKEE LETTER A..CHEROKEE LETTER MV
+13F8..13FD;N     # Ll     [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
+1400;N           # Pd         CANADIAN SYLLABICS HYPHEN
+1401..166C;N     # Lo   [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA
+166D;N           # So         CANADIAN SYLLABICS CHI SIGN
+166E;N           # Po         CANADIAN SYLLABICS FULL STOP
+166F..167F;N     # Lo    [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W
+1680;N           # Zs         OGHAM SPACE MARK
+1681..169A;N     # Lo    [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH
+169B;N           # Ps         OGHAM FEATHER MARK
+169C;N           # Pe         OGHAM REVERSED FEATHER MARK
+16A0..16EA;N     # Lo    [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
+16EB..16ED;N     # Po     [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION
+16EE..16F0;N     # Nl     [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL
+16F1..16F8;N     # Lo     [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
+1700..1711;N     # Lo    [18] TAGALOG LETTER A..TAGALOG LETTER HA
+1712..1714;N     # Mn     [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
+1715;N           # Mc         TAGALOG SIGN PAMUDPOD
+171F;N           # Lo         TAGALOG LETTER ARCHAIC RA
+1720..1731;N     # Lo    [18] HANUNOO LETTER A..HANUNOO LETTER HA
+1732..1733;N     # Mn     [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U
+1734;N           # Mc         HANUNOO SIGN PAMUDPOD
+1735..1736;N     # Po     [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
+1740..1751;N     # Lo    [18] BUHID LETTER A..BUHID LETTER HA
+1752..1753;N     # Mn     [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
+1760..176C;N     # Lo    [13] TAGBANWA LETTER A..TAGBANWA LETTER YA
+176E..1770;N     # Lo     [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA
+1772..1773;N     # Mn     [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
+1780..17B3;N     # Lo    [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU
+17B4..17B5;N     # Mn     [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
+17B6;N           # Mc         KHMER VOWEL SIGN AA
+17B7..17BD;N     # Mn     [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA
+17BE..17C5;N     # Mc     [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU
+17C6;N           # Mn         KHMER SIGN NIKAHIT
+17C7..17C8;N     # Mc     [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU
+17C9..17D3;N     # Mn    [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
+17D4..17D6;N     # Po     [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH
+17D7;N           # Lm         KHMER SIGN LEK TOO
+17D8..17DA;N     # Po     [3] KHMER SIGN BEYYAL..KHMER SIGN KOOMUUT
+17DB;N           # Sc         KHMER CURRENCY SYMBOL RIEL
+17DC;N           # Lo         KHMER SIGN AVAKRAHASANYA
+17DD;N           # Mn         KHMER SIGN ATTHACAN
+17E0..17E9;N     # Nd    [10] KHMER DIGIT ZERO..KHMER DIGIT NINE
+17F0..17F9;N     # No    [10] KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK ATTAK PRAM-BUON
+1800..1805;N     # Po     [6] MONGOLIAN BIRGA..MONGOLIAN FOUR DOTS
+1806;N           # Pd         MONGOLIAN TODO SOFT HYPHEN
+1807..180A;N     # Po     [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU
+180B..180D;N     # Mn     [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
+180E;N           # Cf         MONGOLIAN VOWEL SEPARATOR
+180F;N           # Mn         MONGOLIAN FREE VARIATION SELECTOR FOUR
+1810..1819;N     # Nd    [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE
+1820..1842;N     # Lo    [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
+1843;N           # Lm         MONGOLIAN LETTER TODO LONG VOWEL SIGN
+1844..1878;N     # Lo    [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS
+1880..1884;N     # Lo     [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA
+1885..1886;N     # Mn     [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
+1887..18A8;N     # Lo    [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA
+18A9;N           # Mn         MONGOLIAN LETTER ALI GALI DAGALGA
+18AA;N           # Lo         MONGOLIAN LETTER MANCHU ALI GALI LHA
+18B0..18F5;N     # Lo    [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S
+1900..191E;N     # Lo    [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA
+1920..1922;N     # Mn     [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
+1923..1926;N     # Mc     [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
+1927..1928;N     # Mn     [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
+1929..192B;N     # Mc     [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA
+1930..1931;N     # Mc     [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA
+1932;N           # Mn         LIMBU SMALL LETTER ANUSVARA
+1933..1938;N     # Mc     [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA
+1939..193B;N     # Mn     [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I
+1940;N           # So         LIMBU SIGN LOO
+1944..1945;N     # Po     [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
+1946..194F;N     # Nd    [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE
+1950..196D;N     # Lo    [30] TAI LE LETTER KA..TAI LE LETTER AI
+1970..1974;N     # Lo     [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6
+1980..19AB;N     # Lo    [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA
+19B0..19C9;N     # Lo    [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2
+19D0..19D9;N     # Nd    [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE
+19DA;N           # No         NEW TAI LUE THAM DIGIT ONE
+19DE..19DF;N     # So     [2] NEW TAI LUE SIGN LAE..NEW TAI LUE SIGN LAEV
+19E0..19FF;N     # So    [32] KHMER SYMBOL PATHAMASAT..KHMER SYMBOL DAP-PRAM ROC
+1A00..1A16;N     # Lo    [23] BUGINESE LETTER KA..BUGINESE LETTER HA
+1A17..1A18;N     # Mn     [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
+1A19..1A1A;N     # Mc     [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
+1A1B;N           # Mn         BUGINESE VOWEL SIGN AE
+1A1E..1A1F;N     # Po     [2] BUGINESE PALLAWA..BUGINESE END OF SECTION
+1A20..1A54;N     # Lo    [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA
+1A55;N           # Mc         TAI THAM CONSONANT SIGN MEDIAL RA
+1A56;N           # Mn         TAI THAM CONSONANT SIGN MEDIAL LA
+1A57;N           # Mc         TAI THAM CONSONANT SIGN LA TANG LAI
+1A58..1A5E;N     # Mn     [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA
+1A60;N           # Mn         TAI THAM SIGN SAKOT
+1A61;N           # Mc         TAI THAM VOWEL SIGN A
+1A62;N           # Mn         TAI THAM VOWEL SIGN MAI SAT
+1A63..1A64;N     # Mc     [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA
+1A65..1A6C;N     # Mn     [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW
+1A6D..1A72;N     # Mc     [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI
+1A73..1A7C;N     # Mn    [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN
+1A7F;N           # Mn         TAI THAM COMBINING CRYPTOGRAMMIC DOT
+1A80..1A89;N     # Nd    [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE
+1A90..1A99;N     # Nd    [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE
+1AA0..1AA6;N     # Po     [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA
+1AA7;N           # Lm         TAI THAM SIGN MAI YAMOK
+1AA8..1AAD;N     # Po     [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG
+1AB0..1ABD;N     # Mn    [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
+1ABE;N           # Me         COMBINING PARENTHESES OVERLAY
+1ABF..1ACE;N     # Mn    [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T
+1B00..1B03;N     # Mn     [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
+1B04;N           # Mc         BALINESE SIGN BISAH
+1B05..1B33;N     # Lo    [47] BALINESE LETTER AKARA..BALINESE LETTER HA
+1B34;N           # Mn         BALINESE SIGN REREKAN
+1B35;N           # Mc         BALINESE VOWEL SIGN TEDUNG
+1B36..1B3A;N     # Mn     [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
+1B3B;N           # Mc         BALINESE VOWEL SIGN RA REPA TEDUNG
+1B3C;N           # Mn         BALINESE VOWEL SIGN LA LENGA
+1B3D..1B41;N     # Mc     [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG
+1B42;N           # Mn         BALINESE VOWEL SIGN PEPET
+1B43..1B44;N     # Mc     [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG
+1B45..1B4C;N     # Lo     [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
+1B50..1B59;N     # Nd    [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE
+1B5A..1B60;N     # Po     [7] BALINESE PANTI..BALINESE PAMENENG
+1B61..1B6A;N     # So    [10] BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE
+1B6B..1B73;N     # Mn     [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
+1B74..1B7C;N     # So     [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING
+1B7D..1B7E;N     # Po     [2] BALINESE PANTI LANTANG..BALINESE PAMADA LANTANG
+1B80..1B81;N     # Mn     [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR
+1B82;N           # Mc         SUNDANESE SIGN PANGWISAD
+1B83..1BA0;N     # Lo    [30] SUNDANESE LETTER A..SUNDANESE LETTER HA
+1BA1;N           # Mc         SUNDANESE CONSONANT SIGN PAMINGKAL
+1BA2..1BA5;N     # Mn     [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
+1BA6..1BA7;N     # Mc     [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
+1BA8..1BA9;N     # Mn     [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
+1BAA;N           # Mc         SUNDANESE SIGN PAMAAEH
+1BAB..1BAD;N     # Mn     [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA
+1BAE..1BAF;N     # Lo     [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
+1BB0..1BB9;N     # Nd    [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE
+1BBA..1BBF;N     # Lo     [6] SUNDANESE AVAGRAHA..SUNDANESE LETTER FINAL M
+1BC0..1BE5;N     # Lo    [38] BATAK LETTER A..BATAK LETTER U
+1BE6;N           # Mn         BATAK SIGN TOMPI
+1BE7;N           # Mc         BATAK VOWEL SIGN E
+1BE8..1BE9;N     # Mn     [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
+1BEA..1BEC;N     # Mc     [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O
+1BED;N           # Mn         BATAK VOWEL SIGN KARO O
+1BEE;N           # Mc         BATAK VOWEL SIGN U
+1BEF..1BF1;N     # Mn     [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H
+1BF2..1BF3;N     # Mc     [2] BATAK PANGOLAT..BATAK PANONGONAN
+1BFC..1BFF;N     # Po     [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT
+1C00..1C23;N     # Lo    [36] LEPCHA LETTER KA..LEPCHA LETTER A
+1C24..1C2B;N     # Mc     [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU
+1C2C..1C33;N     # Mn     [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
+1C34..1C35;N     # Mc     [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG
+1C36..1C37;N     # Mn     [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA
+1C3B..1C3F;N     # Po     [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK
+1C40..1C49;N     # Nd    [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE
+1C4D..1C4F;N     # Lo     [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA
+1C50..1C59;N     # Nd    [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE
+1C5A..1C77;N     # Lo    [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH
+1C78..1C7D;N     # Lm     [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD
+1C7E..1C7F;N     # Po     [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD
+1C80..1C88;N     # Ll     [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
+1C90..1CBA;N     # Lu    [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN
+1CBD..1CBF;N     # Lu     [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN
+1CC0..1CC7;N     # Po     [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA
+1CD0..1CD2;N     # Mn     [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
+1CD3;N           # Po         VEDIC SIGN NIHSHVASA
+1CD4..1CE0;N     # Mn    [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
+1CE1;N           # Mc         VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
+1CE2..1CE8;N     # Mn     [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
+1CE9..1CEC;N     # Lo     [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
+1CED;N           # Mn         VEDIC SIGN TIRYAK
+1CEE..1CF3;N     # Lo     [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA
+1CF4;N           # Mn         VEDIC TONE CANDRA ABOVE
+1CF5..1CF6;N     # Lo     [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
+1CF7;N           # Mc         VEDIC SIGN ATIKRAMA
+1CF8..1CF9;N     # Mn     [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
+1CFA;N           # Lo         VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA
+1D00..1D2B;N     # Ll    [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL
+1D2C..1D6A;N     # Lm    [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
+1D6B..1D77;N     # Ll    [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G
+1D78;N           # Lm         MODIFIER LETTER CYRILLIC EN
+1D79..1D7F;N     # Ll     [7] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER UPSILON WITH STROKE
+1D80..1D9A;N     # Ll    [27] LATIN SMALL LETTER B WITH PALATAL HOOK..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK
+1D9B..1DBF;N     # Lm    [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA
+1DC0..1DFF;N     # Mn    [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
+1E00..1EFF;N     # L&   [256] LATIN CAPITAL LETTER A WITH RING BELOW..LATIN SMALL LETTER Y WITH LOOP
+1F00..1F15;N     # L&    [22] GREEK SMALL LETTER ALPHA WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
+1F18..1F1D;N     # Lu     [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
+1F20..1F45;N     # L&    [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA
+1F48..1F4D;N     # Lu     [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
+1F50..1F57;N     # Ll     [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI
+1F59;N           # Lu         GREEK CAPITAL LETTER UPSILON WITH DASIA
+1F5B;N           # Lu         GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
+1F5D;N           # Lu         GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
+1F5F..1F7D;N     # L&    [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA
+1F80..1FB4;N     # L&    [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+1FB6..1FBC;N     # L&     [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+1FBD;N           # Sk         GREEK KORONIS
+1FBE;N           # Ll         GREEK PROSGEGRAMMENI
+1FBF..1FC1;N     # Sk     [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
+1FC2..1FC4;N     # Ll     [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+1FC6..1FCC;N     # L&     [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+1FCD..1FCF;N     # Sk     [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI
+1FD0..1FD3;N     # Ll     [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+1FD6..1FDB;N     # L&     [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA
+1FDD..1FDF;N     # Sk     [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI
+1FE0..1FEC;N     # L&    [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA
+1FED..1FEF;N     # Sk     [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA
+1FF2..1FF4;N     # Ll     [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+1FF6..1FFC;N     # L&     [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+1FFD..1FFE;N     # Sk     [2] GREEK OXIA..GREEK DASIA
+2000..200A;N     # Zs    [11] EN QUAD..HAIR SPACE
+200B..200F;N     # Cf     [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
+2010;A           # Pd         HYPHEN
+2011..2012;N     # Pd     [2] NON-BREAKING HYPHEN..FIGURE DASH
+2013..2015;A     # Pd     [3] EN DASH..HORIZONTAL BAR
+2016;A           # Po         DOUBLE VERTICAL LINE
+2017;N           # Po         DOUBLE LOW LINE
+2018;A           # Pi         LEFT SINGLE QUOTATION MARK
+2019;A           # Pf         RIGHT SINGLE QUOTATION MARK
+201A;N           # Ps         SINGLE LOW-9 QUOTATION MARK
+201B;N           # Pi         SINGLE HIGH-REVERSED-9 QUOTATION MARK
+201C;A           # Pi         LEFT DOUBLE QUOTATION MARK
+201D;A           # Pf         RIGHT DOUBLE QUOTATION MARK
+201E;N           # Ps         DOUBLE LOW-9 QUOTATION MARK
+201F;N           # Pi         DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+2020..2022;A     # Po     [3] DAGGER..BULLET
+2023;N           # Po         TRIANGULAR BULLET
+2024..2027;A     # Po     [4] ONE DOT LEADER..HYPHENATION POINT
+2028;N           # Zl         LINE SEPARATOR
+2029;N           # Zp         PARAGRAPH SEPARATOR
+202A..202E;N     # Cf     [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
+202F;N           # Zs         NARROW NO-BREAK SPACE
+2030;A           # Po         PER MILLE SIGN
+2031;N           # Po         PER TEN THOUSAND SIGN
+2032..2033;A     # Po     [2] PRIME..DOUBLE PRIME
+2034;N           # Po         TRIPLE PRIME
+2035;A           # Po         REVERSED PRIME
+2036..2038;N     # Po     [3] REVERSED DOUBLE PRIME..CARET
+2039;N           # Pi         SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+203A;N           # Pf         SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+203B;A           # Po         REFERENCE MARK
+203C..203D;N     # Po     [2] DOUBLE EXCLAMATION MARK..INTERROBANG
+203E;A           # Po         OVERLINE
+203F..2040;N     # Pc     [2] UNDERTIE..CHARACTER TIE
+2041..2043;N     # Po     [3] CARET INSERTION POINT..HYPHEN BULLET
+2044;N           # Sm         FRACTION SLASH
+2045;N           # Ps         LEFT SQUARE BRACKET WITH QUILL
+2046;N           # Pe         RIGHT SQUARE BRACKET WITH QUILL
+2047..2051;N     # Po    [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY
+2052;N           # Sm         COMMERCIAL MINUS SIGN
+2053;N           # Po         SWUNG DASH
+2054;N           # Pc         INVERTED UNDERTIE
+2055..205E;N     # Po    [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS
+205F;N           # Zs         MEDIUM MATHEMATICAL SPACE
+2060..2064;N     # Cf     [5] WORD JOINER..INVISIBLE PLUS
+2066..206F;N     # Cf    [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
+2070;N           # No         SUPERSCRIPT ZERO
+2071;N           # Lm         SUPERSCRIPT LATIN SMALL LETTER I
+2074;A           # No         SUPERSCRIPT FOUR
+2075..2079;N     # No     [5] SUPERSCRIPT FIVE..SUPERSCRIPT NINE
+207A..207C;N     # Sm     [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN
+207D;N           # Ps         SUPERSCRIPT LEFT PARENTHESIS
+207E;N           # Pe         SUPERSCRIPT RIGHT PARENTHESIS
+207F;A           # Lm         SUPERSCRIPT LATIN SMALL LETTER N
+2080;N           # No         SUBSCRIPT ZERO
+2081..2084;A     # No     [4] SUBSCRIPT ONE..SUBSCRIPT FOUR
+2085..2089;N     # No     [5] SUBSCRIPT FIVE..SUBSCRIPT NINE
+208A..208C;N     # Sm     [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
+208D;N           # Ps         SUBSCRIPT LEFT PARENTHESIS
+208E;N           # Pe         SUBSCRIPT RIGHT PARENTHESIS
+2090..209C;N     # Lm    [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T
+20A0..20A8;N     # Sc     [9] EURO-CURRENCY SIGN..RUPEE SIGN
+20A9;H           # Sc         WON SIGN
+20AA..20AB;N     # Sc     [2] NEW SHEQEL SIGN..DONG SIGN
+20AC;A           # Sc         EURO SIGN
+20AD..20C0;N     # Sc    [20] KIP SIGN..SOM SIGN
+20D0..20DC;N     # Mn    [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
+20DD..20E0;N     # Me     [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
+20E1;N           # Mn         COMBINING LEFT RIGHT ARROW ABOVE
+20E2..20E4;N     # Me     [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE
+20E5..20F0;N     # Mn    [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
+2100..2101;N     # So     [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
+2102;N           # Lu         DOUBLE-STRUCK CAPITAL C
+2103;A           # So         DEGREE CELSIUS
+2104;N           # So         CENTRE LINE SYMBOL
+2105;A           # So         CARE OF
+2106;N           # So         CADA UNA
+2107;N           # Lu         EULER CONSTANT
+2108;N           # So         SCRUPLE
+2109;A           # So         DEGREE FAHRENHEIT
+210A..2112;N     # L&     [9] SCRIPT SMALL G..SCRIPT CAPITAL L
+2113;A           # Ll         SCRIPT SMALL L
+2114;N           # So         L B BAR SYMBOL
+2115;N           # Lu         DOUBLE-STRUCK CAPITAL N
+2116;A           # So         NUMERO SIGN
+2117;N           # So         SOUND RECORDING COPYRIGHT
+2118;N           # Sm         SCRIPT CAPITAL P
+2119..211D;N     # Lu     [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R
+211E..2120;N     # So     [3] PRESCRIPTION TAKE..SERVICE MARK
+2121..2122;A     # So     [2] TELEPHONE SIGN..TRADE MARK SIGN
+2123;N           # So         VERSICLE
+2124;N           # Lu         DOUBLE-STRUCK CAPITAL Z
+2125;N           # So         OUNCE SIGN
+2126;A           # Lu         OHM SIGN
+2127;N           # So         INVERTED OHM SIGN
+2128;N           # Lu         BLACK-LETTER CAPITAL Z
+2129;N           # So         TURNED GREEK SMALL LETTER IOTA
+212A;N           # Lu         KELVIN SIGN
+212B;A           # Lu         ANGSTROM SIGN
+212C..212D;N     # Lu     [2] SCRIPT CAPITAL B..BLACK-LETTER CAPITAL C
+212E;N           # So         ESTIMATED SYMBOL
+212F..2134;N     # L&     [6] SCRIPT SMALL E..SCRIPT SMALL O
+2135..2138;N     # Lo     [4] ALEF SYMBOL..DALET SYMBOL
+2139;N           # Ll         INFORMATION SOURCE
+213A..213B;N     # So     [2] ROTATED CAPITAL Q..FACSIMILE SIGN
+213C..213F;N     # L&     [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI
+2140..2144;N     # Sm     [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y
+2145..2149;N     # L&     [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J
+214A;N           # So         PROPERTY LINE
+214B;N           # Sm         TURNED AMPERSAND
+214C..214D;N     # So     [2] PER SIGN..AKTIESELSKAB
+214E;N           # Ll         TURNED SMALL F
+214F;N           # So         SYMBOL FOR SAMARITAN SOURCE
+2150..2152;N     # No     [3] VULGAR FRACTION ONE SEVENTH..VULGAR FRACTION ONE TENTH
+2153..2154;A     # No     [2] VULGAR FRACTION ONE THIRD..VULGAR FRACTION TWO THIRDS
+2155..215A;N     # No     [6] VULGAR FRACTION ONE FIFTH..VULGAR FRACTION FIVE SIXTHS
+215B..215E;A     # No     [4] VULGAR FRACTION ONE EIGHTH..VULGAR FRACTION SEVEN EIGHTHS
+215F;N           # No         FRACTION NUMERATOR ONE
+2160..216B;A     # Nl    [12] ROMAN NUMERAL ONE..ROMAN NUMERAL TWELVE
+216C..216F;N     # Nl     [4] ROMAN NUMERAL FIFTY..ROMAN NUMERAL ONE THOUSAND
+2170..2179;A     # Nl    [10] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL TEN
+217A..2182;N     # Nl     [9] SMALL ROMAN NUMERAL ELEVEN..ROMAN NUMERAL TEN THOUSAND
+2183..2184;N     # L&     [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C
+2185..2188;N     # Nl     [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND
+2189;A           # No         VULGAR FRACTION ZERO THIRDS
+218A..218B;N     # So     [2] TURNED DIGIT TWO..TURNED DIGIT THREE
+2190..2194;A     # Sm     [5] LEFTWARDS ARROW..LEFT RIGHT ARROW
+2195..2199;A     # So     [5] UP DOWN ARROW..SOUTH WEST ARROW
+219A..219B;N     # Sm     [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE
+219C..219F;N     # So     [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW
+21A0;N           # Sm         RIGHTWARDS TWO HEADED ARROW
+21A1..21A2;N     # So     [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL
+21A3;N           # Sm         RIGHTWARDS ARROW WITH TAIL
+21A4..21A5;N     # So     [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR
+21A6;N           # Sm         RIGHTWARDS ARROW FROM BAR
+21A7..21AD;N     # So     [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW
+21AE;N           # Sm         LEFT RIGHT ARROW WITH STROKE
+21AF..21B7;N     # So     [9] DOWNWARDS ZIGZAG ARROW..CLOCKWISE TOP SEMICIRCLE ARROW
+21B8..21B9;A     # So     [2] NORTH WEST ARROW TO LONG BAR..LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR
+21BA..21CD;N     # So    [20] ANTICLOCKWISE OPEN CIRCLE ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE
+21CE..21CF;N     # Sm     [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE
+21D0..21D1;N     # So     [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW
+21D2;A           # Sm         RIGHTWARDS DOUBLE ARROW
+21D3;N           # So         DOWNWARDS DOUBLE ARROW
+21D4;A           # Sm         LEFT RIGHT DOUBLE ARROW
+21D5..21E6;N     # So    [18] UP DOWN DOUBLE ARROW..LEFTWARDS WHITE ARROW
+21E7;A           # So         UPWARDS WHITE ARROW
+21E8..21F3;N     # So    [12] RIGHTWARDS WHITE ARROW..UP DOWN WHITE ARROW
+21F4..21FF;N     # Sm    [12] RIGHT ARROW WITH SMALL CIRCLE..LEFT RIGHT OPEN-HEADED ARROW
+2200;A           # Sm         FOR ALL
+2201;N           # Sm         COMPLEMENT
+2202..2203;A     # Sm     [2] PARTIAL DIFFERENTIAL..THERE EXISTS
+2204..2206;N     # Sm     [3] THERE DOES NOT EXIST..INCREMENT
+2207..2208;A     # Sm     [2] NABLA..ELEMENT OF
+2209..220A;N     # Sm     [2] NOT AN ELEMENT OF..SMALL ELEMENT OF
+220B;A           # Sm         CONTAINS AS MEMBER
+220C..220E;N     # Sm     [3] DOES NOT CONTAIN AS MEMBER..END OF PROOF
+220F;A           # Sm         N-ARY PRODUCT
+2210;N           # Sm         N-ARY COPRODUCT
+2211;A           # Sm         N-ARY SUMMATION
+2212..2214;N     # Sm     [3] MINUS SIGN..DOT PLUS
+2215;A           # Sm         DIVISION SLASH
+2216..2219;N     # Sm     [4] SET MINUS..BULLET OPERATOR
+221A;A           # Sm         SQUARE ROOT
+221B..221C;N     # Sm     [2] CUBE ROOT..FOURTH ROOT
+221D..2220;A     # Sm     [4] PROPORTIONAL TO..ANGLE
+2221..2222;N     # Sm     [2] MEASURED ANGLE..SPHERICAL ANGLE
+2223;A           # Sm         DIVIDES
+2224;N           # Sm         DOES NOT DIVIDE
+2225;A           # Sm         PARALLEL TO
+2226;N           # Sm         NOT PARALLEL TO
+2227..222C;A     # Sm     [6] LOGICAL AND..DOUBLE INTEGRAL
+222D;N           # Sm         TRIPLE INTEGRAL
+222E;A           # Sm         CONTOUR INTEGRAL
+222F..2233;N     # Sm     [5] SURFACE INTEGRAL..ANTICLOCKWISE CONTOUR INTEGRAL
+2234..2237;A     # Sm     [4] THEREFORE..PROPORTION
+2238..223B;N     # Sm     [4] DOT MINUS..HOMOTHETIC
+223C..223D;A     # Sm     [2] TILDE OPERATOR..REVERSED TILDE
+223E..2247;N     # Sm    [10] INVERTED LAZY S..NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
+2248;A           # Sm         ALMOST EQUAL TO
+2249..224B;N     # Sm     [3] NOT ALMOST EQUAL TO..TRIPLE TILDE
+224C;A           # Sm         ALL EQUAL TO
+224D..2251;N     # Sm     [5] EQUIVALENT TO..GEOMETRICALLY EQUAL TO
+2252;A           # Sm         APPROXIMATELY EQUAL TO OR THE IMAGE OF
+2253..225F;N     # Sm    [13] IMAGE OF OR APPROXIMATELY EQUAL TO..QUESTIONED EQUAL TO
+2260..2261;A     # Sm     [2] NOT EQUAL TO..IDENTICAL TO
+2262..2263;N     # Sm     [2] NOT IDENTICAL TO..STRICTLY EQUIVALENT TO
+2264..2267;A     # Sm     [4] LESS-THAN OR EQUAL TO..GREATER-THAN OVER EQUAL TO
+2268..2269;N     # Sm     [2] LESS-THAN BUT NOT EQUAL TO..GREATER-THAN BUT NOT EQUAL TO
+226A..226B;A     # Sm     [2] MUCH LESS-THAN..MUCH GREATER-THAN
+226C..226D;N     # Sm     [2] BETWEEN..NOT EQUIVALENT TO
+226E..226F;A     # Sm     [2] NOT LESS-THAN..NOT GREATER-THAN
+2270..2281;N     # Sm    [18] NEITHER LESS-THAN NOR EQUAL TO..DOES NOT SUCCEED
+2282..2283;A     # Sm     [2] SUBSET OF..SUPERSET OF
+2284..2285;N     # Sm     [2] NOT A SUBSET OF..NOT A SUPERSET OF
+2286..2287;A     # Sm     [2] SUBSET OF OR EQUAL TO..SUPERSET OF OR EQUAL TO
+2288..2294;N     # Sm    [13] NEITHER A SUBSET OF NOR EQUAL TO..SQUARE CUP
+2295;A           # Sm         CIRCLED PLUS
+2296..2298;N     # Sm     [3] CIRCLED MINUS..CIRCLED DIVISION SLASH
+2299;A           # Sm         CIRCLED DOT OPERATOR
+229A..22A4;N     # Sm    [11] CIRCLED RING OPERATOR..DOWN TACK
+22A5;A           # Sm         UP TACK
+22A6..22BE;N     # Sm    [25] ASSERTION..RIGHT ANGLE WITH ARC
+22BF;A           # Sm         RIGHT TRIANGLE
+22C0..22FF;N     # Sm    [64] N-ARY LOGICAL AND..Z NOTATION BAG MEMBERSHIP
+2300..2307;N     # So     [8] DIAMETER SIGN..WAVY LINE
+2308;N           # Ps         LEFT CEILING
+2309;N           # Pe         RIGHT CEILING
+230A;N           # Ps         LEFT FLOOR
+230B;N           # Pe         RIGHT FLOOR
+230C..2311;N     # So     [6] BOTTOM RIGHT CROP..SQUARE LOZENGE
+2312;A           # So         ARC
+2313..2319;N     # So     [7] SEGMENT..TURNED NOT SIGN
+231A..231B;W     # So     [2] WATCH..HOURGLASS
+231C..231F;N     # So     [4] TOP LEFT CORNER..BOTTOM RIGHT CORNER
+2320..2321;N     # Sm     [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL
+2322..2328;N     # So     [7] FROWN..KEYBOARD
+2329;W           # Ps         LEFT-POINTING ANGLE BRACKET
+232A;W           # Pe         RIGHT-POINTING ANGLE BRACKET
+232B..237B;N     # So    [81] ERASE TO THE LEFT..NOT CHECK MARK
+237C;N           # Sm         RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW
+237D..239A;N     # So    [30] SHOULDERED OPEN BOX..CLEAR SCREEN SYMBOL
+239B..23B3;N     # Sm    [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
+23B4..23DB;N     # So    [40] TOP SQUARE BRACKET..FUSE
+23DC..23E1;N     # Sm     [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
+23E2..23E8;N     # So     [7] WHITE TRAPEZIUM..DECIMAL EXPONENT SYMBOL
+23E9..23EC;W     # So     [4] BLACK RIGHT-POINTING DOUBLE TRIANGLE..BLACK DOWN-POINTING DOUBLE TRIANGLE
+23ED..23EF;N     # So     [3] BLACK RIGHT-POINTING DOUBLE TRIANGLE WITH VERTICAL BAR..BLACK RIGHT-POINTING TRIANGLE WITH DOUBLE VERTICAL BAR
+23F0;W           # So         ALARM CLOCK
+23F1..23F2;N     # So     [2] STOPWATCH..TIMER CLOCK
+23F3;W           # So         HOURGLASS WITH FLOWING SAND
+23F4..23FF;N     # So    [12] BLACK MEDIUM LEFT-POINTING TRIANGLE..OBSERVER EYE SYMBOL
+2400..2426;N     # So    [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
+2440..244A;N     # So    [11] OCR HOOK..OCR DOUBLE BACKSLASH
+2460..249B;A     # No    [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP
+249C..24E9;A     # So    [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
+24EA;N           # No         CIRCLED DIGIT ZERO
+24EB..24FF;A     # No    [21] NEGATIVE CIRCLED NUMBER ELEVEN..NEGATIVE CIRCLED DIGIT ZERO
+2500..254B;A     # So    [76] BOX DRAWINGS LIGHT HORIZONTAL..BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL
+254C..254F;N     # So     [4] BOX DRAWINGS LIGHT DOUBLE DASH HORIZONTAL..BOX DRAWINGS HEAVY DOUBLE DASH VERTICAL
+2550..2573;A     # So    [36] BOX DRAWINGS DOUBLE HORIZONTAL..BOX DRAWINGS LIGHT DIAGONAL CROSS
+2574..257F;N     # So    [12] BOX DRAWINGS LIGHT LEFT..BOX DRAWINGS HEAVY UP AND LIGHT DOWN
+2580..258F;A     # So    [16] UPPER HALF BLOCK..LEFT ONE EIGHTH BLOCK
+2590..2591;N     # So     [2] RIGHT HALF BLOCK..LIGHT SHADE
+2592..2595;A     # So     [4] MEDIUM SHADE..RIGHT ONE EIGHTH BLOCK
+2596..259F;N     # So    [10] QUADRANT LOWER LEFT..QUADRANT UPPER RIGHT AND LOWER LEFT AND LOWER RIGHT
+25A0..25A1;A     # So     [2] BLACK SQUARE..WHITE SQUARE
+25A2;N           # So         WHITE SQUARE WITH ROUNDED CORNERS
+25A3..25A9;A     # So     [7] WHITE SQUARE CONTAINING BLACK SMALL SQUARE..SQUARE WITH DIAGONAL CROSSHATCH FILL
+25AA..25B1;N     # So     [8] BLACK SMALL SQUARE..WHITE PARALLELOGRAM
+25B2..25B3;A     # So     [2] BLACK UP-POINTING TRIANGLE..WHITE UP-POINTING TRIANGLE
+25B4..25B5;N     # So     [2] BLACK UP-POINTING SMALL TRIANGLE..WHITE UP-POINTING SMALL TRIANGLE
+25B6;A           # So         BLACK RIGHT-POINTING TRIANGLE
+25B7;A           # Sm         WHITE RIGHT-POINTING TRIANGLE
+25B8..25BB;N     # So     [4] BLACK RIGHT-POINTING SMALL TRIANGLE..WHITE RIGHT-POINTING POINTER
+25BC..25BD;A     # So     [2] BLACK DOWN-POINTING TRIANGLE..WHITE DOWN-POINTING TRIANGLE
+25BE..25BF;N     # So     [2] BLACK DOWN-POINTING SMALL TRIANGLE..WHITE DOWN-POINTING SMALL TRIANGLE
+25C0;A           # So         BLACK LEFT-POINTING TRIANGLE
+25C1;A           # Sm         WHITE LEFT-POINTING TRIANGLE
+25C2..25C5;N     # So     [4] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE LEFT-POINTING POINTER
+25C6..25C8;A     # So     [3] BLACK DIAMOND..WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND
+25C9..25CA;N     # So     [2] FISHEYE..LOZENGE
+25CB;A           # So         WHITE CIRCLE
+25CC..25CD;N     # So     [2] DOTTED CIRCLE..CIRCLE WITH VERTICAL FILL
+25CE..25D1;A     # So     [4] BULLSEYE..CIRCLE WITH RIGHT HALF BLACK
+25D2..25E1;N     # So    [16] CIRCLE WITH LOWER HALF BLACK..LOWER HALF CIRCLE
+25E2..25E5;A     # So     [4] BLACK LOWER RIGHT TRIANGLE..BLACK UPPER RIGHT TRIANGLE
+25E6..25EE;N     # So     [9] WHITE BULLET..UP-POINTING TRIANGLE WITH RIGHT HALF BLACK
+25EF;A           # So         LARGE CIRCLE
+25F0..25F7;N     # So     [8] WHITE SQUARE WITH UPPER LEFT QUADRANT..WHITE CIRCLE WITH UPPER RIGHT QUADRANT
+25F8..25FC;N     # Sm     [5] UPPER LEFT TRIANGLE..BLACK MEDIUM SQUARE
+25FD..25FE;W     # Sm     [2] WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE
+25FF;N           # Sm         LOWER RIGHT TRIANGLE
+2600..2604;N     # So     [5] BLACK SUN WITH RAYS..COMET
+2605..2606;A     # So     [2] BLACK STAR..WHITE STAR
+2607..2608;N     # So     [2] LIGHTNING..THUNDERSTORM
+2609;A           # So         SUN
+260A..260D;N     # So     [4] ASCENDING NODE..OPPOSITION
+260E..260F;A     # So     [2] BLACK TELEPHONE..WHITE TELEPHONE
+2610..2613;N     # So     [4] BALLOT BOX..SALTIRE
+2614..2615;W     # So     [2] UMBRELLA WITH RAIN DROPS..HOT BEVERAGE
+2616..261B;N     # So     [6] WHITE SHOGI PIECE..BLACK RIGHT POINTING INDEX
+261C;A           # So         WHITE LEFT POINTING INDEX
+261D;N           # So         WHITE UP POINTING INDEX
+261E;A           # So         WHITE RIGHT POINTING INDEX
+261F..263F;N     # So    [33] WHITE DOWN POINTING INDEX..MERCURY
+2640;A           # So         FEMALE SIGN
+2641;N           # So         EARTH
+2642;A           # So         MALE SIGN
+2643..2647;N     # So     [5] JUPITER..PLUTO
+2648..2653;W     # So    [12] ARIES..PISCES
+2654..265F;N     # So    [12] WHITE CHESS KING..BLACK CHESS PAWN
+2660..2661;A     # So     [2] BLACK SPADE SUIT..WHITE HEART SUIT
+2662;N           # So         WHITE DIAMOND SUIT
+2663..2665;A     # So     [3] BLACK CLUB SUIT..BLACK HEART SUIT
+2666;N           # So         BLACK DIAMOND SUIT
+2667..266A;A     # So     [4] WHITE CLUB SUIT..EIGHTH NOTE
+266B;N           # So         BEAMED EIGHTH NOTES
+266C..266D;A     # So     [2] BEAMED SIXTEENTH NOTES..MUSIC FLAT SIGN
+266E;N           # So         MUSIC NATURAL SIGN
+266F;A           # Sm         MUSIC SHARP SIGN
+2670..267E;N     # So    [15] WEST SYRIAC CROSS..PERMANENT PAPER SIGN
+267F;W           # So         WHEELCHAIR SYMBOL
+2680..2692;N     # So    [19] DIE FACE-1..HAMMER AND PICK
+2693;W           # So         ANCHOR
+2694..269D;N     # So    [10] CROSSED SWORDS..OUTLINED WHITE STAR
+269E..269F;A     # So     [2] THREE LINES CONVERGING RIGHT..THREE LINES CONVERGING LEFT
+26A0;N           # So         WARNING SIGN
+26A1;W           # So         HIGH VOLTAGE SIGN
+26A2..26A9;N     # So     [8] DOUBLED FEMALE SIGN..HORIZONTAL MALE WITH STROKE SIGN
+26AA..26AB;W     # So     [2] MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE
+26AC..26BC;N     # So    [17] MEDIUM SMALL WHITE CIRCLE..SESQUIQUADRATE
+26BD..26BE;W     # So     [2] SOCCER BALL..BASEBALL
+26BF;A           # So         SQUARED KEY
+26C0..26C3;N     # So     [4] WHITE DRAUGHTS MAN..BLACK DRAUGHTS KING
+26C4..26C5;W     # So     [2] SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD
+26C6..26CD;A     # So     [8] RAIN..DISABLED CAR
+26CE;W           # So         OPHIUCHUS
+26CF..26D3;A     # So     [5] PICK..CHAINS
+26D4;W           # So         NO ENTRY
+26D5..26E1;A     # So    [13] ALTERNATE ONE-WAY LEFT WAY TRAFFIC..RESTRICTED LEFT ENTRY-2
+26E2;N           # So         ASTRONOMICAL SYMBOL FOR URANUS
+26E3;A           # So         HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE
+26E4..26E7;N     # So     [4] PENTAGRAM..INVERTED PENTAGRAM
+26E8..26E9;A     # So     [2] BLACK CROSS ON SHIELD..SHINTO SHRINE
+26EA;W           # So         CHURCH
+26EB..26F1;A     # So     [7] CASTLE..UMBRELLA ON GROUND
+26F2..26F3;W     # So     [2] FOUNTAIN..FLAG IN HOLE
+26F4;A           # So         FERRY
+26F5;W           # So         SAILBOAT
+26F6..26F9;A     # So     [4] SQUARE FOUR CORNERS..PERSON WITH BALL
+26FA;W           # So         TENT
+26FB..26FC;A     # So     [2] JAPANESE BANK SYMBOL..HEADSTONE GRAVEYARD SYMBOL
+26FD;W           # So         FUEL PUMP
+26FE..26FF;A     # So     [2] CUP ON BLACK SQUARE..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
+2700..2704;N     # So     [5] BLACK SAFETY SCISSORS..WHITE SCISSORS
+2705;W           # So         WHITE HEAVY CHECK MARK
+2706..2709;N     # So     [4] TELEPHONE LOCATION SIGN..ENVELOPE
+270A..270B;W     # So     [2] RAISED FIST..RAISED HAND
+270C..2727;N     # So    [28] VICTORY HAND..WHITE FOUR POINTED STAR
+2728;W           # So         SPARKLES
+2729..273C;N     # So    [20] STRESS OUTLINED WHITE STAR..OPEN CENTRE TEARDROP-SPOKED ASTERISK
+273D;A           # So         HEAVY TEARDROP-SPOKED ASTERISK
+273E..274B;N     # So    [14] SIX PETALLED BLACK AND WHITE FLORETTE..HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK
+274C;W           # So         CROSS MARK
+274D;N           # So         SHADOWED WHITE CIRCLE
+274E;W           # So         NEGATIVE SQUARED CROSS MARK
+274F..2752;N     # So     [4] LOWER RIGHT DROP-SHADOWED WHITE SQUARE..UPPER RIGHT SHADOWED WHITE SQUARE
+2753..2755;W     # So     [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT
+2756;N           # So         BLACK DIAMOND MINUS WHITE X
+2757;W           # So         HEAVY EXCLAMATION MARK SYMBOL
+2758..2767;N     # So    [16] LIGHT VERTICAL BAR..ROTATED FLORAL HEART BULLET
+2768;N           # Ps         MEDIUM LEFT PARENTHESIS ORNAMENT
+2769;N           # Pe         MEDIUM RIGHT PARENTHESIS ORNAMENT
+276A;N           # Ps         MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
+276B;N           # Pe         MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
+276C;N           # Ps         MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
+276D;N           # Pe         MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
+276E;N           # Ps         HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
+276F;N           # Pe         HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
+2770;N           # Ps         HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
+2771;N           # Pe         HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
+2772;N           # Ps         LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
+2773;N           # Pe         LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
+2774;N           # Ps         MEDIUM LEFT CURLY BRACKET ORNAMENT
+2775;N           # Pe         MEDIUM RIGHT CURLY BRACKET ORNAMENT
+2776..277F;A     # No    [10] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED NUMBER TEN
+2780..2793;N     # No    [20] DINGBAT CIRCLED SANS-SERIF DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN
+2794;N           # So         HEAVY WIDE-HEADED RIGHTWARDS ARROW
+2795..2797;W     # So     [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN
+2798..27AF;N     # So    [24] HEAVY SOUTH EAST ARROW..NOTCHED LOWER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW
+27B0;W           # So         CURLY LOOP
+27B1..27BE;N     # So    [14] NOTCHED UPPER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW..OPEN-OUTLINED RIGHTWARDS ARROW
+27BF;W           # So         DOUBLE CURLY LOOP
+27C0..27C4;N     # Sm     [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET
+27C5;N           # Ps         LEFT S-SHAPED BAG DELIMITER
+27C6;N           # Pe         RIGHT S-SHAPED BAG DELIMITER
+27C7..27E5;N     # Sm    [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK
+27E6;Na          # Ps         MATHEMATICAL LEFT WHITE SQUARE BRACKET
+27E7;Na          # Pe         MATHEMATICAL RIGHT WHITE SQUARE BRACKET
+27E8;Na          # Ps         MATHEMATICAL LEFT ANGLE BRACKET
+27E9;Na          # Pe         MATHEMATICAL RIGHT ANGLE BRACKET
+27EA;Na          # Ps         MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
+27EB;Na          # Pe         MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
+27EC;Na          # Ps         MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
+27ED;Na          # Pe         MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
+27EE;N           # Ps         MATHEMATICAL LEFT FLATTENED PARENTHESIS
+27EF;N           # Pe         MATHEMATICAL RIGHT FLATTENED PARENTHESIS
+27F0..27FF;N     # Sm    [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW
+2800..28FF;N     # So   [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678
+2900..297F;N     # Sm   [128] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..DOWN FISH TAIL
+2980..2982;N     # Sm     [3] TRIPLE VERTICAL BAR DELIMITER..Z NOTATION TYPE COLON
+2983;N           # Ps         LEFT WHITE CURLY BRACKET
+2984;N           # Pe         RIGHT WHITE CURLY BRACKET
+2985;Na          # Ps         LEFT WHITE PARENTHESIS
+2986;Na          # Pe         RIGHT WHITE PARENTHESIS
+2987;N           # Ps         Z NOTATION LEFT IMAGE BRACKET
+2988;N           # Pe         Z NOTATION RIGHT IMAGE BRACKET
+2989;N           # Ps         Z NOTATION LEFT BINDING BRACKET
+298A;N           # Pe         Z NOTATION RIGHT BINDING BRACKET
+298B;N           # Ps         LEFT SQUARE BRACKET WITH UNDERBAR
+298C;N           # Pe         RIGHT SQUARE BRACKET WITH UNDERBAR
+298D;N           # Ps         LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
+298E;N           # Pe         RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
+298F;N           # Ps         LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
+2990;N           # Pe         RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
+2991;N           # Ps         LEFT ANGLE BRACKET WITH DOT
+2992;N           # Pe         RIGHT ANGLE BRACKET WITH DOT
+2993;N           # Ps         LEFT ARC LESS-THAN BRACKET
+2994;N           # Pe         RIGHT ARC GREATER-THAN BRACKET
+2995;N           # Ps         DOUBLE LEFT ARC GREATER-THAN BRACKET
+2996;N           # Pe         DOUBLE RIGHT ARC LESS-THAN BRACKET
+2997;N           # Ps         LEFT BLACK TORTOISE SHELL BRACKET
+2998;N           # Pe         RIGHT BLACK TORTOISE SHELL BRACKET
+2999..29D7;N     # Sm    [63] DOTTED FENCE..BLACK HOURGLASS
+29D8;N           # Ps         LEFT WIGGLY FENCE
+29D9;N           # Pe         RIGHT WIGGLY FENCE
+29DA;N           # Ps         LEFT DOUBLE WIGGLY FENCE
+29DB;N           # Pe         RIGHT DOUBLE WIGGLY FENCE
+29DC..29FB;N     # Sm    [32] INCOMPLETE INFINITY..TRIPLE PLUS
+29FC;N           # Ps         LEFT-POINTING CURVED ANGLE BRACKET
+29FD;N           # Pe         RIGHT-POINTING CURVED ANGLE BRACKET
+29FE..29FF;N     # Sm     [2] TINY..MINY
+2A00..2AFF;N     # Sm   [256] N-ARY CIRCLED DOT OPERATOR..N-ARY WHITE VERTICAL BAR
+2B00..2B1A;N     # So    [27] NORTH EAST WHITE ARROW..DOTTED SQUARE
+2B1B..2B1C;W     # So     [2] BLACK LARGE SQUARE..WHITE LARGE SQUARE
+2B1D..2B2F;N     # So    [19] BLACK VERY SMALL SQUARE..WHITE VERTICAL ELLIPSE
+2B30..2B44;N     # Sm    [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET
+2B45..2B46;N     # So     [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW
+2B47..2B4C;N     # Sm     [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR
+2B4D..2B4F;N     # So     [3] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..SHORT BACKSLANTED SOUTH ARROW
+2B50;W           # So         WHITE MEDIUM STAR
+2B51..2B54;N     # So     [4] BLACK SMALL STAR..WHITE RIGHT-POINTING PENTAGON
+2B55;W           # So         HEAVY LARGE CIRCLE
+2B56..2B59;A     # So     [4] HEAVY OVAL WITH OVAL INSIDE..HEAVY CIRCLED SALTIRE
+2B5A..2B73;N     # So    [26] SLANTED NORTH ARROW WITH HOOKED HEAD..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR
+2B76..2B95;N     # So    [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
+2B97..2BFF;N     # So   [105] SYMBOL FOR TYPE A ELECTRONICS..HELLSCHREIBER PAUSE SYMBOL
+2C00..2C5F;N     # L&    [96] GLAGOLITIC CAPITAL LETTER AZU..GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
+2C60..2C7B;N     # L&    [28] LATIN CAPITAL LETTER L WITH DOUBLE BAR..LATIN LETTER SMALL CAPITAL TURNED E
+2C7C..2C7D;N     # Lm     [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V
+2C7E..2C7F;N     # Lu     [2] LATIN CAPITAL LETTER S WITH SWASH TAIL..LATIN CAPITAL LETTER Z WITH SWASH TAIL
+2C80..2CE4;N     # L&   [101] COPTIC CAPITAL LETTER ALFA..COPTIC SYMBOL KAI
+2CE5..2CEA;N     # So     [6] COPTIC SYMBOL MI RO..COPTIC SYMBOL SHIMA SIMA
+2CEB..2CEE;N     # L&     [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA
+2CEF..2CF1;N     # Mn     [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS
+2CF2..2CF3;N     # L&     [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI
+2CF9..2CFC;N     # Po     [4] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN VERSE DIVIDER
+2CFD;N           # No         COPTIC FRACTION ONE HALF
+2CFE..2CFF;N     # Po     [2] COPTIC FULL STOP..COPTIC MORPHOLOGICAL DIVIDER
+2D00..2D25;N     # Ll    [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE
+2D27;N           # Ll         GEORGIAN SMALL LETTER YN
+2D2D;N           # Ll         GEORGIAN SMALL LETTER AEN
+2D30..2D67;N     # Lo    [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO
+2D6F;N           # Lm         TIFINAGH MODIFIER LETTER LABIALIZATION MARK
+2D70;N           # Po         TIFINAGH SEPARATOR MARK
+2D7F;N           # Mn         TIFINAGH CONSONANT JOINER
+2D80..2D96;N     # Lo    [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE
+2DA0..2DA6;N     # Lo     [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO
+2DA8..2DAE;N     # Lo     [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO
+2DB0..2DB6;N     # Lo     [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO
+2DB8..2DBE;N     # Lo     [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO
+2DC0..2DC6;N     # Lo     [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO
+2DC8..2DCE;N     # Lo     [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO
+2DD0..2DD6;N     # Lo     [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO
+2DD8..2DDE;N     # Lo     [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO
+2DE0..2DFF;N     # Mn    [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
+2E00..2E01;N     # Po     [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
+2E02;N           # Pi         LEFT SUBSTITUTION BRACKET
+2E03;N           # Pf         RIGHT SUBSTITUTION BRACKET
+2E04;N           # Pi         LEFT DOTTED SUBSTITUTION BRACKET
+2E05;N           # Pf         RIGHT DOTTED SUBSTITUTION BRACKET
+2E06..2E08;N     # Po     [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER
+2E09;N           # Pi         LEFT TRANSPOSITION BRACKET
+2E0A;N           # Pf         RIGHT TRANSPOSITION BRACKET
+2E0B;N           # Po         RAISED SQUARE
+2E0C;N           # Pi         LEFT RAISED OMISSION BRACKET
+2E0D;N           # Pf         RIGHT RAISED OMISSION BRACKET
+2E0E..2E16;N     # Po     [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE
+2E17;N           # Pd         DOUBLE OBLIQUE HYPHEN
+2E18..2E19;N     # Po     [2] INVERTED INTERROBANG..PALM BRANCH
+2E1A;N           # Pd         HYPHEN WITH DIAERESIS
+2E1B;N           # Po         TILDE WITH RING ABOVE
+2E1C;N           # Pi         LEFT LOW PARAPHRASE BRACKET
+2E1D;N           # Pf         RIGHT LOW PARAPHRASE BRACKET
+2E1E..2E1F;N     # Po     [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW
+2E20;N           # Pi         LEFT VERTICAL BAR WITH QUILL
+2E21;N           # Pf         RIGHT VERTICAL BAR WITH QUILL
+2E22;N           # Ps         TOP LEFT HALF BRACKET
+2E23;N           # Pe         TOP RIGHT HALF BRACKET
+2E24;N           # Ps         BOTTOM LEFT HALF BRACKET
+2E25;N           # Pe         BOTTOM RIGHT HALF BRACKET
+2E26;N           # Ps         LEFT SIDEWAYS U BRACKET
+2E27;N           # Pe         RIGHT SIDEWAYS U BRACKET
+2E28;N           # Ps         LEFT DOUBLE PARENTHESIS
+2E29;N           # Pe         RIGHT DOUBLE PARENTHESIS
+2E2A..2E2E;N     # Po     [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK
+2E2F;N           # Lm         VERTICAL TILDE
+2E30..2E39;N     # Po    [10] RING POINT..TOP HALF SECTION SIGN
+2E3A..2E3B;N     # Pd     [2] TWO-EM DASH..THREE-EM DASH
+2E3C..2E3F;N     # Po     [4] STENOGRAPHIC FULL STOP..CAPITULUM
+2E40;N           # Pd         DOUBLE HYPHEN
+2E41;N           # Po         REVERSED COMMA
+2E42;N           # Ps         DOUBLE LOW-REVERSED-9 QUOTATION MARK
+2E43..2E4F;N     # Po    [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER
+2E50..2E51;N     # So     [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR
+2E52..2E54;N     # Po     [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK
+2E55;N           # Ps         LEFT SQUARE BRACKET WITH STROKE
+2E56;N           # Pe         RIGHT SQUARE BRACKET WITH STROKE
+2E57;N           # Ps         LEFT SQUARE BRACKET WITH DOUBLE STROKE
+2E58;N           # Pe         RIGHT SQUARE BRACKET WITH DOUBLE STROKE
+2E59;N           # Ps         TOP HALF LEFT PARENTHESIS
+2E5A;N           # Pe         TOP HALF RIGHT PARENTHESIS
+2E5B;N           # Ps         BOTTOM HALF LEFT PARENTHESIS
+2E5C;N           # Pe         BOTTOM HALF RIGHT PARENTHESIS
+2E5D;N           # Pd         OBLIQUE HYPHEN
+2E80..2E99;W     # So    [26] CJK RADICAL REPEAT..CJK RADICAL RAP
+2E9B..2EF3;W     # So    [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE
+2F00..2FD5;W     # So   [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE
+2FF0..2FFB;W     # So    [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
+3000;F           # Zs         IDEOGRAPHIC SPACE
+3001..3003;W     # Po     [3] IDEOGRAPHIC COMMA..DITTO MARK
+3004;W           # So         JAPANESE INDUSTRIAL STANDARD SYMBOL
+3005;W           # Lm         IDEOGRAPHIC ITERATION MARK
+3006;W           # Lo         IDEOGRAPHIC CLOSING MARK
+3007;W           # Nl         IDEOGRAPHIC NUMBER ZERO
+3008;W           # Ps         LEFT ANGLE BRACKET
+3009;W           # Pe         RIGHT ANGLE BRACKET
+300A;W           # Ps         LEFT DOUBLE ANGLE BRACKET
+300B;W           # Pe         RIGHT DOUBLE ANGLE BRACKET
+300C;W           # Ps         LEFT CORNER BRACKET
+300D;W           # Pe         RIGHT CORNER BRACKET
+300E;W           # Ps         LEFT WHITE CORNER BRACKET
+300F;W           # Pe         RIGHT WHITE CORNER BRACKET
+3010;W           # Ps         LEFT BLACK LENTICULAR BRACKET
+3011;W           # Pe         RIGHT BLACK LENTICULAR BRACKET
+3012..3013;W     # So     [2] POSTAL MARK..GETA MARK
+3014;W           # Ps         LEFT TORTOISE SHELL BRACKET
+3015;W           # Pe         RIGHT TORTOISE SHELL BRACKET
+3016;W           # Ps         LEFT WHITE LENTICULAR BRACKET
+3017;W           # Pe         RIGHT WHITE LENTICULAR BRACKET
+3018;W           # Ps         LEFT WHITE TORTOISE SHELL BRACKET
+3019;W           # Pe         RIGHT WHITE TORTOISE SHELL BRACKET
+301A;W           # Ps         LEFT WHITE SQUARE BRACKET
+301B;W           # Pe         RIGHT WHITE SQUARE BRACKET
+301C;W           # Pd         WAVE DASH
+301D;W           # Ps         REVERSED DOUBLE PRIME QUOTATION MARK
+301E..301F;W     # Pe     [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
+3020;W           # So         POSTAL MARK FACE
+3021..3029;W     # Nl     [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
+302A..302D;W     # Mn     [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
+302E..302F;W     # Mc     [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
+3030;W           # Pd         WAVY DASH
+3031..3035;W     # Lm     [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
+3036..3037;W     # So     [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
+3038..303A;W     # Nl     [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
+303B;W           # Lm         VERTICAL IDEOGRAPHIC ITERATION MARK
+303C;W           # Lo         MASU MARK
+303D;W           # Po         PART ALTERNATION MARK
+303E;W           # So         IDEOGRAPHIC VARIATION INDICATOR
+303F;N           # So         IDEOGRAPHIC HALF FILL SPACE
+3041..3096;W     # Lo    [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
+3099..309A;W     # Mn     [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+309B..309C;W     # Sk     [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+309D..309E;W     # Lm     [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
+309F;W           # Lo         HIRAGANA DIGRAPH YORI
+30A0;W           # Pd         KATAKANA-HIRAGANA DOUBLE HYPHEN
+30A1..30FA;W     # Lo    [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO
+30FB;W           # Po         KATAKANA MIDDLE DOT
+30FC..30FE;W     # Lm     [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK
+30FF;W           # Lo         KATAKANA DIGRAPH KOTO
+3105..312F;W     # Lo    [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN
+3131..318E;W     # Lo    [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
+3190..3191;W     # So     [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
+3192..3195;W     # No     [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
+3196..319F;W     # So    [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
+31A0..31BF;W     # Lo    [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
+31C0..31E3;W     # So    [36] CJK STROKE T..CJK STROKE Q
+31F0..31FF;W     # Lo    [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
+3200..321E;W     # So    [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU
+3220..3229;W     # No    [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
+322A..3247;W     # So    [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
+3248..324F;A     # No     [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE
+3250;W           # So         PARTNERSHIP SIGN
+3251..325F;W     # No    [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE
+3260..327F;W     # So    [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL
+3280..3289;W     # No    [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
+328A..32B0;W     # So    [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
+32B1..32BF;W     # No    [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
+32C0..32FF;W     # So    [64] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA
+3300..33FF;W     # So   [256] SQUARE APAATO..SQUARE GAL
+3400..4DBF;W     # Lo  [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
+4DC0..4DFF;N     # So    [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION
+4E00..9FFF;W     # Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
+A000..A014;W     # Lo    [21] YI SYLLABLE IT..YI SYLLABLE E
+A015;W           # Lm         YI SYLLABLE WU
+A016..A48C;W     # Lo  [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
+A490..A4C6;W     # So    [55] YI RADICAL QOT..YI RADICAL KE
+A4D0..A4F7;N     # Lo    [40] LISU LETTER BA..LISU LETTER OE
+A4F8..A4FD;N     # Lm     [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU
+A4FE..A4FF;N     # Po     [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP
+A500..A60B;N     # Lo   [268] VAI SYLLABLE EE..VAI SYLLABLE NG
+A60C;N           # Lm         VAI SYLLABLE LENGTHENER
+A60D..A60F;N     # Po     [3] VAI COMMA..VAI QUESTION MARK
+A610..A61F;N     # Lo    [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG
+A620..A629;N     # Nd    [10] VAI DIGIT ZERO..VAI DIGIT NINE
+A62A..A62B;N     # Lo     [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO
+A640..A66D;N     # L&    [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O
+A66E;N           # Lo         CYRILLIC LETTER MULTIOCULAR O
+A66F;N           # Mn         COMBINING CYRILLIC VZMET
+A670..A672;N     # Me     [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN
+A673;N           # Po         SLAVONIC ASTERISK
+A674..A67D;N     # Mn    [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK
+A67E;N           # Po         CYRILLIC KAVYKA
+A67F;N           # Lm         CYRILLIC PAYEROK
+A680..A69B;N     # L&    [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O
+A69C..A69D;N     # Lm     [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
+A69E..A69F;N     # Mn     [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E
+A6A0..A6E5;N     # Lo    [70] BAMUM LETTER A..BAMUM LETTER KI
+A6E6..A6EF;N     # Nl    [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM
+A6F0..A6F1;N     # Mn     [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS
+A6F2..A6F7;N     # Po     [6] BAMUM NJAEMLI..BAMUM QUESTION MARK
+A700..A716;N     # Sk    [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR
+A717..A71F;N     # Lm     [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
+A720..A721;N     # Sk     [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE
+A722..A76F;N     # L&    [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON
+A770;N           # Lm         MODIFIER LETTER US
+A771..A787;N     # L&    [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
+A788;N           # Lm         MODIFIER LETTER LOW CIRCUMFLEX ACCENT
+A789..A78A;N     # Sk     [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN
+A78B..A78E;N     # L&     [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
+A78F;N           # Lo         LATIN LETTER SINOLOGICAL DOT
+A790..A7CA;N     # L&    [59] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
+A7D0..A7D1;N     # L&     [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G
+A7D3;N           # Ll         LATIN SMALL LETTER DOUBLE THORN
+A7D5..A7D9;N     # L&     [5] LATIN SMALL LETTER DOUBLE WYNN..LATIN SMALL LETTER SIGMOID S
+A7F2..A7F4;N     # Lm     [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
+A7F5..A7F6;N     # L&     [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H
+A7F7;N           # Lo         LATIN EPIGRAPHIC LETTER SIDEWAYS I
+A7F8..A7F9;N     # Lm     [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
+A7FA;N           # Ll         LATIN LETTER SMALL CAPITAL TURNED M
+A7FB..A7FF;N     # Lo     [5] LATIN EPIGRAPHIC LETTER REVERSED F..LATIN EPIGRAPHIC LETTER ARCHAIC M
+A800..A801;N     # Lo     [2] SYLOTI NAGRI LETTER A..SYLOTI NAGRI LETTER I
+A802;N           # Mn         SYLOTI NAGRI SIGN DVISVARA
+A803..A805;N     # Lo     [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O
+A806;N           # Mn         SYLOTI NAGRI SIGN HASANTA
+A807..A80A;N     # Lo     [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO
+A80B;N           # Mn         SYLOTI NAGRI SIGN ANUSVARA
+A80C..A822;N     # Lo    [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO
+A823..A824;N     # Mc     [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I
+A825..A826;N     # Mn     [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E
+A827;N           # Mc         SYLOTI NAGRI VOWEL SIGN OO
+A828..A82B;N     # So     [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-4
+A82C;N           # Mn         SYLOTI NAGRI SIGN ALTERNATE HASANTA
+A830..A835;N     # No     [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS
+A836..A837;N     # So     [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
+A838;N           # Sc         NORTH INDIC RUPEE MARK
+A839;N           # So         NORTH INDIC QUANTITY MARK
+A840..A873;N     # Lo    [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU
+A874..A877;N     # Po     [4] PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOUBLE SHAD
+A880..A881;N     # Mc     [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA
+A882..A8B3;N     # Lo    [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA
+A8B4..A8C3;N     # Mc    [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU
+A8C4..A8C5;N     # Mn     [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU
+A8CE..A8CF;N     # Po     [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA
+A8D0..A8D9;N     # Nd    [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE
+A8E0..A8F1;N     # Mn    [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA
+A8F2..A8F7;N     # Lo     [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA
+A8F8..A8FA;N     # Po     [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET
+A8FB;N           # Lo         DEVANAGARI HEADSTROKE
+A8FC;N           # Po         DEVANAGARI SIGN SIDDHAM
+A8FD..A8FE;N     # Lo     [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY
+A8FF;N           # Mn         DEVANAGARI VOWEL SIGN AY
+A900..A909;N     # Nd    [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE
+A90A..A925;N     # Lo    [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO
+A926..A92D;N     # Mn     [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU
+A92E..A92F;N     # Po     [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA
+A930..A946;N     # Lo    [23] REJANG LETTER KA..REJANG LETTER A
+A947..A951;N     # Mn    [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R
+A952..A953;N     # Mc     [2] REJANG CONSONANT SIGN H..REJANG VIRAMA
+A95F;N           # Po         REJANG SECTION MARK
+A960..A97C;W     # Lo    [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH
+A980..A982;N     # Mn     [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR
+A983;N           # Mc         JAVANESE SIGN WIGNYAN
+A984..A9B2;N     # Lo    [47] JAVANESE LETTER A..JAVANESE LETTER HA
+A9B3;N           # Mn         JAVANESE SIGN CECAK TELU
+A9B4..A9B5;N     # Mc     [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG
+A9B6..A9B9;N     # Mn     [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT
+A9BA..A9BB;N     # Mc     [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE
+A9BC..A9BD;N     # Mn     [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET
+A9BE..A9C0;N     # Mc     [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON
+A9C1..A9CD;N     # Po    [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH
+A9CF;N           # Lm         JAVANESE PANGRANGKEP
+A9D0..A9D9;N     # Nd    [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE
+A9DE..A9DF;N     # Po     [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN
+A9E0..A9E4;N     # Lo     [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA
+A9E5;N           # Mn         MYANMAR SIGN SHAN SAW
+A9E6;N           # Lm         MYANMAR MODIFIER LETTER SHAN REDUPLICATION
+A9E7..A9EF;N     # Lo     [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA
+A9F0..A9F9;N     # Nd    [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE
+A9FA..A9FE;N     # Lo     [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA
+AA00..AA28;N     # Lo    [41] CHAM LETTER A..CHAM LETTER HA
+AA29..AA2E;N     # Mn     [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE
+AA2F..AA30;N     # Mc     [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI
+AA31..AA32;N     # Mn     [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE
+AA33..AA34;N     # Mc     [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA
+AA35..AA36;N     # Mn     [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA
+AA40..AA42;N     # Lo     [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG
+AA43;N           # Mn         CHAM CONSONANT SIGN FINAL NG
+AA44..AA4B;N     # Lo     [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS
+AA4C;N           # Mn         CHAM CONSONANT SIGN FINAL M
+AA4D;N           # Mc         CHAM CONSONANT SIGN FINAL H
+AA50..AA59;N     # Nd    [10] CHAM DIGIT ZERO..CHAM DIGIT NINE
+AA5C..AA5F;N     # Po     [4] CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TRIPLE DANDA
+AA60..AA6F;N     # Lo    [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA
+AA70;N           # Lm         MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
+AA71..AA76;N     # Lo     [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM
+AA77..AA79;N     # So     [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
+AA7A;N           # Lo         MYANMAR LETTER AITON RA
+AA7B;N           # Mc         MYANMAR SIGN PAO KAREN TONE
+AA7C;N           # Mn         MYANMAR SIGN TAI LAING TONE-2
+AA7D;N           # Mc         MYANMAR SIGN TAI LAING TONE-5
+AA7E..AA7F;N     # Lo     [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA
+AA80..AAAF;N     # Lo    [48] TAI VIET LETTER LOW KO..TAI VIET LETTER HIGH O
+AAB0;N           # Mn         TAI VIET MAI KANG
+AAB1;N           # Lo         TAI VIET VOWEL AA
+AAB2..AAB4;N     # Mn     [3] TAI VIET VOWEL I..TAI VIET VOWEL U
+AAB5..AAB6;N     # Lo     [2] TAI VIET VOWEL E..TAI VIET VOWEL O
+AAB7..AAB8;N     # Mn     [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA
+AAB9..AABD;N     # Lo     [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN
+AABE..AABF;N     # Mn     [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK
+AAC0;N           # Lo         TAI VIET TONE MAI NUENG
+AAC1;N           # Mn         TAI VIET TONE MAI THO
+AAC2;N           # Lo         TAI VIET TONE MAI SONG
+AADB..AADC;N     # Lo     [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG
+AADD;N           # Lm         TAI VIET SYMBOL SAM
+AADE..AADF;N     # Po     [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI
+AAE0..AAEA;N     # Lo    [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA
+AAEB;N           # Mc         MEETEI MAYEK VOWEL SIGN II
+AAEC..AAED;N     # Mn     [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI
+AAEE..AAEF;N     # Mc     [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU
+AAF0..AAF1;N     # Po     [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM
+AAF2;N           # Lo         MEETEI MAYEK ANJI
+AAF3..AAF4;N     # Lm     [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK
+AAF5;N           # Mc         MEETEI MAYEK VOWEL SIGN VISARGA
+AAF6;N           # Mn         MEETEI MAYEK VIRAMA
+AB01..AB06;N     # Lo     [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO
+AB09..AB0E;N     # Lo     [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO
+AB11..AB16;N     # Lo     [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
+AB20..AB26;N     # Lo     [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
+AB28..AB2E;N     # Lo     [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
+AB30..AB5A;N     # Ll    [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
+AB5B;N           # Sk         MODIFIER BREVE WITH INVERTED BREVE
+AB5C..AB5F;N     # Lm     [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
+AB60..AB68;N     # Ll     [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
+AB69;N           # Lm         MODIFIER LETTER SMALL TURNED W
+AB6A..AB6B;N     # Sk     [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK
+AB70..ABBF;N     # Ll    [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
+ABC0..ABE2;N     # Lo    [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
+ABE3..ABE4;N     # Mc     [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP
+ABE5;N           # Mn         MEETEI MAYEK VOWEL SIGN ANAP
+ABE6..ABE7;N     # Mc     [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP
+ABE8;N           # Mn         MEETEI MAYEK VOWEL SIGN UNAP
+ABE9..ABEA;N     # Mc     [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG
+ABEB;N           # Po         MEETEI MAYEK CHEIKHEI
+ABEC;N           # Mc         MEETEI MAYEK LUM IYEK
+ABED;N           # Mn         MEETEI MAYEK APUN IYEK
+ABF0..ABF9;N     # Nd    [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE
+AC00..D7A3;W     # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
+D7B0..D7C6;N     # Lo    [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E
+D7CB..D7FB;N     # Lo    [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH
+D800..DB7F;N     # Cs   [896] <surrogate-D800>..<surrogate-DB7F>
+DB80..DBFF;N     # Cs   [128] <surrogate-DB80>..<surrogate-DBFF>
+DC00..DFFF;N     # Cs  [1024] <surrogate-DC00>..<surrogate-DFFF>
+E000..F8FF;A     # Co  [6400] <private-use-E000>..<private-use-F8FF>
+F900..FA6D;W     # Lo   [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
+FA6E..FA6F;W     # Cn     [2] <reserved-FA6E>..<reserved-FA6F>
+FA70..FAD9;W     # Lo   [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
+FADA..FAFF;W     # Cn    [38] <reserved-FADA>..<reserved-FAFF>
+FB00..FB06;N     # Ll     [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
+FB13..FB17;N     # Ll     [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
+FB1D;N           # Lo         HEBREW LETTER YOD WITH HIRIQ
+FB1E;N           # Mn         HEBREW POINT JUDEO-SPANISH VARIKA
+FB1F..FB28;N     # Lo    [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV
+FB29;N           # Sm         HEBREW LETTER ALTERNATIVE PLUS SIGN
+FB2A..FB36;N     # Lo    [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH
+FB38..FB3C;N     # Lo     [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH
+FB3E;N           # Lo         HEBREW LETTER MEM WITH DAGESH
+FB40..FB41;N     # Lo     [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH
+FB43..FB44;N     # Lo     [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH
+FB46..FB4F;N     # Lo    [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATURE ALEF LAMED
+FB50..FBB1;N     # Lo    [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
+FBB2..FBC2;N     # Sk    [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE
+FBD3..FD3D;N     # Lo   [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
+FD3E;N           # Pe         ORNATE LEFT PARENTHESIS
+FD3F;N           # Ps         ORNATE RIGHT PARENTHESIS
+FD40..FD4F;N     # So    [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH
+FD50..FD8F;N     # Lo    [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
+FD92..FDC7;N     # Lo    [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
+FDCF;N           # So         ARABIC LIGATURE SALAAMUHU ALAYNAA
+FDF0..FDFB;N     # Lo    [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
+FDFC;N           # Sc         RIAL SIGN
+FDFD..FDFF;N     # So     [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL
+FE00..FE0F;A     # Mn    [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
+FE10..FE16;W     # Po     [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
+FE17;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
+FE18;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
+FE19;W           # Po         PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
+FE20..FE2F;N     # Mn    [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF
+FE30;W           # Po         PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
+FE31..FE32;W     # Pd     [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH
+FE33..FE34;W     # Pc     [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
+FE35;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
+FE36;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
+FE37;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
+FE38;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
+FE39;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
+FE3A;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
+FE3B;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
+FE3C;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
+FE3D;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
+FE3E;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
+FE3F;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
+FE40;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
+FE41;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
+FE42;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
+FE43;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
+FE44;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
+FE45..FE46;W     # Po     [2] SESAME DOT..WHITE SESAME DOT
+FE47;W           # Ps         PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
+FE48;W           # Pe         PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
+FE49..FE4C;W     # Po     [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE
+FE4D..FE4F;W     # Pc     [3] DASHED LOW LINE..WAVY LOW LINE
+FE50..FE52;W     # Po     [3] SMALL COMMA..SMALL FULL STOP
+FE54..FE57;W     # Po     [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK
+FE58;W           # Pd         SMALL EM DASH
+FE59;W           # Ps         SMALL LEFT PARENTHESIS
+FE5A;W           # Pe         SMALL RIGHT PARENTHESIS
+FE5B;W           # Ps         SMALL LEFT CURLY BRACKET
+FE5C;W           # Pe         SMALL RIGHT CURLY BRACKET
+FE5D;W           # Ps         SMALL LEFT TORTOISE SHELL BRACKET
+FE5E;W           # Pe         SMALL RIGHT TORTOISE SHELL BRACKET
+FE5F..FE61;W     # Po     [3] SMALL NUMBER SIGN..SMALL ASTERISK
+FE62;W           # Sm         SMALL PLUS SIGN
+FE63;W           # Pd         SMALL HYPHEN-MINUS
+FE64..FE66;W     # Sm     [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN
+FE68;W           # Po         SMALL REVERSE SOLIDUS
+FE69;W           # Sc         SMALL DOLLAR SIGN
+FE6A..FE6B;W     # Po     [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT
+FE70..FE74;N     # Lo     [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
+FE76..FEFC;N     # Lo   [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
+FEFF;N           # Cf         ZERO WIDTH NO-BREAK SPACE
+FF01..FF03;F     # Po     [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN
+FF04;F           # Sc         FULLWIDTH DOLLAR SIGN
+FF05..FF07;F     # Po     [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE
+FF08;F           # Ps         FULLWIDTH LEFT PARENTHESIS
+FF09;F           # Pe         FULLWIDTH RIGHT PARENTHESIS
+FF0A;F           # Po         FULLWIDTH ASTERISK
+FF0B;F           # Sm         FULLWIDTH PLUS SIGN
+FF0C;F           # Po         FULLWIDTH COMMA
+FF0D;F           # Pd         FULLWIDTH HYPHEN-MINUS
+FF0E..FF0F;F     # Po     [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS
+FF10..FF19;F     # Nd    [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
+FF1A..FF1B;F     # Po     [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON
+FF1C..FF1E;F     # Sm     [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN
+FF1F..FF20;F     # Po     [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT
+FF21..FF3A;F     # Lu    [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
+FF3B;F           # Ps         FULLWIDTH LEFT SQUARE BRACKET
+FF3C;F           # Po         FULLWIDTH REVERSE SOLIDUS
+FF3D;F           # Pe         FULLWIDTH RIGHT SQUARE BRACKET
+FF3E;F           # Sk         FULLWIDTH CIRCUMFLEX ACCENT
+FF3F;F           # Pc         FULLWIDTH LOW LINE
+FF40;F           # Sk         FULLWIDTH GRAVE ACCENT
+FF41..FF5A;F     # Ll    [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
+FF5B;F           # Ps         FULLWIDTH LEFT CURLY BRACKET
+FF5C;F           # Sm         FULLWIDTH VERTICAL LINE
+FF5D;F           # Pe         FULLWIDTH RIGHT CURLY BRACKET
+FF5E;F           # Sm         FULLWIDTH TILDE
+FF5F;F           # Ps         FULLWIDTH LEFT WHITE PARENTHESIS
+FF60;F           # Pe         FULLWIDTH RIGHT WHITE PARENTHESIS
+FF61;H           # Po         HALFWIDTH IDEOGRAPHIC FULL STOP
+FF62;H           # Ps         HALFWIDTH LEFT CORNER BRACKET
+FF63;H           # Pe         HALFWIDTH RIGHT CORNER BRACKET
+FF64..FF65;H     # Po     [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
+FF66..FF6F;H     # Lo    [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU
+FF70;H           # Lm         HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+FF71..FF9D;H     # Lo    [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N
+FF9E..FF9F;H     # Lm     [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+FFA0..FFBE;H     # Lo    [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH
+FFC2..FFC7;H     # Lo     [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E
+FFCA..FFCF;H     # Lo     [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE
+FFD2..FFD7;H     # Lo     [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU
+FFDA..FFDC;H     # Lo     [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
+FFE0..FFE1;F     # Sc     [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN
+FFE2;F           # Sm         FULLWIDTH NOT SIGN
+FFE3;F           # Sk         FULLWIDTH MACRON
+FFE4;F           # So         FULLWIDTH BROKEN BAR
+FFE5..FFE6;F     # Sc     [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
+FFE8;H           # So         HALFWIDTH FORMS LIGHT VERTICAL
+FFE9..FFEC;H     # Sm     [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW
+FFED..FFEE;H     # So     [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE
+FFF9..FFFB;N     # Cf     [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
+FFFC;N           # So         OBJECT REPLACEMENT CHARACTER
+FFFD;A           # So         REPLACEMENT CHARACTER
+10000..1000B;N   # Lo    [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE
+1000D..10026;N   # Lo    [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO
+10028..1003A;N   # Lo    [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO
+1003C..1003D;N   # Lo     [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE
+1003F..1004D;N   # Lo    [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO
+10050..1005D;N   # Lo    [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089
+10080..100FA;N   # Lo   [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305
+10100..10102;N   # Po     [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
+10107..10133;N   # No    [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
+10137..1013F;N   # So     [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
+10140..10174;N   # Nl    [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS
+10175..10178;N   # No     [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN
+10179..10189;N   # So    [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN
+1018A..1018B;N   # No     [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN
+1018C..1018E;N   # So     [3] GREEK SINUSOID SIGN..NOMISMA SIGN
+10190..1019C;N   # So    [13] ROMAN SEXTANS SIGN..ASCIA SYMBOL
+101A0;N          # So         GREEK SYMBOL TAU RHO
+101D0..101FC;N   # So    [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND
+101FD;N          # Mn         PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
+10280..1029C;N   # Lo    [29] LYCIAN LETTER A..LYCIAN LETTER X
+102A0..102D0;N   # Lo    [49] CARIAN LETTER A..CARIAN LETTER UUU3
+102E0;N          # Mn         COPTIC EPACT THOUSANDS MARK
+102E1..102FB;N   # No    [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
+10300..1031F;N   # Lo    [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
+10320..10323;N   # No     [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY
+1032D..1032F;N   # Lo     [3] OLD ITALIC LETTER YE..OLD ITALIC LETTER SOUTHERN TSE
+10330..10340;N   # Lo    [17] GOTHIC LETTER AHSA..GOTHIC LETTER PAIRTHRA
+10341;N          # Nl         GOTHIC LETTER NINETY
+10342..10349;N   # Lo     [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL
+1034A;N          # Nl         GOTHIC LETTER NINE HUNDRED
+10350..10375;N   # Lo    [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA
+10376..1037A;N   # Mn     [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
+10380..1039D;N   # Lo    [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU
+1039F;N          # Po         UGARITIC WORD DIVIDER
+103A0..103C3;N   # Lo    [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA
+103C8..103CF;N   # Lo     [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH
+103D0;N          # Po         OLD PERSIAN WORD DIVIDER
+103D1..103D5;N   # Nl     [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED
+10400..1044F;N   # L&    [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW
+10450..1047F;N   # Lo    [48] SHAVIAN LETTER PEEP..SHAVIAN LETTER YEW
+10480..1049D;N   # Lo    [30] OSMANYA LETTER ALEF..OSMANYA LETTER OO
+104A0..104A9;N   # Nd    [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE
+104B0..104D3;N   # Lu    [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA
+104D8..104FB;N   # Ll    [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA
+10500..10527;N   # Lo    [40] ELBASAN LETTER A..ELBASAN LETTER KHE
+10530..10563;N   # Lo    [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW
+1056F;N          # Po         CAUCASIAN ALBANIAN CITATION MARK
+10570..1057A;N   # Lu    [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
+1057C..1058A;N   # Lu    [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
+1058C..10592;N   # Lu     [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
+10594..10595;N   # Lu     [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
+10597..105A1;N   # Ll    [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
+105A3..105B1;N   # Ll    [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
+105B3..105B9;N   # Ll     [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
+105BB..105BC;N   # Ll     [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
+10600..10736;N   # Lo   [311] LINEAR A SIGN AB001..LINEAR A SIGN A664
+10740..10755;N   # Lo    [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE
+10760..10767;N   # Lo     [8] LINEAR A SIGN A800..LINEAR A SIGN A807
+10780..10785;N   # Lm     [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK
+10787..107B0;N   # Lm    [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK
+107B2..107BA;N   # Lm     [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL
+10800..10805;N   # Lo     [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA
+10808;N          # Lo         CYPRIOT SYLLABLE JO
+1080A..10835;N   # Lo    [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO
+10837..10838;N   # Lo     [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE
+1083C;N          # Lo         CYPRIOT SYLLABLE ZA
+1083F;N          # Lo         CYPRIOT SYLLABLE ZO
+10840..10855;N   # Lo    [22] IMPERIAL ARAMAIC LETTER ALEPH..IMPERIAL ARAMAIC LETTER TAW
+10857;N          # Po         IMPERIAL ARAMAIC SECTION SIGN
+10858..1085F;N   # No     [8] IMPERIAL ARAMAIC NUMBER ONE..IMPERIAL ARAMAIC NUMBER TEN THOUSAND
+10860..10876;N   # Lo    [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW
+10877..10878;N   # So     [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON
+10879..1087F;N   # No     [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY
+10880..1089E;N   # Lo    [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW
+108A7..108AF;N   # No     [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED
+108E0..108F2;N   # Lo    [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH
+108F4..108F5;N   # Lo     [2] HATRAN LETTER SHIN..HATRAN LETTER TAW
+108FB..108FF;N   # No     [5] HATRAN NUMBER ONE..HATRAN NUMBER ONE HUNDRED
+10900..10915;N   # Lo    [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU
+10916..1091B;N   # No     [6] PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THREE
+1091F;N          # Po         PHOENICIAN WORD SEPARATOR
+10920..10939;N   # Lo    [26] LYDIAN LETTER A..LYDIAN LETTER C
+1093F;N          # Po         LYDIAN TRIANGULAR MARK
+10980..1099F;N   # Lo    [32] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC HIEROGLYPHIC SYMBOL VIDJ-2
+109A0..109B7;N   # Lo    [24] MEROITIC CURSIVE LETTER A..MEROITIC CURSIVE LETTER DA
+109BC..109BD;N   # No     [2] MEROITIC CURSIVE FRACTION ELEVEN TWELFTHS..MEROITIC CURSIVE FRACTION ONE HALF
+109BE..109BF;N   # Lo     [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN
+109C0..109CF;N   # No    [16] MEROITIC CURSIVE NUMBER ONE..MEROITIC CURSIVE NUMBER SEVENTY
+109D2..109FF;N   # No    [46] MEROITIC CURSIVE NUMBER ONE HUNDRED..MEROITIC CURSIVE FRACTION TEN TWELFTHS
+10A00;N          # Lo         KHAROSHTHI LETTER A
+10A01..10A03;N   # Mn     [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R
+10A05..10A06;N   # Mn     [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O
+10A0C..10A0F;N   # Mn     [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
+10A10..10A13;N   # Lo     [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA
+10A15..10A17;N   # Lo     [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA
+10A19..10A35;N   # Lo    [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA
+10A38..10A3A;N   # Mn     [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
+10A3F;N          # Mn         KHAROSHTHI VIRAMA
+10A40..10A48;N   # No     [9] KHAROSHTHI DIGIT ONE..KHAROSHTHI FRACTION ONE HALF
+10A50..10A58;N   # Po     [9] KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCTUATION LINES
+10A60..10A7C;N   # Lo    [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH
+10A7D..10A7E;N   # No     [2] OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARABIAN NUMBER FIFTY
+10A7F;N          # Po         OLD SOUTH ARABIAN NUMERIC INDICATOR
+10A80..10A9C;N   # Lo    [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH
+10A9D..10A9F;N   # No     [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY
+10AC0..10AC7;N   # Lo     [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW
+10AC8;N          # So         MANICHAEAN SIGN UD
+10AC9..10AE4;N   # Lo    [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW
+10AE5..10AE6;N   # Mn     [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
+10AEB..10AEF;N   # No     [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED
+10AF0..10AF6;N   # Po     [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER
+10B00..10B35;N   # Lo    [54] AVESTAN LETTER A..AVESTAN LETTER HE
+10B39..10B3F;N   # Po     [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION
+10B40..10B55;N   # Lo    [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW
+10B58..10B5F;N   # No     [8] INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND
+10B60..10B72;N   # Lo    [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW
+10B78..10B7F;N   # No     [8] INSCRIPTIONAL PAHLAVI NUMBER ONE..INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND
+10B80..10B91;N   # Lo    [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW
+10B99..10B9C;N   # Po     [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
+10BA9..10BAF;N   # No     [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED
+10C00..10C48;N   # Lo    [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH
+10C80..10CB2;N   # Lu    [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US
+10CC0..10CF2;N   # Ll    [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US
+10CFA..10CFF;N   # No     [6] OLD HUNGARIAN NUMBER ONE..OLD HUNGARIAN NUMBER ONE THOUSAND
+10D00..10D23;N   # Lo    [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
+10D24..10D27;N   # Mn     [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
+10D30..10D39;N   # Nd    [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE
+10E60..10E7E;N   # No    [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
+10E80..10EA9;N   # Lo    [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
+10EAB..10EAC;N   # Mn     [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
+10EAD;N          # Pd         YEZIDI HYPHENATION MARK
+10EB0..10EB1;N   # Lo     [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
+10EFD..10EFF;N   # Mn     [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
+10F00..10F1C;N   # Lo    [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
+10F1D..10F26;N   # No    [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
+10F27;N          # Lo         OLD SOGDIAN LIGATURE AYIN-DALETH
+10F30..10F45;N   # Lo    [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
+10F46..10F50;N   # Mn    [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
+10F51..10F54;N   # No     [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED
+10F55..10F59;N   # Po     [5] SOGDIAN PUNCTUATION TWO VERTICAL BARS..SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
+10F70..10F81;N   # Lo    [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH
+10F82..10F85;N   # Mn     [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
+10F86..10F89;N   # Po     [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS
+10FB0..10FC4;N   # Lo    [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW
+10FC5..10FCB;N   # No     [7] CHORASMIAN NUMBER ONE..CHORASMIAN NUMBER ONE HUNDRED
+10FE0..10FF6;N   # Lo    [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH
+11000;N          # Mc         BRAHMI SIGN CANDRABINDU
+11001;N          # Mn         BRAHMI SIGN ANUSVARA
+11002;N          # Mc         BRAHMI SIGN VISARGA
+11003..11037;N   # Lo    [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA
+11038..11046;N   # Mn    [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
+11047..1104D;N   # Po     [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
+11052..11065;N   # No    [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND
+11066..1106F;N   # Nd    [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
+11070;N          # Mn         BRAHMI SIGN OLD TAMIL VIRAMA
+11071..11072;N   # Lo     [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
+11073..11074;N   # Mn     [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
+11075;N          # Lo         BRAHMI LETTER OLD TAMIL LLA
+1107F;N          # Mn         BRAHMI NUMBER JOINER
+11080..11081;N   # Mn     [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA
+11082;N          # Mc         KAITHI SIGN VISARGA
+11083..110AF;N   # Lo    [45] KAITHI LETTER A..KAITHI LETTER HA
+110B0..110B2;N   # Mc     [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II
+110B3..110B6;N   # Mn     [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
+110B7..110B8;N   # Mc     [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU
+110B9..110BA;N   # Mn     [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
+110BB..110BC;N   # Po     [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN
+110BD;N          # Cf         KAITHI NUMBER SIGN
+110BE..110C1;N   # Po     [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
+110C2;N          # Mn         KAITHI VOWEL SIGN VOCALIC R
+110CD;N          # Cf         KAITHI NUMBER SIGN ABOVE
+110D0..110E8;N   # Lo    [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE
+110F0..110F9;N   # Nd    [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE
+11100..11102;N   # Mn     [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA
+11103..11126;N   # Lo    [36] CHAKMA LETTER AA..CHAKMA LETTER HAA
+11127..1112B;N   # Mn     [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU
+1112C;N          # Mc         CHAKMA VOWEL SIGN E
+1112D..11134;N   # Mn     [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA
+11136..1113F;N   # Nd    [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE
+11140..11143;N   # Po     [4] CHAKMA SECTION MARK..CHAKMA QUESTION MARK
+11144;N          # Lo         CHAKMA LETTER LHAA
+11145..11146;N   # Mc     [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI
+11147;N          # Lo         CHAKMA LETTER VAA
+11150..11172;N   # Lo    [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA
+11173;N          # Mn         MAHAJANI SIGN NUKTA
+11174..11175;N   # Po     [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK
+11176;N          # Lo         MAHAJANI LIGATURE SHRI
+11180..11181;N   # Mn     [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA
+11182;N          # Mc         SHARADA SIGN VISARGA
+11183..111B2;N   # Lo    [48] SHARADA LETTER A..SHARADA LETTER HA
+111B3..111B5;N   # Mc     [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II
+111B6..111BE;N   # Mn     [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O
+111BF..111C0;N   # Mc     [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA
+111C1..111C4;N   # Lo     [4] SHARADA SIGN AVAGRAHA..SHARADA OM
+111C5..111C8;N   # Po     [4] SHARADA DANDA..SHARADA SEPARATOR
+111C9..111CC;N   # Mn     [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK
+111CD;N          # Po         SHARADA SUTRA MARK
+111CE;N          # Mc         SHARADA VOWEL SIGN PRISHTHAMATRA E
+111CF;N          # Mn         SHARADA SIGN INVERTED CANDRABINDU
+111D0..111D9;N   # Nd    [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE
+111DA;N          # Lo         SHARADA EKAM
+111DB;N          # Po         SHARADA SIGN SIDDHAM
+111DC;N          # Lo         SHARADA HEADSTROKE
+111DD..111DF;N   # Po     [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2
+111E1..111F4;N   # No    [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND
+11200..11211;N   # Lo    [18] KHOJKI LETTER A..KHOJKI LETTER JJA
+11213..1122B;N   # Lo    [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1122C..1122E;N   # Mc     [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
+1122F..11231;N   # Mn     [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
+11232..11233;N   # Mc     [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
+11234;N          # Mn         KHOJKI SIGN ANUSVARA
+11235;N          # Mc         KHOJKI SIGN VIRAMA
+11236..11237;N   # Mn     [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
+11238..1123D;N   # Po     [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
+1123E;N          # Mn         KHOJKI SIGN SUKUN
+1123F..11240;N   # Lo     [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
+11241;N          # Mn         KHOJKI VOWEL SIGN VOCALIC R
+11280..11286;N   # Lo     [7] MULTANI LETTER A..MULTANI LETTER GA
+11288;N          # Lo         MULTANI LETTER GHA
+1128A..1128D;N   # Lo     [4] MULTANI LETTER CA..MULTANI LETTER JJA
+1128F..1129D;N   # Lo    [15] MULTANI LETTER NYA..MULTANI LETTER BA
+1129F..112A8;N   # Lo    [10] MULTANI LETTER BHA..MULTANI LETTER RHA
+112A9;N          # Po         MULTANI SECTION MARK
+112B0..112DE;N   # Lo    [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA
+112DF;N          # Mn         KHUDAWADI SIGN ANUSVARA
+112E0..112E2;N   # Mc     [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
+112E3..112EA;N   # Mn     [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
+112F0..112F9;N   # Nd    [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE
+11300..11301;N   # Mn     [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
+11302..11303;N   # Mc     [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
+11305..1130C;N   # Lo     [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L
+1130F..11310;N   # Lo     [2] GRANTHA LETTER EE..GRANTHA LETTER AI
+11313..11328;N   # Lo    [22] GRANTHA LETTER OO..GRANTHA LETTER NA
+1132A..11330;N   # Lo     [7] GRANTHA LETTER PA..GRANTHA LETTER RA
+11332..11333;N   # Lo     [2] GRANTHA LETTER LA..GRANTHA LETTER LLA
+11335..11339;N   # Lo     [5] GRANTHA LETTER VA..GRANTHA LETTER HA
+1133B..1133C;N   # Mn     [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
+1133D;N          # Lo         GRANTHA SIGN AVAGRAHA
+1133E..1133F;N   # Mc     [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
+11340;N          # Mn         GRANTHA VOWEL SIGN II
+11341..11344;N   # Mc     [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
+11347..11348;N   # Mc     [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
+1134B..1134D;N   # Mc     [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA
+11350;N          # Lo         GRANTHA OM
+11357;N          # Mc         GRANTHA AU LENGTH MARK
+1135D..11361;N   # Lo     [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL
+11362..11363;N   # Mc     [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
+11366..1136C;N   # Mn     [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
+11370..11374;N   # Mn     [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
+11400..11434;N   # Lo    [53] NEWA LETTER A..NEWA LETTER HA
+11435..11437;N   # Mc     [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II
+11438..1143F;N   # Mn     [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI
+11440..11441;N   # Mc     [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU
+11442..11444;N   # Mn     [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA
+11445;N          # Mc         NEWA SIGN VISARGA
+11446;N          # Mn         NEWA SIGN NUKTA
+11447..1144A;N   # Lo     [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI
+1144B..1144F;N   # Po     [5] NEWA DANDA..NEWA ABBREVIATION SIGN
+11450..11459;N   # Nd    [10] NEWA DIGIT ZERO..NEWA DIGIT NINE
+1145A..1145B;N   # Po     [2] NEWA DOUBLE COMMA..NEWA PLACEHOLDER MARK
+1145D;N          # Po         NEWA INSERTION SIGN
+1145E;N          # Mn         NEWA SANDHI MARK
+1145F..11461;N   # Lo     [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA
+11480..114AF;N   # Lo    [48] TIRHUTA ANJI..TIRHUTA LETTER HA
+114B0..114B2;N   # Mc     [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
+114B3..114B8;N   # Mn     [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
+114B9;N          # Mc         TIRHUTA VOWEL SIGN E
+114BA;N          # Mn         TIRHUTA VOWEL SIGN SHORT E
+114BB..114BE;N   # Mc     [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
+114BF..114C0;N   # Mn     [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
+114C1;N          # Mc         TIRHUTA SIGN VISARGA
+114C2..114C3;N   # Mn     [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
+114C4..114C5;N   # Lo     [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG
+114C6;N          # Po         TIRHUTA ABBREVIATION SIGN
+114C7;N          # Lo         TIRHUTA OM
+114D0..114D9;N   # Nd    [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE
+11580..115AE;N   # Lo    [47] SIDDHAM LETTER A..SIDDHAM LETTER HA
+115AF..115B1;N   # Mc     [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
+115B2..115B5;N   # Mn     [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
+115B8..115BB;N   # Mc     [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
+115BC..115BD;N   # Mn     [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
+115BE;N          # Mc         SIDDHAM SIGN VISARGA
+115BF..115C0;N   # Mn     [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
+115C1..115D7;N   # Po    [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
+115D8..115DB;N   # Lo     [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U
+115DC..115DD;N   # Mn     [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU
+11600..1162F;N   # Lo    [48] MODI LETTER A..MODI LETTER LLA
+11630..11632;N   # Mc     [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
+11633..1163A;N   # Mn     [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
+1163B..1163C;N   # Mc     [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
+1163D;N          # Mn         MODI SIGN ANUSVARA
+1163E;N          # Mc         MODI SIGN VISARGA
+1163F..11640;N   # Mn     [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA
+11641..11643;N   # Po     [3] MODI DANDA..MODI ABBREVIATION SIGN
+11644;N          # Lo         MODI SIGN HUVA
+11650..11659;N   # Nd    [10] MODI DIGIT ZERO..MODI DIGIT NINE
+11660..1166C;N   # Po    [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT
+11680..116AA;N   # Lo    [43] TAKRI LETTER A..TAKRI LETTER RRA
+116AB;N          # Mn         TAKRI SIGN ANUSVARA
+116AC;N          # Mc         TAKRI SIGN VISARGA
+116AD;N          # Mn         TAKRI VOWEL SIGN AA
+116AE..116AF;N   # Mc     [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
+116B0..116B5;N   # Mn     [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU
+116B6;N          # Mc         TAKRI SIGN VIRAMA
+116B7;N          # Mn         TAKRI SIGN NUKTA
+116B8;N          # Lo         TAKRI LETTER ARCHAIC KHA
+116B9;N          # Po         TAKRI ABBREVIATION SIGN
+116C0..116C9;N   # Nd    [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE
+11700..1171A;N   # Lo    [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA
+1171D..1171F;N   # Mn     [3] AHOM CONSONANT SIGN MEDIAL LA..AHOM CONSONANT SIGN MEDIAL LIGATING RA
+11720..11721;N   # Mc     [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA
+11722..11725;N   # Mn     [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU
+11726;N          # Mc         AHOM VOWEL SIGN E
+11727..1172B;N   # Mn     [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER
+11730..11739;N   # Nd    [10] AHOM DIGIT ZERO..AHOM DIGIT NINE
+1173A..1173B;N   # No     [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY
+1173C..1173E;N   # Po     [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI
+1173F;N          # So         AHOM SYMBOL VI
+11740..11746;N   # Lo     [7] AHOM LETTER CA..AHOM LETTER LLA
+11800..1182B;N   # Lo    [44] DOGRA LETTER A..DOGRA LETTER RRA
+1182C..1182E;N   # Mc     [3] DOGRA VOWEL SIGN AA..DOGRA VOWEL SIGN II
+1182F..11837;N   # Mn     [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA
+11838;N          # Mc         DOGRA SIGN VISARGA
+11839..1183A;N   # Mn     [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA
+1183B;N          # Po         DOGRA ABBREVIATION SIGN
+118A0..118DF;N   # L&    [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
+118E0..118E9;N   # Nd    [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE
+118EA..118F2;N   # No     [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY
+118FF;N          # Lo         WARANG CITI OM
+11900..11906;N   # Lo     [7] DIVES AKURU LETTER A..DIVES AKURU LETTER E
+11909;N          # Lo         DIVES AKURU LETTER O
+1190C..11913;N   # Lo     [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA
+11915..11916;N   # Lo     [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA
+11918..1192F;N   # Lo    [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA
+11930..11935;N   # Mc     [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E
+11937..11938;N   # Mc     [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O
+1193B..1193C;N   # Mn     [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU
+1193D;N          # Mc         DIVES AKURU SIGN HALANTA
+1193E;N          # Mn         DIVES AKURU VIRAMA
+1193F;N          # Lo         DIVES AKURU PREFIXED NASAL SIGN
+11940;N          # Mc         DIVES AKURU MEDIAL YA
+11941;N          # Lo         DIVES AKURU INITIAL RA
+11942;N          # Mc         DIVES AKURU MEDIAL RA
+11943;N          # Mn         DIVES AKURU SIGN NUKTA
+11944..11946;N   # Po     [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK
+11950..11959;N   # Nd    [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE
+119A0..119A7;N   # Lo     [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR
+119AA..119D0;N   # Lo    [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA
+119D1..119D3;N   # Mc     [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II
+119D4..119D7;N   # Mn     [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR
+119DA..119DB;N   # Mn     [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI
+119DC..119DF;N   # Mc     [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA
+119E0;N          # Mn         NANDINAGARI SIGN VIRAMA
+119E1;N          # Lo         NANDINAGARI SIGN AVAGRAHA
+119E2;N          # Po         NANDINAGARI SIGN SIDDHAM
+119E3;N          # Lo         NANDINAGARI HEADSTROKE
+119E4;N          # Mc         NANDINAGARI VOWEL SIGN PRISHTHAMATRA E
+11A00;N          # Lo         ZANABAZAR SQUARE LETTER A
+11A01..11A0A;N   # Mn    [10] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL LENGTH MARK
+11A0B..11A32;N   # Lo    [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA
+11A33..11A38;N   # Mn     [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA
+11A39;N          # Mc         ZANABAZAR SQUARE SIGN VISARGA
+11A3A;N          # Lo         ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
+11A3B..11A3E;N   # Mn     [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA
+11A3F..11A46;N   # Po     [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK
+11A47;N          # Mn         ZANABAZAR SQUARE SUBJOINER
+11A50;N          # Lo         SOYOMBO LETTER A
+11A51..11A56;N   # Mn     [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE
+11A57..11A58;N   # Mc     [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU
+11A59..11A5B;N   # Mn     [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK
+11A5C..11A89;N   # Lo    [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA
+11A8A..11A96;N   # Mn    [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN ANUSVARA
+11A97;N          # Mc         SOYOMBO SIGN VISARGA
+11A98..11A99;N   # Mn     [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER
+11A9A..11A9C;N   # Po     [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD
+11A9D;N          # Lo         SOYOMBO MARK PLUTA
+11A9E..11AA2;N   # Po     [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
+11AB0..11ABF;N   # Lo    [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
+11AC0..11AF8;N   # Lo    [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
+11B00..11B09;N   # Po    [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
+11C00..11C08;N   # Lo     [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
+11C0A..11C2E;N   # Lo    [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
+11C2F;N          # Mc         BHAIKSUKI VOWEL SIGN AA
+11C30..11C36;N   # Mn     [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L
+11C38..11C3D;N   # Mn     [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA
+11C3E;N          # Mc         BHAIKSUKI SIGN VISARGA
+11C3F;N          # Mn         BHAIKSUKI SIGN VIRAMA
+11C40;N          # Lo         BHAIKSUKI SIGN AVAGRAHA
+11C41..11C45;N   # Po     [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2
+11C50..11C59;N   # Nd    [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
+11C5A..11C6C;N   # No    [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK
+11C70..11C71;N   # Po     [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD
+11C72..11C8F;N   # Lo    [30] MARCHEN LETTER KA..MARCHEN LETTER A
+11C92..11CA7;N   # Mn    [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA
+11CA9;N          # Mc         MARCHEN SUBJOINED LETTER YA
+11CAA..11CB0;N   # Mn     [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA
+11CB1;N          # Mc         MARCHEN VOWEL SIGN I
+11CB2..11CB3;N   # Mn     [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E
+11CB4;N          # Mc         MARCHEN VOWEL SIGN O
+11CB5..11CB6;N   # Mn     [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU
+11D00..11D06;N   # Lo     [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E
+11D08..11D09;N   # Lo     [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O
+11D0B..11D30;N   # Lo    [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA
+11D31..11D36;N   # Mn     [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R
+11D3A;N          # Mn         MASARAM GONDI VOWEL SIGN E
+11D3C..11D3D;N   # Mn     [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O
+11D3F..11D45;N   # Mn     [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA
+11D46;N          # Lo         MASARAM GONDI REPHA
+11D47;N          # Mn         MASARAM GONDI RA-KARA
+11D50..11D59;N   # Nd    [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
+11D60..11D65;N   # Lo     [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU
+11D67..11D68;N   # Lo     [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI
+11D6A..11D89;N   # Lo    [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
+11D8A..11D8E;N   # Mc     [5] GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU
+11D90..11D91;N   # Mn     [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI
+11D93..11D94;N   # Mc     [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU
+11D95;N          # Mn         GUNJALA GONDI SIGN ANUSVARA
+11D96;N          # Mc         GUNJALA GONDI SIGN VISARGA
+11D97;N          # Mn         GUNJALA GONDI VIRAMA
+11D98;N          # Lo         GUNJALA GONDI OM
+11DA0..11DA9;N   # Nd    [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
+11EE0..11EF2;N   # Lo    [19] MAKASAR LETTER KA..MAKASAR ANGKA
+11EF3..11EF4;N   # Mn     [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
+11EF5..11EF6;N   # Mc     [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
+11EF7..11EF8;N   # Po     [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
+11F00..11F01;N   # Mn     [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
+11F02;N          # Lo         KAWI SIGN REPHA
+11F03;N          # Mc         KAWI SIGN VISARGA
+11F04..11F10;N   # Lo    [13] KAWI LETTER A..KAWI LETTER O
+11F12..11F33;N   # Lo    [34] KAWI LETTER KA..KAWI LETTER JNYA
+11F34..11F35;N   # Mc     [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
+11F36..11F3A;N   # Mn     [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
+11F3E..11F3F;N   # Mc     [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
+11F40;N          # Mn         KAWI VOWEL SIGN EU
+11F41;N          # Mc         KAWI SIGN KILLER
+11F42;N          # Mn         KAWI CONJOINER
+11F43..11F4F;N   # Po    [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
+11F50..11F59;N   # Nd    [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
+11FB0;N          # Lo         LISU LETTER YHA
+11FC0..11FD4;N   # No    [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
+11FD5..11FDC;N   # So     [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
+11FDD..11FE0;N   # Sc     [4] TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN
+11FE1..11FF1;N   # So    [17] TAMIL SIGN PAARAM..TAMIL SIGN VAKAIYARAA
+11FFF;N          # Po         TAMIL PUNCTUATION END OF TEXT
+12000..12399;N   # Lo   [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
+12400..1246E;N   # Nl   [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
+12470..12474;N   # Po     [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
+12480..12543;N   # Lo   [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
+12F90..12FF0;N   # Lo    [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
+12FF1..12FF2;N   # Po     [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
+13000..1342F;N   # Lo  [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
+13430..13440;N   # Cf    [17] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
+13441..13446;N   # Lo     [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
+13447..13455;N   # Mn    [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
+14400..14646;N   # Lo   [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
+16800..16A38;N   # Lo   [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
+16A40..16A5E;N   # Lo    [31] MRO LETTER TA..MRO LETTER TEK
+16A60..16A69;N   # Nd    [10] MRO DIGIT ZERO..MRO DIGIT NINE
+16A6E..16A6F;N   # Po     [2] MRO DANDA..MRO DOUBLE DANDA
+16A70..16ABE;N   # Lo    [79] TANGSA LETTER OZ..TANGSA LETTER ZA
+16AC0..16AC9;N   # Nd    [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
+16AD0..16AED;N   # Lo    [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
+16AF0..16AF4;N   # Mn     [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
+16AF5;N          # Po         BASSA VAH FULL STOP
+16B00..16B2F;N   # Lo    [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
+16B30..16B36;N   # Mn     [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
+16B37..16B3B;N   # Po     [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
+16B3C..16B3F;N   # So     [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
+16B40..16B43;N   # Lm     [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
+16B44;N          # Po         PAHAWH HMONG SIGN XAUS
+16B45;N          # So         PAHAWH HMONG SIGN CIM TSOV ROG
+16B50..16B59;N   # Nd    [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
+16B5B..16B61;N   # No     [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS
+16B63..16B77;N   # Lo    [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
+16B7D..16B8F;N   # Lo    [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ
+16E40..16E7F;N   # L&    [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y
+16E80..16E96;N   # No    [23] MEDEFAIDRIN DIGIT ZERO..MEDEFAIDRIN DIGIT THREE ALTERNATE FORM
+16E97..16E9A;N   # Po     [4] MEDEFAIDRIN COMMA..MEDEFAIDRIN EXCLAMATION OH
+16F00..16F4A;N   # Lo    [75] MIAO LETTER PA..MIAO LETTER RTE
+16F4F;N          # Mn         MIAO SIGN CONSONANT MODIFIER BAR
+16F50;N          # Lo         MIAO LETTER NASALIZATION
+16F51..16F87;N   # Mc    [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
+16F8F..16F92;N   # Mn     [4] MIAO TONE RIGHT..MIAO TONE BELOW
+16F93..16F9F;N   # Lm    [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
+16FE0..16FE1;W   # Lm     [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
+16FE2;W          # Po         OLD CHINESE HOOK MARK
+16FE3;W          # Lm         OLD CHINESE ITERATION MARK
+16FE4;W          # Mn         KHITAN SMALL SCRIPT FILLER
+16FF0..16FF1;W   # Mc     [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
+17000..187F7;W   # Lo  [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
+18800..18AFF;W   # Lo   [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768
+18B00..18CD5;W   # Lo   [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5
+18D00..18D08;W   # Lo     [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
+1AFF0..1AFF3;W   # Lm     [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
+1AFF5..1AFFB;W   # Lm     [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
+1AFFD..1AFFE;W   # Lm     [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
+1B000..1B0FF;W   # Lo   [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
+1B100..1B122;W   # Lo    [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
+1B132;W          # Lo         HIRAGANA LETTER SMALL KO
+1B150..1B152;W   # Lo     [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+1B155;W          # Lo         KATAKANA LETTER SMALL KO
+1B164..1B167;W   # Lo     [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
+1B170..1B2FB;W   # Lo   [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
+1BC00..1BC6A;N   # Lo   [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
+1BC70..1BC7C;N   # Lo    [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
+1BC80..1BC88;N   # Lo     [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
+1BC90..1BC99;N   # Lo    [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
+1BC9C;N          # So         DUPLOYAN SIGN O WITH CROSS
+1BC9D..1BC9E;N   # Mn     [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
+1BC9F;N          # Po         DUPLOYAN PUNCTUATION CHINOOK FULL STOP
+1BCA0..1BCA3;N   # Cf     [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
+1CF00..1CF2D;N   # Mn    [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT
+1CF30..1CF46;N   # Mn    [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG
+1CF50..1CFC3;N   # So   [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK
+1D000..1D0F5;N   # So   [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
+1D100..1D126;N   # So    [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
+1D129..1D164;N   # So    [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
+1D165..1D166;N   # Mc     [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
+1D167..1D169;N   # Mn     [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
+1D16A..1D16C;N   # So     [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3
+1D16D..1D172;N   # Mc     [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5
+1D173..1D17A;N   # Cf     [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
+1D17B..1D182;N   # Mn     [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
+1D183..1D184;N   # So     [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN
+1D185..1D18B;N   # Mn     [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
+1D18C..1D1A9;N   # So    [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH
+1D1AA..1D1AD;N   # Mn     [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
+1D1AE..1D1EA;N   # So    [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
+1D200..1D241;N   # So    [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
+1D242..1D244;N   # Mn     [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
+1D245;N          # So         GREEK MUSICAL LEIMMA
+1D2C0..1D2D3;N   # No    [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
+1D2E0..1D2F3;N   # No    [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
+1D300..1D356;N   # So    [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
+1D360..1D378;N   # No    [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
+1D400..1D454;N   # L&    [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G
+1D456..1D49C;N   # L&    [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A
+1D49E..1D49F;N   # Lu     [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D
+1D4A2;N          # Lu         MATHEMATICAL SCRIPT CAPITAL G
+1D4A5..1D4A6;N   # Lu     [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K
+1D4A9..1D4AC;N   # Lu     [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q
+1D4AE..1D4B9;N   # L&    [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D
+1D4BB;N          # Ll         MATHEMATICAL SCRIPT SMALL F
+1D4BD..1D4C3;N   # Ll     [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N
+1D4C5..1D505;N   # L&    [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B
+1D507..1D50A;N   # Lu     [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G
+1D50D..1D514;N   # Lu     [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q
+1D516..1D51C;N   # Lu     [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y
+1D51E..1D539;N   # L&    [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B
+1D53B..1D53E;N   # Lu     [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G
+1D540..1D544;N   # Lu     [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M
+1D546;N          # Lu         MATHEMATICAL DOUBLE-STRUCK CAPITAL O
+1D54A..1D550;N   # Lu     [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y
+1D552..1D6A5;N   # L&   [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J
+1D6A8..1D6C0;N   # Lu    [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA
+1D6C1;N          # Sm         MATHEMATICAL BOLD NABLA
+1D6C2..1D6DA;N   # Ll    [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA
+1D6DB;N          # Sm         MATHEMATICAL BOLD PARTIAL DIFFERENTIAL
+1D6DC..1D6FA;N   # L&    [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA
+1D6FB;N          # Sm         MATHEMATICAL ITALIC NABLA
+1D6FC..1D714;N   # Ll    [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA
+1D715;N          # Sm         MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL
+1D716..1D734;N   # L&    [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA
+1D735;N          # Sm         MATHEMATICAL BOLD ITALIC NABLA
+1D736..1D74E;N   # Ll    [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA
+1D74F;N          # Sm         MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL
+1D750..1D76E;N   # L&    [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA
+1D76F;N          # Sm         MATHEMATICAL SANS-SERIF BOLD NABLA
+1D770..1D788;N   # Ll    [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA
+1D789;N          # Sm         MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL
+1D78A..1D7A8;N   # L&    [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA
+1D7A9;N          # Sm         MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA
+1D7AA..1D7C2;N   # Ll    [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
+1D7C3;N          # Sm         MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
+1D7C4..1D7CB;N   # L&     [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
+1D7CE..1D7FF;N   # Nd    [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
+1D800..1D9FF;N   # So   [512] SIGNWRITING HAND-FIST INDEX..SIGNWRITING HEAD
+1DA00..1DA36;N   # Mn    [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN
+1DA37..1DA3A;N   # So     [4] SIGNWRITING AIR BLOW SMALL ROTATIONS..SIGNWRITING BREATH EXHALE
+1DA3B..1DA6C;N   # Mn    [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT
+1DA6D..1DA74;N   # So     [8] SIGNWRITING SHOULDER HIP SPINE..SIGNWRITING TORSO-FLOORPLANE TWISTING
+1DA75;N          # Mn         SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS
+1DA76..1DA83;N   # So    [14] SIGNWRITING LIMB COMBINATION..SIGNWRITING LOCATION DEPTH
+1DA84;N          # Mn         SIGNWRITING LOCATION HEAD NECK
+1DA85..1DA86;N   # So     [2] SIGNWRITING LOCATION TORSO..SIGNWRITING LOCATION LIMBS DIGITS
+1DA87..1DA8B;N   # Po     [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS
+1DA9B..1DA9F;N   # Mn     [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6
+1DAA1..1DAAF;N   # Mn    [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16
+1DF00..1DF09;N   # Ll    [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
+1DF0A;N          # Lo         LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
+1DF0B..1DF1E;N   # Ll    [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
+1DF25..1DF2A;N   # Ll     [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
+1E000..1E006;N   # Mn     [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
+1E008..1E018;N   # Mn    [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
+1E01B..1E021;N   # Mn     [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
+1E023..1E024;N   # Mn     [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
+1E026..1E02A;N   # Mn     [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
+1E030..1E06D;N   # Lm    [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
+1E08F;N          # Mn         COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
+1E100..1E12C;N   # Lo    [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
+1E130..1E136;N   # Mn     [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
+1E137..1E13D;N   # Lm     [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
+1E140..1E149;N   # Nd    [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
+1E14E;N          # Lo         NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
+1E14F;N          # So         NYIAKENG PUACHUE HMONG CIRCLED CA
+1E290..1E2AD;N   # Lo    [30] TOTO LETTER PA..TOTO LETTER A
+1E2AE;N          # Mn         TOTO SIGN RISING TONE
+1E2C0..1E2EB;N   # Lo    [44] WANCHO LETTER AA..WANCHO LETTER YIH
+1E2EC..1E2EF;N   # Mn     [4] WANCHO TONE TUP..WANCHO TONE KOINI
+1E2F0..1E2F9;N   # Nd    [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
+1E2FF;N          # Sc         WANCHO NGUN SIGN
+1E4D0..1E4EA;N   # Lo    [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
+1E4EB;N          # Lm         NAG MUNDARI SIGN OJOD
+1E4EC..1E4EF;N   # Mn     [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
+1E4F0..1E4F9;N   # Nd    [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
+1E7E0..1E7E6;N   # Lo     [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
+1E7E8..1E7EB;N   # Lo     [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
+1E7ED..1E7EE;N   # Lo     [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
+1E7F0..1E7FE;N   # Lo    [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE
+1E800..1E8C4;N   # Lo   [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
+1E8C7..1E8CF;N   # No     [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE
+1E8D0..1E8D6;N   # Mn     [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
+1E900..1E943;N   # L&    [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
+1E944..1E94A;N   # Mn     [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
+1E94B;N          # Lm         ADLAM NASALIZATION MARK
+1E950..1E959;N   # Nd    [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
+1E95E..1E95F;N   # Po     [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK
+1EC71..1ECAB;N   # No    [59] INDIC SIYAQ NUMBER ONE..INDIC SIYAQ NUMBER PREFIXED NINE
+1ECAC;N          # So         INDIC SIYAQ PLACEHOLDER
+1ECAD..1ECAF;N   # No     [3] INDIC SIYAQ FRACTION ONE QUARTER..INDIC SIYAQ FRACTION THREE QUARTERS
+1ECB0;N          # Sc         INDIC SIYAQ RUPEE MARK
+1ECB1..1ECB4;N   # No     [4] INDIC SIYAQ NUMBER ALTERNATE ONE..INDIC SIYAQ ALTERNATE LAKH MARK
+1ED01..1ED2D;N   # No    [45] OTTOMAN SIYAQ NUMBER ONE..OTTOMAN SIYAQ NUMBER NINETY THOUSAND
+1ED2E;N          # So         OTTOMAN SIYAQ MARRATAN
+1ED2F..1ED3D;N   # No    [15] OTTOMAN SIYAQ ALTERNATE NUMBER TWO..OTTOMAN SIYAQ FRACTION ONE SIXTH
+1EE00..1EE03;N   # Lo     [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL
+1EE05..1EE1F;N   # Lo    [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF
+1EE21..1EE22;N   # Lo     [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM
+1EE24;N          # Lo         ARABIC MATHEMATICAL INITIAL HEH
+1EE27;N          # Lo         ARABIC MATHEMATICAL INITIAL HAH
+1EE29..1EE32;N   # Lo    [10] ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF
+1EE34..1EE37;N   # Lo     [4] ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH
+1EE39;N          # Lo         ARABIC MATHEMATICAL INITIAL DAD
+1EE3B;N          # Lo         ARABIC MATHEMATICAL INITIAL GHAIN
+1EE42;N          # Lo         ARABIC MATHEMATICAL TAILED JEEM
+1EE47;N          # Lo         ARABIC MATHEMATICAL TAILED HAH
+1EE49;N          # Lo         ARABIC MATHEMATICAL TAILED YEH
+1EE4B;N          # Lo         ARABIC MATHEMATICAL TAILED LAM
+1EE4D..1EE4F;N   # Lo     [3] ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN
+1EE51..1EE52;N   # Lo     [2] ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF
+1EE54;N          # Lo         ARABIC MATHEMATICAL TAILED SHEEN
+1EE57;N          # Lo         ARABIC MATHEMATICAL TAILED KHAH
+1EE59;N          # Lo         ARABIC MATHEMATICAL TAILED DAD
+1EE5B;N          # Lo         ARABIC MATHEMATICAL TAILED GHAIN
+1EE5D;N          # Lo         ARABIC MATHEMATICAL TAILED DOTLESS NOON
+1EE5F;N          # Lo         ARABIC MATHEMATICAL TAILED DOTLESS QAF
+1EE61..1EE62;N   # Lo     [2] ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM
+1EE64;N          # Lo         ARABIC MATHEMATICAL STRETCHED HEH
+1EE67..1EE6A;N   # Lo     [4] ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF
+1EE6C..1EE72;N   # Lo     [7] ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF
+1EE74..1EE77;N   # Lo     [4] ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH
+1EE79..1EE7C;N   # Lo     [4] ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH
+1EE7E;N          # Lo         ARABIC MATHEMATICAL STRETCHED DOTLESS FEH
+1EE80..1EE89;N   # Lo    [10] ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH
+1EE8B..1EE9B;N   # Lo    [17] ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN
+1EEA1..1EEA3;N   # Lo     [3] ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
+1EEA5..1EEA9;N   # Lo     [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
+1EEAB..1EEBB;N   # Lo    [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
+1EEF0..1EEF1;N   # Sm     [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
+1F000..1F003;N   # So     [4] MAHJONG TILE EAST WIND..MAHJONG TILE NORTH WIND
+1F004;W          # So         MAHJONG TILE RED DRAGON
+1F005..1F02B;N   # So    [39] MAHJONG TILE GREEN DRAGON..MAHJONG TILE BACK
+1F030..1F093;N   # So   [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06
+1F0A0..1F0AE;N   # So    [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES
+1F0B1..1F0BF;N   # So    [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER
+1F0C1..1F0CE;N   # So    [14] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD KING OF DIAMONDS
+1F0CF;W          # So         PLAYING CARD BLACK JOKER
+1F0D1..1F0F5;N   # So    [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21
+1F100..1F10A;A   # No    [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA
+1F10B..1F10C;N   # No     [2] DINGBAT CIRCLED SANS-SERIF DIGIT ZERO..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
+1F10D..1F10F;N   # So     [3] CIRCLED ZERO WITH SLASH..CIRCLED DOLLAR SIGN WITH OVERLAID BACKSLASH
+1F110..1F12D;A   # So    [30] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED CD
+1F12E..1F12F;N   # So     [2] CIRCLED WZ..COPYLEFT SYMBOL
+1F130..1F169;A   # So    [58] SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
+1F16A..1F16F;N   # So     [6] RAISED MC SIGN..CIRCLED HUMAN FIGURE
+1F170..1F18D;A   # So    [30] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED SA
+1F18E;W          # So         NEGATIVE SQUARED AB
+1F18F..1F190;A   # So     [2] NEGATIVE SQUARED WC..SQUARE DJ
+1F191..1F19A;W   # So    [10] SQUARED CL..SQUARED VS
+1F19B..1F1AC;A   # So    [18] SQUARED THREE D..SQUARED VOD
+1F1AD;N          # So         MASK WORK SYMBOL
+1F1E6..1F1FF;N   # So    [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
+1F200..1F202;W   # So     [3] SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA
+1F210..1F23B;W   # So    [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D
+1F240..1F248;W   # So     [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
+1F250..1F251;W   # So     [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
+1F260..1F265;W   # So     [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
+1F300..1F320;W   # So    [33] CYCLONE..SHOOTING STAR
+1F321..1F32C;N   # So    [12] THERMOMETER..WIND BLOWING FACE
+1F32D..1F335;W   # So     [9] HOT DOG..CACTUS
+1F336;N          # So         HOT PEPPER
+1F337..1F37C;W   # So    [70] TULIP..BABY BOTTLE
+1F37D;N          # So         FORK AND KNIFE WITH PLATE
+1F37E..1F393;W   # So    [22] BOTTLE WITH POPPING CORK..GRADUATION CAP
+1F394..1F39F;N   # So    [12] HEART WITH TIP ON THE LEFT..ADMISSION TICKETS
+1F3A0..1F3CA;W   # So    [43] CAROUSEL HORSE..SWIMMER
+1F3CB..1F3CE;N   # So     [4] WEIGHT LIFTER..RACING CAR
+1F3CF..1F3D3;W   # So     [5] CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL
+1F3D4..1F3DF;N   # So    [12] SNOW CAPPED MOUNTAIN..STADIUM
+1F3E0..1F3F0;W   # So    [17] HOUSE BUILDING..EUROPEAN CASTLE
+1F3F1..1F3F3;N   # So     [3] WHITE PENNANT..WAVING WHITE FLAG
+1F3F4;W          # So         WAVING BLACK FLAG
+1F3F5..1F3F7;N   # So     [3] ROSETTE..LABEL
+1F3F8..1F3FA;W   # So     [3] BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA
+1F3FB..1F3FF;W   # Sk     [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
+1F400..1F43E;W   # So    [63] RAT..PAW PRINTS
+1F43F;N          # So         CHIPMUNK
+1F440;W          # So         EYES
+1F441;N          # So         EYE
+1F442..1F4FC;W   # So   [187] EAR..VIDEOCASSETTE
+1F4FD..1F4FE;N   # So     [2] FILM PROJECTOR..PORTABLE STEREO
+1F4FF..1F53D;W   # So    [63] PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE
+1F53E..1F54A;N   # So    [13] LOWER RIGHT SHADOWED WHITE CIRCLE..DOVE OF PEACE
+1F54B..1F54E;W   # So     [4] KAABA..MENORAH WITH NINE BRANCHES
+1F54F;N          # So         BOWL OF HYGIEIA
+1F550..1F567;W   # So    [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
+1F568..1F579;N   # So    [18] RIGHT SPEAKER..JOYSTICK
+1F57A;W          # So         MAN DANCING
+1F57B..1F594;N   # So    [26] LEFT HAND TELEPHONE RECEIVER..REVERSED VICTORY HAND
+1F595..1F596;W   # So     [2] REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS
+1F597..1F5A3;N   # So    [13] WHITE DOWN POINTING LEFT HAND INDEX..BLACK DOWN POINTING BACKHAND INDEX
+1F5A4;W          # So         BLACK HEART
+1F5A5..1F5FA;N   # So    [86] DESKTOP COMPUTER..WORLD MAP
+1F5FB..1F5FF;W   # So     [5] MOUNT FUJI..MOYAI
+1F600..1F64F;W   # So    [80] GRINNING FACE..PERSON WITH FOLDED HANDS
+1F650..1F67F;N   # So    [48] NORTH WEST POINTING LEAF..REVERSE CHECKER BOARD
+1F680..1F6C5;W   # So    [70] ROCKET..LEFT LUGGAGE
+1F6C6..1F6CB;N   # So     [6] TRIANGLE WITH ROUNDED CORNERS..COUCH AND LAMP
+1F6CC;W          # So         SLEEPING ACCOMMODATION
+1F6CD..1F6CF;N   # So     [3] SHOPPING BAGS..BED
+1F6D0..1F6D2;W   # So     [3] PLACE OF WORSHIP..SHOPPING TROLLEY
+1F6D3..1F6D4;N   # So     [2] STUPA..PAGODA
+1F6D5..1F6D7;W   # So     [3] HINDU TEMPLE..ELEVATOR
+1F6DC..1F6DF;W   # So     [4] WIRELESS..RING BUOY
+1F6E0..1F6EA;N   # So    [11] HAMMER AND WRENCH..NORTHEAST-POINTING AIRPLANE
+1F6EB..1F6EC;W   # So     [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
+1F6F0..1F6F3;N   # So     [4] SATELLITE..PASSENGER SHIP
+1F6F4..1F6FC;W   # So     [9] SCOOTER..ROLLER SKATE
+1F700..1F776;N   # So   [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE
+1F77B..1F77F;N   # So     [5] HAUMEA..ORCUS
+1F780..1F7D9;N   # So    [90] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NINE POINTED WHITE STAR
+1F7E0..1F7EB;W   # So    [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
+1F7F0;W          # So         HEAVY EQUALS SIGN
+1F800..1F80B;N   # So    [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
+1F810..1F847;N   # So    [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
+1F850..1F859;N   # So    [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
+1F860..1F887;N   # So    [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
+1F890..1F8AD;N   # So    [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
+1F8B0..1F8B1;N   # So     [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
+1F900..1F90B;N   # So    [12] CIRCLED CROSS FORMEE WITH FOUR DOTS..DOWNWARD FACING NOTCHED HOOK WITH DOT
+1F90C..1F93A;W   # So    [47] PINCHED FINGERS..FENCER
+1F93B;N          # So         MODERN PENTATHLON
+1F93C..1F945;W   # So    [10] WRESTLERS..GOAL NET
+1F946;N          # So         RIFLE
+1F947..1F9FF;W   # So   [185] FIRST PLACE MEDAL..NAZAR AMULET
+1FA00..1FA53;N   # So    [84] NEUTRAL CHESS KING..BLACK CHESS KNIGHT-BISHOP
+1FA60..1FA6D;N   # So    [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
+1FA70..1FA7C;W   # So    [13] BALLET SHOES..CRUTCH
+1FA80..1FA88;W   # So     [9] YO-YO..FLUTE
+1FA90..1FABD;W   # So    [46] RINGED PLANET..WING
+1FABE;W          # Cn         <reserved-1FABE>
+1FABF..1FAC5;W   # So     [7] GOOSE..PERSON WITH CROWN
+1FACE..1FADB;W   # So    [14] MOOSE..PEA POD
+1FAE0..1FAE8;W   # So     [9] MELTING FACE..SHAKING FACE
+1FAF0..1FAF8;W   # So     [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
+1FB00..1FB92;N   # So   [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
+1FB94..1FBCA;N   # So    [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
+1FBF0..1FBF9;N   # Nd    [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
+20000..2A6DF;W   # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
+2A6E0..2A6FF;W   # Cn    [32] <reserved-2A6E0>..<reserved-2A6FF>
+2A700..2B738;W   # Lo  [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
+2B739..2B73F;W   # Cn     [7] <reserved-2B739>..<reserved-2B73F>
+2B740..2B81D;W   # Lo   [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
+2B81E..2B81F;W   # Cn     [2] <reserved-2B81E>..<reserved-2B81F>
+2B820..2CEA1;W   # Lo  [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
+2CEA2..2CEAF;W   # Cn    [14] <reserved-2CEA2>..<reserved-2CEAF>
+2CEB0..2EBE0;W   # Lo  [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
+2EBE1..2F7FF;W   # Cn  [3103] <reserved-2EBE1>..<reserved-2F7FF>
+2F800..2FA1D;W   # Lo   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
+2FA1E..2FA1F;W   # Cn     [2] <reserved-2FA1E>..<reserved-2FA1F>
+2FA20..2FFFD;W   # Cn  [1502] <reserved-2FA20>..<reserved-2FFFD>
+30000..3134A;W   # Lo  [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
+3134B..3134F;W   # Cn     [5] <reserved-3134B>..<reserved-3134F>
+31350..323AF;W   # Lo  [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
+323B0..3FFFD;W   # Cn [56398] <reserved-323B0>..<reserved-3FFFD>
+E0001;N          # Cf         LANGUAGE TAG
+E0020..E007F;N   # Cf    [96] TAG SPACE..CANCEL TAG
+E0100..E01EF;A   # Mn   [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
+F0000..FFFFD;A   # Co [65534] <private-use-F0000>..<private-use-FFFFD>
+100000..10FFFD;A # Co [65534] <private-use-100000>..<private-use-10FFFD>
 
 # EOF
diff --git a/test/libc/proc/execve_test_prog2.c b/libc/str/freelocale.c
similarity index 91%
rename from test/libc/proc/execve_test_prog2.c
rename to libc/str/freelocale.c
index 2369ec9c9..eba6fad5c 100644
--- a/test/libc/proc/execve_test_prog2.c
+++ b/libc/str/freelocale.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,8 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/proc/proc.h"
-#include "libc/testlib/testlib.h"
+#include "libc/str/locale.h"
 
-int main(int argc, char *argv[]) {
+void freelocale(locale_t l) {
+  // TODO: implement me
 }
diff --git a/libc/str/getx86processormodel.c b/libc/str/getx86processormodel.c
index 9746cc4be..3ac7c77fe 100644
--- a/libc/str/getx86processormodel.c
+++ b/libc/str/getx86processormodel.c
@@ -16,6 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/mem/bisect.internal.h"
 #include "libc/nexgen32e/x86info.h"
 
 static int CmpX86ProcModelKey(const struct X86ProcessorModel *a,
@@ -31,8 +32,7 @@ static int CmpX86ProcModelKey(const struct X86ProcessorModel *a,
  * @see https://a4lg.com/tech/x86/database/x86-families-and-models.en.html
  */
 const struct X86ProcessorModel *getx86processormodel(short key) {
-  for (int i = 0; kX86ProcessorModels[i].key; ++i)
-    if (kX86ProcessorModels[i].key == key)
-      return &kX86ProcessorModels[i];
-  return 0;
+  return bisect(&(struct X86ProcessorModel){key}, kX86ProcessorModels,
+                kX86ProcessorModelCount, sizeof(struct X86ProcessorModel),
+                (void *)CmpX86ProcModelKey, NULL);
 }
diff --git a/libc/str/getzipcdircomment.c b/libc/str/getzipcdircomment.c
index 85d8d8ce7..35ca8b38f 100644
--- a/libc/str/getzipcdircomment.c
+++ b/libc/str/getzipcdircomment.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns comment of zip central directory.
diff --git a/libc/str/getzipcdircommentsize.c b/libc/str/getzipcdircommentsize.c
index c4fbc77d8..ac83f995b 100644
--- a/libc/str/getzipcdircommentsize.c
+++ b/libc/str/getzipcdircommentsize.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns comment of zip central directory.
diff --git a/libc/str/getzipcdiroffset.c b/libc/str/getzipcdiroffset.c
index 0aaefb03d..6698290ce 100644
--- a/libc/str/getzipcdiroffset.c
+++ b/libc/str/getzipcdiroffset.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns offset of zip central directory.
diff --git a/libc/str/getzipcdirrecords.c b/libc/str/getzipcdirrecords.c
index f127dd299..20f0cef59 100644
--- a/libc/str/getzipcdirrecords.c
+++ b/libc/str/getzipcdirrecords.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns number of records in zip central directory.
diff --git a/libc/str/getzipcdirsize.c b/libc/str/getzipcdirsize.c
index c8a1e3da9..3ea16c201 100644
--- a/libc/str/getzipcdirsize.c
+++ b/libc/str/getzipcdirsize.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns size of zip central directory.
diff --git a/libc/str/getzipcfilecompressedsize.c b/libc/str/getzipcfilecompressedsize.c
index 96a492e84..f2c26ba36 100644
--- a/libc/str/getzipcfilecompressedsize.c
+++ b/libc/str/getzipcfilecompressedsize.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns compressed size in bytes from zip central directory header.
diff --git a/libc/str/getzipcfilemode.c b/libc/str/getzipcfilemode.c
index d73e708a4..f4a944301 100644
--- a/libc/str/getzipcfilemode.c
+++ b/libc/str/getzipcfilemode.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/nt/enum/fileflagandattributes.h"
 #include "libc/sysv/consts/s.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 static int ConvertWindowsToUnixMode(int x) {
   int m;
diff --git a/libc/str/getzipcfileoffset.c b/libc/str/getzipcfileoffset.c
index 66726cea8..5a18f6a92 100644
--- a/libc/str/getzipcfileoffset.c
+++ b/libc/str/getzipcfileoffset.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns offset of local file header.
diff --git a/libc/str/getzipcfiletimestamps.c b/libc/str/getzipcfiletimestamps.c
index 4d246c4ab..0a06b6992 100644
--- a/libc/str/getzipcfiletimestamps.c
+++ b/libc/str/getzipcfiletimestamps.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/wintime.internal.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 static inline int pop(int x) {
   return !!(x & 1) + !!(x & 2) + !!(x & 4);
diff --git a/libc/str/getzipcfileuncompressedsize.c b/libc/str/getzipcfileuncompressedsize.c
index d129b55e5..3a38bd29e 100644
--- a/libc/str/getzipcfileuncompressedsize.c
+++ b/libc/str/getzipcfileuncompressedsize.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns uncompressed size in bytes from zip central directory header.
diff --git a/libc/str/getzipeocd.c b/libc/str/getzipeocd.c
index 86c1cc1e7..ee862cc19 100644
--- a/libc/str/getzipeocd.c
+++ b/libc/str/getzipeocd.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 typedef char v16qi __attribute__((__vector_size__(16)));
 typedef short v8hi __attribute__((__vector_size__(16)));
diff --git a/libc/str/getziplfilecompressedsize.c b/libc/str/getziplfilecompressedsize.c
index 5147c8934..26403d300 100644
--- a/libc/str/getziplfilecompressedsize.c
+++ b/libc/str/getziplfilecompressedsize.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns compressed size in bytes from zip local file header.
diff --git a/libc/str/getziplfileuncompressedsize.c b/libc/str/getziplfileuncompressedsize.c
index e76b74bc9..86a6b3621 100644
--- a/libc/str/getziplfileuncompressedsize.c
+++ b/libc/str/getziplfileuncompressedsize.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns uncompressed size in bytes from zip local file header.
diff --git a/libc/str/has_char.h b/libc/str/has_char.h
deleted file mode 100644
index 64c5a4763..000000000
--- a/libc/str/has_char.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// -*- c++ -*-
-#ifndef COSMOPOLITAN_LIBC_STR_HAS_CHAR_H_
-#define COSMOPOLITAN_LIBC_STR_HAS_CHAR_H_
-#ifdef __cplusplus
-
-template <typename T>
-static bool has_char(const T (*ranges)[2], size_t n, T c) {
-  unsigned l = 0;
-  unsigned r = n;
-  while (l < r) {
-    unsigned m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
-    if (c < ranges[m][0]) {
-      r = m;
-    } else if (c > ranges[m][1]) {
-      l = m + 1;
-    } else {
-      return true;
-    }
-  }
-  return false;
-}
-
-#endif /* __cplusplus */
-#endif /* COSMOPOLITAN_LIBC_STR_HAS_CHAR_H_ */
diff --git a/third_party/musl/iswalnum.c b/libc/str/iswalnum.c
similarity index 100%
rename from third_party/musl/iswalnum.c
rename to libc/str/iswalnum.c
diff --git a/libc/str/iswalpha.c b/libc/str/iswalpha.c
new file mode 100644
index 000000000..573392d42
--- /dev/null
+++ b/libc/str/iswalpha.c
@@ -0,0 +1,28 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/wctype.h"
+
+/**
+ * Returns nonzero if c is alphabetical.
+ */
+int iswalpha(wint_t c) {
+  return iswupper(c) || iswlower(c);
+}
+
+__weak_reference(iswalpha, iswalpha_l);
diff --git a/libc/str/iswcntrl.c b/libc/str/iswcntrl.c
index a8f63aba3..b67dbf854 100644
--- a/libc/str/iswcntrl.c
+++ b/libc/str/iswcntrl.c
@@ -19,15 +19,10 @@
 #include "libc/wctype.h"
 
 /**
- * Returns nonzero if `c` is control code.
- *
- * This includes C0 or C1 control codes, in addition to the "LINE
- * SEPARATOR" and "PARAGRAPH SEPARATOR" characters.
+ * Returns nonzero if c is C0 or C1 control code.
  */
 int iswcntrl(wint_t c) {
-  return (0x0000 <= c && c <= 0x001F) ||  //
-         (0x007F <= c && c <= 0x009F) ||  //
-         (0x2028 <= c && c <= 0x2029);
+  return (0x00 <= c && c <= 0x1F) || (0x7F <= c && c <= 0x9F);
 }
 
 __weak_reference(iswcntrl, iswcntrl_l);
diff --git a/third_party/musl/iswctype.c b/libc/str/iswctype.c
similarity index 98%
rename from third_party/musl/iswctype.c
rename to libc/str/iswctype.c
index 553e82820..0c2b83fab 100644
--- a/third_party/musl/iswctype.c
+++ b/libc/str/iswctype.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/wctype.h"
 
 typedef int (*isw_f)(wint_t);
diff --git a/libc/str/iswlower.c b/libc/str/iswlower.c
new file mode 100644
index 000000000..546ee379f
--- /dev/null
+++ b/libc/str/iswlower.c
@@ -0,0 +1,520 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/wctype.h"
+
+/**
+ * Returns nonzero if c is lowercase letter.
+ */
+int iswlower(wint_t c) {
+  if (c < 0200) {
+    return 'a' <= c && c <= 'z';
+  } else {
+    if (towupper(c) != c)
+      return 1;
+    switch (c) {
+      case 0x00df:  /* ß Watin */
+      case 0x0138:  /* ĸ Watin-A */
+      case 0x0149:  /* ŉ Watin-A */
+      case 0x018d:  /* ƍ Watin-B */
+      case 0x019b:  /* ƛ Watin-B */
+      case 0x01aa:  /* ƪ Watin-B */
+      case 0x01ab:  /* ƫ Watin-B */
+      case 0x01ba:  /* ƺ Watin-B */
+      case 0x01be:  /* ƾ Watin-B */
+      case 0x01f0:  /* ǰ Watin-B */
+      case 0x0221:  /* ȡ Watin-B */
+      case 0x0234:  /* ȴ Watin-B */
+      case 0x0235:  /* ȵ Watin-B */
+      case 0x0236:  /* ȶ Watin-B */
+      case 0x0237:  /* ȷ Watin-B */
+      case 0x0238:  /* ȸ Watin-B */
+      case 0x0239:  /* ȹ Watin-B */
+      case 0x0255:  /* ɕ IPA */
+      case 0x0258:  /* ɘ IPA */
+      case 0x025a:  /* ɚ IPA */
+      case 0x025d:  /* ɝ IPA */
+      case 0x025e:  /* ɞ IPA */
+      case 0x025f:  /* ɟ IPA */
+      case 0x0262:  /* ɢ IPA */
+      case 0x0264:  /* ɤ IPA */
+      case 0x0267:  /* ɧ IPA */
+      case 0x026d:  /* ɭ IPA */
+      case 0x026e:  /* ɮ IPA */
+      case 0x0270:  /* ɰ IPA */
+      case 0x0273:  /* ɳ IPA */
+      case 0x0274:  /* ɴ IPA */
+      case 0x0276:  /* ɶ IPA */
+      case 0x0277:  /* ɷ IPA */
+      case 0x0278:  /* ɸ IPA */
+      case 0x0279:  /* ɹ IPA */
+      case 0x027a:  /* ɺ IPA */
+      case 0x027b:  /* ɻ IPA */
+      case 0x027c:  /* ɼ IPA */
+      case 0x027e:  /* ɾ IPA */
+      case 0x027f:  /* ɿ IPA */
+      case 0x0281:  /* ʁ IPA */
+      case 0x0284:  /* ʄ IPA */
+      case 0x0285:  /* ʅ IPA */
+      case 0x0286:  /* ʆ IPA */
+      case 0x028d:  /* ʍ IPA */
+      case 0x028e:  /* ʎ IPA */
+      case 0x028f:  /* ʏ IPA */
+      case 0x0290:  /* ʐ IPA */
+      case 0x0291:  /* ʑ IPA */
+      case 0x0293:  /* ʓ IPA */
+      case 0x0295:  /* ʕ IPA */
+      case 0x0296:  /* ʖ IPA */
+      case 0x0297:  /* ʗ IPA */
+      case 0x0298:  /* ʘ IPA */
+      case 0x0299:  /* ʙ IPA */
+      case 0x029a:  /* ʚ IPA */
+      case 0x029b:  /* ʛ IPA */
+      case 0x029c:  /* ʜ IPA */
+      case 0x029f:  /* ʟ IPA */
+      case 0x02a0:  /* ʠ IPA */
+      case 0x02a1:  /* ʡ IPA */
+      case 0x02a2:  /* ʢ IPA */
+      case 0x02a3:  /* ʣ IPA */
+      case 0x02a4:  /* ʤ IPA */
+      case 0x02a5:  /* ʥ IPA */
+      case 0x02a6:  /* ʦ IPA */
+      case 0x02a7:  /* ʧ IPA */
+      case 0x02a8:  /* ʨ IPA */
+      case 0x02a9:  /* ʩ IPA */
+      case 0x02aa:  /* ʪ IPA */
+      case 0x02ab:  /* ʫ IPA */
+      case 0x02ac:  /* ʬ IPA */
+      case 0x02ad:  /* ʭ IPA */
+      case 0x02ae:  /* ʮ IPA */
+      case 0x02af:  /* ʯ IPA */
+      case 0x0390:  /* ΐ Greek */
+      case 0x03b0:  /* ΰ Greek */
+      case 0x03fc:  /* ϼ Greek */
+      case 0x0560:  /* ՠ Armenian */
+      case 0x0587:  /* և Armenian */
+      case 0x0588:  /* ֈ Armenian */
+      case 0x1d00:  /* ᴀ Phonetic Extensions */
+      case 0x1d01:  /* ᴁ Phonetic Extensions */
+      case 0x1d02:  /* ᴂ Phonetic Extensions */
+      case 0x1d03:  /* ᴃ Phonetic Extensions */
+      case 0x1d04:  /* ᴄ Phonetic Extensions */
+      case 0x1d05:  /* ᴅ Phonetic Extensions */
+      case 0x1d06:  /* ᴆ Phonetic Extensions */
+      case 0x1d07:  /* ᴇ Phonetic Extensions */
+      case 0x1d08:  /* ᴈ Phonetic Extensions */
+      case 0x1d09:  /* ᴉ Phonetic Extensions */
+      case 0x1d0a:  /* ᴊ Phonetic Extensions */
+      case 0x1d0b:  /* ᴋ Phonetic Extensions */
+      case 0x1d0c:  /* ᴌ Phonetic Extensions */
+      case 0x1d0d:  /* ᴍ Phonetic Extensions */
+      case 0x1d0e:  /* ᴎ Phonetic Extensions */
+      case 0x1d0f:  /* ᴏ Phonetic Extensions */
+      case 0x1d10:  /* ᴐ Phonetic Extensions */
+      case 0x1d11:  /* ᴑ Phonetic Extensions */
+      case 0x1d12:  /* ᴒ Phonetic Extensions */
+      case 0x1d13:  /* ᴓ Phonetic Extensions */
+      case 0x1d14:  /* ᴔ Phonetic Extensions */
+      case 0x1d15:  /* ᴕ Phonetic Extensions */
+      case 0x1d16:  /* ᴖ Phonetic Extensions */
+      case 0x1d17:  /* ᴗ Phonetic Extensions */
+      case 0x1d18:  /* ᴘ Phonetic Extensions */
+      case 0x1d19:  /* ᴙ Phonetic Extensions */
+      case 0x1d1a:  /* ᴚ Phonetic Extensions */
+      case 0x1d1b:  /* ᴛ Phonetic Extensions */
+      case 0x1d1c:  /* ᴜ Phonetic Extensions */
+      case 0x1d1d:  /* ᴝ Phonetic Extensions */
+      case 0x1d1e:  /* ᴞ Phonetic Extensions */
+      case 0x1d1f:  /* ᴟ Phonetic Extensions */
+      case 0x1d20:  /* ᴠ Phonetic Extensions */
+      case 0x1d21:  /* ᴡ Phonetic Extensions */
+      case 0x1d22:  /* ᴢ Phonetic Extensions */
+      case 0x1d23:  /* ᴣ Phonetic Extensions */
+      case 0x1d24:  /* ᴤ Phonetic Extensions */
+      case 0x1d25:  /* ᴥ Phonetic Extensions */
+      case 0x1d26:  /* ᴦ Phonetic Extensions */
+      case 0x1d27:  /* ᴧ Phonetic Extensions */
+      case 0x1d28:  /* ᴨ Phonetic Extensions */
+      case 0x1d29:  /* ᴩ Phonetic Extensions */
+      case 0x1d2a:  /* ᴪ Phonetic Extensions */
+      case 0x1d2b:  /* ᴫ Phonetic Extensions */
+      case 0x1d6b:  /* ᵫ Phonetic Extensions */
+      case 0x1d6c:  /* ᵬ Phonetic Extensions */
+      case 0x1d6d:  /* ᵭ Phonetic Extensions */
+      case 0x1d6e:  /* ᵮ Phonetic Extensions */
+      case 0x1d6f:  /* ᵯ Phonetic Extensions */
+      case 0x1d70:  /* ᵰ Phonetic Extensions */
+      case 0x1d71:  /* ᵱ Phonetic Extensions */
+      case 0x1d72:  /* ᵲ Phonetic Extensions */
+      case 0x1d73:  /* ᵳ Phonetic Extensions */
+      case 0x1d74:  /* ᵴ Phonetic Extensions */
+      case 0x1d75:  /* ᵵ Phonetic Extensions */
+      case 0x1d76:  /* ᵶ Phonetic Extensions */
+      case 0x1d77:  /* ᵷ Phonetic Extensions */
+      case 0x1d7a:  /* ᵺ Phonetic Extensions */
+      case 0x1d7b:  /* ᵻ Phonetic Extensions */
+      case 0x1d7c:  /* ᵼ Phonetic Extensions */
+      case 0x1d7e:  /* ᵾ Phonetic Extensions */
+      case 0x1d7f:  /* ᵿ Phonetic Extensions */
+      case 0x1d80:  /* . Phonetic Extensions Supplement */
+      case 0x1d81:  /* . Phonetic Extensions Supplement */
+      case 0x1d82:  /* . Phonetic Extensions Supplement */
+      case 0x1d83:  /* . Phonetic Extensions Supplement */
+      case 0x1d84:  /* . Phonetic Extensions Supplement */
+      case 0x1d85:  /* . Phonetic Extensions Supplement */
+      case 0x1d86:  /* . Phonetic Extensions Supplement */
+      case 0x1d87:  /* . Phonetic Extensions Supplement */
+      case 0x1d88:  /* . Phonetic Extensions Supplement */
+      case 0x1d89:  /* . Phonetic Extensions Supplement */
+      case 0x1d8a:  /* . Phonetic Extensions Supplement */
+      case 0x1d8b:  /* . Phonetic Extensions Supplement */
+      case 0x1d8c:  /* . Phonetic Extensions Supplement */
+      case 0x1d8d:  /* . Phonetic Extensions Supplement */
+      case 0x1d8f:  /* . Phonetic Extensions Supplement */
+      case 0x1d90:  /* . Phonetic Extensions Supplement */
+      case 0x1d91:  /* . Phonetic Extensions Supplement */
+      case 0x1d92:  /* . Phonetic Extensions Supplement */
+      case 0x1d93:  /* . Phonetic Extensions Supplement */
+      case 0x1d94:  /* . Phonetic Extensions Supplement */
+      case 0x1d95:  /* . Phonetic Extensions Supplement */
+      case 0x1d96:  /* . Phonetic Extensions Supplement */
+      case 0x1d97:  /* . Phonetic Extensions Supplement */
+      case 0x1d98:  /* . Phonetic Extensions Supplement */
+      case 0x1d99:  /* . Phonetic Extensions Supplement */
+      case 0x1d9a:  /* . Phonetic Extensions Supplement */
+      case 0x1e96:  /* ẖ Watin-C */
+      case 0x1e97:  /* ẗ Watin-C */
+      case 0x1e98:  /* ẘ Watin-C */
+      case 0x1e99:  /* ẙ Watin-C */
+      case 0x1e9a:  /* ẚ Watin-C */
+      case 0x1e9c:  /* ẜ Watin-C */
+      case 0x1e9d:  /* ẝ Watin-C */
+      case 0x1e9f:  /* ẟ Watin-C */
+      case 0x1f50:  /* ὐ Greek2 */
+      case 0x1f52:  /* ὒ Greek2 */
+      case 0x1f54:  /* ὔ Greek2 */
+      case 0x1f56:  /* ὖ Greek2 */
+      case 0x1fb2:  /* ᾲ Greek2 */
+      case 0x1fb4:  /* ᾴ Greek2 */
+      case 0x1fb6:  /* ᾶ Greek2 */
+      case 0x1fb7:  /* ᾷ Greek2 */
+      case 0x1fc2:  /* ῂ Greek2 */
+      case 0x1fc4:  /* ῄ Greek2 */
+      case 0x1fc6:  /* ῆ Greek2 */
+      case 0x1fc7:  /* ῇ Greek2 */
+      case 0x1fd2:  /* ῒ Greek2 */
+      case 0x1fd3:  /* ΐ Greek2 */
+      case 0x1fd6:  /* ῖ Greek2 */
+      case 0x1fd7:  /* ῗ Greek2 */
+      case 0x1fe2:  /* ῢ Greek2 */
+      case 0x1fe3:  /* ΰ Greek2 */
+      case 0x1fe4:  /* ῤ Greek2 */
+      case 0x1fe6:  /* ῦ Greek2 */
+      case 0x1fe7:  /* ῧ Greek2 */
+      case 0x1ff2:  /* ῲ Greek2 */
+      case 0x1ff4:  /* ῴ Greek2 */
+      case 0x1ff6:  /* ῶ Greek2 */
+      case 0x1ff7:  /* ῷ Greek2 */
+      case 0x210a:  /* ℊ Letterlike */
+      case 0x210e:  /* ℎ Letterlike */
+      case 0x210f:  /* ℏ Letterlike */
+      case 0x2113:  /* ℓ Letterlike */
+      case 0x212f:  /* ℯ Letterlike */
+      case 0x2134:  /* ℴ Letterlike */
+      case 0x2139:  /* ℹ Letterlike */
+      case 0x213c:  /* ℼ Letterlike */
+      case 0x213d:  /* ℽ Letterlike */
+      case 0x2146:  /* ⅆ Letterlike */
+      case 0x2147:  /* ⅇ Letterlike */
+      case 0x2148:  /* ⅈ Letterlike */
+      case 0x2149:  /* ⅉ Letterlike */
+      case 0x2c71:  /* . Watin-D */
+      case 0x2c74:  /* . Watin-D */
+      case 0x2c77:  /* . Watin-D */
+      case 0x2c78:  /* . Watin-D */
+      case 0x2c79:  /* . Watin-D */
+      case 0x2c7a:  /* . Watin-D */
+      case 0x2c7b:  /* . Watin-D */
+      case 0x2ce4:  /* . Coptic */
+      case 0xa730:  /* . Latin Extended-D */
+      case 0xa731:  /* . Latin Extended-D */
+      case 0xa771:  /* . Latin Extended-D */
+      case 0xa772:  /* . Latin Extended-D */
+      case 0xa773:  /* . Latin Extended-D */
+      case 0xa774:  /* . Latin Extended-D */
+      case 0xa775:  /* . Latin Extended-D */
+      case 0xa776:  /* . Latin Extended-D */
+      case 0xa777:  /* . Latin Extended-D */
+      case 0xa778:  /* . Latin Extended-D */
+      case 0xa78e:  /* . Latin Extended-D */
+      case 0xa795:  /* . Latin Extended-D */
+      case 0xa7af:  /* . Latin Extended-D */
+      case 0xa7fa:  /* . Latin Extended-D */
+      case 0xab30:  /* . Latin Extended-E */
+      case 0xab31:  /* . Latin Extended-E */
+      case 0xab32:  /* . Latin Extended-E */
+      case 0xab33:  /* . Latin Extended-E */
+      case 0xab34:  /* . Latin Extended-E */
+      case 0xab35:  /* . Latin Extended-E */
+      case 0xab36:  /* . Latin Extended-E */
+      case 0xab37:  /* . Latin Extended-E */
+      case 0xab38:  /* . Latin Extended-E */
+      case 0xab39:  /* . Latin Extended-E */
+      case 0xab3a:  /* . Latin Extended-E */
+      case 0xab3b:  /* . Latin Extended-E */
+      case 0xab3c:  /* . Latin Extended-E */
+      case 0xab3d:  /* . Latin Extended-E */
+      case 0xab3e:  /* . Latin Extended-E */
+      case 0xab3f:  /* . Latin Extended-E */
+      case 0xab40:  /* . Latin Extended-E */
+      case 0xab41:  /* . Latin Extended-E */
+      case 0xab42:  /* . Latin Extended-E */
+      case 0xab43:  /* . Latin Extended-E */
+      case 0xab44:  /* . Latin Extended-E */
+      case 0xab45:  /* . Latin Extended-E */
+      case 0xab46:  /* . Latin Extended-E */
+      case 0xab47:  /* . Latin Extended-E */
+      case 0xab48:  /* . Latin Extended-E */
+      case 0xab49:  /* . Latin Extended-E */
+      case 0xab4a:  /* . Latin Extended-E */
+      case 0xab4b:  /* . Latin Extended-E */
+      case 0xab4c:  /* . Latin Extended-E */
+      case 0xab4d:  /* . Latin Extended-E */
+      case 0xab4e:  /* . Latin Extended-E */
+      case 0xab4f:  /* . Latin Extended-E */
+      case 0xab50:  /* . Latin Extended-E */
+      case 0xab51:  /* . Latin Extended-E */
+      case 0xab52:  /* . Latin Extended-E */
+      case 0xab54:  /* . Latin Extended-E */
+      case 0xab55:  /* . Latin Extended-E */
+      case 0xab56:  /* . Latin Extended-E */
+      case 0xab57:  /* . Latin Extended-E */
+      case 0xab58:  /* . Latin Extended-E */
+      case 0xab59:  /* . Latin Extended-E */
+      case 0xab5a:  /* . Latin Extended-E */
+      case 0xab60:  /* . Latin Extended-E */
+      case 0xab61:  /* . Latin Extended-E */
+      case 0xab62:  /* . Latin Extended-E */
+      case 0xab63:  /* . Latin Extended-E */
+      case 0xab64:  /* . Latin Extended-E */
+      case 0xab65:  /* . Latin Extended-E */
+      case 0xab66:  /* . Latin Extended-E */
+      case 0xab67:  /* . Latin Extended-E */
+      case 0xfb00:  /* . Alphabetic Presentation Forms */
+      case 0xfb01:  /* . Alphabetic Presentation Forms */
+      case 0xfb02:  /* . Alphabetic Presentation Forms */
+      case 0xfb03:  /* . Alphabetic Presentation Forms */
+      case 0xfb04:  /* . Alphabetic Presentation Forms */
+      case 0xfb05:  /* . Alphabetic Presentation Forms */
+      case 0xfb06:  /* . Alphabetic Presentation Forms */
+      case 0xfb13:  /* . Alphabetic Presentation Forms */
+      case 0xfb14:  /* . Alphabetic Presentation Forms */
+      case 0xfb15:  /* . Alphabetic Presentation Forms */
+      case 0xfb16:  /* . Alphabetic Presentation Forms */
+      case 0xfb17:  /* . Alphabetic Presentation Forms */
+      case 0x1d44e: /* 𝑎 Math */
+      case 0x1d44f: /* 𝑏 Math */
+      case 0x1d450: /* 𝑐 Math */
+      case 0x1d451: /* 𝑑 Math */
+      case 0x1d452: /* 𝑒 Math */
+      case 0x1d453: /* 𝑓 Math */
+      case 0x1d454: /* 𝑔 Math */
+      case 0x1d45e: /* 𝑞 Math */
+      case 0x1d45f: /* 𝑟 Math */
+      case 0x1d460: /* 𝑠 Math */
+      case 0x1d461: /* 𝑡 Math */
+      case 0x1d462: /* 𝑢 Math */
+      case 0x1d463: /* 𝑣 Math */
+      case 0x1d464: /* 𝑤 Math */
+      case 0x1d465: /* 𝑥 Math */
+      case 0x1d466: /* 𝑦 Math */
+      case 0x1d467: /* 𝑧 Math */
+      case 0x1d4b6: /* 𝒶 Math */
+      case 0x1d4b7: /* 𝒷 Math */
+      case 0x1d4b8: /* 𝒸 Math */
+      case 0x1d4b9: /* 𝒹 Math */
+      case 0x1d4bb: /* 𝒻 Math */
+      case 0x1d4bd: /* 𝒽 Math */
+      case 0x1d4be: /* 𝒾 Math */
+      case 0x1d4bf: /* 𝒿 Math */
+      case 0x1d4c0: /* 𝓀 Math */
+      case 0x1d4c1: /* 𝓁 Math */
+      case 0x1d4c2: /* 𝓂 Math */
+      case 0x1d4c3: /* 𝓃 Math */
+      case 0x1d4c5: /* 𝓅 Math */
+      case 0x1d4c6: /* 𝓆 Math */
+      case 0x1d4c7: /* 𝓇 Math */
+      case 0x1d51e: /* 𝔞 Math */
+      case 0x1d51f: /* 𝔟 Math */
+      case 0x1d520: /* 𝔠 Math */
+      case 0x1d521: /* 𝔡 Math */
+      case 0x1d522: /* 𝔢 Math */
+      case 0x1d523: /* 𝔣 Math */
+      case 0x1d524: /* 𝔤 Math */
+      case 0x1d525: /* 𝔥 Math */
+      case 0x1d526: /* 𝔦 Math */
+      case 0x1d52f: /* 𝔯 Math */
+      case 0x1d530: /* 𝔰 Math */
+      case 0x1d531: /* 𝔱 Math */
+      case 0x1d532: /* 𝔲 Math */
+      case 0x1d533: /* 𝔳 Math */
+      case 0x1d534: /* 𝔴 Math */
+      case 0x1d535: /* 𝔵 Math */
+      case 0x1d536: /* 𝔶 Math */
+      case 0x1d537: /* 𝔷 Math */
+      case 0x1d552: /* 𝕒 Math */
+      case 0x1d553: /* 𝕓 Math */
+      case 0x1d554: /* 𝕔 Math */
+      case 0x1d555: /* 𝕕 Math */
+      case 0x1d556: /* 𝕖 Math */
+      case 0x1d557: /* 𝕗 Math */
+      case 0x1d558: /* 𝕘 Math */
+      case 0x1d559: /* 𝕙 Math */
+      case 0x1d55a: /* 𝕚 Math */
+      case 0x1d55b: /* 𝕛 Math */
+      case 0x1d55c: /* 𝕜 Math */
+      case 0x1d55d: /* 𝕝 Math */
+      case 0x1d55e: /* 𝕞 Math */
+      case 0x1d55f: /* 𝕟 Math */
+      case 0x1d560: /* 𝕠 Math */
+      case 0x1d561: /* 𝕡 Math */
+      case 0x1d562: /* 𝕢 Math */
+      case 0x1d563: /* 𝕣 Math */
+      case 0x1d564: /* 𝕤 Math */
+      case 0x1d565: /* 𝕥 Math */
+      case 0x1d566: /* 𝕦 Math */
+      case 0x1d567: /* 𝕧 Math */
+      case 0x1d568: /* 𝕨 Math */
+      case 0x1d569: /* 𝕩 Math */
+      case 0x1d56a: /* 𝕪 Math */
+      case 0x1d56b: /* 𝕫 Math */
+      case 0x1d656: /* 𝙖 Math */
+      case 0x1d657: /* 𝙗 Math */
+      case 0x1d658: /* 𝙘 Math */
+      case 0x1d659: /* 𝙙 Math */
+      case 0x1d65a: /* 𝙚 Math */
+      case 0x1d65b: /* 𝙛 Math */
+      case 0x1d65c: /* 𝙜 Math */
+      case 0x1d65d: /* 𝙝 Math */
+      case 0x1d65e: /* 𝙞 Math */
+      case 0x1d65f: /* 𝙟 Math */
+      case 0x1d660: /* 𝙠 Math */
+      case 0x1d661: /* 𝙡 Math */
+      case 0x1d662: /* 𝙢 Math */
+      case 0x1d663: /* 𝙣 Math */
+      case 0x1d664: /* 𝙤 Math */
+      case 0x1d665: /* 𝙥 Math */
+      case 0x1d666: /* 𝙦 Math */
+      case 0x1d667: /* 𝙧 Math */
+      case 0x1d668: /* 𝙨 Math */
+      case 0x1d669: /* 𝙩 Math */
+      case 0x1d66a: /* 𝙪 Math */
+      case 0x1d66b: /* 𝙫 Math */
+      case 0x1d66c: /* 𝙬 Math */
+      case 0x1d66d: /* 𝙭 Math */
+      case 0x1d66e: /* 𝙮 Math */
+      case 0x1d66f: /* 𝙯 Math */
+      case 0x1d6da: /* 𝛚 Math */
+      case 0x1d6dc: /* 𝛜 Math */
+      case 0x1d6dd: /* 𝛝 Math */
+      case 0x1d6de: /* 𝛞 Math */
+      case 0x1d6df: /* 𝛟 Math */
+      case 0x1d6e0: /* 𝛠 Math */
+      case 0x1d6e1: /* 𝛡 Math */
+      case 0x1d70d: /* 𝜍 Math */
+      case 0x1d70e: /* 𝜎 Math */
+      case 0x1d70f: /* 𝜏 Math */
+      case 0x1d710: /* 𝜐 Math */
+      case 0x1d711: /* 𝜑 Math */
+      case 0x1d712: /* 𝜒 Math */
+      case 0x1d713: /* 𝜓 Math */
+      case 0x1d714: /* 𝜔 Math */
+      case 0x1d716: /* 𝜖 Math */
+      case 0x1d717: /* 𝜗 Math */
+      case 0x1d718: /* 𝜘 Math */
+      case 0x1d719: /* 𝜙 Math */
+      case 0x1d71a: /* 𝜚 Math */
+      case 0x1d71b: /* 𝜛 Math */
+      case 0x1d747: /* 𝝇 Math */
+      case 0x1d748: /* 𝝈 Math */
+      case 0x1d749: /* 𝝉 Math */
+      case 0x1d74a: /* 𝝊 Math */
+      case 0x1d74b: /* 𝝋 Math */
+      case 0x1d74c: /* 𝝌 Math */
+      case 0x1d74d: /* 𝝍 Math */
+      case 0x1d74e: /* 𝝎 Math */
+      case 0x1d750: /* 𝝐 Math */
+      case 0x1d751: /* 𝝑 Math */
+      case 0x1d752: /* 𝝒 Math */
+      case 0x1d753: /* 𝝓 Math */
+      case 0x1d754: /* 𝝔 Math */
+      case 0x1d755: /* 𝝕 Math */
+      case 0x1d781: /* 𝞁 Math */
+      case 0x1d782: /* 𝞂 Math */
+      case 0x1d783: /* 𝞃 Math */
+      case 0x1d784: /* 𝞄 Math */
+      case 0x1d785: /* 𝞅 Math */
+      case 0x1d786: /* 𝞆 Math */
+      case 0x1d787: /* 𝞇 Math */
+      case 0x1d788: /* 𝞈 Math */
+      case 0x1d78a: /* 𝞊 Math */
+      case 0x1d78b: /* 𝞋 Math */
+      case 0x1d78c: /* 𝞌 Math */
+      case 0x1d78d: /* 𝞍 Math */
+      case 0x1d78e: /* 𝞎 Math */
+      case 0x1d78f: /* 𝞏 Math */
+      case 0x1d7aa: /* 𝞪 Math */
+      case 0x1d7ab: /* 𝞫 Math */
+      case 0x1d7ac: /* 𝞬 Math */
+      case 0x1d7ad: /* 𝞭 Math */
+      case 0x1d7ae: /* 𝞮 Math */
+      case 0x1d7af: /* 𝞯 Math */
+      case 0x1d7b0: /* 𝞰 Math */
+      case 0x1d7b1: /* 𝞱 Math */
+      case 0x1d7b2: /* 𝞲 Math */
+      case 0x1d7b3: /* 𝞳 Math */
+      case 0x1d7b4: /* 𝞴 Math */
+      case 0x1d7b5: /* 𝞵 Math */
+      case 0x1d7b6: /* 𝞶 Math */
+      case 0x1d7b7: /* 𝞷 Math */
+      case 0x1d7b8: /* 𝞸 Math */
+      case 0x1d7b9: /* 𝞹 Math */
+      case 0x1d7ba: /* 𝞺 Math */
+      case 0x1d7bb: /* 𝞻 Math */
+      case 0x1d7bc: /* 𝞼 Math */
+      case 0x1d7bd: /* 𝞽 Math */
+      case 0x1d7be: /* 𝞾 Math */
+      case 0x1d7bf: /* 𝞿 Math */
+      case 0x1d7c0: /* 𝟀 Math */
+      case 0x1d7c1: /* 𝟁 Math */
+      case 0x1d7c2: /* 𝟂 Math */
+      case 0x1d7c4: /* 𝟄 Math */
+      case 0x1d7c5: /* 𝟅 Math */
+      case 0x1d7c6: /* 𝟆 Math */
+      case 0x1d7c7: /* 𝟇 Math */
+      case 0x1d7c8: /* 𝟈 Math */
+      case 0x1d7c9: /* 𝟉 Math */
+      case 0x1d7cb: /* 𝟋 Math */
+        return 1;
+      default:
+        return 0;
+    }
+  }
+}
+
+__weak_reference(iswlower, iswlower_l);
diff --git a/libc/str/iswlower.cc b/libc/str/iswlower.cc
deleted file mode 100644
index a0b5778d6..000000000
--- a/libc/str/iswlower.cc
+++ /dev/null
@@ -1,712 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=2 sts=2 sw=2 fenc=utf-8                             :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/dce.h"
-#include "libc/macros.h"
-#include "libc/str/has_char.h"
-#include "libc/wctype.h"
-
-static const unsigned short kLower[][2] = {
-    {0x61, 0x7a},      //
-    {0xaa, 0xaa},      //
-    {0xb5, 0xb5},      //
-    {0xba, 0xba},      //
-    {0xdf, 0xf6},      //
-    {0xf8, 0xff},      //
-    {0x101, 0x101},    //
-    {0x103, 0x103},    //
-    {0x105, 0x105},    //
-    {0x107, 0x107},    //
-    {0x109, 0x109},    //
-    {0x10b, 0x10b},    //
-    {0x10d, 0x10d},    //
-    {0x10f, 0x10f},    //
-    {0x111, 0x111},    //
-    {0x113, 0x113},    //
-    {0x115, 0x115},    //
-    {0x117, 0x117},    //
-    {0x119, 0x119},    //
-    {0x11b, 0x11b},    //
-    {0x11d, 0x11d},    //
-    {0x11f, 0x11f},    //
-    {0x121, 0x121},    //
-    {0x123, 0x123},    //
-    {0x125, 0x125},    //
-    {0x127, 0x127},    //
-    {0x129, 0x129},    //
-    {0x12b, 0x12b},    //
-    {0x12d, 0x12d},    //
-    {0x12f, 0x12f},    //
-    {0x131, 0x131},    //
-    {0x133, 0x133},    //
-    {0x135, 0x135},    //
-    {0x137, 0x138},    //
-    {0x13a, 0x13a},    //
-    {0x13c, 0x13c},    //
-    {0x13e, 0x13e},    //
-    {0x140, 0x140},    //
-    {0x142, 0x142},    //
-    {0x144, 0x144},    //
-    {0x146, 0x146},    //
-    {0x148, 0x149},    //
-    {0x14b, 0x14b},    //
-    {0x14d, 0x14d},    //
-    {0x14f, 0x14f},    //
-    {0x151, 0x151},    //
-    {0x153, 0x153},    //
-    {0x155, 0x155},    //
-    {0x157, 0x157},    //
-    {0x159, 0x159},    //
-    {0x15b, 0x15b},    //
-    {0x15d, 0x15d},    //
-    {0x15f, 0x15f},    //
-    {0x161, 0x161},    //
-    {0x163, 0x163},    //
-    {0x165, 0x165},    //
-    {0x167, 0x167},    //
-    {0x169, 0x169},    //
-    {0x16b, 0x16b},    //
-    {0x16d, 0x16d},    //
-    {0x16f, 0x16f},    //
-    {0x171, 0x171},    //
-    {0x173, 0x173},    //
-    {0x175, 0x175},    //
-    {0x177, 0x177},    //
-    {0x17a, 0x17a},    //
-    {0x17c, 0x17c},    //
-    {0x17e, 0x180},    //
-    {0x183, 0x183},    //
-    {0x185, 0x185},    //
-    {0x188, 0x188},    //
-    {0x18c, 0x18d},    //
-    {0x192, 0x192},    //
-    {0x195, 0x195},    //
-    {0x199, 0x19b},    //
-    {0x19e, 0x19e},    //
-    {0x1a1, 0x1a1},    //
-    {0x1a3, 0x1a3},    //
-    {0x1a5, 0x1a5},    //
-    {0x1a8, 0x1a8},    //
-    {0x1aa, 0x1ab},    //
-    {0x1ad, 0x1ad},    //
-    {0x1b0, 0x1b0},    //
-    {0x1b4, 0x1b4},    //
-    {0x1b6, 0x1b6},    //
-    {0x1b9, 0x1ba},    //
-    {0x1bd, 0x1bf},    //
-    {0x1c5, 0x1c6},    //
-    {0x1c8, 0x1c9},    //
-    {0x1cb, 0x1cc},    //
-    {0x1ce, 0x1ce},    //
-    {0x1d0, 0x1d0},    //
-    {0x1d2, 0x1d2},    //
-    {0x1d4, 0x1d4},    //
-    {0x1d6, 0x1d6},    //
-    {0x1d8, 0x1d8},    //
-    {0x1da, 0x1da},    //
-    {0x1dc, 0x1dd},    //
-    {0x1df, 0x1df},    //
-    {0x1e1, 0x1e1},    //
-    {0x1e3, 0x1e3},    //
-    {0x1e5, 0x1e5},    //
-    {0x1e7, 0x1e7},    //
-    {0x1e9, 0x1e9},    //
-    {0x1eb, 0x1eb},    //
-    {0x1ed, 0x1ed},    //
-    {0x1ef, 0x1f0},    //
-    {0x1f2, 0x1f3},    //
-    {0x1f5, 0x1f5},    //
-    {0x1f9, 0x1f9},    //
-    {0x1fb, 0x1fb},    //
-    {0x1fd, 0x1fd},    //
-    {0x1ff, 0x1ff},    //
-    {0x201, 0x201},    //
-    {0x203, 0x203},    //
-    {0x205, 0x205},    //
-    {0x207, 0x207},    //
-    {0x209, 0x209},    //
-    {0x20b, 0x20b},    //
-    {0x20d, 0x20d},    //
-    {0x20f, 0x20f},    //
-    {0x211, 0x211},    //
-    {0x213, 0x213},    //
-    {0x215, 0x215},    //
-    {0x217, 0x217},    //
-    {0x219, 0x219},    //
-    {0x21b, 0x21b},    //
-    {0x21d, 0x21d},    //
-    {0x21f, 0x21f},    //
-    {0x221, 0x221},    //
-    {0x223, 0x223},    //
-    {0x225, 0x225},    //
-    {0x227, 0x227},    //
-    {0x229, 0x229},    //
-    {0x22b, 0x22b},    //
-    {0x22d, 0x22d},    //
-    {0x22f, 0x22f},    //
-    {0x231, 0x231},    //
-    {0x233, 0x239},    //
-    {0x23c, 0x23c},    //
-    {0x23f, 0x240},    //
-    {0x242, 0x242},    //
-    {0x247, 0x247},    //
-    {0x249, 0x249},    //
-    {0x24b, 0x24b},    //
-    {0x24d, 0x24d},    //
-    {0x24f, 0x293},    //
-    {0x295, 0x2b8},    //
-    {0x2c0, 0x2c1},    //
-    {0x2e0, 0x2e4},    //
-    {0x345, 0x345},    //
-    {0x371, 0x371},    //
-    {0x373, 0x373},    //
-    {0x377, 0x377},    //
-    {0x37a, 0x37d},    //
-    {0x390, 0x390},    //
-    {0x3ac, 0x3ce},    //
-    {0x3d0, 0x3d1},    //
-    {0x3d5, 0x3d7},    //
-    {0x3d9, 0x3d9},    //
-    {0x3db, 0x3db},    //
-    {0x3dd, 0x3dd},    //
-    {0x3df, 0x3df},    //
-    {0x3e1, 0x3e1},    //
-    {0x3e3, 0x3e3},    //
-    {0x3e5, 0x3e5},    //
-    {0x3e7, 0x3e7},    //
-    {0x3e9, 0x3e9},    //
-    {0x3eb, 0x3eb},    //
-    {0x3ed, 0x3ed},    //
-    {0x3ef, 0x3f3},    //
-    {0x3f5, 0x3f5},    //
-    {0x3f8, 0x3f8},    //
-    {0x3fb, 0x3fc},    //
-    {0x430, 0x45f},    //
-    {0x461, 0x461},    //
-    {0x463, 0x463},    //
-    {0x465, 0x465},    //
-    {0x467, 0x467},    //
-    {0x469, 0x469},    //
-    {0x46b, 0x46b},    //
-    {0x46d, 0x46d},    //
-    {0x46f, 0x46f},    //
-    {0x471, 0x471},    //
-    {0x473, 0x473},    //
-    {0x475, 0x475},    //
-    {0x477, 0x477},    //
-    {0x479, 0x479},    //
-    {0x47b, 0x47b},    //
-    {0x47d, 0x47d},    //
-    {0x47f, 0x47f},    //
-    {0x481, 0x481},    //
-    {0x48b, 0x48b},    //
-    {0x48d, 0x48d},    //
-    {0x48f, 0x48f},    //
-    {0x491, 0x491},    //
-    {0x493, 0x493},    //
-    {0x495, 0x495},    //
-    {0x497, 0x497},    //
-    {0x499, 0x499},    //
-    {0x49b, 0x49b},    //
-    {0x49d, 0x49d},    //
-    {0x49f, 0x49f},    //
-    {0x4a1, 0x4a1},    //
-    {0x4a3, 0x4a3},    //
-    {0x4a5, 0x4a5},    //
-    {0x4a7, 0x4a7},    //
-    {0x4a9, 0x4a9},    //
-    {0x4ab, 0x4ab},    //
-    {0x4ad, 0x4ad},    //
-    {0x4af, 0x4af},    //
-    {0x4b1, 0x4b1},    //
-    {0x4b3, 0x4b3},    //
-    {0x4b5, 0x4b5},    //
-    {0x4b7, 0x4b7},    //
-    {0x4b9, 0x4b9},    //
-    {0x4bb, 0x4bb},    //
-    {0x4bd, 0x4bd},    //
-    {0x4bf, 0x4bf},    //
-    {0x4c2, 0x4c2},    //
-    {0x4c4, 0x4c4},    //
-    {0x4c6, 0x4c6},    //
-    {0x4c8, 0x4c8},    //
-    {0x4ca, 0x4ca},    //
-    {0x4cc, 0x4cc},    //
-    {0x4ce, 0x4cf},    //
-    {0x4d1, 0x4d1},    //
-    {0x4d3, 0x4d3},    //
-    {0x4d5, 0x4d5},    //
-    {0x4d7, 0x4d7},    //
-    {0x4d9, 0x4d9},    //
-    {0x4db, 0x4db},    //
-    {0x4dd, 0x4dd},    //
-    {0x4df, 0x4df},    //
-    {0x4e1, 0x4e1},    //
-    {0x4e3, 0x4e3},    //
-    {0x4e5, 0x4e5},    //
-    {0x4e7, 0x4e7},    //
-    {0x4e9, 0x4e9},    //
-    {0x4eb, 0x4eb},    //
-    {0x4ed, 0x4ed},    //
-    {0x4ef, 0x4ef},    //
-    {0x4f1, 0x4f1},    //
-    {0x4f3, 0x4f3},    //
-    {0x4f5, 0x4f5},    //
-    {0x4f7, 0x4f7},    //
-    {0x4f9, 0x4f9},    //
-    {0x4fb, 0x4fb},    //
-    {0x4fd, 0x4fd},    //
-    {0x4ff, 0x4ff},    //
-    {0x501, 0x501},    //
-    {0x503, 0x503},    //
-    {0x505, 0x505},    //
-    {0x507, 0x507},    //
-    {0x509, 0x509},    //
-    {0x50b, 0x50b},    //
-    {0x50d, 0x50d},    //
-    {0x50f, 0x50f},    //
-    {0x511, 0x511},    //
-    {0x513, 0x513},    //
-    {0x515, 0x515},    //
-    {0x517, 0x517},    //
-    {0x519, 0x519},    //
-    {0x51b, 0x51b},    //
-    {0x51d, 0x51d},    //
-    {0x51f, 0x51f},    //
-    {0x521, 0x521},    //
-    {0x523, 0x523},    //
-    {0x525, 0x525},    //
-    {0x527, 0x527},    //
-    {0x529, 0x529},    //
-    {0x52b, 0x52b},    //
-    {0x52d, 0x52d},    //
-    {0x52f, 0x52f},    //
-    {0x560, 0x588},    //
-    {0x10d0, 0x10fa},  //
-    {0x10fc, 0x10ff},  //
-    {0x13f8, 0x13fd},  //
-    {0x1c80, 0x1c88},  //
-    {0x1d00, 0x1dbf},  //
-    {0x1e01, 0x1e01},  //
-    {0x1e03, 0x1e03},  //
-    {0x1e05, 0x1e05},  //
-    {0x1e07, 0x1e07},  //
-    {0x1e09, 0x1e09},  //
-    {0x1e0b, 0x1e0b},  //
-    {0x1e0d, 0x1e0d},  //
-    {0x1e0f, 0x1e0f},  //
-    {0x1e11, 0x1e11},  //
-    {0x1e13, 0x1e13},  //
-    {0x1e15, 0x1e15},  //
-    {0x1e17, 0x1e17},  //
-    {0x1e19, 0x1e19},  //
-    {0x1e1b, 0x1e1b},  //
-    {0x1e1d, 0x1e1d},  //
-    {0x1e1f, 0x1e1f},  //
-    {0x1e21, 0x1e21},  //
-    {0x1e23, 0x1e23},  //
-    {0x1e25, 0x1e25},  //
-    {0x1e27, 0x1e27},  //
-    {0x1e29, 0x1e29},  //
-    {0x1e2b, 0x1e2b},  //
-    {0x1e2d, 0x1e2d},  //
-    {0x1e2f, 0x1e2f},  //
-    {0x1e31, 0x1e31},  //
-    {0x1e33, 0x1e33},  //
-    {0x1e35, 0x1e35},  //
-    {0x1e37, 0x1e37},  //
-    {0x1e39, 0x1e39},  //
-    {0x1e3b, 0x1e3b},  //
-    {0x1e3d, 0x1e3d},  //
-    {0x1e3f, 0x1e3f},  //
-    {0x1e41, 0x1e41},  //
-    {0x1e43, 0x1e43},  //
-    {0x1e45, 0x1e45},  //
-    {0x1e47, 0x1e47},  //
-    {0x1e49, 0x1e49},  //
-    {0x1e4b, 0x1e4b},  //
-    {0x1e4d, 0x1e4d},  //
-    {0x1e4f, 0x1e4f},  //
-    {0x1e51, 0x1e51},  //
-    {0x1e53, 0x1e53},  //
-    {0x1e55, 0x1e55},  //
-    {0x1e57, 0x1e57},  //
-    {0x1e59, 0x1e59},  //
-    {0x1e5b, 0x1e5b},  //
-    {0x1e5d, 0x1e5d},  //
-    {0x1e5f, 0x1e5f},  //
-    {0x1e61, 0x1e61},  //
-    {0x1e63, 0x1e63},  //
-    {0x1e65, 0x1e65},  //
-    {0x1e67, 0x1e67},  //
-    {0x1e69, 0x1e69},  //
-    {0x1e6b, 0x1e6b},  //
-    {0x1e6d, 0x1e6d},  //
-    {0x1e6f, 0x1e6f},  //
-    {0x1e71, 0x1e71},  //
-    {0x1e73, 0x1e73},  //
-    {0x1e75, 0x1e75},  //
-    {0x1e77, 0x1e77},  //
-    {0x1e79, 0x1e79},  //
-    {0x1e7b, 0x1e7b},  //
-    {0x1e7d, 0x1e7d},  //
-    {0x1e7f, 0x1e7f},  //
-    {0x1e81, 0x1e81},  //
-    {0x1e83, 0x1e83},  //
-    {0x1e85, 0x1e85},  //
-    {0x1e87, 0x1e87},  //
-    {0x1e89, 0x1e89},  //
-    {0x1e8b, 0x1e8b},  //
-    {0x1e8d, 0x1e8d},  //
-    {0x1e8f, 0x1e8f},  //
-    {0x1e91, 0x1e91},  //
-    {0x1e93, 0x1e93},  //
-    {0x1e95, 0x1e9d},  //
-    {0x1e9f, 0x1e9f},  //
-    {0x1ea1, 0x1ea1},  //
-    {0x1ea3, 0x1ea3},  //
-    {0x1ea5, 0x1ea5},  //
-    {0x1ea7, 0x1ea7},  //
-    {0x1ea9, 0x1ea9},  //
-    {0x1eab, 0x1eab},  //
-    {0x1ead, 0x1ead},  //
-    {0x1eaf, 0x1eaf},  //
-    {0x1eb1, 0x1eb1},  //
-    {0x1eb3, 0x1eb3},  //
-    {0x1eb5, 0x1eb5},  //
-    {0x1eb7, 0x1eb7},  //
-    {0x1eb9, 0x1eb9},  //
-    {0x1ebb, 0x1ebb},  //
-    {0x1ebd, 0x1ebd},  //
-    {0x1ebf, 0x1ebf},  //
-    {0x1ec1, 0x1ec1},  //
-    {0x1ec3, 0x1ec3},  //
-    {0x1ec5, 0x1ec5},  //
-    {0x1ec7, 0x1ec7},  //
-    {0x1ec9, 0x1ec9},  //
-    {0x1ecb, 0x1ecb},  //
-    {0x1ecd, 0x1ecd},  //
-    {0x1ecf, 0x1ecf},  //
-    {0x1ed1, 0x1ed1},  //
-    {0x1ed3, 0x1ed3},  //
-    {0x1ed5, 0x1ed5},  //
-    {0x1ed7, 0x1ed7},  //
-    {0x1ed9, 0x1ed9},  //
-    {0x1edb, 0x1edb},  //
-    {0x1edd, 0x1edd},  //
-    {0x1edf, 0x1edf},  //
-    {0x1ee1, 0x1ee1},  //
-    {0x1ee3, 0x1ee3},  //
-    {0x1ee5, 0x1ee5},  //
-    {0x1ee7, 0x1ee7},  //
-    {0x1ee9, 0x1ee9},  //
-    {0x1eeb, 0x1eeb},  //
-    {0x1eed, 0x1eed},  //
-    {0x1eef, 0x1eef},  //
-    {0x1ef1, 0x1ef1},  //
-    {0x1ef3, 0x1ef3},  //
-    {0x1ef5, 0x1ef5},  //
-    {0x1ef7, 0x1ef7},  //
-    {0x1ef9, 0x1ef9},  //
-    {0x1efb, 0x1efb},  //
-    {0x1efd, 0x1efd},  //
-    {0x1eff, 0x1f07},  //
-    {0x1f10, 0x1f15},  //
-    {0x1f20, 0x1f27},  //
-    {0x1f30, 0x1f37},  //
-    {0x1f40, 0x1f45},  //
-    {0x1f50, 0x1f57},  //
-    {0x1f60, 0x1f67},  //
-    {0x1f70, 0x1f7d},  //
-    {0x1f80, 0x1f87},  //
-    {0x1f90, 0x1f97},  //
-    {0x1fa0, 0x1fa7},  //
-    {0x1fb0, 0x1fb4},  //
-    {0x1fb6, 0x1fb7},  //
-    {0x1fbe, 0x1fbe},  //
-    {0x1fc2, 0x1fc4},  //
-    {0x1fc6, 0x1fc7},  //
-    {0x1fd0, 0x1fd3},  //
-    {0x1fd6, 0x1fd7},  //
-    {0x1fe0, 0x1fe7},  //
-    {0x1ff2, 0x1ff4},  //
-    {0x1ff6, 0x1ff7},  //
-    {0x2071, 0x2071},  //
-    {0x207f, 0x207f},  //
-    {0x2090, 0x209c},  //
-    {0x210a, 0x210a},  //
-    {0x210e, 0x210f},  //
-    {0x2113, 0x2113},  //
-    {0x212f, 0x212f},  //
-    {0x2134, 0x2134},  //
-    {0x2139, 0x2139},  //
-    {0x213c, 0x213d},  //
-    {0x2146, 0x2149},  //
-    {0x214e, 0x214e},  //
-    {0x2170, 0x217f},  //
-    {0x2184, 0x2184},  //
-    {0x24d0, 0x24e9},  //
-    {0x2c30, 0x2c5f},  //
-    {0x2c61, 0x2c61},  //
-    {0x2c65, 0x2c66},  //
-    {0x2c68, 0x2c68},  //
-    {0x2c6a, 0x2c6a},  //
-    {0x2c6c, 0x2c6c},  //
-    {0x2c71, 0x2c71},  //
-    {0x2c73, 0x2c74},  //
-    {0x2c76, 0x2c7d},  //
-    {0x2c81, 0x2c81},  //
-    {0x2c83, 0x2c83},  //
-    {0x2c85, 0x2c85},  //
-    {0x2c87, 0x2c87},  //
-    {0x2c89, 0x2c89},  //
-    {0x2c8b, 0x2c8b},  //
-    {0x2c8d, 0x2c8d},  //
-    {0x2c8f, 0x2c8f},  //
-    {0x2c91, 0x2c91},  //
-    {0x2c93, 0x2c93},  //
-    {0x2c95, 0x2c95},  //
-    {0x2c97, 0x2c97},  //
-    {0x2c99, 0x2c99},  //
-    {0x2c9b, 0x2c9b},  //
-    {0x2c9d, 0x2c9d},  //
-    {0x2c9f, 0x2c9f},  //
-    {0x2ca1, 0x2ca1},  //
-    {0x2ca3, 0x2ca3},  //
-    {0x2ca5, 0x2ca5},  //
-    {0x2ca7, 0x2ca7},  //
-    {0x2ca9, 0x2ca9},  //
-    {0x2cab, 0x2cab},  //
-    {0x2cad, 0x2cad},  //
-    {0x2caf, 0x2caf},  //
-    {0x2cb1, 0x2cb1},  //
-    {0x2cb3, 0x2cb3},  //
-    {0x2cb5, 0x2cb5},  //
-    {0x2cb7, 0x2cb7},  //
-    {0x2cb9, 0x2cb9},  //
-    {0x2cbb, 0x2cbb},  //
-    {0x2cbd, 0x2cbd},  //
-    {0x2cbf, 0x2cbf},  //
-    {0x2cc1, 0x2cc1},  //
-    {0x2cc3, 0x2cc3},  //
-    {0x2cc5, 0x2cc5},  //
-    {0x2cc7, 0x2cc7},  //
-    {0x2cc9, 0x2cc9},  //
-    {0x2ccb, 0x2ccb},  //
-    {0x2ccd, 0x2ccd},  //
-    {0x2ccf, 0x2ccf},  //
-    {0x2cd1, 0x2cd1},  //
-    {0x2cd3, 0x2cd3},  //
-    {0x2cd5, 0x2cd5},  //
-    {0x2cd7, 0x2cd7},  //
-    {0x2cd9, 0x2cd9},  //
-    {0x2cdb, 0x2cdb},  //
-    {0x2cdd, 0x2cdd},  //
-    {0x2cdf, 0x2cdf},  //
-    {0x2ce1, 0x2ce1},  //
-    {0x2ce3, 0x2ce4},  //
-    {0x2cec, 0x2cec},  //
-    {0x2cee, 0x2cee},  //
-    {0x2cf3, 0x2cf3},  //
-    {0x2d00, 0x2d25},  //
-    {0x2d27, 0x2d27},  //
-    {0x2d2d, 0x2d2d},  //
-    {0xa641, 0xa641},  //
-    {0xa643, 0xa643},  //
-    {0xa645, 0xa645},  //
-    {0xa647, 0xa647},  //
-    {0xa649, 0xa649},  //
-    {0xa64b, 0xa64b},  //
-    {0xa64d, 0xa64d},  //
-    {0xa64f, 0xa64f},  //
-    {0xa651, 0xa651},  //
-    {0xa653, 0xa653},  //
-    {0xa655, 0xa655},  //
-    {0xa657, 0xa657},  //
-    {0xa659, 0xa659},  //
-    {0xa65b, 0xa65b},  //
-    {0xa65d, 0xa65d},  //
-    {0xa65f, 0xa65f},  //
-    {0xa661, 0xa661},  //
-    {0xa663, 0xa663},  //
-    {0xa665, 0xa665},  //
-    {0xa667, 0xa667},  //
-    {0xa669, 0xa669},  //
-    {0xa66b, 0xa66b},  //
-    {0xa66d, 0xa66d},  //
-    {0xa681, 0xa681},  //
-    {0xa683, 0xa683},  //
-    {0xa685, 0xa685},  //
-    {0xa687, 0xa687},  //
-    {0xa689, 0xa689},  //
-    {0xa68b, 0xa68b},  //
-    {0xa68d, 0xa68d},  //
-    {0xa68f, 0xa68f},  //
-    {0xa691, 0xa691},  //
-    {0xa693, 0xa693},  //
-    {0xa695, 0xa695},  //
-    {0xa697, 0xa697},  //
-    {0xa699, 0xa699},  //
-    {0xa69b, 0xa69d},  //
-    {0xa723, 0xa723},  //
-    {0xa725, 0xa725},  //
-    {0xa727, 0xa727},  //
-    {0xa729, 0xa729},  //
-    {0xa72b, 0xa72b},  //
-    {0xa72d, 0xa72d},  //
-    {0xa72f, 0xa731},  //
-    {0xa733, 0xa733},  //
-    {0xa735, 0xa735},  //
-    {0xa737, 0xa737},  //
-    {0xa739, 0xa739},  //
-    {0xa73b, 0xa73b},  //
-    {0xa73d, 0xa73d},  //
-    {0xa73f, 0xa73f},  //
-    {0xa741, 0xa741},  //
-    {0xa743, 0xa743},  //
-    {0xa745, 0xa745},  //
-    {0xa747, 0xa747},  //
-    {0xa749, 0xa749},  //
-    {0xa74b, 0xa74b},  //
-    {0xa74d, 0xa74d},  //
-    {0xa74f, 0xa74f},  //
-    {0xa751, 0xa751},  //
-    {0xa753, 0xa753},  //
-    {0xa755, 0xa755},  //
-    {0xa757, 0xa757},  //
-    {0xa759, 0xa759},  //
-    {0xa75b, 0xa75b},  //
-    {0xa75d, 0xa75d},  //
-    {0xa75f, 0xa75f},  //
-    {0xa761, 0xa761},  //
-    {0xa763, 0xa763},  //
-    {0xa765, 0xa765},  //
-    {0xa767, 0xa767},  //
-    {0xa769, 0xa769},  //
-    {0xa76b, 0xa76b},  //
-    {0xa76d, 0xa76d},  //
-    {0xa76f, 0xa778},  //
-    {0xa77a, 0xa77a},  //
-    {0xa77c, 0xa77c},  //
-    {0xa77f, 0xa77f},  //
-    {0xa781, 0xa781},  //
-    {0xa783, 0xa783},  //
-    {0xa785, 0xa785},  //
-    {0xa787, 0xa787},  //
-    {0xa78c, 0xa78c},  //
-    {0xa78e, 0xa78e},  //
-    {0xa791, 0xa791},  //
-    {0xa793, 0xa795},  //
-    {0xa797, 0xa797},  //
-    {0xa799, 0xa799},  //
-    {0xa79b, 0xa79b},  //
-    {0xa79d, 0xa79d},  //
-    {0xa79f, 0xa79f},  //
-    {0xa7a1, 0xa7a1},  //
-    {0xa7a3, 0xa7a3},  //
-    {0xa7a5, 0xa7a5},  //
-    {0xa7a7, 0xa7a7},  //
-    {0xa7a9, 0xa7a9},  //
-    {0xa7af, 0xa7af},  //
-    {0xa7b5, 0xa7b5},  //
-    {0xa7b7, 0xa7b7},  //
-    {0xa7b9, 0xa7b9},  //
-    {0xa7bb, 0xa7bb},  //
-    {0xa7bd, 0xa7bd},  //
-    {0xa7bf, 0xa7bf},  //
-    {0xa7c1, 0xa7c1},  //
-    {0xa7c3, 0xa7c3},  //
-    {0xa7c8, 0xa7c8},  //
-    {0xa7ca, 0xa7ca},  //
-    {0xa7d1, 0xa7d1},  //
-    {0xa7d3, 0xa7d3},  //
-    {0xa7d5, 0xa7d5},  //
-    {0xa7d7, 0xa7d7},  //
-    {0xa7d9, 0xa7d9},  //
-    {0xa7f2, 0xa7f4},  //
-    {0xa7f6, 0xa7f6},  //
-    {0xa7f8, 0xa7fa},  //
-    {0xab30, 0xab5a},  //
-    {0xab5c, 0xab69},  //
-    {0xab70, 0xabbf},  //
-    {0xfb00, 0xfb06},  //
-    {0xfb13, 0xfb17},  //
-    {0xff41, 0xff5a},  //
-};
-
-static const unsigned kLowerAstral[][2] = {
-    {0x10428, 0x1044f},  //
-    {0x104d8, 0x104fb},  //
-    {0x10597, 0x105a1},  //
-    {0x105a3, 0x105b1},  //
-    {0x105b3, 0x105b9},  //
-    {0x105bb, 0x105bc},  //
-    {0x10780, 0x10780},  //
-    {0x10783, 0x10785},  //
-    {0x10787, 0x107b0},  //
-    {0x107b2, 0x107ba},  //
-    {0x10cc0, 0x10cf2},  //
-    {0x118c0, 0x118df},  //
-    {0x16e60, 0x16e7f},  //
-    {0x1d41a, 0x1d433},  //
-    {0x1d44e, 0x1d454},  //
-    {0x1d456, 0x1d467},  //
-    {0x1d482, 0x1d49b},  //
-    {0x1d4b6, 0x1d4b9},  //
-    {0x1d4bb, 0x1d4bb},  //
-    {0x1d4bd, 0x1d4c3},  //
-    {0x1d4c5, 0x1d4cf},  //
-    {0x1d4ea, 0x1d503},  //
-    {0x1d51e, 0x1d537},  //
-    {0x1d552, 0x1d56b},  //
-    {0x1d586, 0x1d59f},  //
-    {0x1d5ba, 0x1d5d3},  //
-    {0x1d5ee, 0x1d607},  //
-    {0x1d622, 0x1d63b},  //
-    {0x1d656, 0x1d66f},  //
-    {0x1d68a, 0x1d6a5},  //
-    {0x1d6c2, 0x1d6da},  //
-    {0x1d6dc, 0x1d6e1},  //
-    {0x1d6fc, 0x1d714},  //
-    {0x1d716, 0x1d71b},  //
-    {0x1d736, 0x1d74e},  //
-    {0x1d750, 0x1d755},  //
-    {0x1d770, 0x1d788},  //
-    {0x1d78a, 0x1d78f},  //
-    {0x1d7aa, 0x1d7c2},  //
-    {0x1d7c4, 0x1d7c9},  //
-    {0x1d7cb, 0x1d7cb},  //
-    {0x1df00, 0x1df09},  //
-    {0x1df0b, 0x1df1e},  //
-    {0x1df25, 0x1df2a},  //
-    {0x1e030, 0x1e06d},  //
-    {0x1e922, 0x1e943},  //
-};
-
-/**
- * Returns nonzero if c is lowercase letter.
- */
-int iswlower(wint_t c) {
-  if (!IsTiny() && c < 128)
-    return 'a' <= c && c <= 'z';
-  if (c < 65536)
-    return has_char(kLower, ARRAYLEN(kLower), (unsigned short)c);
-  return has_char(kLowerAstral, ARRAYLEN(kLowerAstral), (unsigned)c);
-}
-
-__weak_reference(iswlower, iswlower_l);
diff --git a/libc/str/iswprint.c b/libc/str/iswprint.c
index 9a4875c3e..030e45d46 100644
--- a/libc/str/iswprint.c
+++ b/libc/str/iswprint.c
@@ -22,11 +22,8 @@
  * Returns nonzero if c is printable.
  */
 int iswprint(wint_t c) {
-  return (0 <= c && c <= 0x10FFFD) &&      // legal unicode
-         !(0x0000 <= c && c <= 0x001F) &&  // c0 control codes
-         !(0x007F <= c && c <= 0x009F) &&  // c1 control codes
-         !(0x2028 <= c && c <= 0x2029) &&  // line / paragraph separator
-         !(0xFFF9 <= c && c <= 0xFFFB);    // interlinear annotation controls
+  return !((0x00 <= c && c <= 0x1F) || (0x7F <= c && c <= 0x9F) ||
+           (0xFFF9 <= c && c <= 0xFFFB) || c == 0x2028 || c == 0x2029);
 }
 
 __weak_reference(iswprint, iswprint_l);
diff --git a/libc/str/iswpunct.c b/libc/str/iswpunct.c
new file mode 100644
index 000000000..d66779b76
--- /dev/null
+++ b/libc/str/iswpunct.c
@@ -0,0 +1,543 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/wctype.h"
+
+/**
+ * Returns nonzero if c is punctuation mark.
+ */
+int iswpunct(wint_t c) {
+  if (c < 0xa0) {
+    switch (c) {
+      case '!':
+      case '"':
+      case '#':
+      case '$':
+      case '%':
+      case '&':
+      case '\'':
+      case '(':
+      case ')':
+      case '*':
+      case '+':
+      case ',':
+      case '-':
+      case '.':
+      case '/':
+      case ':':
+      case ';':
+      case '<':
+      case '=':
+      case '>':
+      case '?':
+      case '@':
+      case '[':
+      case '\\':
+      case ']':
+      case '^':
+      case '_':
+      case '`':
+      case '{':
+      case '|':
+      case '}':
+      case '~':
+        return 1;
+      default:
+        return 0;
+    }
+  }
+  switch (c) {
+    case u'¡':    // INVERTED EXCLAMATION MARK (0x00a1 Po)
+    case u'§':    // SECTION SIGN (0x00a7 Po)
+    case u'«':    // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK (0x00ab Pi)
+    case u'¶':    // PILCROW SIGN (0x00b6 Po)
+    case u'·':    // MIDDLE DOT (0x00b7 Po)
+    case u'»':    // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (0x00bb Pf)
+    case u'¿':    // INVERTED QUESTION MARK (0x00bf Po)
+    case u';':    // GREEK QUESTION MARK (0x037e Po)
+    case u'·':    // GREEK ANO TELEIA (0x0387 Po)
+    case u'՚':    // ARMENIAN APOSTROPHE (0x055a Po)
+    case u'՛':    // ARMENIAN EMPHASIS MARK (0x055b Po)
+    case u'՜':    // ARMENIAN EXCLAMATION MARK (0x055c Po)
+    case u'՝':    // ARMENIAN COMMA (0x055d Po)
+    case u'՞':    // ARMENIAN QUESTION MARK (0x055e Po)
+    case u'՟':    // ARMENIAN ABBREVIATION MARK (0x055f Po)
+    case u'։':    // ARMENIAN FULL STOP (0x0589 Po)
+    case u'֊':    // ARMENIAN HYPHEN (0x058a Pd)
+    case 0x05be:  // HEBREW PUNCTUATION MAQAF (0x05be Pd)
+    case 0x05c0:  // HEBREW PUNCTUATION PASEQ (0x05c0 Po)
+    case 0x05c3:  // HEBREW PUNCTUATION SOF PASUQ (0x05c3 Po)
+    case 0x05c6:  // HEBREW PUNCTUATION NUN HAFUKHA (0x05c6 Po)
+    case 0x05f3:  // HEBREW PUNCTUATION GERESH (0x05f3 Po)
+    case 0x05f4:  // HEBREW PUNCTUATION GERSHAYIM (0x05f4 Po)
+    case 0x0609:  // ARABIC-INDIC PER MILLE SIGN (0x0609 Po)
+    case 0x060a:  // ARABIC-INDIC PER TEN THOUSAND SIGN (0x060a Po)
+    case 0x060c:  // ARABIC COMMA (0x060c Po)
+    case 0x060d:  // ARABIC DATE SEPARATOR (0x060d Po)
+    case 0x061b:  // ARABIC SEMICOLON (0x061b Po)
+    case u'؞':    // ARABIC TRIPLE DOT PUNCTUATION MARK (0x061e Po)
+    case u'؟':    // ARABIC QUESTION MARK (0x061f Po)
+    case u'٪':    // ARABIC PERCENT SIGN (0x066a Po)
+    case u'٫':    // ARABIC DECIMAL SEPARATOR (0x066b Po)
+    case u'٬':    // ARABIC THOUSANDS SEPARATOR (0x066c Po)
+    case u'٭':    // ARABIC FIVE POINTED STAR (0x066d Po)
+    case u'۔':    // ARABIC FULL STOP (0x06d4 Po)
+    case u'߷':    // NKO SYMBOL GBAKURUNEN (0x07f7 Po)
+    case u'߸':    // NKO COMMA (0x07f8 Po)
+    case u'߹':    // NKO EXCLAMATION MARK (0x07f9 Po)
+    case u'।':    // DEVANAGARI DANDA (0x0964 Po)
+    case u'॥':    // DEVANAGARI DOUBLE DANDA (0x0965 Po)
+    case u'॰':    // DEVANAGARI ABBREVIATION SIGN (0x0970 Po)
+    case 0x09fd:  // BENGALI ABBREVIATION SIGN (0x09fd Po)
+    case 0x0a76:  // GURMUKHI ABBREVIATION SIGN (0x0a76 Po)
+    case 0x0af0:  // GUJARATI ABBREVIATION SIGN (0x0af0 Po)
+    case 0x0c77:  // TELUGU SIGN SIDDHAM (0x0c77 Po)
+    case 0x0c84:  // KANNADA SIGN SIDDHAM (0x0c84 Po)
+    case u'෴':    // SINHALA PUNCTUATION KUNDDALIYA (0x0df4 Po)
+    case u'๏':    // THAI CHARACTER FONGMAN (0x0e4f Po)
+    case u'๚':    // THAI CHARACTER ANGKHANKHU (0x0e5a Po)
+    case u'๛':    // THAI CHARACTER KHOMUT (0x0e5b Po)
+    case u'༄':    // TIBETAN MARK INITIAL YIG MGO MDUN MA (0x0f04 Po)
+    case u'༅':    // TIBETAN MARK CLOSING YIG MGO SGAB MA (0x0f05 Po)
+    case u'༆':    // TIBETAN MARK CARET YIG MGO PHUR SHAD MA (0x0f06 Po)
+    case u'༇':    // TIBETAN MARK YIG MGO TSHEG SHAD MA (0x0f07 Po)
+    case u'༈':    // TIBETAN MARK SBRUL SHAD (0x0f08 Po)
+    case u'༉':    // TIBETAN MARK BSKUR YIG MGO (0x0f09 Po)
+    case u'༊':    // TIBETAN MARK BKA- SHOG YIG MGO (0x0f0a Po)
+    case u'་':    // TIBETAN MARK INTERSYLLABIC TSHEG (0x0f0b Po)
+    case u'༌':    // TIBETAN MARK DELIMITER TSHEG BSTAR (0x0f0c Po)
+    case u'།':    // TIBETAN MARK SHAD (0x0f0d Po)
+    case u'༎':    // TIBETAN MARK NYIS SHAD (0x0f0e Po)
+    case u'༏':    // TIBETAN MARK TSHEG SHAD (0x0f0f Po)
+    case u'༐':    // TIBETAN MARK NYIS TSHEG SHAD (0x0f10 Po)
+    case u'༑':    // TIBETAN MARK RIN CHEN SPUNGS SHAD (0x0f11 Po)
+    case u'༒':    // TIBETAN MARK RGYA GRAM SHAD (0x0f12 Po)
+    case u'༔':    // TIBETAN MARK GTER TSHEG (0x0f14 Po)
+    case u'༺':    // TIBETAN MARK GUG RTAGS GYON (0x0f3a Ps)
+    case u'༻':    // TIBETAN MARK GUG RTAGS GYAS (0x0f3b Pe)
+    case u'༼':    // TIBETAN MARK ANG KHANG GYON (0x0f3c Ps)
+    case u'༽':    // TIBETAN MARK ANG KHANG GYAS (0x0f3d Pe)
+    case u'྅':    // TIBETAN MARK PALUTA (0x0f85 Po)
+    case u'࿐':    // TIBETAN MARK BSKA- SHOG GI MGO RGYAN (0x0fd0 Po)
+    case u'࿑':    // TIBETAN MARK MNYAM YIG GI MGO RGYAN (0x0fd1 Po)
+    case u'࿒':    // TIBETAN MARK NYIS TSHEG (0x0fd2 Po)
+    case u'࿓':   // TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA (0x0fd3 Po)
+    case u'࿔':   // TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA (0x0fd4 Po)
+    case u'࿙':   // TIBETAN MARK LEADING MCHAN RTAGS (0x0fd9 Po)
+    case u'࿚':   // TIBETAN MARK TRAILING MCHAN RTAGS (0x0fda Po)
+    case u'၊':   // MYANMAR SIGN LITTLE SECTION (0x104a Po)
+    case u'။':   // MYANMAR SIGN SECTION (0x104b Po)
+    case u'၌':   // MYANMAR SYMBOL LOCATIVE (0x104c Po)
+    case u'၍':   // MYANMAR SYMBOL COMPLETED (0x104d Po)
+    case u'၎':   // MYANMAR SYMBOL AFOREMENTIONED (0x104e Po)
+    case u'၏':   // MYANMAR SYMBOL GENITIVE (0x104f Po)
+    case u'჻':   // GEORGIAN PARAGRAPH SEPARATOR (0x10fb Po)
+    case u'፠':   // ETHIOPIC SECTION MARK (0x1360 Po)
+    case u'፡':   // ETHIOPIC WORDSPACE (0x1361 Po)
+    case u'።':   // ETHIOPIC FULL STOP (0x1362 Po)
+    case u'፣':   // ETHIOPIC COMMA (0x1363 Po)
+    case u'፤':   // ETHIOPIC SEMICOLON (0x1364 Po)
+    case u'፥':   // ETHIOPIC COLON (0x1365 Po)
+    case u'፦':   // ETHIOPIC PREFACE COLON (0x1366 Po)
+    case u'፧':   // ETHIOPIC QUESTION MARK (0x1367 Po)
+    case u'፨':   // ETHIOPIC PARAGRAPH SEPARATOR (0x1368 Po)
+    case u'᐀':   // CANADIAN SYLLABICS HYPHEN (0x1400 Pd)
+    case u'᙮':   // CANADIAN SYLLABICS FULL STOP (0x166e Po)
+    case u'᚛':   // OGHAM FEATHER MARK (0x169b Ps)
+    case u'᚜':   // OGHAM REVERSED FEATHER MARK (0x169c Pe)
+    case u'᛫':   // RUNIC SINGLE PUNCTUATION (0x16eb Po)
+    case u'᛬':   // RUNIC MULTIPLE PUNCTUATION (0x16ec Po)
+    case u'᛭':   // RUNIC CROSS PUNCTUATION (0x16ed Po)
+    case u'᜵':   // PHILIPPINE SINGLE PUNCTUATION (0x1735 Po)
+    case u'᜶':   // PHILIPPINE DOUBLE PUNCTUATION (0x1736 Po)
+    case u'។':   // KHMER SIGN KHAN (0x17d4 Po)
+    case u'៕':   // KHMER SIGN BARIYOOSAN (0x17d5 Po)
+    case u'៖':   // KHMER SIGN CAMNUC PII KUUH (0x17d6 Po)
+    case u'៘':   // KHMER SIGN BEYYAL (0x17d8 Po)
+    case u'៙':   // KHMER SIGN PHNAEK MUAN (0x17d9 Po)
+    case u'៚':   // KHMER SIGN KOOMUUT (0x17da Po)
+    case u'᠀':   // MONGOLIAN BIRGA (0x1800 Po)
+    case u'᠁':   // MONGOLIAN ELLIPSIS (0x1801 Po)
+    case u'᠂':   // MONGOLIAN COMMA (0x1802 Po)
+    case u'᠃':   // MONGOLIAN FULL STOP (0x1803 Po)
+    case u'᠄':   // MONGOLIAN COLON (0x1804 Po)
+    case u'᠅':   // MONGOLIAN FOUR DOTS (0x1805 Po)
+    case u'᠆':   // MONGOLIAN TODO SOFT HYPHEN (0x1806 Pd)
+    case u'᠇':   // MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER (0x1807 Po)
+    case u'᠈':   // MONGOLIAN MANCHU COMMA (0x1808 Po)
+    case u'᠉':   // MONGOLIAN MANCHU FULL STOP (0x1809 Po)
+    case u'᠊':   // MONGOLIAN NIRUGU (0x180a Po)
+    case u'᥄':   // LIMBU EXCLAMATION MARK (0x1944 Po)
+    case u'᥅':   // LIMBU QUESTION MARK (0x1945 Po)
+    case u'᨞':   // BUGINESE PALLAWA (0x1a1e Po)
+    case u'᨟':   // BUGINESE END OF SECTION (0x1a1f Po)
+    case u'᱾':   // OL CHIKI PUNCTUATION MUCAAD (0x1c7e Po)
+    case u'᱿':   // OL CHIKI PUNCTUATION DOUBLE MUCAAD (0x1c7f Po)
+    case u'‐':   // HYPHEN (0x2010 Pd)
+    case u'‑':   // NON-BREAKING HYPHEN (0x2011 Pd)
+    case u'‒':   // FIGURE DASH (0x2012 Pd)
+    case u'–':   // EN DASH (0x2013 Pd)
+    case u'—':   // EM DASH (0x2014 Pd)
+    case u'―':   // HORIZONTAL BAR (0x2015 Pd)
+    case u'‖':   // DOUBLE VERTICAL LINE (0x2016 Po)
+    case u'‗':   // DOUBLE LOW LINE (0x2017 Po)
+    case u'‘':   // LEFT SINGLE QUOTATION MARK (0x2018 Pi)
+    case u'’':   // RIGHT SINGLE QUOTATION MARK (0x2019 Pf)
+    case u'‚':   // SINGLE LOW-9 QUOTATION MARK (0x201a Ps)
+    case u'‛':   // SINGLE HIGH-REVERSED-9 QUOTATION MARK (0x201b Pi)
+    case u'“':   // LEFT DOUBLE QUOTATION MARK (0x201c Pi)
+    case u'”':   // RIGHT DOUBLE QUOTATION MARK (0x201d Pf)
+    case u'„':   // DOUBLE LOW-9 QUOTATION MARK (0x201e Ps)
+    case u'‟':   // DOUBLE HIGH-REVERSED-9 QUOTATION MARK (0x201f Pi)
+    case u'†':   // DAGGER (0x2020 Po)
+    case u'‡':   // DOUBLE DAGGER (0x2021 Po)
+    case u'•':   // BULLET (0x2022 Po)
+    case u'‣':   // TRIANGULAR BULLET (0x2023 Po)
+    case u'․':   // ONE DOT LEADER (0x2024 Po)
+    case u'‥':   // TWO DOT LEADER (0x2025 Po)
+    case u'…':   // HORIZONTAL ELLIPSIS (0x2026 Po)
+    case u'‧':   // HYPHENATION POINT (0x2027 Po)
+    case u'‰':   // PER MILLE SIGN (0x2030 Po)
+    case u'‱':   // PER TEN THOUSAND SIGN (0x2031 Po)
+    case u'′':   // PRIME (0x2032 Po)
+    case u'″':   // DOUBLE PRIME (0x2033 Po)
+    case u'‴':   // TRIPLE PRIME (0x2034 Po)
+    case u'‵':   // REVERSED PRIME (0x2035 Po)
+    case u'‶':   // REVERSED DOUBLE PRIME (0x2036 Po)
+    case u'‷':   // REVERSED TRIPLE PRIME (0x2037 Po)
+    case u'‸':   // CARET (0x2038 Po)
+    case u'‹':   // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (0x2039 Pi)
+    case u'›':   // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (0x203a Pf)
+    case u'※':   // REFERENCE MARK (0x203b Po)
+    case u'‼':   // DOUBLE EXCLAMATION MARK (0x203c Po)
+    case u'‽':   // INTERROBANG (0x203d Po)
+    case u'‾':   // OVERLINE (0x203e Po)
+    case u'‿':   // UNDERTIE (0x203f Pc)
+    case u'⁀':   // CHARACTER TIE (0x2040 Pc)
+    case u'⁁':   // CARET INSERTION POINT (0x2041 Po)
+    case u'⁂':   // ASTERISM (0x2042 Po)
+    case u'⁃':   // HYPHEN BULLET (0x2043 Po)
+    case u'⁅':   // LEFT SQUARE BRACKET WITH QUILL (0x2045 Ps)
+    case u'⁆':   // RIGHT SQUARE BRACKET WITH QUILL (0x2046 Pe)
+    case u'⁇':   // DOUBLE QUESTION MARK (0x2047 Po)
+    case u'⁈':   // QUESTION EXCLAMATION MARK (0x2048 Po)
+    case u'⁉':   // EXCLAMATION QUESTION MARK (0x2049 Po)
+    case u'⁊':   // TIRONIAN SIGN ET (0x204a Po)
+    case u'⁋':   // REVERSED PILCROW SIGN (0x204b Po)
+    case u'⁌':   // BLACK LEFTWARDS BULLET (0x204c Po)
+    case u'⁍':   // BLACK RIGHTWARDS BULLET (0x204d Po)
+    case u'⁎':   // LOW ASTERISK (0x204e Po)
+    case u'⁏':   // REVERSED SEMICOLON (0x204f Po)
+    case u'⁐':   // CLOSE UP (0x2050 Po)
+    case u'⁑':   // TWO ASTERISKS ALIGNED VERTICALLY (0x2051 Po)
+    case u'⁓':   // SWUNG DASH (0x2053 Po)
+    case u'⁔':   // INVERTED UNDERTIE (0x2054 Pc)
+    case u'⁕':   // FLOWER PUNCTUATION MARK (0x2055 Po)
+    case u'⁖':   // THREE DOT PUNCTUATION (0x2056 Po)
+    case u'⁗':   // QUADRUPLE PRIME (0x2057 Po)
+    case u'⁘':   // FOUR DOT PUNCTUATION (0x2058 Po)
+    case u'⁙':   // FIVE DOT PUNCTUATION (0x2059 Po)
+    case u'⁚':   // TWO DOT PUNCTUATION (0x205a Po)
+    case u'⁛':   // FOUR DOT MARK (0x205b Po)
+    case u'⁜':   // DOTTED CROSS (0x205c Po)
+    case u'⁝':   // TRICOLON (0x205d Po)
+    case u'⁞':   // VERTICAL FOUR DOTS (0x205e Po)
+    case u'⁽':   // SUPERSCRIPT LEFT PARENTHESIS (0x207d Ps)
+    case u'⁾':   // SUPERSCRIPT RIGHT PARENTHESIS (0x207e Pe)
+    case u'₍':   // SUBSCRIPT LEFT PARENTHESIS (0x208d Ps)
+    case u'₎':   // SUBSCRIPT RIGHT PARENTHESIS (0x208e Pe)
+    case u'⌈':   // LEFT CEILING (0x2308 Ps)
+    case u'⌉':   // RIGHT CEILING (0x2309 Pe)
+    case u'⌊':   // LEFT FLOOR (0x230a Ps)
+    case u'⌋':   // RIGHT FLOOR (0x230b Pe)
+    case u'〈':  // LEFT-POINTING ANGLE BRACKET (0x2329 Ps)
+    case u'〉':  // RIGHT-POINTING ANGLE BRACKET (0x232a Pe)
+    case u'❨':   // MEDIUM LEFT PARENTHESIS ORNAMENT (0x2768 Ps)
+    case u'❩':   // MEDIUM RIGHT PARENTHESIS ORNAMENT (0x2769 Pe)
+    case u'❪':   // MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT (0x276a Ps)
+    case u'❫':   // MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT (0x276b Pe)
+    case u'❬':   // MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT (0x276c Ps)
+    case u'❭':   // MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT (0x276d Pe)
+    case u'❮':  // HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT (0x276e Ps)
+    case u'❯':  // HEAVY RIGHT-POINTING ANGLE QUOT MARK ORNAMENT (0x276f Pe)
+    case u'❰':  // HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT (0x2770 Ps)
+    case u'❱':  // HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT (0x2771 Pe)
+    case u'❲':  // LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT (0x2772 Ps)
+    case u'❳':  // LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT (0x2773 Pe)
+    case u'❴':  // MEDIUM LEFT CURLY BRACKET ORNAMENT (0x2774 Ps)
+    case u'❵':  // MEDIUM RIGHT CURLY BRACKET ORNAMENT (0x2775 Pe)
+    case u'⟅':  // LEFT S-SHAPED BAG DELIMITER (0x27c5 Ps)
+    case u'⟆':  // RIGHT S-SHAPED BAG DELIMITER (0x27c6 Pe)
+    case u'⟦':  // MATHEMATICAL LEFT WHITE SQUARE BRACKET (0x27e6 Ps)
+    case u'⟧':  // MATHEMATICAL RIGHT WHITE SQUARE BRACKET (0x27e7 Pe)
+    case u'⟨':  // MATHEMATICAL LEFT ANGLE BRACKET (0x27e8 Ps)
+    case u'⟩':  // MATHEMATICAL RIGHT ANGLE BRACKET (0x27e9 Pe)
+    case u'⟪':  // MATHEMATICAL LEFT DOUBLE ANGLE BRACKET (0x27ea Ps)
+    case u'⟫':  // MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET (0x27eb Pe)
+    case u'⟬':  // MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET (0x27ec Ps)
+    case u'⟭':  // MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET (0x27ed Pe)
+    case u'⟮':  // MATHEMATICAL LEFT FLATTENED PARENTHESIS (0x27ee Ps)
+    case u'⟯':  // MATHEMATICAL RIGHT FLATTENED PARENTHESIS (0x27ef Pe)
+    case u'⦃':  // LEFT WHITE CURLY BRACKET (0x2983 Ps)
+    case u'⦄':  // RIGHT WHITE CURLY BRACKET (0x2984 Pe)
+    case u'⦅':  // LEFT WHITE PARENTHESIS (0x2985 Ps)
+    case u'⦆':  // RIGHT WHITE PARENTHESIS (0x2986 Pe)
+    case u'⦇':  // Z NOTATION LEFT IMAGE BRACKET (0x2987 Ps)
+    case u'⦈':  // Z NOTATION RIGHT IMAGE BRACKET (0x2988 Pe)
+    case u'⦉':  // Z NOTATION LEFT BINDING BRACKET (0x2989 Ps)
+    case u'⦊':  // Z NOTATION RIGHT BINDING BRACKET (0x298a Pe)
+    case u'⦋':  // LEFT SQUARE BRACKET WITH UNDERBAR (0x298b Ps)
+    case u'⦌':  // RIGHT SQUARE BRACKET WITH UNDERBAR (0x298c Pe)
+    case u'⦍':  // LEFT SQUARE BRACKET WITH TICK IN TOP CORNER (0x298d Ps)
+    case u'⦎':  // RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER (0x298e Pe)
+    case u'⦏':  // LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER (0x298f Ps)
+    case u'⦐':  // RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER (0x2990 Pe)
+    case u'⦑':  // LEFT ANGLE BRACKET WITH DOT (0x2991 Ps)
+    case u'⦒':  // RIGHT ANGLE BRACKET WITH DOT (0x2992 Pe)
+    case u'⦓':  // LEFT ARC LESS-THAN BRACKET (0x2993 Ps)
+    case u'⦔':  // RIGHT ARC GREATER-THAN BRACKET (0x2994 Pe)
+    case u'⦗':  // LEFT BLACK TORTOISE SHELL BRACKET (0x2997 Ps)
+    case u'⦘':  // RIGHT BLACK TORTOISE SHELL BRACKET (0x2998 Pe)
+    case u'⧘':  // LEFT WIGGLY FENCE (0x29d8 Ps)
+    case u'⧙':  // RIGHT WIGGLY FENCE (0x29d9 Pe)
+    case u'⧚':  // LEFT DOUBLE WIGGLY FENCE (0x29da Ps)
+    case u'⧛':  // RIGHT DOUBLE WIGGLY FENCE (0x29db Pe)
+    case u'⧼':  // LEFT-POINTING CURVED ANGLE BRACKET (0x29fc Ps)
+    case u'⧽':  // RIGHT-POINTING CURVED ANGLE BRACKET (0x29fd Pe)
+    case u'⵰':  // TIFINAGH SEPARATOR MARK (0x2d70 Po)
+    case u'⸎':  // EDITORIAL CORONIS (0x2e0e Po)
+    case u'⸏':  // PARAGRAPHOS (0x2e0f Po)
+    case u'⸐':  // FORKED PARAGRAPHOS (0x2e10 Po)
+    case u'⸑':  // REVERSED FORKED PARAGRAPHOS (0x2e11 Po)
+    case u'⸒':  // HYPODIASTOLE (0x2e12 Po)
+    case u'⸓':  // DOTTED OBELOS (0x2e13 Po)
+    case u'⸔':  // DOWNWARDS ANCORA (0x2e14 Po)
+    case u'⸕':  // UPWARDS ANCORA (0x2e15 Po)
+    case u'⸖':  // DOTTED RIGHT-POINTING ANGLE (0x2e16 Po)
+    case u'⸗':  // DOUBLE OBLIQUE HYPHEN (0x2e17 Pd)
+    case u'⸙':  // PALM BRANCH (0x2e19 Po)
+    case u'⸚':  // HYPHEN WITH DIAERESIS (0x2e1a Pd)
+    case u'⸛':  // TILDE WITH RING ABOVE (0x2e1b Po)
+    case u'⸞':  // TILDE WITH DOT ABOVE (0x2e1e Po)
+    case u'⸟':  // TILDE WITH DOT BELOW (0x2e1f Po)
+    case u'⸪':  // TWO DOTS OVER ONE DOT PUNCTUATION (0x2e2a Po)
+    case u'⸫':  // ONE DOT OVER TWO DOTS PUNCTUATION (0x2e2b Po)
+    case u'⸬':  // SQUARED FOUR DOT PUNCTUATION (0x2e2c Po)
+    case u'⸭':  // FIVE DOT MARK (0x2e2d Po)
+    case u'⸮':  // REVERSED QUESTION MARK (0x2e2e Po)
+    case u'⸰':  // RING POINT (0x2e30 Po)
+    case u'⸱':  // WORD SEPARATOR MIDDLE DOT (0x2e31 Po)
+    case u'⸲':  // TURNED COMMA (0x2e32 Po)
+    case u'⸳':  // RAISED DOT (0x2e33 Po)
+    case u'⸴':  // RAISED COMMA (0x2e34 Po)
+    case u'⸵':  // TURNED SEMICOLON (0x2e35 Po)
+    case u'⸶':  // DAGGER WITH LEFT GUARD (0x2e36 Po)
+    case u'⸷':  // DAGGER WITH RIGHT GUARD (0x2e37 Po)
+    case u'⸸':  // TURNED DAGGER (0x2e38 Po)
+    case u'⸹':  // TOP HALF SECTION SIGN (0x2e39 Po)
+    case u'⸺':  // TWO-EM DASH (0x2e3a Pd)
+    case u'⸻':  // THREE-EM DASH (0x2e3b Pd)
+    case u'⸼':  // STENOGRAPHIC FULL STOP (0x2e3c Po)
+    case u'⸽':  // VERTICAL SIX DOTS (0x2e3d Po)
+    case u'⸾':  // WIGGLY VERTICAL LINE (0x2e3e Po)
+    case u'⸿':  // CAPITULUM (0x2e3f Po)
+    case u'⹀':  // DOUBLE HYPHEN (0x2e40 Pd)
+    case u'⹁':  // REVERSED COMMA (0x2e41 Po)
+    case u'⹂':  // DOUBLE LOW-REVERSED-9 QUOTATION MARK (0x2e42 Ps)
+    case u'⹃':  // DASH WITH LEFT UPTURN (0x2e43 Po)
+    case u'⹄':  // DOUBLE SUSPENSION MARK (0x2e44 Po)
+    case u'⹅':  // INVERTED LOW KAVYKA (0x2e45 Po)
+    case u'⹆':  // INVERTED LOW KAVYKA WITH KAVYKA ABOVE (0x2e46 Po)
+    case u'⹇':  // LOW KAVYKA (0x2e47 Po)
+    case u'⹈':  // LOW KAVYKA WITH DOT (0x2e48 Po)
+    case u'⹉':  // DOUBLE STACKED COMMA (0x2e49 Po)
+    case u'⹊':  // DOTTED SOLIDUS (0x2e4a Po)
+    case u'⹋':  // TRIPLE DAGGER (0x2e4b Po)
+    case u'⹌':  // MEDIEVAL COMMA (0x2e4c Po)
+    case u'⹍':  // PARAGRAPHUS MARK (0x2e4d Po)
+    case u'⹎':  // PUNCTUS ELEVATUS MARK (0x2e4e Po)
+    case u'⹏':  // CORNISH VERSE DIVIDER (0x2e4f Po)
+    case u'、':  // IDEOGRAPHIC COMMA (0x3001 Po)
+    case u'。':  // IDEOGRAPHIC FULL STOP (0x3002 Po)
+    case u'〃':  // DITTO MARK (0x3003 Po)
+    case u'〈':  // LEFT ANGLE BRACKET (0x3008 Ps)
+    case u'〉':  // RIGHT ANGLE BRACKET (0x3009 Pe)
+    case u'《':  // LEFT DOUBLE ANGLE BRACKET (0x300a Ps)
+    case u'》':  // RIGHT DOUBLE ANGLE BRACKET (0x300b Pe)
+    case u'「':  // LEFT CORNER BRACKET (0x300c Ps)
+    case u'」':  // RIGHT CORNER BRACKET (0x300d Pe)
+    case u'『':  // LEFT WHITE CORNER BRACKET (0x300e Ps)
+    case u'』':  // RIGHT WHITE CORNER BRACKET (0x300f Pe)
+    case u'【':  // LEFT BLACK LENTICULAR BRACKET (0x3010 Ps)
+    case u'】':  // RIGHT BLACK LENTICULAR BRACKET (0x3011 Pe)
+    case u'〔':  // LEFT TORTOISE SHELL BRACKET (0x3014 Ps)
+    case u'〕':  // RIGHT TORTOISE SHELL BRACKET (0x3015 Pe)
+    case u'〖':  // LEFT WHITE LENTICULAR BRACKET (0x3016 Ps)
+    case u'〗':  // RIGHT WHITE LENTICULAR BRACKET (0x3017 Pe)
+    case u'〘':  // LEFT WHITE TORTOISE SHELL BRACKET (0x3018 Ps)
+    case u'〙':  // RIGHT WHITE TORTOISE SHELL BRACKET (0x3019 Pe)
+    case u'〚':  // LEFT WHITE SQUARE BRACKET (0x301a Ps)
+    case u'〛':  // RIGHT WHITE SQUARE BRACKET (0x301b Pe)
+    case u'〜':  // WAVE DASH (0x301c Pd)
+    case u'〝':  // REVERSED DOUBLE PRIME QUOTATION MARK (0x301d Ps)
+    case u'〞':  // DOUBLE PRIME QUOTATION MARK (0x301e Pe)
+    case u'〟':  // LOW DOUBLE PRIME QUOTATION MARK (0x301f Pe)
+    case u'〰':  // WAVY DASH (0x3030 Pd)
+    case u'〽':  // PART ALTERNATION MARK (0x303d Po)
+    case u'゠':  // KATAKANA-HIRAGANA DOUBLE HYPHEN (0x30a0 Pd)
+    case u'・':  // KATAKANA MIDDLE DOT (0x30fb Po)
+    case u'꓾':   // LISU PUNCTUATION COMMA (0xa4fe Po)
+    case u'꓿':   // LISU PUNCTUATION FULL STOP (0xa4ff Po)
+    case u'꘍':   // VAI COMMA (0xa60d Po)
+    case u'꘎':   // VAI FULL STOP (0xa60e Po)
+    case u'꘏':   // VAI QUESTION MARK (0xa60f Po)
+    case u'꙾':   // CYRILLIC KAVYKA (0xa67e Po)
+    case u'꡴':   // PHAGS-PA SINGLE HEAD MARK (0xa874 Po)
+    case u'꡵':   // PHAGS-PA DOUBLE HEAD MARK (0xa875 Po)
+    case u'꡶':   // PHAGS-PA MARK SHAD (0xa876 Po)
+    case u'꡷':   // PHAGS-PA MARK DOUBLE SHAD (0xa877 Po)
+    case u'꣎':   // SAURASHTRA DANDA (0xa8ce Po)
+    case u'꣏':   // SAURASHTRA DOUBLE DANDA (0xa8cf Po)
+    case u'꣸':   // DEVANAGARI SIGN PUSHPIKA (0xa8f8 Po)
+    case u'꣹':   // DEVANAGARI GAP FILLER (0xa8f9 Po)
+    case u'꣺':   // DEVANAGARI CARET (0xa8fa Po)
+    case u'꣼':   // DEVANAGARI SIGN SIDDHAM (0xa8fc Po)
+    case u'꧁':   // JAVANESE LEFT RERENGGAN (0xa9c1 Po)
+    case u'꧂':   // JAVANESE RIGHT RERENGGAN (0xa9c2 Po)
+    case u'꧃':   // JAVANESE PADA ANDAP (0xa9c3 Po)
+    case u'꧄':   // JAVANESE PADA MADYA (0xa9c4 Po)
+    case u'꧅':   // JAVANESE PADA LUHUR (0xa9c5 Po)
+    case u'꧆':   // JAVANESE PADA WINDU (0xa9c6 Po)
+    case u'꧇':   // JAVANESE PADA PANGKAT (0xa9c7 Po)
+    case u'꧈':   // JAVANESE PADA LINGSA (0xa9c8 Po)
+    case u'꧉':   // JAVANESE PADA LUNGSI (0xa9c9 Po)
+    case u'꧊':   // JAVANESE PADA ADEG (0xa9ca Po)
+    case u'꧋':   // JAVANESE PADA ADEG ADEG (0xa9cb Po)
+    case u'꧌':   // JAVANESE PADA PISELEH (0xa9cc Po)
+    case u'꧍':   // JAVANESE TURNED PADA PISELEH (0xa9cd Po)
+    case u'꧞':   // JAVANESE PADA TIRTA TUMETES (0xa9de Po)
+    case u'꧟':   // JAVANESE PADA ISEN-ISEN (0xa9df Po)
+    case u'꩜':   // CHAM PUNCTUATION SPIRAL (0xaa5c Po)
+    case u'꩝':   // CHAM PUNCTUATION DANDA (0xaa5d Po)
+    case u'꩞':   // CHAM PUNCTUATION DOUBLE DANDA (0xaa5e Po)
+    case u'꩟':   // CHAM PUNCTUATION TRIPLE DANDA (0xaa5f Po)
+    case u'꫞':   // TAI VIET SYMBOL HO HOI (0xaade Po)
+    case u'꫟':   // TAI VIET SYMBOL KOI KOI (0xaadf Po)
+    case u'꫰':   // MEETEI MAYEK CHEIKHAN (0xaaf0 Po)
+    case u'꫱':   // MEETEI MAYEK AHANG KHUDAM (0xaaf1 Po)
+    case u'꯫':   // MEETEI MAYEK CHEIKHEI (0xabeb Po)
+    case u'︐':  // PRESENTATION FORM FOR VERTICAL COMMA (0xfe10 Po)
+    case u'︑':  // PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA (0xfe11 Po)
+    case u'︒':  // PRESENTATION FORM FOR VERTICAL IDEO FULL STOP (0xfe12 Po)
+    case u'︓':  // PRESENTATION FORM FOR VERTICAL COLON (0xfe13 Po)
+    case u'︔':  // PRESENTATION FORM FOR VERTICAL SEMICOLON (0xfe14 Po)
+    case u'︕':  // PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK (0xfe15 Po)
+    case u'︖':  // PRESENTATION FORM FOR VERTICAL QUESTION MARK (0xfe16 Po)
+    case u'︗':  // PRESENTATION ... LEFT WHITE LENTICULAR BRACKET (0xfe17 Ps)
+    case u'︘':  // PRESENTATION ... RIGHT WHITE LENTICULAR BRAKCET (0xfe18 Pe)
+    case u'︙':  // PRESENTATION ... VERTICAL HORIZONTAL ELLIPSIS (0xfe19 Po)
+    case u'︰':  // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER (0xfe30 Po)
+    case u'︱':  // PRESENTATION FORM FOR VERTICAL EM DASH (0xfe31 Pd)
+    case u'︲':  // PRESENTATION FORM FOR VERTICAL EN DASH (0xfe32 Pd)
+    case u'︳':  // PRESENTATION FORM FOR VERTICAL LOW LINE (0xfe33 Pc)
+    case u'︴':  // PRESENTATION FORM FOR VERTICAL WAVY LOW LINE (0xfe34 Pc)
+    case u'︵':  // PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS (0xfe35 Ps)
+    case u'︶':  // PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS (0xfe36 Pe)
+    case u'︷':  // PRESENTATION ... VERTICAL LEFT CURLY BRACKET (0xfe37 Ps)
+    case u'︸':  // PRESENTATION ... VERTICAL RIGHT CURLY BRACKET (0xfe38 Pe)
+    case u'︹':  // PRESENTATION ... LEFT TORTOISE SHELL BRACKET (0xfe39 Ps)
+    case u'︺':  // PRESENTATION ... RIGHT TORTOISE SHELL BRACKET (0xfe3a Pe)
+    case u'︻':  // PRESENTATION ... LEFT BLACK LENTICULAR BRACKET (0xfe3b Ps)
+    case u'︼':  // PRESENTATION ... RIGHT BLACK LENTICULAR BRACKET (0xfe3c Pe)
+    case u'︽':  // PRESENTATION ... LEFT DOUBLE ANGLE BRACKET (0xfe3d Ps)
+    case u'︾':  // PRESENTATION ... RIGHT DOUBLE ANGLE BRACKET (0xfe3e Pe)
+    case u'︿':  // PRESENTATION ... LEFT ANGLE BRACKET (0xfe3f Ps)
+    case u'﹀':  // PRESENTATION ... RIGHT ANGLE BRACKET (0xfe40 Pe)
+    case u'﹁':  // PRESENTATION ... LEFT CORNER BRACKET (0xfe41 Ps)
+    case u'﹂':  // PRESENTATION ... RIGHT CORNER BRACKET (0xfe42 Pe)
+    case u'﹃':  // PRESENTATION ... LEFT WHITE CORNER BRACKET (0xfe43 Ps)
+    case u'﹄':  // PRESENTATION ... RIGHT WHITE CORNER BRACKET Pe)
+    case u'﹅':  // SESAME DOT (0xfe45 Po)
+    case u'﹆':  // WHITE SESAME DOT (0xfe46 Po)
+    case u'﹇':  // PRESENTATION ... VERTICAL LEFT SQUARE BRACKET (0xfe47 Ps)
+    case u'﹈':  // PRESENTATION ... VERTICAL RIGHT SQUARE BRACKET (0xfe48 Pe)
+    case u'﹉':  // DASHED OVERLINE (0xfe49 Po)
+    case u'﹊':  // CENTRELINE OVERLINE (0xfe4a Po)
+    case u'﹋':  // WAVY OVERLINE (0xfe4b Po)
+    case u'﹌':  // DOUBLE WAVY OVERLINE (0xfe4c Po)
+    case u'﹍':  // DASHED LOW LINE (0xfe4d Pc)
+    case u'﹎':  // CENTRELINE LOW LINE (0xfe4e Pc)
+    case u'﹏':  // WAVY LOW LINE (0xfe4f Pc)
+    case u'﹐':  // SMALL COMMA (0xfe50 Po)
+    case u'﹑':  // SMALL IDEOGRAPHIC COMMA (0xfe51 Po)
+    case u'﹒':  // SMALL FULL STOP (0xfe52 Po)
+    case u'﹔':  // SMALL SEMICOLON (0xfe54 Po)
+    case u'﹕':  // SMALL COLON (0xfe55 Po)
+    case u'﹖':  // SMALL QUESTION MARK (0xfe56 Po)
+    case u'﹗':  // SMALL EXCLAMATION MARK (0xfe57 Po)
+    case u'﹘':  // SMALL EM DASH (0xfe58 Pd)
+    case u'﹙':  // SMALL LEFT PARENTHESIS (0xfe59 Ps)
+    case u'﹚':  // SMALL RIGHT PARENTHESIS (0xfe5a Pe)
+    case u'﹛':  // SMALL LEFT CURLY BRACKET (0xfe5b Ps)
+    case u'﹜':  // SMALL RIGHT CURLY BRACKET (0xfe5c Pe)
+    case u'﹝':  // SMALL LEFT TORTOISE SHELL BRACKET (0xfe5d Ps)
+    case u'﹞':  // SMALL RIGHT TORTOISE SHELL BRACKET (0xfe5e Pe)
+    case u'﹟':  // SMALL NUMBER SIGN (0xfe5f Po)
+    case u'﹠':  // SMALL AMPERSAND (0xfe60 Po)
+    case u'﹡':  // SMALL ASTERISK (0xfe61 Po)
+    case u'﹣':  // SMALL HYPHEN-MINUS (0xfe63 Pd)
+    case u'﹨':  // SMALL REVERSE SOLIDUS (0xfe68 Po)
+    case u'﹪':  // SMALL PERCENT SIGN (0xfe6a Po)
+    case u'﹫':  // SMALL COMMERCIAL AT (0xfe6b Po)
+    case u'！':  // FULLWIDTH EXCLAMATION MARK (0xff01 Po)
+    case u'＂':  // FULLWIDTH QUOTATION MARK (0xff02 Po)
+    case u'＃':  // FULLWIDTH NUMBER SIGN (0xff03 Po)
+    case u'％':  // FULLWIDTH PERCENT SIGN (0xff05 Po)
+    case u'＆':  // FULLWIDTH AMPERSAND (0xff06 Po)
+    case u'＇':  // FULLWIDTH APOSTROPHE (0xff07 Po)
+    case u'（':  // FULLWIDTH LEFT PARENTHESIS (0xff08 Ps)
+    case u'）':  // FULLWIDTH RIGHT PARENTHESIS (0xff09 Pe)
+    case u'＊':  // FULLWIDTH ASTERISK (0xff0a Po)
+    case u'，':  // FULLWIDTH COMMA (0xff0c Po)
+    case u'－':  // FULLWIDTH HYPHEN-MINUS (0xff0d Pd)
+    case u'．':  // FULLWIDTH FULL STOP (0xff0e Po)
+    case u'／':  // FULLWIDTH SOLIDUS (0xff0f Po)
+    case u'：':  // FULLWIDTH COLON (0xff1a Po)
+    case u'；':  // FULLWIDTH SEMICOLON (0xff1b Po)
+    case u'？':  // FULLWIDTH QUESTION MARK (0xff1f Po)
+    case u'＠':  // FULLWIDTH COMMERCIAL AT (0xff20 Po)
+    case u'［':  // FULLWIDTH LEFT SQUARE BRACKET (0xff3b Ps)
+    case u'＼':  // FULLWIDTH REVERSE SOLIDUS (0xff3c Po)
+    case u'］':  // FULLWIDTH RIGHT SQUARE BRACKET (0xff3d Pe)
+    case u'＿':  // FULLWIDTH LOW LINE (0xff3f Pc)
+    case u'｛':  // FULLWIDTH LEFT CURLY BRACKET (0xff5b Ps)
+    case u'｝':  // FULLWIDTH RIGHT CURLY BRACKET (0xff5d Pe)
+    case u'｟':  // FULLWIDTH LEFT WHITE PARENTHESIS (0xff5f Ps)
+    case u'｠':  // FULLWIDTH RIGHT WHITE PARENTHESIS (0xff60 Pe)
+    case u'｡':   // HALFWIDTH IDEOGRAPHIC FULL STOP (0xff61 Po)
+    case u'｢':   // HALFWIDTH LEFT CORNER BRACKET (0xff62 Ps)
+    case u'｣':   // HALFWIDTH RIGHT CORNER BRACKET (0xff63 Pe)
+    case u'､':   // HALFWIDTH IDEOGRAPHIC COMMA (0xff64 Po)
+    case u'･':   // HALFWIDTH KATAKANA MIDDLE DOT (0xff65 Po)
+      return 1;
+    default:
+      return 0;
+  }
+}
+
+__weak_reference(iswpunct, iswpunct_l);
diff --git a/libc/str/iswseparator.cc b/libc/str/iswseparator.c
similarity index 94%
rename from libc/str/iswseparator.cc
rename to libc/str/iswseparator.c
index 6ed2c7788..224fec28a 100644
--- a/libc/str/iswseparator.cc
+++ b/libc/str/iswseparator.c
@@ -1,7 +1,7 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=2 sts=2 sw=2 fenc=utf-8                             :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,11 +16,9 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
-#include "libc/str/has_char.h"
 #include "libc/wctype.h"
 
-static const unsigned short kSeparators[][2] = {
+static const unsigned short kCodes[][2] = {
     {0x00aa, 0x00aa}, /*     1x English */
     {0x00b2, 0x00b3}, /*     2x English Arabic */
     {0x00b5, 0x00b5}, /*     1x Greek */
@@ -174,7 +172,7 @@ static const unsigned short kSeparators[][2] = {
     {0xffda, 0xffdc}, /*     3x Dubs */
 };
 
-static const unsigned kAstralSeparators[][2] = {
+static const unsigned kAstralCodes[][2] = {
     {0x10107, 0x10133}, /*    45x Aegean */
     {0x10140, 0x10178}, /*    57x Ancient Greek Numbers */
     {0x1018a, 0x1018b}, /*     2x Ancient Greek Numbers */
@@ -392,11 +390,34 @@ static const unsigned kAstralSeparators[][2] = {
  * other things like blocks and emoji (So).
  */
 int iswseparator(wint_t c) {
-  if (c < 128)
-    return !(('0' <= c && c <= '9') ||  //
-             ('A' <= c && c <= 'Z') ||  //
+  int m, l, r, n;
+  if (c < 0200) {
+    return !(('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') ||
              ('a' <= c && c <= 'z'));
-  if (c < 65536)
-    return has_char(kSeparators, ARRAYLEN(kSeparators), (unsigned short)c);
-  return has_char(kAstralSeparators, ARRAYLEN(kAstralSeparators), (unsigned)c);
+  }
+  if (c <= 0xffff) {
+    l = 0;
+    r = n = sizeof(kCodes) / sizeof(kCodes[0]);
+    while (l < r) {
+      m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+      if (kCodes[m][1] < c) {
+        l = m + 1;
+      } else {
+        r = m;
+      }
+    }
+    return !(l < n && kCodes[l][0] <= c && c <= kCodes[l][1]);
+  } else {
+    l = 0;
+    r = n = sizeof(kAstralCodes) / sizeof(kAstralCodes[0]);
+    while (l < r) {
+      m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+      if (kAstralCodes[m][1] < c) {
+        l = m + 1;
+      } else {
+        r = m;
+      }
+    }
+    return !(l < n && kAstralCodes[l][0] <= c && c <= kAstralCodes[l][1]);
+  }
 }
diff --git a/libc/str/iswspace.c b/libc/str/iswspace.c
index 097e6ce51..44d62af9d 100644
--- a/libc/str/iswspace.c
+++ b/libc/str/iswspace.c
@@ -41,6 +41,7 @@ int iswspace(wint_t c) {
     case 0x2004:  // THREE-PER-EM SPACE (Zs)
     case 0x2005:  // FOUR-PER-EM SPACE (Zs)
     case 0x2006:  // SIX-PER-EM SPACE (Zs)
+    case 0x2007:  // FIGURE SPACE (Zs)
     case 0x2008:  // PUNCTUATION SPACE (Zs)
     case 0x2009:  // THIN SPACE (Zs)
     case 0x200a:  // HAIR SPACE (Zs)
diff --git a/libc/str/iswupper.c b/libc/str/iswupper.c
new file mode 100644
index 000000000..aad3dd6e7
--- /dev/null
+++ b/libc/str/iswupper.c
@@ -0,0 +1,164 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/wctype.h"
+
+/**
+ * Returns nonzero if c is uppercase letter.
+ */
+int iswupper(wint_t c) {
+  if (c < 0200) {
+    return 'A' <= c && c <= 'Z';
+  } else {
+    if (towlower(c) != c)
+      return 1;
+    switch (c) {
+      case 0x03d2:  /* ϒ Greek */
+      case 0x03d3:  /* ϓ Greek */
+      case 0x03d4:  /* ϔ Greek */
+      case 0x2102:  /* ℂ Letterlike */
+      case 0x2107:  /* ℇ Letterlike */
+      case 0x210b:  /* ℋ Letterlike */
+      case 0x210c:  /* ℌ Letterlike */
+      case 0x210d:  /* ℍ Letterlike */
+      case 0x2110:  /* ℐ Letterlike */
+      case 0x2111:  /* ℑ Letterlike */
+      case 0x2112:  /* ℒ Letterlike */
+      case 0x2115:  /* ℕ Letterlike */
+      case 0x2119:  /* ℙ Letterlike */
+      case 0x211a:  /* ℚ Letterlike */
+      case 0x211b:  /* ℛ Letterlike */
+      case 0x211c:  /* ℜ Letterlike */
+      case 0x211d:  /* ℝ Letterlike */
+      case 0x2124:  /* ℤ Letterlike */
+      case 0x2128:  /* ℨ Letterlike */
+      case 0x212c:  /* ℬ Letterlike */
+      case 0x212d:  /* ℭ Letterlike */
+      case 0x2130:  /* ℰ Letterlike */
+      case 0x2131:  /* ℱ Letterlike */
+      case 0x2133:  /* ℳ Letterlike */
+      case 0x213e:  /* ℾ Letterlike */
+      case 0x213f:  /* ℿ Letterlike */
+      case 0x2145:  /* ⅅ Letterlike */
+      case 0x1d434: /* 𝐴 Math */
+      case 0x1d435: /* 𝐵 Math */
+      case 0x1d436: /* 𝐶 Math */
+      case 0x1d437: /* 𝐷 Math */
+      case 0x1d438: /* 𝐸 Math */
+      case 0x1d439: /* 𝐹 Math */
+      case 0x1d43a: /* 𝐺 Math */
+      case 0x1d43b: /* 𝐻 Math */
+      case 0x1d49c: /* 𝒜 Math */
+      case 0x1d49e: /* 𝒞 Math */
+      case 0x1d49f: /* 𝒟 Math */
+      case 0x1d4a2: /* 𝒢 Math */
+      case 0x1d4a5: /* 𝒥 Math */
+      case 0x1d4a6: /* 𝒦 Math */
+      case 0x1d4a9: /* 𝒩 Math */
+      case 0x1d4aa: /* 𝒪 Math */
+      case 0x1d4ab: /* 𝒫 Math */
+      case 0x1d4ac: /* 𝒬 Math */
+      case 0x1d504: /* 𝔄 Math */
+      case 0x1d505: /* 𝔅 Math */
+      case 0x1d507: /* 𝔇 Math */
+      case 0x1d508: /* 𝔈 Math */
+      case 0x1d509: /* 𝔉 Math */
+      case 0x1d50a: /* 𝔊 Math */
+      case 0x1d516: /* 𝔖 Math */
+      case 0x1d517: /* 𝔗 Math */
+      case 0x1d518: /* 𝔘 Math */
+      case 0x1d519: /* 𝔙 Math */
+      case 0x1d51a: /* 𝔚 Math */
+      case 0x1d51b: /* 𝔛 Math */
+      case 0x1d51c: /* 𝔜 Math */
+      case 0x1d538: /* 𝔸 Math */
+      case 0x1d539: /* 𝔹 Math */
+      case 0x1d53b: /* 𝔻 Math */
+      case 0x1d53c: /* 𝔼 Math */
+      case 0x1d53d: /* 𝔽 Math */
+      case 0x1d53e: /* 𝔾 Math */
+      case 0x1d540: /* 𝕀 Math */
+      case 0x1d541: /* 𝕁 Math */
+      case 0x1d542: /* 𝕂 Math */
+      case 0x1d543: /* 𝕃 Math */
+      case 0x1d544: /* 𝕄 Math */
+      case 0x1d546: /* 𝕆 Math */
+      case 0x1d54a: /* 𝕊 Math */
+      case 0x1d54b: /* 𝕋 Math */
+      case 0x1d54c: /* 𝕌 Math */
+      case 0x1d54d: /* 𝕍 Math */
+      case 0x1d54e: /* 𝕎 Math */
+      case 0x1d54f: /* 𝕏 Math */
+      case 0x1d550: /* 𝕐 Math */
+      case 0x1d6e3: /* 𝛣 Math */
+      case 0x1d6e4: /* 𝛤 Math */
+      case 0x1d6e5: /* 𝛥 Math */
+      case 0x1d6e6: /* 𝛦 Math */
+      case 0x1d6e7: /* 𝛧 Math */
+      case 0x1d6e8: /* 𝛨 Math */
+      case 0x1d6e9: /* 𝛩 Math */
+      case 0x1d6ea: /* 𝛪 Math */
+      case 0x1d6eb: /* 𝛫 Math */
+      case 0x1d6ec: /* 𝛬 Math */
+      case 0x1d6ed: /* 𝛭 Math */
+      case 0x1d6ee: /* 𝛮 Math */
+      case 0x1d6ef: /* 𝛯 Math */
+      case 0x1d6f0: /* 𝛰 Math */
+      case 0x1d6f1: /* 𝛱 Math */
+      case 0x1d6f2: /* 𝛲 Math */
+      case 0x1d6f3: /* 𝛳 Math */
+      case 0x1d6f4: /* 𝛴 Math */
+      case 0x1d6f5: /* 𝛵 Math */
+      case 0x1d6f6: /* 𝛶 Math */
+      case 0x1d6f7: /* 𝛷 Math */
+      case 0x1d6f8: /* 𝛸 Math */
+      case 0x1d6f9: /* 𝛹 Math */
+      case 0x1d6fa: /* 𝛺 Math */
+      case 0x1d72d: /* 𝜭 Math */
+      case 0x1d72e: /* 𝜮 Math */
+      case 0x1d72f: /* 𝜯 Math */
+      case 0x1d730: /* 𝜰 Math */
+      case 0x1d731: /* 𝜱 Math */
+      case 0x1d732: /* 𝜲 Math */
+      case 0x1d733: /* 𝜳 Math */
+      case 0x1d734: /* 𝜴 Math */
+      case 0x1d767: /* 𝝧 Math */
+      case 0x1d768: /* 𝝨 Math */
+      case 0x1d769: /* 𝝩 Math */
+      case 0x1d76a: /* 𝝪 Math */
+      case 0x1d76b: /* 𝝫 Math */
+      case 0x1d76c: /* 𝝬 Math */
+      case 0x1d76d: /* 𝝭 Math */
+      case 0x1d76e: /* 𝝮 Math */
+      case 0x1d7a1: /* 𝞡 Math */
+      case 0x1d7a2: /* 𝞢 Math */
+      case 0x1d7a3: /* 𝞣 Math */
+      case 0x1d7a4: /* 𝞤 Math */
+      case 0x1d7a5: /* 𝞥 Math */
+      case 0x1d7a6: /* 𝞦 Math */
+      case 0x1d7a7: /* 𝞧 Math */
+      case 0x1d7a8: /* 𝞨 Math */
+      case 0x1d7ca: /* 𝟊 Math */
+        return 1;
+      default:
+        return 0;
+    }
+  }
+}
+
+__weak_reference(iswupper, iswupper_l);
diff --git a/libc/str/iswupper.cc b/libc/str/iswupper.cc
deleted file mode 100644
index 4db11a3f4..000000000
--- a/libc/str/iswupper.cc
+++ /dev/null
@@ -1,695 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=2 sts=2 sw=2 fenc=utf-8                             :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/dce.h"
-#include "libc/macros.h"
-#include "libc/str/has_char.h"
-#include "libc/wctype.h"
-
-static const unsigned short kUpper[][2] = {
-    {0x41, 0x5a},      //
-    {0xc0, 0xd6},      //
-    {0xd8, 0xde},      //
-    {0x100, 0x100},    //
-    {0x102, 0x102},    //
-    {0x104, 0x104},    //
-    {0x106, 0x106},    //
-    {0x108, 0x108},    //
-    {0x10a, 0x10a},    //
-    {0x10c, 0x10c},    //
-    {0x10e, 0x10e},    //
-    {0x110, 0x110},    //
-    {0x112, 0x112},    //
-    {0x114, 0x114},    //
-    {0x116, 0x116},    //
-    {0x118, 0x118},    //
-    {0x11a, 0x11a},    //
-    {0x11c, 0x11c},    //
-    {0x11e, 0x11e},    //
-    {0x120, 0x120},    //
-    {0x122, 0x122},    //
-    {0x124, 0x124},    //
-    {0x126, 0x126},    //
-    {0x128, 0x128},    //
-    {0x12a, 0x12a},    //
-    {0x12c, 0x12c},    //
-    {0x12e, 0x12e},    //
-    {0x130, 0x130},    //
-    {0x132, 0x132},    //
-    {0x134, 0x134},    //
-    {0x136, 0x136},    //
-    {0x139, 0x139},    //
-    {0x13b, 0x13b},    //
-    {0x13d, 0x13d},    //
-    {0x13f, 0x13f},    //
-    {0x141, 0x141},    //
-    {0x143, 0x143},    //
-    {0x145, 0x145},    //
-    {0x147, 0x147},    //
-    {0x14a, 0x14a},    //
-    {0x14c, 0x14c},    //
-    {0x14e, 0x14e},    //
-    {0x150, 0x150},    //
-    {0x152, 0x152},    //
-    {0x154, 0x154},    //
-    {0x156, 0x156},    //
-    {0x158, 0x158},    //
-    {0x15a, 0x15a},    //
-    {0x15c, 0x15c},    //
-    {0x15e, 0x15e},    //
-    {0x160, 0x160},    //
-    {0x162, 0x162},    //
-    {0x164, 0x164},    //
-    {0x166, 0x166},    //
-    {0x168, 0x168},    //
-    {0x16a, 0x16a},    //
-    {0x16c, 0x16c},    //
-    {0x16e, 0x16e},    //
-    {0x170, 0x170},    //
-    {0x172, 0x172},    //
-    {0x174, 0x174},    //
-    {0x176, 0x176},    //
-    {0x178, 0x179},    //
-    {0x17b, 0x17b},    //
-    {0x17d, 0x17d},    //
-    {0x181, 0x182},    //
-    {0x184, 0x184},    //
-    {0x186, 0x187},    //
-    {0x189, 0x18b},    //
-    {0x18e, 0x191},    //
-    {0x193, 0x194},    //
-    {0x196, 0x198},    //
-    {0x19c, 0x19d},    //
-    {0x19f, 0x1a0},    //
-    {0x1a2, 0x1a2},    //
-    {0x1a4, 0x1a4},    //
-    {0x1a6, 0x1a7},    //
-    {0x1a9, 0x1a9},    //
-    {0x1ac, 0x1ac},    //
-    {0x1ae, 0x1af},    //
-    {0x1b1, 0x1b3},    //
-    {0x1b5, 0x1b5},    //
-    {0x1b7, 0x1b8},    //
-    {0x1bc, 0x1bc},    //
-    {0x1c4, 0x1c5},    //
-    {0x1c7, 0x1c8},    //
-    {0x1ca, 0x1cb},    //
-    {0x1cd, 0x1cd},    //
-    {0x1cf, 0x1cf},    //
-    {0x1d1, 0x1d1},    //
-    {0x1d3, 0x1d3},    //
-    {0x1d5, 0x1d5},    //
-    {0x1d7, 0x1d7},    //
-    {0x1d9, 0x1d9},    //
-    {0x1db, 0x1db},    //
-    {0x1de, 0x1de},    //
-    {0x1e0, 0x1e0},    //
-    {0x1e2, 0x1e2},    //
-    {0x1e4, 0x1e4},    //
-    {0x1e6, 0x1e6},    //
-    {0x1e8, 0x1e8},    //
-    {0x1ea, 0x1ea},    //
-    {0x1ec, 0x1ec},    //
-    {0x1ee, 0x1ee},    //
-    {0x1f1, 0x1f2},    //
-    {0x1f4, 0x1f4},    //
-    {0x1f6, 0x1f8},    //
-    {0x1fa, 0x1fa},    //
-    {0x1fc, 0x1fc},    //
-    {0x1fe, 0x1fe},    //
-    {0x200, 0x200},    //
-    {0x202, 0x202},    //
-    {0x204, 0x204},    //
-    {0x206, 0x206},    //
-    {0x208, 0x208},    //
-    {0x20a, 0x20a},    //
-    {0x20c, 0x20c},    //
-    {0x20e, 0x20e},    //
-    {0x210, 0x210},    //
-    {0x212, 0x212},    //
-    {0x214, 0x214},    //
-    {0x216, 0x216},    //
-    {0x218, 0x218},    //
-    {0x21a, 0x21a},    //
-    {0x21c, 0x21c},    //
-    {0x21e, 0x21e},    //
-    {0x220, 0x220},    //
-    {0x222, 0x222},    //
-    {0x224, 0x224},    //
-    {0x226, 0x226},    //
-    {0x228, 0x228},    //
-    {0x22a, 0x22a},    //
-    {0x22c, 0x22c},    //
-    {0x22e, 0x22e},    //
-    {0x230, 0x230},    //
-    {0x232, 0x232},    //
-    {0x23a, 0x23b},    //
-    {0x23d, 0x23e},    //
-    {0x241, 0x241},    //
-    {0x243, 0x246},    //
-    {0x248, 0x248},    //
-    {0x24a, 0x24a},    //
-    {0x24c, 0x24c},    //
-    {0x24e, 0x24e},    //
-    {0x370, 0x370},    //
-    {0x372, 0x372},    //
-    {0x376, 0x376},    //
-    {0x37f, 0x37f},    //
-    {0x386, 0x386},    //
-    {0x388, 0x38a},    //
-    {0x38c, 0x38c},    //
-    {0x38e, 0x38f},    //
-    {0x391, 0x3a1},    //
-    {0x3a3, 0x3ab},    //
-    {0x3cf, 0x3cf},    //
-    {0x3d2, 0x3d4},    //
-    {0x3d8, 0x3d8},    //
-    {0x3da, 0x3da},    //
-    {0x3dc, 0x3dc},    //
-    {0x3de, 0x3de},    //
-    {0x3e0, 0x3e0},    //
-    {0x3e2, 0x3e2},    //
-    {0x3e4, 0x3e4},    //
-    {0x3e6, 0x3e6},    //
-    {0x3e8, 0x3e8},    //
-    {0x3ea, 0x3ea},    //
-    {0x3ec, 0x3ec},    //
-    {0x3ee, 0x3ee},    //
-    {0x3f4, 0x3f4},    //
-    {0x3f7, 0x3f7},    //
-    {0x3f9, 0x3fa},    //
-    {0x3fd, 0x42f},    //
-    {0x460, 0x460},    //
-    {0x462, 0x462},    //
-    {0x464, 0x464},    //
-    {0x466, 0x466},    //
-    {0x468, 0x468},    //
-    {0x46a, 0x46a},    //
-    {0x46c, 0x46c},    //
-    {0x46e, 0x46e},    //
-    {0x470, 0x470},    //
-    {0x472, 0x472},    //
-    {0x474, 0x474},    //
-    {0x476, 0x476},    //
-    {0x478, 0x478},    //
-    {0x47a, 0x47a},    //
-    {0x47c, 0x47c},    //
-    {0x47e, 0x47e},    //
-    {0x480, 0x480},    //
-    {0x48a, 0x48a},    //
-    {0x48c, 0x48c},    //
-    {0x48e, 0x48e},    //
-    {0x490, 0x490},    //
-    {0x492, 0x492},    //
-    {0x494, 0x494},    //
-    {0x496, 0x496},    //
-    {0x498, 0x498},    //
-    {0x49a, 0x49a},    //
-    {0x49c, 0x49c},    //
-    {0x49e, 0x49e},    //
-    {0x4a0, 0x4a0},    //
-    {0x4a2, 0x4a2},    //
-    {0x4a4, 0x4a4},    //
-    {0x4a6, 0x4a6},    //
-    {0x4a8, 0x4a8},    //
-    {0x4aa, 0x4aa},    //
-    {0x4ac, 0x4ac},    //
-    {0x4ae, 0x4ae},    //
-    {0x4b0, 0x4b0},    //
-    {0x4b2, 0x4b2},    //
-    {0x4b4, 0x4b4},    //
-    {0x4b6, 0x4b6},    //
-    {0x4b8, 0x4b8},    //
-    {0x4ba, 0x4ba},    //
-    {0x4bc, 0x4bc},    //
-    {0x4be, 0x4be},    //
-    {0x4c0, 0x4c1},    //
-    {0x4c3, 0x4c3},    //
-    {0x4c5, 0x4c5},    //
-    {0x4c7, 0x4c7},    //
-    {0x4c9, 0x4c9},    //
-    {0x4cb, 0x4cb},    //
-    {0x4cd, 0x4cd},    //
-    {0x4d0, 0x4d0},    //
-    {0x4d2, 0x4d2},    //
-    {0x4d4, 0x4d4},    //
-    {0x4d6, 0x4d6},    //
-    {0x4d8, 0x4d8},    //
-    {0x4da, 0x4da},    //
-    {0x4dc, 0x4dc},    //
-    {0x4de, 0x4de},    //
-    {0x4e0, 0x4e0},    //
-    {0x4e2, 0x4e2},    //
-    {0x4e4, 0x4e4},    //
-    {0x4e6, 0x4e6},    //
-    {0x4e8, 0x4e8},    //
-    {0x4ea, 0x4ea},    //
-    {0x4ec, 0x4ec},    //
-    {0x4ee, 0x4ee},    //
-    {0x4f0, 0x4f0},    //
-    {0x4f2, 0x4f2},    //
-    {0x4f4, 0x4f4},    //
-    {0x4f6, 0x4f6},    //
-    {0x4f8, 0x4f8},    //
-    {0x4fa, 0x4fa},    //
-    {0x4fc, 0x4fc},    //
-    {0x4fe, 0x4fe},    //
-    {0x500, 0x500},    //
-    {0x502, 0x502},    //
-    {0x504, 0x504},    //
-    {0x506, 0x506},    //
-    {0x508, 0x508},    //
-    {0x50a, 0x50a},    //
-    {0x50c, 0x50c},    //
-    {0x50e, 0x50e},    //
-    {0x510, 0x510},    //
-    {0x512, 0x512},    //
-    {0x514, 0x514},    //
-    {0x516, 0x516},    //
-    {0x518, 0x518},    //
-    {0x51a, 0x51a},    //
-    {0x51c, 0x51c},    //
-    {0x51e, 0x51e},    //
-    {0x520, 0x520},    //
-    {0x522, 0x522},    //
-    {0x524, 0x524},    //
-    {0x526, 0x526},    //
-    {0x528, 0x528},    //
-    {0x52a, 0x52a},    //
-    {0x52c, 0x52c},    //
-    {0x52e, 0x52e},    //
-    {0x531, 0x556},    //
-    {0x10a0, 0x10c5},  //
-    {0x10c7, 0x10c7},  //
-    {0x10cd, 0x10cd},  //
-    {0x13a0, 0x13f5},  //
-    {0x1c90, 0x1cba},  //
-    {0x1cbd, 0x1cbf},  //
-    {0x1e00, 0x1e00},  //
-    {0x1e02, 0x1e02},  //
-    {0x1e04, 0x1e04},  //
-    {0x1e06, 0x1e06},  //
-    {0x1e08, 0x1e08},  //
-    {0x1e0a, 0x1e0a},  //
-    {0x1e0c, 0x1e0c},  //
-    {0x1e0e, 0x1e0e},  //
-    {0x1e10, 0x1e10},  //
-    {0x1e12, 0x1e12},  //
-    {0x1e14, 0x1e14},  //
-    {0x1e16, 0x1e16},  //
-    {0x1e18, 0x1e18},  //
-    {0x1e1a, 0x1e1a},  //
-    {0x1e1c, 0x1e1c},  //
-    {0x1e1e, 0x1e1e},  //
-    {0x1e20, 0x1e20},  //
-    {0x1e22, 0x1e22},  //
-    {0x1e24, 0x1e24},  //
-    {0x1e26, 0x1e26},  //
-    {0x1e28, 0x1e28},  //
-    {0x1e2a, 0x1e2a},  //
-    {0x1e2c, 0x1e2c},  //
-    {0x1e2e, 0x1e2e},  //
-    {0x1e30, 0x1e30},  //
-    {0x1e32, 0x1e32},  //
-    {0x1e34, 0x1e34},  //
-    {0x1e36, 0x1e36},  //
-    {0x1e38, 0x1e38},  //
-    {0x1e3a, 0x1e3a},  //
-    {0x1e3c, 0x1e3c},  //
-    {0x1e3e, 0x1e3e},  //
-    {0x1e40, 0x1e40},  //
-    {0x1e42, 0x1e42},  //
-    {0x1e44, 0x1e44},  //
-    {0x1e46, 0x1e46},  //
-    {0x1e48, 0x1e48},  //
-    {0x1e4a, 0x1e4a},  //
-    {0x1e4c, 0x1e4c},  //
-    {0x1e4e, 0x1e4e},  //
-    {0x1e50, 0x1e50},  //
-    {0x1e52, 0x1e52},  //
-    {0x1e54, 0x1e54},  //
-    {0x1e56, 0x1e56},  //
-    {0x1e58, 0x1e58},  //
-    {0x1e5a, 0x1e5a},  //
-    {0x1e5c, 0x1e5c},  //
-    {0x1e5e, 0x1e5e},  //
-    {0x1e60, 0x1e60},  //
-    {0x1e62, 0x1e62},  //
-    {0x1e64, 0x1e64},  //
-    {0x1e66, 0x1e66},  //
-    {0x1e68, 0x1e68},  //
-    {0x1e6a, 0x1e6a},  //
-    {0x1e6c, 0x1e6c},  //
-    {0x1e6e, 0x1e6e},  //
-    {0x1e70, 0x1e70},  //
-    {0x1e72, 0x1e72},  //
-    {0x1e74, 0x1e74},  //
-    {0x1e76, 0x1e76},  //
-    {0x1e78, 0x1e78},  //
-    {0x1e7a, 0x1e7a},  //
-    {0x1e7c, 0x1e7c},  //
-    {0x1e7e, 0x1e7e},  //
-    {0x1e80, 0x1e80},  //
-    {0x1e82, 0x1e82},  //
-    {0x1e84, 0x1e84},  //
-    {0x1e86, 0x1e86},  //
-    {0x1e88, 0x1e88},  //
-    {0x1e8a, 0x1e8a},  //
-    {0x1e8c, 0x1e8c},  //
-    {0x1e8e, 0x1e8e},  //
-    {0x1e90, 0x1e90},  //
-    {0x1e92, 0x1e92},  //
-    {0x1e94, 0x1e94},  //
-    {0x1e9e, 0x1e9e},  //
-    {0x1ea0, 0x1ea0},  //
-    {0x1ea2, 0x1ea2},  //
-    {0x1ea4, 0x1ea4},  //
-    {0x1ea6, 0x1ea6},  //
-    {0x1ea8, 0x1ea8},  //
-    {0x1eaa, 0x1eaa},  //
-    {0x1eac, 0x1eac},  //
-    {0x1eae, 0x1eae},  //
-    {0x1eb0, 0x1eb0},  //
-    {0x1eb2, 0x1eb2},  //
-    {0x1eb4, 0x1eb4},  //
-    {0x1eb6, 0x1eb6},  //
-    {0x1eb8, 0x1eb8},  //
-    {0x1eba, 0x1eba},  //
-    {0x1ebc, 0x1ebc},  //
-    {0x1ebe, 0x1ebe},  //
-    {0x1ec0, 0x1ec0},  //
-    {0x1ec2, 0x1ec2},  //
-    {0x1ec4, 0x1ec4},  //
-    {0x1ec6, 0x1ec6},  //
-    {0x1ec8, 0x1ec8},  //
-    {0x1eca, 0x1eca},  //
-    {0x1ecc, 0x1ecc},  //
-    {0x1ece, 0x1ece},  //
-    {0x1ed0, 0x1ed0},  //
-    {0x1ed2, 0x1ed2},  //
-    {0x1ed4, 0x1ed4},  //
-    {0x1ed6, 0x1ed6},  //
-    {0x1ed8, 0x1ed8},  //
-    {0x1eda, 0x1eda},  //
-    {0x1edc, 0x1edc},  //
-    {0x1ede, 0x1ede},  //
-    {0x1ee0, 0x1ee0},  //
-    {0x1ee2, 0x1ee2},  //
-    {0x1ee4, 0x1ee4},  //
-    {0x1ee6, 0x1ee6},  //
-    {0x1ee8, 0x1ee8},  //
-    {0x1eea, 0x1eea},  //
-    {0x1eec, 0x1eec},  //
-    {0x1eee, 0x1eee},  //
-    {0x1ef0, 0x1ef0},  //
-    {0x1ef2, 0x1ef2},  //
-    {0x1ef4, 0x1ef4},  //
-    {0x1ef6, 0x1ef6},  //
-    {0x1ef8, 0x1ef8},  //
-    {0x1efa, 0x1efa},  //
-    {0x1efc, 0x1efc},  //
-    {0x1efe, 0x1efe},  //
-    {0x1f08, 0x1f0f},  //
-    {0x1f18, 0x1f1d},  //
-    {0x1f28, 0x1f2f},  //
-    {0x1f38, 0x1f3f},  //
-    {0x1f48, 0x1f4d},  //
-    {0x1f59, 0x1f59},  //
-    {0x1f5b, 0x1f5b},  //
-    {0x1f5d, 0x1f5d},  //
-    {0x1f5f, 0x1f5f},  //
-    {0x1f68, 0x1f6f},  //
-    {0x1f88, 0x1f8f},  //
-    {0x1f98, 0x1f9f},  //
-    {0x1fa8, 0x1faf},  //
-    {0x1fb8, 0x1fbc},  //
-    {0x1fc8, 0x1fcc},  //
-    {0x1fd8, 0x1fdb},  //
-    {0x1fe8, 0x1fec},  //
-    {0x1ff8, 0x1ffc},  //
-    {0x2102, 0x2102},  //
-    {0x2107, 0x2107},  //
-    {0x210b, 0x210d},  //
-    {0x2110, 0x2112},  //
-    {0x2115, 0x2115},  //
-    {0x2119, 0x211d},  //
-    {0x2124, 0x2124},  //
-    {0x2126, 0x2126},  //
-    {0x2128, 0x2128},  //
-    {0x212a, 0x212d},  //
-    {0x2130, 0x2133},  //
-    {0x213e, 0x213f},  //
-    {0x2145, 0x2145},  //
-    {0x2160, 0x216f},  //
-    {0x2183, 0x2183},  //
-    {0x24b6, 0x24cf},  //
-    {0x2c00, 0x2c2f},  //
-    {0x2c60, 0x2c60},  //
-    {0x2c62, 0x2c64},  //
-    {0x2c67, 0x2c67},  //
-    {0x2c69, 0x2c69},  //
-    {0x2c6b, 0x2c6b},  //
-    {0x2c6d, 0x2c70},  //
-    {0x2c72, 0x2c72},  //
-    {0x2c75, 0x2c75},  //
-    {0x2c7e, 0x2c80},  //
-    {0x2c82, 0x2c82},  //
-    {0x2c84, 0x2c84},  //
-    {0x2c86, 0x2c86},  //
-    {0x2c88, 0x2c88},  //
-    {0x2c8a, 0x2c8a},  //
-    {0x2c8c, 0x2c8c},  //
-    {0x2c8e, 0x2c8e},  //
-    {0x2c90, 0x2c90},  //
-    {0x2c92, 0x2c92},  //
-    {0x2c94, 0x2c94},  //
-    {0x2c96, 0x2c96},  //
-    {0x2c98, 0x2c98},  //
-    {0x2c9a, 0x2c9a},  //
-    {0x2c9c, 0x2c9c},  //
-    {0x2c9e, 0x2c9e},  //
-    {0x2ca0, 0x2ca0},  //
-    {0x2ca2, 0x2ca2},  //
-    {0x2ca4, 0x2ca4},  //
-    {0x2ca6, 0x2ca6},  //
-    {0x2ca8, 0x2ca8},  //
-    {0x2caa, 0x2caa},  //
-    {0x2cac, 0x2cac},  //
-    {0x2cae, 0x2cae},  //
-    {0x2cb0, 0x2cb0},  //
-    {0x2cb2, 0x2cb2},  //
-    {0x2cb4, 0x2cb4},  //
-    {0x2cb6, 0x2cb6},  //
-    {0x2cb8, 0x2cb8},  //
-    {0x2cba, 0x2cba},  //
-    {0x2cbc, 0x2cbc},  //
-    {0x2cbe, 0x2cbe},  //
-    {0x2cc0, 0x2cc0},  //
-    {0x2cc2, 0x2cc2},  //
-    {0x2cc4, 0x2cc4},  //
-    {0x2cc6, 0x2cc6},  //
-    {0x2cc8, 0x2cc8},  //
-    {0x2cca, 0x2cca},  //
-    {0x2ccc, 0x2ccc},  //
-    {0x2cce, 0x2cce},  //
-    {0x2cd0, 0x2cd0},  //
-    {0x2cd2, 0x2cd2},  //
-    {0x2cd4, 0x2cd4},  //
-    {0x2cd6, 0x2cd6},  //
-    {0x2cd8, 0x2cd8},  //
-    {0x2cda, 0x2cda},  //
-    {0x2cdc, 0x2cdc},  //
-    {0x2cde, 0x2cde},  //
-    {0x2ce0, 0x2ce0},  //
-    {0x2ce2, 0x2ce2},  //
-    {0x2ceb, 0x2ceb},  //
-    {0x2ced, 0x2ced},  //
-    {0x2cf2, 0x2cf2},  //
-    {0xa640, 0xa640},  //
-    {0xa642, 0xa642},  //
-    {0xa644, 0xa644},  //
-    {0xa646, 0xa646},  //
-    {0xa648, 0xa648},  //
-    {0xa64a, 0xa64a},  //
-    {0xa64c, 0xa64c},  //
-    {0xa64e, 0xa64e},  //
-    {0xa650, 0xa650},  //
-    {0xa652, 0xa652},  //
-    {0xa654, 0xa654},  //
-    {0xa656, 0xa656},  //
-    {0xa658, 0xa658},  //
-    {0xa65a, 0xa65a},  //
-    {0xa65c, 0xa65c},  //
-    {0xa65e, 0xa65e},  //
-    {0xa660, 0xa660},  //
-    {0xa662, 0xa662},  //
-    {0xa664, 0xa664},  //
-    {0xa666, 0xa666},  //
-    {0xa668, 0xa668},  //
-    {0xa66a, 0xa66a},  //
-    {0xa66c, 0xa66c},  //
-    {0xa680, 0xa680},  //
-    {0xa682, 0xa682},  //
-    {0xa684, 0xa684},  //
-    {0xa686, 0xa686},  //
-    {0xa688, 0xa688},  //
-    {0xa68a, 0xa68a},  //
-    {0xa68c, 0xa68c},  //
-    {0xa68e, 0xa68e},  //
-    {0xa690, 0xa690},  //
-    {0xa692, 0xa692},  //
-    {0xa694, 0xa694},  //
-    {0xa696, 0xa696},  //
-    {0xa698, 0xa698},  //
-    {0xa69a, 0xa69a},  //
-    {0xa722, 0xa722},  //
-    {0xa724, 0xa724},  //
-    {0xa726, 0xa726},  //
-    {0xa728, 0xa728},  //
-    {0xa72a, 0xa72a},  //
-    {0xa72c, 0xa72c},  //
-    {0xa72e, 0xa72e},  //
-    {0xa732, 0xa732},  //
-    {0xa734, 0xa734},  //
-    {0xa736, 0xa736},  //
-    {0xa738, 0xa738},  //
-    {0xa73a, 0xa73a},  //
-    {0xa73c, 0xa73c},  //
-    {0xa73e, 0xa73e},  //
-    {0xa740, 0xa740},  //
-    {0xa742, 0xa742},  //
-    {0xa744, 0xa744},  //
-    {0xa746, 0xa746},  //
-    {0xa748, 0xa748},  //
-    {0xa74a, 0xa74a},  //
-    {0xa74c, 0xa74c},  //
-    {0xa74e, 0xa74e},  //
-    {0xa750, 0xa750},  //
-    {0xa752, 0xa752},  //
-    {0xa754, 0xa754},  //
-    {0xa756, 0xa756},  //
-    {0xa758, 0xa758},  //
-    {0xa75a, 0xa75a},  //
-    {0xa75c, 0xa75c},  //
-    {0xa75e, 0xa75e},  //
-    {0xa760, 0xa760},  //
-    {0xa762, 0xa762},  //
-    {0xa764, 0xa764},  //
-    {0xa766, 0xa766},  //
-    {0xa768, 0xa768},  //
-    {0xa76a, 0xa76a},  //
-    {0xa76c, 0xa76c},  //
-    {0xa76e, 0xa76e},  //
-    {0xa779, 0xa779},  //
-    {0xa77b, 0xa77b},  //
-    {0xa77d, 0xa77e},  //
-    {0xa780, 0xa780},  //
-    {0xa782, 0xa782},  //
-    {0xa784, 0xa784},  //
-    {0xa786, 0xa786},  //
-    {0xa78b, 0xa78b},  //
-    {0xa78d, 0xa78d},  //
-    {0xa790, 0xa790},  //
-    {0xa792, 0xa792},  //
-    {0xa796, 0xa796},  //
-    {0xa798, 0xa798},  //
-    {0xa79a, 0xa79a},  //
-    {0xa79c, 0xa79c},  //
-    {0xa79e, 0xa79e},  //
-    {0xa7a0, 0xa7a0},  //
-    {0xa7a2, 0xa7a2},  //
-    {0xa7a4, 0xa7a4},  //
-    {0xa7a6, 0xa7a6},  //
-    {0xa7a8, 0xa7a8},  //
-    {0xa7aa, 0xa7ae},  //
-    {0xa7b0, 0xa7b4},  //
-    {0xa7b6, 0xa7b6},  //
-    {0xa7b8, 0xa7b8},  //
-    {0xa7ba, 0xa7ba},  //
-    {0xa7bc, 0xa7bc},  //
-    {0xa7be, 0xa7be},  //
-    {0xa7c0, 0xa7c0},  //
-    {0xa7c2, 0xa7c2},  //
-    {0xa7c4, 0xa7c7},  //
-    {0xa7c9, 0xa7c9},  //
-    {0xa7d0, 0xa7d0},  //
-    {0xa7d6, 0xa7d6},  //
-    {0xa7d8, 0xa7d8},  //
-    {0xa7f5, 0xa7f5},  //
-    {0xff21, 0xff3a},  //
-};
-
-static const unsigned kUpperAstral[][2] = {
-    {0x10400, 0x10427},  //
-    {0x104b0, 0x104d3},  //
-    {0x10570, 0x1057a},  //
-    {0x1057c, 0x1058a},  //
-    {0x1058c, 0x10592},  //
-    {0x10594, 0x10595},  //
-    {0x10c80, 0x10cb2},  //
-    {0x118a0, 0x118bf},  //
-    {0x16e40, 0x16e5f},  //
-    {0x1d400, 0x1d419},  //
-    {0x1d434, 0x1d44d},  //
-    {0x1d468, 0x1d481},  //
-    {0x1d49c, 0x1d49c},  //
-    {0x1d49e, 0x1d49f},  //
-    {0x1d4a2, 0x1d4a2},  //
-    {0x1d4a5, 0x1d4a6},  //
-    {0x1d4a9, 0x1d4ac},  //
-    {0x1d4ae, 0x1d4b5},  //
-    {0x1d4d0, 0x1d4e9},  //
-    {0x1d504, 0x1d505},  //
-    {0x1d507, 0x1d50a},  //
-    {0x1d50d, 0x1d514},  //
-    {0x1d516, 0x1d51c},  //
-    {0x1d538, 0x1d539},  //
-    {0x1d53b, 0x1d53e},  //
-    {0x1d540, 0x1d544},  //
-    {0x1d546, 0x1d546},  //
-    {0x1d54a, 0x1d550},  //
-    {0x1d56c, 0x1d585},  //
-    {0x1d5a0, 0x1d5b9},  //
-    {0x1d5d4, 0x1d5ed},  //
-    {0x1d608, 0x1d621},  //
-    {0x1d63c, 0x1d655},  //
-    {0x1d670, 0x1d689},  //
-    {0x1d6a8, 0x1d6c0},  //
-    {0x1d6e2, 0x1d6fa},  //
-    {0x1d71c, 0x1d734},  //
-    {0x1d756, 0x1d76e},  //
-    {0x1d790, 0x1d7a8},  //
-    {0x1d7ca, 0x1d7ca},  //
-    {0x1e900, 0x1e921},  //
-    {0x1f130, 0x1f149},  //
-    {0x1f150, 0x1f169},  //
-    {0x1f170, 0x1f189},  //
-};
-
-/**
- * Returns nonzero if c is uppercase letter.
- */
-int iswupper(wint_t c) {
-  if (!IsTiny() && c < 128)
-    return 'A' <= c && c <= 'Z';
-  if (c < 65536)
-    return has_char(kUpper, ARRAYLEN(kUpper), (unsigned short)c);
-  return has_char(kUpperAstral, ARRAYLEN(kUpperAstral), (unsigned)c);
-}
-
-__weak_reference(iswupper, iswupper_l);
diff --git a/libc/str/iswxdigit.c b/libc/str/iswxdigit.c
index cccf9e4de..75e4347f2 100644
--- a/libc/str/iswxdigit.c
+++ b/libc/str/iswxdigit.c
@@ -22,8 +22,7 @@
  * Returns nonzero if c is ascii hex digit.
  */
 int iswxdigit(wint_t c) {
-  return ('0' <= c && c <= '9') ||  //
-         ('A' <= c && c <= 'F') ||  //
+  return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F') ||
          ('a' <= c && c <= 'f');
 }
 
diff --git a/libc/str/isxdigit.c b/libc/str/isxdigit.c
index a5f325698..03af7c9cc 100644
--- a/libc/str/isxdigit.c
+++ b/libc/str/isxdigit.c
@@ -22,8 +22,7 @@
  * Returns true if c is hexadecimal digit.
  */
 int isxdigit(int c) {
-  return ('0' <= c && c <= '9') ||  //
-         ('A' <= c && c <= 'F') ||  //
+  return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F') ||
          ('a' <= c && c <= 'f');
 }
 
diff --git a/libc/str/iszipeocd32.c b/libc/str/iszipeocd32.c
index a516b0f21..883bf93db 100644
--- a/libc/str/iszipeocd32.c
+++ b/libc/str/iszipeocd32.c
@@ -17,30 +17,39 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/stdckdint.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Determines if ZIP EOCD record seems legit.
  */
 int IsZipEocd32(const uint8_t *p, size_t n, size_t i) {
   size_t offset;
-  if (i > n || n - i < kZipCdirHdrMinSize)
+  if (i > n || n - i < kZipCdirHdrMinSize) {
     return kZipErrorEocdOffsetOverflow;
-  if (ZIP_READ32(p + i) != kZipCdirHdrMagic)
+  }
+  if (ZIP_READ32(p + i) != kZipCdirHdrMagic) {
     return kZipErrorEocdMagicNotFound;
-  if (i + ZIP_CDIR_HDRSIZE(p + i) > n)
+  }
+  if (i + ZIP_CDIR_HDRSIZE(p + i) > n) {
     return kZipErrorEocdSizeOverflow;
-  if (ZIP_CDIR_DISK(p + i) != ZIP_CDIR_STARTINGDISK(p + i))
+  }
+  if (ZIP_CDIR_DISK(p + i) != ZIP_CDIR_STARTINGDISK(p + i)) {
     return kZipErrorEocdDiskMismatch;
-  if (ZIP_CDIR_RECORDSONDISK(p + i) != ZIP_CDIR_RECORDS(p + i))
+  }
+  if (ZIP_CDIR_RECORDSONDISK(p + i) != ZIP_CDIR_RECORDS(p + i)) {
     return kZipErrorEocdRecordsMismatch;
-  if (ZIP_CDIR_RECORDS(p + i) * kZipCfileHdrMinSize > ZIP_CDIR_SIZE(p + i))
+  }
+  if (ZIP_CDIR_RECORDS(p + i) * kZipCfileHdrMinSize > ZIP_CDIR_SIZE(p + i)) {
     return kZipErrorEocdRecordsOverflow;
-  if (ZIP_CDIR_OFFSET(p + i) == 0xFFFFFFFFu)
+  }
+  if (ZIP_CDIR_OFFSET(p + i) == 0xFFFFFFFFu) {
     return kZipErrorEocdRecordsOverflow;
-  if (ckd_add(&offset, ZIP_CDIR_OFFSET(p + i), ZIP_CDIR_SIZE(p + i)))
+  }
+  if (ckd_add(&offset, ZIP_CDIR_OFFSET(p + i), ZIP_CDIR_SIZE(p + i))) {
     return kZipErrorEocdOffsetSizeOverflow;
-  if (offset > i)
+  }
+  if (offset > i) {
     return kZipErrorCdirOffsetPastEocd;
+  }
   return kZipOk;
 }
diff --git a/libc/str/iszipeocd64.c b/libc/str/iszipeocd64.c
index ff29795d6..cbc37af29 100644
--- a/libc/str/iszipeocd64.c
+++ b/libc/str/iszipeocd64.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/stdckdint.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 /**
  * Returns kZipOk if zip64 end of central directory header seems legit.
diff --git a/libc/str/khextoint.c b/libc/str/khextoint.c
index fd30e72cf..53e2093d0 100644
--- a/libc/str/khextoint.c
+++ b/libc/str/khextoint.c
@@ -16,8 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdalign.h"
-#include "libc/str/tab.h"
+#include "libc/stdalign.internal.h"
+#include "libc/str/tab.internal.h"
 
 alignas(int8_t) const int8_t kHexToInt[256] = {
     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x00
diff --git a/libc/str/kmp.c b/libc/str/kmp.c
deleted file mode 100644
index d904a4378..000000000
--- a/libc/str/kmp.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/kmp.h"
-#include "libc/mem/alloca.h"
-#include "libc/runtime/stack.h"
-
-static void computeLPS(const char *pattern, long M, long *lps) {
-  long len = 0;
-  lps[0] = 0;
-  long i = 1;
-  while (i < M) {
-    if (pattern[i] == pattern[len]) {
-      len++;
-      lps[i] = len;
-      i++;
-    } else {
-      if (len != 0) {
-        len = lps[len - 1];
-      } else {
-        lps[i] = 0;
-        i++;
-      }
-    }
-  }
-}
-
-char *__memmem_kmp(const char *s, size_t n, const char *ss, size_t m) {
-  if (!m)
-    return (char *)s;
-  if (n < m)
-    return NULL;
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
-  long need = sizeof(long) * m;
-  long *lps = (long *)alloca(need);
-  CheckLargeStackAllocation(lps, need);
-#pragma GCC pop_options
-  computeLPS(ss, m, lps);
-  long i = 0;
-  long j = 0;
-  while (i < n) {
-    if (ss[j] == s[i]) {
-      i++;
-      j++;
-    }
-    if (j == m) {
-      return (char *)(s + i - j);
-    } else if (i < n && ss[j] != s[i]) {
-      if (j != 0) {
-        j = lps[j - 1];
-      } else {
-        i++;
-      }
-    }
-  }
-  return NULL;
-}
diff --git a/libc/str/kmp.h b/libc/str/kmp.h
deleted file mode 100644
index 5c5a85736..000000000
--- a/libc/str/kmp.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_STR_KMP_H_
-#define COSMOPOLITAN_LIBC_STR_KMP_H_
-COSMOPOLITAN_C_START_
-
-char *__memmem_kmp(const char *, size_t, const char *, size_t);
-char16_t *__memmem_kmp16(const char16_t *, size_t, const char16_t *, size_t);
-wchar_t *__memmem_kmp32(const wchar_t *, size_t, const wchar_t *, size_t);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_STR_KMP_H_ */
diff --git a/libc/str/kmp16.c b/libc/str/kmp16.c
deleted file mode 100644
index 0e30f57ad..000000000
--- a/libc/str/kmp16.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/mem/alloca.h"
-#include "libc/runtime/stack.h"
-#include "libc/str/kmp.h"
-
-static void computeLPS(const char16_t *pattern, long M, long *lps) {
-  long len = 0;
-  lps[0] = 0;
-  long i = 1;
-  while (i < M) {
-    if (pattern[i] == pattern[len]) {
-      len++;
-      lps[i] = len;
-      i++;
-    } else {
-      if (len != 0) {
-        len = lps[len - 1];
-      } else {
-        lps[i] = 0;
-        i++;
-      }
-    }
-  }
-}
-
-char16_t *__memmem_kmp16(const char16_t *s, size_t n, const char16_t *ss,
-                         size_t m) {
-  if (!m)
-    return (char16_t *)s;
-  if (n < m)
-    return NULL;
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
-  long need = sizeof(long) * m;
-  long *lps = (long *)alloca(need);
-  CheckLargeStackAllocation(lps, need);
-#pragma GCC pop_options
-  computeLPS(ss, m, lps);
-  long i = 0;
-  long j = 0;
-  while (i < n) {
-    if (ss[j] == s[i]) {
-      i++;
-      j++;
-    }
-    if (j == m) {
-      return (char16_t *)(s + i - j);
-    } else if (i < n && ss[j] != s[i]) {
-      if (j != 0) {
-        j = lps[j - 1];
-      } else {
-        i++;
-      }
-    }
-  }
-  return NULL;
-}
diff --git a/libc/str/kmp32.c b/libc/str/kmp32.c
deleted file mode 100644
index efd1a07a8..000000000
--- a/libc/str/kmp32.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/mem/alloca.h"
-#include "libc/runtime/stack.h"
-#include "libc/str/kmp.h"
-
-static void computeLPS(const wchar_t *pattern, long M, long *lps) {
-  long len = 0;
-  lps[0] = 0;
-  long i = 1;
-  while (i < M) {
-    if (pattern[i] == pattern[len]) {
-      len++;
-      lps[i] = len;
-      i++;
-    } else {
-      if (len != 0) {
-        len = lps[len - 1];
-      } else {
-        lps[i] = 0;
-        i++;
-      }
-    }
-  }
-}
-
-wchar_t *__memmem_kmp32(const wchar_t *s, size_t n, const wchar_t *ss,
-                        size_t m) {
-  if (!m)
-    return (wchar_t *)s;
-  if (n < m)
-    return NULL;
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
-  long need = sizeof(long) * m;
-  long *lps = (long *)alloca(need);
-  CheckLargeStackAllocation(lps, need);
-#pragma GCC pop_options
-  computeLPS(ss, m, lps);
-  long i = 0;
-  long j = 0;
-  while (i < n) {
-    if (ss[j] == s[i]) {
-      i++;
-      j++;
-    }
-    if (j == m) {
-      return (wchar_t *)(s + i - j);
-    } else if (i < n && ss[j] != s[i]) {
-      if (j != 0) {
-        j = lps[j - 1];
-      } else {
-        i++;
-      }
-    }
-  }
-  return NULL;
-}
diff --git a/libc/str/kx86processormodels.c b/libc/str/kx86processormodels.c
index ba055d84b..d5f62ea3c 100644
--- a/libc/str/kx86processormodels.c
+++ b/libc/str/kx86processormodels.c
@@ -16,10 +16,11 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/x86info.h"
 
 const struct X86ProcessorModel kX86ProcessorModels[] = {
+    /* <SORTED> */
     {0x060F, X86_MARCH_CORE2, X86_GRADE_CLIENT},
     {0x0616, X86_MARCH_CORE2, X86_GRADE_MOBILE},
     {0x0617, X86_MARCH_CORE2, X86_GRADE_SERVER},
@@ -84,5 +85,7 @@ const struct X86ProcessorModel kX86ProcessorModels[] = {
     {0x06A7, X86_MARCH_ROCKETLAKE, X86_GRADE_CLIENT},
     {0x06B7, X86_MARCH_RAPTORLAKE, X86_GRADE_CLIENT},
     {0x06BA, X86_MARCH_RAPTORLAKE, X86_GRADE_CLIENT},
-    {0},
+    /* </SORTED> */
 };
+
+const size_t kX86ProcessorModelCount = ARRAYLEN(kX86ProcessorModels);
diff --git a/third_party/musl/langinfo.c b/libc/str/langinfo.c
similarity index 93%
rename from third_party/musl/langinfo.c
rename to libc/str/langinfo.c
index 5dfa38271..fe88cd54a 100644
--- a/third_party/musl/langinfo.c
+++ b/libc/str/langinfo.c
@@ -25,12 +25,14 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <locale.h>
-#include <langinfo.h>
-#include "libc/intrin/kprintf.h"
-#include "libc/str/locale.internal.h"
+#include "libc/str/langinfo.h"
+#include "libc/str/locale.h"
+#include "libc/str/nltypes.h"
+#include "libc/thread/tls.h"
 __static_yoink("musl_libc_notice");
 
+// clang-format off
+
 static const char c_time[] =
 	"Sun\0" "Mon\0" "Tue\0" "Wed\0" "Thu\0" "Fri\0" "Sat\0"
 	"Sunday\0" "Monday\0" "Tuesday\0" "Wednesday\0"
@@ -61,6 +63,9 @@ char *nl_langinfo_l(nl_item item, locale_t loc)
 	int idx = item & 65535;
 	const char *str;
 
+	if (!loc)
+		return "";
+
 	if (item == CODESET) return loc->cat[LC_CTYPE] ? "UTF-8" : "ASCII";
 
 	/* _NL_LOCALE_NAME extension */
@@ -89,11 +94,11 @@ char *nl_langinfo_l(nl_item item, locale_t loc)
 	}
 
 	for (; idx; idx--, str++) for (; *str; str++);
-	if (cat != LC_NUMERIC && *str) str = LCTRANS(str, cat, loc);
+	// if (cat != LC_NUMERIC && *str) str = LCTRANS(str, cat, loc);
 	return (char *)str;
 }
 
 char *nl_langinfo(nl_item item)
 {
-	return nl_langinfo_l(item, CURRENT_LOCALE);
+	return nl_langinfo_l(item, (locale_t)__get_tls()->tib_locale);
 }
diff --git a/libc/str/langinfo.h b/libc/str/langinfo.h
index 5427efa4d..edb8ce75e 100644
--- a/libc/str/langinfo.h
+++ b/libc/str/langinfo.h
@@ -1,7 +1,5 @@
 #ifndef COSMOPOLITAN_LIBC_STR_LANGINFO_H_
 #define COSMOPOLITAN_LIBC_STR_LANGINFO_H_
-#include "libc/str/locale.h"
-#include "libc/str/nltypes.h"
 COSMOPOLITAN_C_START_
 
 #define ABDAY_1 0x20000
@@ -80,8 +78,7 @@ COSMOPOLITAN_C_START_
 #define NOSTR  0x50003
 #endif
 
-char *nl_langinfo(nl_item);
-char *nl_langinfo_l(nl_item, locale_t);
+char *nl_langinfo(int);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_STR_LANGINFO_H_ */
diff --git a/libc/str/locale.c b/libc/str/locale.c
index d818e76d4..8db705895 100644
--- a/libc/str/locale.c
+++ b/libc/str/locale.c
@@ -16,12 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/locale.internal.h"
+#include "libc/str/locale.h"
 #include "libc/str/str.h"
 
-static const uint32_t empty_mo[] = {
-    0x950412de, 0, -1, -1, -1,
-};
+static const uint32_t empty_mo[] = {0x950412de, 0, -1, -1, -1};
 
 const struct __locale_map __c_dot_utf8 = {
     .map = empty_mo,
@@ -29,11 +27,8 @@ const struct __locale_map __c_dot_utf8 = {
     .name = "C.UTF-8",
 };
 
-const struct __locale_struct __c_locale = {0};
+const struct __locale_struct __c_locale;
+
 const struct __locale_struct __c_dot_utf8_locale = {
     .cat[LC_CTYPE] = &__c_dot_utf8,
 };
-
-struct __locale_struct __global_locale;
-
-pthread_mutex_t __locale_lock = PTHREAD_MUTEX_INITIALIZER;
diff --git a/libc/str/locale.h b/libc/str/locale.h
index 2b772639f..ec66e45dd 100644
--- a/libc/str/locale.h
+++ b/libc/str/locale.h
@@ -17,13 +17,29 @@
 #define LC_MONETARY_MASK 16
 #define LC_MESSAGES_MASK 32
 #define LC_ALL_MASK      0x1fbf
+#define LOCALE_NAME_MAX  23
 
 COSMOPOLITAN_C_START_
 
 #define LC_GLOBAL_LOCALE ((locale_t) - 1)
 
+struct __locale_map {
+  const void *map;
+  size_t map_size;
+  char name[LOCALE_NAME_MAX + 1];
+  const struct __locale_map *next;
+};
+
+struct __locale_struct {
+  const struct __locale_map *cat[6];
+};
+
 typedef struct __locale_struct *locale_t;
 
+extern const struct __locale_map __c_dot_utf8;
+extern const struct __locale_struct __c_locale;
+extern const struct __locale_struct __c_dot_utf8_locale;
+
 char *nl_langinfo_l(int, locale_t) libcesque;
 char *setlocale(int, const char *) libcesque;
 double strtod_l(const char *, char **, locale_t) libcesque;
diff --git a/libc/str/locale.internal.h b/libc/str/locale.internal.h
deleted file mode 100644
index e6dad969c..000000000
--- a/libc/str/locale.internal.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_STR_LOCALE_INTERNAL_H_
-#define COSMOPOLITAN_LIBC_STR_LOCALE_INTERNAL_H_
-#include "libc/limits.h"
-#include "libc/str/locale.h"
-#include "libc/thread/posixthread.internal.h"
-COSMOPOLITAN_C_START_
-
-#define LOCALE_NAME_MAX 23
-
-struct __locale_map {
-  const void *map;
-  size_t map_size;
-  char name[LOCALE_NAME_MAX + 1];
-  const struct __locale_map *next;
-};
-
-struct __locale_struct {
-  const struct __locale_map *cat[6];
-};
-
-extern pthread_mutex_t __locale_lock;
-
-extern struct __locale_struct __global_locale;
-extern const struct __locale_map __c_dot_utf8;
-extern const struct __locale_struct __c_locale;
-extern const struct __locale_struct __c_dot_utf8_locale;
-
-const struct __locale_map *__get_locale(int, const char *);
-const char *__mo_lookup(const void *, size_t, const char *);
-const char *__lctrans(const char *, const struct __locale_map *);
-const char *__lctrans_cur(const char *);
-const char *__lctrans_impl(const char *, const struct __locale_map *);
-int __loc_is_allocated(locale_t);
-char *__gettextdomain(void);
-
-#define LOC_MAP_FAILED ((const struct __locale_map *)-1)
-
-#define LCTRANS(msg, lc, loc) __lctrans(msg, (loc)->cat[(lc)])
-#define LCTRANS_CUR(msg)      __lctrans_cur(msg)
-
-#define C_LOCALE    ((locale_t) & __c_locale)
-#define UTF8_LOCALE ((locale_t) & __c_dot_utf8_locale)
-
-#define CURRENT_LOCALE _pthread_self()->pt_locale
-
-#define CURRENT_UTF8 (!!_pthread_self()->pt_locale->cat[LC_CTYPE])
-
-#undef MB_CUR_MAX
-#define MB_CUR_MAX (CURRENT_UTF8 ? 4 : 1)
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_STR_LOCALE_INTERNAL_H_ */
diff --git a/third_party/musl/wcsrtombs.c b/libc/str/mb.c
similarity index 69%
rename from third_party/musl/wcsrtombs.c
rename to libc/str/mb.c
index cb8bd206e..98cbf47dd 100644
--- a/third_party/musl/wcsrtombs.c
+++ b/libc/str/mb.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,59 +25,28 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wchar.h>
+#include "libc/str/mb.internal.h"
 __static_yoink("musl_libc_notice");
 
-size_t wcsrtombs(char *restrict s, const wchar_t **restrict ws, size_t n, mbstate_t *restrict st)
-{
-	const wchar_t *ws2;
-	char buf[4];
-	size_t N = n, l;
-	if (!s) {
-		for (n=0, ws2=*ws; *ws2; ws2++) {
-			if (*ws2 >= 0x80u) {
-				l = wcrtomb(buf, *ws2, 0);
-				if (!(l+1)) return -1;
-				n += l;
-			} else n++;
-		}
-		return n;
-	}
-	while (n>=4) {
-		if (**ws-1u >= 0x7fu) {
-			if (!**ws) {
-				*s = 0;
-				*ws = 0;
-				return N-n;
-			}
-			l = wcrtomb(s, **ws, 0);
-			if (!(l+1)) return -1;
-			s += l;
-			n -= l;
-		} else {
-			*s++ = **ws;
-			n--;
-		}
-		(*ws)++;
-	}
-	while (n) {
-		if (**ws-1u >= 0x7fu) {
-			if (!**ws) {
-				*s = 0;
-				*ws = 0;
-				return N-n;
-			}
-			l = wcrtomb(buf, **ws, 0);
-			if (!(l+1)) return -1;
-			if (l>n) return N-n;
-			wcrtomb(s, **ws, 0);
-			s += l;
-			n -= l;
-		} else {
-			*s++ = **ws;
-			n--;
-		}
-		(*ws)++;
-	}
-	return N;
-}
+#define C(x) (x < 2 ? -1 : (R(0x80, 0xc0) | x))
+#define D(x) C((x + 16))
+#define E(x)                      \
+  ((x == 0     ? R(0xa0, 0xc0)    \
+    : x == 0xd ? R(0x80, 0xa0)    \
+               : R(0x80, 0xc0)) | \
+   (R(0x80, 0xc0) >> 6) | x)
+#define F(x)                    \
+  ((x >= 5   ? 0                \
+    : x == 0 ? R(0x90, 0xc0)    \
+    : x == 4 ? R(0x80, 0x90)    \
+             : R(0x80, 0xc0)) | \
+   (R(0x80, 0xc0) >> 6) | (R(0x80, 0xc0) >> 12) | x)
+
+const uint32_t kMbBittab[51 /* ?! */] = {
+    C(0x2), C(0x3), C(0x4), C(0x5), C(0x6), C(0x7), C(0x8), C(0x9), C(0xa),
+    C(0xb), C(0xc), C(0xd), C(0xe), C(0xf), D(0x0), D(0x1), D(0x2), D(0x3),
+    D(0x4), D(0x5), D(0x6), D(0x7), D(0x8), D(0x9), D(0xa), D(0xb), D(0xc),
+    D(0xd), D(0xe), D(0xf), E(0x0), E(0x1), E(0x2), E(0x3), E(0x4), E(0x5),
+    E(0x6), E(0x7), E(0x8), E(0x9), E(0xa), E(0xb), E(0xc), E(0xd), E(0xe),
+    E(0xf), F(0x0), F(0x1), F(0x2), F(0x3), F(0x4),
+};
diff --git a/libc/str/mb.internal.h b/libc/str/mb.internal.h
new file mode 100644
index 000000000..d5f2748c6
--- /dev/null
+++ b/libc/str/mb.internal.h
@@ -0,0 +1,17 @@
+#ifndef COSMOPOLITAN_LIBC_STR_MB_INTERNAL_H_
+#define COSMOPOLITAN_LIBC_STR_MB_INTERNAL_H_
+COSMOPOLITAN_C_START_
+
+#define SA             0xc2u
+#define SB             0xf4u
+#define CODEUNIT(c)    (0xdfff & (signed char)(c))
+#define IS_CODEUNIT(c) ((unsigned)(c) - 0xdf80 < 0x80)
+#define R(a, b)        ((uint32_t)((a == 0x80 ? 0x40u - b : 0u - a) << 23))
+#define FAILSTATE      R(0x80, 0x80)
+#define OOB(c, b) \
+  (((((b) >> 3) - 0x10) | (((b) >> 3) + ((int32_t)(c) >> 26))) & ~7)
+
+extern const uint32_t kMbBittab[51];
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_STR_MB_INTERNAL_H_ */
diff --git a/libc/str/mblen.c b/libc/str/mblen.c
new file mode 100644
index 000000000..807cfbd51
--- /dev/null
+++ b/libc/str/mblen.c
@@ -0,0 +1,23 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
+
+int mblen(const char *s, size_t n) {
+  return mbtowc(0, s, n);
+}
diff --git a/libc/str/mbrlen.c b/libc/str/mbrlen.c
new file mode 100644
index 000000000..4f20fe1a0
--- /dev/null
+++ b/libc/str/mbrlen.c
@@ -0,0 +1,26 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
+
+size_t mbrlen(const char *s, size_t n, mbstate_t *t) {
+  static mbstate_t ss;
+  if (!t)
+    t = &ss;
+  return mbrtowc(0, s, n, t);
+}
diff --git a/third_party/musl/freelocale.c b/libc/str/mbrtoc16.c
similarity index 72%
rename from third_party/musl/freelocale.c
rename to libc/str/mbrtoc16.c
index 341afa33b..492ae68b6 100644
--- a/third_party/musl/freelocale.c
+++ b/libc/str/mbrtoc16.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,17 +25,36 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/runtime/runtime.h"
+#include "libc/calls/calls.h"
+#include "libc/limits.h"
+#include "libc/str/mb.internal.h"
 #include "libc/str/str.h"
-#include "libc/str/locale.internal.h"
 __static_yoink("musl_libc_notice");
 
-#define malloc undef
-#define calloc undef
-#define realloc undef
-#define free(p) munmap(p, sizeof(struct __locale_struct))
-
-void freelocale(locale_t l)
-{
-	if (__loc_is_allocated(l)) free(l);
+size_t mbrtoc16(char16_t *pc16, const char *s, size_t n, mbstate_t *ps) {
+  static unsigned internal_state;
+  if (!ps)
+    ps = (void *)&internal_state;
+  unsigned *pending = (unsigned *)ps;
+  if (!s)
+    return mbrtoc16(0, "", 1, ps);
+  /* mbrtowc states for partial UTF-8 characters have the high bit set;
+   * we use nonzero states without high bit for pending surrogates. */
+  if ((int)*pending > 0) {
+    if (pc16)
+      *pc16 = *pending;
+    *pending = 0;
+    return -3;
+  }
+  wchar_t wc;
+  size_t ret = mbrtowc(&wc, s, n, ps);
+  if (ret <= 4) {
+    if (wc >= 0x10000) {
+      *pending = (wc & 0x3ff) + 0xdc00;
+      wc = 0xd7c0 + (wc >> 10);
+    }
+    if (pc16)
+      *pc16 = wc;
+  }
+  return ret;
 }
diff --git a/third_party/musl/mbrtoc32.c b/libc/str/mbrtoc32.c
similarity index 81%
rename from third_party/musl/mbrtoc32.c
rename to libc/str/mbrtoc32.c
index df14183cd..535cb4f2f 100644
--- a/third_party/musl/mbrtoc32.c
+++ b/libc/str/mbrtoc32.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,17 +25,21 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <uchar.h>
-#include <wchar.h>
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-size_t mbrtoc32(char32_t *restrict pc32, const char *restrict s, size_t n, mbstate_t *restrict ps)
-{
-	static unsigned internal_state;
-	if (!ps) ps = (void *)&internal_state;
-	if (!s) return mbrtoc32(0, "", 1, ps);
-	wchar_t wc;
-	size_t ret = mbrtowc(&wc, s, n, ps);
-	if (ret <= 4 && pc32) *pc32 = wc;
-	return ret;
+size_t mbrtoc32(char32_t *pc32, const char *s, size_t n, mbstate_t *ps) {
+  static unsigned internal_state;
+  if (!ps)
+    ps = (void *)&internal_state;
+  if (!s)
+    return mbrtoc32(0, "", 1, ps);
+  wchar_t wc;
+  size_t ret = mbrtowc(&wc, s, n, ps);
+  if (ret <= 4 && pc32)
+    *pc32 = wc;
+  return ret;
 }
diff --git a/third_party/musl/mbrtowc.c b/libc/str/mbrtowc.c
similarity index 64%
rename from third_party/musl/mbrtowc.c
rename to libc/str/mbrtowc.c
index a221e3d7c..c62d98dfc 100644
--- a/third_party/musl/mbrtowc.c
+++ b/libc/str/mbrtowc.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,57 +25,65 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdlib.h>
-#include <wchar.h>
-#include <errno.h>
-#include "multibyte.h"
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/str/mb.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-#pragma GCC diagnostic ignored "-Wparentheses"
-
-size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate_t *restrict st)
-{
-	static unsigned internal_state;
-	unsigned c;
-	const unsigned char *s = (const void *)src;
-	const size_t N = n;
-	wchar_t dummy;
-
-	if (!st) st = (void *)&internal_state;
-	c = *(unsigned *)st;
-	
-	if (!s) {
-		if (c) goto ilseq;
-		return 0;
-	} else if (!wc) wc = &dummy;
-
-	if (!n) return -2;
-	if (!c) {
-		if (*s < 0x80) return !!(*wc = *s);
-		if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
-		if (*s-SA > SB-SA) goto ilseq;
-		c = bittab[*s++-SA]; n--;
-	}
-
-	if (n) {
-		if (OOB(c,*s)) goto ilseq;
-loop:
-		c = c<<6 | *s++-0x80; n--;
-		if (!(c&(1U<<31))) {
-			*(unsigned *)st = 0;
-			*wc = c;
-			return N-n;
-		}
-		if (n) {
-			if (*s-0x80u >= 0x40) goto ilseq;
-			goto loop;
-		}
-	}
-
-	*(unsigned *)st = c;
-	return -2;
+size_t mbrtowc(wchar_t *wc, const char *src, size_t n, mbstate_t *st) {
+  static unsigned internal_state;
+  long wut;
+  unsigned c;
+  const unsigned char *s = (const void *)src;
+  const unsigned N = n;
+  wchar_t dummy;
+  if (!st)
+    st = (void *)&internal_state;
+  c = *(unsigned *)st;
+  if (!s) {
+    if (c)
+      goto ilseq;
+    return 0;
+  } else if (!wc) {
+    wc = &dummy;
+  }
+  if (!n)
+    return -2;
+  if (!c) {
+    if (*s < 0x80)
+      return !!(*wc = *s);
+    if (MB_CUR_MAX == 1)
+      return (*wc = CODEUNIT(*s)), 1;
+    if (*s - SA > SB - SA)
+      goto ilseq;
+    wut = *s++ - SA;
+    wut = MAX(0, MIN(ARRAYLEN(kMbBittab) - 1, wut));
+    c = kMbBittab[wut];
+    n--;
+  }
+  if (n) {
+    if (OOB(c, *s))
+      goto ilseq;
+  loop:
+    c = c << 6 | (*s++ - 0x80);
+    n--;
+    if (!(c & (1U << 31))) {
+      *(unsigned *)st = 0;
+      *wc = c;
+      return N - n;
+    }
+    if (n) {
+      if (*s - 0x80u >= 0x40)
+        goto ilseq;
+      goto loop;
+    }
+  }
+  *(unsigned *)st = c;
+  return -2;
 ilseq:
-	*(unsigned *)st = 0;
-	errno = EILSEQ;
-	return -1;
+  *(unsigned *)st = 0;
+  errno = EILSEQ;
+  return -1;
 }
diff --git a/libc/intrin/personality.c b/libc/str/mbsinit.c
similarity index 92%
rename from libc/intrin/personality.c
rename to libc/str/mbsinit.c
index de4dce7a5..e7bac0d7f 100644
--- a/libc/intrin/personality.c
+++ b/libc/str/mbsinit.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,7 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
 
-__attribute__((__weak__)) void __gxx_personality_v0() {
-  __builtin_trap();
+int mbsinit(const mbstate_t *t) {
+  return !t || !*t;
 }
diff --git a/third_party/musl/mbsnrtowcs.c b/libc/str/mbsnrtowcs.c
similarity index 61%
rename from third_party/musl/mbsnrtowcs.c
rename to libc/str/mbsnrtowcs.c
index ffd778bc1..9884cc3a9 100644
--- a/third_party/musl/mbsnrtowcs.c
+++ b/libc/str/mbsnrtowcs.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,59 +25,68 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wchar.h>
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/str/mb.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-size_t mbsnrtowcs(wchar_t *restrict wcs, const char **restrict src, size_t n, size_t wn, mbstate_t *restrict st)
-{
-	size_t l, cnt=0, n2;
-	wchar_t *ws, wbuf[256];
-	const char *s = *src;
-	const char *tmp_s;
-
-	if (!wcs) ws = wbuf, wn = sizeof wbuf / sizeof *wbuf;
-	else ws = wcs;
-
-	/* making sure output buffer size is at most n/4 will ensure
-	 * that mbsrtowcs never reads more than n input bytes. thus
-	 * we can use mbsrtowcs as long as it's practical.. */
-
-	while ( s && wn && ( (n2=n/4)>=wn || n2>32 ) ) {
-		if (n2>=wn) n2=wn;
-		tmp_s = s;
-		l = mbsrtowcs(ws, &s, n2, st);
-		if (!(l+1)) {
-			cnt = l;
-			wn = 0;
-			break;
-		}
-		if (ws != wbuf) {
-			ws += l;
-			wn -= l;
-		}
-		n = s ? n - (s - tmp_s) : 0;
-		cnt += l;
-	}
-	if (s) while (wn && n) {
-		l = mbrtowc(ws, s, n, st);
-		if (l+2<=2) {
-			if (!(l+1)) {
-				cnt = l;
-				break;
-			}
-			if (!l) {
-				s = 0;
-				break;
-			}
-			/* have to roll back partial character */
-			*(unsigned *)st = 0;
-			break;
-		}
-		s += l; n -= l;
-		/* safe - this loop runs fewer than sizeof(wbuf)/8 times */
-		ws++; wn--;
-		cnt++;
-	}
-	if (wcs) *src = s;
-	return cnt;
+size_t mbsnrtowcs(wchar_t *wcs, const char **src, size_t n, size_t wn,
+                  mbstate_t *st) {
+  size_t l, cnt = 0, n2;
+  wchar_t *ws, wbuf[256];
+  const char *s = *src;
+  const char *tmp_s;
+  if (!wcs) {
+    ws = wbuf, wn = sizeof(wbuf) / sizeof(*wbuf);
+  } else {
+    ws = wcs;
+  }
+  /* making sure output buffer size is at most n/4 will ensure
+   * that mbsrtowcs never reads more than n input bytes. thus
+   * we can use mbsrtowcs as long as it's practical.. */
+  while (s && wn && ((n2 = n / 4) >= wn || n2 > 32)) {
+    if (n2 >= wn)
+      n2 = wn;
+    tmp_s = s;
+    l = mbsrtowcs(ws, &s, n2, st);
+    if (!(l + 1)) {
+      cnt = l;
+      wn = 0;
+      break;
+    }
+    if (ws != wbuf) {
+      ws += l;
+      wn -= l;
+    }
+    n = s ? n - (s - tmp_s) : 0;
+    cnt += l;
+  }
+  if (s)
+    while (wn && n) {
+      l = mbrtowc(ws, s, n, st);
+      if (l + 2 <= 2) {
+        if (!(l + 1)) {
+          cnt = l;
+          break;
+        }
+        if (!l) {
+          s = 0;
+          break;
+        }
+        /* have to roll back partial character */
+        *(unsigned *)st = 0;
+        break;
+      }
+      s += l;
+      n -= l;
+      /* safe - this loop runs fewer than sizeof(wbuf)/8 times */
+      ws++;
+      wn--;
+      cnt++;
+    }
+  if (wcs)
+    *src = s;
+  return cnt;
 }
diff --git a/third_party/musl/newlocale.c b/libc/str/mbsrtowcs.c
similarity index 51%
rename from third_party/musl/newlocale.c
rename to libc/str/mbsrtowcs.c
index fb63d7410..eaebe234c 100644
--- a/third_party/musl/newlocale.c
+++ b/libc/str/mbsrtowcs.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,70 +25,123 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/runtime/runtime.h"
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/str/mb.internal.h"
 #include "libc/str/str.h"
-#include "libc/str/locale.internal.h"
 __static_yoink("musl_libc_notice");
 
-#define malloc _mapanon
-#define calloc undef
-#define realloc undef
-#define free undef
-
-static int default_locale_init_done;
-static struct __locale_struct default_locale, default_ctype_locale;
-
-int __loc_is_allocated(locale_t loc)
-{
-	return loc && loc != C_LOCALE && loc != UTF8_LOCALE
-		&& loc != &default_locale && loc != &default_ctype_locale;
-}
-
-static locale_t do_newlocale(int mask, const char *name, locale_t loc)
-{
-	struct __locale_struct tmp;
-
-	for (int i=0; i<LC_ALL; i++) {
-		tmp.cat[i] = (!(mask & (1<<i)) && loc) ? loc->cat[i] :
-			__get_locale(i, (mask & (1<<i)) ? name : "");
-		if (tmp.cat[i] == LOC_MAP_FAILED)
-			return 0;
-	}
-
-	/* For locales with allocated storage, modify in-place. */
-	if (__loc_is_allocated(loc)) {
-		*loc = tmp;
-		return loc;
-	}
-
-	/* Otherwise, first see if we can use one of the builtin locales.
-	 * This makes the common usage case for newlocale, getting a C locale
-	 * with predictable behavior, very fast, and more importantly, fail-safe. */
-	if (!memcmp(&tmp, C_LOCALE, sizeof tmp)) return C_LOCALE;
-	if (!memcmp(&tmp, UTF8_LOCALE, sizeof tmp)) return UTF8_LOCALE;
-
-	/* And provide builtins for the initial default locale, and a
-	 * variant of the C locale honoring the default locale's encoding. */
-	if (!default_locale_init_done) {
-		for (int i=0; i<LC_ALL; i++)
-			default_locale.cat[i] = __get_locale(i, "");
-		default_ctype_locale.cat[LC_CTYPE] = default_locale.cat[LC_CTYPE];
-		default_locale_init_done = 1;
-	}
-	if (!memcmp(&tmp, &default_locale, sizeof tmp)) return &default_locale;
-	if (!memcmp(&tmp, &default_ctype_locale, sizeof tmp))
-		return &default_ctype_locale;
-
-	/* If no builtin locale matched, attempt to allocate and copy. */
-	if ((loc = malloc(sizeof *loc))) *loc = tmp;
-
-	return loc;
-}
-
-locale_t newlocale(int mask, const char *name, locale_t loc)
-{
-	pthread_mutex_lock(&__locale_lock);
-	loc = do_newlocale(mask, name, loc);
-	pthread_mutex_unlock(&__locale_lock);
-	return loc;
+size_t mbsrtowcs(wchar_t *ws, const char **src, size_t wn, mbstate_t *st) {
+  const unsigned char *s = (const void *)*src;
+  size_t wn0 = wn;
+  unsigned c = 0;
+  if (st && (c = *(unsigned *)st)) {
+    if (ws) {
+      *(unsigned *)st = 0;
+      goto resume;
+    } else {
+      goto resume0;
+    }
+  }
+  if (MB_CUR_MAX == 1) {
+    if (!ws)
+      return strlen((const char *)s);
+    for (;;) {
+      if (!wn) {
+        *src = (const void *)s;
+        return wn0;
+      }
+      if (!*s)
+        break;
+      c = *s++;
+      *ws++ = CODEUNIT(c);
+      wn--;
+    }
+    *ws = 0;
+    *src = 0;
+    return wn0 - wn;
+  }
+  if (!ws)
+    for (;;) {
+      if (*s - 1u < 0x7f) {
+        s++;
+        wn--;
+        continue;
+      }
+      if (*s - SA > SB - SA)
+        break;
+      c = kMbBittab[*s++ - SA];
+    resume0:
+      if (OOB(c, *s)) {
+        s--;
+        break;
+      }
+      s++;
+      if (c & (1U << 25)) {
+        if (*s - 0x80u >= 0x40) {
+          s -= 2;
+          break;
+        }
+        s++;
+        if (c & (1U << 19)) {
+          if (*s - 0x80u >= 0x40) {
+            s -= 3;
+            break;
+          }
+          s++;
+        }
+      }
+      wn--;
+      c = 0;
+    }
+  else
+    for (;;) {
+      if (!wn) {
+        *src = (const void *)s;
+        return wn0;
+      }
+      if (*s - 1u < 0x7f) {
+        *ws++ = *s++;
+        wn--;
+        continue;
+      }
+      if (*s - SA > SB - SA)
+        break;
+      c = kMbBittab[*s++ - SA];
+    resume:
+      if (OOB(c, *s)) {
+        s--;
+        break;
+      }
+      c = (c << 6) | (*s++ - 0x80);
+      if (c & (1U << 31)) {
+        if (*s - 0x80u >= 0x40) {
+          s -= 2;
+          break;
+        }
+        c = (c << 6) | (*s++ - 0x80);
+        if (c & (1U << 31)) {
+          if (*s - 0x80u >= 0x40) {
+            s -= 3;
+            break;
+          }
+          c = (c << 6) | (*s++ - 0x80);
+        }
+      }
+      *ws++ = c;
+      wn--;
+      c = 0;
+    }
+  if (!c && !*s) {
+    if (ws) {
+      *ws = 0;
+      *src = 0;
+    }
+    return wn0 - wn;
+  }
+  errno = EILSEQ;
+  if (ws)
+    *src = (const void *)s;
+  return -1;
 }
diff --git a/libc/str/mbstowcs.c b/libc/str/mbstowcs.c
new file mode 100644
index 000000000..936499e03
--- /dev/null
+++ b/libc/str/mbstowcs.c
@@ -0,0 +1,23 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
+
+size_t mbstowcs(wchar_t *pwc, const char *s, size_t wn) {
+  return mbsrtowcs(pwc, (void *)&s, wn, 0);
+}
diff --git a/third_party/musl/mbtowc.c b/libc/str/mbtowc.c
similarity index 68%
rename from third_party/musl/mbtowc.c
rename to libc/str/mbtowc.c
index 849d40898..34b5f773d 100644
--- a/third_party/musl/mbtowc.c
+++ b/libc/str/mbtowc.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,53 +25,53 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdlib.h>
-#include <wchar.h>
-#include <errno.h>
-#include "multibyte.h"
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/str/mb.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-#pragma GCC diagnostic ignored "-Wparentheses"
-
-int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
-{
-	unsigned c;
-	const unsigned char *s = (const void *)src;
-	wchar_t dummy;
-
-	if (!s) return 0;
-	if (!n) goto ilseq;
-	if (!wc) wc = &dummy;
-
-	if (*s < 0x80) return !!(*wc = *s);
-	if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
-	if (*s-SA > SB-SA) goto ilseq;
-	c = bittab[*s++-SA];
-
-	/* Avoid excessive checks against n: If shifting the state n-1
-	 * times does not clear the high bit, then the value of n is
-	 * insufficient to read a character */
-	if (n<4 && ((c<<(6*n-6)) & (1U<<31))) goto ilseq;
-
-	if (OOB(c,*s)) goto ilseq;
-	c = c<<6 | *s++-0x80;
-	if (!(c&(1U<<31))) {
-		*wc = c;
-		return 2;
-	}
-
-	if (*s-0x80u >= 0x40) goto ilseq;
-	c = c<<6 | *s++-0x80;
-	if (!(c&(1U<<31))) {
-		*wc = c;
-		return 3;
-	}
-
-	if (*s-0x80u >= 0x40) goto ilseq;
-	*wc = c<<6 | *s++-0x80;
-	return 4;
-
+int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n) {
+  unsigned c;
+  const unsigned char *s = (const void *)src;
+  wchar_t dummy;
+  if (!s)
+    return 0;
+  if (!n)
+    goto ilseq;
+  if (!wc)
+    wc = &dummy;
+  if (*s < 0x80)
+    return !!(*wc = *s);
+  if (MB_CUR_MAX == 1)
+    return (*wc = CODEUNIT(*s)), 1;
+  if (*s - SA > SB - SA)
+    goto ilseq;
+  c = kMbBittab[*s++ - SA];
+  /* Avoid excessive checks against n: If shifting the state n-1
+   * times does not clear the high bit, then the value of n is
+   * insufficient to read a character */
+  if (n < 4 && ((c << (6 * n - 6)) & (1U << 31)))
+    goto ilseq;
+  if (OOB(c, *s))
+    goto ilseq;
+  c = c << 6 | (*s++ - 0x80);
+  if (!(c & (1U << 31))) {
+    *wc = c;
+    return 2;
+  }
+  if (*s - 0x80u >= 0x40)
+    goto ilseq;
+  c = c << 6 | (*s++ - 0x80);
+  if (!(c & (1U << 31))) {
+    *wc = c;
+    return 3;
+  }
+  if (*s - 0x80u >= 0x40)
+    goto ilseq;
+  *wc = c << 6 | (*s++ - 0x80);
+  return 4;
 ilseq:
-	errno = EILSEQ;
-	return -1;
+  errno = EILSEQ;
+  return -1;
 }
diff --git a/libc/str/memcasecmp.c b/libc/str/memcasecmp.c
index 28a221c5f..05e552f0b 100644
--- a/libc/str/memcasecmp.c
+++ b/libc/str/memcasecmp.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/serialize.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Compares memory case-insensitively.
diff --git a/libc/str/memccpy.c b/libc/str/memccpy.c
index 2d42920a6..e910f0929 100644
--- a/libc/str/memccpy.c
+++ b/libc/str/memccpy.c
@@ -45,14 +45,13 @@
  * @asyncsignalsafe
  */
 void *memccpy(void *dst, const void *src, int c, size_t n) {
-  const char *p;
-  // this memchr() call is only correct if your memchr() implementation
-  // offers the same readahead safety guarantees as cosmopolitan's does
-  if ((p = memchr(src, c, n))) {
-    size_t m = p + 1 - (const char *)src;
-    memmove(dst, src, m);
-    return (char *)dst + m;
+  char *d;
+  size_t i;
+  const char *s;
+  for (d = dst, s = src, i = 0; i < n; ++i) {
+    if (((d[i] = s[i]) & 255) == (c & 255)) {
+      return d + i + 1;
+    }
   }
-  memmove(dst, src, n);
   return 0;
 }
diff --git a/libc/str/memmem.c b/libc/str/memmem.c
index f2b072275..ef3f721f0 100644
--- a/libc/str/memmem.c
+++ b/libc/str/memmem.c
@@ -16,60 +16,49 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/dce.h"
 #include "libc/intrin/likely.h"
-#include "libc/str/kmp.h"
 #include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/emmintrin.internal.h"
+
+typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
 
 /**
  * Searches for fixed-length substring in memory region.
  *
- * This function offers assurances against pathological cases, using KMP
- * if no progress is being made on the O(nm) vectorized fast path. It is
- * important to note that, if `needle` is untrusted, that it not be long
- * enough to overflow the stack. That's because KMP needs to allocate an
- * array of longs the same length as `needle` and it needs to do it with
- * stack memory because this function is safe to call in signal handlers
- *
  * @param haystack is the region of memory to be searched
  * @param haystacklen is its character count
  * @param needle contains the memory for which we're searching
  * @param needlelen is its character count
  * @return pointer to first result or NULL if not found
- * @asyncsignalsafe
  */
 __vex void *memmem(const void *haystack, size_t haystacklen, const void *needle,
                    size_t needlelen) {
 #if defined(__x86_64__) && !defined(__chibicc__)
   char c;
-  __m128i n;
-  const __m128i *v;
+  xmm_t n;
+  const xmm_t *v;
   unsigned i, k, m;
-  long progress = 0;
   const char *p, *q, *e;
-  long scare = -(needlelen * 10);
   if (!needlelen)
     return (void *)haystack;
   if (UNLIKELY(needlelen > haystacklen))
     return 0;
   q = needle;
   c = *q;
-  n = _mm_set1_epi8(c);
+  n = (xmm_t){c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
   p = haystack;
   e = p + haystacklen;
   k = (uintptr_t)p & 15;
-  v = (const __m128i *)((uintptr_t)p & -16);
-  m = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_load_si128(v), n));
+  v = (const xmm_t *)((uintptr_t)p & -16);
+  m = __builtin_ia32_pmovmskb128(*v == n);
   m >>= k;
   m <<= k;
   for (;;) {
     while (!m) {
       ++v;
-      progress += 16;
       if ((const char *)v >= e)
         return 0;
-      m = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_load_si128(v), n));
+      m = __builtin_ia32_pmovmskb128(*v == n);
     }
     do {
       k = __builtin_ctzl(m);
@@ -77,8 +66,6 @@ __vex void *memmem(const void *haystack, size_t haystacklen, const void *needle,
       if (UNLIKELY(p + needlelen > e))
         return 0;
       for (i = 1;; ++i) {
-        if (--progress <= scare)
-          goto OfferPathologicalAssurances;
         if (i == needlelen)
           return (/*unconst*/ char *)p;
         if (p[i] != q[i])
@@ -87,59 +74,22 @@ __vex void *memmem(const void *haystack, size_t haystacklen, const void *needle,
       m &= ~(1 << k);
     } while (m);
   }
-OfferPathologicalAssurances:
-#elif defined(__aarch64__) && defined(__ARM_NEON)
-  char c;
-  uint8x16_t n;
-  const uint8x16_t *v;
-  size_t i, k;
-  uint64_t m;
-  long progress = 0;
-  const char *p, *q, *e;
-  long scare = -(needlelen * 10);
+#else
+  size_t i, j;
   if (!needlelen)
     return (void *)haystack;
-  if (UNLIKELY(needlelen > haystacklen))
+  if (needlelen > haystacklen)
     return 0;
-  q = needle;
-  c = *q;
-  n = vdupq_n_u8(c);
-  p = haystack;
-  e = p + haystacklen;
-  k = (uintptr_t)p & 15;
-  v = (const uint8x16_t *)((uintptr_t)p & -16);
-  uint8x16_t cmp = vceqq_u8(vld1q_u8((const uint8_t *)v), n);
-  uint8x8_t mask = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
-  vst1_u8((uint8_t *)&m, mask);
-  m >>= k * 4;
-  m <<= k * 4;
-  for (;;) {
-    while (!m) {
-      ++v;
-      progress += 16;
-      if ((const char *)v >= e)
-        return 0;
-      cmp = vceqq_u8(vld1q_u8((const uint8_t *)v), n);
-      mask = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
-      vst1_u8((uint8_t *)&m, mask);
+  for (i = 0; i < haystacklen; ++i) {
+    for (j = 0;; ++j) {
+      if (j == needlelen)
+        return (/*unconst*/ char *)haystack + i;
+      if (i + j == haystacklen)
+        break;
+      if (((char *)haystack)[i + j] != ((char *)needle)[j])
+        break;
     }
-    do {
-      k = __builtin_ctzll(m) >> 2;
-      p = (const char *)v + k;
-      if (UNLIKELY(p + needlelen > e))
-        return 0;
-      for (i = 1;; ++i) {
-        if (--progress <= scare)
-          goto OfferPathologicalAssurances;
-        if (i == needlelen)
-          return (/*unconst*/ char *)p;
-        if (p[i] != q[i])
-          break;
-      }
-      m &= ~(0xFULL << (k * 4));
-    } while (m);
   }
-OfferPathologicalAssurances:
+  return 0;
 #endif
-  return __memmem_kmp(haystack, haystacklen, needle, needlelen);
 }
diff --git a/libc/str/newlocale.c b/libc/str/newlocale.c
new file mode 100644
index 000000000..ad0d5ff2e
--- /dev/null
+++ b/libc/str/newlocale.c
@@ -0,0 +1,25 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/locale.h"
+#include "libc/sysv/errfuns.h"
+
+locale_t newlocale(int catmask, const char *locale, locale_t base) {
+  // TODO: implement me
+  return 0;
+}
diff --git a/libc/str/nonspacing.inc b/libc/str/nonspacing.inc
deleted file mode 100644
index 7746f3b60..000000000
--- a/libc/str/nonspacing.inc
+++ /dev/null
@@ -1,91 +0,0 @@
-16,16,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,16,33,16,16,16,34,35,36,
-37,38,39,40,16,16,41,16,16,16,16,16,16,16,16,16,16,16,42,43,16,16,44,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,45,16,46,47,48,49,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,50,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,51,16,16,52,
-53,16,54,55,56,16,16,16,16,16,16,57,16,16,58,16,59,60,61,62,63,64,65,66,67,68,
-69,70,16,71,72,73,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,74,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,75,76,16,16,16,77,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,78,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,79,80,16,16,16,16,16,16,16,81,16,16,16,16,16,82,83,84,16,16,16,16,16,85,
-86,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,248,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,254,255,255,255,255,191,182,0,0,0,0,0,0,0,63,0,255,23,0,0,0,0,0,248,255,
-255,0,0,1,0,0,0,0,0,0,0,0,0,0,0,192,191,159,61,0,0,0,128,2,0,0,0,255,255,255,
-7,0,0,0,0,0,0,0,0,0,0,192,255,1,0,0,0,0,0,0,248,15,32,0,0,192,251,239,62,0,0,
-0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,255,255,255,255,
-255,7,0,0,0,0,0,0,20,254,33,254,0,12,0,0,0,2,0,0,0,0,0,0,16,30,32,0,0,12,0,0,
-64,6,0,0,0,0,0,0,16,134,57,2,0,0,0,35,0,6,0,0,0,0,0,0,16,190,33,0,0,12,0,0,
-252,2,0,0,0,0,0,0,144,30,32,64,0,12,0,0,0,4,0,0,0,0,0,0,0,1,32,0,0,0,0,0,0,17,
-0,0,0,0,0,0,192,193,61,96,0,12,0,0,0,2,0,0,0,0,0,0,144,64,48,0,0,12,0,0,0,3,0,
-0,0,0,0,0,24,30,32,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,4,92,0,0,0,0,0,0,0,0,0,0,0,
-242,7,128,127,0,0,0,0,0,0,0,0,0,0,0,0,242,31,0,63,0,0,0,0,0,0,0,0,0,3,0,0,160,
-2,0,0,0,0,0,0,254,127,223,224,255,254,255,255,255,31,64,0,0,0,0,0,0,0,0,0,0,0,
-0,224,253,102,0,0,0,195,1,0,30,0,100,32,0,32,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,224,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,28,0,0,0,28,0,0,0,12,0,0,0,12,0,0,0,0,0,0,0,176,63,64,254,
-15,32,0,0,0,0,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,2,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,135,1,4,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-128,9,0,0,0,0,0,0,64,127,229,31,248,159,0,0,0,0,0,0,255,127,0,0,0,0,0,0,0,0,
-15,0,0,0,0,0,208,23,4,0,0,0,0,248,15,0,3,0,0,0,60,59,0,0,0,0,0,0,64,163,3,0,0,
-0,0,0,0,240,207,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,247,255,253,33,16,
-3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,
-251,0,248,0,0,0,124,0,0,0,0,0,0,223,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,
-255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,3,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0,
-0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,128,247,63,0,0,0,192,0,0,0,0,0,0,0,0,0,0,3,0,68,8,0,0,96,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0,255,255,3,128,0,0,0,0,192,63,0,0,128,255,3,0,
-0,0,0,0,7,0,0,0,0,0,200,51,0,0,0,0,32,0,0,
-0,0,0,0,0,0,126,102,0,8,16,0,0,0,0,0,16,0,0,0,0,0,0,157,193,2,0,0,0,0,48,64,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,33,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,0,0,0,
-64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,
-255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,110,240,0,
-0,0,0,0,135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,0,0,0,240,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,255,1,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,255,127,0,0,0,0,0,0,128,
-3,0,0,0,0,0,120,38,0,32,0,0,0,0,0,0,7,0,0,0,128,239,31,0,0,0,0,0,0,0,8,0,3,0,
-0,0,0,0,192,127,0,30,0,0,0,0,0,0,0,0,0,0,0,128,211,64,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,128,248,7,0,0,3,0,0,0,0,0,0,24,1,0,0,0,192,31,31,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,255,92,0,0,64,0,0,0,0,0,0,0,0,0,0,248,133,13,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60,176,1,0,0,48,0,0,0,0,0,0,0,0,0,0,
-248,167,1,0,0,0,0,0,0,0,0,0,0,0,0,40,191,0,0,0,0,0,0,0,0,0,0,0,0,224,188,15,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,255,6,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,240,12,1,0,0,0,254,7,0,0,0,0,248,121,128,0,126,14,0,0,0,0,0,252,
-127,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,191,0,0,0,0,0,0,0,0,0,0,252,255,
-255,252,109,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,126,180,191,0,0,0,0,0,0,0,0,0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,255,
-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,7,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,15,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,3,248,255,231,15,0,0,0,60,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
-255,255,255,255,127,248,255,255,255,255,255,31,32,0,16,0,0,248,254,255,0,0,0,
-0,0,0,0,0,0,0,127,255,255,249,219,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,240,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,240,7,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
diff --git a/libc/intrin/pthread_mutex_wipe_np.c b/libc/str/setlocale.c
similarity index 72%
rename from libc/intrin/pthread_mutex_wipe_np.c
rename to libc/str/setlocale.c
index 9c19f6d0a..9e4ecbeed 100644
--- a/libc/intrin/pthread_mutex_wipe_np.c
+++ b/libc/str/setlocale.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,24 +16,28 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/atomic.h"
+#include "libc/intrin/safemacros.h"
+#include "libc/intrin/strace.h"
+#include "libc/str/locale.h"
 #include "libc/str/str.h"
-#include "libc/thread/lock.h"
-#include "libc/thread/posixthread.internal.h"
-#include "libc/thread/thread.h"
 
 /**
- * Unlocks mutex from child process after fork.
+ * Sets program locale.
+ *
+ * Cosmopolitan only supports the C or POSIX locale with UTF-8.
  */
-int _pthread_mutex_wipe_np(pthread_mutex_t *mutex) {
-  atomic_init(&mutex->_word, MUTEX_UNLOCK(atomic_load_explicit(
-                                 &mutex->_word, memory_order_relaxed)));
-  atomic_init(&mutex->_futex, 0);
-  mutex->_pid = 0;
-  mutex->_nsync[0] = 0;
-  atomic_signal_fence(memory_order_relaxed);  // avoid xmm
-  mutex->_nsync[1] = 0;
-  return 0;
+char *setlocale(int category, const char *locale) {
+  char *res;
+  if (!locale || (*locale == '\0')) {
+    res = "C";
+  } else if (!strcmp(locale, "C") ||        //
+             !strcmp(locale, "POSIX") ||    //
+             !strcmp(locale, "C.UTF-8") ||  //
+             !strcmp(locale, "en_US.UTF-8")) {
+    res = (char *)locale;
+  } else {
+    res = NULL;
+  }
+  STRACE("setlocale(%d, %#s) → %s", category, locale, res);
+  return res;
 }
-
-__weak_reference(_pthread_mutex_wipe_np, pthread_mutex_wipe_np);
diff --git a/libc/str/startswithi.c b/libc/str/startswithi.c
index 136192abf..974ece794 100644
--- a/libc/str/startswithi.c
+++ b/libc/str/startswithi.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Checks if string starts with prefix, case insensitively.
diff --git a/libc/str/stpncpy.c b/libc/str/stpncpy.c
index 66d2d3e55..30b6f2bb9 100644
--- a/libc/str/stpncpy.c
+++ b/libc/str/stpncpy.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 
 /**
diff --git a/libc/str/str.h b/libc/str/str.h
index 2075dcf4b..538acdeaa 100644
--- a/libc/str/str.h
+++ b/libc/str/str.h
@@ -27,7 +27,10 @@ COSMOPOLITAN_C_START_
 void *memset(void *, int, size_t) memcpyesque;
 void *memmove(void *, const void *, size_t) memcpyesque;
 void *memcpy(void *, const void *, size_t) memcpyesque;
+void *mempcpy(void *, const void *, size_t) memcpyesque;
 char *hexpcpy(char *, const void *, size_t) memcpyesque;
+void *memccpy(void *, const void *, int, size_t) memcpyesque;
+void explicit_bzero(void *, size_t);
 
 int memcmp(const void *, const void *, size_t) strlenesque;
 int timingsafe_bcmp(const void *, const void *, size_t) libcesque;
@@ -38,6 +41,7 @@ size_t strnlen(const char *, size_t) strlenesque;
 size_t strnlen_s(const char *, size_t) libcesque;
 char *strchr(const char *, int) strlenesque;
 void *memchr(const void *, int, size_t) strlenesque;
+char *strchrnul(const char *, int) strlenesque returnsnonnull;
 void *rawmemchr(const void *, int) strlenesque returnsnonnull;
 size_t wcslen(const wchar_t *) strlenesque;
 size_t wcsnlen(const wchar_t *, size_t) strlenesque;
@@ -47,6 +51,7 @@ wchar_t *wmemchr(const wchar_t *, wchar_t, size_t) strlenesque;
 wchar_t *wcschrnul(const wchar_t *, wchar_t)
 strlenesque returnsnonnull;
 char *strstr(const char *, const char *) strlenesque;
+char *strcasestr(const char *, const char *) strlenesque;
 wchar_t *wcsstr(const wchar_t *, const wchar_t *) strlenesque;
 int strcmp(const char *, const char *) strlenesque;
 int strncmp(const char *, const char *, size_t) strlenesque;
@@ -54,11 +59,14 @@ int wcscmp(const wchar_t *, const wchar_t *) strlenesque;
 int wcsncmp(const wchar_t *, const wchar_t *, size_t) strlenesque;
 int wmemcmp(const wchar_t *, const wchar_t *, size_t) strlenesque;
 int strcasecmp(const char *, const char *) strlenesque;
+int memcasecmp(const void *, const void *, size_t) strlenesque;
 int wcscasecmp(const wchar_t *, const wchar_t *) strlenesque;
 int strncasecmp(const char *, const char *, size_t) strlenesque;
 int wcsncasecmp(const wchar_t *, const wchar_t *, size_t) strlenesque;
 char *strrchr(const char *, int) strlenesque;
+void *memrchr(const void *, int, size_t) strlenesque;
 wchar_t *wcsrchr(const wchar_t *, wchar_t) strlenesque;
+void *wmemrchr(const wchar_t *, wchar_t, size_t) strlenesque;
 char *strpbrk(const char *, const char *) strlenesque;
 wchar_t *wcspbrk(const wchar_t *, const wchar_t *) strlenesque;
 size_t strspn(const char *, const char *) strlenesque;
@@ -67,10 +75,13 @@ size_t strcspn(const char *, const char *) strlenesque;
 size_t wcscspn(const wchar_t *, const wchar_t *) strlenesque;
 void *memfrob(void *, size_t) memcpyesque;
 int strcoll(const char *, const char *) strlenesque;
+char *strsep(char **, const char *) libcesque paramsnonnull();
 char *stpcpy(char *, const char *) memcpyesque;
 char *stpncpy(char *, const char *, size_t) memcpyesque;
 char *strcat(char *, const char *) memcpyesque;
 wchar_t *wcscat(wchar_t *, const wchar_t *) memcpyesque;
+size_t strlcpy(char *, const char *, size_t) libcesque;
+size_t strlcat(char *, const char *, size_t) libcesque;
 size_t strxfrm(char *, const char *, size_t) libcesque;
 char *strcpy(char *, const char *) memcpyesque;
 wchar_t *wcscpy(wchar_t *, const wchar_t *) memcpyesque;
@@ -80,9 +91,13 @@ char *strncpy(char *, const char *, size_t) memcpyesque;
 char *strtok(char *, const char *) paramsnonnull((2)) libcesque;
 char *strtok_r(char *, const char *, char **) paramsnonnull((2, 3));
 wchar_t *wcstok(wchar_t *, const wchar_t *, wchar_t **) paramsnonnull((2, 3));
+int strverscmp(const char *, const char *) libcesque;
 wchar_t *wmemset(wchar_t *, wchar_t, size_t) memcpyesque;
 wchar_t *wmemcpy(wchar_t *, const wchar_t *, size_t) memcpyesque;
+wchar_t *wmempcpy(wchar_t *, const wchar_t *, size_t) memcpyesque;
 wchar_t *wmemmove(wchar_t *, const wchar_t *, size_t) memcpyesque;
+void *memmem(const void *, size_t, const void *, size_t)
+libcesque nosideeffect;
 ssize_t strfmon(char *, size_t, const char *, ...) libcesque;
 long a64l(const char *) libcesque;
 char *l64a(long) libcesque;
@@ -116,34 +131,6 @@ char *strerror(int) returnsnonnull dontthrow dontcallback;
 errno_t strerror_r(int, char *, size_t) libcesque;
 char *__xpg_strerror_r(int, char *, size_t) libcesque;
 
-int bcmp(const void *, const void *, size_t) strlenesque;
-void bcopy(const void *, void *, size_t) memcpyesque;
-void bzero(void *, size_t) memcpyesque;
-char *index(const char *, int) strlenesque;
-char *rindex(const char *, int) strlenesque;
-
-#if defined(_COSMO_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE) || \
-    defined(_XOPEN_SOURCE)
-void *memccpy(void *, const void *, int, size_t) memcpyesque;
-#endif
-
-#if defined(_COSMO_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
-char *strsep(char **, const char *) libcesque paramsnonnull();
-void explicit_bzero(void *, size_t);
-size_t strlcpy(char *, const char *, size_t) libcesque;
-size_t strlcat(char *, const char *, size_t) libcesque;
-#endif
-
-#if defined(_COSMO_SOURCE) || defined(_GNU_SOURCE)
-int strverscmp(const char *, const char *) libcesque;
-char *strchrnul(const char *, int) strlenesque returnsnonnull;
-char *strcasestr(const char *, const char *) strlenesque;
-void *memmem(const void *, size_t, const void *, size_t) libcesque;
-void *memrchr(const void *, int, size_t) strlenesque;
-void *mempcpy(void *, const void *, size_t) memcpyesque;
-wchar_t *wmempcpy(wchar_t *, const wchar_t *, size_t) memcpyesque;
-#endif
-
 #ifdef _COSMO_SOURCE
 pureconst uint64_t tpenc(uint32_t) libcesque;
 char *chomp(char *) libcesque;
@@ -154,7 +141,6 @@ bool32 startswithi(const char *, const char *) strlenesque;
 bool32 endswith(const char *, const char *) strlenesque;
 bool32 istext(const void *, size_t) libcesque;
 bool32 isutf8(const void *, size_t) libcesque;
-void *wmemrchr(const wchar_t *, wchar_t, size_t) strlenesque;
 const char *strsignal_r(int, char[21]) returnsnonnull libcesque __wur;
 char16_t *chomp16(char16_t *) libcesque;
 size_t strlen16(const char16_t *) strlenesque;
@@ -164,7 +150,6 @@ void *memchr16(const void *, int, size_t) strlenesque;
 char16_t *strchrnul16(const char16_t *, int) strlenesque returnsnonnull;
 void *rawmemchr16(const void *, int) strlenesque returnsnonnull;
 char16_t *strstr16(const char16_t *, const char16_t *) strlenesque;
-int memcasecmp(const void *, const void *, size_t) strlenesque;
 int strcmp16(const char16_t *, const char16_t *) strlenesque;
 int strncmp16(const char16_t *, const char16_t *, size_t) strlenesque;
 int strcasecmp16(const char16_t *, const char16_t *) strlenesque;
@@ -188,6 +173,12 @@ char *__join_paths(char *, size_t, const char *, const char *) libcesque __wur;
 int __mkntpathat(int, const char *, int, char16_t[hasatleast 1024]);
 #endif /* _COSMO_SOURCE */
 
+int bcmp(const void *, const void *, size_t) strlenesque;
+void bcopy(const void *, void *, size_t) memcpyesque;
+void bzero(void *, size_t) memcpyesque;
+char *index(const char *, int) strlenesque;
+char *rindex(const char *, int) strlenesque;
+
 COSMOPOLITAN_C_END_
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
 #endif /* COSMOPOLITAN_LIBC_STR_STR_H_ */
diff --git a/libc/str/strcasecmp.c b/libc/str/strcasecmp.c
index 8510f6034..2443afd93 100644
--- a/libc/str/strcasecmp.c
+++ b/libc/str/strcasecmp.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Compares NUL-terminated strings ascii case-insensitively.
diff --git a/third_party/musl/strcasecmp16.c b/libc/str/strcasecmp16.c
similarity index 100%
rename from third_party/musl/strcasecmp16.c
rename to libc/str/strcasecmp16.c
diff --git a/libc/str/strcasestr.c b/libc/str/strcasestr.c
index cf0cfe2d5..26f969c09 100644
--- a/libc/str/strcasestr.c
+++ b/libc/str/strcasestr.c
@@ -17,69 +17,10 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/ctype.h"
-#include "libc/mem/alloca.h"
-#include "libc/runtime/stack.h"
-#include "libc/str/tab.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/immintrin.internal.h"
+#include "libc/dce.h"
+#include "libc/str/tab.internal.h"
 
-static int ToUpper(int c) {
-  return 'a' <= c && c <= 'z' ? c - ('a' - 'A') : c;
-}
-
-static void computeLPS(const char *pattern, long M, long *lps) {
-  long len = 0;
-  lps[0] = 0;
-  long i = 1;
-  while (i < M) {
-    if (kToLower[pattern[i] & 255] == kToLower[pattern[len] & 255]) {
-      len++;
-      lps[i] = len;
-      i++;
-    } else {
-      if (len != 0) {
-        len = lps[len - 1];
-      } else {
-        lps[i] = 0;
-        i++;
-      }
-    }
-  }
-}
-
-static char *kmp(const char *s, size_t n, const char *ss, size_t m) {
-  if (!m)
-    return (char *)s;
-  if (n < m)
-    return NULL;
-#pragma GCC push_options
-#pragma GCC diagnostic ignored "-Walloca-larger-than="
-#pragma GCC diagnostic ignored "-Wanalyzer-out-of-bounds"
-  long need = sizeof(long) * m;
-  long *lps = (long *)alloca(need);
-  CheckLargeStackAllocation(lps, need);
-#pragma GCC pop_options
-  computeLPS(ss, m, lps);
-  long i = 0;
-  long j = 0;
-  while (i < n) {
-    if (kToLower[ss[j] & 255] == kToLower[s[i] & 255]) {
-      i++;
-      j++;
-    }
-    if (j == m) {
-      return (char *)(s + i - j);
-    } else if (i < n && kToLower[ss[j] & 255] != kToLower[s[i] & 255]) {
-      if (j != 0) {
-        j = lps[j - 1];
-      } else {
-        i++;
-      }
-    }
-  }
-  return NULL;
-}
+typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
 
 /**
  * Searches for substring case-insensitively.
@@ -87,108 +28,65 @@ static char *kmp(const char *s, size_t n, const char *ss, size_t m) {
  * @param haystack is the search area, as a NUL-terminated string
  * @param needle is the desired substring, also NUL-terminated
  * @return pointer to first substring within haystack, or NULL
+ * @note this implementation goes fast in practice but isn't hardened
+ *     against pathological cases, and therefore shouldn't be used on
+ *     untrustworthy data
  * @asyncsignalsafe
  * @see strstr()
  */
-char *strcasestr(const char *haystack, const char *needle) {
-  if (haystack == needle || !*needle)
-    return (char *)haystack;
+__vex char *strcasestr(const char *haystack, const char *needle) {
 #if defined(__x86_64__) && !defined(__chibicc__)
+  char c;
   size_t i;
   unsigned k, m;
-  const __m128i *p;
-  long progress = 0;
-  __m128i v, nl, nu, z = _mm_setzero_si128();
-  const char *hay = haystack;
-  char first_lower = kToLower[*needle & 255];
-  char first_upper = ToUpper(*needle);
-  nl = _mm_set1_epi8(first_lower);
-  nu = _mm_set1_epi8(first_upper);
+  const xmm_t *p;
+  xmm_t v, n1, n2, z = {0};
+  if (haystack == needle || !*needle)
+    return (char *)haystack;
+  c = *needle;
+  n1 = (xmm_t){c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
+  c = kToLower[c & 255];
+  n2 = (xmm_t){c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
   for (;;) {
-    k = (uintptr_t)hay & 15;
-    p = (const __m128i *)((uintptr_t)hay & -16);
-    v = _mm_load_si128(p);
-    m = _mm_movemask_epi8(_mm_or_si128(
-        _mm_or_si128(_mm_cmpeq_epi8(v, z),    // Check for null terminator
-                     _mm_cmpeq_epi8(v, nl)),  // Check lowercase
-        _mm_cmpeq_epi8(v, nu)));              // Check uppercase
+    k = (uintptr_t)haystack & 15;
+    p = (const xmm_t *)((uintptr_t)haystack & -16);
+    v = *p;
+    m = __builtin_ia32_pmovmskb128((v == z) | (v == n1) | (v == n2));
     m >>= k;
     m <<= k;
     while (!m) {
-      progress += 16;
-      v = _mm_load_si128(++p);
-      m = _mm_movemask_epi8(_mm_or_si128(
-          _mm_or_si128(_mm_cmpeq_epi8(v, z), _mm_cmpeq_epi8(v, nl)),
-          _mm_cmpeq_epi8(v, nu)));
+      v = *++p;
+      m = __builtin_ia32_pmovmskb128((v == z) | (v == n1) | (v == n2));
     }
-    int offset = __builtin_ctzl(m);
-    progress += offset;
-    hay = (const char *)p + offset;
+    haystack = (const char *)p + __builtin_ctzl(m);
     for (i = 0;; ++i) {
-      if (--progress <= -512)
-        goto OfferPathologicalAssurances;
       if (!needle[i])
-        return (char *)hay;
-      if (!hay[i])
+        return (/*unconst*/ char *)haystack;
+      if (!haystack[i])
         break;
-      if (kToLower[needle[i] & 255] != kToLower[hay[i] & 255])
+      if (kToLower[needle[i] & 255] != kToLower[haystack[i] & 255])
         break;
     }
-    if (!*hay++)
+    if (!*haystack++)
       break;
   }
   return 0;
-#elif defined(__aarch64__) && defined(__ARM_NEON)
+#else
   size_t i;
-  const char *hay = haystack;
-  uint8_t first_lower = kToLower[*needle & 255];
-  uint8_t first_upper = ToUpper(*needle);
-  uint8x16_t nl = vdupq_n_u8(first_lower);
-  uint8x16_t nu = vdupq_n_u8(first_upper);
-  uint8x16_t z = vdupq_n_u8(0);
-  long progress = 0;
+  if (haystack == needle || !*needle)
+    return (void *)haystack;
   for (;;) {
-    int k = (uintptr_t)hay & 15;
-    hay = (const char *)((uintptr_t)hay & -16);
-    uint8x16_t v = vld1q_u8((const uint8_t *)hay);
-    uint8x16_t cmp_lower = vceqq_u8(v, nl);
-    uint8x16_t cmp_upper = vceqq_u8(v, nu);
-    uint8x16_t cmp_null = vceqq_u8(v, z);
-    uint8x16_t cmp = vorrq_u8(vorrq_u8(cmp_lower, cmp_upper), cmp_null);
-    uint8x8_t mask = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
-    uint64_t m;
-    vst1_u8((uint8_t *)&m, mask);
-    m >>= k * 4;
-    m <<= k * 4;
-    while (!m) {
-      hay += 16;
-      progress += 16;
-      v = vld1q_u8((const uint8_t *)hay);
-      cmp_lower = vceqq_u8(v, nl);
-      cmp_upper = vceqq_u8(v, nu);
-      cmp_null = vceqq_u8(v, z);
-      cmp = vorrq_u8(vorrq_u8(cmp_lower, cmp_upper), cmp_null);
-      mask = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
-      vst1_u8((uint8_t *)&m, mask);
-    }
-    int offset = __builtin_ctzll(m) >> 2;
-    progress += offset;
-    hay += offset;
     for (i = 0;; ++i) {
-      if (--progress <= -512)
-        goto OfferPathologicalAssurances;
       if (!needle[i])
-        return (char *)hay;
-      if (!hay[i])
+        return (/*unconst*/ char *)haystack;
+      if (!haystack[i])
         break;
-      if (kToLower[needle[i] & 255] != kToLower[hay[i] & 255])
+      if (kToLower[needle[i] & 255] != kToLower[haystack[i] & 255])
         break;
     }
-    if (!*hay++)
+    if (!*haystack++)
       break;
   }
   return 0;
 #endif
-OfferPathologicalAssurances:
-  return kmp(haystack, strlen(haystack), needle, strlen(needle));
 }
diff --git a/libc/str/strncasecmp.c b/libc/str/strncasecmp.c
index 756732454..419b7eec8 100644
--- a/libc/str/strncasecmp.c
+++ b/libc/str/strncasecmp.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Compares NUL-terminated strings case-insensitively w/ limit.
diff --git a/third_party/musl/strncasecmp16.c b/libc/str/strncasecmp16.c
similarity index 100%
rename from third_party/musl/strncasecmp16.c
rename to libc/str/strncasecmp16.c
diff --git a/libc/str/strncpy.c b/libc/str/strncpy.c
index 052b00217..26ac33e02 100644
--- a/libc/str/strncpy.c
+++ b/libc/str/strncpy.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 
 /**
diff --git a/libc/str/strnwidth.c b/libc/str/strnwidth.c
index b67436e57..0e39fc70a 100644
--- a/libc/str/strnwidth.c
+++ b/libc/str/strnwidth.c
@@ -17,7 +17,9 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/bsf.h"
-#include "libc/macros.h"
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/intrin/pmovmskb.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/str/thompike.h"
 #include "libc/str/unicode.h"
diff --git a/libc/str/strstr.c b/libc/str/strstr.c
index 6b16e51d5..6557ac91a 100644
--- a/libc/str/strstr.c
+++ b/libc/str/strstr.c
@@ -17,113 +17,74 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/str/kmp.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/immintrin.internal.h"
+#include "libc/dce.h"
+
+typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
 
 /**
  * Searches for substring.
  *
- * This function offers assurances against pathological cases, using KMP
- * if no progress is being made on the O(nm) vectorized fast path. It is
- * important to note that, if `needle` is untrusted, that it not be long
- * enough to overflow the stack. That's because KMP needs to allocate an
- * array of longs the same length as `needle` and it needs to do it with
- * stack memory since POSIX requires this function to be safe to call in
- * signal handlers.
- *
  * @param haystack is the search area, as a NUL-terminated string
  * @param needle is the desired substring, also NUL-terminated
  * @return pointer to first substring within haystack, or NULL
+ * @note this implementation goes fast in practice but isn't hardened
+ *     against pathological cases, and therefore shouldn't be used on
+ *     untrustworthy data
  * @asyncsignalsafe
  * @see strcasestr()
  * @see memmem()
  */
 __vex char *strstr(const char *haystack, const char *needle) {
-  if (haystack == needle || !*needle)
-    return (char *)haystack;
 #if defined(__x86_64__) && !defined(__chibicc__)
   size_t i;
   unsigned k, m;
-  const __m128i *p;
-  long progress = 0;
-  __m128i v, n, z = _mm_setzero_si128();
-  const char *hay = haystack;
-  n = _mm_set1_epi8(*needle);
+  const xmm_t *p;
+  xmm_t v, n, z = {0};
+  if (haystack == needle || !*needle)
+    return (char *)haystack;
+  n = (xmm_t){*needle, *needle, *needle, *needle, *needle, *needle,
+              *needle, *needle, *needle, *needle, *needle, *needle,
+              *needle, *needle, *needle, *needle};
   for (;;) {
-    k = (uintptr_t)hay & 15;
-    p = (const __m128i *)((uintptr_t)hay & -16);
-    v = _mm_load_si128(p);
-    m = _mm_movemask_epi8(
-        _mm_or_si128(_mm_cmpeq_epi8(v, z), _mm_cmpeq_epi8(v, n)));
+    k = (uintptr_t)haystack & 15;
+    p = (const xmm_t *)((uintptr_t)haystack & -16);
+    v = *p;
+    m = __builtin_ia32_pmovmskb128((v == z) | (v == n));
     m >>= k;
     m <<= k;
     while (!m) {
-      progress += 16;
-      v = _mm_load_si128(++p);
-      m = _mm_movemask_epi8(
-          _mm_or_si128(_mm_cmpeq_epi8(v, z), _mm_cmpeq_epi8(v, n)));
+      v = *++p;
+      m = __builtin_ia32_pmovmskb128((v == z) | (v == n));
     }
-    int offset = __builtin_ctzl(m);
-    progress += offset;
-    hay = (const char *)p + offset;
+    haystack = (const char *)p + __builtin_ctzl(m);
     for (i = 0;; ++i) {
-      if (--progress <= -512)
-        goto OfferPathologicalAssurances;
       if (!needle[i])
-        return (/*unconst*/ char *)hay;
-      if (!hay[i])
+        return (/*unconst*/ char *)haystack;
+      if (!haystack[i])
         break;
-      if (needle[i] != hay[i])
+      if (needle[i] != haystack[i])
         break;
     }
-    if (!*hay++)
+    if (!*haystack++)
       break;
   }
   return 0;
-OfferPathologicalAssurances:
-#elif defined(__aarch64__) && defined(__ARM_NEON)
+#else
   size_t i;
-  const char *hay = haystack;
-  uint8x16_t n = vdupq_n_u8(*needle);
-  uint8x16_t z = vdupq_n_u8(0);
-  long progress = 0;
+  if (haystack == needle || !*needle)
+    return (void *)haystack;
   for (;;) {
-    int k = (uintptr_t)hay & 15;
-    hay = (const char *)((uintptr_t)hay & -16);
-    uint8x16_t v = vld1q_u8((const uint8_t *)hay);
-    uint8x16_t cmp = vorrq_u8(vceqq_u8(v, z), vceqq_u8(v, n));
-    uint8x8_t mask = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
-    uint64_t m;
-    vst1_u8((uint8_t *)&m, mask);
-    m >>= k * 4;
-    m <<= k * 4;
-    while (!m) {
-      hay += 16;
-      progress += 16;
-      v = vld1q_u8((const uint8_t *)hay);
-      cmp = vorrq_u8(vceqq_u8(v, z), vceqq_u8(v, n));
-      mask = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
-      vst1_u8((uint8_t *)&m, mask);
-    }
-    int offset = __builtin_ctzll(m) >> 2;
-    progress += offset;
-    hay += offset;
     for (i = 0;; ++i) {
-      if (--progress <= -512)
-        goto OfferPathologicalAssurances;
       if (!needle[i])
-        return (/*unconst*/ char *)hay;
-      if (!hay[i])
+        return (/*unconst*/ char *)haystack;
+      if (!haystack[i])
         break;
-      if (needle[i] != hay[i])
+      if (needle[i] != haystack[i])
         break;
     }
-    if (!*hay++)
+    if (!*haystack++)
       break;
   }
   return 0;
-OfferPathologicalAssurances:
 #endif
-  return __memmem_kmp(haystack, strlen(haystack), needle, strlen(needle));
 }
diff --git a/libc/str/strstr16.c b/libc/str/strstr16.c
index a139d0fd4..aac0f8e3e 100644
--- a/libc/str/strstr16.c
+++ b/libc/str/strstr16.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/kmp.h"
 #include "libc/str/str.h"
 
 /**
@@ -29,5 +28,19 @@
  * @see memmem()
  */
 char16_t *strstr16(const char16_t *haystack, const char16_t *needle) {
-  return __memmem_kmp16(haystack, strlen16(haystack), needle, strlen16(needle));
+  size_t i;
+  for (;;) {
+    for (i = 0;;) {
+      if (!needle[i])
+        return (/*unconst*/ char16_t *)haystack;
+      if (!haystack[i])
+        break;
+      if (needle[i] != haystack[i])
+        break;
+      ++i;
+    }
+    if (!*haystack++)
+      break;
+  }
+  return NULL;
 }
diff --git a/libc/str/strtol.c b/libc/str/strtol.c
index aa7bdd3b2..0548bb265 100644
--- a/libc/str/strtol.c
+++ b/libc/str/strtol.c
@@ -22,7 +22,7 @@
 #include "libc/limits.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Decodes signed integer from ASCII string.
diff --git a/libc/str/strtoul.c b/libc/str/strtoul.c
index 37f28fe1e..ee8f01fbd 100644
--- a/libc/str/strtoul.c
+++ b/libc/str/strtoul.c
@@ -22,7 +22,7 @@
 #include "libc/limits.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * Decodes unsigned integer from ASCII string.
diff --git a/libc/str/tab.h b/libc/str/tab.internal.h
similarity index 77%
rename from libc/str/tab.h
rename to libc/str/tab.internal.h
index ea1310bbc..c40d9f663 100644
--- a/libc/str/tab.h
+++ b/libc/str/tab.internal.h
@@ -1,5 +1,5 @@
-#ifndef COSMOPOLITAN_LIBC_STR_TAB_H_
-#define COSMOPOLITAN_LIBC_STR_TAB_H_
+#ifndef COSMOPOLITAN_LIBC_STR_TAB_INTERNAL_H_
+#define COSMOPOLITAN_LIBC_STR_TAB_INTERNAL_H_
 
 #define kHexToInt __kHexToInt
 #define kToLower  __kToLower
@@ -20,4 +20,4 @@ extern const char16_t kCp437[256];
 
 COSMOPOLITAN_C_END_
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
-#endif /* COSMOPOLITAN_LIBC_STR_TAB_H_ */
+#endif /* COSMOPOLITAN_LIBC_STR_TAB_INTERNAL_H_ */
diff --git a/libc/intrin/timespectowindowstime.c b/libc/str/timespectowindowstime.c
similarity index 86%
rename from libc/intrin/timespectowindowstime.c
rename to libc/str/timespectowindowstime.c
index 03e8c631c..af7cb9507 100644
--- a/libc/intrin/timespectowindowstime.c
+++ b/libc/str/timespectowindowstime.c
@@ -17,14 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/wintime.internal.h"
-#include "libc/limits.h"
-#include "libc/stdckdint.h"
 
-int64_t TimeSpecToWindowsTime(struct timespec time) {
-  int64_t wt;
-  if (ckd_add(&wt, time.tv_sec, MODERNITYSECONDS) ||
-      ckd_mul(&wt, wt, HECTONANOSECONDS) ||
-      ckd_add(&wt, wt, time.tv_nsec / 100))
-    wt = INT64_MAX;
-  return wt;
+int64_t TimeSpecToWindowsTime(struct timespec t) {
+  return t.tv_nsec / 100 + (t.tv_sec + MODERNITYSECONDS) * HECTONANOSECONDS;
 }
diff --git a/libc/intrin/timevaltowindowstime.c b/libc/str/timevaltowindowstime.c
similarity index 100%
rename from libc/intrin/timevaltowindowstime.c
rename to libc/str/timevaltowindowstime.c
diff --git a/libc/str/towctrans.c b/libc/str/towctrans.c
new file mode 100644
index 000000000..ed928df67
--- /dev/null
+++ b/libc/str/towctrans.c
@@ -0,0 +1,27 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/wctype.h"
+
+wint_t towctrans(wint_t c, wctrans_t t) {
+  if (t == (wctrans_t)1)
+    return towupper(c);
+  if (t == (wctrans_t)2)
+    return towlower(c);
+  return c;
+}
diff --git a/libc/str/towlower.c b/libc/str/towlower.c
new file mode 100644
index 000000000..313a53079
--- /dev/null
+++ b/libc/str/towlower.c
@@ -0,0 +1,236 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/dce.h"
+#include "libc/macros.internal.h"
+#include "libc/str/str.h"
+/* clang-format off */
+
+static const struct {
+  unsigned short x;
+  unsigned short y;
+  short d;
+} kLower[] = {
+    {0x00c0, 0x00d6, +32},    /* 23x À ..Ö  → à ..ö  Watin */
+    {0x00d8, 0x00de, +32},    /*  7x Ø ..Þ  → ø ..þ  Watin */
+    {0x0178, 0x0178, -121},   /*  1x Ÿ ..Ÿ  → ÿ ..ÿ  Watin-A */
+    {0x0179, 0x0179, +1},     /*  1x Ź ..Ź  → ź ..ź  Watin-A */
+    {0x017b, 0x017b, +1},     /*  1x Ż ..Ż  → ż ..ż  Watin-A */
+    {0x017d, 0x017d, +1},     /*  1x Ž ..Ž  → ž ..ž  Watin-A */
+    {0x0181, 0x0181, +210},   /*  1x Ɓ ..Ɓ  → ɓ ..ɓ  Watin-B */
+    {0x0182, 0x0182, +1},     /*  1x Ƃ ..Ƃ  → ƃ ..ƃ  Watin-B */
+    {0x0184, 0x0184, +1},     /*  1x Ƅ ..Ƅ  → ƅ ..ƅ  Watin-B */
+    {0x0186, 0x0186, +206},   /*  1x Ɔ ..Ɔ  → ɔ ..ɔ  Watin-B */
+    {0x0187, 0x0187, +1},     /*  1x Ƈ ..Ƈ  → ƈ ..ƈ  Watin-B */
+    {0x0189, 0x018a, +205},   /*  2x Ɖ ..Ɗ  → ɖ ..ɗ  Watin-B */
+    {0x018b, 0x018b, +1},     /*  1x Ƌ ..Ƌ  → ƌ ..ƌ  Watin-B */
+    {0x018e, 0x018e, +79},    /*  1x Ǝ ..Ǝ  → ǝ ..ǝ  Watin-B */
+    {0x018f, 0x018f, +202},   /*  1x Ə ..Ə  → ə ..ə  Watin-B */
+    {0x0190, 0x0190, +203},   /*  1x Ɛ ..Ɛ  → ɛ ..ɛ  Watin-B */
+    {0x0191, 0x0191, +1},     /*  1x Ƒ ..Ƒ  → ƒ ..ƒ  Watin-B */
+    {0x0193, 0x0193, +205},   /*  1x Ɠ ..Ɠ  → ɠ ..ɠ  Watin-B */
+    {0x0194, 0x0194, +207},   /*  1x Ɣ ..Ɣ  → ɣ ..ɣ  Watin-B */
+    {0x0196, 0x0196, +211},   /*  1x Ɩ ..Ɩ  → ɩ ..ɩ  Watin-B */
+    {0x0197, 0x0197, +209},   /*  1x Ɨ ..Ɨ  → ɨ ..ɨ  Watin-B */
+    {0x0198, 0x0198, +1},     /*  1x Ƙ ..Ƙ  → ƙ ..ƙ  Watin-B */
+    {0x019c, 0x019c, +211},   /*  1x Ɯ ..Ɯ  → ɯ ..ɯ  Watin-B */
+    {0x019d, 0x019d, +213},   /*  1x Ɲ ..Ɲ  → ɲ ..ɲ  Watin-B */
+    {0x019f, 0x019f, +214},   /*  1x Ɵ ..Ɵ  → ɵ ..ɵ  Watin-B */
+    {0x01a0, 0x01a0, +1},     /*  1x Ơ ..Ơ  → ơ ..ơ  Watin-B */
+    {0x01a2, 0x01a2, +1},     /*  1x Ƣ ..Ƣ  → ƣ ..ƣ  Watin-B */
+    {0x01a4, 0x01a4, +1},     /*  1x Ƥ ..Ƥ  → ƥ ..ƥ  Watin-B */
+    {0x01a6, 0x01a6, +218},   /*  1x Ʀ ..Ʀ  → ʀ ..ʀ  Watin-B */
+    {0x01a7, 0x01a7, +1},     /*  1x Ƨ ..Ƨ  → ƨ ..ƨ  Watin-B */
+    {0x01a9, 0x01a9, +218},   /*  1x Ʃ ..Ʃ  → ʃ ..ʃ  Watin-B */
+    {0x01ac, 0x01ac, +1},     /*  1x Ƭ ..Ƭ  → ƭ ..ƭ  Watin-B */
+    {0x01ae, 0x01ae, +218},   /*  1x Ʈ ..Ʈ  → ʈ ..ʈ  Watin-B */
+    {0x01af, 0x01af, +1},     /*  1x Ư ..Ư  → ư ..ư  Watin-B */
+    {0x01b1, 0x01b2, +217},   /*  2x Ʊ ..Ʋ  → ʊ ..ʋ  Watin-B */
+    {0x01b3, 0x01b3, +1},     /*  1x Ƴ ..Ƴ  → ƴ ..ƴ  Watin-B */
+    {0x01b5, 0x01b5, +1},     /*  1x Ƶ ..Ƶ  → ƶ ..ƶ  Watin-B */
+    {0x01b7, 0x01b7, +219},   /*  1x Ʒ ..Ʒ  → ʒ ..ʒ  Watin-B */
+    {0x01b8, 0x01b8, +1},     /*  1x Ƹ ..Ƹ  → ƹ ..ƹ  Watin-B */
+    {0x01bc, 0x01bc, +1},     /*  1x Ƽ ..Ƽ  → ƽ ..ƽ  Watin-B */
+    {0x01c4, 0x01c4, +2},     /*  1x Ǆ ..Ǆ  → ǆ ..ǆ  Watin-B */
+    {0x01c5, 0x01c5, +1},     /*  1x ǅ ..ǅ  → ǆ ..ǆ  Watin-B */
+    {0x01c7, 0x01c7, +2},     /*  1x Ǉ ..Ǉ  → ǉ ..ǉ  Watin-B */
+    {0x01c8, 0x01c8, +1},     /*  1x ǈ ..ǈ  → ǉ ..ǉ  Watin-B */
+    {0x01ca, 0x01ca, +2},     /*  1x Ǌ ..Ǌ  → ǌ ..ǌ  Watin-B */
+    {0x01cb, 0x01cb, +1},     /*  1x ǋ ..ǋ  → ǌ ..ǌ  Watin-B */
+    {0x01cd, 0x01cd, +1},     /*  1x Ǎ ..Ǎ  → ǎ ..ǎ  Watin-B */
+    {0x01f1, 0x01f1, +2},     /*  1x Ǳ ..Ǳ  → ǳ ..ǳ  Watin-B */
+    {0x01f2, 0x01f2, +1},     /*  1x ǲ ..ǲ  → ǳ ..ǳ  Watin-B */
+    {0x01f4, 0x01f4, +1},     /*  1x Ǵ ..Ǵ  → ǵ ..ǵ  Watin-B */
+    {0x01f6, 0x01f6, -97},    /*  1x Ƕ ..Ƕ  → ƕ ..ƕ  Watin-B */
+    {0x01f7, 0x01f7, -56},    /*  1x Ƿ ..Ƿ  → ƿ ..ƿ  Watin-B */
+    {0x0220, 0x0220, -130},   /*  1x Ƞ ..Ƞ  → ƞ ..ƞ  Watin-B */
+    {0x023b, 0x023b, +1},     /*  1x Ȼ ..Ȼ  → ȼ ..ȼ  Watin-B */
+    {0x023d, 0x023d, -163},   /*  1x Ƚ ..Ƚ  → ƚ ..ƚ  Watin-B */
+    {0x0241, 0x0241, +1},     /*  1x Ɂ ..Ɂ  → ɂ ..ɂ  Watin-B */
+    {0x0243, 0x0243, -195},   /*  1x Ƀ ..Ƀ  → ƀ ..ƀ  Watin-B */
+    {0x0244, 0x0244, +69},    /*  1x Ʉ ..Ʉ  → ʉ ..ʉ  Watin-B */
+    {0x0245, 0x0245, +71},    /*  1x Ʌ ..Ʌ  → ʌ ..ʌ  Watin-B */
+    {0x0246, 0x0246, +1},     /*  1x Ɇ ..Ɇ  → ɇ ..ɇ  Watin-B */
+    {0x0248, 0x0248, +1},     /*  1x Ɉ ..Ɉ  → ɉ ..ɉ  Watin-B */
+    {0x024a, 0x024a, +1},     /*  1x Ɋ ..Ɋ  → ɋ ..ɋ  Watin-B */
+    {0x024c, 0x024c, +1},     /*  1x Ɍ ..Ɍ  → ɍ ..ɍ  Watin-B */
+    {0x024e, 0x024e, +1},     /*  1x Ɏ ..Ɏ  → ɏ ..ɏ  Watin-B */
+    {0x0386, 0x0386, +38},    /*  1x Ά ..Ά  → ά ..ά  Greek */
+    {0x0388, 0x038a, +37},    /*  3x Έ ..Ί  → έ ..ί  Greek */
+    {0x038c, 0x038c, +64},    /*  1x Ό ..Ό  → ό ..ό  Greek */
+    {0x038e, 0x038f, +63},    /*  2x Ύ ..Ώ  → ύ ..ώ  Greek */
+    {0x0391, 0x03a1, +32},    /* 17x Α ..Ρ  → α ..ρ  Greek */
+    {0x03a3, 0x03ab, +32},    /*  9x Σ ..Ϋ  → σ ..ϋ  Greek */
+    {0x03dc, 0x03dc, +1},     /*  1x Ϝ ..Ϝ  → ϝ ..ϝ  Greek */
+    {0x03f4, 0x03f4, -60},    /*  1x ϴ ..ϴ  → θ ..θ  Greek */
+    {0x0400, 0x040f, +80},    /* 16x Ѐ ..Џ  → ѐ ..џ  Cyrillic */
+    {0x0410, 0x042f, +32},    /* 32x А ..Я  → а ..я  Cyrillic */
+    {0x0460, 0x0460, +1},     /*  1x Ѡ ..Ѡ  → ѡ ..ѡ  Cyrillic */
+    {0x0462, 0x0462, +1},     /*  1x Ѣ ..Ѣ  → ѣ ..ѣ  Cyrillic */
+    {0x0464, 0x0464, +1},     /*  1x Ѥ ..Ѥ  → ѥ ..ѥ  Cyrillic */
+    {0x0472, 0x0472, +1},     /*  1x Ѳ ..Ѳ  → ѳ ..ѳ  Cyrillic */
+    {0x0490, 0x0490, +1},     /*  1x Ґ ..Ґ  → ґ ..ґ  Cyrillic */
+    {0x0498, 0x0498, +1},     /*  1x Ҙ ..Ҙ  → ҙ ..ҙ  Cyrillic */
+    {0x049a, 0x049a, +1},     /*  1x Қ ..Қ  → қ ..қ  Cyrillic */
+    {0x0531, 0x0556, +48},    /* 38x Ա ..Ֆ  → ա ..ֆ  Armenian */
+    {0x10a0, 0x10c5, +7264},  /* 38x Ⴀ ..Ⴥ  → ⴀ ..ⴥ  Georgian */
+    {0x10c7, 0x10c7, +7264},  /*  1x Ⴧ ..Ⴧ  → ⴧ ..ⴧ  Georgian */
+    {0x10cd, 0x10cd, +7264},  /*  1x Ⴭ ..Ⴭ  → ⴭ ..ⴭ  Georgian */
+    {0x13f0, 0x13f5, +8},     /*  6x Ᏸ ..Ᏽ  → ᏸ ..ᏽ  Cherokee */
+    {0x1c90, 0x1cba, -3008}, /* 43x Ა ..Ჺ  → ა ..ჺ  Georgian2 */
+    {0x1cbd, 0x1cbf, -3008}, /*  3x Ჽ ..Ჿ  → ჽ ..ჿ  Georgian2 */
+    {0x1f08, 0x1f0f, -8},    /*  8x Ἀ ..Ἇ  → ἀ ..ἇ  Greek2 */
+    {0x1f18, 0x1f1d, -8},    /*  6x Ἐ ..Ἕ  → ἐ ..ἕ  Greek2 */
+    {0x1f28, 0x1f2f, -8},    /*  8x Ἠ ..Ἧ  → ἠ ..ἧ  Greek2 */
+    {0x1f38, 0x1f3f, -8},    /*  8x Ἰ ..Ἷ  → ἰ ..ἷ  Greek2 */
+    {0x1f48, 0x1f4d, -8},    /*  6x Ὀ ..Ὅ  → ὀ ..ὅ  Greek2 */
+    {0x1f59, 0x1f59, -8},    /*  1x Ὑ ..Ὑ  → ὑ ..ὑ  Greek2 */
+    {0x1f5b, 0x1f5b, -8},    /*  1x Ὓ ..Ὓ  → ὓ ..ὓ  Greek2 */
+    {0x1f5d, 0x1f5d, -8},    /*  1x Ὕ ..Ὕ  → ὕ ..ὕ  Greek2 */
+    {0x1f5f, 0x1f5f, -8},    /*  1x Ὗ ..Ὗ  → ὗ ..ὗ  Greek2 */
+    {0x1f68, 0x1f6f, -8},    /*  8x Ὠ ..Ὧ  → ὠ ..ὧ  Greek2 */
+    {0x1f88, 0x1f8f, -8},    /*  8x ᾈ ..ᾏ  → ᾀ ..ᾇ  Greek2 */
+    {0x1f98, 0x1f9f, -8},    /*  8x ᾘ ..ᾟ  → ᾐ ..ᾗ  Greek2 */
+    {0x1fa8, 0x1faf, -8},    /*  8x ᾨ ..ᾯ  → ᾠ ..ᾧ  Greek2 */
+    {0x1fb8, 0x1fb9, -8},    /*  2x Ᾰ ..Ᾱ  → ᾰ ..ᾱ  Greek2 */
+    {0x1fba, 0x1fbb, -74},   /*  2x Ὰ ..Ά  → ὰ ..ά  Greek2 */
+    {0x1fbc, 0x1fbc, -9},    /*  1x ᾼ ..ᾼ  → ᾳ ..ᾳ  Greek2 */
+    {0x1fc8, 0x1fcb, -86},   /*  4x Ὲ ..Ή  → ὲ ..ή  Greek2 */
+    {0x1fcc, 0x1fcc, -9},    /*  1x ῌ ..ῌ  → ῃ ..ῃ  Greek2 */
+    {0x1fd8, 0x1fd9, -8},    /*  2x Ῐ ..Ῑ  → ῐ ..ῑ  Greek2 */
+    {0x1fda, 0x1fdb, -100},  /*  2x Ὶ ..Ί  → ὶ ..ί  Greek2 */
+    {0x1fe8, 0x1fe9, -8},    /*  2x Ῠ ..Ῡ  → ῠ ..ῡ  Greek2 */
+    {0x1fea, 0x1feb, -112},  /*  2x Ὺ ..Ύ  → ὺ ..ύ  Greek2 */
+    {0x1fec, 0x1fec, -7},    /*  1x Ῥ ..Ῥ  → ῥ ..ῥ  Greek2 */
+    {0x1ff8, 0x1ff9, -128},  /*  2x Ὸ ..Ό  → ὸ ..ό  Greek2 */
+    {0x1ffa, 0x1ffb, -126},  /*  2x Ὼ ..Ώ  → ὼ ..ώ  Greek2 */
+    {0x1ffc, 0x1ffc, -9},    /*  1x ῼ ..ῼ  → ῳ ..ῳ  Greek2 */
+    {0x2126, 0x2126, -7517}, /*  1x Ω ..Ω  → ω ..ω  Letterlike */
+    {0x212a, 0x212a, -8383}, /*  1x K ..K  → k ..k  Letterlike */
+    {0x212b, 0x212b, -8262}, /*  1x Å ..Å  → å ..å  Letterlike */
+    {0x2132, 0x2132, +28}, /*  1x Ⅎ ..Ⅎ  → ⅎ ..ⅎ  Letterlike */
+    {0x2160, 0x216f, +16}, /* 16x Ⅰ ..Ⅿ  → ⅰ ..ⅿ  Numbery */
+    {0x2183, 0x2183, +1},  /*  1x Ↄ ..Ↄ  → ↄ ..ↄ  Numbery */
+    {0x24b6, 0x24cf, +26}, /* 26x Ⓐ ..Ⓩ  → ⓐ ..ⓩ  Enclosed */
+    {0x2c00, 0x2c2e, +48}, /* 47x Ⰰ ..Ⱞ  → ⰰ ..ⱞ  Glagolitic */
+    {0xff21, 0xff3a, +32}, /* 26x Ａ..Ｚ → ａ..ｚ Dubs */
+};
+
+static const int kAstralLower[][3] = {
+    {0x10400, 0x10427, +40}, /* 40x 𐐀 ..𐐧  → 𐐨 ..𐑏  Deseret */
+    {0x104b0, 0x104d3, +40}, /* 36x 𐒰 ..𐓓  → 𐓘 ..𐓻  Osage */
+    {0x1d400, 0x1d419, +26}, /* 26x 𝐀 ..𝐙  → 𝐚 ..𝐳  Math */
+    {0x1d43c, 0x1d44d, +26}, /* 18x 𝐼 ..𝑍  → 𝑖 ..𝑧  Math */
+    {0x1d468, 0x1d481, +26}, /* 26x 𝑨 ..𝒁  → 𝒂 ..𝒛  Math */
+    {0x1d4ae, 0x1d4b5, +26}, /*  8x 𝒮 ..𝒵  → 𝓈 ..𝓏  Math */
+    {0x1d4d0, 0x1d4e9, +26}, /* 26x 𝓐 ..𝓩  → 𝓪 ..𝔃  Math */
+    {0x1d50d, 0x1d514, +26}, /*  8x 𝔍 ..𝔔  → 𝔧 ..𝔮  Math */
+    {0x1d56c, 0x1d585, +26}, /* 26x 𝕬 ..𝖅  → 𝖆 ..𝖟  Math */
+    {0x1d5a0, 0x1d5b9, +26}, /* 26x 𝖠 ..𝖹  → 𝖺 ..𝗓  Math */
+    {0x1d5d4, 0x1d5ed, +26}, /* 26x 𝗔 ..𝗭  → 𝗮 ..𝘇  Math */
+    {0x1d608, 0x1d621, +26}, /* 26x 𝘈 ..𝘡  → 𝘢 ..𝘻  Math */
+    {0x1d63c, 0x1d655, -442}, /* 26x 𝘼 ..𝙕  → 𝒂 ..𝒛  Math */
+    {0x1d670, 0x1d689, +26},  /* 26x 𝙰 ..𝚉  → 𝚊 ..𝚣  Math */
+    {0x1d6a8, 0x1d6b8, +26},  /* 17x 𝚨 ..𝚸  → 𝛂 ..𝛒  Math */
+    {0x1d6e2, 0x1d6f2, +26},  /* 17x 𝛢 ..𝛲  → 𝛼 ..𝜌  Math */
+    {0x1d71c, 0x1d72c, +26},  /* 17x 𝜜 ..𝜬  → 𝜶 ..𝝆  Math */
+    {0x1d756, 0x1d766, +26},  /* 17x 𝝖 ..𝝦  → 𝝰 ..𝞀  Math */
+    {0x1d790, 0x1d7a0, -90},  /* 17x 𝞐 ..𝞠  → 𝜶 ..𝝆  Math */
+};
+
+/**
+ * Converts wide character to lower case.
+ */
+wint_t towlower(wint_t c) {
+  int m, l, r, n;
+  if (c < 0200) {
+    if ('A' <= c && c <= 'Z') {
+      return c + 32;
+    } else {
+      return c;
+    }
+  } else if (c <= 0xffff) {
+    if ((0x0100 <= c && c <= 0x0176) || /* 60x Ā..ā → ā..ŵ Watin-A */
+        (0x01de <= c && c <= 0x01ee) || /*  9x Ǟ..Ǯ → ǟ..ǯ Watin-B */
+        (0x01f8 <= c && c <= 0x021e) || /* 20x Ǹ..Ȟ → ǹ..ȟ Watin-B */
+        (0x0222 <= c && c <= 0x0232) || /*  9x Ȣ..Ȳ → ȣ..ȳ Watin-B */
+        (0x1e00 <= c && c <= 0x1eff)) { /*256x Ḁ..Ỿ → ḁ..ỿ Watin-C */
+      if (c == 0x0130) return c - 199;
+      if (c == 0x1e9e) return c;
+      return c + (~c & 1);
+    } else if (0x01cf <= c && c <= 0x01db) {
+      return c + (c & 1); /* 7x Ǐ..Ǜ → ǐ..ǜ Watin-B */
+    } else if (0x13a0 <= c && c <= 0x13ef) {
+      return c + 38864; /* 80x Ꭰ ..Ꮿ  → ꭰ ..ꮿ  Cherokee */
+    } else {
+      l = 0;
+      r = n = sizeof(kLower) / sizeof(kLower[0]);
+      while (l < r) {
+        m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+        if (kLower[m].y < c) {
+          l = m + 1;
+        } else {
+          r = m;
+        }
+      }
+      if (l < n && kLower[l].x <= c && c <= kLower[l].y) {
+        return c + kLower[l].d;
+      } else {
+        return c;
+      }
+    }
+  } else {
+    l = 0;
+    r = n = sizeof(kAstralLower) / sizeof(kAstralLower[0]);
+    while (l < r) {
+      m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+      if (kAstralLower[m][1] < c) {
+        l = m + 1;
+      } else {
+        r = m;
+      }
+    }
+    if (l < n && kAstralLower[l][0] <= c && c <= kAstralLower[l][1]) {
+      return c + kAstralLower[l][2];
+    } else {
+      return c;
+    }
+  }
+}
+
+__weak_reference(towlower, towlower_l);
diff --git a/libc/str/towupper.c b/libc/str/towupper.c
new file mode 100644
index 000000000..e946cd142
--- /dev/null
+++ b/libc/str/towupper.c
@@ -0,0 +1,199 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/dce.h"
+#include "libc/macros.internal.h"
+#include "libc/str/str.h"
+// clang-format off
+
+static const struct {
+  unsigned short x;
+  unsigned short y;
+  short d;
+} kUpper[] = {
+    {0x00b5, 0x00b5, +743},   /*  1x µ ..µ  → Μ ..Μ  Watin */
+    {0x00e0, 0x00f6, -32},    /* 23x à ..ö  → À ..Ö  Watin */
+    {0x00f8, 0x00fe, -32},    /*  7x ø ..þ  → Ø ..Þ  Watin */
+    {0x00ff, 0x00ff, +121},   /*  1x ÿ ..ÿ  → Ÿ ..Ÿ  Watin */
+    {0x017a, 0x017a, -1},     /*  1x ź ..ź  → Ź ..Ź  Watin-A */
+    {0x017c, 0x017c, -1},     /*  1x ż ..ż  → Ż ..Ż  Watin-A */
+    {0x017e, 0x017e, -1},     /*  1x ž ..ž  → Ž ..Ž  Watin-A */
+    {0x017f, 0x017f, -300},   /*  1x ſ ..ſ  → S ..S  Watin-A */
+    {0x0180, 0x0180, +195},   /*  1x ƀ ..ƀ  → Ƀ ..Ƀ  Watin-B */
+    {0x0183, 0x0183, -1},     /*  1x ƃ ..ƃ  → Ƃ ..Ƃ  Watin-B */
+    {0x0185, 0x0185, -1},     /*  1x ƅ ..ƅ  → Ƅ ..Ƅ  Watin-B */
+    {0x0188, 0x0188, -1},     /*  1x ƈ ..ƈ  → Ƈ ..Ƈ  Watin-B */
+    {0x018c, 0x018c, -1},     /*  1x ƌ ..ƌ  → Ƌ ..Ƌ  Watin-B */
+    {0x0192, 0x0192, -1},     /*  1x ƒ ..ƒ  → Ƒ ..Ƒ  Watin-B */
+    {0x0195, 0x0195, +97},    /*  1x ƕ ..ƕ  → Ƕ ..Ƕ  Watin-B */
+    {0x0199, 0x0199, -1},     /*  1x ƙ ..ƙ  → Ƙ ..Ƙ  Watin-B */
+    {0x019a, 0x019a, +163},   /*  1x ƚ ..ƚ  → Ƚ ..Ƚ  Watin-B */
+    {0x019e, 0x019e, +130},   /*  1x ƞ ..ƞ  → Ƞ ..Ƞ  Watin-B */
+    {0x01a1, 0x01a1, -1},     /*  1x ơ ..ơ  → Ơ ..Ơ  Watin-B */
+    {0x01a3, 0x01a3, -1},     /*  1x ƣ ..ƣ  → Ƣ ..Ƣ  Watin-B */
+    {0x01a5, 0x01a5, -1},     /*  1x ƥ ..ƥ  → Ƥ ..Ƥ  Watin-B */
+    {0x01a8, 0x01a8, -1},     /*  1x ƨ ..ƨ  → Ƨ ..Ƨ  Watin-B */
+    {0x01ad, 0x01ad, -1},     /*  1x ƭ ..ƭ  → Ƭ ..Ƭ  Watin-B */
+    {0x01b0, 0x01b0, -1},     /*  1x ư ..ư  → Ư ..Ư  Watin-B */
+    {0x01b4, 0x01b4, -1},     /*  1x ƴ ..ƴ  → Ƴ ..Ƴ  Watin-B */
+    {0x01b6, 0x01b6, -1},     /*  1x ƶ ..ƶ  → Ƶ ..Ƶ  Watin-B */
+    {0x01b9, 0x01b9, -1},     /*  1x ƹ ..ƹ  → Ƹ ..Ƹ  Watin-B */
+    {0x01bd, 0x01bd, -1},     /*  1x ƽ ..ƽ  → Ƽ ..Ƽ  Watin-B */
+    {0x01bf, 0x01bf, +56},    /*  1x ƿ ..ƿ  → Ƿ ..Ƿ  Watin-B */
+    {0x01c5, 0x01c5, -1},     /*  1x ǅ ..ǅ  → Ǆ ..Ǆ  Watin-B */
+    {0x01c6, 0x01c6, -2},     /*  1x ǆ ..ǆ  → Ǆ ..Ǆ  Watin-B */
+    {0x01c8, 0x01c8, -1},     /*  1x ǈ ..ǈ  → Ǉ ..Ǉ  Watin-B */
+    {0x01c9, 0x01c9, -2},     /*  1x ǉ ..ǉ  → Ǉ ..Ǉ  Watin-B */
+    {0x01cb, 0x01cb, -1},     /*  1x ǋ ..ǋ  → Ǌ ..Ǌ  Watin-B */
+    {0x01cc, 0x01cc, -2},     /*  1x ǌ ..ǌ  → Ǌ ..Ǌ  Watin-B */
+    {0x01ce, 0x01ce, -1},     /*  1x ǎ ..ǎ  → Ǎ ..Ǎ  Watin-B */
+    {0x01dd, 0x01dd, -79},    /*  1x ǝ ..ǝ  → Ǝ ..Ǝ  Watin-B */
+    {0x01f2, 0x01f2, -1},     /*  1x ǲ ..ǲ  → Ǳ ..Ǳ  Watin-B */
+    {0x01f3, 0x01f3, -2},     /*  1x ǳ ..ǳ  → Ǳ ..Ǳ  Watin-B */
+    {0x01f5, 0x01f5, -1},     /*  1x ǵ ..ǵ  → Ǵ ..Ǵ  Watin-B */
+    {0x023c, 0x023c, -1},     /*  1x ȼ ..ȼ  → Ȼ ..Ȼ  Watin-B */
+    {0x023f, 0x0240, +10815}, /*  2x ȿ ..ɀ  → Ȿ ..Ɀ  Watin-B */
+    {0x0242, 0x0242, -1},     /*  1x ɂ ..ɂ  → Ɂ ..Ɂ  Watin-B */
+    {0x0247, 0x0247, -1},     /*  1x ɇ ..ɇ  → Ɇ ..Ɇ  Watin-B */
+    {0x0249, 0x0249, -1},     /*  1x ɉ ..ɉ  → Ɉ ..Ɉ  Watin-B */
+    {0x024b, 0x024b, -1},     /*  1x ɋ ..ɋ  → Ɋ ..Ɋ  Watin-B */
+    {0x024d, 0x024d, -1},     /*  1x ɍ ..ɍ  → Ɍ ..Ɍ  Watin-B */
+    {0x024f, 0x024f, -1},     /*  1x ɏ ..ɏ  → Ɏ ..Ɏ  Watin-B */
+    {0x037b, 0x037d, +130},   /*  3x ͻ ..ͽ  → Ͻ ..Ͽ  Greek */
+    {0x03ac, 0x03ac, -38},    /*  1x ά ..ά  → Ά ..Ά  Greek */
+    {0x03ad, 0x03af, -37},    /*  3x έ ..ί  → Έ ..Ί  Greek */
+    {0x03b1, 0x03c1, -32},    /* 17x α ..ρ  → Α ..Ρ  Greek */
+    {0x03c2, 0x03c2, -31},    /*  1x ς ..ς  → Σ ..Σ  Greek */
+    {0x03c3, 0x03cb, -32},    /*  9x σ ..ϋ  → Σ ..Ϋ  Greek */
+    {0x03cc, 0x03cc, -64},    /*  1x ό ..ό  → Ό ..Ό  Greek */
+    {0x03cd, 0x03ce, -63},    /*  2x ύ ..ώ  → Ύ ..Ώ  Greek */
+    {0x03d0, 0x03d0, -62},    /*  1x ϐ ..ϐ  → Β ..Β  Greek */
+    {0x03d1, 0x03d1, -57},    /*  1x ϑ ..ϑ  → Θ ..Θ  Greek */
+    {0x03d5, 0x03d5, -47},    /*  1x ϕ ..ϕ  → Φ ..Φ  Greek */
+    {0x03d6, 0x03d6, -54},    /*  1x ϖ ..ϖ  → Π ..Π  Greek */
+    {0x03dd, 0x03dd, -1},     /*  1x ϝ ..ϝ  → Ϝ ..Ϝ  Greek */
+    {0x03f0, 0x03f0, -86},    /*  1x ϰ ..ϰ  → Κ ..Κ  Greek */
+    {0x03f1, 0x03f1, -80},    /*  1x ϱ ..ϱ  → Ρ ..Ρ  Greek */
+    {0x03f5, 0x03f5, -96},    /*  1x ϵ ..ϵ  → Ε ..Ε  Greek */
+    {0x0430, 0x044f, -32},    /* 32x а ..я  → А ..Я  Cyrillic */
+    {0x0450, 0x045f, -80},    /* 16x ѐ ..џ  → Ѐ ..Џ  Cyrillic */
+    {0x0461, 0x0461, -1},     /*  1x ѡ ..ѡ  → Ѡ ..Ѡ  Cyrillic */
+    {0x0463, 0x0463, -1},     /*  1x ѣ ..ѣ  → Ѣ ..Ѣ  Cyrillic */
+    {0x0465, 0x0465, -1},     /*  1x ѥ ..ѥ  → Ѥ ..Ѥ  Cyrillic */
+    {0x0473, 0x0473, -1},     /*  1x ѳ ..ѳ  → Ѳ ..Ѳ  Cyrillic */
+    {0x0491, 0x0491, -1},     /*  1x ґ ..ґ  → Ґ ..Ґ  Cyrillic */
+    {0x0499, 0x0499, -1},     /*  1x ҙ ..ҙ  → Ҙ ..Ҙ  Cyrillic */
+    {0x049b, 0x049b, -1},     /*  1x қ ..қ  → Қ ..Қ  Cyrillic */
+    {0x0561, 0x0586, -48},    /* 38x ա ..ֆ  → Ա ..Ֆ  Armenian */
+    {0x10d0, 0x10fa, +3008},  /* 43x ა ..ჺ  → Ა ..Ჺ  Georgian */
+    {0x10fd, 0x10ff, +3008},  /*  3x ჽ ..ჿ  → Ჽ ..Ჿ  Georgian */
+    {0x13f8, 0x13fd, -8},     /*  6x ᏸ ..ᏽ  → Ᏸ ..Ᏽ  Cherokee */
+    {0x214e, 0x214e, -28},    /*  1x ⅎ ..ⅎ  → Ⅎ ..Ⅎ  Letterlike */
+    {0x2170, 0x217f, -16},    /* 16x ⅰ ..ⅿ  → Ⅰ ..Ⅿ  Numbery */
+    {0x2184, 0x2184, -1},     /*  1x ↄ ..ↄ  → Ↄ ..Ↄ  Numbery */
+    {0x24d0, 0x24e9, -26},    /* 26x ⓐ ..ⓩ  → Ⓐ ..Ⓩ  Enclosed */
+    {0x2c30, 0x2c5e, -48},    /* 47x ⰰ ..ⱞ  → Ⰰ ..Ⱞ  Glagolitic */
+    {0x2d00, 0x2d25, -7264},  /* 38x ⴀ ..ⴥ  → Ⴀ ..Ⴥ  Georgian2 */
+    {0x2d27, 0x2d27, -7264},  /*  1x ⴧ ..ⴧ  → Ⴧ ..Ⴧ  Georgian2 */
+    {0x2d2d, 0x2d2d, -7264},  /*  1x ⴭ ..ⴭ  → Ⴭ ..Ⴭ  Georgian2 */
+    {0xff41, 0xff5a, -32},    /* 26x ａ..ｚ → Ａ..Ｚ Dubs */
+};
+
+static const int kAstralUpper[][3] = {
+    {0x10428, 0x1044f, -40},  /* 40x 𐐨..𐑏 → 𐐀..𐐧 Deseret */
+    {0x104d8, 0x104fb, -40},  /* 36x 𐓘..𐓻 → 𐒰..𐓓 Osage */
+    {0x1d41a, 0x1d433, -26},  /* 26x 𝐚..𝐳 → 𝐀..𝐙 Math */
+    {0x1d456, 0x1d467, -26},  /* 18x 𝑖..𝑧 → 𝐼..𝑍 Math */
+    {0x1d482, 0x1d49b, -26},  /* 26x 𝒂..𝒛 → 𝑨..𝒁 Math */
+    {0x1d4c8, 0x1d4cf, -26},  /*  8x 𝓈..𝓏 → 𝒮..𝒵 Math */
+    {0x1d4ea, 0x1d503, -26},  /* 26x 𝓪..𝔃 → 𝓐..𝓩 Math */
+    {0x1d527, 0x1d52e, -26},  /*  8x 𝔧..𝔮 → 𝔍..𝔔 Math */
+    {0x1d586, 0x1d59f, -26},  /* 26x 𝖆..𝖟 → 𝕬..𝖅 Math */
+    {0x1d5ba, 0x1d5d3, -26},  /* 26x 𝖺..𝗓 → 𝖠..𝖹 Math */
+    {0x1d5ee, 0x1d607, -26},  /* 26x 𝗮..𝘇 → 𝗔..𝗭 Math */
+    {0x1d622, 0x1d63b, -26},  /* 26x 𝘢..𝘻 → 𝘈..𝘡 Math */
+    {0x1d68a, 0x1d6a3, +442}, /* 26x 𝒂..𝒛 → 𝘼..𝙕 Math */
+    {0x1d6c2, 0x1d6d2, -26},  /* 26x 𝚊..𝚣 → 𝙰..𝚉 Math */
+    {0x1d6fc, 0x1d70c, -26},  /* 17x 𝛂..𝛒 → 𝚨..𝚸 Math */
+    {0x1d736, 0x1d746, -26},  /* 17x 𝛼..𝜌 → 𝛢..𝛲 Math */
+    {0x1d770, 0x1d780, -26},  /* 17x 𝜶..𝝆 → 𝜜..𝜬 Math */
+    {0x1d770, 0x1d756, -26},  /* 17x 𝝰..𝞀 → 𝝖..𝝦 Math */
+    {0x1d736, 0x1d790, -90},  /* 17x 𝜶..𝝆 → 𝞐..𝞠 Math */
+};
+
+/**
+ * Converts wide character to upper case.
+ */
+wint_t towupper(wint_t c) {
+  int m, l, r, n;
+  if (c < 0200) {
+    if ('a' <= c && c <= 'z') {
+      return c - 32;
+    } else {
+      return c;
+    }
+  } else if (c <= 0xffff) {
+    if ((0x0101 <= c && c <= 0x0177) || /* 60x ā..ŵ → Ā..ā Watin-A */
+        (0x01df <= c && c <= 0x01ef) || /*  9x ǟ..ǯ → Ǟ..Ǯ Watin-B */
+        (0x01f8 <= c && c <= 0x021e) || /* 20x ǹ..ȟ → Ǹ..Ȟ Watin-B */
+        (0x0222 <= c && c <= 0x0232) || /*  9x ȣ..ȳ → Ȣ..Ȳ Watin-B */
+        (0x1e01 <= c && c <= 0x1eff)) { /*256x ḁ..ỿ → Ḁ..Ỿ Watin-C */
+      if (c == 0x0131) return c + 232;
+      if (c == 0x1e9e) return c;
+      return c - (c & 1);
+    } else if (0x01d0 <= c && c <= 0x01dc) {
+      return c - (~c & 1); /* 7x ǐ..ǜ → Ǐ..Ǜ Watin-B */
+    } else if (0xab70 <= c && c <= 0xabbf) {
+      return c - 38864; /* 80x ꭰ ..ꮿ  → Ꭰ ..Ꮿ  Cherokee Supplement */
+    } else {
+      l = 0;
+      r = n = sizeof(kUpper) / sizeof(kUpper[0]);
+      while (l < r) {
+        m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+        if (kUpper[m].y < c) {
+          l = m + 1;
+        } else {
+          r = m;
+        }
+      }
+      if (l < n && kUpper[l].x <= c && c <= kUpper[l].y) {
+        return c + kUpper[l].d;
+      } else {
+        return c;
+      }
+    }
+  } else {
+    l = 0;
+    r = n = sizeof(kAstralUpper) / sizeof(kAstralUpper[0]);
+    while (l < r) {
+      m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+      if (kAstralUpper[m][1] < c) {
+        l = m + 1;
+      } else {
+        r = m;
+      }
+    }
+    if (l < n && kAstralUpper[l][0] <= c && c <= kAstralUpper[l][1]) {
+      return c + kAstralUpper[l][2];
+    } else {
+      return c;
+    }
+  }
+}
+
+__weak_reference(towupper, towupper_l);
diff --git a/libc/str/tprecode16to8.c b/libc/str/tprecode16to8.c
index d23eb0b5d..9bea83682 100644
--- a/libc/str/tprecode16to8.c
+++ b/libc/str/tprecode16to8.c
@@ -18,55 +18,35 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/fmt/conv.h"
+#include "libc/intrin/packsswb.h"
+#include "libc/intrin/pandn.h"
+#include "libc/intrin/pcmpgtw.h"
+#include "libc/intrin/pmovmskb.h"
 #include "libc/str/str.h"
 #include "libc/str/utf16.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/emmintrin.internal.h"
 
-#if !IsModeDbg()
-#if defined(__x86_64__)
+static const int16_t kDel16[8] = {127, 127, 127, 127, 127, 127, 127, 127};
 
+/* 10x speedup for ascii */
 static axdx_t tprecode16to8_sse2(char *dst, size_t dstsize, const char16_t *src,
                                  axdx_t r) {
-  __m128i v1, v2, v3, vz;
-  vz = _mm_setzero_si128();
+  int16_t v1[8], v2[8], v3[8], vz[8];
+  memset(vz, 0, 16);
   while (r.ax + 8 < dstsize) {
-    v1 = _mm_loadu_si128((__m128i *)(src + r.dx));
-    v2 = _mm_cmpgt_epi16(v1, vz);
-    v3 = _mm_cmpgt_epi16(v1, _mm_set1_epi16(0x7F));
-    v2 = _mm_andnot_si128(v3, v2);
-    if (_mm_movemask_epi8(v2) != 0xFFFF)
+    memcpy(v1, src + r.dx, 16);
+    pcmpgtw(v2, v1, vz);
+    pcmpgtw(v3, v1, kDel16);
+    pandn((void *)v2, (void *)v3, (void *)v2);
+    if (pmovmskb((void *)v2) != 0xFFFF)
       break;
-    v1 = _mm_packs_epi16(v1, v1);
-    _mm_storel_epi64((__m128i *)(dst + r.ax), v1);
+    packsswb((void *)v1, v1, v1);
+    memcpy(dst + r.ax, v1, 8);
     r.ax += 8;
     r.dx += 8;
   }
   return r;
 }
 
-#elif defined(__aarch64__)
-
-static axdx_t tprecode16to8_neon(char *dst, size_t dstsize, const char16_t *src,
-                                 axdx_t r) {
-  uint16x8_t v1, v2, v3;
-  while (r.ax + 8 < dstsize) {
-    v1 = vld1q_u16((const uint16_t *)(src + r.dx));
-    v2 = vcgtq_u16(v1, vdupq_n_u16(0));
-    v3 = vcgtq_u16(v1, vdupq_n_u16(0x7F));
-    v2 = vbicq_u16(v2, v3);
-    if (vaddvq_u16(v2) != 8 * 0xFFFF)
-      break;
-    vst1_u8((uint8_t *)(dst + r.ax), vqmovn_u16(v1));
-    r.ax += 8;
-    r.dx += 8;
-  }
-  return r;
-}
-
-#endif
-#endif
-
 /**
  * Transcodes UTF-16 to UTF-8.
  *
@@ -86,14 +66,10 @@ axdx_t tprecode16to8(char *dst, size_t dstsize, const char16_t *src) {
   r.ax = 0;
   r.dx = 0;
   for (;;) {
-#if !IsModeDbg()
-#if defined(__x86_64__)
-    if (!((uintptr_t)(src + r.dx) & 15))
+#if defined(__x86_64__) && !IsModeDbg() && !IsTiny()
+    if (!((uintptr_t)(src + r.dx) & 15)) {
       r = tprecode16to8_sse2(dst, dstsize, src, r);
-#elif defined(__aarch64__)
-    if (!((uintptr_t)(src + r.dx) & 15))
-      r = tprecode16to8_neon(dst, dstsize, src, r);
-#endif
+    }
 #endif
     if (!(x = src[r.dx++]))
       break;
diff --git a/libc/str/tprecode8to16.c b/libc/str/tprecode8to16.c
index 2924184f8..d823f3163 100644
--- a/libc/str/tprecode8to16.c
+++ b/libc/str/tprecode8to16.c
@@ -16,61 +16,34 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <arm_neon.h>
-#include <stdint.h>
-#include <string.h>
-#include "libc/dce.h"
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/intrin/pmovmskb.h"
+#include "libc/intrin/punpckhbw.h"
+#include "libc/intrin/punpcklbw.h"
 #include "libc/str/str.h"
 #include "libc/str/thompike.h"
 #include "libc/str/utf16.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/emmintrin.internal.h"
-
-#if !IsModeDbg()
-#if defined(__x86_64__)
 
+// 34x speedup for ascii
 static inline axdx_t tprecode8to16_sse2(char16_t *dst, size_t dstsize,
                                         const char *src, axdx_t r) {
-  __m128i v1, v2, vz;
-  vz = _mm_setzero_si128();
+  uint8_t v1[16], v2[16], vz[16];
+  memset(vz, 0, 16);
   while (r.ax + 16 < dstsize) {
-    v1 = _mm_loadu_si128((__m128i *)(src + r.dx));
-    v2 = _mm_cmpgt_epi8(v1, vz);
-    if (_mm_movemask_epi8(v2) != 0xFFFF)
+    memcpy(v1, src + r.dx, 16);
+    pcmpgtb((int8_t *)v2, (int8_t *)v1, (int8_t *)vz);
+    if (pmovmskb(v2) != 0xFFFF)
       break;
-    __m128i lo = _mm_unpacklo_epi8(v1, vz);
-    __m128i hi = _mm_unpackhi_epi8(v1, vz);
-    _mm_storeu_si128((__m128i *)(dst + r.ax), lo);
-    _mm_storeu_si128((__m128i *)(dst + r.ax + 8), hi);
+    punpcklbw(v2, v1, vz);
+    punpckhbw(v1, v1, vz);
+    memcpy(dst + r.ax + 0, v2, 16);
+    memcpy(dst + r.ax + 8, v1, 16);
     r.ax += 16;
     r.dx += 16;
   }
   return r;
 }
 
-#elif defined(__aarch64__)
-
-static inline axdx_t tprecode8to16_neon(char16_t *dst, size_t dstsize,
-                                        const char *src, axdx_t r) {
-  uint8x16_t v1;
-  while (r.ax + 16 < dstsize) {
-    v1 = vld1q_u8((const uint8_t *)(src + r.dx));
-    uint8x16_t cmp = vcgtq_u8(v1, vdupq_n_u8(0));
-    if (vaddvq_u8(cmp) != 16 * 0xFF)
-      break;
-    uint16x8_t lo = vmovl_u8(vget_low_u8(v1));
-    uint16x8_t hi = vmovl_u8(vget_high_u8(v1));
-    vst1q_u16((uint16_t *)(dst + r.ax), lo);
-    vst1q_u16((uint16_t *)(dst + r.ax + 8), hi);
-    r.ax += 16;
-    r.dx += 16;
-  }
-  return r;
-}
-
-#endif
-#endif
-
 /**
  * Transcodes UTF-8 to UTF-16.
  *
@@ -91,14 +64,10 @@ axdx_t tprecode8to16(char16_t *dst, size_t dstsize, const char *src) {
   r.ax = 0;
   r.dx = 0;
   for (;;) {
-#if !IsModeDbg()
-#if defined(__x86_64__)
-    if (!((uintptr_t)(src + r.dx) & 15))
+#if defined(__x86_64__) && !IsModeDbg()
+    if (!((uintptr_t)(src + r.dx) & 15)) {
       r = tprecode8to16_sse2(dst, dstsize, src, r);
-#elif defined(__aarch64__)
-    if (!((uintptr_t)(src + r.dx) & 15))
-      r = tprecode8to16_neon(dst, dstsize, src, r);
-#endif
+    }
 #endif
     x = src[r.dx++] & 0377;
     if (x >= 0300) {
diff --git a/libc/str/uselocale.c b/libc/str/uselocale.c
new file mode 100644
index 000000000..408c1ce5d
--- /dev/null
+++ b/libc/str/uselocale.c
@@ -0,0 +1,25 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/locale.h"
+#include "libc/sysv/errfuns.h"
+
+locale_t uselocale(locale_t l) {
+  // TODO: implement me!
+  return 0;
+}
diff --git a/third_party/musl/wcrtomb.c b/libc/str/wcrtomb.c
similarity index 70%
rename from third_party/musl/wcrtomb.c
rename to libc/str/wcrtomb.c
index 39690948f..1596c63e6 100644
--- a/third_party/musl/wcrtomb.c
+++ b/libc/str/wcrtomb.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,41 +25,41 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdlib.h>
-#include <wchar.h>
-#include <errno.h>
-#include "multibyte.h"
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/str/mb.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
-{
-	if (!s) return 1;
-	if ((unsigned)wc < 0x80) {
-		*s = wc;
-		return 1;
-	} else if (MB_CUR_MAX == 1) {
-		if (!IS_CODEUNIT(wc)) {
-			errno = EILSEQ;
-			return -1;
-		}
-		*s = wc;
-		return 1;
-	} else if ((unsigned)wc < 0x800) {
-		*s++ = 0xc0 | (wc>>6);
-		*s = 0x80 | (wc&0x3f);
-		return 2;
-	} else if ((unsigned)wc < 0xd800 || (unsigned)wc-0xe000 < 0x2000) {
-		*s++ = 0xe0 | (wc>>12);
-		*s++ = 0x80 | ((wc>>6)&0x3f);
-		*s = 0x80 | (wc&0x3f);
-		return 3;
-	} else if ((unsigned)wc-0x10000 < 0x100000) {
-		*s++ = 0xf0 | (wc>>18);
-		*s++ = 0x80 | ((wc>>12)&0x3f);
-		*s++ = 0x80 | ((wc>>6)&0x3f);
-		*s = 0x80 | (wc&0x3f);
-		return 4;
-	}
-	errno = EILSEQ;
-	return -1;
+size_t wcrtomb(char *s, wchar_t wc, mbstate_t *st) {
+  if (!s)
+    return 1;
+  if ((unsigned)wc < 0x80) {
+    *s = wc;
+    return 1;
+  } else if (MB_CUR_MAX == 1) {
+    if (!IS_CODEUNIT(wc)) {
+      errno = EILSEQ;
+      return -1;
+    }
+    *s = wc;
+    return 1;
+  } else if ((unsigned)wc < 0x800) {
+    *s++ = 0xc0 | (wc >> 6);
+    *s = 0x80 | (wc & 0x3f);
+    return 2;
+  } else if ((unsigned)wc < 0xd800 || (unsigned)wc - 0xe000 < 0x2000) {
+    *s++ = 0xe0 | (wc >> 12);
+    *s++ = 0x80 | ((wc >> 6) & 0x3f);
+    *s = 0x80 | (wc & 0x3f);
+    return 3;
+  } else if ((unsigned)wc - 0x10000 < 0x100000) {
+    *s++ = 0xf0 | (wc >> 18);
+    *s++ = 0x80 | ((wc >> 12) & 0x3f);
+    *s++ = 0x80 | ((wc >> 6) & 0x3f);
+    *s = 0x80 | (wc & 0x3f);
+    return 4;
+  }
+  errno = EILSEQ;
+  return -1;
 }
diff --git a/third_party/musl/wcscasecmp.c b/libc/str/wcscasecmp.c
similarity index 100%
rename from third_party/musl/wcscasecmp.c
rename to libc/str/wcscasecmp.c
diff --git a/third_party/musl/wcsncasecmp.c b/libc/str/wcsncasecmp.c
similarity index 100%
rename from third_party/musl/wcsncasecmp.c
rename to libc/str/wcsncasecmp.c
diff --git a/third_party/musl/duplocale.c b/libc/str/wcsnrtombs.c
similarity index 74%
rename from third_party/musl/duplocale.c
rename to libc/str/wcsnrtombs.c
index 7633f6e4b..549a706f9 100644
--- a/third_party/musl/duplocale.c
+++ b/libc/str/wcsnrtombs.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,21 +25,43 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/runtime/runtime.h"
-#include "libc/str/locale.internal.h"
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/str/mb.internal.h"
 #include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-#define malloc _mapanon
-#define calloc undef
-#define realloc undef
-#define free undef
-
-locale_t duplocale(locale_t old)
-{
-	locale_t new = malloc(sizeof *new);
-	if (!new) return 0;
-	if (old == LC_GLOBAL_LOCALE) old = &__global_locale;
-	*new = *old;
-	return new;
+size_t wcsnrtombs(char *dst, const wchar_t **wcs, size_t wn, size_t n,
+                  mbstate_t *st) {
+  const wchar_t *ws = *wcs;
+  size_t cnt = 0;
+  if (!dst)
+    n = 0;
+  while (ws && wn) {
+    char tmp[MB_LEN_MAX];
+    size_t l = wcrtomb(n < MB_LEN_MAX ? tmp : dst, *ws, 0);
+    if (l == -1) {
+      cnt = -1;
+      break;
+    }
+    if (dst) {
+      if (n < MB_LEN_MAX) {
+        if (l > n)
+          break;
+        memcpy(dst, tmp, l);
+      }
+      dst += l;
+      n -= l;
+    }
+    if (!*ws) {
+      ws = 0;
+      break;
+    }
+    ws++;
+    wn--;
+    cnt += l;
+  }
+  if (dst)
+    *wcs = ws;
+  return cnt;
 }
diff --git a/third_party/musl/__mo_lookup.c b/libc/str/wcsrtombs.c
similarity index 66%
rename from third_party/musl/__mo_lookup.c
rename to libc/str/wcsrtombs.c
index 1ef350aec..70d115684 100644
--- a/third_party/musl/__mo_lookup.c
+++ b/libc/str/wcsrtombs.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,48 +25,66 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/str/mb.internal.h"
 #include "libc/str/str.h"
-#include "libc/str/locale.internal.h"
 __static_yoink("musl_libc_notice");
 
-#pragma GCC diagnostic ignored "-Wparentheses"
-
-static inline uint32_t swapc(uint32_t x, int c)
-{
-	return c ? x>>24 | x>>8&0xff00 | x<<8&0xff0000 | x<<24 : x;
-}
-
-const char *__mo_lookup(const void *p, size_t size, const char *s)
-{
-	const uint32_t *mo = p;
-	int sw = *mo - 0x950412de;
-	uint32_t b = 0, n = swapc(mo[2], sw);
-	uint32_t o = swapc(mo[3], sw);
-	uint32_t t = swapc(mo[4], sw);
-	if (n>=size/4 || o>=size-4*n || t>=size-4*n || ((o|t)%4))
-		return 0;
-	o/=4;
-	t/=4;
-	for (;;) {
-		uint32_t ol = swapc(mo[o+2*(b+n/2)], sw);
-		uint32_t os = swapc(mo[o+2*(b+n/2)+1], sw);
-		if (os >= size || ol >= size-os || ((char *)p)[os+ol])
-			return 0;
-		int sign = strcmp(s, (char *)p + os);
-		if (!sign) {
-			uint32_t tl = swapc(mo[t+2*(b+n/2)], sw);
-			uint32_t ts = swapc(mo[t+2*(b+n/2)+1], sw);
-			if (ts >= size || tl >= size-ts || ((char *)p)[ts+tl])
-				return 0;
-			return (char *)p + ts;
-		}
-		else if (n == 1) return 0;
-		else if (sign < 0)
-			n /= 2;
-		else {
-			b += n/2;
-			n -= n/2;
-		}
-	}
-	return 0;
+size_t wcsrtombs(char *s, const wchar_t **ws, size_t n, mbstate_t *st) {
+  const wchar_t *ws2;
+  char buf[4];
+  size_t N = n, l;
+  if (!s) {
+    for (n = 0, ws2 = *ws; *ws2; ws2++) {
+      if (*ws2 >= 0x80u) {
+        l = wcrtomb(buf, *ws2, 0);
+        if (!(l + 1))
+          return -1;
+        n += l;
+      } else
+        n++;
+    }
+    return n;
+  }
+  while (n >= 4) {
+    if (**ws - 1u >= 0x7fu) {
+      if (!**ws) {
+        *s = 0;
+        *ws = 0;
+        return N - n;
+      }
+      l = wcrtomb(s, **ws, 0);
+      if (!(l + 1))
+        return -1;
+      s += l;
+      n -= l;
+    } else {
+      *s++ = **ws;
+      n--;
+    }
+    (*ws)++;
+  }
+  while (n) {
+    if (**ws - 1u >= 0x7fu) {
+      if (!**ws) {
+        *s = 0;
+        *ws = 0;
+        return N - n;
+      }
+      l = wcrtomb(buf, **ws, 0);
+      if (!(l + 1))
+        return -1;
+      if (l > n)
+        return N - n;
+      wcrtomb(s, **ws, 0);
+      s += l;
+      n -= l;
+    } else {
+      *s++ = **ws;
+      n--;
+    }
+    (*ws)++;
+  }
+  return N;
 }
diff --git a/libc/str/wcsstr.c b/libc/str/wcsstr.c
index bbf064c59..1867ecd93 100644
--- a/libc/str/wcsstr.c
+++ b/libc/str/wcsstr.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/kmp.h"
 #include "libc/str/str.h"
 
 /**
@@ -29,5 +28,19 @@
  * @see memmem()
  */
 wchar_t *wcsstr(const wchar_t *haystack, const wchar_t *needle) {
-  return __memmem_kmp32(haystack, wcslen(haystack), needle, wcslen(needle));
+  size_t i;
+  for (;;) {
+    for (i = 0;;) {
+      if (!needle[i])
+        return (/*unconst*/ wchar_t *)haystack;
+      if (!haystack[i])
+        break;
+      if (needle[i] != haystack[i])
+        break;
+      ++i;
+    }
+    if (!*haystack++)
+      break;
+  }
+  return NULL;
 }
diff --git a/libc/str/wcstombs.c b/libc/str/wcstombs.c
new file mode 100644
index 000000000..b269fce52
--- /dev/null
+++ b/libc/str/wcstombs.c
@@ -0,0 +1,23 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
+
+size_t wcstombs(char *s, const wchar_t *ws, size_t n) {
+  return wcsrtombs(s, &(const wchar_t *){ws}, n, 0);
+}
diff --git a/third_party/musl/wctob.c b/libc/str/wctob.c
similarity index 86%
rename from third_party/musl/wctob.c
rename to libc/str/wctob.c
index 425aa9ad4..4fba0c5fc 100644
--- a/third_party/musl/wctob.c
+++ b/libc/str/wctob.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  Musl Libc                                                                   │
@@ -25,15 +25,16 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wchar.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "multibyte.h"
+#include "libc/limits.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/mb.internal.h"
+#include "libc/str/str.h"
 __static_yoink("musl_libc_notice");
 
-int wctob(wint_t c)
-{
-	if (c < 128U) return c;
-	if (MB_CUR_MAX==1 && IS_CODEUNIT(c)) return (unsigned char)c;
-	return EOF;
+int wctob(wint_t c) {
+  if (c < 128U)
+    return c;
+  if (MB_CUR_MAX == 1 && IS_CODEUNIT(c))
+    return (unsigned char)c;
+  return EOF;
 }
diff --git a/libc/str/wctomb.c b/libc/str/wctomb.c
new file mode 100644
index 000000000..e2ca942d1
--- /dev/null
+++ b/libc/str/wctomb.c
@@ -0,0 +1,26 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/limits.h"
+#include "libc/str/str.h"
+
+int wctomb(char *s, wchar_t wc) {
+  if (!s)
+    return 0;
+  return wcrtomb(s, wc, 0);
+}
diff --git a/libc/str/wctrans.c b/libc/str/wctrans.c
new file mode 100644
index 000000000..19c4fa376
--- /dev/null
+++ b/libc/str/wctrans.c
@@ -0,0 +1,28 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
+#include "libc/wctype.h"
+
+wctrans_t wctrans(const char *s) {
+  if (!strcmp(s, "toupper"))
+    return (wctrans_t)1;
+  if (!strcmp(s, "tolower"))
+    return (wctrans_t)2;
+  return 0;
+}
diff --git a/libc/str/wctype.c b/libc/str/wctype.c
index e20ef27be..20516dc47 100644
--- a/libc/str/wctype.c
+++ b/libc/str/wctype.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/wctype.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/serialize.h"
 
 static const char kWcTypeNames[][8] = {
diff --git a/libc/str/wcwidth.c b/libc/str/wcwidth.c
index 379a11454..66a7a9113 100644
--- a/libc/str/wcwidth.c
+++ b/libc/str/wcwidth.c
@@ -1,61 +1,44 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
 │                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/likely.h"
 #include "libc/str/unicode.h"
-__static_yoink("musl_libc_notice");
-// clang-format off
+#include "libc/str/wcwidth_osx.internal.h"
+#include "libc/wctype.h"
 
-static const unsigned char table[] = {
-#include "nonspacing.inc"
-};
-
-static const unsigned char wtable[] = {
-#include "wide.inc"
-};
-
-int wcwidth(wchar_t wc)
-{
-	if ((int)wc < 0xff) {
-		if ((int)wc >= 0)
-			return ((wc+1) & 0x7f) >= 0x21 ? 1 : wc ? -1 : 0;
-		return -1;
-	}
-	if ((wc & 0xfffeffffU) < 0xfffe) {
-		if ((table[table[wc>>8]*32+((wc&255)>>3)]>>(wc&7))&1)
-			return 0;
-		if ((wtable[wtable[wc>>8]*32+((wc&255)>>3)]>>(wc&7))&1)
-			return 2;
-		return 1;
-	}
-	if ((wc & 0xfffe) == 0xfffe)
-		return -1;
-	if (wc-0x20000U < 0x20000)
-		return 2;
-	if (wc == 0xe0001 || wc-0xe0020U < 0x5f || wc-0xe0100U < 0xef)
-		return 0;
-	return 1;
+/**
+ * Returns cell width of monospace character.
+ */
+int wcwidth(wchar_t c) {
+  int res;
+  if (LIKELY(32 <= c && c < 127))
+    return 1;
+  if (VERY_UNLIKELY((uint32_t)c >= 0x100000)) {
+    if ((uint32_t)c <= 0x10FFFD)
+      return 1;
+    return -1;
+  }
+  res = _wcwidth_osx(c);
+  if (VERY_UNLIKELY(!res)) {
+    if (!c)
+      return 0;
+    if (iswcntrl(c))
+      return -1;
+  }
+  return res;
 }
diff --git a/libc/str/wcwidth_osx.c b/libc/str/wcwidth_osx.c
new file mode 100644
index 000000000..6a2b64c01
--- /dev/null
+++ b/libc/str/wcwidth_osx.c
@@ -0,0 +1,238 @@
+// Copyright (c) 2012 Byron Lai
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "libc/macros.internal.h"
+#include "libc/str/wcwidth_osx.internal.h"
+
+const uint8_t kWcwidthOsxIndex1[] = {
+    0,   16,  26,  33,  34,  50,  56,  72,  88,  104, 107, 107, 107, 107,
+    115, 127, 143, 143, 143, 143, 143, 156, 160, 164, 178, 178, 178, 178,
+    178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178,
+    178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178,
+    178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178,
+    178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178,
+    178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178,
+    178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178,
+    178, 178, 178, 178, 178, 178, 178, 178, 194, 194, 194, 194, 194, 194,
+    194, 195, 211, 211, 211, 211, 211, 211, 211, 212,
+};
+
+const uint16_t kWcwidthOsxIndex2[] = {
+    0,   8,   22,  38,  54,  70,  86,  102, 118, 134, 150, 163, 179, 195, 211,
+    227, 243, 256, 272, 284, 299, 305, 321, 336, 352, 368, 376, 376, 376, 376,
+    376, 376, 379, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393,
+    393, 393, 393, 393, 396, 412, 412, 424, 439, 455, 471, 487, 487, 487, 487,
+    487, 487, 487, 487, 487, 487, 487, 490, 504, 504, 504, 504, 520, 520, 520,
+    520, 520, 520, 520, 520, 520, 520, 520, 520, 529, 544, 559, 575, 591, 607,
+    623, 629, 645, 661, 664, 664, 664, 664, 664, 664, 664, 664, 664, 664, 680,
+    685, 701, 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, 705,
+    705, 705, 705, 721, 737, 753, 764, 780, 780, 780, 780, 780, 780, 780, 780,
+    796, 801, 801, 801, 801, 801, 801, 801, 817, 817, 817, 817, 817, 817, 817,
+    817, 817, 817, 817, 817, 817, 817, 817, 817, 827, 834, 834, 834, 834, 834,
+    834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 850, 866, 867, 867,
+    867, 867, 867, 867, 867, 867, 867, 867, 867, 867, 867, 867, 867, 867, 883,
+    883, 883, 883, 883, 883, 883, 883, 883, 883, 883, 883, 883, 883, 883, 883,
+    884, 900, 900, 900, 900, 900, 900, 900, 900, 900, 900, 900, 900, 900, 900,
+    900, 900, 901,
+};
+
+const uint32_t kWcwidthOsxIndex3[] = {
+    0,    32,   32,   33,   64,   96,   96,   96,   96,   96,   96,   96,
+    96,   96,   96,   96,   128,  128,  128,  144,  175,  205,  208,  208,
+    208,  208,  237,  247,  247,  247,  247,  275,  292,  316,  340,  351,
+    381,  402,  428,  457,  478,  510,  527,  527,  537,  564,  582,  600,
+    619,  632,  632,  658,  690,  711,  738,  738,  738,  738,  738,  738,
+    738,  738,  767,  773,  804,  834,  866,  889,  920,  951,  980,  1003,
+    1034, 1065, 1094, 1117, 1148, 1180, 1210, 1233, 1263, 1294, 1323, 1355,
+    1384, 1410, 1441, 1464, 1495, 1527, 1559, 1582, 1611, 1643, 1673, 1696,
+    1727, 1759, 1791, 1817, 1849, 1881, 1912, 1927, 1958, 1986, 2017, 2049,
+    2081, 2111, 2143, 2169, 2195, 2214, 2240, 2252, 2282, 2303, 2335, 2354,
+    2380, 2406, 2412, 2442, 2468, 2484, 2516, 2516, 2522, 2554, 2554, 2554,
+    2554, 2554, 2586, 2586, 2609, 2641, 2664, 2680, 2710, 2734, 2749, 2773,
+    2778, 2810, 2813, 2845, 2845, 2856, 2887, 2888, 2888, 2888, 2888, 2888,
+    2888, 2888, 2888, 2888, 2888, 2888, 2888, 2888, 2888, 2888, 2897, 2929,
+    2961, 2961, 2976, 3008, 3040, 3072, 3104, 3136, 3148, 3178, 3210, 3242,
+    3274, 3274, 3282, 3314, 3337, 3348, 3348, 3380, 3409, 3441, 3459, 3491,
+    3513, 3535, 3565, 3574, 3606, 3606, 3606, 3606, 3606, 3606, 3606, 3634,
+    3646, 3676, 3697, 3729, 3750, 3776, 3776, 3808, 3816, 3830, 3843, 3875,
+    3875, 3875, 3875, 3907, 3907, 3907, 3907, 3907, 3907, 3939, 3964, 3996,
+    3996, 3996, 3996, 3996, 3996, 3996, 3996, 4006, 4038, 4064, 4095, 4127,
+    4138, 4154, 4183, 4215, 4239, 4254, 4286, 4306, 4338, 4360, 4376, 4408,
+    4408, 4424, 4443, 4466, 4482, 4482, 4482, 4482, 4482, 4482, 4482, 4482,
+    4482, 4505, 4516, 4516, 4516, 4516, 4516, 4540, 4572, 4597, 4629, 4661,
+    4661, 4661, 4661, 4661, 4661, 4661, 4661, 4661, 4661, 4661, 4661, 4661,
+    4663, 4695, 4723, 4727, 4758, 4782, 4802, 4833, 4844, 4868, 4888, 4904,
+    4904, 4904, 4904, 4904, 4904, 4904, 4904, 4904, 4904, 4904, 4904, 4904,
+    4904, 4904, 4904, 4923, 4944, 4944, 4944, 4944, 4944, 4976, 4993, 5009,
+    5024, 5056, 5056, 5056, 5077, 5102, 5128, 5144, 5170, 5202, 5234, 5234,
+    5266, 5281, 5298, 5298, 5330, 5357, 5357, 5369, 5401, 5401, 5401, 5401,
+    5401, 5401, 5411, 5433, 5465, 5487, 5519, 5520, 5529, 5556, 5556, 5556,
+    5588, 5606, 5623, 5623, 5640, 5656, 5664, 5680, 5696, 5728, 5756, 5772,
+    5772, 5772, 5772, 5773, 5805, 5805, 5805, 5805, 5805, 5805, 5805, 5805,
+    5805, 5805, 5805, 5805, 5805, 5805, 5805, 5805, 5815, 5847, 5847, 5847,
+    5847, 5847, 5847, 5847, 5847, 5847, 5847, 5847, 5847, 5847, 5847, 5847,
+    5847, 5851, 5879, 5879, 5911, 5911, 5911, 5911, 5911, 5911, 5911, 5911,
+    5911, 5911, 5911, 5911, 5911, 5911, 5911, 5911, 5930, 5946, 5971, 5978,
+    6010, 6010, 6010, 6010, 6010, 6010, 6010, 6010, 6030, 6062, 6094, 6122,
+    6146, 6146, 6146, 6178, 6178, 6178, 6178, 6197, 6210, 6210, 6215, 6245,
+    6272, 6304, 6312, 6344, 6344, 6371, 6397, 6429, 6429, 6441, 6473, 6473,
+    6473, 6473, 6473, 6505, 6514, 6546, 6578, 6578, 6578, 6578, 6578, 6578,
+    6578, 6578, 6578, 6578, 6578, 6578, 6578, 6610, 6610, 6610, 6610, 6610,
+    6610, 6610, 6610, 6610, 6610, 6610, 6610, 6610, 6610, 6610, 6610, 6638,
+    6642, 6642, 6642, 6642, 6642, 6642, 6642, 6642, 6642, 6642, 6642, 6642,
+    6642, 6642, 6642, 6642, 6674, 6674, 6674, 6674, 6674, 6674, 6674, 6674,
+    6674, 6674, 6674, 6674, 6674, 6674, 6674, 6674, 6690, 6722, 6722, 6722,
+    6722, 6722, 6722, 6722, 6722, 6740, 6756, 6777, 6793, 6793, 6799, 6825,
+    6857, 6888, 6920, 6926, 6926, 6940, 6958, 6977, 6977, 6977, 6977, 6977,
+    6977, 6977, 6977, 6977, 6977, 7009, 7025, 7041, 7059, 7083, 7099, 7129,
+    7157, 7173, 7198, 7220, 7220, 7220, 7223, 7254, 7255, 7255, 7286, 7287,
+    7288, 7319, 7351, 7383, 7388, 7419, 7449, 7481, 7481, 7481, 7486, 7518,
+    7530, 7553, 7553, 7574, 7602, 7618, 7634, 7664, 7664, 7664, 7664, 7696,
+    7728, 7743, 7760, 7792, 7819, 7840, 7851, 7883, 7914, 7942, 7964, 7996,
+    7996, 7996, 7996, 7998, 8018, 8028, 8028, 8028, 8028, 8028, 8028, 8028,
+    8028, 8028, 8028, 8028, 8028, 8028, 8028, 8028, 8028, 8060, 8070, 8102,
+    8102, 8102, 8102, 8102, 8102, 8134, 8166, 8166, 8166, 8166, 8166, 8166,
+    8166, 8198, 8223, 8255, 8280, 8280, 8280, 8280, 8280, 8280, 8280, 8280,
+    8280, 8280, 8280, 8280, 8280, 8280, 8280, 8280, 8312, 8312, 8312, 8312,
+    8312, 8312, 8312, 8312, 8312, 8312, 8312, 8312, 8312, 8312, 8312, 8312,
+    8329, 8344, 8344, 8344, 8344, 8376, 8376, 8376, 8405, 8425, 8425, 8425,
+    8425, 8425, 8425, 8425, 8425, 8425, 8425, 8425, 8425, 8425, 8425, 8425,
+    8425, 8457, 8457, 8457, 8457, 8457, 8457, 8457, 8467, 8499, 8524, 8533,
+    8558, 8587, 8609, 8623, 8653, 8685, 8685, 8715, 8721, 8721, 8721, 8721,
+    8721, 8753, 8753, 8762, 8767, 8785, 8785, 8785, 8785, 8817, 8817, 8828,
+    8850, 8853, 8885, 8914, 8919, 8945, 8975, 9007, 9025, 9025, 9025, 9025,
+    9025, 9051, 9059, 9059, 9059, 9059, 9059, 9059, 9059, 9059, 9079, 9093,
+    9125, 9125, 9125, 9125, 9125, 9125, 9125, 9125, 9125, 9125, 9125, 9125,
+    9125, 9125, 9125, 9125, 9157, 9177, 9193, 9193, 9205, 9225, 9225, 9225,
+    9225, 9225, 9225, 9225, 9225, 9225, 9225, 9225, 9225, 9225, 9225, 9225,
+    9225, 9257, 9257, 9257, 9257, 9257, 9257, 9257, 9257, 9257, 9257, 9257,
+    9257, 9257, 9257, 9257, 9257, 9266, 9289, 9289, 9289, 9289, 9289, 9289,
+    9289, 9289, 9289, 9289, 9289, 9289, 9289, 9289, 9289, 9289, 9321, 9321,
+    9321, 9321, 9321, 9321, 9321, 9321, 9321, 9321, 9321, 9321, 9321, 9321,
+    9321, 9321, 9323, 9353, 9353, 9353, 9353, 9353, 9353, 9353, 9353, 9353,
+    9353, 9353, 9353, 9353, 9353, 9353, 9353, 9385, 9385, 9385, 9385, 9385,
+    9385, 9385, 9385, 9385, 9385, 9385, 9385, 9385, 9385, 9385, 9385, 9387,
+    9419, 9419, 9419, 9419, 9419, 9419, 9419, 9419, 9419, 9419, 9419, 9419,
+    9419, 9419, 9419, 9419, 9421,
+};
+
+const uint32_t kWcwidthOsx[] = {
+    0x00000000, 0x00000000, 0x55555555, 0x55555555, 0x00000000, 0x00000000,
+    0x55555555, 0x55555555, 0x00000000, 0x00000000, 0x15505555, 0x54455540,
+    0x15555555, 0x55555555, 0x55555555, 0x55554000, 0x55555555, 0x00001555,
+    0x55555500, 0x54155555, 0x55555555, 0x14555555, 0x00000000, 0x04000000,
+    0x54000041, 0x01555555, 0x00001550, 0x00555550, 0x55505550, 0x55555555,
+    0x00015555, 0x50000000, 0x45555555, 0x55555555, 0x15555555, 0x04140000,
+    0x55555550, 0x55551055, 0x00005555, 0x55550000, 0x55555555, 0x00005555,
+    0x00000040, 0x55555550, 0x55555555, 0x55400005, 0x00000005, 0x00000000,
+    0x55555550, 0x15555555, 0x54000150, 0x55000101, 0x55555055, 0x54000155,
+    0x15554505, 0x55555414, 0x40455545, 0x40015015, 0x40001141, 0x54014500,
+    0x55555555, 0x15544005, 0x55554140, 0x14555455, 0x00140145, 0x00400000,
+    0x40011540, 0x15415555, 0x55440000, 0x55545455, 0x45554555, 0x01501551,
+    0x01014400, 0x05000000, 0x04555550, 0x45000000, 0x54141555, 0x55455555,
+    0x50155145, 0x00505040, 0x51401000, 0x55555505, 0x10000000, 0x54540555,
+    0x50144501, 0x55540540, 0x50140155, 0x40010151, 0x55550000, 0x01555555,
+    0x45555150, 0x55555545, 0x54555551, 0x00550405, 0x40000000, 0x54154001,
+    0x40001555, 0x55141555, 0x55545455, 0x55551555, 0x55405545, 0x00001454,
+    0x01440005, 0x05155554, 0x51400000, 0x55454555, 0x55515555, 0x54055555,
+    0x00545440, 0x40001000, 0x55555415, 0x15550155, 0x55555514, 0x55540555,
+    0x55155555, 0x55541155, 0x00150000, 0x00015554, 0x05400000, 0x55540000,
+    0x55555555, 0x00145555, 0x01555000, 0x55555400, 0x00000005, 0x00000000,
+    0x10450450, 0x55515400, 0x51411151, 0x10000145, 0x00004554, 0x14155554,
+    0x00000000, 0x40000000, 0x55555555, 0x55541555, 0x45555555, 0x55155544,
+    0x55555555, 0x00000015, 0x00550400, 0x00000000, 0x55500000, 0x15551554,
+    0x00000000, 0x40000000, 0x55555555, 0x15555555, 0x55105440, 0x55555555,
+    0x55555055, 0x55555555, 0x55500555, 0x55555555, 0x00055555, 0x55555500,
+    0x55555555, 0xaaaaaa01, 0xaaaaaaaa, 0x000800aa, 0x00000000, 0x55500000,
+    0x55555555, 0x15455555, 0x15445554, 0x55555554, 0x55555555, 0x55550551,
+    0x05515555, 0x50551555, 0x51555555, 0x55555555, 0x45555555, 0x55555415,
+    0x55555555, 0x55500155, 0x55555555, 0x54001555, 0x55555555, 0x01555555,
+    0x55550000, 0x55555555, 0x00005555, 0x55555554, 0x05555555, 0x55555554,
+    0x55555555, 0x00000001, 0x51555555, 0x00000005, 0x55555555, 0x00001405,
+    0x55555555, 0x00000005, 0x51555555, 0x00000001, 0x55555555, 0x55555555,
+    0x55500010, 0x50000014, 0x55501555, 0x55500055, 0x55500055, 0x55510155,
+    0x55500055, 0x55555555, 0x00055555, 0x55555550, 0x55555555, 0x00000045,
+    0x00000000, 0x55555500, 0x55555555, 0x00005501, 0x00055514, 0x55555404,
+    0x55555555, 0x00005541, 0x55555540, 0x55555555, 0x55540015, 0x40015555,
+    0x54015555, 0x55555555, 0x41555555, 0x00000505, 0x00000000, 0x55555000,
+    0x55555555, 0x45440045, 0x55005555, 0x00555555, 0x05555400, 0x55555554,
+    0x55555555, 0x55555501, 0x00000000, 0x00000000, 0x55555555, 0x55555555,
+    0x55555540, 0x55555555, 0x00000015, 0x00000000, 0x55555540, 0x55555555,
+    0x50000015, 0x55555555, 0x00000015, 0x55000000, 0x55555555, 0x50555555,
+    0x55555055, 0x55555555, 0x05550555, 0x44445555, 0x55555555, 0x41555555,
+    0x55555555, 0x15555555, 0x05555555, 0x55554555, 0x54541555, 0x55554555,
+    0x55554005, 0x50001555, 0x55555555, 0x05555555, 0x50000010, 0x55555550,
+    0x00001551, 0x55555550, 0x00005555, 0x00000000, 0x00010000, 0x55550000,
+    0x55555555, 0x55405555, 0x55555555, 0x00155555, 0x55555550, 0x55555555,
+    0x555555a5, 0x55555555, 0x00000055, 0x55000000, 0x55555555, 0x00555555,
+    0x00000000, 0x55555400, 0x00000000, 0x55555400, 0x55555555, 0x55554155,
+    0x55555555, 0x00001555, 0x00000000, 0x55154000, 0x55555550, 0x55554555,
+    0x45555555, 0x55510154, 0x55555551, 0x55555555, 0x55555501, 0x55555455,
+    0x55550115, 0x55555555, 0x55405555, 0x00000000, 0x00000000, 0x55555555,
+    0x55555555, 0x55555554, 0x55555555, 0x05555554, 0x55555555, 0x55555555,
+    0x50000000, 0x55555555, 0x05555555, 0x55550000, 0x55555555, 0x00005555,
+    0x00000004, 0x55555550, 0x00015555, 0x55515550, 0x55515551, 0x55555551,
+    0x55555555, 0x00000005, 0x00000000, 0xaaaaaaa0, 0xa8aaaaaa, 0xaaaaaaaa,
+    0x02aaaaaa, 0xaaa80000, 0xaaaaaaaa, 0x0002aaaa, 0xaaa80000, 0xaaa802aa,
+    0xaaaaaaaa, 0x8002aaaa, 0x1aaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaa00,
+    0xaaaaaaaa, 0xaaa800aa, 0xaaaaaaaa, 0xaaaa80aa, 0xaaaaaaaa, 0xaaaa2aaa,
+    0xaaaaaaaa, 0x00000000, 0xaaaaaaaa, 0x2aaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+    0xaa000000, 0xaaaaaaaa, 0xa8aaaaaa, 0xaaaaaaaa, 0x02aaaaaa, 0xaaaa8000,
+    0xaaaaaaaa, 0x00002aaa, 0x00000000, 0xaaaa8000, 0xaaaaaaaa, 0xaaa02aaa,
+    0xaaaaaaaa, 0x000aaaaa, 0x00000000, 0x55500000, 0x55555555, 0x00055555,
+    0x50000000, 0x55555555, 0x05555555, 0x55555555, 0x55500005, 0x55555555,
+    0x00000005, 0x00000000, 0x55555550, 0x55555555, 0x00000005, 0x00000000,
+    0x55151550, 0x55555554, 0x00554155, 0x00000000, 0x55555555, 0x55555555,
+    0x55550000, 0x55555555, 0x00005555, 0x01555554, 0x00000000, 0x54000000,
+    0x55555555, 0x01555555, 0x00010000, 0x00000000, 0x55540000, 0x55555555,
+    0x00015555, 0x55555550, 0x50555550, 0x00000005, 0x00000000, 0xaaaaaaa0,
+    0xaaaaaaaa, 0x0000000a, 0x00000000, 0x55555550, 0x55555555, 0x00000005,
+    0xaaaaaaa0, 0xaaaaaaaa, 0xaaaaaa0a, 0xaaaaaaaa, 0xaaa800aa, 0xaaaaaaaa,
+    0x0002aaaa, 0x00000000, 0x55540000, 0x55000000, 0x55551001, 0x15555555,
+    0x51451155, 0x55555555, 0x05555555, 0x00000000, 0x55555554, 0x55555555,
+    0x00000001, 0x55555554, 0x55555555, 0x55555541, 0x55555555, 0x00000015,
+    0x55400000, 0x00015555, 0xaaa80000, 0x0054002a, 0xaaaaa800, 0xaaa8aaaa,
+    0x500aa2aa, 0x55555515, 0x55555555, 0xaaaa8055, 0xaaaaaaaa, 0x55556aaa,
+    0x55555555, 0x15541555, 0x15541554, 0x4aaa8054, 0x00000555, 0x55554140,
+    0x55555515, 0x55451555, 0x55415555, 0x00015555, 0x00000000, 0x55540000,
+    0x55555555, 0x50015555, 0x55555401, 0x05555555, 0x55555554, 0x55555555,
+    0x55555001, 0x00000005, 0x00000000, 0x55555550, 0x55555555, 0x00000000,
+    0x00000000, 0x55555555, 0x01555555, 0x55555555, 0x55555555, 0x00000000,
+    0x00000000, 0x55555555, 0x15555555, 0x55400000, 0x00155555, 0x00000000,
+    0x55400000, 0x55555555, 0x55515555, 0x55555555, 0x50055555, 0x00555555,
+    0x00000000, 0x55000000, 0x55555555, 0x00555555, 0x00000000, 0x55000000,
+    0x55555105, 0x14555555, 0x00000410, 0x00000000, 0x55555000, 0x55555555,
+    0x00000400, 0x00000000, 0x00001000, 0x45455000, 0x55555555, 0x40000015,
+    0x40001555, 0x00005555, 0x00000000, 0x55550000, 0x55555555, 0x00005555,
+    0x00000000, 0x55550000, 0x55555555, 0x00005555, 0x00015400, 0x00000000,
+    0x55540000, 0x55555555, 0x00015555, 0x55555540, 0x55555555, 0x55555415,
+    0x55555555, 0x55550155, 0x50000001, 0x55554000, 0x40155555, 0x55555555,
+    0x01555555, 0x00000000, 0x54000000, 0x55555555, 0x01555555, 0x00000001,
+    0x00000000, 0x55555554, 0x55555555, 0x00000001, 0x00000000, 0x55555554,
+    0x55555555, 0x55555551, 0x55555555, 0x50504145, 0x15555545, 0x55554551,
+    0x55555555, 0x50551555, 0x45554555, 0x55555555, 0x45515555, 0x55540455,
+    0x55555554, 0x55555555, 0x55555541, 0x55555555, 0x55555415, 0x55555555,
+    0x00000155, 0x00000000, 0x55555400, 0x55555555, 0x55540155, 0x55555555,
+    0x00015555, 0x00000000, 0xaaa80000, 0xaaaaaaaa, 0x0002aaaa, 0x00000000,
+    0xaaa80000, 0xaaaaaaaa, 0x0002aaaa, 0x00000000, 0x55540000, 0x55555555,
+    0x55415555, 0x55555555, 0x00155555,
+};
diff --git a/libc/str/wcwidth_osx.internal.h b/libc/str/wcwidth_osx.internal.h
new file mode 100644
index 000000000..89ff0808c
--- /dev/null
+++ b/libc/str/wcwidth_osx.internal.h
@@ -0,0 +1,20 @@
+#ifndef COSMOPOLITAN_LIBC_STR_WCWIDTH_OSX_H_
+#define COSMOPOLITAN_LIBC_STR_WCWIDTH_OSX_H_
+COSMOPOLITAN_C_START_
+
+extern const uint32_t kWcwidthOsx[591];
+extern const uint8_t kWcwidthOsxIndex1[136];
+extern const uint16_t kWcwidthOsxIndex2[228];
+extern const uint32_t kWcwidthOsxIndex3[917];
+
+static inline int _wcwidth_osx(uint32_t codePoint) {
+  uint32_t a, b, c, d;
+  a = kWcwidthOsxIndex1[codePoint >> 13];
+  b = kWcwidthOsxIndex2[a + ((codePoint >> 9) & 0xf)];
+  c = kWcwidthOsxIndex3[b + ((codePoint >> 5) & 0xf)];
+  d = c + (codePoint & 0x1f);
+  return (kWcwidthOsx[d >> 4] >> ((d & 0xf) << 1)) & 3;
+}
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_STR_WCWIDTH_OSX_H_ */
diff --git a/libc/str/wide.inc b/libc/str/wide.inc
deleted file mode 100644
index e403c9a5a..000000000
--- a/libc/str/wide.inc
+++ /dev/null
@@ -1,65 +0,0 @@
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,19,16,20,21,22,16,16,16,23,16,16,24,25,26,27,28,17,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,29,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,30,16,16,16,16,31,16,16,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,32,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,17,16,16,16,33,
-34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,35,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,36,17,17,37,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,38,39,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,40,41,42,43,44,45,46,47,16,48,49,16,16,16,16,
-16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,6,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,48,0,0,0,0,0,0,255,15,0,0,0,0,128,0,0,8,
-0,2,12,0,96,48,64,16,0,0,4,44,36,32,12,0,0,0,1,0,0,0,80,184,0,0,0,0,0,0,0,224,
-0,0,0,1,128,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,251,255,255,255,255,255,255,255,
-255,255,255,15,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,63,0,0,0,255,15,255,255,255,255,
-255,255,255,127,254,255,255,255,255,255,255,255,255,255,127,254,255,255,255,
-255,255,255,255,255,255,255,255,255,224,255,255,255,255,255,254,255,255,255,
-255,255,255,255,255,255,255,127,255,255,255,255,255,7,255,255,255,255,15,0,
-255,255,255,255,255,127,255,255,255,255,255,0,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,
-0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,31,255,255,255,255,255,255,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,
-255,255,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,255,3,0,0,255,255,255,255,247,255,127,15,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,254,255,255,255,255,255,255,255,255,255,255,
-255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,7,0,255,255,255,127,0,0,0,0,0,
-0,7,0,240,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-15,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,64,254,7,0,0,0,0,0,0,0,0,0,0,0,0,7,0,255,255,255,
-255,255,15,255,1,3,0,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,
-1,224,191,255,255,255,255,255,255,255,255,223,255,255,15,0,255,255,255,255,
-255,135,15,0,255,255,17,255,255,255,255,255,255,255,255,127,253,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-159,255,255,255,255,255,255,255,63,0,120,255,255,255,0,0,4,0,0,96,0,16,0,0,0,
-0,0,0,0,0,0,0,248,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,
-255,255,255,255,255,255,63,16,39,0,0,24,240,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,255,15,0,
-0,0,224,255,255,255,255,255,255,255,255,255,255,255,255,123,252,255,255,255,
-255,231,199,255,255,255,231,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,15,7,7,0,63,0,0,0,0,0,0,0,0,0,0,0,0,0,
diff --git a/libc/intrin/windowsdurationtotimespec.c b/libc/str/windowsdurationtotimespec.c
similarity index 100%
rename from libc/intrin/windowsdurationtotimespec.c
rename to libc/str/windowsdurationtotimespec.c
diff --git a/libc/intrin/windowsdurationtotimeval.c b/libc/str/windowsdurationtotimeval.c
similarity index 100%
rename from libc/intrin/windowsdurationtotimeval.c
rename to libc/str/windowsdurationtotimeval.c
diff --git a/libc/intrin/windowstimetotimespec.c b/libc/str/windowstimetotimespec.c
similarity index 100%
rename from libc/intrin/windowstimetotimespec.c
rename to libc/str/windowstimetotimespec.c
diff --git a/libc/intrin/windowstimetotimeval.c b/libc/str/windowstimetotimeval.c
similarity index 100%
rename from libc/intrin/windowstimetotimeval.c
rename to libc/str/windowstimetotimeval.c
diff --git a/libc/system/BUILD.mk b/libc/system/BUILD.mk
deleted file mode 100644
index 00d4aaae9..000000000
--- a/libc/system/BUILD.mk
+++ /dev/null
@@ -1,83 +0,0 @@
-#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
-#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
-
-PKGS += LIBC_SYSTEM
-
-LIBC_SYSTEM_ARTIFACTS += LIBC_SYSTEM_A
-LIBC_SYSTEM = $(LIBC_SYSTEM_A_DEPS) $(LIBC_SYSTEM_A)
-LIBC_SYSTEM_A = o/$(MODE)/libc/system/system.a
-LIBC_SYSTEM_A_FILES := $(wildcard libc/system/*)
-LIBC_SYSTEM_A_HDRS = $(filter %.h,$(LIBC_SYSTEM_A_FILES))
-LIBC_SYSTEM_A_INCS = $(filter %.inc,$(LIBC_SYSTEM_A_FILES))
-LIBC_SYSTEM_A_SRCS_S = $(filter %.S,$(LIBC_SYSTEM_A_FILES))
-LIBC_SYSTEM_A_SRCS_C = $(filter %.c,$(LIBC_SYSTEM_A_FILES))
-
-LIBC_SYSTEM_A_SRCS =					\
-	$(LIBC_SYSTEM_A_SRCS_S)				\
-	$(LIBC_SYSTEM_A_SRCS_C)
-
-LIBC_SYSTEM_A_OBJS =					\
-	$(LIBC_SYSTEM_A_SRCS_S:%.S=o/$(MODE)/%.o)	\
-	$(LIBC_SYSTEM_A_SRCS_C:%.c=o/$(MODE)/%.o)
-
-LIBC_SYSTEM_A_CHECKS =					\
-	$(LIBC_SYSTEM_A).pkg				\
-	$(LIBC_SYSTEM_A_HDRS:%=o/$(MODE)/%.ok)
-
-LIBC_SYSTEM_A_DIRECTDEPS =				\
-	LIBC_CALLS					\
-	LIBC_FMT					\
-	LIBC_INTRIN					\
-	LIBC_NEXGEN32E					\
-	LIBC_PROC					\
-	LIBC_RUNTIME					\
-	LIBC_STDIO					\
-	LIBC_STR					\
-	LIBC_SYSV					\
-	THIRD_PARTY_MUSL				\
-	THIRD_PARTY_SED					\
-	THIRD_PARTY_TR					\
-
-LIBC_SYSTEM_A_DEPS :=					\
-	$(call uniq,$(foreach x,$(LIBC_SYSTEM_A_DIRECTDEPS),$($(x))))
-
-$(LIBC_SYSTEM_A):libc/system/				\
-		$(LIBC_SYSTEM_A).pkg			\
-		$(LIBC_SYSTEM_A_OBJS)
-
-$(LIBC_SYSTEM_A).pkg:					\
-		$(LIBC_SYSTEM_A_OBJS)			\
-		$(foreach x,$(LIBC_SYSTEM_A_DIRECTDEPS),$($(x)_A).pkg)
-
-# offer assurances about the stack safety of cosmo libc
-$(LIBC_SYSTEM_A_OBJS): private COPTS += -Wframe-larger-than=4096 -Walloca-larger-than=4096
-
-$(LIBC_SYSTEM_A_OBJS): private				\
-		CFLAGS +=				\
-			-fno-sanitize=all		\
-			-Wframe-larger-than=4096	\
-			-Walloca-larger-than=4096
-
-o/$(MODE)/libc/system/fputc.o: private			\
-		CFLAGS +=				\
-			-O3
-
-o//libc/system/appendw.o: private			\
-		CFLAGS +=				\
-			-Os
-
-o/$(MODE)/libc/system/dirstream.o			\
-o/$(MODE)/libc/system/mt19937.o: private			\
-		CFLAGS +=				\
-			-ffunction-sections
-
-LIBC_SYSTEM_LIBS = $(foreach x,$(LIBC_SYSTEM_ARTIFACTS),$($(x)))
-LIBC_SYSTEM_SRCS = $(foreach x,$(LIBC_SYSTEM_ARTIFACTS),$($(x)_SRCS))
-LIBC_SYSTEM_HDRS = $(foreach x,$(LIBC_SYSTEM_ARTIFACTS),$($(x)_HDRS))
-LIBC_SYSTEM_INCS = $(foreach x,$(LIBC_SYSTEM_ARTIFACTS),$($(x)_INCS))
-LIBC_SYSTEM_CHECKS = $(foreach x,$(LIBC_SYSTEM_ARTIFACTS),$($(x)_CHECKS))
-LIBC_SYSTEM_OBJS = $(foreach x,$(LIBC_SYSTEM_ARTIFACTS),$($(x)_OBJS))
-$(LIBC_SYSTEM_OBJS): $(BUILD_FILES) libc/system/BUILD.mk
-
-.PHONY: o/$(MODE)/libc/system
-o/$(MODE)/libc/system: $(LIBC_SYSTEM_CHECKS)
diff --git a/libc/sysv/BUILD.mk b/libc/sysv/BUILD.mk
index 19f328784..ec1b3b1fb 100644
--- a/libc/sysv/BUILD.mk
+++ b/libc/sysv/BUILD.mk
@@ -84,12 +84,11 @@ o/$(MODE)/libc/sysv/sysret.o: private			\
 		CFLAGS +=				\
 			-ffreestanding			\
 			-fno-stack-protector		\
-			-fno-sanitize=all		\
-			-mgeneral-regs-only
+			-fno-sanitize=all
 
 ifeq ($(ARCH),aarch64)
 o/$(MODE)/libc/sysv/sysv.o: private			\
-		OVERRIDE_CFLAGS +=			\
+		CFLAGS +=				\
 			-ffixed-x0			\
 			-ffixed-x1			\
 			-ffixed-x2			\
diff --git a/libc/sysv/consts.sh b/libc/sysv/consts.sh
index 353bfc464..b6ff0405a 100755
--- a/libc/sysv/consts.sh
+++ b/libc/sysv/consts.sh
@@ -227,6 +227,7 @@ syscon	mmap	MAP_LOCKED				0x00002000		0x00002000		0			0			0			0			0			0
 syscon	mmap	MAP_NORESERVE				0x00004000		0x00004000		0x00000040		0x00000040		0			0			0x00000040		0			# Linux calls it "reserve"; NT calls it "commit"? which is default?
 syscon	mmap	MAP_POPULATE				0x00008000		0x00008000		0			0			0x00040000		0			0			0			# MAP_PREFAULT_READ on FreeBSD; can avoid madvise(MADV_WILLNEED) on private file mapping
 syscon	mmap	MAP_NONBLOCK				0x00010000		0x00010000		0			0			0			0			0			0
+syscon	mmap	MAP_NOFORK				0			0			0			0			0			0			0			0x10000000		# used on pages internal to our mmap() implemention on windows
 syscon	mmap	MAP_SYNC				0x00080000		0x00080000		0			0			0			0			0			0			# perform synchronous page faults for mapping (Linux 4.15+)
 syscon	mmap	MAP_HUGETLB				0x00040000		-1			-1			-1			-1			-1			-1			-1			# make it inherit across execve()
 syscon	mmap	MAP_INHERIT				-1			-1			-1			-1			-1			-1			0x00000080		-1			# make it inherit across execve()
@@ -464,7 +465,7 @@ syscon	rlimit	RLIMIT_RSS				5			5			5			5			5			5			5			127			# max physical mem
 syscon	rlimit	RLIMIT_NPROC				6			6			7			7			7			7			7			127			# max number of processes; see fork()→EAGAIN; bsd consensus
 syscon	rlimit	RLIMIT_NOFILE				7			7			8			8			8			8			8			127			# max number of open files; see accept()→EMFILE/ENFILE; bsd consensus
 syscon	rlimit	RLIMIT_MEMLOCK				8			8			6			6			6			6			6			127			# max locked-in-memory address space; bsd consensus
-syscon	rlimit	RLIMIT_AS				9			9			5			5			10			2			10			0			# max virtual memory size in bytes; this one actually works; fudged as RLIMIT_DATA on OpenBSD
+syscon	rlimit	RLIMIT_AS				9\			9			5			5			10			2			10			0			# max virtual memory size in bytes; this one actually works; fudged as RLIMIT_DATA on OpenBSD
 syscon	rlimit	RLIMIT_LOCKS				10			10			127			127			127			127			127			127			# max flock() / fcntl() locks; bsd consensus
 syscon	rlimit	RLIMIT_SIGPENDING			11			11			127			127			127			127			127			127			# max sigqueue() can enqueue; bsd consensus
 syscon	rlimit	RLIMIT_MSGQUEUE				12			12			127			127			127			127			127			127			# meh posix message queues; bsd consensus
@@ -570,60 +571,27 @@ syscon	close	CLOSE_RANGE_CLOEXEC			4			4			-1			-1			-1			-1			-1			-1			#
 
 #	clock_{gettime,settime} timers
 #
-#	Executive Summary
-#	- CLOCK_MONOTONIC shouldn't count suspended time
-#	- CLOCK_BOOTTIME is monotonic and should count suspended time
-#	- Only CLOCK_REALTIME and CLOCK_MONOTONIC can be used with futexes
-#	- CLOCK_MONOTONIC_RAW should fail with EINVAL if host lacks support
-#	- CLOCK_MONOTONIC and CLOCK_BOOTTIME should be relative to system boot time
-#	- COARSE can be CLK_TCK behind (~20ms) and will EINVAL on RHEL5 which isn't worth polyfilling
-#
-#	FreeBSD defines the following rosetta stone
-#	Taken from freebsd/sys/compat/linux/linux_time.c
-#	- Linux CLOCK_MONOTONIC        -> FreeBSD CLOCK_UPTIME [5]
-#	- Linux CLOCK_MONOTONIC_RAW    -> FreeBSD CLOCK_UPTIME_FAST [8]
-#	- Linux CLOCK_REALTIME_COARSE  -> FreeBSD CLOCK_REALTIME_FAST [10]
-#	- Linux CLOCK_MONOTONIC_COARSE -> FreeBSD CLOCK_UPTIME_FAST [8]
-#	- Linux CLOCK_BOOTTIME         -> FreeBSD CLOCK_MONOTONIC [4]
-#
-#	For MacOS we define the following mappings
-#	- Linux CLOCK_MONOTONIC        -> MacOS CLOCK_UPTIME_RAW [8]
-#	- Linux CLOCK_MONOTONIC_RAW    -> MacOS CLOCK_UPTIME_RAW [8]
-#	- Linux CLOCK_REALTIME_COARSE  -> MacOS CLOCK_REALTIME [0]
-#	- Linux CLOCK_MONOTONIC_COARSE -> MacOS CLOCK_UPTIME_RAW_APPROX [9]
-#	- Linux CLOCK_BOOTTIME         -> MacOS CLOCK_MONOTONIC [6]
-#
-#	For OpenBSD we define the following mappings
-#	- Linux CLOCK_MONOTONIC        -> OpenBSD CLOCK_UPTIME [5]
-#	- Linux CLOCK_MONOTONIC_RAW    -> EINVAL because OpenBSD ntpd can adjfreq(2)
-#	- Linux CLOCK_REALTIME_COARSE  -> OpenBSD CLOCK_REALTIME [0]
-#	- Linux CLOCK_MONOTONIC_COARSE -> OpenBSD CLOCK_UPTIME [5]
-#	- Linux CLOCK_BOOTTIME         -> OpenBSD CLOCK_MONOTONIC [3]
-#
-#	For NetBSD we define the following mappings
-#	- Linux CLOCK_MONOTONIC        -> NetBSD CLOCK_MONOTONIC [3] TODO: suspend?
-#	- Linux CLOCK_MONOTONIC_RAW    -> NetBSD CLOCK_MONOTONIC [3] NetBSD clock_gettime(2) says it isn't impacted by adjfreq(2)
-#	- Linux CLOCK_REALTIME_COARSE  -> NetBSD CLOCK_REALTIME [0]
-#	- Linux CLOCK_MONOTONIC_COARSE -> NetBSD CLOCK_MONOTONIC [3]
-#	- Linux CLOCK_BOOTTIME         -> NetBSD CLOCK_MONOTONIC [3] TODO: suspend?
-#
-#	For Windows we define the following mappings
-#	- Linux CLOCK_REALTIME         -> GetSystemTimePreciseAsFileTime()
-#	- Linux CLOCK_MONOTONIC        -> QueryUnbiasedInterruptTimePrecise()
-#	- Linux CLOCK_MONOTONIC_RAW    -> QueryUnbiasedInterruptTimePrecise()
-#	- Linux CLOCK_REALTIME_COARSE  -> GetSystemTimeAsFileTime()
-#	- Linux CLOCK_MONOTONIC_COARSE -> QueryUnbiasedInterruptTime()
-#	- Linux CLOCK_BOOTTIME         -> QueryInterruptTimePrecise()
-#
 #	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
-syscon	clock	CLOCK_REALTIME				0			0			0			0			0			0			0			0			#
-syscon	clock	CLOCK_MONOTONIC				1			1			8			8			5			5			3			1			#
-syscon	clock	CLOCK_PROCESS_CPUTIME_ID		2			2			12			12			15			2			0x40000000		4			#
+syscon	clock	CLOCK_REALTIME				0			0			0			0			0			0			0			0			# consensus
+syscon	clock	CLOCK_REALTIME_PRECISE			0			0			0			0			9			0			0			0			#
+syscon	clock	CLOCK_REALTIME_FAST			0			0			0			0			10			0			0			0			#
+syscon	clock	CLOCK_REALTIME_COARSE			5			5			0			0			10			0			0			2			# Linux 2.6.32+; bsd consensus; not available on RHEL5
+syscon	clock	CLOCK_MONOTONIC				1			1			6			6			4			3			3			1			# XNU/NT faked; could move backwards if NTP introduces negative leap second
+syscon	clock	CLOCK_MONOTONIC_PRECISE			1			1			6			6			11			3			3			1			#
+syscon	clock	CLOCK_MONOTONIC_FAST			1			1			6			6			12			3			3			1			#
+syscon	clock	CLOCK_MONOTONIC_COARSE			6			6			5			5			12			3			3			1			# Linux 2.6.32+; bsd consensus; not available on RHEL5
+syscon	clock	CLOCK_MONOTONIC_RAW			4			4			4			4			127			127			127			127			# actually monotonic; not subject to NTP adjustments; Linux 2.6.28+; XNU/NT/FreeBSD/OpenBSD faked; not available on RHEL5
+syscon	clock	CLOCK_PROCESS_CPUTIME_ID		2			2			12			12			15			2			0x40000000		4			# NetBSD lets you bitwise a PID into clockid_t
 syscon	clock	CLOCK_THREAD_CPUTIME_ID			3			3			16			16			14			4			0x20000000		5			#
-syscon	clock	CLOCK_MONOTONIC_RAW			4			4			127			8			8			127			3			1			# Linux 2.6.28+
-syscon	clock	CLOCK_REALTIME_COARSE			5			5			0			0			10			0			0			2			# Linux 2.6.32+
-syscon	clock	CLOCK_MONOTONIC_COARSE			6			6			9			9			8			5			3			6			# Linux 2.6.32+
-syscon	clock	CLOCK_BOOTTIME				7			7			6			6			4			3			3			3			# Linux 2.6.39+
+syscon	clock	CLOCK_PROF				127			127			127			127			2			127			2			127			#
+syscon	clock	CLOCK_BOOTTIME				7			7			7			127			127			6			127			3			#
+syscon	clock	CLOCK_REALTIME_ALARM			8			8			127			127			127			127			127			127			#
+syscon	clock	CLOCK_BOOTTIME_ALARM			9			9			127			127			127			127			127			127			#
+syscon	clock	CLOCK_TAI				11			11			127			127			127			127			127			127			#
+syscon	clock	CLOCK_UPTIME				127			127			8			8			5			5			127			127			#
+syscon	clock	CLOCK_UPTIME_PRECISE			127			127			127			127			7			127			127			127			#
+syscon	clock	CLOCK_UPTIME_FAST			127			127			127			127			8			127			127			127			#
+syscon	clock	CLOCK_SECOND				127			127			127			127			13			127			127			127			#
 
 #	poll()
 #
@@ -650,14 +618,14 @@ syscon	so	SO_DEBUG				1			1			1			1			1			1			1			1			# debugging is enabled; co
 syscon	so	SO_TYPE					3			3			0x1008			0x1008			0x1008			0x1008			0x1008			0x1008			# bsd consensus
 syscon	so	SO_ERROR				4			4			0x1007			0x1007			0x1007			0x1007			0x1007			0x1007			# takes int pointer and stores/clears the pending error code; bsd consensus
 syscon	so	SO_ACCEPTCONN				30			30			2			2			2			2			2			2			# takes int pointer and stores boolean indicating if listen() was called on fd; bsd consensus
-syscon	so	SO_REUSEPORT				15			15			512			512			512			512			512			0			# bsd consensus; no windows support
+syscon	so	SO_REUSEPORT				15			15			0x0200			0x0200			0x0200			0x0200			0x0200			0			# bsd consensus; no windows support
 syscon	so	SO_REUSEADDR				2			2			4			4			4			4			4			-5			# SO_EXCLUSIVEADDRUSE on Windows (see third_party/python/Lib/test/support/__init__.py)
 syscon	so	SO_KEEPALIVE				9			9			8			8			8			8			8			8			# bsd consensus
-syscon	so	SO_DONTROUTE				5			5			16			16			16			16			16			16			# bsd consensus
-syscon	so	SO_BROADCAST				6			6			32			32			32			32			32			32			# socket is configured for broadcast messages; bsd consensus
-syscon	so	SO_USELOOPBACK				0			0			64			64			64			64			64			64			# bsd consensus
-syscon	so	SO_LINGER				13			13			4224			4224			128			128			128			128			# takes struct linger; causes close() return value to actually mean something; SO_LINGER_SEC on XNU; bsd consensus
-syscon	so	SO_OOBINLINE				10			10			256			256			256			256			256			256			# bsd consensus
+syscon	so	SO_DONTROUTE				5			5			0x10			0x10			0x10			0x10			0x10			0x10			# bsd consensus
+syscon	so	SO_BROADCAST				6			6			0x20			0x20			0x20			0x20			0x20			0x20			# socket is configured for broadcast messages; bsd consensus
+syscon	so	SO_USELOOPBACK				0			0			0x40			0x40			0x40			0x40			0x40			0x40			# bsd consensus
+syscon	so	SO_LINGER				13			13			0x1080			0x1080			0x80			0x80			0x80			0x80			# takes struct linger; causes close() return value to actually mean something; SO_LINGER_SEC on XNU; bsd consensus
+syscon	so	SO_OOBINLINE				10			10			0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			# bsd consensus
 syscon	so	SO_SNDBUF				7			7			0x1001			0x1001			0x1001			0x1001			0x1001			0x1001			# bsd consensus
 syscon	so	SO_RCVBUF				8			8			0x1002			0x1002			0x1002			0x1002			0x1002			0x1002			# bsd consensus
 syscon	so	SO_RCVTIMEO				20			20			0x1006			0x1006			0x1006			0x1006			0x100c			0x1006			# recv timeout; takes struct timeval (overrides SA_RESTART restoring EINTR behavior on recv/send/connect/accept/etc.; bsd consensus)
@@ -677,18 +645,18 @@ syscon	tcp	TCP_CORK				3			3			4			4			4			16			4			0			# nagle's algorithm stri
 syscon	tcp	TCP_MAXSEG				2			2			2			2			2			2			2			0			# reduces tcp segment size; see also tcp offloading
 syscon	tcp	TCP_FASTOPEN				23			23			0x105			0x105			0x0401			0			0			15			# reduces roundtrips; for listener; Linux 3.7+ (c. 2012) / or is windows it 0x22? /proc/sys/net/ipv4/tcp_fastopen TODO(jart): MSG_FASTOPEN; XNU sources say 261 but not sure if that's true
 syscon	tcp	TCP_FASTOPEN_CONNECT			30			30			0			0			0			0			0			0			# reduces roundtrips; for listener; Linux 3.7+ (c. 2012) / or is windows it 0x22? /proc/sys/net/ipv4/tcp_fastopen TODO(jart): MSG_FASTOPEN; XNU sources say 261 but not sure if that's true
-syscon	tcp	TCP_KEEPIDLE				4			4			0			0			0x100			0			3			3			# start keepalives after this period
-syscon	tcp	TCP_KEEPINTVL				5			5			0x101			0x101			0x200			0			5			17			# interval between keepalives
-syscon	tcp	TCP_KEEPCNT				6			6			0x102			0x102			0x400			0			6			16			# number of keepalives before death
-syscon	tcp	TCP_INFO				11			11			0x200			0x200			32			9			9			0			# get connection info
-syscon	tcp	TCP_NOTSENT_LOWAT			25			25			513			513			0			0			0			0			# limit unset byte queue
-syscon	tcp	TCP_MD5SIG				14			14			0			0			16			4			16			0			# what is it (rfc2385)
-syscon	tcp	TCP_CONGESTION				13			13			0			0			64			0			0			0			# set traffic control
+syscon	tcp	TCP_KEEPIDLE				4			4			0			0			0x100			0			3			0			# start keepalives after this period
+syscon	tcp	TCP_KEEPINTVL				5			5			0x101			0x101			0x200			0			5			0			# interval between keepalives
+syscon	tcp	TCP_KEEPCNT				6			6			0x102			0x102			0x400			0			6			0			# number of keepalives before death
 syscon	tcp	TCP_SYNCNT				7			7			0			0			0			0			0			0			# how hard to syn packet the enemy
 syscon	tcp	TCP_ULP					31			31			0			0			0			0			0			0			# setsockopt(sock, IPPROTO_TCP, TCP_ULP, "tls", 4)
 syscon	tcp	TCP_COOKIE_TRANSACTIONS			15			15			0			0			0			0			0			0			# defense against the syn packets
 syscon	tcp	TCP_LINGER2				8			8			0			0			0			0			0			0			# orphaned fin-wait-2 lifetime cf. net.ipv4.tcp_fin_timeout see cloudflare blog
+syscon	tcp	TCP_NOTSENT_LOWAT			25			25			513			513			0			0			0			0			# limit unset byte queue
+syscon	tcp	TCP_INFO				11			11			0			0			0x20			0			9			0			# get connection info
 syscon	tcp	TCP_CC_INFO				26			26			0			0			0			0			0			0			# get congestion control info
+syscon	tcp	TCP_CONGESTION				13			13			0			0			0x40			0			0			0			# set traffic control
+syscon	tcp	TCP_MD5SIG				14			14			0			0			0x10			4			16			0			# what is it (rfc2385)
 syscon	tcp	TCP_MD5SIG_MAXKEYLEN			80			80			0			0			0			0			0			0			# what is it
 syscon	tcp	TCP_TIMESTAMP				24			24			0			0			0			0			0			0			# what is it
 syscon	tcp	TCP_USER_TIMEOUT			18			18			0			0			0			0			0			0			# what is it
@@ -704,79 +672,6 @@ syscon	tcp	TCP_REPAIR_OPTIONS			22			22			0			0			0			0			0			0			# what is it
 syscon	tcp	TCP_REPAIR_QUEUE			20			20			0			0			0			0			0			0			# what is it
 syscon	tcp	TCP_THIN_LINEAR_TIMEOUTS		16			16			0			0			0			0			0			0			# what is it
 
-#	IPPROTO_IP (or SOL_IP) socket options
-#
-#	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
-syscon	ip	IP_TOS					1			1			3			3			3			3			3			3			# bsd consensus
-syscon	ip	IP_TTL					2			2			4			4			4			4			4			4			# bsd consensus
-syscon	ip	IP_MTU					14			14			0			0			0			0			0			73			# bsd consensus
-syscon	ip	IP_HDRINCL				3			3			2			2			2			2			2			2			# bsd consensus
-syscon	ip	IP_OPTIONS				4			4			1			1			1			1			1			1			# bsd consensus
-syscon	ip	IP_RECVTTL				12			12			24			24			65			31			23			21
-syscon	ip	IP_ADD_MEMBERSHIP			35			35			12			12			12			12			12			12			# bsd consensus
-syscon	ip	IP_DROP_MEMBERSHIP			36			36			13			13			13			13			13			13			# bsd consensus
-syscon	ip	IP_MULTICAST_IF				32			32			9			9			9			9			9			9			# bsd consensus
-syscon	ip	IP_MULTICAST_LOOP			34			34			11			11			11			11			11			11			# bsd consensus
-syscon	ip	IP_MULTICAST_TTL			33			33			10			10			10			10			10			10			# bsd consensus
-syscon	ip	IP_PKTINFO				8			8			26			26			0			0			25			19
-syscon	ip	IP_RECVTOS				13			13			0			0			68			0			0			40
-
-#	IPPROTO_IPV6 (or SOL_IPV6) socket options
-#
-#	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
-syscon	ipv6	IPV6_V6ONLY				26			26			27			27			27			27			27			27			# bsd consensus
-syscon	ipv6	IPV6_CHECKSUM				7			7			26			26			26			26			26			26			# bsd consensus
-syscon	ipv6	IPV6_JOIN_GROUP				20			20			12			12			12			12			12			12			# bsd consensus
-syscon	ipv6	IPV6_LEAVE_GROUP			21			21			13			13			13			13			13			13			# bsd consensus
-syscon	ipv6	IPV6_MULTICAST_HOPS			18			18			10			10			10			10			10			10			# bsd consensus
-syscon	ipv6	IPV6_MULTICAST_IF			17			17			9			9			9			9			9			9			# bsd consensus
-syscon	ipv6	IPV6_MULTICAST_LOOP			19			19			11			11			11			11			11			11			# bsd consensus
-syscon	ipv6	IPV6_UNICAST_HOPS			16			16			4			4			4			4			4			4			# bsd consensus
-syscon	ipv6	IPV6_RECVTCLASS				66			66			35			35			57			57			57			40
-syscon	ipv6	IPV6_TCLASS				67			67			36			36			61			61			61			39
-syscon	ipv6	IPV6_DONTFRAG				62			62			0			0			62			62			62			14
-syscon	ipv6	IPV6_HOPLIMIT				52			52			0			0			47			47			47			21
-syscon	ipv6	IPV6_HOPOPTS				54			54			0			0			49			49			49			1
-syscon	ipv6	IPV6_PKTINFO				50			50			0			0			46			46			46			19
-syscon	ipv6	IPV6_RECVRTHDR				56			56			0			0			38			38			38			38
-syscon	ipv6	IPV6_RTHDR				57			57			0			0			51			51			51			32
-
-#	IPPROTO_ICMPV6 (or SOL_ICMPV6) socket options
-#
-#	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
-syscon	icmp6	ICMP6_DST_UNREACH_NOROUTE		0			0			0			0			0			0			0			0			# consensus
-syscon	icmp6	ICMP6_PARAMPROB_HEADER			0			0			0			0			0			0			0			0			# consensus
-syscon	icmp6	ICMP6_TIME_EXCEED_TRANSIT		0			0			0			0			0			0			0			0			# consensus
-syscon	icmp6	ICMP6_DST_UNREACH_ADMIN			1			1			1			1			1			1			1			1			# consensus
-syscon	icmp6	ICMP6_PARAMPROB_NEXTHEADER		1			1			1			1			1			1			1			1			# consensus
-syscon	icmp6	ICMP6_TIME_EXCEED_REASSEMBLY		1			1			1			1			1			1			1			1			# consensus
-syscon	icmp6	ICMP6_DST_UNREACH			1			1			1			1			1			1			1			0			# unix consensus
-syscon	icmp6	ICMP6_FILTER				1			1			18			18			18			18			18			0			# bsd consensus
-syscon	icmp6	ICMP6_DST_UNREACH_BEYONDSCOPE		2			2			2			2			2			2			2			2			# consensus
-syscon	icmp6	ICMP6_PARAMPROB_OPTION			2			2			2			2			2			2			2			2			# consensus
-syscon	icmp6	ICMP6_PACKET_TOO_BIG			2			2			2			2			2			2			2			0			# unix consensus
-syscon	icmp6	ICMP6_DST_UNREACH_ADDR			3			3			3			3			3			3			3			3			# consensus
-syscon	icmp6	ICMP6_TIME_EXCEEDED			3			3			3			3			3			3			3			0			# unix consensus
-syscon	icmp6	ICMP6_DST_UNREACH_NOPORT		4			4			4			4			4			4			4			4			# consensus
-syscon	icmp6	ICMP6_PARAM_PROB			4			4			4			4			4			4			4			0			# unix consensus
-syscon	icmp6	ICMP6_RR_FLAGS_PREVDONE			8			8			8			8			8			8			8			0			# unix consensus
-syscon	icmp6	ICMP6_RR_FLAGS_SPECSITE			0x10			0x10			0x10			0x10			0x10			0x10			0x10			0			# unix consensus
-syscon	icmp6	ICMP6_RR_PCOUSE_RAFLAGS_AUTO		0x10			0x10			0x40			0x40			0x40			0x40			0x40			0			# bsd consensus
-syscon	icmp6	ICMP6_RR_FLAGS_FORCEAPPLY		0x20			0x20			0x20			0x20			0x20			0x20			0x20			0			# unix consensus
-syscon	icmp6	ICMP6_RR_PCOUSE_RAFLAGS_ONLINK		0x20			0x20			0x80			0x80			0x80			0x80			0x80			0			# bsd consensus
-syscon	icmp6	ICMP6_RR_FLAGS_REQRESULT		0x40			0x40			0x40			0x40			0x40			0x40			0x40			0			# unix consensus
-syscon	icmp6	ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME	0x40			0x40			0x40			0x40			0x40			0x40			0x40			0			# unix consensus
-syscon	icmp6	ICMP6_INFOMSG_MASK			0x80			0x80			0x80			0x80			0x80			0x80			0x80			0x80			# consensus
-syscon	icmp6	ICMP6_ECHO_REQUEST			0x80			0x80			0x80			0x80			0x80			0x80			0x80			0			# unix consensus
-syscon	icmp6	ICMP6_RR_FLAGS_TEST			0x80			0x80			0x80			0x80			0x80			0x80			0x80			0			# unix consensus
-syscon	icmp6	ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME	0x80			0x80			0x80			0x80			0x80			0x80			0x80			0			# unix consensus
-syscon	icmp6	ICMP6_ECHO_REPLY			129			129			129			129			129			129			129			0			# unix consensus
-syscon	icmp6	ICMP6_ROUTER_RENUMBERING		138			138			138			138			138			138			138			0			# unix consensus
-syscon	icmp6	ICMP6_RR_RESULT_FLAGS_FORBIDDEN		0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0			# unix consensus
-syscon	icmp6	ICMP6_RR_RESULT_FLAGS_OOB		0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0			# unix consensus
-
-syscon	ipport	IPPORT_USERRESERVED			5000			5000			5000			5000			5000			49151			5000			5000
-
 #	https://blog.cloudflare.com/know-your-scm_rights/
 #
 #	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
@@ -787,6 +682,60 @@ syscon	scm	SCM_TIMESTAMPING			37			37			0			0			0			0			0			0
 syscon	scm	SCM_TIMESTAMPNS				35			35			0			0			0			0			0			0
 syscon	scm	SCM_WIFI_STATUS				41			41			0			0			0			0			0			0
 
+#	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
+syscon	ip	IP_TOS					1			1			3			3			3			3			3			3			# bsd consensus
+syscon	ip	IP_TTL					2			2			4			4			4			4			4			4			# bsd consensus
+syscon	ip	IP_HDRINCL				3			3			2			2			2			2			2			2			# bsd consensus
+syscon	ip	IP_DEFAULT_MULTICAST_LOOP		1			1			1			1			1			1			1			1			# consensus
+syscon	ip	IP_DEFAULT_MULTICAST_TTL		1			1			1			1			1			1			1			1			# consensus
+syscon	ip	IP_PMTUDISC_DONT			0			0			0			0			0			0			0			0			# consensus
+syscon	ip	IP_MAX_MEMBERSHIPS			20			20			0x0fff			0x0fff			0x0fff			0x0fff			0x0fff			20			# bsd consensus
+syscon	ip	IP_OPTIONS				4			4			1			1			1			1			1			1			# bsd consensus
+syscon	ip	IP_RECVTTL				12			12			24			24			65			31			23			21
+syscon	ip	IP_ADD_MEMBERSHIP			35			35			12			12			12			12			12			12			# bsd consensus
+syscon	ip	IP_DROP_MEMBERSHIP			36			36			13			13			13			13			13			13			# bsd consensus
+syscon	ip	IP_MULTICAST_IF				0x20			0x20			9			9			9			9			9			9			# bsd consensus
+syscon	ip	IP_MULTICAST_LOOP			34			34			11			11			11			11			11			11			# bsd consensus
+syscon	ip	IP_MULTICAST_TTL			33			33			10			10			10			10			10			10			# bsd consensus
+syscon	ip	IP_RECVOPTS				6			6			5			5			5			5			5			0			# bsd consensus
+syscon	ip	IP_RECVRETOPTS				7			7			6			6			6			6			6			0			# bsd consensus
+syscon	ip	IP_RECVDSTADDR				0			0			7			7			7			7			7			0			# bsd consensus
+syscon	ip	IP_RETOPTS				7			7			8			8			8			8			8			0			# bsd consensus
+syscon	ip	IP_ADD_SOURCE_MEMBERSHIP		39			39			70			70			70			0			0			15
+syscon	ip	IP_BLOCK_SOURCE				38			38			72			72			72			0			0			17
+syscon	ip	IP_DROP_SOURCE_MEMBERSHIP		40			40			71			71			71			0			0			16
+syscon	ip	IP_UNBLOCK_SOURCE			37			37			73			73			73			0			0			18
+syscon	ip	IP_IPSEC_POLICY				0x10			0x10			21			21			21			0			0			0
+syscon	ip	IP_MINTTL				21			21			0			0			66			32			24			0			# minimum ttl for packet or drop
+syscon	ip	IP_MSFILTER				41			41			74			74			74			0			0			0
+syscon	ip	IP_PKTINFO				8			8			26			26			0			0			25			19
+syscon	ip	IP_RECVTOS				13			13			0			0			68			0			0			40
+syscon	ip	IP_MTU					14			14			0			0			0			0			0			73			# bsd consensus
+syscon	ip	IP_MTU_DISCOVER				10			10			0			0			0			0			0			71			# bsd consensus
+syscon	ip	IP_RECVERR				11			11			0			0			0			0			0			75			# bsd consensus
+syscon	ip	IP_UNICAST_IF				50			50			0			0			0			0			0			31			# bsd consensus
+syscon	ip	IP_ORIGDSTADDR				20			20			0			0			27			0			0			0
+syscon	ip	IP_RECVORIGDSTADDR			20			20			0			0			27			0			0			0
+syscon	ip	IP_BIND_ADDRESS_NO_PORT			24			24			0			0			0			0			0			0
+syscon	ip	IP_CHECKSUM				23			23			0			0			0			0			0			0
+syscon	ip	IP_FREEBIND				15			15			0			0			0			0			0			0
+syscon	ip	IP_MULTICAST_ALL			49			49			0			0			0			0			0			0
+syscon	ip	IP_NODEFRAG				22			22			0			0			0			0			0			0
+syscon	ip	IP_PASSSEC				18			18			0			0			0			0			0			0
+syscon	ip	IP_PKTOPTIONS				9			9			0			0			0			0			0			0
+syscon	ip	IP_PMTUDISC				10			10			0			0			0			0			0			0
+syscon	ip	IP_PMTUDISC_DO				2			2			0			0			0			0			0			0
+syscon	ip	IP_PMTUDISC_INTERFACE			4			4			0			0			0			0			0			0
+syscon	ip	IP_PMTUDISC_OMIT			5			5			0			0			0			0			0			0
+syscon	ip	IP_PMTUDISC_PROBE			3			3			0			0			0			0			0			0
+syscon	ip	IP_PMTUDISC_WANT			1			1			0			0			0			0			0			0
+syscon	ip	IP_ROUTER_ALERT				5			5			0			0			0			0			0			0
+syscon	ip	IP_TRANSPARENT				19			19			0			0			0			0			0			0
+syscon	ip	IP_XFRM_POLICY				17			17			0			0			0			0			0			0
+syscon	ip	INET_ADDRSTRLEN				0x10			0x10			0x10			0x10			0x10			0x10			0x10			22			# unix consensus
+
+syscon	ipport	IPPORT_USERRESERVED			5000			5000			5000			5000			5000			49151			5000			5000
+
 #	ptrace() codes
 #
 #	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
@@ -1103,8 +1052,8 @@ syscon	limits	_ARG_MAX				128*1024		128*1024		1024*1024		1024*1024		512*1024		51
 syscon	limits	_NAME_MAX				255			255			255			255			255			255			511			255			# probably higher on windows?
 syscon	limits	_PATH_MAX				4096			4096			1024			1024			1024			1024			1024			260			#
 syscon	limits	_NSIG					64			64			32			32			128			32			64			64			# _SIG_MAXSIG on FreeBSD
-syscon	limits	_MINSIGSTKSZ				2048			6144			8192			32768			6656			14336			8192			2048			# FreeBSD upscaled a bit for ARM
-syscon	limits	_SIGSTKSZ				10240			10240			131072			131072			36864			28672			40960			10240			#
+syscon	limits	_MINSIGSTKSZ				2048			2048			32768			32768			4096			12288			8192			2048			#
+syscon	limits	_SIGSTKSZ				8192			2048			131072			131072			36864			28672			40960			8192			#
 
 #	unmount() flags
 #	a.k.a. umount2() on linux
@@ -1137,12 +1086,32 @@ syscon	reboot	RB_NOSYNC				0x20000000		0x20000000		4			4			4			4			4			0x2000000
 syscon	msg	MSG_OOB					1			1			1			1			1			1			1			1			# consensus
 syscon	msg	MSG_PEEK				2			2			2			2			2			2			2			2			# consensus
 syscon	msg	MSG_DONTROUTE				4			4			4			4			4			4			4			4			# consensus
-syscon	msg	MSG_DONTWAIT				0x40			0x40			0x80			0x80			0x80			0x80			0x80			0x40			# send/sendto: manual non-blocking
+syscon	msg	MSG_FASTOPEN				0x20000000		0x20000000		0			0			0			0			0			0			# TODO
 syscon	msg	MSG_WAITALL				0x0100			0x0100			0x40			0x40			0x40			0x40			0x40			8			# bsd consensus
-syscon	msg	MSG_NOSIGNAL				0x4000			0x4000			0x80000			0x80000			0x020000		0x0400			0x0400			0x10000000		# send/sendto: don't raise sigpipe on local shutdown
+syscon	msg	MSG_MORE				0x8000			0x8000			0			0			0			0			0			0			# send/sendto: manual TCP_CORK hbasically
+syscon	msg	MSG_NOSIGNAL				0x4000			0x4000			0x80000			0x80000			0x020000		0x0400			0x0400			0			# send/sendto: don't SIGPIPE on EOF
+syscon	msg	MSG_DONTWAIT				0x40			0x40			0x80			0x80			0x80			0x80			0x80			0x40			# send/sendto: manual non-blocking
 syscon	msg	MSG_TRUNC				0x20			0x20			0x10			0x10			0x10			0x10			0x10			0x0100			# bsd consensus
 syscon	msg	MSG_CTRUNC				8			8			0x20			0x20			0x20			0x20			0x20			0x0200			# bsd consensus
-syscon	msg	MSG_FASTOPEN				0x20000000		0x20000000		-1			-1			-1			-1			-1			-1			#
+syscon	msg	MSG_ERRQUEUE				0x2000			0x2000			0			0			0			0			0			0x1000			# bsd consensus
+syscon	msg	MSG_NOERROR				0x1000			0x1000			0x1000			0x1000			0x1000			0x1000			0x1000			0			# unix consensus
+syscon	msg	MSG_EOR					0x80			0x80			8			8			8			8			8			0			# bsd consensus
+syscon	msg	MSG_CMSG_CLOEXEC			0x40000000		0x40000000		0			0			0x040000		0x0800			0x0800			0
+syscon	msg	MSG_WAITFORONE				0x010000		0x010000		0			0			0x080000		0			0x2000			0
+syscon	msg	MSG_BATCH				0x040000		0x040000		0			0			0			0			0			0
+syscon	msg	MSG_CONFIRM				0x0800			0x0800			0			0			0			0			0			0
+syscon	msg	MSG_EXCEPT				0x2000			0x2000			0			0			0			0			0			0
+syscon	msg	MSG_FIN					0x0200			0x0200			0x0100			0x0100			0x0100			0			0			0
+syscon	msg	MSG_EOF					0x0200			0x0200			0x0100			0x0100			0x0100			0			0			0
+syscon	msg	MSG_INFO				12			12			0			0			0			0			0			0
+syscon	msg	MSG_PARITY_ERROR			9			9			0			0			0			0			0			0
+syscon	msg	MSG_PROXY				0x10			0x10			0			0			0			0			0			0
+syscon	msg	MSG_RST					0x1000			0x1000			0			0			0			0			0			0
+syscon	msg	MSG_STAT				11			11			0			0			0			0			0			0
+syscon	msg	MSG_SYN					0x0400			0x0400			0			0			0			0			0			0
+syscon	msg	MSG_BCAST				0			0			0			0			0			0x100			0x100			0
+syscon	msg	MSG_MCAST				0			0			0			0			0			0x200			0x200			0
+syscon	msg	MSG_NOTIFICATION			0x8000			0x8000			0			0			0x2000			0			0x4000			0
 
 #	getpriority() / setpriority() magnums (a.k.a. nice)
 #
@@ -1414,6 +1383,16 @@ syscon	shm	SHM_NORESERVE				0x1000			0x1000			0			0			0			0			0			0
 syscon	shm	SHM_REMAP				0x4000			0x4000			0			0			0			0			0			0
 syscon	shm	SHM_ANON				0			0			0			0			1			0			0			0
 
+syscon	lock	LOCK_UNLOCK_CACHE			54			54			0			0			0			0			0			0			# wut
+
+syscon	misc	IP6F_MORE_FRAG				0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			# consensus
+syscon	misc	IP6F_OFF_MASK				0xf8ff			0xf8ff			0xf8ff			0xf8ff			0xf8ff			0xf8ff			0xf8ff			0xf8ff			# consensus
+syscon	misc	IP6F_RESERVED_MASK			0x0600			0x0600			0x0600			0x0600			0x0600			0x0600			0x0600			0x0600			# consensus
+
+syscon	misc	L_SET					0			0			0			0			0			0			0			0			# consensus
+syscon	misc	L_INCR					1			1			1			1			1			1			1			0			# unix consensus
+syscon	misc	L_XTND					2			2			2			2			2			2			2			0			# unix consensus
+
 syscon	misc	SHUT_RD					0			0			0			0			0			0			0			0			# consensus (SD_RECEIVE)
 syscon	misc	SHUT_WR					1			1			1			1			1			1			1			1			# consensus (SD_SEND)
 syscon	misc	SHUT_RDWR				2			2			2			2			2			2			2			2			# consensus (SD_BOTH)
@@ -1459,6 +1438,11 @@ syscon	misc	ACCT_COMM				0x10			0x10			0			0			0			0			0			0
 syscon	misc	COMMAND_COMPLETE			0			0			0			0			0			0			0			0			# consensus
 syscon	misc	COMMAND_TERMINATED			17			17			0			0			0			0			0			0
 
+syscon	select	FD_SETSIZE				0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			# forced consensus (0x40 on NT)
+
+syscon	misc	MATH_ERREXCEPT				2			2			2			2			2			2			2			0			# unix consensus
+syscon	misc	MATH_ERRNO				1			1			1			1			1			1			1			0			# unix consensus
+
 syscon	misc	MCAST_BLOCK_SOURCE			43			43			84			84			84			0			0			43
 syscon	misc	MCAST_JOIN_GROUP			42			42			80			80			80			0			0			41
 syscon	misc	MCAST_JOIN_SOURCE_GROUP			46			46			82			82			82			0			0			45
@@ -1927,8 +1911,180 @@ syscon	junkerr	EKEYREJECTED				129			129			-1			-1			-1			-1			-1			-1
 syscon	junkerr	ERFKILL					132			132			-1			-1			-1			-1			-1			-1
 syscon	junkerr	EHWPOISON				133			133			-1			-1			-1			-1			-1			-1
 
-syscon	select	FD_SETSIZE				0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			# forced consensus (0x40 on NT)
-syscon	misc	IOV_MAX					1024			1024			1024			1024			1024			1024			1024			16			# unix consensus & MSG_MAXIOVLEN
+#	arpanet fork combating human-induced exhaustion of our ipv4 address space
+#
+#	group	name					GNU/Systemd		GNU/Systemd (Aarch64)	XNU's Not UNIX!		MacOS (Arm64)		FreeBSD			OpenBSD			NetBSD			The New Technology	Commentary
+syscon	ipv6	IPV6_PMTUDISC_DONT			0			0			0			0			0			0			0			0			# consensus
+syscon	ipv6	IPV6_RTHDR_LOOSE			0			0			0			0			0			0			0			0			# consensus
+syscon	ipv6	IPV6_RTHDR_TYPE_0			0			0			0			0			0			0			0			0			# consensus
+syscon	ipv6	IPV6_CHECKSUM				7			7			26			26			26			26			26			26			# bsd consensus
+syscon	ipv6	IPV6_JOIN_GROUP				20			20			12			12			12			12			12			12			# bsd consensus
+syscon	ipv6	IPV6_LEAVE_GROUP			21			21			13			13			13			13			13			13			# bsd consensus
+syscon	ipv6	IPV6_MULTICAST_HOPS			18			18			10			10			10			10			10			10			# bsd consensus
+syscon	ipv6	IPV6_MULTICAST_IF			17			17			9			9			9			9			9			9			# bsd consensus
+syscon	ipv6	IPV6_MULTICAST_LOOP			19			19			11			11			11			11			11			11			# bsd consensus
+syscon	ipv6	IPV6_UNICAST_HOPS			0x10			0x10			4			4			4			4			4			4			# bsd consensus
+syscon	ipv6	IPV6_V6ONLY				26			26			27			27			27			27			27			27			# bsd consensus
+syscon	ipv6	IPV6_RECVTCLASS				66			66			35			35			57			57			57			40
+syscon	ipv6	IPV6_TCLASS				67			67			36			36			61			61			61			39
+syscon	ipv6	IPV6_DONTFRAG				62			62			0			0			62			62			62			14
+syscon	ipv6	IPV6_HOPLIMIT				52			52			0			0			47			47			47			21
+syscon	ipv6	IPV6_HOPOPTS				54			54			0			0			49			49			49			1
+syscon	ipv6	IPV6_PKTINFO				50			50			0			0			46			46			46			19
+syscon	ipv6	IPV6_RECVRTHDR				56			56			0			0			38			38			38			38
+syscon	ipv6	IPV6_RTHDR				57			57			0			0			51			51			51			0x20
+syscon	ipv6	IPV6_DSTOPTS				59			59			0			0			50			50			50			0
+syscon	ipv6	IPV6_IPSEC_POLICY			34			34			28			28			28			0			0			0
+syscon	ipv6	IPV6_NEXTHOP				9			9			0			0			48			48			48			0
+syscon	ipv6	IPV6_PATHMTU				61			61			0			0			44			44			44			0
+syscon	ipv6	IPV6_RECVDSTOPTS			58			58			0			0			40			40			40			0
+syscon	ipv6	IPV6_RECVHOPLIMIT			51			51			0			0			37			37			37			0
+syscon	ipv6	IPV6_RECVHOPOPTS			53			53			0			0			39			39			39			0
+syscon	ipv6	IPV6_RECVPATHMTU			60			60			0			0			43			43			43			0
+syscon	ipv6	IPV6_RECVPKTINFO			49			49			0			0			36			36			36			0
+syscon	ipv6	IPV6_RTHDRDSTOPTS			55			55			0			0			35			35			35			0
+syscon	ipv6	IPV6_RTHDR_STRICT			1			1			1			1			1			0			0			0
+syscon	ipv6	IPV6_ADD_MEMBERSHIP			20			20			0			0			0			0			0			12			# bsd consensus
+syscon	ipv6	IPV6_DROP_MEMBERSHIP			21			21			0			0			0			0			0			13			# bsd consensus
+syscon	ipv6	IPV6_HDRINCL				36			36			0			0			0			0			0			2			# bsd consensus
+syscon	ipv6	IPV6_MTU				24			24			0			0			0			0			0			72			# bsd consensus
+syscon	ipv6	IPV6_MTU_DISCOVER			23			23			0			0			0			0			0			71			# bsd consensus
+syscon	ipv6	IPV6_RECVERR				25			25			0			0			0			0			0			75			# bsd consensus
+syscon	ipv6	IPV6_2292DSTOPTS			4			4			23			23			0			0			0			0
+syscon	ipv6	IPV6_2292HOPLIMIT			8			8			20			20			0			0			0			0
+syscon	ipv6	IPV6_2292HOPOPTS			3			3			22			22			0			0			0			0
+syscon	ipv6	IPV6_2292PKTINFO			2			2			19			19			0			0			0			0
+syscon	ipv6	IPV6_2292PKTOPTIONS			6			6			25			25			0			0			0			0
+syscon	ipv6	IPV6_2292RTHDR				5			5			24			24			0			0			0			0
+syscon	ipv6	IPV6_AUTOFLOWLABEL			0			0			0			0			59			59			59			0
+syscon	ipv6	IPV6_ADDRFORM				1			1			0			0			0			0			0			0
+syscon	ipv6	IPV6_AUTHHDR				10			10			0			0			0			0			0			0
+syscon	ipv6	IPV6_JOIN_ANYCAST			27			27			0			0			0			0			0			0
+syscon	ipv6	IPV6_LEAVE_ANYCAST			28			28			0			0			0			0			0			0
+syscon	ipv6	IPV6_PMTUDISC_DO			2			2			0			0			0			0			0			0
+syscon	ipv6	IPV6_PMTUDISC_INTERFACE			4			4			0			0			0			0			0			0
+syscon	ipv6	IPV6_PMTUDISC_OMIT			5			5			0			0			0			0			0			0
+syscon	ipv6	IPV6_PMTUDISC_PROBE			3			3			0			0			0			0			0			0
+syscon	ipv6	IPV6_PMTUDISC_WANT			1			1			0			0			0			0			0			0
+syscon	ipv6	IPV6_ROUTER_ALERT			22			22			0			0			0			0			0			0
+syscon	ipv6	IPV6_RXDSTOPTS				59			59			0			0			0			0			0			0
+syscon	ipv6	IPV6_RXHOPOPTS				54			54			0			0			0			0			0			0
+syscon	ipv6	IPV6_XFRM_POLICY			35			35			0			0			0			0			0			0
+syscon	ipv6	IPV6_MINHOPCOUNT			0			0			0			0			0			65			65			0
+syscon	ipv6	IPV6_ORIGDSTADDR			0			0			0			0			72			0			0			0
+syscon	ipv6	IPV6_RECVORIGDSTADDR			0			0			0			0			72			0			0			0
+syscon	ipv6	INET6_ADDRSTRLEN			46			46			46			46			46			46			46			65			# unix consensus
+syscon	icmp6	ICMP6_DST_UNREACH_NOROUTE		0			0			0			0			0			0			0			0			# consensus
+syscon	icmp6	ICMP6_PARAMPROB_HEADER			0			0			0			0			0			0			0			0			# consensus
+syscon	icmp6	ICMP6_TIME_EXCEED_TRANSIT		0			0			0			0			0			0			0			0			# consensus
+syscon	icmp6	ICMP6_DST_UNREACH_ADMIN			1			1			1			1			1			1			1			1			# consensus
+syscon	icmp6	ICMP6_PARAMPROB_NEXTHEADER		1			1			1			1			1			1			1			1			# consensus
+syscon	icmp6	ICMP6_TIME_EXCEED_REASSEMBLY		1			1			1			1			1			1			1			1			# consensus
+syscon	icmp6	ICMP6_DST_UNREACH			1			1			1			1			1			1			1			0			# unix consensus
+syscon	icmp6	ICMP6_FILTER				1			1			18			18			18			18			18			0			# bsd consensus
+syscon	icmp6	ICMP6_DST_UNREACH_BEYONDSCOPE		2			2			2			2			2			2			2			2			# consensus
+syscon	icmp6	ICMP6_PARAMPROB_OPTION			2			2			2			2			2			2			2			2			# consensus
+syscon	icmp6	ICMP6_PACKET_TOO_BIG			2			2			2			2			2			2			2			0			# unix consensus
+syscon	icmp6	ICMP6_DST_UNREACH_ADDR			3			3			3			3			3			3			3			3			# consensus
+syscon	icmp6	ICMP6_TIME_EXCEEDED			3			3			3			3			3			3			3			0			# unix consensus
+syscon	icmp6	ICMP6_DST_UNREACH_NOPORT		4			4			4			4			4			4			4			4			# consensus
+syscon	icmp6	ICMP6_PARAM_PROB			4			4			4			4			4			4			4			0			# unix consensus
+syscon	icmp6	ICMP6_RR_FLAGS_PREVDONE			8			8			8			8			8			8			8			0			# unix consensus
+syscon	icmp6	ICMP6_RR_FLAGS_SPECSITE			0x10			0x10			0x10			0x10			0x10			0x10			0x10			0			# unix consensus
+syscon	icmp6	ICMP6_RR_PCOUSE_RAFLAGS_AUTO		0x10			0x10			0x40			0x40			0x40			0x40			0x40			0			# bsd consensus
+syscon	icmp6	ICMP6_RR_FLAGS_FORCEAPPLY		0x20			0x20			0x20			0x20			0x20			0x20			0x20			0			# unix consensus
+syscon	icmp6	ICMP6_RR_PCOUSE_RAFLAGS_ONLINK		0x20			0x20			0x80			0x80			0x80			0x80			0x80			0			# bsd consensus
+syscon	icmp6	ICMP6_RR_FLAGS_REQRESULT		0x40			0x40			0x40			0x40			0x40			0x40			0x40			0			# unix consensus
+syscon	icmp6	ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME	0x40			0x40			0x40			0x40			0x40			0x40			0x40			0			# unix consensus
+syscon	icmp6	ICMP6_INFOMSG_MASK			0x80			0x80			0x80			0x80			0x80			0x80			0x80			0x80			# consensus
+syscon	icmp6	ICMP6_ECHO_REQUEST			0x80			0x80			0x80			0x80			0x80			0x80			0x80			0			# unix consensus
+syscon	icmp6	ICMP6_RR_FLAGS_TEST			0x80			0x80			0x80			0x80			0x80			0x80			0x80			0			# unix consensus
+syscon	icmp6	ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME	0x80			0x80			0x80			0x80			0x80			0x80			0x80			0			# unix consensus
+syscon	icmp6	ICMP6_ECHO_REPLY			129			129			129			129			129			129			129			0			# unix consensus
+syscon	icmp6	ICMP6_ROUTER_RENUMBERING		138			138			138			138			138			138			138			0			# unix consensus
+syscon	icmp6	ICMP6_RR_RESULT_FLAGS_FORBIDDEN		0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0			# unix consensus
+syscon	icmp6	ICMP6_RR_RESULT_FLAGS_OOB		0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0			# unix consensus
+
+syscon	misc	FIFOTYPE				54			54			54			54			54			54			54			0			# unix consensus
+syscon	misc	GRPQUOTA				1			1			1			1			1			1			1			0			# unix consensus
+syscon	misc	IF_NAMESIZE				0x10			0x10			0x10			0x10			0x10			0x10			0x10			0			# unix consensus
+syscon	misc	INTERMEDIATE_C_GOOD			10			10			0			0			0			0			0			0
+syscon	misc	INTERMEDIATE_GOOD			8			8			0			0			0			0			0			0
+
+syscon	misc	IOV_MAX					0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			16			# unix consensus & MSG_MAXIOVLEN
+syscon	misc	LINE_MAX				0x0800			0x0800			0x0800			0x0800			0x0800			0x0800			0x0800			0			# unix consensus
+syscon	misc	LINKED_CMD_COMPLETE			10			10			0			0			0			0			0			0
+syscon	misc	LINKED_FLG_CMD_COMPLETE			11			11			0			0			0			0			0			0
+
+syscon	misc	LNKTYPE					49			49			49			49			49			49			49			0			# unix consensus
+syscon	misc	MAXNAMLEN				255			255			255			255			255			255			255			0			# unix consensus
+syscon	misc	MAXQUOTAS				2			2			2			2			2			2			2			0			# unix consensus
+syscon	misc	MEDIUM_ERROR				3			3			0			0			0			0			0			0
+syscon	misc	MEDIUM_SCAN				56			56			0			0			0			0			0			0
+
+syscon	misc	NBBY					8			8			8			8			8			8			8			0			# unix consensus
+syscon	misc	NR_DQHASH				43			43			0			0			0			0			0			0
+syscon	misc	NR_DQUOTS				0x0100			0x0100			0			0			0			0			0			0
+
+syscon	misc	PERSISTENT_RESERVE_IN			94			94			0			0			0			0			0			0
+syscon	misc	PERSISTENT_RESERVE_OUT			95			95			0			0			0			0			0			0
+
+syscon	misc	PRELIM					1			1			1			1			1			1			1			0			# unix consensus
+syscon	misc	REGTYPE					48			48			48			48			48			48			48			0			# unix consensus
+syscon	misc	RES_PRF_CLASS				4			4			4			4			4			4			4			0			# unix consensus
+syscon	misc	RHF_GUARANTEE_START_INIT		0x80			0x80			0			0			0			0			0			0
+syscon	misc	RHF_NO_LIBRARY_REPLACEMENT		4			4			0			0			0			0			0			0
+
+syscon	misc	RRQ					1			1			1			1			1			1			1			0			# unix consensus
+syscon	misc	RTF_NOFORWARD				0x1000			0x1000			0			0			0			0			0			0
+syscon	misc	RTF_NOPMTUDISC				0x4000			0x4000			0			0			0			0			0			0
+
+syscon	misc	SARMAG					8			8			8			8			8			8			8			0			# unix consensus
+syscon	misc	SEGSIZE					0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0			# unix consensus
+syscon	misc	SEND_DIAGNOSTIC				29			29			0			0			0			0			0			0
+syscon	misc	SEND_VOLUME_TAG				182			182			0			0			0			0			0			0
+
+syscon	misc	SET_LIMITS				51			51			0			0			0			0			0			0
+syscon	misc	SET_WINDOW				36			36			0			0			0			0			0			0
+
+syscon	misc	SFD_CLOEXEC				0x080000		0x080000		0			0			0			0			0			0
+syscon	misc	SFD_NONBLOCK				0x0800			0x0800			0			0			0			0			0			0
+
+syscon	misc	SUBCMDMASK				255			255			255			255			255			255			255			0			# unix consensus
+syscon	misc	SUBCMDSHIFT				8			8			8			8			8			8			8			0			# unix consensus
+syscon	misc	SYMTYPE					50			50			50			50			50			50			50			0			# unix consensus
+syscon	misc	TGEXEC					8			8			8			8			8			8			8			0			# unix consensus
+syscon	misc	TGREAD					0x20			0x20			0x20			0x20			0x20			0x20			0x20			0			# unix consensus
+syscon	misc	TGWRITE					0x10			0x10			0x10			0x10			0x10			0x10			0x10			0			# unix consensus
+syscon	misc	TMAGLEN					6			6			6			6			6			6			6			0			# unix consensus
+syscon	misc	TOEXEC					1			1			1			1			1			1			1			0			# unix consensus
+syscon	misc	TOREAD					4			4			4			4			4			4			4			0			# unix consensus
+syscon	misc	TOWRITE					2			2			2			2			2			2			2			0			# unix consensus
+syscon	misc	TRANSIENT				4			4			4			4			4			4			4			0			# unix consensus
+syscon	misc	TSGID					0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			0x0400			0			# unix consensus
+syscon	misc	TSUID					0x0800			0x0800			0x0800			0x0800			0x0800			0x0800			0x0800			0			# unix consensus
+syscon	misc	TSVTX					0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0x0200			0			# unix consensus
+syscon	misc	TUEXEC					0x40			0x40			0x40			0x40			0x40			0x40			0x40			0			# unix consensus
+syscon	misc	TUREAD					0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0x0100			0			# unix consensus
+syscon	misc	TUWRITE					0x80			0x80			0x80			0x80			0x80			0x80			0x80			0			# unix consensus
+syscon	misc	TVERSLEN				2			2			2			2			2			2			2			0			# unix consensus
+syscon	misc	WORD_BIT				0x20			0x20			0x20			0x20			0x20			0x20			0x20			0			# unix consensus
+syscon	misc	WRQ					2			2			2			2			2			2			2			0			# unix consensus
+syscon	misc	SIGEV_THREAD				2			2			3			3			2			0			2			0
+syscon	misc	SIGEV_SIGNAL				0			0			1			1			1			0			1			0
+syscon	misc	SIGEV_NONE				1			1			0			0			0			0			0			0
+
+syscon	misc	BC_BASE_MAX				99			99			99			99			99			0x7fffffff		0x7fffffff		0
+syscon	misc	BC_DIM_MAX				0x0800			0x0800			0x0800			0x0800			0x0800			0xffff			0xffff			0
+syscon	misc	BC_SCALE_MAX				99			99			99			99			99			0x7fffffff		0x7fffffff		0
+syscon	misc	BC_STRING_MAX				0x03e8			0x03e8			0x03e8			0x03e8			0x03e8			0x7fffffff		0x7fffffff		0
+
+syscon	misc	ABORTED_COMMAND				11			11			0			0			0			0			0			0
+syscon	misc	ACORE					0			0			8			8			8			8			8			0			# bsd consensus
+syscon	misc	AFORK					0			0			1			1			1			1			1			0			# bsd consensus
+syscon	misc	AIO_ALLDONE				2			2			1			1			3			0			0			0
+syscon	misc	AIO_NOTCANCELED				1			1			4			4			2			0			0			0
+syscon	misc	AIO_CANCELED				0			0			2			2			1			0			0			0
 
 #	baud rates
 #
@@ -1981,4 +2137,4 @@ syscon	misc	UL_SETFSIZE				2			2			2			2			2			0			0			0
 syscon	misc	XATTR_CREATE				1			1			2			2			0			0			0			0
 syscon	misc	XATTR_REPLACE				2			2			4			4			0			0			0			0
 
-# https://youtu.be/3SNBXoWs4GM
+# https://youtu.be/GUQUD3IMbb4?t=85
diff --git a/libc/sysv/consts/ABORTED_COMMAND.S b/libc/sysv/consts/ABORTED_COMMAND.S
new file mode 100644
index 000000000..afb9abbae
--- /dev/null
+++ b/libc/sysv/consts/ABORTED_COMMAND.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,ABORTED_COMMAND,11,11,0,0,0,0,0,0
diff --git a/libc/sysv/consts/ACORE.S b/libc/sysv/consts/ACORE.S
new file mode 100644
index 000000000..72447040a
--- /dev/null
+++ b/libc/sysv/consts/ACORE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,ACORE,0,0,8,8,8,8,8,0
diff --git a/libc/sysv/consts/AFORK.S b/libc/sysv/consts/AFORK.S
new file mode 100644
index 000000000..992cacc80
--- /dev/null
+++ b/libc/sysv/consts/AFORK.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,AFORK,0,0,1,1,1,1,1,0
diff --git a/libc/sysv/consts/AIO_ALLDONE.S b/libc/sysv/consts/AIO_ALLDONE.S
new file mode 100644
index 000000000..63584ff7a
--- /dev/null
+++ b/libc/sysv/consts/AIO_ALLDONE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,AIO_ALLDONE,2,2,1,1,3,0,0,0
diff --git a/libc/sysv/consts/AIO_CANCELED.S b/libc/sysv/consts/AIO_CANCELED.S
new file mode 100644
index 000000000..6124168e4
--- /dev/null
+++ b/libc/sysv/consts/AIO_CANCELED.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,AIO_CANCELED,0,0,2,2,1,0,0,0
diff --git a/libc/sysv/consts/AIO_NOTCANCELED.S b/libc/sysv/consts/AIO_NOTCANCELED.S
new file mode 100644
index 000000000..6ef85d564
--- /dev/null
+++ b/libc/sysv/consts/AIO_NOTCANCELED.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,AIO_NOTCANCELED,1,1,4,4,2,0,0,0
diff --git a/libc/sysv/consts/BC_BASE_MAX.S b/libc/sysv/consts/BC_BASE_MAX.S
new file mode 100644
index 000000000..b6bc0ec08
--- /dev/null
+++ b/libc/sysv/consts/BC_BASE_MAX.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,BC_BASE_MAX,99,99,99,99,99,0x7fffffff,0x7fffffff,0
diff --git a/libc/sysv/consts/BC_DIM_MAX.S b/libc/sysv/consts/BC_DIM_MAX.S
new file mode 100644
index 000000000..267a893bc
--- /dev/null
+++ b/libc/sysv/consts/BC_DIM_MAX.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,BC_DIM_MAX,0x0800,0x0800,0x0800,0x0800,0x0800,0xffff,0xffff,0
diff --git a/libc/sysv/consts/BC_SCALE_MAX.S b/libc/sysv/consts/BC_SCALE_MAX.S
new file mode 100644
index 000000000..c7fd64227
--- /dev/null
+++ b/libc/sysv/consts/BC_SCALE_MAX.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,BC_SCALE_MAX,99,99,99,99,99,0x7fffffff,0x7fffffff,0
diff --git a/libc/sysv/consts/BC_STRING_MAX.S b/libc/sysv/consts/BC_STRING_MAX.S
new file mode 100644
index 000000000..883450d84
--- /dev/null
+++ b/libc/sysv/consts/BC_STRING_MAX.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,BC_STRING_MAX,0x03e8,0x03e8,0x03e8,0x03e8,0x03e8,0x7fffffff,0x7fffffff,0
diff --git a/libc/sysv/consts/CLOCK_BOOTTIME.S b/libc/sysv/consts/CLOCK_BOOTTIME.S
index 966970d27..ead5f9008 100644
--- a/libc/sysv/consts/CLOCK_BOOTTIME.S
+++ b/libc/sysv/consts/CLOCK_BOOTTIME.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon clock,CLOCK_BOOTTIME,7,7,6,6,4,3,3,3
+.syscon clock,CLOCK_BOOTTIME,7,7,7,127,127,6,127,3
diff --git a/libc/sysv/consts/CLOCK_BOOTTIME_ALARM.S b/libc/sysv/consts/CLOCK_BOOTTIME_ALARM.S
new file mode 100644
index 000000000..88265bcc8
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_BOOTTIME_ALARM.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_BOOTTIME_ALARM,9,9,127,127,127,127,127,127
diff --git a/libc/sysv/consts/CLOCK_MONOTONIC.S b/libc/sysv/consts/CLOCK_MONOTONIC.S
index 66d85a317..2275c6cf1 100644
--- a/libc/sysv/consts/CLOCK_MONOTONIC.S
+++ b/libc/sysv/consts/CLOCK_MONOTONIC.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon clock,CLOCK_MONOTONIC,1,1,8,8,5,5,3,1
+.syscon clock,CLOCK_MONOTONIC,1,1,6,6,4,3,3,1
diff --git a/libc/sysv/consts/CLOCK_MONOTONIC_COARSE.S b/libc/sysv/consts/CLOCK_MONOTONIC_COARSE.S
index 494834f22..225972c1d 100644
--- a/libc/sysv/consts/CLOCK_MONOTONIC_COARSE.S
+++ b/libc/sysv/consts/CLOCK_MONOTONIC_COARSE.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon clock,CLOCK_MONOTONIC_COARSE,6,6,9,9,8,5,3,6
+.syscon clock,CLOCK_MONOTONIC_COARSE,6,6,5,5,12,3,3,1
diff --git a/libc/sysv/consts/CLOCK_MONOTONIC_FAST.S b/libc/sysv/consts/CLOCK_MONOTONIC_FAST.S
new file mode 100644
index 000000000..0069c82cf
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_MONOTONIC_FAST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_MONOTONIC_FAST,1,1,6,6,12,3,3,1
diff --git a/libc/sysv/consts/CLOCK_MONOTONIC_PRECISE.S b/libc/sysv/consts/CLOCK_MONOTONIC_PRECISE.S
new file mode 100644
index 000000000..e9e77f345
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_MONOTONIC_PRECISE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_MONOTONIC_PRECISE,1,1,6,6,11,3,3,1
diff --git a/libc/sysv/consts/CLOCK_MONOTONIC_RAW.S b/libc/sysv/consts/CLOCK_MONOTONIC_RAW.S
index 45dcd0e7d..1c158565e 100644
--- a/libc/sysv/consts/CLOCK_MONOTONIC_RAW.S
+++ b/libc/sysv/consts/CLOCK_MONOTONIC_RAW.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon clock,CLOCK_MONOTONIC_RAW,4,4,127,8,8,127,3,1
+.syscon clock,CLOCK_MONOTONIC_RAW,4,4,4,4,127,127,127,127
diff --git a/libc/sysv/consts/CLOCK_PROF.S b/libc/sysv/consts/CLOCK_PROF.S
new file mode 100644
index 000000000..a91213a17
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_PROF.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_PROF,127,127,127,127,2,127,2,127
diff --git a/libc/sysv/consts/CLOCK_REALTIME_ALARM.S b/libc/sysv/consts/CLOCK_REALTIME_ALARM.S
new file mode 100644
index 000000000..0d497e110
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_REALTIME_ALARM.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_REALTIME_ALARM,8,8,127,127,127,127,127,127
diff --git a/libc/sysv/consts/CLOCK_REALTIME_FAST.S b/libc/sysv/consts/CLOCK_REALTIME_FAST.S
new file mode 100644
index 000000000..fd0774e29
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_REALTIME_FAST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_REALTIME_FAST,0,0,0,0,10,0,0,0
diff --git a/libc/sysv/consts/CLOCK_REALTIME_PRECISE.S b/libc/sysv/consts/CLOCK_REALTIME_PRECISE.S
new file mode 100644
index 000000000..e3ce38e79
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_REALTIME_PRECISE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_REALTIME_PRECISE,0,0,0,0,9,0,0,0
diff --git a/libc/sysv/consts/CLOCK_SECOND.S b/libc/sysv/consts/CLOCK_SECOND.S
new file mode 100644
index 000000000..f39711a0c
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_SECOND.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_SECOND,127,127,127,127,13,127,127,127
diff --git a/libc/sysv/consts/CLOCK_TAI.S b/libc/sysv/consts/CLOCK_TAI.S
new file mode 100644
index 000000000..041e63585
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_TAI.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_TAI,11,11,127,127,127,127,127,127
diff --git a/libc/sysv/consts/CLOCK_UPTIME.S b/libc/sysv/consts/CLOCK_UPTIME.S
new file mode 100644
index 000000000..281eaa508
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_UPTIME.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_UPTIME,127,127,8,8,5,5,127,127
diff --git a/libc/sysv/consts/CLOCK_UPTIME_FAST.S b/libc/sysv/consts/CLOCK_UPTIME_FAST.S
new file mode 100644
index 000000000..3d1ecfd4c
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_UPTIME_FAST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_UPTIME_FAST,127,127,127,127,8,127,127,127
diff --git a/libc/sysv/consts/CLOCK_UPTIME_PRECISE.S b/libc/sysv/consts/CLOCK_UPTIME_PRECISE.S
new file mode 100644
index 000000000..581afe2dd
--- /dev/null
+++ b/libc/sysv/consts/CLOCK_UPTIME_PRECISE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon clock,CLOCK_UPTIME_PRECISE,127,127,127,127,7,127,127,127
diff --git a/libc/sysv/consts/FIFOTYPE.S b/libc/sysv/consts/FIFOTYPE.S
new file mode 100644
index 000000000..544cea69d
--- /dev/null
+++ b/libc/sysv/consts/FIFOTYPE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,FIFOTYPE,54,54,54,54,54,54,54,0
diff --git a/libc/sysv/consts/GRPQUOTA.S b/libc/sysv/consts/GRPQUOTA.S
new file mode 100644
index 000000000..0418f4462
--- /dev/null
+++ b/libc/sysv/consts/GRPQUOTA.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,GRPQUOTA,1,1,1,1,1,1,1,0
diff --git a/libc/sysv/consts/IF_NAMESIZE.S b/libc/sysv/consts/IF_NAMESIZE.S
new file mode 100644
index 000000000..314fb2084
--- /dev/null
+++ b/libc/sysv/consts/IF_NAMESIZE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,IF_NAMESIZE,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0
diff --git a/libc/sysv/consts/INET6_ADDRSTRLEN.S b/libc/sysv/consts/INET6_ADDRSTRLEN.S
new file mode 100644
index 000000000..df458d2b1
--- /dev/null
+++ b/libc/sysv/consts/INET6_ADDRSTRLEN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,INET6_ADDRSTRLEN,46,46,46,46,46,46,46,65
diff --git a/libc/sysv/consts/INET_ADDRSTRLEN.S b/libc/sysv/consts/INET_ADDRSTRLEN.S
new file mode 100644
index 000000000..ae2997334
--- /dev/null
+++ b/libc/sysv/consts/INET_ADDRSTRLEN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,INET_ADDRSTRLEN,0x10,0x10,0x10,0x10,0x10,0x10,0x10,22
diff --git a/libc/sysv/consts/INTERMEDIATE_C_GOOD.S b/libc/sysv/consts/INTERMEDIATE_C_GOOD.S
new file mode 100644
index 000000000..ced82c458
--- /dev/null
+++ b/libc/sysv/consts/INTERMEDIATE_C_GOOD.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,INTERMEDIATE_C_GOOD,10,10,0,0,0,0,0,0
diff --git a/libc/sysv/consts/INTERMEDIATE_GOOD.S b/libc/sysv/consts/INTERMEDIATE_GOOD.S
new file mode 100644
index 000000000..9a500bbf2
--- /dev/null
+++ b/libc/sysv/consts/INTERMEDIATE_GOOD.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,INTERMEDIATE_GOOD,8,8,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IOV_MAX.S b/libc/sysv/consts/IOV_MAX.S
index bc75cf2c3..120790cc7 100644
--- a/libc/sysv/consts/IOV_MAX.S
+++ b/libc/sysv/consts/IOV_MAX.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon misc,IOV_MAX,1024,1024,1024,1024,1024,1024,1024,16
+.syscon misc,IOV_MAX,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,16
diff --git a/libc/sysv/consts/IP6F_MORE_FRAG.S b/libc/sysv/consts/IP6F_MORE_FRAG.S
new file mode 100644
index 000000000..cfa0f7681
--- /dev/null
+++ b/libc/sysv/consts/IP6F_MORE_FRAG.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,IP6F_MORE_FRAG,0x0100,0x0100,0x0100,0x0100,0x0100,0x0100,0x0100,0x0100
diff --git a/libc/sysv/consts/IP6F_OFF_MASK.S b/libc/sysv/consts/IP6F_OFF_MASK.S
new file mode 100644
index 000000000..1c927ebfe
--- /dev/null
+++ b/libc/sysv/consts/IP6F_OFF_MASK.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,IP6F_OFF_MASK,0xf8ff,0xf8ff,0xf8ff,0xf8ff,0xf8ff,0xf8ff,0xf8ff,0xf8ff
diff --git a/libc/sysv/consts/IP6F_RESERVED_MASK.S b/libc/sysv/consts/IP6F_RESERVED_MASK.S
new file mode 100644
index 000000000..19997c100
--- /dev/null
+++ b/libc/sysv/consts/IP6F_RESERVED_MASK.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,IP6F_RESERVED_MASK,0x0600,0x0600,0x0600,0x0600,0x0600,0x0600,0x0600,0x0600
diff --git a/libc/sysv/consts/IPV6_2292DSTOPTS.S b/libc/sysv/consts/IPV6_2292DSTOPTS.S
new file mode 100644
index 000000000..2a6dbbf2e
--- /dev/null
+++ b/libc/sysv/consts/IPV6_2292DSTOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_2292DSTOPTS,4,4,23,23,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_2292HOPLIMIT.S b/libc/sysv/consts/IPV6_2292HOPLIMIT.S
new file mode 100644
index 000000000..26e86fd91
--- /dev/null
+++ b/libc/sysv/consts/IPV6_2292HOPLIMIT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_2292HOPLIMIT,8,8,20,20,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_2292HOPOPTS.S b/libc/sysv/consts/IPV6_2292HOPOPTS.S
new file mode 100644
index 000000000..e10f84c01
--- /dev/null
+++ b/libc/sysv/consts/IPV6_2292HOPOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_2292HOPOPTS,3,3,22,22,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_2292PKTINFO.S b/libc/sysv/consts/IPV6_2292PKTINFO.S
new file mode 100644
index 000000000..1673a96d6
--- /dev/null
+++ b/libc/sysv/consts/IPV6_2292PKTINFO.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_2292PKTINFO,2,2,19,19,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_2292PKTOPTIONS.S b/libc/sysv/consts/IPV6_2292PKTOPTIONS.S
new file mode 100644
index 000000000..069cf9603
--- /dev/null
+++ b/libc/sysv/consts/IPV6_2292PKTOPTIONS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_2292PKTOPTIONS,6,6,25,25,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_2292RTHDR.S b/libc/sysv/consts/IPV6_2292RTHDR.S
new file mode 100644
index 000000000..1fa611ef1
--- /dev/null
+++ b/libc/sysv/consts/IPV6_2292RTHDR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_2292RTHDR,5,5,24,24,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_ADDRFORM.S b/libc/sysv/consts/IPV6_ADDRFORM.S
new file mode 100644
index 000000000..dd171ca8f
--- /dev/null
+++ b/libc/sysv/consts/IPV6_ADDRFORM.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_ADDRFORM,1,1,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_ADD_MEMBERSHIP.S b/libc/sysv/consts/IPV6_ADD_MEMBERSHIP.S
new file mode 100644
index 000000000..d65647e8b
--- /dev/null
+++ b/libc/sysv/consts/IPV6_ADD_MEMBERSHIP.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_ADD_MEMBERSHIP,20,20,0,0,0,0,0,12
diff --git a/libc/sysv/consts/IPV6_AUTHHDR.S b/libc/sysv/consts/IPV6_AUTHHDR.S
new file mode 100644
index 000000000..8e9919123
--- /dev/null
+++ b/libc/sysv/consts/IPV6_AUTHHDR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_AUTHHDR,10,10,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_AUTOFLOWLABEL.S b/libc/sysv/consts/IPV6_AUTOFLOWLABEL.S
new file mode 100644
index 000000000..d086a55b7
--- /dev/null
+++ b/libc/sysv/consts/IPV6_AUTOFLOWLABEL.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_AUTOFLOWLABEL,0,0,0,0,59,59,59,0
diff --git a/libc/sysv/consts/IPV6_DROP_MEMBERSHIP.S b/libc/sysv/consts/IPV6_DROP_MEMBERSHIP.S
new file mode 100644
index 000000000..3cc6ff63b
--- /dev/null
+++ b/libc/sysv/consts/IPV6_DROP_MEMBERSHIP.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_DROP_MEMBERSHIP,21,21,0,0,0,0,0,13
diff --git a/libc/sysv/consts/IPV6_DSTOPTS.S b/libc/sysv/consts/IPV6_DSTOPTS.S
new file mode 100644
index 000000000..6ca099353
--- /dev/null
+++ b/libc/sysv/consts/IPV6_DSTOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_DSTOPTS,59,59,0,0,50,50,50,0
diff --git a/libc/sysv/consts/IPV6_HDRINCL.S b/libc/sysv/consts/IPV6_HDRINCL.S
new file mode 100644
index 000000000..a2efd7c08
--- /dev/null
+++ b/libc/sysv/consts/IPV6_HDRINCL.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_HDRINCL,36,36,0,0,0,0,0,2
diff --git a/libc/sysv/consts/IPV6_IPSEC_POLICY.S b/libc/sysv/consts/IPV6_IPSEC_POLICY.S
new file mode 100644
index 000000000..d02507ea3
--- /dev/null
+++ b/libc/sysv/consts/IPV6_IPSEC_POLICY.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_IPSEC_POLICY,34,34,28,28,28,0,0,0
diff --git a/libc/sysv/consts/IPV6_JOIN_ANYCAST.S b/libc/sysv/consts/IPV6_JOIN_ANYCAST.S
new file mode 100644
index 000000000..7dc3a28cd
--- /dev/null
+++ b/libc/sysv/consts/IPV6_JOIN_ANYCAST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_JOIN_ANYCAST,27,27,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_LEAVE_ANYCAST.S b/libc/sysv/consts/IPV6_LEAVE_ANYCAST.S
new file mode 100644
index 000000000..59bfa4c19
--- /dev/null
+++ b/libc/sysv/consts/IPV6_LEAVE_ANYCAST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_LEAVE_ANYCAST,28,28,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_MINHOPCOUNT.S b/libc/sysv/consts/IPV6_MINHOPCOUNT.S
new file mode 100644
index 000000000..7cc084a51
--- /dev/null
+++ b/libc/sysv/consts/IPV6_MINHOPCOUNT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_MINHOPCOUNT,0,0,0,0,0,65,65,0
diff --git a/libc/sysv/consts/IPV6_MTU.S b/libc/sysv/consts/IPV6_MTU.S
new file mode 100644
index 000000000..17264a0f5
--- /dev/null
+++ b/libc/sysv/consts/IPV6_MTU.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_MTU,24,24,0,0,0,0,0,72
diff --git a/libc/sysv/consts/IPV6_MTU_DISCOVER.S b/libc/sysv/consts/IPV6_MTU_DISCOVER.S
new file mode 100644
index 000000000..c2bb09127
--- /dev/null
+++ b/libc/sysv/consts/IPV6_MTU_DISCOVER.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_MTU_DISCOVER,23,23,0,0,0,0,0,71
diff --git a/libc/sysv/consts/IPV6_NEXTHOP.S b/libc/sysv/consts/IPV6_NEXTHOP.S
new file mode 100644
index 000000000..66707028f
--- /dev/null
+++ b/libc/sysv/consts/IPV6_NEXTHOP.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_NEXTHOP,9,9,0,0,48,48,48,0
diff --git a/libc/sysv/consts/IPV6_ORIGDSTADDR.S b/libc/sysv/consts/IPV6_ORIGDSTADDR.S
new file mode 100644
index 000000000..d4b63d6fb
--- /dev/null
+++ b/libc/sysv/consts/IPV6_ORIGDSTADDR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_ORIGDSTADDR,0,0,0,0,72,0,0,0
diff --git a/libc/sysv/consts/IPV6_PATHMTU.S b/libc/sysv/consts/IPV6_PATHMTU.S
new file mode 100644
index 000000000..451946e08
--- /dev/null
+++ b/libc/sysv/consts/IPV6_PATHMTU.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_PATHMTU,61,61,0,0,44,44,44,0
diff --git a/libc/sysv/consts/IPV6_PMTUDISC_DO.S b/libc/sysv/consts/IPV6_PMTUDISC_DO.S
new file mode 100644
index 000000000..01bec78bd
--- /dev/null
+++ b/libc/sysv/consts/IPV6_PMTUDISC_DO.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_PMTUDISC_DO,2,2,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_PMTUDISC_DONT.S b/libc/sysv/consts/IPV6_PMTUDISC_DONT.S
new file mode 100644
index 000000000..f9463aa4c
--- /dev/null
+++ b/libc/sysv/consts/IPV6_PMTUDISC_DONT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_PMTUDISC_DONT,0,0,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_PMTUDISC_INTERFACE.S b/libc/sysv/consts/IPV6_PMTUDISC_INTERFACE.S
new file mode 100644
index 000000000..cd2a558a8
--- /dev/null
+++ b/libc/sysv/consts/IPV6_PMTUDISC_INTERFACE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_PMTUDISC_INTERFACE,4,4,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_PMTUDISC_OMIT.S b/libc/sysv/consts/IPV6_PMTUDISC_OMIT.S
new file mode 100644
index 000000000..99d88d940
--- /dev/null
+++ b/libc/sysv/consts/IPV6_PMTUDISC_OMIT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_PMTUDISC_OMIT,5,5,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_PMTUDISC_PROBE.S b/libc/sysv/consts/IPV6_PMTUDISC_PROBE.S
new file mode 100644
index 000000000..ab10f54f4
--- /dev/null
+++ b/libc/sysv/consts/IPV6_PMTUDISC_PROBE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_PMTUDISC_PROBE,3,3,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_PMTUDISC_WANT.S b/libc/sysv/consts/IPV6_PMTUDISC_WANT.S
new file mode 100644
index 000000000..e9bf56f99
--- /dev/null
+++ b/libc/sysv/consts/IPV6_PMTUDISC_WANT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_PMTUDISC_WANT,1,1,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_RECVDSTOPTS.S b/libc/sysv/consts/IPV6_RECVDSTOPTS.S
new file mode 100644
index 000000000..cd0aa3257
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RECVDSTOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RECVDSTOPTS,58,58,0,0,40,40,40,0
diff --git a/libc/sysv/consts/IPV6_RECVERR.S b/libc/sysv/consts/IPV6_RECVERR.S
new file mode 100644
index 000000000..1c36df28f
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RECVERR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RECVERR,25,25,0,0,0,0,0,75
diff --git a/libc/sysv/consts/IPV6_RECVHOPLIMIT.S b/libc/sysv/consts/IPV6_RECVHOPLIMIT.S
new file mode 100644
index 000000000..2e7b97112
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RECVHOPLIMIT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RECVHOPLIMIT,51,51,0,0,37,37,37,0
diff --git a/libc/sysv/consts/IPV6_RECVHOPOPTS.S b/libc/sysv/consts/IPV6_RECVHOPOPTS.S
new file mode 100644
index 000000000..d2bb20708
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RECVHOPOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RECVHOPOPTS,53,53,0,0,39,39,39,0
diff --git a/libc/sysv/consts/IPV6_RECVORIGDSTADDR.S b/libc/sysv/consts/IPV6_RECVORIGDSTADDR.S
new file mode 100644
index 000000000..c4e179301
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RECVORIGDSTADDR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RECVORIGDSTADDR,0,0,0,0,72,0,0,0
diff --git a/libc/sysv/consts/IPV6_RECVPATHMTU.S b/libc/sysv/consts/IPV6_RECVPATHMTU.S
new file mode 100644
index 000000000..4a8fdc77d
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RECVPATHMTU.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RECVPATHMTU,60,60,0,0,43,43,43,0
diff --git a/libc/sysv/consts/IPV6_RECVPKTINFO.S b/libc/sysv/consts/IPV6_RECVPKTINFO.S
new file mode 100644
index 000000000..49141d5f1
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RECVPKTINFO.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RECVPKTINFO,49,49,0,0,36,36,36,0
diff --git a/libc/sysv/consts/IPV6_ROUTER_ALERT.S b/libc/sysv/consts/IPV6_ROUTER_ALERT.S
new file mode 100644
index 000000000..ea8557cd9
--- /dev/null
+++ b/libc/sysv/consts/IPV6_ROUTER_ALERT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_ROUTER_ALERT,22,22,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_RTHDR.S b/libc/sysv/consts/IPV6_RTHDR.S
index eddbf9ed8..7e464ebd8 100644
--- a/libc/sysv/consts/IPV6_RTHDR.S
+++ b/libc/sysv/consts/IPV6_RTHDR.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon ipv6,IPV6_RTHDR,57,57,0,0,51,51,51,32
+.syscon ipv6,IPV6_RTHDR,57,57,0,0,51,51,51,0x20
diff --git a/libc/sysv/consts/IPV6_RTHDRDSTOPTS.S b/libc/sysv/consts/IPV6_RTHDRDSTOPTS.S
new file mode 100644
index 000000000..1df68a921
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RTHDRDSTOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RTHDRDSTOPTS,55,55,0,0,35,35,35,0
diff --git a/libc/sysv/consts/IPV6_RTHDR_LOOSE.S b/libc/sysv/consts/IPV6_RTHDR_LOOSE.S
new file mode 100644
index 000000000..a61ea0a6e
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RTHDR_LOOSE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RTHDR_LOOSE,0,0,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_RTHDR_STRICT.S b/libc/sysv/consts/IPV6_RTHDR_STRICT.S
new file mode 100644
index 000000000..52e7e5561
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RTHDR_STRICT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RTHDR_STRICT,1,1,1,1,1,0,0,0
diff --git a/libc/sysv/consts/IPV6_RTHDR_TYPE_0.S b/libc/sysv/consts/IPV6_RTHDR_TYPE_0.S
new file mode 100644
index 000000000..79bbce89d
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RTHDR_TYPE_0.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RTHDR_TYPE_0,0,0,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_RXDSTOPTS.S b/libc/sysv/consts/IPV6_RXDSTOPTS.S
new file mode 100644
index 000000000..81f8647a3
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RXDSTOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RXDSTOPTS,59,59,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_RXHOPOPTS.S b/libc/sysv/consts/IPV6_RXHOPOPTS.S
new file mode 100644
index 000000000..a5a089d0f
--- /dev/null
+++ b/libc/sysv/consts/IPV6_RXHOPOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_RXHOPOPTS,54,54,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IPV6_UNICAST_HOPS.S b/libc/sysv/consts/IPV6_UNICAST_HOPS.S
index 40a3a4686..dd3c23fbe 100644
--- a/libc/sysv/consts/IPV6_UNICAST_HOPS.S
+++ b/libc/sysv/consts/IPV6_UNICAST_HOPS.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon ipv6,IPV6_UNICAST_HOPS,16,16,4,4,4,4,4,4
+.syscon ipv6,IPV6_UNICAST_HOPS,0x10,0x10,4,4,4,4,4,4
diff --git a/libc/sysv/consts/IPV6_XFRM_POLICY.S b/libc/sysv/consts/IPV6_XFRM_POLICY.S
new file mode 100644
index 000000000..bf31a4bd6
--- /dev/null
+++ b/libc/sysv/consts/IPV6_XFRM_POLICY.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ipv6,IPV6_XFRM_POLICY,35,35,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_ADD_SOURCE_MEMBERSHIP.S b/libc/sysv/consts/IP_ADD_SOURCE_MEMBERSHIP.S
new file mode 100644
index 000000000..fc8e01943
--- /dev/null
+++ b/libc/sysv/consts/IP_ADD_SOURCE_MEMBERSHIP.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_ADD_SOURCE_MEMBERSHIP,39,39,70,70,70,0,0,15
diff --git a/libc/sysv/consts/IP_BIND_ADDRESS_NO_PORT.S b/libc/sysv/consts/IP_BIND_ADDRESS_NO_PORT.S
new file mode 100644
index 000000000..ac616ee14
--- /dev/null
+++ b/libc/sysv/consts/IP_BIND_ADDRESS_NO_PORT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_BIND_ADDRESS_NO_PORT,24,24,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_BLOCK_SOURCE.S b/libc/sysv/consts/IP_BLOCK_SOURCE.S
new file mode 100644
index 000000000..3a00c226f
--- /dev/null
+++ b/libc/sysv/consts/IP_BLOCK_SOURCE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_BLOCK_SOURCE,38,38,72,72,72,0,0,17
diff --git a/libc/sysv/consts/IP_CHECKSUM.S b/libc/sysv/consts/IP_CHECKSUM.S
new file mode 100644
index 000000000..3e1cb4251
--- /dev/null
+++ b/libc/sysv/consts/IP_CHECKSUM.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_CHECKSUM,23,23,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_DEFAULT_MULTICAST_LOOP.S b/libc/sysv/consts/IP_DEFAULT_MULTICAST_LOOP.S
new file mode 100644
index 000000000..7bc60ef5d
--- /dev/null
+++ b/libc/sysv/consts/IP_DEFAULT_MULTICAST_LOOP.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_DEFAULT_MULTICAST_LOOP,1,1,1,1,1,1,1,1
diff --git a/libc/sysv/consts/IP_DEFAULT_MULTICAST_TTL.S b/libc/sysv/consts/IP_DEFAULT_MULTICAST_TTL.S
new file mode 100644
index 000000000..2685d71c8
--- /dev/null
+++ b/libc/sysv/consts/IP_DEFAULT_MULTICAST_TTL.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_DEFAULT_MULTICAST_TTL,1,1,1,1,1,1,1,1
diff --git a/libc/sysv/consts/IP_DROP_SOURCE_MEMBERSHIP.S b/libc/sysv/consts/IP_DROP_SOURCE_MEMBERSHIP.S
new file mode 100644
index 000000000..9e070555d
--- /dev/null
+++ b/libc/sysv/consts/IP_DROP_SOURCE_MEMBERSHIP.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_DROP_SOURCE_MEMBERSHIP,40,40,71,71,71,0,0,16
diff --git a/libc/sysv/consts/IP_FREEBIND.S b/libc/sysv/consts/IP_FREEBIND.S
new file mode 100644
index 000000000..db3eb4129
--- /dev/null
+++ b/libc/sysv/consts/IP_FREEBIND.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_FREEBIND,15,15,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_IPSEC_POLICY.S b/libc/sysv/consts/IP_IPSEC_POLICY.S
new file mode 100644
index 000000000..d5b73607d
--- /dev/null
+++ b/libc/sysv/consts/IP_IPSEC_POLICY.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_IPSEC_POLICY,0x10,0x10,21,21,21,0,0,0
diff --git a/libc/sysv/consts/IP_MAX_MEMBERSHIPS.S b/libc/sysv/consts/IP_MAX_MEMBERSHIPS.S
new file mode 100644
index 000000000..648c49960
--- /dev/null
+++ b/libc/sysv/consts/IP_MAX_MEMBERSHIPS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_MAX_MEMBERSHIPS,20,20,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,20
diff --git a/libc/sysv/consts/IP_MINTTL.S b/libc/sysv/consts/IP_MINTTL.S
new file mode 100644
index 000000000..5295aa259
--- /dev/null
+++ b/libc/sysv/consts/IP_MINTTL.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_MINTTL,21,21,0,0,66,32,24,0
diff --git a/libc/sysv/consts/IP_MSFILTER.S b/libc/sysv/consts/IP_MSFILTER.S
new file mode 100644
index 000000000..07628afaa
--- /dev/null
+++ b/libc/sysv/consts/IP_MSFILTER.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_MSFILTER,41,41,74,74,74,0,0,0
diff --git a/libc/sysv/consts/IP_MTU_DISCOVER.S b/libc/sysv/consts/IP_MTU_DISCOVER.S
new file mode 100644
index 000000000..b86381b44
--- /dev/null
+++ b/libc/sysv/consts/IP_MTU_DISCOVER.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_MTU_DISCOVER,10,10,0,0,0,0,0,71
diff --git a/libc/sysv/consts/IP_MULTICAST_ALL.S b/libc/sysv/consts/IP_MULTICAST_ALL.S
new file mode 100644
index 000000000..58533ee80
--- /dev/null
+++ b/libc/sysv/consts/IP_MULTICAST_ALL.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_MULTICAST_ALL,49,49,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_MULTICAST_IF.S b/libc/sysv/consts/IP_MULTICAST_IF.S
index 1e2e5e93c..d1cdf8a7d 100644
--- a/libc/sysv/consts/IP_MULTICAST_IF.S
+++ b/libc/sysv/consts/IP_MULTICAST_IF.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon ip,IP_MULTICAST_IF,32,32,9,9,9,9,9,9
+.syscon ip,IP_MULTICAST_IF,0x20,0x20,9,9,9,9,9,9
diff --git a/libc/sysv/consts/IP_NODEFRAG.S b/libc/sysv/consts/IP_NODEFRAG.S
new file mode 100644
index 000000000..c70a3ba11
--- /dev/null
+++ b/libc/sysv/consts/IP_NODEFRAG.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_NODEFRAG,22,22,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_ORIGDSTADDR.S b/libc/sysv/consts/IP_ORIGDSTADDR.S
new file mode 100644
index 000000000..60293c97a
--- /dev/null
+++ b/libc/sysv/consts/IP_ORIGDSTADDR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_ORIGDSTADDR,20,20,0,0,27,0,0,0
diff --git a/libc/sysv/consts/IP_PASSSEC.S b/libc/sysv/consts/IP_PASSSEC.S
new file mode 100644
index 000000000..baa3afcfe
--- /dev/null
+++ b/libc/sysv/consts/IP_PASSSEC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PASSSEC,18,18,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PKTOPTIONS.S b/libc/sysv/consts/IP_PKTOPTIONS.S
new file mode 100644
index 000000000..fdeefe4d6
--- /dev/null
+++ b/libc/sysv/consts/IP_PKTOPTIONS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PKTOPTIONS,9,9,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PMTUDISC.S b/libc/sysv/consts/IP_PMTUDISC.S
new file mode 100644
index 000000000..f96b04640
--- /dev/null
+++ b/libc/sysv/consts/IP_PMTUDISC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PMTUDISC,10,10,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PMTUDISC_DO.S b/libc/sysv/consts/IP_PMTUDISC_DO.S
new file mode 100644
index 000000000..1dfb7eff0
--- /dev/null
+++ b/libc/sysv/consts/IP_PMTUDISC_DO.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PMTUDISC_DO,2,2,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PMTUDISC_DONT.S b/libc/sysv/consts/IP_PMTUDISC_DONT.S
new file mode 100644
index 000000000..2eca2c25f
--- /dev/null
+++ b/libc/sysv/consts/IP_PMTUDISC_DONT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PMTUDISC_DONT,0,0,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PMTUDISC_INTERFACE.S b/libc/sysv/consts/IP_PMTUDISC_INTERFACE.S
new file mode 100644
index 000000000..bf21b44ec
--- /dev/null
+++ b/libc/sysv/consts/IP_PMTUDISC_INTERFACE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PMTUDISC_INTERFACE,4,4,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PMTUDISC_OMIT.S b/libc/sysv/consts/IP_PMTUDISC_OMIT.S
new file mode 100644
index 000000000..737719692
--- /dev/null
+++ b/libc/sysv/consts/IP_PMTUDISC_OMIT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PMTUDISC_OMIT,5,5,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PMTUDISC_PROBE.S b/libc/sysv/consts/IP_PMTUDISC_PROBE.S
new file mode 100644
index 000000000..a8f3d6963
--- /dev/null
+++ b/libc/sysv/consts/IP_PMTUDISC_PROBE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PMTUDISC_PROBE,3,3,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_PMTUDISC_WANT.S b/libc/sysv/consts/IP_PMTUDISC_WANT.S
new file mode 100644
index 000000000..6e6a2a910
--- /dev/null
+++ b/libc/sysv/consts/IP_PMTUDISC_WANT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_PMTUDISC_WANT,1,1,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_RECVDSTADDR.S b/libc/sysv/consts/IP_RECVDSTADDR.S
new file mode 100644
index 000000000..f2c5257ba
--- /dev/null
+++ b/libc/sysv/consts/IP_RECVDSTADDR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_RECVDSTADDR,0,0,7,7,7,7,7,0
diff --git a/libc/sysv/consts/IP_RECVERR.S b/libc/sysv/consts/IP_RECVERR.S
new file mode 100644
index 000000000..0e861735a
--- /dev/null
+++ b/libc/sysv/consts/IP_RECVERR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_RECVERR,11,11,0,0,0,0,0,75
diff --git a/libc/sysv/consts/IP_RECVOPTS.S b/libc/sysv/consts/IP_RECVOPTS.S
new file mode 100644
index 000000000..76f152929
--- /dev/null
+++ b/libc/sysv/consts/IP_RECVOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_RECVOPTS,6,6,5,5,5,5,5,0
diff --git a/libc/sysv/consts/IP_RECVORIGDSTADDR.S b/libc/sysv/consts/IP_RECVORIGDSTADDR.S
new file mode 100644
index 000000000..ebcad0265
--- /dev/null
+++ b/libc/sysv/consts/IP_RECVORIGDSTADDR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_RECVORIGDSTADDR,20,20,0,0,27,0,0,0
diff --git a/libc/sysv/consts/IP_RECVRETOPTS.S b/libc/sysv/consts/IP_RECVRETOPTS.S
new file mode 100644
index 000000000..2eb706438
--- /dev/null
+++ b/libc/sysv/consts/IP_RECVRETOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_RECVRETOPTS,7,7,6,6,6,6,6,0
diff --git a/libc/sysv/consts/IP_RETOPTS.S b/libc/sysv/consts/IP_RETOPTS.S
new file mode 100644
index 000000000..8056e7551
--- /dev/null
+++ b/libc/sysv/consts/IP_RETOPTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_RETOPTS,7,7,8,8,8,8,8,0
diff --git a/libc/sysv/consts/IP_ROUTER_ALERT.S b/libc/sysv/consts/IP_ROUTER_ALERT.S
new file mode 100644
index 000000000..f4306aef7
--- /dev/null
+++ b/libc/sysv/consts/IP_ROUTER_ALERT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_ROUTER_ALERT,5,5,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_TRANSPARENT.S b/libc/sysv/consts/IP_TRANSPARENT.S
new file mode 100644
index 000000000..9dab2efa4
--- /dev/null
+++ b/libc/sysv/consts/IP_TRANSPARENT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_TRANSPARENT,19,19,0,0,0,0,0,0
diff --git a/libc/sysv/consts/IP_UNBLOCK_SOURCE.S b/libc/sysv/consts/IP_UNBLOCK_SOURCE.S
new file mode 100644
index 000000000..c33465f09
--- /dev/null
+++ b/libc/sysv/consts/IP_UNBLOCK_SOURCE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_UNBLOCK_SOURCE,37,37,73,73,73,0,0,18
diff --git a/libc/sysv/consts/IP_UNICAST_IF.S b/libc/sysv/consts/IP_UNICAST_IF.S
new file mode 100644
index 000000000..8e6ffbb85
--- /dev/null
+++ b/libc/sysv/consts/IP_UNICAST_IF.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_UNICAST_IF,50,50,0,0,0,0,0,31
diff --git a/libc/sysv/consts/IP_XFRM_POLICY.S b/libc/sysv/consts/IP_XFRM_POLICY.S
new file mode 100644
index 000000000..c5ac226bc
--- /dev/null
+++ b/libc/sysv/consts/IP_XFRM_POLICY.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon ip,IP_XFRM_POLICY,17,17,0,0,0,0,0,0
diff --git a/libc/sysv/consts/LINE_MAX.S b/libc/sysv/consts/LINE_MAX.S
new file mode 100644
index 000000000..38afae7a3
--- /dev/null
+++ b/libc/sysv/consts/LINE_MAX.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,LINE_MAX,0x0800,0x0800,0x0800,0x0800,0x0800,0x0800,0x0800,0
diff --git a/libc/sysv/consts/LINKED_CMD_COMPLETE.S b/libc/sysv/consts/LINKED_CMD_COMPLETE.S
new file mode 100644
index 000000000..cbfd020ff
--- /dev/null
+++ b/libc/sysv/consts/LINKED_CMD_COMPLETE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,LINKED_CMD_COMPLETE,10,10,0,0,0,0,0,0
diff --git a/libc/sysv/consts/LINKED_FLG_CMD_COMPLETE.S b/libc/sysv/consts/LINKED_FLG_CMD_COMPLETE.S
new file mode 100644
index 000000000..e1b654e9c
--- /dev/null
+++ b/libc/sysv/consts/LINKED_FLG_CMD_COMPLETE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,LINKED_FLG_CMD_COMPLETE,11,11,0,0,0,0,0,0
diff --git a/libc/sysv/consts/LNKTYPE.S b/libc/sysv/consts/LNKTYPE.S
new file mode 100644
index 000000000..5c92f4df1
--- /dev/null
+++ b/libc/sysv/consts/LNKTYPE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,LNKTYPE,49,49,49,49,49,49,49,0
diff --git a/libc/sysv/consts/LOCK_UNLOCK_CACHE.S b/libc/sysv/consts/LOCK_UNLOCK_CACHE.S
new file mode 100644
index 000000000..79f898e0f
--- /dev/null
+++ b/libc/sysv/consts/LOCK_UNLOCK_CACHE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon lock,LOCK_UNLOCK_CACHE,54,54,0,0,0,0,0,0
diff --git a/libc/sysv/consts/L_INCR.S b/libc/sysv/consts/L_INCR.S
new file mode 100644
index 000000000..7fa6739c4
--- /dev/null
+++ b/libc/sysv/consts/L_INCR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,L_INCR,1,1,1,1,1,1,1,0
diff --git a/libc/sysv/consts/L_SET.S b/libc/sysv/consts/L_SET.S
new file mode 100644
index 000000000..17fa679f8
--- /dev/null
+++ b/libc/sysv/consts/L_SET.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,L_SET,0,0,0,0,0,0,0,0
diff --git a/libc/sysv/consts/L_XTND.S b/libc/sysv/consts/L_XTND.S
new file mode 100644
index 000000000..d21b49d21
--- /dev/null
+++ b/libc/sysv/consts/L_XTND.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,L_XTND,2,2,2,2,2,2,2,0
diff --git a/libc/sysv/consts/MAP_NOFORK.S b/libc/sysv/consts/MAP_NOFORK.S
new file mode 100644
index 000000000..04b0363b6
--- /dev/null
+++ b/libc/sysv/consts/MAP_NOFORK.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon mmap,MAP_NOFORK,0,0,0,0,0,0,0,0x10000000
diff --git a/libc/sysv/consts/MATH_ERREXCEPT.S b/libc/sysv/consts/MATH_ERREXCEPT.S
new file mode 100644
index 000000000..b4b21e1d4
--- /dev/null
+++ b/libc/sysv/consts/MATH_ERREXCEPT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,MATH_ERREXCEPT,2,2,2,2,2,2,2,0
diff --git a/libc/sysv/consts/MATH_ERRNO.S b/libc/sysv/consts/MATH_ERRNO.S
new file mode 100644
index 000000000..1feafb8f4
--- /dev/null
+++ b/libc/sysv/consts/MATH_ERRNO.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,MATH_ERRNO,1,1,1,1,1,1,1,0
diff --git a/libc/sysv/consts/MAXNAMLEN.S b/libc/sysv/consts/MAXNAMLEN.S
new file mode 100644
index 000000000..39e41287f
--- /dev/null
+++ b/libc/sysv/consts/MAXNAMLEN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,MAXNAMLEN,255,255,255,255,255,255,255,0
diff --git a/libc/sysv/consts/MAXQUOTAS.S b/libc/sysv/consts/MAXQUOTAS.S
new file mode 100644
index 000000000..6f75185e0
--- /dev/null
+++ b/libc/sysv/consts/MAXQUOTAS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,MAXQUOTAS,2,2,2,2,2,2,2,0
diff --git a/libc/sysv/consts/MEDIUM_ERROR.S b/libc/sysv/consts/MEDIUM_ERROR.S
new file mode 100644
index 000000000..5197ba6e3
--- /dev/null
+++ b/libc/sysv/consts/MEDIUM_ERROR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,MEDIUM_ERROR,3,3,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MEDIUM_SCAN.S b/libc/sysv/consts/MEDIUM_SCAN.S
new file mode 100644
index 000000000..abe6fdede
--- /dev/null
+++ b/libc/sysv/consts/MEDIUM_SCAN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,MEDIUM_SCAN,56,56,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_BATCH.S b/libc/sysv/consts/MSG_BATCH.S
new file mode 100644
index 000000000..e171f4f2f
--- /dev/null
+++ b/libc/sysv/consts/MSG_BATCH.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_BATCH,0x040000,0x040000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_BCAST.S b/libc/sysv/consts/MSG_BCAST.S
new file mode 100644
index 000000000..082634257
--- /dev/null
+++ b/libc/sysv/consts/MSG_BCAST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_BCAST,0,0,0,0,0,0x100,0x100,0
diff --git a/libc/sysv/consts/MSG_CMSG_CLOEXEC.S b/libc/sysv/consts/MSG_CMSG_CLOEXEC.S
new file mode 100644
index 000000000..ea66034e3
--- /dev/null
+++ b/libc/sysv/consts/MSG_CMSG_CLOEXEC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_CMSG_CLOEXEC,0x40000000,0x40000000,0,0,0x040000,0x0800,0x0800,0
diff --git a/libc/sysv/consts/MSG_CONFIRM.S b/libc/sysv/consts/MSG_CONFIRM.S
new file mode 100644
index 000000000..1948be63a
--- /dev/null
+++ b/libc/sysv/consts/MSG_CONFIRM.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_CONFIRM,0x0800,0x0800,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_EOF.S b/libc/sysv/consts/MSG_EOF.S
new file mode 100644
index 000000000..3e8168a2a
--- /dev/null
+++ b/libc/sysv/consts/MSG_EOF.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_EOF,0x0200,0x0200,0x0100,0x0100,0x0100,0,0,0
diff --git a/libc/sysv/consts/MSG_EOR.S b/libc/sysv/consts/MSG_EOR.S
new file mode 100644
index 000000000..fe773b16a
--- /dev/null
+++ b/libc/sysv/consts/MSG_EOR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_EOR,0x80,0x80,8,8,8,8,8,0
diff --git a/libc/sysv/consts/MSG_ERRQUEUE.S b/libc/sysv/consts/MSG_ERRQUEUE.S
new file mode 100644
index 000000000..02f1934ff
--- /dev/null
+++ b/libc/sysv/consts/MSG_ERRQUEUE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_ERRQUEUE,0x2000,0x2000,0,0,0,0,0,0x1000
diff --git a/libc/sysv/consts/MSG_EXCEPT.S b/libc/sysv/consts/MSG_EXCEPT.S
new file mode 100644
index 000000000..da072f0a9
--- /dev/null
+++ b/libc/sysv/consts/MSG_EXCEPT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_EXCEPT,0x2000,0x2000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_FASTOPEN.S b/libc/sysv/consts/MSG_FASTOPEN.S
index 17aa7bab6..ff034a75b 100644
--- a/libc/sysv/consts/MSG_FASTOPEN.S
+++ b/libc/sysv/consts/MSG_FASTOPEN.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon msg,MSG_FASTOPEN,0x20000000,0x20000000,-1,-1,-1,-1,-1,-1
+.syscon msg,MSG_FASTOPEN,0x20000000,0x20000000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_FIN.S b/libc/sysv/consts/MSG_FIN.S
new file mode 100644
index 000000000..f0631e333
--- /dev/null
+++ b/libc/sysv/consts/MSG_FIN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_FIN,0x0200,0x0200,0x0100,0x0100,0x0100,0,0,0
diff --git a/libc/sysv/consts/MSG_INFO.S b/libc/sysv/consts/MSG_INFO.S
new file mode 100644
index 000000000..d882e3a75
--- /dev/null
+++ b/libc/sysv/consts/MSG_INFO.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_INFO,12,12,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_MCAST.S b/libc/sysv/consts/MSG_MCAST.S
new file mode 100644
index 000000000..0f68f5321
--- /dev/null
+++ b/libc/sysv/consts/MSG_MCAST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_MCAST,0,0,0,0,0,0x200,0x200,0
diff --git a/libc/sysv/consts/MSG_MORE.S b/libc/sysv/consts/MSG_MORE.S
new file mode 100644
index 000000000..fa8717d4f
--- /dev/null
+++ b/libc/sysv/consts/MSG_MORE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_MORE,0x8000,0x8000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_NOERROR.S b/libc/sysv/consts/MSG_NOERROR.S
new file mode 100644
index 000000000..22764bb8d
--- /dev/null
+++ b/libc/sysv/consts/MSG_NOERROR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_NOERROR,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0
diff --git a/libc/sysv/consts/MSG_NOSIGNAL.S b/libc/sysv/consts/MSG_NOSIGNAL.S
index 1a0539eef..48be7e922 100644
--- a/libc/sysv/consts/MSG_NOSIGNAL.S
+++ b/libc/sysv/consts/MSG_NOSIGNAL.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon msg,MSG_NOSIGNAL,0x4000,0x4000,0x80000,0x80000,0x020000,0x0400,0x0400,0x10000000
+.syscon msg,MSG_NOSIGNAL,0x4000,0x4000,0x80000,0x80000,0x020000,0x0400,0x0400,0
diff --git a/libc/sysv/consts/MSG_NOTIFICATION.S b/libc/sysv/consts/MSG_NOTIFICATION.S
new file mode 100644
index 000000000..7503b5a50
--- /dev/null
+++ b/libc/sysv/consts/MSG_NOTIFICATION.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_NOTIFICATION,0x8000,0x8000,0,0,0x2000,0,0x4000,0
diff --git a/libc/sysv/consts/MSG_PARITY_ERROR.S b/libc/sysv/consts/MSG_PARITY_ERROR.S
new file mode 100644
index 000000000..7abd7fafd
--- /dev/null
+++ b/libc/sysv/consts/MSG_PARITY_ERROR.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_PARITY_ERROR,9,9,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_PROXY.S b/libc/sysv/consts/MSG_PROXY.S
new file mode 100644
index 000000000..e6216ce64
--- /dev/null
+++ b/libc/sysv/consts/MSG_PROXY.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_PROXY,0x10,0x10,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_RST.S b/libc/sysv/consts/MSG_RST.S
new file mode 100644
index 000000000..3bed0246b
--- /dev/null
+++ b/libc/sysv/consts/MSG_RST.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_RST,0x1000,0x1000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_STAT.S b/libc/sysv/consts/MSG_STAT.S
new file mode 100644
index 000000000..cdee6a110
--- /dev/null
+++ b/libc/sysv/consts/MSG_STAT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_STAT,11,11,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_SYN.S b/libc/sysv/consts/MSG_SYN.S
new file mode 100644
index 000000000..b57fbeb86
--- /dev/null
+++ b/libc/sysv/consts/MSG_SYN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_SYN,0x0400,0x0400,0,0,0,0,0,0
diff --git a/libc/sysv/consts/MSG_WAITFORONE.S b/libc/sysv/consts/MSG_WAITFORONE.S
new file mode 100644
index 000000000..e89b311e2
--- /dev/null
+++ b/libc/sysv/consts/MSG_WAITFORONE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon msg,MSG_WAITFORONE,0x010000,0x010000,0,0,0x080000,0,0x2000,0
diff --git a/libc/sysv/consts/NBBY.S b/libc/sysv/consts/NBBY.S
new file mode 100644
index 000000000..72b368fb5
--- /dev/null
+++ b/libc/sysv/consts/NBBY.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,NBBY,8,8,8,8,8,8,8,0
diff --git a/libc/sysv/consts/NR_DQHASH.S b/libc/sysv/consts/NR_DQHASH.S
new file mode 100644
index 000000000..c485c1cd4
--- /dev/null
+++ b/libc/sysv/consts/NR_DQHASH.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,NR_DQHASH,43,43,0,0,0,0,0,0
diff --git a/libc/sysv/consts/NR_DQUOTS.S b/libc/sysv/consts/NR_DQUOTS.S
new file mode 100644
index 000000000..43ac329ef
--- /dev/null
+++ b/libc/sysv/consts/NR_DQUOTS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,NR_DQUOTS,0x0100,0x0100,0,0,0,0,0,0
diff --git a/libc/sysv/consts/PERSISTENT_RESERVE_IN.S b/libc/sysv/consts/PERSISTENT_RESERVE_IN.S
new file mode 100644
index 000000000..de777b3c6
--- /dev/null
+++ b/libc/sysv/consts/PERSISTENT_RESERVE_IN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,PERSISTENT_RESERVE_IN,94,94,0,0,0,0,0,0
diff --git a/libc/sysv/consts/PERSISTENT_RESERVE_OUT.S b/libc/sysv/consts/PERSISTENT_RESERVE_OUT.S
new file mode 100644
index 000000000..791a63475
--- /dev/null
+++ b/libc/sysv/consts/PERSISTENT_RESERVE_OUT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,PERSISTENT_RESERVE_OUT,95,95,0,0,0,0,0,0
diff --git a/libc/sysv/consts/PRELIM.S b/libc/sysv/consts/PRELIM.S
new file mode 100644
index 000000000..0d97154e6
--- /dev/null
+++ b/libc/sysv/consts/PRELIM.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,PRELIM,1,1,1,1,1,1,1,0
diff --git a/libc/sysv/consts/REGTYPE.S b/libc/sysv/consts/REGTYPE.S
new file mode 100644
index 000000000..23ea4bcc2
--- /dev/null
+++ b/libc/sysv/consts/REGTYPE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,REGTYPE,48,48,48,48,48,48,48,0
diff --git a/libc/sysv/consts/RES_PRF_CLASS.S b/libc/sysv/consts/RES_PRF_CLASS.S
new file mode 100644
index 000000000..ec3d33e78
--- /dev/null
+++ b/libc/sysv/consts/RES_PRF_CLASS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,RES_PRF_CLASS,4,4,4,4,4,4,4,0
diff --git a/libc/sysv/consts/RHF_GUARANTEE_START_INIT.S b/libc/sysv/consts/RHF_GUARANTEE_START_INIT.S
new file mode 100644
index 000000000..6cfba6bb4
--- /dev/null
+++ b/libc/sysv/consts/RHF_GUARANTEE_START_INIT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,RHF_GUARANTEE_START_INIT,0x80,0x80,0,0,0,0,0,0
diff --git a/libc/sysv/consts/RHF_NO_LIBRARY_REPLACEMENT.S b/libc/sysv/consts/RHF_NO_LIBRARY_REPLACEMENT.S
new file mode 100644
index 000000000..7129c23c6
--- /dev/null
+++ b/libc/sysv/consts/RHF_NO_LIBRARY_REPLACEMENT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,RHF_NO_LIBRARY_REPLACEMENT,4,4,0,0,0,0,0,0
diff --git a/libc/sysv/consts/RLIMIT_AS.S b/libc/sysv/consts/RLIMIT_AS.S
index 7c5fc850c..03c20c065 100644
--- a/libc/sysv/consts/RLIMIT_AS.S
+++ b/libc/sysv/consts/RLIMIT_AS.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon rlimit,RLIMIT_AS,9,9,5,5,10,2,10,0
+.syscon rlimit,RLIMIT_AS,9	,9,5,5,10,2,10,0
diff --git a/libc/sysv/consts/RRQ.S b/libc/sysv/consts/RRQ.S
new file mode 100644
index 000000000..2d714fad7
--- /dev/null
+++ b/libc/sysv/consts/RRQ.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,RRQ,1,1,1,1,1,1,1,0
diff --git a/libc/sysv/consts/RTF_NOFORWARD.S b/libc/sysv/consts/RTF_NOFORWARD.S
new file mode 100644
index 000000000..bd1a57d66
--- /dev/null
+++ b/libc/sysv/consts/RTF_NOFORWARD.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,RTF_NOFORWARD,0x1000,0x1000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/RTF_NOPMTUDISC.S b/libc/sysv/consts/RTF_NOPMTUDISC.S
new file mode 100644
index 000000000..b31c84352
--- /dev/null
+++ b/libc/sysv/consts/RTF_NOPMTUDISC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,RTF_NOPMTUDISC,0x4000,0x4000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SARMAG.S b/libc/sysv/consts/SARMAG.S
new file mode 100644
index 000000000..cae2eee96
--- /dev/null
+++ b/libc/sysv/consts/SARMAG.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SARMAG,8,8,8,8,8,8,8,0
diff --git a/libc/sysv/consts/SEGSIZE.S b/libc/sysv/consts/SEGSIZE.S
new file mode 100644
index 000000000..ae243d93f
--- /dev/null
+++ b/libc/sysv/consts/SEGSIZE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SEGSIZE,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0
diff --git a/libc/sysv/consts/SEND_DIAGNOSTIC.S b/libc/sysv/consts/SEND_DIAGNOSTIC.S
new file mode 100644
index 000000000..83da87ff2
--- /dev/null
+++ b/libc/sysv/consts/SEND_DIAGNOSTIC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SEND_DIAGNOSTIC,29,29,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SEND_VOLUME_TAG.S b/libc/sysv/consts/SEND_VOLUME_TAG.S
new file mode 100644
index 000000000..4303e011d
--- /dev/null
+++ b/libc/sysv/consts/SEND_VOLUME_TAG.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SEND_VOLUME_TAG,182,182,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SET_LIMITS.S b/libc/sysv/consts/SET_LIMITS.S
new file mode 100644
index 000000000..de5e6f1b8
--- /dev/null
+++ b/libc/sysv/consts/SET_LIMITS.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SET_LIMITS,51,51,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SET_WINDOW.S b/libc/sysv/consts/SET_WINDOW.S
new file mode 100644
index 000000000..5c39025c8
--- /dev/null
+++ b/libc/sysv/consts/SET_WINDOW.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SET_WINDOW,36,36,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SFD_CLOEXEC.S b/libc/sysv/consts/SFD_CLOEXEC.S
new file mode 100644
index 000000000..8cdb0e1ce
--- /dev/null
+++ b/libc/sysv/consts/SFD_CLOEXEC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SFD_CLOEXEC,0x080000,0x080000,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SFD_NONBLOCK.S b/libc/sysv/consts/SFD_NONBLOCK.S
new file mode 100644
index 000000000..7c81b0a84
--- /dev/null
+++ b/libc/sysv/consts/SFD_NONBLOCK.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SFD_NONBLOCK,0x0800,0x0800,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SIGEV_NONE.S b/libc/sysv/consts/SIGEV_NONE.S
new file mode 100644
index 000000000..f36a315c9
--- /dev/null
+++ b/libc/sysv/consts/SIGEV_NONE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SIGEV_NONE,1,1,0,0,0,0,0,0
diff --git a/libc/sysv/consts/SIGEV_SIGNAL.S b/libc/sysv/consts/SIGEV_SIGNAL.S
new file mode 100644
index 000000000..083f4d54c
--- /dev/null
+++ b/libc/sysv/consts/SIGEV_SIGNAL.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SIGEV_SIGNAL,0,0,1,1,1,0,1,0
diff --git a/libc/sysv/consts/SIGEV_THREAD.S b/libc/sysv/consts/SIGEV_THREAD.S
new file mode 100644
index 000000000..0b4173693
--- /dev/null
+++ b/libc/sysv/consts/SIGEV_THREAD.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SIGEV_THREAD,2,2,3,3,2,0,2,0
diff --git a/libc/sysv/consts/SO_BROADCAST.S b/libc/sysv/consts/SO_BROADCAST.S
index 7b8652b81..47cf8c307 100644
--- a/libc/sysv/consts/SO_BROADCAST.S
+++ b/libc/sysv/consts/SO_BROADCAST.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon so,SO_BROADCAST,6,6,32,32,32,32,32,32
+.syscon so,SO_BROADCAST,6,6,0x20,0x20,0x20,0x20,0x20,0x20
diff --git a/libc/sysv/consts/SO_DONTROUTE.S b/libc/sysv/consts/SO_DONTROUTE.S
index 4bd9d3746..0b29ceb10 100644
--- a/libc/sysv/consts/SO_DONTROUTE.S
+++ b/libc/sysv/consts/SO_DONTROUTE.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon so,SO_DONTROUTE,5,5,16,16,16,16,16,16
+.syscon so,SO_DONTROUTE,5,5,0x10,0x10,0x10,0x10,0x10,0x10
diff --git a/libc/sysv/consts/SO_LINGER.S b/libc/sysv/consts/SO_LINGER.S
index f839d09e0..6b40dbf92 100644
--- a/libc/sysv/consts/SO_LINGER.S
+++ b/libc/sysv/consts/SO_LINGER.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon so,SO_LINGER,13,13,4224,4224,128,128,128,128
+.syscon so,SO_LINGER,13,13,0x1080,0x1080,0x80,0x80,0x80,0x80
diff --git a/libc/sysv/consts/SO_OOBINLINE.S b/libc/sysv/consts/SO_OOBINLINE.S
index 0d3690b39..b7fa38130 100644
--- a/libc/sysv/consts/SO_OOBINLINE.S
+++ b/libc/sysv/consts/SO_OOBINLINE.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon so,SO_OOBINLINE,10,10,256,256,256,256,256,256
+.syscon so,SO_OOBINLINE,10,10,0x0100,0x0100,0x0100,0x0100,0x0100,0x0100
diff --git a/libc/sysv/consts/SO_REUSEPORT.S b/libc/sysv/consts/SO_REUSEPORT.S
index ea4cdc97d..c1b3e8dd0 100644
--- a/libc/sysv/consts/SO_REUSEPORT.S
+++ b/libc/sysv/consts/SO_REUSEPORT.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon so,SO_REUSEPORT,15,15,512,512,512,512,512,0
+.syscon so,SO_REUSEPORT,15,15,0x0200,0x0200,0x0200,0x0200,0x0200,0
diff --git a/libc/sysv/consts/SO_USELOOPBACK.S b/libc/sysv/consts/SO_USELOOPBACK.S
index 6b58c7841..c9b46d3f7 100644
--- a/libc/sysv/consts/SO_USELOOPBACK.S
+++ b/libc/sysv/consts/SO_USELOOPBACK.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon so,SO_USELOOPBACK,0,0,64,64,64,64,64,64
+.syscon so,SO_USELOOPBACK,0,0,0x40,0x40,0x40,0x40,0x40,0x40
diff --git a/libc/sysv/consts/SUBCMDMASK.S b/libc/sysv/consts/SUBCMDMASK.S
new file mode 100644
index 000000000..84d9e6756
--- /dev/null
+++ b/libc/sysv/consts/SUBCMDMASK.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SUBCMDMASK,255,255,255,255,255,255,255,0
diff --git a/libc/sysv/consts/SUBCMDSHIFT.S b/libc/sysv/consts/SUBCMDSHIFT.S
new file mode 100644
index 000000000..bb41322bc
--- /dev/null
+++ b/libc/sysv/consts/SUBCMDSHIFT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SUBCMDSHIFT,8,8,8,8,8,8,8,0
diff --git a/libc/sysv/consts/SYMTYPE.S b/libc/sysv/consts/SYMTYPE.S
new file mode 100644
index 000000000..a285acd5e
--- /dev/null
+++ b/libc/sysv/consts/SYMTYPE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,SYMTYPE,50,50,50,50,50,50,50,0
diff --git a/libc/sysv/consts/TCP_CONGESTION.S b/libc/sysv/consts/TCP_CONGESTION.S
index 5817b15c7..aed94356e 100644
--- a/libc/sysv/consts/TCP_CONGESTION.S
+++ b/libc/sysv/consts/TCP_CONGESTION.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon tcp,TCP_CONGESTION,13,13,0,0,64,0,0,0
+.syscon tcp,TCP_CONGESTION,13,13,0,0,0x40,0,0,0
diff --git a/libc/sysv/consts/TCP_INFO.S b/libc/sysv/consts/TCP_INFO.S
index ed51f0771..a1aba3722 100644
--- a/libc/sysv/consts/TCP_INFO.S
+++ b/libc/sysv/consts/TCP_INFO.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon tcp,TCP_INFO,11,11,0x200,0x200,32,9,9,0
+.syscon tcp,TCP_INFO,11,11,0,0,0x20,0,9,0
diff --git a/libc/sysv/consts/TCP_KEEPCNT.S b/libc/sysv/consts/TCP_KEEPCNT.S
index 4bc84b47a..97c36dcfc 100644
--- a/libc/sysv/consts/TCP_KEEPCNT.S
+++ b/libc/sysv/consts/TCP_KEEPCNT.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon tcp,TCP_KEEPCNT,6,6,0x102,0x102,0x400,0,6,16
+.syscon tcp,TCP_KEEPCNT,6,6,0x102,0x102,0x400,0,6,0
diff --git a/libc/sysv/consts/TCP_KEEPIDLE.S b/libc/sysv/consts/TCP_KEEPIDLE.S
index f5886f4cc..5751644c2 100644
--- a/libc/sysv/consts/TCP_KEEPIDLE.S
+++ b/libc/sysv/consts/TCP_KEEPIDLE.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon tcp,TCP_KEEPIDLE,4,4,0,0,0x100,0,3,3
+.syscon tcp,TCP_KEEPIDLE,4,4,0,0,0x100,0,3,0
diff --git a/libc/sysv/consts/TCP_KEEPINTVL.S b/libc/sysv/consts/TCP_KEEPINTVL.S
index 6f94429df..3b94ca585 100644
--- a/libc/sysv/consts/TCP_KEEPINTVL.S
+++ b/libc/sysv/consts/TCP_KEEPINTVL.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon tcp,TCP_KEEPINTVL,5,5,0x101,0x101,0x200,0,5,17
+.syscon tcp,TCP_KEEPINTVL,5,5,0x101,0x101,0x200,0,5,0
diff --git a/libc/sysv/consts/TCP_MD5SIG.S b/libc/sysv/consts/TCP_MD5SIG.S
index fc1388aa4..7ee7876b3 100644
--- a/libc/sysv/consts/TCP_MD5SIG.S
+++ b/libc/sysv/consts/TCP_MD5SIG.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon tcp,TCP_MD5SIG,14,14,0,0,16,4,16,0
+.syscon tcp,TCP_MD5SIG,14,14,0,0,0x10,4,16,0
diff --git a/libc/sysv/consts/TGEXEC.S b/libc/sysv/consts/TGEXEC.S
new file mode 100644
index 000000000..67216cff5
--- /dev/null
+++ b/libc/sysv/consts/TGEXEC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TGEXEC,8,8,8,8,8,8,8,0
diff --git a/libc/sysv/consts/TGREAD.S b/libc/sysv/consts/TGREAD.S
new file mode 100644
index 000000000..0d8ac6ba5
--- /dev/null
+++ b/libc/sysv/consts/TGREAD.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TGREAD,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
diff --git a/libc/sysv/consts/TGWRITE.S b/libc/sysv/consts/TGWRITE.S
new file mode 100644
index 000000000..f4b54f296
--- /dev/null
+++ b/libc/sysv/consts/TGWRITE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TGWRITE,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0
diff --git a/libc/sysv/consts/TMAGLEN.S b/libc/sysv/consts/TMAGLEN.S
new file mode 100644
index 000000000..132be740f
--- /dev/null
+++ b/libc/sysv/consts/TMAGLEN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TMAGLEN,6,6,6,6,6,6,6,0
diff --git a/libc/sysv/consts/TOEXEC.S b/libc/sysv/consts/TOEXEC.S
new file mode 100644
index 000000000..55f00072e
--- /dev/null
+++ b/libc/sysv/consts/TOEXEC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TOEXEC,1,1,1,1,1,1,1,0
diff --git a/libc/sysv/consts/TOREAD.S b/libc/sysv/consts/TOREAD.S
new file mode 100644
index 000000000..802694d71
--- /dev/null
+++ b/libc/sysv/consts/TOREAD.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TOREAD,4,4,4,4,4,4,4,0
diff --git a/libc/sysv/consts/TOWRITE.S b/libc/sysv/consts/TOWRITE.S
new file mode 100644
index 000000000..14bb0c7ca
--- /dev/null
+++ b/libc/sysv/consts/TOWRITE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TOWRITE,2,2,2,2,2,2,2,0
diff --git a/libc/sysv/consts/TRANSIENT.S b/libc/sysv/consts/TRANSIENT.S
new file mode 100644
index 000000000..c7f7807e2
--- /dev/null
+++ b/libc/sysv/consts/TRANSIENT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TRANSIENT,4,4,4,4,4,4,4,0
diff --git a/libc/sysv/consts/TSGID.S b/libc/sysv/consts/TSGID.S
new file mode 100644
index 000000000..253a91ac7
--- /dev/null
+++ b/libc/sysv/consts/TSGID.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TSGID,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,0x0400,0
diff --git a/libc/sysv/consts/TSUID.S b/libc/sysv/consts/TSUID.S
new file mode 100644
index 000000000..c369b647b
--- /dev/null
+++ b/libc/sysv/consts/TSUID.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TSUID,0x0800,0x0800,0x0800,0x0800,0x0800,0x0800,0x0800,0
diff --git a/libc/sysv/consts/TSVTX.S b/libc/sysv/consts/TSVTX.S
new file mode 100644
index 000000000..8269bb0ff
--- /dev/null
+++ b/libc/sysv/consts/TSVTX.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TSVTX,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200,0
diff --git a/libc/sysv/consts/TUEXEC.S b/libc/sysv/consts/TUEXEC.S
new file mode 100644
index 000000000..0a810cd15
--- /dev/null
+++ b/libc/sysv/consts/TUEXEC.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TUEXEC,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0
diff --git a/libc/sysv/consts/TUREAD.S b/libc/sysv/consts/TUREAD.S
new file mode 100644
index 000000000..aa151e389
--- /dev/null
+++ b/libc/sysv/consts/TUREAD.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TUREAD,0x0100,0x0100,0x0100,0x0100,0x0100,0x0100,0x0100,0
diff --git a/libc/sysv/consts/TUWRITE.S b/libc/sysv/consts/TUWRITE.S
new file mode 100644
index 000000000..46e307643
--- /dev/null
+++ b/libc/sysv/consts/TUWRITE.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TUWRITE,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0
diff --git a/libc/sysv/consts/TVERSLEN.S b/libc/sysv/consts/TVERSLEN.S
new file mode 100644
index 000000000..d09006b08
--- /dev/null
+++ b/libc/sysv/consts/TVERSLEN.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,TVERSLEN,2,2,2,2,2,2,2,0
diff --git a/libc/sysv/consts/WORD_BIT.S b/libc/sysv/consts/WORD_BIT.S
new file mode 100644
index 000000000..1dc138b99
--- /dev/null
+++ b/libc/sysv/consts/WORD_BIT.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,WORD_BIT,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
diff --git a/libc/sysv/consts/WRQ.S b/libc/sysv/consts/WRQ.S
new file mode 100644
index 000000000..c101d655b
--- /dev/null
+++ b/libc/sysv/consts/WRQ.S
@@ -0,0 +1,2 @@
+#include "libc/sysv/consts/syscon.internal.h"
+.syscon misc,WRQ,2,2,2,2,2,2,2,0
diff --git a/libc/sysv/consts/_MINSIGSTKSZ.S b/libc/sysv/consts/_MINSIGSTKSZ.S
index 94d7efe53..b55cbcca9 100644
--- a/libc/sysv/consts/_MINSIGSTKSZ.S
+++ b/libc/sysv/consts/_MINSIGSTKSZ.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon limits,_MINSIGSTKSZ,2048,6144,8192,32768,6656,14336,8192,2048
+.syscon limits,_MINSIGSTKSZ,2048,2048,32768,32768,4096,12288,8192,2048
diff --git a/libc/sysv/consts/_SIGSTKSZ.S b/libc/sysv/consts/_SIGSTKSZ.S
index 8484e6596..6347f2877 100644
--- a/libc/sysv/consts/_SIGSTKSZ.S
+++ b/libc/sysv/consts/_SIGSTKSZ.S
@@ -1,2 +1,2 @@
 #include "libc/sysv/consts/syscon.internal.h"
-.syscon limits,_SIGSTKSZ,10240,10240,131072,131072,36864,28672,40960,10240
+.syscon limits,_SIGSTKSZ,8192,2048,131072,131072,36864,28672,40960,8192
diff --git a/libc/sysv/consts/clock.h b/libc/sysv/consts/clock.h
index eefa79217..4b6e7b193 100644
--- a/libc/sysv/consts/clock.h
+++ b/libc/sysv/consts/clock.h
@@ -2,26 +2,31 @@
 #define COSMOPOLITAN_LIBC_SYSV_CONSTS_CLOCK_H_
 COSMOPOLITAN_C_START_
 
-extern int CLOCK_REALTIME_COARSE;
-extern const int CLOCK_MONOTONIC;
-extern int CLOCK_MONOTONIC_RAW;
-extern int CLOCK_MONOTONIC_COARSE;
-extern const int CLOCK_THREAD_CPUTIME_ID;
-extern const int CLOCK_PROCESS_CPUTIME_ID;
 extern const int CLOCK_BOOTTIME;
+extern const int CLOCK_BOOTTIME_ALARM;
+extern const int CLOCK_MONOTONIC;
+extern const int CLOCK_MONOTONIC_COARSE;
+extern const int CLOCK_MONOTONIC_FAST;
+extern const int CLOCK_MONOTONIC_PRECISE;
+extern const int CLOCK_MONOTONIC_RAW;
+extern const int CLOCK_PROCESS_CPUTIME_ID;
+extern const int CLOCK_PROF;
+extern const int CLOCK_REALTIME_ALARM;
+extern const int CLOCK_REALTIME_COARSE;
+extern const int CLOCK_REALTIME_FAST;
+extern const int CLOCK_REALTIME_PRECISE;
+extern const int CLOCK_SECOND;
+extern const int CLOCK_TAI;
+extern const int CLOCK_THREAD_CPUTIME_ID;
+extern const int CLOCK_UPTIME;
+extern const int CLOCK_UPTIME_FAST;
+extern const int CLOCK_UPTIME_PRECISE;
 
 COSMOPOLITAN_C_END_
 
-#define CLOCK_REALTIME        0
-#define CLOCK_REALTIME_COARSE CLOCK_REALTIME_COARSE
-
-#define CLOCK_MONOTONIC        CLOCK_MONOTONIC
-#define CLOCK_MONOTONIC_RAW    CLOCK_MONOTONIC_RAW
-#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_COARSE
-
-#define CLOCK_THREAD_CPUTIME_ID  CLOCK_THREAD_CPUTIME_ID
+#define CLOCK_REALTIME           0
+#define CLOCK_MONOTONIC          CLOCK_MONOTONIC
 #define CLOCK_PROCESS_CPUTIME_ID CLOCK_PROCESS_CPUTIME_ID
-
-#define CLOCK_BOOTTIME CLOCK_BOOTTIME
+#define CLOCK_THREAD_CPUTIME_ID  CLOCK_THREAD_CPUTIME_ID
 
 #endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_CLOCK_H_ */
diff --git a/libc/sysv/consts/epoll.h b/libc/sysv/consts/epoll.h
new file mode 100644
index 000000000..a2d11e643
--- /dev/null
+++ b/libc/sysv/consts/epoll.h
@@ -0,0 +1,31 @@
+#ifndef COSMOPOLITAN_LIBC_SYSV_CONSTS_EPOLL_H_
+#define COSMOPOLITAN_LIBC_SYSV_CONSTS_EPOLL_H_
+#include "libc/sysv/consts/o.h"
+
+#define EPOLL_CTL_ADD 1
+#define EPOLL_CTL_DEL 2
+#define EPOLL_CTL_MOD 3
+
+#define EPOLLIN        1
+#define EPOLLPRI       2
+#define EPOLLOUT       4
+#define EPOLLERR       8
+#define EPOLLHUP       0x10
+#define EPOLLRDNORM    0x40
+#define EPOLLRDBAND    0x80
+#define EPOLLWRNORM    0x0100
+#define EPOLLWRBAND    0x0200
+#define EPOLLMSG       0x0400
+#define EPOLLRDHUP     0x2000
+#define EPOLLEXCLUSIVE 0x10000000
+#define EPOLLWAKEUP    0x20000000
+#define EPOLLONESHOT   0x40000000
+#define EPOLLET        0x80000000
+
+COSMOPOLITAN_C_START_
+
+extern const int EPOLL_CLOEXEC;
+#define EPOLL_CLOEXEC O_CLOEXEC
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_EPOLL_H_ */
diff --git a/libc/sysv/consts/icmp6.h b/libc/sysv/consts/icmp6.h
index 56a63774e..43ce452b8 100644
--- a/libc/sysv/consts/icmp6.h
+++ b/libc/sysv/consts/icmp6.h
@@ -1,5 +1,37 @@
 #ifndef COSMOPOLITAN_LIBC_SYSV_CONSTS_ICMP6_H_
 #define COSMOPOLITAN_LIBC_SYSV_CONSTS_ICMP6_H_
+
+#define ICMP6_DST_UNREACH                ICMP6_DST_UNREACH
+#define ICMP6_DST_UNREACH_ADDR           ICMP6_DST_UNREACH_ADDR
+#define ICMP6_DST_UNREACH_ADMIN          ICMP6_DST_UNREACH_ADMIN
+#define ICMP6_DST_UNREACH_BEYONDSCOPE    ICMP6_DST_UNREACH_BEYONDSCOPE
+#define ICMP6_DST_UNREACH_NOPORT         ICMP6_DST_UNREACH_NOPORT
+#define ICMP6_DST_UNREACH_NOROUTE        ICMP6_DST_UNREACH_NOROUTE
+#define ICMP6_ECHO_REPLY                 ICMP6_ECHO_REPLY
+#define ICMP6_ECHO_REQUEST               ICMP6_ECHO_REQUEST
+#define ICMP6_FILTER                     ICMP6_FILTER
+#define ICMP6_INFOMSG_MASK               ICMP6_INFOMSG_MASK
+#define ICMP6_PACKET_TOO_BIG             ICMP6_PACKET_TOO_BIG
+#define ICMP6_PARAMPROB_HEADER           ICMP6_PARAMPROB_HEADER
+#define ICMP6_PARAMPROB_NEXTHEADER       ICMP6_PARAMPROB_NEXTHEADER
+#define ICMP6_PARAMPROB_OPTION           ICMP6_PARAMPROB_OPTION
+#define ICMP6_PARAM_PROB                 ICMP6_PARAM_PROB
+#define ICMP6_ROUTER_RENUMBERING         ICMP6_ROUTER_RENUMBERING
+#define ICMP6_RR_FLAGS_FORCEAPPLY        ICMP6_RR_FLAGS_FORCEAPPLY
+#define ICMP6_RR_FLAGS_PREVDONE          ICMP6_RR_FLAGS_PREVDONE
+#define ICMP6_RR_FLAGS_REQRESULT         ICMP6_RR_FLAGS_REQRESULT
+#define ICMP6_RR_FLAGS_SPECSITE          ICMP6_RR_FLAGS_SPECSITE
+#define ICMP6_RR_FLAGS_TEST              ICMP6_RR_FLAGS_TEST
+#define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME
+#define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME
+#define ICMP6_RR_PCOUSE_RAFLAGS_AUTO     ICMP6_RR_PCOUSE_RAFLAGS_AUTO
+#define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK   ICMP6_RR_PCOUSE_RAFLAGS_ONLINK
+#define ICMP6_RR_RESULT_FLAGS_FORBIDDEN  ICMP6_RR_RESULT_FLAGS_FORBIDDEN
+#define ICMP6_RR_RESULT_FLAGS_OOB        ICMP6_RR_RESULT_FLAGS_OOB
+#define ICMP6_TIME_EXCEEDED              ICMP6_TIME_EXCEEDED
+#define ICMP6_TIME_EXCEED_REASSEMBLY     ICMP6_TIME_EXCEED_REASSEMBLY
+#define ICMP6_TIME_EXCEED_TRANSIT        ICMP6_TIME_EXCEED_TRANSIT
+
 COSMOPOLITAN_C_START_
 
 extern const uint8_t ICMP6_DST_UNREACH;
diff --git a/libc/sysv/consts/ip.h b/libc/sysv/consts/ip.h
index 68c4d8b3f..3d3c82b08 100644
--- a/libc/sysv/consts/ip.h
+++ b/libc/sysv/consts/ip.h
@@ -2,34 +2,105 @@
 #define COSMOPOLITAN_LIBC_SYSV_CONSTS_IP_H_
 COSMOPOLITAN_C_START_
 
-extern const int IP_TOS;
-extern const int IP_TTL;
-extern const int IP_MTU;
-extern const int IP_HDRINCL;
-extern const int IP_OPTIONS;
-extern const int IP_RECVTTL;
 extern const int IP_ADD_MEMBERSHIP;
+extern const int IP_ADD_SOURCE_MEMBERSHIP;
+extern const int IP_BIND_ADDRESS_NO_PORT;
+extern const int IP_BLOCK_SOURCE;
+extern const int IP_CHECKSUM;
+extern const int IP_DEFAULT_MULTICAST_LOOP;
+extern const int IP_DEFAULT_MULTICAST_TTL;
 extern const int IP_DROP_MEMBERSHIP;
+extern const int IP_DROP_SOURCE_MEMBERSHIP;
+extern const int IP_FREEBIND;
+extern const int IP_HDRINCL;
+extern const int IP_IPSEC_POLICY;
+extern const int IP_MAX_MEMBERSHIPS;
+extern const int IP_MINTTL;
+extern const int IP_MSFILTER;
+extern const int IP_MTU;
+extern const int IP_MTU_DISCOVER;
+extern const int IP_MULTICAST_ALL;
 extern const int IP_MULTICAST_IF;
 extern const int IP_MULTICAST_LOOP;
 extern const int IP_MULTICAST_TTL;
+extern const int IP_NODEFRAG;
+extern const int IP_OPTIONS;
+extern const int IP_ORIGDSTADDR;
+extern const int IP_PASSSEC;
 extern const int IP_PKTINFO;
+extern const int IP_PKTOPTIONS;
+extern const int IP_PMTUDISC;
+extern const int IP_PMTUDISC_DO;
+extern const int IP_PMTUDISC_DONT;
+extern const int IP_PMTUDISC_INTERFACE;
+extern const int IP_PMTUDISC_OMIT;
+extern const int IP_PMTUDISC_PROBE;
+extern const int IP_PMTUDISC_WANT;
+extern const int IP_RECVDSTADDR;
+extern const int IP_RECVERR;
+extern const int IP_RECVOPTS;
+extern const int IP_RECVORIGDSTADDR;
+extern const int IP_RECVRETOPTS;
 extern const int IP_RECVTOS;
+extern const int IP_RECVTTL;
+extern const int IP_RETOPTS;
+extern const int IP_ROUTER_ALERT;
+extern const int IP_TOS;
+extern const int IP_TRANSPARENT;
+extern const int IP_TTL;
+extern const int IP_UNBLOCK_SOURCE;
+extern const int IP_UNICAST_IF;
+extern const int IP_XFRM_POLICY;
 
-#define IP_TOS             IP_TOS
-#define IP_TTL             IP_TTL
-#define IP_MTU             IP_MTU
-#define IP_HDRINCL         IP_HDRINCL
-#define IP_OPTIONS         IP_OPTIONS
-#define IP_RECVTTL         IP_RECVTTL
-#define IP_ADD_MEMBERSHIP  IP_ADD_MEMBERSHIP
-#define IP_DROP_MEMBERSHIP IP_DROP_MEMBERSHIP
-#define IP_MULTICAST_IF    IP_MULTICAST_IF
-#define IP_MULTICAST_LOOP  IP_MULTICAST_LOOP
-#define IP_MULTICAST_TTL   IP_MULTICAST_TTL
-
-#define IP_DEFAULT_MULTICAST_TTL  1
-#define IP_DEFAULT_MULTICAST_LOOP 1
+#define IP_ADD_MEMBERSHIP         IP_ADD_MEMBERSHIP
+#define IP_ADD_SOURCE_MEMBERSHIP  IP_ADD_SOURCE_MEMBERSHIP
+#define IP_BIND_ADDRESS_NO_PORT   IP_BIND_ADDRESS_NO_PORT
+#define IP_BLOCK_SOURCE           IP_BLOCK_SOURCE
+#define IP_CHECKSUM               IP_CHECKSUM
+#define IP_DEFAULT_MULTICAST_LOOP IP_DEFAULT_MULTICAST_LOOP
+#define IP_DEFAULT_MULTICAST_TTL  IP_DEFAULT_MULTICAST_TTL
+#define IP_DROP_MEMBERSHIP        IP_DROP_MEMBERSHIP
+#define IP_DROP_SOURCE_MEMBERSHIP IP_DROP_SOURCE_MEMBERSHIP
+#define IP_FREEBIND               IP_FREEBIND
+#define IP_HDRINCL                IP_HDRINCL
+#define IP_IPSEC_POLICY           IP_IPSEC_POLICY
+#define IP_MAX_MEMBERSHIPS        IP_MAX_MEMBERSHIPS
+#define IP_MINTTL                 IP_MINTTL
+#define IP_MSFILTER               IP_MSFILTER
+#define IP_MTU                    IP_MTU
+#define IP_MTU_DISCOVER           IP_MTU_DISCOVER
+#define IP_MULTICAST_ALL          IP_MULTICAST_ALL
+#define IP_MULTICAST_IF           IP_MULTICAST_IF
+#define IP_MULTICAST_LOOP         IP_MULTICAST_LOOP
+#define IP_MULTICAST_TTL          IP_MULTICAST_TTL
+#define IP_NODEFRAG               IP_NODEFRAG
+#define IP_OPTIONS                IP_OPTIONS
+#define IP_ORIGDSTADDR            IP_ORIGDSTADDR
+#define IP_PASSSEC                IP_PASSSEC
+#define IP_PKTINFO                IP_PKTINFO
+#define IP_PKTOPTIONS             IP_PKTOPTIONS
+#define IP_PMTUDISC               IP_PMTUDISC
+#define IP_PMTUDISC_DO            IP_PMTUDISC_DO
+#define IP_PMTUDISC_DONT          IP_PMTUDISC_DONT
+#define IP_PMTUDISC_INTERFACE     IP_PMTUDISC_INTERFACE
+#define IP_PMTUDISC_OMIT          IP_PMTUDISC_OMIT
+#define IP_PMTUDISC_PROBE         IP_PMTUDISC_PROBE
+#define IP_PMTUDISC_WANT          IP_PMTUDISC_WANT
+#define IP_RECVDSTADDR            IP_RECVDSTADDR
+#define IP_RECVERR                IP_RECVERR
+#define IP_RECVOPTS               IP_RECVOPTS
+#define IP_RECVORIGDSTADDR        IP_RECVORIGDSTADDR
+#define IP_RECVRETOPTS            IP_RECVRETOPTS
+#define IP_RECVTOS                IP_RECVTOS
+#define IP_RECVTTL                IP_RECVTTL
+#define IP_RETOPTS                IP_RETOPTS
+#define IP_ROUTER_ALERT           IP_ROUTER_ALERT
+#define IP_TOS                    IP_TOS
+#define IP_TRANSPARENT            IP_TRANSPARENT
+#define IP_TTL                    IP_TTL
+#define IP_UNBLOCK_SOURCE         IP_UNBLOCK_SOURCE
+#define IP_UNICAST_IF             IP_UNICAST_IF
+#define IP_XFRM_POLICY            IP_XFRM_POLICY
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_IP_H_ */
diff --git a/libc/sysv/consts/ipv6.h b/libc/sysv/consts/ipv6.h
index febc3389b..4e5c5471c 100644
--- a/libc/sysv/consts/ipv6.h
+++ b/libc/sysv/consts/ipv6.h
@@ -2,39 +2,125 @@
 #define COSMOPOLITAN_LIBC_SYSV_CONSTS_IPV6_H_
 COSMOPOLITAN_C_START_
 
-extern const int IPV6_V6ONLY;
+extern const int IPV6_2292DSTOPTS;
+extern const int IPV6_2292HOPLIMIT;
+extern const int IPV6_2292HOPOPTS;
+extern const int IPV6_2292PKTINFO;
+extern const int IPV6_2292PKTOPTIONS;
+extern const int IPV6_2292RTHDR;
+extern const int IPV6_ADDRFORM;
+extern const int IPV6_ADD_MEMBERSHIP;
+extern const int IPV6_AUTHHDR;
+extern const int IPV6_AUTOFLOWLABEL;
 extern const int IPV6_CHECKSUM;
+extern const int IPV6_DONTFRAG;
+extern const int IPV6_DROP_MEMBERSHIP;
+extern const int IPV6_DSTOPTS;
+extern const int IPV6_HDRINCL;
+extern const int IPV6_HOPLIMIT;
+extern const int IPV6_HOPOPTS;
+extern const int IPV6_IPSEC_POLICY;
+extern const int IPV6_JOIN_ANYCAST;
 extern const int IPV6_JOIN_GROUP;
+extern const int IPV6_LEAVE_ANYCAST;
 extern const int IPV6_LEAVE_GROUP;
+extern const int IPV6_MINHOPCOUNT;
+extern const int IPV6_MTU;
+extern const int IPV6_MTU_DISCOVER;
 extern const int IPV6_MULTICAST_HOPS;
 extern const int IPV6_MULTICAST_IF;
 extern const int IPV6_MULTICAST_LOOP;
-extern const int IPV6_UNICAST_HOPS;
-extern const int IPV6_RECVTCLASS;
-extern const int IPV6_TCLASS;
-extern const int IPV6_DONTFRAG;
-extern const int IPV6_HOPLIMIT;
-extern const int IPV6_HOPOPTS;
+extern const int IPV6_NEXTHOP;
+extern const int IPV6_ORIGDSTADDR;
+extern const int IPV6_PATHMTU;
 extern const int IPV6_PKTINFO;
+extern const int IPV6_PMTUDISC_DO;
+extern const int IPV6_PMTUDISC_DONT;
+extern const int IPV6_PMTUDISC_INTERFACE;
+extern const int IPV6_PMTUDISC_OMIT;
+extern const int IPV6_PMTUDISC_PROBE;
+extern const int IPV6_PMTUDISC_WANT;
+extern const int IPV6_RECVDSTOPTS;
+extern const int IPV6_RECVERR;
+extern const int IPV6_RECVHOPLIMIT;
+extern const int IPV6_RECVHOPOPTS;
+extern const int IPV6_RECVORIGDSTADDR;
+extern const int IPV6_RECVPATHMTU;
+extern const int IPV6_RECVPKTINFO;
 extern const int IPV6_RECVRTHDR;
+extern const int IPV6_RECVTCLASS;
+extern const int IPV6_ROUTER_ALERT;
 extern const int IPV6_RTHDR;
+extern const int IPV6_RTHDRDSTOPTS;
+extern const int IPV6_RTHDR_LOOSE;
+extern const int IPV6_RTHDR_STRICT;
+extern const int IPV6_RTHDR_TYPE_0;
+extern const int IPV6_RXDSTOPTS;
+extern const int IPV6_RXHOPOPTS;
+extern const int IPV6_TCLASS;
+extern const int IPV6_UNICAST_HOPS;
+extern const int IPV6_V6ONLY;
+extern const int IPV6_XFRM_POLICY;
 
-#define IPV6_V6ONLY         IPV6_V6ONLY
-#define IPV6_CHECKSUM       IPV6_CHECKSUM
-#define IPV6_JOIN_GROUP     IPV6_JOIN_GROUP
-#define IPV6_LEAVE_GROUP    IPV6_LEAVE_GROUP
-#define IPV6_MULTICAST_HOPS IPV6_MULTICAST_HOPS
-#define IPV6_MULTICAST_IF   IPV6_MULTICAST_IF
-#define IPV6_MULTICAST_LOOP IPV6_MULTICAST_LOOP
-#define IPV6_UNICAST_HOPS   IPV6_UNICAST_HOPS
-#define IPV6_RECVTCLASS     IPV6_RECVTCLASS
-#define IPV6_TCLASS         IPV6_TCLASS
-#define IPV6_DONTFRAG       IPV6_DONTFRAG
-#define IPV6_HOPLIMIT       IPV6_HOPLIMIT
-#define IPV6_HOPOPTS        IPV6_HOPOPTS
-#define IPV6_PKTINFO        IPV6_PKTINFO
-#define IPV6_RECVRTHDR      IPV6_RECVRTHDR
-#define IPV6_RTHDR          IPV6_RTHDR
+#define IPV6_2292DSTOPTS        IPV6_2292DSTOPTS
+#define IPV6_2292HOPLIMIT       IPV6_2292HOPLIMIT
+#define IPV6_2292HOPOPTS        IPV6_2292HOPOPTS
+#define IPV6_2292PKTINFO        IPV6_2292PKTINFO
+#define IPV6_2292PKTOPTIONS     IPV6_2292PKTOPTIONS
+#define IPV6_2292RTHDR          IPV6_2292RTHDR
+#define IPV6_ADDRFORM           IPV6_ADDRFORM
+#define IPV6_ADD_MEMBERSHIP     IPV6_ADD_MEMBERSHIP
+#define IPV6_AUTHHDR            IPV6_AUTHHDR
+#define IPV6_AUTOFLOWLABEL      IPV6_AUTOFLOWLABEL
+#define IPV6_CHECKSUM           IPV6_CHECKSUM
+#define IPV6_DONTFRAG           IPV6_DONTFRAG
+#define IPV6_DROP_MEMBERSHIP    IPV6_DROP_MEMBERSHIP
+#define IPV6_DSTOPTS            IPV6_DSTOPTS
+#define IPV6_HDRINCL            IPV6_HDRINCL
+#define IPV6_HOPLIMIT           IPV6_HOPLIMIT
+#define IPV6_HOPOPTS            IPV6_HOPOPTS
+#define IPV6_IPSEC_POLICY       IPV6_IPSEC_POLICY
+#define IPV6_JOIN_ANYCAST       IPV6_JOIN_ANYCAST
+#define IPV6_JOIN_GROUP         IPV6_JOIN_GROUP
+#define IPV6_LEAVE_ANYCAST      IPV6_LEAVE_ANYCAST
+#define IPV6_LEAVE_GROUP        IPV6_LEAVE_GROUP
+#define IPV6_MINHOPCOUNT        IPV6_MINHOPCOUNT
+#define IPV6_MTU                IPV6_MTU
+#define IPV6_MTU_DISCOVER       IPV6_MTU_DISCOVER
+#define IPV6_MULTICAST_HOPS     IPV6_MULTICAST_HOPS
+#define IPV6_MULTICAST_IF       IPV6_MULTICAST_IF
+#define IPV6_MULTICAST_LOOP     IPV6_MULTICAST_LOOP
+#define IPV6_NEXTHOP            IPV6_NEXTHOP
+#define IPV6_ORIGDSTADDR        IPV6_ORIGDSTADDR
+#define IPV6_PATHMTU            IPV6_PATHMTU
+#define IPV6_PKTINFO            IPV6_PKTINFO
+#define IPV6_PMTUDISC_DO        IPV6_PMTUDISC_DO
+#define IPV6_PMTUDISC_DONT      IPV6_PMTUDISC_DONT
+#define IPV6_PMTUDISC_INTERFACE IPV6_PMTUDISC_INTERFACE
+#define IPV6_PMTUDISC_OMIT      IPV6_PMTUDISC_OMIT
+#define IPV6_PMTUDISC_PROBE     IPV6_PMTUDISC_PROBE
+#define IPV6_PMTUDISC_WANT      IPV6_PMTUDISC_WANT
+#define IPV6_RECVDSTOPTS        IPV6_RECVDSTOPTS
+#define IPV6_RECVERR            IPV6_RECVERR
+#define IPV6_RECVHOPLIMIT       IPV6_RECVHOPLIMIT
+#define IPV6_RECVHOPOPTS        IPV6_RECVHOPOPTS
+#define IPV6_RECVORIGDSTADDR    IPV6_RECVORIGDSTADDR
+#define IPV6_RECVPATHMTU        IPV6_RECVPATHMTU
+#define IPV6_RECVPKTINFO        IPV6_RECVPKTINFO
+#define IPV6_RECVRTHDR          IPV6_RECVRTHDR
+#define IPV6_RECVTCLASS         IPV6_RECVTCLASS
+#define IPV6_ROUTER_ALERT       IPV6_ROUTER_ALERT
+#define IPV6_RTHDR              IPV6_RTHDR
+#define IPV6_RTHDRDSTOPTS       IPV6_RTHDRDSTOPTS
+#define IPV6_RTHDR_LOOSE        IPV6_RTHDR_LOOSE
+#define IPV6_RTHDR_STRICT       IPV6_RTHDR_STRICT
+#define IPV6_RTHDR_TYPE_0       IPV6_RTHDR_TYPE_0
+#define IPV6_RXDSTOPTS          IPV6_RXDSTOPTS
+#define IPV6_RXHOPOPTS          IPV6_RXHOPOPTS
+#define IPV6_TCLASS             IPV6_TCLASS
+#define IPV6_UNICAST_HOPS       IPV6_UNICAST_HOPS
+#define IPV6_V6ONLY             IPV6_V6ONLY
+#define IPV6_XFRM_POLICY        IPV6_XFRM_POLICY
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_IPV6_H_ */
diff --git a/libc/sysv/consts/map.h b/libc/sysv/consts/map.h
index 20ed8bf51..ae719ea0b 100644
--- a/libc/sysv/consts/map.h
+++ b/libc/sysv/consts/map.h
@@ -19,6 +19,7 @@ extern const int MAP_JIT;
 extern const int MAP_LOCKED;
 extern const int MAP_NOCACHE;
 extern const int MAP_NOEXTEND;
+extern const int MAP_NOFORK;
 extern const int MAP_NONBLOCK;
 extern const int MAP_NORESERVE;
 extern const int MAP_NOSYNC;
diff --git a/libc/sysv/consts/msg.h b/libc/sysv/consts/msg.h
index 3694fc6f6..74deaca26 100644
--- a/libc/sysv/consts/msg.h
+++ b/libc/sysv/consts/msg.h
@@ -2,18 +2,41 @@
 #define COSMOPOLITAN_LIBC_SYSV_CONSTS_MSG_H_
 COSMOPOLITAN_C_START_
 
-extern const int MSG_DONTWAIT;
-extern const int MSG_WAITALL;
-extern const int MSG_NOSIGNAL;
-extern const int MSG_TRUNC;
+extern const int MSG_BATCH;
+extern const int MSG_BCAST;
+extern const int MSG_CMSG_CLOEXEC;
+extern const int MSG_CONFIRM;
 extern const int MSG_CTRUNC;
-extern const int MSG_FASTOPEN; /* linux only */
+extern const int MSG_DONTROUTE;
+extern const int MSG_DONTWAIT;
+extern const int MSG_EOF;
+extern const int MSG_EOR;
+extern const int MSG_ERRQUEUE;
+extern const int MSG_EXCEPT;
+extern const int MSG_FASTOPEN;
+extern const int MSG_FIN;
+extern const int MSG_INFO;
+extern const int MSG_MCAST;
+extern const int MSG_MORE;
+extern const int MSG_NOERROR;
+extern const int MSG_NOSIGNAL;
+extern const int MSG_NOTIFICATION;
+extern const int MSG_OOB;
+extern const int MSG_PARITY_ERROR;
+extern const int MSG_PEEK;
+extern const int MSG_PROXY;
+extern const int MSG_RST;
+extern const int MSG_STAT;
+extern const int MSG_SYN;
+extern const int MSG_TRUNC;
+extern const int MSG_WAITALL;
+extern const int MSG_WAITFORONE;
 
 #define MSG_OOB       1
 #define MSG_PEEK      2
 #define MSG_DONTROUTE 4
 #define MSG_DONTWAIT  MSG_DONTWAIT
-#define MSG_NOSIGNAL  MSG_NOSIGNAL
+#define MSG_FASTOPEN  MSG_FASTOPEN
 #define MSG_WAITALL   MSG_WAITALL
 #define MSG_TRUNC     MSG_TRUNC
 #define MSG_CTRUNC    MSG_CTRUNC
diff --git a/libc/sysv/consts/sig.h b/libc/sysv/consts/sig.h
index d0c6808d2..3a66c711b 100644
--- a/libc/sysv/consts/sig.h
+++ b/libc/sysv/consts/sig.h
@@ -2,25 +2,45 @@
 #define COSMOPOLITAN_LIBC_SYSV_CONSTS_SIG_H_
 COSMOPOLITAN_C_START_
 
+extern const int SIGABRT;
+extern const int SIGALRM;
 extern const int SIGBUS;
 extern const int SIGTHR;
 extern const int SIGCHLD;
 extern const int SIGCONT;
 extern const int SIGEMT;
+extern const int SIGFPE;
+extern const int SIGHUP;
+extern const int SIGILL;
 extern const int SIGINFO;
+extern const int SIGINT;
 extern const int SIGIO;
+extern const int SIGIOT;
+extern const int SIGKILL;
+extern const int SIGPIPE;
 extern const int SIGPOLL;
+extern const int SIGPROF;
 extern const int SIGPWR;
+extern const int SIGQUIT;
 extern const int SIGRTMAX;
 extern const int SIGRTMIN;
+extern const int SIGSEGV;
 extern const int SIGSTKFLT;
 extern const int SIGSTOP;
 extern const int SIGSYS;
+extern const int SIGTERM;
+extern const int SIGTRAP;
 extern const int SIGTSTP;
+extern const int SIGTTIN;
+extern const int SIGTTOU;
 extern const int SIGUNUSED;
 extern const int SIGURG;
 extern const int SIGUSR1;
 extern const int SIGUSR2;
+extern const int SIGVTALRM;
+extern const int SIGWINCH;
+extern const int SIGXCPU;
+extern const int SIGXFSZ;
 
 extern const int SIG_BLOCK;
 extern const int SIG_SETMASK;
diff --git a/libc/sysv/consts/ss.h b/libc/sysv/consts/ss.h
index 626a696d6..ef83ec6ec 100644
--- a/libc/sysv/consts/ss.h
+++ b/libc/sysv/consts/ss.h
@@ -8,7 +8,7 @@ extern const int _MINSIGSTKSZ;
 
 COSMOPOLITAN_C_END_
 
-#define SIGSTKSZ    32768 /* just itself believed to be safe */
+#define SIGSTKSZ    32768
 #define MINSIGSTKSZ 32768 /* xnu defines the highest minimum */
 #define SS_ONSTACK  1
 #define SS_DISABLE  SS_DISABLE
diff --git a/libc/sysv/consts/syscon.internal.h b/libc/sysv/consts/syscon.internal.h
index 396f0b671..33b97c9a0 100644
--- a/libc/sysv/consts/syscon.internal.h
+++ b/libc/sysv/consts/syscon.internal.h
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 // clang-format off
 
 #ifdef __x86_64__
diff --git a/libc/sysv/errfun.S b/libc/sysv/errfun.S
index e1ef6908a..debb98d2f 100644
--- a/libc/sysv/errfun.S
+++ b/libc/sysv/errfun.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 __errfun:
diff --git a/libc/sysv/errfuns/e2big.S b/libc/sysv/errfuns/e2big.S
index 7b6fd6a20..32ac86542 100644
--- a/libc/sysv/errfuns/e2big.S
+++ b/libc/sysv/errfuns/e2big.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eacces.S b/libc/sysv/errfuns/eacces.S
index 5657ed497..2ba289170 100644
--- a/libc/sysv/errfuns/eacces.S
+++ b/libc/sysv/errfuns/eacces.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eaddrinuse.S b/libc/sysv/errfuns/eaddrinuse.S
index 77dee3e32..4d8c41d34 100644
--- a/libc/sysv/errfuns/eaddrinuse.S
+++ b/libc/sysv/errfuns/eaddrinuse.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eaddrnotavail.S b/libc/sysv/errfuns/eaddrnotavail.S
index 28a3a3998..101b936a6 100644
--- a/libc/sysv/errfuns/eaddrnotavail.S
+++ b/libc/sysv/errfuns/eaddrnotavail.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eadv.S b/libc/sysv/errfuns/eadv.S
index acf8e5f86..aeadda509 100644
--- a/libc/sysv/errfuns/eadv.S
+++ b/libc/sysv/errfuns/eadv.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eafnosupport.S b/libc/sysv/errfuns/eafnosupport.S
index 4e4787cc8..0f4fa0b5a 100644
--- a/libc/sysv/errfuns/eafnosupport.S
+++ b/libc/sysv/errfuns/eafnosupport.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eagain.S b/libc/sysv/errfuns/eagain.S
index e19df5a4f..5124a43f1 100644
--- a/libc/sysv/errfuns/eagain.S
+++ b/libc/sysv/errfuns/eagain.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ealready.S b/libc/sysv/errfuns/ealready.S
index 4f39c44d8..e8b32b0eb 100644
--- a/libc/sysv/errfuns/ealready.S
+++ b/libc/sysv/errfuns/ealready.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebade.S b/libc/sysv/errfuns/ebade.S
index 0eb25fca3..8d0a0c19f 100644
--- a/libc/sysv/errfuns/ebade.S
+++ b/libc/sysv/errfuns/ebade.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebadf.S b/libc/sysv/errfuns/ebadf.S
index 34318ce18..693fe7c46 100644
--- a/libc/sysv/errfuns/ebadf.S
+++ b/libc/sysv/errfuns/ebadf.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebadfd.S b/libc/sysv/errfuns/ebadfd.S
index a2900bc3d..fb4fc502d 100644
--- a/libc/sysv/errfuns/ebadfd.S
+++ b/libc/sysv/errfuns/ebadfd.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebadmsg.S b/libc/sysv/errfuns/ebadmsg.S
index 5781527bd..e650ed296 100644
--- a/libc/sysv/errfuns/ebadmsg.S
+++ b/libc/sysv/errfuns/ebadmsg.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebadr.S b/libc/sysv/errfuns/ebadr.S
index f80316c45..e8f9ab68d 100644
--- a/libc/sysv/errfuns/ebadr.S
+++ b/libc/sysv/errfuns/ebadr.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebadrqc.S b/libc/sysv/errfuns/ebadrqc.S
index 1debb93f5..e5fd44915 100644
--- a/libc/sysv/errfuns/ebadrqc.S
+++ b/libc/sysv/errfuns/ebadrqc.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebadslt.S b/libc/sysv/errfuns/ebadslt.S
index 38a091a3e..4bc407c0e 100644
--- a/libc/sysv/errfuns/ebadslt.S
+++ b/libc/sysv/errfuns/ebadslt.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ebusy.S b/libc/sysv/errfuns/ebusy.S
index a677d254b..54a436001 100644
--- a/libc/sysv/errfuns/ebusy.S
+++ b/libc/sysv/errfuns/ebusy.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ecanceled.S b/libc/sysv/errfuns/ecanceled.S
index ac974c3f4..58007cb2d 100644
--- a/libc/sysv/errfuns/ecanceled.S
+++ b/libc/sysv/errfuns/ecanceled.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/echild.S b/libc/sysv/errfuns/echild.S
index 7733922f5..e3e79409b 100644
--- a/libc/sysv/errfuns/echild.S
+++ b/libc/sysv/errfuns/echild.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/echrng.S b/libc/sysv/errfuns/echrng.S
index 0f20dda56..0ffa5c922 100644
--- a/libc/sysv/errfuns/echrng.S
+++ b/libc/sysv/errfuns/echrng.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ecomm.S b/libc/sysv/errfuns/ecomm.S
index 4afe7326b..1a9a98286 100644
--- a/libc/sysv/errfuns/ecomm.S
+++ b/libc/sysv/errfuns/ecomm.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/econnaborted.S b/libc/sysv/errfuns/econnaborted.S
index 7f8c2737f..ec62e3e67 100644
--- a/libc/sysv/errfuns/econnaborted.S
+++ b/libc/sysv/errfuns/econnaborted.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/econnrefused.S b/libc/sysv/errfuns/econnrefused.S
index 20adf6b88..b16e4ce73 100644
--- a/libc/sysv/errfuns/econnrefused.S
+++ b/libc/sysv/errfuns/econnrefused.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/econnreset.S b/libc/sysv/errfuns/econnreset.S
index 8255f0bdc..0c8fd94f4 100644
--- a/libc/sysv/errfuns/econnreset.S
+++ b/libc/sysv/errfuns/econnreset.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/edeadlk.S b/libc/sysv/errfuns/edeadlk.S
index b812e07e9..caf4dbc2f 100644
--- a/libc/sysv/errfuns/edeadlk.S
+++ b/libc/sysv/errfuns/edeadlk.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/edestaddrreq.S b/libc/sysv/errfuns/edestaddrreq.S
index bd3b8d56f..74dd51f96 100644
--- a/libc/sysv/errfuns/edestaddrreq.S
+++ b/libc/sysv/errfuns/edestaddrreq.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/edom.S b/libc/sysv/errfuns/edom.S
index a3f40bb7c..c26449ac2 100644
--- a/libc/sysv/errfuns/edom.S
+++ b/libc/sysv/errfuns/edom.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/edotdot.S b/libc/sysv/errfuns/edotdot.S
index 6cdc94739..e1a347a3a 100644
--- a/libc/sysv/errfuns/edotdot.S
+++ b/libc/sysv/errfuns/edotdot.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/edquot.S b/libc/sysv/errfuns/edquot.S
index 2d62f6124..0e3e6fe79 100644
--- a/libc/sysv/errfuns/edquot.S
+++ b/libc/sysv/errfuns/edquot.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eexist.S b/libc/sysv/errfuns/eexist.S
index 65cc2e363..d1fb3a314 100644
--- a/libc/sysv/errfuns/eexist.S
+++ b/libc/sysv/errfuns/eexist.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/efault.S b/libc/sysv/errfuns/efault.S
index 802d9f12f..a219fff84 100644
--- a/libc/sysv/errfuns/efault.S
+++ b/libc/sysv/errfuns/efault.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/efbig.S b/libc/sysv/errfuns/efbig.S
index 32be5b137..0d614e720 100644
--- a/libc/sysv/errfuns/efbig.S
+++ b/libc/sysv/errfuns/efbig.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ehostdown.S b/libc/sysv/errfuns/ehostdown.S
index b472f021a..55a766ca9 100644
--- a/libc/sysv/errfuns/ehostdown.S
+++ b/libc/sysv/errfuns/ehostdown.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ehostunreach.S b/libc/sysv/errfuns/ehostunreach.S
index 8fa84fca1..bc5c311f0 100644
--- a/libc/sysv/errfuns/ehostunreach.S
+++ b/libc/sysv/errfuns/ehostunreach.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ehwpoison.S b/libc/sysv/errfuns/ehwpoison.S
index d7df790b1..d1d261565 100644
--- a/libc/sysv/errfuns/ehwpoison.S
+++ b/libc/sysv/errfuns/ehwpoison.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eidrm.S b/libc/sysv/errfuns/eidrm.S
index 3ba2f2bca..0c0d72542 100644
--- a/libc/sysv/errfuns/eidrm.S
+++ b/libc/sysv/errfuns/eidrm.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eilseq.S b/libc/sysv/errfuns/eilseq.S
index 25fdac2c6..d34482cea 100644
--- a/libc/sysv/errfuns/eilseq.S
+++ b/libc/sysv/errfuns/eilseq.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/einprogress.S b/libc/sysv/errfuns/einprogress.S
index db22b1bc6..9d356fd2a 100644
--- a/libc/sysv/errfuns/einprogress.S
+++ b/libc/sysv/errfuns/einprogress.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eintr.S b/libc/sysv/errfuns/eintr.S
index 96d03d28a..e6a3f88d3 100644
--- a/libc/sysv/errfuns/eintr.S
+++ b/libc/sysv/errfuns/eintr.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/einval.S b/libc/sysv/errfuns/einval.S
index 1b23f8ba1..537af95c8 100644
--- a/libc/sysv/errfuns/einval.S
+++ b/libc/sysv/errfuns/einval.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eio.S b/libc/sysv/errfuns/eio.S
index dade87a54..cdf541c95 100644
--- a/libc/sysv/errfuns/eio.S
+++ b/libc/sysv/errfuns/eio.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eisconn.S b/libc/sysv/errfuns/eisconn.S
index 36db72604..148423224 100644
--- a/libc/sysv/errfuns/eisconn.S
+++ b/libc/sysv/errfuns/eisconn.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eisdir.S b/libc/sysv/errfuns/eisdir.S
index 533071086..83e775f51 100644
--- a/libc/sysv/errfuns/eisdir.S
+++ b/libc/sysv/errfuns/eisdir.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eisnam.S b/libc/sysv/errfuns/eisnam.S
index a8b632d20..4450ca1ea 100644
--- a/libc/sysv/errfuns/eisnam.S
+++ b/libc/sysv/errfuns/eisnam.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ekeyexpired.S b/libc/sysv/errfuns/ekeyexpired.S
index 810d2078f..89f4507bc 100644
--- a/libc/sysv/errfuns/ekeyexpired.S
+++ b/libc/sysv/errfuns/ekeyexpired.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ekeyrejected.S b/libc/sysv/errfuns/ekeyrejected.S
index 874b25707..fda4019bd 100644
--- a/libc/sysv/errfuns/ekeyrejected.S
+++ b/libc/sysv/errfuns/ekeyrejected.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/ekeyrevoked.S b/libc/sysv/errfuns/ekeyrevoked.S
index fa5b4ce5c..d8e2e11c2 100644
--- a/libc/sysv/errfuns/ekeyrevoked.S
+++ b/libc/sysv/errfuns/ekeyrevoked.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/el2hlt.S b/libc/sysv/errfuns/el2hlt.S
index b8400dca8..5462d3ec9 100644
--- a/libc/sysv/errfuns/el2hlt.S
+++ b/libc/sysv/errfuns/el2hlt.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/el2nsync.S b/libc/sysv/errfuns/el2nsync.S
index bbf443613..bb249b131 100644
--- a/libc/sysv/errfuns/el2nsync.S
+++ b/libc/sysv/errfuns/el2nsync.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/el3hlt.S b/libc/sysv/errfuns/el3hlt.S
index c527a2cf4..40535b6d6 100644
--- a/libc/sysv/errfuns/el3hlt.S
+++ b/libc/sysv/errfuns/el3hlt.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/el3rst.S b/libc/sysv/errfuns/el3rst.S
index b277a12dd..036afd48e 100644
--- a/libc/sysv/errfuns/el3rst.S
+++ b/libc/sysv/errfuns/el3rst.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/elibacc.S b/libc/sysv/errfuns/elibacc.S
index ecaa787ce..b598c2e47 100644
--- a/libc/sysv/errfuns/elibacc.S
+++ b/libc/sysv/errfuns/elibacc.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/elibbad.S b/libc/sysv/errfuns/elibbad.S
index 6929d6c18..4a0f7f72b 100644
--- a/libc/sysv/errfuns/elibbad.S
+++ b/libc/sysv/errfuns/elibbad.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/elibexec.S b/libc/sysv/errfuns/elibexec.S
index 141f1acef..6b5093216 100644
--- a/libc/sysv/errfuns/elibexec.S
+++ b/libc/sysv/errfuns/elibexec.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/elibmax.S b/libc/sysv/errfuns/elibmax.S
index 5738b1ebd..73dc1d570 100644
--- a/libc/sysv/errfuns/elibmax.S
+++ b/libc/sysv/errfuns/elibmax.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/elibscn.S b/libc/sysv/errfuns/elibscn.S
index 730a3da62..fd3fbe245 100644
--- a/libc/sysv/errfuns/elibscn.S
+++ b/libc/sysv/errfuns/elibscn.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/elnrng.S b/libc/sysv/errfuns/elnrng.S
index 7db38cca6..0e0e37ed4 100644
--- a/libc/sysv/errfuns/elnrng.S
+++ b/libc/sysv/errfuns/elnrng.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eloop.S b/libc/sysv/errfuns/eloop.S
index e7edbbc92..faef0e95c 100644
--- a/libc/sysv/errfuns/eloop.S
+++ b/libc/sysv/errfuns/eloop.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/emediumtype.S b/libc/sysv/errfuns/emediumtype.S
index 0d7de3b51..448080938 100644
--- a/libc/sysv/errfuns/emediumtype.S
+++ b/libc/sysv/errfuns/emediumtype.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/emfile.S b/libc/sysv/errfuns/emfile.S
index a7f1d6e2b..6adb57077 100644
--- a/libc/sysv/errfuns/emfile.S
+++ b/libc/sysv/errfuns/emfile.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/emlink.S b/libc/sysv/errfuns/emlink.S
index e0d810f9c..8c6b7d95a 100644
--- a/libc/sysv/errfuns/emlink.S
+++ b/libc/sysv/errfuns/emlink.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/emsgsize.S b/libc/sysv/errfuns/emsgsize.S
index d2c22cb06..9870b85a1 100644
--- a/libc/sysv/errfuns/emsgsize.S
+++ b/libc/sysv/errfuns/emsgsize.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/emultihop.S b/libc/sysv/errfuns/emultihop.S
index cb54b9520..27140db9f 100644
--- a/libc/sysv/errfuns/emultihop.S
+++ b/libc/sysv/errfuns/emultihop.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enametoolong.S b/libc/sysv/errfuns/enametoolong.S
index d085fc6bf..d46f2bfa1 100644
--- a/libc/sysv/errfuns/enametoolong.S
+++ b/libc/sysv/errfuns/enametoolong.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enavail.S b/libc/sysv/errfuns/enavail.S
index a86a5b386..d7aa15d49 100644
--- a/libc/sysv/errfuns/enavail.S
+++ b/libc/sysv/errfuns/enavail.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enetdown.S b/libc/sysv/errfuns/enetdown.S
index fe4f2f908..7850a5892 100644
--- a/libc/sysv/errfuns/enetdown.S
+++ b/libc/sysv/errfuns/enetdown.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enetreset.S b/libc/sysv/errfuns/enetreset.S
index 31126c73b..fd6cf0c53 100644
--- a/libc/sysv/errfuns/enetreset.S
+++ b/libc/sysv/errfuns/enetreset.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enetunreach.S b/libc/sysv/errfuns/enetunreach.S
index ec730c637..36a6ee69e 100644
--- a/libc/sysv/errfuns/enetunreach.S
+++ b/libc/sysv/errfuns/enetunreach.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enfile.S b/libc/sysv/errfuns/enfile.S
index 5df56837a..a2b5157dd 100644
--- a/libc/sysv/errfuns/enfile.S
+++ b/libc/sysv/errfuns/enfile.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enoano.S b/libc/sysv/errfuns/enoano.S
index 81724c4b3..f3a713901 100644
--- a/libc/sysv/errfuns/enoano.S
+++ b/libc/sysv/errfuns/enoano.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enobufs.S b/libc/sysv/errfuns/enobufs.S
index 3ba3a8b9c..7936a1e1f 100644
--- a/libc/sysv/errfuns/enobufs.S
+++ b/libc/sysv/errfuns/enobufs.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enocsi.S b/libc/sysv/errfuns/enocsi.S
index c6e6fdd22..ab63ad071 100644
--- a/libc/sysv/errfuns/enocsi.S
+++ b/libc/sysv/errfuns/enocsi.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enodata.S b/libc/sysv/errfuns/enodata.S
index 4394c5f8b..e5fd06425 100644
--- a/libc/sysv/errfuns/enodata.S
+++ b/libc/sysv/errfuns/enodata.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enodev.S b/libc/sysv/errfuns/enodev.S
index da1741b1d..1be5d5846 100644
--- a/libc/sysv/errfuns/enodev.S
+++ b/libc/sysv/errfuns/enodev.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enoent.S b/libc/sysv/errfuns/enoent.S
index 4bd569d9d..3929a214d 100644
--- a/libc/sysv/errfuns/enoent.S
+++ b/libc/sysv/errfuns/enoent.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enoexec.S b/libc/sysv/errfuns/enoexec.S
index 4b6f00228..30cee1e8a 100644
--- a/libc/sysv/errfuns/enoexec.S
+++ b/libc/sysv/errfuns/enoexec.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enokey.S b/libc/sysv/errfuns/enokey.S
index 5c5c20048..399aa97c9 100644
--- a/libc/sysv/errfuns/enokey.S
+++ b/libc/sysv/errfuns/enokey.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enolck.S b/libc/sysv/errfuns/enolck.S
index 9d5b995e9..ceda33862 100644
--- a/libc/sysv/errfuns/enolck.S
+++ b/libc/sysv/errfuns/enolck.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enolink.S b/libc/sysv/errfuns/enolink.S
index e35ae060e..189d8ad7b 100644
--- a/libc/sysv/errfuns/enolink.S
+++ b/libc/sysv/errfuns/enolink.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enomedium.S b/libc/sysv/errfuns/enomedium.S
index 3bf1573fb..c7a6893ec 100644
--- a/libc/sysv/errfuns/enomedium.S
+++ b/libc/sysv/errfuns/enomedium.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enomem.S b/libc/sysv/errfuns/enomem.S
index 62a49eb7c..d9a7050c8 100644
--- a/libc/sysv/errfuns/enomem.S
+++ b/libc/sysv/errfuns/enomem.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enomsg.S b/libc/sysv/errfuns/enomsg.S
index 49d558667..0307c3ec2 100644
--- a/libc/sysv/errfuns/enomsg.S
+++ b/libc/sysv/errfuns/enomsg.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enonet.S b/libc/sysv/errfuns/enonet.S
index e84f00121..fc2e5468c 100644
--- a/libc/sysv/errfuns/enonet.S
+++ b/libc/sysv/errfuns/enonet.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enopkg.S b/libc/sysv/errfuns/enopkg.S
index 5431b0c9e..3b08e87a5 100644
--- a/libc/sysv/errfuns/enopkg.S
+++ b/libc/sysv/errfuns/enopkg.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enoprotoopt.S b/libc/sysv/errfuns/enoprotoopt.S
index 8ffec6963..80fca7bb5 100644
--- a/libc/sysv/errfuns/enoprotoopt.S
+++ b/libc/sysv/errfuns/enoprotoopt.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enospc.S b/libc/sysv/errfuns/enospc.S
index afbe6a497..69023893d 100644
--- a/libc/sysv/errfuns/enospc.S
+++ b/libc/sysv/errfuns/enospc.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enosr.S b/libc/sysv/errfuns/enosr.S
index 289c4b645..9d7119e53 100644
--- a/libc/sysv/errfuns/enosr.S
+++ b/libc/sysv/errfuns/enosr.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enostr.S b/libc/sysv/errfuns/enostr.S
index e9f64241e..6705b7155 100644
--- a/libc/sysv/errfuns/enostr.S
+++ b/libc/sysv/errfuns/enostr.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enosys.S b/libc/sysv/errfuns/enosys.S
index 4f2132726..c0f6e7c36 100644
--- a/libc/sysv/errfuns/enosys.S
+++ b/libc/sysv/errfuns/enosys.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotblk.S b/libc/sysv/errfuns/enotblk.S
index c04ba77b4..c4812cb03 100644
--- a/libc/sysv/errfuns/enotblk.S
+++ b/libc/sysv/errfuns/enotblk.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotconn.S b/libc/sysv/errfuns/enotconn.S
index 1ea6d6eca..b6c26d55c 100644
--- a/libc/sysv/errfuns/enotconn.S
+++ b/libc/sysv/errfuns/enotconn.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotdir.S b/libc/sysv/errfuns/enotdir.S
index 14103122d..7afc1762f 100644
--- a/libc/sysv/errfuns/enotdir.S
+++ b/libc/sysv/errfuns/enotdir.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotempty.S b/libc/sysv/errfuns/enotempty.S
index dd014acbc..67bbdc837 100644
--- a/libc/sysv/errfuns/enotempty.S
+++ b/libc/sysv/errfuns/enotempty.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotnam.S b/libc/sysv/errfuns/enotnam.S
index 259a7bdae..7a1094e5b 100644
--- a/libc/sysv/errfuns/enotnam.S
+++ b/libc/sysv/errfuns/enotnam.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotrecoverable.S b/libc/sysv/errfuns/enotrecoverable.S
index 85ed3a663..e50430452 100644
--- a/libc/sysv/errfuns/enotrecoverable.S
+++ b/libc/sysv/errfuns/enotrecoverable.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotsock.S b/libc/sysv/errfuns/enotsock.S
index 02e2b93ff..8f0753ce5 100644
--- a/libc/sysv/errfuns/enotsock.S
+++ b/libc/sysv/errfuns/enotsock.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotsup.S b/libc/sysv/errfuns/enotsup.S
index 01888f280..59d130623 100644
--- a/libc/sysv/errfuns/enotsup.S
+++ b/libc/sysv/errfuns/enotsup.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotty.S b/libc/sysv/errfuns/enotty.S
index e07629a98..beed92e95 100644
--- a/libc/sysv/errfuns/enotty.S
+++ b/libc/sysv/errfuns/enotty.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enotuniq.S b/libc/sysv/errfuns/enotuniq.S
index a40cf8982..57b04e9ba 100644
--- a/libc/sysv/errfuns/enotuniq.S
+++ b/libc/sysv/errfuns/enotuniq.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/enxio.S b/libc/sysv/errfuns/enxio.S
index 6e827d4c6..637241908 100644
--- a/libc/sysv/errfuns/enxio.S
+++ b/libc/sysv/errfuns/enxio.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eopnotsupp.S b/libc/sysv/errfuns/eopnotsupp.S
index 0c740d2ab..fa974a998 100644
--- a/libc/sysv/errfuns/eopnotsupp.S
+++ b/libc/sysv/errfuns/eopnotsupp.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eoverflow.S b/libc/sysv/errfuns/eoverflow.S
index 98fae88a9..81aad5b64 100644
--- a/libc/sysv/errfuns/eoverflow.S
+++ b/libc/sysv/errfuns/eoverflow.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eownerdead.S b/libc/sysv/errfuns/eownerdead.S
index 033cab8bf..5968f3807 100644
--- a/libc/sysv/errfuns/eownerdead.S
+++ b/libc/sysv/errfuns/eownerdead.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eperm.S b/libc/sysv/errfuns/eperm.S
index 54df77607..cc4fefa78 100644
--- a/libc/sysv/errfuns/eperm.S
+++ b/libc/sysv/errfuns/eperm.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/epfnosupport.S b/libc/sysv/errfuns/epfnosupport.S
index 9732440f5..73da7dd25 100644
--- a/libc/sysv/errfuns/epfnosupport.S
+++ b/libc/sysv/errfuns/epfnosupport.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/epipe.S b/libc/sysv/errfuns/epipe.S
index fcf57e4c8..5c1f01948 100644
--- a/libc/sysv/errfuns/epipe.S
+++ b/libc/sysv/errfuns/epipe.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eproto.S b/libc/sysv/errfuns/eproto.S
index f5c7e0c1f..22ceb6bd7 100644
--- a/libc/sysv/errfuns/eproto.S
+++ b/libc/sysv/errfuns/eproto.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eprotonosupport.S b/libc/sysv/errfuns/eprotonosupport.S
index 3934683f9..f8f9bcd6a 100644
--- a/libc/sysv/errfuns/eprotonosupport.S
+++ b/libc/sysv/errfuns/eprotonosupport.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eprototype.S b/libc/sysv/errfuns/eprototype.S
index 208d4989f..ceb8efbb3 100644
--- a/libc/sysv/errfuns/eprototype.S
+++ b/libc/sysv/errfuns/eprototype.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/erange.S b/libc/sysv/errfuns/erange.S
index e9e8e727c..1dc1b07e3 100644
--- a/libc/sysv/errfuns/erange.S
+++ b/libc/sysv/errfuns/erange.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eremchg.S b/libc/sysv/errfuns/eremchg.S
index 2bdcac692..201ae2688 100644
--- a/libc/sysv/errfuns/eremchg.S
+++ b/libc/sysv/errfuns/eremchg.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eremote.S b/libc/sysv/errfuns/eremote.S
index 515e7d5b6..db3ddb627 100644
--- a/libc/sysv/errfuns/eremote.S
+++ b/libc/sysv/errfuns/eremote.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eremoteio.S b/libc/sysv/errfuns/eremoteio.S
index d57d9e119..12e2a74b5 100644
--- a/libc/sysv/errfuns/eremoteio.S
+++ b/libc/sysv/errfuns/eremoteio.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/erestart.S b/libc/sysv/errfuns/erestart.S
index 67bc02b9b..30eaa1d0b 100644
--- a/libc/sysv/errfuns/erestart.S
+++ b/libc/sysv/errfuns/erestart.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/erfkill.S b/libc/sysv/errfuns/erfkill.S
index 49be57d27..141b8e85a 100644
--- a/libc/sysv/errfuns/erfkill.S
+++ b/libc/sysv/errfuns/erfkill.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/erofs.S b/libc/sysv/errfuns/erofs.S
index 6d926c890..fdab8c9a8 100644
--- a/libc/sysv/errfuns/erofs.S
+++ b/libc/sysv/errfuns/erofs.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eshutdown.S b/libc/sysv/errfuns/eshutdown.S
index dfbcce120..1f6d8cfd1 100644
--- a/libc/sysv/errfuns/eshutdown.S
+++ b/libc/sysv/errfuns/eshutdown.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/esocktnosupport.S b/libc/sysv/errfuns/esocktnosupport.S
index 6891eb9d9..6a29e9c81 100644
--- a/libc/sysv/errfuns/esocktnosupport.S
+++ b/libc/sysv/errfuns/esocktnosupport.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/espipe.S b/libc/sysv/errfuns/espipe.S
index 4b8bbd3b9..373a08844 100644
--- a/libc/sysv/errfuns/espipe.S
+++ b/libc/sysv/errfuns/espipe.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/esrch.S b/libc/sysv/errfuns/esrch.S
index d7220c067..91dc831d6 100644
--- a/libc/sysv/errfuns/esrch.S
+++ b/libc/sysv/errfuns/esrch.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/esrmnt.S b/libc/sysv/errfuns/esrmnt.S
index 3cea68263..109dc659e 100644
--- a/libc/sysv/errfuns/esrmnt.S
+++ b/libc/sysv/errfuns/esrmnt.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/estale.S b/libc/sysv/errfuns/estale.S
index 8653aa5b4..88a27e0c8 100644
--- a/libc/sysv/errfuns/estale.S
+++ b/libc/sysv/errfuns/estale.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/estrpipe.S b/libc/sysv/errfuns/estrpipe.S
index 037d0644d..52a5c083e 100644
--- a/libc/sysv/errfuns/estrpipe.S
+++ b/libc/sysv/errfuns/estrpipe.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/etime.S b/libc/sysv/errfuns/etime.S
index b3f273c25..a0071f600 100644
--- a/libc/sysv/errfuns/etime.S
+++ b/libc/sysv/errfuns/etime.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/etimedout.S b/libc/sysv/errfuns/etimedout.S
index 878b9a7e9..7787c672a 100644
--- a/libc/sysv/errfuns/etimedout.S
+++ b/libc/sysv/errfuns/etimedout.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/etoomanyrefs.S b/libc/sysv/errfuns/etoomanyrefs.S
index 7f22e9ef6..5e2fe7174 100644
--- a/libc/sysv/errfuns/etoomanyrefs.S
+++ b/libc/sysv/errfuns/etoomanyrefs.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/etxtbsy.S b/libc/sysv/errfuns/etxtbsy.S
index 8cf8c93fb..672146f58 100644
--- a/libc/sysv/errfuns/etxtbsy.S
+++ b/libc/sysv/errfuns/etxtbsy.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/euclean.S b/libc/sysv/errfuns/euclean.S
index 80aeddd80..891e5f866 100644
--- a/libc/sysv/errfuns/euclean.S
+++ b/libc/sysv/errfuns/euclean.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eunatch.S b/libc/sysv/errfuns/eunatch.S
index fc675e49d..655cfc37d 100644
--- a/libc/sysv/errfuns/eunatch.S
+++ b/libc/sysv/errfuns/eunatch.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/eusers.S b/libc/sysv/errfuns/eusers.S
index b6ffd4571..de75227d8 100644
--- a/libc/sysv/errfuns/eusers.S
+++ b/libc/sysv/errfuns/eusers.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/exdev.S b/libc/sysv/errfuns/exdev.S
index 76fb8f71f..679c3a67b 100644
--- a/libc/sysv/errfuns/exdev.S
+++ b/libc/sysv/errfuns/exdev.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errfuns/exfull.S b/libc/sysv/errfuns/exfull.S
index 2d8f37848..fdc83132a 100644
--- a/libc/sysv/errfuns/exfull.S
+++ b/libc/sysv/errfuns/exfull.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .text.unlikely
 
 	.ftrace1
diff --git a/libc/sysv/errno.c b/libc/sysv/errno.c
index 038eca137..438ee9508 100644
--- a/libc/sysv/errno.c
+++ b/libc/sysv/errno.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
-#include "libc/thread/tls.h"
+#include "libc/thread/tls2.internal.h"
 
 /**
  * Global variable for last error.
@@ -35,10 +35,8 @@ errno_t __errno;
 
 /**
  * Returns address of `errno` variable.
- *
- * This function promises to not clobber argument registers.
  */
-nocallersavedregisters errno_t *__errno_location(void) {
+errno_t *__errno_location(void) {
   if (__tls_enabled) {
     return &__get_tls()->tib_errno;
   } else {
diff --git a/libc/sysv/gen.sh b/libc/sysv/gen.sh
index 7abdf723d..da514a903 100644
--- a/libc/sysv/gen.sh
+++ b/libc/sysv/gen.sh
@@ -66,7 +66,7 @@ errfun() {
   NAME="$1"
   ERRNO="$2"
   {
-    printf '#include "libc/macros.h"\n.text.unlikely\n\n'
+    printf '#include "libc/macros.internal.h"\n.text.unlikely\n\n'
     printf '\t.ftrace1\n'
     printf '%s:\n' "$NAME"
     printf '\t.ftrace2\n'
diff --git a/libc/sysv/hostos.S b/libc/sysv/hostos.S
index 5adcfc603..a378a5c70 100644
--- a/libc/sysv/hostos.S
+++ b/libc/sysv/hostos.S
@@ -16,16 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.bss
 	.balign	8
 __hostos:
 	.quad	0
-	.endobj	__hostos,globl
-__tls_index:
-	.long	0
-	.endobj	__tls_index,globl
-__tls_enabled:
-	.long	0
-	.endobj	__tls_enabled,globl
+	.endfn	__hostos,globl
diff --git a/libc/sysv/macros.internal.h b/libc/sysv/macros.internal.h
index 4d348d5da..da2ec43d6 100644
--- a/libc/sysv/macros.internal.h
+++ b/libc/sysv/macros.internal.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_SYSV_MACROS_H_
 #define COSMOPOLITAN_LIBC_SYSV_MACROS_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #ifdef __ASSEMBLER__
 /* clang-format off */
 
diff --git a/libc/sysv/restorert.S b/libc/sysv/restorert.S
index 356fa575e..864e1d5d9 100644
--- a/libc/sysv/restorert.S
+++ b/libc/sysv/restorert.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/sysv/consts/nrlinux.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .privileged
 
 //	Linux Signal Trampoline (HOLY CODE)
diff --git a/libc/sysv/syscall2.S b/libc/sysv/syscall2.S
index 30cf685b1..c420783ef 100644
--- a/libc/sysv/syscall2.S
+++ b/libc/sysv/syscall2.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Invokes system call w/ arity of two.
 //
diff --git a/libc/sysv/syscall3.S b/libc/sysv/syscall3.S
index fc9ee1051..06058bb29 100644
--- a/libc/sysv/syscall3.S
+++ b/libc/sysv/syscall3.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Invokes system call w/ arity of three.
 //
diff --git a/libc/sysv/syscall4.S b/libc/sysv/syscall4.S
index 30aa9f4e3..c5d37788a 100644
--- a/libc/sysv/syscall4.S
+++ b/libc/sysv/syscall4.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Invokes system call w/ arity of four.
 //
diff --git a/libc/sysv/syscon.S b/libc/sysv/syscon.S
index e3d6e812c..9aeae721a 100644
--- a/libc/sysv/syscon.S
+++ b/libc/sysv/syscon.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Sections for varint encoded magic numbers.
 //
diff --git a/libc/sysv/syscount.S b/libc/sysv/syscount.S
index 4bf4ec6d0..6ede1dc3e 100644
--- a/libc/sysv/syscount.S
+++ b/libc/sysv/syscount.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	RII System Five system call counter.
 //
diff --git a/libc/sysv/syslib.S b/libc/sysv/syslib.S
index 9c24943ad..e9c165588 100644
--- a/libc/sysv/syslib.S
+++ b/libc/sysv/syslib.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 	.bss
 	.balign	8
diff --git a/libc/sysv/systemfive.S b/libc/sysv/systemfive.S
index 178892482..879edaf05 100644
--- a/libc/sysv/systemfive.S
+++ b/libc/sysv/systemfive.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/thread/pt.internal.h"
 
 #define SIG_IGN 1
@@ -102,8 +102,8 @@ __pid:	.quad	0
 	.previous
 
 systemfive_cp:
-	beg
-	pro
+	push	%rbp
+	mov	%rsp,%rbp		// so backtraces work
 systemfive_cancellable:			// our pthread_cancel() miracle code
 	cmpb	$0,__tls_enabled(%rip)	// inspired by the musl libc design!
 	je	1f			// we handle linux and bsd together!
@@ -123,7 +123,7 @@ systemfive_cancellable:			// our pthread_cancel() miracle code
 	clc				// no cancellable system calls exist
 	syscall				// that have 7+ args on the bsd OSes
 systemfive_cancellable_end:		// i/o calls park here for long time
-	epi
+	pop	%rbp
 	jnc	2f
 	neg	%rax			// turns bsd errno to system v errno
 2:	cmp	$-4095,%rax		// but we still check again on eintr
@@ -144,13 +144,11 @@ systemfive_cancellable_end:		// i/o calls park here for long time
 	je	systemfive_errno	// we aren't actually cancelled
 	jmp	4f			// now we are in fact cancelled
 systemfive_cancel:			// SIGTHR will jump here too
-	epi
+	pop	%rbp
 4:	jmp	_pthread_cancel_ack	// tail call
 	.weak	_pthread_cancel_ack	// must be linked if we're cancelled
-	end
 #if IsModeDbg()
 not_a_cancellation_point:		// need BEGIN/END_CANCELLATION_POINT
-	beg
 	nop
 	.weak	report_cancellation_point
 5:	ezlea	report_cancellation_point,cx
@@ -159,7 +157,6 @@ not_a_cancellation_point:		// need BEGIN/END_CANCELLATION_POINT
 	call	*%rcx
 6:	ud2
 	nop
-	end
 #endif
 	.globl	systemfive_cancellable_end
 	.globl	systemfive_cancellable
@@ -169,20 +166,19 @@ not_a_cancellation_point:		// need BEGIN/END_CANCELLATION_POINT
 .Lanchorpoint:
 #if SupportsLinux() || SupportsMetal()
 systemfive_linux:
-	beg
 	and	$0xfff,%eax		// remove nonlinux bits from ordinal
 	cmp	$0xfff,%eax		// checks if unsupported by platform
 	je	systemfive_enosys	// never taken branches cost nothing
 	btr	$11,%eax		// 0x800 means a call is cancellable
 	jc	systemfive_cp		// it is handled by the holiest code
 	mov	%rcx,%r10		// syscall instruction clobbers %rcx
-	pro				// linux never reads args from stack
+	push	%rbp			// linux never reads args from stack
+	mov	%rsp,%rbp		// having frame will help backtraces
 	syscall				// this is known as a context switch
-	epi				// next we check to see if it failed
+	pop	%rbp			// next we check to see if it failed
 	cmp	$-4095,%rax		// system five nexgen32e abi § a.2.1
 	jae	systemfive_error	// encodes errno as neg return value
 	ret
-	end
 	.endfn	systemfive_linux,globl,hidden
 systemfive_error:
 	neg	%eax
@@ -190,35 +186,27 @@ systemfive_error:
 	.endfn	systemfive_error,globl,hidden
 #endif
 systemfive_errno:
-	beg
 	xchg	%eax,%ecx
-	call	__errno_location
+	.errno
 	mov	%ecx,(%rax)		// normalize to c library convention
 	push	$-1			// negative one is only error result
 	pop	%rax			// the push pop is to save code size
 	ret
-	end
 	.endfn	systemfive_errno,globl,hidden
 systemfive_enosys:
-	beg
 	mov	ENOSYS(%rip),%eax
 	jmp	systemfive_errno
-	end
 	.endfn	systemfive_enosys,globl,hidden
 #if SupportsNetbsd()
 systemfive_netbsd:
-	beg
 	shr	$4*13,%rax
 	jmp	systemfive_bsdscrub
-	end
 	.endfn	systemfive_netbsd,globl,hidden
 #endif
 #if SupportsOpenbsd()
 systemfive_openbsd:
-	beg
 	shr	$4*10,%rax
 	jmp	systemfive_bsdscrub
-	end
 	.endfn	systemfive_openbsd,globl,hidden
 #endif
 #if SupportsFreebsd()
@@ -234,7 +222,6 @@ systemfive_bsdscrub:
 //	𝑠𝑙𝑖𝑑𝑒
 	.endfn	systemfive_bsdscrub,globl,hidden
 systemfive_bsd:
-	beg
 	cmp	$0xfff,%ax
 	je	systemfive_enosys
 	btr	$11,%eax		// checks/reset the 800 cancellable bit
@@ -243,7 +230,6 @@ systemfive_bsd:
 	syscall				// bsd will need arg on stack sometimes
 	jc	systemfive_errno	// bsd sets carry flag if %rax is errno
 	ret
-	end
 	.endfn	systemfive_bsd
 #endif
 #if SupportsXnu()
diff --git a/libc/sysv/sysv.c b/libc/sysv/sysv.c
index 8c78202bf..afb3496b8 100644
--- a/libc/sysv/sysv.c
+++ b/libc/sysv/sysv.c
@@ -37,7 +37,7 @@ register long freebsd_ordinal asm("x9");
 register long xnu_ordinal asm("x16");
 register long cosmo_tls_register asm("x28");
 
-void report_cancelation_point(int, int);
+void report_cancelation_point(void);
 
 dontinline long systemfive_cancel(void) {
   return _weaken(_pthread_cancel_ack)();
@@ -58,9 +58,9 @@ dontinline long systemfive_cancellable(void) {
       return systemfive_cancel();
     }
 #if IsModeDbg()
-    if (!(pth->pt_flags & PT_INCANCEL) && !(pth->pt_flags & PT_NOCANCEL)) {
+    if (!(pth->pt_flags & PT_INCANCEL)) {
       if (_weaken(report_cancelation_point)) {
-        _weaken(report_cancelation_point)(sysv_ordinal, xnu_ordinal);
+        _weaken(report_cancelation_point)();
       }
       __builtin_trap();
     }
diff --git a/libc/testlib/BUILD.mk b/libc/testlib/BUILD.mk
index a81790c86..236d8ab96 100644
--- a/libc/testlib/BUILD.mk
+++ b/libc/testlib/BUILD.mk
@@ -22,17 +22,14 @@ LIBC_TESTLIB_A_ASSETS =						\
 LIBC_TESTLIB_A_HDRS =						\
 	libc/testlib/aspect.internal.h				\
 	libc/testlib/bench.h					\
-	libc/testlib/benchmark.h				\
 	libc/testlib/blocktronics.h				\
 	libc/testlib/ezbench.h					\
 	libc/testlib/fastrandomstring.h				\
 	libc/testlib/hyperion.h					\
-	libc/testlib/manystack.h				\
 	libc/testlib/moby.h					\
 	libc/testlib/subprocess.h				\
 	libc/testlib/testlib.h					\
-	libc/testlib/trace.h					\
-	libc/testlib/viewables.h				\
+	libc/testlib/viewables.h
 
 LIBC_TESTLIB_A_SRCS_S =						\
 	libc/testlib/bench.S					\
@@ -52,7 +49,6 @@ LIBC_TESTLIB_A_SRCS_C =						\
 	libc/testlib/clearxmmregisters.c			\
 	libc/testlib/contains.c					\
 	libc/testlib/endswith.c					\
-	libc/testlib/exactlyequallongdouble.c		 	\
 	libc/testlib/extract.c					\
 	libc/testlib/ezbenchcontrol.c				\
 	libc/testlib/ezbenchreport.c				\
@@ -71,7 +67,6 @@ LIBC_TESTLIB_A_SRCS_C =						\
 	libc/testlib/globals.c					\
 	libc/testlib/hexequals.c				\
 	libc/testlib/incrementfailed.c				\
-	libc/testlib/manystack.c				\
 	libc/testlib/memoryexists.c				\
 	libc/testlib/seterrno.c					\
 	libc/testlib/shoulddebugbreak.c				\
@@ -83,10 +78,9 @@ LIBC_TESTLIB_A_SRCS_C =						\
 	libc/testlib/testrunner.c				\
 	libc/testlib/thunks.c					\
 	libc/testlib/tmptest.c					\
-	libc/testlib/trace.c					\
 	libc/testlib/waitforexit.c				\
 	libc/testlib/waitforterm.c				\
-	libc/testlib/yield.c					\
+	libc/testlib/yield.c
 
 LIBC_TESTLIB_A_SRCS =						\
 	$(LIBC_TESTLIB_A_SRCS_S)				\
@@ -112,15 +106,13 @@ LIBC_TESTLIB_A_DIRECTDEPS =					\
 	LIBC_STR						\
 	LIBC_SYSV						\
 	LIBC_SYSV_CALLS						\
-	LIBC_THREAD						\
 	LIBC_TINYMATH						\
 	LIBC_X							\
 	THIRD_PARTY_COMPILER_RT					\
 	THIRD_PARTY_DLMALLOC					\
 	THIRD_PARTY_GDTOA					\
-	THIRD_PARTY_MUSL					\
-	THIRD_PARTY_TZ						\
 	THIRD_PARTY_XED						\
+	THIRD_PARTY_TZ
 
 LIBC_TESTLIB_A_DEPS :=						\
 	$(call uniq,$(foreach x,$(LIBC_TESTLIB_A_DIRECTDEPS),$($(x))))
@@ -215,7 +207,6 @@ LIBC_TESTMAIN_DIRECTDEPS =					\
 	LIBC_LOG						\
 	LIBC_MEM						\
 	LIBC_NEXGEN32E						\
-	LIBC_PROC						\
 	LIBC_RUNTIME						\
 	LIBC_STDIO						\
 	LIBC_SYSV						\
@@ -223,7 +214,7 @@ LIBC_TESTMAIN_DIRECTDEPS =					\
 	LIBC_TESTLIB						\
 	LIBC_TESTLIB_RUNNER					\
 	THIRD_PARTY_DLMALLOC					\
-	THIRD_PARTY_GETOPT					\
+	THIRD_PARTY_GETOPT
 
 LIBC_TESTMAIN_DEPS :=						\
 	$(call uniq,$(foreach x,$(LIBC_TESTMAIN_DIRECTDEPS),$($(x))))
diff --git a/libc/testlib/bench.S b/libc/testlib/bench.S
index 49e1e83db..730d0beb7 100644
--- a/libc/testlib/bench.S
+++ b/libc/testlib/bench.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .yoink	testlib_runallbenchmarks
 
 //	Decentralized section for benchmark registration.
diff --git a/libc/testlib/benchmark.h b/libc/testlib/benchmark.h
deleted file mode 100644
index 8915dfb6e..000000000
--- a/libc/testlib/benchmark.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_TESTLIB_BENCHMARK_H_
-#define COSMOPOLITAN_LIBC_TESTLIB_BENCHMARK_H_
-#include "libc/calls/struct/timespec.h"
-#include "libc/stdio/stdio.h"
-COSMOPOLITAN_C_START_
-
-#define X(x) __expropriate(x)
-#define V(x) __veil("r", x)
-
-#define BENCHMARK(ITERATIONS, WORK_PER_RUN, CODE)                     \
-  do {                                                                \
-    struct timespec start = timespec_real();                          \
-    for (int __i = 0; __i < ITERATIONS; ++__i) {                      \
-      asm volatile("" ::: "memory");                                  \
-      CODE;                                                           \
-    }                                                                 \
-    long ns = timespec_tonanos(timespec_sub(timespec_real(), start)); \
-    _print_benchmark_result(ns, WORK_PER_RUN, ITERATIONS, #CODE);     \
-  } while (0)
-
-static void _print_benchmark_result(double total_nanos, double work_per_run,
-                                    int iterations, const char* code) {
-  double time_per_op = total_nanos / (work_per_run * iterations);
-  double throughput = work_per_run / (total_nanos / iterations * 1e-9);
-
-  const char* throughput_unit;
-  const char* time_unit;
-  double time_value;
-
-  // Determine throughput unit
-  if (throughput >= 1e9) {
-    throughput /= 1e9;
-    throughput_unit = "G";
-  } else if (throughput >= 1e6) {
-    throughput /= 1e6;
-    throughput_unit = "M";
-  } else if (throughput >= 1e3) {
-    throughput /= 1e3;
-    throughput_unit = "K";
-  } else {
-    throughput_unit = " ";
-  }
-
-  // Determine time unit
-  if (time_per_op >= 1e6) {
-    time_value = time_per_op / 1e6;
-    time_unit = "ms";
-  } else if (time_per_op >= 1e3) {
-    time_value = time_per_op / 1e3;
-    time_unit = "µs";
-  } else if (time_per_op >= .01) {
-    time_value = time_per_op;
-    time_unit = "ns";
-  } else {
-    time_value = time_per_op * 1e3;
-    time_unit = "ps";
-  }
-
-  // Determine work unit
-  const char* work_unit;
-  if (work_per_run >= 1e9) {
-    work_per_run /= 1e9;
-    work_unit = "G";
-  } else if (work_per_run >= 1e6) {
-    work_per_run /= 1e6;
-    work_unit = "M";
-  } else if (work_per_run >= 1e3) {
-    work_per_run /= 1e3;
-    work_unit = "K";
-  } else {
-    work_unit = " ";
-  }
-
-  printf("%8.2f %-2s %8.2f %s/s %6.2f %s %3dx %s\n", time_value, time_unit,
-         throughput, throughput_unit, work_per_run, work_unit, iterations,
-         code);
-}
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_TESTLIB_BENCHMARK_H_ */
diff --git a/libc/testlib/binequals.c b/libc/testlib/binequals.c
index 79ab6c9f5..f56000a38 100644
--- a/libc/testlib/binequals.c
+++ b/libc/testlib/binequals.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/testlib/testlib.h"
 
 /**
diff --git a/libc/testlib/blake2b256_tests.S b/libc/testlib/blake2b256_tests.S
index 50d02704f..d5d600e19 100644
--- a/libc/testlib/blake2b256_tests.S
+++ b/libc/testlib/blake2b256_tests.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .rodata
 
 //	Blake2B256 test vectors.
diff --git a/libc/testlib/blocktronics.S b/libc/testlib/blocktronics.S
index dcbff573b..055803694 100644
--- a/libc/testlib/blocktronics.S
+++ b/libc/testlib/blocktronics.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .rodata
 
 //	Nontrivial NUL-terminated string test vector.
diff --git a/libc/testlib/ezbench.h b/libc/testlib/ezbench.h
index 70bb39dd3..35fa23e33 100644
--- a/libc/testlib/ezbench.h
+++ b/libc/testlib/ezbench.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_TESTLIB_EZBENCH_H_
 #define COSMOPOLITAN_LIBC_TESTLIB_EZBENCH_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/nexgen32e/bench.h"
 #include "libc/nexgen32e/x86feature.h"
diff --git a/libc/testlib/fixture.S b/libc/testlib/fixture.S
index bea3297bd..03139e2ab 100644
--- a/libc/testlib/fixture.S
+++ b/libc/testlib/fixture.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Decentralized section for test fixture registration.
 //
diff --git a/libc/testlib/hexequals.c b/libc/testlib/hexequals.c
index 1fadd63ed..a4f8b5cf9 100644
--- a/libc/testlib/hexequals.c
+++ b/libc/testlib/hexequals.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/testlib/testlib.h"
 
 /**
diff --git a/libc/testlib/hyperion.S b/libc/testlib/hyperion.S
index 66bdf3cf8..0e2925b5e 100644
--- a/libc/testlib/hyperion.S
+++ b/libc/testlib/hyperion.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .rodata
 
 //	Nontrivial NUL-terminated string test vector.
diff --git a/libc/testlib/manystack.c b/libc/testlib/manystack.c
deleted file mode 100644
index b0b022ba1..000000000
--- a/libc/testlib/manystack.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/testlib/manystack.h"
-#include "libc/atomic.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/sigset.h"
-#include "libc/intrin/dll.h"
-#include "libc/sysv/consts/sig.h"
-#include "libc/thread/posixthread.internal.h"
-
-static atomic_int manystack_gotsig;
-static atomic_bool manystack_shutdown;
-
-static void manystack_signal(int sig) {
-  manystack_gotsig = sig;
-}
-
-static void *manystack_thread(void *arg) {
-  sigset_t ss;
-  sigfillset(&ss);
-  sigdelset(&ss, SIGUSR2);
-  while (!manystack_shutdown) {
-    sigsuspend(&ss);
-    if (!manystack_shutdown) {
-      _pthread_lock();
-      for (struct Dll *e = dll_first(_pthread_list); e;
-           e = dll_next(_pthread_list, e)) {
-        pthread_t th = (pthread_t)POSIXTHREAD_CONTAINER(e);
-        if (!pthread_equal(th, pthread_self()))
-          pthread_kill(th, SIGQUIT);
-      }
-      _pthread_unlock();
-    }
-  }
-  return 0;
-}
-
-pthread_t manystack_start(void) {
-  sigset_t ss;
-  pthread_t msh;
-  sigemptyset(&ss);
-  sigaddset(&ss, SIGUSR2);
-  sigprocmask(SIG_BLOCK, &ss, 0);
-  signal(SIGUSR2, manystack_signal);
-  pthread_create(&msh, 0, manystack_thread, 0);
-  return msh;
-}
-
-void manystack_stop(pthread_t msh) {
-  manystack_shutdown = true;
-  pthread_kill(msh, SIGUSR2);
-  pthread_join(msh, 0);
-}
diff --git a/libc/testlib/manystack.h b/libc/testlib/manystack.h
deleted file mode 100644
index a175ecbea..000000000
--- a/libc/testlib/manystack.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_TESTLIB_MANYSTACK_H_
-#define COSMOPOLITAN_LIBC_TESTLIB_MANYSTACK_H_
-#include "libc/thread/thread.h"
-COSMOPOLITAN_C_START_
-
-pthread_t manystack_start(void);
-void manystack_stop(pthread_t);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_TESTLIB_MANYSTACK_H_ */
diff --git a/libc/testlib/moby.S b/libc/testlib/moby.S
index fff6bcaaa..aa382f5e4 100644
--- a/libc/testlib/moby.S
+++ b/libc/testlib/moby.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .rodata
 
 //	Nontrivial NUL-terminated string test vector.
diff --git a/libc/testlib/polluteregisters.S b/libc/testlib/polluteregisters.S
index 50cf2d5e4..b26144eb4 100644
--- a/libc/testlib/polluteregisters.S
+++ b/libc/testlib/polluteregisters.S
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/nexgen32e/x86feature.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 __polluteregisters:
 #ifdef __x86_64__
diff --git a/libc/testlib/subprocess.h b/libc/testlib/subprocess.h
index c09ee56c0..0d396de75 100644
--- a/libc/testlib/subprocess.h
+++ b/libc/testlib/subprocess.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_LIBC_TESTLIB_SUBPROCESS_H_
 #define COSMOPOLITAN_LIBC_TESTLIB_SUBPROCESS_H_
 #include "libc/calls/calls.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/testlib/testlib.h"
 COSMOPOLITAN_C_START_
diff --git a/libc/testlib/testcase.S b/libc/testlib/testcase.S
index 46a76c374..272521378 100644
--- a/libc/testlib/testcase.S
+++ b/libc/testlib/testcase.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Decentralized section for test testcase registration.
 //
diff --git a/libc/testlib/testlib.h b/libc/testlib/testlib.h
index 9c8053c71..398c50004 100644
--- a/libc/testlib/testlib.h
+++ b/libc/testlib/testlib.h
@@ -195,13 +195,6 @@ void TearDownOnce(void);
 #define ASSERT_LDBL_LT(VAL, GOT) \
   assertLongDoubleLessThan(VAL, GOT, #VAL " < " #GOT, true)
 
-#define ASSERT_FLOAT_EXACTLY_EQ(WANT, GOT) \
-  assertLongDoubleExactlyEquals(FILIFU WANT, GOT, #GOT, true)
-#define ASSERT_DOUBLE_EXACTLY_EQ(WANT, GOT) \
-  assertLongDoubleExactlyEquals(FILIFU WANT, GOT, #GOT, true)
-#define ASSERT_LDBL_EXACTLY_EQ(WANT, GOT) \
-  assertLongDoubleExactlyEquals(FILIFU WANT, GOT, #GOT, true)
-
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § testing library » assert or log                           ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
@@ -278,13 +271,6 @@ void TearDownOnce(void);
 #define EXPECT_LGBL_LT(VAL, GOT) \
   expectLongDoubleLessThan(VAL, GOT, #VAL " < " #GOT, false)
 
-#define EXPECT_FLOAT_EXACTLY_EQ(WANT, GOT) \
-  assertLongDoubleExactlyEquals(FILIFU WANT, GOT, #GOT, false)
-#define EXPECT_DOUBLE_EXACTLY_EQ(WANT, GOT) \
-  assertLongDoubleExactlyEquals(FILIFU WANT, GOT, #GOT, false)
-#define EXPECT_LDBL_EXACTLY_EQ(WANT, GOT) \
-  assertLongDoubleExactlyEquals(FILIFU WANT, GOT, #GOT, false)
-
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § testing library » implementation details                  ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
@@ -418,7 +404,6 @@ void testlib_formatbinaryashex(const char *, const void *, size_t, char **,
 void testlib_formatbinaryasglyphs(const char16_t *, const void *, size_t,
                                   char **, char **);
 bool testlib_almostequallongdouble(long double, long double);
-bool testlib_exactlyequallongdouble(long double, long double);
 void testlib_incrementfailed(void);
 void testlib_clearxmmregisters(void);
 
@@ -711,20 +696,5 @@ forceinline void assertLongDoubleEquals(FILIFU_ARGS long double want,
   testlib_onfail2(isfatal);
 }
 
-forceinline void assertLongDoubleExactlyEquals(FILIFU_ARGS long double want,
-                                               long double got,
-                                               const char *gotcode,
-                                               bool isfatal) {
-  ++g_testlib_ran;
-  if (testlib_exactlyequallongdouble(want, got))
-    return;
-  if (g_testlib_shoulddebugbreak)
-    DebugBreak();
-  testlib_showerror(file, line, func, "assertLongDoubleExactlyEquals", "≠",
-                    gotcode, testlib_formatfloat(want),
-                    testlib_formatfloat(got));
-  testlib_onfail2(isfatal);
-}
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_TESTLIB_H_ */
diff --git a/libc/testlib/testmain.c b/libc/testlib/testmain.c
index 538ca3ec5..e211f8564 100644
--- a/libc/testlib/testmain.c
+++ b/libc/testlib/testmain.c
@@ -18,7 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
-#include "libc/calls/struct/cpuset.h"
 #include "libc/calls/struct/rlimit.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/siginfo.h"
@@ -32,20 +31,16 @@
 #include "libc/intrin/strace.h"
 #include "libc/intrin/ubsan.h"
 #include "libc/intrin/weaken.h"
-#include "libc/limits.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/leaks.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/nexgen32e.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
-#include "libc/stdio/rand.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/f.h"
-#include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/rlimit.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/testlib/aspect.internal.h"
@@ -54,8 +49,6 @@
 #include "libc/thread/tls.h"
 #include "third_party/getopt/getopt.internal.h"
 
-#pragma weak main
-
 #define USAGE \
   " [FLAGS]\n\
 \n\
@@ -92,41 +85,7 @@ static void GetOpts(int argc, char *argv[]) {
   }
 }
 
-static int rando(void) {
-  return _rand64() & INT_MAX;
-}
-
-static void limit_process_to_single_cpu(void) {
-  extern int disable_limit_process_to_single_cpu;
-  if (_weaken(disable_limit_process_to_single_cpu))
-    return;
-  if (!(IsLinux() || IsFreebsd() || IsNetbsd() || IsWindows()))
-    return;
-  if (IsFreebsd() && getuid())
-    return;
-  cpu_set_t legal;
-  if (sched_getaffinity(0, sizeof(cpu_set_t), &legal) == -1) {
-    perror("sched_setaffinity failed");
-    exit(1);
-  }
-  int count = CPU_COUNT(&legal);
-  cpu_set_t newset;
-  CPU_ZERO(&newset);
-  bool done = false;
-  while (!done) {
-    for (int i = 0; i < CPU_SETSIZE; ++i) {
-      if (CPU_ISSET(i, &legal) && !(rando() % count)) {
-        CPU_SET(rando() % count, &newset);
-        done = true;
-        break;
-      }
-    }
-  }
-  if (sched_setaffinity(0, sizeof(cpu_set_t), &newset) == -1) {
-    perror("sched_setaffinity failed");
-    exit(1);
-  }
-}
+#pragma weak main
 
 /**
  * Generic test program main function.
@@ -136,33 +95,19 @@ int main(int argc, char *argv[]) {
   struct Dll *e;
   struct TestAspect *a;
 
-  // some settings
-  __ubsan_strict = true;
-  __log_level = kLogInfo;
-
   if (errno) {
     tinyprint(2, "error: the errno variable was contaminated by constructors\n",
               NULL);
     return 1;
   }
 
-  // // this sometimes helps tease out mt bugs
-  // limit_process_to_single_cpu();
-
-  // test huge pointers by enabling pml5t
-  if (rando() % 2) {
-    errno_t e = errno;
-    mmap((char *)0x80000000000000, 1, PROT_NONE,  //
-         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-    errno = e;
-  }
-
+  __ubsan_strict = true;
+  __log_level = kLogInfo;
   GetOpts(argc, argv);
 
-  int oe = errno;
-  for (fd = 3; fd < 100; ++fd)
+  for (fd = 3; fd < 10; ++fd) {
     close(fd);
-  errno = oe;
+  }
 
 #ifndef TINY
   setenv("GDB", "", true);
@@ -211,14 +156,13 @@ int main(int argc, char *argv[]) {
 
   // make sure threads are in a good state
   if (_weaken(_pthread_decimate))
-    _weaken(_pthread_decimate)(kPosixThreadZombie);
+    _weaken(_pthread_decimate)(false);
   if (_weaken(pthread_orphan_np) && !_weaken(pthread_orphan_np)()) {
     tinyprint(2, "error: tests ended with threads still active\n", NULL);
     _Exit(1);
   }
 
   // check for memory leaks
-  AssertNoLocksAreHeld();
   if (!g_testlib_failed)
     CheckForMemoryLeaks();
 
diff --git a/libc/testlib/testrunner.c b/libc/testlib/testrunner.c
index b8498e1eb..efe4483e2 100644
--- a/libc/testlib/testrunner.c
+++ b/libc/testlib/testrunner.c
@@ -26,7 +26,7 @@
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/process.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/rand.h"
@@ -34,7 +34,6 @@
 #include "libc/str/str.h"
 #include "libc/testlib/aspect.internal.h"
 #include "libc/testlib/testlib.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/x/x.h"
 
@@ -53,7 +52,7 @@ void testlib_finish(void) {
 void testlib_error_enter(const char *file, const char *func) {
   ftrace_enabled(-1);
   strace_enabled(-1);
-  _pthread_mutex_lock(&testlib_error_lock);
+  pthread_mutex_lock(&testlib_error_lock);
   if (!IsWindows())
     sys_getpid(); /* make strace easier to read */
   if (!IsWindows())
@@ -68,7 +67,7 @@ void testlib_error_enter(const char *file, const char *func) {
 void testlib_error_leave(void) {
   strace_enabled(+1);
   ftrace_enabled(+1);
-  _pthread_mutex_unlock(&testlib_error_lock);
+  pthread_mutex_unlock(&testlib_error_lock);
 }
 
 wontreturn void testlib_abort(void) {
diff --git a/libc/testlib/trace.c b/libc/testlib/trace.c
deleted file mode 100644
index ca9b753c8..000000000
--- a/libc/testlib/trace.c
+++ /dev/null
@@ -1,151 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "trace.h"
-#include <pthread.h>
-#include <stdatomic.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <threads.h>
-#include <unistd.h>
-
-struct TraceEvent {
-  unsigned long long ts;
-  int pid;
-  int tid;
-  const char* name;
-  const char* cat;
-  char ph;
-};
-
-static int g_pid;
-static atomic_bool g_oom;
-static atomic_int g_count;
-static thread_local int g_id;
-static thread_local int g_ids;
-static thread_local int g_tid;
-static unsigned long g_start_rdtsc;
-static struct TraceEvent g_events[1000000];
-
-static unsigned long rdtsc(void) {
-#ifdef __x86_64__
-  unsigned ax, dx;
-  __asm__ volatile("rdtsc" : "=a"(ax), "=d"(dx));
-  return (unsigned long)dx << 32 | ax;
-#else
-  unsigned long c;
-  __asm__ volatile("mrs %0, cntvct_el0" : "=r"(c));
-  return c * 48;  // the fudge factor
-#endif
-}
-
-static int cosmo_trace_oom(void) {
-  if (atomic_load_explicit(&g_oom, memory_order_relaxed))
-    return -1;
-  if (atomic_exchange_explicit(&g_oom, true, memory_order_acq_rel))
-    return -1;
-  fprintf(stderr, "warning: ran out of trace event memory\n");
-  return -1;
-}
-
-static int cosmo_trace_reserve(int count) {
-  int id = atomic_load_explicit(&g_count, memory_order_relaxed);
-  if (id + count > sizeof(g_events) / sizeof(*g_events))
-    return cosmo_trace_oom();
-  id = atomic_fetch_add_explicit(&g_count, count, memory_order_acq_rel);
-  if (id + count > sizeof(g_events) / sizeof(*g_events))
-    return cosmo_trace_oom();
-  return id;
-}
-
-static void cosmo_trace_event(int id, const char* name, const char* cat,
-                              char ph) {
-  g_events[id].ts = rdtsc();
-  g_events[id].pid = g_pid ? g_pid - 1 : getpid();
-  g_events[id].tid = g_tid ? g_tid - 1 : gettid();
-  g_events[id].name = name;
-  g_events[id].cat = cat;
-  g_events[id].ph = ph;
-}
-
-void cosmo_trace_set_pid(int pid) {
-  g_pid = pid + 1;
-}
-
-void cosmo_trace_set_tid(int tid) {
-  g_tid = tid + 1;
-}
-
-void cosmo_trace_begin(const char* name) {
-  if (g_ids < 2) {
-    g_ids = 20;
-    g_id = cosmo_trace_reserve(g_ids);
-    if (g_id == -1) {
-      g_ids = 0;
-      return;
-    }
-  }
-  cosmo_trace_event(g_id++, name, "category", 'B');
-  --g_ids;
-}
-
-void cosmo_trace_end(const char* name) {
-  if (g_ids < 1)
-    return;
-  cosmo_trace_event(g_id++, name, "category", 'E');
-  --g_ids;
-}
-
-static void cosmo_trace_save(const char* filename) {
-  int count = atomic_load_explicit(&g_count, memory_order_relaxed);
-  if (!count)
-    return;
-  fprintf(stderr, "saving trace to %s...\n", filename);
-  FILE* file = fopen(filename, "w");
-  if (!file) {
-    perror(filename);
-    return;
-  }
-  fprintf(file, "[\n");
-  bool once = false;
-  for (int i = 0; i < count; i++) {
-    if (!g_events[i].name)
-      continue;
-    if (!once) {
-      once = true;
-    } else {
-      fputs(",\n", file);
-    }
-    fprintf(file,
-            "{\"name\": \"%s\", \"cat\": \"%s\", \"ph\": \"%c\", "
-            "\"ts\": %.3f, \"pid\": %d, \"tid\": %d}",
-            g_events[i].name, g_events[i].cat, g_events[i].ph,
-            (g_events[i].ts - g_start_rdtsc) / 3000., g_events[i].pid,
-            g_events[i].tid);
-  }
-  fprintf(file, "\n]\n");
-  fclose(file);
-}
-
-__attribute__((__constructor__)) static void trace_startup(void) {
-  g_start_rdtsc = rdtsc();
-}
-
-__attribute__((__destructor__)) static void trace_shutdown(void) {
-  cosmo_trace_save("trace.json");  // see chrome://tracing/
-}
diff --git a/libc/testlib/trace.h b/libc/testlib/trace.h
deleted file mode 100644
index 05d438ff5..000000000
--- a/libc/testlib/trace.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef COSMOPOLITAN_LIBC_TESTLIB_TRACE_H_
-#define COSMOPOLITAN_LIBC_TESTLIB_TRACE_H_
-COSMOPOLITAN_C_START_
-
-void cosmo_trace_set_pid(int);
-void cosmo_trace_set_tid(int);
-void cosmo_trace_begin(const char*);
-void cosmo_trace_end(const char*);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_LIBC_TESTLIB_TRACE_H_ */
diff --git a/libc/testlib/viewables.S b/libc/testlib/viewables.S
index f367d3577..aa5154a93 100644
--- a/libc/testlib/viewables.S
+++ b/libc/testlib/viewables.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 .rodata
 
 //	Nontrivial NUL-terminated string test vector.
diff --git a/libc/thread/freebsd.internal.h b/libc/thread/freebsd.internal.h
index f7fec2ed5..97a7c9e06 100644
--- a/libc/thread/freebsd.internal.h
+++ b/libc/thread/freebsd.internal.h
@@ -47,8 +47,7 @@ struct _umtx_time {
   uint32_t _clockid;
 };
 
-int sys_umtx_timedwait_uint(_Atomic(int) *, int, bool, int,
-                            const struct timespec *);
+int sys_umtx_timedwait_uint(_Atomic(int) *, int, bool, const struct timespec *);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_THREAD_FREEBSD_INTERNAL_H_ */
diff --git a/libc/thread/itimer.c b/libc/thread/itimer.c
index fd93cf00d..91a55580a 100644
--- a/libc/thread/itimer.c
+++ b/libc/thread/itimer.c
@@ -24,38 +24,27 @@
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/struct/timeval.h"
 #include "libc/cosmo.h"
-#include "libc/intrin/maps.h"
 #include "libc/intrin/strace.h"
 #include "libc/nt/enum/processcreationflags.h"
 #include "libc/nt/thread.h"
-#include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
-#include "libc/sysv/consts/clock.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/itimer.h"
-#include "libc/thread/posixthread.internal.h"
-#include "libc/thread/thread2.h"
+#include "libc/thread/itimer.internal.h"
 #include "libc/thread/tls.h"
+#include "third_party/nsync/mu.h"
 #ifdef __x86_64__
 
-#define STACK_SIZE 65536
+struct IntervalTimer __itimer;
 
-textwindows dontinstrument static uint32_t __itimer_worker(void *arg) {
+static textwindows dontinstrument uint32_t __itimer_worker(void *arg) {
   struct CosmoTib tls;
-  char *sp = __builtin_frame_address(0);
-  __bootstrap_tls(&tls, sp);
-  __maps_track(
-      (char *)(((uintptr_t)sp + __pagesize - 1) & -__pagesize) - STACK_SIZE,
-      STACK_SIZE, PROT_READ | PROT_WRITE,
-      MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK);
+  __bootstrap_tls(&tls, __builtin_frame_address(0));
   for (;;) {
     bool dosignal = false;
     struct timeval now, waituntil;
-    __itimer_lock();
+    nsync_mu_lock(&__itimer.lock);
     now = timeval_real();
     if (timeval_iszero(__itimer.it.it_value)) {
       waituntil = timeval_max;
@@ -76,46 +65,56 @@ textwindows dontinstrument static uint32_t __itimer_worker(void *arg) {
         dosignal = true;
       }
     }
-    __itimer_unlock();
-    if (dosignal)
+    nsync_mu_unlock(&__itimer.lock);
+    if (dosignal) {
       __sig_generate(SIGALRM, SI_TIMER);
-    __itimer_lock();
-    struct timespec deadline = timeval_totimespec(waituntil);
-    _pthread_cond_timedwait(&__itimer.cond, &__itimer.lock, &deadline);
-    __itimer_unlock();
+    }
+    nsync_mu_lock(&__itimer.lock);
+    nsync_cv_wait_with_deadline(&__itimer.cond, &__itimer.lock,
+                                timeval_totimespec(waituntil), 0);
+    nsync_mu_unlock(&__itimer.lock);
   }
   return 0;
 }
 
-textwindows static void __itimer_setup(void) {
-  __itimer.thread = CreateThread(0, STACK_SIZE, __itimer_worker, 0,
+static textwindows void __itimer_setup(void) {
+  __itimer.thread = CreateThread(0, 65536, __itimer_worker, 0,
                                  kNtStackSizeParamIsAReservation, 0);
 }
 
+textwindows void __itimer_wipe(void) {
+  // this function is called by fork(), because
+  // timers aren't inherited by forked subprocesses
+  bzero(&__itimer, sizeof(__itimer));
+}
+
 textwindows int sys_setitimer_nt(int which, const struct itimerval *neu,
                                  struct itimerval *old) {
   struct itimerval config;
   cosmo_once(&__itimer.once, __itimer_setup);
   if (which != ITIMER_REAL || (neu && (!timeval_isvalid(neu->it_value) ||
-                                       !timeval_isvalid(neu->it_interval))))
+                                       !timeval_isvalid(neu->it_interval)))) {
     return einval();
-  if (neu)
+  }
+  if (neu) {
     // POSIX defines setitimer() with the restrict keyword but let's
     // accommodate the usage setitimer(ITIMER_REAL, &it, &it) anyway
     config = *neu;
+  }
   BLOCK_SIGNALS;
-  __itimer_lock();
+  nsync_mu_lock(&__itimer.lock);
   if (old) {
     old->it_interval = __itimer.it.it_interval;
     old->it_value = timeval_subz(__itimer.it.it_value, timeval_real());
   }
   if (neu) {
-    if (!timeval_iszero(config.it_value))
+    if (!timeval_iszero(config.it_value)) {
       config.it_value = timeval_add(config.it_value, timeval_real());
+    }
     __itimer.it = config;
-    _pthread_cond_signal(&__itimer.cond);
+    nsync_cv_signal(&__itimer.cond);
   }
-  __itimer_unlock();
+  nsync_mu_unlock(&__itimer.lock);
   ALLOW_SIGNALS;
   return 0;
 }
diff --git a/libc/thread/itimer.h b/libc/thread/itimer.internal.h
similarity index 67%
rename from libc/thread/itimer.h
rename to libc/thread/itimer.internal.h
index a5193d987..41d721216 100644
--- a/libc/thread/itimer.h
+++ b/libc/thread/itimer.internal.h
@@ -2,22 +2,21 @@
 #define COSMOPOLITAN_LIBC_ITIMER_H_
 #include "libc/atomic.h"
 #include "libc/calls/struct/itimerval.h"
-#include "libc/thread/thread.h"
+#include "third_party/nsync/cv.h"
+#include "third_party/nsync/mu.h"
 COSMOPOLITAN_C_START_
 
 struct IntervalTimer {
   atomic_uint once;
   intptr_t thread;
-  pthread_mutex_t lock;
-  pthread_cond_t cond;
+  nsync_mu lock;
+  nsync_cv cond;
   struct itimerval it;
 };
 
 extern struct IntervalTimer __itimer;
 
-void __itimer_lock(void);
-void __itimer_unlock(void);
-void __itimer_wipe_and_reset(void);
+void __itimer_wipe(void);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_ITIMER_H_ */
diff --git a/libc/thread/lock.h b/libc/thread/lock.h
index 75b0177a2..5a095679f 100644
--- a/libc/thread/lock.h
+++ b/libc/thread/lock.h
@@ -2,40 +2,17 @@
 #define COSMOPOLITAN_LIBC_THREAD_LOCK_H_
 COSMOPOLITAN_C_START_
 
-//
-//                                                     ┌undead
-//                                                     │
-//                                                     │┌dead
-//                                                     ││
-//                                                     ││┌robust
-//                                                     │││
-//                                                     │││  ┌depth
-//                                                     │││  │
-//         COSMOPOLITAN MUTEXES                        │││  │   ┌waited
-//                                                     │││  │   │
-//                                                     │││  │   │┌locked
-//                                                     │││  │   ││
-//                                                     │││  │   ││┌pshared
-//                owner                                │││  │   │││
-//                 tid                                 │││  │   │││┌type
-//                  │                                  │││  │   ││││
-//   ┌──────────────┴───────────────┐                  │││┌─┴──┐│││├┐
-// 0b0000000000000000000000000000000000000000000000000000000000000000
-//
-
-#define MUTEX_DEPTH_MIN 0x00000020ull
-#define MUTEX_DEPTH_MAX 0x000007e0ull
+#define MUTEX_DEPTH_MIN 0x00000010ull
+#define MUTEX_DEPTH_MAX 0x000003f0ull
 
 #define MUTEX_TYPE(word)    ((word) & 3)
 #define MUTEX_PSHARED(word) ((word) & 4)
 #define MUTEX_LOCKED(word)  ((word) & 8)
-#define MUTEX_WAITED(word)  ((word) & 16)
 #define MUTEX_DEPTH(word)   ((word) & MUTEX_DEPTH_MAX)
 #define MUTEX_OWNER(word)   ((word) >> 32)
 
 #define MUTEX_LOCK(word)                 (((word) & 7) | 8)
 #define MUTEX_UNLOCK(word)               ((word) & 7)
-#define MUTEX_SET_WAITED(word)           ((word) | 16)
 #define MUTEX_SET_TYPE(word, type)       (((word) & ~3ull) | (type))
 #define MUTEX_SET_PSHARED(word, pshared) (((word) & ~4ull) | (pshared))
 #define MUTEX_INC_DEPTH(word)            ((word) + MUTEX_DEPTH_MIN)
diff --git a/libc/thread/makecontext.c b/libc/thread/makecontext.c
index 328ff6a4e..0108979f7 100644
--- a/libc/thread/makecontext.c
+++ b/libc/thread/makecontext.c
@@ -23,7 +23,7 @@
 #include "libc/nexgen32e/nexgen32e.h"
 #include "libc/nexgen32e/stackframe.h"
 #include "libc/runtime/runtime.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/str/str.h"
 #include "libc/thread/thread.h"
 
diff --git a/libc/thread/mktls.c b/libc/thread/mktls.c
index 20d574b93..2f7129a5c 100644
--- a/libc/thread/mktls.c
+++ b/libc/thread/mktls.c
@@ -19,7 +19,7 @@
 #include "ape/sections.internal.h"
 #include "libc/dce.h"
 #include "libc/intrin/atomic.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
@@ -40,9 +40,11 @@ static char *_mktls_finish(struct CosmoTib **out_tib, char *mem,
   tib->tib_ftrace = old->tib_ftrace;
   tib->tib_strace = old->tib_strace;
   tib->tib_sigmask = old->tib_sigmask;
-  atomic_init(&tib->tib_ctid, -1);
-  if (out_tib)
+  tib->tib_locale = (intptr_t)&__c_dot_utf8_locale;
+  atomic_store_explicit(&tib->tib_tid, -1, memory_order_relaxed);
+  if (out_tib) {
     *out_tib = tib;
+  }
   return mem;
 }
 
diff --git a/libc/thread/posixthread.internal.h b/libc/thread/posixthread.internal.h
index 2a4ca4c19..4afebc85d 100644
--- a/libc/thread/posixthread.internal.h
+++ b/libc/thread/posixthread.internal.h
@@ -9,7 +9,8 @@
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 
-#define PT_BLOCKER_EVENT ((atomic_int *)-1)
+#define PT_BLOCKER_SEM ((atomic_int *)-1)
+#define PT_BLOCKER_IO  ((atomic_int *)-2)
 
 COSMOPOLITAN_C_START_
 
@@ -68,67 +69,55 @@ enum PosixThreadStatus {
 
 #define POSIXTHREAD_CONTAINER(e) DLL_CONTAINER(struct PosixThread, list, e)
 
-typedef struct __locale_struct *locale_t;
-
 struct PosixThread {
   int pt_flags;            // 0x00: see PT_* constants
   atomic_int pt_canceled;  // 0x04: thread has bad beliefs
   _Atomic(enum PosixThreadStatus) pt_status;
-  _Atomic(atomic_int *) pt_blocker;
+  atomic_int ptid;            // transitions 0 → tid
   atomic_int pt_refs;         // prevents decimation
   void *(*pt_start)(void *);  // creation callback
-  void *pt_val;               // start param / return val
+  void *pt_arg;               // start's parameter
+  void *pt_rc;                // start's return value
   char *pt_tls;               // bottom of tls allocation
   struct CosmoTib *tib;       // middle of tls allocation
   struct Dll list;            // list of threads
   struct _pthread_cleanup_buffer *pt_cleanup;
+  _Atomic(atomic_int *) pt_blocker;
   uint64_t pt_blkmask;
-  int64_t pt_event;
-  locale_t pt_locale;
-  intptr_t pt_exiter[5];
+  int64_t pt_semaphore;
+  intptr_t pt_iohandle;
+  void *pt_ioverlap;
+  jmp_buf pt_exiter;
   pthread_attr_t pt_attr;
-  atomic_bool pt_intoff;
 };
 
 typedef void (*atfork_f)(void);
 
 extern struct Dll *_pthread_list;
-extern atomic_uint _pthread_count;
 extern struct PosixThread _pthread_static;
 extern _Atomic(pthread_key_dtor) _pthread_key_dtor[PTHREAD_KEYS_MAX];
 
-int _pthread_cond_signal(pthread_cond_t *) dontthrow paramsnonnull();
-int _pthread_mutex_lock(pthread_mutex_t *) dontthrow paramsnonnull();
-int _pthread_mutex_trylock(pthread_mutex_t *) dontthrow paramsnonnull();
-int _pthread_mutex_unlock(pthread_mutex_t *) dontthrow paramsnonnull();
-int _pthread_mutex_wipe_np(pthread_mutex_t *) libcesque paramsnonnull();
+int _pthread_atfork(atfork_f, atfork_f, atfork_f) libcesque;
 int _pthread_reschedule(struct PosixThread *) libcesque;
 int _pthread_setschedparam_freebsd(int, int, const struct sched_param *);
 int _pthread_tid(struct PosixThread *) libcesque;
 intptr_t _pthread_syshand(struct PosixThread *) libcesque;
 long _pthread_cancel_ack(void) libcesque;
-void _pthread_decimate(enum PosixThreadStatus) dontthrow;
-void _pthread_free(struct PosixThread *) libcesque paramsnonnull();
-void _pthread_lock(void) dontthrow;
-void _pthread_onfork_child(void) dontthrow;
-void _pthread_onfork_parent(void) dontthrow;
-void _pthread_onfork_prepare(void) dontthrow;
-void _pthread_unlock(void) dontthrow;
-void _pthread_zombify(struct PosixThread *) dontthrow;
-
-int _pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *) dontthrow
-    paramsnonnull();
-
-int _pthread_cond_timedwait(pthread_cond_t *, pthread_mutex_t *,
-                            const struct timespec *) dontthrow
-    paramsnonnull((1, 2));
+void _pthread_decimate(bool) libcesque;
+void _pthread_free(struct PosixThread *) libcesque;
+void _pthread_lock(void) libcesque;
+void _pthread_onfork_child(void) libcesque;
+void _pthread_onfork_parent(void) libcesque;
+void _pthread_onfork_prepare(void) libcesque;
+void _pthread_unlock(void) libcesque;
+void _pthread_zombify(struct PosixThread *) libcesque;
 
 forceinline pureconst struct PosixThread *_pthread_self(void) {
   return (struct PosixThread *)__get_tls()->tib_pthread;
 }
 
 forceinline void _pthread_ref(struct PosixThread *pt) {
-  atomic_fetch_add_explicit(&pt->pt_refs, 1, memory_order_relaxed);
+  atomic_fetch_add_explicit(&pt->pt_refs, 1, memory_order_acq_rel);
 }
 
 forceinline void _pthread_unref(struct PosixThread *pt) {
diff --git a/libc/thread/pthread_atfork.c b/libc/thread/pthread_atfork.c
deleted file mode 100644
index ec8cc05fc..000000000
--- a/libc/thread/pthread_atfork.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
-#include "libc/cosmo.h"
-#include "libc/errno.h"
-#include "libc/intrin/strace.h"
-#include "libc/mem/mem.h"
-#include "libc/thread/posixthread.internal.h"
-#include "libc/thread/thread.h"
-
-struct AtFork {
-  struct AtFork *p[2];
-  atfork_f f[3];
-};
-
-struct AtForks {
-  atomic_uint once;
-  pthread_mutex_t lock;
-  struct AtFork *list;
-};
-
-static struct AtForks _atforks = {
-    .lock = PTHREAD_MUTEX_INITIALIZER,
-};
-
-static void pthread_atfork_clear(void) {
-  struct AtFork *a, *b;
-  for (a = _atforks.list; a; a = b) {
-    b = a->p[0];
-    free(a);
-  }
-}
-
-static void pthread_atfork_init(void) {
-  atexit(pthread_atfork_clear);
-}
-
-static void _pthread_onfork(int i, const char *op) {
-  struct AtFork *a;
-  for (a = _atforks.list; a; a = a->p[!i]) {
-    if (a->f[i]) {
-      STRACE("pthread_atfork(%s, %t)", op, a->f[i]);
-      a->f[i]();
-    }
-    _atforks.list = a;
-  }
-}
-
-void _pthread_onfork_prepare(void) {
-  _pthread_mutex_lock(&_atforks.lock);
-  _pthread_onfork(0, "prepare");
-}
-
-void _pthread_onfork_parent(void) {
-  _pthread_onfork(1, "parent");
-  _pthread_mutex_unlock(&_atforks.lock);
-}
-
-void _pthread_onfork_child(void) {
-  _pthread_mutex_wipe_np(&_atforks.lock);
-  _pthread_onfork(2, "child");
-}
-
-/**
- * Registers fork callbacks.
- *
- * When fork happens, your prepare functions will be called in the
- * reverse order they were registered. Then, in the parent and child
- * processes, their callbacks will be called in the same order they were
- * registered.
- *
- * One big caveat with fork() is that it hard kills all threads except
- * the calling thread. So let's say one of those threads was printing to
- * stdout while it was killed. In that case, the stdout lock will still
- * be held when your child process comes alive, which means that the
- * child will deadlock next time it tries to print.
- *
- * The solution for that is simple. Every lock in your process should be
- * registered with this interface. However there's one highly important
- * thing you need to know. Locks must follow a consistent hierarchy. So
- * the order in which you register locks matters. If nested locks aren't
- * acquired in the same order globally, then rarely occurring deadlocks
- * will happen. So what we recommend is that you hunt down all the locks
- * that exist in your app and its dependencies, and register them all at
- * once from your main() function at startup. This ensures a clear order
- * and if you aren't sure what that order should be, cosmo libc has got
- * you covered. Simply link your program with the `cosmocc -mdbg` flag
- * and cosmo will detect locking violations with your `pthread_mutex_t`
- * objects and report them by printing the strongly connected component.
- * This will include the demangled symbol name of each mutex, assuming
- * the `pthread_mutex_t` objects are stored in static memory. cosmo.h
- * also exposes a deadlock API that lets you incorporate your own lock
- * object types into this error checking system, which we also use to
- * verify the entire libc runtime itself. See libc/intrin/deadlock.c.
- *
- * Special care should be taken when using this interface in libraries.
- * While it may seem tempting to use something like a `__constructor__`
- * attribute to register your mutexes in a clean and abstracted way, it
- * is only appropriate if your mutex is guarding pure memory operations
- * and poses zero risk of nesting with locks outside your library. For
- * example, calling open() or printf() while holding your lock will do
- * just that, since the C runtime functions you may consider pure will
- * actually use mutexes under the hood, which are also validated under
- * `cosmocc -mdbg` builds. So if your locks can't be made unnestable
- * pure memory operations, then you should consider revealing their
- * existence to users of your library.
- *
- * Here's an example of how pthread_atfork() can be used:
- *
- *     static struct {
- *       pthread_once_t once;
- *       pthread_mutex_t lock;
- *       // data structures...
- *     } g_lib;
- *
- *     static void lib_lock(void) {
- *       pthread_mutex_lock(&g_lib.lock);
- *     }
- *
- *     static void lib_unlock(void) {
- *       pthread_mutex_unlock(&g_lib.lock);
- *     }
- *
- *     static void lib_wipe(void) {
- *       pthread_mutex_wipe_np(&g_lib.lock);
- *     }
- *
- *     static void lib_setup(void) {
- *       pthread_mutex_init(&g_lib.lock, 0);
- *       pthread_atfork(lib_lock, lib_unlock, lib_wipe);
- *     }
- *
- *     static void lib_init(void) {
- *       pthread_once(&g_lib.once, lib_setup);
- *     }
- *
- *     void lib(void) {
- *       lib_init();
- *       lib_lock();
- *       // do stuff...
- *       lib_unlock();
- *     }
- *
- * @param prepare is run by fork() before forking happens
- * @param parent is run by fork() after forking happens in parent process
- * @param child is run by fork() after forking happens in childe process
- * @return 0 on success, or errno on error
- * @raise ENOMEM if we require more vespene gas
- */
-int pthread_atfork(atfork_f prepare, atfork_f parent, atfork_f child) {
-  cosmo_once(&_atforks.once, pthread_atfork_init);
-  struct AtFork *a;
-  if (!(a = calloc(1, sizeof(struct AtFork))))
-    return ENOMEM;
-  a->f[0] = prepare;
-  a->f[1] = parent;
-  a->f[2] = child;
-  _pthread_mutex_lock(&_atforks.lock);
-  a->p[0] = 0;
-  a->p[1] = _atforks.list;
-  if (_atforks.list)
-    _atforks.list->p[0] = a;
-  _atforks.list = a;
-  _pthread_mutex_unlock(&_atforks.lock);
-  return 0;
-}
diff --git a/libc/thread/pthread_attr_getguardsize.c b/libc/thread/pthread_attr_getguardsize.c
index ba10c3014..fd4524efb 100644
--- a/libc/thread/pthread_attr_getguardsize.c
+++ b/libc/thread/pthread_attr_getguardsize.c
@@ -19,7 +19,7 @@
 #include "libc/thread/thread.h"
 
 /**
- * Returns size of protected region beneath thread stack.
+ * Returns size of protected region at bottom of thread stack.
  *
  * @param guardsize will be set to guard size in bytes
  * @return 0 on success, or errno on error
diff --git a/libc/thread/pthread_attr_getstack.c b/libc/thread/pthread_attr_getstack.c
index 27c744d81..8b9a9c06d 100644
--- a/libc/thread/pthread_attr_getstack.c
+++ b/libc/thread/pthread_attr_getstack.c
@@ -20,13 +20,15 @@
 #include "libc/thread/thread.h"
 
 /**
- * Returns configuration for custom thread stack.
+ * Returns configuration for thread stack.
  *
- * If zero is returned to `*stackaddr` then a custom stack hasn't been
- * specified by a previous call to pthread_attr_setstack().
+ * This is a getter for a configuration attribute. By default, zeros are
+ * returned. If pthread_attr_setstack() was called earlier, then this'll
+ * return those earlier supplied values.
  *
  * @param stackaddr will be set to stack address in bytes
  * @return 0 on success, or errno on error
+ * @see pthread_attr_setstacksize()
  */
 errno_t pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr,
                               size_t *stacksize) {
diff --git a/libc/thread/pthread_attr_init.c b/libc/thread/pthread_attr_init.c
index 4ef1e9207..b4c82204e 100644
--- a/libc/thread/pthread_attr_init.c
+++ b/libc/thread/pthread_attr_init.c
@@ -34,13 +34,11 @@
  * @see pthread_attr_setschedpolicy()
  * @see pthread_attr_setinheritsched()
  * @see pthread_attr_setscope()
- * @see pthread_attr_setsigaltstack_np()
- * @see pthread_attr_setsigaltstacksize_np()
  */
 errno_t pthread_attr_init(pthread_attr_t *attr) {
   *attr = (pthread_attr_t){
       .__stacksize = GetStackSize(),
-      .__guardsize = GetGuardSize(),
+      .__guardsize = __pagesize,
   };
   return 0;
 }
diff --git a/libc/thread/pthread_attr_setdetachstate.c b/libc/thread/pthread_attr_setdetachstate.c
index e9a57a084..253f04495 100644
--- a/libc/thread/pthread_attr_setdetachstate.c
+++ b/libc/thread/pthread_attr_setdetachstate.c
@@ -28,10 +28,6 @@
  *     pthread_create(0, &attr, func, 0);
  *     pthread_attr_destroy(&attr);
  *
- * If you use this, please be warned that your thread might run and exit
- * before pthread_create() even returns. You really should assume it can
- * not be used with any pthread APIs from the calling thread.
- *
  * @param detachstate can be one of
  *     - `PTHREAD_CREATE_JOINABLE` (default)
  *     - `PTHREAD_CREATE_DETACHED`
diff --git a/libc/thread/pthread_attr_setguardsize.c b/libc/thread/pthread_attr_setguardsize.c
index 4b776cdd9..e404ea04f 100644
--- a/libc/thread/pthread_attr_setguardsize.c
+++ b/libc/thread/pthread_attr_setguardsize.c
@@ -19,7 +19,13 @@
 #include "libc/thread/thread.h"
 
 /**
- * Sets minimum size of protected region beneath thread stack.
+ * Sets size of protected region at bottom of thread stack.
+ *
+ * Cosmopolitan sets this value to `sysconf(_SC_PAGESIZE)` by default.
+ *
+ * You may set `guardsize` to disable the stack guard feature and gain a
+ * slight performance advantage by avoiding mprotect() calls. Note that
+ * it could make your code more prone to silent unreported corruption.
  *
  * @param guardsize contains guard size in bytes, which is implicitly
  *     rounded up to `sysconf(_SC_PAGESIZE)`, or zero to disable
diff --git a/libc/thread/pthread_attr_setsigmask_np.c b/libc/thread/pthread_attr_setsigmask_np.c
index a42e8b055..b46c94e57 100644
--- a/libc/thread/pthread_attr_setsigmask_np.c
+++ b/libc/thread/pthread_attr_setsigmask_np.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/sysv/consts/sig.h"
 #include "libc/thread/thread2.h"
 
 /**
diff --git a/libc/thread/pthread_attr_setstack.c b/libc/thread/pthread_attr_setstack.c
index 9017362af..8bfaed866 100644
--- a/libc/thread/pthread_attr_setstack.c
+++ b/libc/thread/pthread_attr_setstack.c
@@ -16,42 +16,64 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/runtime/stack.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
 
 /**
- * Configures custom stack for thread.
+ * Configures custom allocated stack for thread, e.g.
  *
- * Normally you want to use pthread_attr_setstacksize() and
- * pthread_attr_setguardsize() to configure how pthread_create()
- * allocates stack memory for newly created threads. Cosmopolitan is
- * very good at managing stack memory. However if you still want to
- * allocate stack memory on your own, POSIX defines this function.
+ *     pthread_t id;
+ *     pthread_attr_t attr;
+ *     char *stk = NewCosmoStack();
+ *     pthread_attr_init(&attr);
+ *     pthread_attr_setstack(&attr, stk, GetStackSize());
+ *     pthread_create(&id, &attr, func, 0);
+ *     pthread_attr_destroy(&attr);
+ *     pthread_join(id, 0);
+ *     FreeCosmoStack(stk);
  *
- * Your `stackaddr` points to the byte at the very bottom of your stack.
- * You are responsible for this memory. Your POSIX threads runtime will
- * not free or unmap this allocation when the thread has terminated. If
- * `stackaddr` is null then `stacksize` is ignored and default behavior
- * is restored, i.e. pthread_create() will manage stack allocations.
+ * Your stack must have at least `PTHREAD_STACK_MIN` bytes, which
+ * Cosmpolitan Libc defines as `GetStackSize()`. It's a link-time
+ * constant used by Actually Portable Executable that's 128 kb by
+ * default. See libc/runtime/stack.h for docs on your stack limit
+ * since the APE ELF phdrs are the one true source of truth here.
  *
- * Your `stackaddr` could be created by malloc(). On OpenBSD,
- * pthread_create() will augment your custom allocation so it's
- * permissable by the kernel to use as a stack. You may also call
- * Cosmopolitan APIs such NewCosmoStack() and cosmo_stack_alloc().
- * Static memory can be used, but it won't reduce pthread footprint.
+ * Cosmpolitan Libc runtime magic (e.g. ftrace) and memory safety
+ * (e.g. kprintf) assumes that stack sizes are two-powers and are
+ * aligned to that two-power. Conformance isn't required since we
+ * say caveat emptor to those who don't maintain these invariants
+ * please consider using NewCosmoStack(), which is always perfect
+ * or use `mmap(0, GetStackSize() << 1, ...)` for a bigger stack.
  *
+ * Unlike pthread_attr_setstacksize(), this function permits just
+ * about any parameters and will change the values and allocation
+ * as needed to conform to the mandatory requirements of the host
+ * operating system even if it doesn't meet the stricter needs of
+ * Cosmopolitan Libc userspace libraries. For example with malloc
+ * allocations, things like page size alignment, shall be handled
+ * automatically for compatibility with existing codebases.
+ *
+ * The same stack shouldn't be used for two separate threads. Use
+ * fresh stacks for each thread so that ASAN can be much happier.
+ *
+ * @param stackaddr is address of stack allocated by caller, and
+ *     may be NULL in which case default behavior is restored
+ * @param stacksize is size of caller allocated stack
  * @return 0 on success, or errno on error
- * @raise EINVAL if `stacksize` is less than `PTHREAD_STACK_MIN`
+ * @raise EINVAL if parameters were unacceptable
  * @see pthread_attr_setstacksize()
  */
 errno_t pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr,
                               size_t stacksize) {
   if (!stackaddr) {
     attr->__stackaddr = 0;
-    attr->__stacksize = GetStackSize();
+    attr->__stacksize = 0;
     return 0;
   }
+  if (stacksize > INT_MAX)
+    return EINVAL;
   if (stacksize < PTHREAD_STACK_MIN)
     return EINVAL;
   attr->__stackaddr = stackaddr;
diff --git a/libc/thread/pthread_attr_setstacksize.c b/libc/thread/pthread_attr_setstacksize.c
index 217b62fad..58e69eb15 100644
--- a/libc/thread/pthread_attr_setstacksize.c
+++ b/libc/thread/pthread_attr_setstacksize.c
@@ -17,36 +17,19 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
 
 /**
- * Specifies minimum stack size for thread, e.g.
- *
- *     pthread_t th;
- *     pthread_attr_t attr;
- *     pthread_attr_init(&attr);
- *     pthread_attr_setguardsize(&attr, 4096);
- *     pthread_attr_setstacksize(&attr, 61440);
- *     pthread_create(&th, &attr, thfunc, arg);
- *     pthread_attr_destroy(&attr);
- *
- * On Linux, if you're not using `cosmocc -mtiny`, and you're not using
- * cosmo_dlopen(), and guard size is nonzero, then `MAP_GROWSDOWN` will
- * be used to create your stack memory. This helps minimize virtual
- * memory consumption. Please note this is only possible if `stacksize`
- * is no larger than the current `RLIMIT_STACK`, otherwise the runtime
- * will map your stack using plain old mmap().
- *
- * Non-custom stacks may be recycled by the cosmo runtime. You can
- * control this behavior by calling cosmo_stack_setmaxstacks(). It's
- * useful for both tuning performance and hardening security. See also
- * pthread_attr_setguardsize() which is important for security too.
+ * Defines minimum stack size for thread.
  *
  * @param stacksize contains stack size in bytes
  * @return 0 on success, or errno on error
  * @raise EINVAL if `stacksize` is less than `PTHREAD_STACK_MIN`
  */
 errno_t pthread_attr_setstacksize(pthread_attr_t *a, size_t stacksize) {
+  if (stacksize > INT_MAX)
+    return EINVAL;
   if (stacksize < PTHREAD_STACK_MIN)
     return EINVAL;
   a->__stacksize = stacksize;
diff --git a/libc/thread/pthread_barrier_wait.c b/libc/thread/pthread_barrier_wait.c
index 5a318feed..27f64773e 100644
--- a/libc/thread/pthread_barrier_wait.c
+++ b/libc/thread/pthread_barrier_wait.c
@@ -17,11 +17,11 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/blockcancel.internal.h"
-#include "libc/cosmo.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
 #include "libc/limits.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 
 /**
  * Waits for all threads to arrive at barrier.
@@ -54,14 +54,14 @@ errno_t pthread_barrier_wait(pthread_barrier_t *barrier) {
     atomic_store_explicit(&barrier->_counter, barrier->_count,
                           memory_order_release);
     atomic_store_explicit(&barrier->_waiters, 0, memory_order_release);
-    cosmo_futex_wake(&barrier->_waiters, INT_MAX, barrier->_pshared);
+    nsync_futex_wake_(&barrier->_waiters, INT_MAX, barrier->_pshared);
     return PTHREAD_BARRIER_SERIAL_THREAD;
   }
 
   // wait for everyone else to arrive at barrier
   BLOCK_CANCELATION;
   while ((n = atomic_load_explicit(&barrier->_waiters, memory_order_acquire)))
-    cosmo_futex_wait(&barrier->_waiters, n, barrier->_pshared, 0, 0);
+    nsync_futex_wait_(&barrier->_waiters, n, barrier->_pshared, 0);
   ALLOW_CANCELATION;
 
   return 0;
diff --git a/libc/thread/pthread_cancel.c b/libc/thread/pthread_cancel.c
index 2ac5cf305..5ddbea0db 100644
--- a/libc/thread/pthread_cancel.c
+++ b/libc/thread/pthread_cancel.c
@@ -188,6 +188,7 @@ static errno_t _pthread_cancel_everyone(void) {
  * - `connect`
  * - `copy_file_range`
  * - `creat`
+ * - `epoll_wait`
  * - `fcntl(F_OFD_SETLKW)`
  * - `fcntl(F_SETLKW)`
  * - `fdatasync`
diff --git a/libc/thread/pthread_cond_broadcast.c b/libc/thread/pthread_cond_broadcast.c
index 894a76fb4..b757c867c 100644
--- a/libc/thread/pthread_cond_broadcast.c
+++ b/libc/thread/pthread_cond_broadcast.c
@@ -16,16 +16,11 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cosmo.h"
-#include "libc/dce.h"
 #include "libc/intrin/atomic.h"
 #include "libc/limits.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/cv.h"
-
-__static_yoink("nsync_mu_lock");
-__static_yoink("nsync_mu_unlock");
-__static_yoink("nsync_mu_trylock");
+#include "third_party/nsync/futex.internal.h"
 
 /**
  * Wakes all threads waiting on condition, e.g.
@@ -47,15 +42,10 @@ __static_yoink("nsync_mu_trylock");
 errno_t pthread_cond_broadcast(pthread_cond_t *cond) {
 
 #if PTHREAD_USE_NSYNC
-  // do nothing if pthread_cond_timedwait() hasn't been called yet
-  // this is because we dont know for certain if nsync use is safe
-  if (!atomic_load_explicit(&cond->_waited, memory_order_acquire))
-    return 0;
-
   // favor *NSYNC if this is a process private condition variable
   // if using Mike Burrows' code isn't possible, use a naive impl
-  if (!cond->_footek) {
-    nsync_cv_broadcast((nsync_cv *)cond->_nsync);
+  if (!cond->_pshared) {
+    nsync_cv_broadcast((nsync_cv *)cond);
     return 0;
   }
 #endif
@@ -63,6 +53,6 @@ errno_t pthread_cond_broadcast(pthread_cond_t *cond) {
   // roll forward the monotonic sequence
   atomic_fetch_add_explicit(&cond->_sequence, 1, memory_order_acq_rel);
   if (atomic_load_explicit(&cond->_waiters, memory_order_acquire))
-    cosmo_futex_wake((atomic_int *)&cond->_sequence, INT_MAX, cond->_pshared);
+    nsync_futex_wake_((atomic_int *)&cond->_sequence, INT_MAX, cond->_pshared);
   return 0;
 }
diff --git a/libc/thread/pthread_cond_destroy.c b/libc/thread/pthread_cond_destroy.c
index bb0783671..c5a180e4a 100644
--- a/libc/thread/pthread_cond_destroy.c
+++ b/libc/thread/pthread_cond_destroy.c
@@ -33,7 +33,7 @@ errno_t pthread_cond_destroy(pthread_cond_t *cond) {
   // check if there's active waiters
 #if PTHREAD_USE_NSYNC
   if (!cond->_pshared) {
-    if (((nsync_cv *)cond->_nsync)->waiters)
+    if (((nsync_cv *)cond)->waiters)
       return EINVAL;
   } else {
     if (atomic_load_explicit(&cond->_waiters, memory_order_relaxed))
diff --git a/libc/thread/pthread_cond_init.c b/libc/thread/pthread_cond_init.c
index 731ff48c1..8f6fbe298 100644
--- a/libc/thread/pthread_cond_init.c
+++ b/libc/thread/pthread_cond_init.c
@@ -16,12 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/dce.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/thread/thread.h"
 
 /**
- * Initializes condition variable.
+ * Initializes condition.
  *
  * @param attr may be null
  * @return 0 on success, or error number on failure
@@ -29,9 +27,7 @@
 errno_t pthread_cond_init(pthread_cond_t *cond,
                           const pthread_condattr_t *attr) {
   *cond = (pthread_cond_t){0};
-  if (attr) {
-    cond->_pshared = attr->_pshared;
-    cond->_clock = attr->_clock;
-  }
+  if (attr)
+    cond->_pshared = *attr;
   return 0;
 }
diff --git a/libc/thread/pthread_cond_signal.c b/libc/thread/pthread_cond_signal.c
index fe6244d1e..e2a615df0 100644
--- a/libc/thread/pthread_cond_signal.c
+++ b/libc/thread/pthread_cond_signal.c
@@ -16,15 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cosmo.h"
-#include "libc/dce.h"
 #include "libc/intrin/atomic.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/cv.h"
-
-__static_yoink("nsync_mu_lock");
-__static_yoink("nsync_mu_unlock");
-__static_yoink("nsync_mu_trylock");
+#include "third_party/nsync/futex.internal.h"
 
 /**
  * Wakes at least one thread waiting on condition, e.g.
@@ -43,18 +38,13 @@ __static_yoink("nsync_mu_trylock");
  * @see pthread_cond_broadcast
  * @see pthread_cond_wait
  */
-errno_t _pthread_cond_signal(pthread_cond_t *cond) {
+errno_t pthread_cond_signal(pthread_cond_t *cond) {
 
 #if PTHREAD_USE_NSYNC
-  // do nothing if pthread_cond_timedwait() hasn't been called yet
-  // this is because we dont know for certain if nsync use is safe
-  if (!atomic_load_explicit(&cond->_waited, memory_order_acquire))
-    return 0;
-
   // favor *NSYNC if this is a process private condition variable
   // if using Mike Burrows' code isn't possible, use a naive impl
-  if (!cond->_footek) {
-    nsync_cv_signal((nsync_cv *)cond->_nsync);
+  if (!cond->_pshared) {
+    nsync_cv_signal((nsync_cv *)cond);
     return 0;
   }
 #endif
@@ -62,8 +52,6 @@ errno_t _pthread_cond_signal(pthread_cond_t *cond) {
   // roll forward the monotonic sequence
   atomic_fetch_add_explicit(&cond->_sequence, 1, memory_order_acq_rel);
   if (atomic_load_explicit(&cond->_waiters, memory_order_acquire))
-    cosmo_futex_wake((atomic_int *)&cond->_sequence, 1, cond->_pshared);
+    nsync_futex_wake_((atomic_int *)&cond->_sequence, 1, cond->_pshared);
   return 0;
 }
-
-__weak_reference(_pthread_cond_signal, pthread_cond_signal);
diff --git a/libc/thread/pthread_cond_timedwait.c b/libc/thread/pthread_cond_timedwait.c
index 9e4daff39..22ea8c240 100644
--- a/libc/thread/pthread_cond_timedwait.c
+++ b/libc/thread/pthread_cond_timedwait.c
@@ -18,38 +18,24 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/calls/cp.internal.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/cosmo.h"
-#include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/intrin/atomic.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/thread2.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/cv.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/time.h"
 
-__static_yoink("nsync_mu_lock");
-__static_yoink("nsync_mu_unlock");
-__static_yoink("nsync_mu_trylock");
-
 struct PthreadWait {
   pthread_cond_t *cond;
   pthread_mutex_t *mutex;
 };
 
-static bool can_use_nsync(uint64_t muword) {
-  return !IsXnuSilicon() &&  //
-         MUTEX_TYPE(muword) != PTHREAD_MUTEX_RECURSIVE &&
-         MUTEX_PSHARED(muword) == PTHREAD_PROCESS_PRIVATE;
-}
-
 static void pthread_cond_leave(void *arg) {
   struct PthreadWait *wait = (struct PthreadWait *)arg;
-  if (_pthread_mutex_lock(wait->mutex))
+  if (pthread_mutex_lock(wait->mutex))
     __builtin_trap();
   atomic_fetch_sub_explicit(&wait->cond->_waiters, 1, memory_order_acq_rel);
 }
@@ -68,15 +54,15 @@ static errno_t pthread_cond_timedwait_impl(pthread_cond_t *cond,
 
   // start waiting on condition variable
   atomic_fetch_add_explicit(&cond->_waiters, 1, memory_order_acq_rel);
-  if (_pthread_mutex_unlock(mutex))
+  if (pthread_mutex_unlock(mutex))
     __builtin_trap();
 
   // wait for sequence change, timeout, or cancelation
   int rc;
   struct PthreadWait waiter = {cond, mutex};
   pthread_cleanup_push(pthread_cond_leave, &waiter);
-  rc = cosmo_futex_wait((atomic_int *)&cond->_sequence, seq1, cond->_pshared,
-                        cond->_clock, abstime);
+  rc = nsync_futex_wait_((atomic_int *)&cond->_sequence, seq1, cond->_pshared,
+                         abstime);
   pthread_cleanup_pop(true);
   if (rc == -EAGAIN)
     rc = 0;
@@ -95,10 +81,8 @@ static errno_t pthread_cond_timedwait_impl(pthread_cond_t *cond,
  *     }
  *
  * @param mutex needs to be held by thread when calling this function
- * @param abstime is an absolute timestamp, which may be null to wait
- *     forever; it's relative to `clock_gettime(CLOCK_REALTIME)` by
- *     default; pthread_condattr_setclock() may be used to customize
- *     which system clock is used
+ * @param abstime may be null to wait indefinitely and should contain
+ *     some arbitrary interval added to a `CLOCK_REALTIME` timestamp
  * @return 0 on success, or errno on error
  * @raise ETIMEDOUT if `abstime` was specified and the current time
  *     exceeded its value
@@ -110,8 +94,8 @@ static errno_t pthread_cond_timedwait_impl(pthread_cond_t *cond,
  * @see pthread_cond_signal()
  * @cancelationpoint
  */
-errno_t _pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
-                                const struct timespec *abstime) {
+errno_t pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
+                               const struct timespec *abstime) {
 
   // validate arguments
   struct PosixThread *pt;
@@ -124,37 +108,23 @@ errno_t _pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
   uint64_t muword = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
 
   // check that mutex is held by caller
-  if (IsModeDbg() || MUTEX_TYPE(muword) == PTHREAD_MUTEX_ERRORCHECK)
-    if (__deadlock_tracked(mutex) == 0)
-      return EPERM;
+  if (MUTEX_TYPE(muword) == PTHREAD_MUTEX_ERRORCHECK &&
+      MUTEX_OWNER(muword) != gettid())
+    return EPERM;
 
-  // if the cond is process shared then the mutex needs to be too
-  if ((cond->_pshared == PTHREAD_PROCESS_SHARED) ^
-      (MUTEX_PSHARED(muword) == PTHREAD_PROCESS_SHARED))
-    return EINVAL;
-
-#if PTHREAD_USE_NSYNC
-  // the first time pthread_cond_timedwait() is called we learn if the
-  // associated mutex is normal and private. that means *NSYNC is safe
-  // this decision is permanent. you can't use a recursive mutex later
-  if (!atomic_load_explicit(&cond->_waited, memory_order_acquire)) {
-    cond->_footek = !can_use_nsync(muword);
-    atomic_store_explicit(&cond->_waited, true, memory_order_release);
-  } else if (!cond->_footek) {
-    if (!can_use_nsync(muword))
+  // if condition variable is shared then mutex must be too
+  if (cond->_pshared)
+    if (MUTEX_PSHARED(muword) != PTHREAD_PROCESS_SHARED)
       return EINVAL;
-  }
-#endif
 
-  // now perform the actual wait
   errno_t err;
   BEGIN_CANCELATION_POINT;
 #if PTHREAD_USE_NSYNC
   // favor *NSYNC if this is a process private condition variable
   // if using Mike Burrows' code isn't possible, use a naive impl
-  if (!cond->_footek) {
+  if (!cond->_pshared) {
     err = nsync_cv_wait_with_deadline(
-        (nsync_cv *)cond->_nsync, (nsync_mu *)mutex->_nsync, cond->_clock,
+        (nsync_cv *)cond, (nsync_mu *)mutex,
         abstime ? *abstime : nsync_time_no_deadline, 0);
   } else {
     err = pthread_cond_timedwait_impl(cond, mutex, abstime);
@@ -165,5 +135,3 @@ errno_t _pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
   END_CANCELATION_POINT;
   return err;
 }
-
-__weak_reference(_pthread_cond_timedwait, pthread_cond_timedwait);
diff --git a/libc/thread/pthread_cond_wait.c b/libc/thread/pthread_cond_wait.c
index e6ffd619c..df7d42dd1 100644
--- a/libc/thread/pthread_cond_wait.c
+++ b/libc/thread/pthread_cond_wait.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/thread2.h"
 
@@ -40,8 +39,6 @@
  * @see pthread_cond_signal
  * @cancelationpoint
  */
-errno_t _pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) {
+errno_t pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) {
   return pthread_cond_timedwait(cond, mutex, 0);
 }
-
-__weak_reference(_pthread_cond_wait, pthread_cond_wait);
diff --git a/libc/thread/pthread_condattr_getpshared.c b/libc/thread/pthread_condattr_getpshared.c
index e1b5dc7fc..81963c4c3 100644
--- a/libc/thread/pthread_condattr_getpshared.c
+++ b/libc/thread/pthread_condattr_getpshared.c
@@ -28,6 +28,6 @@
  */
 errno_t pthread_condattr_getpshared(const pthread_condattr_t *attr,
                                     int *pshared) {
-  *pshared = attr->_pshared;
+  *pshared = *attr;
   return 0;
 }
diff --git a/libc/thread/pthread_condattr_init.c b/libc/thread/pthread_condattr_init.c
index cd377f087..575912210 100644
--- a/libc/thread/pthread_condattr_init.c
+++ b/libc/thread/pthread_condattr_init.c
@@ -24,6 +24,6 @@
  * @return 0 on success, or error on failure
  */
 errno_t pthread_condattr_init(pthread_condattr_t *attr) {
-  *attr = (pthread_condattr_t){0};
+  *attr = 0;
   return 0;
 }
diff --git a/libc/thread/pthread_condattr_setpshared.c b/libc/thread/pthread_condattr_setpshared.c
index f109fe518..8d59b2fa7 100644
--- a/libc/thread/pthread_condattr_setpshared.c
+++ b/libc/thread/pthread_condattr_setpshared.c
@@ -32,7 +32,7 @@ errno_t pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared) {
   switch (pshared) {
     case PTHREAD_PROCESS_SHARED:
     case PTHREAD_PROCESS_PRIVATE:
-      attr->_pshared = pshared;
+      *attr = pshared;
       return 0;
     default:
       return EINVAL;
diff --git a/libc/thread/pthread_create.c b/libc/thread/pthread_create.c
index 974a3b592..6f9c86469 100644
--- a/libc/thread/pthread_create.c
+++ b/libc/thread/pthread_create.c
@@ -18,12 +18,10 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
-#include "libc/calls/sig.internal.h"
 #include "libc/calls/struct/sigaltstack.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/syscall-sysv.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
@@ -31,11 +29,10 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/dll.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/intrin/stack.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/log/internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alloca.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/crc32.h"
@@ -47,27 +44,30 @@
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
 #include "libc/runtime/syslib.internal.h"
-#include "libc/str/locale.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/auxv.h"
 #include "libc/sysv/consts/clone.h"
+#include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/consts/ss.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
-#include "third_party/nsync/wait_s.internal.h"
 
 __static_yoink("nsync_mu_lock");
 __static_yoink("nsync_mu_unlock");
 __static_yoink("nsync_mu_trylock");
 __static_yoink("nsync_mu_rlock");
 __static_yoink("nsync_mu_runlock");
+__static_yoink("_pthread_atfork");
 __static_yoink("_pthread_onfork_prepare");
 __static_yoink("_pthread_onfork_parent");
 __static_yoink("_pthread_onfork_child");
 
+#define MAP_ANON_OPENBSD  0x1000
+#define MAP_STACK_OPENBSD 0x4000
+
 void _pthread_free(struct PosixThread *pt) {
 
   // thread must be removed from _pthread_list before calling
@@ -79,17 +79,12 @@ void _pthread_free(struct PosixThread *pt) {
 
   // unmap stack if the cosmo runtime was responsible for mapping it
   if (pt->pt_flags & PT_OWNSTACK)
-    cosmo_stack_free(pt->pt_attr.__stackaddr, pt->pt_attr.__stacksize,
-                     pt->pt_attr.__guardsize);
-
-  // reclaim thread's cached nsync waiter object
-  if (pt->tib->tib_nsync)
-    nsync_waiter_destroy_(pt->tib->tib_nsync);
+    unassert(!munmap(pt->pt_attr.__stackaddr, pt->pt_attr.__stacksize));
 
   // free any additional upstream system resources
   // our fork implementation wipes this handle in child automatically
   uint64_t syshand =
-      atomic_load_explicit(&pt->tib->tib_syshand, memory_order_relaxed);
+      atomic_load_explicit(&pt->tib->tib_syshand, memory_order_acquire);
   if (syshand) {
     if (IsWindows())
       unassert(CloseHandle(syshand));  // non-inheritable
@@ -98,16 +93,13 @@ void _pthread_free(struct PosixThread *pt) {
   }
 
   // free heap memory associated with thread
-  bulk_free(
-      (void *[]){
-          pt->pt_flags & PT_OWNSIGALTSTACK ? pt->pt_attr.__sigaltstackaddr : 0,
-          pt->pt_tls,
-          pt,
-      },
-      3);
+  if (pt->pt_flags & PT_OWNSIGALTSTACK)
+    free(pt->pt_attr.__sigaltstackaddr);
+  free(pt->pt_tls);
+  free(pt);
 }
 
-void _pthread_decimate(enum PosixThreadStatus threshold) {
+void _pthread_decimate(bool annihilation_only) {
   struct PosixThread *pt;
   struct Dll *e, *e2, *list = 0;
   enum PosixThreadStatus status;
@@ -122,22 +114,26 @@ void _pthread_decimate(enum PosixThreadStatus threshold) {
     pt = POSIXTHREAD_CONTAINER(e);
     if (atomic_load_explicit(&pt->pt_refs, memory_order_acquire) > 0)
       continue;  // pthread_kill() has a lease on this thread
-    if (atomic_load_explicit(&pt->tib->tib_ctid, memory_order_acquire))
-      continue;  // thread is still using stack so leave alone
     status = atomic_load_explicit(&pt->pt_status, memory_order_acquire);
-    if (status < threshold) {
-      if (threshold == kPosixThreadZombie)
-        break;  // zombies only exist at the end of the linked list
-      continue;
-    }
-    if (status == kPosixThreadTerminated)
-      if (!(pt->pt_flags & PT_STATIC))
-        STRACE("warning: you forgot to join or detach thread id %d",
-               atomic_load_explicit(&pt->tib->tib_ptid, memory_order_acquire));
+    if (status != kPosixThreadZombie)
+      break;  // zombies only exist at the end of the linked list
+    if (atomic_load_explicit(&pt->tib->tib_tid, memory_order_acquire))
+      continue;  // undead thread that should stop existing soon
     dll_remove(&_pthread_list, e);
     dll_make_first(&list, e);
   }
 
+  // code like pthread_exit() needs to call this in order to know if
+  // it's appropriate to run exit() handlers however we really don't
+  // want to have a thread exiting block on a bunch of __maps locks!
+  // therefore we only take action if we'll destroy all but the self
+  if (annihilation_only)
+    if (!(_pthread_list == _pthread_list->prev &&
+          _pthread_list == _pthread_list->next)) {
+      dll_make_last(&_pthread_list, list);
+      list = 0;
+    }
+
   // release posix threads gil
   _pthread_unlock();
 
@@ -151,7 +147,8 @@ void _pthread_decimate(enum PosixThreadStatus threshold) {
   }
 }
 
-static int PosixThread(void *arg) {
+static int PosixThread(void *arg, int tid) {
+  void *rc;
   struct PosixThread *pt = arg;
 
   // setup scheduling
@@ -162,30 +159,27 @@ static int PosixThread(void *arg) {
 
   // setup signal stack
   if (pt->pt_attr.__sigaltstacksize) {
-    struct sigaltstack *ss = alloca(sizeof(struct sigaltstack));
-    ss->ss_sp = pt->pt_attr.__sigaltstackaddr;
-    ss->ss_size = pt->pt_attr.__sigaltstacksize;
-    ss->ss_flags = 0;
-    unassert(!sigaltstack(ss, 0));
+    struct sigaltstack ss;
+    ss.ss_sp = pt->pt_attr.__sigaltstackaddr;
+    ss.ss_size = pt->pt_attr.__sigaltstacksize;
+    ss.ss_flags = 0;
+    unassert(!sigaltstack(&ss, 0));
   }
 
   // set long jump handler so pthread_exit can bring control back here
-  if (!__builtin_setjmp(pt->pt_exiter)) {
-    // setup signals for new thread
-    pt->pt_attr.__sigmask &= ~(1ull << (SIGTHR - 1));
+  if (!setjmp(pt->pt_exiter)) {
+    sigdelset(&pt->pt_attr.__sigmask, SIGTHR);
     if (IsWindows() || IsMetal()) {
       atomic_store_explicit(&__get_tls()->tib_sigmask, pt->pt_attr.__sigmask,
                             memory_order_release);
-      if (_weaken(__sig_check))
-        _weaken(__sig_check)();
     } else {
       sys_sigprocmask(SIG_SETMASK, &pt->pt_attr.__sigmask, 0);
     }
-    void *ret = pt->pt_start(pt->pt_val);
+    rc = pt->pt_start(pt->pt_arg);
     // ensure pthread_cleanup_pop(), and pthread_exit() popped cleanup
     unassert(!pt->pt_cleanup);
     // calling pthread_exit() will either jump back here, or call exit
-    pthread_exit(ret);
+    pthread_exit(rc);
   }
 
   // avoid signal handler being triggered after we trash our own stack
@@ -195,24 +189,59 @@ static int PosixThread(void *arg) {
   return 0;
 }
 
+static bool TellOpenbsdThisIsStackMemory(void *addr, size_t size) {
+  return __sys_mmap(
+             addr, size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_FIXED | MAP_ANON_OPENBSD | MAP_STACK_OPENBSD, -1,
+             0, 0) == addr;
+}
+
+// OpenBSD only permits RSP to occupy memory that's been explicitly
+// defined as stack memory, i.e. `lo <= %rsp < hi` must be the case
+static errno_t FixupCustomStackOnOpenbsd(pthread_attr_t *attr) {
+
+  // get interval
+  uintptr_t lo = (uintptr_t)attr->__stackaddr;
+  uintptr_t hi = lo + attr->__stacksize;
+
+  // squeeze interval
+  lo = (lo + __pagesize - 1) & -__pagesize;
+  hi = hi & -__pagesize;
+
+  // tell os it's stack memory
+  errno_t olderr = errno;
+  if (!TellOpenbsdThisIsStackMemory((void *)lo, hi - lo)) {
+    errno_t err = errno;
+    errno = olderr;
+    return err;
+  }
+
+  // update attributes with usable stack address
+  attr->__stackaddr = (void *)lo;
+  attr->__stacksize = hi - lo;
+  return 0;
+}
+
 static errno_t pthread_create_impl(pthread_t *thread,
                                    const pthread_attr_t *attr,
                                    void *(*start_routine)(void *), void *arg,
                                    sigset_t oldsigs) {
-  errno_t err;
+  int rc, e = errno;
   struct PosixThread *pt;
 
   // create posix thread object
-  if (!(pt = calloc(1, sizeof(struct PosixThread))))
+  if (!(pt = calloc(1, sizeof(struct PosixThread)))) {
+    errno = e;
     return EAGAIN;
+  }
   dll_init(&pt->list);
-  pt->pt_locale = &__global_locale;
   pt->pt_start = start_routine;
-  pt->pt_val = arg;
+  pt->pt_arg = arg;
 
   // create thread local storage memory
   if (!(pt->pt_tls = _mktls(&pt->tib))) {
     free(pt);
+    errno = e;
     return EAGAIN;
   }
 
@@ -229,25 +258,44 @@ static errno_t pthread_create_impl(pthread_t *thread,
     // caller supplied their own stack
     // assume they know what they're doing as much as possible
     if (IsOpenbsd()) {
-      if (!FixupCustomStackOnOpenbsd(&pt->pt_attr)) {
+      if ((rc = FixupCustomStackOnOpenbsd(&pt->pt_attr))) {
         _pthread_free(pt);
-        return EPERM;
+        return rc;
       }
     }
   } else {
     // cosmo is managing the stack
-    pt->pt_flags |= PT_OWNSTACK;
-    errno_t err =
-        cosmo_stack_alloc(&pt->pt_attr.__stacksize, &pt->pt_attr.__guardsize,
-                          &pt->pt_attr.__stackaddr);
-    if (err) {
+    int pagesize = __pagesize;
+    pt->pt_attr.__guardsize = ROUNDUP(pt->pt_attr.__guardsize, pagesize);
+    pt->pt_attr.__stacksize = pt->pt_attr.__stacksize;
+    if (pt->pt_attr.__guardsize + pagesize > pt->pt_attr.__stacksize) {
       _pthread_free(pt);
-      if (err == EINVAL || err == EOVERFLOW) {
+      return EINVAL;
+    }
+    pt->pt_attr.__stackaddr =
+        mmap(0, pt->pt_attr.__stacksize, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (pt->pt_attr.__stackaddr != MAP_FAILED) {
+      if (IsOpenbsd())
+        if (!TellOpenbsdThisIsStackMemory(pt->pt_attr.__stackaddr,
+                                          pt->pt_attr.__stacksize))
+          notpossible;
+      if (pt->pt_attr.__guardsize)
+        if (mprotect(pt->pt_attr.__stackaddr, pt->pt_attr.__guardsize,
+                     PROT_NONE | PROT_GUARD))
+          notpossible;
+    }
+    if (!pt->pt_attr.__stackaddr || pt->pt_attr.__stackaddr == MAP_FAILED) {
+      rc = errno;
+      _pthread_free(pt);
+      errno = e;
+      if (rc == EINVAL || rc == EOVERFLOW) {
         return EINVAL;
       } else {
         return EAGAIN;
       }
     }
+    pt->pt_flags |= PT_OWNSTACK;
   }
 
   // setup signal stack
@@ -256,7 +304,7 @@ static errno_t pthread_create_impl(pthread_t *thread,
       if (!(pt->pt_attr.__sigaltstackaddr =
                 malloc(pt->pt_attr.__sigaltstacksize))) {
         _pthread_free(pt);
-        return EAGAIN;
+        return errno;
       }
       pt->pt_flags |= PT_OWNSIGALTSTACK;
     }
@@ -279,41 +327,31 @@ static errno_t pthread_create_impl(pthread_t *thread,
                             memory_order_relaxed);
       break;
     default:
-      // pthread_attr_setdetachstate() makes this impossible
-      __builtin_unreachable();
+      _pthread_free(pt);
+      return EINVAL;
   }
 
-  // if pthread_attr_setdetachstate() was used then it's possible for
-  // the `pt` object to be freed before this clone call has returned!
-  atomic_store_explicit(&pt->pt_refs, 1, memory_order_relaxed);
-
   // add thread to global list
   // we add it to the beginning since zombies go at the end
   _pthread_lock();
   dll_make_first(&_pthread_list, &pt->list);
   _pthread_unlock();
 
-  // we don't normally do this, but it's important to write the result
-  // memory before spawning the thread, so it's visible to the threads
-  *thread = (pthread_t)pt;
-
   // launch PosixThread(pt) in new thread
-  if ((err = clone(
-           PosixThread, pt->pt_attr.__stackaddr, pt->pt_attr.__stacksize,
-           CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
-               CLONE_SYSVSEM | CLONE_SETTLS | CLONE_PARENT_SETTID |
-               CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
-           pt, &pt->tib->tib_ptid, __adj_tls(pt->tib), &pt->tib->tib_ctid))) {
-    *thread = 0;  // posix doesn't require we do this
+  if ((rc = clone(PosixThread, pt->pt_attr.__stackaddr, pt->pt_attr.__stacksize,
+                  CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES |
+                      CLONE_SIGHAND | CLONE_SYSVSEM | CLONE_SETTLS |
+                      CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+                      CLONE_CHILD_CLEARTID,
+                  pt, &pt->ptid, __adj_tls(pt->tib), &pt->tib->tib_tid))) {
     _pthread_lock();
     dll_remove(&_pthread_list, &pt->list);
     _pthread_unlock();
     _pthread_free(pt);
-    if (err == ENOMEM)
-      err = EAGAIN;
-    return err;
+    return rc;
   }
 
+  *thread = (pthread_t)pt;
   return 0;
 }
 
@@ -361,8 +399,8 @@ static const char *DescribeHandle(char buf[12], errno_t err, pthread_t *th) {
  *                 │ _lwp_create  │
  *                 └──────────────┘
  *
- * @param thread is used to output the thread id upon success, which
- *     must be non-null; upon failure, its value is undefined
+ * @param thread if non-null is used to output the thread id
+ *     upon successful completion
  * @param attr points to launch configuration, or may be null
  *     to use sensible defaults; it must be initialized using
  *     pthread_attr_init()
@@ -378,18 +416,12 @@ static const char *DescribeHandle(char buf[12], errno_t err, pthread_t *th) {
 errno_t pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                        void *(*start_routine)(void *), void *arg) {
   errno_t err;
-  errno_t olderr = errno;
-  _pthread_decimate(kPosixThreadZombie);
+  _pthread_decimate(false);
   BLOCK_SIGNALS;
   err = pthread_create_impl(thread, attr, start_routine, arg, _SigMask);
   ALLOW_SIGNALS;
   STRACE("pthread_create([%s], %p, %t, %p) → %s",
          DescribeHandle(alloca(12), err, thread), attr, start_routine, arg,
          DescribeErrno(err));
-  if (!err) {
-    _pthread_unref(*(struct PosixThread **)thread);
-  } else {
-    errno = olderr;
-  }
   return err;
 }
diff --git a/libc/thread/pthread_decimate_np.c b/libc/thread/pthread_decimate_np.c
index 8299db3a2..3027dc7fa 100644
--- a/libc/thread/pthread_decimate_np.c
+++ b/libc/thread/pthread_decimate_np.c
@@ -16,32 +16,22 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cosmo.h"
-#include "libc/intrin/stack.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
 /**
  * Garbage collects POSIX threads runtime.
  *
- * This function frees unreferenced zombie threads and empties cache
- * memory associated with the Cosmopolitan POSIX threads runtime.
- *
- * Here's an example use case for this function. Let's say you want to
- * create a malloc() memory leak detector. If your program was running
- * threads earlier, then there might still be allocations lingering
- * around, that'll give you false positives. To fix this, what you would
- * do is call the following, right before running your leak detector:
+ * Let's say you want to run a memory leak detector. You can say:
  *
  *     while (!pthread_orphan_np())
  *       pthread_decimate_np();
  *
- * Which will wait until all threads have exited and their memory freed.
+ * To wait until all threads have exited.
  *
  * @return 0 on success, or errno on error
  */
 int pthread_decimate_np(void) {
-  _pthread_decimate(kPosixThreadZombie);
-  cosmo_stack_clear();
+  _pthread_decimate(false);
   return 0;
 }
diff --git a/libc/thread/pthread_detach.c b/libc/thread/pthread_detach.c
index 5e9db049a..2456ec69f 100644
--- a/libc/thread/pthread_detach.c
+++ b/libc/thread/pthread_detach.c
@@ -21,15 +21,15 @@
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
 static errno_t pthread_detach_impl(struct PosixThread *pt) {
   enum PosixThreadStatus status, transition;
-  status = atomic_load_explicit(&pt->pt_status, memory_order_relaxed);
   for (;;) {
+    status = atomic_load_explicit(&pt->pt_status, memory_order_acquire);
     if (status == kPosixThreadJoinable) {
       transition = kPosixThreadDetached;
     } else if (status == kPosixThreadTerminated) {
@@ -50,6 +50,10 @@ static errno_t pthread_detach_impl(struct PosixThread *pt) {
 /**
  * Asks POSIX thread to free itself automatically upon termination.
  *
+ * If this function is used, then it's important to use pthread_exit()
+ * rather than exit() since otherwise your program isn't guaranteed to
+ * gracefully terminate.
+ *
  * Detaching a non-joinable thread is undefined behavior. For example,
  * pthread_detach() can't be called twice on the same thread.
  *
@@ -58,12 +62,8 @@ static errno_t pthread_detach_impl(struct PosixThread *pt) {
  * @returnserrno
  */
 errno_t pthread_detach(pthread_t thread) {
-  unassert(thread);
   struct PosixThread *pt = (struct PosixThread *)thread;
-  _pthread_ref(pt);
-  int tid = _pthread_tid(pt);
   errno_t err = pthread_detach_impl(pt);
-  _pthread_unref(pt);
-  STRACE("pthread_detach(%d) → %s", tid, DescribeErrno(err));
+  STRACE("pthread_detach(%d) → %s", _pthread_tid(pt), DescribeErrno(err));
   return err;
 }
diff --git a/libc/thread/pthread_exit.c b/libc/thread/pthread_exit.c
index 6f8199203..933f041a2 100644
--- a/libc/thread/pthread_exit.c
+++ b/libc/thread/pthread_exit.c
@@ -18,13 +18,10 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/atomic.h"
-#include "libc/calls/calls.h"
-#include "libc/cosmo.h"
 #include "libc/cxxabi.h"
 #include "libc/dce.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/cxaatexit.h"
-#include "libc/intrin/describebacktrace.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
@@ -36,6 +33,7 @@
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/wait_s.internal.h"
 
 /**
@@ -71,7 +69,7 @@
  * @noreturn
  */
 wontreturn void pthread_exit(void *rc) {
-  unsigned population;
+  int orphan;
   struct CosmoTib *tib;
   struct PosixThread *pt;
   enum PosixThreadStatus status, transition;
@@ -90,29 +88,16 @@ wontreturn void pthread_exit(void *rc) {
 
   // set state
   pt->pt_flags |= PT_NOCANCEL | PT_EXITING;
-  pt->pt_val = rc;
+  pt->pt_rc = rc;
 
   // free resources
   __cxa_thread_finalize();
 
   // run atexit handlers if orphaned thread
-  // notice how we avoid acquiring the pthread gil
-  if (!(population = atomic_fetch_sub(&_pthread_count, 1) - 1)) {
-    // we know for certain we're an orphan. any other threads that
-    // exist, will terminate and clear their tid very soon. but some
-    // goofball could spawn more threads from atexit() handlers. we'd
-    // also like to avoid looping forever here, by auto-joining threads
-    // that leaked, because the user forgot to join them or detach them
-    for (;;) {
-      if (_weaken(__cxa_finalize))
-        _weaken(__cxa_finalize)(NULL);
-      _pthread_decimate(kPosixThreadTerminated);
-      if (pthread_orphan_np()) {
-        population = atomic_load(&_pthread_count);
-        break;
-      }
-    }
-  }
+  _pthread_decimate(true);
+  if ((orphan = pthread_orphan_np()))
+    if (_weaken(__cxa_finalize))
+      _weaken(__cxa_finalize)(NULL);
 
   // transition the thread to a terminated state
   status = atomic_load_explicit(&pt->pt_status, memory_order_acquire);
@@ -142,23 +127,23 @@ wontreturn void pthread_exit(void *rc) {
   //  thread has been terminated. The behavior shall be as if the
   //  implementation called exit() with a zero argument at thread
   //  termination time." ──Quoth POSIX.1-2017
-  if (!population) {
-    for (int i = __fini_array_end - __fini_array_start; i--;)
-      ((void (*)(void))__fini_array_start[i])();
+  if (orphan) {
+    for (const uintptr_t *p = __fini_array_end; p > __fini_array_start;)
+      ((void (*)(void))(*--p))();
     _Exit(0);
   }
 
   // check if the main thread has died whilst children live
   // note that the main thread is joinable by child threads
   if (pt->pt_flags & PT_STATIC) {
-    atomic_store_explicit(&tib->tib_ctid, 0, memory_order_release);
-    cosmo_futex_wake((atomic_int *)&tib->tib_ctid, INT_MAX,
-                     !IsWindows() && !IsXnu());
+    atomic_store_explicit(&tib->tib_tid, 0, memory_order_release);
+    nsync_futex_wake_((atomic_int *)&tib->tib_tid, INT_MAX,
+                      !IsWindows() && !IsXnu());
     _Exit1(0);
   }
 
   // this is a child thread
-  __builtin_longjmp(pt->pt_exiter, 1);
+  longjmp(pt->pt_exiter, 1);
 }
 
 __weak_reference(pthread_exit, thr_exit);
diff --git a/libc/thread/pthread_getaffinity_np.c b/libc/thread/pthread_getaffinity_np.c
index 83c134ac9..50cd3e011 100644
--- a/libc/thread/pthread_getaffinity_np.c
+++ b/libc/thread/pthread_getaffinity_np.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/sched-sysv.internal.h"
 #include "libc/calls/struct/cpuset.h"
 #include "libc/dce.h"
@@ -40,8 +39,6 @@
 errno_t pthread_getaffinity_np(pthread_t thread, size_t size,
                                cpu_set_t *bitset) {
   int rc, tid;
-  unassert(thread);
-  unassert(bitset);
   tid = _pthread_tid((struct PosixThread *)thread);
 
   if (size != sizeof(cpu_set_t)) {
diff --git a/libc/thread/pthread_getattr_np.c b/libc/thread/pthread_getattr_np.c
index a57472149..65f8c470a 100644
--- a/libc/thread/pthread_getattr_np.c
+++ b/libc/thread/pthread_getattr_np.c
@@ -16,7 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/calls/struct/rlimit.h"
+#include "libc/dce.h"
+#include "libc/intrin/atomic.h"
+#include "libc/intrin/maps.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
+#include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
+#include "libc/sysv/consts/auxv.h"
+#include "libc/sysv/consts/rlim.h"
+#include "libc/sysv/consts/rlimit.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 
@@ -61,5 +72,10 @@ errno_t pthread_getattr_np(pthread_t thread, pthread_attr_t *attr) {
     default:
       __builtin_unreachable();
   }
+  if (!attr->__stacksize && (pt->pt_flags & PT_STATIC)) {
+    attr->__stackaddr = __maps.stack.addr;
+    attr->__stacksize = __maps.stack.size;
+    attr->__guardsize = 0;
+  }
   return 0;
 }
diff --git a/libc/thread/pthread_getname_np.c b/libc/thread/pthread_getname_np.c
index 5f31b8287..cca44d59d 100644
--- a/libc/thread/pthread_getname_np.c
+++ b/libc/thread/pthread_getname_np.c
@@ -24,7 +24,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/asmflag.h"
 #include "libc/intrin/atomic.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/at.h"
 #include "libc/sysv/consts/o.h"
diff --git a/libc/thread/pthread_kill.c b/libc/thread/pthread_kill.c
index 6c1722965..472da205e 100644
--- a/libc/thread/pthread_kill.c
+++ b/libc/thread/pthread_kill.c
@@ -24,7 +24,6 @@
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
-#include "libc/runtime/internal.h"
 #include "libc/runtime/syslib.internal.h"
 #include "libc/sysv/consts/sicode.h"
 #include "libc/thread/posixthread.internal.h"
@@ -44,15 +43,7 @@ errno_t pthread_kill(pthread_t thread, int sig) {
   int err = 0;
   struct PosixThread *pt;
   pt = (struct PosixThread *)thread;
-  if (pt)
-    _pthread_ref(pt);
-  if (!thread) {
-    // avoid crashing on easily predictable npe
-    // chances are you need a barrier to synchronize startup
-    err = EFAULT;
-  } else if (!(1 <= sig && sig <= 64)) {
-    // cosmo only supports this many signals
-    // some platforms have more but we're not sure what they do
+  if (!(1 <= sig && sig <= 64)) {
     err = EINVAL;
   } else if (thread == __get_tls()->tib_pthread) {
     err = raise(sig);  // XNU will EDEADLK it otherwise
@@ -65,25 +56,17 @@ errno_t pthread_kill(pthread_t thread, int sig) {
     if (IsXnuSilicon()) {
       err = __syslib->__pthread_kill(_pthread_syshand(pt), sig);
     } else {
-      int r = 0;
       int e = errno;
-      int tid = _pthread_tid(pt);
-      if (IsLinux() || IsFreebsd()) {
-        r = sys_tgkill(__pid, tid, sig);
-      } else {
-        r = sys_tkill(tid, sig, pt->tib);
-      }
-      if (r) {
+      if (sys_tkill(_pthread_tid(pt), sig, pt->tib)) {
         err = errno;
         errno = e;
       }
     }
-    if (err == ESRCH)
+    if (err == ESRCH) {
       err = 0;  // we already reported this
+    }
   }
   STRACE("pthread_kill(%d, %G) → %s", _pthread_tid(pt), sig,
          DescribeErrno(err));
-  if (pt)
-    _pthread_unref(pt);
   return err;
 }
diff --git a/libc/thread/pthread_rwlock_destroy.c b/libc/thread/pthread_rwlock_destroy.c
index a3b693d6f..39942c2d0 100644
--- a/libc/thread/pthread_rwlock_destroy.c
+++ b/libc/thread/pthread_rwlock_destroy.c
@@ -16,30 +16,16 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/errno.h"
-#include "libc/intrin/atomic.h"
 #include "libc/str/str.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/mu.h"
 
 /**
  * Destroys read-write lock.
  *
  * @return 0 on success, or error number on failure
- * @raise EBUSY if any threads still hold the lock
+ * @raise EINVAL if any threads still hold the lock
  */
 errno_t pthread_rwlock_destroy(pthread_rwlock_t *rwlock) {
-
-  // check if lock is held
-  if (!rwlock->_pshared) {
-    nsync_mu *mu = (nsync_mu *)rwlock->_nsync;
-    if (atomic_load_explicit(&mu->word, memory_order_relaxed))
-      return EBUSY;
-  } else {
-    if (atomic_load_explicit(&rwlock->_word, memory_order_relaxed))
-      return EBUSY;
-  }
-
   memset(rwlock, -1, sizeof(*rwlock));
   return 0;
 }
diff --git a/libc/thread/pthread_rwlock_init.c b/libc/thread/pthread_rwlock_init.c
index dea3f67a9..54fe08ece 100644
--- a/libc/thread/pthread_rwlock_init.c
+++ b/libc/thread/pthread_rwlock_init.c
@@ -26,8 +26,6 @@
  */
 errno_t pthread_rwlock_init(pthread_rwlock_t *rwlock,
                             const pthread_rwlockattr_t *attr) {
-  *rwlock = (pthread_rwlock_t){
-      ._pshared = attr ? *attr : PTHREAD_PROCESS_PRIVATE,
-  };
+  *rwlock = (pthread_rwlock_t){0};
   return 0;
 }
diff --git a/libc/thread/pthread_rwlock_rdlock.c b/libc/thread/pthread_rwlock_rdlock.c
index e097bb0ef..781c0b6c9 100644
--- a/libc/thread/pthread_rwlock_rdlock.c
+++ b/libc/thread/pthread_rwlock_rdlock.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/atomic.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/mu.h"
 
@@ -25,28 +24,7 @@
  *
  * @return 0 on success, or errno on error
  */
-errno_t pthread_rwlock_rdlock(pthread_rwlock_t *lk) {
-
-#if PTHREAD_USE_NSYNC
-  // use nsync if possible
-  if (!lk->_pshared) {
-    nsync_mu_rlock((nsync_mu *)lk->_nsync);
-    return 0;
-  }
-#endif
-
-  // naive implementation
-  uint32_t w = 0;
-  for (;;) {
-    if (w & 1)
-      for (;;)
-        if (~(w = atomic_load_explicit(&lk->_word, memory_order_relaxed)) & 1)
-          break;
-    // xxx: avoid writer starvation in pthread_rwlock_rdlock_test
-    while (atomic_load(&lk->_waiters))
-      pthread_yield_np();
-    if (atomic_compare_exchange_weak_explicit(
-            &lk->_word, &w, w + 2, memory_order_acquire, memory_order_relaxed))
-      return 0;
-  }
+errno_t pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) {
+  nsync_mu_rlock((nsync_mu *)rwlock);
+  return 0;
 }
diff --git a/libc/thread/pthread_rwlock_tryrdlock.c b/libc/thread/pthread_rwlock_tryrdlock.c
index 1969c3a41..35d51a051 100644
--- a/libc/thread/pthread_rwlock_tryrdlock.c
+++ b/libc/thread/pthread_rwlock_tryrdlock.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
-#include "libc/intrin/atomic.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/mu.h"
 
@@ -30,26 +29,9 @@
  * @raise EINVAL if `rwlock` doesn't refer to an initialized r/w lock
  */
 errno_t pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock) {
-
-#if PTHREAD_USE_NSYNC
-  // use nsync if possible
-  if (!rwlock->_pshared) {
-    if (nsync_mu_rtrylock((nsync_mu *)rwlock->_nsync)) {
-      return 0;
-    } else {
-      return EBUSY;
-    }
-  }
-#endif
-
-  // naive implementation
-  uint32_t word = 0;
-  for (;;) {
-    if (word & 1)
-      return EBUSY;
-    if (atomic_compare_exchange_weak_explicit(&rwlock->_word, &word, word + 2,
-                                              memory_order_acquire,
-                                              memory_order_relaxed))
-      return 0;
+  if (nsync_mu_rtrylock((nsync_mu *)rwlock)) {
+    return 0;
+  } else {
+    return EBUSY;
   }
 }
diff --git a/libc/thread/pthread_rwlock_trywrlock.c b/libc/thread/pthread_rwlock_trywrlock.c
index 49b39e38f..c685a39dc 100644
--- a/libc/thread/pthread_rwlock_trywrlock.c
+++ b/libc/thread/pthread_rwlock_trywrlock.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
-#include "libc/intrin/atomic.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/mu.h"
 
@@ -29,23 +28,10 @@
  * @raise EINVAL if `rwlock` doesn't refer to an initialized r/w lock
  */
 errno_t pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock) {
-
-#if PTHREAD_USE_NSYNC
-  // use nsync if possible
-  if (!rwlock->_pshared) {
-    if (nsync_mu_trylock((nsync_mu *)rwlock->_nsync)) {
-      rwlock->_iswrite = 1;
-      return 0;
-    } else {
-      return EBUSY;
-    }
-  }
-#endif
-
-  // naive implementation
-  uint32_t word = 0;
-  if (atomic_compare_exchange_strong_explicit(
-          &rwlock->_word, &word, 1, memory_order_acquire, memory_order_relaxed))
+  if (nsync_mu_trylock((nsync_mu *)rwlock)) {
+    rwlock->_iswrite = 1;
     return 0;
-  return EBUSY;
+  } else {
+    return EBUSY;
+  }
 }
diff --git a/libc/thread/pthread_rwlock_unlock.c b/libc/thread/pthread_rwlock_unlock.c
index 5b5feaa02..1918491c8 100644
--- a/libc/thread/pthread_rwlock_unlock.c
+++ b/libc/thread/pthread_rwlock_unlock.c
@@ -16,8 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/errno.h"
-#include "libc/intrin/atomic.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/mu.h"
 
@@ -28,33 +26,11 @@
  * @raise EINVAL if lock is in a bad state
  */
 errno_t pthread_rwlock_unlock(pthread_rwlock_t *rwlock) {
-
-#if PTHREAD_USE_NSYNC
-  // use nsync if possible
-  if (!rwlock->_pshared) {
-    if (rwlock->_iswrite) {
-      rwlock->_iswrite = 0;
-      nsync_mu_unlock((nsync_mu *)rwlock->_nsync);
-    } else {
-      nsync_mu_runlock((nsync_mu *)rwlock->_nsync);
-    }
-    return 0;
-  }
-#endif
-
-  // naive implementation
-  uint32_t word = atomic_load_explicit(&rwlock->_word, memory_order_relaxed);
-  for (;;) {
-    if (word & 1) {
-      atomic_store_explicit(&rwlock->_word, 0, memory_order_release);
-      return 0;
-    } else if (word) {
-      if (atomic_compare_exchange_weak_explicit(&rwlock->_word, &word, word - 2,
-                                                memory_order_release,
-                                                memory_order_relaxed))
-        return 0;
-    } else {
-      return EPERM;
-    }
+  if (rwlock->_iswrite) {
+    rwlock->_iswrite = 0;
+    nsync_mu_unlock((nsync_mu *)rwlock);
+  } else {
+    nsync_mu_runlock((nsync_mu *)rwlock);
   }
+  return 0;
 }
diff --git a/libc/thread/pthread_rwlock_wrlock.c b/libc/thread/pthread_rwlock_wrlock.c
index 382eba828..3eea88db7 100644
--- a/libc/thread/pthread_rwlock_wrlock.c
+++ b/libc/thread/pthread_rwlock_wrlock.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/atomic.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/mu.h"
 
@@ -26,26 +25,7 @@
  * @return 0 on success, or errno on error
  */
 errno_t pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) {
-
-#if PTHREAD_USE_NSYNC
-  // use nsync if possible
-  if (!rwlock->_pshared) {
-    nsync_mu_lock((nsync_mu *)rwlock->_nsync);
-    rwlock->_iswrite = 1;
-    return 0;
-  }
-#endif
-
-  // naive implementation
-  uint32_t w = 0;
-  for (;;) {
-    if (atomic_compare_exchange_weak_explicit(
-            &rwlock->_word, &w, 1, memory_order_acquire, memory_order_relaxed))
-      return 0;
-    atomic_fetch_add(&rwlock->_waiters, 1);
-    for (;;)
-      if (!(w = atomic_load_explicit(&rwlock->_word, memory_order_relaxed)))
-        break;
-    atomic_fetch_sub(&rwlock->_waiters, 1);
-  }
+  nsync_mu_lock((nsync_mu *)rwlock);
+  rwlock->_iswrite = 1;
+  return 0;
 }
diff --git a/libc/thread/pthread_rwlockattr_getpshared.c b/libc/thread/pthread_rwlockattr_getpshared.c
index 5ebfb765b..05507dcd5 100644
--- a/libc/thread/pthread_rwlockattr_getpshared.c
+++ b/libc/thread/pthread_rwlockattr_getpshared.c
@@ -23,7 +23,7 @@
  *
  * @param pshared is set to one of the following
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED`
+ *     - `PTHREAD_PROCESS_SHARED` (unsupported)
  * @return 0 on success, or error on failure
  */
 errno_t pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *attr,
diff --git a/libc/thread/pthread_rwlockattr_setpshared.c b/libc/thread/pthread_rwlockattr_setpshared.c
index 49bf21efe..d7378d6e8 100644
--- a/libc/thread/pthread_rwlockattr_setpshared.c
+++ b/libc/thread/pthread_rwlockattr_setpshared.c
@@ -24,14 +24,13 @@
  *
  * @param pshared can be one of
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED`
+ *     - `PTHREAD_PROCESS_SHARED` (unsupported)
  * @return 0 on success, or error on failure
  * @raises EINVAL if `pshared` is invalid
  */
 errno_t pthread_rwlockattr_setpshared(pthread_rwlockattr_t *attr, int pshared) {
   switch (pshared) {
     case PTHREAD_PROCESS_PRIVATE:
-    case PTHREAD_PROCESS_SHARED:
       *attr = pshared;
       return 0;
     default:
diff --git a/libc/thread/pthread_setaffinity_np.c b/libc/thread/pthread_setaffinity_np.c
index 96aa4466b..4c1b3aa46 100644
--- a/libc/thread/pthread_setaffinity_np.c
+++ b/libc/thread/pthread_setaffinity_np.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/sched-sysv.internal.h"
 #include "libc/calls/struct/cpuset.h"
 #include "libc/calls/syscall_support-nt.internal.h"
@@ -55,8 +54,6 @@ errno_t pthread_setaffinity_np(pthread_t thread, size_t size,
   int e, rc, tid;
   cpu_set_t bs = {0};
   struct PosixThread *pt;
-  unassert(thread);
-  unassert(bitset);
   e = errno;
   if (size < sizeof(cpu_set_t)) {
     memcpy(&bs, bitset, size);
diff --git a/libc/thread/pthread_setcanceltype.c b/libc/thread/pthread_setcanceltype.c
index 6aad36c6e..f65187104 100644
--- a/libc/thread/pthread_setcanceltype.c
+++ b/libc/thread/pthread_setcanceltype.c
@@ -76,10 +76,8 @@ errno_t pthread_setcanceltype(int type, int *oldtype) {
       err = EINVAL;
       break;
   }
-#ifdef MODE_DBG
   STRACE("pthread_setcanceltype(%s, [%s]) → %s",
          DescribeCancelType(alloca(12), 0, &type),
          DescribeCancelType(alloca(12), err, oldtype), DescribeErrno(err));
-#endif
   return err;
 }
diff --git a/libc/thread/pthread_timedjoin_np.c b/libc/thread/pthread_timedjoin_np.c
index cd1643b8d..9dcc410a0 100644
--- a/libc/thread/pthread_timedjoin_np.c
+++ b/libc/thread/pthread_timedjoin_np.c
@@ -20,18 +20,16 @@
 #include "libc/calls/cp.internal.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/calls/struct/timespec.internal.h"
-#include "libc/cosmo.h"
-#include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/dll.h"
 #include "libc/intrin/strace.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread2.h"
 #include "libc/thread/tls.h"
+#include "third_party/nsync/futex.internal.h"
 
 static const char *DescribeReturnValue(char buf[30], int err, void **value) {
   char *p = buf;
@@ -62,34 +60,32 @@ static const char *DescribeReturnValue(char buf[30], int err, void **value) {
  * @cancelationpoint
  */
 static errno_t _pthread_wait(atomic_int *ctid, struct timespec *abstime) {
-
-  // "If an implementation detects that the value specified by the
-  //  thread argument to pthread_join() refers to the calling thread,
-  //  it is recommended that the function should fail and report an
-  //  [EDEADLK] error." ──Quoth POSIX.1-2017
-  if (ctid == &__get_tls()->tib_ctid)
-    return EDEADLK;
-
-  // "If the thread calling pthread_join() is canceled, then the target
-  //  thread shall not be detached."  ──Quoth POSIX.1-2017
-  errno_t err;
-  if ((err = pthread_testcancel_np()))
-    return err;
-
-  BEGIN_CANCELATION_POINT;
-  int x;
-  while ((x = atomic_load_explicit(ctid, memory_order_acquire))) {
-    int e = cosmo_futex_wait(ctid, x, !IsWindows() && !IsXnu(), CLOCK_REALTIME,
-                             abstime);
-    if (e == -ECANCELED) {
-      err = ECANCELED;
-      break;
-    } else if (e == -ETIMEDOUT) {
-      err = EBUSY;
-      break;
+  int x, e;
+  errno_t err = 0;
+  if (ctid == &__get_tls()->tib_tid) {
+    // "If an implementation detects that the value specified by the
+    //  thread argument to pthread_join() refers to the calling thread,
+    //  it is recommended that the function should fail and report an
+    //  [EDEADLK] error." ──Quoth POSIX.1-2017
+    err = EDEADLK;
+  } else {
+    // "If the thread calling pthread_join() is canceled, then the target
+    //  thread shall not be detached."  ──Quoth POSIX.1-2017
+    if (!(err = pthread_testcancel_np())) {
+      BEGIN_CANCELATION_POINT;
+      while ((x = atomic_load_explicit(ctid, memory_order_acquire))) {
+        e = nsync_futex_wait_(ctid, x, !IsWindows() && !IsXnu(), abstime);
+        if (e == -ECANCELED) {
+          err = ECANCELED;
+          break;
+        } else if (e == -ETIMEDOUT) {
+          err = EBUSY;
+          break;
+        }
+      }
+      END_CANCELATION_POINT;
     }
   }
-  END_CANCELATION_POINT;
   return err;
 }
 
@@ -118,11 +114,10 @@ static errno_t _pthread_wait(atomic_int *ctid, struct timespec *abstime) {
 errno_t pthread_timedjoin_np(pthread_t thread, void **value_ptr,
                              struct timespec *abstime) {
   int tid;
-  errno_t err;
+  errno_t err = 0;
   struct PosixThread *pt;
   enum PosixThreadStatus status;
   pt = (struct PosixThread *)thread;
-  unassert(thread);
 
   // "The behavior is undefined if the value specified by the thread
   //  argument to pthread_join() does not refer to a joinable thread."
@@ -134,22 +129,12 @@ errno_t pthread_timedjoin_np(pthread_t thread, void **value_ptr,
   // "The results of multiple simultaneous calls to pthread_join()
   //  specifying the same target thread are undefined."
   //                                  ──Quoth POSIX.1-2017
-  if (!(err = _pthread_wait(&pt->tib->tib_ctid, abstime))) {
+  if (!(err = _pthread_wait(&pt->tib->tib_tid, abstime))) {
+    atomic_store_explicit(&pt->pt_status, kPosixThreadZombie,
+                          memory_order_release);
+    _pthread_zombify(pt);
     if (value_ptr)
-      *value_ptr = pt->pt_val;
-    if (atomic_load_explicit(&pt->pt_refs, memory_order_acquire)) {
-      _pthread_lock();
-      dll_remove(&_pthread_list, &pt->list);
-      dll_make_last(&_pthread_list, &pt->list);
-      atomic_store_explicit(&pt->pt_status, kPosixThreadZombie,
-                            memory_order_release);
-      _pthread_unlock();
-    } else {
-      _pthread_lock();
-      dll_remove(&_pthread_list, &pt->list);
-      _pthread_unlock();
-      _pthread_free(pt);
-    }
+      *value_ptr = pt->pt_rc;
   }
 
   STRACE("pthread_timedjoin_np(%d, %s, %s) → %s", tid,
diff --git a/libc/thread/pthread_tryjoin_np.c b/libc/thread/pthread_tryjoin_np.c
index 39929646e..248b26928 100644
--- a/libc/thread/pthread_tryjoin_np.c
+++ b/libc/thread/pthread_tryjoin_np.c
@@ -32,7 +32,6 @@
  *     if the thread called pthread_exit(), or `PTHREAD_CANCELED` if
  *     pthread_cancel() destroyed the thread instead
  * @return 0 on success, or errno on error
- * @raise EBUSY if thread has not yet terminated
  * @raise ECANCELED if calling thread was cancelled in masked mode
  * @cancelationpoint
  * @returnserrno
diff --git a/libc/thread/sem_destroy.c b/libc/thread/sem_destroy.c
index 053295c1b..fb0e3c356 100644
--- a/libc/thread/sem_destroy.c
+++ b/libc/thread/sem_destroy.c
@@ -18,7 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/strace.h"
 #include "libc/limits.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/semaphore.h"
@@ -41,20 +40,14 @@
  * @raise EBUSY if `sem` has waiters
  */
 int sem_destroy(sem_t *sem) {
-  int rc, waiters;
+  int waiters;
   npassert(sem->sem_magic != SEM_MAGIC_NAMED);
-  if (sem->sem_magic != SEM_MAGIC_UNNAMED) {
-    rc = einval();
-  } else {
-    waiters = atomic_load_explicit(&sem->sem_waiters, memory_order_relaxed);
-    unassert(waiters >= 0);
-    if (waiters) {
-      rc = ebusy();
-    } else {
-      atomic_store_explicit(&sem->sem_value, INT_MIN, memory_order_relaxed);
-      rc = 0;
-    }
-  }
-  STRACE("sem_destroy(%p) → %d% m", sem, rc);
-  return rc;
+  if (sem->sem_magic != SEM_MAGIC_UNNAMED)
+    return einval();
+  waiters = atomic_load_explicit(&sem->sem_waiters, memory_order_relaxed);
+  unassert(waiters >= 0);
+  if (waiters)
+    return ebusy();
+  atomic_store_explicit(&sem->sem_value, INT_MIN, memory_order_relaxed);
+  return 0;
 }
diff --git a/libc/thread/sem_init.c b/libc/thread/sem_init.c
index da976752c..e86f18313 100644
--- a/libc/thread/sem_init.c
+++ b/libc/thread/sem_init.c
@@ -19,7 +19,6 @@
 #include "libc/calls/calls.h"
 #include "libc/dce.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/strace.h"
 #include "libc/limits.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/semaphore.h"
@@ -38,17 +37,12 @@
  * @raise EINVAL if `value` exceeds `SEM_VALUE_MAX`
  */
 int sem_init(sem_t *sem, int pshared, unsigned value) {
-  int rc;
-  if (value > SEM_VALUE_MAX) {
-    rc = einval();
-  } else {
-    sem->sem_magic = SEM_MAGIC_UNNAMED;
-    atomic_store_explicit(&sem->sem_value, value, memory_order_relaxed);
-    sem->sem_pshared = !!pshared;
-    sem->sem_pid = getpid();
-    sem->sem_waiters = 0;
-    rc = 0;
-  }
-  STRACE("sem_init(%p, %hhhd, %u) → %d% m", sem, pshared, value, rc);
-  return rc;
+  if (value > SEM_VALUE_MAX)
+    return einval();
+  sem->sem_magic = SEM_MAGIC_UNNAMED;
+  atomic_store_explicit(&sem->sem_value, value, memory_order_relaxed);
+  sem->sem_pshared = !!pshared;
+  sem->sem_pid = getpid();
+  sem->sem_waiters = 0;
+  return 0;
 }
diff --git a/libc/thread/sem_open.c b/libc/thread/sem_open.c
index 156bbc868..d708ef7e4 100644
--- a/libc/thread/sem_open.c
+++ b/libc/thread/sem_open.c
@@ -17,12 +17,10 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
-#include "libc/atomic.h"
 #include "libc/calls/blockcancel.internal.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/calls/syscall-sysv.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
@@ -37,13 +35,12 @@
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/semaphore.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 
 static struct Semaphores {
-  atomic_uint once;
+  pthread_once_t once;
   pthread_mutex_t lock;
   struct Semaphore {
     struct Semaphore *next;
@@ -52,28 +49,27 @@ static struct Semaphores {
     bool dead;
     int refs;
   } *list;
-} g_semaphores = {
-    .lock = PTHREAD_MUTEX_INITIALIZER,
-};
+} g_semaphores;
 
 static void sem_open_lock(void) {
-  _pthread_mutex_lock(&g_semaphores.lock);
+  pthread_mutex_lock(&g_semaphores.lock);
 }
 
 static void sem_open_unlock(void) {
-  _pthread_mutex_unlock(&g_semaphores.lock);
+  pthread_mutex_unlock(&g_semaphores.lock);
 }
 
 static void sem_open_wipe(void) {
-  _pthread_mutex_wipe_np(&g_semaphores.lock);
+  pthread_mutex_init(&g_semaphores.lock, 0);
 }
 
 static void sem_open_setup(void) {
+  sem_open_wipe();
   pthread_atfork(sem_open_lock, sem_open_unlock, sem_open_wipe);
 }
 
 static void sem_open_init(void) {
-  cosmo_once(&g_semaphores.once, sem_open_setup);
+  pthread_once(&g_semaphores.once, sem_open_setup);
 }
 
 static sem_t *sem_open_impl(const char *path, int oflag, unsigned mode,
diff --git a/libc/thread/sem_post.c b/libc/thread/sem_post.c
index 8da481cc0..83fab3409 100644
--- a/libc/thread/sem_post.c
+++ b/libc/thread/sem_post.c
@@ -19,7 +19,6 @@
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/syscall-sysv.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
@@ -27,6 +26,7 @@
 #include "libc/runtime/syslib.internal.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/semaphore.h"
+#include "third_party/nsync/futex.internal.h"
 
 /**
  * Unlocks semaphore.
@@ -46,7 +46,7 @@ int sem_post(sem_t *sem) {
   old = atomic_fetch_add_explicit(&sem->sem_value, 1, memory_order_acq_rel);
   unassert(old > INT_MIN);
   if (old >= 0) {
-    wakeups = cosmo_futex_wake(&sem->sem_value, 1, sem->sem_pshared);
+    wakeups = nsync_futex_wake_(&sem->sem_value, 1, true);
     npassert(wakeups >= 0);
     rc = 0;
   } else {
diff --git a/libc/thread/sem_timedwait.c b/libc/thread/sem_timedwait.c
index b68193fe6..bd2e5d9d9 100644
--- a/libc/thread/sem_timedwait.c
+++ b/libc/thread/sem_timedwait.c
@@ -22,17 +22,16 @@
 #include "libc/calls/struct/timespec.h"
 #include "libc/calls/struct/timespec.internal.h"
 #include "libc/calls/syscall-sysv.internal.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
 #include "libc/runtime/syslib.internal.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/semaphore.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 
 static void sem_delay(int n) {
   volatile int i;
@@ -59,7 +58,7 @@ static void sem_timedwait_cleanup(void *arg) {
  * @cancelationpoint
  */
 int sem_timedwait(sem_t *sem, const struct timespec *abstime) {
-  int v, rc, e = errno;
+  int i, v, rc, e = errno;
 
 #if 0
   if (IsXnuSilicon() && sem->sem_magic == SEM_MAGIC_KERNEL) {
@@ -103,13 +102,16 @@ int sem_timedwait(sem_t *sem, const struct timespec *abstime) {
   }
 #endif
 
-  rc = sem_trywait(sem);
-  if (!rc) {
-    return rc;
-  } else if (errno == EAGAIN) {
-    errno = e;
-  } else {
-    return rc;
+  for (i = 0; i < 7; ++i) {
+    rc = sem_trywait(sem);
+    if (!rc) {
+      return rc;
+    } else if (errno == EAGAIN) {
+      errno = e;
+      sem_delay(i);
+    } else {
+      return rc;
+    }
   }
 
   BEGIN_CANCELATION_POINT;
@@ -119,8 +121,7 @@ int sem_timedwait(sem_t *sem, const struct timespec *abstime) {
 
   do {
     if (!(v = atomic_load_explicit(&sem->sem_value, memory_order_relaxed))) {
-      rc = cosmo_futex_wait(&sem->sem_value, v, sem->sem_pshared,
-                            CLOCK_REALTIME, abstime);
+      rc = nsync_futex_wait_(&sem->sem_value, v, true, abstime);
       if (rc == -EINTR || rc == -ECANCELED) {
         errno = -rc;
         rc = -1;
diff --git a/libc/thread/semaphore.h b/libc/thread/semaphore.h
index ee03fe926..64119e2be 100644
--- a/libc/thread/semaphore.h
+++ b/libc/thread/semaphore.h
@@ -34,10 +34,10 @@ typedef struct {
 
 int sem_init(sem_t *, int, unsigned) libcesque;
 int sem_destroy(sem_t *) libcesque;
-int sem_post(sem_t *) dontthrow;
-int sem_wait(sem_t *) dontthrow;
-int sem_trywait(sem_t *) dontthrow;
-int sem_timedwait(sem_t *, const struct timespec *) dontthrow;
+int sem_post(sem_t *) libcesque;
+int sem_wait(sem_t *) libcesque;
+int sem_trywait(sem_t *) libcesque;
+int sem_timedwait(sem_t *, const struct timespec *) libcesque;
 int sem_getvalue(sem_t *, int *) libcesque;
 sem_t *sem_open(const char *, int, ...) libcesque;
 int sem_close(sem_t *) libcesque;
diff --git a/libc/thread/thread.h b/libc/thread/thread.h
index b7ac8a119..2e4448f2e 100644
--- a/libc/thread/thread.h
+++ b/libc/thread/thread.h
@@ -2,19 +2,18 @@
 #define COSMOPOLITAN_LIBC_THREAD_THREAD_H_
 
 #define PTHREAD_KEYS_MAX              46
-#define PTHREAD_STACK_MIN             32768
+#define PTHREAD_STACK_MIN             65536
 #define PTHREAD_USE_NSYNC             1
 #define PTHREAD_DESTRUCTOR_ITERATIONS 4
 
 #define PTHREAD_BARRIER_SERIAL_THREAD 31337
 
 #define PTHREAD_MUTEX_DEFAULT    0
-#define PTHREAD_MUTEX_NORMAL     1
-#define PTHREAD_MUTEX_RECURSIVE  2
-#define PTHREAD_MUTEX_ERRORCHECK 3
-
-#define PTHREAD_MUTEX_STALLED 0
-#define PTHREAD_MUTEX_ROBUST  2048
+#define PTHREAD_MUTEX_NORMAL     0
+#define PTHREAD_MUTEX_RECURSIVE  1
+#define PTHREAD_MUTEX_ERRORCHECK 2
+#define PTHREAD_MUTEX_STALLED    0
+#define PTHREAD_MUTEX_ROBUST     1
 
 #define PTHREAD_PROCESS_PRIVATE 0
 #define PTHREAD_PROCESS_SHARED  4
@@ -45,12 +44,9 @@ COSMOPOLITAN_C_START_
 #define PTHREAD_ONCE_INIT          {0}
 #define PTHREAD_COND_INITIALIZER   {0}
 #define PTHREAD_RWLOCK_INITIALIZER {0}
+#define PTHREAD_MUTEX_INITIALIZER  {0}
 
-#define PTHREAD_MUTEX_INITIALIZER               {0, PTHREAD_MUTEX_DEFAULT}
-#define PTHREAD_NORMAL_MUTEX_INITIALIZER_NP     {0, PTHREAD_MUTEX_NORMAL}
-#define PTHREAD_SHARED_MUTEX_INITIALIZER_NP     {0, PTHREAD_PROCESS_SHARED}
-#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP  {0, PTHREAD_MUTEX_RECURSIVE}
-#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP {0, PTHREAD_MUTEX_ERRORCHECK}
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}
 
 #ifndef __cplusplus
 #define _PTHREAD_ATOMIC(x) _Atomic(x)
@@ -60,6 +56,7 @@ COSMOPOLITAN_C_START_
 
 typedef uintptr_t pthread_t;
 typedef int pthread_id_np_t;
+typedef char pthread_condattr_t;
 typedef char pthread_rwlockattr_t;
 typedef char pthread_barrierattr_t;
 typedef unsigned pthread_key_t;
@@ -74,43 +71,33 @@ typedef struct pthread_spinlock_s {
 } pthread_spinlock_t;
 
 typedef struct pthread_mutex_s {
-  void *_edges;
+  uint32_t _nsync;
+  union {
+    int32_t _pid;
+    _PTHREAD_ATOMIC(int32_t) _futex;
+  };
   _PTHREAD_ATOMIC(uint64_t) _word;
-  _PTHREAD_ATOMIC(int) _futex;
-  int _pid;
-  void *_nsync[2];
 } pthread_mutex_t;
 
 typedef struct pthread_mutexattr_s {
   unsigned _word;
 } pthread_mutexattr_t;
 
-typedef struct pthread_condattr_s {
-  char _pshared;
-  char _clock;
-} pthread_condattr_t;
-
 typedef struct pthread_cond_s {
-  char _pshared;
-  char _clock;
-  char _footek;
-  _PTHREAD_ATOMIC(char) _waited;
+  union {
+    void *_align;
+    struct {
+      uint32_t _nsync;
+      char _pshared;
+    };
+  };
   _PTHREAD_ATOMIC(uint32_t) _sequence;
   _PTHREAD_ATOMIC(uint32_t) _waiters;
-  void *_nsync[2];
 } pthread_cond_t;
 
 typedef struct pthread_rwlock_s {
-  union {
-    void *_nsync[2];
-    struct {
-      uint32_t _nsync_word;
-      char _pshared;
-      char _iswrite;
-      _PTHREAD_ATOMIC(uint32_t) _word;
-      _PTHREAD_ATOMIC(uint32_t) _waiters;
-    };
-  };
+  void *_nsync[2];
+  char _iswrite;
 } pthread_rwlock_t;
 
 typedef struct pthread_barrier_s {
@@ -127,10 +114,10 @@ typedef struct pthread_attr_s {
   int __schedparam;
   int __schedpolicy;
   int __contentionscope;
+  int __guardsize;
+  int __stacksize;
   int __sigaltstacksize;
   uint64_t __sigmask;
-  size_t __guardsize;
-  size_t __stacksize;
   void *__stackaddr;
   void *__sigaltstackaddr;
 } pthread_attr_t;
@@ -151,58 +138,53 @@ int pthread_attr_getguardsize(const pthread_attr_t *, size_t *) libcesque params
 int pthread_attr_getinheritsched(const pthread_attr_t *, int *) libcesque paramsnonnull();
 int pthread_attr_getschedpolicy(const pthread_attr_t *, int *) libcesque paramsnonnull();
 int pthread_attr_getscope(const pthread_attr_t *, int *) libcesque paramsnonnull();
-int pthread_attr_getsigaltstack_np(const pthread_attr_t *, void **, size_t *) libcesque paramsnonnull();
-int pthread_attr_getsigaltstacksize_np(const pthread_attr_t *, size_t *) libcesque paramsnonnull();
 int pthread_attr_getstack(const pthread_attr_t *, void **, size_t *) libcesque paramsnonnull();
 int pthread_attr_getstacksize(const pthread_attr_t *, size_t *) libcesque paramsnonnull();
+int pthread_attr_getsigaltstack_np(const pthread_attr_t *, void **, size_t *) libcesque paramsnonnull();
+int pthread_attr_getsigaltstacksize_np(const pthread_attr_t *, size_t *) libcesque paramsnonnull();
 int pthread_attr_init(pthread_attr_t *) libcesque paramsnonnull();
 int pthread_attr_setdetachstate(pthread_attr_t *, int) libcesque paramsnonnull();
 int pthread_attr_setguardsize(pthread_attr_t *, size_t) libcesque paramsnonnull();
 int pthread_attr_setinheritsched(pthread_attr_t *, int) libcesque paramsnonnull();
 int pthread_attr_setschedpolicy(pthread_attr_t *, int) libcesque paramsnonnull();
 int pthread_attr_setscope(pthread_attr_t *, int) libcesque paramsnonnull();
-int pthread_attr_setsigaltstack_np(pthread_attr_t *, void *, size_t) libcesque paramsnonnull((1));
-int pthread_attr_setsigaltstacksize_np(pthread_attr_t *, size_t);
 int pthread_attr_setstack(pthread_attr_t *, void *, size_t) libcesque paramsnonnull((1));
 int pthread_attr_setstacksize(pthread_attr_t *, size_t) libcesque paramsnonnull();
+int pthread_attr_setsigaltstack_np(pthread_attr_t *, void *, size_t) libcesque paramsnonnull((1));
+int pthread_attr_setsigaltstacksize_np(pthread_attr_t *, size_t);
 int pthread_barrier_destroy(pthread_barrier_t *) libcesque paramsnonnull();
 int pthread_barrier_init(pthread_barrier_t *, const pthread_barrierattr_t *, unsigned) libcesque paramsnonnull((1));
-int pthread_barrier_wait(pthread_barrier_t *) dontthrow paramsnonnull();
+int pthread_barrier_wait(pthread_barrier_t *) libcesque paramsnonnull();
 int pthread_barrierattr_destroy(pthread_barrierattr_t *) libcesque paramsnonnull();
 int pthread_barrierattr_getpshared(const pthread_barrierattr_t *, int *) libcesque paramsnonnull();
 int pthread_barrierattr_init(pthread_barrierattr_t *) libcesque paramsnonnull();
 int pthread_barrierattr_setpshared(pthread_barrierattr_t *, int) libcesque paramsnonnull();
-int pthread_cancel(pthread_t) dontthrow;
-int pthread_cond_broadcast(pthread_cond_t *) dontthrow paramsnonnull();
+int pthread_cancel(pthread_t) libcesque;
+int pthread_cond_broadcast(pthread_cond_t *) libcesque paramsnonnull();
 int pthread_cond_destroy(pthread_cond_t *) libcesque paramsnonnull();
 int pthread_cond_init(pthread_cond_t *, const pthread_condattr_t *) libcesque paramsnonnull((1));
-int pthread_cond_signal(pthread_cond_t *) dontthrow paramsnonnull();
-int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *) dontthrow paramsnonnull();
+int pthread_cond_signal(pthread_cond_t *) libcesque paramsnonnull();
+int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *) libcesque paramsnonnull();
 int pthread_condattr_destroy(pthread_condattr_t *) libcesque paramsnonnull();
-int pthread_condattr_getclock(const pthread_condattr_t *, int *) libcesque paramsnonnull();
 int pthread_condattr_getpshared(const pthread_condattr_t *, int *) libcesque paramsnonnull();
 int pthread_condattr_init(pthread_condattr_t *) libcesque paramsnonnull();
-int pthread_condattr_setclock(pthread_condattr_t *, int) libcesque paramsnonnull();
 int pthread_condattr_setpshared(pthread_condattr_t *, int) libcesque paramsnonnull();
 int pthread_create(pthread_t *, const pthread_attr_t *, void *(*)(void *), void *) dontthrow paramsnonnull((1));
-int pthread_decimate_np(void) dontthrow;
-int pthread_delay_np(const void *, int) dontthrow;
-int pthread_detach(pthread_t) dontthrow;
+int pthread_detach(pthread_t) libcesque;
 int pthread_equal(pthread_t, pthread_t) libcesque;
 int pthread_getattr_np(pthread_t, pthread_attr_t *) libcesque paramsnonnull();
 int pthread_getname_np(pthread_t, char *, size_t) libcesque paramsnonnull();
 int pthread_getunique_np(pthread_t, pthread_id_np_t *) libcesque paramsnonnull();
-int pthread_join(pthread_t, void **) dontthrow;
+int pthread_join(pthread_t, void **) libcesque;
 int pthread_key_create(pthread_key_t *, pthread_key_dtor) libcesque paramsnonnull((1));
 int pthread_key_delete(pthread_key_t) libcesque;
-int pthread_kill(pthread_t, int) dontthrow;
-int pthread_mutex_consistent(pthread_mutex_t *) dontthrow paramsnonnull();
+int pthread_kill(pthread_t, int) libcesque;
+int pthread_mutex_consistent(pthread_mutex_t *) libcesque paramsnonnull();
 int pthread_mutex_destroy(pthread_mutex_t *) libcesque paramsnonnull();
 int pthread_mutex_init(pthread_mutex_t *, const pthread_mutexattr_t *) libcesque paramsnonnull((1));
-int pthread_mutex_lock(pthread_mutex_t *) dontthrow paramsnonnull();
-int pthread_mutex_trylock(pthread_mutex_t *) dontthrow paramsnonnull();
-int pthread_mutex_unlock(pthread_mutex_t *) dontthrow paramsnonnull();
-int pthread_mutex_wipe_np(pthread_mutex_t *) libcesque paramsnonnull();
+int pthread_mutex_lock(pthread_mutex_t *) libcesque paramsnonnull();
+int pthread_mutex_trylock(pthread_mutex_t *) libcesque paramsnonnull();
+int pthread_mutex_unlock(pthread_mutex_t *) libcesque paramsnonnull();
 int pthread_mutexattr_destroy(pthread_mutexattr_t *) libcesque paramsnonnull();
 int pthread_mutexattr_getpshared(const pthread_mutexattr_t *, int *) libcesque paramsnonnull();
 int pthread_mutexattr_gettype(const pthread_mutexattr_t *, int *) libcesque paramsnonnull();
@@ -211,13 +193,14 @@ int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int) libcesque paramsnon
 int pthread_mutexattr_settype(pthread_mutexattr_t *, int) libcesque paramsnonnull();
 int pthread_once(pthread_once_t *, void (*)(void)) paramsnonnull();
 int pthread_orphan_np(void) libcesque;
+int pthread_decimate_np(void) libcesque;
 int pthread_rwlock_destroy(pthread_rwlock_t *) libcesque paramsnonnull();
 int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *) libcesque paramsnonnull((1));
-int pthread_rwlock_rdlock(pthread_rwlock_t *) dontthrow paramsnonnull();
-int pthread_rwlock_tryrdlock(pthread_rwlock_t *) dontthrow paramsnonnull();
-int pthread_rwlock_trywrlock(pthread_rwlock_t *) dontthrow paramsnonnull();
-int pthread_rwlock_unlock(pthread_rwlock_t *) dontthrow paramsnonnull();
-int pthread_rwlock_wrlock(pthread_rwlock_t *) dontthrow paramsnonnull();
+int pthread_rwlock_rdlock(pthread_rwlock_t *) libcesque paramsnonnull();
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *) libcesque paramsnonnull();
+int pthread_rwlock_trywrlock(pthread_rwlock_t *) libcesque paramsnonnull();
+int pthread_rwlock_unlock(pthread_rwlock_t *) libcesque paramsnonnull();
+int pthread_rwlock_wrlock(pthread_rwlock_t *) libcesque paramsnonnull();
 int pthread_rwlockattr_destroy(pthread_rwlockattr_t *) libcesque paramsnonnull();
 int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *) libcesque paramsnonnull();
 int pthread_rwlockattr_init(pthread_rwlockattr_t *) libcesque paramsnonnull();
@@ -229,21 +212,22 @@ int pthread_setschedprio(pthread_t, int) libcesque;
 int pthread_setspecific(pthread_key_t, const void *) libcesque;
 int pthread_spin_destroy(pthread_spinlock_t *) libcesque paramsnonnull();
 int pthread_spin_init(pthread_spinlock_t *, int) libcesque paramsnonnull();
-int pthread_spin_lock(pthread_spinlock_t *) dontthrow paramsnonnull();
-int pthread_spin_trylock(pthread_spinlock_t *) dontthrow paramsnonnull();
-int pthread_spin_unlock(pthread_spinlock_t *) dontthrow paramsnonnull();
-int pthread_testcancel_np(void) dontthrow;
-int pthread_tryjoin_np(pthread_t, void **) dontthrow;
-int pthread_yield(void) dontthrow;
-int pthread_yield_np(void) dontthrow;
+int pthread_spin_lock(pthread_spinlock_t *) libcesque paramsnonnull();
+int pthread_spin_trylock(pthread_spinlock_t *) libcesque paramsnonnull();
+int pthread_spin_unlock(pthread_spinlock_t *) libcesque paramsnonnull();
+int pthread_testcancel_np(void) libcesque;
+int pthread_tryjoin_np(pthread_t, void **) libcesque;
+int pthread_delay_np(const void *, int) libcesque;
+int pthread_yield_np(void) libcesque;
+int pthread_yield(void) libcesque;
 pthread_id_np_t pthread_getthreadid_np(void) libcesque;
 pthread_t pthread_self(void) libcesque pureconst;
 void *pthread_getspecific(pthread_key_t) libcesque;
-void pthread_cleanup_pop(struct _pthread_cleanup_buffer *, int) dontthrow paramsnonnull();
-void pthread_cleanup_push(struct _pthread_cleanup_buffer *, void (*)(void *), void *) dontthrow paramsnonnull((1));
-void pthread_exit(void *) wontreturn;
-void pthread_pause_np(void) dontthrow;
-void pthread_testcancel(void) dontthrow;
+void pthread_cleanup_pop(struct _pthread_cleanup_buffer *, int) libcesque paramsnonnull();
+void pthread_cleanup_push(struct _pthread_cleanup_buffer *, void (*)(void *), void *) libcesque paramsnonnull((1));
+void pthread_exit(void *) libcesque wontreturn;
+void pthread_testcancel(void) libcesque;
+void pthread_pause_np(void) libcesque;
 
 /* clang-format on */
 
diff --git a/libc/thread/thread2.h b/libc/thread/thread2.h
index a51e48ce2..db1d845ab 100644
--- a/libc/thread/thread2.h
+++ b/libc/thread/thread2.h
@@ -13,12 +13,12 @@ int pthread_attr_getschedparam(const pthread_attr_t *, struct sched_param *) lib
 int pthread_attr_getsigmask_np(const pthread_attr_t *, sigset_t *) libcesque paramsnonnull((1));
 int pthread_attr_setschedparam(pthread_attr_t *, const struct sched_param *) libcesque paramsnonnull();
 int pthread_attr_setsigmask_np(pthread_attr_t *, const sigset_t *) libcesque paramsnonnull((1));
-int pthread_cond_timedwait(pthread_cond_t *, pthread_mutex_t *, const struct timespec *) dontthrow paramsnonnull((1, 2));
+int pthread_cond_timedwait(pthread_cond_t *, pthread_mutex_t *, const struct timespec *) libcesque paramsnonnull((1, 2));
 int pthread_getaffinity_np(pthread_t, size_t, cpu_set_t *) libcesque paramsnonnull();
 int pthread_getschedparam(pthread_t, int *, struct sched_param *) libcesque paramsnonnull();
 int pthread_setaffinity_np(pthread_t, size_t, const cpu_set_t *) libcesque paramsnonnull();
 int pthread_setschedparam(pthread_t, int, const struct sched_param *) libcesque paramsnonnull();
-int pthread_timedjoin_np(pthread_t, void **, struct timespec *) dontthrow;
+int pthread_timedjoin_np(pthread_t, void **, struct timespec *) libcesque;
 
 /* clang-format off */
 COSMOPOLITAN_C_END_
diff --git a/libc/thread/tls.h b/libc/thread/tls.h
index e4c2a73b1..8880e1e8a 100644
--- a/libc/thread/tls.h
+++ b/libc/thread/tls.h
@@ -10,11 +10,12 @@ COSMOPOLITAN_C_START_
 
 struct CosmoFtrace {   /* 16 */
   char ft_once;        /*  0 */
+  char ft_noreentry;   /*  1 */
   int ft_skew;         /*  4 */
   int64_t ft_lastaddr; /*  8 */
 };
 
-/* cosmopolitan thread information block (1024 bytes) */
+/* cosmopolitan thread information block (512 bytes) */
 /* NOTE: update aarch64 libc/errno.h if sizeof changes */
 /* NOTE: update aarch64 libc/proc/vfork.S if sizeof changes */
 /* NOTE: update aarch64 libc/nexgen32e/gc.S if sizeof changes */
@@ -22,10 +23,10 @@ struct CosmoTib {
   struct CosmoTib *tib_self;      /* 0x00 */
   struct CosmoFtrace tib_ftracer; /* 0x08 */
   void *tib_garbages;             /* 0x18 */
-  _Atomic(int32_t) tib_ptid;      /* 0x20 transitions 0 → tid */
+  intptr_t tib_locale;            /* 0x20 */
   intptr_t tib_pthread;           /* 0x28 */
   struct CosmoTib *tib_self2;     /* 0x30 */
-  _Atomic(int32_t) tib_ctid;      /* 0x38 transitions -1 → tid → 0 */
+  _Atomic(int32_t) tib_tid;       /* 0x38 transitions -1 → tid → 0 */
   int32_t tib_errno;              /* 0x3c */
   uint64_t tib_flags;             /* 0x40 */
   int tib_ftrace;                 /* inherited */
@@ -36,12 +37,13 @@ struct CosmoTib {
   char *tib_sigstack_addr;
   uint32_t tib_sigstack_size;
   uint32_t tib_sigstack_flags;
+  _Atomic(int) tib_relock_maps;
   void *tib_nsync;
   void *tib_atexit;
   _Atomic(void *) tib_keys[46];
-  void *tib_locks[64];
 } __attribute__((__aligned__(64)));
 
+extern int __threaded;
 extern char __tls_morphed;
 extern unsigned __tls_index;
 
@@ -78,10 +80,6 @@ forceinline pureconst struct CosmoTib *__get_tls(void) {
 #endif
 }
 
-struct CosmoTib *__get_tls_privileged(void) dontthrow pureconst;
-struct CosmoTib *__get_tls_win32(void) dontthrow;
-void __set_tls_win32(void *) libcesque;
-
 #ifdef __x86_64__
 #define __adj_tls(tib) (tib)
 #elif defined(__aarch64__)
diff --git a/libc/thread/tls2.internal.h b/libc/thread/tls2.internal.h
new file mode 100644
index 000000000..be2e1c02a
--- /dev/null
+++ b/libc/thread/tls2.internal.h
@@ -0,0 +1,43 @@
+#ifndef COSMOPOLITAN_LIBC_THREAD_TLS2_H_
+#define COSMOPOLITAN_LIBC_THREAD_TLS2_H_
+#include "libc/dce.h"
+#include "libc/thread/tls.h"
+COSMOPOLITAN_C_START_
+#if defined(__GNUC__) && defined(__x86_64__)
+
+/**
+ * Returns location of thread information block.
+ *
+ * This should be favored over __get_tls() for .privileged code that
+ * can't be self-modified by __enable_tls().
+ */
+forceinline struct CosmoTib *__get_tls_privileged(void) {
+  char *tib, *lin = (char *)0x30;
+  if (IsNetbsd() || IsOpenbsd()) {
+    __asm__("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
+  } else {
+    __asm__("mov\t%%gs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
+    if (IsWindows())
+      tib = *(char **)(tib + 0x1480 + __tls_index * 8);
+  }
+  return (struct CosmoTib *)tib;
+}
+
+forceinline struct CosmoTib *__get_tls_win32(void) {
+  char *tib, *lin = (char *)0x30;
+  __asm__("mov\t%%gs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
+  tib = *(char **)(tib + 0x1480 + __tls_index * 8);
+  return (struct CosmoTib *)tib;
+}
+
+forceinline void __set_tls_win32(void *tls) {
+  __asm__("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tls));
+}
+
+#elif defined(__aarch64__)
+#define __get_tls_privileged() __get_tls()
+#define __get_tls_win32()      ((struct CosmoTib *)0)
+#define __set_tls_win32(tls)   (void)0
+#endif /* GNU x86-64 */
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_LIBC_THREAD_TLS2_H_ */
diff --git a/libc/thread/ualarm.c b/libc/thread/ualarm.c
index db23b3877..625199c8a 100644
--- a/libc/thread/ualarm.c
+++ b/libc/thread/ualarm.c
@@ -20,7 +20,6 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/itimerval.h"
 #include "libc/calls/struct/timeval.h"
-#include "libc/stdio/sysparam.h"
 #include "libc/sysv/consts/itimer.h"
 
 /**
@@ -37,6 +36,5 @@ unsigned ualarm(unsigned usecs, unsigned reload) {
   it.it_value = timeval_frommicros(usecs);
   it.it_interval = timeval_frommicros(reload);
   npassert(!setitimer(ITIMER_REAL, &it, &old));
-  int64_t us = timeval_tomicros(old.it_value);
-  return MIN(us, -1u);
+  return timeval_tomicros(old.it_value);
 }
diff --git a/libc/tinymath/exp10.c b/libc/tinymath/exp10.c
index f58f0f3f0..f42d7d574 100644
--- a/libc/tinymath/exp10.c
+++ b/libc/tinymath/exp10.c
@@ -43,7 +43,7 @@ special_case (uint64_t sbits, double_t tmp, uint64_t ki)
 {
   double_t scale, y;
 
-  if ((ki & 0x80000000) == 0)
+  if (ki - (1ull << 16) < 0x80000000)
     {
       /* The exponent of scale might have overflowed by 1.  */
       sbits -= 1ull << 52;
@@ -109,14 +109,14 @@ exp10 (double x)
   /* Reduce x: z = x * N / log10(2), k = round(z).  */
   double_t z = __exp_data.invlog10_2N * x;
   double_t kd;
-  uint64_t ki;
+  int64_t ki;
 #if TOINT_INTRINSICS
   kd = roundtoint (z);
   ki = converttoint (z);
 #else
   kd = eval_as_double (z + Shift);
-  ki = asuint64 (kd);
   kd -= Shift;
+  ki = kd;
 #endif
 
   /* r = x - k * log10(2), r in [-0.5, 0.5].  */
@@ -155,6 +155,5 @@ exp10 (double x)
 
 __strong_reference(exp10, pow10);
 #if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
-__weak_reference(exp10, pow10l);
 __weak_reference(exp10, exp10l);
 #endif
diff --git a/libc/tinymath/exp10l.c b/libc/tinymath/exp10l.c
index d41fd35d3..1111bde6e 100644
--- a/libc/tinymath/exp10l.c
+++ b/libc/tinymath/exp10l.c
@@ -52,6 +52,6 @@ long double exp10l(long double x)
 	return powl(10.0, x);
 }
 
-__strong_reference(exp10l, pow10l);
+__weak_reference(exp10l, pow10l);
 
 #endif /* long double is long */
diff --git a/libc/tinymath/powl.c b/libc/tinymath/powl.c
index 0e015082d..3ed4cd9e4 100644
--- a/libc/tinymath/powl.c
+++ b/libc/tinymath/powl.c
@@ -930,7 +930,7 @@ powl(long double x, long double y)
   z = one - (r - z);
   o.value = z;
   j = o.parts32.mswhi;
-  j += (int32_t)((uint32_t)n << 16); // TODO(jart): why ubsan
+  j += (n << 16);
   if ((j >> 16) <= 0)
     z = scalbnl (z, n);	/* subnormal output */
   else
diff --git a/libc/vga/rlinit-init-vga.S b/libc/vga/rlinit-init-vga.S
index 6a759cb73..70ff1ccb6 100644
--- a/libc/vga/rlinit-init-vga.S
+++ b/libc/vga/rlinit-init-vga.S
@@ -24,7 +24,7 @@
 │ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR        │
 │ OTHER DEALINGS IN THE SOFTWARE.                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/mman.internal.h"
 #include "libc/vga/vga.internal.h"
 
diff --git a/libc/vga/rlinit-vesa.S b/libc/vga/rlinit-vesa.S
index dfa1922f8..f1a91b1a0 100644
--- a/libc/vga/rlinit-vesa.S
+++ b/libc/vga/rlinit-vesa.S
@@ -25,7 +25,7 @@
 │ OTHER DEALINGS IN THE SOFTWARE.                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "ape/relocations.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/mman.internal.h"
 #include "libc/vga/vga.internal.h"
 
diff --git a/libc/vga/tty-graph.c b/libc/vga/tty-graph.c
index 1c2ad77df..55c4c409f 100644
--- a/libc/vga/tty-graph.c
+++ b/libc/vga/tty-graph.c
@@ -25,7 +25,7 @@
 │ OTHER DEALINGS IN THE SOFTWARE.                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/newbie.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/vga/vga.internal.h"
 
diff --git a/libc/vga/tty-graph.inc b/libc/vga/tty-graph.inc
index dc646ad2d..392e447f0 100644
--- a/libc/vga/tty-graph.inc
+++ b/libc/vga/tty-graph.inc
@@ -25,7 +25,7 @@
 │ OTHER DEALINGS IN THE SOFTWARE.                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/newbie.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
 #include "libc/vga/vga.internal.h"
diff --git a/libc/vga/tty-klog.greg.c b/libc/vga/tty-klog.greg.c
index 35afe5380..e841034b2 100644
--- a/libc/vga/tty-klog.greg.c
+++ b/libc/vga/tty-klog.greg.c
@@ -25,7 +25,7 @@
 │ OTHER DEALINGS IN THE SOFTWARE.                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/newbie.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/mman.internal.h"
 #include "libc/str/str.h"
 #include "libc/vga/vga.internal.h"
diff --git a/libc/vga/tty.greg.c b/libc/vga/tty.greg.c
index 7b2738a3b..ad1f009d7 100644
--- a/libc/vga/tty.greg.c
+++ b/libc/vga/tty.greg.c
@@ -167,6 +167,7 @@ void _StartTty(struct Tty *tty, unsigned char type, unsigned short yp,
                unsigned short startx, unsigned char yc, unsigned char xc,
                void *fb, unsigned init_flags) {
   unsigned short yn, xn, xs = xp * sizeof(TtyCanvasColor);
+  struct DirectMap dm;
   bzero(tty, sizeof(struct Tty));
   SetYp(tty, yp);
   SetXp(tty, xp);
@@ -182,9 +183,9 @@ void _StartTty(struct Tty *tty, unsigned char type, unsigned short yp,
       tty->canvas = fb;
       xs = xsfb;
     } else {
-      void *addr = sys_mmap_metal(NULL, (size_t)yp * xs, PROT_READ | PROT_WRITE,
-                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-      if (addr == (void *)-1) {
+      dm = sys_mmap_metal(NULL, (size_t)yp * xs, PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      if (dm.addr == (void *)-1) {
         /*
          * We are a bit low on memory.  Try to go on anyway, & initialize
          * our tty as an emergency console.
@@ -193,7 +194,7 @@ void _StartTty(struct Tty *tty, unsigned char type, unsigned short yp,
         tty->canvas = fb;
         xs = xsfb;
       } else
-        tty->canvas = addr;
+        tty->canvas = dm.addr;
     }
   }
   SetYn(tty, yn);
diff --git a/libc/x/BUILD.mk b/libc/x/BUILD.mk
index 343fcb5a1..dc1de97a8 100644
--- a/libc/x/BUILD.mk
+++ b/libc/x/BUILD.mk
@@ -33,14 +33,11 @@ LIBC_X_A_DIRECTDEPS =				\
 	LIBC_PROC				\
 	LIBC_RUNTIME				\
 	LIBC_NT_KERNEL32			\
-	LIBC_NT_ADVAPI32			\
 	LIBC_STDIO				\
-	LIBC_SOCK				\
 	LIBC_STR				\
 	LIBC_SYSV				\
 	THIRD_PARTY_GDTOA			\
-	THIRD_PARTY_MUSL			\
-	THIRD_PARTY_TZ				\
+	THIRD_PARTY_MUSL
 
 LIBC_X_A_DEPS :=				\
 	$(call uniq,$(foreach x,$(LIBC_X_A_DIRECTDEPS),$($(x))))
diff --git a/libc/x/utf16to8.c b/libc/x/utf16to8.c
index dfcd4dea3..219c2e2a9 100644
--- a/libc/x/utf16to8.c
+++ b/libc/x/utf16to8.c
@@ -17,13 +17,21 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/bsr.h"
+#include "libc/intrin/packsswb.h"
+#include "libc/intrin/pandn.h"
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/intrin/pcmpgtw.h"
+#include "libc/intrin/pmovmskb.h"
+#include "libc/intrin/punpckhbw.h"
+#include "libc/intrin/punpcklbw.h"
 #include "libc/mem/mem.h"
 #include "libc/serialize.h"
 #include "libc/str/str.h"
 #include "libc/str/thompike.h"
 #include "libc/str/utf16.h"
 #include "libc/x/x.h"
-#include "third_party/intel/emmintrin.internal.h"
+
+static const int16_t kDel16[8] = {127, 127, 127, 127, 127, 127, 127, 127};
 
 /**
  * Transcodes UTF-16 to UTF-8.
@@ -37,27 +45,28 @@ char *utf16to8(const char16_t *p, size_t n, size_t *z) {
   char *r, *q;
   wint_t x, y;
   const char16_t *e;
+  int16_t v1[8], v2[8], v3[8], vz[8];
   if (z)
     *z = 0;
   if (n == -1)
     n = p ? strlen16(p) : 0;
   if ((q = r = malloc(n * 4 + 8 + 1))) {
     for (e = p + n; p < e;) {
-#if defined(__x86_64__)
-      if (p + 8 < e) {
+      if (p + 8 < e) { /* 17x ascii */
+        bzero(vz, 16);
         do {
-          __m128i v1 = _mm_loadu_si128((__m128i *)p);
-          __m128i v2 = _mm_cmpgt_epi16(v1, _mm_setzero_si128());
-          __m128i v3 = _mm_cmpgt_epi16(v1, _mm_set1_epi16(127));
-          v2 = _mm_andnot_si128(v3, v2);
-          if (_mm_movemask_epi8(v2) != 0xFFFF)
+          memcpy(v1, p, 16);
+          pcmpgtw(v2, v1, vz);
+          pcmpgtw(v3, v1, kDel16);
+          pandn((void *)v2, (void *)v3, (void *)v2);
+          if (pmovmskb((void *)v2) != 0xFFFF)
             break;
-          _mm_storel_epi64((__m128i *)q, _mm_packs_epi16(v1, v1));
+          packsswb((void *)v1, v1, v1);
+          memcpy(q, v1, 8);
           p += 8;
           q += 8;
         } while (p + 8 < e);
       }
-#endif
       x = *p++ & 0xffff;
       if (!IsUcs2(x)) {
         if (p < e) {
diff --git a/libc/x/utf8to32.c b/libc/x/utf8to32.c
index 15170e1a2..f1a8568cc 100644
--- a/libc/x/utf8to32.c
+++ b/libc/x/utf8to32.c
@@ -16,12 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/likely.h"
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/intrin/pmovmskb.h"
+#include "libc/intrin/punpckhbw.h"
+#include "libc/intrin/punpckhwd.h"
+#include "libc/intrin/punpcklbw.h"
+#include "libc/intrin/punpcklwd.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "libc/str/thompike.h"
 #include "libc/str/utf16.h"
 #include "libc/x/x.h"
-#include "third_party/intel/emmintrin.internal.h"
 
 /**
  * Transcodes UTF-8 to UTF-32.
@@ -35,35 +41,35 @@ wchar_t *utf8to32(const char *p, size_t n, size_t *z) {
   unsigned m, j;
   wint_t x, a, b;
   wchar_t *r, *q;
+  uint8_t v1[16], v2[16], v3[16], v4[16], vz[16];
   if (z)
     *z = 0;
   if (n == -1)
     n = p ? strlen(p) : 0;
   if ((q = r = malloc(n * sizeof(wchar_t) + sizeof(wchar_t)))) {
     for (i = 0; i < n;) {
-#ifdef __x86_64__
       if (!((uintptr_t)(p + i) & 15) && i + 16 < n) {
+        /* 10x speedup for ascii */
+        bzero(vz, 16);
         do {
-          __m128i v1, v2, v3, v4;
-          v1 = _mm_loadu_si128((__m128i *)(p + i));
-          v2 = _mm_cmpgt_epi8(v1, _mm_setzero_si128());
-          if (_mm_movemask_epi8(v2) != 0xFFFF)
+          memcpy(v1, p + i, 16);
+          pcmpgtb((int8_t *)v2, (int8_t *)v1, (int8_t *)vz);
+          if (pmovmskb(v2) != 0xFFFF)
             break;
-          v3 = _mm_unpacklo_epi8(v1, _mm_setzero_si128());
-          v1 = _mm_unpackhi_epi8(v1, _mm_setzero_si128());
-          v4 = _mm_unpacklo_epi16(v3, _mm_setzero_si128());
-          v3 = _mm_unpackhi_epi16(v3, _mm_setzero_si128());
-          v2 = _mm_unpacklo_epi16(v1, _mm_setzero_si128());
-          v1 = _mm_unpackhi_epi16(v1, _mm_setzero_si128());
-          _mm_storeu_si128((__m128i *)(q + 0), v4);
-          _mm_storeu_si128((__m128i *)(q + 4), v3);
-          _mm_storeu_si128((__m128i *)(q + 8), v2);
-          _mm_storeu_si128((__m128i *)(q + 12), v1);
+          punpcklbw(v3, v1, vz);
+          punpckhbw(v1, v1, vz);
+          punpcklwd((void *)v4, (void *)v3, (void *)vz);
+          punpckhwd((void *)v3, (void *)v3, (void *)vz);
+          punpcklwd((void *)v2, (void *)v1, (void *)vz);
+          punpckhwd((void *)v1, (void *)v1, (void *)vz);
+          memcpy(q + 0, v4, 16);
+          memcpy(q + 4, v3, 16);
+          memcpy(q + 8, v2, 16);
+          memcpy(q + 12, v1, 16);
           i += 16;
           q += 16;
         } while (i + 16 < n);
       }
-#endif
       x = p[i++] & 0xff;
       if (x >= 0300) {
         a = ThomPikeByte(x);
diff --git a/libc/zip.h b/libc/zip.internal.h
similarity index 100%
rename from libc/zip.h
rename to libc/zip.internal.h
diff --git a/net/finger/fingersyn.c b/net/finger/fingersyn.c
index e46a091e2..eb3d24fab 100644
--- a/net/finger/fingersyn.c
+++ b/net/finger/fingersyn.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/bsr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 /**
  * Fingers IP+TCP SYN packet.
diff --git a/net/http/base32.c b/net/http/base32.c
index 3d8ed1160..c9a3af12f 100644
--- a/net/http/base32.c
+++ b/net/http/base32.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 
diff --git a/net/http/decodebase64.c b/net/http/decodebase64.c
index a3aeaa38d..a9b19288a 100644
--- a/net/http/decodebase64.c
+++ b/net/http/decodebase64.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "net/http/escape.h"
diff --git a/net/http/decodelatin1.c b/net/http/decodelatin1.c
index ce9aed209..4799d8a9d 100644
--- a/net/http/decodelatin1.c
+++ b/net/http/decodelatin1.c
@@ -16,6 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/intrin/pmovmskb.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "net/http/escape.h"
@@ -32,12 +34,23 @@ char *DecodeLatin1(const char *p, size_t n, size_t *z) {
   int c;
   size_t i;
   char *r, *q;
+  int8_t v1[16], v2[16], vz[16];
   if (z)
     *z = 0;
   if (n == -1)
     n = p ? strlen(p) : 0;
   if ((q = r = malloc(n * 2 + 1))) {
     for (i = 0; i < n;) {
+      bzero(vz, 16); /* 3x speedup for ASCII */
+      while (i + 16 < n) {
+        memcpy(v1, p + i, 16);
+        pcmpgtb(v2, v1, vz);
+        if (pmovmskb((void *)v2) != 0xFFFF)
+          break;
+        memcpy(q, v1, 16);
+        q += 16;
+        i += 16;
+      }
       c = p[i++] & 0xff;
       if (c < 0200) {
         *q++ = c;
diff --git a/net/http/encodelatin1.c b/net/http/encodelatin1.c
index 9d3bc0ed8..4d6798ec7 100644
--- a/net/http/encodelatin1.c
+++ b/net/http/encodelatin1.c
@@ -17,6 +17,8 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/intrin/pmovmskb.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
diff --git a/net/http/encodeurl.c b/net/http/encodeurl.c
index 455a2bdc3..acb7e09d3 100644
--- a/net/http/encodeurl.c
+++ b/net/http/encodeurl.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "net/http/escape.h"
 #include "net/http/url.h"
 
diff --git a/net/http/findcontenttype.c b/net/http/findcontenttype.c
index 587a05cad..5ad6a3a81 100644
--- a/net/http/findcontenttype.c
+++ b/net/http/findcontenttype.c
@@ -18,10 +18,10 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/intrin/bswap.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/serialize.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "net/http/http.h"
 
 static const struct ContentTypeExtension {
diff --git a/net/http/formathttpdatetime.c b/net/http/formathttpdatetime.c
index e0b6d5c1b..80089e3d0 100644
--- a/net/http/formathttpdatetime.c
+++ b/net/http/formathttpdatetime.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/time.h"
 #include "net/http/http.h"
diff --git a/net/http/gethttpheader.inc b/net/http/gethttpheader.inc
index 8e58c9279..72f3b7afe 100644
--- a/net/http/gethttpheader.inc
+++ b/net/http/gethttpheader.inc
@@ -33,7 +33,7 @@
 #line 1 "gethttpheader.gperf"
 
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "net/http/http.h"
 #define GPERF_DOWNCASE
 #line 12 "gethttpheader.gperf"
diff --git a/net/http/gethttpreason.c b/net/http/gethttpreason.c
index e4492f914..276a75baa 100644
--- a/net/http/gethttpreason.c
+++ b/net/http/gethttpreason.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/itoa.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "net/http/http.h"
 
 static const struct thatispacked HttpReason {
diff --git a/net/http/ismimetype.c b/net/http/ismimetype.c
index 17a0956d6..a810a0585 100644
--- a/net/http/ismimetype.c
+++ b/net/http/ismimetype.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "net/http/http.h"
 
 /**
diff --git a/net/http/isnocompressext.c b/net/http/isnocompressext.c
index 49b21be28..10b6b7e72 100644
--- a/net/http/isnocompressext.c
+++ b/net/http/isnocompressext.c
@@ -17,10 +17,10 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/bswap.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/serialize.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "net/http/http.h"
 
 static const char kNoCompressExts[][8] = {
diff --git a/net/http/parsehttpmessage.c b/net/http/parsehttpmessage.c
index 8a0429a41..1a52fce57 100644
--- a/net/http/parsehttpmessage.c
+++ b/net/http/parsehttpmessage.c
@@ -25,7 +25,7 @@
 #include "libc/serialize.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/x/x.h"
 #include "net/http/http.h"
@@ -47,7 +47,6 @@ void DestroyHttpMessage(struct HttpMessage *r) {
     free(r->xheaders.p);
     r->xheaders.p = NULL;
     r->xheaders.n = 0;
-    r->xheaders.c = 0;
   }
 }
 
diff --git a/net/http/parsehttpmethod.c b/net/http/parsehttpmethod.c
index 152dadc2e..bb71041d4 100644
--- a/net/http/parsehttpmethod.c
+++ b/net/http/parsehttpmethod.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "net/http/http.h"
 
 /**
diff --git a/net/http/parsehttprange.c b/net/http/parsehttprange.c
index 4974482dc..80aa738b7 100644
--- a/net/http/parsehttprange.c
+++ b/net/http/parsehttprange.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
 #include "net/http/http.h"
diff --git a/net/http/parseurl.c b/net/http/parseurl.c
index 16ffd4b58..c967f4543 100644
--- a/net/http/parseurl.c
+++ b/net/http/parseurl.c
@@ -20,7 +20,7 @@
 #include "libc/limits.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/x/x.h"
 #include "net/http/escape.h"
 #include "net/http/url.h"
diff --git a/net/http/unchunk.c b/net/http/unchunk.c
index cbacd179e..d3fe94bfe 100644
--- a/net/http/unchunk.c
+++ b/net/http/unchunk.c
@@ -16,9 +16,9 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/errfuns.h"
 #include "net/http/escape.h"
 #include "net/http/http.h"
diff --git a/net/http/underlong.c b/net/http/underlong.c
index 1d0906582..a48e7f48c 100644
--- a/net/http/underlong.c
+++ b/net/http/underlong.c
@@ -16,6 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/pcmpgtb.h"
+#include "libc/intrin/pmovmskb.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "libc/str/thompike.h"
@@ -38,12 +40,23 @@ char *Underlong(const char *p, size_t n, size_t *z) {
   char *r, *q;
   size_t i, j, m;
   wint_t x, a, b;
+  int8_t v1[16], v2[16], vz[16];
   if (z)
     *z = 0;
   if (n == -1)
     n = p ? strlen(p) : 0;
   if ((q = r = malloc(n * 2 + 1))) {
     for (i = 0; i < n;) {
+      bzero(vz, 16); /* 50x speedup for ASCII */
+      while (i + 16 < n) {
+        memcpy(v1, p + i, 16);
+        pcmpgtb(v2, v1, vz);
+        if (pmovmskb((void *)v2) != 0xFFFF)
+          break;
+        memcpy(q, v1, 16);
+        q += 16;
+        i += 16;
+      }
       x = p[i++] & 0xff;
       if (x >= 0300) {
         a = ThomPikeByte(x);
diff --git a/net/https/describesslverifyfailure.c b/net/https/describesslverifyfailure.c
index 44b22a809..46e0056ab 100644
--- a/net/https/describesslverifyfailure.c
+++ b/net/https/describesslverifyfailure.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "net/https/https.h"
diff --git a/net/turfwar/BUILD.mk b/net/turfwar/BUILD.mk
index 0a49ad12f..ffa291ea9 100644
--- a/net/turfwar/BUILD.mk
+++ b/net/turfwar/BUILD.mk
@@ -9,8 +9,8 @@ NET_TURFWAR_OBJS =					\
 	$(NET_TURFWAR_SRCS:%.c=o/$(MODE)/%.o)
 
 NET_TURFWAR_COMS =					\
-	$(NET_TURFWAR_SRCS:%.c=o/$(MODE)/%)		\
-	o/$(MODE)/net/turfwar/turfbean
+	$(NET_TURFWAR_SRCS:%.c=o/$(MODE)/%.com)		\
+	o/$(MODE)/net/turfwar/turfbean.com
 
 NET_TURFWAR_BINS =					\
 	$(NET_TURFWAR_COMS)				\
@@ -28,7 +28,6 @@ NET_TURFWAR_DIRECTDEPS =				\
 	LIBC_SOCK					\
 	LIBC_STDIO					\
 	LIBC_STR					\
-	LIBC_SYSTEM					\
 	LIBC_SYSV					\
 	LIBC_THREAD					\
 	LIBC_X						\
@@ -40,7 +39,7 @@ NET_TURFWAR_DIRECTDEPS =				\
 	THIRD_PARTY_SQLITE3				\
 	THIRD_PARTY_STB					\
 	THIRD_PARTY_TZ					\
-	THIRD_PARTY_ZLIB				\
+	THIRD_PARTY_ZLIB
 
 NET_TURFWAR_DEPS :=					\
 	$(call uniq,$(foreach x,$(NET_TURFWAR_DIRECTDEPS),$($(x))))
@@ -49,7 +48,7 @@ o/$(MODE)/net/turfwar/turfwar.pkg:			\
 		$(NET_TURFWAR_OBJS)			\
 		$(foreach x,$(NET_TURFWAR_DIRECTDEPS),$($(x)_A).pkg)
 
-o/$(MODE)/net/turfwar/%.dbg:				\
+o/$(MODE)/net/turfwar/%.com.dbg:			\
 		$(NET_TURFWAR_DEPS)			\
 		o/$(MODE)/net/turfwar/%.o		\
 		o/$(MODE)/net/turfwar/turfwar.pkg	\
@@ -57,7 +56,7 @@ o/$(MODE)/net/turfwar/%.dbg:				\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
-o/$(MODE)/net/turfwar/turfbean.dbg:			\
+o/$(MODE)/net/turfwar/turfbean.com.dbg:			\
 		$(TOOL_NET_DEPS)			\
 		o/$(MODE)/tool/net/redbean.o		\
 		$(TOOL_NET_REDBEAN_LUA_MODULES)		\
diff --git a/net/turfwar/turfwar.c b/net/turfwar/turfwar.c
index f2cf5ee3f..6a0b956cc 100644
--- a/net/turfwar/turfwar.c
+++ b/net/turfwar/turfwar.c
@@ -17,67 +17,80 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
-#include "libc/atomic.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/pledge.h"
 #include "libc/calls/struct/iovec.h"
 #include "libc/calls/struct/rusage.h"
 #include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/siginfo.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/calls/struct/sysinfo.h"
 #include "libc/calls/struct/timespec.h"
-#include "libc/calls/struct/ucontext.internal.h"
-#include "libc/calls/ucontext.h"
+#include "libc/calls/struct/timeval.h"
 #include "libc/ctype.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/atomic.h"
-#include "libc/intrin/iscall.h"
+#include "libc/intrin/bsr.h"
+#include "libc/intrin/hilbert.h"
 #include "libc/intrin/kprintf.h"
+#include "libc/intrin/strace.h"
+#include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/mem/sortedints.internal.h"
-#include "libc/nexgen32e/stackframe.h"
+#include "libc/nexgen32e/crc32.h"
 #include "libc/paths.h"
+#include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
+#include "libc/runtime/stack.h"
 #include "libc/runtime/sysconf.h"
+#include "libc/serialize.h"
 #include "libc/sock/sock.h"
+#include "libc/sock/struct/pollfd.h"
 #include "libc/sock/struct/sockaddr.h"
 #include "libc/stdio/append.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/stdio.h"
 #include "libc/str/slice.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/af.h"
 #include "libc/sysv/consts/clock.h"
-#include "libc/sysv/consts/f.h"
 #include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/poll.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/rusage.h"
-#include "libc/sysv/consts/sa.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/consts/so.h"
 #include "libc/sysv/consts/sock.h"
 #include "libc/sysv/consts/sol.h"
 #include "libc/sysv/consts/tcp.h"
-#include "libc/sysv/consts/timer.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/thread2.h"
 #include "libc/time.h"
 #include "libc/x/x.h"
-#include "libc/zip.h"
+#include "libc/x/xasprintf.h"
+#include "libc/zip.internal.h"
 #include "net/http/escape.h"
 #include "net/http/http.h"
 #include "net/http/ip.h"
 #include "net/http/tokenbucket.h"
 #include "net/http/url.h"
 #include "third_party/getopt/getopt.internal.h"
+#include "third_party/nsync/counter.h"
+#include "third_party/nsync/cv.h"
+#include "third_party/nsync/mu.h"
+#include "third_party/nsync/note.h"
+#include "third_party/nsync/time.h"
 #include "third_party/sqlite3/sqlite3.h"
+#include "third_party/stb/stb_image_write.h"
+#include "third_party/zlib/zconf.h"
 #include "third_party/zlib/zlib.h"
+#include "tool/net/lfuncs.h"
 
 /**
  * @fileoverview production webserver for turfwar online game
@@ -85,6 +98,8 @@
 
 #define PORT               8080    // default server listening port
 #define CPUS               64      // number of cpus to actually use
+#define XN                 64      // plot width in pixels
+#define YN                 64      // plot height in pixels
 #define WORKERS            500     // size of http client thread pool
 #define SUPERVISE_MS       1000    // how often to stat() asset files
 #define KEEPALIVE_MS       60000   // max time to keep idle conn open
@@ -94,6 +109,7 @@
 #define SCORE_W_UPDATE_MS  70000   // how often to regenerate /score/week
 #define SCORE_M_UPDATE_MS  100000  // how often to regenerate /score/month
 #define SCORE_UPDATE_MS    210000  // how often to regenerate /score
+#define PLOTS_UPDATE_MS    999000  // how often to regenerate /plot/xxx
 #define ACCEPT_DEADLINE_MS 100     // how long accept() can take to find worker
 #define CLAIM_DEADLINE_MS  100     // how long /claim may block if queue is full
 #define CONCERN_LOAD       .75     // avoid keepalive, upon this connection load
@@ -108,7 +124,7 @@
 #define MSG_BUF            512     // small response lookaside
 
 #define INBUF_SIZE  65536
-#define OUTBUF_SIZE 65536
+#define OUTBUF_SIZE 8192
 
 #define TB_BYTES (1u << TB_CIDR)
 #define TB_WORDS (TB_BYTES / 8)
@@ -219,10 +235,9 @@ struct Data {
 };
 
 struct Asset {
-  atomic_bool ready;
   int cash;
   char *path;
-  pthread_rwlock_t lock;
+  nsync_mu lock;
   const char *type;
   struct Data data;
   struct Data gzip;
@@ -241,23 +256,18 @@ struct Blackhole {
 // cli flags
 bool g_integrity;
 bool g_daemonize;
-int g_crash_fd;
 int g_port = PORT;
 int g_workers = WORKERS;
 int g_keepalive = KEEPALIVE_MS;
 struct SortedInts g_whitelisted;
-thread_local char last_message[INBUF_SIZE];
-sig_atomic_t is_shutting_down;
-
-// threads
-pthread_t g_listener;
-pthread_t scorer, recenter, claimer, replenisher;
-pthread_t scorer_hour, scorer_day, scorer_week, scorer_month;
 
 // lifecycle vars
-struct timespec g_started;
+pthread_t g_listener;
+nsync_time g_started;
+nsync_counter g_ready;
 atomic_int g_connections;
-atomic_int g_worker_threads;
+nsync_note g_shutdown[3];
+int g_hilbert[YN * XN][2];
 
 // whitebox metrics
 atomic_long g_banned;
@@ -298,23 +308,25 @@ union TokenBucket {
 // http worker objects
 struct Worker {
   pthread_t th;
-  atomic_bool dead;
   atomic_int msgcount;
+  atomic_int shutdown;
   atomic_int connected;
   struct timespec startread;
-  char *msgbuf;
-  char *inbuf;
-  char *outbuf;
-  struct HttpMessage *msg;
-  struct Client *client;
 } *g_worker;
 
 // recentworker wakeup
 struct Recent {
-  pthread_mutex_t mu;
-  pthread_cond_t cv;
+  nsync_mu mu;
+  nsync_cv cv;
 } g_recent;
 
+// global date header
+struct Nowish {
+  nsync_mu lock;
+  struct timespec ts;
+  struct tm tm;
+} g_nowish;
+
 // static assets
 struct Assets {
   struct Asset index;
@@ -327,15 +339,16 @@ struct Assets {
   struct Asset score_month;
   struct Asset recent;
   struct Asset favicon;
+  struct Asset plot[256];
 } g_asset;
 
 // queues ListenWorker() to HttpWorker()
 struct Clients {
   int pos;
   int count;
-  pthread_mutex_t mu;
-  pthread_cond_t non_full;
-  pthread_cond_t non_empty;
+  nsync_mu mu;
+  nsync_cv non_full;
+  nsync_cv non_empty;
   struct Client {
     int sock;
     uint32_t size;
@@ -347,9 +360,9 @@ struct Clients {
 struct Claims {
   int pos;
   int count;
-  pthread_mutex_t mu;
-  pthread_cond_t non_full;
-  pthread_cond_t non_empty;
+  nsync_mu mu;
+  nsync_cv non_full;
+  nsync_cv non_empty;
   struct Claim {
     uint32_t ip;
     int64_t created;
@@ -378,25 +391,25 @@ struct timespec WaitFor(int millis) {
 bool CheckMem(const char *file, int line, void *ptr) {
   if (ptr)
     return true;
-  kprintf("%s:%d: %H: out of memory: %s\n", file, line, strerror(errno));
+  kprintf("%s:%d: %P: out of memory: %s\n", file, line, strerror(errno));
   return false;
 }
 bool CheckSys(const char *file, int line, long rc) {
   if (rc != -1)
     return true;
-  kprintf("%s:%d: %H: %s\n", file, line, strerror(errno));
+  kprintf("%s:%d: %P: %s\n", file, line, strerror(errno));
   return false;
 }
 bool CheckSql(const char *file, int line, int rc) {
   if (rc == SQLITE_OK)
     return true;
-  kprintf("%s:%d: %H: %s\n", file, line, sqlite3_errstr(rc));
+  kprintf("%s:%d: %P: %s\n", file, line, sqlite3_errstr(rc));
   return false;
 }
 bool CheckDb(const char *file, int line, int rc, sqlite3 *db) {
   if (rc == SQLITE_OK)
     return true;
-  kprintf("%s:%d: %H: %s: %s\n", file, line, sqlite3_errstr(rc),
+  kprintf("%s:%d: %P: %s: %s\n", file, line, sqlite3_errstr(rc),
           sqlite3_errmsg(db));
   return false;
 }
@@ -488,187 +501,51 @@ bool IsValidNick(const char *s, size_t n) {
   return true;
 }
 
-struct Clock {
-  atomic_uint roll;
-  atomic_ulong time;
-  atomic_ulong date;
-};
-
-static struct Clock g_clck[2];
-static pthread_t g_time_thread;
-
-static void set_clck(struct Clock *clck, long time, long date) {
-  unsigned long roll;
-  roll = atomic_fetch_add_explicit(&clck->roll, 1, memory_order_relaxed);
-  time &= 0xffffffffffff;
-  date &= 0xffffffffffff;
-  time |= roll << 48;
-  date |= roll << 48;
-  atomic_store_explicit(&clck->time, time, memory_order_relaxed);
-  atomic_store_explicit(&clck->date, date, memory_order_relaxed);
-}
-
-static void get_clck(struct Clock *clck, long *out_time, long *out_date) {
-  long time, date;
-  do {
-    time = atomic_load_explicit(&clck->time, memory_order_relaxed);
-    date = atomic_load_explicit(&clck->date, memory_order_relaxed);
-  } while ((time >> 48) != (date >> 48));
-  *out_date = date & 0xffffffffffff;
-  *out_time = time & 0xffffffffffff;
-}
-
-static long encode_date(const struct tm *tm) {
-  long date;
-  date = tm->tm_year;
-  date <<= 4;
-  date |= tm->tm_isdst == 1;
-  date <<= 1;
-  date |= tm->tm_mon;
-  date <<= 5;
-  date |= tm->tm_mday;
-  date <<= 3;
-  date |= tm->tm_wday;
-  date <<= 5;
-  date |= tm->tm_hour;
-  date <<= 6;
-  date |= tm->tm_min;
-  date <<= 6;
-  date |= tm->tm_sec;
-  return date;
-}
-
-static void decode_date(long date, struct tm *tm) {
-  tm->tm_sec = date & 63;
-  date >>= 6;
-  tm->tm_min = date & 63;
-  date >>= 6;
-  tm->tm_hour = date & 31;
-  date >>= 5;
-  tm->tm_wday = date & 7;
-  date >>= 3;
-  tm->tm_mday = date & 31;
-  date >>= 5;
-  tm->tm_mon = date & 15;
-  date >>= 4;
-  tm->tm_isdst = date & 1;
-  date >>= 1;
-  tm->tm_year = date;
-  tm->tm_gmtoff = 0;  // unsupported
-  tm->tm_zone = 0;    // unsupported
-  tm->tm_yday = 0;    // unsupported
-}
-
-static void update_time() {
-  struct tm tm;
-  struct timespec ts;
-  clock_gettime(0, &ts);
-  gmtime_r(&ts.tv_sec, &tm);
-  set_clck(&g_clck[0], ts.tv_sec, encode_date(&tm));
-  localtime_r(&ts.tv_sec, &tm);
-  set_clck(&g_clck[1], ts.tv_sec, encode_date(&tm));
-}
-
-static void *time_worker(void *arg) {
-  sigset_t ss;
-  sigemptyset(&ss);
-  sigaddset(&ss, SIGHUP);
-  sigaddset(&ss, SIGINT);
-  sigaddset(&ss, SIGQUIT);
-  sigaddset(&ss, SIGTERM);
-  sigaddset(&ss, SIGUSR1);
-  sigaddset(&ss, SIGALRM);
-  pthread_sigmask(SIG_SETMASK, &ss, 0);
-  pthread_setname_np(pthread_self(), "localtime");
-  for (;;) {
-    sleep(10);
-    update_time();
-  }
-  return nullptr;
-}
-
-void time_init() {
-  update_time();
-  if (pthread_create(&g_time_thread, 0, time_worker, 0))
-    __builtin_trap();
-}
-
-void time_destroy() {
-  pthread_cancel(g_time_thread);
-  if (pthread_join(g_time_thread, 0))
-    __builtin_trap();
-}
-
-static const char kMonDays[2][12] = {
-    {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
-    {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
-};
-
-static void time_lockless(struct Clock *clck, long now, struct tm *tm) {
-  long time, date, since;
-  get_clck(clck, &time, &date);
-  decode_date(date, tm);
-  since = now - time;
-  since = since < 60 ? since : 60;
-  for (; since > 0; --since) {
-    if (++tm->tm_sec >= 60) {
-      tm->tm_sec = 0;
-      if (++tm->tm_min >= 60) {
-        tm->tm_min = 0;
-        if (++tm->tm_hour >= 24) {
-          tm->tm_hour = 0;
-          if (++tm->tm_mday >= 7)
-            tm->tm_mday = 0;
-          if (++tm->tm_mday > kMonDays[!!tm->tm_isdst][tm->tm_mon]) {
-            tm->tm_mday = 1;
-            if (++tm->tm_mon >= 12) {
-              tm->tm_mon = 0;
-              ++tm->tm_year;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void gmtime_lockless(long now, struct tm *tm) {
-  time_lockless(&g_clck[0], now, tm);
-}
-
-void localtime_lockless(long now, struct tm *tm) {
-  time_lockless(&g_clck[1], now, tm);
-}
-
 // turn unix timestamp into string the easy way
 char *FormatUnixHttpDateTime(char *s, int64_t t) {
   struct tm tm;
-  gmtime_lockless(t, &tm);
+  gmtime_r(&t, &tm);
   FormatHttpDateTime(s, &tm);
   return s;
 }
 
+// gmtime_r() does a shocking amount of compute
+// so we try to handle that globally right here
+void UpdateNow(void) {
+  int64_t secs;
+  struct tm tm;
+  g_nowish.ts = timespec_real();
+  secs = g_nowish.ts.tv_sec;
+  gmtime_r(&secs, &tm);
+  //!//!//!//!//!//!//!//!//!//!//!//!//!/
+  nsync_mu_lock(&g_nowish.lock);
+  g_nowish.tm = tm;
+  nsync_mu_unlock(&g_nowish.lock);
+  //!//!//!//!//!//!//!//!//!//!//!//!//!/
+}
+
 // the standard strftime() function is dismally slow
 // this function is non-generalized for just http so
 // it needs 25 cycles rather than 709 cycles so cool
 char *FormatDate(char *p) {
-  return FormatUnixHttpDateTime(p, timespec_real().tv_sec);
+  ////////////////////////////////////////
+  nsync_mu_rlock(&g_nowish.lock);
+  p = FormatHttpDateTime(p, &g_nowish.tm);
+  nsync_mu_runlock(&g_nowish.lock);
+  ////////////////////////////////////////
+  return p;
 }
 
-void unlock_mutex(void *arg) {
-  pthread_mutex_t *lock = arg;
-  pthread_mutex_unlock(lock);
-}
-
-bool AddClient(struct Clients *q, const struct Client *v,
-               struct timespec dead) {
+bool AddClient(struct Clients *q, const struct Client *v, nsync_time dead) {
   bool wake = false;
   bool added = false;
-  pthread_mutex_lock(&q->mu);
-  pthread_cleanup_push(unlock_mutex, &q->mu);
-  while (q->count == ARRAYLEN(q->data))
-    if (pthread_cond_timedwait(&q->non_full, &q->mu, &dead))
-      break;  // must be ETIMEDOUT
+  nsync_mu_lock(&q->mu);
+  while (q->count == ARRAYLEN(q->data)) {
+    if (nsync_cv_wait_with_deadline(&q->non_full, &q->mu, dead,
+                                    g_shutdown[0])) {
+      break;  // must be ETIMEDOUT or ECANCELED
+    }
+  }
   if (q->count != ARRAYLEN(q->data)) {
     int i = q->pos + q->count;
     if (ARRAYLEN(q->data) <= i)
@@ -679,44 +556,52 @@ bool AddClient(struct Clients *q, const struct Client *v,
     q->count++;
     added = true;
   }
-  pthread_cleanup_pop(true);
-  if (wake)
-    pthread_cond_broadcast(&q->non_empty);
+  nsync_mu_unlock(&q->mu);
+  if (wake) {
+    nsync_cv_broadcast(&q->non_empty);
+  }
   return added;
 }
 
 int GetClient(struct Clients *q, struct Client *out) {
   int got = 0;
   int len = 1;
-  pthread_mutex_lock(&q->mu);
-  pthread_cleanup_push(unlock_mutex, &q->mu);
-  while (!q->count)
-    pthread_cond_timedwait(&q->non_empty, &q->mu, 0);
+  nsync_mu_lock(&q->mu);
+  while (!q->count) {
+    if (nsync_cv_wait_with_deadline(&q->non_empty, &q->mu,
+                                    nsync_time_no_deadline, g_shutdown[1])) {
+      break;  // must be ECANCELED
+    }
+  }
   while (got < len && q->count) {
     memcpy(out + got, q->data + q->pos, sizeof(*out));
-    if (q->count == ARRAYLEN(q->data))
-      pthread_cond_broadcast(&q->non_full);
+    if (q->count == ARRAYLEN(q->data)) {
+      nsync_cv_broadcast(&q->non_full);
+    }
     ++got;
     q->pos++;
     q->count--;
-    if (q->pos == ARRAYLEN(q->data))
+    if (q->pos == ARRAYLEN(q->data)) {
       q->pos = 0;
+    }
   }
-  pthread_cleanup_pop(true);
+  nsync_mu_unlock(&q->mu);
   return got;
 }
 
 // inserts ip:name claim into blocking message queue
 // may be interrupted by absolute deadline
 // may be cancelled by server shutdown
-bool AddClaim(struct Claims *q, const struct Claim *v, struct timespec dead) {
+bool AddClaim(struct Claims *q, const struct Claim *v, nsync_time dead) {
   bool wake = false;
   bool added = false;
-  pthread_mutex_lock(&q->mu);
-  pthread_cleanup_push(unlock_mutex, &q->mu);
-  while (q->count == ARRAYLEN(q->data))
-    if (pthread_cond_timedwait(&q->non_full, &q->mu, &dead))
+  nsync_mu_lock(&q->mu);
+  while (q->count == ARRAYLEN(q->data)) {
+    if (nsync_cv_wait_with_deadline(&q->non_full, &q->mu, dead,
+                                    g_shutdown[1])) {
       break;  // must be ETIMEDOUT or ECANCELED
+    }
+  }
   if (q->count != ARRAYLEN(q->data)) {
     int i = q->pos + q->count;
     if (ARRAYLEN(q->data) <= i)
@@ -727,9 +612,10 @@ bool AddClaim(struct Claims *q, const struct Claim *v, struct timespec dead) {
     q->count++;
     added = true;
   }
-  pthread_cleanup_pop(true);
-  if (wake)
-    pthread_cond_broadcast(&q->non_empty);
+  nsync_mu_unlock(&q->mu);
+  if (wake) {
+    nsync_cv_broadcast(&q->non_empty);
+  }
   return added;
 }
 
@@ -737,25 +623,26 @@ bool AddClaim(struct Claims *q, const struct Claim *v, struct timespec dead) {
 // has no deadline or cancellation; enqueued must be processed
 int GetClaims(struct Claims *q, struct Claim *out, int len) {
   int got = 0;
-  pthread_mutex_lock(&q->mu);
-  pthread_cleanup_push(unlock_mutex, &q->mu);
-  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
-  while (!q->count)
-    if (pthread_cond_timedwait(&q->non_empty, &q->mu, 0))
+  nsync_mu_lock(&q->mu);
+  while (!q->count) {
+    if (nsync_cv_wait_with_deadline(&q->non_empty, &q->mu,
+                                    nsync_time_no_deadline, g_shutdown[2])) {
       break;  // must be ECANCELED
-  pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
+    }
+  }
   while (got < len && q->count) {
     memcpy(out + got, q->data + q->pos, sizeof(*out));
     if (q->count == ARRAYLEN(q->data)) {
-      pthread_cond_broadcast(&q->non_full);
+      nsync_cv_broadcast(&q->non_full);
     }
     ++got;
     q->pos++;
     q->count--;
-    if (q->pos == ARRAYLEN(q->data))
+    if (q->pos == ARRAYLEN(q->data)) {
       q->pos = 0;
+    }
   }
-  pthread_cleanup_pop(true);
+  nsync_mu_unlock(&q->mu);
   return got;
 }
 
@@ -807,14 +694,14 @@ void FreeSafeBuffer(void *p) {
 void BlockSignals(void) {
   sigset_t mask;
   sigfillset(&mask);
-  sigdelset(&mask, SIGABRT);
-  sigdelset(&mask, SIGTRAP);
-  sigdelset(&mask, SIGFPE);
-  sigdelset(&mask, SIGBUS);
-  sigdelset(&mask, SIGSEGV);
-  sigdelset(&mask, SIGILL);
-  sigdelset(&mask, SIGXCPU);
-  sigdelset(&mask, SIGXFSZ);
+  sigprocmask(SIG_SETMASK, &mask, 0);
+}
+
+// main thread uses sigusr1 to deliver io cancellations
+void AllowSigusr1(void) {
+  sigset_t mask;
+  sigfillset(&mask);
+  sigdelset(&mask, SIGUSR1);
   sigprocmask(SIG_SETMASK, &mask, 0);
 }
 
@@ -844,7 +731,6 @@ void ServeStatusz(int client, char *outbuf) {
   p = Statusz(p, "now", now.tv_sec);
   p = Statusz(p, "messages", g_messages);
   p = Statusz(p, "connections", g_connections);
-  p = Statusz(p, "worker_threads", g_worker_threads);
   p = Statusz(p, "banned", g_banned);
   p = Statusz(p, "workers", g_workers);
   p = Statusz(p, "accepts", g_accepts);
@@ -905,8 +791,9 @@ void *ListenWorker(void *arg) {
   struct Client client;
   struct timeval timeo = {g_keepalive / 1000, g_keepalive % 1000};
   struct sockaddr_in addr = {.sin_family = AF_INET, .sin_port = htons(g_port)};
+  AllowSigusr1();
   pthread_setname_np(pthread_self(), "Listener");
-  npassert((server = socket(AF_INET, SOCK_STREAM, 0)) != -1);
+  CHECK_NE(-1, (server = socket(AF_INET, SOCK_STREAM, 0)));
   setsockopt(server, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
   setsockopt(server, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
   setsockopt(server, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes));
@@ -915,8 +802,8 @@ void *ListenWorker(void *arg) {
   setsockopt(server, SOL_TCP, TCP_CORK, &no, sizeof(no));
   setsockopt(server, SOL_TCP, TCP_NODELAY, &yes, sizeof(yes));
   bind(server, (struct sockaddr *)&addr, sizeof(addr));
-  npassert(!listen(server, 1));
-  for (;;) {
+  CHECK_NE(-1, listen(server, 1));
+  while (!nsync_note_is_notified(g_shutdown[0])) {
     client.size = sizeof(client.addr);
     client.sock = accept(server, (struct sockaddr *)&client.addr, &client.size);
     if (client.sock == -1) {
@@ -928,7 +815,6 @@ void *ListenWorker(void *arg) {
     if (!AddClient(&g_clients, &client, WaitFor(ACCEPT_DEADLINE_MS))) {
       ++g_rejected;
       LOG("503 Accept Queue Full\n");
-      fcntl(client.sock, F_SETFL, fcntl(client.sock, F_GETFL) | O_NONBLOCK);
       Write(client.sock, "HTTP/1.1 503 Accept Queue Full\r\n"
                          "Content-Type: text/plain\r\n"
                          "Connection: close\r\n"
@@ -937,18 +823,9 @@ void *ListenWorker(void *arg) {
       close(client.sock);
     }
   }
-}
-
-void OnHttpWorkerCancel(void *arg) {
-  struct Worker *w = arg;
-  if (w->client->sock != -1)
-    close(w->client->sock);
-  FreeSafeBuffer(w->outbuf);
-  FreeSafeBuffer(w->inbuf);
-  DestroyHttpMessage(w->msg);
-  free(w->msgbuf);
-  --g_worker_threads;
-  w->dead = true;
+  close(server);
+  nsync_note_notify(g_shutdown[1]);
+  return 0;
 }
 
 // make thousands of http client handler threads
@@ -956,26 +833,14 @@ void OnHttpWorkerCancel(void *arg) {
 // hangup on any browser clients that lag for more than a few seconds
 void *HttpWorker(void *arg) {
   struct Client client;
-  client.sock = -1;
   int id = (intptr_t)arg;
-  char *msgbuf = malloc(MSG_BUF);
+  char *msgbuf = gc(xmalloc(MSG_BUF));
   char *inbuf = NewSafeBuffer(INBUF_SIZE);
   char *outbuf = NewSafeBuffer(OUTBUF_SIZE);
-  struct HttpMessage msg[1];
-  InitHttpMessage(msg, kHttpRequest);
+  struct HttpMessage *msg = gc(xcalloc(1, sizeof(struct HttpMessage)));
 
-  g_worker[id].msgbuf = msgbuf;
-  g_worker[id].inbuf = inbuf;
-  g_worker[id].outbuf = outbuf;
-  g_worker[id].msg = msg;
-  g_worker[id].client = &client;
-  pthread_cleanup_push(OnHttpWorkerCancel, g_worker + id);
-
-  char name[32];
-  sprintf(name, "HTTP%d", id);
-  pthread_setname_np(pthread_self(), name);
-  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
-  ++g_worker_threads;
+  BlockSignals();
+  pthread_setname_np(pthread_self(), gc(xasprintf("HTTP%d", id)));
 
   // connection loop
   while (GetClient(&g_clients, &client)) {
@@ -1002,14 +867,12 @@ void *HttpWorker(void *arg) {
       bool comp, ipv6;
 
       // wait for http message
-      ResetHttpMessage(msg, kHttpRequest);
+      // this may be cancelled by sigusr1
+      AllowSigusr1();
+      DestroyHttpMessage(msg);
+      InitHttpMessage(msg, kHttpRequest);
       g_worker[id].startread = timespec_real();
-      got = read(client.sock, inbuf, INBUF_SIZE - 1);
-      if (got >= 0) {
-        memcpy(last_message, inbuf, got);
-        last_message[got] = 0;
-      }
-      if (got <= 0) {
+      if ((got = read(client.sock, inbuf, INBUF_SIZE)) <= 0) {
         ++g_readfails;
         break;
       }
@@ -1029,7 +892,9 @@ void *HttpWorker(void *arg) {
 
       // get client address from frontend
       if (HasHeader(kHttpXForwardedFor)) {
-        if (!IsLoopbackIp(clientip) && !IsPrivateIp(clientip)) {
+        if (!IsLoopbackIp(clientip) &&  //
+            !IsPrivateIp(clientip) &&   //
+            !IsCloudflareIp(clientip)) {
           LOG("Got X-Forwarded-For from untrusted IPv4 client address "
               "%hhu.%hhu.%hhu.%hhu\n",
               clientip >> 24, clientip >> 16, clientip >> 8, clientip);
@@ -1056,6 +921,9 @@ void *HttpWorker(void *arg) {
       ksnprintf(ipbuf, sizeof(ipbuf), "%hhu.%hhu.%hhu.%hhu", ip >> 24, ip >> 16,
                 ip >> 8, ip);
 
+      if (UrlStartsWith("/plot/") && (_rand64() % 256)) {
+        goto SkipSecurity;
+      }
       if (!ipv6 && !ContainsInt(&g_whitelisted, ip) &&
           (tok = AcquireToken(g_tok.b, ip, TB_CIDR)) < 32) {
         if (tok > 4) {
@@ -1072,6 +940,7 @@ void *HttpWorker(void *arg) {
         ++g_ratelimits;
         break;
       }
+    SkipSecurity:
 
       // we don't support http/1.0 and http/0.9 right now
       if (msg->version != 11) {
@@ -1122,15 +991,18 @@ void *HttpWorker(void *arg) {
         a = &g_asset.score;
       } else if (UrlStartsWith("/recent")) {
         a = &g_asset.recent;
+      } else if (UrlStartsWith("/plot/")) {
+        int i, block = 0;
+        for (i = msg->uri.a + 6; i < msg->uri.b && isdigit(inbuf[i]); ++i) {
+          block *= 10;
+          block += inbuf[i] - '0';
+          block &= 255;
+        }
+        a = g_asset.plot + block;
       } else {
         a = 0;
       }
 
-      // wait for server initialization
-      while (a)
-        if (a->ready)
-          break;
-
       // assert serving
       if (a) {
         struct iovec iov[2];
@@ -1138,7 +1010,7 @@ void *HttpWorker(void *arg) {
         comp = a->gzip.n < a->data.n &&
                HeaderHas(msg, inbuf, kHttpAcceptEncoding, "gzip", 4);
         ////////////////////////////////////////
-        pthread_rwlock_rdlock(&a->lock);
+        nsync_mu_rlock(&a->lock);
         if (HasHeader(kHttpIfModifiedSince) &&
             a->mtim.tv_sec <=
                 ParseHttpDateTime(HeaderData(kHttpIfModifiedSince),
@@ -1185,7 +1057,7 @@ void *HttpWorker(void *arg) {
           outmsglen = iov[0].iov_len + iov[1].iov_len;
           sent = writev(client.sock, iov, 2);
         }
-        pthread_rwlock_unlock(&a->lock);
+        nsync_mu_runlock(&a->lock);
         ////////////////////////////////////////
 
       } else if (UrlStartsWith("/ip")) {
@@ -1232,7 +1104,7 @@ void *HttpWorker(void *arg) {
         ++g_claimrequests;
         if (ipv6)
           goto Ipv6Warning;
-        struct Claim v = {.ip = ip, .created = timespec_real().tv_sec};
+        struct Claim v = {.ip = ip, .created = g_nowish.ts.tv_sec};
         if (GetNick(inbuf, msg, &v)) {
           if (AddClaim(&g_claims, &v,
                        timespec_add(timespec_real(),
@@ -1368,22 +1240,25 @@ void *HttpWorker(void *arg) {
       // amount, then since we sent the content length and checked
       // that the client didn't attach a payload, we are so synced
       // thus we can safely process more messages
-    } while (got == inmsglen &&                             //
-             sent == outmsglen &&                           //
-             !HasHeader(kHttpContentLength) &&              //
-             !HasHeader(kHttpTransferEncoding) &&           //
-             !HeaderEqualCase(kHttpConnection, "close") &&  //
-             (msg->method == kHttpGet ||                    //
-              msg->method == kHttpHead) &&                  //
-             1. / g_workers * g_connections < CONCERN_LOAD);
+    } while (got == inmsglen &&                                //
+             sent == outmsglen &&                              //
+             !HasHeader(kHttpContentLength) &&                 //
+             !HasHeader(kHttpTransferEncoding) &&              //
+             !HeaderEqualCase(kHttpConnection, "close") &&     //
+             (msg->method == kHttpGet ||                       //
+              msg->method == kHttpHead) &&                     //
+             1. / g_workers * g_connections < CONCERN_LOAD &&  //
+             !nsync_note_is_notified(g_shutdown[1]));
     DestroyHttpMessage(msg);
     close(client.sock);
-    client.sock = -1;
     g_worker[id].connected = false;
     --g_connections;
   }
 
-  pthread_cleanup_pop(true);
+  LOG("HttpWorker #%d exiting", id);
+  g_worker[id].shutdown = true;
+  FreeSafeBuffer(outbuf);
+  FreeSafeBuffer(inbuf);
   return 0;
 }
 
@@ -1409,8 +1284,8 @@ struct Data Gzip(struct Data data) {
     deflateEnd(&zs);
     return (struct Data){0};
   }
-  npassert(Z_STREAM_END == deflate(&zs, Z_FINISH));
-  npassert(Z_OK == deflateEnd(&zs));
+  CHECK_EQ(Z_STREAM_END, deflate(&zs, Z_FINISH));
+  CHECK_EQ(Z_OK, deflateEnd(&zs));
   res.n = sizeof(kGzipHeader) + zs.total_out + sizeof(footer);
   if (!(p = res.p = malloc(res.n))) {
     free(tmp);
@@ -1427,16 +1302,14 @@ struct Data Gzip(struct Data data) {
 struct Asset LoadAsset(const char *path, const char *type, int cash) {
   struct stat st;
   struct Asset a = {0};
-  pthread_rwlock_init(&a.lock, 0);
-  npassert(!stat(path, &st));
-  npassert((a.data.p = xslurp(path, &a.data.n)));
+  CHECK_EQ(0, stat(path, &st));
+  CHECK_NOTNULL((a.data.p = xslurp(path, &a.data.n)));
   a.type = type;
   a.cash = cash;
-  unassert((a.path = strdup(path)));
+  CHECK_NOTNULL((a.path = strdup(path)));
   a.mtim = st.st_mtim;
-  unassert((a.gzip = Gzip(a.data)).p);
+  CHECK_NOTNULL((a.gzip = Gzip(a.data)).p);
   FormatUnixHttpDateTime(a.lastmodified, a.mtim.tv_sec);
-  a.ready = true;
   return a;
 }
 
@@ -1460,16 +1333,15 @@ bool ReloadAsset(struct Asset *a) {
       goto OnError;
     CHECK_MEM((gzip = Gzip(data)).p);
     //!//!//!//!//!//!//!//!//!//!//!//!//!/
-    pthread_rwlock_wrlock(&a->lock);
+    nsync_mu_lock(&a->lock);
     f[0] = a->data.p;
     f[1] = a->gzip.p;
     a->data = data;
     a->gzip = gzip;
     a->mtim = st.st_mtim;
     memcpy(a->lastmodified, lastmodified, 32);
-    pthread_rwlock_unlock(&a->lock);
+    nsync_mu_unlock(&a->lock);
     //!//!//!//!//!//!//!//!//!//!//!//!//!/
-    a->ready = true;
     free(f[0]);
     free(f[1]);
   }
@@ -1483,14 +1355,34 @@ OnError:
 }
 
 void FreeAsset(struct Asset *a) {
-  pthread_rwlock_destroy(&a->lock);
   free(a->path);
   free(a->data.p);
   free(a->gzip.p);
 }
 
+void IgnoreSignal(int sig) {
+  // so worker i/o routines may eintr safely
+}
+
+// asynchronous handler of sigint, sigterm, and sighup signals
+// this handler is always invoked from within the main thread,
+// because our helper and worker threads always block signals.
 void OnCtrlC(int sig) {
-  is_shutting_down = 1;
+  if (!nsync_note_is_notified(g_shutdown[0])) {
+    LOG("Received %s shutting down...\n", strsignal(sig));
+    nsync_note_notify(g_shutdown[0]);
+  } else {
+    // there's no way to deliver signals to workers atomically, unless
+    // we pay the cost of ppoll() which isn't necessary in this design
+    // so if a user smashes that ctrl-c then we tkill the workers more
+    LOG("Received %s again so sending another volley...\n", strsignal(sig));
+    for (int i = 0; i < g_workers; ++i) {
+      pthread_kill(g_listener, SIGUSR1);
+      if (!g_worker[i].shutdown) {
+        pthread_kill(g_worker[i].th, SIGUSR1);
+      }
+    }
+  }
 }
 
 // parses cli arguments
@@ -1542,10 +1434,9 @@ void Update(struct Asset *a, bool gen(struct Asset *, long, long), long x,
             long y) {
   void *f[2];
   struct Asset t;
-  pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
   if (gen(&t, x, y)) {
     //!//!//!//!//!//!//!//!//!//!//!//!//!/
-    pthread_rwlock_wrlock(&a->lock);
+    nsync_mu_lock(&a->lock);
     f[0] = a->data.p;
     f[1] = a->gzip.p;
     a->data = t.data;
@@ -1554,13 +1445,11 @@ void Update(struct Asset *a, bool gen(struct Asset *, long, long), long x,
     a->type = t.type;
     a->cash = t.cash;
     memcpy(a->lastmodified, t.lastmodified, 32);
-    pthread_rwlock_unlock(&a->lock);
+    nsync_mu_unlock(&a->lock);
     //!//!//!//!//!//!//!//!//!//!//!//!//!/
-    a->ready = true;
     free(f[0]);
     free(f[1]);
   }
-  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
 }
 
 // generator function for the big board
@@ -1641,74 +1530,182 @@ OnError:
   return false;
 }
 
+// generator function for the big board
+bool GeneratePlot(struct Asset *out, long block, long cash) {
+  _Static_assert(IS2POW(XN * YN), "area must be 2-power");
+  _Static_assert(XN == YN, "hilbert algorithm needs square");
+  int rc, out_len;
+  sqlite3 *db = 0;
+  struct Asset a = {0};
+  unsigned char *rgba;
+  sqlite3_stmt *stmt = 0;
+  unsigned x, y, i, ip, area, mask, clump;
+  DEBUG("GeneratePlot %ld\n", block);
+  a.type = "image/png";
+  a.cash = cash;
+  a.mtim = timespec_real();
+  FormatUnixHttpDateTime(a.lastmodified, a.mtim.tv_sec);
+  CHECK_MEM((rgba = calloc(4, YN * XN)));
+  for (y = 0; y < YN; ++y) {
+    for (x = 0; x < XN; ++x) {
+      rgba[y * XN * 4 + x * 4 + 0] = 255;
+      rgba[y * XN * 4 + x * 4 + 1] = 255;
+      rgba[y * XN * 4 + x * 4 + 2] = 255;
+    }
+  }
+  CHECK_SQL(DbOpen("db.sqlite3", &db));
+  CHECK_DB(DbPrepare(db, &stmt,
+                     "SELECT ip\n"
+                     " FROM land\n"
+                     "WHERE ip >= ?1\n"
+                     "  AND ip <= ?2"));
+  CHECK_DB(sqlite3_bind_int64(stmt, 1, block << 24 | 0x000000));
+  CHECK_DB(sqlite3_bind_int64(stmt, 2, block << 24 | 0xffffff));
+  CHECK_SQL(sqlite3_exec(db, "BEGIN TRANSACTION", 0, 0, 0));
+  area = XN * YN;
+  mask = area - 1;
+  clump = 32 - bsr(area) - 8;
+  while ((rc = DbStep(stmt)) != SQLITE_DONE) {
+    if (rc != SQLITE_ROW)
+      CHECK_DB(rc);
+    ip = sqlite3_column_int64(stmt, 0);
+    i = (ip >> clump) & mask;
+    y = g_hilbert[i][0];
+    x = g_hilbert[i][1];
+    if (rgba[y * XN * 4 + x * 4 + 3] < 255) {
+      ++rgba[y * XN * 4 + x * 4 + 3];
+    }
+  }
+  CHECK_SQL(sqlite3_exec(db, "END TRANSACTION", 0, 0, 0));
+  CHECK_DB(sqlite3_finalize(stmt));
+  CHECK_SQL(sqlite3_close(db));
+  a.data.p = (char *)stbi_write_png_to_mem(rgba, XN * 4, XN, YN, 4, &out_len);
+  a.data.n = out_len;
+  a.gzip = Gzip(a.data);
+  free(rgba);
+  *out = a;
+  return true;
+OnError:
+  sqlite3_finalize(stmt);
+  sqlite3_close(db);
+  free(a.data.p);
+  free(rgba);
+  return false;
+}
+
 // single thread for regenerating the user scores json
 void *ScoreWorker(void *arg) {
+  BlockSignals();
   pthread_setname_np(pthread_self(), "ScoreAll");
-  for (;;) {
-    LOG("%H regenerating score...\n");
-    Update(&g_asset.score, GenerateScore, -1, MS2CASH(SCORE_UPDATE_MS));
-    usleep(SCORE_UPDATE_MS * 1000);
-  }
+  LOG("%P Score started\n");
+  long wait = SCORE_UPDATE_MS;
+  Update(&g_asset.score, GenerateScore, -1, MS2CASH(wait));
+  nsync_counter_add(g_ready, -1);  // #1
+  do {
+    Update(&g_asset.score, GenerateScore, -1, MS2CASH(wait));
+  } while (!nsync_note_wait(g_shutdown[1], WaitFor(wait)));
+  LOG("Score exiting\n");
+  return 0;
 }
 
 // single thread for regenerating the user scores json
 void *ScoreHourWorker(void *arg) {
+  BlockSignals();
   pthread_setname_np(pthread_self(), "ScoreHour");
-  for (;;) {
-    LOG("%H regenerating hour score...\n");
-    Update(&g_asset.score_hour, GenerateScore, 60L * 60,
-           MS2CASH(SCORE_H_UPDATE_MS));
-    usleep(SCORE_H_UPDATE_MS * 1000);
-  }
+  LOG("%P ScoreHour started\n");
+  long secs = 60L * 60;
+  long wait = SCORE_H_UPDATE_MS;
+  Update(&g_asset.score_hour, GenerateScore, secs, MS2CASH(wait));
+  nsync_counter_add(g_ready, -1);  // #2
+  do {
+    Update(&g_asset.score_hour, GenerateScore, secs, MS2CASH(wait));
+  } while (!nsync_note_wait(g_shutdown[1], WaitFor(wait)));
+  LOG("ScoreHour exiting\n");
+  return 0;
 }
 
 // single thread for regenerating the user scores json
 void *ScoreDayWorker(void *arg) {
+  BlockSignals();
   pthread_setname_np(pthread_self(), "ScoreDay");
-  for (;;) {
-    LOG("%H regenerating day score...\n");
-    Update(&g_asset.score_day, GenerateScore, 60L * 60 * 24,
-           MS2CASH(SCORE_D_UPDATE_MS));
-    usleep(SCORE_D_UPDATE_MS * 1000);
-  }
+  LOG("%P ScoreDay started\n");
+  long secs = 60L * 60 * 24;
+  long wait = SCORE_D_UPDATE_MS;
+  Update(&g_asset.score_day, GenerateScore, secs, MS2CASH(wait));
+  nsync_counter_add(g_ready, -1);  // #3
+  do {
+    Update(&g_asset.score_day, GenerateScore, secs, MS2CASH(wait));
+  } while (!nsync_note_wait(g_shutdown[1], WaitFor(wait)));
+  LOG("ScoreDay exiting\n");
+  return 0;
 }
 
 // single thread for regenerating the user scores json
 void *ScoreWeekWorker(void *arg) {
+  BlockSignals();
   pthread_setname_np(pthread_self(), "ScoreWeek");
-  for (;;) {
-    LOG("%H regenerating week score...\n");
-    Update(&g_asset.score_week, GenerateScore, 60L * 60 * 24 * 7,
-           MS2CASH(SCORE_W_UPDATE_MS));
-    usleep(SCORE_W_UPDATE_MS * 1000);
-  }
+  LOG("%P ScoreWeek started\n");
+  long secs = 60L * 60 * 24 * 7;
+  long wait = SCORE_W_UPDATE_MS;
+  Update(&g_asset.score_week, GenerateScore, secs, MS2CASH(wait));
+  nsync_counter_add(g_ready, -1);  // #4
+  do {
+    Update(&g_asset.score_week, GenerateScore, secs, MS2CASH(wait));
+  } while (!nsync_note_wait(g_shutdown[1], WaitFor(wait)));
+  LOG("ScoreWeek exiting\n");
+  return 0;
 }
 
 // single thread for regenerating the user scores json
 void *ScoreMonthWorker(void *arg) {
+  BlockSignals();
   pthread_setname_np(pthread_self(), "ScoreMonth");
-  for (;;) {
-    LOG("%H regenerating month score...\n");
-    Update(&g_asset.score_month, GenerateScore, 60L * 60 * 24 * 30,
-           MS2CASH(SCORE_M_UPDATE_MS));
-    usleep(SCORE_M_UPDATE_MS * 1000);
+  LOG("%P ScoreMonth started\n");
+  long secs = 60L * 60 * 24 * 30;
+  long wait = SCORE_M_UPDATE_MS;
+  Update(&g_asset.score_month, GenerateScore, secs, MS2CASH(wait));
+  nsync_counter_add(g_ready, -1);  // #5
+  do {
+    Update(&g_asset.score_month, GenerateScore, secs, MS2CASH(wait));
+  } while (!nsync_note_wait(g_shutdown[1], WaitFor(wait)));
+  LOG("ScoreMonth exiting\n");
+  return 0;
+}
+
+// single thread for regenerating /8 cell background image charts
+void *PlotWorker(void *arg) {
+  long i, wait;
+  BlockSignals();
+  pthread_setname_np(pthread_self(), "Plotter");
+  LOG("%P Plotter started\n");
+  wait = PLOTS_UPDATE_MS;
+  for (i = 0; i < 256; ++i) {
+    Update(g_asset.plot + i, GeneratePlot, i, MS2CASH(wait));
   }
+  nsync_counter_add(g_ready, -1);  // #6
+  do {
+    for (i = 0; i < 256; ++i) {
+      Update(g_asset.plot + i, GeneratePlot, i, MS2CASH(wait));
+    }
+  } while (!nsync_note_wait(g_shutdown[1], WaitFor(wait)));
+  LOG("Plotter exiting\n");
+  return 0;
 }
 
 // thread for realtime json generation of recent successful claims
 void *RecentWorker(void *arg) {
-  int rc;
   bool once;
   void *f[2];
+  int rc, err;
   sqlite3 *db;
   char *sb = 0;
   size_t sblen = 0;
-  const char *text;
   sqlite3_stmt *stmt;
   struct Asset *a, t;
-  sleep(2);
+  bool warmedup = false;
+  BlockSignals();
   pthread_setname_np(pthread_self(), "RecentWorker");
-  pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
+  LOG("%P RecentWorker started\n");
 StartOver:
   db = 0;
   stmt = 0;
@@ -1720,7 +1717,7 @@ StartOver:
                      "WHERE created NOT NULL\n"
                      "ORDER BY created DESC\n"
                      "LIMIT 50"));
-  for (;;) {
+  do {
     // regenerate json
     t.mtim = timespec_real();
     FormatUnixHttpDateTime(t.lastmodified, t.mtim.tv_sec);
@@ -1732,14 +1729,13 @@ StartOver:
     for (once = false; (rc = DbStep(stmt)) != SQLITE_DONE; once = true) {
       if (rc != SQLITE_ROW)
         CHECK_SQL(rc);
-      if ((text = (const char *)sqlite3_column_text(stmt, 1))) {
-        if (once)
-          CHECK_SYS(appends(&t.data.p, ",\n"));
-        CHECK_SYS(appendf(&t.data.p, "[%ld,\"%s\",%ld]",
-                          sqlite3_column_int64(stmt, 0),
-                          EscapeJsStringLiteral(&sb, &sblen, text, -1, 0),
-                          sqlite3_column_int64(stmt, 2)));
-      }
+      if (once)
+        CHECK_SYS(appends(&t.data.p, ",\n"));
+      CHECK_SYS(
+          appendf(&t.data.p, "[%ld,\"%s\",%ld]", sqlite3_column_int64(stmt, 0),
+                  EscapeJsStringLiteral(
+                      &sb, &sblen, (void *)sqlite3_column_text(stmt, 1), -1, 0),
+                  sqlite3_column_int64(stmt, 2)));
     }
     CHECK_SQL(sqlite3_reset(stmt));
     CHECK_SQL(sqlite3_exec(db, "END TRANSACTION", 0, 0, 0));
@@ -1749,7 +1745,7 @@ StartOver:
     // deploy json
     a = &g_asset.recent;
     //!//!//!//!//!//!//!//!//!//!//!//!//!/
-    pthread_rwlock_wrlock(&a->lock);
+    nsync_mu_lock(&a->lock);
     f[0] = a->data.p;
     f[1] = a->gzip.p;
     a->data = t.data;
@@ -1758,22 +1754,25 @@ StartOver:
     a->type = "application/json";
     a->cash = 0;
     memcpy(a->lastmodified, t.lastmodified, 32);
-    pthread_rwlock_unlock(&a->lock);
+    nsync_mu_unlock(&a->lock);
     //!//!//!//!//!//!//!//!//!//!//!//!//!/
-    a->ready = true;
     bzero(&t, sizeof(t));
     free(f[0]);
     free(f[1]);
+    // handle startup condition
+    if (!warmedup) {
+      nsync_counter_add(g_ready, -1);  // #7
+      warmedup = true;
+    }
     // wait for wakeup or cancel
-    pthread_mutex_lock(&g_recent.mu);
-    pthread_cleanup_push(unlock_mutex, &g_recent.mu);
-    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
-    pthread_cond_timedwait(&g_recent.cv, &g_recent.mu, 0);
-    pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
-    pthread_cleanup_pop(true);
-  }
+    nsync_mu_lock(&g_recent.mu);
+    err = nsync_cv_wait_with_deadline(&g_recent.cv, &g_recent.mu,
+                                      nsync_time_no_deadline, g_shutdown[1]);
+    nsync_mu_unlock(&g_recent.mu);
+  } while (err != ECANCELED);
   CHECK_DB(sqlite3_finalize(stmt));
   CHECK_SQL(sqlite3_close(db));
+  LOG("RecentWorker exiting\n");
   free(sb);
   return 0;
 OnError:
@@ -1791,9 +1790,11 @@ void *ClaimWorker(void *arg) {
   int i, n, rc;
   long processed;
   sqlite3_stmt *stmt;
-  struct Claim *v = gc(calloc(BATCH_MAX, sizeof(struct Claim)));
-  pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
+  bool warmedup = false;
+  struct Claim *v = gc(xcalloc(BATCH_MAX, sizeof(struct Claim)));
+  BlockSignals();
   pthread_setname_np(pthread_self(), "ClaimWorker");
+  LOG("%P ClaimWorker started\n");
 StartOver:
   db = 0;
   stmt = 0;
@@ -1806,6 +1807,10 @@ StartOver:
                      " WHERE nick != ?2\n"
                      "    OR created IS NULL\n"
                      "    OR ?3 - created > 3600"));
+  if (!warmedup) {
+    nsync_counter_add(g_ready, -1);  // #8
+    warmedup = true;
+  }
   while ((n = GetClaims(&g_claims, v, BATCH_MAX))) {
     processed = 0;
     CHECK_SQL(sqlite3_exec(db, "BEGIN TRANSACTION", 0, 0, 0));
@@ -1821,12 +1826,13 @@ StartOver:
     atomic_fetch_add(&g_claimsprocessed, processed);
     DEBUG("Committed %d claims\n", n);
     // wake up RecentWorker()
-    pthread_mutex_lock(&g_recent.mu);
-    pthread_cond_signal(&g_recent.cv);
-    pthread_mutex_unlock(&g_recent.mu);
+    nsync_mu_lock(&g_recent.mu);
+    nsync_cv_signal(&g_recent.cv);
+    nsync_mu_unlock(&g_recent.mu);
   }
   CHECK_DB(sqlite3_finalize(stmt));
   CHECK_SQL(sqlite3_close(db));
+  LOG("ClaimWorker exiting\n");
   return 0;
 OnError:
   sqlite3_finalize(stmt);
@@ -1834,36 +1840,40 @@ OnError:
   goto StartOver;
 }
 
-// worker for refilling token buckets
-void *ReplenishWorker(void *arg) {
-  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
-  pthread_setname_np(pthread_self(), "Replenisher");
-  for (struct timespec ts = timespec_real();;
-       ts = timespec_add(ts, timespec_frommillis(TB_INTERVAL))) {
-    clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &ts, 0);
-    ReplenishTokens(g_tok.w, TB_WORDS);
+// single thread for computing HTTP Date header
+void *NowWorker(void *arg) {
+  BlockSignals();
+  pthread_setname_np(pthread_self(), "NowWorker");
+  LOG("%P NowWorker started\n");
+  UpdateNow();
+  nsync_counter_add(g_ready, -1);  // #9
+  for (struct timespec ts = {timespec_real().tv_sec};; ++ts.tv_sec) {
+    if (!nsync_note_wait(g_shutdown[1], ts)) {
+      UpdateNow();
+    } else {
+      break;
+    }
   }
+  LOG("NowWorker exiting\n");
+  return 0;
 }
 
-void SpawnWorker(intptr_t i) {
-  sigset_t thmask;
-  pthread_attr_t attr;
-  sigfillset(&thmask);
-  sigdelset(&thmask, SIGABRT);
-  sigdelset(&thmask, SIGTRAP);
-  sigdelset(&thmask, SIGFPE);
-  sigdelset(&thmask, SIGBUS);
-  sigdelset(&thmask, SIGSEGV);
-  sigdelset(&thmask, SIGILL);
-  sigdelset(&thmask, SIGXCPU);
-  sigdelset(&thmask, SIGXFSZ);
-  pthread_attr_init(&attr);
-  pthread_attr_setsigmask_np(&attr, &thmask);
-  pthread_attr_setstacksize(&attr, 128 * 1024);
-  pthread_attr_setguardsize(&attr, sysconf(_SC_PAGESIZE));
-  pthread_attr_setsigaltstacksize_np(&attr, sysconf(_SC_MINSIGSTKSZ) + 32768);
-  pthread_create(&g_worker[i].th, &attr, HttpWorker, (void *)i);
-  pthread_attr_destroy(&attr);
+// worker for refilling token buckets
+void *ReplenishWorker(void *arg) {
+  BlockSignals();
+  pthread_setname_np(pthread_self(), "Replenisher");
+  LOG("%P Replenisher started\n");
+  UpdateNow();
+  for (struct timespec ts = timespec_real();;
+       ts = timespec_add(ts, timespec_frommillis(TB_INTERVAL))) {
+    if (!nsync_note_wait(g_shutdown[1], ts)) {
+      ReplenishTokens(g_tok.w, TB_WORDS);
+    } else {
+      break;
+    }
+  }
+  LOG("Replenisher exiting\n");
+  return 0;
 }
 
 // we're permissive in allowing http connection keepalive until the
@@ -1874,7 +1884,7 @@ void Meltdown(void) {
   int i, marks;
   struct timespec now;
   ++g_meltdowns;
-  LOG("%H panicking because %d out of %d workers is connected\n", g_connections,
+  LOG("Panicking because %d out of %d workers is connected\n", g_connections,
       g_workers);
   now = timespec_real();
   for (marks = i = 0; i < g_workers; ++i) {
@@ -1882,9 +1892,7 @@ void Meltdown(void) {
         (g_worker[i].msgcount > PANIC_MSGS ||
          timespec_cmp(timespec_sub(now, g_worker[i].startread),
                       timespec_frommillis(MELTALIVE_MS)) >= 0)) {
-      pthread_cancel(g_worker[i].th);
-      pthread_join(g_worker[i].th, 0);
-      SpawnWorker(i);
+      pthread_kill(g_worker[i].th, SIGUSR1);
       ++marks;
     }
   }
@@ -1893,29 +1901,18 @@ void Meltdown(void) {
 
 // main thread worker
 void *Supervisor(void *arg) {
-  while (!is_shutting_down) {
-
-    // check for updates to web assets on disk
-    ReloadAsset(&g_asset.index);
-    ReloadAsset(&g_asset.about);
-    ReloadAsset(&g_asset.user);
-    ReloadAsset(&g_asset.favicon);
-
-    // check if server is about to explode
-    if (g_workers > 1 && 1. / g_workers * g_connections > PANIC_LOAD)
-      Meltdown();
-
-    // spawn replacements for crashed workers
-    for (int i = 0; i < g_workers; ++i) {
-      if (g_worker[i].dead) {
-        pthread_join(g_worker[i].th, 0);
-        SpawnWorker(i);
+  for (;;) {
+    if (!nsync_note_wait(g_shutdown[0], WaitFor(SUPERVISE_MS))) {
+      if (g_workers > 1 && 1. / g_workers * g_connections > PANIC_LOAD) {
+        Meltdown();
       }
+      ReloadAsset(&g_asset.index);
+      ReloadAsset(&g_asset.about);
+      ReloadAsset(&g_asset.user);
+      ReloadAsset(&g_asset.favicon);
+    } else {
+      break;
     }
-
-    // wait a little bit
-    if (!is_shutting_down)
-      usleep(SUPERVISE_MS * 1000);
   }
   return 0;
 }
@@ -1924,9 +1921,9 @@ void CheckDatabase(void) {
   sqlite3 *db;
   if (g_integrity) {
     CHECK_SQL(DbOpen("db.sqlite3", &db));
-    LOG("%H Checking database integrity...\n");
+    LOG("Checking database integrity...\n");
     CHECK_SQL(sqlite3_exec(db, "PRAGMA integrity_check", 0, 0, 0));
-    LOG("%H Vacuuming database...\n");
+    LOG("Vacuuming database...\n");
     CHECK_SQL(sqlite3_exec(db, "VACUUM", 0, 0, 0));
     CHECK_SQL(sqlite3_close(db));
   }
@@ -1935,159 +1932,8 @@ OnError:
   exit(1);
 }
 
-char *hexcpy(char *p, unsigned long x) {
-  int k = x ? (__builtin_clzl(x) ^ 63) + 1 : 1;
-  k = (k + 3) & -4;
-  while (k > 0)
-    *p++ = "0123456789abcdef"[(x >> (k -= 4)) & 15];
-  *p = '\0';
-  return p;
-}
-
-char *describe_backtrace(char *p, size_t len, const struct StackFrame *sf) {
-  char *pe = p + len;
-  bool gotsome = false;
-
-  // show address of each function
-  while (sf) {
-    if (kisdangerous(sf)) {
-      if (p + 1 + 9 + 1 < pe) {
-        if (gotsome)
-          *p++ = ' ';
-        p = stpcpy(p, "DANGEROUS");
-        if (p + 16 + 1 < pe) {
-          *p++ = ' ';
-          p = hexcpy(p, (long)sf);
-        }
-      }
-      break;
-    }
-    if (p + 16 + 1 < pe) {
-      unsigned char *ip = (unsigned char *)sf->addr;
-#ifdef __x86_64__
-      // x86 advances the progrem counter before an instruction
-      // begins executing. return addresses in backtraces shall
-      // point to code after the call, which means addr2line is
-      // going to print unrelated code unless we fixup the addr
-      if (!kisdangerous(ip))
-        ip -= __is_call(ip);
-#endif
-      if (gotsome)
-        *p++ = ' ';
-      else
-        gotsome = true;
-      p = hexcpy(p, (long)ip);
-    } else {
-      break;
-    }
-    sf = sf->next;
-  }
-
-  // terminate string
-  if (p < pe)
-    *p = '\0';
-  return p;
-}
-
-//                         abashed the devil stood
-//                      and felt how awful goodness is
-char *describe_crash(char *buf, size_t len, int sig, siginfo_t *si, void *arg) {
-  char *p = buf;
-
-  // check minimum length
-  if (len < 64)
-    return p;
-
-  // describe crash
-  char signame[21];
-  p = stpcpy(p, strsignal_r(sig, signame));
-  if (si &&               //
-      (sig == SIGFPE ||   //
-       sig == SIGILL ||   //
-       sig == SIGBUS ||   //
-       sig == SIGSEGV ||  //
-       sig == SIGTRAP)) {
-    p = stpcpy(p, " at ");
-    p = hexcpy(p, (long)si->si_addr);
-  }
-
-  // get stack frame daisy chain
-  struct StackFrame pc;
-  struct StackFrame *sf;
-  ucontext_t *ctx;
-  if ((ctx = (ucontext_t *)arg)) {
-    pc.addr = ctx->uc_mcontext.PC;
-    pc.next = (struct StackFrame *)ctx->uc_mcontext.BP;
-    sf = &pc;
-  } else {
-    sf = (struct StackFrame *)__builtin_frame_address(0);
-  }
-
-  // describe backtrace
-  p = stpcpy(p, " bt ");
-  p = describe_backtrace(p, len - (p - buf), sf);
-
-  return p;
-}
-
-void on_crash_signal(int sig, siginfo_t *si, void *arg) {
-  char *p;
-  char message[512];
-  write(2, "crash!\n", 7);
-  p = describe_crash(message, sizeof(message), sig, si, arg);
-  write(g_crash_fd, "crash: ", 7);
-  write(g_crash_fd, message, p - message);
-  write(g_crash_fd, "\n", 1);
-  write(g_crash_fd, last_message, strlen(last_message));
-  write(g_crash_fd, "\n", 1);
-  pthread_exit(PTHREAD_CANCELED);
-}
-
-void make_server_crash_resistant(void) {
-  const char *path = "crash.log";
-  if ((g_crash_fd = open(path, O_CREAT | O_WRONLY | O_APPEND, 0644)) == -1) {
-    fprintf(stderr, "%s: %s\n", path, strerror(errno));
-    exit(1);
-  }
-
-  struct sigaction sa;
-  sa.sa_flags = SA_SIGINFO;
-  sigemptyset(&sa.sa_mask);
-
-  sa.sa_sigaction = on_crash_signal;
-  sigaddset(&sa.sa_mask, SIGABRT);
-  sigaddset(&sa.sa_mask, SIGTRAP);
-  sigaddset(&sa.sa_mask, SIGFPE);
-  sigaddset(&sa.sa_mask, SIGBUS);
-  sigaddset(&sa.sa_mask, SIGSEGV);
-  sigaddset(&sa.sa_mask, SIGILL);
-  sigaddset(&sa.sa_mask, SIGXCPU);
-  sigaddset(&sa.sa_mask, SIGXFSZ);
-
-  sigaction(SIGABRT, &sa, 0);
-  sigaction(SIGTRAP, &sa, 0);
-  sigaction(SIGFPE, &sa, 0);
-  sigaction(SIGILL, &sa, 0);
-  sigaction(SIGXCPU, &sa, 0);
-  sigaction(SIGXFSZ, &sa, 0);
-
-  sa.sa_flags |= SA_ONSTACK;
-  sigaction(SIGBUS, &sa, 0);
-  sigaction(SIGSEGV, &sa, 0);
-}
-
 int main(int argc, char *argv[]) {
-  make_server_crash_resistant();
-
-  if (pledge(0, 0)) {
-    fprintf(stderr, "%s: this OS doesn't support pledge() security\n", argv[0]);
-    exit(1);
-  }
-
-  if (unveil("", 0) < 2) {
-    fprintf(stderr, "%s: need OpenBSD or Landlock LSM v3+\n", argv[0]);
-    exit(1);
-  }
+  // ShowCrashReports();
 
   if (IsLinux()) {
     Write(2, "Enabling TCP_FASTOPEN for server sockets...\n");
@@ -2107,7 +1953,7 @@ int main(int argc, char *argv[]) {
  __| |   |  __| | \\ \\  \\   / _` |  __|\n\
  |   |   | |    __|\\ \\  \\ / (   | |\n\
 \\__|\\__,_|_|   _|   \\_/\\_/ \\__,_|_|\n");
-  npassert(!chdir("/opt/turfwar"));
+  CHECK_EQ(0, chdir("/opt/turfwar"));
   putenv("TMPDIR=/opt/turfwar/tmp");
 
   if ((g_blackhole.fd = socket(AF_UNIX, SOCK_DGRAM, 0)) == -1) {
@@ -2135,6 +1981,13 @@ int main(int argc, char *argv[]) {
     npassert(2 == open("turfwar.log", O_CREAT | O_WRONLY | O_APPEND, 0644));
   }
 
+  LOG("Generating Hilbert Curve...\n");
+  for (int i = 0; i < YN * XN; ++i) {
+    axdx_t h = unhilbert(XN, i);
+    g_hilbert[i][0] = h.ax;
+    g_hilbert[i][1] = h.dx;
+  }
+
   // library init
   sqlite3_initialize();
   CheckDatabase();
@@ -2145,6 +1998,9 @@ int main(int argc, char *argv[]) {
 
   // server lifecycle locks
   g_started = timespec_real();
+  for (int i = 0; i < ARRAYLEN(g_shutdown); ++i) {
+    g_shutdown[i] = nsync_note_new(0, nsync_time_no_deadline);
+  }
 
   // load static assets into memory and pre-zip them
   g_asset.index = LoadAsset("index.html", "text/html; charset=utf-8", 900);
@@ -2154,11 +2010,11 @@ int main(int argc, char *argv[]) {
 
   // sandbox ourselves
   __pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
-  npassert(!unveil("/opt/turfwar", "rwc"));
-  npassert(!unveil(0, 0));
+  CHECK_EQ(0, unveil("/opt/turfwar", "rwc"));
+  CHECK_EQ(0, unveil(0, 0));
   if (!IsOpenbsd()) {
     // TODO(jart): why isn't pledge working on openbsd?
-    npassert(!pledge("stdio flock rpath wpath cpath inet", 0));
+    CHECK_EQ(0, pledge("stdio flock rpath wpath cpath inet", 0));
   }
 
   // shutdown signals
@@ -2169,89 +2025,83 @@ int main(int argc, char *argv[]) {
   sigaction(SIGHUP, &sa, 0);
   sigaction(SIGINT, &sa, 0);
   sigaction(SIGTERM, &sa, 0);
+  sa.sa_handler = IgnoreSignal;
+  sigaction(SIGUSR1, &sa, 0);
 
-  time_init();
+  // make 9 helper threads
+  g_ready = nsync_counter_new(10);
+  pthread_t scorer, recenter, claimer, nower, replenisher, plotter;
+  pthread_t scorer_hour, scorer_day, scorer_week, scorer_month;
+  CHECK_EQ(0, pthread_create(&scorer, 0, ScoreWorker, 0));
+  CHECK_EQ(0, pthread_create(&scorer_hour, 0, ScoreHourWorker, 0));
+  CHECK_EQ(0, pthread_create(&scorer_day, 0, ScoreDayWorker, 0));
+  CHECK_EQ(0, pthread_create(&scorer_week, 0, ScoreWeekWorker, 0));
+  CHECK_EQ(0, pthread_create(&scorer_month, 0, ScoreMonthWorker, 0));
+  CHECK_EQ(0, pthread_create(&replenisher, 0, ReplenishWorker, 0));
+  CHECK_EQ(0, pthread_create(&recenter, 0, RecentWorker, 0));
+  CHECK_EQ(0, pthread_create(&claimer, 0, ClaimWorker, 0));
+  CHECK_EQ(0, pthread_create(&plotter, 0, PlotWorker, 0));
+  CHECK_EQ(0, pthread_create(&nower, 0, NowWorker, 0));
 
-  sigset_t thmask;
-  sigfillset(&thmask);
-  sigdelset(&thmask, SIGABRT);
-  sigdelset(&thmask, SIGTRAP);
-  sigdelset(&thmask, SIGFPE);
-  sigdelset(&thmask, SIGBUS);
-  sigdelset(&thmask, SIGSEGV);
-  sigdelset(&thmask, SIGILL);
-  sigdelset(&thmask, SIGXCPU);
-  sigdelset(&thmask, SIGXFSZ);
+  // wait for helper threads to warm up creating assets
+  if (nsync_counter_add(g_ready, -1)) {  // #10
+    nsync_counter_wait(g_ready, nsync_time_no_deadline);
+  }
 
-  pthread_attr_t attr;
-  pthread_attr_init(&attr);
-  pthread_attr_setsigmask_np(&attr, &thmask);
-  pthread_attr_setstacksize(&attr, 128 * 1024);
-  pthread_attr_setguardsize(&attr, sysconf(_SC_PAGESIZE));
-  pthread_attr_setsigaltstacksize_np(&attr, sysconf(_SC_MINSIGSTKSZ) + 32768);
-  npassert(!pthread_create(&scorer, &attr, ScoreWorker, 0));
-  npassert(!pthread_create(&scorer_hour, &attr, ScoreHourWorker, 0));
-  npassert(!pthread_create(&scorer_day, &attr, ScoreDayWorker, 0));
-  npassert(!pthread_create(&scorer_week, &attr, ScoreWeekWorker, 0));
-  npassert(!pthread_create(&scorer_month, &attr, ScoreMonthWorker, 0));
-  npassert(!pthread_create(&replenisher, &attr, ReplenishWorker, 0));
-  npassert(!pthread_create(&recenter, &attr, RecentWorker, 0));
-  npassert(!pthread_create(&claimer, &attr, ClaimWorker, 0));
-  npassert(!pthread_create(&g_listener, &attr, ListenWorker, 0));
-  unassert((g_worker = calloc(g_workers, sizeof(*g_worker))));
-  for (intptr_t i = 0; i < g_workers; ++i)
-    npassert(!pthread_create(&g_worker[i].th, &attr, HttpWorker, (void *)i));
-  pthread_attr_destroy(&attr);
+  // create one thread to listen
+  CHECK_EQ(0, pthread_create(&g_listener, 0, ListenWorker, 0));
+
+  // create lots of http workers to serve those assets
+  LOG("Online\n");
+  g_worker = xcalloc(g_workers, sizeof(*g_worker));
+  for (intptr_t i = 0; i < g_workers; ++i) {
+    CHECK_EQ(0, pthread_create(&g_worker[i].th, 0, HttpWorker, (void *)i));
+  }
 
   // time to serve
-  LOG("%H ready\n");
+  LOG("Ready\n");
   Supervisor(0);
 
-  // cancel listen()
-  LOG("%H interrupting services...\n");
-  pthread_cancel(scorer);
-  pthread_cancel(recenter);
-  pthread_cancel(g_listener);
-  pthread_cancel(scorer_day);
-  pthread_cancel(scorer_hour);
-  pthread_cancel(scorer_week);
-  pthread_cancel(scorer_month);
-  pthread_cancel(replenisher);
-
-  LOG("%H joining services...\n");
-  unassert(!pthread_join(scorer, 0));
-  unassert(!pthread_join(recenter, 0));
-  unassert(!pthread_join(g_listener, 0));
-  unassert(!pthread_join(scorer_day, 0));
-  unassert(!pthread_join(scorer_hour, 0));
-  unassert(!pthread_join(scorer_week, 0));
-  unassert(!pthread_join(scorer_month, 0));
-  unassert(!pthread_join(replenisher, 0));
+  // cancel listen() so we stop accepting new clients
+  LOG("Interrupting listen...\n");
+  pthread_kill(g_listener, SIGUSR1);
+  pthread_join(g_listener, 0);
 
   // cancel read() so that keepalive clients finish faster
-  LOG("%H interrupting workers...\n");
-  for (int i = 0; i < g_workers; ++i)
-    if (!g_worker[i].dead)
-      pthread_cancel(g_worker[i].th);
+  LOG("Interrupting workers...\n");
+  for (int i = 0; i < g_workers; ++i) {
+    pthread_kill(g_worker[i].th, SIGUSR1);
+  }
 
   // wait for producers to finish
-  LOG("%H joining workers...\n");
-  for (int i = 0; i < g_workers; ++i)
-    unassert(!pthread_join(g_worker[i].th, 0));
+  LOG("Waiting for workers to finish...\n");
+  for (int i = 0; i < g_workers; ++i) {
+    CHECK_EQ(0, pthread_join(g_worker[i].th, 0));
+  }
+  LOG("Waiting for helpers to finish...\n");
+  CHECK_EQ(0, pthread_join(nower, 0));
+  CHECK_EQ(0, pthread_join(scorer, 0));
+  CHECK_EQ(0, pthread_join(plotter, 0));
+  CHECK_EQ(0, pthread_join(recenter, 0));
+  CHECK_EQ(0, pthread_join(scorer_day, 0));
+  CHECK_EQ(0, pthread_join(scorer_hour, 0));
+  CHECK_EQ(0, pthread_join(scorer_week, 0));
+  CHECK_EQ(0, pthread_join(scorer_month, 0));
+  CHECK_EQ(0, pthread_join(replenisher, 0));
 
   // now that all workers have terminated, the claims queue must be
   // empty, therefore, it is now safe to send a cancellation to the
   // claims worker thread which waits forever for new claims.
-  unassert(!g_claims.count);
-  pthread_cancel(claimer);
-  LOG("%H waiting for claims worker...\n");
-  unassert(!pthread_join(claimer, 0));
+  CHECK_EQ(0, g_claims.count);
+  LOG("waiting for claims worker...\n");
+  nsync_note_notify(g_shutdown[2]);
+  CHECK_EQ(0, pthread_join(claimer, 0));
 
   // perform some sanity checks
-  unassert(g_claimsprocessed == g_claimsenqueued);
+  CHECK_EQ(g_claimsprocessed, g_claimsenqueued);
 
   // free memory
-  LOG("%H freeing memory...\n");
+  LOG("Freeing memory...\n");
   FreeAsset(&g_asset.user);
   FreeAsset(&g_asset.about);
   FreeAsset(&g_asset.index);
@@ -2262,11 +2112,13 @@ int main(int argc, char *argv[]) {
   FreeAsset(&g_asset.score_month);
   FreeAsset(&g_asset.recent);
   FreeAsset(&g_asset.favicon);
+  for (int i = 0; i < ARRAYLEN(g_shutdown); ++i) {
+    nsync_note_free(g_shutdown[i]);
+  }
+  nsync_counter_free(g_ready);
   free(g_worker);
   free(g_tok.b);
 
-  time_destroy();
-
-  LOG("%H goodbye\n");
+  LOG("Goodbye\n");
   // CheckForMemoryLeaks();
 }
diff --git a/test/ctl/BUILD.mk b/test/ctl/BUILD.mk
index a6944bbf8..6b36e766d 100644
--- a/test/ctl/BUILD.mk
+++ b/test/ctl/BUILD.mk
@@ -20,7 +20,7 @@ TEST_CTL_DIRECTDEPS =				\
 	LIBC_NEXGEN32E				\
 	LIBC_PROC				\
 	LIBC_STDIO				\
-	LIBC_SYSTEM				\
+	LIBC_STDIO				\
 	LIBC_THREAD				\
 	THIRD_PARTY_LIBCXX			\
 	THIRD_PARTY_LIBCXXABI			\
diff --git a/test/ctl/set_bench.cc b/test/ctl/set_bench.cc
index 4cc1f31e6..4c565848e 100644
--- a/test/ctl/set_bench.cc
+++ b/test/ctl/set_bench.cc
@@ -22,12 +22,26 @@
 #include "libc/mem/leaks.h"
 #include "libc/stdio/stdio.h"
 #include "libc/sysv/consts/rusage.h"
-#include "libc/testlib/benchmark.h"
 
 // #include <set>
 // #define ctl std
 // #define check() size()
 
+#define BENCH(ITERATIONS, WORK_PER_RUN, CODE) \
+    do { \
+        struct timespec start = timespec_real(); \
+        for (int __i = 0; __i < ITERATIONS; ++__i) { \
+            asm volatile("" ::: "memory"); \
+            CODE; \
+        } \
+        long long work = (WORK_PER_RUN) * (ITERATIONS); \
+        double nanos = \
+          (timespec_tonanos(timespec_sub(timespec_real(), start)) + work - \
+           1) / \
+          (double)work; \
+        printf("%10g ns %2dx %s\n", nanos, (ITERATIONS), #CODE); \
+    } while (0)
+
 int
 rand32(void)
 {
@@ -54,19 +68,19 @@ main()
     {
         long x = 0;
         ctl::set<long> s;
-        BENCHMARK(1000000, 1, s.insert(rand32() % 1000000));
+        BENCH(1000000, 1, s.insert(rand32() % 1000000));
         // s.check();
-        BENCHMARK(1000000, 1, {
+        BENCH(1000000, 1, {
             auto i = s.find(rand32() % 1000000);
             if (i != s.end())
                 x += *i;
         });
-        BENCHMARK(1000000, 1, {
+        BENCH(1000000, 1, {
             auto i = s.lower_bound(rand32() % 1000000);
             if (i != s.end())
                 x += *i;
         });
-        BENCHMARK(1000000, 1, s.erase(rand32() % 1000000));
+        BENCH(1000000, 1, s.erase(rand32() % 1000000));
         eat(x);
     }
 
diff --git a/test/ctl/shared_ptr_test.cc b/test/ctl/shared_ptr_test.cc
deleted file mode 100644
index f9a8dd597..000000000
--- a/test/ctl/shared_ptr_test.cc
+++ /dev/null
@@ -1,313 +0,0 @@
-// -*- mode:c++; indent-tabs-mode:nil; c-basic-offset:4; coding:utf-8 -*-
-// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
-//
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include "ctl/is_same.h"
-#include "ctl/shared_ptr.h"
-#include "ctl/vector.h"
-#include "libc/mem/leaks.h"
-
-// #include <memory>
-// #include <vector>
-// #define ctl std
-
-using ctl::bad_weak_ptr;
-using ctl::enable_shared_from_this;
-using ctl::make_shared;
-using ctl::move;
-using ctl::shared_ptr;
-using ctl::unique_ptr;
-using ctl::vector;
-using ctl::weak_ptr;
-
-#undef ctl
-
-static int g = 0;
-
-struct ConstructG
-{
-    ConstructG()
-    {
-        ++g;
-    }
-};
-
-struct DestructG
-{
-    ~DestructG()
-    {
-        ++g;
-    }
-};
-
-struct CallG
-{
-    void operator()(auto*) const noexcept
-    {
-        ++g;
-    }
-};
-
-struct Base
-{};
-
-struct Derived : Base
-{};
-
-class SharedThis : public enable_shared_from_this<SharedThis>
-{
-    struct Private
-    {
-        explicit Private() = default;
-    };
-
-  public:
-    SharedThis(Private)
-    {
-    }
-
-    static shared_ptr<SharedThis> create()
-    {
-        return make_shared<SharedThis>(Private());
-    }
-};
-
-class CanShareThis : public enable_shared_from_this<CanShareThis>
-{};
-
-// Sample class used to demonstrate the CTL shared_ptr's weak_self feature.
-struct Tree : ctl::weak_self_base
-{
-    ctl::shared_ptr<Tree> l, r;
-    ctl::weak_ptr<Tree> p;
-    Tree(ctl::weak_ptr<Tree> const& self, auto&& l2, auto&& r2)
-      : l(ctl::forward<decltype(l2)>(l2)), r(ctl::forward<decltype(r2)>(r2))
-    {
-        if (l)
-            l->p = self;
-        if (r)
-            r->p = self;
-    }
-};
-
-int
-main()
-{
-    int a, b;
-
-    {
-        // Shouldn't cause memory leaks.
-        shared_ptr<int> x(new int(5));
-    }
-
-    {
-        // Objects get destroyed when the last shared_ptr is reset.
-        shared_ptr<int> x(&a, CallG());
-        shared_ptr<int> y(x);
-        x.reset();
-        if (g)
-            return 1;
-        y.reset();
-        if (g != 1)
-            return 2;
-    }
-
-    {
-        g = 0;
-        // Weak pointers don't prevent object destruction.
-        shared_ptr<int> x(&a, CallG());
-        weak_ptr<int> y(x);
-        x.reset();
-        if (g != 1)
-            return 3;
-    }
-
-    {
-        g = 0;
-        // Weak pointers can be promoted to shared pointers.
-        shared_ptr<int> x(&a, CallG());
-        weak_ptr<int> y(x);
-        auto z = y.lock();
-        x.reset();
-        if (g)
-            return 4;
-        y.reset();
-        if (g)
-            return 5;
-        z.reset();
-        if (g != 1)
-            return 6;
-    }
-
-    {
-        // Shared null pointers are falsey.
-        shared_ptr<int> x;
-        if (x)
-            return 7;
-        x.reset(new int);
-        if (!x)
-            return 8;
-    }
-
-    {
-        // You can cast a shared pointer validly.
-        shared_ptr<Derived> x(new Derived);
-        shared_ptr<Base> y(x);
-        // But not invalidly:
-        // shared_ptr<Base> x(new Derived);
-        // shared_ptr<Derived> y(x);
-    }
-
-    {
-        // You can cast a shared pointer to void to retain a reference.
-        shared_ptr<int> x(new int);
-        shared_ptr<void> y(x);
-    }
-
-    {
-        // You can also create a shared pointer to void in the first place.
-        shared_ptr<void> x(new int);
-    }
-
-    {
-        // You can take a shared pointer to a subobject, and it will free the
-        // base object.
-        shared_ptr<vector<int>> x(new vector<int>);
-        x->push_back(5);
-        shared_ptr<int> y(x, &x->at(0));
-        x.reset();
-        if (*y != 5)
-            return 9;
-    }
-
-    {
-        g = 0;
-        // You can create a shared_ptr from a unique_ptr.
-        unique_ptr<int, CallG> x(&a, CallG());
-        shared_ptr<int> y(move(x));
-        if (x)
-            return 10;
-        y.reset();
-        if (g != 1)
-            return 11;
-    }
-
-    {
-        g = 0;
-        // You can reassign shared_ptrs.
-        shared_ptr<int> x(&a, CallG());
-        shared_ptr<int> y;
-        y = x;
-        x.reset();
-        if (g)
-            return 12;
-        y.reset();
-        if (g != 1)
-            return 13;
-    }
-
-    {
-        // owner_before shows equivalence only for equivalent objects.
-        shared_ptr<int> x(&a, CallG());
-        shared_ptr<int> y(&b, CallG());
-        shared_ptr<void> z(x, &b);
-        if (z.owner_before(x) || x.owner_before(z))
-            return 14;
-        if (!z.owner_before(y) && !y.owner_before(z))
-            return 15;
-    }
-
-    {
-        // Use counts work like you'd expect
-        shared_ptr<int> x(new int);
-        if (x.use_count() != 1)
-            return 16;
-        shared_ptr<int> y(x);
-        if (x.use_count() != 2 || y.use_count() != 2)
-            return 17;
-        x.reset();
-        if (x.use_count() != 0 || y.use_count() != 1)
-            return 18;
-    }
-
-    {
-        // There is a make_shared that will allocate an object for you safely.
-        auto x = make_shared<int>(5);
-        if (!x)
-            return 19;
-        if (*x != 5)
-            return 20;
-    }
-
-    {
-        // Expired weak pointers lock to nullptr, and throw when promoted to
-        // shared pointer by constructor.
-        auto x = make_shared<int>();
-        weak_ptr<int> y(x);
-        x.reset();
-        if (y.lock())
-            return 21;
-        int caught = 0;
-        try {
-            shared_ptr<int> z(y);
-        } catch (bad_weak_ptr& e) {
-            caught = 1;
-        }
-        if (!caught)
-            return 22;
-    }
-
-    {
-        // nullptr is always expired.
-        shared_ptr<int> x(nullptr);
-        weak_ptr<int> y(x);
-        if (!y.expired())
-            return 23;
-    }
-
-    {
-        // enable_shared_from_this allows shared pointers to self.
-        auto x = SharedThis::create();
-        auto y = x->shared_from_this();
-        if (x.use_count() != 2 || x.get() != y.get())
-            return 24;
-        auto z = new CanShareThis();
-        auto w = shared_ptr<CanShareThis>(z);
-        auto v = w->shared_from_this();
-        if (w.use_count() != 2 || w.get() != v.get())
-            return 25;
-    }
-
-    if constexpr (ctl::is_same_v<shared_ptr<Tree>, ctl::shared_ptr<Tree>>) {
-        // Exercise our off-STL make_shared with weak self support.
-        auto t = ctl::make_shared<Tree>(
-          ctl::make_shared<Tree>(ctl::make_shared<Tree>(nullptr, nullptr),
-                                 nullptr),
-          ctl::make_shared<Tree>(nullptr, nullptr));
-        auto t2 = t->l->l->p.lock()->p.lock();
-        if (t.owner_before(t2) || t2.owner_before(t))
-            return 26;
-        if (!t.owner_before(t->l) && !t->l.owner_before(t))
-            return 27;
-    }
-
-    // TODO(mrdomino): exercise threads / races. The reference count should be
-    // atomically maintained.
-
-    CheckForMemoryLeaks();
-    return 0;
-}
diff --git a/test/ctl/string_bench.cc b/test/ctl/string_bench.cc
index b84aa98a4..b7b30c935 100644
--- a/test/ctl/string_bench.cc
+++ b/test/ctl/string_bench.cc
@@ -18,121 +18,128 @@
 
 #include "ctl/string.h"
 #include "ctl/utility.h"
-#include "libc/dce.h"
 #include "libc/mem/leaks.h"
-#include "libc/testlib/benchmark.h"
 
 #include "libc/calls/struct/timespec.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 
+#define BENCH(ITERATIONS, WORK_PER_RUN, CODE) \
+    do { \
+        struct timespec start = timespec_real(); \
+        for (int __i = 0; __i < ITERATIONS; ++__i) { \
+            asm volatile("" ::: "memory"); \
+            CODE; \
+        } \
+        long long work = (WORK_PER_RUN) * (ITERATIONS); \
+        double nanos = \
+          (timespec_tonanos(timespec_sub(timespec_real(), start)) + work - \
+           1) / \
+          (double)work; \
+        printf("%10g ns %2dx %s\n", nanos, (ITERATIONS), #CODE); \
+    } while (0)
+
 const char* big_c = "aaaaaaaaaaaaaaaaaaaaaaaa";
 const char* small_c = "aaaaaaaaaaaaaaaaaaaaaaa";
 
-#if IsModeDbg()
-#define ITERATIONS 1000 // because qemu in dbg mode is very slow
-#else
-#define ITERATIONS 1000000
-#endif
-
 int
 main()
 {
     const ctl::string_view big(big_c), small(small_c);
 
-    BENCHMARK(ITERATIONS * 10, 1, {
+    BENCH(10000000, 1, {
         ctl::string s;
         s.append("hello ");
         s.append("world");
     });
 
-    BENCHMARK(ITERATIONS, 8, {
+    BENCH(1000000, 8, {
         ctl::string s;
         for (int i = 0; i < 8; ++i) {
             s.append('a');
         }
     });
 
-    BENCHMARK(ITERATIONS, 16, {
+    BENCH(1000000, 16, {
         ctl::string s;
         for (int i = 0; i < 16; ++i) {
             s.append('a');
         }
     });
 
-    BENCHMARK(ITERATIONS, 23, {
+    BENCH(1000000, 23, {
         ctl::string s;
         for (int i = 0; i < 23; ++i) {
             s.append('a');
         }
     });
 
-    BENCHMARK(ITERATIONS, 24, {
+    BENCH(1000000, 24, {
         ctl::string s;
         for (int i = 0; i < 24; ++i) {
             s.append('a');
         }
     });
 
-    BENCHMARK(ITERATIONS, 32, {
+    BENCH(1000000, 32, {
         ctl::string s;
         for (int i = 0; i < 32; ++i) {
             s.append('a');
         }
     });
 
-    BENCHMARK(ITERATIONS, 1, { ctl::string s(small_c); });
+    BENCH(1000000, 1, { ctl::string s(small_c); });
 
-    BENCHMARK(ITERATIONS, 1, { ctl::string s(small); });
+    BENCH(1000000, 1, { ctl::string s(small); });
 
     {
         ctl::string small_copy("hello world");
-        BENCHMARK(ITERATIONS, 1, { ctl::string s2(small_copy); });
+        BENCH(1000000, 1, { ctl::string s2(small_copy); });
     }
 
-    BENCHMARK(ITERATIONS, 1, {
+    BENCH(1000000, 1, {
         ctl::string s(small);
         ctl::string s2(ctl::move(s));
     });
 
-    BENCHMARK(ITERATIONS, 1, {
+    BENCH(1000000, 1, {
         ctl::string s(small);
         ctl::string s2(s);
     });
 
-    BENCHMARK(ITERATIONS, 1, { ctl::string s(big_c); });
+    BENCH(1000000, 1, { ctl::string s(big_c); });
 
-    BENCHMARK(ITERATIONS, 1, { ctl::string s(big); });
+    BENCH(1000000, 1, { ctl::string s(big); });
 
     {
         ctl::string big_copy(big);
-        BENCHMARK(ITERATIONS, 1, { ctl::string s2(big_copy); });
+        BENCH(1000000, 1, { ctl::string s2(big_copy); });
     }
 
-    BENCHMARK(ITERATIONS, 1, {
+    BENCH(1000000, 1, {
         ctl::string s(big);
         ctl::string s2(ctl::move(s));
     });
 
-    BENCHMARK(ITERATIONS, 1, {
+    BENCH(1000000, 1, {
         ctl::string s(big);
         ctl::string s2(s);
     });
 
-    BENCHMARK(ITERATIONS, 1, { ctl::string s(23, 'a'); });
+    BENCH(1000000, 1, { ctl::string s(23, 'a'); });
 
-    BENCHMARK(ITERATIONS, 1, { ctl::string s(24, 'a'); });
+    BENCH(1000000, 1, { ctl::string s(24, 'a'); });
 
     {
         ctl::string s(5, 'a');
-        BENCHMARK(ITERATIONS, 1, { ctl::string_view s2(s); });
+        BENCH(1000000, 1, { ctl::string_view s2(s); });
     }
 
     {
         ctl::string big_trunc(48, 'a');
         big_trunc.resize(4);
-        BENCHMARK(ITERATIONS, 1, { ctl::string s(big_trunc); });
+        BENCH(1000000, 1, { ctl::string s(big_trunc); });
     }
 
     CheckForMemoryLeaks();
diff --git a/test/ctl/string_test.cc b/test/ctl/string_test.cc
index 6319a82da..90b19e9c8 100644
--- a/test/ctl/string_test.cc
+++ b/test/ctl/string_test.cc
@@ -404,69 +404,5 @@ main()
         }
     }
 
-    {
-        String s = "love";
-        String b;
-        b.append(s, 1, 2);
-        if (b != "ov")
-            return 107;
-    }
-
-    {
-        String s = "ee";
-        if (s.find_last_of('E') != String::npos)
-            return 108;
-        if (s.find_last_of('e') != 1)
-            return 109;
-    }
-
-    {
-        String e = "";
-        String s = "ee";
-        if (e.find_last_of("") != String::npos)
-            return 110;
-        if (s.find_last_of("") != String::npos)
-            return 111;
-        if (s.find_last_of("AE") != String::npos)
-            return 112;
-        if (s.find_last_of("ae") != 1)
-            return 113;
-        if (s.find_last_of("ae", 1) != 1)
-            return 114;
-        if (s.find_last_of("ae", 0) != 0)
-            return 115;
-        if (s.find_last_of("ae", 10) != 1)
-            return 116;
-    }
-
-    {
-        String s = "ee";
-        if (s.find_first_of('E') != String::npos)
-            return 117;
-        if (s.find_first_of('e') != 0)
-            return 118;
-        if (s.find_first_of('e', 1) != 1)
-            return 119;
-    }
-
-    {
-        String e = "";
-        String s = "ee";
-        if (e.find_first_of("") != String::npos)
-            return 120;
-        if (s.find_first_of("") != String::npos)
-            return 121;
-        if (s.find_first_of("AE") != String::npos)
-            return 122;
-        if (s.find_first_of("ae") != 0)
-            return 123;
-        if (s.find_first_of("ae", 1) != 1)
-            return 124;
-        if (s.find_first_of("ae", 0) != 0)
-            return 125;
-        if (s.find_first_of("ae", 10) != String::npos)
-            return 126;
-    }
-
     CheckForMemoryLeaks();
 }
diff --git a/test/ctl/string_view_test.cc b/test/ctl/string_view_test.cc
index a371fdfb5..a82743e15 100644
--- a/test/ctl/string_view_test.cc
+++ b/test/ctl/string_view_test.cc
@@ -16,7 +16,6 @@
 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 // PERFORMANCE OF THIS SOFTWARE.
 
-#include "ctl/string.h"
 #include "ctl/string_view.h"
 #include "libc/mem/leaks.h"
 #include "libc/str/str.h"
@@ -173,70 +172,5 @@ main(int argc, char* argv[])
             return 32;
     }
 
-    {
-        ctl::string b;
-        const ctl::string_view s = "hi";
-        for (char c : s)
-            b += c;
-        if (b != "hi")
-            return 2;
-    }
-
-    {
-        ctl::string_view s = "ee";
-        if (s.find_last_of('E') != ctl::string_view::npos)
-            return 108;
-        if (s.find_last_of('e') != 1)
-            return 109;
-    }
-
-    {
-        ctl::string_view e = "";
-        ctl::string_view s = "ee";
-        if (e.find_last_of("") != ctl::string_view::npos)
-            return 110;
-        if (s.find_last_of("") != ctl::string_view::npos)
-            return 111;
-        if (s.find_last_of("AE") != ctl::string_view::npos)
-            return 112;
-        if (s.find_last_of("ae") != 1)
-            return 113;
-        if (s.find_last_of("ae", 1) != 1)
-            return 114;
-        if (s.find_last_of("ae", 0) != 0)
-            return 115;
-        if (s.find_last_of("ae", 10) != 1)
-            return 116;
-    }
-
-    {
-        ctl::string_view s = "ee";
-        if (s.find_first_of('E') != ctl::string_view::npos)
-            return 117;
-        if (s.find_first_of('e') != 0)
-            return 118;
-        if (s.find_first_of('e', 1) != 1)
-            return 119;
-    }
-
-    {
-        ctl::string_view e = "";
-        ctl::string_view s = "ee";
-        if (e.find_first_of("") != ctl::string_view::npos)
-            return 120;
-        if (s.find_first_of("") != ctl::string_view::npos)
-            return 121;
-        if (s.find_first_of("AE") != ctl::string_view::npos)
-            return 122;
-        if (s.find_first_of("ae") != 0)
-            return 123;
-        if (s.find_first_of("ae", 1) != 1)
-            return 124;
-        if (s.find_first_of("ae", 0) != 0)
-            return 125;
-        if (s.find_first_of("ae", 10) != ctl::string_view::npos)
-            return 126;
-    }
-
     CheckForMemoryLeaks();
 }
diff --git a/test/ctl/to_string_test.cc b/test/ctl/to_string_test.cc
index 8970d0a0f..69841c7ad 100644
--- a/test/ctl/to_string_test.cc
+++ b/test/ctl/to_string_test.cc
@@ -116,12 +116,10 @@ main()
             return 31;
         if (ctl::to_string(3.14L) != "3.14")
             return 32;
-#if LDBL_MANT_DIG > 64
         if (ctl::to_string(LDBL_MAX) != "1.189731495357232e+4932")
             return 33;
         if (ctl::to_string(-LDBL_MAX) != "-1.189731495357232e+4932")
             return 34;
-#endif
     }
 
     CheckForMemoryLeaks();
diff --git a/test/ctl/unique_ptr_test.cc b/test/ctl/unique_ptr_test.cc
index 239902b83..75ed674ba 100644
--- a/test/ctl/unique_ptr_test.cc
+++ b/test/ctl/unique_ptr_test.cc
@@ -24,44 +24,43 @@
 // #include <type_traits>
 // #define ctl std
 
-using ctl::unique_ptr;
-using ctl::make_unique;
-using ctl::make_unique_for_overwrite;
+template<typename T, typename D = ctl::default_delete<T>>
+using Ptr = ctl::unique_ptr<T, D>;
 
-#undef ctl
+template<typename T, typename... Args>
+Ptr<T>
+Mk(Args&&... args)
+{
+    return ctl::make_unique<T, Args...>(ctl::forward<Args>(args)...);
+}
+
+template<typename T>
+Ptr<T>
+MkRaw()
+{
+    return ctl::make_unique_for_overwrite<T>();
+}
+
+// #undef ctl
 
-// The following few definitions are used to get observability into aspects of
-// an object's lifecycle, to make sure that e.g. constructing a unique_ptr of a
-// type does not construct an object, and that make_unique does construct an
-// object.
 static int g = 0;
 
-struct ConstructG
-{
-    ConstructG()
-    {
-        ++g;
-    }
-};
-
-struct DestructG
-{
-    ~DestructG()
-    {
-        ++g;
-    }
-};
-
-struct CallG
+struct SetsGDeleter
 {
     void operator()(auto* x) const noexcept
     {
         ++g;
+        delete x;
     }
 };
 
-// A unique_ptr with an empty deleter should be the same size as a raw pointer.
-static_assert(sizeof(unique_ptr<int, decltype([] {})>) == sizeof(int*));
+struct StatefulDeleter
+{
+    char state;
+    void operator()(auto* x) const noexcept
+    {
+    }
+};
 
 struct FinalDeleter final
 {
@@ -70,10 +69,27 @@ struct FinalDeleter final
     }
 };
 
-// ctl::unique_ptr does not need to inherit from its deleter for this property;
-// the STL often does, though, so we don't hold them to the following.
-static_assert(!ctl::is_same_v<unique_ptr<int>, ctl::unique_ptr<int>> ||
-              sizeof(unique_ptr<int, FinalDeleter>) == sizeof(int*));
+static_assert(sizeof(Ptr<int, SetsGDeleter>) == sizeof(int*));
+
+// not everyone uses [[no_unique_address]]...
+static_assert(!ctl::is_same_v<Ptr<int>, ctl::unique_ptr<int>> ||
+              sizeof(Ptr<int, FinalDeleter>) == sizeof(int*));
+
+struct SetsGCtor
+{
+    SetsGCtor()
+    {
+        ++g;
+    }
+};
+
+struct SetsGDtor
+{
+    ~SetsGDtor()
+    {
+        ++g;
+    }
+};
 
 struct Base
 {};
@@ -84,16 +100,13 @@ struct Derived : Base
 int
 main()
 {
-    int a;
 
     {
-        // Shouldn't cause any memory leaks.
-        unique_ptr<int> x(new int(5));
+        Ptr<int> x(new int(5));
     }
 
     {
-        // Deleter is called if the pointer is non-null when reset.
-        unique_ptr<int, CallG> x(&a);
+        Ptr<int, SetsGDeleter> x(new int());
         x.reset();
         if (g != 1)
             return 1;
@@ -101,45 +114,22 @@ main()
 
     {
         g = 0;
-        // Deleter is not called if the pointer is null when reset.
-        unique_ptr<int, CallG> x(&a);
-        x.release();
+        Ptr<int, SetsGDeleter> x(new int());
+        delete x.release();
         x.reset();
         if (g)
             return 17;
     }
 
     {
-        g = 0;
-        // Deleter is called when the pointer goes out of scope.
-        {
-            unique_ptr<int, CallG> x(&a);
-        }
-        if (!g)
-            return 18;
-    }
-
-    {
-        g = 0;
-        // Deleter is called if scope ends exceptionally.
-        try {
-            unique_ptr<int, CallG> x(&a);
-            throw 'a';
-        } catch (char) {
-        }
-        if (!g)
-            return 19;
-    }
-
-    {
-        unique_ptr<int> x(new int(5)), y(new int(6));
+        Ptr<int> x(new int(5)), y(new int(6));
         x.swap(y);
         if (*x != 6 || *y != 5)
             return 2;
     }
 
     {
-        unique_ptr<int> x;
+        Ptr<int> x;
         if (x)
             return 3;
         x.reset(new int(5));
@@ -149,17 +139,17 @@ main()
 
     {
         g = 0;
-        unique_ptr<ConstructG> x;
+        Ptr<SetsGCtor> x;
         if (g)
             return 5;
-        x = make_unique<ConstructG>();
+        x = Mk<SetsGCtor>();
         if (g != 1)
             return 6;
     }
 
     {
         g = 0;
-        auto x = make_unique<DestructG>();
+        auto x = Mk<SetsGDtor>();
         if (g)
             return 7;
         x.reset();
@@ -171,9 +161,9 @@ main()
 
     {
         g = 0;
-        unique_ptr<DestructG> x, y;
-        x = make_unique<DestructG>();
-        y = make_unique<DestructG>();
+        Ptr<SetsGDtor> x, y;
+        x = Mk<SetsGDtor>();
+        y = Mk<SetsGDtor>();
 #if 0
         // shouldn't compile
         x = y;
@@ -188,7 +178,7 @@ main()
     {
         g = 0;
         {
-            auto x = make_unique<DestructG>();
+            auto x = Mk<SetsGDtor>();
         }
         if (g != 1)
             return 12;
@@ -197,7 +187,7 @@ main()
     {
         g = 0;
         {
-            auto x = make_unique<DestructG>();
+            auto x = Mk<SetsGDtor>();
             delete x.release();
         }
         if (g != 1)
@@ -209,13 +199,13 @@ main()
     // side effects it has are illegal to detect?
     {
         g = 0;
-        auto x = make_unique_for_overwrite<DefaultInitialized>();
+        auto x = MkRaw<DefaultInitialized>();
         if (g)
             return 14;
         x.reset();
         if (g)
             return 15;
-        x = make_unique<DefaultInitialized>();
+        x = Mk<DefaultInitialized>();
         if (g != 1)
             return 16;
     }
@@ -224,15 +214,16 @@ main()
     {
         int a;
         // Should compile.
-        unique_ptr<int, FinalDeleter> x(&a);
+        Ptr<int, FinalDeleter> x(&a);
+        Ptr<int, StatefulDeleter> y(&a);
     }
 
     {
-        unique_ptr<Base> x(new Base);
+        Ptr<Base> x(new Base);
         x.reset(new Derived);
 
-        unique_ptr<Derived> y(new Derived);
-        unique_ptr<Base> z(ctl::move(y));
+        Ptr<Derived> y(new Derived);
+        Ptr<Base> z(ctl::move(y));
     }
 
     CheckForMemoryLeaks();
diff --git a/test/dsp/core/alaw_test.c b/test/dsp/core/alaw_test.c
index 5f91bf176..ec434c8da 100644
--- a/test/dsp/core/alaw_test.c
+++ b/test/dsp/core/alaw_test.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/core/core.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/testlib/ezbench.h"
diff --git a/test/dsp/core/getintegercoefficients_test.c b/test/dsp/core/getintegercoefficients_test.c
index ec7cd03e3..4fa0e3009 100644
--- a/test/dsp/core/getintegercoefficients_test.c
+++ b/test/dsp/core/getintegercoefficients_test.c
@@ -19,7 +19,7 @@
 #include "dsp/core/core.h"
 #include "dsp/core/q.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/str/str.h"
 #include "libc/testlib/ezbench.h"
diff --git a/test/dsp/core/mulaw_test.c b/test/dsp/core/mulaw_test.c
index d2c0f2863..f19eda5c9 100644
--- a/test/dsp/core/mulaw_test.c
+++ b/test/dsp/core/mulaw_test.c
@@ -19,7 +19,7 @@
 #include "dsp/core/core.h"
 #include "dsp/core/ituround.h"
 #include "libc/assert.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
diff --git a/test/dsp/core/scalevolume_test.c b/test/dsp/core/scalevolume_test.c
index a6247fc54..3940a7055 100644
--- a/test/dsp/core/scalevolume_test.c
+++ b/test/dsp/core/scalevolume_test.c
@@ -17,9 +17,10 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/core/core.h"
+#include "dsp/mpeg/mpeg.h"
 #include "libc/limits.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/stdio/rand.h"
 #include "libc/testlib/ezbench.h"
diff --git a/test/dsp/scale/scale_test.c b/test/dsp/scale/scale_test.c
index b43368f43..be2ae9d6e 100644
--- a/test/dsp/scale/scale_test.c
+++ b/test/dsp/scale/scale_test.c
@@ -22,7 +22,7 @@
 #include "dsp/core/core.h"
 #include "dsp/core/half.h"
 #include "libc/fmt/bing.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/stdio.h"
diff --git a/test/libc/calls/cachestat_test.c b/test/libc/calls/cachestat_test.c
index 8f91781f6..63c3e8088 100644
--- a/test/libc/calls/cachestat_test.c
+++ b/test/libc/calls/cachestat_test.c
@@ -24,12 +24,11 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/sysconf.h"
 #include "libc/stdio/rand.h"
-#include "libc/stdio/stdio.h"
 #include "libc/sysv/consts/auxv.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/testlib/testlib.h"
@@ -51,9 +50,6 @@ void SetUpOnce(void) {
   // ASSERT_SYS(0, 0, pledge("stdio rpath wpath cpath", 0));
 }
 
-// TODO(jart): fix this test
-#if 0
-
 TEST(cachestat, testCachestatOnDevices) {
   const char *const files[] = {
       "/dev/zero", "/dev/null", "/dev/urandom", "/proc/version", "/proc",
@@ -67,8 +63,6 @@ TEST(cachestat, testCachestatOnDevices) {
   }
 }
 
-#endif
-
 TEST(cachestat, testCachestatAfterWrite) {
   size_t size = 4 * pagesize;
   char *data = gc(xmalloc(size));
@@ -110,14 +104,12 @@ done:
 }
 
 TEST(cachestat, testCachestatShmem) {
-  char name[64];
-  sprintf(name, "/cachestat_test-%ld", _rand64());
   size_t filesize = 512 * 2 * pagesize;  // 2 2MB huge pages.
   size_t compute_len = 512 * pagesize;
   unsigned long num_pages = compute_len / pagesize;
   char *data = gc(xmalloc(filesize));
   ASSERT_SYS(0, filesize, getrandom(data, filesize, 0));
-  ASSERT_SYS(0, 3, shm_open(name, O_CREAT | O_RDWR, 0600));
+  ASSERT_SYS(0, 3, shm_open("tmpshmcstat", O_CREAT | O_RDWR, 0600));
   ASSERT_SYS(0, 0, ftruncate(3, filesize));
   ASSERT_SYS(0, filesize, write(3, data, filesize));
   struct cachestat_range range = {pagesize, compute_len};
@@ -125,6 +117,6 @@ TEST(cachestat, testCachestatShmem) {
   ASSERT_SYS(0, 0, cachestat(3, &range, &cs, 0));
   ASSERT_EQ(num_pages, cs.nr_cache + cs.nr_evicted,
             "total number of cached and evicted pages is off.\n");
-  ASSERT_SYS(0, 0, shm_unlink(name));
+  ASSERT_SYS(0, 0, shm_unlink("tmpshmcstat"));
   ASSERT_SYS(0, 0, close(3));
 }
diff --git a/test/libc/calls/clock_getres_test.c b/test/libc/calls/clock_getres_test.c
index 27aa3a447..ad296e319 100644
--- a/test/libc/calls/clock_getres_test.c
+++ b/test/libc/calls/clock_getres_test.c
@@ -32,6 +32,13 @@ TEST(clock_getres, realtimeHasMillisecondPrecisionOrBetter) {
   EXPECT_GT(ts.tv_nsec, 0);
 }
 
+TEST(clock_getres, realtimeFastHasMillisecondPrecisionOrBetter) {
+  ASSERT_EQ(0, clock_getres(CLOCK_REALTIME_FAST, &ts));
+  EXPECT_EQ(0, ts.tv_sec);
+  EXPECT_LT(ts.tv_nsec, 1000000);
+  EXPECT_GT(ts.tv_nsec, 0);
+}
+
 TEST(clock_getres, realtimeCoarseHasMillisecondPrecisionOrBetter) {
   if (clock_getres(CLOCK_REALTIME_COARSE, &ts))
     return;
@@ -40,6 +47,14 @@ TEST(clock_getres, realtimeCoarseHasMillisecondPrecisionOrBetter) {
   EXPECT_GT(ts.tv_nsec, 0);
 }
 
+TEST(clock_getres, realtimePreciseHasMillisecondPrecisionOrBetter) {
+  if (clock_getres(CLOCK_REALTIME_PRECISE, &ts))
+    return;
+  EXPECT_EQ(0, ts.tv_sec);
+  EXPECT_LT(ts.tv_nsec, 100000000);
+  EXPECT_GT(ts.tv_nsec, 0);
+}
+
 TEST(clock_getres, monotonicHasMillisecondPrecisionOrBetter) {
   ASSERT_EQ(0, clock_getres(CLOCK_MONOTONIC, &ts));
   EXPECT_EQ(0, ts.tv_sec);
@@ -47,6 +62,13 @@ TEST(clock_getres, monotonicHasMillisecondPrecisionOrBetter) {
   EXPECT_GT(ts.tv_nsec, 0);
 }
 
+TEST(clock_getres, monotonicFastHasMillisecondPrecisionOrBetter) {
+  ASSERT_EQ(0, clock_getres(CLOCK_MONOTONIC_FAST, &ts));
+  EXPECT_EQ(0, ts.tv_sec);
+  EXPECT_LT(ts.tv_nsec, 1000000);
+  EXPECT_GT(ts.tv_nsec, 0);
+}
+
 TEST(clock_getres, monotonicCoarseHasMillisecondPrecisionOrBetter) {
   if (clock_getres(CLOCK_MONOTONIC_COARSE, &ts))
     return;
@@ -54,3 +76,11 @@ TEST(clock_getres, monotonicCoarseHasMillisecondPrecisionOrBetter) {
   EXPECT_LT(ts.tv_nsec, 100000000);
   EXPECT_GT(ts.tv_nsec, 0);
 }
+
+TEST(clock_getres, monotonicPreciseHasMillisecondPrecisionOrBetter) {
+  if (clock_getres(CLOCK_MONOTONIC_PRECISE, &ts))
+    return;
+  EXPECT_EQ(0, ts.tv_sec);
+  EXPECT_LT(ts.tv_nsec, 100000000);
+  EXPECT_GT(ts.tv_nsec, 0);
+}
diff --git a/test/libc/calls/clock_gettime_test.c b/test/libc/calls/clock_gettime_test.c
index 4c76c4d8a..c86a92abf 100644
--- a/test/libc/calls/clock_gettime_test.c
+++ b/test/libc/calls/clock_gettime_test.c
@@ -16,13 +16,20 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/internal.h"
 #include "libc/calls/struct/timespec.h"
+#include "libc/calls/struct/timespec.internal.h"
 #include "libc/calls/struct/timeval.h"
+#include "libc/calls/syscall_support-sysv.internal.h"
+#include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/nexgen32e/rdtsc.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sysv/consts/auxv.h"
 #include "libc/sysv/consts/clock.h"
-#include "libc/testlib/benchmark.h"
+#include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
+#include "libc/time.h"
 
 TEST(clock_gettime, nullResult_validatesClockParam) {
   ASSERT_SYS(EINVAL, -1, clock_gettime(666, 0));
@@ -44,19 +51,26 @@ TEST(clock_gettime, testClockRealtime) {
 }
 
 TEST(clock_gettime, bench) {
+  struct timeval tv;
   struct timespec ts;
-  BENCHMARK(1, 1, timespec_real());
-  BENCHMARK(1000, 1, timespec_real());
-  if (!clock_gettime(CLOCK_REALTIME, 0))
-    BENCHMARK(1000, 1, clock_gettime(CLOCK_REALTIME, &ts));
-  if (!clock_gettime(CLOCK_REALTIME_COARSE, 0))
-    BENCHMARK(1000, 1, clock_gettime(CLOCK_REALTIME_COARSE, &ts));
-  if (!clock_gettime(CLOCK_MONOTONIC, 0))
-    BENCHMARK(1000, 1, clock_gettime(CLOCK_MONOTONIC, &ts));
-  if (!clock_gettime(CLOCK_MONOTONIC_COARSE, 0))
-    BENCHMARK(1000, 1, clock_gettime(CLOCK_MONOTONIC_COARSE, &ts));
-  if (!clock_gettime(CLOCK_MONOTONIC_RAW, 0))
-    BENCHMARK(1000, 1, clock_gettime(CLOCK_MONOTONIC_RAW, &ts));
-  if (!clock_gettime(CLOCK_BOOTTIME, 0))
-    BENCHMARK(1000, 1, clock_gettime(CLOCK_BOOTTIME, &ts));
+  gettimeofday(&tv, 0);   // trigger init
+  clock_gettime(0, &ts);  // trigger init
+  EZBENCH2("rdtsc", donothing, rdtsc());
+  EZBENCH2("clock_gettime(mono)", donothing,
+           clock_gettime(CLOCK_MONOTONIC_FAST, &ts));
+  EZBENCH2("clock_gettime(real)", donothing,
+           clock_gettime(CLOCK_REALTIME_FAST, &ts));
+  EZBENCH2("timespec_real", donothing, timespec_real());
+  EZBENCH2("gettimeofday", donothing, gettimeofday(&tv, 0));
+  if (IsWindows()) {
+    EZBENCH2("sys_clock_gettime r", donothing,
+             sys_clock_gettime_nt(CLOCK_REALTIME_FAST, &ts));
+    EZBENCH2("sys_clock_gettime m", donothing,
+             sys_clock_gettime_nt(CLOCK_MONOTONIC_FAST, &ts));
+  } else {
+    EZBENCH2("sys_clock_gettime r", donothing,
+             sys_clock_gettime(CLOCK_REALTIME_FAST, &ts));
+    EZBENCH2("sys_clock_gettime m", donothing,
+             sys_clock_gettime(CLOCK_MONOTONIC_FAST, &ts));
+  }
 }
diff --git a/test/libc/calls/commandv_test.c b/test/libc/calls/commandv_test.c
index 043b3164e..f2681cd4c 100644
--- a/test/libc/calls/commandv_test.c
+++ b/test/libc/calls/commandv_test.c
@@ -100,6 +100,8 @@ TEST(commandv, test_DirPaths_wontConsiderDirectoriesExecutable2) {
 }
 
 TEST(commandv, test_nonExecutableFile_willEacces) {
+  if (IsWindows())
+    return;  // TODO: fixme
   setenv("PATH", "foo", true);
   EXPECT_SYS(0, 0, mkdir("foo", 0755));
   EXPECT_SYS(0, 0, touch("foo/bar", 0400));
diff --git a/test/libc/calls/dup_test.c b/test/libc/calls/dup_test.c
index 9421fecbb..cad66f18e 100644
--- a/test/libc/calls/dup_test.c
+++ b/test/libc/calls/dup_test.c
@@ -94,6 +94,7 @@ TEST(dup2, zipossrc) {
   ASSERT_SYS(0, 0, close(3));
 }
 
+#ifdef __x86_64__
 TEST(dup, clearsCloexecFlag) {
   static bool once;
   int ws;
@@ -111,3 +112,4 @@ TEST(dup, clearsCloexecFlag) {
   ASSERT_EQ(72 << 8, ws);
   ASSERT_SYS(0, 0, close(3));
 }
+#endif
diff --git a/test/libc/system/systemvpe_test.c b/test/libc/calls/fchmod_test.c
similarity index 79%
rename from test/libc/system/systemvpe_test.c
rename to test/libc/calls/fchmod_test.c
index bcbc5a30c..df8ffcc2b 100644
--- a/test/libc/system/systemvpe_test.c
+++ b/test/libc/calls/fchmod_test.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -17,18 +17,25 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/cosmo.h"
-#include "libc/runtime/runtime.h"
+#include "libc/calls/struct/stat.h"
 #include "libc/testlib/testlib.h"
 
 void SetUpOnce(void) {
   testlib_enable_tmp_setup_teardown();
 }
 
-TEST(systemvpe, test) {
-  ASSERT_SYS(0, 0, mkdir("bin", 0755));
-  ASSERT_SYS(0, 0, setenv("PATH", "bin", true));
-  testlib_extract("/zip/life", "bin/life", 0755);
-  ASSERT_SYS(0, 42 << 8,
-             systemvpe("life", (char *[]){"life", 0}, (char *[]){0}));
+uint32_t GetMode(int fd) {
+  struct stat st;
+  ASSERT_SYS(0, 0, fstat(fd, &st));
+  return st.st_mode & 0777;
+}
+
+TEST(fchmod, canChangeReadOnlyBit) {
+  ASSERT_SYS(0, 3, creat("foo", 0600));
+  ASSERT_EQ(0600, GetMode(3));
+  ASSERT_SYS(0, 0, fchmod(3, 0400));
+  ASSERT_EQ(0400, GetMode(3));
+  ASSERT_SYS(0, 0, fchmod(3, 0600));
+  ASSERT_EQ(0600, GetMode(3));
+  ASSERT_SYS(0, 0, close(3));
 }
diff --git a/test/libc/calls/fchmodat_test.c b/test/libc/calls/fchmodat_test.c
index e103e3569..cb6d99d40 100644
--- a/test/libc/calls/fchmodat_test.c
+++ b/test/libc/calls/fchmodat_test.c
@@ -20,7 +20,7 @@
 #include "libc/calls/struct/stat.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/at.h"
 #include "libc/sysv/consts/o.h"
diff --git a/test/libc/calls/fcntl_test.c b/test/libc/calls/fcntl_test.c
index d816d27fe..84675e66a 100644
--- a/test/libc/calls/fcntl_test.c
+++ b/test/libc/calls/fcntl_test.c
@@ -22,7 +22,7 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/f.h"
 #include "libc/sysv/consts/fd.h"
diff --git a/test/libc/calls/getcontext_test.c b/test/libc/calls/getcontext_test.c
index c80219c7f..35a9db833 100644
--- a/test/libc/calls/getcontext_test.c
+++ b/test/libc/calls/getcontext_test.c
@@ -18,7 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/sigset.h"
-#include "libc/calls/struct/ucontext.internal.h"
 #include "libc/calls/ucontext.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/sig.h"
@@ -61,7 +60,6 @@ TEST(getcontext, canReadAndWriteSignalMask) {
   ASSERT_EQ(0, getcontext(&context));
   if (!n) {
     n = 1;
-    context.uc_mcontext.RES0 = 0;
     ASSERT_TRUE(sigismember(&context.uc_sigmask, SIGUSR1));
     sigaddset(&context.uc_sigmask, SIGUSR2);
     setcontext(&context);
diff --git a/test/libc/calls/getcwd_test.c b/test/libc/calls/getcwd_test.c
index f76653466..0cd9919a6 100644
--- a/test/libc/calls/getcwd_test.c
+++ b/test/libc/calls/getcwd_test.c
@@ -22,7 +22,7 @@
 #include "libc/errno.h"
 #include "libc/fmt/libgen.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/serialize.h"
 #include "libc/str/str.h"
diff --git a/test/libc/calls/getgroups_test.c b/test/libc/calls/getgroups_test.c
index e19372297..7c221e899 100644
--- a/test/libc/calls/getgroups_test.c
+++ b/test/libc/calls/getgroups_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/testlib/testlib.h"
 
 TEST(getgroups, test) {
diff --git a/test/libc/calls/getrandom_test.c b/test/libc/calls/getrandom_test.c
index 42fb33c65..3e2386a7a 100644
--- a/test/libc/calls/getrandom_test.c
+++ b/test/libc/calls/getrandom_test.c
@@ -23,7 +23,7 @@
 #include "libc/calls/struct/sigset.h"
 #include "libc/errno.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
@@ -34,7 +34,7 @@
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/grnd.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/testlib/ezbench.h"
@@ -59,9 +59,11 @@ void *TortureWorker(void *arg) {
   ASSERT_SYS(0, 0, sigprocmask(SIG_SETMASK, &ss, 0));
   ready = true;
   while (!done) {
-    pthread_kill(parent, SIGUSR1);
+    if (!IsWindows())
+      pthread_kill(parent, SIGUSR1);
     usleep(1);
-    pthread_kill(parent, SIGUSR2);
+    if (!IsWindows())
+      pthread_kill(parent, SIGUSR2);
     usleep(1);
   }
   return 0;
diff --git a/test/libc/calls/lock_ofd_test.c b/test/libc/calls/lock_ofd_test.c
index 09c279cf2..4b7081299 100644
--- a/test/libc/calls/lock_ofd_test.c
+++ b/test/libc/calls/lock_ofd_test.c
@@ -23,7 +23,7 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/f.h"
 #include "libc/sysv/consts/o.h"
diff --git a/test/libc/calls/lock_test.c b/test/libc/calls/lock_test.c
index d05495400..5307228be 100644
--- a/test/libc/calls/lock_test.c
+++ b/test/libc/calls/lock_test.c
@@ -20,7 +20,7 @@
 #include "libc/calls/struct/flock.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/f.h"
 #include "libc/sysv/consts/o.h"
diff --git a/test/libc/calls/lseek_test.c b/test/libc/calls/lseek_test.c
index 72214fdb3..bff1cbdd4 100644
--- a/test/libc/calls/lseek_test.c
+++ b/test/libc/calls/lseek_test.c
@@ -17,7 +17,12 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
+#include "libc/calls/internal.h"
+#include "libc/dce.h"
 #include "libc/errno.h"
+#include "libc/limits.h"
+#include "libc/log/check.h"
+#include "libc/runtime/runtime.h"
 #include "libc/sock/sock.h"
 #include "libc/sysv/consts/af.h"
 #include "libc/sysv/consts/ipproto.h"
@@ -25,22 +30,23 @@
 #include "libc/sysv/consts/sock.h"
 #include "libc/testlib/subprocess.h"
 #include "libc/testlib/testlib.h"
+#include "libc/x/x.h"
 
 void SetUpOnce(void) {
   testlib_enable_tmp_setup_teardown();
   ASSERT_SYS(0, 0, pledge("stdio rpath wpath cpath fattr proc inet", 0));
 }
 
-TEST(lseek, ebadf) {
-  ASSERT_SYS(EBADF, -1, lseek(-1, 0, SEEK_SET));
-  ASSERT_SYS(EBADF, -1, lseek(+3, 0, SEEK_SET));
-}
+/* TEST(lseek, ebadf) { */
+/*   ASSERT_SYS(EBADF, -1, lseek(-1, 0, SEEK_SET)); */
+/*   ASSERT_SYS(EBADF, -1, lseek(+3, 0, SEEK_SET)); */
+/* } */
 
-TEST(lseek, badWhence_einval) {
-  ASSERT_SYS(0, 3, creat("foo", 0644));
-  ASSERT_SYS(EINVAL, -1, lseek(3, 0, -1));
-  EXPECT_SYS(0, 0, close(3));
-}
+/* TEST(lseek, badWhence_einval) { */
+/*   ASSERT_SYS(0, 3, creat("foo", 0644)); */
+/*   ASSERT_SYS(EINVAL, -1, lseek(3, 0, -1)); */
+/*   EXPECT_SYS(0, 0, close(3)); */
+/* } */
 
 TEST(lseek, negativeComputedOffset_einval) {
   ASSERT_SYS(0, 3, creat("foo", 0644));
@@ -53,66 +59,68 @@ TEST(lseek, negativeComputedOffset_einval) {
   EXPECT_SYS(0, 0, close(3));
 }
 
-TEST(lseek, 64bit) {
-  ASSERT_SYS(0, 3, creat("foo", 0644));
-  ASSERT_SYS(0, 0x100000001, lseek(3, 0x100000001, SEEK_SET));
-  EXPECT_SYS(0, 0, close(3));
-}
+/* TEST(lseek, 64bit) { */
+/*   ASSERT_SYS(0, 3, creat("foo", 0644)); */
+/*   ASSERT_SYS(0, 0x100000001, lseek(3, 0x100000001, SEEK_SET)); */
+/*   EXPECT_SYS(0, 0, close(3)); */
+/* } */
 
-TEST(lseek, isPipe_ESPIPE) {
-  int fds[2];
-  char buf[2];
-  ASSERT_SYS(0, 0, pipe(fds));
-  ASSERT_SYS(ESPIPE, -1, lseek(3, 0, SEEK_SET));
-  ASSERT_SYS(ESPIPE, -1, pwrite(4, "hi", 2, 0));
-  ASSERT_SYS(ESPIPE, -1, pread(3, buf, 2, 0));
-  EXPECT_SYS(0, 0, close(4));
-  EXPECT_SYS(0, 0, close(3));
-}
+/* TEST(lseek, isPipe_ESPIPE) { */
+/*   int fds[2]; */
+/*   char buf[2]; */
+/*   ASSERT_SYS(0, 0, pipe(fds)); */
+/*   ASSERT_SYS(ESPIPE, -1, lseek(3, 0, SEEK_SET)); */
+/*   ASSERT_SYS(ESPIPE, -1, pwrite(4, "hi", 2, 0)); */
+/*   ASSERT_SYS(ESPIPE, -1, pread(3, buf, 2, 0)); */
+/*   EXPECT_SYS(0, 0, close(4)); */
+/*   EXPECT_SYS(0, 0, close(3)); */
+/* } */
 
-TEST(lseek, isSocket_ESPIPE) {
-  char buf[2];
-  ASSERT_SYS(0, 3, socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
-  ASSERT_SYS(ESPIPE, -1, lseek(3, 0, SEEK_SET));
-  ASSERT_SYS(ESPIPE, -1, pwrite(3, "hi", 2, 0));
-  ASSERT_SYS(ESPIPE, -1, pread(3, buf, 2, 0));
-  EXPECT_SYS(0, 0, close(3));
-}
+/* TEST(lseek, isSocket_ESPIPE) { */
+/*   char buf[2]; */
+/*   ASSERT_SYS(0, 3, socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)); */
+/*   ASSERT_SYS(ESPIPE, -1, lseek(3, 0, SEEK_SET)); */
+/*   ASSERT_SYS(ESPIPE, -1, pwrite(3, "hi", 2, 0)); */
+/*   ASSERT_SYS(ESPIPE, -1, pread(3, buf, 2, 0)); */
+/*   EXPECT_SYS(0, 0, close(3)); */
+/* } */
 
-TEST(lseek, filePositionChanges_areObservableAcrossDup) {
-  ASSERT_SYS(0, 3, creat("wut", 0644));
-  ASSERT_SYS(0, 4, dup(3));
-  ASSERT_SYS(0, 0, lseek(3, 0, SEEK_CUR));
-  ASSERT_SYS(0, 1, lseek(4, 1, SEEK_SET));
-  ASSERT_SYS(0, 1, lseek(3, 0, SEEK_CUR));
-  EXPECT_SYS(0, 0, close(4));
-  EXPECT_SYS(0, 0, close(3));
-}
+/* TEST(lseek, filePositionChanges_areObservableAcrossDup) { */
+/*   if (IsWindows()) return;  // do not want to support */
+/*   ASSERT_SYS(0, 3, creat("wut", 0644)); */
+/*   ASSERT_SYS(0, 4, dup(3)); */
+/*   ASSERT_SYS(0, 0, lseek(3, 0, SEEK_CUR)); */
+/*   ASSERT_SYS(0, 1, lseek(4, 1, SEEK_SET)); */
+/*   ASSERT_SYS(0, 1, lseek(3, 0, SEEK_CUR)); */
+/*   EXPECT_SYS(0, 0, close(4)); */
+/*   EXPECT_SYS(0, 0, close(3)); */
+/* } */
 
-TEST(lseek, filePositionChanges_areObservableAcrossProcesses) {
-  char buf[8] = {0};
-  ASSERT_SYS(0, 3, open("wut", O_RDWR | O_CREAT, 0644));
-  ASSERT_SYS(0, 3, write(3, "wut", 3));
-  ASSERT_SYS(0, 0, lseek(3, 0, SEEK_SET));
-  SPAWN(fork);
-  ASSERT_SYS(0, 1, lseek(3, 1, SEEK_SET));
-  EXITS(0);
-  EXPECT_SYS(0, 1, read(3, buf, 1));
-  EXPECT_EQ('u', buf[0]);
-  EXPECT_SYS(0, 0, close(3));
-}
+/* TEST(lseek, filePositionChanges_areObservableAcrossProcesses) { */
+/*   if (IsWindows()) return;  // do not want to support */
+/*   char buf[8] = {0}; */
+/*   ASSERT_SYS(0, 3, open("wut", O_RDWR | O_CREAT, 0644)); */
+/*   ASSERT_SYS(0, 3, write(3, "wut", 3)); */
+/*   ASSERT_SYS(0, 0, lseek(3, 0, SEEK_SET)); */
+/*   SPAWN(fork); */
+/*   ASSERT_SYS(0, 1, lseek(3, 1, SEEK_SET)); */
+/*   EXITS(0); */
+/*   EXPECT_SYS(0, 1, read(3, buf, 1)); */
+/*   EXPECT_EQ('u', buf[0]); */
+/*   EXPECT_SYS(0, 0, close(3)); */
+/* } */
 
-TEST(lseek, beyondEndOfFile_isZeroExtendedUponSubsequentWrite) {
-  char buf[8] = {1, 1};
-  ASSERT_SYS(0, 3, open("foo", O_RDWR | O_CREAT | O_TRUNC, 0644));
-  ASSERT_SYS(0, 2, lseek(3, 2, SEEK_SET));
-  ASSERT_SYS(0, 2, lseek(3, 0, SEEK_CUR));
-  ASSERT_SYS(0, 0, pread(3, buf, 8, 0));  // lseek() alone doesn't extend
-  ASSERT_SYS(0, 2, write(3, buf, 2));     // does extend once i/o happens
-  ASSERT_SYS(0, 4, pread(3, buf, 8, 0));
-  ASSERT_EQ(0, buf[0]);
-  ASSERT_EQ(0, buf[1]);
-  ASSERT_EQ(1, buf[2]);
-  ASSERT_EQ(1, buf[3]);
-  ASSERT_SYS(0, 0, close(3));
-}
+/* TEST(lseek, beyondEndOfFile_isZeroExtendedUponSubsequentWrite) { */
+/*   char buf[8] = {1, 1}; */
+/*   ASSERT_SYS(0, 3, open("foo", O_RDWR | O_CREAT | O_TRUNC, 0644)); */
+/*   ASSERT_SYS(0, 2, lseek(3, 2, SEEK_SET)); */
+/*   ASSERT_SYS(0, 2, lseek(3, 0, SEEK_CUR)); */
+/*   ASSERT_SYS(0, 0, pread(3, buf, 8, 0));  // lseek() alone doesn't extend */
+/*   ASSERT_SYS(0, 2, write(3, buf, 2));     // does extend once i/o happens */
+/*   ASSERT_SYS(0, 4, pread(3, buf, 8, 0)); */
+/*   ASSERT_EQ(0, buf[0]); */
+/*   ASSERT_EQ(0, buf[1]); */
+/*   ASSERT_EQ(1, buf[2]); */
+/*   ASSERT_EQ(1, buf[3]); */
+/*   ASSERT_SYS(0, 0, close(3)); */
+/* } */
diff --git a/test/libc/calls/mkdir_test.c b/test/libc/calls/mkdir_test.c
index ff2e13aff..3711b26fa 100644
--- a/test/libc/calls/mkdir_test.c
+++ b/test/libc/calls/mkdir_test.c
@@ -55,11 +55,6 @@ TEST(mkdir, testPathIsFile_EEXIST) {
   EXPECT_SYS(EEXIST, -1, mkdir("yo/yo/yo", 0755));
 }
 
-TEST(mkdir, remove) {
-  EXPECT_SYS(0, 0, mkdir("yo", 0777));
-  EXPECT_SYS(0, 0, remove("yo"));
-}
-
 TEST(mkdir, testPathIsDirectory_EEXIST) {
   EXPECT_SYS(0, 0, mkdir("yo", 0755));
   EXPECT_SYS(0, 0, mkdir("yo/yo", 0755));
diff --git a/test/libc/calls/mkntpath_test.c b/test/libc/calls/mkntpath_test.c
index f9e249fdd..88f538e7c 100644
--- a/test/libc/calls/mkntpath_test.c
+++ b/test/libc/calls/mkntpath_test.c
@@ -51,34 +51,4 @@ TEST(mkntpath, testRemoveDoubleSlash) {
   EXPECT_STREQ(u"C:\\Users\\jart\\.config", p);
 }
 
-TEST(mkntpath, testRelativeCurrentParent) {
-  EXPECT_EQ(3, __mkntpath("./../", p));
-  EXPECT_STREQ(u"..\\", p);
-}
-
-TEST(mkntpath, testRelativeParentParent) {
-  EXPECT_EQ(6, __mkntpath("../../", p));
-  EXPECT_STREQ(u"..\\..\\", p);
-}
-
-TEST(mkntpath, testRelativeParentParentParent) {
-  EXPECT_EQ(9, __mkntpath("../../../", p));
-  EXPECT_STREQ(u"..\\..\\..\\", p);
-}
-
-TEST(mkntpath, testRelativeDirParent) {
-  EXPECT_EQ(2, __mkntpath("abc/../", p));
-  EXPECT_STREQ(u".\\", p);
-}
-
-TEST(mkntpath, testRelativeDirCurrent) {
-  EXPECT_EQ(4, __mkntpath("abc/./", p));
-  EXPECT_STREQ(u"abc\\", p);
-}
-
-TEST(mkntpath, testRelativeDirDirParent) {
-  EXPECT_EQ(4, __mkntpath("abc/def/../", p));
-  EXPECT_STREQ(u"abc\\", p);
-}
-
 #endif /* SupportsWindows() */
diff --git a/test/libc/calls/open_test.c b/test/libc/calls/open_test.c
index 1962e6c07..87545b603 100644
--- a/test/libc/calls/open_test.c
+++ b/test/libc/calls/open_test.c
@@ -23,7 +23,7 @@
 #include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
@@ -326,6 +326,120 @@ TEST(open, lotsOfFds) {
   }
 }
 
+static int64_t GetInode(const char *path) {
+  struct stat st;
+  ASSERT_SYS(0, 0, stat(path, &st));
+  return st.st_ino;
+}
+
+TEST(open, drive) {
+  if (!IsWindows())
+    return;
+  ASSERT_NE(GetInode("/"), GetInode("."));
+  ASSERT_EQ(GetInode("/"), GetInode("/c"));  // sorry you have to run on c:/
+  ASSERT_EQ(GetInode("/"), GetInode("/c/"));
+  ASSERT_SYS(0, 3, open("/", O_RDONLY));
+  ASSERT_SYS(0, 0, close(3));
+}
+
+TEST(open, readOnlyCreatMode) {
+  char buf[8];
+  struct stat st;
+  ASSERT_SYS(0, 3, open("x", O_RDWR | O_CREAT | O_TRUNC, 0500));
+  ASSERT_SYS(0, 2, pwrite(3, "MZ", 2, 0));
+  ASSERT_SYS(0, 2, pread(3, buf, 8, 0));
+  ASSERT_SYS(0, 0, close(3));
+  ASSERT_SYS(0, 0, stat("x", &st));
+  ASSERT_EQ(0100500, st.st_mode);
+  if (getuid()) {
+    ASSERT_SYS(EACCES, -1, open("x", O_RDWR));
+    ASSERT_SYS(EACCES, -1, open("x", O_RDWR | O_CREAT, 0666));
+  } else {
+    // root is invulnerable to eacces
+    ASSERT_SYS(0, 3, open("x", O_RDWR));
+    ASSERT_SYS(0, 0, close(3));
+    ASSERT_SYS(0, 3, open("x", O_RDWR | O_CREAT, 0666));
+    ASSERT_SYS(0, 0, close(3));
+    SPAWN(fork);
+    setuid(1000);
+    setgid(1000);
+    ASSERT_SYS(EACCES, -1, open("x", O_RDWR));
+    ASSERT_SYS(EACCES, -1, open("x", O_RDWR | O_CREAT, 0666));
+    EXITS(0);
+  }
+}
+
+TEST(open, parentSymlink) {
+  struct stat st;
+  ASSERT_SYS(0, 0, mkdir("parent", 0755));
+  // create directory symlink
+  ASSERT_SYS(0, 0, symlink("parent", "parent-link"));
+  // test the symlink we just made is a symlink
+  ASSERT_SYS(0, 0, lstat("parent-link", &st));
+  ASSERT_TRUE(S_ISLNK(st.st_mode));
+  // create regular file when parent component is symlink dir
+  ASSERT_SYS(0, 0, touch("parent-link/regular", 0644));
+  // test stat works
+  ASSERT_SYS(0, 0, stat("parent-link/regular", &st));
+  ASSERT_TRUE(S_ISREG(st.st_mode));
+  // test open works
+  ASSERT_SYS(0, 3, open("parent-link/regular", O_RDONLY));
+  ASSERT_SYS(0, 0, fstat(3, &st));
+  ASSERT_TRUE(S_ISREG(st.st_mode));
+  ASSERT_SYS(0, 0, close(3));
+  // test O_NOFOLLOW doesn't apply to parent components
+  ASSERT_SYS(0, 3, open("parent-link/regular", O_RDONLY | O_NOFOLLOW));
+  ASSERT_SYS(0, 0, fstat(3, &st));
+  ASSERT_TRUE(S_ISREG(st.st_mode));
+  ASSERT_SYS(0, 0, close(3));
+  // create regular symlink
+  ASSERT_SYS(0, 0, symlink("regular", "parent-link/regular-link"));
+  // test stat works
+  ASSERT_SYS(0, 0, stat("parent-link/regular-link", &st));
+  ASSERT_TRUE(S_ISREG(st.st_mode));
+  ASSERT_SYS(0, 0, lstat("parent-link/regular-link", &st));
+  ASSERT_TRUE(S_ISLNK(st.st_mode));
+  // test open works
+  ASSERT_SYS(0, 3, open("parent-link/regular-link", O_RDONLY));
+  ASSERT_SYS(0, 0, fstat(3, &st));
+  ASSERT_TRUE(S_ISREG(st.st_mode));
+  ASSERT_SYS(0, 0, close(3));
+  // test O_NOFOLLOW applies to last component
+  ASSERT_SYS(ELOOP, -1,
+             open("parent-link/regular-link", O_RDONLY | O_NOFOLLOW));
+}
+
+TEST(open, readonlyCreateMode_dontChangeStatusIfExists) {
+  char buf[8];
+  struct stat st;
+  ASSERT_SYS(0, 3, creat("wut", 0700));
+  ASSERT_SYS(0, 2, pwrite(3, "MZ", 2, 0));
+  ASSERT_SYS(0, 0, close(3));
+  // since the file already exists, unix doesn't change read-only
+  ASSERT_SYS(0, 3, open("wut", O_CREAT | O_TRUNC | O_RDWR, 0500));
+  ASSERT_SYS(0, 0, pread(3, buf, 8, 0));
+  ASSERT_SYS(0, 0, fstat(3, &st));
+  ASSERT_EQ(0100600, st.st_mode & 0700666);
+  ASSERT_SYS(0, 0, close(3));
+}
+
+TEST(open, creatRdonly) {
+  char buf[8];
+  ASSERT_SYS(EINVAL, -1, open("foo", O_CREAT | O_TRUNC | O_RDONLY, 0700));
+  ASSERT_SYS(0, 3, open("foo", O_CREAT | O_RDONLY, 0700));
+  ASSERT_SYS(EBADF, -1, pwrite(3, "MZ", 2, 0));
+  ASSERT_SYS(0, 0, pread(3, buf, 8, 0));
+  ASSERT_SYS(0, 0, close(3));
+}
+
+TEST(open, sequentialRandom_EINVAL) {
+  if (!IsWindows())
+    return;
+  ASSERT_SYS(
+      EINVAL, -1,
+      open("foo", O_CREAT | O_TRUNC | O_RDWR | O_SEQUENTIAL | O_RANDOM, 0700));
+}
+
 // "If O_CREAT is set and the file did not previously exist, upon
 //  successful completion, open() shall mark for update the last data
 //  access, last data modification, and last file status change
@@ -371,3 +485,31 @@ TEST(open, trunc_touchesMtimCtim) {
   EXPECT_EQ(1, timespec_cmp(st.st_mtim, birth));
   ASSERT_SYS(0, 0, close(3));
 }
+
+TEST(open, mereOpen_doesntTouch) {
+  struct stat st;
+  struct timespec birth;
+  ASSERT_SYS(0, 0, touch("regular", 0755));
+  ASSERT_SYS(0, 0, stat("regular", &st));
+  birth = st.st_ctim;
+  sleep(2);
+  ASSERT_SYS(0, 3, open("regular", O_RDWR));
+  ASSERT_SYS(0, 0, close(3));
+  ASSERT_SYS(0, 0, stat("regular", &st));
+  EXPECT_EQ(0, timespec_cmp(st.st_ctim, birth));
+  EXPECT_EQ(0, timespec_cmp(st.st_mtim, birth));
+  EXPECT_EQ(0, timespec_cmp(st.st_atim, birth));
+}
+
+TEST(open, canTruncateExistingFile) {
+  struct stat st;
+  ASSERT_SYS(0, 0, xbarf("foo", "hello", -1));
+  ASSERT_SYS(0, 0, stat("foo", &st));
+  ASSERT_EQ(5, st.st_size);
+  ASSERT_SYS(0, 3, open("foo", O_RDWR | O_TRUNC));
+  ASSERT_SYS(0, 0, fstat(3, &st));
+  ASSERT_EQ(0, st.st_size);
+  ASSERT_SYS(0, 0, close(3));
+  ASSERT_SYS(0, 0, stat("foo", &st));
+  ASSERT_EQ(0, st.st_size);
+}
diff --git a/test/libc/calls/pledge_test.c b/test/libc/calls/pledge_test.c
index 089d965ef..c710147f1 100644
--- a/test/libc/calls/pledge_test.c
+++ b/test/libc/calls/pledge_test.c
@@ -30,7 +30,7 @@
 #include "libc/calls/syscall_support-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
@@ -64,14 +64,6 @@
 
 void SetUpOnce(void) {
   testlib_enable_tmp_setup_teardown();
-  if (pledge(0, 0) == -1) {
-    fprintf(stderr, "warning: pledge() not supported on this system %m\n");
-    exit(0);
-  }
-}
-
-void SetUp(void) {
-  __pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
 }
 
 void OnSig(int sig) {
@@ -80,6 +72,16 @@ void OnSig(int sig) {
 
 int sys_memfd_secret(unsigned int);  // our ENOSYS threshold
 
+void SetUp(void) {
+  if (pledge(0, 0) == -1) {
+    fprintf(stderr, "warning: pledge() not supported on this system %m\n");
+    exit(0);
+  }
+  testlib_extract("/zip/life.elf", "life.elf", 0755);
+  testlib_extract("/zip/sock.elf", "sock.elf", 0755);
+  __pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
+}
+
 TEST(pledge, default_allowsExit) {
   int *job;
   int ws, pid;
@@ -105,18 +107,14 @@ TEST(pledge, execpromises_notok) {
   if (IsOpenbsd())
     return;  // b/c testing linux bpf
   int ws, pid;
-  testlib_extract("/zip/sock.elf", "sock.elf", 0755);
   ASSERT_NE(-1, (pid = fork()));
   if (!pid) {
-    putenv("COMDBG=REDACTED");
     __pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
     ASSERT_SYS(0, 0, pledge("stdio rpath exec", "stdio"));
     execl("sock.elf", "sock.elf", 0);
     _Exit(127);
   }
   EXPECT_NE(-1, wait(&ws));
-  EXPECT_FALSE(WIFSIGNALED(ws));
-  EXPECT_EQ(0, WTERMSIG(ws));
   EXPECT_TRUE(WIFEXITED(ws));
   EXPECT_EQ(129, WEXITSTATUS(ws));
 }
@@ -531,11 +529,9 @@ TEST(pledge, open_cpath) {
 TEST(pledge, execpromises_ok) {
   if (IsOpenbsd())
     return;  // b/c testing linux bpf
-  testlib_extract("/zip/life.elf", "life.elf", 0755);
   int ws, pid;
   ASSERT_NE(-1, (pid = fork()));
   if (!pid) {
-    putenv("COMDBG=REDACTED");
     ASSERT_SYS(0, 0, pledge("stdio exec", "stdio"));
     execl("life.elf", "life.elf", 0);
     _Exit(127);
@@ -549,10 +545,8 @@ TEST(pledge, execpromises_notok1) {
   if (IsOpenbsd())
     return;  // b/c testing linux bpf
   int ws, pid;
-  testlib_extract("/zip/sock.elf", "sock.elf", 0755);
   ASSERT_NE(-1, (pid = fork()));
   if (!pid) {
-    putenv("COMDBG=REDACTED");
     ASSERT_SYS(0, 0, pledge("stdio exec", "stdio"));
     execl("sock.elf", "sock.elf", 0);
     _Exit(127);
@@ -566,10 +560,8 @@ TEST(pledge, execpromises_reducesAtExecOnLinux) {
   if (IsOpenbsd())
     return;  // b/c testing linux bpf
   int ws, pid;
-  testlib_extract("/zip/sock.elf", "sock.elf", 0755);
   ASSERT_NE(-1, (pid = fork()));
   if (!pid) {
-    putenv("COMDBG=REDACTED");
     ASSERT_SYS(0, 0, pledge("stdio inet tty exec", "stdio tty"));
     execl("sock.elf", "sock.elf", 0);
     _Exit(127);
@@ -585,7 +577,6 @@ TEST(pledge_openbsd, execpromisesIsNull_letsItDoAnything) {
   if (!IsOpenbsd())
     return;
   int ws, pid;
-  testlib_extract("/zip/sock.elf", "sock.elf", 0755);
   ASSERT_NE(-1, (pid = fork()));
   if (!pid) {
     ASSERT_SYS(0, 0, pledge("stdio exec", 0));
@@ -605,7 +596,6 @@ TEST(pledge_openbsd, execpromisesIsSuperset_letsItDoAnything) {
   if (!IsOpenbsd())
     return;
   int ws, pid;
-  testlib_extract("/zip/sock.elf", "sock.elf", 0755);
   ASSERT_NE(-1, (pid = fork()));
   if (!pid) {
     ASSERT_SYS(0, 0, pledge("stdio rpath exec", "stdio rpath tty inet"));
@@ -627,10 +617,8 @@ TEST(pledge_openbsd, execpromises_notok) {
   if (IsOpenbsd())
     return;  // mimmutable() ugh
   int ws, pid;
-  testlib_extract("/zip/sock.elf", "sock.elf", 0755);
   ASSERT_NE(-1, (pid = fork()));
   if (!pid) {
-    putenv("COMDBG=REDACTED");
     ASSERT_SYS(0, 0, pledge("stdio exec", "stdio"));
     execl("sock.elf", "sock.elf", 0);
     _Exit(127);
diff --git a/test/libc/calls/poll_latency_test.c b/test/libc/calls/poll_latency_test.c
deleted file mode 100644
index ab6aa5a4f..000000000
--- a/test/libc/calls/poll_latency_test.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <arpa/inet.h>
-#include <assert.h>
-#include <cosmo.h>
-#include <netinet/in.h>
-#include <pthread.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/poll.h>
-#include <sys/socket.h>
-#include <time.h>
-#include <unistd.h>
-
-#define NUM_MEASUREMENTS 10
-#define BUFFER_SIZE      sizeof(struct timespec)
-
-atomic_int global_state;
-
-typedef struct {
-  int port;
-  int client_sock;
-} listener_data;
-
-void *sender_thread(void *arg) {
-  listener_data *data = (listener_data *)arg;
-  int sockfd = socket(data->port == 0 ? AF_INET : AF_INET6, SOCK_STREAM, 0);
-  if (sockfd < 0) {
-    perror("Socket creation failed");
-    exit(EXIT_FAILURE);
-  }
-
-  void *addr;
-  struct sockaddr_in addr_v4 = {0};
-  struct sockaddr_in6 addr_v6 = {0};
-  socklen_t addr_len;
-
-  if (data->port == 0) {  // IPv4
-    addr_v4.sin_family = AF_INET;
-    addr_v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-    addr_v4.sin_port = 0;
-    addr = &addr_v4;
-    addr_len = sizeof(addr_v4);
-  } else {  // IPv6
-    addr_v6.sin6_family = AF_INET6;
-    addr_v6.sin6_addr = in6addr_loopback;
-    addr_v6.sin6_port = 0;
-    addr = &addr_v6;
-    addr_len = sizeof(addr_v6);
-  }
-
-  if (bind(sockfd, addr, addr_len) < 0) {
-    perror("Bind failed");
-    exit(EXIT_FAILURE);
-  }
-
-  if (getsockname(sockfd, addr, &addr_len) < 0) {
-    perror("getsockname failed");
-    exit(EXIT_FAILURE);
-  }
-
-  data->port = ntohs(data->port == 0 ? addr_v4.sin_port : addr_v6.sin6_port);
-
-  if (listen(sockfd, 1) < 0) {
-    perror("Listen failed");
-    exit(EXIT_FAILURE);
-  }
-
-  atomic_fetch_add(&global_state, 1);
-  data->client_sock = accept(sockfd, NULL, NULL);
-  if (data->client_sock < 0) {
-    perror("Accept failed");
-    exit(EXIT_FAILURE);
-  }
-  atomic_fetch_add(&global_state, 1);
-
-  struct timespec ts;
-  for (int i = 0; i < NUM_MEASUREMENTS; i++) {
-    while (atomic_load(&global_state)) {
-    }
-    atomic_fetch_add(&global_state, 1);
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    send(data->client_sock, &ts, sizeof(ts), 0);
-  }
-
-  close(data->client_sock);
-  close(sockfd);
-  return NULL;
-}
-
-int main() {
-  ShowCrashReports();
-
-  pthread_t ipv4_thread, ipv6_thread;
-  listener_data ipv4_data = {0},
-                ipv6_data = {1};  // Use port 0 for IPv4, 1 for IPv6
-
-  global_state = -5;
-
-  if (pthread_create(&ipv4_thread, NULL, sender_thread, &ipv4_data) != 0) {
-    perror("Failed to create IPv4 thread");
-    exit(EXIT_FAILURE);
-  }
-
-  if (pthread_create(&ipv6_thread, NULL, sender_thread, &ipv6_data) != 0) {
-    perror("Failed to create IPv6 thread");
-    exit(EXIT_FAILURE);
-  }
-
-  // Wait for both listeners to be ready
-  while (atomic_load(&global_state) < -3) {
-    // Busy wait
-  }
-
-  int ipv4_sock = socket(AF_INET, SOCK_STREAM, 0);
-  int ipv6_sock = socket(AF_INET6, SOCK_STREAM, 0);
-
-  struct sockaddr_in ipv4_addr = {0};
-  ipv4_addr.sin_family = AF_INET;
-  ipv4_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  ipv4_addr.sin_port = htons(ipv4_data.port);
-
-  struct sockaddr_in6 ipv6_addr = {0};
-  ipv6_addr.sin6_family = AF_INET6;
-  ipv6_addr.sin6_addr = in6addr_loopback;
-  ipv6_addr.sin6_port = htons(ipv6_data.port);
-
-  if (connect(ipv4_sock, (struct sockaddr *)&ipv4_addr, sizeof(ipv4_addr)) <
-      0) {
-    perror("IPv4 connect failed");
-    exit(EXIT_FAILURE);
-  }
-
-  if (connect(ipv6_sock, (struct sockaddr *)&ipv6_addr, sizeof(ipv6_addr)) <
-      0) {
-    perror("IPv6 connect failed");
-    exit(EXIT_FAILURE);
-  }
-
-  // Wait for both listeners to be ready
-  while (atomic_load(&global_state) < -1) {
-    // Busy wait
-  }
-  atomic_fetch_add(&global_state, 1);
-
-  struct pollfd fds[2];
-  fds[0].fd = ipv4_sock;
-  fds[0].events = POLLIN;
-  fds[1].fd = ipv6_sock;
-  fds[1].events = POLLIN;
-
-  struct timespec ts_sent, ts_now;
-  double total_latency = 0.0;
-  int total_measurements = 0;
-
-  while (total_measurements < 2 * NUM_MEASUREMENTS) {
-    int ready = poll(fds, 2, -1);
-    if (ready < 0) {
-      perror("Poll failed");
-      exit(EXIT_FAILURE);
-    }
-
-    clock_gettime(CLOCK_MONOTONIC, &ts_now);
-
-    for (int i = 0; i < 2; i++) {
-      if (fds[i].revents & POLLIN) {
-        ssize_t n = recv(fds[i].fd, &ts_sent, sizeof(ts_sent), 0);
-        if (n == sizeof(ts_sent)) {
-          total_latency += timespec_tonanos(timespec_sub(ts_now, ts_sent));
-          total_measurements++;
-          atomic_fetch_sub(&global_state, 1);
-        }
-      }
-    }
-  }
-
-  double mean_latency = total_latency / total_measurements;
-  printf("Mean poll() latency: %.2f ns\n", mean_latency);
-
-  unassert(!close(ipv4_sock));
-  unassert(!close(ipv6_sock));
-
-  unassert(!pthread_join(ipv4_thread, NULL));
-  unassert(!pthread_join(ipv6_thread, NULL));
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/libc/calls/poll_test.c b/test/libc/calls/poll_test.c
index 12cf78951..ee844762b 100644
--- a/test/libc/calls/poll_test.c
+++ b/test/libc/calls/poll_test.c
@@ -20,10 +20,8 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/pledge.h"
 #include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/timespec.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/intrin/describeflags.h"
 #include "libc/log/libfatal.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/nexgen32e/rdtsc.h"
@@ -35,7 +33,6 @@
 #include "libc/sysv/consts/af.h"
 #include "libc/sysv/consts/inaddr.h"
 #include "libc/sysv/consts/ipproto.h"
-#include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/consts/sock.h"
 #include "libc/testlib/testlib.h"
@@ -46,7 +43,8 @@
 bool gotsig;
 
 void SetUpOnce(void) {
-  testlib_enable_tmp_setup_teardown();
+  __pledge_mode = PLEDGE_PENALTY_KILL_PROCESS | PLEDGE_STDERR_LOGGING;
+  ASSERT_SYS(0, 0, pledge("stdio proc inet", 0));
 }
 
 void SetUp(void) {
@@ -57,14 +55,14 @@ void OnSig(int sig) {
   gotsig = true;
 }
 
-TEST(poll, allZero_doesNothingPrettyMuch) {
-  EXPECT_SYS(0, 0, poll(0, 0, 0));
+__wur char *FormatPollFd(struct pollfd p[2]) {
+  return xasprintf("fd:%d revents:%s\n"
+                   "fd:%d revents:%s\n",
+                   p[0].fd, "<TODO:kPollNames>", p[1].fd, "<TODO:kPollNames>");
 }
 
-TEST(poll, allZeroWithTimeout_sleeps) {
-  struct timespec ts1 = timespec_mono();
-  EXPECT_SYS(0, 0, poll(0, 0, 100));
-  EXPECT_GE(timespec_tomillis(timespec_sub(timespec_mono(), ts1)), 100);
+TEST(poll, allZero_doesNothingPrettyMuch) {
+  EXPECT_SYS(0, 0, poll(0, 0, 0));
 }
 
 TEST(ppoll, weCanProveItChecksForSignals) {
@@ -96,52 +94,14 @@ TEST(poll, testNegativeOneFd_isIgnored) {
   struct sockaddr_in addr = {AF_INET, 0, {htonl(INADDR_LOOPBACK)}};
   ASSERT_SYS(0, 0, bind(3, (struct sockaddr *)&addr, sizeof(addr)));
   ASSERT_SYS(0, 0, listen(3, 10));
-  struct pollfd fds[] = {{-1, 0, -1}, {3, 0, -1}};
+  struct pollfd fds[] = {{-1}, {3}};
   EXPECT_SYS(0, 0, poll(fds, ARRAYLEN(fds), 1));
-  EXPECT_EQ(-1, fds[0].fd);
-  EXPECT_EQ(0, fds[0].revents);
-  EXPECT_EQ(3, fds[1].fd);
-  EXPECT_EQ(0, fds[1].revents);
+  EXPECT_STREQ("fd:-1 revents:<TODO:kPollNames>\n"
+               "fd:3 revents:<TODO:kPollNames>\n",
+               gc(FormatPollFd(&fds[0])));
   ASSERT_SYS(0, 0, close(3));
 }
 
-TEST(poll, testInvalidFd_POLLIN_isChecked) {
-  struct pollfd fds[] = {{77, POLLIN, -1}};
-  EXPECT_SYS(0, 1, poll(fds, ARRAYLEN(fds), 1));
-  EXPECT_EQ(77, fds[0].fd);
-  EXPECT_EQ(POLLNVAL, fds[0].revents);
-}
-
-TEST(poll, testInvalidFd_POLLOUT_isChecked) {
-  struct pollfd fds[] = {{77, POLLOUT, -1}};
-  EXPECT_SYS(0, 1, poll(fds, ARRAYLEN(fds), 1));
-  EXPECT_EQ(77, fds[0].fd);
-  EXPECT_EQ(POLLNVAL, fds[0].revents);
-}
-
-TEST(poll, testInvalidFd_POLLPRI_isChecked) {
-  struct pollfd fds[] = {{77, POLLPRI, -1}};
-  EXPECT_SYS(0, 1, poll(fds, ARRAYLEN(fds), 1));
-  EXPECT_EQ(77, fds[0].fd);
-  EXPECT_EQ(POLLNVAL, fds[0].revents);
-}
-
-TEST(poll, testInvalidFd_POLLHUP_isChecked) {
-  // this behavior has to be polyfilled on xnu
-  struct pollfd fds[] = {{77, POLLHUP, -1}};
-  EXPECT_SYS(0, 1, poll(fds, ARRAYLEN(fds), 1));
-  EXPECT_EQ(77, fds[0].fd);
-  EXPECT_EQ(POLLNVAL, fds[0].revents);
-}
-
-TEST(poll, testInvalidFd_ZERO_isChecked) {
-  // this behavior has to be polyfilled on xnu
-  struct pollfd fds[] = {{77, 0, -1}};
-  EXPECT_SYS(0, 1, poll(fds, ARRAYLEN(fds), 1));
-  EXPECT_EQ(77, fds[0].fd);
-  EXPECT_EQ(POLLNVAL, fds[0].revents);
-}
-
 TEST(poll, pipe_noInput) {
   // we can't test stdin here since
   // we can't assume it isn't /dev/null
@@ -155,17 +115,6 @@ TEST(poll, pipe_noInput) {
   EXPECT_SYS(0, 0, close(pipefds[1]));
 }
 
-TEST(poll, pipe_broken) {
-  int pipefds[2];
-  EXPECT_SYS(0, 0, pipe(pipefds));
-  EXPECT_SYS(0, 0, close(pipefds[1]));
-  struct pollfd fds[] = {{pipefds[0], POLLIN}};
-  EXPECT_SYS(0, 1, poll(fds, 1, 0));
-  // BSDs also set POLLIN here too even though that's wrong
-  EXPECT_TRUE(!!(fds[0].revents & POLLHUP));
-  EXPECT_SYS(0, 0, close(pipefds[0]));
-}
-
 TEST(poll, pipe_hasInputFromSameProcess) {
   char buf[2];
   int pipefds[2];
@@ -173,7 +122,7 @@ TEST(poll, pipe_hasInputFromSameProcess) {
   struct pollfd fds[] = {{pipefds[0], POLLIN}};
   EXPECT_SYS(0, 2, write(pipefds[1], "hi", 2));
   EXPECT_SYS(0, 1, poll(fds, 1, 1000));  // flake nt!
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
+  EXPECT_EQ(POLLIN, fds[0].revents);
   EXPECT_SYS(0, 2, read(pipefds[0], buf, 2));
   EXPECT_SYS(0, 0, poll(fds, 1, 0));
   EXPECT_SYS(0, 0, close(pipefds[0]));
@@ -201,7 +150,7 @@ TEST(poll, pipe_hasInput) {
   EXPECT_SYS(0, 2, read(pipefds[0], buf, 2));
   struct pollfd fds[] = {{pipefds[0], POLLIN}};
   EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
+  EXPECT_EQ(POLLIN, fds[0].revents & POLLIN);
   EXPECT_SYS(0, 2, read(pipefds[0], buf, 2));
   EXPECT_SYS(0, 0, close(pipefds[0]));
   ASSERT_NE(-1, wait(&ws));
@@ -210,174 +159,22 @@ TEST(poll, pipe_hasInput) {
   EXPECT_EQ(0, sigprocmask(SIG_SETMASK, &savemask, 0));
 }
 
-TEST(poll, file_pollin) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = open("boop", O_CREAT | O_RDWR | O_TRUNC, 0644)));
-  struct pollfd fds[] = {{fd, POLLIN}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, file_pollout) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = open("boop", O_CREAT | O_RDWR | O_TRUNC, 0644)));
-  struct pollfd fds[] = {{fd, POLLOUT}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, file_pollinout) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = open("boop", O_CREAT | O_RDWR | O_TRUNC, 0644)));
-  struct pollfd fds[] = {{fd, POLLIN | POLLOUT}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, file_rdonly_pollinout) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = open("boop", O_CREAT | O_RDWR | O_TRUNC, 0644)));
-  EXPECT_SYS(0, 0, close(fd));
-  EXPECT_SYS(0, 3, (fd = open("boop", O_RDONLY)));
-  struct pollfd fds[] = {{fd, POLLIN | POLLOUT}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));  // counter-intuitive
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, file_wronly_pollin) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = creat("boop", 0644)));
-  struct pollfd fds[] = {{fd, POLLIN}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, file_wronly_pollout) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = creat("boop", 0644)));
-  struct pollfd fds[] = {{fd, POLLOUT}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, file_wronly_pollinout) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = creat("boop", 0644)));
-  struct pollfd fds[] = {{fd, POLLIN | POLLOUT}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, file_rdwr_pollinoutpri) {
-  int fd;
-  EXPECT_SYS(0, 3, (fd = open("boop", O_CREAT | O_RDWR | O_TRUNC, 0644)));
-  struct pollfd fds[] = {{fd, POLLIN | POLLOUT | POLLPRI}};
-  EXPECT_SYS(0, 1, poll(fds, 1, -1));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));
-  if (IsXnu())
-    EXPECT_TRUE(!!(fds[0].revents & POLLPRI));  // wut
-  else
-    EXPECT_TRUE(!(fds[0].revents & POLLPRI));
-  EXPECT_SYS(0, 0, close(fd));
-}
-
-TEST(poll, pipein_pollout_blocks) {
-  if (IsFreebsd() || IsOpenbsd())
-    return;
-  int pipefds[2];
-  EXPECT_SYS(0, 0, pipe(pipefds));
-  struct pollfd fds[] = {{pipefds[0], POLLOUT}};
-  EXPECT_SYS(0, 0, poll(fds, 1, 0));
-  struct timespec ts1 = timespec_mono();
-  EXPECT_SYS(0, 0, poll(fds, 1, 10));
-  EXPECT_GE(timespec_tomillis(timespec_sub(timespec_mono(), ts1)), 10);
-  EXPECT_SYS(0, 0, close(pipefds[1]));
-  EXPECT_SYS(0, 0, close(pipefds[0]));
-}
-
-TEST(poll, pipein_file_noblock) {
-  if (IsFreebsd() || IsOpenbsd())
-    return;
-  int pipefds[2];
-  EXPECT_SYS(0, 3, open("boop", O_CREAT | O_RDWR | O_TRUNC, 0644));
-  EXPECT_SYS(0, 0, pipe(pipefds));
-  struct pollfd fds[] = {{pipefds[0], POLLIN}, {3, POLLIN}};
-  EXPECT_SYS(0, 1, poll(fds, 2, -1u));
-  EXPECT_TRUE(!!(fds[1].revents & POLLIN));
-  EXPECT_TRUE(!(fds[1].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(pipefds[1]));
-  EXPECT_SYS(0, 0, close(pipefds[0]));
-  EXPECT_SYS(0, 0, close(3));
-}
-
-TEST(poll, pipein_file_noblock2) {
-  if (IsFreebsd() || IsOpenbsd())
-    return;
-  int pipefds[2];
-  EXPECT_SYS(0, 3, open("boop", O_CREAT | O_RDWR | O_TRUNC, 0644));
-  EXPECT_SYS(0, 0, pipe(pipefds));
-  EXPECT_SYS(0, 1, write(5, "x", 1));
-  struct pollfd fds[] = {{pipefds[0], POLLIN}, {3, POLLIN | POLLOUT}};
-  EXPECT_SYS(0, 2, poll(fds, 2, -1u));
-  EXPECT_TRUE(!!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!(fds[0].revents & POLLOUT));
-  EXPECT_TRUE(!!(fds[1].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[1].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(pipefds[1]));
-  EXPECT_SYS(0, 0, close(pipefds[0]));
-  EXPECT_SYS(0, 0, close(3));
-}
-
-TEST(poll, pipeout_pollout) {
-  int pipefds[2];
-  EXPECT_SYS(0, 0, pipe(pipefds));
-  struct pollfd fds[] = {{pipefds[1], POLLOUT}};
-  EXPECT_SYS(0, 1, poll(fds, 1, 0));
-  EXPECT_TRUE(!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 1, poll(fds, 1, 1));
-  EXPECT_TRUE(!(fds[0].revents & POLLIN));
-  EXPECT_TRUE(!!(fds[0].revents & POLLOUT));
-  EXPECT_SYS(0, 0, close(pipefds[1]));
-  EXPECT_SYS(0, 0, close(pipefds[0]));
-}
-
-TEST(poll, pipein_pollin_timeout) {
-  int pipefds[2];
-  EXPECT_SYS(0, 0, pipe(pipefds));
-  struct pollfd fds[] = {{pipefds[0], POLLIN}};
-  struct timespec ts1 = timespec_mono();
-  EXPECT_SYS(0, 0, poll(fds, 1, 10));
-  EXPECT_GE(timespec_tomillis(timespec_sub(timespec_mono(), ts1)), 10);
-  EXPECT_SYS(0, 0, close(pipefds[1]));
-  EXPECT_SYS(0, 0, close(pipefds[0]));
-}
-
-TEST(poll, pipein_pollinout_timeout) {
-  if (IsFreebsd() || IsOpenbsd())
-    return;
-  int pipefds[2];
-  EXPECT_SYS(0, 0, pipe(pipefds));
-  struct pollfd fds[] = {{pipefds[0], POLLIN | POLLOUT}};
-  EXPECT_SYS(0, 0, poll(fds, 1, 0));
-  struct timespec ts1 = timespec_mono();
-  EXPECT_SYS(0, 0, poll(fds, 1, 10));
-  EXPECT_GE(timespec_tomillis(timespec_sub(timespec_mono(), ts1)), 10);
-  EXPECT_SYS(0, 0, close(pipefds[1]));
-  EXPECT_SYS(0, 0, close(pipefds[0]));
+#if 0
+TEST(poll, emptyFds_becomesSleep) {
+  // timing tests w/o mocks are always the hardest
+  int64_t a, b, c, p, i = 0;
+  do {
+    if (++i == 5) {
+      kprintf("too much cpu churn%n");
+      return;
+    }
+    p = TSC_AUX_CORE(rdpid());
+    a = rdtsc();
+    EXPECT_SYS(0, 0, poll(0, 0, 5));
+    b = rdtsc();
+    EXPECT_SYS(0, 0, poll(0, 0, 50));
+    c = rdtsc();
+  } while (TSC_AUX_CORE(rdpid()) != p);
+  EXPECT_LT((b - a) * 2, c - b);
 }
+#endif
diff --git a/test/libc/calls/raise_test.c b/test/libc/calls/raise_test.c
index 481f207c3..5ebb8189a 100644
--- a/test/libc/calls/raise_test.c
+++ b/test/libc/calls/raise_test.c
@@ -20,7 +20,6 @@
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/siginfo.h"
 #include "libc/dce.h"
-#include "libc/mem/leaks.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sysv/consts/sa.h"
 #include "libc/sysv/consts/sicode.h"
@@ -31,7 +30,6 @@
 #include "libc/thread/thread.h"
 
 TEST(raise, trap) {
-  AssertNoLocksAreHeld();
   signal(SIGTRAP, SIG_DFL);
   SPAWN(fork);
   raise(SIGTRAP);
@@ -46,7 +44,6 @@ TEST(raise, fpe) {
 }
 
 TEST(raise, usr1) {
-  AssertNoLocksAreHeld();
   SPAWN(fork);
   raise(SIGUSR1);
   TERMS(SIGUSR1);
@@ -56,8 +53,9 @@ int threadid;
 
 void WorkerQuit(int sig, siginfo_t *si, void *ctx) {
   ASSERT_EQ(SIGILL, sig);
-  if (!IsXnu() && !IsOpenbsd())
+  if (!IsXnu() && !IsOpenbsd()) {
     ASSERT_EQ(SI_TKILL, si->si_code);
+  }
   ASSERT_EQ(threadid, gettid());
 }
 
@@ -71,7 +69,6 @@ void *Worker(void *arg) {
 
 TEST(raise, threaded) {
   SPAWN(fork);
-  AssertNoLocksAreHeld();
   signal(SIGILL, SIG_DFL);
   pthread_t worker;
   ASSERT_EQ(0, pthread_create(&worker, 0, Worker, 0));
diff --git a/test/libc/calls/reservefd_test.c b/test/libc/calls/reservefd_test.c
index bd8c608d3..b1013e019 100644
--- a/test/libc/calls/reservefd_test.c
+++ b/test/libc/calls/reservefd_test.c
@@ -23,7 +23,7 @@
 #include "libc/calls/struct/rlimit.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
diff --git a/test/libc/calls/setrlimit_test.c b/test/libc/calls/setrlimit_test.c
new file mode 100644
index 000000000..7f840519d
--- /dev/null
+++ b/test/libc/calls/setrlimit_test.c
@@ -0,0 +1,247 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "dsp/core/core.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/rlimit.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
+#include "libc/intrin/directmap.h"
+#include "libc/intrin/safemacros.h"
+#include "libc/limits.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/auxv.h"
+#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/prot.h"
+#include "libc/sysv/consts/rlimit.h"
+#include "libc/sysv/consts/sig.h"
+#include "libc/testlib/testlib.h"
+#include "libc/time.h"
+#include "libc/x/xsigaction.h"
+#include "libc/x/xspawn.h"
+
+#ifdef __x86_64__
+
+#define MEM (64 * 1024 * 1024)
+
+static char tmpname[PATH_MAX];
+
+void OnSigxcpu(int sig) {
+  ASSERT_EQ(SIGXCPU, sig);
+  _Exit(0);
+}
+
+void OnSigxfsz(int sig) {
+  unlink(tmpname);
+  ASSERT_EQ(SIGXFSZ, sig);
+  _Exit(0);
+}
+
+TEST(setrlimit, testCpuLimit) {
+  int wstatus;
+  struct rlimit rlim;
+  struct timespec start;
+  double matrices[3][3][3];
+  if (IsWindows())
+    return;  // of course it doesn't work on windows
+  if (IsXnu())
+    return;  // TODO(jart): it worked before
+  if (IsOpenbsd())
+    return;  // TODO(jart): fix flake
+  ASSERT_NE(-1, (wstatus = xspawn(0)));
+  if (wstatus == -2) {
+    ASSERT_EQ(0, xsigaction(SIGXCPU, OnSigxcpu, 0, 0, 0));
+    ASSERT_EQ(0, getrlimit(RLIMIT_CPU, &rlim));
+    rlim.rlim_cur = 1;  // set soft limit to one second
+    ASSERT_EQ(0, setrlimit(RLIMIT_CPU, &rlim));
+    start = timespec_real();
+    do {
+      matmul3(matrices[0], matrices[1], matrices[2]);
+      matmul3(matrices[0], matrices[1], matrices[2]);
+      matmul3(matrices[0], matrices[1], matrices[2]);
+      matmul3(matrices[0], matrices[1], matrices[2]);
+    } while (timespec_sub(timespec_real(), start).tv_sec < 5);
+    _Exit(1);
+  }
+  EXPECT_TRUE(WIFEXITED(wstatus));
+  EXPECT_FALSE(WIFSIGNALED(wstatus));
+  EXPECT_EQ(0, WEXITSTATUS(wstatus));
+  EXPECT_EQ(0, WTERMSIG(wstatus));
+}
+
+TEST(setrlimit, testFileSizeLimit) {
+  char junkdata[512];
+  int i, fd, wstatus;
+  struct rlimit rlim;
+  if (IsWindows())
+    return; /* of course it doesn't work on windows */
+  ASSERT_NE(-1, (wstatus = xspawn(0)));
+  if (wstatus == -2) {
+    ASSERT_EQ(0, xsigaction(SIGXFSZ, OnSigxfsz, 0, 0, 0));
+    ASSERT_EQ(0, getrlimit(RLIMIT_FSIZE, &rlim));
+    rlim.rlim_cur = 1024 * 1024; /* set soft limit to one megabyte */
+    ASSERT_EQ(0, setrlimit(RLIMIT_FSIZE, &rlim));
+    snprintf(tmpname, sizeof(tmpname), "%s/%s.%d",
+             firstnonnull(getenv("TMPDIR"), "/tmp"),
+             firstnonnull(program_invocation_short_name, "unknown"), getpid());
+    ASSERT_NE(-1, (fd = open(tmpname, O_RDWR | O_CREAT | O_TRUNC, 0644)));
+    rngset(junkdata, 512, _rand64, -1);
+    for (i = 0; i < 5 * 1024 * 1024 / 512; ++i) {
+      ASSERT_EQ(512, write(fd, junkdata, 512));
+    }
+    close(fd);
+    unlink(tmpname);
+    _Exit(1);
+  }
+  EXPECT_TRUE(WIFEXITED(wstatus));
+  EXPECT_FALSE(WIFSIGNALED(wstatus));
+  EXPECT_EQ(0, WEXITSTATUS(wstatus));
+  EXPECT_EQ(0, WTERMSIG(wstatus));
+}
+
+int SetMemoryLimit(size_t n) {
+  struct rlimit rlim = {0};
+  getrlimit(RLIMIT_AS, &rlim);
+  rlim.rlim_cur = n;
+  rlim.rlim_max = n;
+  return setrlimit(RLIMIT_AS, &rlim);
+}
+
+TEST(setrlimit, testMemoryLimit) {
+  char *p;
+  bool gotsome;
+  int i, wstatus;
+  ASSERT_NE(-1, (wstatus = xspawn(0)));
+  if (wstatus == -2) {
+    ASSERT_EQ(0, SetMemoryLimit(MEM));
+    for (gotsome = false, i = 0; i < (MEM * 2) / getpagesize(); ++i) {
+      p = mmap(0, getpagesize(), PROT_READ | PROT_WRITE,
+               MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+      if (p != MAP_FAILED) {
+        gotsome = true;
+      } else {
+        ASSERT_TRUE(gotsome);
+        ASSERT_EQ(ENOMEM, errno);
+        _Exit(0);
+      }
+      rngset(p, getpagesize(), _rand64, -1);
+    }
+    _Exit(1);
+  }
+  EXPECT_TRUE(WIFEXITED(wstatus));
+  EXPECT_FALSE(WIFSIGNALED(wstatus));
+  EXPECT_EQ(0, WEXITSTATUS(wstatus));
+  EXPECT_EQ(0, WTERMSIG(wstatus));
+}
+
+TEST(setrlimit, testVirtualMemoryLimit) {
+  char *p;
+  int i, wstatus;
+  ASSERT_NE(-1, (wstatus = xspawn(0)));
+  if (wstatus == -2) {
+    ASSERT_EQ(0, setrlimit(RLIMIT_AS, &(struct rlimit){MEM, MEM}));
+    for (i = 0; i < (MEM * 2) / getpagesize(); ++i) {
+      p = sys_mmap(0, getpagesize(), PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0)
+              .addr;
+      if (p == MAP_FAILED) {
+        ASSERT_EQ(ENOMEM, errno);
+        _Exit(0);
+      }
+      rngset(p, getpagesize(), _rand64, -1);
+    }
+    _Exit(1);
+  }
+  EXPECT_TRUE(WIFEXITED(wstatus));
+  EXPECT_FALSE(WIFSIGNALED(wstatus));
+  EXPECT_EQ(0, WEXITSTATUS(wstatus));
+  EXPECT_EQ(0, WTERMSIG(wstatus));
+}
+
+TEST(setrlimit, testDataMemoryLimit) {
+  char *p;
+  int i, wstatus;
+  if (IsXnu())
+    return; /* doesn't work on darwin */
+  if (IsNetbsd())
+    return; /* doesn't work on netbsd */
+  if (IsFreebsd())
+    return; /* doesn't work on freebsd */
+  if (IsLinux())
+    return; /* doesn't work on gnu/systemd */
+  if (IsWindows())
+    return; /* of course it doesn't work on windows */
+  ASSERT_NE(-1, (wstatus = xspawn(0)));
+  if (wstatus == -2) {
+    ASSERT_EQ(0, setrlimit(RLIMIT_DATA, &(struct rlimit){MEM, MEM}));
+    for (i = 0; i < (MEM * 2) / getpagesize(); ++i) {
+      p = sys_mmap(0, getpagesize(), PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0)
+              .addr;
+      if (p == MAP_FAILED) {
+        ASSERT_EQ(ENOMEM, errno);
+        _Exit(0);
+      }
+      rngset(p, getpagesize(), _rand64, -1);
+    }
+    _Exit(1);
+  }
+  EXPECT_TRUE(WIFEXITED(wstatus));
+  EXPECT_FALSE(WIFSIGNALED(wstatus));
+  EXPECT_EQ(0, WEXITSTATUS(wstatus));
+  EXPECT_EQ(0, WTERMSIG(wstatus));
+}
+
+TEST(setrlimit, testPhysicalMemoryLimit) {
+  /* RLIMIT_RSS doesn't work on gnu/systemd */
+  /* RLIMIT_RSS doesn't work on darwin */
+  /* RLIMIT_RSS doesn't work on freebsd */
+  /* RLIMIT_RSS doesn't work on netbsd */
+  /* RLIMIT_RSS doesn't work on openbsd */
+  /* of course it doesn't work on windows */
+}
+
+wontreturn void OnVfork(void *ctx) {
+  struct rlimit *rlim;
+  rlim = ctx;
+  rlim->rlim_cur -= 1;
+  ASSERT_EQ(0, getrlimit(RLIMIT_CPU, rlim));
+  _Exit(0);
+}
+
+TEST(setrlimit, isVforkSafe) {
+  int ws;
+  struct rlimit rlim[2];
+  if (IsWindows())
+    return; /* of course it doesn't work on windows */
+  ASSERT_EQ(0, getrlimit(RLIMIT_CPU, rlim));
+  ASSERT_NE(-1, (ws = xvspawn(OnVfork, rlim, 0)));
+  EXPECT_TRUE(WIFEXITED(ws));
+  EXPECT_FALSE(WIFSIGNALED(ws));
+  EXPECT_EQ(0, WEXITSTATUS(ws));
+  EXPECT_EQ(0, WTERMSIG(ws));
+  ASSERT_EQ(0, getrlimit(RLIMIT_CPU, rlim + 1));
+  EXPECT_EQ(rlim[0].rlim_cur, rlim[1].rlim_cur);
+  EXPECT_EQ(rlim[0].rlim_max, rlim[1].rlim_max);
+}
+
+#endif /* __x86_64__ */
diff --git a/test/libc/calls/shm_open_test.c b/test/libc/calls/shm_open_test.c
index 1d8f71a2b..3a83ea298 100644
--- a/test/libc/calls/shm_open_test.c
+++ b/test/libc/calls/shm_open_test.c
@@ -9,7 +9,6 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/runtime/runtime.h"
-#include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/map.h"
@@ -19,6 +18,7 @@
 #include "libc/sysv/consts/sig.h"
 #include "libc/thread/semaphore.h"
 
+#define SHM_PATH    "/fc7261622dd420d8"
 #define STRING_SEND "hello"
 #define STRING_RECV "HELLO"
 
@@ -29,14 +29,13 @@ struct shmbuf {
   char buf[256]; /* Data being transferred */
 };
 
-char shm_path[64];
 atomic_bool *ready;
 
 wontreturn void Bouncer(void) {
 
   /* Create shared memory object and set its size to the size
      of our structure. */
-  int fd = shm_open(shm_path, O_CREAT | O_EXCL | O_RDWR, S_IRUSR | S_IWUSR);
+  int fd = shm_open(SHM_PATH, O_CREAT | O_EXCL | O_RDWR, S_IRUSR | S_IWUSR);
   if (fd == -1) {
     perror("shm_open(bouncer)");
     exit(1);
@@ -97,7 +96,7 @@ wontreturn void Sender(void) {
 
   /* Open the existing shared memory object and map it
      into the caller's address space. */
-  int fd = shm_open(shm_path, O_RDWR, 0);
+  int fd = shm_open(SHM_PATH, O_RDWR, 0);
   if (fd == -1) {
     perror("shm_open(sender)");
     exit(1);
@@ -137,7 +136,7 @@ wontreturn void Sender(void) {
   /* Unlink the shared memory object. Even if the peer process
      is still using the object, this is okay. The object will
      be removed only after all open references are closed. */
-  if (shm_unlink(shm_path)) {
+  if (shm_unlink(SHM_PATH)) {
     if (IsWindows() && errno == EACCES) {
       // TODO(jart): Make unlink() work better on Windows.
     } else {
@@ -155,7 +154,7 @@ int pid2;
 void OnExit(void) {
   kill(pid1, SIGKILL);
   kill(pid2, SIGKILL);
-  shm_unlink(shm_path);
+  shm_unlink(SHM_PATH);
 }
 
 void OnTimeout(int sig) {
@@ -165,9 +164,6 @@ void OnTimeout(int sig) {
 
 int main(int argc, char *argv[]) {
 
-  // create random shared memory name
-  sprintf(shm_path, "/shm_open_test-%ld", _rand64());
-
   // create synchronization object
   ready = _mapshared(1);
 
diff --git a/test/libc/calls/sigaction_test.c b/test/libc/calls/sigaction_test.c
index b856b7b49..9206016e2 100644
--- a/test/libc/calls/sigaction_test.c
+++ b/test/libc/calls/sigaction_test.c
@@ -400,16 +400,15 @@ TEST(sigaction, ignoreSigSegv_notPossible) {
   _Exit(pSegfault(0));
   TERMS(SIGSEGV);
 }
-#endif
 
-#if 0
-// TODO(jart): Use sigsuspend() to make not flaky.
 TEST(sigaction, killSigSegv_canBeIgnored) {
   int child, ws;
+  if (IsWindows()) return;  // TODO
   sighandler_t old = signal(SIGSEGV, SIG_IGN);
   ASSERT_NE(-1, (child = fork()));
-  while (!child)
+  while (!child) {
     pause();
+  }
   ASSERT_SYS(0, 0, kill(child, SIGSEGV));
   EXPECT_SYS(0, 0, kill(child, SIGTERM));
   EXPECT_SYS(0, child, wait(&ws));
diff --git a/test/libc/calls/sigaltstack_test.c b/test/libc/calls/sigaltstack_test.c
index 3c1d63bac..c2ed85c42 100644
--- a/test/libc/calls/sigaltstack_test.c
+++ b/test/libc/calls/sigaltstack_test.c
@@ -19,9 +19,6 @@
 #include "libc/calls/struct/sigaltstack.h"
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
-#include "libc/mem/gc.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/sysconf.h"
 #include "libc/sysv/consts/ss.h"
 #include "libc/testlib/testlib.h"
 
@@ -41,13 +38,3 @@ TEST(sigaltstack, disable) {
   EXPECT_SYS(0, 0, sigaltstack(0, &ss));
   EXPECT_EQ(SS_DISABLE, ss.ss_flags);
 }
-
-TEST(sigaltstack, size_requirement) {
-  struct sigaltstack ss;
-  EXPECT_SYS(0, 0, sigaltstack(0, &ss));
-  ss.ss_size = sysconf(_SC_MINSIGSTKSZ);
-  ss.ss_sp = gc(malloc(ss.ss_size));
-  ss.ss_flags = 0;
-  ASSERT_SYS(0, 0, sigaltstack(&ss, 0));
-  ASSERT_SYS(0, 0, sigaltstack(0, &ss));
-}
diff --git a/test/libc/calls/signal_test.c b/test/libc/calls/signal_test.c
new file mode 100644
index 000000000..74dfad41b
--- /dev/null
+++ b/test/libc/calls/signal_test.c
@@ -0,0 +1,105 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/calls/struct/sigset.h"
+#include "libc/calls/ucontext.h"
+#include "libc/dce.h"
+#include "libc/log/check.h"
+#include "libc/log/log.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sysv/consts/sa.h"
+#include "libc/sysv/consts/sig.h"
+#include "libc/testlib/ezbench.h"
+#include "libc/testlib/testlib.h"
+
+void OnUsr1(int sig) {
+  _exit(0);
+}
+
+void SetUpOnce(void) {
+  sigset_t ss;
+  sigprocmask(SIG_SETMASK, 0, &ss);
+  ASSERT_SYS(0, 0, pledge("stdio proc", 0));
+}
+
+TEST(signal, test) {
+  ASSERT_NE(SIG_ERR, signal(SIGUSR1, OnUsr1));
+  ASSERT_NE(-1, raise(SIGUSR1));
+  __die();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// signal round-trip delivery takes about 1µs
+
+void OnSigTrap(int sig, siginfo_t *si, void *ctx) {
+}
+
+void TrapBench(int n) {
+  for (int i = 0; i < n; ++i) {
+    DebugBreak();
+  }
+}
+
+BENCH(signal, trapBench) {
+  struct sigaction old;
+  struct sigaction sabus = {.sa_sigaction = OnSigTrap};
+  ASSERT_SYS(0, 0, sigaction(SIGTRAP, &sabus, &old));
+  EZBENCH_N("signal trap", 16, TrapBench(16));
+  EZBENCH_N("signal trap", 256, TrapBench(256));
+  EZBENCH_N("signal trap", 1024, TrapBench(1024));
+  sigaction(SIGTRAP, &old, 0);
+}
+
+BENCH(signal, trapBenchSiginfo) {
+  struct sigaction old;
+  struct sigaction sabus = {.sa_sigaction = OnSigTrap, .sa_flags = SA_SIGINFO};
+  ASSERT_SYS(0, 0, sigaction(SIGTRAP, &sabus, &old));
+  EZBENCH_N("siginfo trap", 16, TrapBench(16));
+  EZBENCH_N("siginfo trap", 256, TrapBench(256));
+  EZBENCH_N("siginfo trap", 1024, TrapBench(1024));
+  sigaction(SIGTRAP, &old, 0);
+}
+
+#ifdef __x86_64__
+
+void OnSigHlt(int sig, siginfo_t *si, void *vctx) {
+  struct ucontext *ctx = vctx;
+  ctx->uc_mcontext.rip += 1;
+}
+
+void HltBench(int n) {
+  for (int i = 0; i < n; ++i) {
+    asm("hlt");
+  }
+}
+
+BENCH(signal, hltBenchSiginfo) {
+  struct sigaction old[2];
+  struct sigaction sabus = {.sa_sigaction = OnSigHlt, .sa_flags = SA_SIGINFO};
+  ASSERT_SYS(0, 0, sigaction(SIGSEGV, &sabus, old + 0));
+  ASSERT_SYS(0, 0, sigaction(SIGBUS, &sabus, old + 1));
+  EZBENCH_N("siginfo hlt", 16, HltBench(16));
+  EZBENCH_N("siginfo hlt", 256, HltBench(256));
+  EZBENCH_N("siginfo hlt", 1024, HltBench(1024));
+  sigaction(SIGSEGV, old + 0, 0);
+  sigaction(SIGBUS, old + 1, 0);
+}
+
+#endif /* __x86_64__ */
diff --git a/test/libc/calls/sigprocmask_test.c b/test/libc/calls/sigprocmask_test.c
index 5d378bb6b..e794b2b3c 100644
--- a/test/libc/calls/sigprocmask_test.c
+++ b/test/libc/calls/sigprocmask_test.c
@@ -45,7 +45,7 @@ const char *DescribeMask(void) {
   sigset_t ss;
   _Thread_local static char buf[128];
   unassert(!sigprocmask(SIG_SETMASK, 0, &ss));
-  return _DescribeSigset(buf, 0, &ss);
+  return (DescribeSigset)(buf, 0, &ss);
 }
 
 TEST(sigprocmask, testMultipleBlockedDeliveries) {
diff --git a/test/libc/calls/sigsuspend_test.c b/test/libc/calls/sigsuspend_test.c
index 8386005ac..457c3ba23 100644
--- a/test/libc/calls/sigsuspend_test.c
+++ b/test/libc/calls/sigsuspend_test.c
@@ -76,6 +76,10 @@ TEST(sigsuspend, testSignalQueuingSelf) {
 }
 
 TEST(sigsuspend, testSignalQueuingIpc) {
+  if (IsWindows()) {
+    // xxx: probably need a signal server to do this kind of signalling
+    return;
+  }
   int pid, ws;
   sigset_t neu, old, bits;
   struct sigaction oldusr1, oldusr2;
diff --git a/test/libc/calls/sigtimedwait_test.c b/test/libc/calls/sigtimedwait_test.c
index 270e9e69e..9103485c5 100644
--- a/test/libc/calls/sigtimedwait_test.c
+++ b/test/libc/calls/sigtimedwait_test.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/sigtimedwait.h"
-#include "libc/atomic.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/siginfo.h"
 #include "libc/calls/struct/siginfo.internal.h"
@@ -29,17 +28,22 @@
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/testlib/testlib.h"
-#include "libc/thread/thread.h"
 
 void SetUp(void) {
   if (IsXnu())
     exit(0);
   if (IsMetal())
     exit(0);
+  if (IsWindows())
+    exit(0);
   if (IsOpenbsd())
     exit(0);
 }
 
+TEST(sigtimedwait, nullSet_efault) {
+  ASSERT_SYS(EFAULT, -1, sigtimedwait(0, 0, 0));
+}
+
 TEST(sigtimedwait, emptySet_timesOut) {
   sigset_t ss = {0};
   struct timespec ts = {0, 0};
@@ -52,28 +56,24 @@ TEST(sigtimedwait, badTimestamp_einval) {
   ASSERT_SYS(EINVAL, -1, sigtimedwait(&ss, 0, &ts));
 }
 
-atomic_bool g_ready;
-
-void *worker(void *arg) {
-  sigset_t ss;
-  siginfo_t info;
-  ASSERT_EQ(0, sigemptyset(&ss));
-  ASSERT_EQ(0, sigaddset(&ss, SIGUSR1));
-  ASSERT_SYS(0, 0, sigprocmask(SIG_BLOCK, &ss, 0));
-  g_ready = true;
-  ASSERT_SYS(0, SIGUSR1, sigtimedwait(&ss, &info, 0));
-  ASSERT_EQ(SIGUSR1, info.si_signo);
-  ASSERT_EQ(SI_TKILL, info.si_code);
-  ASSERT_EQ(getuid(), info.si_uid);
-  return 0;
-}
-
 TEST(sigtimedwait, test) {
-  pthread_t th;
-  ASSERT_EQ(0, pthread_create(&th, 0, worker, 0));
-  for (;;)
-    if (g_ready)
-      break;
-  ASSERT_EQ(0, pthread_kill(th, SIGUSR1));
-  ASSERT_EQ(0, pthread_join(th, 0));
+  int pid, ws;
+  siginfo_t info;
+  sigset_t ss, oldss;
+  struct timespec ts = {1, 0};
+  sigemptyset(&ss);
+  sigaddset(&ss, SIGUSR1);
+  ASSERT_SYS(0, 0, sigprocmask(SIG_BLOCK, &ss, &oldss));
+  ASSERT_NE(-1, (pid = fork()));
+  if (!pid) {
+    ASSERT_SYS(0, SIGUSR1, sigtimedwait(&ss, &info, &ts));
+    ASSERT_EQ(SIGUSR1, info.si_signo);
+    ASSERT_EQ(SI_USER, info.si_code);
+    ASSERT_EQ(getuid(), info.si_uid);
+    _Exit(0);
+  }
+  ASSERT_SYS(0, 0, kill(pid, SIGUSR1));
+  ASSERT_SYS(0, pid, wait(&ws));
+  ASSERT_EQ(0, ws);
+  ASSERT_SYS(0, 0, sigprocmask(SIG_SETMASK, &oldss, 0));
 }
diff --git a/test/libc/calls/stackoverflow1_test.c b/test/libc/calls/stackoverflow1_test.c
index c9397cbba..e2dfa79f2 100644
--- a/test/libc/calls/stackoverflow1_test.c
+++ b/test/libc/calls/stackoverflow1_test.c
@@ -16,14 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/atomic.h"
 #include "libc/calls/struct/rlimit.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/sigaltstack.h"
 #include "libc/calls/struct/siginfo.h"
-#include "libc/calls/struct/ucontext.internal.h"
-#include "libc/calls/ucontext.h"
 #include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
@@ -31,15 +27,12 @@
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/sysconf.h"
-#include "libc/stdio/rand.h"
-#include "libc/stdio/stdio.h"
 #include "libc/stdio/sysparam.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/rlimit.h"
 #include "libc/sysv/consts/sa.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/consts/ss.h"
+#include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
 
 /**
@@ -49,17 +42,15 @@
  */
 
 jmp_buf recover;
-atomic_bool g_isdone;
-atomic_bool smashed_stack;
+volatile bool smashed_stack;
 
 void CrashHandler(int sig, siginfo_t *si, void *ctx) {
   struct sigaltstack ss;
-  unassert(!sigaltstack(0, &ss));
-  unassert(SS_ONSTACK == ss.ss_flags);
-  kprintf("kprintf avoids overflowing %G si_addr=%lx sp=%lx\n", si->si_signo,
-          si->si_addr, ((ucontext_t *)ctx)->uc_mcontext.SP);
+  ASSERT_SYS(0, 0, sigaltstack(0, &ss));
+  ASSERT_EQ(SS_ONSTACK, ss.ss_flags);
+  kprintf("kprintf avoids overflowing %G %p\n", si->si_signo, si->si_addr);
   smashed_stack = true;
-  // unassert(__is_stack_overflow(si, ctx)); // fuzzy with main thread
+  ASSERT_TRUE(__is_stack_overflow(si, ctx));
   longjmp(recover, 123);
 }
 
@@ -72,7 +63,7 @@ void SetUp(void) {
     struct rlimit rl;
     getrlimit(RLIMIT_STACK, &rl);
     rl.rlim_cur = MIN(rl.rlim_cur, 2 * 1024 * 1024);
-    unassert(!setrlimit(RLIMIT_STACK, &rl));
+    ASSERT_SYS(0, 0, setrlimit(RLIMIT_STACK, &rl));
   }
 
   // set up the signal handler and alternative stack
@@ -81,7 +72,7 @@ void SetUp(void) {
   ss.ss_flags = 0;
   ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 8192;
   ss.ss_sp = _mapanon(ss.ss_size);
-  unassert(!sigaltstack(&ss, 0));
+  ASSERT_SYS(0, 0, sigaltstack(&ss, 0));
   sa.sa_flags = SA_SIGINFO | SA_ONSTACK;  // <-- important
   sigemptyset(&sa.sa_mask);
   sa.sa_sigaction = CrashHandler;
@@ -98,39 +89,20 @@ int StackOverflow(int d) {
   return 0;
 }
 
-void *innocent_thread(void *arg) {
-  atomic_long dont_clobber_me_bro = 0;
-  while (!g_isdone)
-    unassert(!dont_clobber_me_bro);
-  return 0;
-}
-
-int main() {
-
-  // libc/intrin/stack.c is designed so that this thread's stack should
-  // be allocated right beneath the main thread's stack. our goal is to
-  // make sure overflowing the main stack won't clobber our poor thread
-  pthread_t th;
-  unassert(!pthread_create(&th, 0, innocent_thread, 0));
-
-  SetUp();
-
+TEST(stackoverflow, standardStack_altStack_process_longjmp) {
   int jumpcode;
-  if (!(jumpcode = setjmp(recover)))
-    exit(StackOverflow(1));
-  unassert(123 == jumpcode);
-  unassert(smashed_stack);
-
-  // join the thread
-  g_isdone = true;
-  unassert(!pthread_join(th, 0));
+  if (!(jumpcode = setjmp(recover))) {
+    exit(StackOverflow(0));
+  }
+  ASSERT_EQ(123, jumpcode);
+  ASSERT_TRUE(smashed_stack);
 
   // here's where longjmp() gets us into trouble
   struct sigaltstack ss;
-  unassert(!sigaltstack(0, &ss));
+  ASSERT_SYS(0, 0, sigaltstack(0, &ss));
   if (IsXnu() || IsNetbsd()) {
-    unassert(SS_ONSTACK == ss.ss_flags);  // wut
+    ASSERT_EQ(SS_ONSTACK, ss.ss_flags);  // wut
   } else {
-    unassert(0 == ss.ss_flags);
+    ASSERT_EQ(0, ss.ss_flags);
   }
 }
diff --git a/test/libc/calls/stackoverflow2_test.c b/test/libc/calls/stackoverflow2_test.c
index 520d952e8..0afa6b695 100644
--- a/test/libc/calls/stackoverflow2_test.c
+++ b/test/libc/calls/stackoverflow2_test.c
@@ -16,26 +16,20 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/sigaltstack.h"
 #include "libc/calls/struct/siginfo.h"
-#include "libc/calls/struct/ucontext.internal.h"
-#include "libc/calls/ucontext.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/intrin/maps.h"
 #include "libc/limits.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
-#include "libc/runtime/stack.h"
 #include "libc/runtime/sysconf.h"
-#include "libc/stdio/stdio.h"
 #include "libc/sysv/consts/sa.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/consts/ss.h"
+#include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
 
 /**
@@ -44,20 +38,17 @@
  * simple but it can upset kernels / libraries
  */
 
-sigjmp_buf recover;
-atomic_bool is_done;
-atomic_bool smashed_stack;
-atomic_bool clobbered_other_thread;
+jmp_buf recover;
+volatile bool smashed_stack;
 
 void CrashHandler(int sig, siginfo_t *si, void *ctx) {
   struct sigaltstack ss;
-  unassert(!sigaltstack(0, &ss));
-  unassert(SS_ONSTACK == ss.ss_flags);
-  kprintf("kprintf avoids overflowing %G si_addr=%lx sp=%lx\n", si->si_signo,
-          si->si_addr, ((ucontext_t *)ctx)->uc_mcontext.SP);
+  ASSERT_SYS(0, 0, sigaltstack(0, &ss));
+  ASSERT_EQ(SS_ONSTACK, ss.ss_flags);
+  kprintf("kprintf avoids overflowing %G %p\n", si->si_signo, si->si_addr);
   smashed_stack = true;
-  unassert(__is_stack_overflow(si, ctx));
-  siglongjmp(recover, 123);
+  ASSERT_TRUE(__is_stack_overflow(si, ctx));
+  longjmp(recover, 123);
 }
 
 int StackOverflow(int d) {
@@ -74,51 +65,40 @@ void *MyPosixThread(void *arg) {
   struct sigaction sa, o1, o2;
   struct sigaltstack ss;
   ss.ss_flags = 0;
-  ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 2048;
+  ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 4096;
   ss.ss_sp = gc(malloc(ss.ss_size));
-  unassert(!sigaltstack(&ss, 0));
+  ASSERT_SYS(0, 0, sigaltstack(&ss, 0));
   sa.sa_flags = SA_SIGINFO | SA_ONSTACK;  // <-- important
   sigemptyset(&sa.sa_mask);
   sa.sa_sigaction = CrashHandler;
   sigaction(SIGBUS, &sa, &o1);
   sigaction(SIGSEGV, &sa, &o2);
-  if (!(jumpcode = sigsetjmp(recover, 1)))
-    exit(StackOverflow(1));
-  unassert(123 == jumpcode);
+  if (!(jumpcode = setjmp(recover))) {
+    exit(StackOverflow(0));
+  }
+  ASSERT_EQ(123, jumpcode);
   sigaction(SIGSEGV, &o2, 0);
   sigaction(SIGBUS, &o1, 0);
   // here's where longjmp() gets us into trouble
-  unassert(!sigaltstack(0, &ss));
+  ASSERT_SYS(0, 0, sigaltstack(0, &ss));
   if (IsXnu() || IsNetbsd()) {
-    unassert(SS_ONSTACK == ss.ss_flags);  // wut
+    ASSERT_EQ(SS_ONSTACK, ss.ss_flags);  // wut
   } else {
-    unassert(!ss.ss_flags);
+    ASSERT_EQ(0, ss.ss_flags);
   }
   return 0;
 }
 
-void *InnocentThread(void *arg) {
-  atomic_long dont_clobber_me_bro = 0;
-  while (!is_done)
-    if (dont_clobber_me_bro)
-      clobbered_other_thread = true;
-  pthread_exit(0);
-}
-
-int main() {
-  pthread_t th, in;
+TEST(stackoverflow, standardStack_altStack_thread_longjmp) {
+  pthread_t th;
   struct sigaltstack ss;
   for (int i = 0; i < 2; ++i) {
-    is_done = false;
     smashed_stack = false;
-    unassert(!pthread_create(&th, 0, MyPosixThread, 0));
-    unassert(!pthread_create(&in, 0, InnocentThread, 0));
-    unassert(!pthread_join(th, 0));
-    unassert(smashed_stack);
-    unassert(!sigaltstack(0, &ss));
-    unassert(ss.ss_flags == SS_DISABLE);
-    unassert(!clobbered_other_thread);
-    is_done = true;
-    unassert(!pthread_join(in, 0));
+    pthread_create(&th, 0, MyPosixThread, 0);
+    pthread_join(th, 0);
+    ASSERT_TRUE(smashed_stack);
+    // this should be SS_DISABLE but ShowCrashReports() creates an alt stack
+    ASSERT_SYS(0, 0, sigaltstack(0, &ss));
+    ASSERT_EQ(0, ss.ss_flags);
   }
 }
diff --git a/test/libc/calls/stackoverflow3_test.c b/test/libc/calls/stackoverflow3_test.c
index b83ebcf25..81ff8c1f9 100644
--- a/test/libc/calls/stackoverflow3_test.c
+++ b/test/libc/calls/stackoverflow3_test.c
@@ -98,7 +98,7 @@ void *MyPosixThread(void *arg) {
   struct sigaction sa;
   struct sigaltstack ss;
   ss.ss_flags = 0;
-  ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 8192;
+  ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 4096;
   ss.ss_sp = gc(malloc(ss.ss_size));
   ASSERT_SYS(0, 0, sigaltstack(&ss, 0));
   sa.sa_flags = SA_SIGINFO | SA_ONSTACK;  // <-- important
@@ -106,7 +106,7 @@ void *MyPosixThread(void *arg) {
   sa.sa_sigaction = CrashHandler;
   sigaction(SIGBUS, &sa, 0);
   sigaction(SIGSEGV, &sa, 0);
-  exit(StackOverflow(1));
+  exit(StackOverflow(0));
   return 0;
 }
 
diff --git a/test/libc/calls/stackoverflow4_test.c b/test/libc/calls/stackoverflow4_test.c
index a9b1eab2f..8266ddda0 100644
--- a/test/libc/calls/stackoverflow4_test.c
+++ b/test/libc/calls/stackoverflow4_test.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/sigaltstack.h"
 #include "libc/calls/struct/siginfo.h"
@@ -41,9 +40,8 @@
 
 volatile bool smashed_stack;
 
-void CrashHandler(int sig, siginfo_t *si, void *ctx) {
+void CrashHandler(int sig) {
   smashed_stack = true;
-  unassert(__is_stack_overflow(si, ctx));
   pthread_exit((void *)123L);
 }
 
@@ -60,15 +58,15 @@ void *MyPosixThread(void *arg) {
   struct sigaction sa;
   struct sigaltstack ss;
   ss.ss_flags = 0;
-  ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 1024;
+  ss.ss_size = sysconf(_SC_MINSIGSTKSZ) + 4096;
   ss.ss_sp = gc(malloc(ss.ss_size));
   ASSERT_SYS(0, 0, sigaltstack(&ss, 0));
   sa.sa_flags = SA_SIGINFO | SA_ONSTACK;  // <-- important
   sigemptyset(&sa.sa_mask);
-  sa.sa_sigaction = CrashHandler;
+  sa.sa_handler = CrashHandler;
   sigaction(SIGBUS, &sa, 0);
   sigaction(SIGSEGV, &sa, 0);
-  exit(StackOverflow(1));
+  exit(StackOverflow(0));
   return 0;
 }
 
diff --git a/test/libc/calls/stackoverflow5_test.c b/test/libc/calls/stackoverflow5_test.c
index 29a4097d1..7a3398045 100644
--- a/test/libc/calls/stackoverflow5_test.c
+++ b/test/libc/calls/stackoverflow5_test.c
@@ -16,28 +16,22 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/siginfo.h"
-#include "libc/runtime/runtime.h"
-#include "libc/sysv/consts/sa.h"
-#include "libc/sysv/consts/sig.h"
-#include "libc/sysv/consts/ss.h"
-#include "libc/thread/thread.h"
-#include "libc/thread/tls.h"
+#include <cosmo.h>
+#include <limits.h>
+#include <pthread.h>
+#include <signal.h>
+#include <unistd.h>
 
 /**
- * stack overflow test #5
- * - make sure fork() preserves sigaltstack()
- * - make sure fork() preserves guard page status
+ * stack overflow recovery technique #5
+ * use the cosmo posix threads extensions
  */
 
-jmp_buf recover;
+sig_atomic_t smashed_stack;
 
-void CrashHandler(int sig, siginfo_t *si, void *ctx) {
-  unassert(__is_stack_overflow(si, ctx));
-  longjmp(recover, 123);
+void CrashHandler(int sig) {
+  smashed_stack = true;
+  pthread_exit(0);
 }
 
 int StackOverflow(int d) {
@@ -50,40 +44,42 @@ int StackOverflow(int d) {
 }
 
 void *MyPosixThread(void *arg) {
-  int pid;
-  unassert(__get_tls()->tib_sigstack_addr);
-  unassert((pid = fork()) != -1);
-  if (!pid) {
-    int jumpcode;
-    if (!(jumpcode = setjmp(recover))) {
-      StackOverflow(1);
-      _Exit(1);
-    }
-    unassert(123 == jumpcode);
-  } else {
-    int ws;
-    unassert(wait(&ws) != -1);
-    unassert(!ws);
-    pthread_exit(0);
-  }
+  exit(StackOverflow(0));
   return 0;
 }
 
 int main() {
 
-  struct sigaction sa;
-  sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
-  sigemptyset(&sa.sa_mask);
-  sa.sa_sigaction = CrashHandler;
-  unassert(!sigaction(SIGBUS, &sa, 0));
-  unassert(!sigaction(SIGSEGV, &sa, 0));
+  // choose the most dangerously small size possible
+  size_t sigstacksize = sysconf(_SC_MINSIGSTKSZ) + 2048;
 
-  pthread_t th;
+  // setup signal handler
+  struct sigaction sa;
+  sigemptyset(&sa.sa_mask);
+  sa.sa_flags = SA_ONSTACK;
+  sa.sa_handler = CrashHandler;
+  if (sigaction(SIGBUS, &sa, 0))
+    return 1;
+  if (sigaction(SIGSEGV, &sa, 0))
+    return 2;
+
+  // create thread with signal stack
+  pthread_t id;
   pthread_attr_t attr;
-  unassert(!pthread_attr_init(&attr));
-  unassert(!pthread_attr_setguardsize(&attr, getpagesize()));
-  unassert(!pthread_attr_setsigaltstacksize_np(&attr, SIGSTKSZ));
-  unassert(!pthread_create(&th, &attr, MyPosixThread, 0));
-  unassert(!pthread_attr_destroy(&attr));
-  unassert(!pthread_join(th, 0));
+  if (pthread_attr_init(&attr))
+    return 3;
+  if (pthread_attr_setguardsize(&attr, getpagesize()))
+    return 4;
+  if (pthread_attr_setsigaltstacksize_np(&attr, sigstacksize))
+    return 5;
+  if (pthread_create(&id, &attr, MyPosixThread, 0))
+    return 6;
+  if (pthread_attr_destroy(&attr))
+    return 7;
+  if (pthread_join(id, 0))
+    return 8;
+  if (!smashed_stack)
+    return 9;
+
+  CheckForMemoryLeaks();
 }
diff --git a/test/libc/calls/writev_test.c b/test/libc/calls/writev_test.c
index 834ae1a90..c6a9a9540 100644
--- a/test/libc/calls/writev_test.c
+++ b/test/libc/calls/writev_test.c
@@ -21,7 +21,7 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/test/libc/fmt/formatint64thousands_test.c b/test/libc/fmt/formatint64thousands_test.c
index b0b1398b6..c52389ebc 100644
--- a/test/libc/fmt/formatint64thousands_test.c
+++ b/test/libc/fmt/formatint64thousands_test.c
@@ -19,7 +19,7 @@
 #include "libc/fmt/conv.h"
 #include "libc/fmt/itoa.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/stdio/stdio.h"
 #include "libc/testlib/ezbench.h"
diff --git a/test/libc/intrin/BUILD.mk b/test/libc/intrin/BUILD.mk
index 7a1ce1756..dcde4ae37 100644
--- a/test/libc/intrin/BUILD.mk
+++ b/test/libc/intrin/BUILD.mk
@@ -37,16 +37,15 @@ TEST_LIBC_INTRIN_DIRECTDEPS =					\
 	LIBC_STR						\
 	LIBC_SYSV						\
 	LIBC_SYSV_CALLS						\
-	LIBC_TESTLIB						\
 	LIBC_THREAD						\
+	LIBC_TESTLIB						\
 	LIBC_TINYMATH						\
 	LIBC_X							\
+	TOOL_VIZ_LIB						\
 	THIRD_PARTY_COMPILER_RT					\
-	THIRD_PARTY_MUSL					\
 	THIRD_PARTY_NSYNC					\
 	THIRD_PARTY_OPENMP					\
-	THIRD_PARTY_XED						\
-	TOOL_VIZ_LIB						\
+	THIRD_PARTY_XED
 
 TEST_LIBC_INTRIN_DEPS :=					\
 	$(call uniq,$(foreach x,$(TEST_LIBC_INTRIN_DIRECTDEPS),$($(x))))
@@ -59,15 +58,6 @@ o/$(MODE)/test/libc/intrin/%.dbg:				\
 		$(TEST_LIBC_INTRIN_DEPS)			\
 		o/$(MODE)/test/libc/intrin/%.o			\
 		o/$(MODE)/test/libc/intrin/intrin.pkg		\
-		$(LIBC_TESTMAIN)				\
-		$(CRT)						\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/test/libc/intrin/mmap_test.dbg:			\
-		$(TEST_LIBC_INTRIN_DEPS)			\
-		o/$(MODE)/test/libc/intrin/mmap_test.o		\
-		o/$(MODE)/test/libc/intrin/intrin.pkg		\
 		o/$(MODE)/test/libc/mem/prog/life.elf.zip.o	\
 		$(LIBC_TESTMAIN)				\
 		$(CRT)						\
diff --git a/test/libc/intrin/demangle_test.c b/test/libc/intrin/demangle_test.c
index 27fce05ae..3ec28860c 100644
--- a/test/libc/intrin/demangle_test.c
+++ b/test/libc/intrin/demangle_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "demangle_cases.inc"
 #include "libc/cosmo.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/internal.h"
 #include "libc/str/str.h"
 
diff --git a/test/libc/intrin/describeflags_test.c b/test/libc/intrin/describeflags_test.c
index a805f5605..14f2892a0 100644
--- a/test/libc/intrin/describeflags_test.c
+++ b/test/libc/intrin/describeflags_test.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/testlib/testlib.h"
 
 static const struct DescribeFlags kFlags[] = {
@@ -27,7 +27,7 @@ static const struct DescribeFlags kFlags[] = {
 
 const char *DescribeIt(uint32_t x) {
   static char s[64];
-  return _DescribeFlags(s, ARRAYLEN(s), kFlags, ARRAYLEN(kFlags), "x", x);
+  return DescribeFlags(s, ARRAYLEN(s), kFlags, ARRAYLEN(kFlags), "x", x);
 }
 
 TEST(describeflags, test) {
diff --git a/test/libc/intrin/kprintf_test.c b/test/libc/intrin/kprintf_test.c
index eeabf193a..e6d17b336 100644
--- a/test/libc/intrin/kprintf_test.c
+++ b/test/libc/intrin/kprintf_test.c
@@ -23,7 +23,7 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
 #include "libc/log/libfatal.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
diff --git a/test/libc/intrin/lock_test.c b/test/libc/intrin/lock_test.c
index b73a94f85..491700d38 100644
--- a/test/libc/intrin/lock_test.c
+++ b/test/libc/intrin/lock_test.c
@@ -18,7 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/atomic.h"
 #include "libc/calls/calls.h"
-#include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
@@ -29,10 +28,8 @@
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
-#include "libc/runtime/symbols.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/clone.h"
-#include "libc/sysv/consts/sig.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 #include "third_party/nsync/mu.h"
@@ -65,9 +62,6 @@ pthread_mutex_t mu;
       __assert_eq_fail(__FILE__, __LINE__, #WANT, #GOT, _want, _got); \
   } while (0)
 
-void ignore_signal(int sig) {
-}
-
 void __assert_eq_fail(const char *file, int line, const char *wantstr,
                       const char *gotstr, long want, long got) {
   kprintf("%s:%d: %s vs. %s was %ld vs. %ld (%s)\n", file, line, wantstr,
@@ -118,15 +112,10 @@ void TestContendedLock(const char *name, int kind) {
   char *stk;
   double ns;
   errno_t rc;
-  int x, i, n = 10000;
   struct timespec t1, t2;
   pthread_mutexattr_t attr;
-  struct CosmoTib tib = {
-      .tib_self = &tib,
-      .tib_self2 = &tib,
-      .tib_ctid = -1,
-      .tib_ptid = 0,
-  };
+  int tid, x, i, n = 10000;
+  struct CosmoTib tib = {.tib_self = &tib, .tib_self2 = &tib, .tib_tid = -1};
   pthread_mutexattr_init(&attr);
   pthread_mutexattr_settype(&attr, kind);
   pthread_mutex_init(&mu, &attr);
@@ -138,7 +127,7 @@ void TestContendedLock(const char *name, int kind) {
              CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
                  CLONE_SYSVSEM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
                  CLONE_CHILD_CLEARTID | CLONE_SETTLS,
-             0, &tib.tib_ptid, &tib, &tib.tib_ctid);
+             0, &tid, &tib, &tib.tib_tid);
   if (rc) {
     kprintf("clone failed: %s\n", strerror(rc));
     _Exit(1);
@@ -154,7 +143,7 @@ void TestContendedLock(const char *name, int kind) {
     ASSERT_EQ(0, pthread_mutex_unlock(&mu));
   }
   t2 = timespec_real();
-  while (tib.tib_ctid)
+  while (tib.tib_tid)
     donothing;
   ASSERT_EQ(1, atomic_load(&success));
   ASSERT_EQ(0, atomic_load(&counter));
@@ -188,12 +177,6 @@ void TestUncontendedLock(const char *name, int kind) {
 int main(int argc, char *argv[]) {
   pthread_mutexattr_t attr;
 
-#ifdef MODE_DBG
-  GetSymbolTable();
-  signal(SIGTRAP, ignore_signal);
-  kprintf("running %s\n", argv[0]);
-#endif
-
 #ifdef __aarch64__
   // our usage of raw clone() is probably broken in aarch64
   // we should just get rid of clone()
@@ -206,8 +189,10 @@ int main(int argc, char *argv[]) {
     _Exit(1);
   }
 
+  __threaded = 1;
+
   ASSERT_EQ(0, pthread_mutexattr_init(&attr));
-  ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT));
+  ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL));
   ASSERT_EQ(0, pthread_mutex_init(&mu, &attr));
   ASSERT_EQ(0, pthread_mutexattr_destroy(&attr));
   ASSERT_EQ(0, pthread_mutex_lock(&mu));
@@ -233,12 +218,28 @@ int main(int argc, char *argv[]) {
   ASSERT_EQ(0, pthread_mutex_unlock(&mu));
   ASSERT_EQ(0, pthread_mutex_destroy(&mu));
 
-  TestUncontendedLock("PTHREAD_MUTEX_DEFAULT RAW TLS", PTHREAD_MUTEX_DEFAULT);
+  ASSERT_EQ(1, __tls_enabled);
+
+  TestUncontendedLock("PTHREAD_MUTEX_NORMAL RAW TLS", PTHREAD_MUTEX_NORMAL);
   TestUncontendedLock("PTHREAD_MUTEX_RECURSIVE RAW TLS",
                       PTHREAD_MUTEX_RECURSIVE);
+  TestUncontendedLock("PTHREAD_MUTEX_ERRORCHECK RAW TLS",
+                      PTHREAD_MUTEX_ERRORCHECK);
 
-  TestContendedLock("PTHREAD_MUTEX_DEFAULT RAW TLS", PTHREAD_MUTEX_DEFAULT);
+  TestContendedLock("PTHREAD_MUTEX_NORMAL RAW TLS", PTHREAD_MUTEX_NORMAL);
   TestContendedLock("PTHREAD_MUTEX_RECURSIVE RAW TLS", PTHREAD_MUTEX_RECURSIVE);
+  TestContendedLock("PTHREAD_MUTEX_ERRORCHECK RAW TLS",
+                    PTHREAD_MUTEX_ERRORCHECK);
+
+  __tls_enabled_set(false);
+
+  TestUncontendedLock("PTHREAD_MUTEX_NORMAL RAW", PTHREAD_MUTEX_NORMAL);
+  TestUncontendedLock("PTHREAD_MUTEX_RECURSIVE RAW", PTHREAD_MUTEX_RECURSIVE);
+  TestUncontendedLock("PTHREAD_MUTEX_ERRORCHECK RAW", PTHREAD_MUTEX_ERRORCHECK);
+
+  TestContendedLock("PTHREAD_MUTEX_NORMAL RAW", PTHREAD_MUTEX_NORMAL);
+  TestContendedLock("PTHREAD_MUTEX_RECURSIVE RAW", PTHREAD_MUTEX_RECURSIVE);
+  TestContendedLock("PTHREAD_MUTEX_ERRORCHECK RAW", PTHREAD_MUTEX_ERRORCHECK);
 
   //
 }
diff --git a/test/libc/intrin/lockipc_test.c b/test/libc/intrin/lockipc_test.c
index 30878c699..6e9b84d52 100644
--- a/test/libc/intrin/lockipc_test.c
+++ b/test/libc/intrin/lockipc_test.c
@@ -19,7 +19,7 @@
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
@@ -52,7 +52,7 @@ TEST(lockipc, mutex) {
   // create shared mutex
   pthread_mutexattr_t mattr;
   pthread_mutexattr_init(&mattr);
-  pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_DEFAULT);
+  pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_NORMAL);
   pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
   pthread_mutex_init(&shm->mutex, &mattr);
   pthread_mutexattr_destroy(&mattr);
diff --git a/test/libc/intrin/magicu_test.c b/test/libc/intrin/magicu_test.c
index 351a861bb..4a5753e2a 100644
--- a/test/libc/intrin/magicu_test.c
+++ b/test/libc/intrin/magicu_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/magicu.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
diff --git a/test/libc/intrin/memmove_test.c b/test/libc/intrin/memmove_test.c
index 7a4dbeb03..701ef044a 100644
--- a/test/libc/intrin/memmove_test.c
+++ b/test/libc/intrin/memmove_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/test/libc/intrin/memset_test.c b/test/libc/intrin/memset_test.c
index cd05645e9..4d818845a 100644
--- a/test/libc/intrin/memset_test.c
+++ b/test/libc/intrin/memset_test.c
@@ -21,7 +21,7 @@
 #include "libc/mem/mem.h"
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
-#include "libc/testlib/benchmark.h"
+#include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 
 static void *golden(void *p, int c, size_t n) {
@@ -64,13 +64,36 @@ TEST(bzero, hug) {
   }
 }
 
-#define N (256 * 1024 * 1024)
-
 BENCH(memset, bench) {
-  void *memset_(void *, int, size_t) asm("memset");
-  printf("\n");
-  static char A[N];
-  memset(A, 2, N);
-  for (int n = 1; n <= N; n *= 2)
-    BENCHMARK(100, n, X(memset_(V(A), 0, n)));
+  int n, max = 8 * 1024 * 1024;
+  char *volatile p = gc(malloc(max));
+
+  EZBENCH_N("memset", 0, memset(p, -1, 0));
+  for (n = 2; n <= max; n *= 2) {
+    EZBENCH_N("memset", n - 1, memset(p, -1, n - 1));
+    EZBENCH_N("memset", n, memset(p, -1, n));
+  }
+
+  EZBENCH_N("memset16", 0, memset16((char16_t *)p, -1, 0));
+  for (n = 2; n <= max; n *= 2) {
+    EZBENCH_N("memset16", n, memset16((char16_t *)p, -1, n / 2));
+  }
+
+  EZBENCH_N("bzero", 0, bzero(p, 0));
+  for (n = 2; n <= max; n *= 2) {
+    EZBENCH_N("bzero", n - 1, bzero(p, n - 1));
+    EZBENCH_N("bzero", n, bzero(p, n));
+  }
+}
+
+BENCH(strlen, bench) {
+  int n, max = 8 * 1024 * 1024;
+  char *volatile p = gc(calloc(max + 1, 1));
+  EZBENCH_N("strlen", 0, strlen(p));
+  for (n = 2; n <= max; n *= 2) {
+    memset(p, -1, n - 1);
+    EZBENCH_N("strlen", n - 1, strlen(p));
+    p[n - 1] = -1;
+    EZBENCH_N("strlen", n, strlen(p));
+  }
 }
diff --git a/test/libc/intrin/mmap_test.c b/test/libc/intrin/mmap_test.c
index e44ee223e..5044d6f96 100644
--- a/test/libc/intrin/mmap_test.c
+++ b/test/libc/intrin/mmap_test.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "ape/sections.internal.h"
-#include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
@@ -28,14 +27,12 @@
 #include "libc/runtime/sysconf.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
-#include "libc/stdio/sysparam.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/msync.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
-#include "libc/testlib/benchmark.h"
-#include "libc/testlib/subprocess.h"
+#include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 #include "libc/x/xspawn.h"
 
@@ -59,10 +56,6 @@ void SetUpOnce(void) {
   // ASSERT_SYS(0, 0, pledge("stdio rpath wpath cpath proc", 0));
 }
 
-void TearDown(void) {
-  ASSERT_FALSE(__maps_held());
-}
-
 TEST(mmap, zeroSize) {
   ASSERT_SYS(EINVAL, MAP_FAILED,
              mmap(NULL, 0, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
@@ -102,7 +95,7 @@ TEST(mmap, pageBeyondGone) {
            MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
   ASSERT_NE(MAP_FAILED, p);
   EXPECT_TRUE(testlib_memoryexists(p));
-  EXPECT_TRUE(testlib_memoryexists(p + 1));
+  EXPECT_FALSE(testlib_memoryexists(p + 1));  // b/c kisdangerous
   EXPECT_FALSE(testlib_memoryexists(p + pagesz));
   ASSERT_EQ(0, munmap(p, 1));
 }
@@ -116,42 +109,6 @@ TEST(mmap, fixedTaken) {
   EXPECT_SYS(0, 0, munmap(p, 1));
 }
 
-TEST(mmap, anon_rw_to_rx) {
-  char *p;
-  ASSERT_NE(MAP_FAILED, (p = mmap(0, 1, PROT_READ | PROT_WRITE,
-                                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
-  ASSERT_SYS(0, 0, mprotect(p, 1, PROT_READ | PROT_EXEC));
-  ASSERT_SYS(0, 0, munmap(p, 1));
-}
-
-TEST(mmap, anon_rw_fork_to_rx) {
-  char *p;
-  ASSERT_NE(MAP_FAILED, (p = mmap(0, 1, PROT_READ | PROT_WRITE,
-                                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
-  SPAWN(fork);
-  ASSERT_SYS(0, 0, mprotect(p, 1, PROT_READ | PROT_EXEC));
-  EXITS(0);
-  ASSERT_SYS(0, 0, munmap(p, 1));
-}
-
-TEST(mmap, anon_r_to_rw) {
-  char *p;
-  ASSERT_NE(MAP_FAILED,
-            (p = mmap(0, 1, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
-  ASSERT_SYS(0, 0, mprotect(p, 1, PROT_READ | PROT_WRITE));
-  ASSERT_SYS(0, 0, munmap(p, 1));
-}
-
-TEST(mmap, anon_r_fork_to_rw) {
-  char *p;
-  ASSERT_NE(MAP_FAILED,
-            (p = mmap(0, 1, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
-  SPAWN(fork);
-  ASSERT_SYS(0, 0, mprotect(p, 1, PROT_READ | PROT_WRITE));
-  EXITS(0);
-  ASSERT_SYS(0, 0, munmap(p, 1));
-}
-
 TEST(mmap, hint) {
   char *p;
 
@@ -227,7 +184,7 @@ TEST(mmap, smallerThanPage_mapsRemainder) {
   ASSERT_NE(MAP_FAILED, map);
   EXPECT_TRUE(testlib_memoryexists(map));
   EXPECT_TRUE(testlib_pokememory(map + (pagesz - 1)));
-  EXPECT_TRUE(testlib_memoryexists(map + (pagesz - 1)));
+  EXPECT_TRUE(!testlib_memoryexists(map + (pagesz - 1)));
   EXPECT_SYS(0, 0, munmap(map, 1));
   EXPECT_FALSE(testlib_memoryexists(map));
   EXPECT_FALSE(testlib_memoryexists(map + (pagesz - 1)));
@@ -374,172 +331,6 @@ TEST(mmap, pml5t) {
   }
 }
 
-TEST(mmap, windows) {
-  if (!IsWindows())
-    return;
-  int count = __maps.count;
-  char *base = __maps_randaddr();
-
-  ASSERT_EQ(base, mmap(base, pagesz * 3, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_EQ((count += 1), __maps.count);
-
-  // isn't granularity aligned
-  ASSERT_SYS(EINVAL, -1, munmap(base + pagesz, pagesz));
-
-  // doesn't overlap any maps
-  ASSERT_SYS(0, 0, munmap(base + gransz, pagesz));
-  ASSERT_EQ(count, __maps.count);
-
-  // doesn't overlap any maps
-  ASSERT_SYS(0, 0, munmap(base - gransz, gransz));
-  ASSERT_EQ(count, __maps.count);
-
-  // partially overlaps map
-  ASSERT_SYS(ENOTSUP, -1, munmap(base, pagesz));
-  ASSERT_EQ(count, __maps.count);
-
-  // envelops map
-  ASSERT_SYS(0, 0, munmap(base - gransz, gransz + pagesz * 4));
-  ASSERT_EQ((count -= 1), __maps.count);
-
-  // win32 actually unmapped map
-  ASSERT_EQ(base, mmap(base, pagesz * 3, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_EQ((count += 1), __maps.count);
-
-  // change status of middle page results in three fragments
-  ASSERT_SYS(0, 0, mprotect(base + pagesz, pagesz, PROT_NONE));
-  ASSERT_EQ((count += 2), __maps.count);
-
-  // change status back (todo: should reunite fragments)
-  ASSERT_SYS(0, 0, mprotect(base + pagesz, pagesz, PROT_READ | PROT_WRITE));
-  ASSERT_EQ(count, __maps.count);
-
-  // clean up
-  ASSERT_SYS(0, 0, munmap(base, pagesz * 3));
-  ASSERT_EQ((count -= 3), __maps.count);
-}
-
-TEST(mmap, windows_partial_overlap_enotsup) {
-  if (!IsWindows())
-    return;
-  int count = __maps.count;
-  char *base = __maps_randaddr();
-
-  ASSERT_EQ(base, mmap(base, gransz * 3, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_EQ((count += 1), __maps.count);
-
-  // partially overlaps on left
-  ASSERT_SYS(ENOTSUP, -1, munmap(base - gransz, gransz * 2));
-  ASSERT_SYS(ENOTSUP, -1, munmap(base, gransz * 2));
-  ASSERT_EQ(count, __maps.count);
-
-  // partially overlaps the middle
-  ASSERT_SYS(ENOTSUP, -1, munmap(base + gransz * 1, gransz));
-  ASSERT_SYS(ENOTSUP, -1, munmap(base + gransz * 1, gransz * 2));
-  ASSERT_EQ(count, __maps.count);
-
-  // partially overlaps on right
-  ASSERT_SYS(ENOTSUP, -1, munmap(base + gransz * 2, gransz * 2));
-  ASSERT_EQ(count, __maps.count);
-
-  // doesn't overlap any maps
-  ASSERT_SYS(0, 0, munmap(base - gransz, gransz));
-  ASSERT_SYS(0, 0, munmap(base + gransz * 3, gransz));
-  ASSERT_EQ(count, __maps.count);
-
-  // unmap envelops
-  ASSERT_SYS(0, 0, munmap(base - gransz, gransz * 4));
-  ASSERT_EQ((count -= 1), __maps.count);
-
-  // win32 actually removed the memory
-  ASSERT_EQ(base, mmap(base, gransz * 3, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_EQ((count += 1), __maps.count);
-
-  // clean up
-  ASSERT_SYS(0, 0, munmap(base, gransz * 3));
-  ASSERT_EQ((count -= 1), __maps.count);
-}
-
-TEST(munmap, windows_not_all_fragments_included_enotsup) {
-  if (!IsWindows())
-    return;
-  int count = __maps.count;
-  char *base = __maps_randaddr();
-
-  ASSERT_EQ(base, mmap(base, gransz * 3, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_EQ((count += 1), __maps.count);
-
-  // win32 memory actually exists
-  ASSERT_SYS(EEXIST, MAP_FAILED,
-             mmap(base, gransz * 3, PROT_READ | PROT_WRITE,
-                  MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_SYS(EEXIST, MAP_FAILED,
-             mmap(base + gransz * 0, gransz, PROT_READ | PROT_WRITE,
-                  MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_SYS(EEXIST, MAP_FAILED,
-             mmap(base + gransz * 1, gransz, PROT_READ | PROT_WRITE,
-                  MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_SYS(EEXIST, MAP_FAILED,
-             mmap(base + gransz * 2, gransz, PROT_READ | PROT_WRITE,
-                  MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-
-  // change status of middle page results in three fragments
-  ASSERT_SYS(0, 0, mprotect(base + gransz, gransz, PROT_NONE));
-  ASSERT_EQ((count += 2), __maps.count);
-
-  // partially overlaps on left
-  ASSERT_SYS(ENOTSUP, -1, munmap(base - gransz, gransz * 2));
-  ASSERT_SYS(ENOTSUP, -1, munmap(base, gransz * 2));
-  ASSERT_EQ(count, __maps.count);
-
-  // partially overlaps the middle
-  ASSERT_SYS(ENOTSUP, -1, munmap(base + gransz * 1, gransz));
-  ASSERT_SYS(ENOTSUP, -1, munmap(base + gransz * 1, gransz * 2));
-  ASSERT_EQ(count, __maps.count);
-
-  // partially overlaps on right
-  ASSERT_SYS(ENOTSUP, -1, munmap(base + gransz * 2, gransz * 2));
-  ASSERT_EQ(count, __maps.count);
-
-  // doesn't overlap any maps
-  ASSERT_SYS(0, 0, munmap(base - gransz, gransz));
-  ASSERT_SYS(0, 0, munmap(base + gransz * 3, gransz));
-  ASSERT_EQ(count, __maps.count);
-
-  // unmap envelops
-  ASSERT_SYS(0, 0, munmap(base - gransz, gransz * 4));
-  ASSERT_EQ((count -= 3), __maps.count);
-
-  // win32 actually removed the memory
-  ASSERT_EQ(base, mmap(base, gransz * 3, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_EQ((count += 1), __maps.count);
-
-  // clean up
-  ASSERT_SYS(0, 0, munmap(base, gransz * 3));
-  ASSERT_EQ((count -= 1), __maps.count);
-}
-
-TEST(mmap, windows_private_memory_fork_uses_virtualfree) {
-  if (IsFreebsd())
-    return;  // freebsd can't take a hint
-  char *base;
-  ASSERT_NE(MAP_FAILED, (base = mmap(0, gransz * 3, PROT_READ | PROT_WRITE,
-                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)));
-  SPAWN(fork);
-  ASSERT_SYS(0, 0, munmap(base, gransz * 3));
-  ASSERT_EQ(base, mmap(base, gransz * 3, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
-  ASSERT_SYS(0, 0, munmap(base, gransz * 3));
-  EXITS(0);
-  ASSERT_SYS(0, 0, munmap(base, gransz * 3));
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // zipos NON-SHARED READ-ONLY FILE MEMORY
 
@@ -733,7 +524,7 @@ TEST(mmap, sharedFileMapFork) {
 ////////////////////////////////////////////////////////////////////////////////
 // BENCHMARKS
 
-#define N 1000
+#define N (EZBENCH_COUNT * EZBENCH_TRIES)
 
 int count;
 void *ptrs[N];
@@ -743,31 +534,35 @@ void BenchMmapPrivate(void) {
   void *p;
   p = mmap(0, (sizes[count] = rand() % (pagesz * 500)), PROT_READ | PROT_WRITE,
            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  ASSERT_NE(MAP_FAILED, p);
+  if (p == MAP_FAILED)
+    __builtin_trap();
   ptrs[count] = p;
   ++count;
 }
 
 void BenchUnmap(void) {
   --count;
-  ASSERT_SYS(0, 0, munmap(ptrs[count], sizes[count]));
+  if (munmap(ptrs[count], sizes[count]))
+    __builtin_trap();
 }
 
 void BenchBigMmap(void) {
   void *p;
   p = mmap(0, 101 * 1024 * 1024, PROT_READ | PROT_WRITE,
            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  ASSERT_NE(MAP_FAILED, p);
+  if (p == MAP_FAILED)
+    __builtin_trap();
   ptrs[count++] = p;
 }
 
 void BenchBigMunmap(void) {
-  ASSERT_SYS(0, 0, munmap(ptrs[--count], 101 * 1024 * 1024));
+  if (munmap(ptrs[--count], 101 * 1024 * 1024))
+    __builtin_trap();
 }
 
 TEST(mmap, bench) {
-  BENCHMARK(N, 1, BenchMmapPrivate());
-  BENCHMARK(N, 1, BenchUnmap());
-  /* BENCHMARK(N, 1, BenchBigMmap()); */
-  /* BENCHMARK(N, 1, BenchBigMunmap()); */
+  EZBENCH2("mmap", donothing, BenchMmapPrivate());
+  EZBENCH2("munmap", donothing, BenchUnmap());
+  // EZBENCH2("big mmap", donothing, BenchBigMmap());
+  // EZBENCH2("big munmap", donothing, BenchBigMunmap());
 }
diff --git a/test/libc/intrin/mprotect_test.c b/test/libc/intrin/mprotect_test.c
index a04af31c2..4c2a6a5c3 100644
--- a/test/libc/intrin/mprotect_test.c
+++ b/test/libc/intrin/mprotect_test.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/ucontext.h"
@@ -109,15 +108,15 @@ void SetUp(void) {
                             .sa_flags = SA_SIGINFO | SA_RESETHAND};
   struct sigaction sasegv = {.sa_sigaction = OnSigSegv,
                              .sa_flags = SA_SIGINFO | SA_RESETHAND};
-  unassert(!sigaction(SIGBUS, &sabus, old + 0));
-  unassert(!sigaction(SIGSEGV, &sasegv, old + 1));
+  sigaction(SIGBUS, &sabus, old + 0);
+  sigaction(SIGSEGV, &sasegv, old + 1);
   gotbusted = false;
   gotsegv = false;
 }
 
 void TearDown(void) {
-  unassert(!sigaction(SIGBUS, old + 0, 0));
-  unassert(!sigaction(SIGSEGV, old + 1, 0));
+  sigaction(SIGBUS, old + 0, 0);
+  sigaction(SIGSEGV, old + 1, 0);
 }
 
 TEST(mprotect, testOkMemory) {
diff --git a/test/libc/intrin/munmap_test.c b/test/libc/intrin/munmap_test.c
index 630325687..9bf2ef344 100644
--- a/test/libc/intrin/munmap_test.c
+++ b/test/libc/intrin/munmap_test.c
@@ -53,106 +53,26 @@ TEST(munmap, test) {
   EXPECT_FALSE(testlib_memoryexists(p));
 }
 
-TEST(munmap, carveMemory) {
-  if (IsWindows())
-    return;  // needs carving
-  char *p;
-  int count = __maps.count;
-  ASSERT_NE(MAP_FAILED,
-            (p = mmap(__maps_randaddr(), gransz * 3, PROT_READ | PROT_WRITE,
-                      MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
-  EXPECT_EQ(count + 1, __maps.count);
-  count = __maps.count;
-  EXPECT_TRUE(testlib_memoryexists(p + gransz * 0));
-  EXPECT_TRUE(testlib_memoryexists(p + gransz * 1));
-  EXPECT_TRUE(testlib_memoryexists(p + gransz * 2));
-  EXPECT_SYS(0, 0, munmap(p + gransz * 0, gransz));
-  EXPECT_EQ(count + 0, __maps.count);
-  count = __maps.count;
-  EXPECT_FALSE(testlib_memoryexists(p + gransz * 0));
-  EXPECT_TRUE(testlib_memoryexists(p + gransz * 1));
-  EXPECT_TRUE(testlib_memoryexists(p + gransz * 2));
-  EXPECT_SYS(0, 0, munmap(p + gransz * 2, gransz));
-  EXPECT_EQ(count + 0, __maps.count);
-  count = __maps.count;
-  EXPECT_FALSE(testlib_memoryexists(p + gransz * 0));
-  EXPECT_TRUE(testlib_memoryexists(p + gransz * 1));
-  EXPECT_FALSE(testlib_memoryexists(p + gransz * 2));
-  EXPECT_SYS(0, 0, munmap(p + gransz * 1, gransz));
-  EXPECT_EQ(count - 1, __maps.count);
-  count = __maps.count;
-  EXPECT_FALSE(testlib_memoryexists(p + gransz * 0));
-  EXPECT_FALSE(testlib_memoryexists(p + gransz * 1));
-  EXPECT_FALSE(testlib_memoryexists(p + gransz * 2));
-}
-
 TEST(munmap, punchHoleInMemory) {
   if (IsWindows())
     return;  // needs carving
   char *p;
-  int count = __maps.count;
-  ASSERT_NE(MAP_FAILED,
-            (p = mmap(__maps_randaddr(), gransz * 3, PROT_READ | PROT_WRITE,
-                      MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
-  EXPECT_EQ(count + 1, __maps.count);
-  count = __maps.count;
+  ASSERT_NE(MAP_FAILED, (p = mmap(0, gransz * 3, PROT_READ | PROT_WRITE,
+                                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
   EXPECT_TRUE(testlib_memoryexists(p + gransz * 0));
   EXPECT_TRUE(testlib_memoryexists(p + gransz * 1));
   EXPECT_TRUE(testlib_memoryexists(p + gransz * 2));
   EXPECT_SYS(0, 0, munmap(p + gransz, gransz));
-  EXPECT_EQ(count + 1, __maps.count);
-  count = __maps.count;
   EXPECT_TRUE(testlib_memoryexists(p + gransz * 0));
   EXPECT_FALSE(testlib_memoryexists(p + gransz * 1));
   EXPECT_TRUE(testlib_memoryexists(p + gransz * 2));
   EXPECT_SYS(0, 0, munmap(p, gransz));
-  EXPECT_EQ(count - 1, __maps.count);
-  count = __maps.count;
   EXPECT_SYS(0, 0, munmap(p + gransz * 2, gransz));
-  EXPECT_EQ(count - 1, __maps.count);
-  count = __maps.count;
   EXPECT_FALSE(testlib_memoryexists(p + gransz * 0));
   EXPECT_FALSE(testlib_memoryexists(p + gransz * 1));
   EXPECT_FALSE(testlib_memoryexists(p + gransz * 2));
 }
 
-TEST(munmap, fillHoleInMemory) {
-  if (IsWindows())
-    return;  // needs fungible memory
-  int count = __maps.count;
-  char *base = __maps_randaddr();
-  EXPECT_EQ(base + gransz * 0,
-            mmap(base + gransz * 0, gransz, PROT_READ | PROT_WRITE,
-                 MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
-  EXPECT_EQ(count + 1, __maps.count);
-  count = __maps.count;
-  EXPECT_TRUE(testlib_memoryexists(base + gransz * 0));
-  EXPECT_FALSE(testlib_memoryexists(base + gransz * 1));
-  EXPECT_FALSE(testlib_memoryexists(base + gransz * 2));
-  EXPECT_EQ(base + gransz * 2,
-            mmap(base + gransz * 2, gransz, PROT_READ | PROT_WRITE,
-                 MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
-  EXPECT_EQ(count + 1, __maps.count);
-  count = __maps.count;
-  EXPECT_TRUE(testlib_memoryexists(base + gransz * 0));
-  EXPECT_FALSE(testlib_memoryexists(base + gransz * 1));
-  EXPECT_TRUE(testlib_memoryexists(base + gransz * 2));
-  EXPECT_EQ(base + gransz * 1,
-            mmap(base + gransz * 1, gransz, PROT_READ | PROT_WRITE,
-                 MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
-  EXPECT_EQ(count - 1, __maps.count);
-  count = __maps.count;
-  EXPECT_TRUE(testlib_memoryexists(base + gransz * 0));
-  EXPECT_TRUE(testlib_memoryexists(base + gransz * 1));
-  EXPECT_TRUE(testlib_memoryexists(base + gransz * 2));
-  EXPECT_SYS(0, 0, munmap(base, gransz * 3));
-  EXPECT_EQ(count - 1, __maps.count);
-  count = __maps.count;
-  EXPECT_FALSE(testlib_memoryexists(base + gransz * 0));
-  EXPECT_FALSE(testlib_memoryexists(base + gransz * 1));
-  EXPECT_FALSE(testlib_memoryexists(base + gransz * 2));
-}
-
 TEST(munmap, memoryHasHole) {
   if (IsWindows())
     return;  // needs carving
@@ -265,8 +185,8 @@ TEST(munmap, tinyFile_preciseUnmapSize) {
 TEST(munmap, tinyFile_mapThriceUnmapOnce) {
   char *p;
   ASSERT_NE(MAP_FAILED, (p = mmap(0, gransz*5, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)));
-  ASSERT_SYS(0, 3, open("doge", O_RDWR | O_CREAT | O_TRUNC, 0644));
   ASSERT_SYS(0, 0, munmap(p, gransz*5));
+  ASSERT_SYS(0, 3, open("doge", O_RDWR | O_CREAT | O_TRUNC, 0644));
   ASSERT_SYS (0, 5, write(3, "hello", 5));
   ASSERT_EQ(p+gransz*0, mmap(p+gransz*0, gransz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0));
   ASSERT_EQ(p+gransz*1, mmap(p+gransz*1, 5, PROT_READ, MAP_PRIVATE|MAP_FIXED, 3, 0));
diff --git a/test/libc/intrin/pthread_mutex_lock2_test.c b/test/libc/intrin/pthread_mutex_lock2_test.c
index b530ac04b..93224da84 100644
--- a/test/libc/intrin/pthread_mutex_lock2_test.c
+++ b/test/libc/intrin/pthread_mutex_lock2_test.c
@@ -40,7 +40,7 @@ pthread_mutexattr_t attr;
 
 FIXTURE(pthread_mutex_lock, normal) {
   ASSERT_EQ(0, pthread_mutexattr_init(&attr));
-  ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT));
+  ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL));
   ASSERT_EQ(0, pthread_mutex_init(&lock, &attr));
   ASSERT_EQ(0, pthread_mutexattr_destroy(&attr));
 }
@@ -79,7 +79,7 @@ TEST(pthread_mutex_lock, contention) {
   int i;
   pthread_t *th = gc(malloc(sizeof(pthread_t) * THREADS));
   pthread_mutexattr_init(&attr);
-  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
+  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL);
   pthread_mutex_init(&lock, &attr);
   pthread_mutexattr_destroy(&attr);
   count = 0;
@@ -128,7 +128,7 @@ BENCH(pthread_mutex_lock, bench_uncontended) {
     pthread_mutex_t m;
     pthread_mutexattr_t attr;
     pthread_mutexattr_init(&attr);
-    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL);
     pthread_mutex_init(&m, &attr);
     EZBENCH2("normal 1x", donothing, BenchLockUnlock(&m));
   }
@@ -226,7 +226,7 @@ BENCH(pthread_mutex_lock, bench_contended) {
     pthread_mutex_t m;
     pthread_mutexattr_t attr;
     pthread_mutexattr_init(&attr);
-    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL);
     pthread_mutex_init(&m, &attr);
     struct MutexContentionArgs a = {&m};
     pthread_create(&t, 0, MutexContentionWorker, &a);
diff --git a/test/libc/intrin/pthread_mutex_lock_test.c b/test/libc/intrin/pthread_mutex_lock_test.c
index 0a5514a98..e2e97cda1 100644
--- a/test/libc/intrin/pthread_mutex_lock_test.c
+++ b/test/libc/intrin/pthread_mutex_lock_test.c
@@ -20,16 +20,12 @@
 #include "libc/atomic.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/state.internal.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/cosmo.h"
 #include "libc/errno.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
-#include "libc/mem/leaks.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
@@ -38,7 +34,6 @@
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/rlimit.h"
-#include "libc/sysv/consts/sig.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
@@ -53,38 +48,16 @@
 int count;
 atomic_int started;
 atomic_int finished;
-pthread_mutex_t lock;
 pthread_mutex_t mylock;
 pthread_spinlock_t slock;
 pthread_t th[THREADS];
 
-void ignore_signal(int sig) {
-}
-
 void SetUpOnce(void) {
   ASSERT_SYS(0, 0, pledge("stdio rpath", 0));
-  kprintf("running %s\n", program_invocation_name);
-  signal(SIGTRAP, ignore_signal);
-}
-
-TEST(pthread_mutex_lock, default) {
-  pthread_mutexattr_t attr;
-  ASSERT_EQ(0, pthread_mutexattr_init(&attr));
-  ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT));
-  ASSERT_EQ(0, pthread_mutex_init(&lock, &attr));
-  ASSERT_EQ(0, pthread_mutexattr_destroy(&attr));
-  ASSERT_EQ(0, pthread_mutex_init(&lock, 0));
-  ASSERT_EQ(0, pthread_mutex_lock(&lock));
-  ASSERT_EQ(EBUSY, pthread_mutex_trylock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_trylock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_lock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_destroy(&lock));
 }
 
 TEST(pthread_mutex_lock, normal) {
+  pthread_mutex_t lock;
   pthread_mutexattr_t attr;
   ASSERT_EQ(0, pthread_mutexattr_init(&attr));
   ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL));
@@ -102,6 +75,7 @@ TEST(pthread_mutex_lock, normal) {
 }
 
 TEST(pthread_mutex_lock, recursive) {
+  pthread_mutex_t lock;
   pthread_mutexattr_t attr;
   ASSERT_EQ(0, pthread_mutexattr_init(&attr));
   ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE));
@@ -121,19 +95,20 @@ TEST(pthread_mutex_lock, recursive) {
   }
   ASSERT_EQ(0, pthread_mutex_lock(&lock));
   ASSERT_EQ(0, pthread_mutex_unlock(&lock));
+  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
   ASSERT_EQ(0, pthread_mutex_destroy(&lock));
 }
 
 TEST(pthread_mutex_lock, errorcheck) {
+  pthread_mutex_t lock;
   pthread_mutexattr_t attr;
   ASSERT_EQ(0, pthread_mutexattr_init(&attr));
   ASSERT_EQ(0, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK));
   ASSERT_EQ(0, pthread_mutex_init(&lock, &attr));
   ASSERT_EQ(0, pthread_mutexattr_destroy(&attr));
   ASSERT_EQ(0, pthread_mutex_lock(&lock));
-  ASSERT_EQ(1, __deadlock_tracked(&lock));
   ASSERT_EQ(EDEADLK, pthread_mutex_lock(&lock));
-  ASSERT_EQ(EBUSY, pthread_mutex_trylock(&lock));
+  ASSERT_EQ(EDEADLK, pthread_mutex_trylock(&lock));
   ASSERT_EQ(0, pthread_mutex_unlock(&lock));
   ASSERT_EQ(0, pthread_mutex_destroy(&lock));
 }
@@ -156,7 +131,7 @@ TEST(pthread_mutex_lock, contention) {
   int i;
   pthread_mutexattr_t attr;
   pthread_mutexattr_init(&attr);
-  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
+  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_NORMAL);
   pthread_mutex_init(&mylock, &attr);
   pthread_mutexattr_destroy(&attr);
   count = 0;
diff --git a/test/libc/thread/pthread_spin_lock_test.c b/test/libc/intrin/pthread_spin_lock_test.c
similarity index 100%
rename from test/libc/thread/pthread_spin_lock_test.c
rename to test/libc/intrin/pthread_spin_lock_test.c
diff --git a/test/libc/intrin/rand64_test.c b/test/libc/intrin/rand64_test.c
index 0e0062656..dc0c19a4b 100644
--- a/test/libc/intrin/rand64_test.c
+++ b/test/libc/intrin/rand64_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/sigaction.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/internal.h"
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
diff --git a/test/libc/intrin/strcmp_test.c b/test/libc/intrin/strcmp_test.c
index c2be5af6c..742dac914 100644
--- a/test/libc/intrin/strcmp_test.c
+++ b/test/libc/intrin/strcmp_test.c
@@ -19,7 +19,7 @@
 #include "libc/assert.h"
 #include "libc/ctype.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/cachesize.h"
diff --git a/test/libc/intrin/strlen_test.c b/test/libc/intrin/strlen_test.c
index 3d0619ee4..87c316fa0 100644
--- a/test/libc/intrin/strlen_test.c
+++ b/test/libc/intrin/strlen_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
diff --git a/test/libc/intrin/tree_test.c b/test/libc/intrin/tree_test.c
index fbddf18f7..0f9d3f04d 100644
--- a/test/libc/intrin/tree_test.c
+++ b/test/libc/intrin/tree_test.c
@@ -17,7 +17,7 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/maps.h"
 #include "libc/intrin/tree.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/rand.h"
@@ -178,9 +178,6 @@ void search_test(void) {
   //       ↑           ↑               ↑
   //       4           3               8
   //
-  x = tree_floor(tree, (void *)0l, number_search);
-  if (x)
-    exit(4);
   x = tree_floor(tree, (void *)4l, number_search);
   if (!x)
     exit(4);
diff --git a/test/libc/log/BUILD.mk b/test/libc/log/BUILD.mk
index ee7bf2c1e..5850736bf 100644
--- a/test/libc/log/BUILD.mk
+++ b/test/libc/log/BUILD.mk
@@ -5,8 +5,14 @@ PKGS += TEST_LIBC_LOG
 
 TEST_LIBC_LOG_SRCS := $(wildcard test/libc/log/*.c)
 TEST_LIBC_LOG_SRCS_TEST = $(filter %_test.c,$(TEST_LIBC_LOG_SRCS))
-TEST_LIBC_LOG_OBJS = $(TEST_LIBC_LOG_SRCS:%.c=o/$(MODE)/%.o)
-TEST_LIBC_LOG_COMS = $(TEST_LIBC_LOG_SRCS:%.c=o/$(MODE)/%)
+
+TEST_LIBC_LOG_OBJS =						\
+	$(TEST_LIBC_LOG_SRCS:%.c=o/$(MODE)/%.o)			\
+	o/$(MODE)/test/libc/log/backtrace.zip.o			\
+	o/$(MODE)/test/libc/log/backtrace.dbg.zip.o
+
+TEST_LIBC_LOG_COMS =						\
+	$(TEST_LIBC_LOG_SRCS:%.c=o/$(MODE)/%)
 
 TEST_LIBC_LOG_BINS =						\
 	$(TEST_LIBC_LOG_COMS)					\
@@ -20,17 +26,19 @@ TEST_LIBC_LOG_CHECKS =						\
 
 TEST_LIBC_LOG_DIRECTDEPS =					\
 	LIBC_CALLS						\
-	LIBC_FMT						\
+	LIBC_RUNTIME						\
+	NET_HTTP						\
+	LIBC_STDIO						\
+	LIBC_X							\
 	LIBC_INTRIN						\
-	LIBC_LOG						\
+	LIBC_FMT						\
 	LIBC_MEM						\
 	LIBC_NEXGEN32E						\
-	LIBC_PROC						\
-	LIBC_RUNTIME						\
-	LIBC_STDIO						\
+	LIBC_LOG						\
 	LIBC_STR						\
-	LIBC_SYSV						\
 	LIBC_TESTLIB						\
+	LIBC_SYSV						\
+	LIBC_LOG
 
 TEST_LIBC_LOG_DEPS :=						\
 	$(call uniq,$(foreach x,$(TEST_LIBC_LOG_DIRECTDEPS),$($(x))))
@@ -48,6 +56,29 @@ o/$(MODE)/test/libc/log/%.dbg:					\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
+o/$(MODE)/test/libc/log/backtrace_test.dbg:			\
+		$(TEST_LIBC_LOG_DEPS)				\
+		o/$(MODE)/test/libc/log/backtrace.zip.o		\
+		o/$(MODE)/test/libc/log/backtrace.dbg.zip.o	\
+		o/$(MODE)/test/libc/log/backtrace_test.o	\
+		o/$(MODE)/test/libc/log/log.pkg			\
+		$(LIBC_TESTMAIN)				\
+		$(CRT)						\
+		$(APE_NO_MODIFY_SELF)
+	@$(APELINK)
+
+o/$(MODE)/test/libc/log/backtrace.dbg:				\
+		$(TEST_LIBC_LOG_DEPS)				\
+		o/$(MODE)/test/libc/log/backtrace.o		\
+		$(CRT)						\
+		$(APE_NO_MODIFY_SELF)
+	@$(APELINK)
+
+o/$(MODE)/test/libc/log/backtrace.zip.o				\
+o/$(MODE)/test/libc/log/backtrace.dbg.zip.o: private		\
+		ZIPOBJ_FLAGS +=					\
+			-B
+
 .PHONY: o/$(MODE)/test/libc/log
 o/$(MODE)/test/libc/log:					\
 		$(TEST_LIBC_LOG_BINS)				\
diff --git a/test/libc/log/backtrace.c b/test/libc/log/backtrace.c
new file mode 100644
index 000000000..4eb39eda3
--- /dev/null
+++ b/test/libc/log/backtrace.c
@@ -0,0 +1,154 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/fmt/conv.h"
+#include "libc/intrin/weaken.h"
+#include "libc/limits.h"
+#include "libc/log/log.h"
+#include "libc/macros.internal.h"
+#include "libc/mem/leaks.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/internal.h"
+#include "libc/runtime/symbols.internal.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#ifdef __x86_64__
+
+#include <stdlib.h>
+
+int StackOverflow(int d) {
+  char A[8];
+  for (int i = 0; i < sizeof(A); i++)
+    A[i] = d + i;
+  if (__veil("r", d))
+    return StackOverflow(d + 1) + A[d % sizeof(A)];
+  return 0;
+}
+
+void FpuCrash(void) {
+  typedef char xmm_t __attribute__((__vector_size__(16)));
+  xmm_t v = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+             0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf};
+  volatile int x = 0;
+  asm volatile("fldpi");
+  asm volatile("mov\t%0,%%r15" : /* no outputs */ : "g"(0x3133731337));
+  asm volatile("movaps\t%0,%%xmm15" : /* no outputs */ : "x"(v));
+  fputc(7 / x, stdout);
+}
+
+char bss[10];
+void BssOverrunCrash(int n) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    bss[i] = i;
+  }
+}
+
+char data[10] = "abcdeabcde";
+void DataOverrunCrash(int n) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    data[i] = i;
+  }
+}
+
+const char rodata[10] = "abcdeabcde";
+int RodataOverrunCrash(int i) {
+  return rodata[i];
+}
+
+char *StackOverrunCrash(int n) {
+  int i;
+  char stack[10];
+  bzero(stack, sizeof(stack));
+  for (i = 0; i < n; ++i) {
+    stack[i] = i;
+  }
+  return strdup(stack);
+}
+
+char *MemoryLeakCrash(void) {
+  char *p = strdup("doge");
+  CheckForMemoryLeaks();
+  return p;
+}
+
+int NpeCrash(char *p) {
+  asm("nop");  // xxx: due to backtrace addr-1 thing
+  return *p;
+}
+
+int StackOverflowCrash(int d) {
+  char A[8];
+  for (int i = 0; i < sizeof(A); i++)
+    A[i] = d + i;
+  if (__veil("r", d))
+    return StackOverflowCrash(d + 1) + A[d % sizeof(A)];
+  return 0;
+}
+
+void (*pFpuCrash)(void) = FpuCrash;
+void (*pBssOverrunCrash)(int) = BssOverrunCrash;
+void (*pDataOverrunCrash)(int) = DataOverrunCrash;
+int (*pRodataOverrunCrash)(int) = RodataOverrunCrash;
+char *(*pMemoryLeakCrash)(void) = MemoryLeakCrash;
+int (*pNpeCrash)(char *) = NpeCrash;
+
+int main(int argc, char *argv[]) {
+  ShowCrashReports();
+  if (argc > 1) {
+    switch (atoi(argv[1])) {
+      case 0:
+        break;
+      case 1:
+        pFpuCrash();
+        exit(0);
+      case 2:
+        pBssOverrunCrash(10 + 1);
+        exit(0);
+      case 3:
+        exit(pRodataOverrunCrash(10 + 1));
+      case 4:
+        pDataOverrunCrash(10 + 1);
+        exit(0);
+      case 5:
+        exit(StackOverflowCrash(0));
+      case 6:
+        exit((intptr_t)pMemoryLeakCrash());
+      case 7:
+        exit(pNpeCrash(0));
+      case 8:
+        exit(pNpeCrash(0));
+      case 9:
+        exit(StackOverflow(0));
+      default:
+        fputs("error: unrecognized argument\n", stderr);
+        exit(1);
+    }
+  } else {
+    fputs("error: too few args\n", stderr);
+    exit(1);
+  }
+}
+
+#else
+
+int main(int argc, char *argv[]) {
+}
+
+#endif /* __x86_64__ */
diff --git a/test/libc/log/backtrace_test.c b/test/libc/log/backtrace_test.c
new file mode 100644
index 000000000..ab735e4cf
--- /dev/null
+++ b/test/libc/log/backtrace_test.c
@@ -0,0 +1,402 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
+#include "libc/dce.h"
+#include "libc/errno.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/log/libfatal.internal.h"
+#include "libc/log/log.h"
+#include "libc/mem/gc.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/internal.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/symbols.internal.h"
+#include "libc/stdio/append.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/sig.h"
+#include "libc/testlib/testlib.h"
+#include "libc/x/xasprintf.h"
+#include "net/http/escape.h"
+#ifdef __x86_64__
+
+#if 0
+__static_yoink("backtrace");
+__static_yoink("backtrace.dbg");
+
+void SetUpOnce(void) {
+  testlib_enable_tmp_setup_teardown_once();
+  ASSERT_NE(-1, mkdir("bin", 0755));
+  testlib_extract("/zip/backtrace", "bin/backtrace", 0755);
+  testlib_extract("/zip/backtrace.dbg", "bin/backtrace.dbg", 0755);
+}
+
+static bool OutputHasSymbol(const char *output, const char *s) {
+  return strstr(output, s) || (!FindDebugBinary() && strstr(output, "NULL"));
+}
+
+// UNFREED MEMORY
+// o/dbg/test/libc/log/backtrace_test
+// max allocated space            655,360
+// total allocated space               80
+// total free space               327,600
+// releasable space                     0
+// mmaped space                    65,360
+// non-mmapped space              327,680
+//
+// 100080040020 64 bytes 5 used
+//       421871 strdup
+//       416529 MemoryLeakCrash
+//       41666d SetUp
+//       45428c testlib_runtestcases
+//
+// 00007fff0000-000080010000 rw-pa-F 2x shadow of 000000000000
+// 000080070000-0000800a0000 rw-pa-F 3x shadow of 0000003c0000
+// 02008fff0000-020090020000 rw-pa-F 3x shadow of 10007ffc0000
+// 020090060000-020090080000 rw-pa-F 2x shadow of 100080340000
+// 0e007fff0000-0e0080010000 rw-pa-F 2x shadow of 6ffffffc0000
+// 100006560000-100006580000 rw-pa-F 2x shadow of 7ffc32b40000
+// 100080000000-100080050000 rw-pa-- 5x automap w/ 50 frame hole
+// 100080370000-100080390000 rw-pa-- 2x automap w/ 1 frame hole
+// 1000803a0000-1000803b0000 rw-pa-- 1x automap
+// 6ffffffe0000-700000000000 rw-paSF 2x stack
+// # 24 frames mapped w/ 51 frames gapped
+TEST(ShowCrashReports, testMemoryLeakCrash) {
+  size_t got;
+  ssize_t rc;
+  int ws, pid, fds[2];
+  char *output, buf[512];
+  ASSERT_NE(-1, pipe2(fds, O_CLOEXEC));
+  ASSERT_NE(-1, (pid = fork()));
+  if (!pid) {
+    dup2(fds[1], 1);
+    dup2(fds[1], 2);
+    execv("bin/backtrace", (char *const[]){"bin/backtrace", "6", 0});
+    _Exit(127);
+  }
+  close(fds[1]);
+  output = 0;
+  appends(&output, "");
+  for (;;) {
+    rc = read(fds[0], buf, sizeof(buf));
+    if (rc == -1) {
+      ASSERT_EQ(EINTR, errno);
+      continue;
+    }
+    if ((got = rc)) {
+      appendd(&output, buf, got);
+    } else {
+      break;
+    }
+  }
+  close(fds[0]);
+  ASSERT_NE(-1, wait(&ws));
+  // tinyprint(2, gc(IndentLines(output, -1, 0, 4)), "\n", NULL);
+  EXPECT_EQ(78 << 8, ws);
+  ASSERT_TRUE(!!strstr(output, "UNFREED MEMORY"));
+  if (IsAsan()) {
+    ASSERT_TRUE(OutputHasSymbol(output, "strdup") &&
+                OutputHasSymbol(output, "MemoryLeakCrash"));
+  }
+  free(output);
+}
+
+// error: Uncaught SIGFPE (FPE_INTDIV) on nightmare pid 11724
+//   /home/jart/cosmo/o/dbg/test/libc/log/backtrace_test.tmp.11721
+//   ENOTTY[25]
+//   Linux nightmare SMP Thu, 12 Aug 2021 06:16:45 UTC
+//
+// 0x0000000000414659: FpuCrash at test/libc/log/backtrace_test.c:35
+// 0x000000000045003b: testlib_runtestcases at libc/testlib/testrunner.c:98
+// 0x000000000044b770: testlib_runalltests at libc/testlib/runner.c:37
+// 0x000000000040278e: main at libc/testlib/testmain.c:86
+// 0x0000000000403210: cosmo at libc/runtime/cosmo.S:65
+// 0x0000000000402247: _start at libc/crt/crt.S:67
+//
+// RAX 0000000000000007 RBX 00006fffffffff10 RDI 00007ffe0745fde1 ST(0) 0.0
+// RCX 0000000000000000 RDX 0000000000000000 RSI 0000000000489900 ST(1) 0.0
+// RBP 00006fffffffff70 RSP 00006fffffffff10 RIP 000000000041465a ST(2) 0.0
+//  R8 0000000000000001  R9 00006ffffffffcc0 R10 00006ffffffffe60 ST(3) 0.0
+// R11 000000000000000d R12 00000dffffffffe2 R13 00006fffffffff10 ST(4) 0.0
+// R14 0000000000000003 R15 000000000049b700 VF PF ZF IF
+//
+// XMM0  00000000000000000000000000000000 XMM8  00000000000000000000000000000000
+// XMM1  000000008000000400000000004160ea XMM9  00000000000000000000000000000000
+// XMM2  00000000000000000000000000000000 XMM10 00000000000000000000000000000000
+// XMM3  00000000000000000000000000000000 XMM11 00000000000000000000000000000000
+// XMM4  00000000000000000000000000000000 XMM12 00000000000000000000000000000000
+// XMM5  00000000000000000000000000000000 XMM13 00000000000000000000000000000000
+// XMM6  00000000000000000000000000000000 XMM14 00000000000000000000000000000000
+// XMM7  00000000000000000000000000000000 XMM15 00000000000000000000000000000000
+//
+// mm->i == 4;
+// mm->p[  0]=={0x00008007,0x00008008,-1,3,50}; /* 2 */
+// /* 234,881,012 */
+// mm->p[  1]=={0x0e007ffd,0x0e007fff,-1,3,50}; /* 3 */
+// /* 33,538,280 */
+// mm->p[  2]=={0x100040e8,0x100040e8,-1,3,50}; /* 1 */
+// /* 1,610,596,103 */
+// mm->p[  3]=={0x6ffffff0,0x6fffffff,12884901888,306,0}; /* 16 */
+// /* 22 frames mapped w/ 1,879,015,395 frames gapped */
+//
+// 00400000-0045b000 r-xp 00000000 08:03 4587526
+// /home/jart/cosmo/o/dbg/test/libc/log/backtrace_test.tmp.11721
+// 0045b000-00461000 rw-p 0005b000 08:03 4587526
+// /home/jart/cosmo/o/dbg/test/libc/log/backtrace_test.tmp.11721
+// 00461000-004a0000 rw-p 00000000 00:00 0
+// 80070000-80090000 rw-p 00000000 00:00 0
+// e007ffd0000-e0080000000 rw-p 00000000 00:00 0
+// 100040e80000-100040e90000 rw-p 00000000 00:00 0
+// 6ffffff00000-700000000000 rw-p 00000000 00:00 0
+// 7ffe0743f000-7ffe07460000 rw-p 00000000 00:00 0 [stack]
+// 7ffe075a8000-7ffe075ab000 r--p 00000000 00:00 0 [vvar]
+// 7ffe075ab000-7ffe075ac000 r-xp 00000000 00:00 0 [vdso]
+//
+// /home/jart/cosmo/o/dbg/test/libc/log/backtrace_test.tmp.11721 1
+TEST(ShowCrashReports, testDivideByZero) {
+  size_t got;
+  ssize_t rc;
+  int ws, pid, fds[2];
+  char *output, buf[512];
+  ASSERT_NE(-1, pipe2(fds, O_CLOEXEC));
+  ASSERT_NE(-1, (pid = fork()));
+  if (!pid) {
+    dup2(fds[1], 1);
+    dup2(fds[1], 2);
+    execv("bin/backtrace", (char *const[]){"bin/backtrace", "1", 0});
+    _Exit(127);
+  }
+  close(fds[1]);
+  output = 0;
+  appends(&output, "");
+  for (;;) {
+    rc = read(fds[0], buf, sizeof(buf));
+    if (rc == -1) {
+      ASSERT_EQ(EINTR, errno);
+      continue;
+    }
+    if ((got = rc)) {
+      appendd(&output, buf, got);
+    } else {
+      break;
+    }
+  }
+  close(fds[0]);
+  ASSERT_NE(-1, wait(&ws));
+  // tinyprint(2, gc(IndentLines(output, -1, 0, 4)), "\n", NULL);
+  if (IsModeDbg()) {
+    EXPECT_EQ(77 << 8, ws);
+  } else {
+    EXPECT_TRUE(WIFSIGNALED(ws));
+    EXPECT_EQ(SIGFPE, WTERMSIG(ws));
+  }
+  /* NULL is stopgap until we can copy symbol tables into binary */
+#ifdef __FNO_OMIT_FRAME_POINTER__
+  ASSERT_TRUE(OutputHasSymbol(output, "FpuCrash"));
+#endif
+  if (strstr(output, "divrem overflow")) {
+    // UBSAN handled it
+  } else {
+    // ShowCrashReports() handled it
+    if (!strstr(output, gc(xasprintf("%d", pid)))) {
+      fprintf(stderr, "ERROR: crash report didn't have pid\n%s\n",
+              gc(IndentLines(output, -1, 0, 4)));
+      __die();
+    }
+    if (!strstr(output, "SIGFPE")) {
+      fprintf(stderr, "ERROR: crash report didn't have signal name\n%s\n",
+              gc(IndentLines(output, -1, 0, 4)));
+      __die();
+    }
+    // XXX: WSL doesn't save and restore x87 registers to ucontext_t
+    if (!__iswsl1()) {
+      if (!strstr(output, "3.141")) {
+        fprintf(stderr, "ERROR: crash report didn't have fpu register\n%s\n",
+                gc(IndentLines(output, -1, 0, 4)));
+        __die();
+      }
+    }
+    if (!strstr(output, "0f0e0d0c0b0a09080706050403020100")) {
+      fprintf(stderr, "ERROR: crash report didn't have sse register\n%s\n",
+              gc(IndentLines(output, -1, 0, 4)));
+      __die();
+    }
+    if (!strstr(output, "3133731337")) {
+      fprintf(stderr, "ERROR: crash report didn't have general register\n%s\n",
+              gc(IndentLines(output, -1, 0, 4)));
+      __die();
+    }
+  }
+  free(output);
+}
+
+TEST(ShowCrashReports, testBssOverrunCrash) {
+  if (!IsAsan()) return;
+  size_t got;
+  ssize_t rc;
+  int ws, pid, fds[2];
+  char *output, buf[512];
+  ASSERT_NE(-1, pipe2(fds, O_CLOEXEC));
+  ASSERT_NE(-1, (pid = fork()));
+  if (!pid) {
+    dup2(fds[1], 1);
+    dup2(fds[1], 2);
+    execv("bin/backtrace", (char *const[]){"bin/backtrace", "2", 0});
+    _Exit(127);
+  }
+  close(fds[1]);
+  output = 0;
+  appends(&output, "");
+  for (;;) {
+    rc = read(fds[0], buf, sizeof(buf));
+    if (rc == -1) {
+      ASSERT_EQ(EINTR, errno);
+      continue;
+    }
+    if ((got = rc)) {
+      appendd(&output, buf, got);
+    } else {
+      break;
+    }
+  }
+  close(fds[0]);
+  ASSERT_NE(-1, wait(&ws));
+  // tinyprint(2, gc(IndentLines(output, -1, 0, 4)), "\n", NULL);
+  EXPECT_EQ(77 << 8, ws);
+  /* NULL is stopgap until we can copy symbol tablces into binary */
+#ifdef __FNO_OMIT_FRAME_POINTER__
+  ASSERT_TRUE(OutputHasSymbol(output, "BssOverrunCrash"));
+#endif
+  if (IsAsan()) {
+    ASSERT_TRUE(
+        !!strstr(output, "'int' index 10 into 'char [10]' out of bounds"));
+  } else {
+    ASSERT_TRUE(!!strstr(output, "☺☻♥♦♣♠•◘○"));
+    ASSERT_TRUE(!!strstr(output, "global redzone"));
+  }
+  free(output);
+}
+
+TEST(ShowCrashReports, testDataOverrunCrash) {
+  if (!IsAsan()) return;
+  size_t got;
+  ssize_t rc;
+  int ws, pid, fds[2];
+  char *output, buf[512];
+  ASSERT_NE(-1, pipe2(fds, O_CLOEXEC));
+  ASSERT_NE(-1, (pid = fork()));
+  if (!pid) {
+    dup2(fds[1], 1);
+    dup2(fds[1], 2);
+    execv("bin/backtrace", (char *const[]){"bin/backtrace", "4", 0});
+    _Exit(127);
+  }
+  close(fds[1]);
+  output = 0;
+  appends(&output, "");
+  for (;;) {
+    rc = read(fds[0], buf, sizeof(buf));
+    if (rc == -1) {
+      ASSERT_EQ(EINTR, errno);
+      continue;
+    }
+    if ((got = rc)) {
+      appendd(&output, buf, got);
+    } else {
+      break;
+    }
+  }
+  close(fds[0]);
+  ASSERT_NE(-1, wait(&ws));
+  // tinyprint(2, gc(IndentLines(output, -1, 0, 4)), "\n", NULL);
+  EXPECT_EQ(77 << 8, ws);
+  /* NULL is stopgap until we can copy symbol tablces into binary */
+#ifdef __FNO_OMIT_FRAME_POINTER__
+  ASSERT_TRUE(OutputHasSymbol(output, "DataOverrunCrash"));
+#endif
+  if (!strstr(output, "'int' index 10 into 'char [10]' out")) {  // ubsan
+    ASSERT_TRUE(!!strstr(output, "☺☻♥♦♣♠•◘○"));                  // asan
+    ASSERT_TRUE(!!strstr(output, "global redzone"));             // asan
+  }
+  free(output);
+}
+
+TEST(ShowCrashReports, testNpeCrashAfterFinalize) {
+  /*
+   * this test makes sure we're not doing things like depending on
+   * environment variables after __cxa_finalize is called in cases
+   * where putenv() is used
+   */
+  size_t got;
+  ssize_t rc;
+  int ws, pid, fds[2];
+  char *output, buf[512];
+  ASSERT_NE(-1, pipe2(fds, O_CLOEXEC));
+  ASSERT_NE(-1, (pid = fork()));
+  if (!pid) {
+    dup2(fds[1], 1);
+    dup2(fds[1], 2);
+    execv("bin/backtrace", (char *const[]){"bin/backtrace", "8", 0});
+    _Exit(127);
+  }
+  close(fds[1]);
+  output = 0;
+  appends(&output, "");
+  for (;;) {
+    rc = read(fds[0], buf, sizeof(buf));
+    if (rc == -1) {
+      ASSERT_EQ(EINTR, errno);
+      continue;
+    }
+    if ((got = rc)) {
+      appendd(&output, buf, got);
+    } else {
+      break;
+    }
+  }
+  close(fds[0]);
+  ASSERT_NE(-1, wait(&ws));
+  // tinyprint(2, gc(IndentLines(output, -1, 0, 4)), "\n", NULL);
+  if (IsModeDbg()) {
+    EXPECT_EQ(77 << 8, ws);
+  } else {
+    EXPECT_TRUE(WIFSIGNALED(ws));
+    EXPECT_EQ(SIGSEGV, WTERMSIG(ws));
+  }
+  /* NULL is stopgap until we can copy symbol tables into binary */
+  if (!strstr(output, IsAsan() ? "null pointer" : "Uncaught SIGSEGV (SEGV_")) {
+    fprintf(stderr, "ERROR: crash report didn't diagnose the problem\n%s\n",
+            gc(IndentLines(output, -1, 0, 4)));
+    __die();
+  }
+#ifdef __FNO_OMIT_FRAME_POINTER__
+  if (!OutputHasSymbol(output, "NpeCrash")) {
+    fprintf(stderr, "ERROR: crash report didn't have backtrace\n%s\n",
+            gc(IndentLines(output, -1, 0, 4)));
+    __die();
+  }
+#endif
+  free(output);
+}
+#endif
+
+#endif /* __x86_64__ */
diff --git a/test/libc/mem/djbsort_test.c b/test/libc/mem/djbsort_test.c
index e4c53ff94..3f1f9db9b 100644
--- a/test/libc/mem/djbsort_test.c
+++ b/test/libc/mem/djbsort_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/limits.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/test/libc/mem/malloc_test.c b/test/libc/mem/malloc_test.c
index b1b7d2609..18d2829ba 100644
--- a/test/libc/mem/malloc_test.c
+++ b/test/libc/mem/malloc_test.c
@@ -22,8 +22,9 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/cxaatexit.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/intrin/safemacros.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
@@ -161,7 +162,7 @@ void *bulk[1024];
 void BulkFreeBenchSetup(void) {
   size_t i;
   for (i = 0; i < ARRAYLEN(bulk); ++i) {
-    bulk[i] = rand() % 64 ? malloc(rand() % 64) : 0;
+    bulk[i] = malloc(rand() % 64);
   }
 }
 
@@ -178,15 +179,47 @@ void MallocFree(void) {
   free(p);
 }
 
-void eat(void *p) {
-}
-
-void (*pEat)(void *) = eat;
-
 BENCH(bulk_free, bench) {
-  /* pEat(pthread_create); */
   EZBENCH2("free() bulk", BulkFreeBenchSetup(), FreeBulk());
   EZBENCH2("bulk_free()", BulkFreeBenchSetup(),
            bulk_free(bulk, ARRAYLEN(bulk)));
-  EZBENCH2("free(malloc(16))", donothing, MallocFree());
+  EZBENCH2("free(malloc(16)) ST", donothing, MallocFree());
+  __enable_threads();
+  EZBENCH2("free(malloc(16)) MT", donothing, MallocFree());
+}
+
+#define ITERATIONS 1000
+#define THREADS    10
+#define SIZE       1024
+
+void *Worker(void *arg) {
+  for (int i = 0; i < ITERATIONS; ++i) {
+    char *p;
+    ASSERT_NE(NULL, (p = malloc(lemur64() % SIZE)));
+    ASSERT_NE(NULL, (p = realloc(p, max(lemur64() % SIZE, 1))));
+    free(p);
+  }
+  return 0;
+}
+
+TEST(malloc, torture) {
+  int i, n = THREADS;
+  pthread_t *t = gc(malloc(sizeof(pthread_t) * n));
+  if (!n)
+    return;
+  printf("\nmalloc torture test w/ %d threads and %d iterations\n", n,
+         ITERATIONS);
+  SPAWN(fork);
+  struct timespec t1 = timespec_real();
+  for (i = 0; i < n; ++i) {
+    ASSERT_EQ(0, pthread_create(t + i, 0, Worker, 0));
+  }
+  for (i = 0; i < n; ++i) {
+    ASSERT_EQ(0, pthread_join(t[i], 0));
+  }
+  struct timespec t2 = timespec_real();
+  printf("consumed %g wall and %g cpu seconds\n",
+         timespec_tomicros(timespec_sub(t2, t1)) * 1e-6,
+         (double)clock() / CLOCKS_PER_SEC);
+  EXITS(0);
 }
diff --git a/test/libc/mem/malloc_torture_test.c b/test/libc/mem/malloc_torture_test.c
deleted file mode 100644
index f20c1dc20..000000000
--- a/test/libc/mem/malloc_torture_test.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timespec.h"
-#include "libc/intrin/safemacros.h"
-#include "libc/mem/gc.h"
-#include "libc/mem/leaks.h"
-#include "libc/mem/mem.h"
-#include "libc/stdio/rand.h"
-#include "libc/stdio/stdio.h"
-#include "libc/testlib/subprocess.h"
-#include "libc/testlib/testlib.h"
-#include "libc/thread/thread.h"
-
-#define ITERATIONS 1000
-#define THREADS    10
-#define SIZE       1024
-
-void *Worker(void *arg) {
-  for (int i = 0; i < ITERATIONS; ++i) {
-    char *p;
-    ASSERT_NE(NULL, (p = malloc(rand() % SIZE)));
-    ASSERT_NE(NULL, (p = realloc(p, rand() % SIZE)));
-    free(p);
-  }
-  return 0;
-}
-
-TEST(malloc, torture) {
-  int i, n = THREADS;
-  pthread_t *t = gc(malloc(sizeof(pthread_t) * n));
-  if (!n)
-    return;
-  printf("\nmalloc torture test w/ %d threads and %d iterations\n", n,
-         ITERATIONS);
-  SPAWN(fork);
-  AssertNoLocksAreHeld();
-  struct timespec t1 = timespec_real();
-  for (i = 0; i < n; ++i)
-    ASSERT_EQ(0, pthread_create(t + i, 0, Worker, 0));
-  for (i = 0; i < n; ++i)
-    ASSERT_EQ(0, pthread_join(t[i], 0));
-  struct timespec t2 = timespec_real();
-  printf("consumed %g wall and %g cpu seconds\n",
-         timespec_tomicros(timespec_sub(t2, t1)) * 1e-6,
-         (double)clock() / CLOCKS_PER_SEC);
-  EXITS(0);
-}
diff --git a/test/libc/mem/qsort_test.c b/test/libc/mem/qsort_test.c
index de279d59e..e693da6b6 100644
--- a/test/libc/mem/qsort_test.c
+++ b/test/libc/mem/qsort_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/test/libc/nexgen32e/kbase36_test.c b/test/libc/nexgen32e/kbase36_test.c
index cd23d5fd0..aea4b9959 100644
--- a/test/libc/nexgen32e/kbase36_test.c
+++ b/test/libc/nexgen32e/kbase36_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/testlib/testlib.h"
 
 TEST(kBase36, test) {
diff --git a/test/libc/nexgen32e/kcp437_test.c b/test/libc/nexgen32e/kcp437_test.c
index 018453744..46ce97b07 100644
--- a/test/libc/nexgen32e/kcp437_test.c
+++ b/test/libc/nexgen32e/kcp437_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/str/unicode.h"
 #include "libc/testlib/testlib.h"
 
diff --git a/test/libc/nexgen32e/memmove_test.c b/test/libc/nexgen32e/memmove_test.c
index 92625b768..8c54d5d08 100644
--- a/test/libc/nexgen32e/memmove_test.c
+++ b/test/libc/nexgen32e/memmove_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
diff --git a/test/libc/proc/BUILD.mk b/test/libc/proc/BUILD.mk
index 4175234ec..11f37d91d 100644
--- a/test/libc/proc/BUILD.mk
+++ b/test/libc/proc/BUILD.mk
@@ -29,17 +29,15 @@ TEST_LIBC_PROC_DIRECTDEPS =						\
 	LIBC_MEM							\
 	LIBC_NEXGEN32E							\
 	LIBC_NT_KERNEL32						\
-	LIBC_PROC							\
 	LIBC_RUNTIME							\
-	LIBC_LOG							\
-	LIBC_STDIO							\
+	LIBC_PROC							\
 	LIBC_STR							\
 	LIBC_SYSV							\
 	LIBC_TESTLIB							\
 	LIBC_THREAD							\
 	LIBC_X								\
 	THIRD_PARTY_MUSL						\
-	THIRD_PARTY_TR							\
+	THIRD_PARTY_TR
 
 TEST_LIBC_PROC_DEPS :=							\
 	$(call uniq,$(foreach x,$(TEST_LIBC_PROC_DIRECTDEPS),$($(x))))
@@ -61,17 +59,6 @@ o/$(MODE)/test/libc/proc/%.dbg:						\
 o/$(MODE)/test/libc/proc/posix_spawn_test.runs:				\
 		private QUOTA += -M8192m
 
-o/$(MODE)/test/libc/proc/fork_test.dbg:					\
-		$(TEST_LIBC_PROC_DEPS)					\
-		o/$(MODE)/test/libc/proc/fork_test.o			\
-		o/$(MODE)/test/libc/proc/proc.pkg			\
-		o/$(MODE)/tool/hello/life-pe.ape.zip.o			\
-		o/$(MODE)/test/libc/proc/life.zip.o			\
-		$(LIBC_TESTMAIN)					\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
 o/$(MODE)/test/libc/proc/posix_spawn_test.dbg:				\
 		$(TEST_LIBC_PROC_DEPS)					\
 		o/$(MODE)/test/libc/proc/posix_spawn_test.o		\
@@ -86,12 +73,23 @@ o/$(MODE)/test/libc/proc/posix_spawn_test.dbg:				\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
+o/$(MODE)/test/libc/proc/system_test.dbg:				\
+		$(TEST_LIBC_PROC_DEPS)					\
+		o/$(MODE)/test/libc/proc/system_test.o			\
+		o/$(MODE)/test/libc/proc/proc.pkg			\
+		o/$(MODE)/tool/build/echo.zip.o				\
+		o/$(MODE)/tool/build/cocmd.zip.o			\
+		o/$(MODE)/tool/build/false.zip.o			\
+		$(LIBC_TESTMAIN)					\
+		$(CRT)							\
+		$(APE_NO_MODIFY_SELF)
+	@$(APELINK)
+
 o/$(MODE)/test/libc/proc/execve_test.dbg:				\
 		$(TEST_LIBC_PROC_DEPS)					\
 		o/$(MODE)/test/libc/proc/execve_test.o			\
 		o/$(MODE)/test/libc/calls/life-nomod.zip.o		\
 		o/$(MODE)/test/libc/proc/execve_test_prog1.zip.o	\
-		o/$(MODE)/test/libc/proc/execve_test_prog2.zip.o	\
 		o/$(MODE)/test/libc/mem/prog/life.elf.zip.o		\
 		o/$(MODE)/test/libc/mem/prog/sock.elf.zip.o		\
 		o/$(MODE)/test/libc/proc/proc.pkg			\
@@ -112,16 +110,7 @@ o/$(MODE)/test/libc/proc/fexecve_test.dbg:				\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
-o/$(MODE)/test/libc/proc/life.dbg:					\
-		$(TEST_LIBC_PROC_DEPS)					\
-		o/$(MODE)/test/libc/proc/life.o				\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/test/libc/proc/life.zip.o					\
 o/$(MODE)/test/libc/proc/execve_test_prog1.zip.o			\
-o/$(MODE)/test/libc/proc/execve_test_prog2.zip.o			\
 o/$(MODE)/test/libc/proc/life-pe.zip.o: private				\
 		ZIPOBJ_FLAGS +=						\
 			-B
diff --git a/test/libc/proc/execve_test.c b/test/libc/proc/execve_test.c
index 7bfd7b102..01573483e 100644
--- a/test/libc/proc/execve_test.c
+++ b/test/libc/proc/execve_test.c
@@ -53,12 +53,12 @@ TEST(execve, testArgPassing) {
   char ibuf[12], buf[8];
   const char *prog = "./execve_test_prog1";
   testlib_extract("/zip/execve_test_prog1", prog, 0755);
-  testlib_extract("/zip/execve_test_prog2", "execve_test_prog2", 0755);
   for (i = 0; i < N; ++i) {
     FormatInt32(ibuf, i);
     GenBuf(buf, i);
     SPAWN(vfork);
-    execl(prog, prog, "-", ibuf, buf, NULL);
+    execve(prog, (char *const[]){(char *)prog, "-", ibuf, buf, 0},
+           (char *const[]){0});
     kprintf("execve failed: %m\n");
     EXITS(0);
   }
diff --git a/test/libc/proc/execve_test_prog1.c b/test/libc/proc/execve_test_prog1.c
index 901c9b6bc..5a1ea77e8 100644
--- a/test/libc/proc/execve_test_prog1.c
+++ b/test/libc/proc/execve_test_prog1.c
@@ -18,7 +18,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/fmt/conv.h"
-#include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 
 void GenBuf(char buf[8], int x) {
@@ -41,15 +40,5 @@ int main(int argc, char *argv[]) {
     tinyprint(2, "error: buf check failed\n", NULL);
     return 10;
   }
-  const char *prog = "./execve_test_prog2";
-  if (!fork()) {
-    execl(prog, prog, NULL);
-    _Exit(127);
-  }
-  int ws;
-  if (wait(&ws) == -1)
-    return 30;
-  if (ws)
-    return 31;
   return 0;
 }
diff --git a/test/libc/proc/fork_test.c b/test/libc/proc/fork_test.c
index 264f226d3..641909045 100644
--- a/test/libc/proc/fork_test.c
+++ b/test/libc/proc/fork_test.c
@@ -21,32 +21,22 @@
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/timespec.h"
-#include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/rdtsc.h"
-#include "libc/proc/posix_spawn.h"
 #include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/msync.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/sig.h"
-#include "libc/testlib/benchmark.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/subprocess.h"
 #include "libc/testlib/testlib.h"
-#include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 
-void SetUpOnce(void) {
-  testlib_enable_tmp_setup_teardown();
-}
-
 TEST(fork, testPipes) {
   int a, b;
   int ws, pid;
@@ -73,27 +63,32 @@ TEST(fork, testSharedMemory) {
   int *sharedvar;
   int *privatevar;
   EXPECT_NE(MAP_FAILED,
-            (sharedvar = mmap(0, getpagesize(), PROT_READ | PROT_WRITE,
+            (sharedvar = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                               MAP_SHARED | MAP_ANONYMOUS, -1, 0)));
   EXPECT_NE(MAP_FAILED,
-            (privatevar = mmap(0, getpagesize(), PROT_READ | PROT_WRITE,
+            (privatevar = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)));
   stackvar = 1;
   *sharedvar = 1;
   *privatevar = 1;
   EXPECT_NE(-1, (pid = fork()));
   if (!pid) {
+    EXPECT_EQ(NULL, getenv("_FORK"));
     ++stackvar;
-    ++*privatevar;
     ++*sharedvar;
+    ++*privatevar;
+    msync((void *)ROUNDDOWN((intptr_t)&stackvar, getpagesize()), getpagesize(),
+          MS_SYNC);
+    EXPECT_NE(-1, msync(privatevar, getpagesize(), MS_SYNC));
+    EXPECT_NE(-1, msync(sharedvar, getpagesize(), MS_SYNC));
     _exit(0);
   }
   EXPECT_NE(-1, waitpid(pid, &ws, 0));
   EXPECT_EQ(1, stackvar);
   EXPECT_EQ(2, *sharedvar);
   EXPECT_EQ(1, *privatevar);
-  EXPECT_SYS(0, 0, munmap(sharedvar, getpagesize()));
-  EXPECT_SYS(0, 0, munmap(privatevar, getpagesize()));
+  EXPECT_NE(-1, munmap(sharedvar, getpagesize()));
+  EXPECT_NE(-1, munmap(privatevar, getpagesize()));
 }
 
 static volatile bool gotsigusr1;
@@ -108,6 +103,8 @@ static void OnSigusr2(int sig) {
 }
 
 TEST(fork, childToChild) {
+  if (IsWindows())
+    return;  // :'(
   sigset_t mask, oldmask;
   int ws, parent, child1, child2;
   gotsigusr1 = false;
@@ -121,20 +118,14 @@ TEST(fork, childToChild) {
   sigprocmask(SIG_BLOCK, &mask, &oldmask);
   ASSERT_NE(-1, (child1 = fork()));
   if (!child1) {
-    if (kill(parent, SIGUSR2)) {
-      kprintf("%s:%d: error: failed to kill parent: %m\n", __FILE__, __LINE__);
-      _Exit(1);
-    }
-    ASSERT_SYS(EINTR, -1, sigsuspend(0));
+    kill(parent, SIGUSR2);
+    sigsuspend(0);
     _Exit(!gotsigusr1);
   }
-  EXPECT_SYS(EINTR, -1, sigsuspend(0));
+  sigsuspend(0);
   ASSERT_NE(-1, (child2 = fork()));
   if (!child2) {
-    if (kill(child1, SIGUSR1)) {
-      kprintf("%s:%d: error: failed to kill child1: %m\n", __FILE__, __LINE__);
-      _Exit(1);
-    }
+    kill(child1, SIGUSR1);
     _Exit(0);
   }
   ASSERT_NE(-1, wait(&ws));
@@ -151,111 +142,16 @@ TEST(fork, preservesTlsMemory) {
   EXITS(0);
 }
 
-TEST(fork, privateExtraPageData_getsCopiedByFork) {
-  char *p;
-  ASSERT_NE(MAP_FAILED, (p = mmap(0, 1, PROT_WRITE | PROT_READ,
-                                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0)));
-  p[0] = 1;
-  p[1] = 2;
-  SPAWN(fork);
-  ASSERT_EQ(1, p[0]);
-  ASSERT_EQ(2, p[1]);
-  EXITS(0);
-  ASSERT_SYS(0, 0, munmap(p, 1));
-}
-
-TEST(fork, sharedExtraPageData_getsResurrectedByFork) {
-  char *p;
-  ASSERT_NE(MAP_FAILED, (p = mmap(0, 1, PROT_WRITE | PROT_READ,
-                                  MAP_ANONYMOUS | MAP_SHARED, -1, 0)));
-  p[0] = 1;
-  p[1] = 2;
-  SPAWN(fork);
-  ASSERT_EQ(1, p[0]);
-  ASSERT_EQ(2, p[1]);
-  EXITS(0);
-  ASSERT_SYS(0, 0, munmap(p, 1));
-}
-
-#define CHECK_TERMSIG                                                    \
-  if (WIFSIGNALED(ws)) {                                                 \
-    kprintf("%s:%d: error: forked life subprocess terminated with %G\n", \
-            __FILE__, __LINE__, WTERMSIG(ws));                           \
-    exit(1);                                                             \
-  }
-
-void fork_wait_in_serial(void) {
+void ForkInSerial(void) {
   int pid, ws;
   ASSERT_NE(-1, (pid = fork()));
   if (!pid)
     _Exit(0);
   ASSERT_NE(-1, waitpid(pid, &ws, 0));
-  CHECK_TERMSIG;
   ASSERT_TRUE(WIFEXITED(ws));
   ASSERT_EQ(0, WEXITSTATUS(ws));
 }
 
-void vfork_execl_wait_in_serial(void) {
-  int pid, ws;
-  ASSERT_NE(-1, (pid = vfork()));
-  if (!pid) {
-    execl("./life", "./life", NULL);
-    _Exit(127);
-  }
-  ASSERT_NE(-1, waitpid(pid, &ws, 0));
-  CHECK_TERMSIG;
-  ASSERT_TRUE(WIFEXITED(ws));
-  ASSERT_EQ(42, WEXITSTATUS(ws));
-}
-
-void vfork_wait_in_serial(void) {
-  int pid, ws;
-  ASSERT_NE(-1, (pid = vfork()));
-  if (!pid)
-    _Exit(0);
-  ASSERT_NE(-1, waitpid(pid, &ws, 0));
-  CHECK_TERMSIG;
-  ASSERT_TRUE(WIFEXITED(ws));
-  ASSERT_EQ(0, WEXITSTATUS(ws));
-}
-
-void sys_fork_wait_in_serial(void) {
-  int pid, ws;
-  ASSERT_NE(-1, (pid = sys_fork()));
-  if (!pid)
-    _Exit(0);
-  ASSERT_NE(-1, waitpid(pid, &ws, 0));
-  CHECK_TERMSIG;
-  ASSERT_TRUE(WIFEXITED(ws));
-  ASSERT_EQ(0, WEXITSTATUS(ws));
-}
-
-void posix_spawn_in_serial(void) {
-  int ws, pid;
-  char *prog = "./life";
-  char *args[] = {prog, NULL};
-  char *envs[] = {NULL};
-  ASSERT_EQ(0, posix_spawn(&pid, prog, NULL, NULL, args, envs));
-  ASSERT_NE(-1, waitpid(pid, &ws, 0));
-  CHECK_TERMSIG;
-  ASSERT_TRUE(WIFEXITED(ws));
-  ASSERT_EQ(42, WEXITSTATUS(ws));
-}
-
-TEST(fork, bench) {
-  if (IsWindows()) {
-    testlib_extract("/zip/life-pe.ape", "life", 0755);
-  } else {
-    testlib_extract("/zip/life", "life", 0755);
-  }
-  vfork_wait_in_serial();
-  vfork_execl_wait_in_serial();
-  posix_spawn_in_serial();
-  BENCHMARK(10, 1, vfork_wait_in_serial());
-  if (!IsWindows())
-    BENCHMARK(10, 1, sys_fork_wait_in_serial());
-  fork_wait_in_serial();
-  BENCHMARK(10, 1, fork_wait_in_serial());
-  BENCHMARK(10, 1, posix_spawn_in_serial());
-  BENCHMARK(10, 1, vfork_execl_wait_in_serial());
+BENCH(fork, bench) {
+  EZBENCH2("fork a", donothing, ForkInSerial());
 }
diff --git a/test/libc/proc/handkill_test.c b/test/libc/proc/handkill_test.c
index a669eeb75..07284b4ee 100644
--- a/test/libc/proc/handkill_test.c
+++ b/test/libc/proc/handkill_test.c
@@ -125,6 +125,8 @@ TEST(handkill, thread2thread_async) {
 }
 
 TEST(handkill, process_async) {
+  if (IsWindows())
+    return;
   SPAWN(fork);
   shm->ready = true;
   while (!shm->got_signal)
@@ -140,6 +142,8 @@ TEST(handkill, process_async) {
 }
 
 TEST(handkill, process_pause) {
+  if (IsWindows())
+    return;
   SPAWN(fork);
   shm->ready = true;
   pause();
diff --git a/test/libc/proc/life.c b/test/libc/proc/life.c
deleted file mode 100644
index 6c67c3b22..000000000
--- a/test/libc/proc/life.c
+++ /dev/null
@@ -1,3 +0,0 @@
-int main(int argc, char *argv[]) {
-  return 42;
-}
diff --git a/test/libc/proc/posix_spawn_test.c b/test/libc/proc/posix_spawn_test.c
index 809fc865d..044c24dcf 100644
--- a/test/libc/proc/posix_spawn_test.c
+++ b/test/libc/proc/posix_spawn_test.c
@@ -37,7 +37,7 @@
 #include "libc/limits.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
-#include "libc/proc/proc.h"
+#include "libc/proc/proc.internal.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
@@ -262,6 +262,8 @@ void EmptySigHandler(int sig) {
 }
 
 TEST(posix_spawn, etxtbsy) {
+  if (IsWindows())
+    return;  // can't deliver signals between processes
   if (IsXnu())
     return;  // they don't appear impacted by this race condition
   if (IsNetbsd())
diff --git a/test/libc/proc/sched_getaffinity_test.c b/test/libc/proc/sched_getaffinity_test.c
index dad9c1b26..0ca0d8d53 100644
--- a/test/libc/proc/sched_getaffinity_test.c
+++ b/test/libc/proc/sched_getaffinity_test.c
@@ -30,8 +30,6 @@
 #include "libc/thread/thread.h"
 #include "libc/thread/thread2.h"
 
-int disable_limit_process_to_single_cpu;
-
 void SetUp(void) {
   if (!IsLinux() && !IsFreebsd() && !IsWindows()) {
     exit(0);
@@ -92,6 +90,7 @@ __attribute__((__constructor__)) static void init(void) {
   }
 }
 
+#ifdef __x86_64__
 TEST(sched_setaffinity, isInheritedAcrossExecve) {
   cpu_set_t x;
   CPU_ZERO(&x);
@@ -106,6 +105,7 @@ TEST(sched_setaffinity, isInheritedAcrossExecve) {
   EXPECT_TRUE(WIFEXITED(ws));
   EXPECT_EQ(42, WEXITSTATUS(ws));
 }
+#endif /* __x86_64__ */
 
 TEST(sched_getaffinity, getpid) {
   cpu_set_t x, y;
diff --git a/test/libc/system/system_test.c b/test/libc/proc/system_test.c
similarity index 94%
rename from test/libc/system/system_test.c
rename to test/libc/proc/system_test.c
index 3773a64e0..b80873320 100644
--- a/test/libc/system/system_test.c
+++ b/test/libc/proc/system_test.c
@@ -27,9 +27,10 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/sig.h"
-#include "libc/testlib/benchmark.h"
+#include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 #include "libc/x/x.h"
+#ifdef __x86_64__
 
 #define GETEXITSTATUS(x)                                                \
   ({                                                                    \
@@ -165,12 +166,13 @@ TEST(system, notequals) {
 }
 
 TEST(system, usleep) {
-  ASSERT_EQ(0, GETEXITSTATUS(system("usleep & kill $!; wait")));
+  ASSERT_EQ(0, GETEXITSTATUS(system("usleep & kill $!")));
 }
 
 TEST(system, kill) {
   int ws = system("kill -TERM $$; usleep");
-  ASSERT_EQ(SIGTERM, WTERMSIG(ws));
+  if (!IsWindows())
+    ASSERT_EQ(SIGTERM, WTERMSIG(ws));
 }
 
 TEST(system, exitStatusPreservedAfterSemiColon) {
@@ -275,9 +277,15 @@ TEST(system, pipelineCanOutputBackToSelf) {
   RestoreStdout();
 }
 
-TEST(system, bench) {
+int system2(const char *);
+
+BENCH(system, bench) {
   testlib_extract("/zip/echo", "echo", 0755);
-  BENCHMARK(10, 1, system("./echo hi >/dev/null"));
-  BENCHMARK(10, 1, system("echo hi >/dev/null"));
-  BENCHMARK(10, 1, system("exit"));
+  EZBENCH2("system cmd", donothing, system("./echo hi >/dev/null"));
+  EZBENCH2("systemvpe cmd", donothing,
+           systemvpe("./echo", (char *[]){"./echo", "hi", 0}, 0));
+  EZBENCH2("cocmd echo", donothing, system("echo hi >/dev/null"));
+  EZBENCH2("cocmd exit", donothing, system("exit"));
 }
+
+#endif /* __x86_64__ */
diff --git a/test/libc/sock/connect_test.c b/test/libc/sock/connect_test.c
index c7e33f567..806961eb0 100644
--- a/test/libc/sock/connect_test.c
+++ b/test/libc/sock/connect_test.c
@@ -34,61 +34,6 @@
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
 
-TEST(connect, blocking) {
-  char buf[16] = {0};
-  atomic_uint *sem = _mapshared(sizeof(unsigned));
-  uint32_t addrsize = sizeof(struct sockaddr_in);
-  struct sockaddr_in addr = {
-      .sin_family = AF_INET,
-      .sin_addr.s_addr = htonl(0x7f000001),
-  };
-  ASSERT_SYS(0, 3, socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
-  ASSERT_SYS(0, 0, bind(3, (struct sockaddr *)&addr, sizeof(addr)));
-  ASSERT_SYS(0, 0, getsockname(3, (struct sockaddr *)&addr, &addrsize));
-  ASSERT_SYS(0, 0, listen(3, SOMAXCONN));
-
-  SPAWN(fork);
-
-  while (!*sem)
-    pthread_yield();
-  ASSERT_SYS(0, 4, accept(3, (struct sockaddr *)&addr, &addrsize));
-  ASSERT_SYS(0, 2, read(4, buf, 16));  // hi
-  ASSERT_SYS(0, 5, write(4, "hello", 5));
-  ASSERT_SYS(0, 3, read(4, buf, 16));  // bye
-
-  PARENT();
-
-  ASSERT_SYS(0, 0, close(3));
-  ASSERT_SYS(0, 3, socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
-  ASSERT_SYS(0, 0, connect(3, (struct sockaddr *)&addr, sizeof(addr)));
-  *sem = 1;
-  {  // wait until connected
-    struct pollfd pfd = {3, POLLOUT};
-    ASSERT_SYS(0, 1, poll(&pfd, 1, -1));
-    ASSERT_TRUE(!!(POLLOUT & pfd.revents));
-  }
-  struct sockaddr_in peer;
-  uint32_t sz = sizeof(peer);
-  ASSERT_SYS(0, 0, getsockname(3, (struct sockaddr *)&peer, &sz));
-  ASSERT_EQ(htonl(0x7f000001), peer.sin_addr.s_addr);
-  ASSERT_SYS(0, 0, getpeername(3, (struct sockaddr *)&peer, &sz));
-  ASSERT_EQ(htonl(0x7f000001), peer.sin_addr.s_addr);
-  ASSERT_SYS(0, 2, write(3, "hi", 2));
-  {  // wait for other process to send us stuff
-    struct pollfd pfd = {3, POLLIN};
-    ASSERT_SYS(0, 1, poll(&pfd, 1, -1));
-    ASSERT_TRUE(!!(POLLIN & pfd.revents));
-  }
-  ASSERT_SYS(0, 5, read(3, buf, 16));
-  ASSERT_STREQ("hello", buf);
-  ASSERT_SYS(0, 3, write(3, "bye", 3));
-  ASSERT_SYS(0, 0, close(3));
-
-  WAIT(exit, 0);
-
-  munmap(sem, sizeof(unsigned));
-}
-
 TEST(connect, nonblocking) {
   if (IsFreebsd())
     return;  // TODO(jart): why did this start flaking?
@@ -105,22 +50,23 @@ TEST(connect, nonblocking) {
   ASSERT_SYS(0, 0, bind(3, (struct sockaddr *)&addr, sizeof(addr)));
   ASSERT_SYS(0, 0, getsockname(3, (struct sockaddr *)&addr, &addrsize));
   ASSERT_SYS(0, 0, listen(3, SOMAXCONN));
-
   SPAWN(fork);
-
   while (!*sem)
     pthread_yield();
   ASSERT_SYS(0, 4, accept(3, (struct sockaddr *)&addr, &addrsize));
   ASSERT_SYS(0, 2, read(4, buf, 16));  // hi
   ASSERT_SYS(0, 5, write(4, "hello", 5));
   ASSERT_SYS(0, 3, read(4, buf, 16));  // bye
-
   PARENT();
-
   ASSERT_SYS(0, 0, close(3));
   ASSERT_SYS(0, 3, socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
   ASSERT_SYS(EINPROGRESS, -1,
              connect(3, (struct sockaddr *)&addr, sizeof(addr)));
+  if (!(IsLinux() || IsNetbsd())) {
+    // this doens't work on rhel7 and netbsd
+    ASSERT_SYS(EALREADY, -1,
+               connect(3, (struct sockaddr *)&addr, sizeof(addr)));
+  }
   ASSERT_SYS(EAGAIN, -1, read(3, buf, 16));
   *sem = 1;
   {  // wait until connected
@@ -128,10 +74,6 @@ TEST(connect, nonblocking) {
     ASSERT_SYS(0, 1, poll(&pfd, 1, -1));
     ASSERT_TRUE(!!(POLLOUT & pfd.revents));
   }
-  struct sockaddr_in peer;
-  uint32_t sz = sizeof(peer);
-  ASSERT_SYS(0, 0, getpeername(3, (struct sockaddr *)&peer, &sz));
-  ASSERT_EQ(htonl(0x7f000001), peer.sin_addr.s_addr);
   ASSERT_SYS(0, 2, write(3, "hi", 2));
   {  // wait for other process to send us stuff
     struct pollfd pfd = {3, POLLIN};
@@ -142,8 +84,6 @@ TEST(connect, nonblocking) {
   ASSERT_STREQ("hello", buf);
   ASSERT_SYS(0, 3, write(3, "bye", 3));
   ASSERT_SYS(0, 0, close(3));
-
   WAIT(exit, 0);
-
   munmap(sem, sizeof(unsigned));
 }
diff --git a/test/libc/sock/ipv4v6poll_test.c b/test/libc/sock/ipv4v6poll_test.c
deleted file mode 100644
index 7fc6f9ed0..000000000
--- a/test/libc/sock/ipv4v6poll_test.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <arpa/inet.h>
-#include <assert.h>
-#include <cosmo.h>
-#include <netinet/in.h>
-#include <pthread.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/poll.h>
-#include <sys/socket.h>
-#include <unistd.h>
-
-#define BUFFER_SIZE 1024
-
-// States:
-// 0: Initial state
-// 1: IPv4 listener ready
-// 2: IPv6 listener ready
-// 3: Both listeners ready, main can connect
-// 4: Main connected, IPv4 can send
-// 5: IPv4 sent, IPv6 can send
-// 6: All communication complete
-atomic_int global_state = 0;
-
-typedef struct {
-  int port;
-  int client_sock;
-} listener_data;
-
-void *ipv4_listener(void *arg) {
-  listener_data *data = (listener_data *)arg;
-  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  if (sockfd < 0) {
-    perror("IPv4 socket creation failed");
-    exit(EXIT_FAILURE);
-  }
-
-  struct sockaddr_in addr = {0};
-  addr.sin_family = AF_INET;
-  addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  addr.sin_port = 0;  // Random port
-
-  if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
-    perror("IPv4 bind failed");
-    exit(EXIT_FAILURE);
-  }
-
-  socklen_t len = sizeof(addr);
-  if (getsockname(sockfd, (struct sockaddr *)&addr, &len) < 0) {
-    perror("getsockname failed");
-    exit(EXIT_FAILURE);
-  }
-
-  data->port = ntohs(addr.sin_port);
-  // printf("IPv4 listening on port %d\n", data->port);
-
-  if (listen(sockfd, 1) < 0) {
-    perror("IPv4 listen failed");
-    exit(EXIT_FAILURE);
-  }
-
-  // Signal that IPv4 listener is ready
-  atomic_fetch_add(&global_state, 1);
-
-  // Wait for IPv6 to be ready before accepting
-  while (atomic_load(&global_state) < 3) {
-    // Busy wait
-  }
-
-  data->client_sock = accept(sockfd, NULL, NULL);
-  if (data->client_sock < 0) {
-    perror("IPv4 accept failed");
-    exit(EXIT_FAILURE);
-  }
-
-  while (atomic_load(&global_state) < 4) {
-    // Wait for main to signal it's connected
-  }
-
-  const char *message = "Hello from IPv4!";
-  unassert(send(data->client_sock, message, strlen(message), 0) > 0);
-
-  unassert(!close(sockfd));
-  return NULL;
-}
-
-void *ipv6_listener(void *arg) {
-  listener_data *data = (listener_data *)arg;
-  int sockfd = socket(AF_INET6, SOCK_STREAM, 0);
-  if (sockfd < 0) {
-    perror("IPv6 socket creation failed");
-    exit(EXIT_FAILURE);
-  }
-
-  struct sockaddr_in6 addr = {0};
-  addr.sin6_family = AF_INET6;
-  addr.sin6_addr = in6addr_loopback;
-  addr.sin6_port = 0;  // Random port
-
-  if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
-    perror("IPv6 bind failed");
-    exit(EXIT_FAILURE);
-  }
-
-  socklen_t len = sizeof(addr);
-  if (getsockname(sockfd, (struct sockaddr *)&addr, &len) < 0) {
-    perror("getsockname failed");
-    exit(EXIT_FAILURE);
-  }
-
-  data->port = ntohs(addr.sin6_port);
-  // printf("IPv6 listening on port %d\n", data->port);
-
-  if (listen(sockfd, 1) < 0) {
-    perror("IPv6 listen failed");
-    exit(EXIT_FAILURE);
-  }
-
-  // Signal that IPv6 listener is ready and wait for IPv4
-  int expected = 1;
-  while (!atomic_compare_exchange_weak(&global_state, &expected, 3)) {
-    expected = 1;  // Reset expected value if CAS failed
-  }
-
-  data->client_sock = accept(sockfd, NULL, NULL);
-  if (data->client_sock < 0) {
-    perror("IPv6 accept failed");
-    exit(EXIT_FAILURE);
-  }
-
-  while (atomic_load(&global_state) < 5) {
-    // Wait for IPv4 to send its message
-  }
-
-  const char *message = "Hello from IPv6!";
-  unassert(send(data->client_sock, message, strlen(message), 0) > 0);
-
-  unassert(!close(sockfd));
-  return NULL;
-}
-
-int main() {
-  ShowCrashReports();
-
-  pthread_t ipv4_thread, ipv6_thread;
-  listener_data ipv4_data = {0}, ipv6_data = {0};
-
-  if (pthread_create(&ipv4_thread, NULL, ipv4_listener, &ipv4_data) != 0) {
-    perror("Failed to create IPv4 thread");
-    exit(EXIT_FAILURE);
-  }
-
-  if (pthread_create(&ipv6_thread, NULL, ipv6_listener, &ipv6_data) != 0) {
-    perror("Failed to create IPv6 thread");
-    exit(EXIT_FAILURE);
-  }
-
-  // Wait for both listeners to be ready
-  while (atomic_load(&global_state) < 3) {
-    // Busy wait
-  }
-
-  int ipv4_sock = socket(AF_INET, SOCK_STREAM, 0);
-  int ipv6_sock = socket(AF_INET6, SOCK_STREAM, 0);
-
-  struct sockaddr_in ipv4_addr = {0};
-  ipv4_addr.sin_family = AF_INET;
-  ipv4_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  ipv4_addr.sin_port = htons(ipv4_data.port);
-
-  struct sockaddr_in6 ipv6_addr = {0};
-  ipv6_addr.sin6_family = AF_INET6;
-  ipv6_addr.sin6_addr = in6addr_loopback;
-  ipv6_addr.sin6_port = htons(ipv6_data.port);
-
-  if (connect(ipv4_sock, (struct sockaddr *)&ipv4_addr, sizeof(ipv4_addr)) <
-      0) {
-    perror("IPv4 connect failed");
-    exit(EXIT_FAILURE);
-  }
-
-  if (connect(ipv6_sock, (struct sockaddr *)&ipv6_addr, sizeof(ipv6_addr)) <
-      0) {
-    perror("IPv6 connect failed");
-    exit(EXIT_FAILURE);
-  }
-
-  // Signal that main thread is connected
-  atomic_store(&global_state, 4);
-
-  struct pollfd fds[2];
-  fds[0].fd = ipv4_sock;
-  fds[0].events = POLLIN;
-  fds[1].fd = ipv6_sock;
-  fds[1].events = POLLIN;
-
-  char buffer[BUFFER_SIZE];
-
-  while (atomic_load(&global_state) < 6) {
-    if (poll(fds, 2, -1) > 0) {
-      if (fds[0].revents & POLLIN) {
-        ssize_t n = recv(ipv4_sock, buffer, BUFFER_SIZE - 1, 0);
-        unassert(n != -1);
-        buffer[n] = '\0';
-        // printf("Received from IPv4: %s\n", buffer);
-        unassert(atomic_load(&global_state) == 4);
-        atomic_store(&global_state, 5);
-      }
-      if (fds[1].revents & POLLIN) {
-        ssize_t n = recv(ipv6_sock, buffer, BUFFER_SIZE - 1, 0);
-        unassert(n != -1);
-        buffer[n] = '\0';
-        // printf("Received from IPv6: %s\n", buffer);
-        unassert(atomic_load(&global_state) == 5);
-        atomic_store(&global_state, 6);
-      }
-    }
-  }
-
-  unassert(!close(ipv4_sock));
-  unassert(!close(ipv6_sock));
-
-  unassert(!pthread_join(ipv4_thread, NULL));
-  unassert(!pthread_join(ipv6_thread, NULL));
-
-  CheckForMemoryLeaks();
-  return 0;
-}
diff --git a/test/libc/sock/nonblock_test.c b/test/libc/sock/nonblock_test.c
index e27281ce1..117582a57 100644
--- a/test/libc/sock/nonblock_test.c
+++ b/test/libc/sock/nonblock_test.c
@@ -37,6 +37,9 @@
 #include "libc/thread/thread.h"
 
 TEST(O_NONBLOCK, canBeSetBySocket_toMakeListenNonBlocking) {
+  // TODO(jart): this doesn't make any sense on windows
+  if (IsWindows())
+    return;
   char buf[16] = {0};
   uint32_t addrsize = sizeof(struct sockaddr_in);
   struct sockaddr_in addr = {
diff --git a/test/libc/sock/recvfrom_test.c b/test/libc/sock/recvfrom_test.c
index a662d914d..4ef3c7a11 100644
--- a/test/libc/sock/recvfrom_test.c
+++ b/test/libc/sock/recvfrom_test.c
@@ -33,6 +33,8 @@
 // two clients send a udp packet containing their local address
 // server verifies content of packet matches the peer's address
 TEST(recvfrom, test) {
+  if (!IsWindows())
+    return;
   uint32_t addrsize = sizeof(struct sockaddr_in);
   struct sockaddr_in server = {
       .sin_family = AF_INET,
diff --git a/test/libc/calls/select_test.c b/test/libc/sock/select_test.c
similarity index 100%
rename from test/libc/calls/select_test.c
rename to test/libc/sock/select_test.c
diff --git a/test/libc/sock/sendfile_test.c b/test/libc/sock/sendfile_test.c
index 2254b1529..c63e2cb1c 100644
--- a/test/libc/sock/sendfile_test.c
+++ b/test/libc/sock/sendfile_test.c
@@ -41,9 +41,9 @@
 
 void SetUpOnce(void) {
   if (IsNetbsd())
-    exit(0);  // no sendfile support
+    exit(0);
   if (IsOpenbsd())
-    exit(0);  // no sendfile support
+    exit(0);
   testlib_enable_tmp_setup_teardown();
   ASSERT_SYS(0, 0, pledge("stdio rpath wpath cpath proc inet", 0));
 }
@@ -102,6 +102,9 @@ TEST(sendfile, testSeeking) {
 }
 
 TEST(sendfile, testPositioning) {
+  // TODO(jart): fix test regression on windows
+  if (IsWindows())
+    return;
   char buf[1024];
   uint32_t addrsize = sizeof(struct sockaddr_in);
   struct sockaddr_in addr = {
@@ -127,8 +130,9 @@ TEST(sendfile, testPositioning) {
     ASSERT_TRUE(errno == EINVAL || errno == EPIPE);
     errno = 0;
     // XXX: WSL1 clobbers file offset on failure!
-    if (!__iswsl1())
+    if (!__iswsl1()) {
       ASSERT_EQ(12, GetFileOffset(5));
+    }
     _Exit(0);
   }
   ASSERT_SYS(0, 0, close(3));
diff --git a/test/libc/sock/socket_test.c b/test/libc/sock/socket_test.c
index ff161888c..f79b0d7a7 100644
--- a/test/libc/sock/socket_test.c
+++ b/test/libc/sock/socket_test.c
@@ -18,8 +18,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
-#include "libc/dce.h"
 #include "libc/intrin/fds.h"
+#include "libc/dce.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/nt/winsock.h"
 #include "libc/runtime/runtime.h"
@@ -141,6 +141,8 @@ TEST(socket, canBeInheritedByForkedWorker) {
   WAIT(exit, 0);
 }
 
+#ifdef __x86_64__
+
 __attribute__((__constructor__)) static void StdioPro(int argc, char *argv[]) {
   if (argc >= 2 && !strcmp(argv[1], "StdioProg")) {
     ASSERT_EQ(NULL, getenv("__STDIO_SOCKETS"));
@@ -182,3 +184,5 @@ TEST(socket, canBeUsedAsExecutedStdio) {
   EXPECT_SYS(0, 0, close(3));
   WAIT(exit, 0);
 }
+
+#endif /* __x86_64__ */
diff --git a/test/libc/stdio/BUILD.mk b/test/libc/stdio/BUILD.mk
index 3cc6f6d5f..f40be5396 100644
--- a/test/libc/stdio/BUILD.mk
+++ b/test/libc/stdio/BUILD.mk
@@ -28,27 +28,25 @@ TEST_LIBC_STDIO_DIRECTDEPS =					\
 	LIBC_CALLS						\
 	LIBC_FMT						\
 	LIBC_INTRIN						\
-	LIBC_LOG						\
 	LIBC_MEM						\
 	LIBC_NEXGEN32E						\
 	LIBC_PROC						\
 	LIBC_RUNTIME						\
 	LIBC_STDIO						\
 	LIBC_STR						\
-	LIBC_SYSTEM						\
 	LIBC_SYSV						\
+	LIBC_TINYMATH						\
 	LIBC_TESTLIB						\
 	LIBC_THREAD						\
-	LIBC_TINYMATH						\
+	LIBC_LOG						\
 	LIBC_X							\
-	THIRD_PARTY_COMPILER_RT					\
 	THIRD_PARTY_GDTOA					\
 	THIRD_PARTY_MBEDTLS					\
 	THIRD_PARTY_MUSL					\
 	THIRD_PARTY_NSYNC					\
-	THIRD_PARTY_TZ						\
 	THIRD_PARTY_ZLIB					\
 	THIRD_PARTY_ZLIB_GZ					\
+	THIRD_PARTY_TZ
 
 TEST_LIBC_STDIO_DEPS :=						\
 	$(call uniq,$(foreach x,$(TEST_LIBC_STDIO_DIRECTDEPS),$($(x))))
@@ -67,6 +65,16 @@ o/$(MODE)/test/libc/stdio/%.dbg:				\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
+o/$(MODE)/test/libc/stdio/popen_test.dbg:			\
+		$(TEST_LIBC_STDIO_DEPS)				\
+		o/$(MODE)/test/libc/stdio/popen_test.o		\
+		o/$(MODE)/test/libc/stdio/stdio.pkg		\
+		o/$(MODE)/tool/build/echo.zip.o			\
+		$(LIBC_TESTMAIN)				\
+		$(CRT)						\
+		$(APE_NO_MODIFY_SELF)
+	@$(APELINK)
+
 $(TEST_LIBC_STDIO_OBJS): private				\
 		DEFAULT_CCFLAGS +=				\
 			-fno-builtin
diff --git a/test/libc/stdio/dtoa_test.c b/test/libc/stdio/dtoa_test.c
index 7929d3daf..738dbc841 100644
--- a/test/libc/stdio/dtoa_test.c
+++ b/test/libc/stdio/dtoa_test.c
@@ -19,7 +19,7 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/sched_param.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/test/libc/stdio/ecvt_test.c b/test/libc/stdio/ecvt_test.c
index 8806896bb..2813d8d3c 100644
--- a/test/libc/stdio/ecvt_test.c
+++ b/test/libc/stdio/ecvt_test.c
@@ -31,38 +31,3 @@ TEST(fcvt, test) {
   ASSERT_EQ(1, decpt);
   ASSERT_EQ(0, sign);
 }
-
-TEST(ecvt, minus0) {
-  int decpt = 110000000, sign = 110000000;
-
-  ASSERT_STREQ("00000", ecvt(-0.0, 5, &decpt, &sign));
-  ASSERT_LE(0, decpt);
-  ASSERT_GE(1, decpt);
-  ASSERT_EQ(1, sign);
-}
-
-TEST(ecvt, minus0ndigits0) {
-  int decpt = 110000000, sign = 110000000;
-
-  ASSERT_STREQ("", ecvt(-0.0, 0, &decpt, &sign));
-  ASSERT_LE(0, decpt);
-  ASSERT_GE(1, decpt);
-  ASSERT_EQ(1, sign);
-}
-
-TEST(fcvt, ndigits0) {
-  int decpt = 110000000, sign = 110000000;
-
-  ASSERT_STREQ("1", fcvt(0.6, 0, &decpt, &sign));
-  ASSERT_EQ(1, decpt);
-  ASSERT_EQ(0, sign);
-}
-
-TEST(fcvt, minus0ndigits0) {
-  int decpt = 110000000, sign = 110000000;
-
-  ASSERT_STREQ("", fcvt(-0.0, 0, &decpt, &sign));
-  ASSERT_LE(0, decpt);
-  ASSERT_GE(1, decpt);
-  ASSERT_EQ(1, sign);
-}
diff --git a/test/libc/stdio/fgetwc_test.c b/test/libc/stdio/fgetwc_test.c
index 4f6bd7ccd..e7a55ceff 100644
--- a/test/libc/stdio/fgetwc_test.c
+++ b/test/libc/stdio/fgetwc_test.c
@@ -16,9 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/intrin/weaken.h"
-#include "libc/mem/mem.h"
-#include "libc/stdio/internal.h"
+#include "libc/stdio/stdio.h"
 #include "libc/testlib/testlib.h"
 
 TEST(fgetwc, testAscii_oneChar) {
diff --git a/test/libc/stdio/fmt_test.c b/test/libc/stdio/fmt_test.c
index afb317259..026ff761d 100644
--- a/test/libc/stdio/fmt_test.c
+++ b/test/libc/stdio/fmt_test.c
@@ -20,7 +20,6 @@
 #include "libc/log/log.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
-#include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/testlib/testlib.h"
 #include "libc/x/xasprintf.h"
@@ -432,10 +431,3 @@ TEST(fmt, regress) {
       "User-Agent: hurl/1.o (https://github.com/jart/cosmopolitan)\r\n",
       buf);
 }
-
-TEST(fmt, n) {
-  int n;
-  char buf[8];
-  snprintf(buf, 8, ".%c%c.%n", 0, 1, &n);
-  ASSERT_EQ(4, n);
-}
diff --git a/test/libc/stdio/fputc_test.c b/test/libc/stdio/fputc_test.c
index b63ecc1c3..6520e77bb 100644
--- a/test/libc/stdio/fputc_test.c
+++ b/test/libc/stdio/fputc_test.c
@@ -33,13 +33,9 @@ void SetUpOnce(void) {
 TEST(fputc, test) {
   ASSERT_NE(NULL, (f = fopen("hog", "w+")));
   EXPECT_EQ('h', fputc('h', f));
-  EXPECT_FALSE(feof(f));
   EXPECT_EQ(0xFF, fputc(-1, f));
-  EXPECT_FALSE(feof(f));
   EXPECT_NE(-1, fseek(f, 0, SEEK_SET));
-  EXPECT_FALSE(feof(f));
   EXPECT_EQ('h', fgetc(f));
-  EXPECT_FALSE(feof(f));
   EXPECT_EQ(0, fread(NULL, 0, 0, f));
   EXPECT_FALSE(feof(f));
   EXPECT_EQ(0xFF, fgetc(f));
diff --git a/test/libc/stdio/fread_test.c b/test/libc/stdio/fread_test.c
index 2fc4f00a5..8e5e690e5 100644
--- a/test/libc/stdio/fread_test.c
+++ b/test/libc/stdio/fread_test.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/errno.h"
 #include "libc/stdio/stdio.h"
 #include "libc/testlib/testlib.h"
 
@@ -46,7 +45,7 @@ TEST(fread, eofIsSticky) {
 }
 
 TEST(fread, seekWithBuffer) {
-  FILE* f;
+  FILE *f;
   char b[8] = "hellosup";
   char c[8] = {0};
   char d[8] = {0};
@@ -61,31 +60,3 @@ TEST(fread, seekWithBuffer) {
   ASSERT_STREQ("ellos", d);
   ASSERT_EQ(0, fclose(f));
 }
-
-TEST(fread, zero) {
-  FILE* f;
-  char buf[8] = {0};
-  ASSERT_NE(NULL, (f = fopen("foo", "w")));
-  ASSERT_EQ(2, fwrite("hi", 1, 2, f));
-  ASSERT_EQ(0, fclose(f));
-  ASSERT_NE(NULL, (f = fopen("foo", "r")));
-  ASSERT_EQ(0, fread(buf, 0, 0, f));
-  ASSERT_EQ(0, ferror(stdin));
-  ASSERT_EQ(0, feof(stdin));
-  ASSERT_STREQ("", buf);
-  ASSERT_EQ(0, fclose(f));
-}
-
-TEST(fread, partial) {
-  FILE* f;
-  char buf[8] = {0};
-  ASSERT_NE(NULL, (f = fopen("foo", "w")));
-  ASSERT_EQ(2, fwrite("hi", 1, 2, f));
-  ASSERT_EQ(0, fclose(f));
-  ASSERT_NE(NULL, (f = fopen("foo", "r")));
-  ASSERT_EQ(0, fread(buf, 8, 1, f));
-  ASSERT_EQ(0, ferror(stdin));
-  ASSERT_EQ(0, feof(stdin));
-  ASSERT_EQ(0, fclose(f));
-  ASSERT_STREQ("hi", buf);
-}
diff --git a/test/libc/stdio/fscanf_test.c b/test/libc/stdio/fscanf_test.c
index 74005f3f4..accfb4bdb 100644
--- a/test/libc/stdio/fscanf_test.c
+++ b/test/libc/stdio/fscanf_test.c
@@ -27,7 +27,7 @@ TEST(fscanf, test_readAfterFloat) {
   EXPECT_EQ(4, fscanf(f, "%f%x%f%x", &f1, &i1, &f2, &i2));
   EXPECT_TRUE(isinf(f1));
   EXPECT_EQ(0xDEAD, i1);
-  EXPECT_FLOAT_EXACTLY_EQ(-0.125e-2f, f2);
+  EXPECT_EQ(-0.125e-2f, f2);
   EXPECT_EQ(0xBEEF, i2);
   fclose(f);
 }
diff --git a/test/libc/stdio/getentropy_test.c b/test/libc/stdio/getentropy_test.c
index f6d1acdc8..532e6b098 100644
--- a/test/libc/stdio/getentropy_test.c
+++ b/test/libc/stdio/getentropy_test.c
@@ -22,17 +22,19 @@
 #include "libc/calls/struct/sigset.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
 #ifndef __aarch64__
+// TODO(jart): Make this test less resource intensive.
+// TODO(jart): Why can EINTR happen on Windows?
 
 atomic_int done;
 atomic_int ready;
@@ -49,9 +51,11 @@ void *TortureWorker(void *arg) {
   ASSERT_SYS(0, 0, sigprocmask(SIG_SETMASK, &ss, 0));
   ready = true;
   while (!done) {
-    pthread_kill(parent, SIGUSR1);
+    if (!IsWindows())
+      pthread_kill(parent, SIGUSR1);
     usleep(1);
-    pthread_kill(parent, SIGUSR2);
+    if (!IsWindows())
+      pthread_kill(parent, SIGUSR2);
     usleep(1);
   }
   return 0;
@@ -96,7 +100,8 @@ TEST(getentropy, test) {
   }
   done = true;
   ASSERT_EQ(0, pthread_join(child, 0));
-  ASSERT_GT(gotsome, 0);
+  if (!IsWindows())
+    ASSERT_GT(gotsome, 0);
 }
 
 #endif /* __aarch64__ */
diff --git a/test/libc/stdio/lemur64_test.c b/test/libc/stdio/lemur64_test.c
index f17700398..fb6af0b2e 100644
--- a/test/libc/stdio/lemur64_test.c
+++ b/test/libc/stdio/lemur64_test.c
@@ -16,34 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
 #include "libc/stdio/rand.h"
-#include "libc/testlib/benchmark.h"
 #include "libc/testlib/testlib.h"
 
-uint64_t getrandom64(void) {
-  uint64_t x;
-  unassert(getrandom(&x, 8, 0) == 8);
-  return x;
-}
-
-uint64_t getentropy64(void) {
-  uint64_t x;
-  unassert(!getentropy(&x, 8));
-  return x;
-}
-
 TEST(lemur64, test) {
   EXPECT_EQ(1819718037028923529, lemur64());
   EXPECT_EQ(-3120132252617434764, lemur64());
 }
-
-BENCH(lemur64, bench) {
-  BENCHMARK(10000, 8, X(lemur64()));
-  BENCHMARK(10000, 4, X(rand()));
-  BENCHMARK(10000, 8, X(_rand64()));
-  BENCHMARK(10000, 8, X(rdrand()));
-  BENCHMARK(10000, 8, X(rdseed()));
-  BENCHMARK(10000, 8, X(getrandom64()));
-  BENCHMARK(10000, 8, X(getentropy64()));
-}
diff --git a/test/libc/stdio/mt19937_test.c b/test/libc/stdio/mt19937_test.c
index 17de8f5ec..1da2588a9 100644
--- a/test/libc/stdio/mt19937_test.c
+++ b/test/libc/stdio/mt19937_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/test/libc/system/popen_test.c b/test/libc/stdio/popen_test.c
similarity index 86%
rename from test/libc/system/popen_test.c
rename to test/libc/stdio/popen_test.c
index b0099bdc7..648e885ac 100644
--- a/test/libc/system/popen_test.c
+++ b/test/libc/stdio/popen_test.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/calls/struct/itimerval.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
@@ -32,54 +31,16 @@
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/f.h"
-#include "libc/sysv/consts/itimer.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
-#include "libc/time.h"
-
-// test ability of user to override pthread mutex api
-int pthread_mutex_lock(pthread_mutex_t *mutex) {
-  abort();
-}
-int pthread_mutex_unlock(pthread_mutex_t *mutex) {
-  abort();
-}
-int pthread_mutex_trylock(pthread_mutex_t *mutex) {
-  abort();
-}
-int pthread_mutex_wipe_np(pthread_mutex_t *mutex) {
-  abort();
-}
+#ifdef __x86_64__
 
 FILE *f;
 char buf[32];
 
-void OnAlarm(int sig) {
-}
-
-void *LolThread(void *arg) {
-  return 0;
-}
-
 void SetUpOnce(void) {
   testlib_enable_tmp_setup_teardown();
-
-  // give deadlock detector more information
-  int64_t t = 0x5cd04d0e;
-  localtime(&t);
-  pthread_t th;
-  pthread_create(&th, 0, LolThread, 0);
-  pthread_join(th, 0);
-  char buf[32];
-  sprintf(buf, "%g", 3.14);
-  atexit((void *)LolThread);
-  FILE *f = fopen("/zip/.cosmo", "r");
-  fgetc(f);
-  fclose(f);
-  signal(SIGALRM, OnAlarm);
-  struct itimerval it = {{0, 1000}, {0, 1}};
-  setitimer(ITIMER_REAL, &it, 0);
 }
 
 void CheckForFdLeaks(void) {
@@ -159,6 +120,8 @@ void OnSig(int sig) {
 }
 
 TEST(popen, complicated) {
+  if (IsWindows())
+    return;  // windows treats sigusr1 as terminate
   char cmd[64];
   signal(SIGUSR1, OnSig);
   sprintf(cmd, "read a ; test \"x$a\" = xhello && kill -USR1 %d", getpid());
@@ -208,3 +171,5 @@ TEST(popen, torture) {
     ASSERT_EQ(0, pthread_join(t[i], 0));
   CheckForFdLeaks();
 }
+
+#endif /* __x86_64__ */
diff --git a/test/libc/stdio/snprintf_enomem_test.c b/test/libc/stdio/snprintf_enomem_test.c
deleted file mode 100644
index 57e305487..000000000
--- a/test/libc/stdio/snprintf_enomem_test.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Gabriel Ravier                                                │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/rlimit.h"
-#include "libc/errno.h"
-#include "libc/stdio/stdio.h"
-#include "libc/sysv/consts/rlim.h"
-#include "libc/sysv/consts/rlimit.h"
-#include "libc/testlib/testlib.h"
-
-static void limit_memory_to_1mb() {
-  struct rlimit limit = {};
-  ASSERT_GE(getrlimit(RLIMIT_AS, &limit), 0);
-
-  if (limit.rlim_max > 1000000 || limit.rlim_max == RLIM_INFINITY) {
-    limit.rlim_max = 1000000;
-    limit.rlim_cur = limit.rlim_max;
-    ASSERT_GE(setrlimit(RLIMIT_AS, &limit), 0);
-  }
-}
-
-static void check_double_format_enomem(const char *fmt) {
-  errno = 0;
-  int result = printf(fmt, 1.0);
-  ASSERT_LE(result, 0);
-  ASSERT_EQ(errno, ENOMEM);
-}
-
-static void check_long_double_format_enomem(const char *fmt) {
-  errno = 0;
-  int result = printf(fmt, 1.0L);
-  ASSERT_LE(result, 0);
-  ASSERT_EQ(errno, ENOMEM);
-}
-
-TEST(snprintf, enomemFloat) {
-  limit_memory_to_1mb();
-
-  check_double_format_enomem("%.1000000f");
-  check_double_format_enomem("%.1000000g");
-  check_double_format_enomem("%.1000000e");
-
-  check_long_double_format_enomem("%.1000000Lf");
-  check_long_double_format_enomem("%.1000000Lg");
-  check_long_double_format_enomem("%.1000000Le");
-}
diff --git a/test/libc/stdio/snprintf_test.c b/test/libc/stdio/snprintf_test.c
index 0fb38489e..63a702bf2 100644
--- a/test/libc/stdio/snprintf_test.c
+++ b/test/libc/stdio/snprintf_test.c
@@ -16,268 +16,14 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/runtime/fenv.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/testlib/testlib.h"
 
-static void check_single_double(const char *fmt, const char *expected_str,
-                                double value) {
-  char buf[30] = {0};
-  int i = snprintf(buf, sizeof(buf), fmt, value);
-
-  ASSERT_GE(sizeof(buf), strlen(expected_str));
-  ASSERT_EQ(strlen(expected_str), i);
-  ASSERT_STREQ(expected_str, buf);
-  while (i < sizeof(buf))
-    ASSERT_EQ('\0', buf[i++]);
-}
-
-static void check_single_long_double(const char *fmt, const char *expected_str,
-                                     long double value) {
-  char buf[30] = {0};
-  int i = snprintf(buf, sizeof(buf), fmt, value);
-
-  ASSERT_GE(sizeof(buf), strlen(expected_str));
-  ASSERT_EQ(strlen(expected_str), i);
-  ASSERT_STREQ(expected_str, buf);
-  while (i < sizeof(buf))
-    ASSERT_EQ('\0', buf[i++]);
-}
-
-void check_single_long_double_arr_allowed(
-    const char *fmt, const char *allowed_strs[], long double value) {
-  char buf[30] = {0};
-  int res = snprintf(buf, sizeof(buf), fmt, value);
-
-  for (size_t i = 0; allowed_strs[i] != NULL; ++i)
-    if (strlen(allowed_strs[i]) == res && strcmp(allowed_strs[i], buf) == 0)
-      return;
-
-  printf("Failed to find matching str for %`'s, allowed strs:\n", buf);
-  for (size_t i = 0; allowed_strs[i] != NULL; ++i)
-    printf("- %`'s\n", allowed_strs[i]);
-  fflush(stdout);
-  ASSERT_EQ(false, true);
-}
-
-static void check_single_int(const char *fmt, const char *expected_str,
-                             int value) {
-  char buf[30] = {0};
-  int i = snprintf(buf, sizeof(buf), fmt, value);
-
-  ASSERT_GE(sizeof(buf), strlen(expected_str));
-  ASSERT_EQ(strlen(expected_str), i);
-  ASSERT_STREQ(expected_str, buf);
-  while (i < sizeof(buf))
-    ASSERT_EQ('\0', buf[i++]);
-}
-
-static void check_single_wint_t(const char *fmt, const char *expected_str,
-                                wint_t value) {
-  char buf[30] = {0};
-  int i = snprintf(buf, sizeof(buf), fmt, value);
-
-  ASSERT_GE(sizeof(buf), strlen(expected_str));
-  ASSERT_EQ(strlen(expected_str), i);
-  ASSERT_STREQ(expected_str, buf);
-  while (i < sizeof(buf))
-    ASSERT_EQ('\0', buf[i++]);
-}
-
 TEST(snprintf, testVeryLargePrecision) {
   char buf[512] = {};
   int i = snprintf(buf, sizeof(buf), "%.9999u", 10);
 
-  ASSERT_EQ(9999, i);
-  ASSERT_EQ(511, strlen(buf));
-}
-
-TEST(snprintf, testPlusFlagOnChar) {
-  check_single_int("%+c", "=", '=');
-}
-
-TEST(snprintf, testInf) {
-  check_single_double("%f", "inf", 1.0 / 0.0);
-  check_single_long_double("%Lf", "inf", 1.0L / 0.0L);
-  check_single_double("%e", "inf", 1.0 / 0.0);
-  check_single_long_double("%Le", "inf", 1.0L / 0.0L);
-  check_single_double("%g", "inf", 1.0 / 0.0);
-  check_single_long_double("%Lg", "inf", 1.0L / 0.0L);
-}
-
-TEST(snprintf, testUppercaseCConversionSpecifier) {
-  check_single_wint_t("%C", "a", L'a');
-  check_single_wint_t("%C", "☺", L'☺');
-}
-
-// Make sure we don't va_arg the wrong argument size on wide character
-// conversion specifiers
-TEST(snprintf,
-     testWideCConversionSpecifierWithLotsOfArgumentsBeforeAndOneAfter) {
-  char buf[20] = {};
-  int i = snprintf(buf, sizeof(buf), "%d%d%d%d%d%d%d%d%lc%d", 0, 0, 0, 0, 0, 0,
-                   0, 0, L'x', 1);
-
-  ASSERT_EQ(10, i);
-  ASSERT_STREQ("00000000x1", buf);
-
-  memset(buf, 0, sizeof(buf));
-  i = snprintf(buf, sizeof(buf), "%d%d%d%d%d%d%d%d%C%d", 0, 0, 0, 0, 0, 0, 0, 0,
-               L'x', 1);
-  ASSERT_EQ(10, i);
-  ASSERT_STREQ("00000000x1", buf);
-}
-
-static void check_n_buffer_contents(char buf[350]) {
-  for (int i = 0; i < 284; ++i)
-    ASSERT_EQ(' ', buf[i]);
-  ASSERT_STREQ("428463", &buf[284]);
-  for (int i = 290; i < 350; ++i)
-    ASSERT_EQ('\0', buf[i]);
-}
-
-TEST(snprintf, testNConversionSpecifier) {
-  char buf[350] = {};
-
-  int n_res_int = -1;
-  int i = snprintf(buf, sizeof(buf), "%286d%d%n%d", 42, 84, &n_res_int, 63);
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_int);
-
-  memset(&buf, '\0', sizeof(buf));
-  long n_res_long = -1;
-  i = snprintf(buf, sizeof(buf), "%286ld%ld%ln%ld", 42L, 84L, &n_res_long, 63L);
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_long);
-
-  memset(&buf, '\0', sizeof(buf));
-  long long n_res_long_long = -1;
-  i = snprintf(buf, sizeof(buf), "%286lld%lld%lln%lld", 42LL, 84LL,
-               &n_res_long_long, 63LL);
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_long_long);
-
-  ASSERT_EQ(sizeof(short), 2);
-  ASSERT_EQ(sizeof(int), 4);
-  memset(&buf, '\0', sizeof(buf));
-  short n_res_short = -1;
-  i = snprintf(buf, sizeof(buf), "%286hd%hd%hn%hd", (42 | 0xFFFF0000),
-               (84 | 0xFFFF0000), &n_res_short, (63 | 0xFFFF0000));
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_short);
-
-  ASSERT_EQ(sizeof(unsigned char), 1);
-  memset(&buf, '\0', sizeof(buf));
-  signed char n_res_char = -1;
-  i = snprintf(buf, sizeof(buf), "%286hhd%hhd%hhn%hhd", (42 | 0xFFFFFF00),
-               (84 | 0xFFFFFF00), &n_res_char, (63 | 0xFFFFFF00));
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ((signed char)288, n_res_char);
-
-  memset(&buf, '\0', sizeof(buf));
-  ssize_t n_res_size_t = -1;
-  i = snprintf(buf, sizeof(buf), "%286zd%zd%zn%zd", (ssize_t)42, (ssize_t)84,
-               &n_res_size_t, (ssize_t)63);
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_size_t);
-
-  memset(&buf, '\0', sizeof(buf));
-  intmax_t n_res_intmax_t = -1;
-  i = snprintf(buf, sizeof(buf), "%286jd%jd%jn%jd", (intmax_t)42, (intmax_t)84,
-               &n_res_intmax_t, (intmax_t)63);
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_intmax_t);
-
-  memset(&buf, '\0', sizeof(buf));
-  int128_t n_res_int128_t = -1;
-  i = snprintf(buf, sizeof(buf), "%286jjd%jjd%jjn%jjd", (int128_t)42,
-               (int128_t)84, &n_res_int128_t, (int128_t)63);
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_int128_t);
-
-  memset(&buf, '\0', sizeof(buf));
-  ptrdiff_t n_res_ptrdiff_t = -1;
-  i = snprintf(buf, sizeof(buf), "%286td%td%tn%td", (ptrdiff_t)42,
-               (ptrdiff_t)84, &n_res_ptrdiff_t, (ptrdiff_t)63);
-  ASSERT_EQ(290, i);
-  check_n_buffer_contents(buf);
-  ASSERT_EQ(288, n_res_ptrdiff_t);
-}
-
-TEST(snprintf, testLongDoubleEConversionSpecifier) {
-  check_single_long_double("%Le", "1.234568e+06", 1234567.8L);
-}
-
-TEST(snprintf, testLongDoubleRounding) {
-  int previous_rounding = fegetround();
-  ASSERT_EQ(0, fesetround(FE_DOWNWARD));
-
-  check_single_long_double("%.3Lf", "4.437", 4.4375L);
-  check_single_long_double("%.3Lf", "-4.438", -4.4375L);
-
-  ASSERT_EQ(0, fesetround(FE_TOWARDZERO));
-
-  check_single_long_double("%.3Lf", "-4.437", -4.4375L);
-
-  ASSERT_EQ(0, fesetround(previous_rounding));
-}
-
-void check_a_conversion_specifier_double_prec_1(const char *expected_str,
-                                                double value) {
-  check_single_double("%.1a", expected_str, value);
-}
-
-TEST(snprintf, testAConversionSpecifierRounding) {
-  int previous_rounding = fegetround();
-
-  ASSERT_EQ(0, fesetround(FE_DOWNWARD));
-  check_a_conversion_specifier_double_prec_1("0x1.fp+4", 0x1.fffffp+4);
-
-  ASSERT_EQ(0, fesetround(FE_UPWARD));
-  check_a_conversion_specifier_double_prec_1("0x2.0p+4", 0x1.f8p+4);
-
-  ASSERT_EQ(0, fesetround(previous_rounding));
-}
-
-// This test specifically checks that we round to even, accordingly to IEEE
-// rules
-TEST(snprintf, testAConversionSpecifier) {
-  check_a_conversion_specifier_double_prec_1("0x1.8p+4", 0x1.7800000000001p+4);
-  check_a_conversion_specifier_double_prec_1("0x1.8p+4", 0x1.78p+4);
-  check_a_conversion_specifier_double_prec_1("0x1.8p+4", 0x1.88p+4);
-  check_a_conversion_specifier_double_prec_1("0x1.6p+4", 0x1.58p+4);
-  check_a_conversion_specifier_double_prec_1("0x1.6p+4", 0x1.68p+4);
-  check_a_conversion_specifier_double_prec_1("0x1.ap+4", 0x1.98p+4);
-  check_a_conversion_specifier_double_prec_1("0x1.ap+4", 0x1.a8p+4);
-
-  check_single_double("%#a", "0x0.p+0", 0x0.0p0);
-  check_single_double("%#A", "0X0.P+0", 0x0.0p0);
-  check_single_long_double("%#La", "0x0.p+0", 0x0.0p0L);
-  check_single_long_double("%#LA", "0X0.P+0", 0x0.0p0L);
-
-  check_single_double("%.2a", "0x1.00p-1026", 0xf.fffp-1030);
-
-  check_single_double("%.1a", "0x2.0p+0", 1.999);
-  const char *acceptable_results1[] = {"0x1.0p+1", "0x2.0p+0", NULL};
-  check_single_long_double_arr_allowed(
-      "%.1La", acceptable_results1, 1.999L);
-}
-
-TEST(snprintf, testApostropheFlag) {
-  check_single_int("%'d", "10000000", 10000000);
-}
-
-TEST(snprintf, testUppercaseBConversionSpecifier) {
-  check_single_int("%B", "0", 0);
-  check_single_int("%B", "10", 2);
-  check_single_int("%#B", "0B10011", 19);
+  ASSERT_EQ(i, 9999);
+  ASSERT_EQ(strlen(buf), 511);
 }
diff --git a/test/libc/stdio/sscanf_test.c b/test/libc/stdio/sscanf_test.c
index c134fa265..40e35f1c4 100644
--- a/test/libc/stdio/sscanf_test.c
+++ b/test/libc/stdio/sscanf_test.c
@@ -338,17 +338,17 @@ TEST(sscanf, flexdecimal_hex) {
 TEST(sscanf, floating_point_simple) {
   float x = 666.666f, y = x, z = y;
   EXPECT_EQ(3, sscanf("0.3715 .3715 3715", "%f %f %f", &x, &y, &z));
-  EXPECT_FLOAT_EXACTLY_EQ(0.3715f, x);
-  EXPECT_FLOAT_EXACTLY_EQ(0.3715f, y);
-  EXPECT_FLOAT_EXACTLY_EQ(3715.0f, z);
+  EXPECT_EQ(0.3715f, x);
+  EXPECT_EQ(0.3715f, y);
+  EXPECT_EQ(3715.0f, z);
 }
 
 TEST(sscanf, floating_point_simple_double_precision) {
   double x = 666.666, y = x, z = y;
   EXPECT_EQ(3, sscanf("0.3715 .3715 3715", "%lf %lf %lf", &x, &y, &z));
-  EXPECT_DOUBLE_EXACTLY_EQ(0.3715, x);
-  EXPECT_DOUBLE_EXACTLY_EQ(0.3715, y);
-  EXPECT_DOUBLE_EXACTLY_EQ(3715.0, z);
+  EXPECT_EQ(0.3715, x);
+  EXPECT_EQ(0.3715, y);
+  EXPECT_EQ(3715.0, z);
 }
 
 TEST(sscanf, floating_point_nan) {
@@ -426,12 +426,12 @@ TEST(sscanf, floating_point_documentation_examples) {
       2, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk",
                 "%f %f %f %f %f", &f, &g, &h, &i, &j));
 
-  EXPECT_FLOAT_EXACTLY_EQ(111.11f, a);
-  EXPECT_FLOAT_EXACTLY_EQ(-2.22f, b);
+  EXPECT_EQ(111.11f, a);
+  EXPECT_EQ(-2.22f, b);
   EXPECT_TRUE(isnan(c));
   EXPECT_TRUE(isnan(d));
   EXPECT_TRUE(isinf(e));
-  EXPECT_FLOAT_EXACTLY_EQ(0X1.BC70A3D70A3D7P+6f, f);
+  EXPECT_EQ(0X1.BC70A3D70A3D7P+6f, f);
   EXPECT_TRUE(isinf(g));
 }
 
@@ -445,12 +445,12 @@ TEST(sscanf, floating_point_documentation_examples_double_precision) {
       2, sscanf("0X1.BC70A3D70A3D7P+6 1.18973e+4932zzz -0.0000000123junk junk",
                 "%lf %lf %lf %lf %lf", &f, &g, &h, &i, &j));
 
-  EXPECT_DOUBLE_EXACTLY_EQ(111.11, a);
-  EXPECT_DOUBLE_EXACTLY_EQ(-2.22, b);
+  EXPECT_EQ(111.11, a);
+  EXPECT_EQ(-2.22, b);
   EXPECT_TRUE(isnan(c));
   EXPECT_TRUE(isnan(d));
   EXPECT_TRUE(isinf(e));
-  EXPECT_DOUBLE_EXACTLY_EQ(0X1.BC70A3D70A3D7P+6, f);
+  EXPECT_EQ(0X1.BC70A3D70A3D7P+6, f);
   EXPECT_TRUE(isinf(g));
 }
 
@@ -506,9 +506,3 @@ TEST(scanf, n) {
   ASSERT_EQ(1848, port);
   ASSERT_EQ(12, len);
 }
-
-TEST(sscanf, floating_point_hexadecimal) {
-  double a = 0;
-  ASSERT_EQ(1, sscanf("0x1.5014c3472bc2c0000000p-123", "%lf", &a));
-  ASSERT_DOUBLE_EXACTLY_EQ(0x1.5014c3472bc2c0000000p-123, a);
-}
diff --git a/test/libc/stdio/zipdir_test.c b/test/libc/stdio/zipdir_test.c
index 141bb9ad3..83dda5aff 100644
--- a/test/libc/stdio/zipdir_test.c
+++ b/test/libc/stdio/zipdir_test.c
@@ -20,7 +20,7 @@
 #include "libc/calls/struct/dirent.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/test/libc/str/BUILD.mk b/test/libc/str/BUILD.mk
index 7468b3f05..cae03bb64 100644
--- a/test/libc/str/BUILD.mk
+++ b/test/libc/str/BUILD.mk
@@ -36,6 +36,7 @@ TEST_LIBC_STR_DIRECTDEPS =					\
 	LIBC_FMT						\
 	LIBC_INTRIN						\
 	LIBC_LOG						\
+	LIBC_TINYMATH						\
 	LIBC_MEM						\
 	LIBC_NEXGEN32E						\
 	LIBC_RUNTIME						\
@@ -44,16 +45,14 @@ TEST_LIBC_STR_DIRECTDEPS =					\
 	LIBC_SYSV						\
 	LIBC_SYSV_CALLS						\
 	LIBC_TESTLIB						\
-	LIBC_TINYMATH						\
 	LIBC_X							\
 	THIRD_PARTY_COMPILER_RT					\
-	THIRD_PARTY_LIBCXX					\
 	THIRD_PARTY_MBEDTLS					\
-	THIRD_PARTY_MUSL					\
 	THIRD_PARTY_REGEX					\
-	THIRD_PARTY_SMALLZ4					\
-	THIRD_PARTY_VQSORT					\
 	THIRD_PARTY_ZLIB					\
+	THIRD_PARTY_LIBCXX					\
+	THIRD_PARTY_SMALLZ4					\
+	THIRD_PARTY_VQSORT
 
 TEST_LIBC_STR_DEPS :=						\
 	$(call uniq,$(foreach x,$(TEST_LIBC_STR_DIRECTDEPS),$($(x))))
diff --git a/test/libc/str/blake2_test.c b/test/libc/str/blake2_test.c
index 4fe6d1427..65f2f34e0 100644
--- a/test/libc/str/blake2_test.c
+++ b/test/libc/str/blake2_test.c
@@ -18,13 +18,12 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/blake2.h"
 #include "libc/assert.h"
-#include "libc/calls/struct/timespec.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
-#include "libc/testlib/benchmark.h"
+#include "libc/str/tab.internal.h"
+#include "libc/testlib/ezbench.h"
 #include "libc/testlib/hyperion.h"
 #include "libc/testlib/testlib.h"
 
@@ -91,18 +90,17 @@ TEST(BLAKE2B256Test, vectors) {
   free(line);
 }
 
-BENCH(blake2, benchmark) {
+BENCH(blake2, bench) {
   char fun[256];
   rngset(fun, 256, _rand64, -1);
-  BENCHMARK(100, 0, __expropriate(EZBLAKE2B256(0, 0)));
-  BENCHMARK(100, 1, __expropriate(EZBLAKE2B256("h", 1)));
-  BENCHMARK(100, 8, __expropriate(EZBLAKE2B256("helloooo", 8)));
-  BENCHMARK(100, 31, __expropriate(EZBLAKE2B256(fun, 31)));
-  BENCHMARK(100, 32, __expropriate(EZBLAKE2B256(fun, 32)));
-  BENCHMARK(100, 63, __expropriate(EZBLAKE2B256(fun, 63)));
-  BENCHMARK(100, 64, __expropriate(EZBLAKE2B256(fun, 64)));
-  BENCHMARK(100, 128, __expropriate(EZBLAKE2B256(fun, 128)));
-  BENCHMARK(100, 256, __expropriate(EZBLAKE2B256(fun, 256)));
-  BENCHMARK(100, kHyperionSize,
-            __expropriate(EZBLAKE2B256(kHyperion, kHyperionSize)));
+  EZBENCH_N("blake2b256", 0, EZBLAKE2B256(0, 0));
+  EZBENCH_N("blake2b256", 8, EZBLAKE2B256("helloooo", 8));
+  EZBENCH_N("blake2b256", 31, EZBLAKE2B256(fun, 31));
+  EZBENCH_N("blake2b256", 32, EZBLAKE2B256(fun, 32));
+  EZBENCH_N("blake2b256", 63, EZBLAKE2B256(fun, 63));
+  EZBENCH_N("blake2b256", 64, EZBLAKE2B256(fun, 64));
+  EZBENCH_N("blake2b256", 128, EZBLAKE2B256(fun, 128));
+  EZBENCH_N("blake2b256", 256, EZBLAKE2B256(fun, 256));
+  EZBENCH_N("blake2b256", kHyperionSize,
+            EZBLAKE2B256(kHyperion, kHyperionSize));
 }
diff --git a/test/libc/str/hexpcpy_test.c b/test/libc/str/hexpcpy_test.c
index 532342d64..0a1c1c93d 100644
--- a/test/libc/str/hexpcpy_test.c
+++ b/test/libc/str/hexpcpy_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
 #include "libc/testlib/testlib.h"
diff --git a/test/libc/str/highwayhash64_test.c b/test/libc/str/highwayhash64_test.c
index 2ac093389..6ba2f443a 100644
--- a/test/libc/str/highwayhash64_test.c
+++ b/test/libc/str/highwayhash64_test.c
@@ -16,14 +16,13 @@
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/highwayhash64.h"
-#include "libc/calls/struct/timespec.h"
 #include "libc/inttypes.h"
 #include "libc/nexgen32e/crc32.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/testlib/benchmark.h"
+#include "libc/testlib/ezbench.h"
 #include "libc/testlib/hyperion.h"
 #include "libc/testlib/testlib.h"
 #include "third_party/zlib/zlib.h"
@@ -101,31 +100,33 @@ TEST(highwayhash64, test) {
 BENCH(highwayhash64, newbench) {
   char fun[256];
   rngset(fun, 256, _rand64, -1);
-  BENCHMARK(10, 0, HighwayHash64(0, 0, kTestKey1));
-  BENCHMARK(10, 8, HighwayHash64("helloooo", 8, kTestKey1));
-  BENCHMARK(10, 31, HighwayHash64(fun, 31, kTestKey1));
-  BENCHMARK(10, 32, HighwayHash64(fun, 32, kTestKey1));
-  BENCHMARK(10, 63, HighwayHash64(fun, 63, kTestKey1));
-  BENCHMARK(10, 64, HighwayHash64(fun, 64, kTestKey1));
-  BENCHMARK(10, 128, HighwayHash64(fun, 128, kTestKey1));
-  BENCHMARK(10, 256, HighwayHash64(fun, 256, kTestKey1));
-  BENCHMARK(10, kHyperionSize,
+  EZBENCH_N("highwayhash64", 0, HighwayHash64(0, 0, kTestKey1));
+  EZBENCH_N("highwayhash64", 8, HighwayHash64("helloooo", 8, kTestKey1));
+  EZBENCH_N("highwayhash64", 31, HighwayHash64(fun, 31, kTestKey1));
+  EZBENCH_N("highwayhash64", 32, HighwayHash64(fun, 32, kTestKey1));
+  EZBENCH_N("highwayhash64", 63, HighwayHash64(fun, 63, kTestKey1));
+  EZBENCH_N("highwayhash64", 64, HighwayHash64(fun, 64, kTestKey1));
+  EZBENCH_N("highwayhash64", 128, HighwayHash64(fun, 128, kTestKey1));
+  EZBENCH_N("highwayhash64", 256, HighwayHash64(fun, 256, kTestKey1));
+  EZBENCH_N("highwayhash64", kHyperionSize,
             HighwayHash64(kHyperion, kHyperionSize, kTestKey1));
 }
 
 BENCH(highwayhash64, bench) {
-  BENCHMARK(10, 5,
-            __expropriate(KnuthMultiplicativeHash32(__veil("r", "hello"), 5)));
-  BENCHMARK(10, 5, __expropriate(crc32c(0, "hello", 5)));
-  BENCHMARK(10, 5, __expropriate(crc32_z(0, __veil("r", "hello"), 5)));
-  BENCHMARK(10, 5, HighwayHash64((void *)"hello", 5, kTestKey1));
-  BENCHMARK(10, kHyperionSize,
-            __expropriate(crc32_z(0, kHyperion, kHyperionSize)));
-  BENCHMARK(10, kHyperionSize,
-            __expropriate(crc32c(0, kHyperion, kHyperionSize)));
-  BENCHMARK(10, kHyperionSize,
-            HighwayHash64((void *)kHyperion, kHyperionSize, kTestKey1));
-  BENCHMARK(10, kHyperionSize,
-            __expropriate(KnuthMultiplicativeHash32(__veil("r", kHyperion),
-                                                    kHyperionSize)));
+  EZBENCH2("knuth small", donothing,
+           __expropriate(KnuthMultiplicativeHash32(__veil("r", "hello"), 5)));
+  EZBENCH2("crc32c small", donothing, __expropriate(crc32c(0, "hello", 5)));
+  EZBENCH2("crc32 small", donothing,
+           __expropriate(crc32_z(0, __veil("r", "hello"), 5)));
+  EZBENCH2("highwayhash64 small", donothing,
+           HighwayHash64((void *)"hello", 5, kTestKey1));
+  EZBENCH2("crc32 big", donothing,
+           __expropriate(crc32_z(0, kHyperion, kHyperionSize)));
+  EZBENCH2("crc32c big", donothing,
+           __expropriate(crc32c(0, kHyperion, kHyperionSize)));
+  EZBENCH2("highwayhash64 big", donothing,
+           HighwayHash64((void *)kHyperion, kHyperionSize, kTestKey1));
+  EZBENCH2("knuth big", donothing,
+           __expropriate(KnuthMultiplicativeHash32(__veil("r", kHyperion),
+                                                   kHyperionSize)));
 }
diff --git a/test/libc/str/memccpy_test.c b/test/libc/str/memccpy_test.c
index aebe301bb..5b54c189f 100644
--- a/test/libc/str/memccpy_test.c
+++ b/test/libc/str/memccpy_test.c
@@ -16,18 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
-#include "libc/intrin/safemacros.h"
 #include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/testlib/benchmark.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 
@@ -58,40 +50,6 @@ TEST(memccpy, testZeroLength_doesNothing) {
   EXPECT_EQ(NULL, memccpy(buf, "hi", '\0', 0));
 }
 
-TEST(memccpy, fuzz) {
-  int pagesz = sysconf(_SC_PAGESIZE);
-  char *map1 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map1 != MAP_FAILED);
-  npassert(!mprotect(map1 + pagesz, pagesz, PROT_NONE));
-  char *map2 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map2 != MAP_FAILED);
-  npassert(!mprotect(map2 + pagesz, pagesz, PROT_NONE));
-  char *map3 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map3 != MAP_FAILED);
-  npassert(!mprotect(map3 + pagesz, pagesz, PROT_NONE));
-  for (int dsize = 1; dsize < 128; ++dsize) {
-    char *volatile dst1 = map1 + pagesz - dsize;
-    char *volatile dst2 = map1 + pagesz - dsize;
-    for (int i = 0; i < dsize; ++i)
-      dst1[i] = dst2[i] = rand();
-    for (int ssize = 1; ssize < dsize * 2; ++ssize) {
-      char *volatile src = map3 + pagesz - (ssize + 1);
-      for (int i = 0; i < ssize; ++i)
-        src[i] = max(rand() & 255, 1);
-      src[ssize] = 0;
-      ASSERT_EQ(memccpy_pure(dst1, src, 0, dsize),
-                memccpy(dst2, src, 0, dsize));
-      ASSERT_EQ(0, memcmp(dst1, dst2, dsize));
-    }
-  }
-  npassert(!munmap(map3, pagesz * 2));
-  npassert(!munmap(map2, pagesz * 2));
-  npassert(!munmap(map1, pagesz * 2));
-}
-
 TEST(memccpy, memcpy) {
   unsigned n, n1, n2;
   char *b1, *b2, *b3, *e1, *e2;
@@ -120,26 +78,3 @@ TEST(memccpy, memcpy) {
     free(b1);
   }
 }
-
-#define N 4096
-
-BENCH(memccpy, bench) {
-  char dst[N];
-  char src[N + 1];
-
-  printf("\n");
-  for (int n = 1; n <= N; n *= 2) {
-    for (int i = 0; i < n; ++i)
-      src[i] = max(rand() & 255, 1);
-    src[n] = 0;
-    BENCHMARK(100, n, X(memccpy(dst, src, 0, V(N))));
-  }
-
-  printf("\n");
-  for (int n = 1; n <= N; n *= 2) {
-    for (int i = 0; i < n; ++i)
-      src[i] = max(rand() & 255, 1);
-    src[n] = 0;
-    BENCHMARK(100, n, X(memccpy_pure(dst, src, 0, V(N))));
-  }
-}
diff --git a/test/libc/str/memmem_test.c b/test/libc/str/memmem_test.c
index 881700537..413397be8 100644
--- a/test/libc/str/memmem_test.c
+++ b/test/libc/str/memmem_test.c
@@ -17,17 +17,10 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/mem/mem.h"
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
 #include "libc/intrin/likely.h"
-#include "libc/intrin/safemacros.h"
 #include "libc/mem/alg.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/hyperion.h"
 #include "libc/testlib/testlib.h"
@@ -179,26 +172,6 @@ TEST(memmem, fuzz) {
   }
 }
 
-TEST(memmem, safety) {
-  int pagesz = sysconf(_SC_PAGESIZE);
-  char *map = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map != MAP_FAILED);
-  npassert(!mprotect(map + pagesz, pagesz, PROT_NONE));
-  for (int haylen = 1; haylen < 128; ++haylen) {
-    char *hay = map + pagesz - (haylen + 1);
-    for (int i = 0; i < haylen; ++i)
-      hay[i] = max(rand() & 255, 1);
-    hay[haylen] = 0;
-    for (int neelen = 1; neelen < haylen; ++neelen) {
-      char *nee = hay + (haylen + 1) - (neelen + 1);
-      ASSERT_EQ(memmem_naive(hay, haylen, nee, neelen),
-                memmem(hay, haylen, nee, neelen));
-    }
-  }
-  munmap(map, pagesz * 2);
-}
-
 /*
  *     memmem naive        l:    43,783c    14,142ns   m:    31,285c    10,105ns
  *     memmem              l:     2,597c       839ns   m:     2,612c       844ns
@@ -228,12 +201,7 @@ BENCH(memmem, bench) {
   EZBENCH2("memmem", donothing,
            __expropriate(memmem(kHyperion, kHyperionSize, "THE END", 7)));
   EZBENCH2("memmem", donothing,
-           __expropriate(
-               memmem("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
-                      "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
-                      "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab",
-                      152,
-                      "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
-                      "aaaaaaaaaaaaaaaaaaaaaaaab",
-                      81)));
+           __expropriate(memmem(
+               "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab",
+               62, "aaaaaab", 7)));
 }
diff --git a/test/libc/str/regex_test.c b/test/libc/str/regex_test.c
index 5c23594d3..f51b7f557 100644
--- a/test/libc/str/regex_test.c
+++ b/test/libc/str/regex_test.c
@@ -19,15 +19,10 @@
 #include "third_party/regex/regex.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
-#include "libc/str/locale.h"
 #include "libc/str/str.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 
-void SetUpOnce(void) {
-  setlocale(LC_ALL, "C.UTF-8");
-}
-
 TEST(regex, test) {
   regex_t rx;
   EXPECT_EQ(REG_OK, regcomp(&rx, "^[A-Za-z\x7f-\uffff]{2}$", REG_EXTENDED));
diff --git a/test/libc/stdio/strfry_test.c b/test/libc/str/setlocale_test.c
similarity index 80%
rename from test/libc/stdio/strfry_test.c
rename to test/libc/str/setlocale_test.c
index a87ee54ba..cc2669066 100644
--- a/test/libc/stdio/strfry_test.c
+++ b/test/libc/str/setlocale_test.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2022 Gavin Arthur Hayes                                            │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,23 +16,15 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdio/rand.h"
+#include "libc/str/locale.h"
 #include "libc/testlib/testlib.h"
 
-TEST(strfry, empty) {
-  char s[1] = "";
-  EXPECT_EQ(s, strfry(s));
-  EXPECT_STREQ("", s);
-}
-
-TEST(strfry, one) {
-  char s[2] = "a";
-  EXPECT_EQ(s, strfry(s));
-  EXPECT_STREQ("a", s);
-}
-
-TEST(strfry, test) {
-  char s[5] = "abcd";
-  EXPECT_EQ(s, strfry(s));
-  EXPECT_STREQ("cbda", s);
+TEST(setlocale, test) {
+  EXPECT_STREQ("C", setlocale(LC_ALL, NULL));
+  EXPECT_STREQ("C", setlocale(LC_ALL, "C"));
+  EXPECT_STREQ("C", setlocale(LC_ALL, NULL));
+  EXPECT_STREQ("POSIX", setlocale(LC_ALL, "POSIX"));
+  EXPECT_STREQ("C", setlocale(LC_ALL, ""));
+  EXPECT_EQ(0, setlocale(LC_ALL, "ja_JP.PCK"));
+  EXPECT_STREQ("C", setlocale(LC_ALL, NULL));
 }
diff --git a/test/libc/str/strcasestr_test.c b/test/libc/str/strcasestr_test.c
index cf012f866..a4b29daff 100644
--- a/test/libc/str/strcasestr_test.c
+++ b/test/libc/str/strcasestr_test.c
@@ -17,20 +17,12 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
 #include "libc/dce.h"
-#include "libc/intrin/safemacros.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/x86feature.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/stdio/rand.h"
-#include "libc/str/tab.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
+#include "libc/str/tab.internal.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/hyperion.h"
 #include "libc/testlib/testlib.h"
@@ -57,30 +49,6 @@ char *strcasestr_naive(const char *haystack, const char *needle) {
   return 0;
 }
 
-TEST(strcasestr, tester) {
-  const char *haystack = "Windows";
-  ASSERT_STREQ(haystack, strcasestr(haystack, "win"));
-}
-
-TEST(strcasestr, safety) {
-  int pagesz = sysconf(_SC_PAGESIZE);
-  char *map = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map != MAP_FAILED);
-  npassert(!mprotect(map + pagesz, pagesz, PROT_NONE));
-  for (int haylen = 1; haylen < 128; ++haylen) {
-    char *hay = map + pagesz - (haylen + 1);
-    for (int i = 0; i < haylen; ++i)
-      hay[i] = max(rand() & 255, 1);
-    hay[haylen] = 0;
-    for (int neelen = 1; neelen < haylen; ++neelen) {
-      char *nee = hay + (haylen + 1) - (neelen + 1);
-      ASSERT_EQ(strcasestr_naive(hay, nee), strcasestr(hay, nee));
-    }
-  }
-  munmap(map, pagesz * 2);
-}
-
 TEST(strcasestr, test_emptyString_isFoundAtBeginning) {
   MAKESTRING(haystack, "abc123def");
   ASSERT_STREQ(&haystack[0], strcasestr(haystack, gc(strdup(""))));
diff --git a/test/libc/str/strlcpy_test.c b/test/libc/str/strlcpy_test.c
index 2d21841f7..e65600119 100644
--- a/test/libc/str/strlcpy_test.c
+++ b/test/libc/str/strlcpy_test.c
@@ -16,71 +16,12 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/safemacros.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/stdio/rand.h"
 #include "libc/str/str.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/testlib/benchmark.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 
-size_t todd(char *dst, const char *src, size_t dsize) {
-  const char *osrc = src;
-  size_t nleft = dsize;
-  if (nleft != 0)
-    while (--nleft != 0)
-      if ((*dst++ = *src++) == '\0')
-        break;
-  if (nleft == 0) {
-    if (dsize != 0)
-      *dst = '\0';
-    while (*src++)
-      ;
-  }
-  return src - osrc - 1;
-}
-
-TEST(strlcpy, fuzz) {
-  int pagesz = sysconf(_SC_PAGESIZE);
-  char *map1 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map1 != MAP_FAILED);
-  npassert(!mprotect(map1 + pagesz, pagesz, PROT_NONE));
-  char *map2 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map2 != MAP_FAILED);
-  npassert(!mprotect(map2 + pagesz, pagesz, PROT_NONE));
-  char *map3 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map3 != MAP_FAILED);
-  npassert(!mprotect(map3 + pagesz, pagesz, PROT_NONE));
-  for (int dsize = 1; dsize < 128; ++dsize) {
-    char *volatile dst1 = map1 + pagesz - dsize;
-    char *volatile dst2 = map1 + pagesz - dsize;
-    for (int i = 0; i < dsize; ++i)
-      dst1[i] = dst2[i] = max(rand() & 255, 1);
-    for (int ssize = 1; ssize < dsize * 2; ++ssize) {
-      char *volatile src = map3 + pagesz - (ssize + 1);
-      for (int i = 0; i < ssize; ++i)
-        src[i] = max(rand() & 255, 1);
-      src[ssize] = 0;
-      ASSERT_EQ(todd(dst1, src, dsize), strlcpy(dst2, src, dsize));
-      ASSERT_EQ(0, memcmp(dst1, dst2, dsize));
-    }
-  }
-  npassert(!munmap(map3, pagesz * 2));
-  npassert(!munmap(map2, pagesz * 2));
-  npassert(!munmap(map1, pagesz * 2));
-}
-
 TEST(strlcpy, testEmptyBuffer_doesNothing) {
   EXPECT_EQ(5, strlcpy(NULL, "hello", 0));
 }
@@ -97,25 +38,12 @@ TEST(strlcpy, testShortBuffer_copies) {
   EXPECT_STREQ("h", buf);
 }
 
-#define N 4096
-
 BENCH(strlcpy, bench) {
-  char dst[N];
-  char src[N + 1];
-
-  printf("\n");
-  for (int n = 1; n <= N; n *= 2) {
-    for (int i = 0; i < n; ++i)
-      src[i] = max(rand() & 255, 1);
-    src[n] = 0;
-    BENCHMARK(100, n, X(strlcpy(dst, src, V(N))));
-  }
-
-  printf("\n");
-  for (int n = 1; n <= N; n *= 2) {
-    for (int i = 0; i < n; ++i)
-      src[i] = max(rand() & 255, 1);
-    src[n] = 0;
-    BENCHMARK(100, n, X(todd(dst, src, V(N))));
-  }
+  char buf[256];
+  EZBENCH2(
+      "strlcpy", donothing,
+      __expropriate(strlcpy(__veil("r", buf), "hello there", sizeof(buf))));
+  EZBENCH2(
+      "strncpy", donothing,
+      __expropriate(strncpy(__veil("r", buf), "hello there", sizeof(buf))));
 }
diff --git a/test/libc/str/strstr_test.c b/test/libc/str/strstr_test.c
index 086dd6e15..929185e6f 100644
--- a/test/libc/str/strstr_test.c
+++ b/test/libc/str/strstr_test.c
@@ -17,23 +17,11 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
 #include "libc/dce.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/safemacros.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/x86feature.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/stdalign.h"
-#include "libc/stdio/rand.h"
-#include "libc/stdio/stdio.h"
-#include "libc/stdio/sysparam.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/hyperion.h"
 #include "libc/testlib/testlib.h"
@@ -60,13 +48,6 @@ char *strstr_naive(const char *haystack, const char *needle) {
   return 0;
 }
 
-TEST(strstr, special) {
-  MAKESTRING(haystack, "abc123def");
-  ASSERT_STREQ(&haystack[0], strstr(haystack, haystack));
-  ASSERT_STREQ(&haystack[0], strstr(haystack, ""));
-  free(haystack);
-}
-
 TEST(strstr, test_emptyString_isFoundAtBeginning) {
   MAKESTRING(haystack, "abc123def");
   ASSERT_STREQ(&haystack[0], strstr(haystack, gc(strdup(""))));
@@ -86,8 +67,7 @@ TEST(strstr, test_notFound1) {
 }
 
 TEST(strstr, test_middleOfString) {
-  alignas(16) char hog[] = "abc123def";
-  MAKESTRING(haystack, hog);
+  MAKESTRING(haystack, "abc123def");
   ASSERT_STREQ(&haystack[3], strstr(haystack, gc(strdup("123"))));
   free(haystack);
 }
@@ -118,25 +98,6 @@ TEST(strstr, test) {
   ASSERT_STREQ("x", strstr("x", "x"));
 }
 
-TEST(strstr, safety) {
-  int pagesz = sysconf(_SC_PAGESIZE);
-  char *map = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
-                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  npassert(map != MAP_FAILED);
-  npassert(!mprotect(map + pagesz, pagesz, PROT_NONE));
-  for (int haylen = 1; haylen < 128; ++haylen) {
-    char *hay = map + pagesz - (haylen + 1);
-    for (int i = 0; i < haylen; ++i)
-      hay[i] = max(rand() & 255, 1);
-    hay[haylen] = 0;
-    for (int neelen = 1; neelen < haylen; ++neelen) {
-      char *nee = hay + (haylen + 1) - (neelen + 1);
-      ASSERT_EQ(strstr_naive(hay, nee), strstr(hay, nee));
-    }
-  }
-  munmap(map, pagesz * 2);
-}
-
 TEST(strstr, breakit) {
   char *p;
   p = gc(calloc(1, 32));
diff --git a/test/libc/str/towupper_test.c b/test/libc/str/towupper_test.c
index b5096be03..b0779d608 100644
--- a/test/libc/str/towupper_test.c
+++ b/test/libc/str/towupper_test.c
@@ -30,7 +30,7 @@ TEST(towupper, test) {
   EXPECT_EQ(u'!', towupper(u'!'));
   EXPECT_EQ(u'A', towupper(u'a'));
   EXPECT_EQ(u'À', towupper(u'à'));
-  /* EXPECT_EQ(L'𝛥', towupper(L'𝛿')); */
+  EXPECT_EQ(L'𝛥', towupper(L'𝛿'));
   EXPECT_EQ(L'Ｂ', towupper(L'ｂ'));
   EXPECT_EQ(u'Ꭰ', towupper(u'ꭰ'));
 }
@@ -39,7 +39,7 @@ TEST(towlower, test) {
   EXPECT_EQ(u'!', towlower(u'!'));
   EXPECT_EQ(u'a', towlower(u'A'));
   EXPECT_EQ(u'à', towlower(u'À'));
-  /* EXPECT_EQ(L'𝛿', towlower(L'𝛥')); */
+  EXPECT_EQ(L'𝛿', towlower(L'𝛥'));
   EXPECT_EQ(L'ｂ', towlower(L'Ｂ'));
   EXPECT_EQ(u'ꭰ', towlower(u'Ꭰ'));
 }
diff --git a/test/libc/str/wcwidth_test.c b/test/libc/str/wcwidth_test.c
index e79ea59f4..a57e837c9 100644
--- a/test/libc/str/wcwidth_test.c
+++ b/test/libc/str/wcwidth_test.c
@@ -16,11 +16,9 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/str/strwidth.h"
 #include "libc/str/unicode.h"
-#include "libc/testlib/benchmark.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 
@@ -30,7 +28,6 @@ TEST(wcwidth, test) {
   ASSERT_EQ(-1, wcwidth(-7));
   ASSERT_EQ(1, wcwidth(0x10FFFD));
   ASSERT_EQ(-1, wcwidth(0x10FFFD + 1));
-  ASSERT_EQ(2, wcwidth(L'😀'));
 }
 
 TEST(strwidth, testCjkWidesAndCombiningLowLines_withThompsonPikeEncoding) {
@@ -77,12 +74,6 @@ TEST(strwidth, testTextDelimitingControlCodes_dontHaveSubstance) {
   EXPECT_EQ(0, strwidth("\1", 0));
 }
 
-#define WCWIDTH(x) __expropriate(wcwidth(__veil("r", x)))
-
 BENCH(wcwidth, bench) {
-  BENCHMARK(1000, 1, WCWIDTH(u'a'));
-  BENCHMARK(1000, 1, WCWIDTH(u'a'));
-  BENCHMARK(1000, 1, WCWIDTH(u'→'));
-  BENCHMARK(1000, 1, WCWIDTH(L'😀'));
-  BENCHMARK(1000, 1, WCWIDTH(0));
+  EZBENCH2("wcwidth", donothing, __expropriate(wcwidth(__veil("r", u'→'))));
 }
diff --git a/test/libc/system/BUILD.mk b/test/libc/system/BUILD.mk
deleted file mode 100644
index ef1a6036d..000000000
--- a/test/libc/system/BUILD.mk
+++ /dev/null
@@ -1,116 +0,0 @@
-#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
-#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
-
-PKGS += TEST_LIBC_SYSTEM
-
-TEST_LIBC_SYSTEM_FILES := $(wildcard test/libc/system/*)
-TEST_LIBC_SYSTEM_SRCS = $(filter %.c,$(TEST_LIBC_SYSTEM_FILES))
-TEST_LIBC_SYSTEM_INCS = $(filter %.inc,$(TEST_LIBC_SYSTEM_FILES))
-TEST_LIBC_SYSTEM_SRCS_TEST = $(filter %_test.c,$(TEST_LIBC_SYSTEM_SRCS))
-
-TEST_LIBC_SYSTEM_OBJS =						\
-	$(TEST_LIBC_SYSTEM_SRCS:%.c=o/$(MODE)/%.o)
-
-TEST_LIBC_SYSTEM_COMS =						\
-	$(TEST_LIBC_SYSTEM_SRCS:%.c=o/$(MODE)/%)
-
-TEST_LIBC_SYSTEM_BINS =						\
-	$(TEST_LIBC_SYSTEM_COMS)				\
-	$(TEST_LIBC_SYSTEM_COMS:%=%.dbg)
-
-TEST_LIBC_SYSTEM_TESTS =					\
-	$(TEST_LIBC_SYSTEM_SRCS_TEST:%.c=o/$(MODE)/%.ok)
-
-TEST_LIBC_SYSTEM_CHECKS =					\
-	$(TEST_LIBC_SYSTEM_SRCS_TEST:%.c=o/$(MODE)/%.runs)
-
-TEST_LIBC_SYSTEM_DIRECTDEPS =					\
-	LIBC_CALLS						\
-	LIBC_INTRIN						\
-	LIBC_LOG						\
-	LIBC_MEM						\
-	LIBC_NEXGEN32E						\
-	LIBC_PROC						\
-	LIBC_RUNTIME						\
-	LIBC_STDIO						\
-	LIBC_STR						\
-	LIBC_SYSTEM						\
-	LIBC_SYSV						\
-	LIBC_TESTLIB						\
-	LIBC_THREAD						\
-	LIBC_X							\
-	THIRD_PARTY_MUSL					\
-	THIRD_PARTY_TR						\
-	THIRD_PARTY_TZ						\
-
-TEST_LIBC_SYSTEM_DEPS :=					\
-	$(call uniq,$(foreach x,$(TEST_LIBC_SYSTEM_DIRECTDEPS),$($(x))))
-
-o/$(MODE)/test/libc/system/system.pkg:				\
-		$(TEST_LIBC_SYSTEM_OBJS)			\
-		$(foreach x,$(TEST_LIBC_SYSTEM_DIRECTDEPS),$($(x)_A).pkg)
-
-o/$(MODE)/test/libc/system/%.dbg:				\
-		$(TEST_LIBC_SYSTEM_DEPS)			\
-		o/$(MODE)/test/libc/system/%.o			\
-		o/$(MODE)/test/libc/system/system.pkg		\
-		o/$(MODE)/tool/build/echo.zip.o			\
-		$(LIBC_TESTMAIN)				\
-		$(CRT)						\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/test/libc/system/popen_test.dbg:			\
-		$(TEST_LIBC_SYSTEM_DEPS)			\
-		o/$(MODE)/test/libc/system/popen_test.o		\
-		o/$(MODE)/test/libc/system/system.pkg		\
-		o/$(MODE)/tool/build/echo.zip.o			\
-		$(LIBC_TESTMAIN)				\
-		$(CRT)						\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/test/libc/system/system_test.dbg:			\
-		$(TEST_LIBC_SYSTEM_DEPS)			\
-		o/$(MODE)/test/libc/system/system_test.o	\
-		o/$(MODE)/test/libc/system/system.pkg		\
-		o/$(MODE)/tool/build/echo.zip.o			\
-		o/$(MODE)/tool/build/cocmd.zip.o		\
-		o/$(MODE)/tool/build/false.zip.o		\
-		$(LIBC_TESTMAIN)				\
-		$(CRT)						\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/test/libc/system/trace_test.dbg:			\
-		$(TEST_LIBC_SYSTEM_DEPS)			\
-		o/$(MODE)/test/libc/system/trace_test.o		\
-		o/$(MODE)/test/libc/system/system.pkg		\
-		o/$(MODE)/test/libc/system/popen_test.zip.o	\
-		o/$(MODE)/test/libc/system/popen_test.dbg.zip.o	\
-		o/$(MODE)/tool/build/echo.zip.o			\
-		$(LIBC_TESTMAIN)				\
-		$(CRT)						\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/test/libc/system/systemvpe_test.dbg:			\
-		$(TEST_LIBC_SYSTEM_DEPS)			\
-		o/$(MODE)/test/libc/system/systemvpe_test.o	\
-		o/$(MODE)/test/libc/system/system.pkg		\
-		o/$(MODE)/test/libc/proc/life.zip.o		\
-		$(LIBC_TESTMAIN)				\
-		$(CRT)						\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-
-o/$(MODE)/test/libc/system/popen_test.zip.o: private ZIPOBJ_FLAGS += -B
-o/$(MODE)/test/libc/system/popen_test.dbg.zip.o: private ZIPOBJ_FLAGS += -B
-
-$(TEST_LIBC_SYSTEM_OBJS): test/libc/system/BUILD.mk
-
-.PHONY: o/$(MODE)/test/libc/system
-o/$(MODE)/test/libc/system:					\
-		$(TEST_LIBC_SYSTEM_BINS)			\
-		$(TEST_LIBC_SYSTEM_CHECKS)
diff --git a/test/libc/system/trace_test.c b/test/libc/system/trace_test.c
deleted file mode 100644
index 7a661a2fc..000000000
--- a/test/libc/system/trace_test.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/testlib/testlib.h"
-#include "libc/x/x.h"
-
-// make sure that running `popen_test --ftrace --strace` doesn't crash
-//
-// function and system call tracing are invasive runtime features that
-// can easily break if interrupting the other magical, deeply embedded
-// parts of the runtime, like mutations to the rbtree ftrace needs for
-// validating stack pointers (kisdangerous() locks the mmap lock), and
-// that's why we use dontinstrument in so many places in the codebase.
-//
-// we like popen_test because it tests the intersection of forking and
-// threads, and it activates other subsystems like the signal / itimer
-// worker threads on windows. if we can ftrace and strace it, then you
-// can be assured cosmo's tracing support works right on all platforms
-
-void SetUpOnce(void) {
-  testlib_enable_tmp_setup_teardown();
-}
-
-TEST(trace, test) {
-  unsetenv("MAKEFLAGS");  // avoid testmain.c 254 status
-  testlib_extract("/zip/popen_test", "popen_test", 0755);
-  testlib_extract("/zip/popen_test.dbg", "popen_test.dbg", 0755);
-  if (!fork()) {
-    close(1);
-    close(2);
-    open("log", O_CREAT | O_TRUNC | O_WRONLY | O_APPEND, 0644);
-    dup(1);
-    execl("./popen_test", "./popen_test", "--ftrace", "--strace", NULL);
-    _Exit(128);
-  }
-  int ws;
-  unassert(wait(&ws));
-  if (WIFSIGNALED(ws)) {
-    fprintf(stderr,
-            "%s:%d: error: trace_test got %s signal running "
-            "popen_test --strace --ftrace (see %s for output)\n",
-            __FILE__, __LINE__, strsignal(WTERMSIG(ws)), realpath("log", 0));
-    _Exit(1);
-  }
-  if (WEXITSTATUS(ws)) {
-    fprintf(stderr,
-            "%s:%d: error: trace_test got %d exit status running "
-            "popen_test --strace --ftrace (see %s for output)\n",
-            __FILE__, __LINE__, WEXITSTATUS(ws), realpath("log", 0));
-    _Exit(1);
-  }
-}
diff --git a/test/libc/thread/BUILD.mk b/test/libc/thread/BUILD.mk
index 1ac2769be..78e185361 100644
--- a/test/libc/thread/BUILD.mk
+++ b/test/libc/thread/BUILD.mk
@@ -41,7 +41,6 @@ TEST_LIBC_THREAD_DIRECTDEPS =				\
 	LIBC_SOCK					\
 	LIBC_STDIO					\
 	LIBC_STR					\
-	LIBC_SYSTEM					\
 	LIBC_SYSV					\
 	LIBC_SYSV_CALLS					\
 	LIBC_TESTLIB					\
@@ -50,7 +49,7 @@ TEST_LIBC_THREAD_DIRECTDEPS =				\
 	THIRD_PARTY_LIBCXXABI				\
 	THIRD_PARTY_NSYNC				\
 	THIRD_PARTY_NSYNC_MEM				\
-	THIRD_PARTY_TZ					\
+	THIRD_PARTY_TZ
 
 TEST_LIBC_THREAD_DEPS :=				\
 	$(call uniq,$(foreach x,$(TEST_LIBC_THREAD_DIRECTDEPS),$($(x))))
diff --git a/test/libc/thread/footek_test.c b/test/libc/thread/footek_test.c
index b08846ae3..98e07e5e9 100644
--- a/test/libc/thread/footek_test.c
+++ b/test/libc/thread/footek_test.c
@@ -1,262 +1,114 @@
-#define USE        POSIX
-#define ITERATIONS 100000
-#define THREADS    30
-
-#define SPIN            1
-#define FUTEX           2
-#define FUTEX_SHARED    3
-#define POSIX           4
-#define POSIX_RECURSIVE 5
-#define RWLOCK          6
-#define RWLOCK_SHARED   7
-
-#ifdef __COSMOPOLITAN__
-#include <cosmo.h>
-#endif
-
 #include <assert.h>
+#include <cosmo.h>
+#include <linux/futex.h>
 #include <pthread.h>
 #include <stdatomic.h>
 #include <stdio.h>
 #include <sys/resource.h>
-#include <sys/time.h>
+#include <sys/syscall.h>
 #include <time.h>
 #include <unistd.h>
+#include "third_party/nsync/futex.internal.h"
 
-#ifdef __linux__
-#include <linux/futex.h>
-#include <sys/syscall.h>
-static inline long nsync_futex_wait_(atomic_int *uaddr, int val, char pshare,
-                                     int clock,
-                                     const struct timespec *timeout) {
-  return syscall(SYS_futex, uaddr, pshare ? FUTEX_WAIT : FUTEX_WAIT_PRIVATE,
-                 val, timeout, NULL, 0);
-}
-static inline long nsync_futex_wake_(atomic_int *uaddr, int num_to_wake,
-                                     char pshare) {
-  return syscall(SYS_futex, uaddr, pshare ? FUTEX_WAKE : FUTEX_WAKE_PRIVATE,
-                 num_to_wake, NULL, NULL, 0);
-}
-#endif
+// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
+// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER THIS IT CAN
+// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
 
-// x86 fleet
-// with spin lock
-// 30 threads / 100000 iterations
-//
-// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX
-//        1,570,224 us real
-//       42,690,880 us user
-//            1,999 us sys
-//
-// footek_test on rhel7.test.           423 µs    2'638 µs     912'241 µs
-//          897,815 us real
-//        1,763,705 us user
-//            9,696 us sys
-//
-// footek_test on xnu.test.          98'468 µs    5'242 µs   5'191'724 µs
-//        4,225,726 us real
-//       16,679,456 us user
-//           16,265 us sys
-//
-// footek_test on freebsd.test.         690 µs    3'011 µs   2'925'997 µs
-//        2,916,033 us real
-//       17,236,103 us user
-//                0 us sys
-//
-// footek_test on netbsd.test.        1'151 µs    2'634 µs   1'014'867 µs
-//          790,332 us real
-//        2,359,967 us user
-//                0 us sys
-//
-// footek_test on openbsd.test.         557 µs    3'020 µs   2'554'648 µs
-//        2,332,724 us real
-//        9,150,000 us user
-//           10,000 us sys
-//
-// footek_test on win10.test.           962 µs    9'698 µs   2'751'905 µs
-//        2,528,863 us real
-//       56,546,875 us user
-//        1,671,875 us sys
-
-// x86 fleet
+// arm fleet
 // with futexes
 // 30 threads / 100000 iterations
 //
-// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX
-//          100,746 us real
-//          234,451 us user
-//        2,638,333 us sys
-//
-// footek_test on rhel7.test.           376 µs    2'259 µs     153'024 µs
-//          146,015 us real
-//          169,427 us user
-//           68,939 us sys
-//
-// footek_test on xnu.test.          11'143 µs    9'159 µs     164'865 µs
-//          144,917 us real
-//          383,317 us user
-//          191,203 us sys
-//
-// footek_test on freebsd.test.         394 µs    2'165 µs     256'227 µs
-//          244,286 us real
-//          405,395 us user
-//          956,122 us sys
-//
-// footek_test on netbsd.test.          502 µs    2'020 µs     261'895 µs
-//          209,095 us real
-//          616,634 us user
-//            9,945 us sys
-//
-// footek_test on openbsd.test.         457 µs    2'737 µs     396'342 µs
-//          344,876 us real
-//           50,000 us user
-//        1,240,000 us sys
-//
-// footek_test on win10.test.           462 µs   59'528 µs   1'348'265 µs
-//        1,193,906 us real
-//       17,546,875 us user
-//        3,000,000 us sys
-
-// x86 fleet
-// with posix
-// 30 threads / 100000 iterations
-//
-// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX (glibc)
-//          111,560 us real
-//          153,985 us user
-//        2,988,121 us sys
-//
-// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX (musl)
-//          392,765 us real
-//        1,885,558 us user
-//        9,667,865 us sys
-//
-// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX (cosmo)
-//           40,965 us real
-//           47,168 us user
-//           25,398 us sys
-//
-// footek_test on rhel7.test.           683 µs    1'340 µs     105'977 µs
-//          101,934 us real
-//          104,771 us user
-//            4,068 us sys
-//
-// footek_test on xnu.test.           2'054 µs    5'352 µs     210'306 µs
-//          181,540 us real
-//          216,236 us user
-//          127,344 us sys
-//
-// footek_test on freebsd.test. (cosmo)
-//          126,803 us real
-//            3,100 us user
-//          176,744 us sys
-//
-// footek_test on freebsd.test. (freebsd libc)
-//          219,073 us real
-//          158,103 us user
-//        1,146,252 us sys
-//
-// footek_test on netbsd.test.          350 µs    3'570 µs     262'186 µs
-//          199,882 us real
-//          138,178 us user
-//          329,501 us sys
-//
-// footek_test on openbsd.test. (cosmo)
-//          138,619 us real
-//           30,000 us user
-//          110,000 us sys
-//
-// footek_test on openbsd.test. (openbsd libc)
-//          385,431 us real
-//           80,000 us user
-//        1,350,000 us sys
-//
-// footek_test on win10.test. (cosmo)
-//          156,382 us real
-//          312,500 us user
-//           31,250 us sys
-//
-// footek_test on win10.test. (cygwin)
-//        9,334,610 us real
-//        1,562,000 us user
-//        6,093,000 us sys
+//          242,604 us real
+//        4,222,946 us user
+//        1,079,229 us sys
+// footek_test on studio.test.          630 µs   17'415 µs     256'782 µs
+//        1,362,557 us real
+//        3,232,978 us user
+//        2,104,824 us sys
+// footek_test on pi.test.              611 µs   21'708 µs   1'385'129 µs
+//        1,346,482 us real
+//        3,370,513 us user
+//        1,992,383 us sys
+// footek_test on freebsdarm.test.      427 µs   19'967 µs   1'393'476 µs
 
 // arm fleet
-// with spin lock
+// without futexes
 // 30 threads / 100000 iterations
 //
-// footek_test on studio.test.          961 µs   12'907 µs   1'287'983 µs
 //        1,282,084 us real
 //       29,359,582 us user
 //           34,553 us sys
-//
-// footek_test on pi.test.              459 µs   16'376 µs   4'095'512 µs
+// footek_test on studio.test.          961 µs   12'907 µs   1'287'983 µs
 //        4,070,988 us real
 //       16,203,990 us user
 //            7,999 us sys
-//
-// footek_test on freebsdarm.test.      502 µs   16'446 µs   7'051'545 µs
+// footek_test on pi.test.              459 µs   16'376 µs   4'095'512 µs
 //        7,012,493 us real
 //       27,936,725 us user
 //            7,871 us sys
+// footek_test on freebsdarm.test.      502 µs   16'446 µs   7'051'545 µs
 
-// arm fleet
+// x86 fleet
 // with futexes
 // 30 threads / 100000 iterations
 //
-// footek_test on studio.test.          585 µs   13'597 µs      57'473 µs
-//           46,481 us real
-//           68,745 us user
-//          586,871 us sys
-//
-// footek_test on pi5.test.             335 µs   13'034 µs     432'358 µs
-//          389,619 us real
-//          839,848 us user
-//          679,112 us sys
-//
-// footek_test on pi.test.              479 µs   16'539 µs     476'395 µs
-//          463,799 us real
-//        1,259,267 us user
-//          547,681 us sys
-//
-// footek_test on freebsdarm.test.      364 µs   16'898 µs   1'288'594 µs
-//        1,256,134 us real
-//        3,770,473 us user
-//        1,214,755 us sys
+//          146,015 us real
+//          169,427 us user
+//           68,939 us sys
+// footek_test on rhel7.test.           376 µs    2'259 µs     153'024 µs
+//          144,917 us real
+//          383,317 us user
+//          191,203 us sys
+// footek_test on xnu.test.          11'143 µs    9'159 µs     164'865 µs
+//          244,286 us real
+//          405,395 us user
+//          956,122 us sys
+// footek_test on freebsd.test.         394 µs    2'165 µs     256'227 µs
+//          209,095 us real
+//          616,634 us user
+//            9,945 us sys
+// footek_test on netbsd.test.          502 µs    2'020 µs     261'895 µs
+//          344,876 us real
+//           50,000 us user
+//        1,240,000 us sys
+// footek_test on openbsd.test.         457 µs    2'737 µs     396'342 µs
+//        1,193,906 us real
+//       17,546,875 us user
+//        3,000,000 us sys
+// footek_test on win10.test.           462 µs   59'528 µs   1'348'265 µs
 
-// arm fleet
-// with posix
+// x86 fleet
+// without futexes
 // 30 threads / 100000 iterations
 //
-// footek_test on Apple M2 Ultra (Apple Libc)
-//           45,443 us real
-//           30,201 us user
-//          864,650 us sys
-//
-// footek_test on Apple M2 Ultra (Cosmo Libc)
-//           65,118 us real
-//           77,891 us user
-//        1,023,575 us sys
-//
-// footek_test on pi5.test.             407 µs   12'661 µs     198'133 µs
-//          152,791 us real
-//          143,678 us user
-//           14,736 us sys
-//
-// footek_test on studio.test.          463 µs   13'286 µs     234'742 µs
-//          227,916 us real
-//          294,162 us user
-//          155,062 us sys
-//
-// footek_test on pi.test.              374 µs   15'720 µs     249'245 µs
-//          233,504 us real
-//          301,072 us user
-//          187,153 us sys
-//
-// footek_test on freebsdarm.test.      328 µs   16'614 µs     918'647 µs
-//          877,124 us real
-//        1,377,338 us user
-//          798,230 us sys
+//          897,815 us real
+//        1,763,705 us user
+//            9,696 us sys
+// footek_test on rhel7.test.           423 µs    2'638 µs     912'241 µs
+//          790,332 us real
+//        2,359,967 us user
+//                0 us sys
+// footek_test on netbsd.test.        1'151 µs    2'634 µs   1'014'867 µs
+//        2,332,724 us real
+//        9,150,000 us user
+//           10,000 us sys
+// footek_test on openbsd.test.         557 µs    3'020 µs   2'554'648 µs
+//        2,528,863 us real
+//       56,546,875 us user
+//        1,671,875 us sys
+// footek_test on win10.test.           962 µs    9'698 µs   2'751'905 µs
+//        2,916,033 us real
+//       17,236,103 us user
+//                0 us sys
+// footek_test on freebsd.test.         690 µs    3'011 µs   2'925'997 µs
+//        4,225,726 us real
+//       16,679,456 us user
+//           16,265 us sys
+// footek_test on xnu.test.          98'468 µs    5'242 µs   5'191'724 µs
+
+#define USE_FUTEX  1
+#define THREADS    30
+#define ITERATIONS 30000
 
 #define MUTEX_LOCKED(word)  ((word) & 8)
 #define MUTEX_WAITING(word) ((word) & 16)
@@ -272,18 +124,14 @@ void lock(atomic_int *futex) {
     if (atomic_compare_exchange_strong_explicit(
             futex, &word, 1, memory_order_acquire, memory_order_acquire))
       return;
+    pthread_pause_np();
   }
   if (word == 1)
     word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
   while (word > 0) {
     pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
-#if USE == FUTEX || USE == FUTEX_SHARED
-    cosmo_futex_wait(
-        futex, 2,
-        USE == FUTEX_SHARED ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE,
-        0, 0);
-#else
-    pthread_yield_np();
+#if USE_FUTEX
+    nsync_futex_wait_(futex, 2, 0, 0);
 #endif
     pthread_setcancelstate(cs, 0);
     word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
@@ -294,10 +142,8 @@ void unlock(atomic_int *futex) {
   int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
   if (word == 2) {
     atomic_store_explicit(futex, 0, memory_order_release);
-#if USE == FUTEX || USE == FUTEX_SHARED
-    cosmo_futex_wake(
-        futex, 1,
-        USE == FUTEX_SHARED ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE);
+#if USE_FUTEX
+    nsync_futex_wake_(futex, 1, 0);
 #endif
   }
 }
@@ -305,85 +151,86 @@ void unlock(atomic_int *futex) {
 int g_chores;
 atomic_int g_lock;
 pthread_mutex_t g_locker;
-pthread_rwlock_t g_rwlocker;
 
 void *worker(void *arg) {
   for (int i = 0; i < ITERATIONS; ++i) {
-#if USE == POSIX || USE == POSIX_RECURSIVE
-    pthread_mutex_lock(&g_locker);
-    ++g_chores;
-    pthread_mutex_unlock(&g_locker);
-#elif USE == RWLOCK || USE == RWLOCK_SHARED
-    pthread_rwlock_wrlock(&g_rwlocker);
-    ++g_chores;
-    pthread_rwlock_unlock(&g_rwlocker);
-#else
     lock(&g_lock);
     ++g_chores;
     unlock(&g_lock);
-#endif
   }
   return 0;
 }
 
-struct timeval tub(struct timeval a, struct timeval b) {
-  a.tv_sec -= b.tv_sec;
-  if (a.tv_usec < b.tv_usec) {
-    a.tv_usec += 1000000;
-    a.tv_sec--;
-  }
-  a.tv_usec -= b.tv_usec;
-  return a;
-}
-
-long tomicros(struct timeval x) {
-  return x.tv_sec * 1000000ul + x.tv_usec;
-}
-
 int main() {
   struct timeval start;
   gettimeofday(&start, 0);
 
-  pthread_mutexattr_t attr;
-  pthread_mutexattr_init(&attr);
-#if USE == POSIX_RECURSIVE
-  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
-#else
-  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
-#endif
-  pthread_mutex_init(&g_locker, &attr);
-  pthread_mutexattr_destroy(&attr);
-
-  pthread_rwlockattr_t rwattr;
-  pthread_rwlockattr_init(&rwattr);
-#if USE == RWLOCK_SHARED
-  pthread_rwlockattr_setpshared(&rwattr, PTHREAD_PROCESS_SHARED);
-#endif
-  pthread_rwlock_init(&g_rwlocker, &rwattr);
-  pthread_rwlockattr_destroy(&rwattr);
-
   pthread_t th[THREADS];
   for (int i = 0; i < THREADS; ++i)
     pthread_create(&th[i], 0, worker, 0);
   for (int i = 0; i < THREADS; ++i)
     pthread_join(th[i], 0);
-  assert(g_chores == THREADS * ITERATIONS);
+  npassert(g_chores == THREADS * ITERATIONS);
 
   struct rusage ru;
   struct timeval end;
   gettimeofday(&end, 0);
   getrusage(RUSAGE_SELF, &ru);
-  printf("%16ld us real\n"
-         "%16ld us user\n"
-         "%16ld us sys\n",
-         tomicros(tub(end, start)),  //
-         tomicros(ru.ru_utime),      //
-         tomicros(ru.ru_stime));
+  printf("%,16ld us real\n"
+         "%,16ld us user\n"
+         "%,16ld us sys\n",
+         timeval_tomicros(timeval_sub(end, start)),  //
+         timeval_tomicros(ru.ru_utime),              //
+         timeval_tomicros(ru.ru_stime));
 
-  pthread_rwlock_destroy(&g_rwlocker);
-  pthread_mutex_destroy(&g_locker);
-
-#ifdef __COSMOPOLITAN__
   CheckForMemoryLeaks();
-#endif
 }
+
+// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
+// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
+
+// x86 fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//          186,976 us real
+//           43,609 us user
+//          205,585 us sys
+// footek_test on freebsd.test.         410 µs    2'054 µs     195'339 µs
+//          238,902 us real
+//          235,743 us user
+//           97,881 us sys
+// footek_test on rhel7.test.           343 µs    2'339 µs     246'926 µs
+//          201,285 us real
+//          249,612 us user
+//          141,230 us sys
+// footek_test on xnu.test.           1'960 µs    5'350 µs     265'758 µs
+//          303,363 us real
+//           60,000 us user
+//          410,000 us sys
+// footek_test on openbsd.test.         545 µs    3'023 µs     326'200 µs
+//          386,085 us real
+//          586,455 us user
+//          466,991 us sys
+// footek_test on netbsd.test.          344 µs    2'421 µs     413'440 µs
+//          245,010 us real
+//          437,500 us user
+//          140,625 us sys
+// footek_test on win10.test.           300 µs   18'574 µs     441'225 µs
+
+// arm fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//           87,132 us real
+//          183,517 us user
+//           20,020 us sys
+// footek_test on studio.test.          560 µs   12'418 µs      92'825 µs
+//          679,374 us real
+//          957,678 us user
+//          605,078 us sys
+// footek_test on pi.test.              462 µs   16'574 µs     702'833 µs
+//          902,343 us real
+//        1,459,706 us user
+//          781,140 us sys
+// footek_test on freebsdarm.test.      400 µs   16'261 µs     970'022 µs
diff --git a/test/libc/thread/nsync_test.c b/test/libc/thread/nsync_test.c
index ac2d64514..d781c5243 100644
--- a/test/libc/thread/nsync_test.c
+++ b/test/libc/thread/nsync_test.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
 #include "third_party/nsync/cv.h"
@@ -35,8 +34,7 @@ int Put(long v, nsync_time abs_deadline) {
   int err, added = 0, wake = 0;
   nsync_mu_lock(&mu);
   while (count == limit) {
-    if ((err = nsync_cv_wait_with_deadline(&non_full, &mu, CLOCK_REALTIME,
-                                           abs_deadline, 0))) {
+    if ((err = nsync_cv_wait_with_deadline(&non_full, &mu, abs_deadline, 0))) {
       ASSERT_EQ(ETIMEDOUT, err);
       ASSERT_NE(0, nsync_time_cmp(nsync_time_no_deadline, abs_deadline));
     }
@@ -61,8 +59,7 @@ long Get(nsync_time abs_deadline) {
   long err, v = 0;
   nsync_mu_lock(&mu);
   while (!count) {
-    if ((err = nsync_cv_wait_with_deadline(&non_empty, &mu, CLOCK_REALTIME,
-                                           abs_deadline, 0))) {
+    if ((err = nsync_cv_wait_with_deadline(&non_empty, &mu, abs_deadline, 0))) {
       ASSERT_EQ(ETIMEDOUT, err);
       ASSERT_NE(0, nsync_time_cmp(nsync_time_no_deadline, abs_deadline));
     }
diff --git a/test/libc/thread/pthread_atfork_test.c b/test/libc/thread/pthread_atfork_test.c
index 8a6d5d4d0..00a19cec5 100644
--- a/test/libc/thread/pthread_atfork_test.c
+++ b/test/libc/thread/pthread_atfork_test.c
@@ -22,7 +22,6 @@
 #include "libc/intrin/atomic.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/mem/gc.h"
-#include "libc/mem/leaks.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
@@ -49,9 +48,11 @@ void *ForceThreadingMode(void *arg) {
 }
 
 TEST(pthread_atfork, test) {
+  __enable_threads();
   SPAWN(fork);
   ASSERT_EQ(0, pthread_atfork(prepare1, parent1, child1));
   ASSERT_EQ(0, pthread_atfork(prepare2, parent2, child2));
+  flockfile(stdout);
   SPAWN(fork);
   flockfile(stdout);
   ASSERT_STREQ("prepare2", A[0]);
@@ -60,6 +61,7 @@ TEST(pthread_atfork, test) {
   ASSERT_STREQ("child2", A[3]);
   funlockfile(stdout);
   EXITS(0);
+  funlockfile(stdout);
   ASSERT_STREQ("prepare2", A[0]);
   ASSERT_STREQ("prepare1", A[1]);
   ASSERT_STREQ("parent1", A[2]);
@@ -78,7 +80,7 @@ void mu_unlock(void) {
 }
 
 void mu_wipe(void) {
-  pthread_mutex_wipe_np(&mu);
+  pthread_mutex_init(&mu, 0);
 }
 
 void *Worker(void *arg) {
diff --git a/test/libc/thread/pthread_cancel_deferred_cond_test.c b/test/libc/thread/pthread_cancel_deferred_cond_test.c
index 4bba81a18..7bf8e1045 100644
--- a/test/libc/thread/pthread_cancel_deferred_cond_test.c
+++ b/test/libc/thread/pthread_cancel_deferred_cond_test.c
@@ -1,27 +1,11 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
 #include <errno.h>
 #include <pthread.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 
 int got_cleanup;
-pthread_cond_t cv;
-pthread_mutex_t mu;
+pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
 
 void cleanup(void* arg) {
   got_cleanup = 1;
@@ -39,12 +23,6 @@ void* worker(void* arg) {
 int main(int argc, char* argv[]) {
   void* rc;
   pthread_t th;
-  pthread_mutexattr_t at;
-  pthread_mutexattr_init(&at);
-  pthread_mutexattr_settype(&at, PTHREAD_MUTEX_DEFAULT);
-  pthread_mutex_init(&mu, &at);
-  pthread_mutexattr_destroy(&at);
-  pthread_cond_init(&cv, 0);
   if (pthread_create(&th, 0, worker, 0))
     return 2;
   if (pthread_cancel(th))
@@ -57,6 +35,6 @@ int main(int argc, char* argv[]) {
     return 6;
   if (pthread_mutex_trylock(&mu) != EBUSY)
     return 7;
-  pthread_mutex_destroy(&mu);
-  pthread_cond_destroy(&cv);
+  if (pthread_mutex_unlock(&mu))
+    return 8;
 }
diff --git a/test/libc/thread/pthread_cancel_test.c b/test/libc/thread/pthread_cancel_test.c
index 06fb5093e..c43aacc04 100644
--- a/test/libc/thread/pthread_cancel_test.c
+++ b/test/libc/thread/pthread_cancel_test.c
@@ -19,7 +19,6 @@
 #include "libc/atomic.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/sigaltstack.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/kprintf.h"
@@ -28,7 +27,6 @@
 #include "libc/nexgen32e/nexgen32e.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
@@ -42,8 +40,6 @@ atomic_int gotcleanup;
 
 void SetUpOnce(void) {
   testlib_enable_tmp_setup_teardown();
-  pthread_mutex_init(&mu, 0);
-  pthread_cond_init(&cv, 0);
 }
 
 void SetUp(void) {
@@ -101,6 +97,8 @@ TEST(pthread_cancel, synchronous) {
 TEST(pthread_cancel, synchronous_deferred) {
   void *rc;
   pthread_t th;
+  if (!IsWindows())
+    return;
   ASSERT_SYS(0, 0, pipe(pfds));
   ASSERT_EQ(0, pthread_create(&th, 0, Worker, 0));
   while (!ready)
@@ -192,7 +190,6 @@ TEST(pthread_cancel, condDeferredWait_reacquiresMutex) {
   ASSERT_EQ(0, pthread_join(th, &rc));
   ASSERT_EQ(PTHREAD_CANCELED, rc);
   ASSERT_EQ(EBUSY, pthread_mutex_trylock(&mu));
-  ASSERT_EQ(0, pthread_mutex_consistent(&mu));
   ASSERT_EQ(0, pthread_mutex_unlock(&mu));
 }
 
@@ -205,7 +202,6 @@ TEST(pthread_cancel, condDeferredWaitDelayed) {
   ASSERT_EQ(0, pthread_join(th, &rc));
   ASSERT_EQ(PTHREAD_CANCELED, rc);
   ASSERT_EQ(EBUSY, pthread_mutex_trylock(&mu));
-  ASSERT_EQ(0, pthread_mutex_consistent(&mu));
   ASSERT_EQ(0, pthread_mutex_unlock(&mu));
 }
 
diff --git a/test/libc/thread/pthread_cond_timedwait_test.c b/test/libc/thread/pthread_cond_timedwait_test.c
deleted file mode 100644
index 57ed46e38..000000000
--- a/test/libc/thread/pthread_cond_timedwait_test.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/struct/timespec.h"
-#include "libc/errno.h"
-#include "libc/sysv/consts/clock.h"
-#include "libc/testlib/testlib.h"
-#include "libc/thread/thread.h"
-#include "libc/thread/thread2.h"
-
-TEST(pthread_cond_timedwait, real) {
-  pthread_cond_t cv;
-  pthread_mutex_t mu;
-  pthread_condattr_t ca;
-  ASSERT_EQ(0, pthread_condattr_init(&ca));
-  ASSERT_EQ(0, pthread_condattr_setclock(&ca, CLOCK_REALTIME));
-  ASSERT_EQ(0, pthread_cond_init(&cv, &ca));
-  ASSERT_EQ(0, pthread_condattr_destroy(&ca));
-  ASSERT_EQ(0, pthread_mutex_init(&mu, 0));
-  ASSERT_EQ(0, pthread_mutex_lock(&mu));
-  struct timespec start = timespec_real();
-  struct timespec deadline = timespec_add(start, timespec_frommillis(100));
-  ASSERT_EQ(ETIMEDOUT, pthread_cond_timedwait(&cv, &mu, &deadline));
-  struct timespec end = timespec_real();
-  ASSERT_GE(timespec_tomillis(timespec_sub(end, start)), 100);
-  ASSERT_EQ(0, pthread_mutex_unlock(&mu));
-  ASSERT_EQ(0, pthread_mutex_destroy(&mu));
-  ASSERT_EQ(0, pthread_cond_destroy(&cv));
-}
-
-TEST(pthread_cond_timedwait, mono) {
-  pthread_cond_t cv;
-  pthread_mutex_t mu;
-  pthread_condattr_t ca;
-  ASSERT_EQ(0, pthread_condattr_init(&ca));
-  ASSERT_EQ(0, pthread_condattr_setclock(&ca, CLOCK_MONOTONIC));
-  ASSERT_EQ(0, pthread_cond_init(&cv, &ca));
-  ASSERT_EQ(0, pthread_condattr_destroy(&ca));
-  ASSERT_EQ(0, pthread_mutex_init(&mu, 0));
-  ASSERT_EQ(0, pthread_mutex_lock(&mu));
-  struct timespec start = timespec_mono();
-  struct timespec deadline = timespec_add(start, timespec_frommillis(100));
-  ASSERT_EQ(ETIMEDOUT, pthread_cond_timedwait(&cv, &mu, &deadline));
-  struct timespec end = timespec_mono();
-  ASSERT_GE(timespec_tomillis(timespec_sub(end, start)), 100);
-  ASSERT_EQ(0, pthread_mutex_unlock(&mu));
-  ASSERT_EQ(0, pthread_mutex_destroy(&mu));
-  ASSERT_EQ(0, pthread_cond_destroy(&cv));
-}
diff --git a/test/libc/thread/pthread_create_test.c b/test/libc/thread/pthread_create_test.c
index 92b6c28db..d977dd0dc 100644
--- a/test/libc/thread/pthread_create_test.c
+++ b/test/libc/thread/pthread_create_test.c
@@ -22,13 +22,11 @@
 #include "libc/calls/struct/sched_param.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/siginfo.h"
-#include "libc/calls/struct/sigset.h"
-#include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/nexgen32e.h"
@@ -42,36 +40,16 @@
 #include "libc/sysv/consts/sched.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/consts/ss.h"
-#include "libc/testlib/benchmark.h"
 #include "libc/testlib/ezbench.h"
-#include "libc/testlib/manystack.h"
 #include "libc/testlib/subprocess.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/thread2.h"
 
-// test ability of user to override pthread mutex api
-int pthread_mutex_lock(pthread_mutex_t *mutex) {
-  abort();
-}
-int pthread_mutex_unlock(pthread_mutex_t *mutex) {
-  abort();
-}
-int pthread_mutex_trylock(pthread_mutex_t *mutex) {
-  abort();
-}
-int pthread_mutex_wipe_np(pthread_mutex_t *mutex) {
-  abort();
-}
-
 void OnUsr1(int sig, siginfo_t *si, void *vctx) {
 }
 
-void SetUpOnce(void) {
-  cosmo_stack_setmaxstacks((_rand64() & 7) - 1);
-}
-
 void SetUp(void) {
   struct sigaction sig = {.sa_sigaction = OnUsr1, .sa_flags = SA_SIGINFO};
   sigaction(SIGUSR1, &sig, 0);
@@ -302,60 +280,10 @@ static void CreateDetached(void) {
   ASSERT_EQ(0, pthread_attr_destroy(&attr));
 }
 
-#define LAUNCHES  10
-#define LAUNCHERS 10
-
-errno_t pthread_create2(pthread_t *thread, const pthread_attr_t *attr,
-                        void *(*start_routine)(void *), void *arg) {
-  for (int i = 1;; i <<= 1) {
-    errno_t err = pthread_create(thread, attr, start_routine, arg);
-    if (err != EAGAIN)
-      return err;
-    usleep(i);
-  }
-}
-
-static void *CreateDetachedParallelThreads(void *arg) {
-  for (int i = 0; i < LAUNCHES; ++i)
-    CreateDetached();
-  return 0;
-}
-
-static void CreateDetachedParallel(void) {
-  pthread_t th[LAUNCHERS];
-  for (int i = 0; i < LAUNCHERS; ++i)
-    ASSERT_EQ(0, pthread_create2(&th[i], 0, CreateDetachedParallelThreads, 0));
-  for (int i = 0; i < LAUNCHERS; ++i)
-    ASSERT_EQ(0, pthread_join(th[i], 0));
-}
-
-static void *CreateJoinParallelThreads(void *arg) {
-  for (int i = 0; i < LAUNCHES; ++i)
-    CreateJoin();
-  return 0;
-}
-
-static void CreateJoinParallel(void) {
-  pthread_t th[LAUNCHERS];
-  for (int i = 0; i < LAUNCHERS; ++i)
-    ASSERT_EQ(0, pthread_create2(&th[i], 0, CreateJoinParallelThreads, 0));
-  for (int i = 0; i < LAUNCHERS; ++i)
-    ASSERT_EQ(0, pthread_join(th[i], 0));
-}
-
 TEST(pthread_create, bench) {
-  kprintf("cosmo_stack_getmaxstacks() = %d\n", cosmo_stack_getmaxstacks());
-  pthread_t msh = manystack_start();
-  BENCHMARK(100, 1, CreateJoin());
-  BENCHMARK(100, 1, CreateDetach());
-  usleep(10000);
-  pthread_decimate_np();
-  BENCHMARK(100, 1, CreateDetached());
-  usleep(10000);
-  pthread_decimate_np();
-  BENCHMARK(1, LAUNCHERS + LAUNCHERS * LAUNCHES, CreateJoinParallel());
-  BENCHMARK(1, LAUNCHERS + LAUNCHERS * LAUNCHES, CreateDetachedParallel());
-  manystack_stop(msh);
+  EZBENCH2("CreateJoin", donothing, CreateJoin());
+  EZBENCH2("CreateDetach", donothing, CreateDetach());
+  EZBENCH2("CreateDetached", donothing, CreateDetached());
   while (!pthread_orphan_np())
     pthread_decimate_np();
 }
diff --git a/test/libc/thread/pthread_kill_test.c b/test/libc/thread/pthread_kill_test.c
index 2ac31f4be..25a8f0b3c 100644
--- a/test/libc/thread/pthread_kill_test.c
+++ b/test/libc/thread/pthread_kill_test.c
@@ -193,6 +193,8 @@ void *SocketAcceptWorker(void *arg) {
 }
 
 TEST(pthread_kill, canInterruptSocketAcceptOperation) {
+  if (IsWindows())
+    return;  // TODO(jart): BAH
   pthread_t t;
   struct sigaction oldsa;
   struct sigaction sa = {.sa_handler = OnSig};
@@ -259,6 +261,7 @@ void *CpuWorker(void *arg) {
 }
 
 TEST(pthread_kill, canAsynchronouslyRunHandlerInsideTargetThread) {
+  ASSERT_NE(0, __get_tls()->tib_tid);
   pthread_t t;
   struct sigaction oldsa;
   struct sigaction sa = {.sa_handler = OnSigAsync};
@@ -272,6 +275,7 @@ TEST(pthread_kill, canAsynchronouslyRunHandlerInsideTargetThread) {
   ASSERT_TRUE(exited_original_loop);
   ASSERT_SYS(0, 0, sigaction(SIGUSR1, &oldsa, 0));
   ASSERT_EQ(0, gotsig);
+  ASSERT_NE(0, __get_tls()->tib_tid);
 }
 
 volatile int is_having_fun;
@@ -285,6 +289,7 @@ void *FunWorker(void *arg) {
 }
 
 TEST(pthread_kill, defaultThreadSignalHandlerWillKillWholeProcess) {
+  ASSERT_NE(0, __get_tls()->tib_tid);
   SPAWN(fork);
   pthread_t t;
   ASSERT_EQ(0, pthread_create(&t, 0, FunWorker, 0));
@@ -294,6 +299,7 @@ TEST(pthread_kill, defaultThreadSignalHandlerWillKillWholeProcess) {
   for (;;)
     sched_yield();
   TERMS(SIGKILL);
+  ASSERT_NE(0, __get_tls()->tib_tid);
 }
 
 void *SuspendWorker(void *arg) {
@@ -304,6 +310,7 @@ void *SuspendWorker(void *arg) {
 }
 
 TEST(pthread_kill, canInterruptSigsuspend) {
+  ASSERT_NE(0, __get_tls()->tib_tid);
   int tid;
   void *res;
   pthread_t t;
diff --git a/test/libc/thread/pthread_rwlock_rdlock_test.c b/test/libc/thread/pthread_rwlock_rdlock_test.c
index 4fba1f503..e7ad11cc3 100644
--- a/test/libc/thread/pthread_rwlock_rdlock_test.c
+++ b/test/libc/thread/pthread_rwlock_rdlock_test.c
@@ -17,55 +17,26 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/atomic.h"
-#include "libc/calls/calls.h"
-#include "libc/intrin/atomic.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
-#include "libc/stdalign.h"
-#include "libc/stdio/rand.h"
 #include "libc/testlib/testlib.h"
 #include "libc/thread/thread.h"
 
+#define ITERATIONS 50000
 #define READERS    8
 #define WRITERS    2
-#define ITERATIONS 1000
 
-atomic_bool done;
-alignas(128) int foo;
-alignas(128) int bar;
+atomic_int reads;
+atomic_int writes;
 pthread_rwlock_t lock;
-pthread_rwlockattr_t attr;
 pthread_barrier_t barrier;
 
-void delay(int k) {
-  int n = rand() % k;
-  for (volatile int i = 0; i < n; ++i) {
-  }
-}
-
-void SetUp(void) {
-  ASSERT_EQ(0, pthread_rwlockattr_init(&attr));
-  ASSERT_EQ(0, pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED));
-  ASSERT_EQ(0, pthread_rwlock_init(&lock, &attr));
-  ASSERT_EQ(0, pthread_rwlockattr_destroy(&attr));
-}
-
-void TearDown(void) {
-  ASSERT_EQ(0, pthread_rwlock_destroy(&lock));
-}
-
 void *Reader(void *arg) {
   pthread_barrier_wait(&barrier);
-  while (!atomic_load_explicit(&done, memory_order_relaxed)) {
+  for (int i = 0; i < ITERATIONS; ++i) {
     ASSERT_EQ(0, pthread_rwlock_rdlock(&lock));
-    // cosmo_trace_begin("reader");
-    int x = foo;
-    usleep(1);  // delay(100000);
-    int y = bar;
-    ASSERT_EQ(x, y);
-    // cosmo_trace_end("reader");
+    ++reads;
     ASSERT_EQ(0, pthread_rwlock_unlock(&lock));
-    usleep(1);  // delay(100000);
   }
   return 0;
 }
@@ -74,15 +45,9 @@ void *Writer(void *arg) {
   pthread_barrier_wait(&barrier);
   for (int i = 0; i < ITERATIONS; ++i) {
     ASSERT_EQ(0, pthread_rwlock_wrlock(&lock));
-    // cosmo_trace_begin("writer");
-    ++foo;
-    delay(10);
-    ++bar;
-    // cosmo_trace_end("writer");
+    ++writes;
     ASSERT_EQ(0, pthread_rwlock_unlock(&lock));
-    delay(10);
   }
-  done = true;
   return 0;
 }
 
@@ -90,12 +55,14 @@ TEST(pthread_rwlock_rdlock, test) {
   int i;
   pthread_t *t = gc(malloc(sizeof(pthread_t) * (READERS + WRITERS)));
   ASSERT_EQ(0, pthread_barrier_init(&barrier, 0, READERS + WRITERS));
-  for (i = 0; i < READERS + WRITERS; ++i)
+  for (i = 0; i < READERS + WRITERS; ++i) {
     ASSERT_SYS(0, 0,
                pthread_create(t + i, 0, i < READERS ? Reader : Writer, 0));
-  for (i = 0; i < READERS + WRITERS; ++i)
+  }
+  for (i = 0; i < READERS + WRITERS; ++i) {
     EXPECT_SYS(0, 0, pthread_join(t[i], 0));
-  EXPECT_EQ(WRITERS * ITERATIONS, foo);
-  EXPECT_EQ(WRITERS * ITERATIONS, bar);
+  }
+  EXPECT_EQ(READERS * ITERATIONS, reads);
+  EXPECT_EQ(WRITERS * ITERATIONS, writes);
   ASSERT_EQ(0, pthread_barrier_destroy(&barrier));
 }
diff --git a/test/libc/thread/setitimer_test.c b/test/libc/thread/setitimer_test.c
index 061faf459..6ab5d42d0 100644
--- a/test/libc/thread/setitimer_test.c
+++ b/test/libc/thread/setitimer_test.c
@@ -16,7 +16,6 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/sysv/consts/itimer.h"
 #include "libc/atomic.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/itimerval.h"
@@ -27,8 +26,10 @@
 #include "libc/calls/ucontext.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
 #include "libc/runtime/runtime.h"
+#include "libc/sysv/consts/itimer.h"
 #include "libc/sysv/consts/sa.h"
 #include "libc/sysv/consts/sicode.h"
 #include "libc/sysv/consts/sig.h"
diff --git a/test/libc/time/BUILD.mk b/test/libc/time/BUILD.mk
index 092444a95..937b2d09e 100644
--- a/test/libc/time/BUILD.mk
+++ b/test/libc/time/BUILD.mk
@@ -28,8 +28,7 @@ TEST_LIBC_TIME_DIRECTDEPS =				\
 	LIBC_SYSV					\
 	LIBC_TESTLIB					\
 	LIBC_X						\
-	THIRD_PARTY_MUSL				\
-	THIRD_PARTY_TZ					\
+	THIRD_PARTY_TZ
 
 TEST_LIBC_TIME_DEPS :=					\
 	$(call uniq,$(foreach x,$(TEST_LIBC_TIME_DIRECTDEPS),$($(x))))
diff --git a/test/libc/tinymath/BUILD.mk b/test/libc/tinymath/BUILD.mk
index 4b304dc67..ef755a29d 100644
--- a/test/libc/tinymath/BUILD.mk
+++ b/test/libc/tinymath/BUILD.mk
@@ -3,21 +3,14 @@
 
 PKGS += TEST_LIBC_TINYMATH
 
-TEST_LIBC_TINYMATH_SRCS_C := $(wildcard test/libc/tinymath/*.c)
-TEST_LIBC_TINYMATH_SRCS_CC := $(wildcard test/libc/tinymath/*.cc)
+TEST_LIBC_TINYMATH_SRCS := $(wildcard test/libc/tinymath/*.c)
 TEST_LIBC_TINYMATH_SRCS_TEST = $(filter %_test.c,$(TEST_LIBC_TINYMATH_SRCS))
 
-TEST_LIBC_TINYMATH_SRCS =					\
-	$(TEST_LIBC_TINYMATH_SRCS_C)				\
-	$(TEST_LIBC_TINYMATH_SRCS_CC)
-
 TEST_LIBC_TINYMATH_OBJS =					\
-	$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%.o)		\
-	$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%.o)
+	$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%.o)
 
 TEST_LIBC_TINYMATH_COMS =					\
-	$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%)		\
-	$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%)
+	$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%)
 
 TEST_LIBC_TINYMATH_BINS =					\
 	$(TEST_LIBC_TINYMATH_COMS)				\
@@ -33,23 +26,19 @@ TEST_LIBC_TINYMATH_DIRECTDEPS =					\
 	LIBC_CALLS						\
 	LIBC_FMT						\
 	LIBC_INTRIN						\
-	LIBC_LOG						\
 	LIBC_MEM						\
 	LIBC_NEXGEN32E						\
-	LIBC_RUNTIME						\
 	LIBC_STDIO						\
+	LIBC_RUNTIME						\
 	LIBC_STR						\
 	LIBC_SYSV						\
 	LIBC_TESTLIB						\
 	LIBC_TINYMATH						\
 	LIBC_X							\
 	THIRD_PARTY_COMPILER_RT					\
-	THIRD_PARTY_COMPILER_RT					\
-	THIRD_PARTY_DOUBLECONVERSION				\
 	THIRD_PARTY_GDTOA					\
-	THIRD_PARTY_LIBCXX					\
-	THIRD_PARTY_LIBCXXABI					\
-	THIRD_PARTY_LIBUNWIND					\
+	THIRD_PARTY_COMPILER_RT					\
+	THIRD_PARTY_DOUBLECONVERSION
 
 TEST_LIBC_TINYMATH_DEPS :=					\
 	$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
diff --git a/test/libc/tinymath/acosh_test.c b/test/libc/tinymath/acosh_test.c
index 639782c04..008972047 100644
--- a/test/libc/tinymath/acosh_test.c
+++ b/test/libc/tinymath/acosh_test.c
@@ -48,9 +48,7 @@ TEST(acoshf, test) {
 TEST(acoshl, test) {
   volatile long double x = 16;
   EXPECT_STREQ("4", gc(xdtoal(sqrtl(x))));
-#if LDBL_MANT_DIG > 64
   EXPECT_STREQ(".9624236501192069", gc(xdtoal(_acoshl(1.5))));
-#endif
   EXPECT_STREQ("0", gc(xdtoal(_acoshl(1))));
   EXPECT_TRUE(isnan(_acoshl(NAN)));
   EXPECT_TRUE(isnan(_acoshl(.5)));
diff --git a/test/libc/tinymath/asinh_test.c b/test/libc/tinymath/asinh_test.c
index 1a2599534..1584a1ad6 100644
--- a/test/libc/tinymath/asinh_test.c
+++ b/test/libc/tinymath/asinh_test.c
@@ -46,10 +46,8 @@ TEST(asinhf, test) {
 }
 
 TEST(asinhl, test) {
-#if LDBL_MANT_DIG > 64
   EXPECT_STREQ(".4812118250596034", gc(xdtoal(_asinhl(+.5))));
   EXPECT_STREQ("-.4812118250596034", gc(xdtoal(_asinhl(-.5))));
-#endif
   EXPECT_STREQ("0", gc(xdtoal(_asinhl(0))));
   EXPECT_STREQ("NAN", gc(xdtoal(_asinhl(NAN))));
   EXPECT_STREQ("INFINITY", gc(xdtoal(_asinhl(INFINITY))));
diff --git a/test/libc/tinymath/fdot_test.cc b/test/libc/tinymath/fdot_test.cc
deleted file mode 100644
index 9cd724926..000000000
--- a/test/libc/tinymath/fdot_test.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-#include "libc/assert.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/intrin/bsr.h"
-#include "libc/macros.h"
-#include "libc/math.h"
-#include "libc/mem/gc.h"
-#include "libc/mem/leaks.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/testlib/benchmark.h"
-#include "libc/x/xasprintf.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/immintrin.internal.h"
-
-#define EXPENSIVE_TESTS 0
-
-#define CHUNK 8
-
-#define FASTMATH __attribute__((__optimize__("-O3,-ffast-math")))
-#define PORTABLE __target_clones("avx512f,avx")
-
-int rand32(void) {
-  /* Knuth, D.E., "The Art of Computer Programming," Vol 2,
-     Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
-     p. 106 (line 26) & p. 108 */
-  static unsigned long long lcg = 1;
-  lcg *= 6364136223846793005;
-  lcg += 1442695040888963407;
-  return lcg >> 32;
-}
-
-float float01(unsigned x) {  // (0,1)
-  return 1.f / 8388608 * ((x >> 9) + .5f);
-}
-
-float numba(void) {  // (-1,1)
-  return float01(rand32()) * 2 - 1;
-}
-
-PORTABLE float fdotf_dubble(const float *A, const float *B, size_t n) {
-  double s = 0;
-  for (size_t i = 0; i < n; ++i)
-    s = fma(A[i], B[i], s);
-  return s;
-}
-
-float fdotf_kahan(const float *A, const float *B, size_t n) {
-  size_t i;
-  float err, sum, t, y;
-  sum = err = 0;
-  for (i = 0; i < n; ++i) {
-    y = A[i] * B[i] - err;
-    t = sum + y;
-    err = (t - sum) - y;
-    sum = t;
-  }
-  return sum;
-}
-
-float fdotf_naive(const float *A, const float *B, size_t n) {
-  float s = 0;
-  for (size_t i = 0; i < n; ++i)
-    s = fmaf(A[i], B[i], s);
-  return s;
-}
-
-#define fdotf_naive_tester(A, B, n, tol)                                       \
-  do {                                                                         \
-    float err = fabsf(fdotf_naive(A, B, n) - fdotf_dubble(A, B, n));           \
-    if (err > tol) {                                                           \
-      printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
-             err);                                                             \
-      exit(1);                                                                 \
-    }                                                                          \
-  } while (0)
-
-void test_fdotf_naive(void) {
-  float *A = new float[2 * 1024 * 1024 + 1];
-  float *B = new float[2 * 1024 * 1024 + 1];
-  for (size_t i = 0; i < 2 * 1024 * 1024 + 1; ++i) {
-    A[i] = numba();
-    B[i] = numba();
-  }
-  for (size_t n = 0; n < 1024; ++n)
-    fdotf_naive_tester(A, B, n, 1e-4);
-#if EXPENSIVE_TESTS
-  fdotf_naive_tester(A, B, 128 * 1024, 1e-2);
-  fdotf_naive_tester(A, B, 256 * 1024, 1e-2);
-  fdotf_naive_tester(A, B, 1024 * 1024, 1e-1);
-  fdotf_naive_tester(A, B, 1024 * 1024 - 1, 1e-1);
-  fdotf_naive_tester(A, B, 1024 * 1024 + 1, 1e-1);
-  fdotf_naive_tester(A, B, 2 * 1024 * 1024, 1e-1);
-  fdotf_naive_tester(A, B, 2 * 1024 * 1024 - 1, 1e-1);
-  fdotf_naive_tester(A, B, 2 * 1024 * 1024 + 1, 1e-1);
-#endif
-  delete[] B;
-  delete[] A;
-}
-
-template <int N>
-forceinline float hdot(const float *A, const float *B) {
-  return hdot<N / 2>(A, B) + hdot<N / 2>(A + N / 2, B + N / 2);
-}
-
-template <>
-forceinline float hdot<1>(const float *A, const float *B) {
-  return A[0] * B[0];
-}
-
-float fdotf_recursive(const float *A, const float *B, size_t n) {
-  if (n > 32) {
-    float x, y;
-    x = fdotf_recursive(A, B, n / 2);
-    y = fdotf_recursive(A + n / 2, B + n / 2, n - n / 2);
-    return x + y;
-  } else {
-    float s;
-    size_t i;
-    for (s = i = 0; i < n; ++i)
-      s = fmaf(A[i], B[i], s);
-    return s;
-  }
-}
-
-optimizespeed float fdotf_intrin(const float *A, const float *B, size_t n) {
-  size_t i = 0;
-#ifdef __AVX512F__
-  __m512 vec[CHUNK] = {};
-  for (; i + CHUNK * 16 <= n; i += CHUNK * 16)
-    for (int j = 0; j < CHUNK; ++j)
-      vec[j] = _mm512_fmadd_ps(_mm512_loadu_ps(A + i + j * 16),
-                               _mm512_loadu_ps(B + i + j * 16), vec[j]);
-  float res = 0;
-  for (int j = 0; j < CHUNK; ++j)
-    res += _mm512_reduce_add_ps(vec[j]);
-#elif defined(__aarch64__)
-  float32x4_t vec[CHUNK] = {};
-  for (; i + CHUNK * 4 <= n; i += CHUNK * 4)
-    for (int j = 0; j < CHUNK; ++j)
-      vec[j] =
-          vfmaq_f32(vec[j], vld1q_f32(A + i + j * 4), vld1q_f32(B + i + j * 4));
-  float res = 0;
-  for (int j = 0; j < CHUNK; ++j)
-    res += vaddvq_f32(vec[j]);
-#else
-  float res = 0;
-#endif
-  for (; i < n; ++i)
-    res += A[i] * B[i];
-  return res;
-}
-
-FASTMATH float fdotf_ruler(const float *A, const float *B, size_t n) {
-  int rule, step = 2;
-  size_t chunk, sp = 0;
-  float stack[bsr(n / CHUNK + 1) + 1];
-  for (chunk = 0; chunk + CHUNK * 4 <= n; chunk += CHUNK * 4, step += 2) {
-    float sum = 0;
-    for (size_t elem = 0; elem < CHUNK * 4; ++elem)
-      sum += A[chunk + elem] * B[chunk + elem];
-    for (rule = bsr(step & -step); --rule;)
-      sum += stack[--sp];
-    stack[sp++] = sum;
-  }
-  float res = 0;
-  while (sp)
-    res += stack[--sp];
-  for (; chunk < n; ++chunk)
-    res += A[chunk] * B[chunk];
-  return res;
-}
-
-#define fdotf_ruler_tester(A, B, n, tol)                                       \
-  do {                                                                         \
-    float err = fabsf(fdotf_ruler(A, B, n) - fdotf_dubble(A, B, n));           \
-    if (err > tol) {                                                           \
-      printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
-             err);                                                             \
-      exit(1);                                                                 \
-    }                                                                          \
-  } while (0)
-
-void test_fdotf_ruler(void) {
-  float *A = new float[10 * 1024 * 1024 + 1];
-  float *B = new float[10 * 1024 * 1024 + 1];
-  for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i) {
-    A[i] = numba();
-    B[i] = numba();
-  }
-  fdotf_ruler_tester(A, B, 96, 1e-6);
-  for (size_t n = 0; n < 4096; ++n)
-    fdotf_ruler_tester(A, B, n, 1e-5);
-#if EXPENSIVE_TESTS
-  fdotf_ruler_tester(A, B, 128 * 1024, 1e-4);
-  fdotf_ruler_tester(A, B, 256 * 1024, 1e-4);
-  fdotf_ruler_tester(A, B, 1024 * 1024, 1e-3);
-  fdotf_ruler_tester(A, B, 1024 * 1024 - 1, 1e-3);
-  fdotf_ruler_tester(A, B, 1024 * 1024 + 1, 1e-3);
-  fdotf_ruler_tester(A, B, 2 * 1024 * 1024, 1e-3);
-  fdotf_ruler_tester(A, B, 2 * 1024 * 1024 - 1, 1e-3);
-  fdotf_ruler_tester(A, B, 2 * 1024 * 1024 + 1, 1e-3);
-  fdotf_ruler_tester(A, B, 8 * 1024 * 1024, 1e-3);
-  fdotf_ruler_tester(A, B, 10 * 1024 * 1024, 1e-3);
-#endif
-  delete[] B;
-  delete[] A;
-}
-
-PORTABLE float fdotf_hefty(const float *A, const float *B, size_t n) {
-  if (1)
-    return 0;
-  unsigned i, par, len = 0;
-  float sum, res[n / CHUNK + 1];
-  for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
-    res[len++] = hdot<CHUNK>(A + i, B + i);
-  if (i < n) {
-    for (sum = 0; i < n; i++)
-      sum = fmaf(A[i], B[i], sum);
-    res[len++] = sum;
-  }
-  for (par = len >> 1; par; par >>= 1, len >>= 1) {
-    for (i = 0; i < par; ++i)
-      res[i] += res[par + i];
-    if (len & 1)
-      res[par - 1] += res[len - 1];
-  }
-  return res[0];
-}
-
-#define fdotf_hefty_tester(A, B, n, tol)                                       \
-  do {                                                                         \
-    float err = fabsf(fdotf_hefty(A, B, n) - fdotf_dubble(A, B, n));           \
-    if (err > tol) {                                                           \
-      printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
-             err);                                                             \
-      exit(1);                                                                 \
-    }                                                                          \
-  } while (0)
-
-void test_fdotf_hefty(void) {
-  float *A = new float[10 * 1024 * 1024 + 1];
-  float *B = new float[10 * 1024 * 1024 + 1];
-  for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i) {
-    A[i] = numba();
-    B[i] = numba();
-  }
-  for (size_t n = 0; n < 1024; ++n)
-    fdotf_hefty_tester(A, B, n, 1e-5);
-#if EXPENSIVE_TESTS
-  fdotf_hefty_tester(A, B, 128 * 1024, 1e-4);
-  fdotf_hefty_tester(A, B, 256 * 1024, 1e-4);
-  fdotf_hefty_tester(A, B, 1024 * 1024, 1e-3);
-  fdotf_hefty_tester(A, B, 1024 * 1024 - 1, 1e-3);
-  fdotf_hefty_tester(A, B, 1024 * 1024 + 1, 1e-3);
-  fdotf_hefty_tester(A, B, 2 * 1024 * 1024, 1e-3);
-  fdotf_hefty_tester(A, B, 2 * 1024 * 1024 - 1, 1e-3);
-  fdotf_hefty_tester(A, B, 2 * 1024 * 1024 + 1, 1e-3);
-  fdotf_hefty_tester(A, B, 8 * 1024 * 1024, 1e-3);
-  fdotf_hefty_tester(A, B, 10 * 1024 * 1024, 1e-3);
-#endif
-  delete[] B;
-  delete[] A;
-}
-
-float nothing(float x) {
-  return x;
-}
-
-float (*barrier)(float) = nothing;
-
-int main() {
-  ShowCrashReports();
-
-#if EXPENSIVE_TESTS
-  size_t n = 512 * 1024;
-#else
-  size_t n = 4096;
-#endif
-
-  float *A = new float[n];
-  float *B = new float[n];
-  for (size_t i = 0; i < n; ++i) {
-    A[i] = numba();
-    B[i] = numba();
-  }
-  float kahan, naive, dubble, recursive, ruler, intrin;
-  test_fdotf_naive();
-  // test_fdotf_hefty();
-  test_fdotf_ruler();
-  BENCHMARK(20, 1, (kahan = barrier(fdotf_kahan(A, B, n))));
-  BENCHMARK(20, 1, (dubble = barrier(fdotf_dubble(A, B, n))));
-  BENCHMARK(20, 1, (naive = barrier(fdotf_naive(A, B, n))));
-  BENCHMARK(20, 1, (recursive = barrier(fdotf_recursive(A, B, n))));
-  BENCHMARK(20, 1, (intrin = barrier(fdotf_intrin(A, B, n))));
-  BENCHMARK(20, 1, (ruler = barrier(fdotf_ruler(A, B, n))));
-  // BENCHMARK(20, 1, (hefty = barrier(fdotf_hefty(A, B, n))));
-  printf("dubble    = %f (%g)\n", dubble, fabs(dubble - dubble));
-  printf("kahan     = %f (%g)\n", kahan, fabs(kahan - dubble));
-  printf("naive     = %f (%g)\n", naive, fabs(naive - dubble));
-  printf("recursive = %f (%g)\n", recursive, fabs(recursive - dubble));
-  printf("intrin    = %f (%g)\n", intrin, fabs(intrin - dubble));
-  printf("ruler     = %f (%g)\n", ruler, fabs(ruler - dubble));
-  // printf("hefty     = %f (%g)\n", hefty, fabs(hefty - dubble));
-  delete[] B;
-  delete[] A;
-
-  CheckForMemoryLeaks();
-}
diff --git a/libc/testlib/exactlyequallongdouble.c b/test/libc/tinymath/fsum_test.c
similarity index 71%
rename from libc/testlib/exactlyequallongdouble.c
rename to test/libc/tinymath/fsum_test.c
index fa6f73e6c..7936e440a 100644
--- a/libc/testlib/exactlyequallongdouble.c
+++ b/test/libc/tinymath/fsum_test.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -16,20 +16,39 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/macros.internal.h"
 #include "libc/math.h"
+#include "libc/mem/gc.h"
+#include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
+#include "libc/x/xasprintf.h"
 
-bool testlib_exactlyequallongdouble(long double x, long double y) {
-  if (isnan(x) && isnan(y))
-    return true;
-  // Check that we don't have e.g. one input denormal and the other not
-  // (a denormal and a non-denormal can sometimes compare equal)
-  if (fpclassify(x) != fpclassify(y))
-    return false;
-  // Check that we don't have -0 and 0
-  if (signbit(x) != signbit(y))
-    return false;
-  if (x != y)
-    return false;
-  return true;
+#define N 100000
+
+float F[N];
+double D[N];
+
+void SetUp(void) {
+  int i;
+  for (i = 0; i < N / 2; ++i) {
+    D[i * 2 + 0] = 1000000000.1;
+    D[i * 2 + 1] = 1.1;
+  }
+  for (i = 0; i < N / 2; ++i) {
+    F[i * 2 + 0] = 1000.1;
+    F[i * 2 + 1] = 1.1;
+  }
+}
+
+TEST(fsum, test) {
+  EXPECT_STREQ("500000000.6", gc(xasprintf("%.15g", fsum(D, N) / N)));
+}
+
+TEST(fsumf, test) {
+  EXPECT_STREQ("500.6", gc(xasprintf("%.7g", fsumf(F, N) / N)));
+}
+
+BENCH(fsum, bench) {
+  EZBENCH2("fsum", donothing, fsum(D, N));
+  EZBENCH2("fsumf", donothing, fsumf(F, N));
 }
diff --git a/test/libc/tinymath/fsum_test.cc b/test/libc/tinymath/fsum_test.cc
deleted file mode 100644
index 5e51c4cc2..000000000
--- a/test/libc/tinymath/fsum_test.cc
+++ /dev/null
@@ -1,260 +0,0 @@
-#include "libc/assert.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/intrin/bsr.h"
-#include "libc/macros.h"
-#include "libc/math.h"
-#include "libc/mem/gc.h"
-#include "libc/mem/leaks.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/testlib/benchmark.h"
-#include "libc/x/xasprintf.h"
-
-#define EXPENSIVE_TESTS 0
-
-#define CHUNK 8
-
-#define FASTMATH __attribute__((__optimize__("-O3,-ffast-math")))
-#define PORTABLE __target_clones("avx512f,avx")
-
-static unsigned long long lcg = 1;
-
-int rand32(void) {
-  /* Knuth, D.E., "The Art of Computer Programming," Vol 2,
-     Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
-     p. 106 (line 26) & p. 108 */
-  lcg *= 6364136223846793005;
-  lcg += 1442695040888963407;
-  return lcg >> 32;
-}
-
-float float01(unsigned x) {  // (0,1)
-  return 1.f / 8388608 * ((x >> 9) + .5f);
-}
-
-float numba(void) {  // (-1,1)
-  return float01(rand32()) * 2 - 1;
-}
-
-FASTMATH PORTABLE float fsumf_dubble(const float *p, size_t n) {
-  double s = 0;
-  for (size_t i = 0; i < n; ++i)
-    s += p[i];
-  return s;
-}
-
-PORTABLE float fsumf_kahan(const float *p, size_t n) {
-  size_t i;
-  float err, sum, t, y;
-  sum = err = 0;
-  for (i = 0; i < n; ++i) {
-    y = p[i] - err;
-    t = sum + y;
-    err = (t - sum) - y;
-    sum = t;
-  }
-  return sum;
-}
-
-FASTMATH PORTABLE float fsumf_naive(const float *p, size_t n) {
-  float s = 0;
-  for (size_t i = 0; i < n; ++i)
-    s += p[i];
-  return s;
-}
-
-#define fsumf_naive_tester(A, n, tol)                                          \
-  do {                                                                         \
-    float err = fabsf(fsumf_naive(A, n) - fsumf_dubble(A, n));                 \
-    if (err > tol) {                                                           \
-      printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
-             err);                                                             \
-      exit(1);                                                                 \
-    }                                                                          \
-  } while (0)
-
-void test_fsumf_naive(void) {
-  float *A = new float[2 * 1024 * 1024 + 1];
-  for (size_t i = 0; i < 2 * 1024 * 1024 + 1; ++i)
-    A[i] = numba();
-  for (size_t n = 0; n < 1024; ++n)
-    fsumf_naive_tester(A, n, 1e-4);
-#if EXPENSIVE_TESTS
-  fsumf_naive_tester(A, 128 * 1024, 1e-2);
-  fsumf_naive_tester(A, 256 * 1024, 1e-2);
-  fsumf_naive_tester(A, 1024 * 1024, 1e-1);
-  fsumf_naive_tester(A, 1024 * 1024 - 1, 1e-1);
-  fsumf_naive_tester(A, 1024 * 1024 + 1, 1e-1);
-  fsumf_naive_tester(A, 2 * 1024 * 1024, 1e-1);
-  fsumf_naive_tester(A, 2 * 1024 * 1024 - 1, 1e-1);
-  fsumf_naive_tester(A, 2 * 1024 * 1024 + 1, 1e-1);
-#endif
-  delete[] A;
-}
-
-template <int N>
-forceinline float hsum(const float *p) {
-  return hsum<N / 2>(p) + hsum<N / 2>(p + N / 2);
-}
-
-template <>
-forceinline float hsum<1>(const float *p) {
-  return *p;
-}
-
-FASTMATH PORTABLE float fsumf_recursive(const float *p, size_t n) {
-  if (n > 32) {
-    float x, y;
-    x = fsumf_recursive(p, n / 2);
-    y = fsumf_recursive(p + n / 2, n - n / 2);
-    return x + y;
-  } else {
-    float s;
-    size_t i;
-    for (s = i = 0; i < n; ++i)
-      s += p[i];
-    return s;
-  }
-}
-
-FASTMATH PORTABLE float fsumf_ruler(const float *p, size_t n) {
-  size_t i, sp = 0;
-  int rule, step = 2;
-  float stack[bsr(n / CHUNK + 1) + 1];
-  for (i = 0; i + CHUNK * 4 <= n; i += CHUNK * 4, step += 2) {
-    float sum = 0;
-    for (size_t j = 0; j < CHUNK * 4; ++j)
-      sum += p[i + j];
-    for (rule = bsr(step & -step); --rule;)
-      sum += stack[--sp];
-    stack[sp++] = sum;
-  }
-  float res = 0;
-  while (sp)
-    res += stack[--sp];
-  while (i < n)
-    res += p[i++];
-  return res;
-}
-
-#define fsumf_ruler_tester(A, n, tol)                                          \
-  do {                                                                         \
-    float err = fabsf(fsumf_ruler(A, n) - fsumf_dubble(A, n));                 \
-    if (err > tol) {                                                           \
-      printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
-             err);                                                             \
-      exit(1);                                                                 \
-    }                                                                          \
-  } while (0)
-
-void test_fsumf_ruler(void) {
-  float *A = new float[10 * 1024 * 1024 + 1];
-  for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i)
-    A[i] = numba();
-  fsumf_ruler_tester(A, 96, 1e-6);
-  for (size_t n = 0; n < 1024; ++n)
-    fsumf_ruler_tester(A, n, 1e-5);
-#if EXPENSIVE_TESTS
-  fsumf_ruler_tester(A, 128 * 1024, 1e-4);
-  fsumf_ruler_tester(A, 256 * 1024, 1e-4);
-  fsumf_ruler_tester(A, 1024 * 1024, 1e-3);
-  fsumf_ruler_tester(A, 1024 * 1024 - 1, 1e-3);
-  fsumf_ruler_tester(A, 1024 * 1024 + 1, 1e-3);
-  fsumf_ruler_tester(A, 2 * 1024 * 1024, 1e-3);
-  fsumf_ruler_tester(A, 2 * 1024 * 1024 - 1, 1e-3);
-  fsumf_ruler_tester(A, 2 * 1024 * 1024 + 1, 1e-3);
-  fsumf_ruler_tester(A, 8 * 1024 * 1024, 1e-3);
-  fsumf_ruler_tester(A, 10 * 1024 * 1024, 1e-3);
-#endif
-  delete[] A;
-}
-
-FASTMATH PORTABLE float fsumf_hefty(const float *p, size_t n) {
-  unsigned i, par, len = 0;
-  float sum, res[n / CHUNK + 1];
-  for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
-    res[len++] = hsum<CHUNK>(p + i);
-  if (i < n) {
-    for (sum = 0; i < n; i++)
-      sum += p[i];
-    res[len++] = sum;
-  }
-  for (par = len >> 1; par; par >>= 1, len >>= 1) {
-    for (i = 0; i < par; ++i)
-      res[i] += res[par + i];
-    if (len & 1)
-      res[par - 1] += res[len - 1];
-  }
-  return res[0];
-}
-
-#define fsumf_hefty_tester(A, n, tol)                                          \
-  do {                                                                         \
-    float err = fabsf(fsumf_hefty(A, n) - fsumf_dubble(A, n));                 \
-    if (err > tol) {                                                           \
-      printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
-             err);                                                             \
-      exit(1);                                                                 \
-    }                                                                          \
-  } while (0)
-
-void test_fsumf_hefty(void) {
-  float *A = new float[10 * 1024 * 1024 + 1];
-  for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i)
-    A[i] = numba();
-  for (size_t n = 0; n < 1024; ++n)
-    fsumf_hefty_tester(A, n, 1e-5);
-#if EXPENSIVE_TESTS
-  fsumf_hefty_tester(A, 128 * 1024, 1e-4);
-  fsumf_hefty_tester(A, 256 * 1024, 1e-4);
-  fsumf_hefty_tester(A, 1024 * 1024, 1e-3);
-  fsumf_hefty_tester(A, 1024 * 1024 - 1, 1e-3);
-  fsumf_hefty_tester(A, 1024 * 1024 + 1, 1e-3);
-  fsumf_hefty_tester(A, 2 * 1024 * 1024, 1e-3);
-  fsumf_hefty_tester(A, 2 * 1024 * 1024 - 1, 1e-3);
-  fsumf_hefty_tester(A, 2 * 1024 * 1024 + 1, 1e-3);
-  fsumf_hefty_tester(A, 8 * 1024 * 1024, 1e-3);
-  fsumf_hefty_tester(A, 10 * 1024 * 1024, 1e-3);
-#endif
-  delete[] A;
-}
-
-float nothing(float x) {
-  return x;
-}
-
-float (*barrier)(float) = nothing;
-
-int main() {
-  ShowCrashReports();
-
-#if EXPENSIVE_TESTS
-  size_t n = 4 * 1024 * 1024;
-#else
-  size_t n = 1024;
-#endif
-
-  float *p = new float[n];
-  for (size_t i = 0; i < n; ++i)
-    p[i] = numba();
-  float kahan, naive, dubble, recursive, hefty, ruler;
-  test_fsumf_naive();
-  test_fsumf_hefty();
-  test_fsumf_ruler();
-  BENCHMARK(20, 1, (kahan = barrier(fsumf_kahan(p, n))));
-  BENCHMARK(20, 1, (dubble = barrier(fsumf_dubble(p, n))));
-  BENCHMARK(20, 1, (naive = barrier(fsumf_naive(p, n))));
-  BENCHMARK(20, 1, (recursive = barrier(fsumf_recursive(p, n))));
-  BENCHMARK(20, 1, (ruler = barrier(fsumf_ruler(p, n))));
-  BENCHMARK(20, 1, (hefty = barrier(fsumf_hefty(p, n))));
-  printf("dubble    = %f (%g)\n", dubble, fabs(dubble - dubble));
-  printf("kahan     = %f (%g)\n", kahan, fabs(kahan - dubble));
-  printf("naive     = %f (%g)\n", naive, fabs(naive - dubble));
-  printf("recursive = %f (%g)\n", recursive, fabs(recursive - dubble));
-  printf("ruler     = %f (%g)\n", ruler, fabs(ruler - dubble));
-  printf("hefty     = %f (%g)\n", hefty, fabs(hefty - dubble));
-  delete[] p;
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/libc/tinymath/hypot_test.c b/test/libc/tinymath/hypot_test.c
index 37ede41a0..802d994ed 100644
--- a/test/libc/tinymath/hypot_test.c
+++ b/test/libc/tinymath/hypot_test.c
@@ -99,10 +99,8 @@ TEST(hypotll, test) {
   EXPECT_STREQ("1.414213562373095", gc(xdtoal(_hypotl(-1, 1))));
   EXPECT_STREQ("1.414213626012708", gc(xdtoal(_hypotl(1.0000001, .99999999))));
   EXPECT_STREQ("1.414213626012708", gc(xdtoal(_hypotl(.99999999, 1.0000001))));
-#if LDBL_MANT_DIG > 64
   EXPECT_STREQ("1.414213562373095e+4931",
                gc(xdtoal(_hypotl(1e4931L, 1e4931L))));
-#endif
   EXPECT_STREQ("NAN", gc(xdtoal(_hypotl(0, NAN))));
   EXPECT_STREQ("NAN", gc(xdtoal(_hypotl(NAN, 0))));
   EXPECT_STREQ("NAN", gc(xdtoal(_hypotl(NAN, NAN))));
diff --git a/test/libc/tinymath/powl_test.c b/test/libc/tinymath/powl_test.c
index 8d25a7186..0c2f3bada 100644
--- a/test/libc/tinymath/powl_test.c
+++ b/test/libc/tinymath/powl_test.c
@@ -20,7 +20,7 @@
 #include "libc/calls/struct/siginfo.h"
 #include "libc/calls/ucontext.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/runtime/pc.internal.h"
diff --git a/test/libc/tinymath/strtod_test.c b/test/libc/tinymath/strtod_test.c
index e6a64531c..05e46173e 100644
--- a/test/libc/tinymath/strtod_test.c
+++ b/test/libc/tinymath/strtod_test.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/conv.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/fenv.h"
diff --git a/test/libc/tinymath/tanh_test.c b/test/libc/tinymath/tanh_test.c
index bf3062602..c59e5511c 100644
--- a/test/libc/tinymath/tanh_test.c
+++ b/test/libc/tinymath/tanh_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/conv.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/runtime/runtime.h"
diff --git a/test/libc/x/utf8to32_test.c b/test/libc/x/utf8to32_test.c
index 5f54e2d17..cf17662bb 100644
--- a/test/libc/x/utf8to32_test.c
+++ b/test/libc/x/utf8to32_test.c
@@ -18,6 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
+#include "libc/mem/shuffle.internal.h"
 #include "libc/stdio/rand.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/hyperion.h"
@@ -81,20 +82,11 @@ TEST(utf32to8, testLargeThompsonPikeEncoded) {
           -1, 0)));
 }
 
-void shuffle(wchar_t *a, int n) {
-  for (int i = n - 1; i >= 1; --i) {
-    int r = rand() % (i + 1);
-    wchar_t t = a[r];
-    a[r] = a[i];
-    a[i] = t;
-  }
-}
-
 char *GenerateBranchyUtf8Text(size_t *out_n) {
   char *p;
   size_t n;
   wchar_t *q = gc(utf8to32(kViewables, kViewablesSize, &n));
-  shuffle(q, n);
+  shuffle(lemur64, q, n);
   p = utf32to8(q, n, &n);
   if (out_n)
     *out_n = n;
diff --git a/test/libc/xed/x86ild_lib.c b/test/libc/xed/x86ild_lib.c
index af8d699ae..e7cc1b0e7 100644
--- a/test/libc/xed/x86ild_lib.c
+++ b/test/libc/xed/x86ild_lib.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/bing.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/str/str.h"
 #include "libc/testlib/testlib.h"
diff --git a/test/libcxx/cexception_test.cc b/test/libcxx/cexception_test.cc
index 28aa20d48..0e09c014b 100644
--- a/test/libcxx/cexception_test.cc
+++ b/test/libcxx/cexception_test.cc
@@ -21,6 +21,13 @@
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 
+// this dontthrow keyword SHOULD break this test. it's probably passing
+// because we're currently using SjLj exceptions. the day we can change
+// things, remove `dontthrow` and this test will still be a useful help
+extern "C" dontthrow void qsort_(void *, size_t, size_t,
+                                 int (*)(const void *,
+                                         const void *)) asm("qsort");
+
 struct Resource {
   char *p;
   Resource() {
@@ -53,7 +60,7 @@ int A[3] = {3, 2, 1};
 int Work(void) {
   Resource r;
   pPoke(r.p);
-  qsort(A, 3, sizeof(int), cmp);
+  qsort_(A, 3, sizeof(int), cmp);
   return A[0];
 }
 int (*pWork)(void) = Work;
diff --git a/test/libcxx/errc_test.cc b/test/libcxx/errc_test.cc
deleted file mode 100644
index 373cdf78c..000000000
--- a/test/libcxx/errc_test.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-#include <cerrno>
-#include <system_error>
-
-bool test_errc_mapping(int posix_error, std::errc expected_errc) {
-  std::error_code ec(posix_error, std::generic_category());
-  return ec.value() == posix_error &&  //
-         std::error_condition(expected_errc) == ec;
-}
-
-int main() {
-  if (!test_errc_mapping(EACCES, std::errc::permission_denied))
-    return 1;
-  if (!test_errc_mapping(ENOENT, std::errc::no_such_file_or_directory))
-    return 2;
-  if (!test_errc_mapping(EEXIST, std::errc::file_exists))
-    return 3;
-  if (!test_errc_mapping(EINVAL, std::errc::invalid_argument))
-    return 4;
-  if (!test_errc_mapping(ENOSPC, std::errc::no_space_on_device))
-    return 5;
-  return 0;
-}
diff --git a/test/libcxx/filesystem_test.cc b/test/libcxx/filesystem_test.cc
deleted file mode 100644
index 762e650b2..000000000
--- a/test/libcxx/filesystem_test.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-#include <chrono>
-#include <cstdio>
-#include <filesystem>
-#include <fstream>
-#include <random>
-#include <string>
-#include <thread>
-
-#define ASSERT(condition)                                           \
-  if (!(condition)) {                                               \
-    fprintf(stderr, "%s:%d: test failed: %s\n", __FILE__, __LINE__, \
-            #condition);                                            \
-    return 1;                                                       \
-  }
-
-namespace fs = std::filesystem;
-
-fs::path g_temp_path;
-fs::path g_orig_path;
-std::string g_tmpdir;
-
-void setup() {
-  g_orig_path = fs::current_path();
-  fs::path temp_path = fs::temp_directory_path();
-  auto now = std::chrono::system_clock::now();
-  auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
-  auto value = now_ms.time_since_epoch();
-  long duration = value.count();
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_int_distribution<> dis(0, 999999);
-  int random_number = dis(gen);
-  std::string dir_name =
-      "temp_" + std::to_string(duration) + "_" + std::to_string(random_number);
-  g_temp_path = temp_path / dir_name;
-  fs::create_directory(g_temp_path);
-  fs::current_path(g_temp_path);
-  fs::create_directory("tmp");
-  g_tmpdir = fs::absolute("tmp");
-  setenv("TMPDIR", g_tmpdir.c_str(), true);
-}
-
-void teardown() {
-  fs::current_path(g_orig_path);
-  fs::remove_all(g_temp_path);
-}
-
-int test_create_directory() {
-  fs::path dir = "test_dir";
-  ASSERT(fs::create_directory(dir));
-  ASSERT(fs::is_directory(dir));
-  ASSERT(!fs::create_directory(dir));
-  fs::remove(dir);
-  return 0;
-}
-
-int test_create_directories() {
-  fs::path dirs = "test_dir/nested/deep";
-  ASSERT(fs::create_directories(dirs));
-  ASSERT(fs::is_directory(dirs));
-  ASSERT(!fs::create_directories(dirs));
-  fs::remove_all("test_dir");
-  return 0;
-}
-
-int test_remove() {
-  fs::path file = "test_file.txt";
-  std::ofstream(file).put('a');
-  ASSERT(fs::remove(file));
-  ASSERT(!fs::remove(file));
-  return 0;
-}
-
-int test_remove_all() {
-  fs::path dir = "test_dir/nested/deep";
-  fs::create_directories(dir);
-  ASSERT(fs::remove_all("test_dir") > 0);
-  ASSERT(fs::remove_all("test_dir") == 0);
-  return 0;
-}
-
-int test_rename() {
-  fs::path old_name = "old.txt";
-  fs::path new_name = "new.txt";
-  std::ofstream(old_name).put('a');
-  fs::rename(old_name, new_name);
-  ASSERT(!fs::exists(old_name));
-  ASSERT(fs::exists(new_name));
-  fs::remove(new_name);
-  return 0;
-}
-
-int test_copy() {
-  fs::path src = "src.txt";
-  fs::path dst = "dst.txt";
-  std::ofstream(src) << "test";
-  fs::copy(src, dst);
-  ASSERT(fs::exists(dst));
-  ASSERT(fs::file_size(src) == fs::file_size(dst));
-  fs::remove(src);
-  fs::remove(dst);
-  return 0;
-}
-
-int test_copy_file() {
-  fs::path src = "src.txt";
-  fs::path dst = "dst.txt";
-  std::ofstream(src) << "test";
-  ASSERT(fs::copy_file(src, dst));
-  ASSERT(!fs::copy_file(src, dst, fs::copy_options::skip_existing));
-  fs::remove(src);
-  fs::remove(dst);
-  return 0;
-}
-
-int test_exists() {
-  fs::path file = "test.txt";
-  ASSERT(!fs::exists(file));
-  std::ofstream(file).put('a');
-  ASSERT(fs::exists(file));
-  fs::remove(file);
-  return 0;
-}
-
-int test_is_regular_file() {
-  fs::path file = "test.txt";
-  fs::path dir = "test_dir";
-  std::ofstream(file).put('a');
-  fs::create_directory(dir);
-  ASSERT(fs::is_regular_file(file));
-  ASSERT(!fs::is_regular_file(dir));
-  fs::remove(file);
-  fs::remove(dir);
-  return 0;
-}
-
-int test_is_directory() {
-  fs::path file = "test.txt";
-  fs::path dir = "test_dir";
-  std::ofstream(file).put('a');
-  fs::create_directory(dir);
-  ASSERT(!fs::is_directory(file));
-  ASSERT(fs::is_directory(dir));
-  fs::remove(file);
-  fs::remove(dir);
-  return 0;
-}
-
-int test_is_symlink() {
-  fs::path file = "test.txt";
-  fs::path link = "test_link";
-  std::ofstream(file).put('a');
-  fs::create_symlink(file, link);
-  ASSERT(!fs::is_symlink(file));
-  ASSERT(fs::is_symlink(link));
-  fs::remove(file);
-  fs::remove(link);
-  return 0;
-}
-
-int test_file_size() {
-  fs::path file = "test.txt";
-  std::ofstream(file) << "test";
-  ASSERT(fs::file_size(file) == 4);
-  fs::remove(file);
-  return 0;
-}
-
-int test_last_write_time() {
-  fs::path file = "test.txt";
-  auto now = fs::file_time_type::clock::now();
-  std::ofstream(file).put('a');
-  fs::last_write_time(file, now);
-  ASSERT(fs::last_write_time(file) == now);
-  fs::remove(file);
-  return 0;
-}
-
-int test_permissions() {
-  fs::path file = "test.txt";
-  std::ofstream(file).put('a');
-  fs::permissions(file, fs::perms::owner_read | fs::perms::owner_write);
-  auto perms = fs::status(file).permissions();
-  ASSERT((perms & fs::perms::owner_read) != fs::perms::none);
-  ASSERT((perms & fs::perms::owner_write) != fs::perms::none);
-  ASSERT((perms & fs::perms::owner_exec) == fs::perms::none);
-  fs::remove(file);
-  return 0;
-}
-
-int test_current_path() {
-  auto original_path = fs::current_path();
-  fs::path new_path = fs::temp_directory_path();
-  fs::current_path(new_path);
-  ASSERT(fs::current_path() == new_path);
-  fs::current_path(original_path);
-  return 0;
-}
-
-int test_absolute() {
-  fs::path relative = "test.txt";
-  auto abs_path = fs::absolute(relative);
-  ASSERT(abs_path.is_absolute());
-  return 0;
-}
-
-int test_canonical() {
-  fs::path dir = "test_dir";
-  fs::path file = dir / "test.txt";
-  fs::create_directories(dir);
-  std::ofstream(file).put('a');
-  auto can_path = fs::canonical(file);
-  ASSERT(can_path.is_absolute());
-  ASSERT(!can_path.lexically_normal().string().empty());
-  fs::remove_all(dir);
-  return 0;
-}
-
-int test_read_symlink() {
-  fs::path file = "test.txt";
-  fs::path link = "test_link";
-  std::ofstream(file).put('a');
-  fs::create_symlink(file, link);
-  ASSERT(fs::read_symlink(link) == file);
-  fs::remove(file);
-  fs::remove(link);
-  return 0;
-}
-
-int test_create_symlink_and_hard_link() {
-  fs::path file = "test.txt";
-  fs::path symlink = "test_symlink";
-  fs::path hardlink = "test_hardlink";
-  std::ofstream(file).put('a');
-  fs::create_symlink(file, symlink);
-  fs::create_hard_link(file, hardlink);
-  ASSERT(fs::exists(symlink));
-  ASSERT(fs::exists(hardlink));
-  ASSERT(fs::is_symlink(symlink));
-  ASSERT(!fs::is_symlink(hardlink));
-  fs::remove(file);
-  fs::remove(symlink);
-  fs::remove(hardlink);
-  return 0;
-}
-
-int test_space() {
-  auto space_info = fs::space(".");
-  ASSERT(space_info.capacity > 0);
-  ASSERT(space_info.free > 0);
-  ASSERT(space_info.available > 0);
-  return 0;
-}
-
-int test_equivalent() {
-  fs::path file1 = "test1.txt";
-  fs::path file2 = "test2.txt";
-  std::ofstream(file1).put('a');
-  fs::create_hard_link(file1, file2);
-  ASSERT(fs::equivalent(file1, file2));
-  fs::remove(file1);
-  fs::remove(file2);
-  return 0;
-}
-
-int test_resize_file() {
-  fs::path file = "test.txt";
-  std::ofstream(file) << "test";
-  fs::resize_file(file, 10);
-  ASSERT(fs::file_size(file) == 10);
-  fs::remove(file);
-  return 0;
-}
-
-int test_status() {
-  fs::path file = "test.txt";
-  std::ofstream(file).put('a');
-  auto status = fs::status(file);
-  ASSERT(status.type() == fs::file_type::regular);
-  fs::remove(file);
-  return 0;
-}
-
-int test_copy_enoent() {
-  fs::path src = "non_existent_file.txt";
-  fs::path dst = "destination.txt";
-  try {
-    fs::copy(src, dst);
-    ASSERT(false);
-  } catch (const fs::filesystem_error& e) {
-    if (e.code() == std::errc::no_such_file_or_directory) {
-      return 0;
-    } else {
-      ASSERT(false);
-    }
-  } catch (const std::exception& e) {
-    ASSERT(false);
-  }
-}
-
-#define RUN(func)  \
-  result = func(); \
-  if (result)      \
-  return result
-
-int test() {
-  int result = 0;
-  RUN(test_copy_enoent);
-  RUN(test_create_directory);
-  RUN(test_create_directories);
-  RUN(test_remove);
-  RUN(test_remove_all);
-  RUN(test_rename);
-  RUN(test_copy);
-  RUN(test_copy_file);
-  RUN(test_exists);
-  RUN(test_is_regular_file);
-  RUN(test_is_directory);
-  RUN(test_is_symlink);
-  RUN(test_file_size);
-  RUN(test_last_write_time);
-  RUN(test_permissions);
-  RUN(test_current_path);
-  RUN(test_absolute);
-  RUN(test_canonical);
-  RUN(test_read_symlink);
-  RUN(test_create_symlink_and_hard_link);
-  RUN(test_space);
-  RUN(test_equivalent);
-  RUN(test_resize_file);
-  RUN(test_status);
-  return 0;
-}
-
-int main() {
-  int rc;
-  setup();
-  rc = test();
-  teardown();
-  return rc;
-}
diff --git a/test/math/bf16_test.c b/test/math/bf16_test.c
deleted file mode 100644
index 05df1f63f..000000000
--- a/test/math/bf16_test.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <math.h>
-
-#define CHECK(x) \
-  if (!(x))      \
-  return __LINE__
-#define FALSE(x)          \
-  {                       \
-    volatile bool x_ = x; \
-    if (x_)               \
-      return __LINE__;    \
-  }
-#define TRUE(x)           \
-  {                       \
-    volatile bool x_ = x; \
-    if (!x_)              \
-      return __LINE__;    \
-  }
-
-__bf16 identity(__bf16 x) {
-  return x;
-}
-__bf16 (*half)(__bf16) = identity;
-
-unsigned toint(float f) {
-  union {
-    float f;
-    unsigned i;
-  } u = {f};
-  return u.i;
-}
-
-int main() {
-  volatile float f;
-  volatile double d;
-  volatile __bf16 pi = 3.141;
-
-  // half → float → half
-  f = pi;
-  pi = f;
-
-  // half → float
-  float __extendbfsf2(__bf16);
-  CHECK(0.f == __extendbfsf2(0));
-  CHECK(3.140625f == __extendbfsf2(pi));
-  CHECK(3.140625f == pi);
-
-  // half → double → half
-  d = pi;
-  pi = d;
-
-  // float → half
-  __bf16 __truncsfbf2(float);
-  CHECK(0 == (float)__truncsfbf2(0));
-  CHECK(pi == (float)__truncsfbf2(3.141f));
-  CHECK(3.140625f == (float)__truncsfbf2(3.141f));
-
-  // double → half
-  __bf16 __truncdfbf2(double);
-  CHECK(0 == (double)__truncdfbf2(0));
-  CHECK(3.140625 == (double)__truncdfbf2(3.141));
-
-  // specials
-  volatile __bf16 nan = NAN;
-  volatile __bf16 positive_infinity = +INFINITY;
-  volatile __bf16 negative_infinity = -INFINITY;
-  CHECK(isnan(nan));
-  CHECK(!isinf(pi));
-  CHECK(!isnan(pi));
-  CHECK(isinf(positive_infinity));
-  CHECK(isinf(negative_infinity));
-  CHECK(!isnan(positive_infinity));
-  CHECK(!isnan(negative_infinity));
-  CHECK(!signbit(pi));
-  CHECK(signbit(half(-pi)));
-  CHECK(!signbit(half(+0.)));
-  CHECK(signbit(half(-0.)));
-
-  // arithmetic
-  CHECK(half(-3) == -half(3));
-  CHECK(half(9) == half(3) * half(3));
-  CHECK(half(0) == half(pi) - half(pi));
-  CHECK(half(6.28125) == half(pi) + half(pi));
-
-  // comparisons
-  CHECK(half(3) > half(2));
-  CHECK(half(3) < half(4));
-  CHECK(half(3) <= half(3));
-  CHECK(half(3) >= half(3));
-  TRUE(half(NAN) != half(NAN));
-  FALSE(half(NAN) == half(NAN));
-  TRUE(half(3) != half(NAN));
-  FALSE(half(3) == half(NAN));
-  TRUE(half(NAN) != half(3));
-  FALSE(half(NAN) == half(3));
-}
diff --git a/test/net/http/parsehttpmessage_test.c b/test/net/http/parsehttpmessage_test.c
index 9b63cd629..5a8412614 100644
--- a/test/net/http/parsehttpmessage_test.c
+++ b/test/net/http/parsehttpmessage_test.c
@@ -17,7 +17,6 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
-#include "libc/intrin/kprintf.h"
 #include "libc/log/check.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
@@ -435,24 +434,6 @@ TEST(ParseHttpResponse, testHttp100) {
   EXPECT_EQ(10, req->version);
 }
 
-TEST(ParseHttpMessage, issue1315) {
-  static const char m[] = "\
-HTTP/1.1 200 OK\r\n\
-padding: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\r\n\
-Date: Sat, 12 Oct 2024 22:50:55 GMT\r\n\
-Server: redbean/3.0.0\r\n\
-Connection: close\r\n\
-Content-Type: text/html; charset=utf-8\r\n\
-Referrer-Policy: no-referrer-when-downgrade\r\n\
-Content-Length: 12\r\n\
-\r\n";
-  InitHttpMessage(req, kHttpResponse);
-  EXPECT_EQ(0, ParseHttpMessage(req, m, strlen(m) - 1, strlen(m)));
-  EXPECT_EQ(200, req->status);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 void DoTiniestHttpRequest(void) {
   static const char m[] = "\
 GET /\r\n\
diff --git a/test/net/https/mbedtls_test.c b/test/net/https/mbedtls_test.c
index 992ffdf02..34eea7401 100644
--- a/test/net/https/mbedtls_test.c
+++ b/test/net/https/mbedtls_test.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/intrin/bswap.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/nexgen32e/crc32.h"
 #include "libc/nexgen32e/nexgen32e.h"
diff --git a/test/posix/BUILD.mk b/test/posix/BUILD.mk
index aabd202d6..dafa1d9c1 100644
--- a/test/posix/BUILD.mk
+++ b/test/posix/BUILD.mk
@@ -3,76 +3,59 @@
 
 PKGS += TEST_POSIX
 
-TEST_POSIX_SRCS :=							\
+TEST_POSIX_SRCS :=				\
 	$(wildcard test/posix/*.c)
 
-TEST_POSIX_SRCS_TEST =							\
+TEST_POSIX_SRCS_TEST =				\
 	$(filter %_test.c,$(TEST_POSIX_SRCS))
 
-TEST_POSIX_OBJS =							\
+TEST_POSIX_OBJS =				\
 	$(TEST_POSIX_SRCS:%.c=o/$(MODE)/%.o)
 
-TEST_POSIX_COMS =							\
-	$(TEST_POSIX_SRCS_TEST:%.c=o/$(MODE)/%)				\
-	o/$(MODE)/test/posix/file_offset_exec_prog
+TEST_POSIX_COMS =				\
+	$(TEST_POSIX_SRCS_TEST:%.c=o/$(MODE)/%)
 
-TEST_POSIX_BINS =							\
-	$(TEST_POSIX_COMS)						\
+TEST_POSIX_BINS =				\
+	$(TEST_POSIX_COMS)			\
 	$(TEST_POSIX_COMS:%=%.dbg)
 
-TEST_POSIX_TESTS =							\
+TEST_POSIX_TESTS =				\
 	$(TEST_POSIX_SRCS_TEST:%.c=o/$(MODE)/%.ok)
 
-TEST_POSIX_CHECKS =							\
+TEST_POSIX_CHECKS =				\
 	$(TEST_POSIX_SRCS_TEST:%.c=o/$(MODE)/%.runs)
 
-TEST_POSIX_DIRECTDEPS =							\
-	LIBC_CALLS							\
-	LIBC_FMT							\
-	LIBC_INTRIN							\
-	LIBC_MEM							\
-	LIBC_PROC							\
-	LIBC_LOG							\
-	LIBC_RUNTIME							\
-	LIBC_SOCK							\
-	LIBC_STDIO							\
-	LIBC_STR							\
-	LIBC_SYSV							\
-	LIBC_THREAD							\
-	THIRD_PARTY_MUSL						\
+TEST_POSIX_DIRECTDEPS =				\
+	LIBC_CALLS				\
+	LIBC_FMT				\
+	LIBC_INTRIN				\
+	LIBC_MEM				\
+	LIBC_PROC				\
+	LIBC_RUNTIME				\
+	LIBC_STDIO				\
+	LIBC_STR				\
+	LIBC_SYSV				\
+	LIBC_THREAD
 
-TEST_POSIX_DEPS :=							\
+TEST_POSIX_DEPS :=				\
 	$(call uniq,$(foreach x,$(TEST_POSIX_DIRECTDEPS),$($(x))))
 
-o/$(MODE)/test/posix/posix.pkg:						\
-		$(TEST_POSIX_OBJS)					\
+o/$(MODE)/test/posix/posix.pkg:			\
+		$(TEST_POSIX_OBJS)		\
 		$(foreach x,$(TEST_POSIX_DIRECTDEPS),$($(x)_A).pkg)
 
-o/$(MODE)/test/posix/%.dbg:						\
-		$(TEST_POSIX_DEPS)					\
-		o/$(MODE)/test/posix/%.o				\
-		o/$(MODE)/test/posix/posix.pkg				\
-		$(CRT)							\
+o/$(MODE)/test/posix/%.dbg:			\
+		$(TEST_POSIX_DEPS)		\
+		o/$(MODE)/test/posix/%.o	\
+		o/$(MODE)/test/posix/posix.pkg	\
+		$(CRT)				\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
-o/$(MODE)/test/posix/file_offset_exec_test.dbg:				\
-		$(TEST_POSIX_DEPS)					\
-		o/$(MODE)/test/posix/file_offset_exec_test.o		\
-		o/$(MODE)/test/posix/file_offset_exec_prog.zip.o	\
-		o/$(MODE)/test/posix/posix.pkg				\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/test/posix/file_offset_exec_prog.zip.o: private		\
-		ZIPOBJ_FLAGS +=						\
-			-B
-
-o/$(MODE)/test/posix/fread3gb_test.runs:				\
+o/$(MODE)/test/posix/fread3gb_test.runs:	\
 		private QUOTA += -F5gb -M5gb
 
 .PHONY: o/$(MODE)/test/posix
-o/$(MODE)/test/posix:							\
-		$(TEST_POSIX_BINS)					\
+o/$(MODE)/test/posix:				\
+		$(TEST_POSIX_BINS)		\
 		$(TEST_POSIX_CHECKS)
diff --git a/test/posix/accept4_nonblock_test.c b/test/posix/accept4_nonblock_test.c
deleted file mode 100644
index 2033e3d76..000000000
--- a/test/posix/accept4_nonblock_test.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <poll.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-int main() {
-
-  // Create server socket
-  int server_fd;
-  struct sockaddr_in address;
-  int addrlen = sizeof(address);
-  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
-    return 1;
-  address.sin_family = AF_INET;
-  address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  address.sin_port = 0;  // let os assign random port
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 2;
-  if (getsockname(server_fd, (struct sockaddr *)&address,
-                  (socklen_t *)&addrlen))
-    return 3;
-  if (listen(server_fd, SOMAXCONN))
-    return 4;
-
-  {
-    // poll server
-    struct pollfd fds[2] = {
-        {server_fd, POLLIN | POLLOUT},
-    };
-    int ret = poll(fds, 1, 0);
-    if (ret != 0)
-      return 5;
-  }
-
-  // create client socket
-  int client_fd;
-  if ((client_fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
-    return 6;
-  if (connect(client_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 7;
-
-  // accept client
-  int server_client_fd;
-  if ((server_client_fd = accept4(server_fd, 0, 0, SOCK_NONBLOCK)) == -1)
-    return 8;
-
-  // check that it's non-blocking
-  char buf[1];
-  if (read(server_client_fd, buf, 1) != -1)
-    return 9;
-  if (errno != EAGAIN && errno != EWOULDBLOCK)
-    return 10;
-
-  // Clean up
-  if (close(server_client_fd))
-    return 12;
-  if (close(client_fd))
-    return 13;
-  if (close(server_fd))
-    return 14;
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/posix/accept_inherit_nonblock_test.c b/test/posix/accept_inherit_nonblock_test.c
deleted file mode 100644
index 42a938e06..000000000
--- a/test/posix/accept_inherit_nonblock_test.c
+++ /dev/null
@@ -1,85 +0,0 @@
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <poll.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-void on_signal(int sig) {
-}
-
-int main() {
-
-  // Create server socket
-  int server_fd;
-  struct sockaddr_in address;
-  int addrlen = sizeof(address);
-  if ((server_fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0)) == -1)
-    return 1;
-  address.sin_family = AF_INET;
-  address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  address.sin_port = 0;  // let os assign random port
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 2;
-  if (getsockname(server_fd, (struct sockaddr *)&address,
-                  (socklen_t *)&addrlen))
-    return 3;
-  if (listen(server_fd, SOMAXCONN))
-    return 4;
-
-  {
-    // poll server
-    struct pollfd fds[] = {{server_fd, POLLIN | POLLOUT}};
-    int ret = poll(fds, 1, 0);
-    if (ret != 0)
-      return 5;
-  }
-
-  // verify server socket is non-blocking
-  if (accept(server_fd, 0, 0) != -1)
-    return 20;
-  if (errno != EAGAIN && errno != EWOULDBLOCK)
-    return 21;
-
-  // create client socket
-  int client_fd;
-  if ((client_fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
-    return 6;
-  if (connect(client_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 7;
-
-  // prevent race condition
-  // impacts platforms like openbsd
-  fcntl(server_fd, F_SETFL, fcntl(server_fd, F_GETFL) & ~O_NONBLOCK);
-
-  // accept client
-  int server_client_fd;
-  if ((server_client_fd = accept(server_fd, 0, 0)) == -1)
-    return 8;
-
-  // check that non-blocking wasn't inherited from listener
-  char buf[1];
-  sigaction(SIGALRM, &(struct sigaction){.sa_handler = on_signal}, 0);
-  ualarm(100000, 0);
-  if (read(server_client_fd, buf, 1) != -1)
-    return 9;
-  if (errno != EINTR)
-    return 10;
-
-  // Clean up
-  if (close(server_client_fd))
-    return 12;
-  if (close(client_fd))
-    return 13;
-  if (close(server_fd))
-    return 14;
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/posix/accept_poll_test.c b/test/posix/accept_poll_test.c
deleted file mode 100644
index da6dfbcba..000000000
--- a/test/posix/accept_poll_test.c
+++ /dev/null
@@ -1,94 +0,0 @@
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <poll.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-int main() {
-
-  // Create server socket
-  int server_fd;
-  struct sockaddr_in address;
-  int addrlen = sizeof(address);
-  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
-    return 1;
-  address.sin_family = AF_INET;
-  address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  address.sin_port = 0;  // let os assign random port
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 2;
-  if (getsockname(server_fd, (struct sockaddr *)&address,
-                  (socklen_t *)&addrlen))
-    return 3;
-  if (listen(server_fd, SOMAXCONN))
-    return 4;
-
-  {
-    // poll server
-    struct pollfd fds[2] = {
-        {server_fd, POLLIN | POLLOUT},
-    };
-    int ret = poll(fds, 1, 0);
-    if (ret != 0)
-      return 5;
-  }
-
-  // create client socket
-  int client_fd;
-  if ((client_fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
-    return 6;
-  if (connect(client_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 7;
-
-  {
-    // poll server
-    struct pollfd fds[] = {{server_fd, POLLIN | POLLOUT}};
-    int ret = poll(fds, 1, -1u);
-    if (ret != 1)
-      return 8;
-    if (!(fds[0].revents & POLLIN))
-      return 9;
-    if (fds[0].revents & POLLOUT)
-      return 10;
-    if (fds[0].revents & POLLHUP)
-      return 11;
-    if (fds[0].revents & POLLERR)
-      return 12;
-  }
-
-  {
-    // poll server with invalid thing
-    struct pollfd fds[] = {
-        {server_fd, POLLIN | POLLOUT},
-        {666, POLLIN | POLLOUT},
-    };
-    int ret = poll(fds, 2, -1u);
-    if (ret != 2)
-      return 18;
-    if (!(fds[0].revents & POLLIN))
-      return 19;
-    if (fds[0].revents & POLLOUT)
-      return 20;
-    if (fds[1].revents & POLLIN)
-      return 21;
-    if (fds[1].revents & POLLOUT)
-      return 22;
-    if (!(fds[1].revents & POLLNVAL))
-      return 23;
-  }
-
-  // Clean up
-  if (close(client_fd))
-    return 13;
-  if (close(server_fd))
-    return 14;
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/posix/connect_nonblock_test.c b/test/posix/connect_nonblock_test.c
deleted file mode 100644
index 7655ef1a0..000000000
--- a/test/posix/connect_nonblock_test.c
+++ /dev/null
@@ -1,260 +0,0 @@
-#include <arpa/inet.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/select.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-int main() {
-  char buffer[1024];
-  fd_set wset;
-  int listenfd, connfd, sockfd;
-  int s, error;
-  pid_t pid;
-  socklen_t len;
-  struct sockaddr_in serv_addr, cli_addr;
-  uint16_t port;
-
-  printf("\n");
-
-  /* Create listening socket */
-  listenfd = socket(AF_INET, SOCK_STREAM, 0);
-  if (listenfd < 0) {
-    perror("socket() failed");
-    exit(1);
-  }
-
-  /* Initialize server address */
-  memset(&serv_addr, 0, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  serv_addr.sin_port = htons(0);
-
-  /* Bind socket */
-  if (bind(listenfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
-    perror("bind");
-    exit(2);
-  }
-
-  /* Get the assigned port number */
-  len = sizeof(serv_addr);
-  if (getsockname(listenfd, (struct sockaddr *)&serv_addr, &len) < 0) {
-    perror("getsockname");
-    exit(3);
-  }
-  port = ntohs(serv_addr.sin_port);
-
-  /* Listen on the socket */
-  if (listen(listenfd, 1) < 0) {
-    perror("listen");
-    exit(4);
-  }
-
-  /* Fork a child process */
-  pid = fork();
-  if (pid < 0) {
-    perror("fork");
-    exit(5);
-  } else if (pid == 0) {
-    /* Child process: acts as the client */
-    close(listenfd); /* Close the listening socket in the child */
-
-    /* Create socket */
-    sockfd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
-    if (sockfd < 0) {
-      perror("socket");
-      exit(6);
-    }
-
-    /* Initialize server address */
-    memset(&serv_addr, 0, sizeof(serv_addr));
-    serv_addr.sin_family = AF_INET;
-    serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); /* 127.0.0.1 */
-    serv_addr.sin_port = htons(port);                   /* Assigned port */
-
-    /* Try calling read() before connection is established */
-    s = read(sockfd, buffer, sizeof(buffer));
-    if (s < 0) {
-      if (errno == ENOTCONN) {
-        printf("read #1 enotconn\n");
-        /* good */
-      } else {
-        perror("read #1");
-        exit(6);
-      }
-    } else {
-      printf("read #1 succeeded\n");
-      exit(6);
-    }
-
-#if 0
-    /* Try calling read() before connection is established */
-    s = write(sockfd, buffer, sizeof(buffer));
-    if (s < 0) {
-      if (errno == ENOTCONN) {
-        /* good */
-      } else {
-        perror("write");
-      }
-    } else {
-      printf("Wrote %d bytes: %.*s\n", s, s, buffer);
-    }
-#endif
-
-    /* Attempt to connect */
-    s = connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr));
-    if (s == 0) {
-      printf("connect #1 success\n");
-    } else if (s < 0 && errno == EINPROGRESS) {
-      printf("connect #1 einprogress\n");
-    } else {
-      perror("connect #1");
-      exit(10);
-    }
-
-    /* Try calling read() before connection is established */
-    s = read(sockfd, buffer, sizeof(buffer));
-    if (s < 0) {
-      if (errno == EAGAIN) {
-        printf("read #2 eagain\n");
-      } else {
-        perror("read #2");
-        exit(10);
-      }
-    } else {
-      printf("read #2 succeeded\n");
-      exit(10);
-    }
-
-    /* Try calling connect() again to trigger EALREADY */
-    s = connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr));
-    if (!s) {
-      printf("connect #2 succeeded\n");
-    } else if (s < 0 && errno == EALREADY) {
-      printf("connect #2 ealready\n");
-    } else if (s < 0 && errno == EISCONN) {
-      printf("connect #2 eisconn\n");
-    } else if (s < 0) {
-      perror("connect #2");
-      exit(11);
-    }
-
-    /* Try calling connect() again to trigger EALREADY */
-    s = connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr));
-    if (!s) {
-      printf("connect #3 succeeded\n");
-    } else if (errno == EALREADY) {
-      printf("connect #3 ealready\n");
-    } else if (errno == EISCONN) {
-      printf("connect #3 eisconn\n");
-    } else {
-      perror("connect");
-      exit(11);
-    }
-
-    /* Try calling read() before connection is established */
-    s = read(sockfd, buffer, sizeof(buffer));
-    if (s < 0) {
-      if (errno == EAGAIN) {
-        /* good */
-      } else {
-        perror("read");
-      }
-    } else {
-      printf("Read %d bytes: %.*s\n", s, s, buffer);
-    }
-
-    /* Use select() to wait for the socket to be writable */
-    FD_ZERO(&wset);
-    FD_SET(sockfd, &wset);
-
-    s = select(sockfd + 1, NULL, &wset, NULL, 0);
-    if (s == 0) {
-      printf("not possible\n");
-      exit(11);
-    } else if (s < 0) {
-      perror("select");
-      exit(12);
-    }
-
-    /* Check if socket is writable */
-    if (FD_ISSET(sockfd, &wset)) {
-      /* Check for error */
-      len = sizeof(error);
-      if (getsockopt(sockfd, SOL_SOCKET, SO_ERROR, &error, &len) < 0)
-        exit(13);
-      if (error) {
-        printf("connection failed after select(): %s\n", strerror(error));
-        exit(14);
-      }
-    } else {
-      exit(16);
-    }
-
-    /* Try calling connect() again to trigger EISCONN */
-    s = connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr));
-    if (!s) {
-      printf("connect #4 succeeded\n");
-    } else if (s < 0 && errno == EISCONN) {
-      printf("connect #4 eisconn\n");
-    } else if (s < 0) {
-      exit(17);
-    }
-
-    if (close(sockfd))
-      exit(15);
-    exit(0);
-  } else {
-    /* Accept connection */
-    len = sizeof(cli_addr);
-    connfd = accept(listenfd, (struct sockaddr *)&cli_addr, &len);
-    if (connfd < 0) {
-      close(listenfd);
-      wait(NULL);
-      exit(18);
-    }
-
-    /* Read data from client */
-    s = read(connfd, buffer, sizeof(buffer));
-    if (s < 0) {
-      exit(51);
-    } else if (!s) {
-      /* got close */
-    } else {
-      exit(50);
-    }
-
-    /* Close connected socket */
-    if (close(connfd)) {
-      close(listenfd);
-      wait(NULL);
-      exit(19);
-    }
-
-    /* Close listening socket */
-    if (close(listenfd)) {
-      wait(NULL);
-      exit(20);
-    }
-
-    /* Wait for child process to finish */
-    int status;
-    if (waitpid(pid, &status, 0) < 0)
-      exit(21);
-
-    printf("\n");
-    if (WIFEXITED(status)) {
-      exit(WEXITSTATUS(status)); /* Return child's exit status */
-    } else {
-      exit(22);
-    }
-  }
-
-  exit(23);
-}
diff --git a/test/posix/cyclic_mutex_test.c b/test/posix/cyclic_mutex_test.c
deleted file mode 100644
index 28c733751..000000000
--- a/test/posix/cyclic_mutex_test.c
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <cosmo.h>
-#include <errno.h>
-#include <pthread.h>
-#include <signal.h>
-
-pthread_mutex_t x;
-pthread_mutex_t y;
-
-void ignore_signal(int sig) {
-}
-
-int main(int argc, char *argv[]) {
-
-#ifdef MODE_DBG
-  GetSymbolTable();
-  signal(SIGTRAP, ignore_signal);
-  kprintf("running %s\n", argv[0]);
-#endif
-
-  pthread_mutexattr_t attr;
-  if (pthread_mutexattr_init(&attr))
-    return 1;
-  if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK))
-    return 2;
-  if (pthread_mutex_init(&x, &attr))
-    return 3;
-  if (pthread_mutex_init(&y, &attr))
-    return 4;
-  if (pthread_mutexattr_destroy(&attr))
-    return 5;
-
-  if (pthread_mutex_lock(&x))
-    return 6;
-  if (pthread_mutex_lock(&y))
-    return 7;
-  if (pthread_mutex_unlock(&y))
-    return 8;
-  if (pthread_mutex_unlock(&x))
-    return 9;
-
-  if (pthread_mutex_lock(&y))
-    return 10;
-  if (pthread_mutex_lock(&y) != EDEADLK)
-    return 11;
-  if (pthread_mutex_lock(&x) != EDEADLK)
-    return 12;
-  if (pthread_mutex_unlock(&x) != EPERM)
-    return 13;
-  if (pthread_mutex_unlock(&y))
-    return 14;
-
-  if (pthread_mutex_destroy(&y))
-    return 15;
-  if (pthread_mutex_destroy(&x))
-    return 16;
-}
diff --git a/test/posix/file_offset_exec_prog.c b/test/posix/file_offset_exec_prog.c
deleted file mode 100644
index 31f19560b..000000000
--- a/test/posix/file_offset_exec_prog.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <stdatomic.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-// subprogram for testing that lseek() is shared across execve()
-
-atomic_int *phase;
-
-int main(int argc, char *argv[]) {
-
-  if (argc != 3)
-    return 101;
-
-  int fd = atoi(argv[1]);
-  int mapfd = atoi(argv[2]);
-
-  if ((phase = mmap(0, sizeof(atomic_int), PROT_READ | PROT_WRITE, MAP_SHARED,
-                    mapfd, 0)) == MAP_FAILED)
-    return 102;
-
-  if (write(fd, "1", 1) != 1)
-    return 103;
-  if (lseek(fd, 0, SEEK_CUR) != 2)
-    return 104;
-
-  *phase = 1;
-  for (;;)
-    if (*phase == 2)
-      break;
-
-  if (write(fd, "3", 1) != 1)
-    return 105;
-  if (lseek(fd, 0, SEEK_CUR) != 4)
-    return 106;
-
-  *phase = 3;
-  for (;;)
-    if (*phase == 4)
-      break;
-
-  if (munmap(phase, sizeof(atomic_int)))
-    return 107;
-  if (close(mapfd))
-    return 108;
-  if (close(fd))
-    return 109;
-}
diff --git a/test/posix/file_offset_exec_test.c b/test/posix/file_offset_exec_test.c
deleted file mode 100644
index e9b9e94ba..000000000
--- a/test/posix/file_offset_exec_test.c
+++ /dev/null
@@ -1,164 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <cosmo.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-// test that lseek() is shared across execve()
-
-__static_yoink("zipos");
-
-void on_unexpected_death(int sig) {
-  int ws;
-  if (wait(&ws) == -1)
-    _Exit(33);
-  if (!WIFEXITED(ws))
-    _Exit(34);
-  if (!(WEXITSTATUS(ws) & 255))
-    _Exit(35);
-  _Exit(WEXITSTATUS(ws));
-}
-
-int main() {
-
-  signal(SIGCHLD, on_unexpected_death);
-
-  // extract test program
-  int exefd;
-  int zipfd;
-  ssize_t got;
-  char exepath[] = "/tmp/file_offset_exec_prog.XXXXXX";
-  if ((exefd = mkstemp(exepath)) == -1)
-    return 2;
-  if (fchmod(exefd, 0755))
-    return 3;
-  if ((zipfd = open("/zip/file_offset_exec_prog", O_RDONLY)) == -1)
-    return 4;
-  for (;;) {
-    char chunk[512];
-    if ((got = read(zipfd, chunk, sizeof(chunk))) == -1)
-      return 5;
-    if (!got)
-      break;
-    if (write(exefd, chunk, got) != got)
-      return 6;
-  }
-  if (close(zipfd))
-    return 7;
-  if (close(exefd))
-    return 8;
-
-  // create file shared memory mapping for synchronization
-  int mapfd;
-  atomic_int *phase;
-  char mappath[] = "/tmp/file_offset_exec_phase.XXXXXX";
-  if ((mapfd = mkstemp(mappath)) == -1)
-    return 9;
-  if (ftruncate(mapfd, sizeof(atomic_int)))
-    return 10;
-  if ((phase = mmap(0, sizeof(atomic_int), PROT_READ | PROT_WRITE, MAP_SHARED,
-                    mapfd, 0)) == MAP_FAILED)
-    return 11;
-
-  // create test file to which both processes shall be writing
-  int fd;
-  char path[] = "/tmp/file_offset_exec_file.XXXXXX";
-  if ((fd = mkstemp(path)) == -1)
-    return 12;
-  if (lseek(fd, 0, SEEK_CUR) != 0)
-    return 13;
-
-  // start writing to file
-  if (write(fd, "0", 1) != 1)
-    return 14;
-  if (lseek(fd, 0, SEEK_CUR) != 1)
-    return 15;
-
-  // spawn program
-  int pid;
-  if ((pid = fork()) == -1)
-    return 16;
-  if (!pid) {
-    char str[2][12];
-    char *envs[] = {0};
-    char *args[] = {exepath, str[0], str[1], 0};
-    sprintf(str[0], "%d", fd);
-    sprintf(str[1], "%d", mapfd);
-    execve(exepath, args, envs);
-    _Exit(17);
-  }
-
-  for (;;)
-    if (*phase == 1)
-      break;
-
-  if (write(fd, "2", 1) != 1)
-    return 18;
-  if (lseek(fd, 0, SEEK_CUR) != 3)
-    return 19;
-
-  *phase = 2;
-  for (;;)
-    if (*phase == 3)
-      break;
-
-  if (write(fd, "4", 1) != 1)
-    return 20;
-  if (lseek(fd, 0, SEEK_CUR) != 5)
-    return 21;
-
-  signal(SIGCHLD, SIG_DFL);
-  *phase = 4;
-
-  int ws;
-  if (wait(&ws) == -1)
-    return 22;
-  if (!WIFEXITED(ws))
-    return 23;
-  if (WEXITSTATUS(ws))
-    return WEXITSTATUS(ws);
-
-  char buf[16] = {0};
-  if (pread(fd, buf, 15, 0) != 5)
-    return 24;
-  if (lseek(fd, 0, SEEK_CUR) != 5)
-    return 25;
-
-  if (close(fd))
-    return 26;
-
-  if (unlink(path))
-    return 27;
-
-  if (unlink(exepath))
-    return 28;
-
-  if (munmap(phase, sizeof(atomic_int)))
-    return 29;
-
-  if (close(mapfd))
-    return 30;
-
-  if (unlink(mappath))
-    return 31;
-
-  if (strcmp(buf, "01234"))
-    return 32;
-}
diff --git a/test/posix/file_offset_fork_test.c b/test/posix/file_offset_fork_test.c
deleted file mode 100644
index 72a02014b..000000000
--- a/test/posix/file_offset_fork_test.c
+++ /dev/null
@@ -1,130 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-// test that lseek() is shared across fork()
-
-void on_unexpected_death(int sig) {
-  int ws;
-  if (wait(&ws) == -1)
-    _Exit(33);
-  if (!WIFEXITED(ws))
-    _Exit(34);
-  if (!(WEXITSTATUS(ws) & 255))
-    _Exit(35);
-  _Exit(WEXITSTATUS(ws));
-}
-
-int main() {
-  signal(SIGCHLD, on_unexpected_death);
-
-  atomic_int *phase;
-  if ((phase = mmap(0, sizeof(atomic_int), PROT_READ | PROT_WRITE,
-                    MAP_SHARED | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED)
-    return 2;
-
-  int fd;
-  char path[] = "/tmp/file_offset_fork_test.XXXXXX";
-  if ((fd = mkstemp(path)) == -1)
-    return 3;
-  if (lseek(fd, 0, SEEK_CUR) != 0)
-    return 4;
-
-  if (write(fd, "0", 1) != 1)
-    return 5;
-  if (lseek(fd, 0, SEEK_CUR) != 1)
-    return 6;
-
-  int pid;
-  if ((pid = fork()) == -1)
-    return 7;
-
-  if (!pid) {
-    if (write(fd, "1", 1) != 1)
-      _Exit(8);
-    if (lseek(fd, 0, SEEK_CUR) != 2)
-      _Exit(9);
-
-    *phase = 1;
-    for (;;)
-      if (*phase == 2)
-        break;
-
-    if (write(fd, "3", 1) != 1)
-      _Exit(10);
-    if (lseek(fd, 0, SEEK_CUR) != 4)
-      _Exit(11);
-
-    *phase = 3;
-    for (;;)
-      if (*phase == 4)
-        break;
-
-    _Exit(0);
-  }
-
-  for (;;)
-    if (*phase == 1)
-      break;
-
-  if (write(fd, "2", 1) != 1)
-    return 12;
-  if (lseek(fd, 0, SEEK_CUR) != 3)
-    return 13;
-
-  *phase = 2;
-  for (;;)
-    if (*phase == 3)
-      break;
-
-  if (write(fd, "4", 1) != 1)
-    return 14;
-  if (lseek(fd, 0, SEEK_CUR) != 5)
-    return 15;
-
-  signal(SIGCHLD, SIG_DFL);
-  *phase = 4;
-
-  int ws;
-  if (wait(&ws) == -1)
-    return 16;
-  if (!WIFEXITED(ws))
-    return 17;
-  if (WEXITSTATUS(ws))
-    return WEXITSTATUS(ws);
-
-  char buf[16] = {0};
-  if (pread(fd, buf, 15, 0) != 5)
-    return 18;
-  if (lseek(fd, 0, SEEK_CUR) != 5)
-    return 19;
-
-  if (close(fd))
-    return 20;
-
-  if (munmap(phase, sizeof(atomic_int)))
-    return 21;
-
-  if (unlink(path))
-    return 22;
-
-  if (strcmp(buf, "01234"))
-    return 23;
-}
diff --git a/test/posix/forjustine_test.c b/test/posix/forjustine_test.c
deleted file mode 100644
index 58663a2a3..000000000
--- a/test/posix/forjustine_test.c
+++ /dev/null
@@ -1,49 +0,0 @@
-// for justine with love 2025-01-02
-#include <poll.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-static bool altstack_installed;
-
-static void* chump(void* v) {
-  stack_t* s = v;
-  if (sigaltstack(s, NULL)) {
-    pthread_mutex_lock(&lock);
-    altstack_installed = true;
-    pthread_mutex_unlock(&lock);
-    pthread_cond_signal(&cond);
-    return NULL;
-  }
-  pthread_mutex_lock(&lock);
-  altstack_installed = true;
-  pthread_cond_signal(&cond);
-  pthread_mutex_unlock(&lock);
-  while (1)
-    poll(NULL, 0, -1);
-  return NULL;
-}
-
-int main(void) {
-  void* v;
-  stack_t s = {.ss_size = sysconf(_SC_SIGSTKSZ)};
-  s.ss_sp = malloc(s.ss_size);
-  if (s.ss_sp == NULL)
-    return EXIT_FAILURE;
-  pthread_t tid;
-  if (pthread_create(&tid, NULL, chump, &s))
-    return EXIT_FAILURE;
-  pthread_mutex_lock(&lock);
-  while (!altstack_installed)
-    pthread_cond_wait(&cond, &lock);
-  pthread_mutex_unlock(&lock);
-  free(s.ss_sp);
-  if (pthread_cancel(tid) || pthread_join(tid, &v))
-    return EXIT_FAILURE;
-  return v == PTHREAD_CANCELED ? EXIT_SUCCESS : EXIT_FAILURE;
-}
diff --git a/test/posix/fork_bench_test.c b/test/posix/fork_bench_test.c
deleted file mode 100644
index 6f962f89c..000000000
--- a/test/posix/fork_bench_test.c
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <stdlib.h>
-#include <unistd.h>
-#include "libc/testlib/benchmark.h"
-
-void TestFork(void) {
-  int pid;
-  if (!(pid = fork()))
-    _Exit(0);
-  wait(0);
-}
-
-int main(int argc, char *argv[]) {
-  BENCHMARK(100, 1, TestFork());
-}
diff --git a/test/posix/iconv_utf8_utf16_test.c b/test/posix/iconv_utf8_utf16_test.c
deleted file mode 100644
index f582749b9..000000000
--- a/test/posix/iconv_utf8_utf16_test.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <errno.h>
-#include <iconv.h>
-#include <stdlib.h>
-#include <string.h>
-#include <uchar.h>
-
-#define INBUF_SIZE  1024
-#define OUTBUF_SIZE 2048
-
-int g_count;
-
-int check_conversion(const char* input, size_t input_len,
-                     const char16_t* expected_output, size_t expected_len) {
-  iconv_t cd;
-  char inbuf[INBUF_SIZE];
-  char outbuf[OUTBUF_SIZE];
-  char* inptr = inbuf;
-  char* outptr = outbuf;
-  size_t inbytesleft = input_len;
-  size_t outbytesleft = OUTBUF_SIZE;
-  size_t result;
-
-  ++g_count;
-
-  memcpy(inbuf, input, input_len);
-
-  cd = iconv_open("UTF-16LE", "UTF-8");
-  if (cd == (iconv_t)-1) {
-    return 10 + g_count;  // iconv_open failed
-  }
-
-  result = iconv(cd, &inptr, &inbytesleft, &outptr, &outbytesleft);
-  if (result == (size_t)-1) {
-    iconv_close(cd);
-    return 20 + g_count;  // iconv failed, return 20 + specific errno
-  }
-
-  if (inbytesleft != 0) {
-    iconv_close(cd);
-    return 40 + g_count;  // Not all input was converted
-  }
-
-  size_t output_len = OUTBUF_SIZE - outbytesleft;
-  if (output_len != expected_len) {
-    iconv_close(cd);
-    return 50 + g_count;  // Output length mismatch
-  }
-
-  if (memcmp(outbuf, expected_output, output_len) != 0) {
-    iconv_close(cd);
-    return 60 + g_count;  // Output content mismatch
-  }
-
-  if (iconv_close(cd) == -1)
-    return 70 + g_count;  // iconv_close failed
-
-  // Reverse direction check: UTF-16LE back to UTF-8
-  cd = iconv_open("UTF-8", "UTF-16LE");
-  if (cd == (iconv_t)-1) {
-    return 80 + g_count;  // iconv_open failed for reverse direction
-  }
-
-  char reverse_inbuf[OUTBUF_SIZE];
-  char reverse_outbuf[INBUF_SIZE];
-  char* reverse_inptr = reverse_inbuf;
-  char* reverse_outptr = reverse_outbuf;
-  size_t reverse_inbytesleft = output_len;
-  size_t reverse_outbytesleft = INBUF_SIZE;
-
-  memcpy(reverse_inbuf, outbuf, output_len);
-
-  result = iconv(cd, &reverse_inptr, &reverse_inbytesleft, &reverse_outptr,
-                 &reverse_outbytesleft);
-  if (result == (size_t)-1) {
-    iconv_close(cd);
-    return 90 + g_count;  // iconv failed for reverse direction
-  }
-
-  if (reverse_inbytesleft != 0) {
-    iconv_close(cd);
-    return 100 + g_count;  // Not all input was converted in reverse direction
-  }
-
-  size_t reverse_output_len = INBUF_SIZE - reverse_outbytesleft;
-  if (reverse_output_len != input_len) {
-    iconv_close(cd);
-    return 110 + g_count;  // Reverse output length mismatch
-  }
-
-  if (memcmp(reverse_outbuf, input, input_len) != 0) {
-    iconv_close(cd);
-    return 120 + g_count;  // Reverse output content mismatch
-  }
-
-  if (iconv_close(cd) == -1)
-    return 130 + g_count;  // iconv_close failed for reverse direction
-
-  return 0;  // Success
-}
-
-int main() {
-  // Test case 1: Basic ASCII
-  const char input1[] = "Hello, world!";
-  const char16_t expected1[] = u"Hello, world!";
-  int result = check_conversion(input1, sizeof(input1) - 1, expected1,
-                                sizeof(expected1) - 2);
-  if (result != 0)
-    return result;
-
-  // Test case 2: Non-ASCII characters and newline
-  const char input2[] = "こんにちは\nWorld! ☺";
-  const char16_t expected2[] = u"こんにちは\nWorld! ☺";
-  result = check_conversion(input2, sizeof(input2) - 1, expected2,
-                            sizeof(expected2) - 2);
-  if (result != 0)
-    return result;
-
-  // Test case 3: Empty string
-  const char input3[] = "";
-  const char16_t expected3[] = u"";
-  result = check_conversion(input3, 0, expected3, 0);
-  if (result != 0)
-    return result;
-
-  // Test case 4: String with null characters
-  const char input4[] = "Hello\0World";
-  const char16_t expected4[] = u"Hello\0World";
-  result = check_conversion(input4, sizeof(input4) - 1, expected4,
-                            sizeof(expected4) - 2);
-  if (result != 0)
-    return result;
-
-  // Test case 5: Long string to test buffer handling
-  char input5[INBUF_SIZE];
-  char16_t expected5[INBUF_SIZE];
-  memset(input5, 'A', INBUF_SIZE - 1);
-  input5[INBUF_SIZE - 1] = '\0';
-  for (int i = 0; i < INBUF_SIZE - 1; i++) {
-    expected5[i] = u'A';
-  }
-  result =
-      check_conversion(input5, INBUF_SIZE - 1, expected5, (INBUF_SIZE - 1) * 2);
-  if (result != 0)
-    return result;
-
-  // Test case 6: Invalid UTF-8 sequence
-  const char input6[] = {0xC0, 0x80};
-  result = check_conversion(input6, sizeof(input6), NULL, 0);
-  if (result != 26) {
-    if (errno != EILSEQ)
-      return 201;
-    return 200;
-  }
-
-  // Test case 7: Mixing ASCII and non-ASCII
-  const char input7[] = "Hello, 世界!";
-  const char16_t expected7[] = u"Hello, 世界!";
-  result = check_conversion(input7, sizeof(input7) - 1, expected7,
-                            sizeof(expected7) - 2);
-  if (result != 0)
-    return result;
-
-  // Test case 8: Surrogate pairs
-  const char input8[] = "𐐷";  // U+10437
-  const char16_t expected8[] =
-      u"𐐷";  // This will be encoded as a surrogate pair
-  result = check_conversion(input8, sizeof(input8) - 1, expected8,
-                            sizeof(expected8) - 2);
-  if (result != 0)
-    return result;
-
-  return 0;  // All tests passed
-}
diff --git a/test/posix/iconv_utf8_utf32_test.c b/test/posix/iconv_utf8_utf32_test.c
deleted file mode 100644
index 2486bdb22..000000000
--- a/test/posix/iconv_utf8_utf32_test.c
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <errno.h>
-#include <iconv.h>
-#include <stdlib.h>
-#include <string.h>
-#include <uchar.h>
-
-#define INBUF_SIZE  1024
-#define OUTBUF_SIZE 4096
-
-int g_count;
-
-int check_conversion(const char* input, size_t input_len,
-                     const wchar_t* expected_output, size_t expected_len) {
-  iconv_t cd;
-  char inbuf[INBUF_SIZE];
-  char outbuf[OUTBUF_SIZE];
-  char* inptr = inbuf;
-  char* outptr = outbuf;
-  size_t inbytesleft = input_len;
-  size_t outbytesleft = OUTBUF_SIZE;
-  size_t result;
-
-  ++g_count;
-
-  memcpy(inbuf, input, input_len);
-
-  cd = iconv_open("UTF-32LE", "UTF-8");
-  if (cd == (iconv_t)-1) {
-    return 10 + g_count;  // iconv_open failed
-  }
-
-  result = iconv(cd, &inptr, &inbytesleft, &outptr, &outbytesleft);
-  if (result == (size_t)-1) {
-    iconv_close(cd);
-    return 20 + g_count;  // iconv failed, return 20 + specific errno
-  }
-
-  if (inbytesleft != 0) {
-    iconv_close(cd);
-    return 40 + g_count;  // Not all input was converted
-  }
-
-  size_t output_len = OUTBUF_SIZE - outbytesleft;
-  if (output_len != expected_len) {
-    iconv_close(cd);
-    return 50 + g_count;  // Output length mismatch
-  }
-
-  if (memcmp(outbuf, expected_output, output_len) != 0) {
-    iconv_close(cd);
-    return 60 + g_count;  // Output content mismatch
-  }
-
-  if (iconv_close(cd) == -1)
-    return 70 + g_count;  // iconv_close failed
-
-  // Reverse direction check: UTF-32LE back to UTF-8
-  cd = iconv_open("UTF-8", "UTF-32LE");
-  if (cd == (iconv_t)-1) {
-    return 80 + g_count;  // iconv_open failed for reverse direction
-  }
-
-  char reverse_inbuf[OUTBUF_SIZE];
-  char reverse_outbuf[INBUF_SIZE];
-  char* reverse_inptr = reverse_inbuf;
-  char* reverse_outptr = reverse_outbuf;
-  size_t reverse_inbytesleft = output_len;
-  size_t reverse_outbytesleft = INBUF_SIZE;
-
-  memcpy(reverse_inbuf, outbuf, output_len);
-
-  result = iconv(cd, &reverse_inptr, &reverse_inbytesleft, &reverse_outptr,
-                 &reverse_outbytesleft);
-  if (result == (size_t)-1) {
-    iconv_close(cd);
-    return 90 + g_count;  // iconv failed for reverse direction
-  }
-
-  if (reverse_inbytesleft != 0) {
-    iconv_close(cd);
-    return 100 + g_count;  // Not all input was converted in reverse direction
-  }
-
-  size_t reverse_output_len = INBUF_SIZE - reverse_outbytesleft;
-  if (reverse_output_len != input_len) {
-    iconv_close(cd);
-    return 110 + g_count;  // Reverse output length mismatch
-  }
-
-  if (memcmp(reverse_outbuf, input, input_len) != 0) {
-    iconv_close(cd);
-    return 120 + g_count;  // Reverse output content mismatch
-  }
-
-  if (iconv_close(cd) == -1)
-    return 130 + g_count;  // iconv_close failed for reverse direction
-
-  return 0;  // Success
-}
-
-int main() {
-  // Test case 1: Basic ASCII
-  const char input1[] = "Hello, world!";
-  const wchar_t expected1[] = L"Hello, world!";
-  int result = check_conversion(input1, sizeof(input1) - 1, expected1,
-                                sizeof(expected1) - 4);
-  if (result != 0)
-    return result;
-
-  // Test case 2: Non-ASCII characters and newline
-  const char input2[] = "こんにちは\nWorld! ☺";
-  const wchar_t expected2[] = L"こんにちは\nWorld! ☺";
-  result = check_conversion(input2, sizeof(input2) - 1, expected2,
-                            sizeof(expected2) - 4);
-  if (result != 0)
-    return result;
-
-  // Test case 3: Empty string
-  const char input3[] = "";
-  const wchar_t expected3[] = L"";
-  result = check_conversion(input3, 0, expected3, 0);
-  if (result != 0)
-    return result;
-
-  // Test case 4: String with null characters
-  const char input4[] = "Hello\0World";
-  const wchar_t expected4[] = L"Hello\0World";
-  result = check_conversion(input4, sizeof(input4) - 1, expected4,
-                            sizeof(expected4) - 4);
-  if (result != 0)
-    return result;
-
-  // Test case 5: Long string to test buffer handling
-  char input5[INBUF_SIZE];
-  wchar_t expected5[INBUF_SIZE];
-  memset(input5, 'A', INBUF_SIZE - 1);
-  input5[INBUF_SIZE - 1] = '\0';
-  for (int i = 0; i < INBUF_SIZE - 1; i++) {
-    expected5[i] = u'A';
-  }
-  result =
-      check_conversion(input5, INBUF_SIZE - 1, expected5, (INBUF_SIZE - 1) * 4);
-  if (result != 0)
-    return result;
-
-  // Test case 6: Invalid UTF-8 sequence
-  const char input6[] = {0xC0, 0x80};
-  result = check_conversion(input6, sizeof(input6), NULL, 0);
-  if (result != 26) {
-    if (errno != EILSEQ)
-      return 201;
-    return 200;
-  }
-
-  // Test case 7: Mixing ASCII and non-ASCII
-  const char input7[] = "Hello, 世界!";
-  const wchar_t expected7[] = L"Hello, 世界!";
-  result = check_conversion(input7, sizeof(input7) - 1, expected7,
-                            sizeof(expected7) - 4);
-  if (result != 0)
-    return result;
-
-  // Test case 8: Surrogate pairs
-  const char input8[] = "𐐷";         // U+10437
-  const wchar_t expected8[] = L"𐐷";  // This will be encoded as a surrogate pair
-  result = check_conversion(input8, sizeof(input8) - 1, expected8,
-                            sizeof(expected8) - 4);
-  if (result != 0)
-    return result;
-
-  return 0;  // All tests passed
-}
diff --git a/test/posix/interprocess_signaling_test.c b/test/posix/interprocess_signaling_test.c
deleted file mode 100644
index d6372492e..000000000
--- a/test/posix/interprocess_signaling_test.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <signal.h>
-#include <stdatomic.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-atomic_int *got;
-
-void onsig(int sig) {
-  *got = sig;
-}
-
-int main(int argc, char *argv[]) {
-
-  // create process shared memory
-  got = mmap(0, 4, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-  if (got == MAP_FAILED)
-    return 5;
-
-  // listen for signal
-  if (signal(SIGUSR1, onsig))
-    return 6;
-
-  // block signals
-  sigset_t full;
-  if (sigfillset(&full))
-    return 7;
-  if (sigprocmask(SIG_BLOCK, &full, 0))
-    return 8;
-
-  // create child process
-  int pid;
-  if (!(pid = fork())) {
-    sigset_t empty;
-    sigemptyset(&empty);
-    sigsuspend(&empty);
-    *got |= 128;
-    _exit(0);
-  }
-
-  // send signal
-  if (kill(pid, SIGUSR1))
-    return 9;
-
-  // wait for child to die
-  int ws;
-  if (wait(&ws) != pid)
-    return 10;
-  if (ws)
-    return 11;
-  if (*got != (128 | SIGUSR1))
-    return 12;
-}
diff --git a/test/posix/listen_timeout_test.c b/test/posix/listen_timeout_test.c
deleted file mode 100644
index 952c8a83a..000000000
--- a/test/posix/listen_timeout_test.c
+++ /dev/null
@@ -1,79 +0,0 @@
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-int main() {
-  int listenfd, connfd;
-  struct sockaddr_in serv_addr;
-  struct timeval timeout;
-  socklen_t len;
-  struct sockaddr_in cli_addr;
-
-  // only linux really does this
-  if (!IsLinux())
-    return 0;
-
-  // create listening socket
-  listenfd = socket(AF_INET, SOCK_STREAM, 0);
-  if (listenfd < 0) {
-    perror("socket");
-    exit(1);
-  }
-
-  // initialize server address
-  memset(&serv_addr, 0, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  serv_addr.sin_port = htons(0);
-
-  // bind socket
-  if (bind(listenfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
-    perror("bind");
-    close(listenfd);
-    exit(2);
-  }
-
-  // listen on the socket
-  if (listen(listenfd, 5) < 0) {
-    perror("listen");
-    close(listenfd);
-    exit(3);
-  }
-
-  // accept for 200ms
-  timeout.tv_sec = 0;
-  timeout.tv_usec = 200e3;
-  if (setsockopt(listenfd, SOL_SOCKET, SO_RCVTIMEO, &timeout,
-                 sizeof(timeout))) {
-    perror("setsockopt");
-    close(listenfd);
-    exit(4);
-  }
-
-  // Accept connection
-  len = sizeof(cli_addr);
-  connfd = accept(listenfd, (struct sockaddr *)&cli_addr, &len);
-  if (connfd < 0) {
-    if (errno == EAGAIN || errno == EWOULDBLOCK) {
-      /* printf("accept() timed out\n"); */
-    } else {
-      perror("accept");
-    }
-  } else {
-    printf("Connection accepted from client.\n");
-    // Close connected socket
-    close(connfd);
-  }
-
-  // Close listening socket
-  close(listenfd);
-}
diff --git a/test/posix/lowest_fd_test.c b/test/posix/lowest_fd_test.c
deleted file mode 100644
index 661212f83..000000000
--- a/test/posix/lowest_fd_test.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <fcntl.h>
-#include <unistd.h>
-
-int main(int argc, char *argv[]) {
-
-  // ensure most file descriptors are closed
-  for (int fildes = 3; fildes < 80; ++fildes)
-    close(fildes);
-
-  // create new file descriptor
-  if (open("/dev/urandom", O_RDONLY) != 3)
-    return 2;
-
-  // copy file descriptor to higher number
-  if (fcntl(3, F_DUPFD, 70) != 70)
-    return 3;
-
-  // new file descriptor should go for lowest number
-  int fd;
-  if ((fd = open("/dev/urandom", O_RDONLY)) != 4)
-    return 4;
-
-  // move file descriptor to higher number
-  if (close(3))
-    return 5;
-
-  // new file descriptor should go for lowest number
-  if (open("/dev/urandom", O_RDONLY) != 3)
-    return 6;
-}
diff --git a/test/posix/msg_nosignal_test.c b/test/posix/msg_nosignal_test.c
deleted file mode 100644
index c164a4b9b..000000000
--- a/test/posix/msg_nosignal_test.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-/**
- * @fileoverview send(MSG_NOSIGNAL) test
- *
- * It's possible when writing to a socket for SIGPIPE to be raised. It
- * can happen for a variety of reasons. The one reason that has broad
- * consensus across OSes and is officially documented, is if shutdown()
- * is used on the local end.
- */
-
-struct sockaddr_in serv_addr;
-atomic_bool g_ready_for_conn;
-atomic_bool g_ready_for_data;
-atomic_bool g_ready_for_more;
-atomic_bool g_ready_for_exit;
-volatile sig_atomic_t g_got_signal;
-
-void onsig(int sig) {
-  g_got_signal = sig;
-}
-
-void *server_thread(void *arg) {
-  socklen_t len;
-  int server, client;
-  struct sockaddr_in cli_addr;
-
-  // create listening socket
-  server = socket(AF_INET, SOCK_STREAM, 0);
-  if (server == -1) {
-    perror("socket");
-    exit(10);
-  }
-
-  // initialize server address
-  memset(&serv_addr, 0, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  serv_addr.sin_port = htons(0);
-
-  // bind socket
-  if (bind(server, (struct sockaddr *)&serv_addr, sizeof(serv_addr))) {
-    perror("bind");
-    exit(11);
-  }
-
-  // get assigned port
-  len = sizeof(serv_addr);
-  if (getsockname(server, (struct sockaddr *)&serv_addr, &len)) {
-    perror("getsockname");
-    exit(12);
-  }
-
-  // listen on the socket
-  if (listen(server, SOMAXCONN)) {
-    perror("listen");
-    exit(13);
-  }
-
-  // wake main thread
-  g_ready_for_conn = true;
-
-  // accept connection
-  len = sizeof(cli_addr);
-  client = accept(server, (struct sockaddr *)&cli_addr, &len);
-  if (client == -1) {
-    perror("accept");
-    exit(14);
-  }
-
-  // wake main thread
-  g_ready_for_data = true;
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_exit)
-      break;
-
-  // close sockets
-  if (close(client))
-    exit(29);
-  if (close(server))
-    exit(28);
-
-  return 0;
-}
-
-int main() {
-
-  // create server thread
-  pthread_t th;
-  if (pthread_create(&th, 0, server_thread, 0))
-    return 1;
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_conn)
-      break;
-
-  // create socket
-  int client = socket(AF_INET, SOCK_STREAM, 0);
-  if (client == -1) {
-    perror("socket");
-    return 2;
-  }
-
-  // connect to server
-  if (connect(client, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) == -1) {
-    perror("connect");
-    return 3;
-  }
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_data)
-      break;
-
-  // handle signals
-  struct sigaction sa;
-  sa.sa_flags = 0;
-  sa.sa_handler = onsig;
-  sigemptyset(&sa.sa_mask);
-  sigaction(SIGPIPE, &sa, 0);
-
-  // half close socket
-  if (shutdown(client, SHUT_WR))
-    return 15;
-
-  // send first transmission
-  int rc;
-  for (;;) {
-    rc = write(client, "x", 1);
-    if (rc == 1)
-      continue;
-    if (rc != -1)
-      return 4;
-    if (errno != EPIPE) {
-      perror("write");
-      return 5;
-    }
-    // NetBSD is oddly lazy about sending SIGPIPE
-    if (IsNetbsd())
-      for (;;)
-        if (g_got_signal)
-          break;
-    if (g_got_signal != SIGPIPE) {
-      fprintf(stderr, "expected SIGPIPE but got %s\n", strsignal(g_got_signal));
-      return 6;
-    }
-    g_got_signal = 0;
-    break;
-  }
-
-  // send first transmission
-  rc = send(client, "x", 1, MSG_NOSIGNAL);
-  if (rc != -1)
-    return 7;
-  if (errno != EPIPE)
-    return 8;
-  if (g_got_signal)
-    return 9;
-
-  g_ready_for_exit = true;
-
-  if (pthread_join(th, 0))
-    return 6;
-}
diff --git a/test/posix/msg_waitall_test.c b/test/posix/msg_waitall_test.c
deleted file mode 100644
index 99942383d..000000000
--- a/test/posix/msg_waitall_test.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-/**
- * @fileoverview recv(MSG_WAITALL) test
- */
-
-struct sockaddr_in serv_addr;
-atomic_bool g_ready_for_conn;
-atomic_bool g_ready_for_data;
-atomic_bool g_ready_for_more;
-atomic_bool g_ready_for_exit;
-
-void *server_thread(void *arg) {
-  socklen_t len;
-  int server, client;
-  struct sockaddr_in cli_addr;
-
-  // create listening socket
-  server = socket(AF_INET, SOCK_STREAM, 0);
-  if (server == -1) {
-    perror("socket");
-    exit(10);
-  }
-
-  // initialize server address
-  memset(&serv_addr, 0, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  serv_addr.sin_port = htons(0);
-
-  // bind socket
-  if (bind(server, (struct sockaddr *)&serv_addr, sizeof(serv_addr))) {
-    perror("bind");
-    exit(11);
-  }
-
-  // get assigned port
-  len = sizeof(serv_addr);
-  if (getsockname(server, (struct sockaddr *)&serv_addr, &len)) {
-    perror("getsockname");
-    exit(12);
-  }
-
-  // listen on the socket
-  if (listen(server, SOMAXCONN)) {
-    perror("listen");
-    exit(13);
-  }
-
-  // wake main thread
-  g_ready_for_conn = true;
-
-  // accept connection
-  len = sizeof(cli_addr);
-  client = accept(server, (struct sockaddr *)&cli_addr, &len);
-  if (client == -1) {
-    perror("accept");
-    exit(14);
-  }
-
-  // check waitall + dontwait
-  char buf[2];
-  int rc = recv(client, buf, 2, MSG_WAITALL | MSG_DONTWAIT);
-  if (rc != -1)
-    exit(15);
-  if (errno != EAGAIN)
-    exit(16);
-
-  // wake main thread
-  g_ready_for_data = true;
-
-  // check peek
-  rc = recv(client, buf, 2, MSG_PEEK);
-  if (rc == -1) {
-    perror("recv1");
-    exit(17);
-  }
-  if (rc != 1)
-    exit(18);
-  if (buf[0] != 'x')
-    exit(19);
-
-  // check read() has @restartable behavior
-  rc = recv(client, buf, 2, MSG_WAITALL);
-  if (rc == -1) {
-    perror("recv2");
-    exit(21);
-  }
-  if (rc != 2)
-    exit(22);
-  if (buf[0] != 'x')
-    exit(23);
-  if (buf[1] != 'y')
-    exit(24);
-
-  // wake main thread
-  g_ready_for_more = true;
-
-  // check normal recv won't wait
-  rc = read(client, buf, 2);
-  if (rc == -1) {
-    perror("recv3");
-    exit(25);
-  }
-  if (rc != 1)
-    exit(26);
-  if (buf[0] != 'x')
-    exit(27);
-
-  // wait for main thread
-  for (;;)
-    if (g_ready_for_exit)
-      break;
-
-  // close listening socket
-  if (close(server))
-    exit(28);
-  if (close(client))
-    exit(29);
-  return 0;
-}
-
-int main() {
-
-  // create server thread
-  pthread_t th;
-  if (pthread_create(&th, 0, server_thread, 0))
-    return 1;
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_conn)
-      break;
-
-  // create socket
-  int client = socket(AF_INET, SOCK_STREAM, 0);
-  if (client == -1) {
-    perror("socket");
-    return 2;
-  }
-
-  // connect to server
-  if (connect(client, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) == -1) {
-    perror("connect");
-    return 3;
-  }
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_data)
-      break;
-
-  // send first transmission
-  usleep(100e3);
-  if (write(client, "x", 1) != 1)
-    return 4;
-  usleep(100e3);
-  if (write(client, "y", 1) != 1)
-    return 5;
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_more)
-      break;
-
-  // send second transmission
-  usleep(100e3);
-  if (write(client, "x", 1) != 1)
-    return 4;
-  usleep(100e3);
-  if (write(client, "y", 1) != 1)
-    return 5;
-
-  g_ready_for_exit = true;
-
-  if (pthread_join(th, 0))
-    return 6;
-}
diff --git a/test/posix/mutex_async_signal_safety_test.c b/test/posix/mutex_async_signal_safety_test.c
new file mode 100644
index 000000000..5102ab2fb
--- /dev/null
+++ b/test/posix/mutex_async_signal_safety_test.c
@@ -0,0 +1,82 @@
+#include <pthread.h>
+#include <signal.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+// tests that recursive mutexes are implemented atomically
+//
+// glibc fails this test
+
+atomic_bool done;
+atomic_bool ready;
+pthread_mutex_t lock;
+
+void hand(int sig) {
+  if (pthread_mutex_lock(&lock))
+    _Exit(50);
+  if (pthread_mutex_unlock(&lock))
+    _Exit(51);
+}
+
+void* work(void* arg) {
+  ready = true;
+  while (!done) {
+    if (pthread_mutex_lock(&lock))
+      _Exit(60);
+    if (pthread_mutex_unlock(&lock))
+      _Exit(61);
+  }
+  return 0;
+}
+
+int main() {
+
+  struct sigaction sa;
+  sa.sa_handler = hand;
+  sa.sa_flags = SA_NODEFER;
+  sigemptyset(&sa.sa_mask);
+  if (sigaction(SIGUSR1, &sa, 0))
+    _Exit(1);
+
+  pthread_mutexattr_t attr;
+  if (pthread_mutexattr_init(&attr))
+    _Exit(2);
+  if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE))
+    _Exit(3);
+  if (pthread_mutex_init(&lock, &attr))
+    _Exit(4);
+  if (pthread_mutexattr_destroy(&attr))
+    _Exit(5);
+
+  pthread_t th;
+  pthread_attr_t tattr;
+  if (pthread_attr_init(&tattr))
+    _Exit(6);
+  if (pthread_attr_setstacksize(&tattr, 8 * 1024 * 1024))
+    _Exit(7);
+  if (pthread_attr_setguardsize(&tattr, 64 * 1024))
+    _Exit(8);
+  if (pthread_create(&th, &tattr, work, 0))
+    _Exit(9);
+  if (pthread_attr_destroy(&tattr))
+    _Exit(10);
+  for (;;)
+    if (ready)
+      break;
+
+  for (int i = 0; i < 100; ++i) {
+    if (pthread_kill(th, SIGUSR1))
+      _Exit(11);
+    if (pthread_kill(th, SIGUSR1))
+      _Exit(12);
+    usleep(1);
+  }
+
+  done = true;
+  if (pthread_join(th, 0))
+    _Exit(13);
+  if (pthread_mutex_destroy(&lock))
+    _Exit(14);
+}
diff --git a/test/posix/nonblock_pipe2_test.c b/test/posix/nonblock_pipe2_test.c
deleted file mode 100644
index 3049cb804..000000000
--- a/test/posix/nonblock_pipe2_test.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-int main() {
-  int pipefd[2];
-  char buf[PIPE_BUF];
-  char buf2[PIPE_BUF];
-  ssize_t bytes_read;
-  ssize_t bytes_written;
-
-  // Create a pipe
-  if (pipe2(pipefd, O_NONBLOCK) == -1)
-    exit(1);
-
-  // Test 1: Reading from an empty pipe should fail with EAGAIN
-  bytes_read = read(pipefd[0], buf, PIPE_BUF);
-  if (bytes_read != -1 || errno != EAGAIN)
-    exit(4);
-
-  // Test 2: Writing to the pipe
-  bytes_written = write(pipefd[1], buf, PIPE_BUF);
-  if (bytes_written != PIPE_BUF)
-    exit(5);
-
-  // Test 3: Reading from the pipe after writing
-  bytes_read = read(pipefd[0], buf2, PIPE_BUF);
-  if (bytes_read != PIPE_BUF || memcmp(buf, buf2, PIPE_BUF))
-    exit(6);
-
-  // Test 4: Fill the pipe buffer
-  int ch = 10;
-  size_t total_written = 0;
-  for (;;) {
-    memset(buf, ch, PIPE_BUF);
-    bytes_written = write(pipefd[1], buf, PIPE_BUF);
-    if (bytes_written == -1) {
-      if (errno == EAGAIN || errno == EWOULDBLOCK) {
-        break;  // Pipe is full
-      } else {
-        exit(7);  // Unexpected error
-      }
-    }
-    total_written += bytes_written;
-  }
-
-  // Test 5: Verify that we can read all the data we wrote
-  ch = 10;
-  size_t total_read = 0;
-  while (total_read < total_written) {
-    bytes_read = read(pipefd[0], buf2, PIPE_BUF);
-    if (bytes_read == -1)
-      exit(8);
-    memset(buf, ch, PIPE_BUF);
-    if (memcmp(buf, buf2, PIPE_BUF))
-      exit(9);
-    total_read += bytes_read;
-  }
-  if (total_read != total_written)
-    exit(10);
-
-  // Clean up
-  if (close(pipefd[0]))
-    exit(11);
-  if (close(pipefd[1]))
-    exit(12);
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/posix/nonblock_pipe_test.c b/test/posix/nonblock_pipe_test.c
deleted file mode 100644
index c9d21e5f8..000000000
--- a/test/posix/nonblock_pipe_test.c
+++ /dev/null
@@ -1,84 +0,0 @@
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-int main() {
-  int pipefd[2];
-  char buf[PIPE_BUF];
-  char buf2[PIPE_BUF];
-  ssize_t bytes_read;
-  ssize_t bytes_written;
-
-  // Create a pipe
-  if (pipe(pipefd) == -1)
-    exit(1);
-
-  // Set O_NONBLOCK flag on the pipe
-  for (int i = 0; i < 2; ++i) {
-    int flags;
-    if ((flags = fcntl(pipefd[i], F_GETFL, 0)) == -1)
-      exit(2);
-    if (fcntl(pipefd[i], F_SETFL, flags | O_NONBLOCK) == -1)
-      exit(3);
-  }
-
-  // Test 1: Reading from an empty pipe should fail with EAGAIN
-  bytes_read = read(pipefd[0], buf, PIPE_BUF);
-  if (bytes_read != -1 || errno != EAGAIN)
-    exit(4);
-
-  // Test 2: Writing to the pipe
-  bytes_written = write(pipefd[1], buf, PIPE_BUF);
-  if (bytes_written != PIPE_BUF)
-    exit(5);
-
-  // Test 3: Reading from the pipe after writing
-  bytes_read = read(pipefd[0], buf2, PIPE_BUF);
-  if (bytes_read != PIPE_BUF || memcmp(buf, buf2, PIPE_BUF))
-    exit(6);
-
-  // Test 4: Fill the pipe buffer
-  int ch = 10;
-  size_t total_written = 0;
-  for (;;) {
-    memset(buf, ch, PIPE_BUF);
-    bytes_written = write(pipefd[1], buf, PIPE_BUF);
-    if (bytes_written == -1) {
-      if (errno == EAGAIN || errno == EWOULDBLOCK) {
-        break;  // Pipe is full
-      } else {
-        exit(7);  // Unexpected error
-      }
-    }
-    total_written += bytes_written;
-  }
-
-  // Test 5: Verify that we can read all the data we wrote
-  ch = 10;
-  size_t total_read = 0;
-  while (total_read < total_written) {
-    bytes_read = read(pipefd[0], buf2, PIPE_BUF);
-    if (bytes_read == -1)
-      exit(8);
-    memset(buf, ch, PIPE_BUF);
-    if (memcmp(buf, buf2, PIPE_BUF))
-      exit(9);
-    total_read += bytes_read;
-  }
-  if (total_read != total_written)
-    exit(10);
-
-  // Clean up
-  if (close(pipefd[0]))
-    exit(11);
-  if (close(pipefd[1]))
-    exit(12);
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/posix/pending_signal_execve_test.c b/test/posix/pending_signal_execve_test.c
deleted file mode 100644
index 0b97b794b..000000000
--- a/test/posix/pending_signal_execve_test.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <cosmo.h>
-#include <signal.h>
-#include <string.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-sig_atomic_t gotsig;
-
-void onsig(int sig) {
-  gotsig = sig;
-}
-
-int main(int argc, char* argv[]) {
-  sigset_t ss;
-  sigfillset(&ss);
-  sigprocmask(SIG_BLOCK, &ss, 0);
-  if (argc >= 2 && !strcmp(argv[1], "childe")) {
-    signal(SIGUSR1, onsig);
-    sigemptyset(&ss);
-    sigsuspend(&ss);
-    if (gotsig != SIGUSR1)
-      return 2;
-  } else {
-    int child;
-    if ((child = fork()) == -1)
-      return 2;
-    if (!child) {
-      execlp(argv[0], argv[0], "childe", NULL);
-      _Exit(127);
-    }
-    if (IsNetbsd() || IsOpenbsd()) {
-      // NetBSD has a bug where pending signals don't inherit across
-      // execve, even though POSIX.1 literally says you must do this
-      sleep(1);
-    }
-    if (kill(child, SIGUSR1))
-      return 3;
-    int ws;
-    if (wait(&ws) != child)
-      return 4;
-    if (ws)
-      return 5;
-  }
-}
diff --git a/test/posix/pipe_write_eagain_test.c b/test/posix/pipe_write_eagain_test.c
deleted file mode 100644
index 66a1c42ea..000000000
--- a/test/posix/pipe_write_eagain_test.c
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <errno.h>
-#include <poll.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-/**
- * @fileoverview Tests that EAGAIN won't corrupt pipe.
- *
- * This is a real bug when using CancelIoEx() on winsock writes, so we
- * need to make sure it doesn't happen on pipes too.
- */
-
-#define ITERATIONS 100000
-#define ASYMMETRY  3
-
-int fds[2];
-int got_read_eagains;
-int got_write_eagains;
-
-void *worker(void *arg) {
-  for (int expect = 0; expect < ITERATIONS;) {
-    int number;
-    ssize_t rc = read(fds[0], &number, sizeof(number));
-    if (rc == -1) {
-      if (errno == EAGAIN) {
-        ++got_read_eagains;
-        if (poll(&(struct pollfd){fds[0], POLLIN}, 1, -1) == -1)
-          exit(11);
-        continue;
-      }
-      perror("read");
-      exit(8);
-    }
-    size_t got = rc;
-    if (got != sizeof(int))
-      exit(9);
-    if (expect != number)
-      exit(10);
-    ++expect;
-  }
-  return 0;
-}
-
-int main(int argc, char *argv[]) {
-
-  if (pipe2(fds, O_NONBLOCK))
-    return 1;
-
-  pthread_t th;
-  if (pthread_create(&th, 0, worker, 0))
-    return 2;
-
-  int number = 0;
-  for (;;) {
-    int chunk = 0;
-    int numbers[ASYMMETRY];
-    for (;;) {
-      numbers[chunk] = number + chunk;
-      if (++chunk == ASYMMETRY)
-        break;
-      if (number + chunk == ITERATIONS)
-        break;
-    }
-    for (;;) {
-      ssize_t rc = write(fds[1], numbers, chunk * sizeof(int));
-      if (rc == -1) {
-        if (errno == EAGAIN) {
-          ++got_write_eagains;
-          if (poll(&(struct pollfd){fds[1], POLLOUT}, 1, -1) == -1)
-            return 10;
-          continue;
-        }
-        return 3;
-      }
-      if (rc % sizeof(int))
-        return 4;
-      chunk = rc / sizeof(int);
-      number += chunk;
-      break;
-    }
-    if (number == ITERATIONS)
-      break;
-  }
-
-  if (pthread_join(th, 0))
-    return 5;
-
-  if (!got_read_eagains && !got_write_eagains)
-    return 7;
-}
diff --git a/test/posix/printf_return_test.c b/test/posix/printf_return_test.c
deleted file mode 100644
index 377bb9c7f..000000000
--- a/test/posix/printf_return_test.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <fcntl.h>
-#include <stdio.h>
-#include <unistd.h>
-
-int main() {
-
-  if (close(1))
-    return 1;
-  if (open("/dev/null", O_WRONLY) != 1)
-    return 2;
-
-  if (printf("a") != 1)
-    return 4;
-  if (printf("%s", "") != 0)
-    return 5;
-  if (printf("%s", "a") != 1)
-    return 6;
-  if (printf("%10s", "a") != 10)
-    return 6;
-  if (printf("%-10s", "a") != 10)
-    return 6;
-  if (printf("%-10s%-40s %9s %8s  %8s %8s\n", "Benchmark", "prog", "ops",
-             "time", "ops/sec", "time/op") != 89)
-    return 7;
-  if (fprintf(stdout, "%-10s%-40s %9s %8s  %8s %8s\n", "Benchmark", "prog",
-              "ops", "time", "ops/sec", "time/op") != 89)
-    return 8;
-}
diff --git a/test/posix/sa_resethand2_test.c b/test/posix/sa_resethand2_test.c
index 3a6dc34da..c66f8cb8d 100644
--- a/test/posix/sa_resethand2_test.c
+++ b/test/posix/sa_resethand2_test.c
@@ -1,20 +1,27 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <signal.h>
-#include <unistd.h>
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/calls/struct/sigset.h"
+#include "libc/dce.h"
+#include "libc/sysv/consts/sa.h"
+#include "libc/sysv/consts/sig.h"
 
 volatile int handler_invoked;
 
@@ -26,17 +33,24 @@ int main() {
   sigset_t mask, oldmask;
   struct sigaction sa, current_sa;
 
+  if (IsWindows()) {
+    // TODO(jart): support non-fatal signals between processes
+    return 0;
+  }
+
   sa.sa_handler = signal_handler;
   sa.sa_flags = SA_RESETHAND;
   sigemptyset(&sa.sa_mask);
 
-  if (sigaction(SIGINT, &sa, 0) == -1)
+  if (sigaction(SIGINT, &sa, 0) == -1) {
     return 1;
+  }
 
   sigemptyset(&mask);
   sigaddset(&mask, SIGINT);
-  if (sigprocmask(SIG_BLOCK, &mask, &oldmask) == -1)
+  if (sigprocmask(SIG_BLOCK, &mask, &oldmask) == -1) {
     return 2;
+  }
 
   int pid = fork();
   if (pid == -1) {
@@ -46,12 +60,15 @@ int main() {
     return 0;
   } else {
     sigsuspend(&oldmask);
-    if (!handler_invoked)
+    if (!handler_invoked) {
       return 4;
-    if (sigaction(SIGINT, 0, &current_sa) == -1)
+    }
+    if (sigaction(SIGINT, 0, &current_sa) == -1) {
       return 5;
-    if (current_sa.sa_handler != SIG_DFL)
+    }
+    if (current_sa.sa_handler != SIG_DFL) {
       return 6;
+    }
     return 0;
   }
 }
diff --git a/test/posix/sa_resethand_test.c b/test/posix/sa_resethand_test.c
index eed501bf6..c582d90fe 100644
--- a/test/posix/sa_resethand_test.c
+++ b/test/posix/sa_resethand_test.c
@@ -1,21 +1,22 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <errno.h>
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
 #include <signal.h>
-#include <stdlib.h>
 
 volatile int gotsig;
 
@@ -23,68 +24,23 @@ void OnSig(int sig) {
   gotsig = sig;
 }
 
-void test_sa_resethand_raise(void) {
+int main() {
   struct sigaction sa;
   sa.sa_handler = OnSig;
   sa.sa_flags = SA_RESETHAND;
   sigemptyset(&sa.sa_mask);
   if (sigaction(SIGUSR1, &sa, 0))
-    exit(1);
+    return 1;
   if (sigaction(SIGUSR1, 0, &sa))
-    exit(2);
+    return 2;
   if (sa.sa_handler != OnSig)
-    exit(3);
+    return 3;
   if (raise(SIGUSR1))
-    exit(4);
+    return 4;
   if (gotsig != SIGUSR1)
-    exit(5);
+    return 5;
   if (sigaction(SIGUSR1, 0, &sa))
-    exit(6);
+    return 6;
   if (sa.sa_handler != SIG_DFL)
-    exit(7);
-}
-
-void test_sa_resethand_pause(void) {
-  struct sigaction sa;
-  sa.sa_handler = OnSig;
-  sa.sa_flags = SA_RESETHAND;
-  sigemptyset(&sa.sa_mask);
-  if (sigaction(SIGALRM, &sa, 0))
-    exit(10);
-  ualarm(10000, 0);
-  if (pause() != -1 || errno != EINTR)
-    exit(11);
-  if (gotsig != SIGALRM)
-    exit(12);
-  if (sigaction(SIGALRM, 0, &sa))
-    exit(13);
-  if (sa.sa_handler != SIG_DFL)
-    exit(14);
-}
-
-void test_sa_resethand_read(void) {
-  struct sigaction sa;
-  sa.sa_handler = OnSig;
-  sa.sa_flags = SA_RESETHAND;
-  sigemptyset(&sa.sa_mask);
-  if (sigaction(SIGALRM, &sa, 0))
-    exit(20);
-  int fds[2];
-  if (pipe(fds))
-    exit(21);
-  ualarm(10000, 0);
-  if (read(fds[0], (char[]){0}, 1) != -1 || errno != EINTR)
-    exit(22);
-  if (gotsig != SIGALRM)
-    exit(23);
-  if (sigaction(SIGALRM, 0, &sa))
-    exit(24);
-  if (sa.sa_handler != SIG_DFL)
-    exit(25);
-}
-
-int main() {
-  test_sa_resethand_raise();
-  test_sa_resethand_pause();
-  test_sa_resethand_read();
+    return 7;
 }
diff --git a/test/posix/sigchld_test.c b/test/posix/sigchld_test.c
index 6915f7cf2..36cf1f032 100644
--- a/test/posix/sigchld_test.c
+++ b/test/posix/sigchld_test.c
@@ -32,7 +32,7 @@
 #include <unistd.h>
 
 // clang-format off
-// sh -c '.cosmocc/current/bin/make -j8 V=1 o//test/posix/sigchld_test.runs'
+// sh -c 'build/bootstrap/make -j8 V=1 o//test/posix/sigchld_test.runs'
 // clang-format on
 
 void Assert(const char *file, int line, bool ok) {
diff --git a/test/posix/signal_fight_test.c b/test/posix/signal_fight_test.c
deleted file mode 100644
index 910a7e04a..000000000
--- a/test/posix/signal_fight_test.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <pthread.h>
-#include <signal.h>
-#include <stdlib.h>
-
-/**
- * @fileoverview Tests two threads killing each other won't deadlock.
- *
- * Our Windows implementation of signals has surprisingly high
- * throughput on this test. About 10x more signals get delivered than
- * any other OS and in the same amount of time. The only exception was
- * OpenBSD, which delivered a similar number of signals, but it took 10x
- * longer for the process to execute.
- */
-
-#define ITERATIONS 10000
-
-int gotsigs[2];
-pthread_t threads[2];
-pthread_t thread_ids[2];
-pthread_barrier_t barrier;
-pthread_barrier_t barrier2;
-
-void sig_handler(int signo) {
-  if (pthread_equal(pthread_self(), threads[0]))
-    ++gotsigs[0];
-  if (pthread_equal(pthread_self(), threads[1]))
-    ++gotsigs[1];
-}
-
-void *thread_func(void *arg) {
-  int idx = *(int *)arg;
-  int other_idx = 1 - idx;
-
-  thread_ids[idx] = pthread_self();
-
-  int s = pthread_barrier_wait(&barrier);
-  if (s != 0 && s != PTHREAD_BARRIER_SERIAL_THREAD)
-    exit(1);
-
-  pthread_t other_thread = thread_ids[other_idx];
-
-  for (int i = 0; i < ITERATIONS; ++i)
-    if (pthread_kill(other_thread, SIGUSR1))
-      exit(2);
-
-  s = pthread_barrier_wait(&barrier2);
-  if (s != 0 && s != PTHREAD_BARRIER_SERIAL_THREAD)
-    exit(1);
-
-  return 0;
-}
-
-int main() {
-  struct sigaction sa;
-  sa.sa_handler = sig_handler;
-  sa.sa_flags = 0;
-  sigemptyset(&sa.sa_mask);
-
-  if (sigaction(SIGUSR1, &sa, 0) == -1)
-    exit(3);
-
-  if (pthread_barrier_init(&barrier, 0, 2))
-    exit(4);
-  if (pthread_barrier_init(&barrier2, 0, 2))
-    exit(4);
-
-  int idx0 = 0, idx1 = 1;
-
-  if (pthread_create(&threads[0], 0, thread_func, &idx0))
-    exit(5);
-  if (pthread_create(&threads[1], 0, thread_func, &idx1))
-    exit(6);
-
-  if (pthread_join(threads[0], 0))
-    exit(7);
-  if (pthread_join(threads[1], 0))
-    exit(8);
-
-  if (pthread_barrier_destroy(&barrier2))
-    exit(9);
-  if (pthread_barrier_destroy(&barrier))
-    exit(9);
-
-  if (!gotsigs[0])
-    exit(10);
-  if (!gotsigs[1])
-    exit(11);
-
-  return 0;
-}
diff --git a/test/posix/signal_latency_async_test.c b/test/posix/signal_latency_async_test.c
deleted file mode 100644
index d507d8b1a..000000000
--- a/test/posix/signal_latency_async_test.c
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <cosmo.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <unistd.h>
-
-#define ITERATIONS 10000
-
-pthread_t sender_thread;
-pthread_t receiver_thread;
-struct timespec send_time;
-atomic_int sender_got_signal;
-atomic_int receiver_got_signal;
-double latencies[ITERATIONS];
-
-void sender_signal_handler(int signo) {
-  sender_got_signal = 1;
-}
-
-void receiver_signal_handler(int signo) {
-  receiver_got_signal = 1;
-}
-
-void *sender_func(void *arg) {
-  for (int i = 0; i < ITERATIONS; i++) {
-
-    // Wait a bit sometimes
-    if (rand() % 2) {
-      volatile unsigned v = 0;
-      for (;;)
-        if (++v == 4000)
-          break;
-    }
-
-    // Ping receiver
-    clock_gettime(CLOCK_MONOTONIC, &send_time);
-    if (pthread_kill(receiver_thread, SIGUSR1))
-      exit(6);
-
-    // Wait for pong
-    for (;;)
-      if (atomic_load_explicit(&sender_got_signal, memory_order_relaxed))
-        break;
-    sender_got_signal = 0;
-  }
-
-  return 0;
-}
-
-void *receiver_func(void *arg) {
-  static int iteration = 0;
-  do {
-    // wait for signal handler to be called
-    if (atomic_exchange_explicit(&receiver_got_signal, 0,
-                                 memory_order_acq_rel)) {
-
-      // record received time
-      struct timespec receive_time;
-      clock_gettime(CLOCK_MONOTONIC, &receive_time);
-      long sec_diff = receive_time.tv_sec - send_time.tv_sec;
-      long nsec_diff = receive_time.tv_nsec - send_time.tv_nsec;
-      double latency_ns = sec_diff * 1e9 + nsec_diff;
-      latencies[iteration++] = latency_ns;
-
-      // pong sender
-      if (pthread_kill(sender_thread, SIGUSR2))
-        exit(2);
-    }
-  } while (iteration < ITERATIONS);
-  return 0;
-}
-
-int compare(const void *a, const void *b) {
-  const double *x = a, *y = b;
-  if (*x < *y)
-    return -1;
-  else if (*x > *y)
-    return 1;
-  else
-    return 0;
-}
-
-int main() {
-
-  // install handlers
-  struct sigaction sa;
-  sa.sa_handler = receiver_signal_handler;
-  sa.sa_flags = 0;
-  sigemptyset(&sa.sa_mask);
-  sigaction(SIGUSR1, &sa, 0);
-  sa.sa_handler = sender_signal_handler;
-  sigaction(SIGUSR2, &sa, 0);
-
-  // create receiver thread first
-  if (pthread_create(&receiver_thread, 0, receiver_func, 0))
-    exit(11);
-
-  // create sender thread
-  if (pthread_create(&sender_thread, 0, sender_func, 0))
-    exit(12);
-
-  // wait for threads to finish
-  if (pthread_join(sender_thread, 0))
-    exit(13);
-  if (pthread_join(receiver_thread, 0))
-    exit(14);
-
-  // compute mean latency
-  double total_latency = 0;
-  for (int i = 0; i < ITERATIONS; i++)
-    total_latency += latencies[i];
-  double mean_latency = total_latency / ITERATIONS;
-
-  // sort latencies to compute percentiles
-  qsort(latencies, ITERATIONS, sizeof(double), compare);
-
-  double p50 = latencies[(int)(0.50 * ITERATIONS)];
-  double p90 = latencies[(int)(0.90 * ITERATIONS)];
-  double p95 = latencies[(int)(0.95 * ITERATIONS)];
-  double p99 = latencies[(int)(0.99 * ITERATIONS)];
-
-  printf("Mean latency: %.2f ns\n", mean_latency);
-  printf("50th percentile latency: %.2f ns\n", p50);
-  printf("90th percentile latency: %.2f ns\n", p90);
-  printf("95th percentile latency: %.2f ns\n", p95);
-  printf("99th percentile latency: %.2f ns\n", p99);
-}
diff --git a/test/posix/signal_latency_test.c b/test/posix/signal_latency_test.c
deleted file mode 100644
index aa73cb771..000000000
--- a/test/posix/signal_latency_test.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <assert.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <unistd.h>
-
-#define ITERATIONS 10000
-
-atomic_bool got_sigusr2;
-pthread_t sender_thread;
-pthread_t receiver_thread;
-struct timespec send_time;
-double latencies[ITERATIONS];
-
-void sender_signal_handler(int signo) {
-  got_sigusr2 = true;
-}
-
-void receiver_signal_handler(int signo) {
-  struct timespec receive_time;
-  if (clock_gettime(CLOCK_MONOTONIC, &receive_time) == -1)
-    exit(1);
-
-  long sec_diff = receive_time.tv_sec - send_time.tv_sec;
-  long nsec_diff = receive_time.tv_nsec - send_time.tv_nsec;
-  double latency_ns = sec_diff * 1e9 + nsec_diff;
-
-  static int iteration = 0;
-  if (iteration < ITERATIONS)
-    latencies[iteration++] = latency_ns;
-
-  // Send SIGUSR2 back to sender_thread
-  if (pthread_kill(sender_thread, SIGUSR2))
-    exit(2);
-
-  // Exit if we're done.
-  if (iteration >= ITERATIONS)
-    pthread_exit(0);
-}
-
-void *sender_func(void *arg) {
-  // Block SIGUSR2
-  sigset_t block_set;
-  sigemptyset(&block_set);
-  sigaddset(&block_set, SIGUSR2);
-  if (pthread_sigmask(SIG_BLOCK, &block_set, 0))
-    exit(3);
-
-  // Install signal handler for SIGUSR2
-  struct sigaction sa;
-  sa.sa_handler = sender_signal_handler;
-  sa.sa_flags = 0;
-  sigemptyset(&sa.sa_mask);
-  if (sigaction(SIGUSR2, &sa, 0))
-    exit(4);
-
-  for (int i = 0; i < ITERATIONS; i++) {
-    if (clock_gettime(CLOCK_MONOTONIC, &send_time))
-      exit(5);
-
-    // Send SIGUSR1 to receiver_thread
-    got_sigusr2 = false;
-    if (pthread_kill(receiver_thread, SIGUSR1))
-      exit(6);
-
-    // Unblock SIGUSR2 and wait for it
-    sigset_t wait_set;
-    sigemptyset(&wait_set);
-    while (!got_sigusr2)
-      if (sigsuspend(&wait_set) && errno != EINTR)
-        exit(7);
-  }
-
-  return 0;
-}
-
-void *receiver_func(void *arg) {
-  // Install signal handler for SIGUSR1
-  struct sigaction sa;
-  sa.sa_handler = receiver_signal_handler;
-  sa.sa_flags = 0;
-  sigemptyset(&sa.sa_mask);
-  if (sigaction(SIGUSR1, &sa, 0))
-    exit(8);
-
-  // Block all signals except SIGUSR1
-  sigset_t block_set;
-  sigfillset(&block_set);
-  sigdelset(&block_set, SIGUSR1);
-  if (pthread_sigmask(SIG_SETMASK, &block_set, 0))
-    exit(9);
-
-  // Wait indefinitely for signals
-  while (1)
-    pause();
-
-  return 0;
-}
-
-int compare(const void *a, const void *b) {
-  const double *x = a, *y = b;
-  if (*x < *y)
-    return -1;
-  else if (*x > *y)
-    return 1;
-  else
-    return 0;
-}
-
-int main() {
-
-  // this test probably exposes a bug in openbsd
-  if (IsOpenbsd())
-    return 0;
-
-  // Block SIGUSR1 and SIGUSR2 in main thread
-  sigset_t block_set;
-  sigemptyset(&block_set);
-  sigaddset(&block_set, SIGUSR1);
-  sigaddset(&block_set, SIGUSR2);
-  if (pthread_sigmask(SIG_BLOCK, &block_set, 0))
-    exit(10);
-
-  // Create receiver thread first
-  if (pthread_create(&receiver_thread, 0, receiver_func, 0))
-    exit(11);
-
-  // Create sender thread
-  if (pthread_create(&sender_thread, 0, sender_func, 0))
-    exit(12);
-
-  // Wait for threads to finish
-  if (pthread_join(sender_thread, 0))
-    exit(13);
-  if (pthread_join(receiver_thread, 0))
-    exit(14);
-
-  // Compute mean latency
-  double total_latency = 0;
-  for (int i = 0; i < ITERATIONS; i++)
-    total_latency += latencies[i];
-  double mean_latency = total_latency / ITERATIONS;
-
-  // Sort latencies to compute percentiles
-  qsort(latencies, ITERATIONS, sizeof(double), compare);
-
-  double p50 = latencies[(int)(0.50 * ITERATIONS)];
-  double p90 = latencies[(int)(0.90 * ITERATIONS)];
-  double p95 = latencies[(int)(0.95 * ITERATIONS)];
-  double p99 = latencies[(int)(0.99 * ITERATIONS)];
-
-  printf("Mean latency: %.2f ns\n", mean_latency);
-  printf("50th percentile latency: %.2f ns\n", p50);
-  printf("90th percentile latency: %.2f ns\n", p90);
-  printf("95th percentile latency: %.2f ns\n", p95);
-  printf("99th percentile latency: %.2f ns\n", p99);
-}
diff --git a/test/posix/signal_torture_read_test.c b/test/posix/signal_torture_read_test.c
deleted file mode 100644
index 51af76521..000000000
--- a/test/posix/signal_torture_read_test.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <cosmo.h>
-#include <errno.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-/**
- * @fileoverview i/o signal handling torture test
- *
- * This test tries to trigger race conditions in the kernel's read()
- * implementation, by sending a massive amount of SA_RESTART signals
- * which cause churn in its internal code, and finally an interrupt.
- * This should reveal if the kernel code that checks for any pending
- * signals before blocking on i/o happens non-atomically. Expect the
- * test to hang indefinitely in such cases.
- *
- * "This flag affects the behavior of interruptible functions; that is,
- *  those specified to fail with errno set to EINTR. If set, and a
- *  function specified as interruptible is interrupted by this signal,
- *  the function shall restart and shall not fail with EINTR unless
- *  otherwise specified. If an interruptible function which uses a
- *  timeout is restarted, the duration of the timeout following the
- *  restart is set to an unspecified value that does not exceed the
- *  original timeout value. If the flag is not set, interruptible
- *  functions interrupted by this signal shall fail with errno set to
- *  EINTR." —Quoth IEEE Std 1003.1-2017 (POSIX.1) on SA_RESTART
- *
- * Every OS except Windows fails this test.
- *
- * @see sys_readwrite_nt()
- */
-
-#define COUNT 1000
-
-volatile sig_atomic_t got_sigusr1;
-volatile sig_atomic_t got_sigusr2;
-volatile sig_atomic_t thread_ready;
-volatile sig_atomic_t read_interrupted;
-
-void sigusr1_handler(int signo) {
-  ++got_sigusr1;
-  // printf("got %d sigusr1\n", got_sigusr1);
-}
-
-void sigusr2_handler(int signo) {
-  ++got_sigusr2;
-  // printf("got %d sigusr2\n", got_sigusr2);
-}
-
-void setup_signal_handlers() {
-  struct sigaction sa;
-
-  // Set up SIGUSR1 handler with SA_RESTART
-  sa.sa_handler = sigusr1_handler;
-  sa.sa_flags = SA_RESTART;  // Signal handler with SA_RESTART
-  sigemptyset(&sa.sa_mask);
-  if (sigaction(SIGUSR1, &sa, NULL) == -1)
-    exit(97);
-
-  // Set up SIGUSR2 handler without SA_RESTART
-  sa.sa_handler = sigusr2_handler;
-  sa.sa_flags = 0;  // Signal handler without SA_RESTART
-  sigemptyset(&sa.sa_mask);
-  if (sigaction(SIGUSR2, &sa, NULL) == -1)
-    exit(98);
-}
-
-void block_signals() {
-  sigset_t set;
-  sigemptyset(&set);
-  sigaddset(&set, SIGUSR1);
-  sigaddset(&set, SIGUSR2);
-  if (pthread_sigmask(SIG_BLOCK, &set, 0))
-    exit(99);
-}
-
-void *thread_func(void *arg) {
-  int *pipefd = (int *)arg;
-  char buf[1];
-  ssize_t ret;
-
-  // Unblock SIGUSR1 and SIGUSR2 in this thread
-  sigset_t set;
-  sigemptyset(&set);
-  sigaddset(&set, SIGUSR1);
-  sigaddset(&set, SIGUSR2);
-  if (pthread_sigmask(SIG_UNBLOCK, &set, 0))
-    exit(100);
-
-  // Indicate that the thread is ready
-  thread_ready = 1;
-
-  // Call read() on the pipe
-  ret = read(pipefd[0], buf, 1);
-  if (ret == -1) {
-    if (errno == EINTR) {
-      read_interrupted = 1;
-      // printf("read interrupted\n");
-    } else {
-      perror("read");
-      exit(78);
-    }
-  } else {
-    exit(77);
-  }
-
-  return NULL;
-}
-
-int main() {
-  int pipefd[2];
-  pthread_t thread;
-
-  // this test exposes bugs in macos
-  if (IsXnu())
-    return 0;
-
-  // this test exposes bugs in linux
-  if (IsLinux())
-    return 0;
-
-  // this test exposes bugs in netbsd
-  if (IsNetbsd())
-    return 0;
-
-  // this test exposes bugs in freebsd
-  if (IsFreebsd())
-    return 0;
-
-  // this test exposes bugs in openbsd
-  if (IsOpenbsd())
-    return 0;
-
-  ShowCrashReports();
-
-  // Block SIGUSR1 and SIGUSR2 in the main thread
-  block_signals();
-
-  // Set up signal handlers
-  setup_signal_handlers();
-
-  // Create a pipe
-  if (pipe(pipefd) == -1)
-    exit(95);
-
-  // Create a thread
-  if (pthread_create(&thread, NULL, thread_func, pipefd) != 0)
-    exit(90);
-
-  // Wait until the thread is ready
-  while (!thread_ready)
-    if (pthread_yield_np())
-      exit(101);
-
-  // Send SIGUSR1 signals
-  // This will cause read() to restart internally
-  for (int i = 0; i < COUNT; i++) {
-    if (pthread_kill(thread, SIGUSR1) != 0)
-      exit(91);
-    if (i % (COUNT / 10) == 0)
-      usleep(1);
-  }
-
-  // Send SIGUSR2 to the thread
-  // This will trigger an EINTR
-  fflush(stdout);
-  if (pthread_kill(thread, SIGUSR2))
-    exit(92);
-
-  // Join the thread
-  if (pthread_join(thread, NULL))
-    exit(93);
-
-  // Close the pipe
-  close(pipefd[0]);
-  close(pipefd[1]);
-
-  // Check if read() was interrupted by EINTR
-  if (!read_interrupted)
-    exit(94);
-
-  if (!got_sigusr1)
-    exit(60);
-  if (!got_sigusr2)
-    exit(61);
-
-  // printf("got %d got_sigusr1\n", got_sigusr1);
-
-  CheckForMemoryLeaks();
-  return 0;
-}
diff --git a/test/posix/socket_fionread_test.c b/test/posix/socket_fionread_test.c
deleted file mode 100644
index c6fd81413..000000000
--- a/test/posix/socket_fionread_test.c
+++ /dev/null
@@ -1,69 +0,0 @@
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <poll.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-int main() {
-
-  // create server socket
-  int server_fd;
-  struct sockaddr_in address;
-  int addrlen = sizeof(address);
-  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
-    return 1;
-  address.sin_family = AF_INET;
-  address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  address.sin_port = 0;  // let os assign random port
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 2;
-  if (getsockname(server_fd, (struct sockaddr *)&address,
-                  (socklen_t *)&addrlen))
-    return 3;
-  if (listen(server_fd, SOMAXCONN))
-    return 4;
-
-  // create client socket
-  int client_fd;
-  if ((client_fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
-    return 6;
-  if (connect(client_fd, (struct sockaddr *)&address, sizeof(address)))
-    return 7;
-
-  // accept client and send data
-  int server_client_fd;
-  if ((server_client_fd = accept(server_fd, 0, 0)) == -1)
-    return 8;
-  if (write(server_client_fd, "hi", 2) != 2)
-    return 9;
-
-  // poll to be safe (important for mac/bsd)
-  struct pollfd fds[] = {{client_fd, POLLIN}};
-  if (poll(fds, 1, -1u) != 1)
-    return 10;
-
-  // ask how many bytes we can read
-  uint32_t bytes_available;
-  if (ioctl(client_fd, FIONREAD, &bytes_available))
-    return 11;
-  if (bytes_available != 2) {
-    printf("wut %d\n", bytes_available);
-    return 12;
-  }
-
-  // clean up
-  if (close(client_fd))
-    return 13;
-  if (close(server_fd))
-    return 14;
-
-  CheckForMemoryLeaks();
-}
diff --git a/test/posix/socket_timeout_signal_test.c b/test/posix/socket_timeout_signal_test.c
deleted file mode 100644
index 376f86fa6..000000000
--- a/test/posix/socket_timeout_signal_test.c
+++ /dev/null
@@ -1,214 +0,0 @@
-#include <arpa/inet.h>
-#include <cosmo.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <netinet/in.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-/**
- * @fileoverview SO_RCVTIMEO + SA_RESTART interaction test
- *
- * This code tests that setting a read timeout on a socket will cause
- * read() to change its signal handling behavior from @restartable to
- * @norestart. This is currently the case on GNU/Systemd and Windows.
- */
-
-struct sockaddr_in serv_addr;
-atomic_bool g_ready_for_conn;
-atomic_bool g_ready_for_data;
-atomic_bool g_ready_for_more;
-atomic_bool g_ready_for_exit;
-atomic_bool got_sigusr1;
-atomic_bool got_sigusr2;
-
-void on_sigusr1(int sig) {
-  got_sigusr1 = true;
-}
-
-void on_sigusr2(int sig) {
-  got_sigusr2 = true;
-}
-
-void *server_thread(void *arg) {
-  int server, client;
-  struct timeval timeout;
-  socklen_t len;
-  struct sockaddr_in cli_addr;
-
-  // create listening socket
-  server = socket(AF_INET, SOCK_STREAM, 0);
-  if (server == -1) {
-    perror("socket");
-    exit(31);
-  }
-
-  // initialize server address
-  memset(&serv_addr, 0, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-  serv_addr.sin_port = htons(0);
-
-  // bind socket
-  if (bind(server, (struct sockaddr *)&serv_addr, sizeof(serv_addr))) {
-    perror("bind");
-    exit(32);
-  }
-
-  // get assigned port
-  len = sizeof(serv_addr);
-  if (getsockname(server, (struct sockaddr *)&serv_addr, &len)) {
-    perror("getsockname");
-    exit(30);
-  }
-
-  // listen on the socket
-  if (listen(server, SOMAXCONN)) {
-    perror("listen");
-    exit(33);
-  }
-
-  // wake main thread
-  g_ready_for_conn = true;
-
-  // accept connection
-  len = sizeof(cli_addr);
-  client = accept(server, (struct sockaddr *)&cli_addr, &len);
-  if (client == -1) {
-    perror("accept");
-    exit(35);
-  }
-
-  // wake main thread
-  g_ready_for_data = true;
-
-  // check read() has @restartable behavior
-  char buf[1];
-  int rc = read(client, buf, 1);
-  if (rc != -1)
-    exit(35);
-  if (errno != EINTR)
-    exit(36);
-  if (!got_sigusr1)
-    exit(37);
-  if (!got_sigusr2)
-    exit(38);
-  got_sigusr1 = false;
-  got_sigusr2 = false;
-
-  // install a socket receive timeout
-  timeout.tv_sec = 5000000;
-  timeout.tv_usec = 0;
-  if (setsockopt(client, SOL_SOCKET, SO_RCVTIMEO, &timeout,
-                 sizeof(timeout) + (!IsNetbsd() && !IsQemuUser()))) {
-    perror("setsockopt");
-    exit(34);
-  }
-
-  // wake main thread
-  g_ready_for_more = true;
-
-  // check read() has @norestart behavior
-  rc = read(client, buf, 1);
-  if (rc != -1)
-    exit(35);
-  if (errno != EINTR)
-    exit(36);
-  if (!got_sigusr1)
-    exit(37);
-
-  // here's the whammy
-  if (IsLinux() || IsWindows()) {
-    if (got_sigusr2)
-      exit(38);
-  } else {
-    if (!got_sigusr2)
-      exit(38);
-  }
-
-  // wait for main thread
-  for (;;)
-    if (g_ready_for_exit)
-      break;
-
-  // close listening socket
-  if (close(server))
-    exit(40);
-  if (close(client))
-    exit(39);
-  return 0;
-}
-
-int main() {
-
-  // handle signals
-  struct sigaction sa = {0};
-  sa.sa_handler = on_sigusr1;
-  sa.sa_flags = SA_RESTART;
-  sigaction(SIGUSR1, &sa, 0);
-  sa.sa_handler = on_sigusr2;
-  sa.sa_flags = 0;
-  sigaction(SIGUSR2, &sa, 0);
-
-  // create server thread
-  pthread_t th;
-  if (pthread_create(&th, 0, server_thread, 0))
-    return 1;
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_conn)
-      break;
-
-  // create socket
-  int client = socket(AF_INET, SOCK_STREAM, 0);
-  if (client == -1) {
-    perror("socket");
-    return 2;
-  }
-
-  // connect to server
-  if (connect(client, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) == -1) {
-    perror("connect");
-    return 3;
-  }
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_data)
-      break;
-
-  usleep(100e3);
-  if (pthread_kill(th, SIGUSR1))
-    return 4;
-
-  usleep(100e3);
-  if (pthread_kill(th, SIGUSR2))
-    return 5;
-
-  // wait for thread
-  for (;;)
-    if (g_ready_for_more)
-      break;
-
-  usleep(100e3);
-  if (pthread_kill(th, SIGUSR1))
-    return 4;
-
-  usleep(400e3);
-  if (pthread_kill(th, SIGUSR2))
-    return 5;
-
-  g_ready_for_exit = true;
-
-  if (pthread_join(th, 0))
-    return 20;
-}
diff --git a/test/posix/unnamed_semaphore_test.c b/test/posix/unnamed_semaphore_test.c
deleted file mode 100644
index f406f82b3..000000000
--- a/test/posix/unnamed_semaphore_test.c
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <pthread.h>
-#include <semaphore.h>
-
-#define THREADS    10
-#define ITERATIONS 100000
-
-int g_count;
-sem_t g_sem;
-
-void *worker(void *arg) {
-  for (int i = 0; i < ITERATIONS; ++i) {
-    if (sem_wait(&g_sem))
-      exit(6);
-    ++g_count;
-    if (sem_post(&g_sem))
-      exit(7);
-  }
-  return 0;
-}
-
-int main(int argc, char *argv[]) {
-  pthread_t th[THREADS];
-  if (sem_init(&g_sem, 0, 1))
-    return 1;
-  for (int i = 0; i < THREADS; ++i)
-    if (pthread_create(&th[i], 0, worker, 0))
-      return 2;
-  for (int i = 0; i < THREADS; ++i)
-    if (pthread_join(th[i], 0))
-      return 3;
-  if (g_count != THREADS * ITERATIONS)
-    return 4;
-  if (sem_destroy(&g_sem))
-    return 5;
-}
diff --git a/test/posix/wait2x_test.c b/test/posix/wait2x_test.c
deleted file mode 100644
index 06c900ec4..000000000
--- a/test/posix/wait2x_test.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2024 Justine Alexandra Roberts Tunney
-//
-// Permission to use, copy, modify, and/or distribute this software for
-// any purpose with or without fee is hereby granted, provided that the
-// above copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
-// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-// PERFORMANCE OF THIS SOFTWARE.
-
-#include <errno.h>
-#include <stdatomic.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-int main(int argc, char *argv[]) {
-  int ws, rc;
-
-  // create shared memory for synchronization
-  atomic_int *ready =
-      mmap(0, 4, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-
-  if ((rc = waitpid(-1, &ws, WNOHANG)) != -1)
-    return 1;
-  if (errno != ECHILD)
-    return 2;
-
-  // create process
-  int pid = fork();
-  if (pid == -1)
-    return 3;
-  if (!pid) {
-    for (;;)
-      if (*ready)
-        break;
-    _Exit(0);
-  }
-
-  // wait on process
-  if ((rc = waitpid(pid, &ws, WNOHANG)) == -1)
-    return 4;
-  if (rc != 0)
-    return 5;
-  if (ws)
-    return 6;
-
-  // signal subprocess
-  *ready = 1;
-
-  if ((rc = waitpid(pid, &ws, 0)) == -1)
-    return 7;
-  if (rc != pid)
-    return 8;
-  if (ws)
-    return 9;
-
-  // wait again
-  if ((rc = waitpid(pid, &ws, WNOHANG)) != -1)
-    return 10;
-  if (errno != ECHILD)
-    return 11;
-}
diff --git a/test/posix/writev_test.c b/test/posix/writev_test.c
deleted file mode 100644
index 400233397..000000000
--- a/test/posix/writev_test.c
+++ /dev/null
@@ -1,122 +0,0 @@
-#include <netinet/in.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <unistd.h>
-
-int main() {
-  int sockfd, newsockfd, clientfd;
-  struct sockaddr_in serv_addr, cli_addr;
-  socklen_t clilen;
-  struct iovec iov[2];
-  char buffer[256];
-  ssize_t n;
-  const char *str1 = "";
-  const char *str2 = "Hello from server!\n";
-  const char *str3 = "Hello from client!\n";
-
-  // Create server socket
-  sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  if (sockfd < 0) {
-    perror("socket");
-    return 18;
-  }
-
-  // Bind server socket to localhost:PORT
-  memset(&serv_addr, 0, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);  // localhost
-  serv_addr.sin_port = 0;
-  if (bind(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
-    perror("bind");
-    close(sockfd);
-    return 19;
-  }
-  uint32_t addrsize = sizeof(serv_addr);
-  if (getsockname(sockfd, (struct sockaddr *)&serv_addr, &addrsize)) {
-    perror("getsockname");
-    return 20;
-  }
-
-  // Listen for incoming connections
-  if (listen(sockfd, 1) < 0) {
-    perror("listen");
-    close(sockfd);
-    return 21;
-  }
-
-  // Create client socket
-  clientfd = socket(AF_INET, SOCK_STREAM, 0);
-  if (clientfd < 0) {
-    perror("socket");
-    close(sockfd);
-    return 22;
-  }
-
-  // Connect client socket to server
-  if (connect(clientfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
-    perror("connect");
-    close(sockfd);
-    close(clientfd);
-    return 23;
-  }
-
-  // Accept connection on server side
-  clilen = sizeof(cli_addr);
-  newsockfd = accept(sockfd, (struct sockaddr *)&cli_addr, &clilen);
-  if (newsockfd < 0) {
-    perror("accept");
-    close(sockfd);
-    close(clientfd);
-    return 34;
-  }
-
-  // Server writes to client using writev with zero-length iovec
-  iov[0].iov_base = (void *)str1;
-  iov[0].iov_len = 0;  // Zero-length iovec
-  iov[1].iov_base = (void *)str2;
-  iov[1].iov_len = strlen(str2);
-
-  n = writev(newsockfd, iov, 2);
-  if (n != 19)
-    return 13;
-
-  // Client reads data from server
-  memset(buffer, 0, sizeof(buffer));
-  n = read(clientfd, buffer, sizeof(buffer) - 1);
-  if (n < 0) {
-    perror("read");
-    return 14;
-  } else {
-    if (n != 19)
-      return 8;
-  }
-
-  // Client writes to server using writev with zero-length iovec
-  iov[0].iov_base = (void *)str1;
-  iov[0].iov_len = 0;  // Zero-length iovec
-  iov[1].iov_base = (void *)str3;
-  iov[1].iov_len = strlen(str3);
-
-  n = writev(clientfd, iov, 2);
-  if (n != 19)
-    return 9;
-
-  // Server reads data from client
-  memset(buffer, 0, sizeof(buffer));
-  n = read(newsockfd, buffer, sizeof(buffer) - 1);
-  if (n < 0) {
-    perror("ERROR reading from client");
-    return 10;
-  } else if (n != 19) {
-    return 11;
-  }
-
-  // Close all sockets
-  close(newsockfd);
-  close(clientfd);
-  close(sockfd);
-}
diff --git a/test/tool/args/args2_test.c b/test/tool/args/args2_test.c
deleted file mode 100644
index 8f208aa63..000000000
--- a/test/tool/args/args2_test.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cosmo.h"
-#include "libc/dce.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/rand.h"
-#include "libc/testlib/testlib.h"
-#include "libc/x/x.h"
-
-void SetUpOnce(void) {
-  testlib_enable_tmp_setup_teardown();
-}
-
-TEST(cosmo_args, normalize) {
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(1, cosmo_args(0, &argv));
-  ASSERT_STREQ(GetProgramExecutableName(), argv[0]);
-}
-
-TEST(cosmo_args, test) {
-  xbarf(".args", "a b c", -1);
-  char *args[] = {"prog", "arg", 0};
-  char **argv = args;
-  ASSERT_EQ(5, cosmo_args(".args", &argv));
-  ASSERT_STREQ("prog", argv[0]);
-  ASSERT_STREQ("a", argv[1]);
-  ASSERT_STREQ("b", argv[2]);
-  ASSERT_STREQ("c", argv[3]);
-  ASSERT_STREQ("arg", argv[4]);
-}
-
-TEST(cosmo_args, perline) {
-  xbarf(".args", "a\nb\nc\n", -1);
-  char *args[] = {"prog", "arg", 0};
-  char **argv = args;
-  ASSERT_EQ(5, cosmo_args(".args", &argv));
-  ASSERT_STREQ("prog", argv[0]);
-  ASSERT_STREQ("a", argv[1]);
-  ASSERT_STREQ("b", argv[2]);
-  ASSERT_STREQ("c", argv[3]);
-  ASSERT_STREQ("arg", argv[4]);
-}
-
-TEST(cosmo_args, dots_end) {
-  xbarf(".args", "a b c ...", -1);
-  char *args[] = {"prog", "arg", 0};
-  char **argv = args;
-  ASSERT_EQ(5, cosmo_args(".args", &argv));
-  ASSERT_STREQ("prog", argv[0]);
-  ASSERT_STREQ("a", argv[1]);
-  ASSERT_STREQ("b", argv[2]);
-  ASSERT_STREQ("c", argv[3]);
-  ASSERT_STREQ("arg", argv[4]);
-}
-
-TEST(cosmo_args, dots_middle) {
-  xbarf(".args", "a ... b c", -1);
-  char *args[] = {"prog", "arg", 0};
-  char **argv = args;
-  ASSERT_EQ(5, cosmo_args(".args", &argv));
-  ASSERT_STREQ("prog", argv[0]);
-  ASSERT_STREQ("a", argv[1]);
-  ASSERT_STREQ("arg", argv[2]);
-  ASSERT_STREQ("b", argv[3]);
-  ASSERT_STREQ("c", argv[4]);
-}
-
-TEST(cosmo_args, quote) {
-  xbarf(".args", " 'hi \\n there'# ", -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(2, cosmo_args(".args", &argv));
-  ASSERT_STREQ("hi \\n there#", argv[1]);
-}
-
-TEST(cosmo_args, dquote) {
-  xbarf(".args", " \"hi \\a\\b\\t\\n\\v\\f\\r\\e\\0\\11 \\111 \xab there\"# ",
-        -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(2, cosmo_args(".args", &argv));
-  ASSERT_STREQ("hi \a\b\t\n\v\f\r\e\0\11 \111 \xab there#", argv[1]);
-}
-
-TEST(cosmo_args, comment) {
-  xbarf(".args",
-        "# comment\n"
-        "a # hello there\n"
-        "b # yup\n",
-        -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(3, cosmo_args(".args", &argv));
-  ASSERT_STREQ("a", argv[1]);
-  ASSERT_STREQ("b", argv[2]);
-}
-
-TEST(cosmo_args, backslash_newline) {
-  xbarf(".args",
-        "a\\\n"
-        "b\n",
-        -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(2, cosmo_args(".args", &argv));
-  ASSERT_STREQ("ab", argv[1]);
-}
-
-TEST(cosmo_args, dotz) {
-  xbarf(".args", ". .. ...x", -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(4, cosmo_args(".args", &argv));
-  ASSERT_STREQ(".", argv[1]);
-  ASSERT_STREQ("..", argv[2]);
-  ASSERT_STREQ("...x", argv[3]);
-}
-
-TEST(cosmo_args, env) {
-  setenv("foo", "bar", true);
-  xbarf(".args", "$foo x${foo}x \"$foo\" \"${foo}\" $foo", -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(6, cosmo_args(".args", &argv));
-  ASSERT_STREQ("bar", argv[1]);
-  ASSERT_STREQ("xbarx", argv[2]);
-  ASSERT_STREQ("bar", argv[3]);
-  ASSERT_STREQ("bar", argv[4]);
-  ASSERT_STREQ("bar", argv[5]);
-}
-
-TEST(cosmo_args, dquote_backslash_newline) {
-  setenv("foo", "bar", true);
-  xbarf(".args",
-        "-p \"\\\n"
-        "hello\"\n",
-        -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(3, cosmo_args(".args", &argv));
-  ASSERT_STREQ("-p", argv[1]);
-  ASSERT_STREQ("hello", argv[2]);
-}
-
-TEST(cosmo_args, dquote_plain_old_newline) {
-  setenv("foo", "bar", true);
-  xbarf(".args",
-        "-p \"\n"
-        "hello\"\n",
-        -1);
-  char *args[] = {0};
-  char **argv = args;
-  ASSERT_EQ(3, cosmo_args(".args", &argv));
-  ASSERT_STREQ("-p", argv[1]);
-  ASSERT_STREQ("\nhello", argv[2]);
-}
-
-#define LENGTH     128
-#define ITERATIONS 5000
-#define CHARSET    "abc#'\"$.\\{} \r\n"
-
-TEST(cosmo_args, fuzz) {
-  if (IsWindows())
-    return;  // not worth it fs too slow
-  char s[LENGTH + 1] = {0};
-  for (int i = 0; i < ITERATIONS; ++i) {
-    for (int j = 0; j < LENGTH; ++j)
-      s[j] = CHARSET[rand() % (sizeof(CHARSET) - 1)];
-    xbarf(".args", s, -1);
-    char *args[] = {0};
-    char **argv = args;
-    cosmo_args(".args", &argv);
-    for (int j = 0; argv[j]; ++j)
-      free(argv[j]);
-    argv[0] = 0;
-  }
-}
diff --git a/test/tool/args/args_test.c b/test/tool/args/args_test.c
index 0f7f6c47d..ec57b1044 100644
--- a/test/tool/args/args_test.c
+++ b/test/tool/args/args_test.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/cosmo.h"
+#include "tool/args/args.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/testlib/testlib.h"
diff --git a/test/tool/build/lib/asmdown_test.c b/test/tool/build/lib/asmdown_test.c
index 3524c2599..0d3b2259f 100644
--- a/test/tool/build/lib/asmdown_test.c
+++ b/test/tool/build/lib/asmdown_test.c
@@ -23,7 +23,7 @@
 TEST(ParseAsmdown, test) {
   struct Asmdown *ad;
   const char *s = "\
-#include \"libc/macros.h\"\n\
+#include \"libc/macros.internal.h\"\n\
 .source	__FILE__\n\
 \n\
 /	Returns absolute value of double.\n\
@@ -87,7 +87,7 @@ tinymath_acos:\n\
 TEST(ParseAsmdown, testAlias) {
   struct Asmdown *ad;
   const char *s = "\
-#include \"libc/macros.h\"\n\
+#include \"libc/macros.internal.h\"\n\
 .source	__FILE__\n\
 \n\
 /	Returns arc cosine of 𝑥.\n\
@@ -137,7 +137,7 @@ tinymath_acos:\n\
 TEST(ParseAsmdown, testClangIsEvil) {
   struct Asmdown *ad;
   const char *s = "\
-#include \"libc/macros.h\"\n\
+#include \"libc/macros.internal.h\"\n\
 .source	__FILE__\n\
 \n\
 //	Returns arc cosine of 𝑥.\n\
diff --git a/test/tool/net/redbean_test.c b/test/tool/net/redbean_test.c
index e13c3cfdd..685db685f 100644
--- a/test/tool/net/redbean_test.c
+++ b/test/tool/net/redbean_test.c
@@ -39,6 +39,7 @@
 #include "libc/testlib/testlib.h"
 #include "libc/x/x.h"
 #include "third_party/regex/regex.h"
+#ifdef __x86_64__
 
 __static_yoink("zipos");
 __static_yoink("o/" MODE "/test/tool/net/redbean-tester");
@@ -262,7 +263,7 @@ Last-Modified: .*\r\n\
 Accept-Ranges: bytes\r\n\
 X-Content-Type-Options: nosniff\r\n\
 Date: .*\r\n\
-Server: redbean/.*\r\n\
+Server: redbean/2.2.0\r\n\
 Content-Length: 34\r\n\
 \r\n\
 J\n\
@@ -291,3 +292,5 @@ Z\n",
   EXPECT_NE(-1, wait(0));
   EXPECT_NE(-1, sigprocmask(SIG_SETMASK, &savemask, 0));
 }
+
+#endif /* __x86_64__ */
diff --git a/test/tool/net/sqlite_test.c b/test/tool/net/sqlite_test.c
index 5df4ff716..96c5bacc3 100644
--- a/test/tool/net/sqlite_test.c
+++ b/test/tool/net/sqlite_test.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/test/tool/plinko/plinko_test.c b/test/tool/plinko/plinko_test.c
index aa260d554..087663223 100644
--- a/test/tool/plinko/plinko_test.c
+++ b/test/tool/plinko/plinko_test.c
@@ -20,7 +20,7 @@
 #include "libc/calls/struct/sigaction.h"
 #include "libc/errno.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
diff --git a/test/tool/viz/lib/fun_test.c b/test/tool/viz/lib/fun_test.c
index a158d8e82..dc5340631 100644
--- a/test/tool/viz/lib/fun_test.c
+++ b/test/tool/viz/lib/fun_test.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
@@ -212,7 +212,7 @@ void ExpandLuminosityRange(unsigned n, unsigned char *Y) {
 }
 
 TEST(ExpandLuminosityRange, test) {
-  _Alignas(16) unsigned char Y[32];
+  unsigned char Y[32];
   Y[0] = 0;
   ExpandLuminosityRange(16, Y);
   EXPECT_EQ(0, Y[0]);
diff --git a/test/tool/viz/lib/ycbcr2rgb2_test.c b/test/tool/viz/lib/ycbcr2rgb2_test.c
index bb2d6e90b..1e0f9e0e6 100644
--- a/test/tool/viz/lib/ycbcr2rgb2_test.c
+++ b/test/tool/viz/lib/ycbcr2rgb2_test.c
@@ -16,7 +16,8 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "dsp/mpeg/mpeg.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
diff --git a/third_party/aarch64/BUILD.mk b/third_party/aarch64/BUILD.mk
index ffff966a9..7ed5ff404 100644
--- a/third_party/aarch64/BUILD.mk
+++ b/third_party/aarch64/BUILD.mk
@@ -3,4 +3,4 @@
 
 PKGS += THIRD_PARTY_AARCH64
 THIRD_PARTY_AARCH64_HDRS = $(filter %.h,$(THIRD_PARTY_AARCH64_FILES))
-THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*) $(wildcard third_party/aarch64/clang/*)
+THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*)
diff --git a/third_party/aarch64/clang/arm64intr.h b/third_party/aarch64/clang/arm64intr.h
deleted file mode 100644
index 4943b2db6..000000000
--- a/third_party/aarch64/clang/arm64intr.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-/* Only include this if we're compiling for the windows platform. */
-#ifndef _MSC_VER
-#include_next <arm64intr.h>
-#else
-
-#ifndef __ARM64INTR_H
-#define __ARM64INTR_H
-
-typedef enum
-{
-  _ARM64_BARRIER_SY    = 0xF,
-  _ARM64_BARRIER_ST    = 0xE,
-  _ARM64_BARRIER_LD    = 0xD,
-  _ARM64_BARRIER_ISH   = 0xB,
-  _ARM64_BARRIER_ISHST = 0xA,
-  _ARM64_BARRIER_ISHLD = 0x9,
-  _ARM64_BARRIER_NSH   = 0x7,
-  _ARM64_BARRIER_NSHST = 0x6,
-  _ARM64_BARRIER_NSHLD = 0x5,
-  _ARM64_BARRIER_OSH   = 0x3,
-  _ARM64_BARRIER_OSHST = 0x2,
-  _ARM64_BARRIER_OSHLD = 0x1
-} _ARM64INTR_BARRIER_TYPE;
-
-#endif /* __ARM64INTR_H */
-#endif /* _MSC_VER */
diff --git a/third_party/aarch64/clang/arm_acle.h b/third_party/aarch64/clang/arm_acle.h
deleted file mode 100644
index 1518b0c4c..000000000
--- a/third_party/aarch64/clang/arm_acle.h
+++ /dev/null
@@ -1,888 +0,0 @@
-/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- * The Arm C Language Extensions specifications can be found in the following
- * link: https://github.com/ARM-software/acle/releases
- *
- * The ACLE section numbers are subject to change. When consulting the
- * specifications, it is recommended to search using section titles if
- * the section numbers look outdated.
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_ACLE_H
-#define __ARM_ACLE_H
-
-#ifndef __ARM_ACLE
-#error "ACLE intrinsics support not enabled."
-#endif
-
-#include <stdint.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
-/* 7.3 Memory barriers */
-#if !__has_builtin(__dmb)
-#define __dmb(i) __builtin_arm_dmb(i)
-#endif
-#if !__has_builtin(__dsb)
-#define __dsb(i) __builtin_arm_dsb(i)
-#endif
-#if !__has_builtin(__isb)
-#define __isb(i) __builtin_arm_isb(i)
-#endif
-
-/* 7.4 Hints */
-
-#if !__has_builtin(__wfi)
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
-  __builtin_arm_wfi();
-}
-#endif
-
-#if !__has_builtin(__wfe)
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
-  __builtin_arm_wfe();
-}
-#endif
-
-#if !__has_builtin(__sev)
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
-  __builtin_arm_sev();
-}
-#endif
-
-#if !__has_builtin(__sevl)
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
-  __builtin_arm_sevl();
-}
-#endif
-
-#if !__has_builtin(__yield)
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
-  __builtin_arm_yield();
-}
-#endif
-
-#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
-#define __dbg(t) __builtin_arm_dbg(t)
-#endif
-
-#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-#define _CHKFEAT_GCS 1
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-__chkfeat(uint64_t __features) {
-  return __builtin_arm_chkfeat(__features) ^ __features;
-}
-#endif
-
-/* 7.5 Swap */
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-__swp(uint32_t __x, volatile uint32_t *__p) {
-  uint32_t v;
-  do
-    v = __builtin_arm_ldrex(__p);
-  while (__builtin_arm_strex(__x, __p));
-  return v;
-}
-
-/* 7.6 Memory prefetch intrinsics */
-/* 7.6.1 Data prefetch */
-#define __pld(addr) __pldx(0, 0, 0, addr)
-
-#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
-#define __pldx(access_kind, cache_level, retention_policy, addr) \
-  __builtin_arm_prefetch(addr, access_kind, 1)
-#else
-#define __pldx(access_kind, cache_level, retention_policy, addr) \
-  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
-#endif
-
-/* 7.6.2 Instruction prefetch */
-#define __pli(addr) __plix(0, 0, addr)
-
-#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
-#define __plix(cache_level, retention_policy, addr) \
-  __builtin_arm_prefetch(addr, 0, 0)
-#else
-#define __plix(cache_level, retention_policy, addr) \
-  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
-#endif
-
-/* 7.7 NOP */
-#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
-  __builtin_arm_nop();
-}
-#endif
-
-/* 8 DATA-PROCESSING INTRINSICS */
-/* 8.2 Miscellaneous data-processing intrinsics */
-/* ROR */
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-__ror(uint32_t __x, uint32_t __y) {
-  __y %= 32;
-  if (__y == 0)
-    return __x;
-  return (__x >> __y) | (__x << (32 - __y));
-}
-
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-__rorll(uint64_t __x, uint32_t __y) {
-  __y %= 64;
-  if (__y == 0)
-    return __x;
-  return (__x >> __y) | (__x << (64 - __y));
-}
-
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-__rorl(unsigned long __x, uint32_t __y) {
-#if __SIZEOF_LONG__ == 4
-  return __ror(__x, __y);
-#else
-  return __rorll(__x, __y);
-#endif
-}
-
-
-/* CLZ */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
-__clz(uint32_t __t) {
-  return __builtin_arm_clz(__t);
-}
-
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
-__clzl(unsigned long __t) {
-#if __SIZEOF_LONG__ == 4
-  return __builtin_arm_clz(__t);
-#else
-  return __builtin_arm_clz64(__t);
-#endif
-}
-
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
-__clzll(uint64_t __t) {
-  return __builtin_arm_clz64(__t);
-}
-
-/* CLS */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
-__cls(uint32_t __t) {
-  return __builtin_arm_cls(__t);
-}
-
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
-__clsl(unsigned long __t) {
-#if __SIZEOF_LONG__ == 4
-  return __builtin_arm_cls(__t);
-#else
-  return __builtin_arm_cls64(__t);
-#endif
-}
-
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
-__clsll(uint64_t __t) {
-  return __builtin_arm_cls64(__t);
-}
-
-/* REV */
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-__rev(uint32_t __t) {
-  return __builtin_bswap32(__t);
-}
-
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-__revl(unsigned long __t) {
-#if __SIZEOF_LONG__ == 4
-  return __builtin_bswap32(__t);
-#else
-  return __builtin_bswap64(__t);
-#endif
-}
-
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-__revll(uint64_t __t) {
-  return __builtin_bswap64(__t);
-}
-
-/* REV16 */
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-__rev16(uint32_t __t) {
-  return __ror(__rev(__t), 16);
-}
-
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-__rev16ll(uint64_t __t) {
-  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
-}
-
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-__rev16l(unsigned long __t) {
-#if __SIZEOF_LONG__ == 4
-    return __rev16(__t);
-#else
-    return __rev16ll(__t);
-#endif
-}
-
-/* REVSH */
-static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
-__revsh(int16_t __t) {
-  return (int16_t)__builtin_bswap16((uint16_t)__t);
-}
-
-/* RBIT */
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-__rbit(uint32_t __t) {
-  return __builtin_arm_rbit(__t);
-}
-
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
-__rbitll(uint64_t __t) {
-#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
-  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
-         __builtin_arm_rbit(__t >> 32);
-#else
-  return __builtin_arm_rbit64(__t);
-#endif
-}
-
-static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
-__rbitl(unsigned long __t) {
-#if __SIZEOF_LONG__ == 4
-  return __rbit(__t);
-#else
-  return __rbitll(__t);
-#endif
-}
-
-/* 8.3 16-bit multiplications */
-#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
-__smulbb(int32_t __a, int32_t __b) {
-  return __builtin_arm_smulbb(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
-__smulbt(int32_t __a, int32_t __b) {
-  return __builtin_arm_smulbt(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
-__smultb(int32_t __a, int32_t __b) {
-  return __builtin_arm_smultb(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
-__smultt(int32_t __a, int32_t __b) {
-  return __builtin_arm_smultt(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
-__smulwb(int32_t __a, int32_t __b) {
-  return __builtin_arm_smulwb(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
-__smulwt(int32_t __a, int32_t __b) {
-  return __builtin_arm_smulwt(__a, __b);
-}
-#endif
-
-/*
- * 8.4 Saturating intrinsics
- *
- * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
- * intrinsics are implemented and the flag is enabled.
- */
-/* 8.4.1 Width-specified saturation intrinsics */
-#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
-#define __ssat(x, y) __builtin_arm_ssat(x, y)
-#define __usat(x, y) __builtin_arm_usat(x, y)
-#endif
-
-/* 8.4.2 Saturating addition and subtraction intrinsics */
-#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__qadd(int32_t __t, int32_t __v) {
-  return __builtin_arm_qadd(__t, __v);
-}
-
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__qsub(int32_t __t, int32_t __v) {
-  return __builtin_arm_qsub(__t, __v);
-}
-
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__qdbl(int32_t __t) {
-  return __builtin_arm_qadd(__t, __t);
-}
-#endif
-
-/* 8.4.3 Accumulating multiplications */
-#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlabb(int32_t __a, int32_t __b, int32_t __c) {
-  return __builtin_arm_smlabb(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlabt(int32_t __a, int32_t __b, int32_t __c) {
-  return __builtin_arm_smlabt(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlatb(int32_t __a, int32_t __b, int32_t __c) {
-  return __builtin_arm_smlatb(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlatt(int32_t __a, int32_t __b, int32_t __c) {
-  return __builtin_arm_smlatt(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlawb(int32_t __a, int32_t __b, int32_t __c) {
-  return __builtin_arm_smlawb(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlawt(int32_t __a, int32_t __b, int32_t __c) {
-  return __builtin_arm_smlawt(__a, __b, __c);
-}
-#endif
-
-
-/* 8.5.4 Parallel 16-bit saturation */
-#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
-#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
-#define __usat16(x, y) __builtin_arm_usat16(x, y)
-#endif
-
-/* 8.5.5 Packing and unpacking */
-#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
-typedef int32_t int8x4_t;
-typedef int32_t int16x2_t;
-typedef uint32_t uint8x4_t;
-typedef uint32_t uint16x2_t;
-
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__sxtab16(int16x2_t __a, int8x4_t __b) {
-  return __builtin_arm_sxtab16(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__sxtb16(int8x4_t __a) {
-  return __builtin_arm_sxtb16(__a);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__uxtab16(int16x2_t __a, int8x4_t __b) {
-  return __builtin_arm_uxtab16(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__uxtb16(int8x4_t __a) {
-  return __builtin_arm_uxtb16(__a);
-}
-#endif
-
-/* 8.5.6 Parallel selection */
-#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__sel(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_sel(__a, __b);
-}
-#endif
-
-/* 8.5.7 Parallel 8-bit addition and subtraction */
-#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__qadd8(int8x4_t __a, int8x4_t __b) {
-  return __builtin_arm_qadd8(__a, __b);
-}
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__qsub8(int8x4_t __a, int8x4_t __b) {
-  return __builtin_arm_qsub8(__a, __b);
-}
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__sadd8(int8x4_t __a, int8x4_t __b) {
-  return __builtin_arm_sadd8(__a, __b);
-}
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__shadd8(int8x4_t __a, int8x4_t __b) {
-  return __builtin_arm_shadd8(__a, __b);
-}
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__shsub8(int8x4_t __a, int8x4_t __b) {
-  return __builtin_arm_shsub8(__a, __b);
-}
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__ssub8(int8x4_t __a, int8x4_t __b) {
-  return __builtin_arm_ssub8(__a, __b);
-}
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__uadd8(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_uadd8(__a, __b);
-}
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__uhadd8(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_uhadd8(__a, __b);
-}
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__uhsub8(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_uhsub8(__a, __b);
-}
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__uqadd8(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_uqadd8(__a, __b);
-}
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__uqsub8(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_uqsub8(__a, __b);
-}
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__usub8(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_usub8(__a, __b);
-}
-#endif
-
-/* 8.5.8 Sum of 8-bit absolute differences */
-#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-__usad8(uint8x4_t __a, uint8x4_t __b) {
-  return __builtin_arm_usad8(__a, __b);
-}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
-__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
-  return __builtin_arm_usada8(__a, __b, __c);
-}
-#endif
-
-/* 8.5.9 Parallel 16-bit addition and subtraction */
-#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__qadd16(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_qadd16(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__qasx(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_qasx(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__qsax(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_qsax(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__qsub16(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_qsub16(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__sadd16(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_sadd16(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__sasx(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_sasx(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__shadd16(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_shadd16(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__shasx(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_shasx(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__shsax(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_shsax(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__shsub16(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_shsub16(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__ssax(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_ssax(__a, __b);
-}
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__ssub16(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_ssub16(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uadd16(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uadd16(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uasx(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uasx(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uhadd16(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uhadd16(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uhasx(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uhasx(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uhsax(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uhsax(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uhsub16(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uhsub16(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uqadd16(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uqadd16(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uqasx(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uqasx(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uqsax(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uqsax(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__uqsub16(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_uqsub16(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__usax(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_usax(__a, __b);
-}
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__usub16(uint16x2_t __a, uint16x2_t __b) {
-  return __builtin_arm_usub16(__a, __b);
-}
-#endif
-
-/* 8.5.10 Parallel 16-bit multiplication */
-#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
-  return __builtin_arm_smlad(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
-  return __builtin_arm_smladx(__a, __b, __c);
-}
-static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
-__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
-  return __builtin_arm_smlald(__a, __b, __c);
-}
-static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
-__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
-  return __builtin_arm_smlaldx(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
-  return __builtin_arm_smlsd(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
-  return __builtin_arm_smlsdx(__a, __b, __c);
-}
-static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
-__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
-  return __builtin_arm_smlsld(__a, __b, __c);
-}
-static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
-__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
-  return __builtin_arm_smlsldx(__a, __b, __c);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smuad(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_smuad(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smuadx(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_smuadx(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smusd(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_smusd(__a, __b);
-}
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
-__smusdx(int16x2_t __a, int16x2_t __b) {
-  return __builtin_arm_smusdx(__a, __b);
-}
-#endif
-
-/* 8.6 Floating-point data-processing intrinsics */
-#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING)    &&                         \
-  (__ARM_FEATURE_DIRECTED_ROUNDING))             &&                         \
-  (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__rintn(double __a) {
-  return __builtin_roundeven(__a);
-}
-
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__rintnf(float __a) {
-  return __builtin_roundevenf(__a);
-}
-#endif
-
-/* 8.8 CRC32 intrinsics */
-#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) ||                   \
-    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32b(uint32_t __a, uint8_t __b) {
-  return __builtin_arm_crc32b(__a, __b);
-}
-
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32h(uint32_t __a, uint16_t __b) {
-  return __builtin_arm_crc32h(__a, __b);
-}
-
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32w(uint32_t __a, uint32_t __b) {
-  return __builtin_arm_crc32w(__a, __b);
-}
-
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32d(uint32_t __a, uint64_t __b) {
-  return __builtin_arm_crc32d(__a, __b);
-}
-
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32cb(uint32_t __a, uint8_t __b) {
-  return __builtin_arm_crc32cb(__a, __b);
-}
-
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32ch(uint32_t __a, uint16_t __b) {
-  return __builtin_arm_crc32ch(__a, __b);
-}
-
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32cw(uint32_t __a, uint32_t __b) {
-  return __builtin_arm_crc32cw(__a, __b);
-}
-
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
-__crc32cd(uint32_t __a, uint64_t __b) {
-  return __builtin_arm_crc32cd(__a, __b);
-}
-#endif
-
-/* 8.6 Floating-point data-processing intrinsics */
-/* Armv8.3-A Javascript conversion intrinsic */
-#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
-__jcvt(double __a) {
-  return __builtin_arm_jcvt(__a);
-}
-#endif
-
-/* Armv8.5-A FP rounding intrinsics */
-#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint32zf(float __a) {
-  return __builtin_arm_rint32zf(__a);
-}
-
-static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint32z(double __a) {
-  return __builtin_arm_rint32z(__a);
-}
-
-static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint64zf(float __a) {
-  return __builtin_arm_rint64zf(__a);
-}
-
-static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint64z(double __a) {
-  return __builtin_arm_rint64z(__a);
-}
-
-static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint32xf(float __a) {
-  return __builtin_arm_rint32xf(__a);
-}
-
-static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint32x(double __a) {
-  return __builtin_arm_rint32x(__a);
-}
-
-static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint64xf(float __a) {
-  return __builtin_arm_rint64xf(__a);
-}
-
-static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
-__rint64x(double __a) {
-  return __builtin_arm_rint64x(__a);
-}
-#endif
-
-/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
-#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-typedef struct {
-    uint64_t val[8];
-} data512_t;
-
-static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
-__arm_ld64b(const void *__addr) {
-  data512_t __value;
-  __builtin_arm_ld64b(__addr, __value.val);
-  return __value;
-}
-static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
-__arm_st64b(void *__addr, data512_t __value) {
-  __builtin_arm_st64b(__addr, __value.val);
-}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
-__arm_st64bv(void *__addr, data512_t __value) {
-  return __builtin_arm_st64bv(__addr, __value.val);
-}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
-__arm_st64bv0(void *__addr, data512_t __value) {
-  return __builtin_arm_st64bv0(__addr, __value.val);
-}
-#endif
-
-/* 11.1 Special register intrinsics */
-#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
-#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
-#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
-#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
-#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
-#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
-#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
-#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
-#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
-#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
-#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
-#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
-
-/* 10.3 MTE intrinsics */
-#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-#define __arm_mte_create_random_tag(__ptr, __mask)  __builtin_arm_irg(__ptr, __mask)
-#define __arm_mte_increment_tag(__ptr, __tag_offset)  __builtin_arm_addg(__ptr, __tag_offset)
-#define __arm_mte_exclude_tag(__ptr, __excluded)  __builtin_arm_gmi(__ptr, __excluded)
-#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
-#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
-#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
-
-/* 18 memcpy family of operations intrinsics - MOPS */
-#define __arm_mops_memset_tag(__tagged_address, __value, __size)    \
-  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
-#endif
-
-/* 11.3 Coprocessor Intrinsics */
-#if defined(__ARM_FEATURE_COPROC)
-
-#if (__ARM_FEATURE_COPROC & 0x1)
-
-#if (__ARM_ARCH < 8)
-#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)                           \
-  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
-#endif /* __ARM_ARCH < 8 */
-
-#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
-#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)
-
-#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2)                         \
-  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
-#define __arm_mrc(coproc, opc1, CRn, CRm, opc2)                                \
-  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)
-
-#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
-#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
-#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
-#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH != 8) */
-
-#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
-#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)                           \
-  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
-#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
-#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
-#endif /* ___ARM_ARCH_8M_MAIN__ */
-
-#endif /* __ARM_FEATURE_COPROC & 0x1 */
-
-#if (__ARM_FEATURE_COPROC & 0x2)
-#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)                          \
-  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
-#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
-#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
-#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
-#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
-#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)                        \
-  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
-#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2)                               \
-  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
-#endif
-
-#if (__ARM_FEATURE_COPROC & 0x4)
-#define __arm_mcrr(coproc, opc1, value, CRm)                                   \
-  __builtin_arm_mcrr(coproc, opc1, value, CRm)
-#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
-#endif
-
-#if (__ARM_FEATURE_COPROC & 0x8)
-#define __arm_mcrr2(coproc, opc1, value, CRm)                                  \
-  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
-#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
-#endif
-
-#endif // __ARM_FEATURE_COPROC
-
-/* 17 Transactional Memory Extension (TME) Intrinsics */
-#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME
-
-#define _TMFAILURE_REASON  0x00007fffu
-#define _TMFAILURE_RTRY    0x00008000u
-#define _TMFAILURE_CNCL    0x00010000u
-#define _TMFAILURE_MEM     0x00020000u
-#define _TMFAILURE_IMP     0x00040000u
-#define _TMFAILURE_ERR     0x00080000u
-#define _TMFAILURE_SIZE    0x00100000u
-#define _TMFAILURE_NEST    0x00200000u
-#define _TMFAILURE_DBG     0x00400000u
-#define _TMFAILURE_INT     0x00800000u
-#define _TMFAILURE_TRIVIAL 0x01000000u
-
-#define __tstart()        __builtin_arm_tstart()
-#define __tcommit()       __builtin_arm_tcommit()
-#define __tcancel(__arg)  __builtin_arm_tcancel(__arg)
-#define __ttest()         __builtin_arm_ttest()
-
-#endif /* __ARM_FEATURE_TME */
-
-/* 8.7 Armv8.5-A Random number generation intrinsics */
-#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
-__rndr(uint64_t *__p) {
-  return __builtin_arm_rndr(__p);
-}
-static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
-__rndrrs(uint64_t *__p) {
-  return __builtin_arm_rndrrs(__p);
-}
-#endif
-
-/* 11.2 Guarded Control Stack intrinsics */
-#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-static __inline__ void * __attribute__((__always_inline__, __nodebug__))
-__gcspr() {
-  return (void *)__builtin_arm_rsr64("gcspr_el0");
-}
-
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
-__gcspopm() {
-  return __builtin_arm_gcspopm(0);
-}
-
-static __inline__ const void * __attribute__((__always_inline__, __nodebug__, target("gcs")))
-__gcsss(const void *__stack) {
-  return __builtin_arm_gcsss(__stack);
-}
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* __ARM_ACLE_H */
diff --git a/third_party/aarch64/clang/arm_bf16.h b/third_party/aarch64/clang/arm_bf16.h
deleted file mode 100644
index 329ae39e6..000000000
--- a/third_party/aarch64/clang/arm_bf16.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_BF16_H
-#define __ARM_BF16_H
-
-typedef __bf16 bfloat16_t;
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-
-
-#undef __ai
-
-#endif
diff --git a/third_party/aarch64/clang/arm_cde.h b/third_party/aarch64/clang/arm_cde.h
deleted file mode 100644
index 4ad5d825d..000000000
--- a/third_party/aarch64/clang/arm_cde.h
+++ /dev/null
@@ -1,410 +0,0 @@
-/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_CDE_H
-#define __ARM_CDE_H
-
-#if !__ARM_FEATURE_CDE
-#error "CDE support not enabled"
-#endif
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
-uint32_t __arm_cx1(int, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
-uint32_t __arm_cx1a(int, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
-uint64_t __arm_cx1d(int, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
-uint64_t __arm_cx1da(int, uint64_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
-uint32_t __arm_cx2(int, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
-uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
-uint64_t __arm_cx2d(int, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
-uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
-uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
-uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
-uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
-uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
-uint32_t __arm_vcx1_u32(int, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
-uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
-uint64_t __arm_vcx1d_u64(int, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
-uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
-uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
-uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
-uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
-uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
-uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
-uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
-uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
-uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
-
-#if __ARM_FEATURE_MVE
-
-typedef uint16_t mve_pred16_t;
-typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
-typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
-typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
-typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
-typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
-typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
-typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
-typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
-
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
-int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
-int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
-int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
-int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
-uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
-uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
-uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
-uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
-uint8x16_t __arm_vcx1q_u8(int, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
-int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
-int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
-int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
-int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
-uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
-uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
-uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
-uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
-int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
-int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
-int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
-int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
-uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
-uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
-uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
-uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
-int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
-int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
-int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
-int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
-uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
-uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
-uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
-uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
-int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
-int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
-int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
-int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
-uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
-uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
-uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
-uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
-uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
-uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
-uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
-uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
-uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
-uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
-uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
-uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
-int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
-int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
-int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
-int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
-uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
-uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
-uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
-uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
-int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
-int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
-int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
-int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
-uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
-uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
-uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
-uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
-int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
-int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
-int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
-int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
-uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
-uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
-uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
-uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
-int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
-int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
-int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
-int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
-uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
-uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
-uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
-uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
-uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
-uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
-uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
-uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
-uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
-uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
-uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
-uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
-int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
-int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
-int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
-int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
-uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
-uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
-uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
-uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
-int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
-int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
-int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
-int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
-uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
-uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
-uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
-uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
-int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
-int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
-int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
-int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
-uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
-uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
-uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
-uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
-uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
-uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
-uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
-uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
-uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
-uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
-uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
-#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
-#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
-#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
-#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
-#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
-#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
-#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
-#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
-
-#endif /* __ARM_FEATURE_MVE */
-
-#if __ARM_FEATURE_MVE & 2
-
-typedef __fp16 float16_t;
-typedef float float32_t;
-typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
-typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
-
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
-float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
-float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
-float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
-float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
-float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
-float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
-float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
-float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
-float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
-float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
-uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
-uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
-float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
-float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
-float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
-float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
-float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
-float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
-float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
-float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
-uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
-uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
-float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
-float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
-float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
-float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
-float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
-float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
-uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
-uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
-
-#endif /* __ARM_FEATURE_MVE & 2 */
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* __ARM_CDE_H */
diff --git a/third_party/aarch64/clang/arm_cmse.h b/third_party/aarch64/clang/arm_cmse.h
deleted file mode 100644
index ecf50ecc5..000000000
--- a/third_party/aarch64/clang/arm_cmse.h
+++ /dev/null
@@ -1,217 +0,0 @@
-//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __ARM_CMSE_H
-#define __ARM_CMSE_H
-
-#if (__ARM_FEATURE_CMSE & 0x1)
-#include <stddef.h>
-#include <stdint.h>
-
-#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
-#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
-#define CMSE_AU_NONSECURE  2 /* checks if permissions have secure field unset */
-#define CMSE_MPU_UNPRIV    4 /* sets T flag on TT insrtuction */
-#define CMSE_MPU_READ      8 /* checks if read_ok field is set */
-#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
-#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)
-
-#define cmse_check_pointed_object(p, f) \
-  cmse_check_address_range((p), sizeof(*(p)), (f))
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-typedef union {
-  struct cmse_address_info {
-#ifdef __ARM_BIG_ENDIAN
-    /* __ARM_BIG_ENDIAN */
-#if (__ARM_CMSE_SECURE_MODE)
-    unsigned idau_region : 8;
-    unsigned idau_region_valid : 1;
-    unsigned secure : 1;
-    unsigned nonsecure_readwrite_ok : 1;
-    unsigned nonsecure_read_ok : 1;
-#else
-    unsigned : 12;
-#endif
-    unsigned readwrite_ok : 1;
-    unsigned read_ok : 1;
-#if (__ARM_CMSE_SECURE_MODE)
-    unsigned sau_region_valid : 1;
-#else
-    unsigned : 1;
-#endif
-    unsigned mpu_region_valid : 1;
-#if (__ARM_CMSE_SECURE_MODE)
-    unsigned sau_region : 8;
-#else
-    unsigned : 8;
-#endif
-    unsigned mpu_region : 8;
-
-#else /* __ARM_LITTLE_ENDIAN */
-    unsigned mpu_region : 8;
-#if (__ARM_CMSE_SECURE_MODE)
-    unsigned sau_region : 8;
-#else
-    unsigned : 8;
-#endif
-    unsigned mpu_region_valid : 1;
-#if (__ARM_CMSE_SECURE_MODE)
-    unsigned sau_region_valid : 1;
-#else
-    unsigned : 1;
-#endif
-    unsigned read_ok : 1;
-    unsigned readwrite_ok : 1;
-#if (__ARM_CMSE_SECURE_MODE)
-    unsigned nonsecure_read_ok : 1;
-    unsigned nonsecure_readwrite_ok : 1;
-    unsigned secure : 1;
-    unsigned idau_region_valid : 1;
-    unsigned idau_region : 8;
-#else
-    unsigned : 12;
-#endif
-#endif /*__ARM_LITTLE_ENDIAN */
-  } flags;
-  unsigned value;
-} cmse_address_info_t;
-
-static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
-cmse_TT(void *__p) {
-  cmse_address_info_t __u;
-  __u.value = __builtin_arm_cmse_TT(__p);
-  return __u;
-}
-static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
-cmse_TTT(void *__p) {
-  cmse_address_info_t __u;
-  __u.value = __builtin_arm_cmse_TTT(__p);
-  return __u;
-}
-
-#if __ARM_CMSE_SECURE_MODE
-static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
-cmse_TTA(void *__p) {
-  cmse_address_info_t __u;
-  __u.value = __builtin_arm_cmse_TTA(__p);
-  return __u;
-}
-static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
-cmse_TTAT(void *__p) {
-  cmse_address_info_t __u;
-  __u.value = __builtin_arm_cmse_TTAT(__p);
-  return __u;
-}
-#endif
-
-#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
-#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))
-
-#if __ARM_CMSE_SECURE_MODE
-#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
-#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
-#endif
-
-static void *__attribute__((__always_inline__))
-cmse_check_address_range(void *__pb, size_t __s, int __flags) {
-  uintptr_t __begin = (uintptr_t)__pb;
-  uintptr_t __end = __begin + __s - 1;
-
-  if (__end < __begin)
-    return NULL; /* wrap around check */
-
-  /* Check whether the range crosses a 32-bytes aligned address */
-  const int __single_check = (__begin ^ __end) < 0x20u;
-
-  /* execute the right variant of the TT instructions */
-  void *__pe = (void *)__end;
-  cmse_address_info_t __permb, __perme;
-  switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
-  case 0:
-    __permb = cmse_TT(__pb);
-    __perme = __single_check ? __permb : cmse_TT(__pe);
-    break;
-  case CMSE_MPU_UNPRIV:
-    __permb = cmse_TTT(__pb);
-    __perme = __single_check ? __permb : cmse_TTT(__pe);
-    break;
-#if __ARM_CMSE_SECURE_MODE
-  case CMSE_MPU_NONSECURE:
-    __permb = cmse_TTA(__pb);
-    __perme = __single_check ? __permb : cmse_TTA(__pe);
-    break;
-  case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
-    __permb = cmse_TTAT(__pb);
-    __perme = __single_check ? __permb : cmse_TTAT(__pe);
-    break;
-#endif
-  /* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
-  default:
-    return NULL;
-  }
-
-  /* check that the range does not cross MPU, SAU, or IDAU region boundaries */
-  if (__permb.value != __perme.value)
-    return NULL;
-#if !(__ARM_CMSE_SECURE_MODE)
-  /* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
-  if (__flags & CMSE_AU_NONSECURE)
-    return NULL;
-#endif
-
-  /* check the permission on the range */
-  switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
-#if (__ARM_CMSE_SECURE_MODE)
-  case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
-  case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
-    return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;
-
-  case CMSE_MPU_READ | CMSE_AU_NONSECURE:
-    return __permb.flags.nonsecure_read_ok ? __pb : NULL;
-
-  case CMSE_AU_NONSECURE:
-    return __permb.flags.secure ? NULL : __pb;
-#endif
-  case CMSE_MPU_READ | CMSE_MPU_READWRITE:
-  case CMSE_MPU_READWRITE:
-    return __permb.flags.readwrite_ok ? __pb : NULL;
-
-  case CMSE_MPU_READ:
-    return __permb.flags.read_ok ? __pb : NULL;
-
-  default:
-    return NULL;
-  }
-}
-
-#if __ARM_CMSE_SECURE_MODE
-static int __attribute__((__always_inline__, __nodebug__))
-cmse_nonsecure_caller(void) {
-  return !((uintptr_t)__builtin_return_address(0) & 1);
-}
-
-#define cmse_nsfptr_create(p)                                                  \
-  __builtin_bit_cast(__typeof__(p),                                            \
-                     (__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))
-
-#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)
-
-#endif /* __ARM_CMSE_SECURE_MODE */
-
-void __attribute__((__noreturn__)) cmse_abort(void);
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* (__ARM_FEATURE_CMSE & 0x1) */
-
-#endif /* __ARM_CMSE_H */
diff --git a/third_party/aarch64/clang/arm_fp16.h b/third_party/aarch64/clang/arm_fp16.h
deleted file mode 100644
index 2dd0653ab..000000000
--- a/third_party/aarch64/clang/arm_fp16.h
+++ /dev/null
@@ -1,596 +0,0 @@
-/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_FP16_H
-#define __ARM_FP16_H
-
-#include <stdint.h>
-
-typedef __fp16 float16_t;
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-
-#if defined(__aarch64__) || defined(__arm64ec__)
-#define vabdh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vabsh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
-  __ret; \
-})
-#define vaddh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vcageh_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vcagth_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
-  __ret; \
-})
-#define vcaleh_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vcalth_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
-  __ret; \
-})
-#define vceqh_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vceqzh_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
-  __ret; \
-})
-#define vcgeh_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vcgezh_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
-  __ret; \
-})
-#define vcgth_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
-  __ret; \
-})
-#define vcgtzh_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
-  __ret; \
-})
-#define vcleh_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vclezh_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
-  __ret; \
-})
-#define vclth_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
-  __ret; \
-})
-#define vcltzh_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
-  __ret; \
-})
-#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_s16_f16(__p0) __extension__ ({ \
-  int16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
-  __ret; \
-})
-#define vcvth_s32_f16(__p0) __extension__ ({ \
-  int32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
-  __ret; \
-})
-#define vcvth_s64_f16(__p0) __extension__ ({ \
-  int64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
-  __ret; \
-})
-#define vcvth_u16_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
-  __ret; \
-})
-#define vcvth_u32_f16(__p0) __extension__ ({ \
-  uint32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
-  __ret; \
-})
-#define vcvth_u64_f16(__p0) __extension__ ({ \
-  uint64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
-  __ret; \
-})
-#define vcvtah_s16_f16(__p0) __extension__ ({ \
-  int16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
-  __ret; \
-})
-#define vcvtah_s32_f16(__p0) __extension__ ({ \
-  int32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
-  __ret; \
-})
-#define vcvtah_s64_f16(__p0) __extension__ ({ \
-  int64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
-  __ret; \
-})
-#define vcvtah_u16_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
-  __ret; \
-})
-#define vcvtah_u32_f16(__p0) __extension__ ({ \
-  uint32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
-  __ret; \
-})
-#define vcvtah_u64_f16(__p0) __extension__ ({ \
-  uint64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
-  __ret; \
-})
-#define vcvth_f16_u16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  uint16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_f16_u16(__s0); \
-  __ret; \
-})
-#define vcvth_f16_s16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_f16_s16(__s0); \
-  __ret; \
-})
-#define vcvth_f16_u32(__p0) __extension__ ({ \
-  float16_t __ret; \
-  uint32_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_f16_u32(__s0); \
-  __ret; \
-})
-#define vcvth_f16_s32(__p0) __extension__ ({ \
-  float16_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_f16_s32(__s0); \
-  __ret; \
-})
-#define vcvth_f16_u64(__p0) __extension__ ({ \
-  float16_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_f16_u64(__s0); \
-  __ret; \
-})
-#define vcvth_f16_s64(__p0) __extension__ ({ \
-  float16_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_f16_s64(__s0); \
-  __ret; \
-})
-#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  uint32_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  uint16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
-  __ret; \
-})
-#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
-  __ret; \
-})
-#define vcvtmh_s16_f16(__p0) __extension__ ({ \
-  int16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
-  __ret; \
-})
-#define vcvtmh_s32_f16(__p0) __extension__ ({ \
-  int32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
-  __ret; \
-})
-#define vcvtmh_s64_f16(__p0) __extension__ ({ \
-  int64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
-  __ret; \
-})
-#define vcvtmh_u16_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
-  __ret; \
-})
-#define vcvtmh_u32_f16(__p0) __extension__ ({ \
-  uint32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
-  __ret; \
-})
-#define vcvtmh_u64_f16(__p0) __extension__ ({ \
-  uint64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
-  __ret; \
-})
-#define vcvtnh_s16_f16(__p0) __extension__ ({ \
-  int16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
-  __ret; \
-})
-#define vcvtnh_s32_f16(__p0) __extension__ ({ \
-  int32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
-  __ret; \
-})
-#define vcvtnh_s64_f16(__p0) __extension__ ({ \
-  int64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
-  __ret; \
-})
-#define vcvtnh_u16_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
-  __ret; \
-})
-#define vcvtnh_u32_f16(__p0) __extension__ ({ \
-  uint32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
-  __ret; \
-})
-#define vcvtnh_u64_f16(__p0) __extension__ ({ \
-  uint64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
-  __ret; \
-})
-#define vcvtph_s16_f16(__p0) __extension__ ({ \
-  int16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
-  __ret; \
-})
-#define vcvtph_s32_f16(__p0) __extension__ ({ \
-  int32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
-  __ret; \
-})
-#define vcvtph_s64_f16(__p0) __extension__ ({ \
-  int64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
-  __ret; \
-})
-#define vcvtph_u16_f16(__p0) __extension__ ({ \
-  uint16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
-  __ret; \
-})
-#define vcvtph_u32_f16(__p0) __extension__ ({ \
-  uint32_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
-  __ret; \
-})
-#define vcvtph_u64_f16(__p0) __extension__ ({ \
-  uint64_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
-  __ret; \
-})
-#define vdivh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  __ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
-  __ret; \
-})
-#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  __ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
-  __ret; \
-})
-#define vmaxh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vminh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vminnmh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vmulh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vmulxh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vnegh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
-  __ret; \
-})
-#define vrecpeh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
-  __ret; \
-})
-#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vrecpxh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
-  __ret; \
-})
-#define vrndh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
-  __ret; \
-})
-#define vrndah_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
-  __ret; \
-})
-#define vrndih_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
-  __ret; \
-})
-#define vrndmh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
-  __ret; \
-})
-#define vrndnh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
-  __ret; \
-})
-#define vrndph_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
-  __ret; \
-})
-#define vrndxh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
-  __ret; \
-})
-#define vrsqrteh_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
-  __ret; \
-})
-#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
-  __ret; \
-})
-#define vsqrth_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
-  __ret; \
-})
-#define vsubh_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
-  __ret; \
-})
-#endif
-
-#undef __ai
-
-#endif /* __ARM_FP16_H */
diff --git a/third_party/aarch64/clang/arm_mve.h b/third_party/aarch64/clang/arm_mve.h
deleted file mode 100644
index 4da41dc3c..000000000
--- a/third_party/aarch64/clang/arm_mve.h
+++ /dev/null
@@ -1,19187 +0,0 @@
-/*===---- arm_mve.h - ARM MVE intrinsics -----------------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_MVE_H
-#define __ARM_MVE_H
-
-#if !__ARM_FEATURE_MVE
-#error "MVE support not enabled"
-#endif
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef uint16_t mve_pred16_t;
-typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
-typedef struct { int16x8_t val[2]; } int16x8x2_t;
-typedef struct { int16x8_t val[4]; } int16x8x4_t;
-typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
-typedef struct { int32x4_t val[2]; } int32x4x2_t;
-typedef struct { int32x4_t val[4]; } int32x4x4_t;
-typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
-typedef struct { int64x2_t val[2]; } int64x2x2_t;
-typedef struct { int64x2_t val[4]; } int64x2x4_t;
-typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
-typedef struct { int8x16_t val[2]; } int8x16x2_t;
-typedef struct { int8x16_t val[4]; } int8x16x4_t;
-typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
-typedef struct { uint16x8_t val[2]; } uint16x8x2_t;
-typedef struct { uint16x8_t val[4]; } uint16x8x4_t;
-typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
-typedef struct { uint32x4_t val[2]; } uint32x4x2_t;
-typedef struct { uint32x4_t val[4]; } uint32x4x4_t;
-typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
-typedef struct { uint64x2_t val[2]; } uint64x2x2_t;
-typedef struct { uint64x2_t val[4]; } uint64x2x4_t;
-typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
-typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
-typedef struct { uint8x16_t val[4]; } uint8x16x4_t;
-
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_asrl)))
-int64_t __arm_asrl(int64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_lsll)))
-uint64_t __arm_lsll(uint64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqrshr)))
-int32_t __arm_sqrshr(int32_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqrshrl)))
-int64_t __arm_sqrshrl(int64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqrshrl_sat48)))
-int64_t __arm_sqrshrl_sat48(int64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqshl)))
-int32_t __arm_sqshl(int32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqshll)))
-int64_t __arm_sqshll(int64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_srshr)))
-int32_t __arm_srshr(int32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_srshrl)))
-int64_t __arm_srshrl(int64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqrshl)))
-uint32_t __arm_uqrshl(uint32_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqrshll)))
-uint64_t __arm_uqrshll(uint64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqrshll_sat48)))
-uint64_t __arm_uqrshll_sat48(uint64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqshl)))
-uint32_t __arm_uqshl(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqshll)))
-uint64_t __arm_uqshll(uint64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_urshr)))
-uint32_t __arm_urshr(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_urshrl)))
-uint64_t __arm_urshrl(uint64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s16)))
-uint32_t __arm_vabavq_p_s16(uint32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s16)))
-uint32_t __arm_vabavq_p(uint32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s32)))
-uint32_t __arm_vabavq_p_s32(uint32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s32)))
-uint32_t __arm_vabavq_p(uint32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s8)))
-uint32_t __arm_vabavq_p_s8(uint32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s8)))
-uint32_t __arm_vabavq_p(uint32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u16)))
-uint32_t __arm_vabavq_p_u16(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u16)))
-uint32_t __arm_vabavq_p(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u32)))
-uint32_t __arm_vabavq_p_u32(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u32)))
-uint32_t __arm_vabavq_p(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u8)))
-uint32_t __arm_vabavq_p_u8(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u8)))
-uint32_t __arm_vabavq_p(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s16)))
-uint32_t __arm_vabavq_s16(uint32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s16)))
-uint32_t __arm_vabavq(uint32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s32)))
-uint32_t __arm_vabavq_s32(uint32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s32)))
-uint32_t __arm_vabavq(uint32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s8)))
-uint32_t __arm_vabavq_s8(uint32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s8)))
-uint32_t __arm_vabavq(uint32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u16)))
-uint32_t __arm_vabavq_u16(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u16)))
-uint32_t __arm_vabavq(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u32)))
-uint32_t __arm_vabavq_u32(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u32)))
-uint32_t __arm_vabavq(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u8)))
-uint32_t __arm_vabavq_u8(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u8)))
-uint32_t __arm_vabavq(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s16)))
-int16x8_t __arm_vabdq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s16)))
-int16x8_t __arm_vabdq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s32)))
-int32x4_t __arm_vabdq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s32)))
-int32x4_t __arm_vabdq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s8)))
-int8x16_t __arm_vabdq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s8)))
-int8x16_t __arm_vabdq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u16)))
-uint16x8_t __arm_vabdq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u16)))
-uint16x8_t __arm_vabdq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u32)))
-uint32x4_t __arm_vabdq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u32)))
-uint32x4_t __arm_vabdq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u8)))
-uint8x16_t __arm_vabdq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u8)))
-uint8x16_t __arm_vabdq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s16)))
-int16x8_t __arm_vabdq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s16)))
-int16x8_t __arm_vabdq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s32)))
-int32x4_t __arm_vabdq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s32)))
-int32x4_t __arm_vabdq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s8)))
-int8x16_t __arm_vabdq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s8)))
-int8x16_t __arm_vabdq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u16)))
-uint16x8_t __arm_vabdq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u16)))
-uint16x8_t __arm_vabdq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u32)))
-uint32x4_t __arm_vabdq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u32)))
-uint32x4_t __arm_vabdq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u8)))
-uint8x16_t __arm_vabdq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u8)))
-uint8x16_t __arm_vabdq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s16)))
-int16x8_t __arm_vabdq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s16)))
-int16x8_t __arm_vabdq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s32)))
-int32x4_t __arm_vabdq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s32)))
-int32x4_t __arm_vabdq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s8)))
-int8x16_t __arm_vabdq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s8)))
-int8x16_t __arm_vabdq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u16)))
-uint16x8_t __arm_vabdq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u16)))
-uint16x8_t __arm_vabdq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u32)))
-uint32x4_t __arm_vabdq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u32)))
-uint32x4_t __arm_vabdq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u8)))
-uint8x16_t __arm_vabdq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u8)))
-uint8x16_t __arm_vabdq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s16)))
-int16x8_t __arm_vabsq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s16)))
-int16x8_t __arm_vabsq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s32)))
-int32x4_t __arm_vabsq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s32)))
-int32x4_t __arm_vabsq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s8)))
-int8x16_t __arm_vabsq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s8)))
-int8x16_t __arm_vabsq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s16)))
-int16x8_t __arm_vabsq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s16)))
-int16x8_t __arm_vabsq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s32)))
-int32x4_t __arm_vabsq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s32)))
-int32x4_t __arm_vabsq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s8)))
-int8x16_t __arm_vabsq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s8)))
-int8x16_t __arm_vabsq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s16)))
-int16x8_t __arm_vabsq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s16)))
-int16x8_t __arm_vabsq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s32)))
-int32x4_t __arm_vabsq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s32)))
-int32x4_t __arm_vabsq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s8)))
-int8x16_t __arm_vabsq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s8)))
-int8x16_t __arm_vabsq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_s32)))
-int32x4_t __arm_vadciq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_s32)))
-int32x4_t __arm_vadciq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_u32)))
-uint32x4_t __arm_vadciq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_u32)))
-uint32x4_t __arm_vadciq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_s32)))
-int32x4_t __arm_vadciq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_s32)))
-int32x4_t __arm_vadciq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_u32)))
-uint32x4_t __arm_vadciq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_u32)))
-uint32x4_t __arm_vadciq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_s32)))
-int32x4_t __arm_vadcq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_s32)))
-int32x4_t __arm_vadcq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_u32)))
-uint32x4_t __arm_vadcq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_u32)))
-uint32x4_t __arm_vadcq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_s32)))
-int32x4_t __arm_vadcq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_s32)))
-int32x4_t __arm_vadcq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_u32)))
-uint32x4_t __arm_vadcq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_u32)))
-uint32x4_t __arm_vadcq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_s32)))
-int64_t __arm_vaddlvaq_p_s32(int64_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_s32)))
-int64_t __arm_vaddlvaq_p(int64_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_u32)))
-uint64_t __arm_vaddlvaq_p_u32(uint64_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_u32)))
-uint64_t __arm_vaddlvaq_p(uint64_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_s32)))
-int64_t __arm_vaddlvaq_s32(int64_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_s32)))
-int64_t __arm_vaddlvaq(int64_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_u32)))
-uint64_t __arm_vaddlvaq_u32(uint64_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_u32)))
-uint64_t __arm_vaddlvaq(uint64_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_s32)))
-int64_t __arm_vaddlvq_p_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_s32)))
-int64_t __arm_vaddlvq_p(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_u32)))
-uint64_t __arm_vaddlvq_p_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_u32)))
-uint64_t __arm_vaddlvq_p(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_s32)))
-int64_t __arm_vaddlvq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_s32)))
-int64_t __arm_vaddlvq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_u32)))
-uint64_t __arm_vaddlvq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_u32)))
-uint64_t __arm_vaddlvq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s16)))
-int16x8_t __arm_vaddq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s16)))
-int16x8_t __arm_vaddq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s32)))
-int32x4_t __arm_vaddq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s32)))
-int32x4_t __arm_vaddq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s8)))
-int8x16_t __arm_vaddq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s8)))
-int8x16_t __arm_vaddq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u16)))
-uint16x8_t __arm_vaddq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u16)))
-uint16x8_t __arm_vaddq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u32)))
-uint32x4_t __arm_vaddq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u32)))
-uint32x4_t __arm_vaddq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u8)))
-uint8x16_t __arm_vaddq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u8)))
-uint8x16_t __arm_vaddq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s16)))
-int16x8_t __arm_vaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s16)))
-int16x8_t __arm_vaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s32)))
-int32x4_t __arm_vaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s32)))
-int32x4_t __arm_vaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s8)))
-int8x16_t __arm_vaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s8)))
-int8x16_t __arm_vaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u16)))
-uint16x8_t __arm_vaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u16)))
-uint16x8_t __arm_vaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u32)))
-uint32x4_t __arm_vaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u32)))
-uint32x4_t __arm_vaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u8)))
-uint8x16_t __arm_vaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u8)))
-uint8x16_t __arm_vaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s16)))
-int16x8_t __arm_vaddq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s16)))
-int16x8_t __arm_vaddq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s32)))
-int32x4_t __arm_vaddq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s32)))
-int32x4_t __arm_vaddq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s8)))
-int8x16_t __arm_vaddq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s8)))
-int8x16_t __arm_vaddq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u16)))
-uint16x8_t __arm_vaddq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u16)))
-uint16x8_t __arm_vaddq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u32)))
-uint32x4_t __arm_vaddq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u32)))
-uint32x4_t __arm_vaddq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u8)))
-uint8x16_t __arm_vaddq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u8)))
-uint8x16_t __arm_vaddq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s16)))
-int16x8_t __arm_vaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s16)))
-int16x8_t __arm_vaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s32)))
-int32x4_t __arm_vaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s32)))
-int32x4_t __arm_vaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s8)))
-int8x16_t __arm_vaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s8)))
-int8x16_t __arm_vaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u16)))
-uint16x8_t __arm_vaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u16)))
-uint16x8_t __arm_vaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u32)))
-uint32x4_t __arm_vaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u32)))
-uint32x4_t __arm_vaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u8)))
-uint8x16_t __arm_vaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u8)))
-uint8x16_t __arm_vaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s16)))
-int16x8_t __arm_vaddq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s16)))
-int16x8_t __arm_vaddq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s32)))
-int32x4_t __arm_vaddq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s32)))
-int32x4_t __arm_vaddq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s8)))
-int8x16_t __arm_vaddq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s8)))
-int8x16_t __arm_vaddq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u16)))
-uint16x8_t __arm_vaddq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u16)))
-uint16x8_t __arm_vaddq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u32)))
-uint32x4_t __arm_vaddq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u32)))
-uint32x4_t __arm_vaddq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u8)))
-uint8x16_t __arm_vaddq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u8)))
-uint8x16_t __arm_vaddq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s16)))
-int16x8_t __arm_vaddq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s16)))
-int16x8_t __arm_vaddq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s32)))
-int32x4_t __arm_vaddq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s32)))
-int32x4_t __arm_vaddq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s8)))
-int8x16_t __arm_vaddq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s8)))
-int8x16_t __arm_vaddq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u16)))
-uint16x8_t __arm_vaddq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u16)))
-uint16x8_t __arm_vaddq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u32)))
-uint32x4_t __arm_vaddq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u32)))
-uint32x4_t __arm_vaddq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u8)))
-uint8x16_t __arm_vaddq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u8)))
-uint8x16_t __arm_vaddq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s16)))
-int32_t __arm_vaddvaq_p_s16(int32_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s16)))
-int32_t __arm_vaddvaq_p(int32_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s32)))
-int32_t __arm_vaddvaq_p_s32(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s32)))
-int32_t __arm_vaddvaq_p(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s8)))
-int32_t __arm_vaddvaq_p_s8(int32_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s8)))
-int32_t __arm_vaddvaq_p(int32_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u16)))
-uint32_t __arm_vaddvaq_p_u16(uint32_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u16)))
-uint32_t __arm_vaddvaq_p(uint32_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u32)))
-uint32_t __arm_vaddvaq_p_u32(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u32)))
-uint32_t __arm_vaddvaq_p(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u8)))
-uint32_t __arm_vaddvaq_p_u8(uint32_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u8)))
-uint32_t __arm_vaddvaq_p(uint32_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s16)))
-int32_t __arm_vaddvaq_s16(int32_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s16)))
-int32_t __arm_vaddvaq(int32_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s32)))
-int32_t __arm_vaddvaq_s32(int32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s32)))
-int32_t __arm_vaddvaq(int32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s8)))
-int32_t __arm_vaddvaq_s8(int32_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s8)))
-int32_t __arm_vaddvaq(int32_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u16)))
-uint32_t __arm_vaddvaq_u16(uint32_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u16)))
-uint32_t __arm_vaddvaq(uint32_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u32)))
-uint32_t __arm_vaddvaq_u32(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u32)))
-uint32_t __arm_vaddvaq(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u8)))
-uint32_t __arm_vaddvaq_u8(uint32_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u8)))
-uint32_t __arm_vaddvaq(uint32_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s16)))
-int32_t __arm_vaddvq_p_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s16)))
-int32_t __arm_vaddvq_p(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s32)))
-int32_t __arm_vaddvq_p_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s32)))
-int32_t __arm_vaddvq_p(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s8)))
-int32_t __arm_vaddvq_p_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s8)))
-int32_t __arm_vaddvq_p(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u16)))
-uint32_t __arm_vaddvq_p_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u16)))
-uint32_t __arm_vaddvq_p(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u32)))
-uint32_t __arm_vaddvq_p_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u32)))
-uint32_t __arm_vaddvq_p(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u8)))
-uint32_t __arm_vaddvq_p_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u8)))
-uint32_t __arm_vaddvq_p(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s16)))
-int32_t __arm_vaddvq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s16)))
-int32_t __arm_vaddvq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s32)))
-int32_t __arm_vaddvq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s32)))
-int32_t __arm_vaddvq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s8)))
-int32_t __arm_vaddvq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s8)))
-int32_t __arm_vaddvq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u16)))
-uint32_t __arm_vaddvq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u16)))
-uint32_t __arm_vaddvq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u32)))
-uint32_t __arm_vaddvq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u32)))
-uint32_t __arm_vaddvq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u8)))
-uint32_t __arm_vaddvq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u8)))
-uint32_t __arm_vaddvq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s16)))
-int16x8_t __arm_vandq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s16)))
-int16x8_t __arm_vandq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s32)))
-int32x4_t __arm_vandq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s32)))
-int32x4_t __arm_vandq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s8)))
-int8x16_t __arm_vandq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s8)))
-int8x16_t __arm_vandq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u16)))
-uint16x8_t __arm_vandq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u16)))
-uint16x8_t __arm_vandq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u32)))
-uint32x4_t __arm_vandq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u32)))
-uint32x4_t __arm_vandq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u8)))
-uint8x16_t __arm_vandq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u8)))
-uint8x16_t __arm_vandq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_s16)))
-int16x8_t __arm_vandq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_s16)))
-int16x8_t __arm_vandq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_s32)))
-int32x4_t __arm_vandq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_s32)))
-int32x4_t __arm_vandq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_s8)))
-int8x16_t __arm_vandq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_s8)))
-int8x16_t __arm_vandq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_u16)))
-uint16x8_t __arm_vandq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_u16)))
-uint16x8_t __arm_vandq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_u32)))
-uint32x4_t __arm_vandq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_u32)))
-uint32x4_t __arm_vandq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_u8)))
-uint8x16_t __arm_vandq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_u8)))
-uint8x16_t __arm_vandq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s16)))
-int16x8_t __arm_vandq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s16)))
-int16x8_t __arm_vandq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s32)))
-int32x4_t __arm_vandq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s32)))
-int32x4_t __arm_vandq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s8)))
-int8x16_t __arm_vandq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s8)))
-int8x16_t __arm_vandq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u16)))
-uint16x8_t __arm_vandq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u16)))
-uint16x8_t __arm_vandq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u32)))
-uint32x4_t __arm_vandq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u32)))
-uint32x4_t __arm_vandq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u8)))
-uint8x16_t __arm_vandq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u8)))
-uint8x16_t __arm_vandq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s16)))
-int16x8_t __arm_vbicq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s16)))
-int16x8_t __arm_vbicq_m_n(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s32)))
-int32x4_t __arm_vbicq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s32)))
-int32x4_t __arm_vbicq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u16)))
-uint16x8_t __arm_vbicq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u16)))
-uint16x8_t __arm_vbicq_m_n(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u32)))
-uint32x4_t __arm_vbicq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u32)))
-uint32x4_t __arm_vbicq_m_n(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s16)))
-int16x8_t __arm_vbicq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s16)))
-int16x8_t __arm_vbicq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s32)))
-int32x4_t __arm_vbicq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s32)))
-int32x4_t __arm_vbicq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s8)))
-int8x16_t __arm_vbicq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s8)))
-int8x16_t __arm_vbicq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u16)))
-uint16x8_t __arm_vbicq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u16)))
-uint16x8_t __arm_vbicq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u32)))
-uint32x4_t __arm_vbicq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u32)))
-uint32x4_t __arm_vbicq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u8)))
-uint8x16_t __arm_vbicq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u8)))
-uint8x16_t __arm_vbicq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s16)))
-int16x8_t __arm_vbicq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s16)))
-int16x8_t __arm_vbicq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s32)))
-int32x4_t __arm_vbicq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s32)))
-int32x4_t __arm_vbicq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u16)))
-uint16x8_t __arm_vbicq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u16)))
-uint16x8_t __arm_vbicq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u32)))
-uint32x4_t __arm_vbicq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u32)))
-uint32x4_t __arm_vbicq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s16)))
-int16x8_t __arm_vbicq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s16)))
-int16x8_t __arm_vbicq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s32)))
-int32x4_t __arm_vbicq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s32)))
-int32x4_t __arm_vbicq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s8)))
-int8x16_t __arm_vbicq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s8)))
-int8x16_t __arm_vbicq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u16)))
-uint16x8_t __arm_vbicq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u16)))
-uint16x8_t __arm_vbicq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u32)))
-uint32x4_t __arm_vbicq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u32)))
-uint32x4_t __arm_vbicq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u8)))
-uint8x16_t __arm_vbicq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u8)))
-uint8x16_t __arm_vbicq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s16)))
-int16x8_t __arm_vbicq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s16)))
-int16x8_t __arm_vbicq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s32)))
-int32x4_t __arm_vbicq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s32)))
-int32x4_t __arm_vbicq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s8)))
-int8x16_t __arm_vbicq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s8)))
-int8x16_t __arm_vbicq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u16)))
-uint16x8_t __arm_vbicq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u16)))
-uint16x8_t __arm_vbicq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u32)))
-uint32x4_t __arm_vbicq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u32)))
-uint32x4_t __arm_vbicq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u8)))
-uint8x16_t __arm_vbicq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u8)))
-uint8x16_t __arm_vbicq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s16)))
-int16x8_t __arm_vbrsrq_m_n_s16(int16x8_t, int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s16)))
-int16x8_t __arm_vbrsrq_m(int16x8_t, int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s32)))
-int32x4_t __arm_vbrsrq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s32)))
-int32x4_t __arm_vbrsrq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s8)))
-int8x16_t __arm_vbrsrq_m_n_s8(int8x16_t, int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s8)))
-int8x16_t __arm_vbrsrq_m(int8x16_t, int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u16)))
-uint16x8_t __arm_vbrsrq_m_n_u16(uint16x8_t, uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u16)))
-uint16x8_t __arm_vbrsrq_m(uint16x8_t, uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u32)))
-uint32x4_t __arm_vbrsrq_m_n_u32(uint32x4_t, uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u32)))
-uint32x4_t __arm_vbrsrq_m(uint32x4_t, uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u8)))
-uint8x16_t __arm_vbrsrq_m_n_u8(uint8x16_t, uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u8)))
-uint8x16_t __arm_vbrsrq_m(uint8x16_t, uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s16)))
-int16x8_t __arm_vbrsrq_n_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s16)))
-int16x8_t __arm_vbrsrq(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s32)))
-int32x4_t __arm_vbrsrq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s32)))
-int32x4_t __arm_vbrsrq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s8)))
-int8x16_t __arm_vbrsrq_n_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s8)))
-int8x16_t __arm_vbrsrq(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u16)))
-uint16x8_t __arm_vbrsrq_n_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u16)))
-uint16x8_t __arm_vbrsrq(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u32)))
-uint32x4_t __arm_vbrsrq_n_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u32)))
-uint32x4_t __arm_vbrsrq(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u8)))
-uint8x16_t __arm_vbrsrq_n_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u8)))
-uint8x16_t __arm_vbrsrq(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s16)))
-int16x8_t __arm_vbrsrq_x_n_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s16)))
-int16x8_t __arm_vbrsrq_x(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s32)))
-int32x4_t __arm_vbrsrq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s32)))
-int32x4_t __arm_vbrsrq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s8)))
-int8x16_t __arm_vbrsrq_x_n_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s8)))
-int8x16_t __arm_vbrsrq_x(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u16)))
-uint16x8_t __arm_vbrsrq_x_n_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u16)))
-uint16x8_t __arm_vbrsrq_x(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u32)))
-uint32x4_t __arm_vbrsrq_x_n_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u32)))
-uint32x4_t __arm_vbrsrq_x(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u8)))
-uint8x16_t __arm_vbrsrq_x_n_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u8)))
-uint8x16_t __arm_vbrsrq_x(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s16)))
-int16x8_t __arm_vcaddq_rot270_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s16)))
-int16x8_t __arm_vcaddq_rot270_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s32)))
-int32x4_t __arm_vcaddq_rot270_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s32)))
-int32x4_t __arm_vcaddq_rot270_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s8)))
-int8x16_t __arm_vcaddq_rot270_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s8)))
-int8x16_t __arm_vcaddq_rot270_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u16)))
-uint16x8_t __arm_vcaddq_rot270_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u16)))
-uint16x8_t __arm_vcaddq_rot270_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u32)))
-uint32x4_t __arm_vcaddq_rot270_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u32)))
-uint32x4_t __arm_vcaddq_rot270_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u8)))
-uint8x16_t __arm_vcaddq_rot270_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u8)))
-uint8x16_t __arm_vcaddq_rot270_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s16)))
-int16x8_t __arm_vcaddq_rot270_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s16)))
-int16x8_t __arm_vcaddq_rot270(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s32)))
-int32x4_t __arm_vcaddq_rot270_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s32)))
-int32x4_t __arm_vcaddq_rot270(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s8)))
-int8x16_t __arm_vcaddq_rot270_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s8)))
-int8x16_t __arm_vcaddq_rot270(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u16)))
-uint16x8_t __arm_vcaddq_rot270_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u16)))
-uint16x8_t __arm_vcaddq_rot270(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u32)))
-uint32x4_t __arm_vcaddq_rot270_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u32)))
-uint32x4_t __arm_vcaddq_rot270(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u8)))
-uint8x16_t __arm_vcaddq_rot270_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u8)))
-uint8x16_t __arm_vcaddq_rot270(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s16)))
-int16x8_t __arm_vcaddq_rot270_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s16)))
-int16x8_t __arm_vcaddq_rot270_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s32)))
-int32x4_t __arm_vcaddq_rot270_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s32)))
-int32x4_t __arm_vcaddq_rot270_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s8)))
-int8x16_t __arm_vcaddq_rot270_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s8)))
-int8x16_t __arm_vcaddq_rot270_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u16)))
-uint16x8_t __arm_vcaddq_rot270_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u16)))
-uint16x8_t __arm_vcaddq_rot270_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u32)))
-uint32x4_t __arm_vcaddq_rot270_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u32)))
-uint32x4_t __arm_vcaddq_rot270_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u8)))
-uint8x16_t __arm_vcaddq_rot270_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u8)))
-uint8x16_t __arm_vcaddq_rot270_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s16)))
-int16x8_t __arm_vcaddq_rot90_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s16)))
-int16x8_t __arm_vcaddq_rot90_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s32)))
-int32x4_t __arm_vcaddq_rot90_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s32)))
-int32x4_t __arm_vcaddq_rot90_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s8)))
-int8x16_t __arm_vcaddq_rot90_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s8)))
-int8x16_t __arm_vcaddq_rot90_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u16)))
-uint16x8_t __arm_vcaddq_rot90_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u16)))
-uint16x8_t __arm_vcaddq_rot90_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u32)))
-uint32x4_t __arm_vcaddq_rot90_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u32)))
-uint32x4_t __arm_vcaddq_rot90_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u8)))
-uint8x16_t __arm_vcaddq_rot90_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u8)))
-uint8x16_t __arm_vcaddq_rot90_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s16)))
-int16x8_t __arm_vcaddq_rot90_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s16)))
-int16x8_t __arm_vcaddq_rot90(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s32)))
-int32x4_t __arm_vcaddq_rot90_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s32)))
-int32x4_t __arm_vcaddq_rot90(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s8)))
-int8x16_t __arm_vcaddq_rot90_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s8)))
-int8x16_t __arm_vcaddq_rot90(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u16)))
-uint16x8_t __arm_vcaddq_rot90_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u16)))
-uint16x8_t __arm_vcaddq_rot90(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u32)))
-uint32x4_t __arm_vcaddq_rot90_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u32)))
-uint32x4_t __arm_vcaddq_rot90(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u8)))
-uint8x16_t __arm_vcaddq_rot90_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u8)))
-uint8x16_t __arm_vcaddq_rot90(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s16)))
-int16x8_t __arm_vcaddq_rot90_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s16)))
-int16x8_t __arm_vcaddq_rot90_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s32)))
-int32x4_t __arm_vcaddq_rot90_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s32)))
-int32x4_t __arm_vcaddq_rot90_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s8)))
-int8x16_t __arm_vcaddq_rot90_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s8)))
-int8x16_t __arm_vcaddq_rot90_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u16)))
-uint16x8_t __arm_vcaddq_rot90_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u16)))
-uint16x8_t __arm_vcaddq_rot90_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u32)))
-uint32x4_t __arm_vcaddq_rot90_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u32)))
-uint32x4_t __arm_vcaddq_rot90_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u8)))
-uint8x16_t __arm_vcaddq_rot90_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u8)))
-uint8x16_t __arm_vcaddq_rot90_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s16)))
-int16x8_t __arm_vclsq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s16)))
-int16x8_t __arm_vclsq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s32)))
-int32x4_t __arm_vclsq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s32)))
-int32x4_t __arm_vclsq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s8)))
-int8x16_t __arm_vclsq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s8)))
-int8x16_t __arm_vclsq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s16)))
-int16x8_t __arm_vclsq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s16)))
-int16x8_t __arm_vclsq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s32)))
-int32x4_t __arm_vclsq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s32)))
-int32x4_t __arm_vclsq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s8)))
-int8x16_t __arm_vclsq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s8)))
-int8x16_t __arm_vclsq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s16)))
-int16x8_t __arm_vclsq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s16)))
-int16x8_t __arm_vclsq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s32)))
-int32x4_t __arm_vclsq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s32)))
-int32x4_t __arm_vclsq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s8)))
-int8x16_t __arm_vclsq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s8)))
-int8x16_t __arm_vclsq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s16)))
-int16x8_t __arm_vclzq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s16)))
-int16x8_t __arm_vclzq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s32)))
-int32x4_t __arm_vclzq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s32)))
-int32x4_t __arm_vclzq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s8)))
-int8x16_t __arm_vclzq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s8)))
-int8x16_t __arm_vclzq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u16)))
-uint16x8_t __arm_vclzq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u16)))
-uint16x8_t __arm_vclzq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u32)))
-uint32x4_t __arm_vclzq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u32)))
-uint32x4_t __arm_vclzq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u8)))
-uint8x16_t __arm_vclzq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u8)))
-uint8x16_t __arm_vclzq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s16)))
-int16x8_t __arm_vclzq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s16)))
-int16x8_t __arm_vclzq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s32)))
-int32x4_t __arm_vclzq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s32)))
-int32x4_t __arm_vclzq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s8)))
-int8x16_t __arm_vclzq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s8)))
-int8x16_t __arm_vclzq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u16)))
-uint16x8_t __arm_vclzq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u16)))
-uint16x8_t __arm_vclzq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u32)))
-uint32x4_t __arm_vclzq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u32)))
-uint32x4_t __arm_vclzq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u8)))
-uint8x16_t __arm_vclzq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u8)))
-uint8x16_t __arm_vclzq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s16)))
-int16x8_t __arm_vclzq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s16)))
-int16x8_t __arm_vclzq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s32)))
-int32x4_t __arm_vclzq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s32)))
-int32x4_t __arm_vclzq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s8)))
-int8x16_t __arm_vclzq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s8)))
-int8x16_t __arm_vclzq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u16)))
-uint16x8_t __arm_vclzq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u16)))
-uint16x8_t __arm_vclzq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u32)))
-uint32x4_t __arm_vclzq_x_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u32)))
-uint32x4_t __arm_vclzq_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u8)))
-uint8x16_t __arm_vclzq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u8)))
-uint8x16_t __arm_vclzq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u16)))
-mve_pred16_t __arm_vcmpcsq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u16)))
-mve_pred16_t __arm_vcmpcsq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u32)))
-mve_pred16_t __arm_vcmpcsq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u32)))
-mve_pred16_t __arm_vcmpcsq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u8)))
-mve_pred16_t __arm_vcmpcsq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u8)))
-mve_pred16_t __arm_vcmpcsq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u16)))
-mve_pred16_t __arm_vcmpcsq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u16)))
-mve_pred16_t __arm_vcmpcsq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u32)))
-mve_pred16_t __arm_vcmpcsq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u32)))
-mve_pred16_t __arm_vcmpcsq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u8)))
-mve_pred16_t __arm_vcmpcsq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u8)))
-mve_pred16_t __arm_vcmpcsq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u16)))
-mve_pred16_t __arm_vcmpcsq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u16)))
-mve_pred16_t __arm_vcmpcsq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u32)))
-mve_pred16_t __arm_vcmpcsq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u32)))
-mve_pred16_t __arm_vcmpcsq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u8)))
-mve_pred16_t __arm_vcmpcsq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u8)))
-mve_pred16_t __arm_vcmpcsq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u16)))
-mve_pred16_t __arm_vcmpcsq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u16)))
-mve_pred16_t __arm_vcmpcsq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u32)))
-mve_pred16_t __arm_vcmpcsq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u32)))
-mve_pred16_t __arm_vcmpcsq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u8)))
-mve_pred16_t __arm_vcmpcsq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u8)))
-mve_pred16_t __arm_vcmpcsq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s16)))
-mve_pred16_t __arm_vcmpeqq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s16)))
-mve_pred16_t __arm_vcmpeqq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s32)))
-mve_pred16_t __arm_vcmpeqq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s32)))
-mve_pred16_t __arm_vcmpeqq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s8)))
-mve_pred16_t __arm_vcmpeqq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s8)))
-mve_pred16_t __arm_vcmpeqq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u16)))
-mve_pred16_t __arm_vcmpeqq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u16)))
-mve_pred16_t __arm_vcmpeqq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u32)))
-mve_pred16_t __arm_vcmpeqq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u32)))
-mve_pred16_t __arm_vcmpeqq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u8)))
-mve_pred16_t __arm_vcmpeqq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u8)))
-mve_pred16_t __arm_vcmpeqq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s16)))
-mve_pred16_t __arm_vcmpeqq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s16)))
-mve_pred16_t __arm_vcmpeqq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s32)))
-mve_pred16_t __arm_vcmpeqq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s32)))
-mve_pred16_t __arm_vcmpeqq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s8)))
-mve_pred16_t __arm_vcmpeqq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s8)))
-mve_pred16_t __arm_vcmpeqq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u16)))
-mve_pred16_t __arm_vcmpeqq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u16)))
-mve_pred16_t __arm_vcmpeqq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u32)))
-mve_pred16_t __arm_vcmpeqq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u32)))
-mve_pred16_t __arm_vcmpeqq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u8)))
-mve_pred16_t __arm_vcmpeqq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u8)))
-mve_pred16_t __arm_vcmpeqq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s16)))
-mve_pred16_t __arm_vcmpeqq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s16)))
-mve_pred16_t __arm_vcmpeqq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s32)))
-mve_pred16_t __arm_vcmpeqq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s32)))
-mve_pred16_t __arm_vcmpeqq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s8)))
-mve_pred16_t __arm_vcmpeqq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s8)))
-mve_pred16_t __arm_vcmpeqq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u16)))
-mve_pred16_t __arm_vcmpeqq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u16)))
-mve_pred16_t __arm_vcmpeqq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u32)))
-mve_pred16_t __arm_vcmpeqq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u32)))
-mve_pred16_t __arm_vcmpeqq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u8)))
-mve_pred16_t __arm_vcmpeqq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u8)))
-mve_pred16_t __arm_vcmpeqq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s16)))
-mve_pred16_t __arm_vcmpeqq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s16)))
-mve_pred16_t __arm_vcmpeqq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s32)))
-mve_pred16_t __arm_vcmpeqq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s32)))
-mve_pred16_t __arm_vcmpeqq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s8)))
-mve_pred16_t __arm_vcmpeqq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s8)))
-mve_pred16_t __arm_vcmpeqq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u16)))
-mve_pred16_t __arm_vcmpeqq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u16)))
-mve_pred16_t __arm_vcmpeqq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u32)))
-mve_pred16_t __arm_vcmpeqq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u32)))
-mve_pred16_t __arm_vcmpeqq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u8)))
-mve_pred16_t __arm_vcmpeqq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u8)))
-mve_pred16_t __arm_vcmpeqq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s16)))
-mve_pred16_t __arm_vcmpgeq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s16)))
-mve_pred16_t __arm_vcmpgeq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s32)))
-mve_pred16_t __arm_vcmpgeq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s32)))
-mve_pred16_t __arm_vcmpgeq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s8)))
-mve_pred16_t __arm_vcmpgeq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s8)))
-mve_pred16_t __arm_vcmpgeq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s16)))
-mve_pred16_t __arm_vcmpgeq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s16)))
-mve_pred16_t __arm_vcmpgeq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s32)))
-mve_pred16_t __arm_vcmpgeq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s32)))
-mve_pred16_t __arm_vcmpgeq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s8)))
-mve_pred16_t __arm_vcmpgeq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s8)))
-mve_pred16_t __arm_vcmpgeq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s16)))
-mve_pred16_t __arm_vcmpgeq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s16)))
-mve_pred16_t __arm_vcmpgeq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s32)))
-mve_pred16_t __arm_vcmpgeq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s32)))
-mve_pred16_t __arm_vcmpgeq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s8)))
-mve_pred16_t __arm_vcmpgeq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s8)))
-mve_pred16_t __arm_vcmpgeq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s16)))
-mve_pred16_t __arm_vcmpgeq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s16)))
-mve_pred16_t __arm_vcmpgeq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s32)))
-mve_pred16_t __arm_vcmpgeq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s32)))
-mve_pred16_t __arm_vcmpgeq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s8)))
-mve_pred16_t __arm_vcmpgeq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s8)))
-mve_pred16_t __arm_vcmpgeq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s16)))
-mve_pred16_t __arm_vcmpgtq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s16)))
-mve_pred16_t __arm_vcmpgtq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s32)))
-mve_pred16_t __arm_vcmpgtq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s32)))
-mve_pred16_t __arm_vcmpgtq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s8)))
-mve_pred16_t __arm_vcmpgtq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s8)))
-mve_pred16_t __arm_vcmpgtq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s16)))
-mve_pred16_t __arm_vcmpgtq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s16)))
-mve_pred16_t __arm_vcmpgtq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s32)))
-mve_pred16_t __arm_vcmpgtq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s32)))
-mve_pred16_t __arm_vcmpgtq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s8)))
-mve_pred16_t __arm_vcmpgtq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s8)))
-mve_pred16_t __arm_vcmpgtq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s16)))
-mve_pred16_t __arm_vcmpgtq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s16)))
-mve_pred16_t __arm_vcmpgtq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s32)))
-mve_pred16_t __arm_vcmpgtq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s32)))
-mve_pred16_t __arm_vcmpgtq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s8)))
-mve_pred16_t __arm_vcmpgtq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s8)))
-mve_pred16_t __arm_vcmpgtq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s16)))
-mve_pred16_t __arm_vcmpgtq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s16)))
-mve_pred16_t __arm_vcmpgtq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s32)))
-mve_pred16_t __arm_vcmpgtq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s32)))
-mve_pred16_t __arm_vcmpgtq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s8)))
-mve_pred16_t __arm_vcmpgtq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s8)))
-mve_pred16_t __arm_vcmpgtq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u16)))
-mve_pred16_t __arm_vcmphiq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u16)))
-mve_pred16_t __arm_vcmphiq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u32)))
-mve_pred16_t __arm_vcmphiq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u32)))
-mve_pred16_t __arm_vcmphiq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u8)))
-mve_pred16_t __arm_vcmphiq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u8)))
-mve_pred16_t __arm_vcmphiq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u16)))
-mve_pred16_t __arm_vcmphiq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u16)))
-mve_pred16_t __arm_vcmphiq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u32)))
-mve_pred16_t __arm_vcmphiq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u32)))
-mve_pred16_t __arm_vcmphiq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u8)))
-mve_pred16_t __arm_vcmphiq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u8)))
-mve_pred16_t __arm_vcmphiq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u16)))
-mve_pred16_t __arm_vcmphiq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u16)))
-mve_pred16_t __arm_vcmphiq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u32)))
-mve_pred16_t __arm_vcmphiq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u32)))
-mve_pred16_t __arm_vcmphiq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u8)))
-mve_pred16_t __arm_vcmphiq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u8)))
-mve_pred16_t __arm_vcmphiq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u16)))
-mve_pred16_t __arm_vcmphiq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u16)))
-mve_pred16_t __arm_vcmphiq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u32)))
-mve_pred16_t __arm_vcmphiq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u32)))
-mve_pred16_t __arm_vcmphiq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u8)))
-mve_pred16_t __arm_vcmphiq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u8)))
-mve_pred16_t __arm_vcmphiq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s16)))
-mve_pred16_t __arm_vcmpleq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s16)))
-mve_pred16_t __arm_vcmpleq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s32)))
-mve_pred16_t __arm_vcmpleq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s32)))
-mve_pred16_t __arm_vcmpleq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s8)))
-mve_pred16_t __arm_vcmpleq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s8)))
-mve_pred16_t __arm_vcmpleq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s16)))
-mve_pred16_t __arm_vcmpleq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s16)))
-mve_pred16_t __arm_vcmpleq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s32)))
-mve_pred16_t __arm_vcmpleq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s32)))
-mve_pred16_t __arm_vcmpleq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s8)))
-mve_pred16_t __arm_vcmpleq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s8)))
-mve_pred16_t __arm_vcmpleq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s16)))
-mve_pred16_t __arm_vcmpleq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s16)))
-mve_pred16_t __arm_vcmpleq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s32)))
-mve_pred16_t __arm_vcmpleq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s32)))
-mve_pred16_t __arm_vcmpleq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s8)))
-mve_pred16_t __arm_vcmpleq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s8)))
-mve_pred16_t __arm_vcmpleq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s16)))
-mve_pred16_t __arm_vcmpleq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s16)))
-mve_pred16_t __arm_vcmpleq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s32)))
-mve_pred16_t __arm_vcmpleq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s32)))
-mve_pred16_t __arm_vcmpleq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s8)))
-mve_pred16_t __arm_vcmpleq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s8)))
-mve_pred16_t __arm_vcmpleq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s16)))
-mve_pred16_t __arm_vcmpltq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s16)))
-mve_pred16_t __arm_vcmpltq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s32)))
-mve_pred16_t __arm_vcmpltq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s32)))
-mve_pred16_t __arm_vcmpltq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s8)))
-mve_pred16_t __arm_vcmpltq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s8)))
-mve_pred16_t __arm_vcmpltq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s16)))
-mve_pred16_t __arm_vcmpltq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s16)))
-mve_pred16_t __arm_vcmpltq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s32)))
-mve_pred16_t __arm_vcmpltq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s32)))
-mve_pred16_t __arm_vcmpltq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s8)))
-mve_pred16_t __arm_vcmpltq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s8)))
-mve_pred16_t __arm_vcmpltq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s16)))
-mve_pred16_t __arm_vcmpltq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s16)))
-mve_pred16_t __arm_vcmpltq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s32)))
-mve_pred16_t __arm_vcmpltq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s32)))
-mve_pred16_t __arm_vcmpltq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s8)))
-mve_pred16_t __arm_vcmpltq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s8)))
-mve_pred16_t __arm_vcmpltq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s16)))
-mve_pred16_t __arm_vcmpltq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s16)))
-mve_pred16_t __arm_vcmpltq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s32)))
-mve_pred16_t __arm_vcmpltq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s32)))
-mve_pred16_t __arm_vcmpltq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s8)))
-mve_pred16_t __arm_vcmpltq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s8)))
-mve_pred16_t __arm_vcmpltq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s16)))
-mve_pred16_t __arm_vcmpneq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s16)))
-mve_pred16_t __arm_vcmpneq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s32)))
-mve_pred16_t __arm_vcmpneq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s32)))
-mve_pred16_t __arm_vcmpneq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s8)))
-mve_pred16_t __arm_vcmpneq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s8)))
-mve_pred16_t __arm_vcmpneq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u16)))
-mve_pred16_t __arm_vcmpneq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u16)))
-mve_pred16_t __arm_vcmpneq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u32)))
-mve_pred16_t __arm_vcmpneq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u32)))
-mve_pred16_t __arm_vcmpneq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u8)))
-mve_pred16_t __arm_vcmpneq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u8)))
-mve_pred16_t __arm_vcmpneq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s16)))
-mve_pred16_t __arm_vcmpneq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s16)))
-mve_pred16_t __arm_vcmpneq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s32)))
-mve_pred16_t __arm_vcmpneq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s32)))
-mve_pred16_t __arm_vcmpneq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s8)))
-mve_pred16_t __arm_vcmpneq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s8)))
-mve_pred16_t __arm_vcmpneq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u16)))
-mve_pred16_t __arm_vcmpneq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u16)))
-mve_pred16_t __arm_vcmpneq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u32)))
-mve_pred16_t __arm_vcmpneq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u32)))
-mve_pred16_t __arm_vcmpneq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u8)))
-mve_pred16_t __arm_vcmpneq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u8)))
-mve_pred16_t __arm_vcmpneq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s16)))
-mve_pred16_t __arm_vcmpneq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s16)))
-mve_pred16_t __arm_vcmpneq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s32)))
-mve_pred16_t __arm_vcmpneq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s32)))
-mve_pred16_t __arm_vcmpneq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s8)))
-mve_pred16_t __arm_vcmpneq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s8)))
-mve_pred16_t __arm_vcmpneq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u16)))
-mve_pred16_t __arm_vcmpneq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u16)))
-mve_pred16_t __arm_vcmpneq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u32)))
-mve_pred16_t __arm_vcmpneq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u32)))
-mve_pred16_t __arm_vcmpneq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u8)))
-mve_pred16_t __arm_vcmpneq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u8)))
-mve_pred16_t __arm_vcmpneq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s16)))
-mve_pred16_t __arm_vcmpneq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s16)))
-mve_pred16_t __arm_vcmpneq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s32)))
-mve_pred16_t __arm_vcmpneq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s32)))
-mve_pred16_t __arm_vcmpneq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s8)))
-mve_pred16_t __arm_vcmpneq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s8)))
-mve_pred16_t __arm_vcmpneq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u16)))
-mve_pred16_t __arm_vcmpneq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u16)))
-mve_pred16_t __arm_vcmpneq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u32)))
-mve_pred16_t __arm_vcmpneq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u32)))
-mve_pred16_t __arm_vcmpneq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u8)))
-mve_pred16_t __arm_vcmpneq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u8)))
-mve_pred16_t __arm_vcmpneq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s16)))
-int16x8_t __arm_vcreateq_s16(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s32)))
-int32x4_t __arm_vcreateq_s32(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s64)))
-int64x2_t __arm_vcreateq_s64(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s8)))
-int8x16_t __arm_vcreateq_s8(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u16)))
-uint16x8_t __arm_vcreateq_u16(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u32)))
-uint32x4_t __arm_vcreateq_u32(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u64)))
-uint64x2_t __arm_vcreateq_u64(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u8)))
-uint8x16_t __arm_vcreateq_u8(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp16q)))
-mve_pred16_t __arm_vctp16q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp16q_m)))
-mve_pred16_t __arm_vctp16q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp32q)))
-mve_pred16_t __arm_vctp32q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp32q_m)))
-mve_pred16_t __arm_vctp32q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp64q)))
-mve_pred16_t __arm_vctp64q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp64q_m)))
-mve_pred16_t __arm_vctp64q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp8q)))
-mve_pred16_t __arm_vctp8q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp8q_m)))
-mve_pred16_t __arm_vctp8q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u16)))
-uint16x8_t __arm_vddupq_m_n_u16(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u16)))
-uint16x8_t __arm_vddupq_m(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u32)))
-uint32x4_t __arm_vddupq_m_n_u32(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u32)))
-uint32x4_t __arm_vddupq_m(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u8)))
-uint8x16_t __arm_vddupq_m_n_u8(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u8)))
-uint8x16_t __arm_vddupq_m(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u16)))
-uint16x8_t __arm_vddupq_m_wb_u16(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u16)))
-uint16x8_t __arm_vddupq_m(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u32)))
-uint32x4_t __arm_vddupq_m_wb_u32(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u32)))
-uint32x4_t __arm_vddupq_m(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u8)))
-uint8x16_t __arm_vddupq_m_wb_u8(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u8)))
-uint8x16_t __arm_vddupq_m(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u16)))
-uint16x8_t __arm_vddupq_n_u16(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u16)))
-uint16x8_t __arm_vddupq_u16(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u32)))
-uint32x4_t __arm_vddupq_n_u32(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u32)))
-uint32x4_t __arm_vddupq_u32(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u8)))
-uint8x16_t __arm_vddupq_n_u8(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u8)))
-uint8x16_t __arm_vddupq_u8(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u16)))
-uint16x8_t __arm_vddupq_wb_u16(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u16)))
-uint16x8_t __arm_vddupq_u16(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u32)))
-uint32x4_t __arm_vddupq_wb_u32(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u32)))
-uint32x4_t __arm_vddupq_u32(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u8)))
-uint8x16_t __arm_vddupq_wb_u8(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u8)))
-uint8x16_t __arm_vddupq_u8(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u16)))
-uint16x8_t __arm_vddupq_x_n_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u16)))
-uint16x8_t __arm_vddupq_x_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u32)))
-uint32x4_t __arm_vddupq_x_n_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u32)))
-uint32x4_t __arm_vddupq_x_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u8)))
-uint8x16_t __arm_vddupq_x_n_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u8)))
-uint8x16_t __arm_vddupq_x_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u16)))
-uint16x8_t __arm_vddupq_x_wb_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u16)))
-uint16x8_t __arm_vddupq_x_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u32)))
-uint32x4_t __arm_vddupq_x_wb_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u32)))
-uint32x4_t __arm_vddupq_x_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u8)))
-uint8x16_t __arm_vddupq_x_wb_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u8)))
-uint8x16_t __arm_vddupq_x_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s16)))
-int16x8_t __arm_vdupq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s16)))
-int16x8_t __arm_vdupq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s32)))
-int32x4_t __arm_vdupq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s32)))
-int32x4_t __arm_vdupq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s8)))
-int8x16_t __arm_vdupq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s8)))
-int8x16_t __arm_vdupq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u16)))
-uint16x8_t __arm_vdupq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u16)))
-uint16x8_t __arm_vdupq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u32)))
-uint32x4_t __arm_vdupq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u32)))
-uint32x4_t __arm_vdupq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u8)))
-uint8x16_t __arm_vdupq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u8)))
-uint8x16_t __arm_vdupq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_s16)))
-int16x8_t __arm_vdupq_n_s16(int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_s32)))
-int32x4_t __arm_vdupq_n_s32(int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_s8)))
-int8x16_t __arm_vdupq_n_s8(int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_u16)))
-uint16x8_t __arm_vdupq_n_u16(uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_u32)))
-uint32x4_t __arm_vdupq_n_u32(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_u8)))
-uint8x16_t __arm_vdupq_n_u8(uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_s16)))
-int16x8_t __arm_vdupq_x_n_s16(int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_s32)))
-int32x4_t __arm_vdupq_x_n_s32(int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_s8)))
-int8x16_t __arm_vdupq_x_n_s8(int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_u16)))
-uint16x8_t __arm_vdupq_x_n_u16(uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_u32)))
-uint32x4_t __arm_vdupq_x_n_u32(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_u8)))
-uint8x16_t __arm_vdupq_x_n_u8(uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u16)))
-uint16x8_t __arm_vdwdupq_m_n_u16(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u16)))
-uint16x8_t __arm_vdwdupq_m(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u32)))
-uint32x4_t __arm_vdwdupq_m_n_u32(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u32)))
-uint32x4_t __arm_vdwdupq_m(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u8)))
-uint8x16_t __arm_vdwdupq_m_n_u8(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u8)))
-uint8x16_t __arm_vdwdupq_m(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u16)))
-uint16x8_t __arm_vdwdupq_m_wb_u16(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u16)))
-uint16x8_t __arm_vdwdupq_m(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u32)))
-uint32x4_t __arm_vdwdupq_m_wb_u32(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u32)))
-uint32x4_t __arm_vdwdupq_m(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u8)))
-uint8x16_t __arm_vdwdupq_m_wb_u8(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u8)))
-uint8x16_t __arm_vdwdupq_m(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u16)))
-uint16x8_t __arm_vdwdupq_n_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u16)))
-uint16x8_t __arm_vdwdupq_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u32)))
-uint32x4_t __arm_vdwdupq_n_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u32)))
-uint32x4_t __arm_vdwdupq_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u8)))
-uint8x16_t __arm_vdwdupq_n_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u8)))
-uint8x16_t __arm_vdwdupq_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u16)))
-uint16x8_t __arm_vdwdupq_wb_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u16)))
-uint16x8_t __arm_vdwdupq_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u32)))
-uint32x4_t __arm_vdwdupq_wb_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u32)))
-uint32x4_t __arm_vdwdupq_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u8)))
-uint8x16_t __arm_vdwdupq_wb_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u8)))
-uint8x16_t __arm_vdwdupq_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u16)))
-uint16x8_t __arm_vdwdupq_x_n_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u16)))
-uint16x8_t __arm_vdwdupq_x_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u32)))
-uint32x4_t __arm_vdwdupq_x_n_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u32)))
-uint32x4_t __arm_vdwdupq_x_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u8)))
-uint8x16_t __arm_vdwdupq_x_n_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u8)))
-uint8x16_t __arm_vdwdupq_x_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u16)))
-uint16x8_t __arm_vdwdupq_x_wb_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u16)))
-uint16x8_t __arm_vdwdupq_x_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u32)))
-uint32x4_t __arm_vdwdupq_x_wb_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u32)))
-uint32x4_t __arm_vdwdupq_x_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u8)))
-uint8x16_t __arm_vdwdupq_x_wb_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u8)))
-uint8x16_t __arm_vdwdupq_x_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s16)))
-int16x8_t __arm_veorq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s16)))
-int16x8_t __arm_veorq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s32)))
-int32x4_t __arm_veorq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s32)))
-int32x4_t __arm_veorq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s8)))
-int8x16_t __arm_veorq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s8)))
-int8x16_t __arm_veorq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u16)))
-uint16x8_t __arm_veorq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u16)))
-uint16x8_t __arm_veorq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u32)))
-uint32x4_t __arm_veorq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u32)))
-uint32x4_t __arm_veorq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u8)))
-uint8x16_t __arm_veorq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u8)))
-uint8x16_t __arm_veorq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_s16)))
-int16x8_t __arm_veorq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_s16)))
-int16x8_t __arm_veorq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_s32)))
-int32x4_t __arm_veorq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_s32)))
-int32x4_t __arm_veorq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_s8)))
-int8x16_t __arm_veorq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_s8)))
-int8x16_t __arm_veorq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_u16)))
-uint16x8_t __arm_veorq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_u16)))
-uint16x8_t __arm_veorq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_u32)))
-uint32x4_t __arm_veorq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_u32)))
-uint32x4_t __arm_veorq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_u8)))
-uint8x16_t __arm_veorq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_u8)))
-uint8x16_t __arm_veorq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s16)))
-int16x8_t __arm_veorq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s16)))
-int16x8_t __arm_veorq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s32)))
-int32x4_t __arm_veorq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s32)))
-int32x4_t __arm_veorq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s8)))
-int8x16_t __arm_veorq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s8)))
-int8x16_t __arm_veorq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u16)))
-uint16x8_t __arm_veorq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u16)))
-uint16x8_t __arm_veorq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u32)))
-uint32x4_t __arm_veorq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u32)))
-uint32x4_t __arm_veorq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u8)))
-uint8x16_t __arm_veorq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u8)))
-uint8x16_t __arm_veorq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s16)))
-int16_t __arm_vgetq_lane_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s16)))
-int16_t __arm_vgetq_lane(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s32)))
-int32_t __arm_vgetq_lane_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s32)))
-int32_t __arm_vgetq_lane(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s64)))
-int64_t __arm_vgetq_lane_s64(int64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s64)))
-int64_t __arm_vgetq_lane(int64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s8)))
-int8_t __arm_vgetq_lane_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s8)))
-int8_t __arm_vgetq_lane(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u16)))
-uint16_t __arm_vgetq_lane_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u16)))
-uint16_t __arm_vgetq_lane(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u32)))
-uint32_t __arm_vgetq_lane_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u32)))
-uint32_t __arm_vgetq_lane(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u64)))
-uint64_t __arm_vgetq_lane_u64(uint64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u64)))
-uint64_t __arm_vgetq_lane(uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u8)))
-uint8_t __arm_vgetq_lane_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u8)))
-uint8_t __arm_vgetq_lane(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s16)))
-int16x8_t __arm_vhaddq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s16)))
-int16x8_t __arm_vhaddq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s32)))
-int32x4_t __arm_vhaddq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s32)))
-int32x4_t __arm_vhaddq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s8)))
-int8x16_t __arm_vhaddq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s8)))
-int8x16_t __arm_vhaddq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u16)))
-uint16x8_t __arm_vhaddq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u16)))
-uint16x8_t __arm_vhaddq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u32)))
-uint32x4_t __arm_vhaddq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u32)))
-uint32x4_t __arm_vhaddq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u8)))
-uint8x16_t __arm_vhaddq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u8)))
-uint8x16_t __arm_vhaddq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s16)))
-int16x8_t __arm_vhaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s16)))
-int16x8_t __arm_vhaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s32)))
-int32x4_t __arm_vhaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s32)))
-int32x4_t __arm_vhaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s8)))
-int8x16_t __arm_vhaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s8)))
-int8x16_t __arm_vhaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u16)))
-uint16x8_t __arm_vhaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u16)))
-uint16x8_t __arm_vhaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u32)))
-uint32x4_t __arm_vhaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u32)))
-uint32x4_t __arm_vhaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u8)))
-uint8x16_t __arm_vhaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u8)))
-uint8x16_t __arm_vhaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s16)))
-int16x8_t __arm_vhaddq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s16)))
-int16x8_t __arm_vhaddq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s32)))
-int32x4_t __arm_vhaddq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s32)))
-int32x4_t __arm_vhaddq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s8)))
-int8x16_t __arm_vhaddq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s8)))
-int8x16_t __arm_vhaddq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u16)))
-uint16x8_t __arm_vhaddq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u16)))
-uint16x8_t __arm_vhaddq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u32)))
-uint32x4_t __arm_vhaddq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u32)))
-uint32x4_t __arm_vhaddq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u8)))
-uint8x16_t __arm_vhaddq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u8)))
-uint8x16_t __arm_vhaddq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s16)))
-int16x8_t __arm_vhaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s16)))
-int16x8_t __arm_vhaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s32)))
-int32x4_t __arm_vhaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s32)))
-int32x4_t __arm_vhaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s8)))
-int8x16_t __arm_vhaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s8)))
-int8x16_t __arm_vhaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u16)))
-uint16x8_t __arm_vhaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u16)))
-uint16x8_t __arm_vhaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u32)))
-uint32x4_t __arm_vhaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u32)))
-uint32x4_t __arm_vhaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u8)))
-uint8x16_t __arm_vhaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u8)))
-uint8x16_t __arm_vhaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s16)))
-int16x8_t __arm_vhaddq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s16)))
-int16x8_t __arm_vhaddq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s32)))
-int32x4_t __arm_vhaddq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s32)))
-int32x4_t __arm_vhaddq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s8)))
-int8x16_t __arm_vhaddq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s8)))
-int8x16_t __arm_vhaddq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u16)))
-uint16x8_t __arm_vhaddq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u16)))
-uint16x8_t __arm_vhaddq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u32)))
-uint32x4_t __arm_vhaddq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u32)))
-uint32x4_t __arm_vhaddq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u8)))
-uint8x16_t __arm_vhaddq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u8)))
-uint8x16_t __arm_vhaddq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s16)))
-int16x8_t __arm_vhaddq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s16)))
-int16x8_t __arm_vhaddq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s32)))
-int32x4_t __arm_vhaddq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s32)))
-int32x4_t __arm_vhaddq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s8)))
-int8x16_t __arm_vhaddq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s8)))
-int8x16_t __arm_vhaddq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u16)))
-uint16x8_t __arm_vhaddq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u16)))
-uint16x8_t __arm_vhaddq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u32)))
-uint32x4_t __arm_vhaddq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u32)))
-uint32x4_t __arm_vhaddq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u8)))
-uint8x16_t __arm_vhaddq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u8)))
-uint8x16_t __arm_vhaddq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s16)))
-int16x8_t __arm_vhcaddq_rot270_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s16)))
-int16x8_t __arm_vhcaddq_rot270_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s32)))
-int32x4_t __arm_vhcaddq_rot270_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s32)))
-int32x4_t __arm_vhcaddq_rot270_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s8)))
-int8x16_t __arm_vhcaddq_rot270_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s8)))
-int8x16_t __arm_vhcaddq_rot270_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s16)))
-int16x8_t __arm_vhcaddq_rot270_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s16)))
-int16x8_t __arm_vhcaddq_rot270(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s32)))
-int32x4_t __arm_vhcaddq_rot270_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s32)))
-int32x4_t __arm_vhcaddq_rot270(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s8)))
-int8x16_t __arm_vhcaddq_rot270_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s8)))
-int8x16_t __arm_vhcaddq_rot270(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s16)))
-int16x8_t __arm_vhcaddq_rot270_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s16)))
-int16x8_t __arm_vhcaddq_rot270_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s32)))
-int32x4_t __arm_vhcaddq_rot270_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s32)))
-int32x4_t __arm_vhcaddq_rot270_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s8)))
-int8x16_t __arm_vhcaddq_rot270_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s8)))
-int8x16_t __arm_vhcaddq_rot270_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s16)))
-int16x8_t __arm_vhcaddq_rot90_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s16)))
-int16x8_t __arm_vhcaddq_rot90_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s32)))
-int32x4_t __arm_vhcaddq_rot90_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s32)))
-int32x4_t __arm_vhcaddq_rot90_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s8)))
-int8x16_t __arm_vhcaddq_rot90_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s8)))
-int8x16_t __arm_vhcaddq_rot90_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s16)))
-int16x8_t __arm_vhcaddq_rot90_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s16)))
-int16x8_t __arm_vhcaddq_rot90(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s32)))
-int32x4_t __arm_vhcaddq_rot90_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s32)))
-int32x4_t __arm_vhcaddq_rot90(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s8)))
-int8x16_t __arm_vhcaddq_rot90_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s8)))
-int8x16_t __arm_vhcaddq_rot90(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s16)))
-int16x8_t __arm_vhcaddq_rot90_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s16)))
-int16x8_t __arm_vhcaddq_rot90_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s32)))
-int32x4_t __arm_vhcaddq_rot90_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s32)))
-int32x4_t __arm_vhcaddq_rot90_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s8)))
-int8x16_t __arm_vhcaddq_rot90_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s8)))
-int8x16_t __arm_vhcaddq_rot90_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s16)))
-int16x8_t __arm_vhsubq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s16)))
-int16x8_t __arm_vhsubq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s32)))
-int32x4_t __arm_vhsubq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s32)))
-int32x4_t __arm_vhsubq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s8)))
-int8x16_t __arm_vhsubq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s8)))
-int8x16_t __arm_vhsubq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u16)))
-uint16x8_t __arm_vhsubq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u16)))
-uint16x8_t __arm_vhsubq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u32)))
-uint32x4_t __arm_vhsubq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u32)))
-uint32x4_t __arm_vhsubq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u8)))
-uint8x16_t __arm_vhsubq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u8)))
-uint8x16_t __arm_vhsubq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s16)))
-int16x8_t __arm_vhsubq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s16)))
-int16x8_t __arm_vhsubq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s32)))
-int32x4_t __arm_vhsubq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s32)))
-int32x4_t __arm_vhsubq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s8)))
-int8x16_t __arm_vhsubq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s8)))
-int8x16_t __arm_vhsubq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u16)))
-uint16x8_t __arm_vhsubq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u16)))
-uint16x8_t __arm_vhsubq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u32)))
-uint32x4_t __arm_vhsubq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u32)))
-uint32x4_t __arm_vhsubq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u8)))
-uint8x16_t __arm_vhsubq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u8)))
-uint8x16_t __arm_vhsubq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s16)))
-int16x8_t __arm_vhsubq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s16)))
-int16x8_t __arm_vhsubq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s32)))
-int32x4_t __arm_vhsubq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s32)))
-int32x4_t __arm_vhsubq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s8)))
-int8x16_t __arm_vhsubq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s8)))
-int8x16_t __arm_vhsubq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u16)))
-uint16x8_t __arm_vhsubq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u16)))
-uint16x8_t __arm_vhsubq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u32)))
-uint32x4_t __arm_vhsubq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u32)))
-uint32x4_t __arm_vhsubq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u8)))
-uint8x16_t __arm_vhsubq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u8)))
-uint8x16_t __arm_vhsubq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s16)))
-int16x8_t __arm_vhsubq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s16)))
-int16x8_t __arm_vhsubq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s32)))
-int32x4_t __arm_vhsubq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s32)))
-int32x4_t __arm_vhsubq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s8)))
-int8x16_t __arm_vhsubq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s8)))
-int8x16_t __arm_vhsubq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u16)))
-uint16x8_t __arm_vhsubq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u16)))
-uint16x8_t __arm_vhsubq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u32)))
-uint32x4_t __arm_vhsubq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u32)))
-uint32x4_t __arm_vhsubq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u8)))
-uint8x16_t __arm_vhsubq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u8)))
-uint8x16_t __arm_vhsubq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s16)))
-int16x8_t __arm_vhsubq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s16)))
-int16x8_t __arm_vhsubq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s32)))
-int32x4_t __arm_vhsubq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s32)))
-int32x4_t __arm_vhsubq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s8)))
-int8x16_t __arm_vhsubq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s8)))
-int8x16_t __arm_vhsubq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u16)))
-uint16x8_t __arm_vhsubq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u16)))
-uint16x8_t __arm_vhsubq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u32)))
-uint32x4_t __arm_vhsubq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u32)))
-uint32x4_t __arm_vhsubq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u8)))
-uint8x16_t __arm_vhsubq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u8)))
-uint8x16_t __arm_vhsubq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s16)))
-int16x8_t __arm_vhsubq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s16)))
-int16x8_t __arm_vhsubq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s32)))
-int32x4_t __arm_vhsubq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s32)))
-int32x4_t __arm_vhsubq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s8)))
-int8x16_t __arm_vhsubq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s8)))
-int8x16_t __arm_vhsubq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u16)))
-uint16x8_t __arm_vhsubq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u16)))
-uint16x8_t __arm_vhsubq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u32)))
-uint32x4_t __arm_vhsubq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u32)))
-uint32x4_t __arm_vhsubq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u8)))
-uint8x16_t __arm_vhsubq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u8)))
-uint8x16_t __arm_vhsubq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u16)))
-uint16x8_t __arm_vidupq_m_n_u16(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u16)))
-uint16x8_t __arm_vidupq_m(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u32)))
-uint32x4_t __arm_vidupq_m_n_u32(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u32)))
-uint32x4_t __arm_vidupq_m(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u8)))
-uint8x16_t __arm_vidupq_m_n_u8(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u8)))
-uint8x16_t __arm_vidupq_m(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u16)))
-uint16x8_t __arm_vidupq_m_wb_u16(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u16)))
-uint16x8_t __arm_vidupq_m(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u32)))
-uint32x4_t __arm_vidupq_m_wb_u32(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u32)))
-uint32x4_t __arm_vidupq_m(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u8)))
-uint8x16_t __arm_vidupq_m_wb_u8(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u8)))
-uint8x16_t __arm_vidupq_m(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u16)))
-uint16x8_t __arm_vidupq_n_u16(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u16)))
-uint16x8_t __arm_vidupq_u16(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u32)))
-uint32x4_t __arm_vidupq_n_u32(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u32)))
-uint32x4_t __arm_vidupq_u32(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u8)))
-uint8x16_t __arm_vidupq_n_u8(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u8)))
-uint8x16_t __arm_vidupq_u8(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u16)))
-uint16x8_t __arm_vidupq_wb_u16(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u16)))
-uint16x8_t __arm_vidupq_u16(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u32)))
-uint32x4_t __arm_vidupq_wb_u32(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u32)))
-uint32x4_t __arm_vidupq_u32(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u8)))
-uint8x16_t __arm_vidupq_wb_u8(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u8)))
-uint8x16_t __arm_vidupq_u8(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u16)))
-uint16x8_t __arm_vidupq_x_n_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u16)))
-uint16x8_t __arm_vidupq_x_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u32)))
-uint32x4_t __arm_vidupq_x_n_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u32)))
-uint32x4_t __arm_vidupq_x_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u8)))
-uint8x16_t __arm_vidupq_x_n_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u8)))
-uint8x16_t __arm_vidupq_x_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u16)))
-uint16x8_t __arm_vidupq_x_wb_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u16)))
-uint16x8_t __arm_vidupq_x_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u32)))
-uint32x4_t __arm_vidupq_x_wb_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u32)))
-uint32x4_t __arm_vidupq_x_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u8)))
-uint8x16_t __arm_vidupq_x_wb_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u8)))
-uint8x16_t __arm_vidupq_x_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u16)))
-uint16x8_t __arm_viwdupq_m_n_u16(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u16)))
-uint16x8_t __arm_viwdupq_m(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u32)))
-uint32x4_t __arm_viwdupq_m_n_u32(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u32)))
-uint32x4_t __arm_viwdupq_m(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u8)))
-uint8x16_t __arm_viwdupq_m_n_u8(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u8)))
-uint8x16_t __arm_viwdupq_m(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u16)))
-uint16x8_t __arm_viwdupq_m_wb_u16(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u16)))
-uint16x8_t __arm_viwdupq_m(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u32)))
-uint32x4_t __arm_viwdupq_m_wb_u32(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u32)))
-uint32x4_t __arm_viwdupq_m(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u8)))
-uint8x16_t __arm_viwdupq_m_wb_u8(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u8)))
-uint8x16_t __arm_viwdupq_m(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u16)))
-uint16x8_t __arm_viwdupq_n_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u16)))
-uint16x8_t __arm_viwdupq_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u32)))
-uint32x4_t __arm_viwdupq_n_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u32)))
-uint32x4_t __arm_viwdupq_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u8)))
-uint8x16_t __arm_viwdupq_n_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u8)))
-uint8x16_t __arm_viwdupq_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u16)))
-uint16x8_t __arm_viwdupq_wb_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u16)))
-uint16x8_t __arm_viwdupq_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u32)))
-uint32x4_t __arm_viwdupq_wb_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u32)))
-uint32x4_t __arm_viwdupq_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u8)))
-uint8x16_t __arm_viwdupq_wb_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u8)))
-uint8x16_t __arm_viwdupq_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u16)))
-uint16x8_t __arm_viwdupq_x_n_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u16)))
-uint16x8_t __arm_viwdupq_x_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u32)))
-uint32x4_t __arm_viwdupq_x_n_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u32)))
-uint32x4_t __arm_viwdupq_x_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u8)))
-uint8x16_t __arm_viwdupq_x_n_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u8)))
-uint8x16_t __arm_viwdupq_x_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u16)))
-uint16x8_t __arm_viwdupq_x_wb_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u16)))
-uint16x8_t __arm_viwdupq_x_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u32)))
-uint32x4_t __arm_viwdupq_x_wb_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u32)))
-uint32x4_t __arm_viwdupq_x_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u8)))
-uint8x16_t __arm_viwdupq_x_wb_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u8)))
-uint8x16_t __arm_viwdupq_x_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s16)))
-int16x8_t __arm_vld1q_s16(const int16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s16)))
-int16x8_t __arm_vld1q(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s32)))
-int32x4_t __arm_vld1q_s32(const int32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s32)))
-int32x4_t __arm_vld1q(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s8)))
-int8x16_t __arm_vld1q_s8(const int8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s8)))
-int8x16_t __arm_vld1q(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u16)))
-uint16x8_t __arm_vld1q_u16(const uint16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u16)))
-uint16x8_t __arm_vld1q(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u32)))
-uint32x4_t __arm_vld1q_u32(const uint32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u32)))
-uint32x4_t __arm_vld1q(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u8)))
-uint8x16_t __arm_vld1q_u8(const uint8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u8)))
-uint8x16_t __arm_vld1q(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s16)))
-int16x8_t __arm_vld1q_z_s16(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s16)))
-int16x8_t __arm_vld1q_z(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s32)))
-int32x4_t __arm_vld1q_z_s32(const int32_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s32)))
-int32x4_t __arm_vld1q_z(const int32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s8)))
-int8x16_t __arm_vld1q_z_s8(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s8)))
-int8x16_t __arm_vld1q_z(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u16)))
-uint16x8_t __arm_vld1q_z_u16(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u16)))
-uint16x8_t __arm_vld1q_z(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u32)))
-uint32x4_t __arm_vld1q_z_u32(const uint32_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u32)))
-uint32x4_t __arm_vld1q_z(const uint32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u8)))
-uint8x16_t __arm_vld1q_z_u8(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u8)))
-uint8x16_t __arm_vld1q_z(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s16)))
-int16x8x2_t __arm_vld2q_s16(const int16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s16)))
-int16x8x2_t __arm_vld2q(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s32)))
-int32x4x2_t __arm_vld2q_s32(const int32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s32)))
-int32x4x2_t __arm_vld2q(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s8)))
-int8x16x2_t __arm_vld2q_s8(const int8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s8)))
-int8x16x2_t __arm_vld2q(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u16)))
-uint16x8x2_t __arm_vld2q_u16(const uint16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u16)))
-uint16x8x2_t __arm_vld2q(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u32)))
-uint32x4x2_t __arm_vld2q_u32(const uint32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u32)))
-uint32x4x2_t __arm_vld2q(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u8)))
-uint8x16x2_t __arm_vld2q_u8(const uint8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u8)))
-uint8x16x2_t __arm_vld2q(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s16)))
-int16x8x4_t __arm_vld4q_s16(const int16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s16)))
-int16x8x4_t __arm_vld4q(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s32)))
-int32x4x4_t __arm_vld4q_s32(const int32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s32)))
-int32x4x4_t __arm_vld4q(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s8)))
-int8x16x4_t __arm_vld4q_s8(const int8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s8)))
-int8x16x4_t __arm_vld4q(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u16)))
-uint16x8x4_t __arm_vld4q_u16(const uint16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u16)))
-uint16x8x4_t __arm_vld4q(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u32)))
-uint32x4x4_t __arm_vld4q_u32(const uint32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u32)))
-uint32x4x4_t __arm_vld4q(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u8)))
-uint8x16x4_t __arm_vld4q_u8(const uint8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u8)))
-uint8x16x4_t __arm_vld4q(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s16)))
-int16x8_t __arm_vldrbq_gather_offset_s16(const int8_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s16)))
-int16x8_t __arm_vldrbq_gather_offset(const int8_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s32)))
-int32x4_t __arm_vldrbq_gather_offset_s32(const int8_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s32)))
-int32x4_t __arm_vldrbq_gather_offset(const int8_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s8)))
-int8x16_t __arm_vldrbq_gather_offset_s8(const int8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s8)))
-int8x16_t __arm_vldrbq_gather_offset(const int8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u16)))
-uint16x8_t __arm_vldrbq_gather_offset_u16(const uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u16)))
-uint16x8_t __arm_vldrbq_gather_offset(const uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u32)))
-uint32x4_t __arm_vldrbq_gather_offset_u32(const uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u32)))
-uint32x4_t __arm_vldrbq_gather_offset(const uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u8)))
-uint8x16_t __arm_vldrbq_gather_offset_u8(const uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u8)))
-uint8x16_t __arm_vldrbq_gather_offset(const uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s16)))
-int16x8_t __arm_vldrbq_gather_offset_z_s16(const int8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s16)))
-int16x8_t __arm_vldrbq_gather_offset_z(const int8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s32)))
-int32x4_t __arm_vldrbq_gather_offset_z_s32(const int8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s32)))
-int32x4_t __arm_vldrbq_gather_offset_z(const int8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s8)))
-int8x16_t __arm_vldrbq_gather_offset_z_s8(const int8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s8)))
-int8x16_t __arm_vldrbq_gather_offset_z(const int8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u16)))
-uint16x8_t __arm_vldrbq_gather_offset_z_u16(const uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u16)))
-uint16x8_t __arm_vldrbq_gather_offset_z(const uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u32)))
-uint32x4_t __arm_vldrbq_gather_offset_z_u32(const uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u32)))
-uint32x4_t __arm_vldrbq_gather_offset_z(const uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u8)))
-uint8x16_t __arm_vldrbq_gather_offset_z_u8(const uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u8)))
-uint8x16_t __arm_vldrbq_gather_offset_z(const uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_s16)))
-int16x8_t __arm_vldrbq_s16(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_s32)))
-int32x4_t __arm_vldrbq_s32(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_s8)))
-int8x16_t __arm_vldrbq_s8(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_u16)))
-uint16x8_t __arm_vldrbq_u16(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_u32)))
-uint32x4_t __arm_vldrbq_u32(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_u8)))
-uint8x16_t __arm_vldrbq_u8(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_s16)))
-int16x8_t __arm_vldrbq_z_s16(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_s32)))
-int32x4_t __arm_vldrbq_z_s32(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_s8)))
-int8x16_t __arm_vldrbq_z_s8(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_u16)))
-uint16x8_t __arm_vldrbq_z_u16(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_u32)))
-uint32x4_t __arm_vldrbq_z_u32(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_u8)))
-uint8x16_t __arm_vldrbq_z_u8(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_s64)))
-int64x2_t __arm_vldrdq_gather_base_s64(uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_u64)))
-uint64x2_t __arm_vldrdq_gather_base_u64(uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_s64)))
-int64x2_t __arm_vldrdq_gather_base_wb_s64(uint64x2_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_u64)))
-uint64x2_t __arm_vldrdq_gather_base_wb_u64(uint64x2_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_z_s64)))
-int64x2_t __arm_vldrdq_gather_base_wb_z_s64(uint64x2_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_z_u64)))
-uint64x2_t __arm_vldrdq_gather_base_wb_z_u64(uint64x2_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_z_s64)))
-int64x2_t __arm_vldrdq_gather_base_z_s64(uint64x2_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_z_u64)))
-uint64x2_t __arm_vldrdq_gather_base_z_u64(uint64x2_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_s64)))
-int64x2_t __arm_vldrdq_gather_offset_s64(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_s64)))
-int64x2_t __arm_vldrdq_gather_offset(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_u64)))
-uint64x2_t __arm_vldrdq_gather_offset_u64(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_u64)))
-uint64x2_t __arm_vldrdq_gather_offset(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_s64)))
-int64x2_t __arm_vldrdq_gather_offset_z_s64(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_s64)))
-int64x2_t __arm_vldrdq_gather_offset_z(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_u64)))
-uint64x2_t __arm_vldrdq_gather_offset_z_u64(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_u64)))
-uint64x2_t __arm_vldrdq_gather_offset_z(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_s64)))
-int64x2_t __arm_vldrdq_gather_shifted_offset_s64(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_s64)))
-int64x2_t __arm_vldrdq_gather_shifted_offset(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_u64)))
-uint64x2_t __arm_vldrdq_gather_shifted_offset_u64(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_u64)))
-uint64x2_t __arm_vldrdq_gather_shifted_offset(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_s64)))
-int64x2_t __arm_vldrdq_gather_shifted_offset_z_s64(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_s64)))
-int64x2_t __arm_vldrdq_gather_shifted_offset_z(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_u64)))
-uint64x2_t __arm_vldrdq_gather_shifted_offset_z_u64(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_u64)))
-uint64x2_t __arm_vldrdq_gather_shifted_offset_z(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s16)))
-int16x8_t __arm_vldrhq_gather_offset_s16(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s16)))
-int16x8_t __arm_vldrhq_gather_offset(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s32)))
-int32x4_t __arm_vldrhq_gather_offset_s32(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s32)))
-int32x4_t __arm_vldrhq_gather_offset(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u16)))
-uint16x8_t __arm_vldrhq_gather_offset_u16(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u16)))
-uint16x8_t __arm_vldrhq_gather_offset(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u32)))
-uint32x4_t __arm_vldrhq_gather_offset_u32(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u32)))
-uint32x4_t __arm_vldrhq_gather_offset(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s16)))
-int16x8_t __arm_vldrhq_gather_offset_z_s16(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s16)))
-int16x8_t __arm_vldrhq_gather_offset_z(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s32)))
-int32x4_t __arm_vldrhq_gather_offset_z_s32(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s32)))
-int32x4_t __arm_vldrhq_gather_offset_z(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u16)))
-uint16x8_t __arm_vldrhq_gather_offset_z_u16(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u16)))
-uint16x8_t __arm_vldrhq_gather_offset_z(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u32)))
-uint32x4_t __arm_vldrhq_gather_offset_z_u32(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u32)))
-uint32x4_t __arm_vldrhq_gather_offset_z(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s16)))
-int16x8_t __arm_vldrhq_gather_shifted_offset_s16(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s16)))
-int16x8_t __arm_vldrhq_gather_shifted_offset(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s32)))
-int32x4_t __arm_vldrhq_gather_shifted_offset_s32(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s32)))
-int32x4_t __arm_vldrhq_gather_shifted_offset(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u16)))
-uint16x8_t __arm_vldrhq_gather_shifted_offset_u16(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u16)))
-uint16x8_t __arm_vldrhq_gather_shifted_offset(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u32)))
-uint32x4_t __arm_vldrhq_gather_shifted_offset_u32(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u32)))
-uint32x4_t __arm_vldrhq_gather_shifted_offset(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s16)))
-int16x8_t __arm_vldrhq_gather_shifted_offset_z_s16(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s16)))
-int16x8_t __arm_vldrhq_gather_shifted_offset_z(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s32)))
-int32x4_t __arm_vldrhq_gather_shifted_offset_z_s32(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s32)))
-int32x4_t __arm_vldrhq_gather_shifted_offset_z(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u16)))
-uint16x8_t __arm_vldrhq_gather_shifted_offset_z_u16(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u16)))
-uint16x8_t __arm_vldrhq_gather_shifted_offset_z(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u32)))
-uint32x4_t __arm_vldrhq_gather_shifted_offset_z_u32(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u32)))
-uint32x4_t __arm_vldrhq_gather_shifted_offset_z(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_s16)))
-int16x8_t __arm_vldrhq_s16(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_s32)))
-int32x4_t __arm_vldrhq_s32(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_u16)))
-uint16x8_t __arm_vldrhq_u16(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_u32)))
-uint32x4_t __arm_vldrhq_u32(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_s16)))
-int16x8_t __arm_vldrhq_z_s16(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_s32)))
-int32x4_t __arm_vldrhq_z_s32(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_u16)))
-uint16x8_t __arm_vldrhq_z_u16(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_u32)))
-uint32x4_t __arm_vldrhq_z_u32(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_s32)))
-int32x4_t __arm_vldrwq_gather_base_s32(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_u32)))
-uint32x4_t __arm_vldrwq_gather_base_u32(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_s32)))
-int32x4_t __arm_vldrwq_gather_base_wb_s32(uint32x4_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_u32)))
-uint32x4_t __arm_vldrwq_gather_base_wb_u32(uint32x4_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_z_s32)))
-int32x4_t __arm_vldrwq_gather_base_wb_z_s32(uint32x4_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_z_u32)))
-uint32x4_t __arm_vldrwq_gather_base_wb_z_u32(uint32x4_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_z_s32)))
-int32x4_t __arm_vldrwq_gather_base_z_s32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_z_u32)))
-uint32x4_t __arm_vldrwq_gather_base_z_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_s32)))
-int32x4_t __arm_vldrwq_gather_offset_s32(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_s32)))
-int32x4_t __arm_vldrwq_gather_offset(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_u32)))
-uint32x4_t __arm_vldrwq_gather_offset_u32(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_u32)))
-uint32x4_t __arm_vldrwq_gather_offset(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_s32)))
-int32x4_t __arm_vldrwq_gather_offset_z_s32(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_s32)))
-int32x4_t __arm_vldrwq_gather_offset_z(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_u32)))
-uint32x4_t __arm_vldrwq_gather_offset_z_u32(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_u32)))
-uint32x4_t __arm_vldrwq_gather_offset_z(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_s32)))
-int32x4_t __arm_vldrwq_gather_shifted_offset_s32(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_s32)))
-int32x4_t __arm_vldrwq_gather_shifted_offset(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_u32)))
-uint32x4_t __arm_vldrwq_gather_shifted_offset_u32(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_u32)))
-uint32x4_t __arm_vldrwq_gather_shifted_offset(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_s32)))
-int32x4_t __arm_vldrwq_gather_shifted_offset_z_s32(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_s32)))
-int32x4_t __arm_vldrwq_gather_shifted_offset_z(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_u32)))
-uint32x4_t __arm_vldrwq_gather_shifted_offset_z_u32(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_u32)))
-uint32x4_t __arm_vldrwq_gather_shifted_offset_z(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_s32)))
-int32x4_t __arm_vldrwq_s32(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_u32)))
-uint32x4_t __arm_vldrwq_u32(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_z_s32)))
-int32x4_t __arm_vldrwq_z_s32(const int32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_z_u32)))
-uint32x4_t __arm_vldrwq_z_u32(const uint32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s16)))
-uint16x8_t __arm_vmaxaq_m_s16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s16)))
-uint16x8_t __arm_vmaxaq_m(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s32)))
-uint32x4_t __arm_vmaxaq_m_s32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s32)))
-uint32x4_t __arm_vmaxaq_m(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s8)))
-uint8x16_t __arm_vmaxaq_m_s8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s8)))
-uint8x16_t __arm_vmaxaq_m(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s16)))
-uint16x8_t __arm_vmaxaq_s16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s16)))
-uint16x8_t __arm_vmaxaq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s32)))
-uint32x4_t __arm_vmaxaq_s32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s32)))
-uint32x4_t __arm_vmaxaq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s8)))
-uint8x16_t __arm_vmaxaq_s8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s8)))
-uint8x16_t __arm_vmaxaq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s16)))
-uint16_t __arm_vmaxavq_p_s16(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s16)))
-uint16_t __arm_vmaxavq_p(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s32)))
-uint32_t __arm_vmaxavq_p_s32(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s32)))
-uint32_t __arm_vmaxavq_p(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s8)))
-uint8_t __arm_vmaxavq_p_s8(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s8)))
-uint8_t __arm_vmaxavq_p(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s16)))
-uint16_t __arm_vmaxavq_s16(uint16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s16)))
-uint16_t __arm_vmaxavq(uint16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s32)))
-uint32_t __arm_vmaxavq_s32(uint32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s32)))
-uint32_t __arm_vmaxavq(uint32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s8)))
-uint8_t __arm_vmaxavq_s8(uint8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s8)))
-uint8_t __arm_vmaxavq(uint8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s16)))
-int16x8_t __arm_vmaxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s16)))
-int16x8_t __arm_vmaxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s32)))
-int32x4_t __arm_vmaxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s32)))
-int32x4_t __arm_vmaxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s8)))
-int8x16_t __arm_vmaxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s8)))
-int8x16_t __arm_vmaxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u16)))
-uint16x8_t __arm_vmaxq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u16)))
-uint16x8_t __arm_vmaxq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u32)))
-uint32x4_t __arm_vmaxq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u32)))
-uint32x4_t __arm_vmaxq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u8)))
-uint8x16_t __arm_vmaxq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u8)))
-uint8x16_t __arm_vmaxq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s16)))
-int16x8_t __arm_vmaxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s16)))
-int16x8_t __arm_vmaxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s32)))
-int32x4_t __arm_vmaxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s32)))
-int32x4_t __arm_vmaxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s8)))
-int8x16_t __arm_vmaxq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s8)))
-int8x16_t __arm_vmaxq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u16)))
-uint16x8_t __arm_vmaxq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u16)))
-uint16x8_t __arm_vmaxq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u32)))
-uint32x4_t __arm_vmaxq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u32)))
-uint32x4_t __arm_vmaxq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u8)))
-uint8x16_t __arm_vmaxq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u8)))
-uint8x16_t __arm_vmaxq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s16)))
-int16x8_t __arm_vmaxq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s16)))
-int16x8_t __arm_vmaxq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s32)))
-int32x4_t __arm_vmaxq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s32)))
-int32x4_t __arm_vmaxq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s8)))
-int8x16_t __arm_vmaxq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s8)))
-int8x16_t __arm_vmaxq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u16)))
-uint16x8_t __arm_vmaxq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u16)))
-uint16x8_t __arm_vmaxq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u32)))
-uint32x4_t __arm_vmaxq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u32)))
-uint32x4_t __arm_vmaxq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u8)))
-uint8x16_t __arm_vmaxq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u8)))
-uint8x16_t __arm_vmaxq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s16)))
-int16_t __arm_vmaxvq_p_s16(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s16)))
-int16_t __arm_vmaxvq_p(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s32)))
-int32_t __arm_vmaxvq_p_s32(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s32)))
-int32_t __arm_vmaxvq_p(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s8)))
-int8_t __arm_vmaxvq_p_s8(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s8)))
-int8_t __arm_vmaxvq_p(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u16)))
-uint16_t __arm_vmaxvq_p_u16(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u16)))
-uint16_t __arm_vmaxvq_p(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u32)))
-uint32_t __arm_vmaxvq_p_u32(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u32)))
-uint32_t __arm_vmaxvq_p(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u8)))
-uint8_t __arm_vmaxvq_p_u8(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u8)))
-uint8_t __arm_vmaxvq_p(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s16)))
-int16_t __arm_vmaxvq_s16(int16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s16)))
-int16_t __arm_vmaxvq(int16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s32)))
-int32_t __arm_vmaxvq_s32(int32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s32)))
-int32_t __arm_vmaxvq(int32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s8)))
-int8_t __arm_vmaxvq_s8(int8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s8)))
-int8_t __arm_vmaxvq(int8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u16)))
-uint16_t __arm_vmaxvq_u16(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u16)))
-uint16_t __arm_vmaxvq(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u32)))
-uint32_t __arm_vmaxvq_u32(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u32)))
-uint32_t __arm_vmaxvq(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u8)))
-uint8_t __arm_vmaxvq_u8(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u8)))
-uint8_t __arm_vmaxvq(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s16)))
-uint16x8_t __arm_vminaq_m_s16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s16)))
-uint16x8_t __arm_vminaq_m(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s32)))
-uint32x4_t __arm_vminaq_m_s32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s32)))
-uint32x4_t __arm_vminaq_m(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s8)))
-uint8x16_t __arm_vminaq_m_s8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s8)))
-uint8x16_t __arm_vminaq_m(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s16)))
-uint16x8_t __arm_vminaq_s16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s16)))
-uint16x8_t __arm_vminaq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s32)))
-uint32x4_t __arm_vminaq_s32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s32)))
-uint32x4_t __arm_vminaq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s8)))
-uint8x16_t __arm_vminaq_s8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s8)))
-uint8x16_t __arm_vminaq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s16)))
-uint16_t __arm_vminavq_p_s16(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s16)))
-uint16_t __arm_vminavq_p(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s32)))
-uint32_t __arm_vminavq_p_s32(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s32)))
-uint32_t __arm_vminavq_p(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s8)))
-uint8_t __arm_vminavq_p_s8(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s8)))
-uint8_t __arm_vminavq_p(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s16)))
-uint16_t __arm_vminavq_s16(uint16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s16)))
-uint16_t __arm_vminavq(uint16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s32)))
-uint32_t __arm_vminavq_s32(uint32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s32)))
-uint32_t __arm_vminavq(uint32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s8)))
-uint8_t __arm_vminavq_s8(uint8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s8)))
-uint8_t __arm_vminavq(uint8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s16)))
-int16x8_t __arm_vminq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s16)))
-int16x8_t __arm_vminq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s32)))
-int32x4_t __arm_vminq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s32)))
-int32x4_t __arm_vminq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s8)))
-int8x16_t __arm_vminq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s8)))
-int8x16_t __arm_vminq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u16)))
-uint16x8_t __arm_vminq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u16)))
-uint16x8_t __arm_vminq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u32)))
-uint32x4_t __arm_vminq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u32)))
-uint32x4_t __arm_vminq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u8)))
-uint8x16_t __arm_vminq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u8)))
-uint8x16_t __arm_vminq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_s16)))
-int16x8_t __arm_vminq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_s16)))
-int16x8_t __arm_vminq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_s32)))
-int32x4_t __arm_vminq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_s32)))
-int32x4_t __arm_vminq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_s8)))
-int8x16_t __arm_vminq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_s8)))
-int8x16_t __arm_vminq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_u16)))
-uint16x8_t __arm_vminq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_u16)))
-uint16x8_t __arm_vminq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_u32)))
-uint32x4_t __arm_vminq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_u32)))
-uint32x4_t __arm_vminq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_u8)))
-uint8x16_t __arm_vminq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_u8)))
-uint8x16_t __arm_vminq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s16)))
-int16x8_t __arm_vminq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s16)))
-int16x8_t __arm_vminq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s32)))
-int32x4_t __arm_vminq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s32)))
-int32x4_t __arm_vminq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s8)))
-int8x16_t __arm_vminq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s8)))
-int8x16_t __arm_vminq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u16)))
-uint16x8_t __arm_vminq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u16)))
-uint16x8_t __arm_vminq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u32)))
-uint32x4_t __arm_vminq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u32)))
-uint32x4_t __arm_vminq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u8)))
-uint8x16_t __arm_vminq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u8)))
-uint8x16_t __arm_vminq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s16)))
-int16_t __arm_vminvq_p_s16(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s16)))
-int16_t __arm_vminvq_p(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s32)))
-int32_t __arm_vminvq_p_s32(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s32)))
-int32_t __arm_vminvq_p(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s8)))
-int8_t __arm_vminvq_p_s8(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s8)))
-int8_t __arm_vminvq_p(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u16)))
-uint16_t __arm_vminvq_p_u16(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u16)))
-uint16_t __arm_vminvq_p(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u32)))
-uint32_t __arm_vminvq_p_u32(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u32)))
-uint32_t __arm_vminvq_p(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u8)))
-uint8_t __arm_vminvq_p_u8(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u8)))
-uint8_t __arm_vminvq_p(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s16)))
-int16_t __arm_vminvq_s16(int16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s16)))
-int16_t __arm_vminvq(int16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s32)))
-int32_t __arm_vminvq_s32(int32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s32)))
-int32_t __arm_vminvq(int32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s8)))
-int8_t __arm_vminvq_s8(int8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s8)))
-int8_t __arm_vminvq(int8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u16)))
-uint16_t __arm_vminvq_u16(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u16)))
-uint16_t __arm_vminvq(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u32)))
-uint32_t __arm_vminvq_u32(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u32)))
-uint32_t __arm_vminvq(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u8)))
-uint8_t __arm_vminvq_u8(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u8)))
-uint8_t __arm_vminvq(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s16)))
-int32_t __arm_vmladavaq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s16)))
-int32_t __arm_vmladavaq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s32)))
-int32_t __arm_vmladavaq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s32)))
-int32_t __arm_vmladavaq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s8)))
-int32_t __arm_vmladavaq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s8)))
-int32_t __arm_vmladavaq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u16)))
-uint32_t __arm_vmladavaq_p_u16(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u16)))
-uint32_t __arm_vmladavaq_p(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u32)))
-uint32_t __arm_vmladavaq_p_u32(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u32)))
-uint32_t __arm_vmladavaq_p(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u8)))
-uint32_t __arm_vmladavaq_p_u8(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u8)))
-uint32_t __arm_vmladavaq_p(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s16)))
-int32_t __arm_vmladavaq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s16)))
-int32_t __arm_vmladavaq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s32)))
-int32_t __arm_vmladavaq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s32)))
-int32_t __arm_vmladavaq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s8)))
-int32_t __arm_vmladavaq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s8)))
-int32_t __arm_vmladavaq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u16)))
-uint32_t __arm_vmladavaq_u16(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u16)))
-uint32_t __arm_vmladavaq(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u32)))
-uint32_t __arm_vmladavaq_u32(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u32)))
-uint32_t __arm_vmladavaq(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u8)))
-uint32_t __arm_vmladavaq_u8(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u8)))
-uint32_t __arm_vmladavaq(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s16)))
-int32_t __arm_vmladavaxq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s16)))
-int32_t __arm_vmladavaxq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s32)))
-int32_t __arm_vmladavaxq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s32)))
-int32_t __arm_vmladavaxq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s8)))
-int32_t __arm_vmladavaxq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s8)))
-int32_t __arm_vmladavaxq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s16)))
-int32_t __arm_vmladavaxq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s16)))
-int32_t __arm_vmladavaxq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s32)))
-int32_t __arm_vmladavaxq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s32)))
-int32_t __arm_vmladavaxq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s8)))
-int32_t __arm_vmladavaxq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s8)))
-int32_t __arm_vmladavaxq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s16)))
-int32_t __arm_vmladavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s16)))
-int32_t __arm_vmladavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s32)))
-int32_t __arm_vmladavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s32)))
-int32_t __arm_vmladavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s8)))
-int32_t __arm_vmladavq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s8)))
-int32_t __arm_vmladavq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u16)))
-uint32_t __arm_vmladavq_p_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u16)))
-uint32_t __arm_vmladavq_p(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u32)))
-uint32_t __arm_vmladavq_p_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u32)))
-uint32_t __arm_vmladavq_p(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u8)))
-uint32_t __arm_vmladavq_p_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u8)))
-uint32_t __arm_vmladavq_p(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s16)))
-int32_t __arm_vmladavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s16)))
-int32_t __arm_vmladavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s32)))
-int32_t __arm_vmladavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s32)))
-int32_t __arm_vmladavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s8)))
-int32_t __arm_vmladavq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s8)))
-int32_t __arm_vmladavq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u16)))
-uint32_t __arm_vmladavq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u16)))
-uint32_t __arm_vmladavq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u32)))
-uint32_t __arm_vmladavq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u32)))
-uint32_t __arm_vmladavq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u8)))
-uint32_t __arm_vmladavq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u8)))
-uint32_t __arm_vmladavq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s16)))
-int32_t __arm_vmladavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s16)))
-int32_t __arm_vmladavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s32)))
-int32_t __arm_vmladavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s32)))
-int32_t __arm_vmladavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s8)))
-int32_t __arm_vmladavxq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s8)))
-int32_t __arm_vmladavxq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s16)))
-int32_t __arm_vmladavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s16)))
-int32_t __arm_vmladavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s32)))
-int32_t __arm_vmladavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s32)))
-int32_t __arm_vmladavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s8)))
-int32_t __arm_vmladavxq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s8)))
-int32_t __arm_vmladavxq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s16)))
-int64_t __arm_vmlaldavaq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s16)))
-int64_t __arm_vmlaldavaq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s32)))
-int64_t __arm_vmlaldavaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s32)))
-int64_t __arm_vmlaldavaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u16)))
-uint64_t __arm_vmlaldavaq_p_u16(uint64_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u16)))
-uint64_t __arm_vmlaldavaq_p(uint64_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u32)))
-uint64_t __arm_vmlaldavaq_p_u32(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u32)))
-uint64_t __arm_vmlaldavaq_p(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s16)))
-int64_t __arm_vmlaldavaq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s16)))
-int64_t __arm_vmlaldavaq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s32)))
-int64_t __arm_vmlaldavaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s32)))
-int64_t __arm_vmlaldavaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u16)))
-uint64_t __arm_vmlaldavaq_u16(uint64_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u16)))
-uint64_t __arm_vmlaldavaq(uint64_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u32)))
-uint64_t __arm_vmlaldavaq_u32(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u32)))
-uint64_t __arm_vmlaldavaq(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s16)))
-int64_t __arm_vmlaldavaxq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s16)))
-int64_t __arm_vmlaldavaxq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s32)))
-int64_t __arm_vmlaldavaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s32)))
-int64_t __arm_vmlaldavaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s16)))
-int64_t __arm_vmlaldavaxq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s16)))
-int64_t __arm_vmlaldavaxq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s32)))
-int64_t __arm_vmlaldavaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s32)))
-int64_t __arm_vmlaldavaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s16)))
-int64_t __arm_vmlaldavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s16)))
-int64_t __arm_vmlaldavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s32)))
-int64_t __arm_vmlaldavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s32)))
-int64_t __arm_vmlaldavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u16)))
-uint64_t __arm_vmlaldavq_p_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u16)))
-uint64_t __arm_vmlaldavq_p(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u32)))
-uint64_t __arm_vmlaldavq_p_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u32)))
-uint64_t __arm_vmlaldavq_p(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s16)))
-int64_t __arm_vmlaldavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s16)))
-int64_t __arm_vmlaldavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s32)))
-int64_t __arm_vmlaldavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s32)))
-int64_t __arm_vmlaldavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u16)))
-uint64_t __arm_vmlaldavq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u16)))
-uint64_t __arm_vmlaldavq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u32)))
-uint64_t __arm_vmlaldavq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u32)))
-uint64_t __arm_vmlaldavq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s16)))
-int64_t __arm_vmlaldavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s16)))
-int64_t __arm_vmlaldavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s32)))
-int64_t __arm_vmlaldavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s32)))
-int64_t __arm_vmlaldavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s16)))
-int64_t __arm_vmlaldavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s16)))
-int64_t __arm_vmlaldavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s32)))
-int64_t __arm_vmlaldavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s32)))
-int64_t __arm_vmlaldavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s16)))
-int16x8_t __arm_vmlaq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s16)))
-int16x8_t __arm_vmlaq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s32)))
-int32x4_t __arm_vmlaq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s32)))
-int32x4_t __arm_vmlaq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s8)))
-int8x16_t __arm_vmlaq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s8)))
-int8x16_t __arm_vmlaq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u16)))
-uint16x8_t __arm_vmlaq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u16)))
-uint16x8_t __arm_vmlaq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u32)))
-uint32x4_t __arm_vmlaq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u32)))
-uint32x4_t __arm_vmlaq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u8)))
-uint8x16_t __arm_vmlaq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u8)))
-uint8x16_t __arm_vmlaq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s16)))
-int16x8_t __arm_vmlaq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s16)))
-int16x8_t __arm_vmlaq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s32)))
-int32x4_t __arm_vmlaq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s32)))
-int32x4_t __arm_vmlaq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s8)))
-int8x16_t __arm_vmlaq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s8)))
-int8x16_t __arm_vmlaq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u16)))
-uint16x8_t __arm_vmlaq_n_u16(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u16)))
-uint16x8_t __arm_vmlaq(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u32)))
-uint32x4_t __arm_vmlaq_n_u32(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u32)))
-uint32x4_t __arm_vmlaq(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u8)))
-uint8x16_t __arm_vmlaq_n_u8(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u8)))
-uint8x16_t __arm_vmlaq(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s16)))
-int16x8_t __arm_vmlasq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s16)))
-int16x8_t __arm_vmlasq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s32)))
-int32x4_t __arm_vmlasq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s32)))
-int32x4_t __arm_vmlasq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s8)))
-int8x16_t __arm_vmlasq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s8)))
-int8x16_t __arm_vmlasq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u16)))
-uint16x8_t __arm_vmlasq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u16)))
-uint16x8_t __arm_vmlasq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u32)))
-uint32x4_t __arm_vmlasq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u32)))
-uint32x4_t __arm_vmlasq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u8)))
-uint8x16_t __arm_vmlasq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u8)))
-uint8x16_t __arm_vmlasq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s16)))
-int16x8_t __arm_vmlasq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s16)))
-int16x8_t __arm_vmlasq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s32)))
-int32x4_t __arm_vmlasq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s32)))
-int32x4_t __arm_vmlasq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s8)))
-int8x16_t __arm_vmlasq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s8)))
-int8x16_t __arm_vmlasq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u16)))
-uint16x8_t __arm_vmlasq_n_u16(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u16)))
-uint16x8_t __arm_vmlasq(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u32)))
-uint32x4_t __arm_vmlasq_n_u32(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u32)))
-uint32x4_t __arm_vmlasq(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u8)))
-uint8x16_t __arm_vmlasq_n_u8(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u8)))
-uint8x16_t __arm_vmlasq(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s16)))
-int32_t __arm_vmlsdavaq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s16)))
-int32_t __arm_vmlsdavaq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s32)))
-int32_t __arm_vmlsdavaq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s32)))
-int32_t __arm_vmlsdavaq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s8)))
-int32_t __arm_vmlsdavaq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s8)))
-int32_t __arm_vmlsdavaq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s16)))
-int32_t __arm_vmlsdavaq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s16)))
-int32_t __arm_vmlsdavaq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s32)))
-int32_t __arm_vmlsdavaq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s32)))
-int32_t __arm_vmlsdavaq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s8)))
-int32_t __arm_vmlsdavaq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s8)))
-int32_t __arm_vmlsdavaq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s16)))
-int32_t __arm_vmlsdavaxq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s16)))
-int32_t __arm_vmlsdavaxq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s32)))
-int32_t __arm_vmlsdavaxq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s32)))
-int32_t __arm_vmlsdavaxq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s8)))
-int32_t __arm_vmlsdavaxq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s8)))
-int32_t __arm_vmlsdavaxq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s16)))
-int32_t __arm_vmlsdavaxq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s16)))
-int32_t __arm_vmlsdavaxq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s32)))
-int32_t __arm_vmlsdavaxq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s32)))
-int32_t __arm_vmlsdavaxq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s8)))
-int32_t __arm_vmlsdavaxq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s8)))
-int32_t __arm_vmlsdavaxq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s16)))
-int32_t __arm_vmlsdavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s16)))
-int32_t __arm_vmlsdavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s32)))
-int32_t __arm_vmlsdavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s32)))
-int32_t __arm_vmlsdavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s8)))
-int32_t __arm_vmlsdavq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s8)))
-int32_t __arm_vmlsdavq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s16)))
-int32_t __arm_vmlsdavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s16)))
-int32_t __arm_vmlsdavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s32)))
-int32_t __arm_vmlsdavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s32)))
-int32_t __arm_vmlsdavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s8)))
-int32_t __arm_vmlsdavq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s8)))
-int32_t __arm_vmlsdavq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s16)))
-int32_t __arm_vmlsdavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s16)))
-int32_t __arm_vmlsdavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s32)))
-int32_t __arm_vmlsdavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s32)))
-int32_t __arm_vmlsdavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s8)))
-int32_t __arm_vmlsdavxq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s8)))
-int32_t __arm_vmlsdavxq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s16)))
-int32_t __arm_vmlsdavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s16)))
-int32_t __arm_vmlsdavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s32)))
-int32_t __arm_vmlsdavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s32)))
-int32_t __arm_vmlsdavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s8)))
-int32_t __arm_vmlsdavxq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s8)))
-int32_t __arm_vmlsdavxq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s16)))
-int64_t __arm_vmlsldavaq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s16)))
-int64_t __arm_vmlsldavaq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s32)))
-int64_t __arm_vmlsldavaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s32)))
-int64_t __arm_vmlsldavaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s16)))
-int64_t __arm_vmlsldavaq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s16)))
-int64_t __arm_vmlsldavaq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s32)))
-int64_t __arm_vmlsldavaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s32)))
-int64_t __arm_vmlsldavaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s16)))
-int64_t __arm_vmlsldavaxq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s16)))
-int64_t __arm_vmlsldavaxq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s32)))
-int64_t __arm_vmlsldavaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s32)))
-int64_t __arm_vmlsldavaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s16)))
-int64_t __arm_vmlsldavaxq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s16)))
-int64_t __arm_vmlsldavaxq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s32)))
-int64_t __arm_vmlsldavaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s32)))
-int64_t __arm_vmlsldavaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s16)))
-int64_t __arm_vmlsldavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s16)))
-int64_t __arm_vmlsldavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s32)))
-int64_t __arm_vmlsldavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s32)))
-int64_t __arm_vmlsldavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s16)))
-int64_t __arm_vmlsldavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s16)))
-int64_t __arm_vmlsldavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s32)))
-int64_t __arm_vmlsldavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s32)))
-int64_t __arm_vmlsldavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s16)))
-int64_t __arm_vmlsldavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s16)))
-int64_t __arm_vmlsldavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s32)))
-int64_t __arm_vmlsldavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s32)))
-int64_t __arm_vmlsldavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s16)))
-int64_t __arm_vmlsldavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s16)))
-int64_t __arm_vmlsldavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s32)))
-int64_t __arm_vmlsldavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s32)))
-int64_t __arm_vmlsldavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s16)))
-int32x4_t __arm_vmovlbq_m_s16(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s16)))
-int32x4_t __arm_vmovlbq_m(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s8)))
-int16x8_t __arm_vmovlbq_m_s8(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s8)))
-int16x8_t __arm_vmovlbq_m(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u16)))
-uint32x4_t __arm_vmovlbq_m_u16(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u16)))
-uint32x4_t __arm_vmovlbq_m(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u8)))
-uint16x8_t __arm_vmovlbq_m_u8(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u8)))
-uint16x8_t __arm_vmovlbq_m(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s16)))
-int32x4_t __arm_vmovlbq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s16)))
-int32x4_t __arm_vmovlbq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s8)))
-int16x8_t __arm_vmovlbq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s8)))
-int16x8_t __arm_vmovlbq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u16)))
-uint32x4_t __arm_vmovlbq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u16)))
-uint32x4_t __arm_vmovlbq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u8)))
-uint16x8_t __arm_vmovlbq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u8)))
-uint16x8_t __arm_vmovlbq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s16)))
-int32x4_t __arm_vmovlbq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s16)))
-int32x4_t __arm_vmovlbq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s8)))
-int16x8_t __arm_vmovlbq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s8)))
-int16x8_t __arm_vmovlbq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u16)))
-uint32x4_t __arm_vmovlbq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u16)))
-uint32x4_t __arm_vmovlbq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u8)))
-uint16x8_t __arm_vmovlbq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u8)))
-uint16x8_t __arm_vmovlbq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s16)))
-int32x4_t __arm_vmovltq_m_s16(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s16)))
-int32x4_t __arm_vmovltq_m(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s8)))
-int16x8_t __arm_vmovltq_m_s8(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s8)))
-int16x8_t __arm_vmovltq_m(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u16)))
-uint32x4_t __arm_vmovltq_m_u16(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u16)))
-uint32x4_t __arm_vmovltq_m(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u8)))
-uint16x8_t __arm_vmovltq_m_u8(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u8)))
-uint16x8_t __arm_vmovltq_m(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s16)))
-int32x4_t __arm_vmovltq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s16)))
-int32x4_t __arm_vmovltq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s8)))
-int16x8_t __arm_vmovltq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s8)))
-int16x8_t __arm_vmovltq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u16)))
-uint32x4_t __arm_vmovltq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u16)))
-uint32x4_t __arm_vmovltq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u8)))
-uint16x8_t __arm_vmovltq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u8)))
-uint16x8_t __arm_vmovltq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s16)))
-int32x4_t __arm_vmovltq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s16)))
-int32x4_t __arm_vmovltq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s8)))
-int16x8_t __arm_vmovltq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s8)))
-int16x8_t __arm_vmovltq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u16)))
-uint32x4_t __arm_vmovltq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u16)))
-uint32x4_t __arm_vmovltq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u8)))
-uint16x8_t __arm_vmovltq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u8)))
-uint16x8_t __arm_vmovltq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s16)))
-int8x16_t __arm_vmovnbq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s16)))
-int8x16_t __arm_vmovnbq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s32)))
-int16x8_t __arm_vmovnbq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s32)))
-int16x8_t __arm_vmovnbq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u16)))
-uint8x16_t __arm_vmovnbq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u16)))
-uint8x16_t __arm_vmovnbq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u32)))
-uint16x8_t __arm_vmovnbq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u32)))
-uint16x8_t __arm_vmovnbq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s16)))
-int8x16_t __arm_vmovnbq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s16)))
-int8x16_t __arm_vmovnbq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s32)))
-int16x8_t __arm_vmovnbq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s32)))
-int16x8_t __arm_vmovnbq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u16)))
-uint8x16_t __arm_vmovnbq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u16)))
-uint8x16_t __arm_vmovnbq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u32)))
-uint16x8_t __arm_vmovnbq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u32)))
-uint16x8_t __arm_vmovnbq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s16)))
-int8x16_t __arm_vmovntq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s16)))
-int8x16_t __arm_vmovntq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s32)))
-int16x8_t __arm_vmovntq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s32)))
-int16x8_t __arm_vmovntq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u16)))
-uint8x16_t __arm_vmovntq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u16)))
-uint8x16_t __arm_vmovntq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u32)))
-uint16x8_t __arm_vmovntq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u32)))
-uint16x8_t __arm_vmovntq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s16)))
-int8x16_t __arm_vmovntq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s16)))
-int8x16_t __arm_vmovntq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s32)))
-int16x8_t __arm_vmovntq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s32)))
-int16x8_t __arm_vmovntq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u16)))
-uint8x16_t __arm_vmovntq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u16)))
-uint8x16_t __arm_vmovntq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u32)))
-uint16x8_t __arm_vmovntq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u32)))
-uint16x8_t __arm_vmovntq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s16)))
-int16x8_t __arm_vmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s16)))
-int16x8_t __arm_vmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s32)))
-int32x4_t __arm_vmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s32)))
-int32x4_t __arm_vmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s8)))
-int8x16_t __arm_vmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s8)))
-int8x16_t __arm_vmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u16)))
-uint16x8_t __arm_vmulhq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u16)))
-uint16x8_t __arm_vmulhq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u32)))
-uint32x4_t __arm_vmulhq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u32)))
-uint32x4_t __arm_vmulhq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u8)))
-uint8x16_t __arm_vmulhq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u8)))
-uint8x16_t __arm_vmulhq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s16)))
-int16x8_t __arm_vmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s16)))
-int16x8_t __arm_vmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s32)))
-int32x4_t __arm_vmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s32)))
-int32x4_t __arm_vmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s8)))
-int8x16_t __arm_vmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s8)))
-int8x16_t __arm_vmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u16)))
-uint16x8_t __arm_vmulhq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u16)))
-uint16x8_t __arm_vmulhq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u32)))
-uint32x4_t __arm_vmulhq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u32)))
-uint32x4_t __arm_vmulhq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u8)))
-uint8x16_t __arm_vmulhq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u8)))
-uint8x16_t __arm_vmulhq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s16)))
-int16x8_t __arm_vmulhq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s16)))
-int16x8_t __arm_vmulhq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s32)))
-int32x4_t __arm_vmulhq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s32)))
-int32x4_t __arm_vmulhq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s8)))
-int8x16_t __arm_vmulhq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s8)))
-int8x16_t __arm_vmulhq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u16)))
-uint16x8_t __arm_vmulhq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u16)))
-uint16x8_t __arm_vmulhq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u32)))
-uint32x4_t __arm_vmulhq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u32)))
-uint32x4_t __arm_vmulhq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u8)))
-uint8x16_t __arm_vmulhq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u8)))
-uint8x16_t __arm_vmulhq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s16)))
-int32x4_t __arm_vmullbq_int_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s16)))
-int32x4_t __arm_vmullbq_int_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s32)))
-int64x2_t __arm_vmullbq_int_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s32)))
-int64x2_t __arm_vmullbq_int_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s8)))
-int16x8_t __arm_vmullbq_int_m_s8(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s8)))
-int16x8_t __arm_vmullbq_int_m(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u16)))
-uint32x4_t __arm_vmullbq_int_m_u16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u16)))
-uint32x4_t __arm_vmullbq_int_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u32)))
-uint64x2_t __arm_vmullbq_int_m_u32(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u32)))
-uint64x2_t __arm_vmullbq_int_m(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u8)))
-uint16x8_t __arm_vmullbq_int_m_u8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u8)))
-uint16x8_t __arm_vmullbq_int_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s16)))
-int32x4_t __arm_vmullbq_int_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s16)))
-int32x4_t __arm_vmullbq_int(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s32)))
-int64x2_t __arm_vmullbq_int_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s32)))
-int64x2_t __arm_vmullbq_int(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s8)))
-int16x8_t __arm_vmullbq_int_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s8)))
-int16x8_t __arm_vmullbq_int(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u16)))
-uint32x4_t __arm_vmullbq_int_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u16)))
-uint32x4_t __arm_vmullbq_int(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u32)))
-uint64x2_t __arm_vmullbq_int_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u32)))
-uint64x2_t __arm_vmullbq_int(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u8)))
-uint16x8_t __arm_vmullbq_int_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u8)))
-uint16x8_t __arm_vmullbq_int(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s16)))
-int32x4_t __arm_vmullbq_int_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s16)))
-int32x4_t __arm_vmullbq_int_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s32)))
-int64x2_t __arm_vmullbq_int_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s32)))
-int64x2_t __arm_vmullbq_int_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s8)))
-int16x8_t __arm_vmullbq_int_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s8)))
-int16x8_t __arm_vmullbq_int_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u16)))
-uint32x4_t __arm_vmullbq_int_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u16)))
-uint32x4_t __arm_vmullbq_int_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u32)))
-uint64x2_t __arm_vmullbq_int_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u32)))
-uint64x2_t __arm_vmullbq_int_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u8)))
-uint16x8_t __arm_vmullbq_int_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u8)))
-uint16x8_t __arm_vmullbq_int_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p16)))
-uint32x4_t __arm_vmullbq_poly_m_p16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p16)))
-uint32x4_t __arm_vmullbq_poly_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p8)))
-uint16x8_t __arm_vmullbq_poly_m_p8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p8)))
-uint16x8_t __arm_vmullbq_poly_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p16)))
-uint32x4_t __arm_vmullbq_poly_p16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p16)))
-uint32x4_t __arm_vmullbq_poly(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p8)))
-uint16x8_t __arm_vmullbq_poly_p8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p8)))
-uint16x8_t __arm_vmullbq_poly(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p16)))
-uint32x4_t __arm_vmullbq_poly_x_p16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p16)))
-uint32x4_t __arm_vmullbq_poly_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p8)))
-uint16x8_t __arm_vmullbq_poly_x_p8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p8)))
-uint16x8_t __arm_vmullbq_poly_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s16)))
-int32x4_t __arm_vmulltq_int_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s16)))
-int32x4_t __arm_vmulltq_int_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s32)))
-int64x2_t __arm_vmulltq_int_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s32)))
-int64x2_t __arm_vmulltq_int_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s8)))
-int16x8_t __arm_vmulltq_int_m_s8(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s8)))
-int16x8_t __arm_vmulltq_int_m(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u16)))
-uint32x4_t __arm_vmulltq_int_m_u16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u16)))
-uint32x4_t __arm_vmulltq_int_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u32)))
-uint64x2_t __arm_vmulltq_int_m_u32(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u32)))
-uint64x2_t __arm_vmulltq_int_m(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u8)))
-uint16x8_t __arm_vmulltq_int_m_u8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u8)))
-uint16x8_t __arm_vmulltq_int_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s16)))
-int32x4_t __arm_vmulltq_int_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s16)))
-int32x4_t __arm_vmulltq_int(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s32)))
-int64x2_t __arm_vmulltq_int_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s32)))
-int64x2_t __arm_vmulltq_int(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s8)))
-int16x8_t __arm_vmulltq_int_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s8)))
-int16x8_t __arm_vmulltq_int(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u16)))
-uint32x4_t __arm_vmulltq_int_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u16)))
-uint32x4_t __arm_vmulltq_int(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u32)))
-uint64x2_t __arm_vmulltq_int_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u32)))
-uint64x2_t __arm_vmulltq_int(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u8)))
-uint16x8_t __arm_vmulltq_int_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u8)))
-uint16x8_t __arm_vmulltq_int(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s16)))
-int32x4_t __arm_vmulltq_int_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s16)))
-int32x4_t __arm_vmulltq_int_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s32)))
-int64x2_t __arm_vmulltq_int_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s32)))
-int64x2_t __arm_vmulltq_int_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s8)))
-int16x8_t __arm_vmulltq_int_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s8)))
-int16x8_t __arm_vmulltq_int_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u16)))
-uint32x4_t __arm_vmulltq_int_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u16)))
-uint32x4_t __arm_vmulltq_int_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u32)))
-uint64x2_t __arm_vmulltq_int_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u32)))
-uint64x2_t __arm_vmulltq_int_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u8)))
-uint16x8_t __arm_vmulltq_int_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u8)))
-uint16x8_t __arm_vmulltq_int_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p16)))
-uint32x4_t __arm_vmulltq_poly_m_p16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p16)))
-uint32x4_t __arm_vmulltq_poly_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p8)))
-uint16x8_t __arm_vmulltq_poly_m_p8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p8)))
-uint16x8_t __arm_vmulltq_poly_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p16)))
-uint32x4_t __arm_vmulltq_poly_p16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p16)))
-uint32x4_t __arm_vmulltq_poly(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p8)))
-uint16x8_t __arm_vmulltq_poly_p8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p8)))
-uint16x8_t __arm_vmulltq_poly(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p16)))
-uint32x4_t __arm_vmulltq_poly_x_p16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p16)))
-uint32x4_t __arm_vmulltq_poly_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p8)))
-uint16x8_t __arm_vmulltq_poly_x_p8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p8)))
-uint16x8_t __arm_vmulltq_poly_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s16)))
-int16x8_t __arm_vmulq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s16)))
-int16x8_t __arm_vmulq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s32)))
-int32x4_t __arm_vmulq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s32)))
-int32x4_t __arm_vmulq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s8)))
-int8x16_t __arm_vmulq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s8)))
-int8x16_t __arm_vmulq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u16)))
-uint16x8_t __arm_vmulq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u16)))
-uint16x8_t __arm_vmulq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u32)))
-uint32x4_t __arm_vmulq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u32)))
-uint32x4_t __arm_vmulq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u8)))
-uint8x16_t __arm_vmulq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u8)))
-uint8x16_t __arm_vmulq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s16)))
-int16x8_t __arm_vmulq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s16)))
-int16x8_t __arm_vmulq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s32)))
-int32x4_t __arm_vmulq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s32)))
-int32x4_t __arm_vmulq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s8)))
-int8x16_t __arm_vmulq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s8)))
-int8x16_t __arm_vmulq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u16)))
-uint16x8_t __arm_vmulq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u16)))
-uint16x8_t __arm_vmulq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u32)))
-uint32x4_t __arm_vmulq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u32)))
-uint32x4_t __arm_vmulq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u8)))
-uint8x16_t __arm_vmulq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u8)))
-uint8x16_t __arm_vmulq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s16)))
-int16x8_t __arm_vmulq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s16)))
-int16x8_t __arm_vmulq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s32)))
-int32x4_t __arm_vmulq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s32)))
-int32x4_t __arm_vmulq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s8)))
-int8x16_t __arm_vmulq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s8)))
-int8x16_t __arm_vmulq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u16)))
-uint16x8_t __arm_vmulq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u16)))
-uint16x8_t __arm_vmulq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u32)))
-uint32x4_t __arm_vmulq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u32)))
-uint32x4_t __arm_vmulq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u8)))
-uint8x16_t __arm_vmulq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u8)))
-uint8x16_t __arm_vmulq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s16)))
-int16x8_t __arm_vmulq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s16)))
-int16x8_t __arm_vmulq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s32)))
-int32x4_t __arm_vmulq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s32)))
-int32x4_t __arm_vmulq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s8)))
-int8x16_t __arm_vmulq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s8)))
-int8x16_t __arm_vmulq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u16)))
-uint16x8_t __arm_vmulq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u16)))
-uint16x8_t __arm_vmulq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u32)))
-uint32x4_t __arm_vmulq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u32)))
-uint32x4_t __arm_vmulq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u8)))
-uint8x16_t __arm_vmulq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u8)))
-uint8x16_t __arm_vmulq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s16)))
-int16x8_t __arm_vmulq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s16)))
-int16x8_t __arm_vmulq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s32)))
-int32x4_t __arm_vmulq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s32)))
-int32x4_t __arm_vmulq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s8)))
-int8x16_t __arm_vmulq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s8)))
-int8x16_t __arm_vmulq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u16)))
-uint16x8_t __arm_vmulq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u16)))
-uint16x8_t __arm_vmulq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u32)))
-uint32x4_t __arm_vmulq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u32)))
-uint32x4_t __arm_vmulq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u8)))
-uint8x16_t __arm_vmulq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u8)))
-uint8x16_t __arm_vmulq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s16)))
-int16x8_t __arm_vmulq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s16)))
-int16x8_t __arm_vmulq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s32)))
-int32x4_t __arm_vmulq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s32)))
-int32x4_t __arm_vmulq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s8)))
-int8x16_t __arm_vmulq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s8)))
-int8x16_t __arm_vmulq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u16)))
-uint16x8_t __arm_vmulq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u16)))
-uint16x8_t __arm_vmulq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u32)))
-uint32x4_t __arm_vmulq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u32)))
-uint32x4_t __arm_vmulq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u8)))
-uint8x16_t __arm_vmulq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u8)))
-uint8x16_t __arm_vmulq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s16)))
-int16x8_t __arm_vmvnq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s16)))
-int16x8_t __arm_vmvnq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s32)))
-int32x4_t __arm_vmvnq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s32)))
-int32x4_t __arm_vmvnq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u16)))
-uint16x8_t __arm_vmvnq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u16)))
-uint16x8_t __arm_vmvnq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u32)))
-uint32x4_t __arm_vmvnq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u32)))
-uint32x4_t __arm_vmvnq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s16)))
-int16x8_t __arm_vmvnq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s16)))
-int16x8_t __arm_vmvnq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s32)))
-int32x4_t __arm_vmvnq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s32)))
-int32x4_t __arm_vmvnq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s8)))
-int8x16_t __arm_vmvnq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s8)))
-int8x16_t __arm_vmvnq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u16)))
-uint16x8_t __arm_vmvnq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u16)))
-uint16x8_t __arm_vmvnq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u32)))
-uint32x4_t __arm_vmvnq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u32)))
-uint32x4_t __arm_vmvnq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u8)))
-uint8x16_t __arm_vmvnq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u8)))
-uint8x16_t __arm_vmvnq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_s16)))
-int16x8_t __arm_vmvnq_n_s16(int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_s32)))
-int32x4_t __arm_vmvnq_n_s32(int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_u16)))
-uint16x8_t __arm_vmvnq_n_u16(uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_u32)))
-uint32x4_t __arm_vmvnq_n_u32(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s16)))
-int16x8_t __arm_vmvnq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s16)))
-int16x8_t __arm_vmvnq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s32)))
-int32x4_t __arm_vmvnq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s32)))
-int32x4_t __arm_vmvnq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s8)))
-int8x16_t __arm_vmvnq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s8)))
-int8x16_t __arm_vmvnq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u16)))
-uint16x8_t __arm_vmvnq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u16)))
-uint16x8_t __arm_vmvnq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u32)))
-uint32x4_t __arm_vmvnq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u32)))
-uint32x4_t __arm_vmvnq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u8)))
-uint8x16_t __arm_vmvnq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u8)))
-uint8x16_t __arm_vmvnq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_s16)))
-int16x8_t __arm_vmvnq_x_n_s16(int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_s32)))
-int32x4_t __arm_vmvnq_x_n_s32(int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_u16)))
-uint16x8_t __arm_vmvnq_x_n_u16(uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_u32)))
-uint32x4_t __arm_vmvnq_x_n_u32(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s16)))
-int16x8_t __arm_vmvnq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s16)))
-int16x8_t __arm_vmvnq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s32)))
-int32x4_t __arm_vmvnq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s32)))
-int32x4_t __arm_vmvnq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s8)))
-int8x16_t __arm_vmvnq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s8)))
-int8x16_t __arm_vmvnq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u16)))
-uint16x8_t __arm_vmvnq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u16)))
-uint16x8_t __arm_vmvnq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u32)))
-uint32x4_t __arm_vmvnq_x_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u32)))
-uint32x4_t __arm_vmvnq_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u8)))
-uint8x16_t __arm_vmvnq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u8)))
-uint8x16_t __arm_vmvnq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s16)))
-int16x8_t __arm_vnegq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s16)))
-int16x8_t __arm_vnegq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s32)))
-int32x4_t __arm_vnegq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s32)))
-int32x4_t __arm_vnegq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s8)))
-int8x16_t __arm_vnegq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s8)))
-int8x16_t __arm_vnegq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s16)))
-int16x8_t __arm_vnegq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s16)))
-int16x8_t __arm_vnegq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s32)))
-int32x4_t __arm_vnegq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s32)))
-int32x4_t __arm_vnegq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s8)))
-int8x16_t __arm_vnegq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s8)))
-int8x16_t __arm_vnegq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s16)))
-int16x8_t __arm_vnegq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s16)))
-int16x8_t __arm_vnegq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s32)))
-int32x4_t __arm_vnegq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s32)))
-int32x4_t __arm_vnegq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s8)))
-int8x16_t __arm_vnegq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s8)))
-int8x16_t __arm_vnegq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s16)))
-int16x8_t __arm_vornq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s16)))
-int16x8_t __arm_vornq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s32)))
-int32x4_t __arm_vornq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s32)))
-int32x4_t __arm_vornq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s8)))
-int8x16_t __arm_vornq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s8)))
-int8x16_t __arm_vornq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u16)))
-uint16x8_t __arm_vornq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u16)))
-uint16x8_t __arm_vornq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u32)))
-uint32x4_t __arm_vornq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u32)))
-uint32x4_t __arm_vornq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u8)))
-uint8x16_t __arm_vornq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u8)))
-uint8x16_t __arm_vornq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_s16)))
-int16x8_t __arm_vornq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_s16)))
-int16x8_t __arm_vornq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_s32)))
-int32x4_t __arm_vornq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_s32)))
-int32x4_t __arm_vornq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_s8)))
-int8x16_t __arm_vornq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_s8)))
-int8x16_t __arm_vornq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_u16)))
-uint16x8_t __arm_vornq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_u16)))
-uint16x8_t __arm_vornq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_u32)))
-uint32x4_t __arm_vornq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_u32)))
-uint32x4_t __arm_vornq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_u8)))
-uint8x16_t __arm_vornq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_u8)))
-uint8x16_t __arm_vornq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s16)))
-int16x8_t __arm_vornq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s16)))
-int16x8_t __arm_vornq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s32)))
-int32x4_t __arm_vornq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s32)))
-int32x4_t __arm_vornq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s8)))
-int8x16_t __arm_vornq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s8)))
-int8x16_t __arm_vornq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u16)))
-uint16x8_t __arm_vornq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u16)))
-uint16x8_t __arm_vornq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u32)))
-uint32x4_t __arm_vornq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u32)))
-uint32x4_t __arm_vornq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u8)))
-uint8x16_t __arm_vornq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u8)))
-uint8x16_t __arm_vornq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s16)))
-int16x8_t __arm_vorrq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s16)))
-int16x8_t __arm_vorrq_m_n(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s32)))
-int32x4_t __arm_vorrq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s32)))
-int32x4_t __arm_vorrq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u16)))
-uint16x8_t __arm_vorrq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u16)))
-uint16x8_t __arm_vorrq_m_n(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u32)))
-uint32x4_t __arm_vorrq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u32)))
-uint32x4_t __arm_vorrq_m_n(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s16)))
-int16x8_t __arm_vorrq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s16)))
-int16x8_t __arm_vorrq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s32)))
-int32x4_t __arm_vorrq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s32)))
-int32x4_t __arm_vorrq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s8)))
-int8x16_t __arm_vorrq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s8)))
-int8x16_t __arm_vorrq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u16)))
-uint16x8_t __arm_vorrq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u16)))
-uint16x8_t __arm_vorrq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u32)))
-uint32x4_t __arm_vorrq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u32)))
-uint32x4_t __arm_vorrq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u8)))
-uint8x16_t __arm_vorrq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u8)))
-uint8x16_t __arm_vorrq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s16)))
-int16x8_t __arm_vorrq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s16)))
-int16x8_t __arm_vorrq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s32)))
-int32x4_t __arm_vorrq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s32)))
-int32x4_t __arm_vorrq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u16)))
-uint16x8_t __arm_vorrq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u16)))
-uint16x8_t __arm_vorrq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u32)))
-uint32x4_t __arm_vorrq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u32)))
-uint32x4_t __arm_vorrq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s16)))
-int16x8_t __arm_vorrq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s16)))
-int16x8_t __arm_vorrq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s32)))
-int32x4_t __arm_vorrq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s32)))
-int32x4_t __arm_vorrq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s8)))
-int8x16_t __arm_vorrq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s8)))
-int8x16_t __arm_vorrq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u16)))
-uint16x8_t __arm_vorrq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u16)))
-uint16x8_t __arm_vorrq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u32)))
-uint32x4_t __arm_vorrq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u32)))
-uint32x4_t __arm_vorrq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u8)))
-uint8x16_t __arm_vorrq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u8)))
-uint8x16_t __arm_vorrq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s16)))
-int16x8_t __arm_vorrq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s16)))
-int16x8_t __arm_vorrq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s32)))
-int32x4_t __arm_vorrq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s32)))
-int32x4_t __arm_vorrq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s8)))
-int8x16_t __arm_vorrq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s8)))
-int8x16_t __arm_vorrq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u16)))
-uint16x8_t __arm_vorrq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u16)))
-uint16x8_t __arm_vorrq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u32)))
-uint32x4_t __arm_vorrq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u32)))
-uint32x4_t __arm_vorrq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u8)))
-uint8x16_t __arm_vorrq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u8)))
-uint8x16_t __arm_vorrq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpnot)))
-mve_pred16_t __arm_vpnot(mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s16)))
-int16x8_t __arm_vpselq_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s16)))
-int16x8_t __arm_vpselq(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s32)))
-int32x4_t __arm_vpselq_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s32)))
-int32x4_t __arm_vpselq(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s64)))
-int64x2_t __arm_vpselq_s64(int64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s64)))
-int64x2_t __arm_vpselq(int64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s8)))
-int8x16_t __arm_vpselq_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s8)))
-int8x16_t __arm_vpselq(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u16)))
-uint16x8_t __arm_vpselq_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u16)))
-uint16x8_t __arm_vpselq(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u32)))
-uint32x4_t __arm_vpselq_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u32)))
-uint32x4_t __arm_vpselq(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u64)))
-uint64x2_t __arm_vpselq_u64(uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u64)))
-uint64x2_t __arm_vpselq(uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u8)))
-uint8x16_t __arm_vpselq_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u8)))
-uint8x16_t __arm_vpselq(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s16)))
-int16x8_t __arm_vqabsq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s16)))
-int16x8_t __arm_vqabsq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s32)))
-int32x4_t __arm_vqabsq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s32)))
-int32x4_t __arm_vqabsq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s8)))
-int8x16_t __arm_vqabsq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s8)))
-int8x16_t __arm_vqabsq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s16)))
-int16x8_t __arm_vqabsq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s16)))
-int16x8_t __arm_vqabsq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s32)))
-int32x4_t __arm_vqabsq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s32)))
-int32x4_t __arm_vqabsq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s8)))
-int8x16_t __arm_vqabsq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s8)))
-int8x16_t __arm_vqabsq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s16)))
-int16x8_t __arm_vqaddq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s16)))
-int16x8_t __arm_vqaddq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s32)))
-int32x4_t __arm_vqaddq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s32)))
-int32x4_t __arm_vqaddq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s8)))
-int8x16_t __arm_vqaddq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s8)))
-int8x16_t __arm_vqaddq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u16)))
-uint16x8_t __arm_vqaddq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u16)))
-uint16x8_t __arm_vqaddq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u32)))
-uint32x4_t __arm_vqaddq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u32)))
-uint32x4_t __arm_vqaddq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u8)))
-uint8x16_t __arm_vqaddq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u8)))
-uint8x16_t __arm_vqaddq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s16)))
-int16x8_t __arm_vqaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s16)))
-int16x8_t __arm_vqaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s32)))
-int32x4_t __arm_vqaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s32)))
-int32x4_t __arm_vqaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s8)))
-int8x16_t __arm_vqaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s8)))
-int8x16_t __arm_vqaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u16)))
-uint16x8_t __arm_vqaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u16)))
-uint16x8_t __arm_vqaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u32)))
-uint32x4_t __arm_vqaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u32)))
-uint32x4_t __arm_vqaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u8)))
-uint8x16_t __arm_vqaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u8)))
-uint8x16_t __arm_vqaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s16)))
-int16x8_t __arm_vqaddq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s16)))
-int16x8_t __arm_vqaddq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s32)))
-int32x4_t __arm_vqaddq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s32)))
-int32x4_t __arm_vqaddq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s8)))
-int8x16_t __arm_vqaddq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s8)))
-int8x16_t __arm_vqaddq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u16)))
-uint16x8_t __arm_vqaddq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u16)))
-uint16x8_t __arm_vqaddq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u32)))
-uint32x4_t __arm_vqaddq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u32)))
-uint32x4_t __arm_vqaddq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u8)))
-uint8x16_t __arm_vqaddq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u8)))
-uint8x16_t __arm_vqaddq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s16)))
-int16x8_t __arm_vqaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s16)))
-int16x8_t __arm_vqaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s32)))
-int32x4_t __arm_vqaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s32)))
-int32x4_t __arm_vqaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s8)))
-int8x16_t __arm_vqaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s8)))
-int8x16_t __arm_vqaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u16)))
-uint16x8_t __arm_vqaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u16)))
-uint16x8_t __arm_vqaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u32)))
-uint32x4_t __arm_vqaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u32)))
-uint32x4_t __arm_vqaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u8)))
-uint8x16_t __arm_vqaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u8)))
-uint8x16_t __arm_vqaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s16)))
-int16x8_t __arm_vqdmladhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s16)))
-int16x8_t __arm_vqdmladhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s32)))
-int32x4_t __arm_vqdmladhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s32)))
-int32x4_t __arm_vqdmladhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s8)))
-int8x16_t __arm_vqdmladhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s8)))
-int8x16_t __arm_vqdmladhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s16)))
-int16x8_t __arm_vqdmladhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s16)))
-int16x8_t __arm_vqdmladhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s32)))
-int32x4_t __arm_vqdmladhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s32)))
-int32x4_t __arm_vqdmladhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s8)))
-int8x16_t __arm_vqdmladhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s8)))
-int8x16_t __arm_vqdmladhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s16)))
-int16x8_t __arm_vqdmladhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s16)))
-int16x8_t __arm_vqdmladhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s32)))
-int32x4_t __arm_vqdmladhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s32)))
-int32x4_t __arm_vqdmladhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s8)))
-int8x16_t __arm_vqdmladhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s8)))
-int8x16_t __arm_vqdmladhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s16)))
-int16x8_t __arm_vqdmladhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s16)))
-int16x8_t __arm_vqdmladhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s32)))
-int32x4_t __arm_vqdmladhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s32)))
-int32x4_t __arm_vqdmladhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s8)))
-int8x16_t __arm_vqdmladhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s8)))
-int8x16_t __arm_vqdmladhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s16)))
-int16x8_t __arm_vqdmlahq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s16)))
-int16x8_t __arm_vqdmlahq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s32)))
-int32x4_t __arm_vqdmlahq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s32)))
-int32x4_t __arm_vqdmlahq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s8)))
-int8x16_t __arm_vqdmlahq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s8)))
-int8x16_t __arm_vqdmlahq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s16)))
-int16x8_t __arm_vqdmlahq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s16)))
-int16x8_t __arm_vqdmlahq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s32)))
-int32x4_t __arm_vqdmlahq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s32)))
-int32x4_t __arm_vqdmlahq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s8)))
-int8x16_t __arm_vqdmlahq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s8)))
-int8x16_t __arm_vqdmlahq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s16)))
-int16x8_t __arm_vqdmlashq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s16)))
-int16x8_t __arm_vqdmlashq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s32)))
-int32x4_t __arm_vqdmlashq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s32)))
-int32x4_t __arm_vqdmlashq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s8)))
-int8x16_t __arm_vqdmlashq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s8)))
-int8x16_t __arm_vqdmlashq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s16)))
-int16x8_t __arm_vqdmlashq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s16)))
-int16x8_t __arm_vqdmlashq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s32)))
-int32x4_t __arm_vqdmlashq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s32)))
-int32x4_t __arm_vqdmlashq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s8)))
-int8x16_t __arm_vqdmlashq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s8)))
-int8x16_t __arm_vqdmlashq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s16)))
-int16x8_t __arm_vqdmlsdhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s16)))
-int16x8_t __arm_vqdmlsdhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s32)))
-int32x4_t __arm_vqdmlsdhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s32)))
-int32x4_t __arm_vqdmlsdhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s8)))
-int8x16_t __arm_vqdmlsdhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s8)))
-int8x16_t __arm_vqdmlsdhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s16)))
-int16x8_t __arm_vqdmlsdhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s16)))
-int16x8_t __arm_vqdmlsdhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s32)))
-int32x4_t __arm_vqdmlsdhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s32)))
-int32x4_t __arm_vqdmlsdhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s8)))
-int8x16_t __arm_vqdmlsdhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s8)))
-int8x16_t __arm_vqdmlsdhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s16)))
-int16x8_t __arm_vqdmlsdhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s16)))
-int16x8_t __arm_vqdmlsdhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s32)))
-int32x4_t __arm_vqdmlsdhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s32)))
-int32x4_t __arm_vqdmlsdhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s8)))
-int8x16_t __arm_vqdmlsdhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s8)))
-int8x16_t __arm_vqdmlsdhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s16)))
-int16x8_t __arm_vqdmlsdhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s16)))
-int16x8_t __arm_vqdmlsdhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s32)))
-int32x4_t __arm_vqdmlsdhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s32)))
-int32x4_t __arm_vqdmlsdhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s8)))
-int8x16_t __arm_vqdmlsdhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s8)))
-int8x16_t __arm_vqdmlsdhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s16)))
-int16x8_t __arm_vqdmulhq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s16)))
-int16x8_t __arm_vqdmulhq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s32)))
-int32x4_t __arm_vqdmulhq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s32)))
-int32x4_t __arm_vqdmulhq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s8)))
-int8x16_t __arm_vqdmulhq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s8)))
-int8x16_t __arm_vqdmulhq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s16)))
-int16x8_t __arm_vqdmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s16)))
-int16x8_t __arm_vqdmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s32)))
-int32x4_t __arm_vqdmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s32)))
-int32x4_t __arm_vqdmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s8)))
-int8x16_t __arm_vqdmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s8)))
-int8x16_t __arm_vqdmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s16)))
-int16x8_t __arm_vqdmulhq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s16)))
-int16x8_t __arm_vqdmulhq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s32)))
-int32x4_t __arm_vqdmulhq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s32)))
-int32x4_t __arm_vqdmulhq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s8)))
-int8x16_t __arm_vqdmulhq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s8)))
-int8x16_t __arm_vqdmulhq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s16)))
-int16x8_t __arm_vqdmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s16)))
-int16x8_t __arm_vqdmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s32)))
-int32x4_t __arm_vqdmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s32)))
-int32x4_t __arm_vqdmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s8)))
-int8x16_t __arm_vqdmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s8)))
-int8x16_t __arm_vqdmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s16)))
-int32x4_t __arm_vqdmullbq_m_n_s16(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s16)))
-int32x4_t __arm_vqdmullbq_m(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s32)))
-int64x2_t __arm_vqdmullbq_m_n_s32(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s32)))
-int64x2_t __arm_vqdmullbq_m(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s16)))
-int32x4_t __arm_vqdmullbq_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s16)))
-int32x4_t __arm_vqdmullbq_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s32)))
-int64x2_t __arm_vqdmullbq_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s32)))
-int64x2_t __arm_vqdmullbq_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s16)))
-int32x4_t __arm_vqdmullbq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s16)))
-int32x4_t __arm_vqdmullbq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s32)))
-int64x2_t __arm_vqdmullbq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s32)))
-int64x2_t __arm_vqdmullbq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s16)))
-int32x4_t __arm_vqdmullbq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s16)))
-int32x4_t __arm_vqdmullbq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s32)))
-int64x2_t __arm_vqdmullbq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s32)))
-int64x2_t __arm_vqdmullbq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s16)))
-int32x4_t __arm_vqdmulltq_m_n_s16(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s16)))
-int32x4_t __arm_vqdmulltq_m(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s32)))
-int64x2_t __arm_vqdmulltq_m_n_s32(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s32)))
-int64x2_t __arm_vqdmulltq_m(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s16)))
-int32x4_t __arm_vqdmulltq_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s16)))
-int32x4_t __arm_vqdmulltq_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s32)))
-int64x2_t __arm_vqdmulltq_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s32)))
-int64x2_t __arm_vqdmulltq_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s16)))
-int32x4_t __arm_vqdmulltq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s16)))
-int32x4_t __arm_vqdmulltq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s32)))
-int64x2_t __arm_vqdmulltq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s32)))
-int64x2_t __arm_vqdmulltq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s16)))
-int32x4_t __arm_vqdmulltq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s16)))
-int32x4_t __arm_vqdmulltq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s32)))
-int64x2_t __arm_vqdmulltq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s32)))
-int64x2_t __arm_vqdmulltq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s16)))
-int8x16_t __arm_vqmovnbq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s16)))
-int8x16_t __arm_vqmovnbq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s32)))
-int16x8_t __arm_vqmovnbq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s32)))
-int16x8_t __arm_vqmovnbq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u16)))
-uint8x16_t __arm_vqmovnbq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u16)))
-uint8x16_t __arm_vqmovnbq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u32)))
-uint16x8_t __arm_vqmovnbq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u32)))
-uint16x8_t __arm_vqmovnbq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s16)))
-int8x16_t __arm_vqmovnbq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s16)))
-int8x16_t __arm_vqmovnbq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s32)))
-int16x8_t __arm_vqmovnbq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s32)))
-int16x8_t __arm_vqmovnbq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u16)))
-uint8x16_t __arm_vqmovnbq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u16)))
-uint8x16_t __arm_vqmovnbq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u32)))
-uint16x8_t __arm_vqmovnbq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u32)))
-uint16x8_t __arm_vqmovnbq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s16)))
-int8x16_t __arm_vqmovntq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s16)))
-int8x16_t __arm_vqmovntq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s32)))
-int16x8_t __arm_vqmovntq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s32)))
-int16x8_t __arm_vqmovntq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u16)))
-uint8x16_t __arm_vqmovntq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u16)))
-uint8x16_t __arm_vqmovntq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u32)))
-uint16x8_t __arm_vqmovntq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u32)))
-uint16x8_t __arm_vqmovntq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s16)))
-int8x16_t __arm_vqmovntq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s16)))
-int8x16_t __arm_vqmovntq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s32)))
-int16x8_t __arm_vqmovntq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s32)))
-int16x8_t __arm_vqmovntq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u16)))
-uint8x16_t __arm_vqmovntq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u16)))
-uint8x16_t __arm_vqmovntq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u32)))
-uint16x8_t __arm_vqmovntq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u32)))
-uint16x8_t __arm_vqmovntq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s16)))
-uint8x16_t __arm_vqmovunbq_m_s16(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s16)))
-uint8x16_t __arm_vqmovunbq_m(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s32)))
-uint16x8_t __arm_vqmovunbq_m_s32(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s32)))
-uint16x8_t __arm_vqmovunbq_m(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s16)))
-uint8x16_t __arm_vqmovunbq_s16(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s16)))
-uint8x16_t __arm_vqmovunbq(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s32)))
-uint16x8_t __arm_vqmovunbq_s32(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s32)))
-uint16x8_t __arm_vqmovunbq(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s16)))
-uint8x16_t __arm_vqmovuntq_m_s16(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s16)))
-uint8x16_t __arm_vqmovuntq_m(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s32)))
-uint16x8_t __arm_vqmovuntq_m_s32(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s32)))
-uint16x8_t __arm_vqmovuntq_m(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s16)))
-uint8x16_t __arm_vqmovuntq_s16(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s16)))
-uint8x16_t __arm_vqmovuntq(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s32)))
-uint16x8_t __arm_vqmovuntq_s32(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s32)))
-uint16x8_t __arm_vqmovuntq(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s16)))
-int16x8_t __arm_vqnegq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s16)))
-int16x8_t __arm_vqnegq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s32)))
-int32x4_t __arm_vqnegq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s32)))
-int32x4_t __arm_vqnegq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s8)))
-int8x16_t __arm_vqnegq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s8)))
-int8x16_t __arm_vqnegq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s16)))
-int16x8_t __arm_vqnegq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s16)))
-int16x8_t __arm_vqnegq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s32)))
-int32x4_t __arm_vqnegq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s32)))
-int32x4_t __arm_vqnegq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s8)))
-int8x16_t __arm_vqnegq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s8)))
-int8x16_t __arm_vqnegq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s16)))
-int16x8_t __arm_vqrdmladhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s16)))
-int16x8_t __arm_vqrdmladhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s32)))
-int32x4_t __arm_vqrdmladhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s32)))
-int32x4_t __arm_vqrdmladhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s8)))
-int8x16_t __arm_vqrdmladhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s8)))
-int8x16_t __arm_vqrdmladhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s16)))
-int16x8_t __arm_vqrdmladhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s16)))
-int16x8_t __arm_vqrdmladhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s32)))
-int32x4_t __arm_vqrdmladhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s32)))
-int32x4_t __arm_vqrdmladhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s8)))
-int8x16_t __arm_vqrdmladhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s8)))
-int8x16_t __arm_vqrdmladhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s16)))
-int16x8_t __arm_vqrdmladhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s16)))
-int16x8_t __arm_vqrdmladhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s32)))
-int32x4_t __arm_vqrdmladhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s32)))
-int32x4_t __arm_vqrdmladhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s8)))
-int8x16_t __arm_vqrdmladhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s8)))
-int8x16_t __arm_vqrdmladhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s16)))
-int16x8_t __arm_vqrdmladhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s16)))
-int16x8_t __arm_vqrdmladhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s32)))
-int32x4_t __arm_vqrdmladhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s32)))
-int32x4_t __arm_vqrdmladhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s8)))
-int8x16_t __arm_vqrdmladhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s8)))
-int8x16_t __arm_vqrdmladhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s16)))
-int16x8_t __arm_vqrdmlahq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s16)))
-int16x8_t __arm_vqrdmlahq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s32)))
-int32x4_t __arm_vqrdmlahq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s32)))
-int32x4_t __arm_vqrdmlahq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s8)))
-int8x16_t __arm_vqrdmlahq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s8)))
-int8x16_t __arm_vqrdmlahq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s16)))
-int16x8_t __arm_vqrdmlahq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s16)))
-int16x8_t __arm_vqrdmlahq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s32)))
-int32x4_t __arm_vqrdmlahq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s32)))
-int32x4_t __arm_vqrdmlahq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s8)))
-int8x16_t __arm_vqrdmlahq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s8)))
-int8x16_t __arm_vqrdmlahq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s16)))
-int16x8_t __arm_vqrdmlashq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s16)))
-int16x8_t __arm_vqrdmlashq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s32)))
-int32x4_t __arm_vqrdmlashq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s32)))
-int32x4_t __arm_vqrdmlashq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s8)))
-int8x16_t __arm_vqrdmlashq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s8)))
-int8x16_t __arm_vqrdmlashq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s16)))
-int16x8_t __arm_vqrdmlashq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s16)))
-int16x8_t __arm_vqrdmlashq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s32)))
-int32x4_t __arm_vqrdmlashq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s32)))
-int32x4_t __arm_vqrdmlashq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s8)))
-int8x16_t __arm_vqrdmlashq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s8)))
-int8x16_t __arm_vqrdmlashq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s16)))
-int16x8_t __arm_vqrdmlsdhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s16)))
-int16x8_t __arm_vqrdmlsdhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s32)))
-int32x4_t __arm_vqrdmlsdhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s32)))
-int32x4_t __arm_vqrdmlsdhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s8)))
-int8x16_t __arm_vqrdmlsdhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s8)))
-int8x16_t __arm_vqrdmlsdhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s16)))
-int16x8_t __arm_vqrdmlsdhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s16)))
-int16x8_t __arm_vqrdmlsdhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s32)))
-int32x4_t __arm_vqrdmlsdhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s32)))
-int32x4_t __arm_vqrdmlsdhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s8)))
-int8x16_t __arm_vqrdmlsdhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s8)))
-int8x16_t __arm_vqrdmlsdhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s16)))
-int16x8_t __arm_vqrdmlsdhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s16)))
-int16x8_t __arm_vqrdmlsdhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s32)))
-int32x4_t __arm_vqrdmlsdhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s32)))
-int32x4_t __arm_vqrdmlsdhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s8)))
-int8x16_t __arm_vqrdmlsdhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s8)))
-int8x16_t __arm_vqrdmlsdhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s16)))
-int16x8_t __arm_vqrdmlsdhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s16)))
-int16x8_t __arm_vqrdmlsdhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s32)))
-int32x4_t __arm_vqrdmlsdhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s32)))
-int32x4_t __arm_vqrdmlsdhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s8)))
-int8x16_t __arm_vqrdmlsdhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s8)))
-int8x16_t __arm_vqrdmlsdhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s16)))
-int16x8_t __arm_vqrdmulhq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s16)))
-int16x8_t __arm_vqrdmulhq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s32)))
-int32x4_t __arm_vqrdmulhq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s32)))
-int32x4_t __arm_vqrdmulhq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s8)))
-int8x16_t __arm_vqrdmulhq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s8)))
-int8x16_t __arm_vqrdmulhq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s16)))
-int16x8_t __arm_vqrdmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s16)))
-int16x8_t __arm_vqrdmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s32)))
-int32x4_t __arm_vqrdmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s32)))
-int32x4_t __arm_vqrdmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s8)))
-int8x16_t __arm_vqrdmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s8)))
-int8x16_t __arm_vqrdmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s16)))
-int16x8_t __arm_vqrdmulhq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s16)))
-int16x8_t __arm_vqrdmulhq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s32)))
-int32x4_t __arm_vqrdmulhq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s32)))
-int32x4_t __arm_vqrdmulhq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s8)))
-int8x16_t __arm_vqrdmulhq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s8)))
-int8x16_t __arm_vqrdmulhq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s16)))
-int16x8_t __arm_vqrdmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s16)))
-int16x8_t __arm_vqrdmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s32)))
-int32x4_t __arm_vqrdmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s32)))
-int32x4_t __arm_vqrdmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s8)))
-int8x16_t __arm_vqrdmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s8)))
-int8x16_t __arm_vqrdmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s16)))
-int16x8_t __arm_vqrshlq_m_n_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s16)))
-int16x8_t __arm_vqrshlq_m_n(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s32)))
-int32x4_t __arm_vqrshlq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s32)))
-int32x4_t __arm_vqrshlq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s8)))
-int8x16_t __arm_vqrshlq_m_n_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s8)))
-int8x16_t __arm_vqrshlq_m_n(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u16)))
-uint16x8_t __arm_vqrshlq_m_n_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u16)))
-uint16x8_t __arm_vqrshlq_m_n(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u32)))
-uint32x4_t __arm_vqrshlq_m_n_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u32)))
-uint32x4_t __arm_vqrshlq_m_n(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u8)))
-uint8x16_t __arm_vqrshlq_m_n_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u8)))
-uint8x16_t __arm_vqrshlq_m_n(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s16)))
-int16x8_t __arm_vqrshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s16)))
-int16x8_t __arm_vqrshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s32)))
-int32x4_t __arm_vqrshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s32)))
-int32x4_t __arm_vqrshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s8)))
-int8x16_t __arm_vqrshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s8)))
-int8x16_t __arm_vqrshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u16)))
-uint16x8_t __arm_vqrshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u16)))
-uint16x8_t __arm_vqrshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u32)))
-uint32x4_t __arm_vqrshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u32)))
-uint32x4_t __arm_vqrshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u8)))
-uint8x16_t __arm_vqrshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u8)))
-uint8x16_t __arm_vqrshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s16)))
-int16x8_t __arm_vqrshlq_n_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s16)))
-int16x8_t __arm_vqrshlq(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s32)))
-int32x4_t __arm_vqrshlq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s32)))
-int32x4_t __arm_vqrshlq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s8)))
-int8x16_t __arm_vqrshlq_n_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s8)))
-int8x16_t __arm_vqrshlq(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u16)))
-uint16x8_t __arm_vqrshlq_n_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u16)))
-uint16x8_t __arm_vqrshlq(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u32)))
-uint32x4_t __arm_vqrshlq_n_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u32)))
-uint32x4_t __arm_vqrshlq(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u8)))
-uint8x16_t __arm_vqrshlq_n_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u8)))
-uint8x16_t __arm_vqrshlq(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s16)))
-int16x8_t __arm_vqrshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s16)))
-int16x8_t __arm_vqrshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s32)))
-int32x4_t __arm_vqrshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s32)))
-int32x4_t __arm_vqrshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s8)))
-int8x16_t __arm_vqrshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s8)))
-int8x16_t __arm_vqrshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u16)))
-uint16x8_t __arm_vqrshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u16)))
-uint16x8_t __arm_vqrshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u32)))
-uint32x4_t __arm_vqrshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u32)))
-uint32x4_t __arm_vqrshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u8)))
-uint8x16_t __arm_vqrshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u8)))
-uint8x16_t __arm_vqrshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s16)))
-int8x16_t __arm_vqrshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s16)))
-int8x16_t __arm_vqrshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s32)))
-int16x8_t __arm_vqrshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s32)))
-int16x8_t __arm_vqrshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u16)))
-uint8x16_t __arm_vqrshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u16)))
-uint8x16_t __arm_vqrshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u32)))
-uint16x8_t __arm_vqrshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u32)))
-uint16x8_t __arm_vqrshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s16)))
-int8x16_t __arm_vqrshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s16)))
-int8x16_t __arm_vqrshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s32)))
-int16x8_t __arm_vqrshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s32)))
-int16x8_t __arm_vqrshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u16)))
-uint8x16_t __arm_vqrshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u16)))
-uint8x16_t __arm_vqrshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u32)))
-uint16x8_t __arm_vqrshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u32)))
-uint16x8_t __arm_vqrshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s16)))
-int8x16_t __arm_vqrshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s16)))
-int8x16_t __arm_vqrshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s32)))
-int16x8_t __arm_vqrshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s32)))
-int16x8_t __arm_vqrshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u16)))
-uint8x16_t __arm_vqrshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u16)))
-uint8x16_t __arm_vqrshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u32)))
-uint16x8_t __arm_vqrshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u32)))
-uint16x8_t __arm_vqrshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s16)))
-int8x16_t __arm_vqrshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s16)))
-int8x16_t __arm_vqrshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s32)))
-int16x8_t __arm_vqrshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s32)))
-int16x8_t __arm_vqrshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u16)))
-uint8x16_t __arm_vqrshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u16)))
-uint8x16_t __arm_vqrshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u32)))
-uint16x8_t __arm_vqrshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u32)))
-uint16x8_t __arm_vqrshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s16)))
-uint8x16_t __arm_vqrshrunbq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s16)))
-uint8x16_t __arm_vqrshrunbq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s32)))
-uint16x8_t __arm_vqrshrunbq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s32)))
-uint16x8_t __arm_vqrshrunbq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s16)))
-uint8x16_t __arm_vqrshrunbq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s16)))
-uint8x16_t __arm_vqrshrunbq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s32)))
-uint16x8_t __arm_vqrshrunbq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s32)))
-uint16x8_t __arm_vqrshrunbq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s16)))
-uint8x16_t __arm_vqrshruntq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s16)))
-uint8x16_t __arm_vqrshruntq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s32)))
-uint16x8_t __arm_vqrshruntq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s32)))
-uint16x8_t __arm_vqrshruntq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s16)))
-uint8x16_t __arm_vqrshruntq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s16)))
-uint8x16_t __arm_vqrshruntq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s32)))
-uint16x8_t __arm_vqrshruntq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s32)))
-uint16x8_t __arm_vqrshruntq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s16)))
-int16x8_t __arm_vqshlq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s16)))
-int16x8_t __arm_vqshlq_m_n(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s32)))
-int32x4_t __arm_vqshlq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s32)))
-int32x4_t __arm_vqshlq_m_n(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s8)))
-int8x16_t __arm_vqshlq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s8)))
-int8x16_t __arm_vqshlq_m_n(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u16)))
-uint16x8_t __arm_vqshlq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u16)))
-uint16x8_t __arm_vqshlq_m_n(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u32)))
-uint32x4_t __arm_vqshlq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u32)))
-uint32x4_t __arm_vqshlq_m_n(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u8)))
-uint8x16_t __arm_vqshlq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u8)))
-uint8x16_t __arm_vqshlq_m_n(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s16)))
-int16x8_t __arm_vqshlq_m_r_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s16)))
-int16x8_t __arm_vqshlq_m_r(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s32)))
-int32x4_t __arm_vqshlq_m_r_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s32)))
-int32x4_t __arm_vqshlq_m_r(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s8)))
-int8x16_t __arm_vqshlq_m_r_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s8)))
-int8x16_t __arm_vqshlq_m_r(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u16)))
-uint16x8_t __arm_vqshlq_m_r_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u16)))
-uint16x8_t __arm_vqshlq_m_r(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u32)))
-uint32x4_t __arm_vqshlq_m_r_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u32)))
-uint32x4_t __arm_vqshlq_m_r(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u8)))
-uint8x16_t __arm_vqshlq_m_r_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u8)))
-uint8x16_t __arm_vqshlq_m_r(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s16)))
-int16x8_t __arm_vqshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s16)))
-int16x8_t __arm_vqshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s32)))
-int32x4_t __arm_vqshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s32)))
-int32x4_t __arm_vqshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s8)))
-int8x16_t __arm_vqshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s8)))
-int8x16_t __arm_vqshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u16)))
-uint16x8_t __arm_vqshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u16)))
-uint16x8_t __arm_vqshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u32)))
-uint32x4_t __arm_vqshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u32)))
-uint32x4_t __arm_vqshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u8)))
-uint8x16_t __arm_vqshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u8)))
-uint8x16_t __arm_vqshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s16)))
-int16x8_t __arm_vqshlq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s16)))
-int16x8_t __arm_vqshlq_n(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s32)))
-int32x4_t __arm_vqshlq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s32)))
-int32x4_t __arm_vqshlq_n(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s8)))
-int8x16_t __arm_vqshlq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s8)))
-int8x16_t __arm_vqshlq_n(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u16)))
-uint16x8_t __arm_vqshlq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u16)))
-uint16x8_t __arm_vqshlq_n(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u32)))
-uint32x4_t __arm_vqshlq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u32)))
-uint32x4_t __arm_vqshlq_n(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u8)))
-uint8x16_t __arm_vqshlq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u8)))
-uint8x16_t __arm_vqshlq_n(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s16)))
-int16x8_t __arm_vqshlq_r_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s16)))
-int16x8_t __arm_vqshlq_r(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s32)))
-int32x4_t __arm_vqshlq_r_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s32)))
-int32x4_t __arm_vqshlq_r(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s8)))
-int8x16_t __arm_vqshlq_r_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s8)))
-int8x16_t __arm_vqshlq_r(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u16)))
-uint16x8_t __arm_vqshlq_r_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u16)))
-uint16x8_t __arm_vqshlq_r(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u32)))
-uint32x4_t __arm_vqshlq_r_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u32)))
-uint32x4_t __arm_vqshlq_r(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u8)))
-uint8x16_t __arm_vqshlq_r_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u8)))
-uint8x16_t __arm_vqshlq_r(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s16)))
-int16x8_t __arm_vqshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s16)))
-int16x8_t __arm_vqshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s32)))
-int32x4_t __arm_vqshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s32)))
-int32x4_t __arm_vqshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s8)))
-int8x16_t __arm_vqshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s8)))
-int8x16_t __arm_vqshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u16)))
-uint16x8_t __arm_vqshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u16)))
-uint16x8_t __arm_vqshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u32)))
-uint32x4_t __arm_vqshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u32)))
-uint32x4_t __arm_vqshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u8)))
-uint8x16_t __arm_vqshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u8)))
-uint8x16_t __arm_vqshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s16)))
-uint16x8_t __arm_vqshluq_m_n_s16(uint16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s16)))
-uint16x8_t __arm_vqshluq_m(uint16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s32)))
-uint32x4_t __arm_vqshluq_m_n_s32(uint32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s32)))
-uint32x4_t __arm_vqshluq_m(uint32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s8)))
-uint8x16_t __arm_vqshluq_m_n_s8(uint8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s8)))
-uint8x16_t __arm_vqshluq_m(uint8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s16)))
-uint16x8_t __arm_vqshluq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s16)))
-uint16x8_t __arm_vqshluq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s32)))
-uint32x4_t __arm_vqshluq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s32)))
-uint32x4_t __arm_vqshluq(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s8)))
-uint8x16_t __arm_vqshluq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s8)))
-uint8x16_t __arm_vqshluq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s16)))
-int8x16_t __arm_vqshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s16)))
-int8x16_t __arm_vqshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s32)))
-int16x8_t __arm_vqshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s32)))
-int16x8_t __arm_vqshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u16)))
-uint8x16_t __arm_vqshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u16)))
-uint8x16_t __arm_vqshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u32)))
-uint16x8_t __arm_vqshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u32)))
-uint16x8_t __arm_vqshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s16)))
-int8x16_t __arm_vqshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s16)))
-int8x16_t __arm_vqshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s32)))
-int16x8_t __arm_vqshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s32)))
-int16x8_t __arm_vqshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u16)))
-uint8x16_t __arm_vqshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u16)))
-uint8x16_t __arm_vqshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u32)))
-uint16x8_t __arm_vqshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u32)))
-uint16x8_t __arm_vqshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s16)))
-int8x16_t __arm_vqshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s16)))
-int8x16_t __arm_vqshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s32)))
-int16x8_t __arm_vqshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s32)))
-int16x8_t __arm_vqshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u16)))
-uint8x16_t __arm_vqshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u16)))
-uint8x16_t __arm_vqshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u32)))
-uint16x8_t __arm_vqshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u32)))
-uint16x8_t __arm_vqshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s16)))
-int8x16_t __arm_vqshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s16)))
-int8x16_t __arm_vqshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s32)))
-int16x8_t __arm_vqshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s32)))
-int16x8_t __arm_vqshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u16)))
-uint8x16_t __arm_vqshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u16)))
-uint8x16_t __arm_vqshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u32)))
-uint16x8_t __arm_vqshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u32)))
-uint16x8_t __arm_vqshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s16)))
-uint8x16_t __arm_vqshrunbq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s16)))
-uint8x16_t __arm_vqshrunbq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s32)))
-uint16x8_t __arm_vqshrunbq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s32)))
-uint16x8_t __arm_vqshrunbq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s16)))
-uint8x16_t __arm_vqshrunbq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s16)))
-uint8x16_t __arm_vqshrunbq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s32)))
-uint16x8_t __arm_vqshrunbq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s32)))
-uint16x8_t __arm_vqshrunbq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s16)))
-uint8x16_t __arm_vqshruntq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s16)))
-uint8x16_t __arm_vqshruntq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s32)))
-uint16x8_t __arm_vqshruntq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s32)))
-uint16x8_t __arm_vqshruntq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s16)))
-uint8x16_t __arm_vqshruntq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s16)))
-uint8x16_t __arm_vqshruntq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s32)))
-uint16x8_t __arm_vqshruntq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s32)))
-uint16x8_t __arm_vqshruntq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s16)))
-int16x8_t __arm_vqsubq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s16)))
-int16x8_t __arm_vqsubq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s32)))
-int32x4_t __arm_vqsubq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s32)))
-int32x4_t __arm_vqsubq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s8)))
-int8x16_t __arm_vqsubq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s8)))
-int8x16_t __arm_vqsubq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u16)))
-uint16x8_t __arm_vqsubq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u16)))
-uint16x8_t __arm_vqsubq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u32)))
-uint32x4_t __arm_vqsubq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u32)))
-uint32x4_t __arm_vqsubq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u8)))
-uint8x16_t __arm_vqsubq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u8)))
-uint8x16_t __arm_vqsubq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s16)))
-int16x8_t __arm_vqsubq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s16)))
-int16x8_t __arm_vqsubq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s32)))
-int32x4_t __arm_vqsubq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s32)))
-int32x4_t __arm_vqsubq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s8)))
-int8x16_t __arm_vqsubq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s8)))
-int8x16_t __arm_vqsubq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u16)))
-uint16x8_t __arm_vqsubq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u16)))
-uint16x8_t __arm_vqsubq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u32)))
-uint32x4_t __arm_vqsubq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u32)))
-uint32x4_t __arm_vqsubq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u8)))
-uint8x16_t __arm_vqsubq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u8)))
-uint8x16_t __arm_vqsubq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s16)))
-int16x8_t __arm_vqsubq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s16)))
-int16x8_t __arm_vqsubq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s32)))
-int32x4_t __arm_vqsubq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s32)))
-int32x4_t __arm_vqsubq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s8)))
-int8x16_t __arm_vqsubq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s8)))
-int8x16_t __arm_vqsubq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u16)))
-uint16x8_t __arm_vqsubq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u16)))
-uint16x8_t __arm_vqsubq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u32)))
-uint32x4_t __arm_vqsubq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u32)))
-uint32x4_t __arm_vqsubq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u8)))
-uint8x16_t __arm_vqsubq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u8)))
-uint8x16_t __arm_vqsubq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s16)))
-int16x8_t __arm_vqsubq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s16)))
-int16x8_t __arm_vqsubq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s32)))
-int32x4_t __arm_vqsubq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s32)))
-int32x4_t __arm_vqsubq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s8)))
-int8x16_t __arm_vqsubq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s8)))
-int8x16_t __arm_vqsubq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u16)))
-uint16x8_t __arm_vqsubq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u16)))
-uint16x8_t __arm_vqsubq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u32)))
-uint32x4_t __arm_vqsubq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u32)))
-uint32x4_t __arm_vqsubq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u8)))
-uint8x16_t __arm_vqsubq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u8)))
-uint8x16_t __arm_vqsubq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s32)))
-int16x8_t __arm_vreinterpretq_s16_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s32)))
-int16x8_t __arm_vreinterpretq_s16(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s64)))
-int16x8_t __arm_vreinterpretq_s16_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s64)))
-int16x8_t __arm_vreinterpretq_s16(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s8)))
-int16x8_t __arm_vreinterpretq_s16_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s8)))
-int16x8_t __arm_vreinterpretq_s16(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u16)))
-int16x8_t __arm_vreinterpretq_s16_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u16)))
-int16x8_t __arm_vreinterpretq_s16(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u32)))
-int16x8_t __arm_vreinterpretq_s16_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u32)))
-int16x8_t __arm_vreinterpretq_s16(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u64)))
-int16x8_t __arm_vreinterpretq_s16_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u64)))
-int16x8_t __arm_vreinterpretq_s16(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
-int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
-int16x8_t __arm_vreinterpretq_s16(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s16)))
-int32x4_t __arm_vreinterpretq_s32_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s16)))
-int32x4_t __arm_vreinterpretq_s32(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s64)))
-int32x4_t __arm_vreinterpretq_s32_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s64)))
-int32x4_t __arm_vreinterpretq_s32(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s8)))
-int32x4_t __arm_vreinterpretq_s32_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s8)))
-int32x4_t __arm_vreinterpretq_s32(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u16)))
-int32x4_t __arm_vreinterpretq_s32_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u16)))
-int32x4_t __arm_vreinterpretq_s32(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u32)))
-int32x4_t __arm_vreinterpretq_s32_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u32)))
-int32x4_t __arm_vreinterpretq_s32(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u64)))
-int32x4_t __arm_vreinterpretq_s32_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u64)))
-int32x4_t __arm_vreinterpretq_s32(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
-int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
-int32x4_t __arm_vreinterpretq_s32(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s16)))
-int64x2_t __arm_vreinterpretq_s64_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s16)))
-int64x2_t __arm_vreinterpretq_s64(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s32)))
-int64x2_t __arm_vreinterpretq_s64_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s32)))
-int64x2_t __arm_vreinterpretq_s64(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s8)))
-int64x2_t __arm_vreinterpretq_s64_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s8)))
-int64x2_t __arm_vreinterpretq_s64(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u16)))
-int64x2_t __arm_vreinterpretq_s64_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u16)))
-int64x2_t __arm_vreinterpretq_s64(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u32)))
-int64x2_t __arm_vreinterpretq_s64_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u32)))
-int64x2_t __arm_vreinterpretq_s64(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u64)))
-int64x2_t __arm_vreinterpretq_s64_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u64)))
-int64x2_t __arm_vreinterpretq_s64(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
-int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
-int64x2_t __arm_vreinterpretq_s64(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s16)))
-int8x16_t __arm_vreinterpretq_s8_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s16)))
-int8x16_t __arm_vreinterpretq_s8(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s32)))
-int8x16_t __arm_vreinterpretq_s8_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s32)))
-int8x16_t __arm_vreinterpretq_s8(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s64)))
-int8x16_t __arm_vreinterpretq_s8_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s64)))
-int8x16_t __arm_vreinterpretq_s8(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u16)))
-int8x16_t __arm_vreinterpretq_s8_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u16)))
-int8x16_t __arm_vreinterpretq_s8(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u32)))
-int8x16_t __arm_vreinterpretq_s8_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u32)))
-int8x16_t __arm_vreinterpretq_s8(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u64)))
-int8x16_t __arm_vreinterpretq_s8_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u64)))
-int8x16_t __arm_vreinterpretq_s8(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
-int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
-int8x16_t __arm_vreinterpretq_s8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s16)))
-uint16x8_t __arm_vreinterpretq_u16_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s16)))
-uint16x8_t __arm_vreinterpretq_u16(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s32)))
-uint16x8_t __arm_vreinterpretq_u16_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s32)))
-uint16x8_t __arm_vreinterpretq_u16(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s64)))
-uint16x8_t __arm_vreinterpretq_u16_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s64)))
-uint16x8_t __arm_vreinterpretq_u16(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s8)))
-uint16x8_t __arm_vreinterpretq_u16_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s8)))
-uint16x8_t __arm_vreinterpretq_u16(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u32)))
-uint16x8_t __arm_vreinterpretq_u16_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u32)))
-uint16x8_t __arm_vreinterpretq_u16(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u64)))
-uint16x8_t __arm_vreinterpretq_u16_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u64)))
-uint16x8_t __arm_vreinterpretq_u16(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
-uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
-uint16x8_t __arm_vreinterpretq_u16(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s16)))
-uint32x4_t __arm_vreinterpretq_u32_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s16)))
-uint32x4_t __arm_vreinterpretq_u32(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s32)))
-uint32x4_t __arm_vreinterpretq_u32_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s32)))
-uint32x4_t __arm_vreinterpretq_u32(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s64)))
-uint32x4_t __arm_vreinterpretq_u32_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s64)))
-uint32x4_t __arm_vreinterpretq_u32(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s8)))
-uint32x4_t __arm_vreinterpretq_u32_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s8)))
-uint32x4_t __arm_vreinterpretq_u32(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u16)))
-uint32x4_t __arm_vreinterpretq_u32_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u16)))
-uint32x4_t __arm_vreinterpretq_u32(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u64)))
-uint32x4_t __arm_vreinterpretq_u32_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u64)))
-uint32x4_t __arm_vreinterpretq_u32(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
-uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
-uint32x4_t __arm_vreinterpretq_u32(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s16)))
-uint64x2_t __arm_vreinterpretq_u64_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s16)))
-uint64x2_t __arm_vreinterpretq_u64(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s32)))
-uint64x2_t __arm_vreinterpretq_u64_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s32)))
-uint64x2_t __arm_vreinterpretq_u64(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s64)))
-uint64x2_t __arm_vreinterpretq_u64_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s64)))
-uint64x2_t __arm_vreinterpretq_u64(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s8)))
-uint64x2_t __arm_vreinterpretq_u64_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s8)))
-uint64x2_t __arm_vreinterpretq_u64(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u16)))
-uint64x2_t __arm_vreinterpretq_u64_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u16)))
-uint64x2_t __arm_vreinterpretq_u64(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u32)))
-uint64x2_t __arm_vreinterpretq_u64_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u32)))
-uint64x2_t __arm_vreinterpretq_u64(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
-uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
-uint64x2_t __arm_vreinterpretq_u64(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
-uint8x16_t __arm_vreinterpretq_u8_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
-uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
-uint8x16_t __arm_vreinterpretq_u8_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
-uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
-uint8x16_t __arm_vreinterpretq_u8_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
-uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
-uint8x16_t __arm_vreinterpretq_u8_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
-uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
-uint8x16_t __arm_vreinterpretq_u8_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
-uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
-uint8x16_t __arm_vreinterpretq_u8_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
-uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
-uint8x16_t __arm_vreinterpretq_u8_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
-uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_s8)))
-int8x16_t __arm_vrev16q_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_s8)))
-int8x16_t __arm_vrev16q_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_u8)))
-uint8x16_t __arm_vrev16q_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_u8)))
-uint8x16_t __arm_vrev16q_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_s8)))
-int8x16_t __arm_vrev16q_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_s8)))
-int8x16_t __arm_vrev16q(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_u8)))
-uint8x16_t __arm_vrev16q_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_u8)))
-uint8x16_t __arm_vrev16q(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_s8)))
-int8x16_t __arm_vrev16q_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_s8)))
-int8x16_t __arm_vrev16q_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_u8)))
-uint8x16_t __arm_vrev16q_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_u8)))
-uint8x16_t __arm_vrev16q_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s16)))
-int16x8_t __arm_vrev32q_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s16)))
-int16x8_t __arm_vrev32q_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s8)))
-int8x16_t __arm_vrev32q_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s8)))
-int8x16_t __arm_vrev32q_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u16)))
-uint16x8_t __arm_vrev32q_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u16)))
-uint16x8_t __arm_vrev32q_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u8)))
-uint8x16_t __arm_vrev32q_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u8)))
-uint8x16_t __arm_vrev32q_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s16)))
-int16x8_t __arm_vrev32q_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s16)))
-int16x8_t __arm_vrev32q(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s8)))
-int8x16_t __arm_vrev32q_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s8)))
-int8x16_t __arm_vrev32q(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u16)))
-uint16x8_t __arm_vrev32q_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u16)))
-uint16x8_t __arm_vrev32q(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u8)))
-uint8x16_t __arm_vrev32q_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u8)))
-uint8x16_t __arm_vrev32q(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s16)))
-int16x8_t __arm_vrev32q_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s16)))
-int16x8_t __arm_vrev32q_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s8)))
-int8x16_t __arm_vrev32q_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s8)))
-int8x16_t __arm_vrev32q_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u16)))
-uint16x8_t __arm_vrev32q_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u16)))
-uint16x8_t __arm_vrev32q_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u8)))
-uint8x16_t __arm_vrev32q_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u8)))
-uint8x16_t __arm_vrev32q_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s16)))
-int16x8_t __arm_vrev64q_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s16)))
-int16x8_t __arm_vrev64q_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s32)))
-int32x4_t __arm_vrev64q_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s32)))
-int32x4_t __arm_vrev64q_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s8)))
-int8x16_t __arm_vrev64q_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s8)))
-int8x16_t __arm_vrev64q_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u16)))
-uint16x8_t __arm_vrev64q_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u16)))
-uint16x8_t __arm_vrev64q_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u32)))
-uint32x4_t __arm_vrev64q_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u32)))
-uint32x4_t __arm_vrev64q_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u8)))
-uint8x16_t __arm_vrev64q_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u8)))
-uint8x16_t __arm_vrev64q_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s16)))
-int16x8_t __arm_vrev64q_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s16)))
-int16x8_t __arm_vrev64q(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s32)))
-int32x4_t __arm_vrev64q_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s32)))
-int32x4_t __arm_vrev64q(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s8)))
-int8x16_t __arm_vrev64q_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s8)))
-int8x16_t __arm_vrev64q(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u16)))
-uint16x8_t __arm_vrev64q_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u16)))
-uint16x8_t __arm_vrev64q(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u32)))
-uint32x4_t __arm_vrev64q_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u32)))
-uint32x4_t __arm_vrev64q(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u8)))
-uint8x16_t __arm_vrev64q_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u8)))
-uint8x16_t __arm_vrev64q(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s16)))
-int16x8_t __arm_vrev64q_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s16)))
-int16x8_t __arm_vrev64q_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s32)))
-int32x4_t __arm_vrev64q_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s32)))
-int32x4_t __arm_vrev64q_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s8)))
-int8x16_t __arm_vrev64q_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s8)))
-int8x16_t __arm_vrev64q_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u16)))
-uint16x8_t __arm_vrev64q_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u16)))
-uint16x8_t __arm_vrev64q_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u32)))
-uint32x4_t __arm_vrev64q_x_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u32)))
-uint32x4_t __arm_vrev64q_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u8)))
-uint8x16_t __arm_vrev64q_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u8)))
-uint8x16_t __arm_vrev64q_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s16)))
-int16x8_t __arm_vrhaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s16)))
-int16x8_t __arm_vrhaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s32)))
-int32x4_t __arm_vrhaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s32)))
-int32x4_t __arm_vrhaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s8)))
-int8x16_t __arm_vrhaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s8)))
-int8x16_t __arm_vrhaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u16)))
-uint16x8_t __arm_vrhaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u16)))
-uint16x8_t __arm_vrhaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u32)))
-uint32x4_t __arm_vrhaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u32)))
-uint32x4_t __arm_vrhaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u8)))
-uint8x16_t __arm_vrhaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u8)))
-uint8x16_t __arm_vrhaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s16)))
-int16x8_t __arm_vrhaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s16)))
-int16x8_t __arm_vrhaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s32)))
-int32x4_t __arm_vrhaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s32)))
-int32x4_t __arm_vrhaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s8)))
-int8x16_t __arm_vrhaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s8)))
-int8x16_t __arm_vrhaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u16)))
-uint16x8_t __arm_vrhaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u16)))
-uint16x8_t __arm_vrhaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u32)))
-uint32x4_t __arm_vrhaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u32)))
-uint32x4_t __arm_vrhaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u8)))
-uint8x16_t __arm_vrhaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u8)))
-uint8x16_t __arm_vrhaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s16)))
-int16x8_t __arm_vrhaddq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s16)))
-int16x8_t __arm_vrhaddq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s32)))
-int32x4_t __arm_vrhaddq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s32)))
-int32x4_t __arm_vrhaddq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s8)))
-int8x16_t __arm_vrhaddq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s8)))
-int8x16_t __arm_vrhaddq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u16)))
-uint16x8_t __arm_vrhaddq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u16)))
-uint16x8_t __arm_vrhaddq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u32)))
-uint32x4_t __arm_vrhaddq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u32)))
-uint32x4_t __arm_vrhaddq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u8)))
-uint8x16_t __arm_vrhaddq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u8)))
-uint8x16_t __arm_vrhaddq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_s32)))
-int64_t __arm_vrmlaldavhaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_s32)))
-int64_t __arm_vrmlaldavhaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_u32)))
-uint64_t __arm_vrmlaldavhaq_p_u32(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_u32)))
-uint64_t __arm_vrmlaldavhaq_p(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_s32)))
-int64_t __arm_vrmlaldavhaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_s32)))
-int64_t __arm_vrmlaldavhaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_u32)))
-uint64_t __arm_vrmlaldavhaq_u32(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_u32)))
-uint64_t __arm_vrmlaldavhaq(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_p_s32)))
-int64_t __arm_vrmlaldavhaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_p_s32)))
-int64_t __arm_vrmlaldavhaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_s32)))
-int64_t __arm_vrmlaldavhaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_s32)))
-int64_t __arm_vrmlaldavhaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_s32)))
-int64_t __arm_vrmlaldavhq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_s32)))
-int64_t __arm_vrmlaldavhq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_u32)))
-uint64_t __arm_vrmlaldavhq_p_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_u32)))
-uint64_t __arm_vrmlaldavhq_p(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_s32)))
-int64_t __arm_vrmlaldavhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_s32)))
-int64_t __arm_vrmlaldavhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_u32)))
-uint64_t __arm_vrmlaldavhq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_u32)))
-uint64_t __arm_vrmlaldavhq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_p_s32)))
-int64_t __arm_vrmlaldavhxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_p_s32)))
-int64_t __arm_vrmlaldavhxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_s32)))
-int64_t __arm_vrmlaldavhxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_s32)))
-int64_t __arm_vrmlaldavhxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_p_s32)))
-int64_t __arm_vrmlsldavhaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_p_s32)))
-int64_t __arm_vrmlsldavhaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_s32)))
-int64_t __arm_vrmlsldavhaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_s32)))
-int64_t __arm_vrmlsldavhaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_p_s32)))
-int64_t __arm_vrmlsldavhaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_p_s32)))
-int64_t __arm_vrmlsldavhaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_s32)))
-int64_t __arm_vrmlsldavhaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_s32)))
-int64_t __arm_vrmlsldavhaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_p_s32)))
-int64_t __arm_vrmlsldavhq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_p_s32)))
-int64_t __arm_vrmlsldavhq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_s32)))
-int64_t __arm_vrmlsldavhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_s32)))
-int64_t __arm_vrmlsldavhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_p_s32)))
-int64_t __arm_vrmlsldavhxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_p_s32)))
-int64_t __arm_vrmlsldavhxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_s32)))
-int64_t __arm_vrmlsldavhxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_s32)))
-int64_t __arm_vrmlsldavhxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s16)))
-int16x8_t __arm_vrmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s16)))
-int16x8_t __arm_vrmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s32)))
-int32x4_t __arm_vrmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s32)))
-int32x4_t __arm_vrmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s8)))
-int8x16_t __arm_vrmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s8)))
-int8x16_t __arm_vrmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u16)))
-uint16x8_t __arm_vrmulhq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u16)))
-uint16x8_t __arm_vrmulhq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u32)))
-uint32x4_t __arm_vrmulhq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u32)))
-uint32x4_t __arm_vrmulhq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u8)))
-uint8x16_t __arm_vrmulhq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u8)))
-uint8x16_t __arm_vrmulhq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s16)))
-int16x8_t __arm_vrmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s16)))
-int16x8_t __arm_vrmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s32)))
-int32x4_t __arm_vrmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s32)))
-int32x4_t __arm_vrmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s8)))
-int8x16_t __arm_vrmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s8)))
-int8x16_t __arm_vrmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u16)))
-uint16x8_t __arm_vrmulhq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u16)))
-uint16x8_t __arm_vrmulhq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u32)))
-uint32x4_t __arm_vrmulhq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u32)))
-uint32x4_t __arm_vrmulhq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u8)))
-uint8x16_t __arm_vrmulhq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u8)))
-uint8x16_t __arm_vrmulhq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s16)))
-int16x8_t __arm_vrmulhq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s16)))
-int16x8_t __arm_vrmulhq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s32)))
-int32x4_t __arm_vrmulhq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s32)))
-int32x4_t __arm_vrmulhq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s8)))
-int8x16_t __arm_vrmulhq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s8)))
-int8x16_t __arm_vrmulhq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u16)))
-uint16x8_t __arm_vrmulhq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u16)))
-uint16x8_t __arm_vrmulhq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u32)))
-uint32x4_t __arm_vrmulhq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u32)))
-uint32x4_t __arm_vrmulhq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u8)))
-uint8x16_t __arm_vrmulhq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u8)))
-uint8x16_t __arm_vrmulhq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s16)))
-int16x8_t __arm_vrshlq_m_n_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s16)))
-int16x8_t __arm_vrshlq_m_n(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s32)))
-int32x4_t __arm_vrshlq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s32)))
-int32x4_t __arm_vrshlq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s8)))
-int8x16_t __arm_vrshlq_m_n_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s8)))
-int8x16_t __arm_vrshlq_m_n(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u16)))
-uint16x8_t __arm_vrshlq_m_n_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u16)))
-uint16x8_t __arm_vrshlq_m_n(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u32)))
-uint32x4_t __arm_vrshlq_m_n_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u32)))
-uint32x4_t __arm_vrshlq_m_n(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u8)))
-uint8x16_t __arm_vrshlq_m_n_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u8)))
-uint8x16_t __arm_vrshlq_m_n(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s16)))
-int16x8_t __arm_vrshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s16)))
-int16x8_t __arm_vrshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s32)))
-int32x4_t __arm_vrshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s32)))
-int32x4_t __arm_vrshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s8)))
-int8x16_t __arm_vrshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s8)))
-int8x16_t __arm_vrshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u16)))
-uint16x8_t __arm_vrshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u16)))
-uint16x8_t __arm_vrshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u32)))
-uint32x4_t __arm_vrshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u32)))
-uint32x4_t __arm_vrshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u8)))
-uint8x16_t __arm_vrshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u8)))
-uint8x16_t __arm_vrshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s16)))
-int16x8_t __arm_vrshlq_n_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s16)))
-int16x8_t __arm_vrshlq(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s32)))
-int32x4_t __arm_vrshlq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s32)))
-int32x4_t __arm_vrshlq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s8)))
-int8x16_t __arm_vrshlq_n_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s8)))
-int8x16_t __arm_vrshlq(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u16)))
-uint16x8_t __arm_vrshlq_n_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u16)))
-uint16x8_t __arm_vrshlq(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u32)))
-uint32x4_t __arm_vrshlq_n_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u32)))
-uint32x4_t __arm_vrshlq(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u8)))
-uint8x16_t __arm_vrshlq_n_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u8)))
-uint8x16_t __arm_vrshlq(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s16)))
-int16x8_t __arm_vrshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s16)))
-int16x8_t __arm_vrshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s32)))
-int32x4_t __arm_vrshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s32)))
-int32x4_t __arm_vrshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s8)))
-int8x16_t __arm_vrshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s8)))
-int8x16_t __arm_vrshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u16)))
-uint16x8_t __arm_vrshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u16)))
-uint16x8_t __arm_vrshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u32)))
-uint32x4_t __arm_vrshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u32)))
-uint32x4_t __arm_vrshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u8)))
-uint8x16_t __arm_vrshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u8)))
-uint8x16_t __arm_vrshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s16)))
-int16x8_t __arm_vrshlq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s16)))
-int16x8_t __arm_vrshlq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s32)))
-int32x4_t __arm_vrshlq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s32)))
-int32x4_t __arm_vrshlq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s8)))
-int8x16_t __arm_vrshlq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s8)))
-int8x16_t __arm_vrshlq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u16)))
-uint16x8_t __arm_vrshlq_x_u16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u16)))
-uint16x8_t __arm_vrshlq_x(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u32)))
-uint32x4_t __arm_vrshlq_x_u32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u32)))
-uint32x4_t __arm_vrshlq_x(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u8)))
-uint8x16_t __arm_vrshlq_x_u8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u8)))
-uint8x16_t __arm_vrshlq_x(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s16)))
-int8x16_t __arm_vrshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s16)))
-int8x16_t __arm_vrshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s32)))
-int16x8_t __arm_vrshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s32)))
-int16x8_t __arm_vrshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u16)))
-uint8x16_t __arm_vrshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u16)))
-uint8x16_t __arm_vrshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u32)))
-uint16x8_t __arm_vrshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u32)))
-uint16x8_t __arm_vrshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s16)))
-int8x16_t __arm_vrshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s16)))
-int8x16_t __arm_vrshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s32)))
-int16x8_t __arm_vrshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s32)))
-int16x8_t __arm_vrshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u16)))
-uint8x16_t __arm_vrshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u16)))
-uint8x16_t __arm_vrshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u32)))
-uint16x8_t __arm_vrshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u32)))
-uint16x8_t __arm_vrshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s16)))
-int8x16_t __arm_vrshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s16)))
-int8x16_t __arm_vrshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s32)))
-int16x8_t __arm_vrshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s32)))
-int16x8_t __arm_vrshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u16)))
-uint8x16_t __arm_vrshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u16)))
-uint8x16_t __arm_vrshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u32)))
-uint16x8_t __arm_vrshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u32)))
-uint16x8_t __arm_vrshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s16)))
-int8x16_t __arm_vrshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s16)))
-int8x16_t __arm_vrshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s32)))
-int16x8_t __arm_vrshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s32)))
-int16x8_t __arm_vrshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u16)))
-uint8x16_t __arm_vrshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u16)))
-uint8x16_t __arm_vrshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u32)))
-uint16x8_t __arm_vrshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u32)))
-uint16x8_t __arm_vrshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s16)))
-int16x8_t __arm_vrshrq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s16)))
-int16x8_t __arm_vrshrq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s32)))
-int32x4_t __arm_vrshrq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s32)))
-int32x4_t __arm_vrshrq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s8)))
-int8x16_t __arm_vrshrq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s8)))
-int8x16_t __arm_vrshrq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u16)))
-uint16x8_t __arm_vrshrq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u16)))
-uint16x8_t __arm_vrshrq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u32)))
-uint32x4_t __arm_vrshrq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u32)))
-uint32x4_t __arm_vrshrq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u8)))
-uint8x16_t __arm_vrshrq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u8)))
-uint8x16_t __arm_vrshrq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s16)))
-int16x8_t __arm_vrshrq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s16)))
-int16x8_t __arm_vrshrq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s32)))
-int32x4_t __arm_vrshrq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s32)))
-int32x4_t __arm_vrshrq(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s8)))
-int8x16_t __arm_vrshrq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s8)))
-int8x16_t __arm_vrshrq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u16)))
-uint16x8_t __arm_vrshrq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u16)))
-uint16x8_t __arm_vrshrq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u32)))
-uint32x4_t __arm_vrshrq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u32)))
-uint32x4_t __arm_vrshrq(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u8)))
-uint8x16_t __arm_vrshrq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u8)))
-uint8x16_t __arm_vrshrq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s16)))
-int16x8_t __arm_vrshrq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s16)))
-int16x8_t __arm_vrshrq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s32)))
-int32x4_t __arm_vrshrq_x_n_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s32)))
-int32x4_t __arm_vrshrq_x(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s8)))
-int8x16_t __arm_vrshrq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s8)))
-int8x16_t __arm_vrshrq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u16)))
-uint16x8_t __arm_vrshrq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u16)))
-uint16x8_t __arm_vrshrq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u32)))
-uint32x4_t __arm_vrshrq_x_n_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u32)))
-uint32x4_t __arm_vrshrq_x(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u8)))
-uint8x16_t __arm_vrshrq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u8)))
-uint8x16_t __arm_vrshrq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_s32)))
-int32x4_t __arm_vsbciq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_s32)))
-int32x4_t __arm_vsbciq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_u32)))
-uint32x4_t __arm_vsbciq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_u32)))
-uint32x4_t __arm_vsbciq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_s32)))
-int32x4_t __arm_vsbciq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_s32)))
-int32x4_t __arm_vsbciq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_u32)))
-uint32x4_t __arm_vsbciq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_u32)))
-uint32x4_t __arm_vsbciq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_s32)))
-int32x4_t __arm_vsbcq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_s32)))
-int32x4_t __arm_vsbcq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_u32)))
-uint32x4_t __arm_vsbcq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_u32)))
-uint32x4_t __arm_vsbcq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_s32)))
-int32x4_t __arm_vsbcq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_s32)))
-int32x4_t __arm_vsbcq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_u32)))
-uint32x4_t __arm_vsbcq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_u32)))
-uint32x4_t __arm_vsbcq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s16)))
-int16x8_t __arm_vsetq_lane_s16(int16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s16)))
-int16x8_t __arm_vsetq_lane(int16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s32)))
-int32x4_t __arm_vsetq_lane_s32(int32_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s32)))
-int32x4_t __arm_vsetq_lane(int32_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s64)))
-int64x2_t __arm_vsetq_lane_s64(int64_t, int64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s64)))
-int64x2_t __arm_vsetq_lane(int64_t, int64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s8)))
-int8x16_t __arm_vsetq_lane_s8(int8_t, int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s8)))
-int8x16_t __arm_vsetq_lane(int8_t, int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u16)))
-uint16x8_t __arm_vsetq_lane_u16(uint16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u16)))
-uint16x8_t __arm_vsetq_lane(uint16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u32)))
-uint32x4_t __arm_vsetq_lane_u32(uint32_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u32)))
-uint32x4_t __arm_vsetq_lane(uint32_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u64)))
-uint64x2_t __arm_vsetq_lane_u64(uint64_t, uint64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u64)))
-uint64x2_t __arm_vsetq_lane(uint64_t, uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u8)))
-uint8x16_t __arm_vsetq_lane_u8(uint8_t, uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u8)))
-uint8x16_t __arm_vsetq_lane(uint8_t, uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s16)))
-int16x8_t __arm_vshlcq_m_s16(int16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s16)))
-int16x8_t __arm_vshlcq_m(int16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s32)))
-int32x4_t __arm_vshlcq_m_s32(int32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s32)))
-int32x4_t __arm_vshlcq_m(int32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s8)))
-int8x16_t __arm_vshlcq_m_s8(int8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s8)))
-int8x16_t __arm_vshlcq_m(int8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u16)))
-uint16x8_t __arm_vshlcq_m_u16(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u16)))
-uint16x8_t __arm_vshlcq_m(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u32)))
-uint32x4_t __arm_vshlcq_m_u32(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u32)))
-uint32x4_t __arm_vshlcq_m(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u8)))
-uint8x16_t __arm_vshlcq_m_u8(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u8)))
-uint8x16_t __arm_vshlcq_m(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s16)))
-int16x8_t __arm_vshlcq_s16(int16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s16)))
-int16x8_t __arm_vshlcq(int16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s32)))
-int32x4_t __arm_vshlcq_s32(int32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s32)))
-int32x4_t __arm_vshlcq(int32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s8)))
-int8x16_t __arm_vshlcq_s8(int8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s8)))
-int8x16_t __arm_vshlcq(int8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u16)))
-uint16x8_t __arm_vshlcq_u16(uint16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u16)))
-uint16x8_t __arm_vshlcq(uint16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u32)))
-uint32x4_t __arm_vshlcq_u32(uint32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u32)))
-uint32x4_t __arm_vshlcq(uint32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u8)))
-uint8x16_t __arm_vshlcq_u8(uint8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u8)))
-uint8x16_t __arm_vshlcq(uint8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s16)))
-int32x4_t __arm_vshllbq_m_n_s16(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s16)))
-int32x4_t __arm_vshllbq_m(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s8)))
-int16x8_t __arm_vshllbq_m_n_s8(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s8)))
-int16x8_t __arm_vshllbq_m(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u16)))
-uint32x4_t __arm_vshllbq_m_n_u16(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u16)))
-uint32x4_t __arm_vshllbq_m(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u8)))
-uint16x8_t __arm_vshllbq_m_n_u8(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u8)))
-uint16x8_t __arm_vshllbq_m(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s16)))
-int32x4_t __arm_vshllbq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s16)))
-int32x4_t __arm_vshllbq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s8)))
-int16x8_t __arm_vshllbq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s8)))
-int16x8_t __arm_vshllbq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u16)))
-uint32x4_t __arm_vshllbq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u16)))
-uint32x4_t __arm_vshllbq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u8)))
-uint16x8_t __arm_vshllbq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u8)))
-uint16x8_t __arm_vshllbq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s16)))
-int32x4_t __arm_vshllbq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s16)))
-int32x4_t __arm_vshllbq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s8)))
-int16x8_t __arm_vshllbq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s8)))
-int16x8_t __arm_vshllbq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u16)))
-uint32x4_t __arm_vshllbq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u16)))
-uint32x4_t __arm_vshllbq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u8)))
-uint16x8_t __arm_vshllbq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u8)))
-uint16x8_t __arm_vshllbq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s16)))
-int32x4_t __arm_vshlltq_m_n_s16(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s16)))
-int32x4_t __arm_vshlltq_m(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s8)))
-int16x8_t __arm_vshlltq_m_n_s8(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s8)))
-int16x8_t __arm_vshlltq_m(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u16)))
-uint32x4_t __arm_vshlltq_m_n_u16(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u16)))
-uint32x4_t __arm_vshlltq_m(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u8)))
-uint16x8_t __arm_vshlltq_m_n_u8(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u8)))
-uint16x8_t __arm_vshlltq_m(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s16)))
-int32x4_t __arm_vshlltq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s16)))
-int32x4_t __arm_vshlltq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s8)))
-int16x8_t __arm_vshlltq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s8)))
-int16x8_t __arm_vshlltq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u16)))
-uint32x4_t __arm_vshlltq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u16)))
-uint32x4_t __arm_vshlltq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u8)))
-uint16x8_t __arm_vshlltq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u8)))
-uint16x8_t __arm_vshlltq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s16)))
-int32x4_t __arm_vshlltq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s16)))
-int32x4_t __arm_vshlltq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s8)))
-int16x8_t __arm_vshlltq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s8)))
-int16x8_t __arm_vshlltq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u16)))
-uint32x4_t __arm_vshlltq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u16)))
-uint32x4_t __arm_vshlltq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u8)))
-uint16x8_t __arm_vshlltq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u8)))
-uint16x8_t __arm_vshlltq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s16)))
-int16x8_t __arm_vshlq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s16)))
-int16x8_t __arm_vshlq_m_n(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s32)))
-int32x4_t __arm_vshlq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s32)))
-int32x4_t __arm_vshlq_m_n(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s8)))
-int8x16_t __arm_vshlq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s8)))
-int8x16_t __arm_vshlq_m_n(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u16)))
-uint16x8_t __arm_vshlq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u16)))
-uint16x8_t __arm_vshlq_m_n(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u32)))
-uint32x4_t __arm_vshlq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u32)))
-uint32x4_t __arm_vshlq_m_n(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u8)))
-uint8x16_t __arm_vshlq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u8)))
-uint8x16_t __arm_vshlq_m_n(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s16)))
-int16x8_t __arm_vshlq_m_r_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s16)))
-int16x8_t __arm_vshlq_m_r(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s32)))
-int32x4_t __arm_vshlq_m_r_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s32)))
-int32x4_t __arm_vshlq_m_r(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s8)))
-int8x16_t __arm_vshlq_m_r_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s8)))
-int8x16_t __arm_vshlq_m_r(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u16)))
-uint16x8_t __arm_vshlq_m_r_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u16)))
-uint16x8_t __arm_vshlq_m_r(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u32)))
-uint32x4_t __arm_vshlq_m_r_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u32)))
-uint32x4_t __arm_vshlq_m_r(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u8)))
-uint8x16_t __arm_vshlq_m_r_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u8)))
-uint8x16_t __arm_vshlq_m_r(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s16)))
-int16x8_t __arm_vshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s16)))
-int16x8_t __arm_vshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s32)))
-int32x4_t __arm_vshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s32)))
-int32x4_t __arm_vshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s8)))
-int8x16_t __arm_vshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s8)))
-int8x16_t __arm_vshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u16)))
-uint16x8_t __arm_vshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u16)))
-uint16x8_t __arm_vshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u32)))
-uint32x4_t __arm_vshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u32)))
-uint32x4_t __arm_vshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u8)))
-uint8x16_t __arm_vshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u8)))
-uint8x16_t __arm_vshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s16)))
-int16x8_t __arm_vshlq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s16)))
-int16x8_t __arm_vshlq_n(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s32)))
-int32x4_t __arm_vshlq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s32)))
-int32x4_t __arm_vshlq_n(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s8)))
-int8x16_t __arm_vshlq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s8)))
-int8x16_t __arm_vshlq_n(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u16)))
-uint16x8_t __arm_vshlq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u16)))
-uint16x8_t __arm_vshlq_n(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u32)))
-uint32x4_t __arm_vshlq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u32)))
-uint32x4_t __arm_vshlq_n(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u8)))
-uint8x16_t __arm_vshlq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u8)))
-uint8x16_t __arm_vshlq_n(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s16)))
-int16x8_t __arm_vshlq_r_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s16)))
-int16x8_t __arm_vshlq_r(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s32)))
-int32x4_t __arm_vshlq_r_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s32)))
-int32x4_t __arm_vshlq_r(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s8)))
-int8x16_t __arm_vshlq_r_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s8)))
-int8x16_t __arm_vshlq_r(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u16)))
-uint16x8_t __arm_vshlq_r_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u16)))
-uint16x8_t __arm_vshlq_r(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u32)))
-uint32x4_t __arm_vshlq_r_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u32)))
-uint32x4_t __arm_vshlq_r(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u8)))
-uint8x16_t __arm_vshlq_r_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u8)))
-uint8x16_t __arm_vshlq_r(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s16)))
-int16x8_t __arm_vshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s16)))
-int16x8_t __arm_vshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s32)))
-int32x4_t __arm_vshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s32)))
-int32x4_t __arm_vshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s8)))
-int8x16_t __arm_vshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s8)))
-int8x16_t __arm_vshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u16)))
-uint16x8_t __arm_vshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u16)))
-uint16x8_t __arm_vshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u32)))
-uint32x4_t __arm_vshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u32)))
-uint32x4_t __arm_vshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u8)))
-uint8x16_t __arm_vshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u8)))
-uint8x16_t __arm_vshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s16)))
-int16x8_t __arm_vshlq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s16)))
-int16x8_t __arm_vshlq_x_n(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s32)))
-int32x4_t __arm_vshlq_x_n_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s32)))
-int32x4_t __arm_vshlq_x_n(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s8)))
-int8x16_t __arm_vshlq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s8)))
-int8x16_t __arm_vshlq_x_n(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u16)))
-uint16x8_t __arm_vshlq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u16)))
-uint16x8_t __arm_vshlq_x_n(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u32)))
-uint32x4_t __arm_vshlq_x_n_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u32)))
-uint32x4_t __arm_vshlq_x_n(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u8)))
-uint8x16_t __arm_vshlq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u8)))
-uint8x16_t __arm_vshlq_x_n(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s16)))
-int16x8_t __arm_vshlq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s16)))
-int16x8_t __arm_vshlq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s32)))
-int32x4_t __arm_vshlq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s32)))
-int32x4_t __arm_vshlq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s8)))
-int8x16_t __arm_vshlq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s8)))
-int8x16_t __arm_vshlq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u16)))
-uint16x8_t __arm_vshlq_x_u16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u16)))
-uint16x8_t __arm_vshlq_x(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u32)))
-uint32x4_t __arm_vshlq_x_u32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u32)))
-uint32x4_t __arm_vshlq_x(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u8)))
-uint8x16_t __arm_vshlq_x_u8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u8)))
-uint8x16_t __arm_vshlq_x(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s16)))
-int8x16_t __arm_vshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s16)))
-int8x16_t __arm_vshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s32)))
-int16x8_t __arm_vshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s32)))
-int16x8_t __arm_vshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u16)))
-uint8x16_t __arm_vshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u16)))
-uint8x16_t __arm_vshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u32)))
-uint16x8_t __arm_vshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u32)))
-uint16x8_t __arm_vshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s16)))
-int8x16_t __arm_vshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s16)))
-int8x16_t __arm_vshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s32)))
-int16x8_t __arm_vshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s32)))
-int16x8_t __arm_vshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u16)))
-uint8x16_t __arm_vshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u16)))
-uint8x16_t __arm_vshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u32)))
-uint16x8_t __arm_vshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u32)))
-uint16x8_t __arm_vshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s16)))
-int8x16_t __arm_vshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s16)))
-int8x16_t __arm_vshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s32)))
-int16x8_t __arm_vshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s32)))
-int16x8_t __arm_vshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u16)))
-uint8x16_t __arm_vshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u16)))
-uint8x16_t __arm_vshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u32)))
-uint16x8_t __arm_vshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u32)))
-uint16x8_t __arm_vshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s16)))
-int8x16_t __arm_vshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s16)))
-int8x16_t __arm_vshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s32)))
-int16x8_t __arm_vshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s32)))
-int16x8_t __arm_vshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u16)))
-uint8x16_t __arm_vshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u16)))
-uint8x16_t __arm_vshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u32)))
-uint16x8_t __arm_vshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u32)))
-uint16x8_t __arm_vshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s16)))
-int16x8_t __arm_vshrq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s16)))
-int16x8_t __arm_vshrq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s32)))
-int32x4_t __arm_vshrq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s32)))
-int32x4_t __arm_vshrq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s8)))
-int8x16_t __arm_vshrq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s8)))
-int8x16_t __arm_vshrq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u16)))
-uint16x8_t __arm_vshrq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u16)))
-uint16x8_t __arm_vshrq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u32)))
-uint32x4_t __arm_vshrq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u32)))
-uint32x4_t __arm_vshrq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u8)))
-uint8x16_t __arm_vshrq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u8)))
-uint8x16_t __arm_vshrq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s16)))
-int16x8_t __arm_vshrq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s16)))
-int16x8_t __arm_vshrq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s32)))
-int32x4_t __arm_vshrq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s32)))
-int32x4_t __arm_vshrq(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s8)))
-int8x16_t __arm_vshrq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s8)))
-int8x16_t __arm_vshrq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u16)))
-uint16x8_t __arm_vshrq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u16)))
-uint16x8_t __arm_vshrq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u32)))
-uint32x4_t __arm_vshrq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u32)))
-uint32x4_t __arm_vshrq(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u8)))
-uint8x16_t __arm_vshrq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u8)))
-uint8x16_t __arm_vshrq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s16)))
-int16x8_t __arm_vshrq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s16)))
-int16x8_t __arm_vshrq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s32)))
-int32x4_t __arm_vshrq_x_n_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s32)))
-int32x4_t __arm_vshrq_x(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s8)))
-int8x16_t __arm_vshrq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s8)))
-int8x16_t __arm_vshrq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u16)))
-uint16x8_t __arm_vshrq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u16)))
-uint16x8_t __arm_vshrq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u32)))
-uint32x4_t __arm_vshrq_x_n_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u32)))
-uint32x4_t __arm_vshrq_x(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u8)))
-uint8x16_t __arm_vshrq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u8)))
-uint8x16_t __arm_vshrq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s16)))
-int16x8_t __arm_vsliq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s16)))
-int16x8_t __arm_vsliq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s32)))
-int32x4_t __arm_vsliq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s32)))
-int32x4_t __arm_vsliq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s8)))
-int8x16_t __arm_vsliq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s8)))
-int8x16_t __arm_vsliq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u16)))
-uint16x8_t __arm_vsliq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u16)))
-uint16x8_t __arm_vsliq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u32)))
-uint32x4_t __arm_vsliq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u32)))
-uint32x4_t __arm_vsliq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u8)))
-uint8x16_t __arm_vsliq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u8)))
-uint8x16_t __arm_vsliq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s16)))
-int16x8_t __arm_vsliq_n_s16(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s16)))
-int16x8_t __arm_vsliq(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s32)))
-int32x4_t __arm_vsliq_n_s32(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s32)))
-int32x4_t __arm_vsliq(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s8)))
-int8x16_t __arm_vsliq_n_s8(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s8)))
-int8x16_t __arm_vsliq(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u16)))
-uint16x8_t __arm_vsliq_n_u16(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u16)))
-uint16x8_t __arm_vsliq(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u32)))
-uint32x4_t __arm_vsliq_n_u32(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u32)))
-uint32x4_t __arm_vsliq(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u8)))
-uint8x16_t __arm_vsliq_n_u8(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u8)))
-uint8x16_t __arm_vsliq(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s16)))
-int16x8_t __arm_vsriq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s16)))
-int16x8_t __arm_vsriq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s32)))
-int32x4_t __arm_vsriq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s32)))
-int32x4_t __arm_vsriq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s8)))
-int8x16_t __arm_vsriq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s8)))
-int8x16_t __arm_vsriq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u16)))
-uint16x8_t __arm_vsriq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u16)))
-uint16x8_t __arm_vsriq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u32)))
-uint32x4_t __arm_vsriq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u32)))
-uint32x4_t __arm_vsriq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u8)))
-uint8x16_t __arm_vsriq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u8)))
-uint8x16_t __arm_vsriq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s16)))
-int16x8_t __arm_vsriq_n_s16(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s16)))
-int16x8_t __arm_vsriq(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s32)))
-int32x4_t __arm_vsriq_n_s32(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s32)))
-int32x4_t __arm_vsriq(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s8)))
-int8x16_t __arm_vsriq_n_s8(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s8)))
-int8x16_t __arm_vsriq(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u16)))
-uint16x8_t __arm_vsriq_n_u16(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u16)))
-uint16x8_t __arm_vsriq(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u32)))
-uint32x4_t __arm_vsriq_n_u32(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u32)))
-uint32x4_t __arm_vsriq(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u8)))
-uint8x16_t __arm_vsriq_n_u8(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u8)))
-uint8x16_t __arm_vsriq(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s16)))
-void __arm_vst1q_p_s16(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s16)))
-void __arm_vst1q_p(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s32)))
-void __arm_vst1q_p_s32(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s32)))
-void __arm_vst1q_p(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s8)))
-void __arm_vst1q_p_s8(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s8)))
-void __arm_vst1q_p(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u16)))
-void __arm_vst1q_p_u16(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u16)))
-void __arm_vst1q_p(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u32)))
-void __arm_vst1q_p_u32(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u32)))
-void __arm_vst1q_p(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u8)))
-void __arm_vst1q_p_u8(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u8)))
-void __arm_vst1q_p(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s16)))
-void __arm_vst1q_s16(int16_t *, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s16)))
-void __arm_vst1q(int16_t *, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s32)))
-void __arm_vst1q_s32(int32_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s32)))
-void __arm_vst1q(int32_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s8)))
-void __arm_vst1q_s8(int8_t *, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s8)))
-void __arm_vst1q(int8_t *, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u16)))
-void __arm_vst1q_u16(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u16)))
-void __arm_vst1q(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u32)))
-void __arm_vst1q_u32(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u32)))
-void __arm_vst1q(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u8)))
-void __arm_vst1q_u8(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u8)))
-void __arm_vst1q(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s16)))
-void __arm_vst2q_s16(int16_t *, int16x8x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s16)))
-void __arm_vst2q(int16_t *, int16x8x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s32)))
-void __arm_vst2q_s32(int32_t *, int32x4x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s32)))
-void __arm_vst2q(int32_t *, int32x4x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s8)))
-void __arm_vst2q_s8(int8_t *, int8x16x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s8)))
-void __arm_vst2q(int8_t *, int8x16x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u16)))
-void __arm_vst2q_u16(uint16_t *, uint16x8x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u16)))
-void __arm_vst2q(uint16_t *, uint16x8x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u32)))
-void __arm_vst2q_u32(uint32_t *, uint32x4x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u32)))
-void __arm_vst2q(uint32_t *, uint32x4x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u8)))
-void __arm_vst2q_u8(uint8_t *, uint8x16x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u8)))
-void __arm_vst2q(uint8_t *, uint8x16x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s16)))
-void __arm_vst4q_s16(int16_t *, int16x8x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s16)))
-void __arm_vst4q(int16_t *, int16x8x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s32)))
-void __arm_vst4q_s32(int32_t *, int32x4x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s32)))
-void __arm_vst4q(int32_t *, int32x4x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s8)))
-void __arm_vst4q_s8(int8_t *, int8x16x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s8)))
-void __arm_vst4q(int8_t *, int8x16x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u16)))
-void __arm_vst4q_u16(uint16_t *, uint16x8x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u16)))
-void __arm_vst4q(uint16_t *, uint16x8x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u32)))
-void __arm_vst4q_u32(uint32_t *, uint32x4x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u32)))
-void __arm_vst4q(uint32_t *, uint32x4x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u8)))
-void __arm_vst4q_u8(uint8_t *, uint8x16x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u8)))
-void __arm_vst4q(uint8_t *, uint8x16x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s16)))
-void __arm_vstrbq_p_s16(int8_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s16)))
-void __arm_vstrbq_p(int8_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s32)))
-void __arm_vstrbq_p_s32(int8_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s32)))
-void __arm_vstrbq_p(int8_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s8)))
-void __arm_vstrbq_p_s8(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s8)))
-void __arm_vstrbq_p(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u16)))
-void __arm_vstrbq_p_u16(uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u16)))
-void __arm_vstrbq_p(uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u32)))
-void __arm_vstrbq_p_u32(uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u32)))
-void __arm_vstrbq_p(uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u8)))
-void __arm_vstrbq_p_u8(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u8)))
-void __arm_vstrbq_p(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s16)))
-void __arm_vstrbq_s16(int8_t *, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s16)))
-void __arm_vstrbq(int8_t *, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s32)))
-void __arm_vstrbq_s32(int8_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s32)))
-void __arm_vstrbq(int8_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s8)))
-void __arm_vstrbq_s8(int8_t *, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s8)))
-void __arm_vstrbq(int8_t *, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s16)))
-void __arm_vstrbq_scatter_offset_p_s16(int8_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s16)))
-void __arm_vstrbq_scatter_offset_p(int8_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s32)))
-void __arm_vstrbq_scatter_offset_p_s32(int8_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s32)))
-void __arm_vstrbq_scatter_offset_p(int8_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s8)))
-void __arm_vstrbq_scatter_offset_p_s8(int8_t *, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s8)))
-void __arm_vstrbq_scatter_offset_p(int8_t *, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u16)))
-void __arm_vstrbq_scatter_offset_p_u16(uint8_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u16)))
-void __arm_vstrbq_scatter_offset_p(uint8_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u32)))
-void __arm_vstrbq_scatter_offset_p_u32(uint8_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u32)))
-void __arm_vstrbq_scatter_offset_p(uint8_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u8)))
-void __arm_vstrbq_scatter_offset_p_u8(uint8_t *, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u8)))
-void __arm_vstrbq_scatter_offset_p(uint8_t *, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s16)))
-void __arm_vstrbq_scatter_offset_s16(int8_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s16)))
-void __arm_vstrbq_scatter_offset(int8_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s32)))
-void __arm_vstrbq_scatter_offset_s32(int8_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s32)))
-void __arm_vstrbq_scatter_offset(int8_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s8)))
-void __arm_vstrbq_scatter_offset_s8(int8_t *, uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s8)))
-void __arm_vstrbq_scatter_offset(int8_t *, uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u16)))
-void __arm_vstrbq_scatter_offset_u16(uint8_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u16)))
-void __arm_vstrbq_scatter_offset(uint8_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u32)))
-void __arm_vstrbq_scatter_offset_u32(uint8_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u32)))
-void __arm_vstrbq_scatter_offset(uint8_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u8)))
-void __arm_vstrbq_scatter_offset_u8(uint8_t *, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u8)))
-void __arm_vstrbq_scatter_offset(uint8_t *, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u16)))
-void __arm_vstrbq_u16(uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u16)))
-void __arm_vstrbq(uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u32)))
-void __arm_vstrbq_u32(uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u32)))
-void __arm_vstrbq(uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u8)))
-void __arm_vstrbq_u8(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u8)))
-void __arm_vstrbq(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_s64)))
-void __arm_vstrdq_scatter_base_p_s64(uint64x2_t, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_s64)))
-void __arm_vstrdq_scatter_base_p(uint64x2_t, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_u64)))
-void __arm_vstrdq_scatter_base_p_u64(uint64x2_t, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_u64)))
-void __arm_vstrdq_scatter_base_p(uint64x2_t, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_s64)))
-void __arm_vstrdq_scatter_base_s64(uint64x2_t, int, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_s64)))
-void __arm_vstrdq_scatter_base(uint64x2_t, int, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_u64)))
-void __arm_vstrdq_scatter_base_u64(uint64x2_t, int, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_u64)))
-void __arm_vstrdq_scatter_base(uint64x2_t, int, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_s64)))
-void __arm_vstrdq_scatter_base_wb_p_s64(uint64x2_t *, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_s64)))
-void __arm_vstrdq_scatter_base_wb_p(uint64x2_t *, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_u64)))
-void __arm_vstrdq_scatter_base_wb_p_u64(uint64x2_t *, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_u64)))
-void __arm_vstrdq_scatter_base_wb_p(uint64x2_t *, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_s64)))
-void __arm_vstrdq_scatter_base_wb_s64(uint64x2_t *, int, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_s64)))
-void __arm_vstrdq_scatter_base_wb(uint64x2_t *, int, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_u64)))
-void __arm_vstrdq_scatter_base_wb_u64(uint64x2_t *, int, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_u64)))
-void __arm_vstrdq_scatter_base_wb(uint64x2_t *, int, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_s64)))
-void __arm_vstrdq_scatter_offset_p_s64(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_s64)))
-void __arm_vstrdq_scatter_offset_p(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_u64)))
-void __arm_vstrdq_scatter_offset_p_u64(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_u64)))
-void __arm_vstrdq_scatter_offset_p(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_s64)))
-void __arm_vstrdq_scatter_offset_s64(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_s64)))
-void __arm_vstrdq_scatter_offset(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_u64)))
-void __arm_vstrdq_scatter_offset_u64(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_u64)))
-void __arm_vstrdq_scatter_offset(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_s64)))
-void __arm_vstrdq_scatter_shifted_offset_p_s64(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_s64)))
-void __arm_vstrdq_scatter_shifted_offset_p(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_u64)))
-void __arm_vstrdq_scatter_shifted_offset_p_u64(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_u64)))
-void __arm_vstrdq_scatter_shifted_offset_p(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_s64)))
-void __arm_vstrdq_scatter_shifted_offset_s64(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_s64)))
-void __arm_vstrdq_scatter_shifted_offset(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_u64)))
-void __arm_vstrdq_scatter_shifted_offset_u64(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_u64)))
-void __arm_vstrdq_scatter_shifted_offset(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s16)))
-void __arm_vstrhq_p_s16(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s16)))
-void __arm_vstrhq_p(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s32)))
-void __arm_vstrhq_p_s32(int16_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s32)))
-void __arm_vstrhq_p(int16_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u16)))
-void __arm_vstrhq_p_u16(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u16)))
-void __arm_vstrhq_p(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u32)))
-void __arm_vstrhq_p_u32(uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u32)))
-void __arm_vstrhq_p(uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s16)))
-void __arm_vstrhq_s16(int16_t *, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s16)))
-void __arm_vstrhq(int16_t *, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s32)))
-void __arm_vstrhq_s32(int16_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s32)))
-void __arm_vstrhq(int16_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s16)))
-void __arm_vstrhq_scatter_offset_p_s16(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s16)))
-void __arm_vstrhq_scatter_offset_p(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s32)))
-void __arm_vstrhq_scatter_offset_p_s32(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s32)))
-void __arm_vstrhq_scatter_offset_p(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u16)))
-void __arm_vstrhq_scatter_offset_p_u16(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u16)))
-void __arm_vstrhq_scatter_offset_p(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u32)))
-void __arm_vstrhq_scatter_offset_p_u32(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u32)))
-void __arm_vstrhq_scatter_offset_p(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s16)))
-void __arm_vstrhq_scatter_offset_s16(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s16)))
-void __arm_vstrhq_scatter_offset(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s32)))
-void __arm_vstrhq_scatter_offset_s32(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s32)))
-void __arm_vstrhq_scatter_offset(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u16)))
-void __arm_vstrhq_scatter_offset_u16(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u16)))
-void __arm_vstrhq_scatter_offset(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u32)))
-void __arm_vstrhq_scatter_offset_u32(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u32)))
-void __arm_vstrhq_scatter_offset(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s16)))
-void __arm_vstrhq_scatter_shifted_offset_p_s16(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s16)))
-void __arm_vstrhq_scatter_shifted_offset_p(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s32)))
-void __arm_vstrhq_scatter_shifted_offset_p_s32(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s32)))
-void __arm_vstrhq_scatter_shifted_offset_p(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u16)))
-void __arm_vstrhq_scatter_shifted_offset_p_u16(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u16)))
-void __arm_vstrhq_scatter_shifted_offset_p(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u32)))
-void __arm_vstrhq_scatter_shifted_offset_p_u32(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u32)))
-void __arm_vstrhq_scatter_shifted_offset_p(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s16)))
-void __arm_vstrhq_scatter_shifted_offset_s16(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s16)))
-void __arm_vstrhq_scatter_shifted_offset(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s32)))
-void __arm_vstrhq_scatter_shifted_offset_s32(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s32)))
-void __arm_vstrhq_scatter_shifted_offset(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u16)))
-void __arm_vstrhq_scatter_shifted_offset_u16(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u16)))
-void __arm_vstrhq_scatter_shifted_offset(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u32)))
-void __arm_vstrhq_scatter_shifted_offset_u32(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u32)))
-void __arm_vstrhq_scatter_shifted_offset(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u16)))
-void __arm_vstrhq_u16(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u16)))
-void __arm_vstrhq(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u32)))
-void __arm_vstrhq_u32(uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u32)))
-void __arm_vstrhq(uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_s32)))
-void __arm_vstrwq_p_s32(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_s32)))
-void __arm_vstrwq_p(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_u32)))
-void __arm_vstrwq_p_u32(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_u32)))
-void __arm_vstrwq_p(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_s32)))
-void __arm_vstrwq_s32(int32_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_s32)))
-void __arm_vstrwq(int32_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_s32)))
-void __arm_vstrwq_scatter_base_p_s32(uint32x4_t, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_s32)))
-void __arm_vstrwq_scatter_base_p(uint32x4_t, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_u32)))
-void __arm_vstrwq_scatter_base_p_u32(uint32x4_t, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_u32)))
-void __arm_vstrwq_scatter_base_p(uint32x4_t, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_s32)))
-void __arm_vstrwq_scatter_base_s32(uint32x4_t, int, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_s32)))
-void __arm_vstrwq_scatter_base(uint32x4_t, int, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_u32)))
-void __arm_vstrwq_scatter_base_u32(uint32x4_t, int, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_u32)))
-void __arm_vstrwq_scatter_base(uint32x4_t, int, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_s32)))
-void __arm_vstrwq_scatter_base_wb_p_s32(uint32x4_t *, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_s32)))
-void __arm_vstrwq_scatter_base_wb_p(uint32x4_t *, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_u32)))
-void __arm_vstrwq_scatter_base_wb_p_u32(uint32x4_t *, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_u32)))
-void __arm_vstrwq_scatter_base_wb_p(uint32x4_t *, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_s32)))
-void __arm_vstrwq_scatter_base_wb_s32(uint32x4_t *, int, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_s32)))
-void __arm_vstrwq_scatter_base_wb(uint32x4_t *, int, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_u32)))
-void __arm_vstrwq_scatter_base_wb_u32(uint32x4_t *, int, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_u32)))
-void __arm_vstrwq_scatter_base_wb(uint32x4_t *, int, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_s32)))
-void __arm_vstrwq_scatter_offset_p_s32(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_s32)))
-void __arm_vstrwq_scatter_offset_p(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_u32)))
-void __arm_vstrwq_scatter_offset_p_u32(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_u32)))
-void __arm_vstrwq_scatter_offset_p(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_s32)))
-void __arm_vstrwq_scatter_offset_s32(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_s32)))
-void __arm_vstrwq_scatter_offset(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_u32)))
-void __arm_vstrwq_scatter_offset_u32(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_u32)))
-void __arm_vstrwq_scatter_offset(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_s32)))
-void __arm_vstrwq_scatter_shifted_offset_p_s32(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_s32)))
-void __arm_vstrwq_scatter_shifted_offset_p(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_u32)))
-void __arm_vstrwq_scatter_shifted_offset_p_u32(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_u32)))
-void __arm_vstrwq_scatter_shifted_offset_p(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_s32)))
-void __arm_vstrwq_scatter_shifted_offset_s32(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_s32)))
-void __arm_vstrwq_scatter_shifted_offset(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_u32)))
-void __arm_vstrwq_scatter_shifted_offset_u32(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_u32)))
-void __arm_vstrwq_scatter_shifted_offset(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_u32)))
-void __arm_vstrwq_u32(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_u32)))
-void __arm_vstrwq(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s16)))
-int16x8_t __arm_vsubq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s16)))
-int16x8_t __arm_vsubq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s32)))
-int32x4_t __arm_vsubq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s32)))
-int32x4_t __arm_vsubq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s8)))
-int8x16_t __arm_vsubq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s8)))
-int8x16_t __arm_vsubq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u16)))
-uint16x8_t __arm_vsubq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u16)))
-uint16x8_t __arm_vsubq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u32)))
-uint32x4_t __arm_vsubq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u32)))
-uint32x4_t __arm_vsubq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u8)))
-uint8x16_t __arm_vsubq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u8)))
-uint8x16_t __arm_vsubq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s16)))
-int16x8_t __arm_vsubq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s16)))
-int16x8_t __arm_vsubq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s32)))
-int32x4_t __arm_vsubq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s32)))
-int32x4_t __arm_vsubq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s8)))
-int8x16_t __arm_vsubq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s8)))
-int8x16_t __arm_vsubq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u16)))
-uint16x8_t __arm_vsubq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u16)))
-uint16x8_t __arm_vsubq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u32)))
-uint32x4_t __arm_vsubq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u32)))
-uint32x4_t __arm_vsubq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u8)))
-uint8x16_t __arm_vsubq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u8)))
-uint8x16_t __arm_vsubq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s16)))
-int16x8_t __arm_vsubq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s16)))
-int16x8_t __arm_vsubq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s32)))
-int32x4_t __arm_vsubq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s32)))
-int32x4_t __arm_vsubq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s8)))
-int8x16_t __arm_vsubq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s8)))
-int8x16_t __arm_vsubq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u16)))
-uint16x8_t __arm_vsubq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u16)))
-uint16x8_t __arm_vsubq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u32)))
-uint32x4_t __arm_vsubq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u32)))
-uint32x4_t __arm_vsubq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u8)))
-uint8x16_t __arm_vsubq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u8)))
-uint8x16_t __arm_vsubq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s16)))
-int16x8_t __arm_vsubq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s16)))
-int16x8_t __arm_vsubq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s32)))
-int32x4_t __arm_vsubq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s32)))
-int32x4_t __arm_vsubq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s8)))
-int8x16_t __arm_vsubq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s8)))
-int8x16_t __arm_vsubq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u16)))
-uint16x8_t __arm_vsubq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u16)))
-uint16x8_t __arm_vsubq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u32)))
-uint32x4_t __arm_vsubq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u32)))
-uint32x4_t __arm_vsubq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u8)))
-uint8x16_t __arm_vsubq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u8)))
-uint8x16_t __arm_vsubq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s16)))
-int16x8_t __arm_vsubq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s16)))
-int16x8_t __arm_vsubq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s32)))
-int32x4_t __arm_vsubq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s32)))
-int32x4_t __arm_vsubq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s8)))
-int8x16_t __arm_vsubq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s8)))
-int8x16_t __arm_vsubq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u16)))
-uint16x8_t __arm_vsubq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u16)))
-uint16x8_t __arm_vsubq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u32)))
-uint32x4_t __arm_vsubq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u32)))
-uint32x4_t __arm_vsubq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u8)))
-uint8x16_t __arm_vsubq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u8)))
-uint8x16_t __arm_vsubq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s16)))
-int16x8_t __arm_vsubq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s16)))
-int16x8_t __arm_vsubq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s32)))
-int32x4_t __arm_vsubq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s32)))
-int32x4_t __arm_vsubq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s8)))
-int8x16_t __arm_vsubq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s8)))
-int8x16_t __arm_vsubq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u16)))
-uint16x8_t __arm_vsubq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u16)))
-uint16x8_t __arm_vsubq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u32)))
-uint32x4_t __arm_vsubq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u32)))
-uint32x4_t __arm_vsubq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u8)))
-uint8x16_t __arm_vsubq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u8)))
-uint8x16_t __arm_vsubq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s16)))
-int16x8_t __arm_vuninitializedq(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s32)))
-int32x4_t __arm_vuninitializedq(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s64)))
-int64x2_t __arm_vuninitializedq(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s8)))
-int8x16_t __arm_vuninitializedq(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u16)))
-uint16x8_t __arm_vuninitializedq(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u32)))
-uint32x4_t __arm_vuninitializedq(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u64)))
-uint64x2_t __arm_vuninitializedq(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u8)))
-uint8x16_t __arm_vuninitializedq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s16)))
-int16x8_t __arm_vuninitializedq_s16();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s32)))
-int32x4_t __arm_vuninitializedq_s32();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s64)))
-int64x2_t __arm_vuninitializedq_s64();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s8)))
-int8x16_t __arm_vuninitializedq_s8();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u16)))
-uint16x8_t __arm_vuninitializedq_u16();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u32)))
-uint32x4_t __arm_vuninitializedq_u32();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u64)))
-uint64x2_t __arm_vuninitializedq_u64();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u8)))
-uint8x16_t __arm_vuninitializedq_u8();
-
-#if (__ARM_FEATURE_MVE & 2)
-
-typedef __fp16 float16_t;
-typedef float float32_t;
-typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
-typedef struct { float16x8_t val[2]; } float16x8x2_t;
-typedef struct { float16x8_t val[4]; } float16x8x4_t;
-typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
-typedef struct { float32x4_t val[2]; } float32x4x2_t;
-typedef struct { float32x4_t val[4]; } float32x4x4_t;
-
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f16)))
-float16x8_t __arm_vabdq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f16)))
-float16x8_t __arm_vabdq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f32)))
-float32x4_t __arm_vabdq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f32)))
-float32x4_t __arm_vabdq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f16)))
-float16x8_t __arm_vabdq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f16)))
-float16x8_t __arm_vabdq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f32)))
-float32x4_t __arm_vabdq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f32)))
-float32x4_t __arm_vabdq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f16)))
-float16x8_t __arm_vabdq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f16)))
-float16x8_t __arm_vabdq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f32)))
-float32x4_t __arm_vabdq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f32)))
-float32x4_t __arm_vabdq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f16)))
-float16x8_t __arm_vabsq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f16)))
-float16x8_t __arm_vabsq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f32)))
-float32x4_t __arm_vabsq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f32)))
-float32x4_t __arm_vabsq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f16)))
-float16x8_t __arm_vabsq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f16)))
-float16x8_t __arm_vabsq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f32)))
-float32x4_t __arm_vabsq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f32)))
-float32x4_t __arm_vabsq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f16)))
-float16x8_t __arm_vabsq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f16)))
-float16x8_t __arm_vabsq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f32)))
-float32x4_t __arm_vabsq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f32)))
-float32x4_t __arm_vabsq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f16)))
-float16x8_t __arm_vaddq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f16)))
-float16x8_t __arm_vaddq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f32)))
-float32x4_t __arm_vaddq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f32)))
-float32x4_t __arm_vaddq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f16)))
-float16x8_t __arm_vaddq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f16)))
-float16x8_t __arm_vaddq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f32)))
-float32x4_t __arm_vaddq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f32)))
-float32x4_t __arm_vaddq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f16)))
-float16x8_t __arm_vaddq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f16)))
-float16x8_t __arm_vaddq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f32)))
-float32x4_t __arm_vaddq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f32)))
-float32x4_t __arm_vaddq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f16)))
-float16x8_t __arm_vaddq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f16)))
-float16x8_t __arm_vaddq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f32)))
-float32x4_t __arm_vaddq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f32)))
-float32x4_t __arm_vaddq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f16)))
-float16x8_t __arm_vaddq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f16)))
-float16x8_t __arm_vaddq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f32)))
-float32x4_t __arm_vaddq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f32)))
-float32x4_t __arm_vaddq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f16)))
-float16x8_t __arm_vaddq_x_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f16)))
-float16x8_t __arm_vaddq_x(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f32)))
-float32x4_t __arm_vaddq_x_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f32)))
-float32x4_t __arm_vaddq_x(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_f16)))
-float16x8_t __arm_vandq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_f16)))
-float16x8_t __arm_vandq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_f32)))
-float32x4_t __arm_vandq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_f32)))
-float32x4_t __arm_vandq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f16)))
-float16x8_t __arm_vandq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f16)))
-float16x8_t __arm_vandq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f32)))
-float32x4_t __arm_vandq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f32)))
-float32x4_t __arm_vandq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f16)))
-float16x8_t __arm_vandq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f16)))
-float16x8_t __arm_vandq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f32)))
-float32x4_t __arm_vandq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f32)))
-float32x4_t __arm_vandq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f16)))
-float16x8_t __arm_vbicq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f16)))
-float16x8_t __arm_vbicq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f32)))
-float32x4_t __arm_vbicq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f32)))
-float32x4_t __arm_vbicq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f16)))
-float16x8_t __arm_vbicq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f16)))
-float16x8_t __arm_vbicq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f32)))
-float32x4_t __arm_vbicq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f32)))
-float32x4_t __arm_vbicq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f16)))
-float16x8_t __arm_vbicq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f16)))
-float16x8_t __arm_vbicq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f32)))
-float32x4_t __arm_vbicq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f32)))
-float32x4_t __arm_vbicq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f16)))
-float16x8_t __arm_vbrsrq_m_n_f16(float16x8_t, float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f16)))
-float16x8_t __arm_vbrsrq_m(float16x8_t, float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f32)))
-float32x4_t __arm_vbrsrq_m_n_f32(float32x4_t, float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f32)))
-float32x4_t __arm_vbrsrq_m(float32x4_t, float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f16)))
-float16x8_t __arm_vbrsrq_n_f16(float16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f16)))
-float16x8_t __arm_vbrsrq(float16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f32)))
-float32x4_t __arm_vbrsrq_n_f32(float32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f32)))
-float32x4_t __arm_vbrsrq(float32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f16)))
-float16x8_t __arm_vbrsrq_x_n_f16(float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f16)))
-float16x8_t __arm_vbrsrq_x(float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f32)))
-float32x4_t __arm_vbrsrq_x_n_f32(float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f32)))
-float32x4_t __arm_vbrsrq_x(float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f16)))
-float16x8_t __arm_vcaddq_rot270_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f16)))
-float16x8_t __arm_vcaddq_rot270(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f32)))
-float32x4_t __arm_vcaddq_rot270_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f32)))
-float32x4_t __arm_vcaddq_rot270(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f16)))
-float16x8_t __arm_vcaddq_rot270_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f16)))
-float16x8_t __arm_vcaddq_rot270_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f32)))
-float32x4_t __arm_vcaddq_rot270_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f32)))
-float32x4_t __arm_vcaddq_rot270_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f16)))
-float16x8_t __arm_vcaddq_rot270_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f16)))
-float16x8_t __arm_vcaddq_rot270_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f32)))
-float32x4_t __arm_vcaddq_rot270_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f32)))
-float32x4_t __arm_vcaddq_rot270_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f16)))
-float16x8_t __arm_vcaddq_rot90_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f16)))
-float16x8_t __arm_vcaddq_rot90(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f32)))
-float32x4_t __arm_vcaddq_rot90_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f32)))
-float32x4_t __arm_vcaddq_rot90(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f16)))
-float16x8_t __arm_vcaddq_rot90_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f16)))
-float16x8_t __arm_vcaddq_rot90_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f32)))
-float32x4_t __arm_vcaddq_rot90_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f32)))
-float32x4_t __arm_vcaddq_rot90_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f16)))
-float16x8_t __arm_vcaddq_rot90_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f16)))
-float16x8_t __arm_vcaddq_rot90_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f32)))
-float32x4_t __arm_vcaddq_rot90_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f32)))
-float32x4_t __arm_vcaddq_rot90_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f16)))
-float16x8_t __arm_vcmlaq_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f16)))
-float16x8_t __arm_vcmlaq(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f32)))
-float32x4_t __arm_vcmlaq_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f32)))
-float32x4_t __arm_vcmlaq(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f16)))
-float16x8_t __arm_vcmlaq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f16)))
-float16x8_t __arm_vcmlaq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f32)))
-float32x4_t __arm_vcmlaq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f32)))
-float32x4_t __arm_vcmlaq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f16)))
-float16x8_t __arm_vcmlaq_rot180_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f16)))
-float16x8_t __arm_vcmlaq_rot180(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f32)))
-float32x4_t __arm_vcmlaq_rot180_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f32)))
-float32x4_t __arm_vcmlaq_rot180(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f16)))
-float16x8_t __arm_vcmlaq_rot180_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f16)))
-float16x8_t __arm_vcmlaq_rot180_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f32)))
-float32x4_t __arm_vcmlaq_rot180_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f32)))
-float32x4_t __arm_vcmlaq_rot180_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f16)))
-float16x8_t __arm_vcmlaq_rot270_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f16)))
-float16x8_t __arm_vcmlaq_rot270(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f32)))
-float32x4_t __arm_vcmlaq_rot270_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f32)))
-float32x4_t __arm_vcmlaq_rot270(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f16)))
-float16x8_t __arm_vcmlaq_rot270_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f16)))
-float16x8_t __arm_vcmlaq_rot270_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f32)))
-float32x4_t __arm_vcmlaq_rot270_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f32)))
-float32x4_t __arm_vcmlaq_rot270_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f16)))
-float16x8_t __arm_vcmlaq_rot90_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f16)))
-float16x8_t __arm_vcmlaq_rot90(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f32)))
-float32x4_t __arm_vcmlaq_rot90_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f32)))
-float32x4_t __arm_vcmlaq_rot90(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f16)))
-float16x8_t __arm_vcmlaq_rot90_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f16)))
-float16x8_t __arm_vcmlaq_rot90_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f32)))
-float32x4_t __arm_vcmlaq_rot90_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f32)))
-float32x4_t __arm_vcmlaq_rot90_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f16)))
-mve_pred16_t __arm_vcmpeqq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f16)))
-mve_pred16_t __arm_vcmpeqq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f32)))
-mve_pred16_t __arm_vcmpeqq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f32)))
-mve_pred16_t __arm_vcmpeqq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f16)))
-mve_pred16_t __arm_vcmpeqq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f16)))
-mve_pred16_t __arm_vcmpeqq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f32)))
-mve_pred16_t __arm_vcmpeqq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f32)))
-mve_pred16_t __arm_vcmpeqq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f16)))
-mve_pred16_t __arm_vcmpeqq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f16)))
-mve_pred16_t __arm_vcmpeqq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f32)))
-mve_pred16_t __arm_vcmpeqq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f32)))
-mve_pred16_t __arm_vcmpeqq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f16)))
-mve_pred16_t __arm_vcmpeqq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f16)))
-mve_pred16_t __arm_vcmpeqq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f32)))
-mve_pred16_t __arm_vcmpeqq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f32)))
-mve_pred16_t __arm_vcmpeqq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f16)))
-mve_pred16_t __arm_vcmpgeq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f16)))
-mve_pred16_t __arm_vcmpgeq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f32)))
-mve_pred16_t __arm_vcmpgeq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f32)))
-mve_pred16_t __arm_vcmpgeq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f16)))
-mve_pred16_t __arm_vcmpgeq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f16)))
-mve_pred16_t __arm_vcmpgeq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f32)))
-mve_pred16_t __arm_vcmpgeq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f32)))
-mve_pred16_t __arm_vcmpgeq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f16)))
-mve_pred16_t __arm_vcmpgeq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f16)))
-mve_pred16_t __arm_vcmpgeq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f32)))
-mve_pred16_t __arm_vcmpgeq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f32)))
-mve_pred16_t __arm_vcmpgeq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f16)))
-mve_pred16_t __arm_vcmpgeq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f16)))
-mve_pred16_t __arm_vcmpgeq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f32)))
-mve_pred16_t __arm_vcmpgeq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f32)))
-mve_pred16_t __arm_vcmpgeq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f16)))
-mve_pred16_t __arm_vcmpgtq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f16)))
-mve_pred16_t __arm_vcmpgtq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f32)))
-mve_pred16_t __arm_vcmpgtq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f32)))
-mve_pred16_t __arm_vcmpgtq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f16)))
-mve_pred16_t __arm_vcmpgtq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f16)))
-mve_pred16_t __arm_vcmpgtq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f32)))
-mve_pred16_t __arm_vcmpgtq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f32)))
-mve_pred16_t __arm_vcmpgtq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f16)))
-mve_pred16_t __arm_vcmpgtq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f16)))
-mve_pred16_t __arm_vcmpgtq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f32)))
-mve_pred16_t __arm_vcmpgtq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f32)))
-mve_pred16_t __arm_vcmpgtq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f16)))
-mve_pred16_t __arm_vcmpgtq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f16)))
-mve_pred16_t __arm_vcmpgtq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f32)))
-mve_pred16_t __arm_vcmpgtq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f32)))
-mve_pred16_t __arm_vcmpgtq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f16)))
-mve_pred16_t __arm_vcmpleq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f16)))
-mve_pred16_t __arm_vcmpleq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f32)))
-mve_pred16_t __arm_vcmpleq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f32)))
-mve_pred16_t __arm_vcmpleq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f16)))
-mve_pred16_t __arm_vcmpleq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f16)))
-mve_pred16_t __arm_vcmpleq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f32)))
-mve_pred16_t __arm_vcmpleq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f32)))
-mve_pred16_t __arm_vcmpleq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f16)))
-mve_pred16_t __arm_vcmpleq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f16)))
-mve_pred16_t __arm_vcmpleq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f32)))
-mve_pred16_t __arm_vcmpleq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f32)))
-mve_pred16_t __arm_vcmpleq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f16)))
-mve_pred16_t __arm_vcmpleq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f16)))
-mve_pred16_t __arm_vcmpleq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f32)))
-mve_pred16_t __arm_vcmpleq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f32)))
-mve_pred16_t __arm_vcmpleq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f16)))
-mve_pred16_t __arm_vcmpltq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f16)))
-mve_pred16_t __arm_vcmpltq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f32)))
-mve_pred16_t __arm_vcmpltq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f32)))
-mve_pred16_t __arm_vcmpltq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f16)))
-mve_pred16_t __arm_vcmpltq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f16)))
-mve_pred16_t __arm_vcmpltq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f32)))
-mve_pred16_t __arm_vcmpltq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f32)))
-mve_pred16_t __arm_vcmpltq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f16)))
-mve_pred16_t __arm_vcmpltq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f16)))
-mve_pred16_t __arm_vcmpltq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f32)))
-mve_pred16_t __arm_vcmpltq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f32)))
-mve_pred16_t __arm_vcmpltq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f16)))
-mve_pred16_t __arm_vcmpltq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f16)))
-mve_pred16_t __arm_vcmpltq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f32)))
-mve_pred16_t __arm_vcmpltq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f32)))
-mve_pred16_t __arm_vcmpltq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f16)))
-mve_pred16_t __arm_vcmpneq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f16)))
-mve_pred16_t __arm_vcmpneq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f32)))
-mve_pred16_t __arm_vcmpneq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f32)))
-mve_pred16_t __arm_vcmpneq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f16)))
-mve_pred16_t __arm_vcmpneq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f16)))
-mve_pred16_t __arm_vcmpneq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f32)))
-mve_pred16_t __arm_vcmpneq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f32)))
-mve_pred16_t __arm_vcmpneq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f16)))
-mve_pred16_t __arm_vcmpneq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f16)))
-mve_pred16_t __arm_vcmpneq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f32)))
-mve_pred16_t __arm_vcmpneq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f32)))
-mve_pred16_t __arm_vcmpneq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f16)))
-mve_pred16_t __arm_vcmpneq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f16)))
-mve_pred16_t __arm_vcmpneq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f32)))
-mve_pred16_t __arm_vcmpneq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f32)))
-mve_pred16_t __arm_vcmpneq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f16)))
-float16x8_t __arm_vcmulq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f16)))
-float16x8_t __arm_vcmulq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f32)))
-float32x4_t __arm_vcmulq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f32)))
-float32x4_t __arm_vcmulq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f16)))
-float16x8_t __arm_vcmulq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f16)))
-float16x8_t __arm_vcmulq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f32)))
-float32x4_t __arm_vcmulq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f32)))
-float32x4_t __arm_vcmulq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f16)))
-float16x8_t __arm_vcmulq_rot180_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f16)))
-float16x8_t __arm_vcmulq_rot180(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f32)))
-float32x4_t __arm_vcmulq_rot180_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f32)))
-float32x4_t __arm_vcmulq_rot180(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f16)))
-float16x8_t __arm_vcmulq_rot180_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f16)))
-float16x8_t __arm_vcmulq_rot180_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f32)))
-float32x4_t __arm_vcmulq_rot180_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f32)))
-float32x4_t __arm_vcmulq_rot180_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f16)))
-float16x8_t __arm_vcmulq_rot180_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f16)))
-float16x8_t __arm_vcmulq_rot180_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f32)))
-float32x4_t __arm_vcmulq_rot180_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f32)))
-float32x4_t __arm_vcmulq_rot180_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f16)))
-float16x8_t __arm_vcmulq_rot270_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f16)))
-float16x8_t __arm_vcmulq_rot270(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f32)))
-float32x4_t __arm_vcmulq_rot270_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f32)))
-float32x4_t __arm_vcmulq_rot270(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f16)))
-float16x8_t __arm_vcmulq_rot270_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f16)))
-float16x8_t __arm_vcmulq_rot270_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f32)))
-float32x4_t __arm_vcmulq_rot270_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f32)))
-float32x4_t __arm_vcmulq_rot270_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f16)))
-float16x8_t __arm_vcmulq_rot270_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f16)))
-float16x8_t __arm_vcmulq_rot270_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f32)))
-float32x4_t __arm_vcmulq_rot270_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f32)))
-float32x4_t __arm_vcmulq_rot270_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f16)))
-float16x8_t __arm_vcmulq_rot90_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f16)))
-float16x8_t __arm_vcmulq_rot90(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f32)))
-float32x4_t __arm_vcmulq_rot90_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f32)))
-float32x4_t __arm_vcmulq_rot90(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f16)))
-float16x8_t __arm_vcmulq_rot90_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f16)))
-float16x8_t __arm_vcmulq_rot90_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f32)))
-float32x4_t __arm_vcmulq_rot90_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f32)))
-float32x4_t __arm_vcmulq_rot90_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f16)))
-float16x8_t __arm_vcmulq_rot90_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f16)))
-float16x8_t __arm_vcmulq_rot90_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f32)))
-float32x4_t __arm_vcmulq_rot90_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f32)))
-float32x4_t __arm_vcmulq_rot90_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f16)))
-float16x8_t __arm_vcmulq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f16)))
-float16x8_t __arm_vcmulq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f32)))
-float32x4_t __arm_vcmulq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f32)))
-float32x4_t __arm_vcmulq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_f16)))
-float16x8_t __arm_vcreateq_f16(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_f32)))
-float32x4_t __arm_vcreateq_f32(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s16_f16)))
-int16x8_t __arm_vcvtaq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s16_f16)))
-int16x8_t __arm_vcvtaq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s32_f32)))
-int32x4_t __arm_vcvtaq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s32_f32)))
-int32x4_t __arm_vcvtaq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u16_f16)))
-uint16x8_t __arm_vcvtaq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u16_f16)))
-uint16x8_t __arm_vcvtaq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u32_f32)))
-uint32x4_t __arm_vcvtaq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u32_f32)))
-uint32x4_t __arm_vcvtaq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_s16_f16)))
-int16x8_t __arm_vcvtaq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_s32_f32)))
-int32x4_t __arm_vcvtaq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_u16_f16)))
-uint16x8_t __arm_vcvtaq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_u32_f32)))
-uint32x4_t __arm_vcvtaq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_s16_f16)))
-int16x8_t __arm_vcvtaq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_s32_f32)))
-int32x4_t __arm_vcvtaq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_u16_f16)))
-uint16x8_t __arm_vcvtaq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_u32_f32)))
-uint32x4_t __arm_vcvtaq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_f16_f32)))
-float16x8_t __arm_vcvtbq_f16_f32(float16x8_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_f32_f16)))
-float32x4_t __arm_vcvtbq_f32_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_m_f16_f32)))
-float16x8_t __arm_vcvtbq_m_f16_f32(float16x8_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_m_f32_f16)))
-float32x4_t __arm_vcvtbq_m_f32_f16(float32x4_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_x_f32_f16)))
-float32x4_t __arm_vcvtbq_x_f32_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s16_f16)))
-int16x8_t __arm_vcvtmq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s16_f16)))
-int16x8_t __arm_vcvtmq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s32_f32)))
-int32x4_t __arm_vcvtmq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s32_f32)))
-int32x4_t __arm_vcvtmq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u16_f16)))
-uint16x8_t __arm_vcvtmq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u16_f16)))
-uint16x8_t __arm_vcvtmq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u32_f32)))
-uint32x4_t __arm_vcvtmq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u32_f32)))
-uint32x4_t __arm_vcvtmq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_s16_f16)))
-int16x8_t __arm_vcvtmq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_s32_f32)))
-int32x4_t __arm_vcvtmq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_u16_f16)))
-uint16x8_t __arm_vcvtmq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_u32_f32)))
-uint32x4_t __arm_vcvtmq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_s16_f16)))
-int16x8_t __arm_vcvtmq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_s32_f32)))
-int32x4_t __arm_vcvtmq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_u16_f16)))
-uint16x8_t __arm_vcvtmq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_u32_f32)))
-uint32x4_t __arm_vcvtmq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s16_f16)))
-int16x8_t __arm_vcvtnq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s16_f16)))
-int16x8_t __arm_vcvtnq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s32_f32)))
-int32x4_t __arm_vcvtnq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s32_f32)))
-int32x4_t __arm_vcvtnq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u16_f16)))
-uint16x8_t __arm_vcvtnq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u16_f16)))
-uint16x8_t __arm_vcvtnq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u32_f32)))
-uint32x4_t __arm_vcvtnq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u32_f32)))
-uint32x4_t __arm_vcvtnq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_s16_f16)))
-int16x8_t __arm_vcvtnq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_s32_f32)))
-int32x4_t __arm_vcvtnq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_u16_f16)))
-uint16x8_t __arm_vcvtnq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_u32_f32)))
-uint32x4_t __arm_vcvtnq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_s16_f16)))
-int16x8_t __arm_vcvtnq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_s32_f32)))
-int32x4_t __arm_vcvtnq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_u16_f16)))
-uint16x8_t __arm_vcvtnq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_u32_f32)))
-uint32x4_t __arm_vcvtnq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s16_f16)))
-int16x8_t __arm_vcvtpq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s16_f16)))
-int16x8_t __arm_vcvtpq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s32_f32)))
-int32x4_t __arm_vcvtpq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s32_f32)))
-int32x4_t __arm_vcvtpq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u16_f16)))
-uint16x8_t __arm_vcvtpq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u16_f16)))
-uint16x8_t __arm_vcvtpq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u32_f32)))
-uint32x4_t __arm_vcvtpq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u32_f32)))
-uint32x4_t __arm_vcvtpq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_s16_f16)))
-int16x8_t __arm_vcvtpq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_s32_f32)))
-int32x4_t __arm_vcvtpq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_u16_f16)))
-uint16x8_t __arm_vcvtpq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_u32_f32)))
-uint32x4_t __arm_vcvtpq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_s16_f16)))
-int16x8_t __arm_vcvtpq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_s32_f32)))
-int32x4_t __arm_vcvtpq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_u16_f16)))
-uint16x8_t __arm_vcvtpq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_u32_f32)))
-uint32x4_t __arm_vcvtpq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_s16)))
-float16x8_t __arm_vcvtq_f16_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_s16)))
-float16x8_t __arm_vcvtq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_u16)))
-float16x8_t __arm_vcvtq_f16_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_u16)))
-float16x8_t __arm_vcvtq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_s32)))
-float32x4_t __arm_vcvtq_f32_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_s32)))
-float32x4_t __arm_vcvtq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_u32)))
-float32x4_t __arm_vcvtq_f32_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_u32)))
-float32x4_t __arm_vcvtq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_s16)))
-float16x8_t __arm_vcvtq_m_f16_s16(float16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_s16)))
-float16x8_t __arm_vcvtq_m(float16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_u16)))
-float16x8_t __arm_vcvtq_m_f16_u16(float16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_u16)))
-float16x8_t __arm_vcvtq_m(float16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_s32)))
-float32x4_t __arm_vcvtq_m_f32_s32(float32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_s32)))
-float32x4_t __arm_vcvtq_m(float32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_u32)))
-float32x4_t __arm_vcvtq_m_f32_u32(float32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_u32)))
-float32x4_t __arm_vcvtq_m(float32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_s16)))
-float16x8_t __arm_vcvtq_m_n_f16_s16(float16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_s16)))
-float16x8_t __arm_vcvtq_m_n(float16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_u16)))
-float16x8_t __arm_vcvtq_m_n_f16_u16(float16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_u16)))
-float16x8_t __arm_vcvtq_m_n(float16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_s32)))
-float32x4_t __arm_vcvtq_m_n_f32_s32(float32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_s32)))
-float32x4_t __arm_vcvtq_m_n(float32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_u32)))
-float32x4_t __arm_vcvtq_m_n_f32_u32(float32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_u32)))
-float32x4_t __arm_vcvtq_m_n(float32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s16_f16)))
-int16x8_t __arm_vcvtq_m_n_s16_f16(int16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s16_f16)))
-int16x8_t __arm_vcvtq_m_n(int16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s32_f32)))
-int32x4_t __arm_vcvtq_m_n_s32_f32(int32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s32_f32)))
-int32x4_t __arm_vcvtq_m_n(int32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u16_f16)))
-uint16x8_t __arm_vcvtq_m_n_u16_f16(uint16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u16_f16)))
-uint16x8_t __arm_vcvtq_m_n(uint16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u32_f32)))
-uint32x4_t __arm_vcvtq_m_n_u32_f32(uint32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u32_f32)))
-uint32x4_t __arm_vcvtq_m_n(uint32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s16_f16)))
-int16x8_t __arm_vcvtq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s16_f16)))
-int16x8_t __arm_vcvtq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s32_f32)))
-int32x4_t __arm_vcvtq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s32_f32)))
-int32x4_t __arm_vcvtq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u16_f16)))
-uint16x8_t __arm_vcvtq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u16_f16)))
-uint16x8_t __arm_vcvtq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u32_f32)))
-uint32x4_t __arm_vcvtq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u32_f32)))
-uint32x4_t __arm_vcvtq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_s16)))
-float16x8_t __arm_vcvtq_n_f16_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_s16)))
-float16x8_t __arm_vcvtq_n(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_u16)))
-float16x8_t __arm_vcvtq_n_f16_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_u16)))
-float16x8_t __arm_vcvtq_n(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_s32)))
-float32x4_t __arm_vcvtq_n_f32_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_s32)))
-float32x4_t __arm_vcvtq_n(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_u32)))
-float32x4_t __arm_vcvtq_n_f32_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_u32)))
-float32x4_t __arm_vcvtq_n(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_s16_f16)))
-int16x8_t __arm_vcvtq_n_s16_f16(float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_s32_f32)))
-int32x4_t __arm_vcvtq_n_s32_f32(float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_u16_f16)))
-uint16x8_t __arm_vcvtq_n_u16_f16(float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_u32_f32)))
-uint32x4_t __arm_vcvtq_n_u32_f32(float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_s16_f16)))
-int16x8_t __arm_vcvtq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_s32_f32)))
-int32x4_t __arm_vcvtq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_u16_f16)))
-uint16x8_t __arm_vcvtq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_u32_f32)))
-uint32x4_t __arm_vcvtq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_s16)))
-float16x8_t __arm_vcvtq_x_f16_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_s16)))
-float16x8_t __arm_vcvtq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_u16)))
-float16x8_t __arm_vcvtq_x_f16_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_u16)))
-float16x8_t __arm_vcvtq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_s32)))
-float32x4_t __arm_vcvtq_x_f32_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_s32)))
-float32x4_t __arm_vcvtq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_u32)))
-float32x4_t __arm_vcvtq_x_f32_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_u32)))
-float32x4_t __arm_vcvtq_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_s16)))
-float16x8_t __arm_vcvtq_x_n_f16_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_s16)))
-float16x8_t __arm_vcvtq_x_n(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_u16)))
-float16x8_t __arm_vcvtq_x_n_f16_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_u16)))
-float16x8_t __arm_vcvtq_x_n(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_s32)))
-float32x4_t __arm_vcvtq_x_n_f32_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_s32)))
-float32x4_t __arm_vcvtq_x_n(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_u32)))
-float32x4_t __arm_vcvtq_x_n_f32_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_u32)))
-float32x4_t __arm_vcvtq_x_n(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_s16_f16)))
-int16x8_t __arm_vcvtq_x_n_s16_f16(float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_s32_f32)))
-int32x4_t __arm_vcvtq_x_n_s32_f32(float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_u16_f16)))
-uint16x8_t __arm_vcvtq_x_n_u16_f16(float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_u32_f32)))
-uint32x4_t __arm_vcvtq_x_n_u32_f32(float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_s16_f16)))
-int16x8_t __arm_vcvtq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_s32_f32)))
-int32x4_t __arm_vcvtq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_u16_f16)))
-uint16x8_t __arm_vcvtq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_u32_f32)))
-uint32x4_t __arm_vcvtq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_f16_f32)))
-float16x8_t __arm_vcvttq_f16_f32(float16x8_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_f32_f16)))
-float32x4_t __arm_vcvttq_f32_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_m_f16_f32)))
-float16x8_t __arm_vcvttq_m_f16_f32(float16x8_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_m_f32_f16)))
-float32x4_t __arm_vcvttq_m_f32_f16(float32x4_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_x_f32_f16)))
-float32x4_t __arm_vcvttq_x_f32_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f16)))
-float16x8_t __arm_vdupq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f16)))
-float16x8_t __arm_vdupq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f32)))
-float32x4_t __arm_vdupq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f32)))
-float32x4_t __arm_vdupq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_f16)))
-float16x8_t __arm_vdupq_n_f16(float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_f32)))
-float32x4_t __arm_vdupq_n_f32(float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_f16)))
-float16x8_t __arm_vdupq_x_n_f16(float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_f32)))
-float32x4_t __arm_vdupq_x_n_f32(float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_f16)))
-float16x8_t __arm_veorq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_f16)))
-float16x8_t __arm_veorq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_f32)))
-float32x4_t __arm_veorq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_f32)))
-float32x4_t __arm_veorq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f16)))
-float16x8_t __arm_veorq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f16)))
-float16x8_t __arm_veorq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f32)))
-float32x4_t __arm_veorq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f32)))
-float32x4_t __arm_veorq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f16)))
-float16x8_t __arm_veorq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f16)))
-float16x8_t __arm_veorq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f32)))
-float32x4_t __arm_veorq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f32)))
-float32x4_t __arm_veorq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f16)))
-float16x8_t __arm_vfmaq_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f16)))
-float16x8_t __arm_vfmaq(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f32)))
-float32x4_t __arm_vfmaq_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f32)))
-float32x4_t __arm_vfmaq(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f16)))
-float16x8_t __arm_vfmaq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f16)))
-float16x8_t __arm_vfmaq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f32)))
-float32x4_t __arm_vfmaq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f32)))
-float32x4_t __arm_vfmaq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f16)))
-float16x8_t __arm_vfmaq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f16)))
-float16x8_t __arm_vfmaq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f32)))
-float32x4_t __arm_vfmaq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f32)))
-float32x4_t __arm_vfmaq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f16)))
-float16x8_t __arm_vfmaq_n_f16(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f16)))
-float16x8_t __arm_vfmaq(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f32)))
-float32x4_t __arm_vfmaq_n_f32(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f32)))
-float32x4_t __arm_vfmaq(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f16)))
-float16x8_t __arm_vfmasq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f16)))
-float16x8_t __arm_vfmasq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f32)))
-float32x4_t __arm_vfmasq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f32)))
-float32x4_t __arm_vfmasq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f16)))
-float16x8_t __arm_vfmasq_n_f16(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f16)))
-float16x8_t __arm_vfmasq(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f32)))
-float32x4_t __arm_vfmasq_n_f32(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f32)))
-float32x4_t __arm_vfmasq(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f16)))
-float16x8_t __arm_vfmsq_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f16)))
-float16x8_t __arm_vfmsq(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f32)))
-float32x4_t __arm_vfmsq_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f32)))
-float32x4_t __arm_vfmsq(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f16)))
-float16x8_t __arm_vfmsq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f16)))
-float16x8_t __arm_vfmsq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f32)))
-float32x4_t __arm_vfmsq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f32)))
-float32x4_t __arm_vfmsq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f16)))
-float16_t __arm_vgetq_lane_f16(float16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f16)))
-float16_t __arm_vgetq_lane(float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f32)))
-float32_t __arm_vgetq_lane_f32(float32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f32)))
-float32_t __arm_vgetq_lane(float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f16)))
-float16x8_t __arm_vld1q_f16(const float16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f16)))
-float16x8_t __arm_vld1q(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f32)))
-float32x4_t __arm_vld1q_f32(const float32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f32)))
-float32x4_t __arm_vld1q(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f16)))
-float16x8_t __arm_vld1q_z_f16(const float16_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f16)))
-float16x8_t __arm_vld1q_z(const float16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f32)))
-float32x4_t __arm_vld1q_z_f32(const float32_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f32)))
-float32x4_t __arm_vld1q_z(const float32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f16)))
-float16x8x2_t __arm_vld2q_f16(const float16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f16)))
-float16x8x2_t __arm_vld2q(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f32)))
-float32x4x2_t __arm_vld2q_f32(const float32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f32)))
-float32x4x2_t __arm_vld2q(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f16)))
-float16x8x4_t __arm_vld4q_f16(const float16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f16)))
-float16x8x4_t __arm_vld4q(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f32)))
-float32x4x4_t __arm_vld4q_f32(const float32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f32)))
-float32x4x4_t __arm_vld4q(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_f16)))
-float16x8_t __arm_vldrhq_f16(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_f16)))
-float16x8_t __arm_vldrhq_gather_offset_f16(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_f16)))
-float16x8_t __arm_vldrhq_gather_offset(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_f16)))
-float16x8_t __arm_vldrhq_gather_offset_z_f16(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_f16)))
-float16x8_t __arm_vldrhq_gather_offset_z(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_f16)))
-float16x8_t __arm_vldrhq_gather_shifted_offset_f16(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_f16)))
-float16x8_t __arm_vldrhq_gather_shifted_offset(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_f16)))
-float16x8_t __arm_vldrhq_gather_shifted_offset_z_f16(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_f16)))
-float16x8_t __arm_vldrhq_gather_shifted_offset_z(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_f16)))
-float16x8_t __arm_vldrhq_z_f16(const float16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_f32)))
-float32x4_t __arm_vldrwq_f32(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_f32)))
-float32x4_t __arm_vldrwq_gather_base_f32(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_f32)))
-float32x4_t __arm_vldrwq_gather_base_wb_f32(uint32x4_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_z_f32)))
-float32x4_t __arm_vldrwq_gather_base_wb_z_f32(uint32x4_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_z_f32)))
-float32x4_t __arm_vldrwq_gather_base_z_f32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_f32)))
-float32x4_t __arm_vldrwq_gather_offset_f32(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_f32)))
-float32x4_t __arm_vldrwq_gather_offset(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_f32)))
-float32x4_t __arm_vldrwq_gather_offset_z_f32(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_f32)))
-float32x4_t __arm_vldrwq_gather_offset_z(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_f32)))
-float32x4_t __arm_vldrwq_gather_shifted_offset_f32(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_f32)))
-float32x4_t __arm_vldrwq_gather_shifted_offset(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_f32)))
-float32x4_t __arm_vldrwq_gather_shifted_offset_z_f32(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_f32)))
-float32x4_t __arm_vldrwq_gather_shifted_offset_z(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_z_f32)))
-float32x4_t __arm_vldrwq_z_f32(const float32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f16)))
-float16x8_t __arm_vmaxnmaq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f16)))
-float16x8_t __arm_vmaxnmaq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f32)))
-float32x4_t __arm_vmaxnmaq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f32)))
-float32x4_t __arm_vmaxnmaq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f16)))
-float16x8_t __arm_vmaxnmaq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f16)))
-float16x8_t __arm_vmaxnmaq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f32)))
-float32x4_t __arm_vmaxnmaq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f32)))
-float32x4_t __arm_vmaxnmaq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f16)))
-float16_t __arm_vmaxnmavq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f16)))
-float16_t __arm_vmaxnmavq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f32)))
-float32_t __arm_vmaxnmavq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f32)))
-float32_t __arm_vmaxnmavq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f16)))
-float16_t __arm_vmaxnmavq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f16)))
-float16_t __arm_vmaxnmavq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f32)))
-float32_t __arm_vmaxnmavq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f32)))
-float32_t __arm_vmaxnmavq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f16)))
-float16x8_t __arm_vmaxnmq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f16)))
-float16x8_t __arm_vmaxnmq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f32)))
-float32x4_t __arm_vmaxnmq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f32)))
-float32x4_t __arm_vmaxnmq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f16)))
-float16x8_t __arm_vmaxnmq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f16)))
-float16x8_t __arm_vmaxnmq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f32)))
-float32x4_t __arm_vmaxnmq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f32)))
-float32x4_t __arm_vmaxnmq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f16)))
-float16x8_t __arm_vmaxnmq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f16)))
-float16x8_t __arm_vmaxnmq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f32)))
-float32x4_t __arm_vmaxnmq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f32)))
-float32x4_t __arm_vmaxnmq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f16)))
-float16_t __arm_vmaxnmvq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f16)))
-float16_t __arm_vmaxnmvq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f32)))
-float32_t __arm_vmaxnmvq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f32)))
-float32_t __arm_vmaxnmvq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f16)))
-float16_t __arm_vmaxnmvq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f16)))
-float16_t __arm_vmaxnmvq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f32)))
-float32_t __arm_vmaxnmvq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f32)))
-float32_t __arm_vmaxnmvq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f16)))
-float16x8_t __arm_vminnmaq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f16)))
-float16x8_t __arm_vminnmaq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f32)))
-float32x4_t __arm_vminnmaq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f32)))
-float32x4_t __arm_vminnmaq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f16)))
-float16x8_t __arm_vminnmaq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f16)))
-float16x8_t __arm_vminnmaq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f32)))
-float32x4_t __arm_vminnmaq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f32)))
-float32x4_t __arm_vminnmaq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f16)))
-float16_t __arm_vminnmavq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f16)))
-float16_t __arm_vminnmavq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f32)))
-float32_t __arm_vminnmavq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f32)))
-float32_t __arm_vminnmavq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f16)))
-float16_t __arm_vminnmavq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f16)))
-float16_t __arm_vminnmavq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f32)))
-float32_t __arm_vminnmavq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f32)))
-float32_t __arm_vminnmavq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f16)))
-float16x8_t __arm_vminnmq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f16)))
-float16x8_t __arm_vminnmq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f32)))
-float32x4_t __arm_vminnmq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f32)))
-float32x4_t __arm_vminnmq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f16)))
-float16x8_t __arm_vminnmq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f16)))
-float16x8_t __arm_vminnmq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f32)))
-float32x4_t __arm_vminnmq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f32)))
-float32x4_t __arm_vminnmq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f16)))
-float16x8_t __arm_vminnmq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f16)))
-float16x8_t __arm_vminnmq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f32)))
-float32x4_t __arm_vminnmq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f32)))
-float32x4_t __arm_vminnmq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f16)))
-float16_t __arm_vminnmvq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f16)))
-float16_t __arm_vminnmvq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f32)))
-float32_t __arm_vminnmvq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f32)))
-float32_t __arm_vminnmvq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f16)))
-float16_t __arm_vminnmvq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f16)))
-float16_t __arm_vminnmvq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f32)))
-float32_t __arm_vminnmvq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f32)))
-float32_t __arm_vminnmvq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f16)))
-float16x8_t __arm_vmulq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f16)))
-float16x8_t __arm_vmulq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f32)))
-float32x4_t __arm_vmulq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f32)))
-float32x4_t __arm_vmulq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f16)))
-float16x8_t __arm_vmulq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f16)))
-float16x8_t __arm_vmulq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f32)))
-float32x4_t __arm_vmulq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f32)))
-float32x4_t __arm_vmulq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f16)))
-float16x8_t __arm_vmulq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f16)))
-float16x8_t __arm_vmulq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f32)))
-float32x4_t __arm_vmulq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f32)))
-float32x4_t __arm_vmulq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f16)))
-float16x8_t __arm_vmulq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f16)))
-float16x8_t __arm_vmulq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f32)))
-float32x4_t __arm_vmulq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f32)))
-float32x4_t __arm_vmulq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f16)))
-float16x8_t __arm_vmulq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f16)))
-float16x8_t __arm_vmulq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f32)))
-float32x4_t __arm_vmulq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f32)))
-float32x4_t __arm_vmulq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f16)))
-float16x8_t __arm_vmulq_x_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f16)))
-float16x8_t __arm_vmulq_x(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f32)))
-float32x4_t __arm_vmulq_x_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f32)))
-float32x4_t __arm_vmulq_x(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f16)))
-float16x8_t __arm_vnegq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f16)))
-float16x8_t __arm_vnegq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f32)))
-float32x4_t __arm_vnegq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f32)))
-float32x4_t __arm_vnegq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f16)))
-float16x8_t __arm_vnegq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f16)))
-float16x8_t __arm_vnegq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f32)))
-float32x4_t __arm_vnegq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f32)))
-float32x4_t __arm_vnegq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f16)))
-float16x8_t __arm_vnegq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f16)))
-float16x8_t __arm_vnegq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f32)))
-float32x4_t __arm_vnegq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f32)))
-float32x4_t __arm_vnegq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_f16)))
-float16x8_t __arm_vornq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_f16)))
-float16x8_t __arm_vornq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_f32)))
-float32x4_t __arm_vornq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_f32)))
-float32x4_t __arm_vornq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f16)))
-float16x8_t __arm_vornq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f16)))
-float16x8_t __arm_vornq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f32)))
-float32x4_t __arm_vornq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f32)))
-float32x4_t __arm_vornq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f16)))
-float16x8_t __arm_vornq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f16)))
-float16x8_t __arm_vornq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f32)))
-float32x4_t __arm_vornq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f32)))
-float32x4_t __arm_vornq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f16)))
-float16x8_t __arm_vorrq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f16)))
-float16x8_t __arm_vorrq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f32)))
-float32x4_t __arm_vorrq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f32)))
-float32x4_t __arm_vorrq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f16)))
-float16x8_t __arm_vorrq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f16)))
-float16x8_t __arm_vorrq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f32)))
-float32x4_t __arm_vorrq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f32)))
-float32x4_t __arm_vorrq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f16)))
-float16x8_t __arm_vorrq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f16)))
-float16x8_t __arm_vorrq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f32)))
-float32x4_t __arm_vorrq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f32)))
-float32x4_t __arm_vorrq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f16)))
-float16x8_t __arm_vpselq_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f16)))
-float16x8_t __arm_vpselq(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f32)))
-float32x4_t __arm_vpselq_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f32)))
-float32x4_t __arm_vpselq(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_f32)))
-float16x8_t __arm_vreinterpretq_f16_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_f32)))
-float16x8_t __arm_vreinterpretq_f16(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s16)))
-float16x8_t __arm_vreinterpretq_f16_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s16)))
-float16x8_t __arm_vreinterpretq_f16(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s32)))
-float16x8_t __arm_vreinterpretq_f16_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s32)))
-float16x8_t __arm_vreinterpretq_f16(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s64)))
-float16x8_t __arm_vreinterpretq_f16_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s64)))
-float16x8_t __arm_vreinterpretq_f16(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s8)))
-float16x8_t __arm_vreinterpretq_f16_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s8)))
-float16x8_t __arm_vreinterpretq_f16(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u16)))
-float16x8_t __arm_vreinterpretq_f16_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u16)))
-float16x8_t __arm_vreinterpretq_f16(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u32)))
-float16x8_t __arm_vreinterpretq_f16_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u32)))
-float16x8_t __arm_vreinterpretq_f16(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u64)))
-float16x8_t __arm_vreinterpretq_f16_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u64)))
-float16x8_t __arm_vreinterpretq_f16(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
-float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
-float16x8_t __arm_vreinterpretq_f16(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_f16)))
-float32x4_t __arm_vreinterpretq_f32_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_f16)))
-float32x4_t __arm_vreinterpretq_f32(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s16)))
-float32x4_t __arm_vreinterpretq_f32_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s16)))
-float32x4_t __arm_vreinterpretq_f32(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s32)))
-float32x4_t __arm_vreinterpretq_f32_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s32)))
-float32x4_t __arm_vreinterpretq_f32(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s64)))
-float32x4_t __arm_vreinterpretq_f32_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s64)))
-float32x4_t __arm_vreinterpretq_f32(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s8)))
-float32x4_t __arm_vreinterpretq_f32_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s8)))
-float32x4_t __arm_vreinterpretq_f32(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u16)))
-float32x4_t __arm_vreinterpretq_f32_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u16)))
-float32x4_t __arm_vreinterpretq_f32(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u32)))
-float32x4_t __arm_vreinterpretq_f32_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u32)))
-float32x4_t __arm_vreinterpretq_f32(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u64)))
-float32x4_t __arm_vreinterpretq_f32_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u64)))
-float32x4_t __arm_vreinterpretq_f32(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
-float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
-float32x4_t __arm_vreinterpretq_f32(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f16)))
-int16x8_t __arm_vreinterpretq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f16)))
-int16x8_t __arm_vreinterpretq_s16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f32)))
-int16x8_t __arm_vreinterpretq_s16_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f32)))
-int16x8_t __arm_vreinterpretq_s16(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f16)))
-int32x4_t __arm_vreinterpretq_s32_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f16)))
-int32x4_t __arm_vreinterpretq_s32(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f32)))
-int32x4_t __arm_vreinterpretq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f32)))
-int32x4_t __arm_vreinterpretq_s32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f16)))
-int64x2_t __arm_vreinterpretq_s64_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f16)))
-int64x2_t __arm_vreinterpretq_s64(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f32)))
-int64x2_t __arm_vreinterpretq_s64_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f32)))
-int64x2_t __arm_vreinterpretq_s64(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f16)))
-int8x16_t __arm_vreinterpretq_s8_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f16)))
-int8x16_t __arm_vreinterpretq_s8(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f32)))
-int8x16_t __arm_vreinterpretq_s8_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f32)))
-int8x16_t __arm_vreinterpretq_s8(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f16)))
-uint16x8_t __arm_vreinterpretq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f16)))
-uint16x8_t __arm_vreinterpretq_u16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f32)))
-uint16x8_t __arm_vreinterpretq_u16_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f32)))
-uint16x8_t __arm_vreinterpretq_u16(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f16)))
-uint32x4_t __arm_vreinterpretq_u32_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f16)))
-uint32x4_t __arm_vreinterpretq_u32(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f32)))
-uint32x4_t __arm_vreinterpretq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f32)))
-uint32x4_t __arm_vreinterpretq_u32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f16)))
-uint64x2_t __arm_vreinterpretq_u64_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f16)))
-uint64x2_t __arm_vreinterpretq_u64(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f32)))
-uint64x2_t __arm_vreinterpretq_u64_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f32)))
-uint64x2_t __arm_vreinterpretq_u64(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
-uint8x16_t __arm_vreinterpretq_u8_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
-uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
-uint8x16_t __arm_vreinterpretq_u8_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
-uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_f16)))
-float16x8_t __arm_vrev32q_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_f16)))
-float16x8_t __arm_vrev32q(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_f16)))
-float16x8_t __arm_vrev32q_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_f16)))
-float16x8_t __arm_vrev32q_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_f16)))
-float16x8_t __arm_vrev32q_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_f16)))
-float16x8_t __arm_vrev32q_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f16)))
-float16x8_t __arm_vrev64q_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f16)))
-float16x8_t __arm_vrev64q(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f32)))
-float32x4_t __arm_vrev64q_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f32)))
-float32x4_t __arm_vrev64q(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f16)))
-float16x8_t __arm_vrev64q_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f16)))
-float16x8_t __arm_vrev64q_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f32)))
-float32x4_t __arm_vrev64q_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f32)))
-float32x4_t __arm_vrev64q_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f16)))
-float16x8_t __arm_vrev64q_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f16)))
-float16x8_t __arm_vrev64q_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f32)))
-float32x4_t __arm_vrev64q_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f32)))
-float32x4_t __arm_vrev64q_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f16)))
-float16x8_t __arm_vrndaq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f16)))
-float16x8_t __arm_vrndaq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f32)))
-float32x4_t __arm_vrndaq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f32)))
-float32x4_t __arm_vrndaq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f16)))
-float16x8_t __arm_vrndaq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f16)))
-float16x8_t __arm_vrndaq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f32)))
-float32x4_t __arm_vrndaq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f32)))
-float32x4_t __arm_vrndaq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f16)))
-float16x8_t __arm_vrndaq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f16)))
-float16x8_t __arm_vrndaq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f32)))
-float32x4_t __arm_vrndaq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f32)))
-float32x4_t __arm_vrndaq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f16)))
-float16x8_t __arm_vrndmq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f16)))
-float16x8_t __arm_vrndmq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f32)))
-float32x4_t __arm_vrndmq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f32)))
-float32x4_t __arm_vrndmq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f16)))
-float16x8_t __arm_vrndmq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f16)))
-float16x8_t __arm_vrndmq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f32)))
-float32x4_t __arm_vrndmq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f32)))
-float32x4_t __arm_vrndmq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f16)))
-float16x8_t __arm_vrndmq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f16)))
-float16x8_t __arm_vrndmq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f32)))
-float32x4_t __arm_vrndmq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f32)))
-float32x4_t __arm_vrndmq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f16)))
-float16x8_t __arm_vrndnq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f16)))
-float16x8_t __arm_vrndnq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f32)))
-float32x4_t __arm_vrndnq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f32)))
-float32x4_t __arm_vrndnq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f16)))
-float16x8_t __arm_vrndnq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f16)))
-float16x8_t __arm_vrndnq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f32)))
-float32x4_t __arm_vrndnq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f32)))
-float32x4_t __arm_vrndnq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f16)))
-float16x8_t __arm_vrndnq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f16)))
-float16x8_t __arm_vrndnq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f32)))
-float32x4_t __arm_vrndnq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f32)))
-float32x4_t __arm_vrndnq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f16)))
-float16x8_t __arm_vrndpq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f16)))
-float16x8_t __arm_vrndpq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f32)))
-float32x4_t __arm_vrndpq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f32)))
-float32x4_t __arm_vrndpq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f16)))
-float16x8_t __arm_vrndpq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f16)))
-float16x8_t __arm_vrndpq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f32)))
-float32x4_t __arm_vrndpq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f32)))
-float32x4_t __arm_vrndpq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f16)))
-float16x8_t __arm_vrndpq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f16)))
-float16x8_t __arm_vrndpq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f32)))
-float32x4_t __arm_vrndpq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f32)))
-float32x4_t __arm_vrndpq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f16)))
-float16x8_t __arm_vrndq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f16)))
-float16x8_t __arm_vrndq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f32)))
-float32x4_t __arm_vrndq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f32)))
-float32x4_t __arm_vrndq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f16)))
-float16x8_t __arm_vrndq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f16)))
-float16x8_t __arm_vrndq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f32)))
-float32x4_t __arm_vrndq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f32)))
-float32x4_t __arm_vrndq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f16)))
-float16x8_t __arm_vrndq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f16)))
-float16x8_t __arm_vrndq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f32)))
-float32x4_t __arm_vrndq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f32)))
-float32x4_t __arm_vrndq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f16)))
-float16x8_t __arm_vrndxq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f16)))
-float16x8_t __arm_vrndxq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f32)))
-float32x4_t __arm_vrndxq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f32)))
-float32x4_t __arm_vrndxq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f16)))
-float16x8_t __arm_vrndxq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f16)))
-float16x8_t __arm_vrndxq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f32)))
-float32x4_t __arm_vrndxq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f32)))
-float32x4_t __arm_vrndxq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f16)))
-float16x8_t __arm_vrndxq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f16)))
-float16x8_t __arm_vrndxq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f32)))
-float32x4_t __arm_vrndxq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f32)))
-float32x4_t __arm_vrndxq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f16)))
-float16x8_t __arm_vsetq_lane_f16(float16_t, float16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f16)))
-float16x8_t __arm_vsetq_lane(float16_t, float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f32)))
-float32x4_t __arm_vsetq_lane_f32(float32_t, float32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f32)))
-float32x4_t __arm_vsetq_lane(float32_t, float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f16)))
-void __arm_vst1q_f16(float16_t *, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f16)))
-void __arm_vst1q(float16_t *, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f32)))
-void __arm_vst1q_f32(float32_t *, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f32)))
-void __arm_vst1q(float32_t *, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f16)))
-void __arm_vst1q_p_f16(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f16)))
-void __arm_vst1q_p(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f32)))
-void __arm_vst1q_p_f32(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f32)))
-void __arm_vst1q_p(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f16)))
-void __arm_vst2q_f16(float16_t *, float16x8x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f16)))
-void __arm_vst2q(float16_t *, float16x8x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f32)))
-void __arm_vst2q_f32(float32_t *, float32x4x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f32)))
-void __arm_vst2q(float32_t *, float32x4x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f16)))
-void __arm_vst4q_f16(float16_t *, float16x8x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f16)))
-void __arm_vst4q(float16_t *, float16x8x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f32)))
-void __arm_vst4q_f32(float32_t *, float32x4x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f32)))
-void __arm_vst4q(float32_t *, float32x4x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_f16)))
-void __arm_vstrhq_f16(float16_t *, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_f16)))
-void __arm_vstrhq(float16_t *, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_f16)))
-void __arm_vstrhq_p_f16(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_f16)))
-void __arm_vstrhq_p(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_f16)))
-void __arm_vstrhq_scatter_offset_f16(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_f16)))
-void __arm_vstrhq_scatter_offset(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_f16)))
-void __arm_vstrhq_scatter_offset_p_f16(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_f16)))
-void __arm_vstrhq_scatter_offset_p(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_f16)))
-void __arm_vstrhq_scatter_shifted_offset_f16(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_f16)))
-void __arm_vstrhq_scatter_shifted_offset(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_f16)))
-void __arm_vstrhq_scatter_shifted_offset_p_f16(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_f16)))
-void __arm_vstrhq_scatter_shifted_offset_p(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_f32)))
-void __arm_vstrwq_f32(float32_t *, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_f32)))
-void __arm_vstrwq(float32_t *, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_f32)))
-void __arm_vstrwq_p_f32(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_f32)))
-void __arm_vstrwq_p(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_f32)))
-void __arm_vstrwq_scatter_base_f32(uint32x4_t, int, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_f32)))
-void __arm_vstrwq_scatter_base(uint32x4_t, int, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_f32)))
-void __arm_vstrwq_scatter_base_p_f32(uint32x4_t, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_f32)))
-void __arm_vstrwq_scatter_base_p(uint32x4_t, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_f32)))
-void __arm_vstrwq_scatter_base_wb_f32(uint32x4_t *, int, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_f32)))
-void __arm_vstrwq_scatter_base_wb(uint32x4_t *, int, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_f32)))
-void __arm_vstrwq_scatter_base_wb_p_f32(uint32x4_t *, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_f32)))
-void __arm_vstrwq_scatter_base_wb_p(uint32x4_t *, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_f32)))
-void __arm_vstrwq_scatter_offset_f32(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_f32)))
-void __arm_vstrwq_scatter_offset(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_f32)))
-void __arm_vstrwq_scatter_offset_p_f32(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_f32)))
-void __arm_vstrwq_scatter_offset_p(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_f32)))
-void __arm_vstrwq_scatter_shifted_offset_f32(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_f32)))
-void __arm_vstrwq_scatter_shifted_offset(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_f32)))
-void __arm_vstrwq_scatter_shifted_offset_p_f32(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_f32)))
-void __arm_vstrwq_scatter_shifted_offset_p(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f16)))
-float16x8_t __arm_vsubq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f16)))
-float16x8_t __arm_vsubq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f32)))
-float32x4_t __arm_vsubq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f32)))
-float32x4_t __arm_vsubq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f16)))
-float16x8_t __arm_vsubq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f16)))
-float16x8_t __arm_vsubq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f32)))
-float32x4_t __arm_vsubq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f32)))
-float32x4_t __arm_vsubq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f16)))
-float16x8_t __arm_vsubq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f16)))
-float16x8_t __arm_vsubq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f32)))
-float32x4_t __arm_vsubq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f32)))
-float32x4_t __arm_vsubq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f16)))
-float16x8_t __arm_vsubq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f16)))
-float16x8_t __arm_vsubq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f32)))
-float32x4_t __arm_vsubq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f32)))
-float32x4_t __arm_vsubq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f16)))
-float16x8_t __arm_vsubq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f16)))
-float16x8_t __arm_vsubq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f32)))
-float32x4_t __arm_vsubq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f32)))
-float32x4_t __arm_vsubq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f16)))
-float16x8_t __arm_vsubq_x_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f16)))
-float16x8_t __arm_vsubq_x(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f32)))
-float32x4_t __arm_vsubq_x_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f32)))
-float32x4_t __arm_vsubq_x(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_f16)))
-float16x8_t __arm_vuninitializedq_f16();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_f32)))
-float32x4_t __arm_vuninitializedq_f32();
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_f16)))
-float16x8_t __arm_vuninitializedq(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_f32)))
-float32x4_t __arm_vuninitializedq(float32x4_t);
-
-#endif /* (__ARM_FEATURE_MVE & 2) */
-
-#if (!defined __ARM_MVE_PRESERVE_USER_NAMESPACE)
-
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_asrl)))
-int64_t asrl(int64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_lsll)))
-uint64_t lsll(uint64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqrshr)))
-int32_t sqrshr(int32_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqrshrl)))
-int64_t sqrshrl(int64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqrshrl_sat48)))
-int64_t sqrshrl_sat48(int64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqshl)))
-int32_t sqshl(int32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_sqshll)))
-int64_t sqshll(int64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_srshr)))
-int32_t srshr(int32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_srshrl)))
-int64_t srshrl(int64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqrshl)))
-uint32_t uqrshl(uint32_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqrshll)))
-uint64_t uqrshll(uint64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqrshll_sat48)))
-uint64_t uqrshll_sat48(uint64_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqshl)))
-uint32_t uqshl(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_uqshll)))
-uint64_t uqshll(uint64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_urshr)))
-uint32_t urshr(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_urshrl)))
-uint64_t urshrl(uint64_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s16)))
-uint32_t vabavq_p_s16(uint32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s16)))
-uint32_t vabavq_p(uint32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s32)))
-uint32_t vabavq_p_s32(uint32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s32)))
-uint32_t vabavq_p(uint32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s8)))
-uint32_t vabavq_p_s8(uint32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_s8)))
-uint32_t vabavq_p(uint32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u16)))
-uint32_t vabavq_p_u16(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u16)))
-uint32_t vabavq_p(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u32)))
-uint32_t vabavq_p_u32(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u32)))
-uint32_t vabavq_p(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u8)))
-uint32_t vabavq_p_u8(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_p_u8)))
-uint32_t vabavq_p(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s16)))
-uint32_t vabavq_s16(uint32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s16)))
-uint32_t vabavq(uint32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s32)))
-uint32_t vabavq_s32(uint32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s32)))
-uint32_t vabavq(uint32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s8)))
-uint32_t vabavq_s8(uint32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_s8)))
-uint32_t vabavq(uint32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u16)))
-uint32_t vabavq_u16(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u16)))
-uint32_t vabavq(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u32)))
-uint32_t vabavq_u32(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u32)))
-uint32_t vabavq(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u8)))
-uint32_t vabavq_u8(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabavq_u8)))
-uint32_t vabavq(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s16)))
-int16x8_t vabdq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s16)))
-int16x8_t vabdq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s32)))
-int32x4_t vabdq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s32)))
-int32x4_t vabdq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s8)))
-int8x16_t vabdq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_s8)))
-int8x16_t vabdq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u16)))
-uint16x8_t vabdq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u16)))
-uint16x8_t vabdq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u32)))
-uint32x4_t vabdq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u32)))
-uint32x4_t vabdq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u8)))
-uint8x16_t vabdq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_u8)))
-uint8x16_t vabdq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s16)))
-int16x8_t vabdq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s16)))
-int16x8_t vabdq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s32)))
-int32x4_t vabdq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s32)))
-int32x4_t vabdq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s8)))
-int8x16_t vabdq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_s8)))
-int8x16_t vabdq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u16)))
-uint16x8_t vabdq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u16)))
-uint16x8_t vabdq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u32)))
-uint32x4_t vabdq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u32)))
-uint32x4_t vabdq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u8)))
-uint8x16_t vabdq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_u8)))
-uint8x16_t vabdq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s16)))
-int16x8_t vabdq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s16)))
-int16x8_t vabdq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s32)))
-int32x4_t vabdq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s32)))
-int32x4_t vabdq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s8)))
-int8x16_t vabdq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_s8)))
-int8x16_t vabdq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u16)))
-uint16x8_t vabdq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u16)))
-uint16x8_t vabdq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u32)))
-uint32x4_t vabdq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u32)))
-uint32x4_t vabdq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u8)))
-uint8x16_t vabdq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_u8)))
-uint8x16_t vabdq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s16)))
-int16x8_t vabsq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s16)))
-int16x8_t vabsq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s32)))
-int32x4_t vabsq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s32)))
-int32x4_t vabsq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s8)))
-int8x16_t vabsq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_s8)))
-int8x16_t vabsq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s16)))
-int16x8_t vabsq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s16)))
-int16x8_t vabsq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s32)))
-int32x4_t vabsq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s32)))
-int32x4_t vabsq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s8)))
-int8x16_t vabsq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_s8)))
-int8x16_t vabsq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s16)))
-int16x8_t vabsq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s16)))
-int16x8_t vabsq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s32)))
-int32x4_t vabsq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s32)))
-int32x4_t vabsq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s8)))
-int8x16_t vabsq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_s8)))
-int8x16_t vabsq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_s32)))
-int32x4_t vadciq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_s32)))
-int32x4_t vadciq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_u32)))
-uint32x4_t vadciq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_m_u32)))
-uint32x4_t vadciq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_s32)))
-int32x4_t vadciq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_s32)))
-int32x4_t vadciq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadciq_u32)))
-uint32x4_t vadciq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadciq_u32)))
-uint32x4_t vadciq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_s32)))
-int32x4_t vadcq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_s32)))
-int32x4_t vadcq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_u32)))
-uint32x4_t vadcq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_m_u32)))
-uint32x4_t vadcq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_s32)))
-int32x4_t vadcq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_s32)))
-int32x4_t vadcq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vadcq_u32)))
-uint32x4_t vadcq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vadcq_u32)))
-uint32x4_t vadcq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_s32)))
-int64_t vaddlvaq_p_s32(int64_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_s32)))
-int64_t vaddlvaq_p(int64_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_u32)))
-uint64_t vaddlvaq_p_u32(uint64_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_p_u32)))
-uint64_t vaddlvaq_p(uint64_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_s32)))
-int64_t vaddlvaq_s32(int64_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_s32)))
-int64_t vaddlvaq(int64_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_u32)))
-uint64_t vaddlvaq_u32(uint64_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvaq_u32)))
-uint64_t vaddlvaq(uint64_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_s32)))
-int64_t vaddlvq_p_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_s32)))
-int64_t vaddlvq_p(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_u32)))
-uint64_t vaddlvq_p_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_p_u32)))
-uint64_t vaddlvq_p(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_s32)))
-int64_t vaddlvq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_s32)))
-int64_t vaddlvq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_u32)))
-uint64_t vaddlvq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddlvq_u32)))
-uint64_t vaddlvq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s16)))
-int16x8_t vaddq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s16)))
-int16x8_t vaddq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s32)))
-int32x4_t vaddq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s32)))
-int32x4_t vaddq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s8)))
-int8x16_t vaddq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_s8)))
-int8x16_t vaddq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u16)))
-uint16x8_t vaddq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u16)))
-uint16x8_t vaddq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u32)))
-uint32x4_t vaddq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u32)))
-uint32x4_t vaddq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u8)))
-uint8x16_t vaddq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_u8)))
-uint8x16_t vaddq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s16)))
-int16x8_t vaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s16)))
-int16x8_t vaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s32)))
-int32x4_t vaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s32)))
-int32x4_t vaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s8)))
-int8x16_t vaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_s8)))
-int8x16_t vaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u16)))
-uint16x8_t vaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u16)))
-uint16x8_t vaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u32)))
-uint32x4_t vaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u32)))
-uint32x4_t vaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u8)))
-uint8x16_t vaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_u8)))
-uint8x16_t vaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s16)))
-int16x8_t vaddq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s16)))
-int16x8_t vaddq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s32)))
-int32x4_t vaddq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s32)))
-int32x4_t vaddq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s8)))
-int8x16_t vaddq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_s8)))
-int8x16_t vaddq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u16)))
-uint16x8_t vaddq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u16)))
-uint16x8_t vaddq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u32)))
-uint32x4_t vaddq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u32)))
-uint32x4_t vaddq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u8)))
-uint8x16_t vaddq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_u8)))
-uint8x16_t vaddq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s16)))
-int16x8_t vaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s16)))
-int16x8_t vaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s32)))
-int32x4_t vaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s32)))
-int32x4_t vaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s8)))
-int8x16_t vaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_s8)))
-int8x16_t vaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u16)))
-uint16x8_t vaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u16)))
-uint16x8_t vaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u32)))
-uint32x4_t vaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u32)))
-uint32x4_t vaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u8)))
-uint8x16_t vaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_u8)))
-uint8x16_t vaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s16)))
-int16x8_t vaddq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s16)))
-int16x8_t vaddq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s32)))
-int32x4_t vaddq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s32)))
-int32x4_t vaddq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s8)))
-int8x16_t vaddq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_s8)))
-int8x16_t vaddq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u16)))
-uint16x8_t vaddq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u16)))
-uint16x8_t vaddq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u32)))
-uint32x4_t vaddq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u32)))
-uint32x4_t vaddq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u8)))
-uint8x16_t vaddq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_u8)))
-uint8x16_t vaddq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s16)))
-int16x8_t vaddq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s16)))
-int16x8_t vaddq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s32)))
-int32x4_t vaddq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s32)))
-int32x4_t vaddq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s8)))
-int8x16_t vaddq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_s8)))
-int8x16_t vaddq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u16)))
-uint16x8_t vaddq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u16)))
-uint16x8_t vaddq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u32)))
-uint32x4_t vaddq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u32)))
-uint32x4_t vaddq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u8)))
-uint8x16_t vaddq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_u8)))
-uint8x16_t vaddq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s16)))
-int32_t vaddvaq_p_s16(int32_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s16)))
-int32_t vaddvaq_p(int32_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s32)))
-int32_t vaddvaq_p_s32(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s32)))
-int32_t vaddvaq_p(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s8)))
-int32_t vaddvaq_p_s8(int32_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_s8)))
-int32_t vaddvaq_p(int32_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u16)))
-uint32_t vaddvaq_p_u16(uint32_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u16)))
-uint32_t vaddvaq_p(uint32_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u32)))
-uint32_t vaddvaq_p_u32(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u32)))
-uint32_t vaddvaq_p(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u8)))
-uint32_t vaddvaq_p_u8(uint32_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_p_u8)))
-uint32_t vaddvaq_p(uint32_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s16)))
-int32_t vaddvaq_s16(int32_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s16)))
-int32_t vaddvaq(int32_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s32)))
-int32_t vaddvaq_s32(int32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s32)))
-int32_t vaddvaq(int32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s8)))
-int32_t vaddvaq_s8(int32_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_s8)))
-int32_t vaddvaq(int32_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u16)))
-uint32_t vaddvaq_u16(uint32_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u16)))
-uint32_t vaddvaq(uint32_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u32)))
-uint32_t vaddvaq_u32(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u32)))
-uint32_t vaddvaq(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u8)))
-uint32_t vaddvaq_u8(uint32_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvaq_u8)))
-uint32_t vaddvaq(uint32_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s16)))
-int32_t vaddvq_p_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s16)))
-int32_t vaddvq_p(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s32)))
-int32_t vaddvq_p_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s32)))
-int32_t vaddvq_p(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s8)))
-int32_t vaddvq_p_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_s8)))
-int32_t vaddvq_p(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u16)))
-uint32_t vaddvq_p_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u16)))
-uint32_t vaddvq_p(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u32)))
-uint32_t vaddvq_p_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u32)))
-uint32_t vaddvq_p(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u8)))
-uint32_t vaddvq_p_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_p_u8)))
-uint32_t vaddvq_p(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s16)))
-int32_t vaddvq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s16)))
-int32_t vaddvq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s32)))
-int32_t vaddvq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s32)))
-int32_t vaddvq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s8)))
-int32_t vaddvq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_s8)))
-int32_t vaddvq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u16)))
-uint32_t vaddvq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u16)))
-uint32_t vaddvq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u32)))
-uint32_t vaddvq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u32)))
-uint32_t vaddvq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u8)))
-uint32_t vaddvq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddvq_u8)))
-uint32_t vaddvq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s16)))
-int16x8_t vandq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s16)))
-int16x8_t vandq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s32)))
-int32x4_t vandq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s32)))
-int32x4_t vandq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s8)))
-int8x16_t vandq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_s8)))
-int8x16_t vandq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u16)))
-uint16x8_t vandq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u16)))
-uint16x8_t vandq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u32)))
-uint32x4_t vandq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u32)))
-uint32x4_t vandq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u8)))
-uint8x16_t vandq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_u8)))
-uint8x16_t vandq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_s16)))
-int16x8_t vandq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_s16)))
-int16x8_t vandq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_s32)))
-int32x4_t vandq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_s32)))
-int32x4_t vandq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_s8)))
-int8x16_t vandq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_s8)))
-int8x16_t vandq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_u16)))
-uint16x8_t vandq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_u16)))
-uint16x8_t vandq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_u32)))
-uint32x4_t vandq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_u32)))
-uint32x4_t vandq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_u8)))
-uint8x16_t vandq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_u8)))
-uint8x16_t vandq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s16)))
-int16x8_t vandq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s16)))
-int16x8_t vandq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s32)))
-int32x4_t vandq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s32)))
-int32x4_t vandq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s8)))
-int8x16_t vandq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_s8)))
-int8x16_t vandq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u16)))
-uint16x8_t vandq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u16)))
-uint16x8_t vandq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u32)))
-uint32x4_t vandq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u32)))
-uint32x4_t vandq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u8)))
-uint8x16_t vandq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_u8)))
-uint8x16_t vandq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s16)))
-int16x8_t vbicq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s16)))
-int16x8_t vbicq_m_n(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s32)))
-int32x4_t vbicq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_s32)))
-int32x4_t vbicq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u16)))
-uint16x8_t vbicq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u16)))
-uint16x8_t vbicq_m_n(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u32)))
-uint32x4_t vbicq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_n_u32)))
-uint32x4_t vbicq_m_n(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s16)))
-int16x8_t vbicq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s16)))
-int16x8_t vbicq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s32)))
-int32x4_t vbicq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s32)))
-int32x4_t vbicq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s8)))
-int8x16_t vbicq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_s8)))
-int8x16_t vbicq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u16)))
-uint16x8_t vbicq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u16)))
-uint16x8_t vbicq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u32)))
-uint32x4_t vbicq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u32)))
-uint32x4_t vbicq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u8)))
-uint8x16_t vbicq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_u8)))
-uint8x16_t vbicq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s16)))
-int16x8_t vbicq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s16)))
-int16x8_t vbicq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s32)))
-int32x4_t vbicq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_s32)))
-int32x4_t vbicq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u16)))
-uint16x8_t vbicq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u16)))
-uint16x8_t vbicq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u32)))
-uint32x4_t vbicq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_n_u32)))
-uint32x4_t vbicq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s16)))
-int16x8_t vbicq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s16)))
-int16x8_t vbicq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s32)))
-int32x4_t vbicq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s32)))
-int32x4_t vbicq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s8)))
-int8x16_t vbicq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_s8)))
-int8x16_t vbicq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u16)))
-uint16x8_t vbicq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u16)))
-uint16x8_t vbicq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u32)))
-uint32x4_t vbicq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u32)))
-uint32x4_t vbicq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u8)))
-uint8x16_t vbicq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_u8)))
-uint8x16_t vbicq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s16)))
-int16x8_t vbicq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s16)))
-int16x8_t vbicq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s32)))
-int32x4_t vbicq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s32)))
-int32x4_t vbicq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s8)))
-int8x16_t vbicq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_s8)))
-int8x16_t vbicq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u16)))
-uint16x8_t vbicq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u16)))
-uint16x8_t vbicq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u32)))
-uint32x4_t vbicq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u32)))
-uint32x4_t vbicq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u8)))
-uint8x16_t vbicq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_u8)))
-uint8x16_t vbicq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s16)))
-int16x8_t vbrsrq_m_n_s16(int16x8_t, int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s16)))
-int16x8_t vbrsrq_m(int16x8_t, int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s32)))
-int32x4_t vbrsrq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s32)))
-int32x4_t vbrsrq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s8)))
-int8x16_t vbrsrq_m_n_s8(int8x16_t, int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_s8)))
-int8x16_t vbrsrq_m(int8x16_t, int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u16)))
-uint16x8_t vbrsrq_m_n_u16(uint16x8_t, uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u16)))
-uint16x8_t vbrsrq_m(uint16x8_t, uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u32)))
-uint32x4_t vbrsrq_m_n_u32(uint32x4_t, uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u32)))
-uint32x4_t vbrsrq_m(uint32x4_t, uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u8)))
-uint8x16_t vbrsrq_m_n_u8(uint8x16_t, uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_u8)))
-uint8x16_t vbrsrq_m(uint8x16_t, uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s16)))
-int16x8_t vbrsrq_n_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s16)))
-int16x8_t vbrsrq(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s32)))
-int32x4_t vbrsrq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s32)))
-int32x4_t vbrsrq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s8)))
-int8x16_t vbrsrq_n_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_s8)))
-int8x16_t vbrsrq(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u16)))
-uint16x8_t vbrsrq_n_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u16)))
-uint16x8_t vbrsrq(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u32)))
-uint32x4_t vbrsrq_n_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u32)))
-uint32x4_t vbrsrq(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u8)))
-uint8x16_t vbrsrq_n_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_u8)))
-uint8x16_t vbrsrq(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s16)))
-int16x8_t vbrsrq_x_n_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s16)))
-int16x8_t vbrsrq_x(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s32)))
-int32x4_t vbrsrq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s32)))
-int32x4_t vbrsrq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s8)))
-int8x16_t vbrsrq_x_n_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_s8)))
-int8x16_t vbrsrq_x(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u16)))
-uint16x8_t vbrsrq_x_n_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u16)))
-uint16x8_t vbrsrq_x(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u32)))
-uint32x4_t vbrsrq_x_n_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u32)))
-uint32x4_t vbrsrq_x(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u8)))
-uint8x16_t vbrsrq_x_n_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_u8)))
-uint8x16_t vbrsrq_x(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s16)))
-int16x8_t vcaddq_rot270_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s16)))
-int16x8_t vcaddq_rot270_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s32)))
-int32x4_t vcaddq_rot270_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s32)))
-int32x4_t vcaddq_rot270_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s8)))
-int8x16_t vcaddq_rot270_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_s8)))
-int8x16_t vcaddq_rot270_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u16)))
-uint16x8_t vcaddq_rot270_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u16)))
-uint16x8_t vcaddq_rot270_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u32)))
-uint32x4_t vcaddq_rot270_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u32)))
-uint32x4_t vcaddq_rot270_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u8)))
-uint8x16_t vcaddq_rot270_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_u8)))
-uint8x16_t vcaddq_rot270_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s16)))
-int16x8_t vcaddq_rot270_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s16)))
-int16x8_t vcaddq_rot270(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s32)))
-int32x4_t vcaddq_rot270_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s32)))
-int32x4_t vcaddq_rot270(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s8)))
-int8x16_t vcaddq_rot270_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_s8)))
-int8x16_t vcaddq_rot270(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u16)))
-uint16x8_t vcaddq_rot270_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u16)))
-uint16x8_t vcaddq_rot270(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u32)))
-uint32x4_t vcaddq_rot270_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u32)))
-uint32x4_t vcaddq_rot270(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u8)))
-uint8x16_t vcaddq_rot270_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_u8)))
-uint8x16_t vcaddq_rot270(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s16)))
-int16x8_t vcaddq_rot270_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s16)))
-int16x8_t vcaddq_rot270_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s32)))
-int32x4_t vcaddq_rot270_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s32)))
-int32x4_t vcaddq_rot270_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s8)))
-int8x16_t vcaddq_rot270_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_s8)))
-int8x16_t vcaddq_rot270_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u16)))
-uint16x8_t vcaddq_rot270_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u16)))
-uint16x8_t vcaddq_rot270_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u32)))
-uint32x4_t vcaddq_rot270_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u32)))
-uint32x4_t vcaddq_rot270_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u8)))
-uint8x16_t vcaddq_rot270_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_u8)))
-uint8x16_t vcaddq_rot270_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s16)))
-int16x8_t vcaddq_rot90_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s16)))
-int16x8_t vcaddq_rot90_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s32)))
-int32x4_t vcaddq_rot90_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s32)))
-int32x4_t vcaddq_rot90_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s8)))
-int8x16_t vcaddq_rot90_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_s8)))
-int8x16_t vcaddq_rot90_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u16)))
-uint16x8_t vcaddq_rot90_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u16)))
-uint16x8_t vcaddq_rot90_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u32)))
-uint32x4_t vcaddq_rot90_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u32)))
-uint32x4_t vcaddq_rot90_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u8)))
-uint8x16_t vcaddq_rot90_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_u8)))
-uint8x16_t vcaddq_rot90_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s16)))
-int16x8_t vcaddq_rot90_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s16)))
-int16x8_t vcaddq_rot90(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s32)))
-int32x4_t vcaddq_rot90_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s32)))
-int32x4_t vcaddq_rot90(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s8)))
-int8x16_t vcaddq_rot90_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_s8)))
-int8x16_t vcaddq_rot90(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u16)))
-uint16x8_t vcaddq_rot90_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u16)))
-uint16x8_t vcaddq_rot90(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u32)))
-uint32x4_t vcaddq_rot90_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u32)))
-uint32x4_t vcaddq_rot90(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u8)))
-uint8x16_t vcaddq_rot90_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_u8)))
-uint8x16_t vcaddq_rot90(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s16)))
-int16x8_t vcaddq_rot90_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s16)))
-int16x8_t vcaddq_rot90_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s32)))
-int32x4_t vcaddq_rot90_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s32)))
-int32x4_t vcaddq_rot90_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s8)))
-int8x16_t vcaddq_rot90_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_s8)))
-int8x16_t vcaddq_rot90_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u16)))
-uint16x8_t vcaddq_rot90_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u16)))
-uint16x8_t vcaddq_rot90_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u32)))
-uint32x4_t vcaddq_rot90_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u32)))
-uint32x4_t vcaddq_rot90_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u8)))
-uint8x16_t vcaddq_rot90_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_u8)))
-uint8x16_t vcaddq_rot90_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s16)))
-int16x8_t vclsq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s16)))
-int16x8_t vclsq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s32)))
-int32x4_t vclsq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s32)))
-int32x4_t vclsq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s8)))
-int8x16_t vclsq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_m_s8)))
-int8x16_t vclsq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s16)))
-int16x8_t vclsq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s16)))
-int16x8_t vclsq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s32)))
-int32x4_t vclsq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s32)))
-int32x4_t vclsq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s8)))
-int8x16_t vclsq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_s8)))
-int8x16_t vclsq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s16)))
-int16x8_t vclsq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s16)))
-int16x8_t vclsq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s32)))
-int32x4_t vclsq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s32)))
-int32x4_t vclsq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s8)))
-int8x16_t vclsq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclsq_x_s8)))
-int8x16_t vclsq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s16)))
-int16x8_t vclzq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s16)))
-int16x8_t vclzq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s32)))
-int32x4_t vclzq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s32)))
-int32x4_t vclzq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s8)))
-int8x16_t vclzq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_s8)))
-int8x16_t vclzq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u16)))
-uint16x8_t vclzq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u16)))
-uint16x8_t vclzq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u32)))
-uint32x4_t vclzq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u32)))
-uint32x4_t vclzq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u8)))
-uint8x16_t vclzq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_m_u8)))
-uint8x16_t vclzq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s16)))
-int16x8_t vclzq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s16)))
-int16x8_t vclzq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s32)))
-int32x4_t vclzq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s32)))
-int32x4_t vclzq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s8)))
-int8x16_t vclzq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_s8)))
-int8x16_t vclzq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u16)))
-uint16x8_t vclzq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u16)))
-uint16x8_t vclzq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u32)))
-uint32x4_t vclzq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u32)))
-uint32x4_t vclzq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u8)))
-uint8x16_t vclzq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_u8)))
-uint8x16_t vclzq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s16)))
-int16x8_t vclzq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s16)))
-int16x8_t vclzq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s32)))
-int32x4_t vclzq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s32)))
-int32x4_t vclzq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s8)))
-int8x16_t vclzq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_s8)))
-int8x16_t vclzq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u16)))
-uint16x8_t vclzq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u16)))
-uint16x8_t vclzq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u32)))
-uint32x4_t vclzq_x_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u32)))
-uint32x4_t vclzq_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u8)))
-uint8x16_t vclzq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vclzq_x_u8)))
-uint8x16_t vclzq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u16)))
-mve_pred16_t vcmpcsq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u16)))
-mve_pred16_t vcmpcsq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u32)))
-mve_pred16_t vcmpcsq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u32)))
-mve_pred16_t vcmpcsq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u8)))
-mve_pred16_t vcmpcsq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_n_u8)))
-mve_pred16_t vcmpcsq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u16)))
-mve_pred16_t vcmpcsq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u16)))
-mve_pred16_t vcmpcsq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u32)))
-mve_pred16_t vcmpcsq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u32)))
-mve_pred16_t vcmpcsq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u8)))
-mve_pred16_t vcmpcsq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_m_u8)))
-mve_pred16_t vcmpcsq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u16)))
-mve_pred16_t vcmpcsq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u16)))
-mve_pred16_t vcmpcsq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u32)))
-mve_pred16_t vcmpcsq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u32)))
-mve_pred16_t vcmpcsq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u8)))
-mve_pred16_t vcmpcsq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_n_u8)))
-mve_pred16_t vcmpcsq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u16)))
-mve_pred16_t vcmpcsq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u16)))
-mve_pred16_t vcmpcsq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u32)))
-mve_pred16_t vcmpcsq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u32)))
-mve_pred16_t vcmpcsq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u8)))
-mve_pred16_t vcmpcsq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpcsq_u8)))
-mve_pred16_t vcmpcsq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s16)))
-mve_pred16_t vcmpeqq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s16)))
-mve_pred16_t vcmpeqq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s32)))
-mve_pred16_t vcmpeqq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s32)))
-mve_pred16_t vcmpeqq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s8)))
-mve_pred16_t vcmpeqq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_s8)))
-mve_pred16_t vcmpeqq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u16)))
-mve_pred16_t vcmpeqq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u16)))
-mve_pred16_t vcmpeqq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u32)))
-mve_pred16_t vcmpeqq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u32)))
-mve_pred16_t vcmpeqq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u8)))
-mve_pred16_t vcmpeqq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_u8)))
-mve_pred16_t vcmpeqq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s16)))
-mve_pred16_t vcmpeqq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s16)))
-mve_pred16_t vcmpeqq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s32)))
-mve_pred16_t vcmpeqq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s32)))
-mve_pred16_t vcmpeqq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s8)))
-mve_pred16_t vcmpeqq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_s8)))
-mve_pred16_t vcmpeqq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u16)))
-mve_pred16_t vcmpeqq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u16)))
-mve_pred16_t vcmpeqq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u32)))
-mve_pred16_t vcmpeqq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u32)))
-mve_pred16_t vcmpeqq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u8)))
-mve_pred16_t vcmpeqq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_u8)))
-mve_pred16_t vcmpeqq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s16)))
-mve_pred16_t vcmpeqq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s16)))
-mve_pred16_t vcmpeqq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s32)))
-mve_pred16_t vcmpeqq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s32)))
-mve_pred16_t vcmpeqq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s8)))
-mve_pred16_t vcmpeqq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_s8)))
-mve_pred16_t vcmpeqq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u16)))
-mve_pred16_t vcmpeqq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u16)))
-mve_pred16_t vcmpeqq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u32)))
-mve_pred16_t vcmpeqq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u32)))
-mve_pred16_t vcmpeqq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u8)))
-mve_pred16_t vcmpeqq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_u8)))
-mve_pred16_t vcmpeqq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s16)))
-mve_pred16_t vcmpeqq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s16)))
-mve_pred16_t vcmpeqq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s32)))
-mve_pred16_t vcmpeqq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s32)))
-mve_pred16_t vcmpeqq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s8)))
-mve_pred16_t vcmpeqq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_s8)))
-mve_pred16_t vcmpeqq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u16)))
-mve_pred16_t vcmpeqq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u16)))
-mve_pred16_t vcmpeqq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u32)))
-mve_pred16_t vcmpeqq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u32)))
-mve_pred16_t vcmpeqq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u8)))
-mve_pred16_t vcmpeqq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_u8)))
-mve_pred16_t vcmpeqq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s16)))
-mve_pred16_t vcmpgeq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s16)))
-mve_pred16_t vcmpgeq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s32)))
-mve_pred16_t vcmpgeq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s32)))
-mve_pred16_t vcmpgeq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s8)))
-mve_pred16_t vcmpgeq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_s8)))
-mve_pred16_t vcmpgeq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s16)))
-mve_pred16_t vcmpgeq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s16)))
-mve_pred16_t vcmpgeq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s32)))
-mve_pred16_t vcmpgeq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s32)))
-mve_pred16_t vcmpgeq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s8)))
-mve_pred16_t vcmpgeq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_s8)))
-mve_pred16_t vcmpgeq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s16)))
-mve_pred16_t vcmpgeq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s16)))
-mve_pred16_t vcmpgeq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s32)))
-mve_pred16_t vcmpgeq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s32)))
-mve_pred16_t vcmpgeq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s8)))
-mve_pred16_t vcmpgeq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_s8)))
-mve_pred16_t vcmpgeq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s16)))
-mve_pred16_t vcmpgeq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s16)))
-mve_pred16_t vcmpgeq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s32)))
-mve_pred16_t vcmpgeq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s32)))
-mve_pred16_t vcmpgeq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s8)))
-mve_pred16_t vcmpgeq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_s8)))
-mve_pred16_t vcmpgeq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s16)))
-mve_pred16_t vcmpgtq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s16)))
-mve_pred16_t vcmpgtq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s32)))
-mve_pred16_t vcmpgtq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s32)))
-mve_pred16_t vcmpgtq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s8)))
-mve_pred16_t vcmpgtq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_s8)))
-mve_pred16_t vcmpgtq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s16)))
-mve_pred16_t vcmpgtq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s16)))
-mve_pred16_t vcmpgtq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s32)))
-mve_pred16_t vcmpgtq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s32)))
-mve_pred16_t vcmpgtq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s8)))
-mve_pred16_t vcmpgtq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_s8)))
-mve_pred16_t vcmpgtq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s16)))
-mve_pred16_t vcmpgtq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s16)))
-mve_pred16_t vcmpgtq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s32)))
-mve_pred16_t vcmpgtq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s32)))
-mve_pred16_t vcmpgtq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s8)))
-mve_pred16_t vcmpgtq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_s8)))
-mve_pred16_t vcmpgtq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s16)))
-mve_pred16_t vcmpgtq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s16)))
-mve_pred16_t vcmpgtq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s32)))
-mve_pred16_t vcmpgtq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s32)))
-mve_pred16_t vcmpgtq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s8)))
-mve_pred16_t vcmpgtq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_s8)))
-mve_pred16_t vcmpgtq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u16)))
-mve_pred16_t vcmphiq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u16)))
-mve_pred16_t vcmphiq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u32)))
-mve_pred16_t vcmphiq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u32)))
-mve_pred16_t vcmphiq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u8)))
-mve_pred16_t vcmphiq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_n_u8)))
-mve_pred16_t vcmphiq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u16)))
-mve_pred16_t vcmphiq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u16)))
-mve_pred16_t vcmphiq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u32)))
-mve_pred16_t vcmphiq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u32)))
-mve_pred16_t vcmphiq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u8)))
-mve_pred16_t vcmphiq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_m_u8)))
-mve_pred16_t vcmphiq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u16)))
-mve_pred16_t vcmphiq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u16)))
-mve_pred16_t vcmphiq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u32)))
-mve_pred16_t vcmphiq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u32)))
-mve_pred16_t vcmphiq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u8)))
-mve_pred16_t vcmphiq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_n_u8)))
-mve_pred16_t vcmphiq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u16)))
-mve_pred16_t vcmphiq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u16)))
-mve_pred16_t vcmphiq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u32)))
-mve_pred16_t vcmphiq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u32)))
-mve_pred16_t vcmphiq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u8)))
-mve_pred16_t vcmphiq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmphiq_u8)))
-mve_pred16_t vcmphiq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s16)))
-mve_pred16_t vcmpleq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s16)))
-mve_pred16_t vcmpleq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s32)))
-mve_pred16_t vcmpleq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s32)))
-mve_pred16_t vcmpleq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s8)))
-mve_pred16_t vcmpleq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_s8)))
-mve_pred16_t vcmpleq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s16)))
-mve_pred16_t vcmpleq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s16)))
-mve_pred16_t vcmpleq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s32)))
-mve_pred16_t vcmpleq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s32)))
-mve_pred16_t vcmpleq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s8)))
-mve_pred16_t vcmpleq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_s8)))
-mve_pred16_t vcmpleq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s16)))
-mve_pred16_t vcmpleq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s16)))
-mve_pred16_t vcmpleq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s32)))
-mve_pred16_t vcmpleq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s32)))
-mve_pred16_t vcmpleq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s8)))
-mve_pred16_t vcmpleq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_s8)))
-mve_pred16_t vcmpleq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s16)))
-mve_pred16_t vcmpleq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s16)))
-mve_pred16_t vcmpleq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s32)))
-mve_pred16_t vcmpleq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s32)))
-mve_pred16_t vcmpleq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s8)))
-mve_pred16_t vcmpleq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_s8)))
-mve_pred16_t vcmpleq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s16)))
-mve_pred16_t vcmpltq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s16)))
-mve_pred16_t vcmpltq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s32)))
-mve_pred16_t vcmpltq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s32)))
-mve_pred16_t vcmpltq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s8)))
-mve_pred16_t vcmpltq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_s8)))
-mve_pred16_t vcmpltq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s16)))
-mve_pred16_t vcmpltq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s16)))
-mve_pred16_t vcmpltq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s32)))
-mve_pred16_t vcmpltq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s32)))
-mve_pred16_t vcmpltq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s8)))
-mve_pred16_t vcmpltq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_s8)))
-mve_pred16_t vcmpltq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s16)))
-mve_pred16_t vcmpltq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s16)))
-mve_pred16_t vcmpltq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s32)))
-mve_pred16_t vcmpltq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s32)))
-mve_pred16_t vcmpltq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s8)))
-mve_pred16_t vcmpltq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_s8)))
-mve_pred16_t vcmpltq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s16)))
-mve_pred16_t vcmpltq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s16)))
-mve_pred16_t vcmpltq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s32)))
-mve_pred16_t vcmpltq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s32)))
-mve_pred16_t vcmpltq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s8)))
-mve_pred16_t vcmpltq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_s8)))
-mve_pred16_t vcmpltq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s16)))
-mve_pred16_t vcmpneq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s16)))
-mve_pred16_t vcmpneq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s32)))
-mve_pred16_t vcmpneq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s32)))
-mve_pred16_t vcmpneq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s8)))
-mve_pred16_t vcmpneq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_s8)))
-mve_pred16_t vcmpneq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u16)))
-mve_pred16_t vcmpneq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u16)))
-mve_pred16_t vcmpneq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u32)))
-mve_pred16_t vcmpneq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u32)))
-mve_pred16_t vcmpneq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u8)))
-mve_pred16_t vcmpneq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_u8)))
-mve_pred16_t vcmpneq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s16)))
-mve_pred16_t vcmpneq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s16)))
-mve_pred16_t vcmpneq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s32)))
-mve_pred16_t vcmpneq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s32)))
-mve_pred16_t vcmpneq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s8)))
-mve_pred16_t vcmpneq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_s8)))
-mve_pred16_t vcmpneq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u16)))
-mve_pred16_t vcmpneq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u16)))
-mve_pred16_t vcmpneq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u32)))
-mve_pred16_t vcmpneq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u32)))
-mve_pred16_t vcmpneq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u8)))
-mve_pred16_t vcmpneq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_u8)))
-mve_pred16_t vcmpneq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s16)))
-mve_pred16_t vcmpneq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s16)))
-mve_pred16_t vcmpneq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s32)))
-mve_pred16_t vcmpneq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s32)))
-mve_pred16_t vcmpneq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s8)))
-mve_pred16_t vcmpneq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_s8)))
-mve_pred16_t vcmpneq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u16)))
-mve_pred16_t vcmpneq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u16)))
-mve_pred16_t vcmpneq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u32)))
-mve_pred16_t vcmpneq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u32)))
-mve_pred16_t vcmpneq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u8)))
-mve_pred16_t vcmpneq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_u8)))
-mve_pred16_t vcmpneq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s16)))
-mve_pred16_t vcmpneq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s16)))
-mve_pred16_t vcmpneq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s32)))
-mve_pred16_t vcmpneq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s32)))
-mve_pred16_t vcmpneq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s8)))
-mve_pred16_t vcmpneq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_s8)))
-mve_pred16_t vcmpneq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u16)))
-mve_pred16_t vcmpneq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u16)))
-mve_pred16_t vcmpneq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u32)))
-mve_pred16_t vcmpneq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u32)))
-mve_pred16_t vcmpneq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u8)))
-mve_pred16_t vcmpneq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_u8)))
-mve_pred16_t vcmpneq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s16)))
-int16x8_t vcreateq_s16(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s32)))
-int32x4_t vcreateq_s32(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s64)))
-int64x2_t vcreateq_s64(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_s8)))
-int8x16_t vcreateq_s8(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u16)))
-uint16x8_t vcreateq_u16(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u32)))
-uint32x4_t vcreateq_u32(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u64)))
-uint64x2_t vcreateq_u64(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_u8)))
-uint8x16_t vcreateq_u8(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp16q)))
-mve_pred16_t vctp16q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp16q_m)))
-mve_pred16_t vctp16q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp32q)))
-mve_pred16_t vctp32q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp32q_m)))
-mve_pred16_t vctp32q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp64q)))
-mve_pred16_t vctp64q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp64q_m)))
-mve_pred16_t vctp64q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp8q)))
-mve_pred16_t vctp8q(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vctp8q_m)))
-mve_pred16_t vctp8q_m(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u16)))
-uint16x8_t vddupq_m_n_u16(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u16)))
-uint16x8_t vddupq_m(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u32)))
-uint32x4_t vddupq_m_n_u32(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u32)))
-uint32x4_t vddupq_m(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u8)))
-uint8x16_t vddupq_m_n_u8(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_n_u8)))
-uint8x16_t vddupq_m(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u16)))
-uint16x8_t vddupq_m_wb_u16(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u16)))
-uint16x8_t vddupq_m(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u32)))
-uint32x4_t vddupq_m_wb_u32(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u32)))
-uint32x4_t vddupq_m(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u8)))
-uint8x16_t vddupq_m_wb_u8(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_m_wb_u8)))
-uint8x16_t vddupq_m(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u16)))
-uint16x8_t vddupq_n_u16(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u16)))
-uint16x8_t vddupq_u16(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u32)))
-uint32x4_t vddupq_n_u32(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u32)))
-uint32x4_t vddupq_u32(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u8)))
-uint8x16_t vddupq_n_u8(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_n_u8)))
-uint8x16_t vddupq_u8(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u16)))
-uint16x8_t vddupq_wb_u16(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u16)))
-uint16x8_t vddupq_u16(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u32)))
-uint32x4_t vddupq_wb_u32(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u32)))
-uint32x4_t vddupq_u32(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u8)))
-uint8x16_t vddupq_wb_u8(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_wb_u8)))
-uint8x16_t vddupq_u8(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u16)))
-uint16x8_t vddupq_x_n_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u16)))
-uint16x8_t vddupq_x_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u32)))
-uint32x4_t vddupq_x_n_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u32)))
-uint32x4_t vddupq_x_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u8)))
-uint8x16_t vddupq_x_n_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_n_u8)))
-uint8x16_t vddupq_x_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u16)))
-uint16x8_t vddupq_x_wb_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u16)))
-uint16x8_t vddupq_x_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u32)))
-uint32x4_t vddupq_x_wb_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u32)))
-uint32x4_t vddupq_x_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u8)))
-uint8x16_t vddupq_x_wb_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vddupq_x_wb_u8)))
-uint8x16_t vddupq_x_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s16)))
-int16x8_t vdupq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s16)))
-int16x8_t vdupq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s32)))
-int32x4_t vdupq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s32)))
-int32x4_t vdupq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s8)))
-int8x16_t vdupq_m_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_s8)))
-int8x16_t vdupq_m(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u16)))
-uint16x8_t vdupq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u16)))
-uint16x8_t vdupq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u32)))
-uint32x4_t vdupq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u32)))
-uint32x4_t vdupq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u8)))
-uint8x16_t vdupq_m_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_u8)))
-uint8x16_t vdupq_m(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_s16)))
-int16x8_t vdupq_n_s16(int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_s32)))
-int32x4_t vdupq_n_s32(int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_s8)))
-int8x16_t vdupq_n_s8(int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_u16)))
-uint16x8_t vdupq_n_u16(uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_u32)))
-uint32x4_t vdupq_n_u32(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_u8)))
-uint8x16_t vdupq_n_u8(uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_s16)))
-int16x8_t vdupq_x_n_s16(int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_s32)))
-int32x4_t vdupq_x_n_s32(int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_s8)))
-int8x16_t vdupq_x_n_s8(int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_u16)))
-uint16x8_t vdupq_x_n_u16(uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_u32)))
-uint32x4_t vdupq_x_n_u32(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_u8)))
-uint8x16_t vdupq_x_n_u8(uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u16)))
-uint16x8_t vdwdupq_m_n_u16(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u16)))
-uint16x8_t vdwdupq_m(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u32)))
-uint32x4_t vdwdupq_m_n_u32(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u32)))
-uint32x4_t vdwdupq_m(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u8)))
-uint8x16_t vdwdupq_m_n_u8(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_n_u8)))
-uint8x16_t vdwdupq_m(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u16)))
-uint16x8_t vdwdupq_m_wb_u16(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u16)))
-uint16x8_t vdwdupq_m(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u32)))
-uint32x4_t vdwdupq_m_wb_u32(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u32)))
-uint32x4_t vdwdupq_m(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u8)))
-uint8x16_t vdwdupq_m_wb_u8(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_m_wb_u8)))
-uint8x16_t vdwdupq_m(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u16)))
-uint16x8_t vdwdupq_n_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u16)))
-uint16x8_t vdwdupq_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u32)))
-uint32x4_t vdwdupq_n_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u32)))
-uint32x4_t vdwdupq_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u8)))
-uint8x16_t vdwdupq_n_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_n_u8)))
-uint8x16_t vdwdupq_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u16)))
-uint16x8_t vdwdupq_wb_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u16)))
-uint16x8_t vdwdupq_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u32)))
-uint32x4_t vdwdupq_wb_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u32)))
-uint32x4_t vdwdupq_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u8)))
-uint8x16_t vdwdupq_wb_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_wb_u8)))
-uint8x16_t vdwdupq_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u16)))
-uint16x8_t vdwdupq_x_n_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u16)))
-uint16x8_t vdwdupq_x_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u32)))
-uint32x4_t vdwdupq_x_n_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u32)))
-uint32x4_t vdwdupq_x_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u8)))
-uint8x16_t vdwdupq_x_n_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_n_u8)))
-uint8x16_t vdwdupq_x_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u16)))
-uint16x8_t vdwdupq_x_wb_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u16)))
-uint16x8_t vdwdupq_x_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u32)))
-uint32x4_t vdwdupq_x_wb_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u32)))
-uint32x4_t vdwdupq_x_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u8)))
-uint8x16_t vdwdupq_x_wb_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdwdupq_x_wb_u8)))
-uint8x16_t vdwdupq_x_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s16)))
-int16x8_t veorq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s16)))
-int16x8_t veorq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s32)))
-int32x4_t veorq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s32)))
-int32x4_t veorq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s8)))
-int8x16_t veorq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_s8)))
-int8x16_t veorq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u16)))
-uint16x8_t veorq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u16)))
-uint16x8_t veorq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u32)))
-uint32x4_t veorq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u32)))
-uint32x4_t veorq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u8)))
-uint8x16_t veorq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_u8)))
-uint8x16_t veorq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_s16)))
-int16x8_t veorq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_s16)))
-int16x8_t veorq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_s32)))
-int32x4_t veorq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_s32)))
-int32x4_t veorq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_s8)))
-int8x16_t veorq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_s8)))
-int8x16_t veorq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_u16)))
-uint16x8_t veorq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_u16)))
-uint16x8_t veorq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_u32)))
-uint32x4_t veorq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_u32)))
-uint32x4_t veorq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_u8)))
-uint8x16_t veorq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_u8)))
-uint8x16_t veorq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s16)))
-int16x8_t veorq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s16)))
-int16x8_t veorq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s32)))
-int32x4_t veorq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s32)))
-int32x4_t veorq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s8)))
-int8x16_t veorq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_s8)))
-int8x16_t veorq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u16)))
-uint16x8_t veorq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u16)))
-uint16x8_t veorq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u32)))
-uint32x4_t veorq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u32)))
-uint32x4_t veorq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u8)))
-uint8x16_t veorq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_u8)))
-uint8x16_t veorq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s16)))
-int16_t vgetq_lane_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s16)))
-int16_t vgetq_lane(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s32)))
-int32_t vgetq_lane_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s32)))
-int32_t vgetq_lane(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s64)))
-int64_t vgetq_lane_s64(int64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s64)))
-int64_t vgetq_lane(int64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s8)))
-int8_t vgetq_lane_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_s8)))
-int8_t vgetq_lane(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u16)))
-uint16_t vgetq_lane_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u16)))
-uint16_t vgetq_lane(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u32)))
-uint32_t vgetq_lane_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u32)))
-uint32_t vgetq_lane(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u64)))
-uint64_t vgetq_lane_u64(uint64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u64)))
-uint64_t vgetq_lane(uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u8)))
-uint8_t vgetq_lane_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_u8)))
-uint8_t vgetq_lane(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s16)))
-int16x8_t vhaddq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s16)))
-int16x8_t vhaddq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s32)))
-int32x4_t vhaddq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s32)))
-int32x4_t vhaddq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s8)))
-int8x16_t vhaddq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_s8)))
-int8x16_t vhaddq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u16)))
-uint16x8_t vhaddq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u16)))
-uint16x8_t vhaddq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u32)))
-uint32x4_t vhaddq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u32)))
-uint32x4_t vhaddq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u8)))
-uint8x16_t vhaddq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_n_u8)))
-uint8x16_t vhaddq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s16)))
-int16x8_t vhaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s16)))
-int16x8_t vhaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s32)))
-int32x4_t vhaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s32)))
-int32x4_t vhaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s8)))
-int8x16_t vhaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_s8)))
-int8x16_t vhaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u16)))
-uint16x8_t vhaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u16)))
-uint16x8_t vhaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u32)))
-uint32x4_t vhaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u32)))
-uint32x4_t vhaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u8)))
-uint8x16_t vhaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_m_u8)))
-uint8x16_t vhaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s16)))
-int16x8_t vhaddq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s16)))
-int16x8_t vhaddq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s32)))
-int32x4_t vhaddq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s32)))
-int32x4_t vhaddq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s8)))
-int8x16_t vhaddq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_s8)))
-int8x16_t vhaddq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u16)))
-uint16x8_t vhaddq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u16)))
-uint16x8_t vhaddq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u32)))
-uint32x4_t vhaddq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u32)))
-uint32x4_t vhaddq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u8)))
-uint8x16_t vhaddq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_n_u8)))
-uint8x16_t vhaddq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s16)))
-int16x8_t vhaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s16)))
-int16x8_t vhaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s32)))
-int32x4_t vhaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s32)))
-int32x4_t vhaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s8)))
-int8x16_t vhaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_s8)))
-int8x16_t vhaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u16)))
-uint16x8_t vhaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u16)))
-uint16x8_t vhaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u32)))
-uint32x4_t vhaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u32)))
-uint32x4_t vhaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u8)))
-uint8x16_t vhaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_u8)))
-uint8x16_t vhaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s16)))
-int16x8_t vhaddq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s16)))
-int16x8_t vhaddq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s32)))
-int32x4_t vhaddq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s32)))
-int32x4_t vhaddq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s8)))
-int8x16_t vhaddq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_s8)))
-int8x16_t vhaddq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u16)))
-uint16x8_t vhaddq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u16)))
-uint16x8_t vhaddq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u32)))
-uint32x4_t vhaddq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u32)))
-uint32x4_t vhaddq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u8)))
-uint8x16_t vhaddq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_n_u8)))
-uint8x16_t vhaddq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s16)))
-int16x8_t vhaddq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s16)))
-int16x8_t vhaddq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s32)))
-int32x4_t vhaddq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s32)))
-int32x4_t vhaddq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s8)))
-int8x16_t vhaddq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_s8)))
-int8x16_t vhaddq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u16)))
-uint16x8_t vhaddq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u16)))
-uint16x8_t vhaddq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u32)))
-uint32x4_t vhaddq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u32)))
-uint32x4_t vhaddq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u8)))
-uint8x16_t vhaddq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhaddq_x_u8)))
-uint8x16_t vhaddq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s16)))
-int16x8_t vhcaddq_rot270_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s16)))
-int16x8_t vhcaddq_rot270_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s32)))
-int32x4_t vhcaddq_rot270_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s32)))
-int32x4_t vhcaddq_rot270_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s8)))
-int8x16_t vhcaddq_rot270_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_m_s8)))
-int8x16_t vhcaddq_rot270_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s16)))
-int16x8_t vhcaddq_rot270_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s16)))
-int16x8_t vhcaddq_rot270(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s32)))
-int32x4_t vhcaddq_rot270_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s32)))
-int32x4_t vhcaddq_rot270(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s8)))
-int8x16_t vhcaddq_rot270_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_s8)))
-int8x16_t vhcaddq_rot270(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s16)))
-int16x8_t vhcaddq_rot270_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s16)))
-int16x8_t vhcaddq_rot270_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s32)))
-int32x4_t vhcaddq_rot270_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s32)))
-int32x4_t vhcaddq_rot270_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s8)))
-int8x16_t vhcaddq_rot270_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot270_x_s8)))
-int8x16_t vhcaddq_rot270_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s16)))
-int16x8_t vhcaddq_rot90_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s16)))
-int16x8_t vhcaddq_rot90_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s32)))
-int32x4_t vhcaddq_rot90_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s32)))
-int32x4_t vhcaddq_rot90_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s8)))
-int8x16_t vhcaddq_rot90_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_m_s8)))
-int8x16_t vhcaddq_rot90_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s16)))
-int16x8_t vhcaddq_rot90_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s16)))
-int16x8_t vhcaddq_rot90(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s32)))
-int32x4_t vhcaddq_rot90_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s32)))
-int32x4_t vhcaddq_rot90(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s8)))
-int8x16_t vhcaddq_rot90_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_s8)))
-int8x16_t vhcaddq_rot90(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s16)))
-int16x8_t vhcaddq_rot90_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s16)))
-int16x8_t vhcaddq_rot90_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s32)))
-int32x4_t vhcaddq_rot90_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s32)))
-int32x4_t vhcaddq_rot90_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s8)))
-int8x16_t vhcaddq_rot90_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhcaddq_rot90_x_s8)))
-int8x16_t vhcaddq_rot90_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s16)))
-int16x8_t vhsubq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s16)))
-int16x8_t vhsubq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s32)))
-int32x4_t vhsubq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s32)))
-int32x4_t vhsubq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s8)))
-int8x16_t vhsubq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_s8)))
-int8x16_t vhsubq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u16)))
-uint16x8_t vhsubq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u16)))
-uint16x8_t vhsubq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u32)))
-uint32x4_t vhsubq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u32)))
-uint32x4_t vhsubq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u8)))
-uint8x16_t vhsubq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_n_u8)))
-uint8x16_t vhsubq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s16)))
-int16x8_t vhsubq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s16)))
-int16x8_t vhsubq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s32)))
-int32x4_t vhsubq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s32)))
-int32x4_t vhsubq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s8)))
-int8x16_t vhsubq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_s8)))
-int8x16_t vhsubq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u16)))
-uint16x8_t vhsubq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u16)))
-uint16x8_t vhsubq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u32)))
-uint32x4_t vhsubq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u32)))
-uint32x4_t vhsubq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u8)))
-uint8x16_t vhsubq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_m_u8)))
-uint8x16_t vhsubq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s16)))
-int16x8_t vhsubq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s16)))
-int16x8_t vhsubq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s32)))
-int32x4_t vhsubq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s32)))
-int32x4_t vhsubq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s8)))
-int8x16_t vhsubq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_s8)))
-int8x16_t vhsubq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u16)))
-uint16x8_t vhsubq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u16)))
-uint16x8_t vhsubq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u32)))
-uint32x4_t vhsubq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u32)))
-uint32x4_t vhsubq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u8)))
-uint8x16_t vhsubq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_n_u8)))
-uint8x16_t vhsubq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s16)))
-int16x8_t vhsubq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s16)))
-int16x8_t vhsubq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s32)))
-int32x4_t vhsubq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s32)))
-int32x4_t vhsubq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s8)))
-int8x16_t vhsubq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_s8)))
-int8x16_t vhsubq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u16)))
-uint16x8_t vhsubq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u16)))
-uint16x8_t vhsubq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u32)))
-uint32x4_t vhsubq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u32)))
-uint32x4_t vhsubq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u8)))
-uint8x16_t vhsubq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_u8)))
-uint8x16_t vhsubq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s16)))
-int16x8_t vhsubq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s16)))
-int16x8_t vhsubq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s32)))
-int32x4_t vhsubq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s32)))
-int32x4_t vhsubq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s8)))
-int8x16_t vhsubq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_s8)))
-int8x16_t vhsubq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u16)))
-uint16x8_t vhsubq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u16)))
-uint16x8_t vhsubq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u32)))
-uint32x4_t vhsubq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u32)))
-uint32x4_t vhsubq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u8)))
-uint8x16_t vhsubq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_n_u8)))
-uint8x16_t vhsubq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s16)))
-int16x8_t vhsubq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s16)))
-int16x8_t vhsubq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s32)))
-int32x4_t vhsubq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s32)))
-int32x4_t vhsubq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s8)))
-int8x16_t vhsubq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_s8)))
-int8x16_t vhsubq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u16)))
-uint16x8_t vhsubq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u16)))
-uint16x8_t vhsubq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u32)))
-uint32x4_t vhsubq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u32)))
-uint32x4_t vhsubq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u8)))
-uint8x16_t vhsubq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vhsubq_x_u8)))
-uint8x16_t vhsubq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u16)))
-uint16x8_t vidupq_m_n_u16(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u16)))
-uint16x8_t vidupq_m(uint16x8_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u32)))
-uint32x4_t vidupq_m_n_u32(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u32)))
-uint32x4_t vidupq_m(uint32x4_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u8)))
-uint8x16_t vidupq_m_n_u8(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_n_u8)))
-uint8x16_t vidupq_m(uint8x16_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u16)))
-uint16x8_t vidupq_m_wb_u16(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u16)))
-uint16x8_t vidupq_m(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u32)))
-uint32x4_t vidupq_m_wb_u32(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u32)))
-uint32x4_t vidupq_m(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u8)))
-uint8x16_t vidupq_m_wb_u8(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_m_wb_u8)))
-uint8x16_t vidupq_m(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u16)))
-uint16x8_t vidupq_n_u16(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u16)))
-uint16x8_t vidupq_u16(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u32)))
-uint32x4_t vidupq_n_u32(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u32)))
-uint32x4_t vidupq_u32(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u8)))
-uint8x16_t vidupq_n_u8(uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_n_u8)))
-uint8x16_t vidupq_u8(uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u16)))
-uint16x8_t vidupq_wb_u16(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u16)))
-uint16x8_t vidupq_u16(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u32)))
-uint32x4_t vidupq_wb_u32(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u32)))
-uint32x4_t vidupq_u32(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u8)))
-uint8x16_t vidupq_wb_u8(uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_wb_u8)))
-uint8x16_t vidupq_u8(uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u16)))
-uint16x8_t vidupq_x_n_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u16)))
-uint16x8_t vidupq_x_u16(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u32)))
-uint32x4_t vidupq_x_n_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u32)))
-uint32x4_t vidupq_x_u32(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u8)))
-uint8x16_t vidupq_x_n_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_n_u8)))
-uint8x16_t vidupq_x_u8(uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u16)))
-uint16x8_t vidupq_x_wb_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u16)))
-uint16x8_t vidupq_x_u16(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u32)))
-uint32x4_t vidupq_x_wb_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u32)))
-uint32x4_t vidupq_x_u32(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u8)))
-uint8x16_t vidupq_x_wb_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vidupq_x_wb_u8)))
-uint8x16_t vidupq_x_u8(uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u16)))
-uint16x8_t viwdupq_m_n_u16(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u16)))
-uint16x8_t viwdupq_m(uint16x8_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u32)))
-uint32x4_t viwdupq_m_n_u32(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u32)))
-uint32x4_t viwdupq_m(uint32x4_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u8)))
-uint8x16_t viwdupq_m_n_u8(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_n_u8)))
-uint8x16_t viwdupq_m(uint8x16_t, uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u16)))
-uint16x8_t viwdupq_m_wb_u16(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u16)))
-uint16x8_t viwdupq_m(uint16x8_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u32)))
-uint32x4_t viwdupq_m_wb_u32(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u32)))
-uint32x4_t viwdupq_m(uint32x4_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u8)))
-uint8x16_t viwdupq_m_wb_u8(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_m_wb_u8)))
-uint8x16_t viwdupq_m(uint8x16_t, uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u16)))
-uint16x8_t viwdupq_n_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u16)))
-uint16x8_t viwdupq_u16(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u32)))
-uint32x4_t viwdupq_n_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u32)))
-uint32x4_t viwdupq_u32(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u8)))
-uint8x16_t viwdupq_n_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_n_u8)))
-uint8x16_t viwdupq_u8(uint32_t, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u16)))
-uint16x8_t viwdupq_wb_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u16)))
-uint16x8_t viwdupq_u16(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u32)))
-uint32x4_t viwdupq_wb_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u32)))
-uint32x4_t viwdupq_u32(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u8)))
-uint8x16_t viwdupq_wb_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_wb_u8)))
-uint8x16_t viwdupq_u8(uint32_t *, uint32_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u16)))
-uint16x8_t viwdupq_x_n_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u16)))
-uint16x8_t viwdupq_x_u16(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u32)))
-uint32x4_t viwdupq_x_n_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u32)))
-uint32x4_t viwdupq_x_u32(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u8)))
-uint8x16_t viwdupq_x_n_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_n_u8)))
-uint8x16_t viwdupq_x_u8(uint32_t, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u16)))
-uint16x8_t viwdupq_x_wb_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u16)))
-uint16x8_t viwdupq_x_u16(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u32)))
-uint32x4_t viwdupq_x_wb_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u32)))
-uint32x4_t viwdupq_x_u32(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u8)))
-uint8x16_t viwdupq_x_wb_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_viwdupq_x_wb_u8)))
-uint8x16_t viwdupq_x_u8(uint32_t *, uint32_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s16)))
-int16x8_t vld1q_s16(const int16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s16)))
-int16x8_t vld1q(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s32)))
-int32x4_t vld1q_s32(const int32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s32)))
-int32x4_t vld1q(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s8)))
-int8x16_t vld1q_s8(const int8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_s8)))
-int8x16_t vld1q(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u16)))
-uint16x8_t vld1q_u16(const uint16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u16)))
-uint16x8_t vld1q(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u32)))
-uint32x4_t vld1q_u32(const uint32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u32)))
-uint32x4_t vld1q(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u8)))
-uint8x16_t vld1q_u8(const uint8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_u8)))
-uint8x16_t vld1q(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s16)))
-int16x8_t vld1q_z_s16(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s16)))
-int16x8_t vld1q_z(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s32)))
-int32x4_t vld1q_z_s32(const int32_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s32)))
-int32x4_t vld1q_z(const int32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s8)))
-int8x16_t vld1q_z_s8(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_s8)))
-int8x16_t vld1q_z(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u16)))
-uint16x8_t vld1q_z_u16(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u16)))
-uint16x8_t vld1q_z(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u32)))
-uint32x4_t vld1q_z_u32(const uint32_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u32)))
-uint32x4_t vld1q_z(const uint32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u8)))
-uint8x16_t vld1q_z_u8(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_u8)))
-uint8x16_t vld1q_z(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s16)))
-int16x8x2_t vld2q_s16(const int16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s16)))
-int16x8x2_t vld2q(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s32)))
-int32x4x2_t vld2q_s32(const int32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s32)))
-int32x4x2_t vld2q(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s8)))
-int8x16x2_t vld2q_s8(const int8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_s8)))
-int8x16x2_t vld2q(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u16)))
-uint16x8x2_t vld2q_u16(const uint16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u16)))
-uint16x8x2_t vld2q(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u32)))
-uint32x4x2_t vld2q_u32(const uint32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u32)))
-uint32x4x2_t vld2q(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u8)))
-uint8x16x2_t vld2q_u8(const uint8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_u8)))
-uint8x16x2_t vld2q(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s16)))
-int16x8x4_t vld4q_s16(const int16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s16)))
-int16x8x4_t vld4q(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s32)))
-int32x4x4_t vld4q_s32(const int32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s32)))
-int32x4x4_t vld4q(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s8)))
-int8x16x4_t vld4q_s8(const int8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_s8)))
-int8x16x4_t vld4q(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u16)))
-uint16x8x4_t vld4q_u16(const uint16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u16)))
-uint16x8x4_t vld4q(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u32)))
-uint32x4x4_t vld4q_u32(const uint32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u32)))
-uint32x4x4_t vld4q(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u8)))
-uint8x16x4_t vld4q_u8(const uint8_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_u8)))
-uint8x16x4_t vld4q(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s16)))
-int16x8_t vldrbq_gather_offset_s16(const int8_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s16)))
-int16x8_t vldrbq_gather_offset(const int8_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s32)))
-int32x4_t vldrbq_gather_offset_s32(const int8_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s32)))
-int32x4_t vldrbq_gather_offset(const int8_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s8)))
-int8x16_t vldrbq_gather_offset_s8(const int8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_s8)))
-int8x16_t vldrbq_gather_offset(const int8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u16)))
-uint16x8_t vldrbq_gather_offset_u16(const uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u16)))
-uint16x8_t vldrbq_gather_offset(const uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u32)))
-uint32x4_t vldrbq_gather_offset_u32(const uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u32)))
-uint32x4_t vldrbq_gather_offset(const uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u8)))
-uint8x16_t vldrbq_gather_offset_u8(const uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_u8)))
-uint8x16_t vldrbq_gather_offset(const uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s16)))
-int16x8_t vldrbq_gather_offset_z_s16(const int8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s16)))
-int16x8_t vldrbq_gather_offset_z(const int8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s32)))
-int32x4_t vldrbq_gather_offset_z_s32(const int8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s32)))
-int32x4_t vldrbq_gather_offset_z(const int8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s8)))
-int8x16_t vldrbq_gather_offset_z_s8(const int8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_s8)))
-int8x16_t vldrbq_gather_offset_z(const int8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u16)))
-uint16x8_t vldrbq_gather_offset_z_u16(const uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u16)))
-uint16x8_t vldrbq_gather_offset_z(const uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u32)))
-uint32x4_t vldrbq_gather_offset_z_u32(const uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u32)))
-uint32x4_t vldrbq_gather_offset_z(const uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u8)))
-uint8x16_t vldrbq_gather_offset_z_u8(const uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_gather_offset_z_u8)))
-uint8x16_t vldrbq_gather_offset_z(const uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_s16)))
-int16x8_t vldrbq_s16(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_s32)))
-int32x4_t vldrbq_s32(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_s8)))
-int8x16_t vldrbq_s8(const int8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_u16)))
-uint16x8_t vldrbq_u16(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_u32)))
-uint32x4_t vldrbq_u32(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_u8)))
-uint8x16_t vldrbq_u8(const uint8_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_s16)))
-int16x8_t vldrbq_z_s16(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_s32)))
-int32x4_t vldrbq_z_s32(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_s8)))
-int8x16_t vldrbq_z_s8(const int8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_u16)))
-uint16x8_t vldrbq_z_u16(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_u32)))
-uint32x4_t vldrbq_z_u32(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrbq_z_u8)))
-uint8x16_t vldrbq_z_u8(const uint8_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_s64)))
-int64x2_t vldrdq_gather_base_s64(uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_u64)))
-uint64x2_t vldrdq_gather_base_u64(uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_s64)))
-int64x2_t vldrdq_gather_base_wb_s64(uint64x2_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_u64)))
-uint64x2_t vldrdq_gather_base_wb_u64(uint64x2_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_z_s64)))
-int64x2_t vldrdq_gather_base_wb_z_s64(uint64x2_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_wb_z_u64)))
-uint64x2_t vldrdq_gather_base_wb_z_u64(uint64x2_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_z_s64)))
-int64x2_t vldrdq_gather_base_z_s64(uint64x2_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_base_z_u64)))
-uint64x2_t vldrdq_gather_base_z_u64(uint64x2_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_s64)))
-int64x2_t vldrdq_gather_offset_s64(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_s64)))
-int64x2_t vldrdq_gather_offset(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_u64)))
-uint64x2_t vldrdq_gather_offset_u64(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_u64)))
-uint64x2_t vldrdq_gather_offset(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_s64)))
-int64x2_t vldrdq_gather_offset_z_s64(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_s64)))
-int64x2_t vldrdq_gather_offset_z(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_u64)))
-uint64x2_t vldrdq_gather_offset_z_u64(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_offset_z_u64)))
-uint64x2_t vldrdq_gather_offset_z(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_s64)))
-int64x2_t vldrdq_gather_shifted_offset_s64(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_s64)))
-int64x2_t vldrdq_gather_shifted_offset(const int64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_u64)))
-uint64x2_t vldrdq_gather_shifted_offset_u64(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_u64)))
-uint64x2_t vldrdq_gather_shifted_offset(const uint64_t *, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_s64)))
-int64x2_t vldrdq_gather_shifted_offset_z_s64(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_s64)))
-int64x2_t vldrdq_gather_shifted_offset_z(const int64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_u64)))
-uint64x2_t vldrdq_gather_shifted_offset_z_u64(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrdq_gather_shifted_offset_z_u64)))
-uint64x2_t vldrdq_gather_shifted_offset_z(const uint64_t *, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s16)))
-int16x8_t vldrhq_gather_offset_s16(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s16)))
-int16x8_t vldrhq_gather_offset(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s32)))
-int32x4_t vldrhq_gather_offset_s32(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_s32)))
-int32x4_t vldrhq_gather_offset(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u16)))
-uint16x8_t vldrhq_gather_offset_u16(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u16)))
-uint16x8_t vldrhq_gather_offset(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u32)))
-uint32x4_t vldrhq_gather_offset_u32(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_u32)))
-uint32x4_t vldrhq_gather_offset(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s16)))
-int16x8_t vldrhq_gather_offset_z_s16(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s16)))
-int16x8_t vldrhq_gather_offset_z(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s32)))
-int32x4_t vldrhq_gather_offset_z_s32(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_s32)))
-int32x4_t vldrhq_gather_offset_z(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u16)))
-uint16x8_t vldrhq_gather_offset_z_u16(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u16)))
-uint16x8_t vldrhq_gather_offset_z(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u32)))
-uint32x4_t vldrhq_gather_offset_z_u32(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_u32)))
-uint32x4_t vldrhq_gather_offset_z(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s16)))
-int16x8_t vldrhq_gather_shifted_offset_s16(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s16)))
-int16x8_t vldrhq_gather_shifted_offset(const int16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s32)))
-int32x4_t vldrhq_gather_shifted_offset_s32(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_s32)))
-int32x4_t vldrhq_gather_shifted_offset(const int16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u16)))
-uint16x8_t vldrhq_gather_shifted_offset_u16(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u16)))
-uint16x8_t vldrhq_gather_shifted_offset(const uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u32)))
-uint32x4_t vldrhq_gather_shifted_offset_u32(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_u32)))
-uint32x4_t vldrhq_gather_shifted_offset(const uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s16)))
-int16x8_t vldrhq_gather_shifted_offset_z_s16(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s16)))
-int16x8_t vldrhq_gather_shifted_offset_z(const int16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s32)))
-int32x4_t vldrhq_gather_shifted_offset_z_s32(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_s32)))
-int32x4_t vldrhq_gather_shifted_offset_z(const int16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u16)))
-uint16x8_t vldrhq_gather_shifted_offset_z_u16(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u16)))
-uint16x8_t vldrhq_gather_shifted_offset_z(const uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u32)))
-uint32x4_t vldrhq_gather_shifted_offset_z_u32(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_u32)))
-uint32x4_t vldrhq_gather_shifted_offset_z(const uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_s16)))
-int16x8_t vldrhq_s16(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_s32)))
-int32x4_t vldrhq_s32(const int16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_u16)))
-uint16x8_t vldrhq_u16(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_u32)))
-uint32x4_t vldrhq_u32(const uint16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_s16)))
-int16x8_t vldrhq_z_s16(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_s32)))
-int32x4_t vldrhq_z_s32(const int16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_u16)))
-uint16x8_t vldrhq_z_u16(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_u32)))
-uint32x4_t vldrhq_z_u32(const uint16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_s32)))
-int32x4_t vldrwq_gather_base_s32(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_u32)))
-uint32x4_t vldrwq_gather_base_u32(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_s32)))
-int32x4_t vldrwq_gather_base_wb_s32(uint32x4_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_u32)))
-uint32x4_t vldrwq_gather_base_wb_u32(uint32x4_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_z_s32)))
-int32x4_t vldrwq_gather_base_wb_z_s32(uint32x4_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_z_u32)))
-uint32x4_t vldrwq_gather_base_wb_z_u32(uint32x4_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_z_s32)))
-int32x4_t vldrwq_gather_base_z_s32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_z_u32)))
-uint32x4_t vldrwq_gather_base_z_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_s32)))
-int32x4_t vldrwq_gather_offset_s32(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_s32)))
-int32x4_t vldrwq_gather_offset(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_u32)))
-uint32x4_t vldrwq_gather_offset_u32(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_u32)))
-uint32x4_t vldrwq_gather_offset(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_s32)))
-int32x4_t vldrwq_gather_offset_z_s32(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_s32)))
-int32x4_t vldrwq_gather_offset_z(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_u32)))
-uint32x4_t vldrwq_gather_offset_z_u32(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_u32)))
-uint32x4_t vldrwq_gather_offset_z(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_s32)))
-int32x4_t vldrwq_gather_shifted_offset_s32(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_s32)))
-int32x4_t vldrwq_gather_shifted_offset(const int32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_u32)))
-uint32x4_t vldrwq_gather_shifted_offset_u32(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_u32)))
-uint32x4_t vldrwq_gather_shifted_offset(const uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_s32)))
-int32x4_t vldrwq_gather_shifted_offset_z_s32(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_s32)))
-int32x4_t vldrwq_gather_shifted_offset_z(const int32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_u32)))
-uint32x4_t vldrwq_gather_shifted_offset_z_u32(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_u32)))
-uint32x4_t vldrwq_gather_shifted_offset_z(const uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_s32)))
-int32x4_t vldrwq_s32(const int32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_u32)))
-uint32x4_t vldrwq_u32(const uint32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_z_s32)))
-int32x4_t vldrwq_z_s32(const int32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_z_u32)))
-uint32x4_t vldrwq_z_u32(const uint32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s16)))
-uint16x8_t vmaxaq_m_s16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s16)))
-uint16x8_t vmaxaq_m(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s32)))
-uint32x4_t vmaxaq_m_s32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s32)))
-uint32x4_t vmaxaq_m(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s8)))
-uint8x16_t vmaxaq_m_s8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_m_s8)))
-uint8x16_t vmaxaq_m(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s16)))
-uint16x8_t vmaxaq_s16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s16)))
-uint16x8_t vmaxaq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s32)))
-uint32x4_t vmaxaq_s32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s32)))
-uint32x4_t vmaxaq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s8)))
-uint8x16_t vmaxaq_s8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxaq_s8)))
-uint8x16_t vmaxaq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s16)))
-uint16_t vmaxavq_p_s16(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s16)))
-uint16_t vmaxavq_p(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s32)))
-uint32_t vmaxavq_p_s32(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s32)))
-uint32_t vmaxavq_p(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s8)))
-uint8_t vmaxavq_p_s8(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_p_s8)))
-uint8_t vmaxavq_p(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s16)))
-uint16_t vmaxavq_s16(uint16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s16)))
-uint16_t vmaxavq(uint16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s32)))
-uint32_t vmaxavq_s32(uint32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s32)))
-uint32_t vmaxavq(uint32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s8)))
-uint8_t vmaxavq_s8(uint8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxavq_s8)))
-uint8_t vmaxavq(uint8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s16)))
-int16x8_t vmaxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s16)))
-int16x8_t vmaxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s32)))
-int32x4_t vmaxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s32)))
-int32x4_t vmaxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s8)))
-int8x16_t vmaxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_s8)))
-int8x16_t vmaxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u16)))
-uint16x8_t vmaxq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u16)))
-uint16x8_t vmaxq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u32)))
-uint32x4_t vmaxq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u32)))
-uint32x4_t vmaxq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u8)))
-uint8x16_t vmaxq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_m_u8)))
-uint8x16_t vmaxq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s16)))
-int16x8_t vmaxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s16)))
-int16x8_t vmaxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s32)))
-int32x4_t vmaxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s32)))
-int32x4_t vmaxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s8)))
-int8x16_t vmaxq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_s8)))
-int8x16_t vmaxq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u16)))
-uint16x8_t vmaxq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u16)))
-uint16x8_t vmaxq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u32)))
-uint32x4_t vmaxq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u32)))
-uint32x4_t vmaxq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u8)))
-uint8x16_t vmaxq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_u8)))
-uint8x16_t vmaxq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s16)))
-int16x8_t vmaxq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s16)))
-int16x8_t vmaxq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s32)))
-int32x4_t vmaxq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s32)))
-int32x4_t vmaxq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s8)))
-int8x16_t vmaxq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_s8)))
-int8x16_t vmaxq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u16)))
-uint16x8_t vmaxq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u16)))
-uint16x8_t vmaxq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u32)))
-uint32x4_t vmaxq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u32)))
-uint32x4_t vmaxq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u8)))
-uint8x16_t vmaxq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxq_x_u8)))
-uint8x16_t vmaxq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s16)))
-int16_t vmaxvq_p_s16(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s16)))
-int16_t vmaxvq_p(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s32)))
-int32_t vmaxvq_p_s32(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s32)))
-int32_t vmaxvq_p(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s8)))
-int8_t vmaxvq_p_s8(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_s8)))
-int8_t vmaxvq_p(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u16)))
-uint16_t vmaxvq_p_u16(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u16)))
-uint16_t vmaxvq_p(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u32)))
-uint32_t vmaxvq_p_u32(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u32)))
-uint32_t vmaxvq_p(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u8)))
-uint8_t vmaxvq_p_u8(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_p_u8)))
-uint8_t vmaxvq_p(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s16)))
-int16_t vmaxvq_s16(int16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s16)))
-int16_t vmaxvq(int16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s32)))
-int32_t vmaxvq_s32(int32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s32)))
-int32_t vmaxvq(int32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s8)))
-int8_t vmaxvq_s8(int8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_s8)))
-int8_t vmaxvq(int8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u16)))
-uint16_t vmaxvq_u16(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u16)))
-uint16_t vmaxvq(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u32)))
-uint32_t vmaxvq_u32(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u32)))
-uint32_t vmaxvq(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u8)))
-uint8_t vmaxvq_u8(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxvq_u8)))
-uint8_t vmaxvq(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s16)))
-uint16x8_t vminaq_m_s16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s16)))
-uint16x8_t vminaq_m(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s32)))
-uint32x4_t vminaq_m_s32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s32)))
-uint32x4_t vminaq_m(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s8)))
-uint8x16_t vminaq_m_s8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_m_s8)))
-uint8x16_t vminaq_m(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s16)))
-uint16x8_t vminaq_s16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s16)))
-uint16x8_t vminaq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s32)))
-uint32x4_t vminaq_s32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s32)))
-uint32x4_t vminaq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s8)))
-uint8x16_t vminaq_s8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminaq_s8)))
-uint8x16_t vminaq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s16)))
-uint16_t vminavq_p_s16(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s16)))
-uint16_t vminavq_p(uint16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s32)))
-uint32_t vminavq_p_s32(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s32)))
-uint32_t vminavq_p(uint32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s8)))
-uint8_t vminavq_p_s8(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_p_s8)))
-uint8_t vminavq_p(uint8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s16)))
-uint16_t vminavq_s16(uint16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s16)))
-uint16_t vminavq(uint16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s32)))
-uint32_t vminavq_s32(uint32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s32)))
-uint32_t vminavq(uint32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s8)))
-uint8_t vminavq_s8(uint8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminavq_s8)))
-uint8_t vminavq(uint8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s16)))
-int16x8_t vminq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s16)))
-int16x8_t vminq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s32)))
-int32x4_t vminq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s32)))
-int32x4_t vminq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s8)))
-int8x16_t vminq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_s8)))
-int8x16_t vminq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u16)))
-uint16x8_t vminq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u16)))
-uint16x8_t vminq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u32)))
-uint32x4_t vminq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u32)))
-uint32x4_t vminq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u8)))
-uint8x16_t vminq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_m_u8)))
-uint8x16_t vminq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_s16)))
-int16x8_t vminq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_s16)))
-int16x8_t vminq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_s32)))
-int32x4_t vminq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_s32)))
-int32x4_t vminq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_s8)))
-int8x16_t vminq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_s8)))
-int8x16_t vminq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_u16)))
-uint16x8_t vminq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_u16)))
-uint16x8_t vminq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_u32)))
-uint32x4_t vminq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_u32)))
-uint32x4_t vminq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_u8)))
-uint8x16_t vminq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_u8)))
-uint8x16_t vminq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s16)))
-int16x8_t vminq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s16)))
-int16x8_t vminq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s32)))
-int32x4_t vminq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s32)))
-int32x4_t vminq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s8)))
-int8x16_t vminq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_s8)))
-int8x16_t vminq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u16)))
-uint16x8_t vminq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u16)))
-uint16x8_t vminq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u32)))
-uint32x4_t vminq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u32)))
-uint32x4_t vminq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u8)))
-uint8x16_t vminq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminq_x_u8)))
-uint8x16_t vminq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s16)))
-int16_t vminvq_p_s16(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s16)))
-int16_t vminvq_p(int16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s32)))
-int32_t vminvq_p_s32(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s32)))
-int32_t vminvq_p(int32_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s8)))
-int8_t vminvq_p_s8(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_s8)))
-int8_t vminvq_p(int8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u16)))
-uint16_t vminvq_p_u16(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u16)))
-uint16_t vminvq_p(uint16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u32)))
-uint32_t vminvq_p_u32(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u32)))
-uint32_t vminvq_p(uint32_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u8)))
-uint8_t vminvq_p_u8(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_p_u8)))
-uint8_t vminvq_p(uint8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s16)))
-int16_t vminvq_s16(int16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s16)))
-int16_t vminvq(int16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s32)))
-int32_t vminvq_s32(int32_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s32)))
-int32_t vminvq(int32_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s8)))
-int8_t vminvq_s8(int8_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_s8)))
-int8_t vminvq(int8_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u16)))
-uint16_t vminvq_u16(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u16)))
-uint16_t vminvq(uint16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u32)))
-uint32_t vminvq_u32(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u32)))
-uint32_t vminvq(uint32_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u8)))
-uint8_t vminvq_u8(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminvq_u8)))
-uint8_t vminvq(uint8_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s16)))
-int32_t vmladavaq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s16)))
-int32_t vmladavaq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s32)))
-int32_t vmladavaq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s32)))
-int32_t vmladavaq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s8)))
-int32_t vmladavaq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_s8)))
-int32_t vmladavaq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u16)))
-uint32_t vmladavaq_p_u16(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u16)))
-uint32_t vmladavaq_p(uint32_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u32)))
-uint32_t vmladavaq_p_u32(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u32)))
-uint32_t vmladavaq_p(uint32_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u8)))
-uint32_t vmladavaq_p_u8(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_p_u8)))
-uint32_t vmladavaq_p(uint32_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s16)))
-int32_t vmladavaq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s16)))
-int32_t vmladavaq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s32)))
-int32_t vmladavaq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s32)))
-int32_t vmladavaq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s8)))
-int32_t vmladavaq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_s8)))
-int32_t vmladavaq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u16)))
-uint32_t vmladavaq_u16(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u16)))
-uint32_t vmladavaq(uint32_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u32)))
-uint32_t vmladavaq_u32(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u32)))
-uint32_t vmladavaq(uint32_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u8)))
-uint32_t vmladavaq_u8(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaq_u8)))
-uint32_t vmladavaq(uint32_t, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s16)))
-int32_t vmladavaxq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s16)))
-int32_t vmladavaxq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s32)))
-int32_t vmladavaxq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s32)))
-int32_t vmladavaxq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s8)))
-int32_t vmladavaxq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_p_s8)))
-int32_t vmladavaxq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s16)))
-int32_t vmladavaxq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s16)))
-int32_t vmladavaxq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s32)))
-int32_t vmladavaxq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s32)))
-int32_t vmladavaxq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s8)))
-int32_t vmladavaxq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavaxq_s8)))
-int32_t vmladavaxq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s16)))
-int32_t vmladavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s16)))
-int32_t vmladavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s32)))
-int32_t vmladavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s32)))
-int32_t vmladavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s8)))
-int32_t vmladavq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_s8)))
-int32_t vmladavq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u16)))
-uint32_t vmladavq_p_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u16)))
-uint32_t vmladavq_p(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u32)))
-uint32_t vmladavq_p_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u32)))
-uint32_t vmladavq_p(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u8)))
-uint32_t vmladavq_p_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_p_u8)))
-uint32_t vmladavq_p(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s16)))
-int32_t vmladavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s16)))
-int32_t vmladavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s32)))
-int32_t vmladavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s32)))
-int32_t vmladavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s8)))
-int32_t vmladavq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_s8)))
-int32_t vmladavq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u16)))
-uint32_t vmladavq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u16)))
-uint32_t vmladavq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u32)))
-uint32_t vmladavq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u32)))
-uint32_t vmladavq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u8)))
-uint32_t vmladavq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavq_u8)))
-uint32_t vmladavq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s16)))
-int32_t vmladavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s16)))
-int32_t vmladavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s32)))
-int32_t vmladavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s32)))
-int32_t vmladavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s8)))
-int32_t vmladavxq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_p_s8)))
-int32_t vmladavxq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s16)))
-int32_t vmladavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s16)))
-int32_t vmladavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s32)))
-int32_t vmladavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s32)))
-int32_t vmladavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s8)))
-int32_t vmladavxq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmladavxq_s8)))
-int32_t vmladavxq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s16)))
-int64_t vmlaldavaq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s16)))
-int64_t vmlaldavaq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s32)))
-int64_t vmlaldavaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_s32)))
-int64_t vmlaldavaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u16)))
-uint64_t vmlaldavaq_p_u16(uint64_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u16)))
-uint64_t vmlaldavaq_p(uint64_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u32)))
-uint64_t vmlaldavaq_p_u32(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_p_u32)))
-uint64_t vmlaldavaq_p(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s16)))
-int64_t vmlaldavaq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s16)))
-int64_t vmlaldavaq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s32)))
-int64_t vmlaldavaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_s32)))
-int64_t vmlaldavaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u16)))
-uint64_t vmlaldavaq_u16(uint64_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u16)))
-uint64_t vmlaldavaq(uint64_t, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u32)))
-uint64_t vmlaldavaq_u32(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaq_u32)))
-uint64_t vmlaldavaq(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s16)))
-int64_t vmlaldavaxq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s16)))
-int64_t vmlaldavaxq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s32)))
-int64_t vmlaldavaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_p_s32)))
-int64_t vmlaldavaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s16)))
-int64_t vmlaldavaxq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s16)))
-int64_t vmlaldavaxq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s32)))
-int64_t vmlaldavaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavaxq_s32)))
-int64_t vmlaldavaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s16)))
-int64_t vmlaldavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s16)))
-int64_t vmlaldavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s32)))
-int64_t vmlaldavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_s32)))
-int64_t vmlaldavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u16)))
-uint64_t vmlaldavq_p_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u16)))
-uint64_t vmlaldavq_p(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u32)))
-uint64_t vmlaldavq_p_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_p_u32)))
-uint64_t vmlaldavq_p(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s16)))
-int64_t vmlaldavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s16)))
-int64_t vmlaldavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s32)))
-int64_t vmlaldavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_s32)))
-int64_t vmlaldavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u16)))
-uint64_t vmlaldavq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u16)))
-uint64_t vmlaldavq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u32)))
-uint64_t vmlaldavq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavq_u32)))
-uint64_t vmlaldavq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s16)))
-int64_t vmlaldavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s16)))
-int64_t vmlaldavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s32)))
-int64_t vmlaldavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_p_s32)))
-int64_t vmlaldavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s16)))
-int64_t vmlaldavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s16)))
-int64_t vmlaldavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s32)))
-int64_t vmlaldavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaldavxq_s32)))
-int64_t vmlaldavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s16)))
-int16x8_t vmlaq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s16)))
-int16x8_t vmlaq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s32)))
-int32x4_t vmlaq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s32)))
-int32x4_t vmlaq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s8)))
-int8x16_t vmlaq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_s8)))
-int8x16_t vmlaq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u16)))
-uint16x8_t vmlaq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u16)))
-uint16x8_t vmlaq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u32)))
-uint32x4_t vmlaq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u32)))
-uint32x4_t vmlaq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u8)))
-uint8x16_t vmlaq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_m_n_u8)))
-uint8x16_t vmlaq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s16)))
-int16x8_t vmlaq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s16)))
-int16x8_t vmlaq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s32)))
-int32x4_t vmlaq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s32)))
-int32x4_t vmlaq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s8)))
-int8x16_t vmlaq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_s8)))
-int8x16_t vmlaq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u16)))
-uint16x8_t vmlaq_n_u16(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u16)))
-uint16x8_t vmlaq(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u32)))
-uint32x4_t vmlaq_n_u32(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u32)))
-uint32x4_t vmlaq(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u8)))
-uint8x16_t vmlaq_n_u8(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlaq_n_u8)))
-uint8x16_t vmlaq(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s16)))
-int16x8_t vmlasq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s16)))
-int16x8_t vmlasq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s32)))
-int32x4_t vmlasq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s32)))
-int32x4_t vmlasq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s8)))
-int8x16_t vmlasq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_s8)))
-int8x16_t vmlasq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u16)))
-uint16x8_t vmlasq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u16)))
-uint16x8_t vmlasq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u32)))
-uint32x4_t vmlasq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u32)))
-uint32x4_t vmlasq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u8)))
-uint8x16_t vmlasq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_m_n_u8)))
-uint8x16_t vmlasq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s16)))
-int16x8_t vmlasq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s16)))
-int16x8_t vmlasq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s32)))
-int32x4_t vmlasq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s32)))
-int32x4_t vmlasq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s8)))
-int8x16_t vmlasq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_s8)))
-int8x16_t vmlasq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u16)))
-uint16x8_t vmlasq_n_u16(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u16)))
-uint16x8_t vmlasq(uint16x8_t, uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u32)))
-uint32x4_t vmlasq_n_u32(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u32)))
-uint32x4_t vmlasq(uint32x4_t, uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u8)))
-uint8x16_t vmlasq_n_u8(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlasq_n_u8)))
-uint8x16_t vmlasq(uint8x16_t, uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s16)))
-int32_t vmlsdavaq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s16)))
-int32_t vmlsdavaq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s32)))
-int32_t vmlsdavaq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s32)))
-int32_t vmlsdavaq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s8)))
-int32_t vmlsdavaq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_p_s8)))
-int32_t vmlsdavaq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s16)))
-int32_t vmlsdavaq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s16)))
-int32_t vmlsdavaq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s32)))
-int32_t vmlsdavaq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s32)))
-int32_t vmlsdavaq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s8)))
-int32_t vmlsdavaq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaq_s8)))
-int32_t vmlsdavaq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s16)))
-int32_t vmlsdavaxq_p_s16(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s16)))
-int32_t vmlsdavaxq_p(int32_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s32)))
-int32_t vmlsdavaxq_p_s32(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s32)))
-int32_t vmlsdavaxq_p(int32_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s8)))
-int32_t vmlsdavaxq_p_s8(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_p_s8)))
-int32_t vmlsdavaxq_p(int32_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s16)))
-int32_t vmlsdavaxq_s16(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s16)))
-int32_t vmlsdavaxq(int32_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s32)))
-int32_t vmlsdavaxq_s32(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s32)))
-int32_t vmlsdavaxq(int32_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s8)))
-int32_t vmlsdavaxq_s8(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavaxq_s8)))
-int32_t vmlsdavaxq(int32_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s16)))
-int32_t vmlsdavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s16)))
-int32_t vmlsdavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s32)))
-int32_t vmlsdavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s32)))
-int32_t vmlsdavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s8)))
-int32_t vmlsdavq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_p_s8)))
-int32_t vmlsdavq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s16)))
-int32_t vmlsdavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s16)))
-int32_t vmlsdavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s32)))
-int32_t vmlsdavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s32)))
-int32_t vmlsdavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s8)))
-int32_t vmlsdavq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavq_s8)))
-int32_t vmlsdavq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s16)))
-int32_t vmlsdavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s16)))
-int32_t vmlsdavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s32)))
-int32_t vmlsdavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s32)))
-int32_t vmlsdavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s8)))
-int32_t vmlsdavxq_p_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_p_s8)))
-int32_t vmlsdavxq_p(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s16)))
-int32_t vmlsdavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s16)))
-int32_t vmlsdavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s32)))
-int32_t vmlsdavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s32)))
-int32_t vmlsdavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s8)))
-int32_t vmlsdavxq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsdavxq_s8)))
-int32_t vmlsdavxq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s16)))
-int64_t vmlsldavaq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s16)))
-int64_t vmlsldavaq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s32)))
-int64_t vmlsldavaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_p_s32)))
-int64_t vmlsldavaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s16)))
-int64_t vmlsldavaq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s16)))
-int64_t vmlsldavaq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s32)))
-int64_t vmlsldavaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaq_s32)))
-int64_t vmlsldavaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s16)))
-int64_t vmlsldavaxq_p_s16(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s16)))
-int64_t vmlsldavaxq_p(int64_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s32)))
-int64_t vmlsldavaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_p_s32)))
-int64_t vmlsldavaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s16)))
-int64_t vmlsldavaxq_s16(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s16)))
-int64_t vmlsldavaxq(int64_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s32)))
-int64_t vmlsldavaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavaxq_s32)))
-int64_t vmlsldavaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s16)))
-int64_t vmlsldavq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s16)))
-int64_t vmlsldavq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s32)))
-int64_t vmlsldavq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_p_s32)))
-int64_t vmlsldavq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s16)))
-int64_t vmlsldavq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s16)))
-int64_t vmlsldavq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s32)))
-int64_t vmlsldavq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavq_s32)))
-int64_t vmlsldavq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s16)))
-int64_t vmlsldavxq_p_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s16)))
-int64_t vmlsldavxq_p(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s32)))
-int64_t vmlsldavxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_p_s32)))
-int64_t vmlsldavxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s16)))
-int64_t vmlsldavxq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s16)))
-int64_t vmlsldavxq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s32)))
-int64_t vmlsldavxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmlsldavxq_s32)))
-int64_t vmlsldavxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s16)))
-int32x4_t vmovlbq_m_s16(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s16)))
-int32x4_t vmovlbq_m(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s8)))
-int16x8_t vmovlbq_m_s8(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_s8)))
-int16x8_t vmovlbq_m(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u16)))
-uint32x4_t vmovlbq_m_u16(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u16)))
-uint32x4_t vmovlbq_m(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u8)))
-uint16x8_t vmovlbq_m_u8(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_m_u8)))
-uint16x8_t vmovlbq_m(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s16)))
-int32x4_t vmovlbq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s16)))
-int32x4_t vmovlbq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s8)))
-int16x8_t vmovlbq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_s8)))
-int16x8_t vmovlbq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u16)))
-uint32x4_t vmovlbq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u16)))
-uint32x4_t vmovlbq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u8)))
-uint16x8_t vmovlbq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_u8)))
-uint16x8_t vmovlbq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s16)))
-int32x4_t vmovlbq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s16)))
-int32x4_t vmovlbq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s8)))
-int16x8_t vmovlbq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_s8)))
-int16x8_t vmovlbq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u16)))
-uint32x4_t vmovlbq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u16)))
-uint32x4_t vmovlbq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u8)))
-uint16x8_t vmovlbq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovlbq_x_u8)))
-uint16x8_t vmovlbq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s16)))
-int32x4_t vmovltq_m_s16(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s16)))
-int32x4_t vmovltq_m(int32x4_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s8)))
-int16x8_t vmovltq_m_s8(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_s8)))
-int16x8_t vmovltq_m(int16x8_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u16)))
-uint32x4_t vmovltq_m_u16(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u16)))
-uint32x4_t vmovltq_m(uint32x4_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u8)))
-uint16x8_t vmovltq_m_u8(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_m_u8)))
-uint16x8_t vmovltq_m(uint16x8_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s16)))
-int32x4_t vmovltq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s16)))
-int32x4_t vmovltq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s8)))
-int16x8_t vmovltq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_s8)))
-int16x8_t vmovltq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u16)))
-uint32x4_t vmovltq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u16)))
-uint32x4_t vmovltq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u8)))
-uint16x8_t vmovltq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_u8)))
-uint16x8_t vmovltq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s16)))
-int32x4_t vmovltq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s16)))
-int32x4_t vmovltq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s8)))
-int16x8_t vmovltq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_s8)))
-int16x8_t vmovltq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u16)))
-uint32x4_t vmovltq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u16)))
-uint32x4_t vmovltq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u8)))
-uint16x8_t vmovltq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovltq_x_u8)))
-uint16x8_t vmovltq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s16)))
-int8x16_t vmovnbq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s16)))
-int8x16_t vmovnbq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s32)))
-int16x8_t vmovnbq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_s32)))
-int16x8_t vmovnbq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u16)))
-uint8x16_t vmovnbq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u16)))
-uint8x16_t vmovnbq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u32)))
-uint16x8_t vmovnbq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_m_u32)))
-uint16x8_t vmovnbq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s16)))
-int8x16_t vmovnbq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s16)))
-int8x16_t vmovnbq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s32)))
-int16x8_t vmovnbq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_s32)))
-int16x8_t vmovnbq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u16)))
-uint8x16_t vmovnbq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u16)))
-uint8x16_t vmovnbq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u32)))
-uint16x8_t vmovnbq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovnbq_u32)))
-uint16x8_t vmovnbq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s16)))
-int8x16_t vmovntq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s16)))
-int8x16_t vmovntq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s32)))
-int16x8_t vmovntq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_s32)))
-int16x8_t vmovntq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u16)))
-uint8x16_t vmovntq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u16)))
-uint8x16_t vmovntq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u32)))
-uint16x8_t vmovntq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_m_u32)))
-uint16x8_t vmovntq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s16)))
-int8x16_t vmovntq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s16)))
-int8x16_t vmovntq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s32)))
-int16x8_t vmovntq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_s32)))
-int16x8_t vmovntq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u16)))
-uint8x16_t vmovntq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u16)))
-uint8x16_t vmovntq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u32)))
-uint16x8_t vmovntq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmovntq_u32)))
-uint16x8_t vmovntq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s16)))
-int16x8_t vmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s16)))
-int16x8_t vmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s32)))
-int32x4_t vmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s32)))
-int32x4_t vmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s8)))
-int8x16_t vmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_s8)))
-int8x16_t vmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u16)))
-uint16x8_t vmulhq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u16)))
-uint16x8_t vmulhq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u32)))
-uint32x4_t vmulhq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u32)))
-uint32x4_t vmulhq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u8)))
-uint8x16_t vmulhq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_m_u8)))
-uint8x16_t vmulhq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s16)))
-int16x8_t vmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s16)))
-int16x8_t vmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s32)))
-int32x4_t vmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s32)))
-int32x4_t vmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s8)))
-int8x16_t vmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_s8)))
-int8x16_t vmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u16)))
-uint16x8_t vmulhq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u16)))
-uint16x8_t vmulhq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u32)))
-uint32x4_t vmulhq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u32)))
-uint32x4_t vmulhq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u8)))
-uint8x16_t vmulhq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_u8)))
-uint8x16_t vmulhq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s16)))
-int16x8_t vmulhq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s16)))
-int16x8_t vmulhq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s32)))
-int32x4_t vmulhq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s32)))
-int32x4_t vmulhq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s8)))
-int8x16_t vmulhq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_s8)))
-int8x16_t vmulhq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u16)))
-uint16x8_t vmulhq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u16)))
-uint16x8_t vmulhq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u32)))
-uint32x4_t vmulhq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u32)))
-uint32x4_t vmulhq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u8)))
-uint8x16_t vmulhq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulhq_x_u8)))
-uint8x16_t vmulhq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s16)))
-int32x4_t vmullbq_int_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s16)))
-int32x4_t vmullbq_int_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s32)))
-int64x2_t vmullbq_int_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s32)))
-int64x2_t vmullbq_int_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s8)))
-int16x8_t vmullbq_int_m_s8(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_s8)))
-int16x8_t vmullbq_int_m(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u16)))
-uint32x4_t vmullbq_int_m_u16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u16)))
-uint32x4_t vmullbq_int_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u32)))
-uint64x2_t vmullbq_int_m_u32(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u32)))
-uint64x2_t vmullbq_int_m(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u8)))
-uint16x8_t vmullbq_int_m_u8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_m_u8)))
-uint16x8_t vmullbq_int_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s16)))
-int32x4_t vmullbq_int_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s16)))
-int32x4_t vmullbq_int(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s32)))
-int64x2_t vmullbq_int_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s32)))
-int64x2_t vmullbq_int(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s8)))
-int16x8_t vmullbq_int_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_s8)))
-int16x8_t vmullbq_int(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u16)))
-uint32x4_t vmullbq_int_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u16)))
-uint32x4_t vmullbq_int(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u32)))
-uint64x2_t vmullbq_int_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u32)))
-uint64x2_t vmullbq_int(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u8)))
-uint16x8_t vmullbq_int_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_u8)))
-uint16x8_t vmullbq_int(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s16)))
-int32x4_t vmullbq_int_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s16)))
-int32x4_t vmullbq_int_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s32)))
-int64x2_t vmullbq_int_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s32)))
-int64x2_t vmullbq_int_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s8)))
-int16x8_t vmullbq_int_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_s8)))
-int16x8_t vmullbq_int_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u16)))
-uint32x4_t vmullbq_int_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u16)))
-uint32x4_t vmullbq_int_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u32)))
-uint64x2_t vmullbq_int_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u32)))
-uint64x2_t vmullbq_int_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u8)))
-uint16x8_t vmullbq_int_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_int_x_u8)))
-uint16x8_t vmullbq_int_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p16)))
-uint32x4_t vmullbq_poly_m_p16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p16)))
-uint32x4_t vmullbq_poly_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p8)))
-uint16x8_t vmullbq_poly_m_p8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_m_p8)))
-uint16x8_t vmullbq_poly_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p16)))
-uint32x4_t vmullbq_poly_p16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p16)))
-uint32x4_t vmullbq_poly(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p8)))
-uint16x8_t vmullbq_poly_p8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_p8)))
-uint16x8_t vmullbq_poly(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p16)))
-uint32x4_t vmullbq_poly_x_p16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p16)))
-uint32x4_t vmullbq_poly_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p8)))
-uint16x8_t vmullbq_poly_x_p8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmullbq_poly_x_p8)))
-uint16x8_t vmullbq_poly_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s16)))
-int32x4_t vmulltq_int_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s16)))
-int32x4_t vmulltq_int_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s32)))
-int64x2_t vmulltq_int_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s32)))
-int64x2_t vmulltq_int_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s8)))
-int16x8_t vmulltq_int_m_s8(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_s8)))
-int16x8_t vmulltq_int_m(int16x8_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u16)))
-uint32x4_t vmulltq_int_m_u16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u16)))
-uint32x4_t vmulltq_int_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u32)))
-uint64x2_t vmulltq_int_m_u32(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u32)))
-uint64x2_t vmulltq_int_m(uint64x2_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u8)))
-uint16x8_t vmulltq_int_m_u8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_m_u8)))
-uint16x8_t vmulltq_int_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s16)))
-int32x4_t vmulltq_int_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s16)))
-int32x4_t vmulltq_int(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s32)))
-int64x2_t vmulltq_int_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s32)))
-int64x2_t vmulltq_int(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s8)))
-int16x8_t vmulltq_int_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_s8)))
-int16x8_t vmulltq_int(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u16)))
-uint32x4_t vmulltq_int_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u16)))
-uint32x4_t vmulltq_int(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u32)))
-uint64x2_t vmulltq_int_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u32)))
-uint64x2_t vmulltq_int(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u8)))
-uint16x8_t vmulltq_int_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_u8)))
-uint16x8_t vmulltq_int(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s16)))
-int32x4_t vmulltq_int_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s16)))
-int32x4_t vmulltq_int_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s32)))
-int64x2_t vmulltq_int_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s32)))
-int64x2_t vmulltq_int_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s8)))
-int16x8_t vmulltq_int_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_s8)))
-int16x8_t vmulltq_int_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u16)))
-uint32x4_t vmulltq_int_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u16)))
-uint32x4_t vmulltq_int_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u32)))
-uint64x2_t vmulltq_int_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u32)))
-uint64x2_t vmulltq_int_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u8)))
-uint16x8_t vmulltq_int_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_int_x_u8)))
-uint16x8_t vmulltq_int_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p16)))
-uint32x4_t vmulltq_poly_m_p16(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p16)))
-uint32x4_t vmulltq_poly_m(uint32x4_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p8)))
-uint16x8_t vmulltq_poly_m_p8(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_m_p8)))
-uint16x8_t vmulltq_poly_m(uint16x8_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p16)))
-uint32x4_t vmulltq_poly_p16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p16)))
-uint32x4_t vmulltq_poly(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p8)))
-uint16x8_t vmulltq_poly_p8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_p8)))
-uint16x8_t vmulltq_poly(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p16)))
-uint32x4_t vmulltq_poly_x_p16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p16)))
-uint32x4_t vmulltq_poly_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p8)))
-uint16x8_t vmulltq_poly_x_p8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulltq_poly_x_p8)))
-uint16x8_t vmulltq_poly_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s16)))
-int16x8_t vmulq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s16)))
-int16x8_t vmulq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s32)))
-int32x4_t vmulq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s32)))
-int32x4_t vmulq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s8)))
-int8x16_t vmulq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_s8)))
-int8x16_t vmulq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u16)))
-uint16x8_t vmulq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u16)))
-uint16x8_t vmulq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u32)))
-uint32x4_t vmulq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u32)))
-uint32x4_t vmulq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u8)))
-uint8x16_t vmulq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_u8)))
-uint8x16_t vmulq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s16)))
-int16x8_t vmulq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s16)))
-int16x8_t vmulq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s32)))
-int32x4_t vmulq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s32)))
-int32x4_t vmulq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s8)))
-int8x16_t vmulq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_s8)))
-int8x16_t vmulq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u16)))
-uint16x8_t vmulq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u16)))
-uint16x8_t vmulq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u32)))
-uint32x4_t vmulq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u32)))
-uint32x4_t vmulq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u8)))
-uint8x16_t vmulq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_u8)))
-uint8x16_t vmulq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s16)))
-int16x8_t vmulq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s16)))
-int16x8_t vmulq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s32)))
-int32x4_t vmulq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s32)))
-int32x4_t vmulq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s8)))
-int8x16_t vmulq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_s8)))
-int8x16_t vmulq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u16)))
-uint16x8_t vmulq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u16)))
-uint16x8_t vmulq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u32)))
-uint32x4_t vmulq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u32)))
-uint32x4_t vmulq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u8)))
-uint8x16_t vmulq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_u8)))
-uint8x16_t vmulq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s16)))
-int16x8_t vmulq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s16)))
-int16x8_t vmulq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s32)))
-int32x4_t vmulq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s32)))
-int32x4_t vmulq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s8)))
-int8x16_t vmulq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_s8)))
-int8x16_t vmulq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u16)))
-uint16x8_t vmulq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u16)))
-uint16x8_t vmulq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u32)))
-uint32x4_t vmulq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u32)))
-uint32x4_t vmulq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u8)))
-uint8x16_t vmulq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_u8)))
-uint8x16_t vmulq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s16)))
-int16x8_t vmulq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s16)))
-int16x8_t vmulq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s32)))
-int32x4_t vmulq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s32)))
-int32x4_t vmulq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s8)))
-int8x16_t vmulq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_s8)))
-int8x16_t vmulq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u16)))
-uint16x8_t vmulq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u16)))
-uint16x8_t vmulq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u32)))
-uint32x4_t vmulq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u32)))
-uint32x4_t vmulq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u8)))
-uint8x16_t vmulq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_u8)))
-uint8x16_t vmulq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s16)))
-int16x8_t vmulq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s16)))
-int16x8_t vmulq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s32)))
-int32x4_t vmulq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s32)))
-int32x4_t vmulq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s8)))
-int8x16_t vmulq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_s8)))
-int8x16_t vmulq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u16)))
-uint16x8_t vmulq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u16)))
-uint16x8_t vmulq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u32)))
-uint32x4_t vmulq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u32)))
-uint32x4_t vmulq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u8)))
-uint8x16_t vmulq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_u8)))
-uint8x16_t vmulq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s16)))
-int16x8_t vmvnq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s16)))
-int16x8_t vmvnq_m(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s32)))
-int32x4_t vmvnq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_s32)))
-int32x4_t vmvnq_m(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u16)))
-uint16x8_t vmvnq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u16)))
-uint16x8_t vmvnq_m(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u32)))
-uint32x4_t vmvnq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_n_u32)))
-uint32x4_t vmvnq_m(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s16)))
-int16x8_t vmvnq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s16)))
-int16x8_t vmvnq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s32)))
-int32x4_t vmvnq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s32)))
-int32x4_t vmvnq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s8)))
-int8x16_t vmvnq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_s8)))
-int8x16_t vmvnq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u16)))
-uint16x8_t vmvnq_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u16)))
-uint16x8_t vmvnq_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u32)))
-uint32x4_t vmvnq_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u32)))
-uint32x4_t vmvnq_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u8)))
-uint8x16_t vmvnq_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_m_u8)))
-uint8x16_t vmvnq_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_s16)))
-int16x8_t vmvnq_n_s16(int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_s32)))
-int32x4_t vmvnq_n_s32(int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_u16)))
-uint16x8_t vmvnq_n_u16(uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_n_u32)))
-uint32x4_t vmvnq_n_u32(uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s16)))
-int16x8_t vmvnq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s16)))
-int16x8_t vmvnq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s32)))
-int32x4_t vmvnq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s32)))
-int32x4_t vmvnq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s8)))
-int8x16_t vmvnq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_s8)))
-int8x16_t vmvnq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u16)))
-uint16x8_t vmvnq_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u16)))
-uint16x8_t vmvnq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u32)))
-uint32x4_t vmvnq_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u32)))
-uint32x4_t vmvnq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u8)))
-uint8x16_t vmvnq_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_u8)))
-uint8x16_t vmvnq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_s16)))
-int16x8_t vmvnq_x_n_s16(int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_s32)))
-int32x4_t vmvnq_x_n_s32(int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_u16)))
-uint16x8_t vmvnq_x_n_u16(uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_n_u32)))
-uint32x4_t vmvnq_x_n_u32(uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s16)))
-int16x8_t vmvnq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s16)))
-int16x8_t vmvnq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s32)))
-int32x4_t vmvnq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s32)))
-int32x4_t vmvnq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s8)))
-int8x16_t vmvnq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_s8)))
-int8x16_t vmvnq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u16)))
-uint16x8_t vmvnq_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u16)))
-uint16x8_t vmvnq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u32)))
-uint32x4_t vmvnq_x_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u32)))
-uint32x4_t vmvnq_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u8)))
-uint8x16_t vmvnq_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmvnq_x_u8)))
-uint8x16_t vmvnq_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s16)))
-int16x8_t vnegq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s16)))
-int16x8_t vnegq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s32)))
-int32x4_t vnegq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s32)))
-int32x4_t vnegq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s8)))
-int8x16_t vnegq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_s8)))
-int8x16_t vnegq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s16)))
-int16x8_t vnegq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s16)))
-int16x8_t vnegq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s32)))
-int32x4_t vnegq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s32)))
-int32x4_t vnegq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s8)))
-int8x16_t vnegq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_s8)))
-int8x16_t vnegq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s16)))
-int16x8_t vnegq_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s16)))
-int16x8_t vnegq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s32)))
-int32x4_t vnegq_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s32)))
-int32x4_t vnegq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s8)))
-int8x16_t vnegq_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_s8)))
-int8x16_t vnegq_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s16)))
-int16x8_t vornq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s16)))
-int16x8_t vornq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s32)))
-int32x4_t vornq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s32)))
-int32x4_t vornq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s8)))
-int8x16_t vornq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_s8)))
-int8x16_t vornq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u16)))
-uint16x8_t vornq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u16)))
-uint16x8_t vornq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u32)))
-uint32x4_t vornq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u32)))
-uint32x4_t vornq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u8)))
-uint8x16_t vornq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_u8)))
-uint8x16_t vornq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_s16)))
-int16x8_t vornq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_s16)))
-int16x8_t vornq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_s32)))
-int32x4_t vornq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_s32)))
-int32x4_t vornq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_s8)))
-int8x16_t vornq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_s8)))
-int8x16_t vornq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_u16)))
-uint16x8_t vornq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_u16)))
-uint16x8_t vornq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_u32)))
-uint32x4_t vornq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_u32)))
-uint32x4_t vornq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_u8)))
-uint8x16_t vornq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_u8)))
-uint8x16_t vornq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s16)))
-int16x8_t vornq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s16)))
-int16x8_t vornq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s32)))
-int32x4_t vornq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s32)))
-int32x4_t vornq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s8)))
-int8x16_t vornq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_s8)))
-int8x16_t vornq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u16)))
-uint16x8_t vornq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u16)))
-uint16x8_t vornq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u32)))
-uint32x4_t vornq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u32)))
-uint32x4_t vornq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u8)))
-uint8x16_t vornq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_u8)))
-uint8x16_t vornq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s16)))
-int16x8_t vorrq_m_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s16)))
-int16x8_t vorrq_m_n(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s32)))
-int32x4_t vorrq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_s32)))
-int32x4_t vorrq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u16)))
-uint16x8_t vorrq_m_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u16)))
-uint16x8_t vorrq_m_n(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u32)))
-uint32x4_t vorrq_m_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_n_u32)))
-uint32x4_t vorrq_m_n(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s16)))
-int16x8_t vorrq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s16)))
-int16x8_t vorrq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s32)))
-int32x4_t vorrq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s32)))
-int32x4_t vorrq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s8)))
-int8x16_t vorrq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_s8)))
-int8x16_t vorrq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u16)))
-uint16x8_t vorrq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u16)))
-uint16x8_t vorrq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u32)))
-uint32x4_t vorrq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u32)))
-uint32x4_t vorrq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u8)))
-uint8x16_t vorrq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_u8)))
-uint8x16_t vorrq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s16)))
-int16x8_t vorrq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s16)))
-int16x8_t vorrq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s32)))
-int32x4_t vorrq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_s32)))
-int32x4_t vorrq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u16)))
-uint16x8_t vorrq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u16)))
-uint16x8_t vorrq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u32)))
-uint32x4_t vorrq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_n_u32)))
-uint32x4_t vorrq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s16)))
-int16x8_t vorrq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s16)))
-int16x8_t vorrq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s32)))
-int32x4_t vorrq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s32)))
-int32x4_t vorrq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s8)))
-int8x16_t vorrq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_s8)))
-int8x16_t vorrq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u16)))
-uint16x8_t vorrq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u16)))
-uint16x8_t vorrq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u32)))
-uint32x4_t vorrq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u32)))
-uint32x4_t vorrq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u8)))
-uint8x16_t vorrq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_u8)))
-uint8x16_t vorrq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s16)))
-int16x8_t vorrq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s16)))
-int16x8_t vorrq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s32)))
-int32x4_t vorrq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s32)))
-int32x4_t vorrq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s8)))
-int8x16_t vorrq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_s8)))
-int8x16_t vorrq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u16)))
-uint16x8_t vorrq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u16)))
-uint16x8_t vorrq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u32)))
-uint32x4_t vorrq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u32)))
-uint32x4_t vorrq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u8)))
-uint8x16_t vorrq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_u8)))
-uint8x16_t vorrq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpnot)))
-mve_pred16_t vpnot(mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s16)))
-int16x8_t vpselq_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s16)))
-int16x8_t vpselq(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s32)))
-int32x4_t vpselq_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s32)))
-int32x4_t vpselq(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s64)))
-int64x2_t vpselq_s64(int64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s64)))
-int64x2_t vpselq(int64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s8)))
-int8x16_t vpselq_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_s8)))
-int8x16_t vpselq(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u16)))
-uint16x8_t vpselq_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u16)))
-uint16x8_t vpselq(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u32)))
-uint32x4_t vpselq_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u32)))
-uint32x4_t vpselq(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u64)))
-uint64x2_t vpselq_u64(uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u64)))
-uint64x2_t vpselq(uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u8)))
-uint8x16_t vpselq_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_u8)))
-uint8x16_t vpselq(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s16)))
-int16x8_t vqabsq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s16)))
-int16x8_t vqabsq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s32)))
-int32x4_t vqabsq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s32)))
-int32x4_t vqabsq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s8)))
-int8x16_t vqabsq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_m_s8)))
-int8x16_t vqabsq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s16)))
-int16x8_t vqabsq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s16)))
-int16x8_t vqabsq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s32)))
-int32x4_t vqabsq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s32)))
-int32x4_t vqabsq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s8)))
-int8x16_t vqabsq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqabsq_s8)))
-int8x16_t vqabsq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s16)))
-int16x8_t vqaddq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s16)))
-int16x8_t vqaddq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s32)))
-int32x4_t vqaddq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s32)))
-int32x4_t vqaddq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s8)))
-int8x16_t vqaddq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_s8)))
-int8x16_t vqaddq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u16)))
-uint16x8_t vqaddq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u16)))
-uint16x8_t vqaddq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u32)))
-uint32x4_t vqaddq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u32)))
-uint32x4_t vqaddq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u8)))
-uint8x16_t vqaddq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_n_u8)))
-uint8x16_t vqaddq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s16)))
-int16x8_t vqaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s16)))
-int16x8_t vqaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s32)))
-int32x4_t vqaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s32)))
-int32x4_t vqaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s8)))
-int8x16_t vqaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_s8)))
-int8x16_t vqaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u16)))
-uint16x8_t vqaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u16)))
-uint16x8_t vqaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u32)))
-uint32x4_t vqaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u32)))
-uint32x4_t vqaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u8)))
-uint8x16_t vqaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_m_u8)))
-uint8x16_t vqaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s16)))
-int16x8_t vqaddq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s16)))
-int16x8_t vqaddq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s32)))
-int32x4_t vqaddq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s32)))
-int32x4_t vqaddq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s8)))
-int8x16_t vqaddq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_s8)))
-int8x16_t vqaddq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u16)))
-uint16x8_t vqaddq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u16)))
-uint16x8_t vqaddq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u32)))
-uint32x4_t vqaddq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u32)))
-uint32x4_t vqaddq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u8)))
-uint8x16_t vqaddq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_n_u8)))
-uint8x16_t vqaddq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s16)))
-int16x8_t vqaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s16)))
-int16x8_t vqaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s32)))
-int32x4_t vqaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s32)))
-int32x4_t vqaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s8)))
-int8x16_t vqaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_s8)))
-int8x16_t vqaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u16)))
-uint16x8_t vqaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u16)))
-uint16x8_t vqaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u32)))
-uint32x4_t vqaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u32)))
-uint32x4_t vqaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u8)))
-uint8x16_t vqaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqaddq_u8)))
-uint8x16_t vqaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s16)))
-int16x8_t vqdmladhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s16)))
-int16x8_t vqdmladhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s32)))
-int32x4_t vqdmladhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s32)))
-int32x4_t vqdmladhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s8)))
-int8x16_t vqdmladhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_m_s8)))
-int8x16_t vqdmladhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s16)))
-int16x8_t vqdmladhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s16)))
-int16x8_t vqdmladhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s32)))
-int32x4_t vqdmladhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s32)))
-int32x4_t vqdmladhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s8)))
-int8x16_t vqdmladhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhq_s8)))
-int8x16_t vqdmladhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s16)))
-int16x8_t vqdmladhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s16)))
-int16x8_t vqdmladhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s32)))
-int32x4_t vqdmladhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s32)))
-int32x4_t vqdmladhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s8)))
-int8x16_t vqdmladhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_m_s8)))
-int8x16_t vqdmladhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s16)))
-int16x8_t vqdmladhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s16)))
-int16x8_t vqdmladhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s32)))
-int32x4_t vqdmladhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s32)))
-int32x4_t vqdmladhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s8)))
-int8x16_t vqdmladhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmladhxq_s8)))
-int8x16_t vqdmladhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s16)))
-int16x8_t vqdmlahq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s16)))
-int16x8_t vqdmlahq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s32)))
-int32x4_t vqdmlahq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s32)))
-int32x4_t vqdmlahq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s8)))
-int8x16_t vqdmlahq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_m_n_s8)))
-int8x16_t vqdmlahq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s16)))
-int16x8_t vqdmlahq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s16)))
-int16x8_t vqdmlahq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s32)))
-int32x4_t vqdmlahq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s32)))
-int32x4_t vqdmlahq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s8)))
-int8x16_t vqdmlahq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlahq_n_s8)))
-int8x16_t vqdmlahq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s16)))
-int16x8_t vqdmlashq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s16)))
-int16x8_t vqdmlashq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s32)))
-int32x4_t vqdmlashq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s32)))
-int32x4_t vqdmlashq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s8)))
-int8x16_t vqdmlashq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_m_n_s8)))
-int8x16_t vqdmlashq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s16)))
-int16x8_t vqdmlashq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s16)))
-int16x8_t vqdmlashq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s32)))
-int32x4_t vqdmlashq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s32)))
-int32x4_t vqdmlashq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s8)))
-int8x16_t vqdmlashq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlashq_n_s8)))
-int8x16_t vqdmlashq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s16)))
-int16x8_t vqdmlsdhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s16)))
-int16x8_t vqdmlsdhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s32)))
-int32x4_t vqdmlsdhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s32)))
-int32x4_t vqdmlsdhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s8)))
-int8x16_t vqdmlsdhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_m_s8)))
-int8x16_t vqdmlsdhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s16)))
-int16x8_t vqdmlsdhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s16)))
-int16x8_t vqdmlsdhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s32)))
-int32x4_t vqdmlsdhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s32)))
-int32x4_t vqdmlsdhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s8)))
-int8x16_t vqdmlsdhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhq_s8)))
-int8x16_t vqdmlsdhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s16)))
-int16x8_t vqdmlsdhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s16)))
-int16x8_t vqdmlsdhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s32)))
-int32x4_t vqdmlsdhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s32)))
-int32x4_t vqdmlsdhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s8)))
-int8x16_t vqdmlsdhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_m_s8)))
-int8x16_t vqdmlsdhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s16)))
-int16x8_t vqdmlsdhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s16)))
-int16x8_t vqdmlsdhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s32)))
-int32x4_t vqdmlsdhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s32)))
-int32x4_t vqdmlsdhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s8)))
-int8x16_t vqdmlsdhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmlsdhxq_s8)))
-int8x16_t vqdmlsdhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s16)))
-int16x8_t vqdmulhq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s16)))
-int16x8_t vqdmulhq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s32)))
-int32x4_t vqdmulhq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s32)))
-int32x4_t vqdmulhq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s8)))
-int8x16_t vqdmulhq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_n_s8)))
-int8x16_t vqdmulhq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s16)))
-int16x8_t vqdmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s16)))
-int16x8_t vqdmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s32)))
-int32x4_t vqdmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s32)))
-int32x4_t vqdmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s8)))
-int8x16_t vqdmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_m_s8)))
-int8x16_t vqdmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s16)))
-int16x8_t vqdmulhq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s16)))
-int16x8_t vqdmulhq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s32)))
-int32x4_t vqdmulhq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s32)))
-int32x4_t vqdmulhq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s8)))
-int8x16_t vqdmulhq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_n_s8)))
-int8x16_t vqdmulhq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s16)))
-int16x8_t vqdmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s16)))
-int16x8_t vqdmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s32)))
-int32x4_t vqdmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s32)))
-int32x4_t vqdmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s8)))
-int8x16_t vqdmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulhq_s8)))
-int8x16_t vqdmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s16)))
-int32x4_t vqdmullbq_m_n_s16(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s16)))
-int32x4_t vqdmullbq_m(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s32)))
-int64x2_t vqdmullbq_m_n_s32(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_n_s32)))
-int64x2_t vqdmullbq_m(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s16)))
-int32x4_t vqdmullbq_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s16)))
-int32x4_t vqdmullbq_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s32)))
-int64x2_t vqdmullbq_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_m_s32)))
-int64x2_t vqdmullbq_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s16)))
-int32x4_t vqdmullbq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s16)))
-int32x4_t vqdmullbq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s32)))
-int64x2_t vqdmullbq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_n_s32)))
-int64x2_t vqdmullbq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s16)))
-int32x4_t vqdmullbq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s16)))
-int32x4_t vqdmullbq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s32)))
-int64x2_t vqdmullbq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmullbq_s32)))
-int64x2_t vqdmullbq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s16)))
-int32x4_t vqdmulltq_m_n_s16(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s16)))
-int32x4_t vqdmulltq_m(int32x4_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s32)))
-int64x2_t vqdmulltq_m_n_s32(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_n_s32)))
-int64x2_t vqdmulltq_m(int64x2_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s16)))
-int32x4_t vqdmulltq_m_s16(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s16)))
-int32x4_t vqdmulltq_m(int32x4_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s32)))
-int64x2_t vqdmulltq_m_s32(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_m_s32)))
-int64x2_t vqdmulltq_m(int64x2_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s16)))
-int32x4_t vqdmulltq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s16)))
-int32x4_t vqdmulltq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s32)))
-int64x2_t vqdmulltq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_n_s32)))
-int64x2_t vqdmulltq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s16)))
-int32x4_t vqdmulltq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s16)))
-int32x4_t vqdmulltq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s32)))
-int64x2_t vqdmulltq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqdmulltq_s32)))
-int64x2_t vqdmulltq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s16)))
-int8x16_t vqmovnbq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s16)))
-int8x16_t vqmovnbq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s32)))
-int16x8_t vqmovnbq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_s32)))
-int16x8_t vqmovnbq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u16)))
-uint8x16_t vqmovnbq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u16)))
-uint8x16_t vqmovnbq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u32)))
-uint16x8_t vqmovnbq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_m_u32)))
-uint16x8_t vqmovnbq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s16)))
-int8x16_t vqmovnbq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s16)))
-int8x16_t vqmovnbq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s32)))
-int16x8_t vqmovnbq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_s32)))
-int16x8_t vqmovnbq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u16)))
-uint8x16_t vqmovnbq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u16)))
-uint8x16_t vqmovnbq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u32)))
-uint16x8_t vqmovnbq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovnbq_u32)))
-uint16x8_t vqmovnbq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s16)))
-int8x16_t vqmovntq_m_s16(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s16)))
-int8x16_t vqmovntq_m(int8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s32)))
-int16x8_t vqmovntq_m_s32(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_s32)))
-int16x8_t vqmovntq_m(int16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u16)))
-uint8x16_t vqmovntq_m_u16(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u16)))
-uint8x16_t vqmovntq_m(uint8x16_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u32)))
-uint16x8_t vqmovntq_m_u32(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_m_u32)))
-uint16x8_t vqmovntq_m(uint16x8_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s16)))
-int8x16_t vqmovntq_s16(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s16)))
-int8x16_t vqmovntq(int8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s32)))
-int16x8_t vqmovntq_s32(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_s32)))
-int16x8_t vqmovntq(int16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u16)))
-uint8x16_t vqmovntq_u16(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u16)))
-uint8x16_t vqmovntq(uint8x16_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u32)))
-uint16x8_t vqmovntq_u32(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovntq_u32)))
-uint16x8_t vqmovntq(uint16x8_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s16)))
-uint8x16_t vqmovunbq_m_s16(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s16)))
-uint8x16_t vqmovunbq_m(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s32)))
-uint16x8_t vqmovunbq_m_s32(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_m_s32)))
-uint16x8_t vqmovunbq_m(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s16)))
-uint8x16_t vqmovunbq_s16(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s16)))
-uint8x16_t vqmovunbq(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s32)))
-uint16x8_t vqmovunbq_s32(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovunbq_s32)))
-uint16x8_t vqmovunbq(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s16)))
-uint8x16_t vqmovuntq_m_s16(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s16)))
-uint8x16_t vqmovuntq_m(uint8x16_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s32)))
-uint16x8_t vqmovuntq_m_s32(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_m_s32)))
-uint16x8_t vqmovuntq_m(uint16x8_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s16)))
-uint8x16_t vqmovuntq_s16(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s16)))
-uint8x16_t vqmovuntq(uint8x16_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s32)))
-uint16x8_t vqmovuntq_s32(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqmovuntq_s32)))
-uint16x8_t vqmovuntq(uint16x8_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s16)))
-int16x8_t vqnegq_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s16)))
-int16x8_t vqnegq_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s32)))
-int32x4_t vqnegq_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s32)))
-int32x4_t vqnegq_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s8)))
-int8x16_t vqnegq_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_m_s8)))
-int8x16_t vqnegq_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s16)))
-int16x8_t vqnegq_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s16)))
-int16x8_t vqnegq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s32)))
-int32x4_t vqnegq_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s32)))
-int32x4_t vqnegq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s8)))
-int8x16_t vqnegq_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqnegq_s8)))
-int8x16_t vqnegq(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s16)))
-int16x8_t vqrdmladhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s16)))
-int16x8_t vqrdmladhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s32)))
-int32x4_t vqrdmladhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s32)))
-int32x4_t vqrdmladhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s8)))
-int8x16_t vqrdmladhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_m_s8)))
-int8x16_t vqrdmladhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s16)))
-int16x8_t vqrdmladhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s16)))
-int16x8_t vqrdmladhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s32)))
-int32x4_t vqrdmladhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s32)))
-int32x4_t vqrdmladhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s8)))
-int8x16_t vqrdmladhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhq_s8)))
-int8x16_t vqrdmladhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s16)))
-int16x8_t vqrdmladhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s16)))
-int16x8_t vqrdmladhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s32)))
-int32x4_t vqrdmladhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s32)))
-int32x4_t vqrdmladhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s8)))
-int8x16_t vqrdmladhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_m_s8)))
-int8x16_t vqrdmladhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s16)))
-int16x8_t vqrdmladhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s16)))
-int16x8_t vqrdmladhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s32)))
-int32x4_t vqrdmladhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s32)))
-int32x4_t vqrdmladhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s8)))
-int8x16_t vqrdmladhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmladhxq_s8)))
-int8x16_t vqrdmladhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s16)))
-int16x8_t vqrdmlahq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s16)))
-int16x8_t vqrdmlahq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s32)))
-int32x4_t vqrdmlahq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s32)))
-int32x4_t vqrdmlahq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s8)))
-int8x16_t vqrdmlahq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_m_n_s8)))
-int8x16_t vqrdmlahq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s16)))
-int16x8_t vqrdmlahq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s16)))
-int16x8_t vqrdmlahq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s32)))
-int32x4_t vqrdmlahq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s32)))
-int32x4_t vqrdmlahq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s8)))
-int8x16_t vqrdmlahq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlahq_n_s8)))
-int8x16_t vqrdmlahq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s16)))
-int16x8_t vqrdmlashq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s16)))
-int16x8_t vqrdmlashq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s32)))
-int32x4_t vqrdmlashq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s32)))
-int32x4_t vqrdmlashq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s8)))
-int8x16_t vqrdmlashq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_m_n_s8)))
-int8x16_t vqrdmlashq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s16)))
-int16x8_t vqrdmlashq_n_s16(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s16)))
-int16x8_t vqrdmlashq(int16x8_t, int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s32)))
-int32x4_t vqrdmlashq_n_s32(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s32)))
-int32x4_t vqrdmlashq(int32x4_t, int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s8)))
-int8x16_t vqrdmlashq_n_s8(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlashq_n_s8)))
-int8x16_t vqrdmlashq(int8x16_t, int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s16)))
-int16x8_t vqrdmlsdhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s16)))
-int16x8_t vqrdmlsdhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s32)))
-int32x4_t vqrdmlsdhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s32)))
-int32x4_t vqrdmlsdhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s8)))
-int8x16_t vqrdmlsdhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_m_s8)))
-int8x16_t vqrdmlsdhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s16)))
-int16x8_t vqrdmlsdhq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s16)))
-int16x8_t vqrdmlsdhq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s32)))
-int32x4_t vqrdmlsdhq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s32)))
-int32x4_t vqrdmlsdhq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s8)))
-int8x16_t vqrdmlsdhq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhq_s8)))
-int8x16_t vqrdmlsdhq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s16)))
-int16x8_t vqrdmlsdhxq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s16)))
-int16x8_t vqrdmlsdhxq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s32)))
-int32x4_t vqrdmlsdhxq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s32)))
-int32x4_t vqrdmlsdhxq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s8)))
-int8x16_t vqrdmlsdhxq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_m_s8)))
-int8x16_t vqrdmlsdhxq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s16)))
-int16x8_t vqrdmlsdhxq_s16(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s16)))
-int16x8_t vqrdmlsdhxq(int16x8_t, int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s32)))
-int32x4_t vqrdmlsdhxq_s32(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s32)))
-int32x4_t vqrdmlsdhxq(int32x4_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s8)))
-int8x16_t vqrdmlsdhxq_s8(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmlsdhxq_s8)))
-int8x16_t vqrdmlsdhxq(int8x16_t, int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s16)))
-int16x8_t vqrdmulhq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s16)))
-int16x8_t vqrdmulhq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s32)))
-int32x4_t vqrdmulhq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s32)))
-int32x4_t vqrdmulhq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s8)))
-int8x16_t vqrdmulhq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_n_s8)))
-int8x16_t vqrdmulhq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s16)))
-int16x8_t vqrdmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s16)))
-int16x8_t vqrdmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s32)))
-int32x4_t vqrdmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s32)))
-int32x4_t vqrdmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s8)))
-int8x16_t vqrdmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_m_s8)))
-int8x16_t vqrdmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s16)))
-int16x8_t vqrdmulhq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s16)))
-int16x8_t vqrdmulhq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s32)))
-int32x4_t vqrdmulhq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s32)))
-int32x4_t vqrdmulhq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s8)))
-int8x16_t vqrdmulhq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_n_s8)))
-int8x16_t vqrdmulhq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s16)))
-int16x8_t vqrdmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s16)))
-int16x8_t vqrdmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s32)))
-int32x4_t vqrdmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s32)))
-int32x4_t vqrdmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s8)))
-int8x16_t vqrdmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrdmulhq_s8)))
-int8x16_t vqrdmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s16)))
-int16x8_t vqrshlq_m_n_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s16)))
-int16x8_t vqrshlq_m_n(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s32)))
-int32x4_t vqrshlq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s32)))
-int32x4_t vqrshlq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s8)))
-int8x16_t vqrshlq_m_n_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_s8)))
-int8x16_t vqrshlq_m_n(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u16)))
-uint16x8_t vqrshlq_m_n_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u16)))
-uint16x8_t vqrshlq_m_n(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u32)))
-uint32x4_t vqrshlq_m_n_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u32)))
-uint32x4_t vqrshlq_m_n(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u8)))
-uint8x16_t vqrshlq_m_n_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_n_u8)))
-uint8x16_t vqrshlq_m_n(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s16)))
-int16x8_t vqrshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s16)))
-int16x8_t vqrshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s32)))
-int32x4_t vqrshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s32)))
-int32x4_t vqrshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s8)))
-int8x16_t vqrshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_s8)))
-int8x16_t vqrshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u16)))
-uint16x8_t vqrshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u16)))
-uint16x8_t vqrshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u32)))
-uint32x4_t vqrshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u32)))
-uint32x4_t vqrshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u8)))
-uint8x16_t vqrshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_m_u8)))
-uint8x16_t vqrshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s16)))
-int16x8_t vqrshlq_n_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s16)))
-int16x8_t vqrshlq(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s32)))
-int32x4_t vqrshlq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s32)))
-int32x4_t vqrshlq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s8)))
-int8x16_t vqrshlq_n_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_s8)))
-int8x16_t vqrshlq(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u16)))
-uint16x8_t vqrshlq_n_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u16)))
-uint16x8_t vqrshlq(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u32)))
-uint32x4_t vqrshlq_n_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u32)))
-uint32x4_t vqrshlq(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u8)))
-uint8x16_t vqrshlq_n_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_n_u8)))
-uint8x16_t vqrshlq(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s16)))
-int16x8_t vqrshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s16)))
-int16x8_t vqrshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s32)))
-int32x4_t vqrshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s32)))
-int32x4_t vqrshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s8)))
-int8x16_t vqrshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_s8)))
-int8x16_t vqrshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u16)))
-uint16x8_t vqrshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u16)))
-uint16x8_t vqrshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u32)))
-uint32x4_t vqrshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u32)))
-uint32x4_t vqrshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u8)))
-uint8x16_t vqrshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshlq_u8)))
-uint8x16_t vqrshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s16)))
-int8x16_t vqrshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s16)))
-int8x16_t vqrshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s32)))
-int16x8_t vqrshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_s32)))
-int16x8_t vqrshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u16)))
-uint8x16_t vqrshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u16)))
-uint8x16_t vqrshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u32)))
-uint16x8_t vqrshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_m_n_u32)))
-uint16x8_t vqrshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s16)))
-int8x16_t vqrshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s16)))
-int8x16_t vqrshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s32)))
-int16x8_t vqrshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_s32)))
-int16x8_t vqrshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u16)))
-uint8x16_t vqrshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u16)))
-uint8x16_t vqrshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u32)))
-uint16x8_t vqrshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrnbq_n_u32)))
-uint16x8_t vqrshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s16)))
-int8x16_t vqrshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s16)))
-int8x16_t vqrshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s32)))
-int16x8_t vqrshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_s32)))
-int16x8_t vqrshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u16)))
-uint8x16_t vqrshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u16)))
-uint8x16_t vqrshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u32)))
-uint16x8_t vqrshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_m_n_u32)))
-uint16x8_t vqrshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s16)))
-int8x16_t vqrshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s16)))
-int8x16_t vqrshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s32)))
-int16x8_t vqrshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_s32)))
-int16x8_t vqrshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u16)))
-uint8x16_t vqrshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u16)))
-uint8x16_t vqrshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u32)))
-uint16x8_t vqrshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrntq_n_u32)))
-uint16x8_t vqrshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s16)))
-uint8x16_t vqrshrunbq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s16)))
-uint8x16_t vqrshrunbq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s32)))
-uint16x8_t vqrshrunbq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_m_n_s32)))
-uint16x8_t vqrshrunbq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s16)))
-uint8x16_t vqrshrunbq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s16)))
-uint8x16_t vqrshrunbq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s32)))
-uint16x8_t vqrshrunbq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshrunbq_n_s32)))
-uint16x8_t vqrshrunbq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s16)))
-uint8x16_t vqrshruntq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s16)))
-uint8x16_t vqrshruntq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s32)))
-uint16x8_t vqrshruntq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_m_n_s32)))
-uint16x8_t vqrshruntq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s16)))
-uint8x16_t vqrshruntq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s16)))
-uint8x16_t vqrshruntq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s32)))
-uint16x8_t vqrshruntq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqrshruntq_n_s32)))
-uint16x8_t vqrshruntq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s16)))
-int16x8_t vqshlq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s16)))
-int16x8_t vqshlq_m_n(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s32)))
-int32x4_t vqshlq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s32)))
-int32x4_t vqshlq_m_n(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s8)))
-int8x16_t vqshlq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_s8)))
-int8x16_t vqshlq_m_n(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u16)))
-uint16x8_t vqshlq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u16)))
-uint16x8_t vqshlq_m_n(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u32)))
-uint32x4_t vqshlq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u32)))
-uint32x4_t vqshlq_m_n(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u8)))
-uint8x16_t vqshlq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_n_u8)))
-uint8x16_t vqshlq_m_n(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s16)))
-int16x8_t vqshlq_m_r_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s16)))
-int16x8_t vqshlq_m_r(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s32)))
-int32x4_t vqshlq_m_r_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s32)))
-int32x4_t vqshlq_m_r(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s8)))
-int8x16_t vqshlq_m_r_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_s8)))
-int8x16_t vqshlq_m_r(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u16)))
-uint16x8_t vqshlq_m_r_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u16)))
-uint16x8_t vqshlq_m_r(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u32)))
-uint32x4_t vqshlq_m_r_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u32)))
-uint32x4_t vqshlq_m_r(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u8)))
-uint8x16_t vqshlq_m_r_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_r_u8)))
-uint8x16_t vqshlq_m_r(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s16)))
-int16x8_t vqshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s16)))
-int16x8_t vqshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s32)))
-int32x4_t vqshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s32)))
-int32x4_t vqshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s8)))
-int8x16_t vqshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_s8)))
-int8x16_t vqshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u16)))
-uint16x8_t vqshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u16)))
-uint16x8_t vqshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u32)))
-uint32x4_t vqshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u32)))
-uint32x4_t vqshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u8)))
-uint8x16_t vqshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_m_u8)))
-uint8x16_t vqshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s16)))
-int16x8_t vqshlq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s16)))
-int16x8_t vqshlq_n(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s32)))
-int32x4_t vqshlq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s32)))
-int32x4_t vqshlq_n(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s8)))
-int8x16_t vqshlq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_s8)))
-int8x16_t vqshlq_n(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u16)))
-uint16x8_t vqshlq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u16)))
-uint16x8_t vqshlq_n(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u32)))
-uint32x4_t vqshlq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u32)))
-uint32x4_t vqshlq_n(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u8)))
-uint8x16_t vqshlq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_n_u8)))
-uint8x16_t vqshlq_n(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s16)))
-int16x8_t vqshlq_r_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s16)))
-int16x8_t vqshlq_r(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s32)))
-int32x4_t vqshlq_r_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s32)))
-int32x4_t vqshlq_r(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s8)))
-int8x16_t vqshlq_r_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_s8)))
-int8x16_t vqshlq_r(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u16)))
-uint16x8_t vqshlq_r_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u16)))
-uint16x8_t vqshlq_r(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u32)))
-uint32x4_t vqshlq_r_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u32)))
-uint32x4_t vqshlq_r(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u8)))
-uint8x16_t vqshlq_r_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_r_u8)))
-uint8x16_t vqshlq_r(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s16)))
-int16x8_t vqshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s16)))
-int16x8_t vqshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s32)))
-int32x4_t vqshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s32)))
-int32x4_t vqshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s8)))
-int8x16_t vqshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_s8)))
-int8x16_t vqshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u16)))
-uint16x8_t vqshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u16)))
-uint16x8_t vqshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u32)))
-uint32x4_t vqshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u32)))
-uint32x4_t vqshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u8)))
-uint8x16_t vqshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshlq_u8)))
-uint8x16_t vqshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s16)))
-uint16x8_t vqshluq_m_n_s16(uint16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s16)))
-uint16x8_t vqshluq_m(uint16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s32)))
-uint32x4_t vqshluq_m_n_s32(uint32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s32)))
-uint32x4_t vqshluq_m(uint32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s8)))
-uint8x16_t vqshluq_m_n_s8(uint8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_m_n_s8)))
-uint8x16_t vqshluq_m(uint8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s16)))
-uint16x8_t vqshluq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s16)))
-uint16x8_t vqshluq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s32)))
-uint32x4_t vqshluq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s32)))
-uint32x4_t vqshluq(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s8)))
-uint8x16_t vqshluq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshluq_n_s8)))
-uint8x16_t vqshluq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s16)))
-int8x16_t vqshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s16)))
-int8x16_t vqshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s32)))
-int16x8_t vqshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_s32)))
-int16x8_t vqshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u16)))
-uint8x16_t vqshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u16)))
-uint8x16_t vqshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u32)))
-uint16x8_t vqshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_m_n_u32)))
-uint16x8_t vqshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s16)))
-int8x16_t vqshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s16)))
-int8x16_t vqshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s32)))
-int16x8_t vqshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_s32)))
-int16x8_t vqshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u16)))
-uint8x16_t vqshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u16)))
-uint8x16_t vqshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u32)))
-uint16x8_t vqshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrnbq_n_u32)))
-uint16x8_t vqshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s16)))
-int8x16_t vqshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s16)))
-int8x16_t vqshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s32)))
-int16x8_t vqshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_s32)))
-int16x8_t vqshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u16)))
-uint8x16_t vqshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u16)))
-uint8x16_t vqshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u32)))
-uint16x8_t vqshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_m_n_u32)))
-uint16x8_t vqshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s16)))
-int8x16_t vqshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s16)))
-int8x16_t vqshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s32)))
-int16x8_t vqshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_s32)))
-int16x8_t vqshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u16)))
-uint8x16_t vqshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u16)))
-uint8x16_t vqshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u32)))
-uint16x8_t vqshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrntq_n_u32)))
-uint16x8_t vqshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s16)))
-uint8x16_t vqshrunbq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s16)))
-uint8x16_t vqshrunbq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s32)))
-uint16x8_t vqshrunbq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_m_n_s32)))
-uint16x8_t vqshrunbq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s16)))
-uint8x16_t vqshrunbq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s16)))
-uint8x16_t vqshrunbq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s32)))
-uint16x8_t vqshrunbq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshrunbq_n_s32)))
-uint16x8_t vqshrunbq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s16)))
-uint8x16_t vqshruntq_m_n_s16(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s16)))
-uint8x16_t vqshruntq_m(uint8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s32)))
-uint16x8_t vqshruntq_m_n_s32(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_m_n_s32)))
-uint16x8_t vqshruntq_m(uint16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s16)))
-uint8x16_t vqshruntq_n_s16(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s16)))
-uint8x16_t vqshruntq(uint8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s32)))
-uint16x8_t vqshruntq_n_s32(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqshruntq_n_s32)))
-uint16x8_t vqshruntq(uint16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s16)))
-int16x8_t vqsubq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s16)))
-int16x8_t vqsubq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s32)))
-int32x4_t vqsubq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s32)))
-int32x4_t vqsubq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s8)))
-int8x16_t vqsubq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_s8)))
-int8x16_t vqsubq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u16)))
-uint16x8_t vqsubq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u16)))
-uint16x8_t vqsubq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u32)))
-uint32x4_t vqsubq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u32)))
-uint32x4_t vqsubq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u8)))
-uint8x16_t vqsubq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_n_u8)))
-uint8x16_t vqsubq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s16)))
-int16x8_t vqsubq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s16)))
-int16x8_t vqsubq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s32)))
-int32x4_t vqsubq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s32)))
-int32x4_t vqsubq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s8)))
-int8x16_t vqsubq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_s8)))
-int8x16_t vqsubq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u16)))
-uint16x8_t vqsubq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u16)))
-uint16x8_t vqsubq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u32)))
-uint32x4_t vqsubq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u32)))
-uint32x4_t vqsubq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u8)))
-uint8x16_t vqsubq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_m_u8)))
-uint8x16_t vqsubq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s16)))
-int16x8_t vqsubq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s16)))
-int16x8_t vqsubq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s32)))
-int32x4_t vqsubq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s32)))
-int32x4_t vqsubq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s8)))
-int8x16_t vqsubq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_s8)))
-int8x16_t vqsubq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u16)))
-uint16x8_t vqsubq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u16)))
-uint16x8_t vqsubq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u32)))
-uint32x4_t vqsubq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u32)))
-uint32x4_t vqsubq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u8)))
-uint8x16_t vqsubq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_n_u8)))
-uint8x16_t vqsubq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s16)))
-int16x8_t vqsubq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s16)))
-int16x8_t vqsubq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s32)))
-int32x4_t vqsubq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s32)))
-int32x4_t vqsubq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s8)))
-int8x16_t vqsubq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_s8)))
-int8x16_t vqsubq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u16)))
-uint16x8_t vqsubq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u16)))
-uint16x8_t vqsubq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u32)))
-uint32x4_t vqsubq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u32)))
-uint32x4_t vqsubq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u8)))
-uint8x16_t vqsubq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vqsubq_u8)))
-uint8x16_t vqsubq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s32)))
-int16x8_t vreinterpretq_s16_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s32)))
-int16x8_t vreinterpretq_s16(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s64)))
-int16x8_t vreinterpretq_s16_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s64)))
-int16x8_t vreinterpretq_s16(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s8)))
-int16x8_t vreinterpretq_s16_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_s8)))
-int16x8_t vreinterpretq_s16(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u16)))
-int16x8_t vreinterpretq_s16_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u16)))
-int16x8_t vreinterpretq_s16(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u32)))
-int16x8_t vreinterpretq_s16_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u32)))
-int16x8_t vreinterpretq_s16(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u64)))
-int16x8_t vreinterpretq_s16_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u64)))
-int16x8_t vreinterpretq_s16(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
-int16x8_t vreinterpretq_s16_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
-int16x8_t vreinterpretq_s16(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s16)))
-int32x4_t vreinterpretq_s32_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s16)))
-int32x4_t vreinterpretq_s32(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s64)))
-int32x4_t vreinterpretq_s32_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s64)))
-int32x4_t vreinterpretq_s32(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s8)))
-int32x4_t vreinterpretq_s32_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_s8)))
-int32x4_t vreinterpretq_s32(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u16)))
-int32x4_t vreinterpretq_s32_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u16)))
-int32x4_t vreinterpretq_s32(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u32)))
-int32x4_t vreinterpretq_s32_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u32)))
-int32x4_t vreinterpretq_s32(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u64)))
-int32x4_t vreinterpretq_s32_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u64)))
-int32x4_t vreinterpretq_s32(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
-int32x4_t vreinterpretq_s32_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
-int32x4_t vreinterpretq_s32(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s16)))
-int64x2_t vreinterpretq_s64_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s16)))
-int64x2_t vreinterpretq_s64(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s32)))
-int64x2_t vreinterpretq_s64_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s32)))
-int64x2_t vreinterpretq_s64(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s8)))
-int64x2_t vreinterpretq_s64_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_s8)))
-int64x2_t vreinterpretq_s64(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u16)))
-int64x2_t vreinterpretq_s64_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u16)))
-int64x2_t vreinterpretq_s64(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u32)))
-int64x2_t vreinterpretq_s64_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u32)))
-int64x2_t vreinterpretq_s64(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u64)))
-int64x2_t vreinterpretq_s64_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u64)))
-int64x2_t vreinterpretq_s64(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
-int64x2_t vreinterpretq_s64_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
-int64x2_t vreinterpretq_s64(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s16)))
-int8x16_t vreinterpretq_s8_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s16)))
-int8x16_t vreinterpretq_s8(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s32)))
-int8x16_t vreinterpretq_s8_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s32)))
-int8x16_t vreinterpretq_s8(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s64)))
-int8x16_t vreinterpretq_s8_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_s64)))
-int8x16_t vreinterpretq_s8(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u16)))
-int8x16_t vreinterpretq_s8_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u16)))
-int8x16_t vreinterpretq_s8(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u32)))
-int8x16_t vreinterpretq_s8_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u32)))
-int8x16_t vreinterpretq_s8(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u64)))
-int8x16_t vreinterpretq_s8_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u64)))
-int8x16_t vreinterpretq_s8(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
-int8x16_t vreinterpretq_s8_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
-int8x16_t vreinterpretq_s8(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s16)))
-uint16x8_t vreinterpretq_u16_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s16)))
-uint16x8_t vreinterpretq_u16(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s32)))
-uint16x8_t vreinterpretq_u16_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s32)))
-uint16x8_t vreinterpretq_u16(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s64)))
-uint16x8_t vreinterpretq_u16_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s64)))
-uint16x8_t vreinterpretq_u16(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s8)))
-uint16x8_t vreinterpretq_u16_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_s8)))
-uint16x8_t vreinterpretq_u16(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u32)))
-uint16x8_t vreinterpretq_u16_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u32)))
-uint16x8_t vreinterpretq_u16(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u64)))
-uint16x8_t vreinterpretq_u16_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u64)))
-uint16x8_t vreinterpretq_u16(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
-uint16x8_t vreinterpretq_u16_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
-uint16x8_t vreinterpretq_u16(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s16)))
-uint32x4_t vreinterpretq_u32_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s16)))
-uint32x4_t vreinterpretq_u32(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s32)))
-uint32x4_t vreinterpretq_u32_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s32)))
-uint32x4_t vreinterpretq_u32(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s64)))
-uint32x4_t vreinterpretq_u32_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s64)))
-uint32x4_t vreinterpretq_u32(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s8)))
-uint32x4_t vreinterpretq_u32_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_s8)))
-uint32x4_t vreinterpretq_u32(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u16)))
-uint32x4_t vreinterpretq_u32_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u16)))
-uint32x4_t vreinterpretq_u32(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u64)))
-uint32x4_t vreinterpretq_u32_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u64)))
-uint32x4_t vreinterpretq_u32(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
-uint32x4_t vreinterpretq_u32_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
-uint32x4_t vreinterpretq_u32(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s16)))
-uint64x2_t vreinterpretq_u64_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s16)))
-uint64x2_t vreinterpretq_u64(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s32)))
-uint64x2_t vreinterpretq_u64_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s32)))
-uint64x2_t vreinterpretq_u64(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s64)))
-uint64x2_t vreinterpretq_u64_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s64)))
-uint64x2_t vreinterpretq_u64(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s8)))
-uint64x2_t vreinterpretq_u64_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_s8)))
-uint64x2_t vreinterpretq_u64(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u16)))
-uint64x2_t vreinterpretq_u64_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u16)))
-uint64x2_t vreinterpretq_u64(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u32)))
-uint64x2_t vreinterpretq_u64_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u32)))
-uint64x2_t vreinterpretq_u64(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
-uint64x2_t vreinterpretq_u64_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
-uint64x2_t vreinterpretq_u64(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
-uint8x16_t vreinterpretq_u8_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
-uint8x16_t vreinterpretq_u8(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
-uint8x16_t vreinterpretq_u8_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
-uint8x16_t vreinterpretq_u8(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
-uint8x16_t vreinterpretq_u8_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
-uint8x16_t vreinterpretq_u8(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
-uint8x16_t vreinterpretq_u8_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
-uint8x16_t vreinterpretq_u8(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
-uint8x16_t vreinterpretq_u8_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
-uint8x16_t vreinterpretq_u8(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
-uint8x16_t vreinterpretq_u8_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
-uint8x16_t vreinterpretq_u8(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
-uint8x16_t vreinterpretq_u8_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
-uint8x16_t vreinterpretq_u8(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_s8)))
-int8x16_t vrev16q_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_s8)))
-int8x16_t vrev16q_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_u8)))
-uint8x16_t vrev16q_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_m_u8)))
-uint8x16_t vrev16q_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_s8)))
-int8x16_t vrev16q_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_s8)))
-int8x16_t vrev16q(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_u8)))
-uint8x16_t vrev16q_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_u8)))
-uint8x16_t vrev16q(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_s8)))
-int8x16_t vrev16q_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_s8)))
-int8x16_t vrev16q_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_u8)))
-uint8x16_t vrev16q_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev16q_x_u8)))
-uint8x16_t vrev16q_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s16)))
-int16x8_t vrev32q_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s16)))
-int16x8_t vrev32q_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s8)))
-int8x16_t vrev32q_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_s8)))
-int8x16_t vrev32q_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u16)))
-uint16x8_t vrev32q_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u16)))
-uint16x8_t vrev32q_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u8)))
-uint8x16_t vrev32q_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_u8)))
-uint8x16_t vrev32q_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s16)))
-int16x8_t vrev32q_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s16)))
-int16x8_t vrev32q(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s8)))
-int8x16_t vrev32q_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_s8)))
-int8x16_t vrev32q(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u16)))
-uint16x8_t vrev32q_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u16)))
-uint16x8_t vrev32q(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u8)))
-uint8x16_t vrev32q_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_u8)))
-uint8x16_t vrev32q(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s16)))
-int16x8_t vrev32q_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s16)))
-int16x8_t vrev32q_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s8)))
-int8x16_t vrev32q_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_s8)))
-int8x16_t vrev32q_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u16)))
-uint16x8_t vrev32q_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u16)))
-uint16x8_t vrev32q_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u8)))
-uint8x16_t vrev32q_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_u8)))
-uint8x16_t vrev32q_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s16)))
-int16x8_t vrev64q_m_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s16)))
-int16x8_t vrev64q_m(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s32)))
-int32x4_t vrev64q_m_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s32)))
-int32x4_t vrev64q_m(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s8)))
-int8x16_t vrev64q_m_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_s8)))
-int8x16_t vrev64q_m(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u16)))
-uint16x8_t vrev64q_m_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u16)))
-uint16x8_t vrev64q_m(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u32)))
-uint32x4_t vrev64q_m_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u32)))
-uint32x4_t vrev64q_m(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u8)))
-uint8x16_t vrev64q_m_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_u8)))
-uint8x16_t vrev64q_m(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s16)))
-int16x8_t vrev64q_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s16)))
-int16x8_t vrev64q(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s32)))
-int32x4_t vrev64q_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s32)))
-int32x4_t vrev64q(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s8)))
-int8x16_t vrev64q_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_s8)))
-int8x16_t vrev64q(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u16)))
-uint16x8_t vrev64q_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u16)))
-uint16x8_t vrev64q(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u32)))
-uint32x4_t vrev64q_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u32)))
-uint32x4_t vrev64q(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u8)))
-uint8x16_t vrev64q_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_u8)))
-uint8x16_t vrev64q(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s16)))
-int16x8_t vrev64q_x_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s16)))
-int16x8_t vrev64q_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s32)))
-int32x4_t vrev64q_x_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s32)))
-int32x4_t vrev64q_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s8)))
-int8x16_t vrev64q_x_s8(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_s8)))
-int8x16_t vrev64q_x(int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u16)))
-uint16x8_t vrev64q_x_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u16)))
-uint16x8_t vrev64q_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u32)))
-uint32x4_t vrev64q_x_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u32)))
-uint32x4_t vrev64q_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u8)))
-uint8x16_t vrev64q_x_u8(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_u8)))
-uint8x16_t vrev64q_x(uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s16)))
-int16x8_t vrhaddq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s16)))
-int16x8_t vrhaddq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s32)))
-int32x4_t vrhaddq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s32)))
-int32x4_t vrhaddq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s8)))
-int8x16_t vrhaddq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_s8)))
-int8x16_t vrhaddq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u16)))
-uint16x8_t vrhaddq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u16)))
-uint16x8_t vrhaddq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u32)))
-uint32x4_t vrhaddq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u32)))
-uint32x4_t vrhaddq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u8)))
-uint8x16_t vrhaddq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_m_u8)))
-uint8x16_t vrhaddq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s16)))
-int16x8_t vrhaddq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s16)))
-int16x8_t vrhaddq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s32)))
-int32x4_t vrhaddq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s32)))
-int32x4_t vrhaddq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s8)))
-int8x16_t vrhaddq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_s8)))
-int8x16_t vrhaddq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u16)))
-uint16x8_t vrhaddq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u16)))
-uint16x8_t vrhaddq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u32)))
-uint32x4_t vrhaddq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u32)))
-uint32x4_t vrhaddq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u8)))
-uint8x16_t vrhaddq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_u8)))
-uint8x16_t vrhaddq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s16)))
-int16x8_t vrhaddq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s16)))
-int16x8_t vrhaddq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s32)))
-int32x4_t vrhaddq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s32)))
-int32x4_t vrhaddq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s8)))
-int8x16_t vrhaddq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_s8)))
-int8x16_t vrhaddq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u16)))
-uint16x8_t vrhaddq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u16)))
-uint16x8_t vrhaddq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u32)))
-uint32x4_t vrhaddq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u32)))
-uint32x4_t vrhaddq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u8)))
-uint8x16_t vrhaddq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrhaddq_x_u8)))
-uint8x16_t vrhaddq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_s32)))
-int64_t vrmlaldavhaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_s32)))
-int64_t vrmlaldavhaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_u32)))
-uint64_t vrmlaldavhaq_p_u32(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_p_u32)))
-uint64_t vrmlaldavhaq_p(uint64_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_s32)))
-int64_t vrmlaldavhaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_s32)))
-int64_t vrmlaldavhaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_u32)))
-uint64_t vrmlaldavhaq_u32(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaq_u32)))
-uint64_t vrmlaldavhaq(uint64_t, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_p_s32)))
-int64_t vrmlaldavhaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_p_s32)))
-int64_t vrmlaldavhaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_s32)))
-int64_t vrmlaldavhaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhaxq_s32)))
-int64_t vrmlaldavhaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_s32)))
-int64_t vrmlaldavhq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_s32)))
-int64_t vrmlaldavhq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_u32)))
-uint64_t vrmlaldavhq_p_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_p_u32)))
-uint64_t vrmlaldavhq_p(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_s32)))
-int64_t vrmlaldavhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_s32)))
-int64_t vrmlaldavhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_u32)))
-uint64_t vrmlaldavhq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhq_u32)))
-uint64_t vrmlaldavhq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_p_s32)))
-int64_t vrmlaldavhxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_p_s32)))
-int64_t vrmlaldavhxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_s32)))
-int64_t vrmlaldavhxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlaldavhxq_s32)))
-int64_t vrmlaldavhxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_p_s32)))
-int64_t vrmlsldavhaq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_p_s32)))
-int64_t vrmlsldavhaq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_s32)))
-int64_t vrmlsldavhaq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaq_s32)))
-int64_t vrmlsldavhaq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_p_s32)))
-int64_t vrmlsldavhaxq_p_s32(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_p_s32)))
-int64_t vrmlsldavhaxq_p(int64_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_s32)))
-int64_t vrmlsldavhaxq_s32(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhaxq_s32)))
-int64_t vrmlsldavhaxq(int64_t, int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_p_s32)))
-int64_t vrmlsldavhq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_p_s32)))
-int64_t vrmlsldavhq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_s32)))
-int64_t vrmlsldavhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhq_s32)))
-int64_t vrmlsldavhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_p_s32)))
-int64_t vrmlsldavhxq_p_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_p_s32)))
-int64_t vrmlsldavhxq_p(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_s32)))
-int64_t vrmlsldavhxq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmlsldavhxq_s32)))
-int64_t vrmlsldavhxq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s16)))
-int16x8_t vrmulhq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s16)))
-int16x8_t vrmulhq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s32)))
-int32x4_t vrmulhq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s32)))
-int32x4_t vrmulhq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s8)))
-int8x16_t vrmulhq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_s8)))
-int8x16_t vrmulhq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u16)))
-uint16x8_t vrmulhq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u16)))
-uint16x8_t vrmulhq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u32)))
-uint32x4_t vrmulhq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u32)))
-uint32x4_t vrmulhq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u8)))
-uint8x16_t vrmulhq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_m_u8)))
-uint8x16_t vrmulhq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s16)))
-int16x8_t vrmulhq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s16)))
-int16x8_t vrmulhq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s32)))
-int32x4_t vrmulhq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s32)))
-int32x4_t vrmulhq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s8)))
-int8x16_t vrmulhq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_s8)))
-int8x16_t vrmulhq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u16)))
-uint16x8_t vrmulhq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u16)))
-uint16x8_t vrmulhq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u32)))
-uint32x4_t vrmulhq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u32)))
-uint32x4_t vrmulhq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u8)))
-uint8x16_t vrmulhq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_u8)))
-uint8x16_t vrmulhq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s16)))
-int16x8_t vrmulhq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s16)))
-int16x8_t vrmulhq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s32)))
-int32x4_t vrmulhq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s32)))
-int32x4_t vrmulhq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s8)))
-int8x16_t vrmulhq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_s8)))
-int8x16_t vrmulhq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u16)))
-uint16x8_t vrmulhq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u16)))
-uint16x8_t vrmulhq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u32)))
-uint32x4_t vrmulhq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u32)))
-uint32x4_t vrmulhq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u8)))
-uint8x16_t vrmulhq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrmulhq_x_u8)))
-uint8x16_t vrmulhq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s16)))
-int16x8_t vrshlq_m_n_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s16)))
-int16x8_t vrshlq_m_n(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s32)))
-int32x4_t vrshlq_m_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s32)))
-int32x4_t vrshlq_m_n(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s8)))
-int8x16_t vrshlq_m_n_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_s8)))
-int8x16_t vrshlq_m_n(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u16)))
-uint16x8_t vrshlq_m_n_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u16)))
-uint16x8_t vrshlq_m_n(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u32)))
-uint32x4_t vrshlq_m_n_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u32)))
-uint32x4_t vrshlq_m_n(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u8)))
-uint8x16_t vrshlq_m_n_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_n_u8)))
-uint8x16_t vrshlq_m_n(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s16)))
-int16x8_t vrshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s16)))
-int16x8_t vrshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s32)))
-int32x4_t vrshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s32)))
-int32x4_t vrshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s8)))
-int8x16_t vrshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_s8)))
-int8x16_t vrshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u16)))
-uint16x8_t vrshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u16)))
-uint16x8_t vrshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u32)))
-uint32x4_t vrshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u32)))
-uint32x4_t vrshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u8)))
-uint8x16_t vrshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_m_u8)))
-uint8x16_t vrshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s16)))
-int16x8_t vrshlq_n_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s16)))
-int16x8_t vrshlq(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s32)))
-int32x4_t vrshlq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s32)))
-int32x4_t vrshlq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s8)))
-int8x16_t vrshlq_n_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_s8)))
-int8x16_t vrshlq(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u16)))
-uint16x8_t vrshlq_n_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u16)))
-uint16x8_t vrshlq(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u32)))
-uint32x4_t vrshlq_n_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u32)))
-uint32x4_t vrshlq(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u8)))
-uint8x16_t vrshlq_n_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_n_u8)))
-uint8x16_t vrshlq(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s16)))
-int16x8_t vrshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s16)))
-int16x8_t vrshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s32)))
-int32x4_t vrshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s32)))
-int32x4_t vrshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s8)))
-int8x16_t vrshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_s8)))
-int8x16_t vrshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u16)))
-uint16x8_t vrshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u16)))
-uint16x8_t vrshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u32)))
-uint32x4_t vrshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u32)))
-uint32x4_t vrshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u8)))
-uint8x16_t vrshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_u8)))
-uint8x16_t vrshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s16)))
-int16x8_t vrshlq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s16)))
-int16x8_t vrshlq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s32)))
-int32x4_t vrshlq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s32)))
-int32x4_t vrshlq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s8)))
-int8x16_t vrshlq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_s8)))
-int8x16_t vrshlq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u16)))
-uint16x8_t vrshlq_x_u16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u16)))
-uint16x8_t vrshlq_x(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u32)))
-uint32x4_t vrshlq_x_u32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u32)))
-uint32x4_t vrshlq_x(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u8)))
-uint8x16_t vrshlq_x_u8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshlq_x_u8)))
-uint8x16_t vrshlq_x(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s16)))
-int8x16_t vrshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s16)))
-int8x16_t vrshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s32)))
-int16x8_t vrshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_s32)))
-int16x8_t vrshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u16)))
-uint8x16_t vrshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u16)))
-uint8x16_t vrshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u32)))
-uint16x8_t vrshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_m_n_u32)))
-uint16x8_t vrshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s16)))
-int8x16_t vrshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s16)))
-int8x16_t vrshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s32)))
-int16x8_t vrshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_s32)))
-int16x8_t vrshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u16)))
-uint8x16_t vrshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u16)))
-uint8x16_t vrshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u32)))
-uint16x8_t vrshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrnbq_n_u32)))
-uint16x8_t vrshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s16)))
-int8x16_t vrshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s16)))
-int8x16_t vrshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s32)))
-int16x8_t vrshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_s32)))
-int16x8_t vrshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u16)))
-uint8x16_t vrshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u16)))
-uint8x16_t vrshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u32)))
-uint16x8_t vrshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_m_n_u32)))
-uint16x8_t vrshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s16)))
-int8x16_t vrshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s16)))
-int8x16_t vrshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s32)))
-int16x8_t vrshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_s32)))
-int16x8_t vrshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u16)))
-uint8x16_t vrshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u16)))
-uint8x16_t vrshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u32)))
-uint16x8_t vrshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrntq_n_u32)))
-uint16x8_t vrshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s16)))
-int16x8_t vrshrq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s16)))
-int16x8_t vrshrq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s32)))
-int32x4_t vrshrq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s32)))
-int32x4_t vrshrq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s8)))
-int8x16_t vrshrq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_s8)))
-int8x16_t vrshrq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u16)))
-uint16x8_t vrshrq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u16)))
-uint16x8_t vrshrq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u32)))
-uint32x4_t vrshrq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u32)))
-uint32x4_t vrshrq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u8)))
-uint8x16_t vrshrq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_m_n_u8)))
-uint8x16_t vrshrq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s16)))
-int16x8_t vrshrq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s16)))
-int16x8_t vrshrq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s32)))
-int32x4_t vrshrq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s32)))
-int32x4_t vrshrq(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s8)))
-int8x16_t vrshrq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_s8)))
-int8x16_t vrshrq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u16)))
-uint16x8_t vrshrq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u16)))
-uint16x8_t vrshrq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u32)))
-uint32x4_t vrshrq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u32)))
-uint32x4_t vrshrq(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u8)))
-uint8x16_t vrshrq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_n_u8)))
-uint8x16_t vrshrq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s16)))
-int16x8_t vrshrq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s16)))
-int16x8_t vrshrq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s32)))
-int32x4_t vrshrq_x_n_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s32)))
-int32x4_t vrshrq_x(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s8)))
-int8x16_t vrshrq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_s8)))
-int8x16_t vrshrq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u16)))
-uint16x8_t vrshrq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u16)))
-uint16x8_t vrshrq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u32)))
-uint32x4_t vrshrq_x_n_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u32)))
-uint32x4_t vrshrq_x(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u8)))
-uint8x16_t vrshrq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrshrq_x_n_u8)))
-uint8x16_t vrshrq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_s32)))
-int32x4_t vsbciq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_s32)))
-int32x4_t vsbciq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_u32)))
-uint32x4_t vsbciq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_m_u32)))
-uint32x4_t vsbciq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_s32)))
-int32x4_t vsbciq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_s32)))
-int32x4_t vsbciq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_u32)))
-uint32x4_t vsbciq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbciq_u32)))
-uint32x4_t vsbciq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_s32)))
-int32x4_t vsbcq_m_s32(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_s32)))
-int32x4_t vsbcq_m(int32x4_t, int32x4_t, int32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_u32)))
-uint32x4_t vsbcq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_m_u32)))
-uint32x4_t vsbcq_m(uint32x4_t, uint32x4_t, uint32x4_t, unsigned *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_s32)))
-int32x4_t vsbcq_s32(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_s32)))
-int32x4_t vsbcq(int32x4_t, int32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_u32)))
-uint32x4_t vsbcq_u32(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsbcq_u32)))
-uint32x4_t vsbcq(uint32x4_t, uint32x4_t, unsigned *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s16)))
-int16x8_t vsetq_lane_s16(int16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s16)))
-int16x8_t vsetq_lane(int16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s32)))
-int32x4_t vsetq_lane_s32(int32_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s32)))
-int32x4_t vsetq_lane(int32_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s64)))
-int64x2_t vsetq_lane_s64(int64_t, int64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s64)))
-int64x2_t vsetq_lane(int64_t, int64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s8)))
-int8x16_t vsetq_lane_s8(int8_t, int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_s8)))
-int8x16_t vsetq_lane(int8_t, int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u16)))
-uint16x8_t vsetq_lane_u16(uint16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u16)))
-uint16x8_t vsetq_lane(uint16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u32)))
-uint32x4_t vsetq_lane_u32(uint32_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u32)))
-uint32x4_t vsetq_lane(uint32_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u64)))
-uint64x2_t vsetq_lane_u64(uint64_t, uint64x2_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u64)))
-uint64x2_t vsetq_lane(uint64_t, uint64x2_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u8)))
-uint8x16_t vsetq_lane_u8(uint8_t, uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_u8)))
-uint8x16_t vsetq_lane(uint8_t, uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s16)))
-int16x8_t vshlcq_m_s16(int16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s16)))
-int16x8_t vshlcq_m(int16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s32)))
-int32x4_t vshlcq_m_s32(int32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s32)))
-int32x4_t vshlcq_m(int32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s8)))
-int8x16_t vshlcq_m_s8(int8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_s8)))
-int8x16_t vshlcq_m(int8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u16)))
-uint16x8_t vshlcq_m_u16(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u16)))
-uint16x8_t vshlcq_m(uint16x8_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u32)))
-uint32x4_t vshlcq_m_u32(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u32)))
-uint32x4_t vshlcq_m(uint32x4_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u8)))
-uint8x16_t vshlcq_m_u8(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_m_u8)))
-uint8x16_t vshlcq_m(uint8x16_t, uint32_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s16)))
-int16x8_t vshlcq_s16(int16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s16)))
-int16x8_t vshlcq(int16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s32)))
-int32x4_t vshlcq_s32(int32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s32)))
-int32x4_t vshlcq(int32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s8)))
-int8x16_t vshlcq_s8(int8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_s8)))
-int8x16_t vshlcq(int8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u16)))
-uint16x8_t vshlcq_u16(uint16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u16)))
-uint16x8_t vshlcq(uint16x8_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u32)))
-uint32x4_t vshlcq_u32(uint32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u32)))
-uint32x4_t vshlcq(uint32x4_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u8)))
-uint8x16_t vshlcq_u8(uint8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlcq_u8)))
-uint8x16_t vshlcq(uint8x16_t, uint32_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s16)))
-int32x4_t vshllbq_m_n_s16(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s16)))
-int32x4_t vshllbq_m(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s8)))
-int16x8_t vshllbq_m_n_s8(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_s8)))
-int16x8_t vshllbq_m(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u16)))
-uint32x4_t vshllbq_m_n_u16(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u16)))
-uint32x4_t vshllbq_m(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u8)))
-uint16x8_t vshllbq_m_n_u8(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_m_n_u8)))
-uint16x8_t vshllbq_m(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s16)))
-int32x4_t vshllbq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s16)))
-int32x4_t vshllbq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s8)))
-int16x8_t vshllbq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_s8)))
-int16x8_t vshllbq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u16)))
-uint32x4_t vshllbq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u16)))
-uint32x4_t vshllbq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u8)))
-uint16x8_t vshllbq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_n_u8)))
-uint16x8_t vshllbq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s16)))
-int32x4_t vshllbq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s16)))
-int32x4_t vshllbq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s8)))
-int16x8_t vshllbq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_s8)))
-int16x8_t vshllbq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u16)))
-uint32x4_t vshllbq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u16)))
-uint32x4_t vshllbq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u8)))
-uint16x8_t vshllbq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshllbq_x_n_u8)))
-uint16x8_t vshllbq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s16)))
-int32x4_t vshlltq_m_n_s16(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s16)))
-int32x4_t vshlltq_m(int32x4_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s8)))
-int16x8_t vshlltq_m_n_s8(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_s8)))
-int16x8_t vshlltq_m(int16x8_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u16)))
-uint32x4_t vshlltq_m_n_u16(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u16)))
-uint32x4_t vshlltq_m(uint32x4_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u8)))
-uint16x8_t vshlltq_m_n_u8(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_m_n_u8)))
-uint16x8_t vshlltq_m(uint16x8_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s16)))
-int32x4_t vshlltq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s16)))
-int32x4_t vshlltq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s8)))
-int16x8_t vshlltq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_s8)))
-int16x8_t vshlltq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u16)))
-uint32x4_t vshlltq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u16)))
-uint32x4_t vshlltq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u8)))
-uint16x8_t vshlltq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_n_u8)))
-uint16x8_t vshlltq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s16)))
-int32x4_t vshlltq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s16)))
-int32x4_t vshlltq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s8)))
-int16x8_t vshlltq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_s8)))
-int16x8_t vshlltq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u16)))
-uint32x4_t vshlltq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u16)))
-uint32x4_t vshlltq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u8)))
-uint16x8_t vshlltq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlltq_x_n_u8)))
-uint16x8_t vshlltq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s16)))
-int16x8_t vshlq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s16)))
-int16x8_t vshlq_m_n(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s32)))
-int32x4_t vshlq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s32)))
-int32x4_t vshlq_m_n(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s8)))
-int8x16_t vshlq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_s8)))
-int8x16_t vshlq_m_n(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u16)))
-uint16x8_t vshlq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u16)))
-uint16x8_t vshlq_m_n(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u32)))
-uint32x4_t vshlq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u32)))
-uint32x4_t vshlq_m_n(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u8)))
-uint8x16_t vshlq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_n_u8)))
-uint8x16_t vshlq_m_n(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s16)))
-int16x8_t vshlq_m_r_s16(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s16)))
-int16x8_t vshlq_m_r(int16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s32)))
-int32x4_t vshlq_m_r_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s32)))
-int32x4_t vshlq_m_r(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s8)))
-int8x16_t vshlq_m_r_s8(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_s8)))
-int8x16_t vshlq_m_r(int8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u16)))
-uint16x8_t vshlq_m_r_u16(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u16)))
-uint16x8_t vshlq_m_r(uint16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u32)))
-uint32x4_t vshlq_m_r_u32(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u32)))
-uint32x4_t vshlq_m_r(uint32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u8)))
-uint8x16_t vshlq_m_r_u8(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_r_u8)))
-uint8x16_t vshlq_m_r(uint8x16_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s16)))
-int16x8_t vshlq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s16)))
-int16x8_t vshlq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s32)))
-int32x4_t vshlq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s32)))
-int32x4_t vshlq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s8)))
-int8x16_t vshlq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_s8)))
-int8x16_t vshlq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u16)))
-uint16x8_t vshlq_m_u16(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u16)))
-uint16x8_t vshlq_m(uint16x8_t, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u32)))
-uint32x4_t vshlq_m_u32(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u32)))
-uint32x4_t vshlq_m(uint32x4_t, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u8)))
-uint8x16_t vshlq_m_u8(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_m_u8)))
-uint8x16_t vshlq_m(uint8x16_t, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s16)))
-int16x8_t vshlq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s16)))
-int16x8_t vshlq_n(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s32)))
-int32x4_t vshlq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s32)))
-int32x4_t vshlq_n(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s8)))
-int8x16_t vshlq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_s8)))
-int8x16_t vshlq_n(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u16)))
-uint16x8_t vshlq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u16)))
-uint16x8_t vshlq_n(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u32)))
-uint32x4_t vshlq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u32)))
-uint32x4_t vshlq_n(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u8)))
-uint8x16_t vshlq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_n_u8)))
-uint8x16_t vshlq_n(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s16)))
-int16x8_t vshlq_r_s16(int16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s16)))
-int16x8_t vshlq_r(int16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s32)))
-int32x4_t vshlq_r_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s32)))
-int32x4_t vshlq_r(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s8)))
-int8x16_t vshlq_r_s8(int8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_s8)))
-int8x16_t vshlq_r(int8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u16)))
-uint16x8_t vshlq_r_u16(uint16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u16)))
-uint16x8_t vshlq_r(uint16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u32)))
-uint32x4_t vshlq_r_u32(uint32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u32)))
-uint32x4_t vshlq_r(uint32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u8)))
-uint8x16_t vshlq_r_u8(uint8x16_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_r_u8)))
-uint8x16_t vshlq_r(uint8x16_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s16)))
-int16x8_t vshlq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s16)))
-int16x8_t vshlq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s32)))
-int32x4_t vshlq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s32)))
-int32x4_t vshlq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s8)))
-int8x16_t vshlq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_s8)))
-int8x16_t vshlq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u16)))
-uint16x8_t vshlq_u16(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u16)))
-uint16x8_t vshlq(uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u32)))
-uint32x4_t vshlq_u32(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u32)))
-uint32x4_t vshlq(uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u8)))
-uint8x16_t vshlq_u8(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_u8)))
-uint8x16_t vshlq(uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s16)))
-int16x8_t vshlq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s16)))
-int16x8_t vshlq_x_n(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s32)))
-int32x4_t vshlq_x_n_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s32)))
-int32x4_t vshlq_x_n(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s8)))
-int8x16_t vshlq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_s8)))
-int8x16_t vshlq_x_n(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u16)))
-uint16x8_t vshlq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u16)))
-uint16x8_t vshlq_x_n(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u32)))
-uint32x4_t vshlq_x_n_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u32)))
-uint32x4_t vshlq_x_n(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u8)))
-uint8x16_t vshlq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_n_u8)))
-uint8x16_t vshlq_x_n(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s16)))
-int16x8_t vshlq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s16)))
-int16x8_t vshlq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s32)))
-int32x4_t vshlq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s32)))
-int32x4_t vshlq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s8)))
-int8x16_t vshlq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_s8)))
-int8x16_t vshlq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u16)))
-uint16x8_t vshlq_x_u16(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u16)))
-uint16x8_t vshlq_x(uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u32)))
-uint32x4_t vshlq_x_u32(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u32)))
-uint32x4_t vshlq_x(uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u8)))
-uint8x16_t vshlq_x_u8(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshlq_x_u8)))
-uint8x16_t vshlq_x(uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s16)))
-int8x16_t vshrnbq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s16)))
-int8x16_t vshrnbq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s32)))
-int16x8_t vshrnbq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_s32)))
-int16x8_t vshrnbq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u16)))
-uint8x16_t vshrnbq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u16)))
-uint8x16_t vshrnbq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u32)))
-uint16x8_t vshrnbq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_m_n_u32)))
-uint16x8_t vshrnbq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s16)))
-int8x16_t vshrnbq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s16)))
-int8x16_t vshrnbq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s32)))
-int16x8_t vshrnbq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_s32)))
-int16x8_t vshrnbq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u16)))
-uint8x16_t vshrnbq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u16)))
-uint8x16_t vshrnbq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u32)))
-uint16x8_t vshrnbq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrnbq_n_u32)))
-uint16x8_t vshrnbq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s16)))
-int8x16_t vshrntq_m_n_s16(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s16)))
-int8x16_t vshrntq_m(int8x16_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s32)))
-int16x8_t vshrntq_m_n_s32(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_s32)))
-int16x8_t vshrntq_m(int16x8_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u16)))
-uint8x16_t vshrntq_m_n_u16(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u16)))
-uint8x16_t vshrntq_m(uint8x16_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u32)))
-uint16x8_t vshrntq_m_n_u32(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_m_n_u32)))
-uint16x8_t vshrntq_m(uint16x8_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s16)))
-int8x16_t vshrntq_n_s16(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s16)))
-int8x16_t vshrntq(int8x16_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s32)))
-int16x8_t vshrntq_n_s32(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_s32)))
-int16x8_t vshrntq(int16x8_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u16)))
-uint8x16_t vshrntq_n_u16(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u16)))
-uint8x16_t vshrntq(uint8x16_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u32)))
-uint16x8_t vshrntq_n_u32(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrntq_n_u32)))
-uint16x8_t vshrntq(uint16x8_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s16)))
-int16x8_t vshrq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s16)))
-int16x8_t vshrq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s32)))
-int32x4_t vshrq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s32)))
-int32x4_t vshrq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s8)))
-int8x16_t vshrq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_s8)))
-int8x16_t vshrq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u16)))
-uint16x8_t vshrq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u16)))
-uint16x8_t vshrq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u32)))
-uint32x4_t vshrq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u32)))
-uint32x4_t vshrq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u8)))
-uint8x16_t vshrq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_m_n_u8)))
-uint8x16_t vshrq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s16)))
-int16x8_t vshrq_n_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s16)))
-int16x8_t vshrq(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s32)))
-int32x4_t vshrq_n_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s32)))
-int32x4_t vshrq(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s8)))
-int8x16_t vshrq_n_s8(int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_s8)))
-int8x16_t vshrq(int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u16)))
-uint16x8_t vshrq_n_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u16)))
-uint16x8_t vshrq(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u32)))
-uint32x4_t vshrq_n_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u32)))
-uint32x4_t vshrq(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u8)))
-uint8x16_t vshrq_n_u8(uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_n_u8)))
-uint8x16_t vshrq(uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s16)))
-int16x8_t vshrq_x_n_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s16)))
-int16x8_t vshrq_x(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s32)))
-int32x4_t vshrq_x_n_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s32)))
-int32x4_t vshrq_x(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s8)))
-int8x16_t vshrq_x_n_s8(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_s8)))
-int8x16_t vshrq_x(int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u16)))
-uint16x8_t vshrq_x_n_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u16)))
-uint16x8_t vshrq_x(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u32)))
-uint32x4_t vshrq_x_n_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u32)))
-uint32x4_t vshrq_x(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u8)))
-uint8x16_t vshrq_x_n_u8(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vshrq_x_n_u8)))
-uint8x16_t vshrq_x(uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s16)))
-int16x8_t vsliq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s16)))
-int16x8_t vsliq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s32)))
-int32x4_t vsliq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s32)))
-int32x4_t vsliq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s8)))
-int8x16_t vsliq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_s8)))
-int8x16_t vsliq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u16)))
-uint16x8_t vsliq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u16)))
-uint16x8_t vsliq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u32)))
-uint32x4_t vsliq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u32)))
-uint32x4_t vsliq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u8)))
-uint8x16_t vsliq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_m_n_u8)))
-uint8x16_t vsliq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s16)))
-int16x8_t vsliq_n_s16(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s16)))
-int16x8_t vsliq(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s32)))
-int32x4_t vsliq_n_s32(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s32)))
-int32x4_t vsliq(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s8)))
-int8x16_t vsliq_n_s8(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_s8)))
-int8x16_t vsliq(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u16)))
-uint16x8_t vsliq_n_u16(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u16)))
-uint16x8_t vsliq(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u32)))
-uint32x4_t vsliq_n_u32(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u32)))
-uint32x4_t vsliq(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u8)))
-uint8x16_t vsliq_n_u8(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsliq_n_u8)))
-uint8x16_t vsliq(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s16)))
-int16x8_t vsriq_m_n_s16(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s16)))
-int16x8_t vsriq_m(int16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s32)))
-int32x4_t vsriq_m_n_s32(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s32)))
-int32x4_t vsriq_m(int32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s8)))
-int8x16_t vsriq_m_n_s8(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_s8)))
-int8x16_t vsriq_m(int8x16_t, int8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u16)))
-uint16x8_t vsriq_m_n_u16(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u16)))
-uint16x8_t vsriq_m(uint16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u32)))
-uint32x4_t vsriq_m_n_u32(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u32)))
-uint32x4_t vsriq_m(uint32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u8)))
-uint8x16_t vsriq_m_n_u8(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_m_n_u8)))
-uint8x16_t vsriq_m(uint8x16_t, uint8x16_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s16)))
-int16x8_t vsriq_n_s16(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s16)))
-int16x8_t vsriq(int16x8_t, int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s32)))
-int32x4_t vsriq_n_s32(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s32)))
-int32x4_t vsriq(int32x4_t, int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s8)))
-int8x16_t vsriq_n_s8(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_s8)))
-int8x16_t vsriq(int8x16_t, int8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u16)))
-uint16x8_t vsriq_n_u16(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u16)))
-uint16x8_t vsriq(uint16x8_t, uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u32)))
-uint32x4_t vsriq_n_u32(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u32)))
-uint32x4_t vsriq(uint32x4_t, uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u8)))
-uint8x16_t vsriq_n_u8(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsriq_n_u8)))
-uint8x16_t vsriq(uint8x16_t, uint8x16_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s16)))
-void vst1q_p_s16(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s16)))
-void vst1q_p(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s32)))
-void vst1q_p_s32(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s32)))
-void vst1q_p(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s8)))
-void vst1q_p_s8(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_s8)))
-void vst1q_p(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u16)))
-void vst1q_p_u16(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u16)))
-void vst1q_p(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u32)))
-void vst1q_p_u32(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u32)))
-void vst1q_p(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u8)))
-void vst1q_p_u8(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_u8)))
-void vst1q_p(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s16)))
-void vst1q_s16(int16_t *, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s16)))
-void vst1q(int16_t *, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s32)))
-void vst1q_s32(int32_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s32)))
-void vst1q(int32_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s8)))
-void vst1q_s8(int8_t *, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_s8)))
-void vst1q(int8_t *, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u16)))
-void vst1q_u16(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u16)))
-void vst1q(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u32)))
-void vst1q_u32(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u32)))
-void vst1q(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u8)))
-void vst1q_u8(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_u8)))
-void vst1q(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s16)))
-void vst2q_s16(int16_t *, int16x8x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s16)))
-void vst2q(int16_t *, int16x8x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s32)))
-void vst2q_s32(int32_t *, int32x4x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s32)))
-void vst2q(int32_t *, int32x4x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s8)))
-void vst2q_s8(int8_t *, int8x16x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_s8)))
-void vst2q(int8_t *, int8x16x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u16)))
-void vst2q_u16(uint16_t *, uint16x8x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u16)))
-void vst2q(uint16_t *, uint16x8x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u32)))
-void vst2q_u32(uint32_t *, uint32x4x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u32)))
-void vst2q(uint32_t *, uint32x4x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u8)))
-void vst2q_u8(uint8_t *, uint8x16x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_u8)))
-void vst2q(uint8_t *, uint8x16x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s16)))
-void vst4q_s16(int16_t *, int16x8x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s16)))
-void vst4q(int16_t *, int16x8x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s32)))
-void vst4q_s32(int32_t *, int32x4x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s32)))
-void vst4q(int32_t *, int32x4x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s8)))
-void vst4q_s8(int8_t *, int8x16x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_s8)))
-void vst4q(int8_t *, int8x16x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u16)))
-void vst4q_u16(uint16_t *, uint16x8x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u16)))
-void vst4q(uint16_t *, uint16x8x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u32)))
-void vst4q_u32(uint32_t *, uint32x4x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u32)))
-void vst4q(uint32_t *, uint32x4x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u8)))
-void vst4q_u8(uint8_t *, uint8x16x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_u8)))
-void vst4q(uint8_t *, uint8x16x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s16)))
-void vstrbq_p_s16(int8_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s16)))
-void vstrbq_p(int8_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s32)))
-void vstrbq_p_s32(int8_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s32)))
-void vstrbq_p(int8_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s8)))
-void vstrbq_p_s8(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_s8)))
-void vstrbq_p(int8_t *, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u16)))
-void vstrbq_p_u16(uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u16)))
-void vstrbq_p(uint8_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u32)))
-void vstrbq_p_u32(uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u32)))
-void vstrbq_p(uint8_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u8)))
-void vstrbq_p_u8(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_p_u8)))
-void vstrbq_p(uint8_t *, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s16)))
-void vstrbq_s16(int8_t *, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s16)))
-void vstrbq(int8_t *, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s32)))
-void vstrbq_s32(int8_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s32)))
-void vstrbq(int8_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s8)))
-void vstrbq_s8(int8_t *, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_s8)))
-void vstrbq(int8_t *, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s16)))
-void vstrbq_scatter_offset_p_s16(int8_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s16)))
-void vstrbq_scatter_offset_p(int8_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s32)))
-void vstrbq_scatter_offset_p_s32(int8_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s32)))
-void vstrbq_scatter_offset_p(int8_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s8)))
-void vstrbq_scatter_offset_p_s8(int8_t *, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_s8)))
-void vstrbq_scatter_offset_p(int8_t *, uint8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u16)))
-void vstrbq_scatter_offset_p_u16(uint8_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u16)))
-void vstrbq_scatter_offset_p(uint8_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u32)))
-void vstrbq_scatter_offset_p_u32(uint8_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u32)))
-void vstrbq_scatter_offset_p(uint8_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u8)))
-void vstrbq_scatter_offset_p_u8(uint8_t *, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_p_u8)))
-void vstrbq_scatter_offset_p(uint8_t *, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s16)))
-void vstrbq_scatter_offset_s16(int8_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s16)))
-void vstrbq_scatter_offset(int8_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s32)))
-void vstrbq_scatter_offset_s32(int8_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s32)))
-void vstrbq_scatter_offset(int8_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s8)))
-void vstrbq_scatter_offset_s8(int8_t *, uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_s8)))
-void vstrbq_scatter_offset(int8_t *, uint8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u16)))
-void vstrbq_scatter_offset_u16(uint8_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u16)))
-void vstrbq_scatter_offset(uint8_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u32)))
-void vstrbq_scatter_offset_u32(uint8_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u32)))
-void vstrbq_scatter_offset(uint8_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u8)))
-void vstrbq_scatter_offset_u8(uint8_t *, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_scatter_offset_u8)))
-void vstrbq_scatter_offset(uint8_t *, uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u16)))
-void vstrbq_u16(uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u16)))
-void vstrbq(uint8_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u32)))
-void vstrbq_u32(uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u32)))
-void vstrbq(uint8_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u8)))
-void vstrbq_u8(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrbq_u8)))
-void vstrbq(uint8_t *, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_s64)))
-void vstrdq_scatter_base_p_s64(uint64x2_t, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_s64)))
-void vstrdq_scatter_base_p(uint64x2_t, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_u64)))
-void vstrdq_scatter_base_p_u64(uint64x2_t, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_p_u64)))
-void vstrdq_scatter_base_p(uint64x2_t, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_s64)))
-void vstrdq_scatter_base_s64(uint64x2_t, int, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_s64)))
-void vstrdq_scatter_base(uint64x2_t, int, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_u64)))
-void vstrdq_scatter_base_u64(uint64x2_t, int, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_u64)))
-void vstrdq_scatter_base(uint64x2_t, int, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_s64)))
-void vstrdq_scatter_base_wb_p_s64(uint64x2_t *, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_s64)))
-void vstrdq_scatter_base_wb_p(uint64x2_t *, int, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_u64)))
-void vstrdq_scatter_base_wb_p_u64(uint64x2_t *, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_p_u64)))
-void vstrdq_scatter_base_wb_p(uint64x2_t *, int, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_s64)))
-void vstrdq_scatter_base_wb_s64(uint64x2_t *, int, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_s64)))
-void vstrdq_scatter_base_wb(uint64x2_t *, int, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_u64)))
-void vstrdq_scatter_base_wb_u64(uint64x2_t *, int, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_base_wb_u64)))
-void vstrdq_scatter_base_wb(uint64x2_t *, int, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_s64)))
-void vstrdq_scatter_offset_p_s64(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_s64)))
-void vstrdq_scatter_offset_p(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_u64)))
-void vstrdq_scatter_offset_p_u64(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_p_u64)))
-void vstrdq_scatter_offset_p(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_s64)))
-void vstrdq_scatter_offset_s64(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_s64)))
-void vstrdq_scatter_offset(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_u64)))
-void vstrdq_scatter_offset_u64(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_offset_u64)))
-void vstrdq_scatter_offset(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_s64)))
-void vstrdq_scatter_shifted_offset_p_s64(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_s64)))
-void vstrdq_scatter_shifted_offset_p(int64_t *, uint64x2_t, int64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_u64)))
-void vstrdq_scatter_shifted_offset_p_u64(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_p_u64)))
-void vstrdq_scatter_shifted_offset_p(uint64_t *, uint64x2_t, uint64x2_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_s64)))
-void vstrdq_scatter_shifted_offset_s64(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_s64)))
-void vstrdq_scatter_shifted_offset(int64_t *, uint64x2_t, int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_u64)))
-void vstrdq_scatter_shifted_offset_u64(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrdq_scatter_shifted_offset_u64)))
-void vstrdq_scatter_shifted_offset(uint64_t *, uint64x2_t, uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s16)))
-void vstrhq_p_s16(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s16)))
-void vstrhq_p(int16_t *, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s32)))
-void vstrhq_p_s32(int16_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_s32)))
-void vstrhq_p(int16_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u16)))
-void vstrhq_p_u16(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u16)))
-void vstrhq_p(uint16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u32)))
-void vstrhq_p_u32(uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_u32)))
-void vstrhq_p(uint16_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s16)))
-void vstrhq_s16(int16_t *, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s16)))
-void vstrhq(int16_t *, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s32)))
-void vstrhq_s32(int16_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_s32)))
-void vstrhq(int16_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s16)))
-void vstrhq_scatter_offset_p_s16(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s16)))
-void vstrhq_scatter_offset_p(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s32)))
-void vstrhq_scatter_offset_p_s32(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_s32)))
-void vstrhq_scatter_offset_p(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u16)))
-void vstrhq_scatter_offset_p_u16(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u16)))
-void vstrhq_scatter_offset_p(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u32)))
-void vstrhq_scatter_offset_p_u32(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_u32)))
-void vstrhq_scatter_offset_p(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s16)))
-void vstrhq_scatter_offset_s16(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s16)))
-void vstrhq_scatter_offset(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s32)))
-void vstrhq_scatter_offset_s32(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_s32)))
-void vstrhq_scatter_offset(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u16)))
-void vstrhq_scatter_offset_u16(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u16)))
-void vstrhq_scatter_offset(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u32)))
-void vstrhq_scatter_offset_u32(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_u32)))
-void vstrhq_scatter_offset(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s16)))
-void vstrhq_scatter_shifted_offset_p_s16(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s16)))
-void vstrhq_scatter_shifted_offset_p(int16_t *, uint16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s32)))
-void vstrhq_scatter_shifted_offset_p_s32(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_s32)))
-void vstrhq_scatter_shifted_offset_p(int16_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u16)))
-void vstrhq_scatter_shifted_offset_p_u16(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u16)))
-void vstrhq_scatter_shifted_offset_p(uint16_t *, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u32)))
-void vstrhq_scatter_shifted_offset_p_u32(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_u32)))
-void vstrhq_scatter_shifted_offset_p(uint16_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s16)))
-void vstrhq_scatter_shifted_offset_s16(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s16)))
-void vstrhq_scatter_shifted_offset(int16_t *, uint16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s32)))
-void vstrhq_scatter_shifted_offset_s32(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_s32)))
-void vstrhq_scatter_shifted_offset(int16_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u16)))
-void vstrhq_scatter_shifted_offset_u16(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u16)))
-void vstrhq_scatter_shifted_offset(uint16_t *, uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u32)))
-void vstrhq_scatter_shifted_offset_u32(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_u32)))
-void vstrhq_scatter_shifted_offset(uint16_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u16)))
-void vstrhq_u16(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u16)))
-void vstrhq(uint16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u32)))
-void vstrhq_u32(uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_u32)))
-void vstrhq(uint16_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_s32)))
-void vstrwq_p_s32(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_s32)))
-void vstrwq_p(int32_t *, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_u32)))
-void vstrwq_p_u32(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_u32)))
-void vstrwq_p(uint32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_s32)))
-void vstrwq_s32(int32_t *, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_s32)))
-void vstrwq(int32_t *, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_s32)))
-void vstrwq_scatter_base_p_s32(uint32x4_t, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_s32)))
-void vstrwq_scatter_base_p(uint32x4_t, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_u32)))
-void vstrwq_scatter_base_p_u32(uint32x4_t, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_u32)))
-void vstrwq_scatter_base_p(uint32x4_t, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_s32)))
-void vstrwq_scatter_base_s32(uint32x4_t, int, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_s32)))
-void vstrwq_scatter_base(uint32x4_t, int, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_u32)))
-void vstrwq_scatter_base_u32(uint32x4_t, int, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_u32)))
-void vstrwq_scatter_base(uint32x4_t, int, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_s32)))
-void vstrwq_scatter_base_wb_p_s32(uint32x4_t *, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_s32)))
-void vstrwq_scatter_base_wb_p(uint32x4_t *, int, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_u32)))
-void vstrwq_scatter_base_wb_p_u32(uint32x4_t *, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_u32)))
-void vstrwq_scatter_base_wb_p(uint32x4_t *, int, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_s32)))
-void vstrwq_scatter_base_wb_s32(uint32x4_t *, int, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_s32)))
-void vstrwq_scatter_base_wb(uint32x4_t *, int, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_u32)))
-void vstrwq_scatter_base_wb_u32(uint32x4_t *, int, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_u32)))
-void vstrwq_scatter_base_wb(uint32x4_t *, int, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_s32)))
-void vstrwq_scatter_offset_p_s32(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_s32)))
-void vstrwq_scatter_offset_p(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_u32)))
-void vstrwq_scatter_offset_p_u32(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_u32)))
-void vstrwq_scatter_offset_p(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_s32)))
-void vstrwq_scatter_offset_s32(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_s32)))
-void vstrwq_scatter_offset(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_u32)))
-void vstrwq_scatter_offset_u32(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_u32)))
-void vstrwq_scatter_offset(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_s32)))
-void vstrwq_scatter_shifted_offset_p_s32(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_s32)))
-void vstrwq_scatter_shifted_offset_p(int32_t *, uint32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_u32)))
-void vstrwq_scatter_shifted_offset_p_u32(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_u32)))
-void vstrwq_scatter_shifted_offset_p(uint32_t *, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_s32)))
-void vstrwq_scatter_shifted_offset_s32(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_s32)))
-void vstrwq_scatter_shifted_offset(int32_t *, uint32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_u32)))
-void vstrwq_scatter_shifted_offset_u32(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_u32)))
-void vstrwq_scatter_shifted_offset(uint32_t *, uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_u32)))
-void vstrwq_u32(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_u32)))
-void vstrwq(uint32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s16)))
-int16x8_t vsubq_m_n_s16(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s16)))
-int16x8_t vsubq_m(int16x8_t, int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s32)))
-int32x4_t vsubq_m_n_s32(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s32)))
-int32x4_t vsubq_m(int32x4_t, int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s8)))
-int8x16_t vsubq_m_n_s8(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_s8)))
-int8x16_t vsubq_m(int8x16_t, int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u16)))
-uint16x8_t vsubq_m_n_u16(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u16)))
-uint16x8_t vsubq_m(uint16x8_t, uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u32)))
-uint32x4_t vsubq_m_n_u32(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u32)))
-uint32x4_t vsubq_m(uint32x4_t, uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u8)))
-uint8x16_t vsubq_m_n_u8(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_u8)))
-uint8x16_t vsubq_m(uint8x16_t, uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s16)))
-int16x8_t vsubq_m_s16(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s16)))
-int16x8_t vsubq_m(int16x8_t, int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s32)))
-int32x4_t vsubq_m_s32(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s32)))
-int32x4_t vsubq_m(int32x4_t, int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s8)))
-int8x16_t vsubq_m_s8(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_s8)))
-int8x16_t vsubq_m(int8x16_t, int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u16)))
-uint16x8_t vsubq_m_u16(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u16)))
-uint16x8_t vsubq_m(uint16x8_t, uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u32)))
-uint32x4_t vsubq_m_u32(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u32)))
-uint32x4_t vsubq_m(uint32x4_t, uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u8)))
-uint8x16_t vsubq_m_u8(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_u8)))
-uint8x16_t vsubq_m(uint8x16_t, uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s16)))
-int16x8_t vsubq_n_s16(int16x8_t, int16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s16)))
-int16x8_t vsubq(int16x8_t, int16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s32)))
-int32x4_t vsubq_n_s32(int32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s32)))
-int32x4_t vsubq(int32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s8)))
-int8x16_t vsubq_n_s8(int8x16_t, int8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_s8)))
-int8x16_t vsubq(int8x16_t, int8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u16)))
-uint16x8_t vsubq_n_u16(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u16)))
-uint16x8_t vsubq(uint16x8_t, uint16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u32)))
-uint32x4_t vsubq_n_u32(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u32)))
-uint32x4_t vsubq(uint32x4_t, uint32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u8)))
-uint8x16_t vsubq_n_u8(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_u8)))
-uint8x16_t vsubq(uint8x16_t, uint8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s16)))
-int16x8_t vsubq_s16(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s16)))
-int16x8_t vsubq(int16x8_t, int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s32)))
-int32x4_t vsubq_s32(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s32)))
-int32x4_t vsubq(int32x4_t, int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s8)))
-int8x16_t vsubq_s8(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_s8)))
-int8x16_t vsubq(int8x16_t, int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u16)))
-uint16x8_t vsubq_u16(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u16)))
-uint16x8_t vsubq(uint16x8_t, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u32)))
-uint32x4_t vsubq_u32(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u32)))
-uint32x4_t vsubq(uint32x4_t, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u8)))
-uint8x16_t vsubq_u8(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_u8)))
-uint8x16_t vsubq(uint8x16_t, uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s16)))
-int16x8_t vsubq_x_n_s16(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s16)))
-int16x8_t vsubq_x(int16x8_t, int16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s32)))
-int32x4_t vsubq_x_n_s32(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s32)))
-int32x4_t vsubq_x(int32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s8)))
-int8x16_t vsubq_x_n_s8(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_s8)))
-int8x16_t vsubq_x(int8x16_t, int8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u16)))
-uint16x8_t vsubq_x_n_u16(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u16)))
-uint16x8_t vsubq_x(uint16x8_t, uint16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u32)))
-uint32x4_t vsubq_x_n_u32(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u32)))
-uint32x4_t vsubq_x(uint32x4_t, uint32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u8)))
-uint8x16_t vsubq_x_n_u8(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_u8)))
-uint8x16_t vsubq_x(uint8x16_t, uint8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s16)))
-int16x8_t vsubq_x_s16(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s16)))
-int16x8_t vsubq_x(int16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s32)))
-int32x4_t vsubq_x_s32(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s32)))
-int32x4_t vsubq_x(int32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s8)))
-int8x16_t vsubq_x_s8(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_s8)))
-int8x16_t vsubq_x(int8x16_t, int8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u16)))
-uint16x8_t vsubq_x_u16(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u16)))
-uint16x8_t vsubq_x(uint16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u32)))
-uint32x4_t vsubq_x_u32(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u32)))
-uint32x4_t vsubq_x(uint32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u8)))
-uint8x16_t vsubq_x_u8(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_u8)))
-uint8x16_t vsubq_x(uint8x16_t, uint8x16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s16)))
-int16x8_t vuninitializedq(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s32)))
-int32x4_t vuninitializedq(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s64)))
-int64x2_t vuninitializedq(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_s8)))
-int8x16_t vuninitializedq(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u16)))
-uint16x8_t vuninitializedq(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u32)))
-uint32x4_t vuninitializedq(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u64)))
-uint64x2_t vuninitializedq(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_u8)))
-uint8x16_t vuninitializedq(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s16)))
-int16x8_t vuninitializedq_s16();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s32)))
-int32x4_t vuninitializedq_s32();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s64)))
-int64x2_t vuninitializedq_s64();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_s8)))
-int8x16_t vuninitializedq_s8();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u16)))
-uint16x8_t vuninitializedq_u16();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u32)))
-uint32x4_t vuninitializedq_u32();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u64)))
-uint64x2_t vuninitializedq_u64();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_u8)))
-uint8x16_t vuninitializedq_u8();
-
-#endif /* (!defined __ARM_MVE_PRESERVE_USER_NAMESPACE) */
-
-#if (__ARM_FEATURE_MVE & 2) && (!defined __ARM_MVE_PRESERVE_USER_NAMESPACE)
-
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f16)))
-float16x8_t vabdq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f16)))
-float16x8_t vabdq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f32)))
-float32x4_t vabdq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_f32)))
-float32x4_t vabdq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f16)))
-float16x8_t vabdq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f16)))
-float16x8_t vabdq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f32)))
-float32x4_t vabdq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_m_f32)))
-float32x4_t vabdq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f16)))
-float16x8_t vabdq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f16)))
-float16x8_t vabdq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f32)))
-float32x4_t vabdq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabdq_x_f32)))
-float32x4_t vabdq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f16)))
-float16x8_t vabsq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f16)))
-float16x8_t vabsq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f32)))
-float32x4_t vabsq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_f32)))
-float32x4_t vabsq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f16)))
-float16x8_t vabsq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f16)))
-float16x8_t vabsq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f32)))
-float32x4_t vabsq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_m_f32)))
-float32x4_t vabsq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f16)))
-float16x8_t vabsq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f16)))
-float16x8_t vabsq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f32)))
-float32x4_t vabsq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vabsq_x_f32)))
-float32x4_t vabsq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f16)))
-float16x8_t vaddq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f16)))
-float16x8_t vaddq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f32)))
-float32x4_t vaddq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_f32)))
-float32x4_t vaddq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f16)))
-float16x8_t vaddq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f16)))
-float16x8_t vaddq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f32)))
-float32x4_t vaddq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_f32)))
-float32x4_t vaddq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f16)))
-float16x8_t vaddq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f16)))
-float16x8_t vaddq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f32)))
-float32x4_t vaddq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_m_n_f32)))
-float32x4_t vaddq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f16)))
-float16x8_t vaddq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f16)))
-float16x8_t vaddq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f32)))
-float32x4_t vaddq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_n_f32)))
-float32x4_t vaddq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f16)))
-float16x8_t vaddq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f16)))
-float16x8_t vaddq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f32)))
-float32x4_t vaddq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_f32)))
-float32x4_t vaddq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f16)))
-float16x8_t vaddq_x_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f16)))
-float16x8_t vaddq_x(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f32)))
-float32x4_t vaddq_x_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vaddq_x_n_f32)))
-float32x4_t vaddq_x(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_f16)))
-float16x8_t vandq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_f16)))
-float16x8_t vandq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_f32)))
-float32x4_t vandq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_f32)))
-float32x4_t vandq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f16)))
-float16x8_t vandq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f16)))
-float16x8_t vandq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f32)))
-float32x4_t vandq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_m_f32)))
-float32x4_t vandq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f16)))
-float16x8_t vandq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f16)))
-float16x8_t vandq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f32)))
-float32x4_t vandq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vandq_x_f32)))
-float32x4_t vandq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f16)))
-float16x8_t vbicq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f16)))
-float16x8_t vbicq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f32)))
-float32x4_t vbicq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_f32)))
-float32x4_t vbicq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f16)))
-float16x8_t vbicq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f16)))
-float16x8_t vbicq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f32)))
-float32x4_t vbicq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_m_f32)))
-float32x4_t vbicq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f16)))
-float16x8_t vbicq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f16)))
-float16x8_t vbicq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f32)))
-float32x4_t vbicq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbicq_x_f32)))
-float32x4_t vbicq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f16)))
-float16x8_t vbrsrq_m_n_f16(float16x8_t, float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f16)))
-float16x8_t vbrsrq_m(float16x8_t, float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f32)))
-float32x4_t vbrsrq_m_n_f32(float32x4_t, float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_m_n_f32)))
-float32x4_t vbrsrq_m(float32x4_t, float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f16)))
-float16x8_t vbrsrq_n_f16(float16x8_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f16)))
-float16x8_t vbrsrq(float16x8_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f32)))
-float32x4_t vbrsrq_n_f32(float32x4_t, int32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_n_f32)))
-float32x4_t vbrsrq(float32x4_t, int32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f16)))
-float16x8_t vbrsrq_x_n_f16(float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f16)))
-float16x8_t vbrsrq_x(float16x8_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f32)))
-float32x4_t vbrsrq_x_n_f32(float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vbrsrq_x_n_f32)))
-float32x4_t vbrsrq_x(float32x4_t, int32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f16)))
-float16x8_t vcaddq_rot270_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f16)))
-float16x8_t vcaddq_rot270(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f32)))
-float32x4_t vcaddq_rot270_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_f32)))
-float32x4_t vcaddq_rot270(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f16)))
-float16x8_t vcaddq_rot270_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f16)))
-float16x8_t vcaddq_rot270_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f32)))
-float32x4_t vcaddq_rot270_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_m_f32)))
-float32x4_t vcaddq_rot270_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f16)))
-float16x8_t vcaddq_rot270_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f16)))
-float16x8_t vcaddq_rot270_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f32)))
-float32x4_t vcaddq_rot270_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot270_x_f32)))
-float32x4_t vcaddq_rot270_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f16)))
-float16x8_t vcaddq_rot90_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f16)))
-float16x8_t vcaddq_rot90(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f32)))
-float32x4_t vcaddq_rot90_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_f32)))
-float32x4_t vcaddq_rot90(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f16)))
-float16x8_t vcaddq_rot90_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f16)))
-float16x8_t vcaddq_rot90_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f32)))
-float32x4_t vcaddq_rot90_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_m_f32)))
-float32x4_t vcaddq_rot90_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f16)))
-float16x8_t vcaddq_rot90_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f16)))
-float16x8_t vcaddq_rot90_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f32)))
-float32x4_t vcaddq_rot90_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcaddq_rot90_x_f32)))
-float32x4_t vcaddq_rot90_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f16)))
-float16x8_t vcmlaq_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f16)))
-float16x8_t vcmlaq(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f32)))
-float32x4_t vcmlaq_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_f32)))
-float32x4_t vcmlaq(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f16)))
-float16x8_t vcmlaq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f16)))
-float16x8_t vcmlaq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f32)))
-float32x4_t vcmlaq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_m_f32)))
-float32x4_t vcmlaq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f16)))
-float16x8_t vcmlaq_rot180_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f16)))
-float16x8_t vcmlaq_rot180(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f32)))
-float32x4_t vcmlaq_rot180_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_f32)))
-float32x4_t vcmlaq_rot180(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f16)))
-float16x8_t vcmlaq_rot180_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f16)))
-float16x8_t vcmlaq_rot180_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f32)))
-float32x4_t vcmlaq_rot180_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot180_m_f32)))
-float32x4_t vcmlaq_rot180_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f16)))
-float16x8_t vcmlaq_rot270_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f16)))
-float16x8_t vcmlaq_rot270(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f32)))
-float32x4_t vcmlaq_rot270_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_f32)))
-float32x4_t vcmlaq_rot270(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f16)))
-float16x8_t vcmlaq_rot270_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f16)))
-float16x8_t vcmlaq_rot270_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f32)))
-float32x4_t vcmlaq_rot270_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot270_m_f32)))
-float32x4_t vcmlaq_rot270_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f16)))
-float16x8_t vcmlaq_rot90_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f16)))
-float16x8_t vcmlaq_rot90(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f32)))
-float32x4_t vcmlaq_rot90_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_f32)))
-float32x4_t vcmlaq_rot90(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f16)))
-float16x8_t vcmlaq_rot90_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f16)))
-float16x8_t vcmlaq_rot90_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f32)))
-float32x4_t vcmlaq_rot90_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmlaq_rot90_m_f32)))
-float32x4_t vcmlaq_rot90_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f16)))
-mve_pred16_t vcmpeqq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f16)))
-mve_pred16_t vcmpeqq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f32)))
-mve_pred16_t vcmpeqq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_f32)))
-mve_pred16_t vcmpeqq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f16)))
-mve_pred16_t vcmpeqq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f16)))
-mve_pred16_t vcmpeqq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f32)))
-mve_pred16_t vcmpeqq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_f32)))
-mve_pred16_t vcmpeqq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f16)))
-mve_pred16_t vcmpeqq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f16)))
-mve_pred16_t vcmpeqq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f32)))
-mve_pred16_t vcmpeqq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_m_n_f32)))
-mve_pred16_t vcmpeqq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f16)))
-mve_pred16_t vcmpeqq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f16)))
-mve_pred16_t vcmpeqq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f32)))
-mve_pred16_t vcmpeqq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpeqq_n_f32)))
-mve_pred16_t vcmpeqq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f16)))
-mve_pred16_t vcmpgeq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f16)))
-mve_pred16_t vcmpgeq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f32)))
-mve_pred16_t vcmpgeq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_f32)))
-mve_pred16_t vcmpgeq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f16)))
-mve_pred16_t vcmpgeq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f16)))
-mve_pred16_t vcmpgeq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f32)))
-mve_pred16_t vcmpgeq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_f32)))
-mve_pred16_t vcmpgeq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f16)))
-mve_pred16_t vcmpgeq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f16)))
-mve_pred16_t vcmpgeq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f32)))
-mve_pred16_t vcmpgeq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_m_n_f32)))
-mve_pred16_t vcmpgeq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f16)))
-mve_pred16_t vcmpgeq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f16)))
-mve_pred16_t vcmpgeq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f32)))
-mve_pred16_t vcmpgeq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgeq_n_f32)))
-mve_pred16_t vcmpgeq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f16)))
-mve_pred16_t vcmpgtq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f16)))
-mve_pred16_t vcmpgtq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f32)))
-mve_pred16_t vcmpgtq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_f32)))
-mve_pred16_t vcmpgtq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f16)))
-mve_pred16_t vcmpgtq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f16)))
-mve_pred16_t vcmpgtq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f32)))
-mve_pred16_t vcmpgtq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_f32)))
-mve_pred16_t vcmpgtq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f16)))
-mve_pred16_t vcmpgtq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f16)))
-mve_pred16_t vcmpgtq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f32)))
-mve_pred16_t vcmpgtq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_m_n_f32)))
-mve_pred16_t vcmpgtq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f16)))
-mve_pred16_t vcmpgtq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f16)))
-mve_pred16_t vcmpgtq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f32)))
-mve_pred16_t vcmpgtq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpgtq_n_f32)))
-mve_pred16_t vcmpgtq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f16)))
-mve_pred16_t vcmpleq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f16)))
-mve_pred16_t vcmpleq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f32)))
-mve_pred16_t vcmpleq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_f32)))
-mve_pred16_t vcmpleq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f16)))
-mve_pred16_t vcmpleq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f16)))
-mve_pred16_t vcmpleq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f32)))
-mve_pred16_t vcmpleq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_f32)))
-mve_pred16_t vcmpleq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f16)))
-mve_pred16_t vcmpleq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f16)))
-mve_pred16_t vcmpleq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f32)))
-mve_pred16_t vcmpleq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_m_n_f32)))
-mve_pred16_t vcmpleq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f16)))
-mve_pred16_t vcmpleq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f16)))
-mve_pred16_t vcmpleq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f32)))
-mve_pred16_t vcmpleq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpleq_n_f32)))
-mve_pred16_t vcmpleq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f16)))
-mve_pred16_t vcmpltq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f16)))
-mve_pred16_t vcmpltq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f32)))
-mve_pred16_t vcmpltq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_f32)))
-mve_pred16_t vcmpltq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f16)))
-mve_pred16_t vcmpltq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f16)))
-mve_pred16_t vcmpltq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f32)))
-mve_pred16_t vcmpltq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_f32)))
-mve_pred16_t vcmpltq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f16)))
-mve_pred16_t vcmpltq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f16)))
-mve_pred16_t vcmpltq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f32)))
-mve_pred16_t vcmpltq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_m_n_f32)))
-mve_pred16_t vcmpltq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f16)))
-mve_pred16_t vcmpltq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f16)))
-mve_pred16_t vcmpltq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f32)))
-mve_pred16_t vcmpltq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpltq_n_f32)))
-mve_pred16_t vcmpltq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f16)))
-mve_pred16_t vcmpneq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f16)))
-mve_pred16_t vcmpneq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f32)))
-mve_pred16_t vcmpneq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_f32)))
-mve_pred16_t vcmpneq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f16)))
-mve_pred16_t vcmpneq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f16)))
-mve_pred16_t vcmpneq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f32)))
-mve_pred16_t vcmpneq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_f32)))
-mve_pred16_t vcmpneq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f16)))
-mve_pred16_t vcmpneq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f16)))
-mve_pred16_t vcmpneq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f32)))
-mve_pred16_t vcmpneq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_m_n_f32)))
-mve_pred16_t vcmpneq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f16)))
-mve_pred16_t vcmpneq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f16)))
-mve_pred16_t vcmpneq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f32)))
-mve_pred16_t vcmpneq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmpneq_n_f32)))
-mve_pred16_t vcmpneq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f16)))
-float16x8_t vcmulq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f16)))
-float16x8_t vcmulq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f32)))
-float32x4_t vcmulq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_f32)))
-float32x4_t vcmulq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f16)))
-float16x8_t vcmulq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f16)))
-float16x8_t vcmulq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f32)))
-float32x4_t vcmulq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_m_f32)))
-float32x4_t vcmulq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f16)))
-float16x8_t vcmulq_rot180_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f16)))
-float16x8_t vcmulq_rot180(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f32)))
-float32x4_t vcmulq_rot180_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_f32)))
-float32x4_t vcmulq_rot180(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f16)))
-float16x8_t vcmulq_rot180_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f16)))
-float16x8_t vcmulq_rot180_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f32)))
-float32x4_t vcmulq_rot180_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_m_f32)))
-float32x4_t vcmulq_rot180_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f16)))
-float16x8_t vcmulq_rot180_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f16)))
-float16x8_t vcmulq_rot180_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f32)))
-float32x4_t vcmulq_rot180_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot180_x_f32)))
-float32x4_t vcmulq_rot180_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f16)))
-float16x8_t vcmulq_rot270_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f16)))
-float16x8_t vcmulq_rot270(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f32)))
-float32x4_t vcmulq_rot270_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_f32)))
-float32x4_t vcmulq_rot270(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f16)))
-float16x8_t vcmulq_rot270_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f16)))
-float16x8_t vcmulq_rot270_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f32)))
-float32x4_t vcmulq_rot270_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_m_f32)))
-float32x4_t vcmulq_rot270_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f16)))
-float16x8_t vcmulq_rot270_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f16)))
-float16x8_t vcmulq_rot270_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f32)))
-float32x4_t vcmulq_rot270_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot270_x_f32)))
-float32x4_t vcmulq_rot270_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f16)))
-float16x8_t vcmulq_rot90_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f16)))
-float16x8_t vcmulq_rot90(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f32)))
-float32x4_t vcmulq_rot90_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_f32)))
-float32x4_t vcmulq_rot90(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f16)))
-float16x8_t vcmulq_rot90_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f16)))
-float16x8_t vcmulq_rot90_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f32)))
-float32x4_t vcmulq_rot90_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_m_f32)))
-float32x4_t vcmulq_rot90_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f16)))
-float16x8_t vcmulq_rot90_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f16)))
-float16x8_t vcmulq_rot90_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f32)))
-float32x4_t vcmulq_rot90_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_rot90_x_f32)))
-float32x4_t vcmulq_rot90_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f16)))
-float16x8_t vcmulq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f16)))
-float16x8_t vcmulq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f32)))
-float32x4_t vcmulq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcmulq_x_f32)))
-float32x4_t vcmulq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_f16)))
-float16x8_t vcreateq_f16(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcreateq_f32)))
-float32x4_t vcreateq_f32(uint64_t, uint64_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s16_f16)))
-int16x8_t vcvtaq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s16_f16)))
-int16x8_t vcvtaq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s32_f32)))
-int32x4_t vcvtaq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_s32_f32)))
-int32x4_t vcvtaq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u16_f16)))
-uint16x8_t vcvtaq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u16_f16)))
-uint16x8_t vcvtaq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u32_f32)))
-uint32x4_t vcvtaq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_m_u32_f32)))
-uint32x4_t vcvtaq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_s16_f16)))
-int16x8_t vcvtaq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_s32_f32)))
-int32x4_t vcvtaq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_u16_f16)))
-uint16x8_t vcvtaq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_u32_f32)))
-uint32x4_t vcvtaq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_s16_f16)))
-int16x8_t vcvtaq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_s32_f32)))
-int32x4_t vcvtaq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_u16_f16)))
-uint16x8_t vcvtaq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtaq_x_u32_f32)))
-uint32x4_t vcvtaq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_f16_f32)))
-float16x8_t vcvtbq_f16_f32(float16x8_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_f32_f16)))
-float32x4_t vcvtbq_f32_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_m_f16_f32)))
-float16x8_t vcvtbq_m_f16_f32(float16x8_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_m_f32_f16)))
-float32x4_t vcvtbq_m_f32_f16(float32x4_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtbq_x_f32_f16)))
-float32x4_t vcvtbq_x_f32_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s16_f16)))
-int16x8_t vcvtmq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s16_f16)))
-int16x8_t vcvtmq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s32_f32)))
-int32x4_t vcvtmq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_s32_f32)))
-int32x4_t vcvtmq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u16_f16)))
-uint16x8_t vcvtmq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u16_f16)))
-uint16x8_t vcvtmq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u32_f32)))
-uint32x4_t vcvtmq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_m_u32_f32)))
-uint32x4_t vcvtmq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_s16_f16)))
-int16x8_t vcvtmq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_s32_f32)))
-int32x4_t vcvtmq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_u16_f16)))
-uint16x8_t vcvtmq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_u32_f32)))
-uint32x4_t vcvtmq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_s16_f16)))
-int16x8_t vcvtmq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_s32_f32)))
-int32x4_t vcvtmq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_u16_f16)))
-uint16x8_t vcvtmq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtmq_x_u32_f32)))
-uint32x4_t vcvtmq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s16_f16)))
-int16x8_t vcvtnq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s16_f16)))
-int16x8_t vcvtnq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s32_f32)))
-int32x4_t vcvtnq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_s32_f32)))
-int32x4_t vcvtnq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u16_f16)))
-uint16x8_t vcvtnq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u16_f16)))
-uint16x8_t vcvtnq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u32_f32)))
-uint32x4_t vcvtnq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_m_u32_f32)))
-uint32x4_t vcvtnq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_s16_f16)))
-int16x8_t vcvtnq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_s32_f32)))
-int32x4_t vcvtnq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_u16_f16)))
-uint16x8_t vcvtnq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_u32_f32)))
-uint32x4_t vcvtnq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_s16_f16)))
-int16x8_t vcvtnq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_s32_f32)))
-int32x4_t vcvtnq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_u16_f16)))
-uint16x8_t vcvtnq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtnq_x_u32_f32)))
-uint32x4_t vcvtnq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s16_f16)))
-int16x8_t vcvtpq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s16_f16)))
-int16x8_t vcvtpq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s32_f32)))
-int32x4_t vcvtpq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_s32_f32)))
-int32x4_t vcvtpq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u16_f16)))
-uint16x8_t vcvtpq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u16_f16)))
-uint16x8_t vcvtpq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u32_f32)))
-uint32x4_t vcvtpq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_m_u32_f32)))
-uint32x4_t vcvtpq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_s16_f16)))
-int16x8_t vcvtpq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_s32_f32)))
-int32x4_t vcvtpq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_u16_f16)))
-uint16x8_t vcvtpq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_u32_f32)))
-uint32x4_t vcvtpq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_s16_f16)))
-int16x8_t vcvtpq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_s32_f32)))
-int32x4_t vcvtpq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_u16_f16)))
-uint16x8_t vcvtpq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtpq_x_u32_f32)))
-uint32x4_t vcvtpq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_s16)))
-float16x8_t vcvtq_f16_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_s16)))
-float16x8_t vcvtq(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_u16)))
-float16x8_t vcvtq_f16_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f16_u16)))
-float16x8_t vcvtq(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_s32)))
-float32x4_t vcvtq_f32_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_s32)))
-float32x4_t vcvtq(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_u32)))
-float32x4_t vcvtq_f32_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_f32_u32)))
-float32x4_t vcvtq(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_s16)))
-float16x8_t vcvtq_m_f16_s16(float16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_s16)))
-float16x8_t vcvtq_m(float16x8_t, int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_u16)))
-float16x8_t vcvtq_m_f16_u16(float16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f16_u16)))
-float16x8_t vcvtq_m(float16x8_t, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_s32)))
-float32x4_t vcvtq_m_f32_s32(float32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_s32)))
-float32x4_t vcvtq_m(float32x4_t, int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_u32)))
-float32x4_t vcvtq_m_f32_u32(float32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_f32_u32)))
-float32x4_t vcvtq_m(float32x4_t, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_s16)))
-float16x8_t vcvtq_m_n_f16_s16(float16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_s16)))
-float16x8_t vcvtq_m_n(float16x8_t, int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_u16)))
-float16x8_t vcvtq_m_n_f16_u16(float16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f16_u16)))
-float16x8_t vcvtq_m_n(float16x8_t, uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_s32)))
-float32x4_t vcvtq_m_n_f32_s32(float32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_s32)))
-float32x4_t vcvtq_m_n(float32x4_t, int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_u32)))
-float32x4_t vcvtq_m_n_f32_u32(float32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_f32_u32)))
-float32x4_t vcvtq_m_n(float32x4_t, uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s16_f16)))
-int16x8_t vcvtq_m_n_s16_f16(int16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s16_f16)))
-int16x8_t vcvtq_m_n(int16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s32_f32)))
-int32x4_t vcvtq_m_n_s32_f32(int32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_s32_f32)))
-int32x4_t vcvtq_m_n(int32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u16_f16)))
-uint16x8_t vcvtq_m_n_u16_f16(uint16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u16_f16)))
-uint16x8_t vcvtq_m_n(uint16x8_t, float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u32_f32)))
-uint32x4_t vcvtq_m_n_u32_f32(uint32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_n_u32_f32)))
-uint32x4_t vcvtq_m_n(uint32x4_t, float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s16_f16)))
-int16x8_t vcvtq_m_s16_f16(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s16_f16)))
-int16x8_t vcvtq_m(int16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s32_f32)))
-int32x4_t vcvtq_m_s32_f32(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_s32_f32)))
-int32x4_t vcvtq_m(int32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u16_f16)))
-uint16x8_t vcvtq_m_u16_f16(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u16_f16)))
-uint16x8_t vcvtq_m(uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u32_f32)))
-uint32x4_t vcvtq_m_u32_f32(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_m_u32_f32)))
-uint32x4_t vcvtq_m(uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_s16)))
-float16x8_t vcvtq_n_f16_s16(int16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_s16)))
-float16x8_t vcvtq_n(int16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_u16)))
-float16x8_t vcvtq_n_f16_u16(uint16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f16_u16)))
-float16x8_t vcvtq_n(uint16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_s32)))
-float32x4_t vcvtq_n_f32_s32(int32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_s32)))
-float32x4_t vcvtq_n(int32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_u32)))
-float32x4_t vcvtq_n_f32_u32(uint32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_f32_u32)))
-float32x4_t vcvtq_n(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_s16_f16)))
-int16x8_t vcvtq_n_s16_f16(float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_s32_f32)))
-int32x4_t vcvtq_n_s32_f32(float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_u16_f16)))
-uint16x8_t vcvtq_n_u16_f16(float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_n_u32_f32)))
-uint32x4_t vcvtq_n_u32_f32(float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_s16_f16)))
-int16x8_t vcvtq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_s32_f32)))
-int32x4_t vcvtq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_u16_f16)))
-uint16x8_t vcvtq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_u32_f32)))
-uint32x4_t vcvtq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_s16)))
-float16x8_t vcvtq_x_f16_s16(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_s16)))
-float16x8_t vcvtq_x(int16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_u16)))
-float16x8_t vcvtq_x_f16_u16(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f16_u16)))
-float16x8_t vcvtq_x(uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_s32)))
-float32x4_t vcvtq_x_f32_s32(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_s32)))
-float32x4_t vcvtq_x(int32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_u32)))
-float32x4_t vcvtq_x_f32_u32(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_f32_u32)))
-float32x4_t vcvtq_x(uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_s16)))
-float16x8_t vcvtq_x_n_f16_s16(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_s16)))
-float16x8_t vcvtq_x_n(int16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_u16)))
-float16x8_t vcvtq_x_n_f16_u16(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f16_u16)))
-float16x8_t vcvtq_x_n(uint16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_s32)))
-float32x4_t vcvtq_x_n_f32_s32(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_s32)))
-float32x4_t vcvtq_x_n(int32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_u32)))
-float32x4_t vcvtq_x_n_f32_u32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_f32_u32)))
-float32x4_t vcvtq_x_n(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_s16_f16)))
-int16x8_t vcvtq_x_n_s16_f16(float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_s32_f32)))
-int32x4_t vcvtq_x_n_s32_f32(float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_u16_f16)))
-uint16x8_t vcvtq_x_n_u16_f16(float16x8_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_n_u32_f32)))
-uint32x4_t vcvtq_x_n_u32_f32(float32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_s16_f16)))
-int16x8_t vcvtq_x_s16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_s32_f32)))
-int32x4_t vcvtq_x_s32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_u16_f16)))
-uint16x8_t vcvtq_x_u16_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvtq_x_u32_f32)))
-uint32x4_t vcvtq_x_u32_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_f16_f32)))
-float16x8_t vcvttq_f16_f32(float16x8_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_f32_f16)))
-float32x4_t vcvttq_f32_f16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_m_f16_f32)))
-float16x8_t vcvttq_m_f16_f32(float16x8_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_m_f32_f16)))
-float32x4_t vcvttq_m_f32_f16(float32x4_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vcvttq_x_f32_f16)))
-float32x4_t vcvttq_x_f32_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f16)))
-float16x8_t vdupq_m_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f16)))
-float16x8_t vdupq_m(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f32)))
-float32x4_t vdupq_m_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vdupq_m_n_f32)))
-float32x4_t vdupq_m(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_f16)))
-float16x8_t vdupq_n_f16(float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_n_f32)))
-float32x4_t vdupq_n_f32(float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_f16)))
-float16x8_t vdupq_x_n_f16(float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vdupq_x_n_f32)))
-float32x4_t vdupq_x_n_f32(float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_f16)))
-float16x8_t veorq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_f16)))
-float16x8_t veorq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_f32)))
-float32x4_t veorq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_f32)))
-float32x4_t veorq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f16)))
-float16x8_t veorq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f16)))
-float16x8_t veorq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f32)))
-float32x4_t veorq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_m_f32)))
-float32x4_t veorq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f16)))
-float16x8_t veorq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f16)))
-float16x8_t veorq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f32)))
-float32x4_t veorq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_veorq_x_f32)))
-float32x4_t veorq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f16)))
-float16x8_t vfmaq_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f16)))
-float16x8_t vfmaq(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f32)))
-float32x4_t vfmaq_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_f32)))
-float32x4_t vfmaq(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f16)))
-float16x8_t vfmaq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f16)))
-float16x8_t vfmaq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f32)))
-float32x4_t vfmaq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_f32)))
-float32x4_t vfmaq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f16)))
-float16x8_t vfmaq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f16)))
-float16x8_t vfmaq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f32)))
-float32x4_t vfmaq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_m_n_f32)))
-float32x4_t vfmaq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f16)))
-float16x8_t vfmaq_n_f16(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f16)))
-float16x8_t vfmaq(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f32)))
-float32x4_t vfmaq_n_f32(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmaq_n_f32)))
-float32x4_t vfmaq(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f16)))
-float16x8_t vfmasq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f16)))
-float16x8_t vfmasq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f32)))
-float32x4_t vfmasq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_m_n_f32)))
-float32x4_t vfmasq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f16)))
-float16x8_t vfmasq_n_f16(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f16)))
-float16x8_t vfmasq(float16x8_t, float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f32)))
-float32x4_t vfmasq_n_f32(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmasq_n_f32)))
-float32x4_t vfmasq(float32x4_t, float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f16)))
-float16x8_t vfmsq_f16(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f16)))
-float16x8_t vfmsq(float16x8_t, float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f32)))
-float32x4_t vfmsq_f32(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_f32)))
-float32x4_t vfmsq(float32x4_t, float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f16)))
-float16x8_t vfmsq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f16)))
-float16x8_t vfmsq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f32)))
-float32x4_t vfmsq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vfmsq_m_f32)))
-float32x4_t vfmsq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f16)))
-float16_t vgetq_lane_f16(float16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f16)))
-float16_t vgetq_lane(float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f32)))
-float32_t vgetq_lane_f32(float32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vgetq_lane_f32)))
-float32_t vgetq_lane(float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f16)))
-float16x8_t vld1q_f16(const float16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f16)))
-float16x8_t vld1q(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f32)))
-float32x4_t vld1q_f32(const float32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_f32)))
-float32x4_t vld1q(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f16)))
-float16x8_t vld1q_z_f16(const float16_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f16)))
-float16x8_t vld1q_z(const float16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f32)))
-float32x4_t vld1q_z_f32(const float32_t *, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld1q_z_f32)))
-float32x4_t vld1q_z(const float32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f16)))
-float16x8x2_t vld2q_f16(const float16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f16)))
-float16x8x2_t vld2q(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f32)))
-float32x4x2_t vld2q_f32(const float32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld2q_f32)))
-float32x4x2_t vld2q(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f16)))
-float16x8x4_t vld4q_f16(const float16_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f16)))
-float16x8x4_t vld4q(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f32)))
-float32x4x4_t vld4q_f32(const float32_t *);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vld4q_f32)))
-float32x4x4_t vld4q(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_f16)))
-float16x8_t vldrhq_f16(const float16_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_f16)))
-float16x8_t vldrhq_gather_offset_f16(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_f16)))
-float16x8_t vldrhq_gather_offset(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_f16)))
-float16x8_t vldrhq_gather_offset_z_f16(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_offset_z_f16)))
-float16x8_t vldrhq_gather_offset_z(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_f16)))
-float16x8_t vldrhq_gather_shifted_offset_f16(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_f16)))
-float16x8_t vldrhq_gather_shifted_offset(const float16_t *, uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_f16)))
-float16x8_t vldrhq_gather_shifted_offset_z_f16(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_gather_shifted_offset_z_f16)))
-float16x8_t vldrhq_gather_shifted_offset_z(const float16_t *, uint16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrhq_z_f16)))
-float16x8_t vldrhq_z_f16(const float16_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_f32)))
-float32x4_t vldrwq_f32(const float32_t *);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_f32)))
-float32x4_t vldrwq_gather_base_f32(uint32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_f32)))
-float32x4_t vldrwq_gather_base_wb_f32(uint32x4_t *, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_wb_z_f32)))
-float32x4_t vldrwq_gather_base_wb_z_f32(uint32x4_t *, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_base_z_f32)))
-float32x4_t vldrwq_gather_base_z_f32(uint32x4_t, int, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_f32)))
-float32x4_t vldrwq_gather_offset_f32(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_f32)))
-float32x4_t vldrwq_gather_offset(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_f32)))
-float32x4_t vldrwq_gather_offset_z_f32(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_offset_z_f32)))
-float32x4_t vldrwq_gather_offset_z(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_f32)))
-float32x4_t vldrwq_gather_shifted_offset_f32(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_f32)))
-float32x4_t vldrwq_gather_shifted_offset(const float32_t *, uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_f32)))
-float32x4_t vldrwq_gather_shifted_offset_z_f32(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_gather_shifted_offset_z_f32)))
-float32x4_t vldrwq_gather_shifted_offset_z(const float32_t *, uint32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vldrwq_z_f32)))
-float32x4_t vldrwq_z_f32(const float32_t *, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f16)))
-float16x8_t vmaxnmaq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f16)))
-float16x8_t vmaxnmaq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f32)))
-float32x4_t vmaxnmaq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_f32)))
-float32x4_t vmaxnmaq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f16)))
-float16x8_t vmaxnmaq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f16)))
-float16x8_t vmaxnmaq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f32)))
-float32x4_t vmaxnmaq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmaq_m_f32)))
-float32x4_t vmaxnmaq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f16)))
-float16_t vmaxnmavq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f16)))
-float16_t vmaxnmavq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f32)))
-float32_t vmaxnmavq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_f32)))
-float32_t vmaxnmavq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f16)))
-float16_t vmaxnmavq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f16)))
-float16_t vmaxnmavq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f32)))
-float32_t vmaxnmavq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmavq_p_f32)))
-float32_t vmaxnmavq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f16)))
-float16x8_t vmaxnmq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f16)))
-float16x8_t vmaxnmq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f32)))
-float32x4_t vmaxnmq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_f32)))
-float32x4_t vmaxnmq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f16)))
-float16x8_t vmaxnmq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f16)))
-float16x8_t vmaxnmq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f32)))
-float32x4_t vmaxnmq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_m_f32)))
-float32x4_t vmaxnmq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f16)))
-float16x8_t vmaxnmq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f16)))
-float16x8_t vmaxnmq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f32)))
-float32x4_t vmaxnmq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmq_x_f32)))
-float32x4_t vmaxnmq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f16)))
-float16_t vmaxnmvq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f16)))
-float16_t vmaxnmvq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f32)))
-float32_t vmaxnmvq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_f32)))
-float32_t vmaxnmvq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f16)))
-float16_t vmaxnmvq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f16)))
-float16_t vmaxnmvq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f32)))
-float32_t vmaxnmvq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmaxnmvq_p_f32)))
-float32_t vmaxnmvq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f16)))
-float16x8_t vminnmaq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f16)))
-float16x8_t vminnmaq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f32)))
-float32x4_t vminnmaq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_f32)))
-float32x4_t vminnmaq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f16)))
-float16x8_t vminnmaq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f16)))
-float16x8_t vminnmaq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f32)))
-float32x4_t vminnmaq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmaq_m_f32)))
-float32x4_t vminnmaq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f16)))
-float16_t vminnmavq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f16)))
-float16_t vminnmavq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f32)))
-float32_t vminnmavq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_f32)))
-float32_t vminnmavq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f16)))
-float16_t vminnmavq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f16)))
-float16_t vminnmavq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f32)))
-float32_t vminnmavq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmavq_p_f32)))
-float32_t vminnmavq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f16)))
-float16x8_t vminnmq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f16)))
-float16x8_t vminnmq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f32)))
-float32x4_t vminnmq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_f32)))
-float32x4_t vminnmq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f16)))
-float16x8_t vminnmq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f16)))
-float16x8_t vminnmq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f32)))
-float32x4_t vminnmq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_m_f32)))
-float32x4_t vminnmq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f16)))
-float16x8_t vminnmq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f16)))
-float16x8_t vminnmq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f32)))
-float32x4_t vminnmq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmq_x_f32)))
-float32x4_t vminnmq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f16)))
-float16_t vminnmvq_f16(float16_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f16)))
-float16_t vminnmvq(float16_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f32)))
-float32_t vminnmvq_f32(float32_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_f32)))
-float32_t vminnmvq(float32_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f16)))
-float16_t vminnmvq_p_f16(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f16)))
-float16_t vminnmvq_p(float16_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f32)))
-float32_t vminnmvq_p_f32(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vminnmvq_p_f32)))
-float32_t vminnmvq_p(float32_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f16)))
-float16x8_t vmulq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f16)))
-float16x8_t vmulq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f32)))
-float32x4_t vmulq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_f32)))
-float32x4_t vmulq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f16)))
-float16x8_t vmulq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f16)))
-float16x8_t vmulq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f32)))
-float32x4_t vmulq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_f32)))
-float32x4_t vmulq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f16)))
-float16x8_t vmulq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f16)))
-float16x8_t vmulq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f32)))
-float32x4_t vmulq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_m_n_f32)))
-float32x4_t vmulq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f16)))
-float16x8_t vmulq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f16)))
-float16x8_t vmulq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f32)))
-float32x4_t vmulq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_n_f32)))
-float32x4_t vmulq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f16)))
-float16x8_t vmulq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f16)))
-float16x8_t vmulq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f32)))
-float32x4_t vmulq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_f32)))
-float32x4_t vmulq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f16)))
-float16x8_t vmulq_x_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f16)))
-float16x8_t vmulq_x(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f32)))
-float32x4_t vmulq_x_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vmulq_x_n_f32)))
-float32x4_t vmulq_x(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f16)))
-float16x8_t vnegq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f16)))
-float16x8_t vnegq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f32)))
-float32x4_t vnegq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_f32)))
-float32x4_t vnegq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f16)))
-float16x8_t vnegq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f16)))
-float16x8_t vnegq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f32)))
-float32x4_t vnegq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_m_f32)))
-float32x4_t vnegq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f16)))
-float16x8_t vnegq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f16)))
-float16x8_t vnegq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f32)))
-float32x4_t vnegq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vnegq_x_f32)))
-float32x4_t vnegq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_f16)))
-float16x8_t vornq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_f16)))
-float16x8_t vornq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_f32)))
-float32x4_t vornq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_f32)))
-float32x4_t vornq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f16)))
-float16x8_t vornq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f16)))
-float16x8_t vornq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f32)))
-float32x4_t vornq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_m_f32)))
-float32x4_t vornq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f16)))
-float16x8_t vornq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f16)))
-float16x8_t vornq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f32)))
-float32x4_t vornq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vornq_x_f32)))
-float32x4_t vornq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f16)))
-float16x8_t vorrq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f16)))
-float16x8_t vorrq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f32)))
-float32x4_t vorrq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_f32)))
-float32x4_t vorrq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f16)))
-float16x8_t vorrq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f16)))
-float16x8_t vorrq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f32)))
-float32x4_t vorrq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_m_f32)))
-float32x4_t vorrq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f16)))
-float16x8_t vorrq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f16)))
-float16x8_t vorrq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f32)))
-float32x4_t vorrq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vorrq_x_f32)))
-float32x4_t vorrq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f16)))
-float16x8_t vpselq_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f16)))
-float16x8_t vpselq(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f32)))
-float32x4_t vpselq_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vpselq_f32)))
-float32x4_t vpselq(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_f32)))
-float16x8_t vreinterpretq_f16_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_f32)))
-float16x8_t vreinterpretq_f16(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s16)))
-float16x8_t vreinterpretq_f16_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s16)))
-float16x8_t vreinterpretq_f16(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s32)))
-float16x8_t vreinterpretq_f16_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s32)))
-float16x8_t vreinterpretq_f16(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s64)))
-float16x8_t vreinterpretq_f16_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s64)))
-float16x8_t vreinterpretq_f16(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s8)))
-float16x8_t vreinterpretq_f16_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_s8)))
-float16x8_t vreinterpretq_f16(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u16)))
-float16x8_t vreinterpretq_f16_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u16)))
-float16x8_t vreinterpretq_f16(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u32)))
-float16x8_t vreinterpretq_f16_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u32)))
-float16x8_t vreinterpretq_f16(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u64)))
-float16x8_t vreinterpretq_f16_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u64)))
-float16x8_t vreinterpretq_f16(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
-float16x8_t vreinterpretq_f16_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
-float16x8_t vreinterpretq_f16(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_f16)))
-float32x4_t vreinterpretq_f32_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_f16)))
-float32x4_t vreinterpretq_f32(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s16)))
-float32x4_t vreinterpretq_f32_s16(int16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s16)))
-float32x4_t vreinterpretq_f32(int16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s32)))
-float32x4_t vreinterpretq_f32_s32(int32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s32)))
-float32x4_t vreinterpretq_f32(int32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s64)))
-float32x4_t vreinterpretq_f32_s64(int64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s64)))
-float32x4_t vreinterpretq_f32(int64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s8)))
-float32x4_t vreinterpretq_f32_s8(int8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_s8)))
-float32x4_t vreinterpretq_f32(int8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u16)))
-float32x4_t vreinterpretq_f32_u16(uint16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u16)))
-float32x4_t vreinterpretq_f32(uint16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u32)))
-float32x4_t vreinterpretq_f32_u32(uint32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u32)))
-float32x4_t vreinterpretq_f32(uint32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u64)))
-float32x4_t vreinterpretq_f32_u64(uint64x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u64)))
-float32x4_t vreinterpretq_f32(uint64x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
-float32x4_t vreinterpretq_f32_u8(uint8x16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
-float32x4_t vreinterpretq_f32(uint8x16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f16)))
-int16x8_t vreinterpretq_s16_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f16)))
-int16x8_t vreinterpretq_s16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f32)))
-int16x8_t vreinterpretq_s16_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_f32)))
-int16x8_t vreinterpretq_s16(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f16)))
-int32x4_t vreinterpretq_s32_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f16)))
-int32x4_t vreinterpretq_s32(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f32)))
-int32x4_t vreinterpretq_s32_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_f32)))
-int32x4_t vreinterpretq_s32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f16)))
-int64x2_t vreinterpretq_s64_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f16)))
-int64x2_t vreinterpretq_s64(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f32)))
-int64x2_t vreinterpretq_s64_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_f32)))
-int64x2_t vreinterpretq_s64(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f16)))
-int8x16_t vreinterpretq_s8_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f16)))
-int8x16_t vreinterpretq_s8(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f32)))
-int8x16_t vreinterpretq_s8_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_f32)))
-int8x16_t vreinterpretq_s8(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f16)))
-uint16x8_t vreinterpretq_u16_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f16)))
-uint16x8_t vreinterpretq_u16(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f32)))
-uint16x8_t vreinterpretq_u16_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_f32)))
-uint16x8_t vreinterpretq_u16(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f16)))
-uint32x4_t vreinterpretq_u32_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f16)))
-uint32x4_t vreinterpretq_u32(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f32)))
-uint32x4_t vreinterpretq_u32_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_f32)))
-uint32x4_t vreinterpretq_u32(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f16)))
-uint64x2_t vreinterpretq_u64_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f16)))
-uint64x2_t vreinterpretq_u64(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f32)))
-uint64x2_t vreinterpretq_u64_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_f32)))
-uint64x2_t vreinterpretq_u64(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
-uint8x16_t vreinterpretq_u8_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
-uint8x16_t vreinterpretq_u8(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
-uint8x16_t vreinterpretq_u8_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
-uint8x16_t vreinterpretq_u8(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_f16)))
-float16x8_t vrev32q_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_f16)))
-float16x8_t vrev32q(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_f16)))
-float16x8_t vrev32q_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_m_f16)))
-float16x8_t vrev32q_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_f16)))
-float16x8_t vrev32q_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev32q_x_f16)))
-float16x8_t vrev32q_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f16)))
-float16x8_t vrev64q_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f16)))
-float16x8_t vrev64q(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f32)))
-float32x4_t vrev64q_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_f32)))
-float32x4_t vrev64q(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f16)))
-float16x8_t vrev64q_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f16)))
-float16x8_t vrev64q_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f32)))
-float32x4_t vrev64q_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_m_f32)))
-float32x4_t vrev64q_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f16)))
-float16x8_t vrev64q_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f16)))
-float16x8_t vrev64q_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f32)))
-float32x4_t vrev64q_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrev64q_x_f32)))
-float32x4_t vrev64q_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f16)))
-float16x8_t vrndaq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f16)))
-float16x8_t vrndaq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f32)))
-float32x4_t vrndaq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_f32)))
-float32x4_t vrndaq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f16)))
-float16x8_t vrndaq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f16)))
-float16x8_t vrndaq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f32)))
-float32x4_t vrndaq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_m_f32)))
-float32x4_t vrndaq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f16)))
-float16x8_t vrndaq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f16)))
-float16x8_t vrndaq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f32)))
-float32x4_t vrndaq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndaq_x_f32)))
-float32x4_t vrndaq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f16)))
-float16x8_t vrndmq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f16)))
-float16x8_t vrndmq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f32)))
-float32x4_t vrndmq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_f32)))
-float32x4_t vrndmq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f16)))
-float16x8_t vrndmq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f16)))
-float16x8_t vrndmq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f32)))
-float32x4_t vrndmq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_m_f32)))
-float32x4_t vrndmq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f16)))
-float16x8_t vrndmq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f16)))
-float16x8_t vrndmq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f32)))
-float32x4_t vrndmq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndmq_x_f32)))
-float32x4_t vrndmq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f16)))
-float16x8_t vrndnq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f16)))
-float16x8_t vrndnq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f32)))
-float32x4_t vrndnq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_f32)))
-float32x4_t vrndnq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f16)))
-float16x8_t vrndnq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f16)))
-float16x8_t vrndnq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f32)))
-float32x4_t vrndnq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_m_f32)))
-float32x4_t vrndnq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f16)))
-float16x8_t vrndnq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f16)))
-float16x8_t vrndnq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f32)))
-float32x4_t vrndnq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndnq_x_f32)))
-float32x4_t vrndnq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f16)))
-float16x8_t vrndpq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f16)))
-float16x8_t vrndpq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f32)))
-float32x4_t vrndpq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_f32)))
-float32x4_t vrndpq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f16)))
-float16x8_t vrndpq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f16)))
-float16x8_t vrndpq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f32)))
-float32x4_t vrndpq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_m_f32)))
-float32x4_t vrndpq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f16)))
-float16x8_t vrndpq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f16)))
-float16x8_t vrndpq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f32)))
-float32x4_t vrndpq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndpq_x_f32)))
-float32x4_t vrndpq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f16)))
-float16x8_t vrndq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f16)))
-float16x8_t vrndq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f32)))
-float32x4_t vrndq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_f32)))
-float32x4_t vrndq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f16)))
-float16x8_t vrndq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f16)))
-float16x8_t vrndq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f32)))
-float32x4_t vrndq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_m_f32)))
-float32x4_t vrndq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f16)))
-float16x8_t vrndq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f16)))
-float16x8_t vrndq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f32)))
-float32x4_t vrndq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndq_x_f32)))
-float32x4_t vrndq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f16)))
-float16x8_t vrndxq_f16(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f16)))
-float16x8_t vrndxq(float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f32)))
-float32x4_t vrndxq_f32(float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_f32)))
-float32x4_t vrndxq(float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f16)))
-float16x8_t vrndxq_m_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f16)))
-float16x8_t vrndxq_m(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f32)))
-float32x4_t vrndxq_m_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_m_f32)))
-float32x4_t vrndxq_m(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f16)))
-float16x8_t vrndxq_x_f16(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f16)))
-float16x8_t vrndxq_x(float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f32)))
-float32x4_t vrndxq_x_f32(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vrndxq_x_f32)))
-float32x4_t vrndxq_x(float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f16)))
-float16x8_t vsetq_lane_f16(float16_t, float16x8_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f16)))
-float16x8_t vsetq_lane(float16_t, float16x8_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f32)))
-float32x4_t vsetq_lane_f32(float32_t, float32x4_t, int);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsetq_lane_f32)))
-float32x4_t vsetq_lane(float32_t, float32x4_t, int);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f16)))
-void vst1q_f16(float16_t *, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f16)))
-void vst1q(float16_t *, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f32)))
-void vst1q_f32(float32_t *, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_f32)))
-void vst1q(float32_t *, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f16)))
-void vst1q_p_f16(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f16)))
-void vst1q_p(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f32)))
-void vst1q_p_f32(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst1q_p_f32)))
-void vst1q_p(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f16)))
-void vst2q_f16(float16_t *, float16x8x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f16)))
-void vst2q(float16_t *, float16x8x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f32)))
-void vst2q_f32(float32_t *, float32x4x2_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst2q_f32)))
-void vst2q(float32_t *, float32x4x2_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f16)))
-void vst4q_f16(float16_t *, float16x8x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f16)))
-void vst4q(float16_t *, float16x8x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f32)))
-void vst4q_f32(float32_t *, float32x4x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vst4q_f32)))
-void vst4q(float32_t *, float32x4x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_f16)))
-void vstrhq_f16(float16_t *, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_f16)))
-void vstrhq(float16_t *, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_f16)))
-void vstrhq_p_f16(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_p_f16)))
-void vstrhq_p(float16_t *, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_f16)))
-void vstrhq_scatter_offset_f16(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_f16)))
-void vstrhq_scatter_offset(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_f16)))
-void vstrhq_scatter_offset_p_f16(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_offset_p_f16)))
-void vstrhq_scatter_offset_p(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_f16)))
-void vstrhq_scatter_shifted_offset_f16(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_f16)))
-void vstrhq_scatter_shifted_offset(float16_t *, uint16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_f16)))
-void vstrhq_scatter_shifted_offset_p_f16(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrhq_scatter_shifted_offset_p_f16)))
-void vstrhq_scatter_shifted_offset_p(float16_t *, uint16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_f32)))
-void vstrwq_f32(float32_t *, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_f32)))
-void vstrwq(float32_t *, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_f32)))
-void vstrwq_p_f32(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_p_f32)))
-void vstrwq_p(float32_t *, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_f32)))
-void vstrwq_scatter_base_f32(uint32x4_t, int, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_f32)))
-void vstrwq_scatter_base(uint32x4_t, int, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_f32)))
-void vstrwq_scatter_base_p_f32(uint32x4_t, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_p_f32)))
-void vstrwq_scatter_base_p(uint32x4_t, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_f32)))
-void vstrwq_scatter_base_wb_f32(uint32x4_t *, int, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_f32)))
-void vstrwq_scatter_base_wb(uint32x4_t *, int, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_f32)))
-void vstrwq_scatter_base_wb_p_f32(uint32x4_t *, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_base_wb_p_f32)))
-void vstrwq_scatter_base_wb_p(uint32x4_t *, int, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_f32)))
-void vstrwq_scatter_offset_f32(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_f32)))
-void vstrwq_scatter_offset(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_f32)))
-void vstrwq_scatter_offset_p_f32(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_offset_p_f32)))
-void vstrwq_scatter_offset_p(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_f32)))
-void vstrwq_scatter_shifted_offset_f32(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_f32)))
-void vstrwq_scatter_shifted_offset(float32_t *, uint32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_f32)))
-void vstrwq_scatter_shifted_offset_p_f32(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vstrwq_scatter_shifted_offset_p_f32)))
-void vstrwq_scatter_shifted_offset_p(float32_t *, uint32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f16)))
-float16x8_t vsubq_f16(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f16)))
-float16x8_t vsubq(float16x8_t, float16x8_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f32)))
-float32x4_t vsubq_f32(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_f32)))
-float32x4_t vsubq(float32x4_t, float32x4_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f16)))
-float16x8_t vsubq_m_f16(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f16)))
-float16x8_t vsubq_m(float16x8_t, float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f32)))
-float32x4_t vsubq_m_f32(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_f32)))
-float32x4_t vsubq_m(float32x4_t, float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f16)))
-float16x8_t vsubq_m_n_f16(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f16)))
-float16x8_t vsubq_m(float16x8_t, float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f32)))
-float32x4_t vsubq_m_n_f32(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_m_n_f32)))
-float32x4_t vsubq_m(float32x4_t, float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f16)))
-float16x8_t vsubq_n_f16(float16x8_t, float16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f16)))
-float16x8_t vsubq(float16x8_t, float16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f32)))
-float32x4_t vsubq_n_f32(float32x4_t, float32_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_n_f32)))
-float32x4_t vsubq(float32x4_t, float32_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f16)))
-float16x8_t vsubq_x_f16(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f16)))
-float16x8_t vsubq_x(float16x8_t, float16x8_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f32)))
-float32x4_t vsubq_x_f32(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_f32)))
-float32x4_t vsubq_x(float32x4_t, float32x4_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f16)))
-float16x8_t vsubq_x_n_f16(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f16)))
-float16x8_t vsubq_x(float16x8_t, float16_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f32)))
-float32x4_t vsubq_x_n_f32(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vsubq_x_n_f32)))
-float32x4_t vsubq_x(float32x4_t, float32_t, mve_pred16_t);
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_f16)))
-float16x8_t vuninitializedq_f16();
-static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_f32)))
-float32x4_t vuninitializedq_f32();
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_f16)))
-float16x8_t vuninitializedq(float16x8_t);
-static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vuninitializedq_polymorphic_f32)))
-float32x4_t vuninitializedq(float32x4_t);
-
-#endif /* (__ARM_FEATURE_MVE & 2) && (!defined __ARM_MVE_PRESERVE_USER_NAMESPACE) */
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* __ARM_MVE_H */
diff --git a/third_party/aarch64/clang/arm_neon.h b/third_party/aarch64/clang/arm_neon.h
deleted file mode 100644
index b67616134..000000000
--- a/third_party/aarch64/clang/arm_neon.h
+++ /dev/null
@@ -1,69638 +0,0 @@
-/*===---- arm_neon.h - ARM Neon intrinsics ---------------------------------===
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_NEON_H
-#define __ARM_NEON_H
-
-#ifndef __ARM_FP
-#error "NEON intrinsics not available with the soft-float ABI. Please use -mfloat-abi=softfp or -mfloat-abi=hard"
-#else
-
-#include <stdint.h>
-
-#include <arm_bf16.h>
-#include <arm_vector_types.h>
-#if defined(__aarch64__) || defined(__arm64ec__)
-typedef uint8_t poly8_t;
-typedef uint16_t poly16_t;
-typedef uint64_t poly64_t;
-typedef __uint128_t poly128_t;
-#else
-typedef int8_t poly8_t;
-typedef int16_t poly16_t;
-typedef int64_t poly64_t;
-#endif
-typedef __attribute__((neon_polyvector_type(8))) poly8_t poly8x8_t;
-typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
-typedef __attribute__((neon_polyvector_type(4))) poly16_t poly16x4_t;
-typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
-typedef __attribute__((neon_polyvector_type(1))) poly64_t poly64x1_t;
-typedef __attribute__((neon_polyvector_type(2))) poly64_t poly64x2_t;
-
-typedef struct poly8x8x2_t {
-  poly8x8_t val[2];
-} poly8x8x2_t;
-
-typedef struct poly8x16x2_t {
-  poly8x16_t val[2];
-} poly8x16x2_t;
-
-typedef struct poly16x4x2_t {
-  poly16x4_t val[2];
-} poly16x4x2_t;
-
-typedef struct poly16x8x2_t {
-  poly16x8_t val[2];
-} poly16x8x2_t;
-
-typedef struct poly64x1x2_t {
-  poly64x1_t val[2];
-} poly64x1x2_t;
-
-typedef struct poly64x2x2_t {
-  poly64x2_t val[2];
-} poly64x2x2_t;
-
-typedef struct poly8x8x3_t {
-  poly8x8_t val[3];
-} poly8x8x3_t;
-
-typedef struct poly8x16x3_t {
-  poly8x16_t val[3];
-} poly8x16x3_t;
-
-typedef struct poly16x4x3_t {
-  poly16x4_t val[3];
-} poly16x4x3_t;
-
-typedef struct poly16x8x3_t {
-  poly16x8_t val[3];
-} poly16x8x3_t;
-
-typedef struct poly64x1x3_t {
-  poly64x1_t val[3];
-} poly64x1x3_t;
-
-typedef struct poly64x2x3_t {
-  poly64x2_t val[3];
-} poly64x2x3_t;
-
-typedef struct poly8x8x4_t {
-  poly8x8_t val[4];
-} poly8x8x4_t;
-
-typedef struct poly8x16x4_t {
-  poly8x16_t val[4];
-} poly8x16x4_t;
-
-typedef struct poly16x4x4_t {
-  poly16x4_t val[4];
-} poly16x4x4_t;
-
-typedef struct poly16x8x4_t {
-  poly16x8_t val[4];
-} poly16x8x4_t;
-
-typedef struct poly64x1x4_t {
-  poly64x1_t val[4];
-} poly64x1x4_t;
-
-typedef struct poly64x2x4_t {
-  poly64x2_t val[4];
-} poly64x2x4_t;
-
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  __ret = (bfloat16x8_t) __builtin_neon_splatq_lane_bf16((int8x8_t)__s0, __p1, 11); \
-  __ret; \
-})
-#else
-#define splatq_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  bfloat16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (bfloat16x8_t) __builtin_neon_splatq_lane_bf16((int8x8_t)__rev0, __p1, 11); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  __ret = (bfloat16x8_t) __builtin_neon_splatq_lane_bf16((int8x8_t)__s0, __p1, 11); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  __ret = (bfloat16x4_t) __builtin_neon_splat_lane_bf16((int8x8_t)__s0, __p1, 11); \
-  __ret; \
-})
-#else
-#define splat_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  bfloat16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (bfloat16x4_t) __builtin_neon_splat_lane_bf16((int8x8_t)__rev0, __p1, 11); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  __ret = (bfloat16x4_t) __builtin_neon_splat_lane_bf16((int8x8_t)__s0, __p1, 11); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  __ret = (bfloat16x8_t) __builtin_neon_splatq_laneq_bf16((int8x16_t)__s0, __p1, 43); \
-  __ret; \
-})
-#else
-#define splatq_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (bfloat16x8_t) __builtin_neon_splatq_laneq_bf16((int8x16_t)__rev0, __p1, 43); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  __ret = (bfloat16x8_t) __builtin_neon_splatq_laneq_bf16((int8x16_t)__s0, __p1, 43); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  __ret = (bfloat16x4_t) __builtin_neon_splat_laneq_bf16((int8x16_t)__s0, __p1, 43); \
-  __ret; \
-})
-#else
-#define splat_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (bfloat16x4_t) __builtin_neon_splat_laneq_bf16((int8x16_t)__rev0, __p1, 43); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  __ret = (bfloat16x4_t) __builtin_neon_splat_laneq_bf16((int8x16_t)__s0, __p1, 43); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfdotq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbfdotq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfdotq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  bfloat16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vbfdotq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x4_t __noswap_vbfdotq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbfdotq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x2_t vbfdot_f32(float32x2_t __p0, bfloat16x4_t __p1, bfloat16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vbfdot_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x2_t vbfdot_f32(float32x2_t __p0, bfloat16x4_t __p1, bfloat16x4_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  bfloat16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  bfloat16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vbfdot_f32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x2_t __noswap_vbfdot_f32(float32x2_t __p0, bfloat16x4_t __p1, bfloat16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vbfdot_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfmlalbq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbfmlalbq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfmlalbq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  bfloat16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vbfmlalbq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x4_t __noswap_vbfmlalbq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbfmlalbq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfmlaltq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbfmlaltq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfmlaltq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  bfloat16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vbfmlaltq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x4_t __noswap_vbfmlaltq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbfmlaltq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfmmlaq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbfmmlaq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x4_t vbfmmlaq_f32(float32x4_t __p0, bfloat16x8_t __p1, bfloat16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  bfloat16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vbfmmlaq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcombine_bf16(bfloat16x4_t __p0, bfloat16x4_t __p1) {
-  bfloat16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcombine_bf16(bfloat16x4_t __p0, bfloat16x4_t __p1) {
-  bfloat16x8_t __ret;
-  bfloat16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  bfloat16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t __noswap_vcombine_bf16(bfloat16x4_t __p0, bfloat16x4_t __p1) {
-  bfloat16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#define vcreate_bf16(__p0) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (bfloat16x4_t)(__promote); \
-  __ret; \
-})
-__ai __attribute__((target("bf16,neon"))) float32_t vcvtah_f32_bf16(bfloat16_t __p0) {
-  float32_t __ret;
-bfloat16_t __reint = __p0;
-int32_t __reint1 = (int32_t)(*(int16_t *) &__reint) << 16;
-  __ret = *(float32_t *) &__reint1;
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16_t vcvth_bf16_f32(float32_t __p0) {
-  bfloat16_t __ret;
-  __ret = (bfloat16_t) __builtin_neon_vcvth_bf16_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vduph_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  __ret = (bfloat16_t) __builtin_neon_vduph_lane_bf16((bfloat16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  bfloat16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (bfloat16_t) __builtin_neon_vduph_lane_bf16((bfloat16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_bf16(__p0_0, __p1_0) __extension__ ({ \
-  bfloat16x8_t __ret_0; \
-  bfloat16x4_t __s0_0 = __p0_0; \
-  __ret_0 = splatq_lane_bf16(__s0_0, __p1_0); \
-  __ret_0; \
-})
-#else
-#define vdupq_lane_bf16(__p0_1, __p1_1) __extension__ ({ \
-  bfloat16x8_t __ret_1; \
-  bfloat16x4_t __s0_1 = __p0_1; \
-  bfloat16x4_t __rev0_1;  __rev0_1 = __builtin_shufflevector(__s0_1, __s0_1, 3, 2, 1, 0); \
-  __ret_1 = __noswap_splatq_lane_bf16(__rev0_1, __p1_1); \
-  __ret_1 = __builtin_shufflevector(__ret_1, __ret_1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_1; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_bf16(__p0_2, __p1_2) __extension__ ({ \
-  bfloat16x4_t __ret_2; \
-  bfloat16x4_t __s0_2 = __p0_2; \
-  __ret_2 = splat_lane_bf16(__s0_2, __p1_2); \
-  __ret_2; \
-})
-#else
-#define vdup_lane_bf16(__p0_3, __p1_3) __extension__ ({ \
-  bfloat16x4_t __ret_3; \
-  bfloat16x4_t __s0_3 = __p0_3; \
-  bfloat16x4_t __rev0_3;  __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 3, 2, 1, 0); \
-  __ret_3 = __noswap_splat_lane_bf16(__rev0_3, __p1_3); \
-  __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 3, 2, 1, 0); \
-  __ret_3; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vduph_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  __ret = (bfloat16_t) __builtin_neon_vduph_laneq_bf16((bfloat16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_laneq_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (bfloat16_t) __builtin_neon_vduph_laneq_bf16((bfloat16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_bf16(__p0_4, __p1_4) __extension__ ({ \
-  bfloat16x8_t __ret_4; \
-  bfloat16x8_t __s0_4 = __p0_4; \
-  __ret_4 = splatq_laneq_bf16(__s0_4, __p1_4); \
-  __ret_4; \
-})
-#else
-#define vdupq_laneq_bf16(__p0_5, __p1_5) __extension__ ({ \
-  bfloat16x8_t __ret_5; \
-  bfloat16x8_t __s0_5 = __p0_5; \
-  bfloat16x8_t __rev0_5;  __rev0_5 = __builtin_shufflevector(__s0_5, __s0_5, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_5 = __noswap_splatq_laneq_bf16(__rev0_5, __p1_5); \
-  __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_5; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_bf16(__p0_6, __p1_6) __extension__ ({ \
-  bfloat16x4_t __ret_6; \
-  bfloat16x8_t __s0_6 = __p0_6; \
-  __ret_6 = splat_laneq_bf16(__s0_6, __p1_6); \
-  __ret_6; \
-})
-#else
-#define vdup_laneq_bf16(__p0_7, __p1_7) __extension__ ({ \
-  bfloat16x4_t __ret_7; \
-  bfloat16x8_t __s0_7 = __p0_7; \
-  bfloat16x8_t __rev0_7;  __rev0_7 = __builtin_shufflevector(__s0_7, __s0_7, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_7 = __noswap_splat_laneq_bf16(__rev0_7, __p1_7); \
-  __ret_7 = __builtin_shufflevector(__ret_7, __ret_7, 3, 2, 1, 0); \
-  __ret_7; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vdupq_n_bf16(bfloat16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vdupq_n_bf16(bfloat16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vdup_n_bf16(bfloat16_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vdup_n_bf16(bfloat16_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vget_high_bf16(bfloat16x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vget_high_bf16(bfloat16x8_t __p0) {
-  bfloat16x4_t __ret;
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t __noswap_vget_high_bf16(bfloat16x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  __ret = (bfloat16_t) __builtin_neon_vgetq_lane_bf16((bfloat16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (bfloat16_t) __builtin_neon_vgetq_lane_bf16((bfloat16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x8_t __s0 = __p0; \
-  __ret = (bfloat16_t) __builtin_neon_vgetq_lane_bf16((bfloat16x8_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  __ret = (bfloat16_t) __builtin_neon_vget_lane_bf16((bfloat16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  bfloat16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (bfloat16_t) __builtin_neon_vget_lane_bf16((bfloat16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16_t __ret; \
-  bfloat16x4_t __s0 = __p0; \
-  __ret = (bfloat16_t) __builtin_neon_vget_lane_bf16((bfloat16x4_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vget_low_bf16(bfloat16x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vget_low_bf16(bfloat16x8_t __p0) {
-  bfloat16x4_t __ret;
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t __noswap_vget_low_bf16(bfloat16x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_bf16(__p0) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  __ret = (bfloat16x8_t) __builtin_neon_vld1q_bf16(__p0, 43); \
-  __ret; \
-})
-#else
-#define vld1q_bf16(__p0) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  __ret = (bfloat16x8_t) __builtin_neon_vld1q_bf16(__p0, 43); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_bf16(__p0) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  __ret = (bfloat16x4_t) __builtin_neon_vld1_bf16(__p0, 11); \
-  __ret; \
-})
-#else
-#define vld1_bf16(__p0) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  __ret = (bfloat16x4_t) __builtin_neon_vld1_bf16(__p0, 11); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  __ret = (bfloat16x8_t) __builtin_neon_vld1q_dup_bf16(__p0, 43); \
-  __ret; \
-})
-#else
-#define vld1q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  __ret = (bfloat16x8_t) __builtin_neon_vld1q_dup_bf16(__p0, 43); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  __ret = (bfloat16x4_t) __builtin_neon_vld1_dup_bf16(__p0, 11); \
-  __ret; \
-})
-#else
-#define vld1_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  __ret = (bfloat16x4_t) __builtin_neon_vld1_dup_bf16(__p0, 11); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x8_t __s1 = __p1; \
-  __ret = (bfloat16x8_t) __builtin_neon_vld1q_lane_bf16(__p0, (int8x16_t)__s1, __p2, 43); \
-  __ret; \
-})
-#else
-#define vld1q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16x8_t __s1 = __p1; \
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (bfloat16x8_t) __builtin_neon_vld1q_lane_bf16(__p0, (int8x16_t)__rev1, __p2, 43); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x4_t __s1 = __p1; \
-  __ret = (bfloat16x4_t) __builtin_neon_vld1_lane_bf16(__p0, (int8x8_t)__s1, __p2, 11); \
-  __ret; \
-})
-#else
-#define vld1_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16x4_t __s1 = __p1; \
-  bfloat16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (bfloat16x4_t) __builtin_neon_vld1_lane_bf16(__p0, (int8x8_t)__rev1, __p2, 11); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_bf16_x2(__p0) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  __builtin_neon_vld1q_bf16_x2(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld1q_bf16_x2(__p0) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  __builtin_neon_vld1q_bf16_x2(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_bf16_x2(__p0) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  __builtin_neon_vld1_bf16_x2(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld1_bf16_x2(__p0) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  __builtin_neon_vld1_bf16_x2(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_bf16_x3(__p0) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  __builtin_neon_vld1q_bf16_x3(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld1q_bf16_x3(__p0) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  __builtin_neon_vld1q_bf16_x3(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_bf16_x3(__p0) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  __builtin_neon_vld1_bf16_x3(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld1_bf16_x3(__p0) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  __builtin_neon_vld1_bf16_x3(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_bf16_x4(__p0) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  __builtin_neon_vld1q_bf16_x4(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld1q_bf16_x4(__p0) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  __builtin_neon_vld1q_bf16_x4(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_bf16_x4(__p0) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  __builtin_neon_vld1_bf16_x4(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld1_bf16_x4(__p0) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  __builtin_neon_vld1_bf16_x4(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_bf16(__p0) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  __builtin_neon_vld2q_bf16(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld2q_bf16(__p0) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  __builtin_neon_vld2q_bf16(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_bf16(__p0) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  __builtin_neon_vld2_bf16(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld2_bf16(__p0) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  __builtin_neon_vld2_bf16(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_bf16(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld2q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_bf16(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_bf16(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld2_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_bf16(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  bfloat16x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_bf16(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 43); \
-  __ret; \
-})
-#else
-#define vld2q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x2_t __ret; \
-  bfloat16x8x2_t __s1 = __p1; \
-  bfloat16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_bf16(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  bfloat16x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_bf16(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 11); \
-  __ret; \
-})
-#else
-#define vld2_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x2_t __ret; \
-  bfloat16x4x2_t __s1 = __p1; \
-  bfloat16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_bf16(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_bf16(__p0) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  __builtin_neon_vld3q_bf16(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld3q_bf16(__p0) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  __builtin_neon_vld3q_bf16(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_bf16(__p0) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  __builtin_neon_vld3_bf16(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld3_bf16(__p0) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  __builtin_neon_vld3_bf16(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_bf16(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld3q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_bf16(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_bf16(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld3_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_bf16(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  bfloat16x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_bf16(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 43); \
-  __ret; \
-})
-#else
-#define vld3q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x3_t __ret; \
-  bfloat16x8x3_t __s1 = __p1; \
-  bfloat16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_bf16(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  bfloat16x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_bf16(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 11); \
-  __ret; \
-})
-#else
-#define vld3_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x3_t __ret; \
-  bfloat16x4x3_t __s1 = __p1; \
-  bfloat16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_bf16(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_bf16(__p0) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  __builtin_neon_vld4q_bf16(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld4q_bf16(__p0) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  __builtin_neon_vld4q_bf16(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_bf16(__p0) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  __builtin_neon_vld4_bf16(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld4_bf16(__p0) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  __builtin_neon_vld4_bf16(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_bf16(&__ret, __p0, 43); \
-  __ret; \
-})
-#else
-#define vld4q_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_bf16(&__ret, __p0, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_bf16(&__ret, __p0, 11); \
-  __ret; \
-})
-#else
-#define vld4_dup_bf16(__p0) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_bf16(&__ret, __p0, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  bfloat16x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_bf16(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 43); \
-  __ret; \
-})
-#else
-#define vld4q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x4_t __ret; \
-  bfloat16x8x4_t __s1 = __p1; \
-  bfloat16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_bf16(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 43); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  bfloat16x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_bf16(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 11); \
-  __ret; \
-})
-#else
-#define vld4_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x4_t __ret; \
-  bfloat16x4x4_t __s1 = __p1; \
-  bfloat16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_bf16(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 11); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16_t __s0 = __p0; \
-  bfloat16x8_t __s1 = __p1; \
-  __ret = (bfloat16x8_t) __builtin_neon_vsetq_lane_bf16(__s0, (bfloat16x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16_t __s0 = __p0; \
-  bfloat16x8_t __s1 = __p1; \
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (bfloat16x8_t) __builtin_neon_vsetq_lane_bf16(__s0, (bfloat16x8_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8_t __ret; \
-  bfloat16_t __s0 = __p0; \
-  bfloat16x8_t __s1 = __p1; \
-  __ret = (bfloat16x8_t) __builtin_neon_vsetq_lane_bf16(__s0, (bfloat16x8_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16_t __s0 = __p0; \
-  bfloat16x4_t __s1 = __p1; \
-  __ret = (bfloat16x4_t) __builtin_neon_vset_lane_bf16(__s0, (bfloat16x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16_t __s0 = __p0; \
-  bfloat16x4_t __s1 = __p1; \
-  bfloat16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (bfloat16x4_t) __builtin_neon_vset_lane_bf16(__s0, (bfloat16x4_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4_t __ret; \
-  bfloat16_t __s0 = __p0; \
-  bfloat16x4_t __s1 = __p1; \
-  __ret = (bfloat16x4_t) __builtin_neon_vset_lane_bf16(__s0, (bfloat16x4_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_bf16(__p0, (int8x16_t)__s1, 43); \
-})
-#else
-#define vst1q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8_t __s1 = __p1; \
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_bf16(__p0, (int8x16_t)__rev1, 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_bf16(__p0, (int8x8_t)__s1, 11); \
-})
-#else
-#define vst1_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4_t __s1 = __p1; \
-  bfloat16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_bf16(__p0, (int8x8_t)__rev1, 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_bf16(__p0, (int8x16_t)__s1, __p2, 43); \
-})
-#else
-#define vst1q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8_t __s1 = __p1; \
-  bfloat16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_bf16(__p0, (int8x16_t)__rev1, __p2, 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_bf16(__p0, (int8x8_t)__s1, __p2, 11); \
-})
-#else
-#define vst1_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4_t __s1 = __p1; \
-  bfloat16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_bf16(__p0, (int8x8_t)__rev1, __p2, 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_bf16_x2(__p0, __p1) __extension__ ({ \
-  bfloat16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_bf16_x2(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 43); \
-})
-#else
-#define vst1q_bf16_x2(__p0, __p1) __extension__ ({ \
-  bfloat16x8x2_t __s1 = __p1; \
-  bfloat16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_bf16_x2(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_bf16_x2(__p0, __p1) __extension__ ({ \
-  bfloat16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_bf16_x2(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 11); \
-})
-#else
-#define vst1_bf16_x2(__p0, __p1) __extension__ ({ \
-  bfloat16x4x2_t __s1 = __p1; \
-  bfloat16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_bf16_x2(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_bf16_x3(__p0, __p1) __extension__ ({ \
-  bfloat16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_bf16_x3(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 43); \
-})
-#else
-#define vst1q_bf16_x3(__p0, __p1) __extension__ ({ \
-  bfloat16x8x3_t __s1 = __p1; \
-  bfloat16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_bf16_x3(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_bf16_x3(__p0, __p1) __extension__ ({ \
-  bfloat16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_bf16_x3(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 11); \
-})
-#else
-#define vst1_bf16_x3(__p0, __p1) __extension__ ({ \
-  bfloat16x4x3_t __s1 = __p1; \
-  bfloat16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_bf16_x3(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_bf16_x4(__p0, __p1) __extension__ ({ \
-  bfloat16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_bf16_x4(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 43); \
-})
-#else
-#define vst1q_bf16_x4(__p0, __p1) __extension__ ({ \
-  bfloat16x8x4_t __s1 = __p1; \
-  bfloat16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_bf16_x4(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_bf16_x4(__p0, __p1) __extension__ ({ \
-  bfloat16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_bf16_x4(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 11); \
-})
-#else
-#define vst1_bf16_x4(__p0, __p1) __extension__ ({ \
-  bfloat16x4x4_t __s1 = __p1; \
-  bfloat16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_bf16_x4(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_bf16(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 43); \
-})
-#else
-#define vst2q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8x2_t __s1 = __p1; \
-  bfloat16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_bf16(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_bf16(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 11); \
-})
-#else
-#define vst2_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4x2_t __s1 = __p1; \
-  bfloat16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_bf16(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_bf16(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 43); \
-})
-#else
-#define vst2q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x2_t __s1 = __p1; \
-  bfloat16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_bf16(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_bf16(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 11); \
-})
-#else
-#define vst2_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x2_t __s1 = __p1; \
-  bfloat16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_bf16(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_bf16(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 43); \
-})
-#else
-#define vst3q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8x3_t __s1 = __p1; \
-  bfloat16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_bf16(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_bf16(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 11); \
-})
-#else
-#define vst3_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4x3_t __s1 = __p1; \
-  bfloat16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_bf16(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_bf16(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 43); \
-})
-#else
-#define vst3q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x3_t __s1 = __p1; \
-  bfloat16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_bf16(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_bf16(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 11); \
-})
-#else
-#define vst3_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x3_t __s1 = __p1; \
-  bfloat16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_bf16(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_bf16(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 43); \
-})
-#else
-#define vst4q_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x8x4_t __s1 = __p1; \
-  bfloat16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_bf16(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_bf16(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 11); \
-})
-#else
-#define vst4_bf16(__p0, __p1) __extension__ ({ \
-  bfloat16x4x4_t __s1 = __p1; \
-  bfloat16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_bf16(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_bf16(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 43); \
-})
-#else
-#define vst4q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x8x4_t __s1 = __p1; \
-  bfloat16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_bf16(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 43); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_bf16(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 11); \
-})
-#else
-#define vst4_lane_bf16(__p0, __p1, __p2) __extension__ ({ \
-  bfloat16x4x4_t __s1 = __p1; \
-  bfloat16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_bf16(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 11); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("dotprod,neon"))) uint32x4_t vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vdotq_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("dotprod,neon"))) uint32x4_t vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vdotq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("dotprod,neon"))) uint32x4_t __noswap_vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vdotq_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("dotprod,neon"))) int32x4_t vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vdotq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("dotprod,neon"))) int32x4_t vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vdotq_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("dotprod,neon"))) int32x4_t __noswap_vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vdotq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("dotprod,neon"))) uint32x2_t vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vdot_u32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("dotprod,neon"))) uint32x2_t vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vdot_u32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("dotprod,neon"))) uint32x2_t __noswap_vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vdot_u32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("dotprod,neon"))) int32x2_t vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vdot_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("dotprod,neon"))) int32x2_t vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vdot_s32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("dotprod,neon"))) int32x2_t __noswap_vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vdot_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vabdq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vabdq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vabd_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vabd_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vabsq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vabsq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vabsq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vabsq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vabs_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vabs_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vabs_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vabs_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcageq_f16((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcageq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcage_f16((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcage_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcagtq_f16((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcagtq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcagt_f16((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcagt_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcaleq_f16((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcaleq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcale_f16((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcale_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcaltq_f16((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcaltq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcalt_f16((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcalt_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vceqzq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vceqzq_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vceqzq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vceqzq_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vceqz_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vceqz_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vceqz_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vceqz_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcge_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcge_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgezq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcgezq_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgezq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcgezq_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcgez_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcgez_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcgez_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcgez_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgtzq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcgtzq_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcgtzq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcgtzq_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcgtz_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcgtz_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcgtz_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcgtz_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vclezq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vclezq_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vclezq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vclezq_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vclez_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vclez_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vclez_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vclez_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcltzq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcltzq_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcltzq_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcltzq_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcltz_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcltz_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcltz_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcltz_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vcvtq_f16_u16(uint16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_u16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vcvtq_f16_u16(uint16x8_t __p0) {
-  float16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_u16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vcvtq_f16_s16(int16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_s16((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vcvtq_f16_s16(int16x8_t __p0) {
-  float16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcvtq_f16_s16((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vcvt_f16_u16(uint16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_u16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vcvt_f16_u16(uint16x4_t __p0) {
-  float16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_u16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vcvt_f16_s16(int16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_s16((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vcvt_f16_s16(int16x4_t __p0) {
-  float16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_s16((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_u16((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_u16((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_s16((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_s16((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_u16((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_u16((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_s16((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_s16((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_f16((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_f16((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_f16((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_f16((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_f16((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_f16((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_u16_f16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_f16((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vcvt_n_u16_f16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_f16((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vcvtq_s16_f16((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vcvtq_s16_f16((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvt_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vcvt_s16_f16((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvt_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vcvt_s16_f16((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvt_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcvt_u16_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvt_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcvt_u16_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtaq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_f16((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtaq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_f16((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvta_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vcvta_s16_f16((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvta_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vcvta_s16_f16((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvta_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcvta_u16_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvta_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcvta_u16_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtmq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_f16((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtmq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_f16((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvtm_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vcvtm_s16_f16((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvtm_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vcvtm_s16_f16((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvtm_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvtm_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtnq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_f16((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtnq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_f16((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvtn_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vcvtn_s16_f16((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvtn_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vcvtn_s16_f16((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvtn_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvtn_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtpq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_f16((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x8_t vcvtpq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_f16((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvtp_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vcvtp_s16_f16((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) int16x4_t vcvtp_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vcvtp_s16_f16((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_f16((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_f16((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvtp_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_f16((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) uint16x4_t vcvtp_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_f16((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vfmaq_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vfmaq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t __noswap_vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vfmaq_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vfma_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vfma_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t __noswap_vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vfma_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = vfmaq_f16(__p0, -__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vfmaq_f16(__rev0, -__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = vfma_f16(__p0, -__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vfma_f16(__rev0, -__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vmaxq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vmaxq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vmax_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vmax_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vminq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vminq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vmin_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vmin_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_n_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = __s0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \
-  __ret; \
-})
-#else
-#define vmulq_n_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = __rev0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_n_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = __s0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \
-  __ret; \
-})
-#else
-#define vmul_n_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = __rev0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vnegq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vnegq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vneg_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vneg_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vpadd_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vpadd_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vpmax_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vpmax_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vpmin_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vpmin_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrecpeq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrecpeq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrecpeq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrecpeq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrecpe_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrecpe_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrecpe_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrecpe_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrecpsq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrecpsq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrecps_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrecps_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrsqrteq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrsqrteq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrsqrteq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrsqrteq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrsqrte_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrsqrte_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrsqrte_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrsqrte_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrsqrtsq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrsqrtsq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrsqrts_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrsqrts_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("i8mm,neon"))) uint32x4_t vmmlaq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vmmlaq_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("i8mm,neon"))) uint32x4_t vmmlaq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vmmlaq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("i8mm,neon"))) int32x4_t vmmlaq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vmmlaq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("i8mm,neon"))) int32x4_t vmmlaq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vmmlaq_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("i8mm,neon"))) int32x4_t vusdotq_s32(int32x4_t __p0, uint8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vusdotq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("i8mm,neon"))) int32x4_t vusdotq_s32(int32x4_t __p0, uint8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vusdotq_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("i8mm,neon"))) int32x4_t __noswap_vusdotq_s32(int32x4_t __p0, uint8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vusdotq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("i8mm,neon"))) int32x2_t vusdot_s32(int32x2_t __p0, uint8x8_t __p1, int8x8_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vusdot_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("i8mm,neon"))) int32x2_t vusdot_s32(int32x2_t __p0, uint8x8_t __p1, int8x8_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vusdot_s32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("i8mm,neon"))) int32x2_t __noswap_vusdot_s32(int32x2_t __p0, uint8x8_t __p1, int8x8_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vusdot_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("i8mm,neon"))) int32x4_t vusmmlaq_s32(int32x4_t __p0, uint8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vusmmlaq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("i8mm,neon"))) int32x4_t vusmmlaq_s32(int32x4_t __p0, uint8x16_t __p1, int8x16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vusmmlaq_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  __ret = (poly8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 4); \
-  __ret; \
-})
-#else
-#define splat_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  __ret = (poly8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 4); \
-  __ret; \
-})
-#endif
-
-#define splat_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  __ret = (poly64x1_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  __ret = (poly16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 5); \
-  __ret; \
-})
-#else
-#define splat_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (poly16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  __ret = (poly16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 5); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  __ret = (poly8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 4); \
-  __ret; \
-})
-#else
-#define splatq_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  __ret = (poly8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 4); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  __ret = (poly64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 6); \
-  __ret; \
-})
-#else
-#define splatq_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  __ret = (poly64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 6); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  __ret = (poly64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 6); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  __ret = (poly16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 5); \
-  __ret; \
-})
-#else
-#define splatq_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (poly16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  __ret = (poly16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 5); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define splatq_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define splatq_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#else
-#define splatq_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 19); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define splatq_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define splatq_lane_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (float64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 10); \
-  __ret; \
-})
-#else
-#define splatq_lane_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (float64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 10); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (float64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 10); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (float32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 9); \
-  __ret; \
-})
-#else
-#define splatq_lane_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (float32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 9); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 8); \
-  __ret; \
-})
-#else
-#define splatq_lane_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 8); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define splatq_lane_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#else
-#define splatq_lane_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 3); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_lane_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define splatq_lane_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_lane_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_splatq_lane_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define splat_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define splat_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#define splat_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define splat_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define splat_lane_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#endif
-
-#define splat_lane_f64(__p0, __p1) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (float64x1_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 10); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (float32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 9); \
-  __ret; \
-})
-#else
-#define splat_lane_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (float32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 9); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 8); \
-  __ret; \
-})
-#else
-#define splat_lane_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 8); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define splat_lane_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#endif
-
-#define splat_lane_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define splat_lane_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define splat_lane_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_lane_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_splat_lane_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  __ret = (poly8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 36); \
-  __ret; \
-})
-#else
-#define splat_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  __ret = (poly8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 36); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_p64(__p0, __p1) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  __ret = (poly64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 38); \
-  __ret; \
-})
-#else
-#define splat_laneq_p64(__p0, __p1) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (poly64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 38); \
-  __ret; \
-})
-#define __noswap_splat_laneq_p64(__p0, __p1) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  __ret = (poly64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 38); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  __ret = (poly16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 37); \
-  __ret; \
-})
-#else
-#define splat_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  __ret = (poly16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 37); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  __ret = (poly8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 36); \
-  __ret; \
-})
-#else
-#define splatq_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  __ret = (poly8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 36); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  __ret = (poly64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 38); \
-  __ret; \
-})
-#else
-#define splatq_laneq_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (poly64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  __ret = (poly64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 38); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  __ret = (poly16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 37); \
-  __ret; \
-})
-#else
-#define splatq_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  __ret = (poly16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 37); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#else
-#define splatq_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define splatq_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define splatq_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define splatq_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#else
-#define splatq_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (float64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 42); \
-  __ret; \
-})
-#else
-#define splatq_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (float64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 42); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (float32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 41); \
-  __ret; \
-})
-#else
-#define splatq_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (float32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 41); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 40); \
-  __ret; \
-})
-#else
-#define splatq_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 40); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define splatq_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define splatq_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splatq_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define splatq_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splatq_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_splatq_laneq_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#else
-#define splat_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define splat_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define splat_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 51); \
-  __ret; \
-})
-#define __noswap_splat_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define splat_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#else
-#define splat_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (float64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 42); \
-  __ret; \
-})
-#else
-#define splat_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 42); \
-  __ret; \
-})
-#define __noswap_splat_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (float64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 42); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (float32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 41); \
-  __ret; \
-})
-#else
-#define splat_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (float32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 41); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 40); \
-  __ret; \
-})
-#else
-#define splat_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 40); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define splat_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define splat_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 35); \
-  __ret; \
-})
-#define __noswap_splat_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define splat_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define splat_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_splat_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_splat_laneq_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t __noswap_vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t __noswap_vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vabdq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vabdq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vabd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vabd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vabd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vabd_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vabd_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vabd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vabd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vabd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vabd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vabd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vabd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vabsq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vabsq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vabsq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vabsq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vabsq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vabsq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vabsq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vabsq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vabs_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vabs_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vabs_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vabs_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vabs_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vabs_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vabs_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vabs_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vabs_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vabs_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vabs_v((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vabs_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vaddq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vaddq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vaddq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vaddq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vadd_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vadd_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vadd_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vadd_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vadd_v((int8x8_t)__p0, (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vadd_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) poly64x1_t vadd_p64(poly64x1_t __p0, poly64x1_t __p1) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t) __builtin_neon_vadd_v((int8x8_t)__p0, (int8x8_t)__p1, 6);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vadd_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t) __builtin_neon_vadd_v((int8x8_t)__p0, (int8x8_t)__p1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vadd_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (poly16x4_t) __builtin_neon_vadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vaddq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vaddq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vaddq_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t) __builtin_neon_vaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 38);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vaddq_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (poly64x2_t) __builtin_neon_vaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 38);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vaddq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) __builtin_neon_vaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 37);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vaddq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly16x8_t) __builtin_neon_vaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 37);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vandq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vandq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vandq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vandq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vandq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vandq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vandq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vandq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vandq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vandq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vandq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vandq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vandq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vandq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vandq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vandq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vand_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vand_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vand_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vand_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vand_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vand_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vand_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vand_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vand_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vand_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vand_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vand_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vand_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 & __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vand_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vbicq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vbicq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vbicq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vbicq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vbicq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vbicq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vbicq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vbicq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vbicq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vbicq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vbicq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vbicq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vbicq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vbicq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vbicq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vbicq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vbic_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vbic_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vbic_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vbic_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vbic_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vbic_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vbic_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vbic_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vbic_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vbic_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vbic_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vbic_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vbic_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 & ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vbic_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 & ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vbsl_p8(uint8x8_t __p0, poly8x8_t __p1, poly8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vbsl_p8(uint8x8_t __p0, poly8x8_t __p1, poly8x8_t __p2) {
-  poly8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vbsl_p16(uint16x4_t __p0, poly16x4_t __p1, poly16x4_t __p2) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vbsl_p16(uint16x4_t __p0, poly16x4_t __p1, poly16x4_t __p2) {
-  poly16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  poly16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (poly16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vbslq_p8(uint8x16_t __p0, poly8x16_t __p1, poly8x16_t __p2) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vbslq_p8(uint8x16_t __p0, poly8x16_t __p1, poly8x16_t __p2) {
-  poly8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vbslq_p16(uint16x8_t __p0, poly16x8_t __p1, poly16x8_t __p2) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 37);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vbslq_p16(uint16x8_t __p0, poly16x8_t __p1, poly16x8_t __p2) {
-  poly16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 37);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vbslq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vbslq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vbslq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vbslq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vbslq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vbslq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vbslq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vbslq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vbslq_s8(uint8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vbslq_s8(uint8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vbslq_f32(uint32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vbslq_f32(uint32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vbslq_s32(uint32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vbslq_s32(uint32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vbslq_s64(uint64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vbslq_s64(uint64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vbslq_s16(uint16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vbslq_s16(uint16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vbsl_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vbsl_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vbsl_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vbsl_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vbsl_u64(uint64x1_t __p0, uint64x1_t __p1, uint64x1_t __p2) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vbsl_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vbsl_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vbsl_s8(uint8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vbsl_s8(uint8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vbsl_f32(uint32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vbsl_f32(uint32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vbsl_s32(uint32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vbsl_s32(uint32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vbsl_s64(uint64x1_t __p0, int64x1_t __p1, int64x1_t __p2) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vbsl_s16(uint16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vbsl_s16(uint16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcageq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcageq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcage_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcage_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcage_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcagtq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcagtq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcagt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcagt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcagt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcaleq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcaleq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcale_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcale_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcale_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcaltq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcaltq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcalt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcalt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcalt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vceq_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vceq_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  uint8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vceqq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vceqq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  uint8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vceqq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vceqq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vceqq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vceqq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vceqq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vceqq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vceqq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vceqq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vceqq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vceqq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vceqq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vceqq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vceqq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vceqq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vceq_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vceq_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vceq_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vceq_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vceq_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vceq_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vceq_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vceq_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vceq_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vceq_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vceq_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vceq_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vceq_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vceq_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcgeq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcgeq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgeq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgeq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcgeq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcgeq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcgeq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcgeq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgeq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgeq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgeq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgeq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcgeq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcgeq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcge_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcge_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcge_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcge_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcge_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcge_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcge_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcge_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcge_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcge_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcge_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcge_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcge_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcge_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcgtq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcgtq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgtq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgtq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcgtq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcgtq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcgtq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcgtq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgtq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgtq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgtq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgtq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcgtq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcgtq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcgt_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcgt_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcgt_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcgt_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcgt_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcgt_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcgt_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcgt_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcgt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcgt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcgt_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcgt_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcgt_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcgt_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcleq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcleq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcleq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcleq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcleq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcleq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcleq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcleq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcleq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcleq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcleq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcleq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcleq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcleq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcle_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcle_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcle_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcle_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcle_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcle_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcle_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcle_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcle_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcle_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcle_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcle_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcle_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcle_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vclsq_u8(uint8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vclsq_u8(uint8x16_t __p0) {
-  int8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vclsq_u32(uint32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vclsq_u32(uint32x4_t __p0) {
-  int32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vclsq_u16(uint16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vclsq_u16(uint16x8_t __p0) {
-  int16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vclsq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vclsq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vclsq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vclsq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vclsq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vclsq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vcls_u8(uint8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vcls_u8(uint8x8_t __p0) {
-  int8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vcls_u32(uint32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vcls_u32(uint32x2_t __p0) {
-  int32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vcls_u16(uint16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vcls_u16(uint16x4_t __p0) {
-  int16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vcls_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vcls_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vcls_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vcls_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vcls_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vcls_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcltq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcltq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcltq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcltq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcltq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcltq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcltq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcltq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcltq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcltq_f32(float32x4_t __p0, float32x4_t __p1) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcltq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcltq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcltq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcltq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vclt_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vclt_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vclt_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vclt_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vclt_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vclt_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vclt_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vclt_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vclt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vclt_f32(float32x2_t __p0, float32x2_t __p1) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vclt_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vclt_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vclt_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vclt_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vclzq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vclzq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vclzq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vclzq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vclzq_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vclzq_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vclzq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vclzq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vclzq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vclzq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vclzq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vclzq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vclz_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vclz_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vclz_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vclz_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vclz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vclz_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vclz_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vclz_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vclz_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vclz_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vclz_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vclz_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vclz_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vclz_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vclz_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vclz_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vclz_v((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vclz_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vcnt_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vcnt_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vcntq_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vcntq_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcntq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcntq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vcntq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vcntq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcnt_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcnt_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vcnt_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vcnt_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vcombine_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vcombine_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x16_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vcombine_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vcombine_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x8_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t __noswap_vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcombine_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcombine_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t __noswap_vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x4_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t __noswap_vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x8_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t __noswap_vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x4_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vcombine_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vcombine_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x8_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#define vcreate_p8(__p0) __extension__ ({ \
-  poly8x8_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (poly8x8_t)(__promote); \
-  __ret; \
-})
-#define vcreate_p16(__p0) __extension__ ({ \
-  poly16x4_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (poly16x4_t)(__promote); \
-  __ret; \
-})
-#define vcreate_u8(__p0) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (uint8x8_t)(__promote); \
-  __ret; \
-})
-#define vcreate_u32(__p0) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (uint32x2_t)(__promote); \
-  __ret; \
-})
-#define vcreate_u64(__p0) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (uint64x1_t)(__promote); \
-  __ret; \
-})
-#define vcreate_u16(__p0) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (uint16x4_t)(__promote); \
-  __ret; \
-})
-#define vcreate_s8(__p0) __extension__ ({ \
-  int8x8_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (int8x8_t)(__promote); \
-  __ret; \
-})
-#define vcreate_f32(__p0) __extension__ ({ \
-  float32x2_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (float32x2_t)(__promote); \
-  __ret; \
-})
-#define vcreate_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (float16x4_t)(__promote); \
-  __ret; \
-})
-#define vcreate_s32(__p0) __extension__ ({ \
-  int32x2_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (int32x2_t)(__promote); \
-  __ret; \
-})
-#define vcreate_s64(__p0) __extension__ ({ \
-  int64x1_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (int64x1_t)(__promote); \
-  __ret; \
-})
-#define vcreate_s16(__p0) __extension__ ({ \
-  int16x4_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (int16x4_t)(__promote); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vcvtq_f32_u32(uint32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vcvtq_f32_u32(uint32x4_t __p0) {
-  float32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vcvtq_f32_s32(int32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vcvtq_f32_s32(int32x4_t __p0) {
-  float32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vcvt_f32_u32(uint32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vcvt_f32_u32(uint32x2_t __p0) {
-  float32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vcvt_f32_s32(int32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vcvt_f32_s32(int32x2_t __p0) {
-  float32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_f32_u32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vcvtq_n_f32_u32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_f32_s32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define vcvtq_n_f32_s32(__p0, __p1) __extension__ ({ \
-  float32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_f32_u32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vcvt_n_f32_u32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_f32_s32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vcvt_n_f32_s32(__p0, __p1) __extension__ ({ \
-  float32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_s32_f32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_vcvtq_n_s32_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define vcvtq_n_s32_f32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vcvtq_n_s32_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_s32_f32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vcvt_n_s32_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vcvt_n_s32_f32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vcvt_n_s32_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_u32_f32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vcvtq_n_u32_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vcvtq_n_u32_f32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vcvtq_n_u32_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvt_n_u32_f32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vcvt_n_u32_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vcvt_n_u32_f32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vcvt_n_u32_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vcvtq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vcvtq_s32_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vcvtq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vcvtq_s32_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vcvt_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vcvt_s32_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vcvt_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vcvt_s32_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcvtq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcvtq_u32_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcvtq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcvtq_u32_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcvt_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcvt_u32_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcvt_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcvt_u32_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_p8(__p0_8, __p1_8) __extension__ ({ \
-  poly8x8_t __ret_8; \
-  poly8x8_t __s0_8 = __p0_8; \
-  __ret_8 = splat_lane_p8(__s0_8, __p1_8); \
-  __ret_8; \
-})
-#else
-#define vdup_lane_p8(__p0_9, __p1_9) __extension__ ({ \
-  poly8x8_t __ret_9; \
-  poly8x8_t __s0_9 = __p0_9; \
-  poly8x8_t __rev0_9;  __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_9 = __noswap_splat_lane_p8(__rev0_9, __p1_9); \
-  __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_9; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_p16(__p0_10, __p1_10) __extension__ ({ \
-  poly16x4_t __ret_10; \
-  poly16x4_t __s0_10 = __p0_10; \
-  __ret_10 = splat_lane_p16(__s0_10, __p1_10); \
-  __ret_10; \
-})
-#else
-#define vdup_lane_p16(__p0_11, __p1_11) __extension__ ({ \
-  poly16x4_t __ret_11; \
-  poly16x4_t __s0_11 = __p0_11; \
-  poly16x4_t __rev0_11;  __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 3, 2, 1, 0); \
-  __ret_11 = __noswap_splat_lane_p16(__rev0_11, __p1_11); \
-  __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 3, 2, 1, 0); \
-  __ret_11; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_p8(__p0_12, __p1_12) __extension__ ({ \
-  poly8x16_t __ret_12; \
-  poly8x8_t __s0_12 = __p0_12; \
-  __ret_12 = splatq_lane_p8(__s0_12, __p1_12); \
-  __ret_12; \
-})
-#else
-#define vdupq_lane_p8(__p0_13, __p1_13) __extension__ ({ \
-  poly8x16_t __ret_13; \
-  poly8x8_t __s0_13 = __p0_13; \
-  poly8x8_t __rev0_13;  __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_13 = __noswap_splatq_lane_p8(__rev0_13, __p1_13); \
-  __ret_13 = __builtin_shufflevector(__ret_13, __ret_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_13; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_p16(__p0_14, __p1_14) __extension__ ({ \
-  poly16x8_t __ret_14; \
-  poly16x4_t __s0_14 = __p0_14; \
-  __ret_14 = splatq_lane_p16(__s0_14, __p1_14); \
-  __ret_14; \
-})
-#else
-#define vdupq_lane_p16(__p0_15, __p1_15) __extension__ ({ \
-  poly16x8_t __ret_15; \
-  poly16x4_t __s0_15 = __p0_15; \
-  poly16x4_t __rev0_15;  __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 3, 2, 1, 0); \
-  __ret_15 = __noswap_splatq_lane_p16(__rev0_15, __p1_15); \
-  __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_15; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_u8(__p0_16, __p1_16) __extension__ ({ \
-  uint8x16_t __ret_16; \
-  uint8x8_t __s0_16 = __p0_16; \
-  __ret_16 = splatq_lane_u8(__s0_16, __p1_16); \
-  __ret_16; \
-})
-#else
-#define vdupq_lane_u8(__p0_17, __p1_17) __extension__ ({ \
-  uint8x16_t __ret_17; \
-  uint8x8_t __s0_17 = __p0_17; \
-  uint8x8_t __rev0_17;  __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_17 = __noswap_splatq_lane_u8(__rev0_17, __p1_17); \
-  __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_17; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_u32(__p0_18, __p1_18) __extension__ ({ \
-  uint32x4_t __ret_18; \
-  uint32x2_t __s0_18 = __p0_18; \
-  __ret_18 = splatq_lane_u32(__s0_18, __p1_18); \
-  __ret_18; \
-})
-#else
-#define vdupq_lane_u32(__p0_19, __p1_19) __extension__ ({ \
-  uint32x4_t __ret_19; \
-  uint32x2_t __s0_19 = __p0_19; \
-  uint32x2_t __rev0_19;  __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 1, 0); \
-  __ret_19 = __noswap_splatq_lane_u32(__rev0_19, __p1_19); \
-  __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 3, 2, 1, 0); \
-  __ret_19; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_u64(__p0_20, __p1_20) __extension__ ({ \
-  uint64x2_t __ret_20; \
-  uint64x1_t __s0_20 = __p0_20; \
-  __ret_20 = splatq_lane_u64(__s0_20, __p1_20); \
-  __ret_20; \
-})
-#else
-#define vdupq_lane_u64(__p0_21, __p1_21) __extension__ ({ \
-  uint64x2_t __ret_21; \
-  uint64x1_t __s0_21 = __p0_21; \
-  __ret_21 = __noswap_splatq_lane_u64(__s0_21, __p1_21); \
-  __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 1, 0); \
-  __ret_21; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_u16(__p0_22, __p1_22) __extension__ ({ \
-  uint16x8_t __ret_22; \
-  uint16x4_t __s0_22 = __p0_22; \
-  __ret_22 = splatq_lane_u16(__s0_22, __p1_22); \
-  __ret_22; \
-})
-#else
-#define vdupq_lane_u16(__p0_23, __p1_23) __extension__ ({ \
-  uint16x8_t __ret_23; \
-  uint16x4_t __s0_23 = __p0_23; \
-  uint16x4_t __rev0_23;  __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 3, 2, 1, 0); \
-  __ret_23 = __noswap_splatq_lane_u16(__rev0_23, __p1_23); \
-  __ret_23 = __builtin_shufflevector(__ret_23, __ret_23, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_23; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_s8(__p0_24, __p1_24) __extension__ ({ \
-  int8x16_t __ret_24; \
-  int8x8_t __s0_24 = __p0_24; \
-  __ret_24 = splatq_lane_s8(__s0_24, __p1_24); \
-  __ret_24; \
-})
-#else
-#define vdupq_lane_s8(__p0_25, __p1_25) __extension__ ({ \
-  int8x16_t __ret_25; \
-  int8x8_t __s0_25 = __p0_25; \
-  int8x8_t __rev0_25;  __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_25 = __noswap_splatq_lane_s8(__rev0_25, __p1_25); \
-  __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_25; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_f32(__p0_26, __p1_26) __extension__ ({ \
-  float32x4_t __ret_26; \
-  float32x2_t __s0_26 = __p0_26; \
-  __ret_26 = splatq_lane_f32(__s0_26, __p1_26); \
-  __ret_26; \
-})
-#else
-#define vdupq_lane_f32(__p0_27, __p1_27) __extension__ ({ \
-  float32x4_t __ret_27; \
-  float32x2_t __s0_27 = __p0_27; \
-  float32x2_t __rev0_27;  __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 1, 0); \
-  __ret_27 = __noswap_splatq_lane_f32(__rev0_27, __p1_27); \
-  __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 3, 2, 1, 0); \
-  __ret_27; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_f16(__p0_28, __p1_28) __extension__ ({ \
-  float16x8_t __ret_28; \
-  float16x4_t __s0_28 = __p0_28; \
-  __ret_28 = splatq_lane_f16(__s0_28, __p1_28); \
-  __ret_28; \
-})
-#else
-#define vdupq_lane_f16(__p0_29, __p1_29) __extension__ ({ \
-  float16x8_t __ret_29; \
-  float16x4_t __s0_29 = __p0_29; \
-  float16x4_t __rev0_29;  __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 3, 2, 1, 0); \
-  __ret_29 = __noswap_splatq_lane_f16(__rev0_29, __p1_29); \
-  __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_29; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_s32(__p0_30, __p1_30) __extension__ ({ \
-  int32x4_t __ret_30; \
-  int32x2_t __s0_30 = __p0_30; \
-  __ret_30 = splatq_lane_s32(__s0_30, __p1_30); \
-  __ret_30; \
-})
-#else
-#define vdupq_lane_s32(__p0_31, __p1_31) __extension__ ({ \
-  int32x4_t __ret_31; \
-  int32x2_t __s0_31 = __p0_31; \
-  int32x2_t __rev0_31;  __rev0_31 = __builtin_shufflevector(__s0_31, __s0_31, 1, 0); \
-  __ret_31 = __noswap_splatq_lane_s32(__rev0_31, __p1_31); \
-  __ret_31 = __builtin_shufflevector(__ret_31, __ret_31, 3, 2, 1, 0); \
-  __ret_31; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_s64(__p0_32, __p1_32) __extension__ ({ \
-  int64x2_t __ret_32; \
-  int64x1_t __s0_32 = __p0_32; \
-  __ret_32 = splatq_lane_s64(__s0_32, __p1_32); \
-  __ret_32; \
-})
-#else
-#define vdupq_lane_s64(__p0_33, __p1_33) __extension__ ({ \
-  int64x2_t __ret_33; \
-  int64x1_t __s0_33 = __p0_33; \
-  __ret_33 = __noswap_splatq_lane_s64(__s0_33, __p1_33); \
-  __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 1, 0); \
-  __ret_33; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_s16(__p0_34, __p1_34) __extension__ ({ \
-  int16x8_t __ret_34; \
-  int16x4_t __s0_34 = __p0_34; \
-  __ret_34 = splatq_lane_s16(__s0_34, __p1_34); \
-  __ret_34; \
-})
-#else
-#define vdupq_lane_s16(__p0_35, __p1_35) __extension__ ({ \
-  int16x8_t __ret_35; \
-  int16x4_t __s0_35 = __p0_35; \
-  int16x4_t __rev0_35;  __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 3, 2, 1, 0); \
-  __ret_35 = __noswap_splatq_lane_s16(__rev0_35, __p1_35); \
-  __ret_35 = __builtin_shufflevector(__ret_35, __ret_35, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_35; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_u8(__p0_36, __p1_36) __extension__ ({ \
-  uint8x8_t __ret_36; \
-  uint8x8_t __s0_36 = __p0_36; \
-  __ret_36 = splat_lane_u8(__s0_36, __p1_36); \
-  __ret_36; \
-})
-#else
-#define vdup_lane_u8(__p0_37, __p1_37) __extension__ ({ \
-  uint8x8_t __ret_37; \
-  uint8x8_t __s0_37 = __p0_37; \
-  uint8x8_t __rev0_37;  __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_37 = __noswap_splat_lane_u8(__rev0_37, __p1_37); \
-  __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_37; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_u32(__p0_38, __p1_38) __extension__ ({ \
-  uint32x2_t __ret_38; \
-  uint32x2_t __s0_38 = __p0_38; \
-  __ret_38 = splat_lane_u32(__s0_38, __p1_38); \
-  __ret_38; \
-})
-#else
-#define vdup_lane_u32(__p0_39, __p1_39) __extension__ ({ \
-  uint32x2_t __ret_39; \
-  uint32x2_t __s0_39 = __p0_39; \
-  uint32x2_t __rev0_39;  __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 1, 0); \
-  __ret_39 = __noswap_splat_lane_u32(__rev0_39, __p1_39); \
-  __ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 1, 0); \
-  __ret_39; \
-})
-#endif
-
-#define vdup_lane_u64(__p0_40, __p1_40) __extension__ ({ \
-  uint64x1_t __ret_40; \
-  uint64x1_t __s0_40 = __p0_40; \
-  __ret_40 = splat_lane_u64(__s0_40, __p1_40); \
-  __ret_40; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_u16(__p0_41, __p1_41) __extension__ ({ \
-  uint16x4_t __ret_41; \
-  uint16x4_t __s0_41 = __p0_41; \
-  __ret_41 = splat_lane_u16(__s0_41, __p1_41); \
-  __ret_41; \
-})
-#else
-#define vdup_lane_u16(__p0_42, __p1_42) __extension__ ({ \
-  uint16x4_t __ret_42; \
-  uint16x4_t __s0_42 = __p0_42; \
-  uint16x4_t __rev0_42;  __rev0_42 = __builtin_shufflevector(__s0_42, __s0_42, 3, 2, 1, 0); \
-  __ret_42 = __noswap_splat_lane_u16(__rev0_42, __p1_42); \
-  __ret_42 = __builtin_shufflevector(__ret_42, __ret_42, 3, 2, 1, 0); \
-  __ret_42; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_s8(__p0_43, __p1_43) __extension__ ({ \
-  int8x8_t __ret_43; \
-  int8x8_t __s0_43 = __p0_43; \
-  __ret_43 = splat_lane_s8(__s0_43, __p1_43); \
-  __ret_43; \
-})
-#else
-#define vdup_lane_s8(__p0_44, __p1_44) __extension__ ({ \
-  int8x8_t __ret_44; \
-  int8x8_t __s0_44 = __p0_44; \
-  int8x8_t __rev0_44;  __rev0_44 = __builtin_shufflevector(__s0_44, __s0_44, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_44 = __noswap_splat_lane_s8(__rev0_44, __p1_44); \
-  __ret_44 = __builtin_shufflevector(__ret_44, __ret_44, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_44; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_f32(__p0_45, __p1_45) __extension__ ({ \
-  float32x2_t __ret_45; \
-  float32x2_t __s0_45 = __p0_45; \
-  __ret_45 = splat_lane_f32(__s0_45, __p1_45); \
-  __ret_45; \
-})
-#else
-#define vdup_lane_f32(__p0_46, __p1_46) __extension__ ({ \
-  float32x2_t __ret_46; \
-  float32x2_t __s0_46 = __p0_46; \
-  float32x2_t __rev0_46;  __rev0_46 = __builtin_shufflevector(__s0_46, __s0_46, 1, 0); \
-  __ret_46 = __noswap_splat_lane_f32(__rev0_46, __p1_46); \
-  __ret_46 = __builtin_shufflevector(__ret_46, __ret_46, 1, 0); \
-  __ret_46; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_f16(__p0_47, __p1_47) __extension__ ({ \
-  float16x4_t __ret_47; \
-  float16x4_t __s0_47 = __p0_47; \
-  __ret_47 = splat_lane_f16(__s0_47, __p1_47); \
-  __ret_47; \
-})
-#else
-#define vdup_lane_f16(__p0_48, __p1_48) __extension__ ({ \
-  float16x4_t __ret_48; \
-  float16x4_t __s0_48 = __p0_48; \
-  float16x4_t __rev0_48;  __rev0_48 = __builtin_shufflevector(__s0_48, __s0_48, 3, 2, 1, 0); \
-  __ret_48 = __noswap_splat_lane_f16(__rev0_48, __p1_48); \
-  __ret_48 = __builtin_shufflevector(__ret_48, __ret_48, 3, 2, 1, 0); \
-  __ret_48; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_s32(__p0_49, __p1_49) __extension__ ({ \
-  int32x2_t __ret_49; \
-  int32x2_t __s0_49 = __p0_49; \
-  __ret_49 = splat_lane_s32(__s0_49, __p1_49); \
-  __ret_49; \
-})
-#else
-#define vdup_lane_s32(__p0_50, __p1_50) __extension__ ({ \
-  int32x2_t __ret_50; \
-  int32x2_t __s0_50 = __p0_50; \
-  int32x2_t __rev0_50;  __rev0_50 = __builtin_shufflevector(__s0_50, __s0_50, 1, 0); \
-  __ret_50 = __noswap_splat_lane_s32(__rev0_50, __p1_50); \
-  __ret_50 = __builtin_shufflevector(__ret_50, __ret_50, 1, 0); \
-  __ret_50; \
-})
-#endif
-
-#define vdup_lane_s64(__p0_51, __p1_51) __extension__ ({ \
-  int64x1_t __ret_51; \
-  int64x1_t __s0_51 = __p0_51; \
-  __ret_51 = splat_lane_s64(__s0_51, __p1_51); \
-  __ret_51; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vdup_lane_s16(__p0_52, __p1_52) __extension__ ({ \
-  int16x4_t __ret_52; \
-  int16x4_t __s0_52 = __p0_52; \
-  __ret_52 = splat_lane_s16(__s0_52, __p1_52); \
-  __ret_52; \
-})
-#else
-#define vdup_lane_s16(__p0_53, __p1_53) __extension__ ({ \
-  int16x4_t __ret_53; \
-  int16x4_t __s0_53 = __p0_53; \
-  int16x4_t __rev0_53;  __rev0_53 = __builtin_shufflevector(__s0_53, __s0_53, 3, 2, 1, 0); \
-  __ret_53 = __noswap_splat_lane_s16(__rev0_53, __p1_53); \
-  __ret_53 = __builtin_shufflevector(__ret_53, __ret_53, 3, 2, 1, 0); \
-  __ret_53; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vdup_n_p8(poly8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vdup_n_p8(poly8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vdup_n_p16(poly16_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vdup_n_p16(poly16_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vdupq_n_p8(poly8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vdupq_n_p8(poly8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vdupq_n_p16(poly16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vdupq_n_p16(poly16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vdupq_n_u8(uint8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vdupq_n_u8(uint8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vdupq_n_u32(uint32_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vdupq_n_u32(uint32_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vdupq_n_u64(uint64_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vdupq_n_u64(uint64_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vdupq_n_u16(uint16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vdupq_n_u16(uint16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vdupq_n_s8(int8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vdupq_n_s8(int8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vdupq_n_f32(float32_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vdupq_n_f32(float32_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_n_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
-  __ret; \
-})
-#else
-#define vdupq_n_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vdupq_n_s32(int32_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vdupq_n_s32(int32_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vdupq_n_s64(int64_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vdupq_n_s64(int64_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vdupq_n_s16(int16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vdupq_n_s16(int16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vdup_n_u8(uint8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vdup_n_u8(uint8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vdup_n_u32(uint32_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vdup_n_u32(uint32_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vdup_n_u64(uint64_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) {__p0};
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vdup_n_u16(uint16_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vdup_n_u16(uint16_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vdup_n_s8(int8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vdup_n_s8(int8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vdup_n_f32(float32_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vdup_n_f32(float32_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_n_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
-  __ret; \
-})
-#else
-#define vdup_n_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vdup_n_s32(int32_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vdup_n_s32(int32_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vdup_n_s64(int64_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) {__p0};
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vdup_n_s16(int16_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vdup_n_s16(int16_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t veorq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t veorq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t veorq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t veorq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t veorq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t veorq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t veorq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t veorq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t veorq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t veorq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t veorq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t veorq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t veorq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t veorq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t veorq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t veorq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t veor_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t veor_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t veor_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t veor_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t veor_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t veor_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t veor_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t veor_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t veor_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t veor_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t veor_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t veor_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t veor_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 ^ __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t veor_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 ^ __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  __ret = (poly8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
-  __ret; \
-})
-#else
-#define vext_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  __ret = (poly16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
-  __ret; \
-})
-#else
-#define vext_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (poly16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  __ret = (poly8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
-  __ret; \
-})
-#else
-#define vextq_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  __ret = (poly16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
-  __ret; \
-})
-#else
-#define vextq_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
-  __ret; \
-})
-#else
-#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
-  __ret; \
-})
-#else
-#define vextq_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vextq_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
-  __ret; \
-})
-#else
-#define vextq_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
-  __ret; \
-})
-#else
-#define vextq_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  __ret = (float32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 41); \
-  __ret; \
-})
-#else
-#define vextq_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vextq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
-  __ret; \
-})
-#else
-#define vextq_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vextq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
-  __ret; \
-})
-#else
-#define vext_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
-  __ret; \
-})
-#else
-#define vext_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vext_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vext_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
-  __ret; \
-})
-#else
-#define vext_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
-  __ret; \
-})
-#else
-#define vext_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  __ret = (float32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 9); \
-  __ret; \
-})
-#else
-#define vext_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vext_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vext_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vext_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vext_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 40); \
-  __ret; \
-})
-#else
-#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vext_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 8); \
-  __ret; \
-})
-#else
-#define vext_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vget_high_p8(poly8x16_t __p0) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vget_high_p8(poly8x16_t __p0) {
-  poly8x8_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t __noswap_vget_high_p8(poly8x16_t __p0) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vget_high_p16(poly16x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vget_high_p16(poly16x8_t __p0) {
-  poly16x4_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vget_high_u8(uint8x16_t __p0) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vget_high_u8(uint8x16_t __p0) {
-  uint8x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vget_high_u8(uint8x16_t __p0) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vget_high_u32(uint32x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vget_high_u32(uint32x4_t __p0) {
-  uint32x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vget_high_u32(uint32x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x1_t vget_high_u64(uint64x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x1_t vget_high_u64(uint64x2_t __p0) {
-  uint64x1_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vget_high_u16(uint16x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vget_high_u16(uint16x8_t __p0) {
-  uint16x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vget_high_u16(uint16x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vget_high_s8(int8x16_t __p0) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vget_high_s8(int8x16_t __p0) {
-  int8x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vget_high_s8(int8x16_t __p0) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vget_high_f32(float32x4_t __p0) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vget_high_f32(float32x4_t __p0) {
-  float32x2_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t __noswap_vget_high_f32(float32x4_t __p0) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vget_high_f16(float16x8_t __p0) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vget_high_f16(float16x8_t __p0) {
-  float16x4_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t __noswap_vget_high_f16(float16x8_t __p0) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vget_high_s32(int32x4_t __p0) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vget_high_s32(int32x4_t __p0) {
-  int32x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vget_high_s32(int32x4_t __p0) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x1_t vget_high_s64(int64x2_t __p0) {
-  int64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x1_t vget_high_s64(int64x2_t __p0) {
-  int64x1_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vget_high_s16(int16x8_t __p0) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vget_high_s16(int16x8_t __p0) {
-  int16x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vget_high_s16(int16x8_t __p0) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  __ret = (poly8_t) __builtin_neon_vget_lane_i8((poly8x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8_t) __builtin_neon_vget_lane_i8((poly8x8_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  __ret = (poly8_t) __builtin_neon_vget_lane_i8((poly8x8_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  __ret = (poly16_t) __builtin_neon_vget_lane_i16((poly16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (poly16_t) __builtin_neon_vget_lane_i16((poly16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  __ret = (poly16_t) __builtin_neon_vget_lane_i16((poly16x4_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((poly8x16_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((poly8x16_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((poly8x16_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((poly16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((poly16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((poly16x8_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int64x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int64x2_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int64x2_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int16x8_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((float32x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((float32x4_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((float32x4_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int64x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int64x2_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int64x2_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int16x8_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int32x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int32x2_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int32x2_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#define vget_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vget_lane_i64((int64x1_t)__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int16x4_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vget_lane_f32((float32x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float32_t) __builtin_neon_vget_lane_f32((float32x2_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vget_lane_f32((float32x2_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vget_lane_i32((int32x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vget_lane_i32((int32x2_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vget_lane_i32((int32x2_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#define vget_lane_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vget_lane_i64((int64x1_t)__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vget_lane_i16((int16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vget_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16_t) __builtin_neon_vget_lane_i16((int16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vget_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vget_lane_i16((int16x4_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vget_low_p8(poly8x16_t __p0) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vget_low_p8(poly8x16_t __p0) {
-  poly8x8_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vget_low_p16(poly16x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vget_low_p16(poly16x8_t __p0) {
-  poly16x4_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vget_low_u8(uint8x16_t __p0) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vget_low_u8(uint8x16_t __p0) {
-  uint8x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vget_low_u32(uint32x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vget_low_u32(uint32x4_t __p0) {
-  uint32x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x1_t vget_low_u64(uint64x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x1_t vget_low_u64(uint64x2_t __p0) {
-  uint64x1_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vget_low_u16(uint16x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vget_low_u16(uint16x8_t __p0) {
-  uint16x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vget_low_s8(int8x16_t __p0) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vget_low_s8(int8x16_t __p0) {
-  int8x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vget_low_f32(float32x4_t __p0) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vget_low_f32(float32x4_t __p0) {
-  float32x2_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vget_low_f16(float16x8_t __p0) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vget_low_f16(float16x8_t __p0) {
-  float16x4_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vget_low_s32(int32x4_t __p0) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vget_low_s32(int32x4_t __p0) {
-  int32x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x1_t vget_low_s64(int64x2_t __p0) {
-  int64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x1_t vget_low_s64(int64x2_t __p0) {
-  int64x1_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vget_low_s16(int16x8_t __p0) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vget_low_s16(int16x8_t __p0) {
-  int16x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vhadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vhadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vhadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vhadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vhadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vhadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vhsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vhsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vhsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vhsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vhsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vhsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vhsubq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vhsubq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vhsubq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vhsubq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vhsubq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vhsubq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vhsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vhsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vhsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vhsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vhsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vhsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vhsub_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vhsub_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vhsub_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vhsub_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p8(__p0) __extension__ ({ \
-  poly8x8_t __ret; \
-  __ret = (poly8x8_t) __builtin_neon_vld1_v(__p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_p8(__p0) __extension__ ({ \
-  poly8x8_t __ret; \
-  __ret = (poly8x8_t) __builtin_neon_vld1_v(__p0, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p16(__p0) __extension__ ({ \
-  poly16x4_t __ret; \
-  __ret = (poly16x4_t) __builtin_neon_vld1_v(__p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_p16(__p0) __extension__ ({ \
-  poly16x4_t __ret; \
-  __ret = (poly16x4_t) __builtin_neon_vld1_v(__p0, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p8(__p0) __extension__ ({ \
-  poly8x16_t __ret; \
-  __ret = (poly8x16_t) __builtin_neon_vld1q_v(__p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_p8(__p0) __extension__ ({ \
-  poly8x16_t __ret; \
-  __ret = (poly8x16_t) __builtin_neon_vld1q_v(__p0, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p16(__p0) __extension__ ({ \
-  poly16x8_t __ret; \
-  __ret = (poly16x8_t) __builtin_neon_vld1q_v(__p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_p16(__p0) __extension__ ({ \
-  poly16x8_t __ret; \
-  __ret = (poly16x8_t) __builtin_neon_vld1q_v(__p0, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u8(__p0) __extension__ ({ \
-  uint8x16_t __ret; \
-  __ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_u8(__p0) __extension__ ({ \
-  uint8x16_t __ret; \
-  __ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u32(__p0) __extension__ ({ \
-  uint32x4_t __ret; \
-  __ret = (uint32x4_t) __builtin_neon_vld1q_v(__p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_u32(__p0) __extension__ ({ \
-  uint32x4_t __ret; \
-  __ret = (uint32x4_t) __builtin_neon_vld1q_v(__p0, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u64(__p0) __extension__ ({ \
-  uint64x2_t __ret; \
-  __ret = (uint64x2_t) __builtin_neon_vld1q_v(__p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_u64(__p0) __extension__ ({ \
-  uint64x2_t __ret; \
-  __ret = (uint64x2_t) __builtin_neon_vld1q_v(__p0, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u16(__p0) __extension__ ({ \
-  uint16x8_t __ret; \
-  __ret = (uint16x8_t) __builtin_neon_vld1q_v(__p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_u16(__p0) __extension__ ({ \
-  uint16x8_t __ret; \
-  __ret = (uint16x8_t) __builtin_neon_vld1q_v(__p0, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s8(__p0) __extension__ ({ \
-  int8x16_t __ret; \
-  __ret = (int8x16_t) __builtin_neon_vld1q_v(__p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_s8(__p0) __extension__ ({ \
-  int8x16_t __ret; \
-  __ret = (int8x16_t) __builtin_neon_vld1q_v(__p0, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f32(__p0) __extension__ ({ \
-  float32x4_t __ret; \
-  __ret = (float32x4_t) __builtin_neon_vld1q_v(__p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_f32(__p0) __extension__ ({ \
-  float32x4_t __ret; \
-  __ret = (float32x4_t) __builtin_neon_vld1q_v(__p0, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s32(__p0) __extension__ ({ \
-  int32x4_t __ret; \
-  __ret = (int32x4_t) __builtin_neon_vld1q_v(__p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_s32(__p0) __extension__ ({ \
-  int32x4_t __ret; \
-  __ret = (int32x4_t) __builtin_neon_vld1q_v(__p0, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s64(__p0) __extension__ ({ \
-  int64x2_t __ret; \
-  __ret = (int64x2_t) __builtin_neon_vld1q_v(__p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_s64(__p0) __extension__ ({ \
-  int64x2_t __ret; \
-  __ret = (int64x2_t) __builtin_neon_vld1q_v(__p0, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s16(__p0) __extension__ ({ \
-  int16x8_t __ret; \
-  __ret = (int16x8_t) __builtin_neon_vld1q_v(__p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_s16(__p0) __extension__ ({ \
-  int16x8_t __ret; \
-  __ret = (int16x8_t) __builtin_neon_vld1q_v(__p0, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u8(__p0) __extension__ ({ \
-  uint8x8_t __ret; \
-  __ret = (uint8x8_t) __builtin_neon_vld1_v(__p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_u8(__p0) __extension__ ({ \
-  uint8x8_t __ret; \
-  __ret = (uint8x8_t) __builtin_neon_vld1_v(__p0, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u32(__p0) __extension__ ({ \
-  uint32x2_t __ret; \
-  __ret = (uint32x2_t) __builtin_neon_vld1_v(__p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_u32(__p0) __extension__ ({ \
-  uint32x2_t __ret; \
-  __ret = (uint32x2_t) __builtin_neon_vld1_v(__p0, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_u64(__p0) __extension__ ({ \
-  uint64x1_t __ret; \
-  __ret = (uint64x1_t) __builtin_neon_vld1_v(__p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u16(__p0) __extension__ ({ \
-  uint16x4_t __ret; \
-  __ret = (uint16x4_t) __builtin_neon_vld1_v(__p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_u16(__p0) __extension__ ({ \
-  uint16x4_t __ret; \
-  __ret = (uint16x4_t) __builtin_neon_vld1_v(__p0, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s8(__p0) __extension__ ({ \
-  int8x8_t __ret; \
-  __ret = (int8x8_t) __builtin_neon_vld1_v(__p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_s8(__p0) __extension__ ({ \
-  int8x8_t __ret; \
-  __ret = (int8x8_t) __builtin_neon_vld1_v(__p0, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f32(__p0) __extension__ ({ \
-  float32x2_t __ret; \
-  __ret = (float32x2_t) __builtin_neon_vld1_v(__p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_f32(__p0) __extension__ ({ \
-  float32x2_t __ret; \
-  __ret = (float32x2_t) __builtin_neon_vld1_v(__p0, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s32(__p0) __extension__ ({ \
-  int32x2_t __ret; \
-  __ret = (int32x2_t) __builtin_neon_vld1_v(__p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_s32(__p0) __extension__ ({ \
-  int32x2_t __ret; \
-  __ret = (int32x2_t) __builtin_neon_vld1_v(__p0, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_s64(__p0) __extension__ ({ \
-  int64x1_t __ret; \
-  __ret = (int64x1_t) __builtin_neon_vld1_v(__p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s16(__p0) __extension__ ({ \
-  int16x4_t __ret; \
-  __ret = (int16x4_t) __builtin_neon_vld1_v(__p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_s16(__p0) __extension__ ({ \
-  int16x4_t __ret; \
-  __ret = (int16x4_t) __builtin_neon_vld1_v(__p0, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_p8(__p0) __extension__ ({ \
-  poly8x8_t __ret; \
-  __ret = (poly8x8_t) __builtin_neon_vld1_dup_v(__p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_dup_p8(__p0) __extension__ ({ \
-  poly8x8_t __ret; \
-  __ret = (poly8x8_t) __builtin_neon_vld1_dup_v(__p0, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_p16(__p0) __extension__ ({ \
-  poly16x4_t __ret; \
-  __ret = (poly16x4_t) __builtin_neon_vld1_dup_v(__p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_dup_p16(__p0) __extension__ ({ \
-  poly16x4_t __ret; \
-  __ret = (poly16x4_t) __builtin_neon_vld1_dup_v(__p0, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_p8(__p0) __extension__ ({ \
-  poly8x16_t __ret; \
-  __ret = (poly8x16_t) __builtin_neon_vld1q_dup_v(__p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_dup_p8(__p0) __extension__ ({ \
-  poly8x16_t __ret; \
-  __ret = (poly8x16_t) __builtin_neon_vld1q_dup_v(__p0, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_p16(__p0) __extension__ ({ \
-  poly16x8_t __ret; \
-  __ret = (poly16x8_t) __builtin_neon_vld1q_dup_v(__p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_dup_p16(__p0) __extension__ ({ \
-  poly16x8_t __ret; \
-  __ret = (poly16x8_t) __builtin_neon_vld1q_dup_v(__p0, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_u8(__p0) __extension__ ({ \
-  uint8x16_t __ret; \
-  __ret = (uint8x16_t) __builtin_neon_vld1q_dup_v(__p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_dup_u8(__p0) __extension__ ({ \
-  uint8x16_t __ret; \
-  __ret = (uint8x16_t) __builtin_neon_vld1q_dup_v(__p0, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_u32(__p0) __extension__ ({ \
-  uint32x4_t __ret; \
-  __ret = (uint32x4_t) __builtin_neon_vld1q_dup_v(__p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_dup_u32(__p0) __extension__ ({ \
-  uint32x4_t __ret; \
-  __ret = (uint32x4_t) __builtin_neon_vld1q_dup_v(__p0, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_u64(__p0) __extension__ ({ \
-  uint64x2_t __ret; \
-  __ret = (uint64x2_t) __builtin_neon_vld1q_dup_v(__p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_dup_u64(__p0) __extension__ ({ \
-  uint64x2_t __ret; \
-  __ret = (uint64x2_t) __builtin_neon_vld1q_dup_v(__p0, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_u16(__p0) __extension__ ({ \
-  uint16x8_t __ret; \
-  __ret = (uint16x8_t) __builtin_neon_vld1q_dup_v(__p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_dup_u16(__p0) __extension__ ({ \
-  uint16x8_t __ret; \
-  __ret = (uint16x8_t) __builtin_neon_vld1q_dup_v(__p0, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_s8(__p0) __extension__ ({ \
-  int8x16_t __ret; \
-  __ret = (int8x16_t) __builtin_neon_vld1q_dup_v(__p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_dup_s8(__p0) __extension__ ({ \
-  int8x16_t __ret; \
-  __ret = (int8x16_t) __builtin_neon_vld1q_dup_v(__p0, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_f32(__p0) __extension__ ({ \
-  float32x4_t __ret; \
-  __ret = (float32x4_t) __builtin_neon_vld1q_dup_v(__p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_dup_f32(__p0) __extension__ ({ \
-  float32x4_t __ret; \
-  __ret = (float32x4_t) __builtin_neon_vld1q_dup_v(__p0, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_s32(__p0) __extension__ ({ \
-  int32x4_t __ret; \
-  __ret = (int32x4_t) __builtin_neon_vld1q_dup_v(__p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_dup_s32(__p0) __extension__ ({ \
-  int32x4_t __ret; \
-  __ret = (int32x4_t) __builtin_neon_vld1q_dup_v(__p0, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_s64(__p0) __extension__ ({ \
-  int64x2_t __ret; \
-  __ret = (int64x2_t) __builtin_neon_vld1q_dup_v(__p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_dup_s64(__p0) __extension__ ({ \
-  int64x2_t __ret; \
-  __ret = (int64x2_t) __builtin_neon_vld1q_dup_v(__p0, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_s16(__p0) __extension__ ({ \
-  int16x8_t __ret; \
-  __ret = (int16x8_t) __builtin_neon_vld1q_dup_v(__p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_dup_s16(__p0) __extension__ ({ \
-  int16x8_t __ret; \
-  __ret = (int16x8_t) __builtin_neon_vld1q_dup_v(__p0, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_u8(__p0) __extension__ ({ \
-  uint8x8_t __ret; \
-  __ret = (uint8x8_t) __builtin_neon_vld1_dup_v(__p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_dup_u8(__p0) __extension__ ({ \
-  uint8x8_t __ret; \
-  __ret = (uint8x8_t) __builtin_neon_vld1_dup_v(__p0, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_u32(__p0) __extension__ ({ \
-  uint32x2_t __ret; \
-  __ret = (uint32x2_t) __builtin_neon_vld1_dup_v(__p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_dup_u32(__p0) __extension__ ({ \
-  uint32x2_t __ret; \
-  __ret = (uint32x2_t) __builtin_neon_vld1_dup_v(__p0, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_dup_u64(__p0) __extension__ ({ \
-  uint64x1_t __ret; \
-  __ret = (uint64x1_t) __builtin_neon_vld1_dup_v(__p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_u16(__p0) __extension__ ({ \
-  uint16x4_t __ret; \
-  __ret = (uint16x4_t) __builtin_neon_vld1_dup_v(__p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_dup_u16(__p0) __extension__ ({ \
-  uint16x4_t __ret; \
-  __ret = (uint16x4_t) __builtin_neon_vld1_dup_v(__p0, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_s8(__p0) __extension__ ({ \
-  int8x8_t __ret; \
-  __ret = (int8x8_t) __builtin_neon_vld1_dup_v(__p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_dup_s8(__p0) __extension__ ({ \
-  int8x8_t __ret; \
-  __ret = (int8x8_t) __builtin_neon_vld1_dup_v(__p0, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_f32(__p0) __extension__ ({ \
-  float32x2_t __ret; \
-  __ret = (float32x2_t) __builtin_neon_vld1_dup_v(__p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_dup_f32(__p0) __extension__ ({ \
-  float32x2_t __ret; \
-  __ret = (float32x2_t) __builtin_neon_vld1_dup_v(__p0, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_s32(__p0) __extension__ ({ \
-  int32x2_t __ret; \
-  __ret = (int32x2_t) __builtin_neon_vld1_dup_v(__p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_dup_s32(__p0) __extension__ ({ \
-  int32x2_t __ret; \
-  __ret = (int32x2_t) __builtin_neon_vld1_dup_v(__p0, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_dup_s64(__p0) __extension__ ({ \
-  int64x1_t __ret; \
-  __ret = (int64x1_t) __builtin_neon_vld1_dup_v(__p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_s16(__p0) __extension__ ({ \
-  int16x4_t __ret; \
-  __ret = (int16x4_t) __builtin_neon_vld1_dup_v(__p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_dup_s16(__p0) __extension__ ({ \
-  int16x4_t __ret; \
-  __ret = (int16x4_t) __builtin_neon_vld1_dup_v(__p0, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s1 = __p1; \
-  __ret = (poly8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 4); \
-  __ret; \
-})
-#else
-#define vld1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s1 = __p1; \
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s1 = __p1; \
-  __ret = (poly16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 5); \
-  __ret; \
-})
-#else
-#define vld1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s1 = __p1; \
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (poly16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s1 = __p1; \
-  __ret = (poly8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 36); \
-  __ret; \
-})
-#else
-#define vld1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s1 = __p1; \
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s1 = __p1; \
-  __ret = (poly16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 37); \
-  __ret; \
-})
-#else
-#define vld1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s1 = __p1; \
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 48); \
-  __ret; \
-})
-#else
-#define vld1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 50); \
-  __ret; \
-})
-#else
-#define vld1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vld1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 49); \
-  __ret; \
-})
-#else
-#define vld1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 32); \
-  __ret; \
-})
-#else
-#define vld1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s1 = __p1; \
-  __ret = (float32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 41); \
-  __ret; \
-})
-#else
-#define vld1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vld1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 35); \
-  __ret; \
-})
-#else
-#define vld1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vld1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 16); \
-  __ret; \
-})
-#else
-#define vld1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 18); \
-  __ret; \
-})
-#else
-#define vld1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 17); \
-  __ret; \
-})
-#else
-#define vld1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 0); \
-  __ret; \
-})
-#else
-#define vld1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s1 = __p1; \
-  __ret = (float32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 9); \
-  __ret; \
-})
-#else
-#define vld1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vld1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vld1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p8_x2(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_p8_x2(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p16_x2(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_p16_x2(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p8_x2(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_p8_x2(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p16_x2(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_p16_x2(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u8_x2(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_u8_x2(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u32_x2(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_u32_x2(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u64_x2(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_u64_x2(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u16_x2(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_u16_x2(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s8_x2(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_s8_x2(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f32_x2(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_f32_x2(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s32_x2(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_s32_x2(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s64_x2(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_s64_x2(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s16_x2(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_s16_x2(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u8_x2(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_u8_x2(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u32_x2(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_u32_x2(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_u64_x2(__p0) __extension__ ({ \
-  uint64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u16_x2(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_u16_x2(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s8_x2(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_s8_x2(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f32_x2(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_f32_x2(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s32_x2(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_s32_x2(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_s64_x2(__p0) __extension__ ({ \
-  int64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s16_x2(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_s16_x2(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p8_x3(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_p8_x3(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p16_x3(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_p16_x3(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p8_x3(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_p8_x3(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p16_x3(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_p16_x3(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u8_x3(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_u8_x3(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u32_x3(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_u32_x3(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u64_x3(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_u64_x3(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u16_x3(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_u16_x3(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s8_x3(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_s8_x3(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f32_x3(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_f32_x3(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s32_x3(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_s32_x3(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s64_x3(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_s64_x3(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s16_x3(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_s16_x3(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u8_x3(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_u8_x3(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u32_x3(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_u32_x3(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_u64_x3(__p0) __extension__ ({ \
-  uint64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u16_x3(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_u16_x3(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s8_x3(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_s8_x3(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f32_x3(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_f32_x3(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s32_x3(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_s32_x3(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_s64_x3(__p0) __extension__ ({ \
-  int64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s16_x3(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_s16_x3(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p8_x4(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld1_p8_x4(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_p16_x4(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld1_p16_x4(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p8_x4(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld1q_p8_x4(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p16_x4(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld1q_p16_x4(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u8_x4(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld1q_u8_x4(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u32_x4(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld1q_u32_x4(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u64_x4(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld1q_u64_x4(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_u16_x4(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld1q_u16_x4(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s8_x4(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld1q_s8_x4(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f32_x4(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld1q_f32_x4(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s32_x4(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld1q_s32_x4(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s64_x4(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld1q_s64_x4(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_s16_x4(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld1q_s16_x4(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u8_x4(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld1_u8_x4(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u32_x4(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld1_u32_x4(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_u64_x4(__p0) __extension__ ({ \
-  uint64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_u16_x4(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld1_u16_x4(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s8_x4(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld1_s8_x4(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f32_x4(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld1_f32_x4(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s32_x4(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld1_s32_x4(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_s64_x4(__p0) __extension__ ({ \
-  int64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1_s16_x4(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld1_s16_x4(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_p8(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld2_p8(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_p16(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld2_p16(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_p8(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld2q_p8(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_p16(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld2q_p16(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_u8(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld2q_u8(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_u32(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld2q_u32(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_u16(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld2q_u16(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_s8(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld2q_s8(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_f32(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld2q_f32(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_s32(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld2q_s32(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_s16(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld2q_s16(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_u8(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld2_u8(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_u32(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld2_u32(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld2_u64(__p0) __extension__ ({ \
-  uint64x1x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld2_u16(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld2_u16(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_s8(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld2_s8(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_f32(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld2_f32(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_s32(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld2_s32(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld2_s64(__p0) __extension__ ({ \
-  int64x1x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld2_s16(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld2_s16(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_p8(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld2_dup_p8(__p0) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_p16(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld2_dup_p16(__p0) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld2q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld2q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld2q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s8(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s8(__p0) __extension__ ({ \
-  int8x16x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_f32(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld2q_dup_f32(__p0) __extension__ ({ \
-  float32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s32(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s32(__p0) __extension__ ({ \
-  int32x4x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s64(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s64(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_s16(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld2q_dup_s16(__p0) __extension__ ({ \
-  int16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_u8(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld2_dup_u8(__p0) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_u32(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld2_dup_u32(__p0) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld2_dup_u64(__p0) __extension__ ({ \
-  uint64x1x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_u16(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld2_dup_u16(__p0) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_s8(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld2_dup_s8(__p0) __extension__ ({ \
-  int8x8x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_f32(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld2_dup_f32(__p0) __extension__ ({ \
-  float32x2x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_s32(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld2_dup_s32(__p0) __extension__ ({ \
-  int32x2x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld2_dup_s64(__p0) __extension__ ({ \
-  int64x1x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_s16(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld2_dup_s16(__p0) __extension__ ({ \
-  int16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  poly8x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 4); \
-  __ret; \
-})
-#else
-#define vld2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x2_t __ret; \
-  poly8x8x2_t __s1 = __p1; \
-  poly8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  poly16x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 5); \
-  __ret; \
-})
-#else
-#define vld2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x2_t __ret; \
-  poly16x4x2_t __s1 = __p1; \
-  poly16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  poly16x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 37); \
-  __ret; \
-})
-#else
-#define vld2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x2_t __ret; \
-  poly16x8x2_t __s1 = __p1; \
-  poly16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  uint32x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 50); \
-  __ret; \
-})
-#else
-#define vld2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x2_t __ret; \
-  uint32x4x2_t __s1 = __p1; \
-  uint32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  uint16x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 49); \
-  __ret; \
-})
-#else
-#define vld2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x2_t __ret; \
-  uint16x8x2_t __s1 = __p1; \
-  uint16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x2_t __ret; \
-  float32x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 41); \
-  __ret; \
-})
-#else
-#define vld2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x2_t __ret; \
-  float32x4x2_t __s1 = __p1; \
-  float32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x2_t __ret; \
-  int32x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 34); \
-  __ret; \
-})
-#else
-#define vld2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x2_t __ret; \
-  int32x4x2_t __s1 = __p1; \
-  int32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x2_t __ret; \
-  int16x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 33); \
-  __ret; \
-})
-#else
-#define vld2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x2_t __ret; \
-  int16x8x2_t __s1 = __p1; \
-  int16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  uint8x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 16); \
-  __ret; \
-})
-#else
-#define vld2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x2_t __ret; \
-  uint8x8x2_t __s1 = __p1; \
-  uint8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  uint32x2x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 18); \
-  __ret; \
-})
-#else
-#define vld2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x2_t __ret; \
-  uint32x2x2_t __s1 = __p1; \
-  uint32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  uint16x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 17); \
-  __ret; \
-})
-#else
-#define vld2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x2_t __ret; \
-  uint16x4x2_t __s1 = __p1; \
-  uint16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x2_t __ret; \
-  int8x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 0); \
-  __ret; \
-})
-#else
-#define vld2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x2_t __ret; \
-  int8x8x2_t __s1 = __p1; \
-  int8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x2_t __ret; \
-  float32x2x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 9); \
-  __ret; \
-})
-#else
-#define vld2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x2_t __ret; \
-  float32x2x2_t __s1 = __p1; \
-  float32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x2_t __ret; \
-  int32x2x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 2); \
-  __ret; \
-})
-#else
-#define vld2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x2_t __ret; \
-  int32x2x2_t __s1 = __p1; \
-  int32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x2_t __ret; \
-  int16x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 1); \
-  __ret; \
-})
-#else
-#define vld2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x2_t __ret; \
-  int16x4x2_t __s1 = __p1; \
-  int16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_p8(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld3_p8(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_p16(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld3_p16(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_p8(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld3q_p8(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_p16(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld3q_p16(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_u8(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld3q_u8(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_u32(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld3q_u32(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_u16(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld3q_u16(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_s8(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld3q_s8(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_f32(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld3q_f32(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_s32(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld3q_s32(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_s16(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld3q_s16(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_u8(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld3_u8(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_u32(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld3_u32(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld3_u64(__p0) __extension__ ({ \
-  uint64x1x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld3_u16(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld3_u16(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_s8(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld3_s8(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_f32(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld3_f32(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_s32(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld3_s32(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld3_s64(__p0) __extension__ ({ \
-  int64x1x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld3_s16(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld3_s16(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_p8(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld3_dup_p8(__p0) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_p16(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld3_dup_p16(__p0) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld3q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld3q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld3q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s8(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s8(__p0) __extension__ ({ \
-  int8x16x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_f32(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld3q_dup_f32(__p0) __extension__ ({ \
-  float32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s32(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s32(__p0) __extension__ ({ \
-  int32x4x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s64(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s64(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_s16(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld3q_dup_s16(__p0) __extension__ ({ \
-  int16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_u8(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld3_dup_u8(__p0) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_u32(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld3_dup_u32(__p0) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld3_dup_u64(__p0) __extension__ ({ \
-  uint64x1x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_u16(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld3_dup_u16(__p0) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_s8(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld3_dup_s8(__p0) __extension__ ({ \
-  int8x8x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_f32(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld3_dup_f32(__p0) __extension__ ({ \
-  float32x2x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_s32(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld3_dup_s32(__p0) __extension__ ({ \
-  int32x2x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld3_dup_s64(__p0) __extension__ ({ \
-  int64x1x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_s16(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld3_dup_s16(__p0) __extension__ ({ \
-  int16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  poly8x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 4); \
-  __ret; \
-})
-#else
-#define vld3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x3_t __ret; \
-  poly8x8x3_t __s1 = __p1; \
-  poly8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  poly16x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 5); \
-  __ret; \
-})
-#else
-#define vld3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x3_t __ret; \
-  poly16x4x3_t __s1 = __p1; \
-  poly16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  poly16x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 37); \
-  __ret; \
-})
-#else
-#define vld3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x3_t __ret; \
-  poly16x8x3_t __s1 = __p1; \
-  poly16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  uint32x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 50); \
-  __ret; \
-})
-#else
-#define vld3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x3_t __ret; \
-  uint32x4x3_t __s1 = __p1; \
-  uint32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  uint16x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 49); \
-  __ret; \
-})
-#else
-#define vld3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x3_t __ret; \
-  uint16x8x3_t __s1 = __p1; \
-  uint16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x3_t __ret; \
-  float32x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 41); \
-  __ret; \
-})
-#else
-#define vld3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x3_t __ret; \
-  float32x4x3_t __s1 = __p1; \
-  float32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x3_t __ret; \
-  int32x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 34); \
-  __ret; \
-})
-#else
-#define vld3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x3_t __ret; \
-  int32x4x3_t __s1 = __p1; \
-  int32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x3_t __ret; \
-  int16x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 33); \
-  __ret; \
-})
-#else
-#define vld3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x3_t __ret; \
-  int16x8x3_t __s1 = __p1; \
-  int16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  uint8x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 16); \
-  __ret; \
-})
-#else
-#define vld3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x3_t __ret; \
-  uint8x8x3_t __s1 = __p1; \
-  uint8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  uint32x2x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 18); \
-  __ret; \
-})
-#else
-#define vld3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x3_t __ret; \
-  uint32x2x3_t __s1 = __p1; \
-  uint32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  uint16x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 17); \
-  __ret; \
-})
-#else
-#define vld3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x3_t __ret; \
-  uint16x4x3_t __s1 = __p1; \
-  uint16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x3_t __ret; \
-  int8x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 0); \
-  __ret; \
-})
-#else
-#define vld3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x3_t __ret; \
-  int8x8x3_t __s1 = __p1; \
-  int8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x3_t __ret; \
-  float32x2x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 9); \
-  __ret; \
-})
-#else
-#define vld3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x3_t __ret; \
-  float32x2x3_t __s1 = __p1; \
-  float32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x3_t __ret; \
-  int32x2x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 2); \
-  __ret; \
-})
-#else
-#define vld3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x3_t __ret; \
-  int32x2x3_t __s1 = __p1; \
-  int32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x3_t __ret; \
-  int16x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 1); \
-  __ret; \
-})
-#else
-#define vld3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x3_t __ret; \
-  int16x4x3_t __s1 = __p1; \
-  int16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_p8(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld4_p8(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_p16(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld4_p16(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_p8(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld4q_p8(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_p16(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld4q_p16(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_u8(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld4q_u8(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_u32(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld4q_u32(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_u16(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld4q_u16(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_s8(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld4q_s8(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_f32(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld4q_f32(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_s32(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld4q_s32(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_s16(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld4q_s16(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_u8(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld4_u8(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_u32(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld4_u32(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld4_u64(__p0) __extension__ ({ \
-  uint64x1x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld4_u16(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld4_u16(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_s8(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld4_s8(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_f32(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld4_f32(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_s32(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld4_s32(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld4_s64(__p0) __extension__ ({ \
-  int64x1x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld4_s16(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld4_s16(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_p8(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 4); \
-  __ret; \
-})
-#else
-#define vld4_dup_p8(__p0) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_p16(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 5); \
-  __ret; \
-})
-#else
-#define vld4_dup_p16(__p0) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
-  __ret; \
-})
-#else
-#define vld4q_dup_p8(__p0) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
-  __ret; \
-})
-#else
-#define vld4q_dup_p16(__p0) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u8(__p0) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u32(__p0) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u64(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
-  __ret; \
-})
-#else
-#define vld4q_dup_u16(__p0) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s8(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s8(__p0) __extension__ ({ \
-  int8x16x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_f32(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
-  __ret; \
-})
-#else
-#define vld4q_dup_f32(__p0) __extension__ ({ \
-  float32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s32(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s32(__p0) __extension__ ({ \
-  int32x4x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s64(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s64(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_s16(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
-  __ret; \
-})
-#else
-#define vld4q_dup_s16(__p0) __extension__ ({ \
-  int16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_u8(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \
-  __ret; \
-})
-#else
-#define vld4_dup_u8(__p0) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_u32(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 18); \
-  __ret; \
-})
-#else
-#define vld4_dup_u32(__p0) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld4_dup_u64(__p0) __extension__ ({ \
-  uint64x1x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_u16(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 17); \
-  __ret; \
-})
-#else
-#define vld4_dup_u16(__p0) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_s8(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 0); \
-  __ret; \
-})
-#else
-#define vld4_dup_s8(__p0) __extension__ ({ \
-  int8x8x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_f32(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 9); \
-  __ret; \
-})
-#else
-#define vld4_dup_f32(__p0) __extension__ ({ \
-  float32x2x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_s32(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 2); \
-  __ret; \
-})
-#else
-#define vld4_dup_s32(__p0) __extension__ ({ \
-  int32x2x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld4_dup_s64(__p0) __extension__ ({ \
-  int64x1x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_s16(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 1); \
-  __ret; \
-})
-#else
-#define vld4_dup_s16(__p0) __extension__ ({ \
-  int16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  poly8x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 4); \
-  __ret; \
-})
-#else
-#define vld4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x4_t __ret; \
-  poly8x8x4_t __s1 = __p1; \
-  poly8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 4); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  poly16x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 5); \
-  __ret; \
-})
-#else
-#define vld4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x4_t __ret; \
-  poly16x4x4_t __s1 = __p1; \
-  poly16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 5); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  poly16x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 37); \
-  __ret; \
-})
-#else
-#define vld4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x4_t __ret; \
-  poly16x8x4_t __s1 = __p1; \
-  poly16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 37); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  uint32x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 50); \
-  __ret; \
-})
-#else
-#define vld4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x4_t __ret; \
-  uint32x4x4_t __s1 = __p1; \
-  uint32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 50); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  uint16x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 49); \
-  __ret; \
-})
-#else
-#define vld4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x4_t __ret; \
-  uint16x8x4_t __s1 = __p1; \
-  uint16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 49); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x4_t __ret; \
-  float32x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 41); \
-  __ret; \
-})
-#else
-#define vld4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x4_t __ret; \
-  float32x4x4_t __s1 = __p1; \
-  float32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 41); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x4_t __ret; \
-  int32x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 34); \
-  __ret; \
-})
-#else
-#define vld4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x4_t __ret; \
-  int32x4x4_t __s1 = __p1; \
-  int32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 34); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x4_t __ret; \
-  int16x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 33); \
-  __ret; \
-})
-#else
-#define vld4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x4_t __ret; \
-  int16x8x4_t __s1 = __p1; \
-  int16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 33); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  uint8x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 16); \
-  __ret; \
-})
-#else
-#define vld4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x4_t __ret; \
-  uint8x8x4_t __s1 = __p1; \
-  uint8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 16); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  uint32x2x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 18); \
-  __ret; \
-})
-#else
-#define vld4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x4_t __ret; \
-  uint32x2x4_t __s1 = __p1; \
-  uint32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 18); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  uint16x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 17); \
-  __ret; \
-})
-#else
-#define vld4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x4_t __ret; \
-  uint16x4x4_t __s1 = __p1; \
-  uint16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 17); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x4_t __ret; \
-  int8x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 0); \
-  __ret; \
-})
-#else
-#define vld4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x4_t __ret; \
-  int8x8x4_t __s1 = __p1; \
-  int8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 0); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x4_t __ret; \
-  float32x2x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 9); \
-  __ret; \
-})
-#else
-#define vld4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x4_t __ret; \
-  float32x2x4_t __s1 = __p1; \
-  float32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 9); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x4_t __ret; \
-  int32x2x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 2); \
-  __ret; \
-})
-#else
-#define vld4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x4_t __ret; \
-  int32x2x4_t __s1 = __p1; \
-  int32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 2); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x4_t __ret; \
-  int16x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 1); \
-  __ret; \
-})
-#else
-#define vld4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x4_t __ret; \
-  int16x4x4_t __s1 = __p1; \
-  int16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 1); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmax_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmax_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmax_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmax_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmax_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmax_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmax_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmax_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vminq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vminq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vminq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vminq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vminq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vminq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vminq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vminq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmin_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmin_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmin_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmin_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmin_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmin_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmin_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmin_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vmlaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vmlaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vmlaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vmlaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmla_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmla_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmla_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmla_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmla_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmla_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmla_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmla_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmla_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmla_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmla_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmla_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_lane_u32(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \
-  uint32x4_t __ret_54; \
-  uint32x4_t __s0_54 = __p0_54; \
-  uint32x4_t __s1_54 = __p1_54; \
-  uint32x2_t __s2_54 = __p2_54; \
-  __ret_54 = __s0_54 + __s1_54 * splatq_lane_u32(__s2_54, __p3_54); \
-  __ret_54; \
-})
-#else
-#define vmlaq_lane_u32(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \
-  uint32x4_t __ret_55; \
-  uint32x4_t __s0_55 = __p0_55; \
-  uint32x4_t __s1_55 = __p1_55; \
-  uint32x2_t __s2_55 = __p2_55; \
-  uint32x4_t __rev0_55;  __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 3, 2, 1, 0); \
-  uint32x4_t __rev1_55;  __rev1_55 = __builtin_shufflevector(__s1_55, __s1_55, 3, 2, 1, 0); \
-  uint32x2_t __rev2_55;  __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 1, 0); \
-  __ret_55 = __rev0_55 + __rev1_55 * __noswap_splatq_lane_u32(__rev2_55, __p3_55); \
-  __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 3, 2, 1, 0); \
-  __ret_55; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_lane_u16(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \
-  uint16x8_t __ret_56; \
-  uint16x8_t __s0_56 = __p0_56; \
-  uint16x8_t __s1_56 = __p1_56; \
-  uint16x4_t __s2_56 = __p2_56; \
-  __ret_56 = __s0_56 + __s1_56 * splatq_lane_u16(__s2_56, __p3_56); \
-  __ret_56; \
-})
-#else
-#define vmlaq_lane_u16(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \
-  uint16x8_t __ret_57; \
-  uint16x8_t __s0_57 = __p0_57; \
-  uint16x8_t __s1_57 = __p1_57; \
-  uint16x4_t __s2_57 = __p2_57; \
-  uint16x8_t __rev0_57;  __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_57;  __rev1_57 = __builtin_shufflevector(__s1_57, __s1_57, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev2_57;  __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 3, 2, 1, 0); \
-  __ret_57 = __rev0_57 + __rev1_57 * __noswap_splatq_lane_u16(__rev2_57, __p3_57); \
-  __ret_57 = __builtin_shufflevector(__ret_57, __ret_57, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_57; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_lane_f32(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \
-  float32x4_t __ret_58; \
-  float32x4_t __s0_58 = __p0_58; \
-  float32x4_t __s1_58 = __p1_58; \
-  float32x2_t __s2_58 = __p2_58; \
-  __ret_58 = __s0_58 + __s1_58 * splatq_lane_f32(__s2_58, __p3_58); \
-  __ret_58; \
-})
-#else
-#define vmlaq_lane_f32(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \
-  float32x4_t __ret_59; \
-  float32x4_t __s0_59 = __p0_59; \
-  float32x4_t __s1_59 = __p1_59; \
-  float32x2_t __s2_59 = __p2_59; \
-  float32x4_t __rev0_59;  __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 3, 2, 1, 0); \
-  float32x4_t __rev1_59;  __rev1_59 = __builtin_shufflevector(__s1_59, __s1_59, 3, 2, 1, 0); \
-  float32x2_t __rev2_59;  __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 1, 0); \
-  __ret_59 = __rev0_59 + __rev1_59 * __noswap_splatq_lane_f32(__rev2_59, __p3_59); \
-  __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 3, 2, 1, 0); \
-  __ret_59; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_lane_s32(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \
-  int32x4_t __ret_60; \
-  int32x4_t __s0_60 = __p0_60; \
-  int32x4_t __s1_60 = __p1_60; \
-  int32x2_t __s2_60 = __p2_60; \
-  __ret_60 = __s0_60 + __s1_60 * splatq_lane_s32(__s2_60, __p3_60); \
-  __ret_60; \
-})
-#else
-#define vmlaq_lane_s32(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \
-  int32x4_t __ret_61; \
-  int32x4_t __s0_61 = __p0_61; \
-  int32x4_t __s1_61 = __p1_61; \
-  int32x2_t __s2_61 = __p2_61; \
-  int32x4_t __rev0_61;  __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 3, 2, 1, 0); \
-  int32x4_t __rev1_61;  __rev1_61 = __builtin_shufflevector(__s1_61, __s1_61, 3, 2, 1, 0); \
-  int32x2_t __rev2_61;  __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 1, 0); \
-  __ret_61 = __rev0_61 + __rev1_61 * __noswap_splatq_lane_s32(__rev2_61, __p3_61); \
-  __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 3, 2, 1, 0); \
-  __ret_61; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_lane_s16(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \
-  int16x8_t __ret_62; \
-  int16x8_t __s0_62 = __p0_62; \
-  int16x8_t __s1_62 = __p1_62; \
-  int16x4_t __s2_62 = __p2_62; \
-  __ret_62 = __s0_62 + __s1_62 * splatq_lane_s16(__s2_62, __p3_62); \
-  __ret_62; \
-})
-#else
-#define vmlaq_lane_s16(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \
-  int16x8_t __ret_63; \
-  int16x8_t __s0_63 = __p0_63; \
-  int16x8_t __s1_63 = __p1_63; \
-  int16x4_t __s2_63 = __p2_63; \
-  int16x8_t __rev0_63;  __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_63;  __rev1_63 = __builtin_shufflevector(__s1_63, __s1_63, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_63;  __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 3, 2, 1, 0); \
-  __ret_63 = __rev0_63 + __rev1_63 * __noswap_splatq_lane_s16(__rev2_63, __p3_63); \
-  __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_63; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_lane_u32(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \
-  uint32x2_t __ret_64; \
-  uint32x2_t __s0_64 = __p0_64; \
-  uint32x2_t __s1_64 = __p1_64; \
-  uint32x2_t __s2_64 = __p2_64; \
-  __ret_64 = __s0_64 + __s1_64 * splat_lane_u32(__s2_64, __p3_64); \
-  __ret_64; \
-})
-#else
-#define vmla_lane_u32(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \
-  uint32x2_t __ret_65; \
-  uint32x2_t __s0_65 = __p0_65; \
-  uint32x2_t __s1_65 = __p1_65; \
-  uint32x2_t __s2_65 = __p2_65; \
-  uint32x2_t __rev0_65;  __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 1, 0); \
-  uint32x2_t __rev1_65;  __rev1_65 = __builtin_shufflevector(__s1_65, __s1_65, 1, 0); \
-  uint32x2_t __rev2_65;  __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 1, 0); \
-  __ret_65 = __rev0_65 + __rev1_65 * __noswap_splat_lane_u32(__rev2_65, __p3_65); \
-  __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 1, 0); \
-  __ret_65; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_lane_u16(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \
-  uint16x4_t __ret_66; \
-  uint16x4_t __s0_66 = __p0_66; \
-  uint16x4_t __s1_66 = __p1_66; \
-  uint16x4_t __s2_66 = __p2_66; \
-  __ret_66 = __s0_66 + __s1_66 * splat_lane_u16(__s2_66, __p3_66); \
-  __ret_66; \
-})
-#else
-#define vmla_lane_u16(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \
-  uint16x4_t __ret_67; \
-  uint16x4_t __s0_67 = __p0_67; \
-  uint16x4_t __s1_67 = __p1_67; \
-  uint16x4_t __s2_67 = __p2_67; \
-  uint16x4_t __rev0_67;  __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 3, 2, 1, 0); \
-  uint16x4_t __rev1_67;  __rev1_67 = __builtin_shufflevector(__s1_67, __s1_67, 3, 2, 1, 0); \
-  uint16x4_t __rev2_67;  __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 3, 2, 1, 0); \
-  __ret_67 = __rev0_67 + __rev1_67 * __noswap_splat_lane_u16(__rev2_67, __p3_67); \
-  __ret_67 = __builtin_shufflevector(__ret_67, __ret_67, 3, 2, 1, 0); \
-  __ret_67; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_lane_f32(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \
-  float32x2_t __ret_68; \
-  float32x2_t __s0_68 = __p0_68; \
-  float32x2_t __s1_68 = __p1_68; \
-  float32x2_t __s2_68 = __p2_68; \
-  __ret_68 = __s0_68 + __s1_68 * splat_lane_f32(__s2_68, __p3_68); \
-  __ret_68; \
-})
-#else
-#define vmla_lane_f32(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \
-  float32x2_t __ret_69; \
-  float32x2_t __s0_69 = __p0_69; \
-  float32x2_t __s1_69 = __p1_69; \
-  float32x2_t __s2_69 = __p2_69; \
-  float32x2_t __rev0_69;  __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 1, 0); \
-  float32x2_t __rev1_69;  __rev1_69 = __builtin_shufflevector(__s1_69, __s1_69, 1, 0); \
-  float32x2_t __rev2_69;  __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 1, 0); \
-  __ret_69 = __rev0_69 + __rev1_69 * __noswap_splat_lane_f32(__rev2_69, __p3_69); \
-  __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 1, 0); \
-  __ret_69; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_lane_s32(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \
-  int32x2_t __ret_70; \
-  int32x2_t __s0_70 = __p0_70; \
-  int32x2_t __s1_70 = __p1_70; \
-  int32x2_t __s2_70 = __p2_70; \
-  __ret_70 = __s0_70 + __s1_70 * splat_lane_s32(__s2_70, __p3_70); \
-  __ret_70; \
-})
-#else
-#define vmla_lane_s32(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \
-  int32x2_t __ret_71; \
-  int32x2_t __s0_71 = __p0_71; \
-  int32x2_t __s1_71 = __p1_71; \
-  int32x2_t __s2_71 = __p2_71; \
-  int32x2_t __rev0_71;  __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 1, 0); \
-  int32x2_t __rev1_71;  __rev1_71 = __builtin_shufflevector(__s1_71, __s1_71, 1, 0); \
-  int32x2_t __rev2_71;  __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 1, 0); \
-  __ret_71 = __rev0_71 + __rev1_71 * __noswap_splat_lane_s32(__rev2_71, __p3_71); \
-  __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 1, 0); \
-  __ret_71; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_lane_s16(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \
-  int16x4_t __ret_72; \
-  int16x4_t __s0_72 = __p0_72; \
-  int16x4_t __s1_72 = __p1_72; \
-  int16x4_t __s2_72 = __p2_72; \
-  __ret_72 = __s0_72 + __s1_72 * splat_lane_s16(__s2_72, __p3_72); \
-  __ret_72; \
-})
-#else
-#define vmla_lane_s16(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \
-  int16x4_t __ret_73; \
-  int16x4_t __s0_73 = __p0_73; \
-  int16x4_t __s1_73 = __p1_73; \
-  int16x4_t __s2_73 = __p2_73; \
-  int16x4_t __rev0_73;  __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 3, 2, 1, 0); \
-  int16x4_t __rev1_73;  __rev1_73 = __builtin_shufflevector(__s1_73, __s1_73, 3, 2, 1, 0); \
-  int16x4_t __rev2_73;  __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \
-  __ret_73 = __rev0_73 + __rev1_73 * __noswap_splat_lane_s16(__rev2_73, __p3_73); \
-  __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 3, 2, 1, 0); \
-  __ret_73; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlaq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + __p1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlaq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlaq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 + __p1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlaq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmlaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  __ret = __p0 + __p1 * (float32x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmlaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * (float32x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlaq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + __p1 * (int32x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlaq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * (int32x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlaq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 + __p1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlaq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmla_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint32x2_t __ret;
-  __ret = __p0 + __p1 * (uint32x2_t) {__p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmla_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1 * (uint32x2_t) {__p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmla_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint16x4_t __ret;
-  __ret = __p0 + __p1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmla_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmla_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  __ret = __p0 + __p1 * (float32x2_t) {__p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmla_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1 * (float32x2_t) {__p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmla_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int32x2_t __ret;
-  __ret = __p0 + __p1 * (int32x2_t) {__p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmla_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1 * (int32x2_t) {__p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmla_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int16x4_t __ret;
-  __ret = __p0 + __p1 * (int16x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmla_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __rev1 * (int16x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vmlsq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vmlsq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlsq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlsq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlsq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlsq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vmlsq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vmlsq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmlsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmlsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlsq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlsq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlsq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlsq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmls_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmls_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmls_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmls_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmls_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmls_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmls_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmls_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmls_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmls_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmls_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmls_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmls_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmls_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_lane_u32(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \
-  uint32x4_t __ret_74; \
-  uint32x4_t __s0_74 = __p0_74; \
-  uint32x4_t __s1_74 = __p1_74; \
-  uint32x2_t __s2_74 = __p2_74; \
-  __ret_74 = __s0_74 - __s1_74 * splatq_lane_u32(__s2_74, __p3_74); \
-  __ret_74; \
-})
-#else
-#define vmlsq_lane_u32(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \
-  uint32x4_t __ret_75; \
-  uint32x4_t __s0_75 = __p0_75; \
-  uint32x4_t __s1_75 = __p1_75; \
-  uint32x2_t __s2_75 = __p2_75; \
-  uint32x4_t __rev0_75;  __rev0_75 = __builtin_shufflevector(__s0_75, __s0_75, 3, 2, 1, 0); \
-  uint32x4_t __rev1_75;  __rev1_75 = __builtin_shufflevector(__s1_75, __s1_75, 3, 2, 1, 0); \
-  uint32x2_t __rev2_75;  __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \
-  __ret_75 = __rev0_75 - __rev1_75 * __noswap_splatq_lane_u32(__rev2_75, __p3_75); \
-  __ret_75 = __builtin_shufflevector(__ret_75, __ret_75, 3, 2, 1, 0); \
-  __ret_75; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_lane_u16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \
-  uint16x8_t __ret_76; \
-  uint16x8_t __s0_76 = __p0_76; \
-  uint16x8_t __s1_76 = __p1_76; \
-  uint16x4_t __s2_76 = __p2_76; \
-  __ret_76 = __s0_76 - __s1_76 * splatq_lane_u16(__s2_76, __p3_76); \
-  __ret_76; \
-})
-#else
-#define vmlsq_lane_u16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \
-  uint16x8_t __ret_77; \
-  uint16x8_t __s0_77 = __p0_77; \
-  uint16x8_t __s1_77 = __p1_77; \
-  uint16x4_t __s2_77 = __p2_77; \
-  uint16x8_t __rev0_77;  __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_77;  __rev1_77 = __builtin_shufflevector(__s1_77, __s1_77, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev2_77;  __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 3, 2, 1, 0); \
-  __ret_77 = __rev0_77 - __rev1_77 * __noswap_splatq_lane_u16(__rev2_77, __p3_77); \
-  __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_77; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_lane_f32(__p0_78, __p1_78, __p2_78, __p3_78) __extension__ ({ \
-  float32x4_t __ret_78; \
-  float32x4_t __s0_78 = __p0_78; \
-  float32x4_t __s1_78 = __p1_78; \
-  float32x2_t __s2_78 = __p2_78; \
-  __ret_78 = __s0_78 - __s1_78 * splatq_lane_f32(__s2_78, __p3_78); \
-  __ret_78; \
-})
-#else
-#define vmlsq_lane_f32(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \
-  float32x4_t __ret_79; \
-  float32x4_t __s0_79 = __p0_79; \
-  float32x4_t __s1_79 = __p1_79; \
-  float32x2_t __s2_79 = __p2_79; \
-  float32x4_t __rev0_79;  __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 3, 2, 1, 0); \
-  float32x4_t __rev1_79;  __rev1_79 = __builtin_shufflevector(__s1_79, __s1_79, 3, 2, 1, 0); \
-  float32x2_t __rev2_79;  __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 1, 0); \
-  __ret_79 = __rev0_79 - __rev1_79 * __noswap_splatq_lane_f32(__rev2_79, __p3_79); \
-  __ret_79 = __builtin_shufflevector(__ret_79, __ret_79, 3, 2, 1, 0); \
-  __ret_79; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_lane_s32(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \
-  int32x4_t __ret_80; \
-  int32x4_t __s0_80 = __p0_80; \
-  int32x4_t __s1_80 = __p1_80; \
-  int32x2_t __s2_80 = __p2_80; \
-  __ret_80 = __s0_80 - __s1_80 * splatq_lane_s32(__s2_80, __p3_80); \
-  __ret_80; \
-})
-#else
-#define vmlsq_lane_s32(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \
-  int32x4_t __ret_81; \
-  int32x4_t __s0_81 = __p0_81; \
-  int32x4_t __s1_81 = __p1_81; \
-  int32x2_t __s2_81 = __p2_81; \
-  int32x4_t __rev0_81;  __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 3, 2, 1, 0); \
-  int32x4_t __rev1_81;  __rev1_81 = __builtin_shufflevector(__s1_81, __s1_81, 3, 2, 1, 0); \
-  int32x2_t __rev2_81;  __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 1, 0); \
-  __ret_81 = __rev0_81 - __rev1_81 * __noswap_splatq_lane_s32(__rev2_81, __p3_81); \
-  __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 3, 2, 1, 0); \
-  __ret_81; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_lane_s16(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \
-  int16x8_t __ret_82; \
-  int16x8_t __s0_82 = __p0_82; \
-  int16x8_t __s1_82 = __p1_82; \
-  int16x4_t __s2_82 = __p2_82; \
-  __ret_82 = __s0_82 - __s1_82 * splatq_lane_s16(__s2_82, __p3_82); \
-  __ret_82; \
-})
-#else
-#define vmlsq_lane_s16(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \
-  int16x8_t __ret_83; \
-  int16x8_t __s0_83 = __p0_83; \
-  int16x8_t __s1_83 = __p1_83; \
-  int16x4_t __s2_83 = __p2_83; \
-  int16x8_t __rev0_83;  __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_83;  __rev1_83 = __builtin_shufflevector(__s1_83, __s1_83, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_83;  __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 3, 2, 1, 0); \
-  __ret_83 = __rev0_83 - __rev1_83 * __noswap_splatq_lane_s16(__rev2_83, __p3_83); \
-  __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_83; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_lane_u32(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \
-  uint32x2_t __ret_84; \
-  uint32x2_t __s0_84 = __p0_84; \
-  uint32x2_t __s1_84 = __p1_84; \
-  uint32x2_t __s2_84 = __p2_84; \
-  __ret_84 = __s0_84 - __s1_84 * splat_lane_u32(__s2_84, __p3_84); \
-  __ret_84; \
-})
-#else
-#define vmls_lane_u32(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \
-  uint32x2_t __ret_85; \
-  uint32x2_t __s0_85 = __p0_85; \
-  uint32x2_t __s1_85 = __p1_85; \
-  uint32x2_t __s2_85 = __p2_85; \
-  uint32x2_t __rev0_85;  __rev0_85 = __builtin_shufflevector(__s0_85, __s0_85, 1, 0); \
-  uint32x2_t __rev1_85;  __rev1_85 = __builtin_shufflevector(__s1_85, __s1_85, 1, 0); \
-  uint32x2_t __rev2_85;  __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 1, 0); \
-  __ret_85 = __rev0_85 - __rev1_85 * __noswap_splat_lane_u32(__rev2_85, __p3_85); \
-  __ret_85 = __builtin_shufflevector(__ret_85, __ret_85, 1, 0); \
-  __ret_85; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_lane_u16(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \
-  uint16x4_t __ret_86; \
-  uint16x4_t __s0_86 = __p0_86; \
-  uint16x4_t __s1_86 = __p1_86; \
-  uint16x4_t __s2_86 = __p2_86; \
-  __ret_86 = __s0_86 - __s1_86 * splat_lane_u16(__s2_86, __p3_86); \
-  __ret_86; \
-})
-#else
-#define vmls_lane_u16(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \
-  uint16x4_t __ret_87; \
-  uint16x4_t __s0_87 = __p0_87; \
-  uint16x4_t __s1_87 = __p1_87; \
-  uint16x4_t __s2_87 = __p2_87; \
-  uint16x4_t __rev0_87;  __rev0_87 = __builtin_shufflevector(__s0_87, __s0_87, 3, 2, 1, 0); \
-  uint16x4_t __rev1_87;  __rev1_87 = __builtin_shufflevector(__s1_87, __s1_87, 3, 2, 1, 0); \
-  uint16x4_t __rev2_87;  __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 3, 2, 1, 0); \
-  __ret_87 = __rev0_87 - __rev1_87 * __noswap_splat_lane_u16(__rev2_87, __p3_87); \
-  __ret_87 = __builtin_shufflevector(__ret_87, __ret_87, 3, 2, 1, 0); \
-  __ret_87; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_lane_f32(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \
-  float32x2_t __ret_88; \
-  float32x2_t __s0_88 = __p0_88; \
-  float32x2_t __s1_88 = __p1_88; \
-  float32x2_t __s2_88 = __p2_88; \
-  __ret_88 = __s0_88 - __s1_88 * splat_lane_f32(__s2_88, __p3_88); \
-  __ret_88; \
-})
-#else
-#define vmls_lane_f32(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \
-  float32x2_t __ret_89; \
-  float32x2_t __s0_89 = __p0_89; \
-  float32x2_t __s1_89 = __p1_89; \
-  float32x2_t __s2_89 = __p2_89; \
-  float32x2_t __rev0_89;  __rev0_89 = __builtin_shufflevector(__s0_89, __s0_89, 1, 0); \
-  float32x2_t __rev1_89;  __rev1_89 = __builtin_shufflevector(__s1_89, __s1_89, 1, 0); \
-  float32x2_t __rev2_89;  __rev2_89 = __builtin_shufflevector(__s2_89, __s2_89, 1, 0); \
-  __ret_89 = __rev0_89 - __rev1_89 * __noswap_splat_lane_f32(__rev2_89, __p3_89); \
-  __ret_89 = __builtin_shufflevector(__ret_89, __ret_89, 1, 0); \
-  __ret_89; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_lane_s32(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \
-  int32x2_t __ret_90; \
-  int32x2_t __s0_90 = __p0_90; \
-  int32x2_t __s1_90 = __p1_90; \
-  int32x2_t __s2_90 = __p2_90; \
-  __ret_90 = __s0_90 - __s1_90 * splat_lane_s32(__s2_90, __p3_90); \
-  __ret_90; \
-})
-#else
-#define vmls_lane_s32(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \
-  int32x2_t __ret_91; \
-  int32x2_t __s0_91 = __p0_91; \
-  int32x2_t __s1_91 = __p1_91; \
-  int32x2_t __s2_91 = __p2_91; \
-  int32x2_t __rev0_91;  __rev0_91 = __builtin_shufflevector(__s0_91, __s0_91, 1, 0); \
-  int32x2_t __rev1_91;  __rev1_91 = __builtin_shufflevector(__s1_91, __s1_91, 1, 0); \
-  int32x2_t __rev2_91;  __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 1, 0); \
-  __ret_91 = __rev0_91 - __rev1_91 * __noswap_splat_lane_s32(__rev2_91, __p3_91); \
-  __ret_91 = __builtin_shufflevector(__ret_91, __ret_91, 1, 0); \
-  __ret_91; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_lane_s16(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \
-  int16x4_t __ret_92; \
-  int16x4_t __s0_92 = __p0_92; \
-  int16x4_t __s1_92 = __p1_92; \
-  int16x4_t __s2_92 = __p2_92; \
-  __ret_92 = __s0_92 - __s1_92 * splat_lane_s16(__s2_92, __p3_92); \
-  __ret_92; \
-})
-#else
-#define vmls_lane_s16(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \
-  int16x4_t __ret_93; \
-  int16x4_t __s0_93 = __p0_93; \
-  int16x4_t __s1_93 = __p1_93; \
-  int16x4_t __s2_93 = __p2_93; \
-  int16x4_t __rev0_93;  __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 3, 2, 1, 0); \
-  int16x4_t __rev1_93;  __rev1_93 = __builtin_shufflevector(__s1_93, __s1_93, 3, 2, 1, 0); \
-  int16x4_t __rev2_93;  __rev2_93 = __builtin_shufflevector(__s2_93, __s2_93, 3, 2, 1, 0); \
-  __ret_93 = __rev0_93 - __rev1_93 * __noswap_splat_lane_s16(__rev2_93, __p3_93); \
-  __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 3, 2, 1, 0); \
-  __ret_93; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlsq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 - __p1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlsq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlsq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 - __p1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlsq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmlsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  __ret = __p0 - __p1 * (float32x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmlsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * (float32x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlsq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 - __p1 * (int32x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlsq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * (int32x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlsq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 - __p1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlsq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmls_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint32x2_t __ret;
-  __ret = __p0 - __p1 * (uint32x2_t) {__p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmls_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1 * (uint32x2_t) {__p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmls_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint16x4_t __ret;
-  __ret = __p0 - __p1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmls_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmls_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  __ret = __p0 - __p1 * (float32x2_t) {__p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmls_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1 * (float32x2_t) {__p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmls_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int32x2_t __ret;
-  __ret = __p0 - __p1 * (int32x2_t) {__p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmls_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1 * (int32x2_t) {__p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmls_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int16x4_t __ret;
-  __ret = __p0 - __p1 * (int16x4_t) {__p2, __p2, __p2, __p2};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmls_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1 * (int16x4_t) {__p2, __p2, __p2, __p2};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vmov_n_p8(poly8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vmov_n_p8(poly8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vmov_n_p16(poly16_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vmov_n_p16(poly16_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vmovq_n_p8(poly8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vmovq_n_p8(poly8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vmovq_n_p16(poly16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vmovq_n_p16(poly16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vmovq_n_u8(uint8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vmovq_n_u8(uint8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmovq_n_u32(uint32_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmovq_n_u32(uint32_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmovq_n_u64(uint64_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmovq_n_u64(uint64_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmovq_n_u16(uint16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmovq_n_u16(uint16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vmovq_n_s8(int8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vmovq_n_s8(int8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmovq_n_f32(float32_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmovq_n_f32(float32_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmovq_n_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
-  __ret; \
-})
-#else
-#define vmovq_n_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmovq_n_s32(int32_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmovq_n_s32(int32_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmovq_n_s64(int64_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmovq_n_s64(int64_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmovq_n_s16(int16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmovq_n_s16(int16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmov_n_u8(uint8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmov_n_u8(uint8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmov_n_u32(uint32_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmov_n_u32(uint32_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vmov_n_u64(uint64_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) {__p0};
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmov_n_u16(uint16_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmov_n_u16(uint16_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmov_n_s8(int8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmov_n_s8(int8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmov_n_f32(float32_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmov_n_f32(float32_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmov_n_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
-  __ret; \
-})
-#else
-#define vmov_n_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  float16_t __s0 = __p0; \
-  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmov_n_s32(int32_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmov_n_s32(int32_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vmov_n_s64(int64_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) {__p0};
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmov_n_s16(int16_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmov_n_s16(int16_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmovl_u8(uint8x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmovl_u8(uint8x8_t __p0) {
-  uint16x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vmovl_u8(uint8x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 49);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmovl_u32(uint32x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmovl_u32(uint32x2_t __p0) {
-  uint64x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmovl_u32(uint32x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 51);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmovl_u16(uint16x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmovl_u16(uint16x4_t __p0) {
-  uint32x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmovl_u16(uint16x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 50);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmovl_s8(int8x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmovl_s8(int8x8_t __p0) {
-  int16x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vmovl_s8(int8x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 33);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmovl_s32(int32x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmovl_s32(int32x2_t __p0) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmovl_s32(int32x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 35);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmovl_s16(int16x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmovl_s16(int16x4_t __p0) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmovl_s16(int16x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmovn_u32(uint32x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmovn_u32(uint32x4_t __p0) {
-  uint16x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vmovn_u32(uint32x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmovn_u64(uint64x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmovn_u64(uint64x2_t __p0) {
-  uint32x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vmovn_u64(uint64x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmovn_u16(uint16x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmovn_u16(uint16x8_t __p0) {
-  uint8x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vmovn_u16(uint16x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmovn_s32(int32x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmovn_s32(int32x4_t __p0) {
-  int16x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vmovn_s32(int32x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmovn_s64(int64x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmovn_s64(int64x2_t __p0) {
-  int32x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vmovn_s64(int64x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmovn_s16(int16x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmovn_s16(int16x8_t __p0) {
-  int8x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vmovn_s16(int16x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vmulq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vmulq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmulq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmulq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmulq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmulq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vmulq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vmulq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmulq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmulq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmulq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmulq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmulq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmulq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmul_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmul_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmul_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmul_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmul_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmul_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmul_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmul_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmul_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmul_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmul_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmul_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmul_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmul_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vmul_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vmul_v((int8x8_t)__p0, (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vmul_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vmul_v((int8x8_t)__rev0, (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vmulq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vmulq_v((int8x16_t)__p0, (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vmulq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vmulq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_lane_u32(__p0_94, __p1_94, __p2_94) __extension__ ({ \
-  uint32x4_t __ret_94; \
-  uint32x4_t __s0_94 = __p0_94; \
-  uint32x2_t __s1_94 = __p1_94; \
-  __ret_94 = __s0_94 * splatq_lane_u32(__s1_94, __p2_94); \
-  __ret_94; \
-})
-#else
-#define vmulq_lane_u32(__p0_95, __p1_95, __p2_95) __extension__ ({ \
-  uint32x4_t __ret_95; \
-  uint32x4_t __s0_95 = __p0_95; \
-  uint32x2_t __s1_95 = __p1_95; \
-  uint32x4_t __rev0_95;  __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 3, 2, 1, 0); \
-  uint32x2_t __rev1_95;  __rev1_95 = __builtin_shufflevector(__s1_95, __s1_95, 1, 0); \
-  __ret_95 = __rev0_95 * __noswap_splatq_lane_u32(__rev1_95, __p2_95); \
-  __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 3, 2, 1, 0); \
-  __ret_95; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_lane_u16(__p0_96, __p1_96, __p2_96) __extension__ ({ \
-  uint16x8_t __ret_96; \
-  uint16x8_t __s0_96 = __p0_96; \
-  uint16x4_t __s1_96 = __p1_96; \
-  __ret_96 = __s0_96 * splatq_lane_u16(__s1_96, __p2_96); \
-  __ret_96; \
-})
-#else
-#define vmulq_lane_u16(__p0_97, __p1_97, __p2_97) __extension__ ({ \
-  uint16x8_t __ret_97; \
-  uint16x8_t __s0_97 = __p0_97; \
-  uint16x4_t __s1_97 = __p1_97; \
-  uint16x8_t __rev0_97;  __rev0_97 = __builtin_shufflevector(__s0_97, __s0_97, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev1_97;  __rev1_97 = __builtin_shufflevector(__s1_97, __s1_97, 3, 2, 1, 0); \
-  __ret_97 = __rev0_97 * __noswap_splatq_lane_u16(__rev1_97, __p2_97); \
-  __ret_97 = __builtin_shufflevector(__ret_97, __ret_97, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_97; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_lane_f32(__p0_98, __p1_98, __p2_98) __extension__ ({ \
-  float32x4_t __ret_98; \
-  float32x4_t __s0_98 = __p0_98; \
-  float32x2_t __s1_98 = __p1_98; \
-  __ret_98 = __s0_98 * splatq_lane_f32(__s1_98, __p2_98); \
-  __ret_98; \
-})
-#else
-#define vmulq_lane_f32(__p0_99, __p1_99, __p2_99) __extension__ ({ \
-  float32x4_t __ret_99; \
-  float32x4_t __s0_99 = __p0_99; \
-  float32x2_t __s1_99 = __p1_99; \
-  float32x4_t __rev0_99;  __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 3, 2, 1, 0); \
-  float32x2_t __rev1_99;  __rev1_99 = __builtin_shufflevector(__s1_99, __s1_99, 1, 0); \
-  __ret_99 = __rev0_99 * __noswap_splatq_lane_f32(__rev1_99, __p2_99); \
-  __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 3, 2, 1, 0); \
-  __ret_99; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_lane_s32(__p0_100, __p1_100, __p2_100) __extension__ ({ \
-  int32x4_t __ret_100; \
-  int32x4_t __s0_100 = __p0_100; \
-  int32x2_t __s1_100 = __p1_100; \
-  __ret_100 = __s0_100 * splatq_lane_s32(__s1_100, __p2_100); \
-  __ret_100; \
-})
-#else
-#define vmulq_lane_s32(__p0_101, __p1_101, __p2_101) __extension__ ({ \
-  int32x4_t __ret_101; \
-  int32x4_t __s0_101 = __p0_101; \
-  int32x2_t __s1_101 = __p1_101; \
-  int32x4_t __rev0_101;  __rev0_101 = __builtin_shufflevector(__s0_101, __s0_101, 3, 2, 1, 0); \
-  int32x2_t __rev1_101;  __rev1_101 = __builtin_shufflevector(__s1_101, __s1_101, 1, 0); \
-  __ret_101 = __rev0_101 * __noswap_splatq_lane_s32(__rev1_101, __p2_101); \
-  __ret_101 = __builtin_shufflevector(__ret_101, __ret_101, 3, 2, 1, 0); \
-  __ret_101; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_lane_s16(__p0_102, __p1_102, __p2_102) __extension__ ({ \
-  int16x8_t __ret_102; \
-  int16x8_t __s0_102 = __p0_102; \
-  int16x4_t __s1_102 = __p1_102; \
-  __ret_102 = __s0_102 * splatq_lane_s16(__s1_102, __p2_102); \
-  __ret_102; \
-})
-#else
-#define vmulq_lane_s16(__p0_103, __p1_103, __p2_103) __extension__ ({ \
-  int16x8_t __ret_103; \
-  int16x8_t __s0_103 = __p0_103; \
-  int16x4_t __s1_103 = __p1_103; \
-  int16x8_t __rev0_103;  __rev0_103 = __builtin_shufflevector(__s0_103, __s0_103, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev1_103;  __rev1_103 = __builtin_shufflevector(__s1_103, __s1_103, 3, 2, 1, 0); \
-  __ret_103 = __rev0_103 * __noswap_splatq_lane_s16(__rev1_103, __p2_103); \
-  __ret_103 = __builtin_shufflevector(__ret_103, __ret_103, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_103; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_lane_u32(__p0_104, __p1_104, __p2_104) __extension__ ({ \
-  uint32x2_t __ret_104; \
-  uint32x2_t __s0_104 = __p0_104; \
-  uint32x2_t __s1_104 = __p1_104; \
-  __ret_104 = __s0_104 * splat_lane_u32(__s1_104, __p2_104); \
-  __ret_104; \
-})
-#else
-#define vmul_lane_u32(__p0_105, __p1_105, __p2_105) __extension__ ({ \
-  uint32x2_t __ret_105; \
-  uint32x2_t __s0_105 = __p0_105; \
-  uint32x2_t __s1_105 = __p1_105; \
-  uint32x2_t __rev0_105;  __rev0_105 = __builtin_shufflevector(__s0_105, __s0_105, 1, 0); \
-  uint32x2_t __rev1_105;  __rev1_105 = __builtin_shufflevector(__s1_105, __s1_105, 1, 0); \
-  __ret_105 = __rev0_105 * __noswap_splat_lane_u32(__rev1_105, __p2_105); \
-  __ret_105 = __builtin_shufflevector(__ret_105, __ret_105, 1, 0); \
-  __ret_105; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_lane_u16(__p0_106, __p1_106, __p2_106) __extension__ ({ \
-  uint16x4_t __ret_106; \
-  uint16x4_t __s0_106 = __p0_106; \
-  uint16x4_t __s1_106 = __p1_106; \
-  __ret_106 = __s0_106 * splat_lane_u16(__s1_106, __p2_106); \
-  __ret_106; \
-})
-#else
-#define vmul_lane_u16(__p0_107, __p1_107, __p2_107) __extension__ ({ \
-  uint16x4_t __ret_107; \
-  uint16x4_t __s0_107 = __p0_107; \
-  uint16x4_t __s1_107 = __p1_107; \
-  uint16x4_t __rev0_107;  __rev0_107 = __builtin_shufflevector(__s0_107, __s0_107, 3, 2, 1, 0); \
-  uint16x4_t __rev1_107;  __rev1_107 = __builtin_shufflevector(__s1_107, __s1_107, 3, 2, 1, 0); \
-  __ret_107 = __rev0_107 * __noswap_splat_lane_u16(__rev1_107, __p2_107); \
-  __ret_107 = __builtin_shufflevector(__ret_107, __ret_107, 3, 2, 1, 0); \
-  __ret_107; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_lane_f32(__p0_108, __p1_108, __p2_108) __extension__ ({ \
-  float32x2_t __ret_108; \
-  float32x2_t __s0_108 = __p0_108; \
-  float32x2_t __s1_108 = __p1_108; \
-  __ret_108 = __s0_108 * splat_lane_f32(__s1_108, __p2_108); \
-  __ret_108; \
-})
-#else
-#define vmul_lane_f32(__p0_109, __p1_109, __p2_109) __extension__ ({ \
-  float32x2_t __ret_109; \
-  float32x2_t __s0_109 = __p0_109; \
-  float32x2_t __s1_109 = __p1_109; \
-  float32x2_t __rev0_109;  __rev0_109 = __builtin_shufflevector(__s0_109, __s0_109, 1, 0); \
-  float32x2_t __rev1_109;  __rev1_109 = __builtin_shufflevector(__s1_109, __s1_109, 1, 0); \
-  __ret_109 = __rev0_109 * __noswap_splat_lane_f32(__rev1_109, __p2_109); \
-  __ret_109 = __builtin_shufflevector(__ret_109, __ret_109, 1, 0); \
-  __ret_109; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_lane_s32(__p0_110, __p1_110, __p2_110) __extension__ ({ \
-  int32x2_t __ret_110; \
-  int32x2_t __s0_110 = __p0_110; \
-  int32x2_t __s1_110 = __p1_110; \
-  __ret_110 = __s0_110 * splat_lane_s32(__s1_110, __p2_110); \
-  __ret_110; \
-})
-#else
-#define vmul_lane_s32(__p0_111, __p1_111, __p2_111) __extension__ ({ \
-  int32x2_t __ret_111; \
-  int32x2_t __s0_111 = __p0_111; \
-  int32x2_t __s1_111 = __p1_111; \
-  int32x2_t __rev0_111;  __rev0_111 = __builtin_shufflevector(__s0_111, __s0_111, 1, 0); \
-  int32x2_t __rev1_111;  __rev1_111 = __builtin_shufflevector(__s1_111, __s1_111, 1, 0); \
-  __ret_111 = __rev0_111 * __noswap_splat_lane_s32(__rev1_111, __p2_111); \
-  __ret_111 = __builtin_shufflevector(__ret_111, __ret_111, 1, 0); \
-  __ret_111; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_lane_s16(__p0_112, __p1_112, __p2_112) __extension__ ({ \
-  int16x4_t __ret_112; \
-  int16x4_t __s0_112 = __p0_112; \
-  int16x4_t __s1_112 = __p1_112; \
-  __ret_112 = __s0_112 * splat_lane_s16(__s1_112, __p2_112); \
-  __ret_112; \
-})
-#else
-#define vmul_lane_s16(__p0_113, __p1_113, __p2_113) __extension__ ({ \
-  int16x4_t __ret_113; \
-  int16x4_t __s0_113 = __p0_113; \
-  int16x4_t __s1_113 = __p1_113; \
-  int16x4_t __rev0_113;  __rev0_113 = __builtin_shufflevector(__s0_113, __s0_113, 3, 2, 1, 0); \
-  int16x4_t __rev1_113;  __rev1_113 = __builtin_shufflevector(__s1_113, __s1_113, 3, 2, 1, 0); \
-  __ret_113 = __rev0_113 * __noswap_splat_lane_s16(__rev1_113, __p2_113); \
-  __ret_113 = __builtin_shufflevector(__ret_113, __ret_113, 3, 2, 1, 0); \
-  __ret_113; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmulq_n_u32(uint32x4_t __p0, uint32_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 * (uint32x4_t) {__p1, __p1, __p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmulq_n_u32(uint32x4_t __p0, uint32_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __rev0 * (uint32x4_t) {__p1, __p1, __p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmulq_n_u16(uint16x8_t __p0, uint16_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 * (uint16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmulq_n_u16(uint16x8_t __p0, uint16_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * (uint16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmulq_n_f32(float32x4_t __p0, float32_t __p1) {
-  float32x4_t __ret;
-  __ret = __p0 * (float32x4_t) {__p1, __p1, __p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmulq_n_f32(float32x4_t __p0, float32_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __rev0 * (float32x4_t) {__p1, __p1, __p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmulq_n_s32(int32x4_t __p0, int32_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 * (int32x4_t) {__p1, __p1, __p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmulq_n_s32(int32x4_t __p0, int32_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __rev0 * (int32x4_t) {__p1, __p1, __p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmulq_n_s16(int16x8_t __p0, int16_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 * (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmulq_n_s16(int16x8_t __p0, int16_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 * (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmul_n_u32(uint32x2_t __p0, uint32_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 * (uint32x2_t) {__p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmul_n_u32(uint32x2_t __p0, uint32_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __rev0 * (uint32x2_t) {__p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmul_n_u16(uint16x4_t __p0, uint16_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 * (uint16x4_t) {__p1, __p1, __p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmul_n_u16(uint16x4_t __p0, uint16_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __rev0 * (uint16x4_t) {__p1, __p1, __p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmul_n_f32(float32x2_t __p0, float32_t __p1) {
-  float32x2_t __ret;
-  __ret = __p0 * (float32x2_t) {__p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmul_n_f32(float32x2_t __p0, float32_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __rev0 * (float32x2_t) {__p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmul_n_s32(int32x2_t __p0, int32_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 * (int32x2_t) {__p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmul_n_s32(int32x2_t __p0, int32_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __rev0 * (int32x2_t) {__p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmul_n_s16(int16x4_t __p0, int16_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 * (int16x4_t) {__p1, __p1, __p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmul_n_s16(int16x4_t __p0, int16_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __rev0 * (int16x4_t) {__p1, __p1, __p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 37);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly16x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 37);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t __noswap_vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 37);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 49);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 51);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 50);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmull_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmull_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vmull_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 33);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmull_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmull_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmull_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmull_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmull_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmull_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_lane_u32(__p0_114, __p1_114, __p2_114) __extension__ ({ \
-  uint64x2_t __ret_114; \
-  uint32x2_t __s0_114 = __p0_114; \
-  uint32x2_t __s1_114 = __p1_114; \
-  __ret_114 = vmull_u32(__s0_114, splat_lane_u32(__s1_114, __p2_114)); \
-  __ret_114; \
-})
-#else
-#define vmull_lane_u32(__p0_115, __p1_115, __p2_115) __extension__ ({ \
-  uint64x2_t __ret_115; \
-  uint32x2_t __s0_115 = __p0_115; \
-  uint32x2_t __s1_115 = __p1_115; \
-  uint32x2_t __rev0_115;  __rev0_115 = __builtin_shufflevector(__s0_115, __s0_115, 1, 0); \
-  uint32x2_t __rev1_115;  __rev1_115 = __builtin_shufflevector(__s1_115, __s1_115, 1, 0); \
-  __ret_115 = __noswap_vmull_u32(__rev0_115, __noswap_splat_lane_u32(__rev1_115, __p2_115)); \
-  __ret_115 = __builtin_shufflevector(__ret_115, __ret_115, 1, 0); \
-  __ret_115; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_lane_u16(__p0_116, __p1_116, __p2_116) __extension__ ({ \
-  uint32x4_t __ret_116; \
-  uint16x4_t __s0_116 = __p0_116; \
-  uint16x4_t __s1_116 = __p1_116; \
-  __ret_116 = vmull_u16(__s0_116, splat_lane_u16(__s1_116, __p2_116)); \
-  __ret_116; \
-})
-#else
-#define vmull_lane_u16(__p0_117, __p1_117, __p2_117) __extension__ ({ \
-  uint32x4_t __ret_117; \
-  uint16x4_t __s0_117 = __p0_117; \
-  uint16x4_t __s1_117 = __p1_117; \
-  uint16x4_t __rev0_117;  __rev0_117 = __builtin_shufflevector(__s0_117, __s0_117, 3, 2, 1, 0); \
-  uint16x4_t __rev1_117;  __rev1_117 = __builtin_shufflevector(__s1_117, __s1_117, 3, 2, 1, 0); \
-  __ret_117 = __noswap_vmull_u16(__rev0_117, __noswap_splat_lane_u16(__rev1_117, __p2_117)); \
-  __ret_117 = __builtin_shufflevector(__ret_117, __ret_117, 3, 2, 1, 0); \
-  __ret_117; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_lane_s32(__p0_118, __p1_118, __p2_118) __extension__ ({ \
-  int64x2_t __ret_118; \
-  int32x2_t __s0_118 = __p0_118; \
-  int32x2_t __s1_118 = __p1_118; \
-  __ret_118 = vmull_s32(__s0_118, splat_lane_s32(__s1_118, __p2_118)); \
-  __ret_118; \
-})
-#else
-#define vmull_lane_s32(__p0_119, __p1_119, __p2_119) __extension__ ({ \
-  int64x2_t __ret_119; \
-  int32x2_t __s0_119 = __p0_119; \
-  int32x2_t __s1_119 = __p1_119; \
-  int32x2_t __rev0_119;  __rev0_119 = __builtin_shufflevector(__s0_119, __s0_119, 1, 0); \
-  int32x2_t __rev1_119;  __rev1_119 = __builtin_shufflevector(__s1_119, __s1_119, 1, 0); \
-  __ret_119 = __noswap_vmull_s32(__rev0_119, __noswap_splat_lane_s32(__rev1_119, __p2_119)); \
-  __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 1, 0); \
-  __ret_119; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_lane_s16(__p0_120, __p1_120, __p2_120) __extension__ ({ \
-  int32x4_t __ret_120; \
-  int16x4_t __s0_120 = __p0_120; \
-  int16x4_t __s1_120 = __p1_120; \
-  __ret_120 = vmull_s16(__s0_120, splat_lane_s16(__s1_120, __p2_120)); \
-  __ret_120; \
-})
-#else
-#define vmull_lane_s16(__p0_121, __p1_121, __p2_121) __extension__ ({ \
-  int32x4_t __ret_121; \
-  int16x4_t __s0_121 = __p0_121; \
-  int16x4_t __s1_121 = __p1_121; \
-  int16x4_t __rev0_121;  __rev0_121 = __builtin_shufflevector(__s0_121, __s0_121, 3, 2, 1, 0); \
-  int16x4_t __rev1_121;  __rev1_121 = __builtin_shufflevector(__s1_121, __s1_121, 3, 2, 1, 0); \
-  __ret_121 = __noswap_vmull_s16(__rev0_121, __noswap_splat_lane_s16(__rev1_121, __p2_121)); \
-  __ret_121 = __builtin_shufflevector(__ret_121, __ret_121, 3, 2, 1, 0); \
-  __ret_121; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
-  uint64x2_t __ret;
-  __ret = vmull_u32(__p0, (uint32x2_t) {__p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
-  uint64x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __noswap_vmull_u32(__rev0, (uint32x2_t) {__p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
-  uint64x2_t __ret;
-  __ret = __noswap_vmull_u32(__p0, (uint32x2_t) {__p1, __p1});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
-  uint32x4_t __ret;
-  __ret = vmull_u16(__p0, (uint16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
-  uint32x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vmull_u16(__rev0, (uint16x4_t) {__p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
-  uint32x4_t __ret;
-  __ret = __noswap_vmull_u16(__p0, (uint16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmull_n_s32(int32x2_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  __ret = vmull_s32(__p0, (int32x2_t) {__p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmull_n_s32(int32x2_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __noswap_vmull_s32(__rev0, (int32x2_t) {__p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmull_n_s32(int32x2_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  __ret = __noswap_vmull_s32(__p0, (int32x2_t) {__p1, __p1});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmull_n_s16(int16x4_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  __ret = vmull_s16(__p0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmull_n_s16(int16x4_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vmull_s16(__rev0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmull_n_s16(int16x4_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  __ret = __noswap_vmull_s16(__p0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vmvn_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vmvn_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vmvnq_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vmvnq_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vmvnq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vmvnq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmvnq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmvnq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmvnq_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmvnq_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vmvnq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vmvnq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmvnq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmvnq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmvnq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmvnq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vmvn_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vmvn_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vmvn_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vmvn_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vmvn_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vmvn_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vmvn_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vmvn_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vmvn_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vmvn_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vmvn_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = ~__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vmvn_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = ~__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vnegq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vnegq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vnegq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vnegq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vnegq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vnegq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vnegq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vnegq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vneg_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vneg_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vneg_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vneg_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vneg_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vneg_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vneg_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vneg_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vornq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vornq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vornq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vornq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vornq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vornq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vornq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vornq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vornq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vornq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vornq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vornq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vornq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vornq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vornq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vornq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vorn_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vorn_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vorn_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vorn_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vorn_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vorn_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vorn_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vorn_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vorn_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vorn_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vorn_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vorn_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vorn_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 | ~__p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vorn_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | ~__rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vorrq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vorrq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vorrq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vorrq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vorrq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vorrq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vorrq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vorrq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vorrq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vorrq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vorrq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vorrq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vorrq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vorrq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vorrq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vorrq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vorr_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vorr_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vorr_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vorr_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vorr_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vorr_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vorr_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vorr_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vorr_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vorr_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vorr_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vorr_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vorr_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 | __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vorr_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 | __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vpadalq_u8(uint16x8_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vpadalq_u8(uint16x8_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vpadalq_u32(uint64x2_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vpadalq_u32(uint64x2_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vpadalq_u16(uint32x4_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vpadalq_u16(uint32x4_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vpadalq_s8(int16x8_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vpadalq_s8(int16x8_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vpadalq_s32(int64x2_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vpadalq_s32(int64x2_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vpadalq_s16(int32x4_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vpadalq_s16(int32x4_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vpadal_u8(uint16x4_t __p0, uint8x8_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vpadal_u8(uint16x4_t __p0, uint8x8_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x1_t vpadal_u32(uint64x1_t __p0, uint32x2_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x1_t vpadal_u32(uint64x1_t __p0, uint32x2_t __p1) {
-  uint64x1_t __ret;
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__rev1, 19);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vpadal_u16(uint32x2_t __p0, uint16x4_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vpadal_u16(uint32x2_t __p0, uint16x4_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vpadal_s8(int16x4_t __p0, int8x8_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vpadal_s8(int16x4_t __p0, int8x8_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x1_t vpadal_s32(int64x1_t __p0, int32x2_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x1_t vpadal_s32(int64x1_t __p0, int32x2_t __p1) {
-  int64x1_t __ret;
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__rev1, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vpadal_s16(int32x2_t __p0, int16x4_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vpadal_s16(int32x2_t __p0, int16x4_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vpadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vpadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vpadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vpadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vpadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vpadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vpadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vpadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vpadd_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vpadd_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vpadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vpadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vpadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vpadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vpaddlq_u8(uint8x16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vpaddlq_u8(uint8x16_t __p0) {
-  uint16x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
-  uint64x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
-  uint32x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vpaddlq_s8(int8x16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vpaddlq_s8(int8x16_t __p0) {
-  int16x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vpaddlq_s32(int32x4_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vpaddlq_s32(int32x4_t __p0) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vpaddlq_s16(int16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vpaddlq_s16(int16x8_t __p0) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vpaddl_u8(uint8x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vpaddl_u8(uint8x8_t __p0) {
-  uint16x4_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x1_t vpaddl_u32(uint32x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x1_t vpaddl_u32(uint32x2_t __p0) {
-  uint64x1_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 19);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vpaddl_u16(uint16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vpaddl_u16(uint16x4_t __p0) {
-  uint32x2_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vpaddl_s8(int8x8_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vpaddl_s8(int8x8_t __p0) {
-  int16x4_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x1_t vpaddl_s32(int32x2_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x1_t vpaddl_s32(int32x2_t __p0) {
-  int64x1_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 3);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vpaddl_s16(int16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vpaddl_s16(int16x4_t __p0) {
-  int32x2_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vpmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vpmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vpmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vpmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vpmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vpmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vpmax_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vpmax_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vpmax_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vpmax_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vpmax_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vpmax_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vpmax_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vpmax_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vpmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vpmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vpmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vpmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vpmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vpmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vpmin_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vpmin_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vpmin_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vpmin_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vpmin_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vpmin_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vpmin_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vpmin_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqabsq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqabsq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqabsq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqabsq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqabsq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqabsq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqabs_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqabs_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqabs_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqabs_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqabs_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqabs_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vqadd_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_lane_s32(__p0_122, __p1_122, __p2_122, __p3_122) __extension__ ({ \
-  int64x2_t __ret_122; \
-  int64x2_t __s0_122 = __p0_122; \
-  int32x2_t __s1_122 = __p1_122; \
-  int32x2_t __s2_122 = __p2_122; \
-  __ret_122 = vqdmlal_s32(__s0_122, __s1_122, splat_lane_s32(__s2_122, __p3_122)); \
-  __ret_122; \
-})
-#else
-#define vqdmlal_lane_s32(__p0_123, __p1_123, __p2_123, __p3_123) __extension__ ({ \
-  int64x2_t __ret_123; \
-  int64x2_t __s0_123 = __p0_123; \
-  int32x2_t __s1_123 = __p1_123; \
-  int32x2_t __s2_123 = __p2_123; \
-  int64x2_t __rev0_123;  __rev0_123 = __builtin_shufflevector(__s0_123, __s0_123, 1, 0); \
-  int32x2_t __rev1_123;  __rev1_123 = __builtin_shufflevector(__s1_123, __s1_123, 1, 0); \
-  int32x2_t __rev2_123;  __rev2_123 = __builtin_shufflevector(__s2_123, __s2_123, 1, 0); \
-  __ret_123 = __noswap_vqdmlal_s32(__rev0_123, __rev1_123, __noswap_splat_lane_s32(__rev2_123, __p3_123)); \
-  __ret_123 = __builtin_shufflevector(__ret_123, __ret_123, 1, 0); \
-  __ret_123; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_lane_s16(__p0_124, __p1_124, __p2_124, __p3_124) __extension__ ({ \
-  int32x4_t __ret_124; \
-  int32x4_t __s0_124 = __p0_124; \
-  int16x4_t __s1_124 = __p1_124; \
-  int16x4_t __s2_124 = __p2_124; \
-  __ret_124 = vqdmlal_s16(__s0_124, __s1_124, splat_lane_s16(__s2_124, __p3_124)); \
-  __ret_124; \
-})
-#else
-#define vqdmlal_lane_s16(__p0_125, __p1_125, __p2_125, __p3_125) __extension__ ({ \
-  int32x4_t __ret_125; \
-  int32x4_t __s0_125 = __p0_125; \
-  int16x4_t __s1_125 = __p1_125; \
-  int16x4_t __s2_125 = __p2_125; \
-  int32x4_t __rev0_125;  __rev0_125 = __builtin_shufflevector(__s0_125, __s0_125, 3, 2, 1, 0); \
-  int16x4_t __rev1_125;  __rev1_125 = __builtin_shufflevector(__s1_125, __s1_125, 3, 2, 1, 0); \
-  int16x4_t __rev2_125;  __rev2_125 = __builtin_shufflevector(__s2_125, __s2_125, 3, 2, 1, 0); \
-  __ret_125 = __noswap_vqdmlal_s16(__rev0_125, __rev1_125, __noswap_splat_lane_s16(__rev2_125, __p3_125)); \
-  __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 3, 2, 1, 0); \
-  __ret_125; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = vqdmlal_s32(__p0, __p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vqdmlal_s32(__rev0, __rev1, (int32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = __noswap_vqdmlal_s32(__p0, __p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = vqdmlal_s16(__p0, __p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlal_s16(__rev0, __rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = __noswap_vqdmlal_s16(__p0, __p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_lane_s32(__p0_126, __p1_126, __p2_126, __p3_126) __extension__ ({ \
-  int64x2_t __ret_126; \
-  int64x2_t __s0_126 = __p0_126; \
-  int32x2_t __s1_126 = __p1_126; \
-  int32x2_t __s2_126 = __p2_126; \
-  __ret_126 = vqdmlsl_s32(__s0_126, __s1_126, splat_lane_s32(__s2_126, __p3_126)); \
-  __ret_126; \
-})
-#else
-#define vqdmlsl_lane_s32(__p0_127, __p1_127, __p2_127, __p3_127) __extension__ ({ \
-  int64x2_t __ret_127; \
-  int64x2_t __s0_127 = __p0_127; \
-  int32x2_t __s1_127 = __p1_127; \
-  int32x2_t __s2_127 = __p2_127; \
-  int64x2_t __rev0_127;  __rev0_127 = __builtin_shufflevector(__s0_127, __s0_127, 1, 0); \
-  int32x2_t __rev1_127;  __rev1_127 = __builtin_shufflevector(__s1_127, __s1_127, 1, 0); \
-  int32x2_t __rev2_127;  __rev2_127 = __builtin_shufflevector(__s2_127, __s2_127, 1, 0); \
-  __ret_127 = __noswap_vqdmlsl_s32(__rev0_127, __rev1_127, __noswap_splat_lane_s32(__rev2_127, __p3_127)); \
-  __ret_127 = __builtin_shufflevector(__ret_127, __ret_127, 1, 0); \
-  __ret_127; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_lane_s16(__p0_128, __p1_128, __p2_128, __p3_128) __extension__ ({ \
-  int32x4_t __ret_128; \
-  int32x4_t __s0_128 = __p0_128; \
-  int16x4_t __s1_128 = __p1_128; \
-  int16x4_t __s2_128 = __p2_128; \
-  __ret_128 = vqdmlsl_s16(__s0_128, __s1_128, splat_lane_s16(__s2_128, __p3_128)); \
-  __ret_128; \
-})
-#else
-#define vqdmlsl_lane_s16(__p0_129, __p1_129, __p2_129, __p3_129) __extension__ ({ \
-  int32x4_t __ret_129; \
-  int32x4_t __s0_129 = __p0_129; \
-  int16x4_t __s1_129 = __p1_129; \
-  int16x4_t __s2_129 = __p2_129; \
-  int32x4_t __rev0_129;  __rev0_129 = __builtin_shufflevector(__s0_129, __s0_129, 3, 2, 1, 0); \
-  int16x4_t __rev1_129;  __rev1_129 = __builtin_shufflevector(__s1_129, __s1_129, 3, 2, 1, 0); \
-  int16x4_t __rev2_129;  __rev2_129 = __builtin_shufflevector(__s2_129, __s2_129, 3, 2, 1, 0); \
-  __ret_129 = __noswap_vqdmlsl_s16(__rev0_129, __rev1_129, __noswap_splat_lane_s16(__rev2_129, __p3_129)); \
-  __ret_129 = __builtin_shufflevector(__ret_129, __ret_129, 3, 2, 1, 0); \
-  __ret_129; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = vqdmlsl_s32(__p0, __p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vqdmlsl_s32(__rev0, __rev1, (int32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = __noswap_vqdmlsl_s32(__p0, __p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = vqdmlsl_s16(__p0, __p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlsl_s16(__rev0, __rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = __noswap_vqdmlsl_s16(__p0, __p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
-  int32x4_t __ret;
-  __ret = vqdmulhq_s32(__p0, (int32x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vqdmulhq_s32(__rev0, (int32x4_t) {__p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
-  int16x8_t __ret;
-  __ret = vqdmulhq_s16(__p0, (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqdmulhq_s16(__rev0, (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
-  int32x2_t __ret;
-  __ret = vqdmulh_s32(__p0, (int32x2_t) {__p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __noswap_vqdmulh_s32(__rev0, (int32x2_t) {__p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
-  int16x4_t __ret;
-  __ret = vqdmulh_s16(__p0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vqdmulh_s16(__rev0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_lane_s32(__p0_130, __p1_130, __p2_130) __extension__ ({ \
-  int64x2_t __ret_130; \
-  int32x2_t __s0_130 = __p0_130; \
-  int32x2_t __s1_130 = __p1_130; \
-  __ret_130 = vqdmull_s32(__s0_130, splat_lane_s32(__s1_130, __p2_130)); \
-  __ret_130; \
-})
-#else
-#define vqdmull_lane_s32(__p0_131, __p1_131, __p2_131) __extension__ ({ \
-  int64x2_t __ret_131; \
-  int32x2_t __s0_131 = __p0_131; \
-  int32x2_t __s1_131 = __p1_131; \
-  int32x2_t __rev0_131;  __rev0_131 = __builtin_shufflevector(__s0_131, __s0_131, 1, 0); \
-  int32x2_t __rev1_131;  __rev1_131 = __builtin_shufflevector(__s1_131, __s1_131, 1, 0); \
-  __ret_131 = __noswap_vqdmull_s32(__rev0_131, __noswap_splat_lane_s32(__rev1_131, __p2_131)); \
-  __ret_131 = __builtin_shufflevector(__ret_131, __ret_131, 1, 0); \
-  __ret_131; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_lane_s16(__p0_132, __p1_132, __p2_132) __extension__ ({ \
-  int32x4_t __ret_132; \
-  int16x4_t __s0_132 = __p0_132; \
-  int16x4_t __s1_132 = __p1_132; \
-  __ret_132 = vqdmull_s16(__s0_132, splat_lane_s16(__s1_132, __p2_132)); \
-  __ret_132; \
-})
-#else
-#define vqdmull_lane_s16(__p0_133, __p1_133, __p2_133) __extension__ ({ \
-  int32x4_t __ret_133; \
-  int16x4_t __s0_133 = __p0_133; \
-  int16x4_t __s1_133 = __p1_133; \
-  int16x4_t __rev0_133;  __rev0_133 = __builtin_shufflevector(__s0_133, __s0_133, 3, 2, 1, 0); \
-  int16x4_t __rev1_133;  __rev1_133 = __builtin_shufflevector(__s1_133, __s1_133, 3, 2, 1, 0); \
-  __ret_133 = __noswap_vqdmull_s16(__rev0_133, __noswap_splat_lane_s16(__rev1_133, __p2_133)); \
-  __ret_133 = __builtin_shufflevector(__ret_133, __ret_133, 3, 2, 1, 0); \
-  __ret_133; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  __ret = vqdmull_s32(__p0, (int32x2_t) {__p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __noswap_vqdmull_s32(__rev0, (int32x2_t) {__p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  __ret = __noswap_vqdmull_s32(__p0, (int32x2_t) {__p1, __p1});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  __ret = vqdmull_s16(__p0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vqdmull_s16(__rev0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  __ret = __noswap_vqdmull_s16(__p0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vqmovn_u32(uint32x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vqmovn_u32(uint32x4_t __p0) {
-  uint16x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vqmovn_u32(uint32x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vqmovn_u64(uint64x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vqmovn_u64(uint64x2_t __p0) {
-  uint32x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vqmovn_u64(uint64x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqmovn_u16(uint16x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqmovn_u16(uint16x8_t __p0) {
-  uint8x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vqmovn_u16(uint16x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqmovn_s32(int32x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqmovn_s32(int32x4_t __p0) {
-  int16x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vqmovn_s32(int32x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqmovn_s64(int64x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqmovn_s64(int64x2_t __p0) {
-  int32x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vqmovn_s64(int64x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqmovn_s16(int16x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqmovn_s16(int16x8_t __p0) {
-  int8x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vqmovn_s16(int16x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vqmovun_s32(int32x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vqmovun_s32(int32x4_t __p0) {
-  uint16x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vqmovun_s32(int32x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vqmovun_s64(int64x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vqmovun_s64(int64x2_t __p0) {
-  uint32x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vqmovun_s64(int64x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqmovun_s16(int16x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqmovun_s16(int16x8_t __p0) {
-  uint8x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vqmovun_s16(int16x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqnegq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqnegq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqnegq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqnegq_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqnegq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqnegq_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqneg_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqneg_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqneg_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqneg_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqneg_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqneg_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqrdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
-  int32x4_t __ret;
-  __ret = vqrdmulhq_s32(__p0, (int32x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqrdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vqrdmulhq_s32(__rev0, (int32x4_t) {__p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqrdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
-  int16x8_t __ret;
-  __ret = vqrdmulhq_s16(__p0, (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqrdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqrdmulhq_s16(__rev0, (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqrdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
-  int32x2_t __ret;
-  __ret = vqrdmulh_s32(__p0, (int32x2_t) {__p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqrdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __noswap_vqrdmulh_s32(__rev0, (int32x2_t) {__p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqrdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
-  int16x4_t __ret;
-  __ret = vqrdmulh_s16(__p0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqrdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vqrdmulh_s16(__rev0, (int16x4_t) {__p1, __p1, __p1, __p1});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vqrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vqrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vqrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vqrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vqrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vqrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vqrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vqrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vqrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vqrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vqrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqrshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqrshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqrshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqrshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vqrshl_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqrshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqrshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vqshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vqshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vqshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vqshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vqshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vqshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vqshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vqshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vqshl_u64(uint64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vqshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vqshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vqshl_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#else
-#define vqshlq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vqshlq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vqshlq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vqshlq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#else
-#define vqshlq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define vqshlq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define vqshlq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define vqshlq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshl_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vqshl_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshl_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vqshl_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vqshl_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vqshl_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vqshl_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshl_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vqshl_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshl_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vqshl_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vqshl_n_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vqshl_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vqshl_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshluq_n_s8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#else
-#define vqshluq_n_s8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshluq_n_s32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vqshluq_n_s32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshluq_n_s64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vqshluq_n_s64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshluq_n_s16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vqshluq_n_s16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlu_n_s8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vqshlu_n_s8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshlu_n_s32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vqshlu_n_s32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vqshlu_n_s64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vqshlu_n_s16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vqshlu_n_s16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vqshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vqshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vqshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vqshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vqshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vqshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrun_n_s32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vqshrun_n_s32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrun_n_s32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrun_n_s64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vqshrun_n_s64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrun_n_s64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrun_n_s16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vqshrun_n_s16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vqshrun_n_s16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vqsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vqsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vqsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vqsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vqsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vqsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqsubq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqsubq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqsubq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqsubq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vqsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vqsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vqsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vqsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vqsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqsub_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqsub_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vqsub_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vrecpeq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vrecpeq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrecpeq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrecpeq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vrecpe_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vrecpe_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrecpe_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrecpe_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrecpsq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrecpsq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrecps_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrecps_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vrev16_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vrev16_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vrev16q_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vrev16q_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vrev16q_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vrev16q_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vrev16q_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vrev16q_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vrev16_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vrev16_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vrev16_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vrev16_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vrev32_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vrev32_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vrev32_p16(poly16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vrev32_p16(poly16x4_t __p0) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vrev32q_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vrev32q_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vrev32q_p16(poly16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vrev32q_p16(poly16x8_t __p0) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vrev32q_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vrev32q_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vrev32q_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vrev32q_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vrev32q_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vrev32q_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vrev32q_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vrev32q_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vrev32_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vrev32_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vrev32_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vrev32_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vrev32_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vrev32_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vrev32_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vrev32_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vrev64_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vrev64_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vrev64_p16(poly16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vrev64_p16(poly16x4_t __p0) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vrev64q_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vrev64q_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vrev64q_p16(poly16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vrev64q_p16(poly16x8_t __p0) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vrev64q_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vrev64q_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vrev64q_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vrev64q_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vrev64q_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vrev64q_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vrev64q_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vrev64q_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrev64q_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrev64q_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vrev64q_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vrev64q_s32(int32x4_t __p0) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vrev64q_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vrev64q_s16(int16x8_t __p0) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vrev64_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vrev64_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vrev64_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vrev64_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vrev64_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vrev64_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vrev64_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vrev64_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrev64_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrev64_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vrev64_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vrev64_s32(int32x2_t __p0) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vrev64_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vrev64_s16(int16x4_t __p0) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vrev64q_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vrev64q_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vrev64_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vrev64_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vrhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vrhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vrhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vrhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vrhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vrhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vrhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vrhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vrhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vrhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vrhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vrhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vrhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vrhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vrhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vrhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vrhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vrhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vrhadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vrhadd_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vrhadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vrhadd_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vrhadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vrhadd_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vrshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vrshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vrshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vrshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vrshl_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vrshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vrshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#else
-#define vrshrq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vrshrq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vrshrq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vrshrq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#else
-#define vrshrq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define vrshrq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define vrshrq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define vrshrq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshr_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vrshr_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshr_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vrshr_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vrshr_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vrshr_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vrshr_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshr_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vrshr_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshr_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vrshr_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vrshr_n_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vrshr_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vrshr_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vrshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vrshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vrshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vrshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vrshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vrshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vrshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vrshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vrshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vrshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vrshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vrshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vrsqrteq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vrsqrteq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrsqrteq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrsqrteq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vrsqrte_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vrsqrte_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrsqrte_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrsqrte_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrsqrtsq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrsqrtsq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrsqrts_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrsqrts_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
-  __ret; \
-})
-#else
-#define vrsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
-  __ret; \
-})
-#else
-#define vrsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vrsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
-  __ret; \
-})
-#else
-#define vrsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
-  __ret; \
-})
-#else
-#define vrsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vrsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
-  __ret; \
-})
-#else
-#define vrsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vrsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
-  __ret; \
-})
-#else
-#define vrsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
-  __ret; \
-})
-#else
-#define vrsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vrsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vrsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
-  __ret; \
-})
-#else
-#define vrsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
-  __ret; \
-})
-#else
-#define vrsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vrsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vrsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vrsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vrsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (poly8x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (poly8x8_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (poly8x8_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (poly16x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (poly16x4_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (poly16x4_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (poly8x16_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (poly8x16_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (poly8x16_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (poly16x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (poly16x8_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (poly16x8_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int64x2_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int64x2_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int64x2_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int16x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int16x8_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int16x8_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __ret; \
-  float32_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (float32x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __ret; \
-  float32_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (float32x4_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __ret; \
-  float32_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (float32x4_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int64x2_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int64x2_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int64x2_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int16x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int16x8_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int16x8_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int32x2_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int32x2_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int32x2_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#define vset_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vset_lane_i64(__s0, (int64x1_t)__s1, __p2); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int16x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int16x4_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int16x4_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __ret; \
-  float32_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (float32x2_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __ret; \
-  float32_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (float32x2_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __ret; \
-  float32_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (float32x2_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int32x2_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int32x2_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int32x2_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#define vset_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64_t __s0 = __p0; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vset_lane_i64(__s0, (int64x1_t)__s1, __p2); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int16x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int16x4_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int16x4_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vshlq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vshlq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vshlq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vshlq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vshl_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vshl_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vshl_u64(uint64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vshl_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vshl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vshl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vshl_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vshl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#else
-#define vshlq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vshlq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vshlq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vshlq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#else
-#define vshlq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define vshlq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define vshlq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshlq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define vshlq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshl_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vshl_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshl_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vshl_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vshl_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vshl_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vshl_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshl_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vshl_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshl_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vshl_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vshl_n_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vshl_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vshl_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_n_u8(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vshll_n_u8(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshll_n_u8(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 49); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_n_u32(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vshll_n_u32(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshll_n_u32(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 51); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_n_u16(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vshll_n_u16(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshll_n_u16(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 50); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_n_s8(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define vshll_n_s8(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshll_n_s8(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 33); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_n_s32(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define vshll_n_s32(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshll_n_s32(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 35); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_n_s16(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define vshll_n_s16(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshll_n_s16(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 34); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 48); \
-  __ret; \
-})
-#else
-#define vshrq_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 50); \
-  __ret; \
-})
-#else
-#define vshrq_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vshrq_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 49); \
-  __ret; \
-})
-#else
-#define vshrq_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 32); \
-  __ret; \
-})
-#else
-#define vshrq_n_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 34); \
-  __ret; \
-})
-#else
-#define vshrq_n_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define vshrq_n_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 33); \
-  __ret; \
-})
-#else
-#define vshrq_n_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshr_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vshr_n_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshr_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vshr_n_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vshr_n_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vshr_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vshr_n_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshr_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vshr_n_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshr_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vshr_n_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vshr_n_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vshr_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vshr_n_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#else
-#define vshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshrn_n_u32(__p0, __p1) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 17); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#else
-#define vshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshrn_n_u64(__p0, __p1) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 18); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#else
-#define vshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshrn_n_u16(__p0, __p1) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 16); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#else
-#define vshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshrn_n_s32(__p0, __p1) __extension__ ({ \
-  int16x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#else
-#define vshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshrn_n_s64(__p0, __p1) __extension__ ({ \
-  int32x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#else
-#define vshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vshrn_n_s16(__p0, __p1) __extension__ ({ \
-  int8x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  __ret = (poly8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
-  __ret; \
-})
-#else
-#define vsli_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  __ret = (poly16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
-  __ret; \
-})
-#else
-#define vsli_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (poly16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  __ret = (poly8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
-  __ret; \
-})
-#else
-#define vsliq_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  __ret = (poly16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
-  __ret; \
-})
-#else
-#define vsliq_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
-  __ret; \
-})
-#else
-#define vsliq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
-  __ret; \
-})
-#else
-#define vsliq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vsliq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
-  __ret; \
-})
-#else
-#define vsliq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
-  __ret; \
-})
-#else
-#define vsliq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vsliq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
-  __ret; \
-})
-#else
-#define vsliq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vsliq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
-  __ret; \
-})
-#else
-#define vsli_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
-  __ret; \
-})
-#else
-#define vsli_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vsli_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
-  __ret; \
-})
-#else
-#define vsli_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
-  __ret; \
-})
-#else
-#define vsli_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vsli_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vsli_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsli_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vsli_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
-  __ret; \
-})
-#else
-#define vsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
-  __ret; \
-})
-#else
-#define vsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
-  __ret; \
-})
-#else
-#define vsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
-  __ret; \
-})
-#else
-#define vsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
-  __ret; \
-})
-#else
-#define vsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
-  __ret; \
-})
-#else
-#define vsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
-  __ret; \
-})
-#else
-#define vsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
-  __ret; \
-})
-#else
-#define vsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
-  __ret; \
-})
-#else
-#define vsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  __ret = (poly8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
-  __ret; \
-})
-#else
-#define vsri_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __s1 = __p1; \
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  __ret = (poly16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
-  __ret; \
-})
-#else
-#define vsri_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __s1 = __p1; \
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (poly16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  __ret = (poly8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
-  __ret; \
-})
-#else
-#define vsriq_n_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __s1 = __p1; \
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  __ret = (poly16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
-  __ret; \
-})
-#else
-#define vsriq_n_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __s1 = __p1; \
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  __ret = (uint8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
-  __ret; \
-})
-#else
-#define vsriq_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  __ret = (uint32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
-  __ret; \
-})
-#else
-#define vsriq_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vsriq_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  __ret = (uint16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
-  __ret; \
-})
-#else
-#define vsriq_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  __ret = (int8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
-  __ret; \
-})
-#else
-#define vsriq_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vsriq_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
-  __ret; \
-})
-#else
-#define vsriq_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vsriq_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  __ret = (uint8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
-  __ret; \
-})
-#else
-#define vsri_n_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  __ret = (uint32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
-  __ret; \
-})
-#else
-#define vsri_n_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vsri_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  __ret = (uint16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
-  __ret; \
-})
-#else
-#define vsri_n_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (uint16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  __ret = (int8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
-  __ret; \
-})
-#else
-#define vsri_n_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vsri_n_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vsri_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsri_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vsri_n_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 4); \
-})
-#else
-#define vst1_p8(__p0, __p1) __extension__ ({ \
-  poly8x8_t __s1 = __p1; \
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 5); \
-})
-#else
-#define vst1_p16(__p0, __p1) __extension__ ({ \
-  poly16x4_t __s1 = __p1; \
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 36); \
-})
-#else
-#define vst1q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16_t __s1 = __p1; \
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 37); \
-})
-#else
-#define vst1q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8_t __s1 = __p1; \
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 48); \
-})
-#else
-#define vst1q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 50); \
-})
-#else
-#define vst1q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 51); \
-})
-#else
-#define vst1q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 49); \
-})
-#else
-#define vst1q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 32); \
-})
-#else
-#define vst1q_s8(__p0, __p1) __extension__ ({ \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 41); \
-})
-#else
-#define vst1q_f32(__p0, __p1) __extension__ ({ \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 34); \
-})
-#else
-#define vst1q_s32(__p0, __p1) __extension__ ({ \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 35); \
-})
-#else
-#define vst1q_s64(__p0, __p1) __extension__ ({ \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 33); \
-})
-#else
-#define vst1q_s16(__p0, __p1) __extension__ ({ \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 16); \
-})
-#else
-#define vst1_u8(__p0, __p1) __extension__ ({ \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 18); \
-})
-#else
-#define vst1_u32(__p0, __p1) __extension__ ({ \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 18); \
-})
-#endif
-
-#define vst1_u64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 17); \
-})
-#else
-#define vst1_u16(__p0, __p1) __extension__ ({ \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 0); \
-})
-#else
-#define vst1_s8(__p0, __p1) __extension__ ({ \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 9); \
-})
-#else
-#define vst1_f32(__p0, __p1) __extension__ ({ \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 2); \
-})
-#else
-#define vst1_s32(__p0, __p1) __extension__ ({ \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 2); \
-})
-#endif
-
-#define vst1_s64(__p0, __p1) __extension__ ({ \
-  int64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 1); \
-})
-#else
-#define vst1_s16(__p0, __p1) __extension__ ({ \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 4); \
-})
-#else
-#define vst1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8_t __s1 = __p1; \
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 5); \
-})
-#else
-#define vst1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4_t __s1 = __p1; \
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 36); \
-})
-#else
-#define vst1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16_t __s1 = __p1; \
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 37); \
-})
-#else
-#define vst1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8_t __s1 = __p1; \
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 48); \
-})
-#else
-#define vst1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16_t __s1 = __p1; \
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 50); \
-})
-#else
-#define vst1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 51); \
-})
-#else
-#define vst1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 49); \
-})
-#else
-#define vst1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8_t __s1 = __p1; \
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 32); \
-})
-#else
-#define vst1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16_t __s1 = __p1; \
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 41); \
-})
-#else
-#define vst1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 34); \
-})
-#else
-#define vst1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 35); \
-})
-#else
-#define vst1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 33); \
-})
-#else
-#define vst1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 16); \
-})
-#else
-#define vst1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8_t __s1 = __p1; \
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 18); \
-})
-#else
-#define vst1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2_t __s1 = __p1; \
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 18); \
-})
-#endif
-
-#define vst1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 17); \
-})
-#else
-#define vst1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4_t __s1 = __p1; \
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 0); \
-})
-#else
-#define vst1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8_t __s1 = __p1; \
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 9); \
-})
-#else
-#define vst1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 2); \
-})
-#else
-#define vst1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 2); \
-})
-#endif
-
-#define vst1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 1); \
-})
-#else
-#define vst1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
-})
-#else
-#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  poly8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
-})
-#else
-#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  poly16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
-})
-#else
-#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  poly8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
-})
-#else
-#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  poly16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
-})
-#else
-#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  uint8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
-})
-#else
-#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  uint32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
-})
-#else
-#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  uint64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
-})
-#else
-#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  uint16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
-})
-#else
-#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  int8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 41); \
-})
-#else
-#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  float32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 34); \
-})
-#else
-#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  int32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 35); \
-})
-#else
-#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  int64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 33); \
-})
-#else
-#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  int16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
-})
-#else
-#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  uint8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
-})
-#else
-#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  uint32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
-})
-#endif
-
-#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
-  uint64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
-})
-#else
-#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  uint16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
-})
-#else
-#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  int8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 9); \
-})
-#else
-#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  float32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 2); \
-})
-#else
-#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  int32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 2); \
-})
-#endif
-
-#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
-  int64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 1); \
-})
-#else
-#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  int16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
-})
-#else
-#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  poly8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
-})
-#else
-#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  poly16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
-})
-#else
-#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  poly8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
-})
-#else
-#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  poly16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
-})
-#else
-#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  uint8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
-})
-#else
-#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  uint32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
-})
-#else
-#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  uint64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
-})
-#else
-#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  uint16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
-})
-#else
-#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  int8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 41); \
-})
-#else
-#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  float32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 34); \
-})
-#else
-#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  int32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 35); \
-})
-#else
-#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  int64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 33); \
-})
-#else
-#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  int16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
-})
-#else
-#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  uint8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
-})
-#else
-#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  uint32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
-})
-#endif
-
-#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
-  uint64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
-})
-#else
-#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  uint16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
-})
-#else
-#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  int8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 9); \
-})
-#else
-#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  float32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 2); \
-})
-#else
-#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  int32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 2); \
-})
-#endif
-
-#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
-  int64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 1); \
-})
-#else
-#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  int16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
-})
-#else
-#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  poly8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
-})
-#else
-#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  poly16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
-})
-#else
-#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  poly8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
-})
-#else
-#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  poly16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
-})
-#else
-#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  uint8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
-})
-#else
-#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  uint32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
-})
-#else
-#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  uint64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
-})
-#else
-#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  uint16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
-})
-#else
-#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  int8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 41); \
-})
-#else
-#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  float32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 34); \
-})
-#else
-#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  int32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 35); \
-})
-#else
-#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  int64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 35); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 33); \
-})
-#else
-#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  int16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
-})
-#else
-#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  uint8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
-})
-#else
-#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  uint32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
-})
-#endif
-
-#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
-  uint64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
-})
-#else
-#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  uint16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
-})
-#else
-#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  int8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 9); \
-})
-#else
-#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  float32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 2); \
-})
-#else
-#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  int32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 2); \
-})
-#endif
-
-#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
-  int64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 1); \
-})
-#else
-#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  int16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_p8(__p0, __p1) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
-})
-#else
-#define vst2_p8(__p0, __p1) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  poly8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_p16(__p0, __p1) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
-})
-#else
-#define vst2_p16(__p0, __p1) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  poly16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
-})
-#else
-#define vst2q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  poly8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
-})
-#else
-#define vst2q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  poly16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
-})
-#else
-#define vst2q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  uint8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
-})
-#else
-#define vst2q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  uint32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
-})
-#else
-#define vst2q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  uint16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_s8(__p0, __p1) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
-})
-#else
-#define vst2q_s8(__p0, __p1) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  int8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_f32(__p0, __p1) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 41); \
-})
-#else
-#define vst2q_f32(__p0, __p1) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  float32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_s32(__p0, __p1) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 34); \
-})
-#else
-#define vst2q_s32(__p0, __p1) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  int32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_s16(__p0, __p1) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 33); \
-})
-#else
-#define vst2q_s16(__p0, __p1) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  int16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_u8(__p0, __p1) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
-})
-#else
-#define vst2_u8(__p0, __p1) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  uint8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_u32(__p0, __p1) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
-})
-#else
-#define vst2_u32(__p0, __p1) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  uint32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
-})
-#endif
-
-#define vst2_u64(__p0, __p1) __extension__ ({ \
-  uint64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst2_u16(__p0, __p1) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
-})
-#else
-#define vst2_u16(__p0, __p1) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  uint16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_s8(__p0, __p1) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
-})
-#else
-#define vst2_s8(__p0, __p1) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  int8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_f32(__p0, __p1) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 9); \
-})
-#else
-#define vst2_f32(__p0, __p1) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  float32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_s32(__p0, __p1) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 2); \
-})
-#else
-#define vst2_s32(__p0, __p1) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  int32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 2); \
-})
-#endif
-
-#define vst2_s64(__p0, __p1) __extension__ ({ \
-  int64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst2_s16(__p0, __p1) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 1); \
-})
-#else
-#define vst2_s16(__p0, __p1) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  int16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 4); \
-})
-#else
-#define vst2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x2_t __s1 = __p1; \
-  poly8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 5); \
-})
-#else
-#define vst2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x2_t __s1 = __p1; \
-  poly16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 37); \
-})
-#else
-#define vst2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x2_t __s1 = __p1; \
-  poly16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 50); \
-})
-#else
-#define vst2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x2_t __s1 = __p1; \
-  uint32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 49); \
-})
-#else
-#define vst2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x2_t __s1 = __p1; \
-  uint16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 41); \
-})
-#else
-#define vst2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x2_t __s1 = __p1; \
-  float32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 34); \
-})
-#else
-#define vst2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x2_t __s1 = __p1; \
-  int32x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 33); \
-})
-#else
-#define vst2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x2_t __s1 = __p1; \
-  int16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 16); \
-})
-#else
-#define vst2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x2_t __s1 = __p1; \
-  uint8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 18); \
-})
-#else
-#define vst2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x2_t __s1 = __p1; \
-  uint32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 18); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 17); \
-})
-#else
-#define vst2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x2_t __s1 = __p1; \
-  uint16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 0); \
-})
-#else
-#define vst2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x2_t __s1 = __p1; \
-  int8x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 9); \
-})
-#else
-#define vst2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x2_t __s1 = __p1; \
-  float32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 2); \
-})
-#else
-#define vst2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x2_t __s1 = __p1; \
-  int32x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 2); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 1); \
-})
-#else
-#define vst2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x2_t __s1 = __p1; \
-  int16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_p8(__p0, __p1) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
-})
-#else
-#define vst3_p8(__p0, __p1) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  poly8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_p16(__p0, __p1) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
-})
-#else
-#define vst3_p16(__p0, __p1) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  poly16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
-})
-#else
-#define vst3q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  poly8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
-})
-#else
-#define vst3q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  poly16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
-})
-#else
-#define vst3q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  uint8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
-})
-#else
-#define vst3q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  uint32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
-})
-#else
-#define vst3q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  uint16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_s8(__p0, __p1) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
-})
-#else
-#define vst3q_s8(__p0, __p1) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  int8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_f32(__p0, __p1) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 41); \
-})
-#else
-#define vst3q_f32(__p0, __p1) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  float32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_s32(__p0, __p1) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 34); \
-})
-#else
-#define vst3q_s32(__p0, __p1) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  int32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_s16(__p0, __p1) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 33); \
-})
-#else
-#define vst3q_s16(__p0, __p1) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  int16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_u8(__p0, __p1) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
-})
-#else
-#define vst3_u8(__p0, __p1) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  uint8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_u32(__p0, __p1) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
-})
-#else
-#define vst3_u32(__p0, __p1) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  uint32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
-})
-#endif
-
-#define vst3_u64(__p0, __p1) __extension__ ({ \
-  uint64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst3_u16(__p0, __p1) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
-})
-#else
-#define vst3_u16(__p0, __p1) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  uint16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_s8(__p0, __p1) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
-})
-#else
-#define vst3_s8(__p0, __p1) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  int8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_f32(__p0, __p1) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 9); \
-})
-#else
-#define vst3_f32(__p0, __p1) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  float32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_s32(__p0, __p1) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 2); \
-})
-#else
-#define vst3_s32(__p0, __p1) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  int32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 2); \
-})
-#endif
-
-#define vst3_s64(__p0, __p1) __extension__ ({ \
-  int64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst3_s16(__p0, __p1) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 1); \
-})
-#else
-#define vst3_s16(__p0, __p1) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  int16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 4); \
-})
-#else
-#define vst3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x3_t __s1 = __p1; \
-  poly8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 5); \
-})
-#else
-#define vst3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x3_t __s1 = __p1; \
-  poly16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 37); \
-})
-#else
-#define vst3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x3_t __s1 = __p1; \
-  poly16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 50); \
-})
-#else
-#define vst3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x3_t __s1 = __p1; \
-  uint32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 49); \
-})
-#else
-#define vst3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x3_t __s1 = __p1; \
-  uint16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 41); \
-})
-#else
-#define vst3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x3_t __s1 = __p1; \
-  float32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 34); \
-})
-#else
-#define vst3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x3_t __s1 = __p1; \
-  int32x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 33); \
-})
-#else
-#define vst3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x3_t __s1 = __p1; \
-  int16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 16); \
-})
-#else
-#define vst3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x3_t __s1 = __p1; \
-  uint8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 18); \
-})
-#else
-#define vst3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x3_t __s1 = __p1; \
-  uint32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 18); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 17); \
-})
-#else
-#define vst3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x3_t __s1 = __p1; \
-  uint16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 0); \
-})
-#else
-#define vst3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x3_t __s1 = __p1; \
-  int8x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 9); \
-})
-#else
-#define vst3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x3_t __s1 = __p1; \
-  float32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 2); \
-})
-#else
-#define vst3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x3_t __s1 = __p1; \
-  int32x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 2); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 1); \
-})
-#else
-#define vst3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x3_t __s1 = __p1; \
-  int16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_p8(__p0, __p1) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
-})
-#else
-#define vst4_p8(__p0, __p1) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  poly8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_p16(__p0, __p1) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
-})
-#else
-#define vst4_p16(__p0, __p1) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  poly16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
-})
-#else
-#define vst4q_p8(__p0, __p1) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  poly8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
-})
-#else
-#define vst4q_p16(__p0, __p1) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  poly16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
-})
-#else
-#define vst4q_u8(__p0, __p1) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  uint8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
-})
-#else
-#define vst4q_u32(__p0, __p1) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  uint32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
-})
-#else
-#define vst4q_u16(__p0, __p1) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  uint16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_s8(__p0, __p1) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
-})
-#else
-#define vst4q_s8(__p0, __p1) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  int8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_f32(__p0, __p1) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 41); \
-})
-#else
-#define vst4q_f32(__p0, __p1) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  float32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_s32(__p0, __p1) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 34); \
-})
-#else
-#define vst4q_s32(__p0, __p1) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  int32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_s16(__p0, __p1) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 33); \
-})
-#else
-#define vst4q_s16(__p0, __p1) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  int16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_u8(__p0, __p1) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
-})
-#else
-#define vst4_u8(__p0, __p1) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  uint8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_u32(__p0, __p1) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
-})
-#else
-#define vst4_u32(__p0, __p1) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  uint32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
-})
-#endif
-
-#define vst4_u64(__p0, __p1) __extension__ ({ \
-  uint64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst4_u16(__p0, __p1) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
-})
-#else
-#define vst4_u16(__p0, __p1) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  uint16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_s8(__p0, __p1) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
-})
-#else
-#define vst4_s8(__p0, __p1) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  int8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_f32(__p0, __p1) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 9); \
-})
-#else
-#define vst4_f32(__p0, __p1) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  float32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_s32(__p0, __p1) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 2); \
-})
-#else
-#define vst4_s32(__p0, __p1) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  int32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 2); \
-})
-#endif
-
-#define vst4_s64(__p0, __p1) __extension__ ({ \
-  int64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst4_s16(__p0, __p1) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 1); \
-})
-#else
-#define vst4_s16(__p0, __p1) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  int16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 4); \
-})
-#else
-#define vst4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x8x4_t __s1 = __p1; \
-  poly8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 4); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 5); \
-})
-#else
-#define vst4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x4x4_t __s1 = __p1; \
-  poly16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 5); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 37); \
-})
-#else
-#define vst4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
-  poly16x8x4_t __s1 = __p1; \
-  poly16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 37); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 50); \
-})
-#else
-#define vst4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x4x4_t __s1 = __p1; \
-  uint32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 50); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 49); \
-})
-#else
-#define vst4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x8x4_t __s1 = __p1; \
-  uint16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 49); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 41); \
-})
-#else
-#define vst4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x4x4_t __s1 = __p1; \
-  float32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 41); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 34); \
-})
-#else
-#define vst4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4x4_t __s1 = __p1; \
-  int32x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 34); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 33); \
-})
-#else
-#define vst4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8x4_t __s1 = __p1; \
-  int16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 33); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 16); \
-})
-#else
-#define vst4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x8x4_t __s1 = __p1; \
-  uint8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 16); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 18); \
-})
-#else
-#define vst4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
-  uint32x2x4_t __s1 = __p1; \
-  uint32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 18); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 17); \
-})
-#else
-#define vst4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
-  uint16x4x4_t __s1 = __p1; \
-  uint16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 17); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 0); \
-})
-#else
-#define vst4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x8x4_t __s1 = __p1; \
-  int8x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 0); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 9); \
-})
-#else
-#define vst4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
-  float32x2x4_t __s1 = __p1; \
-  float32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 9); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 2); \
-})
-#else
-#define vst4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2x4_t __s1 = __p1; \
-  int32x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 2); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 1); \
-})
-#else
-#define vst4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4x4_t __s1 = __p1; \
-  int16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 1); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vsubq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vsubq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vsubq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vsubq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vsubq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vsubq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vsubq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vsubq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vsubq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vsubq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vsub_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vsub_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vsub_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vsub_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vsub_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vsub_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vsub_s64(int64x1_t __p0, int64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vsub_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vsub_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t __noswap_vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t __noswap_vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t __noswap_vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t __noswap_vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t __noswap_vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t __noswap_vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vsubl_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = vmovl_u8(__p0) - vmovl_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vsubl_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_u8(__rev0) - __noswap_vmovl_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vsubl_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = vmovl_u32(__p0) - vmovl_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vsubl_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vmovl_u32(__rev0) - __noswap_vmovl_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vsubl_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = vmovl_u16(__p0) - vmovl_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vsubl_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_u16(__rev0) - __noswap_vmovl_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vsubl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = vmovl_s8(__p0) - vmovl_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vsubl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_s8(__rev0) - __noswap_vmovl_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vsubl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = vmovl_s32(__p0) - vmovl_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vsubl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vmovl_s32(__rev0) - __noswap_vmovl_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vsubl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = vmovl_s16(__p0) - vmovl_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vsubl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_s16(__rev0) - __noswap_vmovl_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vsubw_u8(uint16x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 - vmovl_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vsubw_u8(uint16x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vsubw_u32(uint64x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 - vmovl_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vsubw_u32(uint64x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vsubw_u16(uint32x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 - vmovl_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vsubw_u16(uint32x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vsubw_s8(int16x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 - vmovl_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vsubw_s8(int16x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vsubw_s32(int64x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 - vmovl_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vsubw_s32(int64x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vsubw_s16(int32x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 - vmovl_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vsubw_s16(int32x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbl1_p8(poly8x8_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbl1_p8(poly8x8_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbl1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbl1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbl1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbl1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbl2_p8(poly8x8x2_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbl2_p8(poly8x8x2_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbl2_u8(uint8x8x2_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbl2_u8(uint8x8x2_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbl2_s8(int8x8x2_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbl2_s8(int8x8x2_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbl3_p8(poly8x8x3_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbl3_p8(poly8x8x3_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbl3_u8(uint8x8x3_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbl3_u8(uint8x8x3_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbl3_s8(int8x8x3_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbl3_s8(int8x8x3_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbl4_p8(poly8x8x4_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbl4_p8(poly8x8x4_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbl4_u8(uint8x8x4_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbl4_u8(uint8x8x4_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbl4_s8(int8x8x4_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbl4_s8(int8x8x4_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbx1_p8(poly8x8_t __p0, poly8x8_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbx1_p8(poly8x8_t __p0, poly8x8_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbx1_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbx1_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbx1_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbx1_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbx2_p8(poly8x8_t __p0, poly8x8x2_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbx2_p8(poly8x8_t __p0, poly8x8x2_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbx2_u8(uint8x8_t __p0, uint8x8x2_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbx2_u8(uint8x8_t __p0, uint8x8x2_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbx2_s8(int8x8_t __p0, int8x8x2_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbx2_s8(int8x8_t __p0, int8x8x2_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbx3_p8(poly8x8_t __p0, poly8x8x3_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbx3_p8(poly8x8_t __p0, poly8x8x3_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbx3_u8(uint8x8_t __p0, uint8x8x3_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbx3_u8(uint8x8_t __p0, uint8x8x3_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbx3_s8(int8x8_t __p0, int8x8x3_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbx3_s8(int8x8_t __p0, int8x8x3_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtbx4_p8(poly8x8_t __p0, poly8x8x4_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtbx4_p8(poly8x8_t __p0, poly8x8x4_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtbx4_u8(uint8x8_t __p0, uint8x8x4_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtbx4_u8(uint8x8_t __p0, uint8x8x4_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtbx4_s8(int8x8_t __p0, int8x8x4_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtbx4_s8(int8x8_t __p0, int8x8x4_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8x2_t vtrn_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8x2_t vtrn_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8x2_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4x2_t vtrn_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4x2_t vtrn_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4x2_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16x2_t vtrnq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16x2_t vtrnq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16x2_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8x2_t vtrnq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8x2_t vtrnq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8x2_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16x2_t vtrnq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16x2_t vtrnq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16x2_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4x2_t vtrnq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4x2_t vtrnq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8x2_t vtrnq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8x2_t vtrnq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8x2_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16x2_t vtrnq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16x2_t vtrnq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16x2_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4x2_t vtrnq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4x2_t vtrnq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4x2_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4x2_t vtrnq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4x2_t vtrnq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8x2_t vtrnq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8x2_t vtrnq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8x2_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8x2_t vtrn_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8x2_t vtrn_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8x2_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2x2_t vtrn_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2x2_t vtrn_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4x2_t vtrn_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4x2_t vtrn_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4x2_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8x2_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2x2_t vtrn_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2x2_t vtrn_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2x2_t vtrn_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2x2_t vtrn_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4x2_t vtrn_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4x2_t vtrn_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4x2_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8x2_t __ret;
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8x2_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4x2_t __ret;
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4x2_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtst_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtst_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  uint8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vtst_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vtst_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  uint16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vtstq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vtstq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  uint8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vtstq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vtstq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  uint16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vtstq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vtstq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vtstq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vtstq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vtstq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vtstq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vtstq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vtstq_s8(int8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vtstq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vtstq_s32(int32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vtstq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vtstq_s16(int16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtst_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtst_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vtst_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vtst_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vtst_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vtst_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtst_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtst_s8(int8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vtst_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vtst_s32(int32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vtst_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vtst_s16(int16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8x2_t vuzp_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8x2_t vuzp_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8x2_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4x2_t vuzp_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4x2_t vuzp_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4x2_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16x2_t vuzpq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16x2_t vuzpq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16x2_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8x2_t vuzpq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8x2_t vuzpq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8x2_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16x2_t vuzpq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16x2_t vuzpq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16x2_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4x2_t vuzpq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4x2_t vuzpq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8x2_t vuzpq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8x2_t vuzpq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8x2_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16x2_t vuzpq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16x2_t vuzpq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16x2_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4x2_t vuzpq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4x2_t vuzpq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4x2_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4x2_t vuzpq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4x2_t vuzpq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8x2_t vuzpq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8x2_t vuzpq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8x2_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8x2_t vuzp_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8x2_t vuzp_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8x2_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2x2_t vuzp_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2x2_t vuzp_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4x2_t vuzp_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4x2_t vuzp_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4x2_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8x2_t vuzp_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8x2_t vuzp_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8x2_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2x2_t vuzp_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2x2_t vuzp_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2x2_t vuzp_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2x2_t vuzp_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4x2_t vuzp_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4x2_t vuzp_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4x2_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8x2_t __ret;
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8x2_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4x2_t __ret;
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4x2_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8x2_t vzip_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8x2_t vzip_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8x2_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4x2_t vzip_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4x2_t vzip_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4x2_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16x2_t vzipq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16x2_t vzipq_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16x2_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8x2_t vzipq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8x2_t vzipq_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8x2_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16x2_t vzipq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16x2_t vzipq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16x2_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4x2_t vzipq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4x2_t vzipq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8x2_t vzipq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8x2_t vzipq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8x2_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16x2_t vzipq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16x2_t vzipq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16x2_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4x2_t vzipq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4x2_t vzipq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4x2_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4x2_t vzipq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4x2_t vzipq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8x2_t vzipq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8x2_t vzipq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8x2_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8x2_t vzip_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8x2_t vzip_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8x2_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2x2_t vzip_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2x2_t vzip_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4x2_t vzip_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4x2_t vzip_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4x2_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8x2_t vzip_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8x2_t vzip_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8x2_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2x2_t vzip_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2x2_t vzip_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2x2_t vzip_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2x2_t vzip_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4x2_t vzip_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4x2_t vzip_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4x2_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8x2_t __ret;
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8x2_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4x2_t __ret;
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4x2_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqrdmlahq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqrdmlahq_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int32x4_t __noswap_vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqrdmlahq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int16x8_t vqrdmlahq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqrdmlahq_s16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int16x8_t vqrdmlahq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqrdmlahq_s16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int16x8_t __noswap_vqrdmlahq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqrdmlahq_s16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int32x2_t vqrdmlah_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqrdmlah_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int32x2_t vqrdmlah_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqrdmlah_s32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int32x2_t __noswap_vqrdmlah_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqrdmlah_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int16x4_t vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqrdmlah_s16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int16x4_t vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqrdmlah_s16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int16x4_t __noswap_vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqrdmlah_s16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahq_lane_s32(__p0_134, __p1_134, __p2_134, __p3_134) __extension__ ({ \
-  int32x4_t __ret_134; \
-  int32x4_t __s0_134 = __p0_134; \
-  int32x4_t __s1_134 = __p1_134; \
-  int32x2_t __s2_134 = __p2_134; \
-  __ret_134 = vqrdmlahq_s32(__s0_134, __s1_134, splatq_lane_s32(__s2_134, __p3_134)); \
-  __ret_134; \
-})
-#else
-#define vqrdmlahq_lane_s32(__p0_135, __p1_135, __p2_135, __p3_135) __extension__ ({ \
-  int32x4_t __ret_135; \
-  int32x4_t __s0_135 = __p0_135; \
-  int32x4_t __s1_135 = __p1_135; \
-  int32x2_t __s2_135 = __p2_135; \
-  int32x4_t __rev0_135;  __rev0_135 = __builtin_shufflevector(__s0_135, __s0_135, 3, 2, 1, 0); \
-  int32x4_t __rev1_135;  __rev1_135 = __builtin_shufflevector(__s1_135, __s1_135, 3, 2, 1, 0); \
-  int32x2_t __rev2_135;  __rev2_135 = __builtin_shufflevector(__s2_135, __s2_135, 1, 0); \
-  __ret_135 = __noswap_vqrdmlahq_s32(__rev0_135, __rev1_135, __noswap_splatq_lane_s32(__rev2_135, __p3_135)); \
-  __ret_135 = __builtin_shufflevector(__ret_135, __ret_135, 3, 2, 1, 0); \
-  __ret_135; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahq_lane_s16(__p0_136, __p1_136, __p2_136, __p3_136) __extension__ ({ \
-  int16x8_t __ret_136; \
-  int16x8_t __s0_136 = __p0_136; \
-  int16x8_t __s1_136 = __p1_136; \
-  int16x4_t __s2_136 = __p2_136; \
-  __ret_136 = vqrdmlahq_s16(__s0_136, __s1_136, splatq_lane_s16(__s2_136, __p3_136)); \
-  __ret_136; \
-})
-#else
-#define vqrdmlahq_lane_s16(__p0_137, __p1_137, __p2_137, __p3_137) __extension__ ({ \
-  int16x8_t __ret_137; \
-  int16x8_t __s0_137 = __p0_137; \
-  int16x8_t __s1_137 = __p1_137; \
-  int16x4_t __s2_137 = __p2_137; \
-  int16x8_t __rev0_137;  __rev0_137 = __builtin_shufflevector(__s0_137, __s0_137, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_137;  __rev1_137 = __builtin_shufflevector(__s1_137, __s1_137, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_137;  __rev2_137 = __builtin_shufflevector(__s2_137, __s2_137, 3, 2, 1, 0); \
-  __ret_137 = __noswap_vqrdmlahq_s16(__rev0_137, __rev1_137, __noswap_splatq_lane_s16(__rev2_137, __p3_137)); \
-  __ret_137 = __builtin_shufflevector(__ret_137, __ret_137, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_137; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlah_lane_s32(__p0_138, __p1_138, __p2_138, __p3_138) __extension__ ({ \
-  int32x2_t __ret_138; \
-  int32x2_t __s0_138 = __p0_138; \
-  int32x2_t __s1_138 = __p1_138; \
-  int32x2_t __s2_138 = __p2_138; \
-  __ret_138 = vqrdmlah_s32(__s0_138, __s1_138, splat_lane_s32(__s2_138, __p3_138)); \
-  __ret_138; \
-})
-#else
-#define vqrdmlah_lane_s32(__p0_139, __p1_139, __p2_139, __p3_139) __extension__ ({ \
-  int32x2_t __ret_139; \
-  int32x2_t __s0_139 = __p0_139; \
-  int32x2_t __s1_139 = __p1_139; \
-  int32x2_t __s2_139 = __p2_139; \
-  int32x2_t __rev0_139;  __rev0_139 = __builtin_shufflevector(__s0_139, __s0_139, 1, 0); \
-  int32x2_t __rev1_139;  __rev1_139 = __builtin_shufflevector(__s1_139, __s1_139, 1, 0); \
-  int32x2_t __rev2_139;  __rev2_139 = __builtin_shufflevector(__s2_139, __s2_139, 1, 0); \
-  __ret_139 = __noswap_vqrdmlah_s32(__rev0_139, __rev1_139, __noswap_splat_lane_s32(__rev2_139, __p3_139)); \
-  __ret_139 = __builtin_shufflevector(__ret_139, __ret_139, 1, 0); \
-  __ret_139; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlah_lane_s16(__p0_140, __p1_140, __p2_140, __p3_140) __extension__ ({ \
-  int16x4_t __ret_140; \
-  int16x4_t __s0_140 = __p0_140; \
-  int16x4_t __s1_140 = __p1_140; \
-  int16x4_t __s2_140 = __p2_140; \
-  __ret_140 = vqrdmlah_s16(__s0_140, __s1_140, splat_lane_s16(__s2_140, __p3_140)); \
-  __ret_140; \
-})
-#else
-#define vqrdmlah_lane_s16(__p0_141, __p1_141, __p2_141, __p3_141) __extension__ ({ \
-  int16x4_t __ret_141; \
-  int16x4_t __s0_141 = __p0_141; \
-  int16x4_t __s1_141 = __p1_141; \
-  int16x4_t __s2_141 = __p2_141; \
-  int16x4_t __rev0_141;  __rev0_141 = __builtin_shufflevector(__s0_141, __s0_141, 3, 2, 1, 0); \
-  int16x4_t __rev1_141;  __rev1_141 = __builtin_shufflevector(__s1_141, __s1_141, 3, 2, 1, 0); \
-  int16x4_t __rev2_141;  __rev2_141 = __builtin_shufflevector(__s2_141, __s2_141, 3, 2, 1, 0); \
-  __ret_141 = __noswap_vqrdmlah_s16(__rev0_141, __rev1_141, __noswap_splat_lane_s16(__rev2_141, __p3_141)); \
-  __ret_141 = __builtin_shufflevector(__ret_141, __ret_141, 3, 2, 1, 0); \
-  __ret_141; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int32x4_t vqrdmlshq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqrdmlshq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int32x4_t vqrdmlshq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vqrdmlshq_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int32x4_t __noswap_vqrdmlshq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vqrdmlshq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int16x8_t vqrdmlshq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqrdmlshq_s16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int16x8_t vqrdmlshq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vqrdmlshq_s16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int16x8_t __noswap_vqrdmlshq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vqrdmlshq_s16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int32x2_t vqrdmlsh_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqrdmlsh_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int32x2_t vqrdmlsh_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vqrdmlsh_s32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int32x2_t __noswap_vqrdmlsh_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vqrdmlsh_s32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.1a,neon"))) int16x4_t vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqrdmlsh_s16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.1a,neon"))) int16x4_t vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vqrdmlsh_s16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int16x4_t __noswap_vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vqrdmlsh_s16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshq_lane_s32(__p0_142, __p1_142, __p2_142, __p3_142) __extension__ ({ \
-  int32x4_t __ret_142; \
-  int32x4_t __s0_142 = __p0_142; \
-  int32x4_t __s1_142 = __p1_142; \
-  int32x2_t __s2_142 = __p2_142; \
-  __ret_142 = vqrdmlshq_s32(__s0_142, __s1_142, splatq_lane_s32(__s2_142, __p3_142)); \
-  __ret_142; \
-})
-#else
-#define vqrdmlshq_lane_s32(__p0_143, __p1_143, __p2_143, __p3_143) __extension__ ({ \
-  int32x4_t __ret_143; \
-  int32x4_t __s0_143 = __p0_143; \
-  int32x4_t __s1_143 = __p1_143; \
-  int32x2_t __s2_143 = __p2_143; \
-  int32x4_t __rev0_143;  __rev0_143 = __builtin_shufflevector(__s0_143, __s0_143, 3, 2, 1, 0); \
-  int32x4_t __rev1_143;  __rev1_143 = __builtin_shufflevector(__s1_143, __s1_143, 3, 2, 1, 0); \
-  int32x2_t __rev2_143;  __rev2_143 = __builtin_shufflevector(__s2_143, __s2_143, 1, 0); \
-  __ret_143 = __noswap_vqrdmlshq_s32(__rev0_143, __rev1_143, __noswap_splatq_lane_s32(__rev2_143, __p3_143)); \
-  __ret_143 = __builtin_shufflevector(__ret_143, __ret_143, 3, 2, 1, 0); \
-  __ret_143; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshq_lane_s16(__p0_144, __p1_144, __p2_144, __p3_144) __extension__ ({ \
-  int16x8_t __ret_144; \
-  int16x8_t __s0_144 = __p0_144; \
-  int16x8_t __s1_144 = __p1_144; \
-  int16x4_t __s2_144 = __p2_144; \
-  __ret_144 = vqrdmlshq_s16(__s0_144, __s1_144, splatq_lane_s16(__s2_144, __p3_144)); \
-  __ret_144; \
-})
-#else
-#define vqrdmlshq_lane_s16(__p0_145, __p1_145, __p2_145, __p3_145) __extension__ ({ \
-  int16x8_t __ret_145; \
-  int16x8_t __s0_145 = __p0_145; \
-  int16x8_t __s1_145 = __p1_145; \
-  int16x4_t __s2_145 = __p2_145; \
-  int16x8_t __rev0_145;  __rev0_145 = __builtin_shufflevector(__s0_145, __s0_145, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_145;  __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_145;  __rev2_145 = __builtin_shufflevector(__s2_145, __s2_145, 3, 2, 1, 0); \
-  __ret_145 = __noswap_vqrdmlshq_s16(__rev0_145, __rev1_145, __noswap_splatq_lane_s16(__rev2_145, __p3_145)); \
-  __ret_145 = __builtin_shufflevector(__ret_145, __ret_145, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_145; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlsh_lane_s32(__p0_146, __p1_146, __p2_146, __p3_146) __extension__ ({ \
-  int32x2_t __ret_146; \
-  int32x2_t __s0_146 = __p0_146; \
-  int32x2_t __s1_146 = __p1_146; \
-  int32x2_t __s2_146 = __p2_146; \
-  __ret_146 = vqrdmlsh_s32(__s0_146, __s1_146, splat_lane_s32(__s2_146, __p3_146)); \
-  __ret_146; \
-})
-#else
-#define vqrdmlsh_lane_s32(__p0_147, __p1_147, __p2_147, __p3_147) __extension__ ({ \
-  int32x2_t __ret_147; \
-  int32x2_t __s0_147 = __p0_147; \
-  int32x2_t __s1_147 = __p1_147; \
-  int32x2_t __s2_147 = __p2_147; \
-  int32x2_t __rev0_147;  __rev0_147 = __builtin_shufflevector(__s0_147, __s0_147, 1, 0); \
-  int32x2_t __rev1_147;  __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \
-  int32x2_t __rev2_147;  __rev2_147 = __builtin_shufflevector(__s2_147, __s2_147, 1, 0); \
-  __ret_147 = __noswap_vqrdmlsh_s32(__rev0_147, __rev1_147, __noswap_splat_lane_s32(__rev2_147, __p3_147)); \
-  __ret_147 = __builtin_shufflevector(__ret_147, __ret_147, 1, 0); \
-  __ret_147; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlsh_lane_s16(__p0_148, __p1_148, __p2_148, __p3_148) __extension__ ({ \
-  int16x4_t __ret_148; \
-  int16x4_t __s0_148 = __p0_148; \
-  int16x4_t __s1_148 = __p1_148; \
-  int16x4_t __s2_148 = __p2_148; \
-  __ret_148 = vqrdmlsh_s16(__s0_148, __s1_148, splat_lane_s16(__s2_148, __p3_148)); \
-  __ret_148; \
-})
-#else
-#define vqrdmlsh_lane_s16(__p0_149, __p1_149, __p2_149, __p3_149) __extension__ ({ \
-  int16x4_t __ret_149; \
-  int16x4_t __s0_149 = __p0_149; \
-  int16x4_t __s1_149 = __p1_149; \
-  int16x4_t __s2_149 = __p2_149; \
-  int16x4_t __rev0_149;  __rev0_149 = __builtin_shufflevector(__s0_149, __s0_149, 3, 2, 1, 0); \
-  int16x4_t __rev1_149;  __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \
-  int16x4_t __rev2_149;  __rev2_149 = __builtin_shufflevector(__s2_149, __s2_149, 3, 2, 1, 0); \
-  __ret_149 = __noswap_vqrdmlsh_s16(__rev0_149, __rev1_149, __noswap_splat_lane_s16(__rev2_149, __p3_149)); \
-  __ret_149 = __builtin_shufflevector(__ret_149, __ret_149, 3, 2, 1, 0); \
-  __ret_149; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcadd_rot270_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcadd_rot270_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcadd_rot270_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcadd_rot270_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcadd_rot90_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcadd_rot90_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcadd_rot90_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcadd_rot90_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcaddq_rot270_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcaddq_rot270_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcaddq_rot270_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcaddq_rot270_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcaddq_rot90_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcaddq_rot90_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcaddq_rot90_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcaddq_rot90_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t __noswap_vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcmla_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_lane_f16(__p0_150, __p1_150, __p2_150, __p3_150) __extension__ ({ \
-  float16x4_t __ret_150; \
-  float16x4_t __s0_150 = __p0_150; \
-  float16x4_t __s1_150 = __p1_150; \
-  float16x4_t __s2_150 = __p2_150; \
-float16x4_t __reint_150 = __s2_150; \
-uint32x2_t __reint1_150 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_150, __p3_150), vget_lane_u32(*(uint32x2_t *) &__reint_150, __p3_150)}; \
-  __ret_150 = vcmla_f16(__s0_150, __s1_150, *(float16x4_t *) &__reint1_150); \
-  __ret_150; \
-})
-#else
-#define vcmla_lane_f16(__p0_151, __p1_151, __p2_151, __p3_151) __extension__ ({ \
-  float16x4_t __ret_151; \
-  float16x4_t __s0_151 = __p0_151; \
-  float16x4_t __s1_151 = __p1_151; \
-  float16x4_t __s2_151 = __p2_151; \
-  float16x4_t __rev0_151;  __rev0_151 = __builtin_shufflevector(__s0_151, __s0_151, 3, 2, 1, 0); \
-  float16x4_t __rev1_151;  __rev1_151 = __builtin_shufflevector(__s1_151, __s1_151, 3, 2, 1, 0); \
-  float16x4_t __rev2_151;  __rev2_151 = __builtin_shufflevector(__s2_151, __s2_151, 3, 2, 1, 0); \
-float16x4_t __reint_151 = __rev2_151; \
-uint32x2_t __reint1_151 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_151, __p3_151), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_151, __p3_151)}; \
-  __ret_151 = __noswap_vcmla_f16(__rev0_151, __rev1_151, *(float16x4_t *) &__reint1_151); \
-  __ret_151 = __builtin_shufflevector(__ret_151, __ret_151, 3, 2, 1, 0); \
-  __ret_151; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_lane_f16(__p0_152, __p1_152, __p2_152, __p3_152) __extension__ ({ \
-  float16x8_t __ret_152; \
-  float16x8_t __s0_152 = __p0_152; \
-  float16x8_t __s1_152 = __p1_152; \
-  float16x4_t __s2_152 = __p2_152; \
-float16x4_t __reint_152 = __s2_152; \
-uint32x4_t __reint1_152 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152), vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152), vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152), vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152)}; \
-  __ret_152 = vcmlaq_f16(__s0_152, __s1_152, *(float16x8_t *) &__reint1_152); \
-  __ret_152; \
-})
-#else
-#define vcmlaq_lane_f16(__p0_153, __p1_153, __p2_153, __p3_153) __extension__ ({ \
-  float16x8_t __ret_153; \
-  float16x8_t __s0_153 = __p0_153; \
-  float16x8_t __s1_153 = __p1_153; \
-  float16x4_t __s2_153 = __p2_153; \
-  float16x8_t __rev0_153;  __rev0_153 = __builtin_shufflevector(__s0_153, __s0_153, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_153;  __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_153;  __rev2_153 = __builtin_shufflevector(__s2_153, __s2_153, 3, 2, 1, 0); \
-float16x4_t __reint_153 = __rev2_153; \
-uint32x4_t __reint1_153 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153)}; \
-  __ret_153 = __noswap_vcmlaq_f16(__rev0_153, __rev1_153, *(float16x8_t *) &__reint1_153); \
-  __ret_153 = __builtin_shufflevector(__ret_153, __ret_153, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_153; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_laneq_f16(__p0_154, __p1_154, __p2_154, __p3_154) __extension__ ({ \
-  float16x4_t __ret_154; \
-  float16x4_t __s0_154 = __p0_154; \
-  float16x4_t __s1_154 = __p1_154; \
-  float16x8_t __s2_154 = __p2_154; \
-float16x8_t __reint_154 = __s2_154; \
-uint32x2_t __reint1_154 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_154, __p3_154), vgetq_lane_u32(*(uint32x4_t *) &__reint_154, __p3_154)}; \
-  __ret_154 = vcmla_f16(__s0_154, __s1_154, *(float16x4_t *) &__reint1_154); \
-  __ret_154; \
-})
-#else
-#define vcmla_laneq_f16(__p0_155, __p1_155, __p2_155, __p3_155) __extension__ ({ \
-  float16x4_t __ret_155; \
-  float16x4_t __s0_155 = __p0_155; \
-  float16x4_t __s1_155 = __p1_155; \
-  float16x8_t __s2_155 = __p2_155; \
-  float16x4_t __rev0_155;  __rev0_155 = __builtin_shufflevector(__s0_155, __s0_155, 3, 2, 1, 0); \
-  float16x4_t __rev1_155;  __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 3, 2, 1, 0); \
-  float16x8_t __rev2_155;  __rev2_155 = __builtin_shufflevector(__s2_155, __s2_155, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_155 = __rev2_155; \
-uint32x2_t __reint1_155 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_155, __p3_155), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_155, __p3_155)}; \
-  __ret_155 = __noswap_vcmla_f16(__rev0_155, __rev1_155, *(float16x4_t *) &__reint1_155); \
-  __ret_155 = __builtin_shufflevector(__ret_155, __ret_155, 3, 2, 1, 0); \
-  __ret_155; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_laneq_f16(__p0_156, __p1_156, __p2_156, __p3_156) __extension__ ({ \
-  float16x8_t __ret_156; \
-  float16x8_t __s0_156 = __p0_156; \
-  float16x8_t __s1_156 = __p1_156; \
-  float16x8_t __s2_156 = __p2_156; \
-float16x8_t __reint_156 = __s2_156; \
-uint32x4_t __reint1_156 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156), vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156), vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156), vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156)}; \
-  __ret_156 = vcmlaq_f16(__s0_156, __s1_156, *(float16x8_t *) &__reint1_156); \
-  __ret_156; \
-})
-#else
-#define vcmlaq_laneq_f16(__p0_157, __p1_157, __p2_157, __p3_157) __extension__ ({ \
-  float16x8_t __ret_157; \
-  float16x8_t __s0_157 = __p0_157; \
-  float16x8_t __s1_157 = __p1_157; \
-  float16x8_t __s2_157 = __p2_157; \
-  float16x8_t __rev0_157;  __rev0_157 = __builtin_shufflevector(__s0_157, __s0_157, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_157;  __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_157;  __rev2_157 = __builtin_shufflevector(__s2_157, __s2_157, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_157 = __rev2_157; \
-uint32x4_t __reint1_157 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157)}; \
-  __ret_157 = __noswap_vcmlaq_f16(__rev0_157, __rev1_157, *(float16x8_t *) &__reint1_157); \
-  __ret_157 = __builtin_shufflevector(__ret_157, __ret_157, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_157; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t __noswap_vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot180_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot180_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot180_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot180_lane_f16(__p0_158, __p1_158, __p2_158, __p3_158) __extension__ ({ \
-  float16x4_t __ret_158; \
-  float16x4_t __s0_158 = __p0_158; \
-  float16x4_t __s1_158 = __p1_158; \
-  float16x4_t __s2_158 = __p2_158; \
-float16x4_t __reint_158 = __s2_158; \
-uint32x2_t __reint1_158 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_158, __p3_158), vget_lane_u32(*(uint32x2_t *) &__reint_158, __p3_158)}; \
-  __ret_158 = vcmla_rot180_f16(__s0_158, __s1_158, *(float16x4_t *) &__reint1_158); \
-  __ret_158; \
-})
-#else
-#define vcmla_rot180_lane_f16(__p0_159, __p1_159, __p2_159, __p3_159) __extension__ ({ \
-  float16x4_t __ret_159; \
-  float16x4_t __s0_159 = __p0_159; \
-  float16x4_t __s1_159 = __p1_159; \
-  float16x4_t __s2_159 = __p2_159; \
-  float16x4_t __rev0_159;  __rev0_159 = __builtin_shufflevector(__s0_159, __s0_159, 3, 2, 1, 0); \
-  float16x4_t __rev1_159;  __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 3, 2, 1, 0); \
-  float16x4_t __rev2_159;  __rev2_159 = __builtin_shufflevector(__s2_159, __s2_159, 3, 2, 1, 0); \
-float16x4_t __reint_159 = __rev2_159; \
-uint32x2_t __reint1_159 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_159, __p3_159), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_159, __p3_159)}; \
-  __ret_159 = __noswap_vcmla_rot180_f16(__rev0_159, __rev1_159, *(float16x4_t *) &__reint1_159); \
-  __ret_159 = __builtin_shufflevector(__ret_159, __ret_159, 3, 2, 1, 0); \
-  __ret_159; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot180_lane_f16(__p0_160, __p1_160, __p2_160, __p3_160) __extension__ ({ \
-  float16x8_t __ret_160; \
-  float16x8_t __s0_160 = __p0_160; \
-  float16x8_t __s1_160 = __p1_160; \
-  float16x4_t __s2_160 = __p2_160; \
-float16x4_t __reint_160 = __s2_160; \
-uint32x4_t __reint1_160 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160), vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160), vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160), vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160)}; \
-  __ret_160 = vcmlaq_rot180_f16(__s0_160, __s1_160, *(float16x8_t *) &__reint1_160); \
-  __ret_160; \
-})
-#else
-#define vcmlaq_rot180_lane_f16(__p0_161, __p1_161, __p2_161, __p3_161) __extension__ ({ \
-  float16x8_t __ret_161; \
-  float16x8_t __s0_161 = __p0_161; \
-  float16x8_t __s1_161 = __p1_161; \
-  float16x4_t __s2_161 = __p2_161; \
-  float16x8_t __rev0_161;  __rev0_161 = __builtin_shufflevector(__s0_161, __s0_161, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_161;  __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_161;  __rev2_161 = __builtin_shufflevector(__s2_161, __s2_161, 3, 2, 1, 0); \
-float16x4_t __reint_161 = __rev2_161; \
-uint32x4_t __reint1_161 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161)}; \
-  __ret_161 = __noswap_vcmlaq_rot180_f16(__rev0_161, __rev1_161, *(float16x8_t *) &__reint1_161); \
-  __ret_161 = __builtin_shufflevector(__ret_161, __ret_161, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_161; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot180_laneq_f16(__p0_162, __p1_162, __p2_162, __p3_162) __extension__ ({ \
-  float16x4_t __ret_162; \
-  float16x4_t __s0_162 = __p0_162; \
-  float16x4_t __s1_162 = __p1_162; \
-  float16x8_t __s2_162 = __p2_162; \
-float16x8_t __reint_162 = __s2_162; \
-uint32x2_t __reint1_162 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_162, __p3_162), vgetq_lane_u32(*(uint32x4_t *) &__reint_162, __p3_162)}; \
-  __ret_162 = vcmla_rot180_f16(__s0_162, __s1_162, *(float16x4_t *) &__reint1_162); \
-  __ret_162; \
-})
-#else
-#define vcmla_rot180_laneq_f16(__p0_163, __p1_163, __p2_163, __p3_163) __extension__ ({ \
-  float16x4_t __ret_163; \
-  float16x4_t __s0_163 = __p0_163; \
-  float16x4_t __s1_163 = __p1_163; \
-  float16x8_t __s2_163 = __p2_163; \
-  float16x4_t __rev0_163;  __rev0_163 = __builtin_shufflevector(__s0_163, __s0_163, 3, 2, 1, 0); \
-  float16x4_t __rev1_163;  __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 3, 2, 1, 0); \
-  float16x8_t __rev2_163;  __rev2_163 = __builtin_shufflevector(__s2_163, __s2_163, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_163 = __rev2_163; \
-uint32x2_t __reint1_163 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_163, __p3_163), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_163, __p3_163)}; \
-  __ret_163 = __noswap_vcmla_rot180_f16(__rev0_163, __rev1_163, *(float16x4_t *) &__reint1_163); \
-  __ret_163 = __builtin_shufflevector(__ret_163, __ret_163, 3, 2, 1, 0); \
-  __ret_163; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot180_laneq_f16(__p0_164, __p1_164, __p2_164, __p3_164) __extension__ ({ \
-  float16x8_t __ret_164; \
-  float16x8_t __s0_164 = __p0_164; \
-  float16x8_t __s1_164 = __p1_164; \
-  float16x8_t __s2_164 = __p2_164; \
-float16x8_t __reint_164 = __s2_164; \
-uint32x4_t __reint1_164 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164), vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164), vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164), vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164)}; \
-  __ret_164 = vcmlaq_rot180_f16(__s0_164, __s1_164, *(float16x8_t *) &__reint1_164); \
-  __ret_164; \
-})
-#else
-#define vcmlaq_rot180_laneq_f16(__p0_165, __p1_165, __p2_165, __p3_165) __extension__ ({ \
-  float16x8_t __ret_165; \
-  float16x8_t __s0_165 = __p0_165; \
-  float16x8_t __s1_165 = __p1_165; \
-  float16x8_t __s2_165 = __p2_165; \
-  float16x8_t __rev0_165;  __rev0_165 = __builtin_shufflevector(__s0_165, __s0_165, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_165;  __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_165;  __rev2_165 = __builtin_shufflevector(__s2_165, __s2_165, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_165 = __rev2_165; \
-uint32x4_t __reint1_165 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165)}; \
-  __ret_165 = __noswap_vcmlaq_rot180_f16(__rev0_165, __rev1_165, *(float16x8_t *) &__reint1_165); \
-  __ret_165 = __builtin_shufflevector(__ret_165, __ret_165, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_165; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t __noswap_vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot270_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot270_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot270_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot270_lane_f16(__p0_166, __p1_166, __p2_166, __p3_166) __extension__ ({ \
-  float16x4_t __ret_166; \
-  float16x4_t __s0_166 = __p0_166; \
-  float16x4_t __s1_166 = __p1_166; \
-  float16x4_t __s2_166 = __p2_166; \
-float16x4_t __reint_166 = __s2_166; \
-uint32x2_t __reint1_166 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_166, __p3_166), vget_lane_u32(*(uint32x2_t *) &__reint_166, __p3_166)}; \
-  __ret_166 = vcmla_rot270_f16(__s0_166, __s1_166, *(float16x4_t *) &__reint1_166); \
-  __ret_166; \
-})
-#else
-#define vcmla_rot270_lane_f16(__p0_167, __p1_167, __p2_167, __p3_167) __extension__ ({ \
-  float16x4_t __ret_167; \
-  float16x4_t __s0_167 = __p0_167; \
-  float16x4_t __s1_167 = __p1_167; \
-  float16x4_t __s2_167 = __p2_167; \
-  float16x4_t __rev0_167;  __rev0_167 = __builtin_shufflevector(__s0_167, __s0_167, 3, 2, 1, 0); \
-  float16x4_t __rev1_167;  __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 3, 2, 1, 0); \
-  float16x4_t __rev2_167;  __rev2_167 = __builtin_shufflevector(__s2_167, __s2_167, 3, 2, 1, 0); \
-float16x4_t __reint_167 = __rev2_167; \
-uint32x2_t __reint1_167 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_167, __p3_167), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_167, __p3_167)}; \
-  __ret_167 = __noswap_vcmla_rot270_f16(__rev0_167, __rev1_167, *(float16x4_t *) &__reint1_167); \
-  __ret_167 = __builtin_shufflevector(__ret_167, __ret_167, 3, 2, 1, 0); \
-  __ret_167; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot270_lane_f16(__p0_168, __p1_168, __p2_168, __p3_168) __extension__ ({ \
-  float16x8_t __ret_168; \
-  float16x8_t __s0_168 = __p0_168; \
-  float16x8_t __s1_168 = __p1_168; \
-  float16x4_t __s2_168 = __p2_168; \
-float16x4_t __reint_168 = __s2_168; \
-uint32x4_t __reint1_168 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168), vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168), vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168), vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168)}; \
-  __ret_168 = vcmlaq_rot270_f16(__s0_168, __s1_168, *(float16x8_t *) &__reint1_168); \
-  __ret_168; \
-})
-#else
-#define vcmlaq_rot270_lane_f16(__p0_169, __p1_169, __p2_169, __p3_169) __extension__ ({ \
-  float16x8_t __ret_169; \
-  float16x8_t __s0_169 = __p0_169; \
-  float16x8_t __s1_169 = __p1_169; \
-  float16x4_t __s2_169 = __p2_169; \
-  float16x8_t __rev0_169;  __rev0_169 = __builtin_shufflevector(__s0_169, __s0_169, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_169;  __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_169;  __rev2_169 = __builtin_shufflevector(__s2_169, __s2_169, 3, 2, 1, 0); \
-float16x4_t __reint_169 = __rev2_169; \
-uint32x4_t __reint1_169 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169)}; \
-  __ret_169 = __noswap_vcmlaq_rot270_f16(__rev0_169, __rev1_169, *(float16x8_t *) &__reint1_169); \
-  __ret_169 = __builtin_shufflevector(__ret_169, __ret_169, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_169; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot270_laneq_f16(__p0_170, __p1_170, __p2_170, __p3_170) __extension__ ({ \
-  float16x4_t __ret_170; \
-  float16x4_t __s0_170 = __p0_170; \
-  float16x4_t __s1_170 = __p1_170; \
-  float16x8_t __s2_170 = __p2_170; \
-float16x8_t __reint_170 = __s2_170; \
-uint32x2_t __reint1_170 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_170, __p3_170), vgetq_lane_u32(*(uint32x4_t *) &__reint_170, __p3_170)}; \
-  __ret_170 = vcmla_rot270_f16(__s0_170, __s1_170, *(float16x4_t *) &__reint1_170); \
-  __ret_170; \
-})
-#else
-#define vcmla_rot270_laneq_f16(__p0_171, __p1_171, __p2_171, __p3_171) __extension__ ({ \
-  float16x4_t __ret_171; \
-  float16x4_t __s0_171 = __p0_171; \
-  float16x4_t __s1_171 = __p1_171; \
-  float16x8_t __s2_171 = __p2_171; \
-  float16x4_t __rev0_171;  __rev0_171 = __builtin_shufflevector(__s0_171, __s0_171, 3, 2, 1, 0); \
-  float16x4_t __rev1_171;  __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \
-  float16x8_t __rev2_171;  __rev2_171 = __builtin_shufflevector(__s2_171, __s2_171, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_171 = __rev2_171; \
-uint32x2_t __reint1_171 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_171, __p3_171), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_171, __p3_171)}; \
-  __ret_171 = __noswap_vcmla_rot270_f16(__rev0_171, __rev1_171, *(float16x4_t *) &__reint1_171); \
-  __ret_171 = __builtin_shufflevector(__ret_171, __ret_171, 3, 2, 1, 0); \
-  __ret_171; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot270_laneq_f16(__p0_172, __p1_172, __p2_172, __p3_172) __extension__ ({ \
-  float16x8_t __ret_172; \
-  float16x8_t __s0_172 = __p0_172; \
-  float16x8_t __s1_172 = __p1_172; \
-  float16x8_t __s2_172 = __p2_172; \
-float16x8_t __reint_172 = __s2_172; \
-uint32x4_t __reint1_172 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172), vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172), vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172), vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172)}; \
-  __ret_172 = vcmlaq_rot270_f16(__s0_172, __s1_172, *(float16x8_t *) &__reint1_172); \
-  __ret_172; \
-})
-#else
-#define vcmlaq_rot270_laneq_f16(__p0_173, __p1_173, __p2_173, __p3_173) __extension__ ({ \
-  float16x8_t __ret_173; \
-  float16x8_t __s0_173 = __p0_173; \
-  float16x8_t __s1_173 = __p1_173; \
-  float16x8_t __s2_173 = __p2_173; \
-  float16x8_t __rev0_173;  __rev0_173 = __builtin_shufflevector(__s0_173, __s0_173, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_173;  __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_173;  __rev2_173 = __builtin_shufflevector(__s2_173, __s2_173, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_173 = __rev2_173; \
-uint32x4_t __reint1_173 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173)}; \
-  __ret_173 = __noswap_vcmlaq_rot270_f16(__rev0_173, __rev1_173, *(float16x8_t *) &__reint1_173); \
-  __ret_173 = __builtin_shufflevector(__ret_173, __ret_173, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_173; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x8_t __noswap_vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot90_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot90_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcmla_rot90_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot90_lane_f16(__p0_174, __p1_174, __p2_174, __p3_174) __extension__ ({ \
-  float16x4_t __ret_174; \
-  float16x4_t __s0_174 = __p0_174; \
-  float16x4_t __s1_174 = __p1_174; \
-  float16x4_t __s2_174 = __p2_174; \
-float16x4_t __reint_174 = __s2_174; \
-uint32x2_t __reint1_174 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_174, __p3_174), vget_lane_u32(*(uint32x2_t *) &__reint_174, __p3_174)}; \
-  __ret_174 = vcmla_rot90_f16(__s0_174, __s1_174, *(float16x4_t *) &__reint1_174); \
-  __ret_174; \
-})
-#else
-#define vcmla_rot90_lane_f16(__p0_175, __p1_175, __p2_175, __p3_175) __extension__ ({ \
-  float16x4_t __ret_175; \
-  float16x4_t __s0_175 = __p0_175; \
-  float16x4_t __s1_175 = __p1_175; \
-  float16x4_t __s2_175 = __p2_175; \
-  float16x4_t __rev0_175;  __rev0_175 = __builtin_shufflevector(__s0_175, __s0_175, 3, 2, 1, 0); \
-  float16x4_t __rev1_175;  __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 3, 2, 1, 0); \
-  float16x4_t __rev2_175;  __rev2_175 = __builtin_shufflevector(__s2_175, __s2_175, 3, 2, 1, 0); \
-float16x4_t __reint_175 = __rev2_175; \
-uint32x2_t __reint1_175 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_175, __p3_175), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_175, __p3_175)}; \
-  __ret_175 = __noswap_vcmla_rot90_f16(__rev0_175, __rev1_175, *(float16x4_t *) &__reint1_175); \
-  __ret_175 = __builtin_shufflevector(__ret_175, __ret_175, 3, 2, 1, 0); \
-  __ret_175; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot90_lane_f16(__p0_176, __p1_176, __p2_176, __p3_176) __extension__ ({ \
-  float16x8_t __ret_176; \
-  float16x8_t __s0_176 = __p0_176; \
-  float16x8_t __s1_176 = __p1_176; \
-  float16x4_t __s2_176 = __p2_176; \
-float16x4_t __reint_176 = __s2_176; \
-uint32x4_t __reint1_176 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176), vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176), vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176), vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176)}; \
-  __ret_176 = vcmlaq_rot90_f16(__s0_176, __s1_176, *(float16x8_t *) &__reint1_176); \
-  __ret_176; \
-})
-#else
-#define vcmlaq_rot90_lane_f16(__p0_177, __p1_177, __p2_177, __p3_177) __extension__ ({ \
-  float16x8_t __ret_177; \
-  float16x8_t __s0_177 = __p0_177; \
-  float16x8_t __s1_177 = __p1_177; \
-  float16x4_t __s2_177 = __p2_177; \
-  float16x8_t __rev0_177;  __rev0_177 = __builtin_shufflevector(__s0_177, __s0_177, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_177;  __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_177;  __rev2_177 = __builtin_shufflevector(__s2_177, __s2_177, 3, 2, 1, 0); \
-float16x4_t __reint_177 = __rev2_177; \
-uint32x4_t __reint1_177 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177)}; \
-  __ret_177 = __noswap_vcmlaq_rot90_f16(__rev0_177, __rev1_177, *(float16x8_t *) &__reint1_177); \
-  __ret_177 = __builtin_shufflevector(__ret_177, __ret_177, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_177; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot90_laneq_f16(__p0_178, __p1_178, __p2_178, __p3_178) __extension__ ({ \
-  float16x4_t __ret_178; \
-  float16x4_t __s0_178 = __p0_178; \
-  float16x4_t __s1_178 = __p1_178; \
-  float16x8_t __s2_178 = __p2_178; \
-float16x8_t __reint_178 = __s2_178; \
-uint32x2_t __reint1_178 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_178, __p3_178), vgetq_lane_u32(*(uint32x4_t *) &__reint_178, __p3_178)}; \
-  __ret_178 = vcmla_rot90_f16(__s0_178, __s1_178, *(float16x4_t *) &__reint1_178); \
-  __ret_178; \
-})
-#else
-#define vcmla_rot90_laneq_f16(__p0_179, __p1_179, __p2_179, __p3_179) __extension__ ({ \
-  float16x4_t __ret_179; \
-  float16x4_t __s0_179 = __p0_179; \
-  float16x4_t __s1_179 = __p1_179; \
-  float16x8_t __s2_179 = __p2_179; \
-  float16x4_t __rev0_179;  __rev0_179 = __builtin_shufflevector(__s0_179, __s0_179, 3, 2, 1, 0); \
-  float16x4_t __rev1_179;  __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 3, 2, 1, 0); \
-  float16x8_t __rev2_179;  __rev2_179 = __builtin_shufflevector(__s2_179, __s2_179, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_179 = __rev2_179; \
-uint32x2_t __reint1_179 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_179, __p3_179), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_179, __p3_179)}; \
-  __ret_179 = __noswap_vcmla_rot90_f16(__rev0_179, __rev1_179, *(float16x4_t *) &__reint1_179); \
-  __ret_179 = __builtin_shufflevector(__ret_179, __ret_179, 3, 2, 1, 0); \
-  __ret_179; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot90_laneq_f16(__p0_180, __p1_180, __p2_180, __p3_180) __extension__ ({ \
-  float16x8_t __ret_180; \
-  float16x8_t __s0_180 = __p0_180; \
-  float16x8_t __s1_180 = __p1_180; \
-  float16x8_t __s2_180 = __p2_180; \
-float16x8_t __reint_180 = __s2_180; \
-uint32x4_t __reint1_180 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180), vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180), vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180), vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180)}; \
-  __ret_180 = vcmlaq_rot90_f16(__s0_180, __s1_180, *(float16x8_t *) &__reint1_180); \
-  __ret_180; \
-})
-#else
-#define vcmlaq_rot90_laneq_f16(__p0_181, __p1_181, __p2_181, __p3_181) __extension__ ({ \
-  float16x8_t __ret_181; \
-  float16x8_t __s0_181 = __p0_181; \
-  float16x8_t __s1_181 = __p1_181; \
-  float16x8_t __s2_181 = __p2_181; \
-  float16x8_t __rev0_181;  __rev0_181 = __builtin_shufflevector(__s0_181, __s0_181, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_181;  __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_181;  __rev2_181 = __builtin_shufflevector(__s2_181, __s2_181, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_181 = __rev2_181; \
-uint32x4_t __reint1_181 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181)}; \
-  __ret_181 = __noswap_vcmlaq_rot90_f16(__rev0_181, __rev1_181, *(float16x8_t *) &__reint1_181); \
-  __ret_181 = __builtin_shufflevector(__ret_181, __ret_181, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_181; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcadd_rot270_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcadd_rot270_f32((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcadd_rot270_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcadd_rot270_f32((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcadd_rot90_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcadd_rot90_f32((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcadd_rot90_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcadd_rot90_f32((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcaddq_rot270_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcaddq_rot270_f32((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcaddq_rot270_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcaddq_rot270_f32((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcaddq_rot90_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcaddq_rot90_f32((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcaddq_rot90_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcaddq_rot90_f32((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t __noswap_vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcmla_f32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_lane_f32(__p0_182, __p1_182, __p2_182, __p3_182) __extension__ ({ \
-  float32x2_t __ret_182; \
-  float32x2_t __s0_182 = __p0_182; \
-  float32x2_t __s1_182 = __p1_182; \
-  float32x2_t __s2_182 = __p2_182; \
-float32x2_t __reint_182 = __s2_182; \
-uint64x1_t __reint1_182 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_182, __p3_182)}; \
-  __ret_182 = vcmla_f32(__s0_182, __s1_182, *(float32x2_t *) &__reint1_182); \
-  __ret_182; \
-})
-#else
-#define vcmla_lane_f32(__p0_183, __p1_183, __p2_183, __p3_183) __extension__ ({ \
-  float32x2_t __ret_183; \
-  float32x2_t __s0_183 = __p0_183; \
-  float32x2_t __s1_183 = __p1_183; \
-  float32x2_t __s2_183 = __p2_183; \
-  float32x2_t __rev0_183;  __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 1, 0); \
-  float32x2_t __rev1_183;  __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 1, 0); \
-  float32x2_t __rev2_183;  __rev2_183 = __builtin_shufflevector(__s2_183, __s2_183, 1, 0); \
-float32x2_t __reint_183 = __rev2_183; \
-uint64x1_t __reint1_183 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_183, __p3_183)}; \
-  __ret_183 = __noswap_vcmla_f32(__rev0_183, __rev1_183, *(float32x2_t *) &__reint1_183); \
-  __ret_183 = __builtin_shufflevector(__ret_183, __ret_183, 1, 0); \
-  __ret_183; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_lane_f32(__p0_184, __p1_184, __p2_184, __p3_184) __extension__ ({ \
-  float32x4_t __ret_184; \
-  float32x4_t __s0_184 = __p0_184; \
-  float32x4_t __s1_184 = __p1_184; \
-  float32x2_t __s2_184 = __p2_184; \
-float32x2_t __reint_184 = __s2_184; \
-uint64x2_t __reint1_184 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_184, __p3_184), vget_lane_u64(*(uint64x1_t *) &__reint_184, __p3_184)}; \
-  __ret_184 = vcmlaq_f32(__s0_184, __s1_184, *(float32x4_t *) &__reint1_184); \
-  __ret_184; \
-})
-#else
-#define vcmlaq_lane_f32(__p0_185, __p1_185, __p2_185, __p3_185) __extension__ ({ \
-  float32x4_t __ret_185; \
-  float32x4_t __s0_185 = __p0_185; \
-  float32x4_t __s1_185 = __p1_185; \
-  float32x2_t __s2_185 = __p2_185; \
-  float32x4_t __rev0_185;  __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 3, 2, 1, 0); \
-  float32x4_t __rev1_185;  __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 3, 2, 1, 0); \
-  float32x2_t __rev2_185;  __rev2_185 = __builtin_shufflevector(__s2_185, __s2_185, 1, 0); \
-float32x2_t __reint_185 = __rev2_185; \
-uint64x2_t __reint1_185 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_185, __p3_185), vget_lane_u64(*(uint64x1_t *) &__reint_185, __p3_185)}; \
-  __ret_185 = __noswap_vcmlaq_f32(__rev0_185, __rev1_185, *(float32x4_t *) &__reint1_185); \
-  __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \
-  __ret_185; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_laneq_f32(__p0_186, __p1_186, __p2_186, __p3_186) __extension__ ({ \
-  float32x2_t __ret_186; \
-  float32x2_t __s0_186 = __p0_186; \
-  float32x2_t __s1_186 = __p1_186; \
-  float32x4_t __s2_186 = __p2_186; \
-float32x4_t __reint_186 = __s2_186; \
-uint64x1_t __reint1_186 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_186, __p3_186)}; \
-  __ret_186 = vcmla_f32(__s0_186, __s1_186, *(float32x2_t *) &__reint1_186); \
-  __ret_186; \
-})
-#else
-#define vcmla_laneq_f32(__p0_187, __p1_187, __p2_187, __p3_187) __extension__ ({ \
-  float32x2_t __ret_187; \
-  float32x2_t __s0_187 = __p0_187; \
-  float32x2_t __s1_187 = __p1_187; \
-  float32x4_t __s2_187 = __p2_187; \
-  float32x2_t __rev0_187;  __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 1, 0); \
-  float32x2_t __rev1_187;  __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 1, 0); \
-  float32x4_t __rev2_187;  __rev2_187 = __builtin_shufflevector(__s2_187, __s2_187, 3, 2, 1, 0); \
-float32x4_t __reint_187 = __rev2_187; \
-uint64x1_t __reint1_187 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_187, __p3_187)}; \
-  __ret_187 = __noswap_vcmla_f32(__rev0_187, __rev1_187, *(float32x2_t *) &__reint1_187); \
-  __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 1, 0); \
-  __ret_187; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_laneq_f32(__p0_188, __p1_188, __p2_188, __p3_188) __extension__ ({ \
-  float32x4_t __ret_188; \
-  float32x4_t __s0_188 = __p0_188; \
-  float32x4_t __s1_188 = __p1_188; \
-  float32x4_t __s2_188 = __p2_188; \
-float32x4_t __reint_188 = __s2_188; \
-uint64x2_t __reint1_188 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_188, __p3_188), vgetq_lane_u64(*(uint64x2_t *) &__reint_188, __p3_188)}; \
-  __ret_188 = vcmlaq_f32(__s0_188, __s1_188, *(float32x4_t *) &__reint1_188); \
-  __ret_188; \
-})
-#else
-#define vcmlaq_laneq_f32(__p0_189, __p1_189, __p2_189, __p3_189) __extension__ ({ \
-  float32x4_t __ret_189; \
-  float32x4_t __s0_189 = __p0_189; \
-  float32x4_t __s1_189 = __p1_189; \
-  float32x4_t __s2_189 = __p2_189; \
-  float32x4_t __rev0_189;  __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \
-  float32x4_t __rev1_189;  __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \
-  float32x4_t __rev2_189;  __rev2_189 = __builtin_shufflevector(__s2_189, __s2_189, 3, 2, 1, 0); \
-float32x4_t __reint_189 = __rev2_189; \
-uint64x2_t __reint1_189 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_189, __p3_189), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_189, __p3_189)}; \
-  __ret_189 = __noswap_vcmlaq_f32(__rev0_189, __rev1_189, *(float32x4_t *) &__reint1_189); \
-  __ret_189 = __builtin_shufflevector(__ret_189, __ret_189, 3, 2, 1, 0); \
-  __ret_189; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t __noswap_vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot180_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot180_f32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot180_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot180_lane_f32(__p0_190, __p1_190, __p2_190, __p3_190) __extension__ ({ \
-  float32x2_t __ret_190; \
-  float32x2_t __s0_190 = __p0_190; \
-  float32x2_t __s1_190 = __p1_190; \
-  float32x2_t __s2_190 = __p2_190; \
-float32x2_t __reint_190 = __s2_190; \
-uint64x1_t __reint1_190 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_190, __p3_190)}; \
-  __ret_190 = vcmla_rot180_f32(__s0_190, __s1_190, *(float32x2_t *) &__reint1_190); \
-  __ret_190; \
-})
-#else
-#define vcmla_rot180_lane_f32(__p0_191, __p1_191, __p2_191, __p3_191) __extension__ ({ \
-  float32x2_t __ret_191; \
-  float32x2_t __s0_191 = __p0_191; \
-  float32x2_t __s1_191 = __p1_191; \
-  float32x2_t __s2_191 = __p2_191; \
-  float32x2_t __rev0_191;  __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \
-  float32x2_t __rev1_191;  __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \
-  float32x2_t __rev2_191;  __rev2_191 = __builtin_shufflevector(__s2_191, __s2_191, 1, 0); \
-float32x2_t __reint_191 = __rev2_191; \
-uint64x1_t __reint1_191 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_191, __p3_191)}; \
-  __ret_191 = __noswap_vcmla_rot180_f32(__rev0_191, __rev1_191, *(float32x2_t *) &__reint1_191); \
-  __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 1, 0); \
-  __ret_191; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot180_lane_f32(__p0_192, __p1_192, __p2_192, __p3_192) __extension__ ({ \
-  float32x4_t __ret_192; \
-  float32x4_t __s0_192 = __p0_192; \
-  float32x4_t __s1_192 = __p1_192; \
-  float32x2_t __s2_192 = __p2_192; \
-float32x2_t __reint_192 = __s2_192; \
-uint64x2_t __reint1_192 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_192, __p3_192), vget_lane_u64(*(uint64x1_t *) &__reint_192, __p3_192)}; \
-  __ret_192 = vcmlaq_rot180_f32(__s0_192, __s1_192, *(float32x4_t *) &__reint1_192); \
-  __ret_192; \
-})
-#else
-#define vcmlaq_rot180_lane_f32(__p0_193, __p1_193, __p2_193, __p3_193) __extension__ ({ \
-  float32x4_t __ret_193; \
-  float32x4_t __s0_193 = __p0_193; \
-  float32x4_t __s1_193 = __p1_193; \
-  float32x2_t __s2_193 = __p2_193; \
-  float32x4_t __rev0_193;  __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 3, 2, 1, 0); \
-  float32x4_t __rev1_193;  __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 3, 2, 1, 0); \
-  float32x2_t __rev2_193;  __rev2_193 = __builtin_shufflevector(__s2_193, __s2_193, 1, 0); \
-float32x2_t __reint_193 = __rev2_193; \
-uint64x2_t __reint1_193 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_193, __p3_193), vget_lane_u64(*(uint64x1_t *) &__reint_193, __p3_193)}; \
-  __ret_193 = __noswap_vcmlaq_rot180_f32(__rev0_193, __rev1_193, *(float32x4_t *) &__reint1_193); \
-  __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 3, 2, 1, 0); \
-  __ret_193; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot180_laneq_f32(__p0_194, __p1_194, __p2_194, __p3_194) __extension__ ({ \
-  float32x2_t __ret_194; \
-  float32x2_t __s0_194 = __p0_194; \
-  float32x2_t __s1_194 = __p1_194; \
-  float32x4_t __s2_194 = __p2_194; \
-float32x4_t __reint_194 = __s2_194; \
-uint64x1_t __reint1_194 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_194, __p3_194)}; \
-  __ret_194 = vcmla_rot180_f32(__s0_194, __s1_194, *(float32x2_t *) &__reint1_194); \
-  __ret_194; \
-})
-#else
-#define vcmla_rot180_laneq_f32(__p0_195, __p1_195, __p2_195, __p3_195) __extension__ ({ \
-  float32x2_t __ret_195; \
-  float32x2_t __s0_195 = __p0_195; \
-  float32x2_t __s1_195 = __p1_195; \
-  float32x4_t __s2_195 = __p2_195; \
-  float32x2_t __rev0_195;  __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 1, 0); \
-  float32x2_t __rev1_195;  __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 1, 0); \
-  float32x4_t __rev2_195;  __rev2_195 = __builtin_shufflevector(__s2_195, __s2_195, 3, 2, 1, 0); \
-float32x4_t __reint_195 = __rev2_195; \
-uint64x1_t __reint1_195 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_195, __p3_195)}; \
-  __ret_195 = __noswap_vcmla_rot180_f32(__rev0_195, __rev1_195, *(float32x2_t *) &__reint1_195); \
-  __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 1, 0); \
-  __ret_195; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot180_laneq_f32(__p0_196, __p1_196, __p2_196, __p3_196) __extension__ ({ \
-  float32x4_t __ret_196; \
-  float32x4_t __s0_196 = __p0_196; \
-  float32x4_t __s1_196 = __p1_196; \
-  float32x4_t __s2_196 = __p2_196; \
-float32x4_t __reint_196 = __s2_196; \
-uint64x2_t __reint1_196 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_196, __p3_196), vgetq_lane_u64(*(uint64x2_t *) &__reint_196, __p3_196)}; \
-  __ret_196 = vcmlaq_rot180_f32(__s0_196, __s1_196, *(float32x4_t *) &__reint1_196); \
-  __ret_196; \
-})
-#else
-#define vcmlaq_rot180_laneq_f32(__p0_197, __p1_197, __p2_197, __p3_197) __extension__ ({ \
-  float32x4_t __ret_197; \
-  float32x4_t __s0_197 = __p0_197; \
-  float32x4_t __s1_197 = __p1_197; \
-  float32x4_t __s2_197 = __p2_197; \
-  float32x4_t __rev0_197;  __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 3, 2, 1, 0); \
-  float32x4_t __rev1_197;  __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 3, 2, 1, 0); \
-  float32x4_t __rev2_197;  __rev2_197 = __builtin_shufflevector(__s2_197, __s2_197, 3, 2, 1, 0); \
-float32x4_t __reint_197 = __rev2_197; \
-uint64x2_t __reint1_197 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_197, __p3_197), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_197, __p3_197)}; \
-  __ret_197 = __noswap_vcmlaq_rot180_f32(__rev0_197, __rev1_197, *(float32x4_t *) &__reint1_197); \
-  __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \
-  __ret_197; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t __noswap_vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot270_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot270_f32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot270_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot270_lane_f32(__p0_198, __p1_198, __p2_198, __p3_198) __extension__ ({ \
-  float32x2_t __ret_198; \
-  float32x2_t __s0_198 = __p0_198; \
-  float32x2_t __s1_198 = __p1_198; \
-  float32x2_t __s2_198 = __p2_198; \
-float32x2_t __reint_198 = __s2_198; \
-uint64x1_t __reint1_198 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_198, __p3_198)}; \
-  __ret_198 = vcmla_rot270_f32(__s0_198, __s1_198, *(float32x2_t *) &__reint1_198); \
-  __ret_198; \
-})
-#else
-#define vcmla_rot270_lane_f32(__p0_199, __p1_199, __p2_199, __p3_199) __extension__ ({ \
-  float32x2_t __ret_199; \
-  float32x2_t __s0_199 = __p0_199; \
-  float32x2_t __s1_199 = __p1_199; \
-  float32x2_t __s2_199 = __p2_199; \
-  float32x2_t __rev0_199;  __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 1, 0); \
-  float32x2_t __rev1_199;  __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 1, 0); \
-  float32x2_t __rev2_199;  __rev2_199 = __builtin_shufflevector(__s2_199, __s2_199, 1, 0); \
-float32x2_t __reint_199 = __rev2_199; \
-uint64x1_t __reint1_199 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_199, __p3_199)}; \
-  __ret_199 = __noswap_vcmla_rot270_f32(__rev0_199, __rev1_199, *(float32x2_t *) &__reint1_199); \
-  __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 1, 0); \
-  __ret_199; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot270_lane_f32(__p0_200, __p1_200, __p2_200, __p3_200) __extension__ ({ \
-  float32x4_t __ret_200; \
-  float32x4_t __s0_200 = __p0_200; \
-  float32x4_t __s1_200 = __p1_200; \
-  float32x2_t __s2_200 = __p2_200; \
-float32x2_t __reint_200 = __s2_200; \
-uint64x2_t __reint1_200 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_200, __p3_200), vget_lane_u64(*(uint64x1_t *) &__reint_200, __p3_200)}; \
-  __ret_200 = vcmlaq_rot270_f32(__s0_200, __s1_200, *(float32x4_t *) &__reint1_200); \
-  __ret_200; \
-})
-#else
-#define vcmlaq_rot270_lane_f32(__p0_201, __p1_201, __p2_201, __p3_201) __extension__ ({ \
-  float32x4_t __ret_201; \
-  float32x4_t __s0_201 = __p0_201; \
-  float32x4_t __s1_201 = __p1_201; \
-  float32x2_t __s2_201 = __p2_201; \
-  float32x4_t __rev0_201;  __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \
-  float32x4_t __rev1_201;  __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \
-  float32x2_t __rev2_201;  __rev2_201 = __builtin_shufflevector(__s2_201, __s2_201, 1, 0); \
-float32x2_t __reint_201 = __rev2_201; \
-uint64x2_t __reint1_201 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_201, __p3_201), vget_lane_u64(*(uint64x1_t *) &__reint_201, __p3_201)}; \
-  __ret_201 = __noswap_vcmlaq_rot270_f32(__rev0_201, __rev1_201, *(float32x4_t *) &__reint1_201); \
-  __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 3, 2, 1, 0); \
-  __ret_201; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot270_laneq_f32(__p0_202, __p1_202, __p2_202, __p3_202) __extension__ ({ \
-  float32x2_t __ret_202; \
-  float32x2_t __s0_202 = __p0_202; \
-  float32x2_t __s1_202 = __p1_202; \
-  float32x4_t __s2_202 = __p2_202; \
-float32x4_t __reint_202 = __s2_202; \
-uint64x1_t __reint1_202 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_202, __p3_202)}; \
-  __ret_202 = vcmla_rot270_f32(__s0_202, __s1_202, *(float32x2_t *) &__reint1_202); \
-  __ret_202; \
-})
-#else
-#define vcmla_rot270_laneq_f32(__p0_203, __p1_203, __p2_203, __p3_203) __extension__ ({ \
-  float32x2_t __ret_203; \
-  float32x2_t __s0_203 = __p0_203; \
-  float32x2_t __s1_203 = __p1_203; \
-  float32x4_t __s2_203 = __p2_203; \
-  float32x2_t __rev0_203;  __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \
-  float32x2_t __rev1_203;  __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \
-  float32x4_t __rev2_203;  __rev2_203 = __builtin_shufflevector(__s2_203, __s2_203, 3, 2, 1, 0); \
-float32x4_t __reint_203 = __rev2_203; \
-uint64x1_t __reint1_203 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_203, __p3_203)}; \
-  __ret_203 = __noswap_vcmla_rot270_f32(__rev0_203, __rev1_203, *(float32x2_t *) &__reint1_203); \
-  __ret_203 = __builtin_shufflevector(__ret_203, __ret_203, 1, 0); \
-  __ret_203; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot270_laneq_f32(__p0_204, __p1_204, __p2_204, __p3_204) __extension__ ({ \
-  float32x4_t __ret_204; \
-  float32x4_t __s0_204 = __p0_204; \
-  float32x4_t __s1_204 = __p1_204; \
-  float32x4_t __s2_204 = __p2_204; \
-float32x4_t __reint_204 = __s2_204; \
-uint64x2_t __reint1_204 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_204, __p3_204), vgetq_lane_u64(*(uint64x2_t *) &__reint_204, __p3_204)}; \
-  __ret_204 = vcmlaq_rot270_f32(__s0_204, __s1_204, *(float32x4_t *) &__reint1_204); \
-  __ret_204; \
-})
-#else
-#define vcmlaq_rot270_laneq_f32(__p0_205, __p1_205, __p2_205, __p3_205) __extension__ ({ \
-  float32x4_t __ret_205; \
-  float32x4_t __s0_205 = __p0_205; \
-  float32x4_t __s1_205 = __p1_205; \
-  float32x4_t __s2_205 = __p2_205; \
-  float32x4_t __rev0_205;  __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 3, 2, 1, 0); \
-  float32x4_t __rev1_205;  __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 3, 2, 1, 0); \
-  float32x4_t __rev2_205;  __rev2_205 = __builtin_shufflevector(__s2_205, __s2_205, 3, 2, 1, 0); \
-float32x4_t __reint_205 = __rev2_205; \
-uint64x2_t __reint1_205 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_205, __p3_205), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_205, __p3_205)}; \
-  __ret_205 = __noswap_vcmlaq_rot270_f32(__rev0_205, __rev1_205, *(float32x4_t *) &__reint1_205); \
-  __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 3, 2, 1, 0); \
-  __ret_205; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_f32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x4_t __noswap_vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_f32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot90_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot90_f32((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcmla_rot90_f32((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot90_lane_f32(__p0_206, __p1_206, __p2_206, __p3_206) __extension__ ({ \
-  float32x2_t __ret_206; \
-  float32x2_t __s0_206 = __p0_206; \
-  float32x2_t __s1_206 = __p1_206; \
-  float32x2_t __s2_206 = __p2_206; \
-float32x2_t __reint_206 = __s2_206; \
-uint64x1_t __reint1_206 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_206, __p3_206)}; \
-  __ret_206 = vcmla_rot90_f32(__s0_206, __s1_206, *(float32x2_t *) &__reint1_206); \
-  __ret_206; \
-})
-#else
-#define vcmla_rot90_lane_f32(__p0_207, __p1_207, __p2_207, __p3_207) __extension__ ({ \
-  float32x2_t __ret_207; \
-  float32x2_t __s0_207 = __p0_207; \
-  float32x2_t __s1_207 = __p1_207; \
-  float32x2_t __s2_207 = __p2_207; \
-  float32x2_t __rev0_207;  __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 1, 0); \
-  float32x2_t __rev1_207;  __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 1, 0); \
-  float32x2_t __rev2_207;  __rev2_207 = __builtin_shufflevector(__s2_207, __s2_207, 1, 0); \
-float32x2_t __reint_207 = __rev2_207; \
-uint64x1_t __reint1_207 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_207, __p3_207)}; \
-  __ret_207 = __noswap_vcmla_rot90_f32(__rev0_207, __rev1_207, *(float32x2_t *) &__reint1_207); \
-  __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 1, 0); \
-  __ret_207; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot90_lane_f32(__p0_208, __p1_208, __p2_208, __p3_208) __extension__ ({ \
-  float32x4_t __ret_208; \
-  float32x4_t __s0_208 = __p0_208; \
-  float32x4_t __s1_208 = __p1_208; \
-  float32x2_t __s2_208 = __p2_208; \
-float32x2_t __reint_208 = __s2_208; \
-uint64x2_t __reint1_208 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_208, __p3_208), vget_lane_u64(*(uint64x1_t *) &__reint_208, __p3_208)}; \
-  __ret_208 = vcmlaq_rot90_f32(__s0_208, __s1_208, *(float32x4_t *) &__reint1_208); \
-  __ret_208; \
-})
-#else
-#define vcmlaq_rot90_lane_f32(__p0_209, __p1_209, __p2_209, __p3_209) __extension__ ({ \
-  float32x4_t __ret_209; \
-  float32x4_t __s0_209 = __p0_209; \
-  float32x4_t __s1_209 = __p1_209; \
-  float32x2_t __s2_209 = __p2_209; \
-  float32x4_t __rev0_209;  __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 3, 2, 1, 0); \
-  float32x4_t __rev1_209;  __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 3, 2, 1, 0); \
-  float32x2_t __rev2_209;  __rev2_209 = __builtin_shufflevector(__s2_209, __s2_209, 1, 0); \
-float32x2_t __reint_209 = __rev2_209; \
-uint64x2_t __reint1_209 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_209, __p3_209), vget_lane_u64(*(uint64x1_t *) &__reint_209, __p3_209)}; \
-  __ret_209 = __noswap_vcmlaq_rot90_f32(__rev0_209, __rev1_209, *(float32x4_t *) &__reint1_209); \
-  __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \
-  __ret_209; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot90_laneq_f32(__p0_210, __p1_210, __p2_210, __p3_210) __extension__ ({ \
-  float32x2_t __ret_210; \
-  float32x2_t __s0_210 = __p0_210; \
-  float32x2_t __s1_210 = __p1_210; \
-  float32x4_t __s2_210 = __p2_210; \
-float32x4_t __reint_210 = __s2_210; \
-uint64x1_t __reint1_210 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_210, __p3_210)}; \
-  __ret_210 = vcmla_rot90_f32(__s0_210, __s1_210, *(float32x2_t *) &__reint1_210); \
-  __ret_210; \
-})
-#else
-#define vcmla_rot90_laneq_f32(__p0_211, __p1_211, __p2_211, __p3_211) __extension__ ({ \
-  float32x2_t __ret_211; \
-  float32x2_t __s0_211 = __p0_211; \
-  float32x2_t __s1_211 = __p1_211; \
-  float32x4_t __s2_211 = __p2_211; \
-  float32x2_t __rev0_211;  __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 1, 0); \
-  float32x2_t __rev1_211;  __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 1, 0); \
-  float32x4_t __rev2_211;  __rev2_211 = __builtin_shufflevector(__s2_211, __s2_211, 3, 2, 1, 0); \
-float32x4_t __reint_211 = __rev2_211; \
-uint64x1_t __reint1_211 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_211, __p3_211)}; \
-  __ret_211 = __noswap_vcmla_rot90_f32(__rev0_211, __rev1_211, *(float32x2_t *) &__reint1_211); \
-  __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 1, 0); \
-  __ret_211; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot90_laneq_f32(__p0_212, __p1_212, __p2_212, __p3_212) __extension__ ({ \
-  float32x4_t __ret_212; \
-  float32x4_t __s0_212 = __p0_212; \
-  float32x4_t __s1_212 = __p1_212; \
-  float32x4_t __s2_212 = __p2_212; \
-float32x4_t __reint_212 = __s2_212; \
-uint64x2_t __reint1_212 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_212, __p3_212), vgetq_lane_u64(*(uint64x2_t *) &__reint_212, __p3_212)}; \
-  __ret_212 = vcmlaq_rot90_f32(__s0_212, __s1_212, *(float32x4_t *) &__reint1_212); \
-  __ret_212; \
-})
-#else
-#define vcmlaq_rot90_laneq_f32(__p0_213, __p1_213, __p2_213, __p3_213) __extension__ ({ \
-  float32x4_t __ret_213; \
-  float32x4_t __s0_213 = __p0_213; \
-  float32x4_t __s1_213 = __p1_213; \
-  float32x4_t __s2_213 = __p2_213; \
-  float32x4_t __rev0_213;  __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \
-  float32x4_t __rev1_213;  __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \
-  float32x4_t __rev2_213;  __rev2_213 = __builtin_shufflevector(__s2_213, __s2_213, 3, 2, 1, 0); \
-float32x4_t __reint_213 = __rev2_213; \
-uint64x2_t __reint1_213 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_213, __p3_213), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_213, __p3_213)}; \
-  __ret_213 = __noswap_vcmlaq_rot90_f32(__rev0_213, __rev1_213, *(float32x4_t *) &__reint1_213); \
-  __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 3, 2, 1, 0); \
-  __ret_213; \
-})
-#endif
-
-#if !defined(__aarch64__) && !defined(__arm64ec__)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t __a32_vcvt_bf16_f32(float32x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t) __builtin_neon___a32_vcvt_bf16_f32((int8x16_t)__p0, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t __a32_vcvt_bf16_f32(float32x4_t __p0) {
-  bfloat16x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (bfloat16x4_t) __builtin_neon___a32_vcvt_bf16_f32((int8x16_t)__rev0, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t __noswap___a32_vcvt_bf16_f32(float32x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t) __builtin_neon___a32_vcvt_bf16_f32((int8x16_t)__p0, 11);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vcvt_bf16_f32(float32x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = __a32_vcvt_bf16_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vcvt_bf16_f32(float32x4_t __p0) {
-  bfloat16x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap___a32_vcvt_bf16_f32(__rev0);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_high_bf16_f32(bfloat16x8_t __p0, float32x4_t __p1) {
-  bfloat16x8_t __ret;
-  __ret = vcombine_bf16(__a32_vcvt_bf16_f32(__p1), vget_low_bf16(__p0));
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_high_bf16_f32(bfloat16x8_t __p0, float32x4_t __p1) {
-  bfloat16x8_t __ret;
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_bf16(__noswap___a32_vcvt_bf16_f32(__rev1), __noswap_vget_low_bf16(__rev0));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_low_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = vcombine_bf16((bfloat16x4_t)(0ULL), __a32_vcvt_bf16_f32(__p0));
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_low_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_bf16((bfloat16x4_t)(0ULL), __noswap___a32_vcvt_bf16_f32(__rev0));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("bf16,neon"))) poly8x8_t vreinterpret_p8_bf16(bfloat16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly64x1_t vreinterpret_p64_bf16(bfloat16x4_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly16x4_t vreinterpret_p16_bf16(bfloat16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly8x16_t vreinterpretq_p8_bf16(bfloat16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly64x2_t vreinterpretq_p64_bf16(bfloat16x8_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly16x8_t vreinterpretq_p16_bf16(bfloat16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint8x16_t vreinterpretq_u8_bf16(bfloat16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint32x4_t vreinterpretq_u32_bf16(bfloat16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint64x2_t vreinterpretq_u64_bf16(bfloat16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint16x8_t vreinterpretq_u16_bf16(bfloat16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int8x16_t vreinterpretq_s8_bf16(bfloat16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x4_t vreinterpretq_f32_bf16(bfloat16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float16x8_t vreinterpretq_f16_bf16(bfloat16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int32x4_t vreinterpretq_s32_bf16(bfloat16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int64x2_t vreinterpretq_s64_bf16(bfloat16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int16x8_t vreinterpretq_s16_bf16(bfloat16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint8x8_t vreinterpret_u8_bf16(bfloat16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint32x2_t vreinterpret_u32_bf16(bfloat16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint64x1_t vreinterpret_u64_bf16(bfloat16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint16x4_t vreinterpret_u16_bf16(bfloat16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int8x8_t vreinterpret_s8_bf16(bfloat16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x2_t vreinterpret_f32_bf16(bfloat16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float16x4_t vreinterpret_f16_bf16(bfloat16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int32x2_t vreinterpret_s32_bf16(bfloat16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int64x1_t vreinterpret_s64_bf16(bfloat16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int16x4_t vreinterpret_s16_bf16(bfloat16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_p8(poly8x16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_p64(poly64x2_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_p16(poly16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u8(uint8x16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u32(uint32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u64(uint64x2_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u16(uint16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s8(int8x16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_f16(float16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s32(int32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s64(int64x2_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s16(int16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_p8(poly8x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_p64(poly64x1_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_p16(poly16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u8(uint8x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u32(uint32x2_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u64(uint64x1_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u16(uint16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s8(int8x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_f32(float32x2_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_f16(float16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s32(int32x2_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s64(int64x1_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s16(int16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhq_lane_s32(__p0_214, __p1_214, __p2_214) __extension__ ({ \
-  int32x4_t __ret_214; \
-  int32x4_t __s0_214 = __p0_214; \
-  int32x2_t __s1_214 = __p1_214; \
-  __ret_214 = vqdmulhq_s32(__s0_214, splatq_lane_s32(__s1_214, __p2_214)); \
-  __ret_214; \
-})
-#else
-#define vqdmulhq_lane_s32(__p0_215, __p1_215, __p2_215) __extension__ ({ \
-  int32x4_t __ret_215; \
-  int32x4_t __s0_215 = __p0_215; \
-  int32x2_t __s1_215 = __p1_215; \
-  int32x4_t __rev0_215;  __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 3, 2, 1, 0); \
-  int32x2_t __rev1_215;  __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \
-  __ret_215 = __noswap_vqdmulhq_s32(__rev0_215, __noswap_splatq_lane_s32(__rev1_215, __p2_215)); \
-  __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \
-  __ret_215; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhq_lane_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \
-  int16x8_t __ret_216; \
-  int16x8_t __s0_216 = __p0_216; \
-  int16x4_t __s1_216 = __p1_216; \
-  __ret_216 = vqdmulhq_s16(__s0_216, splatq_lane_s16(__s1_216, __p2_216)); \
-  __ret_216; \
-})
-#else
-#define vqdmulhq_lane_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \
-  int16x8_t __ret_217; \
-  int16x8_t __s0_217 = __p0_217; \
-  int16x4_t __s1_217 = __p1_217; \
-  int16x8_t __rev0_217;  __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev1_217;  __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 3, 2, 1, 0); \
-  __ret_217 = __noswap_vqdmulhq_s16(__rev0_217, __noswap_splatq_lane_s16(__rev1_217, __p2_217)); \
-  __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_217; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulh_lane_s32(__p0_218, __p1_218, __p2_218) __extension__ ({ \
-  int32x2_t __ret_218; \
-  int32x2_t __s0_218 = __p0_218; \
-  int32x2_t __s1_218 = __p1_218; \
-  __ret_218 = vqdmulh_s32(__s0_218, splat_lane_s32(__s1_218, __p2_218)); \
-  __ret_218; \
-})
-#else
-#define vqdmulh_lane_s32(__p0_219, __p1_219, __p2_219) __extension__ ({ \
-  int32x2_t __ret_219; \
-  int32x2_t __s0_219 = __p0_219; \
-  int32x2_t __s1_219 = __p1_219; \
-  int32x2_t __rev0_219;  __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 1, 0); \
-  int32x2_t __rev1_219;  __rev1_219 = __builtin_shufflevector(__s1_219, __s1_219, 1, 0); \
-  __ret_219 = __noswap_vqdmulh_s32(__rev0_219, __noswap_splat_lane_s32(__rev1_219, __p2_219)); \
-  __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 1, 0); \
-  __ret_219; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulh_lane_s16(__p0_220, __p1_220, __p2_220) __extension__ ({ \
-  int16x4_t __ret_220; \
-  int16x4_t __s0_220 = __p0_220; \
-  int16x4_t __s1_220 = __p1_220; \
-  __ret_220 = vqdmulh_s16(__s0_220, splat_lane_s16(__s1_220, __p2_220)); \
-  __ret_220; \
-})
-#else
-#define vqdmulh_lane_s16(__p0_221, __p1_221, __p2_221) __extension__ ({ \
-  int16x4_t __ret_221; \
-  int16x4_t __s0_221 = __p0_221; \
-  int16x4_t __s1_221 = __p1_221; \
-  int16x4_t __rev0_221;  __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 3, 2, 1, 0); \
-  int16x4_t __rev1_221;  __rev1_221 = __builtin_shufflevector(__s1_221, __s1_221, 3, 2, 1, 0); \
-  __ret_221 = __noswap_vqdmulh_s16(__rev0_221, __noswap_splat_lane_s16(__rev1_221, __p2_221)); \
-  __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 3, 2, 1, 0); \
-  __ret_221; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhq_lane_s32(__p0_222, __p1_222, __p2_222) __extension__ ({ \
-  int32x4_t __ret_222; \
-  int32x4_t __s0_222 = __p0_222; \
-  int32x2_t __s1_222 = __p1_222; \
-  __ret_222 = vqrdmulhq_s32(__s0_222, splatq_lane_s32(__s1_222, __p2_222)); \
-  __ret_222; \
-})
-#else
-#define vqrdmulhq_lane_s32(__p0_223, __p1_223, __p2_223) __extension__ ({ \
-  int32x4_t __ret_223; \
-  int32x4_t __s0_223 = __p0_223; \
-  int32x2_t __s1_223 = __p1_223; \
-  int32x4_t __rev0_223;  __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 3, 2, 1, 0); \
-  int32x2_t __rev1_223;  __rev1_223 = __builtin_shufflevector(__s1_223, __s1_223, 1, 0); \
-  __ret_223 = __noswap_vqrdmulhq_s32(__rev0_223, __noswap_splatq_lane_s32(__rev1_223, __p2_223)); \
-  __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 3, 2, 1, 0); \
-  __ret_223; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhq_lane_s16(__p0_224, __p1_224, __p2_224) __extension__ ({ \
-  int16x8_t __ret_224; \
-  int16x8_t __s0_224 = __p0_224; \
-  int16x4_t __s1_224 = __p1_224; \
-  __ret_224 = vqrdmulhq_s16(__s0_224, splatq_lane_s16(__s1_224, __p2_224)); \
-  __ret_224; \
-})
-#else
-#define vqrdmulhq_lane_s16(__p0_225, __p1_225, __p2_225) __extension__ ({ \
-  int16x8_t __ret_225; \
-  int16x8_t __s0_225 = __p0_225; \
-  int16x4_t __s1_225 = __p1_225; \
-  int16x8_t __rev0_225;  __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev1_225;  __rev1_225 = __builtin_shufflevector(__s1_225, __s1_225, 3, 2, 1, 0); \
-  __ret_225 = __noswap_vqrdmulhq_s16(__rev0_225, __noswap_splatq_lane_s16(__rev1_225, __p2_225)); \
-  __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_225; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulh_lane_s32(__p0_226, __p1_226, __p2_226) __extension__ ({ \
-  int32x2_t __ret_226; \
-  int32x2_t __s0_226 = __p0_226; \
-  int32x2_t __s1_226 = __p1_226; \
-  __ret_226 = vqrdmulh_s32(__s0_226, splat_lane_s32(__s1_226, __p2_226)); \
-  __ret_226; \
-})
-#else
-#define vqrdmulh_lane_s32(__p0_227, __p1_227, __p2_227) __extension__ ({ \
-  int32x2_t __ret_227; \
-  int32x2_t __s0_227 = __p0_227; \
-  int32x2_t __s1_227 = __p1_227; \
-  int32x2_t __rev0_227;  __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 1, 0); \
-  int32x2_t __rev1_227;  __rev1_227 = __builtin_shufflevector(__s1_227, __s1_227, 1, 0); \
-  __ret_227 = __noswap_vqrdmulh_s32(__rev0_227, __noswap_splat_lane_s32(__rev1_227, __p2_227)); \
-  __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 1, 0); \
-  __ret_227; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulh_lane_s16(__p0_228, __p1_228, __p2_228) __extension__ ({ \
-  int16x4_t __ret_228; \
-  int16x4_t __s0_228 = __p0_228; \
-  int16x4_t __s1_228 = __p1_228; \
-  __ret_228 = vqrdmulh_s16(__s0_228, splat_lane_s16(__s1_228, __p2_228)); \
-  __ret_228; \
-})
-#else
-#define vqrdmulh_lane_s16(__p0_229, __p1_229, __p2_229) __extension__ ({ \
-  int16x4_t __ret_229; \
-  int16x4_t __s0_229 = __p0_229; \
-  int16x4_t __s1_229 = __p1_229; \
-  int16x4_t __rev0_229;  __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 3, 2, 1, 0); \
-  int16x4_t __rev1_229;  __rev1_229 = __builtin_shufflevector(__s1_229, __s1_229, 3, 2, 1, 0); \
-  __ret_229 = __noswap_vqrdmulh_s16(__rev0_229, __noswap_splat_lane_s16(__rev1_229, __p2_229)); \
-  __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 3, 2, 1, 0); \
-  __ret_229; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-#endif
-#if (__ARM_FP & 2)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vcvt_f16_f32(float32x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vcvt_f16_f32(float32x4_t __p0) {
-  float16x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t __noswap_vcvt_f16_f32(float32x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vcvt_f32_f16(float16x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vcvt_f32_f16(float16x4_t __p0) {
-  float32x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t __noswap_vcvt_f32_f16(float16x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  __ret = (float16x8_t) __builtin_neon_vld1q_v(__p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  __ret = (float16x8_t) __builtin_neon_vld1q_v(__p0, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  __ret = (float16x4_t) __builtin_neon_vld1_v(__p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  __ret = (float16x4_t) __builtin_neon_vld1_v(__p0, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  __ret = (float16x8_t) __builtin_neon_vld1q_dup_v(__p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_dup_f16(__p0) __extension__ ({ \
-  float16x8_t __ret; \
-  __ret = (float16x8_t) __builtin_neon_vld1q_dup_v(__p0, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_dup_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  __ret = (float16x4_t) __builtin_neon_vld1_dup_v(__p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_dup_f16(__p0) __extension__ ({ \
-  float16x4_t __ret; \
-  __ret = (float16x4_t) __builtin_neon_vld1_dup_v(__p0, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s1 = __p1; \
-  __ret = (float16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 40); \
-  __ret; \
-})
-#else
-#define vld1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s1 = __p1; \
-  __ret = (float16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 8); \
-  __ret; \
-})
-#else
-#define vld1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f16_x2(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_f16_x2(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f16_x2(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_f16_x2(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f16_x3(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_f16_x3(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f16_x3(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_f16_x3(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f16_x4(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld1q_f16_x4(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1_f16_x4(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld1_f16_x4(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_f16(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld2q_f16(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_f16(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld2_f16(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_f16(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld2q_dup_f16(__p0) __extension__ ({ \
-  float16x8x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_dup_f16(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld2_dup_f16(__p0) __extension__ ({ \
-  float16x4x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x2_t __ret; \
-  float16x8x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 40); \
-  __ret; \
-})
-#else
-#define vld2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x2_t __ret; \
-  float16x8x2_t __s1 = __p1; \
-  float16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x2_t __ret; \
-  float16x4x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 8); \
-  __ret; \
-})
-#else
-#define vld2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x2_t __ret; \
-  float16x4x2_t __s1 = __p1; \
-  float16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_f16(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld3q_f16(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_f16(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld3_f16(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_f16(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld3q_dup_f16(__p0) __extension__ ({ \
-  float16x8x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_dup_f16(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld3_dup_f16(__p0) __extension__ ({ \
-  float16x4x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x3_t __ret; \
-  float16x8x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 40); \
-  __ret; \
-})
-#else
-#define vld3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x3_t __ret; \
-  float16x8x3_t __s1 = __p1; \
-  float16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x3_t __ret; \
-  float16x4x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 8); \
-  __ret; \
-})
-#else
-#define vld3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x3_t __ret; \
-  float16x4x3_t __s1 = __p1; \
-  float16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_f16(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld4q_f16(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_f16(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld4_f16(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_f16(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
-  __ret; \
-})
-#else
-#define vld4q_dup_f16(__p0) __extension__ ({ \
-  float16x8x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_dup_f16(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 8); \
-  __ret; \
-})
-#else
-#define vld4_dup_f16(__p0) __extension__ ({ \
-  float16x4x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x4_t __ret; \
-  float16x8x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 40); \
-  __ret; \
-})
-#else
-#define vld4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x4_t __ret; \
-  float16x8x4_t __s1 = __p1; \
-  float16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 40); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x4_t __ret; \
-  float16x4x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 8); \
-  __ret; \
-})
-#else
-#define vld4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x4_t __ret; \
-  float16x4x4_t __s1 = __p1; \
-  float16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 8); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 40); \
-})
-#else
-#define vst1q_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 8); \
-})
-#else
-#define vst1_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 40); \
-})
-#else
-#define vst1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 8); \
-})
-#else
-#define vst1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 40); \
-})
-#else
-#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  float16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 8); \
-})
-#else
-#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  float16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 40); \
-})
-#else
-#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  float16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 8); \
-})
-#else
-#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  float16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 40); \
-})
-#else
-#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  float16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 8); \
-})
-#else
-#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  float16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_f16(__p0, __p1) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 40); \
-})
-#else
-#define vst2q_f16(__p0, __p1) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  float16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_f16(__p0, __p1) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 8); \
-})
-#else
-#define vst2_f16(__p0, __p1) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  float16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 40); \
-})
-#else
-#define vst2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x2_t __s1 = __p1; \
-  float16x8x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 8); \
-})
-#else
-#define vst2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x2_t __s1 = __p1; \
-  float16x4x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_f16(__p0, __p1) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 40); \
-})
-#else
-#define vst3q_f16(__p0, __p1) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  float16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_f16(__p0, __p1) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 8); \
-})
-#else
-#define vst3_f16(__p0, __p1) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  float16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 40); \
-})
-#else
-#define vst3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x3_t __s1 = __p1; \
-  float16x8x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 8); \
-})
-#else
-#define vst3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x3_t __s1 = __p1; \
-  float16x4x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_f16(__p0, __p1) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 40); \
-})
-#else
-#define vst4q_f16(__p0, __p1) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  float16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_f16(__p0, __p1) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 8); \
-})
-#else
-#define vst4_f16(__p0, __p1) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  float16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 8); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 40); \
-})
-#else
-#define vst4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8x4_t __s1 = __p1; \
-  float16x8x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 40); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 8); \
-})
-#else
-#define vst4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4x4_t __s1 = __p1; \
-  float16x4x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 8); \
-})
-#endif
-
-#endif
-#if (defined(__aarch64__)  || defined(__arm64ec__)) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmaxnm_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vminnm_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-#endif
-#if (defined(__aarch64__) || defined(__arm64ec__)) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrndq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrndq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrnd_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrndaq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrndaq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrnda_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrndiq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrndiq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrndi_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrndmq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrndmq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrndm_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrndnq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrndnq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrndn_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrndpq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrndpq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrndp_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrndxq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrndxq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrndx_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#endif
-#if __ARM_ARCH >= 8
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesdq_u8((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vaesdq_u8((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaeseq_u8((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vaeseq_u8((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesimcq_u8((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vaesimcq_u8((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vaesmcq_u8((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("aes,neon"))) uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vaesmcq_u8((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vcvtaq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vcvtaq_s32_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vcvtaq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vcvtaq_s32_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vcvta_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vcvta_s32_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vcvta_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vcvta_s32_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcvtaq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcvtaq_u32_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcvtaq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcvtaq_u32_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcvta_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcvta_u32_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcvta_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcvta_u32_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vcvtmq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vcvtmq_s32_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vcvtmq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vcvtmq_s32_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vcvtm_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vcvtm_s32_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vcvtm_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vcvtm_s32_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcvtmq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcvtmq_u32_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcvtmq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcvtmq_u32_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcvtm_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcvtm_u32_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcvtm_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcvtm_u32_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vcvtnq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vcvtnq_s32_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vcvtnq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vcvtnq_s32_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vcvtn_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vcvtn_s32_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vcvtn_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vcvtn_s32_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcvtnq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcvtnq_u32_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcvtnq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcvtnq_u32_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcvtn_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcvtn_u32_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcvtn_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcvtn_u32_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vcvtpq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vcvtpq_s32_v((int8x16_t)__p0, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vcvtpq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vcvtpq_s32_v((int8x16_t)__rev0, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vcvtp_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vcvtp_s32_v((int8x8_t)__p0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vcvtp_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vcvtp_s32_v((int8x8_t)__rev0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcvtpq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcvtpq_u32_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcvtpq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcvtpq_u32_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcvtp_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcvtp_u32_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcvtp_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcvtp_u32_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32(__p0, __p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32(__rev0, __p1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("sha2,neon"))) uint32_t vsha1h_u32(uint32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32(__p0, __p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32(__rev0, __p1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32(__p0, __p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32(__rev0, __p1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_u32((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256hq_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha256hq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_u32((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha2,neon"))) uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#endif
-#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrndq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrndq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrnd_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrnd_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrnd_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrnd_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndaq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrndaq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndaq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrndaq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrnda_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrnda_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrnda_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrnda_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndmq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrndmq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndmq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrndmq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndm_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrndm_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndm_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrndm_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndnq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrndnq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndnq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrndnq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndn_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrndn_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndn_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrndn_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndpq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrndpq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndpq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrndpq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndp_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrndp_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndp_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrndp_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndxq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrndxq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndxq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrndxq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndx_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrndx_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndx_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrndx_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrndq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrndq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrnd_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrnd_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrndaq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrndaq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrnda_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrnda_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrndiq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrndiq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrndi_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrndi_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrndmq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrndmq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrndm_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrndm_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrndnq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrndnq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrndn_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrndn_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float32_t vrndns_f32(float32_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vrndns_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrndpq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrndpq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrndp_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrndp_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vrndxq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vrndxq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vrndx_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vrndx_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#endif
-#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vmaxnmq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vmaxnmq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vmaxnm_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vmaxnm_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vminnmq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vminnmq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vminnm_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vminnm_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vminnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vminnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#endif
-#if defined(__ARM_FEATURE_FMA)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t __noswap_vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t __noswap_vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  __ret = vfmaq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vfmaq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  __ret = vfma_f32(__p0, __p1, (float32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vfma_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  __ret = vfmaq_f32(__p0, -__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  __ret = vfma_f32(__p0, -__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vfma_f32(__rev0, -__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#endif
-#if defined(__aarch64__) || defined(__arm64ec__)
-__ai __attribute__((target("aes,neon"))) poly128_t vmull_p64(poly64_t __p0, poly64_t __p1) {
-  poly128_t __ret;
-  __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t __a64_vcvtq_low_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t) __builtin_neon___a64_vcvtq_low_bf16_f32((int8x16_t)__p0, 43);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t __a64_vcvtq_low_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (bfloat16x8_t) __builtin_neon___a64_vcvtq_low_bf16_f32((int8x16_t)__rev0, 43);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t __noswap___a64_vcvtq_low_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t) __builtin_neon___a64_vcvtq_low_bf16_f32((int8x16_t)__p0, 43);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_bf16(__p0_230, __p1_230, __p2_230, __p3_230) __extension__ ({ \
-  bfloat16x8_t __ret_230; \
-  bfloat16x8_t __s0_230 = __p0_230; \
-  bfloat16x4_t __s2_230 = __p2_230; \
-  __ret_230 = vsetq_lane_bf16(vget_lane_bf16(__s2_230, __p3_230), __s0_230, __p1_230); \
-  __ret_230; \
-})
-#else
-#define vcopyq_lane_bf16(__p0_231, __p1_231, __p2_231, __p3_231) __extension__ ({ \
-  bfloat16x8_t __ret_231; \
-  bfloat16x8_t __s0_231 = __p0_231; \
-  bfloat16x4_t __s2_231 = __p2_231; \
-  bfloat16x8_t __rev0_231;  __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x4_t __rev2_231;  __rev2_231 = __builtin_shufflevector(__s2_231, __s2_231, 3, 2, 1, 0); \
-  __ret_231 = __noswap_vsetq_lane_bf16(__noswap_vget_lane_bf16(__rev2_231, __p3_231), __rev0_231, __p1_231); \
-  __ret_231 = __builtin_shufflevector(__ret_231, __ret_231, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_231; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_bf16(__p0_232, __p1_232, __p2_232, __p3_232) __extension__ ({ \
-  bfloat16x4_t __ret_232; \
-  bfloat16x4_t __s0_232 = __p0_232; \
-  bfloat16x4_t __s2_232 = __p2_232; \
-  __ret_232 = vset_lane_bf16(vget_lane_bf16(__s2_232, __p3_232), __s0_232, __p1_232); \
-  __ret_232; \
-})
-#else
-#define vcopy_lane_bf16(__p0_233, __p1_233, __p2_233, __p3_233) __extension__ ({ \
-  bfloat16x4_t __ret_233; \
-  bfloat16x4_t __s0_233 = __p0_233; \
-  bfloat16x4_t __s2_233 = __p2_233; \
-  bfloat16x4_t __rev0_233;  __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 3, 2, 1, 0); \
-  bfloat16x4_t __rev2_233;  __rev2_233 = __builtin_shufflevector(__s2_233, __s2_233, 3, 2, 1, 0); \
-  __ret_233 = __noswap_vset_lane_bf16(__noswap_vget_lane_bf16(__rev2_233, __p3_233), __rev0_233, __p1_233); \
-  __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 3, 2, 1, 0); \
-  __ret_233; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_bf16(__p0_234, __p1_234, __p2_234, __p3_234) __extension__ ({ \
-  bfloat16x8_t __ret_234; \
-  bfloat16x8_t __s0_234 = __p0_234; \
-  bfloat16x8_t __s2_234 = __p2_234; \
-  __ret_234 = vsetq_lane_bf16(vgetq_lane_bf16(__s2_234, __p3_234), __s0_234, __p1_234); \
-  __ret_234; \
-})
-#else
-#define vcopyq_laneq_bf16(__p0_235, __p1_235, __p2_235, __p3_235) __extension__ ({ \
-  bfloat16x8_t __ret_235; \
-  bfloat16x8_t __s0_235 = __p0_235; \
-  bfloat16x8_t __s2_235 = __p2_235; \
-  bfloat16x8_t __rev0_235;  __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x8_t __rev2_235;  __rev2_235 = __builtin_shufflevector(__s2_235, __s2_235, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_235 = __noswap_vsetq_lane_bf16(__noswap_vgetq_lane_bf16(__rev2_235, __p3_235), __rev0_235, __p1_235); \
-  __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_235; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_bf16(__p0_236, __p1_236, __p2_236, __p3_236) __extension__ ({ \
-  bfloat16x4_t __ret_236; \
-  bfloat16x4_t __s0_236 = __p0_236; \
-  bfloat16x8_t __s2_236 = __p2_236; \
-  __ret_236 = vset_lane_bf16(vgetq_lane_bf16(__s2_236, __p3_236), __s0_236, __p1_236); \
-  __ret_236; \
-})
-#else
-#define vcopy_laneq_bf16(__p0_237, __p1_237, __p2_237, __p3_237) __extension__ ({ \
-  bfloat16x4_t __ret_237; \
-  bfloat16x4_t __s0_237 = __p0_237; \
-  bfloat16x8_t __s2_237 = __p2_237; \
-  bfloat16x4_t __rev0_237;  __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 3, 2, 1, 0); \
-  bfloat16x8_t __rev2_237;  __rev2_237 = __builtin_shufflevector(__s2_237, __s2_237, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_237 = __noswap_vset_lane_bf16(__noswap_vgetq_lane_bf16(__rev2_237, __p3_237), __rev0_237, __p1_237); \
-  __ret_237 = __builtin_shufflevector(__ret_237, __ret_237, 3, 2, 1, 0); \
-  __ret_237; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vcvt_bf16_f32(float32x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = vget_low_bf16(__a64_vcvtq_low_bf16_f32(__p0));
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vcvt_bf16_f32(float32x4_t __p0) {
-  bfloat16x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vget_low_bf16(__noswap___a64_vcvtq_low_bf16_f32(__rev0));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_high_bf16_f32(bfloat16x8_t __p0, float32x4_t __p1) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t) __builtin_neon_vcvtq_high_bf16_f32((int8x16_t)__p0, (int8x16_t)__p1, 43);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_high_bf16_f32(bfloat16x8_t __p0, float32x4_t __p1) {
-  bfloat16x8_t __ret;
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (bfloat16x8_t) __builtin_neon_vcvtq_high_bf16_f32((int8x16_t)__rev0, (int8x16_t)__rev1, 43);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_low_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = __a64_vcvtq_low_bf16_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_low_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap___a64_vcvtq_low_bf16_f32(__rev0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("bf16,neon"))) poly8x8_t vreinterpret_p8_bf16(bfloat16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly64x1_t vreinterpret_p64_bf16(bfloat16x4_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly16x4_t vreinterpret_p16_bf16(bfloat16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly8x16_t vreinterpretq_p8_bf16(bfloat16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly128_t vreinterpretq_p128_bf16(bfloat16x8_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly64x2_t vreinterpretq_p64_bf16(bfloat16x8_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) poly16x8_t vreinterpretq_p16_bf16(bfloat16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint8x16_t vreinterpretq_u8_bf16(bfloat16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint32x4_t vreinterpretq_u32_bf16(bfloat16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint64x2_t vreinterpretq_u64_bf16(bfloat16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint16x8_t vreinterpretq_u16_bf16(bfloat16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int8x16_t vreinterpretq_s8_bf16(bfloat16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float64x2_t vreinterpretq_f64_bf16(bfloat16x8_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x4_t vreinterpretq_f32_bf16(bfloat16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float16x8_t vreinterpretq_f16_bf16(bfloat16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int32x4_t vreinterpretq_s32_bf16(bfloat16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int64x2_t vreinterpretq_s64_bf16(bfloat16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int16x8_t vreinterpretq_s16_bf16(bfloat16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint8x8_t vreinterpret_u8_bf16(bfloat16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint32x2_t vreinterpret_u32_bf16(bfloat16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint64x1_t vreinterpret_u64_bf16(bfloat16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) uint16x4_t vreinterpret_u16_bf16(bfloat16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int8x8_t vreinterpret_s8_bf16(bfloat16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float64x1_t vreinterpret_f64_bf16(bfloat16x4_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float32x2_t vreinterpret_f32_bf16(bfloat16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) float16x4_t vreinterpret_f16_bf16(bfloat16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int32x2_t vreinterpret_s32_bf16(bfloat16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int64x1_t vreinterpret_s64_bf16(bfloat16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) int16x4_t vreinterpret_s16_bf16(bfloat16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_p8(poly8x16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_p128(poly128_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_p64(poly64x2_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_p16(poly16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u8(uint8x16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u32(uint32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u64(uint64x2_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_u16(uint16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s8(int8x16_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_f64(float64x2_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_f32(float32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_f16(float16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s32(int32x4_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s64(int64x2_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x8_t vreinterpretq_bf16_s16(int16x8_t __p0) {
-  bfloat16x8_t __ret;
-  __ret = (bfloat16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_p8(poly8x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_p64(poly64x1_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_p16(poly16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u8(uint8x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u32(uint32x2_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u64(uint64x1_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_u16(uint16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s8(int8x8_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_f64(float64x1_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_f32(float32x2_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_f16(float16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s32(int32x2_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s64(int64x1_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s16(int16x4_t __p0) {
-  bfloat16x4_t __ret;
-  __ret = (bfloat16x4_t)(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vdotq_laneq_u32(__p0_238, __p1_238, __p2_238, __p3_238) __extension__ ({ \
-  uint32x4_t __ret_238; \
-  uint32x4_t __s0_238 = __p0_238; \
-  uint8x16_t __s1_238 = __p1_238; \
-  uint8x16_t __s2_238 = __p2_238; \
-uint8x16_t __reint_238 = __s2_238; \
-uint32x4_t __reint1_238 = splatq_laneq_u32(*(uint32x4_t *) &__reint_238, __p3_238); \
-  __ret_238 = vdotq_u32(__s0_238, __s1_238, *(uint8x16_t *) &__reint1_238); \
-  __ret_238; \
-})
-#else
-#define vdotq_laneq_u32(__p0_239, __p1_239, __p2_239, __p3_239) __extension__ ({ \
-  uint32x4_t __ret_239; \
-  uint32x4_t __s0_239 = __p0_239; \
-  uint8x16_t __s1_239 = __p1_239; \
-  uint8x16_t __s2_239 = __p2_239; \
-  uint32x4_t __rev0_239;  __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 3, 2, 1, 0); \
-  uint8x16_t __rev1_239;  __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_239;  __rev2_239 = __builtin_shufflevector(__s2_239, __s2_239, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x16_t __reint_239 = __rev2_239; \
-uint32x4_t __reint1_239 = __noswap_splatq_laneq_u32(*(uint32x4_t *) &__reint_239, __p3_239); \
-  __ret_239 = __noswap_vdotq_u32(__rev0_239, __rev1_239, *(uint8x16_t *) &__reint1_239); \
-  __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 3, 2, 1, 0); \
-  __ret_239; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdotq_laneq_s32(__p0_240, __p1_240, __p2_240, __p3_240) __extension__ ({ \
-  int32x4_t __ret_240; \
-  int32x4_t __s0_240 = __p0_240; \
-  int8x16_t __s1_240 = __p1_240; \
-  int8x16_t __s2_240 = __p2_240; \
-int8x16_t __reint_240 = __s2_240; \
-int32x4_t __reint1_240 = splatq_laneq_s32(*(int32x4_t *) &__reint_240, __p3_240); \
-  __ret_240 = vdotq_s32(__s0_240, __s1_240, *(int8x16_t *) &__reint1_240); \
-  __ret_240; \
-})
-#else
-#define vdotq_laneq_s32(__p0_241, __p1_241, __p2_241, __p3_241) __extension__ ({ \
-  int32x4_t __ret_241; \
-  int32x4_t __s0_241 = __p0_241; \
-  int8x16_t __s1_241 = __p1_241; \
-  int8x16_t __s2_241 = __p2_241; \
-  int32x4_t __rev0_241;  __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 3, 2, 1, 0); \
-  int8x16_t __rev1_241;  __rev1_241 = __builtin_shufflevector(__s1_241, __s1_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_241;  __rev2_241 = __builtin_shufflevector(__s2_241, __s2_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x16_t __reint_241 = __rev2_241; \
-int32x4_t __reint1_241 = __noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_241, __p3_241); \
-  __ret_241 = __noswap_vdotq_s32(__rev0_241, __rev1_241, *(int8x16_t *) &__reint1_241); \
-  __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 3, 2, 1, 0); \
-  __ret_241; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdot_laneq_u32(__p0_242, __p1_242, __p2_242, __p3_242) __extension__ ({ \
-  uint32x2_t __ret_242; \
-  uint32x2_t __s0_242 = __p0_242; \
-  uint8x8_t __s1_242 = __p1_242; \
-  uint8x16_t __s2_242 = __p2_242; \
-uint8x16_t __reint_242 = __s2_242; \
-uint32x2_t __reint1_242 = splat_laneq_u32(*(uint32x4_t *) &__reint_242, __p3_242); \
-  __ret_242 = vdot_u32(__s0_242, __s1_242, *(uint8x8_t *) &__reint1_242); \
-  __ret_242; \
-})
-#else
-#define vdot_laneq_u32(__p0_243, __p1_243, __p2_243, __p3_243) __extension__ ({ \
-  uint32x2_t __ret_243; \
-  uint32x2_t __s0_243 = __p0_243; \
-  uint8x8_t __s1_243 = __p1_243; \
-  uint8x16_t __s2_243 = __p2_243; \
-  uint32x2_t __rev0_243;  __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 1, 0); \
-  uint8x8_t __rev1_243;  __rev1_243 = __builtin_shufflevector(__s1_243, __s1_243, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_243;  __rev2_243 = __builtin_shufflevector(__s2_243, __s2_243, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x16_t __reint_243 = __rev2_243; \
-uint32x2_t __reint1_243 = __noswap_splat_laneq_u32(*(uint32x4_t *) &__reint_243, __p3_243); \
-  __ret_243 = __noswap_vdot_u32(__rev0_243, __rev1_243, *(uint8x8_t *) &__reint1_243); \
-  __ret_243 = __builtin_shufflevector(__ret_243, __ret_243, 1, 0); \
-  __ret_243; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdot_laneq_s32(__p0_244, __p1_244, __p2_244, __p3_244) __extension__ ({ \
-  int32x2_t __ret_244; \
-  int32x2_t __s0_244 = __p0_244; \
-  int8x8_t __s1_244 = __p1_244; \
-  int8x16_t __s2_244 = __p2_244; \
-int8x16_t __reint_244 = __s2_244; \
-int32x2_t __reint1_244 = splat_laneq_s32(*(int32x4_t *) &__reint_244, __p3_244); \
-  __ret_244 = vdot_s32(__s0_244, __s1_244, *(int8x8_t *) &__reint1_244); \
-  __ret_244; \
-})
-#else
-#define vdot_laneq_s32(__p0_245, __p1_245, __p2_245, __p3_245) __extension__ ({ \
-  int32x2_t __ret_245; \
-  int32x2_t __s0_245 = __p0_245; \
-  int8x8_t __s1_245 = __p1_245; \
-  int8x16_t __s2_245 = __p2_245; \
-  int32x2_t __rev0_245;  __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 1, 0); \
-  int8x8_t __rev1_245;  __rev1_245 = __builtin_shufflevector(__s1_245, __s1_245, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_245;  __rev2_245 = __builtin_shufflevector(__s2_245, __s2_245, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x16_t __reint_245 = __rev2_245; \
-int32x2_t __reint1_245 = __noswap_splat_laneq_s32(*(int32x4_t *) &__reint_245, __p3_245); \
-  __ret_245 = __noswap_vdot_s32(__rev0_245, __rev1_245, *(int8x8_t *) &__reint1_245); \
-  __ret_245 = __builtin_shufflevector(__ret_245, __ret_245, 1, 0); \
-  __ret_245; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlalq_high_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vfmlalq_high_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t __noswap_vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlalq_high_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlal_high_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vfmlal_high_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t __noswap_vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlal_high_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlalq_low_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vfmlalq_low_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t __noswap_vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlalq_low_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlal_low_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vfmlal_low_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t __noswap_vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlal_low_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlslq_high_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vfmlslq_high_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t __noswap_vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlslq_high_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlsl_high_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vfmlsl_high_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t __noswap_vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlsl_high_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlslq_low_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vfmlslq_low_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x4_t __noswap_vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vfmlslq_low_f16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlsl_low_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vfmlsl_low_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fp16fml,neon"))) float32x2_t __noswap_vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vfmlsl_low_f16((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __p0 / __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 / __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __p0 / __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 / __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (float16x4_t)__s2, __p3); \
-  __ret; \
-})
-#else
-#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (float16x4_t)__rev2, __p3); \
-  __ret; \
-})
-#define __noswap_vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (float16x4_t)__s2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  __ret = (float16x8_t) __builtin_neon_vfmaq_lane_f16((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \
-  __ret; \
-})
-#else
-#define vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_vfmaq_lane_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, __p3, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  __ret = (float16x8_t) __builtin_neon_vfmaq_lane_f16((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  __ret = (float16x4_t) __builtin_neon_vfma_lane_f16((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \
-  __ret; \
-})
-#else
-#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  float16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_vfma_lane_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, __p3, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __s2 = __p2; \
-  __ret = (float16x4_t) __builtin_neon_vfma_lane_f16((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (float16x8_t)__s2, __p3); \
-  __ret; \
-})
-#else
-#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (float16x8_t)__rev2, __p3); \
-  __ret; \
-})
-#define __noswap_vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (float16x8_t)__s2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_f16((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \
-  __ret; \
-})
-#else
-#define vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 40); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_f16((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  __ret = (float16x4_t) __builtin_neon_vfma_laneq_f16((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \
-  __ret; \
-})
-#else
-#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  float16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16x4_t) __builtin_neon_vfma_laneq_f16((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x16_t)__rev2, __p3, 8); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x8_t __s2 = __p2; \
-  __ret = (float16x4_t) __builtin_neon_vfma_laneq_f16((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  __ret = vfmaq_f16(__s0, __s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
-  __ret; \
-})
-#else
-#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = __noswap_vfmaq_f16(__rev0, __rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfma_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  __ret = vfma_f16(__s0, __s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
-  __ret; \
-})
-#else
-#define vfma_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = __noswap_vfma_f16(__rev0, __rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsh_lane_f16(__p0_246, __p1_246, __p2_246, __p3_246) __extension__ ({ \
-  float16_t __ret_246; \
-  float16_t __s0_246 = __p0_246; \
-  float16_t __s1_246 = __p1_246; \
-  float16x4_t __s2_246 = __p2_246; \
-  __ret_246 = vfmah_lane_f16(__s0_246, -__s1_246, __s2_246, __p3_246); \
-  __ret_246; \
-})
-#else
-#define vfmsh_lane_f16(__p0_247, __p1_247, __p2_247, __p3_247) __extension__ ({ \
-  float16_t __ret_247; \
-  float16_t __s0_247 = __p0_247; \
-  float16_t __s1_247 = __p1_247; \
-  float16x4_t __s2_247 = __p2_247; \
-  float16x4_t __rev2_247;  __rev2_247 = __builtin_shufflevector(__s2_247, __s2_247, 3, 2, 1, 0); \
-  __ret_247 = __noswap_vfmah_lane_f16(__s0_247, -__s1_247, __rev2_247, __p3_247); \
-  __ret_247; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_lane_f16(__p0_248, __p1_248, __p2_248, __p3_248) __extension__ ({ \
-  float16x8_t __ret_248; \
-  float16x8_t __s0_248 = __p0_248; \
-  float16x8_t __s1_248 = __p1_248; \
-  float16x4_t __s2_248 = __p2_248; \
-  __ret_248 = vfmaq_lane_f16(__s0_248, -__s1_248, __s2_248, __p3_248); \
-  __ret_248; \
-})
-#else
-#define vfmsq_lane_f16(__p0_249, __p1_249, __p2_249, __p3_249) __extension__ ({ \
-  float16x8_t __ret_249; \
-  float16x8_t __s0_249 = __p0_249; \
-  float16x8_t __s1_249 = __p1_249; \
-  float16x4_t __s2_249 = __p2_249; \
-  float16x8_t __rev0_249;  __rev0_249 = __builtin_shufflevector(__s0_249, __s0_249, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_249;  __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_249;  __rev2_249 = __builtin_shufflevector(__s2_249, __s2_249, 3, 2, 1, 0); \
-  __ret_249 = __noswap_vfmaq_lane_f16(__rev0_249, -__rev1_249, __rev2_249, __p3_249); \
-  __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_249; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfms_lane_f16(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \
-  float16x4_t __ret_250; \
-  float16x4_t __s0_250 = __p0_250; \
-  float16x4_t __s1_250 = __p1_250; \
-  float16x4_t __s2_250 = __p2_250; \
-  __ret_250 = vfma_lane_f16(__s0_250, -__s1_250, __s2_250, __p3_250); \
-  __ret_250; \
-})
-#else
-#define vfms_lane_f16(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \
-  float16x4_t __ret_251; \
-  float16x4_t __s0_251 = __p0_251; \
-  float16x4_t __s1_251 = __p1_251; \
-  float16x4_t __s2_251 = __p2_251; \
-  float16x4_t __rev0_251;  __rev0_251 = __builtin_shufflevector(__s0_251, __s0_251, 3, 2, 1, 0); \
-  float16x4_t __rev1_251;  __rev1_251 = __builtin_shufflevector(__s1_251, __s1_251, 3, 2, 1, 0); \
-  float16x4_t __rev2_251;  __rev2_251 = __builtin_shufflevector(__s2_251, __s2_251, 3, 2, 1, 0); \
-  __ret_251 = __noswap_vfma_lane_f16(__rev0_251, -__rev1_251, __rev2_251, __p3_251); \
-  __ret_251 = __builtin_shufflevector(__ret_251, __ret_251, 3, 2, 1, 0); \
-  __ret_251; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsh_laneq_f16(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \
-  float16_t __ret_252; \
-  float16_t __s0_252 = __p0_252; \
-  float16_t __s1_252 = __p1_252; \
-  float16x8_t __s2_252 = __p2_252; \
-  __ret_252 = vfmah_laneq_f16(__s0_252, -__s1_252, __s2_252, __p3_252); \
-  __ret_252; \
-})
-#else
-#define vfmsh_laneq_f16(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \
-  float16_t __ret_253; \
-  float16_t __s0_253 = __p0_253; \
-  float16_t __s1_253 = __p1_253; \
-  float16x8_t __s2_253 = __p2_253; \
-  float16x8_t __rev2_253;  __rev2_253 = __builtin_shufflevector(__s2_253, __s2_253, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_253 = __noswap_vfmah_laneq_f16(__s0_253, -__s1_253, __rev2_253, __p3_253); \
-  __ret_253; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_laneq_f16(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \
-  float16x8_t __ret_254; \
-  float16x8_t __s0_254 = __p0_254; \
-  float16x8_t __s1_254 = __p1_254; \
-  float16x8_t __s2_254 = __p2_254; \
-  __ret_254 = vfmaq_laneq_f16(__s0_254, -__s1_254, __s2_254, __p3_254); \
-  __ret_254; \
-})
-#else
-#define vfmsq_laneq_f16(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \
-  float16x8_t __ret_255; \
-  float16x8_t __s0_255 = __p0_255; \
-  float16x8_t __s1_255 = __p1_255; \
-  float16x8_t __s2_255 = __p2_255; \
-  float16x8_t __rev0_255;  __rev0_255 = __builtin_shufflevector(__s0_255, __s0_255, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_255;  __rev1_255 = __builtin_shufflevector(__s1_255, __s1_255, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_255;  __rev2_255 = __builtin_shufflevector(__s2_255, __s2_255, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_255 = __noswap_vfmaq_laneq_f16(__rev0_255, -__rev1_255, __rev2_255, __p3_255); \
-  __ret_255 = __builtin_shufflevector(__ret_255, __ret_255, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_255; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfms_laneq_f16(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \
-  float16x4_t __ret_256; \
-  float16x4_t __s0_256 = __p0_256; \
-  float16x4_t __s1_256 = __p1_256; \
-  float16x8_t __s2_256 = __p2_256; \
-  __ret_256 = vfma_laneq_f16(__s0_256, -__s1_256, __s2_256, __p3_256); \
-  __ret_256; \
-})
-#else
-#define vfms_laneq_f16(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \
-  float16x4_t __ret_257; \
-  float16x4_t __s0_257 = __p0_257; \
-  float16x4_t __s1_257 = __p1_257; \
-  float16x8_t __s2_257 = __p2_257; \
-  float16x4_t __rev0_257;  __rev0_257 = __builtin_shufflevector(__s0_257, __s0_257, 3, 2, 1, 0); \
-  float16x4_t __rev1_257;  __rev1_257 = __builtin_shufflevector(__s1_257, __s1_257, 3, 2, 1, 0); \
-  float16x8_t __rev2_257;  __rev2_257 = __builtin_shufflevector(__s2_257, __s2_257, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_257 = __noswap_vfma_laneq_f16(__rev0_257, -__rev1_257, __rev2_257, __p3_257); \
-  __ret_257 = __builtin_shufflevector(__ret_257, __ret_257, 3, 2, 1, 0); \
-  __ret_257; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  __ret = vfmaq_f16(__s0, -__s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
-  __ret; \
-})
-#else
-#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = __noswap_vfmaq_f16(__rev0, -__rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  __ret = vfma_f16(__s0, -__s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
-  __ret; \
-})
-#else
-#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16_t __s2 = __p2; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = __noswap_vfma_f16(__rev0, -__rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmaxnmvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__s0); \
-  __ret; \
-})
-#else
-#define vmaxnmvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmaxnmv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__s0); \
-  __ret; \
-})
-#else
-#define vmaxnmv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmaxvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__s0); \
-  __ret; \
-})
-#else
-#define vmaxvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmaxv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__s0); \
-  __ret; \
-})
-#else
-#define vmaxv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vminnmvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__s0); \
-  __ret; \
-})
-#else
-#define vminnmvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vminnmv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__s0); \
-  __ret; \
-})
-#else
-#define vminnmv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vminvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__s0); \
-  __ret; \
-})
-#else
-#define vminvq_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vminv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__s0); \
-  __ret; \
-})
-#else
-#define vminv_f16(__p0) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__rev0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_laneq_f16(__p0_258, __p1_258, __p2_258) __extension__ ({ \
-  float16x8_t __ret_258; \
-  float16x8_t __s0_258 = __p0_258; \
-  float16x8_t __s1_258 = __p1_258; \
-  __ret_258 = __s0_258 * splatq_laneq_f16(__s1_258, __p2_258); \
-  __ret_258; \
-})
-#else
-#define vmulq_laneq_f16(__p0_259, __p1_259, __p2_259) __extension__ ({ \
-  float16x8_t __ret_259; \
-  float16x8_t __s0_259 = __p0_259; \
-  float16x8_t __s1_259 = __p1_259; \
-  float16x8_t __rev0_259;  __rev0_259 = __builtin_shufflevector(__s0_259, __s0_259, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_259;  __rev1_259 = __builtin_shufflevector(__s1_259, __s1_259, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_259 = __rev0_259 * __noswap_splatq_laneq_f16(__rev1_259, __p2_259); \
-  __ret_259 = __builtin_shufflevector(__ret_259, __ret_259, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_259; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_laneq_f16(__p0_260, __p1_260, __p2_260) __extension__ ({ \
-  float16x4_t __ret_260; \
-  float16x4_t __s0_260 = __p0_260; \
-  float16x8_t __s1_260 = __p1_260; \
-  __ret_260 = __s0_260 * splat_laneq_f16(__s1_260, __p2_260); \
-  __ret_260; \
-})
-#else
-#define vmul_laneq_f16(__p0_261, __p1_261, __p2_261) __extension__ ({ \
-  float16x4_t __ret_261; \
-  float16x4_t __s0_261 = __p0_261; \
-  float16x8_t __s1_261 = __p1_261; \
-  float16x4_t __rev0_261;  __rev0_261 = __builtin_shufflevector(__s0_261, __s0_261, 3, 2, 1, 0); \
-  float16x8_t __rev1_261;  __rev1_261 = __builtin_shufflevector(__s1_261, __s1_261, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_261 = __rev0_261 * __noswap_splat_laneq_f16(__rev1_261, __p2_261); \
-  __ret_261 = __builtin_shufflevector(__ret_261, __ret_261, 3, 2, 1, 0); \
-  __ret_261; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vmulxq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vmulxq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t __noswap_vmulxq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vmulxq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmulx_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vmulx_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vmulx_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vmulx_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vmulx_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxh_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vmulxh_lane_f16(__s0, (float16x4_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vmulxh_lane_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16x4_t __s1 = __p1; \
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vmulxh_lane_f16(__s0, (float16x4_t)__rev1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxq_lane_f16(__p0_262, __p1_262, __p2_262) __extension__ ({ \
-  float16x8_t __ret_262; \
-  float16x8_t __s0_262 = __p0_262; \
-  float16x4_t __s1_262 = __p1_262; \
-  __ret_262 = vmulxq_f16(__s0_262, splatq_lane_f16(__s1_262, __p2_262)); \
-  __ret_262; \
-})
-#else
-#define vmulxq_lane_f16(__p0_263, __p1_263, __p2_263) __extension__ ({ \
-  float16x8_t __ret_263; \
-  float16x8_t __s0_263 = __p0_263; \
-  float16x4_t __s1_263 = __p1_263; \
-  float16x8_t __rev0_263;  __rev0_263 = __builtin_shufflevector(__s0_263, __s0_263, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev1_263;  __rev1_263 = __builtin_shufflevector(__s1_263, __s1_263, 3, 2, 1, 0); \
-  __ret_263 = __noswap_vmulxq_f16(__rev0_263, __noswap_splatq_lane_f16(__rev1_263, __p2_263)); \
-  __ret_263 = __builtin_shufflevector(__ret_263, __ret_263, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_263; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulx_lane_f16(__p0_264, __p1_264, __p2_264) __extension__ ({ \
-  float16x4_t __ret_264; \
-  float16x4_t __s0_264 = __p0_264; \
-  float16x4_t __s1_264 = __p1_264; \
-  __ret_264 = vmulx_f16(__s0_264, splat_lane_f16(__s1_264, __p2_264)); \
-  __ret_264; \
-})
-#else
-#define vmulx_lane_f16(__p0_265, __p1_265, __p2_265) __extension__ ({ \
-  float16x4_t __ret_265; \
-  float16x4_t __s0_265 = __p0_265; \
-  float16x4_t __s1_265 = __p1_265; \
-  float16x4_t __rev0_265;  __rev0_265 = __builtin_shufflevector(__s0_265, __s0_265, 3, 2, 1, 0); \
-  float16x4_t __rev1_265;  __rev1_265 = __builtin_shufflevector(__s1_265, __s1_265, 3, 2, 1, 0); \
-  __ret_265 = __noswap_vmulx_f16(__rev0_265, __noswap_splat_lane_f16(__rev1_265, __p2_265)); \
-  __ret_265 = __builtin_shufflevector(__ret_265, __ret_265, 3, 2, 1, 0); \
-  __ret_265; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxh_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  __ret = (float16_t) __builtin_neon_vmulxh_laneq_f16(__s0, (float16x8_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vmulxh_laneq_f16(__p0, __p1, __p2) __extension__ ({ \
-  float16_t __ret; \
-  float16_t __s0 = __p0; \
-  float16x8_t __s1 = __p1; \
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vmulxh_laneq_f16(__s0, (float16x8_t)__rev1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxq_laneq_f16(__p0_266, __p1_266, __p2_266) __extension__ ({ \
-  float16x8_t __ret_266; \
-  float16x8_t __s0_266 = __p0_266; \
-  float16x8_t __s1_266 = __p1_266; \
-  __ret_266 = vmulxq_f16(__s0_266, splatq_laneq_f16(__s1_266, __p2_266)); \
-  __ret_266; \
-})
-#else
-#define vmulxq_laneq_f16(__p0_267, __p1_267, __p2_267) __extension__ ({ \
-  float16x8_t __ret_267; \
-  float16x8_t __s0_267 = __p0_267; \
-  float16x8_t __s1_267 = __p1_267; \
-  float16x8_t __rev0_267;  __rev0_267 = __builtin_shufflevector(__s0_267, __s0_267, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev1_267;  __rev1_267 = __builtin_shufflevector(__s1_267, __s1_267, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_267 = __noswap_vmulxq_f16(__rev0_267, __noswap_splatq_laneq_f16(__rev1_267, __p2_267)); \
-  __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_267; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulx_laneq_f16(__p0_268, __p1_268, __p2_268) __extension__ ({ \
-  float16x4_t __ret_268; \
-  float16x4_t __s0_268 = __p0_268; \
-  float16x8_t __s1_268 = __p1_268; \
-  __ret_268 = vmulx_f16(__s0_268, splat_laneq_f16(__s1_268, __p2_268)); \
-  __ret_268; \
-})
-#else
-#define vmulx_laneq_f16(__p0_269, __p1_269, __p2_269) __extension__ ({ \
-  float16x4_t __ret_269; \
-  float16x4_t __s0_269 = __p0_269; \
-  float16x8_t __s1_269 = __p1_269; \
-  float16x4_t __rev0_269;  __rev0_269 = __builtin_shufflevector(__s0_269, __s0_269, 3, 2, 1, 0); \
-  float16x8_t __rev1_269;  __rev1_269 = __builtin_shufflevector(__s1_269, __s1_269, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_269 = __noswap_vmulx_f16(__rev0_269, __noswap_splat_laneq_f16(__rev1_269, __p2_269)); \
-  __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 3, 2, 1, 0); \
-  __ret_269; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = vmulxq_f16(__s0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \
-  __ret; \
-})
-#else
-#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \
-  float16x8_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = __noswap_vmulxq_f16(__rev0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulx_n_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  __ret = vmulx_f16(__s0, (float16x4_t) {__s1, __s1, __s1, __s1}); \
-  __ret; \
-})
-#else
-#define vmulx_n_f16(__p0, __p1) __extension__ ({ \
-  float16x4_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16_t __s1 = __p1; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = __noswap_vmulx_f16(__rev0, (float16x4_t) {__s1, __s1, __s1, __s1}); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vpaddq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vpaddq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vpmaxq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vpmaxq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vpmaxnmq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vpmaxnmq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vpmaxnm_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vpmaxnm_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vpminq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vpminq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vpminnmq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vpminnmq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vpminnm_f16((int8x8_t)__p0, (int8x8_t)__p1, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vpminnm_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndiq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vrndiq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vrndiq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vrndiq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndi_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vrndi_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vrndi_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vrndi_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vsqrtq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t) __builtin_neon_vsqrtq_f16((int8x16_t)__p0, 40);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x8_t vsqrtq_f16(float16x8_t __p0) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (float16x8_t) __builtin_neon_vsqrtq_f16((int8x16_t)__rev0, 40);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vsqrt_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t) __builtin_neon_vsqrt_f16((int8x8_t)__p0, 8);
-  return __ret;
-}
-#else
-__ai __attribute__((target("fullfp16,neon"))) float16x4_t vsqrt_f16(float16x4_t __p0) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float16x4_t) __builtin_neon_vsqrt_f16((int8x8_t)__rev0, 8);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsudotq_laneq_s32(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \
-  int32x4_t __ret_270; \
-  int32x4_t __s0_270 = __p0_270; \
-  int8x16_t __s1_270 = __p1_270; \
-  uint8x16_t __s2_270 = __p2_270; \
-uint8x16_t __reint_270 = __s2_270; \
-  __ret_270 = vusdotq_s32(__s0_270, (uint8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_270, __p3_270)), __s1_270); \
-  __ret_270; \
-})
-#else
-#define vsudotq_laneq_s32(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \
-  int32x4_t __ret_271; \
-  int32x4_t __s0_271 = __p0_271; \
-  int8x16_t __s1_271 = __p1_271; \
-  uint8x16_t __s2_271 = __p2_271; \
-  int32x4_t __rev0_271;  __rev0_271 = __builtin_shufflevector(__s0_271, __s0_271, 3, 2, 1, 0); \
-  int8x16_t __rev1_271;  __rev1_271 = __builtin_shufflevector(__s1_271, __s1_271, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_271;  __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x16_t __reint_271 = __rev2_271; \
-  __ret_271 = __noswap_vusdotq_s32(__rev0_271, (uint8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_271, __p3_271)), __rev1_271); \
-  __ret_271 = __builtin_shufflevector(__ret_271, __ret_271, 3, 2, 1, 0); \
-  __ret_271; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsudot_laneq_s32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \
-  int32x2_t __ret_272; \
-  int32x2_t __s0_272 = __p0_272; \
-  int8x8_t __s1_272 = __p1_272; \
-  uint8x16_t __s2_272 = __p2_272; \
-uint8x16_t __reint_272 = __s2_272; \
-  __ret_272 = vusdot_s32(__s0_272, (uint8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_272, __p3_272)), __s1_272); \
-  __ret_272; \
-})
-#else
-#define vsudot_laneq_s32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \
-  int32x2_t __ret_273; \
-  int32x2_t __s0_273 = __p0_273; \
-  int8x8_t __s1_273 = __p1_273; \
-  uint8x16_t __s2_273 = __p2_273; \
-  int32x2_t __rev0_273;  __rev0_273 = __builtin_shufflevector(__s0_273, __s0_273, 1, 0); \
-  int8x8_t __rev1_273;  __rev1_273 = __builtin_shufflevector(__s1_273, __s1_273, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_273;  __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x16_t __reint_273 = __rev2_273; \
-  __ret_273 = __noswap_vusdot_s32(__rev0_273, (uint8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_273, __p3_273)), __rev1_273); \
-  __ret_273 = __builtin_shufflevector(__ret_273, __ret_273, 1, 0); \
-  __ret_273; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vusdotq_laneq_s32(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \
-  int32x4_t __ret_274; \
-  int32x4_t __s0_274 = __p0_274; \
-  uint8x16_t __s1_274 = __p1_274; \
-  int8x16_t __s2_274 = __p2_274; \
-int8x16_t __reint_274 = __s2_274; \
-  __ret_274 = vusdotq_s32(__s0_274, __s1_274, (int8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_274, __p3_274))); \
-  __ret_274; \
-})
-#else
-#define vusdotq_laneq_s32(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \
-  int32x4_t __ret_275; \
-  int32x4_t __s0_275 = __p0_275; \
-  uint8x16_t __s1_275 = __p1_275; \
-  int8x16_t __s2_275 = __p2_275; \
-  int32x4_t __rev0_275;  __rev0_275 = __builtin_shufflevector(__s0_275, __s0_275, 3, 2, 1, 0); \
-  uint8x16_t __rev1_275;  __rev1_275 = __builtin_shufflevector(__s1_275, __s1_275, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_275;  __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x16_t __reint_275 = __rev2_275; \
-  __ret_275 = __noswap_vusdotq_s32(__rev0_275, __rev1_275, (int8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_275, __p3_275))); \
-  __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 3, 2, 1, 0); \
-  __ret_275; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vusdot_laneq_s32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \
-  int32x2_t __ret_276; \
-  int32x2_t __s0_276 = __p0_276; \
-  uint8x8_t __s1_276 = __p1_276; \
-  int8x16_t __s2_276 = __p2_276; \
-int8x16_t __reint_276 = __s2_276; \
-  __ret_276 = vusdot_s32(__s0_276, __s1_276, (int8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_276, __p3_276))); \
-  __ret_276; \
-})
-#else
-#define vusdot_laneq_s32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \
-  int32x2_t __ret_277; \
-  int32x2_t __s0_277 = __p0_277; \
-  uint8x8_t __s1_277 = __p1_277; \
-  int8x16_t __s2_277 = __p2_277; \
-  int32x2_t __rev0_277;  __rev0_277 = __builtin_shufflevector(__s0_277, __s0_277, 1, 0); \
-  uint8x8_t __rev1_277;  __rev1_277 = __builtin_shufflevector(__s1_277, __s1_277, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_277;  __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x16_t __reint_277 = __rev2_277; \
-  __ret_277 = __noswap_vusdot_s32(__rev0_277, __rev1_277, (int8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_277, __p3_277))); \
-  __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 1, 0); \
-  __ret_277; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vabdq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vabdq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vabd_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64_t vabdd_f64(float64_t __p0, float64_t __p1) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vabdd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vabds_f32(float32_t __p0, float32_t __p1) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vabds_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vabsq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vabsq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vabsq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vabsq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vabs_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vabs_s64(int64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vabsd_s64(int64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vabsd_s64(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vaddq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vaddq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vadd_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = __p0 + __p1;
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vaddd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vaddd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vaddd_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vaddd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vaddq_p128(poly128_t __p0, poly128_t __p1) {
-  poly128_t __ret;
-  __ret = (poly128_t) __builtin_neon_vaddq_p128(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vaddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  __ret = vcombine_u16(__p0, vaddhn_u32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vaddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u16(__rev0, __noswap_vaddhn_u32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vaddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  __ret = vcombine_u32(__p0, vaddhn_u64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vaddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_u32(__rev0, __noswap_vaddhn_u64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vaddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  __ret = vcombine_u8(__p0, vaddhn_u16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vaddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u8(__rev0, __noswap_vaddhn_u16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vaddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  __ret = vcombine_s16(__p0, vaddhn_s32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vaddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s16(__rev0, __noswap_vaddhn_s32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vaddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  __ret = vcombine_s32(__p0, vaddhn_s64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vaddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_s32(__rev0, __noswap_vaddhn_s64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vaddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  __ret = vcombine_s8(__p0, vaddhn_s16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vaddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s8(__rev0, __noswap_vaddhn_s16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vaddlvq_u8(uint8x16_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vaddlvq_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vaddlvq_u8(uint8x16_t __p0) {
-  uint16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vaddlvq_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64_t vaddlvq_u32(uint32x4_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vaddlvq_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64_t vaddlvq_u32(uint32x4_t __p0) {
-  uint64_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint64_t) __builtin_neon_vaddlvq_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vaddlvq_u16(uint16x8_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vaddlvq_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vaddlvq_u16(uint16x8_t __p0) {
-  uint32_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vaddlvq_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vaddlvq_s8(int8x16_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vaddlvq_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vaddlvq_s8(int8x16_t __p0) {
-  int16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vaddlvq_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64_t vaddlvq_s32(int32x4_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vaddlvq_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64_t vaddlvq_s32(int32x4_t __p0) {
-  int64_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int64_t) __builtin_neon_vaddlvq_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vaddlvq_s16(int16x8_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vaddlvq_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vaddlvq_s16(int16x8_t __p0) {
-  int32_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int32_t) __builtin_neon_vaddlvq_s16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vaddlv_u8(uint8x8_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vaddlv_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vaddlv_u8(uint8x8_t __p0) {
-  uint16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vaddlv_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64_t vaddlv_u32(uint32x2_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vaddlv_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64_t vaddlv_u32(uint32x2_t __p0) {
-  uint64_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64_t) __builtin_neon_vaddlv_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vaddlv_u16(uint16x4_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vaddlv_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vaddlv_u16(uint16x4_t __p0) {
-  uint32_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vaddlv_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vaddlv_s8(int8x8_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vaddlv_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vaddlv_s8(int8x8_t __p0) {
-  int16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vaddlv_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64_t vaddlv_s32(int32x2_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vaddlv_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64_t vaddlv_s32(int32x2_t __p0) {
-  int64_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64_t) __builtin_neon_vaddlv_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vaddlv_s16(int16x4_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vaddlv_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vaddlv_s16(int16x4_t __p0) {
-  int32_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32_t) __builtin_neon_vaddlv_s16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8_t vaddvq_u8(uint8x16_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vaddvq_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8_t vaddvq_u8(uint8x16_t __p0) {
-  uint8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8_t) __builtin_neon_vaddvq_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vaddvq_u32(uint32x4_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vaddvq_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vaddvq_u32(uint32x4_t __p0) {
-  uint32_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vaddvq_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64_t vaddvq_u64(uint64x2_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vaddvq_u64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64_t vaddvq_u64(uint64x2_t __p0) {
-  uint64_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64_t) __builtin_neon_vaddvq_u64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vaddvq_u16(uint16x8_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vaddvq_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vaddvq_u16(uint16x8_t __p0) {
-  uint16_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vaddvq_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8_t vaddvq_s8(int8x16_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vaddvq_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8_t vaddvq_s8(int8x16_t __p0) {
-  int8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8_t) __builtin_neon_vaddvq_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vaddvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vaddvq_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vaddvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vaddvq_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vaddvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vaddvq_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vaddvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32_t) __builtin_neon_vaddvq_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vaddvq_s32(int32x4_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vaddvq_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vaddvq_s32(int32x4_t __p0) {
-  int32_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32_t) __builtin_neon_vaddvq_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64_t vaddvq_s64(int64x2_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vaddvq_s64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64_t vaddvq_s64(int64x2_t __p0) {
-  int64_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64_t) __builtin_neon_vaddvq_s64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vaddvq_s16(int16x8_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vaddvq_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vaddvq_s16(int16x8_t __p0) {
-  int16_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vaddvq_s16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8_t vaddv_u8(uint8x8_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vaddv_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8_t vaddv_u8(uint8x8_t __p0) {
-  uint8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8_t) __builtin_neon_vaddv_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vaddv_u32(uint32x2_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vaddv_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vaddv_u32(uint32x2_t __p0) {
-  uint32_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vaddv_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vaddv_u16(uint16x4_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vaddv_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vaddv_u16(uint16x4_t __p0) {
-  uint16_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vaddv_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8_t vaddv_s8(int8x8_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vaddv_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8_t vaddv_s8(int8x8_t __p0) {
-  int8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8_t) __builtin_neon_vaddv_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vaddv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vaddv_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vaddv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vaddv_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vaddv_s32(int32x2_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vaddv_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vaddv_s32(int32x2_t __p0) {
-  int32_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32_t) __builtin_neon_vaddv_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vaddv_s16(int16x4_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vaddv_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vaddv_s16(int16x4_t __p0) {
-  int16_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vaddv_s16(__rev0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) poly64x1_t vbsl_p64(uint64x1_t __p0, poly64x1_t __p1, poly64x1_t __p2) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 6);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vbslq_p64(uint64x2_t __p0, poly64x2_t __p1, poly64x2_t __p2) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 38);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vbslq_p64(uint64x2_t __p0, poly64x2_t __p1, poly64x2_t __p2) {
-  poly64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  poly64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (poly64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 38);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vbslq_f64(uint64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vbslq_f64(uint64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vbsl_f64(uint64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcageq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcageq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcage_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcaged_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcaged_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcages_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcages_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcagtq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcagtq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcagt_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcagtd_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcagtd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcagts_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcagts_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcaleq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcaleq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcale_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcaled_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcaled_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcales_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcales_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcaltq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcaltq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcalt_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcaltd_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcaltd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcalts_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcalts_f32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vceq_p64(poly64x1_t __p0, poly64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 == __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  uint64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 == __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 == __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vceq_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 == __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vceq_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 == __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vceq_s64(int64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 == __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vceqd_s64(int64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vceqd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vceqd_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vceqd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vceqs_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vceqs_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vceqz_p8(poly8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vceqz_p8(poly8x8_t __p0) {
-  uint8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vceqz_p64(poly64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vceqzq_p8(poly8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vceqzq_p8(poly8x16_t __p0) {
-  uint8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_p64(poly64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_p64(poly64x2_t __p0) {
-  uint64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vceqzq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vceqzq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vceqzq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vceqzq_u32(uint32x4_t __p0) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_u64(uint64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_u64(uint64x2_t __p0) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vceqzq_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vceqzq_u16(uint16x8_t __p0) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vceqzq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vceqzq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vceqzq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vceqzq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vceqzq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vceqzq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vceqzq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vceqzq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vceqzq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vceqz_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vceqz_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vceqz_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vceqz_u32(uint32x2_t __p0) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vceqz_u64(uint64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vceqz_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vceqz_u16(uint16x4_t __p0) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vceqz_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vceqz_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vceqz_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vceqz_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vceqz_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vceqz_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vceqz_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vceqz_s64(int64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vceqz_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vceqz_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64_t vceqzd_u64(uint64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vceqzd_s64(int64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vceqzd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vceqzd_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vceqzd_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vceqzs_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vceqzs_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgeq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgeq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgeq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgeq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgeq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 >= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgeq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 >= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcge_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 >= __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vcge_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 >= __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 >= __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcged_s64(int64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcged_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcged_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcged_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcged_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcges_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcges_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcgezq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcgezq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgezq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgezq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgezq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgezq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgezq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgezq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgezq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgezq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcgezq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcgezq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcgez_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcgez_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcgez_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcgez_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcgez_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcgez_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcgez_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcgez_s64(int64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcgez_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcgez_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64_t vcgezd_s64(int64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcgezd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcgezd_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcgezd_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcgezs_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcgezs_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgtq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgtq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgtq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgtq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgtq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 > __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgtq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 > __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcgt_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 > __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vcgt_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 > __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 > __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcgtd_s64(int64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcgtd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcgtd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcgtd_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcgtd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcgts_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcgts_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcgtzq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcgtzq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgtzq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgtzq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgtzq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgtzq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcgtzq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcgtzq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcgtzq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcgtzq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcgtzq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcgtzq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcgtz_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcgtz_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcgtz_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcgtz_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcgtz_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcgtz_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcgtz_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcgtz_s64(int64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcgtz_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcgtz_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64_t vcgtzd_s64(int64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcgtzd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcgtzd_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcgtzd_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcgtzs_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcgtzs_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcleq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcleq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcleq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcleq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcleq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 <= __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcleq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 <= __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcle_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 <= __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vcle_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 <= __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vcle_s64(int64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 <= __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcled_s64(int64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcled_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcled_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcled_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcles_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcles_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vclezq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vclezq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vclezq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vclezq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vclezq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vclezq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vclezq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vclezq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vclezq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vclezq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vclezq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vclezq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vclez_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vclez_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vclez_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vclez_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vclez_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vclez_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vclez_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vclez_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vclez_s64(int64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vclez_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vclez_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64_t vclezd_s64(int64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vclezd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vclezd_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vclezd_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vclezs_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vclezs_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcltq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcltq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcltq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcltq_f64(float64x2_t __p0, float64x2_t __p1) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcltq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0 < __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcltq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__rev0 < __rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vclt_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 < __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vclt_f64(float64x1_t __p0, float64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 < __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vclt_s64(int64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0 < __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcltd_s64(int64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcltd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcltd_f64(float64_t __p0, float64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcltd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vclts_f32(float32_t __p0, float32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vclts_f32(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vcltzq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vcltzq_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcltzq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcltzq_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcltzq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcltzq_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vcltzq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vcltzq_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcltzq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcltzq_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vcltzq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vcltzq_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vcltz_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vcltz_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcltz_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcltz_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcltz_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vcltz_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vcltz_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcltz_s64(int64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vcltz_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vcltz_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64_t vcltzd_s64(int64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcltzd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcltzd_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcltzd_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcltzs_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcltzs_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vcombine_p64(poly64x1_t __p0, poly64x1_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vcombine_p64(poly64x1_t __p0, poly64x1_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_p8(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \
-  poly8x16_t __ret_278; \
-  poly8x16_t __s0_278 = __p0_278; \
-  poly8x8_t __s2_278 = __p2_278; \
-  __ret_278 = vsetq_lane_p8(vget_lane_p8(__s2_278, __p3_278), __s0_278, __p1_278); \
-  __ret_278; \
-})
-#else
-#define vcopyq_lane_p8(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \
-  poly8x16_t __ret_279; \
-  poly8x16_t __s0_279 = __p0_279; \
-  poly8x8_t __s2_279 = __p2_279; \
-  poly8x16_t __rev0_279;  __rev0_279 = __builtin_shufflevector(__s0_279, __s0_279, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __rev2_279;  __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_279 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_279, __p3_279), __rev0_279, __p1_279); \
-  __ret_279 = __builtin_shufflevector(__ret_279, __ret_279, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_279; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_p16(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \
-  poly16x8_t __ret_280; \
-  poly16x8_t __s0_280 = __p0_280; \
-  poly16x4_t __s2_280 = __p2_280; \
-  __ret_280 = vsetq_lane_p16(vget_lane_p16(__s2_280, __p3_280), __s0_280, __p1_280); \
-  __ret_280; \
-})
-#else
-#define vcopyq_lane_p16(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \
-  poly16x8_t __ret_281; \
-  poly16x8_t __s0_281 = __p0_281; \
-  poly16x4_t __s2_281 = __p2_281; \
-  poly16x8_t __rev0_281;  __rev0_281 = __builtin_shufflevector(__s0_281, __s0_281, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x4_t __rev2_281;  __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \
-  __ret_281 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_281, __p3_281), __rev0_281, __p1_281); \
-  __ret_281 = __builtin_shufflevector(__ret_281, __ret_281, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_281; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u8(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \
-  uint8x16_t __ret_282; \
-  uint8x16_t __s0_282 = __p0_282; \
-  uint8x8_t __s2_282 = __p2_282; \
-  __ret_282 = vsetq_lane_u8(vget_lane_u8(__s2_282, __p3_282), __s0_282, __p1_282); \
-  __ret_282; \
-})
-#else
-#define vcopyq_lane_u8(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \
-  uint8x16_t __ret_283; \
-  uint8x16_t __s0_283 = __p0_283; \
-  uint8x8_t __s2_283 = __p2_283; \
-  uint8x16_t __rev0_283;  __rev0_283 = __builtin_shufflevector(__s0_283, __s0_283, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_283;  __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_283 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_283, __p3_283), __rev0_283, __p1_283); \
-  __ret_283 = __builtin_shufflevector(__ret_283, __ret_283, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_283; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u32(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \
-  uint32x4_t __ret_284; \
-  uint32x4_t __s0_284 = __p0_284; \
-  uint32x2_t __s2_284 = __p2_284; \
-  __ret_284 = vsetq_lane_u32(vget_lane_u32(__s2_284, __p3_284), __s0_284, __p1_284); \
-  __ret_284; \
-})
-#else
-#define vcopyq_lane_u32(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \
-  uint32x4_t __ret_285; \
-  uint32x4_t __s0_285 = __p0_285; \
-  uint32x2_t __s2_285 = __p2_285; \
-  uint32x4_t __rev0_285;  __rev0_285 = __builtin_shufflevector(__s0_285, __s0_285, 3, 2, 1, 0); \
-  uint32x2_t __rev2_285;  __rev2_285 = __builtin_shufflevector(__s2_285, __s2_285, 1, 0); \
-  __ret_285 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_285, __p3_285), __rev0_285, __p1_285); \
-  __ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 3, 2, 1, 0); \
-  __ret_285; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u64(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \
-  uint64x2_t __ret_286; \
-  uint64x2_t __s0_286 = __p0_286; \
-  uint64x1_t __s2_286 = __p2_286; \
-  __ret_286 = vsetq_lane_u64(vget_lane_u64(__s2_286, __p3_286), __s0_286, __p1_286); \
-  __ret_286; \
-})
-#else
-#define vcopyq_lane_u64(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \
-  uint64x2_t __ret_287; \
-  uint64x2_t __s0_287 = __p0_287; \
-  uint64x1_t __s2_287 = __p2_287; \
-  uint64x2_t __rev0_287;  __rev0_287 = __builtin_shufflevector(__s0_287, __s0_287, 1, 0); \
-  __ret_287 = __noswap_vsetq_lane_u64(vget_lane_u64(__s2_287, __p3_287), __rev0_287, __p1_287); \
-  __ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \
-  __ret_287; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_u16(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \
-  uint16x8_t __ret_288; \
-  uint16x8_t __s0_288 = __p0_288; \
-  uint16x4_t __s2_288 = __p2_288; \
-  __ret_288 = vsetq_lane_u16(vget_lane_u16(__s2_288, __p3_288), __s0_288, __p1_288); \
-  __ret_288; \
-})
-#else
-#define vcopyq_lane_u16(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \
-  uint16x8_t __ret_289; \
-  uint16x8_t __s0_289 = __p0_289; \
-  uint16x4_t __s2_289 = __p2_289; \
-  uint16x8_t __rev0_289;  __rev0_289 = __builtin_shufflevector(__s0_289, __s0_289, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev2_289;  __rev2_289 = __builtin_shufflevector(__s2_289, __s2_289, 3, 2, 1, 0); \
-  __ret_289 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_289, __p3_289), __rev0_289, __p1_289); \
-  __ret_289 = __builtin_shufflevector(__ret_289, __ret_289, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_289; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s8(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \
-  int8x16_t __ret_290; \
-  int8x16_t __s0_290 = __p0_290; \
-  int8x8_t __s2_290 = __p2_290; \
-  __ret_290 = vsetq_lane_s8(vget_lane_s8(__s2_290, __p3_290), __s0_290, __p1_290); \
-  __ret_290; \
-})
-#else
-#define vcopyq_lane_s8(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \
-  int8x16_t __ret_291; \
-  int8x16_t __s0_291 = __p0_291; \
-  int8x8_t __s2_291 = __p2_291; \
-  int8x16_t __rev0_291;  __rev0_291 = __builtin_shufflevector(__s0_291, __s0_291, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_291;  __rev2_291 = __builtin_shufflevector(__s2_291, __s2_291, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_291 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_291, __p3_291), __rev0_291, __p1_291); \
-  __ret_291 = __builtin_shufflevector(__ret_291, __ret_291, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_291; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_f32(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \
-  float32x4_t __ret_292; \
-  float32x4_t __s0_292 = __p0_292; \
-  float32x2_t __s2_292 = __p2_292; \
-  __ret_292 = vsetq_lane_f32(vget_lane_f32(__s2_292, __p3_292), __s0_292, __p1_292); \
-  __ret_292; \
-})
-#else
-#define vcopyq_lane_f32(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \
-  float32x4_t __ret_293; \
-  float32x4_t __s0_293 = __p0_293; \
-  float32x2_t __s2_293 = __p2_293; \
-  float32x4_t __rev0_293;  __rev0_293 = __builtin_shufflevector(__s0_293, __s0_293, 3, 2, 1, 0); \
-  float32x2_t __rev2_293;  __rev2_293 = __builtin_shufflevector(__s2_293, __s2_293, 1, 0); \
-  __ret_293 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_293, __p3_293), __rev0_293, __p1_293); \
-  __ret_293 = __builtin_shufflevector(__ret_293, __ret_293, 3, 2, 1, 0); \
-  __ret_293; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s32(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \
-  int32x4_t __ret_294; \
-  int32x4_t __s0_294 = __p0_294; \
-  int32x2_t __s2_294 = __p2_294; \
-  __ret_294 = vsetq_lane_s32(vget_lane_s32(__s2_294, __p3_294), __s0_294, __p1_294); \
-  __ret_294; \
-})
-#else
-#define vcopyq_lane_s32(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \
-  int32x4_t __ret_295; \
-  int32x4_t __s0_295 = __p0_295; \
-  int32x2_t __s2_295 = __p2_295; \
-  int32x4_t __rev0_295;  __rev0_295 = __builtin_shufflevector(__s0_295, __s0_295, 3, 2, 1, 0); \
-  int32x2_t __rev2_295;  __rev2_295 = __builtin_shufflevector(__s2_295, __s2_295, 1, 0); \
-  __ret_295 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_295, __p3_295), __rev0_295, __p1_295); \
-  __ret_295 = __builtin_shufflevector(__ret_295, __ret_295, 3, 2, 1, 0); \
-  __ret_295; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s64(__p0_296, __p1_296, __p2_296, __p3_296) __extension__ ({ \
-  int64x2_t __ret_296; \
-  int64x2_t __s0_296 = __p0_296; \
-  int64x1_t __s2_296 = __p2_296; \
-  __ret_296 = vsetq_lane_s64(vget_lane_s64(__s2_296, __p3_296), __s0_296, __p1_296); \
-  __ret_296; \
-})
-#else
-#define vcopyq_lane_s64(__p0_297, __p1_297, __p2_297, __p3_297) __extension__ ({ \
-  int64x2_t __ret_297; \
-  int64x2_t __s0_297 = __p0_297; \
-  int64x1_t __s2_297 = __p2_297; \
-  int64x2_t __rev0_297;  __rev0_297 = __builtin_shufflevector(__s0_297, __s0_297, 1, 0); \
-  __ret_297 = __noswap_vsetq_lane_s64(vget_lane_s64(__s2_297, __p3_297), __rev0_297, __p1_297); \
-  __ret_297 = __builtin_shufflevector(__ret_297, __ret_297, 1, 0); \
-  __ret_297; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_s16(__p0_298, __p1_298, __p2_298, __p3_298) __extension__ ({ \
-  int16x8_t __ret_298; \
-  int16x8_t __s0_298 = __p0_298; \
-  int16x4_t __s2_298 = __p2_298; \
-  __ret_298 = vsetq_lane_s16(vget_lane_s16(__s2_298, __p3_298), __s0_298, __p1_298); \
-  __ret_298; \
-})
-#else
-#define vcopyq_lane_s16(__p0_299, __p1_299, __p2_299, __p3_299) __extension__ ({ \
-  int16x8_t __ret_299; \
-  int16x8_t __s0_299 = __p0_299; \
-  int16x4_t __s2_299 = __p2_299; \
-  int16x8_t __rev0_299;  __rev0_299 = __builtin_shufflevector(__s0_299, __s0_299, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_299;  __rev2_299 = __builtin_shufflevector(__s2_299, __s2_299, 3, 2, 1, 0); \
-  __ret_299 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_299, __p3_299), __rev0_299, __p1_299); \
-  __ret_299 = __builtin_shufflevector(__ret_299, __ret_299, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_299; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_p8(__p0_300, __p1_300, __p2_300, __p3_300) __extension__ ({ \
-  poly8x8_t __ret_300; \
-  poly8x8_t __s0_300 = __p0_300; \
-  poly8x8_t __s2_300 = __p2_300; \
-  __ret_300 = vset_lane_p8(vget_lane_p8(__s2_300, __p3_300), __s0_300, __p1_300); \
-  __ret_300; \
-})
-#else
-#define vcopy_lane_p8(__p0_301, __p1_301, __p2_301, __p3_301) __extension__ ({ \
-  poly8x8_t __ret_301; \
-  poly8x8_t __s0_301 = __p0_301; \
-  poly8x8_t __s2_301 = __p2_301; \
-  poly8x8_t __rev0_301;  __rev0_301 = __builtin_shufflevector(__s0_301, __s0_301, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x8_t __rev2_301;  __rev2_301 = __builtin_shufflevector(__s2_301, __s2_301, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_301 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_301, __p3_301), __rev0_301, __p1_301); \
-  __ret_301 = __builtin_shufflevector(__ret_301, __ret_301, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_301; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_p16(__p0_302, __p1_302, __p2_302, __p3_302) __extension__ ({ \
-  poly16x4_t __ret_302; \
-  poly16x4_t __s0_302 = __p0_302; \
-  poly16x4_t __s2_302 = __p2_302; \
-  __ret_302 = vset_lane_p16(vget_lane_p16(__s2_302, __p3_302), __s0_302, __p1_302); \
-  __ret_302; \
-})
-#else
-#define vcopy_lane_p16(__p0_303, __p1_303, __p2_303, __p3_303) __extension__ ({ \
-  poly16x4_t __ret_303; \
-  poly16x4_t __s0_303 = __p0_303; \
-  poly16x4_t __s2_303 = __p2_303; \
-  poly16x4_t __rev0_303;  __rev0_303 = __builtin_shufflevector(__s0_303, __s0_303, 3, 2, 1, 0); \
-  poly16x4_t __rev2_303;  __rev2_303 = __builtin_shufflevector(__s2_303, __s2_303, 3, 2, 1, 0); \
-  __ret_303 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_303, __p3_303), __rev0_303, __p1_303); \
-  __ret_303 = __builtin_shufflevector(__ret_303, __ret_303, 3, 2, 1, 0); \
-  __ret_303; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_u8(__p0_304, __p1_304, __p2_304, __p3_304) __extension__ ({ \
-  uint8x8_t __ret_304; \
-  uint8x8_t __s0_304 = __p0_304; \
-  uint8x8_t __s2_304 = __p2_304; \
-  __ret_304 = vset_lane_u8(vget_lane_u8(__s2_304, __p3_304), __s0_304, __p1_304); \
-  __ret_304; \
-})
-#else
-#define vcopy_lane_u8(__p0_305, __p1_305, __p2_305, __p3_305) __extension__ ({ \
-  uint8x8_t __ret_305; \
-  uint8x8_t __s0_305 = __p0_305; \
-  uint8x8_t __s2_305 = __p2_305; \
-  uint8x8_t __rev0_305;  __rev0_305 = __builtin_shufflevector(__s0_305, __s0_305, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_305;  __rev2_305 = __builtin_shufflevector(__s2_305, __s2_305, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_305 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_305, __p3_305), __rev0_305, __p1_305); \
-  __ret_305 = __builtin_shufflevector(__ret_305, __ret_305, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_305; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_u32(__p0_306, __p1_306, __p2_306, __p3_306) __extension__ ({ \
-  uint32x2_t __ret_306; \
-  uint32x2_t __s0_306 = __p0_306; \
-  uint32x2_t __s2_306 = __p2_306; \
-  __ret_306 = vset_lane_u32(vget_lane_u32(__s2_306, __p3_306), __s0_306, __p1_306); \
-  __ret_306; \
-})
-#else
-#define vcopy_lane_u32(__p0_307, __p1_307, __p2_307, __p3_307) __extension__ ({ \
-  uint32x2_t __ret_307; \
-  uint32x2_t __s0_307 = __p0_307; \
-  uint32x2_t __s2_307 = __p2_307; \
-  uint32x2_t __rev0_307;  __rev0_307 = __builtin_shufflevector(__s0_307, __s0_307, 1, 0); \
-  uint32x2_t __rev2_307;  __rev2_307 = __builtin_shufflevector(__s2_307, __s2_307, 1, 0); \
-  __ret_307 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_307, __p3_307), __rev0_307, __p1_307); \
-  __ret_307 = __builtin_shufflevector(__ret_307, __ret_307, 1, 0); \
-  __ret_307; \
-})
-#endif
-
-#define vcopy_lane_u64(__p0_308, __p1_308, __p2_308, __p3_308) __extension__ ({ \
-  uint64x1_t __ret_308; \
-  uint64x1_t __s0_308 = __p0_308; \
-  uint64x1_t __s2_308 = __p2_308; \
-  __ret_308 = vset_lane_u64(vget_lane_u64(__s2_308, __p3_308), __s0_308, __p1_308); \
-  __ret_308; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_u16(__p0_309, __p1_309, __p2_309, __p3_309) __extension__ ({ \
-  uint16x4_t __ret_309; \
-  uint16x4_t __s0_309 = __p0_309; \
-  uint16x4_t __s2_309 = __p2_309; \
-  __ret_309 = vset_lane_u16(vget_lane_u16(__s2_309, __p3_309), __s0_309, __p1_309); \
-  __ret_309; \
-})
-#else
-#define vcopy_lane_u16(__p0_310, __p1_310, __p2_310, __p3_310) __extension__ ({ \
-  uint16x4_t __ret_310; \
-  uint16x4_t __s0_310 = __p0_310; \
-  uint16x4_t __s2_310 = __p2_310; \
-  uint16x4_t __rev0_310;  __rev0_310 = __builtin_shufflevector(__s0_310, __s0_310, 3, 2, 1, 0); \
-  uint16x4_t __rev2_310;  __rev2_310 = __builtin_shufflevector(__s2_310, __s2_310, 3, 2, 1, 0); \
-  __ret_310 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_310, __p3_310), __rev0_310, __p1_310); \
-  __ret_310 = __builtin_shufflevector(__ret_310, __ret_310, 3, 2, 1, 0); \
-  __ret_310; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_s8(__p0_311, __p1_311, __p2_311, __p3_311) __extension__ ({ \
-  int8x8_t __ret_311; \
-  int8x8_t __s0_311 = __p0_311; \
-  int8x8_t __s2_311 = __p2_311; \
-  __ret_311 = vset_lane_s8(vget_lane_s8(__s2_311, __p3_311), __s0_311, __p1_311); \
-  __ret_311; \
-})
-#else
-#define vcopy_lane_s8(__p0_312, __p1_312, __p2_312, __p3_312) __extension__ ({ \
-  int8x8_t __ret_312; \
-  int8x8_t __s0_312 = __p0_312; \
-  int8x8_t __s2_312 = __p2_312; \
-  int8x8_t __rev0_312;  __rev0_312 = __builtin_shufflevector(__s0_312, __s0_312, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_312;  __rev2_312 = __builtin_shufflevector(__s2_312, __s2_312, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_312 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_312, __p3_312), __rev0_312, __p1_312); \
-  __ret_312 = __builtin_shufflevector(__ret_312, __ret_312, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_312; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_f32(__p0_313, __p1_313, __p2_313, __p3_313) __extension__ ({ \
-  float32x2_t __ret_313; \
-  float32x2_t __s0_313 = __p0_313; \
-  float32x2_t __s2_313 = __p2_313; \
-  __ret_313 = vset_lane_f32(vget_lane_f32(__s2_313, __p3_313), __s0_313, __p1_313); \
-  __ret_313; \
-})
-#else
-#define vcopy_lane_f32(__p0_314, __p1_314, __p2_314, __p3_314) __extension__ ({ \
-  float32x2_t __ret_314; \
-  float32x2_t __s0_314 = __p0_314; \
-  float32x2_t __s2_314 = __p2_314; \
-  float32x2_t __rev0_314;  __rev0_314 = __builtin_shufflevector(__s0_314, __s0_314, 1, 0); \
-  float32x2_t __rev2_314;  __rev2_314 = __builtin_shufflevector(__s2_314, __s2_314, 1, 0); \
-  __ret_314 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_314, __p3_314), __rev0_314, __p1_314); \
-  __ret_314 = __builtin_shufflevector(__ret_314, __ret_314, 1, 0); \
-  __ret_314; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_s32(__p0_315, __p1_315, __p2_315, __p3_315) __extension__ ({ \
-  int32x2_t __ret_315; \
-  int32x2_t __s0_315 = __p0_315; \
-  int32x2_t __s2_315 = __p2_315; \
-  __ret_315 = vset_lane_s32(vget_lane_s32(__s2_315, __p3_315), __s0_315, __p1_315); \
-  __ret_315; \
-})
-#else
-#define vcopy_lane_s32(__p0_316, __p1_316, __p2_316, __p3_316) __extension__ ({ \
-  int32x2_t __ret_316; \
-  int32x2_t __s0_316 = __p0_316; \
-  int32x2_t __s2_316 = __p2_316; \
-  int32x2_t __rev0_316;  __rev0_316 = __builtin_shufflevector(__s0_316, __s0_316, 1, 0); \
-  int32x2_t __rev2_316;  __rev2_316 = __builtin_shufflevector(__s2_316, __s2_316, 1, 0); \
-  __ret_316 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_316, __p3_316), __rev0_316, __p1_316); \
-  __ret_316 = __builtin_shufflevector(__ret_316, __ret_316, 1, 0); \
-  __ret_316; \
-})
-#endif
-
-#define vcopy_lane_s64(__p0_317, __p1_317, __p2_317, __p3_317) __extension__ ({ \
-  int64x1_t __ret_317; \
-  int64x1_t __s0_317 = __p0_317; \
-  int64x1_t __s2_317 = __p2_317; \
-  __ret_317 = vset_lane_s64(vget_lane_s64(__s2_317, __p3_317), __s0_317, __p1_317); \
-  __ret_317; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_lane_s16(__p0_318, __p1_318, __p2_318, __p3_318) __extension__ ({ \
-  int16x4_t __ret_318; \
-  int16x4_t __s0_318 = __p0_318; \
-  int16x4_t __s2_318 = __p2_318; \
-  __ret_318 = vset_lane_s16(vget_lane_s16(__s2_318, __p3_318), __s0_318, __p1_318); \
-  __ret_318; \
-})
-#else
-#define vcopy_lane_s16(__p0_319, __p1_319, __p2_319, __p3_319) __extension__ ({ \
-  int16x4_t __ret_319; \
-  int16x4_t __s0_319 = __p0_319; \
-  int16x4_t __s2_319 = __p2_319; \
-  int16x4_t __rev0_319;  __rev0_319 = __builtin_shufflevector(__s0_319, __s0_319, 3, 2, 1, 0); \
-  int16x4_t __rev2_319;  __rev2_319 = __builtin_shufflevector(__s2_319, __s2_319, 3, 2, 1, 0); \
-  __ret_319 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_319, __p3_319), __rev0_319, __p1_319); \
-  __ret_319 = __builtin_shufflevector(__ret_319, __ret_319, 3, 2, 1, 0); \
-  __ret_319; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_p8(__p0_320, __p1_320, __p2_320, __p3_320) __extension__ ({ \
-  poly8x16_t __ret_320; \
-  poly8x16_t __s0_320 = __p0_320; \
-  poly8x16_t __s2_320 = __p2_320; \
-  __ret_320 = vsetq_lane_p8(vgetq_lane_p8(__s2_320, __p3_320), __s0_320, __p1_320); \
-  __ret_320; \
-})
-#else
-#define vcopyq_laneq_p8(__p0_321, __p1_321, __p2_321, __p3_321) __extension__ ({ \
-  poly8x16_t __ret_321; \
-  poly8x16_t __s0_321 = __p0_321; \
-  poly8x16_t __s2_321 = __p2_321; \
-  poly8x16_t __rev0_321;  __rev0_321 = __builtin_shufflevector(__s0_321, __s0_321, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __rev2_321;  __rev2_321 = __builtin_shufflevector(__s2_321, __s2_321, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_321 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_321, __p3_321), __rev0_321, __p1_321); \
-  __ret_321 = __builtin_shufflevector(__ret_321, __ret_321, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_321; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_p16(__p0_322, __p1_322, __p2_322, __p3_322) __extension__ ({ \
-  poly16x8_t __ret_322; \
-  poly16x8_t __s0_322 = __p0_322; \
-  poly16x8_t __s2_322 = __p2_322; \
-  __ret_322 = vsetq_lane_p16(vgetq_lane_p16(__s2_322, __p3_322), __s0_322, __p1_322); \
-  __ret_322; \
-})
-#else
-#define vcopyq_laneq_p16(__p0_323, __p1_323, __p2_323, __p3_323) __extension__ ({ \
-  poly16x8_t __ret_323; \
-  poly16x8_t __s0_323 = __p0_323; \
-  poly16x8_t __s2_323 = __p2_323; \
-  poly16x8_t __rev0_323;  __rev0_323 = __builtin_shufflevector(__s0_323, __s0_323, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly16x8_t __rev2_323;  __rev2_323 = __builtin_shufflevector(__s2_323, __s2_323, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_323 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_323, __p3_323), __rev0_323, __p1_323); \
-  __ret_323 = __builtin_shufflevector(__ret_323, __ret_323, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_323; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u8(__p0_324, __p1_324, __p2_324, __p3_324) __extension__ ({ \
-  uint8x16_t __ret_324; \
-  uint8x16_t __s0_324 = __p0_324; \
-  uint8x16_t __s2_324 = __p2_324; \
-  __ret_324 = vsetq_lane_u8(vgetq_lane_u8(__s2_324, __p3_324), __s0_324, __p1_324); \
-  __ret_324; \
-})
-#else
-#define vcopyq_laneq_u8(__p0_325, __p1_325, __p2_325, __p3_325) __extension__ ({ \
-  uint8x16_t __ret_325; \
-  uint8x16_t __s0_325 = __p0_325; \
-  uint8x16_t __s2_325 = __p2_325; \
-  uint8x16_t __rev0_325;  __rev0_325 = __builtin_shufflevector(__s0_325, __s0_325, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_325;  __rev2_325 = __builtin_shufflevector(__s2_325, __s2_325, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_325 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_325, __p3_325), __rev0_325, __p1_325); \
-  __ret_325 = __builtin_shufflevector(__ret_325, __ret_325, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_325; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u32(__p0_326, __p1_326, __p2_326, __p3_326) __extension__ ({ \
-  uint32x4_t __ret_326; \
-  uint32x4_t __s0_326 = __p0_326; \
-  uint32x4_t __s2_326 = __p2_326; \
-  __ret_326 = vsetq_lane_u32(vgetq_lane_u32(__s2_326, __p3_326), __s0_326, __p1_326); \
-  __ret_326; \
-})
-#else
-#define vcopyq_laneq_u32(__p0_327, __p1_327, __p2_327, __p3_327) __extension__ ({ \
-  uint32x4_t __ret_327; \
-  uint32x4_t __s0_327 = __p0_327; \
-  uint32x4_t __s2_327 = __p2_327; \
-  uint32x4_t __rev0_327;  __rev0_327 = __builtin_shufflevector(__s0_327, __s0_327, 3, 2, 1, 0); \
-  uint32x4_t __rev2_327;  __rev2_327 = __builtin_shufflevector(__s2_327, __s2_327, 3, 2, 1, 0); \
-  __ret_327 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_327, __p3_327), __rev0_327, __p1_327); \
-  __ret_327 = __builtin_shufflevector(__ret_327, __ret_327, 3, 2, 1, 0); \
-  __ret_327; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u64(__p0_328, __p1_328, __p2_328, __p3_328) __extension__ ({ \
-  uint64x2_t __ret_328; \
-  uint64x2_t __s0_328 = __p0_328; \
-  uint64x2_t __s2_328 = __p2_328; \
-  __ret_328 = vsetq_lane_u64(vgetq_lane_u64(__s2_328, __p3_328), __s0_328, __p1_328); \
-  __ret_328; \
-})
-#else
-#define vcopyq_laneq_u64(__p0_329, __p1_329, __p2_329, __p3_329) __extension__ ({ \
-  uint64x2_t __ret_329; \
-  uint64x2_t __s0_329 = __p0_329; \
-  uint64x2_t __s2_329 = __p2_329; \
-  uint64x2_t __rev0_329;  __rev0_329 = __builtin_shufflevector(__s0_329, __s0_329, 1, 0); \
-  uint64x2_t __rev2_329;  __rev2_329 = __builtin_shufflevector(__s2_329, __s2_329, 1, 0); \
-  __ret_329 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_329, __p3_329), __rev0_329, __p1_329); \
-  __ret_329 = __builtin_shufflevector(__ret_329, __ret_329, 1, 0); \
-  __ret_329; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_u16(__p0_330, __p1_330, __p2_330, __p3_330) __extension__ ({ \
-  uint16x8_t __ret_330; \
-  uint16x8_t __s0_330 = __p0_330; \
-  uint16x8_t __s2_330 = __p2_330; \
-  __ret_330 = vsetq_lane_u16(vgetq_lane_u16(__s2_330, __p3_330), __s0_330, __p1_330); \
-  __ret_330; \
-})
-#else
-#define vcopyq_laneq_u16(__p0_331, __p1_331, __p2_331, __p3_331) __extension__ ({ \
-  uint16x8_t __ret_331; \
-  uint16x8_t __s0_331 = __p0_331; \
-  uint16x8_t __s2_331 = __p2_331; \
-  uint16x8_t __rev0_331;  __rev0_331 = __builtin_shufflevector(__s0_331, __s0_331, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev2_331;  __rev2_331 = __builtin_shufflevector(__s2_331, __s2_331, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_331 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_331, __p3_331), __rev0_331, __p1_331); \
-  __ret_331 = __builtin_shufflevector(__ret_331, __ret_331, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_331; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s8(__p0_332, __p1_332, __p2_332, __p3_332) __extension__ ({ \
-  int8x16_t __ret_332; \
-  int8x16_t __s0_332 = __p0_332; \
-  int8x16_t __s2_332 = __p2_332; \
-  __ret_332 = vsetq_lane_s8(vgetq_lane_s8(__s2_332, __p3_332), __s0_332, __p1_332); \
-  __ret_332; \
-})
-#else
-#define vcopyq_laneq_s8(__p0_333, __p1_333, __p2_333, __p3_333) __extension__ ({ \
-  int8x16_t __ret_333; \
-  int8x16_t __s0_333 = __p0_333; \
-  int8x16_t __s2_333 = __p2_333; \
-  int8x16_t __rev0_333;  __rev0_333 = __builtin_shufflevector(__s0_333, __s0_333, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_333;  __rev2_333 = __builtin_shufflevector(__s2_333, __s2_333, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_333 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_333, __p3_333), __rev0_333, __p1_333); \
-  __ret_333 = __builtin_shufflevector(__ret_333, __ret_333, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_333; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_f32(__p0_334, __p1_334, __p2_334, __p3_334) __extension__ ({ \
-  float32x4_t __ret_334; \
-  float32x4_t __s0_334 = __p0_334; \
-  float32x4_t __s2_334 = __p2_334; \
-  __ret_334 = vsetq_lane_f32(vgetq_lane_f32(__s2_334, __p3_334), __s0_334, __p1_334); \
-  __ret_334; \
-})
-#else
-#define vcopyq_laneq_f32(__p0_335, __p1_335, __p2_335, __p3_335) __extension__ ({ \
-  float32x4_t __ret_335; \
-  float32x4_t __s0_335 = __p0_335; \
-  float32x4_t __s2_335 = __p2_335; \
-  float32x4_t __rev0_335;  __rev0_335 = __builtin_shufflevector(__s0_335, __s0_335, 3, 2, 1, 0); \
-  float32x4_t __rev2_335;  __rev2_335 = __builtin_shufflevector(__s2_335, __s2_335, 3, 2, 1, 0); \
-  __ret_335 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_335, __p3_335), __rev0_335, __p1_335); \
-  __ret_335 = __builtin_shufflevector(__ret_335, __ret_335, 3, 2, 1, 0); \
-  __ret_335; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s32(__p0_336, __p1_336, __p2_336, __p3_336) __extension__ ({ \
-  int32x4_t __ret_336; \
-  int32x4_t __s0_336 = __p0_336; \
-  int32x4_t __s2_336 = __p2_336; \
-  __ret_336 = vsetq_lane_s32(vgetq_lane_s32(__s2_336, __p3_336), __s0_336, __p1_336); \
-  __ret_336; \
-})
-#else
-#define vcopyq_laneq_s32(__p0_337, __p1_337, __p2_337, __p3_337) __extension__ ({ \
-  int32x4_t __ret_337; \
-  int32x4_t __s0_337 = __p0_337; \
-  int32x4_t __s2_337 = __p2_337; \
-  int32x4_t __rev0_337;  __rev0_337 = __builtin_shufflevector(__s0_337, __s0_337, 3, 2, 1, 0); \
-  int32x4_t __rev2_337;  __rev2_337 = __builtin_shufflevector(__s2_337, __s2_337, 3, 2, 1, 0); \
-  __ret_337 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_337, __p3_337), __rev0_337, __p1_337); \
-  __ret_337 = __builtin_shufflevector(__ret_337, __ret_337, 3, 2, 1, 0); \
-  __ret_337; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s64(__p0_338, __p1_338, __p2_338, __p3_338) __extension__ ({ \
-  int64x2_t __ret_338; \
-  int64x2_t __s0_338 = __p0_338; \
-  int64x2_t __s2_338 = __p2_338; \
-  __ret_338 = vsetq_lane_s64(vgetq_lane_s64(__s2_338, __p3_338), __s0_338, __p1_338); \
-  __ret_338; \
-})
-#else
-#define vcopyq_laneq_s64(__p0_339, __p1_339, __p2_339, __p3_339) __extension__ ({ \
-  int64x2_t __ret_339; \
-  int64x2_t __s0_339 = __p0_339; \
-  int64x2_t __s2_339 = __p2_339; \
-  int64x2_t __rev0_339;  __rev0_339 = __builtin_shufflevector(__s0_339, __s0_339, 1, 0); \
-  int64x2_t __rev2_339;  __rev2_339 = __builtin_shufflevector(__s2_339, __s2_339, 1, 0); \
-  __ret_339 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_339, __p3_339), __rev0_339, __p1_339); \
-  __ret_339 = __builtin_shufflevector(__ret_339, __ret_339, 1, 0); \
-  __ret_339; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_s16(__p0_340, __p1_340, __p2_340, __p3_340) __extension__ ({ \
-  int16x8_t __ret_340; \
-  int16x8_t __s0_340 = __p0_340; \
-  int16x8_t __s2_340 = __p2_340; \
-  __ret_340 = vsetq_lane_s16(vgetq_lane_s16(__s2_340, __p3_340), __s0_340, __p1_340); \
-  __ret_340; \
-})
-#else
-#define vcopyq_laneq_s16(__p0_341, __p1_341, __p2_341, __p3_341) __extension__ ({ \
-  int16x8_t __ret_341; \
-  int16x8_t __s0_341 = __p0_341; \
-  int16x8_t __s2_341 = __p2_341; \
-  int16x8_t __rev0_341;  __rev0_341 = __builtin_shufflevector(__s0_341, __s0_341, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_341;  __rev2_341 = __builtin_shufflevector(__s2_341, __s2_341, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_341 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_341, __p3_341), __rev0_341, __p1_341); \
-  __ret_341 = __builtin_shufflevector(__ret_341, __ret_341, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_341; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_p8(__p0_342, __p1_342, __p2_342, __p3_342) __extension__ ({ \
-  poly8x8_t __ret_342; \
-  poly8x8_t __s0_342 = __p0_342; \
-  poly8x16_t __s2_342 = __p2_342; \
-  __ret_342 = vset_lane_p8(vgetq_lane_p8(__s2_342, __p3_342), __s0_342, __p1_342); \
-  __ret_342; \
-})
-#else
-#define vcopy_laneq_p8(__p0_343, __p1_343, __p2_343, __p3_343) __extension__ ({ \
-  poly8x8_t __ret_343; \
-  poly8x8_t __s0_343 = __p0_343; \
-  poly8x16_t __s2_343 = __p2_343; \
-  poly8x8_t __rev0_343;  __rev0_343 = __builtin_shufflevector(__s0_343, __s0_343, 7, 6, 5, 4, 3, 2, 1, 0); \
-  poly8x16_t __rev2_343;  __rev2_343 = __builtin_shufflevector(__s2_343, __s2_343, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_343 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_343, __p3_343), __rev0_343, __p1_343); \
-  __ret_343 = __builtin_shufflevector(__ret_343, __ret_343, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_343; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_p16(__p0_344, __p1_344, __p2_344, __p3_344) __extension__ ({ \
-  poly16x4_t __ret_344; \
-  poly16x4_t __s0_344 = __p0_344; \
-  poly16x8_t __s2_344 = __p2_344; \
-  __ret_344 = vset_lane_p16(vgetq_lane_p16(__s2_344, __p3_344), __s0_344, __p1_344); \
-  __ret_344; \
-})
-#else
-#define vcopy_laneq_p16(__p0_345, __p1_345, __p2_345, __p3_345) __extension__ ({ \
-  poly16x4_t __ret_345; \
-  poly16x4_t __s0_345 = __p0_345; \
-  poly16x8_t __s2_345 = __p2_345; \
-  poly16x4_t __rev0_345;  __rev0_345 = __builtin_shufflevector(__s0_345, __s0_345, 3, 2, 1, 0); \
-  poly16x8_t __rev2_345;  __rev2_345 = __builtin_shufflevector(__s2_345, __s2_345, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_345 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_345, __p3_345), __rev0_345, __p1_345); \
-  __ret_345 = __builtin_shufflevector(__ret_345, __ret_345, 3, 2, 1, 0); \
-  __ret_345; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u8(__p0_346, __p1_346, __p2_346, __p3_346) __extension__ ({ \
-  uint8x8_t __ret_346; \
-  uint8x8_t __s0_346 = __p0_346; \
-  uint8x16_t __s2_346 = __p2_346; \
-  __ret_346 = vset_lane_u8(vgetq_lane_u8(__s2_346, __p3_346), __s0_346, __p1_346); \
-  __ret_346; \
-})
-#else
-#define vcopy_laneq_u8(__p0_347, __p1_347, __p2_347, __p3_347) __extension__ ({ \
-  uint8x8_t __ret_347; \
-  uint8x8_t __s0_347 = __p0_347; \
-  uint8x16_t __s2_347 = __p2_347; \
-  uint8x8_t __rev0_347;  __rev0_347 = __builtin_shufflevector(__s0_347, __s0_347, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x16_t __rev2_347;  __rev2_347 = __builtin_shufflevector(__s2_347, __s2_347, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_347 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_347, __p3_347), __rev0_347, __p1_347); \
-  __ret_347 = __builtin_shufflevector(__ret_347, __ret_347, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_347; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u32(__p0_348, __p1_348, __p2_348, __p3_348) __extension__ ({ \
-  uint32x2_t __ret_348; \
-  uint32x2_t __s0_348 = __p0_348; \
-  uint32x4_t __s2_348 = __p2_348; \
-  __ret_348 = vset_lane_u32(vgetq_lane_u32(__s2_348, __p3_348), __s0_348, __p1_348); \
-  __ret_348; \
-})
-#else
-#define vcopy_laneq_u32(__p0_349, __p1_349, __p2_349, __p3_349) __extension__ ({ \
-  uint32x2_t __ret_349; \
-  uint32x2_t __s0_349 = __p0_349; \
-  uint32x4_t __s2_349 = __p2_349; \
-  uint32x2_t __rev0_349;  __rev0_349 = __builtin_shufflevector(__s0_349, __s0_349, 1, 0); \
-  uint32x4_t __rev2_349;  __rev2_349 = __builtin_shufflevector(__s2_349, __s2_349, 3, 2, 1, 0); \
-  __ret_349 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_349, __p3_349), __rev0_349, __p1_349); \
-  __ret_349 = __builtin_shufflevector(__ret_349, __ret_349, 1, 0); \
-  __ret_349; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u64(__p0_350, __p1_350, __p2_350, __p3_350) __extension__ ({ \
-  uint64x1_t __ret_350; \
-  uint64x1_t __s0_350 = __p0_350; \
-  uint64x2_t __s2_350 = __p2_350; \
-  __ret_350 = vset_lane_u64(vgetq_lane_u64(__s2_350, __p3_350), __s0_350, __p1_350); \
-  __ret_350; \
-})
-#else
-#define vcopy_laneq_u64(__p0_351, __p1_351, __p2_351, __p3_351) __extension__ ({ \
-  uint64x1_t __ret_351; \
-  uint64x1_t __s0_351 = __p0_351; \
-  uint64x2_t __s2_351 = __p2_351; \
-  uint64x2_t __rev2_351;  __rev2_351 = __builtin_shufflevector(__s2_351, __s2_351, 1, 0); \
-  __ret_351 = vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_351, __p3_351), __s0_351, __p1_351); \
-  __ret_351; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_u16(__p0_352, __p1_352, __p2_352, __p3_352) __extension__ ({ \
-  uint16x4_t __ret_352; \
-  uint16x4_t __s0_352 = __p0_352; \
-  uint16x8_t __s2_352 = __p2_352; \
-  __ret_352 = vset_lane_u16(vgetq_lane_u16(__s2_352, __p3_352), __s0_352, __p1_352); \
-  __ret_352; \
-})
-#else
-#define vcopy_laneq_u16(__p0_353, __p1_353, __p2_353, __p3_353) __extension__ ({ \
-  uint16x4_t __ret_353; \
-  uint16x4_t __s0_353 = __p0_353; \
-  uint16x8_t __s2_353 = __p2_353; \
-  uint16x4_t __rev0_353;  __rev0_353 = __builtin_shufflevector(__s0_353, __s0_353, 3, 2, 1, 0); \
-  uint16x8_t __rev2_353;  __rev2_353 = __builtin_shufflevector(__s2_353, __s2_353, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_353 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_353, __p3_353), __rev0_353, __p1_353); \
-  __ret_353 = __builtin_shufflevector(__ret_353, __ret_353, 3, 2, 1, 0); \
-  __ret_353; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s8(__p0_354, __p1_354, __p2_354, __p3_354) __extension__ ({ \
-  int8x8_t __ret_354; \
-  int8x8_t __s0_354 = __p0_354; \
-  int8x16_t __s2_354 = __p2_354; \
-  __ret_354 = vset_lane_s8(vgetq_lane_s8(__s2_354, __p3_354), __s0_354, __p1_354); \
-  __ret_354; \
-})
-#else
-#define vcopy_laneq_s8(__p0_355, __p1_355, __p2_355, __p3_355) __extension__ ({ \
-  int8x8_t __ret_355; \
-  int8x8_t __s0_355 = __p0_355; \
-  int8x16_t __s2_355 = __p2_355; \
-  int8x8_t __rev0_355;  __rev0_355 = __builtin_shufflevector(__s0_355, __s0_355, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x16_t __rev2_355;  __rev2_355 = __builtin_shufflevector(__s2_355, __s2_355, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_355 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_355, __p3_355), __rev0_355, __p1_355); \
-  __ret_355 = __builtin_shufflevector(__ret_355, __ret_355, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_355; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_f32(__p0_356, __p1_356, __p2_356, __p3_356) __extension__ ({ \
-  float32x2_t __ret_356; \
-  float32x2_t __s0_356 = __p0_356; \
-  float32x4_t __s2_356 = __p2_356; \
-  __ret_356 = vset_lane_f32(vgetq_lane_f32(__s2_356, __p3_356), __s0_356, __p1_356); \
-  __ret_356; \
-})
-#else
-#define vcopy_laneq_f32(__p0_357, __p1_357, __p2_357, __p3_357) __extension__ ({ \
-  float32x2_t __ret_357; \
-  float32x2_t __s0_357 = __p0_357; \
-  float32x4_t __s2_357 = __p2_357; \
-  float32x2_t __rev0_357;  __rev0_357 = __builtin_shufflevector(__s0_357, __s0_357, 1, 0); \
-  float32x4_t __rev2_357;  __rev2_357 = __builtin_shufflevector(__s2_357, __s2_357, 3, 2, 1, 0); \
-  __ret_357 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_357, __p3_357), __rev0_357, __p1_357); \
-  __ret_357 = __builtin_shufflevector(__ret_357, __ret_357, 1, 0); \
-  __ret_357; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s32(__p0_358, __p1_358, __p2_358, __p3_358) __extension__ ({ \
-  int32x2_t __ret_358; \
-  int32x2_t __s0_358 = __p0_358; \
-  int32x4_t __s2_358 = __p2_358; \
-  __ret_358 = vset_lane_s32(vgetq_lane_s32(__s2_358, __p3_358), __s0_358, __p1_358); \
-  __ret_358; \
-})
-#else
-#define vcopy_laneq_s32(__p0_359, __p1_359, __p2_359, __p3_359) __extension__ ({ \
-  int32x2_t __ret_359; \
-  int32x2_t __s0_359 = __p0_359; \
-  int32x4_t __s2_359 = __p2_359; \
-  int32x2_t __rev0_359;  __rev0_359 = __builtin_shufflevector(__s0_359, __s0_359, 1, 0); \
-  int32x4_t __rev2_359;  __rev2_359 = __builtin_shufflevector(__s2_359, __s2_359, 3, 2, 1, 0); \
-  __ret_359 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_359, __p3_359), __rev0_359, __p1_359); \
-  __ret_359 = __builtin_shufflevector(__ret_359, __ret_359, 1, 0); \
-  __ret_359; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s64(__p0_360, __p1_360, __p2_360, __p3_360) __extension__ ({ \
-  int64x1_t __ret_360; \
-  int64x1_t __s0_360 = __p0_360; \
-  int64x2_t __s2_360 = __p2_360; \
-  __ret_360 = vset_lane_s64(vgetq_lane_s64(__s2_360, __p3_360), __s0_360, __p1_360); \
-  __ret_360; \
-})
-#else
-#define vcopy_laneq_s64(__p0_361, __p1_361, __p2_361, __p3_361) __extension__ ({ \
-  int64x1_t __ret_361; \
-  int64x1_t __s0_361 = __p0_361; \
-  int64x2_t __s2_361 = __p2_361; \
-  int64x2_t __rev2_361;  __rev2_361 = __builtin_shufflevector(__s2_361, __s2_361, 1, 0); \
-  __ret_361 = vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_361, __p3_361), __s0_361, __p1_361); \
-  __ret_361; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_s16(__p0_362, __p1_362, __p2_362, __p3_362) __extension__ ({ \
-  int16x4_t __ret_362; \
-  int16x4_t __s0_362 = __p0_362; \
-  int16x8_t __s2_362 = __p2_362; \
-  __ret_362 = vset_lane_s16(vgetq_lane_s16(__s2_362, __p3_362), __s0_362, __p1_362); \
-  __ret_362; \
-})
-#else
-#define vcopy_laneq_s16(__p0_363, __p1_363, __p2_363, __p3_363) __extension__ ({ \
-  int16x4_t __ret_363; \
-  int16x4_t __s0_363 = __p0_363; \
-  int16x8_t __s2_363 = __p2_363; \
-  int16x4_t __rev0_363;  __rev0_363 = __builtin_shufflevector(__s0_363, __s0_363, 3, 2, 1, 0); \
-  int16x8_t __rev2_363;  __rev2_363 = __builtin_shufflevector(__s2_363, __s2_363, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_363 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_363, __p3_363), __rev0_363, __p1_363); \
-  __ret_363 = __builtin_shufflevector(__ret_363, __ret_363, 3, 2, 1, 0); \
-  __ret_363; \
-})
-#endif
-
-#define vcreate_p64(__p0) __extension__ ({ \
-  poly64x1_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (poly64x1_t)(__promote); \
-  __ret; \
-})
-#define vcreate_f64(__p0) __extension__ ({ \
-  float64x1_t __ret; \
-  uint64_t __promote = __p0; \
-  __ret = (float64x1_t)(__promote); \
-  __ret; \
-})
-__ai __attribute__((target("neon"))) float32_t vcvts_f32_s32(int32_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vcvts_f32_s32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vcvts_f32_u32(uint32_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vcvts_f32_u32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vcvt_f32_f64(float64x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vcvt_f32_f64(float64x2_t __p0) {
-  float32x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t __noswap_vcvt_f32_f64(float64x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__p0, 9);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64_t vcvtd_f64_s64(int64_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vcvtd_f64_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64_t vcvtd_f64_u64(uint64_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vcvtd_f64_u64(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vcvtq_f64_u64(uint64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vcvtq_f64_u64(uint64x2_t __p0) {
-  float64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vcvtq_f64_s64(int64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vcvtq_f64_s64(int64x2_t __p0) {
-  float64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vcvt_f64_u64(uint64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vcvt_f64_s64(int64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vcvt_f64_f32(float32x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vcvt_f64_f32(float32x2_t __p0) {
-  float64x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t __noswap_vcvt_f64_f32(float32x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__p0, 42);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vcvt_high_f16_f32(float16x4_t __p0, float32x4_t __p1) {
-  float16x8_t __ret;
-  __ret = vcombine_f16(__p0, vcvt_f16_f32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vcvt_high_f16_f32(float16x4_t __p0, float32x4_t __p1) {
-  float16x8_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_f16(__rev0, __noswap_vcvt_f16_f32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vcvt_high_f32_f16(float16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = vcvt_f32_f16(vget_high_f16(__p0));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vcvt_high_f32_f16(float16x8_t __p0) {
-  float32x4_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcvt_f32_f16(__noswap_vget_high_f16(__rev0));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vcvt_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
-  float32x4_t __ret;
-  __ret = vcombine_f32(__p0, vcvt_f32_f64(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vcvt_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
-  float32x4_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vcombine_f32(__rev0, __noswap_vcvt_f32_f64(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vcvt_high_f64_f32(float32x4_t __p0) {
-  float64x2_t __ret;
-  __ret = vcvt_f64_f32(vget_high_f32(__p0));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vcvt_high_f64_f32(float32x4_t __p0) {
-  float64x2_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vcvt_f64_f32(__noswap_vget_high_f32(__rev0));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#define vcvts_n_f32_u32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  uint32_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vcvts_n_f32_u32(__s0, __p1); \
-  __ret; \
-})
-#define vcvts_n_f32_s32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vcvts_n_f32_s32(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_f64_u64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vcvtq_n_f64_u64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_f64_s64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define vcvtq_n_f64_s64(__p0, __p1) __extension__ ({ \
-  float64x2_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vcvt_n_f64_u64(__p0, __p1) __extension__ ({ \
-  float64x1_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#define vcvt_n_f64_s64(__p0, __p1) __extension__ ({ \
-  float64x1_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#define vcvtd_n_f64_u64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_u64(__s0, __p1); \
-  __ret; \
-})
-#define vcvtd_n_f64_s64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_s64(__s0, __p1); \
-  __ret; \
-})
-#define vcvts_n_s32_f32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  float32_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vcvts_n_s32_f32(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_s64_f64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (int64x2_t) __builtin_neon_vcvtq_n_s64_v((int8x16_t)__s0, __p1, 35); \
-  __ret; \
-})
-#else
-#define vcvtq_n_s64_f64(__p0, __p1) __extension__ ({ \
-  int64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vcvtq_n_s64_v((int8x16_t)__rev0, __p1, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vcvt_n_s64_f64(__p0, __p1) __extension__ ({ \
-  int64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (int64x1_t) __builtin_neon_vcvt_n_s64_v((int8x8_t)__s0, __p1, 3); \
-  __ret; \
-})
-#define vcvtd_n_s64_f64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  float64_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vcvtd_n_s64_f64(__s0, __p1); \
-  __ret; \
-})
-#define vcvts_n_u32_f32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  float32_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vcvts_n_u32_f32(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcvtq_n_u64_f64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (uint64x2_t) __builtin_neon_vcvtq_n_u64_v((int8x16_t)__s0, __p1, 51); \
-  __ret; \
-})
-#else
-#define vcvtq_n_u64_f64(__p0, __p1) __extension__ ({ \
-  uint64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vcvtq_n_u64_v((int8x16_t)__rev0, __p1, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vcvt_n_u64_f64(__p0, __p1) __extension__ ({ \
-  uint64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (uint64x1_t) __builtin_neon_vcvt_n_u64_v((int8x8_t)__s0, __p1, 19); \
-  __ret; \
-})
-#define vcvtd_n_u64_f64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  float64_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vcvtd_n_u64_f64(__s0, __p1); \
-  __ret; \
-})
-__ai __attribute__((target("neon"))) int32_t vcvts_s32_f32(float32_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vcvts_s32_f32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vcvtd_s64_f64(float64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vcvtd_s64_f64(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vcvtq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vcvtq_s64_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vcvtq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vcvtq_s64_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vcvt_s64_f64(float64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vcvt_s64_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcvts_u32_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcvts_u32_f32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcvtd_u64_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcvtd_u64_f64(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcvtq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcvtq_u64_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcvtq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcvtq_u64_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcvt_u64_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcvt_u64_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vcvtas_s32_f32(float32_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vcvtas_s32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vcvtaq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vcvtaq_s64_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vcvtaq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vcvtaq_s64_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vcvta_s64_f64(float64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vcvta_s64_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vcvtad_s64_f64(float64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vcvtad_s64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcvtas_u32_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcvtas_u32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcvtaq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcvtaq_u64_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcvtaq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcvtaq_u64_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcvta_u64_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcvta_u64_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcvtad_u64_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcvtad_u64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vcvtms_s32_f32(float32_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vcvtms_s32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vcvtmq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vcvtmq_s64_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vcvtmq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vcvtmq_s64_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vcvtm_s64_f64(float64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vcvtm_s64_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vcvtmd_s64_f64(float64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vcvtmd_s64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcvtms_u32_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcvtms_u32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcvtmq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcvtmq_u64_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcvtmq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcvtmq_u64_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcvtm_u64_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcvtm_u64_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcvtmd_u64_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcvtmd_u64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vcvtns_s32_f32(float32_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vcvtns_s32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vcvtnq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vcvtnq_s64_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vcvtnq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vcvtnq_s64_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vcvtn_s64_f64(float64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vcvtn_s64_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vcvtnd_s64_f64(float64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vcvtnd_s64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcvtns_u32_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcvtns_u32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcvtnq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcvtnq_u64_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcvtnq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcvtnq_u64_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcvtn_u64_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcvtn_u64_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcvtnd_u64_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcvtnd_u64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vcvtps_s32_f32(float32_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vcvtps_s32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vcvtpq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vcvtpq_s64_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vcvtpq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vcvtpq_s64_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vcvtp_s64_f64(float64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vcvtp_s64_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vcvtpd_s64_f64(float64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vcvtpd_s64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vcvtps_u32_f32(float32_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vcvtps_u32_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vcvtpq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vcvtpq_u64_v((int8x16_t)__p0, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vcvtpq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vcvtpq_u64_v((int8x16_t)__rev0, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vcvtp_u64_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vcvtp_u64_v((int8x8_t)__p0, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vcvtpd_u64_f64(float64_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vcvtpd_u64_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vcvtxd_f32_f64(float64_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vcvtxd_f32_f64(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vcvtx_f32_f64(float64x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vcvtx_f32_f64(float64x2_t __p0) {
-  float32x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t __noswap_vcvtx_f32_f64(float64x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vcvtx_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
-  float32x4_t __ret;
-  __ret = vcombine_f32(__p0, vcvtx_f32_f64(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vcvtx_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
-  float32x4_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vcombine_f32(__rev0, __noswap_vcvtx_f32_f64(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vdivq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __p0 / __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vdivq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 / __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vdivq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __p0 / __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vdivq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 / __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vdiv_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = __p0 / __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __p0 / __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 / __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupb_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  __ret = (poly8_t) __builtin_neon_vdupb_lane_i8((poly8x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupb_lane_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x8_t __s0 = __p0; \
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8_t) __builtin_neon_vdupb_lane_i8((poly8x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vduph_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  __ret = (poly16_t) __builtin_neon_vduph_lane_i16((poly16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_lane_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x4_t __s0 = __p0; \
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (poly16_t) __builtin_neon_vduph_lane_i16((poly16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupb_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupb_lane_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x8_t __s0 = __p0; \
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdups_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vdups_lane_i32((int32x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdups_lane_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x2_t __s0 = __p0; \
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint32_t) __builtin_neon_vdups_lane_i32((int32x2_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#define vdupd_lane_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64x1_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vdupd_lane_i64((int64x1_t)__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vduph_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vduph_lane_i16((int16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_lane_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x4_t __s0 = __p0; \
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint16_t) __builtin_neon_vduph_lane_i16((int16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupb_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupb_lane_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x8_t __s0 = __p0; \
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#define vdupd_lane_f64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (float64_t) __builtin_neon_vdupd_lane_f64((float64x1_t)__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vdups_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x2_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vdups_lane_f32((float32x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdups_lane_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float32_t) __builtin_neon_vdups_lane_f32((float32x2_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdups_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x2_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vdups_lane_i32((int32x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdups_lane_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vdups_lane_i32((int32x2_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#define vdupd_lane_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64x1_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vdupd_lane_i64((int64x1_t)__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vduph_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x4_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vduph_lane_i16((int16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_lane_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int16_t) __builtin_neon_vduph_lane_i16((int16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vduph_lane_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vduph_lane_f16((float16x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_lane_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16x4_t __s0 = __p0; \
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vduph_lane_f16((float16x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#define vdup_lane_p64(__p0_364, __p1_364) __extension__ ({ \
-  poly64x1_t __ret_364; \
-  poly64x1_t __s0_364 = __p0_364; \
-  __ret_364 = splat_lane_p64(__s0_364, __p1_364); \
-  __ret_364; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_p64(__p0_365, __p1_365) __extension__ ({ \
-  poly64x2_t __ret_365; \
-  poly64x1_t __s0_365 = __p0_365; \
-  __ret_365 = splatq_lane_p64(__s0_365, __p1_365); \
-  __ret_365; \
-})
-#else
-#define vdupq_lane_p64(__p0_366, __p1_366) __extension__ ({ \
-  poly64x2_t __ret_366; \
-  poly64x1_t __s0_366 = __p0_366; \
-  __ret_366 = __noswap_splatq_lane_p64(__s0_366, __p1_366); \
-  __ret_366 = __builtin_shufflevector(__ret_366, __ret_366, 1, 0); \
-  __ret_366; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_lane_f64(__p0_367, __p1_367) __extension__ ({ \
-  float64x2_t __ret_367; \
-  float64x1_t __s0_367 = __p0_367; \
-  __ret_367 = splatq_lane_f64(__s0_367, __p1_367); \
-  __ret_367; \
-})
-#else
-#define vdupq_lane_f64(__p0_368, __p1_368) __extension__ ({ \
-  float64x2_t __ret_368; \
-  float64x1_t __s0_368 = __p0_368; \
-  __ret_368 = __noswap_splatq_lane_f64(__s0_368, __p1_368); \
-  __ret_368 = __builtin_shufflevector(__ret_368, __ret_368, 1, 0); \
-  __ret_368; \
-})
-#endif
-
-#define vdup_lane_f64(__p0_369, __p1_369) __extension__ ({ \
-  float64x1_t __ret_369; \
-  float64x1_t __s0_369 = __p0_369; \
-  __ret_369 = splat_lane_f64(__s0_369, __p1_369); \
-  __ret_369; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vdupb_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  __ret = (poly8_t) __builtin_neon_vdupb_laneq_i8((poly8x16_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupb_laneq_p8(__p0, __p1) __extension__ ({ \
-  poly8_t __ret; \
-  poly8x16_t __s0 = __p0; \
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly8_t) __builtin_neon_vdupb_laneq_i8((poly8x16_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vduph_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  __ret = (poly16_t) __builtin_neon_vduph_laneq_i16((poly16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_laneq_p16(__p0, __p1) __extension__ ({ \
-  poly16_t __ret; \
-  poly16x8_t __s0 = __p0; \
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (poly16_t) __builtin_neon_vduph_laneq_i16((poly16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupb_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupb_laneq_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8x16_t __s0 = __p0; \
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdups_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vdups_laneq_i32((int32x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdups_laneq_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (uint32_t) __builtin_neon_vdups_laneq_i32((int32x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupd_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vdupd_laneq_i64((int64x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupd_laneq_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (uint64_t) __builtin_neon_vdupd_laneq_i64((int64x2_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vduph_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vduph_laneq_i16((int16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_laneq_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16x8_t __s0 = __p0; \
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (uint16_t) __builtin_neon_vduph_laneq_i16((int16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupb_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupb_laneq_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8x16_t __s0 = __p0; \
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupd_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (float64_t) __builtin_neon_vdupd_laneq_f64((float64x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupd_laneq_f64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float64_t) __builtin_neon_vdupd_laneq_f64((float64x2_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdups_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x4_t __s0 = __p0; \
-  __ret = (float32_t) __builtin_neon_vdups_laneq_f32((float32x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdups_laneq_f32(__p0, __p1) __extension__ ({ \
-  float32_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (float32_t) __builtin_neon_vdups_laneq_f32((float32x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdups_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x4_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vdups_laneq_i32((int32x4_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdups_laneq_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vdups_laneq_i32((int32x4_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupd_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64x2_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vdupd_laneq_i64((int64x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vdupd_laneq_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64x2_t __s0 = __p0; \
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (int64_t) __builtin_neon_vdupd_laneq_i64((int64x2_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vduph_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x8_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vduph_laneq_i16((int16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_laneq_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16_t) __builtin_neon_vduph_laneq_i16((int16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  __ret = (float16_t) __builtin_neon_vduph_laneq_f16((float16x8_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \
-  float16_t __ret; \
-  float16x8_t __s0 = __p0; \
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (float16_t) __builtin_neon_vduph_laneq_f16((float16x8_t)__rev0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_p8(__p0_370, __p1_370) __extension__ ({ \
-  poly8x8_t __ret_370; \
-  poly8x16_t __s0_370 = __p0_370; \
-  __ret_370 = splat_laneq_p8(__s0_370, __p1_370); \
-  __ret_370; \
-})
-#else
-#define vdup_laneq_p8(__p0_371, __p1_371) __extension__ ({ \
-  poly8x8_t __ret_371; \
-  poly8x16_t __s0_371 = __p0_371; \
-  poly8x16_t __rev0_371;  __rev0_371 = __builtin_shufflevector(__s0_371, __s0_371, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_371 = __noswap_splat_laneq_p8(__rev0_371, __p1_371); \
-  __ret_371 = __builtin_shufflevector(__ret_371, __ret_371, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_371; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_p64(__p0_372, __p1_372) __extension__ ({ \
-  poly64x1_t __ret_372; \
-  poly64x2_t __s0_372 = __p0_372; \
-  __ret_372 = splat_laneq_p64(__s0_372, __p1_372); \
-  __ret_372; \
-})
-#else
-#define vdup_laneq_p64(__p0_373, __p1_373) __extension__ ({ \
-  poly64x1_t __ret_373; \
-  poly64x2_t __s0_373 = __p0_373; \
-  poly64x2_t __rev0_373;  __rev0_373 = __builtin_shufflevector(__s0_373, __s0_373, 1, 0); \
-  __ret_373 = __noswap_splat_laneq_p64(__rev0_373, __p1_373); \
-  __ret_373; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_p16(__p0_374, __p1_374) __extension__ ({ \
-  poly16x4_t __ret_374; \
-  poly16x8_t __s0_374 = __p0_374; \
-  __ret_374 = splat_laneq_p16(__s0_374, __p1_374); \
-  __ret_374; \
-})
-#else
-#define vdup_laneq_p16(__p0_375, __p1_375) __extension__ ({ \
-  poly16x4_t __ret_375; \
-  poly16x8_t __s0_375 = __p0_375; \
-  poly16x8_t __rev0_375;  __rev0_375 = __builtin_shufflevector(__s0_375, __s0_375, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_375 = __noswap_splat_laneq_p16(__rev0_375, __p1_375); \
-  __ret_375 = __builtin_shufflevector(__ret_375, __ret_375, 3, 2, 1, 0); \
-  __ret_375; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_p8(__p0_376, __p1_376) __extension__ ({ \
-  poly8x16_t __ret_376; \
-  poly8x16_t __s0_376 = __p0_376; \
-  __ret_376 = splatq_laneq_p8(__s0_376, __p1_376); \
-  __ret_376; \
-})
-#else
-#define vdupq_laneq_p8(__p0_377, __p1_377) __extension__ ({ \
-  poly8x16_t __ret_377; \
-  poly8x16_t __s0_377 = __p0_377; \
-  poly8x16_t __rev0_377;  __rev0_377 = __builtin_shufflevector(__s0_377, __s0_377, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_377 = __noswap_splatq_laneq_p8(__rev0_377, __p1_377); \
-  __ret_377 = __builtin_shufflevector(__ret_377, __ret_377, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_377; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_p64(__p0_378, __p1_378) __extension__ ({ \
-  poly64x2_t __ret_378; \
-  poly64x2_t __s0_378 = __p0_378; \
-  __ret_378 = splatq_laneq_p64(__s0_378, __p1_378); \
-  __ret_378; \
-})
-#else
-#define vdupq_laneq_p64(__p0_379, __p1_379) __extension__ ({ \
-  poly64x2_t __ret_379; \
-  poly64x2_t __s0_379 = __p0_379; \
-  poly64x2_t __rev0_379;  __rev0_379 = __builtin_shufflevector(__s0_379, __s0_379, 1, 0); \
-  __ret_379 = __noswap_splatq_laneq_p64(__rev0_379, __p1_379); \
-  __ret_379 = __builtin_shufflevector(__ret_379, __ret_379, 1, 0); \
-  __ret_379; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_p16(__p0_380, __p1_380) __extension__ ({ \
-  poly16x8_t __ret_380; \
-  poly16x8_t __s0_380 = __p0_380; \
-  __ret_380 = splatq_laneq_p16(__s0_380, __p1_380); \
-  __ret_380; \
-})
-#else
-#define vdupq_laneq_p16(__p0_381, __p1_381) __extension__ ({ \
-  poly16x8_t __ret_381; \
-  poly16x8_t __s0_381 = __p0_381; \
-  poly16x8_t __rev0_381;  __rev0_381 = __builtin_shufflevector(__s0_381, __s0_381, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_381 = __noswap_splatq_laneq_p16(__rev0_381, __p1_381); \
-  __ret_381 = __builtin_shufflevector(__ret_381, __ret_381, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_381; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_u8(__p0_382, __p1_382) __extension__ ({ \
-  uint8x16_t __ret_382; \
-  uint8x16_t __s0_382 = __p0_382; \
-  __ret_382 = splatq_laneq_u8(__s0_382, __p1_382); \
-  __ret_382; \
-})
-#else
-#define vdupq_laneq_u8(__p0_383, __p1_383) __extension__ ({ \
-  uint8x16_t __ret_383; \
-  uint8x16_t __s0_383 = __p0_383; \
-  uint8x16_t __rev0_383;  __rev0_383 = __builtin_shufflevector(__s0_383, __s0_383, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_383 = __noswap_splatq_laneq_u8(__rev0_383, __p1_383); \
-  __ret_383 = __builtin_shufflevector(__ret_383, __ret_383, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_383; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_u32(__p0_384, __p1_384) __extension__ ({ \
-  uint32x4_t __ret_384; \
-  uint32x4_t __s0_384 = __p0_384; \
-  __ret_384 = splatq_laneq_u32(__s0_384, __p1_384); \
-  __ret_384; \
-})
-#else
-#define vdupq_laneq_u32(__p0_385, __p1_385) __extension__ ({ \
-  uint32x4_t __ret_385; \
-  uint32x4_t __s0_385 = __p0_385; \
-  uint32x4_t __rev0_385;  __rev0_385 = __builtin_shufflevector(__s0_385, __s0_385, 3, 2, 1, 0); \
-  __ret_385 = __noswap_splatq_laneq_u32(__rev0_385, __p1_385); \
-  __ret_385 = __builtin_shufflevector(__ret_385, __ret_385, 3, 2, 1, 0); \
-  __ret_385; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_u64(__p0_386, __p1_386) __extension__ ({ \
-  uint64x2_t __ret_386; \
-  uint64x2_t __s0_386 = __p0_386; \
-  __ret_386 = splatq_laneq_u64(__s0_386, __p1_386); \
-  __ret_386; \
-})
-#else
-#define vdupq_laneq_u64(__p0_387, __p1_387) __extension__ ({ \
-  uint64x2_t __ret_387; \
-  uint64x2_t __s0_387 = __p0_387; \
-  uint64x2_t __rev0_387;  __rev0_387 = __builtin_shufflevector(__s0_387, __s0_387, 1, 0); \
-  __ret_387 = __noswap_splatq_laneq_u64(__rev0_387, __p1_387); \
-  __ret_387 = __builtin_shufflevector(__ret_387, __ret_387, 1, 0); \
-  __ret_387; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_u16(__p0_388, __p1_388) __extension__ ({ \
-  uint16x8_t __ret_388; \
-  uint16x8_t __s0_388 = __p0_388; \
-  __ret_388 = splatq_laneq_u16(__s0_388, __p1_388); \
-  __ret_388; \
-})
-#else
-#define vdupq_laneq_u16(__p0_389, __p1_389) __extension__ ({ \
-  uint16x8_t __ret_389; \
-  uint16x8_t __s0_389 = __p0_389; \
-  uint16x8_t __rev0_389;  __rev0_389 = __builtin_shufflevector(__s0_389, __s0_389, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_389 = __noswap_splatq_laneq_u16(__rev0_389, __p1_389); \
-  __ret_389 = __builtin_shufflevector(__ret_389, __ret_389, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_389; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_s8(__p0_390, __p1_390) __extension__ ({ \
-  int8x16_t __ret_390; \
-  int8x16_t __s0_390 = __p0_390; \
-  __ret_390 = splatq_laneq_s8(__s0_390, __p1_390); \
-  __ret_390; \
-})
-#else
-#define vdupq_laneq_s8(__p0_391, __p1_391) __extension__ ({ \
-  int8x16_t __ret_391; \
-  int8x16_t __s0_391 = __p0_391; \
-  int8x16_t __rev0_391;  __rev0_391 = __builtin_shufflevector(__s0_391, __s0_391, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_391 = __noswap_splatq_laneq_s8(__rev0_391, __p1_391); \
-  __ret_391 = __builtin_shufflevector(__ret_391, __ret_391, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_391; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_f64(__p0_392, __p1_392) __extension__ ({ \
-  float64x2_t __ret_392; \
-  float64x2_t __s0_392 = __p0_392; \
-  __ret_392 = splatq_laneq_f64(__s0_392, __p1_392); \
-  __ret_392; \
-})
-#else
-#define vdupq_laneq_f64(__p0_393, __p1_393) __extension__ ({ \
-  float64x2_t __ret_393; \
-  float64x2_t __s0_393 = __p0_393; \
-  float64x2_t __rev0_393;  __rev0_393 = __builtin_shufflevector(__s0_393, __s0_393, 1, 0); \
-  __ret_393 = __noswap_splatq_laneq_f64(__rev0_393, __p1_393); \
-  __ret_393 = __builtin_shufflevector(__ret_393, __ret_393, 1, 0); \
-  __ret_393; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_f32(__p0_394, __p1_394) __extension__ ({ \
-  float32x4_t __ret_394; \
-  float32x4_t __s0_394 = __p0_394; \
-  __ret_394 = splatq_laneq_f32(__s0_394, __p1_394); \
-  __ret_394; \
-})
-#else
-#define vdupq_laneq_f32(__p0_395, __p1_395) __extension__ ({ \
-  float32x4_t __ret_395; \
-  float32x4_t __s0_395 = __p0_395; \
-  float32x4_t __rev0_395;  __rev0_395 = __builtin_shufflevector(__s0_395, __s0_395, 3, 2, 1, 0); \
-  __ret_395 = __noswap_splatq_laneq_f32(__rev0_395, __p1_395); \
-  __ret_395 = __builtin_shufflevector(__ret_395, __ret_395, 3, 2, 1, 0); \
-  __ret_395; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_f16(__p0_396, __p1_396) __extension__ ({ \
-  float16x8_t __ret_396; \
-  float16x8_t __s0_396 = __p0_396; \
-  __ret_396 = splatq_laneq_f16(__s0_396, __p1_396); \
-  __ret_396; \
-})
-#else
-#define vdupq_laneq_f16(__p0_397, __p1_397) __extension__ ({ \
-  float16x8_t __ret_397; \
-  float16x8_t __s0_397 = __p0_397; \
-  float16x8_t __rev0_397;  __rev0_397 = __builtin_shufflevector(__s0_397, __s0_397, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_397 = __noswap_splatq_laneq_f16(__rev0_397, __p1_397); \
-  __ret_397 = __builtin_shufflevector(__ret_397, __ret_397, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_397; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_s32(__p0_398, __p1_398) __extension__ ({ \
-  int32x4_t __ret_398; \
-  int32x4_t __s0_398 = __p0_398; \
-  __ret_398 = splatq_laneq_s32(__s0_398, __p1_398); \
-  __ret_398; \
-})
-#else
-#define vdupq_laneq_s32(__p0_399, __p1_399) __extension__ ({ \
-  int32x4_t __ret_399; \
-  int32x4_t __s0_399 = __p0_399; \
-  int32x4_t __rev0_399;  __rev0_399 = __builtin_shufflevector(__s0_399, __s0_399, 3, 2, 1, 0); \
-  __ret_399 = __noswap_splatq_laneq_s32(__rev0_399, __p1_399); \
-  __ret_399 = __builtin_shufflevector(__ret_399, __ret_399, 3, 2, 1, 0); \
-  __ret_399; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_s64(__p0_400, __p1_400) __extension__ ({ \
-  int64x2_t __ret_400; \
-  int64x2_t __s0_400 = __p0_400; \
-  __ret_400 = splatq_laneq_s64(__s0_400, __p1_400); \
-  __ret_400; \
-})
-#else
-#define vdupq_laneq_s64(__p0_401, __p1_401) __extension__ ({ \
-  int64x2_t __ret_401; \
-  int64x2_t __s0_401 = __p0_401; \
-  int64x2_t __rev0_401;  __rev0_401 = __builtin_shufflevector(__s0_401, __s0_401, 1, 0); \
-  __ret_401 = __noswap_splatq_laneq_s64(__rev0_401, __p1_401); \
-  __ret_401 = __builtin_shufflevector(__ret_401, __ret_401, 1, 0); \
-  __ret_401; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdupq_laneq_s16(__p0_402, __p1_402) __extension__ ({ \
-  int16x8_t __ret_402; \
-  int16x8_t __s0_402 = __p0_402; \
-  __ret_402 = splatq_laneq_s16(__s0_402, __p1_402); \
-  __ret_402; \
-})
-#else
-#define vdupq_laneq_s16(__p0_403, __p1_403) __extension__ ({ \
-  int16x8_t __ret_403; \
-  int16x8_t __s0_403 = __p0_403; \
-  int16x8_t __rev0_403;  __rev0_403 = __builtin_shufflevector(__s0_403, __s0_403, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_403 = __noswap_splatq_laneq_s16(__rev0_403, __p1_403); \
-  __ret_403 = __builtin_shufflevector(__ret_403, __ret_403, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_403; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_u8(__p0_404, __p1_404) __extension__ ({ \
-  uint8x8_t __ret_404; \
-  uint8x16_t __s0_404 = __p0_404; \
-  __ret_404 = splat_laneq_u8(__s0_404, __p1_404); \
-  __ret_404; \
-})
-#else
-#define vdup_laneq_u8(__p0_405, __p1_405) __extension__ ({ \
-  uint8x8_t __ret_405; \
-  uint8x16_t __s0_405 = __p0_405; \
-  uint8x16_t __rev0_405;  __rev0_405 = __builtin_shufflevector(__s0_405, __s0_405, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_405 = __noswap_splat_laneq_u8(__rev0_405, __p1_405); \
-  __ret_405 = __builtin_shufflevector(__ret_405, __ret_405, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_405; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_u32(__p0_406, __p1_406) __extension__ ({ \
-  uint32x2_t __ret_406; \
-  uint32x4_t __s0_406 = __p0_406; \
-  __ret_406 = splat_laneq_u32(__s0_406, __p1_406); \
-  __ret_406; \
-})
-#else
-#define vdup_laneq_u32(__p0_407, __p1_407) __extension__ ({ \
-  uint32x2_t __ret_407; \
-  uint32x4_t __s0_407 = __p0_407; \
-  uint32x4_t __rev0_407;  __rev0_407 = __builtin_shufflevector(__s0_407, __s0_407, 3, 2, 1, 0); \
-  __ret_407 = __noswap_splat_laneq_u32(__rev0_407, __p1_407); \
-  __ret_407 = __builtin_shufflevector(__ret_407, __ret_407, 1, 0); \
-  __ret_407; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_u64(__p0_408, __p1_408) __extension__ ({ \
-  uint64x1_t __ret_408; \
-  uint64x2_t __s0_408 = __p0_408; \
-  __ret_408 = splat_laneq_u64(__s0_408, __p1_408); \
-  __ret_408; \
-})
-#else
-#define vdup_laneq_u64(__p0_409, __p1_409) __extension__ ({ \
-  uint64x1_t __ret_409; \
-  uint64x2_t __s0_409 = __p0_409; \
-  uint64x2_t __rev0_409;  __rev0_409 = __builtin_shufflevector(__s0_409, __s0_409, 1, 0); \
-  __ret_409 = __noswap_splat_laneq_u64(__rev0_409, __p1_409); \
-  __ret_409; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_u16(__p0_410, __p1_410) __extension__ ({ \
-  uint16x4_t __ret_410; \
-  uint16x8_t __s0_410 = __p0_410; \
-  __ret_410 = splat_laneq_u16(__s0_410, __p1_410); \
-  __ret_410; \
-})
-#else
-#define vdup_laneq_u16(__p0_411, __p1_411) __extension__ ({ \
-  uint16x4_t __ret_411; \
-  uint16x8_t __s0_411 = __p0_411; \
-  uint16x8_t __rev0_411;  __rev0_411 = __builtin_shufflevector(__s0_411, __s0_411, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_411 = __noswap_splat_laneq_u16(__rev0_411, __p1_411); \
-  __ret_411 = __builtin_shufflevector(__ret_411, __ret_411, 3, 2, 1, 0); \
-  __ret_411; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_s8(__p0_412, __p1_412) __extension__ ({ \
-  int8x8_t __ret_412; \
-  int8x16_t __s0_412 = __p0_412; \
-  __ret_412 = splat_laneq_s8(__s0_412, __p1_412); \
-  __ret_412; \
-})
-#else
-#define vdup_laneq_s8(__p0_413, __p1_413) __extension__ ({ \
-  int8x8_t __ret_413; \
-  int8x16_t __s0_413 = __p0_413; \
-  int8x16_t __rev0_413;  __rev0_413 = __builtin_shufflevector(__s0_413, __s0_413, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_413 = __noswap_splat_laneq_s8(__rev0_413, __p1_413); \
-  __ret_413 = __builtin_shufflevector(__ret_413, __ret_413, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_413; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_f64(__p0_414, __p1_414) __extension__ ({ \
-  float64x1_t __ret_414; \
-  float64x2_t __s0_414 = __p0_414; \
-  __ret_414 = splat_laneq_f64(__s0_414, __p1_414); \
-  __ret_414; \
-})
-#else
-#define vdup_laneq_f64(__p0_415, __p1_415) __extension__ ({ \
-  float64x1_t __ret_415; \
-  float64x2_t __s0_415 = __p0_415; \
-  float64x2_t __rev0_415;  __rev0_415 = __builtin_shufflevector(__s0_415, __s0_415, 1, 0); \
-  __ret_415 = __noswap_splat_laneq_f64(__rev0_415, __p1_415); \
-  __ret_415; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_f32(__p0_416, __p1_416) __extension__ ({ \
-  float32x2_t __ret_416; \
-  float32x4_t __s0_416 = __p0_416; \
-  __ret_416 = splat_laneq_f32(__s0_416, __p1_416); \
-  __ret_416; \
-})
-#else
-#define vdup_laneq_f32(__p0_417, __p1_417) __extension__ ({ \
-  float32x2_t __ret_417; \
-  float32x4_t __s0_417 = __p0_417; \
-  float32x4_t __rev0_417;  __rev0_417 = __builtin_shufflevector(__s0_417, __s0_417, 3, 2, 1, 0); \
-  __ret_417 = __noswap_splat_laneq_f32(__rev0_417, __p1_417); \
-  __ret_417 = __builtin_shufflevector(__ret_417, __ret_417, 1, 0); \
-  __ret_417; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_f16(__p0_418, __p1_418) __extension__ ({ \
-  float16x4_t __ret_418; \
-  float16x8_t __s0_418 = __p0_418; \
-  __ret_418 = splat_laneq_f16(__s0_418, __p1_418); \
-  __ret_418; \
-})
-#else
-#define vdup_laneq_f16(__p0_419, __p1_419) __extension__ ({ \
-  float16x4_t __ret_419; \
-  float16x8_t __s0_419 = __p0_419; \
-  float16x8_t __rev0_419;  __rev0_419 = __builtin_shufflevector(__s0_419, __s0_419, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_419 = __noswap_splat_laneq_f16(__rev0_419, __p1_419); \
-  __ret_419 = __builtin_shufflevector(__ret_419, __ret_419, 3, 2, 1, 0); \
-  __ret_419; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_s32(__p0_420, __p1_420) __extension__ ({ \
-  int32x2_t __ret_420; \
-  int32x4_t __s0_420 = __p0_420; \
-  __ret_420 = splat_laneq_s32(__s0_420, __p1_420); \
-  __ret_420; \
-})
-#else
-#define vdup_laneq_s32(__p0_421, __p1_421) __extension__ ({ \
-  int32x2_t __ret_421; \
-  int32x4_t __s0_421 = __p0_421; \
-  int32x4_t __rev0_421;  __rev0_421 = __builtin_shufflevector(__s0_421, __s0_421, 3, 2, 1, 0); \
-  __ret_421 = __noswap_splat_laneq_s32(__rev0_421, __p1_421); \
-  __ret_421 = __builtin_shufflevector(__ret_421, __ret_421, 1, 0); \
-  __ret_421; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_s64(__p0_422, __p1_422) __extension__ ({ \
-  int64x1_t __ret_422; \
-  int64x2_t __s0_422 = __p0_422; \
-  __ret_422 = splat_laneq_s64(__s0_422, __p1_422); \
-  __ret_422; \
-})
-#else
-#define vdup_laneq_s64(__p0_423, __p1_423) __extension__ ({ \
-  int64x1_t __ret_423; \
-  int64x2_t __s0_423 = __p0_423; \
-  int64x2_t __rev0_423;  __rev0_423 = __builtin_shufflevector(__s0_423, __s0_423, 1, 0); \
-  __ret_423 = __noswap_splat_laneq_s64(__rev0_423, __p1_423); \
-  __ret_423; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdup_laneq_s16(__p0_424, __p1_424) __extension__ ({ \
-  int16x4_t __ret_424; \
-  int16x8_t __s0_424 = __p0_424; \
-  __ret_424 = splat_laneq_s16(__s0_424, __p1_424); \
-  __ret_424; \
-})
-#else
-#define vdup_laneq_s16(__p0_425, __p1_425) __extension__ ({ \
-  int16x4_t __ret_425; \
-  int16x8_t __s0_425 = __p0_425; \
-  int16x8_t __rev0_425;  __rev0_425 = __builtin_shufflevector(__s0_425, __s0_425, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_425 = __noswap_splat_laneq_s16(__rev0_425, __p1_425); \
-  __ret_425 = __builtin_shufflevector(__ret_425, __ret_425, 3, 2, 1, 0); \
-  __ret_425; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) poly64x1_t vdup_n_p64(poly64_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t) {__p0};
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vdupq_n_p64(poly64_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vdupq_n_p64(poly64_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vdupq_n_f64(float64_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vdupq_n_f64(float64_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vdup_n_f64(float64_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) {__p0};
-  return __ret;
-}
-#define vext_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  poly64x1_t __s1 = __p1; \
-  __ret = (poly64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vextq_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  __ret = (poly64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
-  __ret; \
-})
-#else
-#define vextq_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (poly64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vextq_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  __ret = (float64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 42); \
-  __ret; \
-})
-#else
-#define vextq_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vext_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x1_t __s1 = __p1; \
-  __ret = (float64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t __noswap_vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
-  return __ret;
-}
-#define vfmad_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64_t __ret; \
-  float64_t __s0 = __p0; \
-  float64_t __s1 = __p1; \
-  float64x1_t __s2 = __p2; \
-  __ret = (float64_t) __builtin_neon_vfmad_lane_f64(__s0, __s1, (float64x1_t)__s2, __p3); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32_t __ret; \
-  float32_t __s0 = __p0; \
-  float32_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (float32x2_t)__s2, __p3); \
-  __ret; \
-})
-#else
-#define vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32_t __ret; \
-  float32_t __s0 = __p0; \
-  float32_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (float32x2_t)__rev2, __p3); \
-  __ret; \
-})
-#define __noswap_vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32_t __ret; \
-  float32_t __s0 = __p0; \
-  float32_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (float32x2_t)__s2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x1_t __s2 = __p2; \
-  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 42); \
-  __ret; \
-})
-#else
-#define vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x1_t __s2 = __p2; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__s2, __p3, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x1_t __s2 = __p2; \
-  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 42); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 41); \
-  __ret; \
-})
-#else
-#define vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, __p3, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 41); \
-  __ret; \
-})
-#endif
-
-#define vfma_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x1_t __s1 = __p1; \
-  float64x1_t __s2 = __p2; \
-  __ret = (float64x1_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 10); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 9); \
-  __ret; \
-})
-#else
-#define vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, __p3, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x2_t __s2 = __p2; \
-  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 9); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64_t __ret; \
-  float64_t __s0 = __p0; \
-  float64_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (float64x2_t)__s2, __p3); \
-  __ret; \
-})
-#else
-#define vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64_t __ret; \
-  float64_t __s0 = __p0; \
-  float64_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (float64x2_t)__rev2, __p3); \
-  __ret; \
-})
-#define __noswap_vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64_t __ret; \
-  float64_t __s0 = __p0; \
-  float64_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (float64x2_t)__s2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32_t __ret; \
-  float32_t __s0 = __p0; \
-  float32_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (float32x4_t)__s2, __p3); \
-  __ret; \
-})
-#else
-#define vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32_t __ret; \
-  float32_t __s0 = __p0; \
-  float32_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (float32x4_t)__rev2, __p3); \
-  __ret; \
-})
-#define __noswap_vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32_t __ret; \
-  float32_t __s0 = __p0; \
-  float32_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (float32x4_t)__s2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 42); \
-  __ret; \
-})
-#else
-#define vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 42); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 41); \
-  __ret; \
-})
-#else
-#define vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 41); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x4_t __ret; \
-  float32x4_t __s0 = __p0; \
-  float32x4_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 41); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x1_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 10); \
-  __ret; \
-})
-#else
-#define vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x1_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__rev2, __p3, 10); \
-  __ret; \
-})
-#define __noswap_vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x1_t __s1 = __p1; \
-  float64x2_t __s2 = __p2; \
-  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 10); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 9); \
-  __ret; \
-})
-#else
-#define vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x16_t)__rev2, __p3, 9); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  float32x2_t __ret; \
-  float32x2_t __s0 = __p0; \
-  float32x2_t __s1 = __p1; \
-  float32x4_t __s2 = __p2; \
-  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 9); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vfmaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
-  float64x2_t __ret;
-  __ret = vfmaq_f64(__p0, __p1, (float64x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vfmaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vfmaq_f64(__rev0, __rev1, (float64x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vfma_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) {
-  float64x1_t __ret;
-  __ret = vfma_f64(__p0, __p1, (float64x1_t) {__p2});
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = vfmaq_f64(__p0, -__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = vfma_f64(__p0, -__p1, __p2);
-  return __ret;
-}
-#define vfmsd_lane_f64(__p0_426, __p1_426, __p2_426, __p3_426) __extension__ ({ \
-  float64_t __ret_426; \
-  float64_t __s0_426 = __p0_426; \
-  float64_t __s1_426 = __p1_426; \
-  float64x1_t __s2_426 = __p2_426; \
-  __ret_426 = vfmad_lane_f64(__s0_426, -__s1_426, __s2_426, __p3_426); \
-  __ret_426; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vfmss_lane_f32(__p0_427, __p1_427, __p2_427, __p3_427) __extension__ ({ \
-  float32_t __ret_427; \
-  float32_t __s0_427 = __p0_427; \
-  float32_t __s1_427 = __p1_427; \
-  float32x2_t __s2_427 = __p2_427; \
-  __ret_427 = vfmas_lane_f32(__s0_427, -__s1_427, __s2_427, __p3_427); \
-  __ret_427; \
-})
-#else
-#define vfmss_lane_f32(__p0_428, __p1_428, __p2_428, __p3_428) __extension__ ({ \
-  float32_t __ret_428; \
-  float32_t __s0_428 = __p0_428; \
-  float32_t __s1_428 = __p1_428; \
-  float32x2_t __s2_428 = __p2_428; \
-  float32x2_t __rev2_428;  __rev2_428 = __builtin_shufflevector(__s2_428, __s2_428, 1, 0); \
-  __ret_428 = __noswap_vfmas_lane_f32(__s0_428, -__s1_428, __rev2_428, __p3_428); \
-  __ret_428; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_lane_f64(__p0_429, __p1_429, __p2_429, __p3_429) __extension__ ({ \
-  float64x2_t __ret_429; \
-  float64x2_t __s0_429 = __p0_429; \
-  float64x2_t __s1_429 = __p1_429; \
-  float64x1_t __s2_429 = __p2_429; \
-  __ret_429 = vfmaq_lane_f64(__s0_429, -__s1_429, __s2_429, __p3_429); \
-  __ret_429; \
-})
-#else
-#define vfmsq_lane_f64(__p0_430, __p1_430, __p2_430, __p3_430) __extension__ ({ \
-  float64x2_t __ret_430; \
-  float64x2_t __s0_430 = __p0_430; \
-  float64x2_t __s1_430 = __p1_430; \
-  float64x1_t __s2_430 = __p2_430; \
-  float64x2_t __rev0_430;  __rev0_430 = __builtin_shufflevector(__s0_430, __s0_430, 1, 0); \
-  float64x2_t __rev1_430;  __rev1_430 = __builtin_shufflevector(__s1_430, __s1_430, 1, 0); \
-  __ret_430 = __noswap_vfmaq_lane_f64(__rev0_430, -__rev1_430, __s2_430, __p3_430); \
-  __ret_430 = __builtin_shufflevector(__ret_430, __ret_430, 1, 0); \
-  __ret_430; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_lane_f32(__p0_431, __p1_431, __p2_431, __p3_431) __extension__ ({ \
-  float32x4_t __ret_431; \
-  float32x4_t __s0_431 = __p0_431; \
-  float32x4_t __s1_431 = __p1_431; \
-  float32x2_t __s2_431 = __p2_431; \
-  __ret_431 = vfmaq_lane_f32(__s0_431, -__s1_431, __s2_431, __p3_431); \
-  __ret_431; \
-})
-#else
-#define vfmsq_lane_f32(__p0_432, __p1_432, __p2_432, __p3_432) __extension__ ({ \
-  float32x4_t __ret_432; \
-  float32x4_t __s0_432 = __p0_432; \
-  float32x4_t __s1_432 = __p1_432; \
-  float32x2_t __s2_432 = __p2_432; \
-  float32x4_t __rev0_432;  __rev0_432 = __builtin_shufflevector(__s0_432, __s0_432, 3, 2, 1, 0); \
-  float32x4_t __rev1_432;  __rev1_432 = __builtin_shufflevector(__s1_432, __s1_432, 3, 2, 1, 0); \
-  float32x2_t __rev2_432;  __rev2_432 = __builtin_shufflevector(__s2_432, __s2_432, 1, 0); \
-  __ret_432 = __noswap_vfmaq_lane_f32(__rev0_432, -__rev1_432, __rev2_432, __p3_432); \
-  __ret_432 = __builtin_shufflevector(__ret_432, __ret_432, 3, 2, 1, 0); \
-  __ret_432; \
-})
-#endif
-
-#define vfms_lane_f64(__p0_433, __p1_433, __p2_433, __p3_433) __extension__ ({ \
-  float64x1_t __ret_433; \
-  float64x1_t __s0_433 = __p0_433; \
-  float64x1_t __s1_433 = __p1_433; \
-  float64x1_t __s2_433 = __p2_433; \
-  __ret_433 = vfma_lane_f64(__s0_433, -__s1_433, __s2_433, __p3_433); \
-  __ret_433; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vfms_lane_f32(__p0_434, __p1_434, __p2_434, __p3_434) __extension__ ({ \
-  float32x2_t __ret_434; \
-  float32x2_t __s0_434 = __p0_434; \
-  float32x2_t __s1_434 = __p1_434; \
-  float32x2_t __s2_434 = __p2_434; \
-  __ret_434 = vfma_lane_f32(__s0_434, -__s1_434, __s2_434, __p3_434); \
-  __ret_434; \
-})
-#else
-#define vfms_lane_f32(__p0_435, __p1_435, __p2_435, __p3_435) __extension__ ({ \
-  float32x2_t __ret_435; \
-  float32x2_t __s0_435 = __p0_435; \
-  float32x2_t __s1_435 = __p1_435; \
-  float32x2_t __s2_435 = __p2_435; \
-  float32x2_t __rev0_435;  __rev0_435 = __builtin_shufflevector(__s0_435, __s0_435, 1, 0); \
-  float32x2_t __rev1_435;  __rev1_435 = __builtin_shufflevector(__s1_435, __s1_435, 1, 0); \
-  float32x2_t __rev2_435;  __rev2_435 = __builtin_shufflevector(__s2_435, __s2_435, 1, 0); \
-  __ret_435 = __noswap_vfma_lane_f32(__rev0_435, -__rev1_435, __rev2_435, __p3_435); \
-  __ret_435 = __builtin_shufflevector(__ret_435, __ret_435, 1, 0); \
-  __ret_435; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsd_laneq_f64(__p0_436, __p1_436, __p2_436, __p3_436) __extension__ ({ \
-  float64_t __ret_436; \
-  float64_t __s0_436 = __p0_436; \
-  float64_t __s1_436 = __p1_436; \
-  float64x2_t __s2_436 = __p2_436; \
-  __ret_436 = vfmad_laneq_f64(__s0_436, -__s1_436, __s2_436, __p3_436); \
-  __ret_436; \
-})
-#else
-#define vfmsd_laneq_f64(__p0_437, __p1_437, __p2_437, __p3_437) __extension__ ({ \
-  float64_t __ret_437; \
-  float64_t __s0_437 = __p0_437; \
-  float64_t __s1_437 = __p1_437; \
-  float64x2_t __s2_437 = __p2_437; \
-  float64x2_t __rev2_437;  __rev2_437 = __builtin_shufflevector(__s2_437, __s2_437, 1, 0); \
-  __ret_437 = __noswap_vfmad_laneq_f64(__s0_437, -__s1_437, __rev2_437, __p3_437); \
-  __ret_437; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmss_laneq_f32(__p0_438, __p1_438, __p2_438, __p3_438) __extension__ ({ \
-  float32_t __ret_438; \
-  float32_t __s0_438 = __p0_438; \
-  float32_t __s1_438 = __p1_438; \
-  float32x4_t __s2_438 = __p2_438; \
-  __ret_438 = vfmas_laneq_f32(__s0_438, -__s1_438, __s2_438, __p3_438); \
-  __ret_438; \
-})
-#else
-#define vfmss_laneq_f32(__p0_439, __p1_439, __p2_439, __p3_439) __extension__ ({ \
-  float32_t __ret_439; \
-  float32_t __s0_439 = __p0_439; \
-  float32_t __s1_439 = __p1_439; \
-  float32x4_t __s2_439 = __p2_439; \
-  float32x4_t __rev2_439;  __rev2_439 = __builtin_shufflevector(__s2_439, __s2_439, 3, 2, 1, 0); \
-  __ret_439 = __noswap_vfmas_laneq_f32(__s0_439, -__s1_439, __rev2_439, __p3_439); \
-  __ret_439; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_laneq_f64(__p0_440, __p1_440, __p2_440, __p3_440) __extension__ ({ \
-  float64x2_t __ret_440; \
-  float64x2_t __s0_440 = __p0_440; \
-  float64x2_t __s1_440 = __p1_440; \
-  float64x2_t __s2_440 = __p2_440; \
-  __ret_440 = vfmaq_laneq_f64(__s0_440, -__s1_440, __s2_440, __p3_440); \
-  __ret_440; \
-})
-#else
-#define vfmsq_laneq_f64(__p0_441, __p1_441, __p2_441, __p3_441) __extension__ ({ \
-  float64x2_t __ret_441; \
-  float64x2_t __s0_441 = __p0_441; \
-  float64x2_t __s1_441 = __p1_441; \
-  float64x2_t __s2_441 = __p2_441; \
-  float64x2_t __rev0_441;  __rev0_441 = __builtin_shufflevector(__s0_441, __s0_441, 1, 0); \
-  float64x2_t __rev1_441;  __rev1_441 = __builtin_shufflevector(__s1_441, __s1_441, 1, 0); \
-  float64x2_t __rev2_441;  __rev2_441 = __builtin_shufflevector(__s2_441, __s2_441, 1, 0); \
-  __ret_441 = __noswap_vfmaq_laneq_f64(__rev0_441, -__rev1_441, __rev2_441, __p3_441); \
-  __ret_441 = __builtin_shufflevector(__ret_441, __ret_441, 1, 0); \
-  __ret_441; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmsq_laneq_f32(__p0_442, __p1_442, __p2_442, __p3_442) __extension__ ({ \
-  float32x4_t __ret_442; \
-  float32x4_t __s0_442 = __p0_442; \
-  float32x4_t __s1_442 = __p1_442; \
-  float32x4_t __s2_442 = __p2_442; \
-  __ret_442 = vfmaq_laneq_f32(__s0_442, -__s1_442, __s2_442, __p3_442); \
-  __ret_442; \
-})
-#else
-#define vfmsq_laneq_f32(__p0_443, __p1_443, __p2_443, __p3_443) __extension__ ({ \
-  float32x4_t __ret_443; \
-  float32x4_t __s0_443 = __p0_443; \
-  float32x4_t __s1_443 = __p1_443; \
-  float32x4_t __s2_443 = __p2_443; \
-  float32x4_t __rev0_443;  __rev0_443 = __builtin_shufflevector(__s0_443, __s0_443, 3, 2, 1, 0); \
-  float32x4_t __rev1_443;  __rev1_443 = __builtin_shufflevector(__s1_443, __s1_443, 3, 2, 1, 0); \
-  float32x4_t __rev2_443;  __rev2_443 = __builtin_shufflevector(__s2_443, __s2_443, 3, 2, 1, 0); \
-  __ret_443 = __noswap_vfmaq_laneq_f32(__rev0_443, -__rev1_443, __rev2_443, __p3_443); \
-  __ret_443 = __builtin_shufflevector(__ret_443, __ret_443, 3, 2, 1, 0); \
-  __ret_443; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfms_laneq_f64(__p0_444, __p1_444, __p2_444, __p3_444) __extension__ ({ \
-  float64x1_t __ret_444; \
-  float64x1_t __s0_444 = __p0_444; \
-  float64x1_t __s1_444 = __p1_444; \
-  float64x2_t __s2_444 = __p2_444; \
-  __ret_444 = vfma_laneq_f64(__s0_444, -__s1_444, __s2_444, __p3_444); \
-  __ret_444; \
-})
-#else
-#define vfms_laneq_f64(__p0_445, __p1_445, __p2_445, __p3_445) __extension__ ({ \
-  float64x1_t __ret_445; \
-  float64x1_t __s0_445 = __p0_445; \
-  float64x1_t __s1_445 = __p1_445; \
-  float64x2_t __s2_445 = __p2_445; \
-  float64x2_t __rev2_445;  __rev2_445 = __builtin_shufflevector(__s2_445, __s2_445, 1, 0); \
-  __ret_445 = __noswap_vfma_laneq_f64(__s0_445, -__s1_445, __rev2_445, __p3_445); \
-  __ret_445; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfms_laneq_f32(__p0_446, __p1_446, __p2_446, __p3_446) __extension__ ({ \
-  float32x2_t __ret_446; \
-  float32x2_t __s0_446 = __p0_446; \
-  float32x2_t __s1_446 = __p1_446; \
-  float32x4_t __s2_446 = __p2_446; \
-  __ret_446 = vfma_laneq_f32(__s0_446, -__s1_446, __s2_446, __p3_446); \
-  __ret_446; \
-})
-#else
-#define vfms_laneq_f32(__p0_447, __p1_447, __p2_447, __p3_447) __extension__ ({ \
-  float32x2_t __ret_447; \
-  float32x2_t __s0_447 = __p0_447; \
-  float32x2_t __s1_447 = __p1_447; \
-  float32x4_t __s2_447 = __p2_447; \
-  float32x2_t __rev0_447;  __rev0_447 = __builtin_shufflevector(__s0_447, __s0_447, 1, 0); \
-  float32x2_t __rev1_447;  __rev1_447 = __builtin_shufflevector(__s1_447, __s1_447, 1, 0); \
-  float32x4_t __rev2_447;  __rev2_447 = __builtin_shufflevector(__s2_447, __s2_447, 3, 2, 1, 0); \
-  __ret_447 = __noswap_vfma_laneq_f32(__rev0_447, -__rev1_447, __rev2_447, __p3_447); \
-  __ret_447 = __builtin_shufflevector(__ret_447, __ret_447, 1, 0); \
-  __ret_447; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
-  float64x2_t __ret;
-  __ret = vfmaq_f64(__p0, -__p1, (float64x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, (float64x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  __ret = vfmaq_f32(__p0, -__p1, (float32x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vfms_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) {
-  float64x1_t __ret;
-  __ret = vfma_f64(__p0, -__p1, (float64x1_t) {__p2});
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  __ret = vfma_f32(__p0, -__p1, (float32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vfma_f32(__rev0, -__rev1, (float32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x1_t vget_high_p64(poly64x2_t __p0) {
-  poly64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x1_t vget_high_p64(poly64x2_t __p0) {
-  poly64x1_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t __noswap_vget_high_p64(poly64x2_t __p0) {
-  poly64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x1_t vget_high_f64(float64x2_t __p0) {
-  float64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x1_t vget_high_f64(float64x2_t __p0) {
-  float64x1_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
-  return __ret;
-}
-#endif
-
-#define vget_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  __ret = (poly64_t) __builtin_neon_vget_lane_i64((poly64x1_t)__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((poly64x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((poly64x2_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_p64(__p0, __p1) __extension__ ({ \
-  poly64_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((poly64x2_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_f64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((float64x2_t)__s0, __p1); \
-  __ret; \
-})
-#else
-#define vgetq_lane_f64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  float64x2_t __s0 = __p0; \
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((float64x2_t)__rev0, __p1); \
-  __ret; \
-})
-#define __noswap_vgetq_lane_f64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  float64x2_t __s0 = __p0; \
-  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((float64x2_t)__s0, __p1); \
-  __ret; \
-})
-#endif
-
-#define vget_lane_f64(__p0, __p1) __extension__ ({ \
-  float64_t __ret; \
-  float64x1_t __s0 = __p0; \
-  __ret = (float64_t) __builtin_neon_vget_lane_f64((float64x1_t)__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x1_t vget_low_p64(poly64x2_t __p0) {
-  poly64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x1_t vget_low_p64(poly64x2_t __p0) {
-  poly64x1_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x1_t vget_low_f64(float64x2_t __p0) {
-  float64x1_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x1_t vget_low_f64(float64x2_t __p0) {
-  float64x1_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
-  return __ret;
-}
-#endif
-
-#define vld1_p64(__p0) __extension__ ({ \
-  poly64x1_t __ret; \
-  __ret = (poly64x1_t) __builtin_neon_vld1_v(__p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p64(__p0) __extension__ ({ \
-  poly64x2_t __ret; \
-  __ret = (poly64x2_t) __builtin_neon_vld1q_v(__p0, 38); \
-  __ret; \
-})
-#else
-#define vld1q_p64(__p0) __extension__ ({ \
-  poly64x2_t __ret; \
-  __ret = (poly64x2_t) __builtin_neon_vld1q_v(__p0, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f64(__p0) __extension__ ({ \
-  float64x2_t __ret; \
-  __ret = (float64x2_t) __builtin_neon_vld1q_v(__p0, 42); \
-  __ret; \
-})
-#else
-#define vld1q_f64(__p0) __extension__ ({ \
-  float64x2_t __ret; \
-  __ret = (float64x2_t) __builtin_neon_vld1q_v(__p0, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_f64(__p0) __extension__ ({ \
-  float64x1_t __ret; \
-  __ret = (float64x1_t) __builtin_neon_vld1_v(__p0, 10); \
-  __ret; \
-})
-#define vld1_dup_p64(__p0) __extension__ ({ \
-  poly64x1_t __ret; \
-  __ret = (poly64x1_t) __builtin_neon_vld1_dup_v(__p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_p64(__p0) __extension__ ({ \
-  poly64x2_t __ret; \
-  __ret = (poly64x2_t) __builtin_neon_vld1q_dup_v(__p0, 38); \
-  __ret; \
-})
-#else
-#define vld1q_dup_p64(__p0) __extension__ ({ \
-  poly64x2_t __ret; \
-  __ret = (poly64x2_t) __builtin_neon_vld1q_dup_v(__p0, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_dup_f64(__p0) __extension__ ({ \
-  float64x2_t __ret; \
-  __ret = (float64x2_t) __builtin_neon_vld1q_dup_v(__p0, 42); \
-  __ret; \
-})
-#else
-#define vld1q_dup_f64(__p0) __extension__ ({ \
-  float64x2_t __ret; \
-  __ret = (float64x2_t) __builtin_neon_vld1q_dup_v(__p0, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_dup_f64(__p0) __extension__ ({ \
-  float64x1_t __ret; \
-  __ret = (float64x1_t) __builtin_neon_vld1_dup_v(__p0, 10); \
-  __ret; \
-})
-#define vld1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x1_t __s1 = __p1; \
-  __ret = (poly64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s1 = __p1; \
-  __ret = (poly64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 38); \
-  __ret; \
-})
-#else
-#define vld1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (poly64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s1 = __p1; \
-  __ret = (float64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 42); \
-  __ret; \
-})
-#else
-#define vld1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s1 = __p1; \
-  __ret = (float64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
-  __ret; \
-})
-#define vld1_p64_x2(__p0) __extension__ ({ \
-  poly64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p64_x2(__p0) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld1q_p64_x2(__p0) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f64_x2(__p0) __extension__ ({ \
-  float64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld1q_f64_x2(__p0) __extension__ ({ \
-  float64x2x2_t __ret; \
-  __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_f64_x2(__p0) __extension__ ({ \
-  float64x1x2_t __ret; \
-  __builtin_neon_vld1_x2_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld1_p64_x3(__p0) __extension__ ({ \
-  poly64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p64_x3(__p0) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld1q_p64_x3(__p0) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f64_x3(__p0) __extension__ ({ \
-  float64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld1q_f64_x3(__p0) __extension__ ({ \
-  float64x2x3_t __ret; \
-  __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_f64_x3(__p0) __extension__ ({ \
-  float64x1x3_t __ret; \
-  __builtin_neon_vld1_x3_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld1_p64_x4(__p0) __extension__ ({ \
-  poly64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_p64_x4(__p0) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld1q_p64_x4(__p0) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld1q_f64_x4(__p0) __extension__ ({ \
-  float64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld1q_f64_x4(__p0) __extension__ ({ \
-  float64x2x4_t __ret; \
-  __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld1_f64_x4(__p0) __extension__ ({ \
-  float64x1x4_t __ret; \
-  __builtin_neon_vld1_x4_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld2_p64(__p0) __extension__ ({ \
-  poly64x1x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_p64(__p0) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld2q_p64(__p0) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_u64(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld2q_u64(__p0) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_f64(__p0) __extension__ ({ \
-  float64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld2q_f64(__p0) __extension__ ({ \
-  float64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_s64(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld2q_s64(__p0) __extension__ ({ \
-  int64x2x2_t __ret; \
-  __builtin_neon_vld2q_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld2_f64(__p0) __extension__ ({ \
-  float64x1x2_t __ret; \
-  __builtin_neon_vld2_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld2_dup_p64(__p0) __extension__ ({ \
-  poly64x1x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_p64(__p0) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld2q_dup_p64(__p0) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_dup_f64(__p0) __extension__ ({ \
-  float64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld2q_dup_f64(__p0) __extension__ ({ \
-  float64x2x2_t __ret; \
-  __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld2_dup_f64(__p0) __extension__ ({ \
-  float64x1x2_t __ret; \
-  __builtin_neon_vld2_dup_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1x2_t __ret; \
-  poly64x1x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  poly8x16x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 36); \
-  __ret; \
-})
-#else
-#define vld2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x2_t __ret; \
-  poly8x16x2_t __s1 = __p1; \
-  poly8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  poly64x2x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 38); \
-  __ret; \
-})
-#else
-#define vld2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x2_t __ret; \
-  poly64x2x2_t __s1 = __p1; \
-  poly64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  uint8x16x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 48); \
-  __ret; \
-})
-#else
-#define vld2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x2_t __ret; \
-  uint8x16x2_t __s1 = __p1; \
-  uint8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  uint64x2x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 51); \
-  __ret; \
-})
-#else
-#define vld2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x2_t __ret; \
-  uint64x2x2_t __s1 = __p1; \
-  uint64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x2_t __ret; \
-  int8x16x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 32); \
-  __ret; \
-})
-#else
-#define vld2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x2_t __ret; \
-  int8x16x2_t __s1 = __p1; \
-  int8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x2_t __ret; \
-  float64x2x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 42); \
-  __ret; \
-})
-#else
-#define vld2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x2_t __ret; \
-  float64x2x2_t __s1 = __p1; \
-  float64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x2_t __ret; \
-  int64x2x2_t __s1 = __p1; \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 35); \
-  __ret; \
-})
-#else
-#define vld2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x2_t __ret; \
-  int64x2x2_t __s1 = __p1; \
-  int64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1x2_t __ret; \
-  uint64x1x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
-  __ret; \
-})
-#define vld2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1x2_t __ret; \
-  float64x1x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 10); \
-  __ret; \
-})
-#define vld2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1x2_t __ret; \
-  int64x1x2_t __s1 = __p1; \
-  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 3); \
-  __ret; \
-})
-#define vld3_p64(__p0) __extension__ ({ \
-  poly64x1x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_p64(__p0) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld3q_p64(__p0) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_u64(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld3q_u64(__p0) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_f64(__p0) __extension__ ({ \
-  float64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld3q_f64(__p0) __extension__ ({ \
-  float64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_s64(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld3q_s64(__p0) __extension__ ({ \
-  int64x2x3_t __ret; \
-  __builtin_neon_vld3q_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld3_f64(__p0) __extension__ ({ \
-  float64x1x3_t __ret; \
-  __builtin_neon_vld3_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld3_dup_p64(__p0) __extension__ ({ \
-  poly64x1x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_p64(__p0) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld3q_dup_p64(__p0) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_dup_f64(__p0) __extension__ ({ \
-  float64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld3q_dup_f64(__p0) __extension__ ({ \
-  float64x2x3_t __ret; \
-  __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld3_dup_f64(__p0) __extension__ ({ \
-  float64x1x3_t __ret; \
-  __builtin_neon_vld3_dup_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1x3_t __ret; \
-  poly64x1x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  poly8x16x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 36); \
-  __ret; \
-})
-#else
-#define vld3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x3_t __ret; \
-  poly8x16x3_t __s1 = __p1; \
-  poly8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  poly64x2x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 38); \
-  __ret; \
-})
-#else
-#define vld3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x3_t __ret; \
-  poly64x2x3_t __s1 = __p1; \
-  poly64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  uint8x16x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 48); \
-  __ret; \
-})
-#else
-#define vld3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x3_t __ret; \
-  uint8x16x3_t __s1 = __p1; \
-  uint8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  uint64x2x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 51); \
-  __ret; \
-})
-#else
-#define vld3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x3_t __ret; \
-  uint64x2x3_t __s1 = __p1; \
-  uint64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x3_t __ret; \
-  int8x16x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 32); \
-  __ret; \
-})
-#else
-#define vld3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x3_t __ret; \
-  int8x16x3_t __s1 = __p1; \
-  int8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x3_t __ret; \
-  float64x2x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 42); \
-  __ret; \
-})
-#else
-#define vld3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x3_t __ret; \
-  float64x2x3_t __s1 = __p1; \
-  float64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x3_t __ret; \
-  int64x2x3_t __s1 = __p1; \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 35); \
-  __ret; \
-})
-#else
-#define vld3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x3_t __ret; \
-  int64x2x3_t __s1 = __p1; \
-  int64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1x3_t __ret; \
-  uint64x1x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
-  __ret; \
-})
-#define vld3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1x3_t __ret; \
-  float64x1x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 10); \
-  __ret; \
-})
-#define vld3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1x3_t __ret; \
-  int64x1x3_t __s1 = __p1; \
-  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 3); \
-  __ret; \
-})
-#define vld4_p64(__p0) __extension__ ({ \
-  poly64x1x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_p64(__p0) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld4q_p64(__p0) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_u64(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 51); \
-  __ret; \
-})
-#else
-#define vld4q_u64(__p0) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_f64(__p0) __extension__ ({ \
-  float64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld4q_f64(__p0) __extension__ ({ \
-  float64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_s64(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 35); \
-  __ret; \
-})
-#else
-#define vld4q_s64(__p0) __extension__ ({ \
-  int64x2x4_t __ret; \
-  __builtin_neon_vld4q_v(&__ret, __p0, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld4_f64(__p0) __extension__ ({ \
-  float64x1x4_t __ret; \
-  __builtin_neon_vld4_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld4_dup_p64(__p0) __extension__ ({ \
-  poly64x1x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_p64(__p0) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \
-  __ret; \
-})
-#else
-#define vld4q_dup_p64(__p0) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_dup_f64(__p0) __extension__ ({ \
-  float64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \
-  __ret; \
-})
-#else
-#define vld4q_dup_f64(__p0) __extension__ ({ \
-  float64x2x4_t __ret; \
-  __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld4_dup_f64(__p0) __extension__ ({ \
-  float64x1x4_t __ret; \
-  __builtin_neon_vld4_dup_v(&__ret, __p0, 10); \
-  __ret; \
-})
-#define vld4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1x4_t __ret; \
-  poly64x1x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  poly8x16x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 36); \
-  __ret; \
-})
-#else
-#define vld4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x4_t __ret; \
-  poly8x16x4_t __s1 = __p1; \
-  poly8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 36); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  poly64x2x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 38); \
-  __ret; \
-})
-#else
-#define vld4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x4_t __ret; \
-  poly64x2x4_t __s1 = __p1; \
-  poly64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 38); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  uint8x16x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 48); \
-  __ret; \
-})
-#else
-#define vld4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x4_t __ret; \
-  uint8x16x4_t __s1 = __p1; \
-  uint8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 48); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  uint64x2x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 51); \
-  __ret; \
-})
-#else
-#define vld4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x4_t __ret; \
-  uint64x2x4_t __s1 = __p1; \
-  uint64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 51); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x4_t __ret; \
-  int8x16x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 32); \
-  __ret; \
-})
-#else
-#define vld4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x4_t __ret; \
-  int8x16x4_t __s1 = __p1; \
-  int8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 32); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x4_t __ret; \
-  float64x2x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 42); \
-  __ret; \
-})
-#else
-#define vld4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x4_t __ret; \
-  float64x2x4_t __s1 = __p1; \
-  float64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 42); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vld4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x4_t __ret; \
-  int64x2x4_t __s1 = __p1; \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 35); \
-  __ret; \
-})
-#else
-#define vld4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x4_t __ret; \
-  int64x2x4_t __s1 = __p1; \
-  int64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 35); \
- \
-  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
-  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
-  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
-  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vld4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1x4_t __ret; \
-  uint64x1x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
-  __ret; \
-})
-#define vld4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1x4_t __ret; \
-  float64x1x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 10); \
-  __ret; \
-})
-#define vld4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1x4_t __ret; \
-  int64x1x4_t __s1 = __p1; \
-  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 3); \
-  __ret; \
-})
-#define vldrq_p128(__p0) __extension__ ({ \
-  poly128_t __ret; \
-  __ret = (poly128_t) __builtin_neon_vldrq_p128(__p0); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmax_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vmaxnmvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vmaxnmvq_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vmaxnmvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vmaxnmvq_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vmaxnmvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vmaxnmvq_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vmaxnmvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32_t) __builtin_neon_vmaxnmvq_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vmaxnmv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vmaxnmv_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vmaxnmv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vmaxnmv_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8_t vmaxvq_u8(uint8x16_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vmaxvq_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8_t vmaxvq_u8(uint8x16_t __p0) {
-  uint8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8_t) __builtin_neon_vmaxvq_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vmaxvq_u32(uint32x4_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vmaxvq_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vmaxvq_u32(uint32x4_t __p0) {
-  uint32_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vmaxvq_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vmaxvq_u16(uint16x8_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vmaxvq_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vmaxvq_u16(uint16x8_t __p0) {
-  uint16_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vmaxvq_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8_t vmaxvq_s8(int8x16_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vmaxvq_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8_t vmaxvq_s8(int8x16_t __p0) {
-  int8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8_t) __builtin_neon_vmaxvq_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vmaxvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vmaxvq_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vmaxvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vmaxvq_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vmaxvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vmaxvq_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vmaxvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32_t) __builtin_neon_vmaxvq_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vmaxvq_s32(int32x4_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vmaxvq_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vmaxvq_s32(int32x4_t __p0) {
-  int32_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32_t) __builtin_neon_vmaxvq_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vmaxvq_s16(int16x8_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vmaxvq_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vmaxvq_s16(int16x8_t __p0) {
-  int16_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vmaxvq_s16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8_t vmaxv_u8(uint8x8_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vmaxv_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8_t vmaxv_u8(uint8x8_t __p0) {
-  uint8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8_t) __builtin_neon_vmaxv_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vmaxv_u32(uint32x2_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vmaxv_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vmaxv_u32(uint32x2_t __p0) {
-  uint32_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vmaxv_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vmaxv_u16(uint16x4_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vmaxv_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vmaxv_u16(uint16x4_t __p0) {
-  uint16_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vmaxv_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8_t vmaxv_s8(int8x8_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vmaxv_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8_t vmaxv_s8(int8x8_t __p0) {
-  int8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8_t) __builtin_neon_vmaxv_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vmaxv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vmaxv_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vmaxv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vmaxv_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vmaxv_s32(int32x2_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vmaxv_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vmaxv_s32(int32x2_t __p0) {
-  int32_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32_t) __builtin_neon_vmaxv_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vmaxv_s16(int16x4_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vmaxv_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vmaxv_s16(int16x4_t __p0) {
-  int16_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vmaxv_s16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vminq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vminq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmin_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vminnmvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vminnmvq_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vminnmvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vminnmvq_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vminnmvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vminnmvq_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vminnmvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32_t) __builtin_neon_vminnmvq_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vminnmv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vminnmv_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vminnmv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vminnmv_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8_t vminvq_u8(uint8x16_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vminvq_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8_t vminvq_u8(uint8x16_t __p0) {
-  uint8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8_t) __builtin_neon_vminvq_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vminvq_u32(uint32x4_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vminvq_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vminvq_u32(uint32x4_t __p0) {
-  uint32_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vminvq_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vminvq_u16(uint16x8_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vminvq_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vminvq_u16(uint16x8_t __p0) {
-  uint16_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vminvq_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8_t vminvq_s8(int8x16_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vminvq_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8_t vminvq_s8(int8x16_t __p0) {
-  int8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8_t) __builtin_neon_vminvq_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vminvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vminvq_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vminvq_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vminvq_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vminvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vminvq_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vminvq_f32(float32x4_t __p0) {
-  float32_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32_t) __builtin_neon_vminvq_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vminvq_s32(int32x4_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vminvq_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vminvq_s32(int32x4_t __p0) {
-  int32_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int32_t) __builtin_neon_vminvq_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vminvq_s16(int16x8_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vminvq_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vminvq_s16(int16x8_t __p0) {
-  int16_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vminvq_s16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8_t vminv_u8(uint8x8_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vminv_u8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8_t vminv_u8(uint8x8_t __p0) {
-  uint8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8_t) __builtin_neon_vminv_u8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32_t vminv_u32(uint32x2_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vminv_u32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32_t vminv_u32(uint32x2_t __p0) {
-  uint32_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint32_t) __builtin_neon_vminv_u32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16_t vminv_u16(uint16x4_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vminv_u16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16_t vminv_u16(uint16x4_t __p0) {
-  uint16_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (uint16_t) __builtin_neon_vminv_u16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8_t vminv_s8(int8x8_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vminv_s8(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8_t vminv_s8(int8x8_t __p0) {
-  int8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8_t) __builtin_neon_vminv_s8(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vminv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vminv_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vminv_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vminv_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32_t vminv_s32(int32x2_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vminv_s32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32_t vminv_s32(int32x2_t __p0) {
-  int32_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int32_t) __builtin_neon_vminv_s32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16_t vminv_s16(int16x4_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vminv_s16(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16_t vminv_s16(int16x4_t __p0) {
-  int16_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (int16_t) __builtin_neon_vminv_s16(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = __p0 + __p1 * __p2;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_laneq_u32(__p0_448, __p1_448, __p2_448, __p3_448) __extension__ ({ \
-  uint32x4_t __ret_448; \
-  uint32x4_t __s0_448 = __p0_448; \
-  uint32x4_t __s1_448 = __p1_448; \
-  uint32x4_t __s2_448 = __p2_448; \
-  __ret_448 = __s0_448 + __s1_448 * splatq_laneq_u32(__s2_448, __p3_448); \
-  __ret_448; \
-})
-#else
-#define vmlaq_laneq_u32(__p0_449, __p1_449, __p2_449, __p3_449) __extension__ ({ \
-  uint32x4_t __ret_449; \
-  uint32x4_t __s0_449 = __p0_449; \
-  uint32x4_t __s1_449 = __p1_449; \
-  uint32x4_t __s2_449 = __p2_449; \
-  uint32x4_t __rev0_449;  __rev0_449 = __builtin_shufflevector(__s0_449, __s0_449, 3, 2, 1, 0); \
-  uint32x4_t __rev1_449;  __rev1_449 = __builtin_shufflevector(__s1_449, __s1_449, 3, 2, 1, 0); \
-  uint32x4_t __rev2_449;  __rev2_449 = __builtin_shufflevector(__s2_449, __s2_449, 3, 2, 1, 0); \
-  __ret_449 = __rev0_449 + __rev1_449 * __noswap_splatq_laneq_u32(__rev2_449, __p3_449); \
-  __ret_449 = __builtin_shufflevector(__ret_449, __ret_449, 3, 2, 1, 0); \
-  __ret_449; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_laneq_u16(__p0_450, __p1_450, __p2_450, __p3_450) __extension__ ({ \
-  uint16x8_t __ret_450; \
-  uint16x8_t __s0_450 = __p0_450; \
-  uint16x8_t __s1_450 = __p1_450; \
-  uint16x8_t __s2_450 = __p2_450; \
-  __ret_450 = __s0_450 + __s1_450 * splatq_laneq_u16(__s2_450, __p3_450); \
-  __ret_450; \
-})
-#else
-#define vmlaq_laneq_u16(__p0_451, __p1_451, __p2_451, __p3_451) __extension__ ({ \
-  uint16x8_t __ret_451; \
-  uint16x8_t __s0_451 = __p0_451; \
-  uint16x8_t __s1_451 = __p1_451; \
-  uint16x8_t __s2_451 = __p2_451; \
-  uint16x8_t __rev0_451;  __rev0_451 = __builtin_shufflevector(__s0_451, __s0_451, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_451;  __rev1_451 = __builtin_shufflevector(__s1_451, __s1_451, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev2_451;  __rev2_451 = __builtin_shufflevector(__s2_451, __s2_451, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_451 = __rev0_451 + __rev1_451 * __noswap_splatq_laneq_u16(__rev2_451, __p3_451); \
-  __ret_451 = __builtin_shufflevector(__ret_451, __ret_451, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_451; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_laneq_f32(__p0_452, __p1_452, __p2_452, __p3_452) __extension__ ({ \
-  float32x4_t __ret_452; \
-  float32x4_t __s0_452 = __p0_452; \
-  float32x4_t __s1_452 = __p1_452; \
-  float32x4_t __s2_452 = __p2_452; \
-  __ret_452 = __s0_452 + __s1_452 * splatq_laneq_f32(__s2_452, __p3_452); \
-  __ret_452; \
-})
-#else
-#define vmlaq_laneq_f32(__p0_453, __p1_453, __p2_453, __p3_453) __extension__ ({ \
-  float32x4_t __ret_453; \
-  float32x4_t __s0_453 = __p0_453; \
-  float32x4_t __s1_453 = __p1_453; \
-  float32x4_t __s2_453 = __p2_453; \
-  float32x4_t __rev0_453;  __rev0_453 = __builtin_shufflevector(__s0_453, __s0_453, 3, 2, 1, 0); \
-  float32x4_t __rev1_453;  __rev1_453 = __builtin_shufflevector(__s1_453, __s1_453, 3, 2, 1, 0); \
-  float32x4_t __rev2_453;  __rev2_453 = __builtin_shufflevector(__s2_453, __s2_453, 3, 2, 1, 0); \
-  __ret_453 = __rev0_453 + __rev1_453 * __noswap_splatq_laneq_f32(__rev2_453, __p3_453); \
-  __ret_453 = __builtin_shufflevector(__ret_453, __ret_453, 3, 2, 1, 0); \
-  __ret_453; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_laneq_s32(__p0_454, __p1_454, __p2_454, __p3_454) __extension__ ({ \
-  int32x4_t __ret_454; \
-  int32x4_t __s0_454 = __p0_454; \
-  int32x4_t __s1_454 = __p1_454; \
-  int32x4_t __s2_454 = __p2_454; \
-  __ret_454 = __s0_454 + __s1_454 * splatq_laneq_s32(__s2_454, __p3_454); \
-  __ret_454; \
-})
-#else
-#define vmlaq_laneq_s32(__p0_455, __p1_455, __p2_455, __p3_455) __extension__ ({ \
-  int32x4_t __ret_455; \
-  int32x4_t __s0_455 = __p0_455; \
-  int32x4_t __s1_455 = __p1_455; \
-  int32x4_t __s2_455 = __p2_455; \
-  int32x4_t __rev0_455;  __rev0_455 = __builtin_shufflevector(__s0_455, __s0_455, 3, 2, 1, 0); \
-  int32x4_t __rev1_455;  __rev1_455 = __builtin_shufflevector(__s1_455, __s1_455, 3, 2, 1, 0); \
-  int32x4_t __rev2_455;  __rev2_455 = __builtin_shufflevector(__s2_455, __s2_455, 3, 2, 1, 0); \
-  __ret_455 = __rev0_455 + __rev1_455 * __noswap_splatq_laneq_s32(__rev2_455, __p3_455); \
-  __ret_455 = __builtin_shufflevector(__ret_455, __ret_455, 3, 2, 1, 0); \
-  __ret_455; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlaq_laneq_s16(__p0_456, __p1_456, __p2_456, __p3_456) __extension__ ({ \
-  int16x8_t __ret_456; \
-  int16x8_t __s0_456 = __p0_456; \
-  int16x8_t __s1_456 = __p1_456; \
-  int16x8_t __s2_456 = __p2_456; \
-  __ret_456 = __s0_456 + __s1_456 * splatq_laneq_s16(__s2_456, __p3_456); \
-  __ret_456; \
-})
-#else
-#define vmlaq_laneq_s16(__p0_457, __p1_457, __p2_457, __p3_457) __extension__ ({ \
-  int16x8_t __ret_457; \
-  int16x8_t __s0_457 = __p0_457; \
-  int16x8_t __s1_457 = __p1_457; \
-  int16x8_t __s2_457 = __p2_457; \
-  int16x8_t __rev0_457;  __rev0_457 = __builtin_shufflevector(__s0_457, __s0_457, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_457;  __rev1_457 = __builtin_shufflevector(__s1_457, __s1_457, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_457;  __rev2_457 = __builtin_shufflevector(__s2_457, __s2_457, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_457 = __rev0_457 + __rev1_457 * __noswap_splatq_laneq_s16(__rev2_457, __p3_457); \
-  __ret_457 = __builtin_shufflevector(__ret_457, __ret_457, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_457; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_laneq_u32(__p0_458, __p1_458, __p2_458, __p3_458) __extension__ ({ \
-  uint32x2_t __ret_458; \
-  uint32x2_t __s0_458 = __p0_458; \
-  uint32x2_t __s1_458 = __p1_458; \
-  uint32x4_t __s2_458 = __p2_458; \
-  __ret_458 = __s0_458 + __s1_458 * splat_laneq_u32(__s2_458, __p3_458); \
-  __ret_458; \
-})
-#else
-#define vmla_laneq_u32(__p0_459, __p1_459, __p2_459, __p3_459) __extension__ ({ \
-  uint32x2_t __ret_459; \
-  uint32x2_t __s0_459 = __p0_459; \
-  uint32x2_t __s1_459 = __p1_459; \
-  uint32x4_t __s2_459 = __p2_459; \
-  uint32x2_t __rev0_459;  __rev0_459 = __builtin_shufflevector(__s0_459, __s0_459, 1, 0); \
-  uint32x2_t __rev1_459;  __rev1_459 = __builtin_shufflevector(__s1_459, __s1_459, 1, 0); \
-  uint32x4_t __rev2_459;  __rev2_459 = __builtin_shufflevector(__s2_459, __s2_459, 3, 2, 1, 0); \
-  __ret_459 = __rev0_459 + __rev1_459 * __noswap_splat_laneq_u32(__rev2_459, __p3_459); \
-  __ret_459 = __builtin_shufflevector(__ret_459, __ret_459, 1, 0); \
-  __ret_459; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_laneq_u16(__p0_460, __p1_460, __p2_460, __p3_460) __extension__ ({ \
-  uint16x4_t __ret_460; \
-  uint16x4_t __s0_460 = __p0_460; \
-  uint16x4_t __s1_460 = __p1_460; \
-  uint16x8_t __s2_460 = __p2_460; \
-  __ret_460 = __s0_460 + __s1_460 * splat_laneq_u16(__s2_460, __p3_460); \
-  __ret_460; \
-})
-#else
-#define vmla_laneq_u16(__p0_461, __p1_461, __p2_461, __p3_461) __extension__ ({ \
-  uint16x4_t __ret_461; \
-  uint16x4_t __s0_461 = __p0_461; \
-  uint16x4_t __s1_461 = __p1_461; \
-  uint16x8_t __s2_461 = __p2_461; \
-  uint16x4_t __rev0_461;  __rev0_461 = __builtin_shufflevector(__s0_461, __s0_461, 3, 2, 1, 0); \
-  uint16x4_t __rev1_461;  __rev1_461 = __builtin_shufflevector(__s1_461, __s1_461, 3, 2, 1, 0); \
-  uint16x8_t __rev2_461;  __rev2_461 = __builtin_shufflevector(__s2_461, __s2_461, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_461 = __rev0_461 + __rev1_461 * __noswap_splat_laneq_u16(__rev2_461, __p3_461); \
-  __ret_461 = __builtin_shufflevector(__ret_461, __ret_461, 3, 2, 1, 0); \
-  __ret_461; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_laneq_f32(__p0_462, __p1_462, __p2_462, __p3_462) __extension__ ({ \
-  float32x2_t __ret_462; \
-  float32x2_t __s0_462 = __p0_462; \
-  float32x2_t __s1_462 = __p1_462; \
-  float32x4_t __s2_462 = __p2_462; \
-  __ret_462 = __s0_462 + __s1_462 * splat_laneq_f32(__s2_462, __p3_462); \
-  __ret_462; \
-})
-#else
-#define vmla_laneq_f32(__p0_463, __p1_463, __p2_463, __p3_463) __extension__ ({ \
-  float32x2_t __ret_463; \
-  float32x2_t __s0_463 = __p0_463; \
-  float32x2_t __s1_463 = __p1_463; \
-  float32x4_t __s2_463 = __p2_463; \
-  float32x2_t __rev0_463;  __rev0_463 = __builtin_shufflevector(__s0_463, __s0_463, 1, 0); \
-  float32x2_t __rev1_463;  __rev1_463 = __builtin_shufflevector(__s1_463, __s1_463, 1, 0); \
-  float32x4_t __rev2_463;  __rev2_463 = __builtin_shufflevector(__s2_463, __s2_463, 3, 2, 1, 0); \
-  __ret_463 = __rev0_463 + __rev1_463 * __noswap_splat_laneq_f32(__rev2_463, __p3_463); \
-  __ret_463 = __builtin_shufflevector(__ret_463, __ret_463, 1, 0); \
-  __ret_463; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_laneq_s32(__p0_464, __p1_464, __p2_464, __p3_464) __extension__ ({ \
-  int32x2_t __ret_464; \
-  int32x2_t __s0_464 = __p0_464; \
-  int32x2_t __s1_464 = __p1_464; \
-  int32x4_t __s2_464 = __p2_464; \
-  __ret_464 = __s0_464 + __s1_464 * splat_laneq_s32(__s2_464, __p3_464); \
-  __ret_464; \
-})
-#else
-#define vmla_laneq_s32(__p0_465, __p1_465, __p2_465, __p3_465) __extension__ ({ \
-  int32x2_t __ret_465; \
-  int32x2_t __s0_465 = __p0_465; \
-  int32x2_t __s1_465 = __p1_465; \
-  int32x4_t __s2_465 = __p2_465; \
-  int32x2_t __rev0_465;  __rev0_465 = __builtin_shufflevector(__s0_465, __s0_465, 1, 0); \
-  int32x2_t __rev1_465;  __rev1_465 = __builtin_shufflevector(__s1_465, __s1_465, 1, 0); \
-  int32x4_t __rev2_465;  __rev2_465 = __builtin_shufflevector(__s2_465, __s2_465, 3, 2, 1, 0); \
-  __ret_465 = __rev0_465 + __rev1_465 * __noswap_splat_laneq_s32(__rev2_465, __p3_465); \
-  __ret_465 = __builtin_shufflevector(__ret_465, __ret_465, 1, 0); \
-  __ret_465; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmla_laneq_s16(__p0_466, __p1_466, __p2_466, __p3_466) __extension__ ({ \
-  int16x4_t __ret_466; \
-  int16x4_t __s0_466 = __p0_466; \
-  int16x4_t __s1_466 = __p1_466; \
-  int16x8_t __s2_466 = __p2_466; \
-  __ret_466 = __s0_466 + __s1_466 * splat_laneq_s16(__s2_466, __p3_466); \
-  __ret_466; \
-})
-#else
-#define vmla_laneq_s16(__p0_467, __p1_467, __p2_467, __p3_467) __extension__ ({ \
-  int16x4_t __ret_467; \
-  int16x4_t __s0_467 = __p0_467; \
-  int16x4_t __s1_467 = __p1_467; \
-  int16x8_t __s2_467 = __p2_467; \
-  int16x4_t __rev0_467;  __rev0_467 = __builtin_shufflevector(__s0_467, __s0_467, 3, 2, 1, 0); \
-  int16x4_t __rev1_467;  __rev1_467 = __builtin_shufflevector(__s1_467, __s1_467, 3, 2, 1, 0); \
-  int16x8_t __rev2_467;  __rev2_467 = __builtin_shufflevector(__s2_467, __s2_467, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_467 = __rev0_467 + __rev1_467 * __noswap_splat_laneq_s16(__rev2_467, __p3_467); \
-  __ret_467 = __builtin_shufflevector(__ret_467, __ret_467, 3, 2, 1, 0); \
-  __ret_467; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_lane_u32(__p0_468, __p1_468, __p2_468, __p3_468) __extension__ ({ \
-  uint64x2_t __ret_468; \
-  uint64x2_t __s0_468 = __p0_468; \
-  uint32x4_t __s1_468 = __p1_468; \
-  uint32x2_t __s2_468 = __p2_468; \
-  __ret_468 = __s0_468 + vmull_u32(vget_high_u32(__s1_468), splat_lane_u32(__s2_468, __p3_468)); \
-  __ret_468; \
-})
-#else
-#define vmlal_high_lane_u32(__p0_469, __p1_469, __p2_469, __p3_469) __extension__ ({ \
-  uint64x2_t __ret_469; \
-  uint64x2_t __s0_469 = __p0_469; \
-  uint32x4_t __s1_469 = __p1_469; \
-  uint32x2_t __s2_469 = __p2_469; \
-  uint64x2_t __rev0_469;  __rev0_469 = __builtin_shufflevector(__s0_469, __s0_469, 1, 0); \
-  uint32x4_t __rev1_469;  __rev1_469 = __builtin_shufflevector(__s1_469, __s1_469, 3, 2, 1, 0); \
-  uint32x2_t __rev2_469;  __rev2_469 = __builtin_shufflevector(__s2_469, __s2_469, 1, 0); \
-  __ret_469 = __rev0_469 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_469), __noswap_splat_lane_u32(__rev2_469, __p3_469)); \
-  __ret_469 = __builtin_shufflevector(__ret_469, __ret_469, 1, 0); \
-  __ret_469; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_lane_u16(__p0_470, __p1_470, __p2_470, __p3_470) __extension__ ({ \
-  uint32x4_t __ret_470; \
-  uint32x4_t __s0_470 = __p0_470; \
-  uint16x8_t __s1_470 = __p1_470; \
-  uint16x4_t __s2_470 = __p2_470; \
-  __ret_470 = __s0_470 + vmull_u16(vget_high_u16(__s1_470), splat_lane_u16(__s2_470, __p3_470)); \
-  __ret_470; \
-})
-#else
-#define vmlal_high_lane_u16(__p0_471, __p1_471, __p2_471, __p3_471) __extension__ ({ \
-  uint32x4_t __ret_471; \
-  uint32x4_t __s0_471 = __p0_471; \
-  uint16x8_t __s1_471 = __p1_471; \
-  uint16x4_t __s2_471 = __p2_471; \
-  uint32x4_t __rev0_471;  __rev0_471 = __builtin_shufflevector(__s0_471, __s0_471, 3, 2, 1, 0); \
-  uint16x8_t __rev1_471;  __rev1_471 = __builtin_shufflevector(__s1_471, __s1_471, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev2_471;  __rev2_471 = __builtin_shufflevector(__s2_471, __s2_471, 3, 2, 1, 0); \
-  __ret_471 = __rev0_471 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_471), __noswap_splat_lane_u16(__rev2_471, __p3_471)); \
-  __ret_471 = __builtin_shufflevector(__ret_471, __ret_471, 3, 2, 1, 0); \
-  __ret_471; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_lane_s32(__p0_472, __p1_472, __p2_472, __p3_472) __extension__ ({ \
-  int64x2_t __ret_472; \
-  int64x2_t __s0_472 = __p0_472; \
-  int32x4_t __s1_472 = __p1_472; \
-  int32x2_t __s2_472 = __p2_472; \
-  __ret_472 = __s0_472 + vmull_s32(vget_high_s32(__s1_472), splat_lane_s32(__s2_472, __p3_472)); \
-  __ret_472; \
-})
-#else
-#define vmlal_high_lane_s32(__p0_473, __p1_473, __p2_473, __p3_473) __extension__ ({ \
-  int64x2_t __ret_473; \
-  int64x2_t __s0_473 = __p0_473; \
-  int32x4_t __s1_473 = __p1_473; \
-  int32x2_t __s2_473 = __p2_473; \
-  int64x2_t __rev0_473;  __rev0_473 = __builtin_shufflevector(__s0_473, __s0_473, 1, 0); \
-  int32x4_t __rev1_473;  __rev1_473 = __builtin_shufflevector(__s1_473, __s1_473, 3, 2, 1, 0); \
-  int32x2_t __rev2_473;  __rev2_473 = __builtin_shufflevector(__s2_473, __s2_473, 1, 0); \
-  __ret_473 = __rev0_473 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_473), __noswap_splat_lane_s32(__rev2_473, __p3_473)); \
-  __ret_473 = __builtin_shufflevector(__ret_473, __ret_473, 1, 0); \
-  __ret_473; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_lane_s16(__p0_474, __p1_474, __p2_474, __p3_474) __extension__ ({ \
-  int32x4_t __ret_474; \
-  int32x4_t __s0_474 = __p0_474; \
-  int16x8_t __s1_474 = __p1_474; \
-  int16x4_t __s2_474 = __p2_474; \
-  __ret_474 = __s0_474 + vmull_s16(vget_high_s16(__s1_474), splat_lane_s16(__s2_474, __p3_474)); \
-  __ret_474; \
-})
-#else
-#define vmlal_high_lane_s16(__p0_475, __p1_475, __p2_475, __p3_475) __extension__ ({ \
-  int32x4_t __ret_475; \
-  int32x4_t __s0_475 = __p0_475; \
-  int16x8_t __s1_475 = __p1_475; \
-  int16x4_t __s2_475 = __p2_475; \
-  int32x4_t __rev0_475;  __rev0_475 = __builtin_shufflevector(__s0_475, __s0_475, 3, 2, 1, 0); \
-  int16x8_t __rev1_475;  __rev1_475 = __builtin_shufflevector(__s1_475, __s1_475, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_475;  __rev2_475 = __builtin_shufflevector(__s2_475, __s2_475, 3, 2, 1, 0); \
-  __ret_475 = __rev0_475 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_475), __noswap_splat_lane_s16(__rev2_475, __p3_475)); \
-  __ret_475 = __builtin_shufflevector(__ret_475, __ret_475, 3, 2, 1, 0); \
-  __ret_475; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_laneq_u32(__p0_476, __p1_476, __p2_476, __p3_476) __extension__ ({ \
-  uint64x2_t __ret_476; \
-  uint64x2_t __s0_476 = __p0_476; \
-  uint32x4_t __s1_476 = __p1_476; \
-  uint32x4_t __s2_476 = __p2_476; \
-  __ret_476 = __s0_476 + vmull_u32(vget_high_u32(__s1_476), splat_laneq_u32(__s2_476, __p3_476)); \
-  __ret_476; \
-})
-#else
-#define vmlal_high_laneq_u32(__p0_477, __p1_477, __p2_477, __p3_477) __extension__ ({ \
-  uint64x2_t __ret_477; \
-  uint64x2_t __s0_477 = __p0_477; \
-  uint32x4_t __s1_477 = __p1_477; \
-  uint32x4_t __s2_477 = __p2_477; \
-  uint64x2_t __rev0_477;  __rev0_477 = __builtin_shufflevector(__s0_477, __s0_477, 1, 0); \
-  uint32x4_t __rev1_477;  __rev1_477 = __builtin_shufflevector(__s1_477, __s1_477, 3, 2, 1, 0); \
-  uint32x4_t __rev2_477;  __rev2_477 = __builtin_shufflevector(__s2_477, __s2_477, 3, 2, 1, 0); \
-  __ret_477 = __rev0_477 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_477), __noswap_splat_laneq_u32(__rev2_477, __p3_477)); \
-  __ret_477 = __builtin_shufflevector(__ret_477, __ret_477, 1, 0); \
-  __ret_477; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_laneq_u16(__p0_478, __p1_478, __p2_478, __p3_478) __extension__ ({ \
-  uint32x4_t __ret_478; \
-  uint32x4_t __s0_478 = __p0_478; \
-  uint16x8_t __s1_478 = __p1_478; \
-  uint16x8_t __s2_478 = __p2_478; \
-  __ret_478 = __s0_478 + vmull_u16(vget_high_u16(__s1_478), splat_laneq_u16(__s2_478, __p3_478)); \
-  __ret_478; \
-})
-#else
-#define vmlal_high_laneq_u16(__p0_479, __p1_479, __p2_479, __p3_479) __extension__ ({ \
-  uint32x4_t __ret_479; \
-  uint32x4_t __s0_479 = __p0_479; \
-  uint16x8_t __s1_479 = __p1_479; \
-  uint16x8_t __s2_479 = __p2_479; \
-  uint32x4_t __rev0_479;  __rev0_479 = __builtin_shufflevector(__s0_479, __s0_479, 3, 2, 1, 0); \
-  uint16x8_t __rev1_479;  __rev1_479 = __builtin_shufflevector(__s1_479, __s1_479, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev2_479;  __rev2_479 = __builtin_shufflevector(__s2_479, __s2_479, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_479 = __rev0_479 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_479), __noswap_splat_laneq_u16(__rev2_479, __p3_479)); \
-  __ret_479 = __builtin_shufflevector(__ret_479, __ret_479, 3, 2, 1, 0); \
-  __ret_479; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_laneq_s32(__p0_480, __p1_480, __p2_480, __p3_480) __extension__ ({ \
-  int64x2_t __ret_480; \
-  int64x2_t __s0_480 = __p0_480; \
-  int32x4_t __s1_480 = __p1_480; \
-  int32x4_t __s2_480 = __p2_480; \
-  __ret_480 = __s0_480 + vmull_s32(vget_high_s32(__s1_480), splat_laneq_s32(__s2_480, __p3_480)); \
-  __ret_480; \
-})
-#else
-#define vmlal_high_laneq_s32(__p0_481, __p1_481, __p2_481, __p3_481) __extension__ ({ \
-  int64x2_t __ret_481; \
-  int64x2_t __s0_481 = __p0_481; \
-  int32x4_t __s1_481 = __p1_481; \
-  int32x4_t __s2_481 = __p2_481; \
-  int64x2_t __rev0_481;  __rev0_481 = __builtin_shufflevector(__s0_481, __s0_481, 1, 0); \
-  int32x4_t __rev1_481;  __rev1_481 = __builtin_shufflevector(__s1_481, __s1_481, 3, 2, 1, 0); \
-  int32x4_t __rev2_481;  __rev2_481 = __builtin_shufflevector(__s2_481, __s2_481, 3, 2, 1, 0); \
-  __ret_481 = __rev0_481 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_481), __noswap_splat_laneq_s32(__rev2_481, __p3_481)); \
-  __ret_481 = __builtin_shufflevector(__ret_481, __ret_481, 1, 0); \
-  __ret_481; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_high_laneq_s16(__p0_482, __p1_482, __p2_482, __p3_482) __extension__ ({ \
-  int32x4_t __ret_482; \
-  int32x4_t __s0_482 = __p0_482; \
-  int16x8_t __s1_482 = __p1_482; \
-  int16x8_t __s2_482 = __p2_482; \
-  __ret_482 = __s0_482 + vmull_s16(vget_high_s16(__s1_482), splat_laneq_s16(__s2_482, __p3_482)); \
-  __ret_482; \
-})
-#else
-#define vmlal_high_laneq_s16(__p0_483, __p1_483, __p2_483, __p3_483) __extension__ ({ \
-  int32x4_t __ret_483; \
-  int32x4_t __s0_483 = __p0_483; \
-  int16x8_t __s1_483 = __p1_483; \
-  int16x8_t __s2_483 = __p2_483; \
-  int32x4_t __rev0_483;  __rev0_483 = __builtin_shufflevector(__s0_483, __s0_483, 3, 2, 1, 0); \
-  int16x8_t __rev1_483;  __rev1_483 = __builtin_shufflevector(__s1_483, __s1_483, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_483;  __rev2_483 = __builtin_shufflevector(__s2_483, __s2_483, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_483 = __rev0_483 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_483), __noswap_splat_laneq_s16(__rev2_483, __p3_483)); \
-  __ret_483 = __builtin_shufflevector(__ret_483, __ret_483, 3, 2, 1, 0); \
-  __ret_483; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_laneq_u32(__p0_484, __p1_484, __p2_484, __p3_484) __extension__ ({ \
-  uint64x2_t __ret_484; \
-  uint64x2_t __s0_484 = __p0_484; \
-  uint32x2_t __s1_484 = __p1_484; \
-  uint32x4_t __s2_484 = __p2_484; \
-  __ret_484 = __s0_484 + vmull_u32(__s1_484, splat_laneq_u32(__s2_484, __p3_484)); \
-  __ret_484; \
-})
-#else
-#define vmlal_laneq_u32(__p0_485, __p1_485, __p2_485, __p3_485) __extension__ ({ \
-  uint64x2_t __ret_485; \
-  uint64x2_t __s0_485 = __p0_485; \
-  uint32x2_t __s1_485 = __p1_485; \
-  uint32x4_t __s2_485 = __p2_485; \
-  uint64x2_t __rev0_485;  __rev0_485 = __builtin_shufflevector(__s0_485, __s0_485, 1, 0); \
-  uint32x2_t __rev1_485;  __rev1_485 = __builtin_shufflevector(__s1_485, __s1_485, 1, 0); \
-  uint32x4_t __rev2_485;  __rev2_485 = __builtin_shufflevector(__s2_485, __s2_485, 3, 2, 1, 0); \
-  __ret_485 = __rev0_485 + __noswap_vmull_u32(__rev1_485, __noswap_splat_laneq_u32(__rev2_485, __p3_485)); \
-  __ret_485 = __builtin_shufflevector(__ret_485, __ret_485, 1, 0); \
-  __ret_485; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_laneq_u16(__p0_486, __p1_486, __p2_486, __p3_486) __extension__ ({ \
-  uint32x4_t __ret_486; \
-  uint32x4_t __s0_486 = __p0_486; \
-  uint16x4_t __s1_486 = __p1_486; \
-  uint16x8_t __s2_486 = __p2_486; \
-  __ret_486 = __s0_486 + vmull_u16(__s1_486, splat_laneq_u16(__s2_486, __p3_486)); \
-  __ret_486; \
-})
-#else
-#define vmlal_laneq_u16(__p0_487, __p1_487, __p2_487, __p3_487) __extension__ ({ \
-  uint32x4_t __ret_487; \
-  uint32x4_t __s0_487 = __p0_487; \
-  uint16x4_t __s1_487 = __p1_487; \
-  uint16x8_t __s2_487 = __p2_487; \
-  uint32x4_t __rev0_487;  __rev0_487 = __builtin_shufflevector(__s0_487, __s0_487, 3, 2, 1, 0); \
-  uint16x4_t __rev1_487;  __rev1_487 = __builtin_shufflevector(__s1_487, __s1_487, 3, 2, 1, 0); \
-  uint16x8_t __rev2_487;  __rev2_487 = __builtin_shufflevector(__s2_487, __s2_487, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_487 = __rev0_487 + __noswap_vmull_u16(__rev1_487, __noswap_splat_laneq_u16(__rev2_487, __p3_487)); \
-  __ret_487 = __builtin_shufflevector(__ret_487, __ret_487, 3, 2, 1, 0); \
-  __ret_487; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_laneq_s32(__p0_488, __p1_488, __p2_488, __p3_488) __extension__ ({ \
-  int64x2_t __ret_488; \
-  int64x2_t __s0_488 = __p0_488; \
-  int32x2_t __s1_488 = __p1_488; \
-  int32x4_t __s2_488 = __p2_488; \
-  __ret_488 = __s0_488 + vmull_s32(__s1_488, splat_laneq_s32(__s2_488, __p3_488)); \
-  __ret_488; \
-})
-#else
-#define vmlal_laneq_s32(__p0_489, __p1_489, __p2_489, __p3_489) __extension__ ({ \
-  int64x2_t __ret_489; \
-  int64x2_t __s0_489 = __p0_489; \
-  int32x2_t __s1_489 = __p1_489; \
-  int32x4_t __s2_489 = __p2_489; \
-  int64x2_t __rev0_489;  __rev0_489 = __builtin_shufflevector(__s0_489, __s0_489, 1, 0); \
-  int32x2_t __rev1_489;  __rev1_489 = __builtin_shufflevector(__s1_489, __s1_489, 1, 0); \
-  int32x4_t __rev2_489;  __rev2_489 = __builtin_shufflevector(__s2_489, __s2_489, 3, 2, 1, 0); \
-  __ret_489 = __rev0_489 + __noswap_vmull_s32(__rev1_489, __noswap_splat_laneq_s32(__rev2_489, __p3_489)); \
-  __ret_489 = __builtin_shufflevector(__ret_489, __ret_489, 1, 0); \
-  __ret_489; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_laneq_s16(__p0_490, __p1_490, __p2_490, __p3_490) __extension__ ({ \
-  int32x4_t __ret_490; \
-  int32x4_t __s0_490 = __p0_490; \
-  int16x4_t __s1_490 = __p1_490; \
-  int16x8_t __s2_490 = __p2_490; \
-  __ret_490 = __s0_490 + vmull_s16(__s1_490, splat_laneq_s16(__s2_490, __p3_490)); \
-  __ret_490; \
-})
-#else
-#define vmlal_laneq_s16(__p0_491, __p1_491, __p2_491, __p3_491) __extension__ ({ \
-  int32x4_t __ret_491; \
-  int32x4_t __s0_491 = __p0_491; \
-  int16x4_t __s1_491 = __p1_491; \
-  int16x8_t __s2_491 = __p2_491; \
-  int32x4_t __rev0_491;  __rev0_491 = __builtin_shufflevector(__s0_491, __s0_491, 3, 2, 1, 0); \
-  int16x4_t __rev1_491;  __rev1_491 = __builtin_shufflevector(__s1_491, __s1_491, 3, 2, 1, 0); \
-  int16x8_t __rev2_491;  __rev2_491 = __builtin_shufflevector(__s2_491, __s2_491, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_491 = __rev0_491 + __noswap_vmull_s16(__rev1_491, __noswap_splat_laneq_s16(__rev2_491, __p3_491)); \
-  __ret_491 = __builtin_shufflevector(__ret_491, __ret_491, 3, 2, 1, 0); \
-  __ret_491; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmlsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmlsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 - __rev1 * __rev2;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = __p0 - __p1 * __p2;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_laneq_u32(__p0_492, __p1_492, __p2_492, __p3_492) __extension__ ({ \
-  uint32x4_t __ret_492; \
-  uint32x4_t __s0_492 = __p0_492; \
-  uint32x4_t __s1_492 = __p1_492; \
-  uint32x4_t __s2_492 = __p2_492; \
-  __ret_492 = __s0_492 - __s1_492 * splatq_laneq_u32(__s2_492, __p3_492); \
-  __ret_492; \
-})
-#else
-#define vmlsq_laneq_u32(__p0_493, __p1_493, __p2_493, __p3_493) __extension__ ({ \
-  uint32x4_t __ret_493; \
-  uint32x4_t __s0_493 = __p0_493; \
-  uint32x4_t __s1_493 = __p1_493; \
-  uint32x4_t __s2_493 = __p2_493; \
-  uint32x4_t __rev0_493;  __rev0_493 = __builtin_shufflevector(__s0_493, __s0_493, 3, 2, 1, 0); \
-  uint32x4_t __rev1_493;  __rev1_493 = __builtin_shufflevector(__s1_493, __s1_493, 3, 2, 1, 0); \
-  uint32x4_t __rev2_493;  __rev2_493 = __builtin_shufflevector(__s2_493, __s2_493, 3, 2, 1, 0); \
-  __ret_493 = __rev0_493 - __rev1_493 * __noswap_splatq_laneq_u32(__rev2_493, __p3_493); \
-  __ret_493 = __builtin_shufflevector(__ret_493, __ret_493, 3, 2, 1, 0); \
-  __ret_493; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_laneq_u16(__p0_494, __p1_494, __p2_494, __p3_494) __extension__ ({ \
-  uint16x8_t __ret_494; \
-  uint16x8_t __s0_494 = __p0_494; \
-  uint16x8_t __s1_494 = __p1_494; \
-  uint16x8_t __s2_494 = __p2_494; \
-  __ret_494 = __s0_494 - __s1_494 * splatq_laneq_u16(__s2_494, __p3_494); \
-  __ret_494; \
-})
-#else
-#define vmlsq_laneq_u16(__p0_495, __p1_495, __p2_495, __p3_495) __extension__ ({ \
-  uint16x8_t __ret_495; \
-  uint16x8_t __s0_495 = __p0_495; \
-  uint16x8_t __s1_495 = __p1_495; \
-  uint16x8_t __s2_495 = __p2_495; \
-  uint16x8_t __rev0_495;  __rev0_495 = __builtin_shufflevector(__s0_495, __s0_495, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_495;  __rev1_495 = __builtin_shufflevector(__s1_495, __s1_495, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev2_495;  __rev2_495 = __builtin_shufflevector(__s2_495, __s2_495, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_495 = __rev0_495 - __rev1_495 * __noswap_splatq_laneq_u16(__rev2_495, __p3_495); \
-  __ret_495 = __builtin_shufflevector(__ret_495, __ret_495, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_495; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_laneq_f32(__p0_496, __p1_496, __p2_496, __p3_496) __extension__ ({ \
-  float32x4_t __ret_496; \
-  float32x4_t __s0_496 = __p0_496; \
-  float32x4_t __s1_496 = __p1_496; \
-  float32x4_t __s2_496 = __p2_496; \
-  __ret_496 = __s0_496 - __s1_496 * splatq_laneq_f32(__s2_496, __p3_496); \
-  __ret_496; \
-})
-#else
-#define vmlsq_laneq_f32(__p0_497, __p1_497, __p2_497, __p3_497) __extension__ ({ \
-  float32x4_t __ret_497; \
-  float32x4_t __s0_497 = __p0_497; \
-  float32x4_t __s1_497 = __p1_497; \
-  float32x4_t __s2_497 = __p2_497; \
-  float32x4_t __rev0_497;  __rev0_497 = __builtin_shufflevector(__s0_497, __s0_497, 3, 2, 1, 0); \
-  float32x4_t __rev1_497;  __rev1_497 = __builtin_shufflevector(__s1_497, __s1_497, 3, 2, 1, 0); \
-  float32x4_t __rev2_497;  __rev2_497 = __builtin_shufflevector(__s2_497, __s2_497, 3, 2, 1, 0); \
-  __ret_497 = __rev0_497 - __rev1_497 * __noswap_splatq_laneq_f32(__rev2_497, __p3_497); \
-  __ret_497 = __builtin_shufflevector(__ret_497, __ret_497, 3, 2, 1, 0); \
-  __ret_497; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_laneq_s32(__p0_498, __p1_498, __p2_498, __p3_498) __extension__ ({ \
-  int32x4_t __ret_498; \
-  int32x4_t __s0_498 = __p0_498; \
-  int32x4_t __s1_498 = __p1_498; \
-  int32x4_t __s2_498 = __p2_498; \
-  __ret_498 = __s0_498 - __s1_498 * splatq_laneq_s32(__s2_498, __p3_498); \
-  __ret_498; \
-})
-#else
-#define vmlsq_laneq_s32(__p0_499, __p1_499, __p2_499, __p3_499) __extension__ ({ \
-  int32x4_t __ret_499; \
-  int32x4_t __s0_499 = __p0_499; \
-  int32x4_t __s1_499 = __p1_499; \
-  int32x4_t __s2_499 = __p2_499; \
-  int32x4_t __rev0_499;  __rev0_499 = __builtin_shufflevector(__s0_499, __s0_499, 3, 2, 1, 0); \
-  int32x4_t __rev1_499;  __rev1_499 = __builtin_shufflevector(__s1_499, __s1_499, 3, 2, 1, 0); \
-  int32x4_t __rev2_499;  __rev2_499 = __builtin_shufflevector(__s2_499, __s2_499, 3, 2, 1, 0); \
-  __ret_499 = __rev0_499 - __rev1_499 * __noswap_splatq_laneq_s32(__rev2_499, __p3_499); \
-  __ret_499 = __builtin_shufflevector(__ret_499, __ret_499, 3, 2, 1, 0); \
-  __ret_499; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsq_laneq_s16(__p0_500, __p1_500, __p2_500, __p3_500) __extension__ ({ \
-  int16x8_t __ret_500; \
-  int16x8_t __s0_500 = __p0_500; \
-  int16x8_t __s1_500 = __p1_500; \
-  int16x8_t __s2_500 = __p2_500; \
-  __ret_500 = __s0_500 - __s1_500 * splatq_laneq_s16(__s2_500, __p3_500); \
-  __ret_500; \
-})
-#else
-#define vmlsq_laneq_s16(__p0_501, __p1_501, __p2_501, __p3_501) __extension__ ({ \
-  int16x8_t __ret_501; \
-  int16x8_t __s0_501 = __p0_501; \
-  int16x8_t __s1_501 = __p1_501; \
-  int16x8_t __s2_501 = __p2_501; \
-  int16x8_t __rev0_501;  __rev0_501 = __builtin_shufflevector(__s0_501, __s0_501, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_501;  __rev1_501 = __builtin_shufflevector(__s1_501, __s1_501, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_501;  __rev2_501 = __builtin_shufflevector(__s2_501, __s2_501, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_501 = __rev0_501 - __rev1_501 * __noswap_splatq_laneq_s16(__rev2_501, __p3_501); \
-  __ret_501 = __builtin_shufflevector(__ret_501, __ret_501, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_501; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_laneq_u32(__p0_502, __p1_502, __p2_502, __p3_502) __extension__ ({ \
-  uint32x2_t __ret_502; \
-  uint32x2_t __s0_502 = __p0_502; \
-  uint32x2_t __s1_502 = __p1_502; \
-  uint32x4_t __s2_502 = __p2_502; \
-  __ret_502 = __s0_502 - __s1_502 * splat_laneq_u32(__s2_502, __p3_502); \
-  __ret_502; \
-})
-#else
-#define vmls_laneq_u32(__p0_503, __p1_503, __p2_503, __p3_503) __extension__ ({ \
-  uint32x2_t __ret_503; \
-  uint32x2_t __s0_503 = __p0_503; \
-  uint32x2_t __s1_503 = __p1_503; \
-  uint32x4_t __s2_503 = __p2_503; \
-  uint32x2_t __rev0_503;  __rev0_503 = __builtin_shufflevector(__s0_503, __s0_503, 1, 0); \
-  uint32x2_t __rev1_503;  __rev1_503 = __builtin_shufflevector(__s1_503, __s1_503, 1, 0); \
-  uint32x4_t __rev2_503;  __rev2_503 = __builtin_shufflevector(__s2_503, __s2_503, 3, 2, 1, 0); \
-  __ret_503 = __rev0_503 - __rev1_503 * __noswap_splat_laneq_u32(__rev2_503, __p3_503); \
-  __ret_503 = __builtin_shufflevector(__ret_503, __ret_503, 1, 0); \
-  __ret_503; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_laneq_u16(__p0_504, __p1_504, __p2_504, __p3_504) __extension__ ({ \
-  uint16x4_t __ret_504; \
-  uint16x4_t __s0_504 = __p0_504; \
-  uint16x4_t __s1_504 = __p1_504; \
-  uint16x8_t __s2_504 = __p2_504; \
-  __ret_504 = __s0_504 - __s1_504 * splat_laneq_u16(__s2_504, __p3_504); \
-  __ret_504; \
-})
-#else
-#define vmls_laneq_u16(__p0_505, __p1_505, __p2_505, __p3_505) __extension__ ({ \
-  uint16x4_t __ret_505; \
-  uint16x4_t __s0_505 = __p0_505; \
-  uint16x4_t __s1_505 = __p1_505; \
-  uint16x8_t __s2_505 = __p2_505; \
-  uint16x4_t __rev0_505;  __rev0_505 = __builtin_shufflevector(__s0_505, __s0_505, 3, 2, 1, 0); \
-  uint16x4_t __rev1_505;  __rev1_505 = __builtin_shufflevector(__s1_505, __s1_505, 3, 2, 1, 0); \
-  uint16x8_t __rev2_505;  __rev2_505 = __builtin_shufflevector(__s2_505, __s2_505, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_505 = __rev0_505 - __rev1_505 * __noswap_splat_laneq_u16(__rev2_505, __p3_505); \
-  __ret_505 = __builtin_shufflevector(__ret_505, __ret_505, 3, 2, 1, 0); \
-  __ret_505; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_laneq_f32(__p0_506, __p1_506, __p2_506, __p3_506) __extension__ ({ \
-  float32x2_t __ret_506; \
-  float32x2_t __s0_506 = __p0_506; \
-  float32x2_t __s1_506 = __p1_506; \
-  float32x4_t __s2_506 = __p2_506; \
-  __ret_506 = __s0_506 - __s1_506 * splat_laneq_f32(__s2_506, __p3_506); \
-  __ret_506; \
-})
-#else
-#define vmls_laneq_f32(__p0_507, __p1_507, __p2_507, __p3_507) __extension__ ({ \
-  float32x2_t __ret_507; \
-  float32x2_t __s0_507 = __p0_507; \
-  float32x2_t __s1_507 = __p1_507; \
-  float32x4_t __s2_507 = __p2_507; \
-  float32x2_t __rev0_507;  __rev0_507 = __builtin_shufflevector(__s0_507, __s0_507, 1, 0); \
-  float32x2_t __rev1_507;  __rev1_507 = __builtin_shufflevector(__s1_507, __s1_507, 1, 0); \
-  float32x4_t __rev2_507;  __rev2_507 = __builtin_shufflevector(__s2_507, __s2_507, 3, 2, 1, 0); \
-  __ret_507 = __rev0_507 - __rev1_507 * __noswap_splat_laneq_f32(__rev2_507, __p3_507); \
-  __ret_507 = __builtin_shufflevector(__ret_507, __ret_507, 1, 0); \
-  __ret_507; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_laneq_s32(__p0_508, __p1_508, __p2_508, __p3_508) __extension__ ({ \
-  int32x2_t __ret_508; \
-  int32x2_t __s0_508 = __p0_508; \
-  int32x2_t __s1_508 = __p1_508; \
-  int32x4_t __s2_508 = __p2_508; \
-  __ret_508 = __s0_508 - __s1_508 * splat_laneq_s32(__s2_508, __p3_508); \
-  __ret_508; \
-})
-#else
-#define vmls_laneq_s32(__p0_509, __p1_509, __p2_509, __p3_509) __extension__ ({ \
-  int32x2_t __ret_509; \
-  int32x2_t __s0_509 = __p0_509; \
-  int32x2_t __s1_509 = __p1_509; \
-  int32x4_t __s2_509 = __p2_509; \
-  int32x2_t __rev0_509;  __rev0_509 = __builtin_shufflevector(__s0_509, __s0_509, 1, 0); \
-  int32x2_t __rev1_509;  __rev1_509 = __builtin_shufflevector(__s1_509, __s1_509, 1, 0); \
-  int32x4_t __rev2_509;  __rev2_509 = __builtin_shufflevector(__s2_509, __s2_509, 3, 2, 1, 0); \
-  __ret_509 = __rev0_509 - __rev1_509 * __noswap_splat_laneq_s32(__rev2_509, __p3_509); \
-  __ret_509 = __builtin_shufflevector(__ret_509, __ret_509, 1, 0); \
-  __ret_509; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmls_laneq_s16(__p0_510, __p1_510, __p2_510, __p3_510) __extension__ ({ \
-  int16x4_t __ret_510; \
-  int16x4_t __s0_510 = __p0_510; \
-  int16x4_t __s1_510 = __p1_510; \
-  int16x8_t __s2_510 = __p2_510; \
-  __ret_510 = __s0_510 - __s1_510 * splat_laneq_s16(__s2_510, __p3_510); \
-  __ret_510; \
-})
-#else
-#define vmls_laneq_s16(__p0_511, __p1_511, __p2_511, __p3_511) __extension__ ({ \
-  int16x4_t __ret_511; \
-  int16x4_t __s0_511 = __p0_511; \
-  int16x4_t __s1_511 = __p1_511; \
-  int16x8_t __s2_511 = __p2_511; \
-  int16x4_t __rev0_511;  __rev0_511 = __builtin_shufflevector(__s0_511, __s0_511, 3, 2, 1, 0); \
-  int16x4_t __rev1_511;  __rev1_511 = __builtin_shufflevector(__s1_511, __s1_511, 3, 2, 1, 0); \
-  int16x8_t __rev2_511;  __rev2_511 = __builtin_shufflevector(__s2_511, __s2_511, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_511 = __rev0_511 - __rev1_511 * __noswap_splat_laneq_s16(__rev2_511, __p3_511); \
-  __ret_511 = __builtin_shufflevector(__ret_511, __ret_511, 3, 2, 1, 0); \
-  __ret_511; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_lane_u32(__p0_512, __p1_512, __p2_512, __p3_512) __extension__ ({ \
-  uint64x2_t __ret_512; \
-  uint64x2_t __s0_512 = __p0_512; \
-  uint32x4_t __s1_512 = __p1_512; \
-  uint32x2_t __s2_512 = __p2_512; \
-  __ret_512 = __s0_512 - vmull_u32(vget_high_u32(__s1_512), splat_lane_u32(__s2_512, __p3_512)); \
-  __ret_512; \
-})
-#else
-#define vmlsl_high_lane_u32(__p0_513, __p1_513, __p2_513, __p3_513) __extension__ ({ \
-  uint64x2_t __ret_513; \
-  uint64x2_t __s0_513 = __p0_513; \
-  uint32x4_t __s1_513 = __p1_513; \
-  uint32x2_t __s2_513 = __p2_513; \
-  uint64x2_t __rev0_513;  __rev0_513 = __builtin_shufflevector(__s0_513, __s0_513, 1, 0); \
-  uint32x4_t __rev1_513;  __rev1_513 = __builtin_shufflevector(__s1_513, __s1_513, 3, 2, 1, 0); \
-  uint32x2_t __rev2_513;  __rev2_513 = __builtin_shufflevector(__s2_513, __s2_513, 1, 0); \
-  __ret_513 = __rev0_513 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_513), __noswap_splat_lane_u32(__rev2_513, __p3_513)); \
-  __ret_513 = __builtin_shufflevector(__ret_513, __ret_513, 1, 0); \
-  __ret_513; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_lane_u16(__p0_514, __p1_514, __p2_514, __p3_514) __extension__ ({ \
-  uint32x4_t __ret_514; \
-  uint32x4_t __s0_514 = __p0_514; \
-  uint16x8_t __s1_514 = __p1_514; \
-  uint16x4_t __s2_514 = __p2_514; \
-  __ret_514 = __s0_514 - vmull_u16(vget_high_u16(__s1_514), splat_lane_u16(__s2_514, __p3_514)); \
-  __ret_514; \
-})
-#else
-#define vmlsl_high_lane_u16(__p0_515, __p1_515, __p2_515, __p3_515) __extension__ ({ \
-  uint32x4_t __ret_515; \
-  uint32x4_t __s0_515 = __p0_515; \
-  uint16x8_t __s1_515 = __p1_515; \
-  uint16x4_t __s2_515 = __p2_515; \
-  uint32x4_t __rev0_515;  __rev0_515 = __builtin_shufflevector(__s0_515, __s0_515, 3, 2, 1, 0); \
-  uint16x8_t __rev1_515;  __rev1_515 = __builtin_shufflevector(__s1_515, __s1_515, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev2_515;  __rev2_515 = __builtin_shufflevector(__s2_515, __s2_515, 3, 2, 1, 0); \
-  __ret_515 = __rev0_515 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_515), __noswap_splat_lane_u16(__rev2_515, __p3_515)); \
-  __ret_515 = __builtin_shufflevector(__ret_515, __ret_515, 3, 2, 1, 0); \
-  __ret_515; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_lane_s32(__p0_516, __p1_516, __p2_516, __p3_516) __extension__ ({ \
-  int64x2_t __ret_516; \
-  int64x2_t __s0_516 = __p0_516; \
-  int32x4_t __s1_516 = __p1_516; \
-  int32x2_t __s2_516 = __p2_516; \
-  __ret_516 = __s0_516 - vmull_s32(vget_high_s32(__s1_516), splat_lane_s32(__s2_516, __p3_516)); \
-  __ret_516; \
-})
-#else
-#define vmlsl_high_lane_s32(__p0_517, __p1_517, __p2_517, __p3_517) __extension__ ({ \
-  int64x2_t __ret_517; \
-  int64x2_t __s0_517 = __p0_517; \
-  int32x4_t __s1_517 = __p1_517; \
-  int32x2_t __s2_517 = __p2_517; \
-  int64x2_t __rev0_517;  __rev0_517 = __builtin_shufflevector(__s0_517, __s0_517, 1, 0); \
-  int32x4_t __rev1_517;  __rev1_517 = __builtin_shufflevector(__s1_517, __s1_517, 3, 2, 1, 0); \
-  int32x2_t __rev2_517;  __rev2_517 = __builtin_shufflevector(__s2_517, __s2_517, 1, 0); \
-  __ret_517 = __rev0_517 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_517), __noswap_splat_lane_s32(__rev2_517, __p3_517)); \
-  __ret_517 = __builtin_shufflevector(__ret_517, __ret_517, 1, 0); \
-  __ret_517; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_lane_s16(__p0_518, __p1_518, __p2_518, __p3_518) __extension__ ({ \
-  int32x4_t __ret_518; \
-  int32x4_t __s0_518 = __p0_518; \
-  int16x8_t __s1_518 = __p1_518; \
-  int16x4_t __s2_518 = __p2_518; \
-  __ret_518 = __s0_518 - vmull_s16(vget_high_s16(__s1_518), splat_lane_s16(__s2_518, __p3_518)); \
-  __ret_518; \
-})
-#else
-#define vmlsl_high_lane_s16(__p0_519, __p1_519, __p2_519, __p3_519) __extension__ ({ \
-  int32x4_t __ret_519; \
-  int32x4_t __s0_519 = __p0_519; \
-  int16x8_t __s1_519 = __p1_519; \
-  int16x4_t __s2_519 = __p2_519; \
-  int32x4_t __rev0_519;  __rev0_519 = __builtin_shufflevector(__s0_519, __s0_519, 3, 2, 1, 0); \
-  int16x8_t __rev1_519;  __rev1_519 = __builtin_shufflevector(__s1_519, __s1_519, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_519;  __rev2_519 = __builtin_shufflevector(__s2_519, __s2_519, 3, 2, 1, 0); \
-  __ret_519 = __rev0_519 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_519), __noswap_splat_lane_s16(__rev2_519, __p3_519)); \
-  __ret_519 = __builtin_shufflevector(__ret_519, __ret_519, 3, 2, 1, 0); \
-  __ret_519; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_laneq_u32(__p0_520, __p1_520, __p2_520, __p3_520) __extension__ ({ \
-  uint64x2_t __ret_520; \
-  uint64x2_t __s0_520 = __p0_520; \
-  uint32x4_t __s1_520 = __p1_520; \
-  uint32x4_t __s2_520 = __p2_520; \
-  __ret_520 = __s0_520 - vmull_u32(vget_high_u32(__s1_520), splat_laneq_u32(__s2_520, __p3_520)); \
-  __ret_520; \
-})
-#else
-#define vmlsl_high_laneq_u32(__p0_521, __p1_521, __p2_521, __p3_521) __extension__ ({ \
-  uint64x2_t __ret_521; \
-  uint64x2_t __s0_521 = __p0_521; \
-  uint32x4_t __s1_521 = __p1_521; \
-  uint32x4_t __s2_521 = __p2_521; \
-  uint64x2_t __rev0_521;  __rev0_521 = __builtin_shufflevector(__s0_521, __s0_521, 1, 0); \
-  uint32x4_t __rev1_521;  __rev1_521 = __builtin_shufflevector(__s1_521, __s1_521, 3, 2, 1, 0); \
-  uint32x4_t __rev2_521;  __rev2_521 = __builtin_shufflevector(__s2_521, __s2_521, 3, 2, 1, 0); \
-  __ret_521 = __rev0_521 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1_521), __noswap_splat_laneq_u32(__rev2_521, __p3_521)); \
-  __ret_521 = __builtin_shufflevector(__ret_521, __ret_521, 1, 0); \
-  __ret_521; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_laneq_u16(__p0_522, __p1_522, __p2_522, __p3_522) __extension__ ({ \
-  uint32x4_t __ret_522; \
-  uint32x4_t __s0_522 = __p0_522; \
-  uint16x8_t __s1_522 = __p1_522; \
-  uint16x8_t __s2_522 = __p2_522; \
-  __ret_522 = __s0_522 - vmull_u16(vget_high_u16(__s1_522), splat_laneq_u16(__s2_522, __p3_522)); \
-  __ret_522; \
-})
-#else
-#define vmlsl_high_laneq_u16(__p0_523, __p1_523, __p2_523, __p3_523) __extension__ ({ \
-  uint32x4_t __ret_523; \
-  uint32x4_t __s0_523 = __p0_523; \
-  uint16x8_t __s1_523 = __p1_523; \
-  uint16x8_t __s2_523 = __p2_523; \
-  uint32x4_t __rev0_523;  __rev0_523 = __builtin_shufflevector(__s0_523, __s0_523, 3, 2, 1, 0); \
-  uint16x8_t __rev1_523;  __rev1_523 = __builtin_shufflevector(__s1_523, __s1_523, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev2_523;  __rev2_523 = __builtin_shufflevector(__s2_523, __s2_523, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_523 = __rev0_523 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1_523), __noswap_splat_laneq_u16(__rev2_523, __p3_523)); \
-  __ret_523 = __builtin_shufflevector(__ret_523, __ret_523, 3, 2, 1, 0); \
-  __ret_523; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_laneq_s32(__p0_524, __p1_524, __p2_524, __p3_524) __extension__ ({ \
-  int64x2_t __ret_524; \
-  int64x2_t __s0_524 = __p0_524; \
-  int32x4_t __s1_524 = __p1_524; \
-  int32x4_t __s2_524 = __p2_524; \
-  __ret_524 = __s0_524 - vmull_s32(vget_high_s32(__s1_524), splat_laneq_s32(__s2_524, __p3_524)); \
-  __ret_524; \
-})
-#else
-#define vmlsl_high_laneq_s32(__p0_525, __p1_525, __p2_525, __p3_525) __extension__ ({ \
-  int64x2_t __ret_525; \
-  int64x2_t __s0_525 = __p0_525; \
-  int32x4_t __s1_525 = __p1_525; \
-  int32x4_t __s2_525 = __p2_525; \
-  int64x2_t __rev0_525;  __rev0_525 = __builtin_shufflevector(__s0_525, __s0_525, 1, 0); \
-  int32x4_t __rev1_525;  __rev1_525 = __builtin_shufflevector(__s1_525, __s1_525, 3, 2, 1, 0); \
-  int32x4_t __rev2_525;  __rev2_525 = __builtin_shufflevector(__s2_525, __s2_525, 3, 2, 1, 0); \
-  __ret_525 = __rev0_525 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1_525), __noswap_splat_laneq_s32(__rev2_525, __p3_525)); \
-  __ret_525 = __builtin_shufflevector(__ret_525, __ret_525, 1, 0); \
-  __ret_525; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_high_laneq_s16(__p0_526, __p1_526, __p2_526, __p3_526) __extension__ ({ \
-  int32x4_t __ret_526; \
-  int32x4_t __s0_526 = __p0_526; \
-  int16x8_t __s1_526 = __p1_526; \
-  int16x8_t __s2_526 = __p2_526; \
-  __ret_526 = __s0_526 - vmull_s16(vget_high_s16(__s1_526), splat_laneq_s16(__s2_526, __p3_526)); \
-  __ret_526; \
-})
-#else
-#define vmlsl_high_laneq_s16(__p0_527, __p1_527, __p2_527, __p3_527) __extension__ ({ \
-  int32x4_t __ret_527; \
-  int32x4_t __s0_527 = __p0_527; \
-  int16x8_t __s1_527 = __p1_527; \
-  int16x8_t __s2_527 = __p2_527; \
-  int32x4_t __rev0_527;  __rev0_527 = __builtin_shufflevector(__s0_527, __s0_527, 3, 2, 1, 0); \
-  int16x8_t __rev1_527;  __rev1_527 = __builtin_shufflevector(__s1_527, __s1_527, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_527;  __rev2_527 = __builtin_shufflevector(__s2_527, __s2_527, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_527 = __rev0_527 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1_527), __noswap_splat_laneq_s16(__rev2_527, __p3_527)); \
-  __ret_527 = __builtin_shufflevector(__ret_527, __ret_527, 3, 2, 1, 0); \
-  __ret_527; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_laneq_u32(__p0_528, __p1_528, __p2_528, __p3_528) __extension__ ({ \
-  uint64x2_t __ret_528; \
-  uint64x2_t __s0_528 = __p0_528; \
-  uint32x2_t __s1_528 = __p1_528; \
-  uint32x4_t __s2_528 = __p2_528; \
-  __ret_528 = __s0_528 - vmull_u32(__s1_528, splat_laneq_u32(__s2_528, __p3_528)); \
-  __ret_528; \
-})
-#else
-#define vmlsl_laneq_u32(__p0_529, __p1_529, __p2_529, __p3_529) __extension__ ({ \
-  uint64x2_t __ret_529; \
-  uint64x2_t __s0_529 = __p0_529; \
-  uint32x2_t __s1_529 = __p1_529; \
-  uint32x4_t __s2_529 = __p2_529; \
-  uint64x2_t __rev0_529;  __rev0_529 = __builtin_shufflevector(__s0_529, __s0_529, 1, 0); \
-  uint32x2_t __rev1_529;  __rev1_529 = __builtin_shufflevector(__s1_529, __s1_529, 1, 0); \
-  uint32x4_t __rev2_529;  __rev2_529 = __builtin_shufflevector(__s2_529, __s2_529, 3, 2, 1, 0); \
-  __ret_529 = __rev0_529 - __noswap_vmull_u32(__rev1_529, __noswap_splat_laneq_u32(__rev2_529, __p3_529)); \
-  __ret_529 = __builtin_shufflevector(__ret_529, __ret_529, 1, 0); \
-  __ret_529; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_laneq_u16(__p0_530, __p1_530, __p2_530, __p3_530) __extension__ ({ \
-  uint32x4_t __ret_530; \
-  uint32x4_t __s0_530 = __p0_530; \
-  uint16x4_t __s1_530 = __p1_530; \
-  uint16x8_t __s2_530 = __p2_530; \
-  __ret_530 = __s0_530 - vmull_u16(__s1_530, splat_laneq_u16(__s2_530, __p3_530)); \
-  __ret_530; \
-})
-#else
-#define vmlsl_laneq_u16(__p0_531, __p1_531, __p2_531, __p3_531) __extension__ ({ \
-  uint32x4_t __ret_531; \
-  uint32x4_t __s0_531 = __p0_531; \
-  uint16x4_t __s1_531 = __p1_531; \
-  uint16x8_t __s2_531 = __p2_531; \
-  uint32x4_t __rev0_531;  __rev0_531 = __builtin_shufflevector(__s0_531, __s0_531, 3, 2, 1, 0); \
-  uint16x4_t __rev1_531;  __rev1_531 = __builtin_shufflevector(__s1_531, __s1_531, 3, 2, 1, 0); \
-  uint16x8_t __rev2_531;  __rev2_531 = __builtin_shufflevector(__s2_531, __s2_531, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_531 = __rev0_531 - __noswap_vmull_u16(__rev1_531, __noswap_splat_laneq_u16(__rev2_531, __p3_531)); \
-  __ret_531 = __builtin_shufflevector(__ret_531, __ret_531, 3, 2, 1, 0); \
-  __ret_531; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_laneq_s32(__p0_532, __p1_532, __p2_532, __p3_532) __extension__ ({ \
-  int64x2_t __ret_532; \
-  int64x2_t __s0_532 = __p0_532; \
-  int32x2_t __s1_532 = __p1_532; \
-  int32x4_t __s2_532 = __p2_532; \
-  __ret_532 = __s0_532 - vmull_s32(__s1_532, splat_laneq_s32(__s2_532, __p3_532)); \
-  __ret_532; \
-})
-#else
-#define vmlsl_laneq_s32(__p0_533, __p1_533, __p2_533, __p3_533) __extension__ ({ \
-  int64x2_t __ret_533; \
-  int64x2_t __s0_533 = __p0_533; \
-  int32x2_t __s1_533 = __p1_533; \
-  int32x4_t __s2_533 = __p2_533; \
-  int64x2_t __rev0_533;  __rev0_533 = __builtin_shufflevector(__s0_533, __s0_533, 1, 0); \
-  int32x2_t __rev1_533;  __rev1_533 = __builtin_shufflevector(__s1_533, __s1_533, 1, 0); \
-  int32x4_t __rev2_533;  __rev2_533 = __builtin_shufflevector(__s2_533, __s2_533, 3, 2, 1, 0); \
-  __ret_533 = __rev0_533 - __noswap_vmull_s32(__rev1_533, __noswap_splat_laneq_s32(__rev2_533, __p3_533)); \
-  __ret_533 = __builtin_shufflevector(__ret_533, __ret_533, 1, 0); \
-  __ret_533; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_laneq_s16(__p0_534, __p1_534, __p2_534, __p3_534) __extension__ ({ \
-  int32x4_t __ret_534; \
-  int32x4_t __s0_534 = __p0_534; \
-  int16x4_t __s1_534 = __p1_534; \
-  int16x8_t __s2_534 = __p2_534; \
-  __ret_534 = __s0_534 - vmull_s16(__s1_534, splat_laneq_s16(__s2_534, __p3_534)); \
-  __ret_534; \
-})
-#else
-#define vmlsl_laneq_s16(__p0_535, __p1_535, __p2_535, __p3_535) __extension__ ({ \
-  int32x4_t __ret_535; \
-  int32x4_t __s0_535 = __p0_535; \
-  int16x4_t __s1_535 = __p1_535; \
-  int16x8_t __s2_535 = __p2_535; \
-  int32x4_t __rev0_535;  __rev0_535 = __builtin_shufflevector(__s0_535, __s0_535, 3, 2, 1, 0); \
-  int16x4_t __rev1_535;  __rev1_535 = __builtin_shufflevector(__s1_535, __s1_535, 3, 2, 1, 0); \
-  int16x8_t __rev2_535;  __rev2_535 = __builtin_shufflevector(__s2_535, __s2_535, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_535 = __rev0_535 - __noswap_vmull_s16(__rev1_535, __noswap_splat_laneq_s16(__rev2_535, __p3_535)); \
-  __ret_535 = __builtin_shufflevector(__ret_535, __ret_535, 3, 2, 1, 0); \
-  __ret_535; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) poly64x1_t vmov_n_p64(poly64_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t) {__p0};
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vmovq_n_p64(poly64_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vmovq_n_p64(poly64_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmovq_n_f64(float64_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) {__p0, __p0};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmovq_n_f64(float64_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) {__p0, __p0};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmov_n_f64(float64_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) {__p0};
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmovl_high_u8(uint8x16_t __p0_536) {
-  uint16x8_t __ret_536;
-  uint8x8_t __a1_536 = vget_high_u8(__p0_536);
-  __ret_536 = (uint16x8_t)(vshll_n_u8(__a1_536, 0));
-  return __ret_536;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmovl_high_u8(uint8x16_t __p0_537) {
-  uint16x8_t __ret_537;
-  uint8x16_t __rev0_537;  __rev0_537 = __builtin_shufflevector(__p0_537, __p0_537, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __a1_537 = __noswap_vget_high_u8(__rev0_537);
-  __ret_537 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_537, 0));
-  __ret_537 = __builtin_shufflevector(__ret_537, __ret_537, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret_537;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_538) {
-  uint16x8_t __ret_538;
-  uint8x8_t __a1_538 = __noswap_vget_high_u8(__p0_538);
-  __ret_538 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_538, 0));
-  return __ret_538;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmovl_high_u32(uint32x4_t __p0_539) {
-  uint64x2_t __ret_539;
-  uint32x2_t __a1_539 = vget_high_u32(__p0_539);
-  __ret_539 = (uint64x2_t)(vshll_n_u32(__a1_539, 0));
-  return __ret_539;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmovl_high_u32(uint32x4_t __p0_540) {
-  uint64x2_t __ret_540;
-  uint32x4_t __rev0_540;  __rev0_540 = __builtin_shufflevector(__p0_540, __p0_540, 3, 2, 1, 0);
-  uint32x2_t __a1_540 = __noswap_vget_high_u32(__rev0_540);
-  __ret_540 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_540, 0));
-  __ret_540 = __builtin_shufflevector(__ret_540, __ret_540, 1, 0);
-  return __ret_540;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_541) {
-  uint64x2_t __ret_541;
-  uint32x2_t __a1_541 = __noswap_vget_high_u32(__p0_541);
-  __ret_541 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_541, 0));
-  return __ret_541;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmovl_high_u16(uint16x8_t __p0_542) {
-  uint32x4_t __ret_542;
-  uint16x4_t __a1_542 = vget_high_u16(__p0_542);
-  __ret_542 = (uint32x4_t)(vshll_n_u16(__a1_542, 0));
-  return __ret_542;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmovl_high_u16(uint16x8_t __p0_543) {
-  uint32x4_t __ret_543;
-  uint16x8_t __rev0_543;  __rev0_543 = __builtin_shufflevector(__p0_543, __p0_543, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x4_t __a1_543 = __noswap_vget_high_u16(__rev0_543);
-  __ret_543 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_543, 0));
-  __ret_543 = __builtin_shufflevector(__ret_543, __ret_543, 3, 2, 1, 0);
-  return __ret_543;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_544) {
-  uint32x4_t __ret_544;
-  uint16x4_t __a1_544 = __noswap_vget_high_u16(__p0_544);
-  __ret_544 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_544, 0));
-  return __ret_544;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmovl_high_s8(int8x16_t __p0_545) {
-  int16x8_t __ret_545;
-  int8x8_t __a1_545 = vget_high_s8(__p0_545);
-  __ret_545 = (int16x8_t)(vshll_n_s8(__a1_545, 0));
-  return __ret_545;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmovl_high_s8(int8x16_t __p0_546) {
-  int16x8_t __ret_546;
-  int8x16_t __rev0_546;  __rev0_546 = __builtin_shufflevector(__p0_546, __p0_546, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __a1_546 = __noswap_vget_high_s8(__rev0_546);
-  __ret_546 = (int16x8_t)(__noswap_vshll_n_s8(__a1_546, 0));
-  __ret_546 = __builtin_shufflevector(__ret_546, __ret_546, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret_546;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_547) {
-  int16x8_t __ret_547;
-  int8x8_t __a1_547 = __noswap_vget_high_s8(__p0_547);
-  __ret_547 = (int16x8_t)(__noswap_vshll_n_s8(__a1_547, 0));
-  return __ret_547;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmovl_high_s32(int32x4_t __p0_548) {
-  int64x2_t __ret_548;
-  int32x2_t __a1_548 = vget_high_s32(__p0_548);
-  __ret_548 = (int64x2_t)(vshll_n_s32(__a1_548, 0));
-  return __ret_548;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmovl_high_s32(int32x4_t __p0_549) {
-  int64x2_t __ret_549;
-  int32x4_t __rev0_549;  __rev0_549 = __builtin_shufflevector(__p0_549, __p0_549, 3, 2, 1, 0);
-  int32x2_t __a1_549 = __noswap_vget_high_s32(__rev0_549);
-  __ret_549 = (int64x2_t)(__noswap_vshll_n_s32(__a1_549, 0));
-  __ret_549 = __builtin_shufflevector(__ret_549, __ret_549, 1, 0);
-  return __ret_549;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_550) {
-  int64x2_t __ret_550;
-  int32x2_t __a1_550 = __noswap_vget_high_s32(__p0_550);
-  __ret_550 = (int64x2_t)(__noswap_vshll_n_s32(__a1_550, 0));
-  return __ret_550;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmovl_high_s16(int16x8_t __p0_551) {
-  int32x4_t __ret_551;
-  int16x4_t __a1_551 = vget_high_s16(__p0_551);
-  __ret_551 = (int32x4_t)(vshll_n_s16(__a1_551, 0));
-  return __ret_551;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmovl_high_s16(int16x8_t __p0_552) {
-  int32x4_t __ret_552;
-  int16x8_t __rev0_552;  __rev0_552 = __builtin_shufflevector(__p0_552, __p0_552, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x4_t __a1_552 = __noswap_vget_high_s16(__rev0_552);
-  __ret_552 = (int32x4_t)(__noswap_vshll_n_s16(__a1_552, 0));
-  __ret_552 = __builtin_shufflevector(__ret_552, __ret_552, 3, 2, 1, 0);
-  return __ret_552;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_553) {
-  int32x4_t __ret_553;
-  int16x4_t __a1_553 = __noswap_vget_high_s16(__p0_553);
-  __ret_553 = (int32x4_t)(__noswap_vshll_n_s16(__a1_553, 0));
-  return __ret_553;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
-  uint16x8_t __ret;
-  __ret = vcombine_u16(__p0, vmovn_u32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u16(__rev0, __noswap_vmovn_u32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
-  uint32x4_t __ret;
-  __ret = vcombine_u32(__p0, vmovn_u64(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vcombine_u32(__rev0, __noswap_vmovn_u64(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
-  uint8x16_t __ret;
-  __ret = vcombine_u8(__p0, vmovn_u16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u8(__rev0, __noswap_vmovn_u16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
-  int16x8_t __ret;
-  __ret = vcombine_s16(__p0, vmovn_s32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
-  int16x8_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s16(__rev0, __noswap_vmovn_s32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
-  int32x4_t __ret;
-  __ret = vcombine_s32(__p0, vmovn_s64(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
-  int32x4_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vcombine_s32(__rev0, __noswap_vmovn_s64(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
-  int8x16_t __ret;
-  __ret = vcombine_s8(__p0, vmovn_s16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
-  int8x16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s8(__rev0, __noswap_vmovn_s16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmulq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmulq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 * __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = __p0 * __p1;
-  return __ret;
-}
-#define vmuld_lane_f64(__p0_554, __p1_554, __p2_554) __extension__ ({ \
-  float64_t __ret_554; \
-  float64_t __s0_554 = __p0_554; \
-  float64x1_t __s1_554 = __p1_554; \
-  __ret_554 = __s0_554 * vget_lane_f64(__s1_554, __p2_554); \
-  __ret_554; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vmuls_lane_f32(__p0_555, __p1_555, __p2_555) __extension__ ({ \
-  float32_t __ret_555; \
-  float32_t __s0_555 = __p0_555; \
-  float32x2_t __s1_555 = __p1_555; \
-  __ret_555 = __s0_555 * vget_lane_f32(__s1_555, __p2_555); \
-  __ret_555; \
-})
-#else
-#define vmuls_lane_f32(__p0_556, __p1_556, __p2_556) __extension__ ({ \
-  float32_t __ret_556; \
-  float32_t __s0_556 = __p0_556; \
-  float32x2_t __s1_556 = __p1_556; \
-  float32x2_t __rev1_556;  __rev1_556 = __builtin_shufflevector(__s1_556, __s1_556, 1, 0); \
-  __ret_556 = __s0_556 * __noswap_vget_lane_f32(__rev1_556, __p2_556); \
-  __ret_556; \
-})
-#endif
-
-#define vmul_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x1_t __s1 = __p1; \
-  __ret = (float64x1_t) __builtin_neon_vmul_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_lane_f64(__p0_557, __p1_557, __p2_557) __extension__ ({ \
-  float64x2_t __ret_557; \
-  float64x2_t __s0_557 = __p0_557; \
-  float64x1_t __s1_557 = __p1_557; \
-  __ret_557 = __s0_557 * splatq_lane_f64(__s1_557, __p2_557); \
-  __ret_557; \
-})
-#else
-#define vmulq_lane_f64(__p0_558, __p1_558, __p2_558) __extension__ ({ \
-  float64x2_t __ret_558; \
-  float64x2_t __s0_558 = __p0_558; \
-  float64x1_t __s1_558 = __p1_558; \
-  float64x2_t __rev0_558;  __rev0_558 = __builtin_shufflevector(__s0_558, __s0_558, 1, 0); \
-  __ret_558 = __rev0_558 * __noswap_splatq_lane_f64(__s1_558, __p2_558); \
-  __ret_558 = __builtin_shufflevector(__ret_558, __ret_558, 1, 0); \
-  __ret_558; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmuld_laneq_f64(__p0_559, __p1_559, __p2_559) __extension__ ({ \
-  float64_t __ret_559; \
-  float64_t __s0_559 = __p0_559; \
-  float64x2_t __s1_559 = __p1_559; \
-  __ret_559 = __s0_559 * vgetq_lane_f64(__s1_559, __p2_559); \
-  __ret_559; \
-})
-#else
-#define vmuld_laneq_f64(__p0_560, __p1_560, __p2_560) __extension__ ({ \
-  float64_t __ret_560; \
-  float64_t __s0_560 = __p0_560; \
-  float64x2_t __s1_560 = __p1_560; \
-  float64x2_t __rev1_560;  __rev1_560 = __builtin_shufflevector(__s1_560, __s1_560, 1, 0); \
-  __ret_560 = __s0_560 * __noswap_vgetq_lane_f64(__rev1_560, __p2_560); \
-  __ret_560; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmuls_laneq_f32(__p0_561, __p1_561, __p2_561) __extension__ ({ \
-  float32_t __ret_561; \
-  float32_t __s0_561 = __p0_561; \
-  float32x4_t __s1_561 = __p1_561; \
-  __ret_561 = __s0_561 * vgetq_lane_f32(__s1_561, __p2_561); \
-  __ret_561; \
-})
-#else
-#define vmuls_laneq_f32(__p0_562, __p1_562, __p2_562) __extension__ ({ \
-  float32_t __ret_562; \
-  float32_t __s0_562 = __p0_562; \
-  float32x4_t __s1_562 = __p1_562; \
-  float32x4_t __rev1_562;  __rev1_562 = __builtin_shufflevector(__s1_562, __s1_562, 3, 2, 1, 0); \
-  __ret_562 = __s0_562 * __noswap_vgetq_lane_f32(__rev1_562, __p2_562); \
-  __ret_562; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  __ret = (float64x1_t) __builtin_neon_vmul_laneq_v((int8x8_t)__s0, (int8x16_t)__s1, __p2, 10); \
-  __ret; \
-})
-#else
-#define vmul_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float64x1_t) __builtin_neon_vmul_laneq_v((int8x8_t)__s0, (int8x16_t)__rev1, __p2, 10); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_laneq_u32(__p0_563, __p1_563, __p2_563) __extension__ ({ \
-  uint32x4_t __ret_563; \
-  uint32x4_t __s0_563 = __p0_563; \
-  uint32x4_t __s1_563 = __p1_563; \
-  __ret_563 = __s0_563 * splatq_laneq_u32(__s1_563, __p2_563); \
-  __ret_563; \
-})
-#else
-#define vmulq_laneq_u32(__p0_564, __p1_564, __p2_564) __extension__ ({ \
-  uint32x4_t __ret_564; \
-  uint32x4_t __s0_564 = __p0_564; \
-  uint32x4_t __s1_564 = __p1_564; \
-  uint32x4_t __rev0_564;  __rev0_564 = __builtin_shufflevector(__s0_564, __s0_564, 3, 2, 1, 0); \
-  uint32x4_t __rev1_564;  __rev1_564 = __builtin_shufflevector(__s1_564, __s1_564, 3, 2, 1, 0); \
-  __ret_564 = __rev0_564 * __noswap_splatq_laneq_u32(__rev1_564, __p2_564); \
-  __ret_564 = __builtin_shufflevector(__ret_564, __ret_564, 3, 2, 1, 0); \
-  __ret_564; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_laneq_u16(__p0_565, __p1_565, __p2_565) __extension__ ({ \
-  uint16x8_t __ret_565; \
-  uint16x8_t __s0_565 = __p0_565; \
-  uint16x8_t __s1_565 = __p1_565; \
-  __ret_565 = __s0_565 * splatq_laneq_u16(__s1_565, __p2_565); \
-  __ret_565; \
-})
-#else
-#define vmulq_laneq_u16(__p0_566, __p1_566, __p2_566) __extension__ ({ \
-  uint16x8_t __ret_566; \
-  uint16x8_t __s0_566 = __p0_566; \
-  uint16x8_t __s1_566 = __p1_566; \
-  uint16x8_t __rev0_566;  __rev0_566 = __builtin_shufflevector(__s0_566, __s0_566, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_566;  __rev1_566 = __builtin_shufflevector(__s1_566, __s1_566, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_566 = __rev0_566 * __noswap_splatq_laneq_u16(__rev1_566, __p2_566); \
-  __ret_566 = __builtin_shufflevector(__ret_566, __ret_566, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_566; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_laneq_f64(__p0_567, __p1_567, __p2_567) __extension__ ({ \
-  float64x2_t __ret_567; \
-  float64x2_t __s0_567 = __p0_567; \
-  float64x2_t __s1_567 = __p1_567; \
-  __ret_567 = __s0_567 * splatq_laneq_f64(__s1_567, __p2_567); \
-  __ret_567; \
-})
-#else
-#define vmulq_laneq_f64(__p0_568, __p1_568, __p2_568) __extension__ ({ \
-  float64x2_t __ret_568; \
-  float64x2_t __s0_568 = __p0_568; \
-  float64x2_t __s1_568 = __p1_568; \
-  float64x2_t __rev0_568;  __rev0_568 = __builtin_shufflevector(__s0_568, __s0_568, 1, 0); \
-  float64x2_t __rev1_568;  __rev1_568 = __builtin_shufflevector(__s1_568, __s1_568, 1, 0); \
-  __ret_568 = __rev0_568 * __noswap_splatq_laneq_f64(__rev1_568, __p2_568); \
-  __ret_568 = __builtin_shufflevector(__ret_568, __ret_568, 1, 0); \
-  __ret_568; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_laneq_f32(__p0_569, __p1_569, __p2_569) __extension__ ({ \
-  float32x4_t __ret_569; \
-  float32x4_t __s0_569 = __p0_569; \
-  float32x4_t __s1_569 = __p1_569; \
-  __ret_569 = __s0_569 * splatq_laneq_f32(__s1_569, __p2_569); \
-  __ret_569; \
-})
-#else
-#define vmulq_laneq_f32(__p0_570, __p1_570, __p2_570) __extension__ ({ \
-  float32x4_t __ret_570; \
-  float32x4_t __s0_570 = __p0_570; \
-  float32x4_t __s1_570 = __p1_570; \
-  float32x4_t __rev0_570;  __rev0_570 = __builtin_shufflevector(__s0_570, __s0_570, 3, 2, 1, 0); \
-  float32x4_t __rev1_570;  __rev1_570 = __builtin_shufflevector(__s1_570, __s1_570, 3, 2, 1, 0); \
-  __ret_570 = __rev0_570 * __noswap_splatq_laneq_f32(__rev1_570, __p2_570); \
-  __ret_570 = __builtin_shufflevector(__ret_570, __ret_570, 3, 2, 1, 0); \
-  __ret_570; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_laneq_s32(__p0_571, __p1_571, __p2_571) __extension__ ({ \
-  int32x4_t __ret_571; \
-  int32x4_t __s0_571 = __p0_571; \
-  int32x4_t __s1_571 = __p1_571; \
-  __ret_571 = __s0_571 * splatq_laneq_s32(__s1_571, __p2_571); \
-  __ret_571; \
-})
-#else
-#define vmulq_laneq_s32(__p0_572, __p1_572, __p2_572) __extension__ ({ \
-  int32x4_t __ret_572; \
-  int32x4_t __s0_572 = __p0_572; \
-  int32x4_t __s1_572 = __p1_572; \
-  int32x4_t __rev0_572;  __rev0_572 = __builtin_shufflevector(__s0_572, __s0_572, 3, 2, 1, 0); \
-  int32x4_t __rev1_572;  __rev1_572 = __builtin_shufflevector(__s1_572, __s1_572, 3, 2, 1, 0); \
-  __ret_572 = __rev0_572 * __noswap_splatq_laneq_s32(__rev1_572, __p2_572); \
-  __ret_572 = __builtin_shufflevector(__ret_572, __ret_572, 3, 2, 1, 0); \
-  __ret_572; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_laneq_s16(__p0_573, __p1_573, __p2_573) __extension__ ({ \
-  int16x8_t __ret_573; \
-  int16x8_t __s0_573 = __p0_573; \
-  int16x8_t __s1_573 = __p1_573; \
-  __ret_573 = __s0_573 * splatq_laneq_s16(__s1_573, __p2_573); \
-  __ret_573; \
-})
-#else
-#define vmulq_laneq_s16(__p0_574, __p1_574, __p2_574) __extension__ ({ \
-  int16x8_t __ret_574; \
-  int16x8_t __s0_574 = __p0_574; \
-  int16x8_t __s1_574 = __p1_574; \
-  int16x8_t __rev0_574;  __rev0_574 = __builtin_shufflevector(__s0_574, __s0_574, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_574;  __rev1_574 = __builtin_shufflevector(__s1_574, __s1_574, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_574 = __rev0_574 * __noswap_splatq_laneq_s16(__rev1_574, __p2_574); \
-  __ret_574 = __builtin_shufflevector(__ret_574, __ret_574, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_574; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_laneq_u32(__p0_575, __p1_575, __p2_575) __extension__ ({ \
-  uint32x2_t __ret_575; \
-  uint32x2_t __s0_575 = __p0_575; \
-  uint32x4_t __s1_575 = __p1_575; \
-  __ret_575 = __s0_575 * splat_laneq_u32(__s1_575, __p2_575); \
-  __ret_575; \
-})
-#else
-#define vmul_laneq_u32(__p0_576, __p1_576, __p2_576) __extension__ ({ \
-  uint32x2_t __ret_576; \
-  uint32x2_t __s0_576 = __p0_576; \
-  uint32x4_t __s1_576 = __p1_576; \
-  uint32x2_t __rev0_576;  __rev0_576 = __builtin_shufflevector(__s0_576, __s0_576, 1, 0); \
-  uint32x4_t __rev1_576;  __rev1_576 = __builtin_shufflevector(__s1_576, __s1_576, 3, 2, 1, 0); \
-  __ret_576 = __rev0_576 * __noswap_splat_laneq_u32(__rev1_576, __p2_576); \
-  __ret_576 = __builtin_shufflevector(__ret_576, __ret_576, 1, 0); \
-  __ret_576; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_laneq_u16(__p0_577, __p1_577, __p2_577) __extension__ ({ \
-  uint16x4_t __ret_577; \
-  uint16x4_t __s0_577 = __p0_577; \
-  uint16x8_t __s1_577 = __p1_577; \
-  __ret_577 = __s0_577 * splat_laneq_u16(__s1_577, __p2_577); \
-  __ret_577; \
-})
-#else
-#define vmul_laneq_u16(__p0_578, __p1_578, __p2_578) __extension__ ({ \
-  uint16x4_t __ret_578; \
-  uint16x4_t __s0_578 = __p0_578; \
-  uint16x8_t __s1_578 = __p1_578; \
-  uint16x4_t __rev0_578;  __rev0_578 = __builtin_shufflevector(__s0_578, __s0_578, 3, 2, 1, 0); \
-  uint16x8_t __rev1_578;  __rev1_578 = __builtin_shufflevector(__s1_578, __s1_578, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_578 = __rev0_578 * __noswap_splat_laneq_u16(__rev1_578, __p2_578); \
-  __ret_578 = __builtin_shufflevector(__ret_578, __ret_578, 3, 2, 1, 0); \
-  __ret_578; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_laneq_f32(__p0_579, __p1_579, __p2_579) __extension__ ({ \
-  float32x2_t __ret_579; \
-  float32x2_t __s0_579 = __p0_579; \
-  float32x4_t __s1_579 = __p1_579; \
-  __ret_579 = __s0_579 * splat_laneq_f32(__s1_579, __p2_579); \
-  __ret_579; \
-})
-#else
-#define vmul_laneq_f32(__p0_580, __p1_580, __p2_580) __extension__ ({ \
-  float32x2_t __ret_580; \
-  float32x2_t __s0_580 = __p0_580; \
-  float32x4_t __s1_580 = __p1_580; \
-  float32x2_t __rev0_580;  __rev0_580 = __builtin_shufflevector(__s0_580, __s0_580, 1, 0); \
-  float32x4_t __rev1_580;  __rev1_580 = __builtin_shufflevector(__s1_580, __s1_580, 3, 2, 1, 0); \
-  __ret_580 = __rev0_580 * __noswap_splat_laneq_f32(__rev1_580, __p2_580); \
-  __ret_580 = __builtin_shufflevector(__ret_580, __ret_580, 1, 0); \
-  __ret_580; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_laneq_s32(__p0_581, __p1_581, __p2_581) __extension__ ({ \
-  int32x2_t __ret_581; \
-  int32x2_t __s0_581 = __p0_581; \
-  int32x4_t __s1_581 = __p1_581; \
-  __ret_581 = __s0_581 * splat_laneq_s32(__s1_581, __p2_581); \
-  __ret_581; \
-})
-#else
-#define vmul_laneq_s32(__p0_582, __p1_582, __p2_582) __extension__ ({ \
-  int32x2_t __ret_582; \
-  int32x2_t __s0_582 = __p0_582; \
-  int32x4_t __s1_582 = __p1_582; \
-  int32x2_t __rev0_582;  __rev0_582 = __builtin_shufflevector(__s0_582, __s0_582, 1, 0); \
-  int32x4_t __rev1_582;  __rev1_582 = __builtin_shufflevector(__s1_582, __s1_582, 3, 2, 1, 0); \
-  __ret_582 = __rev0_582 * __noswap_splat_laneq_s32(__rev1_582, __p2_582); \
-  __ret_582 = __builtin_shufflevector(__ret_582, __ret_582, 1, 0); \
-  __ret_582; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_laneq_s16(__p0_583, __p1_583, __p2_583) __extension__ ({ \
-  int16x4_t __ret_583; \
-  int16x4_t __s0_583 = __p0_583; \
-  int16x8_t __s1_583 = __p1_583; \
-  __ret_583 = __s0_583 * splat_laneq_s16(__s1_583, __p2_583); \
-  __ret_583; \
-})
-#else
-#define vmul_laneq_s16(__p0_584, __p1_584, __p2_584) __extension__ ({ \
-  int16x4_t __ret_584; \
-  int16x4_t __s0_584 = __p0_584; \
-  int16x8_t __s1_584 = __p1_584; \
-  int16x4_t __rev0_584;  __rev0_584 = __builtin_shufflevector(__s0_584, __s0_584, 3, 2, 1, 0); \
-  int16x8_t __rev1_584;  __rev1_584 = __builtin_shufflevector(__s1_584, __s1_584, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_584 = __rev0_584 * __noswap_splat_laneq_s16(__rev1_584, __p2_584); \
-  __ret_584 = __builtin_shufflevector(__ret_584, __ret_584, 3, 2, 1, 0); \
-  __ret_584; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmul_n_f64(float64x1_t __p0, float64_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vmul_n_f64((float64x1_t)__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmulq_n_f64(float64x2_t __p0, float64_t __p1) {
-  float64x2_t __ret;
-  __ret = __p0 * (float64x2_t) {__p1, __p1};
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmulq_n_f64(float64x2_t __p0, float64_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = __rev0 * (float64x2_t) {__p1, __p1};
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vmull_high_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly16x8_t __ret;
-  __ret = vmull_p8(vget_high_p8(__p0), vget_high_p8(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vmull_high_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly16x8_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmull_p8(__noswap_vget_high_p8(__rev0), __noswap_vget_high_p8(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmull_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  __ret = vmull_u8(vget_high_u8(__p0), vget_high_u8(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmull_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmull_u8(__noswap_vget_high_u8(__rev0), __noswap_vget_high_u8(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmull_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  __ret = vmull_u32(vget_high_u32(__p0), vget_high_u32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmull_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0), __noswap_vget_high_u32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmull_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  __ret = vmull_u16(vget_high_u16(__p0), vget_high_u16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmull_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0), __noswap_vget_high_u16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmull_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  __ret = vmull_s8(vget_high_s8(__p0), vget_high_s8(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmull_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmull_s8(__noswap_vget_high_s8(__rev0), __noswap_vget_high_s8(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = vmull_s32(vget_high_s32(__p0), vget_high_s32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = vmull_s16(vget_high_s16(__p0), vget_high_s16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_lane_u32(__p0_585, __p1_585, __p2_585) __extension__ ({ \
-  uint64x2_t __ret_585; \
-  uint32x4_t __s0_585 = __p0_585; \
-  uint32x2_t __s1_585 = __p1_585; \
-  __ret_585 = vmull_u32(vget_high_u32(__s0_585), splat_lane_u32(__s1_585, __p2_585)); \
-  __ret_585; \
-})
-#else
-#define vmull_high_lane_u32(__p0_586, __p1_586, __p2_586) __extension__ ({ \
-  uint64x2_t __ret_586; \
-  uint32x4_t __s0_586 = __p0_586; \
-  uint32x2_t __s1_586 = __p1_586; \
-  uint32x4_t __rev0_586;  __rev0_586 = __builtin_shufflevector(__s0_586, __s0_586, 3, 2, 1, 0); \
-  uint32x2_t __rev1_586;  __rev1_586 = __builtin_shufflevector(__s1_586, __s1_586, 1, 0); \
-  __ret_586 = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0_586), __noswap_splat_lane_u32(__rev1_586, __p2_586)); \
-  __ret_586 = __builtin_shufflevector(__ret_586, __ret_586, 1, 0); \
-  __ret_586; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_lane_u16(__p0_587, __p1_587, __p2_587) __extension__ ({ \
-  uint32x4_t __ret_587; \
-  uint16x8_t __s0_587 = __p0_587; \
-  uint16x4_t __s1_587 = __p1_587; \
-  __ret_587 = vmull_u16(vget_high_u16(__s0_587), splat_lane_u16(__s1_587, __p2_587)); \
-  __ret_587; \
-})
-#else
-#define vmull_high_lane_u16(__p0_588, __p1_588, __p2_588) __extension__ ({ \
-  uint32x4_t __ret_588; \
-  uint16x8_t __s0_588 = __p0_588; \
-  uint16x4_t __s1_588 = __p1_588; \
-  uint16x8_t __rev0_588;  __rev0_588 = __builtin_shufflevector(__s0_588, __s0_588, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x4_t __rev1_588;  __rev1_588 = __builtin_shufflevector(__s1_588, __s1_588, 3, 2, 1, 0); \
-  __ret_588 = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0_588), __noswap_splat_lane_u16(__rev1_588, __p2_588)); \
-  __ret_588 = __builtin_shufflevector(__ret_588, __ret_588, 3, 2, 1, 0); \
-  __ret_588; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_lane_s32(__p0_589, __p1_589, __p2_589) __extension__ ({ \
-  int64x2_t __ret_589; \
-  int32x4_t __s0_589 = __p0_589; \
-  int32x2_t __s1_589 = __p1_589; \
-  __ret_589 = vmull_s32(vget_high_s32(__s0_589), splat_lane_s32(__s1_589, __p2_589)); \
-  __ret_589; \
-})
-#else
-#define vmull_high_lane_s32(__p0_590, __p1_590, __p2_590) __extension__ ({ \
-  int64x2_t __ret_590; \
-  int32x4_t __s0_590 = __p0_590; \
-  int32x2_t __s1_590 = __p1_590; \
-  int32x4_t __rev0_590;  __rev0_590 = __builtin_shufflevector(__s0_590, __s0_590, 3, 2, 1, 0); \
-  int32x2_t __rev1_590;  __rev1_590 = __builtin_shufflevector(__s1_590, __s1_590, 1, 0); \
-  __ret_590 = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0_590), __noswap_splat_lane_s32(__rev1_590, __p2_590)); \
-  __ret_590 = __builtin_shufflevector(__ret_590, __ret_590, 1, 0); \
-  __ret_590; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_lane_s16(__p0_591, __p1_591, __p2_591) __extension__ ({ \
-  int32x4_t __ret_591; \
-  int16x8_t __s0_591 = __p0_591; \
-  int16x4_t __s1_591 = __p1_591; \
-  __ret_591 = vmull_s16(vget_high_s16(__s0_591), splat_lane_s16(__s1_591, __p2_591)); \
-  __ret_591; \
-})
-#else
-#define vmull_high_lane_s16(__p0_592, __p1_592, __p2_592) __extension__ ({ \
-  int32x4_t __ret_592; \
-  int16x8_t __s0_592 = __p0_592; \
-  int16x4_t __s1_592 = __p1_592; \
-  int16x8_t __rev0_592;  __rev0_592 = __builtin_shufflevector(__s0_592, __s0_592, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev1_592;  __rev1_592 = __builtin_shufflevector(__s1_592, __s1_592, 3, 2, 1, 0); \
-  __ret_592 = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0_592), __noswap_splat_lane_s16(__rev1_592, __p2_592)); \
-  __ret_592 = __builtin_shufflevector(__ret_592, __ret_592, 3, 2, 1, 0); \
-  __ret_592; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_laneq_u32(__p0_593, __p1_593, __p2_593) __extension__ ({ \
-  uint64x2_t __ret_593; \
-  uint32x4_t __s0_593 = __p0_593; \
-  uint32x4_t __s1_593 = __p1_593; \
-  __ret_593 = vmull_u32(vget_high_u32(__s0_593), splat_laneq_u32(__s1_593, __p2_593)); \
-  __ret_593; \
-})
-#else
-#define vmull_high_laneq_u32(__p0_594, __p1_594, __p2_594) __extension__ ({ \
-  uint64x2_t __ret_594; \
-  uint32x4_t __s0_594 = __p0_594; \
-  uint32x4_t __s1_594 = __p1_594; \
-  uint32x4_t __rev0_594;  __rev0_594 = __builtin_shufflevector(__s0_594, __s0_594, 3, 2, 1, 0); \
-  uint32x4_t __rev1_594;  __rev1_594 = __builtin_shufflevector(__s1_594, __s1_594, 3, 2, 1, 0); \
-  __ret_594 = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0_594), __noswap_splat_laneq_u32(__rev1_594, __p2_594)); \
-  __ret_594 = __builtin_shufflevector(__ret_594, __ret_594, 1, 0); \
-  __ret_594; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_laneq_u16(__p0_595, __p1_595, __p2_595) __extension__ ({ \
-  uint32x4_t __ret_595; \
-  uint16x8_t __s0_595 = __p0_595; \
-  uint16x8_t __s1_595 = __p1_595; \
-  __ret_595 = vmull_u16(vget_high_u16(__s0_595), splat_laneq_u16(__s1_595, __p2_595)); \
-  __ret_595; \
-})
-#else
-#define vmull_high_laneq_u16(__p0_596, __p1_596, __p2_596) __extension__ ({ \
-  uint32x4_t __ret_596; \
-  uint16x8_t __s0_596 = __p0_596; \
-  uint16x8_t __s1_596 = __p1_596; \
-  uint16x8_t __rev0_596;  __rev0_596 = __builtin_shufflevector(__s0_596, __s0_596, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_596;  __rev1_596 = __builtin_shufflevector(__s1_596, __s1_596, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_596 = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0_596), __noswap_splat_laneq_u16(__rev1_596, __p2_596)); \
-  __ret_596 = __builtin_shufflevector(__ret_596, __ret_596, 3, 2, 1, 0); \
-  __ret_596; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_laneq_s32(__p0_597, __p1_597, __p2_597) __extension__ ({ \
-  int64x2_t __ret_597; \
-  int32x4_t __s0_597 = __p0_597; \
-  int32x4_t __s1_597 = __p1_597; \
-  __ret_597 = vmull_s32(vget_high_s32(__s0_597), splat_laneq_s32(__s1_597, __p2_597)); \
-  __ret_597; \
-})
-#else
-#define vmull_high_laneq_s32(__p0_598, __p1_598, __p2_598) __extension__ ({ \
-  int64x2_t __ret_598; \
-  int32x4_t __s0_598 = __p0_598; \
-  int32x4_t __s1_598 = __p1_598; \
-  int32x4_t __rev0_598;  __rev0_598 = __builtin_shufflevector(__s0_598, __s0_598, 3, 2, 1, 0); \
-  int32x4_t __rev1_598;  __rev1_598 = __builtin_shufflevector(__s1_598, __s1_598, 3, 2, 1, 0); \
-  __ret_598 = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0_598), __noswap_splat_laneq_s32(__rev1_598, __p2_598)); \
-  __ret_598 = __builtin_shufflevector(__ret_598, __ret_598, 1, 0); \
-  __ret_598; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_high_laneq_s16(__p0_599, __p1_599, __p2_599) __extension__ ({ \
-  int32x4_t __ret_599; \
-  int16x8_t __s0_599 = __p0_599; \
-  int16x8_t __s1_599 = __p1_599; \
-  __ret_599 = vmull_s16(vget_high_s16(__s0_599), splat_laneq_s16(__s1_599, __p2_599)); \
-  __ret_599; \
-})
-#else
-#define vmull_high_laneq_s16(__p0_600, __p1_600, __p2_600) __extension__ ({ \
-  int32x4_t __ret_600; \
-  int16x8_t __s0_600 = __p0_600; \
-  int16x8_t __s1_600 = __p1_600; \
-  int16x8_t __rev0_600;  __rev0_600 = __builtin_shufflevector(__s0_600, __s0_600, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_600;  __rev1_600 = __builtin_shufflevector(__s1_600, __s1_600, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_600 = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0_600), __noswap_splat_laneq_s16(__rev1_600, __p2_600)); \
-  __ret_600 = __builtin_shufflevector(__ret_600, __ret_600, 3, 2, 1, 0); \
-  __ret_600; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmull_high_n_u32(uint32x4_t __p0, uint32_t __p1) {
-  uint64x2_t __ret;
-  __ret = vmull_n_u32(vget_high_u32(__p0), __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmull_high_n_u32(uint32x4_t __p0, uint32_t __p1) {
-  uint64x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vmull_n_u32(__noswap_vget_high_u32(__rev0), __p1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmull_high_n_u16(uint16x8_t __p0, uint16_t __p1) {
-  uint32x4_t __ret;
-  __ret = vmull_n_u16(vget_high_u16(__p0), __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmull_high_n_u16(uint16x8_t __p0, uint16_t __p1) {
-  uint32x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmull_n_u16(__noswap_vget_high_u16(__rev0), __p1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  __ret = vmull_n_s32(vget_high_s32(__p0), __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vmull_n_s32(__noswap_vget_high_s32(__rev0), __p1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  __ret = vmull_n_s16(vget_high_s16(__p0), __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmull_n_s16(__noswap_vget_high_s16(__rev0), __p1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_laneq_u32(__p0_601, __p1_601, __p2_601) __extension__ ({ \
-  uint64x2_t __ret_601; \
-  uint32x2_t __s0_601 = __p0_601; \
-  uint32x4_t __s1_601 = __p1_601; \
-  __ret_601 = vmull_u32(__s0_601, splat_laneq_u32(__s1_601, __p2_601)); \
-  __ret_601; \
-})
-#else
-#define vmull_laneq_u32(__p0_602, __p1_602, __p2_602) __extension__ ({ \
-  uint64x2_t __ret_602; \
-  uint32x2_t __s0_602 = __p0_602; \
-  uint32x4_t __s1_602 = __p1_602; \
-  uint32x2_t __rev0_602;  __rev0_602 = __builtin_shufflevector(__s0_602, __s0_602, 1, 0); \
-  uint32x4_t __rev1_602;  __rev1_602 = __builtin_shufflevector(__s1_602, __s1_602, 3, 2, 1, 0); \
-  __ret_602 = __noswap_vmull_u32(__rev0_602, __noswap_splat_laneq_u32(__rev1_602, __p2_602)); \
-  __ret_602 = __builtin_shufflevector(__ret_602, __ret_602, 1, 0); \
-  __ret_602; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_laneq_u16(__p0_603, __p1_603, __p2_603) __extension__ ({ \
-  uint32x4_t __ret_603; \
-  uint16x4_t __s0_603 = __p0_603; \
-  uint16x8_t __s1_603 = __p1_603; \
-  __ret_603 = vmull_u16(__s0_603, splat_laneq_u16(__s1_603, __p2_603)); \
-  __ret_603; \
-})
-#else
-#define vmull_laneq_u16(__p0_604, __p1_604, __p2_604) __extension__ ({ \
-  uint32x4_t __ret_604; \
-  uint16x4_t __s0_604 = __p0_604; \
-  uint16x8_t __s1_604 = __p1_604; \
-  uint16x4_t __rev0_604;  __rev0_604 = __builtin_shufflevector(__s0_604, __s0_604, 3, 2, 1, 0); \
-  uint16x8_t __rev1_604;  __rev1_604 = __builtin_shufflevector(__s1_604, __s1_604, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_604 = __noswap_vmull_u16(__rev0_604, __noswap_splat_laneq_u16(__rev1_604, __p2_604)); \
-  __ret_604 = __builtin_shufflevector(__ret_604, __ret_604, 3, 2, 1, 0); \
-  __ret_604; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_laneq_s32(__p0_605, __p1_605, __p2_605) __extension__ ({ \
-  int64x2_t __ret_605; \
-  int32x2_t __s0_605 = __p0_605; \
-  int32x4_t __s1_605 = __p1_605; \
-  __ret_605 = vmull_s32(__s0_605, splat_laneq_s32(__s1_605, __p2_605)); \
-  __ret_605; \
-})
-#else
-#define vmull_laneq_s32(__p0_606, __p1_606, __p2_606) __extension__ ({ \
-  int64x2_t __ret_606; \
-  int32x2_t __s0_606 = __p0_606; \
-  int32x4_t __s1_606 = __p1_606; \
-  int32x2_t __rev0_606;  __rev0_606 = __builtin_shufflevector(__s0_606, __s0_606, 1, 0); \
-  int32x4_t __rev1_606;  __rev1_606 = __builtin_shufflevector(__s1_606, __s1_606, 3, 2, 1, 0); \
-  __ret_606 = __noswap_vmull_s32(__rev0_606, __noswap_splat_laneq_s32(__rev1_606, __p2_606)); \
-  __ret_606 = __builtin_shufflevector(__ret_606, __ret_606, 1, 0); \
-  __ret_606; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmull_laneq_s16(__p0_607, __p1_607, __p2_607) __extension__ ({ \
-  int32x4_t __ret_607; \
-  int16x4_t __s0_607 = __p0_607; \
-  int16x8_t __s1_607 = __p1_607; \
-  __ret_607 = vmull_s16(__s0_607, splat_laneq_s16(__s1_607, __p2_607)); \
-  __ret_607; \
-})
-#else
-#define vmull_laneq_s16(__p0_608, __p1_608, __p2_608) __extension__ ({ \
-  int32x4_t __ret_608; \
-  int16x4_t __s0_608 = __p0_608; \
-  int16x8_t __s1_608 = __p1_608; \
-  int16x4_t __rev0_608;  __rev0_608 = __builtin_shufflevector(__s0_608, __s0_608, 3, 2, 1, 0); \
-  int16x8_t __rev1_608;  __rev1_608 = __builtin_shufflevector(__s1_608, __s1_608, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_608 = __noswap_vmull_s16(__rev0_608, __noswap_splat_laneq_s16(__rev1_608, __p2_608)); \
-  __ret_608 = __builtin_shufflevector(__ret_608, __ret_608, 3, 2, 1, 0); \
-  __ret_608; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t __noswap_vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t __noswap_vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vmulx_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t __noswap_vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64_t vmulxd_f64(float64_t __p0, float64_t __p1) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vmulxd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vmulxs_f32(float32_t __p0, float32_t __p1) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1);
-  return __ret;
-}
-#define vmulxd_lane_f64(__p0_609, __p1_609, __p2_609) __extension__ ({ \
-  float64_t __ret_609; \
-  float64_t __s0_609 = __p0_609; \
-  float64x1_t __s1_609 = __p1_609; \
-  __ret_609 = vmulxd_f64(__s0_609, vget_lane_f64(__s1_609, __p2_609)); \
-  __ret_609; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vmulxs_lane_f32(__p0_610, __p1_610, __p2_610) __extension__ ({ \
-  float32_t __ret_610; \
-  float32_t __s0_610 = __p0_610; \
-  float32x2_t __s1_610 = __p1_610; \
-  __ret_610 = vmulxs_f32(__s0_610, vget_lane_f32(__s1_610, __p2_610)); \
-  __ret_610; \
-})
-#else
-#define vmulxs_lane_f32(__p0_611, __p1_611, __p2_611) __extension__ ({ \
-  float32_t __ret_611; \
-  float32_t __s0_611 = __p0_611; \
-  float32x2_t __s1_611 = __p1_611; \
-  float32x2_t __rev1_611;  __rev1_611 = __builtin_shufflevector(__s1_611, __s1_611, 1, 0); \
-  __ret_611 = vmulxs_f32(__s0_611, __noswap_vget_lane_f32(__rev1_611, __p2_611)); \
-  __ret_611; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxq_lane_f64(__p0_612, __p1_612, __p2_612) __extension__ ({ \
-  float64x2_t __ret_612; \
-  float64x2_t __s0_612 = __p0_612; \
-  float64x1_t __s1_612 = __p1_612; \
-  __ret_612 = vmulxq_f64(__s0_612, splatq_lane_f64(__s1_612, __p2_612)); \
-  __ret_612; \
-})
-#else
-#define vmulxq_lane_f64(__p0_613, __p1_613, __p2_613) __extension__ ({ \
-  float64x2_t __ret_613; \
-  float64x2_t __s0_613 = __p0_613; \
-  float64x1_t __s1_613 = __p1_613; \
-  float64x2_t __rev0_613;  __rev0_613 = __builtin_shufflevector(__s0_613, __s0_613, 1, 0); \
-  __ret_613 = __noswap_vmulxq_f64(__rev0_613, __noswap_splatq_lane_f64(__s1_613, __p2_613)); \
-  __ret_613 = __builtin_shufflevector(__ret_613, __ret_613, 1, 0); \
-  __ret_613; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxq_lane_f32(__p0_614, __p1_614, __p2_614) __extension__ ({ \
-  float32x4_t __ret_614; \
-  float32x4_t __s0_614 = __p0_614; \
-  float32x2_t __s1_614 = __p1_614; \
-  __ret_614 = vmulxq_f32(__s0_614, splatq_lane_f32(__s1_614, __p2_614)); \
-  __ret_614; \
-})
-#else
-#define vmulxq_lane_f32(__p0_615, __p1_615, __p2_615) __extension__ ({ \
-  float32x4_t __ret_615; \
-  float32x4_t __s0_615 = __p0_615; \
-  float32x2_t __s1_615 = __p1_615; \
-  float32x4_t __rev0_615;  __rev0_615 = __builtin_shufflevector(__s0_615, __s0_615, 3, 2, 1, 0); \
-  float32x2_t __rev1_615;  __rev1_615 = __builtin_shufflevector(__s1_615, __s1_615, 1, 0); \
-  __ret_615 = __noswap_vmulxq_f32(__rev0_615, __noswap_splatq_lane_f32(__rev1_615, __p2_615)); \
-  __ret_615 = __builtin_shufflevector(__ret_615, __ret_615, 3, 2, 1, 0); \
-  __ret_615; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulx_lane_f32(__p0_616, __p1_616, __p2_616) __extension__ ({ \
-  float32x2_t __ret_616; \
-  float32x2_t __s0_616 = __p0_616; \
-  float32x2_t __s1_616 = __p1_616; \
-  __ret_616 = vmulx_f32(__s0_616, splat_lane_f32(__s1_616, __p2_616)); \
-  __ret_616; \
-})
-#else
-#define vmulx_lane_f32(__p0_617, __p1_617, __p2_617) __extension__ ({ \
-  float32x2_t __ret_617; \
-  float32x2_t __s0_617 = __p0_617; \
-  float32x2_t __s1_617 = __p1_617; \
-  float32x2_t __rev0_617;  __rev0_617 = __builtin_shufflevector(__s0_617, __s0_617, 1, 0); \
-  float32x2_t __rev1_617;  __rev1_617 = __builtin_shufflevector(__s1_617, __s1_617, 1, 0); \
-  __ret_617 = __noswap_vmulx_f32(__rev0_617, __noswap_splat_lane_f32(__rev1_617, __p2_617)); \
-  __ret_617 = __builtin_shufflevector(__ret_617, __ret_617, 1, 0); \
-  __ret_617; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxd_laneq_f64(__p0_618, __p1_618, __p2_618) __extension__ ({ \
-  float64_t __ret_618; \
-  float64_t __s0_618 = __p0_618; \
-  float64x2_t __s1_618 = __p1_618; \
-  __ret_618 = vmulxd_f64(__s0_618, vgetq_lane_f64(__s1_618, __p2_618)); \
-  __ret_618; \
-})
-#else
-#define vmulxd_laneq_f64(__p0_619, __p1_619, __p2_619) __extension__ ({ \
-  float64_t __ret_619; \
-  float64_t __s0_619 = __p0_619; \
-  float64x2_t __s1_619 = __p1_619; \
-  float64x2_t __rev1_619;  __rev1_619 = __builtin_shufflevector(__s1_619, __s1_619, 1, 0); \
-  __ret_619 = vmulxd_f64(__s0_619, __noswap_vgetq_lane_f64(__rev1_619, __p2_619)); \
-  __ret_619; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxs_laneq_f32(__p0_620, __p1_620, __p2_620) __extension__ ({ \
-  float32_t __ret_620; \
-  float32_t __s0_620 = __p0_620; \
-  float32x4_t __s1_620 = __p1_620; \
-  __ret_620 = vmulxs_f32(__s0_620, vgetq_lane_f32(__s1_620, __p2_620)); \
-  __ret_620; \
-})
-#else
-#define vmulxs_laneq_f32(__p0_621, __p1_621, __p2_621) __extension__ ({ \
-  float32_t __ret_621; \
-  float32_t __s0_621 = __p0_621; \
-  float32x4_t __s1_621 = __p1_621; \
-  float32x4_t __rev1_621;  __rev1_621 = __builtin_shufflevector(__s1_621, __s1_621, 3, 2, 1, 0); \
-  __ret_621 = vmulxs_f32(__s0_621, __noswap_vgetq_lane_f32(__rev1_621, __p2_621)); \
-  __ret_621; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxq_laneq_f64(__p0_622, __p1_622, __p2_622) __extension__ ({ \
-  float64x2_t __ret_622; \
-  float64x2_t __s0_622 = __p0_622; \
-  float64x2_t __s1_622 = __p1_622; \
-  __ret_622 = vmulxq_f64(__s0_622, splatq_laneq_f64(__s1_622, __p2_622)); \
-  __ret_622; \
-})
-#else
-#define vmulxq_laneq_f64(__p0_623, __p1_623, __p2_623) __extension__ ({ \
-  float64x2_t __ret_623; \
-  float64x2_t __s0_623 = __p0_623; \
-  float64x2_t __s1_623 = __p1_623; \
-  float64x2_t __rev0_623;  __rev0_623 = __builtin_shufflevector(__s0_623, __s0_623, 1, 0); \
-  float64x2_t __rev1_623;  __rev1_623 = __builtin_shufflevector(__s1_623, __s1_623, 1, 0); \
-  __ret_623 = __noswap_vmulxq_f64(__rev0_623, __noswap_splatq_laneq_f64(__rev1_623, __p2_623)); \
-  __ret_623 = __builtin_shufflevector(__ret_623, __ret_623, 1, 0); \
-  __ret_623; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulxq_laneq_f32(__p0_624, __p1_624, __p2_624) __extension__ ({ \
-  float32x4_t __ret_624; \
-  float32x4_t __s0_624 = __p0_624; \
-  float32x4_t __s1_624 = __p1_624; \
-  __ret_624 = vmulxq_f32(__s0_624, splatq_laneq_f32(__s1_624, __p2_624)); \
-  __ret_624; \
-})
-#else
-#define vmulxq_laneq_f32(__p0_625, __p1_625, __p2_625) __extension__ ({ \
-  float32x4_t __ret_625; \
-  float32x4_t __s0_625 = __p0_625; \
-  float32x4_t __s1_625 = __p1_625; \
-  float32x4_t __rev0_625;  __rev0_625 = __builtin_shufflevector(__s0_625, __s0_625, 3, 2, 1, 0); \
-  float32x4_t __rev1_625;  __rev1_625 = __builtin_shufflevector(__s1_625, __s1_625, 3, 2, 1, 0); \
-  __ret_625 = __noswap_vmulxq_f32(__rev0_625, __noswap_splatq_laneq_f32(__rev1_625, __p2_625)); \
-  __ret_625 = __builtin_shufflevector(__ret_625, __ret_625, 3, 2, 1, 0); \
-  __ret_625; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulx_laneq_f32(__p0_626, __p1_626, __p2_626) __extension__ ({ \
-  float32x2_t __ret_626; \
-  float32x2_t __s0_626 = __p0_626; \
-  float32x4_t __s1_626 = __p1_626; \
-  __ret_626 = vmulx_f32(__s0_626, splat_laneq_f32(__s1_626, __p2_626)); \
-  __ret_626; \
-})
-#else
-#define vmulx_laneq_f32(__p0_627, __p1_627, __p2_627) __extension__ ({ \
-  float32x2_t __ret_627; \
-  float32x2_t __s0_627 = __p0_627; \
-  float32x4_t __s1_627 = __p1_627; \
-  float32x2_t __rev0_627;  __rev0_627 = __builtin_shufflevector(__s0_627, __s0_627, 1, 0); \
-  float32x4_t __rev1_627;  __rev1_627 = __builtin_shufflevector(__s1_627, __s1_627, 3, 2, 1, 0); \
-  __ret_627 = __noswap_vmulx_f32(__rev0_627, __noswap_splat_laneq_f32(__rev1_627, __p2_627)); \
-  __ret_627 = __builtin_shufflevector(__ret_627, __ret_627, 1, 0); \
-  __ret_627; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vnegq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vnegq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vnegq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vnegq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = -__rev0;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vneg_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vneg_s64(int64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = -__p0;
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vnegd_s64(int64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vnegd_s64(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vpaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vpaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vpaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vpaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vpaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vpaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vpaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vpaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vpaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vpaddq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vpaddq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vpaddq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vpaddq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vpaddq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vpaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vpaddq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vpaddq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vpaddq_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vpaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vpaddq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64_t vpaddd_u64(uint64x2_t __p0) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vpaddd_u64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64_t vpaddd_u64(uint64x2_t __p0) {
-  uint64_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (uint64_t) __builtin_neon_vpaddd_u64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vpaddd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vpaddd_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vpaddd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vpaddd_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64_t vpaddd_s64(int64x2_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vpaddd_s64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64_t vpaddd_s64(int64x2_t __p0) {
-  int64_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64_t) __builtin_neon_vpaddd_s64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vpadds_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vpadds_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vpadds_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vpadds_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vpmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vpmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vpmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vpmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vpmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vpmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vpmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vpmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vpmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vpmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vpmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vpmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vpmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vpmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vpmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vpmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vpmaxqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vpmaxqd_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vpmaxqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vpmaxqd_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vpmaxs_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vpmaxs_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vpmaxs_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vpmaxs_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vpmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vpmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vpmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vpmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vpmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vpmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vpmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vpmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vpmaxnmqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vpmaxnmqd_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vpmaxnmqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vpmaxnmqd_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vpmaxnms_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vpmaxnms_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vpmaxnms_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vpmaxnms_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vpminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vpminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vpminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vpminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vpminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vpminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vpminq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vpminq_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vpminq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vpminq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vpminq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vpminq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vpminq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vpminq_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vpminq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vpminq_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vpminqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vpminqd_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vpminqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vpminqd_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vpmins_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vpmins_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vpmins_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vpmins_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vpminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vpminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vpminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vpminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vpminnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vpminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vpminnm_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vpminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64_t vpminnmqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vpminnmqd_f64(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64_t vpminnmqd_f64(float64x2_t __p0) {
-  float64_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64_t) __builtin_neon_vpminnmqd_f64(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32_t vpminnms_f32(float32x2_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vpminnms_f32(__p0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32_t vpminnms_f32(float32x2_t __p0) {
-  float32_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32_t) __builtin_neon_vpminnms_f32(__rev0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqabsq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqabsq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vqabs_s64(int64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vqabsb_s8(int8_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqabsb_s8(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqabss_s32(int32_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqabss_s32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vqabsd_s64(int64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqabsd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqabsh_s16(int16_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqabsh_s16(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8_t vqaddb_u8(uint8_t __p0, uint8_t __p1) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqaddb_u8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vqadds_u32(uint32_t __p0, uint32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqadds_u32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vqaddd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vqaddd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16_t vqaddh_u16(uint16_t __p0, uint16_t __p1) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqaddh_u16(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vqaddb_s8(int8_t __p0, int8_t __p1) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqaddb_s8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqadds_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqadds_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vqaddd_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqaddd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqaddh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqaddh_s16(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vqdmlals_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqdmlals_s32(__p0, __p1, __p2);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqdmlalh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqdmlalh_s16(__p0, __p1, __p2);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  __ret = vqdmlal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  __ret = vqdmlal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_high_lane_s32(__p0_628, __p1_628, __p2_628, __p3_628) __extension__ ({ \
-  int64x2_t __ret_628; \
-  int64x2_t __s0_628 = __p0_628; \
-  int32x4_t __s1_628 = __p1_628; \
-  int32x2_t __s2_628 = __p2_628; \
-  __ret_628 = vqdmlal_s32(__s0_628, vget_high_s32(__s1_628), splat_lane_s32(__s2_628, __p3_628)); \
-  __ret_628; \
-})
-#else
-#define vqdmlal_high_lane_s32(__p0_629, __p1_629, __p2_629, __p3_629) __extension__ ({ \
-  int64x2_t __ret_629; \
-  int64x2_t __s0_629 = __p0_629; \
-  int32x4_t __s1_629 = __p1_629; \
-  int32x2_t __s2_629 = __p2_629; \
-  int64x2_t __rev0_629;  __rev0_629 = __builtin_shufflevector(__s0_629, __s0_629, 1, 0); \
-  int32x4_t __rev1_629;  __rev1_629 = __builtin_shufflevector(__s1_629, __s1_629, 3, 2, 1, 0); \
-  int32x2_t __rev2_629;  __rev2_629 = __builtin_shufflevector(__s2_629, __s2_629, 1, 0); \
-  __ret_629 = __noswap_vqdmlal_s32(__rev0_629, __noswap_vget_high_s32(__rev1_629), __noswap_splat_lane_s32(__rev2_629, __p3_629)); \
-  __ret_629 = __builtin_shufflevector(__ret_629, __ret_629, 1, 0); \
-  __ret_629; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_high_lane_s16(__p0_630, __p1_630, __p2_630, __p3_630) __extension__ ({ \
-  int32x4_t __ret_630; \
-  int32x4_t __s0_630 = __p0_630; \
-  int16x8_t __s1_630 = __p1_630; \
-  int16x4_t __s2_630 = __p2_630; \
-  __ret_630 = vqdmlal_s16(__s0_630, vget_high_s16(__s1_630), splat_lane_s16(__s2_630, __p3_630)); \
-  __ret_630; \
-})
-#else
-#define vqdmlal_high_lane_s16(__p0_631, __p1_631, __p2_631, __p3_631) __extension__ ({ \
-  int32x4_t __ret_631; \
-  int32x4_t __s0_631 = __p0_631; \
-  int16x8_t __s1_631 = __p1_631; \
-  int16x4_t __s2_631 = __p2_631; \
-  int32x4_t __rev0_631;  __rev0_631 = __builtin_shufflevector(__s0_631, __s0_631, 3, 2, 1, 0); \
-  int16x8_t __rev1_631;  __rev1_631 = __builtin_shufflevector(__s1_631, __s1_631, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_631;  __rev2_631 = __builtin_shufflevector(__s2_631, __s2_631, 3, 2, 1, 0); \
-  __ret_631 = __noswap_vqdmlal_s16(__rev0_631, __noswap_vget_high_s16(__rev1_631), __noswap_splat_lane_s16(__rev2_631, __p3_631)); \
-  __ret_631 = __builtin_shufflevector(__ret_631, __ret_631, 3, 2, 1, 0); \
-  __ret_631; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_high_laneq_s32(__p0_632, __p1_632, __p2_632, __p3_632) __extension__ ({ \
-  int64x2_t __ret_632; \
-  int64x2_t __s0_632 = __p0_632; \
-  int32x4_t __s1_632 = __p1_632; \
-  int32x4_t __s2_632 = __p2_632; \
-  __ret_632 = vqdmlal_s32(__s0_632, vget_high_s32(__s1_632), splat_laneq_s32(__s2_632, __p3_632)); \
-  __ret_632; \
-})
-#else
-#define vqdmlal_high_laneq_s32(__p0_633, __p1_633, __p2_633, __p3_633) __extension__ ({ \
-  int64x2_t __ret_633; \
-  int64x2_t __s0_633 = __p0_633; \
-  int32x4_t __s1_633 = __p1_633; \
-  int32x4_t __s2_633 = __p2_633; \
-  int64x2_t __rev0_633;  __rev0_633 = __builtin_shufflevector(__s0_633, __s0_633, 1, 0); \
-  int32x4_t __rev1_633;  __rev1_633 = __builtin_shufflevector(__s1_633, __s1_633, 3, 2, 1, 0); \
-  int32x4_t __rev2_633;  __rev2_633 = __builtin_shufflevector(__s2_633, __s2_633, 3, 2, 1, 0); \
-  __ret_633 = __noswap_vqdmlal_s32(__rev0_633, __noswap_vget_high_s32(__rev1_633), __noswap_splat_laneq_s32(__rev2_633, __p3_633)); \
-  __ret_633 = __builtin_shufflevector(__ret_633, __ret_633, 1, 0); \
-  __ret_633; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_high_laneq_s16(__p0_634, __p1_634, __p2_634, __p3_634) __extension__ ({ \
-  int32x4_t __ret_634; \
-  int32x4_t __s0_634 = __p0_634; \
-  int16x8_t __s1_634 = __p1_634; \
-  int16x8_t __s2_634 = __p2_634; \
-  __ret_634 = vqdmlal_s16(__s0_634, vget_high_s16(__s1_634), splat_laneq_s16(__s2_634, __p3_634)); \
-  __ret_634; \
-})
-#else
-#define vqdmlal_high_laneq_s16(__p0_635, __p1_635, __p2_635, __p3_635) __extension__ ({ \
-  int32x4_t __ret_635; \
-  int32x4_t __s0_635 = __p0_635; \
-  int16x8_t __s1_635 = __p1_635; \
-  int16x8_t __s2_635 = __p2_635; \
-  int32x4_t __rev0_635;  __rev0_635 = __builtin_shufflevector(__s0_635, __s0_635, 3, 2, 1, 0); \
-  int16x8_t __rev1_635;  __rev1_635 = __builtin_shufflevector(__s1_635, __s1_635, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_635;  __rev2_635 = __builtin_shufflevector(__s2_635, __s2_635, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_635 = __noswap_vqdmlal_s16(__rev0_635, __noswap_vget_high_s16(__rev1_635), __noswap_splat_laneq_s16(__rev2_635, __p3_635)); \
-  __ret_635 = __builtin_shufflevector(__ret_635, __ret_635, 3, 2, 1, 0); \
-  __ret_635; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = vqdmlal_n_s32(__p0, vget_high_s32(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlal_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = vqdmlal_n_s16(__p0, vget_high_s16(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlal_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlals_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x2_t __s2 = __p2; \
-  __ret = (int64_t) __builtin_neon_vqdmlals_lane_s32(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlals_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x2_t __s2 = __p2; \
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (int64_t) __builtin_neon_vqdmlals_lane_s32(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlalh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x4_t __s2 = __p2; \
-  __ret = (int32_t) __builtin_neon_vqdmlalh_lane_s16(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlalh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x4_t __s2 = __p2; \
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vqdmlalh_lane_s16(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlals_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x4_t __s2 = __p2; \
-  __ret = (int64_t) __builtin_neon_vqdmlals_laneq_s32(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlals_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x4_t __s2 = __p2; \
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (int64_t) __builtin_neon_vqdmlals_laneq_s32(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlalh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x8_t __s2 = __p2; \
-  __ret = (int32_t) __builtin_neon_vqdmlalh_laneq_s16(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlalh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x8_t __s2 = __p2; \
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vqdmlalh_laneq_s16(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_laneq_s32(__p0_636, __p1_636, __p2_636, __p3_636) __extension__ ({ \
-  int64x2_t __ret_636; \
-  int64x2_t __s0_636 = __p0_636; \
-  int32x2_t __s1_636 = __p1_636; \
-  int32x4_t __s2_636 = __p2_636; \
-  __ret_636 = vqdmlal_s32(__s0_636, __s1_636, splat_laneq_s32(__s2_636, __p3_636)); \
-  __ret_636; \
-})
-#else
-#define vqdmlal_laneq_s32(__p0_637, __p1_637, __p2_637, __p3_637) __extension__ ({ \
-  int64x2_t __ret_637; \
-  int64x2_t __s0_637 = __p0_637; \
-  int32x2_t __s1_637 = __p1_637; \
-  int32x4_t __s2_637 = __p2_637; \
-  int64x2_t __rev0_637;  __rev0_637 = __builtin_shufflevector(__s0_637, __s0_637, 1, 0); \
-  int32x2_t __rev1_637;  __rev1_637 = __builtin_shufflevector(__s1_637, __s1_637, 1, 0); \
-  int32x4_t __rev2_637;  __rev2_637 = __builtin_shufflevector(__s2_637, __s2_637, 3, 2, 1, 0); \
-  __ret_637 = __noswap_vqdmlal_s32(__rev0_637, __rev1_637, __noswap_splat_laneq_s32(__rev2_637, __p3_637)); \
-  __ret_637 = __builtin_shufflevector(__ret_637, __ret_637, 1, 0); \
-  __ret_637; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlal_laneq_s16(__p0_638, __p1_638, __p2_638, __p3_638) __extension__ ({ \
-  int32x4_t __ret_638; \
-  int32x4_t __s0_638 = __p0_638; \
-  int16x4_t __s1_638 = __p1_638; \
-  int16x8_t __s2_638 = __p2_638; \
-  __ret_638 = vqdmlal_s16(__s0_638, __s1_638, splat_laneq_s16(__s2_638, __p3_638)); \
-  __ret_638; \
-})
-#else
-#define vqdmlal_laneq_s16(__p0_639, __p1_639, __p2_639, __p3_639) __extension__ ({ \
-  int32x4_t __ret_639; \
-  int32x4_t __s0_639 = __p0_639; \
-  int16x4_t __s1_639 = __p1_639; \
-  int16x8_t __s2_639 = __p2_639; \
-  int32x4_t __rev0_639;  __rev0_639 = __builtin_shufflevector(__s0_639, __s0_639, 3, 2, 1, 0); \
-  int16x4_t __rev1_639;  __rev1_639 = __builtin_shufflevector(__s1_639, __s1_639, 3, 2, 1, 0); \
-  int16x8_t __rev2_639;  __rev2_639 = __builtin_shufflevector(__s2_639, __s2_639, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_639 = __noswap_vqdmlal_s16(__rev0_639, __rev1_639, __noswap_splat_laneq_s16(__rev2_639, __p3_639)); \
-  __ret_639 = __builtin_shufflevector(__ret_639, __ret_639, 3, 2, 1, 0); \
-  __ret_639; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) int64_t vqdmlsls_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqdmlsls_s32(__p0, __p1, __p2);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqdmlslh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqdmlslh_s16(__p0, __p1, __p2);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  __ret = vqdmlsl_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  __ret = vqdmlsl_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_high_lane_s32(__p0_640, __p1_640, __p2_640, __p3_640) __extension__ ({ \
-  int64x2_t __ret_640; \
-  int64x2_t __s0_640 = __p0_640; \
-  int32x4_t __s1_640 = __p1_640; \
-  int32x2_t __s2_640 = __p2_640; \
-  __ret_640 = vqdmlsl_s32(__s0_640, vget_high_s32(__s1_640), splat_lane_s32(__s2_640, __p3_640)); \
-  __ret_640; \
-})
-#else
-#define vqdmlsl_high_lane_s32(__p0_641, __p1_641, __p2_641, __p3_641) __extension__ ({ \
-  int64x2_t __ret_641; \
-  int64x2_t __s0_641 = __p0_641; \
-  int32x4_t __s1_641 = __p1_641; \
-  int32x2_t __s2_641 = __p2_641; \
-  int64x2_t __rev0_641;  __rev0_641 = __builtin_shufflevector(__s0_641, __s0_641, 1, 0); \
-  int32x4_t __rev1_641;  __rev1_641 = __builtin_shufflevector(__s1_641, __s1_641, 3, 2, 1, 0); \
-  int32x2_t __rev2_641;  __rev2_641 = __builtin_shufflevector(__s2_641, __s2_641, 1, 0); \
-  __ret_641 = __noswap_vqdmlsl_s32(__rev0_641, __noswap_vget_high_s32(__rev1_641), __noswap_splat_lane_s32(__rev2_641, __p3_641)); \
-  __ret_641 = __builtin_shufflevector(__ret_641, __ret_641, 1, 0); \
-  __ret_641; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_high_lane_s16(__p0_642, __p1_642, __p2_642, __p3_642) __extension__ ({ \
-  int32x4_t __ret_642; \
-  int32x4_t __s0_642 = __p0_642; \
-  int16x8_t __s1_642 = __p1_642; \
-  int16x4_t __s2_642 = __p2_642; \
-  __ret_642 = vqdmlsl_s16(__s0_642, vget_high_s16(__s1_642), splat_lane_s16(__s2_642, __p3_642)); \
-  __ret_642; \
-})
-#else
-#define vqdmlsl_high_lane_s16(__p0_643, __p1_643, __p2_643, __p3_643) __extension__ ({ \
-  int32x4_t __ret_643; \
-  int32x4_t __s0_643 = __p0_643; \
-  int16x8_t __s1_643 = __p1_643; \
-  int16x4_t __s2_643 = __p2_643; \
-  int32x4_t __rev0_643;  __rev0_643 = __builtin_shufflevector(__s0_643, __s0_643, 3, 2, 1, 0); \
-  int16x8_t __rev1_643;  __rev1_643 = __builtin_shufflevector(__s1_643, __s1_643, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev2_643;  __rev2_643 = __builtin_shufflevector(__s2_643, __s2_643, 3, 2, 1, 0); \
-  __ret_643 = __noswap_vqdmlsl_s16(__rev0_643, __noswap_vget_high_s16(__rev1_643), __noswap_splat_lane_s16(__rev2_643, __p3_643)); \
-  __ret_643 = __builtin_shufflevector(__ret_643, __ret_643, 3, 2, 1, 0); \
-  __ret_643; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_high_laneq_s32(__p0_644, __p1_644, __p2_644, __p3_644) __extension__ ({ \
-  int64x2_t __ret_644; \
-  int64x2_t __s0_644 = __p0_644; \
-  int32x4_t __s1_644 = __p1_644; \
-  int32x4_t __s2_644 = __p2_644; \
-  __ret_644 = vqdmlsl_s32(__s0_644, vget_high_s32(__s1_644), splat_laneq_s32(__s2_644, __p3_644)); \
-  __ret_644; \
-})
-#else
-#define vqdmlsl_high_laneq_s32(__p0_645, __p1_645, __p2_645, __p3_645) __extension__ ({ \
-  int64x2_t __ret_645; \
-  int64x2_t __s0_645 = __p0_645; \
-  int32x4_t __s1_645 = __p1_645; \
-  int32x4_t __s2_645 = __p2_645; \
-  int64x2_t __rev0_645;  __rev0_645 = __builtin_shufflevector(__s0_645, __s0_645, 1, 0); \
-  int32x4_t __rev1_645;  __rev1_645 = __builtin_shufflevector(__s1_645, __s1_645, 3, 2, 1, 0); \
-  int32x4_t __rev2_645;  __rev2_645 = __builtin_shufflevector(__s2_645, __s2_645, 3, 2, 1, 0); \
-  __ret_645 = __noswap_vqdmlsl_s32(__rev0_645, __noswap_vget_high_s32(__rev1_645), __noswap_splat_laneq_s32(__rev2_645, __p3_645)); \
-  __ret_645 = __builtin_shufflevector(__ret_645, __ret_645, 1, 0); \
-  __ret_645; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_high_laneq_s16(__p0_646, __p1_646, __p2_646, __p3_646) __extension__ ({ \
-  int32x4_t __ret_646; \
-  int32x4_t __s0_646 = __p0_646; \
-  int16x8_t __s1_646 = __p1_646; \
-  int16x8_t __s2_646 = __p2_646; \
-  __ret_646 = vqdmlsl_s16(__s0_646, vget_high_s16(__s1_646), splat_laneq_s16(__s2_646, __p3_646)); \
-  __ret_646; \
-})
-#else
-#define vqdmlsl_high_laneq_s16(__p0_647, __p1_647, __p2_647, __p3_647) __extension__ ({ \
-  int32x4_t __ret_647; \
-  int32x4_t __s0_647 = __p0_647; \
-  int16x8_t __s1_647 = __p1_647; \
-  int16x8_t __s2_647 = __p2_647; \
-  int32x4_t __rev0_647;  __rev0_647 = __builtin_shufflevector(__s0_647, __s0_647, 3, 2, 1, 0); \
-  int16x8_t __rev1_647;  __rev1_647 = __builtin_shufflevector(__s1_647, __s1_647, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_647;  __rev2_647 = __builtin_shufflevector(__s2_647, __s2_647, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_647 = __noswap_vqdmlsl_s16(__rev0_647, __noswap_vget_high_s16(__rev1_647), __noswap_splat_laneq_s16(__rev2_647, __p3_647)); \
-  __ret_647 = __builtin_shufflevector(__ret_647, __ret_647, 3, 2, 1, 0); \
-  __ret_647; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = vqdmlsl_n_s32(__p0, vget_high_s32(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlsl_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = vqdmlsl_n_s16(__p0, vget_high_s16(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqdmlsl_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x2_t __s2 = __p2; \
-  __ret = (int64_t) __builtin_neon_vqdmlsls_lane_s32(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlsls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x2_t __s2 = __p2; \
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
-  __ret = (int64_t) __builtin_neon_vqdmlsls_lane_s32(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlslh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x4_t __s2 = __p2; \
-  __ret = (int32_t) __builtin_neon_vqdmlslh_lane_s16(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlslh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x4_t __s2 = __p2; \
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vqdmlslh_lane_s16(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x4_t __s2 = __p2; \
-  __ret = (int64_t) __builtin_neon_vqdmlsls_laneq_s32(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlsls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int32_t __s1 = __p1; \
-  int32x4_t __s2 = __p2; \
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (int64_t) __builtin_neon_vqdmlsls_laneq_s32(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlslh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x8_t __s2 = __p2; \
-  __ret = (int32_t) __builtin_neon_vqdmlslh_laneq_s16(__s0, __s1, __s2, __p3); \
-  __ret; \
-})
-#else
-#define vqdmlslh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  int16_t __s1 = __p1; \
-  int16x8_t __s2 = __p2; \
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int32_t) __builtin_neon_vqdmlslh_laneq_s16(__s0, __s1, __rev2, __p3); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_laneq_s32(__p0_648, __p1_648, __p2_648, __p3_648) __extension__ ({ \
-  int64x2_t __ret_648; \
-  int64x2_t __s0_648 = __p0_648; \
-  int32x2_t __s1_648 = __p1_648; \
-  int32x4_t __s2_648 = __p2_648; \
-  __ret_648 = vqdmlsl_s32(__s0_648, __s1_648, splat_laneq_s32(__s2_648, __p3_648)); \
-  __ret_648; \
-})
-#else
-#define vqdmlsl_laneq_s32(__p0_649, __p1_649, __p2_649, __p3_649) __extension__ ({ \
-  int64x2_t __ret_649; \
-  int64x2_t __s0_649 = __p0_649; \
-  int32x2_t __s1_649 = __p1_649; \
-  int32x4_t __s2_649 = __p2_649; \
-  int64x2_t __rev0_649;  __rev0_649 = __builtin_shufflevector(__s0_649, __s0_649, 1, 0); \
-  int32x2_t __rev1_649;  __rev1_649 = __builtin_shufflevector(__s1_649, __s1_649, 1, 0); \
-  int32x4_t __rev2_649;  __rev2_649 = __builtin_shufflevector(__s2_649, __s2_649, 3, 2, 1, 0); \
-  __ret_649 = __noswap_vqdmlsl_s32(__rev0_649, __rev1_649, __noswap_splat_laneq_s32(__rev2_649, __p3_649)); \
-  __ret_649 = __builtin_shufflevector(__ret_649, __ret_649, 1, 0); \
-  __ret_649; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmlsl_laneq_s16(__p0_650, __p1_650, __p2_650, __p3_650) __extension__ ({ \
-  int32x4_t __ret_650; \
-  int32x4_t __s0_650 = __p0_650; \
-  int16x4_t __s1_650 = __p1_650; \
-  int16x8_t __s2_650 = __p2_650; \
-  __ret_650 = vqdmlsl_s16(__s0_650, __s1_650, splat_laneq_s16(__s2_650, __p3_650)); \
-  __ret_650; \
-})
-#else
-#define vqdmlsl_laneq_s16(__p0_651, __p1_651, __p2_651, __p3_651) __extension__ ({ \
-  int32x4_t __ret_651; \
-  int32x4_t __s0_651 = __p0_651; \
-  int16x4_t __s1_651 = __p1_651; \
-  int16x8_t __s2_651 = __p2_651; \
-  int32x4_t __rev0_651;  __rev0_651 = __builtin_shufflevector(__s0_651, __s0_651, 3, 2, 1, 0); \
-  int16x4_t __rev1_651;  __rev1_651 = __builtin_shufflevector(__s1_651, __s1_651, 3, 2, 1, 0); \
-  int16x8_t __rev2_651;  __rev2_651 = __builtin_shufflevector(__s2_651, __s2_651, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_651 = __noswap_vqdmlsl_s16(__rev0_651, __rev1_651, __noswap_splat_laneq_s16(__rev2_651, __p3_651)); \
-  __ret_651 = __builtin_shufflevector(__ret_651, __ret_651, 3, 2, 1, 0); \
-  __ret_651; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) int32_t vqdmulhs_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqdmulhs_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqdmulhh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqdmulhh_s16(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vqdmulhq_lane_v((int8x16_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vqdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vqdmulhq_lane_v((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vqdmulhq_lane_v((int8x16_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vqdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vqdmulhq_lane_v((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vqdmulh_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vqdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vqdmulh_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vqdmulh_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vqdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vqdmulh_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhs_lane_s32(__p0_652, __p1_652, __p2_652) __extension__ ({ \
-  int32_t __ret_652; \
-  int32_t __s0_652 = __p0_652; \
-  int32x2_t __s1_652 = __p1_652; \
-  __ret_652 = vqdmulhs_s32(__s0_652, vget_lane_s32(__s1_652, __p2_652)); \
-  __ret_652; \
-})
-#else
-#define vqdmulhs_lane_s32(__p0_653, __p1_653, __p2_653) __extension__ ({ \
-  int32_t __ret_653; \
-  int32_t __s0_653 = __p0_653; \
-  int32x2_t __s1_653 = __p1_653; \
-  int32x2_t __rev1_653;  __rev1_653 = __builtin_shufflevector(__s1_653, __s1_653, 1, 0); \
-  __ret_653 = vqdmulhs_s32(__s0_653, __noswap_vget_lane_s32(__rev1_653, __p2_653)); \
-  __ret_653; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhh_lane_s16(__p0_654, __p1_654, __p2_654) __extension__ ({ \
-  int16_t __ret_654; \
-  int16_t __s0_654 = __p0_654; \
-  int16x4_t __s1_654 = __p1_654; \
-  __ret_654 = vqdmulhh_s16(__s0_654, vget_lane_s16(__s1_654, __p2_654)); \
-  __ret_654; \
-})
-#else
-#define vqdmulhh_lane_s16(__p0_655, __p1_655, __p2_655) __extension__ ({ \
-  int16_t __ret_655; \
-  int16_t __s0_655 = __p0_655; \
-  int16x4_t __s1_655 = __p1_655; \
-  int16x4_t __rev1_655;  __rev1_655 = __builtin_shufflevector(__s1_655, __s1_655, 3, 2, 1, 0); \
-  __ret_655 = vqdmulhh_s16(__s0_655, __noswap_vget_lane_s16(__rev1_655, __p2_655)); \
-  __ret_655; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhs_laneq_s32(__p0_656, __p1_656, __p2_656) __extension__ ({ \
-  int32_t __ret_656; \
-  int32_t __s0_656 = __p0_656; \
-  int32x4_t __s1_656 = __p1_656; \
-  __ret_656 = vqdmulhs_s32(__s0_656, vgetq_lane_s32(__s1_656, __p2_656)); \
-  __ret_656; \
-})
-#else
-#define vqdmulhs_laneq_s32(__p0_657, __p1_657, __p2_657) __extension__ ({ \
-  int32_t __ret_657; \
-  int32_t __s0_657 = __p0_657; \
-  int32x4_t __s1_657 = __p1_657; \
-  int32x4_t __rev1_657;  __rev1_657 = __builtin_shufflevector(__s1_657, __s1_657, 3, 2, 1, 0); \
-  __ret_657 = vqdmulhs_s32(__s0_657, __noswap_vgetq_lane_s32(__rev1_657, __p2_657)); \
-  __ret_657; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhh_laneq_s16(__p0_658, __p1_658, __p2_658) __extension__ ({ \
-  int16_t __ret_658; \
-  int16_t __s0_658 = __p0_658; \
-  int16x8_t __s1_658 = __p1_658; \
-  __ret_658 = vqdmulhh_s16(__s0_658, vgetq_lane_s16(__s1_658, __p2_658)); \
-  __ret_658; \
-})
-#else
-#define vqdmulhh_laneq_s16(__p0_659, __p1_659, __p2_659) __extension__ ({ \
-  int16_t __ret_659; \
-  int16_t __s0_659 = __p0_659; \
-  int16x8_t __s1_659 = __p1_659; \
-  int16x8_t __rev1_659;  __rev1_659 = __builtin_shufflevector(__s1_659, __s1_659, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_659 = vqdmulhh_s16(__s0_659, __noswap_vgetq_lane_s16(__rev1_659, __p2_659)); \
-  __ret_659; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vqdmulhq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vqdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vqdmulhq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vqdmulhq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vqdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vqdmulhq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vqdmulh_laneq_v((int8x8_t)__s0, (int8x16_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vqdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vqdmulh_laneq_v((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vqdmulh_laneq_v((int8x8_t)__s0, (int8x16_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vqdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vqdmulh_laneq_v((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) int64_t vqdmulls_s32(int32_t __p0, int32_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqdmulls_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqdmullh_s16(int16_t __p0, int16_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqdmullh_s16(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = vqdmull_s32(vget_high_s32(__p0), vget_high_s32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = vqdmull_s16(vget_high_s16(__p0), vget_high_s16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_high_lane_s32(__p0_660, __p1_660, __p2_660) __extension__ ({ \
-  int64x2_t __ret_660; \
-  int32x4_t __s0_660 = __p0_660; \
-  int32x2_t __s1_660 = __p1_660; \
-  __ret_660 = vqdmull_s32(vget_high_s32(__s0_660), splat_lane_s32(__s1_660, __p2_660)); \
-  __ret_660; \
-})
-#else
-#define vqdmull_high_lane_s32(__p0_661, __p1_661, __p2_661) __extension__ ({ \
-  int64x2_t __ret_661; \
-  int32x4_t __s0_661 = __p0_661; \
-  int32x2_t __s1_661 = __p1_661; \
-  int32x4_t __rev0_661;  __rev0_661 = __builtin_shufflevector(__s0_661, __s0_661, 3, 2, 1, 0); \
-  int32x2_t __rev1_661;  __rev1_661 = __builtin_shufflevector(__s1_661, __s1_661, 1, 0); \
-  __ret_661 = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0_661), __noswap_splat_lane_s32(__rev1_661, __p2_661)); \
-  __ret_661 = __builtin_shufflevector(__ret_661, __ret_661, 1, 0); \
-  __ret_661; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_high_lane_s16(__p0_662, __p1_662, __p2_662) __extension__ ({ \
-  int32x4_t __ret_662; \
-  int16x8_t __s0_662 = __p0_662; \
-  int16x4_t __s1_662 = __p1_662; \
-  __ret_662 = vqdmull_s16(vget_high_s16(__s0_662), splat_lane_s16(__s1_662, __p2_662)); \
-  __ret_662; \
-})
-#else
-#define vqdmull_high_lane_s16(__p0_663, __p1_663, __p2_663) __extension__ ({ \
-  int32x4_t __ret_663; \
-  int16x8_t __s0_663 = __p0_663; \
-  int16x4_t __s1_663 = __p1_663; \
-  int16x8_t __rev0_663;  __rev0_663 = __builtin_shufflevector(__s0_663, __s0_663, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev1_663;  __rev1_663 = __builtin_shufflevector(__s1_663, __s1_663, 3, 2, 1, 0); \
-  __ret_663 = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0_663), __noswap_splat_lane_s16(__rev1_663, __p2_663)); \
-  __ret_663 = __builtin_shufflevector(__ret_663, __ret_663, 3, 2, 1, 0); \
-  __ret_663; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_high_laneq_s32(__p0_664, __p1_664, __p2_664) __extension__ ({ \
-  int64x2_t __ret_664; \
-  int32x4_t __s0_664 = __p0_664; \
-  int32x4_t __s1_664 = __p1_664; \
-  __ret_664 = vqdmull_s32(vget_high_s32(__s0_664), splat_laneq_s32(__s1_664, __p2_664)); \
-  __ret_664; \
-})
-#else
-#define vqdmull_high_laneq_s32(__p0_665, __p1_665, __p2_665) __extension__ ({ \
-  int64x2_t __ret_665; \
-  int32x4_t __s0_665 = __p0_665; \
-  int32x4_t __s1_665 = __p1_665; \
-  int32x4_t __rev0_665;  __rev0_665 = __builtin_shufflevector(__s0_665, __s0_665, 3, 2, 1, 0); \
-  int32x4_t __rev1_665;  __rev1_665 = __builtin_shufflevector(__s1_665, __s1_665, 3, 2, 1, 0); \
-  __ret_665 = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0_665), __noswap_splat_laneq_s32(__rev1_665, __p2_665)); \
-  __ret_665 = __builtin_shufflevector(__ret_665, __ret_665, 1, 0); \
-  __ret_665; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_high_laneq_s16(__p0_666, __p1_666, __p2_666) __extension__ ({ \
-  int32x4_t __ret_666; \
-  int16x8_t __s0_666 = __p0_666; \
-  int16x8_t __s1_666 = __p1_666; \
-  __ret_666 = vqdmull_s16(vget_high_s16(__s0_666), splat_laneq_s16(__s1_666, __p2_666)); \
-  __ret_666; \
-})
-#else
-#define vqdmull_high_laneq_s16(__p0_667, __p1_667, __p2_667) __extension__ ({ \
-  int32x4_t __ret_667; \
-  int16x8_t __s0_667 = __p0_667; \
-  int16x8_t __s1_667 = __p1_667; \
-  int16x8_t __rev0_667;  __rev0_667 = __builtin_shufflevector(__s0_667, __s0_667, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_667;  __rev1_667 = __builtin_shufflevector(__s1_667, __s1_667, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_667 = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0_667), __noswap_splat_laneq_s16(__rev1_667, __p2_667)); \
-  __ret_667 = __builtin_shufflevector(__ret_667, __ret_667, 3, 2, 1, 0); \
-  __ret_667; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  __ret = vqdmull_n_s32(vget_high_s32(__p0), __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqdmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = __noswap_vqdmull_n_s32(__noswap_vget_high_s32(__rev0), __p1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  __ret = vqdmull_n_s16(vget_high_s16(__p0), __p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vqdmull_n_s16(__noswap_vget_high_s16(__rev0), __p1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulls_lane_s32(__p0_668, __p1_668, __p2_668) __extension__ ({ \
-  int64_t __ret_668; \
-  int32_t __s0_668 = __p0_668; \
-  int32x2_t __s1_668 = __p1_668; \
-  __ret_668 = vqdmulls_s32(__s0_668, vget_lane_s32(__s1_668, __p2_668)); \
-  __ret_668; \
-})
-#else
-#define vqdmulls_lane_s32(__p0_669, __p1_669, __p2_669) __extension__ ({ \
-  int64_t __ret_669; \
-  int32_t __s0_669 = __p0_669; \
-  int32x2_t __s1_669 = __p1_669; \
-  int32x2_t __rev1_669;  __rev1_669 = __builtin_shufflevector(__s1_669, __s1_669, 1, 0); \
-  __ret_669 = vqdmulls_s32(__s0_669, __noswap_vget_lane_s32(__rev1_669, __p2_669)); \
-  __ret_669; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmullh_lane_s16(__p0_670, __p1_670, __p2_670) __extension__ ({ \
-  int32_t __ret_670; \
-  int16_t __s0_670 = __p0_670; \
-  int16x4_t __s1_670 = __p1_670; \
-  __ret_670 = vqdmullh_s16(__s0_670, vget_lane_s16(__s1_670, __p2_670)); \
-  __ret_670; \
-})
-#else
-#define vqdmullh_lane_s16(__p0_671, __p1_671, __p2_671) __extension__ ({ \
-  int32_t __ret_671; \
-  int16_t __s0_671 = __p0_671; \
-  int16x4_t __s1_671 = __p1_671; \
-  int16x4_t __rev1_671;  __rev1_671 = __builtin_shufflevector(__s1_671, __s1_671, 3, 2, 1, 0); \
-  __ret_671 = vqdmullh_s16(__s0_671, __noswap_vget_lane_s16(__rev1_671, __p2_671)); \
-  __ret_671; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmulls_laneq_s32(__p0_672, __p1_672, __p2_672) __extension__ ({ \
-  int64_t __ret_672; \
-  int32_t __s0_672 = __p0_672; \
-  int32x4_t __s1_672 = __p1_672; \
-  __ret_672 = vqdmulls_s32(__s0_672, vgetq_lane_s32(__s1_672, __p2_672)); \
-  __ret_672; \
-})
-#else
-#define vqdmulls_laneq_s32(__p0_673, __p1_673, __p2_673) __extension__ ({ \
-  int64_t __ret_673; \
-  int32_t __s0_673 = __p0_673; \
-  int32x4_t __s1_673 = __p1_673; \
-  int32x4_t __rev1_673;  __rev1_673 = __builtin_shufflevector(__s1_673, __s1_673, 3, 2, 1, 0); \
-  __ret_673 = vqdmulls_s32(__s0_673, __noswap_vgetq_lane_s32(__rev1_673, __p2_673)); \
-  __ret_673; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmullh_laneq_s16(__p0_674, __p1_674, __p2_674) __extension__ ({ \
-  int32_t __ret_674; \
-  int16_t __s0_674 = __p0_674; \
-  int16x8_t __s1_674 = __p1_674; \
-  __ret_674 = vqdmullh_s16(__s0_674, vgetq_lane_s16(__s1_674, __p2_674)); \
-  __ret_674; \
-})
-#else
-#define vqdmullh_laneq_s16(__p0_675, __p1_675, __p2_675) __extension__ ({ \
-  int32_t __ret_675; \
-  int16_t __s0_675 = __p0_675; \
-  int16x8_t __s1_675 = __p1_675; \
-  int16x8_t __rev1_675;  __rev1_675 = __builtin_shufflevector(__s1_675, __s1_675, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_675 = vqdmullh_s16(__s0_675, __noswap_vgetq_lane_s16(__rev1_675, __p2_675)); \
-  __ret_675; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_laneq_s32(__p0_676, __p1_676, __p2_676) __extension__ ({ \
-  int64x2_t __ret_676; \
-  int32x2_t __s0_676 = __p0_676; \
-  int32x4_t __s1_676 = __p1_676; \
-  __ret_676 = vqdmull_s32(__s0_676, splat_laneq_s32(__s1_676, __p2_676)); \
-  __ret_676; \
-})
-#else
-#define vqdmull_laneq_s32(__p0_677, __p1_677, __p2_677) __extension__ ({ \
-  int64x2_t __ret_677; \
-  int32x2_t __s0_677 = __p0_677; \
-  int32x4_t __s1_677 = __p1_677; \
-  int32x2_t __rev0_677;  __rev0_677 = __builtin_shufflevector(__s0_677, __s0_677, 1, 0); \
-  int32x4_t __rev1_677;  __rev1_677 = __builtin_shufflevector(__s1_677, __s1_677, 3, 2, 1, 0); \
-  __ret_677 = __noswap_vqdmull_s32(__rev0_677, __noswap_splat_laneq_s32(__rev1_677, __p2_677)); \
-  __ret_677 = __builtin_shufflevector(__ret_677, __ret_677, 1, 0); \
-  __ret_677; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqdmull_laneq_s16(__p0_678, __p1_678, __p2_678) __extension__ ({ \
-  int32x4_t __ret_678; \
-  int16x4_t __s0_678 = __p0_678; \
-  int16x8_t __s1_678 = __p1_678; \
-  __ret_678 = vqdmull_s16(__s0_678, splat_laneq_s16(__s1_678, __p2_678)); \
-  __ret_678; \
-})
-#else
-#define vqdmull_laneq_s16(__p0_679, __p1_679, __p2_679) __extension__ ({ \
-  int32x4_t __ret_679; \
-  int16x4_t __s0_679 = __p0_679; \
-  int16x8_t __s1_679 = __p1_679; \
-  int16x4_t __rev0_679;  __rev0_679 = __builtin_shufflevector(__s0_679, __s0_679, 3, 2, 1, 0); \
-  int16x8_t __rev1_679;  __rev1_679 = __builtin_shufflevector(__s1_679, __s1_679, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_679 = __noswap_vqdmull_s16(__rev0_679, __noswap_splat_laneq_s16(__rev1_679, __p2_679)); \
-  __ret_679 = __builtin_shufflevector(__ret_679, __ret_679, 3, 2, 1, 0); \
-  __ret_679; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) int16_t vqmovns_s32(int32_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqmovns_s32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqmovnd_s64(int64_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqmovnd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vqmovnh_s16(int16_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqmovnh_s16(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16_t vqmovns_u32(uint32_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqmovns_u32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vqmovnd_u64(uint64_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqmovnd_u64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8_t vqmovnh_u16(uint16_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqmovnh_u16(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vqmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
-  uint16x8_t __ret;
-  __ret = vcombine_u16(__p0, vqmovn_u32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vqmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u16(__rev0, __noswap_vqmovn_u32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vqmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
-  uint32x4_t __ret;
-  __ret = vcombine_u32(__p0, vqmovn_u64(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vqmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vcombine_u32(__rev0, __noswap_vqmovn_u64(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
-  uint8x16_t __ret;
-  __ret = vcombine_u8(__p0, vqmovn_u16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u8(__rev0, __noswap_vqmovn_u16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vqmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
-  int16x8_t __ret;
-  __ret = vcombine_s16(__p0, vqmovn_s32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vqmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
-  int16x8_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s16(__rev0, __noswap_vqmovn_s32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vqmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
-  int32x4_t __ret;
-  __ret = vcombine_s32(__p0, vqmovn_s64(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vqmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
-  int32x4_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vcombine_s32(__rev0, __noswap_vqmovn_s64(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
-  int8x16_t __ret;
-  __ret = vcombine_s8(__p0, vqmovn_s16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
-  int8x16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s8(__rev0, __noswap_vqmovn_s16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint16_t vqmovuns_s32(int32_t __p0) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqmovuns_s32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vqmovund_s64(int64_t __p0) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqmovund_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8_t vqmovunh_s16(int16_t __p0) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqmovunh_s16(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) {
-  uint16x8_t __ret;
-  __ret = vcombine_u16((uint16x4_t)(__p0), vqmovun_s32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u16((uint16x4_t)(__rev0), __noswap_vqmovun_s32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) {
-  uint32x4_t __ret;
-  __ret = vcombine_u32((uint32x2_t)(__p0), vqmovun_s64(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vcombine_u32((uint32x2_t)(__rev0), __noswap_vqmovun_s64(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) {
-  uint8x16_t __ret;
-  __ret = vcombine_u8((uint8x8_t)(__p0), vqmovun_s16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u8((uint8x8_t)(__rev0), __noswap_vqmovun_s16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vqnegq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vqnegq_s64(int64x2_t __p0) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vqneg_s64(int64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 3);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vqnegb_s8(int8_t __p0) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqnegb_s8(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqnegs_s32(int32_t __p0) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqnegs_s32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vqnegd_s64(int64_t __p0) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqnegd_s64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqnegh_s16(int16_t __p0) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqnegh_s16(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqrdmulhs_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqrdmulhs_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqrdmulhh_s16(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_lane_v((int8x16_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vqrdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_lane_v((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_lane_v((int8x16_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vqrdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_lane_v((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vqrdmulh_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vqrdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x2_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vqrdmulh_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vqrdmulh_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vqrdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x4_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vqrdmulh_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhs_lane_s32(__p0_680, __p1_680, __p2_680) __extension__ ({ \
-  int32_t __ret_680; \
-  int32_t __s0_680 = __p0_680; \
-  int32x2_t __s1_680 = __p1_680; \
-  __ret_680 = vqrdmulhs_s32(__s0_680, vget_lane_s32(__s1_680, __p2_680)); \
-  __ret_680; \
-})
-#else
-#define vqrdmulhs_lane_s32(__p0_681, __p1_681, __p2_681) __extension__ ({ \
-  int32_t __ret_681; \
-  int32_t __s0_681 = __p0_681; \
-  int32x2_t __s1_681 = __p1_681; \
-  int32x2_t __rev1_681;  __rev1_681 = __builtin_shufflevector(__s1_681, __s1_681, 1, 0); \
-  __ret_681 = vqrdmulhs_s32(__s0_681, __noswap_vget_lane_s32(__rev1_681, __p2_681)); \
-  __ret_681; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhh_lane_s16(__p0_682, __p1_682, __p2_682) __extension__ ({ \
-  int16_t __ret_682; \
-  int16_t __s0_682 = __p0_682; \
-  int16x4_t __s1_682 = __p1_682; \
-  __ret_682 = vqrdmulhh_s16(__s0_682, vget_lane_s16(__s1_682, __p2_682)); \
-  __ret_682; \
-})
-#else
-#define vqrdmulhh_lane_s16(__p0_683, __p1_683, __p2_683) __extension__ ({ \
-  int16_t __ret_683; \
-  int16_t __s0_683 = __p0_683; \
-  int16x4_t __s1_683 = __p1_683; \
-  int16x4_t __rev1_683;  __rev1_683 = __builtin_shufflevector(__s1_683, __s1_683, 3, 2, 1, 0); \
-  __ret_683 = vqrdmulhh_s16(__s0_683, __noswap_vget_lane_s16(__rev1_683, __p2_683)); \
-  __ret_683; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhs_laneq_s32(__p0_684, __p1_684, __p2_684) __extension__ ({ \
-  int32_t __ret_684; \
-  int32_t __s0_684 = __p0_684; \
-  int32x4_t __s1_684 = __p1_684; \
-  __ret_684 = vqrdmulhs_s32(__s0_684, vgetq_lane_s32(__s1_684, __p2_684)); \
-  __ret_684; \
-})
-#else
-#define vqrdmulhs_laneq_s32(__p0_685, __p1_685, __p2_685) __extension__ ({ \
-  int32_t __ret_685; \
-  int32_t __s0_685 = __p0_685; \
-  int32x4_t __s1_685 = __p1_685; \
-  int32x4_t __rev1_685;  __rev1_685 = __builtin_shufflevector(__s1_685, __s1_685, 3, 2, 1, 0); \
-  __ret_685 = vqrdmulhs_s32(__s0_685, __noswap_vgetq_lane_s32(__rev1_685, __p2_685)); \
-  __ret_685; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhh_laneq_s16(__p0_686, __p1_686, __p2_686) __extension__ ({ \
-  int16_t __ret_686; \
-  int16_t __s0_686 = __p0_686; \
-  int16x8_t __s1_686 = __p1_686; \
-  __ret_686 = vqrdmulhh_s16(__s0_686, vgetq_lane_s16(__s1_686, __p2_686)); \
-  __ret_686; \
-})
-#else
-#define vqrdmulhh_laneq_s16(__p0_687, __p1_687, __p2_687) __extension__ ({ \
-  int16_t __ret_687; \
-  int16_t __s0_687 = __p0_687; \
-  int16x8_t __s1_687 = __p1_687; \
-  int16x8_t __rev1_687;  __rev1_687 = __builtin_shufflevector(__s1_687, __s1_687, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_687 = vqrdmulhh_s16(__s0_687, __noswap_vgetq_lane_s16(__rev1_687, __p2_687)); \
-  __ret_687; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
-  __ret; \
-})
-#else
-#define vqrdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x4_t __ret; \
-  int32x4_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
-  __ret; \
-})
-#else
-#define vqrdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x8_t __ret; \
-  int16x8_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  __ret = (int32x2_t) __builtin_neon_vqrdmulh_laneq_v((int8x8_t)__s0, (int8x16_t)__s1, __p2, 2); \
-  __ret; \
-})
-#else
-#define vqrdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
-  int32x2_t __ret; \
-  int32x2_t __s0 = __p0; \
-  int32x4_t __s1 = __p1; \
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  __ret = (int32x2_t) __builtin_neon_vqrdmulh_laneq_v((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  __ret = (int16x4_t) __builtin_neon_vqrdmulh_laneq_v((int8x8_t)__s0, (int8x16_t)__s1, __p2, 1); \
-  __ret; \
-})
-#else
-#define vqrdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
-  int16x4_t __ret; \
-  int16x4_t __s0 = __p0; \
-  int16x8_t __s1 = __p1; \
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret = (int16x4_t) __builtin_neon_vqrdmulh_laneq_v((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 1); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) uint8_t vqrshlb_u8(uint8_t __p0, int8_t __p1) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vqrshls_u32(uint32_t __p0, int32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vqrshld_u64(uint64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16_t vqrshlh_u16(uint16_t __p0, int16_t __p1) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vqrshlb_s8(int8_t __p0, int8_t __p1) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqrshlb_s8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqrshls_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqrshls_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vqrshld_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqrshld_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqrshlh_s16(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_u32(__p0_688, __p1_688, __p2_688) __extension__ ({ \
-  uint16x8_t __ret_688; \
-  uint16x4_t __s0_688 = __p0_688; \
-  uint32x4_t __s1_688 = __p1_688; \
-  __ret_688 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_688), (uint16x4_t)(vqrshrn_n_u32(__s1_688, __p2_688)))); \
-  __ret_688; \
-})
-#else
-#define vqrshrn_high_n_u32(__p0_689, __p1_689, __p2_689) __extension__ ({ \
-  uint16x8_t __ret_689; \
-  uint16x4_t __s0_689 = __p0_689; \
-  uint32x4_t __s1_689 = __p1_689; \
-  uint16x4_t __rev0_689;  __rev0_689 = __builtin_shufflevector(__s0_689, __s0_689, 3, 2, 1, 0); \
-  uint32x4_t __rev1_689;  __rev1_689 = __builtin_shufflevector(__s1_689, __s1_689, 3, 2, 1, 0); \
-  __ret_689 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_689), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_689, __p2_689)))); \
-  __ret_689 = __builtin_shufflevector(__ret_689, __ret_689, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_689; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_u64(__p0_690, __p1_690, __p2_690) __extension__ ({ \
-  uint32x4_t __ret_690; \
-  uint32x2_t __s0_690 = __p0_690; \
-  uint64x2_t __s1_690 = __p1_690; \
-  __ret_690 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_690), (uint32x2_t)(vqrshrn_n_u64(__s1_690, __p2_690)))); \
-  __ret_690; \
-})
-#else
-#define vqrshrn_high_n_u64(__p0_691, __p1_691, __p2_691) __extension__ ({ \
-  uint32x4_t __ret_691; \
-  uint32x2_t __s0_691 = __p0_691; \
-  uint64x2_t __s1_691 = __p1_691; \
-  uint32x2_t __rev0_691;  __rev0_691 = __builtin_shufflevector(__s0_691, __s0_691, 1, 0); \
-  uint64x2_t __rev1_691;  __rev1_691 = __builtin_shufflevector(__s1_691, __s1_691, 1, 0); \
-  __ret_691 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_691), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_691, __p2_691)))); \
-  __ret_691 = __builtin_shufflevector(__ret_691, __ret_691, 3, 2, 1, 0); \
-  __ret_691; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_u16(__p0_692, __p1_692, __p2_692) __extension__ ({ \
-  uint8x16_t __ret_692; \
-  uint8x8_t __s0_692 = __p0_692; \
-  uint16x8_t __s1_692 = __p1_692; \
-  __ret_692 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_692), (uint8x8_t)(vqrshrn_n_u16(__s1_692, __p2_692)))); \
-  __ret_692; \
-})
-#else
-#define vqrshrn_high_n_u16(__p0_693, __p1_693, __p2_693) __extension__ ({ \
-  uint8x16_t __ret_693; \
-  uint8x8_t __s0_693 = __p0_693; \
-  uint16x8_t __s1_693 = __p1_693; \
-  uint8x8_t __rev0_693;  __rev0_693 = __builtin_shufflevector(__s0_693, __s0_693, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_693;  __rev1_693 = __builtin_shufflevector(__s1_693, __s1_693, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_693 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_693), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_693, __p2_693)))); \
-  __ret_693 = __builtin_shufflevector(__ret_693, __ret_693, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_693; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_s32(__p0_694, __p1_694, __p2_694) __extension__ ({ \
-  int16x8_t __ret_694; \
-  int16x4_t __s0_694 = __p0_694; \
-  int32x4_t __s1_694 = __p1_694; \
-  __ret_694 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_694), (int16x4_t)(vqrshrn_n_s32(__s1_694, __p2_694)))); \
-  __ret_694; \
-})
-#else
-#define vqrshrn_high_n_s32(__p0_695, __p1_695, __p2_695) __extension__ ({ \
-  int16x8_t __ret_695; \
-  int16x4_t __s0_695 = __p0_695; \
-  int32x4_t __s1_695 = __p1_695; \
-  int16x4_t __rev0_695;  __rev0_695 = __builtin_shufflevector(__s0_695, __s0_695, 3, 2, 1, 0); \
-  int32x4_t __rev1_695;  __rev1_695 = __builtin_shufflevector(__s1_695, __s1_695, 3, 2, 1, 0); \
-  __ret_695 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_695), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_695, __p2_695)))); \
-  __ret_695 = __builtin_shufflevector(__ret_695, __ret_695, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_695; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_s64(__p0_696, __p1_696, __p2_696) __extension__ ({ \
-  int32x4_t __ret_696; \
-  int32x2_t __s0_696 = __p0_696; \
-  int64x2_t __s1_696 = __p1_696; \
-  __ret_696 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_696), (int32x2_t)(vqrshrn_n_s64(__s1_696, __p2_696)))); \
-  __ret_696; \
-})
-#else
-#define vqrshrn_high_n_s64(__p0_697, __p1_697, __p2_697) __extension__ ({ \
-  int32x4_t __ret_697; \
-  int32x2_t __s0_697 = __p0_697; \
-  int64x2_t __s1_697 = __p1_697; \
-  int32x2_t __rev0_697;  __rev0_697 = __builtin_shufflevector(__s0_697, __s0_697, 1, 0); \
-  int64x2_t __rev1_697;  __rev1_697 = __builtin_shufflevector(__s1_697, __s1_697, 1, 0); \
-  __ret_697 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_697), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_697, __p2_697)))); \
-  __ret_697 = __builtin_shufflevector(__ret_697, __ret_697, 3, 2, 1, 0); \
-  __ret_697; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrn_high_n_s16(__p0_698, __p1_698, __p2_698) __extension__ ({ \
-  int8x16_t __ret_698; \
-  int8x8_t __s0_698 = __p0_698; \
-  int16x8_t __s1_698 = __p1_698; \
-  __ret_698 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_698), (int8x8_t)(vqrshrn_n_s16(__s1_698, __p2_698)))); \
-  __ret_698; \
-})
-#else
-#define vqrshrn_high_n_s16(__p0_699, __p1_699, __p2_699) __extension__ ({ \
-  int8x16_t __ret_699; \
-  int8x8_t __s0_699 = __p0_699; \
-  int16x8_t __s1_699 = __p1_699; \
-  int8x8_t __rev0_699;  __rev0_699 = __builtin_shufflevector(__s0_699, __s0_699, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_699;  __rev1_699 = __builtin_shufflevector(__s1_699, __s1_699, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_699 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_699), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_699, __p2_699)))); \
-  __ret_699 = __builtin_shufflevector(__ret_699, __ret_699, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_699; \
-})
-#endif
-
-#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint32_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
-  __ret; \
-})
-#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint16_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrun_high_n_s32(__p0_700, __p1_700, __p2_700) __extension__ ({ \
-  int16x8_t __ret_700; \
-  int16x4_t __s0_700 = __p0_700; \
-  int32x4_t __s1_700 = __p1_700; \
-  __ret_700 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_700), (int16x4_t)(vqrshrun_n_s32(__s1_700, __p2_700)))); \
-  __ret_700; \
-})
-#else
-#define vqrshrun_high_n_s32(__p0_701, __p1_701, __p2_701) __extension__ ({ \
-  int16x8_t __ret_701; \
-  int16x4_t __s0_701 = __p0_701; \
-  int32x4_t __s1_701 = __p1_701; \
-  int16x4_t __rev0_701;  __rev0_701 = __builtin_shufflevector(__s0_701, __s0_701, 3, 2, 1, 0); \
-  int32x4_t __rev1_701;  __rev1_701 = __builtin_shufflevector(__s1_701, __s1_701, 3, 2, 1, 0); \
-  __ret_701 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_701), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_701, __p2_701)))); \
-  __ret_701 = __builtin_shufflevector(__ret_701, __ret_701, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_701; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrun_high_n_s64(__p0_702, __p1_702, __p2_702) __extension__ ({ \
-  int32x4_t __ret_702; \
-  int32x2_t __s0_702 = __p0_702; \
-  int64x2_t __s1_702 = __p1_702; \
-  __ret_702 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_702), (int32x2_t)(vqrshrun_n_s64(__s1_702, __p2_702)))); \
-  __ret_702; \
-})
-#else
-#define vqrshrun_high_n_s64(__p0_703, __p1_703, __p2_703) __extension__ ({ \
-  int32x4_t __ret_703; \
-  int32x2_t __s0_703 = __p0_703; \
-  int64x2_t __s1_703 = __p1_703; \
-  int32x2_t __rev0_703;  __rev0_703 = __builtin_shufflevector(__s0_703, __s0_703, 1, 0); \
-  int64x2_t __rev1_703;  __rev1_703 = __builtin_shufflevector(__s1_703, __s1_703, 1, 0); \
-  __ret_703 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_703), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_703, __p2_703)))); \
-  __ret_703 = __builtin_shufflevector(__ret_703, __ret_703, 3, 2, 1, 0); \
-  __ret_703; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrshrun_high_n_s16(__p0_704, __p1_704, __p2_704) __extension__ ({ \
-  int8x16_t __ret_704; \
-  int8x8_t __s0_704 = __p0_704; \
-  int16x8_t __s1_704 = __p1_704; \
-  __ret_704 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_704), (int8x8_t)(vqrshrun_n_s16(__s1_704, __p2_704)))); \
-  __ret_704; \
-})
-#else
-#define vqrshrun_high_n_s16(__p0_705, __p1_705, __p2_705) __extension__ ({ \
-  int8x16_t __ret_705; \
-  int8x8_t __s0_705 = __p0_705; \
-  int16x8_t __s1_705 = __p1_705; \
-  int8x8_t __rev0_705;  __rev0_705 = __builtin_shufflevector(__s0_705, __s0_705, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_705;  __rev1_705 = __builtin_shufflevector(__s1_705, __s1_705, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_705 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_705), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_705, __p2_705)))); \
-  __ret_705 = __builtin_shufflevector(__ret_705, __ret_705, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_705; \
-})
-#endif
-
-#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
-  __ret; \
-})
-#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
-  __ret; \
-})
-__ai __attribute__((target("neon"))) uint8_t vqshlb_u8(uint8_t __p0, int8_t __p1) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vqshls_u32(uint32_t __p0, int32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vqshld_u64(uint64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16_t vqshlh_u16(uint16_t __p0, int16_t __p1) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
-  return __ret;
-}
-#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint8_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
-  __ret; \
-})
-#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint32_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
-  __ret; \
-})
-#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
-  __ret; \
-})
-#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint16_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
-  __ret; \
-})
-#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
-  __ret; \
-})
-#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
-  __ret; \
-})
-#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int8_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
-  __ret; \
-})
-#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
-  __ret; \
-})
-#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
-  __ret; \
-})
-#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_u32(__p0_706, __p1_706, __p2_706) __extension__ ({ \
-  uint16x8_t __ret_706; \
-  uint16x4_t __s0_706 = __p0_706; \
-  uint32x4_t __s1_706 = __p1_706; \
-  __ret_706 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_706), (uint16x4_t)(vqshrn_n_u32(__s1_706, __p2_706)))); \
-  __ret_706; \
-})
-#else
-#define vqshrn_high_n_u32(__p0_707, __p1_707, __p2_707) __extension__ ({ \
-  uint16x8_t __ret_707; \
-  uint16x4_t __s0_707 = __p0_707; \
-  uint32x4_t __s1_707 = __p1_707; \
-  uint16x4_t __rev0_707;  __rev0_707 = __builtin_shufflevector(__s0_707, __s0_707, 3, 2, 1, 0); \
-  uint32x4_t __rev1_707;  __rev1_707 = __builtin_shufflevector(__s1_707, __s1_707, 3, 2, 1, 0); \
-  __ret_707 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_707), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_707, __p2_707)))); \
-  __ret_707 = __builtin_shufflevector(__ret_707, __ret_707, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_707; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_u64(__p0_708, __p1_708, __p2_708) __extension__ ({ \
-  uint32x4_t __ret_708; \
-  uint32x2_t __s0_708 = __p0_708; \
-  uint64x2_t __s1_708 = __p1_708; \
-  __ret_708 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_708), (uint32x2_t)(vqshrn_n_u64(__s1_708, __p2_708)))); \
-  __ret_708; \
-})
-#else
-#define vqshrn_high_n_u64(__p0_709, __p1_709, __p2_709) __extension__ ({ \
-  uint32x4_t __ret_709; \
-  uint32x2_t __s0_709 = __p0_709; \
-  uint64x2_t __s1_709 = __p1_709; \
-  uint32x2_t __rev0_709;  __rev0_709 = __builtin_shufflevector(__s0_709, __s0_709, 1, 0); \
-  uint64x2_t __rev1_709;  __rev1_709 = __builtin_shufflevector(__s1_709, __s1_709, 1, 0); \
-  __ret_709 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_709), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_709, __p2_709)))); \
-  __ret_709 = __builtin_shufflevector(__ret_709, __ret_709, 3, 2, 1, 0); \
-  __ret_709; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_u16(__p0_710, __p1_710, __p2_710) __extension__ ({ \
-  uint8x16_t __ret_710; \
-  uint8x8_t __s0_710 = __p0_710; \
-  uint16x8_t __s1_710 = __p1_710; \
-  __ret_710 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_710), (uint8x8_t)(vqshrn_n_u16(__s1_710, __p2_710)))); \
-  __ret_710; \
-})
-#else
-#define vqshrn_high_n_u16(__p0_711, __p1_711, __p2_711) __extension__ ({ \
-  uint8x16_t __ret_711; \
-  uint8x8_t __s0_711 = __p0_711; \
-  uint16x8_t __s1_711 = __p1_711; \
-  uint8x8_t __rev0_711;  __rev0_711 = __builtin_shufflevector(__s0_711, __s0_711, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_711;  __rev1_711 = __builtin_shufflevector(__s1_711, __s1_711, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_711 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_711), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_711, __p2_711)))); \
-  __ret_711 = __builtin_shufflevector(__ret_711, __ret_711, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_711; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_s32(__p0_712, __p1_712, __p2_712) __extension__ ({ \
-  int16x8_t __ret_712; \
-  int16x4_t __s0_712 = __p0_712; \
-  int32x4_t __s1_712 = __p1_712; \
-  __ret_712 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_712), (int16x4_t)(vqshrn_n_s32(__s1_712, __p2_712)))); \
-  __ret_712; \
-})
-#else
-#define vqshrn_high_n_s32(__p0_713, __p1_713, __p2_713) __extension__ ({ \
-  int16x8_t __ret_713; \
-  int16x4_t __s0_713 = __p0_713; \
-  int32x4_t __s1_713 = __p1_713; \
-  int16x4_t __rev0_713;  __rev0_713 = __builtin_shufflevector(__s0_713, __s0_713, 3, 2, 1, 0); \
-  int32x4_t __rev1_713;  __rev1_713 = __builtin_shufflevector(__s1_713, __s1_713, 3, 2, 1, 0); \
-  __ret_713 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_713), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_713, __p2_713)))); \
-  __ret_713 = __builtin_shufflevector(__ret_713, __ret_713, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_713; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_s64(__p0_714, __p1_714, __p2_714) __extension__ ({ \
-  int32x4_t __ret_714; \
-  int32x2_t __s0_714 = __p0_714; \
-  int64x2_t __s1_714 = __p1_714; \
-  __ret_714 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_714), (int32x2_t)(vqshrn_n_s64(__s1_714, __p2_714)))); \
-  __ret_714; \
-})
-#else
-#define vqshrn_high_n_s64(__p0_715, __p1_715, __p2_715) __extension__ ({ \
-  int32x4_t __ret_715; \
-  int32x2_t __s0_715 = __p0_715; \
-  int64x2_t __s1_715 = __p1_715; \
-  int32x2_t __rev0_715;  __rev0_715 = __builtin_shufflevector(__s0_715, __s0_715, 1, 0); \
-  int64x2_t __rev1_715;  __rev1_715 = __builtin_shufflevector(__s1_715, __s1_715, 1, 0); \
-  __ret_715 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_715), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_715, __p2_715)))); \
-  __ret_715 = __builtin_shufflevector(__ret_715, __ret_715, 3, 2, 1, 0); \
-  __ret_715; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrn_high_n_s16(__p0_716, __p1_716, __p2_716) __extension__ ({ \
-  int8x16_t __ret_716; \
-  int8x8_t __s0_716 = __p0_716; \
-  int16x8_t __s1_716 = __p1_716; \
-  __ret_716 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_716), (int8x8_t)(vqshrn_n_s16(__s1_716, __p2_716)))); \
-  __ret_716; \
-})
-#else
-#define vqshrn_high_n_s16(__p0_717, __p1_717, __p2_717) __extension__ ({ \
-  int8x16_t __ret_717; \
-  int8x8_t __s0_717 = __p0_717; \
-  int16x8_t __s1_717 = __p1_717; \
-  int8x8_t __rev0_717;  __rev0_717 = __builtin_shufflevector(__s0_717, __s0_717, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_717;  __rev1_717 = __builtin_shufflevector(__s1_717, __s1_717, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_717 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_717), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_717, __p2_717)))); \
-  __ret_717 = __builtin_shufflevector(__ret_717, __ret_717, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_717; \
-})
-#endif
-
-#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  uint32_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
-  __ret; \
-})
-#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  uint16_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
-  __ret; \
-})
-#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
-  int16_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
-  int32_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
-  int8_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vqshrun_high_n_s32(__p0_718, __p1_718, __p2_718) __extension__ ({ \
-  int16x8_t __ret_718; \
-  int16x4_t __s0_718 = __p0_718; \
-  int32x4_t __s1_718 = __p1_718; \
-  __ret_718 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_718), (int16x4_t)(vqshrun_n_s32(__s1_718, __p2_718)))); \
-  __ret_718; \
-})
-#else
-#define vqshrun_high_n_s32(__p0_719, __p1_719, __p2_719) __extension__ ({ \
-  int16x8_t __ret_719; \
-  int16x4_t __s0_719 = __p0_719; \
-  int32x4_t __s1_719 = __p1_719; \
-  int16x4_t __rev0_719;  __rev0_719 = __builtin_shufflevector(__s0_719, __s0_719, 3, 2, 1, 0); \
-  int32x4_t __rev1_719;  __rev1_719 = __builtin_shufflevector(__s1_719, __s1_719, 3, 2, 1, 0); \
-  __ret_719 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_719), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_719, __p2_719)))); \
-  __ret_719 = __builtin_shufflevector(__ret_719, __ret_719, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_719; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrun_high_n_s64(__p0_720, __p1_720, __p2_720) __extension__ ({ \
-  int32x4_t __ret_720; \
-  int32x2_t __s0_720 = __p0_720; \
-  int64x2_t __s1_720 = __p1_720; \
-  __ret_720 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_720), (int32x2_t)(vqshrun_n_s64(__s1_720, __p2_720)))); \
-  __ret_720; \
-})
-#else
-#define vqshrun_high_n_s64(__p0_721, __p1_721, __p2_721) __extension__ ({ \
-  int32x4_t __ret_721; \
-  int32x2_t __s0_721 = __p0_721; \
-  int64x2_t __s1_721 = __p1_721; \
-  int32x2_t __rev0_721;  __rev0_721 = __builtin_shufflevector(__s0_721, __s0_721, 1, 0); \
-  int64x2_t __rev1_721;  __rev1_721 = __builtin_shufflevector(__s1_721, __s1_721, 1, 0); \
-  __ret_721 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_721), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_721, __p2_721)))); \
-  __ret_721 = __builtin_shufflevector(__ret_721, __ret_721, 3, 2, 1, 0); \
-  __ret_721; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqshrun_high_n_s16(__p0_722, __p1_722, __p2_722) __extension__ ({ \
-  int8x16_t __ret_722; \
-  int8x8_t __s0_722 = __p0_722; \
-  int16x8_t __s1_722 = __p1_722; \
-  __ret_722 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_722), (int8x8_t)(vqshrun_n_s16(__s1_722, __p2_722)))); \
-  __ret_722; \
-})
-#else
-#define vqshrun_high_n_s16(__p0_723, __p1_723, __p2_723) __extension__ ({ \
-  int8x16_t __ret_723; \
-  int8x8_t __s0_723 = __p0_723; \
-  int16x8_t __s1_723 = __p1_723; \
-  int8x8_t __rev0_723;  __rev0_723 = __builtin_shufflevector(__s0_723, __s0_723, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_723;  __rev1_723 = __builtin_shufflevector(__s1_723, __s1_723, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_723 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_723), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_723, __p2_723)))); \
-  __ret_723 = __builtin_shufflevector(__ret_723, __ret_723, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_723; \
-})
-#endif
-
-#define vqshruns_n_s32(__p0, __p1) __extension__ ({ \
-  uint16_t __ret; \
-  int32_t __s0 = __p0; \
-  __ret = (uint16_t) __builtin_neon_vqshruns_n_s32(__s0, __p1); \
-  __ret; \
-})
-#define vqshrund_n_s64(__p0, __p1) __extension__ ({ \
-  uint32_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (uint32_t) __builtin_neon_vqshrund_n_s64(__s0, __p1); \
-  __ret; \
-})
-#define vqshrunh_n_s16(__p0, __p1) __extension__ ({ \
-  uint8_t __ret; \
-  int16_t __s0 = __p0; \
-  __ret = (uint8_t) __builtin_neon_vqshrunh_n_s16(__s0, __p1); \
-  __ret; \
-})
-__ai __attribute__((target("neon"))) uint8_t vqsubb_u8(uint8_t __p0, uint8_t __p1) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vqsubb_u8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vqsubs_u32(uint32_t __p0, uint32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vqsubs_u32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vqsubd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vqsubd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16_t vqsubh_u16(uint16_t __p0, uint16_t __p1) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vqsubh_u16(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vqsubb_s8(int8_t __p0, int8_t __p1) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vqsubb_s8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vqsubs_s32(int32_t __p0, int32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqsubs_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vqsubd_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vqsubd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vqsubh_s16(int16_t __p0, int16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqsubh_s16(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl1_p8(poly8x16_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl1_p8(poly8x16_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl1q_p8(poly8x16_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl1q_p8(poly8x16_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl2_p8(poly8x16x2_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl2_p8(poly8x16x2_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x16x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl2q_p8(poly8x16x2_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl2q_p8(poly8x16x2_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x16x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  int8x16x2_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl3_p8(poly8x16x3_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl3_p8(poly8x16x3_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x16x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl3q_p8(poly8x16x3_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl3q_p8(poly8x16x3_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbl3q_s8(int8x16x3_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbl3q_s8(int8x16x3_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x16x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  int8x16x3_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl4_p8(poly8x16x4_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbl4_p8(poly8x16x4_t __p0, uint8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x16x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl4q_p8(poly8x16x4_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbl4q_p8(poly8x16x4_t __p0, uint8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x16x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  int8x16x4_t __rev0;
-  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx1_p8(poly8x8_t __p0, poly8x16_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx1_p8(poly8x8_t __p0, poly8x16_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx1q_p8(poly8x16_t __p0, poly8x16_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx1q_p8(poly8x16_t __p0, poly8x16_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx2_p8(poly8x8_t __p0, poly8x16x2_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx2_p8(poly8x8_t __p0, poly8x16x2_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx2q_p8(poly8x16_t __p0, poly8x16x2_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx2q_p8(poly8x16_t __p0, poly8x16x2_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16x2_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx3_p8(poly8x8_t __p0, poly8x16x3_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx3_p8(poly8x8_t __p0, poly8x16x3_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx3q_p8(poly8x16_t __p0, poly8x16x3_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx3q_p8(poly8x16_t __p0, poly8x16x3_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16x3_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx4_p8(poly8x8_t __p0, poly8x16x4_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vqtbx4_p8(poly8x8_t __p0, poly8x16x4_t __p1, uint8x8_t __p2) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx4q_p8(poly8x16_t __p0, poly8x16x4_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vqtbx4q_p8(poly8x16_t __p0, poly8x16x4_t __p1, uint8x16_t __p2) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16x4_t __rev1;
-  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vraddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  __ret = vcombine_u16(__p0, vraddhn_u32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vraddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u16(__rev0, __noswap_vraddhn_u32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vraddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  __ret = vcombine_u32(__p0, vraddhn_u64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vraddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_u32(__rev0, __noswap_vraddhn_u64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vraddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  __ret = vcombine_u8(__p0, vraddhn_u16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vraddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u8(__rev0, __noswap_vraddhn_u16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vraddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  __ret = vcombine_s16(__p0, vraddhn_s32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vraddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s16(__rev0, __noswap_vraddhn_s32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vraddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  __ret = vcombine_s32(__p0, vraddhn_s64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vraddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_s32(__rev0, __noswap_vraddhn_s64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vraddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  __ret = vcombine_s8(__p0, vraddhn_s16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vraddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s8(__rev0, __noswap_vraddhn_s16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vrbit_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 4);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vrbit_p8(poly8x8_t __p0) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 4);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vrbitq_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 36);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vrbitq_p8(poly8x16_t __p0) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (poly8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 36);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vrbitq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vrbitq_u8(uint8x16_t __p0) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vrbitq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vrbitq_s8(int8x16_t __p0) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vrbit_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vrbit_u8(uint8x8_t __p0) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vrbit_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vrbit_s8(int8x8_t __p0) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrecpeq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrecpeq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrecpe_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64_t vrecped_f64(float64_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vrecped_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vrecpes_f32(float32_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vrecpes_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrecpsq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrecpsq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrecps_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64_t vrecpsd_f64(float64_t __p0, float64_t __p1) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vrecpsd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vrecpss_f32(float32_t __p0, float32_t __p1) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vrecpss_f32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64_t vrecpxd_f64(float64_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vrecpxd_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vrecpxs_f32(float32_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_p64(poly64x1_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_f64(float64x1_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
-  poly8x8_t __ret;
-  __ret = (poly8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_p8(poly8x8_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_p16(poly16x4_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_u8(uint8x8_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_u32(uint32x2_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_u64(uint64x1_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_u16(uint16x4_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_s8(int8x8_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_f64(float64x1_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_f32(float32x2_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_f16(float16x4_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_s32(int32x2_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_s64(int64x1_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_s16(int16x4_t __p0) {
-  poly64x1_t __ret;
-  __ret = (poly64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_p64(poly64x1_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_f64(float64x1_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
-  poly16x4_t __ret;
-  __ret = (poly16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_p128(poly128_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_p64(poly64x2_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_f64(float64x2_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
-  poly8x16_t __ret;
-  __ret = (poly8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_p8(poly8x16_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_p64(poly64x2_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_p16(poly16x8_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_u8(uint8x16_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_u32(uint32x4_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_u64(uint64x2_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_u16(uint16x8_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_s8(int8x16_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_f64(float64x2_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_f32(float32x4_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_f16(float16x8_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_s32(int32x4_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_s64(int64x2_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_s16(int16x8_t __p0) {
-  poly128_t __ret;
-  __ret = (poly128_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_p8(poly8x16_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_p128(poly128_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_p16(poly16x8_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_u8(uint8x16_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_u32(uint32x4_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_u64(uint64x2_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_u16(uint16x8_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_s8(int8x16_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_f64(float64x2_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_f32(float32x4_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_f16(float16x8_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_s32(int32x4_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_s64(int64x2_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_s16(int16x8_t __p0) {
-  poly64x2_t __ret;
-  __ret = (poly64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_p128(poly128_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_p64(poly64x2_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_f64(float64x2_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
-  poly16x8_t __ret;
-  __ret = (poly16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_p128(poly128_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_p64(poly64x2_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_f64(float64x2_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_p128(poly128_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_p64(poly64x2_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_f64(float64x2_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_p128(poly128_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_p64(poly64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_f64(float64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_p128(poly128_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_p64(poly64x2_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_f64(float64x2_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_p128(poly128_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_p64(poly64x2_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_f64(float64x2_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
-  int8x16_t __ret;
-  __ret = (int8x16_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_p8(poly8x16_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_p128(poly128_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_p64(poly64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_p16(poly16x8_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_u8(uint8x16_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_u32(uint32x4_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_u64(uint64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_u16(uint16x8_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_s8(int8x16_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_f32(float32x4_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_f16(float16x8_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_s32(int32x4_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_s64(int64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_s16(int16x8_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_p128(poly128_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_p64(poly64x2_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_f64(float64x2_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_p128(poly128_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_p64(poly64x2_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_f64(float64x2_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
-  float16x8_t __ret;
-  __ret = (float16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_p128(poly128_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_p64(poly64x2_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_f64(float64x2_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_p128(poly128_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_p64(poly64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_f64(float64x2_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_p128(poly128_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_p64(poly64x2_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_f64(float64x2_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_p64(poly64x1_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_f64(float64x1_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_p64(poly64x1_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_f64(float64x1_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_p64(poly64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_f64(float64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_p64(poly64x1_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_f64(float64x1_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_p64(poly64x1_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_f64(float64x1_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
-  int8x8_t __ret;
-  __ret = (int8x8_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_p8(poly8x8_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_p64(poly64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_p16(poly16x4_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_u8(uint8x8_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_u32(uint32x2_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_u64(uint64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_u16(uint16x4_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_s8(int8x8_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_f32(float32x2_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_f16(float16x4_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_s32(int32x2_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_s64(int64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_s16(int16x4_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_p64(poly64x1_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_f64(float64x1_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_p64(poly64x1_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_f64(float64x1_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
-  float16x4_t __ret;
-  __ret = (float16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_p64(poly64x1_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_f64(float64x1_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
-  int32x2_t __ret;
-  __ret = (int32x2_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_p64(poly64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_f64(float64x1_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
-  int64x1_t __ret;
-  __ret = (int64x1_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_p64(poly64x1_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_f64(float64x1_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
-  int16x4_t __ret;
-  __ret = (int16x4_t)(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vrshld_u64(uint64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vrshld_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vrshld_s64(__p0, __p1);
-  return __ret;
-}
-#define vrshrd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vrshrd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#define vrshrd_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vrshrd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_u32(__p0_724, __p1_724, __p2_724) __extension__ ({ \
-  uint16x8_t __ret_724; \
-  uint16x4_t __s0_724 = __p0_724; \
-  uint32x4_t __s1_724 = __p1_724; \
-  __ret_724 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_724), (uint16x4_t)(vrshrn_n_u32(__s1_724, __p2_724)))); \
-  __ret_724; \
-})
-#else
-#define vrshrn_high_n_u32(__p0_725, __p1_725, __p2_725) __extension__ ({ \
-  uint16x8_t __ret_725; \
-  uint16x4_t __s0_725 = __p0_725; \
-  uint32x4_t __s1_725 = __p1_725; \
-  uint16x4_t __rev0_725;  __rev0_725 = __builtin_shufflevector(__s0_725, __s0_725, 3, 2, 1, 0); \
-  uint32x4_t __rev1_725;  __rev1_725 = __builtin_shufflevector(__s1_725, __s1_725, 3, 2, 1, 0); \
-  __ret_725 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_725), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_725, __p2_725)))); \
-  __ret_725 = __builtin_shufflevector(__ret_725, __ret_725, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_725; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_u64(__p0_726, __p1_726, __p2_726) __extension__ ({ \
-  uint32x4_t __ret_726; \
-  uint32x2_t __s0_726 = __p0_726; \
-  uint64x2_t __s1_726 = __p1_726; \
-  __ret_726 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_726), (uint32x2_t)(vrshrn_n_u64(__s1_726, __p2_726)))); \
-  __ret_726; \
-})
-#else
-#define vrshrn_high_n_u64(__p0_727, __p1_727, __p2_727) __extension__ ({ \
-  uint32x4_t __ret_727; \
-  uint32x2_t __s0_727 = __p0_727; \
-  uint64x2_t __s1_727 = __p1_727; \
-  uint32x2_t __rev0_727;  __rev0_727 = __builtin_shufflevector(__s0_727, __s0_727, 1, 0); \
-  uint64x2_t __rev1_727;  __rev1_727 = __builtin_shufflevector(__s1_727, __s1_727, 1, 0); \
-  __ret_727 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_727), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_727, __p2_727)))); \
-  __ret_727 = __builtin_shufflevector(__ret_727, __ret_727, 3, 2, 1, 0); \
-  __ret_727; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_u16(__p0_728, __p1_728, __p2_728) __extension__ ({ \
-  uint8x16_t __ret_728; \
-  uint8x8_t __s0_728 = __p0_728; \
-  uint16x8_t __s1_728 = __p1_728; \
-  __ret_728 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_728), (uint8x8_t)(vrshrn_n_u16(__s1_728, __p2_728)))); \
-  __ret_728; \
-})
-#else
-#define vrshrn_high_n_u16(__p0_729, __p1_729, __p2_729) __extension__ ({ \
-  uint8x16_t __ret_729; \
-  uint8x8_t __s0_729 = __p0_729; \
-  uint16x8_t __s1_729 = __p1_729; \
-  uint8x8_t __rev0_729;  __rev0_729 = __builtin_shufflevector(__s0_729, __s0_729, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_729;  __rev1_729 = __builtin_shufflevector(__s1_729, __s1_729, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_729 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_729), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_729, __p2_729)))); \
-  __ret_729 = __builtin_shufflevector(__ret_729, __ret_729, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_729; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_s32(__p0_730, __p1_730, __p2_730) __extension__ ({ \
-  int16x8_t __ret_730; \
-  int16x4_t __s0_730 = __p0_730; \
-  int32x4_t __s1_730 = __p1_730; \
-  __ret_730 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_730), (int16x4_t)(vrshrn_n_s32(__s1_730, __p2_730)))); \
-  __ret_730; \
-})
-#else
-#define vrshrn_high_n_s32(__p0_731, __p1_731, __p2_731) __extension__ ({ \
-  int16x8_t __ret_731; \
-  int16x4_t __s0_731 = __p0_731; \
-  int32x4_t __s1_731 = __p1_731; \
-  int16x4_t __rev0_731;  __rev0_731 = __builtin_shufflevector(__s0_731, __s0_731, 3, 2, 1, 0); \
-  int32x4_t __rev1_731;  __rev1_731 = __builtin_shufflevector(__s1_731, __s1_731, 3, 2, 1, 0); \
-  __ret_731 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_731), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_731, __p2_731)))); \
-  __ret_731 = __builtin_shufflevector(__ret_731, __ret_731, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_731; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_s64(__p0_732, __p1_732, __p2_732) __extension__ ({ \
-  int32x4_t __ret_732; \
-  int32x2_t __s0_732 = __p0_732; \
-  int64x2_t __s1_732 = __p1_732; \
-  __ret_732 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_732), (int32x2_t)(vrshrn_n_s64(__s1_732, __p2_732)))); \
-  __ret_732; \
-})
-#else
-#define vrshrn_high_n_s64(__p0_733, __p1_733, __p2_733) __extension__ ({ \
-  int32x4_t __ret_733; \
-  int32x2_t __s0_733 = __p0_733; \
-  int64x2_t __s1_733 = __p1_733; \
-  int32x2_t __rev0_733;  __rev0_733 = __builtin_shufflevector(__s0_733, __s0_733, 1, 0); \
-  int64x2_t __rev1_733;  __rev1_733 = __builtin_shufflevector(__s1_733, __s1_733, 1, 0); \
-  __ret_733 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_733), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_733, __p2_733)))); \
-  __ret_733 = __builtin_shufflevector(__ret_733, __ret_733, 3, 2, 1, 0); \
-  __ret_733; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vrshrn_high_n_s16(__p0_734, __p1_734, __p2_734) __extension__ ({ \
-  int8x16_t __ret_734; \
-  int8x8_t __s0_734 = __p0_734; \
-  int16x8_t __s1_734 = __p1_734; \
-  __ret_734 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_734), (int8x8_t)(vrshrn_n_s16(__s1_734, __p2_734)))); \
-  __ret_734; \
-})
-#else
-#define vrshrn_high_n_s16(__p0_735, __p1_735, __p2_735) __extension__ ({ \
-  int8x16_t __ret_735; \
-  int8x8_t __s0_735 = __p0_735; \
-  int16x8_t __s1_735 = __p1_735; \
-  int8x8_t __rev0_735;  __rev0_735 = __builtin_shufflevector(__s0_735, __s0_735, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_735;  __rev1_735 = __builtin_shufflevector(__s1_735, __s1_735, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_735 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_735), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_735, __p2_735)))); \
-  __ret_735 = __builtin_shufflevector(__ret_735, __ret_735, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_735; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrsqrteq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrsqrteq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrsqrte_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64_t vrsqrted_f64(float64_t __p0) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vrsqrted_f64(__p0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vrsqrtes_f32(float32_t __p0) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vrsqrtes_f32(__p0);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vrsqrtsq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vrsqrtsq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vrsqrts_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float64_t vrsqrtsd_f64(float64_t __p0, float64_t __p1) {
-  float64_t __ret;
-  __ret = (float64_t) __builtin_neon_vrsqrtsd_f64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) float32_t vrsqrtss_f32(float32_t __p0, float32_t __p1) {
-  float32_t __ret;
-  __ret = (float32_t) __builtin_neon_vrsqrtss_f32(__p0, __p1);
-  return __ret;
-}
-#define vrsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64_t __s1 = __p1; \
-  __ret = (uint64_t) __builtin_neon_vrsrad_n_u64(__s0, __s1, __p2); \
-  __ret; \
-})
-#define vrsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int64_t __s1 = __p1; \
-  __ret = (int64_t) __builtin_neon_vrsrad_n_s64(__s0, __s1, __p2); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vrsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  __ret = vcombine_u16(__p0, vrsubhn_u32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vrsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u16(__rev0, __noswap_vrsubhn_u32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vrsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  __ret = vcombine_u32(__p0, vrsubhn_u64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vrsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_u32(__rev0, __noswap_vrsubhn_u64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vrsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  __ret = vcombine_u8(__p0, vrsubhn_u16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vrsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u8(__rev0, __noswap_vrsubhn_u16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vrsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  __ret = vcombine_s16(__p0, vrsubhn_s32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vrsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s16(__rev0, __noswap_vrsubhn_s32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vrsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  __ret = vcombine_s32(__p0, vrsubhn_s64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vrsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_s32(__rev0, __noswap_vrsubhn_s64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  __ret = vcombine_s8(__p0, vrsubhn_s16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s8(__rev0, __noswap_vrsubhn_s16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#define vset_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64_t __s0 = __p0; \
-  poly64x1_t __s1 = __p1; \
-  __ret = (poly64x1_t) __builtin_neon_vset_lane_i64(__s0, (poly64x1_t)__s1, __p2); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (poly64x2_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (poly64x2_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (poly64x2_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (float64x2_t)__s1, __p2); \
-  __ret; \
-})
-#else
-#define vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (float64x2_t)__rev1, __p2); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#define __noswap_vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64_t __s0 = __p0; \
-  float64x2_t __s1 = __p1; \
-  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (float64x2_t)__s1, __p2); \
-  __ret; \
-})
-#endif
-
-#define vset_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __ret; \
-  float64_t __s0 = __p0; \
-  float64x1_t __s1 = __p1; \
-  __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (float64x1_t)__s1, __p2); \
-  __ret; \
-})
-__ai __attribute__((target("neon"))) uint64_t vshld_u64(uint64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vshld_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vshld_s64(__p0, __p1);
-  return __ret;
-}
-#define vshld_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vshld_n_u64(__s0, __p1); \
-  __ret; \
-})
-#define vshld_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vshld_n_s64(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_u8(__p0_736, __p1_736) __extension__ ({ \
-  uint16x8_t __ret_736; \
-  uint8x16_t __s0_736 = __p0_736; \
-  __ret_736 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_736), __p1_736)); \
-  __ret_736; \
-})
-#else
-#define vshll_high_n_u8(__p0_737, __p1_737) __extension__ ({ \
-  uint16x8_t __ret_737; \
-  uint8x16_t __s0_737 = __p0_737; \
-  uint8x16_t __rev0_737;  __rev0_737 = __builtin_shufflevector(__s0_737, __s0_737, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_737 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_737), __p1_737)); \
-  __ret_737 = __builtin_shufflevector(__ret_737, __ret_737, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_737; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_u32(__p0_738, __p1_738) __extension__ ({ \
-  uint64x2_t __ret_738; \
-  uint32x4_t __s0_738 = __p0_738; \
-  __ret_738 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_738), __p1_738)); \
-  __ret_738; \
-})
-#else
-#define vshll_high_n_u32(__p0_739, __p1_739) __extension__ ({ \
-  uint64x2_t __ret_739; \
-  uint32x4_t __s0_739 = __p0_739; \
-  uint32x4_t __rev0_739;  __rev0_739 = __builtin_shufflevector(__s0_739, __s0_739, 3, 2, 1, 0); \
-  __ret_739 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_739), __p1_739)); \
-  __ret_739 = __builtin_shufflevector(__ret_739, __ret_739, 1, 0); \
-  __ret_739; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_u16(__p0_740, __p1_740) __extension__ ({ \
-  uint32x4_t __ret_740; \
-  uint16x8_t __s0_740 = __p0_740; \
-  __ret_740 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_740), __p1_740)); \
-  __ret_740; \
-})
-#else
-#define vshll_high_n_u16(__p0_741, __p1_741) __extension__ ({ \
-  uint32x4_t __ret_741; \
-  uint16x8_t __s0_741 = __p0_741; \
-  uint16x8_t __rev0_741;  __rev0_741 = __builtin_shufflevector(__s0_741, __s0_741, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_741 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_741), __p1_741)); \
-  __ret_741 = __builtin_shufflevector(__ret_741, __ret_741, 3, 2, 1, 0); \
-  __ret_741; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_s8(__p0_742, __p1_742) __extension__ ({ \
-  int16x8_t __ret_742; \
-  int8x16_t __s0_742 = __p0_742; \
-  __ret_742 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_742), __p1_742)); \
-  __ret_742; \
-})
-#else
-#define vshll_high_n_s8(__p0_743, __p1_743) __extension__ ({ \
-  int16x8_t __ret_743; \
-  int8x16_t __s0_743 = __p0_743; \
-  int8x16_t __rev0_743;  __rev0_743 = __builtin_shufflevector(__s0_743, __s0_743, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_743 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_743), __p1_743)); \
-  __ret_743 = __builtin_shufflevector(__ret_743, __ret_743, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_743; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_s32(__p0_744, __p1_744) __extension__ ({ \
-  int64x2_t __ret_744; \
-  int32x4_t __s0_744 = __p0_744; \
-  __ret_744 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_744), __p1_744)); \
-  __ret_744; \
-})
-#else
-#define vshll_high_n_s32(__p0_745, __p1_745) __extension__ ({ \
-  int64x2_t __ret_745; \
-  int32x4_t __s0_745 = __p0_745; \
-  int32x4_t __rev0_745;  __rev0_745 = __builtin_shufflevector(__s0_745, __s0_745, 3, 2, 1, 0); \
-  __ret_745 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_745), __p1_745)); \
-  __ret_745 = __builtin_shufflevector(__ret_745, __ret_745, 1, 0); \
-  __ret_745; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshll_high_n_s16(__p0_746, __p1_746) __extension__ ({ \
-  int32x4_t __ret_746; \
-  int16x8_t __s0_746 = __p0_746; \
-  __ret_746 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_746), __p1_746)); \
-  __ret_746; \
-})
-#else
-#define vshll_high_n_s16(__p0_747, __p1_747) __extension__ ({ \
-  int32x4_t __ret_747; \
-  int16x8_t __s0_747 = __p0_747; \
-  int16x8_t __rev0_747;  __rev0_747 = __builtin_shufflevector(__s0_747, __s0_747, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_747 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_747), __p1_747)); \
-  __ret_747 = __builtin_shufflevector(__ret_747, __ret_747, 3, 2, 1, 0); \
-  __ret_747; \
-})
-#endif
-
-#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
-  __ret; \
-})
-#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_u32(__p0_748, __p1_748, __p2_748) __extension__ ({ \
-  uint16x8_t __ret_748; \
-  uint16x4_t __s0_748 = __p0_748; \
-  uint32x4_t __s1_748 = __p1_748; \
-  __ret_748 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_748), (uint16x4_t)(vshrn_n_u32(__s1_748, __p2_748)))); \
-  __ret_748; \
-})
-#else
-#define vshrn_high_n_u32(__p0_749, __p1_749, __p2_749) __extension__ ({ \
-  uint16x8_t __ret_749; \
-  uint16x4_t __s0_749 = __p0_749; \
-  uint32x4_t __s1_749 = __p1_749; \
-  uint16x4_t __rev0_749;  __rev0_749 = __builtin_shufflevector(__s0_749, __s0_749, 3, 2, 1, 0); \
-  uint32x4_t __rev1_749;  __rev1_749 = __builtin_shufflevector(__s1_749, __s1_749, 3, 2, 1, 0); \
-  __ret_749 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_749), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_749, __p2_749)))); \
-  __ret_749 = __builtin_shufflevector(__ret_749, __ret_749, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_749; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_u64(__p0_750, __p1_750, __p2_750) __extension__ ({ \
-  uint32x4_t __ret_750; \
-  uint32x2_t __s0_750 = __p0_750; \
-  uint64x2_t __s1_750 = __p1_750; \
-  __ret_750 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_750), (uint32x2_t)(vshrn_n_u64(__s1_750, __p2_750)))); \
-  __ret_750; \
-})
-#else
-#define vshrn_high_n_u64(__p0_751, __p1_751, __p2_751) __extension__ ({ \
-  uint32x4_t __ret_751; \
-  uint32x2_t __s0_751 = __p0_751; \
-  uint64x2_t __s1_751 = __p1_751; \
-  uint32x2_t __rev0_751;  __rev0_751 = __builtin_shufflevector(__s0_751, __s0_751, 1, 0); \
-  uint64x2_t __rev1_751;  __rev1_751 = __builtin_shufflevector(__s1_751, __s1_751, 1, 0); \
-  __ret_751 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_751), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_751, __p2_751)))); \
-  __ret_751 = __builtin_shufflevector(__ret_751, __ret_751, 3, 2, 1, 0); \
-  __ret_751; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_u16(__p0_752, __p1_752, __p2_752) __extension__ ({ \
-  uint8x16_t __ret_752; \
-  uint8x8_t __s0_752 = __p0_752; \
-  uint16x8_t __s1_752 = __p1_752; \
-  __ret_752 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_752), (uint8x8_t)(vshrn_n_u16(__s1_752, __p2_752)))); \
-  __ret_752; \
-})
-#else
-#define vshrn_high_n_u16(__p0_753, __p1_753, __p2_753) __extension__ ({ \
-  uint8x16_t __ret_753; \
-  uint8x8_t __s0_753 = __p0_753; \
-  uint16x8_t __s1_753 = __p1_753; \
-  uint8x8_t __rev0_753;  __rev0_753 = __builtin_shufflevector(__s0_753, __s0_753, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint16x8_t __rev1_753;  __rev1_753 = __builtin_shufflevector(__s1_753, __s1_753, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_753 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_753), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_753, __p2_753)))); \
-  __ret_753 = __builtin_shufflevector(__ret_753, __ret_753, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_753; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_s32(__p0_754, __p1_754, __p2_754) __extension__ ({ \
-  int16x8_t __ret_754; \
-  int16x4_t __s0_754 = __p0_754; \
-  int32x4_t __s1_754 = __p1_754; \
-  __ret_754 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_754), (int16x4_t)(vshrn_n_s32(__s1_754, __p2_754)))); \
-  __ret_754; \
-})
-#else
-#define vshrn_high_n_s32(__p0_755, __p1_755, __p2_755) __extension__ ({ \
-  int16x8_t __ret_755; \
-  int16x4_t __s0_755 = __p0_755; \
-  int32x4_t __s1_755 = __p1_755; \
-  int16x4_t __rev0_755;  __rev0_755 = __builtin_shufflevector(__s0_755, __s0_755, 3, 2, 1, 0); \
-  int32x4_t __rev1_755;  __rev1_755 = __builtin_shufflevector(__s1_755, __s1_755, 3, 2, 1, 0); \
-  __ret_755 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_755), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_755, __p2_755)))); \
-  __ret_755 = __builtin_shufflevector(__ret_755, __ret_755, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_755; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_s64(__p0_756, __p1_756, __p2_756) __extension__ ({ \
-  int32x4_t __ret_756; \
-  int32x2_t __s0_756 = __p0_756; \
-  int64x2_t __s1_756 = __p1_756; \
-  __ret_756 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_756), (int32x2_t)(vshrn_n_s64(__s1_756, __p2_756)))); \
-  __ret_756; \
-})
-#else
-#define vshrn_high_n_s64(__p0_757, __p1_757, __p2_757) __extension__ ({ \
-  int32x4_t __ret_757; \
-  int32x2_t __s0_757 = __p0_757; \
-  int64x2_t __s1_757 = __p1_757; \
-  int32x2_t __rev0_757;  __rev0_757 = __builtin_shufflevector(__s0_757, __s0_757, 1, 0); \
-  int64x2_t __rev1_757;  __rev1_757 = __builtin_shufflevector(__s1_757, __s1_757, 1, 0); \
-  __ret_757 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_757), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_757, __p2_757)))); \
-  __ret_757 = __builtin_shufflevector(__ret_757, __ret_757, 3, 2, 1, 0); \
-  __ret_757; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vshrn_high_n_s16(__p0_758, __p1_758, __p2_758) __extension__ ({ \
-  int8x16_t __ret_758; \
-  int8x8_t __s0_758 = __p0_758; \
-  int16x8_t __s1_758 = __p1_758; \
-  __ret_758 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_758), (int8x8_t)(vshrn_n_s16(__s1_758, __p2_758)))); \
-  __ret_758; \
-})
-#else
-#define vshrn_high_n_s16(__p0_759, __p1_759, __p2_759) __extension__ ({ \
-  int8x16_t __ret_759; \
-  int8x8_t __s0_759 = __p0_759; \
-  int16x8_t __s1_759 = __p1_759; \
-  int8x8_t __rev0_759;  __rev0_759 = __builtin_shufflevector(__s0_759, __s0_759, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_759;  __rev1_759 = __builtin_shufflevector(__s1_759, __s1_759, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_759 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_759), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_759, __p2_759)))); \
-  __ret_759 = __builtin_shufflevector(__ret_759, __ret_759, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_759; \
-})
-#endif
-
-#define vslid_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64_t __s1 = __p1; \
-  __ret = (uint64_t) __builtin_neon_vslid_n_u64(__s0, __s1, __p2); \
-  __ret; \
-})
-#define vslid_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int64_t __s1 = __p1; \
-  __ret = (int64_t) __builtin_neon_vslid_n_s64(__s0, __s1, __p2); \
-  __ret; \
-})
-#define vsli_n_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  poly64x1_t __s1 = __p1; \
-  __ret = (poly64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsliq_n_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  __ret = (poly64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
-  __ret; \
-})
-#else
-#define vsliq_n_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (poly64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-__ai __attribute__((target("neon"))) uint8_t vsqaddb_u8(uint8_t __p0, int8_t __p1) {
-  uint8_t __ret;
-  __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32_t vsqadds_u32(uint32_t __p0, int32_t __p1) {
-  uint32_t __ret;
-  __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vsqaddd_u64(uint64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16_t vsqaddh_u16(uint16_t __p0, int16_t __p1) {
-  uint16_t __ret;
-  __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vsqadd_u64(uint64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vsqrtq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vsqrtq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vsqrtq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vsqrtq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vsqrt_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vsqrt_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vsqrt_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vsqrt_v((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#define vsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64_t __s1 = __p1; \
-  __ret = (uint64_t) __builtin_neon_vsrad_n_u64(__s0, __s1, __p2); \
-  __ret; \
-})
-#define vsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int64_t __s1 = __p1; \
-  __ret = (int64_t) __builtin_neon_vsrad_n_s64(__s0, __s1, __p2); \
-  __ret; \
-})
-#define vsrid_n_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64_t __ret; \
-  uint64_t __s0 = __p0; \
-  uint64_t __s1 = __p1; \
-  __ret = (uint64_t) __builtin_neon_vsrid_n_u64(__s0, __s1, __p2); \
-  __ret; \
-})
-#define vsrid_n_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64_t __ret; \
-  int64_t __s0 = __p0; \
-  int64_t __s1 = __p1; \
-  __ret = (int64_t) __builtin_neon_vsrid_n_s64(__s0, __s1, __p2); \
-  __ret; \
-})
-#define vsri_n_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x1_t __s0 = __p0; \
-  poly64x1_t __s1 = __p1; \
-  __ret = (poly64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vsriq_n_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  __ret = (poly64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
-  __ret; \
-})
-#else
-#define vsriq_n_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s0 = __p0; \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (poly64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vst1_p64(__p0, __p1) __extension__ ({ \
-  poly64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 38); \
-})
-#else
-#define vst1q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 42); \
-})
-#else
-#define vst1q_f64(__p0, __p1) __extension__ ({ \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 42); \
-})
-#endif
-
-#define vst1_f64(__p0, __p1) __extension__ ({ \
-  float64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 10); \
-})
-#define vst1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 38); \
-})
-#else
-#define vst1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 42); \
-})
-#else
-#define vst1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 42); \
-})
-#endif
-
-#define vst1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __s1 = __p1; \
-  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
-})
-#define vst1_p64_x2(__p0, __p1) __extension__ ({ \
-  poly64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p64_x2(__p0, __p1) __extension__ ({ \
-  poly64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
-})
-#else
-#define vst1q_p64_x2(__p0, __p1) __extension__ ({ \
-  poly64x2x2_t __s1 = __p1; \
-  poly64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f64_x2(__p0, __p1) __extension__ ({ \
-  float64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 42); \
-})
-#else
-#define vst1q_f64_x2(__p0, __p1) __extension__ ({ \
-  float64x2x2_t __s1 = __p1; \
-  float64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 42); \
-})
-#endif
-
-#define vst1_f64_x2(__p0, __p1) __extension__ ({ \
-  float64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 10); \
-})
-#define vst1_p64_x3(__p0, __p1) __extension__ ({ \
-  poly64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p64_x3(__p0, __p1) __extension__ ({ \
-  poly64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \
-})
-#else
-#define vst1q_p64_x3(__p0, __p1) __extension__ ({ \
-  poly64x2x3_t __s1 = __p1; \
-  poly64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f64_x3(__p0, __p1) __extension__ ({ \
-  float64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 42); \
-})
-#else
-#define vst1q_f64_x3(__p0, __p1) __extension__ ({ \
-  float64x2x3_t __s1 = __p1; \
-  float64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 42); \
-})
-#endif
-
-#define vst1_f64_x3(__p0, __p1) __extension__ ({ \
-  float64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 10); \
-})
-#define vst1_p64_x4(__p0, __p1) __extension__ ({ \
-  poly64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_p64_x4(__p0, __p1) __extension__ ({ \
-  poly64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \
-})
-#else
-#define vst1q_p64_x4(__p0, __p1) __extension__ ({ \
-  poly64x2x4_t __s1 = __p1; \
-  poly64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst1q_f64_x4(__p0, __p1) __extension__ ({ \
-  float64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 42); \
-})
-#else
-#define vst1q_f64_x4(__p0, __p1) __extension__ ({ \
-  float64x2x4_t __s1 = __p1; \
-  float64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 42); \
-})
-#endif
-
-#define vst1_f64_x4(__p0, __p1) __extension__ ({ \
-  float64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 10); \
-})
-#define vst2_p64(__p0, __p1) __extension__ ({ \
-  poly64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
-})
-#else
-#define vst2q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2x2_t __s1 = __p1; \
-  poly64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
-})
-#else
-#define vst2q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  uint64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_f64(__p0, __p1) __extension__ ({ \
-  float64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 42); \
-})
-#else
-#define vst2q_f64(__p0, __p1) __extension__ ({ \
-  float64x2x2_t __s1 = __p1; \
-  float64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 42); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_s64(__p0, __p1) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 35); \
-})
-#else
-#define vst2q_s64(__p0, __p1) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  int64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 35); \
-})
-#endif
-
-#define vst2_f64(__p0, __p1) __extension__ ({ \
-  float64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 10); \
-})
-#define vst2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 36); \
-})
-#else
-#define vst2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x2_t __s1 = __p1; \
-  poly8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 38); \
-})
-#else
-#define vst2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x2_t __s1 = __p1; \
-  poly64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 48); \
-})
-#else
-#define vst2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x2_t __s1 = __p1; \
-  uint8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 51); \
-})
-#else
-#define vst2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x2_t __s1 = __p1; \
-  uint64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 32); \
-})
-#else
-#define vst2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x2_t __s1 = __p1; \
-  int8x16x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 42); \
-})
-#else
-#define vst2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x2_t __s1 = __p1; \
-  float64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 42); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 35); \
-})
-#else
-#define vst2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x2_t __s1 = __p1; \
-  int64x2x2_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 35); \
-})
-#endif
-
-#define vst2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
-})
-#define vst2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 10); \
-})
-#define vst2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1x2_t __s1 = __p1; \
-  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 3); \
-})
-#define vst3_p64(__p0, __p1) __extension__ ({ \
-  poly64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \
-})
-#else
-#define vst3q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2x3_t __s1 = __p1; \
-  poly64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
-})
-#else
-#define vst3q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  uint64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_f64(__p0, __p1) __extension__ ({ \
-  float64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 42); \
-})
-#else
-#define vst3q_f64(__p0, __p1) __extension__ ({ \
-  float64x2x3_t __s1 = __p1; \
-  float64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 42); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_s64(__p0, __p1) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 35); \
-})
-#else
-#define vst3q_s64(__p0, __p1) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  int64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 35); \
-})
-#endif
-
-#define vst3_f64(__p0, __p1) __extension__ ({ \
-  float64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 10); \
-})
-#define vst3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 36); \
-})
-#else
-#define vst3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x3_t __s1 = __p1; \
-  poly8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 38); \
-})
-#else
-#define vst3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x3_t __s1 = __p1; \
-  poly64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 48); \
-})
-#else
-#define vst3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x3_t __s1 = __p1; \
-  uint8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 51); \
-})
-#else
-#define vst3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x3_t __s1 = __p1; \
-  uint64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 32); \
-})
-#else
-#define vst3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x3_t __s1 = __p1; \
-  int8x16x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 42); \
-})
-#else
-#define vst3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x3_t __s1 = __p1; \
-  float64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 42); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 35); \
-})
-#else
-#define vst3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x3_t __s1 = __p1; \
-  int64x2x3_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 35); \
-})
-#endif
-
-#define vst3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
-})
-#define vst3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 10); \
-})
-#define vst3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1x3_t __s1 = __p1; \
-  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 3); \
-})
-#define vst4_p64(__p0, __p1) __extension__ ({ \
-  poly64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \
-})
-#else
-#define vst4q_p64(__p0, __p1) __extension__ ({ \
-  poly64x2x4_t __s1 = __p1; \
-  poly64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
-})
-#else
-#define vst4q_u64(__p0, __p1) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  uint64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_f64(__p0, __p1) __extension__ ({ \
-  float64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 42); \
-})
-#else
-#define vst4q_f64(__p0, __p1) __extension__ ({ \
-  float64x2x4_t __s1 = __p1; \
-  float64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 42); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_s64(__p0, __p1) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 35); \
-})
-#else
-#define vst4q_s64(__p0, __p1) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  int64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 35); \
-})
-#endif
-
-#define vst4_f64(__p0, __p1) __extension__ ({ \
-  float64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 10); \
-})
-#define vst4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 36); \
-})
-#else
-#define vst4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
-  poly8x16x4_t __s1 = __p1; \
-  poly8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 36); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 38); \
-})
-#else
-#define vst4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2x4_t __s1 = __p1; \
-  poly64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 48); \
-})
-#else
-#define vst4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
-  uint8x16x4_t __s1 = __p1; \
-  uint8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 48); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 51); \
-})
-#else
-#define vst4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2x4_t __s1 = __p1; \
-  uint64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 32); \
-})
-#else
-#define vst4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
-  int8x16x4_t __s1 = __p1; \
-  int8x16x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 32); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 42); \
-})
-#else
-#define vst4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2x4_t __s1 = __p1; \
-  float64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 42); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vst4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 35); \
-})
-#else
-#define vst4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2x4_t __s1 = __p1; \
-  int64x2x4_t __rev1; \
-  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
-  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
-  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
-  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
-  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 35); \
-})
-#endif
-
-#define vst4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
-})
-#define vst4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 10); \
-})
-#define vst4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1x4_t __s1 = __p1; \
-  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 3); \
-})
-#define vstrq_p128(__p0, __p1) __extension__ ({ \
-  poly128_t __s1 = __p1; \
-  __builtin_neon_vstrq_p128(__p0, __s1); \
-})
-__ai __attribute__((target("neon"))) uint64_t vsubd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vsubd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vsubd_s64(int64_t __p0, int64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vsubd_s64(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vsubq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vsubq_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __rev1;
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) float64x1_t vsub_f64(float64x1_t __p0, float64x1_t __p1) {
-  float64x1_t __ret;
-  __ret = __p0 - __p1;
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  __ret = vcombine_u16(__p0, vsubhn_u32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint16x8_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u16(__rev0, __noswap_vsubhn_u32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  __ret = vcombine_u32(__p0, vsubhn_u64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint32x4_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_u32(__rev0, __noswap_vsubhn_u64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  __ret = vcombine_u8(__p0, vsubhn_u16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint8x16_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_u8(__rev0, __noswap_vsubhn_u16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  __ret = vcombine_s16(__p0, vsubhn_s32(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int16x8_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s16(__rev0, __noswap_vsubhn_s32(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  __ret = vcombine_s32(__p0, vsubhn_s64(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int32x4_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __noswap_vcombine_s32(__rev0, __noswap_vsubhn_s64(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  __ret = vcombine_s8(__p0, vsubhn_s16(__p1, __p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int8x16_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcombine_s8(__rev0, __noswap_vsubhn_s16(__rev1, __rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vsubl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  __ret = vmovl_high_u8(__p0) - vmovl_high_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vsubl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_u8(__rev0) - __noswap_vmovl_high_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vsubl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  __ret = vmovl_high_u32(__p0) - vmovl_high_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vsubl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_u32(__rev0) - __noswap_vmovl_high_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vsubl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  __ret = vmovl_high_u16(__p0) - vmovl_high_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vsubl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_u16(__rev0) - __noswap_vmovl_high_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vsubl_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  __ret = vmovl_high_s8(__p0) - vmovl_high_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vsubl_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_s8(__rev0) - __noswap_vmovl_high_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vsubl_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = vmovl_high_s32(__p0) - vmovl_high_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vsubl_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_s32(__rev0) - __noswap_vmovl_high_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vsubl_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = vmovl_high_s16(__p0) - vmovl_high_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vsubl_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_s16(__rev0) - __noswap_vmovl_high_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vsubw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 - vmovl_high_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vsubw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_high_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vsubw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 - vmovl_high_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vsubw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_high_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vsubw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 - vmovl_high_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vsubw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_high_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vsubw_high_s8(int16x8_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 - vmovl_high_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vsubw_high_s8(int16x8_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_high_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vsubw_high_s32(int64x2_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 - vmovl_high_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vsubw_high_s32(int64x2_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_high_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vsubw_high_s16(int32x4_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 - vmovl_high_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vsubw_high_s16(int32x4_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmovl_high_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtrn1_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtrn1_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vtrn1_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vtrn1_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vtrn1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vtrn1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vtrn1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vtrn1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vtrn1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vtrn1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vtrn1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vtrn1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vtrn1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vtrn1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vtrn1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vtrn1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vtrn1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vtrn1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vtrn1q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vtrn1q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vtrn1q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vtrn1q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vtrn1q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vtrn1q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vtrn1q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vtrn1q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vtrn1q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vtrn1q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vtrn1q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vtrn1q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtrn1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtrn1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vtrn1_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vtrn1_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vtrn1_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vtrn1_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtrn1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtrn1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vtrn1_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vtrn1_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vtrn1_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vtrn1_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vtrn1_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vtrn1_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vtrn2_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vtrn2_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vtrn2_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vtrn2_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vtrn2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vtrn2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vtrn2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vtrn2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vtrn2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vtrn2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vtrn2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vtrn2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vtrn2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vtrn2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vtrn2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vtrn2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vtrn2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vtrn2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vtrn2q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vtrn2q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vtrn2q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vtrn2q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vtrn2q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vtrn2q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vtrn2q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vtrn2q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vtrn2q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vtrn2q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vtrn2q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vtrn2q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vtrn2_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vtrn2_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vtrn2_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vtrn2_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vtrn2_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vtrn2_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vtrn2_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vtrn2_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vtrn2_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vtrn2_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vtrn2_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vtrn2_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vtrn2_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vtrn2_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vtst_p64(poly64x1_t __p0, poly64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vtstq_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vtstq_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  uint64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vtstq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vtstq_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vtstq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vtstq_s64(int64x2_t __p0, int64x2_t __p1) {
-  uint64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) uint64x1_t vtst_u64(uint64x1_t __p0, uint64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x1_t vtst_s64(int64x1_t __p0, int64x1_t __p1) {
-  uint64x1_t __ret;
-  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64_t vtstd_s64(int64_t __p0, int64_t __p1) {
-  uint64_t __ret;
-  __ret = (uint64_t) __builtin_neon_vtstd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) {
-  int8_t __ret;
-  __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32_t vuqadds_s32(int32_t __p0, uint32_t __p1) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64_t vuqaddd_s64(int64_t __p0, uint64_t __p1) {
-  int64_t __ret;
-  __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16_t vuqaddh_s16(int16_t __p0, uint16_t __p1) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("neon"))) int64x1_t vuqadd_s64(int64x1_t __p0, uint64x1_t __p1) {
-  int64x1_t __ret;
-  __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vuzp1_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vuzp1_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vuzp1_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vuzp1_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vuzp1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vuzp1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vuzp1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vuzp1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vuzp1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vuzp1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vuzp1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vuzp1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vuzp1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vuzp1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vuzp1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vuzp1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vuzp1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vuzp1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vuzp1q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vuzp1q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vuzp1q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vuzp1q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vuzp1q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vuzp1q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vuzp1q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vuzp1q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vuzp1q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vuzp1q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vuzp1q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vuzp1q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vuzp1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vuzp1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vuzp1_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vuzp1_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vuzp1_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vuzp1_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vuzp1_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vuzp1_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vuzp1_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vuzp1_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vuzp1_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vuzp1_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vuzp2_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vuzp2_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vuzp2_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vuzp2_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vuzp2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vuzp2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vuzp2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vuzp2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vuzp2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vuzp2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vuzp2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vuzp2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vuzp2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vuzp2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vuzp2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vuzp2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vuzp2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vuzp2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vuzp2q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vuzp2q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vuzp2q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vuzp2q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vuzp2q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vuzp2q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vuzp2q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vuzp2q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vuzp2q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vuzp2q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vuzp2q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vuzp2q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vuzp2_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vuzp2_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vuzp2_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vuzp2_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vuzp2_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vuzp2_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vuzp2_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vuzp2_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vuzp2_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vuzp2_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vuzp2_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vuzp2_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vuzp2_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vuzp2_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vzip1_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vzip1_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vzip1_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vzip1_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vzip1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vzip1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vzip1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vzip1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vzip1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vzip1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vzip1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vzip1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vzip1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vzip1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vzip1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vzip1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vzip1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vzip1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vzip1q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vzip1q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vzip1q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vzip1q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vzip1q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vzip1q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vzip1q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vzip1q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vzip1q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vzip1q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vzip1q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vzip1q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vzip1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vzip1_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vzip1_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vzip1_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vzip1_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vzip1_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vzip1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vzip1_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vzip1_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vzip1_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vzip1_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vzip1_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vzip1_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vzip1_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x8_t vzip2_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x8_t vzip2_p8(poly8x8_t __p0, poly8x8_t __p1) {
-  poly8x8_t __ret;
-  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x4_t vzip2_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x4_t vzip2_p16(poly16x4_t __p0, poly16x4_t __p1) {
-  poly16x4_t __ret;
-  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly8x16_t vzip2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly8x16_t vzip2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
-  poly8x16_t __ret;
-  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly64x2_t vzip2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly64x2_t vzip2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly64x2_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) poly16x8_t vzip2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) poly16x8_t vzip2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
-  poly16x8_t __ret;
-  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vzip2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vzip2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vzip2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vzip2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vzip2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vzip2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vzip2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vzip2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vzip2q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vzip2q_s8(int8x16_t __p0, int8x16_t __p1) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float64x2_t vzip2q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float64x2_t vzip2q_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x4_t vzip2q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x4_t vzip2q_f32(float32x4_t __p0, float32x4_t __p1) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vzip2q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vzip2q_s32(int32x4_t __p0, int32x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vzip2q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vzip2q_s64(int64x2_t __p0, int64x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vzip2q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vzip2q_s16(int16x8_t __p0, int16x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vzip2_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vzip2_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vzip2_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vzip2_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vzip2_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vzip2_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vzip2_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vzip2_s8(int8x8_t __p0, int8x8_t __p1) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float32x2_t vzip2_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float32x2_t vzip2_f32(float32x2_t __p0, float32x2_t __p1) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vzip2_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vzip2_s32(int32x2_t __p0, int32x2_t __p1) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vzip2_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vzip2_s16(int16x4_t __p0, int16x4_t __p1) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) {
-  float16x8_t __ret;
-  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) {
-  float16x4_t __ret;
-  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#define vldap1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __ret; \
-  poly64x1_t __s1 = __p1; \
-  __ret = (poly64x1_t) __builtin_neon_vldap1_lane_p64(__p0, (int8x8_t)__s1, __p2, 6); \
-  __ret; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vldap1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s1 = __p1; \
-  __ret = (poly64x2_t) __builtin_neon_vldap1q_lane_p64(__p0, (int8x16_t)__s1, __p2, 38); \
-  __ret; \
-})
-#else
-#define vldap1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __ret; \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (poly64x2_t) __builtin_neon_vldap1q_lane_p64(__p0, (int8x16_t)__rev1, __p2, 38); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vldap1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vldap1q_lane_u64(__p0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vldap1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vldap1q_lane_u64(__p0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vldap1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s1 = __p1; \
-  __ret = (float64x2_t) __builtin_neon_vldap1q_lane_f64(__p0, (int8x16_t)__s1, __p2, 42); \
-  __ret; \
-})
-#else
-#define vldap1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __ret; \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (float64x2_t) __builtin_neon_vldap1q_lane_f64(__p0, (int8x16_t)__rev1, __p2, 42); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vldap1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s1 = __p1; \
-  __ret = (int64x2_t) __builtin_neon_vldap1q_lane_s64(__p0, (int8x16_t)__s1, __p2, 35); \
-  __ret; \
-})
-#else
-#define vldap1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __ret; \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (int64x2_t) __builtin_neon_vldap1q_lane_s64(__p0, (int8x16_t)__rev1, __p2, 35); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#define vldap1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __ret; \
-  uint64x1_t __s1 = __p1; \
-  __ret = (uint64x1_t) __builtin_neon_vldap1_lane_u64(__p0, (int8x8_t)__s1, __p2, 19); \
-  __ret; \
-})
-#define vldap1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __ret; \
-  float64x1_t __s1 = __p1; \
-  __ret = (float64x1_t) __builtin_neon_vldap1_lane_f64(__p0, (int8x8_t)__s1, __p2, 10); \
-  __ret; \
-})
-#define vldap1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __ret; \
-  int64x1_t __s1 = __p1; \
-  __ret = (int64x1_t) __builtin_neon_vldap1_lane_s64(__p0, (int8x8_t)__s1, __p2, 3); \
-  __ret; \
-})
-#define vstl1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x1_t __s1 = __p1; \
-  __builtin_neon_vstl1_lane_p64(__p0, (int8x8_t)__s1, __p2, 6); \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vstl1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __s1 = __p1; \
-  __builtin_neon_vstl1q_lane_p64(__p0, (int8x16_t)__s1, __p2, 38); \
-})
-#else
-#define vstl1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
-  poly64x2_t __s1 = __p1; \
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vstl1q_lane_p64(__p0, (int8x16_t)__rev1, __p2, 38); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vstl1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __s1 = __p1; \
-  __builtin_neon_vstl1q_lane_u64(__p0, (int8x16_t)__s1, __p2, 51); \
-})
-#else
-#define vstl1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vstl1q_lane_u64(__p0, (int8x16_t)__rev1, __p2, 51); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vstl1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __s1 = __p1; \
-  __builtin_neon_vstl1q_lane_f64(__p0, (int8x16_t)__s1, __p2, 42); \
-})
-#else
-#define vstl1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x2_t __s1 = __p1; \
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vstl1q_lane_f64(__p0, (int8x16_t)__rev1, __p2, 42); \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vstl1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __s1 = __p1; \
-  __builtin_neon_vstl1q_lane_s64(__p0, (int8x16_t)__s1, __p2, 35); \
-})
-#else
-#define vstl1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x2_t __s1 = __p1; \
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __builtin_neon_vstl1q_lane_s64(__p0, (int8x16_t)__rev1, __p2, 35); \
-})
-#endif
-
-#define vstl1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x1_t __s1 = __p1; \
-  __builtin_neon_vstl1_lane_u64(__p0, (int8x8_t)__s1, __p2, 19); \
-})
-#define vstl1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
-  float64x1_t __s1 = __p1; \
-  __builtin_neon_vstl1_lane_f64(__p0, (int8x8_t)__s1, __p2, 10); \
-})
-#define vstl1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
-  int64x1_t __s1 = __p1; \
-  __builtin_neon_vstl1_lane_s64(__p0, (int8x8_t)__s1, __p2, 3); \
-})
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint8x16_t vbcaxq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_vbcaxq_u8((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint8x16_t vbcaxq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_vbcaxq_u8((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint32x4_t vbcaxq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vbcaxq_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint32x4_t vbcaxq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vbcaxq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vbcaxq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vbcaxq_u64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vbcaxq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vbcaxq_u64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint16x8_t vbcaxq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_vbcaxq_u16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint16x8_t vbcaxq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_vbcaxq_u16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int8x16_t vbcaxq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_vbcaxq_s8((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int8x16_t vbcaxq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_vbcaxq_s8((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int32x4_t vbcaxq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_vbcaxq_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int32x4_t vbcaxq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_vbcaxq_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int64x2_t vbcaxq_s64(int64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_vbcaxq_s64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int64x2_t vbcaxq_s64(int64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_vbcaxq_s64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int16x8_t vbcaxq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_vbcaxq_s16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int16x8_t vbcaxq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_vbcaxq_s16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint8x16_t veor3q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = (uint8x16_t) __builtin_neon_veor3q_u8((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint8x16_t veor3q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint8x16_t) __builtin_neon_veor3q_u8((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint32x4_t veor3q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_veor3q_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint32x4_t veor3q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_veor3q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint64x2_t veor3q_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_veor3q_u64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint64x2_t veor3q_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_veor3q_u64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint16x8_t veor3q_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t) __builtin_neon_veor3q_u16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 49);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint16x8_t veor3q_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t) __builtin_neon_veor3q_u16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 49);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int8x16_t veor3q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = (int8x16_t) __builtin_neon_veor3q_s8((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int8x16_t veor3q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int8x16_t) __builtin_neon_veor3q_s8((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int32x4_t veor3q_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = (int32x4_t) __builtin_neon_veor3q_s32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int32x4_t veor3q_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (int32x4_t) __builtin_neon_veor3q_s32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int64x2_t veor3q_s64(int64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int64x2_t __ret;
-  __ret = (int64x2_t) __builtin_neon_veor3q_s64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 35);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int64x2_t veor3q_s64(int64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (int64x2_t) __builtin_neon_veor3q_s64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 35);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) int16x8_t veor3q_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = (int16x8_t) __builtin_neon_veor3q_s16((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) int16x8_t veor3q_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t) __builtin_neon_veor3q_s16((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 33);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vrax1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vrax1q_u64((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vrax1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vrax1q_u64((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512hq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vsha512hq_u64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512hq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vsha512hq_u64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512h2q_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vsha512h2q_u64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512h2q_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vsha512h2q_u64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512su0q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vsha512su0q_u64((int8x16_t)__p0, (int8x16_t)__p1, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512su0q_u64(uint64x2_t __p0, uint64x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vsha512su0q_u64((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512su1q_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t) __builtin_neon_vsha512su1q_u64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sha3,neon"))) uint64x2_t vsha512su1q_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (uint64x2_t) __builtin_neon_vsha512su1q_u64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vxarq_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  __ret = (uint64x2_t) __builtin_neon_vxarq_u64((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
-  __ret; \
-})
-#else
-#define vxarq_u64(__p0, __p1, __p2) __extension__ ({ \
-  uint64x2_t __ret; \
-  uint64x2_t __s0 = __p0; \
-  uint64x2_t __s1 = __p1; \
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
-  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
-  __ret = (uint64x2_t) __builtin_neon_vxarq_u64((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm3partw1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsm3partw1q_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm3partw1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsm3partw1q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm3partw2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsm3partw2q_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm3partw2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsm3partw2q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm3ss1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsm3ss1q_u32((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm3ss1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsm3ss1q_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsm3tt1aq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt1aq_u32((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 50); \
-  __ret; \
-})
-#else
-#define vsm3tt1aq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt1aq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsm3tt1bq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt1bq_u32((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 50); \
-  __ret; \
-})
-#else
-#define vsm3tt1bq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt1bq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsm3tt2aq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt2aq_u32((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 50); \
-  __ret; \
-})
-#else
-#define vsm3tt2aq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt2aq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsm3tt2bq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt2bq_u32((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 50); \
-  __ret; \
-})
-#else
-#define vsm3tt2bq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
-  uint32x4_t __ret; \
-  uint32x4_t __s0 = __p0; \
-  uint32x4_t __s1 = __p1; \
-  uint32x4_t __s2 = __p2; \
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
-  __ret = (uint32x4_t) __builtin_neon_vsm3tt2bq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 50); \
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
-  __ret; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm4eq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsm4eq_u32((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm4eq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsm4eq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm4ekeyq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t) __builtin_neon_vsm4ekeyq_u32((int8x16_t)__p0, (int8x16_t)__p1, 50);
-  return __ret;
-}
-#else
-__ai __attribute__((target("sm4,neon"))) uint32x4_t vsm4ekeyq_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t) __builtin_neon_vsm4ekeyq_u32((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.1a,neon"))) int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqrdmlahs_s32(__p0, __p1, __p2);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqrdmlahh_s16(__p0, __p1, __p2);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahs_lane_s32(__p0_760, __p1_760, __p2_760, __p3_760) __extension__ ({ \
-  int32_t __ret_760; \
-  int32_t __s0_760 = __p0_760; \
-  int32_t __s1_760 = __p1_760; \
-  int32x2_t __s2_760 = __p2_760; \
-  __ret_760 = vqrdmlahs_s32(__s0_760, __s1_760, vget_lane_s32(__s2_760, __p3_760)); \
-  __ret_760; \
-})
-#else
-#define vqrdmlahs_lane_s32(__p0_761, __p1_761, __p2_761, __p3_761) __extension__ ({ \
-  int32_t __ret_761; \
-  int32_t __s0_761 = __p0_761; \
-  int32_t __s1_761 = __p1_761; \
-  int32x2_t __s2_761 = __p2_761; \
-  int32x2_t __rev2_761;  __rev2_761 = __builtin_shufflevector(__s2_761, __s2_761, 1, 0); \
-  __ret_761 = vqrdmlahs_s32(__s0_761, __s1_761, __noswap_vget_lane_s32(__rev2_761, __p3_761)); \
-  __ret_761; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahh_lane_s16(__p0_762, __p1_762, __p2_762, __p3_762) __extension__ ({ \
-  int16_t __ret_762; \
-  int16_t __s0_762 = __p0_762; \
-  int16_t __s1_762 = __p1_762; \
-  int16x4_t __s2_762 = __p2_762; \
-  __ret_762 = vqrdmlahh_s16(__s0_762, __s1_762, vget_lane_s16(__s2_762, __p3_762)); \
-  __ret_762; \
-})
-#else
-#define vqrdmlahh_lane_s16(__p0_763, __p1_763, __p2_763, __p3_763) __extension__ ({ \
-  int16_t __ret_763; \
-  int16_t __s0_763 = __p0_763; \
-  int16_t __s1_763 = __p1_763; \
-  int16x4_t __s2_763 = __p2_763; \
-  int16x4_t __rev2_763;  __rev2_763 = __builtin_shufflevector(__s2_763, __s2_763, 3, 2, 1, 0); \
-  __ret_763 = vqrdmlahh_s16(__s0_763, __s1_763, __noswap_vget_lane_s16(__rev2_763, __p3_763)); \
-  __ret_763; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahs_laneq_s32(__p0_764, __p1_764, __p2_764, __p3_764) __extension__ ({ \
-  int32_t __ret_764; \
-  int32_t __s0_764 = __p0_764; \
-  int32_t __s1_764 = __p1_764; \
-  int32x4_t __s2_764 = __p2_764; \
-  __ret_764 = vqrdmlahs_s32(__s0_764, __s1_764, vgetq_lane_s32(__s2_764, __p3_764)); \
-  __ret_764; \
-})
-#else
-#define vqrdmlahs_laneq_s32(__p0_765, __p1_765, __p2_765, __p3_765) __extension__ ({ \
-  int32_t __ret_765; \
-  int32_t __s0_765 = __p0_765; \
-  int32_t __s1_765 = __p1_765; \
-  int32x4_t __s2_765 = __p2_765; \
-  int32x4_t __rev2_765;  __rev2_765 = __builtin_shufflevector(__s2_765, __s2_765, 3, 2, 1, 0); \
-  __ret_765 = vqrdmlahs_s32(__s0_765, __s1_765, __noswap_vgetq_lane_s32(__rev2_765, __p3_765)); \
-  __ret_765; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahh_laneq_s16(__p0_766, __p1_766, __p2_766, __p3_766) __extension__ ({ \
-  int16_t __ret_766; \
-  int16_t __s0_766 = __p0_766; \
-  int16_t __s1_766 = __p1_766; \
-  int16x8_t __s2_766 = __p2_766; \
-  __ret_766 = vqrdmlahh_s16(__s0_766, __s1_766, vgetq_lane_s16(__s2_766, __p3_766)); \
-  __ret_766; \
-})
-#else
-#define vqrdmlahh_laneq_s16(__p0_767, __p1_767, __p2_767, __p3_767) __extension__ ({ \
-  int16_t __ret_767; \
-  int16_t __s0_767 = __p0_767; \
-  int16_t __s1_767 = __p1_767; \
-  int16x8_t __s2_767 = __p2_767; \
-  int16x8_t __rev2_767;  __rev2_767 = __builtin_shufflevector(__s2_767, __s2_767, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_767 = vqrdmlahh_s16(__s0_767, __s1_767, __noswap_vgetq_lane_s16(__rev2_767, __p3_767)); \
-  __ret_767; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahq_laneq_s32(__p0_768, __p1_768, __p2_768, __p3_768) __extension__ ({ \
-  int32x4_t __ret_768; \
-  int32x4_t __s0_768 = __p0_768; \
-  int32x4_t __s1_768 = __p1_768; \
-  int32x4_t __s2_768 = __p2_768; \
-  __ret_768 = vqrdmlahq_s32(__s0_768, __s1_768, splatq_laneq_s32(__s2_768, __p3_768)); \
-  __ret_768; \
-})
-#else
-#define vqrdmlahq_laneq_s32(__p0_769, __p1_769, __p2_769, __p3_769) __extension__ ({ \
-  int32x4_t __ret_769; \
-  int32x4_t __s0_769 = __p0_769; \
-  int32x4_t __s1_769 = __p1_769; \
-  int32x4_t __s2_769 = __p2_769; \
-  int32x4_t __rev0_769;  __rev0_769 = __builtin_shufflevector(__s0_769, __s0_769, 3, 2, 1, 0); \
-  int32x4_t __rev1_769;  __rev1_769 = __builtin_shufflevector(__s1_769, __s1_769, 3, 2, 1, 0); \
-  int32x4_t __rev2_769;  __rev2_769 = __builtin_shufflevector(__s2_769, __s2_769, 3, 2, 1, 0); \
-  __ret_769 = __noswap_vqrdmlahq_s32(__rev0_769, __rev1_769, __noswap_splatq_laneq_s32(__rev2_769, __p3_769)); \
-  __ret_769 = __builtin_shufflevector(__ret_769, __ret_769, 3, 2, 1, 0); \
-  __ret_769; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlahq_laneq_s16(__p0_770, __p1_770, __p2_770, __p3_770) __extension__ ({ \
-  int16x8_t __ret_770; \
-  int16x8_t __s0_770 = __p0_770; \
-  int16x8_t __s1_770 = __p1_770; \
-  int16x8_t __s2_770 = __p2_770; \
-  __ret_770 = vqrdmlahq_s16(__s0_770, __s1_770, splatq_laneq_s16(__s2_770, __p3_770)); \
-  __ret_770; \
-})
-#else
-#define vqrdmlahq_laneq_s16(__p0_771, __p1_771, __p2_771, __p3_771) __extension__ ({ \
-  int16x8_t __ret_771; \
-  int16x8_t __s0_771 = __p0_771; \
-  int16x8_t __s1_771 = __p1_771; \
-  int16x8_t __s2_771 = __p2_771; \
-  int16x8_t __rev0_771;  __rev0_771 = __builtin_shufflevector(__s0_771, __s0_771, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_771;  __rev1_771 = __builtin_shufflevector(__s1_771, __s1_771, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_771;  __rev2_771 = __builtin_shufflevector(__s2_771, __s2_771, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_771 = __noswap_vqrdmlahq_s16(__rev0_771, __rev1_771, __noswap_splatq_laneq_s16(__rev2_771, __p3_771)); \
-  __ret_771 = __builtin_shufflevector(__ret_771, __ret_771, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_771; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlah_laneq_s32(__p0_772, __p1_772, __p2_772, __p3_772) __extension__ ({ \
-  int32x2_t __ret_772; \
-  int32x2_t __s0_772 = __p0_772; \
-  int32x2_t __s1_772 = __p1_772; \
-  int32x4_t __s2_772 = __p2_772; \
-  __ret_772 = vqrdmlah_s32(__s0_772, __s1_772, splat_laneq_s32(__s2_772, __p3_772)); \
-  __ret_772; \
-})
-#else
-#define vqrdmlah_laneq_s32(__p0_773, __p1_773, __p2_773, __p3_773) __extension__ ({ \
-  int32x2_t __ret_773; \
-  int32x2_t __s0_773 = __p0_773; \
-  int32x2_t __s1_773 = __p1_773; \
-  int32x4_t __s2_773 = __p2_773; \
-  int32x2_t __rev0_773;  __rev0_773 = __builtin_shufflevector(__s0_773, __s0_773, 1, 0); \
-  int32x2_t __rev1_773;  __rev1_773 = __builtin_shufflevector(__s1_773, __s1_773, 1, 0); \
-  int32x4_t __rev2_773;  __rev2_773 = __builtin_shufflevector(__s2_773, __s2_773, 3, 2, 1, 0); \
-  __ret_773 = __noswap_vqrdmlah_s32(__rev0_773, __rev1_773, __noswap_splat_laneq_s32(__rev2_773, __p3_773)); \
-  __ret_773 = __builtin_shufflevector(__ret_773, __ret_773, 1, 0); \
-  __ret_773; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlah_laneq_s16(__p0_774, __p1_774, __p2_774, __p3_774) __extension__ ({ \
-  int16x4_t __ret_774; \
-  int16x4_t __s0_774 = __p0_774; \
-  int16x4_t __s1_774 = __p1_774; \
-  int16x8_t __s2_774 = __p2_774; \
-  __ret_774 = vqrdmlah_s16(__s0_774, __s1_774, splat_laneq_s16(__s2_774, __p3_774)); \
-  __ret_774; \
-})
-#else
-#define vqrdmlah_laneq_s16(__p0_775, __p1_775, __p2_775, __p3_775) __extension__ ({ \
-  int16x4_t __ret_775; \
-  int16x4_t __s0_775 = __p0_775; \
-  int16x4_t __s1_775 = __p1_775; \
-  int16x8_t __s2_775 = __p2_775; \
-  int16x4_t __rev0_775;  __rev0_775 = __builtin_shufflevector(__s0_775, __s0_775, 3, 2, 1, 0); \
-  int16x4_t __rev1_775;  __rev1_775 = __builtin_shufflevector(__s1_775, __s1_775, 3, 2, 1, 0); \
-  int16x8_t __rev2_775;  __rev2_775 = __builtin_shufflevector(__s2_775, __s2_775, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_775 = __noswap_vqrdmlah_s16(__rev0_775, __rev1_775, __noswap_splat_laneq_s16(__rev2_775, __p3_775)); \
-  __ret_775 = __builtin_shufflevector(__ret_775, __ret_775, 3, 2, 1, 0); \
-  __ret_775; \
-})
-#endif
-
-__ai __attribute__((target("v8.1a,neon"))) int32_t vqrdmlshs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
-  int32_t __ret;
-  __ret = (int32_t) __builtin_neon_vqrdmlshs_s32(__p0, __p1, __p2);
-  return __ret;
-}
-__ai __attribute__((target("v8.1a,neon"))) int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
-  int16_t __ret;
-  __ret = (int16_t) __builtin_neon_vqrdmlshh_s16(__p0, __p1, __p2);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshs_lane_s32(__p0_776, __p1_776, __p2_776, __p3_776) __extension__ ({ \
-  int32_t __ret_776; \
-  int32_t __s0_776 = __p0_776; \
-  int32_t __s1_776 = __p1_776; \
-  int32x2_t __s2_776 = __p2_776; \
-  __ret_776 = vqrdmlshs_s32(__s0_776, __s1_776, vget_lane_s32(__s2_776, __p3_776)); \
-  __ret_776; \
-})
-#else
-#define vqrdmlshs_lane_s32(__p0_777, __p1_777, __p2_777, __p3_777) __extension__ ({ \
-  int32_t __ret_777; \
-  int32_t __s0_777 = __p0_777; \
-  int32_t __s1_777 = __p1_777; \
-  int32x2_t __s2_777 = __p2_777; \
-  int32x2_t __rev2_777;  __rev2_777 = __builtin_shufflevector(__s2_777, __s2_777, 1, 0); \
-  __ret_777 = vqrdmlshs_s32(__s0_777, __s1_777, __noswap_vget_lane_s32(__rev2_777, __p3_777)); \
-  __ret_777; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshh_lane_s16(__p0_778, __p1_778, __p2_778, __p3_778) __extension__ ({ \
-  int16_t __ret_778; \
-  int16_t __s0_778 = __p0_778; \
-  int16_t __s1_778 = __p1_778; \
-  int16x4_t __s2_778 = __p2_778; \
-  __ret_778 = vqrdmlshh_s16(__s0_778, __s1_778, vget_lane_s16(__s2_778, __p3_778)); \
-  __ret_778; \
-})
-#else
-#define vqrdmlshh_lane_s16(__p0_779, __p1_779, __p2_779, __p3_779) __extension__ ({ \
-  int16_t __ret_779; \
-  int16_t __s0_779 = __p0_779; \
-  int16_t __s1_779 = __p1_779; \
-  int16x4_t __s2_779 = __p2_779; \
-  int16x4_t __rev2_779;  __rev2_779 = __builtin_shufflevector(__s2_779, __s2_779, 3, 2, 1, 0); \
-  __ret_779 = vqrdmlshh_s16(__s0_779, __s1_779, __noswap_vget_lane_s16(__rev2_779, __p3_779)); \
-  __ret_779; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshs_laneq_s32(__p0_780, __p1_780, __p2_780, __p3_780) __extension__ ({ \
-  int32_t __ret_780; \
-  int32_t __s0_780 = __p0_780; \
-  int32_t __s1_780 = __p1_780; \
-  int32x4_t __s2_780 = __p2_780; \
-  __ret_780 = vqrdmlshs_s32(__s0_780, __s1_780, vgetq_lane_s32(__s2_780, __p3_780)); \
-  __ret_780; \
-})
-#else
-#define vqrdmlshs_laneq_s32(__p0_781, __p1_781, __p2_781, __p3_781) __extension__ ({ \
-  int32_t __ret_781; \
-  int32_t __s0_781 = __p0_781; \
-  int32_t __s1_781 = __p1_781; \
-  int32x4_t __s2_781 = __p2_781; \
-  int32x4_t __rev2_781;  __rev2_781 = __builtin_shufflevector(__s2_781, __s2_781, 3, 2, 1, 0); \
-  __ret_781 = vqrdmlshs_s32(__s0_781, __s1_781, __noswap_vgetq_lane_s32(__rev2_781, __p3_781)); \
-  __ret_781; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshh_laneq_s16(__p0_782, __p1_782, __p2_782, __p3_782) __extension__ ({ \
-  int16_t __ret_782; \
-  int16_t __s0_782 = __p0_782; \
-  int16_t __s1_782 = __p1_782; \
-  int16x8_t __s2_782 = __p2_782; \
-  __ret_782 = vqrdmlshh_s16(__s0_782, __s1_782, vgetq_lane_s16(__s2_782, __p3_782)); \
-  __ret_782; \
-})
-#else
-#define vqrdmlshh_laneq_s16(__p0_783, __p1_783, __p2_783, __p3_783) __extension__ ({ \
-  int16_t __ret_783; \
-  int16_t __s0_783 = __p0_783; \
-  int16_t __s1_783 = __p1_783; \
-  int16x8_t __s2_783 = __p2_783; \
-  int16x8_t __rev2_783;  __rev2_783 = __builtin_shufflevector(__s2_783, __s2_783, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_783 = vqrdmlshh_s16(__s0_783, __s1_783, __noswap_vgetq_lane_s16(__rev2_783, __p3_783)); \
-  __ret_783; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshq_laneq_s32(__p0_784, __p1_784, __p2_784, __p3_784) __extension__ ({ \
-  int32x4_t __ret_784; \
-  int32x4_t __s0_784 = __p0_784; \
-  int32x4_t __s1_784 = __p1_784; \
-  int32x4_t __s2_784 = __p2_784; \
-  __ret_784 = vqrdmlshq_s32(__s0_784, __s1_784, splatq_laneq_s32(__s2_784, __p3_784)); \
-  __ret_784; \
-})
-#else
-#define vqrdmlshq_laneq_s32(__p0_785, __p1_785, __p2_785, __p3_785) __extension__ ({ \
-  int32x4_t __ret_785; \
-  int32x4_t __s0_785 = __p0_785; \
-  int32x4_t __s1_785 = __p1_785; \
-  int32x4_t __s2_785 = __p2_785; \
-  int32x4_t __rev0_785;  __rev0_785 = __builtin_shufflevector(__s0_785, __s0_785, 3, 2, 1, 0); \
-  int32x4_t __rev1_785;  __rev1_785 = __builtin_shufflevector(__s1_785, __s1_785, 3, 2, 1, 0); \
-  int32x4_t __rev2_785;  __rev2_785 = __builtin_shufflevector(__s2_785, __s2_785, 3, 2, 1, 0); \
-  __ret_785 = __noswap_vqrdmlshq_s32(__rev0_785, __rev1_785, __noswap_splatq_laneq_s32(__rev2_785, __p3_785)); \
-  __ret_785 = __builtin_shufflevector(__ret_785, __ret_785, 3, 2, 1, 0); \
-  __ret_785; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlshq_laneq_s16(__p0_786, __p1_786, __p2_786, __p3_786) __extension__ ({ \
-  int16x8_t __ret_786; \
-  int16x8_t __s0_786 = __p0_786; \
-  int16x8_t __s1_786 = __p1_786; \
-  int16x8_t __s2_786 = __p2_786; \
-  __ret_786 = vqrdmlshq_s16(__s0_786, __s1_786, splatq_laneq_s16(__s2_786, __p3_786)); \
-  __ret_786; \
-})
-#else
-#define vqrdmlshq_laneq_s16(__p0_787, __p1_787, __p2_787, __p3_787) __extension__ ({ \
-  int16x8_t __ret_787; \
-  int16x8_t __s0_787 = __p0_787; \
-  int16x8_t __s1_787 = __p1_787; \
-  int16x8_t __s2_787 = __p2_787; \
-  int16x8_t __rev0_787;  __rev0_787 = __builtin_shufflevector(__s0_787, __s0_787, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev1_787;  __rev1_787 = __builtin_shufflevector(__s1_787, __s1_787, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int16x8_t __rev2_787;  __rev2_787 = __builtin_shufflevector(__s2_787, __s2_787, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_787 = __noswap_vqrdmlshq_s16(__rev0_787, __rev1_787, __noswap_splatq_laneq_s16(__rev2_787, __p3_787)); \
-  __ret_787 = __builtin_shufflevector(__ret_787, __ret_787, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_787; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlsh_laneq_s32(__p0_788, __p1_788, __p2_788, __p3_788) __extension__ ({ \
-  int32x2_t __ret_788; \
-  int32x2_t __s0_788 = __p0_788; \
-  int32x2_t __s1_788 = __p1_788; \
-  int32x4_t __s2_788 = __p2_788; \
-  __ret_788 = vqrdmlsh_s32(__s0_788, __s1_788, splat_laneq_s32(__s2_788, __p3_788)); \
-  __ret_788; \
-})
-#else
-#define vqrdmlsh_laneq_s32(__p0_789, __p1_789, __p2_789, __p3_789) __extension__ ({ \
-  int32x2_t __ret_789; \
-  int32x2_t __s0_789 = __p0_789; \
-  int32x2_t __s1_789 = __p1_789; \
-  int32x4_t __s2_789 = __p2_789; \
-  int32x2_t __rev0_789;  __rev0_789 = __builtin_shufflevector(__s0_789, __s0_789, 1, 0); \
-  int32x2_t __rev1_789;  __rev1_789 = __builtin_shufflevector(__s1_789, __s1_789, 1, 0); \
-  int32x4_t __rev2_789;  __rev2_789 = __builtin_shufflevector(__s2_789, __s2_789, 3, 2, 1, 0); \
-  __ret_789 = __noswap_vqrdmlsh_s32(__rev0_789, __rev1_789, __noswap_splat_laneq_s32(__rev2_789, __p3_789)); \
-  __ret_789 = __builtin_shufflevector(__ret_789, __ret_789, 1, 0); \
-  __ret_789; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vqrdmlsh_laneq_s16(__p0_790, __p1_790, __p2_790, __p3_790) __extension__ ({ \
-  int16x4_t __ret_790; \
-  int16x4_t __s0_790 = __p0_790; \
-  int16x4_t __s1_790 = __p1_790; \
-  int16x8_t __s2_790 = __p2_790; \
-  __ret_790 = vqrdmlsh_s16(__s0_790, __s1_790, splat_laneq_s16(__s2_790, __p3_790)); \
-  __ret_790; \
-})
-#else
-#define vqrdmlsh_laneq_s16(__p0_791, __p1_791, __p2_791, __p3_791) __extension__ ({ \
-  int16x4_t __ret_791; \
-  int16x4_t __s0_791 = __p0_791; \
-  int16x4_t __s1_791 = __p1_791; \
-  int16x8_t __s2_791 = __p2_791; \
-  int16x4_t __rev0_791;  __rev0_791 = __builtin_shufflevector(__s0_791, __s0_791, 3, 2, 1, 0); \
-  int16x4_t __rev1_791;  __rev1_791 = __builtin_shufflevector(__s1_791, __s1_791, 3, 2, 1, 0); \
-  int16x8_t __rev2_791;  __rev2_791 = __builtin_shufflevector(__s2_791, __s2_791, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_791 = __noswap_vqrdmlsh_s16(__rev0_791, __rev1_791, __noswap_splat_laneq_s16(__rev2_791, __p3_791)); \
-  __ret_791 = __builtin_shufflevector(__ret_791, __ret_791, 3, 2, 1, 0); \
-  __ret_791; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcaddq_rot270_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcaddq_rot270_f64((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcaddq_rot270_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcaddq_rot270_f64((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcaddq_rot90_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcaddq_rot90_f64((int8x16_t)__p0, (int8x16_t)__p1, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcaddq_rot90_f64(float64x2_t __p0, float64x2_t __p1) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcaddq_rot90_f64((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_f64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vcmla_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
-  return __ret;
-}
-#define vcmla_lane_f64(__p0_792, __p1_792, __p2_792, __p3_792) __extension__ ({ \
-  float64x1_t __ret_792; \
-  float64x1_t __s0_792 = __p0_792; \
-  float64x1_t __s1_792 = __p1_792; \
-  float64x1_t __s2_792 = __p2_792; \
-float64x1_t __reint_792 = __s2_792; \
-uint64x2_t __reint1_792 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_792, __p3_792), vgetq_lane_u64(*(uint64x2_t *) &__reint_792, __p3_792)}; \
-  __ret_792 = vcmla_f64(__s0_792, __s1_792, *(float64x1_t *) &__reint1_792); \
-  __ret_792; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_lane_f64(__p0_793, __p1_793, __p2_793, __p3_793) __extension__ ({ \
-  float64x2_t __ret_793; \
-  float64x2_t __s0_793 = __p0_793; \
-  float64x2_t __s1_793 = __p1_793; \
-  float64x1_t __s2_793 = __p2_793; \
-float64x1_t __reint_793 = __s2_793; \
-uint64x2_t __reint1_793 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_793, __p3_793), vgetq_lane_u64(*(uint64x2_t *) &__reint_793, __p3_793)}; \
-  __ret_793 = vcmlaq_f64(__s0_793, __s1_793, *(float64x2_t *) &__reint1_793); \
-  __ret_793; \
-})
-#else
-#define vcmlaq_lane_f64(__p0_794, __p1_794, __p2_794, __p3_794) __extension__ ({ \
-  float64x2_t __ret_794; \
-  float64x2_t __s0_794 = __p0_794; \
-  float64x2_t __s1_794 = __p1_794; \
-  float64x1_t __s2_794 = __p2_794; \
-  float64x2_t __rev0_794;  __rev0_794 = __builtin_shufflevector(__s0_794, __s0_794, 1, 0); \
-  float64x2_t __rev1_794;  __rev1_794 = __builtin_shufflevector(__s1_794, __s1_794, 1, 0); \
-float64x1_t __reint_794 = __s2_794; \
-uint64x2_t __reint1_794 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_794, __p3_794), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_794, __p3_794)}; \
-  __ret_794 = __noswap_vcmlaq_f64(__rev0_794, __rev1_794, *(float64x2_t *) &__reint1_794); \
-  __ret_794 = __builtin_shufflevector(__ret_794, __ret_794, 1, 0); \
-  __ret_794; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_laneq_f64(__p0_795, __p1_795, __p2_795, __p3_795) __extension__ ({ \
-  float64x1_t __ret_795; \
-  float64x1_t __s0_795 = __p0_795; \
-  float64x1_t __s1_795 = __p1_795; \
-  float64x2_t __s2_795 = __p2_795; \
-float64x2_t __reint_795 = __s2_795; \
-uint64x2_t __reint1_795 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_795, __p3_795), vgetq_lane_u64(*(uint64x2_t *) &__reint_795, __p3_795)}; \
-  __ret_795 = vcmla_f64(__s0_795, __s1_795, *(float64x1_t *) &__reint1_795); \
-  __ret_795; \
-})
-#else
-#define vcmla_laneq_f64(__p0_796, __p1_796, __p2_796, __p3_796) __extension__ ({ \
-  float64x1_t __ret_796; \
-  float64x1_t __s0_796 = __p0_796; \
-  float64x1_t __s1_796 = __p1_796; \
-  float64x2_t __s2_796 = __p2_796; \
-  float64x2_t __rev2_796;  __rev2_796 = __builtin_shufflevector(__s2_796, __s2_796, 1, 0); \
-float64x2_t __reint_796 = __rev2_796; \
-uint64x2_t __reint1_796 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_796, __p3_796), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_796, __p3_796)}; \
-  __ret_796 = vcmla_f64(__s0_796, __s1_796, *(float64x1_t *) &__reint1_796); \
-  __ret_796; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_laneq_f64(__p0_797, __p1_797, __p2_797, __p3_797) __extension__ ({ \
-  float64x2_t __ret_797; \
-  float64x2_t __s0_797 = __p0_797; \
-  float64x2_t __s1_797 = __p1_797; \
-  float64x2_t __s2_797 = __p2_797; \
-float64x2_t __reint_797 = __s2_797; \
-uint64x2_t __reint1_797 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_797, __p3_797), vgetq_lane_u64(*(uint64x2_t *) &__reint_797, __p3_797)}; \
-  __ret_797 = vcmlaq_f64(__s0_797, __s1_797, *(float64x2_t *) &__reint1_797); \
-  __ret_797; \
-})
-#else
-#define vcmlaq_laneq_f64(__p0_798, __p1_798, __p2_798, __p3_798) __extension__ ({ \
-  float64x2_t __ret_798; \
-  float64x2_t __s0_798 = __p0_798; \
-  float64x2_t __s1_798 = __p1_798; \
-  float64x2_t __s2_798 = __p2_798; \
-  float64x2_t __rev0_798;  __rev0_798 = __builtin_shufflevector(__s0_798, __s0_798, 1, 0); \
-  float64x2_t __rev1_798;  __rev1_798 = __builtin_shufflevector(__s1_798, __s1_798, 1, 0); \
-  float64x2_t __rev2_798;  __rev2_798 = __builtin_shufflevector(__s2_798, __s2_798, 1, 0); \
-float64x2_t __reint_798 = __rev2_798; \
-uint64x2_t __reint1_798 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_798, __p3_798), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_798, __p3_798)}; \
-  __ret_798 = __noswap_vcmlaq_f64(__rev0_798, __rev1_798, *(float64x2_t *) &__reint1_798); \
-  __ret_798 = __builtin_shufflevector(__ret_798, __ret_798, 1, 0); \
-  __ret_798; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_f64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_rot180_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vcmla_rot180_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
-  return __ret;
-}
-#define vcmla_rot180_lane_f64(__p0_799, __p1_799, __p2_799, __p3_799) __extension__ ({ \
-  float64x1_t __ret_799; \
-  float64x1_t __s0_799 = __p0_799; \
-  float64x1_t __s1_799 = __p1_799; \
-  float64x1_t __s2_799 = __p2_799; \
-float64x1_t __reint_799 = __s2_799; \
-uint64x2_t __reint1_799 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_799, __p3_799), vgetq_lane_u64(*(uint64x2_t *) &__reint_799, __p3_799)}; \
-  __ret_799 = vcmla_rot180_f64(__s0_799, __s1_799, *(float64x1_t *) &__reint1_799); \
-  __ret_799; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot180_lane_f64(__p0_800, __p1_800, __p2_800, __p3_800) __extension__ ({ \
-  float64x2_t __ret_800; \
-  float64x2_t __s0_800 = __p0_800; \
-  float64x2_t __s1_800 = __p1_800; \
-  float64x1_t __s2_800 = __p2_800; \
-float64x1_t __reint_800 = __s2_800; \
-uint64x2_t __reint1_800 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_800, __p3_800), vgetq_lane_u64(*(uint64x2_t *) &__reint_800, __p3_800)}; \
-  __ret_800 = vcmlaq_rot180_f64(__s0_800, __s1_800, *(float64x2_t *) &__reint1_800); \
-  __ret_800; \
-})
-#else
-#define vcmlaq_rot180_lane_f64(__p0_801, __p1_801, __p2_801, __p3_801) __extension__ ({ \
-  float64x2_t __ret_801; \
-  float64x2_t __s0_801 = __p0_801; \
-  float64x2_t __s1_801 = __p1_801; \
-  float64x1_t __s2_801 = __p2_801; \
-  float64x2_t __rev0_801;  __rev0_801 = __builtin_shufflevector(__s0_801, __s0_801, 1, 0); \
-  float64x2_t __rev1_801;  __rev1_801 = __builtin_shufflevector(__s1_801, __s1_801, 1, 0); \
-float64x1_t __reint_801 = __s2_801; \
-uint64x2_t __reint1_801 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_801, __p3_801), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_801, __p3_801)}; \
-  __ret_801 = __noswap_vcmlaq_rot180_f64(__rev0_801, __rev1_801, *(float64x2_t *) &__reint1_801); \
-  __ret_801 = __builtin_shufflevector(__ret_801, __ret_801, 1, 0); \
-  __ret_801; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot180_laneq_f64(__p0_802, __p1_802, __p2_802, __p3_802) __extension__ ({ \
-  float64x1_t __ret_802; \
-  float64x1_t __s0_802 = __p0_802; \
-  float64x1_t __s1_802 = __p1_802; \
-  float64x2_t __s2_802 = __p2_802; \
-float64x2_t __reint_802 = __s2_802; \
-uint64x2_t __reint1_802 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_802, __p3_802), vgetq_lane_u64(*(uint64x2_t *) &__reint_802, __p3_802)}; \
-  __ret_802 = vcmla_rot180_f64(__s0_802, __s1_802, *(float64x1_t *) &__reint1_802); \
-  __ret_802; \
-})
-#else
-#define vcmla_rot180_laneq_f64(__p0_803, __p1_803, __p2_803, __p3_803) __extension__ ({ \
-  float64x1_t __ret_803; \
-  float64x1_t __s0_803 = __p0_803; \
-  float64x1_t __s1_803 = __p1_803; \
-  float64x2_t __s2_803 = __p2_803; \
-  float64x2_t __rev2_803;  __rev2_803 = __builtin_shufflevector(__s2_803, __s2_803, 1, 0); \
-float64x2_t __reint_803 = __rev2_803; \
-uint64x2_t __reint1_803 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_803, __p3_803), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_803, __p3_803)}; \
-  __ret_803 = vcmla_rot180_f64(__s0_803, __s1_803, *(float64x1_t *) &__reint1_803); \
-  __ret_803; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot180_laneq_f64(__p0_804, __p1_804, __p2_804, __p3_804) __extension__ ({ \
-  float64x2_t __ret_804; \
-  float64x2_t __s0_804 = __p0_804; \
-  float64x2_t __s1_804 = __p1_804; \
-  float64x2_t __s2_804 = __p2_804; \
-float64x2_t __reint_804 = __s2_804; \
-uint64x2_t __reint1_804 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_804, __p3_804), vgetq_lane_u64(*(uint64x2_t *) &__reint_804, __p3_804)}; \
-  __ret_804 = vcmlaq_rot180_f64(__s0_804, __s1_804, *(float64x2_t *) &__reint1_804); \
-  __ret_804; \
-})
-#else
-#define vcmlaq_rot180_laneq_f64(__p0_805, __p1_805, __p2_805, __p3_805) __extension__ ({ \
-  float64x2_t __ret_805; \
-  float64x2_t __s0_805 = __p0_805; \
-  float64x2_t __s1_805 = __p1_805; \
-  float64x2_t __s2_805 = __p2_805; \
-  float64x2_t __rev0_805;  __rev0_805 = __builtin_shufflevector(__s0_805, __s0_805, 1, 0); \
-  float64x2_t __rev1_805;  __rev1_805 = __builtin_shufflevector(__s1_805, __s1_805, 1, 0); \
-  float64x2_t __rev2_805;  __rev2_805 = __builtin_shufflevector(__s2_805, __s2_805, 1, 0); \
-float64x2_t __reint_805 = __rev2_805; \
-uint64x2_t __reint1_805 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_805, __p3_805), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_805, __p3_805)}; \
-  __ret_805 = __noswap_vcmlaq_rot180_f64(__rev0_805, __rev1_805, *(float64x2_t *) &__reint1_805); \
-  __ret_805 = __builtin_shufflevector(__ret_805, __ret_805, 1, 0); \
-  __ret_805; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_f64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_rot270_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vcmla_rot270_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
-  return __ret;
-}
-#define vcmla_rot270_lane_f64(__p0_806, __p1_806, __p2_806, __p3_806) __extension__ ({ \
-  float64x1_t __ret_806; \
-  float64x1_t __s0_806 = __p0_806; \
-  float64x1_t __s1_806 = __p1_806; \
-  float64x1_t __s2_806 = __p2_806; \
-float64x1_t __reint_806 = __s2_806; \
-uint64x2_t __reint1_806 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_806, __p3_806), vgetq_lane_u64(*(uint64x2_t *) &__reint_806, __p3_806)}; \
-  __ret_806 = vcmla_rot270_f64(__s0_806, __s1_806, *(float64x1_t *) &__reint1_806); \
-  __ret_806; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot270_lane_f64(__p0_807, __p1_807, __p2_807, __p3_807) __extension__ ({ \
-  float64x2_t __ret_807; \
-  float64x2_t __s0_807 = __p0_807; \
-  float64x2_t __s1_807 = __p1_807; \
-  float64x1_t __s2_807 = __p2_807; \
-float64x1_t __reint_807 = __s2_807; \
-uint64x2_t __reint1_807 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_807, __p3_807), vgetq_lane_u64(*(uint64x2_t *) &__reint_807, __p3_807)}; \
-  __ret_807 = vcmlaq_rot270_f64(__s0_807, __s1_807, *(float64x2_t *) &__reint1_807); \
-  __ret_807; \
-})
-#else
-#define vcmlaq_rot270_lane_f64(__p0_808, __p1_808, __p2_808, __p3_808) __extension__ ({ \
-  float64x2_t __ret_808; \
-  float64x2_t __s0_808 = __p0_808; \
-  float64x2_t __s1_808 = __p1_808; \
-  float64x1_t __s2_808 = __p2_808; \
-  float64x2_t __rev0_808;  __rev0_808 = __builtin_shufflevector(__s0_808, __s0_808, 1, 0); \
-  float64x2_t __rev1_808;  __rev1_808 = __builtin_shufflevector(__s1_808, __s1_808, 1, 0); \
-float64x1_t __reint_808 = __s2_808; \
-uint64x2_t __reint1_808 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_808, __p3_808), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_808, __p3_808)}; \
-  __ret_808 = __noswap_vcmlaq_rot270_f64(__rev0_808, __rev1_808, *(float64x2_t *) &__reint1_808); \
-  __ret_808 = __builtin_shufflevector(__ret_808, __ret_808, 1, 0); \
-  __ret_808; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot270_laneq_f64(__p0_809, __p1_809, __p2_809, __p3_809) __extension__ ({ \
-  float64x1_t __ret_809; \
-  float64x1_t __s0_809 = __p0_809; \
-  float64x1_t __s1_809 = __p1_809; \
-  float64x2_t __s2_809 = __p2_809; \
-float64x2_t __reint_809 = __s2_809; \
-uint64x2_t __reint1_809 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_809, __p3_809), vgetq_lane_u64(*(uint64x2_t *) &__reint_809, __p3_809)}; \
-  __ret_809 = vcmla_rot270_f64(__s0_809, __s1_809, *(float64x1_t *) &__reint1_809); \
-  __ret_809; \
-})
-#else
-#define vcmla_rot270_laneq_f64(__p0_810, __p1_810, __p2_810, __p3_810) __extension__ ({ \
-  float64x1_t __ret_810; \
-  float64x1_t __s0_810 = __p0_810; \
-  float64x1_t __s1_810 = __p1_810; \
-  float64x2_t __s2_810 = __p2_810; \
-  float64x2_t __rev2_810;  __rev2_810 = __builtin_shufflevector(__s2_810, __s2_810, 1, 0); \
-float64x2_t __reint_810 = __rev2_810; \
-uint64x2_t __reint1_810 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_810, __p3_810), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_810, __p3_810)}; \
-  __ret_810 = vcmla_rot270_f64(__s0_810, __s1_810, *(float64x1_t *) &__reint1_810); \
-  __ret_810; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot270_laneq_f64(__p0_811, __p1_811, __p2_811, __p3_811) __extension__ ({ \
-  float64x2_t __ret_811; \
-  float64x2_t __s0_811 = __p0_811; \
-  float64x2_t __s1_811 = __p1_811; \
-  float64x2_t __s2_811 = __p2_811; \
-float64x2_t __reint_811 = __s2_811; \
-uint64x2_t __reint1_811 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_811, __p3_811), vgetq_lane_u64(*(uint64x2_t *) &__reint_811, __p3_811)}; \
-  __ret_811 = vcmlaq_rot270_f64(__s0_811, __s1_811, *(float64x2_t *) &__reint1_811); \
-  __ret_811; \
-})
-#else
-#define vcmlaq_rot270_laneq_f64(__p0_812, __p1_812, __p2_812, __p3_812) __extension__ ({ \
-  float64x2_t __ret_812; \
-  float64x2_t __s0_812 = __p0_812; \
-  float64x2_t __s1_812 = __p1_812; \
-  float64x2_t __s2_812 = __p2_812; \
-  float64x2_t __rev0_812;  __rev0_812 = __builtin_shufflevector(__s0_812, __s0_812, 1, 0); \
-  float64x2_t __rev1_812;  __rev1_812 = __builtin_shufflevector(__s1_812, __s1_812, 1, 0); \
-  float64x2_t __rev2_812;  __rev2_812 = __builtin_shufflevector(__s2_812, __s2_812, 1, 0); \
-float64x2_t __reint_812 = __rev2_812; \
-uint64x2_t __reint1_812 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_812, __p3_812), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_812, __p3_812)}; \
-  __ret_812 = __noswap_vcmlaq_rot270_f64(__rev0_812, __rev1_812, *(float64x2_t *) &__reint1_812); \
-  __ret_812 = __builtin_shufflevector(__ret_812, __ret_812, 1, 0); \
-  __ret_812; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_f64((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_rot90_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vcmla_rot90_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
-  return __ret;
-}
-#define vcmla_rot90_lane_f64(__p0_813, __p1_813, __p2_813, __p3_813) __extension__ ({ \
-  float64x1_t __ret_813; \
-  float64x1_t __s0_813 = __p0_813; \
-  float64x1_t __s1_813 = __p1_813; \
-  float64x1_t __s2_813 = __p2_813; \
-float64x1_t __reint_813 = __s2_813; \
-uint64x2_t __reint1_813 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_813, __p3_813), vgetq_lane_u64(*(uint64x2_t *) &__reint_813, __p3_813)}; \
-  __ret_813 = vcmla_rot90_f64(__s0_813, __s1_813, *(float64x1_t *) &__reint1_813); \
-  __ret_813; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot90_lane_f64(__p0_814, __p1_814, __p2_814, __p3_814) __extension__ ({ \
-  float64x2_t __ret_814; \
-  float64x2_t __s0_814 = __p0_814; \
-  float64x2_t __s1_814 = __p1_814; \
-  float64x1_t __s2_814 = __p2_814; \
-float64x1_t __reint_814 = __s2_814; \
-uint64x2_t __reint1_814 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_814, __p3_814), vgetq_lane_u64(*(uint64x2_t *) &__reint_814, __p3_814)}; \
-  __ret_814 = vcmlaq_rot90_f64(__s0_814, __s1_814, *(float64x2_t *) &__reint1_814); \
-  __ret_814; \
-})
-#else
-#define vcmlaq_rot90_lane_f64(__p0_815, __p1_815, __p2_815, __p3_815) __extension__ ({ \
-  float64x2_t __ret_815; \
-  float64x2_t __s0_815 = __p0_815; \
-  float64x2_t __s1_815 = __p1_815; \
-  float64x1_t __s2_815 = __p2_815; \
-  float64x2_t __rev0_815;  __rev0_815 = __builtin_shufflevector(__s0_815, __s0_815, 1, 0); \
-  float64x2_t __rev1_815;  __rev1_815 = __builtin_shufflevector(__s1_815, __s1_815, 1, 0); \
-float64x1_t __reint_815 = __s2_815; \
-uint64x2_t __reint1_815 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_815, __p3_815), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_815, __p3_815)}; \
-  __ret_815 = __noswap_vcmlaq_rot90_f64(__rev0_815, __rev1_815, *(float64x2_t *) &__reint1_815); \
-  __ret_815 = __builtin_shufflevector(__ret_815, __ret_815, 1, 0); \
-  __ret_815; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmla_rot90_laneq_f64(__p0_816, __p1_816, __p2_816, __p3_816) __extension__ ({ \
-  float64x1_t __ret_816; \
-  float64x1_t __s0_816 = __p0_816; \
-  float64x1_t __s1_816 = __p1_816; \
-  float64x2_t __s2_816 = __p2_816; \
-float64x2_t __reint_816 = __s2_816; \
-uint64x2_t __reint1_816 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_816, __p3_816), vgetq_lane_u64(*(uint64x2_t *) &__reint_816, __p3_816)}; \
-  __ret_816 = vcmla_rot90_f64(__s0_816, __s1_816, *(float64x1_t *) &__reint1_816); \
-  __ret_816; \
-})
-#else
-#define vcmla_rot90_laneq_f64(__p0_817, __p1_817, __p2_817, __p3_817) __extension__ ({ \
-  float64x1_t __ret_817; \
-  float64x1_t __s0_817 = __p0_817; \
-  float64x1_t __s1_817 = __p1_817; \
-  float64x2_t __s2_817 = __p2_817; \
-  float64x2_t __rev2_817;  __rev2_817 = __builtin_shufflevector(__s2_817, __s2_817, 1, 0); \
-float64x2_t __reint_817 = __rev2_817; \
-uint64x2_t __reint1_817 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_817, __p3_817), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_817, __p3_817)}; \
-  __ret_817 = vcmla_rot90_f64(__s0_817, __s1_817, *(float64x1_t *) &__reint1_817); \
-  __ret_817; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcmlaq_rot90_laneq_f64(__p0_818, __p1_818, __p2_818, __p3_818) __extension__ ({ \
-  float64x2_t __ret_818; \
-  float64x2_t __s0_818 = __p0_818; \
-  float64x2_t __s1_818 = __p1_818; \
-  float64x2_t __s2_818 = __p2_818; \
-float64x2_t __reint_818 = __s2_818; \
-uint64x2_t __reint1_818 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_818, __p3_818), vgetq_lane_u64(*(uint64x2_t *) &__reint_818, __p3_818)}; \
-  __ret_818 = vcmlaq_rot90_f64(__s0_818, __s1_818, *(float64x2_t *) &__reint1_818); \
-  __ret_818; \
-})
-#else
-#define vcmlaq_rot90_laneq_f64(__p0_819, __p1_819, __p2_819, __p3_819) __extension__ ({ \
-  float64x2_t __ret_819; \
-  float64x2_t __s0_819 = __p0_819; \
-  float64x2_t __s1_819 = __p1_819; \
-  float64x2_t __s2_819 = __p2_819; \
-  float64x2_t __rev0_819;  __rev0_819 = __builtin_shufflevector(__s0_819, __s0_819, 1, 0); \
-  float64x2_t __rev1_819;  __rev1_819 = __builtin_shufflevector(__s1_819, __s1_819, 1, 0); \
-  float64x2_t __rev2_819;  __rev2_819 = __builtin_shufflevector(__s2_819, __s2_819, 1, 0); \
-float64x2_t __reint_819 = __rev2_819; \
-uint64x2_t __reint1_819 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_819, __p3_819), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_819, __p3_819)}; \
-  __ret_819 = __noswap_vcmlaq_rot90_f64(__rev0_819, __rev1_819, *(float64x2_t *) &__reint1_819); \
-  __ret_819 = __builtin_shufflevector(__ret_819, __ret_819, 1, 0); \
-  __ret_819; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd32xq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrnd32xq_f32((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd32xq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrnd32xq_f32((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd32x_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrnd32x_f32((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd32x_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrnd32x_f32((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd32xq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrnd32xq_f64((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd32xq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrnd32xq_f64((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.5a,neon"))) float64x1_t vrnd32x_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrnd32x_f64((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd32zq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrnd32zq_f32((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd32zq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrnd32zq_f32((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd32z_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrnd32z_f32((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd32z_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrnd32z_f32((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd32zq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrnd32zq_f64((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd32zq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrnd32zq_f64((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.5a,neon"))) float64x1_t vrnd32z_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrnd32z_f64((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd64xq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrnd64xq_f32((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd64xq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrnd64xq_f32((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd64x_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrnd64x_f32((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd64x_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrnd64x_f32((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd64xq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrnd64xq_f64((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd64xq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrnd64xq_f64((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.5a,neon"))) float64x1_t vrnd64x_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrnd64x_f64((int8x8_t)__p0, 10);
-  return __ret;
-}
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd64zq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  __ret = (float32x4_t) __builtin_neon_vrnd64zq_f32((int8x16_t)__p0, 41);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x4_t vrnd64zq_f32(float32x4_t __p0) {
-  float32x4_t __ret;
-  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  __ret = (float32x4_t) __builtin_neon_vrnd64zq_f32((int8x16_t)__rev0, 41);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd64z_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  __ret = (float32x2_t) __builtin_neon_vrnd64z_f32((int8x8_t)__p0, 9);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float32x2_t vrnd64z_f32(float32x2_t __p0) {
-  float32x2_t __ret;
-  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float32x2_t) __builtin_neon_vrnd64z_f32((int8x8_t)__rev0, 9);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd64zq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  __ret = (float64x2_t) __builtin_neon_vrnd64zq_f64((int8x16_t)__p0, 42);
-  return __ret;
-}
-#else
-__ai __attribute__((target("v8.5a,neon"))) float64x2_t vrnd64zq_f64(float64x2_t __p0) {
-  float64x2_t __ret;
-  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  __ret = (float64x2_t) __builtin_neon_vrnd64zq_f64((int8x16_t)__rev0, 42);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-__ai __attribute__((target("v8.5a,neon"))) float64x1_t vrnd64z_f64(float64x1_t __p0) {
-  float64x1_t __ret;
-  __ret = (float64x1_t) __builtin_neon_vrnd64z_f64((int8x8_t)__p0, 10);
-  return __ret;
-}
-#endif
-#ifdef __LITTLE_ENDIAN__
-#define vbfdotq_lane_f32(__p0_820, __p1_820, __p2_820, __p3_820) __extension__ ({ \
-  float32x4_t __ret_820; \
-  float32x4_t __s0_820 = __p0_820; \
-  bfloat16x8_t __s1_820 = __p1_820; \
-  bfloat16x4_t __s2_820 = __p2_820; \
-bfloat16x4_t __reint_820 = __s2_820; \
-float32x4_t __reint1_820 = splatq_lane_f32(*(float32x2_t *) &__reint_820, __p3_820); \
-  __ret_820 = vbfdotq_f32(__s0_820, __s1_820, *(bfloat16x8_t *) &__reint1_820); \
-  __ret_820; \
-})
-#else
-#define vbfdotq_lane_f32(__p0_821, __p1_821, __p2_821, __p3_821) __extension__ ({ \
-  float32x4_t __ret_821; \
-  float32x4_t __s0_821 = __p0_821; \
-  bfloat16x8_t __s1_821 = __p1_821; \
-  bfloat16x4_t __s2_821 = __p2_821; \
-  float32x4_t __rev0_821;  __rev0_821 = __builtin_shufflevector(__s0_821, __s0_821, 3, 2, 1, 0); \
-  bfloat16x8_t __rev1_821;  __rev1_821 = __builtin_shufflevector(__s1_821, __s1_821, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x4_t __rev2_821;  __rev2_821 = __builtin_shufflevector(__s2_821, __s2_821, 3, 2, 1, 0); \
-bfloat16x4_t __reint_821 = __rev2_821; \
-float32x4_t __reint1_821 = __noswap_splatq_lane_f32(*(float32x2_t *) &__reint_821, __p3_821); \
-  __ret_821 = __noswap_vbfdotq_f32(__rev0_821, __rev1_821, *(bfloat16x8_t *) &__reint1_821); \
-  __ret_821 = __builtin_shufflevector(__ret_821, __ret_821, 3, 2, 1, 0); \
-  __ret_821; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vbfdot_lane_f32(__p0_822, __p1_822, __p2_822, __p3_822) __extension__ ({ \
-  float32x2_t __ret_822; \
-  float32x2_t __s0_822 = __p0_822; \
-  bfloat16x4_t __s1_822 = __p1_822; \
-  bfloat16x4_t __s2_822 = __p2_822; \
-bfloat16x4_t __reint_822 = __s2_822; \
-float32x2_t __reint1_822 = splat_lane_f32(*(float32x2_t *) &__reint_822, __p3_822); \
-  __ret_822 = vbfdot_f32(__s0_822, __s1_822, *(bfloat16x4_t *) &__reint1_822); \
-  __ret_822; \
-})
-#else
-#define vbfdot_lane_f32(__p0_823, __p1_823, __p2_823, __p3_823) __extension__ ({ \
-  float32x2_t __ret_823; \
-  float32x2_t __s0_823 = __p0_823; \
-  bfloat16x4_t __s1_823 = __p1_823; \
-  bfloat16x4_t __s2_823 = __p2_823; \
-  float32x2_t __rev0_823;  __rev0_823 = __builtin_shufflevector(__s0_823, __s0_823, 1, 0); \
-  bfloat16x4_t __rev1_823;  __rev1_823 = __builtin_shufflevector(__s1_823, __s1_823, 3, 2, 1, 0); \
-  bfloat16x4_t __rev2_823;  __rev2_823 = __builtin_shufflevector(__s2_823, __s2_823, 3, 2, 1, 0); \
-bfloat16x4_t __reint_823 = __rev2_823; \
-float32x2_t __reint1_823 = __noswap_splat_lane_f32(*(float32x2_t *) &__reint_823, __p3_823); \
-  __ret_823 = __noswap_vbfdot_f32(__rev0_823, __rev1_823, *(bfloat16x4_t *) &__reint1_823); \
-  __ret_823 = __builtin_shufflevector(__ret_823, __ret_823, 1, 0); \
-  __ret_823; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vbfdotq_laneq_f32(__p0_824, __p1_824, __p2_824, __p3_824) __extension__ ({ \
-  float32x4_t __ret_824; \
-  float32x4_t __s0_824 = __p0_824; \
-  bfloat16x8_t __s1_824 = __p1_824; \
-  bfloat16x8_t __s2_824 = __p2_824; \
-bfloat16x8_t __reint_824 = __s2_824; \
-float32x4_t __reint1_824 = splatq_laneq_f32(*(float32x4_t *) &__reint_824, __p3_824); \
-  __ret_824 = vbfdotq_f32(__s0_824, __s1_824, *(bfloat16x8_t *) &__reint1_824); \
-  __ret_824; \
-})
-#else
-#define vbfdotq_laneq_f32(__p0_825, __p1_825, __p2_825, __p3_825) __extension__ ({ \
-  float32x4_t __ret_825; \
-  float32x4_t __s0_825 = __p0_825; \
-  bfloat16x8_t __s1_825 = __p1_825; \
-  bfloat16x8_t __s2_825 = __p2_825; \
-  float32x4_t __rev0_825;  __rev0_825 = __builtin_shufflevector(__s0_825, __s0_825, 3, 2, 1, 0); \
-  bfloat16x8_t __rev1_825;  __rev1_825 = __builtin_shufflevector(__s1_825, __s1_825, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x8_t __rev2_825;  __rev2_825 = __builtin_shufflevector(__s2_825, __s2_825, 7, 6, 5, 4, 3, 2, 1, 0); \
-bfloat16x8_t __reint_825 = __rev2_825; \
-float32x4_t __reint1_825 = __noswap_splatq_laneq_f32(*(float32x4_t *) &__reint_825, __p3_825); \
-  __ret_825 = __noswap_vbfdotq_f32(__rev0_825, __rev1_825, *(bfloat16x8_t *) &__reint1_825); \
-  __ret_825 = __builtin_shufflevector(__ret_825, __ret_825, 3, 2, 1, 0); \
-  __ret_825; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vbfdot_laneq_f32(__p0_826, __p1_826, __p2_826, __p3_826) __extension__ ({ \
-  float32x2_t __ret_826; \
-  float32x2_t __s0_826 = __p0_826; \
-  bfloat16x4_t __s1_826 = __p1_826; \
-  bfloat16x8_t __s2_826 = __p2_826; \
-bfloat16x8_t __reint_826 = __s2_826; \
-float32x2_t __reint1_826 = splat_laneq_f32(*(float32x4_t *) &__reint_826, __p3_826); \
-  __ret_826 = vbfdot_f32(__s0_826, __s1_826, *(bfloat16x4_t *) &__reint1_826); \
-  __ret_826; \
-})
-#else
-#define vbfdot_laneq_f32(__p0_827, __p1_827, __p2_827, __p3_827) __extension__ ({ \
-  float32x2_t __ret_827; \
-  float32x2_t __s0_827 = __p0_827; \
-  bfloat16x4_t __s1_827 = __p1_827; \
-  bfloat16x8_t __s2_827 = __p2_827; \
-  float32x2_t __rev0_827;  __rev0_827 = __builtin_shufflevector(__s0_827, __s0_827, 1, 0); \
-  bfloat16x4_t __rev1_827;  __rev1_827 = __builtin_shufflevector(__s1_827, __s1_827, 3, 2, 1, 0); \
-  bfloat16x8_t __rev2_827;  __rev2_827 = __builtin_shufflevector(__s2_827, __s2_827, 7, 6, 5, 4, 3, 2, 1, 0); \
-bfloat16x8_t __reint_827 = __rev2_827; \
-float32x2_t __reint1_827 = __noswap_splat_laneq_f32(*(float32x4_t *) &__reint_827, __p3_827); \
-  __ret_827 = __noswap_vbfdot_f32(__rev0_827, __rev1_827, *(bfloat16x4_t *) &__reint1_827); \
-  __ret_827 = __builtin_shufflevector(__ret_827, __ret_827, 1, 0); \
-  __ret_827; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vbfmlalbq_lane_f32(__p0_828, __p1_828, __p2_828, __p3_828) __extension__ ({ \
-  float32x4_t __ret_828; \
-  float32x4_t __s0_828 = __p0_828; \
-  bfloat16x8_t __s1_828 = __p1_828; \
-  bfloat16x4_t __s2_828 = __p2_828; \
-  __ret_828 = vbfmlalbq_f32(__s0_828, __s1_828, (bfloat16x8_t) {vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828)}); \
-  __ret_828; \
-})
-#else
-#define vbfmlalbq_lane_f32(__p0_829, __p1_829, __p2_829, __p3_829) __extension__ ({ \
-  float32x4_t __ret_829; \
-  float32x4_t __s0_829 = __p0_829; \
-  bfloat16x8_t __s1_829 = __p1_829; \
-  bfloat16x4_t __s2_829 = __p2_829; \
-  float32x4_t __rev0_829;  __rev0_829 = __builtin_shufflevector(__s0_829, __s0_829, 3, 2, 1, 0); \
-  bfloat16x8_t __rev1_829;  __rev1_829 = __builtin_shufflevector(__s1_829, __s1_829, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x4_t __rev2_829;  __rev2_829 = __builtin_shufflevector(__s2_829, __s2_829, 3, 2, 1, 0); \
-  __ret_829 = __noswap_vbfmlalbq_f32(__rev0_829, __rev1_829, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829)}); \
-  __ret_829 = __builtin_shufflevector(__ret_829, __ret_829, 3, 2, 1, 0); \
-  __ret_829; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vbfmlalbq_laneq_f32(__p0_830, __p1_830, __p2_830, __p3_830) __extension__ ({ \
-  float32x4_t __ret_830; \
-  float32x4_t __s0_830 = __p0_830; \
-  bfloat16x8_t __s1_830 = __p1_830; \
-  bfloat16x8_t __s2_830 = __p2_830; \
-  __ret_830 = vbfmlalbq_f32(__s0_830, __s1_830, (bfloat16x8_t) {vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830)}); \
-  __ret_830; \
-})
-#else
-#define vbfmlalbq_laneq_f32(__p0_831, __p1_831, __p2_831, __p3_831) __extension__ ({ \
-  float32x4_t __ret_831; \
-  float32x4_t __s0_831 = __p0_831; \
-  bfloat16x8_t __s1_831 = __p1_831; \
-  bfloat16x8_t __s2_831 = __p2_831; \
-  float32x4_t __rev0_831;  __rev0_831 = __builtin_shufflevector(__s0_831, __s0_831, 3, 2, 1, 0); \
-  bfloat16x8_t __rev1_831;  __rev1_831 = __builtin_shufflevector(__s1_831, __s1_831, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x8_t __rev2_831;  __rev2_831 = __builtin_shufflevector(__s2_831, __s2_831, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_831 = __noswap_vbfmlalbq_f32(__rev0_831, __rev1_831, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831)}); \
-  __ret_831 = __builtin_shufflevector(__ret_831, __ret_831, 3, 2, 1, 0); \
-  __ret_831; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vbfmlaltq_lane_f32(__p0_832, __p1_832, __p2_832, __p3_832) __extension__ ({ \
-  float32x4_t __ret_832; \
-  float32x4_t __s0_832 = __p0_832; \
-  bfloat16x8_t __s1_832 = __p1_832; \
-  bfloat16x4_t __s2_832 = __p2_832; \
-  __ret_832 = vbfmlaltq_f32(__s0_832, __s1_832, (bfloat16x8_t) {vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832)}); \
-  __ret_832; \
-})
-#else
-#define vbfmlaltq_lane_f32(__p0_833, __p1_833, __p2_833, __p3_833) __extension__ ({ \
-  float32x4_t __ret_833; \
-  float32x4_t __s0_833 = __p0_833; \
-  bfloat16x8_t __s1_833 = __p1_833; \
-  bfloat16x4_t __s2_833 = __p2_833; \
-  float32x4_t __rev0_833;  __rev0_833 = __builtin_shufflevector(__s0_833, __s0_833, 3, 2, 1, 0); \
-  bfloat16x8_t __rev1_833;  __rev1_833 = __builtin_shufflevector(__s1_833, __s1_833, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x4_t __rev2_833;  __rev2_833 = __builtin_shufflevector(__s2_833, __s2_833, 3, 2, 1, 0); \
-  __ret_833 = __noswap_vbfmlaltq_f32(__rev0_833, __rev1_833, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833)}); \
-  __ret_833 = __builtin_shufflevector(__ret_833, __ret_833, 3, 2, 1, 0); \
-  __ret_833; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vbfmlaltq_laneq_f32(__p0_834, __p1_834, __p2_834, __p3_834) __extension__ ({ \
-  float32x4_t __ret_834; \
-  float32x4_t __s0_834 = __p0_834; \
-  bfloat16x8_t __s1_834 = __p1_834; \
-  bfloat16x8_t __s2_834 = __p2_834; \
-  __ret_834 = vbfmlaltq_f32(__s0_834, __s1_834, (bfloat16x8_t) {vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834)}); \
-  __ret_834; \
-})
-#else
-#define vbfmlaltq_laneq_f32(__p0_835, __p1_835, __p2_835, __p3_835) __extension__ ({ \
-  float32x4_t __ret_835; \
-  float32x4_t __s0_835 = __p0_835; \
-  bfloat16x8_t __s1_835 = __p1_835; \
-  bfloat16x8_t __s2_835 = __p2_835; \
-  float32x4_t __rev0_835;  __rev0_835 = __builtin_shufflevector(__s0_835, __s0_835, 3, 2, 1, 0); \
-  bfloat16x8_t __rev1_835;  __rev1_835 = __builtin_shufflevector(__s1_835, __s1_835, 7, 6, 5, 4, 3, 2, 1, 0); \
-  bfloat16x8_t __rev2_835;  __rev2_835 = __builtin_shufflevector(__s2_835, __s2_835, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_835 = __noswap_vbfmlaltq_f32(__rev0_835, __rev1_835, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835)}); \
-  __ret_835 = __builtin_shufflevector(__ret_835, __ret_835, 3, 2, 1, 0); \
-  __ret_835; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x4_t vcvt_f32_bf16(bfloat16x4_t __p0_836) {
-  float32x4_t __ret_836;
-bfloat16x4_t __reint_836 = __p0_836;
-int32x4_t __reint1_836 = vshll_n_s16(*(int16x4_t *) &__reint_836, 16);
-  __ret_836 = *(float32x4_t *) &__reint1_836;
-  return __ret_836;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x4_t vcvt_f32_bf16(bfloat16x4_t __p0_837) {
-  float32x4_t __ret_837;
-  bfloat16x4_t __rev0_837;  __rev0_837 = __builtin_shufflevector(__p0_837, __p0_837, 3, 2, 1, 0);
-bfloat16x4_t __reint_837 = __rev0_837;
-int32x4_t __reint1_837 = __noswap_vshll_n_s16(*(int16x4_t *) &__reint_837, 16);
-  __ret_837 = *(float32x4_t *) &__reint1_837;
-  __ret_837 = __builtin_shufflevector(__ret_837, __ret_837, 3, 2, 1, 0);
-  return __ret_837;
-}
-__ai __attribute__((target("bf16,neon"))) float32x4_t __noswap_vcvt_f32_bf16(bfloat16x4_t __p0_838) {
-  float32x4_t __ret_838;
-bfloat16x4_t __reint_838 = __p0_838;
-int32x4_t __reint1_838 = __noswap_vshll_n_s16(*(int16x4_t *) &__reint_838, 16);
-  __ret_838 = *(float32x4_t *) &__reint1_838;
-  return __ret_838;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x4_t vcvtq_high_f32_bf16(bfloat16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = vcvt_f32_bf16(vget_high_bf16(__p0));
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x4_t vcvtq_high_f32_bf16(bfloat16x8_t __p0) {
-  float32x4_t __ret;
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcvt_f32_bf16(__noswap_vget_high_bf16(__rev0));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("bf16,neon"))) float32x4_t vcvtq_low_f32_bf16(bfloat16x8_t __p0) {
-  float32x4_t __ret;
-  __ret = vcvt_f32_bf16(vget_low_bf16(__p0));
-  return __ret;
-}
-#else
-__ai __attribute__((target("bf16,neon"))) float32x4_t vcvtq_low_f32_bf16(bfloat16x8_t __p0) {
-  float32x4_t __ret;
-  bfloat16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vcvt_f32_bf16(__noswap_vget_low_bf16(__rev0));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdotq_lane_u32(__p0_839, __p1_839, __p2_839, __p3_839) __extension__ ({ \
-  uint32x4_t __ret_839; \
-  uint32x4_t __s0_839 = __p0_839; \
-  uint8x16_t __s1_839 = __p1_839; \
-  uint8x8_t __s2_839 = __p2_839; \
-uint8x8_t __reint_839 = __s2_839; \
-uint32x4_t __reint1_839 = splatq_lane_u32(*(uint32x2_t *) &__reint_839, __p3_839); \
-  __ret_839 = vdotq_u32(__s0_839, __s1_839, *(uint8x16_t *) &__reint1_839); \
-  __ret_839; \
-})
-#else
-#define vdotq_lane_u32(__p0_840, __p1_840, __p2_840, __p3_840) __extension__ ({ \
-  uint32x4_t __ret_840; \
-  uint32x4_t __s0_840 = __p0_840; \
-  uint8x16_t __s1_840 = __p1_840; \
-  uint8x8_t __s2_840 = __p2_840; \
-  uint32x4_t __rev0_840;  __rev0_840 = __builtin_shufflevector(__s0_840, __s0_840, 3, 2, 1, 0); \
-  uint8x16_t __rev1_840;  __rev1_840 = __builtin_shufflevector(__s1_840, __s1_840, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_840;  __rev2_840 = __builtin_shufflevector(__s2_840, __s2_840, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x8_t __reint_840 = __rev2_840; \
-uint32x4_t __reint1_840 = __noswap_splatq_lane_u32(*(uint32x2_t *) &__reint_840, __p3_840); \
-  __ret_840 = __noswap_vdotq_u32(__rev0_840, __rev1_840, *(uint8x16_t *) &__reint1_840); \
-  __ret_840 = __builtin_shufflevector(__ret_840, __ret_840, 3, 2, 1, 0); \
-  __ret_840; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdotq_lane_s32(__p0_841, __p1_841, __p2_841, __p3_841) __extension__ ({ \
-  int32x4_t __ret_841; \
-  int32x4_t __s0_841 = __p0_841; \
-  int8x16_t __s1_841 = __p1_841; \
-  int8x8_t __s2_841 = __p2_841; \
-int8x8_t __reint_841 = __s2_841; \
-int32x4_t __reint1_841 = splatq_lane_s32(*(int32x2_t *) &__reint_841, __p3_841); \
-  __ret_841 = vdotq_s32(__s0_841, __s1_841, *(int8x16_t *) &__reint1_841); \
-  __ret_841; \
-})
-#else
-#define vdotq_lane_s32(__p0_842, __p1_842, __p2_842, __p3_842) __extension__ ({ \
-  int32x4_t __ret_842; \
-  int32x4_t __s0_842 = __p0_842; \
-  int8x16_t __s1_842 = __p1_842; \
-  int8x8_t __s2_842 = __p2_842; \
-  int32x4_t __rev0_842;  __rev0_842 = __builtin_shufflevector(__s0_842, __s0_842, 3, 2, 1, 0); \
-  int8x16_t __rev1_842;  __rev1_842 = __builtin_shufflevector(__s1_842, __s1_842, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_842;  __rev2_842 = __builtin_shufflevector(__s2_842, __s2_842, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x8_t __reint_842 = __rev2_842; \
-int32x4_t __reint1_842 = __noswap_splatq_lane_s32(*(int32x2_t *) &__reint_842, __p3_842); \
-  __ret_842 = __noswap_vdotq_s32(__rev0_842, __rev1_842, *(int8x16_t *) &__reint1_842); \
-  __ret_842 = __builtin_shufflevector(__ret_842, __ret_842, 3, 2, 1, 0); \
-  __ret_842; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdot_lane_u32(__p0_843, __p1_843, __p2_843, __p3_843) __extension__ ({ \
-  uint32x2_t __ret_843; \
-  uint32x2_t __s0_843 = __p0_843; \
-  uint8x8_t __s1_843 = __p1_843; \
-  uint8x8_t __s2_843 = __p2_843; \
-uint8x8_t __reint_843 = __s2_843; \
-uint32x2_t __reint1_843 = splat_lane_u32(*(uint32x2_t *) &__reint_843, __p3_843); \
-  __ret_843 = vdot_u32(__s0_843, __s1_843, *(uint8x8_t *) &__reint1_843); \
-  __ret_843; \
-})
-#else
-#define vdot_lane_u32(__p0_844, __p1_844, __p2_844, __p3_844) __extension__ ({ \
-  uint32x2_t __ret_844; \
-  uint32x2_t __s0_844 = __p0_844; \
-  uint8x8_t __s1_844 = __p1_844; \
-  uint8x8_t __s2_844 = __p2_844; \
-  uint32x2_t __rev0_844;  __rev0_844 = __builtin_shufflevector(__s0_844, __s0_844, 1, 0); \
-  uint8x8_t __rev1_844;  __rev1_844 = __builtin_shufflevector(__s1_844, __s1_844, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_844;  __rev2_844 = __builtin_shufflevector(__s2_844, __s2_844, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x8_t __reint_844 = __rev2_844; \
-uint32x2_t __reint1_844 = __noswap_splat_lane_u32(*(uint32x2_t *) &__reint_844, __p3_844); \
-  __ret_844 = __noswap_vdot_u32(__rev0_844, __rev1_844, *(uint8x8_t *) &__reint1_844); \
-  __ret_844 = __builtin_shufflevector(__ret_844, __ret_844, 1, 0); \
-  __ret_844; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vdot_lane_s32(__p0_845, __p1_845, __p2_845, __p3_845) __extension__ ({ \
-  int32x2_t __ret_845; \
-  int32x2_t __s0_845 = __p0_845; \
-  int8x8_t __s1_845 = __p1_845; \
-  int8x8_t __s2_845 = __p2_845; \
-int8x8_t __reint_845 = __s2_845; \
-int32x2_t __reint1_845 = splat_lane_s32(*(int32x2_t *) &__reint_845, __p3_845); \
-  __ret_845 = vdot_s32(__s0_845, __s1_845, *(int8x8_t *) &__reint1_845); \
-  __ret_845; \
-})
-#else
-#define vdot_lane_s32(__p0_846, __p1_846, __p2_846, __p3_846) __extension__ ({ \
-  int32x2_t __ret_846; \
-  int32x2_t __s0_846 = __p0_846; \
-  int8x8_t __s1_846 = __p1_846; \
-  int8x8_t __s2_846 = __p2_846; \
-  int32x2_t __rev0_846;  __rev0_846 = __builtin_shufflevector(__s0_846, __s0_846, 1, 0); \
-  int8x8_t __rev1_846;  __rev1_846 = __builtin_shufflevector(__s1_846, __s1_846, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_846;  __rev2_846 = __builtin_shufflevector(__s2_846, __s2_846, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x8_t __reint_846 = __rev2_846; \
-int32x2_t __reint1_846 = __noswap_splat_lane_s32(*(int32x2_t *) &__reint_846, __p3_846); \
-  __ret_846 = __noswap_vdot_s32(__rev0_846, __rev1_846, *(int8x8_t *) &__reint1_846); \
-  __ret_846 = __builtin_shufflevector(__ret_846, __ret_846, 1, 0); \
-  __ret_846; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulq_lane_f16(__p0_847, __p1_847, __p2_847) __extension__ ({ \
-  float16x8_t __ret_847; \
-  float16x8_t __s0_847 = __p0_847; \
-  float16x4_t __s1_847 = __p1_847; \
-  __ret_847 = __s0_847 * splatq_lane_f16(__s1_847, __p2_847); \
-  __ret_847; \
-})
-#else
-#define vmulq_lane_f16(__p0_848, __p1_848, __p2_848) __extension__ ({ \
-  float16x8_t __ret_848; \
-  float16x8_t __s0_848 = __p0_848; \
-  float16x4_t __s1_848 = __p1_848; \
-  float16x8_t __rev0_848;  __rev0_848 = __builtin_shufflevector(__s0_848, __s0_848, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev1_848;  __rev1_848 = __builtin_shufflevector(__s1_848, __s1_848, 3, 2, 1, 0); \
-  __ret_848 = __rev0_848 * __noswap_splatq_lane_f16(__rev1_848, __p2_848); \
-  __ret_848 = __builtin_shufflevector(__ret_848, __ret_848, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_848; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmul_lane_f16(__p0_849, __p1_849, __p2_849) __extension__ ({ \
-  float16x4_t __ret_849; \
-  float16x4_t __s0_849 = __p0_849; \
-  float16x4_t __s1_849 = __p1_849; \
-  __ret_849 = __s0_849 * splat_lane_f16(__s1_849, __p2_849); \
-  __ret_849; \
-})
-#else
-#define vmul_lane_f16(__p0_850, __p1_850, __p2_850) __extension__ ({ \
-  float16x4_t __ret_850; \
-  float16x4_t __s0_850 = __p0_850; \
-  float16x4_t __s1_850 = __p1_850; \
-  float16x4_t __rev0_850;  __rev0_850 = __builtin_shufflevector(__s0_850, __s0_850, 3, 2, 1, 0); \
-  float16x4_t __rev1_850;  __rev1_850 = __builtin_shufflevector(__s1_850, __s1_850, 3, 2, 1, 0); \
-  __ret_850 = __rev0_850 * __noswap_splat_lane_f16(__rev1_850, __p2_850); \
-  __ret_850 = __builtin_shufflevector(__ret_850, __ret_850, 3, 2, 1, 0); \
-  __ret_850; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsudotq_lane_s32(__p0_851, __p1_851, __p2_851, __p3_851) __extension__ ({ \
-  int32x4_t __ret_851; \
-  int32x4_t __s0_851 = __p0_851; \
-  int8x16_t __s1_851 = __p1_851; \
-  uint8x8_t __s2_851 = __p2_851; \
-uint8x8_t __reint_851 = __s2_851; \
-  __ret_851 = vusdotq_s32(__s0_851, (uint8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_851, __p3_851)), __s1_851); \
-  __ret_851; \
-})
-#else
-#define vsudotq_lane_s32(__p0_852, __p1_852, __p2_852, __p3_852) __extension__ ({ \
-  int32x4_t __ret_852; \
-  int32x4_t __s0_852 = __p0_852; \
-  int8x16_t __s1_852 = __p1_852; \
-  uint8x8_t __s2_852 = __p2_852; \
-  int32x4_t __rev0_852;  __rev0_852 = __builtin_shufflevector(__s0_852, __s0_852, 3, 2, 1, 0); \
-  int8x16_t __rev1_852;  __rev1_852 = __builtin_shufflevector(__s1_852, __s1_852, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_852;  __rev2_852 = __builtin_shufflevector(__s2_852, __s2_852, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x8_t __reint_852 = __rev2_852; \
-  __ret_852 = __noswap_vusdotq_s32(__rev0_852, (uint8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) &__reint_852, __p3_852)), __rev1_852); \
-  __ret_852 = __builtin_shufflevector(__ret_852, __ret_852, 3, 2, 1, 0); \
-  __ret_852; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsudot_lane_s32(__p0_853, __p1_853, __p2_853, __p3_853) __extension__ ({ \
-  int32x2_t __ret_853; \
-  int32x2_t __s0_853 = __p0_853; \
-  int8x8_t __s1_853 = __p1_853; \
-  uint8x8_t __s2_853 = __p2_853; \
-uint8x8_t __reint_853 = __s2_853; \
-  __ret_853 = vusdot_s32(__s0_853, (uint8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_853, __p3_853)), __s1_853); \
-  __ret_853; \
-})
-#else
-#define vsudot_lane_s32(__p0_854, __p1_854, __p2_854, __p3_854) __extension__ ({ \
-  int32x2_t __ret_854; \
-  int32x2_t __s0_854 = __p0_854; \
-  int8x8_t __s1_854 = __p1_854; \
-  uint8x8_t __s2_854 = __p2_854; \
-  int32x2_t __rev0_854;  __rev0_854 = __builtin_shufflevector(__s0_854, __s0_854, 1, 0); \
-  int8x8_t __rev1_854;  __rev1_854 = __builtin_shufflevector(__s1_854, __s1_854, 7, 6, 5, 4, 3, 2, 1, 0); \
-  uint8x8_t __rev2_854;  __rev2_854 = __builtin_shufflevector(__s2_854, __s2_854, 7, 6, 5, 4, 3, 2, 1, 0); \
-uint8x8_t __reint_854 = __rev2_854; \
-  __ret_854 = __noswap_vusdot_s32(__rev0_854, (uint8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_854, __p3_854)), __rev1_854); \
-  __ret_854 = __builtin_shufflevector(__ret_854, __ret_854, 1, 0); \
-  __ret_854; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vusdotq_lane_s32(__p0_855, __p1_855, __p2_855, __p3_855) __extension__ ({ \
-  int32x4_t __ret_855; \
-  int32x4_t __s0_855 = __p0_855; \
-  uint8x16_t __s1_855 = __p1_855; \
-  int8x8_t __s2_855 = __p2_855; \
-int8x8_t __reint_855 = __s2_855; \
-  __ret_855 = vusdotq_s32(__s0_855, __s1_855, (int8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_855, __p3_855))); \
-  __ret_855; \
-})
-#else
-#define vusdotq_lane_s32(__p0_856, __p1_856, __p2_856, __p3_856) __extension__ ({ \
-  int32x4_t __ret_856; \
-  int32x4_t __s0_856 = __p0_856; \
-  uint8x16_t __s1_856 = __p1_856; \
-  int8x8_t __s2_856 = __p2_856; \
-  int32x4_t __rev0_856;  __rev0_856 = __builtin_shufflevector(__s0_856, __s0_856, 3, 2, 1, 0); \
-  uint8x16_t __rev1_856;  __rev1_856 = __builtin_shufflevector(__s1_856, __s1_856, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_856;  __rev2_856 = __builtin_shufflevector(__s2_856, __s2_856, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x8_t __reint_856 = __rev2_856; \
-  __ret_856 = __noswap_vusdotq_s32(__rev0_856, __rev1_856, (int8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) &__reint_856, __p3_856))); \
-  __ret_856 = __builtin_shufflevector(__ret_856, __ret_856, 3, 2, 1, 0); \
-  __ret_856; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vusdot_lane_s32(__p0_857, __p1_857, __p2_857, __p3_857) __extension__ ({ \
-  int32x2_t __ret_857; \
-  int32x2_t __s0_857 = __p0_857; \
-  uint8x8_t __s1_857 = __p1_857; \
-  int8x8_t __s2_857 = __p2_857; \
-int8x8_t __reint_857 = __s2_857; \
-  __ret_857 = vusdot_s32(__s0_857, __s1_857, (int8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_857, __p3_857))); \
-  __ret_857; \
-})
-#else
-#define vusdot_lane_s32(__p0_858, __p1_858, __p2_858, __p3_858) __extension__ ({ \
-  int32x2_t __ret_858; \
-  int32x2_t __s0_858 = __p0_858; \
-  uint8x8_t __s1_858 = __p1_858; \
-  int8x8_t __s2_858 = __p2_858; \
-  int32x2_t __rev0_858;  __rev0_858 = __builtin_shufflevector(__s0_858, __s0_858, 1, 0); \
-  uint8x8_t __rev1_858;  __rev1_858 = __builtin_shufflevector(__s1_858, __s1_858, 7, 6, 5, 4, 3, 2, 1, 0); \
-  int8x8_t __rev2_858;  __rev2_858 = __builtin_shufflevector(__s2_858, __s2_858, 7, 6, 5, 4, 3, 2, 1, 0); \
-int8x8_t __reint_858 = __rev2_858; \
-  __ret_858 = __noswap_vusdot_s32(__rev0_858, __rev1_858, (int8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_858, __p3_858))); \
-  __ret_858 = __builtin_shufflevector(__ret_858, __ret_858, 1, 0); \
-  __ret_858; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x16_t vabaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  __ret = __p0 + vabdq_u8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x16_t vabaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint8x16_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdq_u8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vabaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + vabdq_u32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vabaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdq_u32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vabaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 + vabdq_u16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vabaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdq_u16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x16_t vabaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  __ret = __p0 + vabdq_s8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x16_t vabaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int8x16_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdq_s8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vabaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + vabdq_s32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vabaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdq_s32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vabaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 + vabdq_s16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vabaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdq_s16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint8x8_t vaba_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  __ret = __p0 + vabd_u8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint8x8_t vaba_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint8x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabd_u8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x2_t vaba_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  __ret = __p0 + vabd_u32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x2_t vaba_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint32x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __noswap_vabd_u32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x4_t vaba_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  __ret = __p0 + vabd_u16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x4_t vaba_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint16x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabd_u16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int8x8_t vaba_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  __ret = __p0 + vabd_s8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int8x8_t vaba_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int8x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabd_s8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x2_t vaba_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  __ret = __p0 + vabd_s32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x2_t vaba_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int32x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __noswap_vabd_s32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x4_t vaba_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  __ret = __p0 + vabd_s16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x4_t vaba_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int16x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabd_s16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(vmovl_u8((uint8x8_t)(vabd_u8(__p0, __p1))));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (uint16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_u8(__rev0, __rev1))));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = (uint16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_u8(__p0, __p1))));
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(vmovl_u32((uint32x2_t)(vabd_u32(__p0, __p1))));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (uint64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_u32(__rev0, __rev1))));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = (uint64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_u32(__p0, __p1))));
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(vmovl_u16((uint16x4_t)(vabd_u16(__p0, __p1))));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (uint32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_u16(__rev0, __rev1))));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = (uint32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_u16(__p0, __p1))));
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(vmovl_u8((uint8x8_t)(vabd_s8(__p0, __p1))));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = (int16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_s8(__rev0, __rev1))));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = (int16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_s8(__p0, __p1))));
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(vmovl_u32((uint32x2_t)(vabd_s32(__p0, __p1))));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = (int64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_s32(__rev0, __rev1))));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = (int64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_s32(__p0, __p1))));
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(vmovl_u16((uint16x4_t)(vabd_s16(__p0, __p1))));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = (int32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_s16(__rev0, __rev1))));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = (int32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_s16(__p0, __p1))));
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vaddl_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = vmovl_u8(__p0) + vmovl_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vaddl_u8(uint8x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_u8(__rev0) + __noswap_vmovl_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vaddl_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = vmovl_u32(__p0) + vmovl_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vaddl_u32(uint32x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vmovl_u32(__rev0) + __noswap_vmovl_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vaddl_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = vmovl_u16(__p0) + vmovl_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vaddl_u16(uint16x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_u16(__rev0) + __noswap_vmovl_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vaddl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = vmovl_s8(__p0) + vmovl_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vaddl_s8(int8x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_s8(__rev0) + __noswap_vmovl_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vaddl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = vmovl_s32(__p0) + vmovl_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vaddl_s32(int32x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __noswap_vmovl_s32(__rev0) + __noswap_vmovl_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vaddl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = vmovl_s16(__p0) + vmovl_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vaddl_s16(int16x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_s16(__rev0) + __noswap_vmovl_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vaddw_u8(uint16x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 + vmovl_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vaddw_u8(uint16x8_t __p0, uint8x8_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vaddw_u32(uint64x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 + vmovl_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vaddw_u32(uint64x2_t __p0, uint32x2_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vaddw_u16(uint32x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 + vmovl_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vaddw_u16(uint32x4_t __p0, uint16x4_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vaddw_s8(int16x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 + vmovl_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vaddw_s8(int16x8_t __p0, int8x8_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vaddw_s32(int64x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 + vmovl_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vaddw_s32(int64x2_t __p0, int32x2_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 + vmovl_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vget_lane_f16(__p0_859, __p1_859) __extension__ ({ \
-  float16_t __ret_859; \
-  float16x4_t __s0_859 = __p0_859; \
-float16x4_t __reint_859 = __s0_859; \
-int16_t __reint1_859 = vget_lane_s16(*(int16x4_t *) &__reint_859, __p1_859); \
-  __ret_859 = *(float16_t *) &__reint1_859; \
-  __ret_859; \
-})
-#else
-#define vget_lane_f16(__p0_860, __p1_860) __extension__ ({ \
-  float16_t __ret_860; \
-  float16x4_t __s0_860 = __p0_860; \
-  float16x4_t __rev0_860;  __rev0_860 = __builtin_shufflevector(__s0_860, __s0_860, 3, 2, 1, 0); \
-float16x4_t __reint_860 = __rev0_860; \
-int16_t __reint1_860 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_860, __p1_860); \
-  __ret_860 = *(float16_t *) &__reint1_860; \
-  __ret_860; \
-})
-#define __noswap_vget_lane_f16(__p0_861, __p1_861) __extension__ ({ \
-  float16_t __ret_861; \
-  float16x4_t __s0_861 = __p0_861; \
-float16x4_t __reint_861 = __s0_861; \
-int16_t __reint1_861 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_861, __p1_861); \
-  __ret_861 = *(float16_t *) &__reint1_861; \
-  __ret_861; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vgetq_lane_f16(__p0_862, __p1_862) __extension__ ({ \
-  float16_t __ret_862; \
-  float16x8_t __s0_862 = __p0_862; \
-float16x8_t __reint_862 = __s0_862; \
-int16_t __reint1_862 = vgetq_lane_s16(*(int16x8_t *) &__reint_862, __p1_862); \
-  __ret_862 = *(float16_t *) &__reint1_862; \
-  __ret_862; \
-})
-#else
-#define vgetq_lane_f16(__p0_863, __p1_863) __extension__ ({ \
-  float16_t __ret_863; \
-  float16x8_t __s0_863 = __p0_863; \
-  float16x8_t __rev0_863;  __rev0_863 = __builtin_shufflevector(__s0_863, __s0_863, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16x8_t __reint_863 = __rev0_863; \
-int16_t __reint1_863 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_863, __p1_863); \
-  __ret_863 = *(float16_t *) &__reint1_863; \
-  __ret_863; \
-})
-#define __noswap_vgetq_lane_f16(__p0_864, __p1_864) __extension__ ({ \
-  float16_t __ret_864; \
-  float16x8_t __s0_864 = __p0_864; \
-float16x8_t __reint_864 = __s0_864; \
-int16_t __reint1_864 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_864, __p1_864); \
-  __ret_864 = *(float16_t *) &__reint1_864; \
-  __ret_864; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 + vmull_u8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_u8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 + __noswap_vmull_u8(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 + vmull_u32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_u32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 + __noswap_vmull_u32(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + vmull_u16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_u16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + __noswap_vmull_u16(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 + vmull_s8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_s8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 + __noswap_vmull_s8(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 + vmull_s32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_s32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 + __noswap_vmull_s32(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + vmull_s16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_s16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + __noswap_vmull_s16(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_lane_u32(__p0_865, __p1_865, __p2_865, __p3_865) __extension__ ({ \
-  uint64x2_t __ret_865; \
-  uint64x2_t __s0_865 = __p0_865; \
-  uint32x2_t __s1_865 = __p1_865; \
-  uint32x2_t __s2_865 = __p2_865; \
-  __ret_865 = __s0_865 + vmull_u32(__s1_865, splat_lane_u32(__s2_865, __p3_865)); \
-  __ret_865; \
-})
-#else
-#define vmlal_lane_u32(__p0_866, __p1_866, __p2_866, __p3_866) __extension__ ({ \
-  uint64x2_t __ret_866; \
-  uint64x2_t __s0_866 = __p0_866; \
-  uint32x2_t __s1_866 = __p1_866; \
-  uint32x2_t __s2_866 = __p2_866; \
-  uint64x2_t __rev0_866;  __rev0_866 = __builtin_shufflevector(__s0_866, __s0_866, 1, 0); \
-  uint32x2_t __rev1_866;  __rev1_866 = __builtin_shufflevector(__s1_866, __s1_866, 1, 0); \
-  uint32x2_t __rev2_866;  __rev2_866 = __builtin_shufflevector(__s2_866, __s2_866, 1, 0); \
-  __ret_866 = __rev0_866 + __noswap_vmull_u32(__rev1_866, __noswap_splat_lane_u32(__rev2_866, __p3_866)); \
-  __ret_866 = __builtin_shufflevector(__ret_866, __ret_866, 1, 0); \
-  __ret_866; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_lane_u16(__p0_867, __p1_867, __p2_867, __p3_867) __extension__ ({ \
-  uint32x4_t __ret_867; \
-  uint32x4_t __s0_867 = __p0_867; \
-  uint16x4_t __s1_867 = __p1_867; \
-  uint16x4_t __s2_867 = __p2_867; \
-  __ret_867 = __s0_867 + vmull_u16(__s1_867, splat_lane_u16(__s2_867, __p3_867)); \
-  __ret_867; \
-})
-#else
-#define vmlal_lane_u16(__p0_868, __p1_868, __p2_868, __p3_868) __extension__ ({ \
-  uint32x4_t __ret_868; \
-  uint32x4_t __s0_868 = __p0_868; \
-  uint16x4_t __s1_868 = __p1_868; \
-  uint16x4_t __s2_868 = __p2_868; \
-  uint32x4_t __rev0_868;  __rev0_868 = __builtin_shufflevector(__s0_868, __s0_868, 3, 2, 1, 0); \
-  uint16x4_t __rev1_868;  __rev1_868 = __builtin_shufflevector(__s1_868, __s1_868, 3, 2, 1, 0); \
-  uint16x4_t __rev2_868;  __rev2_868 = __builtin_shufflevector(__s2_868, __s2_868, 3, 2, 1, 0); \
-  __ret_868 = __rev0_868 + __noswap_vmull_u16(__rev1_868, __noswap_splat_lane_u16(__rev2_868, __p3_868)); \
-  __ret_868 = __builtin_shufflevector(__ret_868, __ret_868, 3, 2, 1, 0); \
-  __ret_868; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_lane_s32(__p0_869, __p1_869, __p2_869, __p3_869) __extension__ ({ \
-  int64x2_t __ret_869; \
-  int64x2_t __s0_869 = __p0_869; \
-  int32x2_t __s1_869 = __p1_869; \
-  int32x2_t __s2_869 = __p2_869; \
-  __ret_869 = __s0_869 + vmull_s32(__s1_869, splat_lane_s32(__s2_869, __p3_869)); \
-  __ret_869; \
-})
-#else
-#define vmlal_lane_s32(__p0_870, __p1_870, __p2_870, __p3_870) __extension__ ({ \
-  int64x2_t __ret_870; \
-  int64x2_t __s0_870 = __p0_870; \
-  int32x2_t __s1_870 = __p1_870; \
-  int32x2_t __s2_870 = __p2_870; \
-  int64x2_t __rev0_870;  __rev0_870 = __builtin_shufflevector(__s0_870, __s0_870, 1, 0); \
-  int32x2_t __rev1_870;  __rev1_870 = __builtin_shufflevector(__s1_870, __s1_870, 1, 0); \
-  int32x2_t __rev2_870;  __rev2_870 = __builtin_shufflevector(__s2_870, __s2_870, 1, 0); \
-  __ret_870 = __rev0_870 + __noswap_vmull_s32(__rev1_870, __noswap_splat_lane_s32(__rev2_870, __p3_870)); \
-  __ret_870 = __builtin_shufflevector(__ret_870, __ret_870, 1, 0); \
-  __ret_870; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlal_lane_s16(__p0_871, __p1_871, __p2_871, __p3_871) __extension__ ({ \
-  int32x4_t __ret_871; \
-  int32x4_t __s0_871 = __p0_871; \
-  int16x4_t __s1_871 = __p1_871; \
-  int16x4_t __s2_871 = __p2_871; \
-  __ret_871 = __s0_871 + vmull_s16(__s1_871, splat_lane_s16(__s2_871, __p3_871)); \
-  __ret_871; \
-})
-#else
-#define vmlal_lane_s16(__p0_872, __p1_872, __p2_872, __p3_872) __extension__ ({ \
-  int32x4_t __ret_872; \
-  int32x4_t __s0_872 = __p0_872; \
-  int16x4_t __s1_872 = __p1_872; \
-  int16x4_t __s2_872 = __p2_872; \
-  int32x4_t __rev0_872;  __rev0_872 = __builtin_shufflevector(__s0_872, __s0_872, 3, 2, 1, 0); \
-  int16x4_t __rev1_872;  __rev1_872 = __builtin_shufflevector(__s1_872, __s1_872, 3, 2, 1, 0); \
-  int16x4_t __rev2_872;  __rev2_872 = __builtin_shufflevector(__s2_872, __s2_872, 3, 2, 1, 0); \
-  __ret_872 = __rev0_872 + __noswap_vmull_s16(__rev1_872, __noswap_splat_lane_s16(__rev2_872, __p3_872)); \
-  __ret_872 = __builtin_shufflevector(__ret_872, __ret_872, 3, 2, 1, 0); \
-  __ret_872; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 + vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __noswap_vmull_u32(__rev1, (uint32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 + __noswap_vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_u16(__rev1, (uint16x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + __noswap_vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 + vmull_s32(__p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 + __noswap_vmull_s32(__rev1, (int32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 + __noswap_vmull_s32(__p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmull_s16(__rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + __noswap_vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 - vmull_u8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_u8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 - __noswap_vmull_u8(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 - vmull_u32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_u32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 - __noswap_vmull_u32(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 - vmull_u16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_u16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 - __noswap_vmull_u16(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 - vmull_s8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_s8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 - __noswap_vmull_s8(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 - vmull_s32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_s32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 - __noswap_vmull_s32(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 - vmull_s16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_s16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 - __noswap_vmull_s16(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_lane_u32(__p0_873, __p1_873, __p2_873, __p3_873) __extension__ ({ \
-  uint64x2_t __ret_873; \
-  uint64x2_t __s0_873 = __p0_873; \
-  uint32x2_t __s1_873 = __p1_873; \
-  uint32x2_t __s2_873 = __p2_873; \
-  __ret_873 = __s0_873 - vmull_u32(__s1_873, splat_lane_u32(__s2_873, __p3_873)); \
-  __ret_873; \
-})
-#else
-#define vmlsl_lane_u32(__p0_874, __p1_874, __p2_874, __p3_874) __extension__ ({ \
-  uint64x2_t __ret_874; \
-  uint64x2_t __s0_874 = __p0_874; \
-  uint32x2_t __s1_874 = __p1_874; \
-  uint32x2_t __s2_874 = __p2_874; \
-  uint64x2_t __rev0_874;  __rev0_874 = __builtin_shufflevector(__s0_874, __s0_874, 1, 0); \
-  uint32x2_t __rev1_874;  __rev1_874 = __builtin_shufflevector(__s1_874, __s1_874, 1, 0); \
-  uint32x2_t __rev2_874;  __rev2_874 = __builtin_shufflevector(__s2_874, __s2_874, 1, 0); \
-  __ret_874 = __rev0_874 - __noswap_vmull_u32(__rev1_874, __noswap_splat_lane_u32(__rev2_874, __p3_874)); \
-  __ret_874 = __builtin_shufflevector(__ret_874, __ret_874, 1, 0); \
-  __ret_874; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_lane_u16(__p0_875, __p1_875, __p2_875, __p3_875) __extension__ ({ \
-  uint32x4_t __ret_875; \
-  uint32x4_t __s0_875 = __p0_875; \
-  uint16x4_t __s1_875 = __p1_875; \
-  uint16x4_t __s2_875 = __p2_875; \
-  __ret_875 = __s0_875 - vmull_u16(__s1_875, splat_lane_u16(__s2_875, __p3_875)); \
-  __ret_875; \
-})
-#else
-#define vmlsl_lane_u16(__p0_876, __p1_876, __p2_876, __p3_876) __extension__ ({ \
-  uint32x4_t __ret_876; \
-  uint32x4_t __s0_876 = __p0_876; \
-  uint16x4_t __s1_876 = __p1_876; \
-  uint16x4_t __s2_876 = __p2_876; \
-  uint32x4_t __rev0_876;  __rev0_876 = __builtin_shufflevector(__s0_876, __s0_876, 3, 2, 1, 0); \
-  uint16x4_t __rev1_876;  __rev1_876 = __builtin_shufflevector(__s1_876, __s1_876, 3, 2, 1, 0); \
-  uint16x4_t __rev2_876;  __rev2_876 = __builtin_shufflevector(__s2_876, __s2_876, 3, 2, 1, 0); \
-  __ret_876 = __rev0_876 - __noswap_vmull_u16(__rev1_876, __noswap_splat_lane_u16(__rev2_876, __p3_876)); \
-  __ret_876 = __builtin_shufflevector(__ret_876, __ret_876, 3, 2, 1, 0); \
-  __ret_876; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_lane_s32(__p0_877, __p1_877, __p2_877, __p3_877) __extension__ ({ \
-  int64x2_t __ret_877; \
-  int64x2_t __s0_877 = __p0_877; \
-  int32x2_t __s1_877 = __p1_877; \
-  int32x2_t __s2_877 = __p2_877; \
-  __ret_877 = __s0_877 - vmull_s32(__s1_877, splat_lane_s32(__s2_877, __p3_877)); \
-  __ret_877; \
-})
-#else
-#define vmlsl_lane_s32(__p0_878, __p1_878, __p2_878, __p3_878) __extension__ ({ \
-  int64x2_t __ret_878; \
-  int64x2_t __s0_878 = __p0_878; \
-  int32x2_t __s1_878 = __p1_878; \
-  int32x2_t __s2_878 = __p2_878; \
-  int64x2_t __rev0_878;  __rev0_878 = __builtin_shufflevector(__s0_878, __s0_878, 1, 0); \
-  int32x2_t __rev1_878;  __rev1_878 = __builtin_shufflevector(__s1_878, __s1_878, 1, 0); \
-  int32x2_t __rev2_878;  __rev2_878 = __builtin_shufflevector(__s2_878, __s2_878, 1, 0); \
-  __ret_878 = __rev0_878 - __noswap_vmull_s32(__rev1_878, __noswap_splat_lane_s32(__rev2_878, __p3_878)); \
-  __ret_878 = __builtin_shufflevector(__ret_878, __ret_878, 1, 0); \
-  __ret_878; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmlsl_lane_s16(__p0_879, __p1_879, __p2_879, __p3_879) __extension__ ({ \
-  int32x4_t __ret_879; \
-  int32x4_t __s0_879 = __p0_879; \
-  int16x4_t __s1_879 = __p1_879; \
-  int16x4_t __s2_879 = __p2_879; \
-  __ret_879 = __s0_879 - vmull_s16(__s1_879, splat_lane_s16(__s2_879, __p3_879)); \
-  __ret_879; \
-})
-#else
-#define vmlsl_lane_s16(__p0_880, __p1_880, __p2_880, __p3_880) __extension__ ({ \
-  int32x4_t __ret_880; \
-  int32x4_t __s0_880 = __p0_880; \
-  int16x4_t __s1_880 = __p1_880; \
-  int16x4_t __s2_880 = __p2_880; \
-  int32x4_t __rev0_880;  __rev0_880 = __builtin_shufflevector(__s0_880, __s0_880, 3, 2, 1, 0); \
-  int16x4_t __rev1_880;  __rev1_880 = __builtin_shufflevector(__s1_880, __s1_880, 3, 2, 1, 0); \
-  int16x4_t __rev2_880;  __rev2_880 = __builtin_shufflevector(__s2_880, __s2_880, 3, 2, 1, 0); \
-  __ret_880 = __rev0_880 - __noswap_vmull_s16(__rev1_880, __noswap_splat_lane_s16(__rev2_880, __p3_880)); \
-  __ret_880 = __builtin_shufflevector(__ret_880, __ret_880, 3, 2, 1, 0); \
-  __ret_880; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 - vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __noswap_vmull_u32(__rev1, (uint32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 - __noswap_vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 - vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_u16(__rev1, (uint16x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 - __noswap_vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 - vmull_s32(__p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = __rev0 - __noswap_vmull_s32(__rev1, (int32x2_t) {__p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 - __noswap_vmull_s32(__p1, (int32x2_t) {__p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 - vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 - __noswap_vmull_s16(__rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 - __noswap_vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vset_lane_f16(__p0_881, __p1_881, __p2_881) __extension__ ({ \
-  float16x4_t __ret_881; \
-  float16_t __s0_881 = __p0_881; \
-  float16x4_t __s1_881 = __p1_881; \
-float16_t __reint_881 = __s0_881; \
-float16x4_t __reint1_881 = __s1_881; \
-int16x4_t __reint2_881 = vset_lane_s16(*(int16_t *) &__reint_881, *(int16x4_t *) &__reint1_881, __p2_881); \
-  __ret_881 = *(float16x4_t *) &__reint2_881; \
-  __ret_881; \
-})
-#else
-#define vset_lane_f16(__p0_882, __p1_882, __p2_882) __extension__ ({ \
-  float16x4_t __ret_882; \
-  float16_t __s0_882 = __p0_882; \
-  float16x4_t __s1_882 = __p1_882; \
-  float16x4_t __rev1_882;  __rev1_882 = __builtin_shufflevector(__s1_882, __s1_882, 3, 2, 1, 0); \
-float16_t __reint_882 = __s0_882; \
-float16x4_t __reint1_882 = __rev1_882; \
-int16x4_t __reint2_882 = __noswap_vset_lane_s16(*(int16_t *) &__reint_882, *(int16x4_t *) &__reint1_882, __p2_882); \
-  __ret_882 = *(float16x4_t *) &__reint2_882; \
-  __ret_882 = __builtin_shufflevector(__ret_882, __ret_882, 3, 2, 1, 0); \
-  __ret_882; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vsetq_lane_f16(__p0_883, __p1_883, __p2_883) __extension__ ({ \
-  float16x8_t __ret_883; \
-  float16_t __s0_883 = __p0_883; \
-  float16x8_t __s1_883 = __p1_883; \
-float16_t __reint_883 = __s0_883; \
-float16x8_t __reint1_883 = __s1_883; \
-int16x8_t __reint2_883 = vsetq_lane_s16(*(int16_t *) &__reint_883, *(int16x8_t *) &__reint1_883, __p2_883); \
-  __ret_883 = *(float16x8_t *) &__reint2_883; \
-  __ret_883; \
-})
-#else
-#define vsetq_lane_f16(__p0_884, __p1_884, __p2_884) __extension__ ({ \
-  float16x8_t __ret_884; \
-  float16_t __s0_884 = __p0_884; \
-  float16x8_t __s1_884 = __p1_884; \
-  float16x8_t __rev1_884;  __rev1_884 = __builtin_shufflevector(__s1_884, __s1_884, 7, 6, 5, 4, 3, 2, 1, 0); \
-float16_t __reint_884 = __s0_884; \
-float16x8_t __reint1_884 = __rev1_884; \
-int16x8_t __reint2_884 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_884, *(int16x8_t *) &__reint1_884, __p2_884); \
-  __ret_884 = *(float16x8_t *) &__reint2_884; \
-  __ret_884 = __builtin_shufflevector(__ret_884, __ret_884, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_884; \
-})
-#endif
-
-#if defined(__aarch64__) || defined(__arm64ec__)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("aes,neon"))) poly128_t vmull_high_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly128_t __ret;
-  __ret = vmull_p64((poly64_t)(vget_high_p64(__p0)), (poly64_t)(vget_high_p64(__p1)));
-  return __ret;
-}
-#else
-__ai __attribute__((target("aes,neon"))) poly128_t vmull_high_p64(poly64x2_t __p0, poly64x2_t __p1) {
-  poly128_t __ret;
-  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  __ret = vmull_p64((poly64_t)(__noswap_vget_high_p64(__rev0)), (poly64_t)(__noswap_vget_high_p64(__rev1)));
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlalq_lane_high_f16(__p0_885, __p1_885, __p2_885, __p3_885) __extension__ ({ \
-  float32x4_t __ret_885; \
-  float32x4_t __s0_885 = __p0_885; \
-  float16x8_t __s1_885 = __p1_885; \
-  float16x4_t __s2_885 = __p2_885; \
-  __ret_885 = vfmlalq_high_f16(__s0_885, __s1_885, (float16x8_t) {vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885)}); \
-  __ret_885; \
-})
-#else
-#define vfmlalq_lane_high_f16(__p0_886, __p1_886, __p2_886, __p3_886) __extension__ ({ \
-  float32x4_t __ret_886; \
-  float32x4_t __s0_886 = __p0_886; \
-  float16x8_t __s1_886 = __p1_886; \
-  float16x4_t __s2_886 = __p2_886; \
-  float32x4_t __rev0_886;  __rev0_886 = __builtin_shufflevector(__s0_886, __s0_886, 3, 2, 1, 0); \
-  float16x8_t __rev1_886;  __rev1_886 = __builtin_shufflevector(__s1_886, __s1_886, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_886;  __rev2_886 = __builtin_shufflevector(__s2_886, __s2_886, 3, 2, 1, 0); \
-  __ret_886 = __noswap_vfmlalq_high_f16(__rev0_886, __rev1_886, (float16x8_t) {__noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886)}); \
-  __ret_886 = __builtin_shufflevector(__ret_886, __ret_886, 3, 2, 1, 0); \
-  __ret_886; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlal_lane_high_f16(__p0_887, __p1_887, __p2_887, __p3_887) __extension__ ({ \
-  float32x2_t __ret_887; \
-  float32x2_t __s0_887 = __p0_887; \
-  float16x4_t __s1_887 = __p1_887; \
-  float16x4_t __s2_887 = __p2_887; \
-  __ret_887 = vfmlal_high_f16(__s0_887, __s1_887, (float16x4_t) {vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887)}); \
-  __ret_887; \
-})
-#else
-#define vfmlal_lane_high_f16(__p0_888, __p1_888, __p2_888, __p3_888) __extension__ ({ \
-  float32x2_t __ret_888; \
-  float32x2_t __s0_888 = __p0_888; \
-  float16x4_t __s1_888 = __p1_888; \
-  float16x4_t __s2_888 = __p2_888; \
-  float32x2_t __rev0_888;  __rev0_888 = __builtin_shufflevector(__s0_888, __s0_888, 1, 0); \
-  float16x4_t __rev1_888;  __rev1_888 = __builtin_shufflevector(__s1_888, __s1_888, 3, 2, 1, 0); \
-  float16x4_t __rev2_888;  __rev2_888 = __builtin_shufflevector(__s2_888, __s2_888, 3, 2, 1, 0); \
-  __ret_888 = __noswap_vfmlal_high_f16(__rev0_888, __rev1_888, (float16x4_t) {__noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888)}); \
-  __ret_888 = __builtin_shufflevector(__ret_888, __ret_888, 1, 0); \
-  __ret_888; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlalq_lane_low_f16(__p0_889, __p1_889, __p2_889, __p3_889) __extension__ ({ \
-  float32x4_t __ret_889; \
-  float32x4_t __s0_889 = __p0_889; \
-  float16x8_t __s1_889 = __p1_889; \
-  float16x4_t __s2_889 = __p2_889; \
-  __ret_889 = vfmlalq_low_f16(__s0_889, __s1_889, (float16x8_t) {vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889)}); \
-  __ret_889; \
-})
-#else
-#define vfmlalq_lane_low_f16(__p0_890, __p1_890, __p2_890, __p3_890) __extension__ ({ \
-  float32x4_t __ret_890; \
-  float32x4_t __s0_890 = __p0_890; \
-  float16x8_t __s1_890 = __p1_890; \
-  float16x4_t __s2_890 = __p2_890; \
-  float32x4_t __rev0_890;  __rev0_890 = __builtin_shufflevector(__s0_890, __s0_890, 3, 2, 1, 0); \
-  float16x8_t __rev1_890;  __rev1_890 = __builtin_shufflevector(__s1_890, __s1_890, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_890;  __rev2_890 = __builtin_shufflevector(__s2_890, __s2_890, 3, 2, 1, 0); \
-  __ret_890 = __noswap_vfmlalq_low_f16(__rev0_890, __rev1_890, (float16x8_t) {__noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890)}); \
-  __ret_890 = __builtin_shufflevector(__ret_890, __ret_890, 3, 2, 1, 0); \
-  __ret_890; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlal_lane_low_f16(__p0_891, __p1_891, __p2_891, __p3_891) __extension__ ({ \
-  float32x2_t __ret_891; \
-  float32x2_t __s0_891 = __p0_891; \
-  float16x4_t __s1_891 = __p1_891; \
-  float16x4_t __s2_891 = __p2_891; \
-  __ret_891 = vfmlal_low_f16(__s0_891, __s1_891, (float16x4_t) {vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891)}); \
-  __ret_891; \
-})
-#else
-#define vfmlal_lane_low_f16(__p0_892, __p1_892, __p2_892, __p3_892) __extension__ ({ \
-  float32x2_t __ret_892; \
-  float32x2_t __s0_892 = __p0_892; \
-  float16x4_t __s1_892 = __p1_892; \
-  float16x4_t __s2_892 = __p2_892; \
-  float32x2_t __rev0_892;  __rev0_892 = __builtin_shufflevector(__s0_892, __s0_892, 1, 0); \
-  float16x4_t __rev1_892;  __rev1_892 = __builtin_shufflevector(__s1_892, __s1_892, 3, 2, 1, 0); \
-  float16x4_t __rev2_892;  __rev2_892 = __builtin_shufflevector(__s2_892, __s2_892, 3, 2, 1, 0); \
-  __ret_892 = __noswap_vfmlal_low_f16(__rev0_892, __rev1_892, (float16x4_t) {__noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892)}); \
-  __ret_892 = __builtin_shufflevector(__ret_892, __ret_892, 1, 0); \
-  __ret_892; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlalq_laneq_high_f16(__p0_893, __p1_893, __p2_893, __p3_893) __extension__ ({ \
-  float32x4_t __ret_893; \
-  float32x4_t __s0_893 = __p0_893; \
-  float16x8_t __s1_893 = __p1_893; \
-  float16x8_t __s2_893 = __p2_893; \
-  __ret_893 = vfmlalq_high_f16(__s0_893, __s1_893, (float16x8_t) {vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893)}); \
-  __ret_893; \
-})
-#else
-#define vfmlalq_laneq_high_f16(__p0_894, __p1_894, __p2_894, __p3_894) __extension__ ({ \
-  float32x4_t __ret_894; \
-  float32x4_t __s0_894 = __p0_894; \
-  float16x8_t __s1_894 = __p1_894; \
-  float16x8_t __s2_894 = __p2_894; \
-  float32x4_t __rev0_894;  __rev0_894 = __builtin_shufflevector(__s0_894, __s0_894, 3, 2, 1, 0); \
-  float16x8_t __rev1_894;  __rev1_894 = __builtin_shufflevector(__s1_894, __s1_894, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_894;  __rev2_894 = __builtin_shufflevector(__s2_894, __s2_894, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_894 = __noswap_vfmlalq_high_f16(__rev0_894, __rev1_894, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894)}); \
-  __ret_894 = __builtin_shufflevector(__ret_894, __ret_894, 3, 2, 1, 0); \
-  __ret_894; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlal_laneq_high_f16(__p0_895, __p1_895, __p2_895, __p3_895) __extension__ ({ \
-  float32x2_t __ret_895; \
-  float32x2_t __s0_895 = __p0_895; \
-  float16x4_t __s1_895 = __p1_895; \
-  float16x8_t __s2_895 = __p2_895; \
-  __ret_895 = vfmlal_high_f16(__s0_895, __s1_895, (float16x4_t) {vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895)}); \
-  __ret_895; \
-})
-#else
-#define vfmlal_laneq_high_f16(__p0_896, __p1_896, __p2_896, __p3_896) __extension__ ({ \
-  float32x2_t __ret_896; \
-  float32x2_t __s0_896 = __p0_896; \
-  float16x4_t __s1_896 = __p1_896; \
-  float16x8_t __s2_896 = __p2_896; \
-  float32x2_t __rev0_896;  __rev0_896 = __builtin_shufflevector(__s0_896, __s0_896, 1, 0); \
-  float16x4_t __rev1_896;  __rev1_896 = __builtin_shufflevector(__s1_896, __s1_896, 3, 2, 1, 0); \
-  float16x8_t __rev2_896;  __rev2_896 = __builtin_shufflevector(__s2_896, __s2_896, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_896 = __noswap_vfmlal_high_f16(__rev0_896, __rev1_896, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896)}); \
-  __ret_896 = __builtin_shufflevector(__ret_896, __ret_896, 1, 0); \
-  __ret_896; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlalq_laneq_low_f16(__p0_897, __p1_897, __p2_897, __p3_897) __extension__ ({ \
-  float32x4_t __ret_897; \
-  float32x4_t __s0_897 = __p0_897; \
-  float16x8_t __s1_897 = __p1_897; \
-  float16x8_t __s2_897 = __p2_897; \
-  __ret_897 = vfmlalq_low_f16(__s0_897, __s1_897, (float16x8_t) {vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897)}); \
-  __ret_897; \
-})
-#else
-#define vfmlalq_laneq_low_f16(__p0_898, __p1_898, __p2_898, __p3_898) __extension__ ({ \
-  float32x4_t __ret_898; \
-  float32x4_t __s0_898 = __p0_898; \
-  float16x8_t __s1_898 = __p1_898; \
-  float16x8_t __s2_898 = __p2_898; \
-  float32x4_t __rev0_898;  __rev0_898 = __builtin_shufflevector(__s0_898, __s0_898, 3, 2, 1, 0); \
-  float16x8_t __rev1_898;  __rev1_898 = __builtin_shufflevector(__s1_898, __s1_898, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_898;  __rev2_898 = __builtin_shufflevector(__s2_898, __s2_898, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_898 = __noswap_vfmlalq_low_f16(__rev0_898, __rev1_898, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898)}); \
-  __ret_898 = __builtin_shufflevector(__ret_898, __ret_898, 3, 2, 1, 0); \
-  __ret_898; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlal_laneq_low_f16(__p0_899, __p1_899, __p2_899, __p3_899) __extension__ ({ \
-  float32x2_t __ret_899; \
-  float32x2_t __s0_899 = __p0_899; \
-  float16x4_t __s1_899 = __p1_899; \
-  float16x8_t __s2_899 = __p2_899; \
-  __ret_899 = vfmlal_low_f16(__s0_899, __s1_899, (float16x4_t) {vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899)}); \
-  __ret_899; \
-})
-#else
-#define vfmlal_laneq_low_f16(__p0_900, __p1_900, __p2_900, __p3_900) __extension__ ({ \
-  float32x2_t __ret_900; \
-  float32x2_t __s0_900 = __p0_900; \
-  float16x4_t __s1_900 = __p1_900; \
-  float16x8_t __s2_900 = __p2_900; \
-  float32x2_t __rev0_900;  __rev0_900 = __builtin_shufflevector(__s0_900, __s0_900, 1, 0); \
-  float16x4_t __rev1_900;  __rev1_900 = __builtin_shufflevector(__s1_900, __s1_900, 3, 2, 1, 0); \
-  float16x8_t __rev2_900;  __rev2_900 = __builtin_shufflevector(__s2_900, __s2_900, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_900 = __noswap_vfmlal_low_f16(__rev0_900, __rev1_900, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900)}); \
-  __ret_900 = __builtin_shufflevector(__ret_900, __ret_900, 1, 0); \
-  __ret_900; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlslq_lane_high_f16(__p0_901, __p1_901, __p2_901, __p3_901) __extension__ ({ \
-  float32x4_t __ret_901; \
-  float32x4_t __s0_901 = __p0_901; \
-  float16x8_t __s1_901 = __p1_901; \
-  float16x4_t __s2_901 = __p2_901; \
-  __ret_901 = vfmlslq_high_f16(__s0_901, __s1_901, (float16x8_t) {vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901)}); \
-  __ret_901; \
-})
-#else
-#define vfmlslq_lane_high_f16(__p0_902, __p1_902, __p2_902, __p3_902) __extension__ ({ \
-  float32x4_t __ret_902; \
-  float32x4_t __s0_902 = __p0_902; \
-  float16x8_t __s1_902 = __p1_902; \
-  float16x4_t __s2_902 = __p2_902; \
-  float32x4_t __rev0_902;  __rev0_902 = __builtin_shufflevector(__s0_902, __s0_902, 3, 2, 1, 0); \
-  float16x8_t __rev1_902;  __rev1_902 = __builtin_shufflevector(__s1_902, __s1_902, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_902;  __rev2_902 = __builtin_shufflevector(__s2_902, __s2_902, 3, 2, 1, 0); \
-  __ret_902 = __noswap_vfmlslq_high_f16(__rev0_902, __rev1_902, (float16x8_t) {__noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902)}); \
-  __ret_902 = __builtin_shufflevector(__ret_902, __ret_902, 3, 2, 1, 0); \
-  __ret_902; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlsl_lane_high_f16(__p0_903, __p1_903, __p2_903, __p3_903) __extension__ ({ \
-  float32x2_t __ret_903; \
-  float32x2_t __s0_903 = __p0_903; \
-  float16x4_t __s1_903 = __p1_903; \
-  float16x4_t __s2_903 = __p2_903; \
-  __ret_903 = vfmlsl_high_f16(__s0_903, __s1_903, (float16x4_t) {vget_lane_f16(__s2_903, __p3_903), vget_lane_f16(__s2_903, __p3_903), vget_lane_f16(__s2_903, __p3_903), vget_lane_f16(__s2_903, __p3_903)}); \
-  __ret_903; \
-})
-#else
-#define vfmlsl_lane_high_f16(__p0_904, __p1_904, __p2_904, __p3_904) __extension__ ({ \
-  float32x2_t __ret_904; \
-  float32x2_t __s0_904 = __p0_904; \
-  float16x4_t __s1_904 = __p1_904; \
-  float16x4_t __s2_904 = __p2_904; \
-  float32x2_t __rev0_904;  __rev0_904 = __builtin_shufflevector(__s0_904, __s0_904, 1, 0); \
-  float16x4_t __rev1_904;  __rev1_904 = __builtin_shufflevector(__s1_904, __s1_904, 3, 2, 1, 0); \
-  float16x4_t __rev2_904;  __rev2_904 = __builtin_shufflevector(__s2_904, __s2_904, 3, 2, 1, 0); \
-  __ret_904 = __noswap_vfmlsl_high_f16(__rev0_904, __rev1_904, (float16x4_t) {__noswap_vget_lane_f16(__rev2_904, __p3_904), __noswap_vget_lane_f16(__rev2_904, __p3_904), __noswap_vget_lane_f16(__rev2_904, __p3_904), __noswap_vget_lane_f16(__rev2_904, __p3_904)}); \
-  __ret_904 = __builtin_shufflevector(__ret_904, __ret_904, 1, 0); \
-  __ret_904; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlslq_lane_low_f16(__p0_905, __p1_905, __p2_905, __p3_905) __extension__ ({ \
-  float32x4_t __ret_905; \
-  float32x4_t __s0_905 = __p0_905; \
-  float16x8_t __s1_905 = __p1_905; \
-  float16x4_t __s2_905 = __p2_905; \
-  __ret_905 = vfmlslq_low_f16(__s0_905, __s1_905, (float16x8_t) {vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905)}); \
-  __ret_905; \
-})
-#else
-#define vfmlslq_lane_low_f16(__p0_906, __p1_906, __p2_906, __p3_906) __extension__ ({ \
-  float32x4_t __ret_906; \
-  float32x4_t __s0_906 = __p0_906; \
-  float16x8_t __s1_906 = __p1_906; \
-  float16x4_t __s2_906 = __p2_906; \
-  float32x4_t __rev0_906;  __rev0_906 = __builtin_shufflevector(__s0_906, __s0_906, 3, 2, 1, 0); \
-  float16x8_t __rev1_906;  __rev1_906 = __builtin_shufflevector(__s1_906, __s1_906, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x4_t __rev2_906;  __rev2_906 = __builtin_shufflevector(__s2_906, __s2_906, 3, 2, 1, 0); \
-  __ret_906 = __noswap_vfmlslq_low_f16(__rev0_906, __rev1_906, (float16x8_t) {__noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906)}); \
-  __ret_906 = __builtin_shufflevector(__ret_906, __ret_906, 3, 2, 1, 0); \
-  __ret_906; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlsl_lane_low_f16(__p0_907, __p1_907, __p2_907, __p3_907) __extension__ ({ \
-  float32x2_t __ret_907; \
-  float32x2_t __s0_907 = __p0_907; \
-  float16x4_t __s1_907 = __p1_907; \
-  float16x4_t __s2_907 = __p2_907; \
-  __ret_907 = vfmlsl_low_f16(__s0_907, __s1_907, (float16x4_t) {vget_lane_f16(__s2_907, __p3_907), vget_lane_f16(__s2_907, __p3_907), vget_lane_f16(__s2_907, __p3_907), vget_lane_f16(__s2_907, __p3_907)}); \
-  __ret_907; \
-})
-#else
-#define vfmlsl_lane_low_f16(__p0_908, __p1_908, __p2_908, __p3_908) __extension__ ({ \
-  float32x2_t __ret_908; \
-  float32x2_t __s0_908 = __p0_908; \
-  float16x4_t __s1_908 = __p1_908; \
-  float16x4_t __s2_908 = __p2_908; \
-  float32x2_t __rev0_908;  __rev0_908 = __builtin_shufflevector(__s0_908, __s0_908, 1, 0); \
-  float16x4_t __rev1_908;  __rev1_908 = __builtin_shufflevector(__s1_908, __s1_908, 3, 2, 1, 0); \
-  float16x4_t __rev2_908;  __rev2_908 = __builtin_shufflevector(__s2_908, __s2_908, 3, 2, 1, 0); \
-  __ret_908 = __noswap_vfmlsl_low_f16(__rev0_908, __rev1_908, (float16x4_t) {__noswap_vget_lane_f16(__rev2_908, __p3_908), __noswap_vget_lane_f16(__rev2_908, __p3_908), __noswap_vget_lane_f16(__rev2_908, __p3_908), __noswap_vget_lane_f16(__rev2_908, __p3_908)}); \
-  __ret_908 = __builtin_shufflevector(__ret_908, __ret_908, 1, 0); \
-  __ret_908; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlslq_laneq_high_f16(__p0_909, __p1_909, __p2_909, __p3_909) __extension__ ({ \
-  float32x4_t __ret_909; \
-  float32x4_t __s0_909 = __p0_909; \
-  float16x8_t __s1_909 = __p1_909; \
-  float16x8_t __s2_909 = __p2_909; \
-  __ret_909 = vfmlslq_high_f16(__s0_909, __s1_909, (float16x8_t) {vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909)}); \
-  __ret_909; \
-})
-#else
-#define vfmlslq_laneq_high_f16(__p0_910, __p1_910, __p2_910, __p3_910) __extension__ ({ \
-  float32x4_t __ret_910; \
-  float32x4_t __s0_910 = __p0_910; \
-  float16x8_t __s1_910 = __p1_910; \
-  float16x8_t __s2_910 = __p2_910; \
-  float32x4_t __rev0_910;  __rev0_910 = __builtin_shufflevector(__s0_910, __s0_910, 3, 2, 1, 0); \
-  float16x8_t __rev1_910;  __rev1_910 = __builtin_shufflevector(__s1_910, __s1_910, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_910;  __rev2_910 = __builtin_shufflevector(__s2_910, __s2_910, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_910 = __noswap_vfmlslq_high_f16(__rev0_910, __rev1_910, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910)}); \
-  __ret_910 = __builtin_shufflevector(__ret_910, __ret_910, 3, 2, 1, 0); \
-  __ret_910; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlsl_laneq_high_f16(__p0_911, __p1_911, __p2_911, __p3_911) __extension__ ({ \
-  float32x2_t __ret_911; \
-  float32x2_t __s0_911 = __p0_911; \
-  float16x4_t __s1_911 = __p1_911; \
-  float16x8_t __s2_911 = __p2_911; \
-  __ret_911 = vfmlsl_high_f16(__s0_911, __s1_911, (float16x4_t) {vgetq_lane_f16(__s2_911, __p3_911), vgetq_lane_f16(__s2_911, __p3_911), vgetq_lane_f16(__s2_911, __p3_911), vgetq_lane_f16(__s2_911, __p3_911)}); \
-  __ret_911; \
-})
-#else
-#define vfmlsl_laneq_high_f16(__p0_912, __p1_912, __p2_912, __p3_912) __extension__ ({ \
-  float32x2_t __ret_912; \
-  float32x2_t __s0_912 = __p0_912; \
-  float16x4_t __s1_912 = __p1_912; \
-  float16x8_t __s2_912 = __p2_912; \
-  float32x2_t __rev0_912;  __rev0_912 = __builtin_shufflevector(__s0_912, __s0_912, 1, 0); \
-  float16x4_t __rev1_912;  __rev1_912 = __builtin_shufflevector(__s1_912, __s1_912, 3, 2, 1, 0); \
-  float16x8_t __rev2_912;  __rev2_912 = __builtin_shufflevector(__s2_912, __s2_912, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_912 = __noswap_vfmlsl_high_f16(__rev0_912, __rev1_912, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_912, __p3_912), __noswap_vgetq_lane_f16(__rev2_912, __p3_912), __noswap_vgetq_lane_f16(__rev2_912, __p3_912), __noswap_vgetq_lane_f16(__rev2_912, __p3_912)}); \
-  __ret_912 = __builtin_shufflevector(__ret_912, __ret_912, 1, 0); \
-  __ret_912; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlslq_laneq_low_f16(__p0_913, __p1_913, __p2_913, __p3_913) __extension__ ({ \
-  float32x4_t __ret_913; \
-  float32x4_t __s0_913 = __p0_913; \
-  float16x8_t __s1_913 = __p1_913; \
-  float16x8_t __s2_913 = __p2_913; \
-  __ret_913 = vfmlslq_low_f16(__s0_913, __s1_913, (float16x8_t) {vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913)}); \
-  __ret_913; \
-})
-#else
-#define vfmlslq_laneq_low_f16(__p0_914, __p1_914, __p2_914, __p3_914) __extension__ ({ \
-  float32x4_t __ret_914; \
-  float32x4_t __s0_914 = __p0_914; \
-  float16x8_t __s1_914 = __p1_914; \
-  float16x8_t __s2_914 = __p2_914; \
-  float32x4_t __rev0_914;  __rev0_914 = __builtin_shufflevector(__s0_914, __s0_914, 3, 2, 1, 0); \
-  float16x8_t __rev1_914;  __rev1_914 = __builtin_shufflevector(__s1_914, __s1_914, 7, 6, 5, 4, 3, 2, 1, 0); \
-  float16x8_t __rev2_914;  __rev2_914 = __builtin_shufflevector(__s2_914, __s2_914, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_914 = __noswap_vfmlslq_low_f16(__rev0_914, __rev1_914, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914)}); \
-  __ret_914 = __builtin_shufflevector(__ret_914, __ret_914, 3, 2, 1, 0); \
-  __ret_914; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vfmlsl_laneq_low_f16(__p0_915, __p1_915, __p2_915, __p3_915) __extension__ ({ \
-  float32x2_t __ret_915; \
-  float32x2_t __s0_915 = __p0_915; \
-  float16x4_t __s1_915 = __p1_915; \
-  float16x8_t __s2_915 = __p2_915; \
-  __ret_915 = vfmlsl_low_f16(__s0_915, __s1_915, (float16x4_t) {vgetq_lane_f16(__s2_915, __p3_915), vgetq_lane_f16(__s2_915, __p3_915), vgetq_lane_f16(__s2_915, __p3_915), vgetq_lane_f16(__s2_915, __p3_915)}); \
-  __ret_915; \
-})
-#else
-#define vfmlsl_laneq_low_f16(__p0_916, __p1_916, __p2_916, __p3_916) __extension__ ({ \
-  float32x2_t __ret_916; \
-  float32x2_t __s0_916 = __p0_916; \
-  float16x4_t __s1_916 = __p1_916; \
-  float16x8_t __s2_916 = __p2_916; \
-  float32x2_t __rev0_916;  __rev0_916 = __builtin_shufflevector(__s0_916, __s0_916, 1, 0); \
-  float16x4_t __rev1_916;  __rev1_916 = __builtin_shufflevector(__s1_916, __s1_916, 3, 2, 1, 0); \
-  float16x8_t __rev2_916;  __rev2_916 = __builtin_shufflevector(__s2_916, __s2_916, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_916 = __noswap_vfmlsl_low_f16(__rev0_916, __rev1_916, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_916, __p3_916), __noswap_vgetq_lane_f16(__rev2_916, __p3_916), __noswap_vgetq_lane_f16(__rev2_916, __p3_916), __noswap_vgetq_lane_f16(__rev2_916, __p3_916)}); \
-  __ret_916 = __builtin_shufflevector(__ret_916, __ret_916, 1, 0); \
-  __ret_916; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulh_lane_f16(__p0_917, __p1_917, __p2_917) __extension__ ({ \
-  float16_t __ret_917; \
-  float16_t __s0_917 = __p0_917; \
-  float16x4_t __s1_917 = __p1_917; \
-  __ret_917 = __s0_917 * vget_lane_f16(__s1_917, __p2_917); \
-  __ret_917; \
-})
-#else
-#define vmulh_lane_f16(__p0_918, __p1_918, __p2_918) __extension__ ({ \
-  float16_t __ret_918; \
-  float16_t __s0_918 = __p0_918; \
-  float16x4_t __s1_918 = __p1_918; \
-  float16x4_t __rev1_918;  __rev1_918 = __builtin_shufflevector(__s1_918, __s1_918, 3, 2, 1, 0); \
-  __ret_918 = __s0_918 * __noswap_vget_lane_f16(__rev1_918, __p2_918); \
-  __ret_918; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vmulh_laneq_f16(__p0_919, __p1_919, __p2_919) __extension__ ({ \
-  float16_t __ret_919; \
-  float16_t __s0_919 = __p0_919; \
-  float16x8_t __s1_919 = __p1_919; \
-  __ret_919 = __s0_919 * vgetq_lane_f16(__s1_919, __p2_919); \
-  __ret_919; \
-})
-#else
-#define vmulh_laneq_f16(__p0_920, __p1_920, __p2_920) __extension__ ({ \
-  float16_t __ret_920; \
-  float16_t __s0_920 = __p0_920; \
-  float16x8_t __s1_920 = __p1_920; \
-  float16x8_t __rev1_920;  __rev1_920 = __builtin_shufflevector(__s1_920, __s1_920, 7, 6, 5, 4, 3, 2, 1, 0); \
-  __ret_920 = __s0_920 * __noswap_vgetq_lane_f16(__rev1_920, __p2_920); \
-  __ret_920; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vabdl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  __ret = vabdl_u8(vget_high_u8(__p0), vget_high_u8(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vabdl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabdl_u8(__noswap_vget_high_u8(__rev0), __noswap_vget_high_u8(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vabdl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  __ret = vabdl_u32(vget_high_u32(__p0), vget_high_u32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vabdl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vabdl_u32(__noswap_vget_high_u32(__rev0), __noswap_vget_high_u32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vabdl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  __ret = vabdl_u16(vget_high_u16(__p0), vget_high_u16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vabdl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabdl_u16(__noswap_vget_high_u16(__rev0), __noswap_vget_high_u16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vabdl_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  __ret = vabdl_s8(vget_high_s8(__p0), vget_high_s8(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vabdl_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabdl_s8(__noswap_vget_high_s8(__rev0), __noswap_vget_high_s8(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vabdl_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = vabdl_s32(vget_high_s32(__p0), vget_high_s32(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vabdl_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vabdl_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vabdl_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = vabdl_s16(vget_high_s16(__p0), vget_high_s16(__p1));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vabdl_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabdl_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vaddl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  __ret = vmovl_high_u8(__p0) + vmovl_high_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vaddl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_u8(__rev0) + __noswap_vmovl_high_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vaddl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  __ret = vmovl_high_u32(__p0) + vmovl_high_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vaddl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_u32(__rev0) + __noswap_vmovl_high_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vaddl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  __ret = vmovl_high_u16(__p0) + vmovl_high_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vaddl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_u16(__rev0) + __noswap_vmovl_high_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vaddl_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  __ret = vmovl_high_s8(__p0) + vmovl_high_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vaddl_high_s8(int8x16_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_s8(__rev0) + __noswap_vmovl_high_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vaddl_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = vmovl_high_s32(__p0) + vmovl_high_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vaddl_high_s32(int32x4_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_s32(__rev0) + __noswap_vmovl_high_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vaddl_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = vmovl_high_s16(__p0) + vmovl_high_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vaddl_high_s16(int16x8_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmovl_high_s16(__rev0) + __noswap_vmovl_high_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vaddw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  __ret = __p0 + vmovl_high_u8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vaddw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_high_u8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vaddw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  __ret = __p0 + vmovl_high_u32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vaddw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_high_u32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vaddw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  __ret = __p0 + vmovl_high_u16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vaddw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_high_u16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vaddw_high_s8(int16x8_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  __ret = __p0 + vmovl_high_s8(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vaddw_high_s8(int16x8_t __p0, int8x16_t __p1) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_high_s8(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vaddw_high_s32(int64x2_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  __ret = __p0 + vmovl_high_s32(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vaddw_high_s32(int64x2_t __p0, int32x4_t __p1) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_high_s32(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  __ret = __p0 + vmovl_high_s16(__p1);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vmovl_high_s16(__rev1);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_p64(__p0_921, __p1_921, __p2_921, __p3_921) __extension__ ({ \
-  poly64x2_t __ret_921; \
-  poly64x2_t __s0_921 = __p0_921; \
-  poly64x1_t __s2_921 = __p2_921; \
-  __ret_921 = vsetq_lane_p64(vget_lane_p64(__s2_921, __p3_921), __s0_921, __p1_921); \
-  __ret_921; \
-})
-#else
-#define vcopyq_lane_p64(__p0_922, __p1_922, __p2_922, __p3_922) __extension__ ({ \
-  poly64x2_t __ret_922; \
-  poly64x2_t __s0_922 = __p0_922; \
-  poly64x1_t __s2_922 = __p2_922; \
-  poly64x2_t __rev0_922;  __rev0_922 = __builtin_shufflevector(__s0_922, __s0_922, 1, 0); \
-  __ret_922 = __noswap_vsetq_lane_p64(vget_lane_p64(__s2_922, __p3_922), __rev0_922, __p1_922); \
-  __ret_922 = __builtin_shufflevector(__ret_922, __ret_922, 1, 0); \
-  __ret_922; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_lane_f64(__p0_923, __p1_923, __p2_923, __p3_923) __extension__ ({ \
-  float64x2_t __ret_923; \
-  float64x2_t __s0_923 = __p0_923; \
-  float64x1_t __s2_923 = __p2_923; \
-  __ret_923 = vsetq_lane_f64(vget_lane_f64(__s2_923, __p3_923), __s0_923, __p1_923); \
-  __ret_923; \
-})
-#else
-#define vcopyq_lane_f64(__p0_924, __p1_924, __p2_924, __p3_924) __extension__ ({ \
-  float64x2_t __ret_924; \
-  float64x2_t __s0_924 = __p0_924; \
-  float64x1_t __s2_924 = __p2_924; \
-  float64x2_t __rev0_924;  __rev0_924 = __builtin_shufflevector(__s0_924, __s0_924, 1, 0); \
-  __ret_924 = __noswap_vsetq_lane_f64(vget_lane_f64(__s2_924, __p3_924), __rev0_924, __p1_924); \
-  __ret_924 = __builtin_shufflevector(__ret_924, __ret_924, 1, 0); \
-  __ret_924; \
-})
-#endif
-
-#define vcopy_lane_p64(__p0_925, __p1_925, __p2_925, __p3_925) __extension__ ({ \
-  poly64x1_t __ret_925; \
-  poly64x1_t __s0_925 = __p0_925; \
-  poly64x1_t __s2_925 = __p2_925; \
-  __ret_925 = vset_lane_p64(vget_lane_p64(__s2_925, __p3_925), __s0_925, __p1_925); \
-  __ret_925; \
-})
-#define vcopy_lane_f64(__p0_926, __p1_926, __p2_926, __p3_926) __extension__ ({ \
-  float64x1_t __ret_926; \
-  float64x1_t __s0_926 = __p0_926; \
-  float64x1_t __s2_926 = __p2_926; \
-  __ret_926 = vset_lane_f64(vget_lane_f64(__s2_926, __p3_926), __s0_926, __p1_926); \
-  __ret_926; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_p64(__p0_927, __p1_927, __p2_927, __p3_927) __extension__ ({ \
-  poly64x2_t __ret_927; \
-  poly64x2_t __s0_927 = __p0_927; \
-  poly64x2_t __s2_927 = __p2_927; \
-  __ret_927 = vsetq_lane_p64(vgetq_lane_p64(__s2_927, __p3_927), __s0_927, __p1_927); \
-  __ret_927; \
-})
-#else
-#define vcopyq_laneq_p64(__p0_928, __p1_928, __p2_928, __p3_928) __extension__ ({ \
-  poly64x2_t __ret_928; \
-  poly64x2_t __s0_928 = __p0_928; \
-  poly64x2_t __s2_928 = __p2_928; \
-  poly64x2_t __rev0_928;  __rev0_928 = __builtin_shufflevector(__s0_928, __s0_928, 1, 0); \
-  poly64x2_t __rev2_928;  __rev2_928 = __builtin_shufflevector(__s2_928, __s2_928, 1, 0); \
-  __ret_928 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_928, __p3_928), __rev0_928, __p1_928); \
-  __ret_928 = __builtin_shufflevector(__ret_928, __ret_928, 1, 0); \
-  __ret_928; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopyq_laneq_f64(__p0_929, __p1_929, __p2_929, __p3_929) __extension__ ({ \
-  float64x2_t __ret_929; \
-  float64x2_t __s0_929 = __p0_929; \
-  float64x2_t __s2_929 = __p2_929; \
-  __ret_929 = vsetq_lane_f64(vgetq_lane_f64(__s2_929, __p3_929), __s0_929, __p1_929); \
-  __ret_929; \
-})
-#else
-#define vcopyq_laneq_f64(__p0_930, __p1_930, __p2_930, __p3_930) __extension__ ({ \
-  float64x2_t __ret_930; \
-  float64x2_t __s0_930 = __p0_930; \
-  float64x2_t __s2_930 = __p2_930; \
-  float64x2_t __rev0_930;  __rev0_930 = __builtin_shufflevector(__s0_930, __s0_930, 1, 0); \
-  float64x2_t __rev2_930;  __rev2_930 = __builtin_shufflevector(__s2_930, __s2_930, 1, 0); \
-  __ret_930 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_930, __p3_930), __rev0_930, __p1_930); \
-  __ret_930 = __builtin_shufflevector(__ret_930, __ret_930, 1, 0); \
-  __ret_930; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_p64(__p0_931, __p1_931, __p2_931, __p3_931) __extension__ ({ \
-  poly64x1_t __ret_931; \
-  poly64x1_t __s0_931 = __p0_931; \
-  poly64x2_t __s2_931 = __p2_931; \
-  __ret_931 = vset_lane_p64(vgetq_lane_p64(__s2_931, __p3_931), __s0_931, __p1_931); \
-  __ret_931; \
-})
-#else
-#define vcopy_laneq_p64(__p0_932, __p1_932, __p2_932, __p3_932) __extension__ ({ \
-  poly64x1_t __ret_932; \
-  poly64x1_t __s0_932 = __p0_932; \
-  poly64x2_t __s2_932 = __p2_932; \
-  poly64x2_t __rev2_932;  __rev2_932 = __builtin_shufflevector(__s2_932, __s2_932, 1, 0); \
-  __ret_932 = vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_932, __p3_932), __s0_932, __p1_932); \
-  __ret_932; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-#define vcopy_laneq_f64(__p0_933, __p1_933, __p2_933, __p3_933) __extension__ ({ \
-  float64x1_t __ret_933; \
-  float64x1_t __s0_933 = __p0_933; \
-  float64x2_t __s2_933 = __p2_933; \
-  __ret_933 = vset_lane_f64(vgetq_lane_f64(__s2_933, __p3_933), __s0_933, __p1_933); \
-  __ret_933; \
-})
-#else
-#define vcopy_laneq_f64(__p0_934, __p1_934, __p2_934, __p3_934) __extension__ ({ \
-  float64x1_t __ret_934; \
-  float64x1_t __s0_934 = __p0_934; \
-  float64x2_t __s2_934 = __p2_934; \
-  float64x2_t __rev2_934;  __rev2_934 = __builtin_shufflevector(__s2_934, __s2_934, 1, 0); \
-  __ret_934 = vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_934, __p3_934), __s0_934, __p1_934); \
-  __ret_934; \
-})
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint16x8_t __ret;
-  __ret = vmlal_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint64x2_t __ret;
-  __ret = vmlal_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint32x4_t __ret;
-  __ret = vmlal_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int16x8_t __ret;
-  __ret = vmlal_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  __ret = vmlal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  __ret = vmlal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  __ret = vmlal_n_u32(__p0, vget_high_u32(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlal_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_n_u32(__rev0, __noswap_vget_high_u32(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  __ret = vmlal_n_u16(__p0, vget_high_u16(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlal_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_n_u16(__rev0, __noswap_vget_high_u16(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = vmlal_n_s32(__p0, vget_high_s32(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = vmlal_n_s16(__p0, vget_high_s16(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlal_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vmlsl_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint16x8_t __ret;
-  __ret = vmlsl_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vmlsl_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint64x2_t __ret;
-  __ret = vmlsl_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint32x4_t __ret;
-  __ret = vmlsl_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vmlsl_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int16x8_t __ret;
-  __ret = vmlsl_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vmlsl_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  __ret = vmlsl_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  __ret = vmlsl_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  __ret = vmlsl_n_u32(__p0, vget_high_u32(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vmlsl_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_n_u32(__rev0, __noswap_vget_high_u32(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  __ret = vmlsl_n_u16(__p0, vget_high_u16(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vmlsl_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_n_u16(__rev0, __noswap_vget_high_u16(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  __ret = vmlsl_n_s32(__p0, vget_high_s32(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  __ret = vmlsl_n_s16(__p0, vget_high_s16(__p1), __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vmlsl_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#define vmulx_lane_f64(__p0_935, __p1_935, __p2_935) __extension__ ({ \
-  float64x1_t __ret_935; \
-  float64x1_t __s0_935 = __p0_935; \
-  float64x1_t __s1_935 = __p1_935; \
-  float64_t __x_935 = vget_lane_f64(__s0_935, 0); \
-  float64_t __y_935 = vget_lane_f64(__s1_935, __p2_935); \
-  float64_t __z_935 = vmulxd_f64(__x_935, __y_935); \
-  __ret_935 = vset_lane_f64(__z_935, __s0_935, __p2_935); \
-  __ret_935; \
-})
-#ifdef __LITTLE_ENDIAN__
-#define vmulx_laneq_f64(__p0_936, __p1_936, __p2_936) __extension__ ({ \
-  float64x1_t __ret_936; \
-  float64x1_t __s0_936 = __p0_936; \
-  float64x2_t __s1_936 = __p1_936; \
-  float64_t __x_936 = vget_lane_f64(__s0_936, 0); \
-  float64_t __y_936 = vgetq_lane_f64(__s1_936, __p2_936); \
-  float64_t __z_936 = vmulxd_f64(__x_936, __y_936); \
-  __ret_936 = vset_lane_f64(__z_936, __s0_936, 0); \
-  __ret_936; \
-})
-#else
-#define vmulx_laneq_f64(__p0_937, __p1_937, __p2_937) __extension__ ({ \
-  float64x1_t __ret_937; \
-  float64x1_t __s0_937 = __p0_937; \
-  float64x2_t __s1_937 = __p1_937; \
-  float64x2_t __rev1_937;  __rev1_937 = __builtin_shufflevector(__s1_937, __s1_937, 1, 0); \
-  float64_t __x_937 = vget_lane_f64(__s0_937, 0); \
-  float64_t __y_937 = __noswap_vgetq_lane_f64(__rev1_937, __p2_937); \
-  float64_t __z_937 = vmulxd_f64(__x_937, __y_937); \
-  __ret_937 = vset_lane_f64(__z_937, __s0_937, 0); \
-  __ret_937; \
-})
-#endif
-
-#endif
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 + vabdl_u8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdl_u8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint16x8_t __noswap_vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
-  uint16x8_t __ret;
-  __ret = __p0 + __noswap_vabdl_u8(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 + vabdl_u32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __noswap_vabdl_u32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint64x2_t __noswap_vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
-  uint64x2_t __ret;
-  __ret = __p0 + __noswap_vabdl_u32(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + vabdl_u16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdl_u16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) uint32x4_t __noswap_vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
-  uint32x4_t __ret;
-  __ret = __p0 + __noswap_vabdl_u16(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 + vabdl_s8(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdl_s8(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int16x8_t __noswap_vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
-  int16x8_t __ret;
-  __ret = __p0 + __noswap_vabdl_s8(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 + vabdl_s32(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
-  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
-  __ret = __rev0 + __noswap_vabdl_s32(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int64x2_t __noswap_vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
-  int64x2_t __ret;
-  __ret = __p0 + __noswap_vabdl_s32(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + vabdl_s16(__p1, __p2);
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __rev0 + __noswap_vabdl_s16(__rev1, __rev2);
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-__ai __attribute__((target("neon"))) int32x4_t __noswap_vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
-  int32x4_t __ret;
-  __ret = __p0 + __noswap_vabdl_s16(__p1, __p2);
-  return __ret;
-}
-#endif
-
-#if defined(__aarch64__) || defined(__arm64ec__)
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint16x8_t vabal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint16x8_t __ret;
-  __ret = vabal_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint16x8_t vabal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
-  uint16x8_t __ret;
-  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabal_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint64x2_t vabal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint64x2_t __ret;
-  __ret = vabal_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint64x2_t vabal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
-  uint64x2_t __ret;
-  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vabal_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) uint32x4_t vabal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint32x4_t __ret;
-  __ret = vabal_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) uint32x4_t vabal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
-  uint32x4_t __ret;
-  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabal_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int16x8_t vabal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int16x8_t __ret;
-  __ret = vabal_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int16x8_t vabal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
-  int16x8_t __ret;
-  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabal_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int64x2_t vabal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  __ret = vabal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int64x2_t vabal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
-  int64x2_t __ret;
-  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
-  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
-  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
-  __ret = __noswap_vabal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
-  return __ret;
-}
-#endif
-
-#ifdef __LITTLE_ENDIAN__
-__ai __attribute__((target("neon"))) int32x4_t vabal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  __ret = vabal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
-  return __ret;
-}
-#else
-__ai __attribute__((target("neon"))) int32x4_t vabal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
-  int32x4_t __ret;
-  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
-  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
-  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
-  __ret = __noswap_vabal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
-  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
-  return __ret;
-}
-#endif
-
-#endif
-
-#undef __ai
-
-#endif /* if !defined(__ARM_NEON) */
-#endif /* ifndef __ARM_FP */
diff --git a/third_party/aarch64/clang/arm_neon_sve_bridge.h b/third_party/aarch64/clang/arm_neon_sve_bridge.h
deleted file mode 100644
index a9fbdbaf4..000000000
--- a/third_party/aarch64/clang/arm_neon_sve_bridge.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_NEON_SVE_BRIDGE_H
-#define __ARM_NEON_SVE_BRIDGE_H
-
-#include <arm_neon.h>
-#include <arm_sve.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Function attributes */
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-#define __aio                                                                  \
-  static __inline__                                                            \
-      __attribute__((__always_inline__, __nodebug__, __overloadable__))
-
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
-svint8_t svset_neonq(svint8_t, int8x16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
-svint16_t svset_neonq(svint16_t, int16x8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
-svint32_t svset_neonq(svint32_t, int32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
-svint64_t svset_neonq(svint64_t, int64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
-svuint8_t svset_neonq(svuint8_t, uint8x16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
-svuint16_t svset_neonq(svuint16_t, uint16x8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
-svuint32_t svset_neonq(svuint32_t, uint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
-svuint64_t svset_neonq(svuint64_t, uint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
-svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
-svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
-svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
-svint8_t svset_neonq_s8(svint8_t, int8x16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
-svint16_t svset_neonq_s16(svint16_t, int16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
-svint32_t svset_neonq_s32(svint32_t, int32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
-svint64_t svset_neonq_s64(svint64_t, int64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
-svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
-svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
-svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
-svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
-svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
-svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
-svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);
-
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
-int8x16_t svget_neonq(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
-int16x8_t svget_neonq(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
-int32x4_t svget_neonq(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
-int64x2_t svget_neonq(svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
-uint8x16_t svget_neonq(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
-uint16x8_t svget_neonq(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
-uint32x4_t svget_neonq(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
-uint64x2_t svget_neonq(svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
-float16x8_t svget_neonq(svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
-float32x4_t svget_neonq(svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
-float64x2_t svget_neonq(svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
-int8x16_t svget_neonq_s8(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
-int16x8_t svget_neonq_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
-int32x4_t svget_neonq_s32(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
-int64x2_t svget_neonq_s64(svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
-uint8x16_t svget_neonq_u8(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
-uint16x8_t svget_neonq_u16(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
-uint32x4_t svget_neonq_u32(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
-uint64x2_t svget_neonq_u64(svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
-float16x8_t svget_neonq_f16(svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
-float32x4_t svget_neonq_f32(svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
-float64x2_t svget_neonq_f64(svfloat64_t);
-
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
-svint8_t svdup_neonq(int8x16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
-svint16_t svdup_neonq(int16x8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
-svint32_t svdup_neonq(int32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
-svint64_t svdup_neonq(int64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
-svuint8_t svdup_neonq(uint8x16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
-svuint16_t svdup_neonq(uint16x8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
-svuint32_t svdup_neonq(uint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
-svuint64_t svdup_neonq(uint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
-svfloat16_t svdup_neonq(float16x8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
-svfloat32_t svdup_neonq(float32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
-svfloat64_t svdup_neonq(float64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
-svint8_t svdup_neonq_s8(int8x16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
-svint16_t svdup_neonq_s16(int16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
-svint32_t svdup_neonq_s32(int32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
-svint64_t svdup_neonq_s64(int64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
-svuint8_t svdup_neonq_u8(uint8x16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
-svuint16_t svdup_neonq_u16(uint16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
-svuint32_t svdup_neonq_u32(uint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
-svuint64_t svdup_neonq_u64(uint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
-svfloat16_t svdup_neonq_f16(float16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
-svfloat32_t svdup_neonq_f32(float32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
-svfloat64_t svdup_neonq_f64(float64x2_t);
-
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
-svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
-svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
-bfloat16x8_t svget_neonq(svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
-bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
-svbfloat16_t svdup_neonq(bfloat16x8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
-svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
-
-#undef __ai
-#undef __aio
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif //__ARM_NEON_SVE_BRIDGE_H
diff --git a/third_party/aarch64/clang/arm_sme.h b/third_party/aarch64/clang/arm_sme.h
deleted file mode 100644
index cbfea38fe..000000000
--- a/third_party/aarch64/clang/arm_sme.h
+++ /dev/null
@@ -1,2819 +0,0 @@
-/*===---- arm_sme.h - ARM SME intrinsics ------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_SME_H
-#define __ARM_SME_H
-
-#if !defined(__LITTLE_ENDIAN__)
-#error "Big endian is currently not supported for arm_sme.h"
-#endif
-#include <arm_sve.h>
-
-#include <stddef.h>
-
-/* Function attributes */
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-
-#define __aio static __inline__ __attribute__((__always_inline__, __nodebug__, __overloadable__))
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-void __arm_za_disable(void) __arm_streaming_compatible;
-
-__ai bool __arm_has_sme(void) __arm_streaming_compatible {
-  uint64_t x0, x1;
-  __builtin_arm_get_sme_state(&x0, &x1);
-  return x0 & (1ULL << 63);
-}
-
-__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible {
-  uint64_t x0, x1;
-  __builtin_arm_get_sme_state(&x0, &x1);
-  return x0 & 1;
-}
-
-void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;
-void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;
-void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;
-void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible;
-
-__ai __attribute__((target("sme"))) void svundef_za(void) __arm_streaming_compatible __arm_out("za") { }
-
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m)))
-void svaddha_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m)))
-void svaddha_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m)))
-void svaddva_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m)))
-void svaddva_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsb)))
-uint64_t svcntsb(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsd)))
-uint64_t svcntsd(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsh)))
-uint64_t svcntsh(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsw)))
-uint64_t svcntsw(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za128)))
-void svld1_hor_vnum_za128(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za16)))
-void svld1_hor_vnum_za16(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za32)))
-void svld1_hor_vnum_za32(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za64)))
-void svld1_hor_vnum_za64(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za8)))
-void svld1_hor_vnum_za8(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za128)))
-void svld1_hor_za128(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za16)))
-void svld1_hor_za16(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za32)))
-void svld1_hor_za32(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za64)))
-void svld1_hor_za64(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za8)))
-void svld1_hor_za8(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za128)))
-void svld1_ver_vnum_za128(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za16)))
-void svld1_ver_vnum_za16(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za32)))
-void svld1_ver_vnum_za32(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za64)))
-void svld1_ver_vnum_za64(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za8)))
-void svld1_ver_vnum_za8(uint64_t, uint32_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za128)))
-void svld1_ver_za128(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za16)))
-void svld1_ver_za16(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za32)))
-void svld1_ver_za32(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za64)))
-void svld1_ver_za64(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za8)))
-void svld1_ver_za8(uint64_t, uint32_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svldr_vnum_za)))
-void svldr_vnum_za(uint32_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svldr_za)))
-void svldr_za(uint32_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m)))
-void svmopa_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m)))
-void svmopa_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m)))
-void svmopa_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m)))
-void svmopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m)))
-void svmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m)))
-void svmops_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m)))
-void svmops_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m)))
-void svmops_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m)))
-void svmops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m)))
-void svmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m)))
-svuint8_t svread_hor_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m)))
-svuint32_t svread_hor_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m)))
-svuint64_t svread_hor_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m)))
-svuint16_t svread_hor_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m)))
-svbfloat16_t svread_hor_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m)))
-svint8_t svread_hor_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m)))
-svfloat64_t svread_hor_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m)))
-svfloat32_t svread_hor_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m)))
-svfloat16_t svread_hor_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m)))
-svint32_t svread_hor_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m)))
-svint64_t svread_hor_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m)))
-svint16_t svread_hor_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m)))
-svuint16_t svread_hor_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m)))
-svbfloat16_t svread_hor_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m)))
-svfloat16_t svread_hor_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m)))
-svint16_t svread_hor_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m)))
-svuint32_t svread_hor_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m)))
-svfloat32_t svread_hor_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m)))
-svint32_t svread_hor_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m)))
-svuint64_t svread_hor_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m)))
-svfloat64_t svread_hor_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m)))
-svint64_t svread_hor_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m)))
-svuint8_t svread_hor_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m)))
-svint8_t svread_hor_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m)))
-svuint8_t svread_ver_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m)))
-svuint32_t svread_ver_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m)))
-svuint64_t svread_ver_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m)))
-svuint16_t svread_ver_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m)))
-svbfloat16_t svread_ver_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m)))
-svint8_t svread_ver_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m)))
-svfloat64_t svread_ver_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m)))
-svfloat32_t svread_ver_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m)))
-svfloat16_t svread_ver_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m)))
-svint32_t svread_ver_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m)))
-svint64_t svread_ver_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m)))
-svint16_t svread_ver_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m)))
-svuint16_t svread_ver_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m)))
-svbfloat16_t svread_ver_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m)))
-svfloat16_t svread_ver_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m)))
-svint16_t svread_ver_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m)))
-svuint32_t svread_ver_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m)))
-svfloat32_t svread_ver_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m)))
-svint32_t svread_ver_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m)))
-svuint64_t svread_ver_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m)))
-svfloat64_t svread_ver_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m)))
-svint64_t svread_ver_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m)))
-svuint8_t svread_ver_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m)))
-svint8_t svread_ver_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za128)))
-void svst1_hor_vnum_za128(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za16)))
-void svst1_hor_vnum_za16(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za32)))
-void svst1_hor_vnum_za32(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za64)))
-void svst1_hor_vnum_za64(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za8)))
-void svst1_hor_vnum_za8(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za128)))
-void svst1_hor_za128(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za16)))
-void svst1_hor_za16(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za32)))
-void svst1_hor_za32(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za64)))
-void svst1_hor_za64(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za8)))
-void svst1_hor_za8(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za128)))
-void svst1_ver_vnum_za128(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za16)))
-void svst1_ver_vnum_za16(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za32)))
-void svst1_ver_vnum_za32(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za64)))
-void svst1_ver_vnum_za64(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za8)))
-void svst1_ver_vnum_za8(uint64_t, uint32_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za128)))
-void svst1_ver_za128(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za16)))
-void svst1_ver_za16(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za32)))
-void svst1_ver_za32(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za64)))
-void svst1_ver_za64(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za8)))
-void svst1_ver_za8(uint64_t, uint32_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svstr_vnum_za)))
-void svstr_vnum_za(uint32_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svstr_za)))
-void svstr_za(uint32_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m)))
-void svsumopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m)))
-void svsumops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m)))
-void svusmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m)))
-void svusmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m)))
-void svwrite_hor_za128_u8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m)))
-void svwrite_hor_za128_u32_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m)))
-void svwrite_hor_za128_u64_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m)))
-void svwrite_hor_za128_u16_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m)))
-void svwrite_hor_za128_bf16_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m)))
-void svwrite_hor_za128_s8_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m)))
-void svwrite_hor_za128_f64_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m)))
-void svwrite_hor_za128_f32_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m)))
-void svwrite_hor_za128_f16_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m)))
-void svwrite_hor_za128_s32_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m)))
-void svwrite_hor_za128_s64_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m)))
-void svwrite_hor_za128_s16_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m)))
-void svwrite_hor_za16_u16_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m)))
-void svwrite_hor_za16_bf16_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m)))
-void svwrite_hor_za16_f16_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m)))
-void svwrite_hor_za16_s16_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m)))
-void svwrite_hor_za32_u32_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m)))
-void svwrite_hor_za32_f32_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m)))
-void svwrite_hor_za32_s32_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m)))
-void svwrite_hor_za64_u64_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m)))
-void svwrite_hor_za64_f64_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m)))
-void svwrite_hor_za64_s64_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m)))
-void svwrite_hor_za8_u8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m)))
-void svwrite_hor_za8_s8_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m)))
-void svwrite_ver_za128_u8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m)))
-void svwrite_ver_za128_u32_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m)))
-void svwrite_ver_za128_u64_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m)))
-void svwrite_ver_za128_u16_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m)))
-void svwrite_ver_za128_bf16_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m)))
-void svwrite_ver_za128_s8_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m)))
-void svwrite_ver_za128_f64_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m)))
-void svwrite_ver_za128_f32_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m)))
-void svwrite_ver_za128_f16_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m)))
-void svwrite_ver_za128_s32_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m)))
-void svwrite_ver_za128_s64_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m)))
-void svwrite_ver_za128_s16_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m)))
-void svwrite_ver_za16_u16_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m)))
-void svwrite_ver_za16_bf16_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m)))
-void svwrite_ver_za16_f16_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m)))
-void svwrite_ver_za16_s16_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m)))
-void svwrite_ver_za32_u32_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m)))
-void svwrite_ver_za32_f32_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m)))
-void svwrite_ver_za32_s32_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m)))
-void svwrite_ver_za64_u64_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m)))
-void svwrite_ver_za64_f64_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m)))
-void svwrite_ver_za64_s64_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m)))
-void svwrite_ver_za8_u8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m)))
-void svwrite_ver_za8_s8_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_mask_za)))
-void svzero_mask_za(uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za)))
-void svzero_za(void);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m)))
-void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m)))
-void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m)))
-void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m)))
-void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m)))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m)))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m)))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m)))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m)))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m)))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m)))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m)))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m)))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m)))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m)))
-svuint8_t svread_hor_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m)))
-svuint32_t svread_hor_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m)))
-svuint64_t svread_hor_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m)))
-svuint16_t svread_hor_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m)))
-svbfloat16_t svread_hor_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m)))
-svint8_t svread_hor_za128_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m)))
-svfloat64_t svread_hor_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m)))
-svfloat32_t svread_hor_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m)))
-svfloat16_t svread_hor_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m)))
-svint32_t svread_hor_za128_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m)))
-svint64_t svread_hor_za128_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m)))
-svint16_t svread_hor_za128_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m)))
-svuint16_t svread_hor_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m)))
-svbfloat16_t svread_hor_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m)))
-svfloat16_t svread_hor_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m)))
-svint16_t svread_hor_za16_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m)))
-svuint32_t svread_hor_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m)))
-svfloat32_t svread_hor_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m)))
-svint32_t svread_hor_za32_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m)))
-svuint64_t svread_hor_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m)))
-svfloat64_t svread_hor_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m)))
-svint64_t svread_hor_za64_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m)))
-svuint8_t svread_hor_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m)))
-svint8_t svread_hor_za8_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m)))
-svuint8_t svread_ver_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m)))
-svuint32_t svread_ver_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m)))
-svuint64_t svread_ver_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m)))
-svuint16_t svread_ver_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m)))
-svbfloat16_t svread_ver_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m)))
-svint8_t svread_ver_za128_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m)))
-svfloat64_t svread_ver_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m)))
-svfloat32_t svread_ver_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m)))
-svfloat16_t svread_ver_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m)))
-svint32_t svread_ver_za128_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m)))
-svint64_t svread_ver_za128_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m)))
-svint16_t svread_ver_za128_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m)))
-svuint16_t svread_ver_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m)))
-svbfloat16_t svread_ver_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m)))
-svfloat16_t svread_ver_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m)))
-svint16_t svread_ver_za16_m(svint16_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m)))
-svuint32_t svread_ver_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m)))
-svfloat32_t svread_ver_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m)))
-svint32_t svread_ver_za32_m(svint32_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m)))
-svuint64_t svread_ver_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m)))
-svfloat64_t svread_ver_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m)))
-svint64_t svread_ver_za64_m(svint64_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m)))
-svuint8_t svread_ver_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m)))
-svint8_t svread_ver_za8_m(svint8_t, svbool_t, uint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m)))
-void svsumopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m)))
-void svsumops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m)))
-void svusmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m)))
-void svusmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m)))
-void svwrite_hor_za128_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m)))
-void svwrite_hor_za16_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m)))
-void svwrite_hor_za16_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m)))
-void svwrite_hor_za16_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m)))
-void svwrite_hor_za16_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m)))
-void svwrite_hor_za32_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m)))
-void svwrite_hor_za32_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m)))
-void svwrite_hor_za32_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m)))
-void svwrite_hor_za64_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m)))
-void svwrite_hor_za64_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m)))
-void svwrite_hor_za64_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m)))
-void svwrite_hor_za8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m)))
-void svwrite_hor_za8_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m)))
-void svwrite_ver_za128_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m)))
-void svwrite_ver_za16_m(uint64_t, uint32_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m)))
-void svwrite_ver_za16_m(uint64_t, uint32_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m)))
-void svwrite_ver_za16_m(uint64_t, uint32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m)))
-void svwrite_ver_za16_m(uint64_t, uint32_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m)))
-void svwrite_ver_za32_m(uint64_t, uint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m)))
-void svwrite_ver_za32_m(uint64_t, uint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m)))
-void svwrite_ver_za32_m(uint64_t, uint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m)))
-void svwrite_ver_za64_m(uint64_t, uint32_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m)))
-void svwrite_ver_za64_m(uint64_t, uint32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m)))
-void svwrite_ver_za64_m(uint64_t, uint32_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m)))
-void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m)))
-void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x2)))
-void svmla_single_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x4)))
-void svmla_single_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x2)))
-void svmla_lane_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x4)))
-void svmla_lane_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x2)))
-void svmla_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x4)))
-void svmla_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x2)))
-void svmls_single_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x4)))
-void svmls_single_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x2)))
-void svmls_lane_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x4)))
-void svmls_lane_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x2)))
-void svmls_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x4)))
-void svmls_za16_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_f16_m)))
-void svmopa_za16_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_f16_m)))
-void svmops_za16_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x2)))
-void svmla_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x4)))
-void svmla_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x2)))
-void svmla_lane_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_f16_vg1x4)))
-void svmla_lane_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x2)))
-void svmla_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_f16_vg1x4)))
-void svmla_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x2)))
-void svmls_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_f16_vg1x4)))
-void svmls_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x2)))
-void svmls_lane_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_f16_vg1x4)))
-void svmls_lane_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x2)))
-void svmls_za16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_f16_vg1x4)))
-void svmls_za16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_f16_m)))
-void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_f16_m)))
-void svmops_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
-void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
-void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
-void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
-void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
-void svadd_za16_vg1x2(uint32_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
-void svadd_za16_vg1x4(uint32_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
-void svsub_za16_vg1x2(uint32_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
-void svsub_za16_vg1x4(uint32_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m)))
-void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m)))
-void svmops_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m)))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m)))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m)))
-void svaddha_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m)))
-void svaddha_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m)))
-void svaddva_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m)))
-void svaddva_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m)))
-void svmopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m)))
-void svmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m)))
-void svmops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m)))
-void svmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m)))
-void svsumopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m)))
-void svsumops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m)))
-void svusmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m)))
-void svusmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m)))
-void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m)))
-void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m)))
-void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m)))
-void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m)))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m)))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m)))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m)))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m)))
-void svsumopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m)))
-void svsumops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m)))
-void svusmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m)))
-void svusmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_u32_vg1x2)))
-void svadd_write_single_za32_u32_vg1x2(uint32_t, svuint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_s32_vg1x2)))
-void svadd_write_single_za32_s32_vg1x2(uint32_t, svint32x2_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_u32_vg1x4)))
-void svadd_write_single_za32_u32_vg1x4(uint32_t, svuint32x4_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_s32_vg1x4)))
-void svadd_write_single_za32_s32_vg1x4(uint32_t, svint32x4_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_u32_vg1x2)))
-void svadd_write_za32_u32_vg1x2(uint32_t, svuint32x2_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_s32_vg1x2)))
-void svadd_write_za32_s32_vg1x2(uint32_t, svint32x2_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_u32_vg1x4)))
-void svadd_write_za32_u32_vg1x4(uint32_t, svuint32x4_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_s32_vg1x4)))
-void svadd_write_za32_s32_vg1x4(uint32_t, svint32x4_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_u32_vg1x2)))
-void svadd_za32_u32_vg1x2(uint32_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_f32_vg1x2)))
-void svadd_za32_f32_vg1x2(uint32_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_s32_vg1x2)))
-void svadd_za32_s32_vg1x2(uint32_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_u32_vg1x4)))
-void svadd_za32_u32_vg1x4(uint32_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_f32_vg1x4)))
-void svadd_za32_f32_vg1x4(uint32_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_s32_vg1x4)))
-void svadd_za32_s32_vg1x4(uint32_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmopa_za32_u32_m)))
-void svbmopa_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmopa_za32_s32_m)))
-void svbmopa_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmops_za32_u32_m)))
-void svbmops_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmops_za32_s32_m)))
-void svbmops_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_bf16_vg1x2)))
-void svdot_single_za32_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_f16_vg1x2)))
-void svdot_single_za32_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s8_vg1x2)))
-void svdot_single_za32_s8_vg1x2(uint32_t, svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s16_vg1x2)))
-void svdot_single_za32_s16_vg1x2(uint32_t, svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u8_vg1x2)))
-void svdot_single_za32_u8_vg1x2(uint32_t, svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u16_vg1x2)))
-void svdot_single_za32_u16_vg1x2(uint32_t, svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_bf16_vg1x4)))
-void svdot_single_za32_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_f16_vg1x4)))
-void svdot_single_za32_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s8_vg1x4)))
-void svdot_single_za32_s8_vg1x4(uint32_t, svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s16_vg1x4)))
-void svdot_single_za32_s16_vg1x4(uint32_t, svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u8_vg1x4)))
-void svdot_single_za32_u8_vg1x4(uint32_t, svuint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u16_vg1x4)))
-void svdot_single_za32_u16_vg1x4(uint32_t, svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_bf16_vg1x2)))
-void svdot_lane_za32_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_f16_vg1x2)))
-void svdot_lane_za32_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s8_vg1x2)))
-void svdot_lane_za32_s8_vg1x2(uint32_t, svint8x2_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s16_vg1x2)))
-void svdot_lane_za32_s16_vg1x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u8_vg1x2)))
-void svdot_lane_za32_u8_vg1x2(uint32_t, svuint8x2_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u16_vg1x2)))
-void svdot_lane_za32_u16_vg1x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_bf16_vg1x4)))
-void svdot_lane_za32_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_f16_vg1x4)))
-void svdot_lane_za32_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s8_vg1x4)))
-void svdot_lane_za32_s8_vg1x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s16_vg1x4)))
-void svdot_lane_za32_s16_vg1x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u8_vg1x4)))
-void svdot_lane_za32_u8_vg1x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u16_vg1x4)))
-void svdot_lane_za32_u16_vg1x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_bf16_vg1x2)))
-void svdot_za32_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_f16_vg1x2)))
-void svdot_za32_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s8_vg1x2)))
-void svdot_za32_s8_vg1x2(uint32_t, svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s16_vg1x2)))
-void svdot_za32_s16_vg1x2(uint32_t, svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u8_vg1x2)))
-void svdot_za32_u8_vg1x2(uint32_t, svuint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u16_vg1x2)))
-void svdot_za32_u16_vg1x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_bf16_vg1x4)))
-void svdot_za32_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_f16_vg1x4)))
-void svdot_za32_f16_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s8_vg1x4)))
-void svdot_za32_s8_vg1x4(uint32_t, svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s16_vg1x4)))
-void svdot_za32_s16_vg1x4(uint32_t, svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u8_vg1x4)))
-void svdot_za32_u8_vg1x4(uint32_t, svuint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u16_vg1x4)))
-void svdot_za32_u16_vg1x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svldr_zt)))
-void svldr_zt(uint64_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u8)))
-svuint8_t svluti2_lane_zt_u8(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u32)))
-svuint32_t svluti2_lane_zt_u32(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u16)))
-svuint16_t svluti2_lane_zt_u16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_bf16)))
-svbfloat16_t svluti2_lane_zt_bf16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s8)))
-svint8_t svluti2_lane_zt_s8(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_f32)))
-svfloat32_t svluti2_lane_zt_f32(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_f16)))
-svfloat16_t svluti2_lane_zt_f16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s32)))
-svint32_t svluti2_lane_zt_s32(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s16)))
-svint16_t svluti2_lane_zt_s16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u8_x2)))
-svuint8x2_t svluti2_lane_zt_u8_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u32_x2)))
-svuint32x2_t svluti2_lane_zt_u32_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u16_x2)))
-svuint16x2_t svluti2_lane_zt_u16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_bf16_x2)))
-svbfloat16x2_t svluti2_lane_zt_bf16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s8_x2)))
-svint8x2_t svluti2_lane_zt_s8_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_f32_x2)))
-svfloat32x2_t svluti2_lane_zt_f32_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_f16_x2)))
-svfloat16x2_t svluti2_lane_zt_f16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s32_x2)))
-svint32x2_t svluti2_lane_zt_s32_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s16_x2)))
-svint16x2_t svluti2_lane_zt_s16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u8_x4)))
-svuint8x4_t svluti2_lane_zt_u8_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u32_x4)))
-svuint32x4_t svluti2_lane_zt_u32_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_u16_x4)))
-svuint16x4_t svluti2_lane_zt_u16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_bf16_x4)))
-svbfloat16x4_t svluti2_lane_zt_bf16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s8_x4)))
-svint8x4_t svluti2_lane_zt_s8_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_f32_x4)))
-svfloat32x4_t svluti2_lane_zt_f32_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_f16_x4)))
-svfloat16x4_t svluti2_lane_zt_f16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s32_x4)))
-svint32x4_t svluti2_lane_zt_s32_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti2_lane_zt_s16_x4)))
-svint16x4_t svluti2_lane_zt_s16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u8)))
-svuint8_t svluti4_lane_zt_u8(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u32)))
-svuint32_t svluti4_lane_zt_u32(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u16)))
-svuint16_t svluti4_lane_zt_u16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_bf16)))
-svbfloat16_t svluti4_lane_zt_bf16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s8)))
-svint8_t svluti4_lane_zt_s8(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_f32)))
-svfloat32_t svluti4_lane_zt_f32(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_f16)))
-svfloat16_t svluti4_lane_zt_f16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s32)))
-svint32_t svluti4_lane_zt_s32(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s16)))
-svint16_t svluti4_lane_zt_s16(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u8_x2)))
-svuint8x2_t svluti4_lane_zt_u8_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u32_x2)))
-svuint32x2_t svluti4_lane_zt_u32_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u16_x2)))
-svuint16x2_t svluti4_lane_zt_u16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_bf16_x2)))
-svbfloat16x2_t svluti4_lane_zt_bf16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s8_x2)))
-svint8x2_t svluti4_lane_zt_s8_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_f32_x2)))
-svfloat32x2_t svluti4_lane_zt_f32_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_f16_x2)))
-svfloat16x2_t svluti4_lane_zt_f16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s32_x2)))
-svint32x2_t svluti4_lane_zt_s32_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s16_x2)))
-svint16x2_t svluti4_lane_zt_s16_x2(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u32_x4)))
-svuint32x4_t svluti4_lane_zt_u32_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_u16_x4)))
-svuint16x4_t svluti4_lane_zt_u16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_bf16_x4)))
-svbfloat16x4_t svluti4_lane_zt_bf16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_f32_x4)))
-svfloat32x4_t svluti4_lane_zt_f32_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_f16_x4)))
-svfloat16x4_t svluti4_lane_zt_f16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s32_x4)))
-svint32x4_t svluti4_lane_zt_s32_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_lane_zt_s16_x4)))
-svint16x4_t svluti4_lane_zt_s16_x4(uint64_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f32_vg1x2)))
-void svmla_single_za32_f32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f32_vg1x4)))
-void svmla_single_za32_f32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_bf16_vg2x2)))
-void svmla_single_za32_bf16_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f16_vg2x2)))
-void svmla_single_za32_f16_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s16_vg2x2)))
-void svmla_single_za32_s16_vg2x2(uint32_t, svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u16_vg2x2)))
-void svmla_single_za32_u16_vg2x2(uint32_t, svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_bf16_vg2x4)))
-void svmla_single_za32_bf16_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f16_vg2x4)))
-void svmla_single_za32_f16_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s16_vg2x4)))
-void svmla_single_za32_s16_vg2x4(uint32_t, svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u16_vg2x4)))
-void svmla_single_za32_u16_vg2x4(uint32_t, svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s8_vg4x2)))
-void svmla_single_za32_s8_vg4x2(uint32_t, svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u8_vg4x2)))
-void svmla_single_za32_u8_vg4x2(uint32_t, svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s8_vg4x4)))
-void svmla_single_za32_s8_vg4x4(uint32_t, svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u8_vg4x4)))
-void svmla_single_za32_u8_vg4x4(uint32_t, svuint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f32_vg1x2)))
-void svmla_lane_za32_f32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f32_vg1x4)))
-void svmla_lane_za32_f32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_bf16_vg2x1)))
-void svmla_lane_za32_bf16_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f16_vg2x1)))
-void svmla_lane_za32_f16_vg2x1(uint32_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s16_vg2x1)))
-void svmla_lane_za32_s16_vg2x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u16_vg2x1)))
-void svmla_lane_za32_u16_vg2x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_bf16_vg2x2)))
-void svmla_lane_za32_bf16_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f16_vg2x2)))
-void svmla_lane_za32_f16_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s16_vg2x2)))
-void svmla_lane_za32_s16_vg2x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u16_vg2x2)))
-void svmla_lane_za32_u16_vg2x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_bf16_vg2x4)))
-void svmla_lane_za32_bf16_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f16_vg2x4)))
-void svmla_lane_za32_f16_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s16_vg2x4)))
-void svmla_lane_za32_s16_vg2x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u16_vg2x4)))
-void svmla_lane_za32_u16_vg2x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s8_vg4x1)))
-void svmla_lane_za32_s8_vg4x1(uint32_t, svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u8_vg4x1)))
-void svmla_lane_za32_u8_vg4x1(uint32_t, svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s8_vg4x2)))
-void svmla_lane_za32_s8_vg4x2(uint32_t, svint8x2_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u8_vg4x2)))
-void svmla_lane_za32_u8_vg4x2(uint32_t, svuint8x2_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s8_vg4x4)))
-void svmla_lane_za32_s8_vg4x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u8_vg4x4)))
-void svmla_lane_za32_u8_vg4x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f32_vg1x2)))
-void svmla_za32_f32_vg1x2(uint32_t, svfloat32x2_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f32_vg1x4)))
-void svmla_za32_f32_vg1x4(uint32_t, svfloat32x4_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_bf16_vg2x1)))
-void svmla_za32_bf16_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f16_vg2x1)))
-void svmla_za32_f16_vg2x1(uint32_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s16_vg2x1)))
-void svmla_za32_s16_vg2x1(uint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u16_vg2x1)))
-void svmla_za32_u16_vg2x1(uint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_bf16_vg2x2)))
-void svmla_za32_bf16_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f16_vg2x2)))
-void svmla_za32_f16_vg2x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s16_vg2x2)))
-void svmla_za32_s16_vg2x2(uint32_t, svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u16_vg2x2)))
-void svmla_za32_u16_vg2x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_bf16_vg2x4)))
-void svmla_za32_bf16_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f16_vg2x4)))
-void svmla_za32_f16_vg2x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s16_vg2x4)))
-void svmla_za32_s16_vg2x4(uint32_t, svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u16_vg2x4)))
-void svmla_za32_u16_vg2x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s8_vg4x1)))
-void svmla_za32_s8_vg4x1(uint32_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u8_vg4x1)))
-void svmla_za32_u8_vg4x1(uint32_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s8_vg4x2)))
-void svmla_za32_s8_vg4x2(uint32_t, svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u8_vg4x2)))
-void svmla_za32_u8_vg4x2(uint32_t, svuint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s8_vg4x4)))
-void svmla_za32_s8_vg4x4(uint32_t, svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u8_vg4x4)))
-void svmla_za32_u8_vg4x4(uint32_t, svuint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f32_vg1x2)))
-void svmls_single_za32_f32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f32_vg1x4)))
-void svmls_single_za32_f32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_bf16_vg2x2)))
-void svmls_single_za32_bf16_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f16_vg2x2)))
-void svmls_single_za32_f16_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s16_vg2x2)))
-void svmls_single_za32_s16_vg2x2(uint32_t, svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u16_vg2x2)))
-void svmls_single_za32_u16_vg2x2(uint32_t, svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_bf16_vg2x4)))
-void svmls_single_za32_bf16_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f16_vg2x4)))
-void svmls_single_za32_f16_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s16_vg2x4)))
-void svmls_single_za32_s16_vg2x4(uint32_t, svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u16_vg2x4)))
-void svmls_single_za32_u16_vg2x4(uint32_t, svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s8_vg4x2)))
-void svmls_single_za32_s8_vg4x2(uint32_t, svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u8_vg4x2)))
-void svmls_single_za32_u8_vg4x2(uint32_t, svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s8_vg4x4)))
-void svmls_single_za32_s8_vg4x4(uint32_t, svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u8_vg4x4)))
-void svmls_single_za32_u8_vg4x4(uint32_t, svuint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f32_vg1x2)))
-void svmls_lane_za32_f32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f32_vg1x4)))
-void svmls_lane_za32_f32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_bf16_vg2x1)))
-void svmls_lane_za32_bf16_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f16_vg2x1)))
-void svmls_lane_za32_f16_vg2x1(uint32_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s16_vg2x1)))
-void svmls_lane_za32_s16_vg2x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u16_vg2x1)))
-void svmls_lane_za32_u16_vg2x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_bf16_vg2x2)))
-void svmls_lane_za32_bf16_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f16_vg2x2)))
-void svmls_lane_za32_f16_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s16_vg2x2)))
-void svmls_lane_za32_s16_vg2x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u16_vg2x2)))
-void svmls_lane_za32_u16_vg2x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_bf16_vg2x4)))
-void svmls_lane_za32_bf16_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f16_vg2x4)))
-void svmls_lane_za32_f16_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s16_vg2x4)))
-void svmls_lane_za32_s16_vg2x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u16_vg2x4)))
-void svmls_lane_za32_u16_vg2x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s8_vg4x1)))
-void svmls_lane_za32_s8_vg4x1(uint32_t, svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u8_vg4x1)))
-void svmls_lane_za32_u8_vg4x1(uint32_t, svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s8_vg4x2)))
-void svmls_lane_za32_s8_vg4x2(uint32_t, svint8x2_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u8_vg4x2)))
-void svmls_lane_za32_u8_vg4x2(uint32_t, svuint8x2_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s8_vg4x4)))
-void svmls_lane_za32_s8_vg4x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u8_vg4x4)))
-void svmls_lane_za32_u8_vg4x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f32_vg1x2)))
-void svmls_za32_f32_vg1x2(uint32_t, svfloat32x2_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f32_vg1x4)))
-void svmls_za32_f32_vg1x4(uint32_t, svfloat32x4_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_bf16_vg2x1)))
-void svmls_za32_bf16_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f16_vg2x1)))
-void svmls_za32_f16_vg2x1(uint32_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s16_vg2x1)))
-void svmls_za32_s16_vg2x1(uint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u16_vg2x1)))
-void svmls_za32_u16_vg2x1(uint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_bf16_vg2x2)))
-void svmls_za32_bf16_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f16_vg2x2)))
-void svmls_za32_f16_vg2x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s16_vg2x2)))
-void svmls_za32_s16_vg2x2(uint32_t, svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u16_vg2x2)))
-void svmls_za32_u16_vg2x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_bf16_vg2x4)))
-void svmls_za32_bf16_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f16_vg2x4)))
-void svmls_za32_f16_vg2x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s16_vg2x4)))
-void svmls_za32_s16_vg2x4(uint32_t, svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u16_vg2x4)))
-void svmls_za32_u16_vg2x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s8_vg4x1)))
-void svmls_za32_s8_vg4x1(uint32_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u8_vg4x1)))
-void svmls_za32_u8_vg4x1(uint32_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s8_vg4x2)))
-void svmls_za32_s8_vg4x2(uint32_t, svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u8_vg4x2)))
-void svmls_za32_u8_vg4x2(uint32_t, svuint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s8_vg4x4)))
-void svmls_za32_s8_vg4x4(uint32_t, svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u8_vg4x4)))
-void svmls_za32_u8_vg4x4(uint32_t, svuint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s16_m)))
-void svmopa_za32_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u16_m)))
-void svmopa_za32_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s16_m)))
-void svmops_za32_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u16_m)))
-void svmops_za32_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_vg2)))
-svuint16x2_t svread_hor_za16_u16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_vg2)))
-svbfloat16x2_t svread_hor_za16_bf16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_vg2)))
-svfloat16x2_t svread_hor_za16_f16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_vg2)))
-svint16x2_t svread_hor_za16_s16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_vg4)))
-svuint16x4_t svread_hor_za16_u16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_vg4)))
-svbfloat16x4_t svread_hor_za16_bf16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_vg4)))
-svfloat16x4_t svread_hor_za16_f16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_vg4)))
-svint16x4_t svread_hor_za16_s16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_vg2)))
-svuint32x2_t svread_hor_za32_u32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_vg2)))
-svfloat32x2_t svread_hor_za32_f32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_vg2)))
-svint32x2_t svread_hor_za32_s32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_vg4)))
-svuint32x4_t svread_hor_za32_u32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_vg4)))
-svfloat32x4_t svread_hor_za32_f32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_vg4)))
-svint32x4_t svread_hor_za32_s32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_vg2)))
-svuint64x2_t svread_hor_za64_u64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_vg2)))
-svfloat64x2_t svread_hor_za64_f64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_vg2)))
-svint64x2_t svread_hor_za64_s64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_vg4)))
-svuint64x4_t svread_hor_za64_u64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_vg4)))
-svfloat64x4_t svread_hor_za64_f64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_vg4)))
-svint64x4_t svread_hor_za64_s64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_vg2)))
-svuint8x2_t svread_hor_za8_u8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_vg2)))
-svint8x2_t svread_hor_za8_s8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_vg4)))
-svuint8x4_t svread_hor_za8_u8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_vg4)))
-svint8x4_t svread_hor_za8_s8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_vg2)))
-svuint16x2_t svread_ver_za16_u16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_vg2)))
-svbfloat16x2_t svread_ver_za16_bf16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_vg2)))
-svfloat16x2_t svread_ver_za16_f16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_vg2)))
-svint16x2_t svread_ver_za16_s16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_vg4)))
-svuint16x4_t svread_ver_za16_u16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_vg4)))
-svbfloat16x4_t svread_ver_za16_bf16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_vg4)))
-svfloat16x4_t svread_ver_za16_f16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_vg4)))
-svint16x4_t svread_ver_za16_s16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_vg2)))
-svuint32x2_t svread_ver_za32_u32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_vg2)))
-svfloat32x2_t svread_ver_za32_f32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_vg2)))
-svint32x2_t svread_ver_za32_s32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_vg4)))
-svuint32x4_t svread_ver_za32_u32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_vg4)))
-svfloat32x4_t svread_ver_za32_f32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_vg4)))
-svint32x4_t svread_ver_za32_s32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_vg2)))
-svuint64x2_t svread_ver_za64_u64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_vg2)))
-svfloat64x2_t svread_ver_za64_f64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_vg2)))
-svint64x2_t svread_ver_za64_s64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_vg4)))
-svuint64x4_t svread_ver_za64_u64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_vg4)))
-svfloat64x4_t svread_ver_za64_f64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_vg4)))
-svint64x4_t svread_ver_za64_s64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_vg2)))
-svuint8x2_t svread_ver_za8_u8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_vg2)))
-svint8x2_t svread_ver_za8_s8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_vg4)))
-svuint8x4_t svread_ver_za8_u8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_vg4)))
-svint8x4_t svread_ver_za8_s8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_u16_vg1x2)))
-svuint16x2_t svread_za16_u16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_bf16_vg1x2)))
-svbfloat16x2_t svread_za16_bf16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_f16_vg1x2)))
-svfloat16x2_t svread_za16_f16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_s16_vg1x2)))
-svint16x2_t svread_za16_s16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_u16_vg1x4)))
-svuint16x4_t svread_za16_u16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_bf16_vg1x4)))
-svbfloat16x4_t svread_za16_bf16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_f16_vg1x4)))
-svfloat16x4_t svread_za16_f16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za16_s16_vg1x4)))
-svint16x4_t svread_za16_s16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za32_u32_vg1x2)))
-svuint32x2_t svread_za32_u32_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za32_f32_vg1x2)))
-svfloat32x2_t svread_za32_f32_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za32_s32_vg1x2)))
-svint32x2_t svread_za32_s32_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za32_u32_vg1x4)))
-svuint32x4_t svread_za32_u32_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za32_f32_vg1x4)))
-svfloat32x4_t svread_za32_f32_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za32_s32_vg1x4)))
-svint32x4_t svread_za32_s32_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za64_u64_vg1x2)))
-svuint64x2_t svread_za64_u64_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za64_f64_vg1x2)))
-svfloat64x2_t svread_za64_f64_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za64_s64_vg1x2)))
-svint64x2_t svread_za64_s64_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za64_u64_vg1x4)))
-svuint64x4_t svread_za64_u64_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za64_f64_vg1x4)))
-svfloat64x4_t svread_za64_f64_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za64_s64_vg1x4)))
-svint64x4_t svread_za64_s64_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za8_u8_vg1x2)))
-svuint8x2_t svread_za8_u8_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za8_s8_vg1x2)))
-svint8x2_t svread_za8_s8_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za8_u8_vg1x4)))
-svuint8x4_t svread_za8_u8_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_za8_s8_vg1x4)))
-svint8x4_t svread_za8_s8_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svstr_zt)))
-void svstr_zt(uint64_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_u32_vg1x2)))
-void svsub_write_single_za32_u32_vg1x2(uint32_t, svuint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_s32_vg1x2)))
-void svsub_write_single_za32_s32_vg1x2(uint32_t, svint32x2_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_u32_vg1x4)))
-void svsub_write_single_za32_u32_vg1x4(uint32_t, svuint32x4_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_s32_vg1x4)))
-void svsub_write_single_za32_s32_vg1x4(uint32_t, svint32x4_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_u32_vg1x2)))
-void svsub_write_za32_u32_vg1x2(uint32_t, svuint32x2_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_s32_vg1x2)))
-void svsub_write_za32_s32_vg1x2(uint32_t, svint32x2_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_u32_vg1x4)))
-void svsub_write_za32_u32_vg1x4(uint32_t, svuint32x4_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_s32_vg1x4)))
-void svsub_write_za32_s32_vg1x4(uint32_t, svint32x4_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_u32_vg1x2)))
-void svsub_za32_u32_vg1x2(uint32_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_f32_vg1x2)))
-void svsub_za32_f32_vg1x2(uint32_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_s32_vg1x2)))
-void svsub_za32_s32_vg1x2(uint32_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_u32_vg1x4)))
-void svsub_za32_u32_vg1x4(uint32_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_f32_vg1x4)))
-void svsub_za32_f32_vg1x4(uint32_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_s32_vg1x4)))
-void svsub_za32_s32_vg1x4(uint32_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_single_za32_s8_vg1x2)))
-void svsudot_single_za32_s8_vg1x2(uint32_t, svint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_single_za32_s8_vg1x4)))
-void svsudot_single_za32_s8_vg1x4(uint32_t, svint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_lane_za32_s8_vg1x2)))
-void svsudot_lane_za32_s8_vg1x2(uint32_t, svint8x2_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_lane_za32_s8_vg1x4)))
-void svsudot_lane_za32_s8_vg1x4(uint32_t, svint8x4_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_za32_s8_vg1x2)))
-void svsudot_za32_s8_vg1x2(uint32_t, svint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_za32_s8_vg1x4)))
-void svsudot_za32_s8_vg1x4(uint32_t, svint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_single_za32_s8_vg4x2)))
-void svsumla_single_za32_s8_vg4x2(uint32_t, svint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_single_za32_s8_vg4x4)))
-void svsumla_single_za32_s8_vg4x4(uint32_t, svint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_lane_za32_s8_vg4x1)))
-void svsumla_lane_za32_s8_vg4x1(uint32_t, svint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_lane_za32_s8_vg4x2)))
-void svsumla_lane_za32_s8_vg4x2(uint32_t, svint8x2_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_lane_za32_s8_vg4x4)))
-void svsumla_lane_za32_s8_vg4x4(uint32_t, svint8x4_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_za32_s8_vg4x1)))
-void svsumla_za32_s8_vg4x1(uint32_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_za32_s8_vg4x2)))
-void svsumla_za32_s8_vg4x2(uint32_t, svint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_za32_s8_vg4x4)))
-void svsumla_za32_s8_vg4x4(uint32_t, svint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsuvdot_lane_za32_s8_vg1x4)))
-void svsuvdot_lane_za32_s8_vg1x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_single_za32_u8_vg1x2)))
-void svusdot_single_za32_u8_vg1x2(uint32_t, svuint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_single_za32_u8_vg1x4)))
-void svusdot_single_za32_u8_vg1x4(uint32_t, svuint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_lane_za32_u8_vg1x2)))
-void svusdot_lane_za32_u8_vg1x2(uint32_t, svuint8x2_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_lane_za32_u8_vg1x4)))
-void svusdot_lane_za32_u8_vg1x4(uint32_t, svuint8x4_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_za32_u8_vg1x2)))
-void svusdot_za32_u8_vg1x2(uint32_t, svuint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_za32_u8_vg1x4)))
-void svusdot_za32_u8_vg1x4(uint32_t, svuint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_single_za32_u8_vg4x2)))
-void svusmla_single_za32_u8_vg4x2(uint32_t, svuint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_single_za32_u8_vg4x4)))
-void svusmla_single_za32_u8_vg4x4(uint32_t, svuint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_lane_za32_u8_vg4x1)))
-void svusmla_lane_za32_u8_vg4x1(uint32_t, svuint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_lane_za32_u8_vg4x2)))
-void svusmla_lane_za32_u8_vg4x2(uint32_t, svuint8x2_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_lane_za32_u8_vg4x4)))
-void svusmla_lane_za32_u8_vg4x4(uint32_t, svuint8x4_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_za32_u8_vg4x1)))
-void svusmla_za32_u8_vg4x1(uint32_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_za32_u8_vg4x2)))
-void svusmla_za32_u8_vg4x2(uint32_t, svuint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_za32_u8_vg4x4)))
-void svusmla_za32_u8_vg4x4(uint32_t, svuint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusvdot_lane_za32_u8_vg1x4)))
-void svusvdot_lane_za32_u8_vg1x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_bf16_vg1x2)))
-void svvdot_lane_za32_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_f16_vg1x2)))
-void svvdot_lane_za32_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_s16_vg1x2)))
-void svvdot_lane_za32_s16_vg1x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_u16_vg1x2)))
-void svvdot_lane_za32_u16_vg1x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_s8_vg1x4)))
-void svvdot_lane_za32_s8_vg1x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_u8_vg1x4)))
-void svvdot_lane_za32_u8_vg1x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_vg2)))
-void svwrite_hor_za16_u16_vg2(uint64_t, uint32_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_vg2)))
-void svwrite_hor_za16_bf16_vg2(uint64_t, uint32_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_vg2)))
-void svwrite_hor_za16_f16_vg2(uint64_t, uint32_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_vg2)))
-void svwrite_hor_za16_s16_vg2(uint64_t, uint32_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_vg4)))
-void svwrite_hor_za16_u16_vg4(uint64_t, uint32_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_vg4)))
-void svwrite_hor_za16_bf16_vg4(uint64_t, uint32_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_vg4)))
-void svwrite_hor_za16_f16_vg4(uint64_t, uint32_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_vg4)))
-void svwrite_hor_za16_s16_vg4(uint64_t, uint32_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_vg2)))
-void svwrite_hor_za32_u32_vg2(uint64_t, uint32_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_vg2)))
-void svwrite_hor_za32_f32_vg2(uint64_t, uint32_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_vg2)))
-void svwrite_hor_za32_s32_vg2(uint64_t, uint32_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_vg4)))
-void svwrite_hor_za32_u32_vg4(uint64_t, uint32_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_vg4)))
-void svwrite_hor_za32_f32_vg4(uint64_t, uint32_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_vg4)))
-void svwrite_hor_za32_s32_vg4(uint64_t, uint32_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_vg2)))
-void svwrite_hor_za64_u64_vg2(uint64_t, uint32_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_vg2)))
-void svwrite_hor_za64_f64_vg2(uint64_t, uint32_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_vg2)))
-void svwrite_hor_za64_s64_vg2(uint64_t, uint32_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_vg4)))
-void svwrite_hor_za64_u64_vg4(uint64_t, uint32_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_vg4)))
-void svwrite_hor_za64_f64_vg4(uint64_t, uint32_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_vg4)))
-void svwrite_hor_za64_s64_vg4(uint64_t, uint32_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_vg2)))
-void svwrite_hor_za8_u8_vg2(uint64_t, uint32_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_vg2)))
-void svwrite_hor_za8_s8_vg2(uint64_t, uint32_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_vg4)))
-void svwrite_hor_za8_u8_vg4(uint64_t, uint32_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_vg4)))
-void svwrite_hor_za8_s8_vg4(uint64_t, uint32_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_vg2)))
-void svwrite_ver_za16_u16_vg2(uint64_t, uint32_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_vg2)))
-void svwrite_ver_za16_bf16_vg2(uint64_t, uint32_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_vg2)))
-void svwrite_ver_za16_f16_vg2(uint64_t, uint32_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_vg2)))
-void svwrite_ver_za16_s16_vg2(uint64_t, uint32_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_vg4)))
-void svwrite_ver_za16_u16_vg4(uint64_t, uint32_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_vg4)))
-void svwrite_ver_za16_bf16_vg4(uint64_t, uint32_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_vg4)))
-void svwrite_ver_za16_f16_vg4(uint64_t, uint32_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_vg4)))
-void svwrite_ver_za16_s16_vg4(uint64_t, uint32_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_vg2)))
-void svwrite_ver_za32_u32_vg2(uint64_t, uint32_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_vg2)))
-void svwrite_ver_za32_f32_vg2(uint64_t, uint32_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_vg2)))
-void svwrite_ver_za32_s32_vg2(uint64_t, uint32_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_vg4)))
-void svwrite_ver_za32_u32_vg4(uint64_t, uint32_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_vg4)))
-void svwrite_ver_za32_f32_vg4(uint64_t, uint32_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_vg4)))
-void svwrite_ver_za32_s32_vg4(uint64_t, uint32_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_vg2)))
-void svwrite_ver_za64_u64_vg2(uint64_t, uint32_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_vg2)))
-void svwrite_ver_za64_f64_vg2(uint64_t, uint32_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_vg2)))
-void svwrite_ver_za64_s64_vg2(uint64_t, uint32_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_vg4)))
-void svwrite_ver_za64_u64_vg4(uint64_t, uint32_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_vg4)))
-void svwrite_ver_za64_f64_vg4(uint64_t, uint32_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_vg4)))
-void svwrite_ver_za64_s64_vg4(uint64_t, uint32_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_vg2)))
-void svwrite_ver_za8_u8_vg2(uint64_t, uint32_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_vg2)))
-void svwrite_ver_za8_s8_vg2(uint64_t, uint32_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_vg4)))
-void svwrite_ver_za8_u8_vg4(uint64_t, uint32_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_vg4)))
-void svwrite_ver_za8_s8_vg4(uint64_t, uint32_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_u16_vg1x2)))
-void svwrite_za16_u16_vg1x2(uint32_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_bf16_vg1x2)))
-void svwrite_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_f16_vg1x2)))
-void svwrite_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_s16_vg1x2)))
-void svwrite_za16_s16_vg1x2(uint32_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_u16_vg1x4)))
-void svwrite_za16_u16_vg1x4(uint32_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_bf16_vg1x4)))
-void svwrite_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_f16_vg1x4)))
-void svwrite_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_s16_vg1x4)))
-void svwrite_za16_s16_vg1x4(uint32_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_u32_vg1x2)))
-void svwrite_za32_u32_vg1x2(uint32_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_f32_vg1x2)))
-void svwrite_za32_f32_vg1x2(uint32_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_s32_vg1x2)))
-void svwrite_za32_s32_vg1x2(uint32_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_u32_vg1x4)))
-void svwrite_za32_u32_vg1x4(uint32_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_f32_vg1x4)))
-void svwrite_za32_f32_vg1x4(uint32_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_s32_vg1x4)))
-void svwrite_za32_s32_vg1x4(uint32_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_u64_vg1x2)))
-void svwrite_za64_u64_vg1x2(uint32_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_f64_vg1x2)))
-void svwrite_za64_f64_vg1x2(uint32_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_s64_vg1x2)))
-void svwrite_za64_s64_vg1x2(uint32_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_u64_vg1x4)))
-void svwrite_za64_u64_vg1x4(uint32_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_f64_vg1x4)))
-void svwrite_za64_f64_vg1x4(uint32_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_s64_vg1x4)))
-void svwrite_za64_s64_vg1x4(uint32_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x2)))
-void svwrite_za8_u8_vg1x2(uint32_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x2)))
-void svwrite_za8_s8_vg1x2(uint32_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x4)))
-void svwrite_za8_u8_vg1x4(uint32_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x4)))
-void svwrite_za8_s8_vg1x4(uint32_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_zt)))
-void svzero_zt(uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_u32_vg1x2)))
-void svadd_write_za32_vg1x2(uint32_t, svuint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_s32_vg1x2)))
-void svadd_write_za32_vg1x2(uint32_t, svint32x2_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_u32_vg1x4)))
-void svadd_write_za32_vg1x4(uint32_t, svuint32x4_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_s32_vg1x4)))
-void svadd_write_za32_vg1x4(uint32_t, svint32x4_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_u32_vg1x2)))
-void svadd_write_za32_vg1x2(uint32_t, svuint32x2_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_s32_vg1x2)))
-void svadd_write_za32_vg1x2(uint32_t, svint32x2_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_u32_vg1x4)))
-void svadd_write_za32_vg1x4(uint32_t, svuint32x4_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za32_s32_vg1x4)))
-void svadd_write_za32_vg1x4(uint32_t, svint32x4_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_u32_vg1x2)))
-void svadd_za32_vg1x2(uint32_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_f32_vg1x2)))
-void svadd_za32_vg1x2(uint32_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_s32_vg1x2)))
-void svadd_za32_vg1x2(uint32_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_u32_vg1x4)))
-void svadd_za32_vg1x4(uint32_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_f32_vg1x4)))
-void svadd_za32_vg1x4(uint32_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za32_s32_vg1x4)))
-void svadd_za32_vg1x4(uint32_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmopa_za32_u32_m)))
-void svbmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmopa_za32_s32_m)))
-void svbmopa_za32_m(uint64_t, svbool_t, svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmops_za32_u32_m)))
-void svbmops_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svbmops_za32_s32_m)))
-void svbmops_za32_m(uint64_t, svbool_t, svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_bf16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_f16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s8_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u8_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_bf16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_f16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s8_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_s16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u8_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svuint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_u16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_bf16_vg1x2)))
-void svdot_lane_za32_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_f16_vg1x2)))
-void svdot_lane_za32_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s8_vg1x2)))
-void svdot_lane_za32_vg1x2(uint32_t, svint8x2_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s16_vg1x2)))
-void svdot_lane_za32_vg1x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u8_vg1x2)))
-void svdot_lane_za32_vg1x2(uint32_t, svuint8x2_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u16_vg1x2)))
-void svdot_lane_za32_vg1x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_bf16_vg1x4)))
-void svdot_lane_za32_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_f16_vg1x4)))
-void svdot_lane_za32_vg1x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s8_vg1x4)))
-void svdot_lane_za32_vg1x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_s16_vg1x4)))
-void svdot_lane_za32_vg1x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u8_vg1x4)))
-void svdot_lane_za32_vg1x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_u16_vg1x4)))
-void svdot_lane_za32_vg1x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_bf16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_f16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s8_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u8_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svuint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u16_vg1x2)))
-void svdot_za32_vg1x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_bf16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_f16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s8_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_s16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u8_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svuint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_u16_vg1x4)))
-void svdot_za32_vg1x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f32_vg1x2)))
-void svmla_za32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f32_vg1x4)))
-void svmla_za32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_bf16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_bf16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_f16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s8_vg4x2)))
-void svmla_za32_vg4x2(uint32_t, svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u8_vg4x2)))
-void svmla_za32_vg4x2(uint32_t, svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_s8_vg4x4)))
-void svmla_za32_vg4x4(uint32_t, svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_u8_vg4x4)))
-void svmla_za32_vg4x4(uint32_t, svuint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f32_vg1x2)))
-void svmla_lane_za32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f32_vg1x4)))
-void svmla_lane_za32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_bf16_vg2x1)))
-void svmla_lane_za32_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f16_vg2x1)))
-void svmla_lane_za32_vg2x1(uint32_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s16_vg2x1)))
-void svmla_lane_za32_vg2x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u16_vg2x1)))
-void svmla_lane_za32_vg2x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_bf16_vg2x2)))
-void svmla_lane_za32_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f16_vg2x2)))
-void svmla_lane_za32_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s16_vg2x2)))
-void svmla_lane_za32_vg2x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u16_vg2x2)))
-void svmla_lane_za32_vg2x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_bf16_vg2x4)))
-void svmla_lane_za32_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_f16_vg2x4)))
-void svmla_lane_za32_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s16_vg2x4)))
-void svmla_lane_za32_vg2x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u16_vg2x4)))
-void svmla_lane_za32_vg2x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s8_vg4x1)))
-void svmla_lane_za32_vg4x1(uint32_t, svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u8_vg4x1)))
-void svmla_lane_za32_vg4x1(uint32_t, svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s8_vg4x2)))
-void svmla_lane_za32_vg4x2(uint32_t, svint8x2_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u8_vg4x2)))
-void svmla_lane_za32_vg4x2(uint32_t, svuint8x2_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_s8_vg4x4)))
-void svmla_lane_za32_vg4x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_u8_vg4x4)))
-void svmla_lane_za32_vg4x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f32_vg1x2)))
-void svmla_za32_vg1x2(uint32_t, svfloat32x2_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f32_vg1x4)))
-void svmla_za32_vg1x4(uint32_t, svfloat32x4_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_bf16_vg2x1)))
-void svmla_za32_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f16_vg2x1)))
-void svmla_za32_vg2x1(uint32_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s16_vg2x1)))
-void svmla_za32_vg2x1(uint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u16_vg2x1)))
-void svmla_za32_vg2x1(uint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_bf16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u16_vg2x2)))
-void svmla_za32_vg2x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_bf16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_f16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u16_vg2x4)))
-void svmla_za32_vg2x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s8_vg4x1)))
-void svmla_za32_vg4x1(uint32_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u8_vg4x1)))
-void svmla_za32_vg4x1(uint32_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s8_vg4x2)))
-void svmla_za32_vg4x2(uint32_t, svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u8_vg4x2)))
-void svmla_za32_vg4x2(uint32_t, svuint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_s8_vg4x4)))
-void svmla_za32_vg4x4(uint32_t, svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_u8_vg4x4)))
-void svmla_za32_vg4x4(uint32_t, svuint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f32_vg1x2)))
-void svmls_za32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f32_vg1x4)))
-void svmls_za32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_bf16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_bf16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_f16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s8_vg4x2)))
-void svmls_za32_vg4x2(uint32_t, svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u8_vg4x2)))
-void svmls_za32_vg4x2(uint32_t, svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_s8_vg4x4)))
-void svmls_za32_vg4x4(uint32_t, svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za32_u8_vg4x4)))
-void svmls_za32_vg4x4(uint32_t, svuint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f32_vg1x2)))
-void svmls_lane_za32_vg1x2(uint32_t, svfloat32x2_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f32_vg1x4)))
-void svmls_lane_za32_vg1x4(uint32_t, svfloat32x4_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_bf16_vg2x1)))
-void svmls_lane_za32_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f16_vg2x1)))
-void svmls_lane_za32_vg2x1(uint32_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s16_vg2x1)))
-void svmls_lane_za32_vg2x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u16_vg2x1)))
-void svmls_lane_za32_vg2x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_bf16_vg2x2)))
-void svmls_lane_za32_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f16_vg2x2)))
-void svmls_lane_za32_vg2x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s16_vg2x2)))
-void svmls_lane_za32_vg2x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u16_vg2x2)))
-void svmls_lane_za32_vg2x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_bf16_vg2x4)))
-void svmls_lane_za32_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_f16_vg2x4)))
-void svmls_lane_za32_vg2x4(uint32_t, svfloat16x4_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s16_vg2x4)))
-void svmls_lane_za32_vg2x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u16_vg2x4)))
-void svmls_lane_za32_vg2x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s8_vg4x1)))
-void svmls_lane_za32_vg4x1(uint32_t, svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u8_vg4x1)))
-void svmls_lane_za32_vg4x1(uint32_t, svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s8_vg4x2)))
-void svmls_lane_za32_vg4x2(uint32_t, svint8x2_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u8_vg4x2)))
-void svmls_lane_za32_vg4x2(uint32_t, svuint8x2_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_s8_vg4x4)))
-void svmls_lane_za32_vg4x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za32_u8_vg4x4)))
-void svmls_lane_za32_vg4x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f32_vg1x2)))
-void svmls_za32_vg1x2(uint32_t, svfloat32x2_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f32_vg1x4)))
-void svmls_za32_vg1x4(uint32_t, svfloat32x4_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_bf16_vg2x1)))
-void svmls_za32_vg2x1(uint32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f16_vg2x1)))
-void svmls_za32_vg2x1(uint32_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s16_vg2x1)))
-void svmls_za32_vg2x1(uint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u16_vg2x1)))
-void svmls_za32_vg2x1(uint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_bf16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u16_vg2x2)))
-void svmls_za32_vg2x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_bf16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_f16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u16_vg2x4)))
-void svmls_za32_vg2x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s8_vg4x1)))
-void svmls_za32_vg4x1(uint32_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u8_vg4x1)))
-void svmls_za32_vg4x1(uint32_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s8_vg4x2)))
-void svmls_za32_vg4x2(uint32_t, svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u8_vg4x2)))
-void svmls_za32_vg4x2(uint32_t, svuint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_s8_vg4x4)))
-void svmls_za32_vg4x4(uint32_t, svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za32_u8_vg4x4)))
-void svmls_za32_vg4x4(uint32_t, svuint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s16_m)))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u16_m)))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s16_m)))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u16_m)))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_u32_vg1x2)))
-void svsub_write_za32_vg1x2(uint32_t, svuint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_s32_vg1x2)))
-void svsub_write_za32_vg1x2(uint32_t, svint32x2_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_u32_vg1x4)))
-void svsub_write_za32_vg1x4(uint32_t, svuint32x4_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za32_s32_vg1x4)))
-void svsub_write_za32_vg1x4(uint32_t, svint32x4_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_u32_vg1x2)))
-void svsub_write_za32_vg1x2(uint32_t, svuint32x2_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_s32_vg1x2)))
-void svsub_write_za32_vg1x2(uint32_t, svint32x2_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_u32_vg1x4)))
-void svsub_write_za32_vg1x4(uint32_t, svuint32x4_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za32_s32_vg1x4)))
-void svsub_write_za32_vg1x4(uint32_t, svint32x4_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_u32_vg1x2)))
-void svsub_za32_vg1x2(uint32_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_f32_vg1x2)))
-void svsub_za32_vg1x2(uint32_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_s32_vg1x2)))
-void svsub_za32_vg1x2(uint32_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_u32_vg1x4)))
-void svsub_za32_vg1x4(uint32_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_f32_vg1x4)))
-void svsub_za32_vg1x4(uint32_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za32_s32_vg1x4)))
-void svsub_za32_vg1x4(uint32_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_single_za32_s8_vg1x2)))
-void svsudot_za32_vg1x2(uint32_t, svint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_single_za32_s8_vg1x4)))
-void svsudot_za32_vg1x4(uint32_t, svint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_lane_za32_s8_vg1x2)))
-void svsudot_lane_za32_vg1x2(uint32_t, svint8x2_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_lane_za32_s8_vg1x4)))
-void svsudot_lane_za32_vg1x4(uint32_t, svint8x4_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_za32_s8_vg1x2)))
-void svsudot_za32_vg1x2(uint32_t, svint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsudot_za32_s8_vg1x4)))
-void svsudot_za32_vg1x4(uint32_t, svint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_single_za32_s8_vg4x2)))
-void svsumla_za32_vg4x2(uint32_t, svint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_single_za32_s8_vg4x4)))
-void svsumla_za32_vg4x4(uint32_t, svint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_lane_za32_s8_vg4x1)))
-void svsumla_lane_za32_vg4x1(uint32_t, svint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_lane_za32_s8_vg4x2)))
-void svsumla_lane_za32_vg4x2(uint32_t, svint8x2_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_lane_za32_s8_vg4x4)))
-void svsumla_lane_za32_vg4x4(uint32_t, svint8x4_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_za32_s8_vg4x1)))
-void svsumla_za32_vg4x1(uint32_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_za32_s8_vg4x2)))
-void svsumla_za32_vg4x2(uint32_t, svint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumla_za32_s8_vg4x4)))
-void svsumla_za32_vg4x4(uint32_t, svint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsuvdot_lane_za32_s8_vg1x4)))
-void svsuvdot_lane_za32_vg1x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_single_za32_u8_vg1x2)))
-void svusdot_za32_vg1x2(uint32_t, svuint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_single_za32_u8_vg1x4)))
-void svusdot_za32_vg1x4(uint32_t, svuint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_lane_za32_u8_vg1x2)))
-void svusdot_lane_za32_vg1x2(uint32_t, svuint8x2_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_lane_za32_u8_vg1x4)))
-void svusdot_lane_za32_vg1x4(uint32_t, svuint8x4_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_za32_u8_vg1x2)))
-void svusdot_za32_vg1x2(uint32_t, svuint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusdot_za32_u8_vg1x4)))
-void svusdot_za32_vg1x4(uint32_t, svuint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_single_za32_u8_vg4x2)))
-void svusmla_za32_vg4x2(uint32_t, svuint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_single_za32_u8_vg4x4)))
-void svusmla_za32_vg4x4(uint32_t, svuint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_lane_za32_u8_vg4x1)))
-void svusmla_lane_za32_vg4x1(uint32_t, svuint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_lane_za32_u8_vg4x2)))
-void svusmla_lane_za32_vg4x2(uint32_t, svuint8x2_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_lane_za32_u8_vg4x4)))
-void svusmla_lane_za32_vg4x4(uint32_t, svuint8x4_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_za32_u8_vg4x1)))
-void svusmla_za32_vg4x1(uint32_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_za32_u8_vg4x2)))
-void svusmla_za32_vg4x2(uint32_t, svuint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmla_za32_u8_vg4x4)))
-void svusmla_za32_vg4x4(uint32_t, svuint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusvdot_lane_za32_u8_vg1x4)))
-void svusvdot_lane_za32_vg1x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_bf16_vg1x2)))
-void svvdot_lane_za32_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_f16_vg1x2)))
-void svvdot_lane_za32_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_s16_vg1x2)))
-void svvdot_lane_za32_vg1x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_u16_vg1x2)))
-void svvdot_lane_za32_vg1x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_s8_vg1x4)))
-void svvdot_lane_za32_vg1x4(uint32_t, svint8x4_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za32_u8_vg1x4)))
-void svvdot_lane_za32_vg1x4(uint32_t, svuint8x4_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_vg2)))
-void svwrite_hor_za16_vg2(uint64_t, uint32_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_vg2)))
-void svwrite_hor_za16_vg2(uint64_t, uint32_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_vg2)))
-void svwrite_hor_za16_vg2(uint64_t, uint32_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_vg2)))
-void svwrite_hor_za16_vg2(uint64_t, uint32_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_vg4)))
-void svwrite_hor_za16_vg4(uint64_t, uint32_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_vg4)))
-void svwrite_hor_za16_vg4(uint64_t, uint32_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_vg4)))
-void svwrite_hor_za16_vg4(uint64_t, uint32_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_vg4)))
-void svwrite_hor_za16_vg4(uint64_t, uint32_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_vg2)))
-void svwrite_hor_za32_vg2(uint64_t, uint32_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_vg2)))
-void svwrite_hor_za32_vg2(uint64_t, uint32_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_vg2)))
-void svwrite_hor_za32_vg2(uint64_t, uint32_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_vg4)))
-void svwrite_hor_za32_vg4(uint64_t, uint32_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_vg4)))
-void svwrite_hor_za32_vg4(uint64_t, uint32_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_vg4)))
-void svwrite_hor_za32_vg4(uint64_t, uint32_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_vg2)))
-void svwrite_hor_za64_vg2(uint64_t, uint32_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_vg2)))
-void svwrite_hor_za64_vg2(uint64_t, uint32_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_vg2)))
-void svwrite_hor_za64_vg2(uint64_t, uint32_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_vg4)))
-void svwrite_hor_za64_vg4(uint64_t, uint32_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_vg4)))
-void svwrite_hor_za64_vg4(uint64_t, uint32_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_vg4)))
-void svwrite_hor_za64_vg4(uint64_t, uint32_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_vg2)))
-void svwrite_hor_za8_vg2(uint64_t, uint32_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_vg2)))
-void svwrite_hor_za8_vg2(uint64_t, uint32_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_vg4)))
-void svwrite_hor_za8_vg4(uint64_t, uint32_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_vg4)))
-void svwrite_hor_za8_vg4(uint64_t, uint32_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_vg2)))
-void svwrite_ver_za16_vg2(uint64_t, uint32_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_vg2)))
-void svwrite_ver_za16_vg2(uint64_t, uint32_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_vg2)))
-void svwrite_ver_za16_vg2(uint64_t, uint32_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_vg2)))
-void svwrite_ver_za16_vg2(uint64_t, uint32_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_vg4)))
-void svwrite_ver_za16_vg4(uint64_t, uint32_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_vg4)))
-void svwrite_ver_za16_vg4(uint64_t, uint32_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_vg4)))
-void svwrite_ver_za16_vg4(uint64_t, uint32_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_vg4)))
-void svwrite_ver_za16_vg4(uint64_t, uint32_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_vg2)))
-void svwrite_ver_za32_vg2(uint64_t, uint32_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_vg2)))
-void svwrite_ver_za32_vg2(uint64_t, uint32_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_vg2)))
-void svwrite_ver_za32_vg2(uint64_t, uint32_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_vg4)))
-void svwrite_ver_za32_vg4(uint64_t, uint32_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_vg4)))
-void svwrite_ver_za32_vg4(uint64_t, uint32_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_vg4)))
-void svwrite_ver_za32_vg4(uint64_t, uint32_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_vg2)))
-void svwrite_ver_za64_vg2(uint64_t, uint32_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_vg2)))
-void svwrite_ver_za64_vg2(uint64_t, uint32_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_vg2)))
-void svwrite_ver_za64_vg2(uint64_t, uint32_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_vg4)))
-void svwrite_ver_za64_vg4(uint64_t, uint32_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_vg4)))
-void svwrite_ver_za64_vg4(uint64_t, uint32_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_vg4)))
-void svwrite_ver_za64_vg4(uint64_t, uint32_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_vg2)))
-void svwrite_ver_za8_vg2(uint64_t, uint32_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_vg2)))
-void svwrite_ver_za8_vg2(uint64_t, uint32_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_vg4)))
-void svwrite_ver_za8_vg4(uint64_t, uint32_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_vg4)))
-void svwrite_ver_za8_vg4(uint64_t, uint32_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_u16_vg1x2)))
-void svwrite_za16_vg1x2(uint32_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_bf16_vg1x2)))
-void svwrite_za16_vg1x2(uint32_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_f16_vg1x2)))
-void svwrite_za16_vg1x2(uint32_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_s16_vg1x2)))
-void svwrite_za16_vg1x2(uint32_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_u16_vg1x4)))
-void svwrite_za16_vg1x4(uint32_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_bf16_vg1x4)))
-void svwrite_za16_vg1x4(uint32_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_f16_vg1x4)))
-void svwrite_za16_vg1x4(uint32_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za16_s16_vg1x4)))
-void svwrite_za16_vg1x4(uint32_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_u32_vg1x2)))
-void svwrite_za32_vg1x2(uint32_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_f32_vg1x2)))
-void svwrite_za32_vg1x2(uint32_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_s32_vg1x2)))
-void svwrite_za32_vg1x2(uint32_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_u32_vg1x4)))
-void svwrite_za32_vg1x4(uint32_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_f32_vg1x4)))
-void svwrite_za32_vg1x4(uint32_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za32_s32_vg1x4)))
-void svwrite_za32_vg1x4(uint32_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_u64_vg1x2)))
-void svwrite_za64_vg1x2(uint32_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_f64_vg1x2)))
-void svwrite_za64_vg1x2(uint32_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_s64_vg1x2)))
-void svwrite_za64_vg1x2(uint32_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_u64_vg1x4)))
-void svwrite_za64_vg1x4(uint32_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_f64_vg1x4)))
-void svwrite_za64_vg1x4(uint32_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za64_s64_vg1x4)))
-void svwrite_za64_vg1x4(uint32_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x2)))
-void svwrite_za8_vg1x2(uint32_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x2)))
-void svwrite_za8_vg1x2(uint32_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x4)))
-void svwrite_za8_vg1x4(uint32_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x4)))
-void svwrite_za8_vg1x4(uint32_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
-void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
-void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
-void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
-void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
-void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
-void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
-void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
-void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
-void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
-void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
-void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
-void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
-void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
-void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
-void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
-void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
-void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
-void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
-void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
-void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
-void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
-void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
-void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
-void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
-void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
-void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
-void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
-void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
-void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
-void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
-void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
-void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
-void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
-void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
-void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
-void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x2)))
-void svadd_za64_f64_vg1x2(uint32_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x4)))
-void svadd_za64_f64_vg1x4(uint32_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_f64_vg1x2)))
-void svmla_single_za64_f64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_f64_vg1x4)))
-void svmla_single_za64_f64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_f64_vg1x2)))
-void svmla_lane_za64_f64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_f64_vg1x4)))
-void svmla_lane_za64_f64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_f64_vg1x2)))
-void svmla_za64_f64_vg1x2(uint32_t, svfloat64x2_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_f64_vg1x4)))
-void svmla_za64_f64_vg1x4(uint32_t, svfloat64x4_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_f64_vg1x2)))
-void svmls_single_za64_f64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_f64_vg1x4)))
-void svmls_single_za64_f64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_f64_vg1x2)))
-void svmls_lane_za64_f64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_f64_vg1x4)))
-void svmls_lane_za64_f64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_f64_vg1x2)))
-void svmls_za64_f64_vg1x2(uint32_t, svfloat64x2_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_f64_vg1x4)))
-void svmls_za64_f64_vg1x4(uint32_t, svfloat64x4_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_f64_vg1x2)))
-void svsub_za64_f64_vg1x2(uint32_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_f64_vg1x4)))
-void svsub_za64_f64_vg1x4(uint32_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x2)))
-void svadd_za64_vg1x2(uint32_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x4)))
-void svadd_za64_vg1x4(uint32_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_f64_vg1x2)))
-void svmla_za64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_f64_vg1x4)))
-void svmla_za64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_f64_vg1x2)))
-void svmla_lane_za64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_f64_vg1x4)))
-void svmla_lane_za64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_f64_vg1x2)))
-void svmla_za64_vg1x2(uint32_t, svfloat64x2_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_f64_vg1x4)))
-void svmla_za64_vg1x4(uint32_t, svfloat64x4_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_f64_vg1x2)))
-void svmls_za64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_f64_vg1x4)))
-void svmls_za64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_f64_vg1x2)))
-void svmls_lane_za64_vg1x2(uint32_t, svfloat64x2_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_f64_vg1x4)))
-void svmls_lane_za64_vg1x4(uint32_t, svfloat64x4_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_f64_vg1x2)))
-void svmls_za64_vg1x2(uint32_t, svfloat64x2_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_f64_vg1x4)))
-void svmls_za64_vg1x4(uint32_t, svfloat64x4_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_f64_vg1x2)))
-void svsub_za64_vg1x2(uint32_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_f64_vg1x4)))
-void svsub_za64_vg1x4(uint32_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_u64_vg1x2)))
-void svadd_write_single_za64_u64_vg1x2(uint32_t, svuint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_s64_vg1x2)))
-void svadd_write_single_za64_s64_vg1x2(uint32_t, svint64x2_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_u64_vg1x4)))
-void svadd_write_single_za64_u64_vg1x4(uint32_t, svuint64x4_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_s64_vg1x4)))
-void svadd_write_single_za64_s64_vg1x4(uint32_t, svint64x4_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_u64_vg1x2)))
-void svadd_write_za64_u64_vg1x2(uint32_t, svuint64x2_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_s64_vg1x2)))
-void svadd_write_za64_s64_vg1x2(uint32_t, svint64x2_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_u64_vg1x4)))
-void svadd_write_za64_u64_vg1x4(uint32_t, svuint64x4_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_s64_vg1x4)))
-void svadd_write_za64_s64_vg1x4(uint32_t, svint64x4_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_u64_vg1x2)))
-void svadd_za64_u64_vg1x2(uint32_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_s64_vg1x2)))
-void svadd_za64_s64_vg1x2(uint32_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_u64_vg1x4)))
-void svadd_za64_u64_vg1x4(uint32_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_s64_vg1x4)))
-void svadd_za64_s64_vg1x4(uint32_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_s16_vg1x2)))
-void svdot_single_za64_s16_vg1x2(uint32_t, svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_u16_vg1x2)))
-void svdot_single_za64_u16_vg1x2(uint32_t, svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_s16_vg1x4)))
-void svdot_single_za64_s16_vg1x4(uint32_t, svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_u16_vg1x4)))
-void svdot_single_za64_u16_vg1x4(uint32_t, svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_s16_vg1x2)))
-void svdot_lane_za64_s16_vg1x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_u16_vg1x2)))
-void svdot_lane_za64_u16_vg1x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_s16_vg1x4)))
-void svdot_lane_za64_s16_vg1x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_u16_vg1x4)))
-void svdot_lane_za64_u16_vg1x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_s16_vg1x2)))
-void svdot_za64_s16_vg1x2(uint32_t, svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_u16_vg1x2)))
-void svdot_za64_u16_vg1x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_s16_vg1x4)))
-void svdot_za64_s16_vg1x4(uint32_t, svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_u16_vg1x4)))
-void svdot_za64_u16_vg1x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_s16_vg4x2)))
-void svmla_single_za64_s16_vg4x2(uint32_t, svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_u16_vg4x2)))
-void svmla_single_za64_u16_vg4x2(uint32_t, svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_s16_vg4x4)))
-void svmla_single_za64_s16_vg4x4(uint32_t, svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_u16_vg4x4)))
-void svmla_single_za64_u16_vg4x4(uint32_t, svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_s16_vg4x1)))
-void svmla_lane_za64_s16_vg4x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_u16_vg4x1)))
-void svmla_lane_za64_u16_vg4x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_s16_vg4x2)))
-void svmla_lane_za64_s16_vg4x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_u16_vg4x2)))
-void svmla_lane_za64_u16_vg4x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_s16_vg4x4)))
-void svmla_lane_za64_s16_vg4x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_u16_vg4x4)))
-void svmla_lane_za64_u16_vg4x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_s16_vg4x1)))
-void svmla_za64_s16_vg4x1(uint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_u16_vg4x1)))
-void svmla_za64_u16_vg4x1(uint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_s16_vg4x2)))
-void svmla_za64_s16_vg4x2(uint32_t, svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_u16_vg4x2)))
-void svmla_za64_u16_vg4x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_s16_vg4x4)))
-void svmla_za64_s16_vg4x4(uint32_t, svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_u16_vg4x4)))
-void svmla_za64_u16_vg4x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_s16_vg4x2)))
-void svmls_single_za64_s16_vg4x2(uint32_t, svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_u16_vg4x2)))
-void svmls_single_za64_u16_vg4x2(uint32_t, svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_s16_vg4x4)))
-void svmls_single_za64_s16_vg4x4(uint32_t, svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_u16_vg4x4)))
-void svmls_single_za64_u16_vg4x4(uint32_t, svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_s16_vg4x1)))
-void svmls_lane_za64_s16_vg4x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_u16_vg4x1)))
-void svmls_lane_za64_u16_vg4x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_s16_vg4x2)))
-void svmls_lane_za64_s16_vg4x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_u16_vg4x2)))
-void svmls_lane_za64_u16_vg4x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_s16_vg4x4)))
-void svmls_lane_za64_s16_vg4x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_u16_vg4x4)))
-void svmls_lane_za64_u16_vg4x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_s16_vg4x1)))
-void svmls_za64_s16_vg4x1(uint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_u16_vg4x1)))
-void svmls_za64_u16_vg4x1(uint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_s16_vg4x2)))
-void svmls_za64_s16_vg4x2(uint32_t, svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_u16_vg4x2)))
-void svmls_za64_u16_vg4x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_s16_vg4x4)))
-void svmls_za64_s16_vg4x4(uint32_t, svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_u16_vg4x4)))
-void svmls_za64_u16_vg4x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_u64_vg1x2)))
-void svsub_write_single_za64_u64_vg1x2(uint32_t, svuint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_s64_vg1x2)))
-void svsub_write_single_za64_s64_vg1x2(uint32_t, svint64x2_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_u64_vg1x4)))
-void svsub_write_single_za64_u64_vg1x4(uint32_t, svuint64x4_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_s64_vg1x4)))
-void svsub_write_single_za64_s64_vg1x4(uint32_t, svint64x4_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_u64_vg1x2)))
-void svsub_write_za64_u64_vg1x2(uint32_t, svuint64x2_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_s64_vg1x2)))
-void svsub_write_za64_s64_vg1x2(uint32_t, svint64x2_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_u64_vg1x4)))
-void svsub_write_za64_u64_vg1x4(uint32_t, svuint64x4_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_s64_vg1x4)))
-void svsub_write_za64_s64_vg1x4(uint32_t, svint64x4_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_u64_vg1x2)))
-void svsub_za64_u64_vg1x2(uint32_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_s64_vg1x2)))
-void svsub_za64_s64_vg1x2(uint32_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_u64_vg1x4)))
-void svsub_za64_u64_vg1x4(uint32_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_s64_vg1x4)))
-void svsub_za64_s64_vg1x4(uint32_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za64_s16_vg1x4)))
-void svvdot_lane_za64_s16_vg1x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za64_u16_vg1x4)))
-void svvdot_lane_za64_u16_vg1x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_u64_vg1x2)))
-void svadd_write_za64_vg1x2(uint32_t, svuint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_s64_vg1x2)))
-void svadd_write_za64_vg1x2(uint32_t, svint64x2_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_u64_vg1x4)))
-void svadd_write_za64_vg1x4(uint32_t, svuint64x4_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za64_s64_vg1x4)))
-void svadd_write_za64_vg1x4(uint32_t, svint64x4_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_u64_vg1x2)))
-void svadd_write_za64_vg1x2(uint32_t, svuint64x2_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_s64_vg1x2)))
-void svadd_write_za64_vg1x2(uint32_t, svint64x2_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_u64_vg1x4)))
-void svadd_write_za64_vg1x4(uint32_t, svuint64x4_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_za64_s64_vg1x4)))
-void svadd_write_za64_vg1x4(uint32_t, svint64x4_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_u64_vg1x2)))
-void svadd_za64_vg1x2(uint32_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_s64_vg1x2)))
-void svadd_za64_vg1x2(uint32_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_u64_vg1x4)))
-void svadd_za64_vg1x4(uint32_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_s64_vg1x4)))
-void svadd_za64_vg1x4(uint32_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_s16_vg1x2)))
-void svdot_za64_vg1x2(uint32_t, svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_u16_vg1x2)))
-void svdot_za64_vg1x2(uint32_t, svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_s16_vg1x4)))
-void svdot_za64_vg1x4(uint32_t, svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za64_u16_vg1x4)))
-void svdot_za64_vg1x4(uint32_t, svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_s16_vg1x2)))
-void svdot_lane_za64_vg1x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_u16_vg1x2)))
-void svdot_lane_za64_vg1x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_s16_vg1x4)))
-void svdot_lane_za64_vg1x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za64_u16_vg1x4)))
-void svdot_lane_za64_vg1x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_s16_vg1x2)))
-void svdot_za64_vg1x2(uint32_t, svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_u16_vg1x2)))
-void svdot_za64_vg1x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_s16_vg1x4)))
-void svdot_za64_vg1x4(uint32_t, svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za64_u16_vg1x4)))
-void svdot_za64_vg1x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_s16_vg4x2)))
-void svmla_za64_vg4x2(uint32_t, svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_u16_vg4x2)))
-void svmla_za64_vg4x2(uint32_t, svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_s16_vg4x4)))
-void svmla_za64_vg4x4(uint32_t, svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za64_u16_vg4x4)))
-void svmla_za64_vg4x4(uint32_t, svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_s16_vg4x1)))
-void svmla_lane_za64_vg4x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_u16_vg4x1)))
-void svmla_lane_za64_vg4x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_s16_vg4x2)))
-void svmla_lane_za64_vg4x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_u16_vg4x2)))
-void svmla_lane_za64_vg4x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_s16_vg4x4)))
-void svmla_lane_za64_vg4x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za64_u16_vg4x4)))
-void svmla_lane_za64_vg4x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_s16_vg4x1)))
-void svmla_za64_vg4x1(uint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_u16_vg4x1)))
-void svmla_za64_vg4x1(uint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_s16_vg4x2)))
-void svmla_za64_vg4x2(uint32_t, svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_u16_vg4x2)))
-void svmla_za64_vg4x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_s16_vg4x4)))
-void svmla_za64_vg4x4(uint32_t, svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za64_u16_vg4x4)))
-void svmla_za64_vg4x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_s16_vg4x2)))
-void svmls_za64_vg4x2(uint32_t, svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_u16_vg4x2)))
-void svmls_za64_vg4x2(uint32_t, svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_s16_vg4x4)))
-void svmls_za64_vg4x4(uint32_t, svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za64_u16_vg4x4)))
-void svmls_za64_vg4x4(uint32_t, svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_s16_vg4x1)))
-void svmls_lane_za64_vg4x1(uint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_u16_vg4x1)))
-void svmls_lane_za64_vg4x1(uint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_s16_vg4x2)))
-void svmls_lane_za64_vg4x2(uint32_t, svint16x2_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_u16_vg4x2)))
-void svmls_lane_za64_vg4x2(uint32_t, svuint16x2_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_s16_vg4x4)))
-void svmls_lane_za64_vg4x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za64_u16_vg4x4)))
-void svmls_lane_za64_vg4x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_s16_vg4x1)))
-void svmls_za64_vg4x1(uint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_u16_vg4x1)))
-void svmls_za64_vg4x1(uint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_s16_vg4x2)))
-void svmls_za64_vg4x2(uint32_t, svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_u16_vg4x2)))
-void svmls_za64_vg4x2(uint32_t, svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_s16_vg4x4)))
-void svmls_za64_vg4x4(uint32_t, svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za64_u16_vg4x4)))
-void svmls_za64_vg4x4(uint32_t, svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_u64_vg1x2)))
-void svsub_write_za64_vg1x2(uint32_t, svuint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_s64_vg1x2)))
-void svsub_write_za64_vg1x2(uint32_t, svint64x2_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_u64_vg1x4)))
-void svsub_write_za64_vg1x4(uint32_t, svuint64x4_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_single_za64_s64_vg1x4)))
-void svsub_write_za64_vg1x4(uint32_t, svint64x4_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_u64_vg1x2)))
-void svsub_write_za64_vg1x2(uint32_t, svuint64x2_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_s64_vg1x2)))
-void svsub_write_za64_vg1x2(uint32_t, svint64x2_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_u64_vg1x4)))
-void svsub_write_za64_vg1x4(uint32_t, svuint64x4_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_write_za64_s64_vg1x4)))
-void svsub_write_za64_vg1x4(uint32_t, svint64x4_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_u64_vg1x2)))
-void svsub_za64_vg1x2(uint32_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_s64_vg1x2)))
-void svsub_za64_vg1x2(uint32_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_u64_vg1x4)))
-void svsub_za64_vg1x4(uint32_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za64_s64_vg1x4)))
-void svsub_za64_vg1x4(uint32_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za64_s16_vg1x4)))
-void svvdot_lane_za64_vg1x4(uint32_t, svint16x4_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za64_u16_vg1x4)))
-void svvdot_lane_za64_vg1x4(uint32_t, svuint16x4_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u8)))
-svuint8_t svreadz_hor_za128_u8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u32)))
-svuint32_t svreadz_hor_za128_u32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u64)))
-svuint64_t svreadz_hor_za128_u64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_u16)))
-svuint16_t svreadz_hor_za128_u16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_bf16)))
-svbfloat16_t svreadz_hor_za128_bf16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s8)))
-svint8_t svreadz_hor_za128_s8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_f64)))
-svfloat64_t svreadz_hor_za128_f64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_f32)))
-svfloat32_t svreadz_hor_za128_f32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_f16)))
-svfloat16_t svreadz_hor_za128_f16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s32)))
-svint32_t svreadz_hor_za128_s32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s64)))
-svint64_t svreadz_hor_za128_s64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za128_s16)))
-svint16_t svreadz_hor_za128_s16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_u16)))
-svuint16_t svreadz_hor_za16_u16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_bf16)))
-svbfloat16_t svreadz_hor_za16_bf16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_f16)))
-svfloat16_t svreadz_hor_za16_f16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_s16)))
-svint16_t svreadz_hor_za16_s16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_u16_vg2)))
-svuint16x2_t svreadz_hor_za16_u16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_bf16_vg2)))
-svbfloat16x2_t svreadz_hor_za16_bf16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_f16_vg2)))
-svfloat16x2_t svreadz_hor_za16_f16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_s16_vg2)))
-svint16x2_t svreadz_hor_za16_s16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_u16_vg4)))
-svuint16x4_t svreadz_hor_za16_u16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_bf16_vg4)))
-svbfloat16x4_t svreadz_hor_za16_bf16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_f16_vg4)))
-svfloat16x4_t svreadz_hor_za16_f16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za16_s16_vg4)))
-svint16x4_t svreadz_hor_za16_s16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_u32)))
-svuint32_t svreadz_hor_za32_u32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_f32)))
-svfloat32_t svreadz_hor_za32_f32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_s32)))
-svint32_t svreadz_hor_za32_s32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_u32_vg2)))
-svuint32x2_t svreadz_hor_za32_u32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_f32_vg2)))
-svfloat32x2_t svreadz_hor_za32_f32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_s32_vg2)))
-svint32x2_t svreadz_hor_za32_s32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_u32_vg4)))
-svuint32x4_t svreadz_hor_za32_u32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_f32_vg4)))
-svfloat32x4_t svreadz_hor_za32_f32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za32_s32_vg4)))
-svint32x4_t svreadz_hor_za32_s32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_u64)))
-svuint64_t svreadz_hor_za64_u64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_f64)))
-svfloat64_t svreadz_hor_za64_f64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_s64)))
-svint64_t svreadz_hor_za64_s64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_u64_vg2)))
-svuint64x2_t svreadz_hor_za64_u64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_f64_vg2)))
-svfloat64x2_t svreadz_hor_za64_f64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_s64_vg2)))
-svint64x2_t svreadz_hor_za64_s64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_u64_vg4)))
-svuint64x4_t svreadz_hor_za64_u64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_f64_vg4)))
-svfloat64x4_t svreadz_hor_za64_f64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za64_s64_vg4)))
-svint64x4_t svreadz_hor_za64_s64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_u8)))
-svuint8_t svreadz_hor_za8_u8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_s8)))
-svint8_t svreadz_hor_za8_s8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_u8_vg2)))
-svuint8x2_t svreadz_hor_za8_u8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_s8_vg2)))
-svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_u8_vg4)))
-svuint8x4_t svreadz_hor_za8_u8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_hor_za8_s8_vg4)))
-svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u8)))
-svuint8_t svreadz_ver_za128_u8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u32)))
-svuint32_t svreadz_ver_za128_u32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u64)))
-svuint64_t svreadz_ver_za128_u64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_u16)))
-svuint16_t svreadz_ver_za128_u16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_bf16)))
-svbfloat16_t svreadz_ver_za128_bf16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s8)))
-svint8_t svreadz_ver_za128_s8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_f64)))
-svfloat64_t svreadz_ver_za128_f64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_f32)))
-svfloat32_t svreadz_ver_za128_f32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_f16)))
-svfloat16_t svreadz_ver_za128_f16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s32)))
-svint32_t svreadz_ver_za128_s32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s64)))
-svint64_t svreadz_ver_za128_s64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za128_s16)))
-svint16_t svreadz_ver_za128_s16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_u16)))
-svuint16_t svreadz_ver_za16_u16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_bf16)))
-svbfloat16_t svreadz_ver_za16_bf16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_f16)))
-svfloat16_t svreadz_ver_za16_f16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_s16)))
-svint16_t svreadz_ver_za16_s16(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_u16_vg2)))
-svuint16x2_t svreadz_ver_za16_u16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_bf16_vg2)))
-svbfloat16x2_t svreadz_ver_za16_bf16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_f16_vg2)))
-svfloat16x2_t svreadz_ver_za16_f16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_s16_vg2)))
-svint16x2_t svreadz_ver_za16_s16_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_u16_vg4)))
-svuint16x4_t svreadz_ver_za16_u16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_bf16_vg4)))
-svbfloat16x4_t svreadz_ver_za16_bf16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_f16_vg4)))
-svfloat16x4_t svreadz_ver_za16_f16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za16_s16_vg4)))
-svint16x4_t svreadz_ver_za16_s16_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_u32)))
-svuint32_t svreadz_ver_za32_u32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_f32)))
-svfloat32_t svreadz_ver_za32_f32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_s32)))
-svint32_t svreadz_ver_za32_s32(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_u32_vg2)))
-svuint32x2_t svreadz_ver_za32_u32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_f32_vg2)))
-svfloat32x2_t svreadz_ver_za32_f32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_s32_vg2)))
-svint32x2_t svreadz_ver_za32_s32_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_u32_vg4)))
-svuint32x4_t svreadz_ver_za32_u32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_f32_vg4)))
-svfloat32x4_t svreadz_ver_za32_f32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za32_s32_vg4)))
-svint32x4_t svreadz_ver_za32_s32_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_u64)))
-svuint64_t svreadz_ver_za64_u64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_f64)))
-svfloat64_t svreadz_ver_za64_f64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_s64)))
-svint64_t svreadz_ver_za64_s64(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_u64_vg2)))
-svuint64x2_t svreadz_ver_za64_u64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_f64_vg2)))
-svfloat64x2_t svreadz_ver_za64_f64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_s64_vg2)))
-svint64x2_t svreadz_ver_za64_s64_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_u64_vg4)))
-svuint64x4_t svreadz_ver_za64_u64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_f64_vg4)))
-svfloat64x4_t svreadz_ver_za64_f64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za64_s64_vg4)))
-svint64x4_t svreadz_ver_za64_s64_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_u8)))
-svuint8_t svreadz_ver_za8_u8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_s8)))
-svint8_t svreadz_ver_za8_s8(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_u8_vg2)))
-svuint8x2_t svreadz_ver_za8_u8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_s8_vg2)))
-svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_u8_vg4)))
-svuint8x4_t svreadz_ver_za8_u8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_ver_za8_s8_vg4)))
-svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_u16_vg1x2)))
-svuint16x2_t svreadz_za16_u16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_bf16_vg1x2)))
-svbfloat16x2_t svreadz_za16_bf16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_f16_vg1x2)))
-svfloat16x2_t svreadz_za16_f16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_s16_vg1x2)))
-svint16x2_t svreadz_za16_s16_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_u16_vg1x4)))
-svuint16x4_t svreadz_za16_u16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_bf16_vg1x4)))
-svbfloat16x4_t svreadz_za16_bf16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_f16_vg1x4)))
-svfloat16x4_t svreadz_za16_f16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za16_s16_vg1x4)))
-svint16x4_t svreadz_za16_s16_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_u32_vg1x2)))
-svuint32x2_t svreadz_za32_u32_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_f32_vg1x2)))
-svfloat32x2_t svreadz_za32_f32_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_s32_vg1x2)))
-svint32x2_t svreadz_za32_s32_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_u32_vg1x4)))
-svuint32x4_t svreadz_za32_u32_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_f32_vg1x4)))
-svfloat32x4_t svreadz_za32_f32_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za32_s32_vg1x4)))
-svint32x4_t svreadz_za32_s32_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_u64_vg1x2)))
-svuint64x2_t svreadz_za64_u64_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_f64_vg1x2)))
-svfloat64x2_t svreadz_za64_f64_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_s64_vg1x2)))
-svint64x2_t svreadz_za64_s64_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_u64_vg1x4)))
-svuint64x4_t svreadz_za64_u64_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_f64_vg1x4)))
-svfloat64x4_t svreadz_za64_f64_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za64_s64_vg1x4)))
-svint64x4_t svreadz_za64_s64_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_u8_vg1x2)))
-svuint8x2_t svreadz_za8_u8_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_s8_vg1x2)))
-svint8x2_t svreadz_za8_s8_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_u8_vg1x4)))
-svuint8x4_t svreadz_za8_u8_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svreadz_za8_s8_vg1x4)))
-svint8x4_t svreadz_za8_s8_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg1x2)))
-void svzero_za64_vg1x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg1x4)))
-void svzero_za64_vg1x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg2x1)))
-void svzero_za64_vg2x1(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg2x2)))
-void svzero_za64_vg2x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg2x4)))
-void svzero_za64_vg2x4(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg4x1)))
-void svzero_za64_vg4x1(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg4x2)))
-void svzero_za64_vg4x2(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za64_vg4x4)))
-void svzero_za64_vg4x4(uint32_t);
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#undef __ai
-
-#endif /* __ARM_SME_H */
diff --git a/third_party/aarch64/clang/arm_sve.h b/third_party/aarch64/clang/arm_sve.h
deleted file mode 100644
index f9aa68374..000000000
--- a/third_party/aarch64/clang/arm_sve.h
+++ /dev/null
@@ -1,30537 +0,0 @@
-/*===---- arm_sve.h - ARM SVE intrinsics -----------------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_SVE_H
-#define __ARM_SVE_H
-
-#if !defined(__LITTLE_ENDIAN__)
-#error "Big endian is currently not supported for arm_sve.h"
-#endif
-#include <stdint.h>
-
-#ifdef  __cplusplus
-extern "C" {
-#else
-#include <stdbool.h>
-#endif
-
-typedef __fp16 float16_t;
-typedef float float32_t;
-typedef double float64_t;
-typedef __SVInt8_t svint8_t;
-typedef __SVInt16_t svint16_t;
-typedef __SVInt32_t svint32_t;
-typedef __SVInt64_t svint64_t;
-typedef __SVUint8_t svuint8_t;
-typedef __SVUint16_t svuint16_t;
-typedef __SVUint32_t svuint32_t;
-typedef __SVUint64_t svuint64_t;
-typedef __SVFloat16_t svfloat16_t;
-
-typedef __SVBfloat16_t svbfloat16_t;
-#include <arm_bf16.h>
-#include <arm_vector_types.h>
-typedef __SVFloat32_t svfloat32_t;
-typedef __SVFloat64_t svfloat64_t;
-typedef __clang_svint8x2_t svint8x2_t;
-typedef __clang_svint16x2_t svint16x2_t;
-typedef __clang_svint32x2_t svint32x2_t;
-typedef __clang_svint64x2_t svint64x2_t;
-typedef __clang_svuint8x2_t svuint8x2_t;
-typedef __clang_svuint16x2_t svuint16x2_t;
-typedef __clang_svuint32x2_t svuint32x2_t;
-typedef __clang_svuint64x2_t svuint64x2_t;
-typedef __clang_svfloat16x2_t svfloat16x2_t;
-typedef __clang_svfloat32x2_t svfloat32x2_t;
-typedef __clang_svfloat64x2_t svfloat64x2_t;
-typedef __clang_svint8x3_t svint8x3_t;
-typedef __clang_svint16x3_t svint16x3_t;
-typedef __clang_svint32x3_t svint32x3_t;
-typedef __clang_svint64x3_t svint64x3_t;
-typedef __clang_svuint8x3_t svuint8x3_t;
-typedef __clang_svuint16x3_t svuint16x3_t;
-typedef __clang_svuint32x3_t svuint32x3_t;
-typedef __clang_svuint64x3_t svuint64x3_t;
-typedef __clang_svfloat16x3_t svfloat16x3_t;
-typedef __clang_svfloat32x3_t svfloat32x3_t;
-typedef __clang_svfloat64x3_t svfloat64x3_t;
-typedef __clang_svint8x4_t svint8x4_t;
-typedef __clang_svint16x4_t svint16x4_t;
-typedef __clang_svint32x4_t svint32x4_t;
-typedef __clang_svint64x4_t svint64x4_t;
-typedef __clang_svuint8x4_t svuint8x4_t;
-typedef __clang_svuint16x4_t svuint16x4_t;
-typedef __clang_svuint32x4_t svuint32x4_t;
-typedef __clang_svuint64x4_t svuint64x4_t;
-typedef __clang_svfloat16x4_t svfloat16x4_t;
-typedef __clang_svfloat32x4_t svfloat32x4_t;
-typedef __clang_svfloat64x4_t svfloat64x4_t;
-typedef __SVBool_t  svbool_t;
-typedef __clang_svboolx2_t  svboolx2_t;
-typedef __clang_svboolx4_t  svboolx4_t;
-
-typedef __clang_svbfloat16x2_t svbfloat16x2_t;
-typedef __clang_svbfloat16x3_t svbfloat16x3_t;
-typedef __clang_svbfloat16x4_t svbfloat16x4_t;
-typedef __SVCount_t svcount_t;
-
-enum svpattern
-{
-  SV_POW2 = 0,
-  SV_VL1 = 1,
-  SV_VL2 = 2,
-  SV_VL3 = 3,
-  SV_VL4 = 4,
-  SV_VL5 = 5,
-  SV_VL6 = 6,
-  SV_VL7 = 7,
-  SV_VL8 = 8,
-  SV_VL16 = 9,
-  SV_VL32 = 10,
-  SV_VL64 = 11,
-  SV_VL128 = 12,
-  SV_VL256 = 13,
-  SV_MUL4 = 29,
-  SV_MUL3 = 30,
-  SV_ALL = 31
-};
-
-enum svprfop
-{
-  SV_PLDL1KEEP = 0,
-  SV_PLDL1STRM = 1,
-  SV_PLDL2KEEP = 2,
-  SV_PLDL2STRM = 3,
-  SV_PLDL3KEEP = 4,
-  SV_PLDL3STRM = 5,
-  SV_PSTL1KEEP = 8,
-  SV_PSTL1STRM = 9,
-  SV_PSTL2KEEP = 10,
-  SV_PSTL2STRM = 11,
-  SV_PSTL3KEEP = 12,
-  SV_PSTL3STRM = 13
-};
-
-/* Function attributes */
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-
-#define __aio static __inline__ __attribute__((__always_inline__, __nodebug__, __overloadable__))
-
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8)))
-svint8_t svreinterpret_s8_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8)))
-svint8_t svreinterpret_s8_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16)))
-svint8_t svreinterpret_s8_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16)))
-svint8_t svreinterpret_s8_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32)))
-svint8_t svreinterpret_s8_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32)))
-svint8_t svreinterpret_s8_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64)))
-svint8_t svreinterpret_s8_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64)))
-svint8_t svreinterpret_s8_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16)))
-svint8_t svreinterpret_s8_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16)))
-svint8_t svreinterpret_s8_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32)))
-svint8_t svreinterpret_s8_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64)))
-svint8_t svreinterpret_s8_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8)))
-svuint8_t svreinterpret_u8_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8)))
-svuint8_t svreinterpret_u8_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16)))
-svuint8_t svreinterpret_u8_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16)))
-svuint8_t svreinterpret_u8_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32)))
-svuint8_t svreinterpret_u8_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32)))
-svuint8_t svreinterpret_u8_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64)))
-svuint8_t svreinterpret_u8_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64)))
-svuint8_t svreinterpret_u8_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16)))
-svuint8_t svreinterpret_u8_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16)))
-svuint8_t svreinterpret_u8_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32)))
-svuint8_t svreinterpret_u8_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64)))
-svuint8_t svreinterpret_u8_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8)))
-svint16_t svreinterpret_s16_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8)))
-svint16_t svreinterpret_s16_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16)))
-svint16_t svreinterpret_s16_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16)))
-svint16_t svreinterpret_s16_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32)))
-svint16_t svreinterpret_s16_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32)))
-svint16_t svreinterpret_s16_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64)))
-svint16_t svreinterpret_s16_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64)))
-svint16_t svreinterpret_s16_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16)))
-svint16_t svreinterpret_s16_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16)))
-svint16_t svreinterpret_s16_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32)))
-svint16_t svreinterpret_s16_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64)))
-svint16_t svreinterpret_s16_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8)))
-svuint16_t svreinterpret_u16_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8)))
-svuint16_t svreinterpret_u16_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16)))
-svuint16_t svreinterpret_u16_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16)))
-svuint16_t svreinterpret_u16_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32)))
-svuint16_t svreinterpret_u16_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32)))
-svuint16_t svreinterpret_u16_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64)))
-svuint16_t svreinterpret_u16_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64)))
-svuint16_t svreinterpret_u16_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16)))
-svuint16_t svreinterpret_u16_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16)))
-svuint16_t svreinterpret_u16_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32)))
-svuint16_t svreinterpret_u16_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64)))
-svuint16_t svreinterpret_u16_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8)))
-svint32_t svreinterpret_s32_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8)))
-svint32_t svreinterpret_s32_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16)))
-svint32_t svreinterpret_s32_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16)))
-svint32_t svreinterpret_s32_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32)))
-svint32_t svreinterpret_s32_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32)))
-svint32_t svreinterpret_s32_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64)))
-svint32_t svreinterpret_s32_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64)))
-svint32_t svreinterpret_s32_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16)))
-svint32_t svreinterpret_s32_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16)))
-svint32_t svreinterpret_s32_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32)))
-svint32_t svreinterpret_s32_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64)))
-svint32_t svreinterpret_s32_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8)))
-svuint32_t svreinterpret_u32_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8)))
-svuint32_t svreinterpret_u32_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16)))
-svuint32_t svreinterpret_u32_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16)))
-svuint32_t svreinterpret_u32_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32)))
-svuint32_t svreinterpret_u32_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32)))
-svuint32_t svreinterpret_u32_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64)))
-svuint32_t svreinterpret_u32_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64)))
-svuint32_t svreinterpret_u32_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16)))
-svuint32_t svreinterpret_u32_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16)))
-svuint32_t svreinterpret_u32_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32)))
-svuint32_t svreinterpret_u32_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64)))
-svuint32_t svreinterpret_u32_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8)))
-svint64_t svreinterpret_s64_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8)))
-svint64_t svreinterpret_s64_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16)))
-svint64_t svreinterpret_s64_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16)))
-svint64_t svreinterpret_s64_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32)))
-svint64_t svreinterpret_s64_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32)))
-svint64_t svreinterpret_s64_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64)))
-svint64_t svreinterpret_s64_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64)))
-svint64_t svreinterpret_s64_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16)))
-svint64_t svreinterpret_s64_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16)))
-svint64_t svreinterpret_s64_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32)))
-svint64_t svreinterpret_s64_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64)))
-svint64_t svreinterpret_s64_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8)))
-svuint64_t svreinterpret_u64_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8)))
-svuint64_t svreinterpret_u64_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16)))
-svuint64_t svreinterpret_u64_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16)))
-svuint64_t svreinterpret_u64_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32)))
-svuint64_t svreinterpret_u64_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32)))
-svuint64_t svreinterpret_u64_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64)))
-svuint64_t svreinterpret_u64_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64)))
-svuint64_t svreinterpret_u64_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16)))
-svuint64_t svreinterpret_u64_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16)))
-svuint64_t svreinterpret_u64_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32)))
-svuint64_t svreinterpret_u64_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64)))
-svuint64_t svreinterpret_u64_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8)))
-svfloat16_t svreinterpret_f16_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8)))
-svfloat16_t svreinterpret_f16_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16)))
-svfloat16_t svreinterpret_f16_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16)))
-svfloat16_t svreinterpret_f16_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32)))
-svfloat16_t svreinterpret_f16_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32)))
-svfloat16_t svreinterpret_f16_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64)))
-svfloat16_t svreinterpret_f16_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64)))
-svfloat16_t svreinterpret_f16_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16)))
-svfloat16_t svreinterpret_f16_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16)))
-svfloat16_t svreinterpret_f16_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32)))
-svfloat16_t svreinterpret_f16_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64)))
-svfloat16_t svreinterpret_f16_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8)))
-svbfloat16_t svreinterpret_bf16_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8)))
-svbfloat16_t svreinterpret_bf16_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16)))
-svbfloat16_t svreinterpret_bf16_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16)))
-svbfloat16_t svreinterpret_bf16_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32)))
-svbfloat16_t svreinterpret_bf16_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32)))
-svbfloat16_t svreinterpret_bf16_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64)))
-svbfloat16_t svreinterpret_bf16_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64)))
-svbfloat16_t svreinterpret_bf16_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16)))
-svbfloat16_t svreinterpret_bf16_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16)))
-svbfloat16_t svreinterpret_bf16_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32)))
-svbfloat16_t svreinterpret_bf16_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64)))
-svbfloat16_t svreinterpret_bf16_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8)))
-svfloat32_t svreinterpret_f32_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8)))
-svfloat32_t svreinterpret_f32_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16)))
-svfloat32_t svreinterpret_f32_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16)))
-svfloat32_t svreinterpret_f32_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32)))
-svfloat32_t svreinterpret_f32_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32)))
-svfloat32_t svreinterpret_f32_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64)))
-svfloat32_t svreinterpret_f32_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64)))
-svfloat32_t svreinterpret_f32_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16)))
-svfloat32_t svreinterpret_f32_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16)))
-svfloat32_t svreinterpret_f32_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32)))
-svfloat32_t svreinterpret_f32_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64)))
-svfloat32_t svreinterpret_f32_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8)))
-svfloat64_t svreinterpret_f64_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8)))
-svfloat64_t svreinterpret_f64_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16)))
-svfloat64_t svreinterpret_f64_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16)))
-svfloat64_t svreinterpret_f64_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32)))
-svfloat64_t svreinterpret_f64_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32)))
-svfloat64_t svreinterpret_f64_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64)))
-svfloat64_t svreinterpret_f64_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64)))
-svfloat64_t svreinterpret_f64_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16)))
-svfloat64_t svreinterpret_f64_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16)))
-svfloat64_t svreinterpret_f64_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32)))
-svfloat64_t svreinterpret_f64_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64)))
-svfloat64_t svreinterpret_f64_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8)))
-svint8_t svreinterpret_s8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8)))
-svint8_t svreinterpret_s8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16)))
-svint8_t svreinterpret_s8(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16)))
-svint8_t svreinterpret_s8(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32)))
-svint8_t svreinterpret_s8(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32)))
-svint8_t svreinterpret_s8(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64)))
-svint8_t svreinterpret_s8(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64)))
-svint8_t svreinterpret_s8(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16)))
-svint8_t svreinterpret_s8(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16)))
-svint8_t svreinterpret_s8(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32)))
-svint8_t svreinterpret_s8(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64)))
-svint8_t svreinterpret_s8(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8)))
-svuint8_t svreinterpret_u8(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8)))
-svuint8_t svreinterpret_u8(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16)))
-svuint8_t svreinterpret_u8(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16)))
-svuint8_t svreinterpret_u8(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32)))
-svuint8_t svreinterpret_u8(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32)))
-svuint8_t svreinterpret_u8(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64)))
-svuint8_t svreinterpret_u8(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64)))
-svuint8_t svreinterpret_u8(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16)))
-svuint8_t svreinterpret_u8(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16)))
-svuint8_t svreinterpret_u8(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32)))
-svuint8_t svreinterpret_u8(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64)))
-svuint8_t svreinterpret_u8(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8)))
-svint16_t svreinterpret_s16(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8)))
-svint16_t svreinterpret_s16(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16)))
-svint16_t svreinterpret_s16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16)))
-svint16_t svreinterpret_s16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32)))
-svint16_t svreinterpret_s16(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32)))
-svint16_t svreinterpret_s16(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64)))
-svint16_t svreinterpret_s16(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64)))
-svint16_t svreinterpret_s16(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16)))
-svint16_t svreinterpret_s16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16)))
-svint16_t svreinterpret_s16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32)))
-svint16_t svreinterpret_s16(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64)))
-svint16_t svreinterpret_s16(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8)))
-svuint16_t svreinterpret_u16(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8)))
-svuint16_t svreinterpret_u16(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16)))
-svuint16_t svreinterpret_u16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16)))
-svuint16_t svreinterpret_u16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32)))
-svuint16_t svreinterpret_u16(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32)))
-svuint16_t svreinterpret_u16(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64)))
-svuint16_t svreinterpret_u16(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64)))
-svuint16_t svreinterpret_u16(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16)))
-svuint16_t svreinterpret_u16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16)))
-svuint16_t svreinterpret_u16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32)))
-svuint16_t svreinterpret_u16(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64)))
-svuint16_t svreinterpret_u16(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8)))
-svint32_t svreinterpret_s32(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8)))
-svint32_t svreinterpret_s32(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16)))
-svint32_t svreinterpret_s32(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16)))
-svint32_t svreinterpret_s32(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32)))
-svint32_t svreinterpret_s32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32)))
-svint32_t svreinterpret_s32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64)))
-svint32_t svreinterpret_s32(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64)))
-svint32_t svreinterpret_s32(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16)))
-svint32_t svreinterpret_s32(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16)))
-svint32_t svreinterpret_s32(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32)))
-svint32_t svreinterpret_s32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64)))
-svint32_t svreinterpret_s32(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8)))
-svuint32_t svreinterpret_u32(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8)))
-svuint32_t svreinterpret_u32(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16)))
-svuint32_t svreinterpret_u32(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16)))
-svuint32_t svreinterpret_u32(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32)))
-svuint32_t svreinterpret_u32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32)))
-svuint32_t svreinterpret_u32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64)))
-svuint32_t svreinterpret_u32(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64)))
-svuint32_t svreinterpret_u32(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16)))
-svuint32_t svreinterpret_u32(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16)))
-svuint32_t svreinterpret_u32(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32)))
-svuint32_t svreinterpret_u32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64)))
-svuint32_t svreinterpret_u32(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8)))
-svint64_t svreinterpret_s64(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8)))
-svint64_t svreinterpret_s64(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16)))
-svint64_t svreinterpret_s64(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16)))
-svint64_t svreinterpret_s64(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32)))
-svint64_t svreinterpret_s64(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32)))
-svint64_t svreinterpret_s64(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64)))
-svint64_t svreinterpret_s64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64)))
-svint64_t svreinterpret_s64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16)))
-svint64_t svreinterpret_s64(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16)))
-svint64_t svreinterpret_s64(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32)))
-svint64_t svreinterpret_s64(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64)))
-svint64_t svreinterpret_s64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8)))
-svuint64_t svreinterpret_u64(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8)))
-svuint64_t svreinterpret_u64(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16)))
-svuint64_t svreinterpret_u64(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16)))
-svuint64_t svreinterpret_u64(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32)))
-svuint64_t svreinterpret_u64(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32)))
-svuint64_t svreinterpret_u64(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64)))
-svuint64_t svreinterpret_u64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64)))
-svuint64_t svreinterpret_u64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16)))
-svuint64_t svreinterpret_u64(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16)))
-svuint64_t svreinterpret_u64(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32)))
-svuint64_t svreinterpret_u64(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64)))
-svuint64_t svreinterpret_u64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8)))
-svfloat16_t svreinterpret_f16(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8)))
-svfloat16_t svreinterpret_f16(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16)))
-svfloat16_t svreinterpret_f16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16)))
-svfloat16_t svreinterpret_f16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32)))
-svfloat16_t svreinterpret_f16(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32)))
-svfloat16_t svreinterpret_f16(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64)))
-svfloat16_t svreinterpret_f16(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64)))
-svfloat16_t svreinterpret_f16(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16)))
-svfloat16_t svreinterpret_f16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16)))
-svfloat16_t svreinterpret_f16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32)))
-svfloat16_t svreinterpret_f16(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64)))
-svfloat16_t svreinterpret_f16(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8)))
-svbfloat16_t svreinterpret_bf16(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8)))
-svbfloat16_t svreinterpret_bf16(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16)))
-svbfloat16_t svreinterpret_bf16(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16)))
-svbfloat16_t svreinterpret_bf16(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32)))
-svbfloat16_t svreinterpret_bf16(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32)))
-svbfloat16_t svreinterpret_bf16(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64)))
-svbfloat16_t svreinterpret_bf16(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64)))
-svbfloat16_t svreinterpret_bf16(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16)))
-svbfloat16_t svreinterpret_bf16(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16)))
-svbfloat16_t svreinterpret_bf16(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32)))
-svbfloat16_t svreinterpret_bf16(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64)))
-svbfloat16_t svreinterpret_bf16(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8)))
-svfloat32_t svreinterpret_f32(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8)))
-svfloat32_t svreinterpret_f32(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16)))
-svfloat32_t svreinterpret_f32(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16)))
-svfloat32_t svreinterpret_f32(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32)))
-svfloat32_t svreinterpret_f32(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32)))
-svfloat32_t svreinterpret_f32(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64)))
-svfloat32_t svreinterpret_f32(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64)))
-svfloat32_t svreinterpret_f32(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16)))
-svfloat32_t svreinterpret_f32(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16)))
-svfloat32_t svreinterpret_f32(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32)))
-svfloat32_t svreinterpret_f32(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64)))
-svfloat32_t svreinterpret_f32(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8)))
-svfloat64_t svreinterpret_f64(svint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8)))
-svfloat64_t svreinterpret_f64(svuint8_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16)))
-svfloat64_t svreinterpret_f64(svint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16)))
-svfloat64_t svreinterpret_f64(svuint16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32)))
-svfloat64_t svreinterpret_f64(svint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32)))
-svfloat64_t svreinterpret_f64(svuint32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64)))
-svfloat64_t svreinterpret_f64(svint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64)))
-svfloat64_t svreinterpret_f64(svuint64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16)))
-svfloat64_t svreinterpret_f64(svfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16)))
-svfloat64_t svreinterpret_f64(svbfloat16_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32)))
-svfloat64_t svreinterpret_f64(svfloat32_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64)))
-svfloat64_t svreinterpret_f64(svfloat64_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x2)))
-svint8x2_t svreinterpret_s8_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x2)))
-svint8x2_t svreinterpret_s8_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x2)))
-svint8x2_t svreinterpret_s8_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x2)))
-svint8x2_t svreinterpret_s8_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32_x2)))
-svint8x2_t svreinterpret_s8_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32_x2)))
-svint8x2_t svreinterpret_s8_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64_x2)))
-svint8x2_t svreinterpret_s8_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64_x2)))
-svint8x2_t svreinterpret_s8_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16_x2)))
-svint8x2_t svreinterpret_s8_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16_x2)))
-svint8x2_t svreinterpret_s8_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32_x2)))
-svint8x2_t svreinterpret_s8_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64_x2)))
-svint8x2_t svreinterpret_s8_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x2)))
-svuint8x2_t svreinterpret_u8_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x2)))
-svuint8x2_t svreinterpret_u8_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x2)))
-svuint8x2_t svreinterpret_u8_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x2)))
-svuint8x2_t svreinterpret_u8_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32_x2)))
-svuint8x2_t svreinterpret_u8_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32_x2)))
-svuint8x2_t svreinterpret_u8_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64_x2)))
-svuint8x2_t svreinterpret_u8_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64_x2)))
-svuint8x2_t svreinterpret_u8_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16_x2)))
-svuint8x2_t svreinterpret_u8_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16_x2)))
-svuint8x2_t svreinterpret_u8_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_x2)))
-svuint8x2_t svreinterpret_u8_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x2)))
-svuint8x2_t svreinterpret_u8_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x2)))
-svint16x2_t svreinterpret_s16_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x2)))
-svint16x2_t svreinterpret_s16_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x2)))
-svint16x2_t svreinterpret_s16_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x2)))
-svint16x2_t svreinterpret_s16_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32_x2)))
-svint16x2_t svreinterpret_s16_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32_x2)))
-svint16x2_t svreinterpret_s16_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64_x2)))
-svint16x2_t svreinterpret_s16_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64_x2)))
-svint16x2_t svreinterpret_s16_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16_x2)))
-svint16x2_t svreinterpret_s16_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16_x2)))
-svint16x2_t svreinterpret_s16_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32_x2)))
-svint16x2_t svreinterpret_s16_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64_x2)))
-svint16x2_t svreinterpret_s16_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_x2)))
-svuint16x2_t svreinterpret_u16_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x2)))
-svuint16x2_t svreinterpret_u16_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x2)))
-svuint16x2_t svreinterpret_u16_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x2)))
-svuint16x2_t svreinterpret_u16_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32_x2)))
-svuint16x2_t svreinterpret_u16_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32_x2)))
-svuint16x2_t svreinterpret_u16_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64_x2)))
-svuint16x2_t svreinterpret_u16_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64_x2)))
-svuint16x2_t svreinterpret_u16_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16_x2)))
-svuint16x2_t svreinterpret_u16_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16_x2)))
-svuint16x2_t svreinterpret_u16_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32_x2)))
-svuint16x2_t svreinterpret_u16_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64_x2)))
-svuint16x2_t svreinterpret_u16_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_x2)))
-svint32x2_t svreinterpret_s32_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x2)))
-svint32x2_t svreinterpret_s32_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x2)))
-svint32x2_t svreinterpret_s32_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x2)))
-svint32x2_t svreinterpret_s32_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32_x2)))
-svint32x2_t svreinterpret_s32_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32_x2)))
-svint32x2_t svreinterpret_s32_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64_x2)))
-svint32x2_t svreinterpret_s32_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64_x2)))
-svint32x2_t svreinterpret_s32_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16_x2)))
-svint32x2_t svreinterpret_s32_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16_x2)))
-svint32x2_t svreinterpret_s32_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32_x2)))
-svint32x2_t svreinterpret_s32_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64_x2)))
-svint32x2_t svreinterpret_s32_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_x2)))
-svuint32x2_t svreinterpret_u32_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x2)))
-svuint32x2_t svreinterpret_u32_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x2)))
-svuint32x2_t svreinterpret_u32_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x2)))
-svuint32x2_t svreinterpret_u32_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32_x2)))
-svuint32x2_t svreinterpret_u32_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32_x2)))
-svuint32x2_t svreinterpret_u32_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64_x2)))
-svuint32x2_t svreinterpret_u32_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64_x2)))
-svuint32x2_t svreinterpret_u32_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16_x2)))
-svuint32x2_t svreinterpret_u32_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16_x2)))
-svuint32x2_t svreinterpret_u32_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32_x2)))
-svuint32x2_t svreinterpret_u32_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64_x2)))
-svuint32x2_t svreinterpret_u32_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_x2)))
-svint64x2_t svreinterpret_s64_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x2)))
-svint64x2_t svreinterpret_s64_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x2)))
-svint64x2_t svreinterpret_s64_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x2)))
-svint64x2_t svreinterpret_s64_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32_x2)))
-svint64x2_t svreinterpret_s64_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32_x2)))
-svint64x2_t svreinterpret_s64_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64_x2)))
-svint64x2_t svreinterpret_s64_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64_x2)))
-svint64x2_t svreinterpret_s64_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16_x2)))
-svint64x2_t svreinterpret_s64_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16_x2)))
-svint64x2_t svreinterpret_s64_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32_x2)))
-svint64x2_t svreinterpret_s64_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64_x2)))
-svint64x2_t svreinterpret_s64_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_x2)))
-svuint64x2_t svreinterpret_u64_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x2)))
-svuint64x2_t svreinterpret_u64_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x2)))
-svuint64x2_t svreinterpret_u64_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x2)))
-svuint64x2_t svreinterpret_u64_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32_x2)))
-svuint64x2_t svreinterpret_u64_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32_x2)))
-svuint64x2_t svreinterpret_u64_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64_x2)))
-svuint64x2_t svreinterpret_u64_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64_x2)))
-svuint64x2_t svreinterpret_u64_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16_x2)))
-svuint64x2_t svreinterpret_u64_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16_x2)))
-svuint64x2_t svreinterpret_u64_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32_x2)))
-svuint64x2_t svreinterpret_u64_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64_x2)))
-svuint64x2_t svreinterpret_u64_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_x2)))
-svfloat16x2_t svreinterpret_f16_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x2)))
-svfloat16x2_t svreinterpret_f16_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x2)))
-svfloat16x2_t svreinterpret_f16_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x2)))
-svfloat16x2_t svreinterpret_f16_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32_x2)))
-svfloat16x2_t svreinterpret_f16_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32_x2)))
-svfloat16x2_t svreinterpret_f16_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64_x2)))
-svfloat16x2_t svreinterpret_f16_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64_x2)))
-svfloat16x2_t svreinterpret_f16_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16_x2)))
-svfloat16x2_t svreinterpret_f16_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16_x2)))
-svfloat16x2_t svreinterpret_f16_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32_x2)))
-svfloat16x2_t svreinterpret_f16_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64_x2)))
-svfloat16x2_t svreinterpret_f16_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8_x2)))
-svbfloat16x2_t svreinterpret_bf16_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x2)))
-svbfloat16x2_t svreinterpret_bf16_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x2)))
-svbfloat16x2_t svreinterpret_bf16_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x2)))
-svbfloat16x2_t svreinterpret_bf16_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32_x2)))
-svbfloat16x2_t svreinterpret_bf16_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32_x2)))
-svbfloat16x2_t svreinterpret_bf16_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64_x2)))
-svbfloat16x2_t svreinterpret_bf16_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64_x2)))
-svbfloat16x2_t svreinterpret_bf16_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16_x2)))
-svbfloat16x2_t svreinterpret_bf16_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16_x2)))
-svbfloat16x2_t svreinterpret_bf16_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32_x2)))
-svbfloat16x2_t svreinterpret_bf16_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64_x2)))
-svbfloat16x2_t svreinterpret_bf16_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_x2)))
-svfloat32x2_t svreinterpret_f32_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x2)))
-svfloat32x2_t svreinterpret_f32_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x2)))
-svfloat32x2_t svreinterpret_f32_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x2)))
-svfloat32x2_t svreinterpret_f32_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32_x2)))
-svfloat32x2_t svreinterpret_f32_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32_x2)))
-svfloat32x2_t svreinterpret_f32_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64_x2)))
-svfloat32x2_t svreinterpret_f32_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64_x2)))
-svfloat32x2_t svreinterpret_f32_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16_x2)))
-svfloat32x2_t svreinterpret_f32_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16_x2)))
-svfloat32x2_t svreinterpret_f32_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32_x2)))
-svfloat32x2_t svreinterpret_f32_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64_x2)))
-svfloat32x2_t svreinterpret_f32_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_x2)))
-svfloat64x2_t svreinterpret_f64_s8_x2(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x2)))
-svfloat64x2_t svreinterpret_f64_u8_x2(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x2)))
-svfloat64x2_t svreinterpret_f64_s16_x2(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x2)))
-svfloat64x2_t svreinterpret_f64_u16_x2(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32_x2)))
-svfloat64x2_t svreinterpret_f64_s32_x2(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32_x2)))
-svfloat64x2_t svreinterpret_f64_u32_x2(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64_x2)))
-svfloat64x2_t svreinterpret_f64_s64_x2(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64_x2)))
-svfloat64x2_t svreinterpret_f64_u64_x2(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16_x2)))
-svfloat64x2_t svreinterpret_f64_f16_x2(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16_x2)))
-svfloat64x2_t svreinterpret_f64_bf16_x2(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32_x2)))
-svfloat64x2_t svreinterpret_f64_f32_x2(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64_x2)))
-svfloat64x2_t svreinterpret_f64_f64_x2(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x2)))
-svint8x2_t svreinterpret_s8(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x2)))
-svint8x2_t svreinterpret_s8(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x2)))
-svint8x2_t svreinterpret_s8(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x2)))
-svint8x2_t svreinterpret_s8(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32_x2)))
-svint8x2_t svreinterpret_s8(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32_x2)))
-svint8x2_t svreinterpret_s8(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64_x2)))
-svint8x2_t svreinterpret_s8(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64_x2)))
-svint8x2_t svreinterpret_s8(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16_x2)))
-svint8x2_t svreinterpret_s8(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16_x2)))
-svint8x2_t svreinterpret_s8(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32_x2)))
-svint8x2_t svreinterpret_s8(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64_x2)))
-svint8x2_t svreinterpret_s8(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x2)))
-svuint8x2_t svreinterpret_u8(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x2)))
-svuint8x2_t svreinterpret_u8(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x2)))
-svuint8x2_t svreinterpret_u8(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x2)))
-svuint8x2_t svreinterpret_u8(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32_x2)))
-svuint8x2_t svreinterpret_u8(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32_x2)))
-svuint8x2_t svreinterpret_u8(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64_x2)))
-svuint8x2_t svreinterpret_u8(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64_x2)))
-svuint8x2_t svreinterpret_u8(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16_x2)))
-svuint8x2_t svreinterpret_u8(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16_x2)))
-svuint8x2_t svreinterpret_u8(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_x2)))
-svuint8x2_t svreinterpret_u8(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x2)))
-svuint8x2_t svreinterpret_u8(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x2)))
-svint16x2_t svreinterpret_s16(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x2)))
-svint16x2_t svreinterpret_s16(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x2)))
-svint16x2_t svreinterpret_s16(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x2)))
-svint16x2_t svreinterpret_s16(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32_x2)))
-svint16x2_t svreinterpret_s16(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32_x2)))
-svint16x2_t svreinterpret_s16(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64_x2)))
-svint16x2_t svreinterpret_s16(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64_x2)))
-svint16x2_t svreinterpret_s16(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16_x2)))
-svint16x2_t svreinterpret_s16(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16_x2)))
-svint16x2_t svreinterpret_s16(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32_x2)))
-svint16x2_t svreinterpret_s16(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64_x2)))
-svint16x2_t svreinterpret_s16(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_x2)))
-svuint16x2_t svreinterpret_u16(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x2)))
-svuint16x2_t svreinterpret_u16(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x2)))
-svuint16x2_t svreinterpret_u16(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x2)))
-svuint16x2_t svreinterpret_u16(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32_x2)))
-svuint16x2_t svreinterpret_u16(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32_x2)))
-svuint16x2_t svreinterpret_u16(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64_x2)))
-svuint16x2_t svreinterpret_u16(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64_x2)))
-svuint16x2_t svreinterpret_u16(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16_x2)))
-svuint16x2_t svreinterpret_u16(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16_x2)))
-svuint16x2_t svreinterpret_u16(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32_x2)))
-svuint16x2_t svreinterpret_u16(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64_x2)))
-svuint16x2_t svreinterpret_u16(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_x2)))
-svint32x2_t svreinterpret_s32(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x2)))
-svint32x2_t svreinterpret_s32(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x2)))
-svint32x2_t svreinterpret_s32(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x2)))
-svint32x2_t svreinterpret_s32(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32_x2)))
-svint32x2_t svreinterpret_s32(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32_x2)))
-svint32x2_t svreinterpret_s32(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64_x2)))
-svint32x2_t svreinterpret_s32(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64_x2)))
-svint32x2_t svreinterpret_s32(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16_x2)))
-svint32x2_t svreinterpret_s32(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16_x2)))
-svint32x2_t svreinterpret_s32(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32_x2)))
-svint32x2_t svreinterpret_s32(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64_x2)))
-svint32x2_t svreinterpret_s32(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_x2)))
-svuint32x2_t svreinterpret_u32(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x2)))
-svuint32x2_t svreinterpret_u32(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x2)))
-svuint32x2_t svreinterpret_u32(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x2)))
-svuint32x2_t svreinterpret_u32(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32_x2)))
-svuint32x2_t svreinterpret_u32(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32_x2)))
-svuint32x2_t svreinterpret_u32(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64_x2)))
-svuint32x2_t svreinterpret_u32(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64_x2)))
-svuint32x2_t svreinterpret_u32(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16_x2)))
-svuint32x2_t svreinterpret_u32(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16_x2)))
-svuint32x2_t svreinterpret_u32(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32_x2)))
-svuint32x2_t svreinterpret_u32(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64_x2)))
-svuint32x2_t svreinterpret_u32(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_x2)))
-svint64x2_t svreinterpret_s64(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x2)))
-svint64x2_t svreinterpret_s64(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x2)))
-svint64x2_t svreinterpret_s64(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x2)))
-svint64x2_t svreinterpret_s64(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32_x2)))
-svint64x2_t svreinterpret_s64(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32_x2)))
-svint64x2_t svreinterpret_s64(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64_x2)))
-svint64x2_t svreinterpret_s64(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64_x2)))
-svint64x2_t svreinterpret_s64(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16_x2)))
-svint64x2_t svreinterpret_s64(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16_x2)))
-svint64x2_t svreinterpret_s64(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32_x2)))
-svint64x2_t svreinterpret_s64(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64_x2)))
-svint64x2_t svreinterpret_s64(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_x2)))
-svuint64x2_t svreinterpret_u64(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x2)))
-svuint64x2_t svreinterpret_u64(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x2)))
-svuint64x2_t svreinterpret_u64(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x2)))
-svuint64x2_t svreinterpret_u64(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32_x2)))
-svuint64x2_t svreinterpret_u64(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32_x2)))
-svuint64x2_t svreinterpret_u64(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64_x2)))
-svuint64x2_t svreinterpret_u64(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64_x2)))
-svuint64x2_t svreinterpret_u64(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16_x2)))
-svuint64x2_t svreinterpret_u64(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16_x2)))
-svuint64x2_t svreinterpret_u64(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32_x2)))
-svuint64x2_t svreinterpret_u64(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64_x2)))
-svuint64x2_t svreinterpret_u64(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_x2)))
-svfloat16x2_t svreinterpret_f16(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x2)))
-svfloat16x2_t svreinterpret_f16(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x2)))
-svfloat16x2_t svreinterpret_f16(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x2)))
-svfloat16x2_t svreinterpret_f16(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32_x2)))
-svfloat16x2_t svreinterpret_f16(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32_x2)))
-svfloat16x2_t svreinterpret_f16(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64_x2)))
-svfloat16x2_t svreinterpret_f16(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64_x2)))
-svfloat16x2_t svreinterpret_f16(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16_x2)))
-svfloat16x2_t svreinterpret_f16(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16_x2)))
-svfloat16x2_t svreinterpret_f16(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32_x2)))
-svfloat16x2_t svreinterpret_f16(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64_x2)))
-svfloat16x2_t svreinterpret_f16(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8_x2)))
-svbfloat16x2_t svreinterpret_bf16(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x2)))
-svbfloat16x2_t svreinterpret_bf16(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x2)))
-svbfloat16x2_t svreinterpret_bf16(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x2)))
-svbfloat16x2_t svreinterpret_bf16(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32_x2)))
-svbfloat16x2_t svreinterpret_bf16(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32_x2)))
-svbfloat16x2_t svreinterpret_bf16(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64_x2)))
-svbfloat16x2_t svreinterpret_bf16(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64_x2)))
-svbfloat16x2_t svreinterpret_bf16(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16_x2)))
-svbfloat16x2_t svreinterpret_bf16(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16_x2)))
-svbfloat16x2_t svreinterpret_bf16(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32_x2)))
-svbfloat16x2_t svreinterpret_bf16(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64_x2)))
-svbfloat16x2_t svreinterpret_bf16(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_x2)))
-svfloat32x2_t svreinterpret_f32(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x2)))
-svfloat32x2_t svreinterpret_f32(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x2)))
-svfloat32x2_t svreinterpret_f32(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x2)))
-svfloat32x2_t svreinterpret_f32(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32_x2)))
-svfloat32x2_t svreinterpret_f32(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32_x2)))
-svfloat32x2_t svreinterpret_f32(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64_x2)))
-svfloat32x2_t svreinterpret_f32(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64_x2)))
-svfloat32x2_t svreinterpret_f32(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16_x2)))
-svfloat32x2_t svreinterpret_f32(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16_x2)))
-svfloat32x2_t svreinterpret_f32(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32_x2)))
-svfloat32x2_t svreinterpret_f32(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64_x2)))
-svfloat32x2_t svreinterpret_f32(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_x2)))
-svfloat64x2_t svreinterpret_f64(svint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x2)))
-svfloat64x2_t svreinterpret_f64(svuint8x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x2)))
-svfloat64x2_t svreinterpret_f64(svint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x2)))
-svfloat64x2_t svreinterpret_f64(svuint16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32_x2)))
-svfloat64x2_t svreinterpret_f64(svint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32_x2)))
-svfloat64x2_t svreinterpret_f64(svuint32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64_x2)))
-svfloat64x2_t svreinterpret_f64(svint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64_x2)))
-svfloat64x2_t svreinterpret_f64(svuint64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16_x2)))
-svfloat64x2_t svreinterpret_f64(svfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16_x2)))
-svfloat64x2_t svreinterpret_f64(svbfloat16x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32_x2)))
-svfloat64x2_t svreinterpret_f64(svfloat32x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64_x2)))
-svfloat64x2_t svreinterpret_f64(svfloat64x2_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x3)))
-svint8x3_t svreinterpret_s8_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x3)))
-svint8x3_t svreinterpret_s8_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x3)))
-svint8x3_t svreinterpret_s8_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x3)))
-svint8x3_t svreinterpret_s8_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32_x3)))
-svint8x3_t svreinterpret_s8_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32_x3)))
-svint8x3_t svreinterpret_s8_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64_x3)))
-svint8x3_t svreinterpret_s8_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64_x3)))
-svint8x3_t svreinterpret_s8_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16_x3)))
-svint8x3_t svreinterpret_s8_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16_x3)))
-svint8x3_t svreinterpret_s8_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32_x3)))
-svint8x3_t svreinterpret_s8_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64_x3)))
-svint8x3_t svreinterpret_s8_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x3)))
-svuint8x3_t svreinterpret_u8_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x3)))
-svuint8x3_t svreinterpret_u8_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x3)))
-svuint8x3_t svreinterpret_u8_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x3)))
-svuint8x3_t svreinterpret_u8_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32_x3)))
-svuint8x3_t svreinterpret_u8_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32_x3)))
-svuint8x3_t svreinterpret_u8_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64_x3)))
-svuint8x3_t svreinterpret_u8_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64_x3)))
-svuint8x3_t svreinterpret_u8_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16_x3)))
-svuint8x3_t svreinterpret_u8_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16_x3)))
-svuint8x3_t svreinterpret_u8_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_x3)))
-svuint8x3_t svreinterpret_u8_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x3)))
-svuint8x3_t svreinterpret_u8_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x3)))
-svint16x3_t svreinterpret_s16_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x3)))
-svint16x3_t svreinterpret_s16_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x3)))
-svint16x3_t svreinterpret_s16_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x3)))
-svint16x3_t svreinterpret_s16_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32_x3)))
-svint16x3_t svreinterpret_s16_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32_x3)))
-svint16x3_t svreinterpret_s16_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64_x3)))
-svint16x3_t svreinterpret_s16_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64_x3)))
-svint16x3_t svreinterpret_s16_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16_x3)))
-svint16x3_t svreinterpret_s16_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16_x3)))
-svint16x3_t svreinterpret_s16_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32_x3)))
-svint16x3_t svreinterpret_s16_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64_x3)))
-svint16x3_t svreinterpret_s16_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_x3)))
-svuint16x3_t svreinterpret_u16_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x3)))
-svuint16x3_t svreinterpret_u16_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x3)))
-svuint16x3_t svreinterpret_u16_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x3)))
-svuint16x3_t svreinterpret_u16_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32_x3)))
-svuint16x3_t svreinterpret_u16_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32_x3)))
-svuint16x3_t svreinterpret_u16_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64_x3)))
-svuint16x3_t svreinterpret_u16_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64_x3)))
-svuint16x3_t svreinterpret_u16_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16_x3)))
-svuint16x3_t svreinterpret_u16_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16_x3)))
-svuint16x3_t svreinterpret_u16_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32_x3)))
-svuint16x3_t svreinterpret_u16_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64_x3)))
-svuint16x3_t svreinterpret_u16_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_x3)))
-svint32x3_t svreinterpret_s32_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x3)))
-svint32x3_t svreinterpret_s32_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x3)))
-svint32x3_t svreinterpret_s32_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x3)))
-svint32x3_t svreinterpret_s32_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32_x3)))
-svint32x3_t svreinterpret_s32_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32_x3)))
-svint32x3_t svreinterpret_s32_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64_x3)))
-svint32x3_t svreinterpret_s32_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64_x3)))
-svint32x3_t svreinterpret_s32_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16_x3)))
-svint32x3_t svreinterpret_s32_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16_x3)))
-svint32x3_t svreinterpret_s32_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32_x3)))
-svint32x3_t svreinterpret_s32_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64_x3)))
-svint32x3_t svreinterpret_s32_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_x3)))
-svuint32x3_t svreinterpret_u32_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x3)))
-svuint32x3_t svreinterpret_u32_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x3)))
-svuint32x3_t svreinterpret_u32_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x3)))
-svuint32x3_t svreinterpret_u32_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32_x3)))
-svuint32x3_t svreinterpret_u32_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32_x3)))
-svuint32x3_t svreinterpret_u32_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64_x3)))
-svuint32x3_t svreinterpret_u32_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64_x3)))
-svuint32x3_t svreinterpret_u32_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16_x3)))
-svuint32x3_t svreinterpret_u32_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16_x3)))
-svuint32x3_t svreinterpret_u32_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32_x3)))
-svuint32x3_t svreinterpret_u32_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64_x3)))
-svuint32x3_t svreinterpret_u32_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_x3)))
-svint64x3_t svreinterpret_s64_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x3)))
-svint64x3_t svreinterpret_s64_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x3)))
-svint64x3_t svreinterpret_s64_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x3)))
-svint64x3_t svreinterpret_s64_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32_x3)))
-svint64x3_t svreinterpret_s64_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32_x3)))
-svint64x3_t svreinterpret_s64_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64_x3)))
-svint64x3_t svreinterpret_s64_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64_x3)))
-svint64x3_t svreinterpret_s64_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16_x3)))
-svint64x3_t svreinterpret_s64_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16_x3)))
-svint64x3_t svreinterpret_s64_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32_x3)))
-svint64x3_t svreinterpret_s64_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64_x3)))
-svint64x3_t svreinterpret_s64_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_x3)))
-svuint64x3_t svreinterpret_u64_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x3)))
-svuint64x3_t svreinterpret_u64_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x3)))
-svuint64x3_t svreinterpret_u64_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x3)))
-svuint64x3_t svreinterpret_u64_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32_x3)))
-svuint64x3_t svreinterpret_u64_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32_x3)))
-svuint64x3_t svreinterpret_u64_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64_x3)))
-svuint64x3_t svreinterpret_u64_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64_x3)))
-svuint64x3_t svreinterpret_u64_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16_x3)))
-svuint64x3_t svreinterpret_u64_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16_x3)))
-svuint64x3_t svreinterpret_u64_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32_x3)))
-svuint64x3_t svreinterpret_u64_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64_x3)))
-svuint64x3_t svreinterpret_u64_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_x3)))
-svfloat16x3_t svreinterpret_f16_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x3)))
-svfloat16x3_t svreinterpret_f16_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x3)))
-svfloat16x3_t svreinterpret_f16_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x3)))
-svfloat16x3_t svreinterpret_f16_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32_x3)))
-svfloat16x3_t svreinterpret_f16_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32_x3)))
-svfloat16x3_t svreinterpret_f16_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64_x3)))
-svfloat16x3_t svreinterpret_f16_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64_x3)))
-svfloat16x3_t svreinterpret_f16_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16_x3)))
-svfloat16x3_t svreinterpret_f16_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16_x3)))
-svfloat16x3_t svreinterpret_f16_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32_x3)))
-svfloat16x3_t svreinterpret_f16_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64_x3)))
-svfloat16x3_t svreinterpret_f16_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8_x3)))
-svbfloat16x3_t svreinterpret_bf16_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x3)))
-svbfloat16x3_t svreinterpret_bf16_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x3)))
-svbfloat16x3_t svreinterpret_bf16_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x3)))
-svbfloat16x3_t svreinterpret_bf16_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32_x3)))
-svbfloat16x3_t svreinterpret_bf16_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32_x3)))
-svbfloat16x3_t svreinterpret_bf16_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64_x3)))
-svbfloat16x3_t svreinterpret_bf16_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64_x3)))
-svbfloat16x3_t svreinterpret_bf16_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16_x3)))
-svbfloat16x3_t svreinterpret_bf16_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16_x3)))
-svbfloat16x3_t svreinterpret_bf16_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32_x3)))
-svbfloat16x3_t svreinterpret_bf16_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64_x3)))
-svbfloat16x3_t svreinterpret_bf16_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_x3)))
-svfloat32x3_t svreinterpret_f32_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x3)))
-svfloat32x3_t svreinterpret_f32_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x3)))
-svfloat32x3_t svreinterpret_f32_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x3)))
-svfloat32x3_t svreinterpret_f32_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32_x3)))
-svfloat32x3_t svreinterpret_f32_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32_x3)))
-svfloat32x3_t svreinterpret_f32_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64_x3)))
-svfloat32x3_t svreinterpret_f32_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64_x3)))
-svfloat32x3_t svreinterpret_f32_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16_x3)))
-svfloat32x3_t svreinterpret_f32_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16_x3)))
-svfloat32x3_t svreinterpret_f32_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32_x3)))
-svfloat32x3_t svreinterpret_f32_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64_x3)))
-svfloat32x3_t svreinterpret_f32_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_x3)))
-svfloat64x3_t svreinterpret_f64_s8_x3(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x3)))
-svfloat64x3_t svreinterpret_f64_u8_x3(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x3)))
-svfloat64x3_t svreinterpret_f64_s16_x3(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x3)))
-svfloat64x3_t svreinterpret_f64_u16_x3(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32_x3)))
-svfloat64x3_t svreinterpret_f64_s32_x3(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32_x3)))
-svfloat64x3_t svreinterpret_f64_u32_x3(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64_x3)))
-svfloat64x3_t svreinterpret_f64_s64_x3(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64_x3)))
-svfloat64x3_t svreinterpret_f64_u64_x3(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16_x3)))
-svfloat64x3_t svreinterpret_f64_f16_x3(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16_x3)))
-svfloat64x3_t svreinterpret_f64_bf16_x3(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32_x3)))
-svfloat64x3_t svreinterpret_f64_f32_x3(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64_x3)))
-svfloat64x3_t svreinterpret_f64_f64_x3(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x3)))
-svint8x3_t svreinterpret_s8(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x3)))
-svint8x3_t svreinterpret_s8(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x3)))
-svint8x3_t svreinterpret_s8(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x3)))
-svint8x3_t svreinterpret_s8(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32_x3)))
-svint8x3_t svreinterpret_s8(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32_x3)))
-svint8x3_t svreinterpret_s8(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64_x3)))
-svint8x3_t svreinterpret_s8(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64_x3)))
-svint8x3_t svreinterpret_s8(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16_x3)))
-svint8x3_t svreinterpret_s8(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16_x3)))
-svint8x3_t svreinterpret_s8(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32_x3)))
-svint8x3_t svreinterpret_s8(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64_x3)))
-svint8x3_t svreinterpret_s8(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x3)))
-svuint8x3_t svreinterpret_u8(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x3)))
-svuint8x3_t svreinterpret_u8(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x3)))
-svuint8x3_t svreinterpret_u8(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x3)))
-svuint8x3_t svreinterpret_u8(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32_x3)))
-svuint8x3_t svreinterpret_u8(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32_x3)))
-svuint8x3_t svreinterpret_u8(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64_x3)))
-svuint8x3_t svreinterpret_u8(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64_x3)))
-svuint8x3_t svreinterpret_u8(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16_x3)))
-svuint8x3_t svreinterpret_u8(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16_x3)))
-svuint8x3_t svreinterpret_u8(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_x3)))
-svuint8x3_t svreinterpret_u8(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x3)))
-svuint8x3_t svreinterpret_u8(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x3)))
-svint16x3_t svreinterpret_s16(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x3)))
-svint16x3_t svreinterpret_s16(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x3)))
-svint16x3_t svreinterpret_s16(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x3)))
-svint16x3_t svreinterpret_s16(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32_x3)))
-svint16x3_t svreinterpret_s16(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32_x3)))
-svint16x3_t svreinterpret_s16(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64_x3)))
-svint16x3_t svreinterpret_s16(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64_x3)))
-svint16x3_t svreinterpret_s16(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16_x3)))
-svint16x3_t svreinterpret_s16(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16_x3)))
-svint16x3_t svreinterpret_s16(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32_x3)))
-svint16x3_t svreinterpret_s16(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64_x3)))
-svint16x3_t svreinterpret_s16(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_x3)))
-svuint16x3_t svreinterpret_u16(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x3)))
-svuint16x3_t svreinterpret_u16(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x3)))
-svuint16x3_t svreinterpret_u16(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x3)))
-svuint16x3_t svreinterpret_u16(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32_x3)))
-svuint16x3_t svreinterpret_u16(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32_x3)))
-svuint16x3_t svreinterpret_u16(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64_x3)))
-svuint16x3_t svreinterpret_u16(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64_x3)))
-svuint16x3_t svreinterpret_u16(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16_x3)))
-svuint16x3_t svreinterpret_u16(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16_x3)))
-svuint16x3_t svreinterpret_u16(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32_x3)))
-svuint16x3_t svreinterpret_u16(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64_x3)))
-svuint16x3_t svreinterpret_u16(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_x3)))
-svint32x3_t svreinterpret_s32(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x3)))
-svint32x3_t svreinterpret_s32(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x3)))
-svint32x3_t svreinterpret_s32(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x3)))
-svint32x3_t svreinterpret_s32(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32_x3)))
-svint32x3_t svreinterpret_s32(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32_x3)))
-svint32x3_t svreinterpret_s32(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64_x3)))
-svint32x3_t svreinterpret_s32(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64_x3)))
-svint32x3_t svreinterpret_s32(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16_x3)))
-svint32x3_t svreinterpret_s32(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16_x3)))
-svint32x3_t svreinterpret_s32(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32_x3)))
-svint32x3_t svreinterpret_s32(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64_x3)))
-svint32x3_t svreinterpret_s32(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_x3)))
-svuint32x3_t svreinterpret_u32(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x3)))
-svuint32x3_t svreinterpret_u32(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x3)))
-svuint32x3_t svreinterpret_u32(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x3)))
-svuint32x3_t svreinterpret_u32(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32_x3)))
-svuint32x3_t svreinterpret_u32(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32_x3)))
-svuint32x3_t svreinterpret_u32(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64_x3)))
-svuint32x3_t svreinterpret_u32(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64_x3)))
-svuint32x3_t svreinterpret_u32(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16_x3)))
-svuint32x3_t svreinterpret_u32(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16_x3)))
-svuint32x3_t svreinterpret_u32(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32_x3)))
-svuint32x3_t svreinterpret_u32(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64_x3)))
-svuint32x3_t svreinterpret_u32(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_x3)))
-svint64x3_t svreinterpret_s64(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x3)))
-svint64x3_t svreinterpret_s64(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x3)))
-svint64x3_t svreinterpret_s64(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x3)))
-svint64x3_t svreinterpret_s64(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32_x3)))
-svint64x3_t svreinterpret_s64(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32_x3)))
-svint64x3_t svreinterpret_s64(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64_x3)))
-svint64x3_t svreinterpret_s64(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64_x3)))
-svint64x3_t svreinterpret_s64(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16_x3)))
-svint64x3_t svreinterpret_s64(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16_x3)))
-svint64x3_t svreinterpret_s64(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32_x3)))
-svint64x3_t svreinterpret_s64(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64_x3)))
-svint64x3_t svreinterpret_s64(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_x3)))
-svuint64x3_t svreinterpret_u64(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x3)))
-svuint64x3_t svreinterpret_u64(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x3)))
-svuint64x3_t svreinterpret_u64(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x3)))
-svuint64x3_t svreinterpret_u64(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32_x3)))
-svuint64x3_t svreinterpret_u64(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32_x3)))
-svuint64x3_t svreinterpret_u64(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64_x3)))
-svuint64x3_t svreinterpret_u64(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64_x3)))
-svuint64x3_t svreinterpret_u64(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16_x3)))
-svuint64x3_t svreinterpret_u64(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16_x3)))
-svuint64x3_t svreinterpret_u64(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32_x3)))
-svuint64x3_t svreinterpret_u64(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64_x3)))
-svuint64x3_t svreinterpret_u64(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_x3)))
-svfloat16x3_t svreinterpret_f16(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x3)))
-svfloat16x3_t svreinterpret_f16(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x3)))
-svfloat16x3_t svreinterpret_f16(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x3)))
-svfloat16x3_t svreinterpret_f16(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32_x3)))
-svfloat16x3_t svreinterpret_f16(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32_x3)))
-svfloat16x3_t svreinterpret_f16(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64_x3)))
-svfloat16x3_t svreinterpret_f16(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64_x3)))
-svfloat16x3_t svreinterpret_f16(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16_x3)))
-svfloat16x3_t svreinterpret_f16(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16_x3)))
-svfloat16x3_t svreinterpret_f16(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32_x3)))
-svfloat16x3_t svreinterpret_f16(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64_x3)))
-svfloat16x3_t svreinterpret_f16(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8_x3)))
-svbfloat16x3_t svreinterpret_bf16(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x3)))
-svbfloat16x3_t svreinterpret_bf16(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x3)))
-svbfloat16x3_t svreinterpret_bf16(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x3)))
-svbfloat16x3_t svreinterpret_bf16(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32_x3)))
-svbfloat16x3_t svreinterpret_bf16(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32_x3)))
-svbfloat16x3_t svreinterpret_bf16(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64_x3)))
-svbfloat16x3_t svreinterpret_bf16(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64_x3)))
-svbfloat16x3_t svreinterpret_bf16(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16_x3)))
-svbfloat16x3_t svreinterpret_bf16(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16_x3)))
-svbfloat16x3_t svreinterpret_bf16(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32_x3)))
-svbfloat16x3_t svreinterpret_bf16(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64_x3)))
-svbfloat16x3_t svreinterpret_bf16(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_x3)))
-svfloat32x3_t svreinterpret_f32(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x3)))
-svfloat32x3_t svreinterpret_f32(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x3)))
-svfloat32x3_t svreinterpret_f32(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x3)))
-svfloat32x3_t svreinterpret_f32(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32_x3)))
-svfloat32x3_t svreinterpret_f32(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32_x3)))
-svfloat32x3_t svreinterpret_f32(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64_x3)))
-svfloat32x3_t svreinterpret_f32(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64_x3)))
-svfloat32x3_t svreinterpret_f32(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16_x3)))
-svfloat32x3_t svreinterpret_f32(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16_x3)))
-svfloat32x3_t svreinterpret_f32(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32_x3)))
-svfloat32x3_t svreinterpret_f32(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64_x3)))
-svfloat32x3_t svreinterpret_f32(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_x3)))
-svfloat64x3_t svreinterpret_f64(svint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x3)))
-svfloat64x3_t svreinterpret_f64(svuint8x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x3)))
-svfloat64x3_t svreinterpret_f64(svint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x3)))
-svfloat64x3_t svreinterpret_f64(svuint16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32_x3)))
-svfloat64x3_t svreinterpret_f64(svint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32_x3)))
-svfloat64x3_t svreinterpret_f64(svuint32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64_x3)))
-svfloat64x3_t svreinterpret_f64(svint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64_x3)))
-svfloat64x3_t svreinterpret_f64(svuint64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16_x3)))
-svfloat64x3_t svreinterpret_f64(svfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16_x3)))
-svfloat64x3_t svreinterpret_f64(svbfloat16x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32_x3)))
-svfloat64x3_t svreinterpret_f64(svfloat32x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64_x3)))
-svfloat64x3_t svreinterpret_f64(svfloat64x3_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x4)))
-svint8x4_t svreinterpret_s8_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x4)))
-svint8x4_t svreinterpret_s8_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x4)))
-svint8x4_t svreinterpret_s8_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x4)))
-svint8x4_t svreinterpret_s8_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32_x4)))
-svint8x4_t svreinterpret_s8_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32_x4)))
-svint8x4_t svreinterpret_s8_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64_x4)))
-svint8x4_t svreinterpret_s8_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64_x4)))
-svint8x4_t svreinterpret_s8_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16_x4)))
-svint8x4_t svreinterpret_s8_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16_x4)))
-svint8x4_t svreinterpret_s8_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32_x4)))
-svint8x4_t svreinterpret_s8_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64_x4)))
-svint8x4_t svreinterpret_s8_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x4)))
-svuint8x4_t svreinterpret_u8_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x4)))
-svuint8x4_t svreinterpret_u8_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x4)))
-svuint8x4_t svreinterpret_u8_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x4)))
-svuint8x4_t svreinterpret_u8_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32_x4)))
-svuint8x4_t svreinterpret_u8_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32_x4)))
-svuint8x4_t svreinterpret_u8_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64_x4)))
-svuint8x4_t svreinterpret_u8_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64_x4)))
-svuint8x4_t svreinterpret_u8_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16_x4)))
-svuint8x4_t svreinterpret_u8_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16_x4)))
-svuint8x4_t svreinterpret_u8_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_x4)))
-svuint8x4_t svreinterpret_u8_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x4)))
-svuint8x4_t svreinterpret_u8_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x4)))
-svint16x4_t svreinterpret_s16_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x4)))
-svint16x4_t svreinterpret_s16_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x4)))
-svint16x4_t svreinterpret_s16_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x4)))
-svint16x4_t svreinterpret_s16_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32_x4)))
-svint16x4_t svreinterpret_s16_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32_x4)))
-svint16x4_t svreinterpret_s16_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64_x4)))
-svint16x4_t svreinterpret_s16_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64_x4)))
-svint16x4_t svreinterpret_s16_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16_x4)))
-svint16x4_t svreinterpret_s16_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16_x4)))
-svint16x4_t svreinterpret_s16_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32_x4)))
-svint16x4_t svreinterpret_s16_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64_x4)))
-svint16x4_t svreinterpret_s16_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_x4)))
-svuint16x4_t svreinterpret_u16_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x4)))
-svuint16x4_t svreinterpret_u16_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x4)))
-svuint16x4_t svreinterpret_u16_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x4)))
-svuint16x4_t svreinterpret_u16_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32_x4)))
-svuint16x4_t svreinterpret_u16_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32_x4)))
-svuint16x4_t svreinterpret_u16_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64_x4)))
-svuint16x4_t svreinterpret_u16_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64_x4)))
-svuint16x4_t svreinterpret_u16_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16_x4)))
-svuint16x4_t svreinterpret_u16_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16_x4)))
-svuint16x4_t svreinterpret_u16_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32_x4)))
-svuint16x4_t svreinterpret_u16_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64_x4)))
-svuint16x4_t svreinterpret_u16_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_x4)))
-svint32x4_t svreinterpret_s32_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x4)))
-svint32x4_t svreinterpret_s32_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x4)))
-svint32x4_t svreinterpret_s32_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x4)))
-svint32x4_t svreinterpret_s32_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32_x4)))
-svint32x4_t svreinterpret_s32_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32_x4)))
-svint32x4_t svreinterpret_s32_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64_x4)))
-svint32x4_t svreinterpret_s32_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64_x4)))
-svint32x4_t svreinterpret_s32_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16_x4)))
-svint32x4_t svreinterpret_s32_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16_x4)))
-svint32x4_t svreinterpret_s32_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32_x4)))
-svint32x4_t svreinterpret_s32_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64_x4)))
-svint32x4_t svreinterpret_s32_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_x4)))
-svuint32x4_t svreinterpret_u32_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x4)))
-svuint32x4_t svreinterpret_u32_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x4)))
-svuint32x4_t svreinterpret_u32_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x4)))
-svuint32x4_t svreinterpret_u32_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32_x4)))
-svuint32x4_t svreinterpret_u32_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32_x4)))
-svuint32x4_t svreinterpret_u32_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64_x4)))
-svuint32x4_t svreinterpret_u32_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64_x4)))
-svuint32x4_t svreinterpret_u32_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16_x4)))
-svuint32x4_t svreinterpret_u32_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16_x4)))
-svuint32x4_t svreinterpret_u32_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32_x4)))
-svuint32x4_t svreinterpret_u32_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64_x4)))
-svuint32x4_t svreinterpret_u32_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_x4)))
-svint64x4_t svreinterpret_s64_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x4)))
-svint64x4_t svreinterpret_s64_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x4)))
-svint64x4_t svreinterpret_s64_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x4)))
-svint64x4_t svreinterpret_s64_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32_x4)))
-svint64x4_t svreinterpret_s64_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32_x4)))
-svint64x4_t svreinterpret_s64_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64_x4)))
-svint64x4_t svreinterpret_s64_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64_x4)))
-svint64x4_t svreinterpret_s64_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16_x4)))
-svint64x4_t svreinterpret_s64_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16_x4)))
-svint64x4_t svreinterpret_s64_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32_x4)))
-svint64x4_t svreinterpret_s64_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64_x4)))
-svint64x4_t svreinterpret_s64_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_x4)))
-svuint64x4_t svreinterpret_u64_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x4)))
-svuint64x4_t svreinterpret_u64_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x4)))
-svuint64x4_t svreinterpret_u64_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x4)))
-svuint64x4_t svreinterpret_u64_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32_x4)))
-svuint64x4_t svreinterpret_u64_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32_x4)))
-svuint64x4_t svreinterpret_u64_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64_x4)))
-svuint64x4_t svreinterpret_u64_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64_x4)))
-svuint64x4_t svreinterpret_u64_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16_x4)))
-svuint64x4_t svreinterpret_u64_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16_x4)))
-svuint64x4_t svreinterpret_u64_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32_x4)))
-svuint64x4_t svreinterpret_u64_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64_x4)))
-svuint64x4_t svreinterpret_u64_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_x4)))
-svfloat16x4_t svreinterpret_f16_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x4)))
-svfloat16x4_t svreinterpret_f16_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x4)))
-svfloat16x4_t svreinterpret_f16_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x4)))
-svfloat16x4_t svreinterpret_f16_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32_x4)))
-svfloat16x4_t svreinterpret_f16_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32_x4)))
-svfloat16x4_t svreinterpret_f16_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64_x4)))
-svfloat16x4_t svreinterpret_f16_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64_x4)))
-svfloat16x4_t svreinterpret_f16_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16_x4)))
-svfloat16x4_t svreinterpret_f16_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16_x4)))
-svfloat16x4_t svreinterpret_f16_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32_x4)))
-svfloat16x4_t svreinterpret_f16_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64_x4)))
-svfloat16x4_t svreinterpret_f16_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8_x4)))
-svbfloat16x4_t svreinterpret_bf16_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x4)))
-svbfloat16x4_t svreinterpret_bf16_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x4)))
-svbfloat16x4_t svreinterpret_bf16_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x4)))
-svbfloat16x4_t svreinterpret_bf16_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32_x4)))
-svbfloat16x4_t svreinterpret_bf16_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32_x4)))
-svbfloat16x4_t svreinterpret_bf16_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64_x4)))
-svbfloat16x4_t svreinterpret_bf16_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64_x4)))
-svbfloat16x4_t svreinterpret_bf16_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16_x4)))
-svbfloat16x4_t svreinterpret_bf16_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16_x4)))
-svbfloat16x4_t svreinterpret_bf16_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32_x4)))
-svbfloat16x4_t svreinterpret_bf16_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64_x4)))
-svbfloat16x4_t svreinterpret_bf16_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_x4)))
-svfloat32x4_t svreinterpret_f32_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x4)))
-svfloat32x4_t svreinterpret_f32_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x4)))
-svfloat32x4_t svreinterpret_f32_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x4)))
-svfloat32x4_t svreinterpret_f32_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32_x4)))
-svfloat32x4_t svreinterpret_f32_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32_x4)))
-svfloat32x4_t svreinterpret_f32_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64_x4)))
-svfloat32x4_t svreinterpret_f32_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64_x4)))
-svfloat32x4_t svreinterpret_f32_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16_x4)))
-svfloat32x4_t svreinterpret_f32_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16_x4)))
-svfloat32x4_t svreinterpret_f32_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32_x4)))
-svfloat32x4_t svreinterpret_f32_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64_x4)))
-svfloat32x4_t svreinterpret_f32_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_x4)))
-svfloat64x4_t svreinterpret_f64_s8_x4(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x4)))
-svfloat64x4_t svreinterpret_f64_u8_x4(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x4)))
-svfloat64x4_t svreinterpret_f64_s16_x4(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x4)))
-svfloat64x4_t svreinterpret_f64_u16_x4(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32_x4)))
-svfloat64x4_t svreinterpret_f64_s32_x4(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32_x4)))
-svfloat64x4_t svreinterpret_f64_u32_x4(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64_x4)))
-svfloat64x4_t svreinterpret_f64_s64_x4(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64_x4)))
-svfloat64x4_t svreinterpret_f64_u64_x4(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16_x4)))
-svfloat64x4_t svreinterpret_f64_f16_x4(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16_x4)))
-svfloat64x4_t svreinterpret_f64_bf16_x4(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32_x4)))
-svfloat64x4_t svreinterpret_f64_f32_x4(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64_x4)))
-svfloat64x4_t svreinterpret_f64_f64_x4(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x4)))
-svint8x4_t svreinterpret_s8(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x4)))
-svint8x4_t svreinterpret_s8(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x4)))
-svint8x4_t svreinterpret_s8(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x4)))
-svint8x4_t svreinterpret_s8(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s32_x4)))
-svint8x4_t svreinterpret_s8(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u32_x4)))
-svint8x4_t svreinterpret_s8(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s64_x4)))
-svint8x4_t svreinterpret_s8(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u64_x4)))
-svint8x4_t svreinterpret_s8(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f16_x4)))
-svint8x4_t svreinterpret_s8(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_bf16_x4)))
-svint8x4_t svreinterpret_s8(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f32_x4)))
-svint8x4_t svreinterpret_s8(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_f64_x4)))
-svint8x4_t svreinterpret_s8(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x4)))
-svuint8x4_t svreinterpret_u8(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x4)))
-svuint8x4_t svreinterpret_u8(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x4)))
-svuint8x4_t svreinterpret_u8(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x4)))
-svuint8x4_t svreinterpret_u8(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s32_x4)))
-svuint8x4_t svreinterpret_u8(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u32_x4)))
-svuint8x4_t svreinterpret_u8(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s64_x4)))
-svuint8x4_t svreinterpret_u8(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u64_x4)))
-svuint8x4_t svreinterpret_u8(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f16_x4)))
-svuint8x4_t svreinterpret_u8(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_bf16_x4)))
-svuint8x4_t svreinterpret_u8(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_x4)))
-svuint8x4_t svreinterpret_u8(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x4)))
-svuint8x4_t svreinterpret_u8(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x4)))
-svint16x4_t svreinterpret_s16(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x4)))
-svint16x4_t svreinterpret_s16(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x4)))
-svint16x4_t svreinterpret_s16(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x4)))
-svint16x4_t svreinterpret_s16(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s32_x4)))
-svint16x4_t svreinterpret_s16(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u32_x4)))
-svint16x4_t svreinterpret_s16(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s64_x4)))
-svint16x4_t svreinterpret_s16(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u64_x4)))
-svint16x4_t svreinterpret_s16(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f16_x4)))
-svint16x4_t svreinterpret_s16(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_bf16_x4)))
-svint16x4_t svreinterpret_s16(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f32_x4)))
-svint16x4_t svreinterpret_s16(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_f64_x4)))
-svint16x4_t svreinterpret_s16(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_x4)))
-svuint16x4_t svreinterpret_u16(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x4)))
-svuint16x4_t svreinterpret_u16(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x4)))
-svuint16x4_t svreinterpret_u16(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x4)))
-svuint16x4_t svreinterpret_u16(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s32_x4)))
-svuint16x4_t svreinterpret_u16(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u32_x4)))
-svuint16x4_t svreinterpret_u16(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s64_x4)))
-svuint16x4_t svreinterpret_u16(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u64_x4)))
-svuint16x4_t svreinterpret_u16(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f16_x4)))
-svuint16x4_t svreinterpret_u16(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_bf16_x4)))
-svuint16x4_t svreinterpret_u16(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f32_x4)))
-svuint16x4_t svreinterpret_u16(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_f64_x4)))
-svuint16x4_t svreinterpret_u16(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_x4)))
-svint32x4_t svreinterpret_s32(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x4)))
-svint32x4_t svreinterpret_s32(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x4)))
-svint32x4_t svreinterpret_s32(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x4)))
-svint32x4_t svreinterpret_s32(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s32_x4)))
-svint32x4_t svreinterpret_s32(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u32_x4)))
-svint32x4_t svreinterpret_s32(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s64_x4)))
-svint32x4_t svreinterpret_s32(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u64_x4)))
-svint32x4_t svreinterpret_s32(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f16_x4)))
-svint32x4_t svreinterpret_s32(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_bf16_x4)))
-svint32x4_t svreinterpret_s32(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f32_x4)))
-svint32x4_t svreinterpret_s32(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_f64_x4)))
-svint32x4_t svreinterpret_s32(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_x4)))
-svuint32x4_t svreinterpret_u32(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x4)))
-svuint32x4_t svreinterpret_u32(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x4)))
-svuint32x4_t svreinterpret_u32(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x4)))
-svuint32x4_t svreinterpret_u32(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s32_x4)))
-svuint32x4_t svreinterpret_u32(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u32_x4)))
-svuint32x4_t svreinterpret_u32(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s64_x4)))
-svuint32x4_t svreinterpret_u32(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u64_x4)))
-svuint32x4_t svreinterpret_u32(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f16_x4)))
-svuint32x4_t svreinterpret_u32(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_bf16_x4)))
-svuint32x4_t svreinterpret_u32(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f32_x4)))
-svuint32x4_t svreinterpret_u32(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_f64_x4)))
-svuint32x4_t svreinterpret_u32(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_x4)))
-svint64x4_t svreinterpret_s64(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x4)))
-svint64x4_t svreinterpret_s64(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x4)))
-svint64x4_t svreinterpret_s64(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x4)))
-svint64x4_t svreinterpret_s64(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s32_x4)))
-svint64x4_t svreinterpret_s64(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u32_x4)))
-svint64x4_t svreinterpret_s64(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s64_x4)))
-svint64x4_t svreinterpret_s64(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u64_x4)))
-svint64x4_t svreinterpret_s64(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f16_x4)))
-svint64x4_t svreinterpret_s64(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_bf16_x4)))
-svint64x4_t svreinterpret_s64(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f32_x4)))
-svint64x4_t svreinterpret_s64(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_f64_x4)))
-svint64x4_t svreinterpret_s64(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_x4)))
-svuint64x4_t svreinterpret_u64(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x4)))
-svuint64x4_t svreinterpret_u64(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x4)))
-svuint64x4_t svreinterpret_u64(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x4)))
-svuint64x4_t svreinterpret_u64(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s32_x4)))
-svuint64x4_t svreinterpret_u64(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u32_x4)))
-svuint64x4_t svreinterpret_u64(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s64_x4)))
-svuint64x4_t svreinterpret_u64(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u64_x4)))
-svuint64x4_t svreinterpret_u64(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f16_x4)))
-svuint64x4_t svreinterpret_u64(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_bf16_x4)))
-svuint64x4_t svreinterpret_u64(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f32_x4)))
-svuint64x4_t svreinterpret_u64(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_f64_x4)))
-svuint64x4_t svreinterpret_u64(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_x4)))
-svfloat16x4_t svreinterpret_f16(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x4)))
-svfloat16x4_t svreinterpret_f16(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x4)))
-svfloat16x4_t svreinterpret_f16(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x4)))
-svfloat16x4_t svreinterpret_f16(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s32_x4)))
-svfloat16x4_t svreinterpret_f16(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u32_x4)))
-svfloat16x4_t svreinterpret_f16(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s64_x4)))
-svfloat16x4_t svreinterpret_f16(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u64_x4)))
-svfloat16x4_t svreinterpret_f16(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f16_x4)))
-svfloat16x4_t svreinterpret_f16(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_bf16_x4)))
-svfloat16x4_t svreinterpret_f16(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f32_x4)))
-svfloat16x4_t svreinterpret_f16(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_f64_x4)))
-svfloat16x4_t svreinterpret_f16(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8_x4)))
-svbfloat16x4_t svreinterpret_bf16(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x4)))
-svbfloat16x4_t svreinterpret_bf16(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x4)))
-svbfloat16x4_t svreinterpret_bf16(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x4)))
-svbfloat16x4_t svreinterpret_bf16(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s32_x4)))
-svbfloat16x4_t svreinterpret_bf16(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u32_x4)))
-svbfloat16x4_t svreinterpret_bf16(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s64_x4)))
-svbfloat16x4_t svreinterpret_bf16(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u64_x4)))
-svbfloat16x4_t svreinterpret_bf16(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f16_x4)))
-svbfloat16x4_t svreinterpret_bf16(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_bf16_x4)))
-svbfloat16x4_t svreinterpret_bf16(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f32_x4)))
-svbfloat16x4_t svreinterpret_bf16(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_f64_x4)))
-svbfloat16x4_t svreinterpret_bf16(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_x4)))
-svfloat32x4_t svreinterpret_f32(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x4)))
-svfloat32x4_t svreinterpret_f32(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x4)))
-svfloat32x4_t svreinterpret_f32(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x4)))
-svfloat32x4_t svreinterpret_f32(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s32_x4)))
-svfloat32x4_t svreinterpret_f32(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u32_x4)))
-svfloat32x4_t svreinterpret_f32(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s64_x4)))
-svfloat32x4_t svreinterpret_f32(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u64_x4)))
-svfloat32x4_t svreinterpret_f32(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f16_x4)))
-svfloat32x4_t svreinterpret_f32(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_bf16_x4)))
-svfloat32x4_t svreinterpret_f32(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f32_x4)))
-svfloat32x4_t svreinterpret_f32(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_f64_x4)))
-svfloat32x4_t svreinterpret_f32(svfloat64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_x4)))
-svfloat64x4_t svreinterpret_f64(svint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x4)))
-svfloat64x4_t svreinterpret_f64(svuint8x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x4)))
-svfloat64x4_t svreinterpret_f64(svint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x4)))
-svfloat64x4_t svreinterpret_f64(svuint16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s32_x4)))
-svfloat64x4_t svreinterpret_f64(svint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u32_x4)))
-svfloat64x4_t svreinterpret_f64(svuint32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s64_x4)))
-svfloat64x4_t svreinterpret_f64(svint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u64_x4)))
-svfloat64x4_t svreinterpret_f64(svuint64x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f16_x4)))
-svfloat64x4_t svreinterpret_f64(svfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_bf16_x4)))
-svfloat64x4_t svreinterpret_f64(svbfloat16x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f32_x4)))
-svfloat64x4_t svreinterpret_f64(svfloat32x4_t op);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_f64_x4)))
-svfloat64x4_t svreinterpret_f64(svfloat64x4_t op);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_x2)))
-svfloat32x2_t svcvt_f32_f16_x2(svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl_f32_f16_x2)))
-svfloat32x2_t svcvtl_f32_f16_x2(svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_x2)))
-svfloat32x2_t svcvt_f32(svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl_f32_f16_x2)))
-svfloat32x2_t svcvtl_f32(svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u8_x2)))
-svuint8x2_t svadd_single_u8_x2(svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u32_x2)))
-svuint32x2_t svadd_single_u32_x2(svuint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u64_x2)))
-svuint64x2_t svadd_single_u64_x2(svuint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u16_x2)))
-svuint16x2_t svadd_single_u16_x2(svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s8_x2)))
-svint8x2_t svadd_single_s8_x2(svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s32_x2)))
-svint32x2_t svadd_single_s32_x2(svint32x2_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s64_x2)))
-svint64x2_t svadd_single_s64_x2(svint64x2_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s16_x2)))
-svint16x2_t svadd_single_s16_x2(svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u8_x4)))
-svuint8x4_t svadd_single_u8_x4(svuint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u32_x4)))
-svuint32x4_t svadd_single_u32_x4(svuint32x4_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u64_x4)))
-svuint64x4_t svadd_single_u64_x4(svuint64x4_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u16_x4)))
-svuint16x4_t svadd_single_u16_x4(svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s8_x4)))
-svint8x4_t svadd_single_s8_x4(svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s32_x4)))
-svint32x4_t svadd_single_s32_x4(svint32x4_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s64_x4)))
-svint64x4_t svadd_single_s64_x4(svint64x4_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s16_x4)))
-svint16x4_t svadd_single_s16_x4(svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f64_x2)))
-svfloat64x2_t svclamp_single_f64_x2(svfloat64x2_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f32_x2)))
-svfloat32x2_t svclamp_single_f32_x2(svfloat32x2_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f16_x2)))
-svfloat16x2_t svclamp_single_f16_x2(svfloat16x2_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s8_x2)))
-svint8x2_t svclamp_single_s8_x2(svint8x2_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s32_x2)))
-svint32x2_t svclamp_single_s32_x2(svint32x2_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s64_x2)))
-svint64x2_t svclamp_single_s64_x2(svint64x2_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s16_x2)))
-svint16x2_t svclamp_single_s16_x2(svint16x2_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u8_x2)))
-svuint8x2_t svclamp_single_u8_x2(svuint8x2_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u32_x2)))
-svuint32x2_t svclamp_single_u32_x2(svuint32x2_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u64_x2)))
-svuint64x2_t svclamp_single_u64_x2(svuint64x2_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u16_x2)))
-svuint16x2_t svclamp_single_u16_x2(svuint16x2_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f64_x4)))
-svfloat64x4_t svclamp_single_f64_x4(svfloat64x4_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f32_x4)))
-svfloat32x4_t svclamp_single_f32_x4(svfloat32x4_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f16_x4)))
-svfloat16x4_t svclamp_single_f16_x4(svfloat16x4_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s8_x4)))
-svint8x4_t svclamp_single_s8_x4(svint8x4_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s32_x4)))
-svint32x4_t svclamp_single_s32_x4(svint32x4_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s64_x4)))
-svint64x4_t svclamp_single_s64_x4(svint64x4_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s16_x4)))
-svint16x4_t svclamp_single_s16_x4(svint16x4_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u8_x4)))
-svuint8x4_t svclamp_single_u8_x4(svuint8x4_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u32_x4)))
-svuint32x4_t svclamp_single_u32_x4(svuint32x4_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u64_x4)))
-svuint64x4_t svclamp_single_u64_x4(svuint64x4_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u16_x4)))
-svuint16x4_t svclamp_single_u16_x4(svuint16x4_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x2)))
-svbfloat16_t svcvt_bf16_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_x2)))
-svfloat16_t svcvt_f16_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_x2)))
-svint32x2_t svcvt_s32_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_x2)))
-svuint32x2_t svcvt_u32_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_x4)))
-svint32x4_t svcvt_s32_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_x4)))
-svuint32x4_t svcvt_u32_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_x2)))
-svfloat32x2_t svcvt_f32_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_x4)))
-svfloat32x4_t svcvt_f32_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_x2)))
-svfloat32x2_t svcvt_f32_u32_x2(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_x4)))
-svfloat32x4_t svcvt_f32_u32_x4(svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_bf16_f32_x2)))
-svbfloat16_t svcvtn_bf16_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_f16_f32_x2)))
-svfloat16_t svcvtn_f16_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_bf16_x2)))
-svbfloat16x2_t svmax_single_bf16_x2(svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f64_x2)))
-svfloat64x2_t svmax_single_f64_x2(svfloat64x2_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f32_x2)))
-svfloat32x2_t svmax_single_f32_x2(svfloat32x2_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f16_x2)))
-svfloat16x2_t svmax_single_f16_x2(svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s8_x2)))
-svint8x2_t svmax_single_s8_x2(svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s32_x2)))
-svint32x2_t svmax_single_s32_x2(svint32x2_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s64_x2)))
-svint64x2_t svmax_single_s64_x2(svint64x2_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s16_x2)))
-svint16x2_t svmax_single_s16_x2(svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u8_x2)))
-svuint8x2_t svmax_single_u8_x2(svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u32_x2)))
-svuint32x2_t svmax_single_u32_x2(svuint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u64_x2)))
-svuint64x2_t svmax_single_u64_x2(svuint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u16_x2)))
-svuint16x2_t svmax_single_u16_x2(svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_bf16_x4)))
-svbfloat16x4_t svmax_single_bf16_x4(svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f64_x4)))
-svfloat64x4_t svmax_single_f64_x4(svfloat64x4_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f32_x4)))
-svfloat32x4_t svmax_single_f32_x4(svfloat32x4_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f16_x4)))
-svfloat16x4_t svmax_single_f16_x4(svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s8_x4)))
-svint8x4_t svmax_single_s8_x4(svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s32_x4)))
-svint32x4_t svmax_single_s32_x4(svint32x4_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s64_x4)))
-svint64x4_t svmax_single_s64_x4(svint64x4_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s16_x4)))
-svint16x4_t svmax_single_s16_x4(svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u8_x4)))
-svuint8x4_t svmax_single_u8_x4(svuint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u32_x4)))
-svuint32x4_t svmax_single_u32_x4(svuint32x4_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u64_x4)))
-svuint64x4_t svmax_single_u64_x4(svuint64x4_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u16_x4)))
-svuint16x4_t svmax_single_u16_x4(svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_x2)))
-svbfloat16x2_t svmax_bf16_x2(svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_x2)))
-svfloat64x2_t svmax_f64_x2(svfloat64x2_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_x2)))
-svfloat32x2_t svmax_f32_x2(svfloat32x2_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_x2)))
-svfloat16x2_t svmax_f16_x2(svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_x2)))
-svint8x2_t svmax_s8_x2(svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_x2)))
-svint32x2_t svmax_s32_x2(svint32x2_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_x2)))
-svint64x2_t svmax_s64_x2(svint64x2_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_x2)))
-svint16x2_t svmax_s16_x2(svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_x2)))
-svuint8x2_t svmax_u8_x2(svuint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_x2)))
-svuint32x2_t svmax_u32_x2(svuint32x2_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_x2)))
-svuint64x2_t svmax_u64_x2(svuint64x2_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_x2)))
-svuint16x2_t svmax_u16_x2(svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_x4)))
-svbfloat16x4_t svmax_bf16_x4(svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_x4)))
-svfloat64x4_t svmax_f64_x4(svfloat64x4_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_x4)))
-svfloat32x4_t svmax_f32_x4(svfloat32x4_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_x4)))
-svfloat16x4_t svmax_f16_x4(svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_x4)))
-svint8x4_t svmax_s8_x4(svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_x4)))
-svint32x4_t svmax_s32_x4(svint32x4_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_x4)))
-svint64x4_t svmax_s64_x4(svint64x4_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_x4)))
-svint16x4_t svmax_s16_x4(svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_x4)))
-svuint8x4_t svmax_u8_x4(svuint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_x4)))
-svuint32x4_t svmax_u32_x4(svuint32x4_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_x4)))
-svuint64x4_t svmax_u64_x4(svuint64x4_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_x4)))
-svuint16x4_t svmax_u16_x4(svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_bf16_x2)))
-svbfloat16x2_t svmaxnm_single_bf16_x2(svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f64_x2)))
-svfloat64x2_t svmaxnm_single_f64_x2(svfloat64x2_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f32_x2)))
-svfloat32x2_t svmaxnm_single_f32_x2(svfloat32x2_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f16_x2)))
-svfloat16x2_t svmaxnm_single_f16_x2(svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_bf16_x4)))
-svbfloat16x4_t svmaxnm_single_bf16_x4(svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f64_x4)))
-svfloat64x4_t svmaxnm_single_f64_x4(svfloat64x4_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f32_x4)))
-svfloat32x4_t svmaxnm_single_f32_x4(svfloat32x4_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f16_x4)))
-svfloat16x4_t svmaxnm_single_f16_x4(svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_x2)))
-svbfloat16x2_t svmaxnm_bf16_x2(svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_x2)))
-svfloat64x2_t svmaxnm_f64_x2(svfloat64x2_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_x2)))
-svfloat32x2_t svmaxnm_f32_x2(svfloat32x2_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_x2)))
-svfloat16x2_t svmaxnm_f16_x2(svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_x4)))
-svbfloat16x4_t svmaxnm_bf16_x4(svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_x4)))
-svfloat64x4_t svmaxnm_f64_x4(svfloat64x4_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_x4)))
-svfloat32x4_t svmaxnm_f32_x4(svfloat32x4_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_x4)))
-svfloat16x4_t svmaxnm_f16_x4(svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_bf16_x2)))
-svbfloat16x2_t svmin_single_bf16_x2(svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f64_x2)))
-svfloat64x2_t svmin_single_f64_x2(svfloat64x2_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f32_x2)))
-svfloat32x2_t svmin_single_f32_x2(svfloat32x2_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f16_x2)))
-svfloat16x2_t svmin_single_f16_x2(svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s8_x2)))
-svint8x2_t svmin_single_s8_x2(svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s32_x2)))
-svint32x2_t svmin_single_s32_x2(svint32x2_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s64_x2)))
-svint64x2_t svmin_single_s64_x2(svint64x2_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s16_x2)))
-svint16x2_t svmin_single_s16_x2(svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u8_x2)))
-svuint8x2_t svmin_single_u8_x2(svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u32_x2)))
-svuint32x2_t svmin_single_u32_x2(svuint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u64_x2)))
-svuint64x2_t svmin_single_u64_x2(svuint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u16_x2)))
-svuint16x2_t svmin_single_u16_x2(svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_bf16_x4)))
-svbfloat16x4_t svmin_single_bf16_x4(svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f64_x4)))
-svfloat64x4_t svmin_single_f64_x4(svfloat64x4_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f32_x4)))
-svfloat32x4_t svmin_single_f32_x4(svfloat32x4_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f16_x4)))
-svfloat16x4_t svmin_single_f16_x4(svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s8_x4)))
-svint8x4_t svmin_single_s8_x4(svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s32_x4)))
-svint32x4_t svmin_single_s32_x4(svint32x4_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s64_x4)))
-svint64x4_t svmin_single_s64_x4(svint64x4_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s16_x4)))
-svint16x4_t svmin_single_s16_x4(svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u8_x4)))
-svuint8x4_t svmin_single_u8_x4(svuint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u32_x4)))
-svuint32x4_t svmin_single_u32_x4(svuint32x4_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u64_x4)))
-svuint64x4_t svmin_single_u64_x4(svuint64x4_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u16_x4)))
-svuint16x4_t svmin_single_u16_x4(svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_x2)))
-svbfloat16x2_t svmin_bf16_x2(svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_x2)))
-svfloat64x2_t svmin_f64_x2(svfloat64x2_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_x2)))
-svfloat32x2_t svmin_f32_x2(svfloat32x2_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_x2)))
-svfloat16x2_t svmin_f16_x2(svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_x2)))
-svint8x2_t svmin_s8_x2(svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_x2)))
-svint32x2_t svmin_s32_x2(svint32x2_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_x2)))
-svint64x2_t svmin_s64_x2(svint64x2_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_x2)))
-svint16x2_t svmin_s16_x2(svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_x2)))
-svuint8x2_t svmin_u8_x2(svuint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_x2)))
-svuint32x2_t svmin_u32_x2(svuint32x2_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_x2)))
-svuint64x2_t svmin_u64_x2(svuint64x2_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_x2)))
-svuint16x2_t svmin_u16_x2(svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_x4)))
-svbfloat16x4_t svmin_bf16_x4(svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_x4)))
-svfloat64x4_t svmin_f64_x4(svfloat64x4_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_x4)))
-svfloat32x4_t svmin_f32_x4(svfloat32x4_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_x4)))
-svfloat16x4_t svmin_f16_x4(svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_x4)))
-svint8x4_t svmin_s8_x4(svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_x4)))
-svint32x4_t svmin_s32_x4(svint32x4_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_x4)))
-svint64x4_t svmin_s64_x4(svint64x4_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_x4)))
-svint16x4_t svmin_s16_x4(svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_x4)))
-svuint8x4_t svmin_u8_x4(svuint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_x4)))
-svuint32x4_t svmin_u32_x4(svuint32x4_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_x4)))
-svuint64x4_t svmin_u64_x4(svuint64x4_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_x4)))
-svuint16x4_t svmin_u16_x4(svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_bf16_x2)))
-svbfloat16x2_t svminnm_single_bf16_x2(svbfloat16x2_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f64_x2)))
-svfloat64x2_t svminnm_single_f64_x2(svfloat64x2_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f32_x2)))
-svfloat32x2_t svminnm_single_f32_x2(svfloat32x2_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f16_x2)))
-svfloat16x2_t svminnm_single_f16_x2(svfloat16x2_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_bf16_x4)))
-svbfloat16x4_t svminnm_single_bf16_x4(svbfloat16x4_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f64_x4)))
-svfloat64x4_t svminnm_single_f64_x4(svfloat64x4_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f32_x4)))
-svfloat32x4_t svminnm_single_f32_x4(svfloat32x4_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f16_x4)))
-svfloat16x4_t svminnm_single_f16_x4(svfloat16x4_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x2)))
-svbfloat16x2_t svminnm_bf16_x2(svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_x2)))
-svfloat64x2_t svminnm_f64_x2(svfloat64x2_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_x2)))
-svfloat32x2_t svminnm_f32_x2(svfloat32x2_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_x2)))
-svfloat16x2_t svminnm_f16_x2(svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x4)))
-svbfloat16x4_t svminnm_bf16_x4(svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_x4)))
-svfloat64x4_t svminnm_f64_x4(svfloat64x4_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_x4)))
-svfloat32x4_t svminnm_f32_x4(svfloat32x4_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_x4)))
-svfloat16x4_t svminnm_f16_x4(svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_s16_s32_x2)))
-svint16_t svqcvt_s16_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_s16_s64_x4)))
-svint16_t svqcvt_s16_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_s8_s32_x4)))
-svint8_t svqcvt_s8_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_s32_x2)))
-svuint16_t svqcvt_u16_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_u32_x2)))
-svuint16_t svqcvt_u16_u32_x2(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_s64_x4)))
-svuint16_t svqcvt_u16_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_u64_x4)))
-svuint16_t svqcvt_u16_u64_x4(svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u8_s32_x4)))
-svuint8_t svqcvt_u8_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u8_u32_x4)))
-svuint8_t svqcvt_u8_u32_x4(svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_s16_s64_x4)))
-svint16_t svqcvtn_s16_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_s8_s32_x4)))
-svint8_t svqcvtn_s8_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_s64_x4)))
-svuint16_t svqcvtn_u16_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_u64_x4)))
-svuint16_t svqcvtn_u16_u64_x4(svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u8_s32_x4)))
-svuint8_t svqcvtn_u8_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u8_u32_x4)))
-svuint8_t svqcvtn_u8_u32_x4(svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s8_x2)))
-svint8x2_t svqdmulh_single_s8_x2(svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s32_x2)))
-svint32x2_t svqdmulh_single_s32_x2(svint32x2_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s64_x2)))
-svint64x2_t svqdmulh_single_s64_x2(svint64x2_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s16_x2)))
-svint16x2_t svqdmulh_single_s16_x2(svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s8_x4)))
-svint8x4_t svqdmulh_single_s8_x4(svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s32_x4)))
-svint32x4_t svqdmulh_single_s32_x4(svint32x4_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s64_x4)))
-svint64x4_t svqdmulh_single_s64_x4(svint64x4_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s16_x4)))
-svint16x4_t svqdmulh_single_s16_x4(svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s8_x2)))
-svint8x2_t svqdmulh_s8_x2(svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s32_x2)))
-svint32x2_t svqdmulh_s32_x2(svint32x2_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s64_x2)))
-svint64x2_t svqdmulh_s64_x2(svint64x2_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s16_x2)))
-svint16x2_t svqdmulh_s16_x2(svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s8_x4)))
-svint8x4_t svqdmulh_s8_x4(svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s32_x4)))
-svint32x4_t svqdmulh_s32_x4(svint32x4_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s64_x4)))
-svint64x4_t svqdmulh_s64_x4(svint64x4_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s16_x4)))
-svint16x4_t svqdmulh_s16_x4(svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_s16_s32_x2)))
-svint16_t svqrshr_n_s16_s32_x2(svint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_u16_u32_x2)))
-svuint16_t svqrshr_n_u16_u32_x2(svuint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_s8_s32_x4)))
-svint8_t svqrshr_n_s8_s32_x4(svint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_s16_s64_x4)))
-svint16_t svqrshr_n_s16_s64_x4(svint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_u8_u32_x4)))
-svuint8_t svqrshr_n_u8_u32_x4(svuint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_u16_u64_x4)))
-svuint16_t svqrshr_n_u16_u64_x4(svuint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_s8_s32_x4)))
-svint8_t svqrshrn_n_s8_s32_x4(svint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_s16_s64_x4)))
-svint16_t svqrshrn_n_s16_s64_x4(svint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_u8_u32_x4)))
-svuint8_t svqrshrn_n_u8_u32_x4(svuint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_u16_u64_x4)))
-svuint16_t svqrshrn_n_u16_u64_x4(svuint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshru_n_u16_s32_x2)))
-svuint16_t svqrshru_n_u16_s32_x2(svint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshru_n_u8_s32_x4)))
-svuint8_t svqrshru_n_u8_s32_x4(svint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshru_n_u16_s64_x4)))
-svuint16_t svqrshru_n_u16_s64_x4(svint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrun_n_u8_s32_x4)))
-svuint8_t svqrshrun_n_u8_s32_x4(svint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrun_n_u16_s64_x4)))
-svuint16_t svqrshrun_n_u16_s64_x4(svint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_x2)))
-svfloat32x2_t svrinta_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_x4)))
-svfloat32x4_t svrinta_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_x2)))
-svfloat32x2_t svrintm_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_x4)))
-svfloat32x4_t svrintm_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_x2)))
-svfloat32x2_t svrintn_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_x4)))
-svfloat32x4_t svrintn_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_x2)))
-svfloat32x2_t svrintp_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_x4)))
-svfloat32x4_t svrintp_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s8_x2)))
-svint8x2_t svrshl_single_s8_x2(svint8x2_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s32_x2)))
-svint32x2_t svrshl_single_s32_x2(svint32x2_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s64_x2)))
-svint64x2_t svrshl_single_s64_x2(svint64x2_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s16_x2)))
-svint16x2_t svrshl_single_s16_x2(svint16x2_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u8_x2)))
-svuint8x2_t svrshl_single_u8_x2(svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u32_x2)))
-svuint32x2_t svrshl_single_u32_x2(svuint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u64_x2)))
-svuint64x2_t svrshl_single_u64_x2(svuint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u16_x2)))
-svuint16x2_t svrshl_single_u16_x2(svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s8_x4)))
-svint8x4_t svrshl_single_s8_x4(svint8x4_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s32_x4)))
-svint32x4_t svrshl_single_s32_x4(svint32x4_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s64_x4)))
-svint64x4_t svrshl_single_s64_x4(svint64x4_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s16_x4)))
-svint16x4_t svrshl_single_s16_x4(svint16x4_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u8_x4)))
-svuint8x4_t svrshl_single_u8_x4(svuint8x4_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u32_x4)))
-svuint32x4_t svrshl_single_u32_x4(svuint32x4_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u64_x4)))
-svuint64x4_t svrshl_single_u64_x4(svuint64x4_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u16_x4)))
-svuint16x4_t svrshl_single_u16_x4(svuint16x4_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_x2)))
-svint8x2_t svrshl_s8_x2(svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_x2)))
-svint32x2_t svrshl_s32_x2(svint32x2_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_x2)))
-svint64x2_t svrshl_s64_x2(svint64x2_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_x2)))
-svint16x2_t svrshl_s16_x2(svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_x2)))
-svuint8x2_t svrshl_u8_x2(svuint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_x2)))
-svuint32x2_t svrshl_u32_x2(svuint32x2_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_x2)))
-svuint64x2_t svrshl_u64_x2(svuint64x2_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_x2)))
-svuint16x2_t svrshl_u16_x2(svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_x4)))
-svint8x4_t svrshl_s8_x4(svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_x4)))
-svint32x4_t svrshl_s32_x4(svint32x4_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_x4)))
-svint64x4_t svrshl_s64_x4(svint64x4_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_x4)))
-svint16x4_t svrshl_s16_x4(svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_x4)))
-svuint8x4_t svrshl_u8_x4(svuint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_x4)))
-svuint32x4_t svrshl_u32_x4(svuint32x4_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_x4)))
-svuint64x4_t svrshl_u64_x4(svuint64x4_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_x4)))
-svuint16x4_t svrshl_u16_x4(svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u8_x2)))
-svuint8x2_t svsel_u8_x2(svcount_t, svuint8x2_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u32_x2)))
-svuint32x2_t svsel_u32_x2(svcount_t, svuint32x2_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u64_x2)))
-svuint64x2_t svsel_u64_x2(svcount_t, svuint64x2_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u16_x2)))
-svuint16x2_t svsel_u16_x2(svcount_t, svuint16x2_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16_x2)))
-svbfloat16x2_t svsel_bf16_x2(svcount_t, svbfloat16x2_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s8_x2)))
-svint8x2_t svsel_s8_x2(svcount_t, svint8x2_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f64_x2)))
-svfloat64x2_t svsel_f64_x2(svcount_t, svfloat64x2_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f32_x2)))
-svfloat32x2_t svsel_f32_x2(svcount_t, svfloat32x2_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f16_x2)))
-svfloat16x2_t svsel_f16_x2(svcount_t, svfloat16x2_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s32_x2)))
-svint32x2_t svsel_s32_x2(svcount_t, svint32x2_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s64_x2)))
-svint64x2_t svsel_s64_x2(svcount_t, svint64x2_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s16_x2)))
-svint16x2_t svsel_s16_x2(svcount_t, svint16x2_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u8_x4)))
-svuint8x4_t svsel_u8_x4(svcount_t, svuint8x4_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u32_x4)))
-svuint32x4_t svsel_u32_x4(svcount_t, svuint32x4_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u64_x4)))
-svuint64x4_t svsel_u64_x4(svcount_t, svuint64x4_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u16_x4)))
-svuint16x4_t svsel_u16_x4(svcount_t, svuint16x4_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16_x4)))
-svbfloat16x4_t svsel_bf16_x4(svcount_t, svbfloat16x4_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s8_x4)))
-svint8x4_t svsel_s8_x4(svcount_t, svint8x4_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f64_x4)))
-svfloat64x4_t svsel_f64_x4(svcount_t, svfloat64x4_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f32_x4)))
-svfloat32x4_t svsel_f32_x4(svcount_t, svfloat32x4_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f16_x4)))
-svfloat16x4_t svsel_f16_x4(svcount_t, svfloat16x4_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s32_x4)))
-svint32x4_t svsel_s32_x4(svcount_t, svint32x4_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s64_x4)))
-svint64x4_t svsel_s64_x4(svcount_t, svint64x4_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s16_x4)))
-svint16x4_t svsel_s16_x4(svcount_t, svint16x4_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s32_s16_x2)))
-svint32x2_t svunpk_s32_s16_x2(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s64_s32_x2)))
-svint64x2_t svunpk_s64_s32_x2(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s16_s8_x2)))
-svint16x2_t svunpk_s16_s8_x2(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u32_u16_x2)))
-svuint32x2_t svunpk_u32_u16_x2(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u64_u32_x2)))
-svuint64x2_t svunpk_u64_u32_x2(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u16_u8_x2)))
-svuint16x2_t svunpk_u16_u8_x2(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s32_s16_x4)))
-svint32x4_t svunpk_s32_s16_x4(svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s64_s32_x4)))
-svint64x4_t svunpk_s64_s32_x4(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s16_s8_x4)))
-svint16x4_t svunpk_s16_s8_x4(svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u32_u16_x4)))
-svuint32x4_t svunpk_u32_u16_x4(svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u64_u32_x4)))
-svuint64x4_t svunpk_u64_u32_x4(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u16_u8_x4)))
-svuint16x4_t svunpk_u16_u8_x4(svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u8_x2)))
-svuint8x2_t svuzp_u8_x2(svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u32_x2)))
-svuint32x2_t svuzp_u32_x2(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u64_x2)))
-svuint64x2_t svuzp_u64_x2(svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u16_x2)))
-svuint16x2_t svuzp_u16_x2(svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_bf16_x2)))
-svbfloat16x2_t svuzp_bf16_x2(svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s8_x2)))
-svint8x2_t svuzp_s8_x2(svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f64_x2)))
-svfloat64x2_t svuzp_f64_x2(svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f32_x2)))
-svfloat32x2_t svuzp_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f16_x2)))
-svfloat16x2_t svuzp_f16_x2(svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s32_x2)))
-svint32x2_t svuzp_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s64_x2)))
-svint64x2_t svuzp_s64_x2(svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s16_x2)))
-svint16x2_t svuzp_s16_x2(svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u8_x4)))
-svuint8x4_t svuzp_u8_x4(svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u32_x4)))
-svuint32x4_t svuzp_u32_x4(svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u64_x4)))
-svuint64x4_t svuzp_u64_x4(svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u16_x4)))
-svuint16x4_t svuzp_u16_x4(svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_bf16_x4)))
-svbfloat16x4_t svuzp_bf16_x4(svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s8_x4)))
-svint8x4_t svuzp_s8_x4(svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f64_x4)))
-svfloat64x4_t svuzp_f64_x4(svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f32_x4)))
-svfloat32x4_t svuzp_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f16_x4)))
-svfloat16x4_t svuzp_f16_x4(svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s32_x4)))
-svint32x4_t svuzp_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s64_x4)))
-svint64x4_t svuzp_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s16_x4)))
-svint16x4_t svuzp_s16_x4(svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u8_x2)))
-svuint8x2_t svuzpq_u8_x2(svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u32_x2)))
-svuint32x2_t svuzpq_u32_x2(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u64_x2)))
-svuint64x2_t svuzpq_u64_x2(svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u16_x2)))
-svuint16x2_t svuzpq_u16_x2(svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_bf16_x2)))
-svbfloat16x2_t svuzpq_bf16_x2(svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s8_x2)))
-svint8x2_t svuzpq_s8_x2(svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f64_x2)))
-svfloat64x2_t svuzpq_f64_x2(svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f32_x2)))
-svfloat32x2_t svuzpq_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f16_x2)))
-svfloat16x2_t svuzpq_f16_x2(svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s32_x2)))
-svint32x2_t svuzpq_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s64_x2)))
-svint64x2_t svuzpq_s64_x2(svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s16_x2)))
-svint16x2_t svuzpq_s16_x2(svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u8_x4)))
-svuint8x4_t svuzpq_u8_x4(svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u32_x4)))
-svuint32x4_t svuzpq_u32_x4(svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u64_x4)))
-svuint64x4_t svuzpq_u64_x4(svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u16_x4)))
-svuint16x4_t svuzpq_u16_x4(svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_bf16_x4)))
-svbfloat16x4_t svuzpq_bf16_x4(svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s8_x4)))
-svint8x4_t svuzpq_s8_x4(svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f64_x4)))
-svfloat64x4_t svuzpq_f64_x4(svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f32_x4)))
-svfloat32x4_t svuzpq_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f16_x4)))
-svfloat16x4_t svuzpq_f16_x4(svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s32_x4)))
-svint32x4_t svuzpq_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s64_x4)))
-svint64x4_t svuzpq_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s16_x4)))
-svint16x4_t svuzpq_s16_x4(svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u8_x2)))
-svuint8x2_t svzip_u8_x2(svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u32_x2)))
-svuint32x2_t svzip_u32_x2(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u64_x2)))
-svuint64x2_t svzip_u64_x2(svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u16_x2)))
-svuint16x2_t svzip_u16_x2(svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_bf16_x2)))
-svbfloat16x2_t svzip_bf16_x2(svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s8_x2)))
-svint8x2_t svzip_s8_x2(svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f64_x2)))
-svfloat64x2_t svzip_f64_x2(svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f32_x2)))
-svfloat32x2_t svzip_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f16_x2)))
-svfloat16x2_t svzip_f16_x2(svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s32_x2)))
-svint32x2_t svzip_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s64_x2)))
-svint64x2_t svzip_s64_x2(svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s16_x2)))
-svint16x2_t svzip_s16_x2(svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u8_x4)))
-svuint8x4_t svzip_u8_x4(svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u32_x4)))
-svuint32x4_t svzip_u32_x4(svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u64_x4)))
-svuint64x4_t svzip_u64_x4(svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u16_x4)))
-svuint16x4_t svzip_u16_x4(svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_bf16_x4)))
-svbfloat16x4_t svzip_bf16_x4(svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s8_x4)))
-svint8x4_t svzip_s8_x4(svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f64_x4)))
-svfloat64x4_t svzip_f64_x4(svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f32_x4)))
-svfloat32x4_t svzip_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f16_x4)))
-svfloat16x4_t svzip_f16_x4(svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s32_x4)))
-svint32x4_t svzip_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s64_x4)))
-svint64x4_t svzip_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s16_x4)))
-svint16x4_t svzip_s16_x4(svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u8_x2)))
-svuint8x2_t svzipq_u8_x2(svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u32_x2)))
-svuint32x2_t svzipq_u32_x2(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u64_x2)))
-svuint64x2_t svzipq_u64_x2(svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u16_x2)))
-svuint16x2_t svzipq_u16_x2(svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_bf16_x2)))
-svbfloat16x2_t svzipq_bf16_x2(svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s8_x2)))
-svint8x2_t svzipq_s8_x2(svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f64_x2)))
-svfloat64x2_t svzipq_f64_x2(svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f32_x2)))
-svfloat32x2_t svzipq_f32_x2(svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f16_x2)))
-svfloat16x2_t svzipq_f16_x2(svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s32_x2)))
-svint32x2_t svzipq_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s64_x2)))
-svint64x2_t svzipq_s64_x2(svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s16_x2)))
-svint16x2_t svzipq_s16_x2(svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u8_x4)))
-svuint8x4_t svzipq_u8_x4(svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u32_x4)))
-svuint32x4_t svzipq_u32_x4(svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u64_x4)))
-svuint64x4_t svzipq_u64_x4(svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u16_x4)))
-svuint16x4_t svzipq_u16_x4(svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_bf16_x4)))
-svbfloat16x4_t svzipq_bf16_x4(svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s8_x4)))
-svint8x4_t svzipq_s8_x4(svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f64_x4)))
-svfloat64x4_t svzipq_f64_x4(svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f32_x4)))
-svfloat32x4_t svzipq_f32_x4(svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f16_x4)))
-svfloat16x4_t svzipq_f16_x4(svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s32_x4)))
-svint32x4_t svzipq_s32_x4(svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s64_x4)))
-svint64x4_t svzipq_s64_x4(svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s16_x4)))
-svint16x4_t svzipq_s16_x4(svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u8_x2)))
-svuint8x2_t svadd(svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u32_x2)))
-svuint32x2_t svadd(svuint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u64_x2)))
-svuint64x2_t svadd(svuint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u16_x2)))
-svuint16x2_t svadd(svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s8_x2)))
-svint8x2_t svadd(svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s32_x2)))
-svint32x2_t svadd(svint32x2_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s64_x2)))
-svint64x2_t svadd(svint64x2_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s16_x2)))
-svint16x2_t svadd(svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u8_x4)))
-svuint8x4_t svadd(svuint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u32_x4)))
-svuint32x4_t svadd(svuint32x4_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u64_x4)))
-svuint64x4_t svadd(svuint64x4_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_u16_x4)))
-svuint16x4_t svadd(svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s8_x4)))
-svint8x4_t svadd(svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s32_x4)))
-svint32x4_t svadd(svint32x4_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s64_x4)))
-svint64x4_t svadd(svint64x4_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_single_s16_x4)))
-svint16x4_t svadd(svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f64_x2)))
-svfloat64x2_t svclamp(svfloat64x2_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f32_x2)))
-svfloat32x2_t svclamp(svfloat32x2_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f16_x2)))
-svfloat16x2_t svclamp(svfloat16x2_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s8_x2)))
-svint8x2_t svclamp(svint8x2_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s32_x2)))
-svint32x2_t svclamp(svint32x2_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s64_x2)))
-svint64x2_t svclamp(svint64x2_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s16_x2)))
-svint16x2_t svclamp(svint16x2_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u8_x2)))
-svuint8x2_t svclamp(svuint8x2_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u32_x2)))
-svuint32x2_t svclamp(svuint32x2_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u64_x2)))
-svuint64x2_t svclamp(svuint64x2_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u16_x2)))
-svuint16x2_t svclamp(svuint16x2_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f64_x4)))
-svfloat64x4_t svclamp(svfloat64x4_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f32_x4)))
-svfloat32x4_t svclamp(svfloat32x4_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_f16_x4)))
-svfloat16x4_t svclamp(svfloat16x4_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s8_x4)))
-svint8x4_t svclamp(svint8x4_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s32_x4)))
-svint32x4_t svclamp(svint32x4_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s64_x4)))
-svint64x4_t svclamp(svint64x4_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_s16_x4)))
-svint16x4_t svclamp(svint16x4_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u8_x4)))
-svuint8x4_t svclamp(svuint8x4_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u32_x4)))
-svuint32x4_t svclamp(svuint32x4_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u64_x4)))
-svuint64x4_t svclamp(svuint64x4_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_u16_x4)))
-svuint16x4_t svclamp(svuint16x4_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x2)))
-svbfloat16_t svcvt_bf16(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_x2)))
-svfloat16_t svcvt_f16(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_x2)))
-svint32x2_t svcvt_s32(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_x2)))
-svuint32x2_t svcvt_u32(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_x4)))
-svint32x4_t svcvt_s32(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_x4)))
-svuint32x4_t svcvt_u32(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_x2)))
-svfloat32x2_t svcvt_f32(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_x4)))
-svfloat32x4_t svcvt_f32(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_x2)))
-svfloat32x2_t svcvt_f32(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_x4)))
-svfloat32x4_t svcvt_f32(svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_bf16_f32_x2)))
-svbfloat16_t svcvtn_bf16(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_f16_f32_x2)))
-svfloat16_t svcvtn_f16(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_bf16_x2)))
-svbfloat16x2_t svmax(svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f64_x2)))
-svfloat64x2_t svmax(svfloat64x2_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f32_x2)))
-svfloat32x2_t svmax(svfloat32x2_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f16_x2)))
-svfloat16x2_t svmax(svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s8_x2)))
-svint8x2_t svmax(svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s32_x2)))
-svint32x2_t svmax(svint32x2_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s64_x2)))
-svint64x2_t svmax(svint64x2_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s16_x2)))
-svint16x2_t svmax(svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u8_x2)))
-svuint8x2_t svmax(svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u32_x2)))
-svuint32x2_t svmax(svuint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u64_x2)))
-svuint64x2_t svmax(svuint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u16_x2)))
-svuint16x2_t svmax(svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_bf16_x4)))
-svbfloat16x4_t svmax(svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f64_x4)))
-svfloat64x4_t svmax(svfloat64x4_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f32_x4)))
-svfloat32x4_t svmax(svfloat32x4_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_f16_x4)))
-svfloat16x4_t svmax(svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s8_x4)))
-svint8x4_t svmax(svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s32_x4)))
-svint32x4_t svmax(svint32x4_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s64_x4)))
-svint64x4_t svmax(svint64x4_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_s16_x4)))
-svint16x4_t svmax(svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u8_x4)))
-svuint8x4_t svmax(svuint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u32_x4)))
-svuint32x4_t svmax(svuint32x4_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u64_x4)))
-svuint64x4_t svmax(svuint64x4_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_single_u16_x4)))
-svuint16x4_t svmax(svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_x2)))
-svbfloat16x2_t svmax(svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_x2)))
-svfloat64x2_t svmax(svfloat64x2_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_x2)))
-svfloat32x2_t svmax(svfloat32x2_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_x2)))
-svfloat16x2_t svmax(svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_x2)))
-svint8x2_t svmax(svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_x2)))
-svint32x2_t svmax(svint32x2_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_x2)))
-svint64x2_t svmax(svint64x2_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_x2)))
-svint16x2_t svmax(svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_x2)))
-svuint8x2_t svmax(svuint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_x2)))
-svuint32x2_t svmax(svuint32x2_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_x2)))
-svuint64x2_t svmax(svuint64x2_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_x2)))
-svuint16x2_t svmax(svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_x4)))
-svbfloat16x4_t svmax(svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_x4)))
-svfloat64x4_t svmax(svfloat64x4_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_x4)))
-svfloat32x4_t svmax(svfloat32x4_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_x4)))
-svfloat16x4_t svmax(svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_x4)))
-svint8x4_t svmax(svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_x4)))
-svint32x4_t svmax(svint32x4_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_x4)))
-svint64x4_t svmax(svint64x4_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_x4)))
-svint16x4_t svmax(svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_x4)))
-svuint8x4_t svmax(svuint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_x4)))
-svuint32x4_t svmax(svuint32x4_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_x4)))
-svuint64x4_t svmax(svuint64x4_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_x4)))
-svuint16x4_t svmax(svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_bf16_x2)))
-svbfloat16x2_t svmaxnm(svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f64_x2)))
-svfloat64x2_t svmaxnm(svfloat64x2_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f32_x2)))
-svfloat32x2_t svmaxnm(svfloat32x2_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f16_x2)))
-svfloat16x2_t svmaxnm(svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_bf16_x4)))
-svbfloat16x4_t svmaxnm(svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f64_x4)))
-svfloat64x4_t svmaxnm(svfloat64x4_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f32_x4)))
-svfloat32x4_t svmaxnm(svfloat32x4_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_single_f16_x4)))
-svfloat16x4_t svmaxnm(svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_x2)))
-svbfloat16x2_t svmaxnm(svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_x2)))
-svfloat64x2_t svmaxnm(svfloat64x2_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_x2)))
-svfloat32x2_t svmaxnm(svfloat32x2_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_x2)))
-svfloat16x2_t svmaxnm(svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_x4)))
-svbfloat16x4_t svmaxnm(svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_x4)))
-svfloat64x4_t svmaxnm(svfloat64x4_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_x4)))
-svfloat32x4_t svmaxnm(svfloat32x4_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_x4)))
-svfloat16x4_t svmaxnm(svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_bf16_x2)))
-svbfloat16x2_t svmin(svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f64_x2)))
-svfloat64x2_t svmin(svfloat64x2_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f32_x2)))
-svfloat32x2_t svmin(svfloat32x2_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f16_x2)))
-svfloat16x2_t svmin(svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s8_x2)))
-svint8x2_t svmin(svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s32_x2)))
-svint32x2_t svmin(svint32x2_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s64_x2)))
-svint64x2_t svmin(svint64x2_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s16_x2)))
-svint16x2_t svmin(svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u8_x2)))
-svuint8x2_t svmin(svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u32_x2)))
-svuint32x2_t svmin(svuint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u64_x2)))
-svuint64x2_t svmin(svuint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u16_x2)))
-svuint16x2_t svmin(svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_bf16_x4)))
-svbfloat16x4_t svmin(svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f64_x4)))
-svfloat64x4_t svmin(svfloat64x4_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f32_x4)))
-svfloat32x4_t svmin(svfloat32x4_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_f16_x4)))
-svfloat16x4_t svmin(svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s8_x4)))
-svint8x4_t svmin(svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s32_x4)))
-svint32x4_t svmin(svint32x4_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s64_x4)))
-svint64x4_t svmin(svint64x4_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_s16_x4)))
-svint16x4_t svmin(svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u8_x4)))
-svuint8x4_t svmin(svuint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u32_x4)))
-svuint32x4_t svmin(svuint32x4_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u64_x4)))
-svuint64x4_t svmin(svuint64x4_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_single_u16_x4)))
-svuint16x4_t svmin(svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_x2)))
-svbfloat16x2_t svmin(svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_x2)))
-svfloat64x2_t svmin(svfloat64x2_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_x2)))
-svfloat32x2_t svmin(svfloat32x2_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_x2)))
-svfloat16x2_t svmin(svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_x2)))
-svint8x2_t svmin(svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_x2)))
-svint32x2_t svmin(svint32x2_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_x2)))
-svint64x2_t svmin(svint64x2_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_x2)))
-svint16x2_t svmin(svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_x2)))
-svuint8x2_t svmin(svuint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_x2)))
-svuint32x2_t svmin(svuint32x2_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_x2)))
-svuint64x2_t svmin(svuint64x2_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_x2)))
-svuint16x2_t svmin(svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_x4)))
-svbfloat16x4_t svmin(svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_x4)))
-svfloat64x4_t svmin(svfloat64x4_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_x4)))
-svfloat32x4_t svmin(svfloat32x4_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_x4)))
-svfloat16x4_t svmin(svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_x4)))
-svint8x4_t svmin(svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_x4)))
-svint32x4_t svmin(svint32x4_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_x4)))
-svint64x4_t svmin(svint64x4_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_x4)))
-svint16x4_t svmin(svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_x4)))
-svuint8x4_t svmin(svuint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_x4)))
-svuint32x4_t svmin(svuint32x4_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_x4)))
-svuint64x4_t svmin(svuint64x4_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_x4)))
-svuint16x4_t svmin(svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_bf16_x2)))
-svbfloat16x2_t svminnm(svbfloat16x2_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f64_x2)))
-svfloat64x2_t svminnm(svfloat64x2_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f32_x2)))
-svfloat32x2_t svminnm(svfloat32x2_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f16_x2)))
-svfloat16x2_t svminnm(svfloat16x2_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_bf16_x4)))
-svbfloat16x4_t svminnm(svbfloat16x4_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f64_x4)))
-svfloat64x4_t svminnm(svfloat64x4_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f32_x4)))
-svfloat32x4_t svminnm(svfloat32x4_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_single_f16_x4)))
-svfloat16x4_t svminnm(svfloat16x4_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x2)))
-svbfloat16x2_t svminnm(svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_x2)))
-svfloat64x2_t svminnm(svfloat64x2_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_x2)))
-svfloat32x2_t svminnm(svfloat32x2_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_x2)))
-svfloat16x2_t svminnm(svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x4)))
-svbfloat16x4_t svminnm(svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_x4)))
-svfloat64x4_t svminnm(svfloat64x4_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_x4)))
-svfloat32x4_t svminnm(svfloat32x4_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_x4)))
-svfloat16x4_t svminnm(svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_s16_s32_x2)))
-svint16_t svqcvt_s16(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_s16_s64_x4)))
-svint16_t svqcvt_s16(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_s8_s32_x4)))
-svint8_t svqcvt_s8(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_s32_x2)))
-svuint16_t svqcvt_u16(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_u32_x2)))
-svuint16_t svqcvt_u16(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_s64_x4)))
-svuint16_t svqcvt_u16(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u16_u64_x4)))
-svuint16_t svqcvt_u16(svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u8_s32_x4)))
-svuint8_t svqcvt_u8(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvt_u8_u32_x4)))
-svuint8_t svqcvt_u8(svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_s16_s64_x4)))
-svint16_t svqcvtn_s16(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_s8_s32_x4)))
-svint8_t svqcvtn_s8(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_s64_x4)))
-svuint16_t svqcvtn_u16(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_u64_x4)))
-svuint16_t svqcvtn_u16(svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u8_s32_x4)))
-svuint8_t svqcvtn_u8(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u8_u32_x4)))
-svuint8_t svqcvtn_u8(svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s8_x2)))
-svint8x2_t svqdmulh(svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s32_x2)))
-svint32x2_t svqdmulh(svint32x2_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s64_x2)))
-svint64x2_t svqdmulh(svint64x2_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s16_x2)))
-svint16x2_t svqdmulh(svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s8_x4)))
-svint8x4_t svqdmulh(svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s32_x4)))
-svint32x4_t svqdmulh(svint32x4_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s64_x4)))
-svint64x4_t svqdmulh(svint64x4_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_single_s16_x4)))
-svint16x4_t svqdmulh(svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s8_x2)))
-svint8x2_t svqdmulh(svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s32_x2)))
-svint32x2_t svqdmulh(svint32x2_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s64_x2)))
-svint64x2_t svqdmulh(svint64x2_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s16_x2)))
-svint16x2_t svqdmulh(svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s8_x4)))
-svint8x4_t svqdmulh(svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s32_x4)))
-svint32x4_t svqdmulh(svint32x4_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s64_x4)))
-svint64x4_t svqdmulh(svint64x4_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s16_x4)))
-svint16x4_t svqdmulh(svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_s16_s32_x2)))
-svint16_t svqrshr_s16(svint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_u16_u32_x2)))
-svuint16_t svqrshr_u16(svuint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_s8_s32_x4)))
-svint8_t svqrshr_s8(svint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_s16_s64_x4)))
-svint16_t svqrshr_s16(svint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_u8_u32_x4)))
-svuint8_t svqrshr_u8(svuint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshr_n_u16_u64_x4)))
-svuint16_t svqrshr_u16(svuint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_s8_s32_x4)))
-svint8_t svqrshrn_s8(svint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_s16_s64_x4)))
-svint16_t svqrshrn_s16(svint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_u8_u32_x4)))
-svuint8_t svqrshrn_u8(svuint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_u16_u64_x4)))
-svuint16_t svqrshrn_u16(svuint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshru_n_u16_s32_x2)))
-svuint16_t svqrshru_u16(svint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshru_n_u8_s32_x4)))
-svuint8_t svqrshru_u8(svint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshru_n_u16_s64_x4)))
-svuint16_t svqrshru_u16(svint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrun_n_u8_s32_x4)))
-svuint8_t svqrshrun_u8(svint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrun_n_u16_s64_x4)))
-svuint16_t svqrshrun_u16(svint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_x2)))
-svfloat32x2_t svrinta(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_x4)))
-svfloat32x4_t svrinta(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_x2)))
-svfloat32x2_t svrintm(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_x4)))
-svfloat32x4_t svrintm(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_x2)))
-svfloat32x2_t svrintn(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_x4)))
-svfloat32x4_t svrintn(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_x2)))
-svfloat32x2_t svrintp(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_x4)))
-svfloat32x4_t svrintp(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s8_x2)))
-svint8x2_t svrshl(svint8x2_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s32_x2)))
-svint32x2_t svrshl(svint32x2_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s64_x2)))
-svint64x2_t svrshl(svint64x2_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s16_x2)))
-svint16x2_t svrshl(svint16x2_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u8_x2)))
-svuint8x2_t svrshl(svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u32_x2)))
-svuint32x2_t svrshl(svuint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u64_x2)))
-svuint64x2_t svrshl(svuint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u16_x2)))
-svuint16x2_t svrshl(svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s8_x4)))
-svint8x4_t svrshl(svint8x4_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s32_x4)))
-svint32x4_t svrshl(svint32x4_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s64_x4)))
-svint64x4_t svrshl(svint64x4_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_s16_x4)))
-svint16x4_t svrshl(svint16x4_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u8_x4)))
-svuint8x4_t svrshl(svuint8x4_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u32_x4)))
-svuint32x4_t svrshl(svuint32x4_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u64_x4)))
-svuint64x4_t svrshl(svuint64x4_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_single_u16_x4)))
-svuint16x4_t svrshl(svuint16x4_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_x2)))
-svint8x2_t svrshl(svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_x2)))
-svint32x2_t svrshl(svint32x2_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_x2)))
-svint64x2_t svrshl(svint64x2_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_x2)))
-svint16x2_t svrshl(svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_x2)))
-svuint8x2_t svrshl(svuint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_x2)))
-svuint32x2_t svrshl(svuint32x2_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_x2)))
-svuint64x2_t svrshl(svuint64x2_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_x2)))
-svuint16x2_t svrshl(svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_x4)))
-svint8x4_t svrshl(svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_x4)))
-svint32x4_t svrshl(svint32x4_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_x4)))
-svint64x4_t svrshl(svint64x4_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_x4)))
-svint16x4_t svrshl(svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_x4)))
-svuint8x4_t svrshl(svuint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_x4)))
-svuint32x4_t svrshl(svuint32x4_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_x4)))
-svuint64x4_t svrshl(svuint64x4_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_x4)))
-svuint16x4_t svrshl(svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u8_x2)))
-svuint8x2_t svsel(svcount_t, svuint8x2_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u32_x2)))
-svuint32x2_t svsel(svcount_t, svuint32x2_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u64_x2)))
-svuint64x2_t svsel(svcount_t, svuint64x2_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u16_x2)))
-svuint16x2_t svsel(svcount_t, svuint16x2_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16_x2)))
-svbfloat16x2_t svsel(svcount_t, svbfloat16x2_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s8_x2)))
-svint8x2_t svsel(svcount_t, svint8x2_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f64_x2)))
-svfloat64x2_t svsel(svcount_t, svfloat64x2_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f32_x2)))
-svfloat32x2_t svsel(svcount_t, svfloat32x2_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f16_x2)))
-svfloat16x2_t svsel(svcount_t, svfloat16x2_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s32_x2)))
-svint32x2_t svsel(svcount_t, svint32x2_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s64_x2)))
-svint64x2_t svsel(svcount_t, svint64x2_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s16_x2)))
-svint16x2_t svsel(svcount_t, svint16x2_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u8_x4)))
-svuint8x4_t svsel(svcount_t, svuint8x4_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u32_x4)))
-svuint32x4_t svsel(svcount_t, svuint32x4_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u64_x4)))
-svuint64x4_t svsel(svcount_t, svuint64x4_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u16_x4)))
-svuint16x4_t svsel(svcount_t, svuint16x4_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16_x4)))
-svbfloat16x4_t svsel(svcount_t, svbfloat16x4_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s8_x4)))
-svint8x4_t svsel(svcount_t, svint8x4_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f64_x4)))
-svfloat64x4_t svsel(svcount_t, svfloat64x4_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f32_x4)))
-svfloat32x4_t svsel(svcount_t, svfloat32x4_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f16_x4)))
-svfloat16x4_t svsel(svcount_t, svfloat16x4_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s32_x4)))
-svint32x4_t svsel(svcount_t, svint32x4_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s64_x4)))
-svint64x4_t svsel(svcount_t, svint64x4_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s16_x4)))
-svint16x4_t svsel(svcount_t, svint16x4_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s32_s16_x2)))
-svint32x2_t svunpk_s32(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s64_s32_x2)))
-svint64x2_t svunpk_s64(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s16_s8_x2)))
-svint16x2_t svunpk_s16(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u32_u16_x2)))
-svuint32x2_t svunpk_u32(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u64_u32_x2)))
-svuint64x2_t svunpk_u64(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u16_u8_x2)))
-svuint16x2_t svunpk_u16(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s32_s16_x4)))
-svint32x4_t svunpk_s32(svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s64_s32_x4)))
-svint64x4_t svunpk_s64(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_s16_s8_x4)))
-svint16x4_t svunpk_s16(svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u32_u16_x4)))
-svuint32x4_t svunpk_u32(svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u64_u32_x4)))
-svuint64x4_t svunpk_u64(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpk_u16_u8_x4)))
-svuint16x4_t svunpk_u16(svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u8_x2)))
-svuint8x2_t svuzp(svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u32_x2)))
-svuint32x2_t svuzp(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u64_x2)))
-svuint64x2_t svuzp(svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u16_x2)))
-svuint16x2_t svuzp(svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_bf16_x2)))
-svbfloat16x2_t svuzp(svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s8_x2)))
-svint8x2_t svuzp(svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f64_x2)))
-svfloat64x2_t svuzp(svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f32_x2)))
-svfloat32x2_t svuzp(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f16_x2)))
-svfloat16x2_t svuzp(svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s32_x2)))
-svint32x2_t svuzp(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s64_x2)))
-svint64x2_t svuzp(svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s16_x2)))
-svint16x2_t svuzp(svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u8_x4)))
-svuint8x4_t svuzp(svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u32_x4)))
-svuint32x4_t svuzp(svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u64_x4)))
-svuint64x4_t svuzp(svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_u16_x4)))
-svuint16x4_t svuzp(svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_bf16_x4)))
-svbfloat16x4_t svuzp(svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s8_x4)))
-svint8x4_t svuzp(svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f64_x4)))
-svfloat64x4_t svuzp(svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f32_x4)))
-svfloat32x4_t svuzp(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_f16_x4)))
-svfloat16x4_t svuzp(svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s32_x4)))
-svint32x4_t svuzp(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s64_x4)))
-svint64x4_t svuzp(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp_s16_x4)))
-svint16x4_t svuzp(svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u8_x2)))
-svuint8x2_t svuzpq(svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u32_x2)))
-svuint32x2_t svuzpq(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u64_x2)))
-svuint64x2_t svuzpq(svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u16_x2)))
-svuint16x2_t svuzpq(svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_bf16_x2)))
-svbfloat16x2_t svuzpq(svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s8_x2)))
-svint8x2_t svuzpq(svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f64_x2)))
-svfloat64x2_t svuzpq(svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f32_x2)))
-svfloat32x2_t svuzpq(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f16_x2)))
-svfloat16x2_t svuzpq(svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s32_x2)))
-svint32x2_t svuzpq(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s64_x2)))
-svint64x2_t svuzpq(svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s16_x2)))
-svint16x2_t svuzpq(svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u8_x4)))
-svuint8x4_t svuzpq(svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u32_x4)))
-svuint32x4_t svuzpq(svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u64_x4)))
-svuint64x4_t svuzpq(svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_u16_x4)))
-svuint16x4_t svuzpq(svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_bf16_x4)))
-svbfloat16x4_t svuzpq(svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s8_x4)))
-svint8x4_t svuzpq(svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f64_x4)))
-svfloat64x4_t svuzpq(svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f32_x4)))
-svfloat32x4_t svuzpq(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_f16_x4)))
-svfloat16x4_t svuzpq(svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s32_x4)))
-svint32x4_t svuzpq(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s64_x4)))
-svint64x4_t svuzpq(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq_s16_x4)))
-svint16x4_t svuzpq(svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u8_x2)))
-svuint8x2_t svzip(svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u32_x2)))
-svuint32x2_t svzip(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u64_x2)))
-svuint64x2_t svzip(svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u16_x2)))
-svuint16x2_t svzip(svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_bf16_x2)))
-svbfloat16x2_t svzip(svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s8_x2)))
-svint8x2_t svzip(svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f64_x2)))
-svfloat64x2_t svzip(svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f32_x2)))
-svfloat32x2_t svzip(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f16_x2)))
-svfloat16x2_t svzip(svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s32_x2)))
-svint32x2_t svzip(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s64_x2)))
-svint64x2_t svzip(svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s16_x2)))
-svint16x2_t svzip(svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u8_x4)))
-svuint8x4_t svzip(svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u32_x4)))
-svuint32x4_t svzip(svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u64_x4)))
-svuint64x4_t svzip(svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_u16_x4)))
-svuint16x4_t svzip(svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_bf16_x4)))
-svbfloat16x4_t svzip(svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s8_x4)))
-svint8x4_t svzip(svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f64_x4)))
-svfloat64x4_t svzip(svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f32_x4)))
-svfloat32x4_t svzip(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_f16_x4)))
-svfloat16x4_t svzip(svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s32_x4)))
-svint32x4_t svzip(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s64_x4)))
-svint64x4_t svzip(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip_s16_x4)))
-svint16x4_t svzip(svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u8_x2)))
-svuint8x2_t svzipq(svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u32_x2)))
-svuint32x2_t svzipq(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u64_x2)))
-svuint64x2_t svzipq(svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u16_x2)))
-svuint16x2_t svzipq(svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_bf16_x2)))
-svbfloat16x2_t svzipq(svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s8_x2)))
-svint8x2_t svzipq(svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f64_x2)))
-svfloat64x2_t svzipq(svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f32_x2)))
-svfloat32x2_t svzipq(svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f16_x2)))
-svfloat16x2_t svzipq(svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s32_x2)))
-svint32x2_t svzipq(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s64_x2)))
-svint64x2_t svzipq(svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s16_x2)))
-svint16x2_t svzipq(svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u8_x4)))
-svuint8x4_t svzipq(svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u32_x4)))
-svuint32x4_t svzipq(svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u64_x4)))
-svuint64x4_t svzipq(svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_u16_x4)))
-svuint16x4_t svzipq(svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_bf16_x4)))
-svbfloat16x4_t svzipq(svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s8_x4)))
-svint8x4_t svzipq(svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f64_x4)))
-svfloat64x4_t svzipq(svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f32_x4)))
-svfloat32x4_t svzipq(svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_f16_x4)))
-svfloat16x4_t svzipq(svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s32_x4)))
-svint32x4_t svzipq(svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s64_x4)))
-svint64x4_t svzipq(svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s16_x4)))
-svint16x4_t svzipq(svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_bf16_x2)))
-svbfloat16x2_t svclamp_single_bf16_x2(svbfloat16x2_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_bf16_x4)))
-svbfloat16x4_t svclamp_single_bf16_x4(svbfloat16x4_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_bf16_x2)))
-svbfloat16x2_t svclamp(svbfloat16x2_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_bf16_x4)))
-svbfloat16x4_t svclamp(svbfloat16x4_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u32base_u32offset)))
-svuint32_t svadrb_u32base_u32offset(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u64base_u64offset)))
-svuint64_t svadrb_u64base_u64offset(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u32base_s32offset)))
-svuint32_t svadrb_u32base_s32offset(svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u64base_s64offset)))
-svuint64_t svadrb_u64base_s64offset(svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u32base_u32index)))
-svuint32_t svadrd_u32base_u32index(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u64base_u64index)))
-svuint64_t svadrd_u64base_u64index(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u32base_s32index)))
-svuint32_t svadrd_u32base_s32index(svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u64base_s64index)))
-svuint64_t svadrd_u64base_s64index(svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u32base_u32index)))
-svuint32_t svadrh_u32base_u32index(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u64base_u64index)))
-svuint64_t svadrh_u64base_u64index(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u32base_s32index)))
-svuint32_t svadrh_u32base_s32index(svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u64base_s64index)))
-svuint64_t svadrh_u64base_s64index(svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u32base_u32index)))
-svuint32_t svadrw_u32base_u32index(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u64base_u64index)))
-svuint64_t svadrw_u64base_u64index(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u32base_s32index)))
-svuint32_t svadrw_u32base_s32index(svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u64base_s64index)))
-svuint64_t svadrw_u64base_s64index(svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_u32)))
-svuint32_t svcompact_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_u64)))
-svuint64_t svcompact_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_f64)))
-svfloat64_t svcompact_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_f32)))
-svfloat32_t svcompact_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_s32)))
-svint32_t svcompact_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_s64)))
-svint64_t svcompact_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexpa_f64)))
-svfloat64_t svexpa_f64(svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexpa_f32)))
-svfloat32_t svexpa_f32(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexpa_f16)))
-svfloat16_t svexpa_f16(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_index_u32)))
-svuint32_t svld1_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_index_u64)))
-svuint64_t svld1_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_index_f64)))
-svfloat64_t svld1_gather_u64base_index_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_index_f32)))
-svfloat32_t svld1_gather_u32base_index_f32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_index_s32)))
-svint32_t svld1_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_index_s64)))
-svint64_t svld1_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_offset_u32)))
-svuint32_t svld1_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_offset_u64)))
-svuint64_t svld1_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_offset_f64)))
-svfloat64_t svld1_gather_u64base_offset_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_offset_f32)))
-svfloat32_t svld1_gather_u32base_offset_f32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_offset_s32)))
-svint32_t svld1_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_offset_s64)))
-svint64_t svld1_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_u32)))
-svuint32_t svld1_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_u64)))
-svuint64_t svld1_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_f64)))
-svfloat64_t svld1_gather_u64base_f64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_f32)))
-svfloat32_t svld1_gather_u32base_f32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_s32)))
-svint32_t svld1_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_s64)))
-svint64_t svld1_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32index_u32)))
-svuint32_t svld1_gather_s32index_u32(svbool_t, uint32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32index_f32)))
-svfloat32_t svld1_gather_s32index_f32(svbool_t, float32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32index_s32)))
-svint32_t svld1_gather_s32index_s32(svbool_t, int32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32index_u32)))
-svuint32_t svld1_gather_u32index_u32(svbool_t, uint32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32index_f32)))
-svfloat32_t svld1_gather_u32index_f32(svbool_t, float32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32index_s32)))
-svint32_t svld1_gather_u32index_s32(svbool_t, int32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64index_u64)))
-svuint64_t svld1_gather_s64index_u64(svbool_t, uint64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64index_f64)))
-svfloat64_t svld1_gather_s64index_f64(svbool_t, float64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64index_s64)))
-svint64_t svld1_gather_s64index_s64(svbool_t, int64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64index_u64)))
-svuint64_t svld1_gather_u64index_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64index_f64)))
-svfloat64_t svld1_gather_u64index_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64index_s64)))
-svint64_t svld1_gather_u64index_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32offset_u32)))
-svuint32_t svld1_gather_s32offset_u32(svbool_t, uint32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32offset_f32)))
-svfloat32_t svld1_gather_s32offset_f32(svbool_t, float32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32offset_s32)))
-svint32_t svld1_gather_s32offset_s32(svbool_t, int32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32offset_u32)))
-svuint32_t svld1_gather_u32offset_u32(svbool_t, uint32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32offset_f32)))
-svfloat32_t svld1_gather_u32offset_f32(svbool_t, float32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32offset_s32)))
-svint32_t svld1_gather_u32offset_s32(svbool_t, int32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64offset_u64)))
-svuint64_t svld1_gather_s64offset_u64(svbool_t, uint64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64offset_f64)))
-svfloat64_t svld1_gather_s64offset_f64(svbool_t, float64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64offset_s64)))
-svint64_t svld1_gather_s64offset_s64(svbool_t, int64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64offset_u64)))
-svuint64_t svld1_gather_u64offset_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64offset_f64)))
-svfloat64_t svld1_gather_u64offset_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64offset_s64)))
-svint64_t svld1_gather_u64offset_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_offset_u32)))
-svuint32_t svld1sb_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_offset_u64)))
-svuint64_t svld1sb_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_offset_s32)))
-svint32_t svld1sb_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_offset_s64)))
-svint64_t svld1sb_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_u32)))
-svuint32_t svld1sb_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_u64)))
-svuint64_t svld1sb_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_s32)))
-svint32_t svld1sb_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_s64)))
-svint64_t svld1sb_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s32offset_u32)))
-svuint32_t svld1sb_gather_s32offset_u32(svbool_t, int8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s32offset_s32)))
-svint32_t svld1sb_gather_s32offset_s32(svbool_t, int8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32offset_u32)))
-svuint32_t svld1sb_gather_u32offset_u32(svbool_t, int8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32offset_s32)))
-svint32_t svld1sb_gather_u32offset_s32(svbool_t, int8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s64offset_u64)))
-svuint64_t svld1sb_gather_s64offset_u64(svbool_t, int8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s64offset_s64)))
-svint64_t svld1sb_gather_s64offset_s64(svbool_t, int8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64offset_u64)))
-svuint64_t svld1sb_gather_u64offset_u64(svbool_t, int8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64offset_s64)))
-svint64_t svld1sb_gather_u64offset_s64(svbool_t, int8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_index_u32)))
-svuint32_t svld1sh_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_index_u64)))
-svuint64_t svld1sh_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_index_s32)))
-svint32_t svld1sh_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_index_s64)))
-svint64_t svld1sh_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_offset_u32)))
-svuint32_t svld1sh_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_offset_u64)))
-svuint64_t svld1sh_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_offset_s32)))
-svint32_t svld1sh_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_offset_s64)))
-svint64_t svld1sh_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_u32)))
-svuint32_t svld1sh_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_u64)))
-svuint64_t svld1sh_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_s32)))
-svint32_t svld1sh_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_s64)))
-svint64_t svld1sh_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32index_u32)))
-svuint32_t svld1sh_gather_s32index_u32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32index_s32)))
-svint32_t svld1sh_gather_s32index_s32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32index_u32)))
-svuint32_t svld1sh_gather_u32index_u32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32index_s32)))
-svint32_t svld1sh_gather_u32index_s32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64index_u64)))
-svuint64_t svld1sh_gather_s64index_u64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64index_s64)))
-svint64_t svld1sh_gather_s64index_s64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64index_u64)))
-svuint64_t svld1sh_gather_u64index_u64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64index_s64)))
-svint64_t svld1sh_gather_u64index_s64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32offset_u32)))
-svuint32_t svld1sh_gather_s32offset_u32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32offset_s32)))
-svint32_t svld1sh_gather_s32offset_s32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32offset_u32)))
-svuint32_t svld1sh_gather_u32offset_u32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32offset_s32)))
-svint32_t svld1sh_gather_u32offset_s32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64offset_u64)))
-svuint64_t svld1sh_gather_s64offset_u64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64offset_s64)))
-svint64_t svld1sh_gather_s64offset_s64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64offset_u64)))
-svuint64_t svld1sh_gather_u64offset_u64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64offset_s64)))
-svint64_t svld1sh_gather_u64offset_s64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_index_u64)))
-svuint64_t svld1sw_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_index_s64)))
-svint64_t svld1sw_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_offset_u64)))
-svuint64_t svld1sw_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_offset_s64)))
-svint64_t svld1sw_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_u64)))
-svuint64_t svld1sw_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_s64)))
-svint64_t svld1sw_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64index_u64)))
-svuint64_t svld1sw_gather_s64index_u64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64index_s64)))
-svint64_t svld1sw_gather_s64index_s64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64index_u64)))
-svuint64_t svld1sw_gather_u64index_u64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64index_s64)))
-svint64_t svld1sw_gather_u64index_s64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64offset_u64)))
-svuint64_t svld1sw_gather_s64offset_u64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64offset_s64)))
-svint64_t svld1sw_gather_s64offset_s64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64offset_u64)))
-svuint64_t svld1sw_gather_u64offset_u64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64offset_s64)))
-svint64_t svld1sw_gather_u64offset_s64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_offset_u32)))
-svuint32_t svld1ub_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_offset_u64)))
-svuint64_t svld1ub_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_offset_s32)))
-svint32_t svld1ub_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_offset_s64)))
-svint64_t svld1ub_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_u32)))
-svuint32_t svld1ub_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_u64)))
-svuint64_t svld1ub_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_s32)))
-svint32_t svld1ub_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_s64)))
-svint64_t svld1ub_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s32offset_u32)))
-svuint32_t svld1ub_gather_s32offset_u32(svbool_t, uint8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s32offset_s32)))
-svint32_t svld1ub_gather_s32offset_s32(svbool_t, uint8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32offset_u32)))
-svuint32_t svld1ub_gather_u32offset_u32(svbool_t, uint8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32offset_s32)))
-svint32_t svld1ub_gather_u32offset_s32(svbool_t, uint8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s64offset_u64)))
-svuint64_t svld1ub_gather_s64offset_u64(svbool_t, uint8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s64offset_s64)))
-svint64_t svld1ub_gather_s64offset_s64(svbool_t, uint8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64offset_u64)))
-svuint64_t svld1ub_gather_u64offset_u64(svbool_t, uint8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64offset_s64)))
-svint64_t svld1ub_gather_u64offset_s64(svbool_t, uint8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_index_u32)))
-svuint32_t svld1uh_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_index_u64)))
-svuint64_t svld1uh_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_index_s32)))
-svint32_t svld1uh_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_index_s64)))
-svint64_t svld1uh_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_offset_u32)))
-svuint32_t svld1uh_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_offset_u64)))
-svuint64_t svld1uh_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_offset_s32)))
-svint32_t svld1uh_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_offset_s64)))
-svint64_t svld1uh_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_u32)))
-svuint32_t svld1uh_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_u64)))
-svuint64_t svld1uh_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_s32)))
-svint32_t svld1uh_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_s64)))
-svint64_t svld1uh_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32index_u32)))
-svuint32_t svld1uh_gather_s32index_u32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32index_s32)))
-svint32_t svld1uh_gather_s32index_s32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32index_u32)))
-svuint32_t svld1uh_gather_u32index_u32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32index_s32)))
-svint32_t svld1uh_gather_u32index_s32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64index_u64)))
-svuint64_t svld1uh_gather_s64index_u64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64index_s64)))
-svint64_t svld1uh_gather_s64index_s64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64index_u64)))
-svuint64_t svld1uh_gather_u64index_u64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64index_s64)))
-svint64_t svld1uh_gather_u64index_s64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32offset_u32)))
-svuint32_t svld1uh_gather_s32offset_u32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32offset_s32)))
-svint32_t svld1uh_gather_s32offset_s32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32offset_u32)))
-svuint32_t svld1uh_gather_u32offset_u32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32offset_s32)))
-svint32_t svld1uh_gather_u32offset_s32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64offset_u64)))
-svuint64_t svld1uh_gather_s64offset_u64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64offset_s64)))
-svint64_t svld1uh_gather_s64offset_s64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64offset_u64)))
-svuint64_t svld1uh_gather_u64offset_u64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64offset_s64)))
-svint64_t svld1uh_gather_u64offset_s64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_index_u64)))
-svuint64_t svld1uw_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_index_s64)))
-svint64_t svld1uw_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_offset_u64)))
-svuint64_t svld1uw_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_offset_s64)))
-svint64_t svld1uw_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_u64)))
-svuint64_t svld1uw_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_s64)))
-svint64_t svld1uw_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64index_u64)))
-svuint64_t svld1uw_gather_s64index_u64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64index_s64)))
-svint64_t svld1uw_gather_s64index_s64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64index_u64)))
-svuint64_t svld1uw_gather_u64index_u64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64index_s64)))
-svint64_t svld1uw_gather_u64index_s64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64offset_u64)))
-svuint64_t svld1uw_gather_s64offset_u64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64offset_s64)))
-svint64_t svld1uw_gather_s64offset_s64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64offset_u64)))
-svuint64_t svld1uw_gather_u64offset_u64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64offset_s64)))
-svint64_t svld1uw_gather_u64offset_s64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u8)))
-svuint8_t svldff1_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u32)))
-svuint32_t svldff1_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u64)))
-svuint64_t svldff1_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u16)))
-svuint16_t svldff1_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s8)))
-svint8_t svldff1_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_f64)))
-svfloat64_t svldff1_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_f32)))
-svfloat32_t svldff1_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_f16)))
-svfloat16_t svldff1_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s32)))
-svint32_t svldff1_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s64)))
-svint64_t svldff1_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s16)))
-svint16_t svldff1_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_index_u32)))
-svuint32_t svldff1_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_index_u64)))
-svuint64_t svldff1_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_index_f64)))
-svfloat64_t svldff1_gather_u64base_index_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_index_f32)))
-svfloat32_t svldff1_gather_u32base_index_f32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_index_s32)))
-svint32_t svldff1_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_index_s64)))
-svint64_t svldff1_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_offset_u32)))
-svuint32_t svldff1_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_offset_u64)))
-svuint64_t svldff1_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_offset_f64)))
-svfloat64_t svldff1_gather_u64base_offset_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_offset_f32)))
-svfloat32_t svldff1_gather_u32base_offset_f32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_offset_s32)))
-svint32_t svldff1_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_offset_s64)))
-svint64_t svldff1_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_u32)))
-svuint32_t svldff1_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_u64)))
-svuint64_t svldff1_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_f64)))
-svfloat64_t svldff1_gather_u64base_f64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_f32)))
-svfloat32_t svldff1_gather_u32base_f32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_s32)))
-svint32_t svldff1_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_s64)))
-svint64_t svldff1_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32index_u32)))
-svuint32_t svldff1_gather_s32index_u32(svbool_t, uint32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32index_f32)))
-svfloat32_t svldff1_gather_s32index_f32(svbool_t, float32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32index_s32)))
-svint32_t svldff1_gather_s32index_s32(svbool_t, int32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32index_u32)))
-svuint32_t svldff1_gather_u32index_u32(svbool_t, uint32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32index_f32)))
-svfloat32_t svldff1_gather_u32index_f32(svbool_t, float32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32index_s32)))
-svint32_t svldff1_gather_u32index_s32(svbool_t, int32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64index_u64)))
-svuint64_t svldff1_gather_s64index_u64(svbool_t, uint64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64index_f64)))
-svfloat64_t svldff1_gather_s64index_f64(svbool_t, float64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64index_s64)))
-svint64_t svldff1_gather_s64index_s64(svbool_t, int64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64index_u64)))
-svuint64_t svldff1_gather_u64index_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64index_f64)))
-svfloat64_t svldff1_gather_u64index_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64index_s64)))
-svint64_t svldff1_gather_u64index_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32offset_u32)))
-svuint32_t svldff1_gather_s32offset_u32(svbool_t, uint32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32offset_f32)))
-svfloat32_t svldff1_gather_s32offset_f32(svbool_t, float32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32offset_s32)))
-svint32_t svldff1_gather_s32offset_s32(svbool_t, int32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32offset_u32)))
-svuint32_t svldff1_gather_u32offset_u32(svbool_t, uint32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32offset_f32)))
-svfloat32_t svldff1_gather_u32offset_f32(svbool_t, float32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32offset_s32)))
-svint32_t svldff1_gather_u32offset_s32(svbool_t, int32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64offset_u64)))
-svuint64_t svldff1_gather_s64offset_u64(svbool_t, uint64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64offset_f64)))
-svfloat64_t svldff1_gather_s64offset_f64(svbool_t, float64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64offset_s64)))
-svint64_t svldff1_gather_s64offset_s64(svbool_t, int64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64offset_u64)))
-svuint64_t svldff1_gather_u64offset_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64offset_f64)))
-svfloat64_t svldff1_gather_u64offset_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64offset_s64)))
-svint64_t svldff1_gather_u64offset_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u8)))
-svuint8_t svldff1_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u32)))
-svuint32_t svldff1_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u64)))
-svuint64_t svldff1_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u16)))
-svuint16_t svldff1_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s8)))
-svint8_t svldff1_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_f64)))
-svfloat64_t svldff1_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_f32)))
-svfloat32_t svldff1_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_f16)))
-svfloat16_t svldff1_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s32)))
-svint32_t svldff1_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s64)))
-svint64_t svldff1_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s16)))
-svint16_t svldff1_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_offset_u32)))
-svuint32_t svldff1sb_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_offset_u64)))
-svuint64_t svldff1sb_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_offset_s32)))
-svint32_t svldff1sb_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_offset_s64)))
-svint64_t svldff1sb_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_u32)))
-svuint32_t svldff1sb_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_u64)))
-svuint64_t svldff1sb_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_s32)))
-svint32_t svldff1sb_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_s64)))
-svint64_t svldff1sb_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s32offset_u32)))
-svuint32_t svldff1sb_gather_s32offset_u32(svbool_t, int8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s32offset_s32)))
-svint32_t svldff1sb_gather_s32offset_s32(svbool_t, int8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32offset_u32)))
-svuint32_t svldff1sb_gather_u32offset_u32(svbool_t, int8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32offset_s32)))
-svint32_t svldff1sb_gather_u32offset_s32(svbool_t, int8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s64offset_u64)))
-svuint64_t svldff1sb_gather_s64offset_u64(svbool_t, int8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s64offset_s64)))
-svint64_t svldff1sb_gather_s64offset_s64(svbool_t, int8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64offset_u64)))
-svuint64_t svldff1sb_gather_u64offset_u64(svbool_t, int8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64offset_s64)))
-svint64_t svldff1sb_gather_u64offset_s64(svbool_t, int8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_vnum_u32)))
-svuint32_t svldff1sb_vnum_u32(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_vnum_u64)))
-svuint64_t svldff1sb_vnum_u64(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_vnum_u16)))
-svuint16_t svldff1sb_vnum_u16(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_vnum_s32)))
-svint32_t svldff1sb_vnum_s32(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_vnum_s64)))
-svint64_t svldff1sb_vnum_s64(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_vnum_s16)))
-svint16_t svldff1sb_vnum_s16(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_u32)))
-svuint32_t svldff1sb_u32(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_u64)))
-svuint64_t svldff1sb_u64(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_u16)))
-svuint16_t svldff1sb_u16(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_s32)))
-svint32_t svldff1sb_s32(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_s64)))
-svint64_t svldff1sb_s64(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_s16)))
-svint16_t svldff1sb_s16(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_index_u32)))
-svuint32_t svldff1sh_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_index_u64)))
-svuint64_t svldff1sh_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_index_s32)))
-svint32_t svldff1sh_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_index_s64)))
-svint64_t svldff1sh_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_offset_u32)))
-svuint32_t svldff1sh_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_offset_u64)))
-svuint64_t svldff1sh_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_offset_s32)))
-svint32_t svldff1sh_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_offset_s64)))
-svint64_t svldff1sh_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_u32)))
-svuint32_t svldff1sh_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_u64)))
-svuint64_t svldff1sh_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_s32)))
-svint32_t svldff1sh_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_s64)))
-svint64_t svldff1sh_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32index_u32)))
-svuint32_t svldff1sh_gather_s32index_u32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32index_s32)))
-svint32_t svldff1sh_gather_s32index_s32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32index_u32)))
-svuint32_t svldff1sh_gather_u32index_u32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32index_s32)))
-svint32_t svldff1sh_gather_u32index_s32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64index_u64)))
-svuint64_t svldff1sh_gather_s64index_u64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64index_s64)))
-svint64_t svldff1sh_gather_s64index_s64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64index_u64)))
-svuint64_t svldff1sh_gather_u64index_u64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64index_s64)))
-svint64_t svldff1sh_gather_u64index_s64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32offset_u32)))
-svuint32_t svldff1sh_gather_s32offset_u32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32offset_s32)))
-svint32_t svldff1sh_gather_s32offset_s32(svbool_t, int16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32offset_u32)))
-svuint32_t svldff1sh_gather_u32offset_u32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32offset_s32)))
-svint32_t svldff1sh_gather_u32offset_s32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64offset_u64)))
-svuint64_t svldff1sh_gather_s64offset_u64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64offset_s64)))
-svint64_t svldff1sh_gather_s64offset_s64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64offset_u64)))
-svuint64_t svldff1sh_gather_u64offset_u64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64offset_s64)))
-svint64_t svldff1sh_gather_u64offset_s64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_vnum_u32)))
-svuint32_t svldff1sh_vnum_u32(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_vnum_u64)))
-svuint64_t svldff1sh_vnum_u64(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_vnum_s32)))
-svint32_t svldff1sh_vnum_s32(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_vnum_s64)))
-svint64_t svldff1sh_vnum_s64(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_u32)))
-svuint32_t svldff1sh_u32(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_u64)))
-svuint64_t svldff1sh_u64(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_s32)))
-svint32_t svldff1sh_s32(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_s64)))
-svint64_t svldff1sh_s64(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_index_u64)))
-svuint64_t svldff1sw_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_index_s64)))
-svint64_t svldff1sw_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_offset_u64)))
-svuint64_t svldff1sw_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_offset_s64)))
-svint64_t svldff1sw_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_u64)))
-svuint64_t svldff1sw_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_s64)))
-svint64_t svldff1sw_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64index_u64)))
-svuint64_t svldff1sw_gather_s64index_u64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64index_s64)))
-svint64_t svldff1sw_gather_s64index_s64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64index_u64)))
-svuint64_t svldff1sw_gather_u64index_u64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64index_s64)))
-svint64_t svldff1sw_gather_u64index_s64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64offset_u64)))
-svuint64_t svldff1sw_gather_s64offset_u64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64offset_s64)))
-svint64_t svldff1sw_gather_s64offset_s64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64offset_u64)))
-svuint64_t svldff1sw_gather_u64offset_u64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64offset_s64)))
-svint64_t svldff1sw_gather_u64offset_s64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_vnum_u64)))
-svuint64_t svldff1sw_vnum_u64(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_vnum_s64)))
-svint64_t svldff1sw_vnum_s64(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_u64)))
-svuint64_t svldff1sw_u64(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_s64)))
-svint64_t svldff1sw_s64(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_offset_u32)))
-svuint32_t svldff1ub_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_offset_u64)))
-svuint64_t svldff1ub_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_offset_s32)))
-svint32_t svldff1ub_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_offset_s64)))
-svint64_t svldff1ub_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_u32)))
-svuint32_t svldff1ub_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_u64)))
-svuint64_t svldff1ub_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_s32)))
-svint32_t svldff1ub_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_s64)))
-svint64_t svldff1ub_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s32offset_u32)))
-svuint32_t svldff1ub_gather_s32offset_u32(svbool_t, uint8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s32offset_s32)))
-svint32_t svldff1ub_gather_s32offset_s32(svbool_t, uint8_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32offset_u32)))
-svuint32_t svldff1ub_gather_u32offset_u32(svbool_t, uint8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32offset_s32)))
-svint32_t svldff1ub_gather_u32offset_s32(svbool_t, uint8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s64offset_u64)))
-svuint64_t svldff1ub_gather_s64offset_u64(svbool_t, uint8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s64offset_s64)))
-svint64_t svldff1ub_gather_s64offset_s64(svbool_t, uint8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64offset_u64)))
-svuint64_t svldff1ub_gather_u64offset_u64(svbool_t, uint8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64offset_s64)))
-svint64_t svldff1ub_gather_u64offset_s64(svbool_t, uint8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_vnum_u32)))
-svuint32_t svldff1ub_vnum_u32(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_vnum_u64)))
-svuint64_t svldff1ub_vnum_u64(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_vnum_u16)))
-svuint16_t svldff1ub_vnum_u16(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_vnum_s32)))
-svint32_t svldff1ub_vnum_s32(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_vnum_s64)))
-svint64_t svldff1ub_vnum_s64(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_vnum_s16)))
-svint16_t svldff1ub_vnum_s16(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_u32)))
-svuint32_t svldff1ub_u32(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_u64)))
-svuint64_t svldff1ub_u64(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_u16)))
-svuint16_t svldff1ub_u16(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_s32)))
-svint32_t svldff1ub_s32(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_s64)))
-svint64_t svldff1ub_s64(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_s16)))
-svint16_t svldff1ub_s16(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_index_u32)))
-svuint32_t svldff1uh_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_index_u64)))
-svuint64_t svldff1uh_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_index_s32)))
-svint32_t svldff1uh_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_index_s64)))
-svint64_t svldff1uh_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_offset_u32)))
-svuint32_t svldff1uh_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_offset_u64)))
-svuint64_t svldff1uh_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_offset_s32)))
-svint32_t svldff1uh_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_offset_s64)))
-svint64_t svldff1uh_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_u32)))
-svuint32_t svldff1uh_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_u64)))
-svuint64_t svldff1uh_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_s32)))
-svint32_t svldff1uh_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_s64)))
-svint64_t svldff1uh_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32index_u32)))
-svuint32_t svldff1uh_gather_s32index_u32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32index_s32)))
-svint32_t svldff1uh_gather_s32index_s32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32index_u32)))
-svuint32_t svldff1uh_gather_u32index_u32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32index_s32)))
-svint32_t svldff1uh_gather_u32index_s32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64index_u64)))
-svuint64_t svldff1uh_gather_s64index_u64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64index_s64)))
-svint64_t svldff1uh_gather_s64index_s64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64index_u64)))
-svuint64_t svldff1uh_gather_u64index_u64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64index_s64)))
-svint64_t svldff1uh_gather_u64index_s64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32offset_u32)))
-svuint32_t svldff1uh_gather_s32offset_u32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32offset_s32)))
-svint32_t svldff1uh_gather_s32offset_s32(svbool_t, uint16_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32offset_u32)))
-svuint32_t svldff1uh_gather_u32offset_u32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32offset_s32)))
-svint32_t svldff1uh_gather_u32offset_s32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64offset_u64)))
-svuint64_t svldff1uh_gather_s64offset_u64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64offset_s64)))
-svint64_t svldff1uh_gather_s64offset_s64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64offset_u64)))
-svuint64_t svldff1uh_gather_u64offset_u64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64offset_s64)))
-svint64_t svldff1uh_gather_u64offset_s64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_vnum_u32)))
-svuint32_t svldff1uh_vnum_u32(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_vnum_u64)))
-svuint64_t svldff1uh_vnum_u64(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_vnum_s32)))
-svint32_t svldff1uh_vnum_s32(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_vnum_s64)))
-svint64_t svldff1uh_vnum_s64(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_u32)))
-svuint32_t svldff1uh_u32(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_u64)))
-svuint64_t svldff1uh_u64(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_s32)))
-svint32_t svldff1uh_s32(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_s64)))
-svint64_t svldff1uh_s64(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_index_u64)))
-svuint64_t svldff1uw_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_index_s64)))
-svint64_t svldff1uw_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_offset_u64)))
-svuint64_t svldff1uw_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_offset_s64)))
-svint64_t svldff1uw_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_u64)))
-svuint64_t svldff1uw_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_s64)))
-svint64_t svldff1uw_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64index_u64)))
-svuint64_t svldff1uw_gather_s64index_u64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64index_s64)))
-svint64_t svldff1uw_gather_s64index_s64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64index_u64)))
-svuint64_t svldff1uw_gather_u64index_u64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64index_s64)))
-svint64_t svldff1uw_gather_u64index_s64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64offset_u64)))
-svuint64_t svldff1uw_gather_s64offset_u64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64offset_s64)))
-svint64_t svldff1uw_gather_s64offset_s64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64offset_u64)))
-svuint64_t svldff1uw_gather_u64offset_u64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64offset_s64)))
-svint64_t svldff1uw_gather_u64offset_s64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_vnum_u64)))
-svuint64_t svldff1uw_vnum_u64(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_vnum_s64)))
-svint64_t svldff1uw_vnum_s64(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_u64)))
-svuint64_t svldff1uw_u64(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_s64)))
-svint64_t svldff1uw_s64(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u8)))
-svuint8_t svldnf1_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u32)))
-svuint32_t svldnf1_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u64)))
-svuint64_t svldnf1_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u16)))
-svuint16_t svldnf1_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s8)))
-svint8_t svldnf1_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_f64)))
-svfloat64_t svldnf1_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_f32)))
-svfloat32_t svldnf1_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_f16)))
-svfloat16_t svldnf1_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s32)))
-svint32_t svldnf1_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s64)))
-svint64_t svldnf1_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s16)))
-svint16_t svldnf1_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u8)))
-svuint8_t svldnf1_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u32)))
-svuint32_t svldnf1_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u64)))
-svuint64_t svldnf1_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u16)))
-svuint16_t svldnf1_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s8)))
-svint8_t svldnf1_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_f64)))
-svfloat64_t svldnf1_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_f32)))
-svfloat32_t svldnf1_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_f16)))
-svfloat16_t svldnf1_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s32)))
-svint32_t svldnf1_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s64)))
-svint64_t svldnf1_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s16)))
-svint16_t svldnf1_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_vnum_u32)))
-svuint32_t svldnf1sb_vnum_u32(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_vnum_u64)))
-svuint64_t svldnf1sb_vnum_u64(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_vnum_u16)))
-svuint16_t svldnf1sb_vnum_u16(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_vnum_s32)))
-svint32_t svldnf1sb_vnum_s32(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_vnum_s64)))
-svint64_t svldnf1sb_vnum_s64(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_vnum_s16)))
-svint16_t svldnf1sb_vnum_s16(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_u32)))
-svuint32_t svldnf1sb_u32(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_u64)))
-svuint64_t svldnf1sb_u64(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_u16)))
-svuint16_t svldnf1sb_u16(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_s32)))
-svint32_t svldnf1sb_s32(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_s64)))
-svint64_t svldnf1sb_s64(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sb_s16)))
-svint16_t svldnf1sb_s16(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_vnum_u32)))
-svuint32_t svldnf1sh_vnum_u32(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_vnum_u64)))
-svuint64_t svldnf1sh_vnum_u64(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_vnum_s32)))
-svint32_t svldnf1sh_vnum_s32(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_vnum_s64)))
-svint64_t svldnf1sh_vnum_s64(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_u32)))
-svuint32_t svldnf1sh_u32(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_u64)))
-svuint64_t svldnf1sh_u64(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_s32)))
-svint32_t svldnf1sh_s32(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sh_s64)))
-svint64_t svldnf1sh_s64(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sw_vnum_u64)))
-svuint64_t svldnf1sw_vnum_u64(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sw_vnum_s64)))
-svint64_t svldnf1sw_vnum_s64(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sw_u64)))
-svuint64_t svldnf1sw_u64(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1sw_s64)))
-svint64_t svldnf1sw_s64(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_vnum_u32)))
-svuint32_t svldnf1ub_vnum_u32(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_vnum_u64)))
-svuint64_t svldnf1ub_vnum_u64(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_vnum_u16)))
-svuint16_t svldnf1ub_vnum_u16(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_vnum_s32)))
-svint32_t svldnf1ub_vnum_s32(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_vnum_s64)))
-svint64_t svldnf1ub_vnum_s64(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_vnum_s16)))
-svint16_t svldnf1ub_vnum_s16(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_u32)))
-svuint32_t svldnf1ub_u32(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_u64)))
-svuint64_t svldnf1ub_u64(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_u16)))
-svuint16_t svldnf1ub_u16(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_s32)))
-svint32_t svldnf1ub_s32(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_s64)))
-svint64_t svldnf1ub_s64(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1ub_s16)))
-svint16_t svldnf1ub_s16(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_vnum_u32)))
-svuint32_t svldnf1uh_vnum_u32(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_vnum_u64)))
-svuint64_t svldnf1uh_vnum_u64(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_vnum_s32)))
-svint32_t svldnf1uh_vnum_s32(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_vnum_s64)))
-svint64_t svldnf1uh_vnum_s64(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_u32)))
-svuint32_t svldnf1uh_u32(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_u64)))
-svuint64_t svldnf1uh_u64(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_s32)))
-svint32_t svldnf1uh_s32(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uh_s64)))
-svint64_t svldnf1uh_s64(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uw_vnum_u64)))
-svuint64_t svldnf1uw_vnum_u64(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uw_vnum_s64)))
-svint64_t svldnf1uw_vnum_s64(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uw_u64)))
-svuint64_t svldnf1uw_u64(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1uw_s64)))
-svint64_t svldnf1uw_s64(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base)))
-void svprfb_gather_u32base(svbool_t, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base)))
-void svprfb_gather_u64base(svbool_t, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base_offset)))
-void svprfb_gather_u32base_offset(svbool_t, svuint32_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base_offset)))
-void svprfb_gather_u64base_offset(svbool_t, svuint64_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s32offset)))
-void svprfb_gather_s32offset(svbool_t, void const *, svint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32offset)))
-void svprfb_gather_u32offset(svbool_t, void const *, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s64offset)))
-void svprfb_gather_s64offset(svbool_t, void const *, svint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64offset)))
-void svprfb_gather_u64offset(svbool_t, void const *, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base)))
-void svprfd_gather_u32base(svbool_t, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base)))
-void svprfd_gather_u64base(svbool_t, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base_index)))
-void svprfd_gather_u32base_index(svbool_t, svuint32_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base_index)))
-void svprfd_gather_u64base_index(svbool_t, svuint64_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s32index)))
-void svprfd_gather_s32index(svbool_t, void const *, svint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32index)))
-void svprfd_gather_u32index(svbool_t, void const *, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s64index)))
-void svprfd_gather_s64index(svbool_t, void const *, svint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64index)))
-void svprfd_gather_u64index(svbool_t, void const *, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base)))
-void svprfh_gather_u32base(svbool_t, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base)))
-void svprfh_gather_u64base(svbool_t, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base_index)))
-void svprfh_gather_u32base_index(svbool_t, svuint32_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base_index)))
-void svprfh_gather_u64base_index(svbool_t, svuint64_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s32index)))
-void svprfh_gather_s32index(svbool_t, void const *, svint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32index)))
-void svprfh_gather_u32index(svbool_t, void const *, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s64index)))
-void svprfh_gather_s64index(svbool_t, void const *, svint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64index)))
-void svprfh_gather_u64index(svbool_t, void const *, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base)))
-void svprfw_gather_u32base(svbool_t, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base)))
-void svprfw_gather_u64base(svbool_t, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base_index)))
-void svprfw_gather_u32base_index(svbool_t, svuint32_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base_index)))
-void svprfw_gather_u64base_index(svbool_t, svuint64_t, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s32index)))
-void svprfw_gather_s32index(svbool_t, void const *, svint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32index)))
-void svprfw_gather_u32index(svbool_t, void const *, svuint32_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s64index)))
-void svprfw_gather_s64index(svbool_t, void const *, svint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64index)))
-void svprfw_gather_u64index(svbool_t, void const *, svuint64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr)))
-svbool_t svrdffr(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr_z)))
-svbool_t svrdffr_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsetffr)))
-void svsetffr(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_index_u32)))
-void svst1_scatter_u32base_index_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_index_u64)))
-void svst1_scatter_u64base_index_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_index_f64)))
-void svst1_scatter_u64base_index_f64(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_index_f32)))
-void svst1_scatter_u32base_index_f32(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_index_s32)))
-void svst1_scatter_u32base_index_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_index_s64)))
-void svst1_scatter_u64base_index_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_offset_u32)))
-void svst1_scatter_u32base_offset_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_offset_u64)))
-void svst1_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_offset_f64)))
-void svst1_scatter_u64base_offset_f64(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_offset_f32)))
-void svst1_scatter_u32base_offset_f32(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_offset_s32)))
-void svst1_scatter_u32base_offset_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_offset_s64)))
-void svst1_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_u32)))
-void svst1_scatter_u32base_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_u64)))
-void svst1_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_f64)))
-void svst1_scatter_u64base_f64(svbool_t, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_f32)))
-void svst1_scatter_u32base_f32(svbool_t, svuint32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_s32)))
-void svst1_scatter_u32base_s32(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_s64)))
-void svst1_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32index_u32)))
-void svst1_scatter_s32index_u32(svbool_t, uint32_t *, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32index_f32)))
-void svst1_scatter_s32index_f32(svbool_t, float32_t *, svint32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32index_s32)))
-void svst1_scatter_s32index_s32(svbool_t, int32_t *, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32index_u32)))
-void svst1_scatter_u32index_u32(svbool_t, uint32_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32index_f32)))
-void svst1_scatter_u32index_f32(svbool_t, float32_t *, svuint32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32index_s32)))
-void svst1_scatter_u32index_s32(svbool_t, int32_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64index_u64)))
-void svst1_scatter_s64index_u64(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64index_f64)))
-void svst1_scatter_s64index_f64(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64index_s64)))
-void svst1_scatter_s64index_s64(svbool_t, int64_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64index_u64)))
-void svst1_scatter_u64index_u64(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64index_f64)))
-void svst1_scatter_u64index_f64(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64index_s64)))
-void svst1_scatter_u64index_s64(svbool_t, int64_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32offset_u32)))
-void svst1_scatter_s32offset_u32(svbool_t, uint32_t *, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32offset_f32)))
-void svst1_scatter_s32offset_f32(svbool_t, float32_t *, svint32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32offset_s32)))
-void svst1_scatter_s32offset_s32(svbool_t, int32_t *, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32offset_u32)))
-void svst1_scatter_u32offset_u32(svbool_t, uint32_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32offset_f32)))
-void svst1_scatter_u32offset_f32(svbool_t, float32_t *, svuint32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32offset_s32)))
-void svst1_scatter_u32offset_s32(svbool_t, int32_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64offset_u64)))
-void svst1_scatter_s64offset_u64(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64offset_f64)))
-void svst1_scatter_s64offset_f64(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64offset_s64)))
-void svst1_scatter_s64offset_s64(svbool_t, int64_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64offset_u64)))
-void svst1_scatter_u64offset_u64(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64offset_f64)))
-void svst1_scatter_u64offset_f64(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64offset_s64)))
-void svst1_scatter_u64offset_s64(svbool_t, int64_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_offset_u32)))
-void svst1b_scatter_u32base_offset_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_offset_u64)))
-void svst1b_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_offset_s32)))
-void svst1b_scatter_u32base_offset_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_offset_s64)))
-void svst1b_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_u32)))
-void svst1b_scatter_u32base_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_u64)))
-void svst1b_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_s32)))
-void svst1b_scatter_u32base_s32(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_s64)))
-void svst1b_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s32offset_s32)))
-void svst1b_scatter_s32offset_s32(svbool_t, int8_t *, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s32offset_u32)))
-void svst1b_scatter_s32offset_u32(svbool_t, uint8_t *, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32offset_s32)))
-void svst1b_scatter_u32offset_s32(svbool_t, int8_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32offset_u32)))
-void svst1b_scatter_u32offset_u32(svbool_t, uint8_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s64offset_s64)))
-void svst1b_scatter_s64offset_s64(svbool_t, int8_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s64offset_u64)))
-void svst1b_scatter_s64offset_u64(svbool_t, uint8_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64offset_s64)))
-void svst1b_scatter_u64offset_s64(svbool_t, int8_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64offset_u64)))
-void svst1b_scatter_u64offset_u64(svbool_t, uint8_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_index_u32)))
-void svst1h_scatter_u32base_index_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_index_u64)))
-void svst1h_scatter_u64base_index_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_index_s32)))
-void svst1h_scatter_u32base_index_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_index_s64)))
-void svst1h_scatter_u64base_index_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_offset_u32)))
-void svst1h_scatter_u32base_offset_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_offset_u64)))
-void svst1h_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_offset_s32)))
-void svst1h_scatter_u32base_offset_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_offset_s64)))
-void svst1h_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_u32)))
-void svst1h_scatter_u32base_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_u64)))
-void svst1h_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_s32)))
-void svst1h_scatter_u32base_s32(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_s64)))
-void svst1h_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32index_s32)))
-void svst1h_scatter_s32index_s32(svbool_t, int16_t *, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32index_u32)))
-void svst1h_scatter_s32index_u32(svbool_t, uint16_t *, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32index_s32)))
-void svst1h_scatter_u32index_s32(svbool_t, int16_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32index_u32)))
-void svst1h_scatter_u32index_u32(svbool_t, uint16_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64index_s64)))
-void svst1h_scatter_s64index_s64(svbool_t, int16_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64index_u64)))
-void svst1h_scatter_s64index_u64(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64index_s64)))
-void svst1h_scatter_u64index_s64(svbool_t, int16_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64index_u64)))
-void svst1h_scatter_u64index_u64(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32offset_s32)))
-void svst1h_scatter_s32offset_s32(svbool_t, int16_t *, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32offset_u32)))
-void svst1h_scatter_s32offset_u32(svbool_t, uint16_t *, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32offset_s32)))
-void svst1h_scatter_u32offset_s32(svbool_t, int16_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32offset_u32)))
-void svst1h_scatter_u32offset_u32(svbool_t, uint16_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64offset_s64)))
-void svst1h_scatter_s64offset_s64(svbool_t, int16_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64offset_u64)))
-void svst1h_scatter_s64offset_u64(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64offset_s64)))
-void svst1h_scatter_u64offset_s64(svbool_t, int16_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64offset_u64)))
-void svst1h_scatter_u64offset_u64(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_index_u64)))
-void svst1w_scatter_u64base_index_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_index_s64)))
-void svst1w_scatter_u64base_index_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_offset_u64)))
-void svst1w_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_offset_s64)))
-void svst1w_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_u64)))
-void svst1w_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_s64)))
-void svst1w_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64index_s64)))
-void svst1w_scatter_s64index_s64(svbool_t, int32_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64index_u64)))
-void svst1w_scatter_s64index_u64(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64index_s64)))
-void svst1w_scatter_u64index_s64(svbool_t, int32_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64index_u64)))
-void svst1w_scatter_u64index_u64(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64offset_s64)))
-void svst1w_scatter_s64offset_s64(svbool_t, int32_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64offset_u64)))
-void svst1w_scatter_s64offset_u64(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64offset_s64)))
-void svst1w_scatter_u64offset_s64(svbool_t, int32_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64offset_u64)))
-void svst1w_scatter_u64offset_u64(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtmad_f64)))
-svfloat64_t svtmad_f64(svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtmad_f32)))
-svfloat32_t svtmad_f32(svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtmad_f16)))
-svfloat16_t svtmad_f16(svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f64)))
-svfloat64_t svtsmul_f64(svfloat64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f32)))
-svfloat32_t svtsmul_f32(svfloat32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f16)))
-svfloat16_t svtsmul_f16(svfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f64)))
-svfloat64_t svtssel_f64(svfloat64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f32)))
-svfloat32_t svtssel_f32(svfloat32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f16)))
-svfloat16_t svtssel_f16(svfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwrffr)))
-void svwrffr(svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u32base_u32offset)))
-svuint32_t svadrb_offset(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u64base_u64offset)))
-svuint64_t svadrb_offset(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u32base_s32offset)))
-svuint32_t svadrb_offset(svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u64base_s64offset)))
-svuint64_t svadrb_offset(svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u32base_u32index)))
-svuint32_t svadrd_index(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u64base_u64index)))
-svuint64_t svadrd_index(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u32base_s32index)))
-svuint32_t svadrd_index(svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrd_u64base_s64index)))
-svuint64_t svadrd_index(svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u32base_u32index)))
-svuint32_t svadrh_index(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u64base_u64index)))
-svuint64_t svadrh_index(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u32base_s32index)))
-svuint32_t svadrh_index(svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrh_u64base_s64index)))
-svuint64_t svadrh_index(svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u32base_u32index)))
-svuint32_t svadrw_index(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u64base_u64index)))
-svuint64_t svadrw_index(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u32base_s32index)))
-svuint32_t svadrw_index(svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrw_u64base_s64index)))
-svuint64_t svadrw_index(svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_u32)))
-svuint32_t svcompact(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_u64)))
-svuint64_t svcompact(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_f64)))
-svfloat64_t svcompact(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_f32)))
-svfloat32_t svcompact(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_s32)))
-svint32_t svcompact(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_s64)))
-svint64_t svcompact(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexpa_f64)))
-svfloat64_t svexpa(svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexpa_f32)))
-svfloat32_t svexpa(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexpa_f16)))
-svfloat16_t svexpa(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_index_u32)))
-svuint32_t svld1_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_index_u64)))
-svuint64_t svld1_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_index_f64)))
-svfloat64_t svld1_gather_index_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_index_f32)))
-svfloat32_t svld1_gather_index_f32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_index_s32)))
-svint32_t svld1_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_index_s64)))
-svint64_t svld1_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_offset_u32)))
-svuint32_t svld1_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_offset_u64)))
-svuint64_t svld1_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_offset_f64)))
-svfloat64_t svld1_gather_offset_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_offset_f32)))
-svfloat32_t svld1_gather_offset_f32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_offset_s32)))
-svint32_t svld1_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_offset_s64)))
-svint64_t svld1_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_u32)))
-svuint32_t svld1_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_u64)))
-svuint64_t svld1_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_f64)))
-svfloat64_t svld1_gather_f64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_f32)))
-svfloat32_t svld1_gather_f32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32base_s32)))
-svint32_t svld1_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64base_s64)))
-svint64_t svld1_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32index_u32)))
-svuint32_t svld1_gather_index(svbool_t, uint32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32index_f32)))
-svfloat32_t svld1_gather_index(svbool_t, float32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32index_s32)))
-svint32_t svld1_gather_index(svbool_t, int32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32index_u32)))
-svuint32_t svld1_gather_index(svbool_t, uint32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32index_f32)))
-svfloat32_t svld1_gather_index(svbool_t, float32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32index_s32)))
-svint32_t svld1_gather_index(svbool_t, int32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64index_u64)))
-svuint64_t svld1_gather_index(svbool_t, uint64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64index_f64)))
-svfloat64_t svld1_gather_index(svbool_t, float64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64index_s64)))
-svint64_t svld1_gather_index(svbool_t, int64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64index_u64)))
-svuint64_t svld1_gather_index(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64index_f64)))
-svfloat64_t svld1_gather_index(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64index_s64)))
-svint64_t svld1_gather_index(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32offset_u32)))
-svuint32_t svld1_gather_offset(svbool_t, uint32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32offset_f32)))
-svfloat32_t svld1_gather_offset(svbool_t, float32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s32offset_s32)))
-svint32_t svld1_gather_offset(svbool_t, int32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32offset_u32)))
-svuint32_t svld1_gather_offset(svbool_t, uint32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32offset_f32)))
-svfloat32_t svld1_gather_offset(svbool_t, float32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u32offset_s32)))
-svint32_t svld1_gather_offset(svbool_t, int32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64offset_u64)))
-svuint64_t svld1_gather_offset(svbool_t, uint64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64offset_f64)))
-svfloat64_t svld1_gather_offset(svbool_t, float64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_s64offset_s64)))
-svint64_t svld1_gather_offset(svbool_t, int64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64offset_u64)))
-svuint64_t svld1_gather_offset(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64offset_f64)))
-svfloat64_t svld1_gather_offset(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_gather_u64offset_s64)))
-svint64_t svld1_gather_offset(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_offset_u32)))
-svuint32_t svld1sb_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_offset_u64)))
-svuint64_t svld1sb_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_offset_s32)))
-svint32_t svld1sb_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_offset_s64)))
-svint64_t svld1sb_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_u32)))
-svuint32_t svld1sb_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_u64)))
-svuint64_t svld1sb_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32base_s32)))
-svint32_t svld1sb_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64base_s64)))
-svint64_t svld1sb_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s32offset_u32)))
-svuint32_t svld1sb_gather_offset_u32(svbool_t, int8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s32offset_s32)))
-svint32_t svld1sb_gather_offset_s32(svbool_t, int8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32offset_u32)))
-svuint32_t svld1sb_gather_offset_u32(svbool_t, int8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u32offset_s32)))
-svint32_t svld1sb_gather_offset_s32(svbool_t, int8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s64offset_u64)))
-svuint64_t svld1sb_gather_offset_u64(svbool_t, int8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_s64offset_s64)))
-svint64_t svld1sb_gather_offset_s64(svbool_t, int8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64offset_u64)))
-svuint64_t svld1sb_gather_offset_u64(svbool_t, int8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_gather_u64offset_s64)))
-svint64_t svld1sb_gather_offset_s64(svbool_t, int8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_index_u32)))
-svuint32_t svld1sh_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_index_u64)))
-svuint64_t svld1sh_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_index_s32)))
-svint32_t svld1sh_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_index_s64)))
-svint64_t svld1sh_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_offset_u32)))
-svuint32_t svld1sh_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_offset_u64)))
-svuint64_t svld1sh_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_offset_s32)))
-svint32_t svld1sh_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_offset_s64)))
-svint64_t svld1sh_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_u32)))
-svuint32_t svld1sh_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_u64)))
-svuint64_t svld1sh_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32base_s32)))
-svint32_t svld1sh_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64base_s64)))
-svint64_t svld1sh_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32index_u32)))
-svuint32_t svld1sh_gather_index_u32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32index_s32)))
-svint32_t svld1sh_gather_index_s32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32index_u32)))
-svuint32_t svld1sh_gather_index_u32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32index_s32)))
-svint32_t svld1sh_gather_index_s32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64index_u64)))
-svuint64_t svld1sh_gather_index_u64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64index_s64)))
-svint64_t svld1sh_gather_index_s64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64index_u64)))
-svuint64_t svld1sh_gather_index_u64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64index_s64)))
-svint64_t svld1sh_gather_index_s64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32offset_u32)))
-svuint32_t svld1sh_gather_offset_u32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s32offset_s32)))
-svint32_t svld1sh_gather_offset_s32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32offset_u32)))
-svuint32_t svld1sh_gather_offset_u32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u32offset_s32)))
-svint32_t svld1sh_gather_offset_s32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64offset_u64)))
-svuint64_t svld1sh_gather_offset_u64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_s64offset_s64)))
-svint64_t svld1sh_gather_offset_s64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64offset_u64)))
-svuint64_t svld1sh_gather_offset_u64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_gather_u64offset_s64)))
-svint64_t svld1sh_gather_offset_s64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_index_u64)))
-svuint64_t svld1sw_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_index_s64)))
-svint64_t svld1sw_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_offset_u64)))
-svuint64_t svld1sw_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_offset_s64)))
-svint64_t svld1sw_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_u64)))
-svuint64_t svld1sw_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64base_s64)))
-svint64_t svld1sw_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64index_u64)))
-svuint64_t svld1sw_gather_index_u64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64index_s64)))
-svint64_t svld1sw_gather_index_s64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64index_u64)))
-svuint64_t svld1sw_gather_index_u64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64index_s64)))
-svint64_t svld1sw_gather_index_s64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64offset_u64)))
-svuint64_t svld1sw_gather_offset_u64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_s64offset_s64)))
-svint64_t svld1sw_gather_offset_s64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64offset_u64)))
-svuint64_t svld1sw_gather_offset_u64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_gather_u64offset_s64)))
-svint64_t svld1sw_gather_offset_s64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_offset_u32)))
-svuint32_t svld1ub_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_offset_u64)))
-svuint64_t svld1ub_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_offset_s32)))
-svint32_t svld1ub_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_offset_s64)))
-svint64_t svld1ub_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_u32)))
-svuint32_t svld1ub_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_u64)))
-svuint64_t svld1ub_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32base_s32)))
-svint32_t svld1ub_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64base_s64)))
-svint64_t svld1ub_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s32offset_u32)))
-svuint32_t svld1ub_gather_offset_u32(svbool_t, uint8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s32offset_s32)))
-svint32_t svld1ub_gather_offset_s32(svbool_t, uint8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32offset_u32)))
-svuint32_t svld1ub_gather_offset_u32(svbool_t, uint8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u32offset_s32)))
-svint32_t svld1ub_gather_offset_s32(svbool_t, uint8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s64offset_u64)))
-svuint64_t svld1ub_gather_offset_u64(svbool_t, uint8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_s64offset_s64)))
-svint64_t svld1ub_gather_offset_s64(svbool_t, uint8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64offset_u64)))
-svuint64_t svld1ub_gather_offset_u64(svbool_t, uint8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_gather_u64offset_s64)))
-svint64_t svld1ub_gather_offset_s64(svbool_t, uint8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_index_u32)))
-svuint32_t svld1uh_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_index_u64)))
-svuint64_t svld1uh_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_index_s32)))
-svint32_t svld1uh_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_index_s64)))
-svint64_t svld1uh_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_offset_u32)))
-svuint32_t svld1uh_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_offset_u64)))
-svuint64_t svld1uh_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_offset_s32)))
-svint32_t svld1uh_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_offset_s64)))
-svint64_t svld1uh_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_u32)))
-svuint32_t svld1uh_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_u64)))
-svuint64_t svld1uh_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32base_s32)))
-svint32_t svld1uh_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64base_s64)))
-svint64_t svld1uh_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32index_u32)))
-svuint32_t svld1uh_gather_index_u32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32index_s32)))
-svint32_t svld1uh_gather_index_s32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32index_u32)))
-svuint32_t svld1uh_gather_index_u32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32index_s32)))
-svint32_t svld1uh_gather_index_s32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64index_u64)))
-svuint64_t svld1uh_gather_index_u64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64index_s64)))
-svint64_t svld1uh_gather_index_s64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64index_u64)))
-svuint64_t svld1uh_gather_index_u64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64index_s64)))
-svint64_t svld1uh_gather_index_s64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32offset_u32)))
-svuint32_t svld1uh_gather_offset_u32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s32offset_s32)))
-svint32_t svld1uh_gather_offset_s32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32offset_u32)))
-svuint32_t svld1uh_gather_offset_u32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u32offset_s32)))
-svint32_t svld1uh_gather_offset_s32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64offset_u64)))
-svuint64_t svld1uh_gather_offset_u64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_s64offset_s64)))
-svint64_t svld1uh_gather_offset_s64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64offset_u64)))
-svuint64_t svld1uh_gather_offset_u64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_gather_u64offset_s64)))
-svint64_t svld1uh_gather_offset_s64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_index_u64)))
-svuint64_t svld1uw_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_index_s64)))
-svint64_t svld1uw_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_offset_u64)))
-svuint64_t svld1uw_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_offset_s64)))
-svint64_t svld1uw_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_u64)))
-svuint64_t svld1uw_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64base_s64)))
-svint64_t svld1uw_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64index_u64)))
-svuint64_t svld1uw_gather_index_u64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64index_s64)))
-svint64_t svld1uw_gather_index_s64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64index_u64)))
-svuint64_t svld1uw_gather_index_u64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64index_s64)))
-svint64_t svld1uw_gather_index_s64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64offset_u64)))
-svuint64_t svld1uw_gather_offset_u64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_s64offset_s64)))
-svint64_t svld1uw_gather_offset_s64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64offset_u64)))
-svuint64_t svld1uw_gather_offset_u64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_gather_u64offset_s64)))
-svint64_t svld1uw_gather_offset_s64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u8)))
-svuint8_t svldff1(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u32)))
-svuint32_t svldff1(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u64)))
-svuint64_t svldff1(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_u16)))
-svuint16_t svldff1(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s8)))
-svint8_t svldff1(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_f64)))
-svfloat64_t svldff1(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_f32)))
-svfloat32_t svldff1(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_f16)))
-svfloat16_t svldff1(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s32)))
-svint32_t svldff1(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s64)))
-svint64_t svldff1(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_s16)))
-svint16_t svldff1(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_index_u32)))
-svuint32_t svldff1_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_index_u64)))
-svuint64_t svldff1_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_index_f64)))
-svfloat64_t svldff1_gather_index_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_index_f32)))
-svfloat32_t svldff1_gather_index_f32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_index_s32)))
-svint32_t svldff1_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_index_s64)))
-svint64_t svldff1_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_offset_u32)))
-svuint32_t svldff1_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_offset_u64)))
-svuint64_t svldff1_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_offset_f64)))
-svfloat64_t svldff1_gather_offset_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_offset_f32)))
-svfloat32_t svldff1_gather_offset_f32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_offset_s32)))
-svint32_t svldff1_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_offset_s64)))
-svint64_t svldff1_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_u32)))
-svuint32_t svldff1_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_u64)))
-svuint64_t svldff1_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_f64)))
-svfloat64_t svldff1_gather_f64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_f32)))
-svfloat32_t svldff1_gather_f32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32base_s32)))
-svint32_t svldff1_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64base_s64)))
-svint64_t svldff1_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32index_u32)))
-svuint32_t svldff1_gather_index(svbool_t, uint32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32index_f32)))
-svfloat32_t svldff1_gather_index(svbool_t, float32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32index_s32)))
-svint32_t svldff1_gather_index(svbool_t, int32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32index_u32)))
-svuint32_t svldff1_gather_index(svbool_t, uint32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32index_f32)))
-svfloat32_t svldff1_gather_index(svbool_t, float32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32index_s32)))
-svint32_t svldff1_gather_index(svbool_t, int32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64index_u64)))
-svuint64_t svldff1_gather_index(svbool_t, uint64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64index_f64)))
-svfloat64_t svldff1_gather_index(svbool_t, float64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64index_s64)))
-svint64_t svldff1_gather_index(svbool_t, int64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64index_u64)))
-svuint64_t svldff1_gather_index(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64index_f64)))
-svfloat64_t svldff1_gather_index(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64index_s64)))
-svint64_t svldff1_gather_index(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32offset_u32)))
-svuint32_t svldff1_gather_offset(svbool_t, uint32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32offset_f32)))
-svfloat32_t svldff1_gather_offset(svbool_t, float32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s32offset_s32)))
-svint32_t svldff1_gather_offset(svbool_t, int32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32offset_u32)))
-svuint32_t svldff1_gather_offset(svbool_t, uint32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32offset_f32)))
-svfloat32_t svldff1_gather_offset(svbool_t, float32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u32offset_s32)))
-svint32_t svldff1_gather_offset(svbool_t, int32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64offset_u64)))
-svuint64_t svldff1_gather_offset(svbool_t, uint64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64offset_f64)))
-svfloat64_t svldff1_gather_offset(svbool_t, float64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_s64offset_s64)))
-svint64_t svldff1_gather_offset(svbool_t, int64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64offset_u64)))
-svuint64_t svldff1_gather_offset(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64offset_f64)))
-svfloat64_t svldff1_gather_offset(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_gather_u64offset_s64)))
-svint64_t svldff1_gather_offset(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u8)))
-svuint8_t svldff1_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u32)))
-svuint32_t svldff1_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u64)))
-svuint64_t svldff1_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_u16)))
-svuint16_t svldff1_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s8)))
-svint8_t svldff1_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_f64)))
-svfloat64_t svldff1_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_f32)))
-svfloat32_t svldff1_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_f16)))
-svfloat16_t svldff1_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s32)))
-svint32_t svldff1_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s64)))
-svint64_t svldff1_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_s16)))
-svint16_t svldff1_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_offset_u32)))
-svuint32_t svldff1sb_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_offset_u64)))
-svuint64_t svldff1sb_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_offset_s32)))
-svint32_t svldff1sb_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_offset_s64)))
-svint64_t svldff1sb_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_u32)))
-svuint32_t svldff1sb_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_u64)))
-svuint64_t svldff1sb_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32base_s32)))
-svint32_t svldff1sb_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64base_s64)))
-svint64_t svldff1sb_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s32offset_u32)))
-svuint32_t svldff1sb_gather_offset_u32(svbool_t, int8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s32offset_s32)))
-svint32_t svldff1sb_gather_offset_s32(svbool_t, int8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32offset_u32)))
-svuint32_t svldff1sb_gather_offset_u32(svbool_t, int8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u32offset_s32)))
-svint32_t svldff1sb_gather_offset_s32(svbool_t, int8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s64offset_u64)))
-svuint64_t svldff1sb_gather_offset_u64(svbool_t, int8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_s64offset_s64)))
-svint64_t svldff1sb_gather_offset_s64(svbool_t, int8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64offset_u64)))
-svuint64_t svldff1sb_gather_offset_u64(svbool_t, int8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sb_gather_u64offset_s64)))
-svint64_t svldff1sb_gather_offset_s64(svbool_t, int8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_index_u32)))
-svuint32_t svldff1sh_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_index_u64)))
-svuint64_t svldff1sh_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_index_s32)))
-svint32_t svldff1sh_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_index_s64)))
-svint64_t svldff1sh_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_offset_u32)))
-svuint32_t svldff1sh_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_offset_u64)))
-svuint64_t svldff1sh_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_offset_s32)))
-svint32_t svldff1sh_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_offset_s64)))
-svint64_t svldff1sh_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_u32)))
-svuint32_t svldff1sh_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_u64)))
-svuint64_t svldff1sh_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32base_s32)))
-svint32_t svldff1sh_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64base_s64)))
-svint64_t svldff1sh_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32index_u32)))
-svuint32_t svldff1sh_gather_index_u32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32index_s32)))
-svint32_t svldff1sh_gather_index_s32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32index_u32)))
-svuint32_t svldff1sh_gather_index_u32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32index_s32)))
-svint32_t svldff1sh_gather_index_s32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64index_u64)))
-svuint64_t svldff1sh_gather_index_u64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64index_s64)))
-svint64_t svldff1sh_gather_index_s64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64index_u64)))
-svuint64_t svldff1sh_gather_index_u64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64index_s64)))
-svint64_t svldff1sh_gather_index_s64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32offset_u32)))
-svuint32_t svldff1sh_gather_offset_u32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s32offset_s32)))
-svint32_t svldff1sh_gather_offset_s32(svbool_t, int16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32offset_u32)))
-svuint32_t svldff1sh_gather_offset_u32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u32offset_s32)))
-svint32_t svldff1sh_gather_offset_s32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64offset_u64)))
-svuint64_t svldff1sh_gather_offset_u64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_s64offset_s64)))
-svint64_t svldff1sh_gather_offset_s64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64offset_u64)))
-svuint64_t svldff1sh_gather_offset_u64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sh_gather_u64offset_s64)))
-svint64_t svldff1sh_gather_offset_s64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_index_u64)))
-svuint64_t svldff1sw_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_index_s64)))
-svint64_t svldff1sw_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_offset_u64)))
-svuint64_t svldff1sw_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_offset_s64)))
-svint64_t svldff1sw_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_u64)))
-svuint64_t svldff1sw_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64base_s64)))
-svint64_t svldff1sw_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64index_u64)))
-svuint64_t svldff1sw_gather_index_u64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64index_s64)))
-svint64_t svldff1sw_gather_index_s64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64index_u64)))
-svuint64_t svldff1sw_gather_index_u64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64index_s64)))
-svint64_t svldff1sw_gather_index_s64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64offset_u64)))
-svuint64_t svldff1sw_gather_offset_u64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_s64offset_s64)))
-svint64_t svldff1sw_gather_offset_s64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64offset_u64)))
-svuint64_t svldff1sw_gather_offset_u64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1sw_gather_u64offset_s64)))
-svint64_t svldff1sw_gather_offset_s64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_offset_u32)))
-svuint32_t svldff1ub_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_offset_u64)))
-svuint64_t svldff1ub_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_offset_s32)))
-svint32_t svldff1ub_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_offset_s64)))
-svint64_t svldff1ub_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_u32)))
-svuint32_t svldff1ub_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_u64)))
-svuint64_t svldff1ub_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32base_s32)))
-svint32_t svldff1ub_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64base_s64)))
-svint64_t svldff1ub_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s32offset_u32)))
-svuint32_t svldff1ub_gather_offset_u32(svbool_t, uint8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s32offset_s32)))
-svint32_t svldff1ub_gather_offset_s32(svbool_t, uint8_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32offset_u32)))
-svuint32_t svldff1ub_gather_offset_u32(svbool_t, uint8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u32offset_s32)))
-svint32_t svldff1ub_gather_offset_s32(svbool_t, uint8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s64offset_u64)))
-svuint64_t svldff1ub_gather_offset_u64(svbool_t, uint8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_s64offset_s64)))
-svint64_t svldff1ub_gather_offset_s64(svbool_t, uint8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64offset_u64)))
-svuint64_t svldff1ub_gather_offset_u64(svbool_t, uint8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1ub_gather_u64offset_s64)))
-svint64_t svldff1ub_gather_offset_s64(svbool_t, uint8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_index_u32)))
-svuint32_t svldff1uh_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_index_u64)))
-svuint64_t svldff1uh_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_index_s32)))
-svint32_t svldff1uh_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_index_s64)))
-svint64_t svldff1uh_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_offset_u32)))
-svuint32_t svldff1uh_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_offset_u64)))
-svuint64_t svldff1uh_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_offset_s32)))
-svint32_t svldff1uh_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_offset_s64)))
-svint64_t svldff1uh_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_u32)))
-svuint32_t svldff1uh_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_u64)))
-svuint64_t svldff1uh_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32base_s32)))
-svint32_t svldff1uh_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64base_s64)))
-svint64_t svldff1uh_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32index_u32)))
-svuint32_t svldff1uh_gather_index_u32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32index_s32)))
-svint32_t svldff1uh_gather_index_s32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32index_u32)))
-svuint32_t svldff1uh_gather_index_u32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32index_s32)))
-svint32_t svldff1uh_gather_index_s32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64index_u64)))
-svuint64_t svldff1uh_gather_index_u64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64index_s64)))
-svint64_t svldff1uh_gather_index_s64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64index_u64)))
-svuint64_t svldff1uh_gather_index_u64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64index_s64)))
-svint64_t svldff1uh_gather_index_s64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32offset_u32)))
-svuint32_t svldff1uh_gather_offset_u32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s32offset_s32)))
-svint32_t svldff1uh_gather_offset_s32(svbool_t, uint16_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32offset_u32)))
-svuint32_t svldff1uh_gather_offset_u32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u32offset_s32)))
-svint32_t svldff1uh_gather_offset_s32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64offset_u64)))
-svuint64_t svldff1uh_gather_offset_u64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_s64offset_s64)))
-svint64_t svldff1uh_gather_offset_s64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64offset_u64)))
-svuint64_t svldff1uh_gather_offset_u64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uh_gather_u64offset_s64)))
-svint64_t svldff1uh_gather_offset_s64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_index_u64)))
-svuint64_t svldff1uw_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_index_s64)))
-svint64_t svldff1uw_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_offset_u64)))
-svuint64_t svldff1uw_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_offset_s64)))
-svint64_t svldff1uw_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_u64)))
-svuint64_t svldff1uw_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64base_s64)))
-svint64_t svldff1uw_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64index_u64)))
-svuint64_t svldff1uw_gather_index_u64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64index_s64)))
-svint64_t svldff1uw_gather_index_s64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64index_u64)))
-svuint64_t svldff1uw_gather_index_u64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64index_s64)))
-svint64_t svldff1uw_gather_index_s64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64offset_u64)))
-svuint64_t svldff1uw_gather_offset_u64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_s64offset_s64)))
-svint64_t svldff1uw_gather_offset_s64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64offset_u64)))
-svuint64_t svldff1uw_gather_offset_u64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1uw_gather_u64offset_s64)))
-svint64_t svldff1uw_gather_offset_s64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u8)))
-svuint8_t svldnf1(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u32)))
-svuint32_t svldnf1(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u64)))
-svuint64_t svldnf1(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_u16)))
-svuint16_t svldnf1(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s8)))
-svint8_t svldnf1(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_f64)))
-svfloat64_t svldnf1(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_f32)))
-svfloat32_t svldnf1(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_f16)))
-svfloat16_t svldnf1(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s32)))
-svint32_t svldnf1(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s64)))
-svint64_t svldnf1(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_s16)))
-svint16_t svldnf1(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u8)))
-svuint8_t svldnf1_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u32)))
-svuint32_t svldnf1_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u64)))
-svuint64_t svldnf1_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_u16)))
-svuint16_t svldnf1_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s8)))
-svint8_t svldnf1_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_f64)))
-svfloat64_t svldnf1_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_f32)))
-svfloat32_t svldnf1_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_f16)))
-svfloat16_t svldnf1_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s32)))
-svint32_t svldnf1_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s64)))
-svint64_t svldnf1_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_s16)))
-svint16_t svldnf1_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base)))
-void svprfb_gather(svbool_t, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base)))
-void svprfb_gather(svbool_t, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base_offset)))
-void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base_offset)))
-void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s32offset)))
-void svprfb_gather_offset(svbool_t, void const *, svint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32offset)))
-void svprfb_gather_offset(svbool_t, void const *, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s64offset)))
-void svprfb_gather_offset(svbool_t, void const *, svint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64offset)))
-void svprfb_gather_offset(svbool_t, void const *, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base)))
-void svprfd_gather(svbool_t, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base)))
-void svprfd_gather(svbool_t, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base_index)))
-void svprfd_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base_index)))
-void svprfd_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s32index)))
-void svprfd_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32index)))
-void svprfd_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s64index)))
-void svprfd_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64index)))
-void svprfd_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base)))
-void svprfh_gather(svbool_t, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base)))
-void svprfh_gather(svbool_t, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base_index)))
-void svprfh_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base_index)))
-void svprfh_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s32index)))
-void svprfh_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32index)))
-void svprfh_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s64index)))
-void svprfh_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64index)))
-void svprfh_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base)))
-void svprfw_gather(svbool_t, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base)))
-void svprfw_gather(svbool_t, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base_index)))
-void svprfw_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base_index)))
-void svprfw_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s32index)))
-void svprfw_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32index)))
-void svprfw_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s64index)))
-void svprfw_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64index)))
-void svprfw_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_index_u32)))
-void svst1_scatter_index(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_index_u64)))
-void svst1_scatter_index(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_index_f64)))
-void svst1_scatter_index(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_index_f32)))
-void svst1_scatter_index(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_index_s32)))
-void svst1_scatter_index(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_index_s64)))
-void svst1_scatter_index(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_offset_u32)))
-void svst1_scatter_offset(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_offset_u64)))
-void svst1_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_offset_f64)))
-void svst1_scatter_offset(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_offset_f32)))
-void svst1_scatter_offset(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_offset_s32)))
-void svst1_scatter_offset(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_offset_s64)))
-void svst1_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_u32)))
-void svst1_scatter(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_u64)))
-void svst1_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_f64)))
-void svst1_scatter(svbool_t, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_f32)))
-void svst1_scatter(svbool_t, svuint32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32base_s32)))
-void svst1_scatter(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64base_s64)))
-void svst1_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32index_u32)))
-void svst1_scatter_index(svbool_t, uint32_t *, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32index_f32)))
-void svst1_scatter_index(svbool_t, float32_t *, svint32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32index_s32)))
-void svst1_scatter_index(svbool_t, int32_t *, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32index_u32)))
-void svst1_scatter_index(svbool_t, uint32_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32index_f32)))
-void svst1_scatter_index(svbool_t, float32_t *, svuint32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32index_s32)))
-void svst1_scatter_index(svbool_t, int32_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64index_u64)))
-void svst1_scatter_index(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64index_f64)))
-void svst1_scatter_index(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64index_s64)))
-void svst1_scatter_index(svbool_t, int64_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64index_u64)))
-void svst1_scatter_index(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64index_f64)))
-void svst1_scatter_index(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64index_s64)))
-void svst1_scatter_index(svbool_t, int64_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32offset_u32)))
-void svst1_scatter_offset(svbool_t, uint32_t *, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32offset_f32)))
-void svst1_scatter_offset(svbool_t, float32_t *, svint32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s32offset_s32)))
-void svst1_scatter_offset(svbool_t, int32_t *, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32offset_u32)))
-void svst1_scatter_offset(svbool_t, uint32_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32offset_f32)))
-void svst1_scatter_offset(svbool_t, float32_t *, svuint32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u32offset_s32)))
-void svst1_scatter_offset(svbool_t, int32_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64offset_u64)))
-void svst1_scatter_offset(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64offset_f64)))
-void svst1_scatter_offset(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_s64offset_s64)))
-void svst1_scatter_offset(svbool_t, int64_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64offset_u64)))
-void svst1_scatter_offset(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64offset_f64)))
-void svst1_scatter_offset(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_scatter_u64offset_s64)))
-void svst1_scatter_offset(svbool_t, int64_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_offset_u32)))
-void svst1b_scatter_offset(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_offset_u64)))
-void svst1b_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_offset_s32)))
-void svst1b_scatter_offset(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_offset_s64)))
-void svst1b_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_u32)))
-void svst1b_scatter(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_u64)))
-void svst1b_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32base_s32)))
-void svst1b_scatter(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64base_s64)))
-void svst1b_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s32offset_s32)))
-void svst1b_scatter_offset(svbool_t, int8_t *, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s32offset_u32)))
-void svst1b_scatter_offset(svbool_t, uint8_t *, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32offset_s32)))
-void svst1b_scatter_offset(svbool_t, int8_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u32offset_u32)))
-void svst1b_scatter_offset(svbool_t, uint8_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s64offset_s64)))
-void svst1b_scatter_offset(svbool_t, int8_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_s64offset_u64)))
-void svst1b_scatter_offset(svbool_t, uint8_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64offset_s64)))
-void svst1b_scatter_offset(svbool_t, int8_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_scatter_u64offset_u64)))
-void svst1b_scatter_offset(svbool_t, uint8_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_index_u32)))
-void svst1h_scatter_index(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_index_u64)))
-void svst1h_scatter_index(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_index_s32)))
-void svst1h_scatter_index(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_index_s64)))
-void svst1h_scatter_index(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_offset_u32)))
-void svst1h_scatter_offset(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_offset_u64)))
-void svst1h_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_offset_s32)))
-void svst1h_scatter_offset(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_offset_s64)))
-void svst1h_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_u32)))
-void svst1h_scatter(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_u64)))
-void svst1h_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32base_s32)))
-void svst1h_scatter(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64base_s64)))
-void svst1h_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32index_s32)))
-void svst1h_scatter_index(svbool_t, int16_t *, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32index_u32)))
-void svst1h_scatter_index(svbool_t, uint16_t *, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32index_s32)))
-void svst1h_scatter_index(svbool_t, int16_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32index_u32)))
-void svst1h_scatter_index(svbool_t, uint16_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64index_s64)))
-void svst1h_scatter_index(svbool_t, int16_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64index_u64)))
-void svst1h_scatter_index(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64index_s64)))
-void svst1h_scatter_index(svbool_t, int16_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64index_u64)))
-void svst1h_scatter_index(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32offset_s32)))
-void svst1h_scatter_offset(svbool_t, int16_t *, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s32offset_u32)))
-void svst1h_scatter_offset(svbool_t, uint16_t *, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32offset_s32)))
-void svst1h_scatter_offset(svbool_t, int16_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u32offset_u32)))
-void svst1h_scatter_offset(svbool_t, uint16_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64offset_s64)))
-void svst1h_scatter_offset(svbool_t, int16_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_s64offset_u64)))
-void svst1h_scatter_offset(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64offset_s64)))
-void svst1h_scatter_offset(svbool_t, int16_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_scatter_u64offset_u64)))
-void svst1h_scatter_offset(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_index_u64)))
-void svst1w_scatter_index(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_index_s64)))
-void svst1w_scatter_index(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_offset_u64)))
-void svst1w_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_offset_s64)))
-void svst1w_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_u64)))
-void svst1w_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64base_s64)))
-void svst1w_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64index_s64)))
-void svst1w_scatter_index(svbool_t, int32_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64index_u64)))
-void svst1w_scatter_index(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64index_s64)))
-void svst1w_scatter_index(svbool_t, int32_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64index_u64)))
-void svst1w_scatter_index(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64offset_s64)))
-void svst1w_scatter_offset(svbool_t, int32_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_s64offset_u64)))
-void svst1w_scatter_offset(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64offset_s64)))
-void svst1w_scatter_offset(svbool_t, int32_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_scatter_u64offset_u64)))
-void svst1w_scatter_offset(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtmad_f64)))
-svfloat64_t svtmad(svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtmad_f32)))
-svfloat32_t svtmad(svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtmad_f16)))
-svfloat16_t svtmad(svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f64)))
-svfloat64_t svtsmul(svfloat64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f32)))
-svfloat32_t svtsmul(svfloat32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f16)))
-svfloat16_t svtsmul(svfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f64)))
-svfloat64_t svtssel(svfloat64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f32)))
-svfloat32_t svtssel(svfloat32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f16)))
-svfloat16_t svtssel(svfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmmla_f32)))
-svfloat32_t svbfmmla_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_bf16)))
-svbfloat16_t svldff1_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_bf16)))
-svbfloat16_t svldff1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_bf16)))
-svbfloat16_t svldnf1_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_bf16)))
-svbfloat16_t svldnf1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmmla_f32)))
-svfloat32_t svbfmmla(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_bf16)))
-svbfloat16_t svldff1(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldff1_vnum_bf16)))
-svbfloat16_t svldff1_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_bf16)))
-svbfloat16_t svldnf1(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnf1_vnum_bf16)))
-svbfloat16_t svldnf1_vnum(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_bf16)))
-svbfloat16_t svtrn1q_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_bf16)))
-svbfloat16_t svtrn2q_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_bf16)))
-svbfloat16_t svuzp1q_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_bf16)))
-svbfloat16_t svuzp2q_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_bf16)))
-svbfloat16_t svzip1q_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_bf16)))
-svbfloat16_t svzip2q_bf16(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_bf16)))
-svbfloat16_t svtrn1q(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_bf16)))
-svbfloat16_t svtrn2q(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_bf16)))
-svbfloat16_t svuzp1q(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_bf16)))
-svbfloat16_t svuzp2q(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_bf16)))
-svbfloat16_t svzip1q(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_bf16)))
-svbfloat16_t svzip2q(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_n_f32)))
-svfloat32_t svbfdot_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_f32)))
-svfloat32_t svbfdot_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_lane_f32)))
-svfloat32_t svbfdot_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_n_f32)))
-svfloat32_t svbfmlalb_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_f32)))
-svfloat32_t svbfmlalb_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_lane_f32)))
-svfloat32_t svbfmlalb_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_n_f32)))
-svfloat32_t svbfmlalt_n_f32(svfloat32_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_f32)))
-svfloat32_t svbfmlalt_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_lane_f32)))
-svfloat32_t svbfmlalt_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_bf16)))
-bfloat16_t svclasta_n_bf16(svbool_t, bfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_bf16)))
-svbfloat16_t svclasta_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_bf16)))
-bfloat16_t svclastb_n_bf16(svbool_t, bfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_bf16)))
-svbfloat16_t svclastb_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_m)))
-svuint16_t svcnt_bf16_m(svuint16_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_x)))
-svuint16_t svcnt_bf16_x(svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_z)))
-svuint16_t svcnt_bf16_z(svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_bf16)))
-svbfloat16x2_t svcreate2_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_bf16)))
-svbfloat16x3_t svcreate3_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_bf16)))
-svbfloat16x4_t svcreate4_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_m)))
-svbfloat16_t svcvt_bf16_f32_m(svbfloat16_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x)))
-svbfloat16_t svcvt_bf16_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_z)))
-svbfloat16_t svcvt_bf16_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_bf16_f32_m)))
-svbfloat16_t svcvtnt_bf16_f32_m(svbfloat16_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16)))
-svbfloat16_t svdup_n_bf16(bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_m)))
-svbfloat16_t svdup_n_bf16_m(svbfloat16_t, svbool_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_x)))
-svbfloat16_t svdup_n_bf16_x(svbool_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_z)))
-svbfloat16_t svdup_n_bf16_z(svbool_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_bf16)))
-svbfloat16_t svdup_lane_bf16(svbfloat16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_bf16)))
-svbfloat16_t svdupq_n_bf16(bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_bf16)))
-svbfloat16_t svdupq_lane_bf16(svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_bf16)))
-svbfloat16_t svext_bf16(svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_bf16)))
-svbfloat16_t svget2_bf16(svbfloat16x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_bf16)))
-svbfloat16_t svget3_bf16(svbfloat16x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_bf16)))
-svbfloat16_t svget4_bf16(svbfloat16x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_bf16)))
-svbfloat16_t svinsr_n_bf16(svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_bf16)))
-bfloat16_t svlasta_bf16(svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_bf16)))
-bfloat16_t svlastb_bf16(svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16)))
-svbfloat16_t svld1_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16)))
-svbfloat16_t svld1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_bf16)))
-svbfloat16_t svld1rq_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_bf16)))
-svbfloat16x2_t svld2_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_bf16)))
-svbfloat16x2_t svld2_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_bf16)))
-svbfloat16x3_t svld3_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_bf16)))
-svbfloat16x3_t svld3_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_bf16)))
-svbfloat16x4_t svld4_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_bf16)))
-svbfloat16x4_t svld4_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16)))
-svbfloat16_t svldnt1_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16)))
-svbfloat16_t svldnt1_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_bf16)))
-uint64_t svlen_bf16(svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_bf16)))
-svbfloat16_t svrev_bf16(svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16)))
-svbfloat16_t svsel_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_bf16)))
-svbfloat16x2_t svset2_bf16(svbfloat16x2_t, uint64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_bf16)))
-svbfloat16x3_t svset3_bf16(svbfloat16x3_t, uint64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_bf16)))
-svbfloat16x4_t svset4_bf16(svbfloat16x4_t, uint64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_bf16)))
-svbfloat16_t svsplice_bf16(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16)))
-void svst1_bf16(svbool_t, bfloat16_t *, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16)))
-void svst1_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_bf16)))
-void svst2_bf16(svbool_t, bfloat16_t *, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_bf16)))
-void svst2_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_bf16)))
-void svst3_bf16(svbool_t, bfloat16_t *, svbfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_bf16)))
-void svst3_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_bf16)))
-void svst4_bf16(svbool_t, bfloat16_t *, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_bf16)))
-void svst4_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16)))
-void svstnt1_bf16(svbool_t, bfloat16_t *, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16)))
-void svstnt1_vnum_bf16(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_bf16)))
-svbfloat16_t svtbl_bf16(svbfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_bf16)))
-svbfloat16_t svtrn1_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
-svbfloat16_t svtrn2_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_bf16)))
-svbfloat16x2_t svundef2_bf16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_bf16)))
-svbfloat16x3_t svundef3_bf16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_bf16)))
-svbfloat16x4_t svundef4_bf16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_bf16)))
-svbfloat16_t svundef_bf16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
-svbfloat16_t svuzp1_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
-svbfloat16_t svuzp2_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_bf16)))
-svbfloat16_t svzip1_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_bf16)))
-svbfloat16_t svzip2_bf16(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_n_f32)))
-svfloat32_t svbfdot(svfloat32_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_f32)))
-svfloat32_t svbfdot(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfdot_lane_f32)))
-svfloat32_t svbfdot_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_n_f32)))
-svfloat32_t svbfmlalb(svfloat32_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_f32)))
-svfloat32_t svbfmlalb(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalb_lane_f32)))
-svfloat32_t svbfmlalb_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_n_f32)))
-svfloat32_t svbfmlalt(svfloat32_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_f32)))
-svfloat32_t svbfmlalt(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlalt_lane_f32)))
-svfloat32_t svbfmlalt_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_bf16)))
-bfloat16_t svclasta(svbool_t, bfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_bf16)))
-svbfloat16_t svclasta(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_bf16)))
-bfloat16_t svclastb(svbool_t, bfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_bf16)))
-svbfloat16_t svclastb(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_m)))
-svuint16_t svcnt_m(svuint16_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_x)))
-svuint16_t svcnt_x(svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_bf16_z)))
-svuint16_t svcnt_z(svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_bf16)))
-svbfloat16x2_t svcreate2(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_bf16)))
-svbfloat16x3_t svcreate3(svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_bf16)))
-svbfloat16x4_t svcreate4(svbfloat16_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_m)))
-svbfloat16_t svcvt_bf16_m(svbfloat16_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_x)))
-svbfloat16_t svcvt_bf16_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_bf16_f32_z)))
-svbfloat16_t svcvt_bf16_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_bf16_f32_m)))
-svbfloat16_t svcvtnt_bf16_m(svbfloat16_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16)))
-svbfloat16_t svdup_bf16(bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_m)))
-svbfloat16_t svdup_bf16_m(svbfloat16_t, svbool_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_x)))
-svbfloat16_t svdup_bf16_x(svbool_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_bf16_z)))
-svbfloat16_t svdup_bf16_z(svbool_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_bf16)))
-svbfloat16_t svdup_lane(svbfloat16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_bf16)))
-svbfloat16_t svdupq_bf16(bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_bf16)))
-svbfloat16_t svdupq_lane(svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_bf16)))
-svbfloat16_t svext(svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_bf16)))
-svbfloat16_t svget2(svbfloat16x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_bf16)))
-svbfloat16_t svget3(svbfloat16x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_bf16)))
-svbfloat16_t svget4(svbfloat16x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_bf16)))
-svbfloat16_t svinsr(svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_bf16)))
-bfloat16_t svlasta(svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_bf16)))
-bfloat16_t svlastb(svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16)))
-svbfloat16_t svld1(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16)))
-svbfloat16_t svld1_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_bf16)))
-svbfloat16_t svld1rq(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_bf16)))
-svbfloat16x2_t svld2(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_bf16)))
-svbfloat16x2_t svld2_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_bf16)))
-svbfloat16x3_t svld3(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_bf16)))
-svbfloat16x3_t svld3_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_bf16)))
-svbfloat16x4_t svld4(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_bf16)))
-svbfloat16x4_t svld4_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16)))
-svbfloat16_t svldnt1(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16)))
-svbfloat16_t svldnt1_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_bf16)))
-uint64_t svlen(svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_bf16)))
-svbfloat16_t svrev(svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_bf16)))
-svbfloat16_t svsel(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_bf16)))
-svbfloat16x2_t svset2(svbfloat16x2_t, uint64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_bf16)))
-svbfloat16x3_t svset3(svbfloat16x3_t, uint64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_bf16)))
-svbfloat16x4_t svset4(svbfloat16x4_t, uint64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_bf16)))
-svbfloat16_t svsplice(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16)))
-void svst1(svbool_t, bfloat16_t *, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16)))
-void svst1_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_bf16)))
-void svst2(svbool_t, bfloat16_t *, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_bf16)))
-void svst2_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_bf16)))
-void svst3(svbool_t, bfloat16_t *, svbfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_bf16)))
-void svst3_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_bf16)))
-void svst4(svbool_t, bfloat16_t *, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_bf16)))
-void svst4_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16)))
-void svstnt1(svbool_t, bfloat16_t *, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16)))
-void svstnt1_vnum(svbool_t, bfloat16_t *, int64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_bf16)))
-svbfloat16_t svtbl(svbfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_bf16)))
-svbfloat16_t svtrn1(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
-svbfloat16_t svtrn2(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
-svbfloat16_t svuzp1(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
-svbfloat16_t svuzp2(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_bf16)))
-svbfloat16_t svzip1(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_bf16)))
-svbfloat16_t svzip2(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f32)))
-svfloat32_t svmmla_f32(svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f32)))
-svfloat32_t svmmla(svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u8)))
-svuint8_t svld1ro_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u32)))
-svuint32_t svld1ro_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u64)))
-svuint64_t svld1ro_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u16)))
-svuint16_t svld1ro_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s8)))
-svint8_t svld1ro_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f64)))
-svfloat64_t svld1ro_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f32)))
-svfloat32_t svld1ro_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f16)))
-svfloat16_t svld1ro_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s32)))
-svint32_t svld1ro_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s64)))
-svint64_t svld1ro_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s16)))
-svint16_t svld1ro_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f64)))
-svfloat64_t svmmla_f64(svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u8)))
-svuint8_t svtrn1q_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u32)))
-svuint32_t svtrn1q_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u64)))
-svuint64_t svtrn1q_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u16)))
-svuint16_t svtrn1q_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s8)))
-svint8_t svtrn1q_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f64)))
-svfloat64_t svtrn1q_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f32)))
-svfloat32_t svtrn1q_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f16)))
-svfloat16_t svtrn1q_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s32)))
-svint32_t svtrn1q_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s64)))
-svint64_t svtrn1q_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s16)))
-svint16_t svtrn1q_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u8)))
-svuint8_t svtrn2q_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u32)))
-svuint32_t svtrn2q_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u64)))
-svuint64_t svtrn2q_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u16)))
-svuint16_t svtrn2q_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s8)))
-svint8_t svtrn2q_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f64)))
-svfloat64_t svtrn2q_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f32)))
-svfloat32_t svtrn2q_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f16)))
-svfloat16_t svtrn2q_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s32)))
-svint32_t svtrn2q_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s64)))
-svint64_t svtrn2q_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s16)))
-svint16_t svtrn2q_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u8)))
-svuint8_t svuzp1q_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u32)))
-svuint32_t svuzp1q_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u64)))
-svuint64_t svuzp1q_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u16)))
-svuint16_t svuzp1q_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s8)))
-svint8_t svuzp1q_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f64)))
-svfloat64_t svuzp1q_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f32)))
-svfloat32_t svuzp1q_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f16)))
-svfloat16_t svuzp1q_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s32)))
-svint32_t svuzp1q_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s64)))
-svint64_t svuzp1q_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s16)))
-svint16_t svuzp1q_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u8)))
-svuint8_t svuzp2q_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u32)))
-svuint32_t svuzp2q_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u64)))
-svuint64_t svuzp2q_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u16)))
-svuint16_t svuzp2q_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s8)))
-svint8_t svuzp2q_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f64)))
-svfloat64_t svuzp2q_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f32)))
-svfloat32_t svuzp2q_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f16)))
-svfloat16_t svuzp2q_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s32)))
-svint32_t svuzp2q_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s64)))
-svint64_t svuzp2q_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s16)))
-svint16_t svuzp2q_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u8)))
-svuint8_t svzip1q_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u32)))
-svuint32_t svzip1q_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u64)))
-svuint64_t svzip1q_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u16)))
-svuint16_t svzip1q_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s8)))
-svint8_t svzip1q_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f64)))
-svfloat64_t svzip1q_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f32)))
-svfloat32_t svzip1q_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f16)))
-svfloat16_t svzip1q_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s32)))
-svint32_t svzip1q_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s64)))
-svint64_t svzip1q_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s16)))
-svint16_t svzip1q_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u8)))
-svuint8_t svzip2q_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u32)))
-svuint32_t svzip2q_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u64)))
-svuint64_t svzip2q_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u16)))
-svuint16_t svzip2q_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s8)))
-svint8_t svzip2q_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f64)))
-svfloat64_t svzip2q_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f32)))
-svfloat32_t svzip2q_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f16)))
-svfloat16_t svzip2q_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s32)))
-svint32_t svzip2q_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s64)))
-svint64_t svzip2q_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s16)))
-svint16_t svzip2q_s16(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u8)))
-svuint8_t svld1ro(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u32)))
-svuint32_t svld1ro(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u64)))
-svuint64_t svld1ro(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_u16)))
-svuint16_t svld1ro(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s8)))
-svint8_t svld1ro(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f64)))
-svfloat64_t svld1ro(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f32)))
-svfloat32_t svld1ro(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_f16)))
-svfloat16_t svld1ro(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s32)))
-svint32_t svld1ro(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s64)))
-svint64_t svld1ro(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_s16)))
-svint16_t svld1ro(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_f64)))
-svfloat64_t svmmla(svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u8)))
-svuint8_t svtrn1q(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u32)))
-svuint32_t svtrn1q(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u64)))
-svuint64_t svtrn1q(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_u16)))
-svuint16_t svtrn1q(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s8)))
-svint8_t svtrn1q(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f64)))
-svfloat64_t svtrn1q(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f32)))
-svfloat32_t svtrn1q(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_f16)))
-svfloat16_t svtrn1q(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s32)))
-svint32_t svtrn1q(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s64)))
-svint64_t svtrn1q(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1q_s16)))
-svint16_t svtrn1q(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u8)))
-svuint8_t svtrn2q(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u32)))
-svuint32_t svtrn2q(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u64)))
-svuint64_t svtrn2q(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_u16)))
-svuint16_t svtrn2q(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s8)))
-svint8_t svtrn2q(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f64)))
-svfloat64_t svtrn2q(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f32)))
-svfloat32_t svtrn2q(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_f16)))
-svfloat16_t svtrn2q(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s32)))
-svint32_t svtrn2q(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s64)))
-svint64_t svtrn2q(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2q_s16)))
-svint16_t svtrn2q(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u8)))
-svuint8_t svuzp1q(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u32)))
-svuint32_t svuzp1q(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u64)))
-svuint64_t svuzp1q(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_u16)))
-svuint16_t svuzp1q(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s8)))
-svint8_t svuzp1q(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f64)))
-svfloat64_t svuzp1q(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f32)))
-svfloat32_t svuzp1q(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_f16)))
-svfloat16_t svuzp1q(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s32)))
-svint32_t svuzp1q(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s64)))
-svint64_t svuzp1q(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1q_s16)))
-svint16_t svuzp1q(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u8)))
-svuint8_t svuzp2q(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u32)))
-svuint32_t svuzp2q(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u64)))
-svuint64_t svuzp2q(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_u16)))
-svuint16_t svuzp2q(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s8)))
-svint8_t svuzp2q(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f64)))
-svfloat64_t svuzp2q(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f32)))
-svfloat32_t svuzp2q(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_f16)))
-svfloat16_t svuzp2q(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s32)))
-svint32_t svuzp2q(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s64)))
-svint64_t svuzp2q(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2q_s16)))
-svint16_t svuzp2q(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u8)))
-svuint8_t svzip1q(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u32)))
-svuint32_t svzip1q(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u64)))
-svuint64_t svzip1q(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_u16)))
-svuint16_t svzip1q(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s8)))
-svint8_t svzip1q(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f64)))
-svfloat64_t svzip1q(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f32)))
-svfloat32_t svzip1q(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_f16)))
-svfloat16_t svzip1q(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s32)))
-svint32_t svzip1q(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s64)))
-svint64_t svzip1q(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1q_s16)))
-svint16_t svzip1q(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u8)))
-svuint8_t svzip2q(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u32)))
-svuint32_t svzip2q(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u64)))
-svuint64_t svzip2q(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_u16)))
-svuint16_t svzip2q(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s8)))
-svint8_t svzip2q(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f64)))
-svfloat64_t svzip2q(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f32)))
-svfloat32_t svzip2q(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_f16)))
-svfloat16_t svzip2q(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s32)))
-svint32_t svzip2q(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s64)))
-svint64_t svzip2q(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2q_s16)))
-svint16_t svzip2q(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_bf16)))
-svbfloat16_t svld1ro_bf16(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ro_bf16)))
-svbfloat16_t svld1ro(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_s32)))
-svint32_t svmmla_s32(svint32_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_u32)))
-svuint32_t svmmla_u32(svuint32_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusmmla_s32)))
-svint32_t svusmmla_s32(svint32_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_s32)))
-svint32_t svmmla(svint32_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmmla_u32)))
-svuint32_t svmmla(svuint32_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusmmla_s32)))
-svint32_t svusmmla(svint32_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_n_s32)))
-svint32_t svsudot_n_s32(svint32_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_s32)))
-svint32_t svsudot_s32(svint32_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_lane_s32)))
-svint32_t svsudot_lane_s32(svint32_t, svint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_n_s32)))
-svint32_t svusdot_n_s32(svint32_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_s32)))
-svint32_t svusdot_s32(svint32_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_lane_s32)))
-svint32_t svusdot_lane_s32(svint32_t, svuint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_n_s32)))
-svint32_t svsudot(svint32_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_s32)))
-svint32_t svsudot(svint32_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsudot_lane_s32)))
-svint32_t svsudot_lane(svint32_t, svint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_n_s32)))
-svint32_t svusdot(svint32_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_s32)))
-svint32_t svusdot(svint32_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svusdot_lane_s32)))
-svint32_t svusdot_lane(svint32_t, svuint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_u32_z)))
-svuint32_t svhistcnt_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_u64_z)))
-svuint64_t svhistcnt_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_s32_z)))
-svuint32_t svhistcnt_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_s64_z)))
-svuint64_t svhistcnt_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistseg_u8)))
-svuint8_t svhistseg_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistseg_s8)))
-svuint8_t svhistseg_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_index_u32)))
-svuint32_t svldnt1_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_index_u64)))
-svuint64_t svldnt1_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_index_f64)))
-svfloat64_t svldnt1_gather_u64base_index_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_index_f32)))
-svfloat32_t svldnt1_gather_u32base_index_f32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_index_s32)))
-svint32_t svldnt1_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_index_s64)))
-svint64_t svldnt1_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_offset_u32)))
-svuint32_t svldnt1_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_offset_u64)))
-svuint64_t svldnt1_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_offset_f64)))
-svfloat64_t svldnt1_gather_u64base_offset_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_offset_f32)))
-svfloat32_t svldnt1_gather_u32base_offset_f32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_offset_s32)))
-svint32_t svldnt1_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_offset_s64)))
-svint64_t svldnt1_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_u32)))
-svuint32_t svldnt1_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_u64)))
-svuint64_t svldnt1_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_f64)))
-svfloat64_t svldnt1_gather_u64base_f64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_f32)))
-svfloat32_t svldnt1_gather_u32base_f32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_s32)))
-svint32_t svldnt1_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_s64)))
-svint64_t svldnt1_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64index_u64)))
-svuint64_t svldnt1_gather_s64index_u64(svbool_t, uint64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64index_f64)))
-svfloat64_t svldnt1_gather_s64index_f64(svbool_t, float64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64index_s64)))
-svint64_t svldnt1_gather_s64index_s64(svbool_t, int64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64index_u64)))
-svuint64_t svldnt1_gather_u64index_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64index_f64)))
-svfloat64_t svldnt1_gather_u64index_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64index_s64)))
-svint64_t svldnt1_gather_u64index_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32offset_u32)))
-svuint32_t svldnt1_gather_u32offset_u32(svbool_t, uint32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32offset_f32)))
-svfloat32_t svldnt1_gather_u32offset_f32(svbool_t, float32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32offset_s32)))
-svint32_t svldnt1_gather_u32offset_s32(svbool_t, int32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64offset_u64)))
-svuint64_t svldnt1_gather_s64offset_u64(svbool_t, uint64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64offset_f64)))
-svfloat64_t svldnt1_gather_s64offset_f64(svbool_t, float64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64offset_s64)))
-svint64_t svldnt1_gather_s64offset_s64(svbool_t, int64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64offset_u64)))
-svuint64_t svldnt1_gather_u64offset_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64offset_f64)))
-svfloat64_t svldnt1_gather_u64offset_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64offset_s64)))
-svint64_t svldnt1_gather_u64offset_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_offset_u32)))
-svuint32_t svldnt1sb_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_offset_u64)))
-svuint64_t svldnt1sb_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_offset_s32)))
-svint32_t svldnt1sb_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_offset_s64)))
-svint64_t svldnt1sb_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_u32)))
-svuint32_t svldnt1sb_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_u64)))
-svuint64_t svldnt1sb_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_s32)))
-svint32_t svldnt1sb_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_s64)))
-svint64_t svldnt1sb_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32offset_u32)))
-svuint32_t svldnt1sb_gather_u32offset_u32(svbool_t, int8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32offset_s32)))
-svint32_t svldnt1sb_gather_u32offset_s32(svbool_t, int8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_s64offset_u64)))
-svuint64_t svldnt1sb_gather_s64offset_u64(svbool_t, int8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_s64offset_s64)))
-svint64_t svldnt1sb_gather_s64offset_s64(svbool_t, int8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64offset_u64)))
-svuint64_t svldnt1sb_gather_u64offset_u64(svbool_t, int8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64offset_s64)))
-svint64_t svldnt1sb_gather_u64offset_s64(svbool_t, int8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_index_u32)))
-svuint32_t svldnt1sh_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_index_u64)))
-svuint64_t svldnt1sh_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_index_s32)))
-svint32_t svldnt1sh_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_index_s64)))
-svint64_t svldnt1sh_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_offset_u32)))
-svuint32_t svldnt1sh_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_offset_u64)))
-svuint64_t svldnt1sh_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_offset_s32)))
-svint32_t svldnt1sh_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_offset_s64)))
-svint64_t svldnt1sh_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_u32)))
-svuint32_t svldnt1sh_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_u64)))
-svuint64_t svldnt1sh_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_s32)))
-svint32_t svldnt1sh_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_s64)))
-svint64_t svldnt1sh_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64index_u64)))
-svuint64_t svldnt1sh_gather_s64index_u64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64index_s64)))
-svint64_t svldnt1sh_gather_s64index_s64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64index_u64)))
-svuint64_t svldnt1sh_gather_u64index_u64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64index_s64)))
-svint64_t svldnt1sh_gather_u64index_s64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32offset_u32)))
-svuint32_t svldnt1sh_gather_u32offset_u32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32offset_s32)))
-svint32_t svldnt1sh_gather_u32offset_s32(svbool_t, int16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64offset_u64)))
-svuint64_t svldnt1sh_gather_s64offset_u64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64offset_s64)))
-svint64_t svldnt1sh_gather_s64offset_s64(svbool_t, int16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64offset_u64)))
-svuint64_t svldnt1sh_gather_u64offset_u64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64offset_s64)))
-svint64_t svldnt1sh_gather_u64offset_s64(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_index_u64)))
-svuint64_t svldnt1sw_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_index_s64)))
-svint64_t svldnt1sw_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_offset_u64)))
-svuint64_t svldnt1sw_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_offset_s64)))
-svint64_t svldnt1sw_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_u64)))
-svuint64_t svldnt1sw_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_s64)))
-svint64_t svldnt1sw_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64index_u64)))
-svuint64_t svldnt1sw_gather_s64index_u64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64index_s64)))
-svint64_t svldnt1sw_gather_s64index_s64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64index_u64)))
-svuint64_t svldnt1sw_gather_u64index_u64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64index_s64)))
-svint64_t svldnt1sw_gather_u64index_s64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64offset_u64)))
-svuint64_t svldnt1sw_gather_s64offset_u64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64offset_s64)))
-svint64_t svldnt1sw_gather_s64offset_s64(svbool_t, int32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64offset_u64)))
-svuint64_t svldnt1sw_gather_u64offset_u64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64offset_s64)))
-svint64_t svldnt1sw_gather_u64offset_s64(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_offset_u32)))
-svuint32_t svldnt1ub_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_offset_u64)))
-svuint64_t svldnt1ub_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_offset_s32)))
-svint32_t svldnt1ub_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_offset_s64)))
-svint64_t svldnt1ub_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_u32)))
-svuint32_t svldnt1ub_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_u64)))
-svuint64_t svldnt1ub_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_s32)))
-svint32_t svldnt1ub_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_s64)))
-svint64_t svldnt1ub_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32offset_u32)))
-svuint32_t svldnt1ub_gather_u32offset_u32(svbool_t, uint8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32offset_s32)))
-svint32_t svldnt1ub_gather_u32offset_s32(svbool_t, uint8_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_s64offset_u64)))
-svuint64_t svldnt1ub_gather_s64offset_u64(svbool_t, uint8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_s64offset_s64)))
-svint64_t svldnt1ub_gather_s64offset_s64(svbool_t, uint8_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64offset_u64)))
-svuint64_t svldnt1ub_gather_u64offset_u64(svbool_t, uint8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64offset_s64)))
-svint64_t svldnt1ub_gather_u64offset_s64(svbool_t, uint8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_index_u32)))
-svuint32_t svldnt1uh_gather_u32base_index_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_index_u64)))
-svuint64_t svldnt1uh_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_index_s32)))
-svint32_t svldnt1uh_gather_u32base_index_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_index_s64)))
-svint64_t svldnt1uh_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_offset_u32)))
-svuint32_t svldnt1uh_gather_u32base_offset_u32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_offset_u64)))
-svuint64_t svldnt1uh_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_offset_s32)))
-svint32_t svldnt1uh_gather_u32base_offset_s32(svbool_t, svuint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_offset_s64)))
-svint64_t svldnt1uh_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_u32)))
-svuint32_t svldnt1uh_gather_u32base_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_u64)))
-svuint64_t svldnt1uh_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_s32)))
-svint32_t svldnt1uh_gather_u32base_s32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_s64)))
-svint64_t svldnt1uh_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64index_u64)))
-svuint64_t svldnt1uh_gather_s64index_u64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64index_s64)))
-svint64_t svldnt1uh_gather_s64index_s64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64index_u64)))
-svuint64_t svldnt1uh_gather_u64index_u64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64index_s64)))
-svint64_t svldnt1uh_gather_u64index_s64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32offset_u32)))
-svuint32_t svldnt1uh_gather_u32offset_u32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32offset_s32)))
-svint32_t svldnt1uh_gather_u32offset_s32(svbool_t, uint16_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64offset_u64)))
-svuint64_t svldnt1uh_gather_s64offset_u64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64offset_s64)))
-svint64_t svldnt1uh_gather_s64offset_s64(svbool_t, uint16_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64offset_u64)))
-svuint64_t svldnt1uh_gather_u64offset_u64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64offset_s64)))
-svint64_t svldnt1uh_gather_u64offset_s64(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_index_u64)))
-svuint64_t svldnt1uw_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_index_s64)))
-svint64_t svldnt1uw_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_offset_u64)))
-svuint64_t svldnt1uw_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_offset_s64)))
-svint64_t svldnt1uw_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_u64)))
-svuint64_t svldnt1uw_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_s64)))
-svint64_t svldnt1uw_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64index_u64)))
-svuint64_t svldnt1uw_gather_s64index_u64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64index_s64)))
-svint64_t svldnt1uw_gather_s64index_s64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64index_u64)))
-svuint64_t svldnt1uw_gather_u64index_u64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64index_s64)))
-svint64_t svldnt1uw_gather_u64index_s64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64offset_u64)))
-svuint64_t svldnt1uw_gather_s64offset_u64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64offset_s64)))
-svint64_t svldnt1uw_gather_s64offset_s64(svbool_t, uint32_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64offset_u64)))
-svuint64_t svldnt1uw_gather_u64offset_u64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64offset_s64)))
-svint64_t svldnt1uw_gather_u64offset_s64(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_u8)))
-svbool_t svmatch_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_u16)))
-svbool_t svmatch_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_s8)))
-svbool_t svmatch_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_s16)))
-svbool_t svmatch_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_u8)))
-svbool_t svnmatch_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_u16)))
-svbool_t svnmatch_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_s8)))
-svbool_t svnmatch_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_s16)))
-svbool_t svnmatch_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_index_u32)))
-void svstnt1_scatter_u32base_index_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_index_u64)))
-void svstnt1_scatter_u64base_index_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_index_f64)))
-void svstnt1_scatter_u64base_index_f64(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_index_f32)))
-void svstnt1_scatter_u32base_index_f32(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_index_s32)))
-void svstnt1_scatter_u32base_index_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_index_s64)))
-void svstnt1_scatter_u64base_index_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_offset_u32)))
-void svstnt1_scatter_u32base_offset_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_offset_u64)))
-void svstnt1_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_offset_f64)))
-void svstnt1_scatter_u64base_offset_f64(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_offset_f32)))
-void svstnt1_scatter_u32base_offset_f32(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_offset_s32)))
-void svstnt1_scatter_u32base_offset_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_offset_s64)))
-void svstnt1_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_u32)))
-void svstnt1_scatter_u32base_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_u64)))
-void svstnt1_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_f64)))
-void svstnt1_scatter_u64base_f64(svbool_t, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_f32)))
-void svstnt1_scatter_u32base_f32(svbool_t, svuint32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_s32)))
-void svstnt1_scatter_u32base_s32(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_s64)))
-void svstnt1_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64index_u64)))
-void svstnt1_scatter_s64index_u64(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64index_f64)))
-void svstnt1_scatter_s64index_f64(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64index_s64)))
-void svstnt1_scatter_s64index_s64(svbool_t, int64_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64index_u64)))
-void svstnt1_scatter_u64index_u64(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64index_f64)))
-void svstnt1_scatter_u64index_f64(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64index_s64)))
-void svstnt1_scatter_u64index_s64(svbool_t, int64_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32offset_u32)))
-void svstnt1_scatter_u32offset_u32(svbool_t, uint32_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32offset_f32)))
-void svstnt1_scatter_u32offset_f32(svbool_t, float32_t *, svuint32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32offset_s32)))
-void svstnt1_scatter_u32offset_s32(svbool_t, int32_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64offset_u64)))
-void svstnt1_scatter_s64offset_u64(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64offset_f64)))
-void svstnt1_scatter_s64offset_f64(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64offset_s64)))
-void svstnt1_scatter_s64offset_s64(svbool_t, int64_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64offset_u64)))
-void svstnt1_scatter_u64offset_u64(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64offset_f64)))
-void svstnt1_scatter_u64offset_f64(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64offset_s64)))
-void svstnt1_scatter_u64offset_s64(svbool_t, int64_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_offset_u32)))
-void svstnt1b_scatter_u32base_offset_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_offset_u64)))
-void svstnt1b_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_offset_s32)))
-void svstnt1b_scatter_u32base_offset_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_offset_s64)))
-void svstnt1b_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_u32)))
-void svstnt1b_scatter_u32base_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_u64)))
-void svstnt1b_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_s32)))
-void svstnt1b_scatter_u32base_s32(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_s64)))
-void svstnt1b_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32offset_s32)))
-void svstnt1b_scatter_u32offset_s32(svbool_t, int8_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32offset_u32)))
-void svstnt1b_scatter_u32offset_u32(svbool_t, uint8_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_s64offset_s64)))
-void svstnt1b_scatter_s64offset_s64(svbool_t, int8_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_s64offset_u64)))
-void svstnt1b_scatter_s64offset_u64(svbool_t, uint8_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64offset_s64)))
-void svstnt1b_scatter_u64offset_s64(svbool_t, int8_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64offset_u64)))
-void svstnt1b_scatter_u64offset_u64(svbool_t, uint8_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_index_u32)))
-void svstnt1h_scatter_u32base_index_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_index_u64)))
-void svstnt1h_scatter_u64base_index_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_index_s32)))
-void svstnt1h_scatter_u32base_index_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_index_s64)))
-void svstnt1h_scatter_u64base_index_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_offset_u32)))
-void svstnt1h_scatter_u32base_offset_u32(svbool_t, svuint32_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_offset_u64)))
-void svstnt1h_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_offset_s32)))
-void svstnt1h_scatter_u32base_offset_s32(svbool_t, svuint32_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_offset_s64)))
-void svstnt1h_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_u32)))
-void svstnt1h_scatter_u32base_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_u64)))
-void svstnt1h_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_s32)))
-void svstnt1h_scatter_u32base_s32(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_s64)))
-void svstnt1h_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64index_s64)))
-void svstnt1h_scatter_s64index_s64(svbool_t, int16_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64index_u64)))
-void svstnt1h_scatter_s64index_u64(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64index_s64)))
-void svstnt1h_scatter_u64index_s64(svbool_t, int16_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64index_u64)))
-void svstnt1h_scatter_u64index_u64(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32offset_s32)))
-void svstnt1h_scatter_u32offset_s32(svbool_t, int16_t *, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32offset_u32)))
-void svstnt1h_scatter_u32offset_u32(svbool_t, uint16_t *, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64offset_s64)))
-void svstnt1h_scatter_s64offset_s64(svbool_t, int16_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64offset_u64)))
-void svstnt1h_scatter_s64offset_u64(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64offset_s64)))
-void svstnt1h_scatter_u64offset_s64(svbool_t, int16_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64offset_u64)))
-void svstnt1h_scatter_u64offset_u64(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_index_u64)))
-void svstnt1w_scatter_u64base_index_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_index_s64)))
-void svstnt1w_scatter_u64base_index_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_offset_u64)))
-void svstnt1w_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_offset_s64)))
-void svstnt1w_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_u64)))
-void svstnt1w_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_s64)))
-void svstnt1w_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64index_s64)))
-void svstnt1w_scatter_s64index_s64(svbool_t, int32_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64index_u64)))
-void svstnt1w_scatter_s64index_u64(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64index_s64)))
-void svstnt1w_scatter_u64index_s64(svbool_t, int32_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64index_u64)))
-void svstnt1w_scatter_u64index_u64(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64offset_s64)))
-void svstnt1w_scatter_s64offset_s64(svbool_t, int32_t *, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64offset_u64)))
-void svstnt1w_scatter_s64offset_u64(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64offset_s64)))
-void svstnt1w_scatter_u64offset_s64(svbool_t, int32_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64offset_u64)))
-void svstnt1w_scatter_u64offset_u64(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_u32_z)))
-svuint32_t svhistcnt_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_u64_z)))
-svuint64_t svhistcnt_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_s32_z)))
-svuint32_t svhistcnt_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistcnt_s64_z)))
-svuint64_t svhistcnt_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistseg_u8)))
-svuint8_t svhistseg(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhistseg_s8)))
-svuint8_t svhistseg(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_index_u32)))
-svuint32_t svldnt1_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_index_u64)))
-svuint64_t svldnt1_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_index_f64)))
-svfloat64_t svldnt1_gather_index_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_index_f32)))
-svfloat32_t svldnt1_gather_index_f32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_index_s32)))
-svint32_t svldnt1_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_index_s64)))
-svint64_t svldnt1_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_offset_u32)))
-svuint32_t svldnt1_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_offset_u64)))
-svuint64_t svldnt1_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_offset_f64)))
-svfloat64_t svldnt1_gather_offset_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_offset_f32)))
-svfloat32_t svldnt1_gather_offset_f32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_offset_s32)))
-svint32_t svldnt1_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_offset_s64)))
-svint64_t svldnt1_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_u32)))
-svuint32_t svldnt1_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_u64)))
-svuint64_t svldnt1_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_f64)))
-svfloat64_t svldnt1_gather_f64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_f32)))
-svfloat32_t svldnt1_gather_f32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32base_s32)))
-svint32_t svldnt1_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64base_s64)))
-svint64_t svldnt1_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64index_u64)))
-svuint64_t svldnt1_gather_index(svbool_t, uint64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64index_f64)))
-svfloat64_t svldnt1_gather_index(svbool_t, float64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64index_s64)))
-svint64_t svldnt1_gather_index(svbool_t, int64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64index_u64)))
-svuint64_t svldnt1_gather_index(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64index_f64)))
-svfloat64_t svldnt1_gather_index(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64index_s64)))
-svint64_t svldnt1_gather_index(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32offset_u32)))
-svuint32_t svldnt1_gather_offset(svbool_t, uint32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32offset_f32)))
-svfloat32_t svldnt1_gather_offset(svbool_t, float32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u32offset_s32)))
-svint32_t svldnt1_gather_offset(svbool_t, int32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64offset_u64)))
-svuint64_t svldnt1_gather_offset(svbool_t, uint64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64offset_f64)))
-svfloat64_t svldnt1_gather_offset(svbool_t, float64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_s64offset_s64)))
-svint64_t svldnt1_gather_offset(svbool_t, int64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64offset_u64)))
-svuint64_t svldnt1_gather_offset(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64offset_f64)))
-svfloat64_t svldnt1_gather_offset(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_gather_u64offset_s64)))
-svint64_t svldnt1_gather_offset(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_offset_u32)))
-svuint32_t svldnt1sb_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_offset_u64)))
-svuint64_t svldnt1sb_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_offset_s32)))
-svint32_t svldnt1sb_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_offset_s64)))
-svint64_t svldnt1sb_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_u32)))
-svuint32_t svldnt1sb_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_u64)))
-svuint64_t svldnt1sb_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32base_s32)))
-svint32_t svldnt1sb_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64base_s64)))
-svint64_t svldnt1sb_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32offset_u32)))
-svuint32_t svldnt1sb_gather_offset_u32(svbool_t, int8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u32offset_s32)))
-svint32_t svldnt1sb_gather_offset_s32(svbool_t, int8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_s64offset_u64)))
-svuint64_t svldnt1sb_gather_offset_u64(svbool_t, int8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_s64offset_s64)))
-svint64_t svldnt1sb_gather_offset_s64(svbool_t, int8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64offset_u64)))
-svuint64_t svldnt1sb_gather_offset_u64(svbool_t, int8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sb_gather_u64offset_s64)))
-svint64_t svldnt1sb_gather_offset_s64(svbool_t, int8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_index_u32)))
-svuint32_t svldnt1sh_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_index_u64)))
-svuint64_t svldnt1sh_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_index_s32)))
-svint32_t svldnt1sh_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_index_s64)))
-svint64_t svldnt1sh_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_offset_u32)))
-svuint32_t svldnt1sh_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_offset_u64)))
-svuint64_t svldnt1sh_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_offset_s32)))
-svint32_t svldnt1sh_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_offset_s64)))
-svint64_t svldnt1sh_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_u32)))
-svuint32_t svldnt1sh_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_u64)))
-svuint64_t svldnt1sh_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32base_s32)))
-svint32_t svldnt1sh_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64base_s64)))
-svint64_t svldnt1sh_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64index_u64)))
-svuint64_t svldnt1sh_gather_index_u64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64index_s64)))
-svint64_t svldnt1sh_gather_index_s64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64index_u64)))
-svuint64_t svldnt1sh_gather_index_u64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64index_s64)))
-svint64_t svldnt1sh_gather_index_s64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32offset_u32)))
-svuint32_t svldnt1sh_gather_offset_u32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u32offset_s32)))
-svint32_t svldnt1sh_gather_offset_s32(svbool_t, int16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64offset_u64)))
-svuint64_t svldnt1sh_gather_offset_u64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_s64offset_s64)))
-svint64_t svldnt1sh_gather_offset_s64(svbool_t, int16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64offset_u64)))
-svuint64_t svldnt1sh_gather_offset_u64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sh_gather_u64offset_s64)))
-svint64_t svldnt1sh_gather_offset_s64(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_index_u64)))
-svuint64_t svldnt1sw_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_index_s64)))
-svint64_t svldnt1sw_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_offset_u64)))
-svuint64_t svldnt1sw_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_offset_s64)))
-svint64_t svldnt1sw_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_u64)))
-svuint64_t svldnt1sw_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64base_s64)))
-svint64_t svldnt1sw_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64index_u64)))
-svuint64_t svldnt1sw_gather_index_u64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64index_s64)))
-svint64_t svldnt1sw_gather_index_s64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64index_u64)))
-svuint64_t svldnt1sw_gather_index_u64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64index_s64)))
-svint64_t svldnt1sw_gather_index_s64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64offset_u64)))
-svuint64_t svldnt1sw_gather_offset_u64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_s64offset_s64)))
-svint64_t svldnt1sw_gather_offset_s64(svbool_t, int32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64offset_u64)))
-svuint64_t svldnt1sw_gather_offset_u64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1sw_gather_u64offset_s64)))
-svint64_t svldnt1sw_gather_offset_s64(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_offset_u32)))
-svuint32_t svldnt1ub_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_offset_u64)))
-svuint64_t svldnt1ub_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_offset_s32)))
-svint32_t svldnt1ub_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_offset_s64)))
-svint64_t svldnt1ub_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_u32)))
-svuint32_t svldnt1ub_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_u64)))
-svuint64_t svldnt1ub_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32base_s32)))
-svint32_t svldnt1ub_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64base_s64)))
-svint64_t svldnt1ub_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32offset_u32)))
-svuint32_t svldnt1ub_gather_offset_u32(svbool_t, uint8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u32offset_s32)))
-svint32_t svldnt1ub_gather_offset_s32(svbool_t, uint8_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_s64offset_u64)))
-svuint64_t svldnt1ub_gather_offset_u64(svbool_t, uint8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_s64offset_s64)))
-svint64_t svldnt1ub_gather_offset_s64(svbool_t, uint8_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64offset_u64)))
-svuint64_t svldnt1ub_gather_offset_u64(svbool_t, uint8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1ub_gather_u64offset_s64)))
-svint64_t svldnt1ub_gather_offset_s64(svbool_t, uint8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_index_u32)))
-svuint32_t svldnt1uh_gather_index_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_index_u64)))
-svuint64_t svldnt1uh_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_index_s32)))
-svint32_t svldnt1uh_gather_index_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_index_s64)))
-svint64_t svldnt1uh_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_offset_u32)))
-svuint32_t svldnt1uh_gather_offset_u32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_offset_u64)))
-svuint64_t svldnt1uh_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_offset_s32)))
-svint32_t svldnt1uh_gather_offset_s32(svbool_t, svuint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_offset_s64)))
-svint64_t svldnt1uh_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_u32)))
-svuint32_t svldnt1uh_gather_u32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_u64)))
-svuint64_t svldnt1uh_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32base_s32)))
-svint32_t svldnt1uh_gather_s32(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64base_s64)))
-svint64_t svldnt1uh_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64index_u64)))
-svuint64_t svldnt1uh_gather_index_u64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64index_s64)))
-svint64_t svldnt1uh_gather_index_s64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64index_u64)))
-svuint64_t svldnt1uh_gather_index_u64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64index_s64)))
-svint64_t svldnt1uh_gather_index_s64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32offset_u32)))
-svuint32_t svldnt1uh_gather_offset_u32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u32offset_s32)))
-svint32_t svldnt1uh_gather_offset_s32(svbool_t, uint16_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64offset_u64)))
-svuint64_t svldnt1uh_gather_offset_u64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_s64offset_s64)))
-svint64_t svldnt1uh_gather_offset_s64(svbool_t, uint16_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64offset_u64)))
-svuint64_t svldnt1uh_gather_offset_u64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uh_gather_u64offset_s64)))
-svint64_t svldnt1uh_gather_offset_s64(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_index_u64)))
-svuint64_t svldnt1uw_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_index_s64)))
-svint64_t svldnt1uw_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_offset_u64)))
-svuint64_t svldnt1uw_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_offset_s64)))
-svint64_t svldnt1uw_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_u64)))
-svuint64_t svldnt1uw_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64base_s64)))
-svint64_t svldnt1uw_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64index_u64)))
-svuint64_t svldnt1uw_gather_index_u64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64index_s64)))
-svint64_t svldnt1uw_gather_index_s64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64index_u64)))
-svuint64_t svldnt1uw_gather_index_u64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64index_s64)))
-svint64_t svldnt1uw_gather_index_s64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64offset_u64)))
-svuint64_t svldnt1uw_gather_offset_u64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_s64offset_s64)))
-svint64_t svldnt1uw_gather_offset_s64(svbool_t, uint32_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64offset_u64)))
-svuint64_t svldnt1uw_gather_offset_u64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1uw_gather_u64offset_s64)))
-svint64_t svldnt1uw_gather_offset_s64(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_u8)))
-svbool_t svmatch(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_u16)))
-svbool_t svmatch(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_s8)))
-svbool_t svmatch(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmatch_s16)))
-svbool_t svmatch(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_u8)))
-svbool_t svnmatch(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_u16)))
-svbool_t svnmatch(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_s8)))
-svbool_t svnmatch(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmatch_s16)))
-svbool_t svnmatch(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_index_u32)))
-void svstnt1_scatter_index(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_index_u64)))
-void svstnt1_scatter_index(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_index_f64)))
-void svstnt1_scatter_index(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_index_f32)))
-void svstnt1_scatter_index(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_index_s32)))
-void svstnt1_scatter_index(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_index_s64)))
-void svstnt1_scatter_index(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_offset_u32)))
-void svstnt1_scatter_offset(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_offset_u64)))
-void svstnt1_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_offset_f64)))
-void svstnt1_scatter_offset(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_offset_f32)))
-void svstnt1_scatter_offset(svbool_t, svuint32_t, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_offset_s32)))
-void svstnt1_scatter_offset(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_offset_s64)))
-void svstnt1_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_u32)))
-void svstnt1_scatter(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_u64)))
-void svstnt1_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_f64)))
-void svstnt1_scatter(svbool_t, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_f32)))
-void svstnt1_scatter(svbool_t, svuint32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32base_s32)))
-void svstnt1_scatter(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64base_s64)))
-void svstnt1_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64index_u64)))
-void svstnt1_scatter_index(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64index_f64)))
-void svstnt1_scatter_index(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64index_s64)))
-void svstnt1_scatter_index(svbool_t, int64_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64index_u64)))
-void svstnt1_scatter_index(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64index_f64)))
-void svstnt1_scatter_index(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64index_s64)))
-void svstnt1_scatter_index(svbool_t, int64_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32offset_u32)))
-void svstnt1_scatter_offset(svbool_t, uint32_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32offset_f32)))
-void svstnt1_scatter_offset(svbool_t, float32_t *, svuint32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u32offset_s32)))
-void svstnt1_scatter_offset(svbool_t, int32_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64offset_u64)))
-void svstnt1_scatter_offset(svbool_t, uint64_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64offset_f64)))
-void svstnt1_scatter_offset(svbool_t, float64_t *, svint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_s64offset_s64)))
-void svstnt1_scatter_offset(svbool_t, int64_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64offset_u64)))
-void svstnt1_scatter_offset(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64offset_f64)))
-void svstnt1_scatter_offset(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_scatter_u64offset_s64)))
-void svstnt1_scatter_offset(svbool_t, int64_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_offset_u32)))
-void svstnt1b_scatter_offset(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_offset_u64)))
-void svstnt1b_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_offset_s32)))
-void svstnt1b_scatter_offset(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_offset_s64)))
-void svstnt1b_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_u32)))
-void svstnt1b_scatter(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_u64)))
-void svstnt1b_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32base_s32)))
-void svstnt1b_scatter(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64base_s64)))
-void svstnt1b_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32offset_s32)))
-void svstnt1b_scatter_offset(svbool_t, int8_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u32offset_u32)))
-void svstnt1b_scatter_offset(svbool_t, uint8_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_s64offset_s64)))
-void svstnt1b_scatter_offset(svbool_t, int8_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_s64offset_u64)))
-void svstnt1b_scatter_offset(svbool_t, uint8_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64offset_s64)))
-void svstnt1b_scatter_offset(svbool_t, int8_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1b_scatter_u64offset_u64)))
-void svstnt1b_scatter_offset(svbool_t, uint8_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_index_u32)))
-void svstnt1h_scatter_index(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_index_u64)))
-void svstnt1h_scatter_index(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_index_s32)))
-void svstnt1h_scatter_index(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_index_s64)))
-void svstnt1h_scatter_index(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_offset_u32)))
-void svstnt1h_scatter_offset(svbool_t, svuint32_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_offset_u64)))
-void svstnt1h_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_offset_s32)))
-void svstnt1h_scatter_offset(svbool_t, svuint32_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_offset_s64)))
-void svstnt1h_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_u32)))
-void svstnt1h_scatter(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_u64)))
-void svstnt1h_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32base_s32)))
-void svstnt1h_scatter(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64base_s64)))
-void svstnt1h_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64index_s64)))
-void svstnt1h_scatter_index(svbool_t, int16_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64index_u64)))
-void svstnt1h_scatter_index(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64index_s64)))
-void svstnt1h_scatter_index(svbool_t, int16_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64index_u64)))
-void svstnt1h_scatter_index(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32offset_s32)))
-void svstnt1h_scatter_offset(svbool_t, int16_t *, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u32offset_u32)))
-void svstnt1h_scatter_offset(svbool_t, uint16_t *, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64offset_s64)))
-void svstnt1h_scatter_offset(svbool_t, int16_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_s64offset_u64)))
-void svstnt1h_scatter_offset(svbool_t, uint16_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64offset_s64)))
-void svstnt1h_scatter_offset(svbool_t, int16_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1h_scatter_u64offset_u64)))
-void svstnt1h_scatter_offset(svbool_t, uint16_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_index_u64)))
-void svstnt1w_scatter_index(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_index_s64)))
-void svstnt1w_scatter_index(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_offset_u64)))
-void svstnt1w_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_offset_s64)))
-void svstnt1w_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_u64)))
-void svstnt1w_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64base_s64)))
-void svstnt1w_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64index_s64)))
-void svstnt1w_scatter_index(svbool_t, int32_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64index_u64)))
-void svstnt1w_scatter_index(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64index_s64)))
-void svstnt1w_scatter_index(svbool_t, int32_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64index_u64)))
-void svstnt1w_scatter_index(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64offset_s64)))
-void svstnt1w_scatter_offset(svbool_t, int32_t *, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_s64offset_u64)))
-void svstnt1w_scatter_offset(svbool_t, uint32_t *, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64offset_s64)))
-void svstnt1w_scatter_offset(svbool_t, int32_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64offset_u64)))
-void svstnt1w_scatter_offset(svbool_t, uint32_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_m)))
-svbfloat16_t svadd_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_x)))
-svbfloat16_t svadd_n_bf16_x(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_z)))
-svbfloat16_t svadd_n_bf16_z(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_bf16_m)))
-svbfloat16_t svadd_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_bf16_x)))
-svbfloat16_t svadd_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_bf16_z)))
-svbfloat16_t svadd_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_bf16)))
-svbfloat16_t svclamp_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_bf16_m)))
-svbfloat16_t svmax_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_bf16_x)))
-svbfloat16_t svmax_n_bf16_x(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_bf16_z)))
-svbfloat16_t svmax_n_bf16_z(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_m)))
-svbfloat16_t svmax_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_x)))
-svbfloat16_t svmax_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_z)))
-svbfloat16_t svmax_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_bf16_m)))
-svbfloat16_t svmaxnm_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_bf16_x)))
-svbfloat16_t svmaxnm_n_bf16_x(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_bf16_z)))
-svbfloat16_t svmaxnm_n_bf16_z(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_m)))
-svbfloat16_t svmaxnm_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_x)))
-svbfloat16_t svmaxnm_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_z)))
-svbfloat16_t svmaxnm_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_bf16_m)))
-svbfloat16_t svmin_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_bf16_x)))
-svbfloat16_t svmin_n_bf16_x(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_bf16_z)))
-svbfloat16_t svmin_n_bf16_z(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_m)))
-svbfloat16_t svmin_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_x)))
-svbfloat16_t svmin_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_z)))
-svbfloat16_t svmin_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_bf16_m)))
-svbfloat16_t svminnm_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_bf16_x)))
-svbfloat16_t svminnm_n_bf16_x(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_bf16_z)))
-svbfloat16_t svminnm_n_bf16_z(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_m)))
-svbfloat16_t svminnm_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x)))
-svbfloat16_t svminnm_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_z)))
-svbfloat16_t svminnm_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_bf16_m)))
-svbfloat16_t svmla_n_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_bf16_x)))
-svbfloat16_t svmla_n_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_bf16_z)))
-svbfloat16_t svmla_n_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_bf16_m)))
-svbfloat16_t svmla_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_bf16_x)))
-svbfloat16_t svmla_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_bf16_z)))
-svbfloat16_t svmla_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_bf16)))
-svbfloat16_t svmla_lane_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_bf16_m)))
-svbfloat16_t svmls_n_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_bf16_x)))
-svbfloat16_t svmls_n_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_bf16_z)))
-svbfloat16_t svmls_n_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_bf16_m)))
-svbfloat16_t svmls_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_bf16_x)))
-svbfloat16_t svmls_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_bf16_z)))
-svbfloat16_t svmls_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_bf16)))
-svbfloat16_t svmls_lane_bf16(svbfloat16_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_bf16_m)))
-svbfloat16_t svmul_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_bf16_x)))
-svbfloat16_t svmul_n_bf16_x(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_bf16_z)))
-svbfloat16_t svmul_n_bf16_z(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_bf16_m)))
-svbfloat16_t svmul_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_bf16_x)))
-svbfloat16_t svmul_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_bf16_z)))
-svbfloat16_t svmul_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_bf16)))
-svbfloat16_t svmul_lane_bf16(svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_bf16_m)))
-svbfloat16_t svsub_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_bf16_x)))
-svbfloat16_t svsub_n_bf16_x(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_bf16_z)))
-svbfloat16_t svsub_n_bf16_z(svbool_t, svbfloat16_t, bfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_m)))
-svbfloat16_t svsub_bf16_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_x)))
-svbfloat16_t svsub_bf16_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_z)))
-svbfloat16_t svsub_bf16_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_m)))
-svbfloat16_t svadd_m(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_x)))
-svbfloat16_t svadd_x(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_z)))
-svbfloat16_t svadd_z(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_bf16_m)))
-svbfloat16_t svadd_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_bf16_x)))
-svbfloat16_t svadd_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_bf16_z)))
-svbfloat16_t svadd_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_bf16)))
-svbfloat16_t svclamp(svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_bf16_m)))
-svbfloat16_t svmax_m(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_bf16_x)))
-svbfloat16_t svmax_x(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_bf16_z)))
-svbfloat16_t svmax_z(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_m)))
-svbfloat16_t svmax_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_x)))
-svbfloat16_t svmax_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_bf16_z)))
-svbfloat16_t svmax_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_bf16_m)))
-svbfloat16_t svmaxnm_m(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_bf16_x)))
-svbfloat16_t svmaxnm_x(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_bf16_z)))
-svbfloat16_t svmaxnm_z(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_m)))
-svbfloat16_t svmaxnm_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_x)))
-svbfloat16_t svmaxnm_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_bf16_z)))
-svbfloat16_t svmaxnm_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_bf16_m)))
-svbfloat16_t svmin_m(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_bf16_x)))
-svbfloat16_t svmin_x(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_bf16_z)))
-svbfloat16_t svmin_z(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_m)))
-svbfloat16_t svmin_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_x)))
-svbfloat16_t svmin_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_bf16_z)))
-svbfloat16_t svmin_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_bf16_m)))
-svbfloat16_t svminnm_m(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_bf16_x)))
-svbfloat16_t svminnm_x(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_bf16_z)))
-svbfloat16_t svminnm_z(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_m)))
-svbfloat16_t svminnm_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x)))
-svbfloat16_t svminnm_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_z)))
-svbfloat16_t svminnm_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_bf16_m)))
-svbfloat16_t svmla_m(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_bf16_x)))
-svbfloat16_t svmla_x(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_bf16_z)))
-svbfloat16_t svmla_z(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_bf16_m)))
-svbfloat16_t svmla_m(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_bf16_x)))
-svbfloat16_t svmla_x(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_bf16_z)))
-svbfloat16_t svmla_z(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_bf16)))
-svbfloat16_t svmla_lane(svbfloat16_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_bf16_m)))
-svbfloat16_t svmls_m(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_bf16_x)))
-svbfloat16_t svmls_x(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_bf16_z)))
-svbfloat16_t svmls_z(svbool_t, svbfloat16_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_bf16_m)))
-svbfloat16_t svmls_m(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_bf16_x)))
-svbfloat16_t svmls_x(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_bf16_z)))
-svbfloat16_t svmls_z(svbool_t, svbfloat16_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_bf16)))
-svbfloat16_t svmls_lane(svbfloat16_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_bf16_m)))
-svbfloat16_t svmul_m(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_bf16_x)))
-svbfloat16_t svmul_x(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_bf16_z)))
-svbfloat16_t svmul_z(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_bf16_m)))
-svbfloat16_t svmul_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_bf16_x)))
-svbfloat16_t svmul_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_bf16_z)))
-svbfloat16_t svmul_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_bf16)))
-svbfloat16_t svmul_lane(svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_bf16_m)))
-svbfloat16_t svsub_m(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_bf16_x)))
-svbfloat16_t svsub_x(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_bf16_z)))
-svbfloat16_t svsub_z(svbool_t, svbfloat16_t, bfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_m)))
-svbfloat16_t svsub_m(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_x)))
-svbfloat16_t svsub_x(svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_z)))
-svbfloat16_t svsub_z(svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16)))
-svbfloat16_t svtbl2_bf16(svbfloat16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16)))
-svbfloat16_t svtbx_bf16(svbfloat16_t, svbfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16)))
-svbool_t svwhilerw_bf16(bfloat16_t const *, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16)))
-svbool_t svwhilewr_bf16(bfloat16_t const *, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16)))
-svbfloat16_t svtbl2(svbfloat16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16)))
-svbfloat16_t svtbx(svbfloat16_t, svbfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16)))
-svbool_t svwhilerw(bfloat16_t const *, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16)))
-svbool_t svwhilewr(bfloat16_t const *, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesd_u8)))
-svuint8_t svaesd_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaese_u8)))
-svuint8_t svaese_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesimc_u8)))
-svuint8_t svaesimc_u8(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesmc_u8)))
-svuint8_t svaesmc_u8(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u64)))
-svuint64_t svpmullb_pair_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u64)))
-svuint64_t svpmullb_pair_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u64)))
-svuint64_t svpmullt_pair_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u64)))
-svuint64_t svpmullt_pair_u64(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesd_u8)))
-svuint8_t svaesd(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaese_u8)))
-svuint8_t svaese(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesimc_u8)))
-svuint8_t svaesimc(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesmc_u8)))
-svuint8_t svaesmc(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u64)))
-svuint64_t svpmullb_pair(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u64)))
-svuint64_t svpmullb_pair(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u64)))
-svuint64_t svpmullt_pair(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u64)))
-svuint64_t svpmullt_pair(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u8)))
-svuint8_t svbdep_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u32)))
-svuint32_t svbdep_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u64)))
-svuint64_t svbdep_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u16)))
-svuint16_t svbdep_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u8)))
-svuint8_t svbdep_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u32)))
-svuint32_t svbdep_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u64)))
-svuint64_t svbdep_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u16)))
-svuint16_t svbdep_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u8)))
-svuint8_t svbext_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u32)))
-svuint32_t svbext_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u64)))
-svuint64_t svbext_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u16)))
-svuint16_t svbext_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u8)))
-svuint8_t svbext_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u32)))
-svuint32_t svbext_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u64)))
-svuint64_t svbext_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u16)))
-svuint16_t svbext_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u8)))
-svuint8_t svbgrp_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u32)))
-svuint32_t svbgrp_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u64)))
-svuint64_t svbgrp_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u16)))
-svuint16_t svbgrp_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u8)))
-svuint8_t svbgrp_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u32)))
-svuint32_t svbgrp_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u64)))
-svuint64_t svbgrp_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u16)))
-svuint16_t svbgrp_u16(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u8)))
-svuint8_t svbdep(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u32)))
-svuint32_t svbdep(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u64)))
-svuint64_t svbdep(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u16)))
-svuint16_t svbdep(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u8)))
-svuint8_t svbdep(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u32)))
-svuint32_t svbdep(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u64)))
-svuint64_t svbdep(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_u16)))
-svuint16_t svbdep(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u8)))
-svuint8_t svbext(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u32)))
-svuint32_t svbext(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u64)))
-svuint64_t svbext(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_n_u16)))
-svuint16_t svbext(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u8)))
-svuint8_t svbext(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u32)))
-svuint32_t svbext(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u64)))
-svuint64_t svbext(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbext_u16)))
-svuint16_t svbext(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u8)))
-svuint8_t svbgrp(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u32)))
-svuint32_t svbgrp(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u64)))
-svuint64_t svbgrp(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_n_u16)))
-svuint16_t svbgrp(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u8)))
-svuint8_t svbgrp(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u32)))
-svuint32_t svbgrp(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u64)))
-svuint64_t svbgrp(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbgrp_u16)))
-svuint16_t svbgrp(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_u64)))
-svuint64_t svrax1_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_s64)))
-svint64_t svrax1_s64(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_u64)))
-svuint64_t svrax1(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrax1_s64)))
-svint64_t svrax1(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4e_u32)))
-svuint32_t svsm4e_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4ekey_u32)))
-svuint32_t svsm4ekey_u32(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4e_u32)))
-svuint32_t svsm4e(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsm4ekey_u32)))
-svuint32_t svsm4ekey(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u8)))
-uint8x16_t svaddqv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u32)))
-uint32x4_t svaddqv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u64)))
-uint64x2_t svaddqv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u16)))
-uint16x8_t svaddqv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s8)))
-int8x16_t svaddqv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s32)))
-int32x4_t svaddqv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s64)))
-int64x2_t svaddqv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s16)))
-int16x8_t svaddqv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_f64)))
-float64x2_t svaddqv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_f32)))
-float32x4_t svaddqv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_f16)))
-float16x8_t svaddqv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u8)))
-uint8x16_t svandqv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u32)))
-uint32x4_t svandqv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u64)))
-uint64x2_t svandqv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u16)))
-uint16x8_t svandqv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s8)))
-int8x16_t svandqv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s32)))
-int32x4_t svandqv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s64)))
-int64x2_t svandqv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s16)))
-int16x8_t svandqv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u8)))
-uint8x16_t sveorqv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u32)))
-uint32x4_t sveorqv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u64)))
-uint64x2_t sveorqv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u16)))
-uint16x8_t sveorqv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s8)))
-int8x16_t sveorqv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s32)))
-int32x4_t sveorqv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s64)))
-int64x2_t sveorqv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s16)))
-int16x8_t sveorqv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u8)))
-svuint8_t svextq_u8(svuint8_t, svuint8_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u32)))
-svuint32_t svextq_u32(svuint32_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u64)))
-svuint64_t svextq_u64(svuint64_t, svuint64_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u16)))
-svuint16_t svextq_u16(svuint16_t, svuint16_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_bf16)))
-svbfloat16_t svextq_bf16(svbfloat16_t, svbfloat16_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s8)))
-svint8_t svextq_s8(svint8_t, svint8_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_f64)))
-svfloat64_t svextq_f64(svfloat64_t, svfloat64_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_f32)))
-svfloat32_t svextq_f32(svfloat32_t, svfloat32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_f16)))
-svfloat16_t svextq_f16(svfloat16_t, svfloat16_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s32)))
-svint32_t svextq_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s64)))
-svint64_t svextq_s64(svint64_t, svint64_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s16)))
-svint16_t svextq_s16(svint16_t, svint16_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u32)))
-svuint32_t svld1q_gather_u64base_index_u32(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u64)))
-svuint64_t svld1q_gather_u64base_index_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u16)))
-svuint16_t svld1q_gather_u64base_index_u16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_bf16)))
-svbfloat16_t svld1q_gather_u64base_index_bf16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_f64)))
-svfloat64_t svld1q_gather_u64base_index_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_f32)))
-svfloat32_t svld1q_gather_u64base_index_f32(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_f16)))
-svfloat16_t svld1q_gather_u64base_index_f16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_s32)))
-svint32_t svld1q_gather_u64base_index_s32(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_s64)))
-svint64_t svld1q_gather_u64base_index_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_s16)))
-svint16_t svld1q_gather_u64base_index_s16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u8)))
-svuint8_t svld1q_gather_u64base_offset_u8(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u32)))
-svuint32_t svld1q_gather_u64base_offset_u32(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u64)))
-svuint64_t svld1q_gather_u64base_offset_u64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u16)))
-svuint16_t svld1q_gather_u64base_offset_u16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_bf16)))
-svbfloat16_t svld1q_gather_u64base_offset_bf16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s8)))
-svint8_t svld1q_gather_u64base_offset_s8(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_f64)))
-svfloat64_t svld1q_gather_u64base_offset_f64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_f32)))
-svfloat32_t svld1q_gather_u64base_offset_f32(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_f16)))
-svfloat16_t svld1q_gather_u64base_offset_f16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s32)))
-svint32_t svld1q_gather_u64base_offset_s32(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s64)))
-svint64_t svld1q_gather_u64base_offset_s64(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s16)))
-svint16_t svld1q_gather_u64base_offset_s16(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u8)))
-svuint8_t svld1q_gather_u64base_u8(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u32)))
-svuint32_t svld1q_gather_u64base_u32(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u64)))
-svuint64_t svld1q_gather_u64base_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u16)))
-svuint16_t svld1q_gather_u64base_u16(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_bf16)))
-svbfloat16_t svld1q_gather_u64base_bf16(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s8)))
-svint8_t svld1q_gather_u64base_s8(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_f64)))
-svfloat64_t svld1q_gather_u64base_f64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_f32)))
-svfloat32_t svld1q_gather_u64base_f32(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_f16)))
-svfloat16_t svld1q_gather_u64base_f16(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s32)))
-svint32_t svld1q_gather_u64base_s32(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s64)))
-svint64_t svld1q_gather_u64base_s64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s16)))
-svint16_t svld1q_gather_u64base_s16(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_u32)))
-svuint32_t svld1q_gather_u64index_u32(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_u64)))
-svuint64_t svld1q_gather_u64index_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_u16)))
-svuint16_t svld1q_gather_u64index_u16(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_bf16)))
-svbfloat16_t svld1q_gather_u64index_bf16(svbool_t, bfloat16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_f64)))
-svfloat64_t svld1q_gather_u64index_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_f32)))
-svfloat32_t svld1q_gather_u64index_f32(svbool_t, float32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_f16)))
-svfloat16_t svld1q_gather_u64index_f16(svbool_t, float16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_s32)))
-svint32_t svld1q_gather_u64index_s32(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_s64)))
-svint64_t svld1q_gather_u64index_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_s16)))
-svint16_t svld1q_gather_u64index_s16(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u8)))
-svuint8_t svld1q_gather_u64offset_u8(svbool_t, uint8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u32)))
-svuint32_t svld1q_gather_u64offset_u32(svbool_t, uint32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u64)))
-svuint64_t svld1q_gather_u64offset_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u16)))
-svuint16_t svld1q_gather_u64offset_u16(svbool_t, uint16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_bf16)))
-svbfloat16_t svld1q_gather_u64offset_bf16(svbool_t, bfloat16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s8)))
-svint8_t svld1q_gather_u64offset_s8(svbool_t, int8_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_f64)))
-svfloat64_t svld1q_gather_u64offset_f64(svbool_t, float64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_f32)))
-svfloat32_t svld1q_gather_u64offset_f32(svbool_t, float32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_f16)))
-svfloat16_t svld1q_gather_u64offset_f16(svbool_t, float16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s32)))
-svint32_t svld1q_gather_u64offset_s32(svbool_t, int32_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s64)))
-svint64_t svld1q_gather_u64offset_s64(svbool_t, int64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s16)))
-svint16_t svld1q_gather_u64offset_s16(svbool_t, int16_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_u64)))
-svuint64_t svld1udq_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_f64)))
-svfloat64_t svld1udq_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_s64)))
-svint64_t svld1udq_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_vnum_u64)))
-svuint64_t svld1udq_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_vnum_f64)))
-svfloat64_t svld1udq_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_vnum_s64)))
-svint64_t svld1udq_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_u32)))
-svuint32_t svld1uwq_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_f32)))
-svfloat32_t svld1uwq_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_s32)))
-svint32_t svld1uwq_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_vnum_u32)))
-svuint32_t svld1uwq_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_vnum_f32)))
-svfloat32_t svld1uwq_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_vnum_s32)))
-svint32_t svld1uwq_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u8)))
-svuint8x2_t svld2q_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u32)))
-svuint32x2_t svld2q_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u64)))
-svuint64x2_t svld2q_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u16)))
-svuint16x2_t svld2q_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s8)))
-svint8x2_t svld2q_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_f64)))
-svfloat64x2_t svld2q_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_f32)))
-svfloat32x2_t svld2q_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_f16)))
-svfloat16x2_t svld2q_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s32)))
-svint32x2_t svld2q_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s64)))
-svint64x2_t svld2q_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s16)))
-svint16x2_t svld2q_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_bf16)))
-svbfloat16x2_t svld2q_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u8)))
-svuint8x2_t svld2q_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u32)))
-svuint32x2_t svld2q_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u64)))
-svuint64x2_t svld2q_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u16)))
-svuint16x2_t svld2q_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s8)))
-svint8x2_t svld2q_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_f64)))
-svfloat64x2_t svld2q_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_f32)))
-svfloat32x2_t svld2q_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_f16)))
-svfloat16x2_t svld2q_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s32)))
-svint32x2_t svld2q_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s64)))
-svint64x2_t svld2q_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s16)))
-svint16x2_t svld2q_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_bf16)))
-svbfloat16x2_t svld2q_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u8)))
-svuint8x3_t svld3q_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u32)))
-svuint32x3_t svld3q_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u64)))
-svuint64x3_t svld3q_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u16)))
-svuint16x3_t svld3q_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s8)))
-svint8x3_t svld3q_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_f64)))
-svfloat64x3_t svld3q_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_f32)))
-svfloat32x3_t svld3q_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_f16)))
-svfloat16x3_t svld3q_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s32)))
-svint32x3_t svld3q_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s64)))
-svint64x3_t svld3q_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s16)))
-svint16x3_t svld3q_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_bf16)))
-svbfloat16x3_t svld3q_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u8)))
-svuint8x3_t svld3q_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u32)))
-svuint32x3_t svld3q_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u64)))
-svuint64x3_t svld3q_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u16)))
-svuint16x3_t svld3q_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s8)))
-svint8x3_t svld3q_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_f64)))
-svfloat64x3_t svld3q_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_f32)))
-svfloat32x3_t svld3q_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_f16)))
-svfloat16x3_t svld3q_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s32)))
-svint32x3_t svld3q_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s64)))
-svint64x3_t svld3q_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s16)))
-svint16x3_t svld3q_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_bf16)))
-svbfloat16x3_t svld3q_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u8)))
-svuint8x4_t svld4q_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u32)))
-svuint32x4_t svld4q_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u64)))
-svuint64x4_t svld4q_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u16)))
-svuint16x4_t svld4q_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s8)))
-svint8x4_t svld4q_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_f64)))
-svfloat64x4_t svld4q_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_f32)))
-svfloat32x4_t svld4q_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_f16)))
-svfloat16x4_t svld4q_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s32)))
-svint32x4_t svld4q_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s64)))
-svint64x4_t svld4q_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s16)))
-svint16x4_t svld4q_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_bf16)))
-svbfloat16x4_t svld4q_bf16(svbool_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u8)))
-svuint8x4_t svld4q_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u32)))
-svuint32x4_t svld4q_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u64)))
-svuint64x4_t svld4q_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u16)))
-svuint16x4_t svld4q_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s8)))
-svint8x4_t svld4q_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_f64)))
-svfloat64x4_t svld4q_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_f32)))
-svfloat32x4_t svld4q_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_f16)))
-svfloat16x4_t svld4q_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s32)))
-svint32x4_t svld4q_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s64)))
-svint64x4_t svld4q_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s16)))
-svint16x4_t svld4q_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_bf16)))
-svbfloat16x4_t svld4q_vnum_bf16(svbool_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmqv_f64)))
-float64x2_t svmaxnmqv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmqv_f32)))
-float32x4_t svmaxnmqv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmqv_f16)))
-float16x8_t svmaxnmqv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_f64)))
-float64x2_t svmaxqv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_f32)))
-float32x4_t svmaxqv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_f16)))
-float16x8_t svmaxqv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s8)))
-int8x16_t svmaxqv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s32)))
-int32x4_t svmaxqv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s64)))
-int64x2_t svmaxqv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s16)))
-int16x8_t svmaxqv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u8)))
-uint8x16_t svmaxqv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u32)))
-uint32x4_t svmaxqv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u64)))
-uint64x2_t svmaxqv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u16)))
-uint16x8_t svmaxqv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmqv_f64)))
-float64x2_t svminnmqv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmqv_f32)))
-float32x4_t svminnmqv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmqv_f16)))
-float16x8_t svminnmqv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_f64)))
-float64x2_t svminqv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_f32)))
-float32x4_t svminqv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_f16)))
-float16x8_t svminqv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s8)))
-int8x16_t svminqv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s32)))
-int32x4_t svminqv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s64)))
-int64x2_t svminqv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s16)))
-int16x8_t svminqv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u8)))
-uint8x16_t svminqv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u32)))
-uint32x4_t svminqv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u64)))
-uint64x2_t svminqv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u16)))
-uint16x8_t svminqv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u8)))
-uint8x16_t svorqv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u32)))
-uint32x4_t svorqv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u64)))
-uint64x2_t svorqv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u16)))
-uint16x8_t svorqv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s8)))
-int8x16_t svorqv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s32)))
-int32x4_t svorqv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s64)))
-int64x2_t svorqv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s16)))
-int16x8_t svorqv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u8)))
-svbool_t svpmov_u8(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s8)))
-svbool_t svpmov_s8(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u64)))
-svbool_t svpmov_u64(svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s64)))
-svbool_t svpmov_s64(svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u16)))
-svbool_t svpmov_u16(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s16)))
-svbool_t svpmov_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u32)))
-svbool_t svpmov_u32(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s32)))
-svbool_t svpmov_s32(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u8)))
-svbool_t svpmov_lane_u8(svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s8)))
-svbool_t svpmov_lane_s8(svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u64)))
-svbool_t svpmov_lane_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s64)))
-svbool_t svpmov_lane_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u16)))
-svbool_t svpmov_lane_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s16)))
-svbool_t svpmov_lane_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u32)))
-svbool_t svpmov_lane_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s32)))
-svbool_t svpmov_lane_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u64_m)))
-svuint64_t svpmov_lane_u64_m(svuint64_t, svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s64_m)))
-svint64_t svpmov_lane_s64_m(svint64_t, svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u16_m)))
-svuint16_t svpmov_lane_u16_m(svuint16_t, svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s16_m)))
-svint16_t svpmov_lane_s16_m(svint16_t, svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u32_m)))
-svuint32_t svpmov_lane_u32_m(svuint32_t, svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s32_m)))
-svint32_t svpmov_lane_s32_m(svint32_t, svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u8_z)))
-svuint8_t svpmov_u8_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s8_z)))
-svint8_t svpmov_s8_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u64_z)))
-svuint64_t svpmov_u64_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s64_z)))
-svint64_t svpmov_s64_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u16_z)))
-svuint16_t svpmov_u16_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s16_z)))
-svint16_t svpmov_s16_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u32_z)))
-svuint32_t svpmov_u32_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s32_z)))
-svint32_t svpmov_s32_z(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_u64)))
-void svst1dq_u64(svbool_t, uint64_t const *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_f64)))
-void svst1dq_f64(svbool_t, float64_t const *, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_s64)))
-void svst1dq_s64(svbool_t, int64_t const *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_u64)))
-void svst1dq_vnum_u64(svbool_t, uint64_t const *, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_f64)))
-void svst1dq_vnum_f64(svbool_t, float64_t const *, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_s64)))
-void svst1dq_vnum_s64(svbool_t, int64_t const *, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u8)))
-void svst1q_scatter_u64base_u8(svbool_t, svuint64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u32)))
-void svst1q_scatter_u64base_u32(svbool_t, svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u64)))
-void svst1q_scatter_u64base_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u16)))
-void svst1q_scatter_u64base_u16(svbool_t, svuint64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_bf16)))
-void svst1q_scatter_u64base_bf16(svbool_t, svuint64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s8)))
-void svst1q_scatter_u64base_s8(svbool_t, svuint64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_f64)))
-void svst1q_scatter_u64base_f64(svbool_t, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_f32)))
-void svst1q_scatter_u64base_f32(svbool_t, svuint64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_f16)))
-void svst1q_scatter_u64base_f16(svbool_t, svuint64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s32)))
-void svst1q_scatter_u64base_s32(svbool_t, svuint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s64)))
-void svst1q_scatter_u64base_s64(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s16)))
-void svst1q_scatter_u64base_s16(svbool_t, svuint64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_u32)))
-void svst1q_scatter_u64base_index_u32(svbool_t, svuint64_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_u64)))
-void svst1q_scatter_u64base_index_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_u16)))
-void svst1q_scatter_u64base_index_u16(svbool_t, svuint64_t, int64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_bf16)))
-void svst1q_scatter_u64base_index_bf16(svbool_t, svuint64_t, int64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_f64)))
-void svst1q_scatter_u64base_index_f64(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_f32)))
-void svst1q_scatter_u64base_index_f32(svbool_t, svuint64_t, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_f16)))
-void svst1q_scatter_u64base_index_f16(svbool_t, svuint64_t, int64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_s32)))
-void svst1q_scatter_u64base_index_s32(svbool_t, svuint64_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_s64)))
-void svst1q_scatter_u64base_index_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_s16)))
-void svst1q_scatter_u64base_index_s16(svbool_t, svuint64_t, int64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u8)))
-void svst1q_scatter_u64base_offset_u8(svbool_t, svuint64_t, int64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u32)))
-void svst1q_scatter_u64base_offset_u32(svbool_t, svuint64_t, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u64)))
-void svst1q_scatter_u64base_offset_u64(svbool_t, svuint64_t, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u16)))
-void svst1q_scatter_u64base_offset_u16(svbool_t, svuint64_t, int64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_bf16)))
-void svst1q_scatter_u64base_offset_bf16(svbool_t, svuint64_t, int64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s8)))
-void svst1q_scatter_u64base_offset_s8(svbool_t, svuint64_t, int64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_f64)))
-void svst1q_scatter_u64base_offset_f64(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_f32)))
-void svst1q_scatter_u64base_offset_f32(svbool_t, svuint64_t, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_f16)))
-void svst1q_scatter_u64base_offset_f16(svbool_t, svuint64_t, int64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s32)))
-void svst1q_scatter_u64base_offset_s32(svbool_t, svuint64_t, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s64)))
-void svst1q_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s16)))
-void svst1q_scatter_u64base_offset_s16(svbool_t, svuint64_t, int64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u32)))
-void svst1q_scatter_u64index_u32(svbool_t, uint32_t *, svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u64)))
-void svst1q_scatter_u64index_u64(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u16)))
-void svst1q_scatter_u64index_u16(svbool_t, uint16_t *, svuint64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_bf16)))
-void svst1q_scatter_u64index_bf16(svbool_t, bfloat16_t *, svuint64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_f64)))
-void svst1q_scatter_u64index_f64(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_f32)))
-void svst1q_scatter_u64index_f32(svbool_t, float32_t *, svuint64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_f16)))
-void svst1q_scatter_u64index_f16(svbool_t, float16_t *, svuint64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s32)))
-void svst1q_scatter_u64index_s32(svbool_t, int32_t *, svuint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s64)))
-void svst1q_scatter_u64index_s64(svbool_t, int64_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s16)))
-void svst1q_scatter_u64index_s16(svbool_t, int16_t *, svuint64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u8)))
-void svst1q_scatter_u64offset_u8(svbool_t, uint8_t *, svuint64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u32)))
-void svst1q_scatter_u64offset_u32(svbool_t, uint32_t *, svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u64)))
-void svst1q_scatter_u64offset_u64(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u16)))
-void svst1q_scatter_u64offset_u16(svbool_t, uint16_t *, svuint64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_bf16)))
-void svst1q_scatter_u64offset_bf16(svbool_t, bfloat16_t *, svuint64_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s8)))
-void svst1q_scatter_u64offset_s8(svbool_t, int8_t *, svuint64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_f64)))
-void svst1q_scatter_u64offset_f64(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_f32)))
-void svst1q_scatter_u64offset_f32(svbool_t, float32_t *, svuint64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_f16)))
-void svst1q_scatter_u64offset_f16(svbool_t, float16_t *, svuint64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s32)))
-void svst1q_scatter_u64offset_s32(svbool_t, int32_t *, svuint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s64)))
-void svst1q_scatter_u64offset_s64(svbool_t, int64_t *, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s16)))
-void svst1q_scatter_u64offset_s16(svbool_t, int16_t *, svuint64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_u32)))
-void svst1wq_u32(svbool_t, uint32_t const *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_f32)))
-void svst1wq_f32(svbool_t, float32_t const *, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_s32)))
-void svst1wq_s32(svbool_t, int32_t const *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_u32)))
-void svst1wq_vnum_u32(svbool_t, uint32_t const *, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_f32)))
-void svst1wq_vnum_f32(svbool_t, float32_t const *, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_s32)))
-void svst1wq_vnum_s32(svbool_t, int32_t const *, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u8)))
-void svst2q_u8(svbool_t, uint8_t const *, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u32)))
-void svst2q_u32(svbool_t, uint32_t const *, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u64)))
-void svst2q_u64(svbool_t, uint64_t const *, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u16)))
-void svst2q_u16(svbool_t, uint16_t const *, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s8)))
-void svst2q_s8(svbool_t, int8_t const *, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_f64)))
-void svst2q_f64(svbool_t, float64_t const *, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_f32)))
-void svst2q_f32(svbool_t, float32_t const *, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_f16)))
-void svst2q_f16(svbool_t, float16_t const *, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s32)))
-void svst2q_s32(svbool_t, int32_t const *, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s64)))
-void svst2q_s64(svbool_t, int64_t const *, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s16)))
-void svst2q_s16(svbool_t, int16_t const *, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_bf16)))
-void svst2q_bf16(svbool_t, bfloat16_t const *, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u8)))
-void svst2q_vnum_u8(svbool_t, uint8_t const *, int64_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u32)))
-void svst2q_vnum_u32(svbool_t, uint32_t const *, int64_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u64)))
-void svst2q_vnum_u64(svbool_t, uint64_t const *, int64_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u16)))
-void svst2q_vnum_u16(svbool_t, uint16_t const *, int64_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s8)))
-void svst2q_vnum_s8(svbool_t, int8_t const *, int64_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_f64)))
-void svst2q_vnum_f64(svbool_t, float64_t const *, int64_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_f32)))
-void svst2q_vnum_f32(svbool_t, float32_t const *, int64_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_f16)))
-void svst2q_vnum_f16(svbool_t, float16_t const *, int64_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s32)))
-void svst2q_vnum_s32(svbool_t, int32_t const *, int64_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s64)))
-void svst2q_vnum_s64(svbool_t, int64_t const *, int64_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s16)))
-void svst2q_vnum_s16(svbool_t, int16_t const *, int64_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_bf16)))
-void svst2q_vnum_bf16(svbool_t, bfloat16_t const *, int64_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u8)))
-void svst3q_u8(svbool_t, uint8_t const *, svuint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u32)))
-void svst3q_u32(svbool_t, uint32_t const *, svuint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u64)))
-void svst3q_u64(svbool_t, uint64_t const *, svuint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u16)))
-void svst3q_u16(svbool_t, uint16_t const *, svuint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s8)))
-void svst3q_s8(svbool_t, int8_t const *, svint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_f64)))
-void svst3q_f64(svbool_t, float64_t const *, svfloat64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_f32)))
-void svst3q_f32(svbool_t, float32_t const *, svfloat32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_f16)))
-void svst3q_f16(svbool_t, float16_t const *, svfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s32)))
-void svst3q_s32(svbool_t, int32_t const *, svint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s64)))
-void svst3q_s64(svbool_t, int64_t const *, svint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s16)))
-void svst3q_s16(svbool_t, int16_t const *, svint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_bf16)))
-void svst3q_bf16(svbool_t, bfloat16_t const *, svbfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u8)))
-void svst3q_vnum_u8(svbool_t, uint8_t const *, int64_t, svuint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u32)))
-void svst3q_vnum_u32(svbool_t, uint32_t const *, int64_t, svuint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u64)))
-void svst3q_vnum_u64(svbool_t, uint64_t const *, int64_t, svuint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u16)))
-void svst3q_vnum_u16(svbool_t, uint16_t const *, int64_t, svuint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s8)))
-void svst3q_vnum_s8(svbool_t, int8_t const *, int64_t, svint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_f64)))
-void svst3q_vnum_f64(svbool_t, float64_t const *, int64_t, svfloat64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_f32)))
-void svst3q_vnum_f32(svbool_t, float32_t const *, int64_t, svfloat32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_f16)))
-void svst3q_vnum_f16(svbool_t, float16_t const *, int64_t, svfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s32)))
-void svst3q_vnum_s32(svbool_t, int32_t const *, int64_t, svint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s64)))
-void svst3q_vnum_s64(svbool_t, int64_t const *, int64_t, svint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s16)))
-void svst3q_vnum_s16(svbool_t, int16_t const *, int64_t, svint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_bf16)))
-void svst3q_vnum_bf16(svbool_t, bfloat16_t const *, int64_t, svbfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u8)))
-void svst4q_u8(svbool_t, uint8_t const *, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u32)))
-void svst4q_u32(svbool_t, uint32_t const *, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u64)))
-void svst4q_u64(svbool_t, uint64_t const *, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u16)))
-void svst4q_u16(svbool_t, uint16_t const *, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s8)))
-void svst4q_s8(svbool_t, int8_t const *, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_f64)))
-void svst4q_f64(svbool_t, float64_t const *, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_f32)))
-void svst4q_f32(svbool_t, float32_t const *, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_f16)))
-void svst4q_f16(svbool_t, float16_t const *, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s32)))
-void svst4q_s32(svbool_t, int32_t const *, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s64)))
-void svst4q_s64(svbool_t, int64_t const *, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s16)))
-void svst4q_s16(svbool_t, int16_t const *, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_bf16)))
-void svst4q_bf16(svbool_t, bfloat16_t const *, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u8)))
-void svst4q_vnum_u8(svbool_t, uint8_t const *, int64_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u32)))
-void svst4q_vnum_u32(svbool_t, uint32_t const *, int64_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u64)))
-void svst4q_vnum_u64(svbool_t, uint64_t const *, int64_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u16)))
-void svst4q_vnum_u16(svbool_t, uint16_t const *, int64_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s8)))
-void svst4q_vnum_s8(svbool_t, int8_t const *, int64_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_f64)))
-void svst4q_vnum_f64(svbool_t, float64_t const *, int64_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_f32)))
-void svst4q_vnum_f32(svbool_t, float32_t const *, int64_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_f16)))
-void svst4q_vnum_f16(svbool_t, float16_t const *, int64_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s32)))
-void svst4q_vnum_s32(svbool_t, int32_t const *, int64_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s64)))
-void svst4q_vnum_s64(svbool_t, int64_t const *, int64_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s16)))
-void svst4q_vnum_s16(svbool_t, int16_t const *, int64_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_bf16)))
-void svst4q_vnum_bf16(svbool_t, bfloat16_t const *, int64_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u8)))
-svuint8_t svtblq_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u32)))
-svuint32_t svtblq_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u64)))
-svuint64_t svtblq_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u16)))
-svuint16_t svtblq_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_bf16)))
-svbfloat16_t svtblq_bf16(svbfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s8)))
-svint8_t svtblq_s8(svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_f64)))
-svfloat64_t svtblq_f64(svfloat64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_f32)))
-svfloat32_t svtblq_f32(svfloat32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_f16)))
-svfloat16_t svtblq_f16(svfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s32)))
-svint32_t svtblq_s32(svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s64)))
-svint64_t svtblq_s64(svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s16)))
-svint16_t svtblq_s16(svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u8)))
-svuint8_t svtbxq_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u32)))
-svuint32_t svtbxq_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u64)))
-svuint64_t svtbxq_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u16)))
-svuint16_t svtbxq_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_bf16)))
-svbfloat16_t svtbxq_bf16(svbfloat16_t, svbfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s8)))
-svint8_t svtbxq_s8(svint8_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_f64)))
-svfloat64_t svtbxq_f64(svfloat64_t, svfloat64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_f32)))
-svfloat32_t svtbxq_f32(svfloat32_t, svfloat32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_f16)))
-svfloat16_t svtbxq_f16(svfloat16_t, svfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s32)))
-svint32_t svtbxq_s32(svint32_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s64)))
-svint64_t svtbxq_s64(svint64_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s16)))
-svint16_t svtbxq_s16(svint16_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u8)))
-svuint8_t svuzpq1_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u32)))
-svuint32_t svuzpq1_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u64)))
-svuint64_t svuzpq1_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u16)))
-svuint16_t svuzpq1_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_bf16)))
-svbfloat16_t svuzpq1_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s8)))
-svint8_t svuzpq1_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_f64)))
-svfloat64_t svuzpq1_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_f32)))
-svfloat32_t svuzpq1_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_f16)))
-svfloat16_t svuzpq1_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s32)))
-svint32_t svuzpq1_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s64)))
-svint64_t svuzpq1_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s16)))
-svint16_t svuzpq1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u8)))
-svuint8_t svuzpq2_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u32)))
-svuint32_t svuzpq2_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u64)))
-svuint64_t svuzpq2_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u16)))
-svuint16_t svuzpq2_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_bf16)))
-svbfloat16_t svuzpq2_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s8)))
-svint8_t svuzpq2_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_f64)))
-svfloat64_t svuzpq2_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_f32)))
-svfloat32_t svuzpq2_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_f16)))
-svfloat16_t svuzpq2_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s32)))
-svint32_t svuzpq2_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s64)))
-svint64_t svuzpq2_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s16)))
-svint16_t svuzpq2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u8)))
-svuint8_t svzipq1_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u32)))
-svuint32_t svzipq1_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u64)))
-svuint64_t svzipq1_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u16)))
-svuint16_t svzipq1_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_bf16)))
-svbfloat16_t svzipq1_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s8)))
-svint8_t svzipq1_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_f64)))
-svfloat64_t svzipq1_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_f32)))
-svfloat32_t svzipq1_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_f16)))
-svfloat16_t svzipq1_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s32)))
-svint32_t svzipq1_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s64)))
-svint64_t svzipq1_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s16)))
-svint16_t svzipq1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u8)))
-svuint8_t svzipq2_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u32)))
-svuint32_t svzipq2_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u64)))
-svuint64_t svzipq2_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u16)))
-svuint16_t svzipq2_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_bf16)))
-svbfloat16_t svzipq2_bf16(svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s8)))
-svint8_t svzipq2_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_f64)))
-svfloat64_t svzipq2_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_f32)))
-svfloat32_t svzipq2_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_f16)))
-svfloat16_t svzipq2_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s32)))
-svint32_t svzipq2_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s64)))
-svint64_t svzipq2_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s16)))
-svint16_t svzipq2_s16(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u8)))
-uint8x16_t svaddqv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u32)))
-uint32x4_t svaddqv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u64)))
-uint64x2_t svaddqv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u16)))
-uint16x8_t svaddqv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s8)))
-int8x16_t svaddqv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s32)))
-int32x4_t svaddqv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s64)))
-int64x2_t svaddqv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_s16)))
-int16x8_t svaddqv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_f64)))
-float64x2_t svaddqv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_f32)))
-float32x4_t svaddqv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_f16)))
-float16x8_t svaddqv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u8)))
-uint8x16_t svandqv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u32)))
-uint32x4_t svandqv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u64)))
-uint64x2_t svandqv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_u16)))
-uint16x8_t svandqv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s8)))
-int8x16_t svandqv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s32)))
-int32x4_t svandqv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s64)))
-int64x2_t svandqv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandqv_s16)))
-int16x8_t svandqv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u8)))
-uint8x16_t sveorqv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u32)))
-uint32x4_t sveorqv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u64)))
-uint64x2_t sveorqv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_u16)))
-uint16x8_t sveorqv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s8)))
-int8x16_t sveorqv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s32)))
-int32x4_t sveorqv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s64)))
-int64x2_t sveorqv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorqv_s16)))
-int16x8_t sveorqv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u8)))
-svuint8_t svextq(svuint8_t, svuint8_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u32)))
-svuint32_t svextq(svuint32_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u64)))
-svuint64_t svextq(svuint64_t, svuint64_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_u16)))
-svuint16_t svextq(svuint16_t, svuint16_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_bf16)))
-svbfloat16_t svextq(svbfloat16_t, svbfloat16_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s8)))
-svint8_t svextq(svint8_t, svint8_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_f64)))
-svfloat64_t svextq(svfloat64_t, svfloat64_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_f32)))
-svfloat32_t svextq(svfloat32_t, svfloat32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_f16)))
-svfloat16_t svextq(svfloat16_t, svfloat16_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s32)))
-svint32_t svextq(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s64)))
-svint64_t svextq(svint64_t, svint64_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s16)))
-svint16_t svextq(svint16_t, svint16_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u32)))
-svuint32_t svld1q_gather_index_u32(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u64)))
-svuint64_t svld1q_gather_index_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u16)))
-svuint16_t svld1q_gather_index_u16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_bf16)))
-svbfloat16_t svld1q_gather_index_bf16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_f64)))
-svfloat64_t svld1q_gather_index_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_f32)))
-svfloat32_t svld1q_gather_index_f32(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_f16)))
-svfloat16_t svld1q_gather_index_f16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_s32)))
-svint32_t svld1q_gather_index_s32(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_s64)))
-svint64_t svld1q_gather_index_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_s16)))
-svint16_t svld1q_gather_index_s16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u8)))
-svuint8_t svld1q_gather_offset_u8(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u32)))
-svuint32_t svld1q_gather_offset_u32(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u64)))
-svuint64_t svld1q_gather_offset_u64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_u16)))
-svuint16_t svld1q_gather_offset_u16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_bf16)))
-svbfloat16_t svld1q_gather_offset_bf16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s8)))
-svint8_t svld1q_gather_offset_s8(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_f64)))
-svfloat64_t svld1q_gather_offset_f64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_f32)))
-svfloat32_t svld1q_gather_offset_f32(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_f16)))
-svfloat16_t svld1q_gather_offset_f16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s32)))
-svint32_t svld1q_gather_offset_s32(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s64)))
-svint64_t svld1q_gather_offset_s64(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_offset_s16)))
-svint16_t svld1q_gather_offset_s16(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u8)))
-svuint8_t svld1q_gather_u8(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u32)))
-svuint32_t svld1q_gather_u32(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u64)))
-svuint64_t svld1q_gather_u64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_u16)))
-svuint16_t svld1q_gather_u16(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_bf16)))
-svbfloat16_t svld1q_gather_bf16(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s8)))
-svint8_t svld1q_gather_s8(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_f64)))
-svfloat64_t svld1q_gather_f64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_f32)))
-svfloat32_t svld1q_gather_f32(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_f16)))
-svfloat16_t svld1q_gather_f16(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s32)))
-svint32_t svld1q_gather_s32(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s64)))
-svint64_t svld1q_gather_s64(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_s16)))
-svint16_t svld1q_gather_s16(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_u32)))
-svuint32_t svld1q_gather_index(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_u64)))
-svuint64_t svld1q_gather_index(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_u16)))
-svuint16_t svld1q_gather_index(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_bf16)))
-svbfloat16_t svld1q_gather_index(svbool_t, bfloat16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_f64)))
-svfloat64_t svld1q_gather_index(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_f32)))
-svfloat32_t svld1q_gather_index(svbool_t, float32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_f16)))
-svfloat16_t svld1q_gather_index(svbool_t, float16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_s32)))
-svint32_t svld1q_gather_index(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_s64)))
-svint64_t svld1q_gather_index(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64index_s16)))
-svint16_t svld1q_gather_index(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u8)))
-svuint8_t svld1q_gather_offset(svbool_t, uint8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u32)))
-svuint32_t svld1q_gather_offset(svbool_t, uint32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u64)))
-svuint64_t svld1q_gather_offset(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_u16)))
-svuint16_t svld1q_gather_offset(svbool_t, uint16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_bf16)))
-svbfloat16_t svld1q_gather_offset(svbool_t, bfloat16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s8)))
-svint8_t svld1q_gather_offset(svbool_t, int8_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_f64)))
-svfloat64_t svld1q_gather_offset(svbool_t, float64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_f32)))
-svfloat32_t svld1q_gather_offset(svbool_t, float32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_f16)))
-svfloat16_t svld1q_gather_offset(svbool_t, float16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s32)))
-svint32_t svld1q_gather_offset(svbool_t, int32_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s64)))
-svint64_t svld1q_gather_offset(svbool_t, int64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64offset_s16)))
-svint16_t svld1q_gather_offset(svbool_t, int16_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_u64)))
-svuint64_t svld1udq(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_f64)))
-svfloat64_t svld1udq(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_s64)))
-svint64_t svld1udq(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_vnum_u64)))
-svuint64_t svld1udq_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_vnum_f64)))
-svfloat64_t svld1udq_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1udq_vnum_s64)))
-svint64_t svld1udq_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_u32)))
-svuint32_t svld1uwq(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_f32)))
-svfloat32_t svld1uwq(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_s32)))
-svint32_t svld1uwq(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_vnum_u32)))
-svuint32_t svld1uwq_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_vnum_f32)))
-svfloat32_t svld1uwq_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uwq_vnum_s32)))
-svint32_t svld1uwq_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u8)))
-svuint8x2_t svld2q(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u32)))
-svuint32x2_t svld2q(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u64)))
-svuint64x2_t svld2q(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_u16)))
-svuint16x2_t svld2q(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s8)))
-svint8x2_t svld2q(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_f64)))
-svfloat64x2_t svld2q(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_f32)))
-svfloat32x2_t svld2q(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_f16)))
-svfloat16x2_t svld2q(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s32)))
-svint32x2_t svld2q(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s64)))
-svint64x2_t svld2q(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_s16)))
-svint16x2_t svld2q(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_bf16)))
-svbfloat16x2_t svld2q(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u8)))
-svuint8x2_t svld2q_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u32)))
-svuint32x2_t svld2q_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u64)))
-svuint64x2_t svld2q_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_u16)))
-svuint16x2_t svld2q_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s8)))
-svint8x2_t svld2q_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_f64)))
-svfloat64x2_t svld2q_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_f32)))
-svfloat32x2_t svld2q_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_f16)))
-svfloat16x2_t svld2q_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s32)))
-svint32x2_t svld2q_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s64)))
-svint64x2_t svld2q_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_s16)))
-svint16x2_t svld2q_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2q_vnum_bf16)))
-svbfloat16x2_t svld2q_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u8)))
-svuint8x3_t svld3q(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u32)))
-svuint32x3_t svld3q(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u64)))
-svuint64x3_t svld3q(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_u16)))
-svuint16x3_t svld3q(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s8)))
-svint8x3_t svld3q(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_f64)))
-svfloat64x3_t svld3q(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_f32)))
-svfloat32x3_t svld3q(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_f16)))
-svfloat16x3_t svld3q(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s32)))
-svint32x3_t svld3q(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s64)))
-svint64x3_t svld3q(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_s16)))
-svint16x3_t svld3q(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_bf16)))
-svbfloat16x3_t svld3q(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u8)))
-svuint8x3_t svld3q_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u32)))
-svuint32x3_t svld3q_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u64)))
-svuint64x3_t svld3q_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_u16)))
-svuint16x3_t svld3q_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s8)))
-svint8x3_t svld3q_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_f64)))
-svfloat64x3_t svld3q_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_f32)))
-svfloat32x3_t svld3q_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_f16)))
-svfloat16x3_t svld3q_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s32)))
-svint32x3_t svld3q_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s64)))
-svint64x3_t svld3q_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_s16)))
-svint16x3_t svld3q_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3q_vnum_bf16)))
-svbfloat16x3_t svld3q_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u8)))
-svuint8x4_t svld4q(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u32)))
-svuint32x4_t svld4q(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u64)))
-svuint64x4_t svld4q(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_u16)))
-svuint16x4_t svld4q(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s8)))
-svint8x4_t svld4q(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_f64)))
-svfloat64x4_t svld4q(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_f32)))
-svfloat32x4_t svld4q(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_f16)))
-svfloat16x4_t svld4q(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s32)))
-svint32x4_t svld4q(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s64)))
-svint64x4_t svld4q(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_s16)))
-svint16x4_t svld4q(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_bf16)))
-svbfloat16x4_t svld4q(svbool_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u8)))
-svuint8x4_t svld4q_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u32)))
-svuint32x4_t svld4q_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u64)))
-svuint64x4_t svld4q_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_u16)))
-svuint16x4_t svld4q_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s8)))
-svint8x4_t svld4q_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_f64)))
-svfloat64x4_t svld4q_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_f32)))
-svfloat32x4_t svld4q_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_f16)))
-svfloat16x4_t svld4q_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s32)))
-svint32x4_t svld4q_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s64)))
-svint64x4_t svld4q_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_s16)))
-svint16x4_t svld4q_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4q_vnum_bf16)))
-svbfloat16x4_t svld4q_vnum(svbool_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmqv_f64)))
-float64x2_t svmaxnmqv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmqv_f32)))
-float32x4_t svmaxnmqv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmqv_f16)))
-float16x8_t svmaxnmqv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_f64)))
-float64x2_t svmaxqv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_f32)))
-float32x4_t svmaxqv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_f16)))
-float16x8_t svmaxqv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s8)))
-int8x16_t svmaxqv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s32)))
-int32x4_t svmaxqv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s64)))
-int64x2_t svmaxqv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_s16)))
-int16x8_t svmaxqv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u8)))
-uint8x16_t svmaxqv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u32)))
-uint32x4_t svmaxqv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u64)))
-uint64x2_t svmaxqv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxqv_u16)))
-uint16x8_t svmaxqv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmqv_f64)))
-float64x2_t svminnmqv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmqv_f32)))
-float32x4_t svminnmqv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmqv_f16)))
-float16x8_t svminnmqv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_f64)))
-float64x2_t svminqv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_f32)))
-float32x4_t svminqv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_f16)))
-float16x8_t svminqv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s8)))
-int8x16_t svminqv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s32)))
-int32x4_t svminqv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s64)))
-int64x2_t svminqv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_s16)))
-int16x8_t svminqv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u8)))
-uint8x16_t svminqv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u32)))
-uint32x4_t svminqv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u64)))
-uint64x2_t svminqv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminqv_u16)))
-uint16x8_t svminqv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u8)))
-uint8x16_t svorqv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u32)))
-uint32x4_t svorqv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u64)))
-uint64x2_t svorqv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_u16)))
-uint16x8_t svorqv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s8)))
-int8x16_t svorqv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s32)))
-int32x4_t svorqv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s64)))
-int64x2_t svorqv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorqv_s16)))
-int16x8_t svorqv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u8)))
-svbool_t svpmov(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s8)))
-svbool_t svpmov(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u64)))
-svbool_t svpmov(svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s64)))
-svbool_t svpmov(svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u16)))
-svbool_t svpmov(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s16)))
-svbool_t svpmov(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_u32)))
-svbool_t svpmov(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s32)))
-svbool_t svpmov(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u8)))
-svbool_t svpmov_lane(svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s8)))
-svbool_t svpmov_lane(svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u64)))
-svbool_t svpmov_lane(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s64)))
-svbool_t svpmov_lane(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u16)))
-svbool_t svpmov_lane(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s16)))
-svbool_t svpmov_lane(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u32)))
-svbool_t svpmov_lane(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s32)))
-svbool_t svpmov_lane(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u64_m)))
-svuint64_t svpmov_lane_m(svuint64_t, svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s64_m)))
-svint64_t svpmov_lane_m(svint64_t, svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u16_m)))
-svuint16_t svpmov_lane_m(svuint16_t, svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s16_m)))
-svint16_t svpmov_lane_m(svint16_t, svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_u32_m)))
-svuint32_t svpmov_lane_m(svuint32_t, svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s32_m)))
-svint32_t svpmov_lane_m(svint32_t, svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_u64)))
-void svst1dq(svbool_t, uint64_t const *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_f64)))
-void svst1dq(svbool_t, float64_t const *, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_s64)))
-void svst1dq(svbool_t, int64_t const *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_u64)))
-void svst1dq_vnum(svbool_t, uint64_t const *, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_f64)))
-void svst1dq_vnum(svbool_t, float64_t const *, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_s64)))
-void svst1dq_vnum(svbool_t, int64_t const *, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u8)))
-void svst1q_scatter(svbool_t, svuint64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u32)))
-void svst1q_scatter(svbool_t, svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u64)))
-void svst1q_scatter(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u16)))
-void svst1q_scatter(svbool_t, svuint64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_bf16)))
-void svst1q_scatter(svbool_t, svuint64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s8)))
-void svst1q_scatter(svbool_t, svuint64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_f64)))
-void svst1q_scatter(svbool_t, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_f32)))
-void svst1q_scatter(svbool_t, svuint64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_f16)))
-void svst1q_scatter(svbool_t, svuint64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s32)))
-void svst1q_scatter(svbool_t, svuint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s64)))
-void svst1q_scatter(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_s16)))
-void svst1q_scatter(svbool_t, svuint64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_u32)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_u64)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_u16)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_bf16)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_f64)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_f32)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_f16)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_s32)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_s64)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_index_s16)))
-void svst1q_scatter_index(svbool_t, svuint64_t, int64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u8)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u32)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u64)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_u16)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_bf16)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s8)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_f64)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_f32)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_f16)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s32)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s64)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s16)))
-void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u32)))
-void svst1q_scatter_index(svbool_t, uint32_t *, svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u64)))
-void svst1q_scatter_index(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u16)))
-void svst1q_scatter_index(svbool_t, uint16_t *, svuint64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_bf16)))
-void svst1q_scatter_index(svbool_t, bfloat16_t *, svuint64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_f64)))
-void svst1q_scatter_index(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_f32)))
-void svst1q_scatter_index(svbool_t, float32_t *, svuint64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_f16)))
-void svst1q_scatter_index(svbool_t, float16_t *, svuint64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s32)))
-void svst1q_scatter_index(svbool_t, int32_t *, svuint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s64)))
-void svst1q_scatter_index(svbool_t, int64_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s16)))
-void svst1q_scatter_index(svbool_t, int16_t *, svuint64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u8)))
-void svst1q_scatter_offset(svbool_t, uint8_t *, svuint64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u32)))
-void svst1q_scatter_offset(svbool_t, uint32_t *, svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u64)))
-void svst1q_scatter_offset(svbool_t, uint64_t *, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u16)))
-void svst1q_scatter_offset(svbool_t, uint16_t *, svuint64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_bf16)))
-void svst1q_scatter_offset(svbool_t, bfloat16_t *, svuint64_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s8)))
-void svst1q_scatter_offset(svbool_t, int8_t *, svuint64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_f64)))
-void svst1q_scatter_offset(svbool_t, float64_t *, svuint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_f32)))
-void svst1q_scatter_offset(svbool_t, float32_t *, svuint64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_f16)))
-void svst1q_scatter_offset(svbool_t, float16_t *, svuint64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s32)))
-void svst1q_scatter_offset(svbool_t, int32_t *, svuint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s64)))
-void svst1q_scatter_offset(svbool_t, int64_t *, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s16)))
-void svst1q_scatter_offset(svbool_t, int16_t *, svuint64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_u32)))
-void svst1wq(svbool_t, uint32_t const *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_f32)))
-void svst1wq(svbool_t, float32_t const *, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_s32)))
-void svst1wq(svbool_t, int32_t const *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_u32)))
-void svst1wq_vnum(svbool_t, uint32_t const *, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_f32)))
-void svst1wq_vnum(svbool_t, float32_t const *, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_s32)))
-void svst1wq_vnum(svbool_t, int32_t const *, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u8)))
-void svst2q(svbool_t, uint8_t const *, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u32)))
-void svst2q(svbool_t, uint32_t const *, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u64)))
-void svst2q(svbool_t, uint64_t const *, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u16)))
-void svst2q(svbool_t, uint16_t const *, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s8)))
-void svst2q(svbool_t, int8_t const *, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_f64)))
-void svst2q(svbool_t, float64_t const *, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_f32)))
-void svst2q(svbool_t, float32_t const *, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_f16)))
-void svst2q(svbool_t, float16_t const *, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s32)))
-void svst2q(svbool_t, int32_t const *, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s64)))
-void svst2q(svbool_t, int64_t const *, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_s16)))
-void svst2q(svbool_t, int16_t const *, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_bf16)))
-void svst2q(svbool_t, bfloat16_t const *, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u8)))
-void svst2q_vnum(svbool_t, uint8_t const *, int64_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u32)))
-void svst2q_vnum(svbool_t, uint32_t const *, int64_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u64)))
-void svst2q_vnum(svbool_t, uint64_t const *, int64_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_u16)))
-void svst2q_vnum(svbool_t, uint16_t const *, int64_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s8)))
-void svst2q_vnum(svbool_t, int8_t const *, int64_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_f64)))
-void svst2q_vnum(svbool_t, float64_t const *, int64_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_f32)))
-void svst2q_vnum(svbool_t, float32_t const *, int64_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_f16)))
-void svst2q_vnum(svbool_t, float16_t const *, int64_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s32)))
-void svst2q_vnum(svbool_t, int32_t const *, int64_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s64)))
-void svst2q_vnum(svbool_t, int64_t const *, int64_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_s16)))
-void svst2q_vnum(svbool_t, int16_t const *, int64_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_vnum_bf16)))
-void svst2q_vnum(svbool_t, bfloat16_t const *, int64_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u8)))
-void svst3q(svbool_t, uint8_t const *, svuint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u32)))
-void svst3q(svbool_t, uint32_t const *, svuint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u64)))
-void svst3q(svbool_t, uint64_t const *, svuint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_u16)))
-void svst3q(svbool_t, uint16_t const *, svuint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s8)))
-void svst3q(svbool_t, int8_t const *, svint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_f64)))
-void svst3q(svbool_t, float64_t const *, svfloat64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_f32)))
-void svst3q(svbool_t, float32_t const *, svfloat32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_f16)))
-void svst3q(svbool_t, float16_t const *, svfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s32)))
-void svst3q(svbool_t, int32_t const *, svint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s64)))
-void svst3q(svbool_t, int64_t const *, svint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_s16)))
-void svst3q(svbool_t, int16_t const *, svint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_bf16)))
-void svst3q(svbool_t, bfloat16_t const *, svbfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u8)))
-void svst3q_vnum(svbool_t, uint8_t const *, int64_t, svuint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u32)))
-void svst3q_vnum(svbool_t, uint32_t const *, int64_t, svuint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u64)))
-void svst3q_vnum(svbool_t, uint64_t const *, int64_t, svuint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_u16)))
-void svst3q_vnum(svbool_t, uint16_t const *, int64_t, svuint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s8)))
-void svst3q_vnum(svbool_t, int8_t const *, int64_t, svint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_f64)))
-void svst3q_vnum(svbool_t, float64_t const *, int64_t, svfloat64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_f32)))
-void svst3q_vnum(svbool_t, float32_t const *, int64_t, svfloat32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_f16)))
-void svst3q_vnum(svbool_t, float16_t const *, int64_t, svfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s32)))
-void svst3q_vnum(svbool_t, int32_t const *, int64_t, svint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s64)))
-void svst3q_vnum(svbool_t, int64_t const *, int64_t, svint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_s16)))
-void svst3q_vnum(svbool_t, int16_t const *, int64_t, svint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3q_vnum_bf16)))
-void svst3q_vnum(svbool_t, bfloat16_t const *, int64_t, svbfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u8)))
-void svst4q(svbool_t, uint8_t const *, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u32)))
-void svst4q(svbool_t, uint32_t const *, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u64)))
-void svst4q(svbool_t, uint64_t const *, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_u16)))
-void svst4q(svbool_t, uint16_t const *, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s8)))
-void svst4q(svbool_t, int8_t const *, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_f64)))
-void svst4q(svbool_t, float64_t const *, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_f32)))
-void svst4q(svbool_t, float32_t const *, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_f16)))
-void svst4q(svbool_t, float16_t const *, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s32)))
-void svst4q(svbool_t, int32_t const *, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s64)))
-void svst4q(svbool_t, int64_t const *, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_s16)))
-void svst4q(svbool_t, int16_t const *, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_bf16)))
-void svst4q(svbool_t, bfloat16_t const *, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u8)))
-void svst4q_vnum(svbool_t, uint8_t const *, int64_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u32)))
-void svst4q_vnum(svbool_t, uint32_t const *, int64_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u64)))
-void svst4q_vnum(svbool_t, uint64_t const *, int64_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_u16)))
-void svst4q_vnum(svbool_t, uint16_t const *, int64_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s8)))
-void svst4q_vnum(svbool_t, int8_t const *, int64_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_f64)))
-void svst4q_vnum(svbool_t, float64_t const *, int64_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_f32)))
-void svst4q_vnum(svbool_t, float32_t const *, int64_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_f16)))
-void svst4q_vnum(svbool_t, float16_t const *, int64_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s32)))
-void svst4q_vnum(svbool_t, int32_t const *, int64_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s64)))
-void svst4q_vnum(svbool_t, int64_t const *, int64_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_s16)))
-void svst4q_vnum(svbool_t, int16_t const *, int64_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4q_vnum_bf16)))
-void svst4q_vnum(svbool_t, bfloat16_t const *, int64_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u8)))
-svuint8_t svtblq(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u32)))
-svuint32_t svtblq(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u64)))
-svuint64_t svtblq(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_u16)))
-svuint16_t svtblq(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_bf16)))
-svbfloat16_t svtblq(svbfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s8)))
-svint8_t svtblq(svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_f64)))
-svfloat64_t svtblq(svfloat64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_f32)))
-svfloat32_t svtblq(svfloat32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_f16)))
-svfloat16_t svtblq(svfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s32)))
-svint32_t svtblq(svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s64)))
-svint64_t svtblq(svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s16)))
-svint16_t svtblq(svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u8)))
-svuint8_t svtbxq(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u32)))
-svuint32_t svtbxq(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u64)))
-svuint64_t svtbxq(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u16)))
-svuint16_t svtbxq(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_bf16)))
-svbfloat16_t svtbxq(svbfloat16_t, svbfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s8)))
-svint8_t svtbxq(svint8_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_f64)))
-svfloat64_t svtbxq(svfloat64_t, svfloat64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_f32)))
-svfloat32_t svtbxq(svfloat32_t, svfloat32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_f16)))
-svfloat16_t svtbxq(svfloat16_t, svfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s32)))
-svint32_t svtbxq(svint32_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s64)))
-svint64_t svtbxq(svint64_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s16)))
-svint16_t svtbxq(svint16_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u8)))
-svuint8_t svuzpq1(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u32)))
-svuint32_t svuzpq1(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u64)))
-svuint64_t svuzpq1(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u16)))
-svuint16_t svuzpq1(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_bf16)))
-svbfloat16_t svuzpq1(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s8)))
-svint8_t svuzpq1(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_f64)))
-svfloat64_t svuzpq1(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_f32)))
-svfloat32_t svuzpq1(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_f16)))
-svfloat16_t svuzpq1(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s32)))
-svint32_t svuzpq1(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s64)))
-svint64_t svuzpq1(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s16)))
-svint16_t svuzpq1(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u8)))
-svuint8_t svuzpq2(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u32)))
-svuint32_t svuzpq2(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u64)))
-svuint64_t svuzpq2(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u16)))
-svuint16_t svuzpq2(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_bf16)))
-svbfloat16_t svuzpq2(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s8)))
-svint8_t svuzpq2(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_f64)))
-svfloat64_t svuzpq2(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_f32)))
-svfloat32_t svuzpq2(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_f16)))
-svfloat16_t svuzpq2(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s32)))
-svint32_t svuzpq2(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s64)))
-svint64_t svuzpq2(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s16)))
-svint16_t svuzpq2(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u8)))
-svuint8_t svzipq1(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u32)))
-svuint32_t svzipq1(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u64)))
-svuint64_t svzipq1(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u16)))
-svuint16_t svzipq1(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_bf16)))
-svbfloat16_t svzipq1(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s8)))
-svint8_t svzipq1(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_f64)))
-svfloat64_t svzipq1(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_f32)))
-svfloat32_t svzipq1(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_f16)))
-svfloat16_t svzipq1(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s32)))
-svint32_t svzipq1(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s64)))
-svint64_t svzipq1(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s16)))
-svint16_t svzipq1(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u8)))
-svuint8_t svzipq2(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u32)))
-svuint32_t svzipq2(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u64)))
-svuint64_t svzipq2(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u16)))
-svuint16_t svzipq2(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_bf16)))
-svbfloat16_t svzipq2(svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s8)))
-svint8_t svzipq2(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_f64)))
-svfloat64_t svzipq2(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_f32)))
-svfloat32_t svzipq2(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_f16)))
-svfloat16_t svzipq2(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s32)))
-svint32_t svzipq2(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s64)))
-svint64_t svzipq2(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s16)))
-svint16_t svzipq2(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_bf16)))
-svbfloat16_t svdup_laneq_bf16(svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_bf16)))
-svbfloat16_t svdup_laneq(svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s8)))
-svint8_t svclamp_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s32)))
-svint32_t svclamp_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s64)))
-svint64_t svclamp_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s16)))
-svint16_t svclamp_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u8)))
-svuint8_t svclamp_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u32)))
-svuint32_t svclamp_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u64)))
-svuint64_t svclamp_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u16)))
-svuint16_t svclamp_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_b16)))
-svbool_t svpsel_lane_b16(svbool_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_b32)))
-svbool_t svpsel_lane_b32(svbool_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_b64)))
-svbool_t svpsel_lane_b64(svbool_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_b8)))
-svbool_t svpsel_lane_b8(svbool_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u8_m)))
-svuint8_t svrevd_u8_m(svuint8_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u32_m)))
-svuint32_t svrevd_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u64_m)))
-svuint64_t svrevd_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u16_m)))
-svuint16_t svrevd_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_bf16_m)))
-svbfloat16_t svrevd_bf16_m(svbfloat16_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s8_m)))
-svint8_t svrevd_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f64_m)))
-svfloat64_t svrevd_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f32_m)))
-svfloat32_t svrevd_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f16_m)))
-svfloat16_t svrevd_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s32_m)))
-svint32_t svrevd_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s64_m)))
-svint64_t svrevd_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s16_m)))
-svint16_t svrevd_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u8_x)))
-svuint8_t svrevd_u8_x(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u32_x)))
-svuint32_t svrevd_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u64_x)))
-svuint64_t svrevd_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u16_x)))
-svuint16_t svrevd_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_bf16_x)))
-svbfloat16_t svrevd_bf16_x(svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s8_x)))
-svint8_t svrevd_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f64_x)))
-svfloat64_t svrevd_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f32_x)))
-svfloat32_t svrevd_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f16_x)))
-svfloat16_t svrevd_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s32_x)))
-svint32_t svrevd_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s64_x)))
-svint64_t svrevd_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s16_x)))
-svint16_t svrevd_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u8_z)))
-svuint8_t svrevd_u8_z(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u32_z)))
-svuint32_t svrevd_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u64_z)))
-svuint64_t svrevd_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u16_z)))
-svuint16_t svrevd_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_bf16_z)))
-svbfloat16_t svrevd_bf16_z(svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s8_z)))
-svint8_t svrevd_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f64_z)))
-svfloat64_t svrevd_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f32_z)))
-svfloat32_t svrevd_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f16_z)))
-svfloat16_t svrevd_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s32_z)))
-svint32_t svrevd_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s64_z)))
-svint64_t svrevd_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s16_z)))
-svint16_t svrevd_s16_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s8)))
-svint8_t svclamp(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s32)))
-svint32_t svclamp(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s64)))
-svint64_t svclamp(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_s16)))
-svint16_t svclamp(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u8)))
-svuint8_t svclamp(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u32)))
-svuint32_t svclamp(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u64)))
-svuint64_t svclamp(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_u16)))
-svuint16_t svclamp(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u8_m)))
-svuint8_t svrevd_m(svuint8_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u32_m)))
-svuint32_t svrevd_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u64_m)))
-svuint64_t svrevd_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u16_m)))
-svuint16_t svrevd_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_bf16_m)))
-svbfloat16_t svrevd_m(svbfloat16_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s8_m)))
-svint8_t svrevd_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f64_m)))
-svfloat64_t svrevd_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f32_m)))
-svfloat32_t svrevd_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f16_m)))
-svfloat16_t svrevd_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s32_m)))
-svint32_t svrevd_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s64_m)))
-svint64_t svrevd_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s16_m)))
-svint16_t svrevd_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u8_x)))
-svuint8_t svrevd_x(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u32_x)))
-svuint32_t svrevd_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u64_x)))
-svuint64_t svrevd_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u16_x)))
-svuint16_t svrevd_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_bf16_x)))
-svbfloat16_t svrevd_x(svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s8_x)))
-svint8_t svrevd_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f64_x)))
-svfloat64_t svrevd_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f32_x)))
-svfloat32_t svrevd_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f16_x)))
-svfloat16_t svrevd_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s32_x)))
-svint32_t svrevd_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s64_x)))
-svint64_t svrevd_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s16_x)))
-svint16_t svrevd_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u8_z)))
-svuint8_t svrevd_z(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u32_z)))
-svuint32_t svrevd_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u64_z)))
-svuint64_t svrevd_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_u16_z)))
-svuint16_t svrevd_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_bf16_z)))
-svbfloat16_t svrevd_z(svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s8_z)))
-svint8_t svrevd_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f64_z)))
-svfloat64_t svrevd_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f32_z)))
-svfloat32_t svrevd_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_f16_z)))
-svfloat16_t svrevd_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s32_z)))
-svint32_t svrevd_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s64_z)))
-svint64_t svrevd_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevd_s16_z)))
-svint16_t svrevd_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslb_f32)))
-svfloat32_t svbfmlslb_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslb_lane_f32)))
-svfloat32_t svbfmlslb_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslt_f32)))
-svfloat32_t svbfmlslt_f32(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslt_lane_f32)))
-svfloat32_t svbfmlslt_lane_f32(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_f64)))
-svfloat64_t svclamp_f64(svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_f32)))
-svfloat32_t svclamp_f32(svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_f16)))
-svfloat16_t svclamp_f16(svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_c8)))
-uint64_t svcntp_c8(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_c32)))
-uint64_t svcntp_c32(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_c64)))
-uint64_t svcntp_c64(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_c16)))
-uint64_t svcntp_c16(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_b)))
-svboolx2_t svcreate2_b(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_b)))
-svboolx4_t svcreate4_b(svbool_t, svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_f32_f16)))
-svfloat32_t svdot_f32_f16(svfloat32_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_s32_s16)))
-svint32_t svdot_s32_s16(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_u32_u16)))
-svuint32_t svdot_u32_u16(svuint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_f32_f16)))
-svfloat32_t svdot_lane_f32_f16(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_s32_s16)))
-svint32_t svdot_lane_s32_s16(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_u32_u16)))
-svuint32_t svdot_lane_u32_u16(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_b)))
-svbool_t svget2_b(svboolx2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_b)))
-svbool_t svget4_b(svboolx4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u8_x2)))
-svuint8x2_t svld1_u8_x2(svcount_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s8_x2)))
-svint8x2_t svld1_s8_x2(svcount_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u64_x2)))
-svuint64x2_t svld1_u64_x2(svcount_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f64_x2)))
-svfloat64x2_t svld1_f64_x2(svcount_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s64_x2)))
-svint64x2_t svld1_s64_x2(svcount_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u16_x2)))
-svuint16x2_t svld1_u16_x2(svcount_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16_x2)))
-svbfloat16x2_t svld1_bf16_x2(svcount_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f16_x2)))
-svfloat16x2_t svld1_f16_x2(svcount_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s16_x2)))
-svint16x2_t svld1_s16_x2(svcount_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u32_x2)))
-svuint32x2_t svld1_u32_x2(svcount_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f32_x2)))
-svfloat32x2_t svld1_f32_x2(svcount_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s32_x2)))
-svint32x2_t svld1_s32_x2(svcount_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u8_x4)))
-svuint8x4_t svld1_u8_x4(svcount_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s8_x4)))
-svint8x4_t svld1_s8_x4(svcount_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u64_x4)))
-svuint64x4_t svld1_u64_x4(svcount_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f64_x4)))
-svfloat64x4_t svld1_f64_x4(svcount_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s64_x4)))
-svint64x4_t svld1_s64_x4(svcount_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u16_x4)))
-svuint16x4_t svld1_u16_x4(svcount_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16_x4)))
-svbfloat16x4_t svld1_bf16_x4(svcount_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f16_x4)))
-svfloat16x4_t svld1_f16_x4(svcount_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s16_x4)))
-svint16x4_t svld1_s16_x4(svcount_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u32_x4)))
-svuint32x4_t svld1_u32_x4(svcount_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f32_x4)))
-svfloat32x4_t svld1_f32_x4(svcount_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s32_x4)))
-svint32x4_t svld1_s32_x4(svcount_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u8_x2)))
-svuint8x2_t svld1_vnum_u8_x2(svcount_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s8_x2)))
-svint8x2_t svld1_vnum_s8_x2(svcount_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u64_x2)))
-svuint64x2_t svld1_vnum_u64_x2(svcount_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f64_x2)))
-svfloat64x2_t svld1_vnum_f64_x2(svcount_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s64_x2)))
-svint64x2_t svld1_vnum_s64_x2(svcount_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u16_x2)))
-svuint16x2_t svld1_vnum_u16_x2(svcount_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16_x2)))
-svbfloat16x2_t svld1_vnum_bf16_x2(svcount_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f16_x2)))
-svfloat16x2_t svld1_vnum_f16_x2(svcount_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s16_x2)))
-svint16x2_t svld1_vnum_s16_x2(svcount_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u32_x2)))
-svuint32x2_t svld1_vnum_u32_x2(svcount_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f32_x2)))
-svfloat32x2_t svld1_vnum_f32_x2(svcount_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s32_x2)))
-svint32x2_t svld1_vnum_s32_x2(svcount_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u8_x4)))
-svuint8x4_t svld1_vnum_u8_x4(svcount_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s8_x4)))
-svint8x4_t svld1_vnum_s8_x4(svcount_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u64_x4)))
-svuint64x4_t svld1_vnum_u64_x4(svcount_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f64_x4)))
-svfloat64x4_t svld1_vnum_f64_x4(svcount_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s64_x4)))
-svint64x4_t svld1_vnum_s64_x4(svcount_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u16_x4)))
-svuint16x4_t svld1_vnum_u16_x4(svcount_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16_x4)))
-svbfloat16x4_t svld1_vnum_bf16_x4(svcount_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f16_x4)))
-svfloat16x4_t svld1_vnum_f16_x4(svcount_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s16_x4)))
-svint16x4_t svld1_vnum_s16_x4(svcount_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u32_x4)))
-svuint32x4_t svld1_vnum_u32_x4(svcount_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f32_x4)))
-svfloat32x4_t svld1_vnum_f32_x4(svcount_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s32_x4)))
-svint32x4_t svld1_vnum_s32_x4(svcount_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u8_x2)))
-svuint8x2_t svldnt1_u8_x2(svcount_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s8_x2)))
-svint8x2_t svldnt1_s8_x2(svcount_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u64_x2)))
-svuint64x2_t svldnt1_u64_x2(svcount_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f64_x2)))
-svfloat64x2_t svldnt1_f64_x2(svcount_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s64_x2)))
-svint64x2_t svldnt1_s64_x2(svcount_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u16_x2)))
-svuint16x2_t svldnt1_u16_x2(svcount_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16_x2)))
-svbfloat16x2_t svldnt1_bf16_x2(svcount_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f16_x2)))
-svfloat16x2_t svldnt1_f16_x2(svcount_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s16_x2)))
-svint16x2_t svldnt1_s16_x2(svcount_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u32_x2)))
-svuint32x2_t svldnt1_u32_x2(svcount_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f32_x2)))
-svfloat32x2_t svldnt1_f32_x2(svcount_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s32_x2)))
-svint32x2_t svldnt1_s32_x2(svcount_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u8_x4)))
-svuint8x4_t svldnt1_u8_x4(svcount_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s8_x4)))
-svint8x4_t svldnt1_s8_x4(svcount_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u64_x4)))
-svuint64x4_t svldnt1_u64_x4(svcount_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f64_x4)))
-svfloat64x4_t svldnt1_f64_x4(svcount_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s64_x4)))
-svint64x4_t svldnt1_s64_x4(svcount_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u16_x4)))
-svuint16x4_t svldnt1_u16_x4(svcount_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16_x4)))
-svbfloat16x4_t svldnt1_bf16_x4(svcount_t, bfloat16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f16_x4)))
-svfloat16x4_t svldnt1_f16_x4(svcount_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s16_x4)))
-svint16x4_t svldnt1_s16_x4(svcount_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u32_x4)))
-svuint32x4_t svldnt1_u32_x4(svcount_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f32_x4)))
-svfloat32x4_t svldnt1_f32_x4(svcount_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s32_x4)))
-svint32x4_t svldnt1_s32_x4(svcount_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u8_x2)))
-svuint8x2_t svldnt1_vnum_u8_x2(svcount_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s8_x2)))
-svint8x2_t svldnt1_vnum_s8_x2(svcount_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u64_x2)))
-svuint64x2_t svldnt1_vnum_u64_x2(svcount_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f64_x2)))
-svfloat64x2_t svldnt1_vnum_f64_x2(svcount_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s64_x2)))
-svint64x2_t svldnt1_vnum_s64_x2(svcount_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u16_x2)))
-svuint16x2_t svldnt1_vnum_u16_x2(svcount_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16_x2)))
-svbfloat16x2_t svldnt1_vnum_bf16_x2(svcount_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f16_x2)))
-svfloat16x2_t svldnt1_vnum_f16_x2(svcount_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s16_x2)))
-svint16x2_t svldnt1_vnum_s16_x2(svcount_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u32_x2)))
-svuint32x2_t svldnt1_vnum_u32_x2(svcount_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f32_x2)))
-svfloat32x2_t svldnt1_vnum_f32_x2(svcount_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s32_x2)))
-svint32x2_t svldnt1_vnum_s32_x2(svcount_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u8_x4)))
-svuint8x4_t svldnt1_vnum_u8_x4(svcount_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s8_x4)))
-svint8x4_t svldnt1_vnum_s8_x4(svcount_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u64_x4)))
-svuint64x4_t svldnt1_vnum_u64_x4(svcount_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f64_x4)))
-svfloat64x4_t svldnt1_vnum_f64_x4(svcount_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s64_x4)))
-svint64x4_t svldnt1_vnum_s64_x4(svcount_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u16_x4)))
-svuint16x4_t svldnt1_vnum_u16_x4(svcount_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16_x4)))
-svbfloat16x4_t svldnt1_vnum_bf16_x4(svcount_t, bfloat16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f16_x4)))
-svfloat16x4_t svldnt1_vnum_f16_x4(svcount_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s16_x4)))
-svint16x4_t svldnt1_vnum_s16_x4(svcount_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u32_x4)))
-svuint32x4_t svldnt1_vnum_u32_x4(svcount_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f32_x4)))
-svfloat32x4_t svldnt1_vnum_f32_x4(svcount_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s32_x4)))
-svint32x4_t svldnt1_vnum_s32_x4(svcount_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c8)))
-svbool_t svpext_lane_c8(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c32)))
-svbool_t svpext_lane_c32(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c64)))
-svbool_t svpext_lane_c64(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c16)))
-svbool_t svpext_lane_c16(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c8_x2)))
-svboolx2_t svpext_lane_c8_x2(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c32_x2)))
-svboolx2_t svpext_lane_c32_x2(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c64_x2)))
-svboolx2_t svpext_lane_c64_x2(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpext_lane_c16_x2)))
-svboolx2_t svpext_lane_c16_x2(svcount_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_c)))
-svcount_t svpfalse_c(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_c16)))
-svcount_t svpsel_lane_c16(svcount_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_c32)))
-svcount_t svpsel_lane_c32(svcount_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_c64)))
-svcount_t svpsel_lane_c64(svcount_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpsel_lane_c8)))
-svcount_t svpsel_lane_c8(svcount_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_c8)))
-svcount_t svptrue_c8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_c32)))
-svcount_t svptrue_c32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_c64)))
-svcount_t svptrue_c64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_c16)))
-svcount_t svptrue_c16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_s16_s32_x2)))
-svint16_t svqcvtn_s16_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_s32_x2)))
-svuint16_t svqcvtn_u16_s32_x2(svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_u32_x2)))
-svuint16_t svqcvtn_u16_u32_x2(svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_s16_s32_x2)))
-svint16_t svqrshrn_n_s16_s32_x2(svint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_u16_u32_x2)))
-svuint16_t svqrshrn_n_u16_u32_x2(svuint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrun_n_u16_s32_x2)))
-svuint16_t svqrshrun_n_u16_s32_x2(svint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svreinterpret_b)))
-svbool_t svreinterpret_b(svcount_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svreinterpret_c)))
-svcount_t svreinterpret_c(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_b)))
-svboolx2_t svset2_b(svboolx2_t, uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_b)))
-svboolx4_t svset4_b(svboolx4_t, uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u8_x2)))
-void svst1_u8_x2(svcount_t, uint8_t *, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s8_x2)))
-void svst1_s8_x2(svcount_t, int8_t *, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u64_x2)))
-void svst1_u64_x2(svcount_t, uint64_t *, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f64_x2)))
-void svst1_f64_x2(svcount_t, float64_t *, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s64_x2)))
-void svst1_s64_x2(svcount_t, int64_t *, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u16_x2)))
-void svst1_u16_x2(svcount_t, uint16_t *, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16_x2)))
-void svst1_bf16_x2(svcount_t, bfloat16_t *, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f16_x2)))
-void svst1_f16_x2(svcount_t, float16_t *, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s16_x2)))
-void svst1_s16_x2(svcount_t, int16_t *, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u32_x2)))
-void svst1_u32_x2(svcount_t, uint32_t *, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f32_x2)))
-void svst1_f32_x2(svcount_t, float32_t *, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s32_x2)))
-void svst1_s32_x2(svcount_t, int32_t *, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u8_x4)))
-void svst1_u8_x4(svcount_t, uint8_t *, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s8_x4)))
-void svst1_s8_x4(svcount_t, int8_t *, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u64_x4)))
-void svst1_u64_x4(svcount_t, uint64_t *, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f64_x4)))
-void svst1_f64_x4(svcount_t, float64_t *, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s64_x4)))
-void svst1_s64_x4(svcount_t, int64_t *, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u16_x4)))
-void svst1_u16_x4(svcount_t, uint16_t *, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16_x4)))
-void svst1_bf16_x4(svcount_t, bfloat16_t *, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f16_x4)))
-void svst1_f16_x4(svcount_t, float16_t *, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s16_x4)))
-void svst1_s16_x4(svcount_t, int16_t *, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u32_x4)))
-void svst1_u32_x4(svcount_t, uint32_t *, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f32_x4)))
-void svst1_f32_x4(svcount_t, float32_t *, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s32_x4)))
-void svst1_s32_x4(svcount_t, int32_t *, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u8_x2)))
-void svst1_vnum_u8_x2(svcount_t, uint8_t *, int64_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s8_x2)))
-void svst1_vnum_s8_x2(svcount_t, int8_t *, int64_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u64_x2)))
-void svst1_vnum_u64_x2(svcount_t, uint64_t *, int64_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f64_x2)))
-void svst1_vnum_f64_x2(svcount_t, float64_t *, int64_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s64_x2)))
-void svst1_vnum_s64_x2(svcount_t, int64_t *, int64_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u16_x2)))
-void svst1_vnum_u16_x2(svcount_t, uint16_t *, int64_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16_x2)))
-void svst1_vnum_bf16_x2(svcount_t, bfloat16_t *, int64_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f16_x2)))
-void svst1_vnum_f16_x2(svcount_t, float16_t *, int64_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s16_x2)))
-void svst1_vnum_s16_x2(svcount_t, int16_t *, int64_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u32_x2)))
-void svst1_vnum_u32_x2(svcount_t, uint32_t *, int64_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f32_x2)))
-void svst1_vnum_f32_x2(svcount_t, float32_t *, int64_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s32_x2)))
-void svst1_vnum_s32_x2(svcount_t, int32_t *, int64_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u8_x4)))
-void svst1_vnum_u8_x4(svcount_t, uint8_t *, int64_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s8_x4)))
-void svst1_vnum_s8_x4(svcount_t, int8_t *, int64_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u64_x4)))
-void svst1_vnum_u64_x4(svcount_t, uint64_t *, int64_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f64_x4)))
-void svst1_vnum_f64_x4(svcount_t, float64_t *, int64_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s64_x4)))
-void svst1_vnum_s64_x4(svcount_t, int64_t *, int64_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u16_x4)))
-void svst1_vnum_u16_x4(svcount_t, uint16_t *, int64_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16_x4)))
-void svst1_vnum_bf16_x4(svcount_t, bfloat16_t *, int64_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f16_x4)))
-void svst1_vnum_f16_x4(svcount_t, float16_t *, int64_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s16_x4)))
-void svst1_vnum_s16_x4(svcount_t, int16_t *, int64_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u32_x4)))
-void svst1_vnum_u32_x4(svcount_t, uint32_t *, int64_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f32_x4)))
-void svst1_vnum_f32_x4(svcount_t, float32_t *, int64_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s32_x4)))
-void svst1_vnum_s32_x4(svcount_t, int32_t *, int64_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u8_x2)))
-void svstnt1_u8_x2(svcount_t, uint8_t *, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s8_x2)))
-void svstnt1_s8_x2(svcount_t, int8_t *, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u64_x2)))
-void svstnt1_u64_x2(svcount_t, uint64_t *, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f64_x2)))
-void svstnt1_f64_x2(svcount_t, float64_t *, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s64_x2)))
-void svstnt1_s64_x2(svcount_t, int64_t *, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u16_x2)))
-void svstnt1_u16_x2(svcount_t, uint16_t *, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16_x2)))
-void svstnt1_bf16_x2(svcount_t, bfloat16_t *, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f16_x2)))
-void svstnt1_f16_x2(svcount_t, float16_t *, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s16_x2)))
-void svstnt1_s16_x2(svcount_t, int16_t *, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u32_x2)))
-void svstnt1_u32_x2(svcount_t, uint32_t *, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f32_x2)))
-void svstnt1_f32_x2(svcount_t, float32_t *, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s32_x2)))
-void svstnt1_s32_x2(svcount_t, int32_t *, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u8_x4)))
-void svstnt1_u8_x4(svcount_t, uint8_t *, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s8_x4)))
-void svstnt1_s8_x4(svcount_t, int8_t *, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u64_x4)))
-void svstnt1_u64_x4(svcount_t, uint64_t *, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f64_x4)))
-void svstnt1_f64_x4(svcount_t, float64_t *, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s64_x4)))
-void svstnt1_s64_x4(svcount_t, int64_t *, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u16_x4)))
-void svstnt1_u16_x4(svcount_t, uint16_t *, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16_x4)))
-void svstnt1_bf16_x4(svcount_t, bfloat16_t *, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f16_x4)))
-void svstnt1_f16_x4(svcount_t, float16_t *, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s16_x4)))
-void svstnt1_s16_x4(svcount_t, int16_t *, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u32_x4)))
-void svstnt1_u32_x4(svcount_t, uint32_t *, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f32_x4)))
-void svstnt1_f32_x4(svcount_t, float32_t *, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s32_x4)))
-void svstnt1_s32_x4(svcount_t, int32_t *, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u8_x2)))
-void svstnt1_vnum_u8_x2(svcount_t, uint8_t *, int64_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s8_x2)))
-void svstnt1_vnum_s8_x2(svcount_t, int8_t *, int64_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u64_x2)))
-void svstnt1_vnum_u64_x2(svcount_t, uint64_t *, int64_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f64_x2)))
-void svstnt1_vnum_f64_x2(svcount_t, float64_t *, int64_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s64_x2)))
-void svstnt1_vnum_s64_x2(svcount_t, int64_t *, int64_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u16_x2)))
-void svstnt1_vnum_u16_x2(svcount_t, uint16_t *, int64_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16_x2)))
-void svstnt1_vnum_bf16_x2(svcount_t, bfloat16_t *, int64_t, svbfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f16_x2)))
-void svstnt1_vnum_f16_x2(svcount_t, float16_t *, int64_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s16_x2)))
-void svstnt1_vnum_s16_x2(svcount_t, int16_t *, int64_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u32_x2)))
-void svstnt1_vnum_u32_x2(svcount_t, uint32_t *, int64_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f32_x2)))
-void svstnt1_vnum_f32_x2(svcount_t, float32_t *, int64_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s32_x2)))
-void svstnt1_vnum_s32_x2(svcount_t, int32_t *, int64_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u8_x4)))
-void svstnt1_vnum_u8_x4(svcount_t, uint8_t *, int64_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s8_x4)))
-void svstnt1_vnum_s8_x4(svcount_t, int8_t *, int64_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u64_x4)))
-void svstnt1_vnum_u64_x4(svcount_t, uint64_t *, int64_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f64_x4)))
-void svstnt1_vnum_f64_x4(svcount_t, float64_t *, int64_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s64_x4)))
-void svstnt1_vnum_s64_x4(svcount_t, int64_t *, int64_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u16_x4)))
-void svstnt1_vnum_u16_x4(svcount_t, uint16_t *, int64_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16_x4)))
-void svstnt1_vnum_bf16_x4(svcount_t, bfloat16_t *, int64_t, svbfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f16_x4)))
-void svstnt1_vnum_f16_x4(svcount_t, float16_t *, int64_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s16_x4)))
-void svstnt1_vnum_s16_x4(svcount_t, int16_t *, int64_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u32_x4)))
-void svstnt1_vnum_u32_x4(svcount_t, uint32_t *, int64_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f32_x4)))
-void svstnt1_vnum_f32_x4(svcount_t, float32_t *, int64_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s32_x4)))
-void svstnt1_vnum_s32_x4(svcount_t, int32_t *, int64_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_b)))
-svboolx2_t svundef2_b();
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_b)))
-svboolx4_t svundef4_b();
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c8_s64)))
-svcount_t svwhilege_c8_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c32_s64)))
-svcount_t svwhilege_c32_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c64_s64)))
-svcount_t svwhilege_c64_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c16_s64)))
-svcount_t svwhilege_c16_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c8_u64)))
-svcount_t svwhilege_c8_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c32_u64)))
-svcount_t svwhilege_c32_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c64_u64)))
-svcount_t svwhilege_c64_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c16_u64)))
-svcount_t svwhilege_c16_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_s64_x2)))
-svboolx2_t svwhilege_b8_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_s64_x2)))
-svboolx2_t svwhilege_b32_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_s64_x2)))
-svboolx2_t svwhilege_b64_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_s64_x2)))
-svboolx2_t svwhilege_b16_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_u64_x2)))
-svboolx2_t svwhilege_b8_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_u64_x2)))
-svboolx2_t svwhilege_b32_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_u64_x2)))
-svboolx2_t svwhilege_b64_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_u64_x2)))
-svboolx2_t svwhilege_b16_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c8_s64)))
-svcount_t svwhilegt_c8_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c32_s64)))
-svcount_t svwhilegt_c32_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c64_s64)))
-svcount_t svwhilegt_c64_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c16_s64)))
-svcount_t svwhilegt_c16_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c8_u64)))
-svcount_t svwhilegt_c8_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c32_u64)))
-svcount_t svwhilegt_c32_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c64_u64)))
-svcount_t svwhilegt_c64_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c16_u64)))
-svcount_t svwhilegt_c16_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_s64_x2)))
-svboolx2_t svwhilegt_b8_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_s64_x2)))
-svboolx2_t svwhilegt_b32_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_s64_x2)))
-svboolx2_t svwhilegt_b64_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_s64_x2)))
-svboolx2_t svwhilegt_b16_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_u64_x2)))
-svboolx2_t svwhilegt_b8_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_u64_x2)))
-svboolx2_t svwhilegt_b32_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_u64_x2)))
-svboolx2_t svwhilegt_b64_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_u64_x2)))
-svboolx2_t svwhilegt_b16_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c8_s64)))
-svcount_t svwhilele_c8_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c32_s64)))
-svcount_t svwhilele_c32_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c64_s64)))
-svcount_t svwhilele_c64_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c16_s64)))
-svcount_t svwhilele_c16_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c8_u64)))
-svcount_t svwhilele_c8_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c32_u64)))
-svcount_t svwhilele_c32_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c64_u64)))
-svcount_t svwhilele_c64_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c16_u64)))
-svcount_t svwhilele_c16_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s64_x2)))
-svboolx2_t svwhilele_b8_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s64_x2)))
-svboolx2_t svwhilele_b32_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_s64_x2)))
-svboolx2_t svwhilele_b64_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_s64_x2)))
-svboolx2_t svwhilele_b16_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_u64_x2)))
-svboolx2_t svwhilele_b8_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_u64_x2)))
-svboolx2_t svwhilele_b32_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_u64_x2)))
-svboolx2_t svwhilele_b64_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_u64_x2)))
-svboolx2_t svwhilele_b16_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c8_u64)))
-svcount_t svwhilelt_c8_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c32_u64)))
-svcount_t svwhilelt_c32_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c64_u64)))
-svcount_t svwhilelt_c64_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c16_u64)))
-svcount_t svwhilelt_c16_u64(uint64_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c8_s64)))
-svcount_t svwhilelt_c8_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c32_s64)))
-svcount_t svwhilelt_c32_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c64_s64)))
-svcount_t svwhilelt_c64_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c16_s64)))
-svcount_t svwhilelt_c16_s64(int64_t, int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_u64_x2)))
-svboolx2_t svwhilelt_b8_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_u64_x2)))
-svboolx2_t svwhilelt_b32_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_u64_x2)))
-svboolx2_t svwhilelt_b64_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_u64_x2)))
-svboolx2_t svwhilelt_b16_u64_x2(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_s64_x2)))
-svboolx2_t svwhilelt_b8_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_s64_x2)))
-svboolx2_t svwhilelt_b32_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_s64_x2)))
-svboolx2_t svwhilelt_b64_s64_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_s64_x2)))
-svboolx2_t svwhilelt_b16_s64_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslb_f32)))
-svfloat32_t svbfmlslb(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslb_lane_f32)))
-svfloat32_t svbfmlslb_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslt_f32)))
-svfloat32_t svbfmlslt(svfloat32_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbfmlslt_lane_f32)))
-svfloat32_t svbfmlslt_lane(svfloat32_t, svbfloat16_t, svbfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_f64)))
-svfloat64_t svclamp(svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_f32)))
-svfloat32_t svclamp(svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_f16)))
-svfloat16_t svclamp(svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_b)))
-svboolx2_t svcreate2(svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_b)))
-svboolx4_t svcreate4(svbool_t, svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_f32_f16)))
-svfloat32_t svdot(svfloat32_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_s32_s16)))
-svint32_t svdot(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_u32_u16)))
-svuint32_t svdot(svuint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_f32_f16)))
-svfloat32_t svdot_lane(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_s32_s16)))
-svint32_t svdot_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_u32_u16)))
-svuint32_t svdot_lane(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_b)))
-svbool_t svget2(svboolx2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_b)))
-svbool_t svget4(svboolx4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u8_x2)))
-svuint8x2_t svld1_x2(svcount_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s8_x2)))
-svint8x2_t svld1_x2(svcount_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u64_x2)))
-svuint64x2_t svld1_x2(svcount_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f64_x2)))
-svfloat64x2_t svld1_x2(svcount_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s64_x2)))
-svint64x2_t svld1_x2(svcount_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u16_x2)))
-svuint16x2_t svld1_x2(svcount_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16_x2)))
-svbfloat16x2_t svld1_x2(svcount_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f16_x2)))
-svfloat16x2_t svld1_x2(svcount_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s16_x2)))
-svint16x2_t svld1_x2(svcount_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u32_x2)))
-svuint32x2_t svld1_x2(svcount_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f32_x2)))
-svfloat32x2_t svld1_x2(svcount_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s32_x2)))
-svint32x2_t svld1_x2(svcount_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u8_x4)))
-svuint8x4_t svld1_x4(svcount_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s8_x4)))
-svint8x4_t svld1_x4(svcount_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u64_x4)))
-svuint64x4_t svld1_x4(svcount_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f64_x4)))
-svfloat64x4_t svld1_x4(svcount_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s64_x4)))
-svint64x4_t svld1_x4(svcount_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u16_x4)))
-svuint16x4_t svld1_x4(svcount_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_bf16_x4)))
-svbfloat16x4_t svld1_x4(svcount_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f16_x4)))
-svfloat16x4_t svld1_x4(svcount_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s16_x4)))
-svint16x4_t svld1_x4(svcount_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u32_x4)))
-svuint32x4_t svld1_x4(svcount_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f32_x4)))
-svfloat32x4_t svld1_x4(svcount_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s32_x4)))
-svint32x4_t svld1_x4(svcount_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u8_x2)))
-svuint8x2_t svld1_vnum_x2(svcount_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s8_x2)))
-svint8x2_t svld1_vnum_x2(svcount_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u64_x2)))
-svuint64x2_t svld1_vnum_x2(svcount_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f64_x2)))
-svfloat64x2_t svld1_vnum_x2(svcount_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s64_x2)))
-svint64x2_t svld1_vnum_x2(svcount_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u16_x2)))
-svuint16x2_t svld1_vnum_x2(svcount_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16_x2)))
-svbfloat16x2_t svld1_vnum_x2(svcount_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f16_x2)))
-svfloat16x2_t svld1_vnum_x2(svcount_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s16_x2)))
-svint16x2_t svld1_vnum_x2(svcount_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u32_x2)))
-svuint32x2_t svld1_vnum_x2(svcount_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f32_x2)))
-svfloat32x2_t svld1_vnum_x2(svcount_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s32_x2)))
-svint32x2_t svld1_vnum_x2(svcount_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u8_x4)))
-svuint8x4_t svld1_vnum_x4(svcount_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s8_x4)))
-svint8x4_t svld1_vnum_x4(svcount_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u64_x4)))
-svuint64x4_t svld1_vnum_x4(svcount_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f64_x4)))
-svfloat64x4_t svld1_vnum_x4(svcount_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s64_x4)))
-svint64x4_t svld1_vnum_x4(svcount_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u16_x4)))
-svuint16x4_t svld1_vnum_x4(svcount_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_bf16_x4)))
-svbfloat16x4_t svld1_vnum_x4(svcount_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f16_x4)))
-svfloat16x4_t svld1_vnum_x4(svcount_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s16_x4)))
-svint16x4_t svld1_vnum_x4(svcount_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u32_x4)))
-svuint32x4_t svld1_vnum_x4(svcount_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f32_x4)))
-svfloat32x4_t svld1_vnum_x4(svcount_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s32_x4)))
-svint32x4_t svld1_vnum_x4(svcount_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u8_x2)))
-svuint8x2_t svldnt1_x2(svcount_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s8_x2)))
-svint8x2_t svldnt1_x2(svcount_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u64_x2)))
-svuint64x2_t svldnt1_x2(svcount_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f64_x2)))
-svfloat64x2_t svldnt1_x2(svcount_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s64_x2)))
-svint64x2_t svldnt1_x2(svcount_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u16_x2)))
-svuint16x2_t svldnt1_x2(svcount_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16_x2)))
-svbfloat16x2_t svldnt1_x2(svcount_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f16_x2)))
-svfloat16x2_t svldnt1_x2(svcount_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s16_x2)))
-svint16x2_t svldnt1_x2(svcount_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u32_x2)))
-svuint32x2_t svldnt1_x2(svcount_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f32_x2)))
-svfloat32x2_t svldnt1_x2(svcount_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s32_x2)))
-svint32x2_t svldnt1_x2(svcount_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u8_x4)))
-svuint8x4_t svldnt1_x4(svcount_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s8_x4)))
-svint8x4_t svldnt1_x4(svcount_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u64_x4)))
-svuint64x4_t svldnt1_x4(svcount_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f64_x4)))
-svfloat64x4_t svldnt1_x4(svcount_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s64_x4)))
-svint64x4_t svldnt1_x4(svcount_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u16_x4)))
-svuint16x4_t svldnt1_x4(svcount_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_bf16_x4)))
-svbfloat16x4_t svldnt1_x4(svcount_t, bfloat16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f16_x4)))
-svfloat16x4_t svldnt1_x4(svcount_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s16_x4)))
-svint16x4_t svldnt1_x4(svcount_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u32_x4)))
-svuint32x4_t svldnt1_x4(svcount_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f32_x4)))
-svfloat32x4_t svldnt1_x4(svcount_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s32_x4)))
-svint32x4_t svldnt1_x4(svcount_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u8_x2)))
-svuint8x2_t svldnt1_vnum_x2(svcount_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s8_x2)))
-svint8x2_t svldnt1_vnum_x2(svcount_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u64_x2)))
-svuint64x2_t svldnt1_vnum_x2(svcount_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f64_x2)))
-svfloat64x2_t svldnt1_vnum_x2(svcount_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s64_x2)))
-svint64x2_t svldnt1_vnum_x2(svcount_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u16_x2)))
-svuint16x2_t svldnt1_vnum_x2(svcount_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16_x2)))
-svbfloat16x2_t svldnt1_vnum_x2(svcount_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f16_x2)))
-svfloat16x2_t svldnt1_vnum_x2(svcount_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s16_x2)))
-svint16x2_t svldnt1_vnum_x2(svcount_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u32_x2)))
-svuint32x2_t svldnt1_vnum_x2(svcount_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f32_x2)))
-svfloat32x2_t svldnt1_vnum_x2(svcount_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s32_x2)))
-svint32x2_t svldnt1_vnum_x2(svcount_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u8_x4)))
-svuint8x4_t svldnt1_vnum_x4(svcount_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s8_x4)))
-svint8x4_t svldnt1_vnum_x4(svcount_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u64_x4)))
-svuint64x4_t svldnt1_vnum_x4(svcount_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f64_x4)))
-svfloat64x4_t svldnt1_vnum_x4(svcount_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s64_x4)))
-svint64x4_t svldnt1_vnum_x4(svcount_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u16_x4)))
-svuint16x4_t svldnt1_vnum_x4(svcount_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_bf16_x4)))
-svbfloat16x4_t svldnt1_vnum_x4(svcount_t, bfloat16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f16_x4)))
-svfloat16x4_t svldnt1_vnum_x4(svcount_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s16_x4)))
-svint16x4_t svldnt1_vnum_x4(svcount_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u32_x4)))
-svuint32x4_t svldnt1_vnum_x4(svcount_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f32_x4)))
-svfloat32x4_t svldnt1_vnum_x4(svcount_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s32_x4)))
-svint32x4_t svldnt1_vnum_x4(svcount_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_s16_s32_x2)))
-svint16_t svqcvtn_s16(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_s32_x2)))
-svuint16_t svqcvtn_u16(svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcvtn_u16_u32_x2)))
-svuint16_t svqcvtn_u16(svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_s16_s32_x2)))
-svint16_t svqrshrn_s16(svint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrn_n_u16_u32_x2)))
-svuint16_t svqrshrn_u16(svuint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrun_n_u16_s32_x2)))
-svuint16_t svqrshrun_u16(svint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svreinterpret_b)))
-svbool_t svreinterpret(svcount_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svreinterpret_c)))
-svcount_t svreinterpret(svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_b)))
-svboolx2_t svset2(svboolx2_t, uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_b)))
-svboolx4_t svset4(svboolx4_t, uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u8_x2)))
-void svst1(svcount_t, uint8_t *, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s8_x2)))
-void svst1(svcount_t, int8_t *, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u64_x2)))
-void svst1(svcount_t, uint64_t *, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f64_x2)))
-void svst1(svcount_t, float64_t *, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s64_x2)))
-void svst1(svcount_t, int64_t *, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u16_x2)))
-void svst1(svcount_t, uint16_t *, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16_x2)))
-void svst1(svcount_t, bfloat16_t *, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f16_x2)))
-void svst1(svcount_t, float16_t *, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s16_x2)))
-void svst1(svcount_t, int16_t *, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u32_x2)))
-void svst1(svcount_t, uint32_t *, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f32_x2)))
-void svst1(svcount_t, float32_t *, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s32_x2)))
-void svst1(svcount_t, int32_t *, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u8_x4)))
-void svst1(svcount_t, uint8_t *, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s8_x4)))
-void svst1(svcount_t, int8_t *, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u64_x4)))
-void svst1(svcount_t, uint64_t *, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f64_x4)))
-void svst1(svcount_t, float64_t *, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s64_x4)))
-void svst1(svcount_t, int64_t *, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u16_x4)))
-void svst1(svcount_t, uint16_t *, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_bf16_x4)))
-void svst1(svcount_t, bfloat16_t *, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f16_x4)))
-void svst1(svcount_t, float16_t *, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s16_x4)))
-void svst1(svcount_t, int16_t *, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u32_x4)))
-void svst1(svcount_t, uint32_t *, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f32_x4)))
-void svst1(svcount_t, float32_t *, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s32_x4)))
-void svst1(svcount_t, int32_t *, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u8_x2)))
-void svst1_vnum(svcount_t, uint8_t *, int64_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s8_x2)))
-void svst1_vnum(svcount_t, int8_t *, int64_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u64_x2)))
-void svst1_vnum(svcount_t, uint64_t *, int64_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f64_x2)))
-void svst1_vnum(svcount_t, float64_t *, int64_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s64_x2)))
-void svst1_vnum(svcount_t, int64_t *, int64_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u16_x2)))
-void svst1_vnum(svcount_t, uint16_t *, int64_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16_x2)))
-void svst1_vnum(svcount_t, bfloat16_t *, int64_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f16_x2)))
-void svst1_vnum(svcount_t, float16_t *, int64_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s16_x2)))
-void svst1_vnum(svcount_t, int16_t *, int64_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u32_x2)))
-void svst1_vnum(svcount_t, uint32_t *, int64_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f32_x2)))
-void svst1_vnum(svcount_t, float32_t *, int64_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s32_x2)))
-void svst1_vnum(svcount_t, int32_t *, int64_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u8_x4)))
-void svst1_vnum(svcount_t, uint8_t *, int64_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s8_x4)))
-void svst1_vnum(svcount_t, int8_t *, int64_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u64_x4)))
-void svst1_vnum(svcount_t, uint64_t *, int64_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f64_x4)))
-void svst1_vnum(svcount_t, float64_t *, int64_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s64_x4)))
-void svst1_vnum(svcount_t, int64_t *, int64_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u16_x4)))
-void svst1_vnum(svcount_t, uint16_t *, int64_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_bf16_x4)))
-void svst1_vnum(svcount_t, bfloat16_t *, int64_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f16_x4)))
-void svst1_vnum(svcount_t, float16_t *, int64_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s16_x4)))
-void svst1_vnum(svcount_t, int16_t *, int64_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u32_x4)))
-void svst1_vnum(svcount_t, uint32_t *, int64_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f32_x4)))
-void svst1_vnum(svcount_t, float32_t *, int64_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s32_x4)))
-void svst1_vnum(svcount_t, int32_t *, int64_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u8_x2)))
-void svstnt1(svcount_t, uint8_t *, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s8_x2)))
-void svstnt1(svcount_t, int8_t *, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u64_x2)))
-void svstnt1(svcount_t, uint64_t *, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f64_x2)))
-void svstnt1(svcount_t, float64_t *, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s64_x2)))
-void svstnt1(svcount_t, int64_t *, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u16_x2)))
-void svstnt1(svcount_t, uint16_t *, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16_x2)))
-void svstnt1(svcount_t, bfloat16_t *, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f16_x2)))
-void svstnt1(svcount_t, float16_t *, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s16_x2)))
-void svstnt1(svcount_t, int16_t *, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u32_x2)))
-void svstnt1(svcount_t, uint32_t *, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f32_x2)))
-void svstnt1(svcount_t, float32_t *, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s32_x2)))
-void svstnt1(svcount_t, int32_t *, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u8_x4)))
-void svstnt1(svcount_t, uint8_t *, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s8_x4)))
-void svstnt1(svcount_t, int8_t *, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u64_x4)))
-void svstnt1(svcount_t, uint64_t *, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f64_x4)))
-void svstnt1(svcount_t, float64_t *, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s64_x4)))
-void svstnt1(svcount_t, int64_t *, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u16_x4)))
-void svstnt1(svcount_t, uint16_t *, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_bf16_x4)))
-void svstnt1(svcount_t, bfloat16_t *, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f16_x4)))
-void svstnt1(svcount_t, float16_t *, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s16_x4)))
-void svstnt1(svcount_t, int16_t *, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u32_x4)))
-void svstnt1(svcount_t, uint32_t *, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f32_x4)))
-void svstnt1(svcount_t, float32_t *, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s32_x4)))
-void svstnt1(svcount_t, int32_t *, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u8_x2)))
-void svstnt1_vnum(svcount_t, uint8_t *, int64_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s8_x2)))
-void svstnt1_vnum(svcount_t, int8_t *, int64_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u64_x2)))
-void svstnt1_vnum(svcount_t, uint64_t *, int64_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f64_x2)))
-void svstnt1_vnum(svcount_t, float64_t *, int64_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s64_x2)))
-void svstnt1_vnum(svcount_t, int64_t *, int64_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u16_x2)))
-void svstnt1_vnum(svcount_t, uint16_t *, int64_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16_x2)))
-void svstnt1_vnum(svcount_t, bfloat16_t *, int64_t, svbfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f16_x2)))
-void svstnt1_vnum(svcount_t, float16_t *, int64_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s16_x2)))
-void svstnt1_vnum(svcount_t, int16_t *, int64_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u32_x2)))
-void svstnt1_vnum(svcount_t, uint32_t *, int64_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f32_x2)))
-void svstnt1_vnum(svcount_t, float32_t *, int64_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s32_x2)))
-void svstnt1_vnum(svcount_t, int32_t *, int64_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u8_x4)))
-void svstnt1_vnum(svcount_t, uint8_t *, int64_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s8_x4)))
-void svstnt1_vnum(svcount_t, int8_t *, int64_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u64_x4)))
-void svstnt1_vnum(svcount_t, uint64_t *, int64_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f64_x4)))
-void svstnt1_vnum(svcount_t, float64_t *, int64_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s64_x4)))
-void svstnt1_vnum(svcount_t, int64_t *, int64_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u16_x4)))
-void svstnt1_vnum(svcount_t, uint16_t *, int64_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_bf16_x4)))
-void svstnt1_vnum(svcount_t, bfloat16_t *, int64_t, svbfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f16_x4)))
-void svstnt1_vnum(svcount_t, float16_t *, int64_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s16_x4)))
-void svstnt1_vnum(svcount_t, int16_t *, int64_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u32_x4)))
-void svstnt1_vnum(svcount_t, uint32_t *, int64_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f32_x4)))
-void svstnt1_vnum(svcount_t, float32_t *, int64_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s32_x4)))
-void svstnt1_vnum(svcount_t, int32_t *, int64_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c8_s64)))
-svcount_t svwhilege_c8(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c32_s64)))
-svcount_t svwhilege_c32(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c64_s64)))
-svcount_t svwhilege_c64(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c16_s64)))
-svcount_t svwhilege_c16(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c8_u64)))
-svcount_t svwhilege_c8(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c32_u64)))
-svcount_t svwhilege_c32(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c64_u64)))
-svcount_t svwhilege_c64(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_c16_u64)))
-svcount_t svwhilege_c16(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_s64_x2)))
-svboolx2_t svwhilege_b8_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_s64_x2)))
-svboolx2_t svwhilege_b32_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_s64_x2)))
-svboolx2_t svwhilege_b64_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_s64_x2)))
-svboolx2_t svwhilege_b16_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_u64_x2)))
-svboolx2_t svwhilege_b8_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_u64_x2)))
-svboolx2_t svwhilege_b32_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_u64_x2)))
-svboolx2_t svwhilege_b64_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_u64_x2)))
-svboolx2_t svwhilege_b16_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c8_s64)))
-svcount_t svwhilegt_c8(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c32_s64)))
-svcount_t svwhilegt_c32(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c64_s64)))
-svcount_t svwhilegt_c64(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c16_s64)))
-svcount_t svwhilegt_c16(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c8_u64)))
-svcount_t svwhilegt_c8(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c32_u64)))
-svcount_t svwhilegt_c32(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c64_u64)))
-svcount_t svwhilegt_c64(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_c16_u64)))
-svcount_t svwhilegt_c16(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_s64_x2)))
-svboolx2_t svwhilegt_b8_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_s64_x2)))
-svboolx2_t svwhilegt_b32_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_s64_x2)))
-svboolx2_t svwhilegt_b64_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_s64_x2)))
-svboolx2_t svwhilegt_b16_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_u64_x2)))
-svboolx2_t svwhilegt_b8_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_u64_x2)))
-svboolx2_t svwhilegt_b32_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_u64_x2)))
-svboolx2_t svwhilegt_b64_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_u64_x2)))
-svboolx2_t svwhilegt_b16_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c8_s64)))
-svcount_t svwhilele_c8(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c32_s64)))
-svcount_t svwhilele_c32(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c64_s64)))
-svcount_t svwhilele_c64(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c16_s64)))
-svcount_t svwhilele_c16(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c8_u64)))
-svcount_t svwhilele_c8(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c32_u64)))
-svcount_t svwhilele_c32(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c64_u64)))
-svcount_t svwhilele_c64(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_c16_u64)))
-svcount_t svwhilele_c16(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s64_x2)))
-svboolx2_t svwhilele_b8_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s64_x2)))
-svboolx2_t svwhilele_b32_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_s64_x2)))
-svboolx2_t svwhilele_b64_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_s64_x2)))
-svboolx2_t svwhilele_b16_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_u64_x2)))
-svboolx2_t svwhilele_b8_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_u64_x2)))
-svboolx2_t svwhilele_b32_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_u64_x2)))
-svboolx2_t svwhilele_b64_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_u64_x2)))
-svboolx2_t svwhilele_b16_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c8_u64)))
-svcount_t svwhilelt_c8(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c32_u64)))
-svcount_t svwhilelt_c32(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c64_u64)))
-svcount_t svwhilelt_c64(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c16_u64)))
-svcount_t svwhilelt_c16(uint64_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c8_s64)))
-svcount_t svwhilelt_c8(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c32_s64)))
-svcount_t svwhilelt_c32(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c64_s64)))
-svcount_t svwhilelt_c64(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_c16_s64)))
-svcount_t svwhilelt_c16(int64_t, int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_u64_x2)))
-svboolx2_t svwhilelt_b8_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_u64_x2)))
-svboolx2_t svwhilelt_b32_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_u64_x2)))
-svboolx2_t svwhilelt_b64_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_u64_x2)))
-svboolx2_t svwhilelt_b16_x2(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_s64_x2)))
-svboolx2_t svwhilelt_b8_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_s64_x2)))
-svboolx2_t svwhilelt_b32_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_s64_x2)))
-svboolx2_t svwhilelt_b64_x2(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_s64_x2)))
-svboolx2_t svwhilelt_b16_x2(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u8)))
-svuint8_t svdup_laneq_u8(svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s8)))
-svint8_t svdup_laneq_s8(svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u64)))
-svuint64_t svdup_laneq_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f64)))
-svfloat64_t svdup_laneq_f64(svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s64)))
-svint64_t svdup_laneq_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u16)))
-svuint16_t svdup_laneq_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f16)))
-svfloat16_t svdup_laneq_f16(svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s16)))
-svint16_t svdup_laneq_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u32)))
-svuint32_t svdup_laneq_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f32)))
-svfloat32_t svdup_laneq_f32(svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s32)))
-svint32_t svdup_laneq_s32(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u8)))
-svuint8_t svdup_laneq(svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s8)))
-svint8_t svdup_laneq(svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u64)))
-svuint64_t svdup_laneq(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f64)))
-svfloat64_t svdup_laneq(svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s64)))
-svint64_t svdup_laneq(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u16)))
-svuint16_t svdup_laneq(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f16)))
-svfloat16_t svdup_laneq(svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s16)))
-svint16_t svdup_laneq(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u32)))
-svuint32_t svdup_laneq(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f32)))
-svfloat32_t svdup_laneq(svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s32)))
-svint32_t svdup_laneq(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s8)))
-svint8_t svaba_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s32)))
-svint32_t svaba_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s64)))
-svint64_t svaba_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s16)))
-svint16_t svaba_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u8)))
-svuint8_t svaba_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u32)))
-svuint32_t svaba_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u64)))
-svuint64_t svaba_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u16)))
-svuint16_t svaba_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s8)))
-svint8_t svaba_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s32)))
-svint32_t svaba_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s64)))
-svint64_t svaba_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s16)))
-svint16_t svaba_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u8)))
-svuint8_t svaba_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u32)))
-svuint32_t svaba_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u64)))
-svuint64_t svaba_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u16)))
-svuint16_t svaba_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_s32)))
-svint32_t svabalb_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_s64)))
-svint64_t svabalb_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_s16)))
-svint16_t svabalb_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_u32)))
-svuint32_t svabalb_n_u32(svuint32_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_u64)))
-svuint64_t svabalb_n_u64(svuint64_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_u16)))
-svuint16_t svabalb_n_u16(svuint16_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_s32)))
-svint32_t svabalb_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_s64)))
-svint64_t svabalb_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_s16)))
-svint16_t svabalb_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_u32)))
-svuint32_t svabalb_u32(svuint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_u64)))
-svuint64_t svabalb_u64(svuint64_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_u16)))
-svuint16_t svabalb_u16(svuint16_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_s32)))
-svint32_t svabalt_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_s64)))
-svint64_t svabalt_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_s16)))
-svint16_t svabalt_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_u32)))
-svuint32_t svabalt_n_u32(svuint32_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_u64)))
-svuint64_t svabalt_n_u64(svuint64_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_u16)))
-svuint16_t svabalt_n_u16(svuint16_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_s32)))
-svint32_t svabalt_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_s64)))
-svint64_t svabalt_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_s16)))
-svint16_t svabalt_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_u32)))
-svuint32_t svabalt_u32(svuint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_u64)))
-svuint64_t svabalt_u64(svuint64_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_u16)))
-svuint16_t svabalt_u16(svuint16_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_s32)))
-svint32_t svabdlb_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_s64)))
-svint64_t svabdlb_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_s16)))
-svint16_t svabdlb_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_u32)))
-svuint32_t svabdlb_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_u64)))
-svuint64_t svabdlb_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_u16)))
-svuint16_t svabdlb_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_s32)))
-svint32_t svabdlb_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_s64)))
-svint64_t svabdlb_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_s16)))
-svint16_t svabdlb_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_u32)))
-svuint32_t svabdlb_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_u64)))
-svuint64_t svabdlb_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_u16)))
-svuint16_t svabdlb_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_s32)))
-svint32_t svabdlt_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_s64)))
-svint64_t svabdlt_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_s16)))
-svint16_t svabdlt_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_u32)))
-svuint32_t svabdlt_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_u64)))
-svuint64_t svabdlt_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_u16)))
-svuint16_t svabdlt_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_s32)))
-svint32_t svabdlt_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_s64)))
-svint64_t svabdlt_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_s16)))
-svint16_t svabdlt_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_u32)))
-svuint32_t svabdlt_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_u64)))
-svuint64_t svabdlt_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_u16)))
-svuint16_t svabdlt_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s32_m)))
-svint32_t svadalp_s32_m(svbool_t, svint32_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s64_m)))
-svint64_t svadalp_s64_m(svbool_t, svint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s16_m)))
-svint16_t svadalp_s16_m(svbool_t, svint16_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s32_x)))
-svint32_t svadalp_s32_x(svbool_t, svint32_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s64_x)))
-svint64_t svadalp_s64_x(svbool_t, svint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s16_x)))
-svint16_t svadalp_s16_x(svbool_t, svint16_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s32_z)))
-svint32_t svadalp_s32_z(svbool_t, svint32_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s64_z)))
-svint64_t svadalp_s64_z(svbool_t, svint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s16_z)))
-svint16_t svadalp_s16_z(svbool_t, svint16_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u32_m)))
-svuint32_t svadalp_u32_m(svbool_t, svuint32_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u64_m)))
-svuint64_t svadalp_u64_m(svbool_t, svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u16_m)))
-svuint16_t svadalp_u16_m(svbool_t, svuint16_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u32_x)))
-svuint32_t svadalp_u32_x(svbool_t, svuint32_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u64_x)))
-svuint64_t svadalp_u64_x(svbool_t, svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u16_x)))
-svuint16_t svadalp_u16_x(svbool_t, svuint16_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u32_z)))
-svuint32_t svadalp_u32_z(svbool_t, svuint32_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u64_z)))
-svuint64_t svadalp_u64_z(svbool_t, svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u16_z)))
-svuint16_t svadalp_u16_z(svbool_t, svuint16_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_n_u32)))
-svuint32_t svadclb_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_n_u64)))
-svuint64_t svadclb_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_u32)))
-svuint32_t svadclb_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_u64)))
-svuint64_t svadclb_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_n_u32)))
-svuint32_t svadclt_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_n_u64)))
-svuint64_t svadclt_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_u32)))
-svuint32_t svadclt_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_u64)))
-svuint64_t svadclt_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_u32)))
-svuint16_t svaddhnb_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_u64)))
-svuint32_t svaddhnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_u16)))
-svuint8_t svaddhnb_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_s32)))
-svint16_t svaddhnb_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_s64)))
-svint32_t svaddhnb_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_s16)))
-svint8_t svaddhnb_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_u32)))
-svuint16_t svaddhnb_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_u64)))
-svuint32_t svaddhnb_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_u16)))
-svuint8_t svaddhnb_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_s32)))
-svint16_t svaddhnb_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_s64)))
-svint32_t svaddhnb_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_s16)))
-svint8_t svaddhnb_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_u32)))
-svuint16_t svaddhnt_n_u32(svuint16_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_u64)))
-svuint32_t svaddhnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_u16)))
-svuint8_t svaddhnt_n_u16(svuint8_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_s32)))
-svint16_t svaddhnt_n_s32(svint16_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_s64)))
-svint32_t svaddhnt_n_s64(svint32_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_s16)))
-svint8_t svaddhnt_n_s16(svint8_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_u32)))
-svuint16_t svaddhnt_u32(svuint16_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_u64)))
-svuint32_t svaddhnt_u64(svuint32_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_u16)))
-svuint8_t svaddhnt_u16(svuint8_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_s32)))
-svint16_t svaddhnt_s32(svint16_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_s64)))
-svint32_t svaddhnt_s64(svint32_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_s16)))
-svint8_t svaddhnt_s16(svint8_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_s32)))
-svint32_t svaddlb_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_s64)))
-svint64_t svaddlb_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_s16)))
-svint16_t svaddlb_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_u32)))
-svuint32_t svaddlb_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_u64)))
-svuint64_t svaddlb_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_u16)))
-svuint16_t svaddlb_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_s32)))
-svint32_t svaddlb_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_s64)))
-svint64_t svaddlb_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_s16)))
-svint16_t svaddlb_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_u32)))
-svuint32_t svaddlb_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_u64)))
-svuint64_t svaddlb_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_u16)))
-svuint16_t svaddlb_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_n_s32)))
-svint32_t svaddlbt_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_n_s64)))
-svint64_t svaddlbt_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_n_s16)))
-svint16_t svaddlbt_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_s32)))
-svint32_t svaddlbt_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_s64)))
-svint64_t svaddlbt_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_s16)))
-svint16_t svaddlbt_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_s32)))
-svint32_t svaddlt_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_s64)))
-svint64_t svaddlt_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_s16)))
-svint16_t svaddlt_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_u32)))
-svuint32_t svaddlt_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_u64)))
-svuint64_t svaddlt_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_u16)))
-svuint16_t svaddlt_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_s32)))
-svint32_t svaddlt_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_s64)))
-svint64_t svaddlt_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_s16)))
-svint16_t svaddlt_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_u32)))
-svuint32_t svaddlt_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_u64)))
-svuint64_t svaddlt_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_u16)))
-svuint16_t svaddlt_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f64_m)))
-svfloat64_t svaddp_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f32_m)))
-svfloat32_t svaddp_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f16_m)))
-svfloat16_t svaddp_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f64_x)))
-svfloat64_t svaddp_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f32_x)))
-svfloat32_t svaddp_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f16_x)))
-svfloat16_t svaddp_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u8_m)))
-svuint8_t svaddp_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u32_m)))
-svuint32_t svaddp_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u64_m)))
-svuint64_t svaddp_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u16_m)))
-svuint16_t svaddp_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s8_m)))
-svint8_t svaddp_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s32_m)))
-svint32_t svaddp_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s64_m)))
-svint64_t svaddp_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s16_m)))
-svint16_t svaddp_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u8_x)))
-svuint8_t svaddp_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u32_x)))
-svuint32_t svaddp_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u64_x)))
-svuint64_t svaddp_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u16_x)))
-svuint16_t svaddp_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s8_x)))
-svint8_t svaddp_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s32_x)))
-svint32_t svaddp_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s64_x)))
-svint64_t svaddp_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s16_x)))
-svint16_t svaddp_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_s32)))
-svint32_t svaddwb_n_s32(svint32_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_s64)))
-svint64_t svaddwb_n_s64(svint64_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_s16)))
-svint16_t svaddwb_n_s16(svint16_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_u32)))
-svuint32_t svaddwb_n_u32(svuint32_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_u64)))
-svuint64_t svaddwb_n_u64(svuint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_u16)))
-svuint16_t svaddwb_n_u16(svuint16_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_s32)))
-svint32_t svaddwb_s32(svint32_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_s64)))
-svint64_t svaddwb_s64(svint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_s16)))
-svint16_t svaddwb_s16(svint16_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_u32)))
-svuint32_t svaddwb_u32(svuint32_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_u64)))
-svuint64_t svaddwb_u64(svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_u16)))
-svuint16_t svaddwb_u16(svuint16_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_s32)))
-svint32_t svaddwt_n_s32(svint32_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_s64)))
-svint64_t svaddwt_n_s64(svint64_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_s16)))
-svint16_t svaddwt_n_s16(svint16_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_u32)))
-svuint32_t svaddwt_n_u32(svuint32_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_u64)))
-svuint64_t svaddwt_n_u64(svuint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_u16)))
-svuint16_t svaddwt_n_u16(svuint16_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_s32)))
-svint32_t svaddwt_s32(svint32_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_s64)))
-svint64_t svaddwt_s64(svint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_s16)))
-svint16_t svaddwt_s16(svint16_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_u32)))
-svuint32_t svaddwt_u32(svuint32_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_u64)))
-svuint64_t svaddwt_u64(svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_u16)))
-svuint16_t svaddwt_u16(svuint16_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u8)))
-svuint8_t svbcax_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u32)))
-svuint32_t svbcax_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u64)))
-svuint64_t svbcax_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u16)))
-svuint16_t svbcax_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s8)))
-svint8_t svbcax_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s32)))
-svint32_t svbcax_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s64)))
-svint64_t svbcax_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s16)))
-svint16_t svbcax_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u8)))
-svuint8_t svbcax_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u32)))
-svuint32_t svbcax_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u64)))
-svuint64_t svbcax_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u16)))
-svuint16_t svbcax_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s8)))
-svint8_t svbcax_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s32)))
-svint32_t svbcax_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s64)))
-svint64_t svbcax_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s16)))
-svint16_t svbcax_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u8)))
-svuint8_t svbsl1n_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u32)))
-svuint32_t svbsl1n_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u64)))
-svuint64_t svbsl1n_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u16)))
-svuint16_t svbsl1n_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s8)))
-svint8_t svbsl1n_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s32)))
-svint32_t svbsl1n_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s64)))
-svint64_t svbsl1n_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s16)))
-svint16_t svbsl1n_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u8)))
-svuint8_t svbsl1n_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u32)))
-svuint32_t svbsl1n_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u64)))
-svuint64_t svbsl1n_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u16)))
-svuint16_t svbsl1n_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s8)))
-svint8_t svbsl1n_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s32)))
-svint32_t svbsl1n_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s64)))
-svint64_t svbsl1n_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s16)))
-svint16_t svbsl1n_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u8)))
-svuint8_t svbsl2n_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u32)))
-svuint32_t svbsl2n_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u64)))
-svuint64_t svbsl2n_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u16)))
-svuint16_t svbsl2n_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s8)))
-svint8_t svbsl2n_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s32)))
-svint32_t svbsl2n_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s64)))
-svint64_t svbsl2n_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s16)))
-svint16_t svbsl2n_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u8)))
-svuint8_t svbsl2n_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u32)))
-svuint32_t svbsl2n_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u64)))
-svuint64_t svbsl2n_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u16)))
-svuint16_t svbsl2n_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s8)))
-svint8_t svbsl2n_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s32)))
-svint32_t svbsl2n_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s64)))
-svint64_t svbsl2n_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s16)))
-svint16_t svbsl2n_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u8)))
-svuint8_t svbsl_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u32)))
-svuint32_t svbsl_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u64)))
-svuint64_t svbsl_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u16)))
-svuint16_t svbsl_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s8)))
-svint8_t svbsl_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s32)))
-svint32_t svbsl_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s64)))
-svint64_t svbsl_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s16)))
-svint16_t svbsl_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u8)))
-svuint8_t svbsl_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u32)))
-svuint32_t svbsl_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u64)))
-svuint64_t svbsl_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u16)))
-svuint16_t svbsl_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s8)))
-svint8_t svbsl_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s32)))
-svint32_t svbsl_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s64)))
-svint64_t svbsl_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s16)))
-svint16_t svbsl_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u8)))
-svuint8_t svcadd_u8(svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u32)))
-svuint32_t svcadd_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u64)))
-svuint64_t svcadd_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u16)))
-svuint16_t svcadd_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s8)))
-svint8_t svcadd_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s32)))
-svint32_t svcadd_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s64)))
-svint64_t svcadd_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s16)))
-svint16_t svcadd_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_s32)))
-svint32_t svcdot_s32(svint32_t, svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_s64)))
-svint64_t svcdot_s64(svint64_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_lane_s32)))
-svint32_t svcdot_lane_s32(svint32_t, svint8_t, svint8_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_lane_s64)))
-svint64_t svcdot_lane_s64(svint64_t, svint16_t, svint16_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u8)))
-svuint8_t svcmla_u8(svuint8_t, svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u32)))
-svuint32_t svcmla_u32(svuint32_t, svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u64)))
-svuint64_t svcmla_u64(svuint64_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u16)))
-svuint16_t svcmla_u16(svuint16_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s8)))
-svint8_t svcmla_s8(svint8_t, svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s32)))
-svint32_t svcmla_s32(svint32_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s64)))
-svint64_t svcmla_s64(svint64_t, svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s16)))
-svint16_t svcmla_s16(svint16_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_u32)))
-svuint32_t svcmla_lane_u32(svuint32_t, svuint32_t, svuint32_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_u16)))
-svuint16_t svcmla_lane_u16(svuint16_t, svuint16_t, svuint16_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_s32)))
-svint32_t svcmla_lane_s32(svint32_t, svint32_t, svint32_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_s16)))
-svint16_t svcmla_lane_s16(svint16_t, svint16_t, svint16_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f32_f16_m)))
-svfloat32_t svcvtlt_f32_f16_m(svfloat32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f32_f16_x)))
-svfloat32_t svcvtlt_f32_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f64_f32_m)))
-svfloat64_t svcvtlt_f64_f32_m(svfloat64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f64_f32_x)))
-svfloat64_t svcvtlt_f64_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_f16_f32_m)))
-svfloat16_t svcvtnt_f16_f32_m(svfloat16_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_f32_f64_m)))
-svfloat32_t svcvtnt_f32_f64_m(svfloat32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtx_f32_f64_m)))
-svfloat32_t svcvtx_f32_f64_m(svfloat32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtx_f32_f64_x)))
-svfloat32_t svcvtx_f32_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtx_f32_f64_z)))
-svfloat32_t svcvtx_f32_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtxnt_f32_f64_m)))
-svfloat32_t svcvtxnt_f32_f64_m(svfloat32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u8)))
-svuint8_t sveor3_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u32)))
-svuint32_t sveor3_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u64)))
-svuint64_t sveor3_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u16)))
-svuint16_t sveor3_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s8)))
-svint8_t sveor3_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s32)))
-svint32_t sveor3_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s64)))
-svint64_t sveor3_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s16)))
-svint16_t sveor3_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u8)))
-svuint8_t sveor3_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u32)))
-svuint32_t sveor3_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u64)))
-svuint64_t sveor3_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u16)))
-svuint16_t sveor3_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s8)))
-svint8_t sveor3_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s32)))
-svint32_t sveor3_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s64)))
-svint64_t sveor3_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s16)))
-svint16_t sveor3_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u8)))
-svuint8_t sveorbt_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u32)))
-svuint32_t sveorbt_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u64)))
-svuint64_t sveorbt_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u16)))
-svuint16_t sveorbt_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s8)))
-svint8_t sveorbt_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s32)))
-svint32_t sveorbt_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s64)))
-svint64_t sveorbt_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s16)))
-svint16_t sveorbt_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u8)))
-svuint8_t sveorbt_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u32)))
-svuint32_t sveorbt_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u64)))
-svuint64_t sveorbt_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u16)))
-svuint16_t sveorbt_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s8)))
-svint8_t sveorbt_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s32)))
-svint32_t sveorbt_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s64)))
-svint64_t sveorbt_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s16)))
-svint16_t sveorbt_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u8)))
-svuint8_t sveortb_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u32)))
-svuint32_t sveortb_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u64)))
-svuint64_t sveortb_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u16)))
-svuint16_t sveortb_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s8)))
-svint8_t sveortb_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s32)))
-svint32_t sveortb_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s64)))
-svint64_t sveortb_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s16)))
-svint16_t sveortb_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u8)))
-svuint8_t sveortb_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u32)))
-svuint32_t sveortb_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u64)))
-svuint64_t sveortb_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u16)))
-svuint16_t sveortb_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s8)))
-svint8_t sveortb_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s32)))
-svint32_t sveortb_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s64)))
-svint64_t sveortb_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s16)))
-svint16_t sveortb_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s8_m)))
-svint8_t svhadd_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s32_m)))
-svint32_t svhadd_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s64_m)))
-svint64_t svhadd_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s16_m)))
-svint16_t svhadd_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s8_x)))
-svint8_t svhadd_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s32_x)))
-svint32_t svhadd_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s64_x)))
-svint64_t svhadd_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s16_x)))
-svint16_t svhadd_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s8_z)))
-svint8_t svhadd_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s32_z)))
-svint32_t svhadd_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s64_z)))
-svint64_t svhadd_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s16_z)))
-svint16_t svhadd_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u8_m)))
-svuint8_t svhadd_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u32_m)))
-svuint32_t svhadd_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u64_m)))
-svuint64_t svhadd_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u16_m)))
-svuint16_t svhadd_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u8_x)))
-svuint8_t svhadd_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u32_x)))
-svuint32_t svhadd_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u64_x)))
-svuint64_t svhadd_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u16_x)))
-svuint16_t svhadd_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u8_z)))
-svuint8_t svhadd_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u32_z)))
-svuint32_t svhadd_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u64_z)))
-svuint64_t svhadd_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u16_z)))
-svuint16_t svhadd_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s8_m)))
-svint8_t svhadd_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s32_m)))
-svint32_t svhadd_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s64_m)))
-svint64_t svhadd_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s16_m)))
-svint16_t svhadd_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s8_x)))
-svint8_t svhadd_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s32_x)))
-svint32_t svhadd_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s64_x)))
-svint64_t svhadd_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s16_x)))
-svint16_t svhadd_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s8_z)))
-svint8_t svhadd_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s32_z)))
-svint32_t svhadd_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s64_z)))
-svint64_t svhadd_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s16_z)))
-svint16_t svhadd_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u8_m)))
-svuint8_t svhadd_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u32_m)))
-svuint32_t svhadd_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u64_m)))
-svuint64_t svhadd_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u16_m)))
-svuint16_t svhadd_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u8_x)))
-svuint8_t svhadd_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u32_x)))
-svuint32_t svhadd_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u64_x)))
-svuint64_t svhadd_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u16_x)))
-svuint16_t svhadd_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u8_z)))
-svuint8_t svhadd_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u32_z)))
-svuint32_t svhadd_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u64_z)))
-svuint64_t svhadd_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u16_z)))
-svuint16_t svhadd_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s8_m)))
-svint8_t svhsub_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s32_m)))
-svint32_t svhsub_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s64_m)))
-svint64_t svhsub_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s16_m)))
-svint16_t svhsub_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s8_x)))
-svint8_t svhsub_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s32_x)))
-svint32_t svhsub_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s64_x)))
-svint64_t svhsub_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s16_x)))
-svint16_t svhsub_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s8_z)))
-svint8_t svhsub_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s32_z)))
-svint32_t svhsub_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s64_z)))
-svint64_t svhsub_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s16_z)))
-svint16_t svhsub_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u8_m)))
-svuint8_t svhsub_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u32_m)))
-svuint32_t svhsub_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u64_m)))
-svuint64_t svhsub_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u16_m)))
-svuint16_t svhsub_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u8_x)))
-svuint8_t svhsub_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u32_x)))
-svuint32_t svhsub_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u64_x)))
-svuint64_t svhsub_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u16_x)))
-svuint16_t svhsub_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u8_z)))
-svuint8_t svhsub_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u32_z)))
-svuint32_t svhsub_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u64_z)))
-svuint64_t svhsub_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u16_z)))
-svuint16_t svhsub_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s8_m)))
-svint8_t svhsub_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s32_m)))
-svint32_t svhsub_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s64_m)))
-svint64_t svhsub_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s16_m)))
-svint16_t svhsub_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s8_x)))
-svint8_t svhsub_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s32_x)))
-svint32_t svhsub_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s64_x)))
-svint64_t svhsub_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s16_x)))
-svint16_t svhsub_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s8_z)))
-svint8_t svhsub_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s32_z)))
-svint32_t svhsub_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s64_z)))
-svint64_t svhsub_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s16_z)))
-svint16_t svhsub_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u8_m)))
-svuint8_t svhsub_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u32_m)))
-svuint32_t svhsub_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u64_m)))
-svuint64_t svhsub_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u16_m)))
-svuint16_t svhsub_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u8_x)))
-svuint8_t svhsub_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u32_x)))
-svuint32_t svhsub_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u64_x)))
-svuint64_t svhsub_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u16_x)))
-svuint16_t svhsub_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u8_z)))
-svuint8_t svhsub_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u32_z)))
-svuint32_t svhsub_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u64_z)))
-svuint64_t svhsub_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u16_z)))
-svuint16_t svhsub_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s8_m)))
-svint8_t svhsubr_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s32_m)))
-svint32_t svhsubr_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s64_m)))
-svint64_t svhsubr_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s16_m)))
-svint16_t svhsubr_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s8_x)))
-svint8_t svhsubr_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s32_x)))
-svint32_t svhsubr_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s64_x)))
-svint64_t svhsubr_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s16_x)))
-svint16_t svhsubr_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s8_z)))
-svint8_t svhsubr_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s32_z)))
-svint32_t svhsubr_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s64_z)))
-svint64_t svhsubr_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s16_z)))
-svint16_t svhsubr_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u8_m)))
-svuint8_t svhsubr_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u32_m)))
-svuint32_t svhsubr_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u64_m)))
-svuint64_t svhsubr_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u16_m)))
-svuint16_t svhsubr_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u8_x)))
-svuint8_t svhsubr_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u32_x)))
-svuint32_t svhsubr_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u64_x)))
-svuint64_t svhsubr_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u16_x)))
-svuint16_t svhsubr_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u8_z)))
-svuint8_t svhsubr_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u32_z)))
-svuint32_t svhsubr_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u64_z)))
-svuint64_t svhsubr_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u16_z)))
-svuint16_t svhsubr_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s8_m)))
-svint8_t svhsubr_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s32_m)))
-svint32_t svhsubr_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s64_m)))
-svint64_t svhsubr_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s16_m)))
-svint16_t svhsubr_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s8_x)))
-svint8_t svhsubr_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s32_x)))
-svint32_t svhsubr_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s64_x)))
-svint64_t svhsubr_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s16_x)))
-svint16_t svhsubr_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s8_z)))
-svint8_t svhsubr_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s32_z)))
-svint32_t svhsubr_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s64_z)))
-svint64_t svhsubr_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s16_z)))
-svint16_t svhsubr_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u8_m)))
-svuint8_t svhsubr_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u32_m)))
-svuint32_t svhsubr_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u64_m)))
-svuint64_t svhsubr_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u16_m)))
-svuint16_t svhsubr_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u8_x)))
-svuint8_t svhsubr_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u32_x)))
-svuint32_t svhsubr_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u64_x)))
-svuint64_t svhsubr_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u16_x)))
-svuint16_t svhsubr_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u8_z)))
-svuint8_t svhsubr_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u32_z)))
-svuint32_t svhsubr_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u64_z)))
-svuint64_t svhsubr_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u16_z)))
-svuint16_t svhsubr_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f64_m)))
-svint64_t svlogb_f64_m(svint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f32_m)))
-svint32_t svlogb_f32_m(svint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f16_m)))
-svint16_t svlogb_f16_m(svint16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f64_x)))
-svint64_t svlogb_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f32_x)))
-svint32_t svlogb_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f16_x)))
-svint16_t svlogb_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f64_z)))
-svint64_t svlogb_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f32_z)))
-svint32_t svlogb_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f16_z)))
-svint16_t svlogb_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f64_m)))
-svfloat64_t svmaxnmp_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f32_m)))
-svfloat32_t svmaxnmp_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f16_m)))
-svfloat16_t svmaxnmp_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f64_x)))
-svfloat64_t svmaxnmp_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f32_x)))
-svfloat32_t svmaxnmp_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f16_x)))
-svfloat16_t svmaxnmp_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f64_m)))
-svfloat64_t svmaxp_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f32_m)))
-svfloat32_t svmaxp_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f16_m)))
-svfloat16_t svmaxp_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f64_x)))
-svfloat64_t svmaxp_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f32_x)))
-svfloat32_t svmaxp_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f16_x)))
-svfloat16_t svmaxp_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s8_m)))
-svint8_t svmaxp_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s32_m)))
-svint32_t svmaxp_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s64_m)))
-svint64_t svmaxp_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s16_m)))
-svint16_t svmaxp_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s8_x)))
-svint8_t svmaxp_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s32_x)))
-svint32_t svmaxp_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s64_x)))
-svint64_t svmaxp_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s16_x)))
-svint16_t svmaxp_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u8_m)))
-svuint8_t svmaxp_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u32_m)))
-svuint32_t svmaxp_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u64_m)))
-svuint64_t svmaxp_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u16_m)))
-svuint16_t svmaxp_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u8_x)))
-svuint8_t svmaxp_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u32_x)))
-svuint32_t svmaxp_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u64_x)))
-svuint64_t svmaxp_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u16_x)))
-svuint16_t svmaxp_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f64_m)))
-svfloat64_t svminnmp_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f32_m)))
-svfloat32_t svminnmp_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f16_m)))
-svfloat16_t svminnmp_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f64_x)))
-svfloat64_t svminnmp_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f32_x)))
-svfloat32_t svminnmp_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f16_x)))
-svfloat16_t svminnmp_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f64_m)))
-svfloat64_t svminp_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f32_m)))
-svfloat32_t svminp_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f16_m)))
-svfloat16_t svminp_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f64_x)))
-svfloat64_t svminp_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f32_x)))
-svfloat32_t svminp_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f16_x)))
-svfloat16_t svminp_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s8_m)))
-svint8_t svminp_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s32_m)))
-svint32_t svminp_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s64_m)))
-svint64_t svminp_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s16_m)))
-svint16_t svminp_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s8_x)))
-svint8_t svminp_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s32_x)))
-svint32_t svminp_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s64_x)))
-svint64_t svminp_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s16_x)))
-svint16_t svminp_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u8_m)))
-svuint8_t svminp_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u32_m)))
-svuint32_t svminp_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u64_m)))
-svuint64_t svminp_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u16_m)))
-svuint16_t svminp_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u8_x)))
-svuint8_t svminp_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u32_x)))
-svuint32_t svminp_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u64_x)))
-svuint64_t svminp_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u16_x)))
-svuint16_t svminp_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_u32)))
-svuint32_t svmla_lane_u32(svuint32_t, svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_u64)))
-svuint64_t svmla_lane_u64(svuint64_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_u16)))
-svuint16_t svmla_lane_u16(svuint16_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_s32)))
-svint32_t svmla_lane_s32(svint32_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_s64)))
-svint64_t svmla_lane_s64(svint64_t, svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_s16)))
-svint16_t svmla_lane_s16(svint16_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_f32)))
-svfloat32_t svmlalb_n_f32(svfloat32_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_s32)))
-svint32_t svmlalb_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_s64)))
-svint64_t svmlalb_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_s16)))
-svint16_t svmlalb_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_u32)))
-svuint32_t svmlalb_n_u32(svuint32_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_u64)))
-svuint64_t svmlalb_n_u64(svuint64_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_u16)))
-svuint16_t svmlalb_n_u16(svuint16_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_f32)))
-svfloat32_t svmlalb_f32(svfloat32_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_s32)))
-svint32_t svmlalb_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_s64)))
-svint64_t svmlalb_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_s16)))
-svint16_t svmlalb_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_u32)))
-svuint32_t svmlalb_u32(svuint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_u64)))
-svuint64_t svmlalb_u64(svuint64_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_u16)))
-svuint16_t svmlalb_u16(svuint16_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_f32)))
-svfloat32_t svmlalb_lane_f32(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_s32)))
-svint32_t svmlalb_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_s64)))
-svint64_t svmlalb_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_u32)))
-svuint32_t svmlalb_lane_u32(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_u64)))
-svuint64_t svmlalb_lane_u64(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_f32)))
-svfloat32_t svmlalt_n_f32(svfloat32_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_s32)))
-svint32_t svmlalt_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_s64)))
-svint64_t svmlalt_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_s16)))
-svint16_t svmlalt_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_u32)))
-svuint32_t svmlalt_n_u32(svuint32_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_u64)))
-svuint64_t svmlalt_n_u64(svuint64_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_u16)))
-svuint16_t svmlalt_n_u16(svuint16_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_f32)))
-svfloat32_t svmlalt_f32(svfloat32_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_s32)))
-svint32_t svmlalt_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_s64)))
-svint64_t svmlalt_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_s16)))
-svint16_t svmlalt_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_u32)))
-svuint32_t svmlalt_u32(svuint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_u64)))
-svuint64_t svmlalt_u64(svuint64_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_u16)))
-svuint16_t svmlalt_u16(svuint16_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_f32)))
-svfloat32_t svmlalt_lane_f32(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_s32)))
-svint32_t svmlalt_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_s64)))
-svint64_t svmlalt_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_u32)))
-svuint32_t svmlalt_lane_u32(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_u64)))
-svuint64_t svmlalt_lane_u64(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_u32)))
-svuint32_t svmls_lane_u32(svuint32_t, svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_u64)))
-svuint64_t svmls_lane_u64(svuint64_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_u16)))
-svuint16_t svmls_lane_u16(svuint16_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_s32)))
-svint32_t svmls_lane_s32(svint32_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_s64)))
-svint64_t svmls_lane_s64(svint64_t, svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_s16)))
-svint16_t svmls_lane_s16(svint16_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_f32)))
-svfloat32_t svmlslb_n_f32(svfloat32_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_s32)))
-svint32_t svmlslb_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_s64)))
-svint64_t svmlslb_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_s16)))
-svint16_t svmlslb_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_u32)))
-svuint32_t svmlslb_n_u32(svuint32_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_u64)))
-svuint64_t svmlslb_n_u64(svuint64_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_u16)))
-svuint16_t svmlslb_n_u16(svuint16_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_f32)))
-svfloat32_t svmlslb_f32(svfloat32_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_s32)))
-svint32_t svmlslb_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_s64)))
-svint64_t svmlslb_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_s16)))
-svint16_t svmlslb_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_u32)))
-svuint32_t svmlslb_u32(svuint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_u64)))
-svuint64_t svmlslb_u64(svuint64_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_u16)))
-svuint16_t svmlslb_u16(svuint16_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_f32)))
-svfloat32_t svmlslb_lane_f32(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_s32)))
-svint32_t svmlslb_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_s64)))
-svint64_t svmlslb_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_u32)))
-svuint32_t svmlslb_lane_u32(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_u64)))
-svuint64_t svmlslb_lane_u64(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_f32)))
-svfloat32_t svmlslt_n_f32(svfloat32_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_s32)))
-svint32_t svmlslt_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_s64)))
-svint64_t svmlslt_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_s16)))
-svint16_t svmlslt_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_u32)))
-svuint32_t svmlslt_n_u32(svuint32_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_u64)))
-svuint64_t svmlslt_n_u64(svuint64_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_u16)))
-svuint16_t svmlslt_n_u16(svuint16_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_f32)))
-svfloat32_t svmlslt_f32(svfloat32_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_s32)))
-svint32_t svmlslt_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_s64)))
-svint64_t svmlslt_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_s16)))
-svint16_t svmlslt_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_u32)))
-svuint32_t svmlslt_u32(svuint32_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_u64)))
-svuint64_t svmlslt_u64(svuint64_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_u16)))
-svuint16_t svmlslt_u16(svuint16_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_f32)))
-svfloat32_t svmlslt_lane_f32(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_s32)))
-svint32_t svmlslt_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_s64)))
-svint64_t svmlslt_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_u32)))
-svuint32_t svmlslt_lane_u32(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_u64)))
-svuint64_t svmlslt_lane_u64(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_s32)))
-svint32_t svmovlb_s32(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_s64)))
-svint64_t svmovlb_s64(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_s16)))
-svint16_t svmovlb_s16(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_u32)))
-svuint32_t svmovlb_u32(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_u64)))
-svuint64_t svmovlb_u64(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_u16)))
-svuint16_t svmovlb_u16(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_s32)))
-svint32_t svmovlt_s32(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_s64)))
-svint64_t svmovlt_s64(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_s16)))
-svint16_t svmovlt_s16(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_u32)))
-svuint32_t svmovlt_u32(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_u64)))
-svuint64_t svmovlt_u64(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_u16)))
-svuint16_t svmovlt_u16(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_u32)))
-svuint32_t svmul_lane_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_u64)))
-svuint64_t svmul_lane_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_u16)))
-svuint16_t svmul_lane_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_s32)))
-svint32_t svmul_lane_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_s64)))
-svint64_t svmul_lane_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_s16)))
-svint16_t svmul_lane_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_s32)))
-svint32_t svmullb_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_s64)))
-svint64_t svmullb_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_s16)))
-svint16_t svmullb_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_u32)))
-svuint32_t svmullb_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_u64)))
-svuint64_t svmullb_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_u16)))
-svuint16_t svmullb_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_s32)))
-svint32_t svmullb_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_s64)))
-svint64_t svmullb_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_s16)))
-svint16_t svmullb_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_u32)))
-svuint32_t svmullb_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_u64)))
-svuint64_t svmullb_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_u16)))
-svuint16_t svmullb_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_s32)))
-svint32_t svmullb_lane_s32(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_s64)))
-svint64_t svmullb_lane_s64(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_u32)))
-svuint32_t svmullb_lane_u32(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_u64)))
-svuint64_t svmullb_lane_u64(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_s32)))
-svint32_t svmullt_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_s64)))
-svint64_t svmullt_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_s16)))
-svint16_t svmullt_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_u32)))
-svuint32_t svmullt_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_u64)))
-svuint64_t svmullt_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_u16)))
-svuint16_t svmullt_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_s32)))
-svint32_t svmullt_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_s64)))
-svint64_t svmullt_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_s16)))
-svint16_t svmullt_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_u32)))
-svuint32_t svmullt_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_u64)))
-svuint64_t svmullt_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_u16)))
-svuint16_t svmullt_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_s32)))
-svint32_t svmullt_lane_s32(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_s64)))
-svint64_t svmullt_lane_s64(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_u32)))
-svuint32_t svmullt_lane_u32(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_u64)))
-svuint64_t svmullt_lane_u64(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u8)))
-svuint8_t svnbsl_n_u8(svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u32)))
-svuint32_t svnbsl_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u64)))
-svuint64_t svnbsl_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u16)))
-svuint16_t svnbsl_n_u16(svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s8)))
-svint8_t svnbsl_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s32)))
-svint32_t svnbsl_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s64)))
-svint64_t svnbsl_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s16)))
-svint16_t svnbsl_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u8)))
-svuint8_t svnbsl_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u32)))
-svuint32_t svnbsl_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u64)))
-svuint64_t svnbsl_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u16)))
-svuint16_t svnbsl_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s8)))
-svint8_t svnbsl_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s32)))
-svint32_t svnbsl_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s64)))
-svint64_t svnbsl_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s16)))
-svint16_t svnbsl_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmul_n_u8)))
-svuint8_t svpmul_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmul_u8)))
-svuint8_t svpmul_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_n_u64)))
-svuint64_t svpmullb_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_n_u16)))
-svuint16_t svpmullb_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_u64)))
-svuint64_t svpmullb_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_u16)))
-svuint16_t svpmullb_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u8)))
-svuint8_t svpmullb_pair_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u32)))
-svuint32_t svpmullb_pair_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u8)))
-svuint8_t svpmullb_pair_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u32)))
-svuint32_t svpmullb_pair_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_n_u64)))
-svuint64_t svpmullt_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_n_u16)))
-svuint16_t svpmullt_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_u64)))
-svuint64_t svpmullt_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_u16)))
-svuint16_t svpmullt_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u8)))
-svuint8_t svpmullt_pair_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u32)))
-svuint32_t svpmullt_pair_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u8)))
-svuint8_t svpmullt_pair_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u32)))
-svuint32_t svpmullt_pair_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s8_m)))
-svint8_t svqabs_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s32_m)))
-svint32_t svqabs_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s64_m)))
-svint64_t svqabs_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s16_m)))
-svint16_t svqabs_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s8_x)))
-svint8_t svqabs_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s32_x)))
-svint32_t svqabs_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s64_x)))
-svint64_t svqabs_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s16_x)))
-svint16_t svqabs_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s8_z)))
-svint8_t svqabs_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s32_z)))
-svint32_t svqabs_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s64_z)))
-svint64_t svqabs_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s16_z)))
-svint16_t svqabs_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8_m)))
-svint8_t svqadd_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32_m)))
-svint32_t svqadd_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64_m)))
-svint64_t svqadd_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16_m)))
-svint16_t svqadd_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8_x)))
-svint8_t svqadd_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32_x)))
-svint32_t svqadd_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64_x)))
-svint64_t svqadd_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16_x)))
-svint16_t svqadd_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8_z)))
-svint8_t svqadd_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32_z)))
-svint32_t svqadd_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64_z)))
-svint64_t svqadd_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16_z)))
-svint16_t svqadd_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8_m)))
-svuint8_t svqadd_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32_m)))
-svuint32_t svqadd_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64_m)))
-svuint64_t svqadd_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16_m)))
-svuint16_t svqadd_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8_x)))
-svuint8_t svqadd_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32_x)))
-svuint32_t svqadd_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64_x)))
-svuint64_t svqadd_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16_x)))
-svuint16_t svqadd_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8_z)))
-svuint8_t svqadd_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32_z)))
-svuint32_t svqadd_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64_z)))
-svuint64_t svqadd_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16_z)))
-svuint16_t svqadd_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8_m)))
-svint8_t svqadd_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32_m)))
-svint32_t svqadd_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64_m)))
-svint64_t svqadd_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16_m)))
-svint16_t svqadd_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8_x)))
-svint8_t svqadd_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32_x)))
-svint32_t svqadd_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64_x)))
-svint64_t svqadd_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16_x)))
-svint16_t svqadd_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8_z)))
-svint8_t svqadd_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32_z)))
-svint32_t svqadd_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64_z)))
-svint64_t svqadd_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16_z)))
-svint16_t svqadd_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8_m)))
-svuint8_t svqadd_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32_m)))
-svuint32_t svqadd_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64_m)))
-svuint64_t svqadd_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16_m)))
-svuint16_t svqadd_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8_x)))
-svuint8_t svqadd_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32_x)))
-svuint32_t svqadd_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64_x)))
-svuint64_t svqadd_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16_x)))
-svuint16_t svqadd_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8_z)))
-svuint8_t svqadd_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32_z)))
-svuint32_t svqadd_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64_z)))
-svuint64_t svqadd_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16_z)))
-svuint16_t svqadd_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s8)))
-svint8_t svqcadd_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s32)))
-svint32_t svqcadd_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s64)))
-svint64_t svqcadd_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s16)))
-svint16_t svqcadd_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_n_s32)))
-svint32_t svqdmlalb_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_n_s64)))
-svint64_t svqdmlalb_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_n_s16)))
-svint16_t svqdmlalb_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_s32)))
-svint32_t svqdmlalb_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_s64)))
-svint64_t svqdmlalb_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_s16)))
-svint16_t svqdmlalb_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_lane_s32)))
-svint32_t svqdmlalb_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_lane_s64)))
-svint64_t svqdmlalb_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_n_s32)))
-svint32_t svqdmlalbt_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_n_s64)))
-svint64_t svqdmlalbt_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_n_s16)))
-svint16_t svqdmlalbt_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_s32)))
-svint32_t svqdmlalbt_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_s64)))
-svint64_t svqdmlalbt_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_s16)))
-svint16_t svqdmlalbt_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_n_s32)))
-svint32_t svqdmlalt_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_n_s64)))
-svint64_t svqdmlalt_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_n_s16)))
-svint16_t svqdmlalt_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_s32)))
-svint32_t svqdmlalt_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_s64)))
-svint64_t svqdmlalt_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_s16)))
-svint16_t svqdmlalt_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_lane_s32)))
-svint32_t svqdmlalt_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_lane_s64)))
-svint64_t svqdmlalt_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_n_s32)))
-svint32_t svqdmlslb_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_n_s64)))
-svint64_t svqdmlslb_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_n_s16)))
-svint16_t svqdmlslb_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_s32)))
-svint32_t svqdmlslb_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_s64)))
-svint64_t svqdmlslb_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_s16)))
-svint16_t svqdmlslb_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_lane_s32)))
-svint32_t svqdmlslb_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_lane_s64)))
-svint64_t svqdmlslb_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_n_s32)))
-svint32_t svqdmlslbt_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_n_s64)))
-svint64_t svqdmlslbt_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_n_s16)))
-svint16_t svqdmlslbt_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_s32)))
-svint32_t svqdmlslbt_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_s64)))
-svint64_t svqdmlslbt_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_s16)))
-svint16_t svqdmlslbt_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_n_s32)))
-svint32_t svqdmlslt_n_s32(svint32_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_n_s64)))
-svint64_t svqdmlslt_n_s64(svint64_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_n_s16)))
-svint16_t svqdmlslt_n_s16(svint16_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_s32)))
-svint32_t svqdmlslt_s32(svint32_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_s64)))
-svint64_t svqdmlslt_s64(svint64_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_s16)))
-svint16_t svqdmlslt_s16(svint16_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_lane_s32)))
-svint32_t svqdmlslt_lane_s32(svint32_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_lane_s64)))
-svint64_t svqdmlslt_lane_s64(svint64_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s8)))
-svint8_t svqdmulh_n_s8(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s32)))
-svint32_t svqdmulh_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s64)))
-svint64_t svqdmulh_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s16)))
-svint16_t svqdmulh_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s8)))
-svint8_t svqdmulh_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s32)))
-svint32_t svqdmulh_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s64)))
-svint64_t svqdmulh_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s16)))
-svint16_t svqdmulh_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_lane_s32)))
-svint32_t svqdmulh_lane_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_lane_s64)))
-svint64_t svqdmulh_lane_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_lane_s16)))
-svint16_t svqdmulh_lane_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_n_s32)))
-svint32_t svqdmullb_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_n_s64)))
-svint64_t svqdmullb_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_n_s16)))
-svint16_t svqdmullb_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_s32)))
-svint32_t svqdmullb_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_s64)))
-svint64_t svqdmullb_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_s16)))
-svint16_t svqdmullb_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_lane_s32)))
-svint32_t svqdmullb_lane_s32(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_lane_s64)))
-svint64_t svqdmullb_lane_s64(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_n_s32)))
-svint32_t svqdmullt_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_n_s64)))
-svint64_t svqdmullt_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_n_s16)))
-svint16_t svqdmullt_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_s32)))
-svint32_t svqdmullt_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_s64)))
-svint64_t svqdmullt_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_s16)))
-svint16_t svqdmullt_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_lane_s32)))
-svint32_t svqdmullt_lane_s32(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_lane_s64)))
-svint64_t svqdmullt_lane_s64(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s8_m)))
-svint8_t svqneg_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s32_m)))
-svint32_t svqneg_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s64_m)))
-svint64_t svqneg_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s16_m)))
-svint16_t svqneg_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s8_x)))
-svint8_t svqneg_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s32_x)))
-svint32_t svqneg_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s64_x)))
-svint64_t svqneg_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s16_x)))
-svint16_t svqneg_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s8_z)))
-svint8_t svqneg_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s32_z)))
-svint32_t svqneg_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s64_z)))
-svint64_t svqneg_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s16_z)))
-svint16_t svqneg_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s8)))
-svint8_t svqrdcmlah_s8(svint8_t, svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s32)))
-svint32_t svqrdcmlah_s32(svint32_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s64)))
-svint64_t svqrdcmlah_s64(svint64_t, svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s16)))
-svint16_t svqrdcmlah_s16(svint16_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_lane_s32)))
-svint32_t svqrdcmlah_lane_s32(svint32_t, svint32_t, svint32_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_lane_s16)))
-svint16_t svqrdcmlah_lane_s16(svint16_t, svint16_t, svint16_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s8)))
-svint8_t svqrdmlah_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s32)))
-svint32_t svqrdmlah_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s64)))
-svint64_t svqrdmlah_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s16)))
-svint16_t svqrdmlah_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s8)))
-svint8_t svqrdmlah_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s32)))
-svint32_t svqrdmlah_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s64)))
-svint64_t svqrdmlah_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s16)))
-svint16_t svqrdmlah_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_lane_s32)))
-svint32_t svqrdmlah_lane_s32(svint32_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_lane_s64)))
-svint64_t svqrdmlah_lane_s64(svint64_t, svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_lane_s16)))
-svint16_t svqrdmlah_lane_s16(svint16_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s8)))
-svint8_t svqrdmlsh_n_s8(svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s32)))
-svint32_t svqrdmlsh_n_s32(svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s64)))
-svint64_t svqrdmlsh_n_s64(svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s16)))
-svint16_t svqrdmlsh_n_s16(svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s8)))
-svint8_t svqrdmlsh_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s32)))
-svint32_t svqrdmlsh_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s64)))
-svint64_t svqrdmlsh_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s16)))
-svint16_t svqrdmlsh_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_lane_s32)))
-svint32_t svqrdmlsh_lane_s32(svint32_t, svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_lane_s64)))
-svint64_t svqrdmlsh_lane_s64(svint64_t, svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_lane_s16)))
-svint16_t svqrdmlsh_lane_s16(svint16_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s8)))
-svint8_t svqrdmulh_n_s8(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s32)))
-svint32_t svqrdmulh_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s64)))
-svint64_t svqrdmulh_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s16)))
-svint16_t svqrdmulh_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s8)))
-svint8_t svqrdmulh_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s32)))
-svint32_t svqrdmulh_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s64)))
-svint64_t svqrdmulh_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s16)))
-svint16_t svqrdmulh_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_lane_s32)))
-svint32_t svqrdmulh_lane_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_lane_s64)))
-svint64_t svqrdmulh_lane_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_lane_s16)))
-svint16_t svqrdmulh_lane_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s8_m)))
-svint8_t svqrshl_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s32_m)))
-svint32_t svqrshl_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s64_m)))
-svint64_t svqrshl_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s16_m)))
-svint16_t svqrshl_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s8_x)))
-svint8_t svqrshl_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s32_x)))
-svint32_t svqrshl_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s64_x)))
-svint64_t svqrshl_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s16_x)))
-svint16_t svqrshl_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s8_z)))
-svint8_t svqrshl_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s32_z)))
-svint32_t svqrshl_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s64_z)))
-svint64_t svqrshl_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s16_z)))
-svint16_t svqrshl_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u8_m)))
-svuint8_t svqrshl_n_u8_m(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u32_m)))
-svuint32_t svqrshl_n_u32_m(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u64_m)))
-svuint64_t svqrshl_n_u64_m(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u16_m)))
-svuint16_t svqrshl_n_u16_m(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u8_x)))
-svuint8_t svqrshl_n_u8_x(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u32_x)))
-svuint32_t svqrshl_n_u32_x(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u64_x)))
-svuint64_t svqrshl_n_u64_x(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u16_x)))
-svuint16_t svqrshl_n_u16_x(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u8_z)))
-svuint8_t svqrshl_n_u8_z(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u32_z)))
-svuint32_t svqrshl_n_u32_z(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u64_z)))
-svuint64_t svqrshl_n_u64_z(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u16_z)))
-svuint16_t svqrshl_n_u16_z(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s8_m)))
-svint8_t svqrshl_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s32_m)))
-svint32_t svqrshl_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s64_m)))
-svint64_t svqrshl_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s16_m)))
-svint16_t svqrshl_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s8_x)))
-svint8_t svqrshl_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s32_x)))
-svint32_t svqrshl_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s64_x)))
-svint64_t svqrshl_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s16_x)))
-svint16_t svqrshl_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s8_z)))
-svint8_t svqrshl_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s32_z)))
-svint32_t svqrshl_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s64_z)))
-svint64_t svqrshl_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s16_z)))
-svint16_t svqrshl_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u8_m)))
-svuint8_t svqrshl_u8_m(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u32_m)))
-svuint32_t svqrshl_u32_m(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u64_m)))
-svuint64_t svqrshl_u64_m(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u16_m)))
-svuint16_t svqrshl_u16_m(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u8_x)))
-svuint8_t svqrshl_u8_x(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u32_x)))
-svuint32_t svqrshl_u32_x(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u64_x)))
-svuint64_t svqrshl_u64_x(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u16_x)))
-svuint16_t svqrshl_u16_x(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u8_z)))
-svuint8_t svqrshl_u8_z(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u32_z)))
-svuint32_t svqrshl_u32_z(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u64_z)))
-svuint64_t svqrshl_u64_z(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u16_z)))
-svuint16_t svqrshl_u16_z(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_s32)))
-svint16_t svqrshrnb_n_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_s64)))
-svint32_t svqrshrnb_n_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_s16)))
-svint8_t svqrshrnb_n_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_u32)))
-svuint16_t svqrshrnb_n_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_u64)))
-svuint32_t svqrshrnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_u16)))
-svuint8_t svqrshrnb_n_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_s32)))
-svint16_t svqrshrnt_n_s32(svint16_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_s64)))
-svint32_t svqrshrnt_n_s64(svint32_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_s16)))
-svint8_t svqrshrnt_n_s16(svint8_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_u32)))
-svuint16_t svqrshrnt_n_u32(svuint16_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_u64)))
-svuint32_t svqrshrnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_u16)))
-svuint8_t svqrshrnt_n_u16(svuint8_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunb_n_s32)))
-svuint16_t svqrshrunb_n_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunb_n_s64)))
-svuint32_t svqrshrunb_n_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunb_n_s16)))
-svuint8_t svqrshrunb_n_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunt_n_s32)))
-svuint16_t svqrshrunt_n_s32(svuint16_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunt_n_s64)))
-svuint32_t svqrshrunt_n_s64(svuint32_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunt_n_s16)))
-svuint8_t svqrshrunt_n_s16(svuint8_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s8_m)))
-svint8_t svqshl_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s32_m)))
-svint32_t svqshl_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s64_m)))
-svint64_t svqshl_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s16_m)))
-svint16_t svqshl_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s8_x)))
-svint8_t svqshl_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s32_x)))
-svint32_t svqshl_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s64_x)))
-svint64_t svqshl_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s16_x)))
-svint16_t svqshl_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s8_z)))
-svint8_t svqshl_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s32_z)))
-svint32_t svqshl_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s64_z)))
-svint64_t svqshl_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s16_z)))
-svint16_t svqshl_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u8_m)))
-svuint8_t svqshl_n_u8_m(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u32_m)))
-svuint32_t svqshl_n_u32_m(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u64_m)))
-svuint64_t svqshl_n_u64_m(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u16_m)))
-svuint16_t svqshl_n_u16_m(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u8_x)))
-svuint8_t svqshl_n_u8_x(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u32_x)))
-svuint32_t svqshl_n_u32_x(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u64_x)))
-svuint64_t svqshl_n_u64_x(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u16_x)))
-svuint16_t svqshl_n_u16_x(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u8_z)))
-svuint8_t svqshl_n_u8_z(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u32_z)))
-svuint32_t svqshl_n_u32_z(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u64_z)))
-svuint64_t svqshl_n_u64_z(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u16_z)))
-svuint16_t svqshl_n_u16_z(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s8_m)))
-svint8_t svqshl_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s32_m)))
-svint32_t svqshl_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s64_m)))
-svint64_t svqshl_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s16_m)))
-svint16_t svqshl_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s8_x)))
-svint8_t svqshl_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s32_x)))
-svint32_t svqshl_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s64_x)))
-svint64_t svqshl_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s16_x)))
-svint16_t svqshl_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s8_z)))
-svint8_t svqshl_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s32_z)))
-svint32_t svqshl_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s64_z)))
-svint64_t svqshl_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s16_z)))
-svint16_t svqshl_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u8_m)))
-svuint8_t svqshl_u8_m(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u32_m)))
-svuint32_t svqshl_u32_m(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u64_m)))
-svuint64_t svqshl_u64_m(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u16_m)))
-svuint16_t svqshl_u16_m(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u8_x)))
-svuint8_t svqshl_u8_x(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u32_x)))
-svuint32_t svqshl_u32_x(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u64_x)))
-svuint64_t svqshl_u64_x(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u16_x)))
-svuint16_t svqshl_u16_x(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u8_z)))
-svuint8_t svqshl_u8_z(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u32_z)))
-svuint32_t svqshl_u32_z(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u64_z)))
-svuint64_t svqshl_u64_z(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u16_z)))
-svuint16_t svqshl_u16_z(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s8_m)))
-svuint8_t svqshlu_n_s8_m(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s32_m)))
-svuint32_t svqshlu_n_s32_m(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s64_m)))
-svuint64_t svqshlu_n_s64_m(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s16_m)))
-svuint16_t svqshlu_n_s16_m(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s8_x)))
-svuint8_t svqshlu_n_s8_x(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s32_x)))
-svuint32_t svqshlu_n_s32_x(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s64_x)))
-svuint64_t svqshlu_n_s64_x(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s16_x)))
-svuint16_t svqshlu_n_s16_x(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s8_z)))
-svuint8_t svqshlu_n_s8_z(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s32_z)))
-svuint32_t svqshlu_n_s32_z(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s64_z)))
-svuint64_t svqshlu_n_s64_z(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s16_z)))
-svuint16_t svqshlu_n_s16_z(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_s32)))
-svint16_t svqshrnb_n_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_s64)))
-svint32_t svqshrnb_n_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_s16)))
-svint8_t svqshrnb_n_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_u32)))
-svuint16_t svqshrnb_n_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_u64)))
-svuint32_t svqshrnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_u16)))
-svuint8_t svqshrnb_n_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_s32)))
-svint16_t svqshrnt_n_s32(svint16_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_s64)))
-svint32_t svqshrnt_n_s64(svint32_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_s16)))
-svint8_t svqshrnt_n_s16(svint8_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_u32)))
-svuint16_t svqshrnt_n_u32(svuint16_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_u64)))
-svuint32_t svqshrnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_u16)))
-svuint8_t svqshrnt_n_u16(svuint8_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunb_n_s32)))
-svuint16_t svqshrunb_n_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunb_n_s64)))
-svuint32_t svqshrunb_n_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunb_n_s16)))
-svuint8_t svqshrunb_n_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunt_n_s32)))
-svuint16_t svqshrunt_n_s32(svuint16_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunt_n_s64)))
-svuint32_t svqshrunt_n_s64(svuint32_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunt_n_s16)))
-svuint8_t svqshrunt_n_s16(svuint8_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8_m)))
-svint8_t svqsub_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32_m)))
-svint32_t svqsub_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64_m)))
-svint64_t svqsub_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16_m)))
-svint16_t svqsub_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8_x)))
-svint8_t svqsub_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32_x)))
-svint32_t svqsub_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64_x)))
-svint64_t svqsub_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16_x)))
-svint16_t svqsub_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8_z)))
-svint8_t svqsub_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32_z)))
-svint32_t svqsub_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64_z)))
-svint64_t svqsub_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16_z)))
-svint16_t svqsub_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8_m)))
-svuint8_t svqsub_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32_m)))
-svuint32_t svqsub_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64_m)))
-svuint64_t svqsub_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16_m)))
-svuint16_t svqsub_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8_x)))
-svuint8_t svqsub_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32_x)))
-svuint32_t svqsub_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64_x)))
-svuint64_t svqsub_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16_x)))
-svuint16_t svqsub_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8_z)))
-svuint8_t svqsub_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32_z)))
-svuint32_t svqsub_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64_z)))
-svuint64_t svqsub_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16_z)))
-svuint16_t svqsub_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8_m)))
-svint8_t svqsub_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32_m)))
-svint32_t svqsub_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64_m)))
-svint64_t svqsub_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16_m)))
-svint16_t svqsub_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8_x)))
-svint8_t svqsub_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32_x)))
-svint32_t svqsub_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64_x)))
-svint64_t svqsub_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16_x)))
-svint16_t svqsub_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8_z)))
-svint8_t svqsub_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32_z)))
-svint32_t svqsub_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64_z)))
-svint64_t svqsub_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16_z)))
-svint16_t svqsub_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8_m)))
-svuint8_t svqsub_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32_m)))
-svuint32_t svqsub_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64_m)))
-svuint64_t svqsub_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16_m)))
-svuint16_t svqsub_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8_x)))
-svuint8_t svqsub_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32_x)))
-svuint32_t svqsub_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64_x)))
-svuint64_t svqsub_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16_x)))
-svuint16_t svqsub_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8_z)))
-svuint8_t svqsub_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32_z)))
-svuint32_t svqsub_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64_z)))
-svuint64_t svqsub_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16_z)))
-svuint16_t svqsub_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s8_m)))
-svint8_t svqsubr_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s32_m)))
-svint32_t svqsubr_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s64_m)))
-svint64_t svqsubr_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s16_m)))
-svint16_t svqsubr_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s8_x)))
-svint8_t svqsubr_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s32_x)))
-svint32_t svqsubr_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s64_x)))
-svint64_t svqsubr_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s16_x)))
-svint16_t svqsubr_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s8_z)))
-svint8_t svqsubr_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s32_z)))
-svint32_t svqsubr_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s64_z)))
-svint64_t svqsubr_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s16_z)))
-svint16_t svqsubr_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u8_m)))
-svuint8_t svqsubr_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u32_m)))
-svuint32_t svqsubr_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u64_m)))
-svuint64_t svqsubr_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u16_m)))
-svuint16_t svqsubr_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u8_x)))
-svuint8_t svqsubr_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u32_x)))
-svuint32_t svqsubr_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u64_x)))
-svuint64_t svqsubr_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u16_x)))
-svuint16_t svqsubr_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u8_z)))
-svuint8_t svqsubr_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u32_z)))
-svuint32_t svqsubr_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u64_z)))
-svuint64_t svqsubr_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u16_z)))
-svuint16_t svqsubr_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s8_m)))
-svint8_t svqsubr_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s32_m)))
-svint32_t svqsubr_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s64_m)))
-svint64_t svqsubr_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s16_m)))
-svint16_t svqsubr_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s8_x)))
-svint8_t svqsubr_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s32_x)))
-svint32_t svqsubr_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s64_x)))
-svint64_t svqsubr_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s16_x)))
-svint16_t svqsubr_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s8_z)))
-svint8_t svqsubr_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s32_z)))
-svint32_t svqsubr_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s64_z)))
-svint64_t svqsubr_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s16_z)))
-svint16_t svqsubr_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u8_m)))
-svuint8_t svqsubr_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u32_m)))
-svuint32_t svqsubr_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u64_m)))
-svuint64_t svqsubr_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u16_m)))
-svuint16_t svqsubr_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u8_x)))
-svuint8_t svqsubr_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u32_x)))
-svuint32_t svqsubr_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u64_x)))
-svuint64_t svqsubr_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u16_x)))
-svuint16_t svqsubr_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u8_z)))
-svuint8_t svqsubr_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u32_z)))
-svuint32_t svqsubr_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u64_z)))
-svuint64_t svqsubr_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u16_z)))
-svuint16_t svqsubr_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_s32)))
-svint16_t svqxtnb_s32(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_s64)))
-svint32_t svqxtnb_s64(svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_s16)))
-svint8_t svqxtnb_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_u32)))
-svuint16_t svqxtnb_u32(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_u64)))
-svuint32_t svqxtnb_u64(svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_u16)))
-svuint8_t svqxtnb_u16(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_s32)))
-svint16_t svqxtnt_s32(svint16_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_s64)))
-svint32_t svqxtnt_s64(svint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_s16)))
-svint8_t svqxtnt_s16(svint8_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_u32)))
-svuint16_t svqxtnt_u32(svuint16_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_u64)))
-svuint32_t svqxtnt_u64(svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_u16)))
-svuint8_t svqxtnt_u16(svuint8_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunb_s32)))
-svuint16_t svqxtunb_s32(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunb_s64)))
-svuint32_t svqxtunb_s64(svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunb_s16)))
-svuint8_t svqxtunb_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunt_s32)))
-svuint16_t svqxtunt_s32(svuint16_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunt_s64)))
-svuint32_t svqxtunt_s64(svuint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunt_s16)))
-svuint8_t svqxtunt_s16(svuint8_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_u32)))
-svuint16_t svraddhnb_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_u64)))
-svuint32_t svraddhnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_u16)))
-svuint8_t svraddhnb_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_s32)))
-svint16_t svraddhnb_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_s64)))
-svint32_t svraddhnb_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_s16)))
-svint8_t svraddhnb_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_u32)))
-svuint16_t svraddhnb_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_u64)))
-svuint32_t svraddhnb_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_u16)))
-svuint8_t svraddhnb_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_s32)))
-svint16_t svraddhnb_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_s64)))
-svint32_t svraddhnb_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_s16)))
-svint8_t svraddhnb_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_u32)))
-svuint16_t svraddhnt_n_u32(svuint16_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_u64)))
-svuint32_t svraddhnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_u16)))
-svuint8_t svraddhnt_n_u16(svuint8_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_s32)))
-svint16_t svraddhnt_n_s32(svint16_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_s64)))
-svint32_t svraddhnt_n_s64(svint32_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_s16)))
-svint8_t svraddhnt_n_s16(svint8_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_u32)))
-svuint16_t svraddhnt_u32(svuint16_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_u64)))
-svuint32_t svraddhnt_u64(svuint32_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_u16)))
-svuint8_t svraddhnt_u16(svuint8_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_s32)))
-svint16_t svraddhnt_s32(svint16_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_s64)))
-svint32_t svraddhnt_s64(svint32_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_s16)))
-svint8_t svraddhnt_s16(svint8_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_u32_m)))
-svuint32_t svrecpe_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_u32_x)))
-svuint32_t svrecpe_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_u32_z)))
-svuint32_t svrecpe_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s8_m)))
-svint8_t svrhadd_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s32_m)))
-svint32_t svrhadd_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s64_m)))
-svint64_t svrhadd_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s16_m)))
-svint16_t svrhadd_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s8_x)))
-svint8_t svrhadd_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s32_x)))
-svint32_t svrhadd_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s64_x)))
-svint64_t svrhadd_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s16_x)))
-svint16_t svrhadd_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s8_z)))
-svint8_t svrhadd_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s32_z)))
-svint32_t svrhadd_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s64_z)))
-svint64_t svrhadd_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s16_z)))
-svint16_t svrhadd_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u8_m)))
-svuint8_t svrhadd_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u32_m)))
-svuint32_t svrhadd_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u64_m)))
-svuint64_t svrhadd_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u16_m)))
-svuint16_t svrhadd_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u8_x)))
-svuint8_t svrhadd_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u32_x)))
-svuint32_t svrhadd_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u64_x)))
-svuint64_t svrhadd_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u16_x)))
-svuint16_t svrhadd_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u8_z)))
-svuint8_t svrhadd_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u32_z)))
-svuint32_t svrhadd_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u64_z)))
-svuint64_t svrhadd_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u16_z)))
-svuint16_t svrhadd_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s8_m)))
-svint8_t svrhadd_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s32_m)))
-svint32_t svrhadd_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s64_m)))
-svint64_t svrhadd_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s16_m)))
-svint16_t svrhadd_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s8_x)))
-svint8_t svrhadd_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s32_x)))
-svint32_t svrhadd_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s64_x)))
-svint64_t svrhadd_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s16_x)))
-svint16_t svrhadd_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s8_z)))
-svint8_t svrhadd_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s32_z)))
-svint32_t svrhadd_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s64_z)))
-svint64_t svrhadd_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s16_z)))
-svint16_t svrhadd_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u8_m)))
-svuint8_t svrhadd_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u32_m)))
-svuint32_t svrhadd_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u64_m)))
-svuint64_t svrhadd_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u16_m)))
-svuint16_t svrhadd_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u8_x)))
-svuint8_t svrhadd_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u32_x)))
-svuint32_t svrhadd_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u64_x)))
-svuint64_t svrhadd_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u16_x)))
-svuint16_t svrhadd_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u8_z)))
-svuint8_t svrhadd_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u32_z)))
-svuint32_t svrhadd_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u64_z)))
-svuint64_t svrhadd_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u16_z)))
-svuint16_t svrhadd_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s8_m)))
-svint8_t svrshl_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s32_m)))
-svint32_t svrshl_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s64_m)))
-svint64_t svrshl_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s16_m)))
-svint16_t svrshl_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s8_x)))
-svint8_t svrshl_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s32_x)))
-svint32_t svrshl_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s64_x)))
-svint64_t svrshl_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s16_x)))
-svint16_t svrshl_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s8_z)))
-svint8_t svrshl_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s32_z)))
-svint32_t svrshl_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s64_z)))
-svint64_t svrshl_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s16_z)))
-svint16_t svrshl_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u8_m)))
-svuint8_t svrshl_n_u8_m(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u32_m)))
-svuint32_t svrshl_n_u32_m(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u64_m)))
-svuint64_t svrshl_n_u64_m(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u16_m)))
-svuint16_t svrshl_n_u16_m(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u8_x)))
-svuint8_t svrshl_n_u8_x(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u32_x)))
-svuint32_t svrshl_n_u32_x(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u64_x)))
-svuint64_t svrshl_n_u64_x(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u16_x)))
-svuint16_t svrshl_n_u16_x(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u8_z)))
-svuint8_t svrshl_n_u8_z(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u32_z)))
-svuint32_t svrshl_n_u32_z(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u64_z)))
-svuint64_t svrshl_n_u64_z(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u16_z)))
-svuint16_t svrshl_n_u16_z(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_m)))
-svint8_t svrshl_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_m)))
-svint32_t svrshl_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_m)))
-svint64_t svrshl_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_m)))
-svint16_t svrshl_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_x)))
-svint8_t svrshl_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_x)))
-svint32_t svrshl_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_x)))
-svint64_t svrshl_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_x)))
-svint16_t svrshl_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_z)))
-svint8_t svrshl_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_z)))
-svint32_t svrshl_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_z)))
-svint64_t svrshl_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_z)))
-svint16_t svrshl_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_m)))
-svuint8_t svrshl_u8_m(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_m)))
-svuint32_t svrshl_u32_m(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_m)))
-svuint64_t svrshl_u64_m(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_m)))
-svuint16_t svrshl_u16_m(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_x)))
-svuint8_t svrshl_u8_x(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_x)))
-svuint32_t svrshl_u32_x(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_x)))
-svuint64_t svrshl_u64_x(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_x)))
-svuint16_t svrshl_u16_x(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_z)))
-svuint8_t svrshl_u8_z(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_z)))
-svuint32_t svrshl_u32_z(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_z)))
-svuint64_t svrshl_u64_z(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_z)))
-svuint16_t svrshl_u16_z(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s8_m)))
-svint8_t svrshr_n_s8_m(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s32_m)))
-svint32_t svrshr_n_s32_m(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s64_m)))
-svint64_t svrshr_n_s64_m(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s16_m)))
-svint16_t svrshr_n_s16_m(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u8_m)))
-svuint8_t svrshr_n_u8_m(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u32_m)))
-svuint32_t svrshr_n_u32_m(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u64_m)))
-svuint64_t svrshr_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u16_m)))
-svuint16_t svrshr_n_u16_m(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s8_x)))
-svint8_t svrshr_n_s8_x(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s32_x)))
-svint32_t svrshr_n_s32_x(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s64_x)))
-svint64_t svrshr_n_s64_x(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s16_x)))
-svint16_t svrshr_n_s16_x(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u8_x)))
-svuint8_t svrshr_n_u8_x(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u32_x)))
-svuint32_t svrshr_n_u32_x(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u64_x)))
-svuint64_t svrshr_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u16_x)))
-svuint16_t svrshr_n_u16_x(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s8_z)))
-svint8_t svrshr_n_s8_z(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s32_z)))
-svint32_t svrshr_n_s32_z(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s64_z)))
-svint64_t svrshr_n_s64_z(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s16_z)))
-svint16_t svrshr_n_s16_z(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u8_z)))
-svuint8_t svrshr_n_u8_z(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u32_z)))
-svuint32_t svrshr_n_u32_z(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u64_z)))
-svuint64_t svrshr_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u16_z)))
-svuint16_t svrshr_n_u16_z(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_u32)))
-svuint16_t svrshrnb_n_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_u64)))
-svuint32_t svrshrnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_u16)))
-svuint8_t svrshrnb_n_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_s32)))
-svint16_t svrshrnb_n_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_s64)))
-svint32_t svrshrnb_n_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_s16)))
-svint8_t svrshrnb_n_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_u32)))
-svuint16_t svrshrnt_n_u32(svuint16_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_u64)))
-svuint32_t svrshrnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_u16)))
-svuint8_t svrshrnt_n_u16(svuint8_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_s32)))
-svint16_t svrshrnt_n_s32(svint16_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_s64)))
-svint32_t svrshrnt_n_s64(svint32_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_s16)))
-svint8_t svrshrnt_n_s16(svint8_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_u32_m)))
-svuint32_t svrsqrte_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_u32_x)))
-svuint32_t svrsqrte_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_u32_z)))
-svuint32_t svrsqrte_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s8)))
-svint8_t svrsra_n_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s32)))
-svint32_t svrsra_n_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s64)))
-svint64_t svrsra_n_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s16)))
-svint16_t svrsra_n_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u8)))
-svuint8_t svrsra_n_u8(svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u32)))
-svuint32_t svrsra_n_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u64)))
-svuint64_t svrsra_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u16)))
-svuint16_t svrsra_n_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_u32)))
-svuint16_t svrsubhnb_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_u64)))
-svuint32_t svrsubhnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_u16)))
-svuint8_t svrsubhnb_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_s32)))
-svint16_t svrsubhnb_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_s64)))
-svint32_t svrsubhnb_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_s16)))
-svint8_t svrsubhnb_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_u32)))
-svuint16_t svrsubhnb_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_u64)))
-svuint32_t svrsubhnb_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_u16)))
-svuint8_t svrsubhnb_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_s32)))
-svint16_t svrsubhnb_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_s64)))
-svint32_t svrsubhnb_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_s16)))
-svint8_t svrsubhnb_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_u32)))
-svuint16_t svrsubhnt_n_u32(svuint16_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_u64)))
-svuint32_t svrsubhnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_u16)))
-svuint8_t svrsubhnt_n_u16(svuint8_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_s32)))
-svint16_t svrsubhnt_n_s32(svint16_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_s64)))
-svint32_t svrsubhnt_n_s64(svint32_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_s16)))
-svint8_t svrsubhnt_n_s16(svint8_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_u32)))
-svuint16_t svrsubhnt_u32(svuint16_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_u64)))
-svuint32_t svrsubhnt_u64(svuint32_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_u16)))
-svuint8_t svrsubhnt_u16(svuint8_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_s32)))
-svint16_t svrsubhnt_s32(svint16_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_s64)))
-svint32_t svrsubhnt_s64(svint32_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_s16)))
-svint8_t svrsubhnt_s16(svint8_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_n_u32)))
-svuint32_t svsbclb_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_n_u64)))
-svuint64_t svsbclb_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_u32)))
-svuint32_t svsbclb_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_u64)))
-svuint64_t svsbclb_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_n_u32)))
-svuint32_t svsbclt_n_u32(svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_n_u64)))
-svuint64_t svsbclt_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_u32)))
-svuint32_t svsbclt_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_u64)))
-svuint64_t svsbclt_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_s32)))
-svint32_t svshllb_n_s32(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_s64)))
-svint64_t svshllb_n_s64(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_s16)))
-svint16_t svshllb_n_s16(svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_u32)))
-svuint32_t svshllb_n_u32(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_u64)))
-svuint64_t svshllb_n_u64(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_u16)))
-svuint16_t svshllb_n_u16(svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_s32)))
-svint32_t svshllt_n_s32(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_s64)))
-svint64_t svshllt_n_s64(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_s16)))
-svint16_t svshllt_n_s16(svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_u32)))
-svuint32_t svshllt_n_u32(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_u64)))
-svuint64_t svshllt_n_u64(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_u16)))
-svuint16_t svshllt_n_u16(svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_u32)))
-svuint16_t svshrnb_n_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_u64)))
-svuint32_t svshrnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_u16)))
-svuint8_t svshrnb_n_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_s32)))
-svint16_t svshrnb_n_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_s64)))
-svint32_t svshrnb_n_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_s16)))
-svint8_t svshrnb_n_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_u32)))
-svuint16_t svshrnt_n_u32(svuint16_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_u64)))
-svuint32_t svshrnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_u16)))
-svuint8_t svshrnt_n_u16(svuint8_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_s32)))
-svint16_t svshrnt_n_s32(svint16_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_s64)))
-svint32_t svshrnt_n_s64(svint32_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_s16)))
-svint8_t svshrnt_n_s16(svint8_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u8)))
-svuint8_t svsli_n_u8(svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u32)))
-svuint32_t svsli_n_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u64)))
-svuint64_t svsli_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u16)))
-svuint16_t svsli_n_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s8)))
-svint8_t svsli_n_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s32)))
-svint32_t svsli_n_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s64)))
-svint64_t svsli_n_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s16)))
-svint16_t svsli_n_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u8_m)))
-svuint8_t svsqadd_n_u8_m(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u32_m)))
-svuint32_t svsqadd_n_u32_m(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u64_m)))
-svuint64_t svsqadd_n_u64_m(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u16_m)))
-svuint16_t svsqadd_n_u16_m(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u8_x)))
-svuint8_t svsqadd_n_u8_x(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u32_x)))
-svuint32_t svsqadd_n_u32_x(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u64_x)))
-svuint64_t svsqadd_n_u64_x(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u16_x)))
-svuint16_t svsqadd_n_u16_x(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u8_z)))
-svuint8_t svsqadd_n_u8_z(svbool_t, svuint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u32_z)))
-svuint32_t svsqadd_n_u32_z(svbool_t, svuint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u64_z)))
-svuint64_t svsqadd_n_u64_z(svbool_t, svuint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u16_z)))
-svuint16_t svsqadd_n_u16_z(svbool_t, svuint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u8_m)))
-svuint8_t svsqadd_u8_m(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u32_m)))
-svuint32_t svsqadd_u32_m(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u64_m)))
-svuint64_t svsqadd_u64_m(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u16_m)))
-svuint16_t svsqadd_u16_m(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u8_x)))
-svuint8_t svsqadd_u8_x(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u32_x)))
-svuint32_t svsqadd_u32_x(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u64_x)))
-svuint64_t svsqadd_u64_x(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u16_x)))
-svuint16_t svsqadd_u16_x(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u8_z)))
-svuint8_t svsqadd_u8_z(svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u32_z)))
-svuint32_t svsqadd_u32_z(svbool_t, svuint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u64_z)))
-svuint64_t svsqadd_u64_z(svbool_t, svuint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u16_z)))
-svuint16_t svsqadd_u16_z(svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s8)))
-svint8_t svsra_n_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s32)))
-svint32_t svsra_n_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s64)))
-svint64_t svsra_n_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s16)))
-svint16_t svsra_n_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u8)))
-svuint8_t svsra_n_u8(svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u32)))
-svuint32_t svsra_n_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u64)))
-svuint64_t svsra_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u16)))
-svuint16_t svsra_n_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u8)))
-svuint8_t svsri_n_u8(svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u32)))
-svuint32_t svsri_n_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u64)))
-svuint64_t svsri_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u16)))
-svuint16_t svsri_n_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s8)))
-svint8_t svsri_n_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s32)))
-svint32_t svsri_n_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s64)))
-svint64_t svsri_n_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s16)))
-svint16_t svsri_n_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_u32)))
-svuint16_t svsubhnb_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_u64)))
-svuint32_t svsubhnb_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_u16)))
-svuint8_t svsubhnb_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_s32)))
-svint16_t svsubhnb_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_s64)))
-svint32_t svsubhnb_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_s16)))
-svint8_t svsubhnb_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_u32)))
-svuint16_t svsubhnb_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_u64)))
-svuint32_t svsubhnb_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_u16)))
-svuint8_t svsubhnb_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_s32)))
-svint16_t svsubhnb_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_s64)))
-svint32_t svsubhnb_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_s16)))
-svint8_t svsubhnb_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_u32)))
-svuint16_t svsubhnt_n_u32(svuint16_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_u64)))
-svuint32_t svsubhnt_n_u64(svuint32_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_u16)))
-svuint8_t svsubhnt_n_u16(svuint8_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_s32)))
-svint16_t svsubhnt_n_s32(svint16_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_s64)))
-svint32_t svsubhnt_n_s64(svint32_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_s16)))
-svint8_t svsubhnt_n_s16(svint8_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_u32)))
-svuint16_t svsubhnt_u32(svuint16_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_u64)))
-svuint32_t svsubhnt_u64(svuint32_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_u16)))
-svuint8_t svsubhnt_u16(svuint8_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_s32)))
-svint16_t svsubhnt_s32(svint16_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_s64)))
-svint32_t svsubhnt_s64(svint32_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_s16)))
-svint8_t svsubhnt_s16(svint8_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_s32)))
-svint32_t svsublb_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_s64)))
-svint64_t svsublb_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_s16)))
-svint16_t svsublb_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_u32)))
-svuint32_t svsublb_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_u64)))
-svuint64_t svsublb_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_u16)))
-svuint16_t svsublb_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_s32)))
-svint32_t svsublb_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_s64)))
-svint64_t svsublb_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_s16)))
-svint16_t svsublb_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_u32)))
-svuint32_t svsublb_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_u64)))
-svuint64_t svsublb_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_u16)))
-svuint16_t svsublb_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_n_s32)))
-svint32_t svsublbt_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_n_s64)))
-svint64_t svsublbt_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_n_s16)))
-svint16_t svsublbt_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_s32)))
-svint32_t svsublbt_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_s64)))
-svint64_t svsublbt_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_s16)))
-svint16_t svsublbt_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_s32)))
-svint32_t svsublt_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_s64)))
-svint64_t svsublt_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_s16)))
-svint16_t svsublt_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_u32)))
-svuint32_t svsublt_n_u32(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_u64)))
-svuint64_t svsublt_n_u64(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_u16)))
-svuint16_t svsublt_n_u16(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_s32)))
-svint32_t svsublt_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_s64)))
-svint64_t svsublt_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_s16)))
-svint16_t svsublt_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_u32)))
-svuint32_t svsublt_u32(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_u64)))
-svuint64_t svsublt_u64(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_u16)))
-svuint16_t svsublt_u16(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_n_s32)))
-svint32_t svsubltb_n_s32(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_n_s64)))
-svint64_t svsubltb_n_s64(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_n_s16)))
-svint16_t svsubltb_n_s16(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_s32)))
-svint32_t svsubltb_s32(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_s64)))
-svint64_t svsubltb_s64(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_s16)))
-svint16_t svsubltb_s16(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_s32)))
-svint32_t svsubwb_n_s32(svint32_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_s64)))
-svint64_t svsubwb_n_s64(svint64_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_s16)))
-svint16_t svsubwb_n_s16(svint16_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_u32)))
-svuint32_t svsubwb_n_u32(svuint32_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_u64)))
-svuint64_t svsubwb_n_u64(svuint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_u16)))
-svuint16_t svsubwb_n_u16(svuint16_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_s32)))
-svint32_t svsubwb_s32(svint32_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_s64)))
-svint64_t svsubwb_s64(svint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_s16)))
-svint16_t svsubwb_s16(svint16_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_u32)))
-svuint32_t svsubwb_u32(svuint32_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_u64)))
-svuint64_t svsubwb_u64(svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_u16)))
-svuint16_t svsubwb_u16(svuint16_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_s32)))
-svint32_t svsubwt_n_s32(svint32_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_s64)))
-svint64_t svsubwt_n_s64(svint64_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_s16)))
-svint16_t svsubwt_n_s16(svint16_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_u32)))
-svuint32_t svsubwt_n_u32(svuint32_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_u64)))
-svuint64_t svsubwt_n_u64(svuint64_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_u16)))
-svuint16_t svsubwt_n_u16(svuint16_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_s32)))
-svint32_t svsubwt_s32(svint32_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_s64)))
-svint64_t svsubwt_s64(svint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_s16)))
-svint16_t svsubwt_s16(svint16_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_u32)))
-svuint32_t svsubwt_u32(svuint32_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_u64)))
-svuint64_t svsubwt_u64(svuint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_u16)))
-svuint16_t svsubwt_u16(svuint16_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u8)))
-svuint8_t svtbl2_u8(svuint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u32)))
-svuint32_t svtbl2_u32(svuint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u64)))
-svuint64_t svtbl2_u64(svuint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u16)))
-svuint16_t svtbl2_u16(svuint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s8)))
-svint8_t svtbl2_s8(svint8x2_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_f64)))
-svfloat64_t svtbl2_f64(svfloat64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_f32)))
-svfloat32_t svtbl2_f32(svfloat32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_f16)))
-svfloat16_t svtbl2_f16(svfloat16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s32)))
-svint32_t svtbl2_s32(svint32x2_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s64)))
-svint64_t svtbl2_s64(svint64x2_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s16)))
-svint16_t svtbl2_s16(svint16x2_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u8)))
-svuint8_t svtbx_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u32)))
-svuint32_t svtbx_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u64)))
-svuint64_t svtbx_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u16)))
-svuint16_t svtbx_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s8)))
-svint8_t svtbx_s8(svint8_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_f64)))
-svfloat64_t svtbx_f64(svfloat64_t, svfloat64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_f32)))
-svfloat32_t svtbx_f32(svfloat32_t, svfloat32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_f16)))
-svfloat16_t svtbx_f16(svfloat16_t, svfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s32)))
-svint32_t svtbx_s32(svint32_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s64)))
-svint64_t svtbx_s64(svint64_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s16)))
-svint16_t svtbx_s16(svint16_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s8_m)))
-svint8_t svuqadd_n_s8_m(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s32_m)))
-svint32_t svuqadd_n_s32_m(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s64_m)))
-svint64_t svuqadd_n_s64_m(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s16_m)))
-svint16_t svuqadd_n_s16_m(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s8_x)))
-svint8_t svuqadd_n_s8_x(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s32_x)))
-svint32_t svuqadd_n_s32_x(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s64_x)))
-svint64_t svuqadd_n_s64_x(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s16_x)))
-svint16_t svuqadd_n_s16_x(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s8_z)))
-svint8_t svuqadd_n_s8_z(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s32_z)))
-svint32_t svuqadd_n_s32_z(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s64_z)))
-svint64_t svuqadd_n_s64_z(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s16_z)))
-svint16_t svuqadd_n_s16_z(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s8_m)))
-svint8_t svuqadd_s8_m(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s32_m)))
-svint32_t svuqadd_s32_m(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s64_m)))
-svint64_t svuqadd_s64_m(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s16_m)))
-svint16_t svuqadd_s16_m(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s8_x)))
-svint8_t svuqadd_s8_x(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s32_x)))
-svint32_t svuqadd_s32_x(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s64_x)))
-svint64_t svuqadd_s64_x(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s16_x)))
-svint16_t svuqadd_s16_x(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s8_z)))
-svint8_t svuqadd_s8_z(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s32_z)))
-svint32_t svuqadd_s32_z(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s64_z)))
-svint64_t svuqadd_s64_z(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s16_z)))
-svint16_t svuqadd_s16_z(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_s32)))
-svbool_t svwhilege_b8_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_s32)))
-svbool_t svwhilege_b32_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_s32)))
-svbool_t svwhilege_b64_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_s32)))
-svbool_t svwhilege_b16_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_s64)))
-svbool_t svwhilege_b8_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_s64)))
-svbool_t svwhilege_b32_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_s64)))
-svbool_t svwhilege_b64_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_s64)))
-svbool_t svwhilege_b16_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_u32)))
-svbool_t svwhilege_b8_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_u32)))
-svbool_t svwhilege_b32_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_u32)))
-svbool_t svwhilege_b64_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_u32)))
-svbool_t svwhilege_b16_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_u64)))
-svbool_t svwhilege_b8_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_u64)))
-svbool_t svwhilege_b32_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_u64)))
-svbool_t svwhilege_b64_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_u64)))
-svbool_t svwhilege_b16_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_s32)))
-svbool_t svwhilegt_b8_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_s32)))
-svbool_t svwhilegt_b32_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_s32)))
-svbool_t svwhilegt_b64_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_s32)))
-svbool_t svwhilegt_b16_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_s64)))
-svbool_t svwhilegt_b8_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_s64)))
-svbool_t svwhilegt_b32_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_s64)))
-svbool_t svwhilegt_b64_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_s64)))
-svbool_t svwhilegt_b16_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_u32)))
-svbool_t svwhilegt_b8_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_u32)))
-svbool_t svwhilegt_b32_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_u32)))
-svbool_t svwhilegt_b64_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_u32)))
-svbool_t svwhilegt_b16_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_u64)))
-svbool_t svwhilegt_b8_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_u64)))
-svbool_t svwhilegt_b32_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_u64)))
-svbool_t svwhilegt_b64_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_u64)))
-svbool_t svwhilegt_b16_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u8)))
-svbool_t svwhilerw_u8(uint8_t const *, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s8)))
-svbool_t svwhilerw_s8(int8_t const *, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u64)))
-svbool_t svwhilerw_u64(uint64_t const *, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_f64)))
-svbool_t svwhilerw_f64(float64_t const *, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s64)))
-svbool_t svwhilerw_s64(int64_t const *, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u16)))
-svbool_t svwhilerw_u16(uint16_t const *, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_f16)))
-svbool_t svwhilerw_f16(float16_t const *, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s16)))
-svbool_t svwhilerw_s16(int16_t const *, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u32)))
-svbool_t svwhilerw_u32(uint32_t const *, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_f32)))
-svbool_t svwhilerw_f32(float32_t const *, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s32)))
-svbool_t svwhilerw_s32(int32_t const *, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u8)))
-svbool_t svwhilewr_u8(uint8_t const *, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s8)))
-svbool_t svwhilewr_s8(int8_t const *, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u64)))
-svbool_t svwhilewr_u64(uint64_t const *, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_f64)))
-svbool_t svwhilewr_f64(float64_t const *, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s64)))
-svbool_t svwhilewr_s64(int64_t const *, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u16)))
-svbool_t svwhilewr_u16(uint16_t const *, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_f16)))
-svbool_t svwhilewr_f16(float16_t const *, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s16)))
-svbool_t svwhilewr_s16(int16_t const *, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u32)))
-svbool_t svwhilewr_u32(uint32_t const *, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_f32)))
-svbool_t svwhilewr_f32(float32_t const *, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s32)))
-svbool_t svwhilewr_s32(int32_t const *, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u8)))
-svuint8_t svxar_n_u8(svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u32)))
-svuint32_t svxar_n_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u64)))
-svuint64_t svxar_n_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u16)))
-svuint16_t svxar_n_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s8)))
-svint8_t svxar_n_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s32)))
-svint32_t svxar_n_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s64)))
-svint64_t svxar_n_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s16)))
-svint16_t svxar_n_s16(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s8)))
-svint8_t svaba(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s32)))
-svint32_t svaba(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s64)))
-svint64_t svaba(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_s16)))
-svint16_t svaba(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u8)))
-svuint8_t svaba(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u32)))
-svuint32_t svaba(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u64)))
-svuint64_t svaba(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_n_u16)))
-svuint16_t svaba(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s8)))
-svint8_t svaba(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s32)))
-svint32_t svaba(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s64)))
-svint64_t svaba(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_s16)))
-svint16_t svaba(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u8)))
-svuint8_t svaba(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u32)))
-svuint32_t svaba(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u64)))
-svuint64_t svaba(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaba_u16)))
-svuint16_t svaba(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_s32)))
-svint32_t svabalb(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_s64)))
-svint64_t svabalb(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_s16)))
-svint16_t svabalb(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_u32)))
-svuint32_t svabalb(svuint32_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_u64)))
-svuint64_t svabalb(svuint64_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_n_u16)))
-svuint16_t svabalb(svuint16_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_s32)))
-svint32_t svabalb(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_s64)))
-svint64_t svabalb(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_s16)))
-svint16_t svabalb(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_u32)))
-svuint32_t svabalb(svuint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_u64)))
-svuint64_t svabalb(svuint64_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalb_u16)))
-svuint16_t svabalb(svuint16_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_s32)))
-svint32_t svabalt(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_s64)))
-svint64_t svabalt(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_s16)))
-svint16_t svabalt(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_u32)))
-svuint32_t svabalt(svuint32_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_u64)))
-svuint64_t svabalt(svuint64_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_n_u16)))
-svuint16_t svabalt(svuint16_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_s32)))
-svint32_t svabalt(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_s64)))
-svint64_t svabalt(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_s16)))
-svint16_t svabalt(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_u32)))
-svuint32_t svabalt(svuint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_u64)))
-svuint64_t svabalt(svuint64_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabalt_u16)))
-svuint16_t svabalt(svuint16_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_s32)))
-svint32_t svabdlb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_s64)))
-svint64_t svabdlb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_s16)))
-svint16_t svabdlb(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_u32)))
-svuint32_t svabdlb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_u64)))
-svuint64_t svabdlb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_n_u16)))
-svuint16_t svabdlb(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_s32)))
-svint32_t svabdlb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_s64)))
-svint64_t svabdlb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_s16)))
-svint16_t svabdlb(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_u32)))
-svuint32_t svabdlb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_u64)))
-svuint64_t svabdlb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlb_u16)))
-svuint16_t svabdlb(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_s32)))
-svint32_t svabdlt(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_s64)))
-svint64_t svabdlt(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_s16)))
-svint16_t svabdlt(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_u32)))
-svuint32_t svabdlt(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_u64)))
-svuint64_t svabdlt(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_n_u16)))
-svuint16_t svabdlt(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_s32)))
-svint32_t svabdlt(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_s64)))
-svint64_t svabdlt(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_s16)))
-svint16_t svabdlt(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_u32)))
-svuint32_t svabdlt(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_u64)))
-svuint64_t svabdlt(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabdlt_u16)))
-svuint16_t svabdlt(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s32_m)))
-svint32_t svadalp_m(svbool_t, svint32_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s64_m)))
-svint64_t svadalp_m(svbool_t, svint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s16_m)))
-svint16_t svadalp_m(svbool_t, svint16_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s32_x)))
-svint32_t svadalp_x(svbool_t, svint32_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s64_x)))
-svint64_t svadalp_x(svbool_t, svint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s16_x)))
-svint16_t svadalp_x(svbool_t, svint16_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s32_z)))
-svint32_t svadalp_z(svbool_t, svint32_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s64_z)))
-svint64_t svadalp_z(svbool_t, svint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_s16_z)))
-svint16_t svadalp_z(svbool_t, svint16_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u32_m)))
-svuint32_t svadalp_m(svbool_t, svuint32_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u64_m)))
-svuint64_t svadalp_m(svbool_t, svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u16_m)))
-svuint16_t svadalp_m(svbool_t, svuint16_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u32_x)))
-svuint32_t svadalp_x(svbool_t, svuint32_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u64_x)))
-svuint64_t svadalp_x(svbool_t, svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u16_x)))
-svuint16_t svadalp_x(svbool_t, svuint16_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u32_z)))
-svuint32_t svadalp_z(svbool_t, svuint32_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u64_z)))
-svuint64_t svadalp_z(svbool_t, svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadalp_u16_z)))
-svuint16_t svadalp_z(svbool_t, svuint16_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_n_u32)))
-svuint32_t svadclb(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_n_u64)))
-svuint64_t svadclb(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_u32)))
-svuint32_t svadclb(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclb_u64)))
-svuint64_t svadclb(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_n_u32)))
-svuint32_t svadclt(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_n_u64)))
-svuint64_t svadclt(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_u32)))
-svuint32_t svadclt(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadclt_u64)))
-svuint64_t svadclt(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_u32)))
-svuint16_t svaddhnb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_u64)))
-svuint32_t svaddhnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_u16)))
-svuint8_t svaddhnb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_s32)))
-svint16_t svaddhnb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_s64)))
-svint32_t svaddhnb(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_n_s16)))
-svint8_t svaddhnb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_u32)))
-svuint16_t svaddhnb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_u64)))
-svuint32_t svaddhnb(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_u16)))
-svuint8_t svaddhnb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_s32)))
-svint16_t svaddhnb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_s64)))
-svint32_t svaddhnb(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnb_s16)))
-svint8_t svaddhnb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_u32)))
-svuint16_t svaddhnt(svuint16_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_u64)))
-svuint32_t svaddhnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_u16)))
-svuint8_t svaddhnt(svuint8_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_s32)))
-svint16_t svaddhnt(svint16_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_s64)))
-svint32_t svaddhnt(svint32_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_n_s16)))
-svint8_t svaddhnt(svint8_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_u32)))
-svuint16_t svaddhnt(svuint16_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_u64)))
-svuint32_t svaddhnt(svuint32_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_u16)))
-svuint8_t svaddhnt(svuint8_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_s32)))
-svint16_t svaddhnt(svint16_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_s64)))
-svint32_t svaddhnt(svint32_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddhnt_s16)))
-svint8_t svaddhnt(svint8_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_s32)))
-svint32_t svaddlb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_s64)))
-svint64_t svaddlb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_s16)))
-svint16_t svaddlb(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_u32)))
-svuint32_t svaddlb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_u64)))
-svuint64_t svaddlb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_n_u16)))
-svuint16_t svaddlb(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_s32)))
-svint32_t svaddlb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_s64)))
-svint64_t svaddlb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_s16)))
-svint16_t svaddlb(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_u32)))
-svuint32_t svaddlb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_u64)))
-svuint64_t svaddlb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlb_u16)))
-svuint16_t svaddlb(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_n_s32)))
-svint32_t svaddlbt(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_n_s64)))
-svint64_t svaddlbt(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_n_s16)))
-svint16_t svaddlbt(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_s32)))
-svint32_t svaddlbt(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_s64)))
-svint64_t svaddlbt(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlbt_s16)))
-svint16_t svaddlbt(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_s32)))
-svint32_t svaddlt(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_s64)))
-svint64_t svaddlt(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_s16)))
-svint16_t svaddlt(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_u32)))
-svuint32_t svaddlt(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_u64)))
-svuint64_t svaddlt(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_n_u16)))
-svuint16_t svaddlt(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_s32)))
-svint32_t svaddlt(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_s64)))
-svint64_t svaddlt(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_s16)))
-svint16_t svaddlt(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_u32)))
-svuint32_t svaddlt(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_u64)))
-svuint64_t svaddlt(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddlt_u16)))
-svuint16_t svaddlt(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f64_m)))
-svfloat64_t svaddp_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f32_m)))
-svfloat32_t svaddp_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f16_m)))
-svfloat16_t svaddp_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f64_x)))
-svfloat64_t svaddp_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f32_x)))
-svfloat32_t svaddp_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_f16_x)))
-svfloat16_t svaddp_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u8_m)))
-svuint8_t svaddp_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u32_m)))
-svuint32_t svaddp_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u64_m)))
-svuint64_t svaddp_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u16_m)))
-svuint16_t svaddp_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s8_m)))
-svint8_t svaddp_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s32_m)))
-svint32_t svaddp_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s64_m)))
-svint64_t svaddp_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s16_m)))
-svint16_t svaddp_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u8_x)))
-svuint8_t svaddp_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u32_x)))
-svuint32_t svaddp_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u64_x)))
-svuint64_t svaddp_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_u16_x)))
-svuint16_t svaddp_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s8_x)))
-svint8_t svaddp_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s32_x)))
-svint32_t svaddp_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s64_x)))
-svint64_t svaddp_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddp_s16_x)))
-svint16_t svaddp_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_s32)))
-svint32_t svaddwb(svint32_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_s64)))
-svint64_t svaddwb(svint64_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_s16)))
-svint16_t svaddwb(svint16_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_u32)))
-svuint32_t svaddwb(svuint32_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_u64)))
-svuint64_t svaddwb(svuint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_n_u16)))
-svuint16_t svaddwb(svuint16_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_s32)))
-svint32_t svaddwb(svint32_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_s64)))
-svint64_t svaddwb(svint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_s16)))
-svint16_t svaddwb(svint16_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_u32)))
-svuint32_t svaddwb(svuint32_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_u64)))
-svuint64_t svaddwb(svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwb_u16)))
-svuint16_t svaddwb(svuint16_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_s32)))
-svint32_t svaddwt(svint32_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_s64)))
-svint64_t svaddwt(svint64_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_s16)))
-svint16_t svaddwt(svint16_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_u32)))
-svuint32_t svaddwt(svuint32_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_u64)))
-svuint64_t svaddwt(svuint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_n_u16)))
-svuint16_t svaddwt(svuint16_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_s32)))
-svint32_t svaddwt(svint32_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_s64)))
-svint64_t svaddwt(svint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_s16)))
-svint16_t svaddwt(svint16_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_u32)))
-svuint32_t svaddwt(svuint32_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_u64)))
-svuint64_t svaddwt(svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddwt_u16)))
-svuint16_t svaddwt(svuint16_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u8)))
-svuint8_t svbcax(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u32)))
-svuint32_t svbcax(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u64)))
-svuint64_t svbcax(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_u16)))
-svuint16_t svbcax(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s8)))
-svint8_t svbcax(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s32)))
-svint32_t svbcax(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s64)))
-svint64_t svbcax(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_n_s16)))
-svint16_t svbcax(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u8)))
-svuint8_t svbcax(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u32)))
-svuint32_t svbcax(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u64)))
-svuint64_t svbcax(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_u16)))
-svuint16_t svbcax(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s8)))
-svint8_t svbcax(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s32)))
-svint32_t svbcax(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s64)))
-svint64_t svbcax(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbcax_s16)))
-svint16_t svbcax(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u8)))
-svuint8_t svbsl1n(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u32)))
-svuint32_t svbsl1n(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u64)))
-svuint64_t svbsl1n(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_u16)))
-svuint16_t svbsl1n(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s8)))
-svint8_t svbsl1n(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s32)))
-svint32_t svbsl1n(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s64)))
-svint64_t svbsl1n(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_n_s16)))
-svint16_t svbsl1n(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u8)))
-svuint8_t svbsl1n(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u32)))
-svuint32_t svbsl1n(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u64)))
-svuint64_t svbsl1n(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_u16)))
-svuint16_t svbsl1n(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s8)))
-svint8_t svbsl1n(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s32)))
-svint32_t svbsl1n(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s64)))
-svint64_t svbsl1n(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl1n_s16)))
-svint16_t svbsl1n(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u8)))
-svuint8_t svbsl2n(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u32)))
-svuint32_t svbsl2n(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u64)))
-svuint64_t svbsl2n(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_u16)))
-svuint16_t svbsl2n(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s8)))
-svint8_t svbsl2n(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s32)))
-svint32_t svbsl2n(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s64)))
-svint64_t svbsl2n(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_n_s16)))
-svint16_t svbsl2n(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u8)))
-svuint8_t svbsl2n(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u32)))
-svuint32_t svbsl2n(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u64)))
-svuint64_t svbsl2n(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_u16)))
-svuint16_t svbsl2n(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s8)))
-svint8_t svbsl2n(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s32)))
-svint32_t svbsl2n(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s64)))
-svint64_t svbsl2n(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl2n_s16)))
-svint16_t svbsl2n(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u8)))
-svuint8_t svbsl(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u32)))
-svuint32_t svbsl(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u64)))
-svuint64_t svbsl(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_u16)))
-svuint16_t svbsl(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s8)))
-svint8_t svbsl(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s32)))
-svint32_t svbsl(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s64)))
-svint64_t svbsl(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_n_s16)))
-svint16_t svbsl(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u8)))
-svuint8_t svbsl(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u32)))
-svuint32_t svbsl(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u64)))
-svuint64_t svbsl(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_u16)))
-svuint16_t svbsl(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s8)))
-svint8_t svbsl(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s32)))
-svint32_t svbsl(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s64)))
-svint64_t svbsl(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbsl_s16)))
-svint16_t svbsl(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u8)))
-svuint8_t svcadd(svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u32)))
-svuint32_t svcadd(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u64)))
-svuint64_t svcadd(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_u16)))
-svuint16_t svcadd(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s8)))
-svint8_t svcadd(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s32)))
-svint32_t svcadd(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s64)))
-svint64_t svcadd(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_s16)))
-svint16_t svcadd(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_s32)))
-svint32_t svcdot(svint32_t, svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_s64)))
-svint64_t svcdot(svint64_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_lane_s32)))
-svint32_t svcdot_lane(svint32_t, svint8_t, svint8_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcdot_lane_s64)))
-svint64_t svcdot_lane(svint64_t, svint16_t, svint16_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u8)))
-svuint8_t svcmla(svuint8_t, svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u32)))
-svuint32_t svcmla(svuint32_t, svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u64)))
-svuint64_t svcmla(svuint64_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_u16)))
-svuint16_t svcmla(svuint16_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s8)))
-svint8_t svcmla(svint8_t, svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s32)))
-svint32_t svcmla(svint32_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s64)))
-svint64_t svcmla(svint64_t, svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_s16)))
-svint16_t svcmla(svint16_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_u32)))
-svuint32_t svcmla_lane(svuint32_t, svuint32_t, svuint32_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_u16)))
-svuint16_t svcmla_lane(svuint16_t, svuint16_t, svuint16_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_s32)))
-svint32_t svcmla_lane(svint32_t, svint32_t, svint32_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_s16)))
-svint16_t svcmla_lane(svint16_t, svint16_t, svint16_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f32_f16_m)))
-svfloat32_t svcvtlt_f32_m(svfloat32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f32_f16_x)))
-svfloat32_t svcvtlt_f32_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f64_f32_m)))
-svfloat64_t svcvtlt_f64_m(svfloat64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt_f64_f32_x)))
-svfloat64_t svcvtlt_f64_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_f16_f32_m)))
-svfloat16_t svcvtnt_f16_m(svfloat16_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_f32_f64_m)))
-svfloat32_t svcvtnt_f32_m(svfloat32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtx_f32_f64_m)))
-svfloat32_t svcvtx_f32_m(svfloat32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtx_f32_f64_x)))
-svfloat32_t svcvtx_f32_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtx_f32_f64_z)))
-svfloat32_t svcvtx_f32_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtxnt_f32_f64_m)))
-svfloat32_t svcvtxnt_f32_m(svfloat32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u8)))
-svuint8_t sveor3(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u32)))
-svuint32_t sveor3(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u64)))
-svuint64_t sveor3(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_u16)))
-svuint16_t sveor3(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s8)))
-svint8_t sveor3(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s32)))
-svint32_t sveor3(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s64)))
-svint64_t sveor3(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_n_s16)))
-svint16_t sveor3(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u8)))
-svuint8_t sveor3(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u32)))
-svuint32_t sveor3(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u64)))
-svuint64_t sveor3(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_u16)))
-svuint16_t sveor3(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s8)))
-svint8_t sveor3(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s32)))
-svint32_t sveor3(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s64)))
-svint64_t sveor3(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor3_s16)))
-svint16_t sveor3(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u8)))
-svuint8_t sveorbt(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u32)))
-svuint32_t sveorbt(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u64)))
-svuint64_t sveorbt(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_u16)))
-svuint16_t sveorbt(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s8)))
-svint8_t sveorbt(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s32)))
-svint32_t sveorbt(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s64)))
-svint64_t sveorbt(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_n_s16)))
-svint16_t sveorbt(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u8)))
-svuint8_t sveorbt(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u32)))
-svuint32_t sveorbt(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u64)))
-svuint64_t sveorbt(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_u16)))
-svuint16_t sveorbt(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s8)))
-svint8_t sveorbt(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s32)))
-svint32_t sveorbt(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s64)))
-svint64_t sveorbt(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorbt_s16)))
-svint16_t sveorbt(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u8)))
-svuint8_t sveortb(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u32)))
-svuint32_t sveortb(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u64)))
-svuint64_t sveortb(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_u16)))
-svuint16_t sveortb(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s8)))
-svint8_t sveortb(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s32)))
-svint32_t sveortb(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s64)))
-svint64_t sveortb(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_n_s16)))
-svint16_t sveortb(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u8)))
-svuint8_t sveortb(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u32)))
-svuint32_t sveortb(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u64)))
-svuint64_t sveortb(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_u16)))
-svuint16_t sveortb(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s8)))
-svint8_t sveortb(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s32)))
-svint32_t sveortb(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s64)))
-svint64_t sveortb(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveortb_s16)))
-svint16_t sveortb(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s8_m)))
-svint8_t svhadd_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s32_m)))
-svint32_t svhadd_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s64_m)))
-svint64_t svhadd_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s16_m)))
-svint16_t svhadd_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s8_x)))
-svint8_t svhadd_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s32_x)))
-svint32_t svhadd_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s64_x)))
-svint64_t svhadd_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s16_x)))
-svint16_t svhadd_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s8_z)))
-svint8_t svhadd_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s32_z)))
-svint32_t svhadd_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s64_z)))
-svint64_t svhadd_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_s16_z)))
-svint16_t svhadd_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u8_m)))
-svuint8_t svhadd_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u32_m)))
-svuint32_t svhadd_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u64_m)))
-svuint64_t svhadd_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u16_m)))
-svuint16_t svhadd_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u8_x)))
-svuint8_t svhadd_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u32_x)))
-svuint32_t svhadd_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u64_x)))
-svuint64_t svhadd_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u16_x)))
-svuint16_t svhadd_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u8_z)))
-svuint8_t svhadd_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u32_z)))
-svuint32_t svhadd_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u64_z)))
-svuint64_t svhadd_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_n_u16_z)))
-svuint16_t svhadd_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s8_m)))
-svint8_t svhadd_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s32_m)))
-svint32_t svhadd_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s64_m)))
-svint64_t svhadd_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s16_m)))
-svint16_t svhadd_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s8_x)))
-svint8_t svhadd_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s32_x)))
-svint32_t svhadd_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s64_x)))
-svint64_t svhadd_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s16_x)))
-svint16_t svhadd_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s8_z)))
-svint8_t svhadd_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s32_z)))
-svint32_t svhadd_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s64_z)))
-svint64_t svhadd_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_s16_z)))
-svint16_t svhadd_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u8_m)))
-svuint8_t svhadd_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u32_m)))
-svuint32_t svhadd_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u64_m)))
-svuint64_t svhadd_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u16_m)))
-svuint16_t svhadd_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u8_x)))
-svuint8_t svhadd_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u32_x)))
-svuint32_t svhadd_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u64_x)))
-svuint64_t svhadd_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u16_x)))
-svuint16_t svhadd_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u8_z)))
-svuint8_t svhadd_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u32_z)))
-svuint32_t svhadd_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u64_z)))
-svuint64_t svhadd_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhadd_u16_z)))
-svuint16_t svhadd_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s8_m)))
-svint8_t svhsub_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s32_m)))
-svint32_t svhsub_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s64_m)))
-svint64_t svhsub_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s16_m)))
-svint16_t svhsub_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s8_x)))
-svint8_t svhsub_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s32_x)))
-svint32_t svhsub_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s64_x)))
-svint64_t svhsub_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s16_x)))
-svint16_t svhsub_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s8_z)))
-svint8_t svhsub_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s32_z)))
-svint32_t svhsub_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s64_z)))
-svint64_t svhsub_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_s16_z)))
-svint16_t svhsub_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u8_m)))
-svuint8_t svhsub_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u32_m)))
-svuint32_t svhsub_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u64_m)))
-svuint64_t svhsub_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u16_m)))
-svuint16_t svhsub_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u8_x)))
-svuint8_t svhsub_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u32_x)))
-svuint32_t svhsub_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u64_x)))
-svuint64_t svhsub_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u16_x)))
-svuint16_t svhsub_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u8_z)))
-svuint8_t svhsub_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u32_z)))
-svuint32_t svhsub_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u64_z)))
-svuint64_t svhsub_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_n_u16_z)))
-svuint16_t svhsub_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s8_m)))
-svint8_t svhsub_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s32_m)))
-svint32_t svhsub_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s64_m)))
-svint64_t svhsub_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s16_m)))
-svint16_t svhsub_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s8_x)))
-svint8_t svhsub_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s32_x)))
-svint32_t svhsub_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s64_x)))
-svint64_t svhsub_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s16_x)))
-svint16_t svhsub_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s8_z)))
-svint8_t svhsub_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s32_z)))
-svint32_t svhsub_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s64_z)))
-svint64_t svhsub_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_s16_z)))
-svint16_t svhsub_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u8_m)))
-svuint8_t svhsub_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u32_m)))
-svuint32_t svhsub_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u64_m)))
-svuint64_t svhsub_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u16_m)))
-svuint16_t svhsub_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u8_x)))
-svuint8_t svhsub_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u32_x)))
-svuint32_t svhsub_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u64_x)))
-svuint64_t svhsub_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u16_x)))
-svuint16_t svhsub_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u8_z)))
-svuint8_t svhsub_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u32_z)))
-svuint32_t svhsub_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u64_z)))
-svuint64_t svhsub_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsub_u16_z)))
-svuint16_t svhsub_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s8_m)))
-svint8_t svhsubr_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s32_m)))
-svint32_t svhsubr_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s64_m)))
-svint64_t svhsubr_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s16_m)))
-svint16_t svhsubr_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s8_x)))
-svint8_t svhsubr_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s32_x)))
-svint32_t svhsubr_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s64_x)))
-svint64_t svhsubr_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s16_x)))
-svint16_t svhsubr_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s8_z)))
-svint8_t svhsubr_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s32_z)))
-svint32_t svhsubr_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s64_z)))
-svint64_t svhsubr_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_s16_z)))
-svint16_t svhsubr_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u8_m)))
-svuint8_t svhsubr_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u32_m)))
-svuint32_t svhsubr_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u64_m)))
-svuint64_t svhsubr_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u16_m)))
-svuint16_t svhsubr_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u8_x)))
-svuint8_t svhsubr_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u32_x)))
-svuint32_t svhsubr_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u64_x)))
-svuint64_t svhsubr_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u16_x)))
-svuint16_t svhsubr_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u8_z)))
-svuint8_t svhsubr_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u32_z)))
-svuint32_t svhsubr_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u64_z)))
-svuint64_t svhsubr_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_n_u16_z)))
-svuint16_t svhsubr_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s8_m)))
-svint8_t svhsubr_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s32_m)))
-svint32_t svhsubr_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s64_m)))
-svint64_t svhsubr_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s16_m)))
-svint16_t svhsubr_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s8_x)))
-svint8_t svhsubr_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s32_x)))
-svint32_t svhsubr_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s64_x)))
-svint64_t svhsubr_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s16_x)))
-svint16_t svhsubr_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s8_z)))
-svint8_t svhsubr_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s32_z)))
-svint32_t svhsubr_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s64_z)))
-svint64_t svhsubr_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_s16_z)))
-svint16_t svhsubr_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u8_m)))
-svuint8_t svhsubr_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u32_m)))
-svuint32_t svhsubr_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u64_m)))
-svuint64_t svhsubr_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u16_m)))
-svuint16_t svhsubr_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u8_x)))
-svuint8_t svhsubr_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u32_x)))
-svuint32_t svhsubr_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u64_x)))
-svuint64_t svhsubr_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u16_x)))
-svuint16_t svhsubr_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u8_z)))
-svuint8_t svhsubr_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u32_z)))
-svuint32_t svhsubr_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u64_z)))
-svuint64_t svhsubr_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svhsubr_u16_z)))
-svuint16_t svhsubr_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f64_m)))
-svint64_t svlogb_m(svint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f32_m)))
-svint32_t svlogb_m(svint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f16_m)))
-svint16_t svlogb_m(svint16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f64_x)))
-svint64_t svlogb_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f32_x)))
-svint32_t svlogb_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f16_x)))
-svint16_t svlogb_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f64_z)))
-svint64_t svlogb_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f32_z)))
-svint32_t svlogb_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlogb_f16_z)))
-svint16_t svlogb_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f64_m)))
-svfloat64_t svmaxnmp_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f32_m)))
-svfloat32_t svmaxnmp_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f16_m)))
-svfloat16_t svmaxnmp_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f64_x)))
-svfloat64_t svmaxnmp_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f32_x)))
-svfloat32_t svmaxnmp_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmp_f16_x)))
-svfloat16_t svmaxnmp_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f64_m)))
-svfloat64_t svmaxp_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f32_m)))
-svfloat32_t svmaxp_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f16_m)))
-svfloat16_t svmaxp_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f64_x)))
-svfloat64_t svmaxp_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f32_x)))
-svfloat32_t svmaxp_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_f16_x)))
-svfloat16_t svmaxp_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s8_m)))
-svint8_t svmaxp_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s32_m)))
-svint32_t svmaxp_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s64_m)))
-svint64_t svmaxp_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s16_m)))
-svint16_t svmaxp_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s8_x)))
-svint8_t svmaxp_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s32_x)))
-svint32_t svmaxp_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s64_x)))
-svint64_t svmaxp_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_s16_x)))
-svint16_t svmaxp_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u8_m)))
-svuint8_t svmaxp_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u32_m)))
-svuint32_t svmaxp_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u64_m)))
-svuint64_t svmaxp_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u16_m)))
-svuint16_t svmaxp_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u8_x)))
-svuint8_t svmaxp_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u32_x)))
-svuint32_t svmaxp_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u64_x)))
-svuint64_t svmaxp_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxp_u16_x)))
-svuint16_t svmaxp_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f64_m)))
-svfloat64_t svminnmp_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f32_m)))
-svfloat32_t svminnmp_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f16_m)))
-svfloat16_t svminnmp_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f64_x)))
-svfloat64_t svminnmp_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f32_x)))
-svfloat32_t svminnmp_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmp_f16_x)))
-svfloat16_t svminnmp_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f64_m)))
-svfloat64_t svminp_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f32_m)))
-svfloat32_t svminp_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f16_m)))
-svfloat16_t svminp_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f64_x)))
-svfloat64_t svminp_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f32_x)))
-svfloat32_t svminp_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_f16_x)))
-svfloat16_t svminp_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s8_m)))
-svint8_t svminp_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s32_m)))
-svint32_t svminp_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s64_m)))
-svint64_t svminp_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s16_m)))
-svint16_t svminp_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s8_x)))
-svint8_t svminp_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s32_x)))
-svint32_t svminp_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s64_x)))
-svint64_t svminp_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_s16_x)))
-svint16_t svminp_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u8_m)))
-svuint8_t svminp_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u32_m)))
-svuint32_t svminp_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u64_m)))
-svuint64_t svminp_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u16_m)))
-svuint16_t svminp_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u8_x)))
-svuint8_t svminp_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u32_x)))
-svuint32_t svminp_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u64_x)))
-svuint64_t svminp_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminp_u16_x)))
-svuint16_t svminp_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_u32)))
-svuint32_t svmla_lane(svuint32_t, svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_u64)))
-svuint64_t svmla_lane(svuint64_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_u16)))
-svuint16_t svmla_lane(svuint16_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_s32)))
-svint32_t svmla_lane(svint32_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_s64)))
-svint64_t svmla_lane(svint64_t, svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_s16)))
-svint16_t svmla_lane(svint16_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_f32)))
-svfloat32_t svmlalb(svfloat32_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_s32)))
-svint32_t svmlalb(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_s64)))
-svint64_t svmlalb(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_s16)))
-svint16_t svmlalb(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_u32)))
-svuint32_t svmlalb(svuint32_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_u64)))
-svuint64_t svmlalb(svuint64_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_u16)))
-svuint16_t svmlalb(svuint16_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_f32)))
-svfloat32_t svmlalb(svfloat32_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_s32)))
-svint32_t svmlalb(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_s64)))
-svint64_t svmlalb(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_s16)))
-svint16_t svmlalb(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_u32)))
-svuint32_t svmlalb(svuint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_u64)))
-svuint64_t svmlalb(svuint64_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_u16)))
-svuint16_t svmlalb(svuint16_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_f32)))
-svfloat32_t svmlalb_lane(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_s32)))
-svint32_t svmlalb_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_s64)))
-svint64_t svmlalb_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_u32)))
-svuint32_t svmlalb_lane(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_u64)))
-svuint64_t svmlalb_lane(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_f32)))
-svfloat32_t svmlalt(svfloat32_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_s32)))
-svint32_t svmlalt(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_s64)))
-svint64_t svmlalt(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_s16)))
-svint16_t svmlalt(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_u32)))
-svuint32_t svmlalt(svuint32_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_u64)))
-svuint64_t svmlalt(svuint64_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_u16)))
-svuint16_t svmlalt(svuint16_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_f32)))
-svfloat32_t svmlalt(svfloat32_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_s32)))
-svint32_t svmlalt(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_s64)))
-svint64_t svmlalt(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_s16)))
-svint16_t svmlalt(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_u32)))
-svuint32_t svmlalt(svuint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_u64)))
-svuint64_t svmlalt(svuint64_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_u16)))
-svuint16_t svmlalt(svuint16_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_f32)))
-svfloat32_t svmlalt_lane(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_s32)))
-svint32_t svmlalt_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_s64)))
-svint64_t svmlalt_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_u32)))
-svuint32_t svmlalt_lane(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_u64)))
-svuint64_t svmlalt_lane(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_u32)))
-svuint32_t svmls_lane(svuint32_t, svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_u64)))
-svuint64_t svmls_lane(svuint64_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_u16)))
-svuint16_t svmls_lane(svuint16_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_s32)))
-svint32_t svmls_lane(svint32_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_s64)))
-svint64_t svmls_lane(svint64_t, svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_s16)))
-svint16_t svmls_lane(svint16_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_f32)))
-svfloat32_t svmlslb(svfloat32_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_s32)))
-svint32_t svmlslb(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_s64)))
-svint64_t svmlslb(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_s16)))
-svint16_t svmlslb(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_u32)))
-svuint32_t svmlslb(svuint32_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_u64)))
-svuint64_t svmlslb(svuint64_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_n_u16)))
-svuint16_t svmlslb(svuint16_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_f32)))
-svfloat32_t svmlslb(svfloat32_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_s32)))
-svint32_t svmlslb(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_s64)))
-svint64_t svmlslb(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_s16)))
-svint16_t svmlslb(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_u32)))
-svuint32_t svmlslb(svuint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_u64)))
-svuint64_t svmlslb(svuint64_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_u16)))
-svuint16_t svmlslb(svuint16_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_f32)))
-svfloat32_t svmlslb_lane(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_s32)))
-svint32_t svmlslb_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_s64)))
-svint64_t svmlslb_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_u32)))
-svuint32_t svmlslb_lane(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslb_lane_u64)))
-svuint64_t svmlslb_lane(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_f32)))
-svfloat32_t svmlslt(svfloat32_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_s32)))
-svint32_t svmlslt(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_s64)))
-svint64_t svmlslt(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_s16)))
-svint16_t svmlslt(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_u32)))
-svuint32_t svmlslt(svuint32_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_u64)))
-svuint64_t svmlslt(svuint64_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_n_u16)))
-svuint16_t svmlslt(svuint16_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_f32)))
-svfloat32_t svmlslt(svfloat32_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_s32)))
-svint32_t svmlslt(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_s64)))
-svint64_t svmlslt(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_s16)))
-svint16_t svmlslt(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_u32)))
-svuint32_t svmlslt(svuint32_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_u64)))
-svuint64_t svmlslt(svuint64_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_u16)))
-svuint16_t svmlslt(svuint16_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_f32)))
-svfloat32_t svmlslt_lane(svfloat32_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_s32)))
-svint32_t svmlslt_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_s64)))
-svint64_t svmlslt_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_u32)))
-svuint32_t svmlslt_lane(svuint32_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlslt_lane_u64)))
-svuint64_t svmlslt_lane(svuint64_t, svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_s32)))
-svint32_t svmovlb(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_s64)))
-svint64_t svmovlb(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_s16)))
-svint16_t svmovlb(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_u32)))
-svuint32_t svmovlb(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_u64)))
-svuint64_t svmovlb(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlb_u16)))
-svuint16_t svmovlb(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_s32)))
-svint32_t svmovlt(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_s64)))
-svint64_t svmovlt(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_s16)))
-svint16_t svmovlt(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_u32)))
-svuint32_t svmovlt(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_u64)))
-svuint64_t svmovlt(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmovlt_u16)))
-svuint16_t svmovlt(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_u32)))
-svuint32_t svmul_lane(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_u64)))
-svuint64_t svmul_lane(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_u16)))
-svuint16_t svmul_lane(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_s32)))
-svint32_t svmul_lane(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_s64)))
-svint64_t svmul_lane(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_s16)))
-svint16_t svmul_lane(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_s32)))
-svint32_t svmullb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_s64)))
-svint64_t svmullb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_s16)))
-svint16_t svmullb(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_u32)))
-svuint32_t svmullb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_u64)))
-svuint64_t svmullb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_n_u16)))
-svuint16_t svmullb(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_s32)))
-svint32_t svmullb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_s64)))
-svint64_t svmullb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_s16)))
-svint16_t svmullb(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_u32)))
-svuint32_t svmullb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_u64)))
-svuint64_t svmullb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_u16)))
-svuint16_t svmullb(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_s32)))
-svint32_t svmullb_lane(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_s64)))
-svint64_t svmullb_lane(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_u32)))
-svuint32_t svmullb_lane(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullb_lane_u64)))
-svuint64_t svmullb_lane(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_s32)))
-svint32_t svmullt(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_s64)))
-svint64_t svmullt(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_s16)))
-svint16_t svmullt(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_u32)))
-svuint32_t svmullt(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_u64)))
-svuint64_t svmullt(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_n_u16)))
-svuint16_t svmullt(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_s32)))
-svint32_t svmullt(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_s64)))
-svint64_t svmullt(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_s16)))
-svint16_t svmullt(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_u32)))
-svuint32_t svmullt(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_u64)))
-svuint64_t svmullt(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_u16)))
-svuint16_t svmullt(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_s32)))
-svint32_t svmullt_lane(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_s64)))
-svint64_t svmullt_lane(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_u32)))
-svuint32_t svmullt_lane(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmullt_lane_u64)))
-svuint64_t svmullt_lane(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u8)))
-svuint8_t svnbsl(svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u32)))
-svuint32_t svnbsl(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u64)))
-svuint64_t svnbsl(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_u16)))
-svuint16_t svnbsl(svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s8)))
-svint8_t svnbsl(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s32)))
-svint32_t svnbsl(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s64)))
-svint64_t svnbsl(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_n_s16)))
-svint16_t svnbsl(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u8)))
-svuint8_t svnbsl(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u32)))
-svuint32_t svnbsl(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u64)))
-svuint64_t svnbsl(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_u16)))
-svuint16_t svnbsl(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s8)))
-svint8_t svnbsl(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s32)))
-svint32_t svnbsl(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s64)))
-svint64_t svnbsl(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnbsl_s16)))
-svint16_t svnbsl(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmul_n_u8)))
-svuint8_t svpmul(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmul_u8)))
-svuint8_t svpmul(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_n_u64)))
-svuint64_t svpmullb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_n_u16)))
-svuint16_t svpmullb(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_u64)))
-svuint64_t svpmullb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_u16)))
-svuint16_t svpmullb(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u8)))
-svuint8_t svpmullb_pair(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u32)))
-svuint32_t svpmullb_pair(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u8)))
-svuint8_t svpmullb_pair(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u32)))
-svuint32_t svpmullb_pair(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_n_u64)))
-svuint64_t svpmullt(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_n_u16)))
-svuint16_t svpmullt(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_u64)))
-svuint64_t svpmullt(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_u16)))
-svuint16_t svpmullt(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u8)))
-svuint8_t svpmullt_pair(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u32)))
-svuint32_t svpmullt_pair(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u8)))
-svuint8_t svpmullt_pair(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u32)))
-svuint32_t svpmullt_pair(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s8_m)))
-svint8_t svqabs_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s32_m)))
-svint32_t svqabs_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s64_m)))
-svint64_t svqabs_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s16_m)))
-svint16_t svqabs_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s8_x)))
-svint8_t svqabs_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s32_x)))
-svint32_t svqabs_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s64_x)))
-svint64_t svqabs_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s16_x)))
-svint16_t svqabs_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s8_z)))
-svint8_t svqabs_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s32_z)))
-svint32_t svqabs_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s64_z)))
-svint64_t svqabs_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqabs_s16_z)))
-svint16_t svqabs_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8_m)))
-svint8_t svqadd_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32_m)))
-svint32_t svqadd_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64_m)))
-svint64_t svqadd_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16_m)))
-svint16_t svqadd_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8_x)))
-svint8_t svqadd_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32_x)))
-svint32_t svqadd_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64_x)))
-svint64_t svqadd_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16_x)))
-svint16_t svqadd_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8_z)))
-svint8_t svqadd_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32_z)))
-svint32_t svqadd_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64_z)))
-svint64_t svqadd_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16_z)))
-svint16_t svqadd_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8_m)))
-svuint8_t svqadd_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32_m)))
-svuint32_t svqadd_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64_m)))
-svuint64_t svqadd_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16_m)))
-svuint16_t svqadd_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8_x)))
-svuint8_t svqadd_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32_x)))
-svuint32_t svqadd_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64_x)))
-svuint64_t svqadd_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16_x)))
-svuint16_t svqadd_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8_z)))
-svuint8_t svqadd_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32_z)))
-svuint32_t svqadd_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64_z)))
-svuint64_t svqadd_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16_z)))
-svuint16_t svqadd_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8_m)))
-svint8_t svqadd_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32_m)))
-svint32_t svqadd_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64_m)))
-svint64_t svqadd_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16_m)))
-svint16_t svqadd_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8_x)))
-svint8_t svqadd_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32_x)))
-svint32_t svqadd_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64_x)))
-svint64_t svqadd_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16_x)))
-svint16_t svqadd_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8_z)))
-svint8_t svqadd_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32_z)))
-svint32_t svqadd_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64_z)))
-svint64_t svqadd_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16_z)))
-svint16_t svqadd_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8_m)))
-svuint8_t svqadd_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32_m)))
-svuint32_t svqadd_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64_m)))
-svuint64_t svqadd_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16_m)))
-svuint16_t svqadd_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8_x)))
-svuint8_t svqadd_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32_x)))
-svuint32_t svqadd_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64_x)))
-svuint64_t svqadd_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16_x)))
-svuint16_t svqadd_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8_z)))
-svuint8_t svqadd_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32_z)))
-svuint32_t svqadd_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64_z)))
-svuint64_t svqadd_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16_z)))
-svuint16_t svqadd_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s8)))
-svint8_t svqcadd(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s32)))
-svint32_t svqcadd(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s64)))
-svint64_t svqcadd(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqcadd_s16)))
-svint16_t svqcadd(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_n_s32)))
-svint32_t svqdmlalb(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_n_s64)))
-svint64_t svqdmlalb(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_n_s16)))
-svint16_t svqdmlalb(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_s32)))
-svint32_t svqdmlalb(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_s64)))
-svint64_t svqdmlalb(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_s16)))
-svint16_t svqdmlalb(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_lane_s32)))
-svint32_t svqdmlalb_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalb_lane_s64)))
-svint64_t svqdmlalb_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_n_s32)))
-svint32_t svqdmlalbt(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_n_s64)))
-svint64_t svqdmlalbt(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_n_s16)))
-svint16_t svqdmlalbt(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_s32)))
-svint32_t svqdmlalbt(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_s64)))
-svint64_t svqdmlalbt(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalbt_s16)))
-svint16_t svqdmlalbt(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_n_s32)))
-svint32_t svqdmlalt(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_n_s64)))
-svint64_t svqdmlalt(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_n_s16)))
-svint16_t svqdmlalt(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_s32)))
-svint32_t svqdmlalt(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_s64)))
-svint64_t svqdmlalt(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_s16)))
-svint16_t svqdmlalt(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_lane_s32)))
-svint32_t svqdmlalt_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlalt_lane_s64)))
-svint64_t svqdmlalt_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_n_s32)))
-svint32_t svqdmlslb(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_n_s64)))
-svint64_t svqdmlslb(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_n_s16)))
-svint16_t svqdmlslb(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_s32)))
-svint32_t svqdmlslb(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_s64)))
-svint64_t svqdmlslb(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_s16)))
-svint16_t svqdmlslb(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_lane_s32)))
-svint32_t svqdmlslb_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslb_lane_s64)))
-svint64_t svqdmlslb_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_n_s32)))
-svint32_t svqdmlslbt(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_n_s64)))
-svint64_t svqdmlslbt(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_n_s16)))
-svint16_t svqdmlslbt(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_s32)))
-svint32_t svqdmlslbt(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_s64)))
-svint64_t svqdmlslbt(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslbt_s16)))
-svint16_t svqdmlslbt(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_n_s32)))
-svint32_t svqdmlslt(svint32_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_n_s64)))
-svint64_t svqdmlslt(svint64_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_n_s16)))
-svint16_t svqdmlslt(svint16_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_s32)))
-svint32_t svqdmlslt(svint32_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_s64)))
-svint64_t svqdmlslt(svint64_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_s16)))
-svint16_t svqdmlslt(svint16_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_lane_s32)))
-svint32_t svqdmlslt_lane(svint32_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmlslt_lane_s64)))
-svint64_t svqdmlslt_lane(svint64_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s8)))
-svint8_t svqdmulh(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s32)))
-svint32_t svqdmulh(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s64)))
-svint64_t svqdmulh(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_n_s16)))
-svint16_t svqdmulh(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s8)))
-svint8_t svqdmulh(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s32)))
-svint32_t svqdmulh(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s64)))
-svint64_t svqdmulh(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_s16)))
-svint16_t svqdmulh(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_lane_s32)))
-svint32_t svqdmulh_lane(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_lane_s64)))
-svint64_t svqdmulh_lane(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmulh_lane_s16)))
-svint16_t svqdmulh_lane(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_n_s32)))
-svint32_t svqdmullb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_n_s64)))
-svint64_t svqdmullb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_n_s16)))
-svint16_t svqdmullb(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_s32)))
-svint32_t svqdmullb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_s64)))
-svint64_t svqdmullb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_s16)))
-svint16_t svqdmullb(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_lane_s32)))
-svint32_t svqdmullb_lane(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullb_lane_s64)))
-svint64_t svqdmullb_lane(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_n_s32)))
-svint32_t svqdmullt(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_n_s64)))
-svint64_t svqdmullt(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_n_s16)))
-svint16_t svqdmullt(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_s32)))
-svint32_t svqdmullt(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_s64)))
-svint64_t svqdmullt(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_s16)))
-svint16_t svqdmullt(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_lane_s32)))
-svint32_t svqdmullt_lane(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdmullt_lane_s64)))
-svint64_t svqdmullt_lane(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s8_m)))
-svint8_t svqneg_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s32_m)))
-svint32_t svqneg_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s64_m)))
-svint64_t svqneg_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s16_m)))
-svint16_t svqneg_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s8_x)))
-svint8_t svqneg_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s32_x)))
-svint32_t svqneg_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s64_x)))
-svint64_t svqneg_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s16_x)))
-svint16_t svqneg_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s8_z)))
-svint8_t svqneg_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s32_z)))
-svint32_t svqneg_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s64_z)))
-svint64_t svqneg_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqneg_s16_z)))
-svint16_t svqneg_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s8)))
-svint8_t svqrdcmlah(svint8_t, svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s32)))
-svint32_t svqrdcmlah(svint32_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s64)))
-svint64_t svqrdcmlah(svint64_t, svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_s16)))
-svint16_t svqrdcmlah(svint16_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_lane_s32)))
-svint32_t svqrdcmlah_lane(svint32_t, svint32_t, svint32_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdcmlah_lane_s16)))
-svint16_t svqrdcmlah_lane(svint16_t, svint16_t, svint16_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s8)))
-svint8_t svqrdmlah(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s32)))
-svint32_t svqrdmlah(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s64)))
-svint64_t svqrdmlah(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_n_s16)))
-svint16_t svqrdmlah(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s8)))
-svint8_t svqrdmlah(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s32)))
-svint32_t svqrdmlah(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s64)))
-svint64_t svqrdmlah(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_s16)))
-svint16_t svqrdmlah(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_lane_s32)))
-svint32_t svqrdmlah_lane(svint32_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_lane_s64)))
-svint64_t svqrdmlah_lane(svint64_t, svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlah_lane_s16)))
-svint16_t svqrdmlah_lane(svint16_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s8)))
-svint8_t svqrdmlsh(svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s32)))
-svint32_t svqrdmlsh(svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s64)))
-svint64_t svqrdmlsh(svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_n_s16)))
-svint16_t svqrdmlsh(svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s8)))
-svint8_t svqrdmlsh(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s32)))
-svint32_t svqrdmlsh(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s64)))
-svint64_t svqrdmlsh(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_s16)))
-svint16_t svqrdmlsh(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_lane_s32)))
-svint32_t svqrdmlsh_lane(svint32_t, svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_lane_s64)))
-svint64_t svqrdmlsh_lane(svint64_t, svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmlsh_lane_s16)))
-svint16_t svqrdmlsh_lane(svint16_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s8)))
-svint8_t svqrdmulh(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s32)))
-svint32_t svqrdmulh(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s64)))
-svint64_t svqrdmulh(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_n_s16)))
-svint16_t svqrdmulh(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s8)))
-svint8_t svqrdmulh(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s32)))
-svint32_t svqrdmulh(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s64)))
-svint64_t svqrdmulh(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_s16)))
-svint16_t svqrdmulh(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_lane_s32)))
-svint32_t svqrdmulh_lane(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_lane_s64)))
-svint64_t svqrdmulh_lane(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrdmulh_lane_s16)))
-svint16_t svqrdmulh_lane(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s8_m)))
-svint8_t svqrshl_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s32_m)))
-svint32_t svqrshl_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s64_m)))
-svint64_t svqrshl_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s16_m)))
-svint16_t svqrshl_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s8_x)))
-svint8_t svqrshl_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s32_x)))
-svint32_t svqrshl_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s64_x)))
-svint64_t svqrshl_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s16_x)))
-svint16_t svqrshl_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s8_z)))
-svint8_t svqrshl_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s32_z)))
-svint32_t svqrshl_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s64_z)))
-svint64_t svqrshl_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_s16_z)))
-svint16_t svqrshl_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u8_m)))
-svuint8_t svqrshl_m(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u32_m)))
-svuint32_t svqrshl_m(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u64_m)))
-svuint64_t svqrshl_m(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u16_m)))
-svuint16_t svqrshl_m(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u8_x)))
-svuint8_t svqrshl_x(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u32_x)))
-svuint32_t svqrshl_x(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u64_x)))
-svuint64_t svqrshl_x(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u16_x)))
-svuint16_t svqrshl_x(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u8_z)))
-svuint8_t svqrshl_z(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u32_z)))
-svuint32_t svqrshl_z(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u64_z)))
-svuint64_t svqrshl_z(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_n_u16_z)))
-svuint16_t svqrshl_z(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s8_m)))
-svint8_t svqrshl_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s32_m)))
-svint32_t svqrshl_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s64_m)))
-svint64_t svqrshl_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s16_m)))
-svint16_t svqrshl_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s8_x)))
-svint8_t svqrshl_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s32_x)))
-svint32_t svqrshl_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s64_x)))
-svint64_t svqrshl_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s16_x)))
-svint16_t svqrshl_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s8_z)))
-svint8_t svqrshl_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s32_z)))
-svint32_t svqrshl_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s64_z)))
-svint64_t svqrshl_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_s16_z)))
-svint16_t svqrshl_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u8_m)))
-svuint8_t svqrshl_m(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u32_m)))
-svuint32_t svqrshl_m(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u64_m)))
-svuint64_t svqrshl_m(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u16_m)))
-svuint16_t svqrshl_m(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u8_x)))
-svuint8_t svqrshl_x(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u32_x)))
-svuint32_t svqrshl_x(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u64_x)))
-svuint64_t svqrshl_x(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u16_x)))
-svuint16_t svqrshl_x(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u8_z)))
-svuint8_t svqrshl_z(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u32_z)))
-svuint32_t svqrshl_z(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u64_z)))
-svuint64_t svqrshl_z(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshl_u16_z)))
-svuint16_t svqrshl_z(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_s32)))
-svint16_t svqrshrnb(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_s64)))
-svint32_t svqrshrnb(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_s16)))
-svint8_t svqrshrnb(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_u32)))
-svuint16_t svqrshrnb(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_u64)))
-svuint32_t svqrshrnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnb_n_u16)))
-svuint8_t svqrshrnb(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_s32)))
-svint16_t svqrshrnt(svint16_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_s64)))
-svint32_t svqrshrnt(svint32_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_s16)))
-svint8_t svqrshrnt(svint8_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_u32)))
-svuint16_t svqrshrnt(svuint16_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_u64)))
-svuint32_t svqrshrnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrnt_n_u16)))
-svuint8_t svqrshrnt(svuint8_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunb_n_s32)))
-svuint16_t svqrshrunb(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunb_n_s64)))
-svuint32_t svqrshrunb(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunb_n_s16)))
-svuint8_t svqrshrunb(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunt_n_s32)))
-svuint16_t svqrshrunt(svuint16_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunt_n_s64)))
-svuint32_t svqrshrunt(svuint32_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqrshrunt_n_s16)))
-svuint8_t svqrshrunt(svuint8_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s8_m)))
-svint8_t svqshl_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s32_m)))
-svint32_t svqshl_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s64_m)))
-svint64_t svqshl_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s16_m)))
-svint16_t svqshl_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s8_x)))
-svint8_t svqshl_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s32_x)))
-svint32_t svqshl_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s64_x)))
-svint64_t svqshl_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s16_x)))
-svint16_t svqshl_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s8_z)))
-svint8_t svqshl_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s32_z)))
-svint32_t svqshl_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s64_z)))
-svint64_t svqshl_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_s16_z)))
-svint16_t svqshl_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u8_m)))
-svuint8_t svqshl_m(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u32_m)))
-svuint32_t svqshl_m(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u64_m)))
-svuint64_t svqshl_m(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u16_m)))
-svuint16_t svqshl_m(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u8_x)))
-svuint8_t svqshl_x(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u32_x)))
-svuint32_t svqshl_x(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u64_x)))
-svuint64_t svqshl_x(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u16_x)))
-svuint16_t svqshl_x(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u8_z)))
-svuint8_t svqshl_z(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u32_z)))
-svuint32_t svqshl_z(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u64_z)))
-svuint64_t svqshl_z(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_n_u16_z)))
-svuint16_t svqshl_z(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s8_m)))
-svint8_t svqshl_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s32_m)))
-svint32_t svqshl_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s64_m)))
-svint64_t svqshl_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s16_m)))
-svint16_t svqshl_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s8_x)))
-svint8_t svqshl_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s32_x)))
-svint32_t svqshl_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s64_x)))
-svint64_t svqshl_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s16_x)))
-svint16_t svqshl_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s8_z)))
-svint8_t svqshl_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s32_z)))
-svint32_t svqshl_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s64_z)))
-svint64_t svqshl_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_s16_z)))
-svint16_t svqshl_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u8_m)))
-svuint8_t svqshl_m(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u32_m)))
-svuint32_t svqshl_m(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u64_m)))
-svuint64_t svqshl_m(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u16_m)))
-svuint16_t svqshl_m(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u8_x)))
-svuint8_t svqshl_x(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u32_x)))
-svuint32_t svqshl_x(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u64_x)))
-svuint64_t svqshl_x(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u16_x)))
-svuint16_t svqshl_x(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u8_z)))
-svuint8_t svqshl_z(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u32_z)))
-svuint32_t svqshl_z(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u64_z)))
-svuint64_t svqshl_z(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshl_u16_z)))
-svuint16_t svqshl_z(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s8_m)))
-svuint8_t svqshlu_m(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s32_m)))
-svuint32_t svqshlu_m(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s64_m)))
-svuint64_t svqshlu_m(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s16_m)))
-svuint16_t svqshlu_m(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s8_x)))
-svuint8_t svqshlu_x(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s32_x)))
-svuint32_t svqshlu_x(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s64_x)))
-svuint64_t svqshlu_x(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s16_x)))
-svuint16_t svqshlu_x(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s8_z)))
-svuint8_t svqshlu_z(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s32_z)))
-svuint32_t svqshlu_z(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s64_z)))
-svuint64_t svqshlu_z(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshlu_n_s16_z)))
-svuint16_t svqshlu_z(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_s32)))
-svint16_t svqshrnb(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_s64)))
-svint32_t svqshrnb(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_s16)))
-svint8_t svqshrnb(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_u32)))
-svuint16_t svqshrnb(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_u64)))
-svuint32_t svqshrnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnb_n_u16)))
-svuint8_t svqshrnb(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_s32)))
-svint16_t svqshrnt(svint16_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_s64)))
-svint32_t svqshrnt(svint32_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_s16)))
-svint8_t svqshrnt(svint8_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_u32)))
-svuint16_t svqshrnt(svuint16_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_u64)))
-svuint32_t svqshrnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrnt_n_u16)))
-svuint8_t svqshrnt(svuint8_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunb_n_s32)))
-svuint16_t svqshrunb(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunb_n_s64)))
-svuint32_t svqshrunb(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunb_n_s16)))
-svuint8_t svqshrunb(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunt_n_s32)))
-svuint16_t svqshrunt(svuint16_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunt_n_s64)))
-svuint32_t svqshrunt(svuint32_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqshrunt_n_s16)))
-svuint8_t svqshrunt(svuint8_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8_m)))
-svint8_t svqsub_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32_m)))
-svint32_t svqsub_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64_m)))
-svint64_t svqsub_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16_m)))
-svint16_t svqsub_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8_x)))
-svint8_t svqsub_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32_x)))
-svint32_t svqsub_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64_x)))
-svint64_t svqsub_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16_x)))
-svint16_t svqsub_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8_z)))
-svint8_t svqsub_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32_z)))
-svint32_t svqsub_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64_z)))
-svint64_t svqsub_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16_z)))
-svint16_t svqsub_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8_m)))
-svuint8_t svqsub_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32_m)))
-svuint32_t svqsub_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64_m)))
-svuint64_t svqsub_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16_m)))
-svuint16_t svqsub_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8_x)))
-svuint8_t svqsub_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32_x)))
-svuint32_t svqsub_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64_x)))
-svuint64_t svqsub_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16_x)))
-svuint16_t svqsub_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8_z)))
-svuint8_t svqsub_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32_z)))
-svuint32_t svqsub_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64_z)))
-svuint64_t svqsub_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16_z)))
-svuint16_t svqsub_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8_m)))
-svint8_t svqsub_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32_m)))
-svint32_t svqsub_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64_m)))
-svint64_t svqsub_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16_m)))
-svint16_t svqsub_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8_x)))
-svint8_t svqsub_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32_x)))
-svint32_t svqsub_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64_x)))
-svint64_t svqsub_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16_x)))
-svint16_t svqsub_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8_z)))
-svint8_t svqsub_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32_z)))
-svint32_t svqsub_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64_z)))
-svint64_t svqsub_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16_z)))
-svint16_t svqsub_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8_m)))
-svuint8_t svqsub_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32_m)))
-svuint32_t svqsub_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64_m)))
-svuint64_t svqsub_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16_m)))
-svuint16_t svqsub_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8_x)))
-svuint8_t svqsub_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32_x)))
-svuint32_t svqsub_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64_x)))
-svuint64_t svqsub_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16_x)))
-svuint16_t svqsub_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8_z)))
-svuint8_t svqsub_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32_z)))
-svuint32_t svqsub_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64_z)))
-svuint64_t svqsub_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16_z)))
-svuint16_t svqsub_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s8_m)))
-svint8_t svqsubr_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s32_m)))
-svint32_t svqsubr_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s64_m)))
-svint64_t svqsubr_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s16_m)))
-svint16_t svqsubr_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s8_x)))
-svint8_t svqsubr_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s32_x)))
-svint32_t svqsubr_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s64_x)))
-svint64_t svqsubr_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s16_x)))
-svint16_t svqsubr_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s8_z)))
-svint8_t svqsubr_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s32_z)))
-svint32_t svqsubr_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s64_z)))
-svint64_t svqsubr_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_s16_z)))
-svint16_t svqsubr_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u8_m)))
-svuint8_t svqsubr_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u32_m)))
-svuint32_t svqsubr_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u64_m)))
-svuint64_t svqsubr_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u16_m)))
-svuint16_t svqsubr_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u8_x)))
-svuint8_t svqsubr_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u32_x)))
-svuint32_t svqsubr_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u64_x)))
-svuint64_t svqsubr_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u16_x)))
-svuint16_t svqsubr_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u8_z)))
-svuint8_t svqsubr_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u32_z)))
-svuint32_t svqsubr_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u64_z)))
-svuint64_t svqsubr_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_n_u16_z)))
-svuint16_t svqsubr_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s8_m)))
-svint8_t svqsubr_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s32_m)))
-svint32_t svqsubr_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s64_m)))
-svint64_t svqsubr_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s16_m)))
-svint16_t svqsubr_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s8_x)))
-svint8_t svqsubr_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s32_x)))
-svint32_t svqsubr_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s64_x)))
-svint64_t svqsubr_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s16_x)))
-svint16_t svqsubr_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s8_z)))
-svint8_t svqsubr_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s32_z)))
-svint32_t svqsubr_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s64_z)))
-svint64_t svqsubr_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_s16_z)))
-svint16_t svqsubr_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u8_m)))
-svuint8_t svqsubr_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u32_m)))
-svuint32_t svqsubr_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u64_m)))
-svuint64_t svqsubr_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u16_m)))
-svuint16_t svqsubr_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u8_x)))
-svuint8_t svqsubr_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u32_x)))
-svuint32_t svqsubr_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u64_x)))
-svuint64_t svqsubr_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u16_x)))
-svuint16_t svqsubr_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u8_z)))
-svuint8_t svqsubr_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u32_z)))
-svuint32_t svqsubr_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u64_z)))
-svuint64_t svqsubr_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsubr_u16_z)))
-svuint16_t svqsubr_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_s32)))
-svint16_t svqxtnb(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_s64)))
-svint32_t svqxtnb(svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_s16)))
-svint8_t svqxtnb(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_u32)))
-svuint16_t svqxtnb(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_u64)))
-svuint32_t svqxtnb(svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnb_u16)))
-svuint8_t svqxtnb(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_s32)))
-svint16_t svqxtnt(svint16_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_s64)))
-svint32_t svqxtnt(svint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_s16)))
-svint8_t svqxtnt(svint8_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_u32)))
-svuint16_t svqxtnt(svuint16_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_u64)))
-svuint32_t svqxtnt(svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtnt_u16)))
-svuint8_t svqxtnt(svuint8_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunb_s32)))
-svuint16_t svqxtunb(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunb_s64)))
-svuint32_t svqxtunb(svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunb_s16)))
-svuint8_t svqxtunb(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunt_s32)))
-svuint16_t svqxtunt(svuint16_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunt_s64)))
-svuint32_t svqxtunt(svuint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqxtunt_s16)))
-svuint8_t svqxtunt(svuint8_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_u32)))
-svuint16_t svraddhnb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_u64)))
-svuint32_t svraddhnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_u16)))
-svuint8_t svraddhnb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_s32)))
-svint16_t svraddhnb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_s64)))
-svint32_t svraddhnb(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_n_s16)))
-svint8_t svraddhnb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_u32)))
-svuint16_t svraddhnb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_u64)))
-svuint32_t svraddhnb(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_u16)))
-svuint8_t svraddhnb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_s32)))
-svint16_t svraddhnb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_s64)))
-svint32_t svraddhnb(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnb_s16)))
-svint8_t svraddhnb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_u32)))
-svuint16_t svraddhnt(svuint16_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_u64)))
-svuint32_t svraddhnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_u16)))
-svuint8_t svraddhnt(svuint8_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_s32)))
-svint16_t svraddhnt(svint16_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_s64)))
-svint32_t svraddhnt(svint32_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_n_s16)))
-svint8_t svraddhnt(svint8_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_u32)))
-svuint16_t svraddhnt(svuint16_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_u64)))
-svuint32_t svraddhnt(svuint32_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_u16)))
-svuint8_t svraddhnt(svuint8_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_s32)))
-svint16_t svraddhnt(svint16_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_s64)))
-svint32_t svraddhnt(svint32_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svraddhnt_s16)))
-svint8_t svraddhnt(svint8_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_u32_m)))
-svuint32_t svrecpe_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_u32_x)))
-svuint32_t svrecpe_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_u32_z)))
-svuint32_t svrecpe_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s8_m)))
-svint8_t svrhadd_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s32_m)))
-svint32_t svrhadd_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s64_m)))
-svint64_t svrhadd_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s16_m)))
-svint16_t svrhadd_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s8_x)))
-svint8_t svrhadd_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s32_x)))
-svint32_t svrhadd_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s64_x)))
-svint64_t svrhadd_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s16_x)))
-svint16_t svrhadd_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s8_z)))
-svint8_t svrhadd_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s32_z)))
-svint32_t svrhadd_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s64_z)))
-svint64_t svrhadd_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_s16_z)))
-svint16_t svrhadd_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u8_m)))
-svuint8_t svrhadd_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u32_m)))
-svuint32_t svrhadd_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u64_m)))
-svuint64_t svrhadd_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u16_m)))
-svuint16_t svrhadd_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u8_x)))
-svuint8_t svrhadd_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u32_x)))
-svuint32_t svrhadd_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u64_x)))
-svuint64_t svrhadd_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u16_x)))
-svuint16_t svrhadd_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u8_z)))
-svuint8_t svrhadd_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u32_z)))
-svuint32_t svrhadd_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u64_z)))
-svuint64_t svrhadd_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_n_u16_z)))
-svuint16_t svrhadd_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s8_m)))
-svint8_t svrhadd_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s32_m)))
-svint32_t svrhadd_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s64_m)))
-svint64_t svrhadd_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s16_m)))
-svint16_t svrhadd_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s8_x)))
-svint8_t svrhadd_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s32_x)))
-svint32_t svrhadd_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s64_x)))
-svint64_t svrhadd_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s16_x)))
-svint16_t svrhadd_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s8_z)))
-svint8_t svrhadd_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s32_z)))
-svint32_t svrhadd_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s64_z)))
-svint64_t svrhadd_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_s16_z)))
-svint16_t svrhadd_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u8_m)))
-svuint8_t svrhadd_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u32_m)))
-svuint32_t svrhadd_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u64_m)))
-svuint64_t svrhadd_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u16_m)))
-svuint16_t svrhadd_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u8_x)))
-svuint8_t svrhadd_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u32_x)))
-svuint32_t svrhadd_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u64_x)))
-svuint64_t svrhadd_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u16_x)))
-svuint16_t svrhadd_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u8_z)))
-svuint8_t svrhadd_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u32_z)))
-svuint32_t svrhadd_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u64_z)))
-svuint64_t svrhadd_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrhadd_u16_z)))
-svuint16_t svrhadd_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s8_m)))
-svint8_t svrshl_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s32_m)))
-svint32_t svrshl_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s64_m)))
-svint64_t svrshl_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s16_m)))
-svint16_t svrshl_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s8_x)))
-svint8_t svrshl_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s32_x)))
-svint32_t svrshl_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s64_x)))
-svint64_t svrshl_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s16_x)))
-svint16_t svrshl_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s8_z)))
-svint8_t svrshl_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s32_z)))
-svint32_t svrshl_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s64_z)))
-svint64_t svrshl_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_s16_z)))
-svint16_t svrshl_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u8_m)))
-svuint8_t svrshl_m(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u32_m)))
-svuint32_t svrshl_m(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u64_m)))
-svuint64_t svrshl_m(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u16_m)))
-svuint16_t svrshl_m(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u8_x)))
-svuint8_t svrshl_x(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u32_x)))
-svuint32_t svrshl_x(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u64_x)))
-svuint64_t svrshl_x(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u16_x)))
-svuint16_t svrshl_x(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u8_z)))
-svuint8_t svrshl_z(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u32_z)))
-svuint32_t svrshl_z(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u64_z)))
-svuint64_t svrshl_z(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_n_u16_z)))
-svuint16_t svrshl_z(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_m)))
-svint8_t svrshl_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_m)))
-svint32_t svrshl_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_m)))
-svint64_t svrshl_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_m)))
-svint16_t svrshl_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_x)))
-svint8_t svrshl_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_x)))
-svint32_t svrshl_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_x)))
-svint64_t svrshl_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_x)))
-svint16_t svrshl_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s8_z)))
-svint8_t svrshl_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s32_z)))
-svint32_t svrshl_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s64_z)))
-svint64_t svrshl_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_s16_z)))
-svint16_t svrshl_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_m)))
-svuint8_t svrshl_m(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_m)))
-svuint32_t svrshl_m(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_m)))
-svuint64_t svrshl_m(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_m)))
-svuint16_t svrshl_m(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_x)))
-svuint8_t svrshl_x(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_x)))
-svuint32_t svrshl_x(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_x)))
-svuint64_t svrshl_x(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_x)))
-svuint16_t svrshl_x(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u8_z)))
-svuint8_t svrshl_z(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u32_z)))
-svuint32_t svrshl_z(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u64_z)))
-svuint64_t svrshl_z(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshl_u16_z)))
-svuint16_t svrshl_z(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s8_m)))
-svint8_t svrshr_m(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s32_m)))
-svint32_t svrshr_m(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s64_m)))
-svint64_t svrshr_m(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s16_m)))
-svint16_t svrshr_m(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u8_m)))
-svuint8_t svrshr_m(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u32_m)))
-svuint32_t svrshr_m(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u64_m)))
-svuint64_t svrshr_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u16_m)))
-svuint16_t svrshr_m(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s8_x)))
-svint8_t svrshr_x(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s32_x)))
-svint32_t svrshr_x(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s64_x)))
-svint64_t svrshr_x(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s16_x)))
-svint16_t svrshr_x(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u8_x)))
-svuint8_t svrshr_x(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u32_x)))
-svuint32_t svrshr_x(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u64_x)))
-svuint64_t svrshr_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u16_x)))
-svuint16_t svrshr_x(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s8_z)))
-svint8_t svrshr_z(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s32_z)))
-svint32_t svrshr_z(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s64_z)))
-svint64_t svrshr_z(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_s16_z)))
-svint16_t svrshr_z(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u8_z)))
-svuint8_t svrshr_z(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u32_z)))
-svuint32_t svrshr_z(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u64_z)))
-svuint64_t svrshr_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshr_n_u16_z)))
-svuint16_t svrshr_z(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_u32)))
-svuint16_t svrshrnb(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_u64)))
-svuint32_t svrshrnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_u16)))
-svuint8_t svrshrnb(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_s32)))
-svint16_t svrshrnb(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_s64)))
-svint32_t svrshrnb(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnb_n_s16)))
-svint8_t svrshrnb(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_u32)))
-svuint16_t svrshrnt(svuint16_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_u64)))
-svuint32_t svrshrnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_u16)))
-svuint8_t svrshrnt(svuint8_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_s32)))
-svint16_t svrshrnt(svint16_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_s64)))
-svint32_t svrshrnt(svint32_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrshrnt_n_s16)))
-svint8_t svrshrnt(svint8_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_u32_m)))
-svuint32_t svrsqrte_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_u32_x)))
-svuint32_t svrsqrte_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_u32_z)))
-svuint32_t svrsqrte_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s8)))
-svint8_t svrsra(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s32)))
-svint32_t svrsra(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s64)))
-svint64_t svrsra(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_s16)))
-svint16_t svrsra(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u8)))
-svuint8_t svrsra(svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u32)))
-svuint32_t svrsra(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u64)))
-svuint64_t svrsra(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsra_n_u16)))
-svuint16_t svrsra(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_u32)))
-svuint16_t svrsubhnb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_u64)))
-svuint32_t svrsubhnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_u16)))
-svuint8_t svrsubhnb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_s32)))
-svint16_t svrsubhnb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_s64)))
-svint32_t svrsubhnb(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_n_s16)))
-svint8_t svrsubhnb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_u32)))
-svuint16_t svrsubhnb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_u64)))
-svuint32_t svrsubhnb(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_u16)))
-svuint8_t svrsubhnb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_s32)))
-svint16_t svrsubhnb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_s64)))
-svint32_t svrsubhnb(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnb_s16)))
-svint8_t svrsubhnb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_u32)))
-svuint16_t svrsubhnt(svuint16_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_u64)))
-svuint32_t svrsubhnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_u16)))
-svuint8_t svrsubhnt(svuint8_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_s32)))
-svint16_t svrsubhnt(svint16_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_s64)))
-svint32_t svrsubhnt(svint32_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_n_s16)))
-svint8_t svrsubhnt(svint8_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_u32)))
-svuint16_t svrsubhnt(svuint16_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_u64)))
-svuint32_t svrsubhnt(svuint32_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_u16)))
-svuint8_t svrsubhnt(svuint8_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_s32)))
-svint16_t svrsubhnt(svint16_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_s64)))
-svint32_t svrsubhnt(svint32_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsubhnt_s16)))
-svint8_t svrsubhnt(svint8_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_n_u32)))
-svuint32_t svsbclb(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_n_u64)))
-svuint64_t svsbclb(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_u32)))
-svuint32_t svsbclb(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclb_u64)))
-svuint64_t svsbclb(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_n_u32)))
-svuint32_t svsbclt(svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_n_u64)))
-svuint64_t svsbclt(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_u32)))
-svuint32_t svsbclt(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsbclt_u64)))
-svuint64_t svsbclt(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_s32)))
-svint32_t svshllb(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_s64)))
-svint64_t svshllb(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_s16)))
-svint16_t svshllb(svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_u32)))
-svuint32_t svshllb(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_u64)))
-svuint64_t svshllb(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllb_n_u16)))
-svuint16_t svshllb(svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_s32)))
-svint32_t svshllt(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_s64)))
-svint64_t svshllt(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_s16)))
-svint16_t svshllt(svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_u32)))
-svuint32_t svshllt(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_u64)))
-svuint64_t svshllt(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshllt_n_u16)))
-svuint16_t svshllt(svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_u32)))
-svuint16_t svshrnb(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_u64)))
-svuint32_t svshrnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_u16)))
-svuint8_t svshrnb(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_s32)))
-svint16_t svshrnb(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_s64)))
-svint32_t svshrnb(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnb_n_s16)))
-svint8_t svshrnb(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_u32)))
-svuint16_t svshrnt(svuint16_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_u64)))
-svuint32_t svshrnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_u16)))
-svuint8_t svshrnt(svuint8_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_s32)))
-svint16_t svshrnt(svint16_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_s64)))
-svint32_t svshrnt(svint32_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svshrnt_n_s16)))
-svint8_t svshrnt(svint8_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u8)))
-svuint8_t svsli(svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u32)))
-svuint32_t svsli(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u64)))
-svuint64_t svsli(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_u16)))
-svuint16_t svsli(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s8)))
-svint8_t svsli(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s32)))
-svint32_t svsli(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s64)))
-svint64_t svsli(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsli_n_s16)))
-svint16_t svsli(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u8_m)))
-svuint8_t svsqadd_m(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u32_m)))
-svuint32_t svsqadd_m(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u64_m)))
-svuint64_t svsqadd_m(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u16_m)))
-svuint16_t svsqadd_m(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u8_x)))
-svuint8_t svsqadd_x(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u32_x)))
-svuint32_t svsqadd_x(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u64_x)))
-svuint64_t svsqadd_x(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u16_x)))
-svuint16_t svsqadd_x(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u8_z)))
-svuint8_t svsqadd_z(svbool_t, svuint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u32_z)))
-svuint32_t svsqadd_z(svbool_t, svuint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u64_z)))
-svuint64_t svsqadd_z(svbool_t, svuint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_n_u16_z)))
-svuint16_t svsqadd_z(svbool_t, svuint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u8_m)))
-svuint8_t svsqadd_m(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u32_m)))
-svuint32_t svsqadd_m(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u64_m)))
-svuint64_t svsqadd_m(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u16_m)))
-svuint16_t svsqadd_m(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u8_x)))
-svuint8_t svsqadd_x(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u32_x)))
-svuint32_t svsqadd_x(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u64_x)))
-svuint64_t svsqadd_x(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u16_x)))
-svuint16_t svsqadd_x(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u8_z)))
-svuint8_t svsqadd_z(svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u32_z)))
-svuint32_t svsqadd_z(svbool_t, svuint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u64_z)))
-svuint64_t svsqadd_z(svbool_t, svuint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqadd_u16_z)))
-svuint16_t svsqadd_z(svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s8)))
-svint8_t svsra(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s32)))
-svint32_t svsra(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s64)))
-svint64_t svsra(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_s16)))
-svint16_t svsra(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u8)))
-svuint8_t svsra(svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u32)))
-svuint32_t svsra(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u64)))
-svuint64_t svsra(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsra_n_u16)))
-svuint16_t svsra(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u8)))
-svuint8_t svsri(svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u32)))
-svuint32_t svsri(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u64)))
-svuint64_t svsri(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_u16)))
-svuint16_t svsri(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s8)))
-svint8_t svsri(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s32)))
-svint32_t svsri(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s64)))
-svint64_t svsri(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsri_n_s16)))
-svint16_t svsri(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_u32)))
-svuint16_t svsubhnb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_u64)))
-svuint32_t svsubhnb(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_u16)))
-svuint8_t svsubhnb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_s32)))
-svint16_t svsubhnb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_s64)))
-svint32_t svsubhnb(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_n_s16)))
-svint8_t svsubhnb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_u32)))
-svuint16_t svsubhnb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_u64)))
-svuint32_t svsubhnb(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_u16)))
-svuint8_t svsubhnb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_s32)))
-svint16_t svsubhnb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_s64)))
-svint32_t svsubhnb(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnb_s16)))
-svint8_t svsubhnb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_u32)))
-svuint16_t svsubhnt(svuint16_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_u64)))
-svuint32_t svsubhnt(svuint32_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_u16)))
-svuint8_t svsubhnt(svuint8_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_s32)))
-svint16_t svsubhnt(svint16_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_s64)))
-svint32_t svsubhnt(svint32_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_n_s16)))
-svint8_t svsubhnt(svint8_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_u32)))
-svuint16_t svsubhnt(svuint16_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_u64)))
-svuint32_t svsubhnt(svuint32_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_u16)))
-svuint8_t svsubhnt(svuint8_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_s32)))
-svint16_t svsubhnt(svint16_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_s64)))
-svint32_t svsubhnt(svint32_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubhnt_s16)))
-svint8_t svsubhnt(svint8_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_s32)))
-svint32_t svsublb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_s64)))
-svint64_t svsublb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_s16)))
-svint16_t svsublb(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_u32)))
-svuint32_t svsublb(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_u64)))
-svuint64_t svsublb(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_n_u16)))
-svuint16_t svsublb(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_s32)))
-svint32_t svsublb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_s64)))
-svint64_t svsublb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_s16)))
-svint16_t svsublb(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_u32)))
-svuint32_t svsublb(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_u64)))
-svuint64_t svsublb(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublb_u16)))
-svuint16_t svsublb(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_n_s32)))
-svint32_t svsublbt(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_n_s64)))
-svint64_t svsublbt(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_n_s16)))
-svint16_t svsublbt(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_s32)))
-svint32_t svsublbt(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_s64)))
-svint64_t svsublbt(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublbt_s16)))
-svint16_t svsublbt(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_s32)))
-svint32_t svsublt(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_s64)))
-svint64_t svsublt(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_s16)))
-svint16_t svsublt(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_u32)))
-svuint32_t svsublt(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_u64)))
-svuint64_t svsublt(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_n_u16)))
-svuint16_t svsublt(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_s32)))
-svint32_t svsublt(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_s64)))
-svint64_t svsublt(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_s16)))
-svint16_t svsublt(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_u32)))
-svuint32_t svsublt(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_u64)))
-svuint64_t svsublt(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsublt_u16)))
-svuint16_t svsublt(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_n_s32)))
-svint32_t svsubltb(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_n_s64)))
-svint64_t svsubltb(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_n_s16)))
-svint16_t svsubltb(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_s32)))
-svint32_t svsubltb(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_s64)))
-svint64_t svsubltb(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubltb_s16)))
-svint16_t svsubltb(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_s32)))
-svint32_t svsubwb(svint32_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_s64)))
-svint64_t svsubwb(svint64_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_s16)))
-svint16_t svsubwb(svint16_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_u32)))
-svuint32_t svsubwb(svuint32_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_u64)))
-svuint64_t svsubwb(svuint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_n_u16)))
-svuint16_t svsubwb(svuint16_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_s32)))
-svint32_t svsubwb(svint32_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_s64)))
-svint64_t svsubwb(svint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_s16)))
-svint16_t svsubwb(svint16_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_u32)))
-svuint32_t svsubwb(svuint32_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_u64)))
-svuint64_t svsubwb(svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwb_u16)))
-svuint16_t svsubwb(svuint16_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_s32)))
-svint32_t svsubwt(svint32_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_s64)))
-svint64_t svsubwt(svint64_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_s16)))
-svint16_t svsubwt(svint16_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_u32)))
-svuint32_t svsubwt(svuint32_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_u64)))
-svuint64_t svsubwt(svuint64_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_n_u16)))
-svuint16_t svsubwt(svuint16_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_s32)))
-svint32_t svsubwt(svint32_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_s64)))
-svint64_t svsubwt(svint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_s16)))
-svint16_t svsubwt(svint16_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_u32)))
-svuint32_t svsubwt(svuint32_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_u64)))
-svuint64_t svsubwt(svuint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubwt_u16)))
-svuint16_t svsubwt(svuint16_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u8)))
-svuint8_t svtbl2(svuint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u32)))
-svuint32_t svtbl2(svuint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u64)))
-svuint64_t svtbl2(svuint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_u16)))
-svuint16_t svtbl2(svuint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s8)))
-svint8_t svtbl2(svint8x2_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_f64)))
-svfloat64_t svtbl2(svfloat64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_f32)))
-svfloat32_t svtbl2(svfloat32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_f16)))
-svfloat16_t svtbl2(svfloat16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s32)))
-svint32_t svtbl2(svint32x2_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s64)))
-svint64_t svtbl2(svint64x2_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_s16)))
-svint16_t svtbl2(svint16x2_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u8)))
-svuint8_t svtbx(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u32)))
-svuint32_t svtbx(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u64)))
-svuint64_t svtbx(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_u16)))
-svuint16_t svtbx(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s8)))
-svint8_t svtbx(svint8_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_f64)))
-svfloat64_t svtbx(svfloat64_t, svfloat64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_f32)))
-svfloat32_t svtbx(svfloat32_t, svfloat32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_f16)))
-svfloat16_t svtbx(svfloat16_t, svfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s32)))
-svint32_t svtbx(svint32_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s64)))
-svint64_t svtbx(svint64_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_s16)))
-svint16_t svtbx(svint16_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s8_m)))
-svint8_t svuqadd_m(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s32_m)))
-svint32_t svuqadd_m(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s64_m)))
-svint64_t svuqadd_m(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s16_m)))
-svint16_t svuqadd_m(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s8_x)))
-svint8_t svuqadd_x(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s32_x)))
-svint32_t svuqadd_x(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s64_x)))
-svint64_t svuqadd_x(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s16_x)))
-svint16_t svuqadd_x(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s8_z)))
-svint8_t svuqadd_z(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s32_z)))
-svint32_t svuqadd_z(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s64_z)))
-svint64_t svuqadd_z(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_n_s16_z)))
-svint16_t svuqadd_z(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s8_m)))
-svint8_t svuqadd_m(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s32_m)))
-svint32_t svuqadd_m(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s64_m)))
-svint64_t svuqadd_m(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s16_m)))
-svint16_t svuqadd_m(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s8_x)))
-svint8_t svuqadd_x(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s32_x)))
-svint32_t svuqadd_x(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s64_x)))
-svint64_t svuqadd_x(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s16_x)))
-svint16_t svuqadd_x(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s8_z)))
-svint8_t svuqadd_z(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s32_z)))
-svint32_t svuqadd_z(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s64_z)))
-svint64_t svuqadd_z(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuqadd_s16_z)))
-svint16_t svuqadd_z(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_s32)))
-svbool_t svwhilege_b8(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_s32)))
-svbool_t svwhilege_b32(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_s32)))
-svbool_t svwhilege_b64(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_s32)))
-svbool_t svwhilege_b16(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_s64)))
-svbool_t svwhilege_b8(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_s64)))
-svbool_t svwhilege_b32(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_s64)))
-svbool_t svwhilege_b64(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_s64)))
-svbool_t svwhilege_b16(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_u32)))
-svbool_t svwhilege_b8(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_u32)))
-svbool_t svwhilege_b32(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_u32)))
-svbool_t svwhilege_b64(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_u32)))
-svbool_t svwhilege_b16(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b8_u64)))
-svbool_t svwhilege_b8(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b32_u64)))
-svbool_t svwhilege_b32(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b64_u64)))
-svbool_t svwhilege_b64(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilege_b16_u64)))
-svbool_t svwhilege_b16(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_s32)))
-svbool_t svwhilegt_b8(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_s32)))
-svbool_t svwhilegt_b32(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_s32)))
-svbool_t svwhilegt_b64(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_s32)))
-svbool_t svwhilegt_b16(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_s64)))
-svbool_t svwhilegt_b8(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_s64)))
-svbool_t svwhilegt_b32(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_s64)))
-svbool_t svwhilegt_b64(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_s64)))
-svbool_t svwhilegt_b16(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_u32)))
-svbool_t svwhilegt_b8(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_u32)))
-svbool_t svwhilegt_b32(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_u32)))
-svbool_t svwhilegt_b64(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_u32)))
-svbool_t svwhilegt_b16(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b8_u64)))
-svbool_t svwhilegt_b8(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b32_u64)))
-svbool_t svwhilegt_b32(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b64_u64)))
-svbool_t svwhilegt_b64(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilegt_b16_u64)))
-svbool_t svwhilegt_b16(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u8)))
-svbool_t svwhilerw(uint8_t const *, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s8)))
-svbool_t svwhilerw(int8_t const *, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u64)))
-svbool_t svwhilerw(uint64_t const *, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_f64)))
-svbool_t svwhilerw(float64_t const *, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s64)))
-svbool_t svwhilerw(int64_t const *, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u16)))
-svbool_t svwhilerw(uint16_t const *, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_f16)))
-svbool_t svwhilerw(float16_t const *, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s16)))
-svbool_t svwhilerw(int16_t const *, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_u32)))
-svbool_t svwhilerw(uint32_t const *, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_f32)))
-svbool_t svwhilerw(float32_t const *, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_s32)))
-svbool_t svwhilerw(int32_t const *, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u8)))
-svbool_t svwhilewr(uint8_t const *, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s8)))
-svbool_t svwhilewr(int8_t const *, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u64)))
-svbool_t svwhilewr(uint64_t const *, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_f64)))
-svbool_t svwhilewr(float64_t const *, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s64)))
-svbool_t svwhilewr(int64_t const *, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u16)))
-svbool_t svwhilewr(uint16_t const *, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_f16)))
-svbool_t svwhilewr(float16_t const *, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s16)))
-svbool_t svwhilewr(int16_t const *, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_u32)))
-svbool_t svwhilewr(uint32_t const *, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_f32)))
-svbool_t svwhilewr(float32_t const *, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_s32)))
-svbool_t svwhilewr(int32_t const *, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u8)))
-svuint8_t svxar(svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u32)))
-svuint32_t svxar(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u64)))
-svuint64_t svxar(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_u16)))
-svuint16_t svxar(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s8)))
-svint8_t svxar(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s32)))
-svint32_t svxar(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s64)))
-svint64_t svxar(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svxar_n_s16)))
-svint16_t svxar(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_m)))
-svfloat64_t svabd_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_m)))
-svfloat32_t svabd_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f16_m)))
-svfloat16_t svabd_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_x)))
-svfloat64_t svabd_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_x)))
-svfloat32_t svabd_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f16_x)))
-svfloat16_t svabd_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_z)))
-svfloat64_t svabd_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_z)))
-svfloat32_t svabd_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f16_z)))
-svfloat16_t svabd_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s8_m)))
-svint8_t svabd_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s32_m)))
-svint32_t svabd_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s64_m)))
-svint64_t svabd_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s16_m)))
-svint16_t svabd_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s8_x)))
-svint8_t svabd_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s32_x)))
-svint32_t svabd_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s64_x)))
-svint64_t svabd_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s16_x)))
-svint16_t svabd_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s8_z)))
-svint8_t svabd_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s32_z)))
-svint32_t svabd_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s64_z)))
-svint64_t svabd_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s16_z)))
-svint16_t svabd_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u8_m)))
-svuint8_t svabd_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u32_m)))
-svuint32_t svabd_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u64_m)))
-svuint64_t svabd_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u16_m)))
-svuint16_t svabd_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u8_x)))
-svuint8_t svabd_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u32_x)))
-svuint32_t svabd_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u64_x)))
-svuint64_t svabd_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u16_x)))
-svuint16_t svabd_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u8_z)))
-svuint8_t svabd_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u32_z)))
-svuint32_t svabd_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u64_z)))
-svuint64_t svabd_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u16_z)))
-svuint16_t svabd_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f64_m)))
-svfloat64_t svabd_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f32_m)))
-svfloat32_t svabd_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f16_m)))
-svfloat16_t svabd_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f64_x)))
-svfloat64_t svabd_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f32_x)))
-svfloat32_t svabd_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f16_x)))
-svfloat16_t svabd_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f64_z)))
-svfloat64_t svabd_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f32_z)))
-svfloat32_t svabd_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f16_z)))
-svfloat16_t svabd_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s8_m)))
-svint8_t svabd_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s32_m)))
-svint32_t svabd_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s64_m)))
-svint64_t svabd_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s16_m)))
-svint16_t svabd_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s8_x)))
-svint8_t svabd_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s32_x)))
-svint32_t svabd_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s64_x)))
-svint64_t svabd_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s16_x)))
-svint16_t svabd_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s8_z)))
-svint8_t svabd_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s32_z)))
-svint32_t svabd_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s64_z)))
-svint64_t svabd_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s16_z)))
-svint16_t svabd_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u8_m)))
-svuint8_t svabd_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u32_m)))
-svuint32_t svabd_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u64_m)))
-svuint64_t svabd_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u16_m)))
-svuint16_t svabd_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u8_x)))
-svuint8_t svabd_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u32_x)))
-svuint32_t svabd_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u64_x)))
-svuint64_t svabd_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u16_x)))
-svuint16_t svabd_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u8_z)))
-svuint8_t svabd_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u32_z)))
-svuint32_t svabd_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u64_z)))
-svuint64_t svabd_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u16_z)))
-svuint16_t svabd_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f64_m)))
-svfloat64_t svabs_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f32_m)))
-svfloat32_t svabs_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f16_m)))
-svfloat16_t svabs_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f64_x)))
-svfloat64_t svabs_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f32_x)))
-svfloat32_t svabs_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f16_x)))
-svfloat16_t svabs_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f64_z)))
-svfloat64_t svabs_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f32_z)))
-svfloat32_t svabs_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f16_z)))
-svfloat16_t svabs_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s8_m)))
-svint8_t svabs_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s32_m)))
-svint32_t svabs_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s64_m)))
-svint64_t svabs_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s16_m)))
-svint16_t svabs_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s8_x)))
-svint8_t svabs_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s32_x)))
-svint32_t svabs_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s64_x)))
-svint64_t svabs_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s16_x)))
-svint16_t svabs_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s8_z)))
-svint8_t svabs_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s32_z)))
-svint32_t svabs_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s64_z)))
-svint64_t svabs_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s16_z)))
-svint16_t svabs_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_n_f64)))
-svbool_t svacge_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_n_f32)))
-svbool_t svacge_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_n_f16)))
-svbool_t svacge_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_f64)))
-svbool_t svacge_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_f32)))
-svbool_t svacge_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_f16)))
-svbool_t svacge_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_n_f64)))
-svbool_t svacgt_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_n_f32)))
-svbool_t svacgt_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_n_f16)))
-svbool_t svacgt_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_f64)))
-svbool_t svacgt_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_f32)))
-svbool_t svacgt_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_f16)))
-svbool_t svacgt_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_n_f64)))
-svbool_t svacle_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_n_f32)))
-svbool_t svacle_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_n_f16)))
-svbool_t svacle_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_f64)))
-svbool_t svacle_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_f32)))
-svbool_t svacle_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_f16)))
-svbool_t svacle_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_n_f64)))
-svbool_t svaclt_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_n_f32)))
-svbool_t svaclt_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_n_f16)))
-svbool_t svaclt_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_f64)))
-svbool_t svaclt_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_f32)))
-svbool_t svaclt_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_f16)))
-svbool_t svaclt_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f64_m)))
-svfloat64_t svadd_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f32_m)))
-svfloat32_t svadd_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f16_m)))
-svfloat16_t svadd_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f64_x)))
-svfloat64_t svadd_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f32_x)))
-svfloat32_t svadd_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f16_x)))
-svfloat16_t svadd_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f64_z)))
-svfloat64_t svadd_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f32_z)))
-svfloat32_t svadd_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f16_z)))
-svfloat16_t svadd_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u8_m)))
-svuint8_t svadd_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u32_m)))
-svuint32_t svadd_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u64_m)))
-svuint64_t svadd_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u16_m)))
-svuint16_t svadd_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s8_m)))
-svint8_t svadd_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s32_m)))
-svint32_t svadd_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s64_m)))
-svint64_t svadd_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s16_m)))
-svint16_t svadd_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u8_x)))
-svuint8_t svadd_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u32_x)))
-svuint32_t svadd_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u64_x)))
-svuint64_t svadd_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u16_x)))
-svuint16_t svadd_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s8_x)))
-svint8_t svadd_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s32_x)))
-svint32_t svadd_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s64_x)))
-svint64_t svadd_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s16_x)))
-svint16_t svadd_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u8_z)))
-svuint8_t svadd_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u32_z)))
-svuint32_t svadd_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u64_z)))
-svuint64_t svadd_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u16_z)))
-svuint16_t svadd_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s8_z)))
-svint8_t svadd_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s32_z)))
-svint32_t svadd_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s64_z)))
-svint64_t svadd_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s16_z)))
-svint16_t svadd_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f64_m)))
-svfloat64_t svadd_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f32_m)))
-svfloat32_t svadd_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f16_m)))
-svfloat16_t svadd_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f64_x)))
-svfloat64_t svadd_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f32_x)))
-svfloat32_t svadd_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f16_x)))
-svfloat16_t svadd_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f64_z)))
-svfloat64_t svadd_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f32_z)))
-svfloat32_t svadd_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f16_z)))
-svfloat16_t svadd_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u8_m)))
-svuint8_t svadd_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u32_m)))
-svuint32_t svadd_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u64_m)))
-svuint64_t svadd_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u16_m)))
-svuint16_t svadd_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s8_m)))
-svint8_t svadd_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s32_m)))
-svint32_t svadd_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_m)))
-svint64_t svadd_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_m)))
-svint16_t svadd_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u8_x)))
-svuint8_t svadd_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u32_x)))
-svuint32_t svadd_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u64_x)))
-svuint64_t svadd_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u16_x)))
-svuint16_t svadd_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s8_x)))
-svint8_t svadd_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s32_x)))
-svint32_t svadd_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_x)))
-svint64_t svadd_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_x)))
-svint16_t svadd_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u8_z)))
-svuint8_t svadd_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u32_z)))
-svuint32_t svadd_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u64_z)))
-svuint64_t svadd_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u16_z)))
-svuint16_t svadd_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s8_z)))
-svint8_t svadd_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s32_z)))
-svint32_t svadd_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_z)))
-svint64_t svadd_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_z)))
-svint16_t svadd_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f64)))
-float64_t svadda_f64(svbool_t, float64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f32)))
-float32_t svadda_f32(svbool_t, float32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f16)))
-float16_t svadda_f16(svbool_t, float16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s8)))
-int64_t svaddv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s32)))
-int64_t svaddv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s64)))
-int64_t svaddv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s16)))
-int64_t svaddv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u8)))
-uint64_t svaddv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u32)))
-uint64_t svaddv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u64)))
-uint64_t svaddv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u16)))
-uint64_t svaddv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_f64)))
-float64_t svaddv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_f32)))
-float32_t svaddv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_f16)))
-float16_t svaddv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_b_z)))
-svbool_t svand_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u8_m)))
-svuint8_t svand_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u32_m)))
-svuint32_t svand_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u64_m)))
-svuint64_t svand_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u16_m)))
-svuint16_t svand_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s8_m)))
-svint8_t svand_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s32_m)))
-svint32_t svand_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s64_m)))
-svint64_t svand_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s16_m)))
-svint16_t svand_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u8_x)))
-svuint8_t svand_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u32_x)))
-svuint32_t svand_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u64_x)))
-svuint64_t svand_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u16_x)))
-svuint16_t svand_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s8_x)))
-svint8_t svand_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s32_x)))
-svint32_t svand_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s64_x)))
-svint64_t svand_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s16_x)))
-svint16_t svand_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u8_z)))
-svuint8_t svand_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u32_z)))
-svuint32_t svand_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u64_z)))
-svuint64_t svand_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u16_z)))
-svuint16_t svand_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s8_z)))
-svint8_t svand_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s32_z)))
-svint32_t svand_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s64_z)))
-svint64_t svand_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s16_z)))
-svint16_t svand_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u8_m)))
-svuint8_t svand_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u32_m)))
-svuint32_t svand_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u64_m)))
-svuint64_t svand_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u16_m)))
-svuint16_t svand_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s8_m)))
-svint8_t svand_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s32_m)))
-svint32_t svand_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s64_m)))
-svint64_t svand_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s16_m)))
-svint16_t svand_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u8_x)))
-svuint8_t svand_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u32_x)))
-svuint32_t svand_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u64_x)))
-svuint64_t svand_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u16_x)))
-svuint16_t svand_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s8_x)))
-svint8_t svand_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s32_x)))
-svint32_t svand_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s64_x)))
-svint64_t svand_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s16_x)))
-svint16_t svand_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u8_z)))
-svuint8_t svand_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u32_z)))
-svuint32_t svand_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u64_z)))
-svuint64_t svand_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u16_z)))
-svuint16_t svand_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s8_z)))
-svint8_t svand_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s32_z)))
-svint32_t svand_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s64_z)))
-svint64_t svand_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s16_z)))
-svint16_t svand_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u8)))
-uint8_t svandv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u32)))
-uint32_t svandv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u64)))
-uint64_t svandv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u16)))
-uint16_t svandv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s8)))
-int8_t svandv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s32)))
-int32_t svandv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s64)))
-int64_t svandv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s16)))
-int16_t svandv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s8_m)))
-svint8_t svasr_n_s8_m(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s32_m)))
-svint32_t svasr_n_s32_m(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s64_m)))
-svint64_t svasr_n_s64_m(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s16_m)))
-svint16_t svasr_n_s16_m(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s8_x)))
-svint8_t svasr_n_s8_x(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s32_x)))
-svint32_t svasr_n_s32_x(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s64_x)))
-svint64_t svasr_n_s64_x(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s16_x)))
-svint16_t svasr_n_s16_x(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s8_z)))
-svint8_t svasr_n_s8_z(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s32_z)))
-svint32_t svasr_n_s32_z(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s64_z)))
-svint64_t svasr_n_s64_z(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s16_z)))
-svint16_t svasr_n_s16_z(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s8_m)))
-svint8_t svasr_s8_m(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s32_m)))
-svint32_t svasr_s32_m(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s64_m)))
-svint64_t svasr_s64_m(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s16_m)))
-svint16_t svasr_s16_m(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s8_x)))
-svint8_t svasr_s8_x(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s32_x)))
-svint32_t svasr_s32_x(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s64_x)))
-svint64_t svasr_s64_x(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s16_x)))
-svint16_t svasr_s16_x(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s8_z)))
-svint8_t svasr_s8_z(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s32_z)))
-svint32_t svasr_s32_z(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s64_z)))
-svint64_t svasr_s64_z(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s16_z)))
-svint16_t svasr_s16_z(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s8_m)))
-svint8_t svasr_wide_n_s8_m(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s32_m)))
-svint32_t svasr_wide_n_s32_m(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s16_m)))
-svint16_t svasr_wide_n_s16_m(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s8_x)))
-svint8_t svasr_wide_n_s8_x(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s32_x)))
-svint32_t svasr_wide_n_s32_x(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s16_x)))
-svint16_t svasr_wide_n_s16_x(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s8_z)))
-svint8_t svasr_wide_n_s8_z(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s32_z)))
-svint32_t svasr_wide_n_s32_z(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s16_z)))
-svint16_t svasr_wide_n_s16_z(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s8_m)))
-svint8_t svasr_wide_s8_m(svbool_t, svint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s32_m)))
-svint32_t svasr_wide_s32_m(svbool_t, svint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s16_m)))
-svint16_t svasr_wide_s16_m(svbool_t, svint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s8_x)))
-svint8_t svasr_wide_s8_x(svbool_t, svint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s32_x)))
-svint32_t svasr_wide_s32_x(svbool_t, svint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s16_x)))
-svint16_t svasr_wide_s16_x(svbool_t, svint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s8_z)))
-svint8_t svasr_wide_s8_z(svbool_t, svint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s32_z)))
-svint32_t svasr_wide_s32_z(svbool_t, svint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s16_z)))
-svint16_t svasr_wide_s16_z(svbool_t, svint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s8_m)))
-svint8_t svasrd_n_s8_m(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s32_m)))
-svint32_t svasrd_n_s32_m(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s64_m)))
-svint64_t svasrd_n_s64_m(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s16_m)))
-svint16_t svasrd_n_s16_m(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s8_x)))
-svint8_t svasrd_n_s8_x(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s32_x)))
-svint32_t svasrd_n_s32_x(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s64_x)))
-svint64_t svasrd_n_s64_x(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s16_x)))
-svint16_t svasrd_n_s16_x(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s8_z)))
-svint8_t svasrd_n_s8_z(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s32_z)))
-svint32_t svasrd_n_s32_z(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s64_z)))
-svint64_t svasrd_n_s64_z(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s16_z)))
-svint16_t svasrd_n_s16_z(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_b_z)))
-svbool_t svbic_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u8_m)))
-svuint8_t svbic_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u32_m)))
-svuint32_t svbic_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u64_m)))
-svuint64_t svbic_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u16_m)))
-svuint16_t svbic_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s8_m)))
-svint8_t svbic_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s32_m)))
-svint32_t svbic_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s64_m)))
-svint64_t svbic_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s16_m)))
-svint16_t svbic_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u8_x)))
-svuint8_t svbic_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u32_x)))
-svuint32_t svbic_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u64_x)))
-svuint64_t svbic_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u16_x)))
-svuint16_t svbic_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s8_x)))
-svint8_t svbic_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s32_x)))
-svint32_t svbic_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s64_x)))
-svint64_t svbic_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s16_x)))
-svint16_t svbic_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u8_z)))
-svuint8_t svbic_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u32_z)))
-svuint32_t svbic_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u64_z)))
-svuint64_t svbic_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u16_z)))
-svuint16_t svbic_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s8_z)))
-svint8_t svbic_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s32_z)))
-svint32_t svbic_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s64_z)))
-svint64_t svbic_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s16_z)))
-svint16_t svbic_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u8_m)))
-svuint8_t svbic_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u32_m)))
-svuint32_t svbic_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u64_m)))
-svuint64_t svbic_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u16_m)))
-svuint16_t svbic_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s8_m)))
-svint8_t svbic_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s32_m)))
-svint32_t svbic_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s64_m)))
-svint64_t svbic_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s16_m)))
-svint16_t svbic_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u8_x)))
-svuint8_t svbic_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u32_x)))
-svuint32_t svbic_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u64_x)))
-svuint64_t svbic_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u16_x)))
-svuint16_t svbic_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s8_x)))
-svint8_t svbic_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s32_x)))
-svint32_t svbic_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s64_x)))
-svint64_t svbic_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s16_x)))
-svint16_t svbic_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u8_z)))
-svuint8_t svbic_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u32_z)))
-svuint32_t svbic_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u64_z)))
-svuint64_t svbic_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u16_z)))
-svuint16_t svbic_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s8_z)))
-svint8_t svbic_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s32_z)))
-svint32_t svbic_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s64_z)))
-svint64_t svbic_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s16_z)))
-svint16_t svbic_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrka_b_m)))
-svbool_t svbrka_b_m(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrka_b_z)))
-svbool_t svbrka_b_z(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkb_b_m)))
-svbool_t svbrkb_b_m(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkb_b_z)))
-svbool_t svbrkb_b_z(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkn_b_z)))
-svbool_t svbrkn_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkpa_b_z)))
-svbool_t svbrkpa_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkpb_b_z)))
-svbool_t svbrkpb_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f64_m)))
-svfloat64_t svcadd_f64_m(svbool_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f32_m)))
-svfloat32_t svcadd_f32_m(svbool_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f16_m)))
-svfloat16_t svcadd_f16_m(svbool_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f64_x)))
-svfloat64_t svcadd_f64_x(svbool_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f32_x)))
-svfloat32_t svcadd_f32_x(svbool_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f16_x)))
-svfloat16_t svcadd_f16_x(svbool_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f64_z)))
-svfloat64_t svcadd_f64_z(svbool_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f32_z)))
-svfloat32_t svcadd_f32_z(svbool_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f16_z)))
-svfloat16_t svcadd_f16_z(svbool_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u8)))
-uint8_t svclasta_n_u8(svbool_t, uint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u32)))
-uint32_t svclasta_n_u32(svbool_t, uint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u64)))
-uint64_t svclasta_n_u64(svbool_t, uint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u16)))
-uint16_t svclasta_n_u16(svbool_t, uint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s8)))
-int8_t svclasta_n_s8(svbool_t, int8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_f64)))
-float64_t svclasta_n_f64(svbool_t, float64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_f32)))
-float32_t svclasta_n_f32(svbool_t, float32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_f16)))
-float16_t svclasta_n_f16(svbool_t, float16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s32)))
-int32_t svclasta_n_s32(svbool_t, int32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s64)))
-int64_t svclasta_n_s64(svbool_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s16)))
-int16_t svclasta_n_s16(svbool_t, int16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u8)))
-svuint8_t svclasta_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u32)))
-svuint32_t svclasta_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u64)))
-svuint64_t svclasta_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u16)))
-svuint16_t svclasta_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s8)))
-svint8_t svclasta_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_f64)))
-svfloat64_t svclasta_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_f32)))
-svfloat32_t svclasta_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_f16)))
-svfloat16_t svclasta_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s32)))
-svint32_t svclasta_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s64)))
-svint64_t svclasta_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s16)))
-svint16_t svclasta_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u8)))
-uint8_t svclastb_n_u8(svbool_t, uint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u32)))
-uint32_t svclastb_n_u32(svbool_t, uint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u64)))
-uint64_t svclastb_n_u64(svbool_t, uint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u16)))
-uint16_t svclastb_n_u16(svbool_t, uint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s8)))
-int8_t svclastb_n_s8(svbool_t, int8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_f64)))
-float64_t svclastb_n_f64(svbool_t, float64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_f32)))
-float32_t svclastb_n_f32(svbool_t, float32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_f16)))
-float16_t svclastb_n_f16(svbool_t, float16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s32)))
-int32_t svclastb_n_s32(svbool_t, int32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s64)))
-int64_t svclastb_n_s64(svbool_t, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s16)))
-int16_t svclastb_n_s16(svbool_t, int16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u8)))
-svuint8_t svclastb_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u32)))
-svuint32_t svclastb_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u64)))
-svuint64_t svclastb_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u16)))
-svuint16_t svclastb_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s8)))
-svint8_t svclastb_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_f64)))
-svfloat64_t svclastb_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_f32)))
-svfloat32_t svclastb_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_f16)))
-svfloat16_t svclastb_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s32)))
-svint32_t svclastb_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s64)))
-svint64_t svclastb_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s16)))
-svint16_t svclastb_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s8_m)))
-svuint8_t svcls_s8_m(svuint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s32_m)))
-svuint32_t svcls_s32_m(svuint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s64_m)))
-svuint64_t svcls_s64_m(svuint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s16_m)))
-svuint16_t svcls_s16_m(svuint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s8_x)))
-svuint8_t svcls_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s32_x)))
-svuint32_t svcls_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s64_x)))
-svuint64_t svcls_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s16_x)))
-svuint16_t svcls_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s8_z)))
-svuint8_t svcls_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s32_z)))
-svuint32_t svcls_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s64_z)))
-svuint64_t svcls_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s16_z)))
-svuint16_t svcls_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u8_m)))
-svuint8_t svclz_u8_m(svuint8_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u32_m)))
-svuint32_t svclz_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u64_m)))
-svuint64_t svclz_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u16_m)))
-svuint16_t svclz_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s8_m)))
-svuint8_t svclz_s8_m(svuint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s32_m)))
-svuint32_t svclz_s32_m(svuint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s64_m)))
-svuint64_t svclz_s64_m(svuint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s16_m)))
-svuint16_t svclz_s16_m(svuint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u8_x)))
-svuint8_t svclz_u8_x(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u32_x)))
-svuint32_t svclz_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u64_x)))
-svuint64_t svclz_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u16_x)))
-svuint16_t svclz_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s8_x)))
-svuint8_t svclz_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s32_x)))
-svuint32_t svclz_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s64_x)))
-svuint64_t svclz_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s16_x)))
-svuint16_t svclz_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u8_z)))
-svuint8_t svclz_u8_z(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u32_z)))
-svuint32_t svclz_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u64_z)))
-svuint64_t svclz_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u16_z)))
-svuint16_t svclz_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s8_z)))
-svuint8_t svclz_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s32_z)))
-svuint32_t svclz_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s64_z)))
-svuint64_t svclz_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s16_z)))
-svuint16_t svclz_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f64_m)))
-svfloat64_t svcmla_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f32_m)))
-svfloat32_t svcmla_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f16_m)))
-svfloat16_t svcmla_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f64_x)))
-svfloat64_t svcmla_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f32_x)))
-svfloat32_t svcmla_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f16_x)))
-svfloat16_t svcmla_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f64_z)))
-svfloat64_t svcmla_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f32_z)))
-svfloat32_t svcmla_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f16_z)))
-svfloat16_t svcmla_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_f32)))
-svfloat32_t svcmla_lane_f32(svfloat32_t, svfloat32_t, svfloat32_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_f16)))
-svfloat16_t svcmla_lane_f16(svfloat16_t, svfloat16_t, svfloat16_t, uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_f64)))
-svbool_t svcmpeq_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_f32)))
-svbool_t svcmpeq_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_f16)))
-svbool_t svcmpeq_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u8)))
-svbool_t svcmpeq_n_u8(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u32)))
-svbool_t svcmpeq_n_u32(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u64)))
-svbool_t svcmpeq_n_u64(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u16)))
-svbool_t svcmpeq_n_u16(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s8)))
-svbool_t svcmpeq_n_s8(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s32)))
-svbool_t svcmpeq_n_s32(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s64)))
-svbool_t svcmpeq_n_s64(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s16)))
-svbool_t svcmpeq_n_s16(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u8)))
-svbool_t svcmpeq_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u32)))
-svbool_t svcmpeq_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u64)))
-svbool_t svcmpeq_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u16)))
-svbool_t svcmpeq_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s8)))
-svbool_t svcmpeq_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s32)))
-svbool_t svcmpeq_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s64)))
-svbool_t svcmpeq_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s16)))
-svbool_t svcmpeq_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_f64)))
-svbool_t svcmpeq_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_f32)))
-svbool_t svcmpeq_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_f16)))
-svbool_t svcmpeq_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_n_s8)))
-svbool_t svcmpeq_wide_n_s8(svbool_t, svint8_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_n_s32)))
-svbool_t svcmpeq_wide_n_s32(svbool_t, svint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_n_s16)))
-svbool_t svcmpeq_wide_n_s16(svbool_t, svint16_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_s8)))
-svbool_t svcmpeq_wide_s8(svbool_t, svint8_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_s32)))
-svbool_t svcmpeq_wide_s32(svbool_t, svint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_s16)))
-svbool_t svcmpeq_wide_s16(svbool_t, svint16_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_f64)))
-svbool_t svcmpge_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_f32)))
-svbool_t svcmpge_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_f16)))
-svbool_t svcmpge_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s8)))
-svbool_t svcmpge_n_s8(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s32)))
-svbool_t svcmpge_n_s32(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s64)))
-svbool_t svcmpge_n_s64(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s16)))
-svbool_t svcmpge_n_s16(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u8)))
-svbool_t svcmpge_n_u8(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u32)))
-svbool_t svcmpge_n_u32(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u64)))
-svbool_t svcmpge_n_u64(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u16)))
-svbool_t svcmpge_n_u16(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s8)))
-svbool_t svcmpge_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s32)))
-svbool_t svcmpge_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s64)))
-svbool_t svcmpge_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s16)))
-svbool_t svcmpge_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_f64)))
-svbool_t svcmpge_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_f32)))
-svbool_t svcmpge_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_f16)))
-svbool_t svcmpge_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u8)))
-svbool_t svcmpge_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u32)))
-svbool_t svcmpge_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u64)))
-svbool_t svcmpge_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u16)))
-svbool_t svcmpge_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_s8)))
-svbool_t svcmpge_wide_n_s8(svbool_t, svint8_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_s32)))
-svbool_t svcmpge_wide_n_s32(svbool_t, svint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_s16)))
-svbool_t svcmpge_wide_n_s16(svbool_t, svint16_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_u8)))
-svbool_t svcmpge_wide_n_u8(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_u32)))
-svbool_t svcmpge_wide_n_u32(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_u16)))
-svbool_t svcmpge_wide_n_u16(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_s8)))
-svbool_t svcmpge_wide_s8(svbool_t, svint8_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_s32)))
-svbool_t svcmpge_wide_s32(svbool_t, svint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_s16)))
-svbool_t svcmpge_wide_s16(svbool_t, svint16_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_u8)))
-svbool_t svcmpge_wide_u8(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_u32)))
-svbool_t svcmpge_wide_u32(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_u16)))
-svbool_t svcmpge_wide_u16(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_f64)))
-svbool_t svcmpgt_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_f32)))
-svbool_t svcmpgt_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_f16)))
-svbool_t svcmpgt_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s8)))
-svbool_t svcmpgt_n_s8(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s32)))
-svbool_t svcmpgt_n_s32(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s64)))
-svbool_t svcmpgt_n_s64(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s16)))
-svbool_t svcmpgt_n_s16(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u8)))
-svbool_t svcmpgt_n_u8(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u32)))
-svbool_t svcmpgt_n_u32(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u64)))
-svbool_t svcmpgt_n_u64(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u16)))
-svbool_t svcmpgt_n_u16(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s8)))
-svbool_t svcmpgt_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s32)))
-svbool_t svcmpgt_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s64)))
-svbool_t svcmpgt_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s16)))
-svbool_t svcmpgt_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_f64)))
-svbool_t svcmpgt_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_f32)))
-svbool_t svcmpgt_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_f16)))
-svbool_t svcmpgt_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u8)))
-svbool_t svcmpgt_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u32)))
-svbool_t svcmpgt_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u64)))
-svbool_t svcmpgt_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u16)))
-svbool_t svcmpgt_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_s8)))
-svbool_t svcmpgt_wide_n_s8(svbool_t, svint8_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_s32)))
-svbool_t svcmpgt_wide_n_s32(svbool_t, svint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_s16)))
-svbool_t svcmpgt_wide_n_s16(svbool_t, svint16_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_u8)))
-svbool_t svcmpgt_wide_n_u8(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_u32)))
-svbool_t svcmpgt_wide_n_u32(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_u16)))
-svbool_t svcmpgt_wide_n_u16(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_s8)))
-svbool_t svcmpgt_wide_s8(svbool_t, svint8_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_s32)))
-svbool_t svcmpgt_wide_s32(svbool_t, svint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_s16)))
-svbool_t svcmpgt_wide_s16(svbool_t, svint16_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_u8)))
-svbool_t svcmpgt_wide_u8(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_u32)))
-svbool_t svcmpgt_wide_u32(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_u16)))
-svbool_t svcmpgt_wide_u16(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_f64)))
-svbool_t svcmple_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_f32)))
-svbool_t svcmple_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_f16)))
-svbool_t svcmple_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s8)))
-svbool_t svcmple_n_s8(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s32)))
-svbool_t svcmple_n_s32(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s64)))
-svbool_t svcmple_n_s64(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s16)))
-svbool_t svcmple_n_s16(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u8)))
-svbool_t svcmple_n_u8(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u32)))
-svbool_t svcmple_n_u32(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u64)))
-svbool_t svcmple_n_u64(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u16)))
-svbool_t svcmple_n_u16(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s8)))
-svbool_t svcmple_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s32)))
-svbool_t svcmple_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s64)))
-svbool_t svcmple_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s16)))
-svbool_t svcmple_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_f64)))
-svbool_t svcmple_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_f32)))
-svbool_t svcmple_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_f16)))
-svbool_t svcmple_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u8)))
-svbool_t svcmple_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u32)))
-svbool_t svcmple_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u64)))
-svbool_t svcmple_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u16)))
-svbool_t svcmple_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_s8)))
-svbool_t svcmple_wide_n_s8(svbool_t, svint8_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_s32)))
-svbool_t svcmple_wide_n_s32(svbool_t, svint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_s16)))
-svbool_t svcmple_wide_n_s16(svbool_t, svint16_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_u8)))
-svbool_t svcmple_wide_n_u8(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_u32)))
-svbool_t svcmple_wide_n_u32(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_u16)))
-svbool_t svcmple_wide_n_u16(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_s8)))
-svbool_t svcmple_wide_s8(svbool_t, svint8_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_s32)))
-svbool_t svcmple_wide_s32(svbool_t, svint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_s16)))
-svbool_t svcmple_wide_s16(svbool_t, svint16_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_u8)))
-svbool_t svcmple_wide_u8(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_u32)))
-svbool_t svcmple_wide_u32(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_u16)))
-svbool_t svcmple_wide_u16(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u8)))
-svbool_t svcmplt_n_u8(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u32)))
-svbool_t svcmplt_n_u32(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u64)))
-svbool_t svcmplt_n_u64(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u16)))
-svbool_t svcmplt_n_u16(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_f64)))
-svbool_t svcmplt_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_f32)))
-svbool_t svcmplt_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_f16)))
-svbool_t svcmplt_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s8)))
-svbool_t svcmplt_n_s8(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s32)))
-svbool_t svcmplt_n_s32(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s64)))
-svbool_t svcmplt_n_s64(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s16)))
-svbool_t svcmplt_n_s16(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u8)))
-svbool_t svcmplt_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u32)))
-svbool_t svcmplt_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u64)))
-svbool_t svcmplt_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u16)))
-svbool_t svcmplt_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s8)))
-svbool_t svcmplt_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s32)))
-svbool_t svcmplt_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s64)))
-svbool_t svcmplt_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s16)))
-svbool_t svcmplt_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_f64)))
-svbool_t svcmplt_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_f32)))
-svbool_t svcmplt_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_f16)))
-svbool_t svcmplt_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_u8)))
-svbool_t svcmplt_wide_n_u8(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_u32)))
-svbool_t svcmplt_wide_n_u32(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_u16)))
-svbool_t svcmplt_wide_n_u16(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_s8)))
-svbool_t svcmplt_wide_n_s8(svbool_t, svint8_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_s32)))
-svbool_t svcmplt_wide_n_s32(svbool_t, svint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_s16)))
-svbool_t svcmplt_wide_n_s16(svbool_t, svint16_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_u8)))
-svbool_t svcmplt_wide_u8(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_u32)))
-svbool_t svcmplt_wide_u32(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_u16)))
-svbool_t svcmplt_wide_u16(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_s8)))
-svbool_t svcmplt_wide_s8(svbool_t, svint8_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_s32)))
-svbool_t svcmplt_wide_s32(svbool_t, svint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_s16)))
-svbool_t svcmplt_wide_s16(svbool_t, svint16_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_f64)))
-svbool_t svcmpne_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_f32)))
-svbool_t svcmpne_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_f16)))
-svbool_t svcmpne_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u8)))
-svbool_t svcmpne_n_u8(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u32)))
-svbool_t svcmpne_n_u32(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u64)))
-svbool_t svcmpne_n_u64(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u16)))
-svbool_t svcmpne_n_u16(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s8)))
-svbool_t svcmpne_n_s8(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s32)))
-svbool_t svcmpne_n_s32(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s64)))
-svbool_t svcmpne_n_s64(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s16)))
-svbool_t svcmpne_n_s16(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u8)))
-svbool_t svcmpne_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u32)))
-svbool_t svcmpne_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u64)))
-svbool_t svcmpne_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u16)))
-svbool_t svcmpne_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s8)))
-svbool_t svcmpne_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s32)))
-svbool_t svcmpne_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s64)))
-svbool_t svcmpne_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s16)))
-svbool_t svcmpne_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_f64)))
-svbool_t svcmpne_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_f32)))
-svbool_t svcmpne_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_f16)))
-svbool_t svcmpne_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_n_s8)))
-svbool_t svcmpne_wide_n_s8(svbool_t, svint8_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_n_s32)))
-svbool_t svcmpne_wide_n_s32(svbool_t, svint32_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_n_s16)))
-svbool_t svcmpne_wide_n_s16(svbool_t, svint16_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_s8)))
-svbool_t svcmpne_wide_s8(svbool_t, svint8_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_s32)))
-svbool_t svcmpne_wide_s32(svbool_t, svint32_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_s16)))
-svbool_t svcmpne_wide_s16(svbool_t, svint16_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_n_f64)))
-svbool_t svcmpuo_n_f64(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_n_f32)))
-svbool_t svcmpuo_n_f32(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_n_f16)))
-svbool_t svcmpuo_n_f16(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_f64)))
-svbool_t svcmpuo_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_f32)))
-svbool_t svcmpuo_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_f16)))
-svbool_t svcmpuo_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u8_m)))
-svuint8_t svcnot_u8_m(svuint8_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u32_m)))
-svuint32_t svcnot_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u64_m)))
-svuint64_t svcnot_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u16_m)))
-svuint16_t svcnot_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s8_m)))
-svint8_t svcnot_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s32_m)))
-svint32_t svcnot_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s64_m)))
-svint64_t svcnot_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s16_m)))
-svint16_t svcnot_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u8_x)))
-svuint8_t svcnot_u8_x(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u32_x)))
-svuint32_t svcnot_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u64_x)))
-svuint64_t svcnot_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u16_x)))
-svuint16_t svcnot_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s8_x)))
-svint8_t svcnot_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s32_x)))
-svint32_t svcnot_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s64_x)))
-svint64_t svcnot_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s16_x)))
-svint16_t svcnot_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u8_z)))
-svuint8_t svcnot_u8_z(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u32_z)))
-svuint32_t svcnot_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u64_z)))
-svuint64_t svcnot_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u16_z)))
-svuint16_t svcnot_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s8_z)))
-svint8_t svcnot_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s32_z)))
-svint32_t svcnot_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s64_z)))
-svint64_t svcnot_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s16_z)))
-svint16_t svcnot_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u8_m)))
-svuint8_t svcnt_u8_m(svuint8_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u32_m)))
-svuint32_t svcnt_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u64_m)))
-svuint64_t svcnt_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u16_m)))
-svuint16_t svcnt_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s8_m)))
-svuint8_t svcnt_s8_m(svuint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f64_m)))
-svuint64_t svcnt_f64_m(svuint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f32_m)))
-svuint32_t svcnt_f32_m(svuint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f16_m)))
-svuint16_t svcnt_f16_m(svuint16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s32_m)))
-svuint32_t svcnt_s32_m(svuint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s64_m)))
-svuint64_t svcnt_s64_m(svuint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_m)))
-svuint16_t svcnt_s16_m(svuint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u8_x)))
-svuint8_t svcnt_u8_x(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u32_x)))
-svuint32_t svcnt_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u64_x)))
-svuint64_t svcnt_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u16_x)))
-svuint16_t svcnt_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s8_x)))
-svuint8_t svcnt_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f64_x)))
-svuint64_t svcnt_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f32_x)))
-svuint32_t svcnt_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f16_x)))
-svuint16_t svcnt_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s32_x)))
-svuint32_t svcnt_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s64_x)))
-svuint64_t svcnt_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_x)))
-svuint16_t svcnt_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u8_z)))
-svuint8_t svcnt_u8_z(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u32_z)))
-svuint32_t svcnt_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u64_z)))
-svuint64_t svcnt_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u16_z)))
-svuint16_t svcnt_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s8_z)))
-svuint8_t svcnt_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f64_z)))
-svuint64_t svcnt_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f32_z)))
-svuint32_t svcnt_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f16_z)))
-svuint16_t svcnt_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s32_z)))
-svuint32_t svcnt_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s64_z)))
-svuint64_t svcnt_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_z)))
-svuint16_t svcnt_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb)))
-uint64_t svcntb(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb_pat)))
-uint64_t svcntb_pat(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd)))
-uint64_t svcntd(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd_pat)))
-uint64_t svcntd_pat(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth)))
-uint64_t svcnth(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth_pat)))
-uint64_t svcnth_pat(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b8)))
-uint64_t svcntp_b8(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b32)))
-uint64_t svcntp_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b64)))
-uint64_t svcntp_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b16)))
-uint64_t svcntp_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw)))
-uint64_t svcntw(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw_pat)))
-uint64_t svcntw_pat(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u8)))
-svuint8x2_t svcreate2_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u32)))
-svuint32x2_t svcreate2_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u64)))
-svuint64x2_t svcreate2_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u16)))
-svuint16x2_t svcreate2_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s8)))
-svint8x2_t svcreate2_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_f64)))
-svfloat64x2_t svcreate2_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_f32)))
-svfloat32x2_t svcreate2_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_f16)))
-svfloat16x2_t svcreate2_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s32)))
-svint32x2_t svcreate2_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s64)))
-svint64x2_t svcreate2_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s16)))
-svint16x2_t svcreate2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u8)))
-svuint8x3_t svcreate3_u8(svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u32)))
-svuint32x3_t svcreate3_u32(svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u64)))
-svuint64x3_t svcreate3_u64(svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u16)))
-svuint16x3_t svcreate3_u16(svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s8)))
-svint8x3_t svcreate3_s8(svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_f64)))
-svfloat64x3_t svcreate3_f64(svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_f32)))
-svfloat32x3_t svcreate3_f32(svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_f16)))
-svfloat16x3_t svcreate3_f16(svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s32)))
-svint32x3_t svcreate3_s32(svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s64)))
-svint64x3_t svcreate3_s64(svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s16)))
-svint16x3_t svcreate3_s16(svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u8)))
-svuint8x4_t svcreate4_u8(svuint8_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u32)))
-svuint32x4_t svcreate4_u32(svuint32_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u64)))
-svuint64x4_t svcreate4_u64(svuint64_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u16)))
-svuint16x4_t svcreate4_u16(svuint16_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s8)))
-svint8x4_t svcreate4_s8(svint8_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_f64)))
-svfloat64x4_t svcreate4_f64(svfloat64_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_f32)))
-svfloat32x4_t svcreate4_f32(svfloat32_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_f16)))
-svfloat16x4_t svcreate4_f16(svfloat16_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s32)))
-svint32x4_t svcreate4_s32(svint32_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s64)))
-svint64x4_t svcreate4_s64(svint64_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s16)))
-svint16x4_t svcreate4_s16(svint16_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_m)))
-svfloat16_t svcvt_f16_f32_m(svfloat16_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_x)))
-svfloat16_t svcvt_f16_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_z)))
-svfloat16_t svcvt_f16_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f64_m)))
-svfloat16_t svcvt_f16_f64_m(svfloat16_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f64_x)))
-svfloat16_t svcvt_f16_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f64_z)))
-svfloat16_t svcvt_f16_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s16_m)))
-svfloat16_t svcvt_f16_s16_m(svfloat16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s16_x)))
-svfloat16_t svcvt_f16_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s16_z)))
-svfloat16_t svcvt_f16_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s32_m)))
-svfloat16_t svcvt_f16_s32_m(svfloat16_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s32_x)))
-svfloat16_t svcvt_f16_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s32_z)))
-svfloat16_t svcvt_f16_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s64_m)))
-svfloat16_t svcvt_f16_s64_m(svfloat16_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s64_x)))
-svfloat16_t svcvt_f16_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s64_z)))
-svfloat16_t svcvt_f16_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u16_m)))
-svfloat16_t svcvt_f16_u16_m(svfloat16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u16_x)))
-svfloat16_t svcvt_f16_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u16_z)))
-svfloat16_t svcvt_f16_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u32_m)))
-svfloat16_t svcvt_f16_u32_m(svfloat16_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u32_x)))
-svfloat16_t svcvt_f16_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u32_z)))
-svfloat16_t svcvt_f16_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u64_m)))
-svfloat16_t svcvt_f16_u64_m(svfloat16_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u64_x)))
-svfloat16_t svcvt_f16_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u64_z)))
-svfloat16_t svcvt_f16_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_m)))
-svfloat32_t svcvt_f32_f16_m(svfloat32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_x)))
-svfloat32_t svcvt_f32_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_z)))
-svfloat32_t svcvt_f32_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f64_m)))
-svfloat32_t svcvt_f32_f64_m(svfloat32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f64_x)))
-svfloat32_t svcvt_f32_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f64_z)))
-svfloat32_t svcvt_f32_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_m)))
-svfloat32_t svcvt_f32_s32_m(svfloat32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_x)))
-svfloat32_t svcvt_f32_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_z)))
-svfloat32_t svcvt_f32_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s64_m)))
-svfloat32_t svcvt_f32_s64_m(svfloat32_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s64_x)))
-svfloat32_t svcvt_f32_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s64_z)))
-svfloat32_t svcvt_f32_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_m)))
-svfloat32_t svcvt_f32_u32_m(svfloat32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_x)))
-svfloat32_t svcvt_f32_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_z)))
-svfloat32_t svcvt_f32_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u64_m)))
-svfloat32_t svcvt_f32_u64_m(svfloat32_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u64_x)))
-svfloat32_t svcvt_f32_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u64_z)))
-svfloat32_t svcvt_f32_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f16_m)))
-svfloat64_t svcvt_f64_f16_m(svfloat64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f16_x)))
-svfloat64_t svcvt_f64_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f16_z)))
-svfloat64_t svcvt_f64_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f32_m)))
-svfloat64_t svcvt_f64_f32_m(svfloat64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f32_x)))
-svfloat64_t svcvt_f64_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f32_z)))
-svfloat64_t svcvt_f64_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s32_m)))
-svfloat64_t svcvt_f64_s32_m(svfloat64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s32_x)))
-svfloat64_t svcvt_f64_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s32_z)))
-svfloat64_t svcvt_f64_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s64_m)))
-svfloat64_t svcvt_f64_s64_m(svfloat64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s64_x)))
-svfloat64_t svcvt_f64_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s64_z)))
-svfloat64_t svcvt_f64_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u32_m)))
-svfloat64_t svcvt_f64_u32_m(svfloat64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u32_x)))
-svfloat64_t svcvt_f64_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u32_z)))
-svfloat64_t svcvt_f64_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u64_m)))
-svfloat64_t svcvt_f64_u64_m(svfloat64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u64_x)))
-svfloat64_t svcvt_f64_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u64_z)))
-svfloat64_t svcvt_f64_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s16_f16_m)))
-svint16_t svcvt_s16_f16_m(svint16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s16_f16_x)))
-svint16_t svcvt_s16_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s16_f16_z)))
-svint16_t svcvt_s16_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f16_m)))
-svint32_t svcvt_s32_f16_m(svint32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f16_x)))
-svint32_t svcvt_s32_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f16_z)))
-svint32_t svcvt_s32_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_m)))
-svint32_t svcvt_s32_f32_m(svint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_x)))
-svint32_t svcvt_s32_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_z)))
-svint32_t svcvt_s32_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f64_m)))
-svint32_t svcvt_s32_f64_m(svint32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f64_x)))
-svint32_t svcvt_s32_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f64_z)))
-svint32_t svcvt_s32_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f16_m)))
-svint64_t svcvt_s64_f16_m(svint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f16_x)))
-svint64_t svcvt_s64_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f16_z)))
-svint64_t svcvt_s64_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f32_m)))
-svint64_t svcvt_s64_f32_m(svint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f32_x)))
-svint64_t svcvt_s64_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f32_z)))
-svint64_t svcvt_s64_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f64_m)))
-svint64_t svcvt_s64_f64_m(svint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f64_x)))
-svint64_t svcvt_s64_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f64_z)))
-svint64_t svcvt_s64_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u16_f16_m)))
-svuint16_t svcvt_u16_f16_m(svuint16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u16_f16_x)))
-svuint16_t svcvt_u16_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u16_f16_z)))
-svuint16_t svcvt_u16_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f16_m)))
-svuint32_t svcvt_u32_f16_m(svuint32_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f16_x)))
-svuint32_t svcvt_u32_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f16_z)))
-svuint32_t svcvt_u32_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_m)))
-svuint32_t svcvt_u32_f32_m(svuint32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_x)))
-svuint32_t svcvt_u32_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_z)))
-svuint32_t svcvt_u32_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f64_m)))
-svuint32_t svcvt_u32_f64_m(svuint32_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f64_x)))
-svuint32_t svcvt_u32_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f64_z)))
-svuint32_t svcvt_u32_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f16_m)))
-svuint64_t svcvt_u64_f16_m(svuint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f16_x)))
-svuint64_t svcvt_u64_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f16_z)))
-svuint64_t svcvt_u64_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f32_m)))
-svuint64_t svcvt_u64_f32_m(svuint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f32_x)))
-svuint64_t svcvt_u64_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f32_z)))
-svuint64_t svcvt_u64_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f64_m)))
-svuint64_t svcvt_u64_f64_m(svuint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f64_x)))
-svuint64_t svcvt_u64_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f64_z)))
-svuint64_t svcvt_u64_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f64_m)))
-svfloat64_t svdiv_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f32_m)))
-svfloat32_t svdiv_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f16_m)))
-svfloat16_t svdiv_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f64_x)))
-svfloat64_t svdiv_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f32_x)))
-svfloat32_t svdiv_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f16_x)))
-svfloat16_t svdiv_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f64_z)))
-svfloat64_t svdiv_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f32_z)))
-svfloat32_t svdiv_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f16_z)))
-svfloat16_t svdiv_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s32_m)))
-svint32_t svdiv_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s64_m)))
-svint64_t svdiv_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s32_x)))
-svint32_t svdiv_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s64_x)))
-svint64_t svdiv_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s32_z)))
-svint32_t svdiv_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s64_z)))
-svint64_t svdiv_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u32_m)))
-svuint32_t svdiv_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u64_m)))
-svuint64_t svdiv_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u32_x)))
-svuint32_t svdiv_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u64_x)))
-svuint64_t svdiv_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u32_z)))
-svuint32_t svdiv_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u64_z)))
-svuint64_t svdiv_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f64_m)))
-svfloat64_t svdiv_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f32_m)))
-svfloat32_t svdiv_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f16_m)))
-svfloat16_t svdiv_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f64_x)))
-svfloat64_t svdiv_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f32_x)))
-svfloat32_t svdiv_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f16_x)))
-svfloat16_t svdiv_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f64_z)))
-svfloat64_t svdiv_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f32_z)))
-svfloat32_t svdiv_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f16_z)))
-svfloat16_t svdiv_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s32_m)))
-svint32_t svdiv_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s64_m)))
-svint64_t svdiv_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s32_x)))
-svint32_t svdiv_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s64_x)))
-svint64_t svdiv_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s32_z)))
-svint32_t svdiv_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s64_z)))
-svint64_t svdiv_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u32_m)))
-svuint32_t svdiv_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u64_m)))
-svuint64_t svdiv_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u32_x)))
-svuint32_t svdiv_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u64_x)))
-svuint64_t svdiv_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u32_z)))
-svuint32_t svdiv_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u64_z)))
-svuint64_t svdiv_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f64_m)))
-svfloat64_t svdivr_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f32_m)))
-svfloat32_t svdivr_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f16_m)))
-svfloat16_t svdivr_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f64_x)))
-svfloat64_t svdivr_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f32_x)))
-svfloat32_t svdivr_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f16_x)))
-svfloat16_t svdivr_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f64_z)))
-svfloat64_t svdivr_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f32_z)))
-svfloat32_t svdivr_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f16_z)))
-svfloat16_t svdivr_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s32_m)))
-svint32_t svdivr_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s64_m)))
-svint64_t svdivr_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s32_x)))
-svint32_t svdivr_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s64_x)))
-svint64_t svdivr_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s32_z)))
-svint32_t svdivr_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s64_z)))
-svint64_t svdivr_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u32_m)))
-svuint32_t svdivr_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u64_m)))
-svuint64_t svdivr_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u32_x)))
-svuint32_t svdivr_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u64_x)))
-svuint64_t svdivr_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u32_z)))
-svuint32_t svdivr_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u64_z)))
-svuint64_t svdivr_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f64_m)))
-svfloat64_t svdivr_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f32_m)))
-svfloat32_t svdivr_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f16_m)))
-svfloat16_t svdivr_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f64_x)))
-svfloat64_t svdivr_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f32_x)))
-svfloat32_t svdivr_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f16_x)))
-svfloat16_t svdivr_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f64_z)))
-svfloat64_t svdivr_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f32_z)))
-svfloat32_t svdivr_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f16_z)))
-svfloat16_t svdivr_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s32_m)))
-svint32_t svdivr_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s64_m)))
-svint64_t svdivr_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s32_x)))
-svint32_t svdivr_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s64_x)))
-svint64_t svdivr_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s32_z)))
-svint32_t svdivr_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s64_z)))
-svint64_t svdivr_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u32_m)))
-svuint32_t svdivr_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u64_m)))
-svuint64_t svdivr_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u32_x)))
-svuint32_t svdivr_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u64_x)))
-svuint64_t svdivr_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u32_z)))
-svuint32_t svdivr_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u64_z)))
-svuint64_t svdivr_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_s32)))
-svint32_t svdot_n_s32(svint32_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_s64)))
-svint64_t svdot_n_s64(svint64_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_u32)))
-svuint32_t svdot_n_u32(svuint32_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_u64)))
-svuint64_t svdot_n_u64(svuint64_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_s32)))
-svint32_t svdot_s32(svint32_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_s64)))
-svint64_t svdot_s64(svint64_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_u32)))
-svuint32_t svdot_u32(svuint32_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_u64)))
-svuint64_t svdot_u64(svuint64_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_s32)))
-svint32_t svdot_lane_s32(svint32_t, svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_s64)))
-svint64_t svdot_lane_s64(svint64_t, svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_u32)))
-svuint32_t svdot_lane_u32(svuint32_t, svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_u64)))
-svuint64_t svdot_lane_u64(svuint64_t, svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8)))
-svuint8_t svdup_n_u8(uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32)))
-svuint32_t svdup_n_u32(uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64)))
-svuint64_t svdup_n_u64(uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16)))
-svuint16_t svdup_n_u16(uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8)))
-svint8_t svdup_n_s8(int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64)))
-svfloat64_t svdup_n_f64(float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32)))
-svfloat32_t svdup_n_f32(float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16)))
-svfloat16_t svdup_n_f16(float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32)))
-svint32_t svdup_n_s32(int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64)))
-svint64_t svdup_n_s64(int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16)))
-svint16_t svdup_n_s16(int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8_m)))
-svuint8_t svdup_n_u8_m(svuint8_t, svbool_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32_m)))
-svuint32_t svdup_n_u32_m(svuint32_t, svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64_m)))
-svuint64_t svdup_n_u64_m(svuint64_t, svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16_m)))
-svuint16_t svdup_n_u16_m(svuint16_t, svbool_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8_m)))
-svint8_t svdup_n_s8_m(svint8_t, svbool_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64_m)))
-svfloat64_t svdup_n_f64_m(svfloat64_t, svbool_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32_m)))
-svfloat32_t svdup_n_f32_m(svfloat32_t, svbool_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16_m)))
-svfloat16_t svdup_n_f16_m(svfloat16_t, svbool_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32_m)))
-svint32_t svdup_n_s32_m(svint32_t, svbool_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64_m)))
-svint64_t svdup_n_s64_m(svint64_t, svbool_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16_m)))
-svint16_t svdup_n_s16_m(svint16_t, svbool_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b8)))
-svbool_t svdup_n_b8(bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b32)))
-svbool_t svdup_n_b32(bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b64)))
-svbool_t svdup_n_b64(bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b16)))
-svbool_t svdup_n_b16(bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8_x)))
-svuint8_t svdup_n_u8_x(svbool_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32_x)))
-svuint32_t svdup_n_u32_x(svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64_x)))
-svuint64_t svdup_n_u64_x(svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16_x)))
-svuint16_t svdup_n_u16_x(svbool_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8_x)))
-svint8_t svdup_n_s8_x(svbool_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64_x)))
-svfloat64_t svdup_n_f64_x(svbool_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32_x)))
-svfloat32_t svdup_n_f32_x(svbool_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16_x)))
-svfloat16_t svdup_n_f16_x(svbool_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32_x)))
-svint32_t svdup_n_s32_x(svbool_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64_x)))
-svint64_t svdup_n_s64_x(svbool_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16_x)))
-svint16_t svdup_n_s16_x(svbool_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8_z)))
-svuint8_t svdup_n_u8_z(svbool_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32_z)))
-svuint32_t svdup_n_u32_z(svbool_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64_z)))
-svuint64_t svdup_n_u64_z(svbool_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16_z)))
-svuint16_t svdup_n_u16_z(svbool_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8_z)))
-svint8_t svdup_n_s8_z(svbool_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64_z)))
-svfloat64_t svdup_n_f64_z(svbool_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32_z)))
-svfloat32_t svdup_n_f32_z(svbool_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16_z)))
-svfloat16_t svdup_n_f16_z(svbool_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32_z)))
-svint32_t svdup_n_s32_z(svbool_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64_z)))
-svint64_t svdup_n_s64_z(svbool_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16_z)))
-svint16_t svdup_n_s16_z(svbool_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u8)))
-svuint8_t svdup_lane_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u32)))
-svuint32_t svdup_lane_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u64)))
-svuint64_t svdup_lane_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u16)))
-svuint16_t svdup_lane_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s8)))
-svint8_t svdup_lane_s8(svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_f64)))
-svfloat64_t svdup_lane_f64(svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_f32)))
-svfloat32_t svdup_lane_f32(svfloat32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_f16)))
-svfloat16_t svdup_lane_f16(svfloat16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s32)))
-svint32_t svdup_lane_s32(svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s64)))
-svint64_t svdup_lane_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s16)))
-svint16_t svdup_lane_s16(svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
-svuint8_t svdupq_n_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
-svint8_t svdupq_n_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u16)))
-svuint16_t svdupq_n_u16(uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f16)))
-svfloat16_t svdupq_n_f16(float16_t, float16_t, float16_t, float16_t, float16_t, float16_t, float16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s16)))
-svint16_t svdupq_n_s16(int16_t, int16_t, int16_t, int16_t, int16_t, int16_t, int16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u32)))
-svuint32_t svdupq_n_u32(uint32_t, uint32_t, uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f32)))
-svfloat32_t svdupq_n_f32(float32_t, float32_t, float32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s32)))
-svint32_t svdupq_n_s32(int32_t, int32_t, int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u64)))
-svuint64_t svdupq_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f64)))
-svfloat64_t svdupq_n_f64(float64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s64)))
-svint64_t svdupq_n_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
-svbool_t svdupq_n_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b16)))
-svbool_t svdupq_n_b16(bool, bool, bool, bool, bool, bool, bool, bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b32)))
-svbool_t svdupq_n_b32(bool, bool, bool, bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b64)))
-svbool_t svdupq_n_b64(bool, bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u8)))
-svuint8_t svdupq_lane_u8(svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u32)))
-svuint32_t svdupq_lane_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u64)))
-svuint64_t svdupq_lane_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u16)))
-svuint16_t svdupq_lane_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s8)))
-svint8_t svdupq_lane_s8(svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_f64)))
-svfloat64_t svdupq_lane_f64(svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_f32)))
-svfloat32_t svdupq_lane_f32(svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_f16)))
-svfloat16_t svdupq_lane_f16(svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s32)))
-svint32_t svdupq_lane_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s64)))
-svint64_t svdupq_lane_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s16)))
-svint16_t svdupq_lane_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_b_z)))
-svbool_t sveor_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u8_m)))
-svuint8_t sveor_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u32_m)))
-svuint32_t sveor_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u64_m)))
-svuint64_t sveor_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u16_m)))
-svuint16_t sveor_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s8_m)))
-svint8_t sveor_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s32_m)))
-svint32_t sveor_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s64_m)))
-svint64_t sveor_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s16_m)))
-svint16_t sveor_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u8_x)))
-svuint8_t sveor_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u32_x)))
-svuint32_t sveor_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u64_x)))
-svuint64_t sveor_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u16_x)))
-svuint16_t sveor_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s8_x)))
-svint8_t sveor_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s32_x)))
-svint32_t sveor_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s64_x)))
-svint64_t sveor_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s16_x)))
-svint16_t sveor_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u8_z)))
-svuint8_t sveor_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u32_z)))
-svuint32_t sveor_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u64_z)))
-svuint64_t sveor_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u16_z)))
-svuint16_t sveor_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s8_z)))
-svint8_t sveor_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s32_z)))
-svint32_t sveor_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s64_z)))
-svint64_t sveor_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s16_z)))
-svint16_t sveor_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u8_m)))
-svuint8_t sveor_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u32_m)))
-svuint32_t sveor_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u64_m)))
-svuint64_t sveor_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u16_m)))
-svuint16_t sveor_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s8_m)))
-svint8_t sveor_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s32_m)))
-svint32_t sveor_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s64_m)))
-svint64_t sveor_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s16_m)))
-svint16_t sveor_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u8_x)))
-svuint8_t sveor_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u32_x)))
-svuint32_t sveor_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u64_x)))
-svuint64_t sveor_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u16_x)))
-svuint16_t sveor_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s8_x)))
-svint8_t sveor_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s32_x)))
-svint32_t sveor_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s64_x)))
-svint64_t sveor_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s16_x)))
-svint16_t sveor_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u8_z)))
-svuint8_t sveor_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u32_z)))
-svuint32_t sveor_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u64_z)))
-svuint64_t sveor_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u16_z)))
-svuint16_t sveor_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s8_z)))
-svint8_t sveor_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s32_z)))
-svint32_t sveor_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s64_z)))
-svint64_t sveor_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s16_z)))
-svint16_t sveor_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u8)))
-uint8_t sveorv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u32)))
-uint32_t sveorv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u64)))
-uint64_t sveorv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u16)))
-uint16_t sveorv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s8)))
-int8_t sveorv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s32)))
-int32_t sveorv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s64)))
-int64_t sveorv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s16)))
-int16_t sveorv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u8)))
-svuint8_t svext_u8(svuint8_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u32)))
-svuint32_t svext_u32(svuint32_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u64)))
-svuint64_t svext_u64(svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u16)))
-svuint16_t svext_u16(svuint16_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s8)))
-svint8_t svext_s8(svint8_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_f64)))
-svfloat64_t svext_f64(svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_f32)))
-svfloat32_t svext_f32(svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_f16)))
-svfloat16_t svext_f16(svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s32)))
-svint32_t svext_s32(svint32_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s64)))
-svint64_t svext_s64(svint64_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s16)))
-svint16_t svext_s16(svint16_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s32_m)))
-svint32_t svextb_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s64_m)))
-svint64_t svextb_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s16_m)))
-svint16_t svextb_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s32_x)))
-svint32_t svextb_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s64_x)))
-svint64_t svextb_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s16_x)))
-svint16_t svextb_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s32_z)))
-svint32_t svextb_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s64_z)))
-svint64_t svextb_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s16_z)))
-svint16_t svextb_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u32_m)))
-svuint32_t svextb_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u64_m)))
-svuint64_t svextb_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u16_m)))
-svuint16_t svextb_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u32_x)))
-svuint32_t svextb_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u64_x)))
-svuint64_t svextb_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u16_x)))
-svuint16_t svextb_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u32_z)))
-svuint32_t svextb_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u64_z)))
-svuint64_t svextb_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u16_z)))
-svuint16_t svextb_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s32_m)))
-svint32_t svexth_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s64_m)))
-svint64_t svexth_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s32_x)))
-svint32_t svexth_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s64_x)))
-svint64_t svexth_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s32_z)))
-svint32_t svexth_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s64_z)))
-svint64_t svexth_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u32_m)))
-svuint32_t svexth_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u64_m)))
-svuint64_t svexth_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u32_x)))
-svuint32_t svexth_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u64_x)))
-svuint64_t svexth_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u32_z)))
-svuint32_t svexth_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u64_z)))
-svuint64_t svexth_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_s64_m)))
-svint64_t svextw_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_s64_x)))
-svint64_t svextw_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_s64_z)))
-svint64_t svextw_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_u64_m)))
-svuint64_t svextw_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_u64_x)))
-svuint64_t svextw_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_u64_z)))
-svuint64_t svextw_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u8)))
-svuint8_t svget2_u8(svuint8x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u32)))
-svuint32_t svget2_u32(svuint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u64)))
-svuint64_t svget2_u64(svuint64x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u16)))
-svuint16_t svget2_u16(svuint16x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s8)))
-svint8_t svget2_s8(svint8x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_f64)))
-svfloat64_t svget2_f64(svfloat64x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_f32)))
-svfloat32_t svget2_f32(svfloat32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_f16)))
-svfloat16_t svget2_f16(svfloat16x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s32)))
-svint32_t svget2_s32(svint32x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s64)))
-svint64_t svget2_s64(svint64x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s16)))
-svint16_t svget2_s16(svint16x2_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u8)))
-svuint8_t svget3_u8(svuint8x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u32)))
-svuint32_t svget3_u32(svuint32x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u64)))
-svuint64_t svget3_u64(svuint64x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u16)))
-svuint16_t svget3_u16(svuint16x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s8)))
-svint8_t svget3_s8(svint8x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_f64)))
-svfloat64_t svget3_f64(svfloat64x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_f32)))
-svfloat32_t svget3_f32(svfloat32x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_f16)))
-svfloat16_t svget3_f16(svfloat16x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s32)))
-svint32_t svget3_s32(svint32x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s64)))
-svint64_t svget3_s64(svint64x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s16)))
-svint16_t svget3_s16(svint16x3_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u8)))
-svuint8_t svget4_u8(svuint8x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u32)))
-svuint32_t svget4_u32(svuint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u64)))
-svuint64_t svget4_u64(svuint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u16)))
-svuint16_t svget4_u16(svuint16x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s8)))
-svint8_t svget4_s8(svint8x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_f64)))
-svfloat64_t svget4_f64(svfloat64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_f32)))
-svfloat32_t svget4_f32(svfloat32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_f16)))
-svfloat16_t svget4_f16(svfloat16x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s32)))
-svint32_t svget4_s32(svint32x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s64)))
-svint64_t svget4_s64(svint64x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s16)))
-svint16_t svget4_s16(svint16x4_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_u8)))
-svuint8_t svindex_u8(uint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_u32)))
-svuint32_t svindex_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_u64)))
-svuint64_t svindex_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_u16)))
-svuint16_t svindex_u16(uint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_s8)))
-svint8_t svindex_s8(int8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_s32)))
-svint32_t svindex_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_s64)))
-svint64_t svindex_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svindex_s16)))
-svint16_t svindex_s16(int16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u8)))
-svuint8_t svinsr_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u32)))
-svuint32_t svinsr_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u64)))
-svuint64_t svinsr_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u16)))
-svuint16_t svinsr_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s8)))
-svint8_t svinsr_n_s8(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_f64)))
-svfloat64_t svinsr_n_f64(svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_f32)))
-svfloat32_t svinsr_n_f32(svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_f16)))
-svfloat16_t svinsr_n_f16(svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s32)))
-svint32_t svinsr_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s64)))
-svint64_t svinsr_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s16)))
-svint16_t svinsr_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u8)))
-uint8_t svlasta_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u32)))
-uint32_t svlasta_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u64)))
-uint64_t svlasta_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u16)))
-uint16_t svlasta_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s8)))
-int8_t svlasta_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_f64)))
-float64_t svlasta_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_f32)))
-float32_t svlasta_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_f16)))
-float16_t svlasta_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s32)))
-int32_t svlasta_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s64)))
-int64_t svlasta_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s16)))
-int16_t svlasta_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u8)))
-uint8_t svlastb_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u32)))
-uint32_t svlastb_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u64)))
-uint64_t svlastb_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u16)))
-uint16_t svlastb_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s8)))
-int8_t svlastb_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_f64)))
-float64_t svlastb_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_f32)))
-float32_t svlastb_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_f16)))
-float16_t svlastb_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s32)))
-int32_t svlastb_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s64)))
-int64_t svlastb_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s16)))
-int16_t svlastb_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u8)))
-svuint8_t svld1_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u32)))
-svuint32_t svld1_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u64)))
-svuint64_t svld1_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u16)))
-svuint16_t svld1_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s8)))
-svint8_t svld1_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f64)))
-svfloat64_t svld1_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f32)))
-svfloat32_t svld1_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f16)))
-svfloat16_t svld1_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s32)))
-svint32_t svld1_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s64)))
-svint64_t svld1_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s16)))
-svint16_t svld1_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u8)))
-svuint8_t svld1_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u32)))
-svuint32_t svld1_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u64)))
-svuint64_t svld1_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u16)))
-svuint16_t svld1_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s8)))
-svint8_t svld1_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f64)))
-svfloat64_t svld1_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f32)))
-svfloat32_t svld1_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f16)))
-svfloat16_t svld1_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s32)))
-svint32_t svld1_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s64)))
-svint64_t svld1_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s16)))
-svint16_t svld1_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u8)))
-svuint8_t svld1rq_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u32)))
-svuint32_t svld1rq_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u64)))
-svuint64_t svld1rq_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u16)))
-svuint16_t svld1rq_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s8)))
-svint8_t svld1rq_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_f64)))
-svfloat64_t svld1rq_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_f32)))
-svfloat32_t svld1rq_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_f16)))
-svfloat16_t svld1rq_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s32)))
-svint32_t svld1rq_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s64)))
-svint64_t svld1rq_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s16)))
-svint16_t svld1rq_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_vnum_u32)))
-svuint32_t svld1sb_vnum_u32(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_vnum_u64)))
-svuint64_t svld1sb_vnum_u64(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_vnum_u16)))
-svuint16_t svld1sb_vnum_u16(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_vnum_s32)))
-svint32_t svld1sb_vnum_s32(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_vnum_s64)))
-svint64_t svld1sb_vnum_s64(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_vnum_s16)))
-svint16_t svld1sb_vnum_s16(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_u32)))
-svuint32_t svld1sb_u32(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_u64)))
-svuint64_t svld1sb_u64(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_u16)))
-svuint16_t svld1sb_u16(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_s32)))
-svint32_t svld1sb_s32(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_s64)))
-svint64_t svld1sb_s64(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sb_s16)))
-svint16_t svld1sb_s16(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_vnum_u32)))
-svuint32_t svld1sh_vnum_u32(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_vnum_u64)))
-svuint64_t svld1sh_vnum_u64(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_vnum_s32)))
-svint32_t svld1sh_vnum_s32(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_vnum_s64)))
-svint64_t svld1sh_vnum_s64(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_u32)))
-svuint32_t svld1sh_u32(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_u64)))
-svuint64_t svld1sh_u64(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_s32)))
-svint32_t svld1sh_s32(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sh_s64)))
-svint64_t svld1sh_s64(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_vnum_u64)))
-svuint64_t svld1sw_vnum_u64(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_vnum_s64)))
-svint64_t svld1sw_vnum_s64(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_u64)))
-svuint64_t svld1sw_u64(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1sw_s64)))
-svint64_t svld1sw_s64(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_vnum_u32)))
-svuint32_t svld1ub_vnum_u32(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_vnum_u64)))
-svuint64_t svld1ub_vnum_u64(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_vnum_u16)))
-svuint16_t svld1ub_vnum_u16(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_vnum_s32)))
-svint32_t svld1ub_vnum_s32(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_vnum_s64)))
-svint64_t svld1ub_vnum_s64(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_vnum_s16)))
-svint16_t svld1ub_vnum_s16(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_u32)))
-svuint32_t svld1ub_u32(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_u64)))
-svuint64_t svld1ub_u64(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_u16)))
-svuint16_t svld1ub_u16(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_s32)))
-svint32_t svld1ub_s32(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_s64)))
-svint64_t svld1ub_s64(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1ub_s16)))
-svint16_t svld1ub_s16(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_vnum_u32)))
-svuint32_t svld1uh_vnum_u32(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_vnum_u64)))
-svuint64_t svld1uh_vnum_u64(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_vnum_s32)))
-svint32_t svld1uh_vnum_s32(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_vnum_s64)))
-svint64_t svld1uh_vnum_s64(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_u32)))
-svuint32_t svld1uh_u32(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_u64)))
-svuint64_t svld1uh_u64(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_s32)))
-svint32_t svld1uh_s32(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uh_s64)))
-svint64_t svld1uh_s64(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_vnum_u64)))
-svuint64_t svld1uw_vnum_u64(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_vnum_s64)))
-svint64_t svld1uw_vnum_s64(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_u64)))
-svuint64_t svld1uw_u64(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1uw_s64)))
-svint64_t svld1uw_s64(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u8)))
-svuint8x2_t svld2_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u32)))
-svuint32x2_t svld2_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u64)))
-svuint64x2_t svld2_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u16)))
-svuint16x2_t svld2_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s8)))
-svint8x2_t svld2_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_f64)))
-svfloat64x2_t svld2_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_f32)))
-svfloat32x2_t svld2_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_f16)))
-svfloat16x2_t svld2_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s32)))
-svint32x2_t svld2_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s64)))
-svint64x2_t svld2_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s16)))
-svint16x2_t svld2_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u8)))
-svuint8x2_t svld2_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u32)))
-svuint32x2_t svld2_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u64)))
-svuint64x2_t svld2_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u16)))
-svuint16x2_t svld2_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s8)))
-svint8x2_t svld2_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_f64)))
-svfloat64x2_t svld2_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_f32)))
-svfloat32x2_t svld2_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_f16)))
-svfloat16x2_t svld2_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s32)))
-svint32x2_t svld2_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s64)))
-svint64x2_t svld2_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s16)))
-svint16x2_t svld2_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u8)))
-svuint8x3_t svld3_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u32)))
-svuint32x3_t svld3_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u64)))
-svuint64x3_t svld3_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u16)))
-svuint16x3_t svld3_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s8)))
-svint8x3_t svld3_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_f64)))
-svfloat64x3_t svld3_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_f32)))
-svfloat32x3_t svld3_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_f16)))
-svfloat16x3_t svld3_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s32)))
-svint32x3_t svld3_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s64)))
-svint64x3_t svld3_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s16)))
-svint16x3_t svld3_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u8)))
-svuint8x3_t svld3_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u32)))
-svuint32x3_t svld3_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u64)))
-svuint64x3_t svld3_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u16)))
-svuint16x3_t svld3_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s8)))
-svint8x3_t svld3_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_f64)))
-svfloat64x3_t svld3_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_f32)))
-svfloat32x3_t svld3_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_f16)))
-svfloat16x3_t svld3_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s32)))
-svint32x3_t svld3_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s64)))
-svint64x3_t svld3_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s16)))
-svint16x3_t svld3_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u8)))
-svuint8x4_t svld4_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u32)))
-svuint32x4_t svld4_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u64)))
-svuint64x4_t svld4_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u16)))
-svuint16x4_t svld4_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s8)))
-svint8x4_t svld4_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_f64)))
-svfloat64x4_t svld4_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_f32)))
-svfloat32x4_t svld4_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_f16)))
-svfloat16x4_t svld4_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s32)))
-svint32x4_t svld4_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s64)))
-svint64x4_t svld4_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s16)))
-svint16x4_t svld4_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u8)))
-svuint8x4_t svld4_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u32)))
-svuint32x4_t svld4_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u64)))
-svuint64x4_t svld4_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u16)))
-svuint16x4_t svld4_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s8)))
-svint8x4_t svld4_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_f64)))
-svfloat64x4_t svld4_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_f32)))
-svfloat32x4_t svld4_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_f16)))
-svfloat16x4_t svld4_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s32)))
-svint32x4_t svld4_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s64)))
-svint64x4_t svld4_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s16)))
-svint16x4_t svld4_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u8)))
-svuint8_t svldnt1_u8(svbool_t, uint8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u32)))
-svuint32_t svldnt1_u32(svbool_t, uint32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u64)))
-svuint64_t svldnt1_u64(svbool_t, uint64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u16)))
-svuint16_t svldnt1_u16(svbool_t, uint16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s8)))
-svint8_t svldnt1_s8(svbool_t, int8_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f64)))
-svfloat64_t svldnt1_f64(svbool_t, float64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f32)))
-svfloat32_t svldnt1_f32(svbool_t, float32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f16)))
-svfloat16_t svldnt1_f16(svbool_t, float16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s32)))
-svint32_t svldnt1_s32(svbool_t, int32_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s64)))
-svint64_t svldnt1_s64(svbool_t, int64_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s16)))
-svint16_t svldnt1_s16(svbool_t, int16_t const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u8)))
-svuint8_t svldnt1_vnum_u8(svbool_t, uint8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u32)))
-svuint32_t svldnt1_vnum_u32(svbool_t, uint32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u64)))
-svuint64_t svldnt1_vnum_u64(svbool_t, uint64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u16)))
-svuint16_t svldnt1_vnum_u16(svbool_t, uint16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s8)))
-svint8_t svldnt1_vnum_s8(svbool_t, int8_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f64)))
-svfloat64_t svldnt1_vnum_f64(svbool_t, float64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f32)))
-svfloat32_t svldnt1_vnum_f32(svbool_t, float32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f16)))
-svfloat16_t svldnt1_vnum_f16(svbool_t, float16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s32)))
-svint32_t svldnt1_vnum_s32(svbool_t, int32_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s64)))
-svint64_t svldnt1_vnum_s64(svbool_t, int64_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s16)))
-svint16_t svldnt1_vnum_s16(svbool_t, int16_t const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u8)))
-uint64_t svlen_u8(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u32)))
-uint64_t svlen_u32(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u64)))
-uint64_t svlen_u64(svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u16)))
-uint64_t svlen_u16(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s8)))
-uint64_t svlen_s8(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_f64)))
-uint64_t svlen_f64(svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_f32)))
-uint64_t svlen_f32(svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_f16)))
-uint64_t svlen_f16(svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s32)))
-uint64_t svlen_s32(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s64)))
-uint64_t svlen_s64(svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s16)))
-uint64_t svlen_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u8_m)))
-svuint8_t svlsl_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u32_m)))
-svuint32_t svlsl_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u64_m)))
-svuint64_t svlsl_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u16_m)))
-svuint16_t svlsl_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s8_m)))
-svint8_t svlsl_n_s8_m(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s32_m)))
-svint32_t svlsl_n_s32_m(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s64_m)))
-svint64_t svlsl_n_s64_m(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s16_m)))
-svint16_t svlsl_n_s16_m(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u8_x)))
-svuint8_t svlsl_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u32_x)))
-svuint32_t svlsl_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u64_x)))
-svuint64_t svlsl_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u16_x)))
-svuint16_t svlsl_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s8_x)))
-svint8_t svlsl_n_s8_x(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s32_x)))
-svint32_t svlsl_n_s32_x(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s64_x)))
-svint64_t svlsl_n_s64_x(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s16_x)))
-svint16_t svlsl_n_s16_x(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u8_z)))
-svuint8_t svlsl_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u32_z)))
-svuint32_t svlsl_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u64_z)))
-svuint64_t svlsl_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u16_z)))
-svuint16_t svlsl_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s8_z)))
-svint8_t svlsl_n_s8_z(svbool_t, svint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s32_z)))
-svint32_t svlsl_n_s32_z(svbool_t, svint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s64_z)))
-svint64_t svlsl_n_s64_z(svbool_t, svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s16_z)))
-svint16_t svlsl_n_s16_z(svbool_t, svint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u8_m)))
-svuint8_t svlsl_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u32_m)))
-svuint32_t svlsl_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u64_m)))
-svuint64_t svlsl_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u16_m)))
-svuint16_t svlsl_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s8_m)))
-svint8_t svlsl_s8_m(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s32_m)))
-svint32_t svlsl_s32_m(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s64_m)))
-svint64_t svlsl_s64_m(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s16_m)))
-svint16_t svlsl_s16_m(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u8_x)))
-svuint8_t svlsl_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u32_x)))
-svuint32_t svlsl_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u64_x)))
-svuint64_t svlsl_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u16_x)))
-svuint16_t svlsl_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s8_x)))
-svint8_t svlsl_s8_x(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s32_x)))
-svint32_t svlsl_s32_x(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s64_x)))
-svint64_t svlsl_s64_x(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s16_x)))
-svint16_t svlsl_s16_x(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u8_z)))
-svuint8_t svlsl_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u32_z)))
-svuint32_t svlsl_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u64_z)))
-svuint64_t svlsl_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u16_z)))
-svuint16_t svlsl_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s8_z)))
-svint8_t svlsl_s8_z(svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s32_z)))
-svint32_t svlsl_s32_z(svbool_t, svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s64_z)))
-svint64_t svlsl_s64_z(svbool_t, svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s16_z)))
-svint16_t svlsl_s16_z(svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u8_m)))
-svuint8_t svlsl_wide_n_u8_m(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u32_m)))
-svuint32_t svlsl_wide_n_u32_m(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u16_m)))
-svuint16_t svlsl_wide_n_u16_m(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s8_m)))
-svint8_t svlsl_wide_n_s8_m(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s32_m)))
-svint32_t svlsl_wide_n_s32_m(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s16_m)))
-svint16_t svlsl_wide_n_s16_m(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u8_x)))
-svuint8_t svlsl_wide_n_u8_x(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u32_x)))
-svuint32_t svlsl_wide_n_u32_x(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u16_x)))
-svuint16_t svlsl_wide_n_u16_x(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s8_x)))
-svint8_t svlsl_wide_n_s8_x(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s32_x)))
-svint32_t svlsl_wide_n_s32_x(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s16_x)))
-svint16_t svlsl_wide_n_s16_x(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u8_z)))
-svuint8_t svlsl_wide_n_u8_z(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u32_z)))
-svuint32_t svlsl_wide_n_u32_z(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u16_z)))
-svuint16_t svlsl_wide_n_u16_z(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s8_z)))
-svint8_t svlsl_wide_n_s8_z(svbool_t, svint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s32_z)))
-svint32_t svlsl_wide_n_s32_z(svbool_t, svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s16_z)))
-svint16_t svlsl_wide_n_s16_z(svbool_t, svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u8_m)))
-svuint8_t svlsl_wide_u8_m(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u32_m)))
-svuint32_t svlsl_wide_u32_m(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u16_m)))
-svuint16_t svlsl_wide_u16_m(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s8_m)))
-svint8_t svlsl_wide_s8_m(svbool_t, svint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s32_m)))
-svint32_t svlsl_wide_s32_m(svbool_t, svint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s16_m)))
-svint16_t svlsl_wide_s16_m(svbool_t, svint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u8_x)))
-svuint8_t svlsl_wide_u8_x(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u32_x)))
-svuint32_t svlsl_wide_u32_x(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u16_x)))
-svuint16_t svlsl_wide_u16_x(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s8_x)))
-svint8_t svlsl_wide_s8_x(svbool_t, svint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s32_x)))
-svint32_t svlsl_wide_s32_x(svbool_t, svint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s16_x)))
-svint16_t svlsl_wide_s16_x(svbool_t, svint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u8_z)))
-svuint8_t svlsl_wide_u8_z(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u32_z)))
-svuint32_t svlsl_wide_u32_z(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u16_z)))
-svuint16_t svlsl_wide_u16_z(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s8_z)))
-svint8_t svlsl_wide_s8_z(svbool_t, svint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s32_z)))
-svint32_t svlsl_wide_s32_z(svbool_t, svint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s16_z)))
-svint16_t svlsl_wide_s16_z(svbool_t, svint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u8_m)))
-svuint8_t svlsr_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u32_m)))
-svuint32_t svlsr_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u64_m)))
-svuint64_t svlsr_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u16_m)))
-svuint16_t svlsr_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u8_x)))
-svuint8_t svlsr_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u32_x)))
-svuint32_t svlsr_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u64_x)))
-svuint64_t svlsr_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u16_x)))
-svuint16_t svlsr_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u8_z)))
-svuint8_t svlsr_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u32_z)))
-svuint32_t svlsr_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u64_z)))
-svuint64_t svlsr_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u16_z)))
-svuint16_t svlsr_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u8_m)))
-svuint8_t svlsr_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u32_m)))
-svuint32_t svlsr_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u64_m)))
-svuint64_t svlsr_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u16_m)))
-svuint16_t svlsr_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u8_x)))
-svuint8_t svlsr_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u32_x)))
-svuint32_t svlsr_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u64_x)))
-svuint64_t svlsr_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u16_x)))
-svuint16_t svlsr_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u8_z)))
-svuint8_t svlsr_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u32_z)))
-svuint32_t svlsr_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u64_z)))
-svuint64_t svlsr_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u16_z)))
-svuint16_t svlsr_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u8_m)))
-svuint8_t svlsr_wide_n_u8_m(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u32_m)))
-svuint32_t svlsr_wide_n_u32_m(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u16_m)))
-svuint16_t svlsr_wide_n_u16_m(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u8_x)))
-svuint8_t svlsr_wide_n_u8_x(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u32_x)))
-svuint32_t svlsr_wide_n_u32_x(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u16_x)))
-svuint16_t svlsr_wide_n_u16_x(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u8_z)))
-svuint8_t svlsr_wide_n_u8_z(svbool_t, svuint8_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u32_z)))
-svuint32_t svlsr_wide_n_u32_z(svbool_t, svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u16_z)))
-svuint16_t svlsr_wide_n_u16_z(svbool_t, svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u8_m)))
-svuint8_t svlsr_wide_u8_m(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u32_m)))
-svuint32_t svlsr_wide_u32_m(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u16_m)))
-svuint16_t svlsr_wide_u16_m(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u8_x)))
-svuint8_t svlsr_wide_u8_x(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u32_x)))
-svuint32_t svlsr_wide_u32_x(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u16_x)))
-svuint16_t svlsr_wide_u16_x(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u8_z)))
-svuint8_t svlsr_wide_u8_z(svbool_t, svuint8_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u32_z)))
-svuint32_t svlsr_wide_u32_z(svbool_t, svuint32_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u16_z)))
-svuint16_t svlsr_wide_u16_z(svbool_t, svuint16_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f64_m)))
-svfloat64_t svmad_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f32_m)))
-svfloat32_t svmad_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f16_m)))
-svfloat16_t svmad_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f64_x)))
-svfloat64_t svmad_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f32_x)))
-svfloat32_t svmad_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f16_x)))
-svfloat16_t svmad_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f64_z)))
-svfloat64_t svmad_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f32_z)))
-svfloat32_t svmad_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f16_z)))
-svfloat16_t svmad_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u8_m)))
-svuint8_t svmad_n_u8_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u32_m)))
-svuint32_t svmad_n_u32_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u64_m)))
-svuint64_t svmad_n_u64_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u16_m)))
-svuint16_t svmad_n_u16_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s8_m)))
-svint8_t svmad_n_s8_m(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s32_m)))
-svint32_t svmad_n_s32_m(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s64_m)))
-svint64_t svmad_n_s64_m(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s16_m)))
-svint16_t svmad_n_s16_m(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u8_x)))
-svuint8_t svmad_n_u8_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u32_x)))
-svuint32_t svmad_n_u32_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u64_x)))
-svuint64_t svmad_n_u64_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u16_x)))
-svuint16_t svmad_n_u16_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s8_x)))
-svint8_t svmad_n_s8_x(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s32_x)))
-svint32_t svmad_n_s32_x(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s64_x)))
-svint64_t svmad_n_s64_x(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s16_x)))
-svint16_t svmad_n_s16_x(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u8_z)))
-svuint8_t svmad_n_u8_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u32_z)))
-svuint32_t svmad_n_u32_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u64_z)))
-svuint64_t svmad_n_u64_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u16_z)))
-svuint16_t svmad_n_u16_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s8_z)))
-svint8_t svmad_n_s8_z(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s32_z)))
-svint32_t svmad_n_s32_z(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s64_z)))
-svint64_t svmad_n_s64_z(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s16_z)))
-svint16_t svmad_n_s16_z(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f64_m)))
-svfloat64_t svmad_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f32_m)))
-svfloat32_t svmad_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f16_m)))
-svfloat16_t svmad_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f64_x)))
-svfloat64_t svmad_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f32_x)))
-svfloat32_t svmad_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f16_x)))
-svfloat16_t svmad_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f64_z)))
-svfloat64_t svmad_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f32_z)))
-svfloat32_t svmad_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f16_z)))
-svfloat16_t svmad_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u8_m)))
-svuint8_t svmad_u8_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u32_m)))
-svuint32_t svmad_u32_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u64_m)))
-svuint64_t svmad_u64_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u16_m)))
-svuint16_t svmad_u16_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s8_m)))
-svint8_t svmad_s8_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s32_m)))
-svint32_t svmad_s32_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s64_m)))
-svint64_t svmad_s64_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s16_m)))
-svint16_t svmad_s16_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u8_x)))
-svuint8_t svmad_u8_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u32_x)))
-svuint32_t svmad_u32_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u64_x)))
-svuint64_t svmad_u64_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u16_x)))
-svuint16_t svmad_u16_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s8_x)))
-svint8_t svmad_s8_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s32_x)))
-svint32_t svmad_s32_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s64_x)))
-svint64_t svmad_s64_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s16_x)))
-svint16_t svmad_s16_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u8_z)))
-svuint8_t svmad_u8_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u32_z)))
-svuint32_t svmad_u32_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u64_z)))
-svuint64_t svmad_u64_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u16_z)))
-svuint16_t svmad_u16_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s8_z)))
-svint8_t svmad_s8_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s32_z)))
-svint32_t svmad_s32_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s64_z)))
-svint64_t svmad_s64_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s16_z)))
-svint16_t svmad_s16_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f64_m)))
-svfloat64_t svmax_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f32_m)))
-svfloat32_t svmax_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f16_m)))
-svfloat16_t svmax_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f64_x)))
-svfloat64_t svmax_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f32_x)))
-svfloat32_t svmax_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f16_x)))
-svfloat16_t svmax_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f64_z)))
-svfloat64_t svmax_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f32_z)))
-svfloat32_t svmax_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f16_z)))
-svfloat16_t svmax_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s8_m)))
-svint8_t svmax_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s32_m)))
-svint32_t svmax_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s64_m)))
-svint64_t svmax_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s16_m)))
-svint16_t svmax_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s8_x)))
-svint8_t svmax_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s32_x)))
-svint32_t svmax_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s64_x)))
-svint64_t svmax_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s16_x)))
-svint16_t svmax_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s8_z)))
-svint8_t svmax_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s32_z)))
-svint32_t svmax_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s64_z)))
-svint64_t svmax_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s16_z)))
-svint16_t svmax_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u8_m)))
-svuint8_t svmax_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u32_m)))
-svuint32_t svmax_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u64_m)))
-svuint64_t svmax_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u16_m)))
-svuint16_t svmax_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u8_x)))
-svuint8_t svmax_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u32_x)))
-svuint32_t svmax_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u64_x)))
-svuint64_t svmax_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u16_x)))
-svuint16_t svmax_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u8_z)))
-svuint8_t svmax_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u32_z)))
-svuint32_t svmax_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u64_z)))
-svuint64_t svmax_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u16_z)))
-svuint16_t svmax_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_m)))
-svfloat64_t svmax_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_m)))
-svfloat32_t svmax_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_m)))
-svfloat16_t svmax_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_x)))
-svfloat64_t svmax_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_x)))
-svfloat32_t svmax_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_x)))
-svfloat16_t svmax_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_z)))
-svfloat64_t svmax_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_z)))
-svfloat32_t svmax_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_z)))
-svfloat16_t svmax_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_m)))
-svint8_t svmax_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_m)))
-svint32_t svmax_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_m)))
-svint64_t svmax_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_m)))
-svint16_t svmax_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_x)))
-svint8_t svmax_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_x)))
-svint32_t svmax_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_x)))
-svint64_t svmax_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_x)))
-svint16_t svmax_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_z)))
-svint8_t svmax_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_z)))
-svint32_t svmax_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_z)))
-svint64_t svmax_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_z)))
-svint16_t svmax_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_m)))
-svuint8_t svmax_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_m)))
-svuint32_t svmax_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_m)))
-svuint64_t svmax_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_m)))
-svuint16_t svmax_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_x)))
-svuint8_t svmax_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_x)))
-svuint32_t svmax_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_x)))
-svuint64_t svmax_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_x)))
-svuint16_t svmax_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_z)))
-svuint8_t svmax_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_z)))
-svuint32_t svmax_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_z)))
-svuint64_t svmax_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_z)))
-svuint16_t svmax_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f64_m)))
-svfloat64_t svmaxnm_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f32_m)))
-svfloat32_t svmaxnm_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f16_m)))
-svfloat16_t svmaxnm_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f64_x)))
-svfloat64_t svmaxnm_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f32_x)))
-svfloat32_t svmaxnm_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f16_x)))
-svfloat16_t svmaxnm_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f64_z)))
-svfloat64_t svmaxnm_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f32_z)))
-svfloat32_t svmaxnm_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f16_z)))
-svfloat16_t svmaxnm_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_m)))
-svfloat64_t svmaxnm_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_m)))
-svfloat32_t svmaxnm_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_m)))
-svfloat16_t svmaxnm_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_x)))
-svfloat64_t svmaxnm_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_x)))
-svfloat32_t svmaxnm_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_x)))
-svfloat16_t svmaxnm_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_z)))
-svfloat64_t svmaxnm_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_z)))
-svfloat32_t svmaxnm_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_z)))
-svfloat16_t svmaxnm_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmv_f64)))
-float64_t svmaxnmv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmv_f32)))
-float32_t svmaxnmv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmv_f16)))
-float16_t svmaxnmv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_f64)))
-float64_t svmaxv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_f32)))
-float32_t svmaxv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_f16)))
-float16_t svmaxv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s8)))
-int8_t svmaxv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s32)))
-int32_t svmaxv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s64)))
-int64_t svmaxv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s16)))
-int16_t svmaxv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u8)))
-uint8_t svmaxv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u32)))
-uint32_t svmaxv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u64)))
-uint64_t svmaxv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u16)))
-uint16_t svmaxv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f64_m)))
-svfloat64_t svmin_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f32_m)))
-svfloat32_t svmin_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f16_m)))
-svfloat16_t svmin_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f64_x)))
-svfloat64_t svmin_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f32_x)))
-svfloat32_t svmin_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f16_x)))
-svfloat16_t svmin_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f64_z)))
-svfloat64_t svmin_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f32_z)))
-svfloat32_t svmin_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f16_z)))
-svfloat16_t svmin_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s8_m)))
-svint8_t svmin_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s32_m)))
-svint32_t svmin_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s64_m)))
-svint64_t svmin_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s16_m)))
-svint16_t svmin_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s8_x)))
-svint8_t svmin_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s32_x)))
-svint32_t svmin_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s64_x)))
-svint64_t svmin_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s16_x)))
-svint16_t svmin_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s8_z)))
-svint8_t svmin_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s32_z)))
-svint32_t svmin_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s64_z)))
-svint64_t svmin_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s16_z)))
-svint16_t svmin_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u8_m)))
-svuint8_t svmin_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u32_m)))
-svuint32_t svmin_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u64_m)))
-svuint64_t svmin_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u16_m)))
-svuint16_t svmin_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u8_x)))
-svuint8_t svmin_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u32_x)))
-svuint32_t svmin_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u64_x)))
-svuint64_t svmin_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u16_x)))
-svuint16_t svmin_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u8_z)))
-svuint8_t svmin_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u32_z)))
-svuint32_t svmin_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u64_z)))
-svuint64_t svmin_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u16_z)))
-svuint16_t svmin_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_m)))
-svfloat64_t svmin_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_m)))
-svfloat32_t svmin_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_m)))
-svfloat16_t svmin_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_x)))
-svfloat64_t svmin_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_x)))
-svfloat32_t svmin_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_x)))
-svfloat16_t svmin_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_z)))
-svfloat64_t svmin_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_z)))
-svfloat32_t svmin_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_z)))
-svfloat16_t svmin_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_m)))
-svint8_t svmin_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_m)))
-svint32_t svmin_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_m)))
-svint64_t svmin_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_m)))
-svint16_t svmin_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_x)))
-svint8_t svmin_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_x)))
-svint32_t svmin_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_x)))
-svint64_t svmin_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_x)))
-svint16_t svmin_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_z)))
-svint8_t svmin_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_z)))
-svint32_t svmin_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_z)))
-svint64_t svmin_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_z)))
-svint16_t svmin_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_m)))
-svuint8_t svmin_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_m)))
-svuint32_t svmin_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_m)))
-svuint64_t svmin_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_m)))
-svuint16_t svmin_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_x)))
-svuint8_t svmin_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_x)))
-svuint32_t svmin_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_x)))
-svuint64_t svmin_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_x)))
-svuint16_t svmin_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_z)))
-svuint8_t svmin_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_z)))
-svuint32_t svmin_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_z)))
-svuint64_t svmin_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_z)))
-svuint16_t svmin_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f64_m)))
-svfloat64_t svminnm_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f32_m)))
-svfloat32_t svminnm_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f16_m)))
-svfloat16_t svminnm_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f64_x)))
-svfloat64_t svminnm_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f32_x)))
-svfloat32_t svminnm_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f16_x)))
-svfloat16_t svminnm_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f64_z)))
-svfloat64_t svminnm_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f32_z)))
-svfloat32_t svminnm_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f16_z)))
-svfloat16_t svminnm_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_m)))
-svfloat64_t svminnm_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_m)))
-svfloat32_t svminnm_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_m)))
-svfloat16_t svminnm_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_x)))
-svfloat64_t svminnm_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_x)))
-svfloat32_t svminnm_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_x)))
-svfloat16_t svminnm_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_z)))
-svfloat64_t svminnm_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_z)))
-svfloat32_t svminnm_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_z)))
-svfloat16_t svminnm_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmv_f64)))
-float64_t svminnmv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmv_f32)))
-float32_t svminnmv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmv_f16)))
-float16_t svminnmv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_f64)))
-float64_t svminv_f64(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_f32)))
-float32_t svminv_f32(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_f16)))
-float16_t svminv_f16(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s8)))
-int8_t svminv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s32)))
-int32_t svminv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s64)))
-int64_t svminv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s16)))
-int16_t svminv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u8)))
-uint8_t svminv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u32)))
-uint32_t svminv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u64)))
-uint64_t svminv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u16)))
-uint16_t svminv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f64_m)))
-svfloat64_t svmla_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f32_m)))
-svfloat32_t svmla_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f16_m)))
-svfloat16_t svmla_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f64_x)))
-svfloat64_t svmla_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f32_x)))
-svfloat32_t svmla_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f16_x)))
-svfloat16_t svmla_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f64_z)))
-svfloat64_t svmla_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f32_z)))
-svfloat32_t svmla_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f16_z)))
-svfloat16_t svmla_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u8_m)))
-svuint8_t svmla_n_u8_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u32_m)))
-svuint32_t svmla_n_u32_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u64_m)))
-svuint64_t svmla_n_u64_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u16_m)))
-svuint16_t svmla_n_u16_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s8_m)))
-svint8_t svmla_n_s8_m(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s32_m)))
-svint32_t svmla_n_s32_m(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s64_m)))
-svint64_t svmla_n_s64_m(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s16_m)))
-svint16_t svmla_n_s16_m(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u8_x)))
-svuint8_t svmla_n_u8_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u32_x)))
-svuint32_t svmla_n_u32_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u64_x)))
-svuint64_t svmla_n_u64_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u16_x)))
-svuint16_t svmla_n_u16_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s8_x)))
-svint8_t svmla_n_s8_x(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s32_x)))
-svint32_t svmla_n_s32_x(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s64_x)))
-svint64_t svmla_n_s64_x(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s16_x)))
-svint16_t svmla_n_s16_x(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u8_z)))
-svuint8_t svmla_n_u8_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u32_z)))
-svuint32_t svmla_n_u32_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u64_z)))
-svuint64_t svmla_n_u64_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u16_z)))
-svuint16_t svmla_n_u16_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s8_z)))
-svint8_t svmla_n_s8_z(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s32_z)))
-svint32_t svmla_n_s32_z(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s64_z)))
-svint64_t svmla_n_s64_z(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s16_z)))
-svint16_t svmla_n_s16_z(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f64_m)))
-svfloat64_t svmla_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f32_m)))
-svfloat32_t svmla_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f16_m)))
-svfloat16_t svmla_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f64_x)))
-svfloat64_t svmla_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f32_x)))
-svfloat32_t svmla_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f16_x)))
-svfloat16_t svmla_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f64_z)))
-svfloat64_t svmla_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f32_z)))
-svfloat32_t svmla_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f16_z)))
-svfloat16_t svmla_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u8_m)))
-svuint8_t svmla_u8_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u32_m)))
-svuint32_t svmla_u32_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u64_m)))
-svuint64_t svmla_u64_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u16_m)))
-svuint16_t svmla_u16_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s8_m)))
-svint8_t svmla_s8_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s32_m)))
-svint32_t svmla_s32_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s64_m)))
-svint64_t svmla_s64_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s16_m)))
-svint16_t svmla_s16_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u8_x)))
-svuint8_t svmla_u8_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u32_x)))
-svuint32_t svmla_u32_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u64_x)))
-svuint64_t svmla_u64_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u16_x)))
-svuint16_t svmla_u16_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s8_x)))
-svint8_t svmla_s8_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s32_x)))
-svint32_t svmla_s32_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s64_x)))
-svint64_t svmla_s64_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s16_x)))
-svint16_t svmla_s16_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u8_z)))
-svuint8_t svmla_u8_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u32_z)))
-svuint32_t svmla_u32_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u64_z)))
-svuint64_t svmla_u64_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u16_z)))
-svuint16_t svmla_u16_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s8_z)))
-svint8_t svmla_s8_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s32_z)))
-svint32_t svmla_s32_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s64_z)))
-svint64_t svmla_s64_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s16_z)))
-svint16_t svmla_s16_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_f64)))
-svfloat64_t svmla_lane_f64(svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_f32)))
-svfloat32_t svmla_lane_f32(svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_f16)))
-svfloat16_t svmla_lane_f16(svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f64_m)))
-svfloat64_t svmls_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f32_m)))
-svfloat32_t svmls_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f16_m)))
-svfloat16_t svmls_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f64_x)))
-svfloat64_t svmls_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f32_x)))
-svfloat32_t svmls_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f16_x)))
-svfloat16_t svmls_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f64_z)))
-svfloat64_t svmls_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f32_z)))
-svfloat32_t svmls_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f16_z)))
-svfloat16_t svmls_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u8_m)))
-svuint8_t svmls_n_u8_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u32_m)))
-svuint32_t svmls_n_u32_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u64_m)))
-svuint64_t svmls_n_u64_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u16_m)))
-svuint16_t svmls_n_u16_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s8_m)))
-svint8_t svmls_n_s8_m(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s32_m)))
-svint32_t svmls_n_s32_m(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s64_m)))
-svint64_t svmls_n_s64_m(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s16_m)))
-svint16_t svmls_n_s16_m(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u8_x)))
-svuint8_t svmls_n_u8_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u32_x)))
-svuint32_t svmls_n_u32_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u64_x)))
-svuint64_t svmls_n_u64_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u16_x)))
-svuint16_t svmls_n_u16_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s8_x)))
-svint8_t svmls_n_s8_x(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s32_x)))
-svint32_t svmls_n_s32_x(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s64_x)))
-svint64_t svmls_n_s64_x(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s16_x)))
-svint16_t svmls_n_s16_x(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u8_z)))
-svuint8_t svmls_n_u8_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u32_z)))
-svuint32_t svmls_n_u32_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u64_z)))
-svuint64_t svmls_n_u64_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u16_z)))
-svuint16_t svmls_n_u16_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s8_z)))
-svint8_t svmls_n_s8_z(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s32_z)))
-svint32_t svmls_n_s32_z(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s64_z)))
-svint64_t svmls_n_s64_z(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s16_z)))
-svint16_t svmls_n_s16_z(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f64_m)))
-svfloat64_t svmls_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f32_m)))
-svfloat32_t svmls_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f16_m)))
-svfloat16_t svmls_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f64_x)))
-svfloat64_t svmls_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f32_x)))
-svfloat32_t svmls_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f16_x)))
-svfloat16_t svmls_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f64_z)))
-svfloat64_t svmls_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f32_z)))
-svfloat32_t svmls_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f16_z)))
-svfloat16_t svmls_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u8_m)))
-svuint8_t svmls_u8_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u32_m)))
-svuint32_t svmls_u32_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u64_m)))
-svuint64_t svmls_u64_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u16_m)))
-svuint16_t svmls_u16_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s8_m)))
-svint8_t svmls_s8_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s32_m)))
-svint32_t svmls_s32_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s64_m)))
-svint64_t svmls_s64_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s16_m)))
-svint16_t svmls_s16_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u8_x)))
-svuint8_t svmls_u8_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u32_x)))
-svuint32_t svmls_u32_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u64_x)))
-svuint64_t svmls_u64_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u16_x)))
-svuint16_t svmls_u16_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s8_x)))
-svint8_t svmls_s8_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s32_x)))
-svint32_t svmls_s32_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s64_x)))
-svint64_t svmls_s64_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s16_x)))
-svint16_t svmls_s16_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u8_z)))
-svuint8_t svmls_u8_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u32_z)))
-svuint32_t svmls_u32_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u64_z)))
-svuint64_t svmls_u64_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u16_z)))
-svuint16_t svmls_u16_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s8_z)))
-svint8_t svmls_s8_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s32_z)))
-svint32_t svmls_s32_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s64_z)))
-svint64_t svmls_s64_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s16_z)))
-svint16_t svmls_s16_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_f64)))
-svfloat64_t svmls_lane_f64(svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_f32)))
-svfloat32_t svmls_lane_f32(svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_f16)))
-svfloat16_t svmls_lane_f16(svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmov_b_z)))
-svbool_t svmov_b_z(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f64_m)))
-svfloat64_t svmsb_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f32_m)))
-svfloat32_t svmsb_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f16_m)))
-svfloat16_t svmsb_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f64_x)))
-svfloat64_t svmsb_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f32_x)))
-svfloat32_t svmsb_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f16_x)))
-svfloat16_t svmsb_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f64_z)))
-svfloat64_t svmsb_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f32_z)))
-svfloat32_t svmsb_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f16_z)))
-svfloat16_t svmsb_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u8_m)))
-svuint8_t svmsb_n_u8_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u32_m)))
-svuint32_t svmsb_n_u32_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u64_m)))
-svuint64_t svmsb_n_u64_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u16_m)))
-svuint16_t svmsb_n_u16_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s8_m)))
-svint8_t svmsb_n_s8_m(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s32_m)))
-svint32_t svmsb_n_s32_m(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s64_m)))
-svint64_t svmsb_n_s64_m(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s16_m)))
-svint16_t svmsb_n_s16_m(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u8_x)))
-svuint8_t svmsb_n_u8_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u32_x)))
-svuint32_t svmsb_n_u32_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u64_x)))
-svuint64_t svmsb_n_u64_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u16_x)))
-svuint16_t svmsb_n_u16_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s8_x)))
-svint8_t svmsb_n_s8_x(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s32_x)))
-svint32_t svmsb_n_s32_x(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s64_x)))
-svint64_t svmsb_n_s64_x(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s16_x)))
-svint16_t svmsb_n_s16_x(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u8_z)))
-svuint8_t svmsb_n_u8_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u32_z)))
-svuint32_t svmsb_n_u32_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u64_z)))
-svuint64_t svmsb_n_u64_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u16_z)))
-svuint16_t svmsb_n_u16_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s8_z)))
-svint8_t svmsb_n_s8_z(svbool_t, svint8_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s32_z)))
-svint32_t svmsb_n_s32_z(svbool_t, svint32_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s64_z)))
-svint64_t svmsb_n_s64_z(svbool_t, svint64_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s16_z)))
-svint16_t svmsb_n_s16_z(svbool_t, svint16_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f64_m)))
-svfloat64_t svmsb_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f32_m)))
-svfloat32_t svmsb_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f16_m)))
-svfloat16_t svmsb_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f64_x)))
-svfloat64_t svmsb_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f32_x)))
-svfloat32_t svmsb_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f16_x)))
-svfloat16_t svmsb_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f64_z)))
-svfloat64_t svmsb_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f32_z)))
-svfloat32_t svmsb_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f16_z)))
-svfloat16_t svmsb_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u8_m)))
-svuint8_t svmsb_u8_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u32_m)))
-svuint32_t svmsb_u32_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u64_m)))
-svuint64_t svmsb_u64_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u16_m)))
-svuint16_t svmsb_u16_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s8_m)))
-svint8_t svmsb_s8_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s32_m)))
-svint32_t svmsb_s32_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s64_m)))
-svint64_t svmsb_s64_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s16_m)))
-svint16_t svmsb_s16_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u8_x)))
-svuint8_t svmsb_u8_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u32_x)))
-svuint32_t svmsb_u32_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u64_x)))
-svuint64_t svmsb_u64_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u16_x)))
-svuint16_t svmsb_u16_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s8_x)))
-svint8_t svmsb_s8_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s32_x)))
-svint32_t svmsb_s32_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s64_x)))
-svint64_t svmsb_s64_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s16_x)))
-svint16_t svmsb_s16_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u8_z)))
-svuint8_t svmsb_u8_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u32_z)))
-svuint32_t svmsb_u32_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u64_z)))
-svuint64_t svmsb_u64_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u16_z)))
-svuint16_t svmsb_u16_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s8_z)))
-svint8_t svmsb_s8_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s32_z)))
-svint32_t svmsb_s32_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s64_z)))
-svint64_t svmsb_s64_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s16_z)))
-svint16_t svmsb_s16_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f64_m)))
-svfloat64_t svmul_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f32_m)))
-svfloat32_t svmul_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f16_m)))
-svfloat16_t svmul_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f64_x)))
-svfloat64_t svmul_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f32_x)))
-svfloat32_t svmul_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f16_x)))
-svfloat16_t svmul_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f64_z)))
-svfloat64_t svmul_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f32_z)))
-svfloat32_t svmul_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f16_z)))
-svfloat16_t svmul_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u8_m)))
-svuint8_t svmul_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u32_m)))
-svuint32_t svmul_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u64_m)))
-svuint64_t svmul_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u16_m)))
-svuint16_t svmul_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s8_m)))
-svint8_t svmul_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s32_m)))
-svint32_t svmul_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s64_m)))
-svint64_t svmul_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s16_m)))
-svint16_t svmul_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u8_x)))
-svuint8_t svmul_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u32_x)))
-svuint32_t svmul_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u64_x)))
-svuint64_t svmul_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u16_x)))
-svuint16_t svmul_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s8_x)))
-svint8_t svmul_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s32_x)))
-svint32_t svmul_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s64_x)))
-svint64_t svmul_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s16_x)))
-svint16_t svmul_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u8_z)))
-svuint8_t svmul_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u32_z)))
-svuint32_t svmul_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u64_z)))
-svuint64_t svmul_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u16_z)))
-svuint16_t svmul_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s8_z)))
-svint8_t svmul_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s32_z)))
-svint32_t svmul_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s64_z)))
-svint64_t svmul_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s16_z)))
-svint16_t svmul_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f64_m)))
-svfloat64_t svmul_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f32_m)))
-svfloat32_t svmul_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f16_m)))
-svfloat16_t svmul_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f64_x)))
-svfloat64_t svmul_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f32_x)))
-svfloat32_t svmul_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f16_x)))
-svfloat16_t svmul_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f64_z)))
-svfloat64_t svmul_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f32_z)))
-svfloat32_t svmul_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f16_z)))
-svfloat16_t svmul_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u8_m)))
-svuint8_t svmul_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u32_m)))
-svuint32_t svmul_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u64_m)))
-svuint64_t svmul_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u16_m)))
-svuint16_t svmul_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s8_m)))
-svint8_t svmul_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s32_m)))
-svint32_t svmul_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s64_m)))
-svint64_t svmul_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s16_m)))
-svint16_t svmul_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u8_x)))
-svuint8_t svmul_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u32_x)))
-svuint32_t svmul_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u64_x)))
-svuint64_t svmul_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u16_x)))
-svuint16_t svmul_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s8_x)))
-svint8_t svmul_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s32_x)))
-svint32_t svmul_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s64_x)))
-svint64_t svmul_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s16_x)))
-svint16_t svmul_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u8_z)))
-svuint8_t svmul_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u32_z)))
-svuint32_t svmul_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u64_z)))
-svuint64_t svmul_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u16_z)))
-svuint16_t svmul_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s8_z)))
-svint8_t svmul_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s32_z)))
-svint32_t svmul_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s64_z)))
-svint64_t svmul_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s16_z)))
-svint16_t svmul_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_f64)))
-svfloat64_t svmul_lane_f64(svfloat64_t, svfloat64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_f32)))
-svfloat32_t svmul_lane_f32(svfloat32_t, svfloat32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_f16)))
-svfloat16_t svmul_lane_f16(svfloat16_t, svfloat16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s8_m)))
-svint8_t svmulh_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s32_m)))
-svint32_t svmulh_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s64_m)))
-svint64_t svmulh_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s16_m)))
-svint16_t svmulh_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s8_x)))
-svint8_t svmulh_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s32_x)))
-svint32_t svmulh_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s64_x)))
-svint64_t svmulh_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s16_x)))
-svint16_t svmulh_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s8_z)))
-svint8_t svmulh_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s32_z)))
-svint32_t svmulh_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s64_z)))
-svint64_t svmulh_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s16_z)))
-svint16_t svmulh_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u8_m)))
-svuint8_t svmulh_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u32_m)))
-svuint32_t svmulh_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u64_m)))
-svuint64_t svmulh_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u16_m)))
-svuint16_t svmulh_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u8_x)))
-svuint8_t svmulh_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u32_x)))
-svuint32_t svmulh_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u64_x)))
-svuint64_t svmulh_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u16_x)))
-svuint16_t svmulh_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u8_z)))
-svuint8_t svmulh_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u32_z)))
-svuint32_t svmulh_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u64_z)))
-svuint64_t svmulh_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u16_z)))
-svuint16_t svmulh_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s8_m)))
-svint8_t svmulh_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s32_m)))
-svint32_t svmulh_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s64_m)))
-svint64_t svmulh_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s16_m)))
-svint16_t svmulh_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s8_x)))
-svint8_t svmulh_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s32_x)))
-svint32_t svmulh_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s64_x)))
-svint64_t svmulh_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s16_x)))
-svint16_t svmulh_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s8_z)))
-svint8_t svmulh_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s32_z)))
-svint32_t svmulh_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s64_z)))
-svint64_t svmulh_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s16_z)))
-svint16_t svmulh_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u8_m)))
-svuint8_t svmulh_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u32_m)))
-svuint32_t svmulh_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u64_m)))
-svuint64_t svmulh_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u16_m)))
-svuint16_t svmulh_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u8_x)))
-svuint8_t svmulh_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u32_x)))
-svuint32_t svmulh_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u64_x)))
-svuint64_t svmulh_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u16_x)))
-svuint16_t svmulh_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u8_z)))
-svuint8_t svmulh_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u32_z)))
-svuint32_t svmulh_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u64_z)))
-svuint64_t svmulh_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u16_z)))
-svuint16_t svmulh_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f64_m)))
-svfloat64_t svmulx_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f32_m)))
-svfloat32_t svmulx_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f16_m)))
-svfloat16_t svmulx_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f64_x)))
-svfloat64_t svmulx_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f32_x)))
-svfloat32_t svmulx_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f16_x)))
-svfloat16_t svmulx_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f64_z)))
-svfloat64_t svmulx_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f32_z)))
-svfloat32_t svmulx_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f16_z)))
-svfloat16_t svmulx_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f64_m)))
-svfloat64_t svmulx_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f32_m)))
-svfloat32_t svmulx_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f16_m)))
-svfloat16_t svmulx_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f64_x)))
-svfloat64_t svmulx_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f32_x)))
-svfloat32_t svmulx_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f16_x)))
-svfloat16_t svmulx_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f64_z)))
-svfloat64_t svmulx_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f32_z)))
-svfloat32_t svmulx_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f16_z)))
-svfloat16_t svmulx_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnand_b_z)))
-svbool_t svnand_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f64_m)))
-svfloat64_t svneg_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f32_m)))
-svfloat32_t svneg_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f16_m)))
-svfloat16_t svneg_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f64_x)))
-svfloat64_t svneg_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f32_x)))
-svfloat32_t svneg_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f16_x)))
-svfloat16_t svneg_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f64_z)))
-svfloat64_t svneg_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f32_z)))
-svfloat32_t svneg_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f16_z)))
-svfloat16_t svneg_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s8_m)))
-svint8_t svneg_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s32_m)))
-svint32_t svneg_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s64_m)))
-svint64_t svneg_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s16_m)))
-svint16_t svneg_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s8_x)))
-svint8_t svneg_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s32_x)))
-svint32_t svneg_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s64_x)))
-svint64_t svneg_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s16_x)))
-svint16_t svneg_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s8_z)))
-svint8_t svneg_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s32_z)))
-svint32_t svneg_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s64_z)))
-svint64_t svneg_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s16_z)))
-svint16_t svneg_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f64_m)))
-svfloat64_t svnmad_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f32_m)))
-svfloat32_t svnmad_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f16_m)))
-svfloat16_t svnmad_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f64_x)))
-svfloat64_t svnmad_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f32_x)))
-svfloat32_t svnmad_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f16_x)))
-svfloat16_t svnmad_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f64_z)))
-svfloat64_t svnmad_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f32_z)))
-svfloat32_t svnmad_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f16_z)))
-svfloat16_t svnmad_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f64_m)))
-svfloat64_t svnmad_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f32_m)))
-svfloat32_t svnmad_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f16_m)))
-svfloat16_t svnmad_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f64_x)))
-svfloat64_t svnmad_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f32_x)))
-svfloat32_t svnmad_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f16_x)))
-svfloat16_t svnmad_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f64_z)))
-svfloat64_t svnmad_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f32_z)))
-svfloat32_t svnmad_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f16_z)))
-svfloat16_t svnmad_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f64_m)))
-svfloat64_t svnmla_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f32_m)))
-svfloat32_t svnmla_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f16_m)))
-svfloat16_t svnmla_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f64_x)))
-svfloat64_t svnmla_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f32_x)))
-svfloat32_t svnmla_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f16_x)))
-svfloat16_t svnmla_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f64_z)))
-svfloat64_t svnmla_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f32_z)))
-svfloat32_t svnmla_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f16_z)))
-svfloat16_t svnmla_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f64_m)))
-svfloat64_t svnmla_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f32_m)))
-svfloat32_t svnmla_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f16_m)))
-svfloat16_t svnmla_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f64_x)))
-svfloat64_t svnmla_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f32_x)))
-svfloat32_t svnmla_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f16_x)))
-svfloat16_t svnmla_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f64_z)))
-svfloat64_t svnmla_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f32_z)))
-svfloat32_t svnmla_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f16_z)))
-svfloat16_t svnmla_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f64_m)))
-svfloat64_t svnmls_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f32_m)))
-svfloat32_t svnmls_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f16_m)))
-svfloat16_t svnmls_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f64_x)))
-svfloat64_t svnmls_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f32_x)))
-svfloat32_t svnmls_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f16_x)))
-svfloat16_t svnmls_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f64_z)))
-svfloat64_t svnmls_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f32_z)))
-svfloat32_t svnmls_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f16_z)))
-svfloat16_t svnmls_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f64_m)))
-svfloat64_t svnmls_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f32_m)))
-svfloat32_t svnmls_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f16_m)))
-svfloat16_t svnmls_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f64_x)))
-svfloat64_t svnmls_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f32_x)))
-svfloat32_t svnmls_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f16_x)))
-svfloat16_t svnmls_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f64_z)))
-svfloat64_t svnmls_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f32_z)))
-svfloat32_t svnmls_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f16_z)))
-svfloat16_t svnmls_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f64_m)))
-svfloat64_t svnmsb_n_f64_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f32_m)))
-svfloat32_t svnmsb_n_f32_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f16_m)))
-svfloat16_t svnmsb_n_f16_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f64_x)))
-svfloat64_t svnmsb_n_f64_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f32_x)))
-svfloat32_t svnmsb_n_f32_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f16_x)))
-svfloat16_t svnmsb_n_f16_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f64_z)))
-svfloat64_t svnmsb_n_f64_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f32_z)))
-svfloat32_t svnmsb_n_f32_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f16_z)))
-svfloat16_t svnmsb_n_f16_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f64_m)))
-svfloat64_t svnmsb_f64_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f32_m)))
-svfloat32_t svnmsb_f32_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f16_m)))
-svfloat16_t svnmsb_f16_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f64_x)))
-svfloat64_t svnmsb_f64_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f32_x)))
-svfloat32_t svnmsb_f32_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f16_x)))
-svfloat16_t svnmsb_f16_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f64_z)))
-svfloat64_t svnmsb_f64_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f32_z)))
-svfloat32_t svnmsb_f32_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f16_z)))
-svfloat16_t svnmsb_f16_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnor_b_z)))
-svbool_t svnor_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_b_z)))
-svbool_t svnot_b_z(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u8_m)))
-svuint8_t svnot_u8_m(svuint8_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u32_m)))
-svuint32_t svnot_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u64_m)))
-svuint64_t svnot_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u16_m)))
-svuint16_t svnot_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s8_m)))
-svint8_t svnot_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s32_m)))
-svint32_t svnot_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s64_m)))
-svint64_t svnot_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s16_m)))
-svint16_t svnot_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u8_x)))
-svuint8_t svnot_u8_x(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u32_x)))
-svuint32_t svnot_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u64_x)))
-svuint64_t svnot_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u16_x)))
-svuint16_t svnot_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s8_x)))
-svint8_t svnot_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s32_x)))
-svint32_t svnot_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s64_x)))
-svint64_t svnot_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s16_x)))
-svint16_t svnot_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u8_z)))
-svuint8_t svnot_u8_z(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u32_z)))
-svuint32_t svnot_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u64_z)))
-svuint64_t svnot_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u16_z)))
-svuint16_t svnot_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s8_z)))
-svint8_t svnot_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s32_z)))
-svint32_t svnot_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s64_z)))
-svint64_t svnot_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s16_z)))
-svint16_t svnot_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorn_b_z)))
-svbool_t svorn_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_b_z)))
-svbool_t svorr_b_z(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u8_m)))
-svuint8_t svorr_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u32_m)))
-svuint32_t svorr_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u64_m)))
-svuint64_t svorr_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u16_m)))
-svuint16_t svorr_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s8_m)))
-svint8_t svorr_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s32_m)))
-svint32_t svorr_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s64_m)))
-svint64_t svorr_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s16_m)))
-svint16_t svorr_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u8_x)))
-svuint8_t svorr_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u32_x)))
-svuint32_t svorr_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u64_x)))
-svuint64_t svorr_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u16_x)))
-svuint16_t svorr_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s8_x)))
-svint8_t svorr_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s32_x)))
-svint32_t svorr_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s64_x)))
-svint64_t svorr_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s16_x)))
-svint16_t svorr_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u8_z)))
-svuint8_t svorr_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u32_z)))
-svuint32_t svorr_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u64_z)))
-svuint64_t svorr_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u16_z)))
-svuint16_t svorr_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s8_z)))
-svint8_t svorr_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s32_z)))
-svint32_t svorr_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s64_z)))
-svint64_t svorr_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s16_z)))
-svint16_t svorr_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u8_m)))
-svuint8_t svorr_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u32_m)))
-svuint32_t svorr_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u64_m)))
-svuint64_t svorr_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u16_m)))
-svuint16_t svorr_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s8_m)))
-svint8_t svorr_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s32_m)))
-svint32_t svorr_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s64_m)))
-svint64_t svorr_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s16_m)))
-svint16_t svorr_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u8_x)))
-svuint8_t svorr_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u32_x)))
-svuint32_t svorr_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u64_x)))
-svuint64_t svorr_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u16_x)))
-svuint16_t svorr_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s8_x)))
-svint8_t svorr_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s32_x)))
-svint32_t svorr_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s64_x)))
-svint64_t svorr_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s16_x)))
-svint16_t svorr_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u8_z)))
-svuint8_t svorr_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u32_z)))
-svuint32_t svorr_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u64_z)))
-svuint64_t svorr_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u16_z)))
-svuint16_t svorr_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s8_z)))
-svint8_t svorr_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s32_z)))
-svint32_t svorr_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s64_z)))
-svint64_t svorr_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s16_z)))
-svint16_t svorr_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u8)))
-uint8_t svorv_u8(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u32)))
-uint32_t svorv_u32(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u64)))
-uint64_t svorv_u64(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u16)))
-uint16_t svorv_u16(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s8)))
-int8_t svorv_s8(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s32)))
-int32_t svorv_s32(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s64)))
-int64_t svorv_s64(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
-int16_t svorv_s16(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
-svbool_t svpfalse_b(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
-svbool_t svpfirst_b(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpnext_b8)))
-svbool_t svpnext_b8(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpnext_b32)))
-svbool_t svpnext_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpnext_b64)))
-svbool_t svpnext_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpnext_b16)))
-svbool_t svpnext_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb)))
-void svprfb(svbool_t, void const *, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_vnum)))
-void svprfb_vnum(svbool_t, void const *, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd)))
-void svprfd(svbool_t, void const *, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_vnum)))
-void svprfd_vnum(svbool_t, void const *, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh)))
-void svprfh(svbool_t, void const *, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_vnum)))
-void svprfh_vnum(svbool_t, void const *, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw)))
-void svprfw(svbool_t, void const *, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_vnum)))
-void svprfw_vnum(svbool_t, void const *, int64_t, enum svprfop);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptest_any)))
-bool svptest_any(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptest_first)))
-bool svptest_first(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptest_last)))
-bool svptest_last(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_pat_b8)))
-svbool_t svptrue_pat_b8(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_pat_b32)))
-svbool_t svptrue_pat_b32(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_pat_b64)))
-svbool_t svptrue_pat_b64(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_pat_b16)))
-svbool_t svptrue_pat_b16(enum svpattern);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b8)))
-svbool_t svptrue_b8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b32)))
-svbool_t svptrue_b32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b64)))
-svbool_t svptrue_b64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b16)))
-svbool_t svptrue_b16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8)))
-svint8_t svqadd_n_s8(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32)))
-svint32_t svqadd_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64)))
-svint64_t svqadd_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16)))
-svint16_t svqadd_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8)))
-svuint8_t svqadd_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32)))
-svuint32_t svqadd_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64)))
-svuint64_t svqadd_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16)))
-svuint16_t svqadd_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8)))
-svint8_t svqadd_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32)))
-svint32_t svqadd_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64)))
-svint64_t svqadd_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16)))
-svint16_t svqadd_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8)))
-svuint8_t svqadd_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32)))
-svuint32_t svqadd_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64)))
-svuint64_t svqadd_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16)))
-svuint16_t svqadd_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_s32)))
-int32_t svqdecb_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_s64)))
-int64_t svqdecb_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u32)))
-uint32_t svqdecb_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u64)))
-uint64_t svqdecb_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s32)))
-int32_t svqdecb_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s64)))
-int64_t svqdecb_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u32)))
-uint32_t svqdecb_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u64)))
-uint64_t svqdecb_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s32)))
-int32_t svqdecd_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s64)))
-int64_t svqdecd_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_u32)))
-uint32_t svqdecd_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_u64)))
-uint64_t svqdecd_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_s64)))
-svint64_t svqdecd_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_u64)))
-svuint64_t svqdecd_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s32)))
-int32_t svqdecd_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s64)))
-int64_t svqdecd_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u32)))
-uint32_t svqdecd_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u64)))
-uint64_t svqdecd_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_s64)))
-svint64_t svqdecd_pat_s64(svint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_u64)))
-svuint64_t svqdecd_pat_u64(svuint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s32)))
-int32_t svqdech_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s64)))
-int64_t svqdech_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_u32)))
-uint32_t svqdech_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_u64)))
-uint64_t svqdech_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_s16)))
-svint16_t svqdech_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_u16)))
-svuint16_t svqdech_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s32)))
-int32_t svqdech_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s64)))
-int64_t svqdech_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u32)))
-uint32_t svqdech_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u64)))
-uint64_t svqdech_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_s16)))
-svint16_t svqdech_pat_s16(svint16_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_u16)))
-svuint16_t svqdech_pat_u16(svuint16_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b8)))
-int32_t svqdecp_n_s32_b8(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b32)))
-int32_t svqdecp_n_s32_b32(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b64)))
-int32_t svqdecp_n_s32_b64(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b16)))
-int32_t svqdecp_n_s32_b16(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b8)))
-int64_t svqdecp_n_s64_b8(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b32)))
-int64_t svqdecp_n_s64_b32(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b64)))
-int64_t svqdecp_n_s64_b64(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b16)))
-int64_t svqdecp_n_s64_b16(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b8)))
-uint32_t svqdecp_n_u32_b8(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b32)))
-uint32_t svqdecp_n_u32_b32(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b64)))
-uint32_t svqdecp_n_u32_b64(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b16)))
-uint32_t svqdecp_n_u32_b16(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b8)))
-uint64_t svqdecp_n_u64_b8(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b32)))
-uint64_t svqdecp_n_u64_b32(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b64)))
-uint64_t svqdecp_n_u64_b64(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b16)))
-uint64_t svqdecp_n_u64_b16(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_s32)))
-svint32_t svqdecp_s32(svint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_s64)))
-svint64_t svqdecp_s64(svint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_s16)))
-svint16_t svqdecp_s16(svint16_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_u32)))
-svuint32_t svqdecp_u32(svuint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_u64)))
-svuint64_t svqdecp_u64(svuint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_u16)))
-svuint16_t svqdecp_u16(svuint16_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_s32)))
-int32_t svqdecw_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_s64)))
-int64_t svqdecw_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_u32)))
-uint32_t svqdecw_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_u64)))
-uint64_t svqdecw_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_s32)))
-svint32_t svqdecw_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_u32)))
-svuint32_t svqdecw_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s32)))
-int32_t svqdecw_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s64)))
-int64_t svqdecw_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u32)))
-uint32_t svqdecw_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u64)))
-uint64_t svqdecw_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_s32)))
-svint32_t svqdecw_pat_s32(svint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_u32)))
-svuint32_t svqdecw_pat_u32(svuint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s32)))
-int32_t svqincb_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s64)))
-int64_t svqincb_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u32)))
-uint32_t svqincb_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u64)))
-uint64_t svqincb_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s32)))
-int32_t svqincb_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s64)))
-int64_t svqincb_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u32)))
-uint32_t svqincb_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u64)))
-uint64_t svqincb_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s32)))
-int32_t svqincd_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s64)))
-int64_t svqincd_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_u32)))
-uint32_t svqincd_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_u64)))
-uint64_t svqincd_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_s64)))
-svint64_t svqincd_s64(svint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_u64)))
-svuint64_t svqincd_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s32)))
-int32_t svqincd_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s64)))
-int64_t svqincd_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u32)))
-uint32_t svqincd_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u64)))
-uint64_t svqincd_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_s64)))
-svint64_t svqincd_pat_s64(svint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_u64)))
-svuint64_t svqincd_pat_u64(svuint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s32)))
-int32_t svqinch_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s64)))
-int64_t svqinch_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_u32)))
-uint32_t svqinch_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_u64)))
-uint64_t svqinch_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_s16)))
-svint16_t svqinch_s16(svint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_u16)))
-svuint16_t svqinch_u16(svuint16_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s32)))
-int32_t svqinch_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s64)))
-int64_t svqinch_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u32)))
-uint32_t svqinch_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u64)))
-uint64_t svqinch_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_s16)))
-svint16_t svqinch_pat_s16(svint16_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_u16)))
-svuint16_t svqinch_pat_u16(svuint16_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b8)))
-int32_t svqincp_n_s32_b8(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b32)))
-int32_t svqincp_n_s32_b32(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b64)))
-int32_t svqincp_n_s32_b64(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b16)))
-int32_t svqincp_n_s32_b16(int32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b8)))
-int64_t svqincp_n_s64_b8(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b32)))
-int64_t svqincp_n_s64_b32(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b64)))
-int64_t svqincp_n_s64_b64(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b16)))
-int64_t svqincp_n_s64_b16(int64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b8)))
-uint32_t svqincp_n_u32_b8(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b32)))
-uint32_t svqincp_n_u32_b32(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b64)))
-uint32_t svqincp_n_u32_b64(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b16)))
-uint32_t svqincp_n_u32_b16(uint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b8)))
-uint64_t svqincp_n_u64_b8(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b32)))
-uint64_t svqincp_n_u64_b32(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b64)))
-uint64_t svqincp_n_u64_b64(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b16)))
-uint64_t svqincp_n_u64_b16(uint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_s32)))
-svint32_t svqincp_s32(svint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_s64)))
-svint64_t svqincp_s64(svint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_s16)))
-svint16_t svqincp_s16(svint16_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_u32)))
-svuint32_t svqincp_u32(svuint32_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_u64)))
-svuint64_t svqincp_u64(svuint64_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_u16)))
-svuint16_t svqincp_u16(svuint16_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_s32)))
-int32_t svqincw_n_s32(int32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_s64)))
-int64_t svqincw_n_s64(int64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_u32)))
-uint32_t svqincw_n_u32(uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_u64)))
-uint64_t svqincw_n_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_s32)))
-svint32_t svqincw_s32(svint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_u32)))
-svuint32_t svqincw_u32(svuint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s32)))
-int32_t svqincw_pat_n_s32(int32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s64)))
-int64_t svqincw_pat_n_s64(int64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u32)))
-uint32_t svqincw_pat_n_u32(uint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u64)))
-uint64_t svqincw_pat_n_u64(uint64_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_s32)))
-svint32_t svqincw_pat_s32(svint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_u32)))
-svuint32_t svqincw_pat_u32(svuint32_t, enum svpattern, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8)))
-svint8_t svqsub_n_s8(svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32)))
-svint32_t svqsub_n_s32(svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64)))
-svint64_t svqsub_n_s64(svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16)))
-svint16_t svqsub_n_s16(svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8)))
-svuint8_t svqsub_n_u8(svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32)))
-svuint32_t svqsub_n_u32(svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64)))
-svuint64_t svqsub_n_u64(svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16)))
-svuint16_t svqsub_n_u16(svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8)))
-svint8_t svqsub_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32)))
-svint32_t svqsub_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64)))
-svint64_t svqsub_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16)))
-svint16_t svqsub_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8)))
-svuint8_t svqsub_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32)))
-svuint32_t svqsub_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64)))
-svuint64_t svqsub_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16)))
-svuint16_t svqsub_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u8_m)))
-svuint8_t svrbit_u8_m(svuint8_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u32_m)))
-svuint32_t svrbit_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u64_m)))
-svuint64_t svrbit_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u16_m)))
-svuint16_t svrbit_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s8_m)))
-svint8_t svrbit_s8_m(svint8_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s32_m)))
-svint32_t svrbit_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s64_m)))
-svint64_t svrbit_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_m)))
-svint16_t svrbit_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u8_x)))
-svuint8_t svrbit_u8_x(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u32_x)))
-svuint32_t svrbit_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u64_x)))
-svuint64_t svrbit_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u16_x)))
-svuint16_t svrbit_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s8_x)))
-svint8_t svrbit_s8_x(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s32_x)))
-svint32_t svrbit_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s64_x)))
-svint64_t svrbit_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_x)))
-svint16_t svrbit_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u8_z)))
-svuint8_t svrbit_u8_z(svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u32_z)))
-svuint32_t svrbit_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u64_z)))
-svuint64_t svrbit_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u16_z)))
-svuint16_t svrbit_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s8_z)))
-svint8_t svrbit_s8_z(svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s32_z)))
-svint32_t svrbit_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s64_z)))
-svint64_t svrbit_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_z)))
-svint16_t svrbit_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f64)))
-svfloat64_t svrecpe_f64(svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f32)))
-svfloat32_t svrecpe_f32(svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f16)))
-svfloat16_t svrecpe_f16(svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecps_f64)))
-svfloat64_t svrecps_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecps_f32)))
-svfloat32_t svrecps_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecps_f16)))
-svfloat16_t svrecps_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f64_m)))
-svfloat64_t svrecpx_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f32_m)))
-svfloat32_t svrecpx_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f16_m)))
-svfloat16_t svrecpx_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f64_x)))
-svfloat64_t svrecpx_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f32_x)))
-svfloat32_t svrecpx_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f16_x)))
-svfloat16_t svrecpx_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f64_z)))
-svfloat64_t svrecpx_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f32_z)))
-svfloat32_t svrecpx_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f16_z)))
-svfloat16_t svrecpx_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u8)))
-svuint8_t svrev_u8(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u32)))
-svuint32_t svrev_u32(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u64)))
-svuint64_t svrev_u64(svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u16)))
-svuint16_t svrev_u16(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s8)))
-svint8_t svrev_s8(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_f64)))
-svfloat64_t svrev_f64(svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_f32)))
-svfloat32_t svrev_f32(svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_f16)))
-svfloat16_t svrev_f16(svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s32)))
-svint32_t svrev_s32(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s64)))
-svint64_t svrev_s64(svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s16)))
-svint16_t svrev_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b16)))
-svbool_t svrev_b16(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b32)))
-svbool_t svrev_b32(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b64)))
-svbool_t svrev_b64(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b8)))
-svbool_t svrev_b8(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_m)))
-svuint32_t svrevb_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_m)))
-svuint64_t svrevb_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u16_m)))
-svuint16_t svrevb_u16_m(svuint16_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s32_m)))
-svint32_t svrevb_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s64_m)))
-svint64_t svrevb_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s16_m)))
-svint16_t svrevb_s16_m(svint16_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_x)))
-svuint32_t svrevb_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_x)))
-svuint64_t svrevb_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u16_x)))
-svuint16_t svrevb_u16_x(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s32_x)))
-svint32_t svrevb_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s64_x)))
-svint64_t svrevb_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s16_x)))
-svint16_t svrevb_s16_x(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_z)))
-svuint32_t svrevb_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_z)))
-svuint64_t svrevb_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u16_z)))
-svuint16_t svrevb_u16_z(svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s32_z)))
-svint32_t svrevb_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s64_z)))
-svint64_t svrevb_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s16_z)))
-svint16_t svrevb_s16_z(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u32_m)))
-svuint32_t svrevh_u32_m(svuint32_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u64_m)))
-svuint64_t svrevh_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s32_m)))
-svint32_t svrevh_s32_m(svint32_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s64_m)))
-svint64_t svrevh_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u32_x)))
-svuint32_t svrevh_u32_x(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u64_x)))
-svuint64_t svrevh_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s32_x)))
-svint32_t svrevh_s32_x(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s64_x)))
-svint64_t svrevh_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u32_z)))
-svuint32_t svrevh_u32_z(svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u64_z)))
-svuint64_t svrevh_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s32_z)))
-svint32_t svrevh_s32_z(svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s64_z)))
-svint64_t svrevh_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_u64_m)))
-svuint64_t svrevw_u64_m(svuint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_s64_m)))
-svint64_t svrevw_s64_m(svint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_u64_x)))
-svuint64_t svrevw_u64_x(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_s64_x)))
-svint64_t svrevw_s64_x(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_u64_z)))
-svuint64_t svrevw_u64_z(svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_s64_z)))
-svint64_t svrevw_s64_z(svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f64_m)))
-svfloat64_t svrinta_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_m)))
-svfloat32_t svrinta_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f16_m)))
-svfloat16_t svrinta_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f64_x)))
-svfloat64_t svrinta_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_x)))
-svfloat32_t svrinta_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f16_x)))
-svfloat16_t svrinta_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f64_z)))
-svfloat64_t svrinta_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_z)))
-svfloat32_t svrinta_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f16_z)))
-svfloat16_t svrinta_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f64_m)))
-svfloat64_t svrinti_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f32_m)))
-svfloat32_t svrinti_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f16_m)))
-svfloat16_t svrinti_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f64_x)))
-svfloat64_t svrinti_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f32_x)))
-svfloat32_t svrinti_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f16_x)))
-svfloat16_t svrinti_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f64_z)))
-svfloat64_t svrinti_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f32_z)))
-svfloat32_t svrinti_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f16_z)))
-svfloat16_t svrinti_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f64_m)))
-svfloat64_t svrintm_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_m)))
-svfloat32_t svrintm_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f16_m)))
-svfloat16_t svrintm_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f64_x)))
-svfloat64_t svrintm_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_x)))
-svfloat32_t svrintm_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f16_x)))
-svfloat16_t svrintm_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f64_z)))
-svfloat64_t svrintm_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_z)))
-svfloat32_t svrintm_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f16_z)))
-svfloat16_t svrintm_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f64_m)))
-svfloat64_t svrintn_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_m)))
-svfloat32_t svrintn_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f16_m)))
-svfloat16_t svrintn_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f64_x)))
-svfloat64_t svrintn_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_x)))
-svfloat32_t svrintn_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f16_x)))
-svfloat16_t svrintn_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f64_z)))
-svfloat64_t svrintn_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_z)))
-svfloat32_t svrintn_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f16_z)))
-svfloat16_t svrintn_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f64_m)))
-svfloat64_t svrintp_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_m)))
-svfloat32_t svrintp_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f16_m)))
-svfloat16_t svrintp_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f64_x)))
-svfloat64_t svrintp_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_x)))
-svfloat32_t svrintp_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f16_x)))
-svfloat16_t svrintp_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f64_z)))
-svfloat64_t svrintp_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_z)))
-svfloat32_t svrintp_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f16_z)))
-svfloat16_t svrintp_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f64_m)))
-svfloat64_t svrintx_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f32_m)))
-svfloat32_t svrintx_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f16_m)))
-svfloat16_t svrintx_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f64_x)))
-svfloat64_t svrintx_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f32_x)))
-svfloat32_t svrintx_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f16_x)))
-svfloat16_t svrintx_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f64_z)))
-svfloat64_t svrintx_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f32_z)))
-svfloat32_t svrintx_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f16_z)))
-svfloat16_t svrintx_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f64_m)))
-svfloat64_t svrintz_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f32_m)))
-svfloat32_t svrintz_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f16_m)))
-svfloat16_t svrintz_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f64_x)))
-svfloat64_t svrintz_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f32_x)))
-svfloat32_t svrintz_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f16_x)))
-svfloat16_t svrintz_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f64_z)))
-svfloat64_t svrintz_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f32_z)))
-svfloat32_t svrintz_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f16_z)))
-svfloat16_t svrintz_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_f64)))
-svfloat64_t svrsqrte_f64(svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_f32)))
-svfloat32_t svrsqrte_f32(svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_f16)))
-svfloat16_t svrsqrte_f16(svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrts_f64)))
-svfloat64_t svrsqrts_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrts_f32)))
-svfloat32_t svrsqrts_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrts_f16)))
-svfloat16_t svrsqrts_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f64_m)))
-svfloat64_t svscale_n_f64_m(svbool_t, svfloat64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f32_m)))
-svfloat32_t svscale_n_f32_m(svbool_t, svfloat32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f16_m)))
-svfloat16_t svscale_n_f16_m(svbool_t, svfloat16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f64_x)))
-svfloat64_t svscale_n_f64_x(svbool_t, svfloat64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f32_x)))
-svfloat32_t svscale_n_f32_x(svbool_t, svfloat32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f16_x)))
-svfloat16_t svscale_n_f16_x(svbool_t, svfloat16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f64_z)))
-svfloat64_t svscale_n_f64_z(svbool_t, svfloat64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f32_z)))
-svfloat32_t svscale_n_f32_z(svbool_t, svfloat32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f16_z)))
-svfloat16_t svscale_n_f16_z(svbool_t, svfloat16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_m)))
-svfloat64_t svscale_f64_m(svbool_t, svfloat64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_m)))
-svfloat32_t svscale_f32_m(svbool_t, svfloat32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_m)))
-svfloat16_t svscale_f16_m(svbool_t, svfloat16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_x)))
-svfloat64_t svscale_f64_x(svbool_t, svfloat64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_x)))
-svfloat32_t svscale_f32_x(svbool_t, svfloat32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_x)))
-svfloat16_t svscale_f16_x(svbool_t, svfloat16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_z)))
-svfloat64_t svscale_f64_z(svbool_t, svfloat64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_z)))
-svfloat32_t svscale_f32_z(svbool_t, svfloat32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_z)))
-svfloat16_t svscale_f16_z(svbool_t, svfloat16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_b)))
-svbool_t svsel_b(svbool_t, svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u8)))
-svuint8_t svsel_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u32)))
-svuint32_t svsel_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u64)))
-svuint64_t svsel_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u16)))
-svuint16_t svsel_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s8)))
-svint8_t svsel_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f64)))
-svfloat64_t svsel_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f32)))
-svfloat32_t svsel_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f16)))
-svfloat16_t svsel_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s32)))
-svint32_t svsel_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s64)))
-svint64_t svsel_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s16)))
-svint16_t svsel_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u8)))
-svuint8x2_t svset2_u8(svuint8x2_t, uint64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u32)))
-svuint32x2_t svset2_u32(svuint32x2_t, uint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u64)))
-svuint64x2_t svset2_u64(svuint64x2_t, uint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u16)))
-svuint16x2_t svset2_u16(svuint16x2_t, uint64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s8)))
-svint8x2_t svset2_s8(svint8x2_t, uint64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_f64)))
-svfloat64x2_t svset2_f64(svfloat64x2_t, uint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_f32)))
-svfloat32x2_t svset2_f32(svfloat32x2_t, uint64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_f16)))
-svfloat16x2_t svset2_f16(svfloat16x2_t, uint64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s32)))
-svint32x2_t svset2_s32(svint32x2_t, uint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s64)))
-svint64x2_t svset2_s64(svint64x2_t, uint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s16)))
-svint16x2_t svset2_s16(svint16x2_t, uint64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u8)))
-svuint8x3_t svset3_u8(svuint8x3_t, uint64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u32)))
-svuint32x3_t svset3_u32(svuint32x3_t, uint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u64)))
-svuint64x3_t svset3_u64(svuint64x3_t, uint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u16)))
-svuint16x3_t svset3_u16(svuint16x3_t, uint64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s8)))
-svint8x3_t svset3_s8(svint8x3_t, uint64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_f64)))
-svfloat64x3_t svset3_f64(svfloat64x3_t, uint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_f32)))
-svfloat32x3_t svset3_f32(svfloat32x3_t, uint64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_f16)))
-svfloat16x3_t svset3_f16(svfloat16x3_t, uint64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s32)))
-svint32x3_t svset3_s32(svint32x3_t, uint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s64)))
-svint64x3_t svset3_s64(svint64x3_t, uint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s16)))
-svint16x3_t svset3_s16(svint16x3_t, uint64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u8)))
-svuint8x4_t svset4_u8(svuint8x4_t, uint64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u32)))
-svuint32x4_t svset4_u32(svuint32x4_t, uint64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u64)))
-svuint64x4_t svset4_u64(svuint64x4_t, uint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u16)))
-svuint16x4_t svset4_u16(svuint16x4_t, uint64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s8)))
-svint8x4_t svset4_s8(svint8x4_t, uint64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_f64)))
-svfloat64x4_t svset4_f64(svfloat64x4_t, uint64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_f32)))
-svfloat32x4_t svset4_f32(svfloat32x4_t, uint64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_f16)))
-svfloat16x4_t svset4_f16(svfloat16x4_t, uint64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s32)))
-svint32x4_t svset4_s32(svint32x4_t, uint64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s64)))
-svint64x4_t svset4_s64(svint64x4_t, uint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s16)))
-svint16x4_t svset4_s16(svint16x4_t, uint64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u8)))
-svuint8_t svsplice_u8(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u32)))
-svuint32_t svsplice_u32(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u64)))
-svuint64_t svsplice_u64(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u16)))
-svuint16_t svsplice_u16(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s8)))
-svint8_t svsplice_s8(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_f64)))
-svfloat64_t svsplice_f64(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_f32)))
-svfloat32_t svsplice_f32(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_f16)))
-svfloat16_t svsplice_f16(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s32)))
-svint32_t svsplice_s32(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s64)))
-svint64_t svsplice_s64(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s16)))
-svint16_t svsplice_s16(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f64_m)))
-svfloat64_t svsqrt_f64_m(svfloat64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f32_m)))
-svfloat32_t svsqrt_f32_m(svfloat32_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f16_m)))
-svfloat16_t svsqrt_f16_m(svfloat16_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f64_x)))
-svfloat64_t svsqrt_f64_x(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f32_x)))
-svfloat32_t svsqrt_f32_x(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f16_x)))
-svfloat16_t svsqrt_f16_x(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f64_z)))
-svfloat64_t svsqrt_f64_z(svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f32_z)))
-svfloat32_t svsqrt_f32_z(svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f16_z)))
-svfloat16_t svsqrt_f16_z(svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u8)))
-void svst1_u8(svbool_t, uint8_t *, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u32)))
-void svst1_u32(svbool_t, uint32_t *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u64)))
-void svst1_u64(svbool_t, uint64_t *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u16)))
-void svst1_u16(svbool_t, uint16_t *, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s8)))
-void svst1_s8(svbool_t, int8_t *, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f64)))
-void svst1_f64(svbool_t, float64_t *, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f32)))
-void svst1_f32(svbool_t, float32_t *, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f16)))
-void svst1_f16(svbool_t, float16_t *, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s32)))
-void svst1_s32(svbool_t, int32_t *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s64)))
-void svst1_s64(svbool_t, int64_t *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s16)))
-void svst1_s16(svbool_t, int16_t *, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u8)))
-void svst1_vnum_u8(svbool_t, uint8_t *, int64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u32)))
-void svst1_vnum_u32(svbool_t, uint32_t *, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u64)))
-void svst1_vnum_u64(svbool_t, uint64_t *, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u16)))
-void svst1_vnum_u16(svbool_t, uint16_t *, int64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s8)))
-void svst1_vnum_s8(svbool_t, int8_t *, int64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f64)))
-void svst1_vnum_f64(svbool_t, float64_t *, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f32)))
-void svst1_vnum_f32(svbool_t, float32_t *, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f16)))
-void svst1_vnum_f16(svbool_t, float16_t *, int64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s32)))
-void svst1_vnum_s32(svbool_t, int32_t *, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s64)))
-void svst1_vnum_s64(svbool_t, int64_t *, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s16)))
-void svst1_vnum_s16(svbool_t, int16_t *, int64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_s32)))
-void svst1b_s32(svbool_t, int8_t *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_s64)))
-void svst1b_s64(svbool_t, int8_t *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_s16)))
-void svst1b_s16(svbool_t, int8_t *, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_u32)))
-void svst1b_u32(svbool_t, uint8_t *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_u64)))
-void svst1b_u64(svbool_t, uint8_t *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_u16)))
-void svst1b_u16(svbool_t, uint8_t *, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_s32)))
-void svst1b_vnum_s32(svbool_t, int8_t *, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_s64)))
-void svst1b_vnum_s64(svbool_t, int8_t *, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_s16)))
-void svst1b_vnum_s16(svbool_t, int8_t *, int64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_u32)))
-void svst1b_vnum_u32(svbool_t, uint8_t *, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_u64)))
-void svst1b_vnum_u64(svbool_t, uint8_t *, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_u16)))
-void svst1b_vnum_u16(svbool_t, uint8_t *, int64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_s32)))
-void svst1h_s32(svbool_t, int16_t *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_s64)))
-void svst1h_s64(svbool_t, int16_t *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_u32)))
-void svst1h_u32(svbool_t, uint16_t *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_u64)))
-void svst1h_u64(svbool_t, uint16_t *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_s32)))
-void svst1h_vnum_s32(svbool_t, int16_t *, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_s64)))
-void svst1h_vnum_s64(svbool_t, int16_t *, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_u32)))
-void svst1h_vnum_u32(svbool_t, uint16_t *, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_u64)))
-void svst1h_vnum_u64(svbool_t, uint16_t *, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_s64)))
-void svst1w_s64(svbool_t, int32_t *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_u64)))
-void svst1w_u64(svbool_t, uint32_t *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_vnum_s64)))
-void svst1w_vnum_s64(svbool_t, int32_t *, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_vnum_u64)))
-void svst1w_vnum_u64(svbool_t, uint32_t *, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u8)))
-void svst2_u8(svbool_t, uint8_t *, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u32)))
-void svst2_u32(svbool_t, uint32_t *, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u64)))
-void svst2_u64(svbool_t, uint64_t *, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u16)))
-void svst2_u16(svbool_t, uint16_t *, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s8)))
-void svst2_s8(svbool_t, int8_t *, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_f64)))
-void svst2_f64(svbool_t, float64_t *, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_f32)))
-void svst2_f32(svbool_t, float32_t *, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_f16)))
-void svst2_f16(svbool_t, float16_t *, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s32)))
-void svst2_s32(svbool_t, int32_t *, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s64)))
-void svst2_s64(svbool_t, int64_t *, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s16)))
-void svst2_s16(svbool_t, int16_t *, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u8)))
-void svst2_vnum_u8(svbool_t, uint8_t *, int64_t, svuint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u32)))
-void svst2_vnum_u32(svbool_t, uint32_t *, int64_t, svuint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u64)))
-void svst2_vnum_u64(svbool_t, uint64_t *, int64_t, svuint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u16)))
-void svst2_vnum_u16(svbool_t, uint16_t *, int64_t, svuint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s8)))
-void svst2_vnum_s8(svbool_t, int8_t *, int64_t, svint8x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_f64)))
-void svst2_vnum_f64(svbool_t, float64_t *, int64_t, svfloat64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_f32)))
-void svst2_vnum_f32(svbool_t, float32_t *, int64_t, svfloat32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_f16)))
-void svst2_vnum_f16(svbool_t, float16_t *, int64_t, svfloat16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s32)))
-void svst2_vnum_s32(svbool_t, int32_t *, int64_t, svint32x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s64)))
-void svst2_vnum_s64(svbool_t, int64_t *, int64_t, svint64x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s16)))
-void svst2_vnum_s16(svbool_t, int16_t *, int64_t, svint16x2_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u8)))
-void svst3_u8(svbool_t, uint8_t *, svuint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u32)))
-void svst3_u32(svbool_t, uint32_t *, svuint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u64)))
-void svst3_u64(svbool_t, uint64_t *, svuint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u16)))
-void svst3_u16(svbool_t, uint16_t *, svuint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s8)))
-void svst3_s8(svbool_t, int8_t *, svint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_f64)))
-void svst3_f64(svbool_t, float64_t *, svfloat64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_f32)))
-void svst3_f32(svbool_t, float32_t *, svfloat32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_f16)))
-void svst3_f16(svbool_t, float16_t *, svfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s32)))
-void svst3_s32(svbool_t, int32_t *, svint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s64)))
-void svst3_s64(svbool_t, int64_t *, svint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s16)))
-void svst3_s16(svbool_t, int16_t *, svint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u8)))
-void svst3_vnum_u8(svbool_t, uint8_t *, int64_t, svuint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u32)))
-void svst3_vnum_u32(svbool_t, uint32_t *, int64_t, svuint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u64)))
-void svst3_vnum_u64(svbool_t, uint64_t *, int64_t, svuint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u16)))
-void svst3_vnum_u16(svbool_t, uint16_t *, int64_t, svuint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s8)))
-void svst3_vnum_s8(svbool_t, int8_t *, int64_t, svint8x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_f64)))
-void svst3_vnum_f64(svbool_t, float64_t *, int64_t, svfloat64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_f32)))
-void svst3_vnum_f32(svbool_t, float32_t *, int64_t, svfloat32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_f16)))
-void svst3_vnum_f16(svbool_t, float16_t *, int64_t, svfloat16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s32)))
-void svst3_vnum_s32(svbool_t, int32_t *, int64_t, svint32x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s64)))
-void svst3_vnum_s64(svbool_t, int64_t *, int64_t, svint64x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s16)))
-void svst3_vnum_s16(svbool_t, int16_t *, int64_t, svint16x3_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u8)))
-void svst4_u8(svbool_t, uint8_t *, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u32)))
-void svst4_u32(svbool_t, uint32_t *, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u64)))
-void svst4_u64(svbool_t, uint64_t *, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u16)))
-void svst4_u16(svbool_t, uint16_t *, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s8)))
-void svst4_s8(svbool_t, int8_t *, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_f64)))
-void svst4_f64(svbool_t, float64_t *, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_f32)))
-void svst4_f32(svbool_t, float32_t *, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_f16)))
-void svst4_f16(svbool_t, float16_t *, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s32)))
-void svst4_s32(svbool_t, int32_t *, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s64)))
-void svst4_s64(svbool_t, int64_t *, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s16)))
-void svst4_s16(svbool_t, int16_t *, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u8)))
-void svst4_vnum_u8(svbool_t, uint8_t *, int64_t, svuint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u32)))
-void svst4_vnum_u32(svbool_t, uint32_t *, int64_t, svuint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u64)))
-void svst4_vnum_u64(svbool_t, uint64_t *, int64_t, svuint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u16)))
-void svst4_vnum_u16(svbool_t, uint16_t *, int64_t, svuint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s8)))
-void svst4_vnum_s8(svbool_t, int8_t *, int64_t, svint8x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_f64)))
-void svst4_vnum_f64(svbool_t, float64_t *, int64_t, svfloat64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_f32)))
-void svst4_vnum_f32(svbool_t, float32_t *, int64_t, svfloat32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_f16)))
-void svst4_vnum_f16(svbool_t, float16_t *, int64_t, svfloat16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s32)))
-void svst4_vnum_s32(svbool_t, int32_t *, int64_t, svint32x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s64)))
-void svst4_vnum_s64(svbool_t, int64_t *, int64_t, svint64x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s16)))
-void svst4_vnum_s16(svbool_t, int16_t *, int64_t, svint16x4_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u8)))
-void svstnt1_u8(svbool_t, uint8_t *, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u32)))
-void svstnt1_u32(svbool_t, uint32_t *, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u64)))
-void svstnt1_u64(svbool_t, uint64_t *, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u16)))
-void svstnt1_u16(svbool_t, uint16_t *, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s8)))
-void svstnt1_s8(svbool_t, int8_t *, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f64)))
-void svstnt1_f64(svbool_t, float64_t *, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f32)))
-void svstnt1_f32(svbool_t, float32_t *, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f16)))
-void svstnt1_f16(svbool_t, float16_t *, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s32)))
-void svstnt1_s32(svbool_t, int32_t *, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s64)))
-void svstnt1_s64(svbool_t, int64_t *, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s16)))
-void svstnt1_s16(svbool_t, int16_t *, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u8)))
-void svstnt1_vnum_u8(svbool_t, uint8_t *, int64_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u32)))
-void svstnt1_vnum_u32(svbool_t, uint32_t *, int64_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u64)))
-void svstnt1_vnum_u64(svbool_t, uint64_t *, int64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u16)))
-void svstnt1_vnum_u16(svbool_t, uint16_t *, int64_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s8)))
-void svstnt1_vnum_s8(svbool_t, int8_t *, int64_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f64)))
-void svstnt1_vnum_f64(svbool_t, float64_t *, int64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f32)))
-void svstnt1_vnum_f32(svbool_t, float32_t *, int64_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f16)))
-void svstnt1_vnum_f16(svbool_t, float16_t *, int64_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s32)))
-void svstnt1_vnum_s32(svbool_t, int32_t *, int64_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s64)))
-void svstnt1_vnum_s64(svbool_t, int64_t *, int64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s16)))
-void svstnt1_vnum_s16(svbool_t, int16_t *, int64_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f64_m)))
-svfloat64_t svsub_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f32_m)))
-svfloat32_t svsub_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f16_m)))
-svfloat16_t svsub_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f64_x)))
-svfloat64_t svsub_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f32_x)))
-svfloat32_t svsub_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f16_x)))
-svfloat16_t svsub_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f64_z)))
-svfloat64_t svsub_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f32_z)))
-svfloat32_t svsub_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f16_z)))
-svfloat16_t svsub_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u8_m)))
-svuint8_t svsub_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u32_m)))
-svuint32_t svsub_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u64_m)))
-svuint64_t svsub_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u16_m)))
-svuint16_t svsub_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s8_m)))
-svint8_t svsub_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s32_m)))
-svint32_t svsub_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s64_m)))
-svint64_t svsub_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s16_m)))
-svint16_t svsub_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u8_x)))
-svuint8_t svsub_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u32_x)))
-svuint32_t svsub_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u64_x)))
-svuint64_t svsub_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u16_x)))
-svuint16_t svsub_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s8_x)))
-svint8_t svsub_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s32_x)))
-svint32_t svsub_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s64_x)))
-svint64_t svsub_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s16_x)))
-svint16_t svsub_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u8_z)))
-svuint8_t svsub_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u32_z)))
-svuint32_t svsub_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u64_z)))
-svuint64_t svsub_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u16_z)))
-svuint16_t svsub_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s8_z)))
-svint8_t svsub_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s32_z)))
-svint32_t svsub_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s64_z)))
-svint64_t svsub_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s16_z)))
-svint16_t svsub_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f64_m)))
-svfloat64_t svsub_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f32_m)))
-svfloat32_t svsub_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f16_m)))
-svfloat16_t svsub_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f64_x)))
-svfloat64_t svsub_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f32_x)))
-svfloat32_t svsub_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f16_x)))
-svfloat16_t svsub_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f64_z)))
-svfloat64_t svsub_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f32_z)))
-svfloat32_t svsub_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f16_z)))
-svfloat16_t svsub_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u8_m)))
-svuint8_t svsub_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u32_m)))
-svuint32_t svsub_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u64_m)))
-svuint64_t svsub_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u16_m)))
-svuint16_t svsub_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s8_m)))
-svint8_t svsub_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s32_m)))
-svint32_t svsub_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s64_m)))
-svint64_t svsub_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s16_m)))
-svint16_t svsub_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u8_x)))
-svuint8_t svsub_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u32_x)))
-svuint32_t svsub_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u64_x)))
-svuint64_t svsub_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u16_x)))
-svuint16_t svsub_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s8_x)))
-svint8_t svsub_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s32_x)))
-svint32_t svsub_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s64_x)))
-svint64_t svsub_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s16_x)))
-svint16_t svsub_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u8_z)))
-svuint8_t svsub_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u32_z)))
-svuint32_t svsub_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u64_z)))
-svuint64_t svsub_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u16_z)))
-svuint16_t svsub_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s8_z)))
-svint8_t svsub_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s32_z)))
-svint32_t svsub_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s64_z)))
-svint64_t svsub_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s16_z)))
-svint16_t svsub_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f64_m)))
-svfloat64_t svsubr_n_f64_m(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f32_m)))
-svfloat32_t svsubr_n_f32_m(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f16_m)))
-svfloat16_t svsubr_n_f16_m(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f64_x)))
-svfloat64_t svsubr_n_f64_x(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f32_x)))
-svfloat32_t svsubr_n_f32_x(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f16_x)))
-svfloat16_t svsubr_n_f16_x(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f64_z)))
-svfloat64_t svsubr_n_f64_z(svbool_t, svfloat64_t, float64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f32_z)))
-svfloat32_t svsubr_n_f32_z(svbool_t, svfloat32_t, float32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f16_z)))
-svfloat16_t svsubr_n_f16_z(svbool_t, svfloat16_t, float16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u8_m)))
-svuint8_t svsubr_n_u8_m(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u32_m)))
-svuint32_t svsubr_n_u32_m(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u64_m)))
-svuint64_t svsubr_n_u64_m(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u16_m)))
-svuint16_t svsubr_n_u16_m(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s8_m)))
-svint8_t svsubr_n_s8_m(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s32_m)))
-svint32_t svsubr_n_s32_m(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s64_m)))
-svint64_t svsubr_n_s64_m(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s16_m)))
-svint16_t svsubr_n_s16_m(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u8_x)))
-svuint8_t svsubr_n_u8_x(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u32_x)))
-svuint32_t svsubr_n_u32_x(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u64_x)))
-svuint64_t svsubr_n_u64_x(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u16_x)))
-svuint16_t svsubr_n_u16_x(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s8_x)))
-svint8_t svsubr_n_s8_x(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s32_x)))
-svint32_t svsubr_n_s32_x(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s64_x)))
-svint64_t svsubr_n_s64_x(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s16_x)))
-svint16_t svsubr_n_s16_x(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u8_z)))
-svuint8_t svsubr_n_u8_z(svbool_t, svuint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u32_z)))
-svuint32_t svsubr_n_u32_z(svbool_t, svuint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u64_z)))
-svuint64_t svsubr_n_u64_z(svbool_t, svuint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u16_z)))
-svuint16_t svsubr_n_u16_z(svbool_t, svuint16_t, uint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s8_z)))
-svint8_t svsubr_n_s8_z(svbool_t, svint8_t, int8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s32_z)))
-svint32_t svsubr_n_s32_z(svbool_t, svint32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s64_z)))
-svint64_t svsubr_n_s64_z(svbool_t, svint64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s16_z)))
-svint16_t svsubr_n_s16_z(svbool_t, svint16_t, int16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f64_m)))
-svfloat64_t svsubr_f64_m(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f32_m)))
-svfloat32_t svsubr_f32_m(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f16_m)))
-svfloat16_t svsubr_f16_m(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f64_x)))
-svfloat64_t svsubr_f64_x(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f32_x)))
-svfloat32_t svsubr_f32_x(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f16_x)))
-svfloat16_t svsubr_f16_x(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f64_z)))
-svfloat64_t svsubr_f64_z(svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f32_z)))
-svfloat32_t svsubr_f32_z(svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f16_z)))
-svfloat16_t svsubr_f16_z(svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u8_m)))
-svuint8_t svsubr_u8_m(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u32_m)))
-svuint32_t svsubr_u32_m(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u64_m)))
-svuint64_t svsubr_u64_m(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u16_m)))
-svuint16_t svsubr_u16_m(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s8_m)))
-svint8_t svsubr_s8_m(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s32_m)))
-svint32_t svsubr_s32_m(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s64_m)))
-svint64_t svsubr_s64_m(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s16_m)))
-svint16_t svsubr_s16_m(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u8_x)))
-svuint8_t svsubr_u8_x(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u32_x)))
-svuint32_t svsubr_u32_x(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u64_x)))
-svuint64_t svsubr_u64_x(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u16_x)))
-svuint16_t svsubr_u16_x(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s8_x)))
-svint8_t svsubr_s8_x(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s32_x)))
-svint32_t svsubr_s32_x(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s64_x)))
-svint64_t svsubr_s64_x(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s16_x)))
-svint16_t svsubr_s16_x(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u8_z)))
-svuint8_t svsubr_u8_z(svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u32_z)))
-svuint32_t svsubr_u32_z(svbool_t, svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u64_z)))
-svuint64_t svsubr_u64_z(svbool_t, svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u16_z)))
-svuint16_t svsubr_u16_z(svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s8_z)))
-svint8_t svsubr_s8_z(svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s32_z)))
-svint32_t svsubr_s32_z(svbool_t, svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s64_z)))
-svint64_t svsubr_s64_z(svbool_t, svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s16_z)))
-svint16_t svsubr_s16_z(svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u8)))
-svuint8_t svtbl_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u32)))
-svuint32_t svtbl_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u64)))
-svuint64_t svtbl_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u16)))
-svuint16_t svtbl_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s8)))
-svint8_t svtbl_s8(svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_f64)))
-svfloat64_t svtbl_f64(svfloat64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_f32)))
-svfloat32_t svtbl_f32(svfloat32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_f16)))
-svfloat16_t svtbl_f16(svfloat16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s32)))
-svint32_t svtbl_s32(svint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s64)))
-svint64_t svtbl_s64(svint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s16)))
-svint16_t svtbl_s16(svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u8)))
-svuint8_t svtrn1_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u32)))
-svuint32_t svtrn1_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u64)))
-svuint64_t svtrn1_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u16)))
-svuint16_t svtrn1_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s8)))
-svint8_t svtrn1_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_f64)))
-svfloat64_t svtrn1_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_f32)))
-svfloat32_t svtrn1_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_f16)))
-svfloat16_t svtrn1_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s32)))
-svint32_t svtrn1_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s64)))
-svint64_t svtrn1_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s16)))
-svint16_t svtrn1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b16)))
-svbool_t svtrn1_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b32)))
-svbool_t svtrn1_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b64)))
-svbool_t svtrn1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b8)))
-svbool_t svtrn1_b8(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u8)))
-svuint8_t svtrn2_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u32)))
-svuint32_t svtrn2_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u64)))
-svuint64_t svtrn2_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u16)))
-svuint16_t svtrn2_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s8)))
-svint8_t svtrn2_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_f64)))
-svfloat64_t svtrn2_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_f32)))
-svfloat32_t svtrn2_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_f16)))
-svfloat16_t svtrn2_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s32)))
-svint32_t svtrn2_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s64)))
-svint64_t svtrn2_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s16)))
-svint16_t svtrn2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b16)))
-svbool_t svtrn2_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b32)))
-svbool_t svtrn2_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b64)))
-svbool_t svtrn2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b8)))
-svbool_t svtrn2_b8(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u8)))
-svuint8x2_t svundef2_u8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u32)))
-svuint32x2_t svundef2_u32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u64)))
-svuint64x2_t svundef2_u64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u16)))
-svuint16x2_t svundef2_u16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s8)))
-svint8x2_t svundef2_s8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f64)))
-svfloat64x2_t svundef2_f64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f32)))
-svfloat32x2_t svundef2_f32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f16)))
-svfloat16x2_t svundef2_f16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s32)))
-svint32x2_t svundef2_s32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s64)))
-svint64x2_t svundef2_s64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s16)))
-svint16x2_t svundef2_s16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u8)))
-svuint8x3_t svundef3_u8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u32)))
-svuint32x3_t svundef3_u32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u64)))
-svuint64x3_t svundef3_u64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u16)))
-svuint16x3_t svundef3_u16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s8)))
-svint8x3_t svundef3_s8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f64)))
-svfloat64x3_t svundef3_f64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f32)))
-svfloat32x3_t svundef3_f32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f16)))
-svfloat16x3_t svundef3_f16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s32)))
-svint32x3_t svundef3_s32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s64)))
-svint64x3_t svundef3_s64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s16)))
-svint16x3_t svundef3_s16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u8)))
-svuint8x4_t svundef4_u8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u32)))
-svuint32x4_t svundef4_u32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u64)))
-svuint64x4_t svundef4_u64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u16)))
-svuint16x4_t svundef4_u16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s8)))
-svint8x4_t svundef4_s8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f64)))
-svfloat64x4_t svundef4_f64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f32)))
-svfloat32x4_t svundef4_f32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f16)))
-svfloat16x4_t svundef4_f16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s32)))
-svint32x4_t svundef4_s32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s64)))
-svint64x4_t svundef4_s64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s16)))
-svint16x4_t svundef4_s16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u8)))
-svuint8_t svundef_u8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u32)))
-svuint32_t svundef_u32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u64)))
-svuint64_t svundef_u64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u16)))
-svuint16_t svundef_u16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s8)))
-svint8_t svundef_s8(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f64)))
-svfloat64_t svundef_f64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f32)))
-svfloat32_t svundef_f32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f16)))
-svfloat16_t svundef_f16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s32)))
-svint32_t svundef_s32(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s64)))
-svint64_t svundef_s64(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s16)))
-svint16_t svundef_s16(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_b)))
-svbool_t svunpkhi_b(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s32)))
-svint32_t svunpkhi_s32(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s64)))
-svint64_t svunpkhi_s64(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s16)))
-svint16_t svunpkhi_s16(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_u32)))
-svuint32_t svunpkhi_u32(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_u64)))
-svuint64_t svunpkhi_u64(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_u16)))
-svuint16_t svunpkhi_u16(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_b)))
-svbool_t svunpklo_b(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_s32)))
-svint32_t svunpklo_s32(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_s64)))
-svint64_t svunpklo_s64(svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_s16)))
-svint16_t svunpklo_s16(svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_u32)))
-svuint32_t svunpklo_u32(svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_u64)))
-svuint64_t svunpklo_u64(svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_u16)))
-svuint16_t svunpklo_u16(svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u8)))
-svuint8_t svuzp1_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u32)))
-svuint32_t svuzp1_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u64)))
-svuint64_t svuzp1_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u16)))
-svuint16_t svuzp1_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s8)))
-svint8_t svuzp1_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_f64)))
-svfloat64_t svuzp1_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_f32)))
-svfloat32_t svuzp1_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_f16)))
-svfloat16_t svuzp1_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s32)))
-svint32_t svuzp1_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s64)))
-svint64_t svuzp1_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s16)))
-svint16_t svuzp1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b16)))
-svbool_t svuzp1_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b32)))
-svbool_t svuzp1_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b64)))
-svbool_t svuzp1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b8)))
-svbool_t svuzp1_b8(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u8)))
-svuint8_t svuzp2_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u32)))
-svuint32_t svuzp2_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u64)))
-svuint64_t svuzp2_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u16)))
-svuint16_t svuzp2_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s8)))
-svint8_t svuzp2_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_f64)))
-svfloat64_t svuzp2_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_f32)))
-svfloat32_t svuzp2_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_f16)))
-svfloat16_t svuzp2_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s32)))
-svint32_t svuzp2_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s64)))
-svint64_t svuzp2_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s16)))
-svint16_t svuzp2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b16)))
-svbool_t svuzp2_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b32)))
-svbool_t svuzp2_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b64)))
-svbool_t svuzp2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b8)))
-svbool_t svuzp2_b8(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s32)))
-svbool_t svwhilele_b8_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s32)))
-svbool_t svwhilele_b32_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_s32)))
-svbool_t svwhilele_b64_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_s32)))
-svbool_t svwhilele_b16_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s64)))
-svbool_t svwhilele_b8_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s64)))
-svbool_t svwhilele_b32_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_s64)))
-svbool_t svwhilele_b64_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_s64)))
-svbool_t svwhilele_b16_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_u32)))
-svbool_t svwhilele_b8_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_u32)))
-svbool_t svwhilele_b32_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_u32)))
-svbool_t svwhilele_b64_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_u32)))
-svbool_t svwhilele_b16_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_u64)))
-svbool_t svwhilele_b8_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_u64)))
-svbool_t svwhilele_b32_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_u64)))
-svbool_t svwhilele_b64_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_u64)))
-svbool_t svwhilele_b16_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_u32)))
-svbool_t svwhilelt_b8_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_u32)))
-svbool_t svwhilelt_b32_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_u32)))
-svbool_t svwhilelt_b64_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_u32)))
-svbool_t svwhilelt_b16_u32(uint32_t, uint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_u64)))
-svbool_t svwhilelt_b8_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_u64)))
-svbool_t svwhilelt_b32_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_u64)))
-svbool_t svwhilelt_b64_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_u64)))
-svbool_t svwhilelt_b16_u64(uint64_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_s32)))
-svbool_t svwhilelt_b8_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_s32)))
-svbool_t svwhilelt_b32_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_s32)))
-svbool_t svwhilelt_b64_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_s32)))
-svbool_t svwhilelt_b16_s32(int32_t, int32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_s64)))
-svbool_t svwhilelt_b8_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_s64)))
-svbool_t svwhilelt_b32_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_s64)))
-svbool_t svwhilelt_b64_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_s64)))
-svbool_t svwhilelt_b16_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u8)))
-svuint8_t svzip1_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u32)))
-svuint32_t svzip1_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u64)))
-svuint64_t svzip1_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u16)))
-svuint16_t svzip1_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s8)))
-svint8_t svzip1_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_f64)))
-svfloat64_t svzip1_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_f32)))
-svfloat32_t svzip1_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_f16)))
-svfloat16_t svzip1_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s32)))
-svint32_t svzip1_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s64)))
-svint64_t svzip1_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s16)))
-svint16_t svzip1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b16)))
-svbool_t svzip1_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b32)))
-svbool_t svzip1_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b64)))
-svbool_t svzip1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b8)))
-svbool_t svzip1_b8(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u8)))
-svuint8_t svzip2_u8(svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u32)))
-svuint32_t svzip2_u32(svuint32_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u64)))
-svuint64_t svzip2_u64(svuint64_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u16)))
-svuint16_t svzip2_u16(svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s8)))
-svint8_t svzip2_s8(svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_f64)))
-svfloat64_t svzip2_f64(svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_f32)))
-svfloat32_t svzip2_f32(svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_f16)))
-svfloat16_t svzip2_f16(svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s32)))
-svint32_t svzip2_s32(svint32_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s64)))
-svint64_t svzip2_s64(svint64_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s16)))
-svint16_t svzip2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b16)))
-svbool_t svzip2_b16(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b32)))
-svbool_t svzip2_b32(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b64)))
-svbool_t svzip2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b8)))
-svbool_t svzip2_b8(svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_m)))
-svfloat64_t svabd_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_m)))
-svfloat32_t svabd_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f16_m)))
-svfloat16_t svabd_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_x)))
-svfloat64_t svabd_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_x)))
-svfloat32_t svabd_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f16_x)))
-svfloat16_t svabd_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_z)))
-svfloat64_t svabd_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_z)))
-svfloat32_t svabd_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f16_z)))
-svfloat16_t svabd_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s8_m)))
-svint8_t svabd_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s32_m)))
-svint32_t svabd_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s64_m)))
-svint64_t svabd_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s16_m)))
-svint16_t svabd_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s8_x)))
-svint8_t svabd_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s32_x)))
-svint32_t svabd_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s64_x)))
-svint64_t svabd_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s16_x)))
-svint16_t svabd_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s8_z)))
-svint8_t svabd_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s32_z)))
-svint32_t svabd_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s64_z)))
-svint64_t svabd_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_s16_z)))
-svint16_t svabd_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u8_m)))
-svuint8_t svabd_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u32_m)))
-svuint32_t svabd_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u64_m)))
-svuint64_t svabd_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u16_m)))
-svuint16_t svabd_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u8_x)))
-svuint8_t svabd_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u32_x)))
-svuint32_t svabd_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u64_x)))
-svuint64_t svabd_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u16_x)))
-svuint16_t svabd_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u8_z)))
-svuint8_t svabd_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u32_z)))
-svuint32_t svabd_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u64_z)))
-svuint64_t svabd_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_u16_z)))
-svuint16_t svabd_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f64_m)))
-svfloat64_t svabd_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f32_m)))
-svfloat32_t svabd_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f16_m)))
-svfloat16_t svabd_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f64_x)))
-svfloat64_t svabd_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f32_x)))
-svfloat32_t svabd_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f16_x)))
-svfloat16_t svabd_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f64_z)))
-svfloat64_t svabd_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f32_z)))
-svfloat32_t svabd_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_f16_z)))
-svfloat16_t svabd_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s8_m)))
-svint8_t svabd_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s32_m)))
-svint32_t svabd_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s64_m)))
-svint64_t svabd_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s16_m)))
-svint16_t svabd_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s8_x)))
-svint8_t svabd_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s32_x)))
-svint32_t svabd_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s64_x)))
-svint64_t svabd_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s16_x)))
-svint16_t svabd_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s8_z)))
-svint8_t svabd_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s32_z)))
-svint32_t svabd_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s64_z)))
-svint64_t svabd_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_s16_z)))
-svint16_t svabd_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u8_m)))
-svuint8_t svabd_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u32_m)))
-svuint32_t svabd_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u64_m)))
-svuint64_t svabd_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u16_m)))
-svuint16_t svabd_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u8_x)))
-svuint8_t svabd_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u32_x)))
-svuint32_t svabd_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u64_x)))
-svuint64_t svabd_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u16_x)))
-svuint16_t svabd_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u8_z)))
-svuint8_t svabd_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u32_z)))
-svuint32_t svabd_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u64_z)))
-svuint64_t svabd_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_u16_z)))
-svuint16_t svabd_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f64_m)))
-svfloat64_t svabs_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f32_m)))
-svfloat32_t svabs_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f16_m)))
-svfloat16_t svabs_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f64_x)))
-svfloat64_t svabs_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f32_x)))
-svfloat32_t svabs_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f16_x)))
-svfloat16_t svabs_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f64_z)))
-svfloat64_t svabs_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f32_z)))
-svfloat32_t svabs_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_f16_z)))
-svfloat16_t svabs_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s8_m)))
-svint8_t svabs_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s32_m)))
-svint32_t svabs_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s64_m)))
-svint64_t svabs_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s16_m)))
-svint16_t svabs_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s8_x)))
-svint8_t svabs_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s32_x)))
-svint32_t svabs_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s64_x)))
-svint64_t svabs_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s16_x)))
-svint16_t svabs_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s8_z)))
-svint8_t svabs_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s32_z)))
-svint32_t svabs_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s64_z)))
-svint64_t svabs_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabs_s16_z)))
-svint16_t svabs_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_n_f64)))
-svbool_t svacge(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_n_f32)))
-svbool_t svacge(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_n_f16)))
-svbool_t svacge(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_f64)))
-svbool_t svacge(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_f32)))
-svbool_t svacge(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacge_f16)))
-svbool_t svacge(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_n_f64)))
-svbool_t svacgt(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_n_f32)))
-svbool_t svacgt(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_n_f16)))
-svbool_t svacgt(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_f64)))
-svbool_t svacgt(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_f32)))
-svbool_t svacgt(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacgt_f16)))
-svbool_t svacgt(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_n_f64)))
-svbool_t svacle(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_n_f32)))
-svbool_t svacle(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_n_f16)))
-svbool_t svacle(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_f64)))
-svbool_t svacle(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_f32)))
-svbool_t svacle(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svacle_f16)))
-svbool_t svacle(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_n_f64)))
-svbool_t svaclt(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_n_f32)))
-svbool_t svaclt(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_n_f16)))
-svbool_t svaclt(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_f64)))
-svbool_t svaclt(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_f32)))
-svbool_t svaclt(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaclt_f16)))
-svbool_t svaclt(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f64_m)))
-svfloat64_t svadd_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f32_m)))
-svfloat32_t svadd_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f16_m)))
-svfloat16_t svadd_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f64_x)))
-svfloat64_t svadd_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f32_x)))
-svfloat32_t svadd_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f16_x)))
-svfloat16_t svadd_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f64_z)))
-svfloat64_t svadd_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f32_z)))
-svfloat32_t svadd_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_f16_z)))
-svfloat16_t svadd_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u8_m)))
-svuint8_t svadd_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u32_m)))
-svuint32_t svadd_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u64_m)))
-svuint64_t svadd_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u16_m)))
-svuint16_t svadd_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s8_m)))
-svint8_t svadd_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s32_m)))
-svint32_t svadd_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s64_m)))
-svint64_t svadd_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s16_m)))
-svint16_t svadd_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u8_x)))
-svuint8_t svadd_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u32_x)))
-svuint32_t svadd_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u64_x)))
-svuint64_t svadd_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u16_x)))
-svuint16_t svadd_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s8_x)))
-svint8_t svadd_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s32_x)))
-svint32_t svadd_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s64_x)))
-svint64_t svadd_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s16_x)))
-svint16_t svadd_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u8_z)))
-svuint8_t svadd_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u32_z)))
-svuint32_t svadd_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u64_z)))
-svuint64_t svadd_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_u16_z)))
-svuint16_t svadd_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s8_z)))
-svint8_t svadd_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s32_z)))
-svint32_t svadd_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s64_z)))
-svint64_t svadd_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_s16_z)))
-svint16_t svadd_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f64_m)))
-svfloat64_t svadd_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f32_m)))
-svfloat32_t svadd_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f16_m)))
-svfloat16_t svadd_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f64_x)))
-svfloat64_t svadd_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f32_x)))
-svfloat32_t svadd_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f16_x)))
-svfloat16_t svadd_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f64_z)))
-svfloat64_t svadd_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f32_z)))
-svfloat32_t svadd_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_f16_z)))
-svfloat16_t svadd_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u8_m)))
-svuint8_t svadd_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u32_m)))
-svuint32_t svadd_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u64_m)))
-svuint64_t svadd_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u16_m)))
-svuint16_t svadd_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s8_m)))
-svint8_t svadd_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s32_m)))
-svint32_t svadd_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_m)))
-svint64_t svadd_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_m)))
-svint16_t svadd_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u8_x)))
-svuint8_t svadd_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u32_x)))
-svuint32_t svadd_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u64_x)))
-svuint64_t svadd_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u16_x)))
-svuint16_t svadd_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s8_x)))
-svint8_t svadd_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s32_x)))
-svint32_t svadd_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_x)))
-svint64_t svadd_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_x)))
-svint16_t svadd_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u8_z)))
-svuint8_t svadd_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u32_z)))
-svuint32_t svadd_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u64_z)))
-svuint64_t svadd_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u16_z)))
-svuint16_t svadd_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s8_z)))
-svint8_t svadd_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s32_z)))
-svint32_t svadd_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_z)))
-svint64_t svadd_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_z)))
-svint16_t svadd_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f64)))
-float64_t svadda(svbool_t, float64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f32)))
-float32_t svadda(svbool_t, float32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f16)))
-float16_t svadda(svbool_t, float16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s8)))
-int64_t svaddv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s32)))
-int64_t svaddv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s64)))
-int64_t svaddv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s16)))
-int64_t svaddv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u8)))
-uint64_t svaddv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u32)))
-uint64_t svaddv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u64)))
-uint64_t svaddv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_u16)))
-uint64_t svaddv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_f64)))
-float64_t svaddv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_f32)))
-float32_t svaddv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_f16)))
-float16_t svaddv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_b_z)))
-svbool_t svand_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u8_m)))
-svuint8_t svand_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u32_m)))
-svuint32_t svand_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u64_m)))
-svuint64_t svand_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u16_m)))
-svuint16_t svand_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s8_m)))
-svint8_t svand_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s32_m)))
-svint32_t svand_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s64_m)))
-svint64_t svand_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s16_m)))
-svint16_t svand_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u8_x)))
-svuint8_t svand_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u32_x)))
-svuint32_t svand_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u64_x)))
-svuint64_t svand_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u16_x)))
-svuint16_t svand_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s8_x)))
-svint8_t svand_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s32_x)))
-svint32_t svand_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s64_x)))
-svint64_t svand_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s16_x)))
-svint16_t svand_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u8_z)))
-svuint8_t svand_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u32_z)))
-svuint32_t svand_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u64_z)))
-svuint64_t svand_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_u16_z)))
-svuint16_t svand_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s8_z)))
-svint8_t svand_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s32_z)))
-svint32_t svand_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s64_z)))
-svint64_t svand_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_n_s16_z)))
-svint16_t svand_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u8_m)))
-svuint8_t svand_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u32_m)))
-svuint32_t svand_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u64_m)))
-svuint64_t svand_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u16_m)))
-svuint16_t svand_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s8_m)))
-svint8_t svand_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s32_m)))
-svint32_t svand_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s64_m)))
-svint64_t svand_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s16_m)))
-svint16_t svand_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u8_x)))
-svuint8_t svand_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u32_x)))
-svuint32_t svand_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u64_x)))
-svuint64_t svand_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u16_x)))
-svuint16_t svand_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s8_x)))
-svint8_t svand_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s32_x)))
-svint32_t svand_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s64_x)))
-svint64_t svand_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s16_x)))
-svint16_t svand_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u8_z)))
-svuint8_t svand_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u32_z)))
-svuint32_t svand_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u64_z)))
-svuint64_t svand_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_u16_z)))
-svuint16_t svand_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s8_z)))
-svint8_t svand_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s32_z)))
-svint32_t svand_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s64_z)))
-svint64_t svand_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svand_s16_z)))
-svint16_t svand_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u8)))
-uint8_t svandv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u32)))
-uint32_t svandv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u64)))
-uint64_t svandv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_u16)))
-uint16_t svandv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s8)))
-int8_t svandv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s32)))
-int32_t svandv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s64)))
-int64_t svandv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svandv_s16)))
-int16_t svandv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s8_m)))
-svint8_t svasr_m(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s32_m)))
-svint32_t svasr_m(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s64_m)))
-svint64_t svasr_m(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s16_m)))
-svint16_t svasr_m(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s8_x)))
-svint8_t svasr_x(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s32_x)))
-svint32_t svasr_x(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s64_x)))
-svint64_t svasr_x(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s16_x)))
-svint16_t svasr_x(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s8_z)))
-svint8_t svasr_z(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s32_z)))
-svint32_t svasr_z(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s64_z)))
-svint64_t svasr_z(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_n_s16_z)))
-svint16_t svasr_z(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s8_m)))
-svint8_t svasr_m(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s32_m)))
-svint32_t svasr_m(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s64_m)))
-svint64_t svasr_m(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s16_m)))
-svint16_t svasr_m(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s8_x)))
-svint8_t svasr_x(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s32_x)))
-svint32_t svasr_x(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s64_x)))
-svint64_t svasr_x(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s16_x)))
-svint16_t svasr_x(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s8_z)))
-svint8_t svasr_z(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s32_z)))
-svint32_t svasr_z(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s64_z)))
-svint64_t svasr_z(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_s16_z)))
-svint16_t svasr_z(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s8_m)))
-svint8_t svasr_wide_m(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s32_m)))
-svint32_t svasr_wide_m(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s16_m)))
-svint16_t svasr_wide_m(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s8_x)))
-svint8_t svasr_wide_x(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s32_x)))
-svint32_t svasr_wide_x(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s16_x)))
-svint16_t svasr_wide_x(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s8_z)))
-svint8_t svasr_wide_z(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s32_z)))
-svint32_t svasr_wide_z(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_n_s16_z)))
-svint16_t svasr_wide_z(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s8_m)))
-svint8_t svasr_wide_m(svbool_t, svint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s32_m)))
-svint32_t svasr_wide_m(svbool_t, svint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s16_m)))
-svint16_t svasr_wide_m(svbool_t, svint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s8_x)))
-svint8_t svasr_wide_x(svbool_t, svint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s32_x)))
-svint32_t svasr_wide_x(svbool_t, svint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s16_x)))
-svint16_t svasr_wide_x(svbool_t, svint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s8_z)))
-svint8_t svasr_wide_z(svbool_t, svint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s32_z)))
-svint32_t svasr_wide_z(svbool_t, svint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasr_wide_s16_z)))
-svint16_t svasr_wide_z(svbool_t, svint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s8_m)))
-svint8_t svasrd_m(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s32_m)))
-svint32_t svasrd_m(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s64_m)))
-svint64_t svasrd_m(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s16_m)))
-svint16_t svasrd_m(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s8_x)))
-svint8_t svasrd_x(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s32_x)))
-svint32_t svasrd_x(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s64_x)))
-svint64_t svasrd_x(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s16_x)))
-svint16_t svasrd_x(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s8_z)))
-svint8_t svasrd_z(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s32_z)))
-svint32_t svasrd_z(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s64_z)))
-svint64_t svasrd_z(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svasrd_n_s16_z)))
-svint16_t svasrd_z(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_b_z)))
-svbool_t svbic_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u8_m)))
-svuint8_t svbic_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u32_m)))
-svuint32_t svbic_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u64_m)))
-svuint64_t svbic_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u16_m)))
-svuint16_t svbic_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s8_m)))
-svint8_t svbic_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s32_m)))
-svint32_t svbic_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s64_m)))
-svint64_t svbic_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s16_m)))
-svint16_t svbic_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u8_x)))
-svuint8_t svbic_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u32_x)))
-svuint32_t svbic_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u64_x)))
-svuint64_t svbic_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u16_x)))
-svuint16_t svbic_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s8_x)))
-svint8_t svbic_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s32_x)))
-svint32_t svbic_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s64_x)))
-svint64_t svbic_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s16_x)))
-svint16_t svbic_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u8_z)))
-svuint8_t svbic_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u32_z)))
-svuint32_t svbic_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u64_z)))
-svuint64_t svbic_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_u16_z)))
-svuint16_t svbic_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s8_z)))
-svint8_t svbic_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s32_z)))
-svint32_t svbic_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s64_z)))
-svint64_t svbic_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_n_s16_z)))
-svint16_t svbic_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u8_m)))
-svuint8_t svbic_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u32_m)))
-svuint32_t svbic_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u64_m)))
-svuint64_t svbic_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u16_m)))
-svuint16_t svbic_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s8_m)))
-svint8_t svbic_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s32_m)))
-svint32_t svbic_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s64_m)))
-svint64_t svbic_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s16_m)))
-svint16_t svbic_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u8_x)))
-svuint8_t svbic_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u32_x)))
-svuint32_t svbic_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u64_x)))
-svuint64_t svbic_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u16_x)))
-svuint16_t svbic_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s8_x)))
-svint8_t svbic_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s32_x)))
-svint32_t svbic_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s64_x)))
-svint64_t svbic_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s16_x)))
-svint16_t svbic_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u8_z)))
-svuint8_t svbic_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u32_z)))
-svuint32_t svbic_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u64_z)))
-svuint64_t svbic_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_u16_z)))
-svuint16_t svbic_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s8_z)))
-svint8_t svbic_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s32_z)))
-svint32_t svbic_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s64_z)))
-svint64_t svbic_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbic_s16_z)))
-svint16_t svbic_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrka_b_m)))
-svbool_t svbrka_m(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrka_b_z)))
-svbool_t svbrka_z(svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkb_b_m)))
-svbool_t svbrkb_m(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkb_b_z)))
-svbool_t svbrkb_z(svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkn_b_z)))
-svbool_t svbrkn_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkpa_b_z)))
-svbool_t svbrkpa_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbrkpb_b_z)))
-svbool_t svbrkpb_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f64_m)))
-svfloat64_t svcadd_m(svbool_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f32_m)))
-svfloat32_t svcadd_m(svbool_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f16_m)))
-svfloat16_t svcadd_m(svbool_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f64_x)))
-svfloat64_t svcadd_x(svbool_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f32_x)))
-svfloat32_t svcadd_x(svbool_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f16_x)))
-svfloat16_t svcadd_x(svbool_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f64_z)))
-svfloat64_t svcadd_z(svbool_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f32_z)))
-svfloat32_t svcadd_z(svbool_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcadd_f16_z)))
-svfloat16_t svcadd_z(svbool_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u8)))
-uint8_t svclasta(svbool_t, uint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u32)))
-uint32_t svclasta(svbool_t, uint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u64)))
-uint64_t svclasta(svbool_t, uint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_u16)))
-uint16_t svclasta(svbool_t, uint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s8)))
-int8_t svclasta(svbool_t, int8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_f64)))
-float64_t svclasta(svbool_t, float64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_f32)))
-float32_t svclasta(svbool_t, float32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_f16)))
-float16_t svclasta(svbool_t, float16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s32)))
-int32_t svclasta(svbool_t, int32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s64)))
-int64_t svclasta(svbool_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_n_s16)))
-int16_t svclasta(svbool_t, int16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u8)))
-svuint8_t svclasta(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u32)))
-svuint32_t svclasta(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u64)))
-svuint64_t svclasta(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_u16)))
-svuint16_t svclasta(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s8)))
-svint8_t svclasta(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_f64)))
-svfloat64_t svclasta(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_f32)))
-svfloat32_t svclasta(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_f16)))
-svfloat16_t svclasta(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s32)))
-svint32_t svclasta(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s64)))
-svint64_t svclasta(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclasta_s16)))
-svint16_t svclasta(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u8)))
-uint8_t svclastb(svbool_t, uint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u32)))
-uint32_t svclastb(svbool_t, uint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u64)))
-uint64_t svclastb(svbool_t, uint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_u16)))
-uint16_t svclastb(svbool_t, uint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s8)))
-int8_t svclastb(svbool_t, int8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_f64)))
-float64_t svclastb(svbool_t, float64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_f32)))
-float32_t svclastb(svbool_t, float32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_f16)))
-float16_t svclastb(svbool_t, float16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s32)))
-int32_t svclastb(svbool_t, int32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s64)))
-int64_t svclastb(svbool_t, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_n_s16)))
-int16_t svclastb(svbool_t, int16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u8)))
-svuint8_t svclastb(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u32)))
-svuint32_t svclastb(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u64)))
-svuint64_t svclastb(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_u16)))
-svuint16_t svclastb(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s8)))
-svint8_t svclastb(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_f64)))
-svfloat64_t svclastb(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_f32)))
-svfloat32_t svclastb(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_f16)))
-svfloat16_t svclastb(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s32)))
-svint32_t svclastb(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s64)))
-svint64_t svclastb(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclastb_s16)))
-svint16_t svclastb(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s8_m)))
-svuint8_t svcls_m(svuint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s32_m)))
-svuint32_t svcls_m(svuint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s64_m)))
-svuint64_t svcls_m(svuint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s16_m)))
-svuint16_t svcls_m(svuint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s8_x)))
-svuint8_t svcls_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s32_x)))
-svuint32_t svcls_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s64_x)))
-svuint64_t svcls_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s16_x)))
-svuint16_t svcls_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s8_z)))
-svuint8_t svcls_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s32_z)))
-svuint32_t svcls_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s64_z)))
-svuint64_t svcls_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcls_s16_z)))
-svuint16_t svcls_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u8_m)))
-svuint8_t svclz_m(svuint8_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u32_m)))
-svuint32_t svclz_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u64_m)))
-svuint64_t svclz_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u16_m)))
-svuint16_t svclz_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s8_m)))
-svuint8_t svclz_m(svuint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s32_m)))
-svuint32_t svclz_m(svuint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s64_m)))
-svuint64_t svclz_m(svuint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s16_m)))
-svuint16_t svclz_m(svuint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u8_x)))
-svuint8_t svclz_x(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u32_x)))
-svuint32_t svclz_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u64_x)))
-svuint64_t svclz_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u16_x)))
-svuint16_t svclz_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s8_x)))
-svuint8_t svclz_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s32_x)))
-svuint32_t svclz_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s64_x)))
-svuint64_t svclz_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s16_x)))
-svuint16_t svclz_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u8_z)))
-svuint8_t svclz_z(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u32_z)))
-svuint32_t svclz_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u64_z)))
-svuint64_t svclz_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_u16_z)))
-svuint16_t svclz_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s8_z)))
-svuint8_t svclz_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s32_z)))
-svuint32_t svclz_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s64_z)))
-svuint64_t svclz_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclz_s16_z)))
-svuint16_t svclz_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f64_m)))
-svfloat64_t svcmla_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f32_m)))
-svfloat32_t svcmla_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f16_m)))
-svfloat16_t svcmla_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f64_x)))
-svfloat64_t svcmla_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f32_x)))
-svfloat32_t svcmla_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f16_x)))
-svfloat16_t svcmla_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f64_z)))
-svfloat64_t svcmla_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f32_z)))
-svfloat32_t svcmla_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_f16_z)))
-svfloat16_t svcmla_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_f32)))
-svfloat32_t svcmla_lane(svfloat32_t, svfloat32_t, svfloat32_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmla_lane_f16)))
-svfloat16_t svcmla_lane(svfloat16_t, svfloat16_t, svfloat16_t, uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_f64)))
-svbool_t svcmpeq(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_f32)))
-svbool_t svcmpeq(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_f16)))
-svbool_t svcmpeq(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u8)))
-svbool_t svcmpeq(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u32)))
-svbool_t svcmpeq(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u64)))
-svbool_t svcmpeq(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_u16)))
-svbool_t svcmpeq(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s8)))
-svbool_t svcmpeq(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s32)))
-svbool_t svcmpeq(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s64)))
-svbool_t svcmpeq(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_n_s16)))
-svbool_t svcmpeq(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u8)))
-svbool_t svcmpeq(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u32)))
-svbool_t svcmpeq(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u64)))
-svbool_t svcmpeq(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_u16)))
-svbool_t svcmpeq(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s8)))
-svbool_t svcmpeq(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s32)))
-svbool_t svcmpeq(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s64)))
-svbool_t svcmpeq(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_s16)))
-svbool_t svcmpeq(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_f64)))
-svbool_t svcmpeq(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_f32)))
-svbool_t svcmpeq(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_f16)))
-svbool_t svcmpeq(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_n_s8)))
-svbool_t svcmpeq_wide(svbool_t, svint8_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_n_s32)))
-svbool_t svcmpeq_wide(svbool_t, svint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_n_s16)))
-svbool_t svcmpeq_wide(svbool_t, svint16_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_s8)))
-svbool_t svcmpeq_wide(svbool_t, svint8_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_s32)))
-svbool_t svcmpeq_wide(svbool_t, svint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpeq_wide_s16)))
-svbool_t svcmpeq_wide(svbool_t, svint16_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_f64)))
-svbool_t svcmpge(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_f32)))
-svbool_t svcmpge(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_f16)))
-svbool_t svcmpge(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s8)))
-svbool_t svcmpge(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s32)))
-svbool_t svcmpge(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s64)))
-svbool_t svcmpge(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_s16)))
-svbool_t svcmpge(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u8)))
-svbool_t svcmpge(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u32)))
-svbool_t svcmpge(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u64)))
-svbool_t svcmpge(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_n_u16)))
-svbool_t svcmpge(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s8)))
-svbool_t svcmpge(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s32)))
-svbool_t svcmpge(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s64)))
-svbool_t svcmpge(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_s16)))
-svbool_t svcmpge(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_f64)))
-svbool_t svcmpge(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_f32)))
-svbool_t svcmpge(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_f16)))
-svbool_t svcmpge(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u8)))
-svbool_t svcmpge(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u32)))
-svbool_t svcmpge(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u64)))
-svbool_t svcmpge(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_u16)))
-svbool_t svcmpge(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_s8)))
-svbool_t svcmpge_wide(svbool_t, svint8_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_s32)))
-svbool_t svcmpge_wide(svbool_t, svint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_s16)))
-svbool_t svcmpge_wide(svbool_t, svint16_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_u8)))
-svbool_t svcmpge_wide(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_u32)))
-svbool_t svcmpge_wide(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_n_u16)))
-svbool_t svcmpge_wide(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_s8)))
-svbool_t svcmpge_wide(svbool_t, svint8_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_s32)))
-svbool_t svcmpge_wide(svbool_t, svint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_s16)))
-svbool_t svcmpge_wide(svbool_t, svint16_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_u8)))
-svbool_t svcmpge_wide(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_u32)))
-svbool_t svcmpge_wide(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpge_wide_u16)))
-svbool_t svcmpge_wide(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_f64)))
-svbool_t svcmpgt(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_f32)))
-svbool_t svcmpgt(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_f16)))
-svbool_t svcmpgt(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s8)))
-svbool_t svcmpgt(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s32)))
-svbool_t svcmpgt(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s64)))
-svbool_t svcmpgt(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_s16)))
-svbool_t svcmpgt(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u8)))
-svbool_t svcmpgt(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u32)))
-svbool_t svcmpgt(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u64)))
-svbool_t svcmpgt(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_n_u16)))
-svbool_t svcmpgt(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s8)))
-svbool_t svcmpgt(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s32)))
-svbool_t svcmpgt(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s64)))
-svbool_t svcmpgt(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_s16)))
-svbool_t svcmpgt(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_f64)))
-svbool_t svcmpgt(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_f32)))
-svbool_t svcmpgt(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_f16)))
-svbool_t svcmpgt(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u8)))
-svbool_t svcmpgt(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u32)))
-svbool_t svcmpgt(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u64)))
-svbool_t svcmpgt(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_u16)))
-svbool_t svcmpgt(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_s8)))
-svbool_t svcmpgt_wide(svbool_t, svint8_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_s32)))
-svbool_t svcmpgt_wide(svbool_t, svint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_s16)))
-svbool_t svcmpgt_wide(svbool_t, svint16_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_u8)))
-svbool_t svcmpgt_wide(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_u32)))
-svbool_t svcmpgt_wide(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_n_u16)))
-svbool_t svcmpgt_wide(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_s8)))
-svbool_t svcmpgt_wide(svbool_t, svint8_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_s32)))
-svbool_t svcmpgt_wide(svbool_t, svint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_s16)))
-svbool_t svcmpgt_wide(svbool_t, svint16_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_u8)))
-svbool_t svcmpgt_wide(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_u32)))
-svbool_t svcmpgt_wide(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpgt_wide_u16)))
-svbool_t svcmpgt_wide(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_f64)))
-svbool_t svcmple(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_f32)))
-svbool_t svcmple(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_f16)))
-svbool_t svcmple(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s8)))
-svbool_t svcmple(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s32)))
-svbool_t svcmple(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s64)))
-svbool_t svcmple(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_s16)))
-svbool_t svcmple(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u8)))
-svbool_t svcmple(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u32)))
-svbool_t svcmple(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u64)))
-svbool_t svcmple(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_n_u16)))
-svbool_t svcmple(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s8)))
-svbool_t svcmple(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s32)))
-svbool_t svcmple(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s64)))
-svbool_t svcmple(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_s16)))
-svbool_t svcmple(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_f64)))
-svbool_t svcmple(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_f32)))
-svbool_t svcmple(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_f16)))
-svbool_t svcmple(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u8)))
-svbool_t svcmple(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u32)))
-svbool_t svcmple(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u64)))
-svbool_t svcmple(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_u16)))
-svbool_t svcmple(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_s8)))
-svbool_t svcmple_wide(svbool_t, svint8_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_s32)))
-svbool_t svcmple_wide(svbool_t, svint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_s16)))
-svbool_t svcmple_wide(svbool_t, svint16_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_u8)))
-svbool_t svcmple_wide(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_u32)))
-svbool_t svcmple_wide(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_n_u16)))
-svbool_t svcmple_wide(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_s8)))
-svbool_t svcmple_wide(svbool_t, svint8_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_s32)))
-svbool_t svcmple_wide(svbool_t, svint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_s16)))
-svbool_t svcmple_wide(svbool_t, svint16_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_u8)))
-svbool_t svcmple_wide(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_u32)))
-svbool_t svcmple_wide(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmple_wide_u16)))
-svbool_t svcmple_wide(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u8)))
-svbool_t svcmplt(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u32)))
-svbool_t svcmplt(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u64)))
-svbool_t svcmplt(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_u16)))
-svbool_t svcmplt(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_f64)))
-svbool_t svcmplt(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_f32)))
-svbool_t svcmplt(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_f16)))
-svbool_t svcmplt(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s8)))
-svbool_t svcmplt(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s32)))
-svbool_t svcmplt(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s64)))
-svbool_t svcmplt(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_n_s16)))
-svbool_t svcmplt(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u8)))
-svbool_t svcmplt(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u32)))
-svbool_t svcmplt(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u64)))
-svbool_t svcmplt(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_u16)))
-svbool_t svcmplt(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s8)))
-svbool_t svcmplt(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s32)))
-svbool_t svcmplt(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s64)))
-svbool_t svcmplt(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_s16)))
-svbool_t svcmplt(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_f64)))
-svbool_t svcmplt(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_f32)))
-svbool_t svcmplt(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_f16)))
-svbool_t svcmplt(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_u8)))
-svbool_t svcmplt_wide(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_u32)))
-svbool_t svcmplt_wide(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_u16)))
-svbool_t svcmplt_wide(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_s8)))
-svbool_t svcmplt_wide(svbool_t, svint8_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_s32)))
-svbool_t svcmplt_wide(svbool_t, svint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_n_s16)))
-svbool_t svcmplt_wide(svbool_t, svint16_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_u8)))
-svbool_t svcmplt_wide(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_u32)))
-svbool_t svcmplt_wide(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_u16)))
-svbool_t svcmplt_wide(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_s8)))
-svbool_t svcmplt_wide(svbool_t, svint8_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_s32)))
-svbool_t svcmplt_wide(svbool_t, svint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmplt_wide_s16)))
-svbool_t svcmplt_wide(svbool_t, svint16_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_f64)))
-svbool_t svcmpne(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_f32)))
-svbool_t svcmpne(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_f16)))
-svbool_t svcmpne(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u8)))
-svbool_t svcmpne(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u32)))
-svbool_t svcmpne(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u64)))
-svbool_t svcmpne(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_u16)))
-svbool_t svcmpne(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s8)))
-svbool_t svcmpne(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s32)))
-svbool_t svcmpne(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s64)))
-svbool_t svcmpne(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_n_s16)))
-svbool_t svcmpne(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u8)))
-svbool_t svcmpne(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u32)))
-svbool_t svcmpne(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u64)))
-svbool_t svcmpne(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_u16)))
-svbool_t svcmpne(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s8)))
-svbool_t svcmpne(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s32)))
-svbool_t svcmpne(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s64)))
-svbool_t svcmpne(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_s16)))
-svbool_t svcmpne(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_f64)))
-svbool_t svcmpne(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_f32)))
-svbool_t svcmpne(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_f16)))
-svbool_t svcmpne(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_n_s8)))
-svbool_t svcmpne_wide(svbool_t, svint8_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_n_s32)))
-svbool_t svcmpne_wide(svbool_t, svint32_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_n_s16)))
-svbool_t svcmpne_wide(svbool_t, svint16_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_s8)))
-svbool_t svcmpne_wide(svbool_t, svint8_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_s32)))
-svbool_t svcmpne_wide(svbool_t, svint32_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpne_wide_s16)))
-svbool_t svcmpne_wide(svbool_t, svint16_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_n_f64)))
-svbool_t svcmpuo(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_n_f32)))
-svbool_t svcmpuo(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_n_f16)))
-svbool_t svcmpuo(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_f64)))
-svbool_t svcmpuo(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_f32)))
-svbool_t svcmpuo(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcmpuo_f16)))
-svbool_t svcmpuo(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u8_m)))
-svuint8_t svcnot_m(svuint8_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u32_m)))
-svuint32_t svcnot_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u64_m)))
-svuint64_t svcnot_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u16_m)))
-svuint16_t svcnot_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s8_m)))
-svint8_t svcnot_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s32_m)))
-svint32_t svcnot_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s64_m)))
-svint64_t svcnot_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s16_m)))
-svint16_t svcnot_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u8_x)))
-svuint8_t svcnot_x(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u32_x)))
-svuint32_t svcnot_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u64_x)))
-svuint64_t svcnot_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u16_x)))
-svuint16_t svcnot_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s8_x)))
-svint8_t svcnot_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s32_x)))
-svint32_t svcnot_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s64_x)))
-svint64_t svcnot_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s16_x)))
-svint16_t svcnot_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u8_z)))
-svuint8_t svcnot_z(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u32_z)))
-svuint32_t svcnot_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u64_z)))
-svuint64_t svcnot_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_u16_z)))
-svuint16_t svcnot_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s8_z)))
-svint8_t svcnot_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s32_z)))
-svint32_t svcnot_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s64_z)))
-svint64_t svcnot_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnot_s16_z)))
-svint16_t svcnot_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u8_m)))
-svuint8_t svcnt_m(svuint8_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u32_m)))
-svuint32_t svcnt_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u64_m)))
-svuint64_t svcnt_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u16_m)))
-svuint16_t svcnt_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s8_m)))
-svuint8_t svcnt_m(svuint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f64_m)))
-svuint64_t svcnt_m(svuint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f32_m)))
-svuint32_t svcnt_m(svuint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f16_m)))
-svuint16_t svcnt_m(svuint16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s32_m)))
-svuint32_t svcnt_m(svuint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s64_m)))
-svuint64_t svcnt_m(svuint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_m)))
-svuint16_t svcnt_m(svuint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u8_x)))
-svuint8_t svcnt_x(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u32_x)))
-svuint32_t svcnt_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u64_x)))
-svuint64_t svcnt_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u16_x)))
-svuint16_t svcnt_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s8_x)))
-svuint8_t svcnt_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f64_x)))
-svuint64_t svcnt_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f32_x)))
-svuint32_t svcnt_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f16_x)))
-svuint16_t svcnt_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s32_x)))
-svuint32_t svcnt_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s64_x)))
-svuint64_t svcnt_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_x)))
-svuint16_t svcnt_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u8_z)))
-svuint8_t svcnt_z(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u32_z)))
-svuint32_t svcnt_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u64_z)))
-svuint64_t svcnt_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_u16_z)))
-svuint16_t svcnt_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s8_z)))
-svuint8_t svcnt_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f64_z)))
-svuint64_t svcnt_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f32_z)))
-svuint32_t svcnt_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_f16_z)))
-svuint16_t svcnt_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s32_z)))
-svuint32_t svcnt_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s64_z)))
-svuint64_t svcnt_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_z)))
-svuint16_t svcnt_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u8)))
-svuint8x2_t svcreate2(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u32)))
-svuint32x2_t svcreate2(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u64)))
-svuint64x2_t svcreate2(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_u16)))
-svuint16x2_t svcreate2(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s8)))
-svint8x2_t svcreate2(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_f64)))
-svfloat64x2_t svcreate2(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_f32)))
-svfloat32x2_t svcreate2(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_f16)))
-svfloat16x2_t svcreate2(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s32)))
-svint32x2_t svcreate2(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s64)))
-svint64x2_t svcreate2(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate2_s16)))
-svint16x2_t svcreate2(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u8)))
-svuint8x3_t svcreate3(svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u32)))
-svuint32x3_t svcreate3(svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u64)))
-svuint64x3_t svcreate3(svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_u16)))
-svuint16x3_t svcreate3(svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s8)))
-svint8x3_t svcreate3(svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_f64)))
-svfloat64x3_t svcreate3(svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_f32)))
-svfloat32x3_t svcreate3(svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_f16)))
-svfloat16x3_t svcreate3(svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s32)))
-svint32x3_t svcreate3(svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s64)))
-svint64x3_t svcreate3(svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate3_s16)))
-svint16x3_t svcreate3(svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u8)))
-svuint8x4_t svcreate4(svuint8_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u32)))
-svuint32x4_t svcreate4(svuint32_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u64)))
-svuint64x4_t svcreate4(svuint64_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_u16)))
-svuint16x4_t svcreate4(svuint16_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s8)))
-svint8x4_t svcreate4(svint8_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_f64)))
-svfloat64x4_t svcreate4(svfloat64_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_f32)))
-svfloat32x4_t svcreate4(svfloat32_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_f16)))
-svfloat16x4_t svcreate4(svfloat16_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s32)))
-svint32x4_t svcreate4(svint32_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s64)))
-svint64x4_t svcreate4(svint64_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcreate4_s16)))
-svint16x4_t svcreate4(svint16_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f32_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f64_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f64_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_f64_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s16_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s16_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s16_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s32_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s32_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s32_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s64_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s64_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_s64_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u16_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u16_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u16_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u32_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u32_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u32_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u64_m)))
-svfloat16_t svcvt_f16_m(svfloat16_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u64_x)))
-svfloat16_t svcvt_f16_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f16_u64_z)))
-svfloat16_t svcvt_f16_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_m)))
-svfloat32_t svcvt_f32_m(svfloat32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_x)))
-svfloat32_t svcvt_f32_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f16_z)))
-svfloat32_t svcvt_f32_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f64_m)))
-svfloat32_t svcvt_f32_m(svfloat32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f64_x)))
-svfloat32_t svcvt_f32_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_f64_z)))
-svfloat32_t svcvt_f32_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_m)))
-svfloat32_t svcvt_f32_m(svfloat32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_x)))
-svfloat32_t svcvt_f32_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s32_z)))
-svfloat32_t svcvt_f32_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s64_m)))
-svfloat32_t svcvt_f32_m(svfloat32_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s64_x)))
-svfloat32_t svcvt_f32_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_s64_z)))
-svfloat32_t svcvt_f32_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_m)))
-svfloat32_t svcvt_f32_m(svfloat32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_x)))
-svfloat32_t svcvt_f32_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u32_z)))
-svfloat32_t svcvt_f32_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u64_m)))
-svfloat32_t svcvt_f32_m(svfloat32_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u64_x)))
-svfloat32_t svcvt_f32_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f32_u64_z)))
-svfloat32_t svcvt_f32_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f16_m)))
-svfloat64_t svcvt_f64_m(svfloat64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f16_x)))
-svfloat64_t svcvt_f64_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f16_z)))
-svfloat64_t svcvt_f64_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f32_m)))
-svfloat64_t svcvt_f64_m(svfloat64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f32_x)))
-svfloat64_t svcvt_f64_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_f32_z)))
-svfloat64_t svcvt_f64_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s32_m)))
-svfloat64_t svcvt_f64_m(svfloat64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s32_x)))
-svfloat64_t svcvt_f64_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s32_z)))
-svfloat64_t svcvt_f64_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s64_m)))
-svfloat64_t svcvt_f64_m(svfloat64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s64_x)))
-svfloat64_t svcvt_f64_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_s64_z)))
-svfloat64_t svcvt_f64_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u32_m)))
-svfloat64_t svcvt_f64_m(svfloat64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u32_x)))
-svfloat64_t svcvt_f64_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u32_z)))
-svfloat64_t svcvt_f64_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u64_m)))
-svfloat64_t svcvt_f64_m(svfloat64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u64_x)))
-svfloat64_t svcvt_f64_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_f64_u64_z)))
-svfloat64_t svcvt_f64_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s16_f16_m)))
-svint16_t svcvt_s16_m(svint16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s16_f16_x)))
-svint16_t svcvt_s16_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s16_f16_z)))
-svint16_t svcvt_s16_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f16_m)))
-svint32_t svcvt_s32_m(svint32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f16_x)))
-svint32_t svcvt_s32_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f16_z)))
-svint32_t svcvt_s32_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_m)))
-svint32_t svcvt_s32_m(svint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_x)))
-svint32_t svcvt_s32_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f32_z)))
-svint32_t svcvt_s32_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f64_m)))
-svint32_t svcvt_s32_m(svint32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f64_x)))
-svint32_t svcvt_s32_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s32_f64_z)))
-svint32_t svcvt_s32_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f16_m)))
-svint64_t svcvt_s64_m(svint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f16_x)))
-svint64_t svcvt_s64_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f16_z)))
-svint64_t svcvt_s64_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f32_m)))
-svint64_t svcvt_s64_m(svint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f32_x)))
-svint64_t svcvt_s64_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f32_z)))
-svint64_t svcvt_s64_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f64_m)))
-svint64_t svcvt_s64_m(svint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f64_x)))
-svint64_t svcvt_s64_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_s64_f64_z)))
-svint64_t svcvt_s64_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u16_f16_m)))
-svuint16_t svcvt_u16_m(svuint16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u16_f16_x)))
-svuint16_t svcvt_u16_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u16_f16_z)))
-svuint16_t svcvt_u16_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f16_m)))
-svuint32_t svcvt_u32_m(svuint32_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f16_x)))
-svuint32_t svcvt_u32_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f16_z)))
-svuint32_t svcvt_u32_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_m)))
-svuint32_t svcvt_u32_m(svuint32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_x)))
-svuint32_t svcvt_u32_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f32_z)))
-svuint32_t svcvt_u32_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f64_m)))
-svuint32_t svcvt_u32_m(svuint32_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f64_x)))
-svuint32_t svcvt_u32_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u32_f64_z)))
-svuint32_t svcvt_u32_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f16_m)))
-svuint64_t svcvt_u64_m(svuint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f16_x)))
-svuint64_t svcvt_u64_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f16_z)))
-svuint64_t svcvt_u64_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f32_m)))
-svuint64_t svcvt_u64_m(svuint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f32_x)))
-svuint64_t svcvt_u64_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f32_z)))
-svuint64_t svcvt_u64_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f64_m)))
-svuint64_t svcvt_u64_m(svuint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f64_x)))
-svuint64_t svcvt_u64_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_u64_f64_z)))
-svuint64_t svcvt_u64_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f64_m)))
-svfloat64_t svdiv_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f32_m)))
-svfloat32_t svdiv_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f16_m)))
-svfloat16_t svdiv_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f64_x)))
-svfloat64_t svdiv_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f32_x)))
-svfloat32_t svdiv_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f16_x)))
-svfloat16_t svdiv_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f64_z)))
-svfloat64_t svdiv_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f32_z)))
-svfloat32_t svdiv_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_f16_z)))
-svfloat16_t svdiv_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s32_m)))
-svint32_t svdiv_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s64_m)))
-svint64_t svdiv_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s32_x)))
-svint32_t svdiv_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s64_x)))
-svint64_t svdiv_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s32_z)))
-svint32_t svdiv_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_s64_z)))
-svint64_t svdiv_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u32_m)))
-svuint32_t svdiv_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u64_m)))
-svuint64_t svdiv_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u32_x)))
-svuint32_t svdiv_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u64_x)))
-svuint64_t svdiv_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u32_z)))
-svuint32_t svdiv_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_n_u64_z)))
-svuint64_t svdiv_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f64_m)))
-svfloat64_t svdiv_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f32_m)))
-svfloat32_t svdiv_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f16_m)))
-svfloat16_t svdiv_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f64_x)))
-svfloat64_t svdiv_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f32_x)))
-svfloat32_t svdiv_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f16_x)))
-svfloat16_t svdiv_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f64_z)))
-svfloat64_t svdiv_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f32_z)))
-svfloat32_t svdiv_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_f16_z)))
-svfloat16_t svdiv_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s32_m)))
-svint32_t svdiv_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s64_m)))
-svint64_t svdiv_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s32_x)))
-svint32_t svdiv_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s64_x)))
-svint64_t svdiv_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s32_z)))
-svint32_t svdiv_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_s64_z)))
-svint64_t svdiv_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u32_m)))
-svuint32_t svdiv_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u64_m)))
-svuint64_t svdiv_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u32_x)))
-svuint32_t svdiv_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u64_x)))
-svuint64_t svdiv_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u32_z)))
-svuint32_t svdiv_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdiv_u64_z)))
-svuint64_t svdiv_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f64_m)))
-svfloat64_t svdivr_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f32_m)))
-svfloat32_t svdivr_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f16_m)))
-svfloat16_t svdivr_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f64_x)))
-svfloat64_t svdivr_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f32_x)))
-svfloat32_t svdivr_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f16_x)))
-svfloat16_t svdivr_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f64_z)))
-svfloat64_t svdivr_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f32_z)))
-svfloat32_t svdivr_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_f16_z)))
-svfloat16_t svdivr_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s32_m)))
-svint32_t svdivr_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s64_m)))
-svint64_t svdivr_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s32_x)))
-svint32_t svdivr_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s64_x)))
-svint64_t svdivr_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s32_z)))
-svint32_t svdivr_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_s64_z)))
-svint64_t svdivr_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u32_m)))
-svuint32_t svdivr_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u64_m)))
-svuint64_t svdivr_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u32_x)))
-svuint32_t svdivr_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u64_x)))
-svuint64_t svdivr_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u32_z)))
-svuint32_t svdivr_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_n_u64_z)))
-svuint64_t svdivr_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f64_m)))
-svfloat64_t svdivr_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f32_m)))
-svfloat32_t svdivr_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f16_m)))
-svfloat16_t svdivr_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f64_x)))
-svfloat64_t svdivr_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f32_x)))
-svfloat32_t svdivr_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f16_x)))
-svfloat16_t svdivr_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f64_z)))
-svfloat64_t svdivr_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f32_z)))
-svfloat32_t svdivr_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_f16_z)))
-svfloat16_t svdivr_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s32_m)))
-svint32_t svdivr_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s64_m)))
-svint64_t svdivr_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s32_x)))
-svint32_t svdivr_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s64_x)))
-svint64_t svdivr_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s32_z)))
-svint32_t svdivr_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_s64_z)))
-svint64_t svdivr_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u32_m)))
-svuint32_t svdivr_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u64_m)))
-svuint64_t svdivr_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u32_x)))
-svuint32_t svdivr_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u64_x)))
-svuint64_t svdivr_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u32_z)))
-svuint32_t svdivr_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdivr_u64_z)))
-svuint64_t svdivr_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_s32)))
-svint32_t svdot(svint32_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_s64)))
-svint64_t svdot(svint64_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_u32)))
-svuint32_t svdot(svuint32_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_u64)))
-svuint64_t svdot(svuint64_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_s32)))
-svint32_t svdot(svint32_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_s64)))
-svint64_t svdot(svint64_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_u32)))
-svuint32_t svdot(svuint32_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_u64)))
-svuint64_t svdot(svuint64_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_s32)))
-svint32_t svdot_lane(svint32_t, svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_s64)))
-svint64_t svdot_lane(svint64_t, svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_u32)))
-svuint32_t svdot_lane(svuint32_t, svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_u64)))
-svuint64_t svdot_lane(svuint64_t, svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8)))
-svuint8_t svdup_u8(uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32)))
-svuint32_t svdup_u32(uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64)))
-svuint64_t svdup_u64(uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16)))
-svuint16_t svdup_u16(uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8)))
-svint8_t svdup_s8(int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64)))
-svfloat64_t svdup_f64(float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32)))
-svfloat32_t svdup_f32(float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16)))
-svfloat16_t svdup_f16(float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32)))
-svint32_t svdup_s32(int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64)))
-svint64_t svdup_s64(int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16)))
-svint16_t svdup_s16(int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8_m)))
-svuint8_t svdup_u8_m(svuint8_t, svbool_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32_m)))
-svuint32_t svdup_u32_m(svuint32_t, svbool_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64_m)))
-svuint64_t svdup_u64_m(svuint64_t, svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16_m)))
-svuint16_t svdup_u16_m(svuint16_t, svbool_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8_m)))
-svint8_t svdup_s8_m(svint8_t, svbool_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64_m)))
-svfloat64_t svdup_f64_m(svfloat64_t, svbool_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32_m)))
-svfloat32_t svdup_f32_m(svfloat32_t, svbool_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16_m)))
-svfloat16_t svdup_f16_m(svfloat16_t, svbool_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32_m)))
-svint32_t svdup_s32_m(svint32_t, svbool_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64_m)))
-svint64_t svdup_s64_m(svint64_t, svbool_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16_m)))
-svint16_t svdup_s16_m(svint16_t, svbool_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b8)))
-svbool_t svdup_b8(bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b32)))
-svbool_t svdup_b32(bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b64)))
-svbool_t svdup_b64(bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_b16)))
-svbool_t svdup_b16(bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8_x)))
-svuint8_t svdup_u8_x(svbool_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32_x)))
-svuint32_t svdup_u32_x(svbool_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64_x)))
-svuint64_t svdup_u64_x(svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16_x)))
-svuint16_t svdup_u16_x(svbool_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8_x)))
-svint8_t svdup_s8_x(svbool_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64_x)))
-svfloat64_t svdup_f64_x(svbool_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32_x)))
-svfloat32_t svdup_f32_x(svbool_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16_x)))
-svfloat16_t svdup_f16_x(svbool_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32_x)))
-svint32_t svdup_s32_x(svbool_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64_x)))
-svint64_t svdup_s64_x(svbool_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16_x)))
-svint16_t svdup_s16_x(svbool_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u8_z)))
-svuint8_t svdup_u8_z(svbool_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u32_z)))
-svuint32_t svdup_u32_z(svbool_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u64_z)))
-svuint64_t svdup_u64_z(svbool_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_u16_z)))
-svuint16_t svdup_u16_z(svbool_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s8_z)))
-svint8_t svdup_s8_z(svbool_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f64_z)))
-svfloat64_t svdup_f64_z(svbool_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f32_z)))
-svfloat32_t svdup_f32_z(svbool_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_f16_z)))
-svfloat16_t svdup_f16_z(svbool_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s32_z)))
-svint32_t svdup_s32_z(svbool_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s64_z)))
-svint64_t svdup_s64_z(svbool_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_n_s16_z)))
-svint16_t svdup_s16_z(svbool_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u8)))
-svuint8_t svdup_lane(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u32)))
-svuint32_t svdup_lane(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u64)))
-svuint64_t svdup_lane(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_u16)))
-svuint16_t svdup_lane(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s8)))
-svint8_t svdup_lane(svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_f64)))
-svfloat64_t svdup_lane(svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_f32)))
-svfloat32_t svdup_lane(svfloat32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_f16)))
-svfloat16_t svdup_lane(svfloat16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s32)))
-svint32_t svdup_lane(svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s64)))
-svint64_t svdup_lane(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s16)))
-svint16_t svdup_lane(svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
-svuint8_t svdupq_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
-svint8_t svdupq_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u16)))
-svuint16_t svdupq_u16(uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f16)))
-svfloat16_t svdupq_f16(float16_t, float16_t, float16_t, float16_t, float16_t, float16_t, float16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s16)))
-svint16_t svdupq_s16(int16_t, int16_t, int16_t, int16_t, int16_t, int16_t, int16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u32)))
-svuint32_t svdupq_u32(uint32_t, uint32_t, uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f32)))
-svfloat32_t svdupq_f32(float32_t, float32_t, float32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s32)))
-svint32_t svdupq_s32(int32_t, int32_t, int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u64)))
-svuint64_t svdupq_u64(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f64)))
-svfloat64_t svdupq_f64(float64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s64)))
-svint64_t svdupq_s64(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
-svbool_t svdupq_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b16)))
-svbool_t svdupq_b16(bool, bool, bool, bool, bool, bool, bool, bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b32)))
-svbool_t svdupq_b32(bool, bool, bool, bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b64)))
-svbool_t svdupq_b64(bool, bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u8)))
-svuint8_t svdupq_lane(svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u32)))
-svuint32_t svdupq_lane(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u64)))
-svuint64_t svdupq_lane(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u16)))
-svuint16_t svdupq_lane(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s8)))
-svint8_t svdupq_lane(svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_f64)))
-svfloat64_t svdupq_lane(svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_f32)))
-svfloat32_t svdupq_lane(svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_f16)))
-svfloat16_t svdupq_lane(svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s32)))
-svint32_t svdupq_lane(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s64)))
-svint64_t svdupq_lane(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_s16)))
-svint16_t svdupq_lane(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_b_z)))
-svbool_t sveor_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u8_m)))
-svuint8_t sveor_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u32_m)))
-svuint32_t sveor_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u64_m)))
-svuint64_t sveor_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u16_m)))
-svuint16_t sveor_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s8_m)))
-svint8_t sveor_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s32_m)))
-svint32_t sveor_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s64_m)))
-svint64_t sveor_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s16_m)))
-svint16_t sveor_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u8_x)))
-svuint8_t sveor_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u32_x)))
-svuint32_t sveor_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u64_x)))
-svuint64_t sveor_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u16_x)))
-svuint16_t sveor_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s8_x)))
-svint8_t sveor_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s32_x)))
-svint32_t sveor_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s64_x)))
-svint64_t sveor_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s16_x)))
-svint16_t sveor_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u8_z)))
-svuint8_t sveor_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u32_z)))
-svuint32_t sveor_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u64_z)))
-svuint64_t sveor_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_u16_z)))
-svuint16_t sveor_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s8_z)))
-svint8_t sveor_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s32_z)))
-svint32_t sveor_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s64_z)))
-svint64_t sveor_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_n_s16_z)))
-svint16_t sveor_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u8_m)))
-svuint8_t sveor_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u32_m)))
-svuint32_t sveor_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u64_m)))
-svuint64_t sveor_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u16_m)))
-svuint16_t sveor_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s8_m)))
-svint8_t sveor_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s32_m)))
-svint32_t sveor_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s64_m)))
-svint64_t sveor_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s16_m)))
-svint16_t sveor_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u8_x)))
-svuint8_t sveor_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u32_x)))
-svuint32_t sveor_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u64_x)))
-svuint64_t sveor_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u16_x)))
-svuint16_t sveor_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s8_x)))
-svint8_t sveor_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s32_x)))
-svint32_t sveor_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s64_x)))
-svint64_t sveor_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s16_x)))
-svint16_t sveor_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u8_z)))
-svuint8_t sveor_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u32_z)))
-svuint32_t sveor_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u64_z)))
-svuint64_t sveor_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_u16_z)))
-svuint16_t sveor_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s8_z)))
-svint8_t sveor_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s32_z)))
-svint32_t sveor_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s64_z)))
-svint64_t sveor_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveor_s16_z)))
-svint16_t sveor_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u8)))
-uint8_t sveorv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u32)))
-uint32_t sveorv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u64)))
-uint64_t sveorv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_u16)))
-uint16_t sveorv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s8)))
-int8_t sveorv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s32)))
-int32_t sveorv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s64)))
-int64_t sveorv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_sveorv_s16)))
-int16_t sveorv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u8)))
-svuint8_t svext(svuint8_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u32)))
-svuint32_t svext(svuint32_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u64)))
-svuint64_t svext(svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_u16)))
-svuint16_t svext(svuint16_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s8)))
-svint8_t svext(svint8_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_f64)))
-svfloat64_t svext(svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_f32)))
-svfloat32_t svext(svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_f16)))
-svfloat16_t svext(svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s32)))
-svint32_t svext(svint32_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s64)))
-svint64_t svext(svint64_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svext_s16)))
-svint16_t svext(svint16_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s32_m)))
-svint32_t svextb_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s64_m)))
-svint64_t svextb_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s16_m)))
-svint16_t svextb_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s32_x)))
-svint32_t svextb_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s64_x)))
-svint64_t svextb_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s16_x)))
-svint16_t svextb_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s32_z)))
-svint32_t svextb_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s64_z)))
-svint64_t svextb_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_s16_z)))
-svint16_t svextb_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u32_m)))
-svuint32_t svextb_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u64_m)))
-svuint64_t svextb_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u16_m)))
-svuint16_t svextb_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u32_x)))
-svuint32_t svextb_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u64_x)))
-svuint64_t svextb_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u16_x)))
-svuint16_t svextb_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u32_z)))
-svuint32_t svextb_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u64_z)))
-svuint64_t svextb_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextb_u16_z)))
-svuint16_t svextb_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s32_m)))
-svint32_t svexth_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s64_m)))
-svint64_t svexth_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s32_x)))
-svint32_t svexth_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s64_x)))
-svint64_t svexth_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s32_z)))
-svint32_t svexth_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_s64_z)))
-svint64_t svexth_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u32_m)))
-svuint32_t svexth_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u64_m)))
-svuint64_t svexth_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u32_x)))
-svuint32_t svexth_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u64_x)))
-svuint64_t svexth_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u32_z)))
-svuint32_t svexth_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svexth_u64_z)))
-svuint64_t svexth_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_s64_m)))
-svint64_t svextw_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_s64_x)))
-svint64_t svextw_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_s64_z)))
-svint64_t svextw_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_u64_m)))
-svuint64_t svextw_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_u64_x)))
-svuint64_t svextw_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextw_u64_z)))
-svuint64_t svextw_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u8)))
-svuint8_t svget2(svuint8x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u32)))
-svuint32_t svget2(svuint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u64)))
-svuint64_t svget2(svuint64x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_u16)))
-svuint16_t svget2(svuint16x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s8)))
-svint8_t svget2(svint8x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_f64)))
-svfloat64_t svget2(svfloat64x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_f32)))
-svfloat32_t svget2(svfloat32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_f16)))
-svfloat16_t svget2(svfloat16x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s32)))
-svint32_t svget2(svint32x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s64)))
-svint64_t svget2(svint64x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget2_s16)))
-svint16_t svget2(svint16x2_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u8)))
-svuint8_t svget3(svuint8x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u32)))
-svuint32_t svget3(svuint32x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u64)))
-svuint64_t svget3(svuint64x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_u16)))
-svuint16_t svget3(svuint16x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s8)))
-svint8_t svget3(svint8x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_f64)))
-svfloat64_t svget3(svfloat64x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_f32)))
-svfloat32_t svget3(svfloat32x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_f16)))
-svfloat16_t svget3(svfloat16x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s32)))
-svint32_t svget3(svint32x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s64)))
-svint64_t svget3(svint64x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget3_s16)))
-svint16_t svget3(svint16x3_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u8)))
-svuint8_t svget4(svuint8x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u32)))
-svuint32_t svget4(svuint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u64)))
-svuint64_t svget4(svuint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_u16)))
-svuint16_t svget4(svuint16x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s8)))
-svint8_t svget4(svint8x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_f64)))
-svfloat64_t svget4(svfloat64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_f32)))
-svfloat32_t svget4(svfloat32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_f16)))
-svfloat16_t svget4(svfloat16x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s32)))
-svint32_t svget4(svint32x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s64)))
-svint64_t svget4(svint64x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget4_s16)))
-svint16_t svget4(svint16x4_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u8)))
-svuint8_t svinsr(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u32)))
-svuint32_t svinsr(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u64)))
-svuint64_t svinsr(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_u16)))
-svuint16_t svinsr(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s8)))
-svint8_t svinsr(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_f64)))
-svfloat64_t svinsr(svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_f32)))
-svfloat32_t svinsr(svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_f16)))
-svfloat16_t svinsr(svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s32)))
-svint32_t svinsr(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s64)))
-svint64_t svinsr(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svinsr_n_s16)))
-svint16_t svinsr(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u8)))
-uint8_t svlasta(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u32)))
-uint32_t svlasta(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u64)))
-uint64_t svlasta(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_u16)))
-uint16_t svlasta(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s8)))
-int8_t svlasta(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_f64)))
-float64_t svlasta(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_f32)))
-float32_t svlasta(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_f16)))
-float16_t svlasta(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s32)))
-int32_t svlasta(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s64)))
-int64_t svlasta(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlasta_s16)))
-int16_t svlasta(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u8)))
-uint8_t svlastb(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u32)))
-uint32_t svlastb(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u64)))
-uint64_t svlastb(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_u16)))
-uint16_t svlastb(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s8)))
-int8_t svlastb(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_f64)))
-float64_t svlastb(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_f32)))
-float32_t svlastb(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_f16)))
-float16_t svlastb(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s32)))
-int32_t svlastb(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s64)))
-int64_t svlastb(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlastb_s16)))
-int16_t svlastb(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u8)))
-svuint8_t svld1(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u32)))
-svuint32_t svld1(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u64)))
-svuint64_t svld1(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_u16)))
-svuint16_t svld1(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s8)))
-svint8_t svld1(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f64)))
-svfloat64_t svld1(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f32)))
-svfloat32_t svld1(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_f16)))
-svfloat16_t svld1(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s32)))
-svint32_t svld1(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s64)))
-svint64_t svld1(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_s16)))
-svint16_t svld1(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u8)))
-svuint8_t svld1_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u32)))
-svuint32_t svld1_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u64)))
-svuint64_t svld1_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_u16)))
-svuint16_t svld1_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s8)))
-svint8_t svld1_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f64)))
-svfloat64_t svld1_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f32)))
-svfloat32_t svld1_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_f16)))
-svfloat16_t svld1_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s32)))
-svint32_t svld1_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s64)))
-svint64_t svld1_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1_vnum_s16)))
-svint16_t svld1_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u8)))
-svuint8_t svld1rq(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u32)))
-svuint32_t svld1rq(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u64)))
-svuint64_t svld1rq(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_u16)))
-svuint16_t svld1rq(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s8)))
-svint8_t svld1rq(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_f64)))
-svfloat64_t svld1rq(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_f32)))
-svfloat32_t svld1rq(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_f16)))
-svfloat16_t svld1rq(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s32)))
-svint32_t svld1rq(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s64)))
-svint64_t svld1rq(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1rq_s16)))
-svint16_t svld1rq(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u8)))
-svuint8x2_t svld2(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u32)))
-svuint32x2_t svld2(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u64)))
-svuint64x2_t svld2(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_u16)))
-svuint16x2_t svld2(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s8)))
-svint8x2_t svld2(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_f64)))
-svfloat64x2_t svld2(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_f32)))
-svfloat32x2_t svld2(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_f16)))
-svfloat16x2_t svld2(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s32)))
-svint32x2_t svld2(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s64)))
-svint64x2_t svld2(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_s16)))
-svint16x2_t svld2(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u8)))
-svuint8x2_t svld2_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u32)))
-svuint32x2_t svld2_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u64)))
-svuint64x2_t svld2_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_u16)))
-svuint16x2_t svld2_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s8)))
-svint8x2_t svld2_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_f64)))
-svfloat64x2_t svld2_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_f32)))
-svfloat32x2_t svld2_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_f16)))
-svfloat16x2_t svld2_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s32)))
-svint32x2_t svld2_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s64)))
-svint64x2_t svld2_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld2_vnum_s16)))
-svint16x2_t svld2_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u8)))
-svuint8x3_t svld3(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u32)))
-svuint32x3_t svld3(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u64)))
-svuint64x3_t svld3(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_u16)))
-svuint16x3_t svld3(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s8)))
-svint8x3_t svld3(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_f64)))
-svfloat64x3_t svld3(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_f32)))
-svfloat32x3_t svld3(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_f16)))
-svfloat16x3_t svld3(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s32)))
-svint32x3_t svld3(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s64)))
-svint64x3_t svld3(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_s16)))
-svint16x3_t svld3(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u8)))
-svuint8x3_t svld3_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u32)))
-svuint32x3_t svld3_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u64)))
-svuint64x3_t svld3_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_u16)))
-svuint16x3_t svld3_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s8)))
-svint8x3_t svld3_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_f64)))
-svfloat64x3_t svld3_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_f32)))
-svfloat32x3_t svld3_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_f16)))
-svfloat16x3_t svld3_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s32)))
-svint32x3_t svld3_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s64)))
-svint64x3_t svld3_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld3_vnum_s16)))
-svint16x3_t svld3_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u8)))
-svuint8x4_t svld4(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u32)))
-svuint32x4_t svld4(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u64)))
-svuint64x4_t svld4(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_u16)))
-svuint16x4_t svld4(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s8)))
-svint8x4_t svld4(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_f64)))
-svfloat64x4_t svld4(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_f32)))
-svfloat32x4_t svld4(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_f16)))
-svfloat16x4_t svld4(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s32)))
-svint32x4_t svld4(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s64)))
-svint64x4_t svld4(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_s16)))
-svint16x4_t svld4(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u8)))
-svuint8x4_t svld4_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u32)))
-svuint32x4_t svld4_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u64)))
-svuint64x4_t svld4_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_u16)))
-svuint16x4_t svld4_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s8)))
-svint8x4_t svld4_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_f64)))
-svfloat64x4_t svld4_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_f32)))
-svfloat32x4_t svld4_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_f16)))
-svfloat16x4_t svld4_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s32)))
-svint32x4_t svld4_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s64)))
-svint64x4_t svld4_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld4_vnum_s16)))
-svint16x4_t svld4_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u8)))
-svuint8_t svldnt1(svbool_t, uint8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u32)))
-svuint32_t svldnt1(svbool_t, uint32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u64)))
-svuint64_t svldnt1(svbool_t, uint64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_u16)))
-svuint16_t svldnt1(svbool_t, uint16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s8)))
-svint8_t svldnt1(svbool_t, int8_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f64)))
-svfloat64_t svldnt1(svbool_t, float64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f32)))
-svfloat32_t svldnt1(svbool_t, float32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_f16)))
-svfloat16_t svldnt1(svbool_t, float16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s32)))
-svint32_t svldnt1(svbool_t, int32_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s64)))
-svint64_t svldnt1(svbool_t, int64_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_s16)))
-svint16_t svldnt1(svbool_t, int16_t const *);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u8)))
-svuint8_t svldnt1_vnum(svbool_t, uint8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u32)))
-svuint32_t svldnt1_vnum(svbool_t, uint32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u64)))
-svuint64_t svldnt1_vnum(svbool_t, uint64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_u16)))
-svuint16_t svldnt1_vnum(svbool_t, uint16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s8)))
-svint8_t svldnt1_vnum(svbool_t, int8_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f64)))
-svfloat64_t svldnt1_vnum(svbool_t, float64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f32)))
-svfloat32_t svldnt1_vnum(svbool_t, float32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_f16)))
-svfloat16_t svldnt1_vnum(svbool_t, float16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s32)))
-svint32_t svldnt1_vnum(svbool_t, int32_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s64)))
-svint64_t svldnt1_vnum(svbool_t, int64_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svldnt1_vnum_s16)))
-svint16_t svldnt1_vnum(svbool_t, int16_t const *, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u8)))
-uint64_t svlen(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u32)))
-uint64_t svlen(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u64)))
-uint64_t svlen(svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_u16)))
-uint64_t svlen(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s8)))
-uint64_t svlen(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_f64)))
-uint64_t svlen(svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_f32)))
-uint64_t svlen(svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_f16)))
-uint64_t svlen(svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s32)))
-uint64_t svlen(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s64)))
-uint64_t svlen(svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlen_s16)))
-uint64_t svlen(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u8_m)))
-svuint8_t svlsl_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u32_m)))
-svuint32_t svlsl_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u64_m)))
-svuint64_t svlsl_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u16_m)))
-svuint16_t svlsl_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s8_m)))
-svint8_t svlsl_m(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s32_m)))
-svint32_t svlsl_m(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s64_m)))
-svint64_t svlsl_m(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s16_m)))
-svint16_t svlsl_m(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u8_x)))
-svuint8_t svlsl_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u32_x)))
-svuint32_t svlsl_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u64_x)))
-svuint64_t svlsl_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u16_x)))
-svuint16_t svlsl_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s8_x)))
-svint8_t svlsl_x(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s32_x)))
-svint32_t svlsl_x(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s64_x)))
-svint64_t svlsl_x(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s16_x)))
-svint16_t svlsl_x(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u8_z)))
-svuint8_t svlsl_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u32_z)))
-svuint32_t svlsl_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u64_z)))
-svuint64_t svlsl_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_u16_z)))
-svuint16_t svlsl_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s8_z)))
-svint8_t svlsl_z(svbool_t, svint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s32_z)))
-svint32_t svlsl_z(svbool_t, svint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s64_z)))
-svint64_t svlsl_z(svbool_t, svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_n_s16_z)))
-svint16_t svlsl_z(svbool_t, svint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u8_m)))
-svuint8_t svlsl_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u32_m)))
-svuint32_t svlsl_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u64_m)))
-svuint64_t svlsl_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u16_m)))
-svuint16_t svlsl_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s8_m)))
-svint8_t svlsl_m(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s32_m)))
-svint32_t svlsl_m(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s64_m)))
-svint64_t svlsl_m(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s16_m)))
-svint16_t svlsl_m(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u8_x)))
-svuint8_t svlsl_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u32_x)))
-svuint32_t svlsl_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u64_x)))
-svuint64_t svlsl_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u16_x)))
-svuint16_t svlsl_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s8_x)))
-svint8_t svlsl_x(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s32_x)))
-svint32_t svlsl_x(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s64_x)))
-svint64_t svlsl_x(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s16_x)))
-svint16_t svlsl_x(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u8_z)))
-svuint8_t svlsl_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u32_z)))
-svuint32_t svlsl_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u64_z)))
-svuint64_t svlsl_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_u16_z)))
-svuint16_t svlsl_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s8_z)))
-svint8_t svlsl_z(svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s32_z)))
-svint32_t svlsl_z(svbool_t, svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s64_z)))
-svint64_t svlsl_z(svbool_t, svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_s16_z)))
-svint16_t svlsl_z(svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u8_m)))
-svuint8_t svlsl_wide_m(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u32_m)))
-svuint32_t svlsl_wide_m(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u16_m)))
-svuint16_t svlsl_wide_m(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s8_m)))
-svint8_t svlsl_wide_m(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s32_m)))
-svint32_t svlsl_wide_m(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s16_m)))
-svint16_t svlsl_wide_m(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u8_x)))
-svuint8_t svlsl_wide_x(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u32_x)))
-svuint32_t svlsl_wide_x(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u16_x)))
-svuint16_t svlsl_wide_x(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s8_x)))
-svint8_t svlsl_wide_x(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s32_x)))
-svint32_t svlsl_wide_x(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s16_x)))
-svint16_t svlsl_wide_x(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u8_z)))
-svuint8_t svlsl_wide_z(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u32_z)))
-svuint32_t svlsl_wide_z(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_u16_z)))
-svuint16_t svlsl_wide_z(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s8_z)))
-svint8_t svlsl_wide_z(svbool_t, svint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s32_z)))
-svint32_t svlsl_wide_z(svbool_t, svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_n_s16_z)))
-svint16_t svlsl_wide_z(svbool_t, svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u8_m)))
-svuint8_t svlsl_wide_m(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u32_m)))
-svuint32_t svlsl_wide_m(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u16_m)))
-svuint16_t svlsl_wide_m(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s8_m)))
-svint8_t svlsl_wide_m(svbool_t, svint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s32_m)))
-svint32_t svlsl_wide_m(svbool_t, svint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s16_m)))
-svint16_t svlsl_wide_m(svbool_t, svint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u8_x)))
-svuint8_t svlsl_wide_x(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u32_x)))
-svuint32_t svlsl_wide_x(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u16_x)))
-svuint16_t svlsl_wide_x(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s8_x)))
-svint8_t svlsl_wide_x(svbool_t, svint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s32_x)))
-svint32_t svlsl_wide_x(svbool_t, svint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s16_x)))
-svint16_t svlsl_wide_x(svbool_t, svint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u8_z)))
-svuint8_t svlsl_wide_z(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u32_z)))
-svuint32_t svlsl_wide_z(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_u16_z)))
-svuint16_t svlsl_wide_z(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s8_z)))
-svint8_t svlsl_wide_z(svbool_t, svint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s32_z)))
-svint32_t svlsl_wide_z(svbool_t, svint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsl_wide_s16_z)))
-svint16_t svlsl_wide_z(svbool_t, svint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u8_m)))
-svuint8_t svlsr_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u32_m)))
-svuint32_t svlsr_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u64_m)))
-svuint64_t svlsr_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u16_m)))
-svuint16_t svlsr_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u8_x)))
-svuint8_t svlsr_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u32_x)))
-svuint32_t svlsr_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u64_x)))
-svuint64_t svlsr_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u16_x)))
-svuint16_t svlsr_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u8_z)))
-svuint8_t svlsr_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u32_z)))
-svuint32_t svlsr_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u64_z)))
-svuint64_t svlsr_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_n_u16_z)))
-svuint16_t svlsr_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u8_m)))
-svuint8_t svlsr_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u32_m)))
-svuint32_t svlsr_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u64_m)))
-svuint64_t svlsr_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u16_m)))
-svuint16_t svlsr_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u8_x)))
-svuint8_t svlsr_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u32_x)))
-svuint32_t svlsr_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u64_x)))
-svuint64_t svlsr_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u16_x)))
-svuint16_t svlsr_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u8_z)))
-svuint8_t svlsr_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u32_z)))
-svuint32_t svlsr_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u64_z)))
-svuint64_t svlsr_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_u16_z)))
-svuint16_t svlsr_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u8_m)))
-svuint8_t svlsr_wide_m(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u32_m)))
-svuint32_t svlsr_wide_m(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u16_m)))
-svuint16_t svlsr_wide_m(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u8_x)))
-svuint8_t svlsr_wide_x(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u32_x)))
-svuint32_t svlsr_wide_x(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u16_x)))
-svuint16_t svlsr_wide_x(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u8_z)))
-svuint8_t svlsr_wide_z(svbool_t, svuint8_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u32_z)))
-svuint32_t svlsr_wide_z(svbool_t, svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_n_u16_z)))
-svuint16_t svlsr_wide_z(svbool_t, svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u8_m)))
-svuint8_t svlsr_wide_m(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u32_m)))
-svuint32_t svlsr_wide_m(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u16_m)))
-svuint16_t svlsr_wide_m(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u8_x)))
-svuint8_t svlsr_wide_x(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u32_x)))
-svuint32_t svlsr_wide_x(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u16_x)))
-svuint16_t svlsr_wide_x(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u8_z)))
-svuint8_t svlsr_wide_z(svbool_t, svuint8_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u32_z)))
-svuint32_t svlsr_wide_z(svbool_t, svuint32_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svlsr_wide_u16_z)))
-svuint16_t svlsr_wide_z(svbool_t, svuint16_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f64_m)))
-svfloat64_t svmad_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f32_m)))
-svfloat32_t svmad_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f16_m)))
-svfloat16_t svmad_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f64_x)))
-svfloat64_t svmad_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f32_x)))
-svfloat32_t svmad_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f16_x)))
-svfloat16_t svmad_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f64_z)))
-svfloat64_t svmad_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f32_z)))
-svfloat32_t svmad_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_f16_z)))
-svfloat16_t svmad_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u8_m)))
-svuint8_t svmad_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u32_m)))
-svuint32_t svmad_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u64_m)))
-svuint64_t svmad_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u16_m)))
-svuint16_t svmad_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s8_m)))
-svint8_t svmad_m(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s32_m)))
-svint32_t svmad_m(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s64_m)))
-svint64_t svmad_m(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s16_m)))
-svint16_t svmad_m(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u8_x)))
-svuint8_t svmad_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u32_x)))
-svuint32_t svmad_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u64_x)))
-svuint64_t svmad_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u16_x)))
-svuint16_t svmad_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s8_x)))
-svint8_t svmad_x(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s32_x)))
-svint32_t svmad_x(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s64_x)))
-svint64_t svmad_x(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s16_x)))
-svint16_t svmad_x(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u8_z)))
-svuint8_t svmad_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u32_z)))
-svuint32_t svmad_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u64_z)))
-svuint64_t svmad_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_u16_z)))
-svuint16_t svmad_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s8_z)))
-svint8_t svmad_z(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s32_z)))
-svint32_t svmad_z(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s64_z)))
-svint64_t svmad_z(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_n_s16_z)))
-svint16_t svmad_z(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f64_m)))
-svfloat64_t svmad_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f32_m)))
-svfloat32_t svmad_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f16_m)))
-svfloat16_t svmad_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f64_x)))
-svfloat64_t svmad_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f32_x)))
-svfloat32_t svmad_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f16_x)))
-svfloat16_t svmad_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f64_z)))
-svfloat64_t svmad_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f32_z)))
-svfloat32_t svmad_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_f16_z)))
-svfloat16_t svmad_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u8_m)))
-svuint8_t svmad_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u32_m)))
-svuint32_t svmad_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u64_m)))
-svuint64_t svmad_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u16_m)))
-svuint16_t svmad_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s8_m)))
-svint8_t svmad_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s32_m)))
-svint32_t svmad_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s64_m)))
-svint64_t svmad_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s16_m)))
-svint16_t svmad_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u8_x)))
-svuint8_t svmad_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u32_x)))
-svuint32_t svmad_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u64_x)))
-svuint64_t svmad_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u16_x)))
-svuint16_t svmad_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s8_x)))
-svint8_t svmad_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s32_x)))
-svint32_t svmad_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s64_x)))
-svint64_t svmad_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s16_x)))
-svint16_t svmad_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u8_z)))
-svuint8_t svmad_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u32_z)))
-svuint32_t svmad_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u64_z)))
-svuint64_t svmad_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_u16_z)))
-svuint16_t svmad_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s8_z)))
-svint8_t svmad_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s32_z)))
-svint32_t svmad_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s64_z)))
-svint64_t svmad_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmad_s16_z)))
-svint16_t svmad_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f64_m)))
-svfloat64_t svmax_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f32_m)))
-svfloat32_t svmax_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f16_m)))
-svfloat16_t svmax_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f64_x)))
-svfloat64_t svmax_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f32_x)))
-svfloat32_t svmax_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f16_x)))
-svfloat16_t svmax_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f64_z)))
-svfloat64_t svmax_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f32_z)))
-svfloat32_t svmax_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_f16_z)))
-svfloat16_t svmax_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s8_m)))
-svint8_t svmax_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s32_m)))
-svint32_t svmax_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s64_m)))
-svint64_t svmax_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s16_m)))
-svint16_t svmax_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s8_x)))
-svint8_t svmax_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s32_x)))
-svint32_t svmax_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s64_x)))
-svint64_t svmax_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s16_x)))
-svint16_t svmax_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s8_z)))
-svint8_t svmax_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s32_z)))
-svint32_t svmax_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s64_z)))
-svint64_t svmax_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_s16_z)))
-svint16_t svmax_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u8_m)))
-svuint8_t svmax_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u32_m)))
-svuint32_t svmax_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u64_m)))
-svuint64_t svmax_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u16_m)))
-svuint16_t svmax_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u8_x)))
-svuint8_t svmax_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u32_x)))
-svuint32_t svmax_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u64_x)))
-svuint64_t svmax_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u16_x)))
-svuint16_t svmax_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u8_z)))
-svuint8_t svmax_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u32_z)))
-svuint32_t svmax_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u64_z)))
-svuint64_t svmax_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_n_u16_z)))
-svuint16_t svmax_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_m)))
-svfloat64_t svmax_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_m)))
-svfloat32_t svmax_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_m)))
-svfloat16_t svmax_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_x)))
-svfloat64_t svmax_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_x)))
-svfloat32_t svmax_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_x)))
-svfloat16_t svmax_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f64_z)))
-svfloat64_t svmax_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f32_z)))
-svfloat32_t svmax_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_f16_z)))
-svfloat16_t svmax_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_m)))
-svint8_t svmax_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_m)))
-svint32_t svmax_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_m)))
-svint64_t svmax_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_m)))
-svint16_t svmax_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_x)))
-svint8_t svmax_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_x)))
-svint32_t svmax_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_x)))
-svint64_t svmax_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_x)))
-svint16_t svmax_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s8_z)))
-svint8_t svmax_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s32_z)))
-svint32_t svmax_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s64_z)))
-svint64_t svmax_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_s16_z)))
-svint16_t svmax_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_m)))
-svuint8_t svmax_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_m)))
-svuint32_t svmax_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_m)))
-svuint64_t svmax_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_m)))
-svuint16_t svmax_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_x)))
-svuint8_t svmax_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_x)))
-svuint32_t svmax_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_x)))
-svuint64_t svmax_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_x)))
-svuint16_t svmax_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u8_z)))
-svuint8_t svmax_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u32_z)))
-svuint32_t svmax_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u64_z)))
-svuint64_t svmax_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmax_u16_z)))
-svuint16_t svmax_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f64_m)))
-svfloat64_t svmaxnm_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f32_m)))
-svfloat32_t svmaxnm_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f16_m)))
-svfloat16_t svmaxnm_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f64_x)))
-svfloat64_t svmaxnm_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f32_x)))
-svfloat32_t svmaxnm_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f16_x)))
-svfloat16_t svmaxnm_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f64_z)))
-svfloat64_t svmaxnm_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f32_z)))
-svfloat32_t svmaxnm_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_n_f16_z)))
-svfloat16_t svmaxnm_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_m)))
-svfloat64_t svmaxnm_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_m)))
-svfloat32_t svmaxnm_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_m)))
-svfloat16_t svmaxnm_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_x)))
-svfloat64_t svmaxnm_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_x)))
-svfloat32_t svmaxnm_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_x)))
-svfloat16_t svmaxnm_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f64_z)))
-svfloat64_t svmaxnm_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f32_z)))
-svfloat32_t svmaxnm_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnm_f16_z)))
-svfloat16_t svmaxnm_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmv_f64)))
-float64_t svmaxnmv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmv_f32)))
-float32_t svmaxnmv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxnmv_f16)))
-float16_t svmaxnmv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_f64)))
-float64_t svmaxv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_f32)))
-float32_t svmaxv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_f16)))
-float16_t svmaxv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s8)))
-int8_t svmaxv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s32)))
-int32_t svmaxv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s64)))
-int64_t svmaxv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_s16)))
-int16_t svmaxv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u8)))
-uint8_t svmaxv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u32)))
-uint32_t svmaxv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u64)))
-uint64_t svmaxv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmaxv_u16)))
-uint16_t svmaxv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f64_m)))
-svfloat64_t svmin_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f32_m)))
-svfloat32_t svmin_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f16_m)))
-svfloat16_t svmin_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f64_x)))
-svfloat64_t svmin_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f32_x)))
-svfloat32_t svmin_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f16_x)))
-svfloat16_t svmin_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f64_z)))
-svfloat64_t svmin_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f32_z)))
-svfloat32_t svmin_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_f16_z)))
-svfloat16_t svmin_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s8_m)))
-svint8_t svmin_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s32_m)))
-svint32_t svmin_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s64_m)))
-svint64_t svmin_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s16_m)))
-svint16_t svmin_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s8_x)))
-svint8_t svmin_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s32_x)))
-svint32_t svmin_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s64_x)))
-svint64_t svmin_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s16_x)))
-svint16_t svmin_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s8_z)))
-svint8_t svmin_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s32_z)))
-svint32_t svmin_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s64_z)))
-svint64_t svmin_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_s16_z)))
-svint16_t svmin_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u8_m)))
-svuint8_t svmin_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u32_m)))
-svuint32_t svmin_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u64_m)))
-svuint64_t svmin_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u16_m)))
-svuint16_t svmin_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u8_x)))
-svuint8_t svmin_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u32_x)))
-svuint32_t svmin_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u64_x)))
-svuint64_t svmin_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u16_x)))
-svuint16_t svmin_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u8_z)))
-svuint8_t svmin_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u32_z)))
-svuint32_t svmin_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u64_z)))
-svuint64_t svmin_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_n_u16_z)))
-svuint16_t svmin_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_m)))
-svfloat64_t svmin_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_m)))
-svfloat32_t svmin_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_m)))
-svfloat16_t svmin_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_x)))
-svfloat64_t svmin_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_x)))
-svfloat32_t svmin_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_x)))
-svfloat16_t svmin_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f64_z)))
-svfloat64_t svmin_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f32_z)))
-svfloat32_t svmin_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_f16_z)))
-svfloat16_t svmin_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_m)))
-svint8_t svmin_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_m)))
-svint32_t svmin_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_m)))
-svint64_t svmin_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_m)))
-svint16_t svmin_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_x)))
-svint8_t svmin_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_x)))
-svint32_t svmin_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_x)))
-svint64_t svmin_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_x)))
-svint16_t svmin_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s8_z)))
-svint8_t svmin_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s32_z)))
-svint32_t svmin_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s64_z)))
-svint64_t svmin_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_s16_z)))
-svint16_t svmin_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_m)))
-svuint8_t svmin_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_m)))
-svuint32_t svmin_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_m)))
-svuint64_t svmin_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_m)))
-svuint16_t svmin_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_x)))
-svuint8_t svmin_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_x)))
-svuint32_t svmin_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_x)))
-svuint64_t svmin_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_x)))
-svuint16_t svmin_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u8_z)))
-svuint8_t svmin_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u32_z)))
-svuint32_t svmin_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u64_z)))
-svuint64_t svmin_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmin_u16_z)))
-svuint16_t svmin_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f64_m)))
-svfloat64_t svminnm_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f32_m)))
-svfloat32_t svminnm_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f16_m)))
-svfloat16_t svminnm_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f64_x)))
-svfloat64_t svminnm_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f32_x)))
-svfloat32_t svminnm_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f16_x)))
-svfloat16_t svminnm_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f64_z)))
-svfloat64_t svminnm_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f32_z)))
-svfloat32_t svminnm_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_n_f16_z)))
-svfloat16_t svminnm_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_m)))
-svfloat64_t svminnm_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_m)))
-svfloat32_t svminnm_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_m)))
-svfloat16_t svminnm_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_x)))
-svfloat64_t svminnm_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_x)))
-svfloat32_t svminnm_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_x)))
-svfloat16_t svminnm_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f64_z)))
-svfloat64_t svminnm_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f32_z)))
-svfloat32_t svminnm_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_f16_z)))
-svfloat16_t svminnm_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmv_f64)))
-float64_t svminnmv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmv_f32)))
-float32_t svminnmv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnmv_f16)))
-float16_t svminnmv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_f64)))
-float64_t svminv(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_f32)))
-float32_t svminv(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_f16)))
-float16_t svminv(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s8)))
-int8_t svminv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s32)))
-int32_t svminv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s64)))
-int64_t svminv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_s16)))
-int16_t svminv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u8)))
-uint8_t svminv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u32)))
-uint32_t svminv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u64)))
-uint64_t svminv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminv_u16)))
-uint16_t svminv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f64_m)))
-svfloat64_t svmla_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f32_m)))
-svfloat32_t svmla_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f16_m)))
-svfloat16_t svmla_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f64_x)))
-svfloat64_t svmla_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f32_x)))
-svfloat32_t svmla_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f16_x)))
-svfloat16_t svmla_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f64_z)))
-svfloat64_t svmla_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f32_z)))
-svfloat32_t svmla_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_f16_z)))
-svfloat16_t svmla_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u8_m)))
-svuint8_t svmla_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u32_m)))
-svuint32_t svmla_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u64_m)))
-svuint64_t svmla_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u16_m)))
-svuint16_t svmla_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s8_m)))
-svint8_t svmla_m(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s32_m)))
-svint32_t svmla_m(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s64_m)))
-svint64_t svmla_m(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s16_m)))
-svint16_t svmla_m(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u8_x)))
-svuint8_t svmla_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u32_x)))
-svuint32_t svmla_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u64_x)))
-svuint64_t svmla_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u16_x)))
-svuint16_t svmla_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s8_x)))
-svint8_t svmla_x(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s32_x)))
-svint32_t svmla_x(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s64_x)))
-svint64_t svmla_x(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s16_x)))
-svint16_t svmla_x(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u8_z)))
-svuint8_t svmla_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u32_z)))
-svuint32_t svmla_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u64_z)))
-svuint64_t svmla_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_u16_z)))
-svuint16_t svmla_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s8_z)))
-svint8_t svmla_z(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s32_z)))
-svint32_t svmla_z(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s64_z)))
-svint64_t svmla_z(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_n_s16_z)))
-svint16_t svmla_z(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f64_m)))
-svfloat64_t svmla_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f32_m)))
-svfloat32_t svmla_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f16_m)))
-svfloat16_t svmla_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f64_x)))
-svfloat64_t svmla_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f32_x)))
-svfloat32_t svmla_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f16_x)))
-svfloat16_t svmla_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f64_z)))
-svfloat64_t svmla_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f32_z)))
-svfloat32_t svmla_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_f16_z)))
-svfloat16_t svmla_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u8_m)))
-svuint8_t svmla_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u32_m)))
-svuint32_t svmla_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u64_m)))
-svuint64_t svmla_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u16_m)))
-svuint16_t svmla_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s8_m)))
-svint8_t svmla_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s32_m)))
-svint32_t svmla_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s64_m)))
-svint64_t svmla_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s16_m)))
-svint16_t svmla_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u8_x)))
-svuint8_t svmla_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u32_x)))
-svuint32_t svmla_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u64_x)))
-svuint64_t svmla_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u16_x)))
-svuint16_t svmla_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s8_x)))
-svint8_t svmla_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s32_x)))
-svint32_t svmla_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s64_x)))
-svint64_t svmla_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s16_x)))
-svint16_t svmla_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u8_z)))
-svuint8_t svmla_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u32_z)))
-svuint32_t svmla_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u64_z)))
-svuint64_t svmla_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_u16_z)))
-svuint16_t svmla_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s8_z)))
-svint8_t svmla_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s32_z)))
-svint32_t svmla_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s64_z)))
-svint64_t svmla_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_s16_z)))
-svint16_t svmla_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_f64)))
-svfloat64_t svmla_lane(svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_f32)))
-svfloat32_t svmla_lane(svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmla_lane_f16)))
-svfloat16_t svmla_lane(svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f64_m)))
-svfloat64_t svmls_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f32_m)))
-svfloat32_t svmls_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f16_m)))
-svfloat16_t svmls_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f64_x)))
-svfloat64_t svmls_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f32_x)))
-svfloat32_t svmls_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f16_x)))
-svfloat16_t svmls_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f64_z)))
-svfloat64_t svmls_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f32_z)))
-svfloat32_t svmls_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_f16_z)))
-svfloat16_t svmls_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u8_m)))
-svuint8_t svmls_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u32_m)))
-svuint32_t svmls_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u64_m)))
-svuint64_t svmls_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u16_m)))
-svuint16_t svmls_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s8_m)))
-svint8_t svmls_m(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s32_m)))
-svint32_t svmls_m(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s64_m)))
-svint64_t svmls_m(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s16_m)))
-svint16_t svmls_m(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u8_x)))
-svuint8_t svmls_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u32_x)))
-svuint32_t svmls_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u64_x)))
-svuint64_t svmls_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u16_x)))
-svuint16_t svmls_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s8_x)))
-svint8_t svmls_x(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s32_x)))
-svint32_t svmls_x(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s64_x)))
-svint64_t svmls_x(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s16_x)))
-svint16_t svmls_x(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u8_z)))
-svuint8_t svmls_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u32_z)))
-svuint32_t svmls_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u64_z)))
-svuint64_t svmls_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_u16_z)))
-svuint16_t svmls_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s8_z)))
-svint8_t svmls_z(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s32_z)))
-svint32_t svmls_z(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s64_z)))
-svint64_t svmls_z(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_n_s16_z)))
-svint16_t svmls_z(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f64_m)))
-svfloat64_t svmls_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f32_m)))
-svfloat32_t svmls_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f16_m)))
-svfloat16_t svmls_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f64_x)))
-svfloat64_t svmls_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f32_x)))
-svfloat32_t svmls_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f16_x)))
-svfloat16_t svmls_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f64_z)))
-svfloat64_t svmls_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f32_z)))
-svfloat32_t svmls_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_f16_z)))
-svfloat16_t svmls_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u8_m)))
-svuint8_t svmls_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u32_m)))
-svuint32_t svmls_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u64_m)))
-svuint64_t svmls_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u16_m)))
-svuint16_t svmls_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s8_m)))
-svint8_t svmls_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s32_m)))
-svint32_t svmls_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s64_m)))
-svint64_t svmls_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s16_m)))
-svint16_t svmls_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u8_x)))
-svuint8_t svmls_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u32_x)))
-svuint32_t svmls_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u64_x)))
-svuint64_t svmls_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u16_x)))
-svuint16_t svmls_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s8_x)))
-svint8_t svmls_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s32_x)))
-svint32_t svmls_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s64_x)))
-svint64_t svmls_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s16_x)))
-svint16_t svmls_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u8_z)))
-svuint8_t svmls_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u32_z)))
-svuint32_t svmls_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u64_z)))
-svuint64_t svmls_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_u16_z)))
-svuint16_t svmls_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s8_z)))
-svint8_t svmls_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s32_z)))
-svint32_t svmls_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s64_z)))
-svint64_t svmls_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_s16_z)))
-svint16_t svmls_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_f64)))
-svfloat64_t svmls_lane(svfloat64_t, svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_f32)))
-svfloat32_t svmls_lane(svfloat32_t, svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmls_lane_f16)))
-svfloat16_t svmls_lane(svfloat16_t, svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmov_b_z)))
-svbool_t svmov_z(svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f64_m)))
-svfloat64_t svmsb_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f32_m)))
-svfloat32_t svmsb_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f16_m)))
-svfloat16_t svmsb_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f64_x)))
-svfloat64_t svmsb_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f32_x)))
-svfloat32_t svmsb_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f16_x)))
-svfloat16_t svmsb_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f64_z)))
-svfloat64_t svmsb_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f32_z)))
-svfloat32_t svmsb_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_f16_z)))
-svfloat16_t svmsb_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u8_m)))
-svuint8_t svmsb_m(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u32_m)))
-svuint32_t svmsb_m(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u64_m)))
-svuint64_t svmsb_m(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u16_m)))
-svuint16_t svmsb_m(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s8_m)))
-svint8_t svmsb_m(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s32_m)))
-svint32_t svmsb_m(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s64_m)))
-svint64_t svmsb_m(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s16_m)))
-svint16_t svmsb_m(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u8_x)))
-svuint8_t svmsb_x(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u32_x)))
-svuint32_t svmsb_x(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u64_x)))
-svuint64_t svmsb_x(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u16_x)))
-svuint16_t svmsb_x(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s8_x)))
-svint8_t svmsb_x(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s32_x)))
-svint32_t svmsb_x(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s64_x)))
-svint64_t svmsb_x(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s16_x)))
-svint16_t svmsb_x(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u8_z)))
-svuint8_t svmsb_z(svbool_t, svuint8_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u32_z)))
-svuint32_t svmsb_z(svbool_t, svuint32_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u64_z)))
-svuint64_t svmsb_z(svbool_t, svuint64_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_u16_z)))
-svuint16_t svmsb_z(svbool_t, svuint16_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s8_z)))
-svint8_t svmsb_z(svbool_t, svint8_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s32_z)))
-svint32_t svmsb_z(svbool_t, svint32_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s64_z)))
-svint64_t svmsb_z(svbool_t, svint64_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_n_s16_z)))
-svint16_t svmsb_z(svbool_t, svint16_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f64_m)))
-svfloat64_t svmsb_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f32_m)))
-svfloat32_t svmsb_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f16_m)))
-svfloat16_t svmsb_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f64_x)))
-svfloat64_t svmsb_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f32_x)))
-svfloat32_t svmsb_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f16_x)))
-svfloat16_t svmsb_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f64_z)))
-svfloat64_t svmsb_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f32_z)))
-svfloat32_t svmsb_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_f16_z)))
-svfloat16_t svmsb_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u8_m)))
-svuint8_t svmsb_m(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u32_m)))
-svuint32_t svmsb_m(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u64_m)))
-svuint64_t svmsb_m(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u16_m)))
-svuint16_t svmsb_m(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s8_m)))
-svint8_t svmsb_m(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s32_m)))
-svint32_t svmsb_m(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s64_m)))
-svint64_t svmsb_m(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s16_m)))
-svint16_t svmsb_m(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u8_x)))
-svuint8_t svmsb_x(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u32_x)))
-svuint32_t svmsb_x(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u64_x)))
-svuint64_t svmsb_x(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u16_x)))
-svuint16_t svmsb_x(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s8_x)))
-svint8_t svmsb_x(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s32_x)))
-svint32_t svmsb_x(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s64_x)))
-svint64_t svmsb_x(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s16_x)))
-svint16_t svmsb_x(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u8_z)))
-svuint8_t svmsb_z(svbool_t, svuint8_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u32_z)))
-svuint32_t svmsb_z(svbool_t, svuint32_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u64_z)))
-svuint64_t svmsb_z(svbool_t, svuint64_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_u16_z)))
-svuint16_t svmsb_z(svbool_t, svuint16_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s8_z)))
-svint8_t svmsb_z(svbool_t, svint8_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s32_z)))
-svint32_t svmsb_z(svbool_t, svint32_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s64_z)))
-svint64_t svmsb_z(svbool_t, svint64_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmsb_s16_z)))
-svint16_t svmsb_z(svbool_t, svint16_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f64_m)))
-svfloat64_t svmul_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f32_m)))
-svfloat32_t svmul_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f16_m)))
-svfloat16_t svmul_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f64_x)))
-svfloat64_t svmul_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f32_x)))
-svfloat32_t svmul_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f16_x)))
-svfloat16_t svmul_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f64_z)))
-svfloat64_t svmul_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f32_z)))
-svfloat32_t svmul_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_f16_z)))
-svfloat16_t svmul_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u8_m)))
-svuint8_t svmul_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u32_m)))
-svuint32_t svmul_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u64_m)))
-svuint64_t svmul_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u16_m)))
-svuint16_t svmul_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s8_m)))
-svint8_t svmul_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s32_m)))
-svint32_t svmul_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s64_m)))
-svint64_t svmul_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s16_m)))
-svint16_t svmul_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u8_x)))
-svuint8_t svmul_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u32_x)))
-svuint32_t svmul_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u64_x)))
-svuint64_t svmul_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u16_x)))
-svuint16_t svmul_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s8_x)))
-svint8_t svmul_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s32_x)))
-svint32_t svmul_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s64_x)))
-svint64_t svmul_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s16_x)))
-svint16_t svmul_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u8_z)))
-svuint8_t svmul_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u32_z)))
-svuint32_t svmul_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u64_z)))
-svuint64_t svmul_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_u16_z)))
-svuint16_t svmul_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s8_z)))
-svint8_t svmul_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s32_z)))
-svint32_t svmul_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s64_z)))
-svint64_t svmul_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_n_s16_z)))
-svint16_t svmul_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f64_m)))
-svfloat64_t svmul_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f32_m)))
-svfloat32_t svmul_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f16_m)))
-svfloat16_t svmul_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f64_x)))
-svfloat64_t svmul_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f32_x)))
-svfloat32_t svmul_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f16_x)))
-svfloat16_t svmul_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f64_z)))
-svfloat64_t svmul_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f32_z)))
-svfloat32_t svmul_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_f16_z)))
-svfloat16_t svmul_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u8_m)))
-svuint8_t svmul_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u32_m)))
-svuint32_t svmul_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u64_m)))
-svuint64_t svmul_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u16_m)))
-svuint16_t svmul_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s8_m)))
-svint8_t svmul_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s32_m)))
-svint32_t svmul_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s64_m)))
-svint64_t svmul_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s16_m)))
-svint16_t svmul_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u8_x)))
-svuint8_t svmul_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u32_x)))
-svuint32_t svmul_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u64_x)))
-svuint64_t svmul_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u16_x)))
-svuint16_t svmul_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s8_x)))
-svint8_t svmul_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s32_x)))
-svint32_t svmul_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s64_x)))
-svint64_t svmul_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s16_x)))
-svint16_t svmul_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u8_z)))
-svuint8_t svmul_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u32_z)))
-svuint32_t svmul_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u64_z)))
-svuint64_t svmul_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_u16_z)))
-svuint16_t svmul_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s8_z)))
-svint8_t svmul_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s32_z)))
-svint32_t svmul_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s64_z)))
-svint64_t svmul_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_s16_z)))
-svint16_t svmul_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_f64)))
-svfloat64_t svmul_lane(svfloat64_t, svfloat64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_f32)))
-svfloat32_t svmul_lane(svfloat32_t, svfloat32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmul_lane_f16)))
-svfloat16_t svmul_lane(svfloat16_t, svfloat16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s8_m)))
-svint8_t svmulh_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s32_m)))
-svint32_t svmulh_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s64_m)))
-svint64_t svmulh_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s16_m)))
-svint16_t svmulh_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s8_x)))
-svint8_t svmulh_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s32_x)))
-svint32_t svmulh_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s64_x)))
-svint64_t svmulh_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s16_x)))
-svint16_t svmulh_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s8_z)))
-svint8_t svmulh_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s32_z)))
-svint32_t svmulh_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s64_z)))
-svint64_t svmulh_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_s16_z)))
-svint16_t svmulh_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u8_m)))
-svuint8_t svmulh_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u32_m)))
-svuint32_t svmulh_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u64_m)))
-svuint64_t svmulh_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u16_m)))
-svuint16_t svmulh_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u8_x)))
-svuint8_t svmulh_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u32_x)))
-svuint32_t svmulh_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u64_x)))
-svuint64_t svmulh_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u16_x)))
-svuint16_t svmulh_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u8_z)))
-svuint8_t svmulh_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u32_z)))
-svuint32_t svmulh_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u64_z)))
-svuint64_t svmulh_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_n_u16_z)))
-svuint16_t svmulh_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s8_m)))
-svint8_t svmulh_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s32_m)))
-svint32_t svmulh_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s64_m)))
-svint64_t svmulh_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s16_m)))
-svint16_t svmulh_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s8_x)))
-svint8_t svmulh_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s32_x)))
-svint32_t svmulh_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s64_x)))
-svint64_t svmulh_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s16_x)))
-svint16_t svmulh_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s8_z)))
-svint8_t svmulh_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s32_z)))
-svint32_t svmulh_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s64_z)))
-svint64_t svmulh_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_s16_z)))
-svint16_t svmulh_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u8_m)))
-svuint8_t svmulh_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u32_m)))
-svuint32_t svmulh_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u64_m)))
-svuint64_t svmulh_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u16_m)))
-svuint16_t svmulh_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u8_x)))
-svuint8_t svmulh_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u32_x)))
-svuint32_t svmulh_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u64_x)))
-svuint64_t svmulh_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u16_x)))
-svuint16_t svmulh_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u8_z)))
-svuint8_t svmulh_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u32_z)))
-svuint32_t svmulh_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u64_z)))
-svuint64_t svmulh_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulh_u16_z)))
-svuint16_t svmulh_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f64_m)))
-svfloat64_t svmulx_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f32_m)))
-svfloat32_t svmulx_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f16_m)))
-svfloat16_t svmulx_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f64_x)))
-svfloat64_t svmulx_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f32_x)))
-svfloat32_t svmulx_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f16_x)))
-svfloat16_t svmulx_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f64_z)))
-svfloat64_t svmulx_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f32_z)))
-svfloat32_t svmulx_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_n_f16_z)))
-svfloat16_t svmulx_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f64_m)))
-svfloat64_t svmulx_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f32_m)))
-svfloat32_t svmulx_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f16_m)))
-svfloat16_t svmulx_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f64_x)))
-svfloat64_t svmulx_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f32_x)))
-svfloat32_t svmulx_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f16_x)))
-svfloat16_t svmulx_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f64_z)))
-svfloat64_t svmulx_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f32_z)))
-svfloat32_t svmulx_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmulx_f16_z)))
-svfloat16_t svmulx_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnand_b_z)))
-svbool_t svnand_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f64_m)))
-svfloat64_t svneg_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f32_m)))
-svfloat32_t svneg_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f16_m)))
-svfloat16_t svneg_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f64_x)))
-svfloat64_t svneg_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f32_x)))
-svfloat32_t svneg_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f16_x)))
-svfloat16_t svneg_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f64_z)))
-svfloat64_t svneg_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f32_z)))
-svfloat32_t svneg_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_f16_z)))
-svfloat16_t svneg_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s8_m)))
-svint8_t svneg_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s32_m)))
-svint32_t svneg_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s64_m)))
-svint64_t svneg_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s16_m)))
-svint16_t svneg_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s8_x)))
-svint8_t svneg_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s32_x)))
-svint32_t svneg_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s64_x)))
-svint64_t svneg_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s16_x)))
-svint16_t svneg_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s8_z)))
-svint8_t svneg_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s32_z)))
-svint32_t svneg_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s64_z)))
-svint64_t svneg_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svneg_s16_z)))
-svint16_t svneg_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f64_m)))
-svfloat64_t svnmad_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f32_m)))
-svfloat32_t svnmad_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f16_m)))
-svfloat16_t svnmad_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f64_x)))
-svfloat64_t svnmad_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f32_x)))
-svfloat32_t svnmad_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f16_x)))
-svfloat16_t svnmad_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f64_z)))
-svfloat64_t svnmad_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f32_z)))
-svfloat32_t svnmad_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_n_f16_z)))
-svfloat16_t svnmad_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f64_m)))
-svfloat64_t svnmad_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f32_m)))
-svfloat32_t svnmad_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f16_m)))
-svfloat16_t svnmad_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f64_x)))
-svfloat64_t svnmad_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f32_x)))
-svfloat32_t svnmad_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f16_x)))
-svfloat16_t svnmad_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f64_z)))
-svfloat64_t svnmad_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f32_z)))
-svfloat32_t svnmad_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmad_f16_z)))
-svfloat16_t svnmad_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f64_m)))
-svfloat64_t svnmla_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f32_m)))
-svfloat32_t svnmla_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f16_m)))
-svfloat16_t svnmla_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f64_x)))
-svfloat64_t svnmla_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f32_x)))
-svfloat32_t svnmla_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f16_x)))
-svfloat16_t svnmla_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f64_z)))
-svfloat64_t svnmla_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f32_z)))
-svfloat32_t svnmla_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_n_f16_z)))
-svfloat16_t svnmla_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f64_m)))
-svfloat64_t svnmla_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f32_m)))
-svfloat32_t svnmla_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f16_m)))
-svfloat16_t svnmla_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f64_x)))
-svfloat64_t svnmla_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f32_x)))
-svfloat32_t svnmla_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f16_x)))
-svfloat16_t svnmla_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f64_z)))
-svfloat64_t svnmla_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f32_z)))
-svfloat32_t svnmla_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmla_f16_z)))
-svfloat16_t svnmla_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f64_m)))
-svfloat64_t svnmls_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f32_m)))
-svfloat32_t svnmls_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f16_m)))
-svfloat16_t svnmls_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f64_x)))
-svfloat64_t svnmls_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f32_x)))
-svfloat32_t svnmls_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f16_x)))
-svfloat16_t svnmls_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f64_z)))
-svfloat64_t svnmls_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f32_z)))
-svfloat32_t svnmls_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_n_f16_z)))
-svfloat16_t svnmls_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f64_m)))
-svfloat64_t svnmls_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f32_m)))
-svfloat32_t svnmls_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f16_m)))
-svfloat16_t svnmls_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f64_x)))
-svfloat64_t svnmls_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f32_x)))
-svfloat32_t svnmls_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f16_x)))
-svfloat16_t svnmls_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f64_z)))
-svfloat64_t svnmls_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f32_z)))
-svfloat32_t svnmls_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmls_f16_z)))
-svfloat16_t svnmls_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f64_m)))
-svfloat64_t svnmsb_m(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f32_m)))
-svfloat32_t svnmsb_m(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f16_m)))
-svfloat16_t svnmsb_m(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f64_x)))
-svfloat64_t svnmsb_x(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f32_x)))
-svfloat32_t svnmsb_x(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f16_x)))
-svfloat16_t svnmsb_x(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f64_z)))
-svfloat64_t svnmsb_z(svbool_t, svfloat64_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f32_z)))
-svfloat32_t svnmsb_z(svbool_t, svfloat32_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_n_f16_z)))
-svfloat16_t svnmsb_z(svbool_t, svfloat16_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f64_m)))
-svfloat64_t svnmsb_m(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f32_m)))
-svfloat32_t svnmsb_m(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f16_m)))
-svfloat16_t svnmsb_m(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f64_x)))
-svfloat64_t svnmsb_x(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f32_x)))
-svfloat32_t svnmsb_x(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f16_x)))
-svfloat16_t svnmsb_x(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f64_z)))
-svfloat64_t svnmsb_z(svbool_t, svfloat64_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f32_z)))
-svfloat32_t svnmsb_z(svbool_t, svfloat32_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnmsb_f16_z)))
-svfloat16_t svnmsb_z(svbool_t, svfloat16_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnor_b_z)))
-svbool_t svnor_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_b_z)))
-svbool_t svnot_z(svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u8_m)))
-svuint8_t svnot_m(svuint8_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u32_m)))
-svuint32_t svnot_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u64_m)))
-svuint64_t svnot_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u16_m)))
-svuint16_t svnot_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s8_m)))
-svint8_t svnot_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s32_m)))
-svint32_t svnot_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s64_m)))
-svint64_t svnot_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s16_m)))
-svint16_t svnot_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u8_x)))
-svuint8_t svnot_x(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u32_x)))
-svuint32_t svnot_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u64_x)))
-svuint64_t svnot_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u16_x)))
-svuint16_t svnot_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s8_x)))
-svint8_t svnot_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s32_x)))
-svint32_t svnot_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s64_x)))
-svint64_t svnot_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s16_x)))
-svint16_t svnot_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u8_z)))
-svuint8_t svnot_z(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u32_z)))
-svuint32_t svnot_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u64_z)))
-svuint64_t svnot_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_u16_z)))
-svuint16_t svnot_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s8_z)))
-svint8_t svnot_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s32_z)))
-svint32_t svnot_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s64_z)))
-svint64_t svnot_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svnot_s16_z)))
-svint16_t svnot_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorn_b_z)))
-svbool_t svorn_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_b_z)))
-svbool_t svorr_z(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u8_m)))
-svuint8_t svorr_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u32_m)))
-svuint32_t svorr_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u64_m)))
-svuint64_t svorr_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u16_m)))
-svuint16_t svorr_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s8_m)))
-svint8_t svorr_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s32_m)))
-svint32_t svorr_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s64_m)))
-svint64_t svorr_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s16_m)))
-svint16_t svorr_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u8_x)))
-svuint8_t svorr_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u32_x)))
-svuint32_t svorr_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u64_x)))
-svuint64_t svorr_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u16_x)))
-svuint16_t svorr_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s8_x)))
-svint8_t svorr_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s32_x)))
-svint32_t svorr_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s64_x)))
-svint64_t svorr_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s16_x)))
-svint16_t svorr_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u8_z)))
-svuint8_t svorr_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u32_z)))
-svuint32_t svorr_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u64_z)))
-svuint64_t svorr_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_u16_z)))
-svuint16_t svorr_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s8_z)))
-svint8_t svorr_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s32_z)))
-svint32_t svorr_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s64_z)))
-svint64_t svorr_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_n_s16_z)))
-svint16_t svorr_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u8_m)))
-svuint8_t svorr_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u32_m)))
-svuint32_t svorr_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u64_m)))
-svuint64_t svorr_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u16_m)))
-svuint16_t svorr_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s8_m)))
-svint8_t svorr_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s32_m)))
-svint32_t svorr_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s64_m)))
-svint64_t svorr_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s16_m)))
-svint16_t svorr_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u8_x)))
-svuint8_t svorr_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u32_x)))
-svuint32_t svorr_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u64_x)))
-svuint64_t svorr_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u16_x)))
-svuint16_t svorr_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s8_x)))
-svint8_t svorr_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s32_x)))
-svint32_t svorr_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s64_x)))
-svint64_t svorr_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s16_x)))
-svint16_t svorr_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u8_z)))
-svuint8_t svorr_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u32_z)))
-svuint32_t svorr_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u64_z)))
-svuint64_t svorr_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_u16_z)))
-svuint16_t svorr_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s8_z)))
-svint8_t svorr_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s32_z)))
-svint32_t svorr_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s64_z)))
-svint64_t svorr_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorr_s16_z)))
-svint16_t svorr_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u8)))
-uint8_t svorv(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u32)))
-uint32_t svorv(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u64)))
-uint64_t svorv(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_u16)))
-uint16_t svorv(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s8)))
-int8_t svorv(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s32)))
-int32_t svorv(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s64)))
-int64_t svorv(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
-int16_t svorv(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
-svbool_t svpfalse(void);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
-svbool_t svpfirst(svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8)))
-svint8_t svqadd(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32)))
-svint32_t svqadd(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s64)))
-svint64_t svqadd(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s16)))
-svint16_t svqadd(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u8)))
-svuint8_t svqadd(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u32)))
-svuint32_t svqadd(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u64)))
-svuint64_t svqadd(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_u16)))
-svuint16_t svqadd(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s8)))
-svint8_t svqadd(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s32)))
-svint32_t svqadd(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s64)))
-svint64_t svqadd(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_s16)))
-svint16_t svqadd(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u8)))
-svuint8_t svqadd(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u32)))
-svuint32_t svqadd(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u64)))
-svuint64_t svqadd(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_u16)))
-svuint16_t svqadd(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_s32)))
-int32_t svqdecb(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_s64)))
-int64_t svqdecb(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u32)))
-uint32_t svqdecb(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u64)))
-uint64_t svqdecb(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s32)))
-int32_t svqdecb_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s64)))
-int64_t svqdecb_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u32)))
-uint32_t svqdecb_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u64)))
-uint64_t svqdecb_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s32)))
-int32_t svqdecd(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s64)))
-int64_t svqdecd(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_u32)))
-uint32_t svqdecd(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_u64)))
-uint64_t svqdecd(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_s64)))
-svint64_t svqdecd(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_u64)))
-svuint64_t svqdecd(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s32)))
-int32_t svqdecd_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s64)))
-int64_t svqdecd_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u32)))
-uint32_t svqdecd_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u64)))
-uint64_t svqdecd_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_s64)))
-svint64_t svqdecd_pat(svint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_u64)))
-svuint64_t svqdecd_pat(svuint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s32)))
-int32_t svqdech(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s64)))
-int64_t svqdech(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_u32)))
-uint32_t svqdech(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_u64)))
-uint64_t svqdech(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_s16)))
-svint16_t svqdech(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_u16)))
-svuint16_t svqdech(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s32)))
-int32_t svqdech_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s64)))
-int64_t svqdech_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u32)))
-uint32_t svqdech_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u64)))
-uint64_t svqdech_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_s16)))
-svint16_t svqdech_pat(svint16_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_u16)))
-svuint16_t svqdech_pat(svuint16_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b8)))
-int32_t svqdecp_b8(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b32)))
-int32_t svqdecp_b32(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b64)))
-int32_t svqdecp_b64(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b16)))
-int32_t svqdecp_b16(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b8)))
-int64_t svqdecp_b8(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b32)))
-int64_t svqdecp_b32(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b64)))
-int64_t svqdecp_b64(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s64_b16)))
-int64_t svqdecp_b16(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b8)))
-uint32_t svqdecp_b8(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b32)))
-uint32_t svqdecp_b32(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b64)))
-uint32_t svqdecp_b64(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u32_b16)))
-uint32_t svqdecp_b16(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b8)))
-uint64_t svqdecp_b8(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b32)))
-uint64_t svqdecp_b32(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b64)))
-uint64_t svqdecp_b64(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_u64_b16)))
-uint64_t svqdecp_b16(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_s32)))
-svint32_t svqdecp(svint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_s64)))
-svint64_t svqdecp(svint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_s16)))
-svint16_t svqdecp(svint16_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_u32)))
-svuint32_t svqdecp(svuint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_u64)))
-svuint64_t svqdecp(svuint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_u16)))
-svuint16_t svqdecp(svuint16_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_s32)))
-int32_t svqdecw(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_s64)))
-int64_t svqdecw(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_u32)))
-uint32_t svqdecw(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_n_u64)))
-uint64_t svqdecw(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_s32)))
-svint32_t svqdecw(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_u32)))
-svuint32_t svqdecw(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s32)))
-int32_t svqdecw_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s64)))
-int64_t svqdecw_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u32)))
-uint32_t svqdecw_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u64)))
-uint64_t svqdecw_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_s32)))
-svint32_t svqdecw_pat(svint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_u32)))
-svuint32_t svqdecw_pat(svuint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s32)))
-int32_t svqincb(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s64)))
-int64_t svqincb(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u32)))
-uint32_t svqincb(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u64)))
-uint64_t svqincb(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s32)))
-int32_t svqincb_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s64)))
-int64_t svqincb_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u32)))
-uint32_t svqincb_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u64)))
-uint64_t svqincb_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s32)))
-int32_t svqincd(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s64)))
-int64_t svqincd(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_u32)))
-uint32_t svqincd(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_u64)))
-uint64_t svqincd(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_s64)))
-svint64_t svqincd(svint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_u64)))
-svuint64_t svqincd(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s32)))
-int32_t svqincd_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s64)))
-int64_t svqincd_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u32)))
-uint32_t svqincd_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u64)))
-uint64_t svqincd_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_s64)))
-svint64_t svqincd_pat(svint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_u64)))
-svuint64_t svqincd_pat(svuint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s32)))
-int32_t svqinch(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s64)))
-int64_t svqinch(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_u32)))
-uint32_t svqinch(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_u64)))
-uint64_t svqinch(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_s16)))
-svint16_t svqinch(svint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_u16)))
-svuint16_t svqinch(svuint16_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s32)))
-int32_t svqinch_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s64)))
-int64_t svqinch_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u32)))
-uint32_t svqinch_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u64)))
-uint64_t svqinch_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_s16)))
-svint16_t svqinch_pat(svint16_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_u16)))
-svuint16_t svqinch_pat(svuint16_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b8)))
-int32_t svqincp_b8(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b32)))
-int32_t svqincp_b32(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b64)))
-int32_t svqincp_b64(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b16)))
-int32_t svqincp_b16(int32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b8)))
-int64_t svqincp_b8(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b32)))
-int64_t svqincp_b32(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b64)))
-int64_t svqincp_b64(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s64_b16)))
-int64_t svqincp_b16(int64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b8)))
-uint32_t svqincp_b8(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b32)))
-uint32_t svqincp_b32(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b64)))
-uint32_t svqincp_b64(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u32_b16)))
-uint32_t svqincp_b16(uint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b8)))
-uint64_t svqincp_b8(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b32)))
-uint64_t svqincp_b32(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b64)))
-uint64_t svqincp_b64(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_u64_b16)))
-uint64_t svqincp_b16(uint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_s32)))
-svint32_t svqincp(svint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_s64)))
-svint64_t svqincp(svint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_s16)))
-svint16_t svqincp(svint16_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_u32)))
-svuint32_t svqincp(svuint32_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_u64)))
-svuint64_t svqincp(svuint64_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_u16)))
-svuint16_t svqincp(svuint16_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_s32)))
-int32_t svqincw(int32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_s64)))
-int64_t svqincw(int64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_u32)))
-uint32_t svqincw(uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_n_u64)))
-uint64_t svqincw(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_s32)))
-svint32_t svqincw(svint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_u32)))
-svuint32_t svqincw(svuint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s32)))
-int32_t svqincw_pat(int32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s64)))
-int64_t svqincw_pat(int64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u32)))
-uint32_t svqincw_pat(uint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u64)))
-uint64_t svqincw_pat(uint64_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_s32)))
-svint32_t svqincw_pat(svint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_u32)))
-svuint32_t svqincw_pat(svuint32_t, enum svpattern, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8)))
-svint8_t svqsub(svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32)))
-svint32_t svqsub(svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s64)))
-svint64_t svqsub(svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s16)))
-svint16_t svqsub(svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u8)))
-svuint8_t svqsub(svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u32)))
-svuint32_t svqsub(svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u64)))
-svuint64_t svqsub(svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_u16)))
-svuint16_t svqsub(svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s8)))
-svint8_t svqsub(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s32)))
-svint32_t svqsub(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s64)))
-svint64_t svqsub(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_s16)))
-svint16_t svqsub(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u8)))
-svuint8_t svqsub(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u32)))
-svuint32_t svqsub(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u64)))
-svuint64_t svqsub(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_u16)))
-svuint16_t svqsub(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u8_m)))
-svuint8_t svrbit_m(svuint8_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u32_m)))
-svuint32_t svrbit_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u64_m)))
-svuint64_t svrbit_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u16_m)))
-svuint16_t svrbit_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s8_m)))
-svint8_t svrbit_m(svint8_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s32_m)))
-svint32_t svrbit_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s64_m)))
-svint64_t svrbit_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_m)))
-svint16_t svrbit_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u8_x)))
-svuint8_t svrbit_x(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u32_x)))
-svuint32_t svrbit_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u64_x)))
-svuint64_t svrbit_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u16_x)))
-svuint16_t svrbit_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s8_x)))
-svint8_t svrbit_x(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s32_x)))
-svint32_t svrbit_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s64_x)))
-svint64_t svrbit_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_x)))
-svint16_t svrbit_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u8_z)))
-svuint8_t svrbit_z(svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u32_z)))
-svuint32_t svrbit_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u64_z)))
-svuint64_t svrbit_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_u16_z)))
-svuint16_t svrbit_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s8_z)))
-svint8_t svrbit_z(svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s32_z)))
-svint32_t svrbit_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s64_z)))
-svint64_t svrbit_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_z)))
-svint16_t svrbit_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f64)))
-svfloat64_t svrecpe(svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f32)))
-svfloat32_t svrecpe(svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f16)))
-svfloat16_t svrecpe(svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecps_f64)))
-svfloat64_t svrecps(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecps_f32)))
-svfloat32_t svrecps(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecps_f16)))
-svfloat16_t svrecps(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f64_m)))
-svfloat64_t svrecpx_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f32_m)))
-svfloat32_t svrecpx_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f16_m)))
-svfloat16_t svrecpx_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f64_x)))
-svfloat64_t svrecpx_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f32_x)))
-svfloat32_t svrecpx_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f16_x)))
-svfloat16_t svrecpx_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f64_z)))
-svfloat64_t svrecpx_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f32_z)))
-svfloat32_t svrecpx_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpx_f16_z)))
-svfloat16_t svrecpx_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u8)))
-svuint8_t svrev(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u32)))
-svuint32_t svrev(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u64)))
-svuint64_t svrev(svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_u16)))
-svuint16_t svrev(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s8)))
-svint8_t svrev(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_f64)))
-svfloat64_t svrev(svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_f32)))
-svfloat32_t svrev(svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_f16)))
-svfloat16_t svrev(svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s32)))
-svint32_t svrev(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s64)))
-svint64_t svrev(svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s16)))
-svint16_t svrev(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_m)))
-svuint32_t svrevb_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_m)))
-svuint64_t svrevb_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u16_m)))
-svuint16_t svrevb_m(svuint16_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s32_m)))
-svint32_t svrevb_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s64_m)))
-svint64_t svrevb_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s16_m)))
-svint16_t svrevb_m(svint16_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_x)))
-svuint32_t svrevb_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_x)))
-svuint64_t svrevb_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u16_x)))
-svuint16_t svrevb_x(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s32_x)))
-svint32_t svrevb_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s64_x)))
-svint64_t svrevb_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s16_x)))
-svint16_t svrevb_x(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_z)))
-svuint32_t svrevb_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_z)))
-svuint64_t svrevb_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u16_z)))
-svuint16_t svrevb_z(svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s32_z)))
-svint32_t svrevb_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s64_z)))
-svint64_t svrevb_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_s16_z)))
-svint16_t svrevb_z(svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u32_m)))
-svuint32_t svrevh_m(svuint32_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u64_m)))
-svuint64_t svrevh_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s32_m)))
-svint32_t svrevh_m(svint32_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s64_m)))
-svint64_t svrevh_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u32_x)))
-svuint32_t svrevh_x(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u64_x)))
-svuint64_t svrevh_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s32_x)))
-svint32_t svrevh_x(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s64_x)))
-svint64_t svrevh_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u32_z)))
-svuint32_t svrevh_z(svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_u64_z)))
-svuint64_t svrevh_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s32_z)))
-svint32_t svrevh_z(svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevh_s64_z)))
-svint64_t svrevh_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_u64_m)))
-svuint64_t svrevw_m(svuint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_s64_m)))
-svint64_t svrevw_m(svint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_u64_x)))
-svuint64_t svrevw_x(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_s64_x)))
-svint64_t svrevw_x(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_u64_z)))
-svuint64_t svrevw_z(svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevw_s64_z)))
-svint64_t svrevw_z(svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f64_m)))
-svfloat64_t svrinta_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_m)))
-svfloat32_t svrinta_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f16_m)))
-svfloat16_t svrinta_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f64_x)))
-svfloat64_t svrinta_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_x)))
-svfloat32_t svrinta_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f16_x)))
-svfloat16_t svrinta_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f64_z)))
-svfloat64_t svrinta_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f32_z)))
-svfloat32_t svrinta_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinta_f16_z)))
-svfloat16_t svrinta_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f64_m)))
-svfloat64_t svrinti_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f32_m)))
-svfloat32_t svrinti_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f16_m)))
-svfloat16_t svrinti_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f64_x)))
-svfloat64_t svrinti_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f32_x)))
-svfloat32_t svrinti_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f16_x)))
-svfloat16_t svrinti_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f64_z)))
-svfloat64_t svrinti_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f32_z)))
-svfloat32_t svrinti_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrinti_f16_z)))
-svfloat16_t svrinti_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f64_m)))
-svfloat64_t svrintm_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_m)))
-svfloat32_t svrintm_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f16_m)))
-svfloat16_t svrintm_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f64_x)))
-svfloat64_t svrintm_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_x)))
-svfloat32_t svrintm_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f16_x)))
-svfloat16_t svrintm_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f64_z)))
-svfloat64_t svrintm_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f32_z)))
-svfloat32_t svrintm_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintm_f16_z)))
-svfloat16_t svrintm_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f64_m)))
-svfloat64_t svrintn_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_m)))
-svfloat32_t svrintn_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f16_m)))
-svfloat16_t svrintn_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f64_x)))
-svfloat64_t svrintn_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_x)))
-svfloat32_t svrintn_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f16_x)))
-svfloat16_t svrintn_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f64_z)))
-svfloat64_t svrintn_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f32_z)))
-svfloat32_t svrintn_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintn_f16_z)))
-svfloat16_t svrintn_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f64_m)))
-svfloat64_t svrintp_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_m)))
-svfloat32_t svrintp_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f16_m)))
-svfloat16_t svrintp_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f64_x)))
-svfloat64_t svrintp_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_x)))
-svfloat32_t svrintp_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f16_x)))
-svfloat16_t svrintp_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f64_z)))
-svfloat64_t svrintp_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f32_z)))
-svfloat32_t svrintp_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintp_f16_z)))
-svfloat16_t svrintp_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f64_m)))
-svfloat64_t svrintx_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f32_m)))
-svfloat32_t svrintx_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f16_m)))
-svfloat16_t svrintx_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f64_x)))
-svfloat64_t svrintx_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f32_x)))
-svfloat32_t svrintx_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f16_x)))
-svfloat16_t svrintx_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f64_z)))
-svfloat64_t svrintx_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f32_z)))
-svfloat32_t svrintx_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintx_f16_z)))
-svfloat16_t svrintx_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f64_m)))
-svfloat64_t svrintz_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f32_m)))
-svfloat32_t svrintz_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f16_m)))
-svfloat16_t svrintz_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f64_x)))
-svfloat64_t svrintz_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f32_x)))
-svfloat32_t svrintz_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f16_x)))
-svfloat16_t svrintz_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f64_z)))
-svfloat64_t svrintz_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f32_z)))
-svfloat32_t svrintz_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrintz_f16_z)))
-svfloat16_t svrintz_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_f64)))
-svfloat64_t svrsqrte(svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_f32)))
-svfloat32_t svrsqrte(svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrte_f16)))
-svfloat16_t svrsqrte(svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrts_f64)))
-svfloat64_t svrsqrts(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrts_f32)))
-svfloat32_t svrsqrts(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrsqrts_f16)))
-svfloat16_t svrsqrts(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f64_m)))
-svfloat64_t svscale_m(svbool_t, svfloat64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f32_m)))
-svfloat32_t svscale_m(svbool_t, svfloat32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f16_m)))
-svfloat16_t svscale_m(svbool_t, svfloat16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f64_x)))
-svfloat64_t svscale_x(svbool_t, svfloat64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f32_x)))
-svfloat32_t svscale_x(svbool_t, svfloat32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f16_x)))
-svfloat16_t svscale_x(svbool_t, svfloat16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f64_z)))
-svfloat64_t svscale_z(svbool_t, svfloat64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f32_z)))
-svfloat32_t svscale_z(svbool_t, svfloat32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_n_f16_z)))
-svfloat16_t svscale_z(svbool_t, svfloat16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_m)))
-svfloat64_t svscale_m(svbool_t, svfloat64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_m)))
-svfloat32_t svscale_m(svbool_t, svfloat32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_m)))
-svfloat16_t svscale_m(svbool_t, svfloat16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_x)))
-svfloat64_t svscale_x(svbool_t, svfloat64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_x)))
-svfloat32_t svscale_x(svbool_t, svfloat32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_x)))
-svfloat16_t svscale_x(svbool_t, svfloat16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_z)))
-svfloat64_t svscale_z(svbool_t, svfloat64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_z)))
-svfloat32_t svscale_z(svbool_t, svfloat32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_z)))
-svfloat16_t svscale_z(svbool_t, svfloat16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_b)))
-svbool_t svsel(svbool_t, svbool_t, svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u8)))
-svuint8_t svsel(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u32)))
-svuint32_t svsel(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u64)))
-svuint64_t svsel(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_u16)))
-svuint16_t svsel(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s8)))
-svint8_t svsel(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f64)))
-svfloat64_t svsel(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f32)))
-svfloat32_t svsel(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_f16)))
-svfloat16_t svsel(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s32)))
-svint32_t svsel(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s64)))
-svint64_t svsel(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsel_s16)))
-svint16_t svsel(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u8)))
-svuint8x2_t svset2(svuint8x2_t, uint64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u32)))
-svuint32x2_t svset2(svuint32x2_t, uint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u64)))
-svuint64x2_t svset2(svuint64x2_t, uint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_u16)))
-svuint16x2_t svset2(svuint16x2_t, uint64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s8)))
-svint8x2_t svset2(svint8x2_t, uint64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_f64)))
-svfloat64x2_t svset2(svfloat64x2_t, uint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_f32)))
-svfloat32x2_t svset2(svfloat32x2_t, uint64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_f16)))
-svfloat16x2_t svset2(svfloat16x2_t, uint64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s32)))
-svint32x2_t svset2(svint32x2_t, uint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s64)))
-svint64x2_t svset2(svint64x2_t, uint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset2_s16)))
-svint16x2_t svset2(svint16x2_t, uint64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u8)))
-svuint8x3_t svset3(svuint8x3_t, uint64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u32)))
-svuint32x3_t svset3(svuint32x3_t, uint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u64)))
-svuint64x3_t svset3(svuint64x3_t, uint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_u16)))
-svuint16x3_t svset3(svuint16x3_t, uint64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s8)))
-svint8x3_t svset3(svint8x3_t, uint64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_f64)))
-svfloat64x3_t svset3(svfloat64x3_t, uint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_f32)))
-svfloat32x3_t svset3(svfloat32x3_t, uint64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_f16)))
-svfloat16x3_t svset3(svfloat16x3_t, uint64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s32)))
-svint32x3_t svset3(svint32x3_t, uint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s64)))
-svint64x3_t svset3(svint64x3_t, uint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset3_s16)))
-svint16x3_t svset3(svint16x3_t, uint64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u8)))
-svuint8x4_t svset4(svuint8x4_t, uint64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u32)))
-svuint32x4_t svset4(svuint32x4_t, uint64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u64)))
-svuint64x4_t svset4(svuint64x4_t, uint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_u16)))
-svuint16x4_t svset4(svuint16x4_t, uint64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s8)))
-svint8x4_t svset4(svint8x4_t, uint64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_f64)))
-svfloat64x4_t svset4(svfloat64x4_t, uint64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_f32)))
-svfloat32x4_t svset4(svfloat32x4_t, uint64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_f16)))
-svfloat16x4_t svset4(svfloat16x4_t, uint64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s32)))
-svint32x4_t svset4(svint32x4_t, uint64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s64)))
-svint64x4_t svset4(svint64x4_t, uint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s16)))
-svint16x4_t svset4(svint16x4_t, uint64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u8)))
-svuint8_t svsplice(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u32)))
-svuint32_t svsplice(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u64)))
-svuint64_t svsplice(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u16)))
-svuint16_t svsplice(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s8)))
-svint8_t svsplice(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_f64)))
-svfloat64_t svsplice(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_f32)))
-svfloat32_t svsplice(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_f16)))
-svfloat16_t svsplice(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s32)))
-svint32_t svsplice(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s64)))
-svint64_t svsplice(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_s16)))
-svint16_t svsplice(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f64_m)))
-svfloat64_t svsqrt_m(svfloat64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f32_m)))
-svfloat32_t svsqrt_m(svfloat32_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f16_m)))
-svfloat16_t svsqrt_m(svfloat16_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f64_x)))
-svfloat64_t svsqrt_x(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f32_x)))
-svfloat32_t svsqrt_x(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f16_x)))
-svfloat16_t svsqrt_x(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f64_z)))
-svfloat64_t svsqrt_z(svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f32_z)))
-svfloat32_t svsqrt_z(svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsqrt_f16_z)))
-svfloat16_t svsqrt_z(svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u8)))
-void svst1(svbool_t, uint8_t *, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u32)))
-void svst1(svbool_t, uint32_t *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u64)))
-void svst1(svbool_t, uint64_t *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_u16)))
-void svst1(svbool_t, uint16_t *, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s8)))
-void svst1(svbool_t, int8_t *, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f64)))
-void svst1(svbool_t, float64_t *, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f32)))
-void svst1(svbool_t, float32_t *, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_f16)))
-void svst1(svbool_t, float16_t *, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s32)))
-void svst1(svbool_t, int32_t *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s64)))
-void svst1(svbool_t, int64_t *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_s16)))
-void svst1(svbool_t, int16_t *, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u8)))
-void svst1_vnum(svbool_t, uint8_t *, int64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u32)))
-void svst1_vnum(svbool_t, uint32_t *, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u64)))
-void svst1_vnum(svbool_t, uint64_t *, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_u16)))
-void svst1_vnum(svbool_t, uint16_t *, int64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s8)))
-void svst1_vnum(svbool_t, int8_t *, int64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f64)))
-void svst1_vnum(svbool_t, float64_t *, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f32)))
-void svst1_vnum(svbool_t, float32_t *, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_f16)))
-void svst1_vnum(svbool_t, float16_t *, int64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s32)))
-void svst1_vnum(svbool_t, int32_t *, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s64)))
-void svst1_vnum(svbool_t, int64_t *, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1_vnum_s16)))
-void svst1_vnum(svbool_t, int16_t *, int64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_s32)))
-void svst1b(svbool_t, int8_t *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_s64)))
-void svst1b(svbool_t, int8_t *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_s16)))
-void svst1b(svbool_t, int8_t *, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_u32)))
-void svst1b(svbool_t, uint8_t *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_u64)))
-void svst1b(svbool_t, uint8_t *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_u16)))
-void svst1b(svbool_t, uint8_t *, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_s32)))
-void svst1b_vnum(svbool_t, int8_t *, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_s64)))
-void svst1b_vnum(svbool_t, int8_t *, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_s16)))
-void svst1b_vnum(svbool_t, int8_t *, int64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_u32)))
-void svst1b_vnum(svbool_t, uint8_t *, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_u64)))
-void svst1b_vnum(svbool_t, uint8_t *, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1b_vnum_u16)))
-void svst1b_vnum(svbool_t, uint8_t *, int64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_s32)))
-void svst1h(svbool_t, int16_t *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_s64)))
-void svst1h(svbool_t, int16_t *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_u32)))
-void svst1h(svbool_t, uint16_t *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_u64)))
-void svst1h(svbool_t, uint16_t *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_s32)))
-void svst1h_vnum(svbool_t, int16_t *, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_s64)))
-void svst1h_vnum(svbool_t, int16_t *, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_u32)))
-void svst1h_vnum(svbool_t, uint16_t *, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1h_vnum_u64)))
-void svst1h_vnum(svbool_t, uint16_t *, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_s64)))
-void svst1w(svbool_t, int32_t *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_u64)))
-void svst1w(svbool_t, uint32_t *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_vnum_s64)))
-void svst1w_vnum(svbool_t, int32_t *, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1w_vnum_u64)))
-void svst1w_vnum(svbool_t, uint32_t *, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u8)))
-void svst2(svbool_t, uint8_t *, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u32)))
-void svst2(svbool_t, uint32_t *, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u64)))
-void svst2(svbool_t, uint64_t *, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_u16)))
-void svst2(svbool_t, uint16_t *, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s8)))
-void svst2(svbool_t, int8_t *, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_f64)))
-void svst2(svbool_t, float64_t *, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_f32)))
-void svst2(svbool_t, float32_t *, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_f16)))
-void svst2(svbool_t, float16_t *, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s32)))
-void svst2(svbool_t, int32_t *, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s64)))
-void svst2(svbool_t, int64_t *, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_s16)))
-void svst2(svbool_t, int16_t *, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u8)))
-void svst2_vnum(svbool_t, uint8_t *, int64_t, svuint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u32)))
-void svst2_vnum(svbool_t, uint32_t *, int64_t, svuint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u64)))
-void svst2_vnum(svbool_t, uint64_t *, int64_t, svuint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_u16)))
-void svst2_vnum(svbool_t, uint16_t *, int64_t, svuint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s8)))
-void svst2_vnum(svbool_t, int8_t *, int64_t, svint8x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_f64)))
-void svst2_vnum(svbool_t, float64_t *, int64_t, svfloat64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_f32)))
-void svst2_vnum(svbool_t, float32_t *, int64_t, svfloat32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_f16)))
-void svst2_vnum(svbool_t, float16_t *, int64_t, svfloat16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s32)))
-void svst2_vnum(svbool_t, int32_t *, int64_t, svint32x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s64)))
-void svst2_vnum(svbool_t, int64_t *, int64_t, svint64x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2_vnum_s16)))
-void svst2_vnum(svbool_t, int16_t *, int64_t, svint16x2_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u8)))
-void svst3(svbool_t, uint8_t *, svuint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u32)))
-void svst3(svbool_t, uint32_t *, svuint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u64)))
-void svst3(svbool_t, uint64_t *, svuint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_u16)))
-void svst3(svbool_t, uint16_t *, svuint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s8)))
-void svst3(svbool_t, int8_t *, svint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_f64)))
-void svst3(svbool_t, float64_t *, svfloat64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_f32)))
-void svst3(svbool_t, float32_t *, svfloat32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_f16)))
-void svst3(svbool_t, float16_t *, svfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s32)))
-void svst3(svbool_t, int32_t *, svint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s64)))
-void svst3(svbool_t, int64_t *, svint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_s16)))
-void svst3(svbool_t, int16_t *, svint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u8)))
-void svst3_vnum(svbool_t, uint8_t *, int64_t, svuint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u32)))
-void svst3_vnum(svbool_t, uint32_t *, int64_t, svuint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u64)))
-void svst3_vnum(svbool_t, uint64_t *, int64_t, svuint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_u16)))
-void svst3_vnum(svbool_t, uint16_t *, int64_t, svuint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s8)))
-void svst3_vnum(svbool_t, int8_t *, int64_t, svint8x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_f64)))
-void svst3_vnum(svbool_t, float64_t *, int64_t, svfloat64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_f32)))
-void svst3_vnum(svbool_t, float32_t *, int64_t, svfloat32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_f16)))
-void svst3_vnum(svbool_t, float16_t *, int64_t, svfloat16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s32)))
-void svst3_vnum(svbool_t, int32_t *, int64_t, svint32x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s64)))
-void svst3_vnum(svbool_t, int64_t *, int64_t, svint64x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst3_vnum_s16)))
-void svst3_vnum(svbool_t, int16_t *, int64_t, svint16x3_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u8)))
-void svst4(svbool_t, uint8_t *, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u32)))
-void svst4(svbool_t, uint32_t *, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u64)))
-void svst4(svbool_t, uint64_t *, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_u16)))
-void svst4(svbool_t, uint16_t *, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s8)))
-void svst4(svbool_t, int8_t *, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_f64)))
-void svst4(svbool_t, float64_t *, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_f32)))
-void svst4(svbool_t, float32_t *, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_f16)))
-void svst4(svbool_t, float16_t *, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s32)))
-void svst4(svbool_t, int32_t *, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s64)))
-void svst4(svbool_t, int64_t *, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_s16)))
-void svst4(svbool_t, int16_t *, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u8)))
-void svst4_vnum(svbool_t, uint8_t *, int64_t, svuint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u32)))
-void svst4_vnum(svbool_t, uint32_t *, int64_t, svuint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u64)))
-void svst4_vnum(svbool_t, uint64_t *, int64_t, svuint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_u16)))
-void svst4_vnum(svbool_t, uint16_t *, int64_t, svuint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s8)))
-void svst4_vnum(svbool_t, int8_t *, int64_t, svint8x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_f64)))
-void svst4_vnum(svbool_t, float64_t *, int64_t, svfloat64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_f32)))
-void svst4_vnum(svbool_t, float32_t *, int64_t, svfloat32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_f16)))
-void svst4_vnum(svbool_t, float16_t *, int64_t, svfloat16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s32)))
-void svst4_vnum(svbool_t, int32_t *, int64_t, svint32x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s64)))
-void svst4_vnum(svbool_t, int64_t *, int64_t, svint64x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst4_vnum_s16)))
-void svst4_vnum(svbool_t, int16_t *, int64_t, svint16x4_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u8)))
-void svstnt1(svbool_t, uint8_t *, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u32)))
-void svstnt1(svbool_t, uint32_t *, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u64)))
-void svstnt1(svbool_t, uint64_t *, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_u16)))
-void svstnt1(svbool_t, uint16_t *, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s8)))
-void svstnt1(svbool_t, int8_t *, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f64)))
-void svstnt1(svbool_t, float64_t *, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f32)))
-void svstnt1(svbool_t, float32_t *, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_f16)))
-void svstnt1(svbool_t, float16_t *, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s32)))
-void svstnt1(svbool_t, int32_t *, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s64)))
-void svstnt1(svbool_t, int64_t *, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_s16)))
-void svstnt1(svbool_t, int16_t *, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u8)))
-void svstnt1_vnum(svbool_t, uint8_t *, int64_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u32)))
-void svstnt1_vnum(svbool_t, uint32_t *, int64_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u64)))
-void svstnt1_vnum(svbool_t, uint64_t *, int64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_u16)))
-void svstnt1_vnum(svbool_t, uint16_t *, int64_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s8)))
-void svstnt1_vnum(svbool_t, int8_t *, int64_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f64)))
-void svstnt1_vnum(svbool_t, float64_t *, int64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f32)))
-void svstnt1_vnum(svbool_t, float32_t *, int64_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_f16)))
-void svstnt1_vnum(svbool_t, float16_t *, int64_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s32)))
-void svstnt1_vnum(svbool_t, int32_t *, int64_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s64)))
-void svstnt1_vnum(svbool_t, int64_t *, int64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1_vnum_s16)))
-void svstnt1_vnum(svbool_t, int16_t *, int64_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f64_m)))
-svfloat64_t svsub_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f32_m)))
-svfloat32_t svsub_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f16_m)))
-svfloat16_t svsub_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f64_x)))
-svfloat64_t svsub_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f32_x)))
-svfloat32_t svsub_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f16_x)))
-svfloat16_t svsub_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f64_z)))
-svfloat64_t svsub_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f32_z)))
-svfloat32_t svsub_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_f16_z)))
-svfloat16_t svsub_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u8_m)))
-svuint8_t svsub_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u32_m)))
-svuint32_t svsub_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u64_m)))
-svuint64_t svsub_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u16_m)))
-svuint16_t svsub_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s8_m)))
-svint8_t svsub_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s32_m)))
-svint32_t svsub_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s64_m)))
-svint64_t svsub_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s16_m)))
-svint16_t svsub_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u8_x)))
-svuint8_t svsub_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u32_x)))
-svuint32_t svsub_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u64_x)))
-svuint64_t svsub_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u16_x)))
-svuint16_t svsub_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s8_x)))
-svint8_t svsub_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s32_x)))
-svint32_t svsub_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s64_x)))
-svint64_t svsub_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s16_x)))
-svint16_t svsub_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u8_z)))
-svuint8_t svsub_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u32_z)))
-svuint32_t svsub_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u64_z)))
-svuint64_t svsub_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_u16_z)))
-svuint16_t svsub_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s8_z)))
-svint8_t svsub_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s32_z)))
-svint32_t svsub_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s64_z)))
-svint64_t svsub_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_n_s16_z)))
-svint16_t svsub_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f64_m)))
-svfloat64_t svsub_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f32_m)))
-svfloat32_t svsub_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f16_m)))
-svfloat16_t svsub_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f64_x)))
-svfloat64_t svsub_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f32_x)))
-svfloat32_t svsub_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f16_x)))
-svfloat16_t svsub_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f64_z)))
-svfloat64_t svsub_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f32_z)))
-svfloat32_t svsub_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_f16_z)))
-svfloat16_t svsub_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u8_m)))
-svuint8_t svsub_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u32_m)))
-svuint32_t svsub_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u64_m)))
-svuint64_t svsub_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u16_m)))
-svuint16_t svsub_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s8_m)))
-svint8_t svsub_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s32_m)))
-svint32_t svsub_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s64_m)))
-svint64_t svsub_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s16_m)))
-svint16_t svsub_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u8_x)))
-svuint8_t svsub_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u32_x)))
-svuint32_t svsub_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u64_x)))
-svuint64_t svsub_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u16_x)))
-svuint16_t svsub_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s8_x)))
-svint8_t svsub_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s32_x)))
-svint32_t svsub_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s64_x)))
-svint64_t svsub_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s16_x)))
-svint16_t svsub_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u8_z)))
-svuint8_t svsub_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u32_z)))
-svuint32_t svsub_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u64_z)))
-svuint64_t svsub_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_u16_z)))
-svuint16_t svsub_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s8_z)))
-svint8_t svsub_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s32_z)))
-svint32_t svsub_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s64_z)))
-svint64_t svsub_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_s16_z)))
-svint16_t svsub_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f64_m)))
-svfloat64_t svsubr_m(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f32_m)))
-svfloat32_t svsubr_m(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f16_m)))
-svfloat16_t svsubr_m(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f64_x)))
-svfloat64_t svsubr_x(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f32_x)))
-svfloat32_t svsubr_x(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f16_x)))
-svfloat16_t svsubr_x(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f64_z)))
-svfloat64_t svsubr_z(svbool_t, svfloat64_t, float64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f32_z)))
-svfloat32_t svsubr_z(svbool_t, svfloat32_t, float32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_f16_z)))
-svfloat16_t svsubr_z(svbool_t, svfloat16_t, float16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u8_m)))
-svuint8_t svsubr_m(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u32_m)))
-svuint32_t svsubr_m(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u64_m)))
-svuint64_t svsubr_m(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u16_m)))
-svuint16_t svsubr_m(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s8_m)))
-svint8_t svsubr_m(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s32_m)))
-svint32_t svsubr_m(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s64_m)))
-svint64_t svsubr_m(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s16_m)))
-svint16_t svsubr_m(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u8_x)))
-svuint8_t svsubr_x(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u32_x)))
-svuint32_t svsubr_x(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u64_x)))
-svuint64_t svsubr_x(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u16_x)))
-svuint16_t svsubr_x(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s8_x)))
-svint8_t svsubr_x(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s32_x)))
-svint32_t svsubr_x(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s64_x)))
-svint64_t svsubr_x(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s16_x)))
-svint16_t svsubr_x(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u8_z)))
-svuint8_t svsubr_z(svbool_t, svuint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u32_z)))
-svuint32_t svsubr_z(svbool_t, svuint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u64_z)))
-svuint64_t svsubr_z(svbool_t, svuint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_u16_z)))
-svuint16_t svsubr_z(svbool_t, svuint16_t, uint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s8_z)))
-svint8_t svsubr_z(svbool_t, svint8_t, int8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s32_z)))
-svint32_t svsubr_z(svbool_t, svint32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s64_z)))
-svint64_t svsubr_z(svbool_t, svint64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_n_s16_z)))
-svint16_t svsubr_z(svbool_t, svint16_t, int16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f64_m)))
-svfloat64_t svsubr_m(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f32_m)))
-svfloat32_t svsubr_m(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f16_m)))
-svfloat16_t svsubr_m(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f64_x)))
-svfloat64_t svsubr_x(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f32_x)))
-svfloat32_t svsubr_x(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f16_x)))
-svfloat16_t svsubr_x(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f64_z)))
-svfloat64_t svsubr_z(svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f32_z)))
-svfloat32_t svsubr_z(svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_f16_z)))
-svfloat16_t svsubr_z(svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u8_m)))
-svuint8_t svsubr_m(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u32_m)))
-svuint32_t svsubr_m(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u64_m)))
-svuint64_t svsubr_m(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u16_m)))
-svuint16_t svsubr_m(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s8_m)))
-svint8_t svsubr_m(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s32_m)))
-svint32_t svsubr_m(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s64_m)))
-svint64_t svsubr_m(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s16_m)))
-svint16_t svsubr_m(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u8_x)))
-svuint8_t svsubr_x(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u32_x)))
-svuint32_t svsubr_x(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u64_x)))
-svuint64_t svsubr_x(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u16_x)))
-svuint16_t svsubr_x(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s8_x)))
-svint8_t svsubr_x(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s32_x)))
-svint32_t svsubr_x(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s64_x)))
-svint64_t svsubr_x(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s16_x)))
-svint16_t svsubr_x(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u8_z)))
-svuint8_t svsubr_z(svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u32_z)))
-svuint32_t svsubr_z(svbool_t, svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u64_z)))
-svuint64_t svsubr_z(svbool_t, svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_u16_z)))
-svuint16_t svsubr_z(svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s8_z)))
-svint8_t svsubr_z(svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s32_z)))
-svint32_t svsubr_z(svbool_t, svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s64_z)))
-svint64_t svsubr_z(svbool_t, svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsubr_s16_z)))
-svint16_t svsubr_z(svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u8)))
-svuint8_t svtbl(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u32)))
-svuint32_t svtbl(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u64)))
-svuint64_t svtbl(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_u16)))
-svuint16_t svtbl(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s8)))
-svint8_t svtbl(svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_f64)))
-svfloat64_t svtbl(svfloat64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_f32)))
-svfloat32_t svtbl(svfloat32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_f16)))
-svfloat16_t svtbl(svfloat16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s32)))
-svint32_t svtbl(svint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s64)))
-svint64_t svtbl(svint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl_s16)))
-svint16_t svtbl(svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u8)))
-svuint8_t svtrn1(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u32)))
-svuint32_t svtrn1(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u64)))
-svuint64_t svtrn1(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_u16)))
-svuint16_t svtrn1(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s8)))
-svint8_t svtrn1(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_f64)))
-svfloat64_t svtrn1(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_f32)))
-svfloat32_t svtrn1(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_f16)))
-svfloat16_t svtrn1(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s32)))
-svint32_t svtrn1(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s64)))
-svint64_t svtrn1(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s16)))
-svint16_t svtrn1(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u8)))
-svuint8_t svtrn2(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u32)))
-svuint32_t svtrn2(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u64)))
-svuint64_t svtrn2(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u16)))
-svuint16_t svtrn2(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s8)))
-svint8_t svtrn2(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_f64)))
-svfloat64_t svtrn2(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_f32)))
-svfloat32_t svtrn2(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_f16)))
-svfloat16_t svtrn2(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s32)))
-svint32_t svtrn2(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s64)))
-svint64_t svtrn2(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s16)))
-svint16_t svtrn2(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_b)))
-svbool_t svunpkhi(svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s32)))
-svint32_t svunpkhi(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s64)))
-svint64_t svunpkhi(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s16)))
-svint16_t svunpkhi(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_u32)))
-svuint32_t svunpkhi(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_u64)))
-svuint64_t svunpkhi(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_u16)))
-svuint16_t svunpkhi(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_b)))
-svbool_t svunpklo(svbool_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_s32)))
-svint32_t svunpklo(svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_s64)))
-svint64_t svunpklo(svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_s16)))
-svint16_t svunpklo(svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_u32)))
-svuint32_t svunpklo(svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_u64)))
-svuint64_t svunpklo(svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpklo_u16)))
-svuint16_t svunpklo(svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u8)))
-svuint8_t svuzp1(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u32)))
-svuint32_t svuzp1(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u64)))
-svuint64_t svuzp1(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_u16)))
-svuint16_t svuzp1(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s8)))
-svint8_t svuzp1(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_f64)))
-svfloat64_t svuzp1(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_f32)))
-svfloat32_t svuzp1(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_f16)))
-svfloat16_t svuzp1(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s32)))
-svint32_t svuzp1(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s64)))
-svint64_t svuzp1(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s16)))
-svint16_t svuzp1(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u8)))
-svuint8_t svuzp2(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u32)))
-svuint32_t svuzp2(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u64)))
-svuint64_t svuzp2(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u16)))
-svuint16_t svuzp2(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s8)))
-svint8_t svuzp2(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_f64)))
-svfloat64_t svuzp2(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_f32)))
-svfloat32_t svuzp2(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_f16)))
-svfloat16_t svuzp2(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s32)))
-svint32_t svuzp2(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s64)))
-svint64_t svuzp2(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s16)))
-svint16_t svuzp2(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s32)))
-svbool_t svwhilele_b8(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s32)))
-svbool_t svwhilele_b32(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_s32)))
-svbool_t svwhilele_b64(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_s32)))
-svbool_t svwhilele_b16(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s64)))
-svbool_t svwhilele_b8(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s64)))
-svbool_t svwhilele_b32(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_s64)))
-svbool_t svwhilele_b64(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_s64)))
-svbool_t svwhilele_b16(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_u32)))
-svbool_t svwhilele_b8(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_u32)))
-svbool_t svwhilele_b32(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_u32)))
-svbool_t svwhilele_b64(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_u32)))
-svbool_t svwhilele_b16(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_u64)))
-svbool_t svwhilele_b8(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_u64)))
-svbool_t svwhilele_b32(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b64_u64)))
-svbool_t svwhilele_b64(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b16_u64)))
-svbool_t svwhilele_b16(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_u32)))
-svbool_t svwhilelt_b8(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_u32)))
-svbool_t svwhilelt_b32(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_u32)))
-svbool_t svwhilelt_b64(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_u32)))
-svbool_t svwhilelt_b16(uint32_t, uint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_u64)))
-svbool_t svwhilelt_b8(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_u64)))
-svbool_t svwhilelt_b32(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_u64)))
-svbool_t svwhilelt_b64(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_u64)))
-svbool_t svwhilelt_b16(uint64_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_s32)))
-svbool_t svwhilelt_b8(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_s32)))
-svbool_t svwhilelt_b32(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_s32)))
-svbool_t svwhilelt_b64(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_s32)))
-svbool_t svwhilelt_b16(int32_t, int32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b8_s64)))
-svbool_t svwhilelt_b8(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b32_s64)))
-svbool_t svwhilelt_b32(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b64_s64)))
-svbool_t svwhilelt_b64(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilelt_b16_s64)))
-svbool_t svwhilelt_b16(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u8)))
-svuint8_t svzip1(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u32)))
-svuint32_t svzip1(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u64)))
-svuint64_t svzip1(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_u16)))
-svuint16_t svzip1(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s8)))
-svint8_t svzip1(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_f64)))
-svfloat64_t svzip1(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_f32)))
-svfloat32_t svzip1(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_f16)))
-svfloat16_t svzip1(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s32)))
-svint32_t svzip1(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s64)))
-svint64_t svzip1(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s16)))
-svint16_t svzip1(svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u8)))
-svuint8_t svzip2(svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u32)))
-svuint32_t svzip2(svuint32_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u64)))
-svuint64_t svzip2(svuint64_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u16)))
-svuint16_t svzip2(svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s8)))
-svint8_t svzip2(svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_f64)))
-svfloat64_t svzip2(svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_f32)))
-svfloat32_t svzip2(svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_f16)))
-svfloat16_t svzip2(svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s32)))
-svint32_t svzip2(svint32_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s64)))
-svint64_t svzip2(svint64_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s16)))
-svint16_t svzip2(svint16_t, svint16_t);
-#define svcvtnt_bf16_x      svcvtnt_bf16_m
-#define svcvtnt_bf16_f32_x  svcvtnt_bf16_f32_m
-#define svcvtnt_f16_x      svcvtnt_f16_m
-#define svcvtnt_f16_f32_x  svcvtnt_f16_f32_m
-#define svcvtnt_f32_x      svcvtnt_f32_m
-#define svcvtnt_f32_f64_x  svcvtnt_f32_f64_m
-
-#define svcvtxnt_f32_x     svcvtxnt_f32_m
-#define svcvtxnt_f32_f64_x svcvtxnt_f32_f64_m
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#undef __ai
-
-#undef __aio
-
-#endif /* __ARM_SVE_H */
diff --git a/third_party/aarch64/clang/arm_vector_types.h b/third_party/aarch64/clang/arm_vector_types.h
deleted file mode 100644
index 8e79d39a6..000000000
--- a/third_party/aarch64/clang/arm_vector_types.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/*===---- arm_vector_types - ARM vector type ------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
-#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"
-
-#endif
-#ifndef __ARM_NEON_TYPES_H
-#define __ARM_NEON_TYPES_H
-typedef float float32_t;
-typedef __fp16 float16_t;
-#if defined(__aarch64__) || defined(__arm64ec__)
-typedef double float64_t;
-#endif
-
-typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
-typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
-typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
-typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
-typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
-typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
-typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
-typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
-typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
-typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
-typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
-typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
-typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
-typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
-typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
-typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
-typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
-typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
-typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
-typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
-#if defined(__aarch64__) || defined(__arm64ec__)
-typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
-typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
-#endif
-
-typedef struct int8x8x2_t {
-  int8x8_t val[2];
-} int8x8x2_t;
-
-typedef struct int8x16x2_t {
-  int8x16_t val[2];
-} int8x16x2_t;
-
-typedef struct int16x4x2_t {
-  int16x4_t val[2];
-} int16x4x2_t;
-
-typedef struct int16x8x2_t {
-  int16x8_t val[2];
-} int16x8x2_t;
-
-typedef struct int32x2x2_t {
-  int32x2_t val[2];
-} int32x2x2_t;
-
-typedef struct int32x4x2_t {
-  int32x4_t val[2];
-} int32x4x2_t;
-
-typedef struct int64x1x2_t {
-  int64x1_t val[2];
-} int64x1x2_t;
-
-typedef struct int64x2x2_t {
-  int64x2_t val[2];
-} int64x2x2_t;
-
-typedef struct uint8x8x2_t {
-  uint8x8_t val[2];
-} uint8x8x2_t;
-
-typedef struct uint8x16x2_t {
-  uint8x16_t val[2];
-} uint8x16x2_t;
-
-typedef struct uint16x4x2_t {
-  uint16x4_t val[2];
-} uint16x4x2_t;
-
-typedef struct uint16x8x2_t {
-  uint16x8_t val[2];
-} uint16x8x2_t;
-
-typedef struct uint32x2x2_t {
-  uint32x2_t val[2];
-} uint32x2x2_t;
-
-typedef struct uint32x4x2_t {
-  uint32x4_t val[2];
-} uint32x4x2_t;
-
-typedef struct uint64x1x2_t {
-  uint64x1_t val[2];
-} uint64x1x2_t;
-
-typedef struct uint64x2x2_t {
-  uint64x2_t val[2];
-} uint64x2x2_t;
-
-typedef struct float16x4x2_t {
-  float16x4_t val[2];
-} float16x4x2_t;
-
-typedef struct float16x8x2_t {
-  float16x8_t val[2];
-} float16x8x2_t;
-
-typedef struct float32x2x2_t {
-  float32x2_t val[2];
-} float32x2x2_t;
-
-typedef struct float32x4x2_t {
-  float32x4_t val[2];
-} float32x4x2_t;
-
-#if defined(__aarch64__) || defined(__arm64ec__)
-typedef struct float64x1x2_t {
-  float64x1_t val[2];
-} float64x1x2_t;
-
-typedef struct float64x2x2_t {
-  float64x2_t val[2];
-} float64x2x2_t;
-
-#endif
-typedef struct int8x8x3_t {
-  int8x8_t val[3];
-} int8x8x3_t;
-
-typedef struct int8x16x3_t {
-  int8x16_t val[3];
-} int8x16x3_t;
-
-typedef struct int16x4x3_t {
-  int16x4_t val[3];
-} int16x4x3_t;
-
-typedef struct int16x8x3_t {
-  int16x8_t val[3];
-} int16x8x3_t;
-
-typedef struct int32x2x3_t {
-  int32x2_t val[3];
-} int32x2x3_t;
-
-typedef struct int32x4x3_t {
-  int32x4_t val[3];
-} int32x4x3_t;
-
-typedef struct int64x1x3_t {
-  int64x1_t val[3];
-} int64x1x3_t;
-
-typedef struct int64x2x3_t {
-  int64x2_t val[3];
-} int64x2x3_t;
-
-typedef struct uint8x8x3_t {
-  uint8x8_t val[3];
-} uint8x8x3_t;
-
-typedef struct uint8x16x3_t {
-  uint8x16_t val[3];
-} uint8x16x3_t;
-
-typedef struct uint16x4x3_t {
-  uint16x4_t val[3];
-} uint16x4x3_t;
-
-typedef struct uint16x8x3_t {
-  uint16x8_t val[3];
-} uint16x8x3_t;
-
-typedef struct uint32x2x3_t {
-  uint32x2_t val[3];
-} uint32x2x3_t;
-
-typedef struct uint32x4x3_t {
-  uint32x4_t val[3];
-} uint32x4x3_t;
-
-typedef struct uint64x1x3_t {
-  uint64x1_t val[3];
-} uint64x1x3_t;
-
-typedef struct uint64x2x3_t {
-  uint64x2_t val[3];
-} uint64x2x3_t;
-
-typedef struct float16x4x3_t {
-  float16x4_t val[3];
-} float16x4x3_t;
-
-typedef struct float16x8x3_t {
-  float16x8_t val[3];
-} float16x8x3_t;
-
-typedef struct float32x2x3_t {
-  float32x2_t val[3];
-} float32x2x3_t;
-
-typedef struct float32x4x3_t {
-  float32x4_t val[3];
-} float32x4x3_t;
-
-#if defined(__aarch64__) || defined(__arm64ec__)
-typedef struct float64x1x3_t {
-  float64x1_t val[3];
-} float64x1x3_t;
-
-typedef struct float64x2x3_t {
-  float64x2_t val[3];
-} float64x2x3_t;
-
-#endif
-typedef struct int8x8x4_t {
-  int8x8_t val[4];
-} int8x8x4_t;
-
-typedef struct int8x16x4_t {
-  int8x16_t val[4];
-} int8x16x4_t;
-
-typedef struct int16x4x4_t {
-  int16x4_t val[4];
-} int16x4x4_t;
-
-typedef struct int16x8x4_t {
-  int16x8_t val[4];
-} int16x8x4_t;
-
-typedef struct int32x2x4_t {
-  int32x2_t val[4];
-} int32x2x4_t;
-
-typedef struct int32x4x4_t {
-  int32x4_t val[4];
-} int32x4x4_t;
-
-typedef struct int64x1x4_t {
-  int64x1_t val[4];
-} int64x1x4_t;
-
-typedef struct int64x2x4_t {
-  int64x2_t val[4];
-} int64x2x4_t;
-
-typedef struct uint8x8x4_t {
-  uint8x8_t val[4];
-} uint8x8x4_t;
-
-typedef struct uint8x16x4_t {
-  uint8x16_t val[4];
-} uint8x16x4_t;
-
-typedef struct uint16x4x4_t {
-  uint16x4_t val[4];
-} uint16x4x4_t;
-
-typedef struct uint16x8x4_t {
-  uint16x8_t val[4];
-} uint16x8x4_t;
-
-typedef struct uint32x2x4_t {
-  uint32x2_t val[4];
-} uint32x2x4_t;
-
-typedef struct uint32x4x4_t {
-  uint32x4_t val[4];
-} uint32x4x4_t;
-
-typedef struct uint64x1x4_t {
-  uint64x1_t val[4];
-} uint64x1x4_t;
-
-typedef struct uint64x2x4_t {
-  uint64x2_t val[4];
-} uint64x2x4_t;
-
-typedef struct float16x4x4_t {
-  float16x4_t val[4];
-} float16x4x4_t;
-
-typedef struct float16x8x4_t {
-  float16x8_t val[4];
-} float16x8x4_t;
-
-typedef struct float32x2x4_t {
-  float32x2_t val[4];
-} float32x2x4_t;
-
-typedef struct float32x4x4_t {
-  float32x4_t val[4];
-} float32x4x4_t;
-
-#if defined(__aarch64__) || defined(__arm64ec__)
-typedef struct float64x1x4_t {
-  float64x1_t val[4];
-} float64x1x4_t;
-
-typedef struct float64x2x4_t {
-  float64x2_t val[4];
-} float64x2x4_t;
-
-#endif
-typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
-typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
-
-typedef struct bfloat16x4x2_t {
-  bfloat16x4_t val[2];
-} bfloat16x4x2_t;
-
-typedef struct bfloat16x8x2_t {
-  bfloat16x8_t val[2];
-} bfloat16x8x2_t;
-
-typedef struct bfloat16x4x3_t {
-  bfloat16x4_t val[3];
-} bfloat16x4x3_t;
-
-typedef struct bfloat16x8x3_t {
-  bfloat16x8_t val[3];
-} bfloat16x8x3_t;
-
-typedef struct bfloat16x4x4_t {
-  bfloat16x4_t val[4];
-} bfloat16x4x4_t;
-
-typedef struct bfloat16x8x4_t {
-  bfloat16x8_t val[4];
-} bfloat16x8x4_t;
-
-#endif // __ARM_NEON_TYPES_H
diff --git a/third_party/aarch64/clang/armintr.h b/third_party/aarch64/clang/armintr.h
deleted file mode 100644
index 300ed4ee4..000000000
--- a/third_party/aarch64/clang/armintr.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-/* Only include this if we're compiling for the windows platform. */
-#ifndef _MSC_VER
-#include_next <armintr.h>
-#else
-
-#ifndef __ARMINTR_H
-#define __ARMINTR_H
-
-typedef enum
-{
-  _ARM_BARRIER_SY    = 0xF,
-  _ARM_BARRIER_ST    = 0xE,
-  _ARM_BARRIER_ISH   = 0xB,
-  _ARM_BARRIER_ISHST = 0xA,
-  _ARM_BARRIER_NSH   = 0x7,
-  _ARM_BARRIER_NSHST = 0x6,
-  _ARM_BARRIER_OSH   = 0x3,
-  _ARM_BARRIER_OSHST = 0x2
-} _ARMINTR_BARRIER_TYPE;
-
-#endif /* __ARMINTR_H */
-#endif /* _MSC_VER */
diff --git a/third_party/awk/BUILD.mk b/third_party/awk/BUILD.mk
index da4affc22..42ea4aad8 100644
--- a/third_party/awk/BUILD.mk
+++ b/third_party/awk/BUILD.mk
@@ -22,12 +22,10 @@ THIRD_PARTY_AWK_A_DIRECTDEPS =				\
 	LIBC_RUNTIME					\
 	LIBC_STDIO					\
 	LIBC_STR					\
-	LIBC_SYSTEM					\
 	LIBC_SYSV					\
 	LIBC_TINYMATH					\
-	THIRD_PARTY_GDTOA				\
-	THIRD_PARTY_MUSL				\
 	TOOL_ARGS					\
+	THIRD_PARTY_GDTOA
 
 THIRD_PARTY_AWK_A_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_AWK_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/awk/cmd.c b/third_party/awk/cmd.c
index 78ca032e8..54882575f 100644
--- a/third_party/awk/cmd.c
+++ b/third_party/awk/cmd.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/awk/cmd.h"
-#include "libc/cosmo.h"
+#include "tool/args/args.h"
 
 int main(int argc, char *argv[]) {
   LoadZipArgs(&argc, &argv);
diff --git a/third_party/awk/run.c b/third_party/awk/run.c
index 4d5b28aef..e0ab6208d 100644
--- a/third_party/awk/run.c
+++ b/third_party/awk/run.c
@@ -495,7 +495,7 @@ makearraystring(Node *p, const char *func)
 
 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
 			FATAL("%s: out of memory %s[%s...]",
-			      func ? func : "NULL", x->nval, buf);
+			    func, x->nval, buf);
 		}
 		memcpy(buf + blen, s, slen);
 		if (nsub) {
diff --git a/third_party/chibicc/BUILD.mk b/third_party/chibicc/BUILD.mk
index 30d9019f2..8e3d9bb6d 100644
--- a/third_party/chibicc/BUILD.mk
+++ b/third_party/chibicc/BUILD.mk
@@ -63,7 +63,6 @@ THIRD_PARTY_CHIBICC_A_DIRECTDEPS =					\
 	THIRD_PARTY_COMPILER_RT						\
 	THIRD_PARTY_DLMALLOC						\
 	THIRD_PARTY_GDTOA						\
-	THIRD_PARTY_MUSL						\
 	THIRD_PARTY_TZ							\
 	TOOL_BUILD_LIB
 
diff --git a/third_party/chibicc/as.c b/third_party/chibicc/as.c
index 2859df2b7..e17c68d27 100644
--- a/third_party/chibicc/as.c
+++ b/third_party/chibicc/as.c
@@ -24,13 +24,13 @@
 #include "libc/intrin/popcnt.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/crc32.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/s.h"
 #include "libc/x/x.h"
diff --git a/third_party/chibicc/chibicc.h b/third_party/chibicc/chibicc.h
index aefe6592e..25beb3469 100644
--- a/third_party/chibicc/chibicc.h
+++ b/third_party/chibicc/chibicc.h
@@ -11,7 +11,7 @@
 #include "libc/limits.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/crc32.h"
 #include "libc/runtime/runtime.h"
diff --git a/third_party/chibicc/test/vla_test.c b/third_party/chibicc/test/vla_test.c
index 870e04775..b5010645d 100644
--- a/third_party/chibicc/test/vla_test.c
+++ b/third_party/chibicc/test/vla_test.c
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "third_party/chibicc/test/test.h"
 
 int index1d(int xn, int p[xn], int x) {
diff --git a/third_party/chibicc/tokenize.c b/third_party/chibicc/tokenize.c
index ea12c8765..cc297966a 100644
--- a/third_party/chibicc/tokenize.c
+++ b/third_party/chibicc/tokenize.c
@@ -2,7 +2,7 @@
 #include "libc/log/log.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "third_party/chibicc/chibicc.h"
 #include "third_party/chibicc/file.h"
 #include "libc/ctype.h"
diff --git a/third_party/compiler_rt/clear_cache.c b/third_party/compiler_rt/clear_cache.c
index 8f3cd9cb2..7486b0966 100644
--- a/third_party/compiler_rt/clear_cache.c
+++ b/third_party/compiler_rt/clear_cache.c
@@ -15,7 +15,7 @@
 // It is expected to invalidate the instruction cache for the
 // specified range.
 
-privileged void __clear_cache(void *start, void *end) {
+void __clear_cache(void *start, void *end) {
 
 #ifdef __aarch64__
   if (IsXnu()) {
@@ -59,8 +59,6 @@ privileged void __clear_cache(void *start, void *end) {
   }
   __asm__ volatile("isync");
 
-#elif defined(__x86_64__)
-  // do nothing
 #else
   compilerrt_abort();
 #endif
diff --git a/third_party/compiler_rt/comprt.S b/third_party/compiler_rt/comprt.S
index d31861204..95060b658 100644
--- a/third_party/compiler_rt/comprt.S
+++ b/third_party/compiler_rt/comprt.S
@@ -1,4 +1,4 @@
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Nop ref this to force pull the license into linkage.
 	.section .yoink
diff --git a/third_party/compiler_rt/extendhfdf2.c b/third_party/compiler_rt/extendhfdf2.c
new file mode 100644
index 000000000..729eb04c1
--- /dev/null
+++ b/third_party/compiler_rt/extendhfdf2.c
@@ -0,0 +1,17 @@
+//===-- lib/extendhfdf2.c - half -> dubble conversion -------------*- C -*-===//
+//
+//                The Cosmopolitan Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#define SRC_HALF
+#define DST_DOUBLE
+#include "third_party/compiler_rt/fp16_extend_impl.inc"
+
+COMPILER_RT_ABI dst_t __extendhfdf2(src_t a) {
+    return __extendXfYf2__(a);
+}
diff --git a/third_party/compiler_rt/extendhfsf2.c b/third_party/compiler_rt/extendhfsf2.c
new file mode 100644
index 000000000..f891d9542
--- /dev/null
+++ b/third_party/compiler_rt/extendhfsf2.c
@@ -0,0 +1,27 @@
+//===-- lib/extendhfsf2.c - half -> single conversion -------------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SRC_HALF
+#define DST_SINGLE
+#include "fp16_extend_impl.inc"
+
+// Use a forwarding definition and noinline to implement a poor man's alias,
+// as there isn't a good cross-platform way of defining one.
+COMPILER_RT_ABI NOINLINE float __extendhfsf2(src_t a) {
+  return __extendXfYf2__(a);
+}
+
+COMPILER_RT_ABI float __gnu_h2f_ieee(src_t a) { return __extendhfsf2(a); }
+
+#if defined(__ARM_EABI__)
+#if defined(COMPILER_RT_ARMHF_TARGET)
+AEABI_RTABI float __aeabi_h2f(src_t a) { return __extendhfsf2(a); }
+#else
+COMPILER_RT_ALIAS(__extendhfsf2, __aeabi_h2f)
+#endif
+#endif
diff --git a/libc/intrin/extendsftf2.c b/third_party/compiler_rt/extendsftf2.c
similarity index 89%
rename from libc/intrin/extendsftf2.c
rename to third_party/compiler_rt/extendsftf2.c
index 444140e1a..1509b45e4 100644
--- a/libc/intrin/extendsftf2.c
+++ b/third_party/compiler_rt/extendsftf2.c
@@ -8,6 +8,8 @@
 //===----------------------------------------------------------------------===//
 //
 
+__static_yoink("huge_compiler_rt_license");
+
 #define QUAD_PRECISION
 #include "third_party/compiler_rt/fp_lib.inc"
 
@@ -17,7 +19,7 @@
 #include "third_party/compiler_rt/fp_extend_impl.inc"
 
 COMPILER_RT_ABI long double __extendsftf2(float a) {
-  return __extendXfYf2__(a);
+    return __extendXfYf2__(a);
 }
 
 #endif
diff --git a/third_party/compiler_rt/truncdfhf2.c b/third_party/compiler_rt/truncdfhf2.c
new file mode 100644
index 000000000..9a01e2c2e
--- /dev/null
+++ b/third_party/compiler_rt/truncdfhf2.c
@@ -0,0 +1,21 @@
+//===-- lib/truncdfhf2.c - double -> half conversion --------------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SRC_DOUBLE
+#define DST_HALF
+#include "fp16_trunc_impl.inc"
+
+COMPILER_RT_ABI dst_t __truncdfhf2(double a) { return __truncXfYf2__(a); }
+
+#if defined(__ARM_EABI__)
+#if defined(COMPILER_RT_ARMHF_TARGET)
+AEABI_RTABI dst_t __aeabi_d2h(double a) { return __truncdfhf2(a); }
+#else
+COMPILER_RT_ALIAS(__truncdfhf2, __aeabi_d2h)
+#endif
+#endif
diff --git a/third_party/compiler_rt/truncsfhf2.c b/third_party/compiler_rt/truncsfhf2.c
new file mode 100644
index 000000000..d15e1884f
--- /dev/null
+++ b/third_party/compiler_rt/truncsfhf2.c
@@ -0,0 +1,27 @@
+//===-- lib/truncsfhf2.c - single -> half conversion --------------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SRC_SINGLE
+#define DST_HALF
+#include "fp16_trunc_impl.inc"
+
+// Use a forwarding definition and noinline to implement a poor man's alias,
+// as there isn't a good cross-platform way of defining one.
+COMPILER_RT_ABI NOINLINE dst_t __truncsfhf2(float a) {
+  return __truncXfYf2__(a);
+}
+
+COMPILER_RT_ABI dst_t __gnu_f2h_ieee(float a) { return __truncsfhf2(a); }
+
+#if defined(__ARM_EABI__)
+#if defined(COMPILER_RT_ARMHF_TARGET)
+AEABI_RTABI dst_t __aeabi_f2h(float a) { return __truncsfhf2(a); }
+#else
+COMPILER_RT_ALIAS(__truncsfhf2, __aeabi_f2h)
+#endif
+#endif
diff --git a/libc/intrin/trunctfsf2.c b/third_party/compiler_rt/trunctfsf2.c
similarity index 89%
rename from libc/intrin/trunctfsf2.c
rename to third_party/compiler_rt/trunctfsf2.c
index bbb961dfe..3ebda8151 100644
--- a/libc/intrin/trunctfsf2.c
+++ b/third_party/compiler_rt/trunctfsf2.c
@@ -7,6 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+__static_yoink("huge_compiler_rt_license");
+
 #define QUAD_PRECISION
 #include "third_party/compiler_rt/fp_lib.inc"
 
@@ -16,7 +18,7 @@
 #include "third_party/compiler_rt/fp_trunc_impl.inc"
 
 COMPILER_RT_ABI float __trunctfsf2(long double a) {
-  return __truncXfYf2__(a);
+    return __truncXfYf2__(a);
 }
 
 #endif
diff --git a/third_party/ctags/BUILD.mk b/third_party/ctags/BUILD.mk
index 973ce3a02..826ceefe4 100644
--- a/third_party/ctags/BUILD.mk
+++ b/third_party/ctags/BUILD.mk
@@ -19,14 +19,13 @@ THIRD_PARTY_CTAGS_DIRECTDEPS =				\
 	LIBC_LOG					\
 	LIBC_MEM					\
 	LIBC_NEXGEN32E					\
-	LIBC_PROC					\
 	LIBC_RUNTIME					\
+	LIBC_PROC					\
 	LIBC_STDIO					\
 	LIBC_STR					\
-	LIBC_SYSTEM					\
 	LIBC_SYSV					\
 	THIRD_PARTY_MUSL				\
-	THIRD_PARTY_REGEX				\
+	THIRD_PARTY_REGEX
 
 THIRD_PARTY_CTAGS_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_CTAGS_DIRECTDEPS),$($(x))))
diff --git a/third_party/dlmalloc/BUILD.mk b/third_party/dlmalloc/BUILD.mk
index 70af0e364..8b7b9d6dc 100644
--- a/third_party/dlmalloc/BUILD.mk
+++ b/third_party/dlmalloc/BUILD.mk
@@ -58,13 +58,6 @@ $(THIRD_PARTY_DLMALLOC_A_OBJS): private				\
 			-Wframe-larger-than=4096		\
 			-Walloca-larger-than=4096
 
-# avoid the legacy sse decoding penalty on avx systems
-ifeq ($(MODE),)
-$(THIRD_PARTY_DLMALLOC_A_OBJS): private				\
-		COPTS +=					\
-			-mgeneral-regs-only
-endif
-
 THIRD_PARTY_DLMALLOC_LIBS = $(foreach x,$(THIRD_PARTY_DLMALLOC_ARTIFACTS),$($(x)))
 THIRD_PARTY_DLMALLOC_SRCS = $(foreach x,$(THIRD_PARTY_DLMALLOC_ARTIFACTS),$($(x)_SRCS))
 THIRD_PARTY_DLMALLOC_HDRS = $(foreach x,$(THIRD_PARTY_DLMALLOC_ARTIFACTS),$($(x)_HDRS))
diff --git a/third_party/dlmalloc/README.cosmo b/third_party/dlmalloc/README.cosmo
index 097b9342a..0db6ea937 100644
--- a/third_party/dlmalloc/README.cosmo
+++ b/third_party/dlmalloc/README.cosmo
@@ -9,7 +9,6 @@ LICENSE
 
 LOCAL CHANGES
 
-  - Fix MT-safety bugs in DEBUG mode
   - Fix bug in dlmalloc_inspect_all()
   - Define dlmalloc_requires_more_vespene_gas()
   - Make dlmalloc scalable using sched_getcpu()
diff --git a/third_party/dlmalloc/dlmalloc.c b/third_party/dlmalloc/dlmalloc.c
index b20e28cd9..fa546f8cc 100644
--- a/third_party/dlmalloc/dlmalloc.c
+++ b/third_party/dlmalloc/dlmalloc.c
@@ -8,7 +8,7 @@
 #include "libc/intrin/bsr.h"
 #include "libc/intrin/likely.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/rdtsc.h"
 #include "libc/runtime/internal.h"
@@ -31,20 +31,23 @@
 #define FOOTERS 1
 #define MSPACES 1
 #define ONLY_MSPACES 1 // enables scalable multi-threaded malloc
-#define USE_SPIN_LOCKS 0 // set to 0 to use scalable nsync locks
 #else
 #define INSECURE 1
 #define PROCEED_ON_ERROR 1
 #define FOOTERS 0
 #define MSPACES 0
 #define ONLY_MSPACES 0
-#define USE_SPIN_LOCKS 1
 #endif
 
+#define HAVE_MMAP 1
 #define HAVE_MREMAP 1
+#define HAVE_MORECORE 0
 #define USE_LOCKS 2
+#define USE_SPIN_LOCKS 1
+#define MORECORE_CONTIGUOUS 0
 #define MALLOC_INSPECT_ALL 1
 #define ABORT_ON_ASSERT_FAILURE 0
+#define LOCK_AT_FORK 1
 #define NO_MALLOC_STATS 1
 
 #if IsModeDbg()
@@ -62,6 +65,11 @@
 #include "locks.inc"
 #include "chunks.inc"
 #include "headfoot.inc"
+
+#if ONLY_MSPACES
+#include "threaded.inc"
+#endif
+
 #include "global.inc"
 #include "system.inc"
 #include "hooks.inc"
@@ -69,11 +77,6 @@
 #include "indexing.inc"
 #include "binmaps.inc"
 #include "runtimechecks.inc"
-
-#if ONLY_MSPACES
-#include "threaded.inc"
-#endif
-
 #include "init.inc"
 #include "debuglib.inc"
 #include "statistics.inc"
@@ -84,7 +87,7 @@
 
 /* -------------------------- System allocation -------------------------- */
 
-/* Get memory from system */
+/* Get memory from system using MORECORE or MMAP */
 static void* sys_alloc(mstate m, size_t nb) {
   char* tbase = CMFAIL;
   size_t tsize = 0;
@@ -109,7 +112,90 @@ static void* sys_alloc(mstate m, size_t nb) {
       return 0;
   }
 
-  if (tbase == CMFAIL) {  /* Try MMAP */
+  /*
+    Try getting memory in any of three ways (in most-preferred to
+    least-preferred order):
+    1. A call to MORECORE that can normally contiguously extend memory.
+       (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE or
+       or main space is mmapped or a previous contiguous call failed)
+    2. A call to MMAP new space (disabled if not HAVE_MMAP).
+       Note that under the default settings, if MORECORE is unable to
+       fulfill a request, and HAVE_MMAP is true, then mmap is
+       used as a noncontiguous system allocator. This is a useful backup
+       strategy for systems with holes in address spaces -- in this case
+       sbrk cannot contiguously expand the heap, but mmap may be able to
+       find space.
+    3. A call to MORECORE that cannot usually contiguously extend memory.
+       (disabled if not HAVE_MORECORE)
+
+   In all cases, we need to request enough bytes from system to ensure
+   we can malloc nb bytes upon success, so pad with enough space for
+   top_foot, plus alignment-pad to make sure we don't lose bytes if
+   not on boundary, and round this up to a granularity unit.
+  */
+
+  if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) {
+    char* br = CMFAIL;
+    size_t ssize = asize; /* sbrk call size */
+    msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top);
+    ACQUIRE_MALLOC_GLOBAL_LOCK();
+
+    if (ss == 0) {  /* First time through or recovery */
+      char* base = (char*)CALL_MORECORE(0);
+      if (base != CMFAIL) {
+        size_t fp;
+        /* Adjust to end on a page boundary */
+        if (!is_page_aligned(base))
+          ssize += (page_align((size_t)base) - (size_t)base);
+        fp = m->footprint + ssize; /* recheck limits */
+        if (ssize > nb && ssize < HALF_MAX_SIZE_T &&
+            (m->footprint_limit == 0 ||
+             (fp > m->footprint && fp <= m->footprint_limit)) &&
+            (br = (char*)(CALL_MORECORE(ssize))) == base) {
+          tbase = base;
+          tsize = ssize;
+        }
+      }
+    }
+    else {
+      /* Subtract out existing available top space from MORECORE request. */
+      ssize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING);
+      /* Use mem here only if it did continuously extend old space */
+      if (ssize < HALF_MAX_SIZE_T &&
+          (br = (char*)(CALL_MORECORE(ssize))) == ss->base+ss->size) {
+        tbase = br;
+        tsize = ssize;
+      }
+    }
+
+    if (tbase == CMFAIL) {    /* Cope with partial failure */
+      if (br != CMFAIL) {    /* Try to use/extend the space we did get */
+        if (ssize < HALF_MAX_SIZE_T &&
+            ssize < nb + SYS_ALLOC_PADDING) {
+          size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - ssize);
+          if (esize < HALF_MAX_SIZE_T) {
+            char* end = (char*)CALL_MORECORE(esize);
+            if (end != CMFAIL)
+              ssize += esize;
+            else {            /* Can't use; try to release */
+              (void) CALL_MORECORE(-ssize);
+              br = CMFAIL;
+            }
+          }
+        }
+      }
+      if (br != CMFAIL) {    /* Use the space we did get */
+        tbase = br;
+        tsize = ssize;
+      }
+      else
+        disable_contiguous(m); /* Don't try contiguous path in the future */
+    }
+
+    RELEASE_MALLOC_GLOBAL_LOCK();
+  }
+
+  if (HAVE_MMAP && tbase == CMFAIL) {  /* Try MMAP */
     char* mp = dlmalloc_requires_more_vespene_gas(asize);
     if (mp != CMFAIL) {
       tbase = mp;
@@ -118,6 +204,24 @@ static void* sys_alloc(mstate m, size_t nb) {
     }
   }
 
+  if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */
+    if (asize < HALF_MAX_SIZE_T) {
+      char* br = CMFAIL;
+      char* end = CMFAIL;
+      ACQUIRE_MALLOC_GLOBAL_LOCK();
+      br = (char*)(CALL_MORECORE(asize));
+      end = (char*)(CALL_MORECORE(0));
+      RELEASE_MALLOC_GLOBAL_LOCK();
+      if (br != CMFAIL && end != CMFAIL && br < end) {
+        size_t ssize = end - br;
+        if (ssize > nb + TOP_FOOT_SIZE) {
+          tbase = br;
+          tsize = ssize;
+        }
+      }
+    }
+  }
+
   if (tbase != CMFAIL) {
 
     if ((m->footprint += tsize) > m->max_footprint)
@@ -257,7 +361,8 @@ static int sys_trim(mstate m, size_t pad) {
 
       if (!is_extern_segment(sp)) {
         if (is_mmapped_segment(sp)) {
-          if (sp->size >= extra &&
+          if (HAVE_MMAP &&
+              sp->size >= extra &&
               !has_segment_link(m, sp)) { /* can't shrink if pinned */
             size_t newsize = sp->size - extra;
             (void)newsize; /* placate people compiling -Wunused-variable */
@@ -268,6 +373,22 @@ static int sys_trim(mstate m, size_t pad) {
             }
           }
         }
+        else if (HAVE_MORECORE) {
+          if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */
+            extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit;
+          ACQUIRE_MALLOC_GLOBAL_LOCK();
+          {
+            /* Make sure end of memory is where we last set it. */
+            char* old_br = (char*)(CALL_MORECORE(0));
+            if (old_br == sp->base + sp->size) {
+              char* rel_br = (char*)(CALL_MORECORE(-extra));
+              char* new_br = (char*)(CALL_MORECORE(0));
+              if (rel_br != CMFAIL && new_br < old_br)
+                released = old_br - new_br;
+            }
+          }
+          RELEASE_MALLOC_GLOBAL_LOCK();
+        }
       }
 
       if (released != 0) {
@@ -279,7 +400,8 @@ static int sys_trim(mstate m, size_t pad) {
     }
 
     /* Unmap any unused mmapped segments */
-    released += release_unused_segments(m);
+    if (HAVE_MMAP)
+      released += release_unused_segments(m);
 
     /* On failure, disable autotrim to avoid repeated failed future calls */
     if (released == 0 && m->topsize > m->trim_check)
@@ -1141,15 +1263,12 @@ void* dlrealloc_single(void* oldmem, size_t bytes) {
 #endif /* FOOTERS */
     if (!PREACTION(m)) {
       mchunkptr newp = try_realloc_chunk(m, oldp, nb, MREMAP_MAYMOVE);
+      POSTACTION(m);
       if (newp != 0) {
-        /* [jart] fix realloc MT bug in DEBUG mode
-                  https://github.com/intel/linux-sgx/issues/534 */
         check_inuse_chunk(m, newp);
-        POSTACTION(m);
         mem = chunk2mem(newp);
       }
       else {
-        POSTACTION(m);
         mem = internal_malloc(m, bytes);
         if (mem != 0) {
           size_t oc = chunksize(oldp) - overhead_for(oldp);
@@ -1182,13 +1301,11 @@ void* dlrealloc_in_place(void* oldmem, size_t bytes) {
 #endif /* FOOTERS */
       if (!PREACTION(m)) {
         mchunkptr newp = try_realloc_chunk(m, oldp, nb, 0);
+        POSTACTION(m);
         if (newp == oldp) {
-          /* [jart] fix realloc MT bug in DEBUG mode
-                    https://github.com/intel/linux-sgx/issues/534 */
           check_inuse_chunk(m, newp);
           mem = oldmem;
         }
-        POSTACTION(m);
       }
     }
   }
@@ -1202,6 +1319,13 @@ void* dlmemalign_single(size_t alignment, size_t bytes) {
   return internal_memalign(gm, alignment, bytes);
 }
 
+#if USE_LOCKS
+void dlmalloc_atfork(void) {
+  bzero(&gm->mutex, sizeof(gm->mutex));
+  bzero(&malloc_global_mutex, sizeof(malloc_global_mutex));
+}
+#endif
+
 void** dlindependent_calloc(size_t n_elements, size_t elem_size,
                             void* chunks[]) {
   size_t sz = elem_size; /* serves as 1-element array */
diff --git a/third_party/dlmalloc/dlmalloc.h b/third_party/dlmalloc/dlmalloc.h
index 5bbb9a179..edb86f27a 100644
--- a/third_party/dlmalloc/dlmalloc.h
+++ b/third_party/dlmalloc/dlmalloc.h
@@ -9,6 +9,7 @@
 #define dlmallinfo                   __dlmallinfo
 #define dlmalloc                     __dlmalloc
 #define dlmalloc_abort               __dlmalloc_abort
+#define dlmalloc_atfork              __dlmalloc_atfork
 #define dlmalloc_footprint           __dlmalloc_footprint
 #define dlmalloc_footprint_limit     __dlmalloc_footprint_limit
 #define dlmalloc_inspect_all         __dlmalloc_inspect_all
@@ -526,10 +527,7 @@ void mspace_inspect_all(mspace msp,
                         void (*handler)(void*, void*, size_t, void*),
                         void* arg);
 
-void dlmalloc_pre_fork(void) libcesque;
-void dlmalloc_post_fork_parent(void) libcesque;
-void dlmalloc_post_fork_child(void) libcesque;
-
+void dlmalloc_atfork(void);
 void dlmalloc_abort(void) relegated wontreturn;
 
 COSMOPOLITAN_C_END_
diff --git a/third_party/dlmalloc/init.inc b/third_party/dlmalloc/init.inc
index ac7ce8edf..682b50408 100644
--- a/third_party/dlmalloc/init.inc
+++ b/third_party/dlmalloc/init.inc
@@ -1,51 +1,68 @@
 #include "libc/sysv/consts/auxv.h"
 #include "libc/runtime/runtime.h"
-#include "libc/nexgen32e/rdtsc.h"
 #include "libc/runtime/runtime.h"
 
-void dlmalloc_pre_fork(void) {
+/* ---------------------------- setting mparams -------------------------- */
+
+#if LOCK_AT_FORK
 #if ONLY_MSPACES
+
+static void dlmalloc_pre_fork(void) {
   mstate h;
-  for (unsigned i = ARRAYLEN(g_heaps); i--;)
+  for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
     if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
       ACQUIRE_LOCK(&h->mutex);
-#else
-  ACQUIRE_LOCK(&(gm)->mutex);
-#endif
 }
 
-void dlmalloc_post_fork_parent(void) {
-#if ONLY_MSPACES
+static void dlmalloc_post_fork_parent(void) {
   mstate h;
   for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
     if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
       RELEASE_LOCK(&h->mutex);
-#else
-  RELEASE_LOCK(&(gm)->mutex);
-#endif
 }
 
-void dlmalloc_post_fork_child(void) {
-#if ONLY_MSPACES
+static void dlmalloc_post_fork_child(void) {
   mstate h;
   for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
     if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
-      REFRESH_LOCK(&h->mutex);
-#else
-  REFRESH_LOCK(&(gm)->mutex);
-#endif
+      (void)INITIAL_LOCK(&h->mutex);
 }
 
+#else
+static void dlmalloc_pre_fork(void)         { ACQUIRE_LOCK(&(gm)->mutex); }
+static void dlmalloc_post_fork_parent(void) { RELEASE_LOCK(&(gm)->mutex); }
+static void dlmalloc_post_fork_child(void)  { (void)INITIAL_LOCK(&(gm)->mutex); }
+#endif /* ONLY_MSPACES */
+#endif /* LOCK_AT_FORK */
+
 /* Initialize mparams */
 __attribute__((__constructor__(49))) int init_mparams(void) {
+#ifdef NEED_GLOBAL_LOCK_INIT
+  if (malloc_global_mutex_status <= 0)
+    init_malloc_global_mutex();
+#endif
 
+  // ACQUIRE_MALLOC_GLOBAL_LOCK();
   if (mparams.magic == 0) {
     size_t magic;
     size_t psize;
     size_t gsize;
 
-    psize = __pagesize;
+#if defined(__COSMOPOLITAN__)
+    psize = getpagesize();
     gsize = DEFAULT_GRANULARITY ? DEFAULT_GRANULARITY : psize;
+#elif !defined(WIN32)
+    psize = malloc_getpagesize;
+    gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);
+#else /* WIN32 */
+    {
+      SYSTEM_INFO system_info;
+      GetSystemInfo(&system_info);
+      psize = system_info.dwPageSize;
+      gsize = ((DEFAULT_GRANULARITY != 0)?
+               DEFAULT_GRANULARITY : system_info.dwAllocationGranularity);
+    }
+#endif /* WIN32 */
 
     /* Sanity-check configuration:
        size_t must be unsigned and as wide as pointer type.
@@ -66,7 +83,11 @@ __attribute__((__constructor__(49))) int init_mparams(void) {
     mparams.page_size = psize;
     mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD;
     mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD;
+#if MORECORE_CONTIGUOUS
+    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT;
+#else  /* MORECORE_CONTIGUOUS */
     mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT;
+#endif /* MORECORE_CONTIGUOUS */
 
 #if !ONLY_MSPACES
     /* Set up lock for main malloc area */
@@ -74,6 +95,12 @@ __attribute__((__constructor__(49))) int init_mparams(void) {
     (void)INITIAL_LOCK(&gm->mutex);
 #endif
 
+#if LOCK_AT_FORK
+    pthread_atfork(&dlmalloc_pre_fork,
+                   &dlmalloc_post_fork_parent,
+                   &dlmalloc_post_fork_child);
+#endif
+
     {
 #if USE_DEV_RANDOM
       int fd;
@@ -86,7 +113,7 @@ __attribute__((__constructor__(49))) int init_mparams(void) {
       }
       else
 #endif /* USE_DEV_RANDOM */
-      magic = (size_t)(rdtsc() ^ (size_t)0x55555555U);
+      magic = (size_t)(_rand64() ^ (size_t)0x55555555U);
       magic |= (size_t)8U;    /* ensure nonzero */
       magic &= ~(size_t)7U;   /* improve chances of fault for bad values */
       /* Until memory modes commonly available, use volatile-write */
@@ -94,6 +121,8 @@ __attribute__((__constructor__(49))) int init_mparams(void) {
     }
   }
 
+  // RELEASE_MALLOC_GLOBAL_LOCK();
+
 #if ONLY_MSPACES
   threaded_dlmalloc();
 #endif
diff --git a/third_party/dlmalloc/locks.inc b/third_party/dlmalloc/locks.inc
index ea962c778..037442ac5 100644
--- a/third_party/dlmalloc/locks.inc
+++ b/third_party/dlmalloc/locks.inc
@@ -1,7 +1,3 @@
-#include "libc/cosmo.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/maps.h"
-#include "libc/thread/thread.h"
 
 /* --------------------------- Lock preliminaries ------------------------ */
 
@@ -37,94 +33,56 @@
 
 #define MLOCK_T atomic_uint
 
-static int malloc_inlk(MLOCK_T *lk) {
-  atomic_store_explicit(lk, 0, memory_order_relaxed);
-  return 0;
-}
-
 static int malloc_wipe(MLOCK_T *lk) {
-  atomic_store_explicit(lk, 0, memory_order_relaxed);
-  return 0;
-}
-
-static int malloc_kilk(MLOCK_T *lk) {
+  bzero(lk, sizeof(*lk));
   return 0;
 }
 
 static int malloc_lock(MLOCK_T *lk) {
-  for (;;) {
-    if (!atomic_exchange_explicit(lk, 1, memory_order_acquire))
-      break;
-    for (;;)
-      if (!atomic_load_explicit(lk, memory_order_relaxed))
-        break;
+  if (!__threaded) return 0;
+  while (atomic_exchange_explicit(lk, 1, memory_order_acquire)) {
+    pthread_pause_np();
   }
   return 0;
 }
 
-static int malloc_unlk(MLOCK_T *lk) {
+static int malloc_unlock(MLOCK_T *lk) {
+  if (!__threaded) return 0;
   atomic_store_explicit(lk, 0, memory_order_release);
   return 0;
 }
 
 #else
 
-#define MLOCK_T struct MallocLock
+#define MLOCK_T nsync_mu
 
-struct MallocLock {
-#if DEBUG
-  void *edges;
-#endif
-  nsync_mu mu;
-};
-
-static int malloc_inlk(MLOCK_T *lk) {
+static int malloc_wipe(MLOCK_T *lk) {
   bzero(lk, sizeof(*lk));
   return 0;
 }
 
-static int malloc_wipe(MLOCK_T *lk) {
-  bzero(&lk->mu, sizeof(lk->mu));
-  return 0;
-}
-
-static int malloc_kilk(MLOCK_T *lk) {
-  return 0;
-}
-
 static int malloc_lock(MLOCK_T *lk) {
-#if DEBUG
-  __deadlock_check(lk, 0);
-#endif
-  nsync_mu_lock(&lk->mu);
-#if DEBUG
-  __deadlock_record(lk, 0);
-  __deadlock_track(lk, 0);
-#endif
+  if (!__threaded) return 0;
+  nsync_mu_lock(lk);
   return 0;
 }
 
-static int malloc_unlk(MLOCK_T *lk) {
-#if DEBUG
-  if (__deadlock_tracked(lk) == 0) {
-    kprintf("error: unlock malloc mutex not owned by caller: %t\n", lk);
-    DebugBreak();
-  }
-#endif
-  nsync_mu_unlock(&lk->mu);
-#if DEBUG
-  __deadlock_untrack(lk);
-#endif
+static int malloc_unlock(MLOCK_T *lk) {
+  if (!__threaded) return 0;
+  nsync_mu_unlock(lk);
   return 0;
 }
 
 #endif
 
 #define ACQUIRE_LOCK(lk) malloc_lock(lk)
-#define RELEASE_LOCK(lk) malloc_unlk(lk)
-#define INITIAL_LOCK(lk) malloc_inlk(lk)
-#define REFRESH_LOCK(lk) malloc_wipe(lk)
-#define DESTROY_LOCK(lk) malloc_kilk(lk)
+#define RELEASE_LOCK(lk) malloc_unlock(lk)
+#define INITIAL_LOCK(lk) malloc_wipe(lk)
+#define DESTROY_LOCK(lk) malloc_wipe(lk)
+#define ACQUIRE_MALLOC_GLOBAL_LOCK() ACQUIRE_LOCK(&malloc_global_mutex);
+#define RELEASE_MALLOC_GLOBAL_LOCK() RELEASE_LOCK(&malloc_global_mutex);
+
+static MLOCK_T malloc_global_mutex;
 
 #define USE_LOCK_BIT               (2U)
 
diff --git a/third_party/dlmalloc/mspaces.inc b/third_party/dlmalloc/mspaces.inc
index d17d96549..1f048d0eb 100644
--- a/third_party/dlmalloc/mspaces.inc
+++ b/third_party/dlmalloc/mspaces.inc
@@ -368,15 +368,12 @@ void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) {
 #endif /* FOOTERS */
     if (!PREACTION(m)) {
       mchunkptr newp = try_realloc_chunk(m, oldp, nb, 1);
+      POSTACTION(m);
       if (newp != 0) {
-        /* [jart] fix realloc MT bug in DEBUG mode
-                  https://github.com/intel/linux-sgx/issues/534 */
         check_inuse_chunk(m, newp);
-        POSTACTION(m);
         mem = chunk2mem(newp);
       }
       else {
-        POSTACTION(m);
         mem = mspace_malloc(m, bytes);
         if (mem != 0) {
           size_t oc = chunksize(oldp) - overhead_for(oldp);
@@ -410,13 +407,11 @@ void* mspace_realloc_in_place(mspace msp, void* oldmem, size_t bytes) {
 #endif /* FOOTERS */
       if (!PREACTION(m)) {
         mchunkptr newp = try_realloc_chunk(m, oldp, nb, 0);
+        POSTACTION(m);
         if (newp == oldp) {
-          /* [jart] fix realloc_in_place MT bug in DEBUG mode
-                    https://github.com/intel/linux-sgx/issues/534 */
           check_inuse_chunk(m, newp);
           mem = oldmem;
         }
-        POSTACTION(m);
       }
     }
   }
diff --git a/third_party/dlmalloc/platform.inc b/third_party/dlmalloc/platform.inc
index 5385a7f88..8fab2e29e 100644
--- a/third_party/dlmalloc/platform.inc
+++ b/third_party/dlmalloc/platform.inc
@@ -75,6 +75,9 @@
 #ifndef MALLOC_INSPECT_ALL
 #define MALLOC_INSPECT_ALL 0
 #endif  /* MALLOC_INSPECT_ALL */
+#ifndef HAVE_MMAP
+#define HAVE_MMAP 1
+#endif  /* HAVE_MMAP */
 #ifndef MMAP_CLEARS
 #define MMAP_CLEARS 1
 #endif  /* MMAP_CLEARS */
@@ -89,17 +92,48 @@
 #ifndef MALLOC_FAILURE_ACTION
 #define MALLOC_FAILURE_ACTION  errno = ENOMEM;
 #endif  /* MALLOC_FAILURE_ACTION */
+#ifndef HAVE_MORECORE
+#if ONLY_MSPACES
+#define HAVE_MORECORE 0
+#else   /* ONLY_MSPACES */
+#define HAVE_MORECORE 1
+#endif  /* ONLY_MSPACES */
+#endif  /* HAVE_MORECORE */
+#if !HAVE_MORECORE
+#define MORECORE_CONTIGUOUS 0
+#else   /* !HAVE_MORECORE */
+#define MORECORE_DEFAULT sbrk
+#ifndef MORECORE_CONTIGUOUS
+#define MORECORE_CONTIGUOUS 1
+#endif  /* MORECORE_CONTIGUOUS */
+#endif  /* HAVE_MORECORE */
 #ifndef DEFAULT_GRANULARITY
+#if (MORECORE_CONTIGUOUS || defined(WIN32))
+#define DEFAULT_GRANULARITY (0)  /* 0 means to compute in init_mparams */
+#else   /* MORECORE_CONTIGUOUS */
 #define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
+#endif  /* MORECORE_CONTIGUOUS */
 #endif  /* DEFAULT_GRANULARITY */
 #ifndef DEFAULT_TRIM_THRESHOLD
+#ifndef MORECORE_CANNOT_TRIM
 #define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U)
+#else   /* MORECORE_CANNOT_TRIM */
+#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T
+#endif  /* MORECORE_CANNOT_TRIM */
 #endif  /* DEFAULT_TRIM_THRESHOLD */
 #ifndef DEFAULT_MMAP_THRESHOLD
+#if HAVE_MMAP
 #define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U)
+#else   /* HAVE_MMAP */
+#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T
+#endif  /* HAVE_MMAP */
 #endif  /* DEFAULT_MMAP_THRESHOLD */
 #ifndef MAX_RELEASE_CHECK_RATE
+#if HAVE_MMAP
 #define MAX_RELEASE_CHECK_RATE 4095
+#else
+#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T
+#endif /* HAVE_MMAP */
 #endif /* MAX_RELEASE_CHECK_RATE */
 #ifndef USE_BUILTIN_FFS
 #define USE_BUILTIN_FFS 0
@@ -151,6 +185,165 @@
   ========================================================================
 */
 
+/* #include "malloc.h" */
+
+/*------------------------------ internal #includes ---------------------- */
+
+#ifdef _MSC_VER
+#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
+#endif /* _MSC_VER */
+#if !NO_MALLOC_STATS
+#endif /* NO_MALLOC_STATS */
+#ifndef LACKS_ERRNO_H
+#include <errno.h>       /* for MALLOC_FAILURE_ACTION */
+#endif /* LACKS_ERRNO_H */
+#ifdef DEBUG
+#if ABORT_ON_ASSERT_FAILURE
+#endif /* ABORT_ON_ASSERT_FAILURE */
+#else  /* DEBUG */
+#ifndef assert
+#define assert(x)
+#endif
+#define DEBUG 0
+#endif /* DEBUG */
+#if !defined(WIN32) && !defined(LACKS_TIME_H)
+#include <time.h>        /* for magic initialization */
+#endif /* WIN32 */
+#ifndef LACKS_STDLIB_H
+#include <stdlib.h>      /* for abort() */
+#endif /* LACKS_STDLIB_H */
+#ifndef LACKS_STRING_H
+#include <string.h>      /* for memset etc */
+#endif  /* LACKS_STRING_H */
+#if USE_BUILTIN_FFS
+#ifndef LACKS_STRINGS_H
+#include <strings.h>     /* for ffs */
+#endif /* LACKS_STRINGS_H */
+#endif /* USE_BUILTIN_FFS */
+#if HAVE_MMAP
+#ifndef LACKS_SYS_MMAN_H
+/* On some versions of linux, mremap decl in mman.h needs __USE_GNU set */
+#if (defined(linux) && !defined(__USE_GNU))
+#define __USE_GNU 1
+#include <sys/mman.h>    /* for mmap */
+#undef __USE_GNU
+#else
+#include <sys/mman.h>    /* for mmap */
+#endif /* linux */
+#endif /* LACKS_SYS_MMAN_H */
+#ifndef LACKS_FCNTL_H
+#include <fcntl.h>
+#endif /* LACKS_FCNTL_H */
+#endif /* HAVE_MMAP */
+#ifndef LACKS_UNISTD_H
+#include <unistd.h>     /* for sbrk, sysconf */
+#else /* LACKS_UNISTD_H */
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && !defined(__COSMOPOLITAN__)
+extern void*     sbrk(ptrdiff_t);
+#endif /* FreeBSD etc */
+#endif /* LACKS_UNISTD_H */
+
+/* Declarations for locking */
+#if USE_LOCKS
+#ifndef WIN32
+#if defined (__SVR4) && defined (__sun)  /* solaris */
+#elif !defined(LACKS_SCHED_H)
+#endif /* solaris or LACKS_SCHED_H */
+#if (defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0) || !USE_SPIN_LOCKS
+#endif /* USE_RECURSIVE_LOCKS ... */
+#elif defined(_MSC_VER)
+#ifndef _M_AMD64
+/* These are already defined on AMD64 builds */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp);
+LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* _M_AMD64 */
+#pragma intrinsic (_InterlockedCompareExchange)
+#pragma intrinsic (_InterlockedExchange)
+#define interlockedcompareexchange _InterlockedCompareExchange
+#define interlockedexchange _InterlockedExchange
+#elif defined(WIN32) && defined(__GNUC__)
+#define interlockedcompareexchange(a, b, c) __sync_val_compare_and_swap(a, c, b)
+#define interlockedexchange __sync_lock_test_and_set
+#endif /* Win32 */
+#else /* USE_LOCKS */
+#endif /* USE_LOCKS */
+
+#ifndef LOCK_AT_FORK
+#define LOCK_AT_FORK 0
+#endif
+
+/* Declarations for bit scanning on win32 */
+#if defined(_MSC_VER) && _MSC_VER>=1300
+#ifndef BitScanForward /* Try to avoid pulling in WinNT.h */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
+unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#define BitScanForward _BitScanForward
+#define BitScanReverse _BitScanReverse
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#endif /* BitScanForward */
+#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
+
+#ifndef WIN32
+#ifndef malloc_getpagesize
+#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
+#    ifndef _SC_PAGE_SIZE
+#      define _SC_PAGE_SIZE _SC_PAGESIZE
+#    endif
+#  endif
+#  ifdef _SC_PAGE_SIZE
+#    define malloc_getpagesize 4096 /*sysconf(_SC_PAGE_SIZE)*/
+#  else
+#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
+       extern size_t getpagesize();
+#      define malloc_getpagesize getpagesize()
+#    else
+#      ifdef WIN32 /* use supplied emulation of getpagesize */
+#        define malloc_getpagesize getpagesize()
+#      else
+#        ifndef LACKS_SYS_PARAM_H
+#          include <sys/param.h>
+#        endif
+#        ifdef EXEC_PAGESIZE
+#          define malloc_getpagesize EXEC_PAGESIZE
+#        else
+#          ifdef NBPG
+#            ifndef CLSIZE
+#              define malloc_getpagesize NBPG
+#            else
+#              define malloc_getpagesize (NBPG * CLSIZE)
+#            endif
+#          else
+#            ifdef NBPC
+#              define malloc_getpagesize NBPC
+#            else
+#              ifdef PAGESIZE
+#                define malloc_getpagesize PAGESIZE
+#              else /* just guess */
+#                define malloc_getpagesize ((size_t)4096U)
+#              endif
+#            endif
+#          endif
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+#endif
+
 /* ------------------- size_t and alignment properties -------------------- */
 
 /* The byte and bit size of a size_t */
@@ -181,53 +374,141 @@
 
 /* -------------------------- MMAP preliminaries ------------------------- */
 
+/*
+   If HAVE_MORECORE or HAVE_MMAP are false, we just define calls and
+   checks to fail so compiler optimizer can delete code rather than
+   using so many "#if"s.
+*/
+
+
 /* MORECORE and MMAP must return MFAIL on failure */
 #define MFAIL                NULL
 #define CMFAIL               ((char*)(MFAIL)) /* defined for convenience */
 
+#if HAVE_MMAP
+
+#ifndef WIN32
 #define MUNMAP_DEFAULT(a, s)  munmap((a), (s))
 #define MMAP_PROT            (PROT_READ|PROT_WRITE)
+#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
+#define MAP_ANONYMOUS        MAP_ANON
+#endif /* MAP_ANON */
+#ifdef MAP_ANONYMOUS
 #define MMAP_FLAGS           (MAP_PRIVATE|MAP_ANONYMOUS)
 #define MMAP_DEFAULT(s)       _mapanon(s)
+#else /* MAP_ANONYMOUS */
+/*
+   Nearly all versions of mmap support MAP_ANONYMOUS, so the following
+   is unlikely to be needed, but is supplied just in case.
+*/
+#define MMAP_FLAGS           (MAP_PRIVATE)
+static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
+#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? \
+           (dev_zero_fd = open("/dev/zero", O_RDWR), \
+            mmap_no(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \
+            mmap_no(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0))
+#endif /* MAP_ANONYMOUS */
+
 #define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s)
 
+#else /* WIN32 */
+
+/* Win32 MMAP via VirtualAlloc */
+FORCEINLINE void* win32mmap(size_t size) {
+  void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
+  return (ptr != 0)? ptr: MFAIL;
+}
+
+/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */
+FORCEINLINE void* win32direct_mmap(size_t size) {
+  void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN,
+                           PAGE_READWRITE);
+  return (ptr != 0)? ptr: MFAIL;
+}
+
+/* This function supports releasing coalesed segments */
+FORCEINLINE int win32munmap(void* ptr, size_t size) {
+  MEMORY_BASIC_INFORMATION minfo;
+  char* cptr = (char*)ptr;
+  while (size) {
+    if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0)
+      return -1;
+    if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr ||
+        minfo.State != MEM_COMMIT || minfo.RegionSize > size)
+      return -1;
+    if (VirtualFree(cptr, 0, MEM_RELEASE) == 0)
+      return -1;
+    cptr += minfo.RegionSize;
+    size -= minfo.RegionSize;
+  }
+  return 0;
+}
+
+#define MMAP_DEFAULT(s)             win32mmap(s)
+#define MUNMAP_DEFAULT(a, s)        win32munmap((a), (s))
+#define DIRECT_MMAP_DEFAULT(s)      win32direct_mmap(s)
+#endif /* WIN32 */
+#endif /* HAVE_MMAP */
+
 #if HAVE_MREMAP
 #ifndef WIN32
 #define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
 #endif /* WIN32 */
 #endif /* HAVE_MREMAP */
 
+/**
+ * Define CALL_MORECORE
+ */
+#if HAVE_MORECORE
+    #ifdef MORECORE
+        #define CALL_MORECORE(S)    MORECORE(S)
+    #else  /* MORECORE */
+        #define CALL_MORECORE(S)    MORECORE_DEFAULT(S)
+    #endif /* MORECORE */
+#else  /* HAVE_MORECORE */
+    #define CALL_MORECORE(S)        MFAIL
+#endif /* HAVE_MORECORE */
+
 /**
  * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP
  */
-#define USE_MMAP_BIT            (SIZE_T_ONE)
+#if HAVE_MMAP
+    #define USE_MMAP_BIT            (SIZE_T_ONE)
 
-#ifdef MMAP
-#define CALL_MMAP(s)        MMAP(s)
-#else /* MMAP */
-#define CALL_MMAP(s)        MMAP_DEFAULT(s)
-#endif /* MMAP */
+    #ifdef MMAP
+        #define CALL_MMAP(s)        MMAP(s)
+    #else /* MMAP */
+        #define CALL_MMAP(s)        MMAP_DEFAULT(s)
+    #endif /* MMAP */
+    #ifdef MUNMAP
+        #define CALL_MUNMAP(a, s)   MUNMAP((a), (s))
+    #else /* MUNMAP */
+        #define CALL_MUNMAP(a, s)   MUNMAP_DEFAULT((a), (s))
+    #endif /* MUNMAP */
+    #ifdef DIRECT_MMAP
+        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
+    #else /* DIRECT_MMAP */
+        #define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
+    #endif /* DIRECT_MMAP */
+#else  /* HAVE_MMAP */
+    #define USE_MMAP_BIT            (SIZE_T_ZERO)
 
-#ifdef MUNMAP
-#define CALL_MUNMAP(a, s)   MUNMAP((a), (s))
-#else /* MUNMAP */
-#define CALL_MUNMAP(a, s)   MUNMAP_DEFAULT((a), (s))
-#endif /* MUNMAP */
-
-#ifdef DIRECT_MMAP
-#define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
-#else /* DIRECT_MMAP */
-#define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
-#endif /* DIRECT_MMAP */
+    #define MMAP(s)                 MFAIL
+    #define MUNMAP(a, s)            (-1)
+    #define DIRECT_MMAP(s)          MFAIL
+    #define CALL_DIRECT_MMAP(s)     DIRECT_MMAP(s)
+    #define CALL_MMAP(s)            MMAP(s)
+    #define CALL_MUNMAP(a, s)       MUNMAP((a), (s))
+#endif /* HAVE_MMAP */
 
 /**
  * Define CALL_MREMAP
  */
-#if HAVE_MREMAP
+#if HAVE_MMAP && HAVE_MREMAP
     #define CALL_MREMAP(addr, osz, nsz, mv) ({ int olderr = errno; void *res = mremap((addr), (osz), (nsz), (mv)); if (res == MAP_FAILED) errno = olderr; res; })
-#else  /* HAVE_MREMAP */
+#else  /* HAVE_MMAP && HAVE_MREMAP */
     #define CALL_MREMAP(addr, osz, nsz, mv)     MAP_FAILED
-#endif /* HAVE_MREMAP */
+#endif /* HAVE_MMAP && HAVE_MREMAP */
 
 /* mstate bit set if continguous morecore disabled or failed */
 #define USE_NONCONTIGUOUS_BIT (4U)
diff --git a/third_party/dlmalloc/runtimechecks.inc b/third_party/dlmalloc/runtimechecks.inc
index dc86de808..df3fd226c 100644
--- a/third_party/dlmalloc/runtimechecks.inc
+++ b/third_party/dlmalloc/runtimechecks.inc
@@ -28,7 +28,7 @@
 */
 
 #if !INSECURE
-/* Check if address a is at least as high as any from MMAP */
+/* Check if address a is at least as high as any from MORECORE or MMAP */
 #define ok_address(M, a) ((char*)(a) >= (M)->least_addr)
 /* Check if address of next chunk n is higher than base chunk p */
 #define ok_next(p, n)    ((char*)(p) < (char*)(n))
diff --git a/third_party/dlmalloc/system.inc b/third_party/dlmalloc/system.inc
index 443fdfac9..b8fc6ab79 100644
--- a/third_party/dlmalloc/system.inc
+++ b/third_party/dlmalloc/system.inc
@@ -13,7 +13,11 @@
 
 #define use_mmap(M)           ((M)->mflags &   USE_MMAP_BIT)
 #define enable_mmap(M)        ((M)->mflags |=  USE_MMAP_BIT)
+#if HAVE_MMAP
 #define disable_mmap(M)       ((M)->mflags &= ~USE_MMAP_BIT)
+#else
+#define disable_mmap(M)
+#endif
 
 #define use_noncontiguous(M)  ((M)->mflags &   USE_NONCONTIGUOUS_BIT)
 #define disable_contiguous(M) ((M)->mflags |=  USE_NONCONTIGUOUS_BIT)
@@ -74,7 +78,11 @@ static int has_segment_link(mstate m, msegmentptr ss) {
   }
 }
 
+#ifndef MORECORE_CANNOT_TRIM
 #define should_trim(M,s)  ((s) > (M)->trim_check)
+#else  /* MORECORE_CANNOT_TRIM */
+#define should_trim(M,s)  (0)
+#endif /* MORECORE_CANNOT_TRIM */
 
 /*
   TOP_FOOT_SIZE is padding at the end of a segment, including space
diff --git a/third_party/dlmalloc/threaded.inc b/third_party/dlmalloc/threaded.inc
index 3dbfb5b35..83d608b9b 100644
--- a/third_party/dlmalloc/threaded.inc
+++ b/third_party/dlmalloc/threaded.inc
@@ -20,12 +20,13 @@
 #include "libc/intrin/magicu.h"
 #include "libc/intrin/strace.h"
 #include "libc/intrin/weaken.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
+#include "libc/nexgen32e/rdtscp.h"
+#include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
 #include "libc/thread/thread.h"
-#include "libc/thread/threads.h"
-#include "libc/errno.h"
-#include "libc/calls/struct/cpuset.h"
+#include "libc/runtime/runtime.h"
+#include "libc/intrin/weaken.h"
 #include "third_party/dlmalloc/dlmalloc.h"
 
 #if !FOOTERS || !MSPACES
@@ -33,7 +34,6 @@
 #endif
 
 static struct magicu magiu;
-static unsigned g_cpucount;
 static unsigned g_heapslen;
 static mstate g_heaps[128];
 
@@ -61,23 +61,8 @@ int dlmalloc_trim(size_t pad) {
 }
 
 size_t dlbulk_free(void *array[], size_t nelem) {
-  size_t j = 0;
-  mstate msp = (mstate)-1;
-  for (size_t i = 0; i < nelem; ++i) {
-    mstate next;
-    if (array[i]) {
-      next = get_mstate_for(mem2chunk(array[i]));
-      if (next != msp) {
-        if (j)
-          mspace_bulk_free(msp, array, j);
-        msp = next;
-        j = 0;
-      }
-      array[j++] = array[i];
-    }
-  }
-  if (j)
-    mspace_bulk_free(msp, array, j);
+  for (size_t i = 0; i < nelem; ++i)
+    mspace_free(0, array[i]);
   return 0;
 }
 
@@ -105,31 +90,18 @@ void dlmalloc_inspect_all(void handler(void *start, void *end,
   }
 }
 
-// we make malloc() scalable basically by
-//
-//     return g_heaps[sched_getcpu() / 2];
-//
-// except we cache the syscall result using thread-local storage. on
-// some platforms, it's not possible to use sched_getcpu() so we use
-// arbitrary assignments to help scalability, but may not be optimal
-static mstate get_arena(void) {
-  static atomic_uint assign;
-  static thread_local unsigned i;
-  static thread_local unsigned n;
-  if (n == 50)
-    n = 0;
-  if (!n) {
-    int e = errno;
-    i = sched_getcpu();
-    if (i == -1) {
-      errno = e;
-      i = atomic_fetch_add_explicit(&assign, 1, memory_order_relaxed);
-      i %= g_cpucount;
-    }
-    i = __magicu_div(i, magiu) % g_heapslen;
-  }
-  ++n;
-  return g_heaps[i];
+forceinline mstate get_arena(void) {
+  unsigned cpu;
+#ifdef __x86_64__
+  unsigned tsc_aux;
+  rdtscp(&tsc_aux);
+  cpu = TSC_AUX_CORE(tsc_aux);
+#else
+  long tpidr_el0;
+  asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
+  cpu = tpidr_el0 & 255;
+#endif
+  return g_heaps[__magicu_div(cpu, magiu) % g_heapslen];
 }
 
 static void *dlmalloc_single(size_t n) {
@@ -202,18 +174,19 @@ static void threaded_dlmalloc(void) {
   if (!_weaken(pthread_create))
     return use_single_heap(false);
 
+  if (!IsAarch64() && !X86_HAVE(RDTSCP))
+    return use_single_heap(true);
+
   // determine how many independent heaps we should install
   // by default we do an approximation of one heap per core
   // this code makes the c++ stl go 164x faster on my ryzen
-  g_cpucount = cpus = __get_cpu_count();
-  if (cpus == -1) {
+  cpus = __get_cpu_count();
+  if (cpus == -1)
     heaps = 1;
-    g_cpucount = 1;
-  } else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT"))) {
+  else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT")))
     heaps = dlmalloc_atoi(var);
-  } else {
+  else
     heaps = cpus >> 1;
-  }
   if (heaps <= 1)
     return use_single_heap(true);
   if (heaps > ARRAYLEN(g_heaps))
diff --git a/third_party/double-conversion/BUILD.mk b/third_party/double-conversion/BUILD.mk
index 847f02f5e..10da7f072 100644
--- a/third_party/double-conversion/BUILD.mk
+++ b/third_party/double-conversion/BUILD.mk
@@ -34,8 +34,7 @@ THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS =					\
 	LIBC_MEM								\
 	LIBC_STR								\
 	LIBC_TINYMATH								\
-	THIRD_PARTY_LIBCXXABI							\
-	THIRD_PARTY_LIBUNWIND
+	THIRD_PARTY_LIBCXXABI
 
 THIRD_PARTY_DOUBLECONVERSION_A_DEPS :=						\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/gdtoa/dmisc.c b/third_party/gdtoa/dmisc.c
index 8f5b55a84..a0871ddec 100644
--- a/third_party/gdtoa/dmisc.c
+++ b/third_party/gdtoa/dmisc.c
@@ -49,8 +49,6 @@ __gdtoa_rv_alloc(int i, ThInfo **PTI)
 	     j <<= 1)
 		k++;
 	r = (int *)__gdtoa_Balloc(k, PTI);
-	if (r == NULL)
-		return NULL;
 	*r = k;
 	return (char *)(r + 1);
 }
diff --git a/third_party/gdtoa/dtoa.c b/third_party/gdtoa/dtoa.c
index a6eafc70e..6982c73b0 100644
--- a/third_party/gdtoa/dtoa.c
+++ b/third_party/gdtoa/dtoa.c
@@ -246,9 +246,6 @@ dtoa(double d0, int mode, int ndigits, int *decpt, int *sign, char **rve)
 			i = 1;
 	}
 	s = s0 = __gdtoa_rv_alloc(i, &TI);
-	if (s0 == NULL)
-		goto ret1;
-
 	if (mode > 1 && Rounding != 1)
 		leftright = 0;
 	if (ilim >= 0 && ilim <= Quick_max && try_quick) {
@@ -617,8 +614,7 @@ retc:
 		--s;
 ret1:
 	__gdtoa_Bfree(b, &TI);
-	if (s != NULL)
-		*s = 0;
+	*s = 0;
 	*decpt = k + 1;
 	if (rve)
 		*rve = s;
diff --git a/third_party/gdtoa/gdtoa.c b/third_party/gdtoa/gdtoa.c
index aba7358c2..67199c53d 100644
--- a/third_party/gdtoa/gdtoa.c
+++ b/third_party/gdtoa/gdtoa.c
@@ -286,16 +286,12 @@ gdtoa(const FPI *fpi, int be, ULong *bits, int *kindp, int mode, int ndigits, in
 			i = 1;
 	}
 	s = s0 = __gdtoa_rv_alloc(i, &TI);
-	if (s0 == NULL)
-		goto ret1;
 	if (mode <= 1)
 		rdir = 0;
 	else if ( (rdir = fpi->rounding - 1) !=0) {
 		if (rdir < 0)
 			rdir = 2;
-		// note that we check for fpi->rounding == 0 as in that case we
-		// must *always* round towards 0, i.e. downwards, with rdir = 2
-		if (kind & STRTOG_Neg && fpi->rounding != 0)
+		if (kind & STRTOG_Neg)
 			rdir = 3 - rdir;
 	}
 	/* Now rdir = 0 ==> round near, 1 ==> round up, 2 ==> round down. */
@@ -677,12 +673,10 @@ ret:
 		__gdtoa_Bfree(mhi, &TI);
 	}
 ret1:
-	if (s != NULL)
-		while(s > s0 && s[-1] == '0')
-			--s;
+	while(s > s0 && s[-1] == '0')
+		--s;
 	__gdtoa_Bfree(b, &TI);
-	if (s != NULL)
-		*s = 0;
+	*s = 0;
 	*decpt = k + 1;
 	if (rve)
 		*rve = s;
diff --git a/third_party/gdtoa/gethex.c b/third_party/gdtoa/gethex.c
index 22fd02d23..36f5cf751 100644
--- a/third_party/gdtoa/gethex.c
+++ b/third_party/gdtoa/gethex.c
@@ -169,9 +169,7 @@ pcheck:
 			L = 0;
 			n = 0;
 		}
-		// We can shift in a way that changes the sign bit or overflows,
-		// so we need to cast to unsigned to avoid undefined behavior
-		L |= (unsigned)(__gdtoa_hexdig[*s1] & 0x0f) << n;
+		L |= (__gdtoa_hexdig[*s1] & 0x0f) << n;
 		n += 4;
 	}
 	*x++ = L;
diff --git a/third_party/gdtoa/lock.c b/third_party/gdtoa/lock.c
deleted file mode 100644
index 1e5cc36de..000000000
--- a/third_party/gdtoa/lock.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  The author of this software is David M. Gay.                                │
-│  Please send bug reports to David M. Gay <dmg@acm.org>                       │
-│                          or Justine Tunney <jtunney@gmail.com>               │
-│                                                                              │
-│  Copyright (C) 1998, 1999 by Lucent Technologies                             │
-│  All Rights Reserved                                                         │
-│                                                                              │
-│  Permission to use, copy, modify, and distribute this software and           │
-│  its documentation for any purpose and without fee is hereby                 │
-│  granted, provided that the above copyright notice appear in all             │
-│  copies and that both that the copyright notice and this                     │
-│  permission notice and warranty disclaimer appear in supporting              │
-│  documentation, and that the name of Lucent or any of its entities           │
-│  not be used in advertising or publicity pertaining to                       │
-│  distribution of the software without specific, written prior                │
-│  permission.                                                                 │
-│                                                                              │
-│  LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,               │
-│  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.            │
-│  IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY           │
-│  SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES                   │
-│  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER             │
-│  IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,              │
-│  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF              │
-│  THIS SOFTWARE.                                                              │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/thread/posixthread.internal.h"
-#include "third_party/gdtoa/lock.h"
-
-static pthread_mutex_t __gdtoa_lock_obj = PTHREAD_MUTEX_INITIALIZER;
-static pthread_mutex_t __gdtoa_lock1_obj = PTHREAD_MUTEX_INITIALIZER;
-
-void
-__gdtoa_lock(void)
-{
-	_pthread_mutex_lock(&__gdtoa_lock_obj);
-}
-
-void
-__gdtoa_unlock(void)
-{
-	_pthread_mutex_unlock(&__gdtoa_lock_obj);
-}
-
-void
-__gdtoa_wipe(void)
-{
-	_pthread_mutex_wipe_np(&__gdtoa_lock_obj);
-}
-
-void
-__gdtoa_lock1(void)
-{
-	_pthread_mutex_lock(&__gdtoa_lock1_obj);
-}
-
-void
-__gdtoa_unlock1(void)
-{
-	_pthread_mutex_unlock(&__gdtoa_lock1_obj);
-}
-
-void
-__gdtoa_wipe1(void)
-{
-	_pthread_mutex_wipe_np(&__gdtoa_lock1_obj);
-}
diff --git a/third_party/gdtoa/lock.h b/third_party/gdtoa/lock.h
deleted file mode 100644
index 71af847aa..000000000
--- a/third_party/gdtoa/lock.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GDTOA_LOCK_H_
-#define COSMOPOLITAN_THIRD_PARTY_GDTOA_LOCK_H_
-#include "libc/thread/thread.h"
-COSMOPOLITAN_C_START_
-
-void __gdtoa_lock(void);
-void __gdtoa_unlock(void);
-void __gdtoa_wipe(void);
-
-void __gdtoa_lock1(void);
-void __gdtoa_unlock1(void);
-void __gdtoa_wipe1(void);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GDTOA_LOCK_H_ */
diff --git a/third_party/gdtoa/misc.c b/third_party/gdtoa/misc.c
index 2d3809a9c..75d3883d8 100644
--- a/third_party/gdtoa/misc.c
+++ b/third_party/gdtoa/misc.c
@@ -30,14 +30,51 @@
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 #include "third_party/gdtoa/gdtoa.internal.h"
-#include "third_party/gdtoa/lock.h"
 
 static ThInfo TI0;
+static pthread_mutex_t __gdtoa_lock_obj;
+static pthread_mutex_t __gdtoa_lock1_obj;
+
+static void
+__gdtoa_lock(void)
+{
+	pthread_mutex_lock(&__gdtoa_lock_obj);
+}
+
+static void
+__gdtoa_unlock(void)
+{
+	pthread_mutex_unlock(&__gdtoa_lock_obj);
+}
+
+static void
+__gdtoa_initlock(void)
+{
+	pthread_mutex_init(&__gdtoa_lock_obj, 0);
+}
+
+static void
+__gdtoa_lock1(void)
+{
+	pthread_mutex_lock(&__gdtoa_lock1_obj);
+}
+
+static void
+__gdtoa_unlock1(void)
+{
+	pthread_mutex_unlock(&__gdtoa_lock1_obj);
+}
+
+static void
+__gdtoa_initlock1(void)
+{
+	pthread_mutex_init(&__gdtoa_lock1_obj, 0);
+}
 
 static void
 __gdtoa_Brelease(Bigint *rv)
@@ -51,20 +88,24 @@ static void
 __gdtoa_Bclear(void)
 {
 	int i;
-	__gdtoa_lock1();
+	__gdtoa_lock();
 	for (i = 0; i < ARRAYLEN(TI0.Freelist); ++i)
 		__gdtoa_Brelease(TI0.Freelist[i]);
-	__gdtoa_lock();
+	__gdtoa_lock1();
 	__gdtoa_Brelease(TI0.P5s);
-	__gdtoa_unlock();
-	bzero(&TI0, sizeof(TI0));
 	__gdtoa_unlock1();
+	bzero(&TI0, sizeof(TI0));
+	__gdtoa_unlock();
 }
 
 __attribute__((__constructor__(60))) static void
 __gdtoa_Binit(void)
 {
+	__gdtoa_initlock();
+	__gdtoa_initlock1();
 	atexit(__gdtoa_Bclear);
+	pthread_atfork(__gdtoa_lock1, __gdtoa_unlock1, __gdtoa_initlock1);
+	pthread_atfork(__gdtoa_lock, __gdtoa_unlock, __gdtoa_initlock);
 }
 
 static ThInfo *
@@ -88,16 +129,12 @@ __gdtoa_Balloc(int k, ThInfo **PTI)
 	} else {
 		x = 1 << k;
 		rv = malloc(sizeof(Bigint) + (x-1)*sizeof(ULong));
-		if (rv == NULL)
-			goto ret;
 		rv->k = k;
 		rv->maxwds = x;
 	}
-	rv->sign = rv->wds = 0;
-
-ret:
 	if (TI == &TI0)
 		__gdtoa_unlock();
+	rv->sign = rv->wds = 0;
 	return rv;
 }
 
diff --git a/third_party/intel/BUILD.mk b/third_party/intel/BUILD.mk
index 7c810ce96..fb82e1fbc 100644
--- a/third_party/intel/BUILD.mk
+++ b/third_party/intel/BUILD.mk
@@ -3,4 +3,4 @@
 
 PKGS += THIRD_PARTY_INTEL
 THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
-THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*) $(wildcard third_party/intel/clang/*)
+THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*)
diff --git a/third_party/intel/clang/__wmmintrin_aes.h b/third_party/intel/clang/__wmmintrin_aes.h
deleted file mode 100644
index 3010b3871..000000000
--- a/third_party/intel/clang/__wmmintrin_aes.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __WMMINTRIN_H
-#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
-#endif
-
-#ifndef __WMMINTRIN_AES_H
-#define __WMMINTRIN_AES_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
-
-/// Performs a single round of AES encryption using the Equivalent
-///    Inverse Cipher, transforming the state value from the first source
-///    operand using a 128-bit round key value contained in the second source
-///    operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the state value.
-/// \param __R
-///    A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the encrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesenc_si128(__m128i __V, __m128i __R)
-{
-  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
-}
-
-/// Performs the final round of AES encryption using the Equivalent
-///    Inverse Cipher, transforming the state value from the first source
-///    operand using a 128-bit round key value contained in the second source
-///    operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the state value.
-/// \param __R
-///    A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the encrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesenclast_si128(__m128i __V, __m128i __R)
-{
-  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
-}
-
-/// Performs a single round of AES decryption using the Equivalent
-///    Inverse Cipher, transforming the state value from the first source
-///    operand using a 128-bit round key value contained in the second source
-///    operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the state value.
-/// \param __R
-///    A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the decrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesdec_si128(__m128i __V, __m128i __R)
-{
-  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
-}
-
-/// Performs the final round of AES decryption using the Equivalent
-///    Inverse Cipher, transforming the state value from the first source
-///    operand using a 128-bit round key value contained in the second source
-///    operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the state value.
-/// \param __R
-///    A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the decrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesdeclast_si128(__m128i __V, __m128i __R)
-{
-  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
-}
-
-/// Applies the AES InvMixColumns() transformation to an expanded key
-///    contained in the source operand, and writes the result to the
-///    destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the expanded key.
-/// \returns A 128-bit integer vector containing the transformed value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesimc_si128(__m128i __V)
-{
-  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
-}
-
-/// Generates a round key for AES encryption, operating on 128-bit data
-///    specified in the first source operand and using an 8-bit round constant
-///    specified by the second source operand, and writes the result to the
-///    destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
-///
-/// \param C
-///    A 128-bit integer vector that is used to generate the AES encryption key.
-/// \param R
-///    An 8-bit round constant used to generate the AES encryption key.
-/// \returns A 128-bit round key for AES encryption.
-#define _mm_aeskeygenassist_si128(C, R) \
-  ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif  /* __WMMINTRIN_AES_H */
diff --git a/third_party/intel/clang/__wmmintrin_pclmul.h b/third_party/intel/clang/__wmmintrin_pclmul.h
deleted file mode 100644
index c9a6d50bd..000000000
--- a/third_party/intel/clang/__wmmintrin_pclmul.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __WMMINTRIN_H
-#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
-#endif
-
-#ifndef __WMMINTRIN_PCLMUL_H
-#define __WMMINTRIN_PCLMUL_H
-
-/// Multiplies two 64-bit integer values, which are selected from source
-///    operands using the immediate-value operand. The multiplication is a
-///    carry-less multiplication, and the 128-bit integer product is stored in
-///    the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param Y
-///    A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param I
-///    An immediate value specifying which 64-bit values to select from the
-///    operands. Bit 0 is used to select a value from operand \a X, and bit
-///    4 is used to select a value from operand \a Y: \n
-///    Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
-///    Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
-///    Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
-///    Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
-/// \returns The 128-bit integer vector containing the result of the carry-less
-///    multiplication of the selected 64-bit values.
-#define _mm_clmulepi64_si128(X, Y, I) \
-  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
-                                        (__v2di)(__m128i)(Y), (char)(I)))
-
-#endif /* __WMMINTRIN_PCLMUL_H */
diff --git a/third_party/intel/clang/adcintrin.h b/third_party/intel/clang/adcintrin.h
deleted file mode 100644
index 0065a1b54..000000000
--- a/third_party/intel/clang/adcintrin.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ADCINTRIN_H
-#define __ADCINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-
-/* Use C++ inline semantics in C++, GNU inline for C mode. */
-#if defined(__cplusplus)
-#define __INLINE __inline
-#else
-#define __INLINE static __inline
-#endif
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 32-bit unsigned addend.
-/// \param __y
-///    A 32-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
-                                                        unsigned int __x,
-                                                        unsigned int __y,
-                                                        unsigned int *__p) {
-  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
-}
-
-/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
-///    flag \a __cf, and subtracts the result from unsigned 32-bit integer
-///    \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
-///    and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SBB instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    The 32-bit unsigned minuend.
-/// \param __y
-///    The 32-bit unsigned subtrahend.
-/// \param __p
-///    Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
-                                                         unsigned int __x,
-                                                         unsigned int __y,
-                                                         unsigned int *__p) {
-  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 64-bit unsigned addend.
-/// \param __y
-///    A 64-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u64(unsigned char __cf, unsigned long long __x,
-              unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-
-/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
-///    flag \a __cf, and subtracts the result from unsigned 64-bit integer
-///    \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
-///    and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    The 64-bit unsigned minuend.
-/// \param __y
-///    The 64-bit unsigned subtrahend.
-/// \param __p
-///    Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u64(unsigned char __cf, unsigned long long __x,
-               unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
-}
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#undef __INLINE
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __ADCINTRIN_H */
diff --git a/third_party/intel/clang/adxintrin.h b/third_party/intel/clang/adxintrin.h
deleted file mode 100644
index bc6a4caf3..000000000
--- a/third_party/intel/clang/adxintrin.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __ADXINTRIN_H
-#define __ADXINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-
-/* Use C++ inline semantics in C++, GNU inline for C mode. */
-#if defined(__cplusplus)
-#define __INLINE __inline
-#else
-#define __INLINE static __inline
-#endif
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* Intrinsics that are available only if __ADX__ is defined. */
-
-/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADCX instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 32-bit unsigned addend.
-/// \param __y
-///    A 32-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
-                                                         unsigned int __x,
-                                                         unsigned int __y,
-                                                         unsigned int *__p) {
-  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADCX instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 64-bit unsigned addend.
-/// \param __y
-///    A 64-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS
-_addcarryx_u64(unsigned char __cf, unsigned long long __x,
-               unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#undef __INLINE
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __ADXINTRIN_H */
diff --git a/third_party/intel/clang/ammintrin.h b/third_party/intel/clang/ammintrin.h
deleted file mode 100644
index edf08e8c5..000000000
--- a/third_party/intel/clang/ammintrin.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __AMMINTRIN_H
-#define __AMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "pmmintrin.h"
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
-
-/// Extracts the specified bits from the lower 64 bits of the 128-bit
-///    integer vector operand at the index \a idx and of the length \a len.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
-///
-/// \param x
-///    The value from which bits are extracted.
-/// \param len
-///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
-///    are zero, the length is interpreted as 64.
-/// \param idx
-///    Bits [5:0] specify the index of the least significant bit; the other
-///    bits are ignored. If the sum of the index and length is greater than 64,
-///    the result is undefined. If the length and index are both zero, bits
-///    [63:0] of parameter \a x are extracted. If the length is zero but the
-///    index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
-///    extracted from the source operand.
-#define _mm_extracti_si64(x, len, idx) \
-  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
-                                  (char)(len), (char)(idx)))
-
-/// Extracts the specified bits from the lower 64 bits of the 128-bit
-///    integer vector operand at the index and of the length specified by
-///    \a __y.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
-///
-/// \param __x
-///    The value from which bits are extracted.
-/// \param __y
-///    Specifies the index of the least significant bit at [13:8] and the
-///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
-///    length is interpreted as 64. If the sum of the index and length is
-///    greater than 64, the result is undefined. If the length and index are
-///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
-///    is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
-///    from the source operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_extract_si64(__m128i __x, __m128i __y)
-{
-  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
-}
-
-/// Inserts bits of a specified length from the source integer vector
-///    \a y into the lower 64 bits of the destination integer vector \a x at
-///    the index \a idx and of the length \a len.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
-/// const int idx);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
-///
-/// \param x
-///    The destination operand where bits will be inserted. The inserted bits
-///    are defined by the length \a len and by the index \a idx specifying the
-///    least significant bit.
-/// \param y
-///    The source operand containing the bits to be extracted. The extracted
-///    bits are the least significant bits of operand \a y of length \a len.
-/// \param len
-///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
-///    are zero, the length is interpreted as 64.
-/// \param idx
-///    Bits [5:0] specify the index of the least significant bit; the other
-///    bits are ignored. If the sum of the index and length is greater than 64,
-///    the result is undefined. If the length and index are both zero, bits
-///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
-///    is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits of
-///    destination operand \a x with the specified bitfields replaced by the
-///    lower bits of source operand \a y. The upper 64 bits of the return value
-///    are undefined.
-#define _mm_inserti_si64(x, y, len, idx) \
-  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
-                                    (__v2di)(__m128i)(y), \
-                                    (char)(len), (char)(idx)))
-
-/// Inserts bits of a specified length from the source integer vector
-///    \a __y into the lower 64 bits of the destination integer vector \a __x
-///    at the index and of the length specified by \a __y.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
-///
-/// \param __x
-///    The destination operand where bits will be inserted. The inserted bits
-///    are defined by the length and by the index of the least significant bit
-///    specified by operand \a __y.
-/// \param __y
-///    The source operand containing the bits to be extracted. The extracted
-///    bits are the least significant bits of operand \a __y with length
-///    specified by bits [69:64]. These are inserted into the destination at the
-///    index specified by bits [77:72]; all other bits are ignored. If bits
-///    [69:64] are zero, the length is interpreted as 64. If the sum of the
-///    index and length is greater than 64, the result is undefined. If the
-///    length and index are both zero, bits [63:0] of parameter \a __y are
-///    inserted into parameter \a __x. If the length is zero but the index is
-///    non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits of
-///    destination operand \a __x with the specified bitfields replaced by the
-///    lower bits of source operand \a __y. The upper 64 bits of the return
-///    value are undefined.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_insert_si64(__m128i __x, __m128i __y)
-{
-  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
-}
-
-/// Stores a 64-bit double-precision value in a 64-bit memory location.
-///    To minimize caching, the data is flagged as non-temporal (unlikely to be
-///    used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
-///
-/// \param __p
-///    The 64-bit memory location used to store the register value.
-/// \param __a
-///    The 64-bit double-precision floating-point register value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_sd(void *__p, __m128d __a)
-{
-  __builtin_ia32_movntsd((double *)__p, (__v2df)__a);
-}
-
-/// Stores a 32-bit single-precision floating-point value in a 32-bit
-///    memory location. To minimize caching, the data is flagged as
-///    non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
-///
-/// \param __p
-///    The 32-bit memory location used to store the register value.
-/// \param __a
-///    The 32-bit single-precision floating-point register value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ss(void *__p, __m128 __a)
-{
-  __builtin_ia32_movntss((float *)__p, (__v4sf)__a);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __AMMINTRIN_H */
diff --git a/third_party/intel/clang/amxcomplexintrin.h b/third_party/intel/clang/amxcomplexintrin.h
deleted file mode 100644
index 84ef972fc..000000000
--- a/third_party/intel/clang/amxcomplexintrin.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __AMX_COMPLEXINTRIN_H
-#define __AMX_COMPLEXINTRIN_H
-#ifdef __x86_64__
-
-#define __DEFAULT_FN_ATTRS_COMPLEX                                             \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-///    accumulate the results into a packed single precision tile. Each dword
-///    element in input tiles \a a and \a b is interpreted as a complex number
-///    with FP16 real part and FP16 imaginary part.
-/// Calculates the imaginary part of the result. For each possible combination
-///    of (row of \a a, column of \a b), it performs a set of multiplication
-///    and accumulations on all corresponding complex numbers (one from \a a
-///    and one from \a b). The imaginary part of the \a a element is multiplied
-///    with the real part of the corresponding \a b element, and the real part
-///    of the \a a element is multiplied with the imaginary part of the
-///    corresponding \a b elements. The two accumulated results are added, and
-///    then accumulated into the corresponding row and column of \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO (a.colsb / 4) - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-///    accumulate the results into a packed single precision tile. Each dword
-///    element in input tiles \a a and \a b is interpreted as a complex number
-///    with FP16 real part and FP16 imaginary part.
-/// Calculates the real part of the result. For each possible combination
-///    of (row of \a a, column of \a b), it performs a set of multiplication
-///    and accumulations on all corresponding complex numbers (one from \a a
-///    and one from \a b). The real part of the \a a element is multiplied
-///    with the real part of the corresponding \a b element, and the negated
-///    imaginary part of the \a a element is multiplied with the imaginary
-///    part of the corresponding \a b elements. The two accumulated results
-///    are added, and then accumulated into the corresponding row and column
-///    of \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO (a.colsb / 4) - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
-///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
-
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
-_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
-_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-/// accumulate the results into a packed single precision tile. Each dword
-/// element in input tiles src0 and src1 is interpreted as a complex number with
-/// FP16 real part and FP16 imaginary part.
-/// This function calculates the imaginary part of the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_COMPLEX
-static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
-                               __tile1024i src1) {
-  dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
-                                         dst->tile, src0.tile, src1.tile);
-}
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-/// accumulate the results into a packed single precision tile. Each dword
-/// element in input tiles src0 and src1 is interpreted as a complex number with
-/// FP16 real part and FP16 imaginary part.
-/// This function calculates the real part of the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_COMPLEX
-static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
-                               __tile1024i src1) {
-  dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
-                                         dst->tile, src0.tile, src1.tile);
-}
-
-#endif // __x86_64__
-#endif // __AMX_COMPLEXINTRIN_H
diff --git a/third_party/intel/clang/amxfp16intrin.h b/third_party/intel/clang/amxfp16intrin.h
deleted file mode 100644
index ed798245d..000000000
--- a/third_party/intel/clang/amxfp16intrin.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMX_FP16INTRIN_H
-#define __AMX_FP16INTRIN_H
-#ifdef __x86_64__
-
-/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
-///    and \a b, accumulating the intermediate single-precision (32-bit)
-///    floating-point elements with elements in \a dst, and store the 32-bit
-///    result back to tile \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-///	tmp := dst.row[m]
-///	FOR k := 0 TO (a.colsb / 4) - 1
-///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
-///					FP32(b.row[k].fp16[2*n+0])
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
-///					FP32(b.row[k].fp16[2*n+1])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TDPFP16PS instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param a
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpfp16ps(dst, a, b)                                \
-  __builtin_ia32_tdpfp16ps(dst, a, b)
-
-#endif /* __x86_64__ */
-#endif /* __AMX_FP16INTRIN_H */
diff --git a/third_party/intel/clang/amxintrin.h b/third_party/intel/clang/amxintrin.h
deleted file mode 100644
index baa56f5b2..000000000
--- a/third_party/intel/clang/amxintrin.h
+++ /dev/null
@@ -1,524 +0,0 @@
-/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMXINTRIN_H
-#define __AMXINTRIN_H
-#ifdef __x86_64__
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS_TILE                                                \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
-#define __DEFAULT_FN_ATTRS_INT8                                                \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
-#define __DEFAULT_FN_ATTRS_BF16                                                \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
-#define __DEFAULT_FN_ATTRS_FP16                                                \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
-
-/// Load tile configuration from a 64-byte memory location specified by
-/// "mem_addr". The tile configuration includes the tile type palette, the
-/// number of bytes per row, and the number of rows. If the specified
-/// palette_id is zero, that signifies the init state for both the tile
-/// config and the tile data, and the tiles are zeroed. Any invalid
-/// configurations will result in #GP fault.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
-///
-/// \param __config
-///    A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS_TILE
-_tile_loadconfig(const void *__config) {
-  __builtin_ia32_tile_loadconfig(__config);
-}
-
-/// Stores the current tile configuration to a 64-byte memory location
-/// specified by "mem_addr". The tile configuration includes the tile type
-/// palette, the number of bytes per row, and the number of rows. If tiles
-/// are not configured, all zeroes will be stored to memory.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
-///
-/// \param __config
-///    A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS_TILE
-_tile_storeconfig(void *__config) {
-  __builtin_ia32_tile_storeconfig(__config);
-}
-
-/// Release the tile configuration to return to the init state, which
-/// releases all storage it currently holds.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
-static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
-  __builtin_ia32_tilerelease();
-}
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst" using the tile configuration previously configured
-/// via "_tile_loadconfig".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
-///
-/// \param dst
-///    A destination tile. Max size is 1024 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-#define _tile_loadd(dst, base, stride)                                         \
-  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
-                             (__SIZE_TYPE__)(stride))
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst" using the tile configuration previously configured
-/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
-/// that the data will likely not be reused in the near future and the data
-/// caching can be optimized accordingly.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
-///
-/// \param dst
-///    A destination tile. Max size is 1024 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-#define _tile_stream_loadd(dst, base, stride)                                  \
-  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
-                               (__SIZE_TYPE__)(stride))
-
-/// Store the tile specified by "src" to memory specifieid by "base" address and
-/// "stride" using the tile configuration previously configured via
-/// "_tile_loadconfig".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
-///
-/// \param dst
-///    A destination tile. Max size is 1024 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be stored in memory.
-#define _tile_stored(dst, base, stride)                                        \
-  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
-
-/// Zero the tile specified by "tdest".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
-///
-/// \param tile
-///    The destination tile to be zero. Max size is 1024 Bytes.
-#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbssd(dst, src0, src1)                                          \
-  __builtin_ia32_tdpbssd((dst), (src0), (src1))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbsud(dst, src0, src1)                                          \
-  __builtin_ia32_tdpbsud((dst), (src0), (src1))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbusd(dst, src0, src1)                                          \
-  __builtin_ia32_tdpbusd((dst), (src0), (src1))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
-/// "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbuud(dst, src0, src1)                                          \
-  __builtin_ia32_tdpbuud((dst), (src0), (src1))
-
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf16ps(dst, src0, src1)                                        \
-  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
-
-/// AMX tile register size can be configured, the maximum size is 16x64=1024
-/// bytes. Since there is no 2D type in llvm IR, we use vector type to
-/// represent 2D tile and the fixed size is maximum amx tile register size.
-typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
-                     __SIZE_TYPE__ stride) {
-  return __builtin_ia32_tileloadd64_internal(m, n, base,
-                                             (__SIZE_TYPE__)(stride));
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
-                       __SIZE_TYPE__ stride) {
-  return __builtin_ia32_tileloaddt164_internal(m, n, base,
-                                               (__SIZE_TYPE__)(stride));
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
-                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
-                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
-                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
-                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ void __DEFAULT_FN_ATTRS_INT8
-_tile_stored_internal(unsigned short m, unsigned short n, void *base,
-                      __SIZE_TYPE__ stride, _tile1024i tile) {
-  return __builtin_ia32_tilestored64_internal(m, n, base,
-                                              (__SIZE_TYPE__)(stride), tile);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
-_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
-_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// This struct pack the shape and tile data together for user. We suggest
-/// initializing the struct as early as possible, because compiler depends
-/// on the shape information to do configure. The constant value is preferred
-/// for optimization by compiler.
-typedef struct __tile1024i_str {
-  const unsigned short row;
-  const unsigned short col;
-  _tile1024i tile;
-} __tile1024i;
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
-///
-/// \param dst
-///    A destination tile. Max size is 1024 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
-                                    __SIZE_TYPE__ stride) {
-  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
-}
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst". This intrinsic provides a hint to the implementation
-/// that the data will likely not be reused in the near future and the data
-/// caching can be optimized accordingly.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
-///
-/// \param dst
-///    A destination tile. Max size is 1024 Bytes.
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
-                                           __SIZE_TYPE__ stride) {
-  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
-                                     __tile1024i src1) {
-  dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
-                                    src0.tile, src1.tile);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
-                                     __tile1024i src1) {
-  dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
-                                    src0.tile, src1.tile);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
-                                     __tile1024i src1) {
-  dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
-                                    src0.tile, src1.tile);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
-/// "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
-                                     __tile1024i src1) {
-  dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
-                                    src0.tile, src1.tile);
-}
-
-/// Store the tile specified by "src" to memory specifieid by "base" address and
-/// "stride".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
-///
-/// \param base
-///    A pointer to base address.
-/// \param stride
-///    The stride between the rows' data to be stored in memory.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
-                                     __tile1024i src) {
-  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
-}
-
-/// Zero the tile specified by "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
-///
-/// \param dst
-///    The destination tile to be zero. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_zero(__tile1024i *dst) {
-  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
-}
-
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_BF16
-static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
-                                       __tile1024i src1) {
-  dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
-                                      src0.tile, src1.tile);
-}
-
-/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_FP16
-static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
-                                       __tile1024i src1) {
-  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
-                                      src0.tile, src1.tile);
-}
-
-#undef __DEFAULT_FN_ATTRS_TILE
-#undef __DEFAULT_FN_ATTRS_INT8
-#undef __DEFAULT_FN_ATTRS_BF16
-#undef __DEFAULT_FN_ATTRS_FP16
-
-#endif /* __x86_64__ */
-#endif /* __AMXINTRIN_H */
diff --git a/third_party/intel/clang/avx2intrin.h b/third_party/intel/clang/avx2intrin.h
deleted file mode 100644
index 096cae01b..000000000
--- a/third_party/intel/clang/avx2intrin.h
+++ /dev/null
@@ -1,5284 +0,0 @@
-/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX2INTRIN_H
-#define __AVX2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
-
-/* SSE4 Multiple Packed Sums of Absolute Difference.  */
-/// Computes sixteen sum of absolute difference (SAD) operations on sets of
-///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
-///    \a Y.
-///
-///    Eight SAD results are computed using the lower half of the input
-///    vectors, and another eight using the upper half. These 16-bit values
-///    are returned in the lower and upper halves of the 256-bit result,
-///    respectively.
-///
-///    A single SAD operation selects four bytes from \a X and four bytes from
-///    \a Y as input. It computes the differences between each \a X byte and
-///    the corresponding \a Y byte, takes the absolute value of each
-///    difference, and sums these four values to form one 16-bit result. The
-///    intrinsic computes 16 of these results with different sets of input
-///    bytes.
-///
-///    For each set of eight results, the SAD operations use the same four
-///    bytes from \a Y; the starting bit position for these four bytes is
-///    specified by \a M[1:0] times 32. The eight operations use successive
-///    sets of four bytes from \a X; the starting bit position for the first
-///    set of four bytes is specified by \a M[2] times 32. These bit positions
-///    are all relative to the 128-bit lane for each set of eight operations.
-///
-/// \code{.operation}
-/// r := 0
-/// FOR i := 0 TO 1
-///   j := i*3
-///   Ybase := M[j+1:j]*32 + i*128
-///   Xbase := M[j+2]*32 + i*128
-///   FOR k := 0 TO 3
-///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
-///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
-///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
-///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
-///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
-///     Xbase := Xbase + 8
-///     r := r + 16
-///   ENDFOR
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VMPSADBW instruction.
-///
-/// \param X
-///    A 256-bit integer vector containing one of the inputs.
-/// \param Y
-///    A 256-bit integer vector containing one of the inputs.
-/// \param M
-///     An unsigned immediate value specifying the starting positions of the
-///     bytes to operate on.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_mpsadbw_epu8(X, Y, M) \
-  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
-                                      (__v32qi)(__m256i)(Y), (int)(M)))
-
-/// Computes the absolute value of each signed byte in the 256-bit integer
-///    vector \a __a and returns each value in the corresponding byte of
-///    the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi8(__m256i __a)
-{
-    return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
-}
-
-/// Computes the absolute value of each signed 16-bit element in the 256-bit
-///    vector of [16 x i16] in \a __a and returns each value in the
-///    corresponding element of the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi16(__m256i __a)
-{
-    return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
-}
-
-/// Computes the absolute value of each signed 32-bit element in the 256-bit
-///    vector of [8 x i32] in \a __a and returns each value in the
-///    corresponding element of the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi32(__m256i __a)
-{
-    return (__m256i)__builtin_elementwise_abs((__v8si)__a);
-}
-
-/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
-///    integers using signed saturation, and returns the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*16
-///   k := i*8
-///   result[7+k:k] := SATURATE8(__a[15+j:j])
-///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
-///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
-///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPACKSSWB instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
-///    result[191:128].
-/// \param __b
-///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
-///    result[255:192].
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
-///    integers using signed saturation, and returns the resulting 256-bit
-///    vector of [16 x i16].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*32
-///   k := i*16
-///   result[15+k:k] := SATURATE16(__a[31+j:j])
-///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
-///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
-///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPACKSSDW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
-///    result[191:128].
-/// \param __b
-///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
-///    result[255:192].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
-}
-
-/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
-///    using unsigned saturation, and returns the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*16
-///   k := i*8
-///   result[7+k:k] := SATURATE8U(__a[15+j:j])
-///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
-///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
-///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPACKUSWB instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
-///    result[191:128].
-/// \param __b
-///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
-///    result[255:192].
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
-///    using unsigned saturation, and returns the resulting 256-bit vector of
-///    [16 x i16].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*32
-///   k := i*16
-///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
-///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
-///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
-///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPACKUSDW instruction.
-///
-/// \param __V1
-///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
-///    result[191:128].
-/// \param __V2
-///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
-///    result[255:192].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
-  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
-}
-
-/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
-///    vectors and returns the lower 8 bits of each sum in the corresponding
-///    byte of the 256-bit integer vector result (overflow is ignored).
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 256-bit integer vector containing one of the source operands.
-/// \returns A 256-bit integer vector containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v32qu)__a + (__v32qu)__b);
-}
-
-/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
-///    [16 x i16] and returns the lower 16 bits of each sum in the
-///    corresponding element of the [16 x i16] result (overflow is ignored).
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v16hu)__a + (__v16hu)__b);
-}
-
-/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
-///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
-///    element of the [8 x i32] result (overflow is ignored).
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8su)__a + (__v8su)__b);
-}
-
-/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
-///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
-///    element of the [4 x i64] result (overflow is ignored).
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [4 x i64] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x i64] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a + (__v4du)__b);
-}
-
-/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
-///    vectors using signed saturation, and returns each sum in the
-///    corresponding byte of the 256-bit integer vector result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDSB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 256-bit integer vector containing one of the source operands.
-/// \returns A 256-bit integer vector containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
-}
-
-/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
-///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
-///    vectors using unsigned saturation, and returns each sum in the
-///    corresponding byte of the 256-bit integer vector result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDUSB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 256-bit integer vector containing one of the source operands.
-/// \returns A 256-bit integer vector containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epu8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
-}
-
-/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
-///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPADDUSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epu16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
-}
-
-/// Uses the lower half of the 256-bit vector \a a as the upper half of a
-///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
-///    as the lower half of the temporary value. Right-shifts the temporary
-///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
-///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
-///    \a b to make another temporary value, right shifts by \a n, and uses
-///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
-///    result.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPALIGNR instruction.
-///
-/// \param a
-///    A 256-bit integer vector containing source values.
-/// \param b
-///    A 256-bit integer vector containing source values.
-/// \param n
-///    An immediate value specifying the number of bytes to shift.
-/// \returns A 256-bit integer vector containing the result.
-#define _mm256_alignr_epi8(a, b, n) \
-  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
-                                      (__v32qi)(__m256i)(b), (n)))
-
-/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
-///    \a __b.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPAND instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_and_si256(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a & (__v4du)__b);
-}
-
-/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
-///    the bitwise NOT of the 256-bit integer vector in \a __a.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPANDN instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_andnot_si256(__m256i __a, __m256i __b)
-{
-  return (__m256i)(~(__v4du)__a & (__v4du)__b);
-}
-
-/// Computes the averages of the corresponding unsigned bytes in the two
-///    256-bit integer vectors in \a __a and \a __b and returns each
-///    average in the corresponding byte of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPAVGB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_avg_epu8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
-}
-
-/// Computes the averages of the corresponding unsigned 16-bit integers in
-///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
-///    each average in the corresponding element of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPAVGW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16].
-/// \param __b
-///    A 256-bit vector of [16 x i16].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_avg_epu16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Merges 8-bit integer values from either of the two 256-bit vectors
-///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
-///    the resulting 256-bit integer vector.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   IF __M[7+i] == 0
-///     result[7+j:j] := __V1[7+j:j]
-///   ELSE
-///     result[7+j:j] := __V2[7+j:j]
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBLENDVB instruction.
-///
-/// \param __V1
-///    A 256-bit integer vector containing source values.
-/// \param __V2
-///    A 256-bit integer vector containing source values.
-/// \param __M
-///    A 256-bit integer vector, with bit [7] of each byte specifying the
-///    source for each corresponding byte of the result. When the mask bit
-///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
-///    \a __V2.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
-{
-  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
-                                              (__v32qi)__M);
-}
-
-/// Merges 16-bit integer values from either of the two 256-bit vectors
-///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
-///    and returns the resulting 256-bit vector of [16 x i16].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*16
-///   IF M[i] == 0
-///     result[7+j:j] := V1[7+j:j]
-///     result[135+j:128+j] := V1[135+j:128+j]
-///   ELSE
-///     result[7+j:j] := V2[7+j:j]
-///     result[135+j:128+j] := V2[135+j:128+j]
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPBLENDW instruction.
-///
-/// \param V1
-///    A 256-bit vector of [16 x i16] containing source values.
-/// \param V2
-///    A 256-bit vector of [16 x i16] containing source values.
-/// \param M
-///    An immediate 8-bit integer operand, with bits [7:0] specifying the
-///    source for each element of the result. The position of the mask bit
-///    corresponds to the index of a copied value. When a mask bit is 0, the
-///    element is copied from \a V1; otherwise, it is copied from \a V2.
-///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
-///    elements 1 and 9, and so forth.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_blend_epi16(V1, V2, M) \
-  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
-                                      (__v16hi)(__m256i)(V2), (int)(M)))
-
-/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
-///    \a __b for equality and returns the outcomes in the corresponding
-///    bytes of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPEQB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing one of the inputs.
-/// \param __b
-///    A 256-bit integer vector containing one of the inputs.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v32qi)__a == (__v32qi)__b);
-}
-
-/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
-///    \a __a and \a __b for equality and returns the outcomes in the
-///    corresponding elements of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPEQW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the inputs.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the inputs.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v16hi)__a == (__v16hi)__b);
-}
-
-/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
-///    \a __a and \a __b for equality and returns the outcomes in the
-///    corresponding elements of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPEQD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the inputs.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the inputs.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8si)__a == (__v8si)__b);
-}
-
-/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
-///    \a __a and \a __b for equality and returns the outcomes in the
-///    corresponding elements of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] containing one of the inputs.
-/// \param __b
-///    A 256-bit vector of [4 x i64] containing one of the inputs.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4di)__a == (__v4di)__b);
-}
-
-/// Compares corresponding signed bytes in the 256-bit integer vectors in
-///    \a __a and \a __b for greater-than and returns the outcomes in the
-///    corresponding bytes of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPGTB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing one of the inputs.
-/// \param __b
-///    A 256-bit integer vector containing one of the inputs.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
-{
-  /* This function always performs a signed comparison, but __v32qi is a char
-     which may be signed or unsigned, so use __v32qs. */
-  return (__m256i)((__v32qs)__a > (__v32qs)__b);
-}
-
-/// Compares corresponding signed elements in the 256-bit vectors of
-///    [16 x i16] in \a __a and \a __b for greater-than and returns the
-///    outcomes in the corresponding elements of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPGTW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the inputs.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the inputs.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v16hi)__a > (__v16hi)__b);
-}
-
-/// Compares corresponding signed elements in the 256-bit vectors of
-///    [8 x i32] in \a __a and \a __b for greater-than and returns the
-///    outcomes in the corresponding elements of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPGTD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the inputs.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the inputs.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8si)__a > (__v8si)__b);
-}
-
-/// Compares corresponding signed elements in the 256-bit vectors of
-///    [4 x i64] in \a __a and \a __b for greater-than and returns the
-///    outcomes in the corresponding elements of the 256-bit result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] containing one of the inputs.
-/// \param __b
-///    A 256-bit vector of [4 x i64] containing one of the inputs.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4di)__a > (__v4di)__b);
-}
-
-/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
-///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
-///    element of the [16 x i16] result (overflow is ignored). Sums from
-///    \a __a are returned in the lower 64 bits of each 128-bit half of the
-///    result; sums from \a __b are returned in the upper 64 bits of each
-///    128-bit half of the result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*128
-///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
-///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
-///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
-///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
-///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
-///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
-///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
-///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
-///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
-///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
-///    are returned in the lower 64 bits of each 128-bit half of the result;
-///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
-///    of the result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*128
-///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
-///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
-///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
-///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
-}
-
-/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
-///    vectors of [16 x i16] using signed saturation and returns each sum in
-///    an element of the [16 x i16] result. Sums from \a __a are returned in
-///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
-///    are returned in the upper 64 bits of each 128-bit half of the result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*128
-///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
-///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
-///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
-///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
-///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
-///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
-///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
-///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadds_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
-///    vectors of [16 x i16] and returns the lower 16 bits of each difference
-///    in an element of the [16 x i16] result (overflow is ignored).
-///    Differences from \a __a are returned in the lower 64 bits of each
-///    128-bit half of the result; differences from \a __b are returned in the
-///    upper 64 bits of each 128-bit half of the result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*128
-///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
-///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
-///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
-///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
-///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
-///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
-///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
-///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
-///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
-///    an element of the [8 x i32] result (overflow is ignored). Differences
-///    from \a __a are returned in the lower 64 bits of each 128-bit half of
-///    the result; differences from \a __b are returned in the upper 64 bits
-///    of each 128-bit half of the result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*128
-///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
-///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
-///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
-///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
-}
-
-/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
-///    vectors of [16 x i16] using signed saturation and returns each sum in
-///    an element of the [16 x i16] result. Differences from \a __a are
-///    returned in the lower 64 bits of each 128-bit half of the result;
-///    differences from \a __b are returned in the upper 64 bits of each
-///    128-bit half of the result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*128
-///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
-///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
-///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
-///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
-///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
-///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
-///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
-///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsubs_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
-///    with the corresponding signed byte from the 256-bit integer vector in
-///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
-///    pairs of those products using signed saturation to form 16-bit sums
-///    returned as elements of the [16 x i16] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   temp1 := __a[j+7:j] * __b[j+7:j]
-///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
-///   result[j+15:j] := SATURATE16(temp1 + temp2)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
-///
-/// \param __a
-///    A 256-bit vector containing one of the source operands.
-/// \param __b
-///    A 256-bit vector containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maddubs_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
-}
-
-/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
-///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
-///    those products to form 32-bit sums returned as elements of the
-///    [8 x i32] result.
-///
-///    There is only one wraparound case: when all four of the 16-bit sources
-///    are \c 0x8000, the result will be \c 0x80000000.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   temp1 := __a[j+15:j] * __b[j+15:j]
-///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
-///   result[j+31:j] := temp1 + temp2
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMADDWD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Compares the corresponding signed bytes in the two 256-bit integer vectors
-///     in \a __a and \a __b and returns the larger of each pair in the
-///     corresponding byte of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMAXSB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
-}
-
-/// Compares the corresponding signed 16-bit integers in the two 256-bit
-///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMAXSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16].
-/// \param __b
-///    A 256-bit vector of [16 x i16].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Compares the corresponding signed 32-bit integers in the two 256-bit
-///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMAXSD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \param __b
-///    A 256-bit vector of [8 x i32].
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
-}
-
-/// Compares the corresponding unsigned bytes in the two 256-bit integer
-///     vectors in \a __a and \a __b and returns the larger of each pair in
-///     the corresponding byte of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMAXUB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
-}
-
-/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
-///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMAXUW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16].
-/// \param __b
-///    A 256-bit vector of [16 x i16].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
-}
-
-/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
-///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMAXUD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \param __b
-///    A 256-bit vector of [8 x i32].
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
-}
-
-/// Compares the corresponding signed bytes in the two 256-bit integer vectors
-///     in \a __a and \a __b and returns the smaller of each pair in the
-///     corresponding byte of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMINSB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
-}
-
-/// Compares the corresponding signed 16-bit integers in the two 256-bit
-///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMINSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16].
-/// \param __b
-///    A 256-bit vector of [16 x i16].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Compares the corresponding signed 32-bit integers in the two 256-bit
-///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMINSD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \param __b
-///    A 256-bit vector of [8 x i32].
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
-}
-
-/// Compares the corresponding unsigned bytes in the two 256-bit integer
-///     vectors in \a __a and \a __b and returns the smaller of each pair in
-///     the corresponding byte of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMINUB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
-}
-
-/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
-///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMINUW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16].
-/// \param __b
-///    A 256-bit vector of [16 x i16].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
-}
-
-/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
-///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
-///    each pair in the corresponding element of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMINUD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \param __b
-///    A 256-bit vector of [8 x i32].
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
-}
-
-/// Creates a 32-bit integer mask from the most significant bit of each byte
-///    in the 256-bit integer vector in \a __a and returns the result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   result[i] := __a[j+7]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing the source bytes.
-/// \returns The 32-bit integer mask.
-static __inline__ int __DEFAULT_FN_ATTRS256
-_mm256_movemask_epi8(__m256i __a)
-{
-  return __builtin_ia32_pmovmskb256((__v32qi)__a);
-}
-
-/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
-///    the 16-bit values in the corresponding elements of a 256-bit vector
-///    of [16 x i16].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*8
-///   k := i*16
-///   result[k+15:k] := SignExtend(__V[j+7:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the source bytes.
-/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi8_epi16(__m128i __V)
-{
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
-}
-
-/// Sign-extends bytes from the lower half of the 128-bit integer vector in
-///    \a __V and returns the 32-bit values in the corresponding elements of a
-///    256-bit vector of [8 x i32].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*8
-///   k := i*32
-///   result[k+31:k] := SignExtend(__V[j+7:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the source bytes.
-/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi8_epi32(__m128i __V)
-{
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
-}
-
-/// Sign-extends the first four bytes from the 128-bit integer vector in
-///    \a __V and returns the 64-bit values in the corresponding elements of a
-///    256-bit vector of [4 x i64].
-///
-/// \code{.operation}
-/// result[63:0] := SignExtend(__V[7:0])
-/// result[127:64] := SignExtend(__V[15:8])
-/// result[191:128] := SignExtend(__V[23:16])
-/// result[255:192] := SignExtend(__V[31:24])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the source bytes.
-/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi8_epi64(__m128i __V)
-{
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
-}
-
-/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
-///    \a __V and returns the 32-bit values in the corresponding elements of a
-///    256-bit vector of [8 x i32].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*16
-///   k := i*32
-///   result[k+31:k] := SignExtend(__V[j+15:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16] containing the source values.
-/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_epi32(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
-}
-
-/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
-///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
-///    elements of a 256-bit vector of [4 x i64].
-///
-/// \code{.operation}
-/// result[63:0] := SignExtend(__V[15:0])
-/// result[127:64] := SignExtend(__V[31:16])
-/// result[191:128] := SignExtend(__V[47:32])
-/// result[255:192] := SignExtend(__V[64:48])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16] containing the source values.
-/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_epi64(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
-}
-
-/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
-///    \a __V and returns the 64-bit values in the corresponding elements of a
-///    256-bit vector of [4 x i64].
-///
-/// \code{.operation}
-/// result[63:0] := SignExtend(__V[31:0])
-/// result[127:64] := SignExtend(__V[63:32])
-/// result[191:128] := SignExtend(__V[95:64])
-/// result[255:192] := SignExtend(__V[127:96])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
-///
-/// \param __V
-///    A 128-bit vector of [4 x i32] containing the source values.
-/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi64(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
-}
-
-/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
-///    the 16-bit values in the corresponding elements of a 256-bit vector
-///    of [16 x i16].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*8
-///   k := i*16
-///   result[k+15:k] := ZeroExtend(__V[j+7:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the source bytes.
-/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu8_epi16(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
-}
-
-/// Zero-extends bytes from the lower half of the 128-bit integer vector in
-///    \a __V and returns the 32-bit values in the corresponding elements of a
-///    256-bit vector of [8 x i32].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*8
-///   k := i*32
-///   result[k+31:k] := ZeroExtend(__V[j+7:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the source bytes.
-/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu8_epi32(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
-}
-
-/// Zero-extends the first four bytes from the 128-bit integer vector in
-///    \a __V and returns the 64-bit values in the corresponding elements of a
-///    256-bit vector of [4 x i64].
-///
-/// \code{.operation}
-/// result[63:0] := ZeroExtend(__V[7:0])
-/// result[127:64] := ZeroExtend(__V[15:8])
-/// result[191:128] := ZeroExtend(__V[23:16])
-/// result[255:192] := ZeroExtend(__V[31:24])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
-///
-/// \param __V
-///    A 128-bit integer vector containing the source bytes.
-/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu8_epi64(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
-}
-
-/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
-///    \a __V and returns the 32-bit values in the corresponding elements of a
-///    256-bit vector of [8 x i32].
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*16
-///   k := i*32
-///   result[k+31:k] := ZeroExtend(__V[j+15:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16] containing the source values.
-/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu16_epi32(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
-}
-
-/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
-///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
-///    elements of a 256-bit vector of [4 x i64].
-///
-/// \code{.operation}
-/// result[63:0] := ZeroExtend(__V[15:0])
-/// result[127:64] := ZeroExtend(__V[31:16])
-/// result[191:128] := ZeroExtend(__V[47:32])
-/// result[255:192] := ZeroExtend(__V[64:48])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16] containing the source values.
-/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu16_epi64(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
-}
-
-/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
-///    \a __V and returns the 64-bit values in the corresponding elements of a
-///    256-bit vector of [4 x i64].
-///
-/// \code{.operation}
-/// result[63:0] := ZeroExtend(__V[31:0])
-/// result[127:64] := ZeroExtend(__V[63:32])
-/// result[191:128] := ZeroExtend(__V[95:64])
-/// result[255:192] := ZeroExtend(__V[127:96])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
-///
-/// \param __V
-///    A 128-bit vector of [4 x i32] containing the source values.
-/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
-///    values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_epi64(__m128i __V)
-{
-  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
-}
-
-/// Multiplies signed 32-bit integers from even-numbered elements of two
-///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
-///    [4 x i64] result.
-///
-/// \code{.operation}
-/// result[63:0] := __a[31:0] * __b[31:0]
-/// result[127:64] := __a[95:64] * __b[95:64]
-/// result[191:128] := __a[159:128] * __b[159:128]
-/// result[255:192] := __a[223:192] * __b[223:192]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULDQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x i64] containing the products.
-static __inline__  __m256i __DEFAULT_FN_ATTRS256
-_mm256_mul_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
-}
-
-/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
-///    [16 x i16], truncates the 32-bit results to the most significant 18
-///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
-///    product in the [16 x i16] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
-///   result[j+15:j] := temp[16:1]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULHRSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
-///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
-///    [16 x i16] result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULHUW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhi_epu16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
-///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
-///    [16 x i16] result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULHW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhi_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
-///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
-///    [16 x i16] result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULLW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing one of the source operands.
-/// \returns A 256-bit vector of [16 x i16] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v16hu)__a * (__v16hu)__b);
-}
-
-/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
-///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
-///    [8 x i32] result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULLD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x i32] containing the products.
-static __inline__  __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi32 (__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8su)__a * (__v8su)__b);
-}
-
-/// Multiplies unsigned 32-bit integers from even-numered elements of two
-///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
-///    [4 x i64] result.
-///
-/// \code{.operation}
-/// result[63:0] := __a[31:0] * __b[31:0]
-/// result[127:64] := __a[95:64] * __b[95:64]
-/// result[191:128] := __a[159:128] * __b[159:128]
-/// result[255:192] := __a[223:192] * __b[223:192]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULUDQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x i64] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mul_epu32(__m256i __a, __m256i __b)
-{
-  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
-}
-
-/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
-///    \a __b.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPOR instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_or_si256(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a | (__v4du)__b);
-}
-
-/// Computes four sum of absolute difference (SAD) operations on sets of eight
-///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
-///    \a __b.
-///
-///    One SAD result is computed for each set of eight bytes from \a __a and
-///    eight bytes from \a __b. The zero-extended SAD value is returned in the
-///    corresponding 64-bit element of the result.
-///
-///    A single SAD operation takes the differences between the corresponding
-///    bytes of \a __a and \a __b, takes the absolute value of each difference,
-///    and sums these eight values to form one 16-bit result. This operation
-///    is repeated four times with successive sets of eight bytes.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
-///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
-///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
-///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
-///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
-///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
-///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
-///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
-///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
-///                     temp4 + temp5 + temp6 + temp7
-///   result[j+63:j+16] := 0
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSADBW instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sad_epu8(__m256i __a, __m256i __b)
-{
-  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
-}
-
-/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
-///    to control information in the 256-bit integer vector \a __b, and
-///    returns the 256-bit result. In effect there are two separate 128-bit
-///    shuffles in the lower and upper halves.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   IF __b[j+7] == 1
-///     result[j+7:j] := 0
-///   ELSE
-///     k := __b[j+3:j] * 8
-///     IF i > 15
-///       k := k + 128
-///     FI
-///     result[j+7:j] := __a[k+7:k]
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSHUFB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing source values.
-/// \param __b
-///    A 256-bit integer vector containing control information to determine
-///    what goes into the corresponding byte of the result. If bit 7 of the
-///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
-///    control byte specify the index (within the same 128-bit half) of \a __a
-///    to copy to the result byte.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shuffle_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
-}
-
-/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
-///    according to control information in the integer literal \a imm, and
-///    returns the 256-bit result. In effect there are two parallel 128-bit
-///    shuffles in the lower and upper halves.
-///
-/// \code{.operation}
-/// FOR i := 0 to 3
-///   j := i*32
-///   k := (imm >> i*2)[1:0] * 32
-///   result[j+31:j] := a[k+31:k]
-///   result[128+j+31:128+j] := a[128+k+31:128+k]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPSHUFB instruction.
-///
-/// \param a
-///    A 256-bit vector of [8 x i32] containing source values.
-/// \param imm
-///    An immediate 8-bit value specifying which elements to copy from \a a.
-///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
-///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
-///    forth.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-#define _mm256_shuffle_epi32(a, imm) \
-  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
-
-/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
-///    according to control information in the integer literal \a imm, and
-///    returns the 256-bit result. The upper 64 bits of each 128-bit half
-///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
-///    copied from \a a unchanged.
-///
-/// \code{.operation}
-/// result[63:0] := a[63:0]
-/// result[191:128] := a[191:128]
-/// FOR i := 0 TO 3
-///   j := i * 16 + 64
-///   k := (imm >> i*2)[1:0] * 16 + 64
-///   result[j+15:j] := a[k+15:k]
-///   result[128+j+15:128+j] := a[128+k+15:128+k]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPSHUFHW instruction.
-///
-/// \param a
-///    A 256-bit vector of [16 x i16] containing source values.
-/// \param imm
-///    An immediate 8-bit value specifying which elements to copy from \a a.
-///    \a imm[1:0] specifies the index in \a a for elements 4 and 8 of the
-///    result, \a imm[3:2] specifies the index for elements 5 and 9, and so
-///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_shufflehi_epi16(a, imm) \
-  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
-
-/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
-///    according to control information in the integer literal \a imm, and
-///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
-///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
-///    copied from \a a unchanged.
-///
-/// \code{.operation}
-/// result[127:64] := a[127:64]
-/// result[255:192] := a[255:192]
-/// FOR i := 0 TO 3
-///   j := i * 16
-///   k := (imm >> i*2)[1:0] * 16
-///   result[j+15:j] := a[k+15:k]
-///   result[128+j+15:128+j] := a[128+k+15:128+k]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPSHUFLW instruction.
-///
-/// \param a
-///    A 256-bit vector of [16 x i16] to use as a source of data for the
-///    result.
-/// \param imm
-///    An immediate 8-bit value specifying which elements to copy from \a a.
-///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
-///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
-///    forth.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_shufflelo_epi16(a, imm) \
-  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
-
-/// Sets each byte of the result to the corresponding byte of the 256-bit
-///    integer vector in \a __a, the negative of that byte, or zero, depending
-///    on whether the corresponding byte of the 256-bit integer vector in
-///    \a __b is greater than zero, less than zero, or equal to zero,
-///    respectively.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGNB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector].
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi8(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
-}
-
-/// Sets each element of the result to the corresponding element of the
-///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
-///    or zero, depending on whether the corresponding element of the 256-bit
-///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
-///    equal to zero, respectively.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGNW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16].
-/// \param __b
-///    A 256-bit vector of [16 x i16].
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Sets each element of the result to the corresponding element of the
-///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
-///    zero, depending on whether the corresponding element of the 256-bit
-///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
-///    equal to zero, respectively.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGND instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \param __b
-///    A 256-bit vector of [8 x i32].
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
-}
-
-/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
-///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
-///    is greater than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_slli_si256(__m256i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPSLLDQ instruction.
-///
-/// \param a
-///    A 256-bit integer vector to be shifted.
-/// \param imm
-///     An unsigned immediate value specifying the shift count (in bytes).
-/// \returns A 256-bit integer vector containing the result.
-#define _mm256_slli_si256(a, imm) \
-  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
-
-/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
-///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
-///    is greater than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPSLLDQ instruction.
-///
-/// \param a
-///    A 256-bit integer vector to be shifted.
-/// \param imm
-///    An unsigned immediate value specifying the shift count (in bytes).
-/// \returns A 256-bit integer vector containing the result.
-#define _mm256_bslli_epi128(a, imm) \
-  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
-
-/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
-///    left by \a __count bits, shifting in zero bits, and returns the result.
-///    If \a __count is greater than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi16(__m256i __a, int __count)
-{
-  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
-}
-
-/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
-///    left by the number of bits specified by the lower 64 bits of \a __count,
-///    shifting in zero bits, and returns the result. If \a __count is greater
-///    than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi16(__m256i __a, __m128i __count)
-{
-  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
-///    left by \a __count bits, shifting in zero bits, and returns the result.
-///    If \a __count is greater than 31, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi32(__m256i __a, int __count)
-{
-  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
-///    left by the number of bits given in the lower 64 bits of \a __count,
-///    shifting in zero bits, and returns the result. If \a __count is greater
-///    than 31, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi32(__m256i __a, __m128i __count)
-{
-  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
-}
-
-/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
-///    left by \a __count bits, shifting in zero bits, and returns the result.
-///    If \a __count is greater than 63, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi64(__m256i __a, int __count)
-{
-  return __builtin_ia32_psllqi256((__v4di)__a, __count);
-}
-
-/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
-///    left by the number of bits given in the lower 64 bits of \a __count,
-///    shifting in zero bits, and returns the result. If \a __count is greater
-///    than 63, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi64(__m256i __a, __m128i __count)
-{
-  return __builtin_ia32_psllq256((__v4di)__a, __count);
-}
-
-/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
-///    right by \a __count bits, shifting in sign bits, and returns the result.
-///    If \a __count is greater than 15, each element of the result is either
-///    0 or -1 according to the corresponding input sign bit.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRAW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi16(__m256i __a, int __count)
-{
-  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
-}
-
-/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
-///    right by the number of bits given in the lower 64 bits of \a __count,
-///    shifting in sign bits, and returns the result. If \a __count is greater
-///    than 15, each element of the result is either 0 or -1 according to the
-///    corresponding input sign bit.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRAW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi16(__m256i __a, __m128i __count)
-{
-  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
-///    right by \a __count bits, shifting in sign bits, and returns the result.
-///    If \a __count is greater than 31, each element of the result is either
-///    0 or -1 according to the corresponding input sign bit.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRAD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi32(__m256i __a, int __count)
-{
-  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
-///    right by the number of bits given in the lower 64 bits of \a __count,
-///    shifting in sign bits, and returns the result. If \a __count is greater
-///    than 31, each element of the result is either 0 or -1 according to the
-///    corresponding input sign bit.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRAD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi32(__m256i __a, __m128i __count)
-{
-  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
-}
-
-/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
-///    \a imm bytes, shifting in zero bytes, and returns the result. If
-///    \a imm is greater than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_srli_si256(__m256i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPSRLDQ instruction.
-///
-/// \param a
-///    A 256-bit integer vector to be shifted.
-/// \param imm
-///    An unsigned immediate value specifying the shift count (in bytes).
-/// \returns A 256-bit integer vector containing the result.
-#define _mm256_srli_si256(a, imm) \
-  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
-
-/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
-///    \a imm bytes, shifting in zero bytes, and returns the result. If
-///    \a imm is greater than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPSRLDQ instruction.
-///
-/// \param a
-///    A 256-bit integer vector to be shifted.
-/// \param imm
-///     An unsigned immediate value specifying the shift count (in bytes).
-/// \returns A 256-bit integer vector containing the result.
-#define _mm256_bsrli_epi128(a, imm) \
-  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
-
-/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
-///    right by \a __count bits, shifting in zero bits, and returns the result.
-///    If \a __count is greater than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi16(__m256i __a, int __count)
-{
-  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
-}
-
-/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
-///    right by the number of bits given in the lower 64 bits of \a __count,
-///    shifting in zero bits, and returns the result. If \a __count is greater
-///    than 15, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi16(__m256i __a, __m128i __count)
-{
-  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
-///    right by \a __count bits, shifting in zero bits, and returns the result.
-///    If \a __count is greater than 31, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi32(__m256i __a, int __count)
-{
-  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
-///    right by the number of bits given in the lower 64 bits of \a __count,
-///    shifting in zero bits, and returns the result. If \a __count is greater
-///    than 31, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi32(__m256i __a, __m128i __count)
-{
-  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
-}
-
-/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
-///    right by \a __count bits, shifting in zero bits, and returns the result.
-///    If \a __count is greater than 63, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] to be shifted.
-/// \param __count
-///    An unsigned integer value specifying the shift count (in bits).
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi64(__m256i __a, int __count)
-{
-  return __builtin_ia32_psrlqi256((__v4di)__a, __count);
-}
-
-/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
-///    right by the number of bits given in the lower 64 bits of \a __count,
-///    shifting in zero bits, and returns the result. If \a __count is greater
-///    than 63, the returned result is all zeroes.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] to be shifted.
-/// \param __count
-///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
-///    shift count (in bits). The upper element is ignored.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi64(__m256i __a, __m128i __count)
-{
-  return __builtin_ia32_psrlq256((__v4di)__a, __count);
-}
-
-/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
-///    vectors. Returns the lower 8 bits of each difference in the
-///    corresponding byte of the 256-bit integer vector result (overflow is
-///    ignored).
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing the minuends.
-/// \param __b
-///    A 256-bit integer vector containing the subtrahends.
-/// \returns A 256-bit integer vector containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v32qu)__a - (__v32qu)__b);
-}
-
-/// Subtracts 16-bit integers from corresponding elements of two 256-bit
-///    vectors of [16 x i16]. Returns the lower 16 bits of each difference in
-///    the corresponding element of the [16 x i16] result (overflow is
-///    ignored).
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing the minuends.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing the subtrahends.
-/// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v16hu)__a - (__v16hu)__b);
-}
-
-/// Subtracts 32-bit integers from corresponding elements of two 256-bit
-///    vectors of [8 x i32]. Returns the lower 32 bits of each difference in
-///    the corresponding element of the [8 x i32] result (overflow is ignored).
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing the minuends.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing the subtrahends.
-/// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8su)__a - (__v8su)__b);
-}
-
-/// Subtracts 64-bit integers from corresponding elements of two 256-bit
-///    vectors of [4 x i64]. Returns the lower 64 bits of each difference in
-///    the corresponding element of the [4 x i64] result (overflow is ignored).
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] containing the minuends.
-/// \param __b
-///    A 256-bit vector of [4 x i64] containing the subtrahends.
-/// \returns A 256-bit vector of [4 x i64] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a - (__v4du)__b);
-}
-
-/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
-///    vectors using signed saturation, and returns each differences in the
-///    corresponding byte of the 256-bit integer vector result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBSB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing the minuends.
-/// \param __b
-///    A 256-bit integer vector containing the subtrahends.
-/// \returns A 256-bit integer vector containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
-}
-
-/// Subtracts 16-bit integers from corresponding elements of two 256-bit
-///    vectors of [16 x i16] using signed saturation, and returns each
-///    difference in the corresponding element of the [16 x i16] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing the minuends.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing the subtrahends.
-/// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
-}
-
-/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
-///    vectors using unsigned saturation, and returns each difference in the
-///    corresponding byte of the 256-bit integer vector result. For each byte,
-///    computes <c> result = __a - __b </c>.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 31
-///   j := i*8
-///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBUSB instruction.
-///
-/// \param __a
-///    A 256-bit integer vector containing the minuends.
-/// \param __b
-///    A 256-bit integer vector containing the subtrahends.
-/// \returns A 256-bit integer vector containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epu8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
-}
-
-/// Subtracts 16-bit integers from corresponding elements of two 256-bit
-///    vectors of [16 x i16] using unsigned saturation, and returns each
-///    difference in the corresponding element of the [16 x i16] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 15
-///   j := i*16
-///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSUBUSW instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] containing the minuends.
-/// \param __b
-///    A 256-bit vector of [16 x i16] containing the subtrahends.
-/// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epu16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
-}
-
-/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
-///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
-///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
-///    input; other bits in these parameters are ignored.
-///
-/// \code{.operation}
-/// result[7:0] := __a[71:64]
-/// result[15:8] := __b[71:64]
-/// result[23:16] := __a[79:72]
-/// result[31:24] := __b[79:72]
-/// . . .
-/// result[127:120] := __b[127:120]
-/// result[135:128] := __a[199:192]
-/// . . .
-/// result[255:248] := __b[255:248]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
-///
-/// \param __a
-///    A 256-bit integer vector used as the source for the even-numbered bytes
-///    of the result.
-/// \param __b
-///    A 256-bit integer vector used as the source for the odd-numbered bytes
-///    of the result.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
-}
-
-/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
-///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
-///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
-///    128-bit half of \a __a and \a __b as input; other bits in these
-///    parameters are ignored.
-///
-/// \code{.operation}
-/// result[15:0] := __a[79:64]
-/// result[31:16] := __b[79:64]
-/// result[47:32] := __a[95:80]
-/// result[63:48] := __b[95:80]
-/// . . .
-/// result[127:112] := __b[127:112]
-/// result[143:128] := __a[211:196]
-/// . . .
-/// result[255:240] := __b[255:240]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
-///    elements of the result.
-/// \param __b
-///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
-///    elements of the result.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
-}
-
-/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
-///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
-///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
-///    of \a __a and \a __b as input; other bits in these parameters are
-///    ignored.
-///
-/// \code{.operation}
-/// result[31:0] := __a[95:64]
-/// result[63:32] := __b[95:64]
-/// result[95:64] := __a[127:96]
-/// result[127:96] := __b[127:96]
-/// result[159:128] := __a[223:192]
-/// result[191:160] := __b[223:192]
-/// result[223:192] := __a[255:224]
-/// result[255:224] := __b[255:224]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
-///    elements of the result.
-/// \param __b
-///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
-///    elements of the result.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
-}
-
-/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
-///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
-///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
-///    of \a __a and \a __b as input; other bits in these parameters are
-///    ignored.
-///
-/// \code{.operation}
-/// result[63:0] := __a[127:64]
-/// result[127:64] := __b[127:64]
-/// result[191:128] := __a[255:192]
-/// result[255:192] := __b[255:192]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
-///    elements of the result.
-/// \param __b
-///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
-///    elements of the result.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
-}
-
-/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
-///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
-///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
-///    input; other bits in these parameters are ignored.
-///
-/// \code{.operation}
-/// result[7:0] := __a[7:0]
-/// result[15:8] := __b[7:0]
-/// result[23:16] := __a[15:8]
-/// result[31:24] := __b[15:8]
-/// . . .
-/// result[127:120] := __b[63:56]
-/// result[135:128] := __a[135:128]
-/// . . .
-/// result[255:248] := __b[191:184]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
-///
-/// \param __a
-///    A 256-bit integer vector used as the source for the even-numbered bytes
-///    of the result.
-/// \param __b
-///    A 256-bit integer vector used as the source for the odd-numbered bytes
-///    of the result.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
-}
-
-/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
-///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
-///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
-///    128-bit half of \a __a and \a __b as input; other bits in these
-///    parameters are ignored.
-///
-/// \code{.operation}
-/// result[15:0] := __a[15:0]
-/// result[31:16] := __b[15:0]
-/// result[47:32] := __a[31:16]
-/// result[63:48] := __b[31:16]
-/// . . .
-/// result[127:112] := __b[63:48]
-/// result[143:128] := __a[143:128]
-/// . . .
-/// result[255:239] := __b[191:176]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
-///    elements of the result.
-/// \param __b
-///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
-///    elements of the result.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
-}
-
-/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
-///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
-///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
-///    of \a __a and \a __b as input; other bits in these parameters are
-///    ignored.
-///
-/// \code{.operation}
-/// result[31:0] := __a[31:0]
-/// result[63:32] := __b[31:0]
-/// result[95:64] := __a[63:32]
-/// result[127:96] := __b[63:32]
-/// result[159:128] := __a[159:128]
-/// result[191:160] := __b[159:128]
-/// result[223:192] := __a[191:160]
-/// result[255:224] := __b[191:190]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
-///    elements of the result.
-/// \param __b
-///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
-///    elements of the result.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
-}
-
-/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
-///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
-///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
-///    of \a __a and \a __b as input; other bits in these parameters are
-///    ignored.
-///
-/// \code{.operation}
-/// result[63:0] := __a[63:0]
-/// result[127:64] := __b[63:0]
-/// result[191:128] := __a[191:128]
-/// result[255:192] := __b[191:128]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
-///    elements of the result.
-/// \param __b
-///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
-///    elements of the result.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
-}
-
-/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
-///    \a __b.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPXOR instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_xor_si256(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a ^ (__v4du)__b);
-}
-
-/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
-///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
-///   boundary.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
-///
-/// \param __V
-///    A pointer to the 32-byte aligned memory containing the vector to load.
-/// \returns A 256-bit integer vector loaded from memory.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(const void *__V)
-{
-  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
-  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
-}
-
-/// Broadcasts the 32-bit floating-point value from the low element of the
-///    128-bit vector of [4 x float] in \a __X to all elements of the result's
-///    128-bit vector of [4 x float].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x float] whose low element will be broadcast.
-/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_broadcastss_ps(__m128 __X)
-{
-  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
-}
-
-/// Broadcasts the 64-bit floating-point value from the low element of the
-///    128-bit vector of [2 x double] in \a __a to both elements of the
-///    result's 128-bit vector of [2 x double].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c MOVDDUP instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] whose low element will be broadcast.
-/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_broadcastsd_pd(__m128d __a)
-{
-  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
-}
-
-/// Broadcasts the 32-bit floating-point value from the low element of the
-///    128-bit vector of [4 x float] in \a __X to all elements of the
-///    result's 256-bit vector of [8 x float].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x float] whose low element will be broadcast.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcastss_ps(__m128 __X)
-{
-  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/// Broadcasts the 64-bit floating-point value from the low element of the
-///    128-bit vector of [2 x double] in \a __X to all elements of the
-///    result's 256-bit vector of [4 x double].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
-///
-/// \param __X
-///    A 128-bit vector of [2 x double] whose low element will be broadcast.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_broadcastsd_pd(__m128d __X)
-{
-  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
-}
-
-/// Broadcasts the 128-bit integer data from \a __X to both the lower and
-///    upper halves of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
-///
-/// \param __X
-///    A 128-bit integer vector to be broadcast.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastsi128_si256(__m128i __X)
-{
-  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
-}
-
-#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
-
-/// Merges 32-bit integer elements from either of the two 128-bit vectors of
-///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
-///    as specified by the immediate integer operand \a M.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*32
-///   IF M[i] == 0
-///     result[31+j:j] := V1[31+j:j]
-///   ELSE
-///     result[31+j:j] := V2[32+j:j]
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPBLENDDD instruction.
-///
-/// \param V1
-///    A 128-bit vector of [4 x i32] containing source values.
-/// \param V2
-///    A 128-bit vector of [4 x i32] containing source values.
-/// \param M
-///    An immediate 8-bit integer operand, with bits [3:0] specifying the
-///    source for each element of the result. The position of the mask bit
-///    corresponds to the index of a copied value. When a mask bit is 0, the
-///    element is copied from \a V1; otherwise, it is copied from \a V2.
-/// \returns A 128-bit vector of [4 x i32] containing the result.
-#define _mm_blend_epi32(V1, V2, M) \
-  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
-                                      (__v4si)(__m128i)(V2), (int)(M)))
-
-/// Merges 32-bit integer elements from either of the two 256-bit vectors of
-///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
-///    as specified by the immediate integer operand \a M.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   IF M[i] == 0
-///     result[31+j:j] := V1[31+j:j]
-///   ELSE
-///     result[31+j:j] := V2[32+j:j]
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPBLENDDD instruction.
-///
-/// \param V1
-///    A 256-bit vector of [8 x i32] containing source values.
-/// \param V2
-///    A 256-bit vector of [8 x i32] containing source values.
-/// \param M
-///    An immediate 8-bit integer operand, with bits [7:0] specifying the
-///    source for each element of the result. The position of the mask bit
-///    corresponds to the index of a copied value. When a mask bit is 0, the
-///    element is copied from \a V1; otherwise, it is is copied from \a V2.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-#define _mm256_blend_epi32(V1, V2, M) \
-  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
-                                      (__v8si)(__m256i)(V2), (int)(M)))
-
-/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
-///    bytes of the 256-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
-///
-/// \param __X
-///    A 128-bit integer vector whose low byte will be broadcast.
-/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastb_epi8(__m128i __X)
-{
-  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
-///    to all elements of the result's 256-bit vector of [16 x i16].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
-///
-/// \param __X
-///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
-/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastw_epi16(__m128i __X)
-{
-  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
-///    to all elements of the result's 256-bit vector of [8 x i32].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastd_epi32(__m128i __X)
-{
-  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
-///    to all elements of the result's 256-bit vector of [4 x i64].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
-///
-/// \param __X
-///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastq_epi64(__m128i __X)
-{
-  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
-}
-
-/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
-///    bytes of the 128-bit result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
-///
-/// \param __X
-///    A 128-bit integer vector whose low byte will be broadcast.
-/// \returns A 128-bit integer vector containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastb_epi8(__m128i __X)
-{
-  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
-///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
-///
-/// \param __X
-///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
-/// \returns A 128-bit vector of [8 x i16] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastw_epi16(__m128i __X)
-{
-  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
-///    to all elements of the result's vector of [4 x i32].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
-/// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastd_epi32(__m128i __X)
-{
-  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
-}
-
-/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
-///    to both elements of the result's 128-bit vector of [2 x i64].
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
-///
-/// \param __X
-///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
-/// \returns A 128-bit vector of [2 x i64] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastq_epi64(__m128i __X)
-{
-  return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
-}
-
-/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
-///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
-///    elements of the 256-bit vector of [8 x i32] in \a __b.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   k := __b[j+2:j] * 32
-///   result[j+31:j] := __a[k+31:k]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPERMD instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32] containing the source values.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing indexes of values to use from
-///    \a __a.
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
-}
-
-/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
-///    the 256-bit vector of [4 x double] in \a V as specified by the
-///    immediate value \a M.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   k := (M >> i*2)[1:0] * 64
-///   result[j+63:j] := V[k+63:k]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPERMPD instruction.
-///
-/// \param V
-///    A 256-bit vector of [4 x double] containing the source values.
-/// \param M
-///    An immediate 8-bit value specifying which elements to copy from \a V.
-///    \a M[1:0] specifies the index in \a a for element 0 of the result,
-///    \a M[3:2] specifies the index for element 1, and so forth.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-#define _mm256_permute4x64_pd(V, M) \
-  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
-
-/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
-///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
-///    the elements of the 256-bit vector of [8 x i32] in \a __b.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   k := __b[j+2:j] * 32
-///   result[j+31:j] := __a[k+31:k]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPERMPS instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the source values.
-/// \param __b
-///    A 256-bit vector of [8 x i32] containing indexes of values to use from
-///    \a __a.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
-{
-  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
-}
-
-/// Sets the result's 256-bit vector of [4 x i64] result to copies of elements
-///    of the 256-bit vector of [4 x i64] in \a V as specified by the
-///    immediate value \a M.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   k := (M >> i*2)[1:0] * 64
-///   result[j+63:j] := V[k+63:k]
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPERMQ instruction.
-///
-/// \param V
-///    A 256-bit vector of [4 x i64] containing the source values.
-/// \param M
-///    An immediate 8-bit value specifying which elements to copy from \a V.
-///    \a M[1:0] specifies the index in \a a for element 0 of the result,
-///    \a M[3:2] specifies the index for element 1, and so forth.
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-#define _mm256_permute4x64_epi64(V, M) \
-  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
-
-/// Sets each half of the 256-bit result either to zero or to one of the
-///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
-///    as specified by the immediate value \a M.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*128
-///   k := M >> (i*4)
-///   IF k[3] == 0
-///     CASE (k[1:0]) OF
-///     0: result[127+j:j] := V1[127:0]
-///     1: result[127+j:j] := V1[255:128]
-///     2: result[127+j:j] := V2[127:0]
-///     3: result[127+j:j] := V2[255:128]
-///     ESAC
-///   ELSE
-///     result[127+j:j] := 0
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPERM2I128 instruction.
-///
-/// \param V1
-///    A 256-bit integer vector containing source values.
-/// \param V2
-///    A 256-bit integer vector containing source values.
-/// \param M
-///    An immediate value specifying how to form the result. Bits [3:0]
-///    control the lower half of the result, bits [7:4] control the upper half.
-///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
-///    otherwise bits [1:0] determine the source as follows. \n
-///    0: the lower half of \a V1 \n
-///    1: the upper half of \a V1 \n
-///    2: the lower half of \a V2 \n
-///    3: the upper half of \a V2
-/// \returns A 256-bit integer vector containing the result.
-#define _mm256_permute2x128_si256(V1, V2, M) \
-  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
-
-/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
-///     of the immediate \a M is zero, extracts the lower half of the result;
-///     otherwise, extracts the upper half.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
-///
-/// \param V
-///    A 256-bit integer vector containing the source values.
-/// \param M
-///    An immediate value specifying which half of \a V to extract.
-/// \returns A 128-bit integer vector containing the result.
-#define _mm256_extracti128_si256(V, M) \
-  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
-
-/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
-///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
-///     is zero, overwrites the lower half of the result; otherwise,
-///     overwrites the upper half.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VINSERTI128 instruction.
-///
-/// \param V1
-///    A 256-bit integer vector containing a source value.
-/// \param V2
-///    A 128-bit integer vector containing a source value.
-/// \param M
-///    An immediate value specifying where to put \a V2 in the result.
-/// \returns A 256-bit integer vector containing the result.
-#define _mm256_inserti128_si256(V1, V2, M) \
-  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
-                                         (__v2di)(__m128i)(V2), (int)(M)))
-
-/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
-///    the most significant bit of the corresponding element in the mask
-///    \a __M is set; otherwise, sets that element of the result to zero.
-///    Returns the 256-bit [8 x i32] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   IF __M[j+31] == 1
-///     result[j+31:j] := Load32(__X+(i*4))
-///   ELSE
-///     result[j+31:j] := 0
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
-///
-/// \param __X
-///    A pointer to the memory used for loading values.
-/// \param __M
-///    A 256-bit vector of [8 x i32] containing the mask bits.
-/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
-///    elements.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi32(int const *__X, __m256i __M)
-{
-  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
-}
-
-/// Conditionally loads four 64-bit integer elements from memory \a __X, if
-///    the most significant bit of the corresponding element in the mask
-///    \a __M is set; otherwise, sets that element of the result to zero.
-///    Returns the 256-bit [4 x i64] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   IF __M[j+63] == 1
-///     result[j+63:j] := Load64(__X+(i*8))
-///   ELSE
-///     result[j+63:j] := 0
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
-///
-/// \param __X
-///    A pointer to the memory used for loading values.
-/// \param __M
-///    A 256-bit vector of [4 x i64] containing the mask bits.
-/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
-///    elements.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi64(long long const *__X, __m256i __M)
-{
-  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
-}
-
-/// Conditionally loads four 32-bit integer elements from memory \a __X, if
-///    the most significant bit of the corresponding element in the mask
-///    \a __M is set; otherwise, sets that element of the result to zero.
-///    Returns the 128-bit [4 x i32] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*32
-///   IF __M[j+31] == 1
-///     result[j+31:j] := Load32(__X+(i*4))
-///   ELSE
-///     result[j+31:j] := 0
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
-///
-/// \param __X
-///    A pointer to the memory used for loading values.
-/// \param __M
-///    A 128-bit vector of [4 x i32] containing the mask bits.
-/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
-///    elements.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi32(int const *__X, __m128i __M)
-{
-  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
-}
-
-/// Conditionally loads two 64-bit integer elements from memory \a __X, if
-///    the most significant bit of the corresponding element in the mask
-///    \a __M is set; otherwise, sets that element of the result to zero.
-///    Returns the 128-bit [2 x i64] result.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*64
-///   IF __M[j+63] == 1
-///     result[j+63:j] := Load64(__X+(i*8))
-///   ELSE
-///     result[j+63:j] := 0
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
-///
-/// \param __X
-///    A pointer to the memory used for loading values.
-/// \param __M
-///    A 128-bit vector of [2 x i64] containing the mask bits.
-/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
-///    elements.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi64(long long const *__X, __m128i __M)
-{
-  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
-}
-
-/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
-///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
-///    the corresponding element in the mask \a __M is set; otherwise, the
-///    memory element is unchanged.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 7
-///   j := i*32
-///   IF __M[j+31] == 1
-///     Store32(__X+(i*4), __Y[j+31:j])
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
-///
-/// \param __X
-///    A pointer to the memory used for storing values.
-/// \param __M
-///    A 256-bit vector of [8 x i32] containing the mask bits.
-/// \param __Y
-///    A 256-bit vector of [8 x i32] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
-{
-  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
-}
-
-/// Conditionally stores four 64-bit integer elements from the 256-bit vector
-///    of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
-///    the corresponding element in the mask \a __M is set; otherwise, the
-///    memory element is unchanged.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*64
-///   IF __M[j+63] == 1
-///     Store64(__X+(i*8), __Y[j+63:j])
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
-///
-/// \param __X
-///    A pointer to the memory used for storing values.
-/// \param __M
-///    A 256-bit vector of [4 x i64] containing the mask bits.
-/// \param __Y
-///    A 256-bit vector of [4 x i64] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
-{
-  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
-}
-
-/// Conditionally stores four 32-bit integer elements from the 128-bit vector
-///    of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
-///    the corresponding element in the mask \a __M is set; otherwise, the
-///    memory element is unchanged.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 3
-///   j := i*32
-///   IF __M[j+31] == 1
-///     Store32(__X+(i*4), __Y[j+31:j])
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
-///
-/// \param __X
-///    A pointer to the memory used for storing values.
-/// \param __M
-///    A 128-bit vector of [4 x i32] containing the mask bits.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
-{
-  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
-}
-
-/// Conditionally stores two 64-bit integer elements from the 128-bit vector
-///    of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
-///    the corresponding element in the mask \a __M is set; otherwise, the
-///    memory element is unchanged.
-///
-/// \code{.operation}
-/// FOR i := 0 TO 1
-///   j := i*64
-///   IF __M[j+63] == 1
-///     Store64(__X+(i*8), __Y[j+63:j])
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
-///
-/// \param __X
-///    A pointer to the memory used for storing values.
-/// \param __M
-///    A 128-bit vector of [2 x i64] containing the mask bits.
-/// \param __Y
-///    A 128-bit vector of [2 x i64] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
-{
-  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
-///    left by the number of bits given in the corresponding element of the
-///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    31, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLVD instruction.
-///
-/// \param __X
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __Y
-///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sllv_epi32(__m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
-}
-
-/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
-///    left by the number of bits given in the corresponding element of the
-///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    31, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLVD instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] to be shifted.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sllv_epi32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
-}
-
-/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
-///    left by the number of bits given in the corresponding element of the
-///    128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    63, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLVQ instruction.
-///
-/// \param __X
-///    A 256-bit vector of [4 x i64] to be shifted.
-/// \param __Y
-///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sllv_epi64(__m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
-}
-
-/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
-///    left by the number of bits given in the corresponding element of the
-///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    63, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSLLVQ instruction.
-///
-/// \param __X
-///    A 128-bit vector of [2 x i64] to be shifted.
-/// \param __Y
-///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 128-bit vector of [2 x i64] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sllv_epi64(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
-///    right by the number of bits given in the corresponding element of the
-///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
-///    returns the result. If the shift count for any element is greater than
-///    31, the result for that element is 0 or -1 according to the sign bit
-///    for that element.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRAVD instruction.
-///
-/// \param __X
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __Y
-///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srav_epi32(__m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
-}
-
-/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
-///    right by the number of bits given in the corresponding element of the
-///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
-///    returns the result. If the shift count for any element is greater than
-///    31, the result for that element is 0 or -1 according to the sign bit
-///    for that element.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRAVD instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] to be shifted.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srav_epi32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
-}
-
-/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
-///    right by the number of bits given in the corresponding element of the
-///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    31, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLVD instruction.
-///
-/// \param __X
-///    A 256-bit vector of [8 x i32] to be shifted.
-/// \param __Y
-///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srlv_epi32(__m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
-}
-
-/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
-///    right by the number of bits given in the corresponding element of the
-///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    31, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLVD instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] to be shifted.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srlv_epi32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
-}
-
-/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
-///    right by the number of bits given in the corresponding element of the
-///    128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    63, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLVQ instruction.
-///
-/// \param __X
-///    A 256-bit vector of [4 x i64] to be shifted.
-/// \param __Y
-///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srlv_epi64(__m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
-}
-
-/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
-///    right by the number of bits given in the corresponding element of the
-///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
-///    returns the result. If the shift count for any element is greater than
-///    63, the result for that element is zero.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VPSRLVQ instruction.
-///
-/// \param __X
-///    A 128-bit vector of [2 x i64] to be shifted.
-/// \param __Y
-///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
-///    bits).
-/// \returns A 128-bit vector of [2 x i64] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srlv_epi64(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
-}
-
-/// Conditionally gathers two 64-bit floating-point values, either from the
-///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
-///    of [2 x double] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*32
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
-///                               __m128d mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPD instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
-///    the first two elements are used.
-/// \param mask
-///    A 128-bit vector of [2 x double] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
-  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
-                                      (double const *)(m), \
-                                      (__v4si)(__m128i)(i), \
-                                      (__v2df)(__m128d)(mask), (s)))
-
-/// Conditionally gathers four 64-bit floating-point values, either from the
-///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
-///    of [4 x double] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*32
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
-///                                  __m256d mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPD instruction.
-///
-/// \param a
-///    A 256-bit vector of [4 x double] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param mask
-///    A 256-bit vector of [4 x double] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
-  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
-                                         (double const *)(m), \
-                                         (__v4si)(__m128i)(i), \
-                                         (__v4df)(__m256d)(mask), (s)))
-
-/// Conditionally gathers two 64-bit floating-point values, either from the
-///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
-///    of [2 x double] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*64
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
-///                               __m128d mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPD instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [2 x double] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
-  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
-                                      (double const *)(m), \
-                                      (__v2di)(__m128i)(i), \
-                                      (__v2df)(__m128d)(mask), (s)))
-
-/// Conditionally gathers four 64-bit floating-point values, either from the
-///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
-///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
-///    of [4 x double] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*64
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
-///                                  __m256d mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPD instruction.
-///
-/// \param a
-///    A 256-bit vector of [4 x double] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param mask
-///    A 256-bit vector of [4 x double] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
-  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
-                                         (double const *)(m), \
-                                         (__v4di)(__m256i)(i), \
-                                         (__v4df)(__m256d)(mask), (s)))
-
-/// Conditionally gathers four 32-bit floating-point values, either from the
-///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
-///    of [4 x float] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*32
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
-///                              __m128 mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPS instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [4 x float] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
-  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
-                                     (float const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v4sf)(__m128)(mask), (s)))
-
-/// Conditionally gathers eight 32-bit floating-point values, either from the
-///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
-///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
-///    of [8 x float] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 7
-///   j := element*32
-///   k := element*32
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
-///                                 __m256 mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPS instruction.
-///
-/// \param a
-///    A 256-bit vector of [8 x float] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
-/// \param mask
-///    A 256-bit vector of [8 x float] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [8 x float] containing the gathered values.
-#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
-  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
-                                        (float const *)(m), \
-                                        (__v8si)(__m256i)(i), \
-                                        (__v8sf)(__m256)(mask), (s)))
-
-/// Conditionally gathers two 32-bit floating-point values, either from the
-///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
-///    of [4 x float] in \a mask determines the source for the lower two
-///    elements. The upper two elements of the result are zeroed.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*32
-///   k := element*64
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// result[127:64] := 0
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
-///                              __m128 mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPS instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float] used as the source when a mask bit is
-///    zero. Only the first two elements are used.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [4 x float] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory. Only the first
-///    two elements are used.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
-  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
-                                     (float const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v4sf)(__m128)(mask), (s)))
-
-/// Conditionally gathers four 32-bit floating-point values, either from the
-///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
-///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
-///    of [4 x float] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*64
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
-///                                 __m128 mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPS instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float] used as the source when a mask bit is
-///   zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [4 x float] containing the mask. The most
-///    significant bit of each element in the mask vector represents the mask
-///    bits. If a mask bit is zero, the corresponding value from vector \a a
-///    is gathered; otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
-  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
-                                        (float const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4sf)(__m128)(mask), (s)))
-
-/// Conditionally gathers four 32-bit integer values, either from the
-///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
-///    of [4 x i32] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*32
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
-///                                  __m128i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDD instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [4 x i32] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
-                                     (int const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v4si)(__m128i)(mask), (s)))
-
-/// Conditionally gathers eight 32-bit integer values, either from the
-///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
-///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
-///    of [8 x i32] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 7
-///   j := element*32
-///   k := element*32
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
-///                                     __m256i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDD instruction.
-///
-/// \param a
-///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
-/// \param mask
-///    A 256-bit vector of [8 x i32] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
-#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
-  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
-                                        (int const *)(m), \
-                                        (__v8si)(__m256i)(i), \
-                                        (__v8si)(__m256i)(mask), (s)))
-
-/// Conditionally gathers two 32-bit integer values, either from the
-///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
-///    of [4 x i32] in \a mask determines the source for the lower two
-///    elements. The upper two elements of the result are zeroed.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*32
-///   k := element*64
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// result[127:64] := 0
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
-///                                  __m128i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQD instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
-///   zero. Only the first two elements are used.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [4 x i32] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory. Only the first two elements
-///    are used.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
-                                     (int const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v4si)(__m128i)(mask), (s)))
-
-/// Conditionally gathers four 32-bit integer values, either from the
-///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
-///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
-///    of [4 x i32] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*64
-///   IF mask[j+31] == 0
-///     result[j+31:j] := a[j+31:j]
-///   ELSE
-///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
-///                                     __m128i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQD instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [4 x i32] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
-                                        (int const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4si)(__m128i)(mask), (s)))
-
-/// Conditionally gathers two 64-bit integer values, either from the
-///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
-///    of [2 x i64] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*32
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
-///                                  __m128i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
-///    the first two elements are used.
-/// \param mask
-///    A 128-bit vector of [2 x i64] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
-                                     (long long const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2di)(__m128i)(mask), (s)))
-
-/// Conditionally gathers four 64-bit integer values, either from the
-///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
-///    of [4 x i64] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*32
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
-///                                     __m128i i, __m256i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
-///
-/// \param a
-///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param mask
-///    A 256-bit vector of [4 x i64] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
-  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
-                                        (long long const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4di)(__m256i)(mask), (s)))
-
-/// Conditionally gathers two 64-bit integer values, either from the
-///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
-///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
-///    of [2 x i64] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*64
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
-///                                  __m128i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
-/// \param mask
-///    A 128-bit vector of [2 x i64] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
-                                     (long long const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2di)(__m128i)(mask), (s)))
-
-/// Conditionally gathers four 64-bit integer values, either from the
-///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
-///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
-///    of [4 x i64] in \a mask determines the source for each element.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*64
-///   IF mask[j+63] == 0
-///     result[j+63:j] := a[j+63:j]
-///   ELSE
-///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-///   FI
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
-///                                     __m256i i, __m256i mask, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
-///
-/// \param a
-///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
-///    zero.
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param mask
-///    A 256-bit vector of [4 x i64] containing the mask. The most significant
-///    bit of each element in the mask vector represents the mask bits. If a
-///    mask bit is zero, the corresponding value from vector \a a is gathered;
-///    otherwise the value is loaded from memory.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
-  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
-                                        (long long const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4di)(__m256i)(mask), (s)))
-
-/// Gathers two 64-bit floating-point values from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*32
-///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
-///    the first two elements are used.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_i32gather_pd(m, i, s) \
-  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
-                                      (double const *)(m), \
-                                      (__v4si)(__m128i)(i), \
-                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                           _mm_setzero_pd()), \
-                                      (s)))
-
-/// Gathers four 64-bit floating-point values from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*32
-///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_i32gather_pd(m, i, s) \
-  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
-                                         (double const *)(m), \
-                                         (__v4si)(__m128i)(i), \
-                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                               _mm256_setzero_pd(), \
-                                                               _CMP_EQ_OQ), \
-                                         (s)))
-
-/// Gathers two 64-bit floating-point values from memory \a m using scaled
-///    indexes from the 128-bit vector of [2 x i64] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*64
-///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_i64gather_pd(m, i, s) \
-  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
-                                      (double const *)(m), \
-                                      (__v2di)(__m128i)(i), \
-                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                           _mm_setzero_pd()), \
-                                      (s)))
-
-/// Gathers four 64-bit floating-point values from memory \a m using scaled
-///    indexes from the 256-bit vector of [4 x i64] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*64
-///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_i64gather_pd(m, i, s) \
-  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
-                                         (double const *)(m), \
-                                         (__v4di)(__m256i)(i), \
-                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                               _mm256_setzero_pd(), \
-                                                               _CMP_EQ_OQ), \
-                                         (s)))
-
-/// Gathers four 32-bit floating-point values from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*32
-///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPS instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_i32gather_ps(m, i, s) \
-  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
-                                     (float const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                          _mm_setzero_ps()), \
-                                     (s)))
-
-/// Gathers eight 32-bit floating-point values from memory \a m using scaled
-///    indexes from the 256-bit vector of [8 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 7
-///   j := element*32
-///   k := element*32
-///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERDPS instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [8 x float] containing the gathered values.
-#define _mm256_i32gather_ps(m, i, s) \
-  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
-                                        (float const *)(m), \
-                                        (__v8si)(__m256i)(i), \
-                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
-                                                              _mm256_setzero_ps(), \
-                                                              _CMP_EQ_OQ), \
-                                        (s)))
-
-/// Gathers two 32-bit floating-point values from memory \a m using scaled
-///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
-///    elements of the result are zeroed.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*32
-///   k := element*64
-///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
-/// ENDFOR
-/// result[127:64] := 0
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPS instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_i64gather_ps(m, i, s) \
-  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
-                                     (float const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                          _mm_setzero_ps()), \
-                                     (s)))
-
-/// Gathers four 32-bit floating-point values from memory \a m using scaled
-///    indexes from the 256-bit vector of [4 x i64] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*64
-///   result[j+31:j] := Load32(m + SignExtend(i[k+64:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VGATHERQPS instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm256_i64gather_ps(m, i, s) \
-  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
-                                        (float const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                             _mm_setzero_ps()), \
-                                        (s)))
-
-/// Gathers four 32-bit floating-point values from memory \a m using scaled
-///    indexes from the 128-bit vector of [4 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*32
-///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_i32gather_epi32(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
-                                     (int const *)(m), (__v4si)(__m128i)(i), \
-                                     (__v4si)_mm_set1_epi32(-1), (s)))
-
-/// Gathers eight 32-bit floating-point values from memory \a m using scaled
-///    indexes from the 256-bit vector of [8 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 7
-///   j := element*32
-///   k := element*32
-///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
-#define _mm256_i32gather_epi32(m, i, s) \
-  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
-                                        (int const *)(m), (__v8si)(__m256i)(i), \
-                                        (__v8si)_mm256_set1_epi32(-1), (s)))
-
-/// Gathers two 32-bit integer values from memory \a m using scaled indexes
-///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
-///    of the result are zeroed.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*32
-///   k := element*64
-///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
-/// ENDFOR
-/// result[127:64] := 0
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_i64gather_epi32(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
-                                     (int const *)(m), (__v2di)(__m128i)(i), \
-                                     (__v4si)_mm_set1_epi32(-1), (s)))
-
-/// Gathers four 32-bit integer values from memory \a m using scaled indexes
-///    from the 256-bit vector of [4 x i64] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*32
-///   k := element*64
-///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQD instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm256_i64gather_epi32(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
-                                        (int const *)(m), (__v4di)(__m256i)(i), \
-                                        (__v4si)_mm_set1_epi32(-1), (s)))
-
-/// Gathers two 64-bit integer values from memory \a m using scaled indexes
-///    from the 128-bit vector of [4 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*32
-///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
-///    the first two elements are used.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_i32gather_epi64(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
-                                     (long long const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2di)_mm_set1_epi64x(-1), (s)))
-
-/// Gathers four 64-bit integer values from memory \a m using scaled indexes
-///    from the 128-bit vector of [4 x i32] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*32
-///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_i32gather_epi64(m, i, s) \
-  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
-                                        (long long const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
-
-/// Gathers two 64-bit integer values from memory \a m using scaled indexes
-///    from the 128-bit vector of [2 x i64] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 1
-///   j := element*64
-///   k := element*64
-///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_i64gather_epi64(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
-                                     (long long const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2di)_mm_set1_epi64x(-1), (s)))
-
-/// Gathers four 64-bit integer values from memory \a m using scaled indexes
-///    from the 256-bit vector of [4 x i64] in \a i.
-///
-/// \code{.operation}
-/// FOR element := 0 to 3
-///   j := element*64
-///   k := element*64
-///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
-///
-/// \param m
-///    A pointer to the memory used for loading values.
-/// \param i
-///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
-/// \param s
-///    A literal constant scale factor for the indexes in \a i. Must be
-///    1, 2, 4, or 8.
-/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_i64gather_epi64(m, i, s) \
-  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
-                                        (long long const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
-
-#undef __DEFAULT_FN_ATTRS256
-#undef __DEFAULT_FN_ATTRS128
-
-#endif /* __AVX2INTRIN_H */
diff --git a/third_party/intel/clang/avx512bf16intrin.h b/third_party/intel/clang/avx512bf16intrin.h
deleted file mode 100644
index b28d2e243..000000000
--- a/third_party/intel/clang/avx512bf16intrin.h
+++ /dev/null
@@ -1,283 +0,0 @@
-/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifdef __SSE2__
-
-#ifndef __AVX512BF16INTRIN_H
-#define __AVX512BF16INTRIN_H
-
-typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
-typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
-typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
-
-#define __DEFAULT_FN_ATTRS512 \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
-                 __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512bf16,no-evex512")))
-
-/// Convert One BF16 Data to One Single Float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __A
-///    A bfloat data.
-/// \returns A float data whose sign field and exponent field keep unchanged,
-///    and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
-  return __builtin_ia32_cvtsbf162ss_32(__A);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [16 x float].
-/// \param __B
-///    A 512-bit vector of [16 x float].
-/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
-///    conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
-_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
-                                                    (__v16sf) __B);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [16 x float].
-/// \param __B
-///    A 512-bit vector of [16 x float].
-/// \param __W
-///    A 512-bit vector of [32 x bfloat].
-/// \param __U
-///    A 32-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A or __B. A 0 means element from __W.
-/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
-///    conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
-                                        (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
-                                        (__v32bf)__W);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [16 x float].
-/// \param __B
-///    A 512-bit vector of [16 x float].
-/// \param __U
-///    A 32-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A or __B. A 0 means element is zero.
-/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
-///    conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
-                                        (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
-                                        (__v32bf)_mm512_setzero_si512());
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [16 x float].
-/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
-_mm512_cvtneps_pbh(__m512 __A) {
-  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                              (__v16bf)_mm256_undefined_si256(),
-                                              (__mmask16)-1);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [16 x float].
-/// \param __W
-///    A 256-bit vector of [16 x bfloat].
-/// \param __U
-///    A 16-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A. A 0 means element from __W.
-/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
-  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                                        (__v16bf)__W,
-                                                        (__mmask16)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [16 x float].
-/// \param __U
-///    A 16-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A. A 0 means element is zero.
-/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
-  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                                (__v16bf)_mm256_setzero_si256(),
-                                                (__mmask16)__U);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [32 x bfloat].
-/// \param __B
-///    A 512-bit vector of [32 x bfloat].
-/// \param __D
-///    A 512-bit vector of [16 x float].
-/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
-  return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
-                                             (__v32bf) __A,
-                                             (__v32bf) __B);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [32 x bfloat].
-/// \param __B
-///    A 512-bit vector of [32 x bfloat].
-/// \param __D
-///    A 512-bit vector of [16 x float].
-/// \param __U
-///    A 16-bit mask value specifying what is chosen for each element.
-///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
-/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                       (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
-                                       (__v16sf)__D);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 512-bit vector of [32 x bfloat].
-/// \param __B
-///    A 512-bit vector of [32 x bfloat].
-/// \param __D
-///    A 512-bit vector of [16 x float].
-/// \param __U
-///    A 16-bit mask value specifying what is chosen for each element.
-///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
-/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                       (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
-                                       (__v16sf)_mm512_setzero_si512());
-}
-
-/// Convert Packed BF16 Data to Packed float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __A
-///    A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
-  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
-      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __U
-///    A 16-bit mask. Elements are zeroed out when the corresponding mask
-///    bit is not set.
-/// \param __A
-///    A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
-  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
-      (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using merging mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __S
-///    A 512-bit vector of [16 x float]. Elements are copied from __S when
-///     the corresponding mask bit is not set.
-/// \param __U
-///    A 16-bit mask.
-/// \param __A
-///    A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
-  return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
-      (__m512i)__S, (__mmask16)__U,
-      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
-}
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS512
-
-#endif
-#endif
diff --git a/third_party/intel/clang/avx512bitalgintrin.h b/third_party/intel/clang/avx512bitalgintrin.h
deleted file mode 100644
index bad265ceb..000000000
--- a/third_party/intel/clang/avx512bitalgintrin.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512BITALGINTRIN_H
-#define __AVX512BITALGINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512bitalg,evex512"),                           \
-                 __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_popcnt_epi16(__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
-              (__v32hi) _mm512_popcnt_epi16(__B),
-              (__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
-{
-  return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
-              __U,
-              __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_popcnt_epi8(__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
-              (__v64qi) _mm512_popcnt_epi8(__B),
-              (__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
-{
-  return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
-              __U,
-              __B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
-              (__v64qi) __B,
-              __U);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
-{
-  return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
-              __A,
-              __B);
-}
-
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512bwintrin.h b/third_party/intel/clang/avx512bwintrin.h
deleted file mode 100644
index c854720de..000000000
--- a/third_party/intel/clang/avx512bwintrin.h
+++ /dev/null
@@ -1,2014 +0,0 @@
-/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512BWINTRIN_H
-#define __AVX512BWINTRIN_H
-
-typedef unsigned int __mmask32;
-typedef unsigned long long __mmask64;
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512bw,evex512"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512bw,no-evex512")))
-
-static __inline __mmask32 __DEFAULT_FN_ATTRS
-_knot_mask32(__mmask32 __M)
-{
-  return __builtin_ia32_knotsi(__M);
-}
-
-static __inline __mmask64 __DEFAULT_FN_ATTRS _knot_mask64(__mmask64 __M) {
-  return __builtin_ia32_knotdi(__M);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kand_mask32(__mmask32 __A, __mmask32 __B)
-{
-  return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kand_mask64(__mmask64 __A,
-                                                            __mmask64 __B) {
-  return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kandn_mask32(__mmask32 __A, __mmask32 __B)
-{
-  return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kandn_mask64(__mmask64 __A,
-                                                             __mmask64 __B) {
-  return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kor_mask32(__mmask32 __A, __mmask32 __B)
-{
-  return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kor_mask64(__mmask64 __A,
-                                                           __mmask64 __B) {
-  return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kxnor_mask32(__mmask32 __A, __mmask32 __B)
-{
-  return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kxnor_mask64(__mmask64 __A,
-                                                             __mmask64 __B) {
-  return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kxor_mask32(__mmask32 __A, __mmask32 __B)
-{
-  return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kxor_mask64(__mmask64 __A,
-                                                            __mmask64 __B) {
-  return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
-  return (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
-  return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
-  return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
-  return (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
-  return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
-  return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
-  return (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
-  return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
-  return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
-  return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
-  return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
-  return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kadd_mask32(__mmask32 __A, __mmask32 __B)
-{
-  return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kadd_mask64(__mmask64 __A,
-                                                            __mmask64 __B) {
-  return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B);
-}
-
-#define _kshiftli_mask32(A, I) \
-  ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask32(A, I) \
-  ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)))
-
-#define _kshiftli_mask64(A, I) \
-  ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask64(A, I) \
-  ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_cvtmask32_u32(__mmask32 __A) {
-  return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_cvtmask64_u64(__mmask64 __A) {
-  return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_cvtu32_mask32(unsigned int __A) {
-  return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_cvtu64_mask64(unsigned long long __A) {
-  return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_load_mask32(__mmask32 *__A) {
-  return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _load_mask64(__mmask64 *__A) {
-  return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask32(__mmask32 *__A, __mmask32 __B) {
-  *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _store_mask64(__mmask64 *__A,
-                                                        __mmask64 __B) {
-  *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B);
-}
-
-/* Integer compare */
-
-#define _mm512_cmp_epi8_mask(a, b, p) \
-  ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
-                                          (__v64qi)(__m512i)(b), (int)(p), \
-                                          (__mmask64)-1))
-
-#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
-  ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
-                                          (__v64qi)(__m512i)(b), (int)(p), \
-                                          (__mmask64)(m)))
-
-#define _mm512_cmp_epu8_mask(a, b, p) \
-  ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
-                                           (__v64qi)(__m512i)(b), (int)(p), \
-                                           (__mmask64)-1))
-
-#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
-  ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
-                                           (__v64qi)(__m512i)(b), (int)(p), \
-                                           (__mmask64)(m)))
-
-#define _mm512_cmp_epi16_mask(a, b, p) \
-  ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
-                                          (__v32hi)(__m512i)(b), (int)(p), \
-                                          (__mmask32)-1))
-
-#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
-  ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
-                                          (__v32hi)(__m512i)(b), (int)(p), \
-                                          (__mmask32)(m)))
-
-#define _mm512_cmp_epu16_mask(a, b, p) \
-  ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
-                                           (__v32hi)(__m512i)(b), (int)(p), \
-                                           (__mmask32)-1))
-
-#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
-  ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
-                                           (__v32hi)(__m512i)(b), (int)(p), \
-                                           (__mmask32)(m)))
-
-#define _mm512_cmpeq_epi8_mask(A, B) \
-    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
-    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi8_mask(A, B) \
-    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
-    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi8_mask(A, B) \
-    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
-    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi8_mask(A, B) \
-    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi8_mask(k, A, B) \
-    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi8_mask(A, B) \
-    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
-    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi8_mask(A, B) \
-    _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
-    _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu8_mask(A, B) \
-    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
-    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu8_mask(A, B) \
-    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
-    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu8_mask(A, B) \
-    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
-    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu8_mask(A, B) \
-    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu8_mask(k, A, B) \
-    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu8_mask(A, B) \
-    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
-    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu8_mask(A, B) \
-    _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
-    _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epi16_mask(A, B) \
-    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
-    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi16_mask(A, B) \
-    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
-    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi16_mask(A, B) \
-    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
-    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi16_mask(A, B) \
-    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi16_mask(k, A, B) \
-    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi16_mask(A, B) \
-    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
-    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi16_mask(A, B) \
-    _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
-    _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu16_mask(A, B) \
-    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
-    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu16_mask(A, B) \
-    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
-    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu16_mask(A, B) \
-    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
-    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu16_mask(A, B) \
-    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu16_mask(k, A, B) \
-    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu16_mask(A, B) \
-    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
-    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu16_mask(A, B) \
-    _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
-    _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi8 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v64qu) __A + (__v64qu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                             (__v64qi)_mm512_add_epi8(__A, __B),
-                                             (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                             (__v64qi)_mm512_add_epi8(__A, __B),
-                                             (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi8 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v64qu) __A - (__v64qu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                             (__v64qi)_mm512_sub_epi8(__A, __B),
-                                             (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                             (__v64qi)_mm512_sub_epi8(__A, __B),
-                                             (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v32hu) __A + (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_add_epi16(__A, __B),
-                                             (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_add_epi16(__A, __B),
-                                             (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v32hu) __A - (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_sub_epi16(__A, __B),
-                                             (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_sub_epi16(__A, __B),
-                                             (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v32hu) __A * (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
-                                             (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
-                                             (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
-{
-  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
-              (__v64qi) __W,
-              (__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
-{
-  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
-              (__v32hi) __W,
-              (__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi8 (__m512i __A)
-{
-  return (__m512i)__builtin_elementwise_abs((__v64qs)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                             (__v64qi)_mm512_abs_epi8(__A),
-                                             (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                             (__v64qi)_mm512_abs_epi8(__A),
-                                             (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi16 (__m512i __A)
-{
-  return (__m512i)__builtin_elementwise_abs((__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_abs_epi16(__A),
-                                             (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_abs_epi16(__A),
-                                             (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                       (__v32hi)_mm512_packs_epi32(__A, __B),
-                                       (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                       (__v32hi)_mm512_packs_epi32(__A, __B),
-                                       (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                        (__v64qi)_mm512_packs_epi16(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                        (__v64qi)_mm512_packs_epi16(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                       (__v32hi)_mm512_packus_epi32(__A, __B),
-                                       (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                       (__v32hi)_mm512_packus_epi32(__A, __B),
-                                       (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                        (__v64qi)_mm512_packus_epi16(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                        (__v64qi)_mm512_packus_epi16(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epi8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_adds_epi8(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_adds_epi8(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epi16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_adds_epi16(__A, __B),
-                                        (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_adds_epi16(__A, __B),
-                                        (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epu8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_adds_epu8(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_adds_epu8(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epu16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_adds_epu16(__A, __B),
-                                        (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_adds_epu16(__A, __B),
-                                        (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_avg_epu8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
-          __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-              (__v64qi)_mm512_avg_epu8(__A, __B),
-              (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-              (__v64qi)_mm512_avg_epu8(__A, __B),
-              (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_avg_epu16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
-           __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-              (__v32hi)_mm512_avg_epu16(__A, __B),
-              (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-              (__v32hi)_mm512_avg_epu16(__A, __B),
-              (__v32hi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epi8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_max_epi8(__A, __B),
-                                             (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_max_epi8(__A, __B),
-                                             (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epi16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_max_epi16(__A, __B),
-                                            (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
-           __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_max_epi16(__A, __B),
-                                            (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_max_epu8(__A, __B),
-                                             (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_max_epu8(__A, __B),
-                                             (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_max_epu16(__A, __B),
-                                            (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_max_epu16(__A, __B),
-                                            (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epi8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_min_epi8(__A, __B),
-                                             (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_min_epi8(__A, __B),
-                                             (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epi16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_min_epi16(__A, __B),
-                                            (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_min_epi16(__A, __B),
-                                            (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_min_epu8(__A, __B),
-                                             (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                             (__v64qi)_mm512_min_epu8(__A, __B),
-                                             (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_min_epu16(__A, __B),
-                                            (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                            (__v32hi)_mm512_min_epu16(__A, __B),
-                                            (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_shuffle_epi8(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
-                                         (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
-                                         (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epi8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_subs_epi8(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_subs_epi8(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epi16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_subs_epi16(__A, __B),
-                                        (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_subs_epi16(__A, __B),
-                                        (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epu8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_subs_epu8(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_subs_epu8(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epu16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_subs_epu16(__A, __B),
-                                        (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                        (__v32hi)_mm512_subs_epu16(__A, __B),
-                                        (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
-                                                 (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
-                               __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__U,
-                              (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
-                              (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
-                                __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__U,
-                              (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
-                              (__v32hi)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__U,
-                              (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
-                              (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mulhrs_epi16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_mulhrs_epi16(__A, __B),
-                                         (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_mulhrs_epi16(__A, __B),
-                                         (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mulhi_epi16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-       __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_mulhi_epi16(__A, __B),
-                                          (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_mulhi_epi16(__A, __B),
-                                          (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mulhi_epu16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_mulhi_epu16(__A, __B),
-                                          (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_mulhi_epu16(__A, __B),
-                                          (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
-  return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
-                          __m512i __Y) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
-                                        (__v32hi)_mm512_maddubs_epi16(__X, __Y),
-                                        (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
-                                        (__v32hi)_mm512_maddubs_epi16(__X, __Y),
-                                        (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_madd_epi16(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_madd_epi16(__A, __B),
-                                           (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_madd_epi16(__A, __B),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi16_epi8 (__m512i __A) {
-  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
-               (__v32qi)_mm256_setzero_si256(),
-               (__mmask32) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
-  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
-               (__v32qi)__O,
-               __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
-  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
-               (__v32qi) _mm256_setzero_si256(),
-               __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi16_epi8 (__m512i __A) {
-  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
-                (__v32qi) _mm256_setzero_si256(),
-                (__mmask32) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
-  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
-                (__v32qi) __O,
-                __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
-  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
-                (__v32qi) _mm256_setzero_si256(),
-                __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_epi8 (__m512i __A) {
-  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
-              (__v32qi) _mm256_undefined_si256(),
-              (__mmask32) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
-  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
-              (__v32qi) __O,
-              __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
-  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
-              (__v32qi) _mm256_setzero_si256(),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
-{
-  __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
-{
-  __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
-{
-  __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
-                                          8,  64+8,   9, 64+9,
-                                          10, 64+10, 11, 64+11,
-                                          12, 64+12, 13, 64+13,
-                                          14, 64+14, 15, 64+15,
-                                          24, 64+24, 25, 64+25,
-                                          26, 64+26, 27, 64+27,
-                                          28, 64+28, 29, 64+29,
-                                          30, 64+30, 31, 64+31,
-                                          40, 64+40, 41, 64+41,
-                                          42, 64+42, 43, 64+43,
-                                          44, 64+44, 45, 64+45,
-                                          46, 64+46, 47, 64+47,
-                                          56, 64+56, 57, 64+57,
-                                          58, 64+58, 59, 64+59,
-                                          60, 64+60, 61, 64+61,
-                                          62, 64+62, 63, 64+63);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
-                                          4,  32+4,   5, 32+5,
-                                          6,  32+6,   7, 32+7,
-                                          12, 32+12, 13, 32+13,
-                                          14, 32+14, 15, 32+15,
-                                          20, 32+20, 21, 32+21,
-                                          22, 32+22, 23, 32+23,
-                                          28, 32+28, 29, 32+29,
-                                          30, 32+30, 31, 32+31);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
-                                       (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
-                                       (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
-                                          0,  64+0,   1, 64+1,
-                                          2,  64+2,   3, 64+3,
-                                          4,  64+4,   5, 64+5,
-                                          6,  64+6,   7, 64+7,
-                                          16, 64+16, 17, 64+17,
-                                          18, 64+18, 19, 64+19,
-                                          20, 64+20, 21, 64+21,
-                                          22, 64+22, 23, 64+23,
-                                          32, 64+32, 33, 64+33,
-                                          34, 64+34, 35, 64+35,
-                                          36, 64+36, 37, 64+37,
-                                          38, 64+38, 39, 64+39,
-                                          48, 64+48, 49, 64+49,
-                                          50, 64+50, 51, 64+51,
-                                          52, 64+52, 53, 64+53,
-                                          54, 64+54, 55, 64+55);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
-                                        (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
-                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
-                                        (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
-                                          0,  32+0,   1, 32+1,
-                                          2,  32+2,   3, 32+3,
-                                          8,  32+8,   9, 32+9,
-                                          10, 32+10, 11, 32+11,
-                                          16, 32+16, 17, 32+17,
-                                          18, 32+18, 19, 32+19,
-                                          24, 32+24, 25, 32+25,
-                                          26, 32+26, 27, 32+27);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
-                                       (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
-                                       (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi16(__m256i __A)
-{
-  /* This function always performs a signed extension, but __v32qi is a char
-     which may be signed or unsigned, so use __v32qs. */
-  return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
-                                             (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
-                                             (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu8_epi16(__m256i __A)
-{
-  return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
-                                             (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
-                                             (__v32hi)_mm512_setzero_si512());
-}
-
-
-#define _mm512_shufflehi_epi16(A, imm) \
-  ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)))
-
-#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                       (__v32hi)_mm512_shufflehi_epi16((A), \
-                                                                       (imm)), \
-                                       (__v32hi)(__m512i)(W)))
-
-#define _mm512_maskz_shufflehi_epi16(U, A, imm) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                       (__v32hi)_mm512_shufflehi_epi16((A), \
-                                                                       (imm)), \
-                                       (__v32hi)_mm512_setzero_si512()))
-
-#define _mm512_shufflelo_epi16(A, imm) \
-  ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)))
-
-
-#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                       (__v32hi)_mm512_shufflelo_epi16((A), \
-                                                                       (imm)), \
-                                       (__v32hi)(__m512i)(W)))
-
-
-#define _mm512_maskz_shufflelo_epi16(U, A, imm) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                       (__v32hi)_mm512_shufflelo_epi16((A), \
-                                                                       (imm)), \
-                                       (__v32hi)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sllv_epi16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
-                                           (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
-                                           (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sll_epi16(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_sll_epi16(__A, __B),
-                                          (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_sll_epi16(__A, __B),
-                                          (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi16(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-                       unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_slli_epi16(__A, __B),
-                                         (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_slli_epi16(__A, __B),
-                                         (__v32hi)_mm512_setzero_si512());
-}
-
-#define _mm512_bslli_epi128(a, imm) \
-  ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srlv_epi16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
-                                           (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
-                                           (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srav_epi16(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                           (__v32hi)_mm512_srav_epi16(__A, __B),
-                                           (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                           (__v32hi)_mm512_srav_epi16(__A, __B),
-                                           (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sra_epi16(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_sra_epi16(__A, __B),
-                                          (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_sra_epi16(__A, __B),
-                                          (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi16(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-                       unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_srai_epi16(__A, __B),
-                                         (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_srai_epi16(__A, __B),
-                                         (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srl_epi16(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_srl_epi16(__A, __B),
-                                          (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                          (__v32hi)_mm512_srl_epi16(__A, __B),
-                                          (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi16(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-                       unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_srli_epi16(__A, __B),
-                                         (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-                                         (__v32hi)_mm512_srli_epi16(__A, (unsigned int)__B),
-                                         (__v32hi)_mm512_setzero_si512());
-}
-
-#define _mm512_bsrli_epi128(a, imm) \
-  ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
-                (__v32hi) __A,
-                (__v32hi) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
-                (__v32hi) __A,
-                (__v32hi) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
-                (__v64qi) __A,
-                (__v64qi) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
-                (__v64qi) __A,
-                (__v64qi) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
-{
-  return (__m512i) __builtin_ia32_selectb_512(__M,
-                                              (__v64qi)_mm512_set1_epi8(__A),
-                                              (__v64qi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
-{
-  return (__m512i) __builtin_ia32_selectb_512(__M,
-                                              (__v64qi) _mm512_set1_epi8(__A),
-                                              (__v64qi) _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd(__mmask64 __A,
-                                                               __mmask64 __B) {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-                (__mmask64) __B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
-{
-  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
-                (__mmask32) __B);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi16 (void const *__P)
-{
-  struct __loadu_epi16 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi16*)__P)->__v;
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
-                 (__v32hi) __W,
-                 (__mmask32) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
-                 (__v32hi)
-                 _mm512_setzero_si512 (),
-                 (__mmask32) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi8 (void const *__P)
-{
-  struct __loadu_epi8 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi8*)__P)->__v;
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
-                 (__v64qi) __W,
-                 (__mmask64) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
-                 (__v64qi)
-                 _mm512_setzero_si512 (),
-                 (__mmask64) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi16 (void *__P, __m512i __A)
-{
-  struct __storeu_epi16 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi16*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
-{
-  __builtin_ia32_storedquhi512_mask ((__v32hi *) __P,
-             (__v32hi) __A,
-             (__mmask32) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi8 (void *__P, __m512i __A)
-{
-  struct __storeu_epi8 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi8*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
-{
-  __builtin_ia32_storedquqi512_mask ((__v64qi *) __P,
-             (__v64qi) __A,
-             (__mmask64) __U);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_test_epi8_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B),
-                                  _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
-                                       _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_test_epi16_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B),
-                                   _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
-                                        _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
-                                      _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B),
-                                  _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
-                                       _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_movepi8_mask (__m512i __A)
-{
-  return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_movepi16_mask (__m512i __A)
-{
-  return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi8 (__mmask64 __A)
-{
-  return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi16 (__mmask32 __A)
-{
-  return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastb_epi8 (__m128i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectb_512(__M,
-                                             (__v64qi) _mm512_broadcastb_epi8(__A),
-                                             (__v64qi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectb_512(__M,
-                                             (__v64qi) _mm512_broadcastb_epi8(__A),
-                                             (__v64qi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
-{
-  return (__m512i) __builtin_ia32_selectw_512(__M,
-                                              (__v32hi) _mm512_set1_epi16(__A),
-                                              (__v32hi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
-{
-  return (__m512i) __builtin_ia32_selectw_512(__M,
-                                              (__v32hi) _mm512_set1_epi16(__A),
-                                              (__v32hi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastw_epi16 (__m128i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__M,
-                                             (__v32hi) _mm512_broadcastw_epi16(__A),
-                                             (__v32hi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__M,
-                                             (__v32hi) _mm512_broadcastw_epi16(__A),
-                                             (__v32hi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
-        __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                    (__v32hi)_mm512_permutexvar_epi16(__A, __B),
-                                    (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
-             __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
-                                    (__v32hi)_mm512_permutexvar_epi16(__A, __B),
-                                    (__v32hi)__W);
-}
-
-#define _mm512_alignr_epi8(A, B, N) \
-  ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
-                                      (__v64qi)(__m512i)(B), (int)(N)))
-
-#define _mm512_mask_alignr_epi8(W, U, A, B, N) \
-  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
-                              (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
-                              (__v64qi)(__m512i)(W)))
-
-#define _mm512_maskz_alignr_epi8(U, A, B, N) \
-  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
-                              (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
-                              (__v64qi)(__m512i)_mm512_setzero_si512()))
-
-#define _mm512_dbsad_epu8(A, B, imm) \
-  ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
-                                       (__v64qi)(__m512i)(B), (int)(imm)))
-
-#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                  (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
-                                  (__v32hi)(__m512i)(W)))
-
-#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                  (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
-                                  (__v32hi)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sad_epu8 (__m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
-               (__v64qi) __B);
-}
-
-#undef __DEFAULT_FN_ATTRS512
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512cdintrin.h b/third_party/intel/clang/avx512cdintrin.h
deleted file mode 100644
index 33b552f6f..000000000
--- a/third_party/intel/clang/avx512cdintrin.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512CDINTRIN_H
-#define __AVX512CDINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512cd,evex512"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_conflict_epi64 (__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_conflict_epi64(__A),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_conflict_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_conflict_epi32 (__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_conflict_epi32(__A),
-                                            (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_conflict_epi32(__A),
-                                            (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_lzcnt_epi32 (__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_lzcnt_epi32(__A),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_lzcnt_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_lzcnt_epi64 (__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_lzcnt_epi64(__A),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_lzcnt_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcastmb_epi64 (__mmask8 __A)
-{
-  return (__m512i) _mm512_set1_epi64((long long) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcastmw_epi32 (__mmask16 __A)
-{
-  return (__m512i) _mm512_set1_epi32((int) __A);
-
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512dqintrin.h b/third_party/intel/clang/avx512dqintrin.h
deleted file mode 100644
index 88b48e3a3..000000000
--- a/third_party/intel/clang/avx512dqintrin.h
+++ /dev/null
@@ -1,1379 +0,0 @@
-/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512DQINTRIN_H
-#define __AVX512DQINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq,evex512"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512dq,no-evex512")))
-
-static __inline __mmask8 __DEFAULT_FN_ATTRS
-_knot_mask8(__mmask8 __M)
-{
-  return __builtin_ia32_knotqi(__M);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kand_mask8(__mmask8 __A, __mmask8 __B)
-{
-  return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kandn_mask8(__mmask8 __A, __mmask8 __B)
-{
-  return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kor_mask8(__mmask8 __A, __mmask8 __B)
-{
-  return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kxnor_mask8(__mmask8 __A, __mmask8 __B)
-{
-  return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kxor_mask8(__mmask8 __A, __mmask8 __B)
-{
-  return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
-  return (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
-  return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
-  return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
-  return (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
-  return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
-  return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
-  return (unsigned char)__builtin_ia32_ktestchi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
-  return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B);
-  return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kadd_mask8(__mmask8 __A, __mmask8 __B)
-{
-  return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_kadd_mask16(__mmask16 __A, __mmask16 __B)
-{
-  return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B);
-}
-
-#define _kshiftli_mask8(A, I) \
-  ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask8(A, I) \
-  ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_cvtmask8_u32(__mmask8 __A) {
-  return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_cvtu32_mask8(unsigned int __A) {
-  return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_load_mask8(__mmask8 *__A) {
-  return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask8(__mmask8 *__A, __mmask8 __B) {
-  *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v8du) __A * (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_mullo_epi64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_mullo_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_xor_pd(__m512d __A, __m512d __B) {
-  return (__m512d)((__v8du)__A ^ (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_xor_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_xor_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_xor_ps (__m512 __A, __m512 __B) {
-  return (__m512)((__v16su)__A ^ (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_xor_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_xor_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_or_pd(__m512d __A, __m512d __B) {
-  return (__m512d)((__v8du)__A | (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_or_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_or_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_or_ps(__m512 __A, __m512 __B) {
-  return (__m512)((__v16su)__A | (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_or_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_or_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_and_pd(__m512d __A, __m512d __B) {
-  return (__m512d)((__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_and_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_and_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_and_ps(__m512 __A, __m512 __B) {
-  return (__m512)((__v16su)__A & (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_and_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_and_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_andnot_pd(__m512d __A, __m512d __B) {
-  return (__m512d)(~(__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_andnot_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_andnot_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_andnot_ps(__m512 __A, __m512 __B) {
-  return (__m512)(~(__v16su)__A & (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_andnot_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_andnot_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epi64 (__m512d __A) {
-  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
-                (__v8di) _mm512_setzero_si512(),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
-                (__v8di) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
-                (__v8di) _mm512_setzero_si512(),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epi64(A, R) \
-  ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
-                                            (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
-                                            (__v8di)(__m512i)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
-                                            (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epu64 (__m512d __A) {
-  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) -1,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
-                 (__v8di) __W,
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epu64(A, R) \
-  ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8di)(__m512i)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epi64 (__m256 __A) {
-  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
-                (__v8di) _mm512_setzero_si512(),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
-                (__v8di) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
-                (__v8di) _mm512_setzero_si512(),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epi64(A, R) \
-  ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
-                                            (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
-                                            (__v8di)(__m512i)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
-                                            (__v8di)_mm512_setzero_si512(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epu64 (__m256 __A) {
-  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) -1,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
-                 (__v8di) __W,
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epu64(A, R) \
-  ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
-                                             (__v8di)(__m512i)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_pd (__m512i __A) {
-  return (__m512d)__builtin_convertvector((__v8di)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_cvtepi64_pd(__A),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_cvtepi64_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_cvt_roundepi64_pd(A, R) \
-  ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
-  ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
-                                            (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
-  ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_ps (__m512i __A) {
-  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
-               (__v8sf) _mm256_setzero_ps(),
-               (__mmask8) -1,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
-  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
-               (__v8sf) __W,
-               (__mmask8) __U,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
-  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
-               (__v8sf) _mm256_setzero_ps(),
-               (__mmask8) __U,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi64_ps(A, R) \
-  ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
-                                           (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
-  ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
-                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
-  ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
-                                           (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epi64 (__m512d __A) {
-  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) -1,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
-                 (__v8di) __W,
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundpd_epi64(A, R) \
-  ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8di)(__m512i)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epu64 (__m512d __A) {
-  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
-                  (__v8di) _mm512_setzero_si512(),
-                  (__mmask8) -1,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
-                  (__v8di) __W,
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
-  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
-                  (__v8di) _mm512_setzero_si512(),
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundpd_epu64(A, R) \
-  ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
-                                              (__v8di)_mm512_setzero_si512(), \
-                                              (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
-                                              (__v8di)(__m512i)(W), \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
-                                              (__v8di)_mm512_setzero_si512(), \
-                                              (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epi64 (__m256 __A) {
-  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) -1,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
-                 (__v8di) __W,
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
-                 (__v8di) _mm512_setzero_si512(),
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundps_epi64(A, R) \
-  ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
-                                             (__v8di)(__m512i)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
-                                             (__v8di)_mm512_setzero_si512(), \
-                                             (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epu64 (__m256 __A) {
-  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
-                  (__v8di) _mm512_setzero_si512(),
-                  (__mmask8) -1,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
-                  (__v8di) __W,
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
-  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
-                  (__v8di) _mm512_setzero_si512(),
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundps_epu64(A, R) \
-  ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
-                                              (__v8di)_mm512_setzero_si512(), \
-                                              (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
-                                              (__v8di)(__m512i)(W), \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
-                                              (__v8di)_mm512_setzero_si512(), \
-                                              (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepu64_pd (__m512i __A) {
-  return (__m512d)__builtin_convertvector((__v8du)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_cvtepu64_pd(__A),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_cvtepu64_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_cvt_roundepu64_pd(A, R) \
-  ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
-                                             (__v8df)_mm512_setzero_pd(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
-  ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
-                                             (__v8df)(__m512d)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
-  ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
-                                             (__v8df)_mm512_setzero_pd(), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_cvtepu64_ps (__m512i __A) {
-  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
-                (__v8sf) _mm256_setzero_ps(),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
-  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
-                (__v8sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
-  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
-                (__v8sf) _mm256_setzero_ps(),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu64_ps(A, R) \
-  ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
-                                            (__v8sf)_mm256_setzero_ps(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \
-  ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
-                                            (__v8sf)(__m256)(W), (__mmask8)(U), \
-                                            (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \
-  ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
-                                            (__v8sf)_mm256_setzero_ps(), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_range_pd(A, B, C) \
-  ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), (int)(C), \
-                                           (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)-1, \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_range_pd(W, U, A, B, C) \
-  ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), (int)(C), \
-                                           (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_range_pd(U, A, B, C) \
-  ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), (int)(C), \
-                                           (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U), \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_range_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), (int)(C), \
-                                           (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), (int)(C), \
-                                           (__v8df)(__m512d)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
-                                           (__v8df)(__m512d)(B), (int)(C), \
-                                           (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(U), (int)(R)))
-
-#define _mm512_range_ps(A, B, C) \
-  ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), (int)(C), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)-1, \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_range_ps(W, U, A, B, C) \
-  ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), (int)(C), \
-                                          (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_range_ps(U, A, B, C) \
-  ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), (int)(C), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U), \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_range_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), (int)(C), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
-  ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), (int)(C), \
-                                          (__v16sf)(__m512)(W), (__mmask16)(U), \
-                                          (int)(R)))
-
-#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
-                                          (__v16sf)(__m512)(B), (int)(C), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(U), (int)(R)))
-
-#define _mm_range_round_ss(A, B, C, R) \
-  ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8) -1, (int)(C),\
-                                                (int)(R)))
-
-#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
-  ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)(__m128)(W),\
-                                                (__mmask8)(U), (int)(C),\
-                                                (int)(R)))
-
-#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_range_round_ss(U, A, B, C, R) \
-  ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)(U), (int)(C),\
-                                                (int)(R)))
-
-#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
-
-#define _mm_range_round_sd(A, B, C, R) \
-  ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8) -1, (int)(C),\
-                                                 (int)(R)))
-
-#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)(__m128d)(W),\
-                                                 (__mmask8)(U), (int)(C),\
-                                                 (int)(R)))
-
-#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_range_round_sd(U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)(U), (int)(C),\
-                                                 (int)(R)))
-
-#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_reduce_pd(A, B) \
-  ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)-1, \
-                                            _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_pd(W, U, A, B) \
-  ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
-                                            (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), \
-                                            _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_reduce_pd(U, A, B) \
-  ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), \
-                                            _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_reduce_ps(A, B) \
-  ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)-1, \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_ps(W, U, A, B) \
-  ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
-                                           (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_reduce_ps(U, A, B) \
-  ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_reduce_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
-                                            (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_reduce_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_reduce_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
-                                           (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_reduce_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), (int)(R)))
-
-#define _mm_reduce_ss(A, B, C) \
-  ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
-                                        (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_reduce_ss(W, U, A, B, C) \
-  ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                        (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_reduce_ss(U, A, B, C) \
-  ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)_mm_setzero_ps(), \
-                                        (__mmask8)(U), (int)(C), \
-                                        _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_reduce_round_ss(A, B, C, R) \
-  ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
-                                        (int)(C), (int)(R)))
-
-#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
-  ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                        (int)(C), (int)(R)))
-
-#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
-  ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
-                                        (__v4sf)(__m128)(B), \
-                                        (__v4sf)_mm_setzero_ps(), \
-                                        (__mmask8)(U), (int)(C), (int)(R)))
-
-#define _mm_reduce_sd(A, B, C) \
-  ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)_mm_setzero_pd(), \
-                                         (__mmask8)-1, (int)(C), \
-                                         _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_reduce_sd(W, U, A, B, C) \
-  ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                         (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_reduce_sd(U, A, B, C) \
-  ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)_mm_setzero_pd(), \
-                                         (__mmask8)(U), (int)(C), \
-                                         _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_reduce_round_sd(A, B, C, R) \
-  ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)_mm_setzero_pd(), \
-                                         (__mmask8)-1, (int)(C), (int)(R)))
-
-#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)(__m128d)(W), (__mmask8)(U), \
-                                         (int)(C), (int)(R)))
-
-#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
-                                         (__v2df)(__m128d)(B), \
-                                         (__v2df)_mm_setzero_pd(), \
-                                         (__mmask8)(U), (int)(C), (int)(R)))
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_movepi32_mask (__m512i __A)
-{
-  return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi32 (__mmask16 __A)
-{
-  return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi64 (__mmask8 __A)
-{
-  return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_movepi64_mask (__m512i __A)
-{
-  return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
-}
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x2 (__m128 __A)
-{
-  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
-                                         0, 1, 0, 1, 0, 1, 0, 1,
-                                         0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
-                                             (__v16sf)_mm512_broadcast_f32x2(__A),
-                                             (__v16sf)__O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
-                                             (__v16sf)_mm512_broadcast_f32x2(__A),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x8(__m256 __A)
-{
-  return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
-                                         0, 1, 2, 3, 4, 5, 6, 7,
-                                         0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
-                                           (__v16sf)_mm512_broadcast_f32x8(__A),
-                                           (__v16sf)__O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
-                                           (__v16sf)_mm512_broadcast_f32x8(__A),
-                                           (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f64x2(__m128d __A)
-{
-  return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
-                                          0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
-                                            (__v8df)_mm512_broadcast_f64x2(__A),
-                                            (__v8df)__O);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
-                                            (__v8df)_mm512_broadcast_f64x2(__A),
-                                            (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x2 (__m128i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
-                                          0, 1, 0, 1, 0, 1, 0, 1,
-                                          0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_broadcast_i32x2(__A),
-                                             (__v16si)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_broadcast_i32x2(__A),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x8(__m256i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
-                                          0, 1, 2, 3, 4, 5, 6, 7,
-                                          0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                           (__v16si)_mm512_broadcast_i32x8(__A),
-                                           (__v16si)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                           (__v16si)_mm512_broadcast_i32x8(__A),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i64x2(__m128i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
-                                          0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                            (__v8di)_mm512_broadcast_i64x2(__A),
-                                            (__v8di)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                            (__v8di)_mm512_broadcast_i64x2(__A),
-                                            (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_extractf32x8_ps(A, imm) \
-  ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
-                                            (__v8sf)_mm256_undefined_ps(), \
-                                            (__mmask8)-1))
-
-#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
-  ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
-                                            (__v8sf)(__m256)(W), \
-                                            (__mmask8)(U)))
-
-#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
-  ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
-                                            (__v8sf)_mm256_setzero_ps(), \
-                                            (__mmask8)(U)))
-
-#define _mm512_extractf64x2_pd(A, imm) \
-  ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
-                                                 (int)(imm), \
-                                                 (__v2df)_mm_undefined_pd(), \
-                                                 (__mmask8)-1))
-
-#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
-  ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
-                                                 (int)(imm), \
-                                                 (__v2df)(__m128d)(W), \
-                                                 (__mmask8)(U)))
-
-#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
-  ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
-                                                 (int)(imm), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)(U)))
-
-#define _mm512_extracti32x8_epi32(A, imm) \
-  ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
-                                             (__v8si)_mm256_undefined_si256(), \
-                                             (__mmask8)-1))
-
-#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
-  ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
-                                             (__v8si)(__m256i)(W), \
-                                             (__mmask8)(U)))
-
-#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
-  ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
-                                             (__v8si)_mm256_setzero_si256(), \
-                                             (__mmask8)(U)))
-
-#define _mm512_extracti64x2_epi64(A, imm) \
-  ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
-                                                (int)(imm), \
-                                                (__v2di)_mm_undefined_si128(), \
-                                                (__mmask8)-1))
-
-#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
-                                                 (int)(imm), \
-                                                 (__v2di)(__m128i)(W), \
-                                                 (__mmask8)(U)))
-
-#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
-                                                 (int)(imm), \
-                                                 (__v2di)_mm_setzero_si128(), \
-                                                 (__mmask8)(U)))
-
-#define _mm512_insertf32x8(A, B, imm) \
-  ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
-                                      (__v8sf)(__m256)(B), (int)(imm)))
-
-#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
-                                 (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_insertf32x8(U, A, B, imm) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
-                                 (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_insertf64x2(A, B, imm) \
-  ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
-                                           (__v2df)(__m128d)(B), (int)(imm)))
-
-#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
-                                  (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_insertf64x2(U, A, B, imm) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
-                                  (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_inserti32x8(A, B, imm) \
-  ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
-                                       (__v8si)(__m256i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
-                                 (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_inserti32x8(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
-                                 (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_inserti64x2(A, B, imm) \
-  ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
-                                           (__v2di)(__m128i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
-                                  (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_inserti64x2(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
-                                  (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_mask_fpclass_ps_mask(U, A, imm) \
-  ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
-                                               (int)(imm), (__mmask16)(U)))
-
-#define _mm512_fpclass_ps_mask(A, imm) \
-  ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
-                                               (int)(imm), (__mmask16)-1))
-
-#define _mm512_mask_fpclass_pd_mask(U, A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm512_fpclass_pd_mask(A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
-                                              (__mmask8)-1))
-
-#define _mm_fpclass_sd_mask(A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                           (__mmask8)-1))
-
-#define _mm_mask_fpclass_sd_mask(U, A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                           (__mmask8)(U)))
-
-#define _mm_fpclass_ss_mask(A, imm) \
-  ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                           (__mmask8)-1))
-
-#define _mm_mask_fpclass_ss_mask(U, A, imm) \
-  ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                           (__mmask8)(U)))
-
-#undef __DEFAULT_FN_ATTRS512
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512erintrin.h b/third_party/intel/clang/avx512erintrin.h
deleted file mode 100644
index 1c5a2d2d2..000000000
--- a/third_party/intel/clang/avx512erintrin.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512ERINTRIN_H
-#define __AVX512ERINTRIN_H
-
-/* exp2a23 */
-#define _mm512_exp2a23_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
-                                       (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
-  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                       (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
-  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
-                                       (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)(M), (int)(R)))
-
-#define _mm512_exp2a23_pd(A) \
-  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_pd(S, M, A) \
-  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_pd(M, A) \
-  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_exp2a23_round_ps(A, R) \
-  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
-                                      (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
-  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                      (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
-  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
-                                      (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)(M), (int)(R)))
-
-#define _mm512_exp2a23_ps(A) \
-  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_ps(S, M, A) \
-  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_ps(M, A) \
-  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-/* rsqrt28 */
-#define _mm512_rsqrt28_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
-  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                          (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
-  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)_mm512_setzero_pd(), \
-                                          (__mmask8)(M), (int)(R)))
-
-#define _mm512_rsqrt28_pd(A) \
-  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_pd(S, M, A) \
-  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_pd(M, A) \
-  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rsqrt28_round_ps(A, R) \
-  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
-  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                         (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
-  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
-                                         (__v16sf)_mm512_setzero_ps(), \
-                                         (__mmask16)(M), (int)(R)))
-
-#define _mm512_rsqrt28_ps(A) \
-  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_ps(S, M, A) \
-  _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_ps(M, A) \
-  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
-  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v4sf)(__m128)(S), \
-                                               (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
-  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_ss(A, B) \
-  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_ss(S, M, A, B) \
-  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_ss(M, A, B) \
-  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
-  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (__v2df)(__m128d)(S), \
-                                                (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
-  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_sd(A, B) \
-  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_sd(S, M, A, B) \
-  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_sd(M, A, B) \
-  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-/* rcp28 */
-#define _mm512_rcp28_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
-  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                        (int)(R)))
-
-#define _mm512_maskz_rcp28_round_pd(M, A, R) \
-  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
-                                        (__v8df)_mm512_setzero_pd(), \
-                                        (__mmask8)(M), (int)(R)))
-
-#define _mm512_rcp28_pd(A) \
-  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_pd(S, M, A) \
-  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_pd(M, A) \
-  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rcp28_round_ps(A, R) \
-  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
-  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                       (int)(R)))
-
-#define _mm512_maskz_rcp28_round_ps(M, A, R) \
-  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
-                                       (__v16sf)_mm512_setzero_ps(), \
-                                       (__mmask16)(M), (int)(R)))
-
-#define _mm512_rcp28_ps(A) \
-  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_ps(S, M, A) \
-  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_ps(M, A) \
-  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
-  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4sf)(__m128)(S), \
-                                             (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
-  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4sf)_mm_setzero_ps(), \
-                                             (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_ss(A, B) \
-  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_ss(S, M, A, B) \
-  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_ss(M, A, B) \
-  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
-  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2df)(__m128d)(S), \
-                                              (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
-  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2df)_mm_setzero_pd(), \
-                                              (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_sd(A, B) \
-  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_sd(S, M, A, B) \
-  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_sd(M, A, B) \
-  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#endif /* __AVX512ERINTRIN_H */
diff --git a/third_party/intel/clang/avx512fintrin.h b/third_party/intel/clang/avx512fintrin.h
deleted file mode 100644
index 4f172c74b..000000000
--- a/third_party/intel/clang/avx512fintrin.h
+++ /dev/null
@@ -1,9779 +0,0 @@
-/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512FINTRIN_H
-#define __AVX512FINTRIN_H
-
-typedef char __v64qi __attribute__((__vector_size__(64)));
-typedef short __v32hi __attribute__((__vector_size__(64)));
-typedef double __v8df __attribute__((__vector_size__(64)));
-typedef float __v16sf __attribute__((__vector_size__(64)));
-typedef long long __v8di __attribute__((__vector_size__(64)));
-typedef int __v16si __attribute__((__vector_size__(64)));
-
-/* Unsigned types */
-typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
-typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
-typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
-typedef unsigned int __v16su __attribute__((__vector_size__(64)));
-
-/* We need an explicitly signed variant for char. Note that this shouldn't
- * appear in the interface though. */
-typedef signed char __v64qs __attribute__((__vector_size__(64)));
-
-typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
-typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
-typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));
-
-typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));
-
-typedef unsigned char __mmask8;
-typedef unsigned short __mmask16;
-
-/* Rounding mode macros.  */
-#define _MM_FROUND_TO_NEAREST_INT   0x00
-#define _MM_FROUND_TO_NEG_INF       0x01
-#define _MM_FROUND_TO_POS_INF       0x02
-#define _MM_FROUND_TO_ZERO          0x03
-#define _MM_FROUND_CUR_DIRECTION    0x04
-
-/* Constants for integer comparison predicates */
-typedef enum {
-    _MM_CMPINT_EQ,      /* Equal */
-    _MM_CMPINT_LT,      /* Less than */
-    _MM_CMPINT_LE,      /* Less than or Equal */
-    _MM_CMPINT_UNUSED,
-    _MM_CMPINT_NE,      /* Not Equal */
-    _MM_CMPINT_NLT,     /* Not Less than */
-#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
-    _MM_CMPINT_NLE      /* Not Less than or Equal */
-#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
-} _MM_CMPINT_ENUM;
-
-typedef enum
-{
-  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
-  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
-  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
-  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
-  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
-  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
-  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
-  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
-  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
-  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
-  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
-  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
-  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
-  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
-  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
-  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
-  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
-  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
-  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
-  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
-  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
-  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
-  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
-  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
-  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
-  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
-  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
-  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
-  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
-  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
-  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
-  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
-  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
-  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
-  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
-  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
-  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
-  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
-  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
-  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
-  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
-  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
-  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
-  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
-  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
-  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
-  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
-  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
-  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
-  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
-  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
-  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
-  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
-  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
-  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
-  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
-  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
-  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
-  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
-  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
-  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
-  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
-  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
-  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
-  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
-  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
-  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
-  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
-  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
-  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
-  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
-  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
-  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
-  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
-  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
-  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
-  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
-  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
-  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
-  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
-  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
-  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
-  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
-  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
-  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
-  _MM_PERM_DDDD = 0xFF
-} _MM_PERM_ENUM;
-
-typedef enum
-{
-  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
-  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
-  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
-  _MM_MANT_NORM_p75_1p5   /* interval [0.75, 1.5) */
-} _MM_MANTISSA_NORM_ENUM;
-
-typedef enum
-{
-  _MM_MANT_SIGN_src,    /* sign = sign(SRC)     */
-  _MM_MANT_SIGN_zero,   /* sign = 0             */
-  _MM_MANT_SIGN_nan   /* DEST = NaN if sign(SRC) = 1 */
-} _MM_MANTISSA_SIGN_ENUM;
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f,evex512"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512f,no-evex512"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512f,no-evex512")))
-
-/* Create vectors with repeated elements */
-
-static  __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_setzero_si512(void)
-{
-  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
-}
-
-#define _mm512_setzero_epi32 _mm512_setzero_si512
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_undefined_pd(void)
-{
-  return (__m512d)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_undefined(void)
-{
-  return (__m512)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_undefined_ps(void)
-{
-  return (__m512)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_undefined_epi32(void)
-{
-  return (__m512i)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastd_epi32 (__m128i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__M,
-                                             (__v16si) _mm512_broadcastd_epi32(__A),
-                                             (__v16si) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__M,
-                                             (__v16si) _mm512_broadcastd_epi32(__A),
-                                             (__v16si) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastq_epi64 (__m128i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
-                                          0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                             (__v8di) _mm512_broadcastq_epi64(__A),
-                                             (__v8di) __O);
-
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                             (__v8di) _mm512_broadcastq_epi64(__A),
-                                             (__v8di) _mm512_setzero_si512());
-}
-
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_setzero_ps(void)
-{
-  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
-}
-
-#define _mm512_setzero _mm512_setzero_ps
-
-static  __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_setzero_pd(void)
-{
-  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_set1_ps(float __w)
-{
-  return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
-                                 __w, __w, __w, __w, __w, __w, __w, __w  };
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_set1_pd(double __w)
-{
-  return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi8(char __w)
-{
-  return __extension__ (__m512i)(__v64qi){
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w  };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi16(short __w)
-{
-  return __extension__ (__m512i)(__v32hi){
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w,
-    __w, __w, __w, __w, __w, __w, __w, __w };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi32(int __s)
-{
-  return __extension__ (__m512i)(__v16si){
-    __s, __s, __s, __s, __s, __s, __s, __s,
-    __s, __s, __s, __s, __s, __s, __s, __s };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__M,
-                                             (__v16si)_mm512_set1_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi64(long long __d)
-{
-  return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                             (__v8di)_mm512_set1_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcastss_ps(__m128 __A)
-{
-  return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
-                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
-{
-  return __extension__ (__m512i)(__v16si)
-   { __D, __C, __B, __A, __D, __C, __B, __A,
-     __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set4_epi64 (long long __A, long long __B, long long __C,
-       long long __D)
-{
-  return __extension__ (__m512i) (__v8di)
-   { __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_set4_pd (double __A, double __B, double __C, double __D)
-{
-  return __extension__ (__m512d)
-   { __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_set4_ps (float __A, float __B, float __C, float __D)
-{
-  return __extension__ (__m512)
-   { __D, __C, __B, __A, __D, __C, __B, __A,
-     __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-#define _mm512_setr4_epi32(e0,e1,e2,e3)               \
-  _mm512_set4_epi32((e3),(e2),(e1),(e0))
-
-#define _mm512_setr4_epi64(e0,e1,e2,e3)               \
-  _mm512_set4_epi64((e3),(e2),(e1),(e0))
-
-#define _mm512_setr4_pd(e0,e1,e2,e3)                \
-  _mm512_set4_pd((e3),(e2),(e1),(e0))
-
-#define _mm512_setr4_ps(e0,e1,e2,e3)                \
-  _mm512_set4_ps((e3),(e2),(e1),(e0))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcastsd_pd(__m128d __A)
-{
-  return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
-                                          0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/* Cast between vector types */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_castpd256_pd512(__m256d __a)
-{
-  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
-                                 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_castps256_ps512(__m256 __a)
-{
-  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
-                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-static __inline __m128d __DEFAULT_FN_ATTRS512
-_mm512_castpd512_pd128(__m512d __a)
-{
-  return __builtin_shufflevector(__a, __a, 0, 1);
-}
-
-static __inline __m256d __DEFAULT_FN_ATTRS512
-_mm512_castpd512_pd256 (__m512d __A)
-{
-  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
-}
-
-static __inline __m128 __DEFAULT_FN_ATTRS512
-_mm512_castps512_ps128(__m512 __a)
-{
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
-}
-
-static __inline __m256 __DEFAULT_FN_ATTRS512
-_mm512_castps512_ps256 (__m512 __A)
-{
-  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_castpd_ps (__m512d __A)
-{
-  return (__m512) (__A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_castpd_si512 (__m512d __A)
-{
-  return (__m512i) (__A);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_castpd128_pd512 (__m128d __A)
-{
-  __m256d __B = __builtin_nondeterministic_value(__B);
-  return __builtin_shufflevector(
-      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
-      __B, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_castps_pd (__m512 __A)
-{
-  return (__m512d) (__A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_castps_si512 (__m512 __A)
-{
-  return (__m512i) (__A);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_castps128_ps512 (__m128 __A)
-{
-  __m256 __B = __builtin_nondeterministic_value(__B);
-  return __builtin_shufflevector(
-      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
-      __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_castsi128_si512 (__m128i __A)
-{
-  __m256i __B = __builtin_nondeterministic_value(__B);
-  return __builtin_shufflevector(
-      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
-      __B, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_castsi256_si512 (__m256i __A)
-{
-   return  __builtin_shufflevector( __A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_castsi512_ps (__m512i __A)
-{
-  return (__m512) (__A);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_castsi512_pd (__m512i __A)
-{
-  return (__m512d) (__A);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS512
-_mm512_castsi512_si128 (__m512i __A)
-{
-  return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS512
-_mm512_castsi512_si256 (__m512i __A)
-{
-  return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_int2mask(int __a)
-{
-  return (__mmask16)__a;
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_mask2int(__mmask16 __a)
-{
-  return (int)__a;
-}
-
-/// Constructs a 512-bit floating-point vector of [8 x double] from a
-///    128-bit floating-point vector of [2 x double]. The lower 128 bits
-///    contain the value of the source vector. The upper 384 bits are set
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
-///    contain the value of the parameter. The upper 384 bits are set to zero.
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_zextpd128_pd512(__m128d __a)
-{
-  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
-}
-
-/// Constructs a 512-bit floating-point vector of [8 x double] from a
-///    256-bit floating-point vector of [4 x double]. The lower 256 bits
-///    contain the value of the source vector. The upper 256 bits are set
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
-///    contain the value of the parameter. The upper 256 bits are set to zero.
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_zextpd256_pd512(__m256d __a)
-{
-  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/// Constructs a 512-bit floating-point vector of [16 x float] from a
-///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
-///    the value of the source vector. The upper 384 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
-///    contain the value of the parameter. The upper 384 bits are set to zero.
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_zextps128_ps512(__m128 __a)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
-}
-
-/// Constructs a 512-bit floating-point vector of [16 x float] from a
-///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
-///    the value of the source vector. The upper 256 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
-///    contain the value of the parameter. The upper 256 bits are set to zero.
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_zextps256_ps512(__m256 __a)
-{
-  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-/// Constructs a 512-bit integer vector from a 128-bit integer vector.
-///    The lower 128 bits contain the value of the source vector. The upper
-///    384 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
-///    the parameter. The upper 384 bits are set to zero.
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_zextsi128_si512(__m128i __a)
-{
-  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
-}
-
-/// Constructs a 512-bit integer vector from a 256-bit integer vector.
-///    The lower 256 bits contain the value of the source vector. The upper
-///    256 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
-///    the parameter. The upper 256 bits are set to zero.
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_zextsi256_si512(__m256i __a)
-{
-  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/* Bitwise operators */
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_and_epi32(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v16su)__a & (__v16su)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
-                (__v16si) _mm512_and_epi32(__a, __b),
-                (__v16si) __src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
-                                         __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_and_epi64(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v8du)__a & (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
-{
-    return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
-                (__v8di) _mm512_and_epi64(__a, __b),
-                (__v8di) __src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
-                                         __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_andnot_si512 (__m512i __A, __m512i __B)
-{
-  return (__m512i)(~(__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_andnot_epi32 (__m512i __A, __m512i __B)
-{
-  return (__m512i)(~(__v16su)__A & (__v16su)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                         (__v16si)_mm512_andnot_epi32(__A, __B),
-                                         (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
-                                           __U, __A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_andnot_epi64(__m512i __A, __m512i __B)
-{
-  return (__m512i)(~(__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                          (__v8di)_mm512_andnot_epi64(__A, __B),
-                                          (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
-                                           __U, __A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_or_epi32(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v16su)__a | (__v16su)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
-                                             (__v16si)_mm512_or_epi32(__a, __b),
-                                             (__v16si)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_or_epi64(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v8du)__a | (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
-                                             (__v8di)_mm512_or_epi64(__a, __b),
-                                             (__v8di)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_xor_epi32(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v16su)__a ^ (__v16su)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
-                                            (__v16si)_mm512_xor_epi32(__a, __b),
-                                            (__v16si)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_xor_epi64(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v8du)__a ^ (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
-                                             (__v8di)_mm512_xor_epi64(__a, __b),
-                                             (__v8di)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
-{
-  return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_and_si512(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v8du)__a & (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_or_si512(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v8du)__a | (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_xor_si512(__m512i __a, __m512i __b)
-{
-  return (__m512i)((__v8du)__a ^ (__v8du)__b);
-}
-
-/* Arithmetic */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_add_pd(__m512d __a, __m512d __b)
-{
-  return (__m512d)((__v8df)__a + (__v8df)__b);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_add_ps(__m512 __a, __m512 __b)
-{
-  return (__m512)((__v16sf)__a + (__v16sf)__b);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mul_pd(__m512d __a, __m512d __b)
-{
-  return (__m512d)((__v8df)__a * (__v8df)__b);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mul_ps(__m512 __a, __m512 __b)
-{
-  return (__m512)((__v16sf)__a * (__v16sf)__b);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_sub_pd(__m512d __a, __m512d __b)
-{
-  return (__m512d)((__v8df)__a - (__v8df)__b);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_sub_ps(__m512 __a, __m512 __b)
-{
-  return (__m512)((__v16sf)__a - (__v16sf)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi64 (__m512i __A, __m512i __B)
-{
-  return (__m512i) ((__v8du) __A + (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_add_epi64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_add_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi64 (__m512i __A, __m512i __B)
-{
-  return (__m512i) ((__v8du) __A - (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_sub_epi64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_sub_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi32 (__m512i __A, __m512i __B)
-{
-  return (__m512i) ((__v16su) __A + (__v16su) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_add_epi32(__A, __B),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_add_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi32 (__m512i __A, __m512i __B)
-{
-  return (__m512i) ((__v16su) __A - (__v16su) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_sub_epi32(__A, __B),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_sub_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-#define _mm512_max_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
-                                    (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_max_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_max_round_pd((A), (B), (R)), \
-                                   (__v8df)(W)))
-
-#define _mm512_maskz_max_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_max_round_pd((A), (B), (R)), \
-                                   (__v8df)_mm512_setzero_pd()))
-
-static  __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_max_pd(__m512d __A, __m512d __B)
-{
-  return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
-                                           _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_max_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_max_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_max_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
-                                   (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_max_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
-                                  (__v16sf)(W)))
-
-#define _mm512_maskz_max_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
-                                  (__v16sf)_mm512_setzero_ps()))
-
-static  __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_max_ps(__m512 __A, __m512 __B)
-{
-  return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
-                                          _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_max_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_max_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_max_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_max_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm_maskz_max_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_max_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_max_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_max_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline __m512i
-__DEFAULT_FN_ATTRS512
-_mm512_max_epi32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_max_epi32(__A, __B),
-                                            (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_max_epi32(__A, __B),
-                                            (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_max_epu32(__A, __B),
-                                            (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_max_epu32(__A, __B),
-                                            (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epi64(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_max_epi64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_max_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu64(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_max_epu64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_max_epu64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_min_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
-                                    (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_min_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_min_round_pd((A), (B), (R)), \
-                                   (__v8df)(W)))
-
-#define _mm512_maskz_min_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_min_round_pd((A), (B), (R)), \
-                                   (__v8df)_mm512_setzero_pd()))
-
-static  __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_min_pd(__m512d __A, __m512d __B)
-{
-  return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
-                                           _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_min_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_min_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_min_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
-                                   (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_min_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
-                                  (__v16sf)(W)))
-
-#define _mm512_maskz_min_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
-                                  (__v16sf)_mm512_setzero_ps()))
-
-static  __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_min_ps(__m512 __A, __m512 __B)
-{
-  return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
-                                          _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_min_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_min_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_min_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_min_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm_maskz_min_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_min_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_min_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_min_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline __m512i
-__DEFAULT_FN_ATTRS512
-_mm512_min_epi32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_min_epi32(__A, __B),
-                                            (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_min_epi32(__A, __B),
-                                            (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_min_epu32(__A, __B),
-                                            (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                            (__v16si)_mm512_min_epu32(__A, __B),
-                                            (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epi64(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epi64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu64(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epu64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epu64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mul_epi32(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epi32(__X, __Y),
-                                             (__v8di)__W);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epi32(__X, __Y),
-                                             (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mul_epu32(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epu32(__X, __Y),
-                                             (__v8di)__W);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epu32(__X, __Y),
-                                             (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullo_epi32 (__m512i __A, __m512i __B)
-{
-  return (__m512i) ((__v16su) __A * (__v16su) __B);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_mullo_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_mullo_epi32(__A, __B),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
-  return (__m512i) ((__v8du) __A * (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_mullox_epi64(__A, __B),
-                                             (__v8di)__W);
-}
-
-#define _mm512_sqrt_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))
-
-#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
-                                       (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_sqrt_round_pd(U, A, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
-                                       (__v8df)_mm512_setzero_pd()))
-
-static  __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_sqrt_pd(__m512d __A)
-{
-  return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
-                                           _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_sqrt_pd(__A),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_sqrt_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_sqrt_round_ps(A, R) \
-  ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))
-
-#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
-                                      (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_sqrt_round_ps(U, A, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
-                                      (__v16sf)_mm512_setzero_ps()))
-
-static  __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_sqrt_ps(__m512 __A)
-{
-  return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
-                                          _MM_FROUND_CUR_DIRECTION);
-}
-
-static  __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_sqrt_ps(__A),
-                                             (__v16sf)__W);
-}
-
-static  __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_sqrt_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static  __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_rsqrt14_pd(__m512d __A)
-{
-  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
-                 (__v8df)
-                 _mm512_setzero_pd (),
-                 (__mmask8) -1);}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
-                  (__v8df) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
-                  (__v8df)
-                  _mm512_setzero_pd (),
-                  (__mmask8) __U);
-}
-
-static  __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_rsqrt14_ps(__m512 __A)
-{
-  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
-                (__v16sf)
-                _mm512_setzero_ps (),
-                (__mmask16) -1);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
-                 (__v16sf) __W,
-                 (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
-                 (__v16sf)
-                 _mm512_setzero_ps (),
-                 (__mmask16) __U);
-}
-
-static  __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_ss(__m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
-             (__v4sf) __B,
-             (__v4sf)
-             _mm_setzero_ps (),
-             (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __W,
-          (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) _mm_setzero_ps (),
-          (__mmask8) __U);
-}
-
-static  __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_sd(__m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
-              (__v2df) __B,
-              (__v2df)
-              _mm_setzero_pd (),
-              (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __W,
-          (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) _mm_setzero_pd (),
-          (__mmask8) __U);
-}
-
-static  __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_rcp14_pd(__m512d __A)
-{
-  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
-               (__v8df)
-               _mm512_setzero_pd (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
-                (__v8df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
-                (__v8df)
-                _mm512_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static  __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_rcp14_ps(__m512 __A)
-{
-  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
-              (__v16sf)
-              _mm512_setzero_ps (),
-              (__mmask16) -1);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
-                   (__v16sf) __W,
-                   (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
-                   (__v16sf)
-                   _mm512_setzero_ps (),
-                   (__mmask16) __U);
-}
-
-static  __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rcp14_ss(__m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
-                 (__v4sf) __B,
-                 (__v4sf)
-                 _mm_setzero_ps (),
-                 (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __W,
-          (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) _mm_setzero_ps (),
-          (__mmask8) __U);
-}
-
-static  __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rcp14_sd(__m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
-            (__v2df) __B,
-            (__v2df)
-            _mm_setzero_pd (),
-            (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __W,
-          (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) _mm_setzero_pd (),
-          (__mmask8) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_floor_ps(__m512 __A)
-{
-  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
-                                                  _MM_FROUND_FLOOR,
-                                                  (__v16sf) __A, (unsigned short)-1,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
-                   _MM_FROUND_FLOOR,
-                   (__v16sf) __W, __U,
-                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_floor_pd(__m512d __A)
-{
-  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
-                                                   _MM_FROUND_FLOOR,
-                                                   (__v8df) __A, (unsigned char)-1,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
-                _MM_FROUND_FLOOR,
-                (__v8df) __W, __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
-                   _MM_FROUND_CEIL,
-                   (__v16sf) __W, __U,
-                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_ceil_ps(__m512 __A)
-{
-  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
-                                                  _MM_FROUND_CEIL,
-                                                  (__v16sf) __A, (unsigned short)-1,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_ceil_pd(__m512d __A)
-{
-  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
-                                                   _MM_FROUND_CEIL,
-                                                   (__v8df) __A, (unsigned char)-1,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
-                _MM_FROUND_CEIL,
-                (__v8df) __W, __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi64(__m512i __A)
-{
-  return (__m512i)__builtin_elementwise_abs((__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_abs_epi64(__A),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_abs_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi32(__m512i __A)
-{
-  return (__m512i)__builtin_elementwise_abs((__v16si) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_abs_epi32(__A),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_abs_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_add_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_add_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-
-#define _mm_add_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_add_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm_maskz_add_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_add_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_add_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-#define _mm_add_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_add_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_add_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_add_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_add_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_add_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_add_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_add_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
-                                    (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_add_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
-                                   (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_add_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
-                                   (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_add_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
-                                   (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_add_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
-                                  (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_add_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
-                                  (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_sub_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_sub_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-#define _mm_sub_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sub_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm_maskz_sub_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_sub_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_sub_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-
-#define _mm_sub_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sub_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_sub_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_sub_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_sub_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_sub_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_sub_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_sub_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
-                                    (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
-                                   (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_sub_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
-                                   (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_sub_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
-                                   (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
-                                  (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_sub_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
-                                  (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_mul_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_mul_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-#define _mm_mul_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_mul_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm_maskz_mul_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_mul_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_mul_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-
-#define _mm_mul_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_mul_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_mul_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_mul_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_mul_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_mul_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_mul_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_mul_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
-                                    (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
-                                   (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_mul_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
-                                   (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_mul_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
-                                  (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
-                                  (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_mul_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
-                                  (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_div_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  __A = _mm_div_ss(__A, __B);
-  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-
-#define _mm_div_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_div_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm_maskz_div_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_div_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  __A = _mm_div_sd(__A, __B);
-  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-
-#define _mm_div_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_div_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_div_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_div_pd(__m512d __a, __m512d __b)
-{
-  return (__m512d)((__v8df)__a/(__v8df)__b);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_div_pd(__A, __B),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_div_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_div_ps(__m512 __a, __m512 __b)
-{
-  return (__m512)((__v16sf)__a/(__v16sf)__b);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_div_ps(__A, __B),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_div_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_div_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
-                                    (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_div_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_div_round_pd((A), (B), (R)), \
-                                   (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_div_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_div_round_pd((A), (B), (R)), \
-                                   (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_div_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
-                                   (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_div_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
-                                  (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_div_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
-                                  (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_roundscale_ps(A, B) \
-  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
-                                          (__v16sf)_mm512_undefined_ps(), \
-                                          (__mmask16)-1, \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_ps(A, B, C, imm) \
-  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
-                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
-                                         _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_roundscale_ps(A, B, imm) \
-  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(A), \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
-  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
-                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
-                                         (int)(R)))
-
-#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
-  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
-                                          (__v16sf)_mm512_setzero_ps(), \
-                                          (__mmask16)(A), (int)(R)))
-
-#define _mm512_roundscale_round_ps(A, imm, R) \
-  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
-                                          (__v16sf)_mm512_undefined_ps(), \
-                                          (__mmask16)-1, (int)(R)))
-
-#define _mm512_roundscale_pd(A, B) \
-  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
-                                           (__v8df)_mm512_undefined_pd(), \
-                                           (__mmask8)-1, \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_pd(A, B, C, imm) \
-  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
-                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_roundscale_pd(A, B, imm) \
-  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
-                                           (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(A), \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
-  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
-                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
-                                          (int)(R)))
-
-#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
-  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
-                                           (__v8df)_mm512_setzero_pd(), \
-                                           (__mmask8)(A), (int)(R)))
-
-#define _mm512_roundscale_round_pd(A, imm, R) \
-  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
-                                           (__v8df)_mm512_undefined_pd(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm512_fmadd_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fmsub_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            -(__v8df)(__m512d)(C), \
-                                            (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            -(__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             -(__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fnmadd_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fnmsub_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            -(__v8df)(__m512d)(C), \
-                                            (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             -(__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    -(__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    -(__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
-                                                     (__v8df) __B,
-                                                     -(__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    -(__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    -(__v8df) __B,
-                                                    -(__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
-                                                     (__v8df) __B,
-                                                     -(__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmadd_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fmsub_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           -(__v16sf)(__m512)(C), \
-                                           (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           -(__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            -(__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fnmadd_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           -(__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fnmsub_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           -(__v16sf)(__m512)(B), \
-                                           -(__v16sf)(__m512)(C), \
-                                           (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            -(__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) __U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   (__mmask16) __U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    -(__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   -(__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   -(__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   (__mmask16) -1,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    -(__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmaddsub_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               (__v8df)(__m512d)(C), \
-                                               (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               (__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
-  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
-                                                (__v8df)(__m512d)(B), \
-                                                (__v8df)(__m512d)(C), \
-                                                (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
-                                                (__v8df)(__m512d)(B), \
-                                                (__v8df)(__m512d)(C), \
-                                                (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fmsubadd_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               -(__v8df)(__m512d)(C), \
-                                               (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               -(__v8df)(__m512d)(C), \
-                                               (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
-                                                (__v8df)(__m512d)(B), \
-                                                -(__v8df)(__m512d)(C), \
-                                                (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                      (__v8df) __B,
-                                                      (__v8df) __C,
-                                                      (__mmask8) -1,
-                                                      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                      (__v8df) __B,
-                                                      (__v8df) __C,
-                                                      (__mmask8) __U,
-                                                      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       (__v8df) __C,
-                                                       (__mmask8) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       (__v8df) __C,
-                                                       (__mmask8) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       -(__v8df) __C,
-                                                       (__mmask8) -1,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       -(__v8df) __C,
-                                                       (__mmask8) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
-                                                        (__v8df) __B,
-                                                        -(__v8df) __C,
-                                                        (__mmask8) __U,
-                                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmaddsub_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16sf)(__m512)(C), \
-                                              (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
-  ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
-                                               (__v16sf)(__m512)(B), \
-                                               (__v16sf)(__m512)(C), \
-                                               (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
-                                               (__v16sf)(__m512)(B), \
-                                               (__v16sf)(__m512)(C), \
-                                               (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fmsubadd_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              -(__v16sf)(__m512)(C), \
-                                              (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              -(__v16sf)(__m512)(C), \
-                                              (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
-                                               (__v16sf)(__m512)(B), \
-                                               -(__v16sf)(__m512)(C), \
-                                               (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      (__v16sf) __C,
-                                                      (__mmask16) -1,
-                                                      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      (__v16sf) __C,
-                                                      (__mmask16) __U,
-                                                      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       (__v16sf) __C,
-                                                       (__mmask16) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       (__v16sf) __C,
-                                                       (__mmask16) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      -(__v16sf) __C,
-                                                      (__mmask16) -1,
-                                                      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      -(__v16sf) __C,
-                                                      (__mmask16) __U,
-                                                      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
-                                                       (__v16sf) __B,
-                                                       -(__v16sf) __C,
-                                                       (__mmask16) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
-  ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
-  ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) __U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
-  ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
-                                                (__v8df)(__m512d)(B), \
-                                                (__v8df)(__m512d)(C), \
-                                                (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
-                                                       (__v8df) __B,
-                                                       (__v8df) __C,
-                                                       (__mmask8) __U,
-                                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
-  ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
-                                               (__v16sf)(__m512)(B), \
-                                               (__v16sf)(__m512)(C), \
-                                               (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
-                                                      (__v16sf) __B,
-                                                      (__v16sf) __C,
-                                                      (__mmask16) __U,
-                                                      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            -(__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    -(__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           -(__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   -(__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   (__mmask16) __U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
-  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
-                                            -(__v8df)(__m512d)(B), \
-                                            -(__v8df)(__m512d)(C), \
-                                            (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
-  ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
-                                             (__v8df)(__m512d)(B), \
-                                             (__v8df)(__m512d)(C), \
-                                             (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    -(__v8df) __B,
-                                                    -(__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
-  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
-                                                     (__v8df) __B,
-                                                     (__v8df) __C,
-                                                     (__mmask8) __U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
-  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
-                                           -(__v16sf)(__m512)(B), \
-                                           -(__v16sf)(__m512)(C), \
-                                           (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
-  ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
-                                            (__v16sf)(__m512)(B), \
-                                            (__v16sf)(__m512)(C), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
-  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
-                                                   -(__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   (__mmask16) __U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
-  return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
-                                                    (__v16sf) __B,
-                                                    (__v16sf) __C,
-                                                    (__mmask16) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-
-
-/* Vector permutations */
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
-                                                (__v16si) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
-                               __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
-                              (__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
-                                __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
-                              (__v16si)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
-                              (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
-                                                (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
-                               __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
-                               (__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
-                                __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
-                               (__v8di)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
-                               (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_alignr_epi64(A, B, I) \
-  ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
-                                     (__v8di)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                  (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
-                                  (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                  (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
-                                  (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_alignr_epi32(A, B, I) \
-  ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
-                                     (__v16si)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
-                                 (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
-                                 (__v16si)_mm512_setzero_si512()))
-/* Vector Extract */
-
-#define _mm512_extractf64x4_pd(A, I) \
-  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
-                                             (__v4df)_mm256_undefined_pd(), \
-                                             (__mmask8)-1))
-
-#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
-  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
-                                             (__v4df)(__m256d)(W), \
-                                             (__mmask8)(U)))
-
-#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
-  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
-                                             (__v4df)_mm256_setzero_pd(), \
-                                             (__mmask8)(U)))
-
-#define _mm512_extractf32x4_ps(A, I) \
-  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
-                                            (__v4sf)_mm_undefined_ps(), \
-                                            (__mmask8)-1))
-
-#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
-  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
-                                            (__v4sf)(__m128)(W), \
-                                            (__mmask8)(U)))
-
-#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
-  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
-                                            (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)(U)))
-
-/* Vector Blend */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
-{
-  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
-                 (__v8df) __W,
-                 (__v8df) __A);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
-{
-  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
-                (__v16sf) __W,
-                (__v16sf) __A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
-{
-  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
-                (__v8di) __W,
-                (__v8di) __A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
-{
-  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
-                (__v16si) __W,
-                (__v16si) __A);
-}
-
-/* Compare */
-
-#define _mm512_cmp_round_ps_mask(A, B, P, R) \
-  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), (int)(P), \
-                                           (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
-  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), (int)(P), \
-                                           (__mmask16)(U), (int)(R)))
-
-#define _mm512_cmp_ps_mask(A, B, P) \
-  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
-  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_cmpeq_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
-#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
-
-#define _mm512_cmplt_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
-#define _mm512_mask_cmplt_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
-
-#define _mm512_cmple_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
-#define _mm512_mask_cmple_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
-
-#define _mm512_cmpunord_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
-#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
-
-#define _mm512_cmpneq_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
-#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
-
-#define _mm512_cmpnlt_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
-#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
-
-#define _mm512_cmpnle_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
-#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
-
-#define _mm512_cmpord_ps_mask(A, B) \
-    _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
-#define _mm512_mask_cmpord_ps_mask(k, A, B) \
-    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
-
-#define _mm512_cmp_round_pd_mask(A, B, P, R) \
-  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), (int)(P), \
-                                          (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
-  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), (int)(P), \
-                                          (__mmask8)(U), (int)(R)))
-
-#define _mm512_cmp_pd_mask(A, B, P) \
-  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
-  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_cmpeq_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
-#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
-
-#define _mm512_cmplt_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
-#define _mm512_mask_cmplt_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
-
-#define _mm512_cmple_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
-#define _mm512_mask_cmple_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
-
-#define _mm512_cmpunord_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
-#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
-
-#define _mm512_cmpneq_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
-#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
-
-#define _mm512_cmpnlt_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
-#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
-
-#define _mm512_cmpnle_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
-#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
-
-#define _mm512_cmpord_pd_mask(A, B) \
-    _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
-#define _mm512_mask_cmpord_pd_mask(k, A, B) \
-    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
-
-/* Conversion */
-
-#define _mm512_cvtt_roundps_epu32(A, R) \
-  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
-                                              (__v16si)_mm512_undefined_epi32(), \
-                                              (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
-                                              (__v16si)(__m512i)(W), \
-                                              (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
-                                              (__v16si)_mm512_setzero_si512(), \
-                                              (__mmask16)(U), (int)(R)))
-
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epu32(__m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
-                  (__v16si)
-                  _mm512_setzero_si512 (),
-                  (__mmask16) -1,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
-                   (__v16si) __W,
-                   (__mmask16) __U,
-                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
-                   (__v16si) _mm512_setzero_si512 (),
-                   (__mmask16) __U,
-                   _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi32_ps(A, R) \
-  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
-  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
-                                           (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
-  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), (int)(R)))
-
-#define _mm512_cvt_roundepu32_ps(A, R) \
-  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
-                                            (__v16sf)_mm512_setzero_ps(), \
-                                            (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
-  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
-                                            (__v16sf)(__m512)(W), \
-                                            (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
-  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
-                                            (__v16sf)_mm512_setzero_ps(), \
-                                            (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_ps (__m512i __A)
-{
-  return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepu32_ps(__A),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepu32_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_pd(__m256i __A)
-{
-  return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepi32_pd(__A),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepi32_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32lo_pd(__m512i __A)
-{
-  return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
-{
-  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_ps (__m512i __A)
-{
-  return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepi32_ps(__A),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepi32_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_pd(__m256i __A)
-{
-  return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepu32_pd(__A),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepu32_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32lo_pd(__m512i __A)
-{
-  return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
-{
-  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
-}
-
-#define _mm512_cvt_roundpd_ps(A, R) \
-  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
-                                           (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
-  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
-                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
-                                           (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
-  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
-                                           (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_ps (__m512d __A)
-{
-  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
-                (__v8sf) _mm256_undefined_ps (),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
-{
-  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
-                (__v8sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
-{
-  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
-                (__v8sf) _mm256_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_pslo (__m512d __A)
-{
-  return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
-                (__v8sf) _mm256_setzero_ps (),
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
-{
-  return (__m512) __builtin_shufflevector (
-                (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
-                                               __U, __A),
-                (__v8sf) _mm256_setzero_ps (),
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-#define _mm512_cvt_roundps_ph(A, I) \
-  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
-                                             (__v16hi)_mm256_undefined_si256(), \
-                                             (__mmask16)-1))
-
-#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
-  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
-                                             (__v16hi)(__m256i)(U), \
-                                             (__mmask16)(W)))
-
-#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
-  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
-                                             (__v16hi)_mm256_setzero_si256(), \
-                                             (__mmask16)(W)))
-
-#define _mm512_cvtps_ph       _mm512_cvt_roundps_ph
-#define _mm512_mask_cvtps_ph  _mm512_mask_cvt_roundps_ph
-#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
-
-#define _mm512_cvt_roundph_ps(A, R) \
-  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
-                                            (__v16sf)_mm512_undefined_ps(), \
-                                            (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
-  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
-                                            (__v16sf)(__m512)(W), \
-                                            (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
-  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
-                                            (__v16sf)_mm512_setzero_ps(), \
-                                            (__mmask16)(U), (int)(R)))
-
-
-static  __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtph_ps(__m256i __A)
-{
-  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
-                (__v16sf)
-                _mm512_setzero_ps (),
-                (__mmask16) -1,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
-{
-  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
-                 (__v16sf) __W,
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
-{
-  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
-                 (__v16sf) _mm512_setzero_ps (),
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundpd_epi32(A, R) \
-  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8si)_mm256_setzero_si256(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
-  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8si)(__m256i)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
-  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8si)_mm256_setzero_si256(), \
-                                             (__mmask8)(U), (int)(R)))
-
-static __inline __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epi32(__m512d __a)
-{
-  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
-                                                   (__v8si)_mm256_setzero_si256(),
-                                                   (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
-                  (__v8si) __W,
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
-                  (__v8si) _mm256_setzero_si256 (),
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundps_epi32(A, R) \
-  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
-                                             (__v16si)_mm512_setzero_si512(), \
-                                             (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
-                                             (__v16si)(__m512i)(W), \
-                                             (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
-  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
-                                             (__v16si)_mm512_setzero_si512(), \
-                                             (__mmask16)(U), (int)(R)))
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epi32(__m512 __a)
-{
-  return (__m512i)
-    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
-                                     (__v16si) _mm512_setzero_si512 (),
-                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
-                  (__v16si) __W,
-                  (__mmask16) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
-                  (__v16si) _mm512_setzero_si512 (),
-                  (__mmask16) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epi32(A, R) \
-  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
-                                            (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
-                                            (__v16si)(__m512i)(W), \
-                                            (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
-                                            (__v16si)_mm512_setzero_si512(), \
-                                            (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epi32 (__m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
-                 (__v16si) _mm512_undefined_epi32 (),
-                 (__mmask16) -1,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
-                 (__v16si) __W,
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
-                 (__v16si)
-                 _mm512_setzero_si512 (),
-                 (__mmask16) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epi32(A, R) \
-  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
-                                            (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
-  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
-                                            (__v8si)(__m256i)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
-  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
-                                            (__v8si)_mm256_setzero_si256(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epi32 (__m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
-                 (__v8si)
-                 _mm256_undefined_si256 (),
-                 (__mmask8) -1,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
-                 (__v8si) __W,
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epu32(A, R) \
-  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
-                                             (__v16si)_mm512_setzero_si512(), \
-                                             (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
-                                             (__v16si)(__m512i)(W), \
-                                             (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
-  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
-                                             (__v16si)_mm512_setzero_si512(), \
-                                             (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epu32 ( __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
-                  (__v16si)\
-                  _mm512_undefined_epi32 (),
-                  (__mmask16) -1,\
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
-                  (__v16si) __W,
-                  (__mmask16) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
-{
-  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
-                  (__v16si)
-                  _mm512_setzero_si512 (),
-                  (__mmask16) __U ,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epu32(A, R) \
-  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8si)_mm256_setzero_si256(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
-  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8si)(__m256i)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
-  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
-                                             (__v8si)_mm256_setzero_si256(), \
-                                             (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epu32 (__m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
-                  (__v8si)
-                  _mm256_undefined_si256 (),
-                  (__mmask8) -1,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
-                  (__v8si) __W,
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
-                  (__v8si)
-                  _mm256_setzero_si256 (),
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_cvtsd_f64(__m512d __a)
-{
-  return __a[0];
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_cvtss_f32(__m512 __a)
-{
-  return __a[0];
-}
-
-/* Unpack and Interleave */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_pd(__m512d __a, __m512d __b)
-{
-  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
-                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
-                                           (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
-                                           (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_pd(__m512d __a, __m512d __b)
-{
-  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
-                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
-                                           (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
-                                           (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_ps(__m512 __a, __m512 __b)
-{
-  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
-                                         2,    18,    3,    19,
-                                         2+4,  18+4,  3+4,  19+4,
-                                         2+8,  18+8,  3+8,  19+8,
-                                         2+12, 18+12, 3+12, 19+12);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
-                                          (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
-                                          (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_ps(__m512 __a, __m512 __b)
-{
-  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
-                                         0,    16,    1,    17,
-                                         0+4,  16+4,  1+4,  17+4,
-                                         0+8,  16+8,  1+8,  17+8,
-                                         0+12, 16+12, 1+12, 17+12);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
-                                          (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
-                                          (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
-                                          2,    18,    3,    19,
-                                          2+4,  18+4,  3+4,  19+4,
-                                          2+8,  18+8,  3+8,  19+8,
-                                          2+12, 18+12, 3+12, 19+12);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
-                                       (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
-                                       (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
-                                          0,    16,    1,    17,
-                                          0+4,  16+4,  1+4,  17+4,
-                                          0+8,  16+8,  1+8,  17+8,
-                                          0+12, 16+12, 1+12, 17+12);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
-                                       (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
-                                       (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
-                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
-                                        (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
-                                        (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
-                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
-                                        (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
-                                        (__v8di)_mm512_setzero_si512());
-}
-
-
-/* SIMD load ops */
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_si512 (void const *__P)
-{
-  struct __loadu_si512 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_si512*)__P)->__v;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi32 (void const *__P)
-{
-  struct __loadu_epi32 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi32*)__P)->__v;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
-                  (__v16si) __W,
-                  (__mmask16) __U);
-}
-
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
-                                                     (__v16si)
-                                                     _mm512_setzero_si512 (),
-                                                     (__mmask16) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi64 (void const *__P)
-{
-  struct __loadu_epi64 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi64*)__P)->__v;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
-                  (__v8di) __W,
-                  (__mmask8) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
-                                                     (__v8di)
-                                                     _mm512_setzero_si512 (),
-                                                     (__mmask8) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
-{
-  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
-                   (__v16sf) __W,
-                   (__mmask16) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
-{
-  return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
-                                                  (__v16sf)
-                                                  _mm512_setzero_ps (),
-                                                  (__mmask16) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
-{
-  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
-                (__v8df) __W,
-                (__mmask8) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
-{
-  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
-                                                   (__v8df)
-                                                   _mm512_setzero_pd (),
-                                                   (__mmask8) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_loadu_pd(void const *__p)
-{
-  struct __loadu_pd {
-    __m512d_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_pd*)__p)->__v;
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_loadu_ps(void const *__p)
-{
-  struct __loadu_ps {
-    __m512_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_ps*)__p)->__v;
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_load_ps(void const *__p)
-{
-  return *(const __m512*)__p;
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
-{
-  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
-                   (__v16sf) __W,
-                   (__mmask16) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
-{
-  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
-                                                  (__v16sf)
-                                                  _mm512_setzero_ps (),
-                                                  (__mmask16) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_load_pd(void const *__p)
-{
-  return *(const __m512d*)__p;
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
-{
-  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
-                          (__v8df) __W,
-                          (__mmask8) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
-{
-  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
-                                                   (__v8df)
-                                                   _mm512_setzero_pd (),
-                                                   (__mmask8) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_load_si512 (void const *__P)
-{
-  return *(const __m512i *) __P;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_load_epi32 (void const *__P)
-{
-  return *(const __m512i *) __P;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_load_epi64 (void const *__P)
-{
-  return *(const __m512i *) __P;
-}
-
-/* SIMD store ops */
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi64 (void *__P, __m512i __A)
-{
-  struct __storeu_epi64 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi64*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
-{
-  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
-                                     (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_si512 (void *__P, __m512i __A)
-{
-  struct __storeu_si512 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_si512*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi32 (void *__P, __m512i __A)
-{
-  struct __storeu_epi32 {
-    __m512i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi32*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
-{
-  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
-                                     (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
-{
-  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_pd(void *__P, __m512d __A)
-{
-  struct __storeu_pd {
-    __m512d_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_pd*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
-{
-  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
-                                   (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_ps(void *__P, __m512 __A)
-{
-  struct __storeu_ps {
-    __m512_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_ps*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
-{
-  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_pd(void *__P, __m512d __A)
-{
-  *(__m512d*)__P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
-{
-  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
-                                   (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_ps(void *__P, __m512 __A)
-{
-  *(__m512*)__P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_si512 (void *__P, __m512i __A)
-{
-  *(__m512i *) __P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_epi32 (void *__P, __m512i __A)
-{
-  *(__m512i *) __P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_epi64 (void *__P, __m512i __A)
-{
-  *(__m512i *) __P = __A;
-}
-
-/* Mask ops */
-
-static __inline __mmask16 __DEFAULT_FN_ATTRS
-_mm512_knot(__mmask16 __M)
-{
-  return __builtin_ia32_knothi(__M);
-}
-
-/* Integer compare */
-
-#define _mm512_cmpeq_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi32(__m128i __A)
-{
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi8_epi32(__A),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi8_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi64(__m128i __A)
-{
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi8_epi64(__A),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi8_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_epi64(__m256i __X)
-{
-  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi32_epi64(__X),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi32_epi64(__X),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_epi32(__m256i __A)
-{
-  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_cvtepi16_epi32(__A),
-                                            (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_cvtepi16_epi32(__A),
-                                            (__v16si)_mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_epi64(__m128i __A)
-{
-  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi16_epi64(__A),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi16_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu8_epi32(__m128i __A)
-{
-  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepu8_epi32(__A),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepu8_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu8_epi64(__m128i __A)
-{
-  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu8_epi64(__A),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu8_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_epi64(__m256i __X)
-{
-  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu32_epi64(__X),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu32_epi64(__X),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu16_epi32(__m256i __A)
-{
-  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_cvtepu16_epi32(__A),
-                                            (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_cvtepu16_epi32(__A),
-                                            (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu16_epi64(__m128i __A)
-{
-  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu16_epi64(__A),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu16_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rorv_epi32 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                           (__v16si)_mm512_rorv_epi32(__A, __B),
-                                           (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                           (__v16si)_mm512_rorv_epi32(__A, __B),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rorv_epi64 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                            (__v8di)_mm512_rorv_epi64(__A, __B),
-                                            (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                            (__v8di)_mm512_rorv_epi64(__A, __B),
-                                            (__v8di)_mm512_setzero_si512());
-}
-
-
-
-#define _mm512_cmp_epi32_mask(a, b, p) \
-  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
-                                          (__v16si)(__m512i)(b), (int)(p), \
-                                          (__mmask16)-1))
-
-#define _mm512_cmp_epu32_mask(a, b, p) \
-  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
-                                           (__v16si)(__m512i)(b), (int)(p), \
-                                           (__mmask16)-1))
-
-#define _mm512_cmp_epi64_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
-                                         (__v8di)(__m512i)(b), (int)(p), \
-                                         (__mmask8)-1))
-
-#define _mm512_cmp_epu64_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
-                                          (__v8di)(__m512i)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
-  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
-                                          (__v16si)(__m512i)(b), (int)(p), \
-                                          (__mmask16)(m)))
-
-#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
-  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
-                                           (__v16si)(__m512i)(b), (int)(p), \
-                                           (__mmask16)(m)))
-
-#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
-                                         (__v8di)(__m512i)(b), (int)(p), \
-                                         (__mmask8)(m)))
-
-#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
-                                          (__v8di)(__m512i)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm512_rol_epi32(a, b) \
-  ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))
-
-#define _mm512_mask_rol_epi32(W, U, a, b) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_rol_epi32((a), (b)), \
-                                       (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_rol_epi32(U, a, b) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_rol_epi32((a), (b)), \
-                                       (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_rol_epi64(a, b) \
-  ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))
-
-#define _mm512_mask_rol_epi64(W, U, a, b) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_rol_epi64((a), (b)), \
-                                       (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_rol_epi64(U, a, b) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_rol_epi64((a), (b)), \
-                                       (__v8di)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rolv_epi32 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                           (__v16si)_mm512_rolv_epi32(__A, __B),
-                                           (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                           (__v16si)_mm512_rolv_epi32(__A, __B),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rolv_epi64 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                            (__v8di)_mm512_rolv_epi64(__A, __B),
-                                            (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                            (__v8di)_mm512_rolv_epi64(__A, __B),
-                                            (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_ror_epi32(A, B) \
-  ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))
-
-#define _mm512_mask_ror_epi32(W, U, A, B) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_ror_epi32((A), (B)), \
-                                       (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_ror_epi32(U, A, B) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_ror_epi32((A), (B)), \
-                                       (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_ror_epi64(A, B) \
-  ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))
-
-#define _mm512_mask_ror_epi64(W, U, A, B) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_ror_epi64((A), (B)), \
-                                       (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_ror_epi64(U, A, B) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_ror_epi64((A), (B)), \
-                                       (__v8di)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi32(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                       unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                         (__v16si)_mm512_slli_epi32(__A, __B),
-                                         (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                         (__v16si)_mm512_slli_epi32(__A, __B),
-                                         (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi64(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                          (__v8di)_mm512_slli_epi64(__A, __B),
-                                          (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                          (__v8di)_mm512_slli_epi64(__A, __B),
-                                          (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi32(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                       unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                         (__v16si)_mm512_srli_epi32(__A, __B),
-                                         (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                         (__v16si)_mm512_srli_epi32(__A, __B),
-                                         (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi64(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-                       unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                          (__v8di)_mm512_srli_epi64(__A, __B),
-                                          (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
-                        unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                          (__v8di)_mm512_srli_epi64(__A, __B),
-                                          (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
-              (__v16si) __W,
-              (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
-              (__v16si)
-              _mm512_setzero_si512 (),
-              (__mmask16) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
-{
-  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
-          (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
-                 (__v16si) __A,
-                 (__v16si) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
-                 (__v16si) __A,
-                 (__v16si) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
-                 (__v8di) __A,
-                 (__v8di) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
-                 (__v8di) __A,
-                 (__v8di) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
-              (__v8di) __W,
-              (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
-              (__v8di)
-              _mm512_setzero_si512 (),
-              (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
-{
-  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
-          (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_movedup_pd (__m512d __A)
-{
-  return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
-                                          0, 0, 2, 2, 4, 4, 6, 6);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_movedup_pd(__A),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_movedup_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              (__v8di)(__m512i)(C), (int)(imm), \
-                                              (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              (__v8di)(__m512i)(C), (int)(imm), \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm512_fixupimm_pd(A, B, C, imm) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              (__v8di)(__m512i)(C), (int)(imm), \
-                                              (__mmask8)-1, \
-                                              _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
-                                              (__v8df)(__m512d)(B), \
-                                              (__v8di)(__m512i)(C), (int)(imm), \
-                                              (__mmask8)(U), \
-                                              _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               (__v8di)(__m512i)(C), \
-                                               (int)(imm), (__mmask8)(U), \
-                                               (int)(R)))
-
-#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
-                                               (__v8df)(__m512d)(B), \
-                                               (__v8di)(__m512i)(C), \
-                                               (int)(imm), (__mmask8)(U), \
-                                               _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
-  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                             (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
-  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                             (__mmask16)(U), (int)(R)))
-
-#define _mm512_fixupimm_ps(A, B, C, imm) \
-  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                             (__mmask16)-1, \
-                                             _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
-  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
-                                             (__v16sf)(__m512)(B), \
-                                             (__v16si)(__m512i)(C), (int)(imm), \
-                                             (__mmask16)(U), \
-                                             _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
-  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16si)(__m512i)(C), \
-                                              (int)(imm), (__mmask16)(U), \
-                                              (int)(R)))
-
-#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
-  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
-                                              (__v16sf)(__m512)(B), \
-                                              (__v16si)(__m512i)(C), \
-                                              (int)(imm), (__mmask16)(U), \
-                                              _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
-  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), \
-                                           (__v2di)(__m128i)(C), (int)(imm), \
-                                           (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
-  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), \
-                                           (__v2di)(__m128i)(C), (int)(imm), \
-                                           (__mmask8)(U), (int)(R)))
-
-#define _mm_fixupimm_sd(A, B, C, imm) \
-  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), \
-                                           (__v2di)(__m128i)(C), (int)(imm), \
-                                           (__mmask8)-1, \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
-  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), \
-                                           (__v2di)(__m128i)(C), (int)(imm), \
-                                           (__mmask8)(U), \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
-  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2di)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
-  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
-                                            (__v2df)(__m128d)(B), \
-                                            (__v2di)(__m128i)(C), (int)(imm), \
-                                            (__mmask8)(U), \
-                                            _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
-  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), \
-                                          (__v4si)(__m128i)(C), (int)(imm), \
-                                          (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
-  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), \
-                                          (__v4si)(__m128i)(C), (int)(imm), \
-                                          (__mmask8)(U), (int)(R)))
-
-#define _mm_fixupimm_ss(A, B, C, imm) \
-  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), \
-                                          (__v4si)(__m128i)(C), (int)(imm), \
-                                          (__mmask8)-1, \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
-  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), \
-                                          (__v4si)(__m128i)(C), (int)(imm), \
-                                          (__mmask8)(U), \
-                                          _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
-  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4si)(__m128i)(C), (int)(imm), \
-                                           (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
-  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
-                                           (__v4sf)(__m128)(B), \
-                                           (__v4si)(__m128i)(C), (int)(imm), \
-                                           (__mmask8)(U), \
-                                           _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_getexp_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
-                                                  (__v2df)(__m128d)(B), \
-                                                  (__v2df)_mm_setzero_pd(), \
-                                                  (__mmask8)-1, (int)(R)))
-
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_getexp_sd (__m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
-                 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
-                                                  (__v2df)(__m128d)(B), \
-                                                  (__v2df)(__m128d)(W), \
-                                                  (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
-          (__v2df) __B,
-          (__v2df) _mm_setzero_pd (),
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_getexp_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
-                                                  (__v2df)(__m128d)(B), \
-                                                  (__v2df)_mm_setzero_pd(), \
-                                                  (__mmask8)(U), (int)(R)))
-
-#define _mm_getexp_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
-                                                 (__v4sf)(__m128)(B), \
-                                                 (__v4sf)_mm_setzero_ps(), \
-                                                 (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_getexp_ss (__m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
-                (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) __W,
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
-                                                 (__v4sf)(__m128)(B), \
-                                                 (__v4sf)(__m128)(W), \
-                                                 (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
-          (__v4sf) __B,
-          (__v4sf) _mm_setzero_ps (),
-          (__mmask8) __U,
-          _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_getexp_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
-                                                 (__v4sf)(__m128)(B), \
-                                                 (__v4sf)_mm_setzero_ps(), \
-                                                 (__mmask8)(U), (int)(R)))
-
-#define _mm_getmant_round_sd(A, B, C, D, R) \
-  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (int)(((D)<<2) | (C)), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)-1, (int)(R)))
-
-#define _mm_getmant_sd(A, B, C, D)  \
-  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (int)(((D)<<2) | (C)), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)-1, \
-                                                _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
-  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (int)(((D)<<2) | (C)), \
-                                                (__v2df)(__m128d)(W), \
-                                                (__mmask8)(U), \
-                                                _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
-  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (int)(((D)<<2) | (C)), \
-                                                (__v2df)(__m128d)(W), \
-                                                (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_getmant_sd(U, A, B, C, D) \
-  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (int)(((D)<<2) | (C)), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)(U), \
-                                                _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
-  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
-                                                (__v2df)(__m128d)(B), \
-                                                (int)(((D)<<2) | (C)), \
-                                                (__v2df)_mm_setzero_pd(), \
-                                                (__mmask8)(U), (int)(R)))
-
-#define _mm_getmant_round_ss(A, B, C, D, R) \
-  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (int)(((D)<<2) | (C)), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)-1, (int)(R)))
-
-#define _mm_getmant_ss(A, B, C, D) \
-  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (int)(((D)<<2) | (C)), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)-1, \
-                                               _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
-  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (int)(((D)<<2) | (C)), \
-                                               (__v4sf)(__m128)(W), \
-                                               (__mmask8)(U), \
-                                               _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
-  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (int)(((D)<<2) | (C)), \
-                                               (__v4sf)(__m128)(W), \
-                                               (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_getmant_ss(U, A, B, C, D) \
-  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (int)(((D)<<2) | (C)), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)(U), \
-                                               _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
-  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (int)(((D)<<2) | (C)), \
-                                               (__v4sf)_mm_setzero_ps(), \
-                                               (__mmask8)(U), (int)(R)))
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kmov (__mmask16 __A)
-{
-  return  __A;
-}
-
-#define _mm_comi_round_sd(A, B, P, R) \
-  ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
-                               (int)(P), (int)(R)))
-
-#define _mm_comi_round_ss(A, B, P, R) \
-  ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
-                               (int)(P), (int)(R)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsd_si64(A, R) \
-  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
-#endif
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sll_epi32(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                          (__v16si)_mm512_sll_epi32(__A, __B),
-                                          (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                          (__v16si)_mm512_sll_epi32(__A, __B),
-                                          (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sll_epi64(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_sll_epi64(__A, __B),
-                                             (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                           (__v8di)_mm512_sll_epi64(__A, __B),
-                                           (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sllv_epi32(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
-                                           (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sllv_epi64(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
-                                            (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
-                                            (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sra_epi32(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                          (__v16si)_mm512_sra_epi32(__A, __B),
-                                          (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                          (__v16si)_mm512_sra_epi32(__A, __B),
-                                          (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sra_epi64(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                           (__v8di)_mm512_sra_epi64(__A, __B),
-                                           (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                           (__v8di)_mm512_sra_epi64(__A, __B),
-                                           (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srav_epi32(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_srav_epi32(__X, __Y),
-                                           (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_srav_epi32(__X, __Y),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srav_epi64(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                            (__v8di)_mm512_srav_epi64(__X, __Y),
-                                            (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                            (__v8di)_mm512_srav_epi64(__X, __Y),
-                                            (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srl_epi32(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                          (__v16si)_mm512_srl_epi32(__A, __B),
-                                          (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                          (__v16si)_mm512_srl_epi32(__A, __B),
-                                          (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srl_epi64(__m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                           (__v8di)_mm512_srl_epi64(__A, __B),
-                                           (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                           (__v8di)_mm512_srl_epi64(__A, __B),
-                                           (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srlv_epi32(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
-                                           (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
-                                            (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
-                                            (__v8di)_mm512_setzero_si512());
-}
-
-/// \enum _MM_TERNLOG_ENUM
-///    A helper to represent the ternary logic operations among vector \a A,
-///    \a B and \a C. The representation is passed to \a imm.
-typedef enum {
-  _MM_TERNLOG_A = 0xF0,
-  _MM_TERNLOG_B = 0xCC,
-  _MM_TERNLOG_C = 0xAA
-} _MM_TERNLOG_ENUM;
-
-#define _mm512_ternarylogic_epi32(A, B, C, imm)                                \
-  ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
-      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
-      (unsigned char)(imm), (__mmask16)-1))
-
-#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm)                        \
-  ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
-      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
-      (unsigned char)(imm), (__mmask16)(U)))
-
-#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm)                       \
-  ((__m512i)__builtin_ia32_pternlogd512_maskz(                                 \
-      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
-      (unsigned char)(imm), (__mmask16)(U)))
-
-#define _mm512_ternarylogic_epi64(A, B, C, imm)                                \
-  ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
-      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
-      (unsigned char)(imm), (__mmask8)-1))
-
-#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm)                        \
-  ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
-      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm)                       \
-  ((__m512i)__builtin_ia32_pternlogq512_maskz(                                 \
-      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsd_i64(A, R) \
-  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
-#endif
-
-#define _mm_cvt_roundsd_si32(A, R) \
-  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvt_roundsd_i32(A, R) \
-  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvt_roundsd_u32(A, R) \
-  ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvtsd_u32 (__m128d __A)
-{
-  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
-             _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsd_u64(A, R) \
-  ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
-                                                   (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvtsd_u64 (__m128d __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
-                 __A,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvt_roundss_si32(A, R) \
-  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvt_roundss_i32(A, R) \
-  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundss_si64(A, R) \
-  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvt_roundss_i64(A, R) \
-  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
-#endif
-
-#define _mm_cvt_roundss_u32(A, R) \
-  ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvtss_u32 (__m128 __A)
-{
-  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
-             _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundss_u64(A, R) \
-  ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
-                                                   (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvtss_u64 (__m128 __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
-                 __A,
-                 _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundsd_i32(A, R) \
-  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvtt_roundsd_si32(A, R) \
-  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128
-_mm_cvttsd_i32 (__m128d __A)
-{
-  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
-              _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsd_si64(A, R) \
-  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvtt_roundsd_i64(A, R) \
-  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128
-_mm_cvttsd_i64 (__m128d __A)
-{
-  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
-              _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundsd_u32(A, R) \
-  ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvttsd_u32 (__m128d __A)
-{
-  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
-              _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsd_u64(A, R) \
-  ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
-                                                    (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvttsd_u64 (__m128d __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
-                  __A,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundss_i32(A, R) \
-  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvtt_roundss_si32(A, R) \
-  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128
-_mm_cvttss_i32 (__m128 __A)
-{
-  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
-              _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundss_i64(A, R) \
-  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvtt_roundss_si64(A, R) \
-  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128
-_mm_cvttss_i64 (__m128 __A)
-{
-  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
-              _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundss_u32(A, R) \
-  ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvttss_u32 (__m128 __A)
-{
-  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
-              _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundss_u64(A, R) \
-  ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
-                                                    (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvttss_u64 (__m128 __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
-                  __A,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm512_permute_pd(X, C) \
-  ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))
-
-#define _mm512_mask_permute_pd(W, U, X, C) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_permute_pd((X), (C)), \
-                                        (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_permute_pd(U, X, C) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_permute_pd((X), (C)), \
-                                        (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_permute_ps(X, C) \
-  ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))
-
-#define _mm512_mask_permute_ps(W, U, X, C) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_permute_ps((X), (C)), \
-                                       (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_permute_ps(U, X, C) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_permute_ps((X), (C)), \
-                                       (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutevar_pd(__m512d __A, __m512i __C)
-{
-  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                         (__v8df)_mm512_permutevar_pd(__A, __C),
-                                         (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                         (__v8df)_mm512_permutevar_pd(__A, __C),
-                                         (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutevar_ps(__m512 __A, __m512i __C)
-{
-  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
-                                        (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
-                                        (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
-                                                 (__v8df)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
-                                  (__v8df)__A);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
-                             __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
-                                  (__v8df)(__m512d)__I);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
-                             __m512d __B)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
-                                  (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
-{
-  return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
-                                                (__v16sf) __B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
-                                 (__v16sf)__A);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
-                                 (__v16sf)(__m512)__I);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
-{
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
-                                 (__v16sf)_mm512_setzero_ps());
-}
-
-
-#define _mm512_cvtt_roundpd_epu32(A, R) \
-  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
-                                              (__v8si)_mm256_undefined_si256(), \
-                                              (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
-  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
-                                              (__v8si)(__m256i)(W), \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
-  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
-                                              (__v8si)_mm256_setzero_si256(), \
-                                              (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epu32 (__m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
-                  (__v8si)
-                  _mm256_undefined_si256 (),
-                  (__mmask8) -1,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
-                  (__v8si) __W,
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
-{
-  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
-                  (__v8si)
-                  _mm256_setzero_si256 (),
-                  (__mmask8) __U,
-                  _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_roundscale_round_sd(A, B, imm, R) \
-  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)-1, (int)(imm), \
-                                                 (int)(R)))
-
-#define _mm_roundscale_sd(A, B, imm) \
-  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)-1, (int)(imm), \
-                                                 _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
-  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)(__m128d)(W), \
-                                                 (__mmask8)(U), (int)(imm), \
-                                                 _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
-  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)(__m128d)(W), \
-                                                 (__mmask8)(U), (int)(I), \
-                                                 (int)(R)))
-
-#define _mm_maskz_roundscale_sd(U, A, B, I) \
-  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)(U), (int)(I), \
-                                                 _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
-  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
-                                                 (__v2df)(__m128d)(B), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)(U), (int)(I), \
-                                                 (int)(R)))
-
-#define _mm_roundscale_round_ss(A, B, imm, R) \
-  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)-1, (int)(imm), \
-                                                (int)(R)))
-
-#define _mm_roundscale_ss(A, B, imm) \
-  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)-1, (int)(imm), \
-                                                _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_ss(W, U, A, B, I) \
-  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)(__m128)(W), \
-                                                (__mmask8)(U), (int)(I), \
-                                                _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
-  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)(__m128)(W), \
-                                                (__mmask8)(U), (int)(I), \
-                                                (int)(R)))
-
-#define _mm_maskz_roundscale_ss(U, A, B, I) \
-  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)(U), (int)(I), \
-                                                _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
-  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
-                                                (__v4sf)(__m128)(B), \
-                                                (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)(U), (int)(I), \
-                                                (int)(R)))
-
-#define _mm512_scalef_round_pd(A, B, R) \
-  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)_mm512_undefined_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(B), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_scalef_pd (__m512d __A, __m512d __B)
-{
-  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
-                (__v8df) __B,
-                (__v8df)
-                _mm512_undefined_pd (),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
-                (__v8df) __B,
-                (__v8df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
-  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
-                (__v8df) __B,
-                (__v8df)
-                _mm512_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_scalef_round_ps(A, B, R) \
-  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)_mm512_undefined_ps(), \
-                                           (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(B), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_scalef_ps (__m512 __A, __m512 __B)
-{
-  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
-               (__v16sf) __B,
-               (__v16sf)
-               _mm512_undefined_ps (),
-               (__mmask16) -1,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
-               (__v16sf) __B,
-               (__v16sf) __W,
-               (__mmask16) __U,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
-  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
-               (__v16sf) __B,
-               (__v16sf)
-               _mm512_setzero_ps (),
-               (__mmask16) __U,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_scalef_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_scalef_sd (__m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
-              (__v2df)( __B), (__v2df) _mm_setzero_pd(),
-              (__mmask8) -1,
-              _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
-                 (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (__v2df)(__m128d)(W), \
-                                               (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
-                 (__v2df) __B,
-                (__v2df) _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_scalef_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)(U), (int)(R)))
-
-#define _mm_scalef_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
-                                              (__v4sf)(__m128)(B), \
-                                              (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_scalef_ss (__m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
-             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
-             (__mmask8) -1,
-             _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
-                                              (__v4sf)(__m128)(B), \
-                                              (__v4sf)(__m128)(W), \
-                                              (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
-                 (__v4sf) __B,
-                (__v4sf) _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_scalef_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
-                                              (__v4sf)(__m128)(B), \
-                                              (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)(U), \
-                                              (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi32(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                       unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                         (__v16si)_mm512_srai_epi32(__A, __B),
-                                         (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
-                        unsigned int __B) {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                         (__v16si)_mm512_srai_epi32(__A, __B),
-                                         (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi64(__m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                          (__v8di)_mm512_srai_epi64(__A, __B),
-                                          (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                          (__v8di)_mm512_srai_epi64(__A, __B),
-                                          (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_shuffle_f32x4(A, B, imm) \
-  ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
-                                     (__v16sf)(__m512)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
-                                       (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
-                                       (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_shuffle_f64x2(A, B, imm) \
-  ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
-                                      (__v8df)(__m512d)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
-                                        (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
-                                        (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_shuffle_i32x4(A, B, imm) \
-  ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
-                                      (__v16si)(__m512i)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
-                                       (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
-                                       (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_shuffle_i64x2(A, B, imm) \
-  ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
-                                      (__v8di)(__m512i)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
-                                       (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
-                                       (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_shuffle_pd(A, B, M) \
-  ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
-                                     (__v8df)(__m512d)(B), (int)(M)))
-
-#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
-                                        (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_shuffle_pd(U, A, B, M) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
-                                        (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_shuffle_ps(A, B, M) \
-  ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
-                                    (__v16sf)(__m512)(B), (int)(M)))
-
-#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
-                                       (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_shuffle_ps(U, A, B, M) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
-                                       (__v16sf)_mm512_setzero_ps()))
-
-#define _mm_sqrt_round_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
-                                             (__v2df)(__m128d)(B), \
-                                             (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
-                 (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
-                                             (__v2df)(__m128d)(B), \
-                                             (__v2df)(__m128d)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
-                 (__v2df) __B,
-                (__v2df) _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
-                                             (__v2df)(__m128d)(B), \
-                                             (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm_sqrt_round_ss(A, B, R) \
-  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
-                                            (__v4sf)(__m128)(B), \
-                                            (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
-                 (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
-                                            (__v4sf)(__m128)(B), \
-                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
-                                            (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
-                 (__v4sf) __B,
-                (__v4sf) _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
-                                            (__v4sf)(__m128)(B), \
-                                            (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x4(__m128 __A)
-{
-  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
-                                         0, 1, 2, 3, 0, 1, 2, 3,
-                                         0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
-                                           (__v16sf)_mm512_broadcast_f32x4(__A),
-                                           (__v16sf)__O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
-                                           (__v16sf)_mm512_broadcast_f32x4(__A),
-                                           (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f64x4(__m256d __A)
-{
-  return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
-                                          0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
-                                            (__v8df)_mm512_broadcast_f64x4(__A),
-                                            (__v8df)__O);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
-                                            (__v8df)_mm512_broadcast_f64x4(__A),
-                                            (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x4(__m128i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
-                                          0, 1, 2, 3, 0, 1, 2, 3,
-                                          0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                           (__v16si)_mm512_broadcast_i32x4(__A),
-                                           (__v16si)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                           (__v16si)_mm512_broadcast_i32x4(__A),
-                                           (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i64x4(__m256i __A)
-{
-  return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
-                                          0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                            (__v8di)_mm512_broadcast_i64x4(__A),
-                                            (__v8di)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                            (__v8di)_mm512_broadcast_i64x4(__A),
-                                            (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__M,
-                                              (__v8df) _mm512_broadcastsd_pd(__A),
-                                              (__v8df) __O);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512(__M,
-                                              (__v8df) _mm512_broadcastsd_pd(__A),
-                                              (__v8df) _mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512(__M,
-                                             (__v16sf) _mm512_broadcastss_ps(__A),
-                                             (__v16sf) __O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512(__M,
-                                             (__v16sf) _mm512_broadcastss_ps(__A),
-                                             (__v16sf) _mm512_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi32_epi8 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
-               (__v16qi) _mm_undefined_si128 (),
-               (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
-               (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
-{
-  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi32_epi16 (__m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
-               (__v16hi) _mm256_undefined_si256 (),
-               (__mmask16) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
-               (__v16hi) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
-               (__v16hi) _mm256_setzero_si256 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
-{
-  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi64_epi8 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
-               (__v16qi) _mm_undefined_si128 (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
-               (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi64_epi32 (__m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
-               (__v8si) _mm256_undefined_si256 (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
-               (__v8si) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
-               (__v8si) _mm256_setzero_si256 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi64_epi16 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
-               (__v8hi) _mm_undefined_si128 (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
-               (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
-               (__v8hi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi32_epi8 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
-                (__v16qi) _mm_undefined_si128 (),
-                (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
-                (__v16qi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
-{
-  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi32_epi16 (__m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
-                (__v16hi) _mm256_undefined_si256 (),
-                (__mmask16) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
-                (__v16hi) __O,
-                __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
-                (__v16hi) _mm256_setzero_si256 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
-{
-  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi64_epi8 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
-                (__v16qi) _mm_undefined_si128 (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
-                (__v16qi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi64_epi32 (__m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
-                (__v8si) _mm256_undefined_si256 (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
-                (__v8si) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
-                (__v8si) _mm256_setzero_si256 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi64_epi16 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
-                (__v8hi) _mm_undefined_si128 (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
-                (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
-                (__v8hi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_epi8 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
-              (__v16qi) _mm_undefined_si128 (),
-              (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
-              (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
-              (__v16qi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
-{
-  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_epi16 (__m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
-              (__v16hi) _mm256_undefined_si256 (),
-              (__mmask16) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
-              (__v16hi) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
-              (__v16hi) _mm256_setzero_si256 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
-{
-  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_epi8 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
-              (__v16qi) _mm_undefined_si128 (),
-              (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
-              (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
-              (__v16qi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_epi32 (__m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
-              (__v8si) _mm256_undefined_si256 (),
-              (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
-              (__v8si) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
-{
-  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
-              (__v8si) _mm256_setzero_si256 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_epi16 (__m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
-              (__v8hi) _mm_undefined_si128 (),
-              (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
-              (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
-              (__v8hi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
-{
-  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
-}
-
-#define _mm512_extracti32x4_epi32(A, imm) \
-  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
-                                             (__v4si)_mm_undefined_si128(), \
-                                             (__mmask8)-1))
-
-#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
-                                             (__v4si)(__m128i)(W), \
-                                             (__mmask8)(U)))
-
-#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
-                                             (__v4si)_mm_setzero_si128(), \
-                                             (__mmask8)(U)))
-
-#define _mm512_extracti64x4_epi64(A, imm) \
-  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
-                                             (__v4di)_mm256_undefined_si256(), \
-                                             (__mmask8)-1))
-
-#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
-  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
-                                             (__v4di)(__m256i)(W), \
-                                             (__mmask8)(U)))
-
-#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
-  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
-                                             (__v4di)_mm256_setzero_si256(), \
-                                             (__mmask8)(U)))
-
-#define _mm512_insertf64x4(A, B, imm) \
-  ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
-                                       (__v4df)(__m256d)(B), (int)(imm)))
-
-#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
-                                   (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_insertf64x4(U, A, B, imm) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                   (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
-                                   (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_inserti64x4(A, B, imm) \
-  ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
-                                       (__v4di)(__m256i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                   (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
-                                   (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_inserti64x4(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                   (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
-                                   (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_insertf32x4(A, B, imm) \
-  ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
-                                      (__v4sf)(__m128)(B), (int)(imm)))
-
-#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
-                                  (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_insertf32x4(U, A, B, imm) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                  (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
-                                  (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_inserti32x4(A, B, imm) \
-  ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
-                                       (__v4si)(__m128i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                  (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
-                                  (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_inserti32x4(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                  (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
-                                  (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_getmant_round_pd(A, B, C, R) \
-  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v8df)_mm512_undefined_pd(), \
-                                             (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v8df)(__m512d)(W), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
-  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v8df)_mm512_setzero_pd(), \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_getmant_pd(A, B, C) \
-  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v8df)_mm512_setzero_pd(), \
-                                             (__mmask8)-1, \
-                                             _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_getmant_pd(W, U, A, B, C) \
-  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v8df)(__m512d)(W), \
-                                             (__mmask8)(U), \
-                                             _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_getmant_pd(U, A, B, C) \
-  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v8df)_mm512_setzero_pd(), \
-                                             (__mmask8)(U), \
-                                             _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_getmant_round_ps(A, B, C, R) \
-  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v16sf)_mm512_undefined_ps(), \
-                                            (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
-  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v16sf)(__m512)(W), \
-                                            (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
-  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v16sf)_mm512_setzero_ps(), \
-                                            (__mmask16)(U), (int)(R)))
-
-#define _mm512_getmant_ps(A, B, C) \
-  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
-                                            (int)(((C)<<2)|(B)), \
-                                            (__v16sf)_mm512_undefined_ps(), \
-                                            (__mmask16)-1, \
-                                            _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_getmant_ps(W, U, A, B, C) \
-  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
-                                            (int)(((C)<<2)|(B)), \
-                                            (__v16sf)(__m512)(W), \
-                                            (__mmask16)(U), \
-                                            _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_getmant_ps(U, A, B, C) \
-  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
-                                            (int)(((C)<<2)|(B)), \
-                                            (__v16sf)_mm512_setzero_ps(), \
-                                            (__mmask16)(U), \
-                                            _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_getexp_round_pd(A, R) \
-  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)_mm512_undefined_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_getexp_round_pd(W, U, A, R) \
-  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_getexp_round_pd(U, A, R) \
-  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_getexp_pd (__m512d __A)
-{
-  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
-                (__v8df) _mm512_undefined_pd (),
-                (__mmask8) -1,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
-                (__v8df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
-                (__v8df) _mm512_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_getexp_round_ps(A, R) \
-  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)_mm512_undefined_ps(), \
-                                           (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_getexp_round_ps(W, U, A, R) \
-  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)(__m512)(W), \
-                                           (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_getexp_round_ps(U, A, R) \
-  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
-                                           (__v16sf)_mm512_setzero_ps(), \
-                                           (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_getexp_ps (__m512 __A)
-{
-  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
-               (__v16sf) _mm512_undefined_ps (),
-               (__mmask16) -1,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
-               (__v16sf) __W,
-               (__mmask16) __U,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
-               (__v16sf) _mm512_setzero_ps (),
-               (__mmask16) __U,
-               _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_i64gather_ps(index, addr, scale) \
-  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
-                                        (void const *)(addr), \
-                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
-                                        (int)(scale)))
-
-#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
-  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
-                                        (void const *)(addr), \
-                                        (__v8di)(__m512i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64gather_epi32(index, addr, scale) \
-  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
-                                         (void const *)(addr), \
-                                         (__v8di)(__m512i)(index), \
-                                         (__mmask8)-1, (int)(scale)))
-
-#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
-  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v8di)(__m512i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64gather_pd(index, addr, scale) \
-  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
-                                        (void const *)(addr), \
-                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
-                                        (int)(scale)))
-
-#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
-  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v8di)(__m512i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64gather_epi64(index, addr, scale) \
-  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
-                                        (void const *)(addr), \
-                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
-                                        (int)(scale)))
-
-#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
-  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v8di)(__m512i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i32gather_ps(index, addr, scale) \
-  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
-                                        (void const *)(addr), \
-                                        (__v16si)(__m512)(index), \
-                                        (__mmask16)-1, (int)(scale)))
-
-#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
-  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v16si)(__m512)(index), \
-                                        (__mmask16)(mask), (int)(scale)))
-
-#define _mm512_i32gather_epi32(index, addr, scale) \
-  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
-                                         (void const *)(addr), \
-                                         (__v16si)(__m512i)(index), \
-                                         (__mmask16)-1, (int)(scale)))
-
-#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
-  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v16si)(__m512i)(index), \
-                                         (__mmask16)(mask), (int)(scale)))
-
-#define _mm512_i32gather_pd(index, addr, scale) \
-  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
-                                        (void const *)(addr), \
-                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
-                                        (int)(scale)))
-
-#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
-  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v8si)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i32gather_epi64(index, addr, scale) \
-  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
-                                        (void const *)(addr), \
-                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
-                                        (int)(scale)))
-
-#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
-  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v8si)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64scatter_ps(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
-                                (__v8di)(__m512i)(index), \
-                                (__v8sf)(__m256)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
-                                (__v8di)(__m512i)(index), \
-                                (__v8sf)(__m256)(v1), (int)(scale))
-
-#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
-                                (__v8di)(__m512i)(index), \
-                                (__v8si)(__m256i)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
-                                (__v8di)(__m512i)(index), \
-                                (__v8si)(__m256i)(v1), (int)(scale))
-
-#define _mm512_i64scatter_pd(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
-                               (__v8di)(__m512i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
-                               (__v8di)(__m512i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
-                               (__v8di)(__m512i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
-                               (__v8di)(__m512i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale))
-
-#define _mm512_i32scatter_ps(addr, index, v1, scale) \
-  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
-                                (__v16si)(__m512i)(index), \
-                                (__v16sf)(__m512)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
-  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
-                                (__v16si)(__m512i)(index), \
-                                (__v16sf)(__m512)(v1), (int)(scale))
-
-#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
-  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
-                                (__v16si)(__m512i)(index), \
-                                (__v16si)(__m512i)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
-  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
-                                (__v16si)(__m512i)(index), \
-                                (__v16si)(__m512i)(v1), (int)(scale))
-
-#define _mm512_i32scatter_pd(addr, index, v1, scale) \
-  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
-                               (__v8si)(__m256i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
-  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
-                               (__v8si)(__m256i)(index), \
-                               (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
-  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
-                               (__v8si)(__m256i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
-  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
-                               (__v8si)(__m256i)(index), \
-                               (__v8di)(__m512i)(v1), (int)(scale))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
-                                       (__v4sf)__A,
-                                       (__v4sf)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_ss(A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), \
-                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
-                                         (int)(R)))
-
-#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
-                                         (__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
-                                         (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
-                                        (__v4sf)__B,
-                                        (__v4sf)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), \
-                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
-                                        (__v4sf)__X,
-                                        (__v4sf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
-                                          (__v4sf)(__m128)(X), \
-                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
-                                       (__v4sf)__A,
-                                       -(__v4sf)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmsub_round_ss(A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), \
-                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
-                                         (int)(R)))
-
-#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
-                                         (__v4sf)(__m128)(A), \
-                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
-                                         (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
-                                        (__v4sf)__B,
-                                        -(__v4sf)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), \
-                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
-                                        (__v4sf)__X,
-                                        (__v4sf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
-  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
-                                          (__v4sf)(__m128)(X), \
-                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
-                                       -(__v4sf)__A,
-                                       (__v4sf)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmadd_round_ss(A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
-                                         -(__v4sf)(__m128)(B), \
-                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
-                                         (int)(R)))
-
-#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
-                                         -(__v4sf)(__m128)(A), \
-                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
-                                         (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
-                                        -(__v4sf)__B,
-                                        (__v4sf)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
-                                          -(__v4sf)(__m128)(B), \
-                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
-                                        -(__v4sf)__X,
-                                        (__v4sf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
-                                          -(__v4sf)(__m128)(X), \
-                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
-                                       -(__v4sf)__A,
-                                       -(__v4sf)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmsub_round_ss(A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
-                                         -(__v4sf)(__m128)(B), \
-                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
-                                         (int)(R)))
-
-#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
-                                         -(__v4sf)(__m128)(A), \
-                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
-                                         (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
-                                        -(__v4sf)__B,
-                                        -(__v4sf)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
-  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
-                                          -(__v4sf)(__m128)(B), \
-                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
-                                        -(__v4sf)__X,
-                                        (__v4sf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
-  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
-                                          -(__v4sf)(__m128)(X), \
-                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
-                                       (__v2df)__A,
-                                       (__v2df)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_sd(A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), \
-                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
-                                          (int)(R)))
-
-#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
-                                          (__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
-                                        (__v2df)__B,
-                                        (__v2df)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), \
-                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
-                                           (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
-                                        (__v2df)__X,
-                                        (__v2df)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
-                                           (__v2df)(__m128d)(X), \
-                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
-                                           (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
-                                       (__v2df)__A,
-                                       -(__v2df)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmsub_round_sd(A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), \
-                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
-                                          (int)(R)))
-
-#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
-                                          (__v2df)(__m128d)(A), \
-                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
-                                        (__v2df)__B,
-                                        -(__v2df)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), \
-                                           -(__v2df)(__m128d)(C), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
-                                        (__v2df)__X,
-                                        (__v2df)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
-  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
-                                           (__v2df)(__m128d)(X), \
-                                           (__v2df)(__m128d)(Y), \
-                                           (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
-                                       -(__v2df)__A,
-                                       (__v2df)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmadd_round_sd(A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
-                                          -(__v2df)(__m128d)(B), \
-                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
-                                          (int)(R)))
-
-#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
-                                          -(__v2df)(__m128d)(A), \
-                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
-                                        -(__v2df)__B,
-                                        (__v2df)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
-                                           -(__v2df)(__m128d)(B), \
-                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
-                                           (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
-                                        -(__v2df)__X,
-                                        (__v2df)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
-                                           -(__v2df)(__m128d)(X), \
-                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
-                                           (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
-                                       -(__v2df)__A,
-                                       -(__v2df)__B,
-                                       (__mmask8)__U,
-                                       _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmsub_round_sd(A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
-                                          -(__v2df)(__m128d)(B), \
-                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
-                                          (int)(R)))
-
-#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
-                                          -(__v2df)(__m128d)(A), \
-                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
-                                          (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
-                                        -(__v2df)__B,
-                                        -(__v2df)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
-  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
-                                           -(__v2df)(__m128d)(B), \
-                                           -(__v2df)(__m128d)(C), \
-                                           (__mmask8)(U), \
-                                           (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
-  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
-                                        -(__v2df)__X,
-                                        (__v2df)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
-  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
-                                           -(__v2df)(__m128d)(X), \
-                                           (__v2df)(__m128d)(Y), \
-                                           (__mmask8)(U), (int)(R)))
-
-#define _mm512_permutex_pd(X, C) \
-  ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)))
-
-#define _mm512_mask_permutex_pd(W, U, X, C) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_permutex_pd((X), (C)), \
-                                        (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_permutex_pd(U, X, C) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_permutex_pd((X), (C)), \
-                                        (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_permutex_epi64(X, C) \
-  ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)))
-
-#define _mm512_mask_permutex_epi64(W, U, X, C) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
-                                       (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_permutex_epi64(U, X, C) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
-                                       (__v8di)_mm512_setzero_si512()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
-{
-  return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
-                                        (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
-                                        (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                     (__v8di)_mm512_permutexvar_epi64(__X, __Y),
-                                     (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
-             __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                     (__v8di)_mm512_permutexvar_epi64(__X, __Y),
-                                     (__v8di)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
-{
-  return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
-                                       (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
-                                       (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
-}
-
-#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                    (__v16si)_mm512_permutexvar_epi32(__X, __Y),
-                                    (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
-             __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                    (__v16si)_mm512_permutexvar_epi32(__X, __Y),
-                                    (__v16si)__W);
-}
-
-#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kand (__mmask16 __A, __mmask16 __B)
-{
-  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kandn (__mmask16 __A, __mmask16 __B)
-{
-  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kor (__mmask16 __A, __mmask16 __B)
-{
-  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_kortestc (__mmask16 __A, __mmask16 __B)
-{
-  return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_kortestz (__mmask16 __A, __mmask16 __B)
-{
-  return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
-  return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
-  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
-  *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
-  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
-{
-  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kxnor (__mmask16 __A, __mmask16 __B)
-{
-  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kxor (__mmask16 __A, __mmask16 __B)
-{
-  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-#define _kand_mask16 _mm512_kand
-#define _kandn_mask16 _mm512_kandn
-#define _knot_mask16 _mm512_knot
-#define _kor_mask16 _mm512_kor
-#define _kxnor_mask16 _mm512_kxnor
-#define _kxor_mask16 _mm512_kxor
-
-#define _kshiftli_mask16(A, I) \
-  ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask16(A, I) \
-  ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_cvtmask16_u32(__mmask16 __A) {
-  return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_cvtu32_mask16(unsigned int __A) {
-  return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_load_mask16(__mmask16 *__A) {
-  return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask16(__mmask16 *__A, __mmask16 __B) {
-  *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_si512 (void * __P, __m512i __A)
-{
-  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
-  __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_stream_load_si512 (void const *__P)
-{
-  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
-  return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_pd (void *__P, __m512d __A)
-{
-  typedef __v8df __v8df_aligned __attribute__((aligned(64)));
-  __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_ps (void *__P, __m512 __A)
-{
-  typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
-  __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
-                  (__v8df) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
-                  (__v8df)
-                  _mm512_setzero_pd (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
-                  (__v8di) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
-                  (__v8di)
-                  _mm512_setzero_si512 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
-                 (__v16sf) __W,
-                 (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
-                 (__v16sf)
-                 _mm512_setzero_ps (),
-                 (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
-                  (__v16si) __W,
-                  (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
-                  (__v16si)
-                  _mm512_setzero_si512 (),
-                  (__mmask16) __U);
-}
-
-#define _mm_cmp_round_ss_mask(X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
-                                       (__v4sf)(__m128)(Y), (int)(P), \
-                                       (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
-                                       (__v4sf)(__m128)(Y), (int)(P), \
-                                       (__mmask8)(M), (int)(R)))
-
-#define _mm_cmp_ss_mask(X, Y, P) \
-  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
-                                       (__v4sf)(__m128)(Y), (int)(P), \
-                                       (__mmask8)-1, \
-                                       _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
-  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
-                                       (__v4sf)(__m128)(Y), (int)(P), \
-                                       (__mmask8)(M), \
-                                       _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_cmp_round_sd_mask(X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
-                                       (__v2df)(__m128d)(Y), (int)(P), \
-                                       (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
-                                       (__v2df)(__m128d)(Y), (int)(P), \
-                                       (__mmask8)(M), (int)(R)))
-
-#define _mm_cmp_sd_mask(X, Y, P) \
-  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
-                                       (__v2df)(__m128d)(Y), (int)(P), \
-                                       (__mmask8)-1, \
-                                       _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
-  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
-                                       (__v2df)(__m128d)(Y), (int)(P), \
-                                       (__mmask8)(M), \
-                                       _MM_FROUND_CUR_DIRECTION))
-
-/* Bit Test */
-
-static __inline __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_test_epi32_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
-                                   _mm512_setzero_si512());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
-                                        _mm512_setzero_si512());
-}
-
-static __inline __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_test_epi64_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
-                                   _mm512_setzero_si512());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
-                                        _mm512_setzero_si512());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
-                                  _mm512_setzero_si512());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
-                                       _mm512_setzero_si512());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
-{
-  return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
-                                  _mm512_setzero_si512());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
-                                       _mm512_setzero_si512());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_movehdup_ps (__m512 __A)
-{
-  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
-                         1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_movehdup_ps(__A),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_movehdup_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_moveldup_ps (__m512 __A)
-{
-  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
-                         0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_moveldup_ps(__A),
-                                             (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_moveldup_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
-                                     _mm_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
-                                     _mm_setzero_pd());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
-{
-  __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
-{
-  __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
-{
-  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
-                                                (__v4sf)_mm_setzero_ps(),
-                                                0, 4, 4, 4);
-
-  return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_load_ss (__mmask8 __U, const float* __A)
-{
-  return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
-                                                (__v4sf) _mm_setzero_ps(),
-                                                __U & 1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
-{
-  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
-                                                 (__v2df)_mm_setzero_pd(),
-                                                 0, 2);
-
-  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_load_sd (__mmask8 __U, const double* __A)
-{
-  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
-                                                  (__v2df) _mm_setzero_pd(),
-                                                  __U & 1);
-}
-
-#define _mm512_shuffle_epi32(A, I) \
-  ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))
-
-#define _mm512_mask_shuffle_epi32(W, U, A, I) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
-                                       (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_shuffle_epi32(U, A, I) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
-                                       (__v16si)_mm512_setzero_si512()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
-                (__v8df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
-                (__v8df) _mm512_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
-                (__v8di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
-                (__v8di) _mm512_setzero_si512 (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
-{
-  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
-              (__v8df) __W,
-              (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
-{
-  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
-              (__v8df) _mm512_setzero_pd(),
-              (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
-              (__v8di) __W,
-              (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
-              (__v8di) _mm512_setzero_si512(),
-              (__mmask8) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
-{
-  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
-                   (__v16sf) __W,
-                   (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
-{
-  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
-                   (__v16sf) _mm512_setzero_ps(),
-                   (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
-              (__v16si) __W,
-              (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
-              (__v16si) _mm512_setzero_si512(),
-              (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
-               (__v16sf) __W,
-               (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
-               (__v16sf) _mm512_setzero_ps(),
-               (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
-                (__v16si) __W,
-                (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
-{
-  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
-                (__v16si) _mm512_setzero_si512(),
-                (__mmask16) __U);
-}
-
-#define _mm512_cvt_roundps_pd(A, R) \
-  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
-                                            (__v8df)_mm512_undefined_pd(), \
-                                            (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
-  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
-                                            (__v8df)(__m512d)(W), \
-                                            (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
-  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
-                                            (__v8df)_mm512_setzero_pd(), \
-                                            (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtps_pd (__m256 __A)
-{
-  return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_cvtps_pd(__A),
-                                              (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
-{
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_cvtps_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtpslo_pd (__m512 __A)
-{
-  return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
-{
-  return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
-              (__v8df) __A,
-              (__v8df) __W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
-{
-  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
-              (__v8df) __A,
-              (__v8df) _mm512_setzero_pd ());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
-             (__v16sf) __A,
-             (__v16sf) __W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
-{
-  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
-             (__v16sf) __A,
-             (__v16sf) _mm512_setzero_ps ());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
-{
-  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
-{
-  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
-{
-  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
-            (__mmask16) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
-{
-  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
-            (__mmask16) __U);
-}
-
-#define _mm_cvt_roundsd_ss(A, B, R) \
-  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v4sf)_mm_undefined_ps(), \
-                                              (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
-  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v4sf)(__m128)(W), \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
-  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
-{
-  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
-                                             (__v2df)__B,
-                                             (__v4sf)__W,
-                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
-{
-  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
-                                             (__v2df)__B,
-                                             (__v4sf)_mm_setzero_ps(),
-                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvtss_i32 _mm_cvtss_si32
-#define _mm_cvtsd_i32 _mm_cvtsd_si32
-#define _mm_cvti32_sd _mm_cvtsi32_sd
-#define _mm_cvti32_ss _mm_cvtsi32_ss
-#ifdef __x86_64__
-#define _mm_cvtss_i64 _mm_cvtss_si64
-#define _mm_cvtsd_i64 _mm_cvtsd_si64
-#define _mm_cvti64_sd _mm_cvtsi64_sd
-#define _mm_cvti64_ss _mm_cvtsi64_ss
-#endif
-
-#ifdef __x86_64__
-#define _mm_cvt_roundi64_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
-                                      (int)(R)))
-
-#define _mm_cvt_roundsi64_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
-                                      (int)(R)))
-#endif
-
-#define _mm_cvt_roundsi32_ss(A, B, R) \
-  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
-
-#define _mm_cvt_roundi32_ss(A, B, R) \
-  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsi64_ss(A, B, R) \
-  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
-                                     (int)(R)))
-
-#define _mm_cvt_roundi64_ss(A, B, R) \
-  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
-                                     (int)(R)))
-#endif
-
-#define _mm_cvt_roundss_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v2df)_mm_undefined_pd(), \
-                                               (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
-  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v2df)(__m128d)(W), \
-                                               (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
-  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
-                                               (__v4sf)(__m128)(B), \
-                                               (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
-{
-  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
-                                            (__v4sf)__B,
-                                            (__v2df)__W,
-                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
-{
-  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
-                                            (__v4sf)__B,
-                                            (__v2df)_mm_setzero_pd(),
-                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtu32_sd (__m128d __A, unsigned __B)
-{
-  __A[0] = __B;
-  return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundu64_sd(A, B, R) \
-  ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
-                                       (unsigned long long)(B), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
-{
-  __A[0] = __B;
-  return __A;
-}
-#endif
-
-#define _mm_cvt_roundu32_ss(A, B, R) \
-  ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
-                                      (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtu32_ss (__m128 __A, unsigned __B)
-{
-  __A[0] = __B;
-  return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundu64_ss(A, B, R) \
-  ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
-                                      (unsigned long long)(B), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
-{
-  __A[0] = __B;
-  return __A;
-}
-#endif
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
-{
-  return (__m512i) __builtin_ia32_selectd_512(__M,
-                                              (__v16si) _mm512_set1_epi32(__A),
-                                              (__v16si) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
-{
-  return (__m512i) __builtin_ia32_selectq_512(__M,
-                                              (__v8di) _mm512_set1_epi64(__A),
-                                              (__v8di) __O);
-}
-
-static  __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
-    char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
-    char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
-    char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
-    char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
-    char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
-    char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
-    char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
-    char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
-    char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
-    char __e4, char __e3, char __e2, char __e1, char __e0) {
-
-  return __extension__ (__m512i)(__v64qi)
-    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
-     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
-     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
-     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
-     __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
-     __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
-     __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
-     __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
-}
-
-static  __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
-    short __e27, short __e26, short __e25, short __e24, short __e23,
-    short __e22, short __e21, short __e20, short __e19, short __e18,
-    short __e17, short __e16, short __e15, short __e14, short __e13,
-    short __e12, short __e11, short __e10, short __e9, short __e8,
-    short __e7, short __e6, short __e5, short __e4, short __e3,
-    short __e2, short __e1, short __e0) {
-  return __extension__ (__m512i)(__v32hi)
-    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
-     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
-     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
-     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi32 (int __A, int __B, int __C, int __D,
-     int __E, int __F, int __G, int __H,
-     int __I, int __J, int __K, int __L,
-     int __M, int __N, int __O, int __P)
-{
-  return __extension__ (__m512i)(__v16si)
-  { __P, __O, __N, __M, __L, __K, __J, __I,
-    __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,           \
-       e8,e9,e10,e11,e12,e13,e14,e15)          \
-  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
-                   (e5),(e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi64 (long long __A, long long __B, long long __C,
-     long long __D, long long __E, long long __F,
-     long long __G, long long __H)
-{
-  return __extension__ (__m512i) (__v8di)
-  { __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)           \
-  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_set_pd (double __A, double __B, double __C, double __D,
-        double __E, double __F, double __G, double __H)
-{
-  return __extension__ (__m512d)
-  { __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)              \
-  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_set_ps (float __A, float __B, float __C, float __D,
-        float __E, float __F, float __G, float __H,
-        float __I, float __J, float __K, float __L,
-        float __M, float __N, float __O, float __P)
-{
-  return __extension__ (__m512)
-  { __P, __O, __N, __M, __L, __K, __J, __I,
-    __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
-  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
-                (e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_abs_ps(__m512 __A)
-{
-  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
-{
-  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_abs_pd(__m512d __A)
-{
-  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
-{
-  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
-}
-
-/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
- * outputs. This class of vector operation forms the basis of many scientific
- * computations. In vector-reduction arithmetic, the evaluation order is
- * independent of the order of the input elements of V.
-
- * For floating-point intrinsics:
- * 1. When using fadd/fmul intrinsics, the order of operations within the
- * vector is unspecified (associative math).
- * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
- * produce unspecified results.
-
- * Used bisection method. At each step, we partition the vector with previous
- * step in half, and the operation is performed on its two halves.
- * This takes log2(n) steps where n is the number of elements in the vector.
- */
-
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
-  return __builtin_reduce_add((__v8di)__W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
-  return __builtin_reduce_mul((__v8di)__W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
-  return __builtin_reduce_and((__v8di)__W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
-  return __builtin_reduce_or((__v8di)__W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
-  __W = _mm512_maskz_mov_epi64(__M, __W);
-  return __builtin_reduce_add((__v8di)__W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
-  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
-  return __builtin_reduce_mul((__v8di)__W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
-  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
-  return __builtin_reduce_and((__v8di)__W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
-  __W = _mm512_maskz_mov_epi64(__M, __W);
-  return __builtin_reduce_or((__v8di)__W);
-}
-
-// -0.0 is used to ignore the start value since it is the neutral value of
-// floating point addition. For more information, please refer to
-// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
-static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
-  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
-  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
-  __W = _mm512_maskz_mov_pd(__M, __W);
-  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
-  __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
-  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_add_epi32(__m512i __W) {
-  return __builtin_reduce_add((__v16si)__W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_mul_epi32(__m512i __W) {
-  return __builtin_reduce_mul((__v16si)__W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_and_epi32(__m512i __W) {
-  return __builtin_reduce_and((__v16si)__W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_or_epi32(__m512i __W) {
-  return __builtin_reduce_or((__v16si)__W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
-  __W = _mm512_maskz_mov_epi32(__M, __W);
-  return __builtin_reduce_add((__v16si)__W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
-  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
-  return __builtin_reduce_mul((__v16si)__W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
-  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
-  return __builtin_reduce_and((__v16si)__W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
-  __W = _mm512_maskz_mov_epi32(__M, __W);
-  return __builtin_reduce_or((__v16si)__W);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_add_ps(__m512 __W) {
-  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_mul_ps(__m512 __W) {
-  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
-  __W = _mm512_maskz_mov_ps(__M, __W);
-  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
-  __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
-  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epi64(__m512i __V) {
-  return __builtin_reduce_max((__v8di)__V);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epu64(__m512i __V) {
-  return __builtin_reduce_max((__v8du)__V);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epi64(__m512i __V) {
-  return __builtin_reduce_min((__v8di)__V);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epu64(__m512i __V) {
-  return __builtin_reduce_min((__v8du)__V);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
-  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
-  return __builtin_reduce_max((__v8di)__V);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
-  __V = _mm512_maskz_mov_epi64(__M, __V);
-  return __builtin_reduce_max((__v8du)__V);
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
-  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
-  return __builtin_reduce_min((__v8di)__V);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
-  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
-  return __builtin_reduce_min((__v8du)__V);
-}
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epi32(__m512i __V) {
-  return __builtin_reduce_max((__v16si)__V);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epu32(__m512i __V) {
-  return __builtin_reduce_max((__v16su)__V);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epi32(__m512i __V) {
-  return __builtin_reduce_min((__v16si)__V);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epu32(__m512i __V) {
-  return __builtin_reduce_min((__v16su)__V);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
-  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
-  return __builtin_reduce_max((__v16si)__V);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
-  __V = _mm512_maskz_mov_epi32(__M, __V);
-  return __builtin_reduce_max((__v16su)__V);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
-  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
-  return __builtin_reduce_min((__v16si)__V);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
-  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
-  return __builtin_reduce_min((__v16su)__V);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_pd(__m512d __V) {
-  return __builtin_ia32_reduce_fmax_pd512(__V);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_pd(__m512d __V) {
-  return __builtin_ia32_reduce_fmin_pd512(__V);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
-  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
-  return __builtin_ia32_reduce_fmax_pd512(__V);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
-  __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
-  return __builtin_ia32_reduce_fmin_pd512(__V);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_ps(__m512 __V) {
-  return __builtin_ia32_reduce_fmax_ps512(__V);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_ps(__m512 __V) {
-  return __builtin_ia32_reduce_fmin_ps512(__V);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
-  __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
-  return __builtin_ia32_reduce_fmax_ps512(__V);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
-  __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
-  return __builtin_ia32_reduce_fmin_ps512(__V);
-}
-
-/// Moves the least significant 32 bits of a vector of [16 x i32] to a
-///    32-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __A
-///    A vector of [16 x i32]. The least significant 32 bits are moved to the
-///    destination.
-/// \returns A 32-bit signed integer containing the moved value.
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_cvtsi512_si32(__m512i __A) {
-  __v16si __b = (__v16si)__A;
-  return __b[0];
-}
-
-/// Loads 8 double-precision (64-bit) floating-point elements stored at memory
-/// locations starting at location \a base_addr at packed 32-bit integer indices
-/// stored in the lower half of \a vindex scaled by \a scale them in dst.
-///
-/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///   dst[i+63:i] := MEM[addr+63:addr]
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endcode
-#define _mm512_i32logather_pd(vindex, base_addr, scale)                        \
-  _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
-
-/// Loads 8 double-precision (64-bit) floating-point elements from memory
-/// starting at location \a base_addr at packed 32-bit integer indices stored in
-/// the lower half of \a vindex scaled by \a scale into dst using writemask
-/// \a mask (elements are copied from \a src when the corresponding mask bit is
-/// not set).
-///
-/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   IF mask[j]
-///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///     dst[i+63:i] := MEM[addr+63:addr]
-///   ELSE
-///     dst[i+63:i] := src[i+63:i]
-///   FI
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endcode
-#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale)        \
-  _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex),      \
-                           (base_addr), (scale))
-
-/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
-/// at packed 32-bit integer indices stored in the lower half of \a vindex
-/// scaled by \a scale and stores them in dst.
-///
-/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///   dst[i+63:i] := MEM[addr+63:addr]
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endcode
-#define _mm512_i32logather_epi64(vindex, base_addr, scale)                     \
-  _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
-
-/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
-/// at packed 32-bit integer indices stored in the lower half of \a vindex
-/// scaled by \a scale and stores them in dst using writemask \a mask (elements
-/// are copied from \a src when the corresponding mask bit is not set).
-///
-/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   IF mask[j]
-///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///     dst[i+63:i] := MEM[addr+63:addr]
-///   ELSE
-///     dst[i+63:i] := src[i+63:i]
-///   FI
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endcode
-#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale)     \
-  _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex),   \
-                              (base_addr), (scale))
-
-/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
-/// and to memory locations starting at location \a base_addr at packed 32-bit
-/// integer indices stored in \a vindex scaled by \a scale.
-///
-/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///   MEM[addr+63:addr] := v1[i+63:i]
-/// ENDFOR
-/// \endcode
-#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale)                   \
-  _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
-
-/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
-/// to memory locations starting at location \a base_addr at packed 32-bit
-/// integer indices stored in \a vindex scaled by \a scale. Only those elements
-/// whose corresponding mask bit is set in writemask \a mask are written to
-/// memory.
-///
-/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   IF mask[j]
-///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///     MEM[addr+63:addr] := a[i+63:i]
-///   FI
-/// ENDFOR
-/// \endcode
-#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale)        \
-  _mm512_mask_i32scatter_pd((base_addr), (mask),                               \
-                            _mm512_castsi512_si256(vindex), (v1), (scale))
-
-/// Stores 8 packed 64-bit integer elements located in \a v1 and stores them in
-/// memory locations starting at location \a base_addr at packed 32-bit integer
-/// indices stored in \a vindex scaled by \a scale.
-///
-/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///   MEM[addr+63:addr] := a[i+63:i]
-/// ENDFOR
-/// \endcode
-#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale)                \
-  _mm512_i32scatter_epi64((base_addr),                                         \
-                          _mm512_castsi512_si256(vindex), (v1), (scale))
-
-/// Stores 8 packed 64-bit integer elements located in a and stores them in
-/// memory locations starting at location \a base_addr at packed 32-bit integer
-/// indices stored in \a vindex scaled by scale using writemask \a mask (elements
-/// whose corresponding mask bit is not set are not written to memory).
-///
-/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-///   i := j*64
-///   m := j*32
-///   IF mask[j]
-///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-///     MEM[addr+63:addr] := a[i+63:i]
-///   FI
-/// ENDFOR
-/// \endcode
-#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale)     \
-  _mm512_mask_i32scatter_epi64((base_addr), (mask),                            \
-                               _mm512_castsi512_si256(vindex), (v1), (scale))
-
-#undef __DEFAULT_FN_ATTRS512
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __AVX512FINTRIN_H */
diff --git a/third_party/intel/clang/avx512fp16intrin.h b/third_party/intel/clang/avx512fp16intrin.h
deleted file mode 100644
index e136aa14a..000000000
--- a/third_party/intel/clang/avx512fp16intrin.h
+++ /dev/null
@@ -1,3352 +0,0 @@
-/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifdef __SSE2__
-
-#ifndef __AVX512FP16INTRIN_H
-#define __AVX512FP16INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
-typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
-typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16,evex512"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16,no-evex512"),                          \
-                 __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16,no-evex512"),                          \
-                 __min_vector_width__(128)))
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
-  return __a[0];
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
-  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
-  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
-  return (__m256h)__builtin_ia32_undef256();
-}
-
-static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
-  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
-  return (__m128h)__builtin_ia32_undef128();
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
-  return (__m512h)__builtin_ia32_undef512();
-}
-
-static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
-  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
-                            __h, __h, __h, __h, __h, __h, __h, __h,
-                            __h, __h, __h, __h, __h, __h, __h, __h,
-                            __h, __h, __h, __h, __h, __h, __h, __h};
-}
-
-static __inline __m512h __DEFAULT_FN_ATTRS512
-_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
-              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
-              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
-              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
-              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
-              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
-              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
-              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
-  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
-                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
-                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
-                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
-                            __h4,  __h3,  __h2,  __h1};
-}
-
-#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
-                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
-                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
-  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
-                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
-                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
-                (h5), (h4), (h3), (h2), (h1))
-
-static __inline __m512h __DEFAULT_FN_ATTRS512
-_mm512_set1_pch(_Float16 _Complex __h) {
-  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
-  return (__m128)__a;
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
-  return (__m256)__a;
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
-  return (__m512)__a;
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
-  return (__m128d)__a;
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
-  return (__m256d)__a;
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
-  return (__m512d)__a;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
-  return (__m128i)__a;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_castph_si256(__m256h __a) {
-  return (__m256i)__a;
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_castph_si512(__m512h __a) {
-  return (__m512i)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
-  return (__m128h)__a;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
-  return (__m256h)__a;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
-  return (__m512h)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
-  return (__m128h)__a;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
-  return (__m256h)__a;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
-  return (__m512h)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
-  return (__m128h)__a;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_castsi256_ph(__m256i __a) {
-  return (__m256h)__a;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_castsi512_ph(__m512i __a) {
-  return (__m512h)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_castph256_ph128(__m256h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_castph512_ph128(__m512h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_castph512_ph256(__m512h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                                 12, 13, 14, 15);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_castph128_ph256(__m128h __a) {
-  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
-                                  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_castph128_ph512(__m128h __a) {
-  __m256h __b = __builtin_nondeterministic_value(__b);
-  return __builtin_shufflevector(
-      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
-                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
-      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_castph256_ph512(__m256h __a) {
-  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
-                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
-                                 27, 28, 29, 30, 31);
-}
-
-/// Constructs a 256-bit floating-point vector of [16 x half] from a
-///    128-bit floating-point vector of [8 x half]. The lower 128 bits
-///    contain the value of the source vector. The upper 384 bits are set
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x half].
-/// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
-///    contain the value of the parameter. The upper 384 bits are set to zero.
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_zextph128_ph256(__m128h __a) {
-  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
-                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-/// Constructs a 512-bit floating-point vector of [32 x half] from a
-///    128-bit floating-point vector of [8 x half]. The lower 128 bits
-///    contain the value of the source vector. The upper 384 bits are set
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x half].
-/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
-///    contain the value of the parameter. The upper 384 bits are set to zero.
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_zextph128_ph512(__m128h __a) {
-  return __builtin_shufflevector(
-      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-/// Constructs a 512-bit floating-point vector of [32 x half] from a
-///    256-bit floating-point vector of [16 x half]. The lower 256 bits
-///    contain the value of the source vector. The upper 256 bits are set
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit vector of [16 x half].
-/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
-///    contain the value of the parameter. The upper 256 bits are set to zero.
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_zextph256_ph512(__m256h __a) {
-  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
-                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
-                                 29, 30, 31);
-}
-
-#define _mm_comi_round_sh(A, B, P, R)                                          \
-  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))
-
-#define _mm_comi_sh(A, B, pred)                                                \
-  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
-                                                          __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
-                                                          __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
-                                                          __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
-                                                          __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
-                                                          __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
-                                                           __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
-                                                           __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
-                                                           __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
-                                                           __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
-                                                           __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
-                                                           __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
-                                                            __m128h __B) {
-  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
-                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
-                                                              __m512h __B) {
-  return (__m512h)((__v32hf)__A + (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
-                                              (__v32hf)_mm512_add_ph(__A, __B),
-                                              (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_add_round_ph(A, B, R)                                           \
-  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
-                                    (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
-      (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
-      (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
-                                                              __m512h __B) {
-  return (__m512h)((__v32hf)__A - (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
-                                              (__v32hf)_mm512_sub_ph(__A, __B),
-                                              (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_sub_round_ph(A, B, R)                                           \
-  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
-                                    (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
-      (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
-      (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
-                                                              __m512h __B) {
-  return (__m512h)((__v32hf)__A * (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
-                                              (__v32hf)_mm512_mul_ph(__A, __B),
-                                              (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_mul_round_ph(A, B, R)                                           \
-  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
-                                    (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
-      (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
-      (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
-                                                              __m512h __B) {
-  return (__m512h)((__v32hf)__A / (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
-                                              (__v32hf)_mm512_div_ph(__A, __B),
-                                              (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_div_round_ph(A, B, R)                                           \
-  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
-                                    (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
-      (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
-      (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
-                                                              __m512h __B) {
-  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
-                                          _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
-                                              (__v32hf)_mm512_min_ph(__A, __B),
-                                              (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_min_round_ph(A, B, R)                                           \
-  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
-                                    (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
-      (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
-      (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
-                                                              __m512h __B) {
-  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
-                                          _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
-                                              (__v32hf)_mm512_max_ph(__A, __B),
-                                              (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_max_round_ph(A, B, R)                                           \
-  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
-                                    (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
-      (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
-      (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
-  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
-  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_selectps_512(
-      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
-                                              (__v16sf)_mm512_conj_pch(__A),
-                                              (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
-                                                           __m128h __B) {
-  __A[0] += __B[0];
-  return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  __A = _mm_add_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  __A = _mm_add_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
-
-#define _mm_add_round_sh(A, B, R)                                              \
-  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
-  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
-                                                           __m128h __B) {
-  __A[0] -= __B[0];
-  return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  __A = _mm_sub_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  __A = _mm_sub_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
-
-#define _mm_sub_round_sh(A, B, R)                                              \
-  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
-  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
-                                                           __m128h __B) {
-  __A[0] *= __B[0];
-  return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  __A = _mm_mul_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  __A = _mm_mul_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
-
-#define _mm_mul_round_sh(A, B, R)                                              \
-  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
-  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
-                                                           __m128h __B) {
-  __A[0] /= __B[0];
-  return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  __A = _mm_div_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  __A = _mm_div_sh(__A, __B);
-  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
-
-#define _mm_div_round_sh(A, B, R)                                              \
-  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
-  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)__builtin_ia32_minsh_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
-                                                  (__v8hf)__W, (__mmask8)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_minsh_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_min_round_sh(A, B, R)                                              \
-  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
-  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)__builtin_ia32_maxsh_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
-                                                  (__v8hf)__W, (__mmask8)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_maxsh_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_max_round_sh(A, B, R)                                              \
-  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
-  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
-  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
-                                           (__v32hf)(__m512h)(B), (int)(P),    \
-                                           (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
-  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
-                                           (__v32hf)(__m512h)(B), (int)(P),    \
-                                           (__mmask32)(U), (int)(R)))
-
-#define _mm512_cmp_ph_mask(A, B, P)                                            \
-  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
-  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
-  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
-                                       (__v8hf)(__m128h)(Y), (int)(P),         \
-                                       (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
-  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
-                                       (__v8hf)(__m128h)(Y), (int)(P),         \
-                                       (__mmask8)(M), (int)(R)))
-
-#define _mm_cmp_sh_mask(X, Y, P)                                               \
-  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
-      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
-      _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
-  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
-      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
-      _MM_FROUND_CUR_DIRECTION))
-// loads with vmovsh:
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
-  struct __mm_load_sh_struct {
-    _Float16 __u;
-  } __attribute__((__packed__, __may_alias__));
-  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
-  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
-  __m128h src = (__v8hf)__builtin_shufflevector(
-      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
-
-  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
-  return (__m128h)__builtin_ia32_loadsh128_mask(
-      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_load_ph(void const *__p) {
-  return *(const __m512h *)__p;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_load_ph(void const *__p) {
-  return *(const __m256h *)__p;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
-  return *(const __m128h *)__p;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_loadu_ph(void const *__p) {
-  struct __loadu_ph {
-    __m512h_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_ph *)__p)->__v;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_loadu_ph(void const *__p) {
-  struct __loadu_ph {
-    __m256h_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_ph *)__p)->__v;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
-  struct __loadu_ph {
-    __m128h_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_ph *)__p)->__v;
-}
-
-// stores with vmovsh:
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
-                                                          __m128h __a) {
-  struct __mm_store_sh_struct {
-    _Float16 __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
-                                                               __mmask8 __U,
-                                                               __m128h __A) {
-  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
-                                                             __m512h __A) {
-  *(__m512h *)__P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
-                                                             __m256h __A) {
-  *(__m256h *)__P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
-                                                          __m128h __A) {
-  *(__m128h *)__P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
-                                                              __m512h __A) {
-  struct __storeu_ph {
-    __m512h_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_ph *)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
-                                                              __m256h __A) {
-  struct __storeu_ph {
-    __m256h_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_ph *)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
-                                                           __m128h __A) {
-  struct __storeu_ph {
-    __m128h_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_ph *)__P)->__v = __A;
-}
-
-// moves with vmovsh:
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
-                                                            __m128h __b) {
-  __a[0] = __b[0];
-  return __a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
-                                                                 __mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __B) {
-  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
-                                     _mm_setzero_ph());
-}
-
-// vmovw:
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
-  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
-  __v8hi __b = (__v8hi)__a;
-  return __b[0];
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
-  return (__m512h)__builtin_ia32_rcpph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
-                                               (__mmask32)__U);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_rcpph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
-  return (__m512h)__builtin_ia32_rsqrtph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
-                                                 (__mmask32)__U);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_rsqrtph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
-}
-
-#define _mm512_getmant_ph(A, B, C)                                             \
-  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
-      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
-      _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
-  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
-      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
-  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
-      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_getmant_round_ph(A, B, C, R)                                    \
-  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
-      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
-  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
-  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
-      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
-  return (__m512h)__builtin_ia32_getexpph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_getexpph512_mask(
-      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_getexpph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_getexp_round_ph(A, R)                                           \
-  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
-                                            (__v32hf)_mm512_undefined_ph(),    \
-                                            (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
-  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
-  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
-                                            (__v32hf)_mm512_setzero_ph(),      \
-                                            (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
-                                                                 __m512h __B) {
-  return (__m512h)__builtin_ia32_scalefph512_mask(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
-                                                  (__v32hf)__W, (__mmask32)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_scalefph512_mask(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_scalef_round_ph(A, B, R)                                        \
-  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
-      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
-  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
-  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
-      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-#define _mm512_roundscale_ph(A, B)                                             \
-  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
-      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
-      _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
-  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
-      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
-      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
-  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
-      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
-      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
-  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
-                                           (__v32hf)(__m512h)(A),              \
-                                           (__mmask32)(B), (int)(R)))
-
-#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
-  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
-                                           (__v32hf)_mm512_setzero_ph(),       \
-                                           (__mmask32)(A), (int)(R)))
-
-#define _mm512_roundscale_round_ph(A, imm, R)                                  \
-  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
-                                           (__v32hf)_mm512_undefined_ph(),     \
-                                           (__mmask32)-1, (int)(R)))
-
-#define _mm512_reduce_ph(A, imm)                                               \
-  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
-      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
-  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
-      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
-  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
-      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
-  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
-                                            (__v32hf)(__m512h)(W),             \
-                                            (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
-  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
-                                            (__v32hf)_mm512_setzero_ph(),      \
-                                            (__mmask32)(U), (int)(R)))
-
-#define _mm512_reduce_round_ph(A, imm, R)                                      \
-  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
-                                            (__v32hf)_mm512_undefined_ph(),    \
-                                            (__mmask32)-1, (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)__builtin_ia32_rcpsh_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
-                                            (__v8hf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_rcpsh_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
-                                                             __m128h __B) {
-  return (__m128h)__builtin_ia32_rsqrtsh_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __B) {
-  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
-                                              (__v8hf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_rsqrtsh_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
-  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
-      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
-
-#define _mm_getmant_sh(A, B, C, D)                                             \
-  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
-      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
-  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
-      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
-  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
-      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
-  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
-      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
-  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
-      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-#define _mm_getexp_round_sh(A, B, R)                                           \
-  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
-                                                              __m128h __B) {
-  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
-  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
-  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_scalef_round_sh(A, B, R)                                           \
-  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
-                                                              __m128h __B) {
-  return (__m128h)__builtin_ia32_scalefsh_round_mask(
-      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
-                                                     (__v8hf)__W, (__mmask8)__U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
-  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_scalefsh_round_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
-  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
-  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(imm), (int)(R)))
-
-#define _mm_roundscale_sh(A, B, imm)                                           \
-  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
-  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
-  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(I), (int)(R)))
-
-#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
-  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
-  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(I), (int)(R)))
-
-#define _mm_reduce_sh(A, B, C)                                                 \
-  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
-  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
-  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_reduce_round_sh(A, B, C, R)                                        \
-  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(C), (int)(R)))
-
-#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
-  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(C), (int)(R)))
-
-#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
-  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(C), (int)(R)))
-
-#define _mm512_sqrt_round_ph(A, R)                                             \
-  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
-
-#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
-      (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
-  ((__m512h)__builtin_ia32_selectph_512(                                       \
-      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
-      (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
-  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
-                                           _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)(__U),
-      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
-      (__v32hf)(__m512h)(__W));
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_selectph_512(
-      (__mmask32)(__U),
-      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
-      (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm_sqrt_round_sh(A, B, R)                                             \
-  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
-  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
-  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
-                                                            __m128h __B) {
-  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
-      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
-      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
-                                                                 __mmask32 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
-      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
-      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __B) {
-  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
-      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
-      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
-  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
-                                               (int)(imm), (__mmask32)(U)))
-
-#define _mm512_fpclass_ph_mask(A, imm)                                         \
-  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
-                                               (int)(imm), (__mmask32)-1))
-
-#define _mm_fpclass_sh_mask(A, imm)                                            \
-  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
-                                           (__mmask8)-1))
-
-#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
-  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
-                                           (__mmask8)(U)))
-
-#define _mm512_cvt_roundpd_ph(A, R)                                            \
-  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
-      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
-  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W),         \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
-  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
-      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
-      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
-      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
-      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_pd(A, R)                                            \
-  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
-      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundph_pd(W, U, A, R)                                 \
-  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W),         \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_pd(U, A, R)                                   \
-  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
-      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
-  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
-      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
-  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
-      (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
-  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
-      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_ss(A, B, R)                                            \
-  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
-                                               (__v4sf)_mm_undefined_ps(),     \
-                                               (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R)                                 \
-  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask(                                \
-      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsh_ss(U, A, B, R)                                   \
-  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
-                                               (__v4sf)_mm_setzero_ps(),       \
-                                               (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
-                                                            __m128h __B) {
-  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
-      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
-                                                                 __mmask8 __U,
-                                                                 __m128 __A,
-                                                                 __m128h __B) {
-  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
-                                                     (__v4sf)__W, (__mmask8)__U,
-                                                     _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
-                                                                  __m128 __A,
-                                                                  __m128h __B) {
-  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
-      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundss_sh(A, B, R)                                            \
-  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
-                                                (__v8hf)_mm_undefined_ph(),    \
-                                                (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundss_sh(W, U, A, B, R)                                 \
-  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask(                               \
-      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundss_sh(U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
-                                                (__v8hf)_mm_setzero_ph(),      \
-                                                (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
-                                                             __m128 __B) {
-  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
-      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128 __B) {
-  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
-      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
-                                                                   __m128h __A,
-                                                                   __m128 __B) {
-  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
-      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsd_sh(A, B, R)                                            \
-  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
-                                                (__v8hf)_mm_undefined_ph(),    \
-                                                (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R)                                 \
-  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask(                               \
-      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsd_sh(U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
-                                                (__v8hf)_mm_setzero_ph(),      \
-                                                (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
-                                                             __m128d __B) {
-  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
-      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128d __B) {
-  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
-      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
-  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
-      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_sd(A, B, R)                                            \
-  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
-                                                (__v2df)_mm_undefined_pd(),    \
-                                                (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R)                                 \
-  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask(                               \
-      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsh_sd(U, A, B, R)                                   \
-  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
-                                                (__v2df)_mm_setzero_pd(),      \
-                                                (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
-                                                             __m128h __B) {
-  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
-      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
-                                                                  __mmask8 __U,
-                                                                  __m128d __A,
-                                                                  __m128h __B) {
-  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
-      (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
-  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
-      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epi16(A, R)                                         \
-  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
-                                            (__v32hi)_mm512_undefined_epi32(), \
-                                            (__mmask32)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epi16(W, U, A, R)                              \
-  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W),        \
-                                            (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epi16(U, A, R)                                \
-  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
-                                            (__v32hi)_mm512_setzero_epi32(),   \
-                                            (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epi16(__m512h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
-      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
-      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
-      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epi16(A, R)                                        \
-  ((__m512i)__builtin_ia32_vcvttph2w512_mask(                                  \
-      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1),        \
-      (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R)                             \
-  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W),       \
-                                             (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epi16(U, A, R)                               \
-  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A),                     \
-                                             (__v32hi)_mm512_setzero_epi32(),  \
-                                             (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epi16(__m512h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
-      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
-      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
-      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi16_ph(A, R)                                         \
-  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A),                      \
-                                            (__v32hf)_mm512_undefined_ph(),    \
-                                            (__mmask32)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R)                              \
-  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W),        \
-                                            (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi16_ph(U, A, R)                                \
-  ((__m512h)__builtin_ia32_vcvtw2ph512_mask(                                   \
-      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_ph(__m512i __A) {
-  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
-      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
-  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
-      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
-  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
-      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epu16(A, R)                                         \
-  ((__m512i)__builtin_ia32_vcvtph2uw512_mask(                                  \
-      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
-      (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epu16(W, U, A, R)                              \
-  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W),       \
-                                             (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epu16(U, A, R)                                \
-  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A),                     \
-                                             (__v32hu)_mm512_setzero_epi32(),  \
-                                             (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epu16(__m512h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
-      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
-      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
-      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epu16(A, R)                                        \
-  ((__m512i)__builtin_ia32_vcvttph2uw512_mask(                                 \
-      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
-      (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R)                             \
-  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W),      \
-                                              (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epu16(U, A, R)                               \
-  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A),                    \
-                                              (__v32hu)_mm512_setzero_epi32(), \
-                                              (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epu16(__m512h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
-      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
-      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
-      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu16_ph(A, R)                                         \
-  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A),                     \
-                                             (__v32hf)_mm512_undefined_ph(),   \
-                                             (__mmask32)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R)                              \
-  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W),       \
-                                             (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
-  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
-      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_cvtepu16_ph(__m512i __A) {
-  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
-      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
-  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
-      (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
-  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
-      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epi32(A, R)                                         \
-  ((__m512i)__builtin_ia32_vcvtph2dq512_mask(                                  \
-      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
-      (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epi32(W, U, A, R)                              \
-  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W),       \
-                                             (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epi32(U, A, R)                                \
-  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A),                     \
-                                             (__v16si)_mm512_setzero_epi32(),  \
-                                             (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epi32(__m256h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
-      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
-      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
-      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epu32(A, R)                                         \
-  ((__m512i)__builtin_ia32_vcvtph2udq512_mask(                                 \
-      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
-      (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epu32(W, U, A, R)                              \
-  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W),      \
-                                              (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epu32(U, A, R)                                \
-  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A),                    \
-                                              (__v16su)_mm512_setzero_epi32(), \
-                                              (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epu32(__m256h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
-      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
-      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
-      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi32_ph(A, R)                                         \
-  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A),                     \
-                                             (__v16hf)_mm256_undefined_ph(),   \
-                                             (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R)                              \
-  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W),       \
-                                             (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi32_ph(U, A, R)                                \
-  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask(                                  \
-      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_ph(__m512i __A) {
-  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
-      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
-  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
-      (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
-  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
-      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu32_ph(A, R)                                         \
-  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A),                    \
-                                              (__v16hf)_mm256_undefined_ph(),  \
-                                              (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R)                              \
-  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W),      \
-                                              (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu32_ph(U, A, R)                                \
-  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask(                                 \
-      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_ph(__m512i __A) {
-  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
-      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
-  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
-      (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
-  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
-      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epi32(A, R)                                        \
-  ((__m512i)__builtin_ia32_vcvttph2dq512_mask(                                 \
-      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
-      (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R)                             \
-  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W),      \
-                                              (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epi32(U, A, R)                               \
-  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A),                    \
-                                              (__v16si)_mm512_setzero_epi32(), \
-                                              (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epi32(__m256h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
-      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
-      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
-      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epu32(A, R)                                        \
-  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
-      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
-      (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R)                             \
-  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W),     \
-                                               (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epu32(U, A, R)                               \
-  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
-      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U),           \
-      (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epu32(__m256h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
-      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
-      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
-      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi64_ph(A, R)                                         \
-  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
-      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R)                              \
-  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W),         \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi64_ph(U, A, R)                                \
-  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
-      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_ph(__m512i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
-      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
-      (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
-      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epi64(A, R)                                         \
-  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A),                      \
-                                             (__v8di)_mm512_undefined_epi32(), \
-                                             (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epi64(W, U, A, R)                              \
-  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W),         \
-                                             (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epi64(U, A, R)                                \
-  ((__m512i)__builtin_ia32_vcvtph2qq512_mask(                                  \
-      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epi64(__m128h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
-      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
-      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
-      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu64_ph(A, R)                                         \
-  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
-      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R)                              \
-  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W),        \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu64_ph(U, A, R)                                \
-  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
-      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_cvtepu64_ph(__m512i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
-      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
-      (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
-      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epu64(A, R)                                         \
-  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
-      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
-      (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epu64(W, U, A, R)                              \
-  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W),        \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epu64(U, A, R)                                \
-  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
-      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epu64(__m128h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
-      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
-      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
-      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epi64(A, R)                                        \
-  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
-      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1),           \
-      (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R)                             \
-  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W),        \
-                                              (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epi64(U, A, R)                               \
-  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
-      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epi64(__m128h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
-      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
-      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
-      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epu64(A, R)                                        \
-  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
-      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
-      (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R)                             \
-  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W),       \
-                                               (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epu64(U, A, R)                               \
-  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
-      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epu64(__m128h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
-      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
-      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
-  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
-      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_i32(A, R)                                              \
-  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
-  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_u32(A, R)                                              \
-  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS128
-_mm_cvtsh_u32(__m128h __A) {
-  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsh_i64(A, R)                                              \
-  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
-  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
-                                               _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_u64(A, R)                                              \
-  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvtsh_u64(__m128h __A) {
-  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
-      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
-}
-#endif // __x86_64__
-
-#define _mm_cvt_roundu32_sh(A, B, R)                                           \
-  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
-  __A[0] = __B;
-  return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundu64_sh(A, B, R)                                           \
-  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
-                                        (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
-  __A[0] = __B;
-  return __A;
-}
-#endif
-
-#define _mm_cvt_roundi32_sh(A, B, R)                                           \
-  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
-                                                              int __B) {
-  __A[0] = __B;
-  return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundi64_sh(A, B, R)                                           \
-  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
-                                                              long long __B) {
-  __A[0] = __B;
-  return __A;
-}
-#endif
-
-#define _mm_cvtt_roundsh_i32(A, R)                                             \
-  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
-  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
-                                          _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsh_i64(A, R)                                             \
-  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
-  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
-                                                _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundsh_u32(A, R)                                             \
-  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS128
-_mm_cvttsh_u32(__m128h __A) {
-  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsh_u64(A, R)                                             \
-  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvttsh_u64(__m128h __A) {
-  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
-      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm512_cvtx_roundph_ps(A, R)                                           \
-  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
-                                             (__v16sf)_mm512_undefined_ps(),   \
-                                             (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
-  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
-                                             (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
-  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
-      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
-  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
-      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
-  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
-      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
-  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
-      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtx_roundps_ph(A, R)                                           \
-  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
-                                              (__v16hf)_mm256_undefined_ph(),  \
-                                              (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
-  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
-                                              (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
-  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
-      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
-  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
-      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
-  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
-      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
-  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
-      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
-  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
-  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
-      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
-  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
-      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
-  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
-      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
-      (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
-                                                                __m512h __B,
-                                                                __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
-                                                  (__v32hf)__C, (__mmask32)-1,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
-                                                  (__v32hf)__C, (__mmask32)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
-                                                   (__v32hf)__C, (__mmask32)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
-                                                   (__v32hf)__C, (__mmask32)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
-                                                                __m512h __B,
-                                                                __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
-                                                  -(__v32hf)__C, (__mmask32)-1,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
-                                                  -(__v32hf)__C, (__mmask32)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
-      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
-                                                                 __m512h __B,
-                                                                 __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
-                                                  (__v32hf)__C, (__mmask32)-1,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
-                                                   (__v32hf)__C, (__mmask32)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
-                                                   (__v32hf)__C, (__mmask32)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
-                                                                 __m512h __B,
-                                                                 __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
-                                                  -(__v32hf)__C, (__mmask32)-1,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
-      -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
-  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
-  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
-  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
-  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
-  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
-  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
-  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
-      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
-      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
-      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
-  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
-                                                   (__v32hf)__C, (__mmask32)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
-  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
-      (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
-                                                  (__v32hf)__C, (__mmask32)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
-  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
-      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
-  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
-      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
-      (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
-                                                  -(__v32hf)__C, (__mmask32)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
-                                                   (__v32hf)__C, (__mmask32)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
-                                                             __m128h __A,
-                                                             __m128h __B) {
-  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
-                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __B) {
-  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
-                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_sh(A, B, C, R)                                         \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
-  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
-      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
-                                                             __m128h __A,
-                                                             __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
-                                                -(__v8hf)__B, (__mmask8)-1,
-                                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
-                                                -(__v8hf)__B, (__mmask8)__U,
-                                                _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmsub_round_sh(A, B, C, R)                                         \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
-                                                 -(__v8hf)__C, (__mmask8)__U,
-                                                 _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
-  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
-      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
-      (__mmask8)(U), (int)R))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
-  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
-      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
-                                                              __m128h __A,
-                                                              __m128h __B) {
-  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
-                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
-                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmadd_round_sh(A, B, C, R)                                        \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fnmadd_round_sh(W, U, A, B, R)                                \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),       \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                               \
-  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
-      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R)                               \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
-      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
-                                                              __m128h __A,
-                                                              __m128h __B) {
-  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
-                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
-                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
-  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
-      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
-  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
-      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
-                                        (__mmask8)__U,
-                                        _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
-  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
-      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
-                                                               __m128h __B,
-                                                               __m128h __C) {
-  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
-                                                 (__v4sf)__C, (__mmask8)-1,
-                                                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
-      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
-                                                  (__v4sf)__C, (__mmask8)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fcmadd_round_sch(A, B, C, R)                                       \
-  ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
-  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
-  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
-  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
-                                                              __m128h __B,
-                                                              __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
-                                                (__v4sf)__C, (__mmask8)-1,
-                                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
-      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
-                                                 (__v4sf)__C, (__mmask8)__U,
-                                                 _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_sch(A, B, C, R)                                        \
-  ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
-  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
-  ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
-  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
-      (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
-                                                              __m128h __B) {
-  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
-                                                (__v4sf)__W, (__mmask8)__U,
-                                                _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fcmul_round_sch(A, B, R)                                           \
-  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
-      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
-  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
-  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
-      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
-                                                             __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmulcsh_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
-                                               (__v4sf)__W, (__mmask8)__U,
-                                               _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmulcsh_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmul_round_sch(A, B, R)                                            \
-  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
-      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
-  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
-      (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
-  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
-      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
-      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
-                                                                 __m512h __B) {
-  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
-                                                   (__v16sf)__W, (__mmask16)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fcmul_round_pch(A, B, R)                                        \
-  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
-      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
-  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
-      (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
-  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
-      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
-                                                                __m512h __B) {
-  return (__m512h)__builtin_ia32_vfmulcph512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
-                                                  (__v16sf)__W, (__mmask16)__U,
-                                                  _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_vfmulcph512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmul_round_pch(A, B, R)                                         \
-  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
-      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
-  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
-      (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
-  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
-      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
-                                                                  __m512h __B,
-                                                                  __m512h __C) {
-  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
-  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
-  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)(U), (int)(R)))
-
-#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
-  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
-  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
-                                                                 __m512h __B,
-                                                                 __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
-                                                    (__v16sf)__C, (__mmask16)-1,
-                                                    _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
-                                                   (__v16sf)__C, (__mmask16)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
-  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
-  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)(U), (int)(R)))
-
-#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
-  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
-  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
-      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
-      (__mmask16)(U), (int)(R)))
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_add_ph(__m512h __W) {
-  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_mul_ph(__m512h __W) {
-  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_ph(__m512h __V) {
-  return __builtin_ia32_reduce_fmax_ph512(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_ph(__m512h __V) {
-  return __builtin_ia32_reduce_fmin_ph512(__V);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
-  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
-                                              (__v32hf)__A);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
-  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
-                                                 (__v32hi)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
-}
-
-// intrinsics below are alias for f*mul_*ch
-#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
-#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
-#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
-#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
-#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
-  _mm512_mask_fmul_round_pch(W, U, A, B, R)
-#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
-  _mm512_maskz_fmul_round_pch(U, A, B, R)
-
-#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
-#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
-#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
-#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
-#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
-  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
-#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
-  _mm512_maskz_fcmul_round_pch(U, A, B, R)
-
-#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
-#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
-#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
-#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
-#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
-  _mm_mask_fmul_round_sch(W, U, A, B, R)
-#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
-
-#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
-#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
-#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
-#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
-#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
-  _mm_mask_fcmul_round_sch(W, U, A, B, R)
-#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
-  _mm_maskz_fcmul_round_sch(U, A, B, R)
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-#undef __DEFAULT_FN_ATTRS512
-
-#endif
-#endif
diff --git a/third_party/intel/clang/avx512ifmaintrin.h b/third_party/intel/clang/avx512ifmaintrin.h
deleted file mode 100644
index 9468d1755..000000000
--- a/third_party/intel/clang/avx512ifmaintrin.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __IFMAINTRIN_H
-#define __IFMAINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512ifma,evex512"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
-                                                (__v8di) __Z);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
-                                   (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
-                                   (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
-                                                (__v8di) __Z);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
-                                   (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
-                                   (__v8di)_mm512_setzero_si512());
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512ifmavlintrin.h b/third_party/intel/clang/avx512ifmavlintrin.h
deleted file mode 100644
index 8787cd471..000000000
--- a/third_party/intel/clang/avx512ifmavlintrin.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __IFMAVLINTRIN_H
-#define __IFMAVLINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512ifma,avx512vl,no-evex512"),                 \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512ifma,avx512vl,no-evex512"),                 \
-                 __min_vector_width__(256)))
-
-#define _mm_madd52hi_epu64(X, Y, Z)                                            \
-  ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y),            \
-                                          (__v2di)(Z)))
-
-#define _mm256_madd52hi_epu64(X, Y, Z)                                         \
-  ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y),            \
-                                          (__v4di)(Z)))
-
-#define _mm_madd52lo_epu64(X, Y, Z)                                            \
-  ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y),            \
-                                          (__v2di)(Z)))
-
-#define _mm256_madd52lo_epu64(X, Y, Z)                                         \
-  ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y),            \
-                                          (__v4di)(Z)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                      (__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
-                                      (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                      (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
-                                      (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                   (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
-                                   (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                   (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
-                                   (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                      (__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
-                                      (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                      (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
-                                      (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                   (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
-                                   (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                   (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
-                                   (__v4di)_mm256_setzero_si256());
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avx512pfintrin.h b/third_party/intel/clang/avx512pfintrin.h
deleted file mode 100644
index f853be021..000000000
--- a/third_party/intel/clang/avx512pfintrin.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512PFINTRIN_H
-#define __AVX512PFINTRIN_H
-
-#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
-                             (__v16si)(__m512i)(index), (void const *)(addr), \
-                             (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfdps((__mmask16) -1, \
-                             (__v16si)(__m512i)(index), (void const *)(addr), \
-                             (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), \
-                             (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
-  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
-  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
-                             (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
-                              (__v16si)(__m512i)(index), (void *)(addr), \
-                              (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), \
-                              (int)(hint))
-
-#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
-  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
-  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                              (void *)(addr), (int)(scale), (int)(hint))
-
-#endif
diff --git a/third_party/intel/clang/avx512vbmi2intrin.h b/third_party/intel/clang/avx512vbmi2intrin.h
deleted file mode 100644
index 11598c888..000000000
--- a/third_party/intel/clang/avx512vbmi2intrin.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VBMI2INTRIN_H
-#define __AVX512VBMI2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))
-
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
-              (__v32hi) __S,
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
-              (__v32hi) _mm512_setzero_si512(),
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
-              (__v64qi) __S,
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
-              (__v64qi) _mm512_setzero_si512(),
-              __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
-{
-  __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
-              __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
-{
-  __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
-              (__v32hi) __S,
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
-              (__v32hi) _mm512_setzero_si512(),
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
-              (__v64qi) __S,
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
-{
-  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
-              (__v64qi) _mm512_setzero_si512(),
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
-              (__v32hi) __S,
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
-              (__v32hi) _mm512_setzero_si512(),
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
-              (__v64qi) __S,
-              __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
-{
-  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
-              (__v64qi) _mm512_setzero_si512(),
-              __U);
-}
-
-#define _mm512_shldi_epi64(A, B, I) \
-  ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
-                                      (__v8di)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                     (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
-                                     (__v8di)(__m512i)(S)))
-
-#define _mm512_maskz_shldi_epi64(U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                     (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
-                                     (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_shldi_epi32(A, B, I) \
-  ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
-                                      (__v16si)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                    (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
-                                    (__v16si)(__m512i)(S)))
-
-#define _mm512_maskz_shldi_epi32(U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                    (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
-                                    (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_shldi_epi16(A, B, I) \
-  ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
-                                      (__v32hi)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                    (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
-                                    (__v32hi)(__m512i)(S)))
-
-#define _mm512_maskz_shldi_epi16(U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                    (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
-                                    (__v32hi)_mm512_setzero_si512()))
-
-#define _mm512_shrdi_epi64(A, B, I) \
-  ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
-                                      (__v8di)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                     (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
-                                     (__v8di)(__m512i)(S)))
-
-#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                     (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
-                                     (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_shrdi_epi32(A, B, I) \
-  ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
-                                      (__v16si)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                    (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
-                                    (__v16si)(__m512i)(S)))
-
-#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                    (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
-                                    (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_shrdi_epi16(A, B, I) \
-  ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
-                                      (__v32hi)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                    (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
-                                    (__v32hi)(__m512i)(S)))
-
-#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
-                                    (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
-                                    (__v32hi)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
-                                             (__v8di)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                      (__v8di)_mm512_shldv_epi64(__A, __B, __C),
-                                      (__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                      (__v8di)_mm512_shldv_epi64(__A, __B, __C),
-                                      (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                     (__v16si)_mm512_shldv_epi32(__A, __B, __C),
-                                     (__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                     (__v16si)_mm512_shldv_epi32(__A, __B, __C),
-                                     (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
-                                             (__v32hi)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__U,
-                                     (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
-                                     (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__U,
-                                     (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
-                                     (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
-                                             (__v8di)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                      (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
-                                      (__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                      (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
-                                      (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
-{
-  return (__m512i) __builtin_ia32_selectd_512(__U,
-                                     (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
-                                     (__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i) __builtin_ia32_selectd_512(__U,
-                                     (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
-                                     (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
-                                             (__v32hi)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__U,
-                                     (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
-                                     (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
-{
-  return (__m512i)__builtin_ia32_selectw_512(__U,
-                                     (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
-                                     (__v32hi)_mm512_setzero_si512());
-}
-
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
-
diff --git a/third_party/intel/clang/avx512vbmiintrin.h b/third_party/intel/clang/avx512vbmiintrin.h
deleted file mode 100644
index e47cd5cad..000000000
--- a/third_party/intel/clang/avx512vbmiintrin.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VBMIINTRIN_H
-#define __VBMIINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vbmi,evex512"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
-                                                 (__v64qi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
-                              __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512(__U,
-                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
-                               (__v64qi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
-                               __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512(__U,
-                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
-                               (__v64qi)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
-                               __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512(__U,
-                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
-                               (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
-        __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                     (__v64qi)_mm512_permutexvar_epi8(__A, __B),
-                                     (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
-             __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                     (__v64qi)_mm512_permutexvar_epi8(__A, __B),
-                                     (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
-                                  __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
-                                (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
-                                (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
-                                (__v64qi)_mm512_setzero_si512());
-}
-
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512vbmivlintrin.h b/third_party/intel/clang/avx512vbmivlintrin.h
deleted file mode 100644
index 848ca2d18..000000000
--- a/third_party/intel/clang/avx512vbmivlintrin.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VBMIVLINTRIN_H
-#define __VBMIVLINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vbmi,avx512vl,no-evex512"),                 \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vbmi,avx512vl,no-evex512"),                 \
-                 __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
-                                                 (__v16qi)__I,
-                                                 (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
-                           __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128(__U,
-                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
-                                  (__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
-                            __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128(__U,
-                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
-                                  (__v16qi)__I);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
-                            __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128(__U,
-                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
-                                  (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
-                                                 (__v32qi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
-                              __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256(__U,
-                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
-                               (__v32qi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
-                               __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256(__U,
-                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
-                               (__v32qi)__I);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
-                               __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256(__U,
-                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
-                               (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                        (__v16qi)_mm_permutexvar_epi8(__A, __B),
-                                        (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
-          __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                        (__v16qi)_mm_permutexvar_epi8(__A, __B),
-                                        (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
-        __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
-                                     (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
-             __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
-                                     (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
-                               __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                   (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
-                                   (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                   (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
-                                   (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
-                                  __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
-                                (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
-                                (__v32qi)_mm256_setzero_si256());
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avx512vlbf16intrin.h b/third_party/intel/clang/avx512vlbf16intrin.h
deleted file mode 100644
index 89c9f49c7..000000000
--- a/third_party/intel/clang/avx512vlbf16intrin.h
+++ /dev/null
@@ -1,517 +0,0 @@
-/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifdef __SSE2__
-
-#ifndef __AVX512VLBF16INTRIN_H
-#define __AVX512VLBF16INTRIN_H
-
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512bf16,no-evex512"),                 \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512bf16,no-evex512"),                 \
-                 __min_vector_width__(256)))
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float].
-/// \param __B
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-///    conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
-                                                    (__v4sf) __B);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float].
-/// \param __B
-///    A 128-bit vector of [4 x float].
-/// \param __W
-///    A 128-bit vector of [8 x bfloat].
-/// \param __U
-///    A 8-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A or __B. A 0 means element from __W.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-///    conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
-                                             (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
-                                             (__v8bf)__W);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float].
-/// \param __B
-///    A 128-bit vector of [4 x float].
-/// \param __U
-///    A 8-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A or __B. A 0 means element is zero.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-///    conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
-                                             (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
-                                             (__v8bf)_mm_setzero_si128());
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float].
-/// \param __B
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
-///    conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
-_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
-                                                    (__v8sf) __B);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float].
-/// \param __B
-///    A 256-bit vector of [8 x float].
-/// \param __W
-///    A 256-bit vector of [16 x bfloat].
-/// \param __U
-///    A 16-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A or __B. A 0 means element from __W.
-/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
-///    conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
-                                         (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
-                                         (__v16bf)__W);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float].
-/// \param __B
-///    A 256-bit vector of [8 x float].
-/// \param __U
-///    A 16-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A or __B. A 0 means element is zero.
-/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
-///    conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
-                                         (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
-                                         (__v16bf)_mm256_setzero_si256());
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-///    conversion of __A, and higher 64 bits are 0.
-#define _mm_cvtneps_pbh(A)                                                     \
-  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float].
-/// \param __W
-///    A 128-bit vector of [8 x bfloat].
-/// \param __U
-///    A 4-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A. A 0 means element from __W.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-///    conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
-                                                        (__v8bf)__W,
-                                                        (__mmask8)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float].
-/// \param __U
-///    A 4-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A. A 0 means element is zero.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-///    conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
-                                                    (__v8bf)_mm_setzero_si128(),
-                                                    (__mmask8)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float].
-/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
-#define _mm256_cvtneps_pbh(A)                                                  \
-  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float].
-/// \param __W
-///    A 256-bit vector of [8 x bfloat].
-/// \param __U
-///    A 8-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A. A 0 means element from __W.
-/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
-                                                        (__v8bf)__W,
-                                                        (__mmask8)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float].
-/// \param __U
-///    A 8-bit mask value specifying what is chosen for each element.
-///    A 1 means conversion of __A. A 0 means element is zero.
-/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
-                                                    (__v8bf)_mm_setzero_si128(),
-                                                    (__mmask8)__U);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [8 x bfloat].
-/// \param __B
-///    A 128-bit vector of [8 x bfloat].
-/// \param __D
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
-  return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
-                                             (__v8bf)__A,
-                                             (__v8bf)__B);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [8 x bfloat].
-/// \param __B
-///    A 128-bit vector of [8 x bfloat].
-/// \param __D
-///    A 128-bit vector of [4 x float].
-/// \param __U
-///    A 8-bit mask value specifying what is chosen for each element.
-///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
-/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                           (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
-                                           (__v4sf)__D);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 128-bit vector of [8 x bfloat].
-/// \param __B
-///    A 128-bit vector of [8 x bfloat].
-/// \param __D
-///    A 128-bit vector of [4 x float].
-/// \param __U
-///    A 8-bit mask value specifying what is chosen for each element.
-///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
-/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                           (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
-                                           (__v4sf)_mm_setzero_si128());
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [16 x bfloat].
-/// \param __B
-///    A 256-bit vector of [16 x bfloat].
-/// \param __D
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
-  return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
-                                             (__v16bf)__A,
-                                             (__v16bf)__B);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [16 x bfloat].
-/// \param __B
-///    A 256-bit vector of [16 x bfloat].
-/// \param __D
-///    A 256-bit vector of [8 x float].
-/// \param __U
-///    A 16-bit mask value specifying what is chosen for each element.
-///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
-/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                        (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
-                                        (__v8sf)__D);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-///    A 256-bit vector of [16 x bfloat].
-/// \param __B
-///    A 256-bit vector of [16 x bfloat].
-/// \param __D
-///    A 256-bit vector of [8 x float].
-/// \param __U
-///    A 8-bit mask value specifying what is chosen for each element.
-///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
-/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
-///  __A, __B and __D
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                        (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
-                                        (__v8sf)_mm256_setzero_si256());
-}
-
-/// Convert One Single float Data to One BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-///    A float data.
-/// \returns A bf16 data whose sign field and exponent field keep unchanged,
-///    and fraction field is truncated to 7 bits.
-static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
-  __v4sf __V = {__A, 0, 0, 0};
-  __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
-      (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
-  return (__bf16)__R[0];
-}
-
-/// Convert Packed BF16 Data to Packed float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __A
-///    A 128-bit vector of [4 x bfloat].
-/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
-  return _mm_castsi128_ps(
-      (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __A
-///    A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
-  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
-      (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __U
-///    A 4-bit mask. Elements are zeroed out when the corresponding mask
-///    bit is not set.
-/// \param __A
-///    A 128-bit vector of [4 x bfloat].
-/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
-  return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
-      (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __U
-///    A 8-bit mask. Elements are zeroed out when the corresponding mask
-///    bit is not set.
-/// \param __A
-///    A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
-  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
-      (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using merging mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __S
-///    A 128-bit vector of [4 x float]. Elements are copied from __S when
-///     the corresponding mask bit is not set.
-/// \param __U
-///    A 4-bit mask. Elements are zeroed out when the corresponding mask
-///    bit is not set.
-/// \param __A
-///    A 128-bit vector of [4 x bfloat].
-/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
-  return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
-      (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
-      16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using merging mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __S
-///    A 256-bit vector of [8 x float]. Elements are copied from __S when
-///     the corresponding mask bit is not set.
-/// \param __U
-///    A 8-bit mask. Elements are zeroed out when the corresponding mask
-///    bit is not set.
-/// \param __A
-///    A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
-  return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
-      (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
-      16));
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
-#endif
diff --git a/third_party/intel/clang/avx512vlbitalgintrin.h b/third_party/intel/clang/avx512vlbitalgintrin.h
deleted file mode 100644
index 377e3a5ea..000000000
--- a/third_party/intel/clang/avx512vlbitalgintrin.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLBITALGINTRIN_H
-#define __AVX512VLBITALGINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512bitalg,no-evex512"),               \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512bitalg,no-evex512"),               \
-                 __min_vector_width__(256)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi16(__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
-              (__v16hi) _mm256_popcnt_epi16(__B),
-              (__v16hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
-{
-  return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
-              __U,
-              __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi16(__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
-              (__v8hi) _mm_popcnt_epi16(__B),
-              (__v8hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
-{
-  return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
-              __U,
-              __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi8(__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
-              (__v32qi) _mm256_popcnt_epi8(__B),
-              (__v32qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
-{
-  return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
-              __U,
-              __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi8(__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
-              (__v16qi) _mm_popcnt_epi8(__B),
-              (__v16qi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
-{
-  return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
-              __U,
-              __B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
-              (__v32qi) __B,
-              __U);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
-{
-  return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
-              __A,
-              __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
-              (__v16qi) __B,
-              __U);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
-{
-  return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
-              __A,
-              __B);
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avx512vlbwintrin.h b/third_party/intel/clang/avx512vlbwintrin.h
deleted file mode 100644
index 9aedba066..000000000
--- a/third_party/intel/clang/avx512vlbwintrin.h
+++ /dev/null
@@ -1,3167 +0,0 @@
-/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLBWINTRIN_H
-#define __AVX512VLBWINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512bw,no-evex512"),                   \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512bw,no-evex512"),                   \
-                 __min_vector_width__(256)))
-
-/* Integer compare */
-
-#define _mm_cmp_epi8_mask(a, b, p) \
-  ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
-                                          (__v16qi)(__m128i)(b), (int)(p), \
-                                          (__mmask16)-1))
-
-#define _mm_mask_cmp_epi8_mask(m, a, b, p) \
-  ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
-                                          (__v16qi)(__m128i)(b), (int)(p), \
-                                          (__mmask16)(m)))
-
-#define _mm_cmp_epu8_mask(a, b, p) \
-  ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
-                                           (__v16qi)(__m128i)(b), (int)(p), \
-                                           (__mmask16)-1))
-
-#define _mm_mask_cmp_epu8_mask(m, a, b, p) \
-  ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
-                                           (__v16qi)(__m128i)(b), (int)(p), \
-                                           (__mmask16)(m)))
-
-#define _mm256_cmp_epi8_mask(a, b, p) \
-  ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
-                                          (__v32qi)(__m256i)(b), (int)(p), \
-                                          (__mmask32)-1))
-
-#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
-  ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
-                                          (__v32qi)(__m256i)(b), (int)(p), \
-                                          (__mmask32)(m)))
-
-#define _mm256_cmp_epu8_mask(a, b, p) \
-  ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
-                                           (__v32qi)(__m256i)(b), (int)(p), \
-                                           (__mmask32)-1))
-
-#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
-  ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
-                                           (__v32qi)(__m256i)(b), (int)(p), \
-                                           (__mmask32)(m)))
-
-#define _mm_cmp_epi16_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
-                                         (__v8hi)(__m128i)(b), (int)(p), \
-                                         (__mmask8)-1))
-
-#define _mm_mask_cmp_epi16_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
-                                         (__v8hi)(__m128i)(b), (int)(p), \
-                                         (__mmask8)(m)))
-
-#define _mm_cmp_epu16_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
-                                          (__v8hi)(__m128i)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm_mask_cmp_epu16_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
-                                          (__v8hi)(__m128i)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm256_cmp_epi16_mask(a, b, p) \
-  ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
-                                          (__v16hi)(__m256i)(b), (int)(p), \
-                                          (__mmask16)-1))
-
-#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
-  ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
-                                          (__v16hi)(__m256i)(b), (int)(p), \
-                                          (__mmask16)(m)))
-
-#define _mm256_cmp_epu16_mask(a, b, p) \
-  ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
-                                           (__v16hi)(__m256i)(b), (int)(p), \
-                                           (__mmask16)-1))
-
-#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
-  ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
-                                           (__v16hi)(__m256i)(b), (int)(p), \
-                                           (__mmask16)(m)))
-
-#define _mm_cmpeq_epi8_mask(A, B) \
-    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi8_mask(k, A, B) \
-    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi8_mask(A, B) \
-    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi8_mask(k, A, B) \
-    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi8_mask(A, B) \
-    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi8_mask(k, A, B) \
-    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi8_mask(A, B) \
-    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi8_mask(k, A, B) \
-    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi8_mask(A, B) \
-    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi8_mask(k, A, B) \
-    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi8_mask(A, B) \
-    _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi8_mask(k, A, B) \
-    _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi8_mask(A, B) \
-    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \
-    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi8_mask(A, B) \
-    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi8_mask(k, A, B) \
-    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi8_mask(A, B) \
-    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \
-    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi8_mask(A, B) \
-    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi8_mask(k, A, B) \
-    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi8_mask(A, B) \
-    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi8_mask(k, A, B) \
-    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi8_mask(A, B) \
-    _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \
-    _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu8_mask(A, B) \
-    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu8_mask(k, A, B) \
-    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu8_mask(A, B) \
-    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu8_mask(k, A, B) \
-    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu8_mask(A, B) \
-    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu8_mask(k, A, B) \
-    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu8_mask(A, B) \
-    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu8_mask(k, A, B) \
-    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu8_mask(A, B) \
-    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu8_mask(k, A, B) \
-    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu8_mask(A, B) \
-    _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu8_mask(k, A, B) \
-    _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu8_mask(A, B) \
-    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \
-    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu8_mask(A, B) \
-    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu8_mask(k, A, B) \
-    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu8_mask(A, B) \
-    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \
-    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu8_mask(A, B) \
-    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu8_mask(k, A, B) \
-    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu8_mask(A, B) \
-    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu8_mask(k, A, B) \
-    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu8_mask(A, B) \
-    _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu8_mask(k, A, B) \
-    _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epi16_mask(A, B) \
-    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi16_mask(k, A, B) \
-    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi16_mask(A, B) \
-    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi16_mask(k, A, B) \
-    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi16_mask(A, B) \
-    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi16_mask(k, A, B) \
-    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi16_mask(A, B) \
-    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi16_mask(k, A, B) \
-    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi16_mask(A, B) \
-    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi16_mask(k, A, B) \
-    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi16_mask(A, B) \
-    _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi16_mask(k, A, B) \
-    _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi16_mask(A, B) \
-    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \
-    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi16_mask(A, B) \
-    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi16_mask(k, A, B) \
-    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi16_mask(A, B) \
-    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \
-    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi16_mask(A, B) \
-    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi16_mask(k, A, B) \
-    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi16_mask(A, B) \
-    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi16_mask(k, A, B) \
-    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi16_mask(A, B) \
-    _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \
-    _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu16_mask(A, B) \
-    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu16_mask(k, A, B) \
-    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu16_mask(A, B) \
-    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu16_mask(k, A, B) \
-    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu16_mask(A, B) \
-    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu16_mask(k, A, B) \
-    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu16_mask(A, B) \
-    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu16_mask(k, A, B) \
-    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu16_mask(A, B) \
-    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu16_mask(k, A, B) \
-    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu16_mask(A, B) \
-    _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu16_mask(k, A, B) \
-    _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu16_mask(A, B) \
-    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \
-    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu16_mask(A, B) \
-    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu16_mask(k, A, B) \
-    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu16_mask(A, B) \
-    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \
-    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu16_mask(A, B) \
-    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu16_mask(k, A, B) \
-    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu16_mask(A, B) \
-    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu16_mask(k, A, B) \
-    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu16_mask(A, B) \
-    _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \
-    _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_add_epi8(__A, __B),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_add_epi8(__A, __B),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_add_epi16(__A, __B),
-                                             (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_add_epi16(__A, __B),
-                                             (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_sub_epi8(__A, __B),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_sub_epi8(__A, __B),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_sub_epi16(__A, __B),
-                                             (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_sub_epi16(__A, __B),
-                                             (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_add_epi8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_add_epi8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_add_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_add_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_sub_epi8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_sub_epi8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sub_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sub_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
-                                             (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
-                                             (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mullo_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mullo_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
-{
-  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
-              (__v16qi) __W,
-              (__v16qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
-{
-  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
-               (__v32qi) __W,
-               (__v32qi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
-{
-  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
-               (__v8hi) __W,
-               (__v8hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
-{
-  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
-               (__v16hi) __W,
-               (__v16hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_abs_epi8(__A),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_abs_epi8(__A),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_abs_epi8(__A),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_abs_epi8(__A),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_abs_epi16(__A),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_abs_epi16(__A),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_abs_epi16(__A),
-                                             (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_abs_epi16(__A),
-                                             (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_packs_epi32(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_packs_epi32(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                          (__v16hi)_mm256_packs_epi32(__A, __B),
-                                          (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                          (__v16hi)_mm256_packs_epi32(__A, __B),
-                                          (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_packs_epi16(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_packs_epi16(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                          (__v32qi)_mm256_packs_epi16(__A, __B),
-                                          (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                          (__v32qi)_mm256_packs_epi16(__A, __B),
-                                          (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_packus_epi32(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_packus_epi32(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                         (__v16hi)_mm256_packus_epi32(__A, __B),
-                                         (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                         (__v16hi)_mm256_packus_epi32(__A, __B),
-                                         (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                            (__v16qi)_mm_packus_epi16(__A, __B),
-                                            (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                            (__v16qi)_mm_packus_epi16(__A, __B),
-                                            (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                         (__v32qi)_mm256_packus_epi16(__A, __B),
-                                         (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                         (__v32qi)_mm256_packus_epi16(__A, __B),
-                                         (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_adds_epi8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_adds_epi8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_adds_epi8(__A, __B),
-                                            (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_adds_epi8(__A, __B),
-                                            (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_adds_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_adds_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_adds_epi16(__A, __B),
-                                           (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_adds_epi16(__A, __B),
-                                           (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_adds_epu8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_adds_epu8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_adds_epu8(__A, __B),
-                                            (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_adds_epu8(__A, __B),
-                                            (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_adds_epu16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_adds_epu16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_adds_epu16(__A, __B),
-                                           (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_adds_epu16(__A, __B),
-                                           (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_avg_epu8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_avg_epu8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_avg_epu8(__A, __B),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                             (__v32qi)_mm256_avg_epu8(__A, __B),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_avg_epu16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_avg_epu16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                            (__v16hi)_mm256_avg_epu16(__A, __B),
-                                            (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                            (__v16hi)_mm256_avg_epu16(__A, __B),
-                                            (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_max_epi8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_max_epi8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_max_epi8(__A, __B),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_max_epi8(__A, __B),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_max_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_max_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_max_epi16(__A, __B),
-                                            (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_max_epi16(__A, __B),
-                                            (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_max_epu8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_max_epu8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_max_epu8(__A, __B),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_max_epu8(__A, __B),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_max_epu16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_max_epu16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_max_epu16(__A, __B),
-                                            (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_max_epu16(__A, __B),
-                                            (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_min_epi8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_min_epi8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_min_epi8(__A, __B),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_min_epi8(__A, __B),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_min_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_min_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_min_epi16(__A, __B),
-                                            (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_min_epi16(__A, __B),
-                                            (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_min_epu8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm_min_epu8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_min_epu8(__A, __B),
-                                             (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
-                                             (__v32qi)_mm256_min_epu8(__A, __B),
-                                             (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_min_epu16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                             (__v8hi)_mm_min_epu16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_min_epu16(__A, __B),
-                                            (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                            (__v16hi)_mm256_min_epu16(__A, __B),
-                                            (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
-                                            (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
-                                            (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
-                                         (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
-                                         (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_subs_epi8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_subs_epi8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_subs_epi8(__A, __B),
-                                            (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_subs_epi8(__A, __B),
-                                            (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_subs_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_subs_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_subs_epi16(__A, __B),
-                                           (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_subs_epi16(__A, __B),
-                                           (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_subs_epu8(__A, __B),
-                                             (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                             (__v16qi)_mm_subs_epu8(__A, __B),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_subs_epu8(__A, __B),
-                                            (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                            (__v32qi)_mm256_subs_epu8(__A, __B),
-                                            (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_subs_epu16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_subs_epu16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
-      __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_subs_epu16(__A, __B),
-                                           (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_subs_epu16(__A, __B),
-                                           (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
-                                                 (__v8hi) __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
-                            __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__U,
-                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
-                                  (__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
-                             __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__U,
-                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
-                                  (__v8hi)__I);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
-            __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__U,
-                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
-                                  (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
-                                                 (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
-                               __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__U,
-                              (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
-                              (__v16hi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
-                                __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__U,
-                              (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
-                              (__v16hi)__I);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
-                                 __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__U,
-                              (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
-                              (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
-                                            (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
-                                            (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
-                          __m256i __Y) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
-                                        (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
-                                        (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_madd_epi16(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_madd_epi16(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_madd_epi16(__A, __B),
-                                            (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_madd_epi16(__A, __B),
-                                            (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi16_epi8 (__m128i __A) {
-  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
-               (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
-  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi16_epi8 (__m256i __A) {
-  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
-  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
-               (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
-  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi16_epi8 (__m128i __A) {
-  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
-                (__v16qi) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
-  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
-                (__v16qi) _mm_setzero_si128(),
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi16_epi8 (__m256i __A) {
-  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
-                (__v16qi) _mm_setzero_si128(),
-                (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
-  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
-  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
-                (__v16qi) _mm_setzero_si128(),
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi16_epi8 (__m128i __A) {
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v8hi)__A, __v8qi),
-      (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-      12, 13, 14, 15);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
-               (__v16qi) __O,
-               __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
-  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
-               (__v16qi) _mm_setzero_si128(),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
-}
-
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_epi8 (__m256i __A) {
-  return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm256_cvtepi16_epi8(__A),
-                                             (__v16qi)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
-                                             (__v16qi)_mm256_cvtepi16_epi8(__A),
-                                             (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
-{
-  __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
-{
-  __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
-{
-  __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
-                                         (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
-                                         (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
-                                          (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
-                                          (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
-                                          (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
-                                          (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
-                                           (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
-                                           (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
-                                        (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
-                                        (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
-                                           (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
-                                           (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
-                                       (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
-                                       (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
-                                           (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
-                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
-                                           (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
-                                        (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
-                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
-                                        (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
-                                           (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
-                                           (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
-                                       (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
-                                       (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepi8_epi16(__A),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepi8_epi16(__A),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
-                                             (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
-                                             (__v16hi)_mm256_setzero_si256());
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepu8_epi16(__A),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepu8_epi16(__A),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
-                                             (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
-                                             (__v16hi)_mm256_setzero_si256());
-}
-
-
-#define _mm_mask_shufflehi_epi16(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-                                       (__v8hi)(__m128i)(W)))
-
-#define _mm_maskz_shufflehi_epi16(U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-                                       (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-                                       (__v16hi)(__m256i)(W)))
-
-#define _mm256_maskz_shufflehi_epi16(U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-                                       (__v16hi)_mm256_setzero_si256()))
-
-#define _mm_mask_shufflelo_epi16(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-                                       (__v8hi)(__m128i)(W)))
-
-#define _mm_maskz_shufflelo_epi16(U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-                                       (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflelo_epi16((A), \
-                                                                       (imm)), \
-                                       (__v16hi)(__m256i)(W)))
-
-#define _mm256_maskz_shufflelo_epi16(U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflelo_epi16((A), \
-                                                                       (imm)), \
-                                       (__v16hi)_mm256_setzero_si256()))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sllv_epi16(__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
-                                           (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
-                                           (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sllv_epi16(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sllv_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sllv_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sll_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sll_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_sll_epi16(__A, __B),
-                                          (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_sll_epi16(__A, __B),
-                                          (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_slli_epi16(__A, (int)__B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_slli_epi16(__A, (int)__B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
-                       unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_slli_epi16(__A, (int)__B),
-                                         (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_slli_epi16(__A, (int)__B),
-                                         (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srlv_epi16(__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
-                                           (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
-                                           (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srlv_epi16(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srlv_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srlv_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srav_epi16(__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_srav_epi16(__A, __B),
-                                           (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                           (__v16hi)_mm256_srav_epi16(__A, __B),
-                                           (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srav_epi16(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srav_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srav_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sra_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sra_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_sra_epi16(__A, __B),
-                                          (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_sra_epi16(__A, __B),
-                                          (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srai_epi16(__A, (int)__B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srai_epi16(__A, (int)__B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
-                       unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_srai_epi16(__A, (int)__B),
-                                         (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_srai_epi16(__A, (int)__B),
-                                         (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srl_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srl_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_srl_epi16(__A, __B),
-                                          (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_srl_epi16(__A, __B),
-                                          (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srli_epi16(__A, __B),
-                                             (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_srli_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_srli_epi16(__A, __B),
-                                         (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_srli_epi16(__A, __B),
-                                         (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
-                (__v8hi) __A,
-                (__v8hi) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
-                (__v8hi) __A,
-                (__v8hi) _mm_setzero_si128 ());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
-                (__v16hi) __A,
-                (__v16hi) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
-                (__v16hi) __A,
-                (__v16hi) _mm256_setzero_si256 ());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
-                (__v16qi) __A,
-                (__v16qi) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
-                (__v16qi) __A,
-                (__v16qi) _mm_setzero_si128 ());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
-                (__v32qi) __A,
-                (__v32qi) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
-                (__v32qi) __A,
-                (__v32qi) _mm256_setzero_si256 ());
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
-{
-  return (__m128i) __builtin_ia32_selectb_128(__M,
-                                              (__v16qi) _mm_set1_epi8(__A),
-                                              (__v16qi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
-{
- return (__m128i) __builtin_ia32_selectb_128(__M,
-                                             (__v16qi) _mm_set1_epi8(__A),
-                                             (__v16qi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
-{
-  return (__m256i) __builtin_ia32_selectb_256(__M,
-                                              (__v32qi) _mm256_set1_epi8(__A),
-                                              (__v32qi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
-{
-  return (__m256i) __builtin_ia32_selectb_256(__M,
-                                              (__v32qi) _mm256_set1_epi8(__A),
-                                              (__v32qi) _mm256_setzero_si256());
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi16 (void const *__P)
-{
-  struct __loadu_epi16 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi16*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
-                 (__v8hi) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
-                 (__v8hi)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi16 (void const *__P)
-{
-  struct __loadu_epi16 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi16*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
-                 (__v16hi) __W,
-                 (__mmask16) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
-                 (__v16hi)
-                 _mm256_setzero_si256 (),
-                 (__mmask16) __U);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi8 (void const *__P)
-{
-  struct __loadu_epi8 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi8*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
-                 (__v16qi) __W,
-                 (__mmask16) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
-                 (__v16qi)
-                 _mm_setzero_si128 (),
-                 (__mmask16) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi8 (void const *__P)
-{
-  struct __loadu_epi8 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi8*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
-                 (__v32qi) __W,
-                 (__mmask32) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
-                 (__v32qi)
-                 _mm256_setzero_si256 (),
-                 (__mmask32) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi16 (void *__P, __m128i __A)
-{
-  struct __storeu_epi16 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi16*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
-{
-  __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
-             (__v8hi) __A,
-             (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi16 (void *__P, __m256i __A)
-{
-  struct __storeu_epi16 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi16*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
-{
-  __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
-             (__v16hi) __A,
-             (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi8 (void *__P, __m128i __A)
-{
-  struct __storeu_epi8 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi8*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
-{
-  __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
-             (__v16qi) __A,
-             (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi8 (void *__P, __m256i __A)
-{
-  struct __storeu_epi8 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi8*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
-{
-  __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
-             (__v32qi) __A,
-             (__mmask32) __U);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_test_epi8_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B),
-                                    _mm_setzero_si128());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_test_epi8_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B),
-                                  _mm256_setzero_si256());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B),
-                                       _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_test_epi16_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B),
-                                     _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_test_epi16_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B),
-                                   _mm256_setzero_si256 ());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B),
-                                        _mm256_setzero_si256());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_testn_epi8_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B),
-                                  _mm_setzero_si128());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B),
-                                 _mm256_setzero_si256());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B),
-                                      _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_testn_epi16_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B),
-                                  _mm256_setzero_si256());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B),
-                                       _mm256_setzero_si256());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_movepi8_mask (__m128i __A)
-{
-  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_movepi8_mask (__m256i __A)
-{
-  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_movepi16_mask (__m128i __A)
-{
-  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_movepi16_mask (__m256i __A)
-{
-  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi8 (__mmask16 __A)
-{
-  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi8 (__mmask32 __A)
-{
-  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi16 (__mmask8 __A)
-{
-  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi16 (__mmask16 __A)
-{
-  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectb_128(__M,
-                                             (__v16qi) _mm_broadcastb_epi8(__A),
-                                             (__v16qi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectb_128(__M,
-                                             (__v16qi) _mm_broadcastb_epi8(__A),
-                                             (__v16qi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectb_256(__M,
-                                             (__v32qi) _mm256_broadcastb_epi8(__A),
-                                             (__v32qi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectb_256(__M,
-                                             (__v32qi) _mm256_broadcastb_epi8(__A),
-                                             (__v32qi) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__M,
-                                             (__v8hi) _mm_broadcastw_epi16(__A),
-                                             (__v8hi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__M,
-                                             (__v8hi) _mm_broadcastw_epi16(__A),
-                                             (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__M,
-                                             (__v16hi) _mm256_broadcastw_epi16(__A),
-                                             (__v16hi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__M,
-                                             (__v16hi) _mm256_broadcastw_epi16(__A),
-                                             (__v16hi) _mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
-{
-  return (__m256i) __builtin_ia32_selectw_256 (__M,
-                                               (__v16hi) _mm256_set1_epi16(__A),
-                                               (__v16hi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
-{
-  return (__m256i) __builtin_ia32_selectw_256(__M,
-                                              (__v16hi)_mm256_set1_epi16(__A),
-                                              (__v16hi) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
-{
-  return (__m128i) __builtin_ia32_selectw_128(__M,
-                                              (__v8hi) _mm_set1_epi16(__A),
-                                              (__v8hi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
-{
-  return (__m128i) __builtin_ia32_selectw_128(__M,
-                                              (__v8hi) _mm_set1_epi16(__A),
-                                              (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                        (__v8hi)_mm_permutexvar_epi16(__A, __B),
-                                        (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
-          __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
-                                        (__v8hi)_mm_permutexvar_epi16(__A, __B),
-                                        (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
-        __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                    (__v16hi)_mm256_permutexvar_epi16(__A, __B),
-                                    (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
-             __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
-                                    (__v16hi)_mm256_permutexvar_epi16(__A, __B),
-                                    (__v16hi)__W);
-}
-
-#define _mm_mask_alignr_epi8(W, U, A, B, N) \
-  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
-                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
-                                 (__v16qi)(__m128i)(W)))
-
-#define _mm_maskz_alignr_epi8(U, A, B, N) \
-  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
-                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
-                                 (__v16qi)_mm_setzero_si128()))
-
-#define _mm256_mask_alignr_epi8(W, U, A, B, N) \
-  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
-                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
-                              (__v32qi)(__m256i)(W)))
-
-#define _mm256_maskz_alignr_epi8(U, A, B, N) \
-  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
-                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
-                              (__v32qi)_mm256_setzero_si256()))
-
-#define _mm_dbsad_epu8(A, B, imm) \
-  ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
-                                       (__v16qi)(__m128i)(B), (int)(imm)))
-
-#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                      (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
-                                      (__v8hi)(__m128i)(W)))
-
-#define _mm_maskz_dbsad_epu8(U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                      (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
-                                      (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_dbsad_epu8(A, B, imm) \
-  ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
-                                       (__v32qi)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                  (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
-                                  (__v16hi)(__m256i)(W)))
-
-#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                  (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
-                                  (__v16hi)_mm256_setzero_si256()))
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_reduce_add_epi16(__m128i __W) {
-  return __builtin_reduce_add((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_reduce_mul_epi16(__m128i __W) {
-  return __builtin_reduce_mul((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_reduce_and_epi16(__m128i __W) {
-  return __builtin_reduce_and((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_reduce_or_epi16(__m128i __W) {
-  return __builtin_reduce_or((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_add_epi16( __mmask8 __M, __m128i __W) {
-  __W = _mm_maskz_mov_epi16(__M, __W);
-  return __builtin_reduce_add((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_mul_epi16( __mmask8 __M, __m128i __W) {
-  __W = _mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W);
-  return __builtin_reduce_mul((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_and_epi16( __mmask8 __M, __m128i __W) {
-  __W = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W);
-  return __builtin_reduce_and((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) {
-  __W = _mm_maskz_mov_epi16(__M, __W);
-  return __builtin_reduce_or((__v8hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_reduce_max_epi16(__m128i __V) {
-  return __builtin_reduce_max((__v8hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS128
-_mm_reduce_max_epu16(__m128i __V) {
-  return __builtin_reduce_max((__v8hu)__V);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_reduce_min_epi16(__m128i __V) {
-  return __builtin_reduce_min((__v8hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS128
-_mm_reduce_min_epu16(__m128i __V) {
-  return __builtin_reduce_min((__v8hu)__V);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_max_epi16(__mmask16 __M, __m128i __V) {
-  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V);
-  return __builtin_reduce_max((__v8hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_max_epu16(__mmask16 __M, __m128i __V) {
-  __V = _mm_maskz_mov_epi16(__M, __V);
-  return __builtin_reduce_max((__v8hu)__V);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_min_epi16(__mmask16 __M, __m128i __V) {
-  __V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V);
-  return __builtin_reduce_min((__v8hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_min_epu16(__mmask16 __M, __m128i __V) {
-  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V);
-  return __builtin_reduce_min((__v8hu)__V);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_reduce_add_epi16(__m256i __W) {
-  return __builtin_reduce_add((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_reduce_mul_epi16(__m256i __W) {
-  return __builtin_reduce_mul((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_reduce_and_epi16(__m256i __W) {
-  return __builtin_reduce_and((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_reduce_or_epi16(__m256i __W) {
-  return __builtin_reduce_or((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_add_epi16( __mmask16 __M, __m256i __W) {
-  __W = _mm256_maskz_mov_epi16(__M, __W);
-  return __builtin_reduce_add((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_mul_epi16( __mmask16 __M, __m256i __W) {
-  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W);
-  return __builtin_reduce_mul((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_and_epi16( __mmask16 __M, __m256i __W) {
-  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W);
-  return __builtin_reduce_and((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) {
-  __W = _mm256_maskz_mov_epi16(__M, __W);
-  return __builtin_reduce_or((__v16hi)__W);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_reduce_max_epi16(__m256i __V) {
-  return __builtin_reduce_max((__v16hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS256
-_mm256_reduce_max_epu16(__m256i __V) {
-  return __builtin_reduce_max((__v16hu)__V);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_reduce_min_epi16(__m256i __V) {
-  return __builtin_reduce_min((__v16hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS256
-_mm256_reduce_min_epu16(__m256i __V) {
-  return __builtin_reduce_min((__v16hu)__V);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) {
-  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V);
-  return __builtin_reduce_max((__v16hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) {
-  __V = _mm256_maskz_mov_epi16(__M, __V);
-  return __builtin_reduce_max((__v16hu)__V);
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) {
-  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V);
-  return __builtin_reduce_min((__v16hi)__V);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) {
-  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V);
-  return __builtin_reduce_min((__v16hu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_reduce_add_epi8(__m128i __W) {
-  return __builtin_reduce_add((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_reduce_mul_epi8(__m128i __W) {
-  return __builtin_reduce_mul((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_reduce_and_epi8(__m128i __W) {
-  return __builtin_reduce_and((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_reduce_or_epi8(__m128i __W) {
-  return __builtin_reduce_or((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W) {
-  __W = _mm_maskz_mov_epi8(__M, __W);
-  return __builtin_reduce_add((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W) {
-  __W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W);
-  return __builtin_reduce_mul((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W) {
-  __W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W);
-  return __builtin_reduce_and((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) {
-  __W = _mm_maskz_mov_epi8(__M, __W);
-  return __builtin_reduce_or((__v16qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_reduce_max_epi8(__m128i __V) {
-  return __builtin_reduce_max((__v16qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS128
-_mm_reduce_max_epu8(__m128i __V) {
-  return __builtin_reduce_max((__v16qu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_reduce_min_epi8(__m128i __V) {
-  return __builtin_reduce_min((__v16qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS128
-_mm_reduce_min_epu8(__m128i __V) {
-  return __builtin_reduce_min((__v16qu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) {
-  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V);
-  return __builtin_reduce_max((__v16qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) {
-  __V = _mm_maskz_mov_epi8(__M, __V);
-  return __builtin_reduce_max((__v16qu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) {
-  __V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V);
-  return __builtin_reduce_min((__v16qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS128
-_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) {
-  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V);
-  return __builtin_reduce_min((__v16qu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_reduce_add_epi8(__m256i __W) {
-  return __builtin_reduce_add((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_reduce_mul_epi8(__m256i __W) {
-  return __builtin_reduce_mul((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_reduce_and_epi8(__m256i __W) {
-  return __builtin_reduce_and((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_reduce_or_epi8(__m256i __W) {
-  return __builtin_reduce_or((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W) {
-  __W = _mm256_maskz_mov_epi8(__M, __W);
-  return __builtin_reduce_add((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W) {
-  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W);
-  return __builtin_reduce_mul((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W) {
-  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W);
-  return __builtin_reduce_and((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) {
-  __W = _mm256_maskz_mov_epi8(__M, __W);
-  return __builtin_reduce_or((__v32qs)__W);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_reduce_max_epi8(__m256i __V) {
-  return __builtin_reduce_max((__v32qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS256
-_mm256_reduce_max_epu8(__m256i __V) {
-  return __builtin_reduce_max((__v32qu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_reduce_min_epi8(__m256i __V) {
-  return __builtin_reduce_min((__v32qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS256
-_mm256_reduce_min_epu8(__m256i __V) {
-  return __builtin_reduce_min((__v32qu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) {
-  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V);
-  return __builtin_reduce_max((__v32qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) {
-  __V = _mm256_maskz_mov_epi8(__M, __V);
-  return __builtin_reduce_max((__v32qu)__V);
-}
-
-static __inline__ signed char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) {
-  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V);
-  return __builtin_reduce_min((__v32qs)__V);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS256
-_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) {
-  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V);
-  return __builtin_reduce_min((__v32qu)__V);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __AVX512VLBWINTRIN_H */
diff --git a/third_party/intel/clang/avx512vlcdintrin.h b/third_party/intel/clang/avx512vlcdintrin.h
deleted file mode 100644
index 923e2c551..000000000
--- a/third_party/intel/clang/avx512vlcdintrin.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLCDINTRIN_H
-#define __AVX512VLCDINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512cd,no-evex512"),                   \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512cd,no-evex512"),                   \
-                 __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastmb_epi64 (__mmask8 __A)
-{
-  return (__m128i) _mm_set1_epi64x((long long) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastmb_epi64 (__mmask8 __A)
-{
-  return (__m256i) _mm256_set1_epi64x((long long)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastmw_epi32 (__mmask16 __A)
-{
-  return (__m128i) _mm_set1_epi32((int)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastmw_epi32 (__mmask16 __A)
-{
-  return (__m256i) _mm256_set1_epi32((int)__A);
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_conflict_epi64 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_conflict_epi64(__A),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_conflict_epi64(__A),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_conflict_epi64 (__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_conflict_epi64(__A),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_conflict_epi64(__A),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_conflict_epi32 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_conflict_epi32(__A),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_conflict_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_conflict_epi32 (__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_conflict_epi32(__A),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_conflict_epi32(__A),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_lzcnt_epi32 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_lzcnt_epi32(__A),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_lzcnt_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_lzcnt_epi32 (__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_lzcnt_epi32(__A),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_lzcnt_epi32(__A),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_lzcnt_epi64 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_lzcnt_epi64(__A),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_lzcnt_epi64(__A),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_lzcnt_epi64 (__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_lzcnt_epi64(__A),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_lzcnt_epi64(__A),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __AVX512VLCDINTRIN_H */
diff --git a/third_party/intel/clang/avx512vldqintrin.h b/third_party/intel/clang/avx512vldqintrin.h
deleted file mode 100644
index 272cdd89e..000000000
--- a/third_party/intel/clang/avx512vldqintrin.h
+++ /dev/null
@@ -1,1173 +0,0 @@
-/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLDQINTRIN_H
-#define __AVX512VLDQINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512dq,no-evex512"),                   \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512dq,no-evex512"),                   \
-                 __min_vector_width__(256)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
-  return (__m256i) ((__v4du) __A * (__v4du) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_mullo_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_mullo_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mullo_epi64 (__m128i __A, __m128i __B) {
-  return (__m128i) ((__v2du) __A * (__v2du) __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_mullo_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_mullo_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_andnot_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_andnot_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_andnot_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_andnot_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_andnot_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_andnot_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_andnot_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_andnot_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_and_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_and_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_and_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_and_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_and_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_and_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_and_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_and_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_xor_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_xor_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_xor_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_xor_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_xor_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_xor_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_xor_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_xor_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_or_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_or_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_or_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_or_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_or_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_or_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_or_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_or_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtpd_epi64 (__m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtpd_epi64 (__m256d __A) {
-  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtpd_epu64 (__m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtpd_epu64 (__m256d __A) {
-  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtps_epi64 (__m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtps_epi64 (__m128 __A) {
-  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtps_epu64 (__m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtps_epu64 (__m128 __A) {
-  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_pd (__m128i __A) {
-  return (__m128d)__builtin_convertvector((__v2di)__A, __v2df);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_cvtepi64_pd(__A),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_cvtepi64_pd(__A),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_pd (__m256i __A) {
-  return (__m256d)__builtin_convertvector((__v4di)__A, __v4df);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_cvtepi64_pd(__A),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_cvtepi64_pd(__A),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_ps (__m128i __A) {
-  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
-                (__v4sf) _mm_setzero_ps(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
-                (__v4sf) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
-                (__v4sf) _mm_setzero_ps(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_ps (__m256i __A) {
-  return (__m128)__builtin_convertvector((__v4di)__A, __v4sf);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm256_cvtepi64_ps(__A),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm256_cvtepi64_ps(__A),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttpd_epi64 (__m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttpd_epi64 (__m256d __A) {
-  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttpd_epu64 (__m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttpd_epu64 (__m256d __A) {
-  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
-  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttps_epi64 (__m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttps_epi64 (__m128 __A) {
-  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttps_epu64 (__m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
-                (__v2di) _mm_setzero_si128(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttps_epu64 (__m128 __A) {
-  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
-  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
-                (__v4di) _mm256_setzero_si256(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtepu64_pd (__m128i __A) {
-  return (__m128d)__builtin_convertvector((__v2du)__A, __v2df);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_cvtepu64_pd(__A),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_cvtepu64_pd(__A),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_cvtepu64_pd (__m256i __A) {
-  return (__m256d)__builtin_convertvector((__v4du)__A, __v4df);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_cvtepu64_pd(__A),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_cvtepu64_pd(__A),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtepu64_ps (__m128i __A) {
-  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
-                (__v4sf) _mm_setzero_ps(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
-                (__v4sf) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
-  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
-                (__v4sf) _mm_setzero_ps(),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_cvtepu64_ps (__m256i __A) {
-  return (__m128)__builtin_convertvector((__v4du)__A, __v4sf);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm256_cvtepu64_ps(__A),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm256_cvtepu64_ps(__A),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-#define _mm_range_pd(A, B, C) \
-  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), (int)(C), \
-                                           (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)-1))
-
-#define _mm_mask_range_pd(W, U, A, B, C) \
-  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), (int)(C), \
-                                           (__v2df)(__m128d)(W), \
-                                           (__mmask8)(U)))
-
-#define _mm_maskz_range_pd(U, A, B, C) \
-  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
-                                           (__v2df)(__m128d)(B), (int)(C), \
-                                           (__v2df)_mm_setzero_pd(), \
-                                           (__mmask8)(U)))
-
-#define _mm256_range_pd(A, B, C) \
-  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
-                                           (__v4df)(__m256d)(B), (int)(C), \
-                                           (__v4df)_mm256_setzero_pd(), \
-                                           (__mmask8)-1))
-
-#define _mm256_mask_range_pd(W, U, A, B, C) \
-  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
-                                           (__v4df)(__m256d)(B), (int)(C), \
-                                           (__v4df)(__m256d)(W), \
-                                           (__mmask8)(U)))
-
-#define _mm256_maskz_range_pd(U, A, B, C) \
-  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
-                                           (__v4df)(__m256d)(B), (int)(C), \
-                                           (__v4df)_mm256_setzero_pd(), \
-                                           (__mmask8)(U)))
-
-#define _mm_range_ps(A, B, C) \
-  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), (int)(C), \
-                                          (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)-1))
-
-#define _mm_mask_range_ps(W, U, A, B, C) \
-  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), (int)(C), \
-                                          (__v4sf)(__m128)(W), (__mmask8)(U)))
-
-#define _mm_maskz_range_ps(U, A, B, C) \
-  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
-                                          (__v4sf)(__m128)(B), (int)(C), \
-                                          (__v4sf)_mm_setzero_ps(), \
-                                          (__mmask8)(U)))
-
-#define _mm256_range_ps(A, B, C) \
-  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
-                                          (__v8sf)(__m256)(B), (int)(C), \
-                                          (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)-1))
-
-#define _mm256_mask_range_ps(W, U, A, B, C) \
-  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
-                                          (__v8sf)(__m256)(B), (int)(C), \
-                                          (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_range_ps(U, A, B, C) \
-  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
-                                          (__v8sf)(__m256)(B), (int)(C), \
-                                          (__v8sf)_mm256_setzero_ps(), \
-                                          (__mmask8)(U)))
-
-#define _mm_reduce_pd(A, B) \
-  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)-1))
-
-#define _mm_mask_reduce_pd(W, U, A, B) \
-  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
-                                            (__v2df)(__m128d)(W), \
-                                            (__mmask8)(U)))
-
-#define _mm_maskz_reduce_pd(U, A, B) \
-  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
-                                            (__v2df)_mm_setzero_pd(), \
-                                            (__mmask8)(U)))
-
-#define _mm256_reduce_pd(A, B) \
-  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
-                                            (__v4df)_mm256_setzero_pd(), \
-                                            (__mmask8)-1))
-
-#define _mm256_mask_reduce_pd(W, U, A, B) \
-  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
-                                            (__v4df)(__m256d)(W), \
-                                            (__mmask8)(U)))
-
-#define _mm256_maskz_reduce_pd(U, A, B) \
-  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
-                                            (__v4df)_mm256_setzero_pd(), \
-                                            (__mmask8)(U)))
-
-#define _mm_reduce_ps(A, B) \
-  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)-1))
-
-#define _mm_mask_reduce_ps(W, U, A, B) \
-  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
-                                           (__v4sf)(__m128)(W), \
-                                           (__mmask8)(U)))
-
-#define _mm_maskz_reduce_ps(U, A, B) \
-  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
-                                           (__v4sf)_mm_setzero_ps(), \
-                                           (__mmask8)(U)))
-
-#define _mm256_reduce_ps(A, B) \
-  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
-                                           (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)-1))
-
-#define _mm256_mask_reduce_ps(W, U, A, B) \
-  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
-                                           (__v8sf)(__m256)(W), \
-                                           (__mmask8)(U)))
-
-#define _mm256_maskz_reduce_ps(U, A, B) \
-  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
-                                           (__v8sf)_mm256_setzero_ps(), \
-                                           (__mmask8)(U)))
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_movepi32_mask (__m128i __A)
-{
-  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_movepi32_mask (__m256i __A)
-{
-  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi32 (__mmask8 __A)
-{
-  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi32 (__mmask8 __A)
-{
-  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi64 (__mmask8 __A)
-{
-  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi64 (__mmask8 __A)
-{
-  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_movepi64_mask (__m128i __A)
-{
-  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_movepi64_mask (__m256i __A)
-{
-  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f32x2 (__m128 __A)
-{
-  return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
-                                         0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
-                                             (__v8sf)_mm256_broadcast_f32x2(__A),
-                                             (__v8sf)__O);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
-                                             (__v8sf)_mm256_broadcast_f32x2(__A),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f64x2(__m128d __A)
-{
-  return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
-                                          0, 1, 0, 1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
-                                            (__v4df)_mm256_broadcast_f64x2(__A),
-                                            (__v4df)__O);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
-                                            (__v4df)_mm256_broadcast_f64x2(__A),
-                                            (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcast_i32x2 (__m128i __A)
-{
-  return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
-                                          0, 1, 0, 1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_broadcast_i32x2(__A),
-                                             (__v4si)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_broadcast_i32x2(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i32x2 (__m128i __A)
-{
-  return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
-                                          0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_broadcast_i32x2(__A),
-                                             (__v8si)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_broadcast_i32x2(__A),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i64x2(__m128i __A)
-{
-  return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
-                                          0, 1, 0, 1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                            (__v4di)_mm256_broadcast_i64x2(__A),
-                                            (__v4di)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                            (__v4di)_mm256_broadcast_i64x2(__A),
-                                            (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm256_extractf64x2_pd(A, imm) \
-  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
-                                                 (int)(imm), \
-                                                 (__v2df)_mm_undefined_pd(), \
-                                                 (__mmask8)-1))
-
-#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
-  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
-                                                 (int)(imm), \
-                                                 (__v2df)(__m128d)(W), \
-                                                 (__mmask8)(U)))
-
-#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
-  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
-                                                 (int)(imm), \
-                                                 (__v2df)_mm_setzero_pd(), \
-                                                 (__mmask8)(U)))
-
-#define _mm256_extracti64x2_epi64(A, imm) \
-  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
-                                                (int)(imm), \
-                                                (__v2di)_mm_undefined_si128(), \
-                                                (__mmask8)-1))
-
-#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
-                                                 (int)(imm), \
-                                                 (__v2di)(__m128i)(W), \
-                                                 (__mmask8)(U)))
-
-#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
-                                                 (int)(imm), \
-                                                 (__v2di)_mm_setzero_si128(), \
-                                                 (__mmask8)(U)))
-
-#define _mm256_insertf64x2(A, B, imm) \
-  ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
-                                           (__v2df)(__m128d)(B), (int)(imm)))
-
-#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
-                                  (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_insertf64x2(U, A, B, imm) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
-                                  (__v4df)_mm256_setzero_pd()))
-
-#define _mm256_inserti64x2(A, B, imm) \
-  ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
-                                           (__v2di)(__m128i)(B), (int)(imm)))
-
-#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                   (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
-                                   (__v4di)(__m256i)(W)))
-
-#define _mm256_maskz_inserti64x2(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                   (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
-                                   (__v4di)_mm256_setzero_si256()))
-
-#define _mm_mask_fpclass_pd_mask(U, A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm_fpclass_pd_mask(A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
-                                              (__mmask8)-1))
-
-#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm256_fpclass_pd_mask(A, imm) \
-  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
-                                              (__mmask8)-1))
-
-#define _mm_mask_fpclass_ps_mask(U, A, imm) \
-  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm_fpclass_ps_mask(A, imm) \
-  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                              (__mmask8)-1))
-
-#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
-  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm256_fpclass_ps_mask(A, imm) \
-  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
-                                              (__mmask8)-1))
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avx512vlfp16intrin.h b/third_party/intel/clang/avx512vlfp16intrin.h
deleted file mode 100644
index a12acb7d9..000000000
--- a/third_party/intel/clang/avx512vlfp16intrin.h
+++ /dev/null
@@ -1,2071 +0,0 @@
-/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifdef __SSE2__
-
-#ifndef __AVX512VLFP16INTRIN_H
-#define __AVX512VLFP16INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
-                 __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
-                 __min_vector_width__(128)))
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
-  return __a[0];
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
-  return __a[0];
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
-  return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
-  return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
-  return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
-                            __h, __h, __h, __h, __h, __h, __h, __h};
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128
-_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
-           _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
-  return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256
-_mm256_set1_pch(_Float16 _Complex h) {
-  return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128
-_mm_set1_pch(_Float16 _Complex h) {
-  return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256
-_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
-              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
-              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
-              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
-  return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
-                            __h10, __h9,  __h8,  __h7,  __h6,  __h5,
-                            __h4,  __h3,  __h2,  __h1};
-}
-
-#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
-  _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
-
-#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
-                       h14, h15, h16)                                          \
-  _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8),   \
-                (h7), (h6), (h5), (h4), (h3), (h2), (h1))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
-                                                              __m256h __B) {
-  return (__m256h)((__v16hf)__A + (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)((__v8hf)__A + (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
-                                              (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
-                                              (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
-                                                              __m256h __B) {
-  return (__m256h)((__v16hf)__A - (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)((__v8hf)__A - (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
-                                              (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
-                                              (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
-                                                              __m256h __B) {
-  return (__m256h)((__v16hf)__A * (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)((__v8hf)__A * (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
-                                              (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
-                                              (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
-                                                              __m256h __B) {
-  return (__m256h)((__v16hf)__A / (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)((__v8hf)__A / (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
-                                              (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
-                                              (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
-                                                              __m256h __B) {
-  return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
-      (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
-      (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
-                                                              __m256h __B) {
-  return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
-      (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
-                                                           __m128h __B) {
-  return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A,
-                                                                __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
-      (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
-                                                                 __m128h __A,
-                                                                 __m128h __B) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
-  return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
-  return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
-  return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_selectps_256(
-      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_selectps_256(
-      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
-  return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A) {
-  return (__m128h)__builtin_ia32_selectps_128(
-      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
-  return (__m128h)__builtin_ia32_selectps_128(
-      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
-}
-
-#define _mm256_cmp_ph_mask(a, b, p)                                            \
-  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
-      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))
-
-#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
-  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
-      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))
-
-#define _mm_cmp_ph_mask(a, b, p)                                               \
-  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
-      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))
-
-#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
-  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
-      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
-  return (__m256h)__builtin_ia32_rcpph256_mask(
-      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
-                                               (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_rcpph256_mask(
-      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
-  return (__m128h)__builtin_ia32_rcpph128_mask(
-      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
-                                                                __mmask8 __U,
-                                                                __m128h __A) {
-  return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
-                                               (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
-                                                                 __m128h __A) {
-  return (__m128h)__builtin_ia32_rcpph128_mask(
-      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
-  return (__m256h)__builtin_ia32_rsqrtph256_mask(
-      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
-                                                 (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_rsqrtph256_mask(
-      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
-  return (__m128h)__builtin_ia32_rsqrtph128_mask(
-      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A) {
-  return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
-                                                 (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
-  return (__m128h)__builtin_ia32_rsqrtph128_mask(
-      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
-  return (__m128h)__builtin_ia32_getexpph128_mask(
-      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
-  return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
-                                                  (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
-  return (__m128h)__builtin_ia32_getexpph128_mask(
-      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
-  return (__m256h)__builtin_ia32_getexpph256_mask(
-      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
-                                                  (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_getexpph256_mask(
-      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-#define _mm_getmant_ph(A, B, C)                                                \
-  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
-      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
-      (__mmask8)-1))
-
-#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
-  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
-      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W),     \
-      (__mmask8)(U)))
-
-#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
-  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
-      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
-      (__mmask8)(U)))
-
-#define _mm256_getmant_ph(A, B, C)                                             \
-  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
-      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
-      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))
-
-#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
-  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
-      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
-      (__mmask16)(U)))
-
-#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
-  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
-      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
-      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
-                                                              __m128h __B) {
-  return (__m128h)__builtin_ia32_scalefph128_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
-                                                  (__v8hf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_scalefph128_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
-                                                                 __m256h __B) {
-  return (__m256h)__builtin_ia32_scalefph256_mask(
-      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
-                                                  (__v16hf)__W, (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_scalefph256_mask(
-      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-#define _mm_roundscale_ph(A, imm)                                              \
-  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
-      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
-      (__mmask8)-1))
-
-#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
-  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
-      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
-
-#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
-  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
-      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
-      (__mmask8)(U)))
-
-#define _mm256_roundscale_ph(A, imm)                                           \
-  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
-      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
-      (__mmask16)-1))
-
-#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
-  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
-      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
-      (__mmask16)(U)))
-
-#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
-  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
-      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
-      (__mmask16)(U)))
-
-#define _mm_reduce_ph(A, imm)                                                  \
-  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
-                                            (__v8hf)_mm_setzero_ph(),          \
-                                            (__mmask8)-1))
-
-#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
-  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
-      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
-
-#define _mm_maskz_reduce_ph(U, A, imm)                                         \
-  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
-                                            (__v8hf)_mm_setzero_ph(),          \
-                                            (__mmask8)(U)))
-
-#define _mm256_reduce_ph(A, imm)                                               \
-  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
-                                            (__v16hf)_mm256_setzero_ph(),      \
-                                            (__mmask16)-1))
-
-#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
-  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
-                                            (__v16hf)(__m256h)(W),             \
-                                            (__mmask16)(U)))
-
-#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
-  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
-                                            (__v16hf)_mm256_setzero_ph(),      \
-                                            (__mmask16)(U)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
-  return __builtin_ia32_sqrtph((__v8hf)__a);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
-                                                                 __mmask8 __U,
-                                                                 __m128h __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
-                                                                  __m128h __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
-  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
-  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
-                                              (__v16hf)_mm256_sqrt_ph(__A),
-                                              (__v16hf)_mm256_setzero_ph());
-}
-
-#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
-  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
-                                              (int)(imm), (__mmask8)(U)))
-
-#define _mm_fpclass_ph_mask(A, imm)                                            \
-  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
-                                              (int)(imm), (__mmask8)-1))
-
-#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
-  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
-                                               (int)(imm), (__mmask16)(U)))
-
-#define _mm256_fpclass_ph_mask(A, imm)                                         \
-  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
-                                               (int)(imm), (__mmask16)-1))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
-      (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
-      (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
-      (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
-  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
-      (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
-  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
-      (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A) {
-  return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
-  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
-      (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
-  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
-      (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
-  return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
-  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
-      (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
-      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
-                                                  (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
-      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epi16(__m256h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
-      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
-                                                  (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
-      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
-      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
-      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epi16(__m256h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
-      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
-                                                   (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
-      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
-  return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_ph(__m256i __A) {
-  return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
-  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
-                                              (__v16hf)_mm256_cvtepi16_ph(__A),
-                                              (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
-      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
-      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epu16(__m256h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
-      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
-                                                   (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
-      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
-      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
-      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epu16(__m256h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
-      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
-                                                    (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
-      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
-  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_cvtepu16_ph(__m256i __A) {
-  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
-  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
-                                              (__v16hf)_mm256_cvtepu16_ph(__A),
-                                              (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
-      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
-      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epi32(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
-      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
-      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
-      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
-      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epu32(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
-      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
-      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
-      (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
-      (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_ph(__m256i __A) {
-  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
-      (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
-      (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_ph(__m256i __A) {
-  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
-      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
-      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epi32(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
-      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
-      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
-      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
-                                                     (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
-      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epu32(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
-      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
-                                                     (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
-      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
-      (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
-      (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_ph(__m256i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
-      (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
-      (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
-      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
-      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epi64(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
-      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
-      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
-      (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
-      (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepu64_ph(__m256i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
-      (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
-  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
-      (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
-      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
-      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epu64(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
-      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
-      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
-      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
-      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epi64(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
-      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
-      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
-      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
-                                                     (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
-  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
-      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epu64(__m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
-      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
-                                                     (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
-  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
-      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
-  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
-      (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A) {
-  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
-  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
-      (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
-  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
-      (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
-  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
-                                                   (__mmask8)__U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
-  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
-      (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
-  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
-      (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
-                                                                   __mmask8 __U,
-                                                                   __m128 __A) {
-  return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
-  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
-      (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
-  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
-      (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
-  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
-                                                    (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
-  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
-      (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
-                                                             __m128h __B,
-                                                             __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
-                                          (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
-                                                                  __mmask8 __U,
-                                                                  __m128h __B,
-                                                                  __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
-                                                             __m128h __B,
-                                                             __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
-                                          -(__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
-                                                                  __mmask8 __U,
-                                                                  __m128h __B,
-                                                                  __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
-                                                                __m256h __B,
-                                                                __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
-                                             (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
-                                                                __m256h __B,
-                                                                __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
-                                             -(__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
-                                                                __m128h __B,
-                                                                __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
-                                             (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
-                                                                __m128h __B,
-                                                                __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
-                                             -(__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
-      (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
-      (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
-                                                (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
-                                                -(__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
-      (__v8hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
-      (__v8hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
-                                                              __m128h __B,
-                                                              __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
-                                          (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
-      (__v8hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
-                                                                 __m256h __B,
-                                                                 __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
-                                             (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
-      (__v16hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
-                                                              __m128h __B,
-                                                              __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
-                                          -(__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
-      (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_selectph_128(
-      (__mmask8)__U,
-      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
-      (__v8hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
-                                                                 __m256h __B,
-                                                                 __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
-                                             -(__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
-  return (__m256h)__builtin_ia32_selectph_256(
-      (__mmask16)__U,
-      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
-      (__v16hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
-                                                              __m128h __B) {
-  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
-                                                   (__v4sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A,
-                                                                 __m256h __B) {
-  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
-      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
-                                                   (__v8sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
-      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
-                                                               __m128h __B,
-                                                               __m128h __C) {
-  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
-                                                    (__v4sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectps_128(
-      __U,
-      __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
-                                        (__v4sf)__C, (__mmask8)__U),
-      (__v4sf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
-                                                    (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
-                                                                  __m256h __B,
-                                                                  __m256h __C) {
-  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
-                                                    (__v8sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectps_256(
-      __U,
-      __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
-                                        (__mmask8)__U),
-      (__v8sf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
-  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
-                                                    (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
-      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
-                                                             __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmulcph128_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
-                                                                  __mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
-                                                  (__v4sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_vfmulcph128_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
-                                                                __m256h __B) {
-  return (__m256h)__builtin_ia32_vfmulcph256_mask(
-      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
-                                                  (__v8sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_vfmulcph256_mask(
-      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
-                                                              __m128h __B,
-                                                              __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
-                                                   (__v4sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectps_128(
-      __U,
-      __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
-                                       (__mmask8)__U),
-      (__v4sf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
-                                                   (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
-                                                    (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
-                                                                 __m256h __B,
-                                                                 __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
-                                                   (__v8sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_selectps_256(
-      __U,
-      __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
-                                       (__mmask8)__U),
-      (__v8sf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
-  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
-                                                   (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
-  return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
-                                                    (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
-                                                                  __m128h __A,
-                                                                  __m128h __W) {
-  return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
-                                              (__v8hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
-  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
-                                              (__v16hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
-  return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
-                                                 (__v8hi)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
-  return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
-                                                 (__v16hi)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_permutexvar_ph(__m128i __A, __m128h __B) {
-  return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
-  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_add_ph(__m256h __W) {
-  return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_mul_ph(__m256h __W) {
-  return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_max_ph(__m256h __V) {
-  return __builtin_ia32_reduce_fmax_ph256(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_min_ph(__m256h __V) {
-  return __builtin_ia32_reduce_fmin_ph256(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_add_ph(__m128h __W) {
-  return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_mul_ph(__m128h __W) {
-  return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_max_ph(__m128h __V) {
-  return __builtin_ia32_reduce_fmax_ph128(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_min_ph(__m128h __V) {
-  return __builtin_ia32_reduce_fmin_ph128(__V);
-}
-
-// intrinsics below are alias for f*mul_*ch
-#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
-#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
-#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
-#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
-#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
-#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)
-
-#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
-#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
-#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
-#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
-#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
-#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
-#endif
diff --git a/third_party/intel/clang/avx512vlintrin.h b/third_party/intel/clang/avx512vlintrin.h
deleted file mode 100644
index 2a5f7b43f..000000000
--- a/third_party/intel/clang/avx512vlintrin.h
+++ /dev/null
@@ -1,8437 +0,0 @@
-/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLINTRIN_H
-#define __AVX512VLINTRIN_H
-
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,no-evex512"),                            \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,no-evex512"),                            \
-                 __min_vector_width__(256)))
-
-typedef short __v2hi __attribute__((__vector_size__(4)));
-typedef char __v4qi __attribute__((__vector_size__(4)));
-typedef char __v2qi __attribute__((__vector_size__(2)));
-
-/* Integer compare */
-
-#define _mm_cmpeq_epi32_mask(A, B) \
-    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi32_mask(k, A, B) \
-    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi32_mask(A, B) \
-    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi32_mask(k, A, B) \
-    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi32_mask(A, B) \
-    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi32_mask(k, A, B) \
-    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi32_mask(A, B) \
-    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi32_mask(k, A, B) \
-    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi32_mask(A, B) \
-    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi32_mask(k, A, B) \
-    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi32_mask(A, B) \
-    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi32_mask(k, A, B) \
-    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi32_mask(A, B) \
-    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \
-    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi32_mask(A, B) \
-    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi32_mask(k, A, B) \
-    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi32_mask(A, B) \
-    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \
-    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi32_mask(A, B) \
-    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi32_mask(k, A, B) \
-    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi32_mask(A, B) \
-    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi32_mask(k, A, B) \
-    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi32_mask(A, B) \
-    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \
-    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu32_mask(A, B) \
-    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu32_mask(k, A, B) \
-    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu32_mask(A, B) \
-    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu32_mask(k, A, B) \
-    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu32_mask(A, B) \
-    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu32_mask(k, A, B) \
-    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu32_mask(A, B) \
-    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu32_mask(k, A, B) \
-    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu32_mask(A, B) \
-    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu32_mask(k, A, B) \
-    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu32_mask(A, B) \
-    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu32_mask(k, A, B) \
-    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu32_mask(A, B) \
-    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \
-    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu32_mask(A, B) \
-    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu32_mask(k, A, B) \
-    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu32_mask(A, B) \
-    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \
-    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu32_mask(A, B) \
-    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu32_mask(k, A, B) \
-    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu32_mask(A, B) \
-    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu32_mask(k, A, B) \
-    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu32_mask(A, B) \
-    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \
-    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epi64_mask(A, B) \
-    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi64_mask(k, A, B) \
-    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi64_mask(A, B) \
-    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi64_mask(k, A, B) \
-    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi64_mask(A, B) \
-    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi64_mask(k, A, B) \
-    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi64_mask(A, B) \
-    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi64_mask(k, A, B) \
-    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi64_mask(A, B) \
-    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi64_mask(k, A, B) \
-    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi64_mask(A, B) \
-    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi64_mask(k, A, B) \
-    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi64_mask(A, B) \
-    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \
-    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi64_mask(A, B) \
-    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi64_mask(k, A, B) \
-    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi64_mask(A, B) \
-    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \
-    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi64_mask(A, B) \
-    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi64_mask(k, A, B) \
-    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi64_mask(A, B) \
-    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi64_mask(k, A, B) \
-    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi64_mask(A, B) \
-    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \
-    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu64_mask(A, B) \
-    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu64_mask(k, A, B) \
-    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu64_mask(A, B) \
-    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu64_mask(k, A, B) \
-    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu64_mask(A, B) \
-    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu64_mask(k, A, B) \
-    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu64_mask(A, B) \
-    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu64_mask(k, A, B) \
-    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu64_mask(A, B) \
-    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu64_mask(k, A, B) \
-    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu64_mask(A, B) \
-    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu64_mask(k, A, B) \
-    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu64_mask(A, B) \
-    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \
-    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu64_mask(A, B) \
-    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu64_mask(k, A, B) \
-    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu64_mask(A, B) \
-    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \
-    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu64_mask(A, B) \
-    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu64_mask(k, A, B) \
-    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu64_mask(A, B) \
-    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu64_mask(k, A, B) \
-    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu64_mask(A, B) \
-    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \
-    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_add_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_add_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_add_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_add_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sub_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sub_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_sub_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_sub_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_add_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_add_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_add_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_add_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sub_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sub_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sub_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sub_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_mul_epi32(__X, __Y),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_mul_epi32(__X, __Y),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_mul_epi32(__X, __Y),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_mul_epi32(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_mul_epu32(__X, __Y),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_mul_epu32(__X, __Y),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_mul_epu32(__X, __Y),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_mul_epu32(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_mullo_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_mullo_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_mullo_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_mullo_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_and_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8su)__a & (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_and_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_and_epi32(__m128i __a, __m128i __b)
-{
-  return (__m128i)((__v4su)__a & (__v4su)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_and_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_andnot_epi32(__m256i __A, __m256i __B)
-{
-  return (__m256i)(~(__v8su)__A & (__v8su)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                          (__v8si)_mm256_andnot_epi32(__A, __B),
-                                          (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(),
-                                           __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_andnot_epi32(__m128i __A, __m128i __B)
-{
-  return (__m128i)(~(__v4su)__A & (__v4su)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_andnot_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_or_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8su)__a | (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_or_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_or_epi32(__m128i __a, __m128i __b)
-{
-  return (__m128i)((__v4su)__a | (__v4su)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_or_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_xor_epi32(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v8su)__a ^ (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_xor_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_xor_epi32(__m128i __a, __m128i __b)
-{
-  return (__m128i)((__v4su)__a ^ (__v4su)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_xor_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_and_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a & (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_and_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_and_epi64(__m128i __a, __m128i __b)
-{
-  return (__m128i)((__v2du)__a & (__v2du)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_and_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_andnot_epi64(__m256i __A, __m256i __B)
-{
-  return (__m256i)(~(__v4du)__A & (__v4du)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                          (__v4di)_mm256_andnot_epi64(__A, __B),
-                                          (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(),
-                                           __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_andnot_epi64(__m128i __A, __m128i __B)
-{
-  return (__m128i)(~(__v2du)__A & (__v2du)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_andnot_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_or_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a | (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_or_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_or_epi64(__m128i __a, __m128i __b)
-{
-  return (__m128i)((__v2du)__a | (__v2du)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_or_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_xor_epi64(__m256i __a, __m256i __b)
-{
-  return (__m256i)((__v4du)__a ^ (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_xor_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_xor_epi64(__m128i __a, __m128i __b)
-{
-  return (__m128i)((__v2du)__a ^ (__v2du)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A,
-        __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_xor_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-#define _mm_cmp_epi32_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
-                                         (__v4si)(__m128i)(b), (int)(p), \
-                                         (__mmask8)-1))
-
-#define _mm_mask_cmp_epi32_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
-                                         (__v4si)(__m128i)(b), (int)(p), \
-                                         (__mmask8)(m)))
-
-#define _mm_cmp_epu32_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
-                                          (__v4si)(__m128i)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm_mask_cmp_epu32_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
-                                          (__v4si)(__m128i)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm256_cmp_epi32_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
-                                         (__v8si)(__m256i)(b), (int)(p), \
-                                         (__mmask8)-1))
-
-#define _mm256_mask_cmp_epi32_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
-                                         (__v8si)(__m256i)(b), (int)(p), \
-                                         (__mmask8)(m)))
-
-#define _mm256_cmp_epu32_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
-                                          (__v8si)(__m256i)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm256_mask_cmp_epu32_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
-                                          (__v8si)(__m256i)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm_cmp_epi64_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
-                                         (__v2di)(__m128i)(b), (int)(p), \
-                                         (__mmask8)-1))
-
-#define _mm_mask_cmp_epi64_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
-                                         (__v2di)(__m128i)(b), (int)(p), \
-                                         (__mmask8)(m)))
-
-#define _mm_cmp_epu64_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
-                                          (__v2di)(__m128i)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm_mask_cmp_epu64_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
-                                          (__v2di)(__m128i)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm256_cmp_epi64_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
-                                         (__v4di)(__m256i)(b), (int)(p), \
-                                         (__mmask8)-1))
-
-#define _mm256_mask_cmp_epi64_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
-                                         (__v4di)(__m256i)(b), (int)(p), \
-                                         (__mmask8)(m)))
-
-#define _mm256_cmp_epu64_mask(a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
-                                          (__v4di)(__m256i)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm256_mask_cmp_epu64_mask(m, a, b, p) \
-  ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
-                                          (__v4di)(__m256i)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm256_cmp_ps_mask(a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
-                                          (__v8sf)(__m256)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm256_mask_cmp_ps_mask(m, a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
-                                          (__v8sf)(__m256)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm256_cmp_pd_mask(a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
-                                          (__v4df)(__m256d)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm256_mask_cmp_pd_mask(m, a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
-                                          (__v4df)(__m256d)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm_cmp_ps_mask(a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
-                                          (__v4sf)(__m128)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm_mask_cmp_ps_mask(m, a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
-                                          (__v4sf)(__m128)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-#define _mm_cmp_pd_mask(a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
-                                          (__v2df)(__m128d)(b), (int)(p), \
-                                          (__mmask8)-1))
-
-#define _mm_mask_cmp_pd_mask(m, a, b, p)  \
-  ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
-                                          (__v2df)(__m128d)(b), (int)(p), \
-                                          (__mmask8)(m)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
-                                             (__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd (-(__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
-                                                (__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
-                                             (__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps (-(__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
-                                                (__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
-                                                (__v2df) __B,
-                                                (__v2df) __C),
-                    (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
-                                                (__v2df) __B,
-                                                (__v2df) __C),
-                    (__v2df) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
-                                                (__v2df) __B,
-                                                (__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
-                                                (__v2df) __B,
-                                                -(__v2df) __C),
-                    (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
-                                                (__v2df) __B,
-                                                -(__v2df) __C),
-                    (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-                                                   (__v4df) __B,
-                                                   (__v4df) __C),
-                    (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-                                                   (__v4df) __B,
-                                                   (__v4df) __C),
-                    (__v4df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-                                                   (__v4df) __B,
-                                                   (__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-                                                   (__v4df) __B,
-                                                   -(__v4df) __C),
-                    (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-                                                   (__v4df) __B,
-                                                   -(__v4df) __C),
-                    (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-                                                (__v4sf) __B,
-                                                (__v4sf) __C),
-                    (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-                                                (__v4sf) __B,
-                                                (__v4sf) __C),
-                    (__v4sf) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-                                                (__v4sf) __B,
-                                                (__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-                                                (__v4sf) __B,
-                                                -(__v4sf) __C),
-                    (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-                                                (__v4sf) __B,
-                                                -(__v4sf) __C),
-                    (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
-                         __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   (__v8sf) __C),
-                    (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   (__v8sf) __C),
-                    (__v8sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   (__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   -(__v8sf) __C),
-                    (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   -(__v8sf) __C),
-                    (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             (__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                (__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             (__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                (__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd ((__v2df) __A,
-                                                (__v2df) __B,
-                                                -(__v2df) __C),
-                    (__v2df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-                                                   (__v4df) __B,
-                                                   -(__v4df) __C),
-                    (__v4df) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-                                                (__v4sf) __B,
-                                                -(__v4sf) __C),
-                    (__v4sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-                                                   (__v8sf) __B,
-                                                   -(__v8sf) __C),
-                    (__v8sf) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             -(__v2df) __B,
-                                             (__v2df) __C),
-                    (__v2df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                -(__v4df) __B,
-                                                (__v4df) __C),
-                    (__v4df) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             -(__v4sf) __B,
-                                             (__v4sf) __C),
-                    (__v4sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                -(__v8sf) __B,
-                                                (__v8sf) __C),
-                    (__v8sf) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             -(__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd ((__v2df) __A,
-                                             -(__v2df) __B,
-                                             -(__v2df) __C),
-                    (__v2df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                -(__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-                                                -(__v4df) __B,
-                                                -(__v4df) __C),
-                    (__v4df) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             -(__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-                    __builtin_ia32_vfmaddps ((__v4sf) __A,
-                                             -(__v4sf) __B,
-                                             -(__v4sf) __C),
-                    (__v4sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                -(__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-                    __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-                                                -(__v8sf) __B,
-                                                -(__v8sf) __C),
-                    (__v8sf) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_add_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_add_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_add_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_add_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_add_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_add_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_add_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_add_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
-  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
-                (__v4si) __W,
-                (__v4si) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
-  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
-                (__v8si) __W,
-                (__v8si) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
-  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
-                 (__v2df) __W,
-                 (__v2df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
-  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
-                 (__v4df) __W,
-                 (__v4df) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
-  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
-                (__v4sf) __W,
-                (__v4sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
-  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
-                (__v8sf) __W,
-                (__v8sf) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
-  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
-                (__v2di) __W,
-                (__v2di) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
-  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
-                (__v4di) __W,
-                (__v4di) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
-                  (__v2df) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
-                  (__v2df)
-                  _mm_setzero_pd (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
-                  (__v4df) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
-                  (__v4df)
-                  _mm256_setzero_pd (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
-                  (__v2di) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
-                  (__v2di)
-                  _mm_setzero_si128 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
-                  (__v4di) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
-                  (__v4di)
-                  _mm256_setzero_si256 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
-                 (__v4sf) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
-                 (__v4sf)
-                 _mm_setzero_ps (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
-                 (__v8sf) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
-                 (__v8sf)
-                 _mm256_setzero_ps (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
-                  (__v4si) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
-                  (__v4si)
-                  _mm_setzero_si128 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
-                  (__v8si) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
-                  (__v8si)
-                  _mm256_setzero_si256 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
-  __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
-            (__v2df) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
-  __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
-            (__v4df) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
-  __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
-            (__v2di) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
-  __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
-            (__v4di) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
-  __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
-            (__v4sf) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
-  __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
-            (__v8sf) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
-  __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
-            (__v4si) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
-  __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
-            (__v8si) __A,
-            (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-                                              (__v2df)_mm_cvtepi32_pd(__A),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-                                              (__v2df)_mm_cvtepi32_pd(__A),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-                                              (__v4df)_mm256_cvtepi32_pd(__A),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-                                              (__v4df)_mm256_cvtepi32_pd(__A),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_cvtepi32_ps(__A),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_cvtepi32_ps(__A),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_cvtepi32_ps(__A),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_cvtepi32_ps(__A),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
-                (__v4si)
-                _mm_setzero_si128 (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm256_cvtpd_epi32(__A),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm256_cvtpd_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
-  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
-            (__v4sf) __W,
-            (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
-  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
-            (__v4sf)
-            _mm_setzero_ps (),
-            (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm256_cvtpd_ps(__A),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm256_cvtpd_ps(__A),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtpd_epu32 (__m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtpd_epu32 (__m256d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_cvtps_epi32(__A),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_cvtps_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_cvtps_epi32(__A),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_cvtps_epi32(__A),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_cvtps_pd(__A),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_cvtps_pd(__A),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_cvtps_pd(__A),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_cvtps_pd(__A),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtps_epu32 (__m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtps_epu32 (__m256 __A) {
-  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
-                 (__v8si) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm256_cvttpd_epi32(__A),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm256_cvttpd_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttpd_epu32 (__m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
-                  (__v4si)
-                  _mm_setzero_si128 (),
-                  (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
-                  (__v4si) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
-                  (__v4si)
-                  _mm_setzero_si128 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvttpd_epu32 (__m256d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
-                  (__v4si)
-                  _mm_setzero_si128 (),
-                  (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
-                  (__v4si) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
-  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
-                  (__v4si)
-                  _mm_setzero_si128 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_cvttps_epi32(__A),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_cvttps_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_cvttps_epi32(__A),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_cvttps_epi32(__A),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttps_epu32 (__m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
-                  (__v4si)
-                  _mm_setzero_si128 (),
-                  (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
-                  (__v4si) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
-  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
-                  (__v4si)
-                  _mm_setzero_si128 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttps_epu32 (__m256 __A) {
-  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
-                  (__v8si)
-                  _mm256_setzero_si256 (),
-                  (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
-                  (__v8si) __W,
-                  (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
-  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
-                  (__v8si)
-                  _mm256_setzero_si256 (),
-                  (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtepu32_pd (__m128i __A) {
-  return (__m128d) __builtin_convertvector(
-      __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-                                              (__v2df)_mm_cvtepu32_pd(__A),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-                                              (__v2df)_mm_cvtepu32_pd(__A),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_pd (__m128i __A) {
-  return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-                                              (__v4df)_mm256_cvtepu32_pd(__A),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-                                              (__v4df)_mm256_cvtepu32_pd(__A),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtepu32_ps (__m128i __A) {
-  return (__m128)__builtin_convertvector((__v4su)__A, __v4sf);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_cvtepu32_ps(__A),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_cvtepu32_ps(__A),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_ps (__m256i __A) {
-  return (__m256)__builtin_convertvector((__v8su)__A, __v8sf);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_cvtepu32_ps(__A),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_cvtepu32_ps(__A),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_div_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_div_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_div_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_div_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_div_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_div_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_div_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_div_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
-                (__v2df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
-                 (__v2df)
-                 _mm_setzero_pd (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
-                (__v4df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
-                 (__v4df)
-                 _mm256_setzero_pd (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
-                (__v2di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
-                 (__v2di)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
-                (__v4di) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
-                 (__v4di)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
-  return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
-              (__v2df) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
-  return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
-               (__v2df)
-               _mm_setzero_pd (),
-               (__mmask8)
-               __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
-  return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
-              (__v4df) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
-  return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
-               (__v4df)
-               _mm256_setzero_pd (),
-               (__mmask8)
-               __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
-              (__v2di) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
-               (__v2di)
-               _mm_setzero_si128 (),
-               (__mmask8)
-               __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
-             void const *__P) {
-  return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
-              (__v4di) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
-  return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
-               (__v4di)
-               _mm256_setzero_si256 (),
-               (__mmask8)
-               __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
-  return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
-                   (__v4sf) __W,
-                   (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
-  return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
-              (__v4sf)
-              _mm_setzero_ps (),
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
-  return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
-                   (__v8sf) __W,
-                   (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
-  return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
-              (__v8sf)
-              _mm256_setzero_ps (),
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
-              (__v4si) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
-               (__v4si)
-               _mm_setzero_si128 (),
-               (__mmask8)     __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
-             void const *__P) {
-  return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
-              (__v8si) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
-  return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
-               (__v8si)
-               _mm256_setzero_si256 (),
-               (__mmask8)
-               __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
-               (__v4sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
-                (__v4sf)
-                _mm_setzero_ps (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
-               (__v8sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
-                (__v8sf)
-                _mm256_setzero_ps (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
-                (__v4si) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
-  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
-                (__v8si) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
-  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_getexp_pd (__m128d __A) {
-  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
-                (__v2df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
-  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_getexp_pd (__m256d __A) {
-  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
-                (__v4df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
-  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_getexp_ps (__m128 __A) {
-  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
-               (__v4sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
-  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_getexp_ps (__m256 __A) {
-  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
-               (__v8sf)
-               _mm256_setzero_ps (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
-               (__v8sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
-  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
-               (__v8sf)
-               _mm256_setzero_ps (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_max_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_max_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_max_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_max_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_max_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_max_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_max_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_max_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_min_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_min_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_min_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_min_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_min_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_min_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_min_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_min_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_mul_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_mul_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_mul_pd(__A, __B),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_mul_pd(__A, __B),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_mul_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_mul_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_mul_ps(__A, __B),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_mul_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_abs_epi32(__A),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_abs_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_abs_epi32(__A),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_abs_epi32(__A),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_abs_epi64 (__m128i __A) {
-  return (__m128i)__builtin_elementwise_abs((__v2di)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_abs_epi64(__A),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_abs_epi64(__A),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi64 (__m256i __A) {
-  return (__m256i)__builtin_elementwise_abs((__v4di)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_abs_epi64(__A),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_abs_epi64(__A),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_max_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_max_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_max_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_max_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_max_epi64 (__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_elementwise_max((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_max_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_max_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi64 (__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_elementwise_max((__v4di)__A, (__v4di)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_max_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_max_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_max_epu32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_max_epu32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_max_epu32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_max_epu32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_max_epu64 (__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_elementwise_max((__v2du)__A, (__v2du)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_max_epu64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_max_epu64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu64 (__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_elementwise_max((__v4du)__A, (__v4du)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_max_epu64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_max_epu64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_min_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_min_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_min_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_min_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_min_epi64 (__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_elementwise_min((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_min_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_min_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi64 (__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_elementwise_min((__v4di)__A, (__v4di)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_min_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_min_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_min_epu32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm_min_epu32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_min_epu32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_min_epu32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_min_epu64 (__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_elementwise_min((__v2du)__A, (__v2du)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_min_epu64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-                                             (__v2di)_mm_min_epu64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu64 (__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_elementwise_min((__v4du)__A, (__v4du)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_min_epu64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_min_epu64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm_roundscale_pd(A, imm) \
-  ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
-                                               (int)(imm), \
-                                               (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)-1))
-
-
-#define _mm_mask_roundscale_pd(W, U, A, imm) \
-  ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
-                                               (int)(imm), \
-                                               (__v2df)(__m128d)(W), \
-                                               (__mmask8)(U)))
-
-
-#define _mm_maskz_roundscale_pd(U, A, imm) \
-  ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
-                                               (int)(imm), \
-                                               (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)(U)))
-
-
-#define _mm256_roundscale_pd(A, imm) \
-  ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
-                                               (int)(imm), \
-                                               (__v4df)_mm256_setzero_pd(), \
-                                               (__mmask8)-1))
-
-
-#define _mm256_mask_roundscale_pd(W, U, A, imm) \
-  ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
-                                               (int)(imm), \
-                                               (__v4df)(__m256d)(W), \
-                                               (__mmask8)(U)))
-
-
-#define _mm256_maskz_roundscale_pd(U, A, imm)  \
-  ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
-                                               (int)(imm), \
-                                               (__v4df)_mm256_setzero_pd(), \
-                                               (__mmask8)(U)))
-
-#define _mm_roundscale_ps(A, imm)  \
-  ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                              (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)-1))
-
-
-#define _mm_mask_roundscale_ps(W, U, A, imm)  \
-  ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                              (__v4sf)(__m128)(W), \
-                                              (__mmask8)(U)))
-
-
-#define _mm_maskz_roundscale_ps(U, A, imm)  \
-  ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
-                                              (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)(U)))
-
-#define _mm256_roundscale_ps(A, imm)  \
-  ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
-                                              (__v8sf)_mm256_setzero_ps(), \
-                                              (__mmask8)-1))
-
-#define _mm256_mask_roundscale_ps(W, U, A, imm)  \
-  ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
-                                              (__v8sf)(__m256)(W), \
-                                              (__mmask8)(U)))
-
-
-#define _mm256_maskz_roundscale_ps(U, A, imm)  \
-  ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
-                                              (__v8sf)_mm256_setzero_ps(), \
-                                              (__mmask8)(U)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_scalef_pd (__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_scalef_pd (__m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
-                (__v4df) __B,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
-           __m256d __B) {
-  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
-                (__v4df) __B,
-                (__v4df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
-                (__v4df) __B,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_scalef_ps (__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
-               (__v4sf) __B,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_scalef_ps (__m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
-               (__v8sf) __B,
-               (__v8sf)
-               _mm256_setzero_ps (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
-           __m256 __B) {
-  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
-               (__v8sf) __B,
-               (__v8sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
-               (__v8sf) __B,
-               (__v8sf)
-               _mm256_setzero_ps (),
-               (__mmask8) __U);
-}
-
-#define _mm_i64scatter_pd(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)-1, \
-                               (__v2di)(__m128i)(index), \
-                               (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)(mask), \
-                               (__v2di)(__m128i)(index), \
-                               (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_i64scatter_epi64(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)-1, \
-                               (__v2di)(__m128i)(index), \
-                               (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)(mask), \
-                               (__v2di)(__m128i)(index), \
-                               (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i64scatter_pd(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)-1, \
-                               (__v4di)(__m256i)(index), \
-                               (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)(mask), \
-                               (__v4di)(__m256i)(index), \
-                               (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_i64scatter_epi64(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)-1, \
-                               (__v4di)(__m256i)(index), \
-                               (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)(mask), \
-                               (__v4di)(__m256i)(index), \
-                               (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm_i64scatter_ps(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)-1, \
-                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale))
-
-#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)(mask), \
-                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale))
-
-#define _mm_i64scatter_epi32(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)-1, \
-                               (__v2di)(__m128i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)(mask), \
-                               (__v2di)(__m128i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i64scatter_ps(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)-1, \
-                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale))
-
-#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)(mask), \
-                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
-                               (int)(scale))
-
-#define _mm256_i64scatter_epi32(addr, index, v1, scale) \
-  __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)-1, \
-                               (__v4di)(__m256i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
-  __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)(mask), \
-                               (__v4di)(__m256i)(index), \
-                               (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm_i32scatter_pd(addr, index, v1, scale) \
-  __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)-1, \
-                               (__v4si)(__m128i)(index), \
-                               (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)(mask), \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_i32scatter_epi64(addr, index, v1, scale) \
-    __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)-1, \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)(mask), \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i32scatter_pd(addr, index, v1, scale) \
-    __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)-1, \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)(mask), \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_i32scatter_epi64(addr, index, v1, scale) \
-    __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)-1, \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)(mask), \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm_i32scatter_ps(addr, index, v1, scale) \
-    __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)-1, \
-                                 (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                                 (int)(scale))
-
-#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)(mask), \
-                                 (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
-                                 (int)(scale))
-
-#define _mm_i32scatter_epi32(addr, index, v1, scale) \
-    __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)-1, \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)(mask), \
-                                 (__v4si)(__m128i)(index), \
-                                 (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i32scatter_ps(addr, index, v1, scale) \
-    __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)-1, \
-                                 (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
-                                 (int)(scale))
-
-#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)(mask), \
-                                 (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
-                                 (int)(scale))
-
-#define _mm256_i32scatter_epi32(addr, index, v1, scale) \
-    __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)-1, \
-                                 (__v8si)(__m256i)(index), \
-                                 (__v8si)(__m256i)(v1), (int)(scale))
-
-#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
-    __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)(mask), \
-                                 (__v8si)(__m256i)(index), \
-                                 (__v8si)(__m256i)(v1), (int)(scale))
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
-    return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                                (__v2df)_mm_sqrt_pd(__A),
-                                                (__v2df)__W);
-  }
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
-    return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                                (__v2df)_mm_sqrt_pd(__A),
-                                                (__v2df)_mm_setzero_pd());
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
-    return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                                (__v4df)_mm256_sqrt_pd(__A),
-                                                (__v4df)__W);
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
-    return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                                (__v4df)_mm256_sqrt_pd(__A),
-                                                (__v4df)_mm256_setzero_pd());
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
-    return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                               (__v4sf)_mm_sqrt_ps(__A),
-                                               (__v4sf)__W);
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
-    return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                               (__v4sf)_mm_sqrt_ps(__A),
-                                               (__v4sf)_mm_setzero_ps());
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
-    return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                               (__v8sf)_mm256_sqrt_ps(__A),
-                                               (__v8sf)__W);
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
-    return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                               (__v8sf)_mm256_sqrt_ps(__A),
-                                               (__v8sf)_mm256_setzero_ps());
-  }
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-    return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                                (__v2df)_mm_sub_pd(__A, __B),
-                                                (__v2df)__W);
-  }
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-    return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                                (__v2df)_mm_sub_pd(__A, __B),
-                                                (__v2df)_mm_setzero_pd());
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-    return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                                (__v4df)_mm256_sub_pd(__A, __B),
-                                                (__v4df)__W);
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-    return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                                (__v4df)_mm256_sub_pd(__A, __B),
-                                                (__v4df)_mm256_setzero_pd());
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-    return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                               (__v4sf)_mm_sub_ps(__A, __B),
-                                               (__v4sf)__W);
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-    return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                               (__v4sf)_mm_sub_ps(__A, __B),
-                                               (__v4sf)_mm_setzero_ps());
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-    return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                               (__v8sf)_mm256_sub_ps(__A, __B),
-                                               (__v8sf)__W);
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-    return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                               (__v8sf)_mm256_sub_ps(__A, __B),
-                                               (__v8sf)_mm256_setzero_ps());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
-    return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
-                                                  (__v4si)__B);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
-                              __m128i __B) {
-    return (__m128i)__builtin_ia32_selectd_128(__U,
-                                    (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
-                                    (__v4si)__A);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
-                               __m128i __B) {
-    return (__m128i)__builtin_ia32_selectd_128(__U,
-                                    (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
-                                    (__v4si)__I);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
-                               __m128i __B) {
-    return (__m128i)__builtin_ia32_selectd_128(__U,
-                                    (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
-                                    (__v4si)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
-    return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
-                                                  (__v8si) __B);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
-                                 __m256i __B) {
-    return (__m256i)__builtin_ia32_selectd_256(__U,
-                                 (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
-                                 (__v8si)__A);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
-                                  __m256i __B) {
-    return (__m256i)__builtin_ia32_selectd_256(__U,
-                                 (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
-                                 (__v8si)__I);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
-                                  __m256i __B) {
-    return (__m256i)__builtin_ia32_selectd_256(__U,
-                                 (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
-                                 (__v8si)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
-    return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
-                                                   (__v2df)__B);
-  }
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
-    return (__m128d)__builtin_ia32_selectpd_128(__U,
-                                       (__v2df)_mm_permutex2var_pd(__A, __I, __B),
-                                       (__v2df)__A);
-  }
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
-    return (__m128d)__builtin_ia32_selectpd_128(__U,
-                                       (__v2df)_mm_permutex2var_pd(__A, __I, __B),
-                                       (__v2df)(__m128d)__I);
-  }
-
-  static __inline__ __m128d __DEFAULT_FN_ATTRS128
-  _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
-    return (__m128d)__builtin_ia32_selectpd_128(__U,
-                                       (__v2df)_mm_permutex2var_pd(__A, __I, __B),
-                                       (__v2df)_mm_setzero_pd());
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
-    return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
-                                                   (__v4df)__B);
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
-                              __m256d __B) {
-    return (__m256d)__builtin_ia32_selectpd_256(__U,
-                                    (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
-                                    (__v4df)__A);
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
-                               __m256d __B) {
-    return (__m256d)__builtin_ia32_selectpd_256(__U,
-                                    (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
-                                    (__v4df)(__m256d)__I);
-  }
-
-  static __inline__ __m256d __DEFAULT_FN_ATTRS256
-  _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
-                               __m256d __B) {
-    return (__m256d)__builtin_ia32_selectpd_256(__U,
-                                    (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
-                                    (__v4df)_mm256_setzero_pd());
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
-    return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
-                                                  (__v4sf)__B);
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
-    return (__m128)__builtin_ia32_selectps_128(__U,
-                                       (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
-                                       (__v4sf)__A);
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
-    return (__m128)__builtin_ia32_selectps_128(__U,
-                                       (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
-                                       (__v4sf)(__m128)__I);
-  }
-
-  static __inline__ __m128 __DEFAULT_FN_ATTRS128
-  _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
-    return (__m128)__builtin_ia32_selectps_128(__U,
-                                       (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
-                                       (__v4sf)_mm_setzero_ps());
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
-    return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
-                                                  (__v8sf) __B);
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
-    return (__m256)__builtin_ia32_selectps_256(__U,
-                                    (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
-                                    (__v8sf)__A);
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
-                               __m256 __B) {
-    return (__m256)__builtin_ia32_selectps_256(__U,
-                                    (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
-                                    (__v8sf)(__m256)__I);
-  }
-
-  static __inline__ __m256 __DEFAULT_FN_ATTRS256
-  _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
-                               __m256 __B) {
-    return (__m256)__builtin_ia32_selectps_256(__U,
-                                    (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
-                                    (__v8sf)_mm256_setzero_ps());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
-    return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
-                                                  (__v2di)__B);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
-                              __m128i __B) {
-    return (__m128i)__builtin_ia32_selectq_128(__U,
-                                    (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
-                                    (__v2di)__A);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
-                               __m128i __B) {
-    return (__m128i)__builtin_ia32_selectq_128(__U,
-                                    (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
-                                    (__v2di)__I);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
-                               __m128i __B) {
-    return (__m128i)__builtin_ia32_selectq_128(__U,
-                                    (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
-                                    (__v2di)_mm_setzero_si128());
-  }
-
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
-    return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
-                                                  (__v4di) __B);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
-                                 __m256i __B) {
-    return (__m256i)__builtin_ia32_selectq_256(__U,
-                                 (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
-                                 (__v4di)__A);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
-                                  __m256i __B) {
-    return (__m256i)__builtin_ia32_selectq_256(__U,
-                                 (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
-                                 (__v4di)__I);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
-                                  __m256i __B) {
-    return (__m256i)__builtin_ia32_selectq_256(__U,
-                                 (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
-                                 (__v4di)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepi8_epi32(__A),
-                                               (__v4si)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepi8_epi32(__A),
-                                               (__v4si)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepi8_epi32(__A),
-                                               (__v8si)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepi8_epi32(__A),
-                                               (__v8si)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepi8_epi64(__A),
-                                               (__v2di)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepi8_epi64(__A),
-                                               (__v2di)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepi8_epi64(__A),
-                                               (__v4di)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepi8_epi64(__A),
-                                               (__v4di)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepi32_epi64(__X),
-                                               (__v2di)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepi32_epi64(__X),
-                                               (__v2di)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepi32_epi64(__X),
-                                               (__v4di)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepi32_epi64(__X),
-                                               (__v4di)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepi16_epi32(__A),
-                                               (__v4si)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepi16_epi32(__A),
-                                               (__v4si)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepi16_epi32(__A),
-                                               (__v8si)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepi16_epi32(__A),
-                                               (__v8si)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepi16_epi64(__A),
-                                               (__v2di)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepi16_epi64(__A),
-                                               (__v2di)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepi16_epi64(__A),
-                                               (__v4di)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepi16_epi64(__A),
-                                               (__v4di)_mm256_setzero_si256());
-  }
-
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepu8_epi32(__A),
-                                               (__v4si)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepu8_epi32(__A),
-                                               (__v4si)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepu8_epi32(__A),
-                                               (__v8si)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepu8_epi32(__A),
-                                               (__v8si)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepu8_epi64(__A),
-                                               (__v2di)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepu8_epi64(__A),
-                                               (__v2di)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepu8_epi64(__A),
-                                               (__v4di)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepu8_epi64(__A),
-                                               (__v4di)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepu32_epi64(__X),
-                                               (__v2di)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepu32_epi64(__X),
-                                               (__v2di)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepu32_epi64(__X),
-                                               (__v4di)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepu32_epi64(__X),
-                                               (__v4di)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepu16_epi32(__A),
-                                               (__v4si)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                               (__v4si)_mm_cvtepu16_epi32(__A),
-                                               (__v4si)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepu16_epi32(__A),
-                                               (__v8si)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                               (__v8si)_mm256_cvtepu16_epi32(__A),
-                                               (__v8si)_mm256_setzero_si256());
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepu16_epi64(__A),
-                                               (__v2di)__W);
-  }
-
-  static __inline__ __m128i __DEFAULT_FN_ATTRS128
-  _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
-  {
-    return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                               (__v2di)_mm_cvtepu16_epi64(__A),
-                                               (__v2di)_mm_setzero_si128());
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepu16_epi64(__A),
-                                               (__v4di)__W);
-  }
-
-  static __inline__ __m256i __DEFAULT_FN_ATTRS256
-  _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
-  {
-    return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                               (__v4di)_mm256_cvtepu16_epi64(__A),
-                                               (__v4di)_mm256_setzero_si256());
-  }
-
-
-#define _mm_rol_epi32(a, b) \
-  ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_rol_epi32(w, u, a, b) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
-                                       (__v4si)_mm_rol_epi32((a), (b)), \
-                                       (__v4si)(__m128i)(w)))
-
-#define _mm_maskz_rol_epi32(u, a, b) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
-                                       (__v4si)_mm_rol_epi32((a), (b)), \
-                                       (__v4si)_mm_setzero_si128()))
-
-#define _mm256_rol_epi32(a, b) \
-  ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_rol_epi32(w, u, a, b) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
-                                       (__v8si)_mm256_rol_epi32((a), (b)), \
-                                       (__v8si)(__m256i)(w)))
-
-#define _mm256_maskz_rol_epi32(u, a, b) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
-                                       (__v8si)_mm256_rol_epi32((a), (b)), \
-                                       (__v8si)_mm256_setzero_si256()))
-
-#define _mm_rol_epi64(a, b) \
-  ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_rol_epi64(w, u, a, b) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
-                                       (__v2di)_mm_rol_epi64((a), (b)), \
-                                       (__v2di)(__m128i)(w)))
-
-#define _mm_maskz_rol_epi64(u, a, b) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
-                                       (__v2di)_mm_rol_epi64((a), (b)), \
-                                       (__v2di)_mm_setzero_si128()))
-
-#define _mm256_rol_epi64(a, b) \
-  ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_rol_epi64(w, u, a, b) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
-                                       (__v4di)_mm256_rol_epi64((a), (b)), \
-                                       (__v4di)(__m256i)(w)))
-
-#define _mm256_maskz_rol_epi64(u, a, b) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
-                                       (__v4di)_mm256_rol_epi64((a), (b)), \
-                                       (__v4di)_mm256_setzero_si256()))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rolv_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                             (__v4si)_mm_rolv_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                             (__v4si)_mm_rolv_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rolv_epi32 (__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                            (__v8si)_mm256_rolv_epi32(__A, __B),
-                                            (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                            (__v8si)_mm256_rolv_epi32(__A, __B),
-                                            (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rolv_epi64 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                             (__v2di)_mm_rolv_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                             (__v2di)_mm_rolv_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rolv_epi64 (__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                            (__v4di)_mm256_rolv_epi64(__A, __B),
-                                            (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                            (__v4di)_mm256_rolv_epi64(__A, __B),
-                                            (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm_ror_epi32(a, b) \
-  ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_ror_epi32(w, u, a, b) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
-                                       (__v4si)_mm_ror_epi32((a), (b)), \
-                                       (__v4si)(__m128i)(w)))
-
-#define _mm_maskz_ror_epi32(u, a, b) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
-                                       (__v4si)_mm_ror_epi32((a), (b)), \
-                                       (__v4si)_mm_setzero_si128()))
-
-#define _mm256_ror_epi32(a, b) \
-  ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_ror_epi32(w, u, a, b) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
-                                       (__v8si)_mm256_ror_epi32((a), (b)), \
-                                       (__v8si)(__m256i)(w)))
-
-#define _mm256_maskz_ror_epi32(u, a, b) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
-                                       (__v8si)_mm256_ror_epi32((a), (b)), \
-                                       (__v8si)_mm256_setzero_si256()))
-
-#define _mm_ror_epi64(a, b) \
-  ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_ror_epi64(w, u, a, b) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
-                                       (__v2di)_mm_ror_epi64((a), (b)), \
-                                       (__v2di)(__m128i)(w)))
-
-#define _mm_maskz_ror_epi64(u, a, b) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
-                                       (__v2di)_mm_ror_epi64((a), (b)), \
-                                       (__v2di)_mm_setzero_si128()))
-
-#define _mm256_ror_epi64(a, b) \
-  ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_ror_epi64(w, u, a, b) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
-                                       (__v4di)_mm256_ror_epi64((a), (b)), \
-                                       (__v4di)(__m256i)(w)))
-
-#define _mm256_maskz_ror_epi64(u, a, b) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
-                                       (__v4di)_mm256_ror_epi64((a), (b)), \
-                                       (__v4di)_mm256_setzero_si256()))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sll_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sll_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sll_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sll_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_slli_epi32(__A, (int)__B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_slli_epi32(__A, (int)__B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_slli_epi32(__A, (int)__B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_slli_epi32(__A, (int)__B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sll_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sll_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_sll_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_sll_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_slli_epi64(__A, (int)__B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_slli_epi64(__A, (int)__B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_slli_epi64(__A, (int)__B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_slli_epi64(__A, (int)__B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rorv_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                             (__v4si)_mm_rorv_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                             (__v4si)_mm_rorv_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rorv_epi32 (__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                            (__v8si)_mm256_rorv_epi32(__A, __B),
-                                            (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                            (__v8si)_mm256_rorv_epi32(__A, __B),
-                                            (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rorv_epi64 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                             (__v2di)_mm_rorv_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                             (__v2di)_mm_rorv_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rorv_epi64 (__m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                            (__v4di)_mm256_rorv_epi64(__A, __B),
-                                            (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                            (__v4di)_mm256_rorv_epi64(__A, __B),
-                                            (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sllv_epi64(__X, __Y),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sllv_epi64(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
-                                            (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
-                                            (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sllv_epi32(__X, __Y),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sllv_epi32(__X, __Y),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
-                                            (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
-                                            (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srlv_epi64(__X, __Y),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srlv_epi64(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
-                                            (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
-                                            (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                            (__v4si)_mm_srlv_epi32(__X, __Y),
-                                            (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                            (__v4si)_mm_srlv_epi32(__X, __Y),
-                                            (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
-                                            (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
-                                            (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srl_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srl_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srl_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srl_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srli_epi32(__A, (int)__B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srli_epi32(__A, (int)__B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srli_epi32(__A, (int)__B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srli_epi32(__A, (int)__B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srl_epi64(__A, __B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srl_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srl_epi64(__A, __B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srl_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srli_epi64(__A, (int)__B),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srli_epi64(__A, (int)__B),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srli_epi64(__A, (int)__B),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srli_epi64(__A, (int)__B),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                            (__v4si)_mm_srav_epi32(__X, __Y),
-                                            (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                            (__v4si)_mm_srav_epi32(__X, __Y),
-                                            (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_srav_epi32(__X, __Y),
-                                            (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                            (__v8si)_mm256_srav_epi32(__X, __Y),
-                                            (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srav_epi64(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srav_epi64(__X, __Y),
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srav_epi64(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srav_epi64(__m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srav_epi64(__X, __Y),
-                                             (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srav_epi64(__X, __Y),
-                                             (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
-                 (__v4si) __A,
-                 (__v4si) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
-                 (__v4si) __A,
-                 (__v4si) _mm_setzero_si128 ());
-}
-
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
-                 (__v8si) __A,
-                 (__v8si) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
-                 (__v8si) __A,
-                 (__v8si) _mm256_setzero_si256 ());
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_load_epi32 (void const *__P)
-{
-  return *(const __m128i *) __P;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
-              (__v4si) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
-              (__v4si)
-              _mm_setzero_si128 (),
-              (__mmask8)
-              __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_load_epi32 (void const *__P)
-{
-  return *(const __m256i *) __P;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
-              (__v8si) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
-              (__v8si)
-              _mm256_setzero_si256 (),
-              (__mmask8)
-              __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_store_epi32 (void *__P, __m128i __A)
-{
-  *(__m128i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
-{
-  __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
-          (__v4si) __A,
-          (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_store_epi32 (void *__P, __m256i __A)
-{
-  *(__m256i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
-{
-  __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
-          (__v8si) __A,
-          (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
-                 (__v2di) __A,
-                 (__v2di) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
-                 (__v2di) __A,
-                 (__v2di) _mm_setzero_si128 ());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
-                 (__v4di) __A,
-                 (__v4di) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
-{
-  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
-                 (__v4di) __A,
-                 (__v4di) _mm256_setzero_si256 ());
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_load_epi64 (void const *__P)
-{
-  return *(const __m128i *) __P;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
-              (__v2di) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
-              (__v2di)
-              _mm_setzero_si128 (),
-              (__mmask8)
-              __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_load_epi64 (void const *__P)
-{
-  return *(const __m256i *) __P;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
-              (__v4di) __W,
-              (__mmask8)
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
-              (__v4di)
-              _mm256_setzero_si256 (),
-              (__mmask8)
-              __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_store_epi64 (void *__P, __m128i __A)
-{
-  *(__m128i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
-{
-  __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
-          (__v2di) __A,
-          (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_store_epi64 (void *__P, __m256i __A)
-{
-  *(__m256i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
-{
-  __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
-          (__v4di) __A,
-          (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_movedup_pd(__A),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_movedup_pd(__A),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_movedup_pd(__A),
-                                              (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                              (__v4df)_mm256_movedup_pd(__A),
-                                              (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
-{
-   return (__m128i)__builtin_ia32_selectd_128(__M,
-                                              (__v4si) _mm_set1_epi32(__A),
-                                              (__v4si)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi32( __mmask8 __M, int __A)
-{
-   return (__m128i)__builtin_ia32_selectd_128(__M,
-                                              (__v4si) _mm_set1_epi32(__A),
-                                              (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
-{
-   return (__m256i)__builtin_ia32_selectd_256(__M,
-                                              (__v8si) _mm256_set1_epi32(__A),
-                                              (__v8si)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi32( __mmask8 __M, int __A)
-{
-   return (__m256i)__builtin_ia32_selectd_256(__M,
-                                              (__v8si) _mm256_set1_epi32(__A),
-                                              (__v8si)_mm256_setzero_si256());
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
-{
-  return (__m128i) __builtin_ia32_selectq_128(__M,
-                                              (__v2di) _mm_set1_epi64x(__A),
-                                              (__v2di) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
-{
-  return (__m128i) __builtin_ia32_selectq_128(__M,
-                                              (__v2di) _mm_set1_epi64x(__A),
-                                              (__v2di) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
-{
-  return (__m256i) __builtin_ia32_selectq_256(__M,
-                                              (__v4di) _mm256_set1_epi64x(__A),
-                                              (__v4di) __O) ;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
-{
-   return (__m256i) __builtin_ia32_selectq_256(__M,
-                                               (__v4di) _mm256_set1_epi64x(__A),
-                                               (__v4di) _mm256_setzero_si256());
-}
-
-#define _mm_fixupimm_pd(A, B, C, imm) \
-  ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2di)(__m128i)(C), (int)(imm), \
-                                              (__mmask8)-1))
-
-#define _mm_mask_fixupimm_pd(A, U, B, C, imm) \
-  ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
-                                              (__v2df)(__m128d)(B), \
-                                              (__v2di)(__m128i)(C), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \
-  ((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \
-                                               (__v2df)(__m128d)(B), \
-                                               (__v2di)(__m128i)(C), \
-                                               (int)(imm), (__mmask8)(U)))
-
-#define _mm256_fixupimm_pd(A, B, C, imm) \
-  ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
-                                              (__v4df)(__m256d)(B), \
-                                              (__v4di)(__m256i)(C), (int)(imm), \
-                                              (__mmask8)-1))
-
-#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \
-  ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
-                                              (__v4df)(__m256d)(B), \
-                                              (__v4di)(__m256i)(C), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \
-  ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \
-                                               (__v4df)(__m256d)(B), \
-                                               (__v4di)(__m256i)(C), \
-                                               (int)(imm), (__mmask8)(U)))
-
-#define _mm_fixupimm_ps(A, B, C, imm) \
-  ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4si)(__m128i)(C), (int)(imm), \
-                                             (__mmask8)-1))
-
-#define _mm_mask_fixupimm_ps(A, U, B, C, imm) \
-  ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
-                                             (__v4sf)(__m128)(B), \
-                                             (__v4si)(__m128i)(C), (int)(imm), \
-                                             (__mmask8)(U)))
-
-#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \
-  ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \
-                                              (__v4sf)(__m128)(B), \
-                                              (__v4si)(__m128i)(C), (int)(imm), \
-                                              (__mmask8)(U)))
-
-#define _mm256_fixupimm_ps(A, B, C, imm) \
-  ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
-                                             (__v8sf)(__m256)(B), \
-                                             (__v8si)(__m256i)(C), (int)(imm), \
-                                             (__mmask8)-1))
-
-#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \
-  ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
-                                             (__v8sf)(__m256)(B), \
-                                             (__v8si)(__m256i)(C), (int)(imm), \
-                                             (__mmask8)(U)))
-
-#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \
-  ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \
-                                              (__v8sf)(__m256)(B), \
-                                              (__v8si)(__m256i)(C), (int)(imm), \
-                                              (__mmask8)(U)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
-{
-  return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
-               (__v2df) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_load_pd (__mmask8 __U, void const *__P)
-{
-  return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
-               (__v2df)
-               _mm_setzero_pd (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
-{
-  return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
-               (__v4df) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
-{
-  return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
-               (__v4df)
-               _mm256_setzero_pd (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
-{
-  return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
-              (__v4sf) __W,
-              (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_load_ps (__mmask8 __U, void const *__P)
-{
-  return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
-              (__v4sf)
-              _mm_setzero_ps (),
-              (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
-{
-  return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
-              (__v8sf) __W,
-              (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
-{
-  return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
-              (__v8sf)
-              _mm256_setzero_ps (),
-              (__mmask8) __U);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi64 (void const *__P)
-{
-  struct __loadu_epi64 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi64*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
-                 (__v2di) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
-                 (__v2di)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi64 (void const *__P)
-{
-  struct __loadu_epi64 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi64*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
-                 (__v4di) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
-                 (__v4di)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi32 (void const *__P)
-{
-  struct __loadu_epi32 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi32*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
-                 (__v4si) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi32 (void const *__P)
-{
-  struct __loadu_epi32 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_epi32*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
-                 (__v8si) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
-{
-  return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
-               (__v2df) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
-{
-  return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
-               (__v2df)
-               _mm_setzero_pd (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
-{
-  return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
-               (__v4df) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
-{
-  return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
-               (__v4df)
-               _mm256_setzero_pd (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
-{
-  return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
-              (__v4sf) __W,
-              (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
-{
-  return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
-              (__v4sf)
-              _mm_setzero_ps (),
-              (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
-{
-  return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
-              (__v8sf) __W,
-              (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
-{
-  return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
-              (__v8sf)
-              _mm256_setzero_ps (),
-              (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
-{
-  __builtin_ia32_storeapd128_mask ((__v2df *) __P,
-           (__v2df) __A,
-           (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
-{
-  __builtin_ia32_storeapd256_mask ((__v4df *) __P,
-           (__v4df) __A,
-           (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
-{
-  __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
-           (__v4sf) __A,
-           (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
-{
-  __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
-           (__v8sf) __A,
-           (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi64 (void *__P, __m128i __A)
-{
-  struct __storeu_epi64 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi64*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
-{
-  __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
-             (__v2di) __A,
-             (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi64 (void *__P, __m256i __A)
-{
-  struct __storeu_epi64 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi64*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
-{
-  __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
-             (__v4di) __A,
-             (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi32 (void *__P, __m128i __A)
-{
-  struct __storeu_epi32 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi32*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
-{
-  __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
-             (__v4si) __A,
-             (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi32 (void *__P, __m256i __A)
-{
-  struct __storeu_epi32 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_epi32*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
-{
-  __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
-             (__v8si) __A,
-             (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
-{
-  __builtin_ia32_storeupd128_mask ((__v2df *) __P,
-           (__v2df) __A,
-           (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
-{
-  __builtin_ia32_storeupd256_mask ((__v4df *) __P,
-           (__v4df) __A,
-           (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
-{
-  __builtin_ia32_storeups128_mask ((__v4sf *) __P,
-           (__v4sf) __A,
-           (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
-{
-  __builtin_ia32_storeups256_mask ((__v8sf *) __P,
-           (__v8sf) __A,
-           (__mmask8) __U);
-}
-
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_unpackhi_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_unpackhi_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
-                                           (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
-                                           (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
-                                           (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
-                                           (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_unpacklo_pd(__A, __B),
-                                              (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                              (__v2df)_mm_unpacklo_pd(__A, __B),
-                                              (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
-                                           (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
-                                           (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
-                                           (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
-                                           (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rcp14_pd (__m128d __A)
-{
-  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
-  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
-                (__v2df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
-{
-  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
-                (__v2df)
-                _mm_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_rcp14_pd (__m256d __A)
-{
-  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
-  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
-                (__v4df) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
-{
-  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
-                (__v4df)
-                _mm256_setzero_pd (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rcp14_ps (__m128 __A)
-{
-  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
-  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
-               (__v4sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
-{
-  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
-               (__v4sf)
-               _mm_setzero_ps (),
-               (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_rcp14_ps (__m256 __A)
-{
-  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
-               (__v8sf)
-               _mm256_setzero_ps (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
-  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
-               (__v8sf) __W,
-               (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
-{
-  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
-               (__v8sf)
-               _mm256_setzero_ps (),
-               (__mmask8) __U);
-}
-
-#define _mm_mask_permute_pd(W, U, X, C) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                        (__v2df)_mm_permute_pd((X), (C)), \
-                                        (__v2df)(__m128d)(W)))
-
-#define _mm_maskz_permute_pd(U, X, C) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                        (__v2df)_mm_permute_pd((X), (C)), \
-                                        (__v2df)_mm_setzero_pd()))
-
-#define _mm256_mask_permute_pd(W, U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                        (__v4df)_mm256_permute_pd((X), (C)), \
-                                        (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_permute_pd(U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                        (__v4df)_mm256_permute_pd((X), (C)), \
-                                        (__v4df)_mm256_setzero_pd()))
-
-#define _mm_mask_permute_ps(W, U, X, C) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                       (__v4sf)_mm_permute_ps((X), (C)), \
-                                       (__v4sf)(__m128)(W)))
-
-#define _mm_maskz_permute_ps(U, X, C) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                       (__v4sf)_mm_permute_ps((X), (C)), \
-                                       (__v4sf)_mm_setzero_ps()))
-
-#define _mm256_mask_permute_ps(W, U, X, C) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                       (__v8sf)_mm256_permute_ps((X), (C)), \
-                                       (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_permute_ps(U, X, C) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                       (__v8sf)_mm256_permute_ps((X), (C)), \
-                                       (__v8sf)_mm256_setzero_ps()))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                            (__v2df)_mm_permutevar_pd(__A, __C),
-                                            (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
-{
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                            (__v2df)_mm_permutevar_pd(__A, __C),
-                                            (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                         (__v4df)_mm256_permutevar_pd(__A, __C),
-                                         (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                         (__v4df)_mm256_permutevar_pd(__A, __C),
-                                         (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                            (__v4sf)_mm_permutevar_ps(__A, __C),
-                                            (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                            (__v4sf)_mm_permutevar_ps(__A, __C),
-                                            (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
-                                          (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
-                                          (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_test_epi32_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B),
-                                     _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_test_epi32_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B),
-                                   _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
-                                        _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_test_epi64_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B),
-                                     _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_test_epi64_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B),
-                                   _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
-                                        _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_testn_epi32_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B),
-                                    _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B),
-                                  _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
-                                       _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_testn_epi64_mask (__m128i __A, __m128i __B)
-{
-  return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B),
-                                    _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
-{
-  return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B),
-                                  _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
-                                       _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                           (__v4si)_mm_unpackhi_epi32(__A, __B),
-                                           (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                           (__v4si)_mm_unpackhi_epi32(__A, __B),
-                                           (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                        (__v8si)_mm256_unpackhi_epi32(__A, __B),
-                                        (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                        (__v8si)_mm256_unpackhi_epi32(__A, __B),
-                                        (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                           (__v2di)_mm_unpackhi_epi64(__A, __B),
-                                           (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                           (__v2di)_mm_unpackhi_epi64(__A, __B),
-                                           (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                        (__v4di)_mm256_unpackhi_epi64(__A, __B),
-                                        (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                        (__v4di)_mm256_unpackhi_epi64(__A, __B),
-                                        (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                           (__v4si)_mm_unpacklo_epi32(__A, __B),
-                                           (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                           (__v4si)_mm_unpacklo_epi32(__A, __B),
-                                           (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                        (__v8si)_mm256_unpacklo_epi32(__A, __B),
-                                        (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                        (__v8si)_mm256_unpacklo_epi32(__A, __B),
-                                        (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                           (__v2di)_mm_unpacklo_epi64(__A, __B),
-                                           (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                           (__v2di)_mm_unpacklo_epi64(__A, __B),
-                                           (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                        (__v4di)_mm256_unpacklo_epi64(__A, __B),
-                                        (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                        (__v4di)_mm256_unpacklo_epi64(__A, __B),
-                                        (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sra_epi32(__A, __B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sra_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sra_epi32(__A, __B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sra_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srai_epi32(__A, (int)__B),
-                                             (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srai_epi32(__A, (int)__B),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srai_epi32(__A, (int)__B),
-                                             (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srai_epi32(__A, (int)__B),
-                                             (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sra_epi64(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                             (__v2di)_mm_sra_epi64(__A, __B), \
-                                             (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                             (__v2di)_mm_sra_epi64(__A, __B), \
-                                             (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi64(__m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                           (__v4di)_mm256_sra_epi64(__A, __B), \
-                                           (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                           (__v4di)_mm256_sra_epi64(__A, __B), \
-                                           (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srai_epi64(__m128i __A, unsigned int __imm)
-{
-  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, (int)__imm);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                           (__v2di)_mm_srai_epi64(__A, __imm), \
-                                           (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                           (__v2di)_mm_srai_epi64(__A, __imm), \
-                                           (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi64(__m256i __A, unsigned int __imm)
-{
-  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, (int)__imm);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A,
-                       unsigned int __imm)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
-                                        (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
-                                        (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm_ternarylogic_epi32(A, B, C, imm)                                   \
-  ((__m128i)__builtin_ia32_pternlogd128_mask(                                  \
-      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C),        \
-      (unsigned char)(imm), (__mmask8)-1))
-
-#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm)                           \
-  ((__m128i)__builtin_ia32_pternlogd128_mask(                                  \
-      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm)                          \
-  ((__m128i)__builtin_ia32_pternlogd128_maskz(                                 \
-      (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm256_ternarylogic_epi32(A, B, C, imm)                                \
-  ((__m256i)__builtin_ia32_pternlogd256_mask(                                  \
-      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C),        \
-      (unsigned char)(imm), (__mmask8)-1))
-
-#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm)                        \
-  ((__m256i)__builtin_ia32_pternlogd256_mask(                                  \
-      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm)                       \
-  ((__m256i)__builtin_ia32_pternlogd256_maskz(                                 \
-      (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm_ternarylogic_epi64(A, B, C, imm)                                   \
-  ((__m128i)__builtin_ia32_pternlogq128_mask(                                  \
-      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C),        \
-      (unsigned char)(imm), (__mmask8)-1))
-
-#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm)                           \
-  ((__m128i)__builtin_ia32_pternlogq128_mask(                                  \
-      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm)                          \
-  ((__m128i)__builtin_ia32_pternlogq128_maskz(                                 \
-      (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm256_ternarylogic_epi64(A, B, C, imm)                                \
-  ((__m256i)__builtin_ia32_pternlogq256_mask(                                  \
-      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C),        \
-      (unsigned char)(imm), (__mmask8)-1))
-
-#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm)                        \
-  ((__m256i)__builtin_ia32_pternlogq256_mask(                                  \
-      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm)                       \
-  ((__m256i)__builtin_ia32_pternlogq256_maskz(                                 \
-      (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C),        \
-      (unsigned char)(imm), (__mmask8)(U)))
-
-#define _mm256_shuffle_f32x4(A, B, imm) \
-  ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \
-                                         (__v8sf)(__m256)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                       (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
-                                       (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                       (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
-                                       (__v8sf)_mm256_setzero_ps()))
-
-#define _mm256_shuffle_f64x2(A, B, imm) \
-  ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \
-                                          (__v4df)(__m256d)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
-                                       (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
-                                       (__v4df)_mm256_setzero_pd()))
-
-#define _mm256_shuffle_i32x4(A, B, imm) \
-  ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \
-                                          (__v8si)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                       (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
-                                       (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                       (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
-                                       (__v8si)_mm256_setzero_si256()))
-
-#define _mm256_shuffle_i64x2(A, B, imm) \
-  ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \
-                                          (__v4di)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                       (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
-                                       (__v4di)(__m256i)(W)))
-
-
-#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                       (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
-                                       (__v4di)_mm256_setzero_si256()))
-
-#define _mm_mask_shuffle_pd(W, U, A, B, M) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                        (__v2df)_mm_shuffle_pd((A), (B), (M)), \
-                                        (__v2df)(__m128d)(W)))
-
-#define _mm_maskz_shuffle_pd(U, A, B, M) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                        (__v2df)_mm_shuffle_pd((A), (B), (M)), \
-                                        (__v2df)_mm_setzero_pd()))
-
-#define _mm256_mask_shuffle_pd(W, U, A, B, M) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                        (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
-                                        (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_shuffle_pd(U, A, B, M) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                        (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
-                                        (__v4df)_mm256_setzero_pd()))
-
-#define _mm_mask_shuffle_ps(W, U, A, B, M) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                       (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
-                                       (__v4sf)(__m128)(W)))
-
-#define _mm_maskz_shuffle_ps(U, A, B, M) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                       (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
-                                       (__v4sf)_mm_setzero_ps()))
-
-#define _mm256_mask_shuffle_ps(W, U, A, B, M) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                       (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
-                                       (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_shuffle_ps(U, A, B, M) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                       (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
-                                       (__v8sf)_mm256_setzero_ps()))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_pd (__m128d __A)
-{
-  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
-                 (__v2df)
-                 _mm_setzero_pd (),
-                 (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
-  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
-                 (__v2df) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
-{
-  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
-                 (__v2df)
-                 _mm_setzero_pd (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_rsqrt14_pd (__m256d __A)
-{
-  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
-                 (__v4df)
-                 _mm256_setzero_pd (),
-                 (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
-  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
-                 (__v4df) __W,
-                 (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
-{
-  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
-                 (__v4df)
-                 _mm256_setzero_pd (),
-                 (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_ps (__m128 __A)
-{
-  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
-                (__v4sf)
-                _mm_setzero_ps (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
-  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
-                (__v4sf) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
-{
-  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
-                (__v4sf)
-                _mm_setzero_ps (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_rsqrt14_ps (__m256 __A)
-{
-  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
-                (__v8sf)
-                _mm256_setzero_ps (),
-                (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
-  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
-                (__v8sf) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
-{
-  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
-                (__v8sf)
-                _mm256_setzero_ps (),
-                (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f32x4(__m128 __A)
-{
-  return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
-                                         0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
-                                            (__v8sf)_mm256_broadcast_f32x4(__A),
-                                            (__v8sf)__O);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
-                                            (__v8sf)_mm256_broadcast_f32x4(__A),
-                                            (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i32x4(__m128i __A)
-{
-  return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
-                                          0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                            (__v8si)_mm256_broadcast_i32x4(__A),
-                                            (__v8si)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                            (__v8si)_mm256_broadcast_i32x4(__A),
-                                            (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
-{
-  return (__m256d)__builtin_ia32_selectpd_256(__M,
-                                              (__v4df) _mm256_broadcastsd_pd(__A),
-                                              (__v4df) __O);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
-{
-  return (__m256d)__builtin_ia32_selectpd_256(__M,
-                                              (__v4df) _mm256_broadcastsd_pd(__A),
-                                              (__v4df) _mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
-{
-  return (__m128)__builtin_ia32_selectps_128(__M,
-                                             (__v4sf) _mm_broadcastss_ps(__A),
-                                             (__v4sf) __O);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
-{
-  return (__m128)__builtin_ia32_selectps_128(__M,
-                                             (__v4sf) _mm_broadcastss_ps(__A),
-                                             (__v4sf) _mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256(__M,
-                                             (__v8sf) _mm256_broadcastss_ps(__A),
-                                             (__v8sf) __O);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256(__M,
-                                             (__v8sf) _mm256_broadcastss_ps(__A),
-                                             (__v8sf) _mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__M,
-                                             (__v4si) _mm_broadcastd_epi32(__A),
-                                             (__v4si) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__M,
-                                             (__v4si) _mm_broadcastd_epi32(__A),
-                                             (__v4si) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__M,
-                                             (__v8si) _mm256_broadcastd_epi32(__A),
-                                             (__v8si) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__M,
-                                             (__v8si) _mm256_broadcastd_epi32(__A),
-                                             (__v8si) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                             (__v2di) _mm_broadcastq_epi64(__A),
-                                             (__v2di) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                             (__v2di) _mm_broadcastq_epi64(__A),
-                                             (__v2di) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                             (__v4di) _mm256_broadcastq_epi64(__A),
-                                             (__v4di) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                             (__v4di) _mm256_broadcastq_epi64(__A),
-                                             (__v4di) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi32_epi8 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
-               (__v16qi)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
-               (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi32_epi8 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
-               (__v16qi)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
-               (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi32_epi16 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
-               (__v8hi)_mm_setzero_si128 (),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
-               (__v8hi)__O,
-               __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
-               (__v8hi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi32_epi16 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
-               (__v8hi)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
-               (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
-               (__v8hi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi64_epi8 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
-               (__v16qi)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
-               (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi64_epi8 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
-               (__v16qi)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
-               (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
-               (__v16qi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi64_epi32 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
-               (__v4si)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
-               (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
-               (__v4si) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi64_epi32 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
-               (__v4si)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
-               (__v4si)__O,
-               __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
-               (__v4si) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi64_epi16 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
-               (__v8hi)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
-               (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
-               (__v8hi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi64_epi16 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
-               (__v8hi)_mm_undefined_si128(),
-               (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
-               (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
-               (__v8hi) _mm_setzero_si128 (),
-               __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi32_epi8 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
-                (__v16qi)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
-                (__v16qi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi32_epi8 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
-                (__v16qi)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
-                (__v16qi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi32_epi16 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
-                (__v8hi)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
-                (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
-                (__v8hi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi32_epi16 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
-                (__v8hi) _mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
-                (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
-                (__v8hi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi64_epi8 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
-                (__v16qi)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
-                (__v16qi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi64_epi8 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
-                (__v16qi)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
-                (__v16qi) __O,
-                __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
-                (__v16qi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi64_epi32 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
-                (__v4si)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
-                (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
-                (__v4si) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi64_epi32 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
-                (__v4si)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
-                (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
-                (__v4si) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi64_epi16 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
-                (__v8hi)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
-                (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
-                (__v8hi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi64_epi16 (__m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
-                (__v8hi)_mm_undefined_si128(),
-                (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
-                (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
-                (__v8hi) _mm_setzero_si128 (),
-                __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi32_epi8 (__m128i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
-      2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
-              (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
-              (__v16qi)
-              _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi8 (__m256i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v8si)__A, __v8qi),
-      (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-      12, 13, 14, 15);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
-              (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
-              (__v16qi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi32_epi16 (__m128i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
-      2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
-              (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
-              (__v8hi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi16 (__m256i __A)
-{
-  return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
-              (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
-              (__v8hi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_storeu_epi16 (void *  __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi8 (__m128i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3,
-      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
-              (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
-              (__v16qi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi8 (__m256i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
-      2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
-              (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
-              (__v16qi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi32 (__m128i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
-              (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
-              (__v4si) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi32 (__m256i __A)
-{
-  return (__m128i)__builtin_convertvector((__v4di)__A, __v4si);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm256_cvtepi64_epi32(__A),
-                                             (__v4si)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm256_cvtepi64_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi16 (__m128i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3,
-      3, 3, 3, 3);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
-              (__v8hi)__O,
-              __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
-              (__v8hi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
-  __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi16 (__m256i __A)
-{
-  return (__m128i)__builtin_shufflevector(
-      __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
-      2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
-              (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
-{
-  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
-              (__v8hi) _mm_setzero_si128 (),
-              __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
-  __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
-}
-
-#define _mm256_extractf32x4_ps(A, imm) \
-  ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
-                                                (int)(imm), \
-                                                (__v4sf)_mm_undefined_ps(), \
-                                                (__mmask8)-1))
-
-#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
-  ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
-                                                (int)(imm), \
-                                                (__v4sf)(__m128)(W), \
-                                                (__mmask8)(U)))
-
-#define _mm256_maskz_extractf32x4_ps(U, A, imm) \
-  ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
-                                                (int)(imm), \
-                                                (__v4sf)_mm_setzero_ps(), \
-                                                (__mmask8)(U)))
-
-#define _mm256_extracti32x4_epi32(A, imm) \
-  ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
-                                                 (int)(imm), \
-                                                 (__v4si)_mm_undefined_si128(), \
-                                                 (__mmask8)-1))
-
-#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
-                                                 (int)(imm), \
-                                                 (__v4si)(__m128i)(W), \
-                                                 (__mmask8)(U)))
-
-#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \
-  ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
-                                                 (int)(imm), \
-                                                 (__v4si)_mm_setzero_si128(), \
-                                                 (__mmask8)(U)))
-
-#define _mm256_insertf32x4(A, B, imm) \
-  ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
-                                          (__v4sf)(__m128)(B), (int)(imm)))
-
-#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
-                                  (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_insertf32x4(U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
-                                  (__v8sf)_mm256_setzero_ps()))
-
-#define _mm256_inserti32x4(A, B, imm) \
-  ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \
-                                           (__v4si)(__m128i)(B), (int)(imm)))
-
-#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
-                                  (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_inserti32x4(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
-                                  (__v8si)_mm256_setzero_si256()))
-
-#define _mm_getmant_pd(A, B, C) \
-  ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)-1))
-
-#define _mm_mask_getmant_pd(W, U, A, B, C) \
-  ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v2df)(__m128d)(W), \
-                                             (__mmask8)(U)))
-
-#define _mm_maskz_getmant_pd(U, A, B, C) \
-  ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)(U)))
-
-#define _mm256_getmant_pd(A, B, C) \
-  ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v4df)_mm256_setzero_pd(), \
-                                             (__mmask8)-1))
-
-#define _mm256_mask_getmant_pd(W, U, A, B, C) \
-  ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v4df)(__m256d)(W), \
-                                             (__mmask8)(U)))
-
-#define _mm256_maskz_getmant_pd(U, A, B, C) \
-  ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
-                                             (int)(((C)<<2) | (B)), \
-                                             (__v4df)_mm256_setzero_pd(), \
-                                             (__mmask8)(U)))
-
-#define _mm_getmant_ps(A, B, C) \
-  ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)-1))
-
-#define _mm_mask_getmant_ps(W, U, A, B, C) \
-  ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v4sf)(__m128)(W), \
-                                            (__mmask8)(U)))
-
-#define _mm_maskz_getmant_ps(U, A, B, C) \
-  ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)(U)))
-
-#define _mm256_getmant_ps(A, B, C) \
-  ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v8sf)_mm256_setzero_ps(), \
-                                            (__mmask8)-1))
-
-#define _mm256_mask_getmant_ps(W, U, A, B, C) \
-  ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v8sf)(__m256)(W), \
-                                            (__mmask8)(U)))
-
-#define _mm256_maskz_getmant_ps(U, A, B, C) \
-  ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
-                                            (int)(((C)<<2) | (B)), \
-                                            (__v8sf)_mm256_setzero_ps(), \
-                                            (__mmask8)(U)))
-
-#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
-  ((__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v2di)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
-  ((__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v2di)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
-  ((__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4di)(__m256i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
-  ((__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4di)(__m256i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
-  ((__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v2di)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
-  ((__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v2di)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
-  ((__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v4di)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
-  ((__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4di)(__m256i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
-  ((__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4si)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
-  ((__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4si)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
-  ((__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4si)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
-  ((__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4si)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
-  ((__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v4si)(__m128i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
-  ((__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v4si)(__m128i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
-  ((__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \
-                                        (void const *)(addr), \
-                                        (__v8si)(__m256i)(index), \
-                                        (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
-  ((__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \
-                                         (void const *)(addr), \
-                                         (__v8si)(__m256i)(index), \
-                                         (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_permutex_pd(X, C) \
-  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C)))
-
-#define _mm256_mask_permutex_pd(W, U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_permutex_pd((X), (C)), \
-                                       (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_permutex_pd(U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                        (__v4df)_mm256_permutex_pd((X), (C)), \
-                                        (__v4df)_mm256_setzero_pd()))
-
-#define _mm256_permutex_epi64(X, C) \
-  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C)))
-
-#define _mm256_mask_permutex_epi64(W, U, X, C) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
-                                      (__v4di)(__m256i)(W)))
-
-#define _mm256_maskz_permutex_epi64(U, X, C) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
-                                      (__v4di)_mm256_setzero_si256()))
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
-{
-  return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
-          __m256d __Y)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                        (__v4df)_mm256_permutexvar_pd(__X, __Y),
-                                        (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
-{
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                        (__v4df)_mm256_permutexvar_pd(__X, __Y),
-                                        (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                     (__v4di)_mm256_permutexvar_epi64(__X, __Y),
-                                     (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
-             __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                     (__v4di)_mm256_permutexvar_epi64(__X, __Y),
-                                     (__v4di)__W);
-}
-
-#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A))
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                        (__v8sf)_mm256_permutexvar_ps(__X, __Y),
-                                        (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                        (__v8sf)_mm256_permutexvar_ps(__X, __Y),
-                                        (__v8sf)_mm256_setzero_ps());
-}
-
-#define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X,
-                              __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                     (__v8si)_mm256_permutexvar_epi32(__X, __Y),
-                                     (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                     (__v8si)_mm256_permutexvar_epi32(__X, __Y),
-                                     (__v8si)_mm256_setzero_si256());
-}
-
-#define _mm_alignr_epi32(A, B, imm) \
-  ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \
-                                     (__v4si)(__m128i)(B), (int)(imm)))
-
-#define _mm_mask_alignr_epi32(W, U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
-                                    (__v4si)(__m128i)(W)))
-
-#define _mm_maskz_alignr_epi32(U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
-                                    (__v4si)_mm_setzero_si128()))
-
-#define _mm256_alignr_epi32(A, B, imm) \
-  ((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \
-                                     (__v8si)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
-                                 (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_alignr_epi32(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
-                                 (__v8si)_mm256_setzero_si256()))
-
-#define _mm_alignr_epi64(A, B, imm) \
-  ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \
-                                     (__v2di)(__m128i)(B), (int)(imm)))
-
-#define _mm_mask_alignr_epi64(W, U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
-                                    (__v2di)(__m128i)(W)))
-
-#define _mm_maskz_alignr_epi64(U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
-                                    (__v2di)_mm_setzero_si128()))
-
-#define _mm256_alignr_epi64(A, B, imm) \
-  ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \
-                                     (__v4di)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
-                                 (__v4di)(__m256i)(W)))
-
-#define _mm256_maskz_alignr_epi64(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
-                                 (__v4di)_mm256_setzero_si256()))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_movehdup_ps(__A),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_movehdup_ps(__A),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_movehdup_ps(__A),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_movehdup_ps(__A),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_moveldup_ps(__A),
-                                             (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
-{
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_moveldup_ps(__A),
-                                             (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_moveldup_ps(__A),
-                                             (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_moveldup_ps(__A),
-                                             (__v8sf)_mm256_setzero_ps());
-}
-
-#define _mm256_mask_shuffle_epi32(W, U, A, I) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                       (__v8si)_mm256_shuffle_epi32((A), (I)), \
-                                       (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_shuffle_epi32(U, A, I) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                       (__v8si)_mm256_shuffle_epi32((A), (I)), \
-                                       (__v8si)_mm256_setzero_si256()))
-
-#define _mm_mask_shuffle_epi32(W, U, A, I) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                       (__v4si)_mm_shuffle_epi32((A), (I)), \
-                                       (__v4si)(__m128i)(W)))
-
-#define _mm_maskz_shuffle_epi32(U, A, I) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                       (__v4si)_mm_shuffle_epi32((A), (I)), \
-                                       (__v4si)_mm_setzero_si128()))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
-  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
-              (__v2df) __A,
-              (__v2df) __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
-{
-  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
-              (__v2df) __A,
-              (__v2df) _mm_setzero_pd ());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
-  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
-              (__v4df) __A,
-              (__v4df) __W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
-{
-  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
-              (__v4df) __A,
-              (__v4df) _mm256_setzero_pd ());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
-  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
-             (__v4sf) __A,
-             (__v4sf) __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
-{
-  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
-             (__v4sf) __A,
-             (__v4sf) _mm_setzero_ps ());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
-  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
-             (__v8sf) __A,
-             (__v8sf) __W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
-{
-  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
-             (__v8sf) __A,
-             (__v8sf) _mm256_setzero_ps ());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
-             (__v4sf) __W,
-             (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
-{
-  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
-             (__v4sf)
-             _mm_setzero_ps (),
-             (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
-{
-  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
-                (__v8sf) __W,
-                (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
-{
-  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
-                (__v8sf)
-                _mm256_setzero_ps (),
-                (__mmask8) __U);
-}
-
-#define _mm_mask_cvt_roundps_ph(W, U, A, I) \
-  ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
-                                          (__v8hi)(__m128i)(W), \
-                                          (__mmask8)(U)))
-
-#define _mm_maskz_cvt_roundps_ph(U, A, I) \
-  ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
-                                          (__v8hi)_mm_setzero_si128(), \
-                                          (__mmask8)(U)))
-
-#define _mm_mask_cvtps_ph  _mm_mask_cvt_roundps_ph
-#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph
-
-#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \
-  ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
-                                             (__v8hi)(__m128i)(W), \
-                                             (__mmask8)(U)))
-
-#define _mm256_maskz_cvt_roundps_ph(U, A, I) \
-  ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
-                                             (__v8hi)_mm_setzero_si128(), \
-                                             (__mmask8)(U)))
-
-#define _mm256_mask_cvtps_ph  _mm256_mask_cvt_roundps_ph
-#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __AVX512VLINTRIN_H */
diff --git a/third_party/intel/clang/avx512vlvbmi2intrin.h b/third_party/intel/clang/avx512vlvbmi2intrin.h
deleted file mode 100644
index 77af2d5cb..000000000
--- a/third_party/intel/clang/avx512vlvbmi2intrin.h
+++ /dev/null
@@ -1,695 +0,0 @@
-/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLVBMI2INTRIN_H
-#define __AVX512VLVBMI2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512vbmi2,no-evex512"),                \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512vbmi2,no-evex512"),                \
-                 __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
-              (__v8hi) __S,
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
-              (__v8hi) _mm_setzero_si128(),
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
-              (__v16qi) __S,
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
-              (__v16qi) _mm_setzero_si128(),
-              __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
-{
-  __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
-              __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
-{
-  __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
-              (__v8hi) __S,
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
-              (__v8hi) _mm_setzero_si128(),
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
-              (__v16qi) __S,
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
-{
-  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
-              (__v16qi) _mm_setzero_si128(),
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
-              (__v8hi) __S,
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
-              (__v8hi) _mm_setzero_si128(),
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
-              (__v16qi) __S,
-              __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
-{
-  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
-              (__v16qi) _mm_setzero_si128(),
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
-              (__v16hi) __S,
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
-              (__v16hi) _mm256_setzero_si256(),
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
-              (__v32qi) __S,
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
-              (__v32qi) _mm256_setzero_si256(),
-              __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
-{
-  __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
-              __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
-{
-  __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
-              (__v16hi) __S,
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
-              (__v16hi) _mm256_setzero_si256(),
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
-              (__v32qi) __S,
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
-{
-  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
-              (__v32qi) _mm256_setzero_si256(),
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
-              (__v16hi) __S,
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
-              (__v16hi) _mm256_setzero_si256(),
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
-              (__v32qi) __S,
-              __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
-{
-  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
-              (__v32qi) _mm256_setzero_si256(),
-              __U);
-}
-
-#define _mm256_shldi_epi64(A, B, I) \
-  ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
-                                      (__v4di)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                     (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
-                                     (__v4di)(__m256i)(S)))
-
-#define _mm256_maskz_shldi_epi64(U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                     (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
-                                     (__v4di)_mm256_setzero_si256()))
-
-#define _mm_shldi_epi64(A, B, I) \
-  ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
-                                      (__v2di)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shldi_epi64(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                       (__v2di)_mm_shldi_epi64((A), (B), (I)), \
-                                       (__v2di)(__m128i)(S)))
-
-#define _mm_maskz_shldi_epi64(U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                       (__v2di)_mm_shldi_epi64((A), (B), (I)), \
-                                       (__v2di)_mm_setzero_si128()))
-
-#define _mm256_shldi_epi32(A, B, I) \
-  ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
-                                      (__v8si)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                     (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
-                                     (__v8si)(__m256i)(S)))
-
-#define _mm256_maskz_shldi_epi32(U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                     (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
-                                     (__v8si)_mm256_setzero_si256()))
-
-#define _mm_shldi_epi32(A, B, I) \
-  ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
-                                      (__v4si)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shldi_epi32(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                       (__v4si)_mm_shldi_epi32((A), (B), (I)), \
-                                       (__v4si)(__m128i)(S)))
-
-#define _mm_maskz_shldi_epi32(U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                       (__v4si)_mm_shldi_epi32((A), (B), (I)), \
-                                       (__v4si)_mm_setzero_si128()))
-
-#define _mm256_shldi_epi16(A, B, I) \
-  ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
-                                      (__v16hi)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                    (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
-                                    (__v16hi)(__m256i)(S)))
-
-#define _mm256_maskz_shldi_epi16(U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                    (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
-                                    (__v16hi)_mm256_setzero_si256()))
-
-#define _mm_shldi_epi16(A, B, I) \
-  ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
-                                      (__v8hi)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shldi_epi16(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
-                                       (__v8hi)(__m128i)(S)))
-
-#define _mm_maskz_shldi_epi16(U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
-                                       (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_shrdi_epi64(A, B, I) \
-  ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
-                                      (__v4di)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                     (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
-                                     (__v4di)(__m256i)(S)))
-
-#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                     (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
-                                     (__v4di)_mm256_setzero_si256()))
-
-#define _mm_shrdi_epi64(A, B, I) \
-  ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
-                                      (__v2di)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                       (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
-                                       (__v2di)(__m128i)(S)))
-
-#define _mm_maskz_shrdi_epi64(U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                       (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
-                                       (__v2di)_mm_setzero_si128()))
-
-#define _mm256_shrdi_epi32(A, B, I) \
-  ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
-                                      (__v8si)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                     (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
-                                     (__v8si)(__m256i)(S)))
-
-#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                     (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
-                                     (__v8si)_mm256_setzero_si256()))
-
-#define _mm_shrdi_epi32(A, B, I) \
-  ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
-                                      (__v4si)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                       (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
-                                       (__v4si)(__m128i)(S)))
-
-#define _mm_maskz_shrdi_epi32(U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                       (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
-                                       (__v4si)_mm_setzero_si128()))
-
-#define _mm256_shrdi_epi16(A, B, I) \
-  ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
-                                      (__v16hi)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                    (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
-                                    (__v16hi)(__m256i)(S)))
-
-#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                    (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
-                                    (__v16hi)_mm256_setzero_si256()))
-
-#define _mm_shrdi_epi16(A, B, I) \
-  ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
-                                      (__v8hi)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
-                                       (__v8hi)(__m128i)(S)))
-
-#define _mm_maskz_shrdi_epi16(U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
-                                       (__v8hi)_mm_setzero_si128()))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B,
-                                             (__v4di)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                      (__v4di)_mm256_shldv_epi64(__A, __B, __C),
-                                      (__v4di)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                      (__v4di)_mm256_shldv_epi64(__A, __B, __C),
-                                      (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B,
-                                             (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                         (__v2di)_mm_shldv_epi64(__A, __B, __C),
-                                         (__v2di)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                         (__v2di)_mm_shldv_epi64(__A, __B, __C),
-                                         (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B,
-                                             (__v8si)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                      (__v8si)_mm256_shldv_epi32(__A, __B, __C),
-                                      (__v8si)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                      (__v8si)_mm256_shldv_epi32(__A, __B, __C),
-                                      (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B,
-                                             (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                         (__v4si)_mm_shldv_epi32(__A, __B, __C),
-                                         (__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                         (__v4si)_mm_shldv_epi32(__A, __B, __C),
-                                         (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B,
-                                             (__v16hi)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__U,
-                                      (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
-                                      (__v16hi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__U,
-                                      (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
-                                      (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B,
-                                             (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__U,
-                                         (__v8hi)_mm_shldv_epi16(__A, __B, __C),
-                                         (__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__U,
-                                         (__v8hi)_mm_shldv_epi16(__A, __B, __C),
-                                         (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B,
-                                             (__v4di)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                      (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
-                                      (__v4di)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__U,
-                                      (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
-                                      (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B,
-                                             (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                         (__v2di)_mm_shrdv_epi64(__A, __B, __C),
-                                         (__v2di)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__U,
-                                         (__v2di)_mm_shrdv_epi64(__A, __B, __C),
-                                         (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B,
-                                             (__v8si)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                      (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
-                                      (__v8si)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                      (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
-                                      (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B,
-                                             (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                         (__v4si)_mm_shrdv_epi32(__A, __B, __C),
-                                         (__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                         (__v4si)_mm_shrdv_epi32(__A, __B, __C),
-                                         (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B,
-                                             (__v16hi)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__U,
-                                     (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
-                                     (__v16hi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)__builtin_ia32_selectw_256(__U,
-                                     (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
-                                     (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B,
-                                             (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__U,
-                                         (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
-                                         (__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_selectw_128(__U,
-                                         (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
-                                         (__v8hi)_mm_setzero_si128());
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avx512vlvnniintrin.h b/third_party/intel/clang/avx512vlvnniintrin.h
deleted file mode 100644
index d1e5cd9d6..000000000
--- a/third_party/intel/clang/avx512vlvnniintrin.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLVNNIINTRIN_H
-#define __AVX512VLVNNIINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
-                 __min_vector_width__(256)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-#define _mm256_dpbusd_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-#define _mm256_dpbusds_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
-///  and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-#define _mm256_dpwssd_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-#define _mm256_dpwssds_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-#define _mm_dpbusd_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-#define _mm_dpbusds_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
-/// and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-#define _mm_dpwssd_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-#define _mm_dpwssds_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
-                                     (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
-                                     (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                    (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
-                                    (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                     (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
-                                     (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
-                                     (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
-                                     (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
-                                    (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_selectd_256(__U,
-                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
-                                    (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
-                                        (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
-                                        (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
-                                       (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
-                                       (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
-                                        (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
-                                        (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
-                                       (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectd_128(__U,
-                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
-                                       (__v4si)_mm_setzero_si128());
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avx512vlvp2intersectintrin.h b/third_party/intel/clang/avx512vlvp2intersectintrin.h
deleted file mode 100644
index 63a31241a..000000000
--- a/third_party/intel/clang/avx512vlvp2intersectintrin.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef _AVX512VLVP2INTERSECT_H
-#define _AVX512VLVP2INTERSECT_H
-
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512vp2intersect,no-evex512"),         \
-                 __min_vector_width__(128)))
-
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl,avx512vp2intersect,no-evex512"),         \
-                 __min_vector_width__(256)))
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between dwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \param __b
-///    A 256-bit vector of [8 x i32]
-/// \param __m0
-///    A pointer point to 8-bit mask
-/// \param __m1
-///    A pointer point to 8-bit mask
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
-  __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between quadwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x i64].
-/// \param __b
-///    A 256-bit vector of [4 x i64]
-/// \param __m0
-///    A pointer point to 8-bit mask
-/// \param __m1
-///    A pointer point to 8-bit mask
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
-  __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between dwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x i32].
-/// \param __b
-///    A 128-bit vector of [4 x i32]
-/// \param __m0
-///    A pointer point to 8-bit mask
-/// \param __m1
-///    A pointer point to 8-bit mask
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
-  __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between quadwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x i64].
-/// \param __b
-///    A 128-bit vector of [2 x i64]
-/// \param __m0
-///    A pointer point to 8-bit mask
-/// \param __m1
-///    A pointer point to 8-bit mask
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
-  __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avx512vnniintrin.h b/third_party/intel/clang/avx512vnniintrin.h
deleted file mode 100644
index 0fb381a12..000000000
--- a/third_party/intel/clang/avx512vnniintrin.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VNNIINTRIN_H
-#define __AVX512VNNIINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vnni,evex512"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
-                                             (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                    (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
-                                    (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                    (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
-                                    (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
-                                              (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                   (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
-                                   (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                   (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
-                                   (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
-                                             (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                    (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
-                                    (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                    (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
-                                    (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
-                                              (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                   (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
-                                   (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                   (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
-                                   (__v16si)_mm512_setzero_si512());
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512vp2intersectintrin.h b/third_party/intel/clang/avx512vp2intersectintrin.h
deleted file mode 100644
index 16552cae3..000000000
--- a/third_party/intel/clang/avx512vp2intersectintrin.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------===
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vp2intersect.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef _AVX512VP2INTERSECT_H
-#define _AVX512VP2INTERSECT_H
-
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vp2intersect,evex512"),                     \
-                 __min_vector_width__(512)))
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between dwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
-///
-/// \param __a
-///    A 512-bit vector of [16 x i32].
-/// \param __b
-///    A 512-bit vector of [16 x i32]
-/// \param __m0
-///    A pointer point to 16-bit mask
-/// \param __m1
-///    A pointer point to 16-bit mask
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) {
-  __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between quadwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
-///
-/// \param __a
-///    A 512-bit vector of [8 x i64].
-/// \param __b
-///    A 512-bit vector of [8 x i64]
-/// \param __m0
-///    A pointer point to 8-bit mask
-/// \param __m1
-///    A pointer point to 8-bit mask
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) {
-  __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512vpopcntdqintrin.h b/third_party/intel/clang/avx512vpopcntdqintrin.h
deleted file mode 100644
index e73e7e4f7..000000000
--- a/third_party/intel/clang/avx512vpopcntdqintrin.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VPOPCNTDQINTRIN_H
-#define __AVX512VPOPCNTDQINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vpopcntdq,evex512"),                        \
-                 __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
-  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  return (__m512i)__builtin_ia32_selectq_512(
-      (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
-  return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
-  return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  return (__m512i)__builtin_ia32_selectd_512(
-      (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
-  return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/avx512vpopcntdqvlintrin.h b/third_party/intel/clang/avx512vpopcntdqvlintrin.h
deleted file mode 100644
index b2df2e84d..000000000
--- a/third_party/intel/clang/avx512vpopcntdqvlintrin.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VPOPCNTDQVLINTRIN_H
-#define __AVX512VPOPCNTDQVLINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vpopcntdq,avx512vl,no-evex512"),            \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vpopcntdq,avx512vl,no-evex512"),            \
-                 __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi64(__m128i __A) {
-  return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectq_128(
-      (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
-  return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi32(__m128i __A) {
-  return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectd_128(
-      (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
-  return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi64(__m256i __A) {
-  return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectq_256(
-      (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
-  return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi32(__m256i __A) {
-  return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectd_256(
-      (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
-  return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
diff --git a/third_party/intel/clang/avxifmaintrin.h b/third_party/intel/clang/avxifmaintrin.h
deleted file mode 100644
index 5c782d2a5..000000000
--- a/third_party/intel/clang/avxifmaintrin.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVXIFMAINTRIN_H
-#define __AVXIFMAINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
-                 __min_vector_width__(256)))
-
-// must vex-encoding
-
-/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
-/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
-/// unsigned integer from the intermediate result with the corresponding
-/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i
-/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
-///
-/// \return
-/// 	return __m128i dst.
-/// \param __X
-/// 	A 128-bit vector of [2 x i64]
-/// \param __Y
-/// 	A 128-bit vector of [2 x i64]
-/// \param __Z
-/// 	A 128-bit vector of [2 x i64]
-///
-/// \code{.operation}
-/// FOR j := 0 to 1
-/// 	i := j*64
-/// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
-/// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
-  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
-                                                (__v2di)__Z);
-}
-
-/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
-/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
-/// unsigned integer from the intermediate result with the corresponding
-/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i
-/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
-///
-/// \return
-/// 	return __m256i dst.
-/// \param __X
-/// 	A 256-bit vector of [4 x i64]
-/// \param __Y
-/// 	A 256-bit vector of [4 x i64]
-/// \param __Z
-/// 	A 256-bit vector of [4 x i64]
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	i := j*64
-/// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
-/// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
-  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
-                                                (__v4di)__Z);
-}
-
-/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
-/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
-/// unsigned integer from the intermediate result with the corresponding
-/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i
-/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
-///
-/// \return
-/// 	return __m128i dst.
-/// \param __X
-/// 	A 128-bit vector of [2 x i64]
-/// \param __Y
-/// 	A 128-bit vector of [2 x i64]
-/// \param __Z
-/// 	A 128-bit vector of [2 x i64]
-///
-/// \code{.operation}
-/// FOR j := 0 to 1
-/// 	i := j*64
-/// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
-/// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
-  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
-                                                (__v2di)__Z);
-}
-
-/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
-/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
-/// unsigned integer from the intermediate result with the corresponding
-/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i
-/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
-///
-/// \return
-/// 	return __m256i dst.
-/// \param __X
-/// 	A 256-bit vector of [4 x i64]
-/// \param __Y
-/// 	A 256-bit vector of [4 x i64]
-/// \param __Z
-/// 	A 256-bit vector of [4 x i64]
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	i := j*64
-/// 	tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
-/// 	dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
-  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
-                                                (__v4di)__Z);
-}
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif // __AVXIFMAINTRIN_H
diff --git a/third_party/intel/clang/avxintrin.h b/third_party/intel/clang/avxintrin.h
deleted file mode 100644
index 4983f3311..000000000
--- a/third_party/intel/clang/avxintrin.h
+++ /dev/null
@@ -1,5126 +0,0 @@
-/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVXINTRIN_H
-#define __AVXINTRIN_H
-
-typedef double __v4df __attribute__ ((__vector_size__ (32)));
-typedef float __v8sf __attribute__ ((__vector_size__ (32)));
-typedef long long __v4di __attribute__ ((__vector_size__ (32)));
-typedef int __v8si __attribute__ ((__vector_size__ (32)));
-typedef short __v16hi __attribute__ ((__vector_size__ (32)));
-typedef char __v32qi __attribute__ ((__vector_size__ (32)));
-
-/* Unsigned types */
-typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
-typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
-typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
-typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
-
-/* We need an explicitly signed variant for char. Note that this shouldn't
- * appear in the interface though. */
-typedef signed char __v32qs __attribute__((__vector_size__(32)));
-
-typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
-typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
-typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
-
-typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
-typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
-typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
-
-#ifdef __SSE2__
-/* Both _Float16 and __bf16 require SSE2 being enabled. */
-typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
-
-typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
-                 __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
-                 __min_vector_width__(128)))
-
-/* Arithmetic */
-/// Adds two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the sums of both
-///    operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_add_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)((__v4df)__a+(__v4df)__b);
-}
-
-/// Adds two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the sums of both
-///    operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_add_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)((__v8sf)__a+(__v8sf)__b);
-}
-
-/// Subtracts two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the minuend.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing the subtrahend.
-/// \returns A 256-bit vector of [4 x double] containing the differences between
-///    both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sub_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)((__v4df)__a-(__v4df)__b);
-}
-
-/// Subtracts two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the minuend.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing the subtrahend.
-/// \returns A 256-bit vector of [8 x float] containing the differences between
-///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sub_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)((__v8sf)__a-(__v8sf)__b);
-}
-
-/// Adds the even-indexed values and subtracts the odd-indexed values of
-///    two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the left source operand.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing the right source operand.
-/// \returns A 256-bit vector of [4 x double] containing the alternating sums
-///    and differences between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_addsub_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Adds the even-indexed values and subtracts the odd-indexed values of
-///    two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the left source operand.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing the right source operand.
-/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
-///    differences between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_addsub_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Divides two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the dividend.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing the divisor.
-/// \returns A 256-bit vector of [4 x double] containing the quotients of both
-///    operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_div_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)((__v4df)__a/(__v4df)__b);
-}
-
-/// Divides two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the dividend.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing the divisor.
-/// \returns A 256-bit vector of [8 x float] containing the quotients of both
-///    operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_div_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)((__v8sf)__a/(__v8sf)__b);
-}
-
-/// Compares two 256-bit vectors of [4 x double] and returns the greater
-///    of each pair of values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the operands.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the operands.
-/// \returns A 256-bit vector of [4 x double] containing the maximum values
-///    between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_max_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Compares two 256-bit vectors of [8 x float] and returns the greater
-///    of each pair of values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the operands.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the operands.
-/// \returns A 256-bit vector of [8 x float] containing the maximum values
-///    between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_max_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Compares two 256-bit vectors of [4 x double] and returns the lesser
-///    of each pair of values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the operands.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the operands.
-/// \returns A 256-bit vector of [4 x double] containing the minimum values
-///    between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_min_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Compares two 256-bit vectors of [8 x float] and returns the lesser
-///    of each pair of values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the operands.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the operands.
-/// \returns A 256-bit vector of [8 x float] containing the minimum values
-///    between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_min_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Multiplies two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the operands.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the operands.
-/// \returns A 256-bit vector of [4 x double] containing the products of both
-///    operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_mul_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)((__v4df)__a * (__v4df)__b);
-}
-
-/// Multiplies two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the operands.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the operands.
-/// \returns A 256-bit vector of [8 x float] containing the products of both
-///    operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_mul_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)((__v8sf)__a * (__v8sf)__b);
-}
-
-/// Calculates the square roots of the values in a 256-bit vector of
-///    [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the square roots of the
-///    values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
-  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
-}
-
-/// Calculates the square roots of the values in a 256-bit vector of
-///    [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the square roots of the
-///    values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
-  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
-}
-
-/// Calculates the reciprocal square roots of the values in a 256-bit
-///    vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
-///    roots of the values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_rsqrt_ps(__m256 __a)
-{
-  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
-}
-
-/// Calculates the reciprocals of the values in a 256-bit vector of
-///    [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
-///    values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_rcp_ps(__m256 __a)
-{
-  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
-}
-
-/// Rounds the values in a 256-bit vector of [4 x double] as specified
-///    by the byte operand. The source values are rounded to integer values and
-///    returned as 64-bit double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_round_pd(__m256d V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [4 x double].
-/// \param M
-///    An integer value that specifies the rounding operation. \n
-///    Bits [7:4] are reserved. \n
-///    Bit [3] is a precision exception value: \n
-///      0: A normal PE exception is used. \n
-///      1: The PE field is not updated. \n
-///    Bit [2] is the rounding control source: \n
-///      0: Use bits [1:0] of \a M. \n
-///      1: Use the current MXCSR setting. \n
-///    Bits [1:0] contain the rounding control definition: \n
-///      00: Nearest. \n
-///      01: Downward (toward negative infinity). \n
-///      10: Upward (toward positive infinity). \n
-///      11: Truncated.
-/// \returns A 256-bit vector of [4 x double] containing the rounded values.
-#define _mm256_round_pd(V, M) \
-  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
-
-/// Rounds the values stored in a 256-bit vector of [8 x float] as
-///    specified by the byte operand. The source values are rounded to integer
-///    values and returned as floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_round_ps(__m256 V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [8 x float].
-/// \param M
-///    An integer value that specifies the rounding operation. \n
-///    Bits [7:4] are reserved. \n
-///    Bit [3] is a precision exception value: \n
-///      0: A normal PE exception is used. \n
-///      1: The PE field is not updated. \n
-///    Bit [2] is the rounding control source: \n
-///      0: Use bits [1:0] of \a M. \n
-///      1: Use the current MXCSR setting. \n
-///    Bits [1:0] contain the rounding control definition: \n
-///      00: Nearest. \n
-///      01: Downward (toward negative infinity). \n
-///      10: Upward (toward positive infinity). \n
-///      11: Truncated.
-/// \returns A 256-bit vector of [8 x float] containing the rounded values.
-#define _mm256_round_ps(V, M) \
-  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
-
-/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
-///    source values are rounded up to integer values and returned as 64-bit
-///    double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_ceil_pd(__m256d V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
-#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
-
-/// Rounds down the values stored in a 256-bit vector of [4 x double].
-///    The source values are rounded down to integer values and returned as
-///    64-bit double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_floor_pd(__m256d V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the rounded down
-///    values.
-#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
-
-/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
-///    source values are rounded up to integer values and returned as
-///    floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_ceil_ps(__m256 V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
-#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
-
-/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
-///    source values are rounded down to integer values and returned as
-///    floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_floor_ps(__m256 V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
-#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
-
-/* Logical */
-/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
-///    values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_and_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)((__v4du)__a & (__v4du)__b);
-}
-
-/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
-///    values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_and_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)((__v8su)__a & (__v8su)__b);
-}
-
-/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
-///    the one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the left source operand. The
-///    one's complement of this value is used in the bitwise AND.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing the right source operand.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
-///    values of the second operand and the one's complement of the first
-///    operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_andnot_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)(~(__v4du)__a & (__v4du)__b);
-}
-
-/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
-///    the one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the left source operand. The
-///    one's complement of this value is used in the bitwise AND.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing the right source operand.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
-///    values of the second operand and the one's complement of the first
-///    operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_andnot_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)(~(__v8su)__a & (__v8su)__b);
-}
-
-/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VORPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
-///    values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_or_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)((__v4du)__a | (__v4du)__b);
-}
-
-/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VORPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
-///    values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_or_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)((__v8su)__a | (__v8su)__b);
-}
-
-/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
-///    values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_xor_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)((__v4du)__a ^ (__v4du)__b);
-}
-
-/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
-///    values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_xor_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)((__v8su)__a ^ (__v8su)__b);
-}
-
-/* Horizontal arithmetic */
-/// Horizontally adds the adjacent pairs of values contained in two
-///    256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-///    The horizontal sums of the values are returned in the even-indexed
-///    elements of a vector of [4 x double].
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-///    The horizontal sums of the values are returned in the odd-indexed
-///    elements of a vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
-///    both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_hadd_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in two
-///    256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-///    The horizontal sums of the values are returned in the elements with
-///    index 0, 1, 4, 5 of a vector of [8 x float].
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-///    The horizontal sums of the values are returned in the elements with
-///    index 2, 3, 6, 7 of a vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
-///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_hadd_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in two
-///    256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-///    The horizontal differences between the values are returned in the
-///    even-indexed elements of a vector of [4 x double].
-/// \param __b
-///    A 256-bit vector of [4 x double] containing one of the source operands.
-///    The horizontal differences between the values are returned in the
-///    odd-indexed elements of a vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the horizontal
-///    differences of both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_hsub_pd(__m256d __a, __m256d __b)
-{
-  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in two
-///    256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-///    The horizontal differences between the values are returned in the
-///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
-/// \param __b
-///    A 256-bit vector of [8 x float] containing one of the source operands.
-///    The horizontal differences between the values are returned in the
-///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the horizontal
-///    differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_hsub_ps(__m256 __a, __m256 __b)
-{
-  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/* Vector permutations */
-/// Copies the values in a 128-bit vector of [2 x double] as specified
-///    by the 128-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __c
-///    A 128-bit integer vector operand specifying how the values are to be
-///    copied. \n
-///    Bit [1]: \n
-///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-///         vector. \n
-///      1: Bits [127:64] of the source are copied to bits [63:0] of the
-///         returned vector. \n
-///    Bit [65]: \n
-///      0: Bits [63:0] of the source are copied to bits [127:64] of the
-///         returned vector. \n
-///      1: Bits [127:64] of the source are copied to bits [127:64] of the
-///         returned vector.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-static __inline __m128d __DEFAULT_FN_ATTRS128
-_mm_permutevar_pd(__m128d __a, __m128i __c)
-{
-  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
-}
-
-/// Copies the values in a 256-bit vector of [4 x double] as specified
-///    by the 256-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \param __c
-///    A 256-bit integer vector operand specifying how the values are to be
-///    copied. \n
-///    Bit [1]: \n
-///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-///         vector. \n
-///      1: Bits [127:64] of the source are copied to bits [63:0] of the
-///         returned vector. \n
-///    Bit [65]: \n
-///      0: Bits [63:0] of the source are copied to bits [127:64] of the
-///         returned vector. \n
-///      1: Bits [127:64] of the source are copied to bits [127:64] of the
-///         returned vector. \n
-///    Bit [129]: \n
-///      0: Bits [191:128] of the source are copied to bits [191:128] of the
-///         returned vector. \n
-///      1: Bits [255:192] of the source are copied to bits [191:128] of the
-///         returned vector. \n
-///    Bit [193]: \n
-///      0: Bits [191:128] of the source are copied to bits [255:192] of the
-///         returned vector. \n
-///      1: Bits [255:192] of the source are copied to bits [255:192] of the
-///    returned vector.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_permutevar_pd(__m256d __a, __m256i __c)
-{
-  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
-}
-
-/// Copies the values stored in a 128-bit vector of [4 x float] as
-///    specified by the 128-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __c
-///    A 128-bit integer vector operand specifying how the values are to be
-///    copied. \n
-///    Bits [1:0]: \n
-///      00: Bits [31:0] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///    Bits [33:32]: \n
-///      00: Bits [31:0] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///    Bits [65:64]: \n
-///      00: Bits [31:0] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///    Bits [97:96]: \n
-///      00: Bits [31:0] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [127:96] of the
-///          returned vector.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_permutevar_ps(__m128 __a, __m128i __c)
-{
-  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
-}
-
-/// Copies the values stored in a 256-bit vector of [8 x float] as
-///    specified by the 256-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \param __c
-///    A 256-bit integer vector operand specifying how the values are to be
-///    copied. \n
-///    Bits [1:0]: \n
-///      00: Bits [31:0] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///    Bits [33:32]: \n
-///      00: Bits [31:0] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///    Bits [65:64]: \n
-///      00: Bits [31:0] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///    Bits [97:96]: \n
-///      00: Bits [31:0] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///    Bits [129:128]: \n
-///      00: Bits [159:128] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///    Bits [161:160]: \n
-///      00: Bits [159:128] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///    Bits [193:192]: \n
-///      00: Bits [159:128] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///    Bits [225:224]: \n
-///      00: Bits [159:128] of the source are copied to bits [255:224] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [255:224] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [255:224] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [255:224] of the
-///          returned vector.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_permutevar_ps(__m256 __a, __m256i __c)
-{
-  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
-}
-
-/// Copies the values in a 128-bit vector of [2 x double] as specified
-///    by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_permute_pd(__m128d A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param A
-///    A 128-bit vector of [2 x double].
-/// \param C
-///    An immediate integer operand specifying how the values are to be
-///    copied. \n
-///    Bit [0]: \n
-///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-///         vector. \n
-///      1: Bits [127:64] of the source are copied to bits [63:0] of the
-///         returned vector. \n
-///    Bit [1]: \n
-///      0: Bits [63:0] of the source are copied to bits [127:64] of the
-///         returned vector. \n
-///      1: Bits [127:64] of the source are copied to bits [127:64] of the
-///         returned vector.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-#define _mm_permute_pd(A, C) \
-  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
-
-/// Copies the values in a 256-bit vector of [4 x double] as specified by
-///    the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_permute_pd(__m256d A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param A
-///    A 256-bit vector of [4 x double].
-/// \param C
-///    An immediate integer operand specifying how the values are to be
-///    copied. \n
-///    Bit [0]: \n
-///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-///         vector. \n
-///      1: Bits [127:64] of the source are copied to bits [63:0] of the
-///         returned vector. \n
-///    Bit [1]: \n
-///      0: Bits [63:0] of the source are copied to bits [127:64] of the
-///         returned vector. \n
-///      1: Bits [127:64] of the source are copied to bits [127:64] of the
-///         returned vector. \n
-///    Bit [2]: \n
-///      0: Bits [191:128] of the source are copied to bits [191:128] of the
-///         returned vector. \n
-///      1: Bits [255:192] of the source are copied to bits [191:128] of the
-///         returned vector. \n
-///    Bit [3]: \n
-///      0: Bits [191:128] of the source are copied to bits [255:192] of the
-///         returned vector. \n
-///      1: Bits [255:192] of the source are copied to bits [255:192] of the
-///         returned vector.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_permute_pd(A, C) \
-  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
-
-/// Copies the values in a 128-bit vector of [4 x float] as specified by
-///    the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_permute_ps(__m128 A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param A
-///    A 128-bit vector of [4 x float].
-/// \param C
-///    An immediate integer operand specifying how the values are to be
-///    copied. \n
-///    Bits [1:0]: \n
-///      00: Bits [31:0] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///    Bits [3:2]: \n
-///      00: Bits [31:0] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///    Bits [5:4]: \n
-///      00: Bits [31:0] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///    Bits [7:6]: \n
-///      00: Bits [31:0] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [127:96] of the
-///          returned vector.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-#define _mm_permute_ps(A, C) \
-  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
-
-/// Copies the values in a 256-bit vector of [8 x float] as specified by
-///    the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_permute_ps(__m256 A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param A
-///    A 256-bit vector of [8 x float].
-/// \param C
-///    An immediate integer operand specifying how the values are to be
-///    copied. \n
-///    Bits [1:0]: \n
-///      00: Bits [31:0] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [31:0] of the
-///          returned vector. \n
-///    Bits [3:2]: \n
-///      00: Bits [31:0] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [63:32] of the
-///          returned vector. \n
-///    Bits [5:4]: \n
-///      00: Bits [31:0] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [95:64] of the
-///          returned vector. \n
-///    Bits [7:6]: \n
-///      00: Bits [31:0] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      01: Bits [63:32] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      10: Bits [95:64] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///      11: Bits [127:96] of the source are copied to bits [127:96] of the
-///          returned vector. \n
-///    Bits [1:0]: \n
-///      00: Bits [159:128] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [159:128] of the
-///          returned vector. \n
-///    Bits [3:2]: \n
-///      00: Bits [159:128] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [191:160] of the
-///          returned vector. \n
-///    Bits [5:4]: \n
-///      00: Bits [159:128] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [223:192] of the
-///          returned vector. \n
-///    Bits [7:6]: \n
-///      00: Bits [159:128] of the source are copied to bits [255:224] of the
-///          returned vector. \n
-///      01: Bits [191:160] of the source are copied to bits [255:224] of the
-///          returned vector. \n
-///      10: Bits [223:192] of the source are copied to bits [255:224] of the
-///          returned vector. \n
-///      11: Bits [255:224] of the source are copied to bits [255:224] of the
-///          returned vector.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_permute_ps(A, C) \
-  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
-
-/// Permutes 128-bit data values stored in two 256-bit vectors of
-///    [4 x double], as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
-///
-/// \param V1
-///    A 256-bit vector of [4 x double].
-/// \param V2
-///    A 256-bit vector of [4 x double.
-/// \param M
-///    An immediate integer operand specifying how the values are to be
-///    permuted. \n
-///    Bits [1:0]: \n
-///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
-///          destination. \n
-///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
-///          destination. \n
-///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
-///          destination. \n
-///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
-///          destination. \n
-///    Bits [5:4]: \n
-///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
-///          destination. \n
-///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
-///          destination. \n
-///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
-///          destination. \n
-///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
-///          destination.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_permute2f128_pd(V1, V2, M) \
-  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
-                                            (__v4df)(__m256d)(V2), (int)(M)))
-
-/// Permutes 128-bit data values stored in two 256-bit vectors of
-///    [8 x float], as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
-///
-/// \param V1
-///    A 256-bit vector of [8 x float].
-/// \param V2
-///    A 256-bit vector of [8 x float].
-/// \param M
-///    An immediate integer operand specifying how the values are to be
-///    permuted. \n
-///    Bits [1:0]: \n
-///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
-///    destination. \n
-///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
-///    destination. \n
-///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
-///    destination. \n
-///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
-///    destination. \n
-///    Bits [5:4]: \n
-///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
-///    destination. \n
-///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
-///    destination. \n
-///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
-///    destination. \n
-///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
-///    destination.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_permute2f128_ps(V1, V2, M) \
-  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
-                                           (__v8sf)(__m256)(V2), (int)(M)))
-
-/// Permutes 128-bit data values stored in two 256-bit integer vectors,
-///    as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
-///
-/// \param V1
-///    A 256-bit integer vector.
-/// \param V2
-///    A 256-bit integer vector.
-/// \param M
-///    An immediate integer operand specifying how the values are to be copied.
-///    Bits [1:0]: \n
-///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
-///    destination. \n
-///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
-///    destination. \n
-///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
-///    destination. \n
-///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
-///    destination. \n
-///    Bits [5:4]: \n
-///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
-///    destination. \n
-///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
-///    destination. \n
-///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
-///    destination. \n
-///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
-///    destination.
-/// \returns A 256-bit integer vector containing the copied values.
-#define _mm256_permute2f128_si256(V1, V2, M) \
-  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
-                                            (__v8si)(__m256i)(V2), (int)(M)))
-
-/* Vector Blend */
-/// Merges 64-bit double-precision data values stored in either of the
-///    two 256-bit vectors of [4 x double], as specified by the immediate
-///    integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
-///
-/// \param V1
-///    A 256-bit vector of [4 x double].
-/// \param V2
-///    A 256-bit vector of [4 x double].
-/// \param M
-///    An immediate integer operand, with mask bits [3:0] specifying how the
-///    values are to be copied. The position of the mask bit corresponds to the
-///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
-///    element in operand \a V1 is copied to the same position in the
-///    destination. When a mask bit is 1, the corresponding 64-bit element in
-///    operand \a V2 is copied to the same position in the destination.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_blend_pd(V1, V2, M) \
-  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
-                                      (__v4df)(__m256d)(V2), (int)(M)))
-
-/// Merges 32-bit single-precision data values stored in either of the
-///    two 256-bit vectors of [8 x float], as specified by the immediate
-///    integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
-///
-/// \param V1
-///    A 256-bit vector of [8 x float].
-/// \param V2
-///    A 256-bit vector of [8 x float].
-/// \param M
-///    An immediate integer operand, with mask bits [7:0] specifying how the
-///    values are to be copied. The position of the mask bit corresponds to the
-///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
-///    element in operand \a V1 is copied to the same position in the
-///    destination. When a mask bit is 1, the corresponding 32-bit element in
-///    operand \a V2 is copied to the same position in the destination.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_blend_ps(V1, V2, M) \
-  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
-                                     (__v8sf)(__m256)(V2), (int)(M)))
-
-/// Merges 64-bit double-precision data values stored in either of the
-///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
-///    operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \param __b
-///    A 256-bit vector of [4 x double].
-/// \param __c
-///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
-///    how the values are to be copied. The position of the mask bit corresponds
-///    to the most significant bit of a copied value. When a mask bit is 0, the
-///    corresponding 64-bit element in operand \a __a is copied to the same
-///    position in the destination. When a mask bit is 1, the corresponding
-///    64-bit element in operand \a __b is copied to the same position in the
-///    destination.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
-{
-  return (__m256d)__builtin_ia32_blendvpd256(
-    (__v4df)__a, (__v4df)__b, (__v4df)__c);
-}
-
-/// Merges 32-bit single-precision data values stored in either of the
-///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
-///    operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \param __b
-///    A 256-bit vector of [8 x float].
-/// \param __c
-///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
-///    and 31 specifying how the values are to be copied. The position of the
-///    mask bit corresponds to the most significant bit of a copied value. When
-///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
-///    copied to the same position in the destination. When a mask bit is 1, the
-///    corresponding 32-bit element in operand \a __b is copied to the same
-///    position in the destination.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
-{
-  return (__m256)__builtin_ia32_blendvps256(
-    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
-}
-
-/* Vector Dot Product */
-/// Computes two dot products in parallel, using the lower and upper
-///    halves of two [8 x float] vectors as input to the two computations, and
-///    returning the two dot products in the lower and upper halves of the
-///    [8 x float] result.
-///
-///    The immediate integer operand controls which input elements will
-///    contribute to the dot product, and where the final results are returned.
-///    In general, for each dot product, the four corresponding elements of the
-///    input vectors are multiplied; the first two and second two products are
-///    summed, then the two sums are added to form the final result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
-///
-/// \param V1
-///    A vector of [8 x float] values, treated as two [4 x float] vectors.
-/// \param V2
-///    A vector of [8 x float] values, treated as two [4 x float] vectors.
-/// \param M
-///    An immediate integer argument. Bits [7:4] determine which elements of
-///    the input vectors are used, with bit [4] corresponding to the lowest
-///    element and bit [7] corresponding to the highest element of each [4 x
-///    float] subvector. If a bit is set, the corresponding elements from the
-///    two input vectors are used as an input for dot product; otherwise that
-///    input is treated as zero. Bits [3:0] determine which elements of the
-///    result will receive a copy of the final dot product, with bit [0]
-///    corresponding to the lowest element and bit [3] corresponding to the
-///    highest element of each [4 x float] subvector. If a bit is set, the dot
-///    product is returned in the corresponding element; otherwise that element
-///    is set to zero. The bitmask is applied in the same way to each of the
-///    two parallel dot product computations.
-/// \returns A 256-bit vector of [8 x float] containing the two dot products.
-#define _mm256_dp_ps(V1, V2, M) \
-  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
-                                  (__v8sf)(__m256)(V2), (M)))
-
-/* Vector shuffle */
-/// Selects 8 float values from the 256-bit operands of [8 x float], as
-///    specified by the immediate value operand.
-///
-///    The four selected elements in each operand are copied to the destination
-///    according to the bits specified in the immediate operand. The selected
-///    elements from the first 256-bit operand are copied to bits [63:0] and
-///    bits [191:128] of the destination, and the selected elements from the
-///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
-///    the destination. For example, if bits [7:0] of the immediate operand
-///    contain a value of 0xFF, the 256-bit destination vector would contain the
-///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
-///
-/// \param a
-///    A 256-bit vector of [8 x float]. The four selected elements in this
-///    operand are copied to bits [63:0] and bits [191:128] in the destination,
-///    according to the bits specified in the immediate operand.
-/// \param b
-///    A 256-bit vector of [8 x float]. The four selected elements in this
-///    operand are copied to bits [127:64] and bits [255:192] in the
-///    destination, according to the bits specified in the immediate operand.
-/// \param mask
-///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from \a a and \a b \n.
-///    Bits [3:0] specify the values copied from operand \a a. \n
-///    Bits [7:4] specify the values copied from operand \a b. \n
-///    The destinations within the 256-bit destination are assigned values as
-///    follows, according to the bit value assignments described below: \n
-///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
-///    destination. \n
-///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
-///    destination. \n
-///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
-///    destination. \n
-///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
-///    the destination. \n
-///    Bit value assignments: \n
-///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
-///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
-///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
-///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
-///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
-///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
-///    <c>[b6, b4, b2, b0]</c>.
-/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
-#define _mm256_shuffle_ps(a, b, mask) \
-  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
-                                    (__v8sf)(__m256)(b), (int)(mask)))
-
-/// Selects four double-precision values from the 256-bit operands of
-///    [4 x double], as specified by the immediate value operand.
-///
-///    The selected elements from the first 256-bit operand are copied to bits
-///    [63:0] and bits [191:128] in the destination, and the selected elements
-///    from the second 256-bit operand are copied to bits [127:64] and bits
-///    [255:192] in the destination. For example, if bits [3:0] of the immediate
-///    operand contain a value of 0xF, the 256-bit destination vector would
-///    contain the following values: b[3], a[3], b[1], a[1].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
-///
-/// \param a
-///    A 256-bit vector of [4 x double].
-/// \param b
-///    A 256-bit vector of [4 x double].
-/// \param mask
-///    An immediate value containing 8-bit values specifying which elements to
-///    copy from \a a and \a b: \n
-///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
-///    destination. \n
-///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
-///    destination. \n
-///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
-///    destination. \n
-///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
-///    destination. \n
-///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
-///    destination. \n
-///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
-///    destination. \n
-///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
-///    destination. \n
-///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
-///    destination.
-/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
-#define _mm256_shuffle_pd(a, b, mask) \
-  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
-                                     (__v4df)(__m256d)(b), (int)(mask)))
-
-/* Compare */
-#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
-#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
-#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
-#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
-#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
-#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
-#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
-#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
-#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
-#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
-#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
-#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
-#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
-#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
-#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
-#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
-#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
-#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
-#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
-#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
-#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
-#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
-#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
-#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
-
-/* Below intrinsic defined in emmintrin.h can be used for AVX */
-/// Compares each of the corresponding double-precision values of two
-///    128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double].
-/// \param b
-///    A 128-bit vector of [2 x double].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
-
-/* Below intrinsic defined in xmmintrin.h can be used for AVX */
-/// Compares each of the corresponding values of two 128-bit vectors of
-///    [4 x float], using the operation specified by the immediate integer
-///    operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float].
-/// \param b
-///    A 128-bit vector of [4 x float].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
-
-/// Compares each of the corresponding double-precision values of two
-///    256-bit vectors of [4 x double], using the operation specified by the
-///    immediate integer operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
-///
-/// \param a
-///    A 256-bit vector of [4 x double].
-/// \param b
-///    A 256-bit vector of [4 x double].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 256-bit vector of [4 x double] containing the comparison results.
-#define _mm256_cmp_pd(a, b, c) \
-  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
-                                    (__v4df)(__m256d)(b), (c)))
-
-/// Compares each of the corresponding values of two 256-bit vectors of
-///    [8 x float], using the operation specified by the immediate integer
-///    operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
-///
-/// \param a
-///    A 256-bit vector of [8 x float].
-/// \param b
-///    A 256-bit vector of [8 x float].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 256-bit vector of [8 x float] containing the comparison results.
-#define _mm256_cmp_ps(a, b, c) \
-  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
-                                   (__v8sf)(__m256)(b), (c)))
-
-/* Below intrinsic defined in emmintrin.h can be used for AVX */
-/// Compares each of the corresponding scalar double-precision values of
-///    two 128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double].
-/// \param b
-///    A 128-bit vector of [2 x double].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
-
-/* Below intrinsic defined in xmmintrin.h can be used for AVX */
-/// Compares each of the corresponding scalar values of two 128-bit
-///    vectors of [4 x float], using the operation specified by the immediate
-///    integer operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float].
-/// \param b
-///    A 128-bit vector of [4 x float].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
-
-/// Takes a [8 x i32] vector and returns the vector element value
-///    indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm256_extract_epi32(__m256i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A 256-bit vector of [8 x i32].
-/// \param N
-///    An immediate integer operand with bits [2:0] determining which vector
-///    element is extracted and returned.
-/// \returns A 32-bit integer containing the extracted 32 bits of extended
-///    packed data.
-#define _mm256_extract_epi32(X, N) \
-  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
-
-/// Takes a [16 x i16] vector and returns the vector element value
-///    indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm256_extract_epi16(__m256i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A 256-bit integer vector of [16 x i16].
-/// \param N
-///    An immediate integer operand with bits [3:0] determining which vector
-///    element is extracted and returned.
-/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
-///    packed data.
-#define _mm256_extract_epi16(X, N) \
-  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
-                                                     (int)(N)))
-
-/// Takes a [32 x i8] vector and returns the vector element value
-///    indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm256_extract_epi8(__m256i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A 256-bit integer vector of [32 x i8].
-/// \param N
-///    An immediate integer operand with bits [4:0] determining which vector
-///    element is extracted and returned.
-/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
-///    packed data.
-#define _mm256_extract_epi8(X, N) \
-  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
-                                                    (int)(N)))
-
-#ifdef __x86_64__
-/// Takes a [4 x i64] vector and returns the vector element value
-///    indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// long long _mm256_extract_epi64(__m256i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A 256-bit integer vector of [4 x i64].
-/// \param N
-///    An immediate integer operand with bits [1:0] determining which vector
-///    element is extracted and returned.
-/// \returns A 64-bit integer containing the extracted 64 bits of extended
-///    packed data.
-#define _mm256_extract_epi64(X, N) \
-  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
-#endif
-
-/// Takes a [8 x i32] vector and replaces the vector element value
-///    indexed by the immediate constant operand by a new value. Returns the
-///    modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A vector of [8 x i32] to be used by the insert operation.
-/// \param I
-///    An integer value. The replacement value for the insert operation.
-/// \param N
-///    An immediate integer specifying the index of the vector element to be
-///    replaced.
-/// \returns A copy of vector \a X, after replacing its element indexed by
-///    \a N with \a I.
-#define _mm256_insert_epi32(X, I, N) \
-  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
-                                        (int)(I), (int)(N)))
-
-
-/// Takes a [16 x i16] vector and replaces the vector element value
-///    indexed by the immediate constant operand with a new value. Returns the
-///    modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A vector of [16 x i16] to be used by the insert operation.
-/// \param I
-///    An i16 integer value. The replacement value for the insert operation.
-/// \param N
-///    An immediate integer specifying the index of the vector element to be
-///    replaced.
-/// \returns A copy of vector \a X, after replacing its element indexed by
-///    \a N with \a I.
-#define _mm256_insert_epi16(X, I, N) \
-  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
-                                         (int)(I), (int)(N)))
-
-/// Takes a [32 x i8] vector and replaces the vector element value
-///    indexed by the immediate constant operand with a new value. Returns the
-///    modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A vector of [32 x i8] to be used by the insert operation.
-/// \param I
-///    An i8 integer value. The replacement value for the insert operation.
-/// \param N
-///    An immediate integer specifying the index of the vector element to be
-///    replaced.
-/// \returns A copy of vector \a X, after replacing its element indexed by
-///    \a N with \a I.
-#define _mm256_insert_epi8(X, I, N) \
-  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
-                                         (int)(I), (int)(N)))
-
-#ifdef __x86_64__
-/// Takes a [4 x i64] vector and replaces the vector element value
-///    indexed by the immediate constant operand with a new value. Returns the
-///    modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_insert_epi64(__m256i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-///   instruction.
-///
-/// \param X
-///    A vector of [4 x i64] to be used by the insert operation.
-/// \param I
-///    A 64-bit integer value. The replacement value for the insert operation.
-/// \param N
-///    An immediate integer specifying the index of the vector element to be
-///    replaced.
-/// \returns A copy of vector \a X, after replacing its element indexed by
-///     \a N with \a I.
-#define _mm256_insert_epi64(X, I, N) \
-  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
-                                        (long long)(I), (int)(N)))
-#endif
-
-/* Conversion */
-/// Converts a vector of [4 x i32] into a vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector of [4 x i32].
-/// \returns A 256-bit vector of [4 x double] containing the converted values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_cvtepi32_pd(__m128i __a)
-{
-  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
-}
-
-/// Converts a vector of [8 x i32] into a vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \returns A 256-bit vector of [8 x float] containing the converted values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_cvtepi32_ps(__m256i __a)
-{
-  return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
-}
-
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
-///    [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \returns A 128-bit vector of [4 x float] containing the converted values.
-static __inline __m128 __DEFAULT_FN_ATTRS
-_mm256_cvtpd_ps(__m256d __a)
-{
-  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
-}
-
-/// Converts a vector of [8 x float] into a vector of [8 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit integer vector containing the converted values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_cvtps_epi32(__m256 __a)
-{
-  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
-}
-
-/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
-///    x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 256-bit vector of [4 x double] containing the converted values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_cvtps_pd(__m128 __a)
-{
-  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
-}
-
-/// Converts a 256-bit vector of [4 x double] into four signed truncated
-///    (rounded toward zero) 32-bit integers returned in a 128-bit vector of
-///    [4 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \returns A 128-bit integer vector containing the converted values.
-static __inline __m128i __DEFAULT_FN_ATTRS
-_mm256_cvttpd_epi32(__m256d __a)
-{
-  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
-}
-
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
-///    [4 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \returns A 128-bit integer vector containing the converted values.
-static __inline __m128i __DEFAULT_FN_ATTRS
-_mm256_cvtpd_epi32(__m256d __a)
-{
-  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
-}
-
-/// Converts a vector of [8 x float] into eight signed truncated (rounded
-///    toward zero) 32-bit integers returned in a vector of [8 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \returns A 256-bit integer vector containing the converted values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_cvttps_epi32(__m256 __a)
-{
-  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
-}
-
-/// Returns the first element of the input vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \returns A 64 bit double containing the first element of the input vector.
-static __inline double __DEFAULT_FN_ATTRS
-_mm256_cvtsd_f64(__m256d __a)
-{
- return __a[0];
-}
-
-/// Returns the first element of the input vector of [8 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x i32].
-/// \returns A 32 bit integer containing the first element of the input vector.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_cvtsi256_si32(__m256i __a)
-{
- __v8si __b = (__v8si)__a;
- return __b[0];
-}
-
-/// Returns the first element of the input vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \returns A 32 bit float containing the first element of the input vector.
-static __inline float __DEFAULT_FN_ATTRS
-_mm256_cvtss_f32(__m256 __a)
-{
- return __a[0];
-}
-
-/* Vector replicate */
-/// Moves and duplicates odd-indexed values from a 256-bit vector of
-///    [8 x float] to float values in a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float]. \n
-///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
-///    the return value. \n
-///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
-///    the return value. \n
-///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
-///    return value. \n
-///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
-///    return value.
-/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
-///    values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_movehdup_ps(__m256 __a)
-{
-  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
-}
-
-/// Moves and duplicates even-indexed values from a 256-bit vector of
-///    [8 x float] to float values in a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float]. \n
-///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
-///    the return value. \n
-///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
-///    the return value. \n
-///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
-///    return value. \n
-///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
-///    return value.
-/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
-///    values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_moveldup_ps(__m256 __a)
-{
-  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
-}
-
-/// Moves and duplicates double-precision floating point values from a
-///    256-bit vector of [4 x double] to double-precision values in a 256-bit
-///    vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double]. \n
-///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
-///    return value. \n
-///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
-///    the return value.
-/// \returns A 256-bit vector of [4 x double] containing the moved and
-///    duplicated values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_movedup_pd(__m256d __a)
-{
-  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
-}
-
-/* Unpack and Interleave */
-/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
-///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [4 x double]. \n
-///    Bits [127:64] are written to bits [63:0] of the return value. \n
-///    Bits [255:192] are written to bits [191:128] of the return value. \n
-/// \param __b
-///    A 256-bit floating-point vector of [4 x double]. \n
-///    Bits [127:64] are written to bits [127:64] of the return value. \n
-///    Bits [255:192] are written to bits [255:192] of the return value. \n
-/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_unpackhi_pd(__m256d __a, __m256d __b)
-{
-  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
-}
-
-/// Unpacks the even-indexed vector elements from two 256-bit vectors of
-///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [4 x double]. \n
-///    Bits [63:0] are written to bits [63:0] of the return value. \n
-///    Bits [191:128] are written to bits [191:128] of the return value.
-/// \param __b
-///    A 256-bit floating-point vector of [4 x double]. \n
-///    Bits [63:0] are written to bits [127:64] of the return value. \n
-///    Bits [191:128] are written to bits [255:192] of the return value. \n
-/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_unpacklo_pd(__m256d __a, __m256d __b)
-{
-  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
-}
-
-/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
-///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
-///    vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float]. \n
-///    Bits [95:64] are written to bits [31:0] of the return value. \n
-///    Bits [127:96] are written to bits [95:64] of the return value. \n
-///    Bits [223:192] are written to bits [159:128] of the return value. \n
-///    Bits [255:224] are written to bits [223:192] of the return value.
-/// \param __b
-///    A 256-bit vector of [8 x float]. \n
-///    Bits [95:64] are written to bits [63:32] of the return value. \n
-///    Bits [127:96] are written to bits [127:96] of the return value. \n
-///    Bits [223:192] are written to bits [191:160] of the return value. \n
-///    Bits [255:224] are written to bits [255:224] of the return value.
-/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_unpackhi_ps(__m256 __a, __m256 __b)
-{
-  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
-}
-
-/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
-///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
-///    vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float]. \n
-///    Bits [31:0] are written to bits [31:0] of the return value. \n
-///    Bits [63:32] are written to bits [95:64] of the return value. \n
-///    Bits [159:128] are written to bits [159:128] of the return value. \n
-///    Bits [191:160] are written to bits [223:192] of the return value.
-/// \param __b
-///    A 256-bit vector of [8 x float]. \n
-///    Bits [31:0] are written to bits [63:32] of the return value. \n
-///    Bits [63:32] are written to bits [127:96] of the return value. \n
-///    Bits [159:128] are written to bits [191:160] of the return value. \n
-///    Bits [191:160] are written to bits [255:224] of the return value.
-/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_unpacklo_ps(__m256 __a, __m256 __b)
-{
-  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
-}
-
-/* Bit Test */
-/// Given two 128-bit floating-point vectors of [2 x double], perform an
-///    element-by-element comparison of the double-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns the ZF flag in the EFLAGS register.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testz_pd(__m128d __a, __m128d __b)
-{
-  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [2 x double], perform an
-///    element-by-element comparison of the double-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns the CF flag in the EFLAGS register.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testc_pd(__m128d __a, __m128d __b)
-{
-  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [2 x double], perform an
-///    element-by-element comparison of the double-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-///    otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testnzc_pd(__m128d __a, __m128d __b)
-{
-  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [4 x float], perform an
-///    element-by-element comparison of the single-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testz_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [4 x float], perform an
-///    element-by-element comparison of the single-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testc_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [4 x float], perform an
-///    element-by-element comparison of the single-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-///    otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testnzc_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [4 x double], perform an
-///    element-by-element comparison of the double-precision elements in the
-///    first source vector and the corresponding elements in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \param __b
-///    A 256-bit vector of [4 x double].
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testz_pd(__m256d __a, __m256d __b)
-{
-  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [4 x double], perform an
-///    element-by-element comparison of the double-precision elements in the
-///    first source vector and the corresponding elements in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \param __b
-///    A 256-bit vector of [4 x double].
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testc_pd(__m256d __a, __m256d __b)
-{
-  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [4 x double], perform an
-///    element-by-element comparison of the double-precision elements in the
-///    first source vector and the corresponding elements in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of double-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-///    otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double].
-/// \param __b
-///    A 256-bit vector of [4 x double].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testnzc_pd(__m256d __a, __m256d __b)
-{
-  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [8 x float], perform an
-///    element-by-element comparison of the single-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \param __b
-///    A 256-bit vector of [8 x float].
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testz_ps(__m256 __a, __m256 __b)
-{
-  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [8 x float], perform an
-///    element-by-element comparison of the single-precision element in the
-///    first source vector and the corresponding element in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \param __b
-///    A 256-bit vector of [8 x float].
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testc_ps(__m256 __a, __m256 __b)
-{
-  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [8 x float], perform an
-///    element-by-element comparison of the single-precision elements in the
-///    first source vector and the corresponding elements in the second source
-///    vector.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-///    ZF flag is set to 1. \n
-///    If there is at least one pair of single-precision elements where the
-///    sign-bit of the first element is 0 and the sign-bit of the second element
-///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-///    otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float].
-/// \param __b
-///    A 256-bit vector of [8 x float].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testnzc_ps(__m256 __a, __m256 __b)
-{
-  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
-///    of the two source vectors.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of bits where both bits are 1, the ZF flag
-///    is set to 0. Otherwise the ZF flag is set to 1. \n
-///    If there is at least one pair of bits where the bit from the first source
-///    vector is 0 and the bit from the second source vector is 1, the CF flag
-///    is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testz_si256(__m256i __a, __m256i __b)
-{
-  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
-}
-
-/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
-///    of the two source vectors.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of bits where both bits are 1, the ZF flag
-///    is set to 0. Otherwise the ZF flag is set to 1. \n
-///    If there is at least one pair of bits where the bit from the first source
-///    vector is 0 and the bit from the second source vector is 1, the CF flag
-///    is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testc_si256(__m256i __a, __m256i __b)
-{
-  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
-}
-
-/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
-///    of the two source vectors.
-///
-///    The EFLAGS register is updated as follows: \n
-///    If there is at least one pair of bits where both bits are 1, the ZF flag
-///    is set to 0. Otherwise the ZF flag is set to 1. \n
-///    If there is at least one pair of bits where the bit from the first source
-///    vector is 0 and the bit from the second source vector is 1, the CF flag
-///    is set to 0. Otherwise the CF flag is set to 1. \n
-///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-///    otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \param __b
-///    A 256-bit integer vector.
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testnzc_si256(__m256i __a, __m256i __b)
-{
-  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
-}
-
-/* Vector extract sign mask */
-/// Extracts the sign bits of double-precision floating point elements
-///    in a 256-bit vector of [4 x double] and writes them to the lower order
-///    bits of the return value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the double-precision
-///    floating point values with sign bits to be extracted.
-/// \returns The sign bits from the operand, written to bits [3:0].
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_movemask_pd(__m256d __a)
-{
-  return __builtin_ia32_movmskpd256((__v4df)__a);
-}
-
-/// Extracts the sign bits of single-precision floating point elements
-///    in a 256-bit vector of [8 x float] and writes them to the lower order
-///    bits of the return value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
-///
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the single-precision floating
-///    point values with sign bits to be extracted.
-/// \returns The sign bits from the operand, written to bits [7:0].
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_movemask_ps(__m256 __a)
-{
-  return __builtin_ia32_movmskps256((__v8sf)__a);
-}
-
-/* Vector __zero */
-/// Zeroes the contents of all XMM or YMM registers.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
-static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
-_mm256_zeroall(void)
-{
-  __builtin_ia32_vzeroall();
-}
-
-/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
-static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
-_mm256_zeroupper(void)
-{
-  __builtin_ia32_vzeroupper();
-}
-
-/* Vector load with broadcast */
-/// Loads a scalar single-precision floating point value from the
-///    specified address pointed to by \a __a and broadcasts it to the elements
-///    of a [4 x float] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
-///
-/// \param __a
-///    The single-precision floating point value to be broadcast.
-/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
-///    equal to the broadcast value.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_broadcast_ss(float const *__a)
-{
-  struct __mm_broadcast_ss_struct {
-    float __f;
-  } __attribute__((__packed__, __may_alias__));
-  float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
-  return __extension__ (__m128){ __f, __f, __f, __f };
-}
-
-/// Loads a scalar double-precision floating point value from the
-///    specified address pointed to by \a __a and broadcasts it to the elements
-///    of a [4 x double] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
-///
-/// \param __a
-///    The double-precision floating point value to be broadcast.
-/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
-///    equal to the broadcast value.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_broadcast_sd(double const *__a)
-{
-  struct __mm256_broadcast_sd_struct {
-    double __d;
-  } __attribute__((__packed__, __may_alias__));
-  double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
-  return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
-}
-
-/// Loads a scalar single-precision floating point value from the
-///    specified address pointed to by \a __a and broadcasts it to the elements
-///    of a [8 x float] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
-///
-/// \param __a
-///    The single-precision floating point value to be broadcast.
-/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
-///    equal to the broadcast value.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_broadcast_ss(float const *__a)
-{
-  struct __mm256_broadcast_ss_struct {
-    float __f;
-  } __attribute__((__packed__, __may_alias__));
-  float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
-  return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
-}
-
-/// Loads the data from a 128-bit vector of [2 x double] from the
-///    specified address pointed to by \a __a and broadcasts it to 128-bit
-///    elements in a 256-bit vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
-///
-/// \param __a
-///    The 128-bit vector of [2 x double] to be broadcast.
-/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
-///    equal to the broadcast value.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_broadcast_pd(__m128d const *__a)
-{
-  __m128d __b = _mm_loadu_pd((const double *)__a);
-  return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
-                                          0, 1, 0, 1);
-}
-
-/// Loads the data from a 128-bit vector of [4 x float] from the
-///    specified address pointed to by \a __a and broadcasts it to 128-bit
-///    elements in a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
-///
-/// \param __a
-///    The 128-bit vector of [4 x float] to be broadcast.
-/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
-///    equal to the broadcast value.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_broadcast_ps(__m128 const *__a)
-{
-  __m128 __b = _mm_loadu_ps((const float *)__a);
-  return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
-                                         0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-/* SIMD load ops */
-/// Loads 4 double-precision floating point values from a 32-byte aligned
-///    memory location pointed to by \a __p into a vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
-///
-/// \param __p
-///    A 32-byte aligned pointer to a memory location containing
-///    double-precision floating point values.
-/// \returns A 256-bit vector of [4 x double] containing the moved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_load_pd(double const *__p)
-{
-  return *(const __m256d *)__p;
-}
-
-/// Loads 8 single-precision floating point values from a 32-byte aligned
-///    memory location pointed to by \a __p into a vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
-///
-/// \param __p
-///    A 32-byte aligned pointer to a memory location containing float values.
-/// \returns A 256-bit vector of [8 x float] containing the moved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_load_ps(float const *__p)
-{
-  return *(const __m256 *)__p;
-}
-
-/// Loads 4 double-precision floating point values from an unaligned
-///    memory location pointed to by \a __p into a vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location containing double-precision floating
-///    point values.
-/// \returns A 256-bit vector of [4 x double] containing the moved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_loadu_pd(double const *__p)
-{
-  struct __loadu_pd {
-    __m256d_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_pd*)__p)->__v;
-}
-
-/// Loads 8 single-precision floating point values from an unaligned
-///    memory location pointed to by \a __p into a vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location containing single-precision floating
-///    point values.
-/// \returns A 256-bit vector of [8 x float] containing the moved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_loadu_ps(float const *__p)
-{
-  struct __loadu_ps {
-    __m256_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_ps*)__p)->__v;
-}
-
-/// Loads 256 bits of integer data from a 32-byte aligned memory
-///    location pointed to by \a __p into elements of a 256-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
-///
-/// \param __p
-///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
-///    values.
-/// \returns A 256-bit integer vector containing the moved values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_load_si256(__m256i const *__p)
-{
-  return *__p;
-}
-
-/// Loads 256 bits of integer data from an unaligned memory location
-///    pointed to by \a __p into a 256-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
-///
-/// \param __p
-///    A pointer to a 256-bit integer vector containing integer values.
-/// \returns A 256-bit integer vector containing the moved values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_loadu_si256(__m256i_u const *__p)
-{
-  struct __loadu_si256 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_si256*)__p)->__v;
-}
-
-/// Loads 256 bits of integer data from an unaligned memory location
-///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
-///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
-///    line boundary.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
-///
-/// \param __p
-///    A pointer to a 256-bit integer vector containing integer values.
-/// \returns A 256-bit integer vector containing the moved values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_lddqu_si256(__m256i_u const *__p)
-{
-  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
-}
-
-/* SIMD store ops */
-/// Stores double-precision floating point values from a 256-bit vector
-///    of [4 x double] to a 32-byte aligned memory location pointed to by
-///    \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
-///
-/// \param __p
-///    A 32-byte aligned pointer to a memory location that will receive the
-///    double-precision floaing point values.
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_store_pd(double *__p, __m256d __a)
-{
-  *(__m256d *)__p = __a;
-}
-
-/// Stores single-precision floating point values from a 256-bit vector
-///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
-///
-/// \param __p
-///    A 32-byte aligned pointer to a memory location that will receive the
-///    float values.
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_store_ps(float *__p, __m256 __a)
-{
-  *(__m256 *)__p = __a;
-}
-
-/// Stores double-precision floating point values from a 256-bit vector
-///    of [4 x double] to an unaligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the double-precision
-///    floating point values.
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu_pd(double *__p, __m256d __a)
-{
-  struct __storeu_pd {
-    __m256d_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_pd*)__p)->__v = __a;
-}
-
-/// Stores single-precision floating point values from a 256-bit vector
-///    of [8 x float] to an unaligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the float values.
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu_ps(float *__p, __m256 __a)
-{
-  struct __storeu_ps {
-    __m256_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_ps*)__p)->__v = __a;
-}
-
-/// Stores integer values from a 256-bit integer vector to a 32-byte
-///    aligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
-///
-/// \param __p
-///    A 32-byte aligned pointer to a memory location that will receive the
-///    integer values.
-/// \param __a
-///    A 256-bit integer vector containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_store_si256(__m256i *__p, __m256i __a)
-{
-  *__p = __a;
-}
-
-/// Stores integer values from a 256-bit integer vector to an unaligned
-///    memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the integer values.
-/// \param __a
-///    A 256-bit integer vector containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
-{
-  struct __storeu_si256 {
-    __m256i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_si256*)__p)->__v = __a;
-}
-
-/* Conditional load ops */
-/// Conditionally loads double-precision floating point elements from a
-///    memory location pointed to by \a __p into a 128-bit vector of
-///    [2 x double], depending on the mask bits associated with each data
-///    element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that contains the double-precision
-///    floating point values.
-/// \param __m
-///    A 128-bit integer vector containing the mask. The most significant bit of
-///    each data element represents the mask bits. If a mask bit is zero, the
-///    corresponding value in the memory location is not loaded and the
-///    corresponding field in the return value is set to zero.
-/// \returns A 128-bit vector of [2 x double] containing the loaded values.
-static __inline __m128d __DEFAULT_FN_ATTRS128
-_mm_maskload_pd(double const *__p, __m128i __m)
-{
-  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
-}
-
-/// Conditionally loads double-precision floating point elements from a
-///    memory location pointed to by \a __p into a 256-bit vector of
-///    [4 x double], depending on the mask bits associated with each data
-///    element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that contains the double-precision
-///    floating point values.
-/// \param __m
-///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
-///    significant bit of each quadword element represents the mask bits. If a
-///    mask bit is zero, the corresponding value in the memory location is not
-///    loaded and the corresponding field in the return value is set to zero.
-/// \returns A 256-bit vector of [4 x double] containing the loaded values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_maskload_pd(double const *__p, __m256i __m)
-{
-  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
-                                               (__v4di)__m);
-}
-
-/// Conditionally loads single-precision floating point elements from a
-///    memory location pointed to by \a __p into a 128-bit vector of
-///    [4 x float], depending on the mask bits associated with each data
-///    element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that contains the single-precision
-///    floating point values.
-/// \param __m
-///    A 128-bit integer vector containing the mask. The most significant bit of
-///    each data element represents the mask bits. If a mask bit is zero, the
-///    corresponding value in the memory location is not loaded and the
-///    corresponding field in the return value is set to zero.
-/// \returns A 128-bit vector of [4 x float] containing the loaded values.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_maskload_ps(float const *__p, __m128i __m)
-{
-  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
-}
-
-/// Conditionally loads single-precision floating point elements from a
-///    memory location pointed to by \a __p into a 256-bit vector of
-///    [8 x float], depending on the mask bits associated with each data
-///    element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that contains the single-precision
-///    floating point values.
-/// \param __m
-///    A 256-bit integer vector of [8 x dword] containing the mask. The most
-///    significant bit of each dword element represents the mask bits. If a mask
-///    bit is zero, the corresponding value in the memory location is not loaded
-///    and the corresponding field in the return value is set to zero.
-/// \returns A 256-bit vector of [8 x float] containing the loaded values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_maskload_ps(float const *__p, __m256i __m)
-{
-  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
-}
-
-/* Conditional store ops */
-/// Moves single-precision floating point values from a 256-bit vector
-///    of [8 x float] to a memory location pointed to by \a __p, according to
-///    the specified mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the float values.
-/// \param __m
-///    A 256-bit integer vector of [8 x dword] containing the mask. The most
-///    significant bit of each dword element in the mask vector represents the
-///    mask bits. If a mask bit is zero, the corresponding value from vector
-///    \a __a is not stored and the corresponding field in the memory location
-///    pointed to by \a __p is not changed.
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
-{
-  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
-}
-
-/// Moves double-precision values from a 128-bit vector of [2 x double]
-///    to a memory location pointed to by \a __p, according to the specified
-///    mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the float values.
-/// \param __m
-///    A 128-bit integer vector containing the mask. The most significant bit of
-///    each field in the mask vector represents the mask bits. If a mask bit is
-///    zero, the corresponding value from vector \a __a is not stored and the
-///    corresponding field in the memory location pointed to by \a __p is not
-///    changed.
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
-{
-  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
-}
-
-/// Moves double-precision values from a 256-bit vector of [4 x double]
-///    to a memory location pointed to by \a __p, according to the specified
-///    mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the float values.
-/// \param __m
-///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
-///    significant bit of each quadword element in the mask vector represents
-///    the mask bits. If a mask bit is zero, the corresponding value from vector
-///    __a is not stored and the corresponding field in the memory location
-///    pointed to by \a __p is not changed.
-/// \param __a
-///    A 256-bit vector of [4 x double] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
-{
-  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
-}
-
-/// Moves single-precision floating point values from a 128-bit vector
-///    of [4 x float] to a memory location pointed to by \a __p, according to
-///    the specified mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the float values.
-/// \param __m
-///    A 128-bit integer vector containing the mask. The most significant bit of
-///    each field in the mask vector represents the mask bits. If a mask bit is
-///    zero, the corresponding value from vector __a is not stored and the
-///    corresponding field in the memory location pointed to by \a __p is not
-///    changed.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
-{
-  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
-}
-
-/* Cacheability support ops */
-/// Moves integer data from a 256-bit integer vector to a 32-byte
-///    aligned memory location. To minimize caching, the data is flagged as
-///    non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
-///
-/// \param __a
-///    A pointer to a 32-byte aligned memory location that will receive the
-///    integer values.
-/// \param __b
-///    A 256-bit integer vector containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_si256(void *__a, __m256i __b)
-{
-  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
-  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
-}
-
-/// Moves double-precision values from a 256-bit vector of [4 x double]
-///    to a 32-byte aligned memory location. To minimize caching, the data is
-///    flagged as non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
-///
-/// \param __a
-///    A pointer to a 32-byte aligned memory location that will receive the
-///    double-precision floating-point values.
-/// \param __b
-///    A 256-bit vector of [4 x double] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_pd(void *__a, __m256d __b)
-{
-  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
-  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
-}
-
-/// Moves single-precision floating point values from a 256-bit vector
-///    of [8 x float] to a 32-byte aligned memory location. To minimize
-///    caching, the data is flagged as non-temporal (unlikely to be used again
-///    soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 32-byte aligned memory location that will receive the
-///    single-precision floating point values.
-/// \param __a
-///    A 256-bit vector of [8 x float] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_ps(void *__p, __m256 __a)
-{
-  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
-  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
-}
-
-/* Create vectors */
-/// Create a 256-bit vector of [4 x double] with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 256-bit vector of [4 x double] containing undefined values.
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_undefined_pd(void)
-{
-  return (__m256d)__builtin_ia32_undef256();
-}
-
-/// Create a 256-bit vector of [8 x float] with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 256-bit vector of [8 x float] containing undefined values.
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_undefined_ps(void)
-{
-  return (__m256)__builtin_ia32_undef256();
-}
-
-/// Create a 256-bit integer vector with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 256-bit integer vector containing undefined values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_undefined_si256(void)
-{
-  return (__m256i)__builtin_ia32_undef256();
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double]
-///    initialized with the specified double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
-///   instruction.
-///
-/// \param __a
-///    A double-precision floating-point value used to initialize bits [255:192]
-///    of the result.
-/// \param __b
-///    A double-precision floating-point value used to initialize bits [191:128]
-///    of the result.
-/// \param __c
-///    A double-precision floating-point value used to initialize bits [127:64]
-///    of the result.
-/// \param __d
-///    A double-precision floating-point value used to initialize bits [63:0]
-///    of the result.
-/// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set_pd(double __a, double __b, double __c, double __d)
-{
-  return __extension__ (__m256d){ __d, __c, __b, __a };
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] initialized
-///    with the specified single-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __a
-///    A single-precision floating-point value used to initialize bits [255:224]
-///    of the result.
-/// \param __b
-///    A single-precision floating-point value used to initialize bits [223:192]
-///    of the result.
-/// \param __c
-///    A single-precision floating-point value used to initialize bits [191:160]
-///    of the result.
-/// \param __d
-///    A single-precision floating-point value used to initialize bits [159:128]
-///    of the result.
-/// \param __e
-///    A single-precision floating-point value used to initialize bits [127:96]
-///    of the result.
-/// \param __f
-///    A single-precision floating-point value used to initialize bits [95:64]
-///    of the result.
-/// \param __g
-///    A single-precision floating-point value used to initialize bits [63:32]
-///    of the result.
-/// \param __h
-///    A single-precision floating-point value used to initialize bits [31:0]
-///    of the result.
-/// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set_ps(float __a, float __b, float __c, float __d,
-              float __e, float __f, float __g, float __h)
-{
-  return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
-}
-
-/// Constructs a 256-bit integer vector initialized with the specified
-///    32-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __i0
-///    A 32-bit integral value used to initialize bits [255:224] of the result.
-/// \param __i1
-///    A 32-bit integral value used to initialize bits [223:192] of the result.
-/// \param __i2
-///    A 32-bit integral value used to initialize bits [191:160] of the result.
-/// \param __i3
-///    A 32-bit integral value used to initialize bits [159:128] of the result.
-/// \param __i4
-///    A 32-bit integral value used to initialize bits [127:96] of the result.
-/// \param __i5
-///    A 32-bit integral value used to initialize bits [95:64] of the result.
-/// \param __i6
-///    A 32-bit integral value used to initialize bits [63:32] of the result.
-/// \param __i7
-///    A 32-bit integral value used to initialize bits [31:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
-                 int __i4, int __i5, int __i6, int __i7)
-{
-  return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
-}
-
-/// Constructs a 256-bit integer vector initialized with the specified
-///    16-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __w15
-///    A 16-bit integral value used to initialize bits [255:240] of the result.
-/// \param __w14
-///    A 16-bit integral value used to initialize bits [239:224] of the result.
-/// \param __w13
-///    A 16-bit integral value used to initialize bits [223:208] of the result.
-/// \param __w12
-///    A 16-bit integral value used to initialize bits [207:192] of the result.
-/// \param __w11
-///    A 16-bit integral value used to initialize bits [191:176] of the result.
-/// \param __w10
-///    A 16-bit integral value used to initialize bits [175:160] of the result.
-/// \param __w09
-///    A 16-bit integral value used to initialize bits [159:144] of the result.
-/// \param __w08
-///    A 16-bit integral value used to initialize bits [143:128] of the result.
-/// \param __w07
-///    A 16-bit integral value used to initialize bits [127:112] of the result.
-/// \param __w06
-///    A 16-bit integral value used to initialize bits [111:96] of the result.
-/// \param __w05
-///    A 16-bit integral value used to initialize bits [95:80] of the result.
-/// \param __w04
-///    A 16-bit integral value used to initialize bits [79:64] of the result.
-/// \param __w03
-///    A 16-bit integral value used to initialize bits [63:48] of the result.
-/// \param __w02
-///    A 16-bit integral value used to initialize bits [47:32] of the result.
-/// \param __w01
-///    A 16-bit integral value used to initialize bits [31:16] of the result.
-/// \param __w00
-///    A 16-bit integral value used to initialize bits [15:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
-                 short __w11, short __w10, short __w09, short __w08,
-                 short __w07, short __w06, short __w05, short __w04,
-                 short __w03, short __w02, short __w01, short __w00)
-{
-  return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
-    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
-}
-
-/// Constructs a 256-bit integer vector initialized with the specified
-///    8-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __b31
-///    An 8-bit integral value used to initialize bits [255:248] of the result.
-/// \param __b30
-///    An 8-bit integral value used to initialize bits [247:240] of the result.
-/// \param __b29
-///    An 8-bit integral value used to initialize bits [239:232] of the result.
-/// \param __b28
-///    An 8-bit integral value used to initialize bits [231:224] of the result.
-/// \param __b27
-///    An 8-bit integral value used to initialize bits [223:216] of the result.
-/// \param __b26
-///    An 8-bit integral value used to initialize bits [215:208] of the result.
-/// \param __b25
-///    An 8-bit integral value used to initialize bits [207:200] of the result.
-/// \param __b24
-///    An 8-bit integral value used to initialize bits [199:192] of the result.
-/// \param __b23
-///    An 8-bit integral value used to initialize bits [191:184] of the result.
-/// \param __b22
-///    An 8-bit integral value used to initialize bits [183:176] of the result.
-/// \param __b21
-///    An 8-bit integral value used to initialize bits [175:168] of the result.
-/// \param __b20
-///    An 8-bit integral value used to initialize bits [167:160] of the result.
-/// \param __b19
-///    An 8-bit integral value used to initialize bits [159:152] of the result.
-/// \param __b18
-///    An 8-bit integral value used to initialize bits [151:144] of the result.
-/// \param __b17
-///    An 8-bit integral value used to initialize bits [143:136] of the result.
-/// \param __b16
-///    An 8-bit integral value used to initialize bits [135:128] of the result.
-/// \param __b15
-///    An 8-bit integral value used to initialize bits [127:120] of the result.
-/// \param __b14
-///    An 8-bit integral value used to initialize bits [119:112] of the result.
-/// \param __b13
-///    An 8-bit integral value used to initialize bits [111:104] of the result.
-/// \param __b12
-///    An 8-bit integral value used to initialize bits [103:96] of the result.
-/// \param __b11
-///    An 8-bit integral value used to initialize bits [95:88] of the result.
-/// \param __b10
-///    An 8-bit integral value used to initialize bits [87:80] of the result.
-/// \param __b09
-///    An 8-bit integral value used to initialize bits [79:72] of the result.
-/// \param __b08
-///    An 8-bit integral value used to initialize bits [71:64] of the result.
-/// \param __b07
-///    An 8-bit integral value used to initialize bits [63:56] of the result.
-/// \param __b06
-///    An 8-bit integral value used to initialize bits [55:48] of the result.
-/// \param __b05
-///    An 8-bit integral value used to initialize bits [47:40] of the result.
-/// \param __b04
-///    An 8-bit integral value used to initialize bits [39:32] of the result.
-/// \param __b03
-///    An 8-bit integral value used to initialize bits [31:24] of the result.
-/// \param __b02
-///    An 8-bit integral value used to initialize bits [23:16] of the result.
-/// \param __b01
-///    An 8-bit integral value used to initialize bits [15:8] of the result.
-/// \param __b00
-///    An 8-bit integral value used to initialize bits [7:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
-                char __b27, char __b26, char __b25, char __b24,
-                char __b23, char __b22, char __b21, char __b20,
-                char __b19, char __b18, char __b17, char __b16,
-                char __b15, char __b14, char __b13, char __b12,
-                char __b11, char __b10, char __b09, char __b08,
-                char __b07, char __b06, char __b05, char __b04,
-                char __b03, char __b02, char __b01, char __b00)
-{
-  return __extension__ (__m256i)(__v32qi){
-    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
-    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
-    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
-    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
-  };
-}
-
-/// Constructs a 256-bit integer vector initialized with the specified
-///    64-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
-///   instruction.
-///
-/// \param __a
-///    A 64-bit integral value used to initialize bits [255:192] of the result.
-/// \param __b
-///    A 64-bit integral value used to initialize bits [191:128] of the result.
-/// \param __c
-///    A 64-bit integral value used to initialize bits [127:64] of the result.
-/// \param __d
-///    A 64-bit integral value used to initialize bits [63:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
-{
-  return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
-}
-
-/* Create vectors with elements in reverse order */
-/// Constructs a 256-bit floating-point vector of [4 x double],
-///    initialized in reverse order with the specified double-precision
-///    floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
-///   instruction.
-///
-/// \param __a
-///    A double-precision floating-point value used to initialize bits [63:0]
-///    of the result.
-/// \param __b
-///    A double-precision floating-point value used to initialize bits [127:64]
-///    of the result.
-/// \param __c
-///    A double-precision floating-point value used to initialize bits [191:128]
-///    of the result.
-/// \param __d
-///    A double-precision floating-point value used to initialize bits [255:192]
-///    of the result.
-/// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setr_pd(double __a, double __b, double __c, double __d)
-{
-  return _mm256_set_pd(__d, __c, __b, __a);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float],
-///    initialized in reverse order with the specified single-precision
-///    float-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __a
-///    A single-precision floating-point value used to initialize bits [31:0]
-///    of the result.
-/// \param __b
-///    A single-precision floating-point value used to initialize bits [63:32]
-///    of the result.
-/// \param __c
-///    A single-precision floating-point value used to initialize bits [95:64]
-///    of the result.
-/// \param __d
-///    A single-precision floating-point value used to initialize bits [127:96]
-///    of the result.
-/// \param __e
-///    A single-precision floating-point value used to initialize bits [159:128]
-///    of the result.
-/// \param __f
-///    A single-precision floating-point value used to initialize bits [191:160]
-///    of the result.
-/// \param __g
-///    A single-precision floating-point value used to initialize bits [223:192]
-///    of the result.
-/// \param __h
-///    A single-precision floating-point value used to initialize bits [255:224]
-///    of the result.
-/// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setr_ps(float __a, float __b, float __c, float __d,
-               float __e, float __f, float __g, float __h)
-{
-  return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
-}
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-///    with the specified 32-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __i0
-///    A 32-bit integral value used to initialize bits [31:0] of the result.
-/// \param __i1
-///    A 32-bit integral value used to initialize bits [63:32] of the result.
-/// \param __i2
-///    A 32-bit integral value used to initialize bits [95:64] of the result.
-/// \param __i3
-///    A 32-bit integral value used to initialize bits [127:96] of the result.
-/// \param __i4
-///    A 32-bit integral value used to initialize bits [159:128] of the result.
-/// \param __i5
-///    A 32-bit integral value used to initialize bits [191:160] of the result.
-/// \param __i6
-///    A 32-bit integral value used to initialize bits [223:192] of the result.
-/// \param __i7
-///    A 32-bit integral value used to initialize bits [255:224] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
-                  int __i4, int __i5, int __i6, int __i7)
-{
-  return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
-}
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-///    with the specified 16-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __w15
-///    A 16-bit integral value used to initialize bits [15:0] of the result.
-/// \param __w14
-///    A 16-bit integral value used to initialize bits [31:16] of the result.
-/// \param __w13
-///    A 16-bit integral value used to initialize bits [47:32] of the result.
-/// \param __w12
-///    A 16-bit integral value used to initialize bits [63:48] of the result.
-/// \param __w11
-///    A 16-bit integral value used to initialize bits [79:64] of the result.
-/// \param __w10
-///    A 16-bit integral value used to initialize bits [95:80] of the result.
-/// \param __w09
-///    A 16-bit integral value used to initialize bits [111:96] of the result.
-/// \param __w08
-///    A 16-bit integral value used to initialize bits [127:112] of the result.
-/// \param __w07
-///    A 16-bit integral value used to initialize bits [143:128] of the result.
-/// \param __w06
-///    A 16-bit integral value used to initialize bits [159:144] of the result.
-/// \param __w05
-///    A 16-bit integral value used to initialize bits [175:160] of the result.
-/// \param __w04
-///    A 16-bit integral value used to initialize bits [191:176] of the result.
-/// \param __w03
-///    A 16-bit integral value used to initialize bits [207:192] of the result.
-/// \param __w02
-///    A 16-bit integral value used to initialize bits [223:208] of the result.
-/// \param __w01
-///    A 16-bit integral value used to initialize bits [239:224] of the result.
-/// \param __w00
-///    A 16-bit integral value used to initialize bits [255:240] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
-       short __w11, short __w10, short __w09, short __w08,
-       short __w07, short __w06, short __w05, short __w04,
-       short __w03, short __w02, short __w01, short __w00)
-{
-  return _mm256_set_epi16(__w00, __w01, __w02, __w03,
-                          __w04, __w05, __w06, __w07,
-                          __w08, __w09, __w10, __w11,
-                          __w12, __w13, __w14, __w15);
-}
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-///    with the specified 8-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///   instruction.
-///
-/// \param __b31
-///    An 8-bit integral value used to initialize bits [7:0] of the result.
-/// \param __b30
-///    An 8-bit integral value used to initialize bits [15:8] of the result.
-/// \param __b29
-///    An 8-bit integral value used to initialize bits [23:16] of the result.
-/// \param __b28
-///    An 8-bit integral value used to initialize bits [31:24] of the result.
-/// \param __b27
-///    An 8-bit integral value used to initialize bits [39:32] of the result.
-/// \param __b26
-///    An 8-bit integral value used to initialize bits [47:40] of the result.
-/// \param __b25
-///    An 8-bit integral value used to initialize bits [55:48] of the result.
-/// \param __b24
-///    An 8-bit integral value used to initialize bits [63:56] of the result.
-/// \param __b23
-///    An 8-bit integral value used to initialize bits [71:64] of the result.
-/// \param __b22
-///    An 8-bit integral value used to initialize bits [79:72] of the result.
-/// \param __b21
-///    An 8-bit integral value used to initialize bits [87:80] of the result.
-/// \param __b20
-///    An 8-bit integral value used to initialize bits [95:88] of the result.
-/// \param __b19
-///    An 8-bit integral value used to initialize bits [103:96] of the result.
-/// \param __b18
-///    An 8-bit integral value used to initialize bits [111:104] of the result.
-/// \param __b17
-///    An 8-bit integral value used to initialize bits [119:112] of the result.
-/// \param __b16
-///    An 8-bit integral value used to initialize bits [127:120] of the result.
-/// \param __b15
-///    An 8-bit integral value used to initialize bits [135:128] of the result.
-/// \param __b14
-///    An 8-bit integral value used to initialize bits [143:136] of the result.
-/// \param __b13
-///    An 8-bit integral value used to initialize bits [151:144] of the result.
-/// \param __b12
-///    An 8-bit integral value used to initialize bits [159:152] of the result.
-/// \param __b11
-///    An 8-bit integral value used to initialize bits [167:160] of the result.
-/// \param __b10
-///    An 8-bit integral value used to initialize bits [175:168] of the result.
-/// \param __b09
-///    An 8-bit integral value used to initialize bits [183:176] of the result.
-/// \param __b08
-///    An 8-bit integral value used to initialize bits [191:184] of the result.
-/// \param __b07
-///    An 8-bit integral value used to initialize bits [199:192] of the result.
-/// \param __b06
-///    An 8-bit integral value used to initialize bits [207:200] of the result.
-/// \param __b05
-///    An 8-bit integral value used to initialize bits [215:208] of the result.
-/// \param __b04
-///    An 8-bit integral value used to initialize bits [223:216] of the result.
-/// \param __b03
-///    An 8-bit integral value used to initialize bits [231:224] of the result.
-/// \param __b02
-///    An 8-bit integral value used to initialize bits [239:232] of the result.
-/// \param __b01
-///    An 8-bit integral value used to initialize bits [247:240] of the result.
-/// \param __b00
-///    An 8-bit integral value used to initialize bits [255:248] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
-                 char __b27, char __b26, char __b25, char __b24,
-                 char __b23, char __b22, char __b21, char __b20,
-                 char __b19, char __b18, char __b17, char __b16,
-                 char __b15, char __b14, char __b13, char __b12,
-                 char __b11, char __b10, char __b09, char __b08,
-                 char __b07, char __b06, char __b05, char __b04,
-                 char __b03, char __b02, char __b01, char __b00)
-{
-  return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
-                         __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
-                         __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
-                         __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
-}
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-///    with the specified 64-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
-///   instruction.
-///
-/// \param __a
-///    A 64-bit integral value used to initialize bits [63:0] of the result.
-/// \param __b
-///    A 64-bit integral value used to initialize bits [127:64] of the result.
-/// \param __c
-///    A 64-bit integral value used to initialize bits [191:128] of the result.
-/// \param __d
-///    A 64-bit integral value used to initialize bits [255:192] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
-{
-  return _mm256_set_epi64x(__d, __c, __b, __a);
-}
-
-/* Create vectors with repeated elements */
-/// Constructs a 256-bit floating-point vector of [4 x double], with each
-///    of the four double-precision floating-point vector elements set to the
-///    specified double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
-///
-/// \param __w
-///    A double-precision floating-point value used to initialize each vector
-///    element of the result.
-/// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set1_pd(double __w)
-{
-  return _mm256_set_pd(__w, __w, __w, __w);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float], with each
-///    of the eight single-precision floating-point vector elements set to the
-///    specified single-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
-///   instruction.
-///
-/// \param __w
-///    A single-precision floating-point value used to initialize each vector
-///    element of the result.
-/// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set1_ps(float __w)
-{
-  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
-}
-
-/// Constructs a 256-bit integer vector of [8 x i32], with each of the
-///    32-bit integral vector elements set to the specified 32-bit integral
-///    value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
-///   instruction.
-///
-/// \param __i
-///    A 32-bit integral value used to initialize each vector element of the
-///    result.
-/// \returns An initialized 256-bit integer vector of [8 x i32].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi32(int __i)
-{
-  return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
-}
-
-/// Constructs a 256-bit integer vector of [16 x i16], with each of the
-///    16-bit integral vector elements set to the specified 16-bit integral
-///    value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
-///
-/// \param __w
-///    A 16-bit integral value used to initialize each vector element of the
-///    result.
-/// \returns An initialized 256-bit integer vector of [16 x i16].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi16(short __w)
-{
-  return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
-                          __w, __w, __w, __w, __w, __w, __w, __w);
-}
-
-/// Constructs a 256-bit integer vector of [32 x i8], with each of the
-///    8-bit integral vector elements set to the specified 8-bit integral value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
-///
-/// \param __b
-///    An 8-bit integral value used to initialize each vector element of the
-///    result.
-/// \returns An initialized 256-bit integer vector of [32 x i8].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi8(char __b)
-{
-  return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
-                         __b, __b, __b, __b, __b, __b, __b, __b,
-                         __b, __b, __b, __b, __b, __b, __b, __b,
-                         __b, __b, __b, __b, __b, __b, __b, __b);
-}
-
-/// Constructs a 256-bit integer vector of [4 x i64], with each of the
-///    64-bit integral vector elements set to the specified 64-bit integral
-///    value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
-///
-/// \param __q
-///    A 64-bit integral value used to initialize each vector element of the
-///    result.
-/// \returns An initialized 256-bit integer vector of [4 x i64].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi64x(long long __q)
-{
-  return _mm256_set_epi64x(__q, __q, __q, __q);
-}
-
-/* Create __zeroed vectors */
-/// Constructs a 256-bit floating-point vector of [4 x double] with all
-///    vector elements initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setzero_pd(void)
-{
-  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] with all
-///    vector elements initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setzero_ps(void)
-{
-  return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
-}
-
-/// Constructs a 256-bit integer vector initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \returns A 256-bit integer vector initialized to zero.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setzero_si256(void)
-{
-  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
-}
-
-/* Cast between vector types */
-/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
-///    floating-point vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [4 x double].
-/// \returns A 256-bit floating-point vector of [8 x float] containing the same
-///    bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_castpd_ps(__m256d __a)
-{
-  return (__m256)__a;
-}
-
-/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
-///    integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [4 x double].
-/// \returns A 256-bit integer vector containing the same bitwise pattern as the
-///    parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_castpd_si256(__m256d __a)
-{
-  return (__m256i)__a;
-}
-
-/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
-///    floating-point vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [8 x float].
-/// \returns A 256-bit floating-point vector of [4 x double] containing the same
-///    bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_castps_pd(__m256 __a)
-{
-  return (__m256d)__a;
-}
-
-/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
-///    integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [8 x float].
-/// \returns A 256-bit integer vector containing the same bitwise pattern as the
-///    parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_castps_si256(__m256 __a)
-{
-  return (__m256i)__a;
-}
-
-/// Casts a 256-bit integer vector into a 256-bit floating-point vector
-///    of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the same
-///    bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_castsi256_ps(__m256i __a)
-{
-  return (__m256)__a;
-}
-
-/// Casts a 256-bit integer vector into a 256-bit floating-point vector
-///    of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the same
-///    bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_castsi256_pd(__m256i __a)
-{
-  return (__m256d)__a;
-}
-
-/// Returns the lower 128 bits of a 256-bit floating-point vector of
-///    [4 x double] as a 128-bit floating-point vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [4 x double].
-/// \returns A 128-bit floating-point vector of [2 x double] containing the
-///    lower 128 bits of the parameter.
-static __inline __m128d __DEFAULT_FN_ATTRS
-_mm256_castpd256_pd128(__m256d __a)
-{
-  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
-}
-
-/// Returns the lower 128 bits of a 256-bit floating-point vector of
-///    [8 x float] as a 128-bit floating-point vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit floating-point vector of [8 x float].
-/// \returns A 128-bit floating-point vector of [4 x float] containing the
-///    lower 128 bits of the parameter.
-static __inline __m128 __DEFAULT_FN_ATTRS
-_mm256_castps256_ps128(__m256 __a)
-{
-  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
-}
-
-/// Truncates a 256-bit integer vector into a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 256-bit integer vector.
-/// \returns A 128-bit integer vector containing the lower 128 bits of the
-///    parameter.
-static __inline __m128i __DEFAULT_FN_ATTRS
-_mm256_castsi256_si128(__m256i __a)
-{
-  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] from a
-///    128-bit floating-point vector of [2 x double].
-///
-///    The lower 128 bits contain the value of the source vector. The contents
-///    of the upper 128 bits are undefined.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
-///    contain the value of the parameter. The contents of the upper 128 bits
-///    are undefined.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_castpd128_pd256(__m128d __a)
-{
-  return __builtin_shufflevector(
-      (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] from a
-///    128-bit floating-point vector of [4 x float].
-///
-///    The lower 128 bits contain the value of the source vector. The contents
-///    of the upper 128 bits are undefined.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
-///    contain the value of the parameter. The contents of the upper 128 bits
-///    are undefined.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_castps128_ps256(__m128 __a)
-{
-  return __builtin_shufflevector((__v4sf)__a,
-                                 (__v4sf)__builtin_nondeterministic_value(__a),
-                                 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/// Constructs a 256-bit integer vector from a 128-bit integer vector.
-///
-///    The lower 128 bits contain the value of the source vector. The contents
-///    of the upper 128 bits are undefined.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
-///    the parameter. The contents of the upper 128 bits are undefined.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_castsi128_si256(__m128i __a)
-{
-  return __builtin_shufflevector(
-      (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] from a
-///    128-bit floating-point vector of [2 x double]. The lower 128 bits
-///    contain the value of the source vector. The upper 128 bits are set
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
-///    contain the value of the parameter. The upper 128 bits are set to zero.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_zextpd128_pd256(__m128d __a)
-{
-  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] from a
-///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
-///    the value of the source vector. The upper 128 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
-///    contain the value of the parameter. The upper 128 bits are set to zero.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_zextps128_ps256(__m128 __a)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/// Constructs a 256-bit integer vector from a 128-bit integer vector.
-///    The lower 128 bits contain the value of the source vector. The upper
-///    128 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
-///    the parameter. The upper 128 bits are set to zero.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_zextsi128_si256(__m128i __a)
-{
-  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
-}
-
-/*
-   Vector insert.
-   We use macros rather than inlines because we only want to accept
-   invocations where the immediate M is a constant expression.
-*/
-/// Constructs a new 256-bit vector of [8 x float] by first duplicating
-///    a 256-bit vector of [8 x float] given in the first parameter, and then
-///    replacing either the upper or the lower 128 bits with the contents of a
-///    128-bit vector of [4 x float] in the second parameter.
-///
-///    The immediate integer parameter determines between the upper or the lower
-///    128 bits.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param V1
-///    A 256-bit vector of [8 x float]. This vector is copied to the result
-///    first, and then either the upper or the lower 128 bits of the result will
-///    be replaced by the contents of \a V2.
-/// \param V2
-///    A 128-bit vector of [4 x float]. The contents of this parameter are
-///    written to either the upper or the lower 128 bits of the result depending
-///    on the value of parameter \a M.
-/// \param M
-///    An immediate integer. The least significant bit determines how the values
-///    from the two parameters are interleaved: \n
-///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
-///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
-///    result. \n
-///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
-///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
-///    result.
-/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-#define _mm256_insertf128_ps(V1, V2, M) \
-  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
-                                            (__v4sf)(__m128)(V2), (int)(M)))
-
-/// Constructs a new 256-bit vector of [4 x double] by first duplicating
-///    a 256-bit vector of [4 x double] given in the first parameter, and then
-///    replacing either the upper or the lower 128 bits with the contents of a
-///    128-bit vector of [2 x double] in the second parameter.
-///
-///    The immediate integer parameter determines between the upper or the lower
-///    128 bits.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param V1
-///    A 256-bit vector of [4 x double]. This vector is copied to the result
-///    first, and then either the upper or the lower 128 bits of the result will
-///    be replaced by the contents of \a V2.
-/// \param V2
-///    A 128-bit vector of [2 x double]. The contents of this parameter are
-///    written to either the upper or the lower 128 bits of the result depending
-///    on the value of parameter \a M.
-/// \param M
-///    An immediate integer. The least significant bit determines how the values
-///    from the two parameters are interleaved: \n
-///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
-///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
-///    result. \n
-///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
-///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
-///    result.
-/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-#define _mm256_insertf128_pd(V1, V2, M) \
-  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
-                                             (__v2df)(__m128d)(V2), (int)(M)))
-
-/// Constructs a new 256-bit integer vector by first duplicating a
-///    256-bit integer vector given in the first parameter, and then replacing
-///    either the upper or the lower 128 bits with the contents of a 128-bit
-///    integer vector in the second parameter.
-///
-///    The immediate integer parameter determines between the upper or the lower
-///    128 bits.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param V1
-///    A 256-bit integer vector. This vector is copied to the result first, and
-///    then either the upper or the lower 128 bits of the result will be
-///    replaced by the contents of \a V2.
-/// \param V2
-///    A 128-bit integer vector. The contents of this parameter are written to
-///    either the upper or the lower 128 bits of the result depending on the
-///     value of parameter \a M.
-/// \param M
-///    An immediate integer. The least significant bit determines how the values
-///    from the two parameters are interleaved: \n
-///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
-///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
-///    result. \n
-///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
-///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
-///    result.
-/// \returns A 256-bit integer vector containing the interleaved values.
-#define _mm256_insertf128_si256(V1, V2, M) \
-  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
-                                             (__v4si)(__m128i)(V2), (int)(M)))
-
-/*
-   Vector extract.
-   We use macros rather than inlines because we only want to accept
-   invocations where the immediate M is a constant expression.
-*/
-/// Extracts either the upper or the lower 128 bits from a 256-bit vector
-///    of [8 x float], as determined by the immediate integer parameter, and
-///    returns the extracted bits as a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [8 x float].
-/// \param M
-///    An immediate integer. The least significant bit determines which bits are
-///    extracted from the first parameter: \n
-///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
-///    result. \n
-///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
-/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
-#define _mm256_extractf128_ps(V, M) \
-  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
-
-/// Extracts either the upper or the lower 128 bits from a 256-bit vector
-///    of [4 x double], as determined by the immediate integer parameter, and
-///    returns the extracted bits as a 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
-///
-/// \param V
-///    A 256-bit vector of [4 x double].
-/// \param M
-///    An immediate integer. The least significant bit determines which bits are
-///    extracted from the first parameter: \n
-///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
-///    result. \n
-///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
-/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
-#define _mm256_extractf128_pd(V, M) \
-  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
-
-/// Extracts either the upper or the lower 128 bits from a 256-bit
-///    integer vector, as determined by the immediate integer parameter, and
-///    returns the extracted bits as a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
-///
-/// \param V
-///    A 256-bit integer vector.
-/// \param M
-///    An immediate integer. The least significant bit determines which bits are
-///    extracted from the first parameter:  \n
-///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
-///    result. \n
-///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
-/// \returns A 128-bit integer vector containing the extracted bits.
-#define _mm256_extractf128_si256(V, M) \
-  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
-
-/// Constructs a 256-bit floating-point vector of [8 x float] by
-///    concatenating two 128-bit floating-point vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __hi
-///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
-///    128 bits of the result.
-/// \param __lo
-///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
-///    128 bits of the result.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the
-///    concatenated result.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set_m128 (__m128 __hi, __m128 __lo)
-{
-  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] by
-///    concatenating two 128-bit floating-point vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __hi
-///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
-///    128 bits of the result.
-/// \param __lo
-///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
-///    128 bits of the result.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the
-///    concatenated result.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set_m128d (__m128d __hi, __m128d __lo)
-{
-  return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit integer vector by concatenating two 128-bit
-///    integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __hi
-///    A 128-bit integer vector to be copied to the upper 128 bits of the
-///    result.
-/// \param __lo
-///    A 128-bit integer vector to be copied to the lower 128 bits of the
-///    result.
-/// \returns A 256-bit integer vector containing the concatenated result.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_m128i (__m128i __hi, __m128i __lo)
-{
-  return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] by
-///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
-///    similar to _mm256_set_m128, but the order of the input parameters is
-///    swapped.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __lo
-///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
-///    128 bits of the result.
-/// \param __hi
-///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
-///    128 bits of the result.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the
-///    concatenated result.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setr_m128 (__m128 __lo, __m128 __hi)
-{
-  return _mm256_set_m128(__hi, __lo);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] by
-///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
-///    similar to _mm256_set_m128d, but the order of the input parameters is
-///    swapped.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __lo
-///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
-///    128 bits of the result.
-/// \param __hi
-///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
-///    128 bits of the result.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the
-///    concatenated result.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setr_m128d (__m128d __lo, __m128d __hi)
-{
-  return (__m256d)_mm256_set_m128d(__hi, __lo);
-}
-
-/// Constructs a 256-bit integer vector by concatenating two 128-bit
-///    integer vectors. This is similar to _mm256_set_m128i, but the order of
-///    the input parameters is swapped.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __lo
-///    A 128-bit integer vector to be copied to the lower 128 bits of the
-///    result.
-/// \param __hi
-///    A 128-bit integer vector to be copied to the upper 128 bits of the
-///    result.
-/// \returns A 256-bit integer vector containing the concatenated result.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_m128i (__m128i __lo, __m128i __hi)
-{
-  return (__m256i)_mm256_set_m128i(__hi, __lo);
-}
-
-/* SIMD load ops (unaligned) */
-/// Loads two 128-bit floating-point vectors of [4 x float] from
-///    unaligned memory locations and constructs a 256-bit floating-point vector
-///    of [8 x float] by concatenating the two 128-bit vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-///   <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-///    A pointer to a 128-bit memory location containing 4 consecutive
-///    single-precision floating-point values. These values are to be copied to
-///    bits[255:128] of the result. The address of the memory location does not
-///    have to be aligned.
-/// \param __addr_lo
-///    A pointer to a 128-bit memory location containing 4 consecutive
-///    single-precision floating-point values. These values are to be copied to
-///    bits[127:0] of the result. The address of the memory location does not
-///    have to be aligned.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the
-///    concatenated result.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
-{
-  return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
-}
-
-/// Loads two 128-bit floating-point vectors of [2 x double] from
-///    unaligned memory locations and constructs a 256-bit floating-point vector
-///    of [4 x double] by concatenating the two 128-bit vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-///   <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-///    A pointer to a 128-bit memory location containing two consecutive
-///    double-precision floating-point values. These values are to be copied to
-///    bits[255:128] of the result. The address of the memory location does not
-///    have to be aligned.
-/// \param __addr_lo
-///    A pointer to a 128-bit memory location containing two consecutive
-///    double-precision floating-point values. These values are to be copied to
-///    bits[127:0] of the result. The address of the memory location does not
-///    have to be aligned.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the
-///    concatenated result.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
-{
-  return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
-}
-
-/// Loads two 128-bit integer vectors from unaligned memory locations and
-///    constructs a 256-bit integer vector by concatenating the two 128-bit
-///    vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-///   <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-///    A pointer to a 128-bit memory location containing a 128-bit integer
-///    vector. This vector is to be copied to bits[255:128] of the result. The
-///    address of the memory location does not have to be aligned.
-/// \param __addr_lo
-///    A pointer to a 128-bit memory location containing a 128-bit integer
-///    vector. This vector is to be copied to bits[127:0] of the result. The
-///    address of the memory location does not have to be aligned.
-/// \returns A 256-bit integer vector containing the concatenated result.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
-{
-   return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
-}
-
-/* SIMD store ops (unaligned) */
-/// Stores the upper and lower 128 bits of a 256-bit floating-point
-///    vector of [8 x float] into two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-///   store instructions.
-///
-/// \param __addr_hi
-///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-///    copied to this memory location. The address of this memory location does
-///    not have to be aligned.
-/// \param __addr_lo
-///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-///    copied to this memory location. The address of this memory location does
-///    not have to be aligned.
-/// \param __a
-///    A 256-bit floating-point vector of [8 x float].
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
-{
-  __m128 __v128;
-
-  __v128 = _mm256_castps256_ps128(__a);
-  _mm_storeu_ps(__addr_lo, __v128);
-  __v128 = _mm256_extractf128_ps(__a, 1);
-  _mm_storeu_ps(__addr_hi, __v128);
-}
-
-/// Stores the upper and lower 128 bits of a 256-bit floating-point
-///    vector of [4 x double] into two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-///   store instructions.
-///
-/// \param __addr_hi
-///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-///    copied to this memory location. The address of this memory location does
-///    not have to be aligned.
-/// \param __addr_lo
-///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-///    copied to this memory location. The address of this memory location does
-///    not have to be aligned.
-/// \param __a
-///    A 256-bit floating-point vector of [4 x double].
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
-{
-  __m128d __v128;
-
-  __v128 = _mm256_castpd256_pd128(__a);
-  _mm_storeu_pd(__addr_lo, __v128);
-  __v128 = _mm256_extractf128_pd(__a, 1);
-  _mm_storeu_pd(__addr_hi, __v128);
-}
-
-/// Stores the upper and lower 128 bits of a 256-bit integer vector into
-///    two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-///   store instructions.
-///
-/// \param __addr_hi
-///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-///    copied to this memory location. The address of this memory location does
-///    not have to be aligned.
-/// \param __addr_lo
-///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-///    copied to this memory location. The address of this memory location does
-///    not have to be aligned.
-/// \param __a
-///    A 256-bit integer vector.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
-{
-  __m128i __v128;
-
-  __v128 = _mm256_castsi256_si128(__a);
-  _mm_storeu_si128(__addr_lo, __v128);
-  __v128 = _mm256_extractf128_si256(__a, 1);
-  _mm_storeu_si128(__addr_hi, __v128);
-}
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS128
-
-#endif /* __AVXINTRIN_H */
diff --git a/third_party/intel/clang/avxneconvertintrin.h b/third_party/intel/clang/avxneconvertintrin.h
deleted file mode 100644
index 1bef1c893..000000000
--- a/third_party/intel/clang/avxneconvertintrin.h
+++ /dev/null
@@ -1,484 +0,0 @@
-/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifdef __SSE2__
-
-#ifndef __AVXNECONVERTINTRIN_H
-#define __AVXNECONVERTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
-                 __min_vector_width__(256)))
-
-/// Convert scalar BF16 (16-bit) floating-point element
-/// stored at memory locations starting at location \a __A to a
-/// single-precision (32-bit) floating-point, broadcast it to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_bcstnebf16_ps(const void *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
-///
-/// \param __A
-///    A pointer to a 16-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns
-///    A 128-bit vector of [4 x float].
-///
-/// \code{.operation}
-/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
-/// FOR j := 0 to 3
-///   m := j*32
-///   dst[m+31:m] := b
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_bcstnebf16_ps(const void *__A) {
-  return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
-}
-
-/// Convert scalar BF16 (16-bit) floating-point element
-/// stored at memory locations starting at location \a __A to a
-/// single-precision (32-bit) floating-point, broadcast it to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_bcstnebf16_ps(const void *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
-///
-/// \param __A
-///    A pointer to a 16-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns
-///    A 256-bit vector of [8 x float].
-///
-/// \code{.operation}
-/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
-/// FOR j := 0 to 7
-///   m := j*32
-///   dst[m+31:m] := b
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_bcstnebf16_ps(const void *__A) {
-  return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
-}
-
-/// Convert scalar half-precision (16-bit) floating-point element
-/// stored at memory locations starting at location \a __A to a
-/// single-precision (32-bit) floating-point, broadcast it to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_bcstnesh_ps(const void *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
-///
-/// \param __A
-///    A pointer to a 16-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns
-///    A 128-bit vector of [4 x float].
-///
-/// \code{.operation}
-/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
-/// FOR j := 0 to 3
-///   m := j*32
-///   dst[m+31:m] := b
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_bcstnesh_ps(const void *__A) {
-  return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
-}
-
-/// Convert scalar half-precision (16-bit) floating-point element
-/// stored at memory locations starting at location \a __A to a
-/// single-precision (32-bit) floating-point, broadcast it to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_bcstnesh_ps(const void *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
-///
-/// \param __A
-///    A pointer to a 16-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns
-///    A 256-bit vector of [8 x float].
-///
-/// \code{.operation}
-/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
-/// FOR j := 0 to 7
-///   m := j*32
-///   dst[m+31:m] := b
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_bcstnesh_ps(const void *__A) {
-  return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
-}
-
-/// Convert packed BF16 (16-bit) floating-point even-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_cvtneebf16_ps(const __m128bh *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
-///
-/// \param __A
-///    A pointer to a 128-bit memory location containing 8 consecutive
-///    BF16 (16-bit) floating-point values.
-/// \returns
-///    A 128-bit vector of [4 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	k := j*2
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtneebf16_ps(const __m128bh *__A) {
-  return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
-}
-
-/// Convert packed BF16 (16-bit) floating-point even-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_cvtneebf16_ps(const __m256bh *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
-///
-/// \param __A
-///    A pointer to a 256-bit memory location containing 16 consecutive
-///    BF16 (16-bit) floating-point values.
-/// \returns
-///    A 256-bit vector of [8 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	k := j*2
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtneebf16_ps(const __m256bh *__A) {
-  return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
-}
-
-/// Convert packed half-precision (16-bit) floating-point even-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_cvtneeph_ps(const __m128h *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
-///
-/// \param __A
-///    A pointer to a 128-bit memory location containing 8 consecutive
-///    half-precision (16-bit) floating-point values.
-/// \returns
-///    A 128-bit vector of [4 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	k := j*2
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtneeph_ps(const __m128h *__A) {
-  return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
-}
-
-/// Convert packed half-precision (16-bit) floating-point even-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_cvtneeph_ps(const __m256h *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
-///
-/// \param __A
-///    A pointer to a 256-bit memory location containing 16 consecutive
-///    half-precision (16-bit) floating-point values.
-/// \returns
-///    A 256-bit vector of [8 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	k := j*2
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtneeph_ps(const __m256h *__A) {
-  return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
-}
-
-/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_cvtneobf16_ps(const __m128bh *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
-///
-/// \param __A
-///    A pointer to a 128-bit memory location containing 8 consecutive
-///    BF16 (16-bit) floating-point values.
-/// \returns
-///    A 128-bit vector of [4 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	k := j*2+1
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtneobf16_ps(const __m128bh *__A) {
-  return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
-}
-
-/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_cvtneobf16_ps(const __m256bh *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
-///
-/// \param __A
-///    A pointer to a 256-bit memory location containing 16 consecutive
-///    BF16 (16-bit) floating-point values.
-/// \returns
-///    A 256-bit vector of [8 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	k := j*2+1
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtneobf16_ps(const __m256bh *__A) {
-  return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
-}
-
-/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_cvtneoph_ps(const __m128h *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
-///
-/// \param __A
-///    A pointer to a 128-bit memory location containing 8 consecutive
-///    half-precision (16-bit) floating-point values.
-/// \returns
-///    A 128-bit vector of [4 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	k := j*2+1
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtneoph_ps(const __m128h *__A) {
-  return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
-}
-
-/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
-/// stored at memory locations starting at location \a __A to packed
-/// single-precision (32-bit) floating-point elements, and store the results in
-/// \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_cvtneoph_ps(const __m256h *__A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
-///
-/// \param __A
-///    A pointer to a 256-bit memory location containing 16 consecutive
-///    half-precision (16-bit) floating-point values.
-/// \returns
-///    A 256-bit vector of [8 x float].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	k := j*2+1
-/// 	i := k*16
-/// 	m := j*32
-/// 	dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtneoph_ps(const __m256h *__A) {
-  return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
-}
-
-/// Convert packed single-precision (32-bit) floating-point elements in \a __A
-/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
-/// dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_cvtneps_avx_pbh(__m128 __A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float].
-/// \returns
-///    A 128-bit vector of [8 x bfloat].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_cvtneps_avx_pbh(__m128 __A) {
-  return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
-}
-
-/// Convert packed single-precision (32-bit) floating-point elements in \a __A
-/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
-/// dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_cvtneps_avx_pbh(__m256 __A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float].
-/// \returns
-///    A 128-bit vector of [8 x bfloat].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_cvtneps_avx_pbh(__m256 __A) {
-  return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif // __AVXNECONVERTINTRIN_H
-#endif // __SSE2__
diff --git a/third_party/intel/clang/avxvnniint16intrin.h b/third_party/intel/clang/avxvnniint16intrin.h
deleted file mode 100644
index e4d342a8b..000000000
--- a/third_party/intel/clang/avxvnniint16intrin.h
+++ /dev/null
@@ -1,473 +0,0 @@
-/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __AVXVNNIINT16INTRIN_H
-#define __AVXVNNIINT16INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
-                 __min_vector_width__(256)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUD instruction.
-///
-/// \param __W
-///    A 128-bit vector of [4 x int].
-/// \param __A
-///    A 128-bit vector of [8 x short].
-/// \param __B
-///    A 128-bit vector of [8 x unsigned short].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUD instruction.
-///
-/// \param __W
-///    A 256-bit vector of [8 x int].
-/// \param __A
-///    A 256-bit vector of [16 x short].
-/// \param __B
-///    A 256-bit vector of [16 x unsigned short].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
-///
-/// \param __W
-///    A 128-bit vector of [4 x int].
-/// \param __A
-///    A 128-bit vector of [8 x short].
-/// \param __B
-///    A 128-bit vector of [8 x unsigned short].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
-///
-/// \param __W
-///    A 256-bit vector of [8 x int].
-/// \param __A
-///    A 256-bit vector of [16 x short].
-/// \param __B
-///    A 256-bit vector of [16 x unsigned short].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWUSD instruction.
-///
-/// \param __W
-///    A 128-bit vector of [4 x int].
-/// \param __A
-///    A 128-bit vector of [8 x unsigned short].
-/// \param __B
-///    A 128-bit vector of [8 x short].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWUSD instruction.
-///
-/// \param __W
-///    A 256-bit vector of [8 x int].
-/// \param __A
-///    A 256-bit vector of [16 x unsigned short].
-/// \param __B
-///    A 256-bit vector of [16 x short].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
-///
-/// \param __W
-///    A 128-bit vector of [4 x int].
-/// \param __A
-///    A 128-bit vector of [8 x unsigned short].
-/// \param __B
-///    A 128-bit vector of [8 x short].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
-///
-/// \param __W
-///    A 256-bit vector of [8 x int].
-/// \param __A
-///    A 256-bit vector of [16 x unsigned short].
-/// \param __B
-///    A 256-bit vector of [16 x short].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWUUD instruction.
-///
-/// \param __W
-///    A 128-bit vector of [4 x unsigned int].
-/// \param __A
-///    A 128-bit vector of [8 x unsigned short].
-/// \param __B
-///    A 128-bit vector of [8 x unsigned short].
-/// \returns
-///    A 128-bit vector of [4 x unsigned int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWUUD instruction.
-///
-/// \param __W
-///    A 256-bit vector of [8 x unsigned int].
-/// \param __A
-///    A 256-bit vector of [16 x unsigned short].
-/// \param __B
-///    A 256-bit vector of [16 x unsigned short].
-/// \returns
-///    A 256-bit vector of [8 x unsigned int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
-///
-/// \param __W
-///    A 128-bit vector of [4 x unsigned int].
-/// \param __A
-///    A 128-bit vector of [8 x unsigned short].
-/// \param __B
-///    A 128-bit vector of [8 x unsigned short].
-/// \returns
-///    A 128-bit vector of [4 x unsigned int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
-///
-/// \param __W
-///    A 256-bit vector of [8 x unsigned int].
-/// \param __A
-///    A 256-bit vector of [16 x unsigned short].
-/// \param __B
-///    A 256-bit vector of [16 x unsigned short].
-/// \returns
-///    A 256-bit vector of [8 x unsigned int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif // __AVXVNNIINT16INTRIN_H
diff --git a/third_party/intel/clang/avxvnniint8intrin.h b/third_party/intel/clang/avxvnniint8intrin.h
deleted file mode 100644
index b0b6cb853..000000000
--- a/third_party/intel/clang/avxvnniint8intrin.h
+++ /dev/null
@@ -1,471 +0,0 @@
-/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error                                                                         \
-    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVXVNNIINT8INTRIN_H
-#define __AVXVNNIINT8INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
-                 __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
-                 __min_vector_width__(128)))
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [16 x char].
-/// \param __B
-///    A 128-bit vector of [16 x char].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
-/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [32 x char].
-/// \param __B
-///    A 256-bit vector of [32 x char].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
-/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [16 x char].
-/// \param __B
-///    A 128-bit vector of [16 x char].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
-/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [32 x char].
-/// \param __B
-///    A 256-bit vector of [32 x char].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
-/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [16 x char].
-/// \param __B
-///    A 128-bit vector of [16 x unsigned char].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
-/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
-/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
-/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [32 x char].
-/// \param __B
-///    A 256-bit vector of [32 x unsigned char].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
-/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
-/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
-/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [16 x char].
-/// \param __B
-///    A 128-bit vector of [16 x unsigned char].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
-/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
-/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
-/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [32 x char].
-/// \param __B
-///    A 256-bit vector of [32 x unsigned char].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
-/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
-/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
-/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
-/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [16 x unsigned char].
-/// \param __B
-///    A 128-bit vector of [16 x unsigned char].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
-/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBSSD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [32 x unsigned char].
-/// \param __B
-///    A 256-bit vector of [32 x unsigned char].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
-/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [16 x unsigned char].
-/// \param __B
-///    A 128-bit vector of [16 x unsigned char].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 3
-/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
-/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
-///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
-///    signed 16-bit results. Sum these 4 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
-///
-/// \param __A
-///    A 256-bit vector of [32 x unsigned char].
-/// \param __B
-///    A 256-bit vector of [32 x unsigned char].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// FOR j := 0 to 7
-/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
-/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
-/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
-/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
-/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif // __AVXVNNIINT8INTRIN_H
diff --git a/third_party/intel/clang/avxvnniintrin.h b/third_party/intel/clang/avxvnniintrin.h
deleted file mode 100644
index b7de562b5..000000000
--- a/third_party/intel/clang/avxvnniintrin.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVXVNNIINTRIN_H
-#define __AVXVNNIINTRIN_H
-
-/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
-/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
-
-/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
-///  and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 7
-///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
-///    ENDFOR
-///    DST[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
-/// and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \code{.operation}
-///    FOR j := 0 to 3
-///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
-///    ENDFOR
-///    DST[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif // __AVXVNNIINTRIN_H
diff --git a/third_party/intel/clang/bmi2intrin.h b/third_party/intel/clang/bmi2intrin.h
deleted file mode 100644
index f0a3343be..000000000
--- a/third_party/intel/clang/bmi2intrin.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <bmi2intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __BMI2INTRIN_H
-#define __BMI2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
-
-/// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits
-///    starting at bit number \a __Y.
-///
-/// \code{.operation}
-/// i := __Y[7:0]
-/// result := __X
-/// IF i < 32
-///   result[31:i] := 0
-/// FI
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c BZHI instruction.
-///
-/// \param __X
-///    The 32-bit source value to copy.
-/// \param __Y
-///    The lower 8 bits specify the bit number of the lowest bit to zero.
-/// \returns The partially zeroed 32-bit value.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_bzhi_u32(unsigned int __X, unsigned int __Y)
-{
-  return __builtin_ia32_bzhi_si(__X, __Y);
-}
-
-/// Deposit (scatter) low-order bits from the unsigned 32-bit integer \a __X
-///    into the 32-bit result, according to the mask in the unsigned 32-bit
-///    integer \a __Y. All other bits of the result are zero.
-///
-/// \code{.operation}
-/// i := 0
-/// result := 0
-/// FOR m := 0 TO 31
-///   IF __Y[m] == 1
-///     result[m] := __X[i]
-///     i := i + 1
-///   ENDIF
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c PDEP instruction.
-///
-/// \param __X
-///    The 32-bit source value to copy.
-/// \param __Y
-///    The 32-bit mask specifying where to deposit source bits.
-/// \returns The 32-bit result.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_pdep_u32(unsigned int __X, unsigned int __Y)
-{
-  return __builtin_ia32_pdep_si(__X, __Y);
-}
-
-/// Extract (gather) bits from the unsigned 32-bit integer \a __X into the
-///    low-order bits of the 32-bit result, according to the mask in the
-///    unsigned 32-bit integer \a __Y. All other bits of the result are zero.
-///
-/// \code{.operation}
-/// i := 0
-/// result := 0
-/// FOR m := 0 TO 31
-///   IF __Y[m] == 1
-///     result[i] := __X[m]
-///     i := i + 1
-///   ENDIF
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c PEXT instruction.
-///
-/// \param __X
-///    The 32-bit source value to copy.
-/// \param __Y
-///    The 32-bit mask specifying which source bits to extract.
-/// \returns The 32-bit result.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_pext_u32(unsigned int __X, unsigned int __Y)
-{
-  return __builtin_ia32_pext_si(__X, __Y);
-}
-
-/// Multiplies the unsigned 32-bit integers \a __X and \a __Y to form a
-///    64-bit product. Stores the upper 32 bits of the product in the
-///    memory at \a __P and returns the lower 32 bits.
-///
-/// \code{.operation}
-/// Store32(__P, (__X * __Y)[63:32])
-/// result := (__X * __Y)[31:0]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c MULX instruction.
-///
-/// \param __X
-///    An unsigned 32-bit multiplicand.
-/// \param __Y
-///    An unsigned 32-bit multiplicand.
-/// \param __P
-///    A pointer to memory for storing the upper half of the product.
-/// \returns The lower half of the product.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
-{
-  unsigned long long __res = (unsigned long long) __X * __Y;
-  *__P = (unsigned int)(__res >> 32);
-  return (unsigned int)__res;
-}
-
-#ifdef  __x86_64__
-
-/// Copies the unsigned 64-bit integer \a __X and zeroes the upper bits
-///    starting at bit number \a __Y.
-///
-/// \code{.operation}
-/// i := __Y[7:0]
-/// result := __X
-/// IF i < 64
-///   result[63:i] := 0
-/// FI
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c BZHI instruction.
-///
-/// \param __X
-///    The 64-bit source value to copy.
-/// \param __Y
-///    The lower 8 bits specify the bit number of the lowest bit to zero.
-/// \returns The partially zeroed 64-bit value.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_bzhi_u64(unsigned long long __X, unsigned long long __Y)
-{
-  return __builtin_ia32_bzhi_di(__X, __Y);
-}
-
-/// Deposit (scatter) low-order bits from the unsigned 64-bit integer \a __X
-///    into the 64-bit result, according to the mask in the unsigned 64-bit
-///    integer \a __Y. All other bits of the result are zero.
-///
-/// \code{.operation}
-/// i := 0
-/// result := 0
-/// FOR m := 0 TO 63
-///   IF __Y[m] == 1
-///     result[m] := __X[i]
-///     i := i + 1
-///   ENDIF
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c PDEP instruction.
-///
-/// \param __X
-///    The 64-bit source value to copy.
-/// \param __Y
-///    The 64-bit mask specifying where to deposit source bits.
-/// \returns The 64-bit result.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_pdep_u64(unsigned long long __X, unsigned long long __Y)
-{
-  return __builtin_ia32_pdep_di(__X, __Y);
-}
-
-/// Extract (gather) bits from the unsigned 64-bit integer \a __X into the
-///    low-order bits of the 64-bit result, according to the mask in the
-///    unsigned 64-bit integer \a __Y. All other bits of the result are zero.
-///
-/// \code{.operation}
-/// i := 0
-/// result := 0
-/// FOR m := 0 TO 63
-///   IF __Y[m] == 1
-///     result[i] := __X[m]
-///     i := i + 1
-///   ENDIF
-/// ENDFOR
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c PEXT instruction.
-///
-/// \param __X
-///    The 64-bit source value to copy.
-/// \param __Y
-///    The 64-bit mask specifying which source bits to extract.
-/// \returns The 64-bit result.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_pext_u64(unsigned long long __X, unsigned long long __Y)
-{
-  return __builtin_ia32_pext_di(__X, __Y);
-}
-
-/// Multiplies the unsigned 64-bit integers \a __X and \a __Y to form a
-///    128-bit product. Stores the upper 64 bits of the product to the
-///    memory addressed by \a __P and returns the lower 64 bits.
-///
-/// \code{.operation}
-/// Store64(__P, (__X * __Y)[127:64])
-/// result := (__X * __Y)[63:0]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c MULX instruction.
-///
-/// \param __X
-///    An unsigned 64-bit multiplicand.
-/// \param __Y
-///    An unsigned 64-bit multiplicand.
-/// \param __P
-///    A pointer to memory for storing the upper half of the product.
-/// \returns The lower half of the product.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_mulx_u64 (unsigned long long __X, unsigned long long __Y,
-	   unsigned long long *__P)
-{
-  unsigned __int128 __res = (unsigned __int128) __X * __Y;
-  *__P = (unsigned long long) (__res >> 64);
-  return (unsigned long long) __res;
-}
-
-#endif /* __x86_64__  */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __BMI2INTRIN_H */
diff --git a/third_party/intel/clang/bmiintrin.h b/third_party/intel/clang/bmiintrin.h
deleted file mode 100644
index 78bffe68e..000000000
--- a/third_party/intel/clang/bmiintrin.h
+++ /dev/null
@@ -1,614 +0,0 @@
-/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __BMIINTRIN_H
-#define __BMIINTRIN_H
-
-/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
-   instruction behaves as BSF on non-BMI targets, there is code that expects
-   to use it as a potentially faster version of BSF. */
-#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 16-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 16-bit integer containing the number of trailing zero
-///    bits in the operand.
-/// \see _tzcnt_u16
-static __inline__ unsigned short __RELAXED_FN_ATTRS
-__tzcnt_u16(unsigned short __X)
-{
-  return __builtin_ia32_tzcnt_u16(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned short _tzcnt_u16(unsigned short __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 16-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 16-bit integer containing the number of trailing zero
-///    bits in the operand.
-/// \see __tzcnt_u16
-#define _tzcnt_u16 __tzcnt_u16
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of trailing zero
-///    bits in the operand.
-/// \see { _mm_tzcnt_32 _tzcnt_u32 }
-static __inline__ unsigned int __RELAXED_FN_ATTRS
-__tzcnt_u32(unsigned int __X)
-{
-  return __builtin_ia32_tzcnt_u32(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns A 32-bit integer containing the number of trailing zero bits in
-///    the operand.
-/// \see { __tzcnt_u32 _tzcnt_u32 }
-static __inline__ int __RELAXED_FN_ATTRS
-_mm_tzcnt_32(unsigned int __X)
-{
-  return (int)__builtin_ia32_tzcnt_u32(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _tzcnt_u32(unsigned int __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of trailing zero
-///    bits in the operand.
-/// \see { _mm_tzcnt_32 __tzcnt_u32 }
-#define _tzcnt_u32 __tzcnt_u32
-
-#ifdef __x86_64__
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of trailing zero
-///    bits in the operand.
-/// \see { _mm_tzcnt_64 _tzcnt_u64 }
-static __inline__ unsigned long long __RELAXED_FN_ATTRS
-__tzcnt_u64(unsigned long long __X)
-{
-  return __builtin_ia32_tzcnt_u64(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An 64-bit integer containing the number of trailing zero bits in
-///    the operand.
-/// \see { __tzcnt_u64 _tzcnt_u64 }
-static __inline__ long long __RELAXED_FN_ATTRS
-_mm_tzcnt_64(unsigned long long __X)
-{
-  return (long long)__builtin_ia32_tzcnt_u64(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _tzcnt_u64(unsigned long long __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TZCNT instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of trailing zero
-///    bits in the operand.
-/// \see { _mm_tzcnt_64 __tzcnt_u64
-#define _tzcnt_u64 __tzcnt_u64
-
-#endif /* __x86_64__ */
-
-#undef __RELAXED_FN_ATTRS
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
-
-/// Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ANDN instruction.
-///
-/// \param __X
-///    An unsigned integer containing one of the operands.
-/// \param __Y
-///    An unsigned integer containing one of the operands.
-/// \returns An unsigned integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
-/// \see _andn_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__andn_u32(unsigned int __X, unsigned int __Y)
-{
-  return ~__X & __Y;
-}
-
-/// Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _andn_u32(unsigned int __X, unsigned int __Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ANDN instruction.
-///
-/// \param __X
-///    An unsigned integer containing one of the operands.
-/// \param __Y
-///    An unsigned integer containing one of the operands.
-/// \returns An unsigned integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
-/// \see __andn_u32
-#define _andn_u32 __andn_u32
-
-/* AMD-specified, double-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-///    in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BEXTR instruction.
-///
-/// \param __X
-///    An unsigned integer whose bits are to be extracted.
-/// \param __Y
-///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
-///    specify the index of the least significant bit. Bits [15:8] specify the
-///    number of bits to be extracted.
-/// \returns An unsigned integer whose least significant bits contain the
-///    extracted bits.
-/// \see _bextr_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__bextr_u32(unsigned int __X, unsigned int __Y)
-{
-  return __builtin_ia32_bextr_u32(__X, __Y);
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-///    in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BEXTR instruction.
-///
-/// \param __X
-///    An unsigned integer whose bits are to be extracted.
-/// \param __Y
-///    An unsigned integer used to specify the index of the least significant
-///    bit for the bits to be extracted. Bits [7:0] specify the index.
-/// \param __Z
-///    An unsigned integer used to specify the number of bits to be extracted.
-///    Bits [7:0] specify the number of bits.
-/// \returns An unsigned integer whose least significant bits contain the
-///    extracted bits.
-/// \see __bextr_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
-{
-  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR2 */
-/// Extracts the specified bits from the first operand and returns them
-///    in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BEXTR instruction.
-///
-/// \param __X
-///    An unsigned integer whose bits are to be extracted.
-/// \param __Y
-///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
-///    specify the index of the least significant bit. Bits [15:8] specify the
-///    number of bits to be extracted.
-/// \returns An unsigned integer whose least significant bits contain the
-///    extracted bits.
-/// \see __bextr_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_bextr2_u32(unsigned int __X, unsigned int __Y) {
-  return __builtin_ia32_bextr_u32(__X, __Y);
-}
-
-/// Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BLSI instruction.
-///
-/// \param __X
-///    An unsigned integer whose bits are to be cleared.
-/// \returns An unsigned integer containing the result of clearing the bits from
-///    the source operand.
-/// \see _blsi_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsi_u32(unsigned int __X)
-{
-  return __X & -__X;
-}
-
-/// Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsi_u32(unsigned int __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSI instruction.
-///
-/// \param __X
-///    An unsigned integer whose bits are to be cleared.
-/// \returns An unsigned integer containing the result of clearing the bits from
-///    the source operand.
-/// \see __blsi_u32
-#define _blsi_u32 __blsi_u32
-
-/// Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
-///
-/// \param __X
-///    An unsigned integer used to create the mask.
-/// \returns An unsigned integer containing the newly created mask.
-/// \see _blsmsk_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsmsk_u32(unsigned int __X)
-{
-  return __X ^ (__X - 1);
-}
-
-/// Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsmsk_u32(unsigned int __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
-///
-/// \param __X
-///    An unsigned integer used to create the mask.
-/// \returns An unsigned integer containing the newly created mask.
-/// \see __blsmsk_u32
-#define _blsmsk_u32 __blsmsk_u32
-
-/// Clears the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BLSR instruction.
-///
-/// \param __X
-///    An unsigned integer containing the operand to be cleared.
-/// \returns An unsigned integer containing the result of clearing the source
-///    operand.
-/// \see _blsr_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsr_u32(unsigned int __X)
-{
-  return __X & (__X - 1);
-}
-
-/// Clears the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _bls4_u32(unsigned int __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSR instruction.
-///
-/// \param __X
-///    An unsigned integer containing the operand to be cleared.
-/// \returns An unsigned integer containing the result of clearing the source
-///    operand.
-/// \see __blsr_u32
-#define _blsr_u32 __blsr_u32
-
-#ifdef __x86_64__
-
-/// Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ANDN instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer containing one of the operands.
-/// \param __Y
-///    An unsigned 64-bit integer containing one of the operands.
-/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
-/// \see _andn_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__andn_u64 (unsigned long long __X, unsigned long long __Y)
-{
-  return ~__X & __Y;
-}
-
-/// Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _andn_u64(unsigned long long __X,
-///                              unsigned long long __Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ANDN instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer containing one of the operands.
-/// \param __Y
-///    An unsigned 64-bit integer containing one of the operands.
-/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
-/// \see __andn_u64
-#define _andn_u64 __andn_u64
-
-/* AMD-specified, double-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-///    in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BEXTR instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose bits are to be extracted.
-/// \param __Y
-///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
-///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
-///    the number of bits to be extracted.
-/// \returns An unsigned 64-bit integer whose least significant bits contain the
-///    extracted bits.
-/// \see _bextr_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__bextr_u64(unsigned long long __X, unsigned long long __Y)
-{
-  return __builtin_ia32_bextr_u64(__X, __Y);
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-///     in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BEXTR instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose bits are to be extracted.
-/// \param __Y
-///    An unsigned integer used to specify the index of the least significant
-///    bit for the bits to be extracted. Bits [7:0] specify the index.
-/// \param __Z
-///    An unsigned integer used to specify the number of bits to be extracted.
-///    Bits [7:0] specify the number of bits.
-/// \returns An unsigned 64-bit integer whose least significant bits contain the
-///    extracted bits.
-/// \see __bextr_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
-{
-  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR2 */
-/// Extracts the specified bits from the first operand and returns them
-///    in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BEXTR instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose bits are to be extracted.
-/// \param __Y
-///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
-///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
-///    the number of bits to be extracted.
-/// \returns An unsigned 64-bit integer whose least significant bits contain the
-///    extracted bits.
-/// \see __bextr_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
-  return __builtin_ia32_bextr_u64(__X, __Y);
-}
-
-/// Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BLSI instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose bits are to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    bits from the source operand.
-/// \see _blsi_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsi_u64(unsigned long long __X)
-{
-  return __X & -__X;
-}
-
-/// Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsi_u64(unsigned long long __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSI instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose bits are to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    bits from the source operand.
-/// \see __blsi_u64
-#define _blsi_u64 __blsi_u64
-
-/// Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer used to create the mask.
-/// \returns An unsigned 64-bit integer containing the newly created mask.
-/// \see _blsmsk_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsmsk_u64(unsigned long long __X)
-{
-  return __X ^ (__X - 1);
-}
-
-/// Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsmsk_u64(unsigned long long __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer used to create the mask.
-/// \returns An unsigned 64-bit integer containing the newly created mask.
-/// \see __blsmsk_u64
-#define _blsmsk_u64 __blsmsk_u64
-
-/// Clears the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BLSR instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer containing the operand to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    source operand.
-/// \see _blsr_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsr_u64(unsigned long long __X)
-{
-  return __X & (__X - 1);
-}
-
-/// Clears the least significant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsr_u64(unsigned long long __X);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BLSR instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer containing the operand to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    source operand.
-/// \see __blsr_u64
-#define _blsr_u64 __blsr_u64
-
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) */
-
-#endif /* __BMIINTRIN_H */
diff --git a/third_party/intel/clang/cetintrin.h b/third_party/intel/clang/cetintrin.h
deleted file mode 100644
index a68df5b1d..000000000
--- a/third_party/intel/clang/cetintrin.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*===---- cetintrin.h - CET intrinsic --------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __CETINTRIN_H
-#define __CETINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("shstk")))
-
-static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
-  __builtin_ia32_incsspd((unsigned int)__a);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
-  __builtin_ia32_incsspq(__a);
-}
-#endif /* __x86_64__ */
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
-  __builtin_ia32_incsspq(__a);
-}
-#else /* __x86_64__ */
-static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
-  __builtin_ia32_incsspd(__a);
-}
-#endif /* __x86_64__ */
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
-  return __builtin_ia32_rdsspd(__a);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wuninitialized"
-  unsigned int t;
-  return __builtin_ia32_rdsspd(t);
-#pragma clang diagnostic pop
-}
-
-#ifdef __x86_64__
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
-  return __builtin_ia32_rdsspq(__a);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wuninitialized"
-  unsigned long long t;
-  return __builtin_ia32_rdsspq(t);
-#pragma clang diagnostic pop
-}
-#endif /* __x86_64__ */
-
-#ifdef __x86_64__
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
-  return __builtin_ia32_rdsspq(0);
-}
-#else /* __x86_64__ */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
-  return __builtin_ia32_rdsspd(0);
-}
-#endif /* __x86_64__ */
-
-static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp(void) {
-  __builtin_ia32_saveprevssp();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
-  __builtin_ia32_rstorssp(__p);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
-  __builtin_ia32_wrssd(__a, __p);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
-  __builtin_ia32_wrssq(__a, __p);
-}
-#endif /* __x86_64__ */
-
-static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
-  __builtin_ia32_wrussd(__a, __p);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
-  __builtin_ia32_wrussq(__a, __p);
-}
-#endif /* __x86_64__ */
-
-static __inline__ void __DEFAULT_FN_ATTRS _setssbsy(void) {
-  __builtin_ia32_setssbsy();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
-  __builtin_ia32_clrssbsy(__p);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __CETINTRIN_H */
diff --git a/third_party/intel/clang/cldemoteintrin.h b/third_party/intel/clang/cldemoteintrin.h
deleted file mode 100644
index cfb951c1b..000000000
--- a/third_party/intel/clang/cldemoteintrin.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __CLDEMOTEINTRIN_H
-#define __CLDEMOTEINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("cldemote")))
-
-/// Hint to hardware that the cache line that contains \p __P should be demoted
-/// from the cache closest to the processor core to a level more distant from
-/// the processor core.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
-static __inline__ void __DEFAULT_FN_ATTRS
-_cldemote(const void * __P) {
-  __builtin_ia32_cldemote(__P);
-}
-
-#define _mm_cldemote(p) _cldemote(p)
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/clflushoptintrin.h b/third_party/intel/clang/clflushoptintrin.h
deleted file mode 100644
index ae0a0244c..000000000
--- a/third_party/intel/clang/clflushoptintrin.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __CLFLUSHOPTINTRIN_H
-#define __CLFLUSHOPTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clflushopt")))
-
-/// Invalidates all levels of the cache hierarchy and flushes modified data to
-///    memory for the cache line specified by the address \a __m.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c CLFLUSHOPT instruction.
-///
-/// \param __m
-///    An address within the cache line to flush and invalidate.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clflushopt(void const * __m) {
-  __builtin_ia32_clflushopt(__m);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/clwbintrin.h b/third_party/intel/clang/clwbintrin.h
deleted file mode 100644
index 3360d203f..000000000
--- a/third_party/intel/clang/clwbintrin.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __CLWBINTRIN_H
-#define __CLWBINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clwb")))
-
-/// Writes back to memory the cache line (if modified) that contains the
-/// linear address specified in \a __p from any level of the cache hierarchy in
-/// the cache coherence domain
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> CLWB </c> instruction.
-///
-/// \param __p
-///    A pointer to the memory location used to identify the cache line to be
-///    written back.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clwb(void const *__p) {
-  __builtin_ia32_clwb(__p);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/clzerointrin.h b/third_party/intel/clang/clzerointrin.h
deleted file mode 100644
index acccfe94f..000000000
--- a/third_party/intel/clang/clzerointrin.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*===----------------------- clzerointrin.h - CLZERO ----------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __X86INTRIN_H
-#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __CLZEROINTRIN_H
-#define __CLZEROINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("clzero")))
-
-/// Zeroes out the cache line for the address \a __line. This uses a
-///    non-temporal store. Calling \c _mm_sfence() afterward might be needed
-///    to enforce ordering.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c CLZERO instruction.
-///
-/// \param __line
-///    An address within the cache line to zero out.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clzero (void * __line)
-{
-  __builtin_ia32_clzero ((void *)__line);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __CLZEROINTRIN_H */
diff --git a/third_party/intel/clang/cmpccxaddintrin.h b/third_party/intel/clang/cmpccxaddintrin.h
deleted file mode 100644
index 695749899..000000000
--- a/third_party/intel/clang/cmpccxaddintrin.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __X86GPRINTRIN_H
-#error                                                                         \
-    "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
-#endif // __X86GPRINTRIN_H
-
-#ifndef __CMPCCXADDINTRIN_H
-#define __CMPCCXADDINTRIN_H
-#ifdef __x86_64__
-
-typedef enum {
-  _CMPCCX_O,   /* Overflow.  */
-  _CMPCCX_NO,  /* No overflow.  */
-  _CMPCCX_B,   /* Below.  */
-  _CMPCCX_NB,  /* Not below.  */
-  _CMPCCX_Z,   /* Zero.  */
-  _CMPCCX_NZ,  /* Not zero.  */
-  _CMPCCX_BE,  /* Below or equal.  */
-  _CMPCCX_NBE, /* Neither below nor equal.  */
-  _CMPCCX_S,   /* Sign.  */
-  _CMPCCX_NS,  /* No sign.  */
-  _CMPCCX_P,   /* Parity.  */
-  _CMPCCX_NP,  /* No parity.  */
-  _CMPCCX_L,   /* Less.  */
-  _CMPCCX_NL,  /* Not less.  */
-  _CMPCCX_LE,  /* Less or equal.  */
-  _CMPCCX_NLE, /* Neither less nor equal.  */
-} _CMPCCX_ENUM;
-
-/// Compares the value from the memory __A with the value of __B. If the
-/// specified condition __D is met, then add the third operand __C to the
-/// __A and write it into __A, else the value of __A is unchanged. The return
-/// value is the original value of __A.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c CMPCCXADD instructions.
-///
-/// \param __A
-///    __A pointer specifying the memory address.
-///
-/// \param __B
-///   A integer operand.
-///
-/// \param __C
-///   A integer operand.
-///
-/// \param __D
-///   The specified condition.
-///
-/// \returns a integer which is the original value of first operand.
-
-#define _cmpccxadd_epi32(__A, __B, __C, __D)                                   \
-  ((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C),     \
-                                    (int)(__D))))
-
-#define _cmpccxadd_epi64(__A, __B, __C, __D)                                   \
-  ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B),     \
-                                          (long long)(__C), (int)(__D))))
-
-#endif // __x86_64__
-#endif // __CMPCCXADDINTRIN_H
diff --git a/third_party/intel/clang/crc32intrin.h b/third_party/intel/clang/crc32intrin.h
deleted file mode 100644
index a0bd99d1b..000000000
--- a/third_party/intel/clang/crc32intrin.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __CRC32INTRIN_H
-#define __CRC32INTRIN_H
-
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
-
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-///    unsigned char operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a  __D.
-/// \param __D
-///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_crc32_u8(unsigned int __C, unsigned char __D)
-{
-  return __builtin_ia32_crc32qi(__C, __D);
-}
-
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-///    unsigned short operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a __D.
-/// \param __D
-///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_crc32_u16(unsigned int __C, unsigned short __D)
-{
-  return __builtin_ia32_crc32hi(__C, __D);
-}
-
-/// Adds the first unsigned integer operand to the CRC-32C checksum of
-///    the second unsigned integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a __D.
-/// \param __D
-///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_crc32_u32(unsigned int __C, unsigned int __D)
-{
-  return __builtin_ia32_crc32si(__C, __D);
-}
-
-#ifdef __x86_64__
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-///    unsigned 64-bit integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a __D.
-/// \param __D
-///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
-{
-  return __builtin_ia32_crc32di(__C, __D);
-}
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __CRC32INTRIN_H */
diff --git a/third_party/intel/clang/emmintrin.h b/third_party/intel/clang/emmintrin.h
deleted file mode 100644
index 16ac07eaa..000000000
--- a/third_party/intel/clang/emmintrin.h
+++ /dev/null
@@ -1,4906 +0,0 @@
-/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __EMMINTRIN_H
-#define __EMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "xmmintrin.h"
-
-typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
-typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
-
-typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef long long __m128i_u
-    __attribute__((__vector_size__(16), __aligned__(1)));
-
-/* Type defines.  */
-typedef double __v2df __attribute__((__vector_size__(16)));
-typedef long long __v2di __attribute__((__vector_size__(16)));
-typedef short __v8hi __attribute__((__vector_size__(16)));
-typedef char __v16qi __attribute__((__vector_size__(16)));
-
-/* Unsigned types */
-typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
-typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
-typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
-
-/* We need an explicitly signed variant for char. Note that this shouldn't
- * appear in the interface though. */
-typedef signed char __v16qs __attribute__((__vector_size__(16)));
-
-#ifdef __SSE2__
-/* Both _Float16 and __bf16 require SSE2 being enabled. */
-typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
-
-typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
-
-/// Adds lower double-precision values in both operands and returns the
-///    sum in the lower 64 bits of the result. The upper 64 bits of the result
-///    are copied from the upper double-precision value of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
-///    from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
-                                                        __m128d __b) {
-  __a[0] += __b[0];
-  return __a;
-}
-
-/// Adds two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the sums of both
-///    operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
-                                                        __m128d __b) {
-  return (__m128d)((__v2df)__a + (__v2df)__b);
-}
-
-/// Subtracts the lower double-precision value of the second operand
-///    from the lower double-precision value of the first operand and returns
-///    the difference in the lower 64 bits of the result. The upper 64 bits of
-///    the result are copied from the upper double-precision value of the first
-///    operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the minuend.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing the subtrahend.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    difference of the lower 64 bits of both operands. The upper 64 bits are
-///    copied from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
-                                                        __m128d __b) {
-  __a[0] -= __b[0];
-  return __a;
-}
-
-/// Subtracts two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the minuend.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing the subtrahend.
-/// \returns A 128-bit vector of [2 x double] containing the differences between
-///    both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
-                                                        __m128d __b) {
-  return (__m128d)((__v2df)__a - (__v2df)__b);
-}
-
-/// Multiplies lower double-precision values in both operands and returns
-///    the product in the lower 64 bits of the result. The upper 64 bits of the
-///    result are copied from the upper double-precision value of the first
-///    operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    product of the lower 64 bits of both operands. The upper 64 bits are
-///    copied from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
-                                                        __m128d __b) {
-  __a[0] *= __b[0];
-  return __a;
-}
-
-/// Multiplies two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the operands.
-/// \returns A 128-bit vector of [2 x double] containing the products of both
-///    operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
-                                                        __m128d __b) {
-  return (__m128d)((__v2df)__a * (__v2df)__b);
-}
-
-/// Divides the lower double-precision value of the first operand by the
-///    lower double-precision value of the second operand and returns the
-///    quotient in the lower 64 bits of the result. The upper 64 bits of the
-///    result are copied from the upper double-precision value of the first
-///    operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the dividend.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing divisor.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    quotient of the lower 64 bits of both operands. The upper 64 bits are
-///    copied from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
-                                                        __m128d __b) {
-  __a[0] /= __b[0];
-  return __a;
-}
-
-/// Performs an element-by-element division of two 128-bit vectors of
-///    [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the dividend.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing the divisor.
-/// \returns A 128-bit vector of [2 x double] containing the quotients of both
-///    operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
-                                                        __m128d __b) {
-  return (__m128d)((__v2df)__a / (__v2df)__b);
-}
-
-/// Calculates the square root of the lower double-precision value of
-///    the second operand and returns it in the lower 64 bits of the result.
-///    The upper 64 bits of the result are copied from the upper
-///    double-precision value of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the operands. The
-///    upper 64 bits of this operand are copied to the upper 64 bits of the
-///    result.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the operands. The
-///    square root is calculated using the lower 64 bits of this operand.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    square root of the lower 64 bits of operand \a __b, and whose upper 64
-///    bits are copied from the upper 64 bits of operand \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
-                                                         __m128d __b) {
-  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
-  return __extension__(__m128d){__c[0], __a[1]};
-}
-
-/// Calculates the square root of the each of two values stored in a
-///    128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [2 x double] containing the square roots of the
-///    values in the operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
-  return __builtin_ia32_sqrtpd((__v2df)__a);
-}
-
-/// Compares lower 64-bit double-precision values of both operands, and
-///    returns the lesser of the pair of values in the lower 64-bits of the
-///    result. The upper 64 bits of the result are copied from the upper
-///    double-precision value of the first operand.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the operands. The
-///    lower 64 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the operands. The
-///    lower 64 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    minimum value between both operands. The upper 64 bits are copied from
-///    the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Performs element-by-element comparison of the two 128-bit vectors of
-///    [2 x double] and returns a vector containing the lesser of each pair of
-///    values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the operands.
-/// \returns A 128-bit vector of [2 x double] containing the minimum values
-///    between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares lower 64-bit double-precision values of both operands, and
-///    returns the greater of the pair of values in the lower 64-bits of the
-///    result. The upper 64 bits of the result are copied from the upper
-///    double-precision value of the first operand.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the operands. The
-///    lower 64 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the operands. The
-///    lower 64 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    maximum value between both operands. The upper 64 bits are copied from
-///    the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Performs element-by-element comparison of the two 128-bit vectors of
-///    [2 x double] and returns a vector containing the greater of each pair
-///    of values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the operands.
-/// \returns A 128-bit vector of [2 x double] containing the maximum values
-///    between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
-///    values between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
-                                                        __m128d __b) {
-  return (__m128d)((__v2du)__a & (__v2du)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
-///    the one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the left source operand. The
-///    one's complement of this value is used in the bitwise AND.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing the right source operand.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
-///    values in the second operand and the one's complement of the first
-///    operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)(~(__v2du)__a & (__v2du)__b);
-}
-
-/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
-///    values between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
-                                                       __m128d __b) {
-  return (__m128d)((__v2du)__a | (__v2du)__b);
-}
-
-/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
-///    values between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
-                                                        __m128d __b) {
-  return (__m128d)((__v2du)__a ^ (__v2du)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] for equality.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are less than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are less than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are greater than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are ordered with respect to those in the second operand.
-///
-///    A pair of double-precision values are ordered with respect to each
-///    other if neither value is a NaN. Each comparison returns 0x0 for false,
-///    0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are unordered with respect to those in the second operand.
-///
-///    A pair of double-precision values are unordered with respect to each
-///    other if one or both values are NaN. Each comparison returns 0x0 for
-///    false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
-                                                             __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are unequal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are not less than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are not less than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are not greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are not greater than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \param __b
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] for equality.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is less than the corresponding value in
-///    the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is less than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
-                                                          __m128d __b) {
-  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is greater than the corresponding value
-///    in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
-///
-/// \param __a
-///     A 128-bit vector of [2 x double]. The lower double-precision value is
-///     compared to the lower double-precision value of \a __b.
-/// \param __b
-///     A 128-bit vector of [2 x double]. The lower double-precision value is
-///     compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
-                                                          __m128d __b) {
-  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
-  return __extension__(__m128d){__c[0], __a[1]};
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is greater than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
-                                                          __m128d __b) {
-  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
-  return __extension__(__m128d){__c[0], __a[1]};
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is ordered with respect to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
-///    of double-precision values are ordered with respect to each other if
-///    neither value is a NaN.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is unordered with respect to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
-///    of double-precision values are unordered with respect to each other if
-///    one or both values are NaN.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
-                                                             __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is unequal to the corresponding value in
-///    the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is not less than the corresponding
-///    value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is not less than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
-                                                           __m128d __b) {
-  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is not greater than the corresponding
-///    value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
-                                                           __m128d __b) {
-  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
-  return __extension__(__m128d){__c[0], __a[1]};
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is not greater than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contains the comparison
-///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
-                                                           __m128d __b) {
-  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
-  return __extension__(__m128d){__c[0], __a[1]};
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] for equality.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
-                                                       __m128d __b) {
-  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is less than the corresponding value in
-///    the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
-                                                       __m128d __b) {
-  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is less than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///     A 128-bit vector of [2 x double]. The lower double-precision value is
-///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
-                                                       __m128d __b) {
-  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is greater than the corresponding value
-///    in the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
-                                                       __m128d __b) {
-  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is greater than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
-                                                       __m128d __b) {
-  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is unequal to the corresponding value in
-///    the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] for equality.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is less than the corresponding value in
-///    the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is less than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///     A 128-bit vector of [2 x double]. The lower double-precision value is
-///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is greater than the corresponding value
-///    in the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///     A 128-bit vector of [2 x double]. The lower double-precision value is
-///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is greater than or equal to the
-///    corresponding value in the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
-                                                        __m128d __b) {
-  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] to determine if
-///    the value in the first parameter is unequal to the corresponding value in
-///    the second parameter.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __b.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision value is
-///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison result.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
-                                                         __m128d __b) {
-  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
-}
-
-/// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two single-precision floating-point
-///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
-///    The upper 64 bits of the result vector are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-///    converted values. The upper 64 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
-  return __builtin_ia32_cvtpd2ps((__v2df)__a);
-}
-
-/// Converts the lower two single-precision floating-point elements of a
-///    128-bit vector of [4 x float] into two double-precision floating-point
-///    values, returned in a 128-bit vector of [2 x double]. The upper two
-///    elements of the input vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower two single-precision
-///    floating-point elements are converted to double-precision values. The
-///    upper two elements are unused.
-/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
-  return (__m128d) __builtin_convertvector(
-      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
-}
-
-/// Converts the lower two integer elements of a 128-bit vector of
-///    [4 x i32] into two double-precision floating-point values, returned in a
-///    128-bit vector of [2 x double].
-///
-///    The upper two elements of the input vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
-///    converted to double-precision values.
-///
-///    The upper two elements are unused.
-/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
-  return (__m128d) __builtin_convertvector(
-      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
-}
-
-/// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
-///    64 bits of the result vector are set to zero.
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
-///    converted values. The upper 64 bits are set to zero.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
-  return __builtin_ia32_cvtpd2dq((__v2df)__a);
-}
-
-/// Converts the low-order element of a 128-bit vector of [2 x double]
-///    into a 32-bit signed integer value.
-///
-///    If the converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-///    conversion.
-/// \returns A 32-bit signed integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
-  return __builtin_ia32_cvtsd2si((__v2df)__a);
-}
-
-/// Converts the lower double-precision floating-point element of a
-///    128-bit vector of [2 x double], in the second parameter, into a
-///    single-precision floating-point value, returned in the lower 32 bits of a
-///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
-///    copied from the upper 96 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
-///    copied to the upper 96 bits of the result.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower double-precision
-///    floating-point element is used in the conversion.
-/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
-///    converted value from the second parameter. The upper 96 bits are copied
-///    from the upper 96 bits of the first parameter.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
-                                                         __m128d __b) {
-  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
-}
-
-/// Converts a 32-bit signed integer value, in the second parameter, into
-///    a double-precision floating-point value, returned in the lower 64 bits of
-///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
-///    are copied from the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
-///    copied to the upper 64 bits of the result.
-/// \param __b
-///    A 32-bit signed integer containing the value to be converted.
-/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
-///    converted value from the second parameter. The upper 64 bits are copied
-///    from the upper 64 bits of the first parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
-                                                            int __b) {
-  __a[0] = __b;
-  return __a;
-}
-
-/// Converts the lower single-precision floating-point element of a
-///    128-bit vector of [4 x float], in the second parameter, into a
-///    double-precision floating-point value, returned in the lower 64 bits of
-///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
-///    are copied from the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
-///    copied to the upper 64 bits of the result.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower single-precision
-///    floating-point element is used in the conversion.
-/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
-///    converted value from the second parameter. The upper 64 bits are copied
-///    from the upper 64 bits of the first parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
-                                                          __m128 __b) {
-  __a[0] = __b[0];
-  return __a;
-}
-
-/// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed truncated (rounded
-///    toward zero) 32-bit integer values, returned in the lower 64 bits
-///    of a 128-bit vector of [4 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
-///    converted values. The upper 64 bits are set to zero.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
-  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
-}
-
-/// Converts the low-order element of a [2 x double] vector into a 32-bit
-///    signed truncated (rounded toward zero) integer value.
-///
-///    If the converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-///    conversion.
-/// \returns A 32-bit signed integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
-  return __builtin_ia32_cvttsd2si((__v2df)__a);
-}
-
-/// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in a 64-bit vector of [2 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
-}
-
-/// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed truncated (rounded toward
-///    zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double].
-/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
-}
-
-/// Converts the two signed 32-bit integer elements of a 64-bit vector of
-///    [2 x i32] into two double-precision floating-point values, returned in a
-///    128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
-///
-/// \param __a
-///    A 64-bit vector of [2 x i32].
-/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
-  return __builtin_ia32_cvtpi2pd((__v2si)__a);
-}
-
-/// Returns the low-order element of a 128-bit vector of [2 x double] as
-///    a double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
-/// \returns A double-precision floating-point value copied from the lower 64
-///    bits of \a __a.
-static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
-  return __a[0];
-}
-
-/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
-///    memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location has to be 16-byte aligned.
-/// \returns A 128-bit vector of [2 x double] containing the loaded values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
-  return *(const __m128d *)__dp;
-}
-
-/// Loads a double-precision floating-point value from a specified memory
-///    location and duplicates it to both vector elements of a 128-bit vector of
-///    [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
-///
-/// \param __dp
-///    A pointer to a memory location containing a double-precision value.
-/// \returns A 128-bit vector of [2 x double] containing the loaded and
-///    duplicated values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
-  struct __mm_load1_pd_struct {
-    double __u;
-  } __attribute__((__packed__, __may_alias__));
-  double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
-  return __extension__(__m128d){__u, __u};
-}
-
-#define _mm_load_pd1(dp) _mm_load1_pd(dp)
-
-/// Loads two double-precision values, in reverse order, from an aligned
-///    memory location into a 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
-/// needed shuffling instructions. In AVX mode, the shuffling may be combined
-/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
-///
-/// \param __dp
-///    A 16-byte aligned pointer to an array of double-precision values to be
-///    loaded in reverse order.
-/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
-///    values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
-  __m128d __u = *(const __m128d *)__dp;
-  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
-}
-
-/// Loads a 128-bit floating-point vector of [2 x double] from an
-///    unaligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the loaded values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
-  struct __loadu_pd {
-    __m128d_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_pd *)__dp)->__v;
-}
-
-/// Loads a 64-bit integer value to the low element of a 128-bit integer
-///    vector and clears the upper element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __a
-///    A pointer to a 64-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
-  struct __loadu_si64 {
-    long long __v;
-  } __attribute__((__packed__, __may_alias__));
-  long long __u = ((const struct __loadu_si64 *)__a)->__v;
-  return __extension__(__m128i)(__v2di){__u, 0LL};
-}
-
-/// Loads a 32-bit integer value to the low element of a 128-bit integer
-///    vector and clears the upper element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __a
-///    A pointer to a 32-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
-  struct __loadu_si32 {
-    int __v;
-  } __attribute__((__packed__, __may_alias__));
-  int __u = ((const struct __loadu_si32 *)__a)->__v;
-  return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
-}
-
-/// Loads a 16-bit integer value to the low element of a 128-bit integer
-///    vector and clears the upper element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __a
-///    A pointer to a 16-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
-  struct __loadu_si16 {
-    short __v;
-  } __attribute__((__packed__, __may_alias__));
-  short __u = ((const struct __loadu_si16 *)__a)->__v;
-  return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
-}
-
-/// Loads a 64-bit double-precision value to the low element of a
-///    128-bit integer vector and clears the upper element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a memory location containing a double-precision value.
-///    The address of the memory location does not have to be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the loaded value.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
-  struct __mm_load_sd_struct {
-    double __u;
-  } __attribute__((__packed__, __may_alias__));
-  double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
-  return __extension__(__m128d){__u, 0};
-}
-
-/// Loads a double-precision value into the high-order bits of a 128-bit
-///    vector of [2 x double]. The low-order bits are copied from the low-order
-///    bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. \n
-///    Bits [63:0] are written to bits [63:0] of the result.
-/// \param __dp
-///    A pointer to a 64-bit memory location containing a double-precision
-///    floating-point value that is loaded. The loaded value is written to bits
-///    [127:64] of the result. The address of the memory location does not have
-///    to be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the moved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
-                                                          double const *__dp) {
-  struct __mm_loadh_pd_struct {
-    double __u;
-  } __attribute__((__packed__, __may_alias__));
-  double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
-  return __extension__(__m128d){__a[0], __u};
-}
-
-/// Loads a double-precision value into the low-order bits of a 128-bit
-///    vector of [2 x double]. The high-order bits are copied from the
-///    high-order bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. \n
-///    Bits [127:64] are written to bits [127:64] of the result.
-/// \param __dp
-///    A pointer to a 64-bit memory location containing a double-precision
-///    floating-point value that is loaded. The loaded value is written to bits
-///    [63:0] of the result. The address of the memory location does not have to
-///    be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the moved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
-                                                          double const *__dp) {
-  struct __mm_loadl_pd_struct {
-    double __u;
-  } __attribute__((__packed__, __may_alias__));
-  double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
-  return __extension__(__m128d){__u, __a[1]};
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double] with
-///    unspecified content. This could be used as an argument to another
-///    intrinsic function where the argument is required but the value is not
-///    actually used.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
-///    content.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
-  return (__m128d)__builtin_ia32_undef128();
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
-///    64 bits of the vector are initialized with the specified double-precision
-///    floating-point value. The upper 64 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __w
-///    A double-precision floating-point value used to initialize the lower 64
-///    bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
-///    lower 64 bits contain the value of the parameter. The upper 64 bits are
-///    set to zero.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
-  return __extension__(__m128d){__w, 0};
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double], with each
-///    of the two double-precision floating-point vector elements set to the
-///    specified double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
-///
-/// \param __w
-///    A double-precision floating-point value used to initialize each vector
-///    element of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
-  return __extension__(__m128d){__w, __w};
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double], with each
-///    of the two double-precision floating-point vector elements set to the
-///    specified double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
-///
-/// \param __w
-///    A double-precision floating-point value used to initialize each vector
-///    element of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
-  return _mm_set1_pd(__w);
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double]
-///    initialized with the specified double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __w
-///    A double-precision floating-point value used to initialize the upper 64
-///    bits of the result.
-/// \param __x
-///    A double-precision floating-point value used to initialize the lower 64
-///    bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
-                                                        double __x) {
-  return __extension__(__m128d){__x, __w};
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double],
-///    initialized in reverse order with the specified double-precision
-///    floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __w
-///    A double-precision floating-point value used to initialize the lower 64
-///    bits of the result.
-/// \param __x
-///    A double-precision floating-point value used to initialize the upper 64
-///    bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
-                                                         double __x) {
-  return __extension__(__m128d){__w, __x};
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double]
-///    initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
-///
-/// \returns An initialized 128-bit floating-point vector of [2 x double] with
-///    all elements set to zero.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
-  return __extension__(__m128d){0.0, 0.0};
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
-///    64 bits are set to the lower 64 bits of the second parameter. The upper
-///    64 bits are set to the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
-///    upper 64 bits of the result.
-/// \param __b
-///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
-///    lower 64 bits of the result.
-/// \returns A 128-bit vector of [2 x double] containing the moved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
-                                                         __m128d __b) {
-  __a[0] = __b[0];
-  return __a;
-}
-
-/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
-///    memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a 64-bit memory location.
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
-                                                       __m128d __a) {
-  struct __mm_store_sd_struct {
-    double __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
-}
-
-/// Moves packed double-precision values from a 128-bit vector of
-///    [2 x double] to a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
-///
-/// \param __dp
-///    A pointer to an aligned memory location that can store two
-///    double-precision values.
-/// \param __a
-///    A packed 128-bit vector of [2 x double] containing the values to be
-///    moved.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
-                                                       __m128d __a) {
-  *(__m128d *)__dp = __a;
-}
-
-/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
-///    the upper and lower 64 bits of a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the
-///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
-///
-/// \param __dp
-///    A pointer to a memory location that can store two double-precision
-///    values.
-/// \param __a
-///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
-///    of the values in \a __dp.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
-                                                        __m128d __a) {
-  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
-  _mm_store_pd(__dp, __a);
-}
-
-/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
-///    the upper and lower 64 bits of a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the
-///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
-///
-/// \param __dp
-///    A pointer to a memory location that can store two double-precision
-///    values.
-/// \param __a
-///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
-///    of the values in \a __dp.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
-                                                        __m128d __a) {
-  _mm_store1_pd(__dp, __a);
-}
-
-/// Stores a 128-bit vector of [2 x double] into an unaligned memory
-///    location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
-                                                        __m128d __a) {
-  struct __storeu_pd {
-    __m128d_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_pd *)__dp)->__v = __a;
-}
-
-/// Stores two double-precision values, in reverse order, from a 128-bit
-///    vector of [2 x double] to a 16-byte aligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to a shuffling instruction followed by a
-/// <c> VMOVAPD / MOVAPD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a 16-byte aligned memory location that can store two
-///    double-precision values.
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the values to be reversed and
-///    stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
-                                                        __m128d __a) {
-  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
-  *(__m128d *)__dp = __a;
-}
-
-/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
-///    memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a 64-bit memory location.
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
-                                                        __m128d __a) {
-  struct __mm_storeh_pd_struct {
-    double __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
-}
-
-/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
-///    memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
-///
-/// \param __dp
-///    A pointer to a 64-bit memory location.
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
-                                                        __m128d __a) {
-  struct __mm_storeh_pd_struct {
-    double __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
-}
-
-/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
-///    saving the lower 8 bits of each sum in the corresponding element of a
-///    128-bit result vector of [16 x i8].
-///
-///    The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [16 x i8].
-/// \param __b
-///    A 128-bit vector of [16 x i8].
-/// \returns A 128-bit vector of [16 x i8] containing the sums of both
-///    parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
-                                                          __m128i __b) {
-  return (__m128i)((__v16qu)__a + (__v16qu)__b);
-}
-
-/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
-///    saving the lower 16 bits of each sum in the corresponding element of a
-///    128-bit result vector of [8 x i16].
-///
-///    The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16].
-/// \param __b
-///    A 128-bit vector of [8 x i16].
-/// \returns A 128-bit vector of [8 x i16] containing the sums of both
-///    parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v8hu)__a + (__v8hu)__b);
-}
-
-/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
-///    saving the lower 32 bits of each sum in the corresponding element of a
-///    128-bit result vector of [4 x i32].
-///
-///    The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x i32].
-/// \param __b
-///    A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [4 x i32] containing the sums of both
-///    parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v4su)__a + (__v4su)__b);
-}
-
-/// Adds two signed or unsigned 64-bit integer values, returning the
-///    lower 64 bits of the sum.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer.
-/// \param __b
-///    A 64-bit integer.
-/// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
-}
-
-/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
-///    saving the lower 64 bits of each sum in the corresponding element of a
-///    128-bit result vector of [2 x i64].
-///
-///    The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x i64].
-/// \param __b
-///    A 128-bit vector of [2 x i64].
-/// \returns A 128-bit vector of [2 x i64] containing the sums of both
-///    parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v2du)__a + (__v2du)__b);
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-///    signed [16 x i8] vectors, saving each sum in the corresponding element
-///    of a 128-bit result vector of [16 x i8].
-///
-///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
-///    less than 0x80 are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
-///
-/// \param __a
-///    A 128-bit signed [16 x i8] vector.
-/// \param __b
-///    A 128-bit signed [16 x i8] vector.
-/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
-///    both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-///    signed [8 x i16] vectors, saving each sum in the corresponding element
-///    of a 128-bit result vector of [8 x i16].
-///
-///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
-///    less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
-///
-/// \param __a
-///    A 128-bit signed [8 x i16] vector.
-/// \param __b
-///    A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
-///    both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
-                                                            __m128i __b) {
-  return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
-///    of a 128-bit result vector of [16 x i8].
-///
-///    Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
-///    saturated to 0x00.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
-///
-/// \param __a
-///    A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-///    A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
-///    of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
-///    of a 128-bit result vector of [8 x i16].
-///
-///    Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
-///    are saturated to 0x0000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
-///
-/// \param __a
-///    A 128-bit unsigned [8 x i16] vector.
-/// \param __b
-///    A 128-bit unsigned [8 x i16] vector.
-/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
-///    of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
-                                                            __m128i __b) {
-  return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
-}
-
-/// Computes the rounded averages of corresponding elements of two
-///    128-bit unsigned [16 x i8] vectors, saving each result in the
-///    corresponding element of a 128-bit result vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
-///
-/// \param __a
-///    A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-///    A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
-///    averages of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
-                                                          __m128i __b) {
-  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// Computes the rounded averages of corresponding elements of two
-///    128-bit unsigned [8 x i16] vectors, saving each result in the
-///    corresponding element of a 128-bit result vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
-///
-/// \param __a
-///    A 128-bit unsigned [8 x i16] vector.
-/// \param __b
-///    A 128-bit unsigned [8 x i16] vector.
-/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
-///    averages of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
-///    vectors, producing eight intermediate 32-bit signed integer products, and
-///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
-///    [4 x i32] vector.
-///
-///    For example, bits [15:0] of both parameters are multiplied producing a
-///    32-bit product, bits [31:16] of both parameters are multiplied producing
-///    a 32-bit product, and the sum of those two products becomes bits [31:0]
-///    of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
-///
-/// \param __a
-///    A 128-bit signed [8 x i16] vector.
-/// \param __b
-///    A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
-///    of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
-                                                            __m128i __b) {
-  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Compares corresponding elements of two 128-bit signed [8 x i16]
-///    vectors, saving the greater value from each comparison in the
-///    corresponding element of a 128-bit result vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
-///
-/// \param __a
-///    A 128-bit signed [8 x i16] vector.
-/// \param __b
-///    A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
-///    each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
-///    vectors, saving the greater value from each comparison in the
-///    corresponding element of a 128-bit result vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
-///
-/// \param __a
-///    A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-///    A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
-///    each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
-                                                          __m128i __b) {
-  return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
-}
-
-/// Compares corresponding elements of two 128-bit signed [8 x i16]
-///    vectors, saving the smaller value from each comparison in the
-///    corresponding element of a 128-bit result vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
-///
-/// \param __a
-///    A 128-bit signed [8 x i16] vector.
-/// \param __b
-///    A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
-///    each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
-///    vectors, saving the smaller value from each comparison in the
-///    corresponding element of a 128-bit result vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
-///
-/// \param __a
-///    A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-///    A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
-///    each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
-                                                          __m128i __b) {
-  return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
-}
-
-/// Multiplies the corresponding elements of two signed [8 x i16]
-///    vectors, saving the upper 16 bits of each 32-bit product in the
-///    corresponding element of a 128-bit signed [8 x i16] result vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
-///
-/// \param __a
-///    A 128-bit signed [8 x i16] vector.
-/// \param __b
-///    A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
-///    each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies the corresponding elements of two unsigned [8 x i16]
-///    vectors, saving the upper 16 bits of each 32-bit product in the
-///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
-///
-/// \param __a
-///    A 128-bit unsigned [8 x i16] vector.
-/// \param __b
-///    A 128-bit unsigned [8 x i16] vector.
-/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
-///    of each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies the corresponding elements of two signed [8 x i16]
-///    vectors, saving the lower 16 bits of each 32-bit product in the
-///    corresponding element of a 128-bit signed [8 x i16] result vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
-///
-/// \param __a
-///    A 128-bit signed [8 x i16] vector.
-/// \param __b
-///    A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
-///    each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)((__v8hu)__a * (__v8hu)__b);
-}
-
-/// Multiplies 32-bit unsigned integer values contained in the lower bits
-///    of the two 64-bit integer vectors and returns the 64-bit unsigned
-///    product.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer containing one of the source operands.
-/// \param __b
-///    A 64-bit integer containing one of the source operands.
-/// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
-                                                            __m64 __b) {
-  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
-}
-
-/// Multiplies 32-bit unsigned integer values contained in the lower
-///    bits of the corresponding elements of two [2 x i64] vectors, and returns
-///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
-///
-/// \param __a
-///    A [2 x i64] vector containing one of the source operands.
-/// \param __b
-///    A [2 x i64] vector containing one of the source operands.
-/// \returns A [2 x i64] vector containing the product of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
-                                                           __m128i __b) {
-  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
-}
-
-/// Computes the absolute differences of corresponding 8-bit integer
-///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
-///    separately sums the second 8 absolute differences. Packs these two
-///    unsigned 16-bit integer sums into the upper and lower elements of a
-///    [2 x i64] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 128-bit integer vector containing one of the source operands.
-/// \returns A [2 x i64] vector containing the sums of the sets of absolute
-///    differences between both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
-                                                          __m128i __b) {
-  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// Subtracts the corresponding 8-bit integer values in the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-///    in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
-                                                          __m128i __b) {
-  return (__m128i)((__v16qu)__a - (__v16qu)__b);
-}
-
-/// Subtracts the corresponding 16-bit integer values in the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-///    in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v8hu)__a - (__v8hu)__b);
-}
-
-/// Subtracts the corresponding 32-bit integer values in the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-///    in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v4su)__a - (__v4su)__b);
-}
-
-/// Subtracts signed or unsigned 64-bit integer values and writes the
-///    difference to the corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing the minuend.
-/// \param __b
-///    A 64-bit integer vector containing the subtrahend.
-/// \returns A 64-bit integer vector containing the difference of the values in
-///    the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
-}
-
-/// Subtracts the corresponding elements of two [2 x i64] vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-///    in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v2du)__a - (__v2du)__b);
-}
-
-/// Subtracts, with saturation, corresponding 8-bit signed integer values in
-///    the input and returns the differences in the corresponding bytes in the
-///    destination.
-///
-///    Differences greater than 0x7F are saturated to 0x7F, and differences
-///    less than 0x80 are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-///    in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
-}
-
-/// Subtracts, with saturation, corresponding 16-bit signed integer values in
-///    the input and returns the differences in the corresponding bytes in the
-///    destination.
-///
-///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
-///    than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-///    in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
-                                                            __m128i __b) {
-  return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
-///    the input and returns the differences in the corresponding bytes in the
-///    destination.
-///
-///    Differences less than 0x00 are saturated to 0x00.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the unsigned integer
-///    differences of the values in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
-}
-
-/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
-///    the input and returns the differences in the corresponding bytes in the
-///    destination.
-///
-///    Differences less than 0x0000 are saturated to 0x0000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the minuends.
-/// \param __b
-///    A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the unsigned integer
-///    differences of the values in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
-                                                            __m128i __b) {
-  return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the bitwise AND of the values
-///    in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v2du)__a & (__v2du)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit integer vectors, using the
-///    one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector containing the left source operand. The one's complement
-///    of this value is used in the bitwise AND.
-/// \param __b
-///    A 128-bit vector containing the right source operand.
-/// \returns A 128-bit integer vector containing the bitwise AND of the one's
-///    complement of the first operand and the values in the second operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
-                                                              __m128i __b) {
-  return (__m128i)(~(__v2du)__a & (__v2du)__b);
-}
-/// Performs a bitwise OR of two 128-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the bitwise OR of the values
-///    in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
-                                                          __m128i __b) {
-  return (__m128i)((__v2du)__a | (__v2du)__b);
-}
-
-/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
-///    values in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
-                                                           __m128i __b) {
-  return (__m128i)((__v2du)__a ^ (__v2du)__b);
-}
-
-/// Left-shifts the 128-bit integer vector operand by the specified
-///    number of bytes. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_slli_si128(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
-///
-/// \param a
-///    A 128-bit integer vector containing the source operand.
-/// \param imm
-///    An immediate value specifying the number of bytes to left-shift operand
-///    \a a.
-/// \returns A 128-bit integer vector containing the left-shifted value.
-#define _mm_slli_si128(a, imm)                                                 \
-  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
-                                                (int)(imm)))
-
-#define _mm_bslli_si128(a, imm)                                                \
-  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
-                                                (int)(imm)))
-
-/// Left-shifts each 16-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to left-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
-                                                            int __count) {
-  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
-}
-
-/// Left-shifts each 16-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to left-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
-                                                           __m128i __count) {
-  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
-}
-
-/// Left-shifts each 32-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to left-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
-                                                            int __count) {
-  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
-}
-
-/// Left-shifts each 32-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to left-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
-                                                           __m128i __count) {
-  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
-}
-
-/// Left-shifts each 64-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to left-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
-                                                            int __count) {
-  return __builtin_ia32_psllqi128((__v2di)__a, __count);
-}
-
-/// Left-shifts each 64-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to left-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
-                                                           __m128i __count) {
-  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
-}
-
-/// Right-shifts each 16-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. High-order bits are filled with the sign
-///    bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to right-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
-                                                            int __count) {
-  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
-}
-
-/// Right-shifts each 16-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. High-order bits are filled with the sign
-///    bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
-                                                           __m128i __count) {
-  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
-}
-
-/// Right-shifts each 32-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. High-order bits are filled with the sign
-///    bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to right-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
-                                                            int __count) {
-  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
-}
-
-/// Right-shifts each 32-bit value in the 128-bit integer vector operand
-///    by the specified number of bits. High-order bits are filled with the sign
-///    bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
-                                                           __m128i __count) {
-  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
-}
-
-/// Right-shifts the 128-bit integer vector operand by the specified
-///    number of bytes. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_srli_si128(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
-///
-/// \param a
-///    A 128-bit integer vector containing the source operand.
-/// \param imm
-///    An immediate value specifying the number of bytes to right-shift operand
-///    \a a.
-/// \returns A 128-bit integer vector containing the right-shifted value.
-#define _mm_srli_si128(a, imm)                                                 \
-  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
-                                                (int)(imm)))
-
-#define _mm_bsrli_si128(a, imm)                                                \
-  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
-                                                (int)(imm)))
-
-/// Right-shifts each of 16-bit values in the 128-bit integer vector
-///    operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to right-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
-                                                            int __count) {
-  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
-}
-
-/// Right-shifts each of 16-bit values in the 128-bit integer vector
-///    operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
-                                                           __m128i __count) {
-  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
-}
-
-/// Right-shifts each of 32-bit values in the 128-bit integer vector
-///    operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to right-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
-                                                            int __count) {
-  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
-}
-
-/// Right-shifts each of 32-bit values in the 128-bit integer vector
-///    operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
-                                                           __m128i __count) {
-  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
-}
-
-/// Right-shifts each of 64-bit values in the 128-bit integer vector
-///    operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    An integer value specifying the number of bits to right-shift each value
-///    in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
-                                                            int __count) {
-  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
-}
-
-/// Right-shifts each of 64-bit values in the 128-bit integer vector
-///    operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the source operand.
-/// \param __count
-///    A 128-bit integer vector in which bits [63:0] specify the number of bits
-///    to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
-                                                           __m128i __count) {
-  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
-}
-
-/// Compares each of the corresponding 8-bit values of the 128-bit
-///    integer vectors for equality.
-///
-///    Each comparison returns 0x0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
-                                                            __m128i __b) {
-  return (__m128i)((__v16qi)__a == (__v16qi)__b);
-}
-
-/// Compares each of the corresponding 16-bit values of the 128-bit
-///    integer vectors for equality.
-///
-///    Each comparison returns 0x0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)((__v8hi)__a == (__v8hi)__b);
-}
-
-/// Compares each of the corresponding 32-bit values of the 128-bit
-///    integer vectors for equality.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)((__v4si)__a == (__v4si)__b);
-}
-
-/// Compares each of the corresponding signed 8-bit values of the 128-bit
-///    integer vectors to determine if the values in the first operand are
-///    greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
-                                                            __m128i __b) {
-  /* This function always performs a signed comparison, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m128i)((__v16qs)__a > (__v16qs)__b);
-}
-
-/// Compares each of the corresponding signed 16-bit values of the
-///    128-bit integer vectors to determine if the values in the first operand
-///    are greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)((__v8hi)__a > (__v8hi)__b);
-}
-
-/// Compares each of the corresponding signed 32-bit values of the
-///    128-bit integer vectors to determine if the values in the first operand
-///    are greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)((__v4si)__a > (__v4si)__b);
-}
-
-/// Compares each of the corresponding signed 8-bit values of the 128-bit
-///    integer vectors to determine if the values in the first operand are less
-///    than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
-                                                            __m128i __b) {
-  return _mm_cmpgt_epi8(__b, __a);
-}
-
-/// Compares each of the corresponding signed 16-bit values of the
-///    128-bit integer vectors to determine if the values in the first operand
-///    are less than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
-                                                             __m128i __b) {
-  return _mm_cmpgt_epi16(__b, __a);
-}
-
-/// Compares each of the corresponding signed 32-bit values of the
-///    128-bit integer vectors to determine if the values in the first operand
-///    are less than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \param __b
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
-                                                             __m128i __b) {
-  return _mm_cmpgt_epi32(__b, __a);
-}
-
-#ifdef __x86_64__
-/// Converts a 64-bit signed integer value from the second operand into a
-///    double-precision value and returns it in the lower element of a [2 x
-///    double] vector; the upper element of the returned vector is copied from
-///    the upper element of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
-///    copied to the upper 64 bits of the destination.
-/// \param __b
-///    A 64-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-///    converted value of the second operand. The upper 64 bits are copied from
-///    the upper 64 bits of the first operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
-                                                            long long __b) {
-  __a[0] = __b;
-  return __a;
-}
-
-/// Converts the first (lower) element of a vector of [2 x double] into a
-///    64-bit signed integer value.
-///
-///    If the converted value does not fit in a 64-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-///    conversion.
-/// \returns A 64-bit signed integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
-  return __builtin_ia32_cvtsd2si64((__v2df)__a);
-}
-
-/// Converts the first (lower) element of a vector of [2 x double] into a
-///    64-bit signed truncated (rounded toward zero) integer value.
-///
-///    If a converted value does not fit in a 64-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-///    conversion.
-/// \returns A 64-bit signed integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
-  return __builtin_ia32_cvttsd2si64((__v2df)__a);
-}
-#endif
-
-/// Converts a vector of [4 x i32] into a vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \returns A 128-bit vector of [4 x float] containing the converted values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
-  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
-}
-
-/// Converts a vector of [4 x float] into a vector of [4 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit integer vector of [4 x i32] containing the converted
-///    values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
-  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
-}
-
-/// Converts a vector of [4 x float] into four signed truncated (rounded toward
-///    zero) 32-bit integers, returned in a vector of [4 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x i32] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
-  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
-}
-
-/// Returns a vector of [4 x i32] where the lowest element is the input
-///    operand and the remaining elements are zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __a
-///    A 32-bit signed integer operand.
-/// \returns A 128-bit vector of [4 x i32].
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
-  return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
-}
-
-/// Returns a vector of [2 x i64] where the lower element is the input
-///    operand and the upper element is zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
-/// in 64-bit mode.
-///
-/// \param __a
-///    A 64-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [2 x i64] containing the converted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
-  return __extension__(__m128i)(__v2di){__a, 0};
-}
-
-/// Moves the least significant 32 bits of a vector of [4 x i32] to a
-///    32-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __a
-///    A vector of [4 x i32]. The least significant 32 bits are moved to the
-///    destination.
-/// \returns A 32-bit signed integer containing the moved value.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
-  __v4si __b = (__v4si)__a;
-  return __b[0];
-}
-
-/// Moves the least significant 64 bits of a vector of [2 x i64] to a
-///    64-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __a
-///    A vector of [2 x i64]. The least significant 64 bits are moved to the
-///    destination.
-/// \returns A 64-bit signed integer containing the moved value.
-static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
-  return __a[0];
-}
-
-/// Moves packed integer values from an aligned 128-bit memory location
-///    to elements in a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
-///
-/// \param __p
-///    An aligned pointer to a memory location containing integer values.
-/// \returns A 128-bit integer vector containing the moved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_load_si128(__m128i const *__p) {
-  return *__p;
-}
-
-/// Moves packed integer values from an unaligned 128-bit memory location
-///    to elements in a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location containing integer values.
-/// \returns A 128-bit integer vector containing the moved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_loadu_si128(__m128i_u const *__p) {
-  struct __loadu_si128 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_si128 *)__p)->__v;
-}
-
-/// Returns a vector of [2 x i64] where the lower element is taken from
-///    the lower element of the operand, and the upper element is zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __p
-///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
-///    the destination.
-/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
-///    moved value. The higher order bits are cleared.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_loadl_epi64(__m128i_u const *__p) {
-  struct __mm_loadl_epi64_struct {
-    long long __u;
-  } __attribute__((__packed__, __may_alias__));
-  return __extension__(__m128i){
-      ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
-}
-
-/// Generates a 128-bit vector of [4 x i32] with unspecified content.
-///    This could be used as an argument to another intrinsic function where the
-///    argument is required but the value is not actually used.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 128-bit vector of [4 x i32] with unspecified content.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
-  return (__m128i)__builtin_ia32_undef128();
-}
-
-/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
-///    the specified 64-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __q1
-///    A 64-bit integer value used to initialize the upper 64 bits of the
-///    destination vector of [2 x i64].
-/// \param __q0
-///    A 64-bit integer value used to initialize the lower 64 bits of the
-///    destination vector of [2 x i64].
-/// \returns An initialized 128-bit vector of [2 x i64] containing the values
-///    provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
-                                                            long long __q0) {
-  return __extension__(__m128i)(__v2di){__q0, __q1};
-}
-
-/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
-///    the specified 64-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __q1
-///    A 64-bit integer value used to initialize the upper 64 bits of the
-///    destination vector of [2 x i64].
-/// \param __q0
-///    A 64-bit integer value used to initialize the lower 64 bits of the
-///    destination vector of [2 x i64].
-/// \returns An initialized 128-bit vector of [2 x i64] containing the values
-///    provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
-                                                           __m64 __q0) {
-  return _mm_set_epi64x((long long)__q1, (long long)__q0);
-}
-
-/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
-///    the specified 32-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __i3
-///    A 32-bit integer value used to initialize bits [127:96] of the
-///    destination vector.
-/// \param __i2
-///    A 32-bit integer value used to initialize bits [95:64] of the destination
-///    vector.
-/// \param __i1
-///    A 32-bit integer value used to initialize bits [63:32] of the destination
-///    vector.
-/// \param __i0
-///    A 32-bit integer value used to initialize bits [31:0] of the destination
-///    vector.
-/// \returns An initialized 128-bit vector of [4 x i32] containing the values
-///    provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
-                                                           int __i1, int __i0) {
-  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
-}
-
-/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
-///    the specified 16-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __w7
-///    A 16-bit integer value used to initialize bits [127:112] of the
-///    destination vector.
-/// \param __w6
-///    A 16-bit integer value used to initialize bits [111:96] of the
-///    destination vector.
-/// \param __w5
-///    A 16-bit integer value used to initialize bits [95:80] of the destination
-///    vector.
-/// \param __w4
-///    A 16-bit integer value used to initialize bits [79:64] of the destination
-///    vector.
-/// \param __w3
-///    A 16-bit integer value used to initialize bits [63:48] of the destination
-///    vector.
-/// \param __w2
-///    A 16-bit integer value used to initialize bits [47:32] of the destination
-///    vector.
-/// \param __w1
-///    A 16-bit integer value used to initialize bits [31:16] of the destination
-///    vector.
-/// \param __w0
-///    A 16-bit integer value used to initialize bits [15:0] of the destination
-///    vector.
-/// \returns An initialized 128-bit vector of [8 x i16] containing the values
-///    provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
-              short __w2, short __w1, short __w0) {
-  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
-                                        __w4, __w5, __w6, __w7};
-}
-
-/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
-///    the specified 8-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __b15
-///    Initializes bits [127:120] of the destination vector.
-/// \param __b14
-///    Initializes bits [119:112] of the destination vector.
-/// \param __b13
-///    Initializes bits [111:104] of the destination vector.
-/// \param __b12
-///    Initializes bits [103:96] of the destination vector.
-/// \param __b11
-///    Initializes bits [95:88] of the destination vector.
-/// \param __b10
-///    Initializes bits [87:80] of the destination vector.
-/// \param __b9
-///    Initializes bits [79:72] of the destination vector.
-/// \param __b8
-///    Initializes bits [71:64] of the destination vector.
-/// \param __b7
-///    Initializes bits [63:56] of the destination vector.
-/// \param __b6
-///    Initializes bits [55:48] of the destination vector.
-/// \param __b5
-///    Initializes bits [47:40] of the destination vector.
-/// \param __b4
-///    Initializes bits [39:32] of the destination vector.
-/// \param __b3
-///    Initializes bits [31:24] of the destination vector.
-/// \param __b2
-///    Initializes bits [23:16] of the destination vector.
-/// \param __b1
-///    Initializes bits [15:8] of the destination vector.
-/// \param __b0
-///    Initializes bits [7:0] of the destination vector.
-/// \returns An initialized 128-bit vector of [16 x i8] containing the values
-///    provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
-             char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
-             char __b4, char __b3, char __b2, char __b1, char __b0) {
-  return __extension__(__m128i)(__v16qi){
-      __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
-      __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
-}
-
-/// Initializes both values in a 128-bit integer vector with the
-///    specified 64-bit integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __q
-///    Integer value used to initialize the elements of the destination integer
-///    vector.
-/// \returns An initialized 128-bit integer vector of [2 x i64] with both
-///    elements containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
-  return _mm_set_epi64x(__q, __q);
-}
-
-/// Initializes both values in a 128-bit vector of [2 x i64] with the
-///    specified 64-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __q
-///    A 64-bit value used to initialize the elements of the destination integer
-///    vector.
-/// \returns An initialized 128-bit vector of [2 x i64] with all elements
-///    containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
-  return _mm_set_epi64(__q, __q);
-}
-
-/// Initializes all values in a 128-bit vector of [4 x i32] with the
-///    specified 32-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __i
-///    A 32-bit value used to initialize the elements of the destination integer
-///    vector.
-/// \returns An initialized 128-bit vector of [4 x i32] with all elements
-///    containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
-  return _mm_set_epi32(__i, __i, __i, __i);
-}
-
-/// Initializes all values in a 128-bit vector of [8 x i16] with the
-///    specified 16-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __w
-///    A 16-bit value used to initialize the elements of the destination integer
-///    vector.
-/// \returns An initialized 128-bit vector of [8 x i16] with all elements
-///    containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
-  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
-}
-
-/// Initializes all values in a 128-bit vector of [16 x i8] with the
-///    specified 8-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __b
-///    An 8-bit value used to initialize the elements of the destination integer
-///    vector.
-/// \returns An initialized 128-bit vector of [16 x i8] with all elements
-///    containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
-  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
-                      __b, __b, __b, __b, __b);
-}
-
-/// Constructs a 128-bit integer vector, initialized in reverse order
-///     with the specified 64-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __q0
-///    A 64-bit integral value used to initialize the lower 64 bits of the
-///    result.
-/// \param __q1
-///    A 64-bit integral value used to initialize the upper 64 bits of the
-///    result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
-                                                            __m64 __q1) {
-  return _mm_set_epi64(__q1, __q0);
-}
-
-/// Constructs a 128-bit integer vector, initialized in reverse order
-///     with the specified 32-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __i0
-///    A 32-bit integral value used to initialize bits [31:0] of the result.
-/// \param __i1
-///    A 32-bit integral value used to initialize bits [63:32] of the result.
-/// \param __i2
-///    A 32-bit integral value used to initialize bits [95:64] of the result.
-/// \param __i3
-///    A 32-bit integral value used to initialize bits [127:96] of the result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
-                                                            int __i2,
-                                                            int __i3) {
-  return _mm_set_epi32(__i3, __i2, __i1, __i0);
-}
-
-/// Constructs a 128-bit integer vector, initialized in reverse order
-///     with the specified 16-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __w0
-///    A 16-bit integral value used to initialize bits [15:0] of the result.
-/// \param __w1
-///    A 16-bit integral value used to initialize bits [31:16] of the result.
-/// \param __w2
-///    A 16-bit integral value used to initialize bits [47:32] of the result.
-/// \param __w3
-///    A 16-bit integral value used to initialize bits [63:48] of the result.
-/// \param __w4
-///    A 16-bit integral value used to initialize bits [79:64] of the result.
-/// \param __w5
-///    A 16-bit integral value used to initialize bits [95:80] of the result.
-/// \param __w6
-///    A 16-bit integral value used to initialize bits [111:96] of the result.
-/// \param __w7
-///    A 16-bit integral value used to initialize bits [127:112] of the result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
-               short __w5, short __w6, short __w7) {
-  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
-}
-
-/// Constructs a 128-bit integer vector, initialized in reverse order
-///     with the specified 8-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __b0
-///    An 8-bit integral value used to initialize bits [7:0] of the result.
-/// \param __b1
-///    An 8-bit integral value used to initialize bits [15:8] of the result.
-/// \param __b2
-///    An 8-bit integral value used to initialize bits [23:16] of the result.
-/// \param __b3
-///    An 8-bit integral value used to initialize bits [31:24] of the result.
-/// \param __b4
-///    An 8-bit integral value used to initialize bits [39:32] of the result.
-/// \param __b5
-///    An 8-bit integral value used to initialize bits [47:40] of the result.
-/// \param __b6
-///    An 8-bit integral value used to initialize bits [55:48] of the result.
-/// \param __b7
-///    An 8-bit integral value used to initialize bits [63:56] of the result.
-/// \param __b8
-///    An 8-bit integral value used to initialize bits [71:64] of the result.
-/// \param __b9
-///    An 8-bit integral value used to initialize bits [79:72] of the result.
-/// \param __b10
-///    An 8-bit integral value used to initialize bits [87:80] of the result.
-/// \param __b11
-///    An 8-bit integral value used to initialize bits [95:88] of the result.
-/// \param __b12
-///    An 8-bit integral value used to initialize bits [103:96] of the result.
-/// \param __b13
-///    An 8-bit integral value used to initialize bits [111:104] of the result.
-/// \param __b14
-///    An 8-bit integral value used to initialize bits [119:112] of the result.
-/// \param __b15
-///    An 8-bit integral value used to initialize bits [127:120] of the result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
-              char __b6, char __b7, char __b8, char __b9, char __b10,
-              char __b11, char __b12, char __b13, char __b14, char __b15) {
-  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
-                      __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
-}
-
-/// Creates a 128-bit integer vector initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
-///
-/// \returns An initialized 128-bit integer vector with all elements set to
-///    zero.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
-  return __extension__(__m128i)(__v2di){0LL, 0LL};
-}
-
-/// Stores a 128-bit integer vector to a memory location aligned on a
-///    128-bit boundary.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
-///
-/// \param __p
-///    A pointer to an aligned memory location that will receive the integer
-///    values.
-/// \param __b
-///    A 128-bit integer vector containing the values to be moved.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
-                                                          __m128i __b) {
-  *__p = __b;
-}
-
-/// Stores a 128-bit integer vector to an unaligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the integer values.
-/// \param __b
-///    A 128-bit integer vector containing the values to be moved.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
-                                                           __m128i __b) {
-  struct __storeu_si128 {
-    __m128i_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_si128 *)__p)->__v = __b;
-}
-
-/// Stores a 64-bit integer value from the low element of a 128-bit integer
-///    vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __p
-///    A pointer to a 64-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \param __b
-///    A 128-bit integer vector containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
-                                                          __m128i __b) {
-  struct __storeu_si64 {
-    long long __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
-}
-
-/// Stores a 32-bit integer value from the low element of a 128-bit integer
-///    vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __p
-///    A pointer to a 32-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \param __b
-///    A 128-bit integer vector containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
-                                                          __m128i __b) {
-  struct __storeu_si32 {
-    int __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
-}
-
-/// Stores a 16-bit integer value from the low element of a 128-bit integer
-///    vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __p
-///    A pointer to a 16-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \param __b
-///    A 128-bit integer vector containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
-                                                          __m128i __b) {
-  struct __storeu_si16 {
-    short __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
-}
-
-/// Moves bytes selected by the mask from the first operand to the
-///    specified unaligned memory location. When a mask bit is 1, the
-///    corresponding byte is written, otherwise it is not written.
-///
-///    To minimize caching, the data is flagged as non-temporal (unlikely to be
-///    used again soon). Exception and trap behavior for elements not selected
-///    for storage to memory are implementation dependent.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
-///   instruction.
-///
-/// \param __d
-///    A 128-bit integer vector containing the values to be moved.
-/// \param __n
-///    A 128-bit integer vector containing the mask. The most significant bit of
-///    each byte represents the mask bits.
-/// \param __p
-///    A pointer to an unaligned 128-bit memory location where the specified
-///    values are moved.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
-                                                              __m128i __n,
-                                                              char *__p) {
-  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
-}
-
-/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
-///    a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 64-bit memory location that will receive the lower 64 bits
-///    of the integer vector parameter.
-/// \param __a
-///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
-///    value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
-                                                           __m128i __a) {
-  struct __mm_storel_epi64_struct {
-    long long __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
-}
-
-/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
-///    aligned memory location.
-///
-///    To minimize caching, the data is flagged as non-temporal (unlikely to be
-///    used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
-///
-/// \param __p
-///    A pointer to the 128-bit aligned memory location used to store the value.
-/// \param __a
-///    A vector of [2 x double] containing the 64-bit values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
-                                                        __m128d __a) {
-  __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
-}
-
-/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
-///
-///    To minimize caching, the data is flagged as non-temporal (unlikely to be
-///    used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
-///
-/// \param __p
-///    A pointer to the 128-bit aligned memory location used to store the value.
-/// \param __a
-///    A 128-bit integer vector containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
-                                                           __m128i __a) {
-  __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
-}
-
-/// Stores a 32-bit integer value in the specified memory location.
-///
-///    To minimize caching, the data is flagged as non-temporal (unlikely to be
-///    used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
-///
-/// \param __p
-///    A pointer to the 32-bit memory location used to store the value.
-/// \param __a
-///    A 32-bit integer containing the value to be stored.
-static __inline__ void
-    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si32(void *__p, int __a) {
-  __builtin_ia32_movnti((int *)__p, __a);
-}
-
-#ifdef __x86_64__
-/// Stores a 64-bit integer value in the specified memory location.
-///
-///    To minimize caching, the data is flagged as non-temporal (unlikely to be
-///    used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
-///
-/// \param __p
-///    A pointer to the 64-bit memory location used to store the value.
-/// \param __a
-///    A 64-bit integer containing the value to be stored.
-static __inline__ void
-    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si64(void *__p, long long __a) {
-  __builtin_ia32_movnti64((long long *)__p, __a);
-}
-#endif
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// The cache line containing \a __p is flushed and invalidated from all
-///    caches in the coherency domain.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
-///
-/// \param __p
-///    A pointer to the memory location used to identify the cache line to be
-///    flushed.
-void _mm_clflush(void const *__p);
-
-/// Forces strong memory ordering (serialization) between load
-///    instructions preceding this instruction and load instructions following
-///    this instruction, ensuring the system completes all previous loads before
-///    executing subsequent loads.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
-///
-void _mm_lfence(void);
-
-/// Forces strong memory ordering (serialization) between load and store
-///    instructions preceding this instruction and load and store instructions
-///    following this instruction, ensuring that the system completes all
-///    previous memory accesses before executing subsequent memory accesses.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
-///
-void _mm_mfence(void);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
-
-/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
-///    vector operands into 8-bit signed integers, and packs the results into
-///    the destination.
-///
-///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
-///    less than 0x80 are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
-///
-/// \param __a
-///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
-///   written to the lower 64 bits of the result.
-/// \param __b
-///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
-///   written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
-///    vector operands into 16-bit signed integers, and packs the results into
-///    the destination.
-///
-///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
-///    values less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
-///    are written to the lower 64 bits of the result.
-/// \param __b
-///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
-///    are written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
-                                                             __m128i __b) {
-  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
-}
-
-/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
-///    vector operands into 8-bit unsigned integers, and packs the results into
-///    the destination.
-///
-///    Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
-///    are saturated to 0x00.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
-///    written to the lower 64 bits of the result.
-/// \param __b
-///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
-///    written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
-                                                              __m128i __b) {
-  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
-///    the immediate-value parameter as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_extract_epi16(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
-///
-/// \param a
-///    A 128-bit integer vector.
-/// \param imm
-///    An immediate value. Bits [2:0] selects values from \a a to be assigned
-///    to bits[15:0] of the result. \n
-///    000: assign values from bits [15:0] of \a a. \n
-///    001: assign values from bits [31:16] of \a a. \n
-///    010: assign values from bits [47:32] of \a a. \n
-///    011: assign values from bits [63:48] of \a a. \n
-///    100: assign values from bits [79:64] of \a a. \n
-///    101: assign values from bits [95:80] of \a a. \n
-///    110: assign values from bits [111:96] of \a a. \n
-///    111: assign values from bits [127:112] of \a a.
-/// \returns An integer, whose lower 16 bits are selected from the 128-bit
-///    integer vector parameter and the remaining bits are assigned zeros.
-#define _mm_extract_epi16(a, imm)                                              \
-  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
-                                                    (int)(imm)))
-
-/// Constructs a 128-bit integer vector by first making a copy of the
-///    128-bit integer vector parameter, and then inserting the lower 16 bits
-///    of an integer parameter into an offset specified by the immediate-value
-///    parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
-///
-/// \param a
-///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
-///    result and then one of the eight elements in the result is replaced by
-///    the lower 16 bits of \a b.
-/// \param b
-///    An integer. The lower 16 bits of this parameter are written to the
-///    result beginning at an offset specified by \a imm.
-/// \param imm
-///    An immediate value specifying the bit offset in the result at which the
-///    lower 16 bits of \a b are written.
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi16(a, b, imm)                                            \
-  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
-                                        (int)(imm)))
-
-/// Copies the values of the most significant bits from each 8-bit
-///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
-///    value, zero-extends the value, and writes it to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the values with bits to be extracted.
-/// \returns The most significant bits from each 8-bit element in \a __a,
-///    written to bits [15:0]. The other bits are assigned zeros.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
-  return __builtin_ia32_pmovmskb128((__v16qi)__a);
-}
-
-/// Constructs a 128-bit integer vector by shuffling four 32-bit
-///    elements of a 128-bit integer vector parameter, using the immediate-value
-///    parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
-///
-/// \param a
-///    A 128-bit integer vector containing the values to be copied.
-/// \param imm
-///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from a. The destinations within the 128-bit destination are assigned
-///    values as follows: \n
-///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
-///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
-///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
-///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
-///    Bit value assignments: \n
-///    00: assign values from bits [31:0] of \a a. \n
-///    01: assign values from bits [63:32] of \a a. \n
-///    10: assign values from bits [95:64] of \a a. \n
-///    11: assign values from bits [127:96] of \a a. \n
-///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
-///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
-///    <c>[b6, b4, b2, b0]</c>.
-/// \returns A 128-bit integer vector containing the shuffled values.
-#define _mm_shuffle_epi32(a, imm)                                              \
-  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
-
-/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
-///    elements of a 128-bit integer vector of [8 x i16], using the immediate
-///    value parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
-///
-/// \param a
-///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
-///    [127:64] of the result.
-/// \param imm
-///    An 8-bit immediate value specifying which elements to copy from \a a. \n
-///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
-///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
-///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
-///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
-///    Bit value assignments: \n
-///    00: assign values from bits [15:0] of \a a. \n
-///    01: assign values from bits [31:16] of \a a. \n
-///    10: assign values from bits [47:32] of \a a. \n
-///    11: assign values from bits [63:48] of \a a. \n
-///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
-///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
-///    <c>[b6, b4, b2, b0]</c>.
-/// \returns A 128-bit integer vector containing the shuffled values.
-#define _mm_shufflelo_epi16(a, imm)                                            \
-  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
-
-/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
-///    elements of a 128-bit integer vector of [8 x i16], using the immediate
-///    value parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
-///
-/// \param a
-///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
-///    [63:0] of the result.
-/// \param imm
-///    An 8-bit immediate value specifying which elements to copy from \a a. \n
-///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
-///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
-///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
-///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
-///    Bit value assignments: \n
-///    00: assign values from bits [79:64] of \a a. \n
-///    01: assign values from bits [95:80] of \a a. \n
-///    10: assign values from bits [111:96] of \a a. \n
-///    11: assign values from bits [127:112] of \a a. \n
-///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
-///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
-///    <c>[b6, b4, b2, b0]</c>.
-/// \returns A 128-bit integer vector containing the shuffled values.
-#define _mm_shufflehi_epi16(a, imm)                                            \
-  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
-
-/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
-///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [16 x i8].
-///    Bits [71:64] are written to bits [7:0] of the result. \n
-///    Bits [79:72] are written to bits [23:16] of the result. \n
-///    Bits [87:80] are written to bits [39:32] of the result. \n
-///    Bits [95:88] are written to bits [55:48] of the result. \n
-///    Bits [103:96] are written to bits [71:64] of the result. \n
-///    Bits [111:104] are written to bits [87:80] of the result. \n
-///    Bits [119:112] are written to bits [103:96] of the result. \n
-///    Bits [127:120] are written to bits [119:112] of the result.
-/// \param __b
-///    A 128-bit vector of [16 x i8]. \n
-///    Bits [71:64] are written to bits [15:8] of the result. \n
-///    Bits [79:72] are written to bits [31:24] of the result. \n
-///    Bits [87:80] are written to bits [47:40] of the result. \n
-///    Bits [95:88] are written to bits [63:56] of the result. \n
-///    Bits [103:96] are written to bits [79:72] of the result. \n
-///    Bits [111:104] are written to bits [95:88] of the result. \n
-///    Bits [119:112] are written to bits [111:104] of the result. \n
-///    Bits [127:120] are written to bits [127:120] of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
-                                                               __m128i __b) {
-  return (__m128i)__builtin_shufflevector(
-      (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
-      16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
-}
-
-/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
-///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16].
-///    Bits [79:64] are written to bits [15:0] of the result. \n
-///    Bits [95:80] are written to bits [47:32] of the result. \n
-///    Bits [111:96] are written to bits [79:64] of the result. \n
-///    Bits [127:112] are written to bits [111:96] of the result.
-/// \param __b
-///    A 128-bit vector of [8 x i16].
-///    Bits [79:64] are written to bits [31:16] of the result. \n
-///    Bits [95:80] are written to bits [63:48] of the result. \n
-///    Bits [111:96] are written to bits [95:80] of the result. \n
-///    Bits [127:112] are written to bits [127:112] of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
-                                                                __m128i __b) {
-  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
-                                          8 + 5, 6, 8 + 6, 7, 8 + 7);
-}
-
-/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
-///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x i32]. \n
-///    Bits [95:64] are written to bits [31:0] of the destination. \n
-///    Bits [127:96] are written to bits [95:64] of the destination.
-/// \param __b
-///    A 128-bit vector of [4 x i32]. \n
-///    Bits [95:64] are written to bits [64:32] of the destination. \n
-///    Bits [127:96] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
-                                                                __m128i __b) {
-  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
-                                          4 + 3);
-}
-
-/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
-///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x i64]. \n
-///    Bits [127:64] are written to bits [63:0] of the destination.
-/// \param __b
-///    A 128-bit vector of [2 x i64]. \n
-///    Bits [127:64] are written to bits [127:64] of the destination.
-/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
-                                                                __m128i __b) {
-  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
-}
-
-/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
-///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [16 x i8]. \n
-///    Bits [7:0] are written to bits [7:0] of the result. \n
-///    Bits [15:8] are written to bits [23:16] of the result. \n
-///    Bits [23:16] are written to bits [39:32] of the result. \n
-///    Bits [31:24] are written to bits [55:48] of the result. \n
-///    Bits [39:32] are written to bits [71:64] of the result. \n
-///    Bits [47:40] are written to bits [87:80] of the result. \n
-///    Bits [55:48] are written to bits [103:96] of the result. \n
-///    Bits [63:56] are written to bits [119:112] of the result.
-/// \param __b
-///    A 128-bit vector of [16 x i8].
-///    Bits [7:0] are written to bits [15:8] of the result. \n
-///    Bits [15:8] are written to bits [31:24] of the result. \n
-///    Bits [23:16] are written to bits [47:40] of the result. \n
-///    Bits [31:24] are written to bits [63:56] of the result. \n
-///    Bits [39:32] are written to bits [79:72] of the result. \n
-///    Bits [47:40] are written to bits [95:88] of the result. \n
-///    Bits [55:48] are written to bits [111:104] of the result. \n
-///    Bits [63:56] are written to bits [127:120] of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
-                                                               __m128i __b) {
-  return (__m128i)__builtin_shufflevector(
-      (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
-      16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
-}
-
-/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
-///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
-///    [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16].
-///    Bits [15:0] are written to bits [15:0] of the result. \n
-///    Bits [31:16] are written to bits [47:32] of the result. \n
-///    Bits [47:32] are written to bits [79:64] of the result. \n
-///    Bits [63:48] are written to bits [111:96] of the result.
-/// \param __b
-///    A 128-bit vector of [8 x i16].
-///    Bits [15:0] are written to bits [31:16] of the result. \n
-///    Bits [31:16] are written to bits [63:48] of the result. \n
-///    Bits [47:32] are written to bits [95:80] of the result. \n
-///    Bits [63:48] are written to bits [127:112] of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
-                                                                __m128i __b) {
-  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
-                                          8 + 1, 2, 8 + 2, 3, 8 + 3);
-}
-
-/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
-///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x i32]. \n
-///    Bits [31:0] are written to bits [31:0] of the destination. \n
-///    Bits [63:32] are written to bits [95:64] of the destination.
-/// \param __b
-///    A 128-bit vector of [4 x i32]. \n
-///    Bits [31:0] are written to bits [64:32] of the destination. \n
-///    Bits [63:32] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
-                                                                __m128i __b) {
-  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
-                                          4 + 1);
-}
-
-/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
-///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
-///   instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x i64]. \n
-///    Bits [63:0] are written to bits [63:0] of the destination. \n
-/// \param __b
-///    A 128-bit vector of [2 x i64]. \n
-///    Bits [63:0] are written to bits [127:64] of the destination. \n
-/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
-                                                                __m128i __b) {
-  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
-}
-
-/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
-///    integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector operand. The lower 64 bits are moved to the
-///    destination.
-/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
-  return (__m64)__a[0];
-}
-
-/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
-///    upper bits.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
-///
-/// \param __a
-///    A 64-bit value.
-/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
-///    the operand. The upper 64 bits are assigned zeros.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
-  return __extension__(__m128i)(__v2di){(long long)__a, 0};
-}
-
-/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
-///    integer vector, zeroing the upper bits.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __a
-///    A 128-bit integer vector operand. The lower 64 bits are moved to the
-///    destination.
-/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
-///    the operand. The upper 64 bits are assigned zeros.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
-  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
-}
-
-/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
-///    [2 x double] and interleaves them into a 128-bit vector of [2 x
-///    double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. \n
-///    Bits [127:64] are written to bits [63:0] of the destination.
-/// \param __b
-///    A 128-bit vector of [2 x double]. \n
-///    Bits [127:64] are written to bits [127:64] of the destination.
-/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
-                                                             __m128d __b) {
-  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
-}
-
-/// Unpacks the low-order 64-bit elements from two 128-bit vectors
-///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
-///    double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. \n
-///    Bits [63:0] are written to bits [63:0] of the destination.
-/// \param __b
-///    A 128-bit vector of [2 x double]. \n
-///    Bits [63:0] are written to bits [127:64] of the destination.
-/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
-                                                             __m128d __b) {
-  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
-}
-
-/// Extracts the sign bits of the double-precision values in the 128-bit
-///    vector of [2 x double], zero-extends the value, and writes it to the
-///    low-order bits of the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the values with sign bits to
-///    be extracted.
-/// \returns The sign bits from each of the double-precision elements in \a __a,
-///    written to bits [1:0]. The remaining bits are assigned values of zero.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
-  return __builtin_ia32_movmskpd((__v2df)__a);
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double] from two
-///    128-bit vector parameters of [2 x double], using the immediate-value
-///     parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double].
-/// \param b
-///    A 128-bit vector of [2 x double].
-/// \param i
-///    An 8-bit immediate value. The least significant two bits specify which
-///    elements to copy from \a a and \a b: \n
-///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
-///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
-///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
-///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
-///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
-///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
-///    <c>[b1, b0]</c>.
-/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
-#define _mm_shuffle_pd(a, b, i)                                                \
-  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
-                                  (int)(i)))
-
-/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
-///    floating-point vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [2 x double].
-/// \returns A 128-bit floating-point vector of [4 x float] containing the same
-///    bitwise pattern as the parameter.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
-  return (__m128)__a;
-}
-
-/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
-///    integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [2 x double].
-/// \returns A 128-bit integer vector containing the same bitwise pattern as the
-///    parameter.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
-  return (__m128i)__a;
-}
-
-/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
-///    floating-point vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [4 x float].
-/// \returns A 128-bit floating-point vector of [2 x double] containing the same
-///    bitwise pattern as the parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
-  return (__m128d)__a;
-}
-
-/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
-///    integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [4 x float].
-/// \returns A 128-bit integer vector containing the same bitwise pattern as the
-///    parameter.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
-  return (__m128i)__a;
-}
-
-/// Casts a 128-bit integer vector into a 128-bit floating-point vector
-///    of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \returns A 128-bit floating-point vector of [4 x float] containing the same
-///    bitwise pattern as the parameter.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
-  return (__m128)__a;
-}
-
-/// Casts a 128-bit integer vector into a 128-bit floating-point vector
-///    of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit integer vector.
-/// \returns A 128-bit floating-point vector of [2 x double] containing the same
-///    bitwise pattern as the parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
-  return (__m128d)__a;
-}
-
-/// Compares each of the corresponding double-precision values of two
-///    128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double].
-/// \param b
-///    A 128-bit vector of [2 x double].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_pd(a, b, c)                                                    \
-  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
-                                 (c)))
-
-/// Compares each of the corresponding scalar double-precision values of
-///    two 128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double].
-/// \param b
-///    A 128-bit vector of [2 x double].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_sd(a, b, c)                                                    \
-  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
-                                 (c)))
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// Indicates that a spin loop is being executed for the purposes of
-///    optimizing power consumption during the loop.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
-///
-void _mm_pause(void);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
-
-#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
-
-#define _MM_DENORMALS_ZERO_ON (0x0040U)
-#define _MM_DENORMALS_ZERO_OFF (0x0000U)
-
-#define _MM_DENORMALS_ZERO_MASK (0x0040U)
-
-#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
-#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
-  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
-
-#endif /* __EMMINTRIN_H */
diff --git a/third_party/intel/clang/enqcmdintrin.h b/third_party/intel/clang/enqcmdintrin.h
deleted file mode 100644
index 30af67f6b..000000000
--- a/third_party/intel/clang/enqcmdintrin.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <enqcmdintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __ENQCMDINTRIN_H
-#define __ENQCMDINTRIN_H
-
-/* Define the default attributes for the functions in this file */
-#define _DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("enqcmd")))
-
-/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store
-///    data, and performs 64-byte enqueue store to memory pointed by \a __dst.
-///    This intrinsics may only be used in User mode.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsics corresponds to the <c> ENQCMD </c> instruction.
-///
-/// \param __dst
-///    Pointer to the destination of the enqueue store.
-/// \param __src
-///    Pointer to 64-byte command data.
-/// \returns If the command data is successfully written to \a __dst then 0 is
-///    returned. Otherwise 1 is returned.
-static __inline__ int _DEFAULT_FN_ATTRS
-_enqcmd (void *__dst, const void *__src)
-{
-  return __builtin_ia32_enqcmd(__dst, __src);
-}
-
-/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store
-///    data, and performs 64-byte enqueue store to memory pointed by \a __dst
-///    This intrinsic may only be used in Privileged mode.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsics corresponds to the <c> ENQCMDS </c> instruction.
-///
-/// \param __dst
-///    Pointer to the destination of the enqueue store.
-/// \param __src
-///    Pointer to 64-byte command data.
-/// \returns If the command data is successfully written to \a __dst then 0 is
-///    returned. Otherwise 1 is returned.
-static __inline__ int _DEFAULT_FN_ATTRS
-_enqcmds (void *__dst, const void *__src)
-{
-  return __builtin_ia32_enqcmds(__dst, __src);
-}
-
-#undef _DEFAULT_FN_ATTRS
-
-#endif /* __ENQCMDINTRIN_H */
diff --git a/third_party/intel/clang/f16cintrin.h b/third_party/intel/clang/f16cintrin.h
deleted file mode 100644
index 94a662c1d..000000000
--- a/third_party/intel/clang/f16cintrin.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __IMMINTRIN_H
-#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __F16CINTRIN_H
-#define __F16CINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 \
-  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 \
-  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
-
-/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
- * but that's because icc can emulate these without f16c using a library call.
- * Since we don't do that let's leave these in f16cintrin.h.
- */
-
-/// Converts a 16-bit half-precision float value into a 32-bit float
-///    value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
-///
-/// \param __a
-///    A 16-bit half-precision float value.
-/// \returns The converted 32-bit float value.
-static __inline float __DEFAULT_FN_ATTRS128
-_cvtsh_ss(unsigned short __a)
-{
-  __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
-  __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
-  return __r[0];
-}
-
-/// Converts a 32-bit single-precision float value to a 16-bit
-///    half-precision float value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned short _cvtss_sh(float a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
-///
-/// \param a
-///    A 32-bit single-precision float value to be converted to a 16-bit
-///    half-precision float value.
-/// \param imm
-///    An immediate value controlling rounding using bits [2:0]: \n
-///    000: Nearest \n
-///    001: Down \n
-///    010: Up \n
-///    011: Truncate \n
-///    1XX: Use MXCSR.RC for rounding
-/// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm) __extension__ ({ \
-  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
-                                                     (imm)))[0]); })
-
-/// Converts a 128-bit vector containing 32-bit float values into a
-///    128-bit vector containing 16-bit half-precision float values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
-///
-/// \param a
-///    A 128-bit vector containing 32-bit float values.
-/// \param imm
-///    An immediate value controlling rounding using bits [2:0]: \n
-///    000: Nearest \n
-///    001: Down \n
-///    010: Up \n
-///    011: Truncate \n
-///    1XX: Use MXCSR.RC for rounding
-/// \returns A 128-bit vector containing converted 16-bit half-precision float
-///    values. The lower 64 bits are used to store the converted 16-bit
-///    half-precision floating-point values.
-#define _mm_cvtps_ph(a, imm) \
-  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
-
-/// Converts a 128-bit vector containing 16-bit half-precision float
-///    values into a 128-bit vector containing 32-bit float values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector containing 16-bit half-precision float values. The lower
-///    64 bits are used in the conversion.
-/// \returns A 128-bit vector of [4 x float] containing converted float values.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtph_ps(__m128i __a)
-{
-  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
-}
-
-/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
-///    containing 16-bit half-precision float values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
-///
-/// \param a
-///    A 256-bit vector containing 32-bit single-precision float values to be
-///    converted to 16-bit half-precision float values.
-/// \param imm
-///    An immediate value controlling rounding using bits [2:0]: \n
-///    000: Nearest \n
-///    001: Down \n
-///    010: Up \n
-///    011: Truncate \n
-///    1XX: Use MXCSR.RC for rounding
-/// \returns A 128-bit vector containing the converted 16-bit half-precision
-///    float values.
-#define _mm256_cvtps_ph(a, imm) \
- ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
-
-/// Converts a 128-bit vector containing 16-bit half-precision float
-///    values into a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector containing 16-bit half-precision float values to be
-///    converted to 32-bit single-precision float values.
-/// \returns A vector of [8 x float] containing the converted 32-bit
-///    single-precision float values.
-static __inline __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtph_ps(__m128i __a)
-{
-  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __F16CINTRIN_H */
diff --git a/third_party/intel/clang/fma4intrin.h b/third_party/intel/clang/fma4intrin.h
deleted file mode 100644
index 7ff69d96d..000000000
--- a/third_party/intel/clang/fma4intrin.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __FMA4INTRIN_H
-#define __FMA4INTRIN_H
-
-#include "pmmintrin.h"
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __FMA4INTRIN_H */
diff --git a/third_party/intel/clang/fmaintrin.h b/third_party/intel/clang/fmaintrin.h
deleted file mode 100644
index 22d1a780b..000000000
--- a/third_party/intel/clang/fmaintrin.h
+++ /dev/null
@@ -1,796 +0,0 @@
-/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __FMAINTRIN_H
-#define __FMAINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
-
-/// Computes a multiply-add of 128-bit vectors of [4 x float].
-///    For each element, computes <c> (__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADD213PS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the addend.
-/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-/// Computes a multiply-add of 128-bit vectors of [2 x double].
-///    For each element, computes <c> (__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADD213PD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the addend.
-/// \returns A 128-bit [2 x double] vector containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-/// Computes a scalar multiply-add of the single-precision values in the
-///    low 32 bits of 128-bit vectors of [4 x float].
-///
-/// \code{.operation}
-/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
-/// result[127:32] = __A[127:32]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADD213SS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand in the low
-///    32 bits.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier in the low
-///    32 bits.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the addend in the low
-///    32 bits.
-/// \returns A 128-bit vector of [4 x float] containing the result in the low
-///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-/// Computes a scalar multiply-add of the double-precision values in the
-///    low 64 bits of 128-bit vectors of [2 x double].
-///
-/// \code{.operation}
-/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
-/// result[127:64] = __A[127:64]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADD213SD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand in the low
-///    64 bits.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier in the low
-///    64 bits.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the addend in the low
-///    64 bits.
-/// \returns A 128-bit vector of [2 x double] containing the result in the low
-///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
-///    For each element, computes <c> (__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the subtrahend.
-/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
-///    For each element, computes <c> (__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the addend.
-/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-/// Computes a scalar multiply-subtract of the single-precision values in
-///    the low 32 bits of 128-bit vectors of [4 x float].
-///
-/// \code{.operation}
-/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
-/// result[127:32] = __A[127:32]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand in the low
-///    32 bits.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier in the low
-///    32 bits.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the subtrahend in the low
-///   32 bits.
-/// \returns A 128-bit vector of [4 x float] containing the result in the low
-///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-/// Computes a scalar multiply-subtract of the double-precision values in
-///    the low 64 bits of 128-bit vectors of [2 x double].
-///
-/// \code{.operation}
-/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
-/// result[127:64] = __A[127:64]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand in the low
-///    64 bits.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier in the low
-///    64 bits.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the subtrahend in the low
-///    64 bits.
-/// \returns A 128-bit vector of [2 x double] containing the result in the low
-///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
-///    For each element, computes <c> -(__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the addend.
-/// \returns A 128-bit [4 x float] vector containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
-///    For each element, computes <c> -(__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the addend.
-/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-/// Computes a scalar negated multiply-add of the single-precision values in
-///    the low 32 bits of 128-bit vectors of [4 x float].
-///
-/// \code{.operation}
-/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
-/// result[127:32] = __A[127:32]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand in the low
-///    32 bits.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier in the low
-///    32 bits.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the addend in the low
-///    32 bits.
-/// \returns A 128-bit vector of [4 x float] containing the result in the low
-///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
-}
-
-/// Computes a scalar negated multiply-add of the double-precision values
-///    in the low 64 bits of 128-bit vectors of [2 x double].
-///
-/// \code{.operation}
-/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
-/// result[127:64] = __A[127:64]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand in the low
-///    64 bits.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier in the low
-///    64 bits.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the addend in the low
-///    64 bits.
-/// \returns A 128-bit vector of [2 x double] containing the result in the low
-///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
-}
-
-/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
-///    For each element, computes <c> -(__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the subtrahend.
-/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
-///    For each element, computes <c> -(__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the subtrahend.
-/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-/// Computes a scalar negated multiply-subtract of the single-precision
-///    values in the low 32 bits of 128-bit vectors of [4 x float].
-///
-/// \code{.operation}
-/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
-/// result[127:32] = __A[127:32]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand in the low
-///    32 bits.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier in the low
-///    32 bits.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the subtrahend in the low
-///    32 bits.
-/// \returns A 128-bit vector of [4 x float] containing the result in the low
-///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
-}
-
-/// Computes a scalar negated multiply-subtract of the double-precision
-///    values in the low 64 bits of 128-bit vectors of [2 x double].
-///
-/// \code{.operation}
-/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
-/// result[127:64] = __A[127:64]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand in the low
-///    64 bits.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier in the low
-///    64 bits.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the subtrahend in the low
-///    64 bits.
-/// \returns A 128-bit vector of [2 x double] containing the result in the low
-///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
-}
-
-/// Computes a multiply with alternating add/subtract of 128-bit vectors of
-///    [4 x float].
-///
-/// \code{.operation}
-/// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
-/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
-/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
-/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
-/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-/// Computes a multiply with alternating add/subtract of 128-bit vectors of
-///    [2 x double].
-///
-/// \code{.operation}
-/// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
-/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
-/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-/// Computes a multiply with alternating add/subtract of 128-bit vectors of
-///    [4 x float].
-///
-/// \code{.operation}
-/// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
-/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
-/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
-/// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x float] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [4 x float] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
-/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-/// Computes a multiply with alternating add/subtract of 128-bit vectors of
-///    [2 x double].
-///
-/// \code{.operation}
-/// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
-/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
-///
-/// \param __A
-///    A 128-bit vector of [2 x double] containing the multiplicand.
-/// \param __B
-///    A 128-bit vector of [2 x double] containing the multiplier.
-/// \param __C
-///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
-/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-/// Computes a multiply-add of 256-bit vectors of [8 x float].
-///    For each element, computes <c> (__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADD213PS instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [8 x float] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [8 x float] containing the addend.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-/// Computes a multiply-add of 256-bit vectors of [4 x double].
-///    For each element, computes <c> (__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADD213PD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x double] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [4 x double] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [4 x double] containing the addend.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
-///    For each element, computes <c> (__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [8 x float] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [8 x float] containing the subtrahend.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
-///    For each element, computes <c> (__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x double] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [4 x double] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [4 x double] containing the subtrahend.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
-///    For each element, computes <c> -(__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [8 x float] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [8 x float] containing the addend.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
-///    For each element, computes <c> -(__A * __B) + __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x double] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [4 x double] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [4 x double] containing the addend.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
-///    For each element, computes <c> -(__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [8 x float] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [8 x float] containing the subtrahend.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
-///    For each element, computes <c> -(__A * __B) - __C </c>.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x double] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [4 x double] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [4 x double] containing the subtrahend.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-/// Computes a multiply with alternating add/subtract of 256-bit vectors of
-///    [8 x float].
-///
-/// \code{.operation}
-/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
-/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
-/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
-/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
-/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
-/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
-/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
-/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [8 x float] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-/// Computes a multiply with alternating add/subtract of 256-bit vectors of
-///    [4 x double].
-///
-/// \code{.operation}
-/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
-/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
-/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
-/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x double] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [4 x double] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-/// Computes a vector multiply with alternating add/subtract of 256-bit
-///    vectors of [8 x float].
-///
-/// \code{.operation}
-/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
-/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
-/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
-/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
-/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
-/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
-/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
-/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x float] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [8 x float] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
-/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-/// Computes a vector multiply with alternating add/subtract of 256-bit
-///    vectors of [4 x double].
-///
-/// \code{.operation}
-/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
-/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
-/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
-/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x double] containing the multiplicand.
-/// \param __B
-///    A 256-bit vector of [4 x double] containing the multiplier.
-/// \param __C
-///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
-/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __FMAINTRIN_H */
diff --git a/third_party/intel/clang/fxsrintrin.h b/third_party/intel/clang/fxsrintrin.h
deleted file mode 100644
index afee6aa97..000000000
--- a/third_party/intel/clang/fxsrintrin.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __FXSRINTRIN_H
-#define __FXSRINTRIN_H
-
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
-
-/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
-///    memory region pointed to by the input parameter \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
-///
-/// \param __p
-///    A pointer to a 512-byte memory region. The beginning of this memory
-///    region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave(void *__p)
-{
-  __builtin_ia32_fxsave(__p);
-}
-
-/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
-///    memory region pointed to by the input parameter \a __p. The contents of
-///    this memory region should have been written to by a previous \c _fxsave
-///    or \c _fxsave64 intrinsic.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
-///
-/// \param __p
-///    A pointer to a 512-byte memory region. The beginning of this memory
-///    region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor(void *__p)
-{
-  __builtin_ia32_fxrstor(__p);
-}
-
-#ifdef __x86_64__
-/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
-///    memory region pointed to by the input parameter \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
-///
-/// \param __p
-///    A pointer to a 512-byte memory region. The beginning of this memory
-///    region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave64(void *__p)
-{
-  __builtin_ia32_fxsave64(__p);
-}
-
-/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
-///    memory region pointed to by the input parameter \a __p. The contents of
-///    this memory region should have been written to by a previous \c _fxsave
-///    or \c _fxsave64 intrinsic.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
-///
-/// \param __p
-///    A pointer to a 512-byte memory region. The beginning of this memory
-///    region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor64(void *__p)
-{
-  __builtin_ia32_fxrstor64(__p);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/gfniintrin.h b/third_party/intel/clang/gfniintrin.h
deleted file mode 100644
index 73b04a824..000000000
--- a/third_party/intel/clang/gfniintrin.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __GFNIINTRIN_H
-#define __GFNIINTRIN_H
-
-/* Default attributes for simple form (no masking). */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("gfni,no-evex512"), __min_vector_width__(128)))
-
-/* Default attributes for YMM unmasked form. */
-#define __DEFAULT_FN_ATTRS_Y                                                   \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx,gfni,no-evex512"),                            \
-                 __min_vector_width__(256)))
-
-/* Default attributes for ZMM unmasked forms. */
-#define __DEFAULT_FN_ATTRS_Z                                                   \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512f,evex512,gfni"),                           \
-                 __min_vector_width__(512)))
-/* Default attributes for ZMM masked forms. */
-#define __DEFAULT_FN_ATTRS_Z_MASK                                              \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512bw,evex512,gfni"),                          \
-                 __min_vector_width__(512)))
-
-/* Default attributes for VLX masked forms. */
-#define __DEFAULT_FN_ATTRS_VL128                                               \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512bw,avx512vl,gfni,no-evex512"),              \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_VL256                                               \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512bw,avx512vl,gfni,no-evex512"),              \
-                 __min_vector_width__(256)))
-
-#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
-  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
-                                                   (__v16qi)(__m128i)(B), \
-                                                   (char)(I)))
-
-#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
-  ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
-                                                   (__v16qi)(__m128i)(B), \
-                                                   (char)(I)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
-              (__v16qi) __B);
-}
-
-#ifdef __AVXINTRIN_H
-#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
-  ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
-                                                   (__v32qi)(__m256i)(B), \
-                                                   (char)(I)))
-
-#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
-  ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
-                                                   (__v32qi)(__m256i)(B), \
-                                                   (char)(I)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
-_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
-              (__v32qi) __B);
-}
-#endif /* __AVXINTRIN_H */
-
-#ifdef __AVX512BWINTRIN_H
-#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
-  ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
-                                                   (__v64qi)(__m512i)(B), \
-                                                   (char)(I)))
-
-#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
-         (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
-         (__v64qi)(__m512i)(S)))
-
-#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
-         U, A, B, I)
-
-#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
-  ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
-                                                   (__v64qi)(__m512i)(B), \
-                                                   (char)(I)))
-
-#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
-         (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
-         (__v64qi)(__m512i)(S)))
-
-#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
-         U, A, B, I)
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
-_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
-              (__v64qi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
-_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_selectb_512(__U,
-              (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
-              (__v64qi) __S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
-_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
-{
-  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
-              __U, __A, __B);
-}
-#endif /* __AVX512BWINTRIN_H */
-
-#ifdef __AVX512VLBWINTRIN_H
-#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
-         (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
-         (__v16qi)(__m128i)(S)))
-
-#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
-         U, A, B, I)
-
-#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
-         (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
-         (__v32qi)(__m256i)(S)))
-
-#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
-         U, A, B, I)
-
-#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
-         (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
-         (__v16qi)(__m128i)(S)))
-
-#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
-
-#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
-         (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
-         (__v32qi)(__m256i)(S)))
-
-#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
-         U, A, B, I)
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
-_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_selectb_128(__U,
-              (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
-              (__v16qi) __S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
-_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
-              __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
-_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_selectb_256(__U,
-              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
-              (__v32qi) __S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
-_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
-              __U, __A, __B);
-}
-#endif /* __AVX512VLBWINTRIN_H */
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_Y
-#undef __DEFAULT_FN_ATTRS_Z
-#undef __DEFAULT_FN_ATTRS_VL128
-#undef __DEFAULT_FN_ATTRS_VL256
-
-#endif /* __GFNIINTRIN_H */
-
diff --git a/third_party/intel/clang/hresetintrin.h b/third_party/intel/clang/hresetintrin.h
deleted file mode 100644
index 646f6c130..000000000
--- a/third_party/intel/clang/hresetintrin.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __X86GPRINTRIN_H
-#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
-#endif
-
-#ifndef __HRESETINTRIN_H
-#define __HRESETINTRIN_H
-
-#if __has_extension(gnu_asm)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("hreset")))
-
-/// Provides a hint to the processor to selectively reset the prediction
-///    history of the current logical processor specified by a 32-bit integer
-///    value \a __eax.
-///
-/// This intrinsic corresponds to the <c> HRESET </c> instruction.
-///
-/// \code{.operation}
-///    IF __eax == 0
-///      // nop
-///    ELSE
-///      FOR i := 0 to 31
-///        IF __eax[i]
-///          ResetPredictionFeature(i)
-///        FI
-///      ENDFOR
-///    FI
-/// \endcode
-static __inline void __DEFAULT_FN_ATTRS
-_hreset(int __eax)
-{
-  __asm__ ("hreset $0" :: "a"(__eax));
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __has_extension(gnu_asm) */
-
-#endif /* __HRESETINTRIN_H */
diff --git a/third_party/intel/clang/ia32intrin.h b/third_party/intel/clang/ia32intrin.h
deleted file mode 100644
index 8e65f232a..000000000
--- a/third_party/intel/clang/ia32intrin.h
+++ /dev/null
@@ -1,863 +0,0 @@
-/* ===-------- ia32intrin.h ---------------------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __IA32INTRIN_H
-#define __IA32INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
-#else
-#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
-#endif
-
-/// Finds the first set bit starting from the least significant bit. The result
-///    is undefined if the input is 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BSF instruction or the
-///    \c TZCNT instruction.
-///
-/// \param __A
-///    A 32-bit integer operand.
-/// \returns A 32-bit integer containing the bit number.
-/// \see _bit_scan_forward
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsfd(int __A) {
-  return __builtin_ctz((unsigned int)__A);
-}
-
-/// Finds the first set bit starting from the most significant bit. The result
-///    is undefined if the input is 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BSR instruction or the
-///    \c LZCNT instruction and an \c XOR.
-///
-/// \param __A
-///    A 32-bit integer operand.
-/// \returns A 32-bit integer containing the bit number.
-/// \see _bit_scan_reverse
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsrd(int __A) {
-  return 31 - __builtin_clz((unsigned int)__A);
-}
-
-/// Swaps the bytes in the input, converting little endian to big endian or
-///    vice versa.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BSWAP instruction.
-///
-/// \param __A
-///    A 32-bit integer operand.
-/// \returns A 32-bit integer containing the swapped bytes.
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bswapd(int __A) {
-  return (int)__builtin_bswap32((unsigned int)__A);
-}
-
-/// Swaps the bytes in the input, converting little endian to big endian or
-///    vice versa.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BSWAP instruction.
-///
-/// \param __A
-///    A 32-bit integer operand.
-/// \returns A 32-bit integer containing the swapped bytes.
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-_bswap(int __A) {
-  return (int)__builtin_bswap32((unsigned int)__A);
-}
-
-/// Finds the first set bit starting from the least significant bit. The result
-///    is undefined if the input is 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _bit_scan_forward(int A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BSF instruction or the
-///    \c TZCNT instruction.
-///
-/// \param A
-///    A 32-bit integer operand.
-/// \returns A 32-bit integer containing the bit number.
-/// \see __bsfd
-#define _bit_scan_forward(A) __bsfd((A))
-
-/// Finds the first set bit starting from the most significant bit. The result
-///    is undefined if the input is 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _bit_scan_reverse(int A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BSR instruction or the
-///    \c LZCNT instruction and an \c XOR.
-///
-/// \param A
-///    A 32-bit integer operand.
-/// \returns A 32-bit integer containing the bit number.
-/// \see __bsrd
-#define _bit_scan_reverse(A) __bsrd((A))
-
-#ifdef __x86_64__
-/// Finds the first set bit starting from the least significant bit. The result
-///    is undefined if the input is 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BSF instruction or the
-///    \c TZCNT instruction.
-///
-/// \param __A
-///    A 64-bit integer operand.
-/// \returns A 32-bit integer containing the bit number.
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsfq(long long __A) {
-  return (long long)__builtin_ctzll((unsigned long long)__A);
-}
-
-/// Finds the first set bit starting from the most significant bit. The result
-///    is undefined if input is 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BSR instruction or the
-///    \c LZCNT instruction and an \c XOR.
-///
-/// \param __A
-///    A 64-bit integer operand.
-/// \returns A 32-bit integer containing the bit number.
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsrq(long long __A) {
-  return 63 - __builtin_clzll((unsigned long long)__A);
-}
-
-/// Swaps the bytes in the input, converting little endian to big endian or
-///    vice versa.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c BSWAP instruction.
-///
-/// \param __A
-///    A 64-bit integer operand.
-/// \returns A 64-bit integer containing the swapped bytes.
-/// \see _bswap64
-static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__bswapq(long long __A) {
-  return (long long)__builtin_bswap64((unsigned long long)__A);
-}
-
-/// Swaps the bytes in the input, converting little endian to big endian or
-///    vice versa.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// long long _bswap64(long long A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c BSWAP instruction.
-///
-/// \param A
-///    A 64-bit integer operand.
-/// \returns A 64-bit integer containing the swapped bytes.
-/// \see __bswapq
-#define _bswap64(A) __bswapq((A))
-#endif /* __x86_64__ */
-
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c POPCNT instruction or a
-///    sequence of arithmetic and logic operations to calculate it.
-///
-/// \param __A
-///    An unsigned 32-bit integer operand.
-/// \returns A 32-bit integer containing the number of bits with value 1 in the
-///    source operand.
-/// \see _popcnt32
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__popcntd(unsigned int __A)
-{
-  return __builtin_popcount(__A);
-}
-
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _popcnt32(int A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c POPCNT instruction or a
-///    sequence of arithmetic and logic operations to calculate it.
-///
-/// \param A
-///    An unsigned 32-bit integer operand.
-/// \returns A 32-bit integer containing the number of bits with value 1 in the
-///    source operand.
-/// \see __popcntd
-#define _popcnt32(A) __popcntd((A))
-
-#ifdef __x86_64__
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c POPCNT instruction or a
-///    sequence of arithmetic and logic operations to calculate it.
-///
-/// \param __A
-///    An unsigned 64-bit integer operand.
-/// \returns A 64-bit integer containing the number of bits with value 1 in the
-///    source operand.
-/// \see _popcnt64
-static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__popcntq(unsigned long long __A)
-{
-  return __builtin_popcountll(__A);
-}
-
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// long long _popcnt64(unsigned long long A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c POPCNT instruction or a
-///    sequence of arithmetic and logic operations to calculate it.
-///
-/// \param A
-///    An unsigned 64-bit integer operand.
-/// \returns A 64-bit integer containing the number of bits with value 1 in the
-///    source operand.
-/// \see __popcntq
-#define _popcnt64(A) __popcntq((A))
-#endif /* __x86_64__ */
-
-#ifdef __x86_64__
-/// Returns the program status-and-control \c RFLAGS register with the \c VM
-///    and \c RF flags cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PUSHFQ + \c POP instruction sequence.
-///
-/// \returns The 64-bit value of the RFLAGS register.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__readeflags(void)
-{
-  return __builtin_ia32_readeflags_u64();
-}
-
-/// Writes the specified value to the program status-and-control \c RFLAGS
-///    register. Reserved bits are not affected.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PUSH + \c POPFQ instruction sequence.
-///
-/// \param __f
-///    The 64-bit value to write to \c RFLAGS.
-static __inline__ void __DEFAULT_FN_ATTRS
-__writeeflags(unsigned long long __f)
-{
-  __builtin_ia32_writeeflags_u64(__f);
-}
-
-#else /* !__x86_64__ */
-/// Returns the program status-and-control \c EFLAGS register with the \c VM
-///    and \c RF flags cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PUSHFD + \c POP instruction sequence.
-///
-/// \returns The 32-bit value of the EFLAGS register.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__readeflags(void)
-{
-  return __builtin_ia32_readeflags_u32();
-}
-
-/// Writes the specified value to the program status-and-control \c EFLAGS
-///    register. Reserved bits are not affected.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PUSH + \c POPFD instruction sequence.
-///
-/// \param __f
-///    The 32-bit value to write to \c EFLAGS.
-static __inline__ void __DEFAULT_FN_ATTRS
-__writeeflags(unsigned int __f)
-{
-  __builtin_ia32_writeeflags_u32(__f);
-}
-#endif /* !__x86_64__ */
-
-/// Casts a 32-bit float value to a 32-bit unsigned integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VMOVD / \c MOVD instruction in x86_64,
-///    and corresponds to the \c VMOVL / \c MOVL instruction in ia32.
-///
-/// \param __A
-///    A 32-bit float value.
-/// \returns A 32-bit unsigned integer containing the converted value.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
-_castf32_u32(float __A) {
-  return __builtin_bit_cast(unsigned int, __A);
-}
-
-/// Casts a 64-bit float value to a 64-bit unsigned integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
-///    and corresponds to the \c VMOVL / \c MOVL instruction in ia32.
-///
-/// \param __A
-///    A 64-bit float value.
-/// \returns A 64-bit unsigned integer containing the converted value.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
-_castf64_u64(double __A) {
-  return __builtin_bit_cast(unsigned long long, __A);
-}
-
-/// Casts a 32-bit unsigned integer value to a 32-bit float value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
-///    and corresponds to the \c FLDS instruction in ia32.
-///
-/// \param __A
-///    A 32-bit unsigned integer value.
-/// \returns A 32-bit float value containing the converted value.
-static __inline__ float __DEFAULT_FN_ATTRS_CAST
-_castu32_f32(unsigned int __A) {
-  return __builtin_bit_cast(float, __A);
-}
-
-/// Casts a 64-bit unsigned integer value to a 64-bit float value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
-///    and corresponds to the \c FLDL instruction in ia32.
-///
-/// \param __A
-///    A 64-bit unsigned integer value.
-/// \returns A 64-bit float value containing the converted value.
-static __inline__ double __DEFAULT_FN_ATTRS_CAST
-_castu64_f64(unsigned long long __A) {
-  return __builtin_bit_cast(double, __A);
-}
-
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-///     unsigned char operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c CRC32B instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a  __D.
-/// \param __D
-///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
-__crc32b(unsigned int __C, unsigned char __D)
-{
-  return __builtin_ia32_crc32qi(__C, __D);
-}
-
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-///    unsigned short operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c CRC32W instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a  __D.
-/// \param __D
-///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
-__crc32w(unsigned int __C, unsigned short __D)
-{
-  return __builtin_ia32_crc32hi(__C, __D);
-}
-
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-///    second unsigned integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c CRC32D instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a  __D.
-/// \param __D
-///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
-__crc32d(unsigned int __C, unsigned int __D)
-{
-  return __builtin_ia32_crc32si(__C, __D);
-}
-
-#ifdef __x86_64__
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-///    unsigned 64-bit integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c CRC32Q instruction.
-///
-/// \param __C
-///    An unsigned integer operand to add to the CRC-32C checksum of operand
-///    \a  __D.
-/// \param __D
-///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-///    operand \a __D.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
-__crc32q(unsigned long long __C, unsigned long long __D)
-{
-  return __builtin_ia32_crc32di(__C, __D);
-}
-#endif /* __x86_64__ */
-
-/// Reads the specified performance-monitoring counter. Refer to your
-///    processor's documentation to determine which performance counters are
-///    supported.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c RDPMC instruction.
-///
-/// \param __A
-///    The performance counter to read.
-/// \returns The 64-bit value read from the performance counter.
-/// \see _rdpmc
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__rdpmc(int __A) {
-  return __builtin_ia32_rdpmc(__A);
-}
-
-/// Reads the processor's time-stamp counter and the \c IA32_TSC_AUX MSR
-///    \c (0xc0000103).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c RDTSCP instruction.
-///
-/// \param __A
-///    The address of where to store the 32-bit \c IA32_TSC_AUX value.
-/// \returns The 64-bit value of the time-stamp counter.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__rdtscp(unsigned int *__A) {
-  return __builtin_ia32_rdtscp(__A);
-}
-
-/// Reads the processor's time-stamp counter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _rdtsc();
-/// \endcode
-///
-/// This intrinsic corresponds to the \c RDTSC instruction.
-///
-/// \returns The 64-bit value of the time-stamp counter.
-#define _rdtsc() __rdtsc()
-
-/// Reads the specified performance monitoring counter. Refer to your
-///    processor's documentation to determine which performance counters are
-///    supported.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _rdpmc(int A);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c RDPMC instruction.
-///
-/// \param A
-///    The performance counter to read.
-/// \returns The 64-bit value read from the performance counter.
-/// \see __rdpmc
-#define _rdpmc(A) __rdpmc(A)
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_wbinvd(void) {
-  __builtin_ia32_wbinvd();
-}
-
-/// Rotates an 8-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param __X
-///    The unsigned 8-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
-__rolb(unsigned char __X, int __C) {
-  return __builtin_rotateleft8(__X, __C);
-}
-
-/// Rotates an 8-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param __X
-///    The unsigned 8-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
-__rorb(unsigned char __X, int __C) {
-  return __builtin_rotateright8(__X, __C);
-}
-
-/// Rotates a 16-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param __X
-///    The unsigned 16-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see _rotwl
-static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
-__rolw(unsigned short __X, int __C) {
-  return __builtin_rotateleft16(__X, __C);
-}
-
-/// Rotates a 16-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param __X
-///    The unsigned 16-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see _rotwr
-static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
-__rorw(unsigned short __X, int __C) {
-  return __builtin_rotateright16(__X, __C);
-}
-
-/// Rotates a 32-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param __X
-///    The unsigned 32-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see _rotl
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
-__rold(unsigned int __X, int __C) {
-  return __builtin_rotateleft32(__X, (unsigned int)__C);
-}
-
-/// Rotates a 32-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param __X
-///    The unsigned 32-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see _rotr
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
-__rord(unsigned int __X, int __C) {
-  return __builtin_rotateright32(__X, (unsigned int)__C);
-}
-
-#ifdef __x86_64__
-/// Rotates a 64-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param __X
-///    The unsigned 64-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__rolq(unsigned long long __X, int __C) {
-  return __builtin_rotateleft64(__X, (unsigned long long)__C);
-}
-
-/// Rotates a 64-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param __X
-///    The unsigned 64-bit value to be rotated.
-/// \param __C
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__rorq(unsigned long long __X, int __C) {
-  return __builtin_rotateright64(__X, (unsigned long long)__C);
-}
-#endif /* __x86_64__ */
-
-#ifndef _MSC_VER
-/* These are already provided as builtins for MSVC. */
-/* Select the correct function based on the size of long. */
-#ifdef __LP64__
-/// Rotates a 64-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _lrotl(unsigned long long a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param a
-///    The unsigned 64-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rolq
-#define _lrotl(a,b) __rolq((a), (b))
-
-/// Rotates a 64-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _lrotr(unsigned long long a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param a
-///    The unsigned 64-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rorq
-#define _lrotr(a,b) __rorq((a), (b))
-#else // __LP64__
-/// Rotates a 32-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _lrotl(unsigned int a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param a
-///    The unsigned 32-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rold
-#define _lrotl(a,b) __rold((a), (b))
-
-/// Rotates a 32-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _lrotr(unsigned int a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param a
-///    The unsigned 32-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rord
-#define _lrotr(a,b) __rord((a), (b))
-#endif // __LP64__
-
-/// Rotates a 32-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _rotl(unsigned int a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param a
-///    The unsigned 32-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rold
-#define _rotl(a,b) __rold((a), (b))
-
-/// Rotates a 32-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _rotr(unsigned int a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param a
-///    The unsigned 32-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rord
-#define _rotr(a,b) __rord((a), (b))
-#endif // _MSC_VER
-
-/* These are not builtins so need to be provided in all modes. */
-/// Rotates a 16-bit value to the left by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned short _rotwl(unsigned short a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROL instruction.
-///
-/// \param a
-///    The unsigned 16-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rolw
-#define _rotwl(a,b) __rolw((a), (b))
-
-/// Rotates a 16-bit value to the right by the specified number of bits.
-///    This operation is undefined if the number of bits exceeds the size of
-///    the value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned short _rotwr(unsigned short a, int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c ROR instruction.
-///
-/// \param a
-///    The unsigned 16-bit value to be rotated.
-/// \param b
-///    The number of bits to rotate the value.
-/// \returns The rotated value.
-/// \see __rorw
-#define _rotwr(a,b) __rorw((a), (b))
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_CAST
-#undef __DEFAULT_FN_ATTRS_CRC32
-#undef __DEFAULT_FN_ATTRS_CONSTEXPR
-
-#endif /* __IA32INTRIN_H */
diff --git a/third_party/intel/clang/immintrin.h b/third_party/intel/clang/immintrin.h
deleted file mode 100644
index a0b08a1e2..000000000
--- a/third_party/intel/clang/immintrin.h
+++ /dev/null
@@ -1,747 +0,0 @@
-/*===---- immintrin.h - Intel intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#define __IMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "x86gprintrin.h"
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
-#include "mmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
-#include "xmmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
-#include "emmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
-#include "pmmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
-#include "tmmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__SSE4_2__) || defined(__SSE4_1__))
-#include "smmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AES__) || defined(__PCLMUL__))
-#include "wmmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
-#include "clflushoptintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
-#include "clwbintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
-#include "avxintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
-#include "avx2intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
-#include "f16cintrin.h"
-#endif
-
-/* No feature check desired due to internal checks */
-#include "bmiintrin.h"
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
-#include "bmi2intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
-#include "lzcntintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
-#include "popcntintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
-#include "fmaintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
-#include "avx512fintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
-#include "avx512vlintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
-#include "avx512bwintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
-#include "avx512bitalgintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
-#include "avx512cdintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
-#include "avx512vpopcntdqintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
-#include "avx512vpopcntdqvlintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
-#include "avx512vnniintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
-#include "avx512vlvnniintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
-#include "avxvnniintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
-#include "avx512dqintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
-#include "avx512vlbitalgintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512BW__))
-#include "avx512vlbwintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512CD__))
-#include "avx512vlcdintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512DQ__))
-#include "avx512vldqintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
-#include "avx512ifmaintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
-#include "avx512ifmavlintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
-#include "avxifmaintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
-#include "avx512vbmiintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
-#include "avx512vbmivlintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
-#include "avx512vbmi2intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
-#include "avx512vlvbmi2intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
-#include "avx512fp16intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512FP16__))
-#include "avx512vlfp16intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
-#include "avx512bf16intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512BF16__))
-#include "avx512vlbf16intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
-#include "pkuintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
-#include "vpclmulqdqintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
-#include "vaesintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
-#include "gfniintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
-#include "avxvnniint8intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
-#include "avxneconvertintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
-#include "sha512intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
-#include "sm3intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
-#include "sm4intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
-#include "avxvnniint16intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
-/// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDPID </c> instruction.
-///
-/// \returns The 32-bit contents of the MSR.
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
-_rdpid_u32(void) {
-  return __builtin_ia32_rdpid();
-}
-#endif // __RDPID__
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
-/// Returns a 16-bit hardware-generated random value.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
-///
-/// \param __p
-///    A pointer to a 16-bit memory location to place the random value.
-/// \returns 1 if the value was successfully generated, 0 otherwise.
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand16_step(unsigned short *__p)
-{
-  return (int)__builtin_ia32_rdrand16_step(__p);
-}
-
-/// Returns a 32-bit hardware-generated random value.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
-///
-/// \param __p
-///    A pointer to a 32-bit memory location to place the random value.
-/// \returns 1 if the value was successfully generated, 0 otherwise.
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand32_step(unsigned int *__p)
-{
-  return (int)__builtin_ia32_rdrand32_step(__p);
-}
-
-/// Returns a 64-bit hardware-generated random value.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
-///
-/// \param __p
-///    A pointer to a 64-bit memory location to place the random value.
-/// \returns 1 if the value was successfully generated, 0 otherwise.
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
-#ifdef __x86_64__
-  return (int)__builtin_ia32_rdrand64_step(__p);
-#else
-  // We need to emulate the functionality of 64-bit rdrand with 2 32-bit
-  // rdrand instructions.
-  unsigned int __lo, __hi;
-  unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
-  unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
-  if (__res_lo && __res_hi) {
-    *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
-    return 1;
-  } else {
-    *__p = 0;
-    return 0;
-  }
-#endif
-}
-#endif /* __RDRND__ */
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
-#ifdef __x86_64__
-/// Reads the FS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
-///
-/// \returns The lower 32 bits of the FS base register.
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readfsbase_u32(void)
-{
-  return __builtin_ia32_rdfsbase32();
-}
-
-/// Reads the FS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
-///
-/// \returns The contents of the FS base register.
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readfsbase_u64(void)
-{
-  return __builtin_ia32_rdfsbase64();
-}
-
-/// Reads the GS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
-///
-/// \returns The lower 32 bits of the GS base register.
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readgsbase_u32(void)
-{
-  return __builtin_ia32_rdgsbase32();
-}
-
-/// Reads the GS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
-///
-/// \returns The contents of the GS base register.
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readgsbase_u64(void)
-{
-  return __builtin_ia32_rdgsbase64();
-}
-
-/// Modifies the FS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
-///
-/// \param __V
-///    Value to use for the lower 32 bits of the FS base register.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writefsbase_u32(unsigned int __V)
-{
-  __builtin_ia32_wrfsbase32(__V);
-}
-
-/// Modifies the FS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
-///
-/// \param __V
-///    Value to use for the FS base register.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writefsbase_u64(unsigned long long __V)
-{
-  __builtin_ia32_wrfsbase64(__V);
-}
-
-/// Modifies the GS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
-///
-/// \param __V
-///    Value to use for the lower 32 bits of the GS base register.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writegsbase_u32(unsigned int __V)
-{
-  __builtin_ia32_wrgsbase32(__V);
-}
-
-/// Modifies the GS base register.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
-///
-/// \param __V
-///    Value to use for GS base register.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writegsbase_u64(unsigned long long __V)
-{
-  __builtin_ia32_wrgsbase64(__V);
-}
-
-#endif
-#endif /* __FSGSBASE__ */
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)
-
-/* The structs used below are to force the load/store to be unaligned. This
- * is accomplished with the __packed__ attribute. The __may_alias__ prevents
- * tbaa metadata from being generated based on the struct and the type of the
- * field inside of it.
- */
-
-/// Load a 16-bit value from memory and swap its bytes.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the MOVBE instruction.
-///
-/// \param __P
-///    A pointer to the 16-bit value to load.
-/// \returns The byte-swapped value.
-static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_loadbe_i16(void const * __P) {
-  struct __loadu_i16 {
-    unsigned short __v;
-  } __attribute__((__packed__, __may_alias__));
-  return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
-}
-
-/// Swap the bytes of a 16-bit value and store it to memory.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the MOVBE instruction.
-///
-/// \param __P
-///    A pointer to the memory for storing the swapped value.
-/// \param __D
-///    The 16-bit value to be byte-swapped.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_storebe_i16(void * __P, short __D) {
-  struct __storeu_i16 {
-    unsigned short __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
-}
-
-/// Load a 32-bit value from memory and swap its bytes.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the MOVBE instruction.
-///
-/// \param __P
-///    A pointer to the 32-bit value to load.
-/// \returns The byte-swapped value.
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_loadbe_i32(void const * __P) {
-  struct __loadu_i32 {
-    unsigned int __v;
-  } __attribute__((__packed__, __may_alias__));
-  return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
-}
-
-/// Swap the bytes of a 32-bit value and store it to memory.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the MOVBE instruction.
-///
-/// \param __P
-///    A pointer to the memory for storing the swapped value.
-/// \param __D
-///    The 32-bit value to be byte-swapped.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_storebe_i32(void * __P, int __D) {
-  struct __storeu_i32 {
-    unsigned int __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_i32*)__P)->__v = __builtin_bswap32((unsigned int)__D);
-}
-
-#ifdef __x86_64__
-/// Load a 64-bit value from memory and swap its bytes.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the MOVBE instruction.
-///
-/// \param __P
-///    A pointer to the 64-bit value to load.
-/// \returns The byte-swapped value.
-static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_loadbe_i64(void const * __P) {
-  struct __loadu_i64 {
-    unsigned long long __v;
-  } __attribute__((__packed__, __may_alias__));
-  return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
-}
-
-/// Swap the bytes of a 64-bit value and store it to memory.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the MOVBE instruction.
-///
-/// \param __P
-///    A pointer to the memory for storing the swapped value.
-/// \param __D
-///    The 64-bit value to be byte-swapped.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_storebe_i64(void * __P, long long __D) {
-  struct __storeu_i64 {
-    unsigned long long __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
-}
-#endif
-#endif /* __MOVBE */
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
-#include "rtmintrin.h"
-#include "xtestintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
-#include "shaintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
-#include "fxsrintrin.h"
-#endif
-
-/* No feature check desired due to internal MSC_VER checks */
-#include "xsaveintrin.h"
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
-#include "xsaveoptintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
-#include "xsavecintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
-#include "xsavesintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
-#include "cetintrin.h"
-#endif
-
-/* Intrinsics inside adcintrin.h are available at all times. */
-#include "adcintrin.h"
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
-#include "adxintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
-#include "rdseedintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
-#include "wbnoinvdintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
-#include "cldemoteintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
-#include "waitpkgintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) ||     \
-    defined(__MOVDIR64B__)
-#include "movdirintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
-#include "pconfigintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
-#include "sgxintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
-#include "ptwriteintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
-#include "invpcidintrin.h"
-#endif
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
-#include "amxfp16intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) ||          \
-    defined(__WIDEKL__)
-#include "keylockerintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) ||    \
-    defined(__AMX_INT8__) || defined(__AMX_BF16__)
-#include "amxintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
-#include "amxcomplexintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    defined(__AVX512VP2INTERSECT__)
-#include "avx512vp2intersectintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
-#include "avx512vlvp2intersectintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
-#include "enqcmdintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
-#include "serializeintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
-#include "tsxldtrkintrin.h"
-#endif
-
-#if defined(_MSC_VER) && __has_extension(gnu_asm)
-/* Define the default attributes for these intrinsics */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-#ifdef __cplusplus
-extern "C" {
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange HLE
-\*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
-                       : "+r" (_Value), "+m" (*_Target) :: "memory");
-  return _Value;
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
-                       : "+r" (_Value), "+m" (*_Target) :: "memory");
-  return _Value;
-}
-#endif
-#if defined(__x86_64__)
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
-                       : "+r" (_Value), "+m" (*_Target) :: "memory");
-  return _Value;
-}
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
-                       : "+r" (_Value), "+m" (*_Target) :: "memory");
-  return _Value;
-}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Compare Exchange HLE
-\*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
-                              long _Exchange, long _Comparand) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
-                       : "+a" (_Comparand), "+m" (*_Destination)
-                       : "r" (_Exchange) : "memory");
-  return _Comparand;
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
-                              long _Exchange, long _Comparand) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
-                       : "+a" (_Comparand), "+m" (*_Destination)
-                       : "r" (_Exchange) : "memory");
-  return _Comparand;
-}
-#endif
-#if defined(__x86_64__)
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
-                       : "+a" (_Comparand), "+m" (*_Destination)
-                       : "r" (_Exchange) : "memory");
-  return _Comparand;
-}
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
-                       : "+a" (_Comparand), "+m" (*_Destination)
-                       : "r" (_Exchange) : "memory");
-  return _Comparand;
-}
-#endif
-#ifdef __cplusplus
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */
-
-#endif /* __IMMINTRIN_H */
diff --git a/third_party/intel/clang/invpcidintrin.h b/third_party/intel/clang/invpcidintrin.h
deleted file mode 100644
index 48dae0a86..000000000
--- a/third_party/intel/clang/invpcidintrin.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __INVPCIDINTRIN_H
-#define __INVPCIDINTRIN_H
-
-static __inline__ void
-  __attribute__((__always_inline__, __nodebug__,  __target__("invpcid")))
-_invpcid(unsigned int __type, void *__descriptor) {
-  __builtin_ia32_invpcid(__type, __descriptor);
-}
-
-#endif /* __INVPCIDINTRIN_H */
diff --git a/third_party/intel/clang/keylockerintrin.h b/third_party/intel/clang/keylockerintrin.h
deleted file mode 100644
index f76e91b4d..000000000
--- a/third_party/intel/clang/keylockerintrin.h
+++ /dev/null
@@ -1,527 +0,0 @@
-/*===----------------- keylockerintrin.h - KL Intrinsics -------------------===
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef _KEYLOCKERINTRIN_H
-#define _KEYLOCKERINTRIN_H
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("kl"),\
-                 __min_vector_width__(128)))
-
-/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl
-/// will assigned to EAX, whch specifies the KeySource and whether backing up
-/// the key is permitted. The 256-bit encryption key is loaded from the two
-/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is
-/// loaded from the implicit operand XMM0 which assigned by __intkey.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
-///
-/// \code{.operation}
-/// IF CPL > 0 // LOADKWKEY only allowed at ring 0 (supervisor mode)
-///   GP (0)
-/// FI
-/// IF “LOADIWKEY exiting” VM execution control set
-///   VMexit
-/// FI
-/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
-///   GP (0)
-/// FI
-/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
-///   GP (0)
-/// FI
-/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
-///   GP (0)
-/// FI
-/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
-///   GP (0)
-/// FI
-/// IF (__ctl[4:1] == 0) // KeySource of 0.
-///   IWKey.Encryption Key[127:0] := __enkey_hi[127:0]:
-///   IWKey.Encryption Key[255:128] := __enkey_lo[127:0]
-///   IWKey.IntegrityKey[127:0] := __intkey[127:0]
-///   IWKey.NoBackup := __ctl[0]
-///   IWKey.KeySource := __ctl[4:1]
-///   ZF := 0
-/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
-///   IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
-///     IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0]
-///     IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128]
-///     IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0]
-///     IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
-///     IWKey.NoBackup := __ctl[0]
-///     IWKey.KeySource := __ctl[4:1]
-///     ZF := 0
-///   ELSE // Random data was not returned from RDSEED. IWKey was not loaded
-///     ZF := 1
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
-               __m128i __enkey_lo, __m128i __enkey_hi) {
-  __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
-}
-
-/// Wrap a 128-bit AES key from __key into a key handle and output in
-/// ((__m128i*)__h) to ((__m128i*)__h) + 2  and a 32-bit value as return.
-/// The explicit source operand __htype specifies handle restrictions.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
-///
-/// \code{.operation}
-/// InputKey[127:0] := __key[127:0]
-/// KeyMetadata[2:0] := __htype[2:0]
-/// KeyMetadata[23:3] := 0 // Reserved for future usage
-/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
-/// KeyMetadata[127:28] := 0 // Reserved for future usage
-/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
-///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
-/// dst[0] := IWKey.NoBackup
-/// dst[4:1] := IWKey.KeySource[3:0]
-/// dst[31:5] := 0
-/// MEM[__h+127:__h] := Handle[127:0]   // AAD
-/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
-/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
-/// OF := 0
-/// SF := 0
-/// ZF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
-  return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
-}
-
-/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then
-/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 3 and
-/// a 32-bit value as return.
-/// The explicit source operand __htype specifies handle restrictions.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
-///
-/// \code{.operation}
-/// InputKey[127:0] := __key_lo[127:0]
-/// InputKey[255:128] := __key_hi[255:128]
-/// KeyMetadata[2:0] := __htype[2:0]
-/// KeyMetadata[23:3] := 0 // Reserved for future usage
-/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
-/// KeyMetadata[127:28] := 0 // Reserved for future usage
-/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
-///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
-/// dst[0] := IWKey.NoBackup
-/// dst[4:1] := IWKey.KeySource[3:0]
-/// dst[31:5] := 0
-/// MEM[__h+127:__h]   := Handle[127:0] // AAD
-/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
-/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
-/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
-/// OF := 0
-/// SF := 0
-/// ZF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
-                     void *__h) {
-  return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
-                                         (__v2di)__key_hi, __h);
-}
-
-/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using
-/// the 128-bit key in the handle from the __h. It stores the result in the
-/// __odata. And return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
-/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
-///                    (Handle[127:0] AND (CPL > 0)) ||
-///                    Handle[383:256] ||
-///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
-/// IF (IllegalHandle)
-///   ZF := 1
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-///   IF (Authentic == 0)
-///     ZF := 1
-///   ELSE
-///     MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
-
-/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using
-/// the 256-bit key in the handle from the __h. It stores the result in the
-/// __odata. And return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
-/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
-///                    (Handle[127:0] AND (CPL > 0)) ||
-///                    Handle[255:128] ||
-///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
-/// IF (IllegalHandle)
-///   ZF := 1
-///   MEM[__odata+127:__odata] := 0
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-///   IF (Authentic == 0)
-///     ZF := 1
-///     MEM[__odata+127:__odata] := 0
-///   ELSE
-///     MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
-
-/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using
-/// the 128-bit key in the handle from the __h. It stores the result in the
-/// __odata. And return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
-/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
-///                  (Handle[127:0] AND (CPL > 0)) ||
-///                  Handle[383:256] ||
-///                  HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
-/// IF (IllegalHandle)
-///   ZF := 1
-///   MEM[__odata+127:__odata] := 0
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-///   IF (Authentic == 0)
-///     ZF := 1
-///     MEM[__odata+127:__odata] := 0
-///   ELSE
-///     MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
-
-/// The AESDEC256KL performs 10 rounds of AES to decrypt the __idata using
-/// the 256-bit key in the handle from the __h. It stores the result in the
-/// __odata. And return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle[511:0] := MEM[__h+511:__h]
-/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
-///                   (Handle[127:0] AND (CPL > 0)) ||
-///                   Handle[383:256] ||
-///                   HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
-/// IF (IllegalHandle)
-///   ZF := 1
-///   MEM[__odata+127:__odata] := 0
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-///   IF (Authentic == 0)
-///     ZF := 1
-///     MEM[__odata+127:__odata] := 0
-///   ELSE
-///     MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
-  return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* !defined(__SCE__ || __has_feature(modules) || defined(__KL__) */
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
-                 __min_vector_width__(128)))
-
-/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
-/// at __h and store each resultant block back from __odata to __odata+7. And
-/// return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle := MEM[__h+383:__h]
-/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
-///                    (Handle[127:0] AND (CPL > 0)) ||
-///                    Handle[255:128] ||
-///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
-/// IF (IllegalHandle)
-///   ZF := 1
-///   FOR i := 0 to 7
-///     __odata[i] := 0
-///   ENDFOR
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-///   IF Authentic == 0
-///     ZF := 1
-///     FOR i := 0 to 7
-///       __odata[i] := 0
-///     ENDFOR
-///   ELSE
-///     FOR i := 0 to 7
-///       __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
-///     ENDFOR
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
-                                           (const __v2di *)__idata, __h);
-}
-
-/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
-/// at __h and store each resultant block back from __odata to __odata+7. And
-/// return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle[511:0] := MEM[__h+511:__h]
-/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
-///                    (Handle[127:0] AND (CPL > 0)) ||
-///                    Handle[255:128] ||
-///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 )
-/// IF (IllegalHandle)
-///   ZF := 1
-///   FOR i := 0 to 7
-///     __odata[i] := 0
-///   ENDFOR
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-///   IF Authentic == 0
-///     ZF := 1
-///     FOR i := 0 to 7
-///       __odata[i] := 0
-///     ENDFOR
-///   ELSE
-///     FOR i := 0 to 7
-///       __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
-///     ENDFOR
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
-                                           (const __v2di *)__idata, __h);
-}
-
-/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
-/// at __h and store each resultant block back from __odata to __odata+7. And
-/// return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle[383:0] := MEM[__h+383:__h]
-/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
-///                    (Handle[127:0] AND (CPL > 0)) ||
-///                    Handle[255:128] ||
-///                    HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
-/// IF (IllegalHandle)
-///   ZF := 1
-///   FOR i := 0 to 7
-///     __odata[i] := 0
-///   ENDFOR
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-///   IF Authentic == 0
-///     ZF := 1
-///     FOR i := 0 to 7
-///       __odata[i] := 0
-///     ENDFOR
-///   ELSE
-///     FOR i := 0 to 7
-///       __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
-///     ENDFOR
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
-                                           (const __v2di *)__idata, __h);
-}
-
-/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
-/// at __h and store each resultant block back from __odata to __odata+7. And
-/// return the affected ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
-///
-/// \code{.operation}
-/// Handle[511:0] := MEM[__h+511:__h]
-/// IllegalHandle = ( HandleReservedBitSet (Handle[511:0]) ||
-///                   (Handle[127:0] AND (CPL > 0)) ||
-///                   Handle[255:128] ||
-///                   HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES512 )
-/// If (IllegalHandle)
-///   ZF := 1
-///   FOR i := 0 to 7
-///     __odata[i] := 0
-///   ENDFOR
-/// ELSE
-///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-///   IF Authentic == 0
-///     ZF := 1
-///     FOR i := 0 to 7
-///       __odata[i] := 0
-///     ENDFOR
-///   ELSE
-///     FOR i := 0 to 7
-///       __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
-///     ENDFOR
-///     ZF := 0
-///   FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
-  return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
-                                           (const __v2di *)__idata, __h);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)   \
-        */
-
-#endif /* _KEYLOCKERINTRIN_H */
diff --git a/third_party/intel/clang/lwpintrin.h b/third_party/intel/clang/lwpintrin.h
deleted file mode 100644
index d8ab0db03..000000000
--- a/third_party/intel/clang/lwpintrin.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __LWPINTRIN_H
-#define __LWPINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
-
-/// Parses the LWPCB at the specified address and enables
-///        profiling if valid.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
-///
-/// \param __addr
-///    Address to the new Lightweight Profiling Control Block (LWPCB). If the
-///    LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
-///    Lightweight Profiling.
-static __inline__ void __DEFAULT_FN_ATTRS
-__llwpcb (void *__addr)
-{
-  __builtin_ia32_llwpcb(__addr);
-}
-
-/// Flushes the LWP state to memory and returns the address of the LWPCB.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
-///
-/// \return
-///    Address to the current Lightweight Profiling Control Block (LWPCB).
-///    If LWP is not currently enabled, returns NULL.
-static __inline__ void* __DEFAULT_FN_ATTRS
-__slwpcb (void)
-{
-  return __builtin_ia32_slwpcb();
-}
-
-/// Inserts programmed event record into the LWP event ring buffer
-///        and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
-///
-/// \param DATA2
-///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
-/// \param DATA1
-///    A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-///    A 32-bit immediate value is inserted into the 32-bit Flags field.
-/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
-///    the event record overwrites the last record in the buffer, the MissedEvents
-///    counter in the LWPCB is incremented, the head pointer is not advanced, and
-///    1 is returned. Otherwise 0 is returned.
-#define __lwpins32(DATA2, DATA1, FLAGS) \
-  (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
-                           (unsigned int) (FLAGS)))
-
-/// Decrements the LWP programmed value sample event counter. If the result is
-///        negative, inserts an event record into the LWP event ring buffer in memory
-///        and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
-///
-/// \param DATA2
-///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
-/// \param DATA1
-///    A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-///    A 32-bit immediate value is inserted into the 32-bit Flags field.
-#define __lwpval32(DATA2, DATA1, FLAGS) \
-  (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
-                           (unsigned int) (FLAGS)))
-
-#ifdef __x86_64__
-
-/// Inserts programmed event record into the LWP event ring buffer
-///        and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
-///
-/// \param DATA2
-///    A 64-bit value is inserted into the 64-bit Data2 field.
-/// \param DATA1
-///    A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-///    A 32-bit immediate value is inserted into the 32-bit Flags field.
-/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
-///    the event record overwrites the last record in the buffer, the MissedEvents
-///    counter in the LWPCB is incremented, the head pointer is not advanced, and
-///    1 is returned. Otherwise 0 is returned.
-#define __lwpins64(DATA2, DATA1, FLAGS) \
-  (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
-                           (unsigned int) (FLAGS)))
-
-/// Decrements the LWP programmed value sample event counter. If the result is
-///        negative, inserts an event record into the LWP event ring buffer in memory
-///        and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
-///
-/// \param DATA2
-///    A 64-bit value is and inserted into the 64-bit Data2 field.
-/// \param DATA1
-///    A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-///    A 32-bit immediate value is inserted into the 32-bit Flags field.
-#define __lwpval64(DATA2, DATA1, FLAGS) \
-  (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
-                           (unsigned int) (FLAGS)))
-
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __LWPINTRIN_H */
diff --git a/third_party/intel/clang/lzcntintrin.h b/third_party/intel/clang/lzcntintrin.h
deleted file mode 100644
index f4ddce9d0..000000000
--- a/third_party/intel/clang/lzcntintrin.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __LZCNTINTRIN_H
-#define __LZCNTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
-
-#ifndef _MSC_VER
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-///    An unsigned 16-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 16-bit integer containing the number of leading zero
-///    bits in the operand.
-#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X))
-#endif // _MSC_VER
-
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-///    An unsigned 32-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of leading zero
-///    bits in the operand.
-/// \see _lzcnt_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__lzcnt32(unsigned int __X)
-{
-  return __builtin_ia32_lzcnt_u32(__X);
-}
-
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-///    An unsigned 32-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of leading zero
-///    bits in the operand.
-/// \see __lzcnt32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_lzcnt_u32(unsigned int __X)
-{
-  return __builtin_ia32_lzcnt_u32(__X);
-}
-
-#ifdef __x86_64__
-#ifndef _MSC_VER
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of leading zero
-///    bits in the operand.
-/// \see _lzcnt_u64
-#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X))
-#endif // _MSC_VER
-
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of leading zero
-///    bits in the operand.
-/// \see __lzcnt64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_lzcnt_u64(unsigned long long __X)
-{
-  return __builtin_ia32_lzcnt_u64(__X);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __LZCNTINTRIN_H */
diff --git a/third_party/intel/clang/mm_malloc.h b/third_party/intel/clang/mm_malloc.h
deleted file mode 100644
index d32fe5941..000000000
--- a/third_party/intel/clang/mm_malloc.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __MM_MALLOC_H
-#define __MM_MALLOC_H
-
-#include <stdlib.h>
-
-#ifdef _WIN32
-#include <malloc.h>
-#else
-#ifndef __cplusplus
-extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
-#else
-// Some systems (e.g. those with GNU libc) declare posix_memalign with an
-// exception specifier. Via an "egregious workaround" in
-// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
-// redeclaration of glibc's declaration.
-extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
-#endif
-#endif
-
-#if !(defined(_WIN32) && defined(_mm_malloc))
-static __inline__ void *__attribute__((__always_inline__, __nodebug__,
-                                       __malloc__, __alloc_size__(1),
-                                       __alloc_align__(2)))
-_mm_malloc(size_t __size, size_t __align) {
-  if (__align == 1) {
-    return malloc(__size);
-  }
-
-  if (!(__align & (__align - 1)) && __align < sizeof(void *))
-    __align = sizeof(void *);
-
-  void *__mallocedMemory;
-#if defined(__MINGW32__)
-  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
-#elif defined(_WIN32)
-  __mallocedMemory = _aligned_malloc(__size, __align);
-#else
-  if (posix_memalign(&__mallocedMemory, __align, __size))
-    return 0;
-#endif
-
-  return __mallocedMemory;
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
-_mm_free(void *__p)
-{
-#if defined(__MINGW32__)
-  __mingw_aligned_free(__p);
-#elif defined(_WIN32)
-  _aligned_free(__p);
-#else
-  free(__p);
-#endif
-}
-#endif
-
-#endif /* __MM_MALLOC_H */
diff --git a/third_party/intel/clang/mmintrin.h b/third_party/intel/clang/mmintrin.h
deleted file mode 100644
index 4e154e2d8..000000000
--- a/third_party/intel/clang/mmintrin.h
+++ /dev/null
@@ -1,1556 +0,0 @@
-/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __MMINTRIN_H
-#define __MMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
-
-typedef long long __v1di __attribute__((__vector_size__(8)));
-typedef int __v2si __attribute__((__vector_size__(8)));
-typedef short __v4hi __attribute__((__vector_size__(8)));
-typedef char __v8qi __attribute__((__vector_size__(8)));
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
-                 __min_vector_width__(64)))
-
-/// Clears the MMX state by setting the state of the x87 stack registers
-///    to empty.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> EMMS </c> instruction.
-///
-static __inline__ void __attribute__((__always_inline__, __nodebug__,
-                                      __target__("mmx,no-evex512")))
-_mm_empty(void) {
-  __builtin_ia32_emms();
-}
-
-/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
-///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVD </c> instruction.
-///
-/// \param __i
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
-///    parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cvtsi32_si64(int __i)
-{
-    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
-}
-
-/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
-///    signed integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVD </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector.
-/// \returns A 32-bit signed integer value containing the lower 32 bits of the
-///    parameter.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtsi64_si32(__m64 __m)
-{
-    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
-}
-
-/// Casts a 64-bit signed integer value into a 64-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
-///
-/// \param __i
-///    A 64-bit signed integer.
-/// \returns A 64-bit integer vector containing the same bitwise pattern as the
-///    parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cvtsi64_m64(long long __i)
-{
-    return (__m64)__i;
-}
-
-/// Casts a 64-bit integer vector into a 64-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector.
-/// \returns A 64-bit signed integer containing the same bitwise pattern as the
-///    parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvtm64_si64(__m64 __m)
-{
-    return (long long)__m;
-}
-
-/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
-///    vector parameters of [4 x i16] into 8-bit signed integer values, and
-///    constructs a 64-bit integer vector of [8 x i8] as the result.
-///
-///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
-///    less than 0x80 are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
-///    written to the lower 32 bits of the result.
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
-///    written to the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the converted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
-///    vector parameters of [2 x i32] into 16-bit signed integer values, and
-///    constructs a 64-bit integer vector of [4 x i16] as the result.
-///
-///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
-///    values less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
-///    written to the lower 32 bits of the result.
-/// \param __m2
-///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
-///    written to the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [4 x i16] containing the converted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
-///    vector parameters of [4 x i16] into 8-bit unsigned integer values, and
-///    constructs a 64-bit integer vector of [8 x i8] as the result.
-///
-///    Values greater than 0xFF are saturated to 0xFF. Values less than 0 are
-///    saturated to 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
-///    written to the lower 32 bits of the result.
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
-///    written to the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the converted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
-///    and interleaves them into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8]. \n
-///    Bits [39:32] are written to bits [7:0] of the result. \n
-///    Bits [47:40] are written to bits [23:16] of the result. \n
-///    Bits [55:48] are written to bits [39:32] of the result. \n
-///    Bits [63:56] are written to bits [55:48] of the result.
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8].
-///    Bits [39:32] are written to bits [15:8] of the result. \n
-///    Bits [47:40] are written to bits [31:24] of the result. \n
-///    Bits [55:48] are written to bits [47:40] of the result. \n
-///    Bits [63:56] are written to bits [63:56] of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Unpacks the upper 32 bits from two 64-bit integer vectors of
-///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-///    Bits [47:32] are written to bits [15:0] of the result. \n
-///    Bits [63:48] are written to bits [47:32] of the result.
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-///    Bits [47:32] are written to bits [31:16] of the result. \n
-///    Bits [63:48] are written to bits [63:48] of the result.
-/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Unpacks the upper 32 bits from two 64-bit integer vectors of
-///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
-///    the lower 32 bits of the result.
-/// \param __m2
-///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
-///    the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
-///    and interleaves them into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8].
-///    Bits [7:0] are written to bits [7:0] of the result. \n
-///    Bits [15:8] are written to bits [23:16] of the result. \n
-///    Bits [23:16] are written to bits [39:32] of the result. \n
-///    Bits [31:24] are written to bits [55:48] of the result.
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8].
-///    Bits [7:0] are written to bits [15:8] of the result. \n
-///    Bits [15:8] are written to bits [31:24] of the result. \n
-///    Bits [23:16] are written to bits [47:40] of the result. \n
-///    Bits [31:24] are written to bits [63:56] of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Unpacks the lower 32 bits from two 64-bit integer vectors of
-///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-///    Bits [15:0] are written to bits [15:0] of the result. \n
-///    Bits [31:16] are written to bits [47:32] of the result.
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-///    Bits [15:0] are written to bits [31:16] of the result. \n
-///    Bits [31:16] are written to bits [63:48] of the result.
-/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Unpacks the lower 32 bits from two 64-bit integer vectors of
-///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
-///    the lower 32 bits of the result.
-/// \param __m2
-///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
-///    the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Adds each 8-bit integer element of the first 64-bit integer vector
-///    of [8 x i8] to the corresponding 8-bit integer element of the second
-///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
-///    packed into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8].
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
-///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_add_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Adds each 16-bit integer element of the first 64-bit integer vector
-///    of [4 x i16] to the corresponding 16-bit integer element of the second
-///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
-///    packed into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
-///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_add_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Adds each 32-bit integer element of the first 64-bit integer vector
-///    of [2 x i32] to the corresponding 32-bit integer element of the second
-///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
-///    packed into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDD </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [2 x i32].
-/// \param __m2
-///    A 64-bit integer vector of [2 x i32].
-/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
-///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_add_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Adds, with saturation, each 8-bit signed integer element of the first
-///    64-bit integer vector of [8 x i8] to the corresponding 8-bit signed
-///    integer element of the second 64-bit integer vector of [8 x i8].
-///
-///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
-///    less than 0x80 are saturated to 0x80. The results are packed into a
-///    64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8].
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
-///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Adds, with saturation, each 16-bit signed integer element of the first
-///    64-bit integer vector of [4 x i16] to the corresponding 16-bit signed
-///    integer element of the second 64-bit integer vector of [4 x i16].
-///
-///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
-///    less than 0x8000 are saturated to 0x8000. The results are packed into a
-///    64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
-///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Adds, with saturation, each 8-bit unsigned integer element of the first
-///    64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned
-///    integer element of the second 64-bit integer vector of [8 x i8].
-///
-///    Sums greater than 0xFF are saturated to 0xFF. The results are packed
-///    into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8].
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
-///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pu8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Adds, with saturation, each 16-bit unsigned integer element of the first
-///    64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned
-///    integer element of the second 64-bit integer vector of [4 x i16].
-///
-///    Sums greater than 0xFFFF are saturated to 0xFFFF. The results are packed
-///    into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
-///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pu16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Subtracts each 8-bit integer element of the second 64-bit integer
-///    vector of [8 x i8] from the corresponding 8-bit integer element of the
-///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
-///    are packed into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8] containing the minuends.
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
-/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
-///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sub_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Subtracts each 16-bit integer element of the second 64-bit integer
-///    vector of [4 x i16] from the corresponding 16-bit integer element of the
-///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
-///    results are packed into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16] containing the minuends.
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
-/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
-///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sub_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Subtracts each 32-bit integer element of the second 64-bit integer
-///    vector of [2 x i32] from the corresponding 32-bit integer element of the
-///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
-///    results are packed into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [2 x i32] containing the minuends.
-/// \param __m2
-///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
-/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
-///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sub_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Subtracts, with saturation, each 8-bit signed integer element of the second
-///    64-bit integer vector of [8 x i8] from the corresponding 8-bit signed
-///    integer element of the first 64-bit integer vector of [8 x i8].
-///
-///    Positive results greater than 0x7F are saturated to 0x7F. Negative
-///    results less than 0x80 are saturated to 0x80. The results are packed
-///    into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8] containing the minuends.
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
-///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Subtracts, with saturation, each 16-bit signed integer element of the
-///    second 64-bit integer vector of [4 x i16] from the corresponding 16-bit
-///    signed integer element of the first 64-bit integer vector of [4 x i16].
-///
-///    Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative
-///    results less than 0x8000 are saturated to 0x8000. The results are packed
-///    into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16] containing the minuends.
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
-///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Subtracts each 8-bit unsigned integer element of the second 64-bit
-///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
-///    element of the first 64-bit integer vector of [8 x i8].
-///
-///    If an element of the first vector is less than the corresponding element
-///    of the second vector, the result is saturated to 0. The results are
-///    packed into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8] containing the minuends.
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
-///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pu8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Subtracts each 16-bit unsigned integer element of the second 64-bit
-///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
-///    integer element of the first 64-bit integer vector of [4 x i16].
-///
-///    If an element of the first vector is less than the corresponding element
-///    of the second vector, the result is saturated to 0. The results are
-///    packed into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16] containing the minuends.
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
-///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pu16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Multiplies each 16-bit signed integer element of the first 64-bit
-///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
-///    element of the second 64-bit integer vector of [4 x i16] and get four
-///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
-///    The lower 32 bits of these two sums are packed into a 64-bit integer
-///    vector of [2 x i32].
-///
-///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
-///    of both parameters are multiplied, and the sum of both results is written
-///    to bits [31:0] of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
-///    products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_madd_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Multiplies each 16-bit signed integer element of the first 64-bit
-///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
-///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
-///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
-///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Multiplies each 16-bit signed integer element of the first 64-bit
-///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
-///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
-///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
-///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_mullo_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Left-shifts each 16-bit signed integer element of the first
-///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
-///    of bits specified by the second parameter, which is a 64-bit integer. The
-///    lower 16 bits of the results are packed into a 64-bit integer vector of
-///    [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [4 x i16].
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-///    values. If \a __count is greater or equal to 16, the result is set to all
-///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sll_pi16(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
-}
-
-/// Left-shifts each 16-bit signed integer element of a 64-bit integer
-///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
-///    The lower 16 bits of the results are packed into a 64-bit integer vector
-///    of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [4 x i16].
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-///    values. If \a __count is greater or equal to 16, the result is set to all
-///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_slli_pi16(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
-}
-
-/// Left-shifts each 32-bit signed integer element of the first
-///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
-///    of bits specified by the second parameter, which is a 64-bit integer. The
-///    lower 32 bits of the results are packed into a 64-bit integer vector of
-///    [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [2 x i32].
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-///    values. If \a __count is greater or equal to 32, the result is set to all
-///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sll_pi32(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
-}
-
-/// Left-shifts each 32-bit signed integer element of a 64-bit integer
-///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
-///    The lower 32 bits of the results are packed into a 64-bit integer vector
-///    of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [2 x i32].
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-///    values. If \a __count is greater or equal to 32, the result is set to all
-///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_slli_pi32(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
-}
-
-/// Left-shifts the first 64-bit integer parameter by the number of bits
-///    specified by the second 64-bit integer parameter. The lower 64 bits of
-///    result are returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector containing the left-shifted value. If
-///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sll_si64(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
-}
-
-/// Left-shifts the first parameter, which is a 64-bit integer, by the
-///    number of bits specified by the second parameter, which is a 32-bit
-///    integer. The lower 64 bits of result are returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector containing the left-shifted value. If
-///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_slli_si64(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
-}
-
-/// Right-shifts each 16-bit integer element of the first parameter,
-///    which is a 64-bit integer vector of [4 x i16], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer.
-///
-///    High-order bits are filled with the sign bit of the initial value of each
-///    16-bit element. The 16-bit results are packed into a 64-bit integer
-///    vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [4 x i16].
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sra_pi16(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 16-bit integer element of a 64-bit integer vector
-///    of [4 x i16] by the number of bits specified by a 32-bit integer.
-///
-///    High-order bits are filled with the sign bit of the initial value of each
-///    16-bit element. The 16-bit results are packed into a 64-bit integer
-///    vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [4 x i16].
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srai_pi16(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of the first parameter,
-///    which is a 64-bit integer vector of [2 x i32], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer.
-///
-///    High-order bits are filled with the sign bit of the initial value of each
-///    32-bit element. The 32-bit results are packed into a 64-bit integer
-///    vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [2 x i32].
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sra_pi32(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of a 64-bit integer vector
-///    of [2 x i32] by the number of bits specified by a 32-bit integer.
-///
-///    High-order bits are filled with the sign bit of the initial value of each
-///    32-bit element. The 32-bit results are packed into a 64-bit integer
-///    vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [2 x i32].
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srai_pi32(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
-}
-
-/// Right-shifts each 16-bit integer element of the first parameter,
-///    which is a 64-bit integer vector of [4 x i16], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer.
-///
-///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
-///    integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [4 x i16].
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srl_pi16(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 16-bit integer element of a 64-bit integer vector
-///    of [4 x i16] by the number of bits specified by a 32-bit integer.
-///
-///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
-///    integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [4 x i16].
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srli_pi16(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of the first parameter,
-///    which is a 64-bit integer vector of [2 x i32], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer.
-///
-///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
-///    integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [2 x i32].
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srl_pi32(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of a 64-bit integer vector
-///    of [2 x i32] by the number of bits specified by a 32-bit integer.
-///
-///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
-///    integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector of [2 x i32].
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srli_pi32(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
-}
-
-/// Right-shifts the first 64-bit integer parameter by the number of bits
-///    specified by the second 64-bit integer parameter.
-///
-///    High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srl_si64(__m64 __m, __m64 __count)
-{
-    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
-}
-
-/// Right-shifts the first parameter, which is a 64-bit integer, by the
-///    number of bits specified by the second parameter, which is a 32-bit
-///    integer.
-///
-///    High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
-///
-/// \param __m
-///    A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-///    A 32-bit integer value.
-/// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srli_si64(__m64 __m, int __count)
-{
-    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
-}
-
-/// Performs a bitwise AND of two 64-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAND </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector.
-/// \param __m2
-///    A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise AND of both
-///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_and_si64(__m64 __m1, __m64 __m2)
-{
-    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
-}
-
-/// Performs a bitwise NOT of the first 64-bit integer vector, and then
-///    performs a bitwise AND of the intermediate result and the second 64-bit
-///    integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PANDN </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector. The one's complement of this parameter is used
-///    in the bitwise AND.
-/// \param __m2
-///    A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise AND of the second
-///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_andnot_si64(__m64 __m1, __m64 __m2)
-{
-    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
-}
-
-/// Performs a bitwise OR of two 64-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> POR </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector.
-/// \param __m2
-///    A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise OR of both
-///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_or_si64(__m64 __m1, __m64 __m2)
-{
-    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
-}
-
-/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PXOR </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector.
-/// \param __m2
-///    A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
-///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_xor_si64(__m64 __m1, __m64 __m2)
-{
-    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
-}
-
-/// Compares the 8-bit integer elements of two 64-bit integer vectors of
-///    [8 x i8] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector.
-///
-///    Each comparison returns 0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8].
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
-///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Compares the 16-bit integer elements of two 64-bit integer vectors of
-///    [4 x i16] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector.
-///
-///    Each comparison returns 0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
-///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Compares the 32-bit integer elements of two 64-bit integer vectors of
-///    [2 x i32] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector.
-///
-///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [2 x i32].
-/// \param __m2
-///    A 64-bit integer vector of [2 x i32].
-/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
-///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Compares the 8-bit integer elements of two 64-bit integer vectors of
-///    [8 x i8] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector.
-///
-///    Each comparison returns 0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [8 x i8].
-/// \param __m2
-///    A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
-///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Compares the 16-bit integer elements of two 64-bit integer vectors of
-///    [4 x i16] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector.
-///
-///    Each comparison returns 0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [4 x i16].
-/// \param __m2
-///    A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
-///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Compares the 32-bit integer elements of two 64-bit integer vectors of
-///    [2 x i32] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector.
-///
-///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
-///
-/// \param __m1
-///    A 64-bit integer vector of [2 x i32].
-/// \param __m2
-///    A 64-bit integer vector of [2 x i32].
-/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
-///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Constructs a 64-bit integer vector initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PXOR </c> instruction.
-///
-/// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setzero_si64(void)
-{
-    return __extension__ (__m64){ 0LL };
-}
-
-/// Constructs a 64-bit integer vector initialized with the specified
-///    32-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __i1
-///    A 32-bit integer value used to initialize the upper 32 bits of the
-///    result.
-/// \param __i0
-///    A 32-bit integer value used to initialize the lower 32 bits of the
-///    result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set_pi32(int __i1, int __i0)
-{
-    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
-}
-
-/// Constructs a 64-bit integer vector initialized with the specified
-///    16-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __s3
-///    A 16-bit integer value used to initialize bits [63:48] of the result.
-/// \param __s2
-///    A 16-bit integer value used to initialize bits [47:32] of the result.
-/// \param __s1
-///    A 16-bit integer value used to initialize bits [31:16] of the result.
-/// \param __s0
-///    A 16-bit integer value used to initialize bits [15:0] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
-{
-    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
-}
-
-/// Constructs a 64-bit integer vector initialized with the specified
-///    8-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __b7
-///    An 8-bit integer value used to initialize bits [63:56] of the result.
-/// \param __b6
-///    An 8-bit integer value used to initialize bits [55:48] of the result.
-/// \param __b5
-///    An 8-bit integer value used to initialize bits [47:40] of the result.
-/// \param __b4
-///    An 8-bit integer value used to initialize bits [39:32] of the result.
-/// \param __b3
-///    An 8-bit integer value used to initialize bits [31:24] of the result.
-/// \param __b2
-///    An 8-bit integer value used to initialize bits [23:16] of the result.
-/// \param __b1
-///    An 8-bit integer value used to initialize bits [15:8] of the result.
-/// \param __b0
-///    An 8-bit integer value used to initialize bits [7:0] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
-            char __b1, char __b0)
-{
-    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
-                                               __b4, __b5, __b6, __b7);
-}
-
-/// Constructs a 64-bit integer vector of [2 x i32], with each of the
-///    32-bit integer vector elements set to the specified 32-bit integer
-///    value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __i
-///    A 32-bit integer value used to initialize each vector element of the
-///    result.
-/// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set1_pi32(int __i)
-{
-    return _mm_set_pi32(__i, __i);
-}
-
-/// Constructs a 64-bit integer vector of [4 x i16], with each of the
-///    16-bit integer vector elements set to the specified 16-bit integer
-///    value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __w
-///    A 16-bit integer value used to initialize each vector element of the
-///    result.
-/// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set1_pi16(short __w)
-{
-    return _mm_set_pi16(__w, __w, __w, __w);
-}
-
-/// Constructs a 64-bit integer vector of [8 x i8], with each of the
-///    8-bit integer vector elements set to the specified 8-bit integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __b
-///    An 8-bit integer value used to initialize each vector element of the
-///    result.
-/// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set1_pi8(char __b)
-{
-    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
-}
-
-/// Constructs a 64-bit integer vector, initialized in reverse order with
-///    the specified 32-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __i0
-///    A 32-bit integer value used to initialize the lower 32 bits of the
-///    result.
-/// \param __i1
-///    A 32-bit integer value used to initialize the upper 32 bits of the
-///    result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setr_pi32(int __i0, int __i1)
-{
-    return _mm_set_pi32(__i1, __i0);
-}
-
-/// Constructs a 64-bit integer vector, initialized in reverse order with
-///    the specified 16-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __w0
-///    A 16-bit integer value used to initialize bits [15:0] of the result.
-/// \param __w1
-///    A 16-bit integer value used to initialize bits [31:16] of the result.
-/// \param __w2
-///    A 16-bit integer value used to initialize bits [47:32] of the result.
-/// \param __w3
-///    A 16-bit integer value used to initialize bits [63:48] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
-{
-    return _mm_set_pi16(__w3, __w2, __w1, __w0);
-}
-
-/// Constructs a 64-bit integer vector, initialized in reverse order with
-///    the specified 8-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __b0
-///    An 8-bit integer value used to initialize bits [7:0] of the result.
-/// \param __b1
-///    An 8-bit integer value used to initialize bits [15:8] of the result.
-/// \param __b2
-///    An 8-bit integer value used to initialize bits [23:16] of the result.
-/// \param __b3
-///    An 8-bit integer value used to initialize bits [31:24] of the result.
-/// \param __b4
-///    An 8-bit integer value used to initialize bits [39:32] of the result.
-/// \param __b5
-///    An 8-bit integer value used to initialize bits [47:40] of the result.
-/// \param __b6
-///    An 8-bit integer value used to initialize bits [55:48] of the result.
-/// \param __b7
-///    An 8-bit integer value used to initialize bits [63:56] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
-             char __b6, char __b7)
-{
-    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-/* Aliases for compatibility. */
-#define _m_empty _mm_empty
-#define _m_from_int _mm_cvtsi32_si64
-#define _m_from_int64 _mm_cvtsi64_m64
-#define _m_to_int _mm_cvtsi64_si32
-#define _m_to_int64 _mm_cvtm64_si64
-#define _m_packsswb _mm_packs_pi16
-#define _m_packssdw _mm_packs_pi32
-#define _m_packuswb _mm_packs_pu16
-#define _m_punpckhbw _mm_unpackhi_pi8
-#define _m_punpckhwd _mm_unpackhi_pi16
-#define _m_punpckhdq _mm_unpackhi_pi32
-#define _m_punpcklbw _mm_unpacklo_pi8
-#define _m_punpcklwd _mm_unpacklo_pi16
-#define _m_punpckldq _mm_unpacklo_pi32
-#define _m_paddb _mm_add_pi8
-#define _m_paddw _mm_add_pi16
-#define _m_paddd _mm_add_pi32
-#define _m_paddsb _mm_adds_pi8
-#define _m_paddsw _mm_adds_pi16
-#define _m_paddusb _mm_adds_pu8
-#define _m_paddusw _mm_adds_pu16
-#define _m_psubb _mm_sub_pi8
-#define _m_psubw _mm_sub_pi16
-#define _m_psubd _mm_sub_pi32
-#define _m_psubsb _mm_subs_pi8
-#define _m_psubsw _mm_subs_pi16
-#define _m_psubusb _mm_subs_pu8
-#define _m_psubusw _mm_subs_pu16
-#define _m_pmaddwd _mm_madd_pi16
-#define _m_pmulhw _mm_mulhi_pi16
-#define _m_pmullw _mm_mullo_pi16
-#define _m_psllw _mm_sll_pi16
-#define _m_psllwi _mm_slli_pi16
-#define _m_pslld _mm_sll_pi32
-#define _m_pslldi _mm_slli_pi32
-#define _m_psllq _mm_sll_si64
-#define _m_psllqi _mm_slli_si64
-#define _m_psraw _mm_sra_pi16
-#define _m_psrawi _mm_srai_pi16
-#define _m_psrad _mm_sra_pi32
-#define _m_psradi _mm_srai_pi32
-#define _m_psrlw _mm_srl_pi16
-#define _m_psrlwi _mm_srli_pi16
-#define _m_psrld _mm_srl_pi32
-#define _m_psrldi _mm_srli_pi32
-#define _m_psrlq _mm_srl_si64
-#define _m_psrlqi _mm_srli_si64
-#define _m_pand _mm_and_si64
-#define _m_pandn _mm_andnot_si64
-#define _m_por _mm_or_si64
-#define _m_pxor _mm_xor_si64
-#define _m_pcmpeqb _mm_cmpeq_pi8
-#define _m_pcmpeqw _mm_cmpeq_pi16
-#define _m_pcmpeqd _mm_cmpeq_pi32
-#define _m_pcmpgtb _mm_cmpgt_pi8
-#define _m_pcmpgtw _mm_cmpgt_pi16
-#define _m_pcmpgtd _mm_cmpgt_pi32
-
-#endif /* __MMINTRIN_H */
-
diff --git a/third_party/intel/clang/movdirintrin.h b/third_party/intel/clang/movdirintrin.h
deleted file mode 100644
index 30c4d02c8..000000000
--- a/third_party/intel/clang/movdirintrin.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*===------------------------- movdirintrin.h ------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef _MOVDIRINTRIN_H
-#define _MOVDIRINTRIN_H
-
-/* Move doubleword as direct store */
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__,  __target__("movdiri")))
-_directstoreu_u32 (void *__dst, unsigned int  __value)
-{
-  __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
-}
-
-#ifdef __x86_64__
-
-/* Move quadword as direct store */
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__,  __target__("movdiri")))
-_directstoreu_u64 (void *__dst, unsigned long __value)
-{
-  __builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
-}
-
-#endif /* __x86_64__ */
-
-/*
- * movdir64b - Move 64 bytes as direct store.
- * The destination must be 64 byte aligned, and the store is atomic.
- * The source address has no alignment requirement, and the load from
- * the source address is not atomic.
- */
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__,  __target__("movdir64b")))
-_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
-{
-  __builtin_ia32_movdir64b(__dst, __src);
-}
-
-#endif /* _MOVDIRINTRIN_H */
diff --git a/third_party/intel/clang/mwaitxintrin.h b/third_party/intel/clang/mwaitxintrin.h
deleted file mode 100644
index 65f427105..000000000
--- a/third_party/intel/clang/mwaitxintrin.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __MWAITXINTRIN_H
-#define __MWAITXINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
-
-/// Establishes a linear address memory range to be monitored and puts
-///    the processor in the monitor event pending state. Data stored in the
-///    monitored address range causes the processor to exit the pending state.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c MONITORX instruction.
-///
-/// \param __p
-///    The memory range to be monitored. The size of the range is determined by
-///    CPUID function 0000_0005h.
-/// \param __extensions
-///    Optional extensions for the monitoring state.
-/// \param __hints
-///    Optional hints for the monitoring state.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
-{
-  __builtin_ia32_monitorx(__p, __extensions, __hints);
-}
-
-/// Used with the \c MONITORX instruction to wait while the processor is in
-///    the monitor event pending state. Data stored in the monitored address
-///    range, or an interrupt, causes the processor to exit the pending state.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c MWAITX instruction.
-///
-/// \param __extensions
-///    Optional extensions for the monitoring state, which can vary by
-///    processor.
-/// \param __hints
-///    Optional hints for the monitoring state, which can vary by processor.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
-{
-  __builtin_ia32_mwaitx(__extensions, __hints, __clock);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __MWAITXINTRIN_H */
diff --git a/third_party/intel/clang/nmmintrin.h b/third_party/intel/clang/nmmintrin.h
deleted file mode 100644
index d26d58eab..000000000
--- a/third_party/intel/clang/nmmintrin.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __NMMINTRIN_H
-#define __NMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
-   just include it now then.  */
-#include "smmintrin.h"
-#endif /* __NMMINTRIN_H */
diff --git a/third_party/intel/clang/pconfigintrin.h b/third_party/intel/clang/pconfigintrin.h
deleted file mode 100644
index d2014b026..000000000
--- a/third_party/intel/clang/pconfigintrin.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __PCONFIGINTRIN_H
-#define __PCONFIGINTRIN_H
-
-#define __PCONFIG_KEY_PROGRAM 0x00000001
-
-#if __has_extension(gnu_asm)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("pconfig")))
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
-  unsigned int __result;
-  __asm__ ("pconfig"
-           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
-           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
-           : "cc");
-  return __result;
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __has_extension(gnu_asm) */
-
-#endif
diff --git a/third_party/intel/clang/pkuintrin.h b/third_party/intel/clang/pkuintrin.h
deleted file mode 100644
index c62080bec..000000000
--- a/third_party/intel/clang/pkuintrin.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __PKUINTRIN_H
-#define __PKUINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rdpkru_u32(void)
-{
-  return __builtin_ia32_rdpkru();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_wrpkru(unsigned int __val)
-{
-  __builtin_ia32_wrpkru(__val);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/pmmintrin.h b/third_party/intel/clang/pmmintrin.h
deleted file mode 100644
index 6414e9e0c..000000000
--- a/third_party/intel/clang/pmmintrin.h
+++ /dev/null
@@ -1,301 +0,0 @@
-/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __PMMINTRIN_H
-#define __PMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "emmintrin.h"
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("sse3,no-evex512"), __min_vector_width__(128)))
-
-/// Loads data from an unaligned memory location to elements in a 128-bit
-///    vector.
-///
-///    If the address of the data is not 16-byte aligned, the instruction may
-///    read two adjacent aligned blocks of memory to retrieve the requested
-///    data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit integer vector containing integer values.
-/// \returns A 128-bit vector containing the moved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_lddqu_si128(__m128i_u const *__p)
-{
-  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
-}
-
-/// Adds the even-indexed values and subtracts the odd-indexed values of
-///    two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the left source operand.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing the right source operand.
-/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
-///    differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_addsub_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in two
-///    128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The horizontal sums of the values are stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The horizontal sums of the values are stored in the upper bits of the
-///    destination.
-/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
-///    both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_hadd_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in two
-///    128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The horizontal differences between the values are stored in the lower
-///    bits of the destination.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The horizontal differences between the values are stored in the upper
-///    bits of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the horizontal
-///    differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_hsub_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Moves and duplicates odd-indexed values from a 128-bit vector
-///    of [4 x float] to float values stored in a 128-bit vector of
-///    [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. \n
-///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
-///    the destination. \n
-///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
-///    destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
-///    values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movehdup_ps(__m128 __a)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
-}
-
-/// Duplicates even-indexed values from a 128-bit vector of
-///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] \n
-///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
-///    the destination. \n
-///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
-///    destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
-///    values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_moveldup_ps(__m128 __a)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
-}
-
-/// Adds the even-indexed values and subtracts the odd-indexed values of
-///    two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing the left source operand.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing the right source operand.
-/// \returns A 128-bit vector of [2 x double] containing the alternating sums
-///    and differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_addsub_pd(__m128d __a, __m128d __b)
-{
-  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Horizontally adds the pairs of values contained in two 128-bit
-///    vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-///    The horizontal sum of the values is stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-///    The horizontal sum of the values is stored in the upper bits of the
-///    destination.
-/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
-///    both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hadd_pd(__m128d __a, __m128d __b)
-{
-  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Horizontally subtracts the pairs of values contained in two 128-bit
-///    vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-///    The horizontal difference of the values is stored in the lower bits of
-///    the destination.
-/// \param __b
-///    A 128-bit vector of [2 x double] containing one of the source operands.
-///    The horizontal difference of the values is stored in the upper bits of
-///    the destination.
-/// \returns A 128-bit vector of [2 x double] containing the horizontal
-///    differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hsub_pd(__m128d __a, __m128d __b)
-{
-  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Moves and duplicates one double-precision value to double-precision
-///    values stored in a 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_loaddup_pd(double const *dp);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
-///
-/// \param dp
-///    A pointer to a double-precision value to be moved and duplicated.
-/// \returns A 128-bit vector of [2 x double] containing the moved and
-///    duplicated values.
-#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
-
-/// Moves and duplicates the double-precision value in the lower bits of
-///    a 128-bit vector of [2 x double] to double-precision values stored in a
-///    128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
-///    [127:64] and [63:0] of the destination.
-/// \returns A 128-bit vector of [2 x double] containing the moved and
-///    duplicated values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_movedup_pd(__m128d __a)
-{
-  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
-}
-
-/// Establishes a linear address memory range to be monitored and puts
-///    the processor in the monitor event pending state. Data stored in the
-///    monitored address range causes the processor to exit the pending state.
-///
-/// The \c MONITOR instruction can be used in kernel mode, and in other modes
-/// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c MONITOR instruction.
-///
-/// \param __p
-///    The memory range to be monitored. The size of the range is determined by
-///    CPUID function 0000_0005h.
-/// \param __extensions
-///    Optional extensions for the monitoring state.
-/// \param __hints
-///    Optional hints for the monitoring state.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
-{
-  __builtin_ia32_monitor(__p, __extensions, __hints);
-}
-
-/// Used with the \c MONITOR instruction to wait while the processor is in
-///    the monitor event pending state. Data stored in the monitored address
-///    range, or an interrupt, causes the processor to exit the pending state.
-///
-/// The \c MWAIT instruction can be used in kernel mode, and in other modes if
-/// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c MWAIT instruction.
-///
-/// \param __extensions
-///    Optional extensions for the monitoring state, which can vary by
-///    processor.
-/// \param __hints
-///    Optional hints for the monitoring state, which can vary by processor.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_mwait(unsigned __extensions, unsigned __hints)
-{
-  __builtin_ia32_mwait(__extensions, __hints);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __PMMINTRIN_H */
diff --git a/third_party/intel/clang/popcntintrin.h b/third_party/intel/clang/popcntintrin.h
deleted file mode 100644
index 0aa94aecd..000000000
--- a/third_party/intel/clang/popcntintrin.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __POPCNTINTRIN_H
-#define __POPCNTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
-#else
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
-#endif
-
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
-///
-/// \param __A
-///    An unsigned 32-bit integer operand.
-/// \returns A 32-bit integer containing the number of bits with value 1 in the
-///    source operand.
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_popcnt_u32(unsigned int __A)
-{
-  return __builtin_popcount(__A);
-}
-
-#ifdef __x86_64__
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
-///
-/// \param __A
-///    An unsigned 64-bit integer operand.
-/// \returns A 64-bit integer containing the number of bits with value 1 in the
-///    source operand.
-static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_popcnt_u64(unsigned long long __A)
-{
-  return __builtin_popcountll(__A);
-}
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_CONSTEXPR
-
-#endif /* __POPCNTINTRIN_H */
diff --git a/third_party/intel/clang/prfchiintrin.h b/third_party/intel/clang/prfchiintrin.h
deleted file mode 100644
index 36600b25a..000000000
--- a/third_party/intel/clang/prfchiintrin.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*===---- prfchiintrin.h - PREFETCHI intrinsic -----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __PRFCHIINTRIN_H
-#define __PRFCHIINTRIN_H
-
-#ifdef __x86_64__
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("prefetchi")))
-
-/// Loads an instruction sequence containing the specified memory address into
-///    all level cache.
-///
-///    Note that the effect of this intrinsic is dependent on the processor
-///    implementation.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PREFETCHIT0 instruction.
-///
-/// \param __P
-///    A pointer specifying the memory address to be prefetched.
-static __inline__ void __DEFAULT_FN_ATTRS
-_m_prefetchit0(volatile const void *__P) {
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcast-qual"
-  __builtin_ia32_prefetchi((const void *)__P, 3 /* _MM_HINT_T0 */);
-#pragma clang diagnostic pop
-}
-
-/// Loads an instruction sequence containing the specified memory address into
-///    all but the first-level cache.
-///
-///    Note that the effect of this intrinsic is dependent on the processor
-///    implementation.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PREFETCHIT1 instruction.
-///
-/// \param __P
-///    A pointer specifying the memory address to be prefetched.
-static __inline__ void __DEFAULT_FN_ATTRS
-_m_prefetchit1(volatile const void *__P) {
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcast-qual"
-  __builtin_ia32_prefetchi((const void *)__P, 2 /* _MM_HINT_T1 */);
-#pragma clang diagnostic pop
-}
-#endif /* __x86_64__ */
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __PRFCHWINTRIN_H */
diff --git a/third_party/intel/clang/prfchwintrin.h b/third_party/intel/clang/prfchwintrin.h
deleted file mode 100644
index eaea5f3cf..000000000
--- a/third_party/intel/clang/prfchwintrin.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
-#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __PRFCHWINTRIN_H
-#define __PRFCHWINTRIN_H
-
-/// Loads a memory sequence containing the specified memory address into
-///    all data cache levels.
-///
-///    The cache-coherency state is set to exclusive. Data can be read from
-///    and written to the cache line without additional delay.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
-///
-/// \param __P
-///    A pointer specifying the memory address to be prefetched.
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
-_m_prefetch(void *__P)
-{
-  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
-}
-
-/// Loads a memory sequence containing the specified memory address into
-///    the L1 data cache and sets the cache-coherency state to modified.
-///
-///    This provides a hint to the processor that the cache line will be
-///    modified. It is intended for use when the cache line will be written to
-///    shortly after the prefetch is performed.
-///
-///    Note that the effect of this intrinsic is dependent on the processor
-///    implementation.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PREFETCHW instruction.
-///
-/// \param __P
-///    A pointer specifying the memory address to be prefetched.
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
-_m_prefetchw(volatile const void *__P)
-{
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcast-qual"
-  __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
-#pragma clang diagnostic pop
-}
-
-#endif /* __PRFCHWINTRIN_H */
diff --git a/third_party/intel/clang/ptwriteintrin.h b/third_party/intel/clang/ptwriteintrin.h
deleted file mode 100644
index 0a04f7c1d..000000000
--- a/third_party/intel/clang/ptwriteintrin.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __PTWRITEINTRIN_H
-#define __PTWRITEINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("ptwrite")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_ptwrite32(unsigned int __value) {
-  __builtin_ia32_ptwrite32(__value);
-}
-
-#ifdef __x86_64__
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_ptwrite64(unsigned long long __value) {
-  __builtin_ia32_ptwrite64(__value);
-}
-
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __PTWRITEINTRIN_H */
diff --git a/third_party/intel/clang/raointintrin.h b/third_party/intel/clang/raointintrin.h
deleted file mode 100644
index d3290eb62..000000000
--- a/third_party/intel/clang/raointintrin.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/*===----------------------- raointintrin.h - RAOINT ------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86GPRINTRIN_H
-#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
-#endif // __X86GPRINTRIN_H
-
-#ifndef __RAOINTINTRIN_H
-#define __RAOINTINTRIN_H
-
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("raoint")))
-
-/// Atomically add a 32-bit value at memory operand \a __A and a 32-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AADD instruction.
-///
-/// \param __A
-///    A pointer to a 32-bit memory location.
-/// \param __B
-///    A 32-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _aadd_i32(int *__A, int __B) {
-  __builtin_ia32_aadd32((int *)__A, __B);
-}
-
-/// Atomically and a 32-bit value at memory operand \a __A and a 32-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AAND instruction.
-///
-/// \param __A
-///    A pointer to a 32-bit memory location.
-/// \param __B
-///    A 32-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _aand_i32(int *__A, int __B) {
-  __builtin_ia32_aand32((int *)__A, __B);
-}
-
-/// Atomically or a 32-bit value at memory operand \a __A and a 32-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AOR instruction.
-///
-/// \param __A
-///    A pointer to a 32-bit memory location.
-/// \param __B
-///    A 32-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _aor_i32(int *__A, int __B) {
-  __builtin_ia32_aor32((int *)__A, __B);
-}
-
-/// Atomically xor a 32-bit value at memory operand \a __A and a 32-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AXOR instruction.
-///
-/// \param __A
-///    A pointer to a 32-bit memory location.
-/// \param __B
-///    A 32-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _axor_i32(int *__A, int __B) {
-  __builtin_ia32_axor32((int *)__A, __B);
-}
-
-#ifdef __x86_64__
-/// Atomically add a 64-bit value at memory operand \a __A and a 64-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AADD instruction.
-///
-/// \param __A
-///    A pointer to a 64-bit memory location.
-/// \param __B
-///    A 64-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _aadd_i64(long long *__A,
-                                                    long long __B) {
-  __builtin_ia32_aadd64((long long *)__A, __B);
-}
-
-/// Atomically and a 64-bit value at memory operand \a __A and a 64-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AAND instruction.
-///
-/// \param __A
-///    A pointer to a 64-bit memory location.
-/// \param __B
-///    A 64-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _aand_i64(long long *__A,
-                                                    long long __B) {
-  __builtin_ia32_aand64((long long *)__A, __B);
-}
-
-/// Atomically or a 64-bit value at memory operand \a __A and a 64-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AOR instruction.
-///
-/// \param __A
-///    A pointer to a 64-bit memory location.
-/// \param __B
-///    A 64-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _aor_i64(long long *__A,
-                                                   long long __B) {
-  __builtin_ia32_aor64((long long *)__A, __B);
-}
-
-/// Atomically xor a 64-bit value at memory operand \a __A and a 64-bit \a __B,
-///    and store the result to the same memory location.
-///
-///    This intrinsic should be used for contention or weak ordering. It may
-///    result in bad performance for hot data used by single thread only.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c AXOR instruction.
-///
-/// \param __A
-///    A pointer to a 64-bit memory location.
-/// \param __B
-///    A 64-bit integer value.
-///
-/// \code{.operation}
-/// MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0]
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS _axor_i64(long long *__A,
-                                                    long long __B) {
-  __builtin_ia32_axor64((long long *)__A, __B);
-}
-#endif // __x86_64__
-
-#undef __DEFAULT_FN_ATTRS
-#endif // __RAOINTINTRIN_H
diff --git a/third_party/intel/clang/rdpruintrin.h b/third_party/intel/clang/rdpruintrin.h
deleted file mode 100644
index 89732bb8b..000000000
--- a/third_party/intel/clang/rdpruintrin.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*===---- rdpruintrin.h - RDPRU intrinsics ---------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H
-#error "Never use <rdpruintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __RDPRUINTRIN_H
-#define __RDPRUINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("rdpru")))
-
-
-/// Reads the content of a processor register.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> RDPRU </c> instruction.
-///
-/// \param reg_id
-///    A processor register identifier.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__rdpru (int reg_id)
-{
-  return __builtin_ia32_rdpru(reg_id);
-}
-
-#define __RDPRU_MPERF 0
-#define __RDPRU_APERF 1
-
-/// Reads the content of processor register MPERF.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
-/// register MPERF.
-#define __mperf() __builtin_ia32_rdpru(__RDPRU_MPERF)
-
-/// Reads the content of processor register APERF.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
-/// register APERF.
-#define __aperf() __builtin_ia32_rdpru(__RDPRU_APERF)
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __RDPRUINTRIN_H */
diff --git a/third_party/intel/clang/rdseedintrin.h b/third_party/intel/clang/rdseedintrin.h
deleted file mode 100644
index 8a4fe0930..000000000
--- a/third_party/intel/clang/rdseedintrin.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <rdseedintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __RDSEEDINTRIN_H
-#define __RDSEEDINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
-
-/// Stores a hardware-generated 16-bit random value in the memory at \a __p.
-///
-///    The random number generator complies with NIST SP800-90B and SP800-90C.
-///
-/// \code{.operation}
-/// IF HW_NRND_GEN.ready == 1
-///   Store16(__p, HW_NRND_GEN.data)
-///   result := 1
-/// ELSE
-///   Store16(__p, 0)
-///   result := 0
-/// END
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c RDSEED instruction.
-///
-/// \param __p
-///    Pointer to memory for storing the 16-bit random number.
-/// \returns 1 if a random number was generated, 0 if not.
-static __inline__ int __DEFAULT_FN_ATTRS
-_rdseed16_step(unsigned short *__p)
-{
-  return (int) __builtin_ia32_rdseed16_step(__p);
-}
-
-/// Stores a hardware-generated 32-bit random value in the memory at \a __p.
-///
-///    The random number generator complies with NIST SP800-90B and SP800-90C.
-///
-/// \code{.operation}
-/// IF HW_NRND_GEN.ready == 1
-///   Store32(__p, HW_NRND_GEN.data)
-///   result := 1
-/// ELSE
-///   Store32(__p, 0)
-///   result := 0
-/// END
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c RDSEED instruction.
-///
-/// \param __p
-///    Pointer to memory for storing the 32-bit random number.
-/// \returns 1 if a random number was generated, 0 if not.
-static __inline__ int __DEFAULT_FN_ATTRS
-_rdseed32_step(unsigned int *__p)
-{
-  return (int) __builtin_ia32_rdseed32_step(__p);
-}
-
-#ifdef __x86_64__
-/// Stores a hardware-generated 64-bit random value in the memory at \a __p.
-///
-///    The random number generator complies with NIST SP800-90B and SP800-90C.
-///
-/// \code{.operation}
-/// IF HW_NRND_GEN.ready == 1
-///   Store64(__p, HW_NRND_GEN.data)
-///   result := 1
-/// ELSE
-///   Store64(__p, 0)
-///   result := 0
-/// END
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c RDSEED instruction.
-///
-/// \param __p
-///    Pointer to memory for storing the 64-bit random number.
-/// \returns 1 if a random number was generated, 0 if not.
-static __inline__ int __DEFAULT_FN_ATTRS
-_rdseed64_step(unsigned long long *__p)
-{
-  return (int) __builtin_ia32_rdseed64_step(__p);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __RDSEEDINTRIN_H */
diff --git a/third_party/intel/clang/rtmintrin.h b/third_party/intel/clang/rtmintrin.h
deleted file mode 100644
index a3ec81e3f..000000000
--- a/third_party/intel/clang/rtmintrin.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __RTMINTRIN_H
-#define __RTMINTRIN_H
-
-#define _XBEGIN_STARTED   (~0u)
-#define _XABORT_EXPLICIT  (1 << 0)
-#define _XABORT_RETRY     (1 << 1)
-#define _XABORT_CONFLICT  (1 << 2)
-#define _XABORT_CAPACITY  (1 << 3)
-#define _XABORT_DEBUG     (1 << 4)
-#define _XABORT_NESTED    (1 << 5)
-#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_xbegin(void)
-{
-  return (unsigned int)__builtin_ia32_xbegin();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xend(void)
-{
-  __builtin_ia32_xend();
-}
-
-#define _xabort(imm) __builtin_ia32_xabort((imm))
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __RTMINTRIN_H */
diff --git a/third_party/intel/clang/serializeintrin.h b/third_party/intel/clang/serializeintrin.h
deleted file mode 100644
index b774e5a24..000000000
--- a/third_party/intel/clang/serializeintrin.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*===--------------- serializeintrin.h - serialize intrinsics --------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <serializeintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __SERIALIZEINTRIN_H
-#define __SERIALIZEINTRIN_H
-
-/// Serialize instruction fetch and execution.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> SERIALIZE </c> instruction.
-///
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__, __target__("serialize")))
-_serialize (void)
-{
-  __builtin_ia32_serialize ();
-}
-
-#endif /* __SERIALIZEINTRIN_H */
diff --git a/third_party/intel/clang/sgxintrin.h b/third_party/intel/clang/sgxintrin.h
deleted file mode 100644
index 303a21f6b..000000000
--- a/third_party/intel/clang/sgxintrin.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __SGXINTRIN_H
-#define __SGXINTRIN_H
-
-#if __has_extension(gnu_asm)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("sgx")))
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
-  unsigned int __result;
-  __asm__ ("enclu"
-           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
-           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
-           : "cc");
-  return __result;
-}
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
-  unsigned int __result;
-  __asm__ ("encls"
-           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
-           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
-           : "cc");
-  return __result;
-}
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
-  unsigned int __result;
-  __asm__ ("enclv"
-           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
-           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
-           : "cc");
-  return __result;
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __has_extension(gnu_asm) */
-
-#endif
diff --git a/third_party/intel/clang/sha512intrin.h b/third_party/intel/clang/sha512intrin.h
deleted file mode 100644
index 065ef5dac..000000000
--- a/third_party/intel/clang/sha512intrin.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/*===--------------- sha512intrin.h - SHA512 intrinsics -----------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <sha512intrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __SHA512INTRIN_H
-#define __SHA512INTRIN_H
-
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("sha512"),         \
-                 __min_vector_width__(256)))
-
-/// This intrinisc is one of the two SHA512 message scheduling instructions.
-///    The intrinsic performs an intermediate calculation for the next four
-///    SHA512 message qwords. The calculated results are stored in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_sha512msg1_epi64(__m256i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSHA512MSG1 instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x long long].
-/// \param __B
-///    A 128-bit vector of [2 x long long].
-/// \returns
-///    A 256-bit vector of [4 x long long].
-///
-/// \code{.operation}
-/// DEFINE ROR64(qword, n) {
-/// 	count := n % 64
-/// 	dest := (qword >> count) | (qword << (64 - count))
-/// 	RETURN dest
-/// }
-/// DEFINE SHR64(qword, n) {
-/// 	RETURN qword >> n
-/// }
-/// DEFINE s0(qword):
-/// 	RETURN ROR64(qword,1) ^ ROR64(qword, 8) ^ SHR64(qword, 7)
-/// }
-/// W[4] := __B.qword[0]
-/// W[3] := __A.qword[3]
-/// W[2] := __A.qword[2]
-/// W[1] := __A.qword[1]
-/// W[0] := __A.qword[0]
-/// dst.qword[3] := W[3] + s0(W[4])
-/// dst.qword[2] := W[2] + s0(W[3])
-/// dst.qword[1] := W[1] + s0(W[2])
-/// dst.qword[0] := W[0] + s0(W[1])
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sha512msg1_epi64(__m256i __A, __m128i __B) {
-  return (__m256i)__builtin_ia32_vsha512msg1((__v4du)__A, (__v2du)__B);
-}
-
-/// This intrinisc is one of the two SHA512 message scheduling instructions.
-///    The intrinsic performs the final calculation for the next four SHA512
-///    message qwords. The calculated results are stored in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_sha512msg2_epi64(__m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSHA512MSG2 instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x long long].
-/// \param __B
-///    A 256-bit vector of [4 x long long].
-/// \returns
-///    A 256-bit vector of [4 x long long].
-///
-/// \code{.operation}
-/// DEFINE ROR64(qword, n) {
-/// 	count := n % 64
-/// 	dest := (qword >> count) | (qword << (64 - count))
-/// 	RETURN dest
-/// }
-/// DEFINE SHR64(qword, n) {
-/// 	RETURN qword >> n
-/// }
-/// DEFINE s1(qword) {
-/// 	RETURN ROR64(qword,19) ^ ROR64(qword, 61) ^ SHR64(qword, 6)
-/// }
-/// W[14] := __B.qword[2]
-/// W[15] := __B.qword[3]
-/// W[16] := __A.qword[0] + s1(W[14])
-/// W[17] := __A.qword[1] + s1(W[15])
-/// W[18] := __A.qword[2] + s1(W[16])
-/// W[19] := __A.qword[3] + s1(W[17])
-/// dst.qword[3] := W[19]
-/// dst.qword[2] := W[18]
-/// dst.qword[1] := W[17]
-/// dst.qword[0] := W[16]
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sha512msg2_epi64(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vsha512msg2((__v4du)__A, (__v4du)__B);
-}
-
-/// This intrinisc performs two rounds of SHA512 operation using initial SHA512
-///    state (C,D,G,H) from \a __A, an initial SHA512 state (A,B,E,F) from
-///    \a __A, and a pre-computed sum of the next two round message qwords and
-///    the corresponding round constants from \a __C (only the two lower qwords
-///    of the third operand). The updated SHA512 state (A,B,E,F) is written to
-///    \a __A, and \a __A can be used as the updated state (C,D,G,H) in later
-///    rounds.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_sha512rnds2_epi64(__m256i __A, __m256i __B, __m128i __C)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSHA512RNDS2 instruction.
-///
-/// \param __A
-///    A 256-bit vector of [4 x long long].
-/// \param __B
-///    A 256-bit vector of [4 x long long].
-/// \param __C
-///    A 128-bit vector of [2 x long long].
-/// \returns
-///    A 256-bit vector of [4 x long long].
-///
-/// \code{.operation}
-/// DEFINE ROR64(qword, n) {
-/// 	count := n % 64
-/// 	dest := (qword >> count) | (qword << (64 - count))
-/// 	RETURN dest
-/// }
-/// DEFINE SHR64(qword, n) {
-/// 	RETURN qword >> n
-/// }
-/// DEFINE cap_sigma0(qword) {
-/// 	RETURN ROR64(qword,28) ^ ROR64(qword, 34) ^ ROR64(qword, 39)
-/// }
-/// DEFINE cap_sigma1(qword) {
-/// 	RETURN ROR64(qword,14) ^ ROR64(qword, 18) ^ ROR64(qword, 41)
-/// }
-/// DEFINE MAJ(a,b,c) {
-/// 	RETURN (a & b) ^ (a & c) ^ (b & c)
-/// }
-/// DEFINE CH(e,f,g) {
-/// 	RETURN (e & f) ^ (g & ~e)
-/// }
-/// A[0] := __B.qword[3]
-/// B[0] := __B.qword[2]
-/// C[0] := __C.qword[3]
-/// D[0] := __C.qword[2]
-/// E[0] := __B.qword[1]
-/// F[0] := __B.qword[0]
-/// G[0] := __C.qword[1]
-/// H[0] := __C.qword[0]
-/// WK[0]:= __A.qword[0]
-/// WK[1]:= __A.qword[1]
-/// FOR i := 0 to 1:
-/// 	A[i+1] := CH(E[i], F[i], G[i]) +
-/// 	cap_sigma1(E[i]) + WK[i] + H[i] +
-/// 	MAJ(A[i], B[i], C[i]) +
-/// 	cap_sigma0(A[i])
-/// 	B[i+1] := A[i]
-/// 	C[i+1] := B[i]
-/// 	D[i+1] := C[i]
-/// 	E[i+1] := CH(E[i], F[i], G[i]) +
-/// 	cap_sigma1(E[i]) + WK[i] + H[i] + D[i]
-/// 	F[i+1] := E[i]
-/// 	G[i+1] := F[i]
-/// 	H[i+1] := G[i]
-/// ENDFOR
-/// dst.qword[3] := A[2]
-/// dst.qword[2] := B[2]
-/// dst.qword[1] := E[2]
-/// dst.qword[0] := F[2]
-/// dst[MAX:256] := 0
-/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sha512rnds2_epi64(__m256i __A, __m256i __B, __m128i __C) {
-  return (__m256i)__builtin_ia32_vsha512rnds2((__v4du)__A, (__v4du)__B,
-                                              (__v2du)__C);
-}
-
-#undef __DEFAULT_FN_ATTRS256
-
-#endif // __SHA512INTRIN_H
diff --git a/third_party/intel/clang/shaintrin.h b/third_party/intel/clang/shaintrin.h
deleted file mode 100644
index 232e1fa29..000000000
--- a/third_party/intel/clang/shaintrin.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __SHAINTRIN_H
-#define __SHAINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128)))
-
-/// Performs four iterations of the inner loop of the SHA-1 message digest
-///    algorithm using the starting SHA-1 state (A, B, C, D) from the 128-bit
-///    vector of [4 x i32] in \a V1 and the next four 32-bit elements of the
-///    message from the 128-bit vector of [4 x i32] in \a V2. Note that the
-///    SHA-1 state variable E must have already been added to \a V2
-///    (\c _mm_sha1nexte_epu32() can perform this step). Returns the updated
-///    SHA-1 state (A, B, C, D) as a 128-bit vector of [4 x i32].
-///
-///    The SHA-1 algorithm has an inner loop of 80 iterations, twenty each
-///    with a different combining function and rounding constant. This
-///    intrinsic performs four iterations using a combining function and
-///    rounding constant selected by \a M[1:0].
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_sha1rnds4_epu32(__m128i V1, __m128i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c SHA1RNDS4 instruction.
-///
-/// \param V1
-///    A 128-bit vector of [4 x i32] containing the initial SHA-1 state.
-/// \param V2
-///    A 128-bit vector of [4 x i32] containing the next four elements of
-///    the message, plus SHA-1 state variable E.
-/// \param M
-///    An immediate value where bits [1:0] select among four possible
-///    combining functions and rounding constants (not specified here).
-/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1 state.
-#define _mm_sha1rnds4_epu32(V1, V2, M) \
-  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
-
-/// Calculates the SHA-1 state variable E from the SHA-1 state variables in
-///    the 128-bit vector of [4 x i32] in \a __X, adds that to the next set of
-///    four message elements in the 128-bit vector of [4 x i32] in \a __Y, and
-///    returns the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SHA1NEXTE instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] containing the current SHA-1 state.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing the next four elements of the
-///    message.
-/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1
-///    values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
-}
-
-/// Performs an intermediate calculation for deriving the next four SHA-1
-///    message elements using previous message elements from the 128-bit
-///    vectors of [4 x i32] in \a __X and \a __Y, and returns the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SHA1MSG1 instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] containing previous message elements.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing previous message elements.
-/// \returns A 128-bit vector of [4 x i32] containing the derived SHA-1
-///    elements.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
-}
-
-/// Performs the final calculation for deriving the next four SHA-1 message
-///    elements using previous message elements from the 128-bit vectors of
-///    [4 x i32] in \a __X and \a __Y, and returns the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SHA1MSG2 instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] containing an intermediate result.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing previous message values.
-/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1
-///    values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
-}
-
-/// Performs two rounds of SHA-256 operation using the following inputs: a
-///    starting SHA-256 state (C, D, G, H) from the 128-bit vector of
-///    [4 x i32] in \a __X; a starting SHA-256 state (A, B, E, F) from the
-///    128-bit vector of [4 x i32] in \a __Y; and a pre-computed sum of the
-///    next two message elements (unsigned 32-bit integers) and corresponding
-///    rounding constants from the 128-bit vector of [4 x i32] in \a __Z.
-///    Returns the updated SHA-256 state (A, B, E, F) as a 128-bit vector of
-///    [4 x i32].
-///
-///    The SHA-256 algorithm has a core loop of 64 iterations. This intrinsic
-///    performs two of those iterations.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SHA256RNDS2 instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] containing part of the initial SHA-256
-///    state.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing part of the initial SHA-256
-///    state.
-/// \param __Z
-///    A 128-bit vector of [4 x i32] containing additional input to the
-///    SHA-256 operation.
-/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1 state.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
-}
-
-/// Performs an intermediate calculation for deriving the next four SHA-256
-///    message elements using previous message elements from the 128-bit
-///    vectors of [4 x i32] in \a __X and \a __Y, and returns the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SHA256MSG1 instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] containing previous message elements.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing previous message elements.
-/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-256
-///    values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
-}
-
-/// Performs the final calculation for deriving the next four SHA-256 message
-///    elements using previous message elements from the 128-bit vectors of
-///    [4 x i32] in \a __X and \a __Y, and returns the result.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SHA256MSG2 instruction.
-///
-/// \param __X
-///    A 128-bit vector of [4 x i32] containing an intermediate result.
-/// \param __Y
-///    A 128-bit vector of [4 x i32] containing previous message values.
-/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-256
-///    values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __SHAINTRIN_H */
diff --git a/third_party/intel/clang/sm3intrin.h b/third_party/intel/clang/sm3intrin.h
deleted file mode 100644
index 8a3d8bc9e..000000000
--- a/third_party/intel/clang/sm3intrin.h
+++ /dev/null
@@ -1,238 +0,0 @@
-/*===-------------------- sm3intrin.h - SM3 intrinsics ---------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <sm3intrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __SM3INTRIN_H
-#define __SM3INTRIN_H
-
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("sm3"),            \
-                 __min_vector_width__(128)))
-
-/// This intrinisc is one of the two SM3 message scheduling intrinsics. The
-///    intrinsic performs an initial calculation for the next four SM3 message
-///    words. The calculated results are stored in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_sm3msg1_epi32(__m128i __A, __m128i __B, __m128i __C)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSM3MSG1 instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x int].
-/// \param __B
-///    A 128-bit vector of [4 x int].
-/// \param __C
-///    A 128-bit vector of [4 x int].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// DEFINE ROL32(dword, n) {
-/// 	count := n % 32
-/// 	dest := (dword << count) | (dword >> (32 - count))
-/// 	RETURN dest
-/// }
-/// DEFINE P1(x) {
-/// 	RETURN x ^ ROL32(x, 15) ^ ROL32(x, 23)
-/// }
-/// W[0] := __C.dword[0]
-/// W[1] := __C.dword[1]
-/// W[2] := __C.dword[2]
-/// W[3] := __C.dword[3]
-/// W[7] := __A.dword[0]
-/// W[8] := __A.dword[1]
-/// W[9] := __A.dword[2]
-/// W[10] := __A.dword[3]
-/// W[13] := __B.dword[0]
-/// W[14] := __B.dword[1]
-/// W[15] := __B.dword[2]
-/// TMP0 := W[7] ^ W[0] ^ ROL32(W[13], 15)
-/// TMP1 := W[8] ^ W[1] ^ ROL32(W[14], 15)
-/// TMP2 := W[9] ^ W[2] ^ ROL32(W[15], 15)
-/// TMP3 := W[10] ^ W[3]
-/// dst.dword[0] := P1(TMP0)
-/// dst.dword[1] := P1(TMP1)
-/// dst.dword[2] := P1(TMP2)
-/// dst.dword[3] := P1(TMP3)
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sm3msg1_epi32(__m128i __A,
-                                                                  __m128i __B,
-                                                                  __m128i __C) {
-  return (__m128i)__builtin_ia32_vsm3msg1((__v4su)__A, (__v4su)__B,
-                                          (__v4su)__C);
-}
-
-/// This intrinisc is one of the two SM3 message scheduling intrinsics. The
-///    intrinsic performs the final calculation for the next four SM3 message
-///    words. The calculated results are stored in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_sm3msg2_epi32(__m128i __A, __m128i __B, __m128i __C)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSM3MSG2 instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x int].
-/// \param __B
-///    A 128-bit vector of [4 x int].
-/// \param __C
-///    A 128-bit vector of [4 x int].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// DEFINE ROL32(dword, n) {
-/// 	count := n % 32
-/// 	dest := (dword << count) | (dword >> (32-count))
-/// 	RETURN dest
-/// }
-/// WTMP[0] := __A.dword[0]
-/// WTMP[1] := __A.dword[1]
-/// WTMP[2] := __A.dword[2]
-/// WTMP[3] := __A.dword[3]
-/// W[3] := __B.dword[0]
-/// W[4] := __B.dword[1]
-/// W[5] := __B.dword[2]
-/// W[6] := __B.dword[3]
-/// W[10] := __C.dword[0]
-/// W[11] := __C.dword[1]
-/// W[12] := __C.dword[2]
-/// W[13] := __C.dword[3]
-/// W[16] := ROL32(W[3], 7) ^ W[10] ^ WTMP[0]
-/// W[17] := ROL32(W[4], 7) ^ W[11] ^ WTMP[1]
-/// W[18] := ROL32(W[5], 7) ^ W[12] ^ WTMP[2]
-/// W[19] := ROL32(W[6], 7) ^ W[13] ^ WTMP[3]
-/// W[19] := W[19] ^ ROL32(W[16], 6) ^ ROL32(W[16], 15) ^ ROL32(W[16], 30)
-/// dst.dword[0] := W[16]
-/// dst.dword[1] := W[17]
-/// dst.dword[2] := W[18]
-/// dst.dword[3] := W[19]
-/// dst[MAX:128] := 0
-/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sm3msg2_epi32(__m128i __A,
-                                                                  __m128i __B,
-                                                                  __m128i __C) {
-  return (__m128i)__builtin_ia32_vsm3msg2((__v4su)__A, (__v4su)__B,
-                                          (__v4su)__C);
-}
-
-/// This intrinsic performs two rounds of SM3 operation using initial SM3 state
-///    (C, D, G, H) from \a __A, an initial SM3 states (A, B, E, F)
-///    from \a __B and a pre-computed words from the \a __C. \a __A with
-///    initial SM3 state of (C, D, G, H) assumes input of non-rotated left
-///    variables from previous state. The updated SM3 state (A, B, E, F) is
-///    written to \a __A. The \a imm8 should contain the even round number
-///    for the first of the two rounds computed by this instruction. The
-///    computation masks the \a imm8 value by AND’ing it with 0x3E so that only
-///    even round numbers from 0 through 62 are used for this operation. The
-///    calculated results are stored in \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_sm3rnds2_epi32(__m128i __A, __m128i __B, __m128i __C, const int
-/// imm8) \endcode
-///
-/// This intrinsic corresponds to the \c VSM3RNDS2 instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x int].
-/// \param __B
-///    A 128-bit vector of [4 x int].
-/// \param __C
-///    A 128-bit vector of [4 x int].
-/// \param imm8
-///    A 8-bit constant integer.
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// DEFINE ROL32(dword, n) {
-/// 	count := n % 32
-/// 	dest := (dword << count) | (dword >> (32-count))
-/// 	RETURN dest
-/// }
-/// DEFINE P0(dword) {
-/// 	RETURN dword ^ ROL32(dword, 9) ^ ROL32(dword, 17)
-/// }
-/// DEFINE FF(x,y,z, round){
-/// 	IF round < 16
-/// 		RETURN (x ^ y ^ z)
-/// 	ELSE
-/// 		RETURN (x & y) | (x & z) | (y & z)
-/// 	FI
-/// }
-/// DEFINE GG(x, y, z, round){
-///   IF round < 16
-///   	RETURN (x ^ y ^ z)
-///   ELSE
-///   	RETURN (x & y) | (~x & z)
-///   FI
-/// }
-/// A[0] := __B.dword[3]
-/// B[0] := __B.dword[2]
-/// C[0] := __A.dword[3]
-/// D[0] := __A.dword[2]
-/// E[0] := __B.dword[1]
-/// F[0] := __B.dword[0]
-/// G[0] := __A.dword[1]
-/// H[0] := __A.dword[0]
-/// W[0] := __C.dword[0]
-/// W[1] := __C.dword[1]
-/// W[4] := __C.dword[2]
-/// W[5] := __C.dword[3]
-/// C[0] := ROL32(C[0], 9)
-/// D[0] := ROL32(D[0], 9)
-/// G[0] := ROL32(G[0], 19)
-/// H[0] := ROL32(H[0], 19)
-/// ROUND := __D & 0x3E
-/// IF ROUND < 16
-/// 	CONST := 0x79CC4519
-/// ELSE
-/// 	CONST := 0x7A879D8A
-/// FI
-/// CONST := ROL32(CONST,ROUND)
-/// FOR i:= 0 to 1
-/// 	S1 := ROL32((ROL32(A[i], 12) + E[i] + CONST), 7)
-/// 	S2 := S1 ^ ROL32(A[i], 12)
-/// 	T1 := FF(A[i], B[i], C[i], ROUND) + D[i] + S2 + (W[i] ^ W[i+4])
-/// 	T2 := GG(E[i], F[i], G[i], ROUND) + H[i] + S1 + W[i]
-/// 	D[i+1] := C[i]
-/// 	C[i+1] := ROL32(B[i],9)
-/// 	B[i+1] := A[i]
-/// 	A[i+1] := T1
-/// 	H[i+1] := G[i]
-/// 	G[i+1] := ROL32(F[i], 19)
-/// 	F[i+1] := E[i]
-/// 	E[i+1] := P0(T2)
-/// 	CONST := ROL32(CONST, 1)
-/// ENDFOR
-/// dst.dword[3] := A[2]
-/// dst.dword[2] := B[2]
-/// dst.dword[1] := E[2]
-/// dst.dword[0] := F[2]
-/// dst[MAX:128] := 0
-/// \endcode
-#define _mm_sm3rnds2_epi32(A, B, C, D)                                         \
-  (__m128i) __builtin_ia32_vsm3rnds2((__v4su)A, (__v4su)B, (__v4su)C, (int)D)
-
-#undef __DEFAULT_FN_ATTRS128
-
-#endif // __SM3INTRIN_H
diff --git a/third_party/intel/clang/sm4intrin.h b/third_party/intel/clang/sm4intrin.h
deleted file mode 100644
index 47aeec46a..000000000
--- a/third_party/intel/clang/sm4intrin.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __SM4INTRIN_H
-#define __SM4INTRIN_H
-
-/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
-///    operates on independent 128-bit lanes. The calculated results are
-///    stored in \a dst.
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x int].
-/// \param __B
-///    A 128-bit vector of [4 x int].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// DEFINE ROL32(dword, n) {
-/// 	count := n % 32
-/// 	dest := (dword << count) | (dword >> (32-count))
-/// 	RETURN dest
-/// }
-/// DEFINE SBOX_BYTE(dword, i) {
-/// 	RETURN sbox[dword.byte[i]]
-/// }
-/// DEFINE lower_t(dword) {
-/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
-/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
-/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
-/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
-/// 	RETURN tmp
-/// }
-/// DEFINE L_KEY(dword) {
-/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
-/// }
-/// DEFINE T_KEY(dword) {
-/// 	RETURN L_KEY(lower_t(dword))
-/// }
-/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
-/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
-/// }
-/// FOR i:= 0 to 0
-/// 	P[0] := __B.xmm[i].dword[0]
-/// 	P[1] := __B.xmm[i].dword[1]
-/// 	P[2] := __B.xmm[i].dword[2]
-/// 	P[3] := __B.xmm[i].dword[3]
-/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
-/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
-/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
-/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
-/// 	DEST.xmm[i].dword[0] := C[0]
-/// 	DEST.xmm[i].dword[1] := C[1]
-/// 	DEST.xmm[i].dword[2] := C[2]
-/// 	DEST.xmm[i].dword[3] := C[3]
-/// ENDFOR
-/// DEST[MAX:128] := 0
-/// \endcode
-#define _mm_sm4key4_epi32(A, B)                                                \
-  (__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B)
-
-/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
-///    operates on independent 128-bit lanes. The calculated results are
-///    stored in \a dst.
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x int].
-/// \param __B
-///    A 256-bit vector of [8 x int].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// DEFINE ROL32(dword, n) {
-/// 	count := n % 32
-/// 	dest := (dword << count) | (dword >> (32-count))
-/// 	RETURN dest
-/// }
-/// DEFINE SBOX_BYTE(dword, i) {
-/// 	RETURN sbox[dword.byte[i]]
-/// }
-/// DEFINE lower_t(dword) {
-/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
-/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
-/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
-/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
-/// 	RETURN tmp
-/// }
-/// DEFINE L_KEY(dword) {
-/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
-/// }
-/// DEFINE T_KEY(dword) {
-/// 	RETURN L_KEY(lower_t(dword))
-/// }
-/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
-/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
-/// }
-/// FOR i:= 0 to 1
-/// 	P[0] := __B.xmm[i].dword[0]
-/// 	P[1] := __B.xmm[i].dword[1]
-/// 	P[2] := __B.xmm[i].dword[2]
-/// 	P[3] := __B.xmm[i].dword[3]
-/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
-/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
-/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
-/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
-/// 	DEST.xmm[i].dword[0] := C[0]
-/// 	DEST.xmm[i].dword[1] := C[1]
-/// 	DEST.xmm[i].dword[2] := C[2]
-/// 	DEST.xmm[i].dword[3] := C[3]
-/// ENDFOR
-/// DEST[MAX:256] := 0
-/// \endcode
-#define _mm256_sm4key4_epi32(A, B)                                             \
-  (__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B)
-
-/// This intrinisc performs four rounds of SM4 encryption. The intrinisc
-///    operates on independent 128-bit lanes. The calculated results are
-///    stored in \a dst.
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
-///
-/// \param __A
-///    A 128-bit vector of [4 x int].
-/// \param __B
-///    A 128-bit vector of [4 x int].
-/// \returns
-///    A 128-bit vector of [4 x int].
-///
-/// \code{.operation}
-/// DEFINE ROL32(dword, n) {
-/// 	count := n % 32
-/// 	dest := (dword << count) | (dword >> (32-count))
-/// 	RETURN dest
-/// }
-/// DEFINE lower_t(dword) {
-/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
-/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
-/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
-/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
-/// 	RETURN tmp
-/// }
-/// DEFINE L_RND(dword) {
-/// 	tmp := dword
-/// 	tmp := tmp ^ ROL32(dword, 2)
-/// 	tmp := tmp ^ ROL32(dword, 10)
-/// 	tmp := tmp ^ ROL32(dword, 18)
-/// 	tmp := tmp ^ ROL32(dword, 24)
-///   RETURN tmp
-/// }
-/// DEFINE T_RND(dword) {
-/// 	RETURN L_RND(lower_t(dword))
-/// }
-/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
-/// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
-/// }
-/// FOR i:= 0 to 0
-/// 	P[0] := __B.xmm[i].dword[0]
-/// 	P[1] := __B.xmm[i].dword[1]
-/// 	P[2] := __B.xmm[i].dword[2]
-/// 	P[3] := __B.xmm[i].dword[3]
-/// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
-/// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
-/// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
-/// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
-/// 	DEST.xmm[i].dword[0] := C[0]
-/// 	DEST.xmm[i].dword[1] := C[1]
-/// 	DEST.xmm[i].dword[2] := C[2]
-/// 	DEST.xmm[i].dword[3] := C[3]
-/// ENDFOR
-/// DEST[MAX:128] := 0
-/// \endcode
-#define _mm_sm4rnds4_epi32(A, B)                                               \
-  (__m128i) __builtin_ia32_vsm4rnds4128((__v4su)A, (__v4su)B)
-
-/// This intrinisc performs four rounds of SM4 encryption. The intrinisc
-///    operates on independent 128-bit lanes. The calculated results are
-///    stored in \a dst.
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
-/// \endcode
-///
-/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
-///
-/// \param __A
-///    A 256-bit vector of [8 x int].
-/// \param __B
-///    A 256-bit vector of [8 x int].
-/// \returns
-///    A 256-bit vector of [8 x int].
-///
-/// \code{.operation}
-/// DEFINE ROL32(dword, n) {
-/// 	count := n % 32
-/// 	dest := (dword << count) | (dword >> (32-count))
-/// 	RETURN dest
-/// }
-/// DEFINE lower_t(dword) {
-/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
-/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
-/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
-/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
-/// 	RETURN tmp
-/// }
-/// DEFINE L_RND(dword) {
-/// 	tmp := dword
-/// 	tmp := tmp ^ ROL32(dword, 2)
-/// 	tmp := tmp ^ ROL32(dword, 10)
-/// 	tmp := tmp ^ ROL32(dword, 18)
-/// 	tmp := tmp ^ ROL32(dword, 24)
-///   RETURN tmp
-/// }
-/// DEFINE T_RND(dword) {
-/// 	RETURN L_RND(lower_t(dword))
-/// }
-/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
-/// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
-/// }
-/// FOR i:= 0 to 0
-/// 	P[0] := __B.xmm[i].dword[0]
-/// 	P[1] := __B.xmm[i].dword[1]
-/// 	P[2] := __B.xmm[i].dword[2]
-/// 	P[3] := __B.xmm[i].dword[3]
-/// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
-/// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
-/// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
-/// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
-/// 	DEST.xmm[i].dword[0] := C[0]
-/// 	DEST.xmm[i].dword[1] := C[1]
-/// 	DEST.xmm[i].dword[2] := C[2]
-/// 	DEST.xmm[i].dword[3] := C[3]
-/// ENDFOR
-/// DEST[MAX:256] := 0
-/// \endcode
-#define _mm256_sm4rnds4_epi32(A, B)                                            \
-  (__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B)
-
-#endif // __SM4INTRIN_H
diff --git a/third_party/intel/clang/smmintrin.h b/third_party/intel/clang/smmintrin.h
deleted file mode 100644
index 6f7f586dc..000000000
--- a/third_party/intel/clang/smmintrin.h
+++ /dev/null
@@ -1,2328 +0,0 @@
-/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __SMMINTRIN_H
-#define __SMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "tmmintrin.h"
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
-
-/* SSE4 Rounding macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-
-#define _MM_FROUND_RAISE_EXC 0x00
-#define _MM_FROUND_NO_EXC 0x08
-
-#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
-#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
-#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
-#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
-#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
-#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
-
-/// Rounds up each element of the 128-bit vector of [4 x float] to an
-///    integer and returns the rounded values in a 128-bit vector of
-///    [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_ceil_ps(__m128 X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float] values to be rounded up.
-/// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
-
-/// Rounds up each element of the 128-bit vector of [2 x double] to an
-///    integer and returns the rounded values in a 128-bit vector of
-///    [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_ceil_pd(__m128d X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x double] values to be rounded up.
-/// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
-
-/// Copies three upper elements of the first 128-bit vector operand to
-///    the corresponding three upper elements of the 128-bit result vector of
-///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
-///    operand to an integer and copies it to the lowest element of the 128-bit
-///    result vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
-///    copied to the corresponding bits of the result.
-/// \param Y
-///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
-///    rounded up to the nearest integer and copied to the corresponding bits
-///    of the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
-///    values.
-#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
-
-/// Copies the upper element of the first 128-bit vector operand to the
-///    corresponding upper element of the 128-bit result vector of [2 x double].
-///    Rounds up the lower element of the second 128-bit vector operand to an
-///    integer and copies it to the lower element of the 128-bit result vector
-///    of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
-///    copied to the corresponding bits of the result.
-/// \param Y
-///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
-///    rounded up to the nearest integer and copied to the corresponding bits
-///    of the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
-///    values.
-#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
-
-/// Rounds down each element of the 128-bit vector of [4 x float] to an
-///    an integer and returns the rounded values in a 128-bit vector of
-///    [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_floor_ps(__m128 X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float] values to be rounded down.
-/// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
-
-/// Rounds down each element of the 128-bit vector of [2 x double] to an
-///    integer and returns the rounded values in a 128-bit vector of
-///    [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_floor_pd(__m128d X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
-
-/// Copies three upper elements of the first 128-bit vector operand to
-///    the corresponding three upper elements of the 128-bit result vector of
-///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
-///    operand to an integer and copies it to the lowest element of the 128-bit
-///    result vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
-///    copied to the corresponding bits of the result.
-/// \param Y
-///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
-///    rounded down to the nearest integer and copied to the corresponding bits
-///    of the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
-///    values.
-#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
-
-/// Copies the upper element of the first 128-bit vector operand to the
-///    corresponding upper element of the 128-bit result vector of [2 x double].
-///    Rounds down the lower element of the second 128-bit vector operand to an
-///    integer and copies it to the lower element of the 128-bit result vector
-///    of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
-///    copied to the corresponding bits of the result.
-/// \param Y
-///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
-///    rounded down to the nearest integer and copied to the corresponding bits
-///    of the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
-///    values.
-#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
-
-/// Rounds each element of the 128-bit vector of [4 x float] to an
-///    integer value according to the rounding control specified by the second
-///    argument and returns the rounded values in a 128-bit vector of
-///    [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_round_ps(__m128 X, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float].
-/// \param M
-///    An integer value that specifies the rounding operation. \n
-///    Bits [7:4] are reserved. \n
-///    Bit [3] is a precision exception value: \n
-///      0: A normal PE exception is used \n
-///      1: The PE field is not updated \n
-///    Bit [2] is the rounding control source: \n
-///      0: Use bits [1:0] of \a M \n
-///      1: Use the current MXCSR setting \n
-///    Bits [1:0] contain the rounding control definition: \n
-///      00: Nearest \n
-///      01: Downward (toward negative infinity) \n
-///      10: Upward (toward positive infinity) \n
-///      11: Truncated
-/// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_round_ps(X, M)                                                     \
-  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
-
-/// Copies three upper elements of the first 128-bit vector operand to
-///    the corresponding three upper elements of the 128-bit result vector of
-///    [4 x float]. Rounds the lowest element of the second 128-bit vector
-///    operand to an integer value according to the rounding control specified
-///    by the third argument and copies it to the lowest element of the 128-bit
-///    result vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
-///    copied to the corresponding bits of the result.
-/// \param Y
-///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
-///    rounded to the nearest integer using the specified rounding control and
-///    copied to the corresponding bits of the result.
-/// \param M
-///    An integer value that specifies the rounding operation. \n
-///    Bits [7:4] are reserved. \n
-///    Bit [3] is a precision exception value: \n
-///      0: A normal PE exception is used \n
-///      1: The PE field is not updated \n
-///    Bit [2] is the rounding control source: \n
-///      0: Use bits [1:0] of \a M \n
-///      1: Use the current MXCSR setting \n
-///    Bits [1:0] contain the rounding control definition: \n
-///      00: Nearest \n
-///      01: Downward (toward negative infinity) \n
-///      10: Upward (toward positive infinity) \n
-///      11: Truncated
-/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
-///    values.
-#define _mm_round_ss(X, Y, M)                                                  \
-  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
-                                  (M)))
-
-/// Rounds each element of the 128-bit vector of [2 x double] to an
-///    integer value according to the rounding control specified by the second
-///    argument and returns the rounded values in a 128-bit vector of
-///    [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_round_pd(__m128d X, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x double].
-/// \param M
-///    An integer value that specifies the rounding operation. \n
-///    Bits [7:4] are reserved. \n
-///    Bit [3] is a precision exception value: \n
-///      0: A normal PE exception is used \n
-///      1: The PE field is not updated \n
-///    Bit [2] is the rounding control source: \n
-///      0: Use bits [1:0] of \a M \n
-///      1: Use the current MXCSR setting \n
-///    Bits [1:0] contain the rounding control definition: \n
-///      00: Nearest \n
-///      01: Downward (toward negative infinity) \n
-///      10: Upward (toward positive infinity) \n
-///      11: Truncated
-/// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_round_pd(X, M)                                                     \
-  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
-
-/// Copies the upper element of the first 128-bit vector operand to the
-///    corresponding upper element of the 128-bit result vector of [2 x double].
-///    Rounds the lower element of the second 128-bit vector operand to an
-///    integer value according to the rounding control specified by the third
-///    argument and copies it to the lower element of the 128-bit result vector
-///    of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
-///    copied to the corresponding bits of the result.
-/// \param Y
-///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
-///    rounded to the nearest integer using the specified rounding control and
-///    copied to the corresponding bits of the result.
-/// \param M
-///    An integer value that specifies the rounding operation. \n
-///    Bits [7:4] are reserved. \n
-///    Bit [3] is a precision exception value: \n
-///      0: A normal PE exception is used \n
-///      1: The PE field is not updated \n
-///    Bit [2] is the rounding control source: \n
-///      0: Use bits [1:0] of \a M \n
-///      1: Use the current MXCSR setting \n
-///    Bits [1:0] contain the rounding control definition: \n
-///      00: Nearest \n
-///      01: Downward (toward negative infinity) \n
-///      10: Upward (toward positive infinity) \n
-///      11: Truncated
-/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
-///    values.
-#define _mm_round_sd(X, Y, M)                                                  \
-  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
-                                   (M)))
-
-/* SSE4 Packed Blending Intrinsics.  */
-/// Returns a 128-bit vector of [2 x double] where the values are
-///    selected from either the first or second operand as specified by the
-///    third operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
-///
-/// \param V1
-///    A 128-bit vector of [2 x double].
-/// \param V2
-///    A 128-bit vector of [2 x double].
-/// \param M
-///    An immediate integer operand, with mask bits [1:0] specifying how the
-///    values are to be copied. The position of the mask bit corresponds to the
-///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
-///    element in operand \a V1 is copied to the same position in the result.
-///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
-///    is copied to the same position in the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-#define _mm_blend_pd(V1, V2, M)                                                \
-  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
-                                   (__v2df)(__m128d)(V2), (int)(M)))
-
-/// Returns a 128-bit vector of [4 x float] where the values are selected
-///    from either the first or second operand as specified by the third
-///    operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
-///
-/// \param V1
-///    A 128-bit vector of [4 x float].
-/// \param V2
-///    A 128-bit vector of [4 x float].
-/// \param M
-///    An immediate integer operand, with mask bits [3:0] specifying how the
-///    values are to be copied. The position of the mask bit corresponds to the
-///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
-///    element in operand \a V1 is copied to the same position in the result.
-///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
-///    is copied to the same position in the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-#define _mm_blend_ps(V1, V2, M)                                                \
-  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
-                                  (int)(M)))
-
-/// Returns a 128-bit vector of [2 x double] where the values are
-///    selected from either the first or second operand as specified by the
-///    third operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [2 x double].
-/// \param __V2
-///    A 128-bit vector of [2 x double].
-/// \param __M
-///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
-///    values are to be copied. The position of the mask bit corresponds to the
-///    most significant bit of a copied value. When a mask bit is 0, the
-///    corresponding 64-bit element in operand \a __V1 is copied to the same
-///    position in the result. When a mask bit is 1, the corresponding 64-bit
-///    element in operand \a __V2 is copied to the same position in the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
-                                                           __m128d __V2,
-                                                           __m128d __M) {
-  return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
-                                          (__v2df)__M);
-}
-
-/// Returns a 128-bit vector of [4 x float] where the values are
-///    selected from either the first or second operand as specified by the
-///    third operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [4 x float].
-/// \param __V2
-///    A 128-bit vector of [4 x float].
-/// \param __M
-///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
-///    how the values are to be copied. The position of the mask bit corresponds
-///    to the most significant bit of a copied value. When a mask bit is 0, the
-///    corresponding 32-bit element in operand \a __V1 is copied to the same
-///    position in the result. When a mask bit is 1, the corresponding 32-bit
-///    element in operand \a __V2 is copied to the same position in the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
-                                                          __m128 __V2,
-                                                          __m128 __M) {
-  return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
-                                         (__v4sf)__M);
-}
-
-/// Returns a 128-bit vector of [16 x i8] where the values are selected
-///    from either of the first or second operand as specified by the third
-///    operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [16 x i8].
-/// \param __V2
-///    A 128-bit vector of [16 x i8].
-/// \param __M
-///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
-///    how the values are to be copied. The position of the mask bit corresponds
-///    to the most significant bit of a copied value. When a mask bit is 0, the
-///    corresponding 8-bit element in operand \a __V1 is copied to the same
-///    position in the result. When a mask bit is 1, the corresponding 8-bit
-///    element in operand \a __V2 is copied to the same position in the result.
-/// \returns A 128-bit vector of [16 x i8] containing the copied values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
-                                                             __m128i __V2,
-                                                             __m128i __M) {
-  return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
-                                             (__v16qi)__M);
-}
-
-/// Returns a 128-bit vector of [8 x i16] where the values are selected
-///    from either of the first or second operand as specified by the third
-///    operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
-///
-/// \param V1
-///    A 128-bit vector of [8 x i16].
-/// \param V2
-///    A 128-bit vector of [8 x i16].
-/// \param M
-///    An immediate integer operand, with mask bits [7:0] specifying how the
-///    values are to be copied. The position of the mask bit corresponds to the
-///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
-///    element in operand \a V1 is copied to the same position in the result.
-///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
-///    is copied to the same position in the result.
-/// \returns A 128-bit vector of [8 x i16] containing the copied values.
-#define _mm_blend_epi16(V1, V2, M)                                             \
-  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
-                                      (__v8hi)(__m128i)(V2), (int)(M)))
-
-/* SSE4 Dword Multiply Instructions.  */
-/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
-///    and returns the lower 32 bits of the each product in a 128-bit vector of
-///    [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
-///
-/// \param __V1
-///    A 128-bit integer vector.
-/// \param __V2
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
-                                                             __m128i __V2) {
-  return (__m128i)((__v4su)__V1 * (__v4su)__V2);
-}
-
-/// Multiplies corresponding even-indexed elements of two 128-bit
-///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
-///    containing the products.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [4 x i32].
-/// \param __V2
-///    A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [2 x i64] containing the products of both
-///    operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
-                                                           __m128i __V2) {
-  return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
-}
-
-/* SSE4 Floating Point Dot Product Instructions.  */
-/// Computes the dot product of the two 128-bit vectors of [4 x float]
-///    and returns it in the elements of the 128-bit result vector of
-///    [4 x float].
-///
-///    The immediate integer operand controls which input elements
-///    will contribute to the dot product, and where the final results are
-///    returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float].
-/// \param Y
-///    A 128-bit vector of [4 x float].
-/// \param M
-///    An immediate integer operand. Mask bits [7:4] determine which elements
-///    of the input vectors are used, with bit [4] corresponding to the lowest
-///    element and bit [7] corresponding to the highest element of each [4 x
-///    float] vector. If a bit is set, the corresponding elements from the two
-///    input vectors are used as an input for dot product; otherwise that input
-///    is treated as zero. Bits [3:0] determine which elements of the result
-///    will receive a copy of the final dot product, with bit [0] corresponding
-///    to the lowest element and bit [3] corresponding to the highest element of
-///    each [4 x float] subvector. If a bit is set, the dot product is returned
-///    in the corresponding element; otherwise that element is set to zero.
-/// \returns A 128-bit vector of [4 x float] containing the dot product.
-#define _mm_dp_ps(X, Y, M)                                                     \
-  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
-
-/// Computes the dot product of the two 128-bit vectors of [2 x double]
-///    and returns it in the elements of the 128-bit result vector of
-///    [2 x double].
-///
-///    The immediate integer operand controls which input
-///    elements will contribute to the dot product, and where the final results
-///    are returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [2 x double].
-/// \param Y
-///    A 128-bit vector of [2 x double].
-/// \param M
-///    An immediate integer operand. Mask bits [5:4] determine which elements
-///    of the input vectors are used, with bit [4] corresponding to the lowest
-///    element and bit [5] corresponding to the highest element of each of [2 x
-///    double] vector. If a bit is set, the corresponding elements from the two
-///    input vectors are used as an input for dot product; otherwise that input
-///    is treated as zero. Bits [1:0] determine which elements of the result
-///    will receive a copy of the final dot product, with bit [0] corresponding
-///    to the lowest element and bit [1] corresponding to the highest element of
-///    each [2 x double] vector. If a bit is set, the dot product is returned in
-///    the corresponding element; otherwise that element is set to zero.
-#define _mm_dp_pd(X, Y, M)                                                     \
-  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
-                                (M)))
-
-/* SSE4 Streaming Load Hint Instruction.  */
-/// Loads integer values from a 128-bit aligned memory location to a
-///    128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
-///
-/// \param __V
-///    A pointer to a 128-bit aligned memory location that contains the integer
-///    values.
-/// \returns A 128-bit integer vector containing the data stored at the
-///    specified memory location.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128(const void *__V) {
-  return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
-}
-
-/* SSE4 Packed Integer Min/Max Instructions.  */
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
-///    of the two values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [16 x i8].
-/// \param __V2
-///    A 128-bit vector of [16 x i8]
-/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
-                                                          __m128i __V2) {
-  return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
-///    greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [16 x i8].
-/// \param __V2
-///    A 128-bit vector of [16 x i8].
-/// \returns A 128-bit vector of [16 x i8] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
-                                                          __m128i __V2) {
-  return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
-///    value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [8 x u16].
-/// \param __V2
-///    A 128-bit vector of [8 x u16].
-/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
-                                                           __m128i __V2) {
-  return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
-///    greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [8 x u16].
-/// \param __V2
-///    A 128-bit vector of [8 x u16].
-/// \returns A 128-bit vector of [8 x u16] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
-                                                           __m128i __V2) {
-  return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
-///    value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [4 x i32].
-/// \param __V2
-///    A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
-                                                           __m128i __V2) {
-  return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
-///    greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [4 x i32].
-/// \param __V2
-///    A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [4 x i32] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
-                                                           __m128i __V2) {
-  return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
-///    value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c>  instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [4 x u32].
-/// \param __V2
-///    A 128-bit vector of [4 x u32].
-/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
-                                                           __m128i __V2) {
-  return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
-///    greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [4 x u32].
-/// \param __V2
-///    A 128-bit vector of [4 x u32].
-/// \returns A 128-bit vector of [4 x u32] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
-                                                           __m128i __V2) {
-  return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
-}
-
-/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
-/// Takes the first argument \a X and inserts an element from the second
-///    argument \a Y as selected by the third argument \a N. That result then
-///    has elements zeroed out also as selected by the third argument \a N. The
-///    resulting 128-bit vector of [4 x float] is then returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
-///
-/// \param X
-///    A 128-bit vector source operand of [4 x float]. With the exception of
-///    those bits in the result copied from parameter \a Y and zeroed by bits
-///    [3:0] of \a N, all bits from this parameter are copied to the result.
-/// \param Y
-///    A 128-bit vector source operand of [4 x float]. One single-precision
-///    floating-point element from this source, as determined by the immediate
-///    parameter, is copied to the result.
-/// \param N
-///    Specifies which bits from operand \a Y will be copied, which bits in the
-///    result they will be copied to, and which bits in the result will be
-///    cleared. The following assignments are made: \n
-///    Bits [7:6] specify the bits to copy from operand \a Y: \n
-///      00: Selects bits [31:0] from operand \a Y. \n
-///      01: Selects bits [63:32] from operand \a Y. \n
-///      10: Selects bits [95:64] from operand \a Y. \n
-///      11: Selects bits [127:96] from operand \a Y. \n
-///    Bits [5:4] specify the bits in the result to which the selected bits
-///    from operand \a Y are copied: \n
-///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
-///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
-///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
-///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
-///    Bits[3:0]: If any of these bits are set, the corresponding result
-///    element is cleared.
-/// \returns A 128-bit vector of [4 x float] containing the copied
-///    single-precision floating point elements from the operands.
-#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
-
-/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
-///    returns it, using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_ps(__m128 X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
-/// instruction.
-///
-/// \param X
-///    A 128-bit vector of [4 x float].
-/// \param N
-///    An immediate value. Bits [1:0] determines which bits from the argument
-///    \a X are extracted and returned: \n
-///    00: Bits [31:0] of parameter \a X are returned. \n
-///    01: Bits [63:32] of parameter \a X are returned. \n
-///    10: Bits [95:64] of parameter \a X are returned. \n
-///    11: Bits [127:96] of parameter \a X are returned.
-/// \returns A 32-bit integer containing the extracted 32 bits of float data.
-#define _mm_extract_ps(X, N)                                                   \
-  __builtin_bit_cast(                                                          \
-      int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
-
-/* Miscellaneous insert and extract macros.  */
-/* Extract a single-precision float from X at index N into D.  */
-#define _MM_EXTRACT_FLOAT(D, X, N)                                             \
-  do {                                                                         \
-    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
-  } while (0)
-
-/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
-   an index suitable for _mm_insert_ps.  */
-#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
-
-/* Extract a float from X at index N into the first index of the return.  */
-#define _MM_PICK_OUT_PS(X, N)                                                  \
-  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
-
-/* Insert int into packed integer array at index.  */
-/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
-///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
-///    of an integer parameter \a I into an offset specified by the immediate
-///    value parameter \a N.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
-///
-/// \param X
-///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
-///    result and then one of the sixteen elements in the result vector is
-///    replaced by the lower 8 bits of \a I.
-/// \param I
-///    An integer. The lower 8 bits of this operand are written to the result
-///    beginning at the offset specified by \a N.
-/// \param N
-///    An immediate value. Bits [3:0] specify the bit offset in the result at
-///    which the lower 8 bits of \a I are written. \n
-///    0000: Bits [7:0] of the result are used for insertion. \n
-///    0001: Bits [15:8] of the result are used for insertion. \n
-///    0010: Bits [23:16] of the result are used for insertion. \n
-///    0011: Bits [31:24] of the result are used for insertion. \n
-///    0100: Bits [39:32] of the result are used for insertion. \n
-///    0101: Bits [47:40] of the result are used for insertion. \n
-///    0110: Bits [55:48] of the result are used for insertion. \n
-///    0111: Bits [63:56] of the result are used for insertion. \n
-///    1000: Bits [71:64] of the result are used for insertion. \n
-///    1001: Bits [79:72] of the result are used for insertion. \n
-///    1010: Bits [87:80] of the result are used for insertion. \n
-///    1011: Bits [95:88] of the result are used for insertion. \n
-///    1100: Bits [103:96] of the result are used for insertion. \n
-///    1101: Bits [111:104] of the result are used for insertion. \n
-///    1110: Bits [119:112] of the result are used for insertion. \n
-///    1111: Bits [127:120] of the result are used for insertion.
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi8(X, I, N)                                               \
-  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I),      \
-                                         (int)(N)))
-
-/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
-///    the 128-bit integer vector parameter, and then inserting the 32-bit
-///    integer parameter \a I at the offset specified by the immediate value
-///    parameter \a N.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
-///
-/// \param X
-///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
-///    result and then one of the four elements in the result vector is
-///    replaced by \a I.
-/// \param I
-///    A 32-bit integer that is written to the result beginning at the offset
-///    specified by \a N.
-/// \param N
-///    An immediate value. Bits [1:0] specify the bit offset in the result at
-///    which the integer \a I is written. \n
-///    00: Bits [31:0] of the result are used for insertion. \n
-///    01: Bits [63:32] of the result are used for insertion. \n
-///    10: Bits [95:64] of the result are used for insertion. \n
-///    11: Bits [127:96] of the result are used for insertion.
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi32(X, I, N)                                              \
-  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I),        \
-                                        (int)(N)))
-
-#ifdef __x86_64__
-/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
-///    the 128-bit integer vector parameter, and then inserting the 64-bit
-///    integer parameter \a I, using the immediate value parameter \a N as an
-///    insertion location selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
-///
-/// \param X
-///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
-///    result and then one of the two elements in the result vector is replaced
-///    by \a I.
-/// \param I
-///    A 64-bit integer that is written to the result beginning at the offset
-///    specified by \a N.
-/// \param N
-///    An immediate value. Bit [0] specifies the bit offset in the result at
-///    which the integer \a I is written. \n
-///    0: Bits [63:0] of the result are used for insertion. \n
-///    1: Bits [127:64] of the result are used for insertion. \n
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi64(X, I, N)                                              \
-  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I),  \
-                                        (int)(N)))
-#endif /* __x86_64__ */
-
-/* Extract int from packed integer array at index.  This returns the element
- * as a zero extended value, so it is unsigned.
- */
-/// Extracts an 8-bit element from the 128-bit integer vector of
-///    [16 x i8], using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_epi8(__m128i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
-///
-/// \param X
-///    A 128-bit integer vector.
-/// \param N
-///    An immediate value. Bits [3:0] specify which 8-bit vector element from
-///    the argument \a X to extract and copy to the result. \n
-///    0000: Bits [7:0] of parameter \a X are extracted. \n
-///    0001: Bits [15:8] of the parameter \a X are extracted. \n
-///    0010: Bits [23:16] of the parameter \a X are extracted. \n
-///    0011: Bits [31:24] of the parameter \a X are extracted. \n
-///    0100: Bits [39:32] of the parameter \a X are extracted. \n
-///    0101: Bits [47:40] of the parameter \a X are extracted. \n
-///    0110: Bits [55:48] of the parameter \a X are extracted. \n
-///    0111: Bits [63:56] of the parameter \a X are extracted. \n
-///    1000: Bits [71:64] of the parameter \a X are extracted. \n
-///    1001: Bits [79:72] of the parameter \a X are extracted. \n
-///    1010: Bits [87:80] of the parameter \a X are extracted. \n
-///    1011: Bits [95:88] of the parameter \a X are extracted. \n
-///    1100: Bits [103:96] of the parameter \a X are extracted. \n
-///    1101: Bits [111:104] of the parameter \a X are extracted. \n
-///    1110: Bits [119:112] of the parameter \a X are extracted. \n
-///    1111: Bits [127:120] of the parameter \a X are extracted.
-/// \returns  An unsigned integer, whose lower 8 bits are selected from the
-///    128-bit integer vector parameter and the remaining bits are assigned
-///    zeros.
-#define _mm_extract_epi8(X, N)                                                 \
-  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X),     \
-                                                    (int)(N)))
-
-/// Extracts a 32-bit element from the 128-bit integer vector of
-///    [4 x i32], using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_epi32(__m128i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
-///
-/// \param X
-///    A 128-bit integer vector.
-/// \param N
-///    An immediate value. Bits [1:0] specify which 32-bit vector element from
-///    the argument \a X to extract and copy to the result. \n
-///    00: Bits [31:0] of the parameter \a X are extracted. \n
-///    01: Bits [63:32] of the parameter \a X are extracted. \n
-///    10: Bits [95:64] of the parameter \a X are extracted. \n
-///    11: Bits [127:96] of the parameter \a X are exracted.
-/// \returns  An integer, whose lower 32 bits are selected from the 128-bit
-///    integer vector parameter and the remaining bits are assigned zeros.
-#define _mm_extract_epi32(X, N)                                                \
-  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
-
-/// Extracts a 64-bit element from the 128-bit integer vector of
-///    [2 x i64], using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// long long _mm_extract_epi64(__m128i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
-/// in 64-bit mode.
-///
-/// \param X
-///    A 128-bit integer vector.
-/// \param N
-///    An immediate value. Bit [0] specifies which 64-bit vector element from
-///    the argument \a X to return. \n
-///    0: Bits [63:0] are returned. \n
-///    1: Bits [127:64] are returned. \n
-/// \returns  A 64-bit integer.
-#define _mm_extract_epi64(X, N)                                                \
-  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
-
-/* SSE4 128-bit Packed Integer Comparisons.  */
-/// Tests whether the specified bits in a 128-bit integer vector are all
-///    zeros.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param __M
-///    A 128-bit integer vector containing the bits to be tested.
-/// \param __V
-///    A 128-bit integer vector selecting which bits to test in operand \a __M.
-/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
-                                                         __m128i __V) {
-  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
-}
-
-/// Tests whether the specified bits in a 128-bit integer vector are all
-///    ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param __M
-///    A 128-bit integer vector containing the bits to be tested.
-/// \param __V
-///    A 128-bit integer vector selecting which bits to test in operand \a __M.
-/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
-                                                         __m128i __V) {
-  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
-}
-
-/// Tests whether the specified bits in a 128-bit integer vector are
-///    neither all zeros nor all ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param __M
-///    A 128-bit integer vector containing the bits to be tested.
-/// \param __V
-///    A 128-bit integer vector selecting which bits to test in operand \a __M.
-/// \returns TRUE if the specified bits are neither all zeros nor all ones;
-///    FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
-                                                           __m128i __V) {
-  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
-}
-
-/// Tests whether the specified bits in a 128-bit integer vector are all
-///    ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_test_all_ones(__m128i V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param V
-///    A 128-bit integer vector containing the bits to be tested.
-/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
-///    otherwise.
-#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
-
-/// Tests whether the specified bits in a 128-bit integer vector are
-///    neither all zeros nor all ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param M
-///    A 128-bit integer vector containing the bits to be tested.
-/// \param V
-///    A 128-bit integer vector selecting which bits to test in operand \a M.
-/// \returns TRUE if the specified bits are neither all zeros nor all ones;
-///    FALSE otherwise.
-#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
-
-/// Tests whether the specified bits in a 128-bit integer vector are all
-///    zeros.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_test_all_zeros(__m128i M, __m128i V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param M
-///    A 128-bit integer vector containing the bits to be tested.
-/// \param V
-///    A 128-bit integer vector selecting which bits to test in operand \a M.
-/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
-
-/* SSE4 64-bit Packed Integer Comparisons.  */
-/// Compares each of the corresponding 64-bit values of the 128-bit
-///    integer vectors for equality.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
-///
-/// \param __V1
-///    A 128-bit integer vector.
-/// \param __V2
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
-                                                             __m128i __V2) {
-  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
-}
-
-/* SSE4 Packed Integer Sign-Extension.  */
-/// Sign-extends each of the lower eight 8-bit integer elements of a
-///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
-///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
-///    are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
-///    sign-extended to 16-bit values.
-/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
-                              7),
-      __v8hi);
-}
-
-/// Sign-extends each of the lower four 8-bit integer elements of a
-///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
-///    128-bit vector of [4 x i32]. The upper twelve elements of the input
-///    vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
-///    sign-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Sign-extends each of the lower two 8-bit integer elements of a
-///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
-///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
-///    vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
-///    sign-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
-}
-
-/// Sign-extends each of the lower four 16-bit integer elements of a
-///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
-///    a 128-bit vector of [4 x i32]. The upper four elements of the input
-///    vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
-///    sign-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Sign-extends each of the lower two 16-bit integer elements of a
-///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
-///    a 128-bit vector of [2 x i64]. The upper six elements of the input
-///    vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
-///     sign-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
-}
-
-/// Sign-extends each of the lower two 32-bit integer elements of a
-///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
-///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
-///    are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
-///    sign-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
-}
-
-/* SSE4 Packed Integer Zero-Extension.  */
-/// Zero-extends each of the lower eight 8-bit integer elements of a
-///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
-///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
-///    are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
-///    zero-extended to 16-bit values.
-/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
-                              7),
-      __v8hi);
-}
-
-/// Zero-extends each of the lower four 8-bit integer elements of a
-///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
-///    128-bit vector of [4 x i32]. The upper twelve elements of the input
-///    vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
-///    zero-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Zero-extends each of the lower two 8-bit integer elements of a
-///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
-///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
-///    vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
-///    zero-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
-}
-
-/// Zero-extends each of the lower four 16-bit integer elements of a
-///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
-///    a 128-bit vector of [4 x i32]. The upper four elements of the input
-///    vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
-///    zero-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Zero-extends each of the lower two 16-bit integer elements of a
-///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
-///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
-///    are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
-///    zero-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
-}
-
-/// Zero-extends each of the lower two 32-bit integer elements of a
-///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
-///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
-///    are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
-///
-/// \param __V
-///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
-///    zero-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
-  return (__m128i) __builtin_convertvector(
-      __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
-}
-
-/* SSE4 Pack with Unsigned Saturation.  */
-/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
-///    vector operands into 16-bit unsigned integers, and returns the packed
-///    result.
-///
-///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
-///    0x0000 are saturated to 0x0000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
-///
-/// \param __V1
-///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
-///    written to the lower 64 bits of the result.
-/// \param __V2
-///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
-///    written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
-                                                              __m128i __V2) {
-  return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
-}
-
-/* SSE4 Multiple Packed Sums of Absolute Difference.  */
-/// Subtracts 8-bit unsigned integer values and computes the absolute
-///    values of the differences to the corresponding bits in the destination.
-///    Then sums of the absolute differences are returned according to the bit
-///    fields in the immediate operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
-///
-/// \param X
-///    A 128-bit vector of [16 x i8].
-/// \param Y
-///    A 128-bit vector of [16 x i8].
-/// \param M
-///    An 8-bit immediate operand specifying how the absolute differences are to
-///    be calculated, according to the following algorithm:
-///    \code
-///    // M2 represents bit 2 of the immediate operand
-///    // M10 represents bits [1:0] of the immediate operand
-///    i = M2 * 4;
-///    j = M10 * 4;
-///    for (k = 0; k < 8; k = k + 1) {
-///      d0 = abs(X[i + k + 0] - Y[j + 0]);
-///      d1 = abs(X[i + k + 1] - Y[j + 1]);
-///      d2 = abs(X[i + k + 2] - Y[j + 2]);
-///      d3 = abs(X[i + k + 3] - Y[j + 3]);
-///      r[k] = d0 + d1 + d2 + d3;
-///    }
-///    \endcode
-/// \returns A 128-bit integer vector containing the sums of the sets of
-///    absolute differences between both operands.
-#define _mm_mpsadbw_epu8(X, Y, M)                                              \
-  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X),                   \
-                                      (__v16qi)(__m128i)(Y), (M)))
-
-/// Finds the minimum unsigned 16-bit element in the input 128-bit
-///    vector of [8 x u16] and returns it and along with its index.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
-/// instruction.
-///
-/// \param __V
-///    A 128-bit vector of [8 x u16].
-/// \returns A 128-bit value where bits [15:0] contain the minimum value found
-///    in parameter \a __V, bits [18:16] contain the index of the minimum value
-///    and the remaining bits are set to 0.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
-  return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
-}
-
-/* Handle the sse4.2 definitions here. */
-
-/* These definitions are normally in nmmintrin.h, but gcc puts them in here
-   so we'll do the same.  */
-
-#undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
-
-/* These specify the type of data that we're comparing.  */
-#define _SIDD_UBYTE_OPS 0x00
-#define _SIDD_UWORD_OPS 0x01
-#define _SIDD_SBYTE_OPS 0x02
-#define _SIDD_SWORD_OPS 0x03
-
-/* These specify the type of comparison operation.  */
-#define _SIDD_CMP_EQUAL_ANY 0x00
-#define _SIDD_CMP_RANGES 0x04
-#define _SIDD_CMP_EQUAL_EACH 0x08
-#define _SIDD_CMP_EQUAL_ORDERED 0x0c
-
-/* These macros specify the polarity of the operation.  */
-#define _SIDD_POSITIVE_POLARITY 0x00
-#define _SIDD_NEGATIVE_POLARITY 0x10
-#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
-#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
-
-/* These macros are used in _mm_cmpXstri() to specify the return.  */
-#define _SIDD_LEAST_SIGNIFICANT 0x00
-#define _SIDD_MOST_SIGNIFICANT 0x40
-
-/* These macros are used in _mm_cmpXstri() to specify the return.  */
-#define _SIDD_BIT_MASK 0x00
-#define _SIDD_UNIT_MASK 0x40
-
-/* SSE4.2 Packed Comparison Intrinsics.  */
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with implicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns a 128-bit integer vector representing the result
-///    mask of the comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words, the type of comparison to perform, and the format of the return
-///    value. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
-///             bytes. \n
-///      0: The result is zero-extended to 16 bytes. \n
-///      1: The result is expanded to 16 bytes (this expansion is performed by
-///         repeating each bit 8 or 16 times).
-/// \returns Returns a 128-bit integer vector representing the result mask of
-///    the comparison.
-#define _mm_cmpistrm(A, B, M)                                                  \
-  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A),                 \
-                                        (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with implicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns an integer representing the result index of the
-///    comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words, the type of comparison to perform, and the format of the return
-///    value. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-///    Bit [6]: Determines whether the index of the lowest set bit or the
-///             highest set bit is returned. \n
-///      0: The index of the least significant set bit. \n
-///      1: The index of the most significant set bit. \n
-/// \returns Returns an integer representing the result index of the comparison.
-#define _mm_cmpistri(A, B, M)                                                  \
-  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A),                     \
-                                    (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with explicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns a 128-bit integer vector representing the result
-///    mask of the comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LA
-///    An integer that specifies the length of the string in \a A.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LB
-///    An integer that specifies the length of the string in \a B.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words, the type of comparison to perform, and the format of the return
-///    value. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
-///             bytes. \n
-///      0: The result is zero-extended to 16 bytes. \n
-///      1: The result is expanded to 16 bytes (this expansion is performed by
-///         repeating each bit 8 or 16 times). \n
-/// \returns Returns a 128-bit integer vector representing the result mask of
-///    the comparison.
-#define _mm_cmpestrm(A, LA, B, LB, M)                                          \
-  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA),      \
-                                        (__v16qi)(__m128i)(B), (int)(LB),      \
-                                        (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with explicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns an integer representing the result index of the
-///    comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LA
-///    An integer that specifies the length of the string in \a A.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LB
-///    An integer that specifies the length of the string in \a B.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words, the type of comparison to perform, and the format of the return
-///    value. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-///    Bit [6]: Determines whether the index of the lowest set bit or the
-///             highest set bit is returned. \n
-///      0: The index of the least significant set bit. \n
-///      1: The index of the most significant set bit. \n
-/// \returns Returns an integer representing the result index of the comparison.
-#define _mm_cmpestri(A, LA, B, LB, M)                                          \
-  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA),          \
-                                    (__v16qi)(__m128i)(B), (int)(LB),          \
-                                    (int)(M)))
-
-/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with implicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
-///    string in \a B is the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-/// \returns Returns 1 if the bit mask is zero and the length of the string in
-///    \a B is the maximum; otherwise, returns 0.
-#define _mm_cmpistra(A, B, M)                                                  \
-  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A),                    \
-                                     (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with implicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
-///    0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B.
-/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
-#define _mm_cmpistrc(A, B, M)                                                  \
-  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A),                    \
-                                     (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with implicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns bit 0 of the resulting bit mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-/// \returns Returns bit 0 of the resulting bit mask.
-#define _mm_cmpistro(A, B, M)                                                  \
-  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A),                    \
-                                     (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with implicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
-///    the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-/// \returns Returns 1 if the length of the string in \a A is less than the
-///    maximum, otherwise, returns 0.
-#define _mm_cmpistrs(A, B, M)                                                  \
-  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A),                    \
-                                     (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with implicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
-///    the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B.
-/// \returns Returns 1 if the length of the string in \a B is less than the
-///    maximum, otherwise, returns 0.
-#define _mm_cmpistrz(A, B, M)                                                  \
-  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A),                    \
-                                     (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with explicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
-///    string in \a B is the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LA
-///    An integer that specifies the length of the string in \a A.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LB
-///    An integer that specifies the length of the string in \a B.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B.
-/// \returns Returns 1 if the bit mask is zero and the length of the string in
-///    \a B is the maximum, otherwise, returns 0.
-#define _mm_cmpestra(A, LA, B, LB, M)                                          \
-  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA),         \
-                                     (__v16qi)(__m128i)(B), (int)(LB),         \
-                                     (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with explicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
-///    returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LA
-///    An integer that specifies the length of the string in \a A.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LB
-///    An integer that specifies the length of the string in \a B.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
-#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
-  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA),         \
-                                     (__v16qi)(__m128i)(B), (int)(LB),         \
-                                     (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with explicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns bit 0 of the resulting bit mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LA
-///    An integer that specifies the length of the string in \a A.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LB
-///    An integer that specifies the length of the string in \a B.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B.
-/// \returns Returns bit 0 of the resulting bit mask.
-#define _mm_cmpestro(A, LA, B, LB, M)                                          \
-  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA),         \
-                                     (__v16qi)(__m128i)(B), (int)(LB),         \
-                                     (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with explicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
-///    the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LA
-///    An integer that specifies the length of the string in \a A.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LB
-///    An integer that specifies the length of the string in \a B.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement in the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B. \n
-/// \returns Returns 1 if the length of the string in \a A is less than the
-///    maximum, otherwise, returns 0.
-#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
-  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA),         \
-                                     (__v16qi)(__m128i)(B), (int)(LB),         \
-                                     (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-///    data with explicitly defined lengths that is contained in source operands
-///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
-///    the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction.
-///
-/// \param A
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LA
-///    An integer that specifies the length of the string in \a A.
-/// \param B
-///    A 128-bit integer vector containing one of the source operands to be
-///    compared.
-/// \param LB
-///    An integer that specifies the length of the string in \a B.
-/// \param M
-///    An 8-bit immediate operand specifying whether the characters are bytes or
-///    words and the type of comparison to perform. \n
-///    Bits [1:0]: Determine source data format. \n
-///      00: 16 unsigned bytes  \n
-///      01: 8 unsigned words \n
-///      10: 16 signed bytes \n
-///      11: 8 signed words \n
-///    Bits [3:2]: Determine comparison type and aggregation method. \n
-///      00: Subset: Each character in \a B is compared for equality with all
-///          the characters in \a A. \n
-///      01: Ranges: Each character in \a B is compared to \a A. The comparison
-///          basis is greater than or equal for even-indexed elements in \a A,
-///          and less than or equal for odd-indexed elements in \a A. \n
-///      10: Match: Compare each pair of corresponding characters in \a A and
-///          \a B for equality. \n
-///      11: Substring: Search \a B for substring matches of \a A. \n
-///    Bits [5:4]: Determine whether to perform a one's complement on the bit
-///                mask of the comparison results. \n
-///      00: No effect. \n
-///      01: Negate the bit mask. \n
-///      10: No effect. \n
-///      11: Negate the bit mask only for bits with an index less than or equal
-///          to the size of \a A or \a B.
-/// \returns Returns 1 if the length of the string in \a B is less than the
-///    maximum, otherwise, returns 0.
-#define _mm_cmpestrz(A, LA, B, LB, M)                                          \
-  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA),         \
-                                     (__v16qi)(__m128i)(B), (int)(LB),         \
-                                     (int)(M)))
-
-/* SSE4.2 Compare Packed Data -- Greater Than.  */
-/// Compares each of the corresponding 64-bit values of the 128-bit
-///    integer vectors to determine if the values in the first operand are
-///    greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
-///
-/// \param __V1
-///    A 128-bit integer vector.
-/// \param __V2
-///    A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
-                                                             __m128i __V2) {
-  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#include "popcntintrin.h"
-
-#include "crc32intrin.h"
-
-#endif /* __SMMINTRIN_H */
diff --git a/third_party/intel/clang/tbmintrin.h b/third_party/intel/clang/tbmintrin.h
deleted file mode 100644
index f4e848a1c..000000000
--- a/third_party/intel/clang/tbmintrin.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __TBMINTRIN_H
-#define __TBMINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
-
-#define __bextri_u32(a, b) \
-  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
-                                           (unsigned int)(b)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcfill_u32(unsigned int __a)
-{
-  return __a & (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blci_u32(unsigned int __a)
-{
-  return __a | ~(__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcic_u32(unsigned int __a)
-{
-  return ~__a & (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcmsk_u32(unsigned int __a)
-{
-  return __a ^ (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcs_u32(unsigned int __a)
-{
-  return __a | (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsfill_u32(unsigned int __a)
-{
-  return __a | (__a - 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsic_u32(unsigned int __a)
-{
-  return ~__a | (__a - 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__t1mskc_u32(unsigned int __a)
-{
-  return ~__a | (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__tzmsk_u32(unsigned int __a)
-{
-  return ~__a & (__a - 1);
-}
-
-#ifdef __x86_64__
-#define __bextri_u64(a, b) \
-  ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
-                                                 (unsigned long long)(b)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcfill_u64(unsigned long long __a)
-{
-  return __a & (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blci_u64(unsigned long long __a)
-{
-  return __a | ~(__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcic_u64(unsigned long long __a)
-{
-  return ~__a & (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcmsk_u64(unsigned long long __a)
-{
-  return __a ^ (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcs_u64(unsigned long long __a)
-{
-  return __a | (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsfill_u64(unsigned long long __a)
-{
-  return __a | (__a - 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsic_u64(unsigned long long __a)
-{
-  return ~__a | (__a - 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__t1mskc_u64(unsigned long long __a)
-{
-  return ~__a | (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__tzmsk_u64(unsigned long long __a)
-{
-  return ~__a & (__a - 1);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __TBMINTRIN_H */
diff --git a/third_party/intel/clang/tmmintrin.h b/third_party/intel/clang/tmmintrin.h
deleted file mode 100644
index 1674545c0..000000000
--- a/third_party/intel/clang/tmmintrin.h
+++ /dev/null
@@ -1,784 +0,0 @@
-/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __TMMINTRIN_H
-#define __TMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "pmmintrin.h"
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,ssse3,no-evex512"),                           \
-                 __min_vector_width__(64)))
-
-/// Computes the absolute value of each of the packed 8-bit signed
-///    integers in the source operand and stores the 8-bit unsigned integer
-///    results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PABSB instruction.
-///
-/// \param __a
-///    A 64-bit vector of [8 x i8].
-/// \returns A 64-bit integer vector containing the absolute values of the
-///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_abs_pi8(__m64 __a)
-{
-    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
-}
-
-/// Computes the absolute value of each of the packed 8-bit signed
-///    integers in the source operand and stores the 8-bit unsigned integer
-///    results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSB instruction.
-///
-/// \param __a
-///    A 128-bit vector of [16 x i8].
-/// \returns A 128-bit integer vector containing the absolute values of the
-///    elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi8(__m128i __a)
-{
-    return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
-}
-
-/// Computes the absolute value of each of the packed 16-bit signed
-///    integers in the source operand and stores the 16-bit unsigned integer
-///    results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PABSW instruction.
-///
-/// \param __a
-///    A 64-bit vector of [4 x i16].
-/// \returns A 64-bit integer vector containing the absolute values of the
-///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_abs_pi16(__m64 __a)
-{
-    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
-}
-
-/// Computes the absolute value of each of the packed 16-bit signed
-///    integers in the source operand and stores the 16-bit unsigned integer
-///    results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSW instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16].
-/// \returns A 128-bit integer vector containing the absolute values of the
-///    elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi16(__m128i __a)
-{
-    return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
-}
-
-/// Computes the absolute value of each of the packed 32-bit signed
-///    integers in the source operand and stores the 32-bit unsigned integer
-///    results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PABSD instruction.
-///
-/// \param __a
-///    A 64-bit vector of [2 x i32].
-/// \returns A 64-bit integer vector containing the absolute values of the
-///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_abs_pi32(__m64 __a)
-{
-    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
-}
-
-/// Computes the absolute value of each of the packed 32-bit signed
-///    integers in the source operand and stores the 32-bit unsigned integer
-///    results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSD instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x i32].
-/// \returns A 128-bit integer vector containing the absolute values of the
-///    elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi32(__m128i __a)
-{
-    return (__m128i)__builtin_elementwise_abs((__v4si)__a);
-}
-
-/// Concatenates the two 128-bit integer vector operands, and
-///    right-shifts the result by the number of bytes specified in the immediate
-///    operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c PALIGNR instruction.
-///
-/// \param a
-///    A 128-bit vector of [16 x i8] containing one of the source operands.
-/// \param b
-///    A 128-bit vector of [16 x i8] containing one of the source operands.
-/// \param n
-///    An immediate operand specifying how many bytes to right-shift the result.
-/// \returns A 128-bit integer vector containing the concatenated right-shifted
-///    value.
-#define _mm_alignr_epi8(a, b, n) \
-  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
-                                      (__v16qi)(__m128i)(b), (n)))
-
-/// Concatenates the two 64-bit integer vector operands, and right-shifts
-///    the result by the number of bytes specified in the immediate operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c PALIGNR instruction.
-///
-/// \param a
-///    A 64-bit vector of [8 x i8] containing one of the source operands.
-/// \param b
-///    A 64-bit vector of [8 x i8] containing one of the source operands.
-/// \param n
-///    An immediate operand specifying how many bytes to right-shift the result.
-/// \returns A 64-bit integer vector containing the concatenated right-shifted
-///    value.
-#define _mm_alignr_pi8(a, b, n) \
-  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    128-bit vectors of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDW instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the upper bits of the
-///    destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
-///    both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    128-bit vectors of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDD instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x i32] containing one of the source operands. The
-///    horizontal sums of the values are stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 128-bit vector of [4 x i32] containing one of the source operands. The
-///    horizontal sums of the values are stored in the upper bits of the
-///    destination.
-/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
-///    both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi32(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    64-bit vectors of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHADDW instruction.
-///
-/// \param __a
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the upper bits of the
-///    destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
-///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hadd_pi16(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    64-bit vectors of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHADDD instruction.
-///
-/// \param __a
-///    A 64-bit vector of [2 x i32] containing one of the source operands. The
-///    horizontal sums of the values are stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 64-bit vector of [2 x i32] containing one of the source operands. The
-///    horizontal sums of the values are stored in the upper bits of the
-///    destination.
-/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
-///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hadd_pi32(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
-}
-
-/// Horizontally adds, with saturation, the adjacent pairs of values contained
-///    in two packed 128-bit vectors of [8 x i16].
-///
-///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
-///    less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDSW instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the upper bits of the
-///    destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
-///    sums of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadds_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally adds, with saturation, the adjacent pairs of values contained
-///    in two packed 64-bit vectors of [4 x i16].
-///
-///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
-///    less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHADDSW instruction.
-///
-/// \param __a
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the lower bits of the
-///    destination.
-/// \param __b
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal sums of the values are stored in the upper bits of the
-///    destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
-///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hadds_pi16(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 128-bit vectors of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBW instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the lower bits of
-///    the destination.
-/// \param __b
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the upper bits of
-///    the destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
-///    of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 128-bit vectors of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBD instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x i32] containing one of the source operands. The
-///    horizontal differences between the values are stored in the lower bits of
-///    the destination.
-/// \param __b
-///    A 128-bit vector of [4 x i32] containing one of the source operands. The
-///    horizontal differences between the values are stored in the upper bits of
-///    the destination.
-/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
-///    of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi32(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 64-bit vectors of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHSUBW instruction.
-///
-/// \param __a
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the lower bits of
-///    the destination.
-/// \param __b
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the upper bits of
-///    the destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
-///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hsub_pi16(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 64-bit vectors of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHSUBD instruction.
-///
-/// \param __a
-///    A 64-bit vector of [2 x i32] containing one of the source operands. The
-///    horizontal differences between the values are stored in the lower bits of
-///    the destination.
-/// \param __b
-///    A 64-bit vector of [2 x i32] containing one of the source operands. The
-///    horizontal differences between the values are stored in the upper bits of
-///    the destination.
-/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
-///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hsub_pi32(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
-}
-
-/// Horizontally subtracts, with saturation, the adjacent pairs of values
-///    contained in two packed 128-bit vectors of [8 x i16].
-///
-///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
-///    Negative differences less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBSW instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the lower bits of
-///    the destination.
-/// \param __b
-///    A 128-bit vector of [8 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the upper bits of
-///    the destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
-///    differences of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally subtracts, with saturation, the adjacent pairs of values
-///    contained in two packed 64-bit vectors of [4 x i16].
-///
-///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
-///    Negative differences less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHSUBSW instruction.
-///
-/// \param __a
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the lower bits of
-///    the destination.
-/// \param __b
-///    A 64-bit vector of [4 x i16] containing one of the source operands. The
-///    horizontal differences between the values are stored in the upper bits of
-///    the destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
-///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hsubs_pi16(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Multiplies corresponding pairs of packed 8-bit unsigned integer
-///    values contained in the first source operand and packed 8-bit signed
-///    integer values contained in the second source operand, adds pairs of
-///    contiguous products with signed saturation, and writes the 16-bit sums to
-///    the corresponding bits in the destination.
-///
-///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
-///    both operands are multiplied, and the sum of both results is written to
-///    bits [15:0] of the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the first source operand.
-/// \param __b
-///    A 128-bit integer vector containing the second source operand.
-/// \returns A 128-bit integer vector containing the sums of products of both
-///    operands: \n
-///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
-///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
-///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
-///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
-///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
-///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
-///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
-///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// Multiplies corresponding pairs of packed 8-bit unsigned integer
-///    values contained in the first source operand and packed 8-bit signed
-///    integer values contained in the second source operand, adds pairs of
-///    contiguous products with signed saturation, and writes the 16-bit sums to
-///    the corresponding bits in the destination.
-///
-///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
-///    both operands are multiplied, and the sum of both results is written to
-///    bits [15:0] of the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PMADDUBSW instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing the first source operand.
-/// \param __b
-///    A 64-bit integer vector containing the second source operand.
-/// \returns A 64-bit integer vector containing the sums of products of both
-///    operands: \n
-///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
-///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
-///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
-///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
-}
-
-/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
-///    products to the 18 most significant bits by right-shifting, rounds the
-///    truncated value by adding 1, and writes bits [16:1] to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULHRSW instruction.
-///
-/// \param __a
-///    A 128-bit vector of [8 x i16] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [8 x i16] containing one of the source operands.
-/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
-///    products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mulhrs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
-///    products to the 18 most significant bits by right-shifting, rounds the
-///    truncated value by adding 1, and writes bits [16:1] to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PMULHRSW instruction.
-///
-/// \param __a
-///    A 64-bit vector of [4 x i16] containing one of the source operands.
-/// \param __b
-///    A 64-bit vector of [4 x i16] containing one of the source operands.
-/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
-///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_mulhrs_pi16(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Copies the 8-bit integers from a 128-bit integer vector to the
-///    destination or clears 8-bit values in the destination, as specified by
-///    the second source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSHUFB instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 128-bit integer vector containing control bytes corresponding to
-///    positions in the destination:
-///    Bit 7: \n
-///    1: Clear the corresponding byte in the destination. \n
-///    0: Copy the selected source byte to the corresponding byte in the
-///    destination. \n
-///    Bits [6:4] Reserved.  \n
-///    Bits [3:0] select the source byte to be copied.
-/// \returns A 128-bit integer vector containing the copied or cleared values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shuffle_epi8(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// Copies the 8-bit integers from a 64-bit integer vector to the
-///    destination or clears 8-bit values in the destination, as specified by
-///    the second source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSHUFB instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 64-bit integer vector containing control bytes corresponding to
-///    positions in the destination:
-///    Bit 7: \n
-///    1: Clear the corresponding byte in the destination. \n
-///    0: Copy the selected source byte to the corresponding byte in the
-///    destination. \n
-///    Bits [3:0] select the source byte to be copied.
-/// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_shuffle_pi8(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
-}
-
-/// For each 8-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand.
-///
-///    If the byte in the second source is negative, calculate the two's
-///    complement of the corresponding byte in the first source, and write that
-///    value to the destination. If the byte in the second source is positive,
-///    copy the corresponding byte from the first source to the destination. If
-///    the byte in the second source is zero, clear the corresponding byte in
-///    the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGNB instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 128-bit integer vector containing control bytes corresponding to
-///    positions in the destination.
-/// \returns A 128-bit integer vector containing the resultant values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sign_epi8(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// For each 16-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand.
-///
-///    If the word in the second source is negative, calculate the two's
-///    complement of the corresponding word in the first source, and write that
-///    value to the destination. If the word in the second source is positive,
-///    copy the corresponding word from the first source to the destination. If
-///    the word in the second source is zero, clear the corresponding word in
-///    the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGNW instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 128-bit integer vector containing control words corresponding to
-///    positions in the destination.
-/// \returns A 128-bit integer vector containing the resultant values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sign_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// For each 32-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand.
-///
-///    If the doubleword in the second source is negative, calculate the two's
-///    complement of the corresponding word in the first source, and write that
-///    value to the destination. If the doubleword in the second source is
-///    positive, copy the corresponding word from the first source to the
-///    destination. If the doubleword in the second source is zero, clear the
-///    corresponding word in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGND instruction.
-///
-/// \param __a
-///    A 128-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 128-bit integer vector containing control doublewords corresponding to
-///    positions in the destination.
-/// \returns A 128-bit integer vector containing the resultant values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sign_epi32(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
-}
-
-/// For each 8-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand.
-///
-///    If the byte in the second source is negative, calculate the two's
-///    complement of the corresponding byte in the first source, and write that
-///    value to the destination. If the byte in the second source is positive,
-///    copy the corresponding byte from the first source to the destination. If
-///    the byte in the second source is zero, clear the corresponding byte in
-///    the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSIGNB instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 64-bit integer vector containing control bytes corresponding to
-///    positions in the destination.
-/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sign_pi8(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
-}
-
-/// For each 16-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand.
-///
-///    If the word in the second source is negative, calculate the two's
-///    complement of the corresponding word in the first source, and write that
-///    value to the destination. If the word in the second source is positive,
-///    copy the corresponding word from the first source to the destination. If
-///    the word in the second source is zero, clear the corresponding word in
-///    the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSIGNW instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 64-bit integer vector containing control words corresponding to
-///    positions in the destination.
-/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sign_pi16(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// For each 32-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand.
-///
-///    If the doubleword in the second source is negative, calculate the two's
-///    complement of the corresponding doubleword in the first source, and
-///    write that value to the destination. If the doubleword in the second
-///    source is positive, copy the corresponding doubleword from the first
-///    source to the destination. If the doubleword in the second source is
-///    zero, clear the corresponding doubleword in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSIGND instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing the values to be copied.
-/// \param __b
-///    A 64-bit integer vector containing two control doublewords corresponding
-///    to positions in the destination.
-/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sign_pi32(__m64 __a, __m64 __b)
-{
-    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
-}
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
-
-#endif /* __TMMINTRIN_H */
diff --git a/third_party/intel/clang/tsxldtrkintrin.h b/third_party/intel/clang/tsxldtrkintrin.h
deleted file mode 100644
index 491823e93..000000000
--- a/third_party/intel/clang/tsxldtrkintrin.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __TSXLDTRKINTRIN_H
-#define __TSXLDTRKINTRIN_H
-
-/* Define the default attributes for the functions in this file */
-#define _DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk")))
-
-/// Marks the start of an TSX (RTM) suspend load address tracking region. If
-///    this intrinsic is used inside a transactional region, subsequent loads
-///    are not added to the read set of the transaction. If it's used inside a
-///    suspend load address tracking region it will cause transaction abort.
-///    If it's used outside of a transactional region it behaves like a NOP.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c XSUSLDTRK instruction.
-///
-static __inline__ void _DEFAULT_FN_ATTRS
-_xsusldtrk (void)
-{
-    __builtin_ia32_xsusldtrk();
-}
-
-/// Marks the end of an TSX (RTM) suspend load address tracking region. If this
-///    intrinsic is used inside a suspend load address tracking region it will
-///    end the suspend region and all following load addresses will be added to
-///    the transaction read set. If it's used inside an active transaction but
-///    not in a suspend region it will cause transaction abort. If it's used
-///    outside of a transactional region it behaves like a NOP.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c XRESLDTRK instruction.
-///
-static __inline__ void _DEFAULT_FN_ATTRS
-_xresldtrk (void)
-{
-    __builtin_ia32_xresldtrk();
-}
-
-#undef _DEFAULT_FN_ATTRS
-
-#endif /* __TSXLDTRKINTRIN_H */
diff --git a/third_party/intel/clang/uintrintrin.h b/third_party/intel/clang/uintrintrin.h
deleted file mode 100644
index 135dc814c..000000000
--- a/third_party/intel/clang/uintrintrin.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*===------------------ uintrintrin.h - UINTR intrinsics -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86GPRINTRIN_H
-#error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
-#endif
-
-#ifndef __UINTRINTRIN_H
-#define __UINTRINTRIN_H
-
-/* Define the default attributes for the functions in this file */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("uintr")))
-
-#ifdef __x86_64__
-
-struct __uintr_frame
-{
-  unsigned long long rip;
-  unsigned long long rflags;
-  unsigned long long rsp;
-};
-
-/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
-///    user interrupt cannot be delivered on the instruction boundary following
-///    CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
-///    64-bit mode, and software is not executing inside an enclave; otherwise,
-///    each causes an invalid-opcode exception. Causes a transactional abort if
-///    executed inside a transactional region; the abort loads EAX as it would
-///    had it been due to an execution of CLI.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> CLUI </c> instruction.
-///
-/// \code{.operation}
-///   UIF := 0
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS
-_clui (void)
-{
-  __builtin_ia32_clui();
-}
-
-/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a
-///    user interrupt may be delivered on the instruction boundary following
-///    STUI. Can be executed only if CR4.UINT = 1, the logical processor is in
-///    64-bit mode, and software is not executing inside an enclave; otherwise,
-///    each causes an invalid-opcode exception. Causes a transactional abort if
-///    executed inside a transactional region; the abort loads EAX as it would
-///    had it been due to an execution of STI.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> STUI </c> instruction.
-///
-/// \code{.operation}
-///   UIF := 1
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS
-_stui (void)
-{
-  __builtin_ia32_stui();
-}
-
-/// Get the current value of the user interrupt flag (UIF). Can be executed
-///    regardless of CPL and inside a transactional region. Can be executed only
-///    if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is
-///    not executing inside an enclave; otherwise, it causes an invalid-opcode
-///    exception.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> TESTUI </c> instruction.
-///
-/// \returns The current value of the user interrupt flag (UIF).
-///
-/// \code{.operation}
-///   CF := UIF
-///   ZF := 0
-///   AF := 0
-///   OF := 0
-///   PF := 0
-///   SF := 0
-///   dst := CF
-/// \endcode
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_testui (void)
-{
-  return __builtin_ia32_testui();
-}
-
-/// Send interprocessor user interrupt. Can be executed only if
-///    CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode,
-///    and software is not executing inside an enclave; otherwise, it causes an
-///    invalid-opcode exception. May be executed at any privilege level, all of
-///    its memory accesses are performed with supervisor privilege.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> SENDUIPI </c> instruction
-///
-/// \param __a
-///    Index of user-interrupt target table entry in user-interrupt target
-///    table.
-///
-/// \code{.operation}
-///   IF __a > UITTSZ
-///     GP (0)
-///   FI
-///   tempUITTE := MEM[UITTADDR + (a<<4)]
-///   // tempUITTE must be valid, and can't have any reserved bit set
-///   IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0)
-///     GP (0)
-///   FI
-///   tempUPID := MEM[tempUITTE.UPIDADDR] // under lock
-///   // tempUPID can't have any reserved bit set
-///   IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0)
-///     GP (0) // release lock
-///   FI
-///   tempUPID.PIR[tempUITTE.UV] := 1;
-///   IF (tempUPID.SN == 0 AND tempUPID.ON == 0)
-///     tempUPID.ON := 1
-///     sendNotify := 1
-///   ELSE
-///     sendNotify := 0
-///   FI
-///   MEM[tempUITTE.UPIDADDR] := tempUPID // release lock
-///   IF sendNotify == 1
-///     IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode
-///       // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC
-///       // ID tempUPID.NDST
-///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST)
-///     ELSE
-///       // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC
-///       // ID tempUPID.NDST[15:8]
-///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
-///     FI
-///   FI
-/// \endcode
-static __inline__ void __DEFAULT_FN_ATTRS
-_senduipi (unsigned long long __a)
-{
-  __builtin_ia32_senduipi(__a);
-}
-
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __UINTRINTRIN_H */
diff --git a/third_party/intel/clang/usermsrintrin.h b/third_party/intel/clang/usermsrintrin.h
deleted file mode 100644
index 613883767..000000000
--- a/third_party/intel/clang/usermsrintrin.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*===--------------- usermsrintrin.h - USERMSR intrinsics -----------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __X86GPRINTRIN_H
-#error "Never use <usermsrintrin.h> directly; include <x86gprintrin.h> instead."
-#endif // __X86GPRINTRIN_H
-
-#ifndef __USERMSRINTRIN_H
-#define __USERMSRINTRIN_H
-#ifdef __x86_64__
-
-/// Reads the contents of a 64-bit MSR specified in \a __A into \a dst.
-///
-/// This intrinsic corresponds to the <c> URDMSR </c> instruction.
-/// \param __A
-///    An unsigned long long.
-///
-/// \code{.operation}
-///    DEST := MSR[__A]
-/// \endcode
-static __inline__ unsigned long long
-    __attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
-    _urdmsr(unsigned long long __A) {
-  return __builtin_ia32_urdmsr(__A);
-}
-
-/// Writes the contents of \a __B into the 64-bit MSR specified in \a __A.
-///
-/// This intrinsic corresponds to the <c> UWRMSR </c> instruction.
-///
-/// \param __A
-///    An unsigned long long.
-/// \param __B
-///    An unsigned long long.
-///
-/// \code{.operation}
-///    MSR[__A] := __B
-/// \endcode
-static __inline__ void
-    __attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
-    _uwrmsr(unsigned long long __A, unsigned long long __B) {
-  return __builtin_ia32_uwrmsr(__A, __B);
-}
-
-#endif // __x86_64__
-#endif // __USERMSRINTRIN_H
diff --git a/third_party/intel/clang/vaesintrin.h b/third_party/intel/clang/vaesintrin.h
deleted file mode 100644
index d7c162f5c..000000000
--- a/third_party/intel/clang/vaesintrin.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VAESINTRIN_H
-#define __VAESINTRIN_H
-
-/* Default attributes for YMM forms. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))
-
-/* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_F                                                   \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512f,evex512,vaes"),                           \
-                 __min_vector_width__(512)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesenc_epi128(__m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
-              (__v4di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesdec_epi128(__m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
-              (__v4di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesenclast_epi128(__m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
-              (__v4di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
-              (__v4di) __B);
-}
-
-#ifdef __AVX512FINTRIN_H
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesenc_epi128(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
-              (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesdec_epi128(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
-              (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesenclast_epi128(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
-              (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
-              (__v8di) __B);
-}
-#endif // __AVX512FINTRIN_H
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_F
-
-#endif // __VAESINTRIN_H
diff --git a/third_party/intel/clang/vpclmulqdqintrin.h b/third_party/intel/clang/vpclmulqdqintrin.h
deleted file mode 100644
index 485692ea2..000000000
--- a/third_party/intel/clang/vpclmulqdqintrin.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VPCLMULQDQINTRIN_H
-#define __VPCLMULQDQINTRIN_H
-
-#define _mm256_clmulepi64_epi128(A, B, I) \
-  ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A),  \
-                                        (__v4di)(__m256i)(B),  \
-                                        (char)(I)))
-
-#ifdef __AVX512FINTRIN_H
-#define _mm512_clmulepi64_epi128(A, B, I) \
-  ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A),  \
-                                        (__v8di)(__m512i)(B),  \
-                                        (char)(I)))
-#endif // __AVX512FINTRIN_H
-
-#endif /* __VPCLMULQDQINTRIN_H */
-
diff --git a/third_party/intel/clang/waitpkgintrin.h b/third_party/intel/clang/waitpkgintrin.h
deleted file mode 100644
index 7ecada4cf..000000000
--- a/third_party/intel/clang/waitpkgintrin.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*===----------------------- waitpkgintrin.h - WAITPKG --------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __WAITPKGINTRIN_H
-#define __WAITPKGINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("waitpkg")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_umonitor (void * __address)
-{
-  __builtin_ia32_umonitor (__address);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_umwait (unsigned int __control, unsigned long long __counter)
-{
-  return __builtin_ia32_umwait (__control,
-    (unsigned int)(__counter >> 32), (unsigned int)__counter);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_tpause (unsigned int __control, unsigned long long __counter)
-{
-  return __builtin_ia32_tpause (__control,
-    (unsigned int)(__counter >> 32), (unsigned int)__counter);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __WAITPKGINTRIN_H */
diff --git a/third_party/intel/clang/wbnoinvdintrin.h b/third_party/intel/clang/wbnoinvdintrin.h
deleted file mode 100644
index cac0347ef..000000000
--- a/third_party/intel/clang/wbnoinvdintrin.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __WBNOINVDINTRIN_H
-#define __WBNOINVDINTRIN_H
-
-static __inline__ void
-  __attribute__((__always_inline__, __nodebug__,  __target__("wbnoinvd")))
-_wbnoinvd (void)
-{
-  __builtin_ia32_wbnoinvd ();
-}
-
-#endif /* __WBNOINVDINTRIN_H */
diff --git a/third_party/intel/clang/wmmintrin.h b/third_party/intel/clang/wmmintrin.h
deleted file mode 100644
index f3121e1c3..000000000
--- a/third_party/intel/clang/wmmintrin.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __WMMINTRIN_H
-#define __WMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "emmintrin.h"
-
-#include "__wmmintrin_aes.h"
-
-#include "__wmmintrin_pclmul.h"
-
-#endif /* __WMMINTRIN_H */
diff --git a/third_party/intel/clang/x86gprintrin.h b/third_party/intel/clang/x86gprintrin.h
deleted file mode 100644
index f8447ed4a..000000000
--- a/third_party/intel/clang/x86gprintrin.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86GPRINTRIN_H
-#define __X86GPRINTRIN_H
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__HRESET__)
-#include "hresetintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__UINTR__)
-#include "uintrintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__USERMSR__)
-#include "usermsrintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CRC32__)
-#include "crc32intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHI__)
-#include "prfchiintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RAOINT__)
-#include "raointintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CMPCCXADD__)
-#include "cmpccxaddintrin.h"
-#endif
-
-#if defined(__i386__)
-#define __SAVE_GPRBX "mov {%%ebx, %%eax |eax, ebx};"
-#define __RESTORE_GPRBX "mov {%%eax, %%ebx |ebx, eax};"
-#define __TMPGPR "eax"
-#else
-// When in 64-bit target, the 32-bit operands generate a 32-bit result,
-// zero-extended to a 64-bit result in the destination general-purpose,
-// It means "mov x %ebx" will clobber the higher 32 bits of rbx, so we
-// should preserve the 64-bit register rbx.
-#define __SAVE_GPRBX "mov {%%rbx, %%rax |rax, rbx};"
-#define __RESTORE_GPRBX "mov {%%rax, %%rbx |rbx, rax};"
-#define __TMPGPR "rax"
-#endif
-
-#define __SSC_MARK(__Tag)                                                      \
-  __asm__ __volatile__( __SAVE_GPRBX                                           \
-                       "mov {%0, %%ebx|ebx, %0}; "                             \
-                       ".byte 0x64, 0x67, 0x90; "                              \
-                        __RESTORE_GPRBX                                        \
-                       ::"i"(__Tag)                                            \
-                       :  __TMPGPR );
-
-#endif /* __X86GPRINTRIN_H */
diff --git a/third_party/intel/clang/x86intrin.h b/third_party/intel/clang/x86intrin.h
deleted file mode 100644
index ceae912cf..000000000
--- a/third_party/intel/clang/x86intrin.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#define __X86INTRIN_H
-
-#include "ia32intrin.h"
-
-#include "immintrin.h"
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHW__)
-#include "prfchwintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE4A__)
-#include "ammintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA4__)
-#include "fma4intrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XOP__)
-#include "xopintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__TBM__)
-#include "tbmintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__LWP__)
-#include "lwpintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MWAITX__)
-#include "mwaitxintrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLZERO__)
-#include "clzerointrin.h"
-#endif
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPRU__)
-#include "rdpruintrin.h"
-#endif
-
-#endif /* __X86INTRIN_H */
diff --git a/third_party/intel/clang/xmmintrin.h b/third_party/intel/clang/xmmintrin.h
deleted file mode 100644
index 6a371c48f..000000000
--- a/third_party/intel/clang/xmmintrin.h
+++ /dev/null
@@ -1,3207 +0,0 @@
-/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __XMMINTRIN_H
-#define __XMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include "mmintrin.h"
-
-typedef int __v4si __attribute__((__vector_size__(16)));
-typedef float __v4sf __attribute__((__vector_size__(16)));
-typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
-
-typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
-
-/* Unsigned types */
-typedef unsigned int __v4su __attribute__((__vector_size__(16)));
-
-/* This header should only be included in a hosted environment as it depends on
- * a standard library to provide allocation routines. */
-#if __STDC_HOSTED__
-#include "mm_malloc.h"
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
-
-/// Adds the 32-bit float values in the low-order bits of the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The lower 32 bits of this operand are used in the calculation.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The lower 32 bits of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
-///    of the lower 32 bits of both operands. The upper 96 bits are copied from
-///    the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ss(__m128 __a, __m128 __b)
-{
-  __a[0] += __b[0];
-  return __a;
-}
-
-/// Adds two 128-bit vectors of [4 x float], and returns the results of
-///    the addition.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the sums of both
-///    operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)((__v4sf)__a + (__v4sf)__b);
-}
-
-/// Subtracts the 32-bit float value in the low-order bits of the second
-///    operand from the corresponding value in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
-///    of this operand are used in the calculation.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
-///    bits of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-///    difference of the lower 32 bits of both operands. The upper 96 bits are
-///    copied from the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ss(__m128 __a, __m128 __b)
-{
-  __a[0] -= __b[0];
-  return __a;
-}
-
-/// Subtracts each of the values of the second operand from the first
-///    operand, both of which are 128-bit vectors of [4 x float] and returns
-///    the results of the subtraction.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the minuend.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing the subtrahend.
-/// \returns A 128-bit vector of [4 x float] containing the differences between
-///    both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)((__v4sf)__a - (__v4sf)__b);
-}
-
-/// Multiplies two 32-bit float values in the low-order bits of the
-///    operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The lower 32 bits of this operand are used in the calculation.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-///    The lower 32 bits of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the product of the lower
-///    32 bits of both operands. The upper 96 bits are copied from the upper 96
-///    bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ss(__m128 __a, __m128 __b)
-{
-  __a[0] *= __b[0];
-  return __a;
-}
-
-/// Multiplies two 128-bit vectors of [4 x float] and returns the
-///    results of the multiplication.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the products of both
-///    operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)((__v4sf)__a * (__v4sf)__b);
-}
-
-/// Divides the value in the low-order 32 bits of the first operand by
-///    the corresponding value in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
-///    bits of this operand are used in the calculation.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
-///    of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the quotients of the
-///    lower 32 bits of both operands. The upper 96 bits are copied from the
-///    upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ss(__m128 __a, __m128 __b)
-{
-  __a[0] /= __b[0];
-  return __a;
-}
-
-/// Divides two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the dividend.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing the divisor.
-/// \returns A 128-bit vector of [4 x float] containing the quotients of both
-///    operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)((__v4sf)__a / (__v4sf)__b);
-}
-
-/// Calculates the square root of the value stored in the low-order bits
-///    of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the square root of the
-///    value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
-  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
-}
-
-/// Calculates the square roots of the values stored in a 128-bit vector
-///    of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the square roots of the
-///    values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
-  return __builtin_ia32_sqrtps((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocal of the value stored in the
-///    low-order bits of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-///    reciprocal of the value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rcp_ss(__m128 __a)
-{
-  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocals of the values stored in a
-///    128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-///    reciprocals of the values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rcp_ps(__m128 __a)
-{
-  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocal of the square root of the value
-///    stored in the low-order bits of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-///    reciprocal of the square root of the value in the low-order bits of the
-///    operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rsqrt_ss(__m128 __a)
-{
-  return __builtin_ia32_rsqrtss((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocals of the square roots of the
-///    values stored in a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-///    reciprocals of the square roots of the values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rsqrt_ps(__m128 __a)
-{
-  return __builtin_ia32_rsqrtps((__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands and returns the lesser value in the low-order bits of the
-///    vector of [4 x float].
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-///    minimum value between both operands. The upper 96 bits are copied from
-///    the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_min_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 128-bit vectors of [4 x float] and returns the lesser
-///    of each pair of values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands.
-/// \returns A 128-bit vector of [4 x float] containing the minimum values
-///    between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_min_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands and returns the greater value in the low-order bits of a 128-bit
-///    vector of [4 x float].
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-///    maximum value between both operands. The upper 96 bits are copied from
-///    the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_max_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 128-bit vectors of [4 x float] and returns the greater
-///    of each pair of values.
-///
-///    If either value in a comparison is NaN, returns the value from \a __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands.
-/// \returns A 128-bit vector of [4 x float] containing the maximum values
-///    between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_max_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector containing one of the source operands.
-/// \param __b
-///    A 128-bit vector containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
-///    values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_and_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)((__v4su)__a & (__v4su)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
-///    the one's complement of the values contained in the first source
-///    operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the first source operand. The
-///    one's complement of this value is used in the bitwise AND.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing the second source operand.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
-///    one's complement of the first operand and the values in the second
-///    operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_andnot_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)(~(__v4su)__a & (__v4su)__b);
-}
-
-/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
-///    values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_or_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)((__v4su)__a | (__v4su)__b);
-}
-
-/// Performs a bitwise exclusive OR of two 128-bit vectors of
-///    [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
-///    of the values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_xor_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)((__v4su)__a ^ (__v4su)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector [4 x float].
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpeq_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] for equality.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpeq_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is less than the
-///    corresponding value in the second operand.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmplt_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are less than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmplt_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is less than or
-///    equal to the corresponding value in the second operand.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
-///    the low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmple_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are less than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmple_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is greater than
-///    the corresponding value in the second operand.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpgt_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_shufflevector((__v4sf)__a,
-                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
-                                         4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpgt_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is greater than
-///    or equal to the corresponding value in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpge_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_shufflevector((__v4sf)__a,
-                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
-                                         4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are greater than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns false.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpge_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both operands
-///    for inequality.
-///
-///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpneq_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] for inequality.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpneq_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is not less than
-///    the corresponding value in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnlt_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are not less than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnlt_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is not less than
-///    or equal to the corresponding value in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnle_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are not less than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnle_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is not greater
-///    than the corresponding value in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpngt_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_shufflevector((__v4sf)__a,
-                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
-                                         4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are not greater than those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpngt_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is not greater
-///    than or equal to the corresponding value in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
-///    low-order bits of a vector of [4 x float].
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnge_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_shufflevector((__v4sf)__a,
-                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
-                                         4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are not greater than or equal to those in the second operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, returns true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnge_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is ordered with
-///    respect to the corresponding value in the second operand.
-///
-///    A pair of floating-point values are ordered with respect to each
-///    other if neither value is a NaN. Each comparison returns 0x0 for false,
-///    0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpord_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are ordered with respect to those in the second operand.
-///
-///    A pair of floating-point values are ordered with respect to each
-///    other if neither value is a NaN. Each comparison returns 0x0 for false,
-///    0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpord_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the value in the first operand is unordered
-///    with respect to the corresponding value in the second operand.
-///
-///    A pair of double-precision values are unordered with respect to each
-///    other if one or both values are NaN. Each comparison returns 0x0 for
-///    false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float] containing one of the operands. The lower
-///    32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-///    in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpunord_ss(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-///    128-bit vectors of [4 x float] to determine if the values in the first
-///    operand are unordered with respect to those in the second operand.
-///
-///    A pair of double-precision values are unordered with respect to each
-///    other if one or both values are NaN. Each comparison returns 0x0 for
-///    false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpunord_ps(__m128 __a, __m128 __b)
-{
-  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comieq_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the first operand is less than the second
-///    operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comilt_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the first operand is less than or equal to the
-///    second operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comile_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the first operand is greater than the second
-///    operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comigt_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the first operand is greater than or equal to
-///    the second operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comige_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands to determine if the first operand is not equal to the second
-///    operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comineq_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine equality.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomieq_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine if the first operand is
-///    less than the second operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomilt_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine if the first operand is
-///    less than or equal to the second operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomile_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine if the first operand is
-///    greater than the second operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomigt_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine if the first operand is
-///    greater than or equal to the second operand.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomige_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine inequality.
-///
-///    The comparison returns 0 for false, 1 for true. If either value in a
-///    comparison is NaN, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \param __b
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the comparison.
-/// \returns An integer containing the comparison results.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomineq_ss(__m128 __a, __m128 __b)
-{
-  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer.
-///
-///    If the converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtss_si32(__m128 __a)
-{
-  return __builtin_ia32_cvtss2si((__v4sf)__a);
-}
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer.
-///
-///    If the converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvt_ss2si(__m128 __a)
-{
-  return _mm_cvtss_si32(__a);
-}
-
-#ifdef __x86_64__
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 64-bit integer.
-///
-///    If the converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the conversion.
-/// \returns A 64-bit integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvtss_si64(__m128 __a)
-{
-  return __builtin_ia32_cvtss2si64((__v4sf)__a);
-}
-
-#endif
-
-/// Converts two low-order float values in a 128-bit vector of
-///    [4 x float] into a 64-bit vector of [2 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtps_pi32(__m128 __a)
-{
-  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
-}
-
-/// Converts two low-order float values in a 128-bit vector of
-///    [4 x float] into a 64-bit vector of [2 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvt_ps2pi(__m128 __a)
-{
-  return _mm_cvtps_pi32(__a);
-}
-
-/// Converts the lower (first) element of a vector of [4 x float] into a signed
-///    truncated (rounded toward zero) 32-bit integer.
-///
-///    If the converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvttss_si32(__m128 __a)
-{
-  return __builtin_ia32_cvttss2si((__v4sf)__a);
-}
-
-/// Converts the lower (first) element of a vector of [4 x float] into a signed
-///    truncated (rounded toward zero) 32-bit integer.
-///
-///    If the converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtt_ss2si(__m128 __a)
-{
-  return _mm_cvttss_si32(__a);
-}
-
-#ifdef __x86_64__
-/// Converts the lower (first) element of a vector of [4 x float] into a signed
-///    truncated (rounded toward zero) 64-bit integer.
-///
-///    If the converted value does not fit in a 64-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the conversion.
-/// \returns A 64-bit integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvttss_si64(__m128 __a)
-{
-  return __builtin_ia32_cvttss2si64((__v4sf)__a);
-}
-#endif
-
-/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
-///    into two signed truncated (rounded toward zero) 32-bit integers,
-///    returned in a 64-bit vector of [2 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
-///   instructions.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvttps_pi32(__m128 __a)
-{
-  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
-}
-
-/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
-///    into two signed truncated (rounded toward zero) 64-bit integers,
-///    returned in a 64-bit vector of [2 x i32].
-///
-///    If a converted value does not fit in a 32-bit integer, raises a
-///    floating-point invalid exception. If the exception is masked, returns
-///    the most negative integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtt_ps2pi(__m128 __a)
-{
-  return _mm_cvttps_pi32(__a);
-}
-
-/// Converts a 32-bit signed integer value into a floating point value
-///    and writes it to the lower 32 bits of the destination. The remaining
-///    higher order elements of the destination vector are copied from the
-///    corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 32-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-///    converted value of the second operand. The upper 96 bits are copied from
-///    the upper 96 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvtsi32_ss(__m128 __a, int __b)
-{
-  __a[0] = __b;
-  return __a;
-}
-
-/// Converts a 32-bit signed integer value into a floating point value
-///    and writes it to the lower 32 bits of the destination. The remaining
-///    higher order elements of the destination are copied from the
-///    corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 32-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-///    converted value of the second operand. The upper 96 bits are copied from
-///    the upper 96 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvt_si2ss(__m128 __a, int __b)
-{
-  return _mm_cvtsi32_ss(__a, __b);
-}
-
-#ifdef __x86_64__
-
-/// Converts a 64-bit signed integer value into a floating point value
-///    and writes it to the lower 32 bits of the destination. The remaining
-///    higher order elements of the destination are copied from the
-///    corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 64-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-///    converted value of the second operand. The upper 96 bits are copied from
-///    the upper 96 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvtsi64_ss(__m128 __a, long long __b)
-{
-  __a[0] = __b;
-  return __a;
-}
-
-#endif
-
-/// Converts two elements of a 64-bit vector of [2 x i32] into two
-///    floating point values and writes them to the lower 64-bits of the
-///    destination. The remaining higher order elements of the destination are
-///    copied from the corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
-///    and written to the corresponding low-order elements in the destination.
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-///    converted value of the second operand. The upper 64 bits are copied from
-///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi32_ps(__m128 __a, __m64 __b)
-{
-  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
-}
-
-/// Converts two elements of a 64-bit vector of [2 x i32] into two
-///    floating point values and writes them to the lower 64-bits of the
-///    destination. The remaining higher order elements of the destination are
-///    copied from the corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float].
-/// \param __b
-///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
-///    and written to the corresponding low-order elements in the destination.
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-///    converted value from the second operand. The upper 64 bits are copied
-///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvt_pi2ps(__m128 __a, __m64 __b)
-{
-  return _mm_cvtpi32_ps(__a, __b);
-}
-
-/// Extracts a float value contained in the lower 32 bits of a vector of
-///    [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-///    used in the extraction.
-/// \returns A 32-bit float containing the extracted value.
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm_cvtss_f32(__m128 __a)
-{
-  return __a[0];
-}
-
-/// Loads two packed float values from the address \a __p into the
-///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
-///     are copied from the low-order bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
-///    of the destination.
-/// \param __p
-///    A pointer to two packed float values. Bits [63:0] are written to bits
-///    [127:64] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadh_pi(__m128 __a, const __m64 *__p)
-{
-  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
-  struct __mm_loadh_pi_struct {
-    __mm_loadh_pi_v2f32 __u;
-  } __attribute__((__packed__, __may_alias__));
-  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
-  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
-  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
-}
-
-/// Loads two packed float values from the address \a __p into the
-///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
-///    are copied from the high-order bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
-///    [127:64] of the destination.
-/// \param __p
-///    A pointer to two packed float values. Bits [63:0] are written to bits
-///    [63:0] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadl_pi(__m128 __a, const __m64 *__p)
-{
-  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
-  struct __mm_loadl_pi_struct {
-    __mm_loadl_pi_v2f32 __u;
-  } __attribute__((__packed__, __may_alias__));
-  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
-  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
-  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-///    32 bits of the vector are initialized with the single-precision
-///    floating-point value loaded from a specified memory location. The upper
-///    96 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 32-bit memory location containing a single-precision
-///    floating-point value.
-/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
-///    lower 32 bits contain the value loaded from the memory location. The
-///    upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_load_ss(const float *__p)
-{
-  struct __mm_load_ss_struct {
-    float __u;
-  } __attribute__((__packed__, __may_alias__));
-  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
-  return __extension__ (__m128){ __u, 0, 0, 0 };
-}
-
-/// Loads a 32-bit float value and duplicates it to all four vector
-///    elements of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
-///    instruction.
-///
-/// \param __p
-///    A pointer to a float value to be loaded and duplicated.
-/// \returns A 128-bit vector of [4 x float] containing the loaded and
-///    duplicated values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_load1_ps(const float *__p)
-{
-  struct __mm_load1_ps_struct {
-    float __u;
-  } __attribute__((__packed__, __may_alias__));
-  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
-  return __extension__ (__m128){ __u, __u, __u, __u };
-}
-
-#define        _mm_load_ps1(p) _mm_load1_ps(p)
-
-/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
-///    memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location has to be 128-bit aligned.
-/// \returns A 128-bit vector of [4 x float] containing the loaded values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_load_ps(const float *__p)
-{
-  return *(const __m128*)__p;
-}
-
-/// Loads a 128-bit floating-point vector of [4 x float] from an
-///    unaligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \returns A 128-bit vector of [4 x float] containing the loaded values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadu_ps(const float *__p)
-{
-  struct __loadu_ps {
-    __m128_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  return ((const struct __loadu_ps*)__p)->__v;
-}
-
-/// Loads four packed float values, in reverse order, from an aligned
-///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
-///    instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location has to be 128-bit aligned.
-/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
-///    in reverse order.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadr_ps(const float *__p)
-{
-  __m128 __a = _mm_load_ps(__p);
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
-}
-
-/// Create a 128-bit vector of [4 x float] with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 128-bit vector of [4 x float] containing undefined values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_undefined_ps(void)
-{
-  return (__m128)__builtin_ia32_undef128();
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-///    32 bits of the vector are initialized with the specified single-precision
-///    floating-point value. The upper 96 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
-///
-/// \param __w
-///    A single-precision floating-point value used to initialize the lower 32
-///    bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
-///    lower 32 bits contain the value provided in the source operand. The
-///    upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ss(float __w)
-{
-  return __extension__ (__m128){ __w, 0, 0, 0 };
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float], with each
-///    of the four single-precision floating-point vector elements set to the
-///    specified single-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
-///
-/// \param __w
-///    A single-precision floating-point value used to initialize each vector
-///    element of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set1_ps(float __w)
-{
-  return __extension__ (__m128){ __w, __w, __w, __w };
-}
-
-/* Microsoft specific. */
-/// Constructs a 128-bit floating-point vector of [4 x float], with each
-///    of the four single-precision floating-point vector elements set to the
-///    specified single-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
-///
-/// \param __w
-///    A single-precision floating-point value used to initialize each vector
-///    element of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps1(float __w)
-{
-    return _mm_set1_ps(__w);
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]
-///    initialized with the specified single-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __z
-///    A single-precision floating-point value used to initialize bits [127:96]
-///    of the result.
-/// \param __y
-///    A single-precision floating-point value used to initialize bits [95:64]
-///    of the result.
-/// \param __x
-///    A single-precision floating-point value used to initialize bits [63:32]
-///    of the result.
-/// \param __w
-///    A single-precision floating-point value used to initialize bits [31:0]
-///    of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps(float __z, float __y, float __x, float __w)
-{
-  return __extension__ (__m128){ __w, __x, __y, __z };
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float],
-///    initialized in reverse order with the specified 32-bit single-precision
-///    float-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-///    instruction.
-///
-/// \param __z
-///    A single-precision floating-point value used to initialize bits [31:0]
-///    of the result.
-/// \param __y
-///    A single-precision floating-point value used to initialize bits [63:32]
-///    of the result.
-/// \param __x
-///    A single-precision floating-point value used to initialize bits [95:64]
-///    of the result.
-/// \param __w
-///    A single-precision floating-point value used to initialize bits [127:96]
-///    of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setr_ps(float __z, float __y, float __x, float __w)
-{
-  return __extension__ (__m128){ __z, __y, __x, __w };
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float] initialized
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
-///
-/// \returns An initialized 128-bit floating-point vector of [4 x float] with
-///    all elements set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setzero_ps(void)
-{
-  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
-}
-
-/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
-///    memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
-///
-/// \param __p
-///    A pointer to a 64-bit memory location.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeh_pi(__m64 *__p, __m128 __a)
-{
-  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
-  struct __mm_storeh_pi_struct {
-    __mm_storeh_pi_v2f32 __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
-}
-
-/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
-///     memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a memory location that will receive the float values.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storel_pi(__m64 *__p, __m128 __a)
-{
-  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
-  struct __mm_storeh_pi_struct {
-    __mm_storeh_pi_v2f32 __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
-}
-
-/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
-///     memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 32-bit memory location.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_ss(float *__p, __m128 __a)
-{
-  struct __mm_store_ss_struct {
-    float __u;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
-}
-
-/// Stores a 128-bit vector of [4 x float] to an unaligned memory
-///    location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location does not have to be aligned.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeu_ps(float *__p, __m128 __a)
-{
-  struct __storeu_ps {
-    __m128_u __v;
-  } __attribute__((__packed__, __may_alias__));
-  ((struct __storeu_ps*)__p)->__v = __a;
-}
-
-/// Stores a 128-bit vector of [4 x float] into an aligned memory
-///    location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location has to be 16-byte aligned.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_ps(float *__p, __m128 __a)
-{
-  *(__m128*)__p = __a;
-}
-
-/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
-///    four contiguous elements in an aligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
-///    instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location.
-/// \param __a
-///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-///    of the four contiguous elements pointed by \a __p.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store1_ps(float *__p, __m128 __a)
-{
-  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
-  _mm_store_ps(__p, __a);
-}
-
-/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
-///    four contiguous elements in an aligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
-///    instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location.
-/// \param __a
-///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-///    of the four contiguous elements pointed by \a __p.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_ps1(float *__p, __m128 __a)
-{
-  _mm_store1_ps(__p, __a);
-}
-
-/// Stores float values from a 128-bit vector of [4 x float] to an
-///    aligned memory location in reverse order.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
-///    instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location has to be 128-bit aligned.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storer_ps(float *__p, __m128 __a)
-{
-  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
-  _mm_store_ps(__p, __a);
-}
-
-#define _MM_HINT_ET0 7
-#define _MM_HINT_ET1 6
-#define _MM_HINT_T0  3
-#define _MM_HINT_T1  2
-#define _MM_HINT_T2  1
-#define _MM_HINT_NTA 0
-
-#ifndef _MSC_VER
-/* FIXME: We have to #define this because "sel" must be a constant integer, and
-   Sema doesn't do any form of constant propagation yet. */
-
-/// Loads one cache line of data from the specified address to a location
-///    closer to the processor.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _mm_prefetch(const void *a, const int sel);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
-///
-/// \param a
-///    A pointer to a memory location containing a cache line of data.
-/// \param sel
-///    A predefined integer constant specifying the type of prefetch
-///    operation: \n
-///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
-///    PREFETCHNTA instruction will be generated. \n
-///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
-///    be generated. \n
-///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
-///    be generated. \n
-///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
-///    be generated.
-#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
-                                                 ((sel) >> 2) & 1, (sel) & 0x3))
-#endif
-
-/// Stores a 64-bit integer in the specified aligned memory location. To
-///    minimize caching, the data is flagged as non-temporal (unlikely to be
-///    used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
-///
-/// \param __p
-///    A pointer to an aligned memory location used to store the register value.
-/// \param __a
-///    A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
-_mm_stream_pi(void *__p, __m64 __a)
-{
-  __builtin_ia32_movntq((__m64 *)__p, __a);
-}
-
-/// Moves packed float values from a 128-bit vector of [4 x float] to a
-///    128-bit aligned memory location. To minimize caching, the data is flagged
-///    as non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
-///
-/// \param __p
-///    A pointer to a 128-bit aligned memory location that will receive the
-///    single-precision floating-point values.
-/// \param __a
-///    A 128-bit vector of [4 x float] containing the values to be moved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ps(void *__p, __m128 __a)
-{
-  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
-}
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// Forces strong memory ordering (serialization) between store
-///    instructions preceding this instruction and store instructions following
-///    this instruction, ensuring the system completes all previous stores
-///    before executing subsequent stores.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
-///
-void _mm_sfence(void);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
-
-/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
-///    returns it, as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_pi16(__m64 a, int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
-///
-/// \param a
-///    A 64-bit vector of [4 x i16].
-/// \param n
-///    An immediate integer operand that determines which bits are extracted: \n
-///    0: Bits [15:0] are copied to the destination. \n
-///    1: Bits [31:16] are copied to the destination. \n
-///    2: Bits [47:32] are copied to the destination. \n
-///    3: Bits [63:48] are copied to the destination.
-/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
-#define _mm_extract_pi16(a, n) \
-  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
-
-/// Copies data from the 64-bit vector of [4 x i16] to the destination,
-///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
-///    specified by the immediate operand \a n.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
-///
-/// \param a
-///    A 64-bit vector of [4 x i16].
-/// \param d
-///    An integer. The lower 16-bit value from this operand is written to the
-///    destination at the offset specified by operand \a n.
-/// \param n
-///    An immediate integer operant that determines which the bits to be used
-///    in the destination. \n
-///    0: Bits [15:0] are copied to the destination. \n
-///    1: Bits [31:16] are copied to the destination. \n
-///    2: Bits [47:32] are copied to the destination. \n
-///    3: Bits [63:48] are copied to the destination.  \n
-///    The remaining bits in the destination are copied from the corresponding
-///    bits in operand \a a.
-/// \returns A 64-bit integer vector containing the copied packed data from the
-///    operands.
-#define _mm_insert_pi16(a, d, n) \
-  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
-
-/// Compares each of the corresponding packed 16-bit integer values of
-///    the 64-bit integer vectors, and writes the greater value to the
-///    corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_max_pi16(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Compares each of the corresponding packed 8-bit unsigned integer
-///    values of the 64-bit integer vectors, and writes the greater value to the
-///    corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_max_pu8(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
-}
-
-/// Compares each of the corresponding packed 16-bit integer values of
-///    the 64-bit integer vectors, and writes the lesser value to the
-///    corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_min_pi16(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Compares each of the corresponding packed 8-bit unsigned integer
-///    values of the 64-bit integer vectors, and writes the lesser value to the
-///    corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_min_pu8(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
-}
-
-/// Takes the most significant bit from each 8-bit element in a 64-bit
-///    integer vector to create an 8-bit mask value. Zero-extends the value to
-///    32-bit integer and writes it to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing the values with bits to be extracted.
-/// \returns The most significant bit from each 8-bit element in \a __a,
-///    written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_MMX
-_mm_movemask_pi8(__m64 __a)
-{
-  return __builtin_ia32_pmovmskb((__v8qi)__a);
-}
-
-/// Multiplies packed 16-bit unsigned integer values and writes the
-///    high-order 16 bits of each 32-bit product to the corresponding bits in
-///    the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_mulhi_pu16(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
-///    destination, as specified by the immediate value operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
-///
-/// \param a
-///    A 64-bit integer vector containing the values to be shuffled.
-/// \param n
-///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from \a a. The destinations within the 64-bit destination are
-///    assigned values as follows: \n
-///    Bits [1:0] are used to assign values to bits [15:0] in the
-///    destination. \n
-///    Bits [3:2] are used to assign values to bits [31:16] in the
-///    destination. \n
-///    Bits [5:4] are used to assign values to bits [47:32] in the
-///    destination. \n
-///    Bits [7:6] are used to assign values to bits [63:48] in the
-///    destination. \n
-///    Bit value assignments: \n
-///    00: assigned from bits [15:0] of \a a. \n
-///    01: assigned from bits [31:16] of \a a. \n
-///    10: assigned from bits [47:32] of \a a. \n
-///    11: assigned from bits [63:48] of \a a. \n
-///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
-///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
-///    <c>[b6, b4, b2, b0]</c>.
-/// \returns A 64-bit integer vector containing the shuffled values.
-#define _mm_shuffle_pi16(a, n) \
-  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
-
-/// Conditionally copies the values from each 8-bit element in the first
-///    64-bit integer vector operand to the specified memory location, as
-///    specified by the most significant bit in the corresponding element in the
-///    second 64-bit integer vector operand.
-///
-///    To minimize caching, the data is flagged as non-temporal
-///    (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
-///
-/// \param __d
-///    A 64-bit integer vector containing the values with elements to be copied.
-/// \param __n
-///    A 64-bit integer vector operand. The most significant bit from each 8-bit
-///    element determines whether the corresponding element in operand \a __d
-///    is copied. If the most significant bit of a given element is 1, the
-///    corresponding element in operand \a __d is copied.
-/// \param __p
-///    A pointer to a 64-bit memory location that will receive the conditionally
-///    copied integer values. The address of the memory location does not have
-///    to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
-_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
-{
-  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
-}
-
-/// Computes the rounded averages of the packed unsigned 8-bit integer
-///    values and writes the averages to the corresponding bits in the
-///    destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_avg_pu8(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
-}
-
-/// Computes the rounded averages of the packed unsigned 16-bit integer
-///    values and writes the averages to the corresponding bits in the
-///    destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_avg_pu16(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Subtracts the corresponding 8-bit unsigned integer values of the two
-///    64-bit vector operands and computes the absolute value for each of the
-///    difference. Then sum of the 8 absolute differences is written to the
-///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
-///
-/// \param __a
-///    A 64-bit integer vector containing one of the source operands.
-/// \param __b
-///    A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
-///    sets of absolute differences between both operands. The upper bits are
-///    cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sad_pu8(__m64 __a, __m64 __b)
-{
-  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
-}
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// Returns the contents of the MXCSR register as a 32-bit unsigned
-///    integer value.
-///
-///    There are several groups of macros associated with this
-///    intrinsic, including:
-///    <ul>
-///    <li>
-///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
-///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
-///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
-///      _MM_GET_EXCEPTION_STATE().
-///    </li>
-///    <li>
-///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
-///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
-///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
-///    </li>
-///    <li>
-///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
-///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
-///      _MM_GET_ROUNDING_MODE().
-///    </li>
-///    <li>
-///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
-///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
-///    </li>
-///    <li>
-///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
-///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
-///      _MM_GET_DENORMALS_ZERO_MODE().
-///    </li>
-///    </ul>
-///
-///    For example, the following expression checks if an overflow exception has
-///    occurred:
-///    \code
-///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
-///    \endcode
-///
-///    The following expression gets the current rounding mode:
-///    \code
-///      _MM_GET_ROUNDING_MODE()
-///    \endcode
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
-///
-/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
-///    register.
-unsigned int _mm_getcsr(void);
-
-/// Sets the MXCSR register with the 32-bit unsigned integer value.
-///
-///    There are several groups of macros associated with this intrinsic,
-///    including:
-///    <ul>
-///    <li>
-///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
-///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
-///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
-///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
-///    </li>
-///    <li>
-///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
-///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
-///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
-///      of these macros.
-///    </li>
-///    <li>
-///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
-///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
-///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
-///    </li>
-///    <li>
-///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
-///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
-///      one of these macros.
-///    </li>
-///    <li>
-///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
-///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
-///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
-///    </li>
-///    </ul>
-///
-///    For example, the following expression causes subsequent floating-point
-///    operations to round up:
-///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
-///
-///    The following example sets the DAZ and FTZ flags:
-///    \code
-///    void setFlags() {
-///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-///    }
-///    \endcode
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
-///
-/// \param __i
-///    A 32-bit unsigned integer value to be written to the MXCSR register.
-void _mm_setcsr(unsigned int __i);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
-
-/// Selects 4 float values from the 128-bit operands of [4 x float], as
-///    specified by the immediate value operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float].
-/// \param b
-///    A 128-bit vector of [4 x float].
-/// \param mask
-///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from \a a and \a b. \n
-///    Bits [3:0] specify the values copied from operand \a a. \n
-///    Bits [7:4] specify the values copied from operand \a b. \n
-///    The destinations within the 128-bit destination are assigned values as
-///    follows: \n
-///    Bits [1:0] are used to assign values to bits [31:0] in the
-///    destination. \n
-///    Bits [3:2] are used to assign values to bits [63:32] in the
-///    destination. \n
-///    Bits [5:4] are used to assign values to bits [95:64] in the
-///    destination. \n
-///    Bits [7:6] are used to assign values to bits [127:96] in the
-///    destination. \n
-///    Bit value assignments: \n
-///    00: Bits [31:0] copied from the specified operand. \n
-///    01: Bits [63:32] copied from the specified operand. \n
-///    10: Bits [95:64] copied from the specified operand. \n
-///    11: Bits [127:96] copied from the specified operand. \n
-///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
-///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
-///    <c>[b6, b4, b2, b0]</c>.
-/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
-#define _mm_shuffle_ps(a, b, mask) \
-  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
-                                 (int)(mask)))
-
-/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
-///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. \n
-///    Bits [95:64] are written to bits [31:0] of the destination. \n
-///    Bits [127:96] are written to bits [95:64] of the destination.
-/// \param __b
-///    A 128-bit vector of [4 x float].
-///    Bits [95:64] are written to bits [63:32] of the destination. \n
-///    Bits [127:96] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpackhi_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
-}
-
-/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
-///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector of [4 x float]. \n
-///    Bits [31:0] are written to bits [31:0] of the destination.  \n
-///    Bits [63:32] are written to bits [95:64] of the destination.
-/// \param __b
-///    A 128-bit vector of [4 x float]. \n
-///    Bits [31:0] are written to bits [63:32] of the destination. \n
-///    Bits [63:32] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpacklo_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-///    32 bits are set to the lower 32 bits of the second parameter. The upper
-///    96 bits are set to the upper 96 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
-///    instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
-///    written to the upper 96 bits of the result.
-/// \param __b
-///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
-///    written to the lower 32 bits of the result.
-/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_move_ss(__m128 __a, __m128 __b)
-{
-  __a[0] = __b[0];
-  return __a;
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-///    64 bits are set to the upper 64 bits of the second parameter. The upper
-///    64 bits are set to the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
-///    written to the upper 64 bits of the result.
-/// \param __b
-///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
-///    written to the lower 64 bits of the result.
-/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movehl_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-///    64 bits are set to the lower 64 bits of the first parameter. The upper
-///    64 bits are set to the lower 64 bits of the second parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
-///    written to the lower 64 bits of the result.
-/// \param __b
-///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
-///    written to the upper 64 bits of the result.
-/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movelh_ps(__m128 __a, __m128 __b)
-{
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
-}
-
-/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
-///    float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
-///    from the corresponding elements in this operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi16_ps(__m64 __a)
-{
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi16(__b, __a);
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
-}
-
-/// Converts a 64-bit vector of 16-bit unsigned integer values into a
-///    128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
-///    destination are copied from the corresponding elements in this operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpu16_ps(__m64 __a)
-{
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
-}
-
-/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
-///    into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
-///    from the corresponding lower 4 elements in this operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi8_ps(__m64 __a)
-{
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi8(__b, __a);
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
-}
-
-/// Converts the lower four unsigned 8-bit integer values from a 64-bit
-///    vector of [8 x u8] into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
-///    destination are copied from the corresponding lower 4 elements in this
-///    operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpu8_ps(__m64 __a)
-{
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
-}
-
-/// Converts the two 32-bit signed integer values from each 64-bit vector
-///    operand of [2 x i32] into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
-///    copied from the elements in this operand.
-/// \param __b
-///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
-///    copied from the elements in this operand.
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-///    copied and converted values from the first operand. The upper 64 bits
-///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
-{
-  __m128 __c;
-
-  __c = _mm_setzero_ps();
-  __c = _mm_cvtpi32_ps(__c, __b);
-  __c = _mm_movelh_ps(__c, __c);
-
-  return _mm_cvtpi32_ps(__c, __a);
-}
-
-/// Converts each single-precision floating-point element of a 128-bit
-///    floating-point vector of [4 x float] into a 16-bit signed integer, and
-///    packs the results into a 64-bit integer vector of [4 x i16].
-///
-///    If the floating-point element is NaN or infinity, or if the
-///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
-///    it is converted to 0x8000. Otherwise if the floating-point element is
-///    greater than 0x7FFF, it is converted to 0x7FFF.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [4 x float].
-/// \returns A 64-bit integer vector of [4 x i16] containing the converted
-///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtps_pi16(__m128 __a)
-{
-  __m64 __b, __c;
-
-  __b = _mm_cvtps_pi32(__a);
-  __a = _mm_movehl_ps(__a, __a);
-  __c = _mm_cvtps_pi32(__a);
-
-  return _mm_packs_pi32(__b, __c);
-}
-
-/// Converts each single-precision floating-point element of a 128-bit
-///    floating-point vector of [4 x float] into an 8-bit signed integer, and
-///    packs the results into the lower 32 bits of a 64-bit integer vector of
-///    [8 x i8]. The upper 32 bits of the vector are set to 0.
-///
-///    If the floating-point element is NaN or infinity, or if the
-///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
-///    is converted to 0x80. Otherwise if the floating-point element is greater
-///    than 0x7F, it is converted to 0x7F.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
-///
-/// \param __a
-///    128-bit floating-point vector of [4 x float].
-/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
-///    converted values and the uppper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtps_pi8(__m128 __a)
-{
-  __m64 __b, __c;
-
-  __b = _mm_cvtps_pi16(__a);
-  __c = _mm_setzero_si64();
-
-  return _mm_packs_pi16(__b, __c);
-}
-
-/// Extracts the sign bits from each single-precision floating-point
-///    element of a 128-bit floating-point vector of [4 x float] and returns the
-///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
-///    to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
-///
-/// \param __a
-///    A 128-bit floating-point vector of [4 x float].
-/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
-///    single-precision floating-point element of the parameter. Bits [31:4] are
-///    set to zero.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_movemask_ps(__m128 __a)
-{
-  return __builtin_ia32_movmskps((__v4sf)__a);
-}
-
-/* Compare */
-#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
-#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
-#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
-#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
-#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
-#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
-#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
-#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
-
-/// Compares each of the corresponding values of two 128-bit vectors of
-///    [4 x float], using the operation specified by the immediate integer
-///    operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float].
-/// \param b
-///    A 128-bit vector of [4 x float].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ps(a, b, c)                                                    \
-  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
-
-/// Compares each of the corresponding scalar values of two 128-bit
-///    vectors of [4 x float], using the operation specified by the immediate
-///    integer operand.
-///
-///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
-///    If either value in a comparison is NaN, comparisons that are ordered
-///    return false, and comparisons that are unordered return true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float].
-/// \param b
-///    A 128-bit vector of [4 x float].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ss(a, b, c)                                                    \
-  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
-
-#define _MM_ALIGN16 __attribute__((aligned(16)))
-
-#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
-
-#define _MM_EXCEPT_INVALID    (0x0001U)
-#define _MM_EXCEPT_DENORM     (0x0002U)
-#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
-#define _MM_EXCEPT_OVERFLOW   (0x0008U)
-#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
-#define _MM_EXCEPT_INEXACT    (0x0020U)
-#define _MM_EXCEPT_MASK       (0x003fU)
-
-#define _MM_MASK_INVALID      (0x0080U)
-#define _MM_MASK_DENORM       (0x0100U)
-#define _MM_MASK_DIV_ZERO     (0x0200U)
-#define _MM_MASK_OVERFLOW     (0x0400U)
-#define _MM_MASK_UNDERFLOW    (0x0800U)
-#define _MM_MASK_INEXACT      (0x1000U)
-#define _MM_MASK_MASK         (0x1f80U)
-
-#define _MM_ROUND_NEAREST     (0x0000U)
-#define _MM_ROUND_DOWN        (0x2000U)
-#define _MM_ROUND_UP          (0x4000U)
-#define _MM_ROUND_TOWARD_ZERO (0x6000U)
-#define _MM_ROUND_MASK        (0x6000U)
-
-#define _MM_FLUSH_ZERO_MASK   (0x8000U)
-#define _MM_FLUSH_ZERO_ON     (0x8000U)
-#define _MM_FLUSH_ZERO_OFF    (0x0000U)
-
-#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
-#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
-#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
-#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
-
-#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
-#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
-#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
-#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
-
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
-do { \
-  __m128 tmp3, tmp2, tmp1, tmp0; \
-  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
-  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
-  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
-  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
-  (row0) = _mm_movelh_ps(tmp0, tmp2); \
-  (row1) = _mm_movehl_ps(tmp2, tmp0); \
-  (row2) = _mm_movelh_ps(tmp1, tmp3); \
-  (row3) = _mm_movehl_ps(tmp3, tmp1); \
-} while (0)
-
-/* Aliases for compatibility. */
-#define _m_pextrw _mm_extract_pi16
-#define _m_pinsrw _mm_insert_pi16
-#define _m_pmaxsw _mm_max_pi16
-#define _m_pmaxub _mm_max_pu8
-#define _m_pminsw _mm_min_pi16
-#define _m_pminub _mm_min_pu8
-#define _m_pmovmskb _mm_movemask_pi8
-#define _m_pmulhuw _mm_mulhi_pu16
-#define _m_pshufw _mm_shuffle_pi16
-#define _m_maskmovq _mm_maskmove_si64
-#define _m_pavgb _mm_avg_pu8
-#define _m_pavgw _mm_avg_pu16
-#define _m_psadbw _mm_sad_pu8
-#define _m_ _mm_
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
-
-/* Ugly hack for backwards-compatibility (compatible with gcc) */
-#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
-#include "emmintrin.h"
-#endif
-
-#endif /* __XMMINTRIN_H */
diff --git a/third_party/intel/clang/xopintrin.h b/third_party/intel/clang/xopintrin.h
deleted file mode 100644
index 976cdf490..000000000
--- a/third_party/intel/clang/xopintrin.h
+++ /dev/null
@@ -1,770 +0,0 @@
-/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __XOPINTRIN_H
-#define __XOPINTRIN_H
-
-#include <fma4intrin.h>
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddw_epi8(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epi8(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epi8(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epi16(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epi16(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epi32(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddw_epu8(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epu8(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epu8(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epu16(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epu16(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epu32(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubw_epi8(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubd_epi16(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubq_epi32(__m128i __A)
-{
-  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
-{
-  return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
-{
-  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi8(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi16(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi32(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi64(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
-}
-
-#define _mm_roti_epi8(A, N) \
-  ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)))
-
-#define _mm_roti_epi16(A, N) \
-  ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)))
-
-#define _mm_roti_epi32(A, N) \
-  ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)))
-
-#define _mm_roti_epi64(A, N) \
-  ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi8(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi16(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi32(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi64(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi8(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi16(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi32(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi64(__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
-}
-
-#define _mm_com_epu8(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
-                                   (__v16qi)(__m128i)(B), (N)))
-
-#define _mm_com_epu16(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
-                                   (__v8hi)(__m128i)(B), (N)))
-
-#define _mm_com_epu32(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
-                                   (__v4si)(__m128i)(B), (N)))
-
-#define _mm_com_epu64(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
-                                   (__v2di)(__m128i)(B), (N)))
-
-#define _mm_com_epi8(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
-                                  (__v16qi)(__m128i)(B), (N)))
-
-#define _mm_com_epi16(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
-                                  (__v8hi)(__m128i)(B), (N)))
-
-#define _mm_com_epi32(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
-                                  (__v4si)(__m128i)(B), (N)))
-
-#define _mm_com_epi64(A, B, N) \
-  ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
-                                  (__v2di)(__m128i)(B), (N)))
-
-#define _MM_PCOMCTRL_LT    0
-#define _MM_PCOMCTRL_LE    1
-#define _MM_PCOMCTRL_GT    2
-#define _MM_PCOMCTRL_GE    3
-#define _MM_PCOMCTRL_EQ    4
-#define _MM_PCOMCTRL_NEQ   5
-#define _MM_PCOMCTRL_FALSE 6
-#define _MM_PCOMCTRL_TRUE  7
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi8(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi16(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi32(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi64(__m128i __A, __m128i __B)
-{
-  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-#define _mm_permute2_pd(X, Y, C, I) \
-  ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
-                                      (__v2df)(__m128d)(Y), \
-                                      (__v2di)(__m128i)(C), (I)))
-
-#define _mm256_permute2_pd(X, Y, C, I) \
-  ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
-                                         (__v4df)(__m256d)(Y), \
-                                         (__v4di)(__m256i)(C), (I)))
-
-#define _mm_permute2_ps(X, Y, C, I) \
-  ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
-                                     (__v4si)(__m128i)(C), (I)))
-
-#define _mm256_permute2_ps(X, Y, C, I) \
-  ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
-                                        (__v8sf)(__m256)(Y), \
-                                        (__v8si)(__m256i)(C), (I)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_frcz_ss(__m128 __A)
-{
-  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_frcz_sd(__m128d __A)
-{
-  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_frcz_ps(__m128 __A)
-{
-  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_frcz_pd(__m128d __A)
-{
-  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_frcz_ps(__m256 __A)
-{
-  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_frcz_pd(__m256d __A)
-{
-  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
-}
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __XOPINTRIN_H */
diff --git a/third_party/intel/clang/xsavecintrin.h b/third_party/intel/clang/xsavecintrin.h
deleted file mode 100644
index 1f2d00120..000000000
--- a/third_party/intel/clang/xsavecintrin.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVECINTRIN_H
-#define __XSAVECINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsavec")))
-
-/// Performs a full or partial save of processor state to the memory at
-///    \a __p. The exact state saved depends on the 64-bit mask \a __m and
-///    processor control register \c XCR0.
-///
-/// \code{.operation}
-/// mask[62:0] := __m[62:0] AND XCR0[62:0]
-/// FOR i := 0 TO 62
-///   IF mask[i] == 1
-///     CASE (i) OF
-///     0: save X87 FPU state
-///     1: save SSE state
-///     DEFAULT: __p.Ext_Save_Area[i] := ProcessorState[i]
-///   FI
-/// ENDFOR
-/// __p.Header.XSTATE_BV[62:0] := INIT_FUNCTION(mask[62:0])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c XSAVEC instruction.
-///
-/// \param __p
-///    Pointer to the save area; must be 64-byte aligned.
-/// \param __m
-///    A 64-bit mask indicating what state should be saved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsavec(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsavec(__p, __m);
-}
-
-#ifdef __x86_64__
-/// Performs a full or partial save of processor state to the memory at
-///    \a __p. The exact state saved depends on the 64-bit mask \a __m and
-///    processor control register \c XCR0.
-///
-/// \code{.operation}
-/// mask[62:0] := __m[62:0] AND XCR0[62:0]
-/// FOR i := 0 TO 62
-///   IF mask[i] == 1
-///     CASE (i) OF
-///     0: save X87 FPU state
-///     1: save SSE state
-///     DEFAULT: __p.Ext_Save_Area[i] := ProcessorState[i]
-///   FI
-/// ENDFOR
-/// __p.Header.XSTATE_BV[62:0] := INIT_FUNCTION(mask[62:0])
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c XSAVEC64 instruction.
-///
-/// \param __p
-///    Pointer to the save area; must be 64-byte aligned.
-/// \param __m
-///    A 64-bit mask indicating what state should be saved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsavec64(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsavec64(__p, __m);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/xsaveintrin.h b/third_party/intel/clang/xsaveintrin.h
deleted file mode 100644
index 9429db6dd..000000000
--- a/third_party/intel/clang/xsaveintrin.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVEINTRIN_H
-#define __XSAVEINTRIN_H
-
-#ifdef _MSC_VER
-#define _XCR_XFEATURE_ENABLED_MASK 0
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsave")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsave(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsave(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstor(void *__p, unsigned long long __m) {
-  __builtin_ia32_xrstor(__p, __m);
-}
-
-#ifndef _MSC_VER
-#define _xgetbv(A) __builtin_ia32_xgetbv((long long)(A))
-#define _xsetbv(A, B) __builtin_ia32_xsetbv((unsigned int)(A), (unsigned long long)(B))
-#else
-#ifdef __cplusplus
-extern "C" {
-#endif
-unsigned __int64 __cdecl _xgetbv(unsigned int);
-void __cdecl _xsetbv(unsigned int, unsigned __int64);
-#ifdef __cplusplus
-}
-#endif
-#endif /* _MSC_VER */
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsave64(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsave64(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstor64(void *__p, unsigned long long __m) {
-  __builtin_ia32_xrstor64(__p, __m);
-}
-
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/xsaveoptintrin.h b/third_party/intel/clang/xsaveoptintrin.h
deleted file mode 100644
index 89a4c44db..000000000
--- a/third_party/intel/clang/xsaveoptintrin.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVEOPTINTRIN_H
-#define __XSAVEOPTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaveopt")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaveopt(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsaveopt(__p, __m);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaveopt64(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsaveopt64(__p, __m);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/xsavesintrin.h b/third_party/intel/clang/xsavesintrin.h
deleted file mode 100644
index 3f99219a2..000000000
--- a/third_party/intel/clang/xsavesintrin.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVESINTRIN_H
-#define __XSAVESINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaves")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaves(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsaves(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstors(void *__p, unsigned long long __m) {
-  __builtin_ia32_xrstors(__p, __m);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstors64(void *__p, unsigned long long __m) {
-  __builtin_ia32_xrstors64(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaves64(void *__p, unsigned long long __m) {
-  __builtin_ia32_xsaves64(__p, __m);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
diff --git a/third_party/intel/clang/xtestintrin.h b/third_party/intel/clang/xtestintrin.h
deleted file mode 100644
index 7d19e3733..000000000
--- a/third_party/intel/clang/xtestintrin.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XTESTINTRIN_H
-#define __XTESTINTRIN_H
-
-/* xtest returns non-zero if the instruction is executed within an RTM or active
- * HLE region. */
-/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is
- * supported. */
-static __inline__ int
-    __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
-    _xtest(void) {
-  return __builtin_ia32_xtest();
-}
-
-#endif
diff --git a/third_party/intel/mm_malloc.internal.h b/third_party/intel/mm_malloc.internal.h
index 2443d5f0b..4a41f80c8 100644
--- a/third_party/intel/mm_malloc.internal.h
+++ b/third_party/intel/mm_malloc.internal.h
@@ -5,7 +5,7 @@
 #ifndef __cplusplus
 extern int posix_memalign (void **, size_t, size_t);
 #else
-extern "C" int posix_memalign (void **, size_t, size_t);
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
 #endif
 static __inline void *
 _mm_malloc (size_t __size, size_t __alignment)
diff --git a/third_party/less/BUILD.mk b/third_party/less/BUILD.mk
index 45990d4ed..2a0ddd0ff 100644
--- a/third_party/less/BUILD.mk
+++ b/third_party/less/BUILD.mk
@@ -26,11 +26,9 @@ THIRD_PARTY_LESS_DIRECTDEPS =				\
 	LIBC_RUNTIME					\
 	LIBC_STDIO					\
 	LIBC_STR					\
-	LIBC_SYSTEM					\
 	LIBC_SYSV					\
-	THIRD_PARTY_MUSL				\
 	THIRD_PARTY_NCURSES				\
-	THIRD_PARTY_PCRE				\
+	THIRD_PARTY_PCRE
 
 THIRD_PARTY_LESS_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_LESS_DIRECTDEPS),$($(x))))
diff --git a/third_party/libcxx/BUILD.mk b/third_party/libcxx/BUILD.mk
index e33b96ee2..22c4a0d84 100644
--- a/third_party/libcxx/BUILD.mk
+++ b/third_party/libcxx/BUILD.mk
@@ -1040,7 +1040,6 @@ third_party/libcxx/ryu/d2s_intrinsics.h \
 third_party/libcxx/ryu/digit_table.h \
 third_party/libcxx/ryu/f2s.h \
 third_party/libcxx/ryu/ryu.h \
-third_party/libcxx/stdfloat \
 
 THIRD_PARTY_LIBCXX_A_SRCS = \
 third_party/libcxx/algorithm.cpp \
@@ -1065,9 +1064,6 @@ third_party/libcxx/ios.instantiations.cpp \
 third_party/libcxx/iostream.cpp \
 third_party/libcxx/legacy_pointer_safety.cpp \
 third_party/libcxx/locale.cpp \
-third_party/libcxx/locale2.cpp \
-third_party/libcxx/locale3.cpp \
-third_party/libcxx/locale4.cpp \
 third_party/libcxx/memory.cpp \
 third_party/libcxx/memory_resource.cpp \
 third_party/libcxx/mutex.cpp \
@@ -1098,11 +1094,11 @@ third_party/libcxx/fs/filesystem_clock.cpp \
 third_party/libcxx/fs/filesystem_error.cpp \
 third_party/libcxx/fs/int128_builtins.cpp \
 third_party/libcxx/fs/operations.cpp \
+third_party/libcxx/fs/cosmo.cpp \
 third_party/libcxx/fs/path.cpp \
 third_party/libcxx/ryu/d2fixed.cpp \
 third_party/libcxx/ryu/d2s.cpp \
 third_party/libcxx/ryu/f2s.cpp \
-third_party/libcxx/errc.cpp \
 
 THIRD_PARTY_LIBCXX_A_HDRS_CHECKEM = \
 third_party/libcxx/__assertion_handler \
@@ -2152,9 +2148,6 @@ $(THIRD_PARTY_LIBCXX_A_OBJS): private				\
 			-DLIBCXX_BUILDING_LIBCXXABI		\
 			-D_LIBCPP_BUILDING_LIBRARY
 
-o/$(MODE)/third_party/libcxx/locale.o: private			\
-		OVERRIDE_COPTS += -O -g0
-
 THIRD_PARTY_LIBCXX_LIBS = $(foreach x,$(THIRD_PARTY_LIBCXX_ARTIFACTS),$($(x)))
 THIRD_PARTY_LIBCXX_SRCS = $(foreach x,$(THIRD_PARTY_LIBCXX_ARTIFACTS),$($(x)_SRCS))
 THIRD_PARTY_LIBCXX_HDRS = $(foreach x,$(THIRD_PARTY_LIBCXX_ARTIFACTS),$($(x)_HDRS))
diff --git a/third_party/libcxx/__system_error/errc.h b/third_party/libcxx/__system_error/errc.h
index 8f6532ec9..33a2645d9 100644
--- a/third_party/libcxx/__system_error/errc.h
+++ b/third_party/libcxx/__system_error/errc.h
@@ -141,7 +141,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // This leads to the odd pushing and popping of the deprecated
 // diagnostic.
 _LIBCPP_DECLARE_STRONG_ENUM(errc){
-  success = 0,
     address_family_not_supported = 65536, //       = EAFNOSUPPORT,
           address_in_use, //                     = EADDRINUSE,
           address_not_available, //              = EADDRNOTAVAIL,
@@ -222,9 +221,6 @@ _LIBCPP_DECLARE_STRONG_ENUM(errc){
     wrong_protocol_type};
 _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(errc)
 
-errc __err_to_errc(int) noexcept;
-int __errc_to_err(errc) noexcept;
-
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___ERRC
diff --git a/third_party/libcxx/__system_error/error_code.h b/third_party/libcxx/__system_error/error_code.h
index 4f6e14945..475f2bb96 100644
--- a/third_party/libcxx/__system_error/error_code.h
+++ b/third_party/libcxx/__system_error/error_code.h
@@ -46,9 +46,8 @@ class _LIBCPP_EXPORTED_FROM_ABI error_code {
 
 public:
   _LIBCPP_HIDE_FROM_ABI error_code() _NOEXCEPT : __val_(0), __cat_(&system_category()) {}
-  _LIBCPP_HIDE_FROM_ABI error_code(errc __val) _NOEXCEPT : __val_(__errc_to_err(__val)), __cat_(&system_category()) {}
-  _LIBCPP_HIDE_FROM_ABI error_code(int __val, const error_category& __cat) _NOEXCEPT : __val_(__errc_to_err((errc)__val)), __cat_(&__cat) {}
-  _LIBCPP_HIDE_FROM_ABI error_code(errc __val, const error_category& __cat) _NOEXCEPT : __val_(__errc_to_err(__val)), __cat_(&__cat) {}
+
+  _LIBCPP_HIDE_FROM_ABI error_code(int __val, const error_category& __cat) _NOEXCEPT : __val_(__val), __cat_(&__cat) {}
 
   template <class _Ep, __enable_if_t<is_error_code_enum<_Ep>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI error_code(_Ep __e) _NOEXCEPT {
@@ -73,7 +72,7 @@ public:
     __cat_ = &system_category();
   }
 
-  _LIBCPP_HIDE_FROM_ABI int value() const _NOEXCEPT { return __errc_to_err((errc)__val_); }
+  _LIBCPP_HIDE_FROM_ABI int value() const _NOEXCEPT { return __val_; }
 
   _LIBCPP_HIDE_FROM_ABI const error_category& category() const _NOEXCEPT { return *__cat_; }
 
@@ -87,17 +86,13 @@ public:
 };
 
 inline _LIBCPP_HIDE_FROM_ABI error_code make_error_code(errc __e) _NOEXCEPT {
-  return error_code(__e, generic_category());
+  return error_code(static_cast<int>(__e), generic_category());
 }
 
 inline _LIBCPP_HIDE_FROM_ABI bool operator==(const error_code& __x, const error_code& __y) _NOEXCEPT {
   return __x.category() == __y.category() && __x.value() == __y.value();
 }
 
-inline _LIBCPP_HIDE_FROM_ABI bool operator==(const error_code& __x, errc __y) _NOEXCEPT {
-  return __x == error_code(__y, __x.category());
-}
-
 inline _LIBCPP_HIDE_FROM_ABI bool operator==(const error_code& __x, const error_condition& __y) _NOEXCEPT {
   return __x.category().equivalent(__x.value(), __y) || __y.category().equivalent(__x, __y.value());
 }
diff --git a/third_party/libcxx/__system_error/error_condition.h b/third_party/libcxx/__system_error/error_condition.h
index 9003bc919..42898c1f0 100644
--- a/third_party/libcxx/__system_error/error_condition.h
+++ b/third_party/libcxx/__system_error/error_condition.h
@@ -54,12 +54,8 @@ class _LIBCPP_EXPORTED_FROM_ABI error_condition {
 public:
   _LIBCPP_HIDE_FROM_ABI error_condition() _NOEXCEPT : __val_(0), __cat_(&generic_category()) {}
 
-  _LIBCPP_HIDE_FROM_ABI error_condition(errc __val, const error_category& __cat) _NOEXCEPT
-      : __val_(__errc_to_err(__val)),
-        __cat_(&__cat) {}
-
   _LIBCPP_HIDE_FROM_ABI error_condition(int __val, const error_category& __cat) _NOEXCEPT
-      : __val_(__errc_to_err((errc)__val)),
+      : __val_(__val),
         __cat_(&__cat) {}
 
   template <class _Ep, __enable_if_t<is_error_condition_enum<_Ep>::value, int> = 0>
@@ -85,7 +81,7 @@ public:
     __cat_ = &generic_category();
   }
 
-  _LIBCPP_HIDE_FROM_ABI int value() const _NOEXCEPT { return __errc_to_err((errc)__val_); }
+  _LIBCPP_HIDE_FROM_ABI int value() const _NOEXCEPT { return __val_; }
 
   _LIBCPP_HIDE_FROM_ABI const error_category& category() const _NOEXCEPT { return *__cat_; }
   string message() const;
@@ -94,7 +90,7 @@ public:
 };
 
 inline _LIBCPP_HIDE_FROM_ABI error_condition make_error_condition(errc __e) _NOEXCEPT {
-  return error_condition(__e, generic_category());
+  return error_condition(static_cast<int>(__e), generic_category());
 }
 
 inline _LIBCPP_HIDE_FROM_ABI bool operator==(const error_condition& __x, const error_condition& __y) _NOEXCEPT {
diff --git a/third_party/libcxx/__system_error/system_error.h b/third_party/libcxx/__system_error/system_error.h
index ff41ea9a7..362e67505 100644
--- a/third_party/libcxx/__system_error/system_error.h
+++ b/third_party/libcxx/__system_error/system_error.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___SYSTEM_ERROR_SYSTEM_ERROR_H
 
 #include <__config>
-#include <__system_error/errc.h>
 #include <__system_error/error_category.h>
 #include <__system_error/error_code.h>
 #include <__verbose_abort>
diff --git a/third_party/libcxx/errc.cpp b/third_party/libcxx/fs/cosmo.cpp
similarity index 95%
rename from third_party/libcxx/errc.cpp
rename to third_party/libcxx/fs/cosmo.cpp
index 101fe97d6..fbb2ab1d4 100644
--- a/third_party/libcxx/errc.cpp
+++ b/third_party/libcxx/fs/cosmo.cpp
@@ -1,8 +1,11 @@
-#include <__system_error/errc.h>
+#ifdef __COSMOPOLITAN__
+#include <filesystem>
 
-_LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_FILESYSTEM
 
-static std::errc __err_to_errc_impl(int err) noexcept {
+namespace detail {
+
+std::errc __cosmo_err_to_errc_impl(int err) {
   if (err == EAFNOSUPPORT) return errc::address_family_not_supported;
   if (err == EADDRINUSE) return errc::address_in_use;
   if (err == EADDRNOTAVAIL) return errc::address_not_available;
@@ -84,7 +87,7 @@ static std::errc __err_to_errc_impl(int err) noexcept {
   return errc::not_supported;
 }
 
-static int __errc_to_err_impl(std::errc err) noexcept {
+int __cosmo_errc_to_err_impl(std::errc err) {
   if (err == errc::address_family_not_supported) return EAFNOSUPPORT;
   if (err == errc::address_in_use) return EADDRINUSE;
   if (err == errc::address_not_available) return EADDRNOTAVAIL;
@@ -166,20 +169,20 @@ static int __errc_to_err_impl(std::errc err) noexcept {
   return ENOTSUP;
 }
 
-std::errc __err_to_errc(int err) noexcept {
-  if (!err)
-    return (std::errc)0;
+std::errc __cosmo_err_to_errc(int err) {
   if (err >= 65536)
     return (std::errc)err;
-  return __err_to_errc_impl(err);
+  return __cosmo_err_to_errc_impl(err);
 }
 
-int __errc_to_err(std::errc err) noexcept {
-  if (!(int)err)
-    return 0;
+int __cosmo_errc_to_err(std::errc err) {
   if ((int)err < 65536)
     return (int)err;
-  return __errc_to_err_impl(err);
+  return __cosmo_errc_to_err_impl(err);
 }
 
-_LIBCPP_END_NAMESPACE_STD
+} // end namespace detail
+
+_LIBCPP_END_NAMESPACE_FILESYSTEM
+
+#endif // __COSMOPOLITAN__
diff --git a/third_party/libcxx/fs/directory_iterator.cpp b/third_party/libcxx/fs/directory_iterator.cpp
index a82816c60..a7ffa9188 100644
--- a/third_party/libcxx/fs/directory_iterator.cpp
+++ b/third_party/libcxx/fs/directory_iterator.cpp
@@ -49,7 +49,7 @@ public:
     if (__stream_ == INVALID_HANDLE_VALUE) {
       ec                                  = detail::make_windows_error(GetLastError());
       const bool ignore_permission_denied = bool(opts & directory_options::skip_permission_denied);
-      if (ignore_permission_denied && ec == errc::permission_denied)
+      if (ignore_permission_denied && ec.value() == static_cast<int>(errc::permission_denied))
         ec.clear();
       return;
     }
@@ -118,7 +118,11 @@ public:
     if ((__stream_ = ::opendir(root.c_str())) == nullptr) {
       ec                      = detail::capture_errno();
       const bool allow_eacces = bool(opts & directory_options::skip_permission_denied);
-      if (allow_eacces && ec == errc::permission_denied)
+#ifdef __COSMOPOLITAN__
+      if (allow_eacces && ec.value() == (int)errc::permission_denied)
+#else
+      if (allow_eacces && ec.value() == EACCES)
+#endif
         ec.clear();
       return;
     }
@@ -307,7 +311,11 @@ bool recursive_directory_iterator::__try_recursion(error_code* ec) {
   }
   if (m_ec) {
     const bool allow_eacess = bool(__imp_->__options_ & directory_options::skip_permission_denied);
-    if (m_ec == errc::permission_denied && allow_eacess) {
+#ifdef __COSMOPOLITAN__
+    if (m_ec.value() == (int)errc::permission_denied && allow_eacess) {
+#else
+    if (m_ec.value() == EACCES && allow_eacess) {
+#endif
       if (ec)
         ec->clear();
     } else {
diff --git a/third_party/libcxx/fs/error.h b/third_party/libcxx/fs/error.h
index ecbfc1f3f..e74d06917 100644
--- a/third_party/libcxx/fs/error.h
+++ b/third_party/libcxx/fs/error.h
@@ -98,9 +98,16 @@ inline errc __win_err_to_errc(int err) {
 
 #endif // _LIBCPP_WIN32API
 
+errc __cosmo_err_to_errc(int);
+int __cosmo_errc_to_err(errc);
+
 inline error_code capture_errno() {
   _LIBCPP_ASSERT_INTERNAL(errno != 0, "Expected errno to be non-zero");
-  return error_code(__errc_to_err((errc)errno), generic_category());
+#ifdef __COSMOPOLITAN__
+  return error_code((int)__cosmo_err_to_errc(errno), generic_category());
+#else
+  return error_code(errno, generic_category());
+#endif
 }
 
 #if defined(_LIBCPP_WIN32API)
diff --git a/third_party/libcxx/fs/file_descriptor.h b/third_party/libcxx/fs/file_descriptor.h
index 55c313658..d41fe77bb 100644
--- a/third_party/libcxx/fs/file_descriptor.h
+++ b/third_party/libcxx/fs/file_descriptor.h
@@ -194,8 +194,12 @@ inline perms posix_get_perms(const StatT& st) noexcept { return static_cast<perm
 inline file_status create_file_status(error_code& m_ec, path const& p, const StatT& path_stat, error_code* ec) {
   if (ec)
     *ec = m_ec;
-  if (m_ec && (m_ec == errc::no_such_file_or_directory ||
-               m_ec == errc::not_a_directory)) {
+#ifdef __COSMOPOLITAN__
+  if (m_ec && (m_ec.value() == (int)errc::no_such_file_or_directory ||
+               m_ec.value() == (int)errc::not_a_directory)) {
+#else
+  if (m_ec && (m_ec.value() == ENOENT || m_ec.value() == ENOTDIR)) {
+#endif
     return file_status(file_type::not_found);
   } else if (m_ec) {
     ErrorHandler<void> err("posix_stat", ec, &p);
diff --git a/third_party/libcxx/fs/operations.cpp b/third_party/libcxx/fs/operations.cpp
index 07f661412..a83c1ae15 100644
--- a/third_party/libcxx/fs/operations.cpp
+++ b/third_party/libcxx/fs/operations.cpp
@@ -37,10 +37,7 @@
 #include <fcntl.h> /* values for fchmodat */
 #include <time.h>
 
-#if defined(__COSMOPOLITAN__)
-#  include <fstream>
-#  define _LIBCPP_FILESYSTEM_USE_FSTREAM
-#elif __has_include(<sys/sendfile.h>)
+#if __has_include(<sys/sendfile.h>)
 #  include <sys/sendfile.h>
 #  define _LIBCPP_FILESYSTEM_USE_SENDFILE
 #elif defined(__APPLE__) || __has_include(<copyfile.h>)
diff --git a/third_party/libcxx/ios b/third_party/libcxx/ios
index 92c32f203..d8a3643c7 100644
--- a/third_party/libcxx/ios
+++ b/third_party/libcxx/ios
@@ -13,7 +13,6 @@
 /*
     ios synopsis
 
-#include "third_party/libcxx/__system_error/error_code.h"
 #include <iosfwd>
 
 namespace std
diff --git a/third_party/libcxx/locale.cpp b/third_party/libcxx/locale.cpp
index ea8e89f7d..490deccf5 100644
--- a/third_party/libcxx/locale.cpp
+++ b/third_party/libcxx/locale.cpp
@@ -5643,6 +5643,75 @@ void moneypunct_byname<wchar_t, true>::init(const char* nm) {
 }
 #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
 
+void __do_nothing(void*) {}
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS collate<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS collate<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_get<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_get<wchar_t>;)
+
+template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_get<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_get<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_put<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_put<wchar_t>;)
+
+template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_put<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_put<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get_byname<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get_byname<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put_byname<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put_byname<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<char, false>;
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<char, true>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<wchar_t, false>;)
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<wchar_t, true>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<char, false>;
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<char, true>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<wchar_t, false>;)
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<wchar_t, true>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_get<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_get<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_get<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_get<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_put<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_put<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_put<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_put<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages_byname<char>;
+_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages_byname<wchar_t>;)
+
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<char, char, mbstate_t>;
+_LIBCPP_IF_WIDE_CHARACTERS(
+    template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<wchar_t, char, mbstate_t>;)
+template class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS
+    codecvt_byname<char16_t, char, mbstate_t>;
+template class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS
+    codecvt_byname<char32_t, char, mbstate_t>;
+#ifndef _LIBCPP_HAS_NO_CHAR8_T
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<char16_t, char8_t, mbstate_t>;
+template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<char32_t, char8_t, mbstate_t>;
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
diff --git a/third_party/libcxx/locale2.cpp b/third_party/libcxx/locale2.cpp
deleted file mode 100644
index 0e9778f1d..000000000
--- a/third_party/libcxx/locale2.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <__utility/no_destroy.h>
-#include <algorithm>
-#include <clocale>
-#include <codecvt>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <locale>
-#include <new>
-#include <string>
-#include <type_traits>
-#include <typeinfo>
-#include <utility>
-#include <vector>
-
-#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-#  include <cwctype>
-#endif
-
-#if defined(_AIX)
-#  include <sys/localedef.h> // for __lc_ctype_ptr
-#endif
-
-#if defined(_LIBCPP_MSVCRT)
-#  define _CTYPE_DISABLE_MACROS
-#endif
-
-#if !defined(_LIBCPP_MSVCRT) && !defined(__MINGW32__) && !defined(__BIONIC__) && !defined(__NuttX__)
-#  include <langinfo.h>
-#endif
-
-#include "atomic_support.h"
-#include "sso_allocator.h"
-
-// On Linux, wint_t and wchar_t have different signed-ness, and this causes
-// lots of noise in the build log, but no bugs that I know of.
-_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wsign-conversion")
-
-_LIBCPP_PUSH_MACROS
-#include <__undef_macros>
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-void __do_nothing(void*) {}
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS collate<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS collate<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_get<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_get<wchar_t>;)
-
-template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_get<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_get<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_put<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS num_put<wchar_t>;)
-
-_LIBCPP_END_NAMESPACE_STD
-
-_LIBCPP_POP_MACROS
diff --git a/third_party/libcxx/locale3.cpp b/third_party/libcxx/locale3.cpp
deleted file mode 100644
index c185d1e44..000000000
--- a/third_party/libcxx/locale3.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <__utility/no_destroy.h>
-#include <algorithm>
-#include <clocale>
-#include <codecvt>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <locale>
-#include <new>
-#include <string>
-#include <type_traits>
-#include <typeinfo>
-#include <utility>
-#include <vector>
-
-#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-#  include <cwctype>
-#endif
-
-#if defined(_AIX)
-#  include <sys/localedef.h> // for __lc_ctype_ptr
-#endif
-
-#if defined(_LIBCPP_MSVCRT)
-#  define _CTYPE_DISABLE_MACROS
-#endif
-
-#if !defined(_LIBCPP_MSVCRT) && !defined(__MINGW32__) && !defined(__BIONIC__) && !defined(__NuttX__)
-#  include <langinfo.h>
-#endif
-
-#include "atomic_support.h"
-#include "sso_allocator.h"
-
-// On Linux, wint_t and wchar_t have different signed-ness, and this causes
-// lots of noise in the build log, but no bugs that I know of.
-_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wsign-conversion")
-
-_LIBCPP_PUSH_MACROS
-#include <__undef_macros>
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<char, false>;
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<char, true>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<wchar_t, false>;)
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct<wchar_t, true>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<char, false>;
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<char, true>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<wchar_t, false>;)
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS moneypunct_byname<wchar_t, true>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_get<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_get<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_get<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_get<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_put<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS money_put<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_put<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __money_put<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages_byname<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS messages_byname<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<char, char, mbstate_t>;
-_LIBCPP_IF_WIDE_CHARACTERS(
-    template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<wchar_t, char, mbstate_t>;)
-template class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS
-    codecvt_byname<char16_t, char, mbstate_t>;
-template class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS
-    codecvt_byname<char32_t, char, mbstate_t>;
-#ifndef _LIBCPP_HAS_NO_CHAR8_T
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<char16_t, char8_t, mbstate_t>;
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname<char32_t, char8_t, mbstate_t>;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-_LIBCPP_POP_MACROS
diff --git a/third_party/libcxx/locale4.cpp b/third_party/libcxx/locale4.cpp
deleted file mode 100644
index 3e6d757e2..000000000
--- a/third_party/libcxx/locale4.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <__utility/no_destroy.h>
-#include <algorithm>
-#include <clocale>
-#include <codecvt>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <locale>
-#include <new>
-#include <string>
-#include <type_traits>
-#include <typeinfo>
-#include <utility>
-#include <vector>
-
-#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-#  include <cwctype>
-#endif
-
-#if defined(_AIX)
-#  include <sys/localedef.h> // for __lc_ctype_ptr
-#endif
-
-#if defined(_LIBCPP_MSVCRT)
-#  define _CTYPE_DISABLE_MACROS
-#endif
-
-#if !defined(_LIBCPP_MSVCRT) && !defined(__MINGW32__) && !defined(__BIONIC__) && !defined(__NuttX__)
-#  include <langinfo.h>
-#endif
-
-#include "atomic_support.h"
-#include "sso_allocator.h"
-
-// On Linux, wint_t and wchar_t have different signed-ness, and this causes
-// lots of noise in the build log, but no bugs that I know of.
-_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wsign-conversion")
-
-_LIBCPP_PUSH_MACROS
-#include <__undef_macros>
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_put<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template struct _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __num_put<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get_byname<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_get_byname<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put<wchar_t>;)
-
-template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put_byname<char>;
-_LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS time_put_byname<wchar_t>;)
-
-_LIBCPP_END_NAMESPACE_STD
-
-_LIBCPP_POP_MACROS
diff --git a/third_party/libcxx/stdfloat b/third_party/libcxx/stdfloat
deleted file mode 100644
index a3a27dcb5..000000000
--- a/third_party/libcxx/stdfloat
+++ /dev/null
@@ -1,26 +0,0 @@
-// -*- C++ -*-
-#pragma once
-
-export namespace std {
-
-#if defined(__STDCPP_FLOAT16_T__)
-  using float16_t = _Float16;
-#endif
-
-#if defined(__STDCPP_FLOAT32_T__)
-  using float32_t = float;
-#endif
-
-#if defined(__STDCPP_FLOAT64_T__)
-  using float64_t = double;
-#endif
-
-#if defined(__STDCPP_FLOAT128_T__)
-  using float128_t = long double;
-#endif
-
-#if defined(__STDCPP_BFLOAT16_T__)
-  using bfloat16_t = __bf16;
-#endif
-
-} // namespace std
diff --git a/third_party/libcxx/string b/third_party/libcxx/string
index b1756794b..2d5a1154a 100644
--- a/third_party/libcxx/string
+++ b/third_party/libcxx/string
@@ -10,8 +10,6 @@
 #ifndef _LIBCPP_STRING
 #define _LIBCPP_STRING
 
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-
 // clang-format off
 
 /*
diff --git a/third_party/libcxx/system_error b/third_party/libcxx/system_error
index 2d3af910d..eeab34778 100644
--- a/third_party/libcxx/system_error
+++ b/third_party/libcxx/system_error
@@ -146,8 +146,8 @@ template <> struct hash<std::error_condition>;
 
 #include <__config>
 #include <__system_error/errc.h>
-#include <__system_error/error_code.h>
 #include <__system_error/error_category.h>
+#include <__system_error/error_code.h>
 #include <__system_error/error_condition.h>
 #include <__system_error/system_error.h>
 #include <version>
diff --git a/third_party/libcxx/system_error.cpp b/third_party/libcxx/system_error.cpp
index 0e5bf2d4f..6f6b417a9 100644
--- a/third_party/libcxx/system_error.cpp
+++ b/third_party/libcxx/system_error.cpp
@@ -16,7 +16,6 @@
 #include <string.h>
 #include <string>
 #include <system_error>
-#include <__system_error/errc.h>
 
 #include "config_elast.h"
 
@@ -24,6 +23,10 @@
 #  include <android/api-level.h>
 #endif
 
+#ifdef __COSMOPOLITAN__
+#include <fs/error.h>
+#endif
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace {
@@ -36,7 +39,9 @@ string do_strerror_r(int ev);
 
 #  if defined(_LIBCPP_MSVCRT_LIKE)
 string do_strerror_r(int ev) {
-  ev = __errc_to_err(ev);
+#ifdef __COSMOPOLITAN__
+  ev = (int)filesystem::detail::__cosmo_errc_to_err(ev);
+#endif
   char buffer[strerror_buff_size];
   if (::strerror_s(buffer, strerror_buff_size, ev) == 0)
     return string(buffer);
@@ -83,7 +88,9 @@ string do_strerror_r(int ev) {
   // Preserve errno around the call. (The C++ standard requires that
   // system_error functions not modify errno).
   const int old_errno       = errno;
-  ev = __errc_to_err((errc)ev);
+#ifdef __COSMOPOLITAN__
+  ev = filesystem::detail::__cosmo_errc_to_err((errc)ev);
+#endif
   const char* error_message = handle_strerror_r_return(::strerror_r(ev, buffer, strerror_buff_size), buffer);
   // If we didn't get any message, print one now.
   if (!error_message[0]) {
@@ -132,7 +139,9 @@ public:
 const char* __generic_error_category::name() const noexcept { return "generic"; }
 
 string __generic_error_category::message(int ev) const {
-  ev = __errc_to_err((errc)ev);
+#ifdef __COSMOPOLITAN__
+  ev = filesystem::detail::__cosmo_errc_to_err((errc)ev);
+#endif
 #ifdef _LIBCPP_ELAST
   if (ev > _LIBCPP_ELAST)
     return string("unspecified generic_category error");
@@ -160,7 +169,9 @@ public:
 const char* __system_error_category::name() const noexcept { return "system"; }
 
 string __system_error_category::message(int ev) const {
-  ev = __errc_to_err((errc)ev);
+#ifdef __COSMOPOLITAN__
+  ev = filesystem::detail::__cosmo_errc_to_err((errc)ev);
+#endif
 #ifdef _LIBCPP_ELAST
   if (ev > _LIBCPP_ELAST)
     return string("unspecified system_category error");
@@ -169,7 +180,9 @@ string __system_error_category::message(int ev) const {
 }
 
 error_condition __system_error_category::default_error_condition(int ev) const noexcept {
-  ev = __errc_to_err((errc)ev);
+#ifdef __COSMOPOLITAN__
+  ev = filesystem::detail::__cosmo_errc_to_err((errc)ev);
+#endif
 #ifdef _LIBCPP_ELAST
   if (ev > _LIBCPP_ELAST)
     return error_condition(ev, system_category());
@@ -218,7 +231,11 @@ system_error::~system_error() noexcept {}
 
 void __throw_system_error(int ev, const char* what_arg) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  std::__throw_system_error(error_code(__errc_to_err((errc)ev), system_category()), what_arg);
+#ifdef __COSMOPOLITAN__
+  std::__throw_system_error(error_code((int)filesystem::detail::__cosmo_err_to_errc(ev), system_category()), what_arg);
+#else
+  std::__throw_system_error(error_code(ev, system_category()), what_arg);
+#endif
 #else
   // The above could also handle the no-exception case, but for size, avoid referencing system_category() unnecessarily.
   _LIBCPP_VERBOSE_ABORT(
diff --git a/third_party/libcxxabi/cxa_guard_impl.h b/third_party/libcxxabi/cxa_guard_impl.h
index 568170507..6a2d25569 100644
--- a/third_party/libcxxabi/cxa_guard_impl.h
+++ b/third_party/libcxxabi/cxa_guard_impl.h
@@ -338,10 +338,7 @@ public:
       return true;
 
     if (has_thread_id_support)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
       *thread_id_address = current_thread_id.get();
-#pragma GCC diagnostic pop
 
     *init_byte_address = PENDING_BIT;
     return false;
diff --git a/third_party/libunwind/AddressSpace.hpp b/third_party/libunwind/AddressSpace.hpp
index 26ee10c52..f7c627141 100644
--- a/third_party/libunwind/AddressSpace.hpp
+++ b/third_party/libunwind/AddressSpace.hpp
@@ -94,12 +94,12 @@ namespace libunwind {
 //   __eh_frame_hdr_start = SIZEOF(.eh_frame_hdr) > 0 ? ADDR(.eh_frame_hdr) : 0;
 //   __eh_frame_hdr_end = SIZEOF(.eh_frame_hdr) > 0 ? . : 0;
 
-extern char __eh_frame_start __attribute__((__weak__)); // [jart]
-extern char __eh_frame_end __attribute__((__weak__)); // [jart]
+extern char __eh_frame_start;
+extern char __eh_frame_end;
 
 #if defined(_LIBUNWIND_SUPPORT_DWARF_INDEX)
-extern char __eh_frame_hdr_start __attribute__((__weak__)); // [jart]
-extern char __eh_frame_hdr_end __attribute__((__weak__)); // [jart]
+extern char __eh_frame_hdr_start;
+extern char __eh_frame_hdr_end;
 #endif
 
 #elif defined(_LIBUNWIND_ARM_EHABI) && defined(_LIBUNWIND_IS_BAREMETAL)
diff --git a/third_party/libunwind/BUILD.mk b/third_party/libunwind/BUILD.mk
index 560df58b4..242d4f8d1 100644
--- a/third_party/libunwind/BUILD.mk
+++ b/third_party/libunwind/BUILD.mk
@@ -20,7 +20,6 @@ THIRD_PARTY_LIBUNWIND_A_HDRS =						\
 	third_party/libunwind/include/__libunwind_config.h		\
 	third_party/libunwind/include/libunwind.h			\
 	third_party/libunwind/include/unwind.h				\
-	third_party/libunwind/assembly.h				\
 	third_party/libunwind/config.h					\
 	third_party/libunwind/cet_unwind.h				\
 	third_party/libunwind/dwarf2.h					\
@@ -36,23 +35,18 @@ THIRD_PARTY_LIBUNWIND_A_SRCS_CC =					\
 	third_party/libunwind/libunwind.cc
 
 THIRD_PARTY_LIBUNWIND_A_SRCS_C =					\
+	third_party/libunwind/Unwind-sjlj.c				\
 	third_party/libunwind/UnwindLevel1-gcc-ext.c			\
 	third_party/libunwind/UnwindLevel1.c				\
 	third_party/libunwind/gcc_personality_v0.c
 
-THIRD_PARTY_LIBUNWIND_A_SRCS_S =					\
-	third_party/libunwind/UnwindRegistersRestore.S			\
-	third_party/libunwind/UnwindRegistersSave.S			\
-
 THIRD_PARTY_LIBUNWIND_A_SRCS =						\
 	$(THIRD_PARTY_LIBUNWIND_A_SRCS_C)				\
-	$(THIRD_PARTY_LIBUNWIND_A_SRCS_CC)				\
-	$(THIRD_PARTY_LIBUNWIND_A_SRCS_S)				\
+	$(THIRD_PARTY_LIBUNWIND_A_SRCS_CC)
 
 THIRD_PARTY_LIBUNWIND_A_OBJS =						\
 	$(THIRD_PARTY_LIBUNWIND_A_SRCS_C:%.c=o/$(MODE)/%.o)		\
-	$(THIRD_PARTY_LIBUNWIND_A_SRCS_CC:%.cc=o/$(MODE)/%.o)		\
-	$(THIRD_PARTY_LIBUNWIND_A_SRCS_S:%.S=o/$(MODE)/%.o)		\
+	$(THIRD_PARTY_LIBUNWIND_A_SRCS_CC:%.cc=o/$(MODE)/%.o)
 
 THIRD_PARTY_LIBUNWIND_A_CHECKS =					\
 	$(THIRD_PARTY_LIBUNWIND_A).pkg					\
@@ -61,9 +55,7 @@ THIRD_PARTY_LIBUNWIND_A_CHECKS =					\
 THIRD_PARTY_LIBUNWIND_A_DIRECTDEPS =					\
 	LIBC_CALLS							\
 	LIBC_INTRIN							\
-	LIBC_STDIO							\
-	LIBC_MEM							\
-	LIBC_THREAD							\
+	LIBC_STDIO
 
 THIRD_PARTY_LIBUNWIND_A_DEPS :=						\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_LIBUNWIND_A_DIRECTDEPS),$($(x))))
@@ -83,20 +75,7 @@ $(THIRD_PARTY_LIBUNWIND_A_OBJS): private				\
 			-fno-sanitize=all				\
 			-ffunction-sections				\
 			-fdata-sections					\
-			-D_LIBUNWIND_USE_DLADDR=0			\
-			-D_LIBUNWIND_IS_BAREMETAL=1			\
-
-# avoid cyclic dependency on libcxxabi
-o/$(MODE)/third_party/libunwind/libunwind.o:				\
-		COPTS +=						\
-			-fno-rtti					\
-
-o/$(MODE)/third_party/libunwind/UnwindRegistersRestore.o: third_party/libunwind/UnwindRegistersRestore.S
-	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
-o/$(MODE)/third_party/libunwind/UnwindRegistersSave.o: third_party/libunwind/UnwindRegistersSave.S
-	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
-
-$(THIRD_PARTY_LIBUNWIND_A_OBJS): third_party/libunwind/BUILD.mk
+			-D_LIBUNWIND_USE_DLADDR=0
 
 THIRD_PARTY_LIBUNWIND_LIBS = $(foreach x,$(THIRD_PARTY_LIBUNWIND_ARTIFACTS),$($(x)))
 THIRD_PARTY_LIBUNWIND_SRCS = $(foreach x,$(THIRD_PARTY_LIBUNWIND_ARTIFACTS),$($(x)_SRCS))
diff --git a/third_party/libunwind/Unwind-sjlj.c b/third_party/libunwind/Unwind-sjlj.c
new file mode 100644
index 000000000..514358e5b
--- /dev/null
+++ b/third_party/libunwind/Unwind-sjlj.c
@@ -0,0 +1,530 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+//  Implements setjump-longjump based C++ exceptions
+//
+//===----------------------------------------------------------------------===//
+
+#include "third_party/libunwind/include/unwind.h"
+
+#include "libc/isystem/inttypes.h"
+#include "libc/isystem/stdint.h"
+#include "libc/isystem/stdbool.h"
+#include "libc/isystem/stdlib.h"
+
+#include "third_party/libunwind/config.h"
+
+/// With SJLJ based exceptions, any function that has a catch clause or needs to
+/// do any clean up when an exception propagates through it, needs to call
+/// \c _Unwind_SjLj_Register at the start of the function and
+/// \c _Unwind_SjLj_Unregister at the end.  The register function is called with
+/// the address of a block of memory in the function's stack frame.  The runtime
+/// keeps a linked list (stack) of these blocks - one per thread.  The calling
+/// function also sets the personality and lsda fields of the block.
+
+#if defined(_LIBUNWIND_BUILD_SJLJ_APIS)
+
+typedef uintptr_t _Unwind_Word __attribute__((__mode__(__unwind_word__)));
+
+struct _Unwind_FunctionContext {
+  // next function in stack of handlers
+  struct _Unwind_FunctionContext *prev;
+
+#if defined(__ve__)
+  // VE requires to store 64 bit pointers in the buffer for SjLj exception.
+  // We expand the size of values defined here.  This size must be matched
+  // to the size returned by TargetMachine::getSjLjDataSize().
+
+  // set by calling function before registering to be the landing pad
+  uint64_t                        resumeLocation;
+
+  // set by personality handler to be parameters passed to landing pad function
+  uint64_t                        resumeParameters[4];
+#else
+  // set by calling function before registering to be the landing pad
+  uint32_t                        resumeLocation;
+
+  // set by personality handler to be parameters passed to landing pad function
+  _Unwind_Word                    resumeParameters[4];
+#endif
+
+  // set by calling function before registering
+  _Unwind_Personality_Fn personality;          // arm offset=24
+  uintptr_t                       lsda;        // arm offset=28
+
+  // variable length array, contains registers to restore
+  // 0 = r7, 1 = pc, 2 = sp
+  void                           *jbuf[];
+};
+
+#if defined(_LIBUNWIND_HAS_NO_THREADS)
+# define _LIBUNWIND_THREAD_LOCAL
+#else
+# if __STDC_VERSION__ >= 201112L
+#  define _LIBUNWIND_THREAD_LOCAL _Thread_local
+# elif defined(_MSC_VER)
+#  define _LIBUNWIND_THREAD_LOCAL __declspec(thread)
+# elif defined(__GNUC__) || defined(__clang__)
+#  define _LIBUNWIND_THREAD_LOCAL __thread
+# else
+#  error Unable to create thread local storage
+# endif
+#endif
+
+
+#if !defined(FOR_DYLD)
+
+#if defined(__APPLE__)
+#include <System/pthread_machdep.h>
+#else
+static _LIBUNWIND_THREAD_LOCAL struct _Unwind_FunctionContext *stack = NULL;
+#endif
+
+static struct _Unwind_FunctionContext *__Unwind_SjLj_GetTopOfFunctionStack() {
+#if defined(__APPLE__)
+  return _pthread_getspecific_direct(__PTK_LIBC_DYLD_Unwind_SjLj_Key);
+#else
+  return stack;
+#endif
+}
+
+static void
+__Unwind_SjLj_SetTopOfFunctionStack(struct _Unwind_FunctionContext *fc) {
+#if defined(__APPLE__)
+  _pthread_setspecific_direct(__PTK_LIBC_DYLD_Unwind_SjLj_Key, fc);
+#else
+  stack = fc;
+#endif
+}
+
+#endif
+
+
+/// Called at start of each function that catches exceptions
+_LIBUNWIND_EXPORT void
+_Unwind_SjLj_Register(struct _Unwind_FunctionContext *fc) {
+  fc->prev = __Unwind_SjLj_GetTopOfFunctionStack();
+  __Unwind_SjLj_SetTopOfFunctionStack(fc);
+}
+
+
+/// Called at end of each function that catches exceptions
+_LIBUNWIND_EXPORT void
+_Unwind_SjLj_Unregister(struct _Unwind_FunctionContext *fc) {
+  __Unwind_SjLj_SetTopOfFunctionStack(fc->prev);
+}
+
+
+static _Unwind_Reason_Code
+unwind_phase1(struct _Unwind_Exception *exception_object) {
+  _Unwind_FunctionContext_t c = __Unwind_SjLj_GetTopOfFunctionStack();
+  _LIBUNWIND_TRACE_UNWINDING("unwind_phase1: initial function-context=%p",
+                             (void *)c);
+
+  // walk each frame looking for a place to stop
+  for (bool handlerNotFound = true; handlerNotFound; c = c->prev) {
+
+    // check for no more frames
+    if (c == NULL) {
+      _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): reached "
+                                 "bottom => _URC_END_OF_STACK",
+                                 (void *)exception_object);
+      return _URC_END_OF_STACK;
+    }
+
+    _LIBUNWIND_TRACE_UNWINDING("unwind_phase1: function-context=%p", (void *)c);
+    // if there is a personality routine, ask it if it will want to stop at this
+    // frame
+    if (c->personality != NULL) {
+      _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): calling "
+                                 "personality function %p",
+                                 (void *)exception_object,
+                                 (void *)c->personality);
+      _Unwind_Reason_Code personalityResult = (*c->personality)(
+          1, _UA_SEARCH_PHASE, exception_object->exception_class,
+          exception_object, (struct _Unwind_Context *)c);
+      switch (personalityResult) {
+      case _URC_HANDLER_FOUND:
+        // found a catch clause or locals that need destructing in this frame
+        // stop search and remember function context
+        handlerNotFound = false;
+        exception_object->private_2 = (uintptr_t) c;
+        _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): "
+                                   "_URC_HANDLER_FOUND",
+                                   (void *)exception_object);
+        return _URC_NO_REASON;
+
+      case _URC_CONTINUE_UNWIND:
+        _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): "
+                                   "_URC_CONTINUE_UNWIND",
+                                   (void *)exception_object);
+        // continue unwinding
+        break;
+
+      default:
+        // something went wrong
+        _LIBUNWIND_TRACE_UNWINDING(
+            "unwind_phase1(ex_ojb=%p): _URC_FATAL_PHASE1_ERROR",
+            (void *)exception_object);
+        return _URC_FATAL_PHASE1_ERROR;
+      }
+    }
+  }
+  return _URC_NO_REASON;
+}
+
+
+static _Unwind_Reason_Code
+unwind_phase2(struct _Unwind_Exception *exception_object) {
+  _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p)",
+                             (void *)exception_object);
+
+  // walk each frame until we reach where search phase said to stop
+  _Unwind_FunctionContext_t c = __Unwind_SjLj_GetTopOfFunctionStack();
+  while (true) {
+    _LIBUNWIND_TRACE_UNWINDING("unwind_phase2s(ex_ojb=%p): context=%p",
+                               (void *)exception_object, (void *)c);
+
+    // check for no more frames
+    if (c == NULL) {
+      _LIBUNWIND_TRACE_UNWINDING(
+          "unwind_phase2(ex_ojb=%p): __unw_step() reached "
+          "bottom => _URC_END_OF_STACK",
+          (void *)exception_object);
+      return _URC_END_OF_STACK;
+    }
+
+    // if there is a personality routine, tell it we are unwinding
+    if (c->personality != NULL) {
+      _Unwind_Action action = _UA_CLEANUP_PHASE;
+      if ((uintptr_t) c == exception_object->private_2)
+        action = (_Unwind_Action)(
+            _UA_CLEANUP_PHASE |
+            _UA_HANDLER_FRAME); // tell personality this was the frame it marked
+                                // in phase 1
+      _Unwind_Reason_Code personalityResult =
+          (*c->personality)(1, action, exception_object->exception_class,
+                            exception_object, (struct _Unwind_Context *)c);
+      switch (personalityResult) {
+      case _URC_CONTINUE_UNWIND:
+        // continue unwinding
+        _LIBUNWIND_TRACE_UNWINDING(
+            "unwind_phase2(ex_ojb=%p): _URC_CONTINUE_UNWIND",
+            (void *)exception_object);
+        if ((uintptr_t) c == exception_object->private_2) {
+          // phase 1 said we would stop at this frame, but we did not...
+          _LIBUNWIND_ABORT("during phase1 personality function said it would "
+                           "stop here, but now if phase2 it did not stop here");
+        }
+        break;
+      case _URC_INSTALL_CONTEXT:
+        _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): "
+                                   "_URC_INSTALL_CONTEXT, will resume at "
+                                   "landing pad %p",
+                                   (void *)exception_object, c->jbuf[1]);
+        // personality routine says to transfer control to landing pad
+        // we may get control back if landing pad calls _Unwind_Resume()
+        __Unwind_SjLj_SetTopOfFunctionStack(c);
+        __builtin_longjmp(c->jbuf, 1);
+        // __unw_resume() only returns if there was an error
+        return _URC_FATAL_PHASE2_ERROR;
+      default:
+        // something went wrong
+        _LIBUNWIND_DEBUG_LOG("personality function returned unknown result %d",
+                      personalityResult);
+        return _URC_FATAL_PHASE2_ERROR;
+      }
+    }
+    c = c->prev;
+  }
+
+  // clean up phase did not resume at the frame that the search phase said it
+  // would
+  return _URC_FATAL_PHASE2_ERROR;
+}
+
+
+static _Unwind_Reason_Code
+unwind_phase2_forced(struct _Unwind_Exception *exception_object,
+                     _Unwind_Stop_Fn stop, void *stop_parameter) {
+  // walk each frame until we reach where search phase said to stop
+  _Unwind_FunctionContext_t c = __Unwind_SjLj_GetTopOfFunctionStack();
+  while (true) {
+
+    // get next frame (skip over first which is _Unwind_RaiseException)
+    if (c == NULL) {
+      _LIBUNWIND_TRACE_UNWINDING(
+          "unwind_phase2(ex_ojb=%p): __unw_step() reached "
+          "bottom => _URC_END_OF_STACK",
+          (void *)exception_object);
+      return _URC_END_OF_STACK;
+    }
+
+    // call stop function at each frame
+    _Unwind_Action action =
+        (_Unwind_Action)(_UA_FORCE_UNWIND | _UA_CLEANUP_PHASE);
+    _Unwind_Reason_Code stopResult =
+        (*stop)(1, action, exception_object->exception_class, exception_object,
+                (struct _Unwind_Context *)c, stop_parameter);
+    _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): "
+                               "stop function returned %d",
+                               (void *)exception_object, stopResult);
+    if (stopResult != _URC_NO_REASON) {
+      _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): "
+                                 "stopped by stop function",
+                                 (void *)exception_object);
+      return _URC_FATAL_PHASE2_ERROR;
+    }
+
+    // if there is a personality routine, tell it we are unwinding
+    if (c->personality != NULL) {
+      _Unwind_Personality_Fn p = (_Unwind_Personality_Fn)c->personality;
+      _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): "
+                                 "calling personality function %p",
+                                 (void *)exception_object, (void *)p);
+      _Unwind_Reason_Code personalityResult =
+          (*p)(1, action, exception_object->exception_class, exception_object,
+               (struct _Unwind_Context *)c);
+      switch (personalityResult) {
+      case _URC_CONTINUE_UNWIND:
+        _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p):  "
+                                   "personality returned _URC_CONTINUE_UNWIND",
+                                   (void *)exception_object);
+        // destructors called, continue unwinding
+        break;
+      case _URC_INSTALL_CONTEXT:
+        _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): "
+                                   "personality returned _URC_INSTALL_CONTEXT",
+                                   (void *)exception_object);
+        // we may get control back if landing pad calls _Unwind_Resume()
+        __Unwind_SjLj_SetTopOfFunctionStack(c);
+        __builtin_longjmp(c->jbuf, 1);
+        break;
+      default:
+        // something went wrong
+        _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): "
+                                   "personality returned %d, "
+                                   "_URC_FATAL_PHASE2_ERROR",
+                                   (void *)exception_object, personalityResult);
+        return _URC_FATAL_PHASE2_ERROR;
+      }
+    }
+    c = c->prev;
+  }
+
+  // call stop function one last time and tell it we've reached the end of the
+  // stack
+  _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): calling stop "
+                             "function with _UA_END_OF_STACK",
+                             (void *)exception_object);
+  _Unwind_Action lastAction =
+      (_Unwind_Action)(_UA_FORCE_UNWIND | _UA_CLEANUP_PHASE | _UA_END_OF_STACK);
+  (*stop)(1, lastAction, exception_object->exception_class, exception_object,
+          (struct _Unwind_Context *)c, stop_parameter);
+
+  // clean up phase did not resume at the frame that the search phase said it
+  // would
+  return _URC_FATAL_PHASE2_ERROR;
+}
+
+
+/// Called by __cxa_throw.  Only returns if there is a fatal error
+_LIBUNWIND_EXPORT _Unwind_Reason_Code
+_Unwind_SjLj_RaiseException(struct _Unwind_Exception *exception_object) {
+  _LIBUNWIND_TRACE_API("_Unwind_SjLj_RaiseException(ex_obj=%p)",
+                       (void *)exception_object);
+
+  // mark that this is a non-forced unwind, so _Unwind_Resume() can do the right
+  // thing
+  exception_object->private_1 = 0;
+  exception_object->private_2 = 0;
+
+  // phase 1: the search phase
+  _Unwind_Reason_Code phase1 = unwind_phase1(exception_object);
+  if (phase1 != _URC_NO_REASON)
+    return phase1;
+
+  // phase 2: the clean up phase
+  return unwind_phase2(exception_object);
+}
+
+
+
+/// When _Unwind_RaiseException() is in phase2, it hands control
+/// to the personality function at each frame.  The personality
+/// may force a jump to a landing pad in that function, the landing
+/// pad code may then call _Unwind_Resume() to continue with the
+/// unwinding.  Note: the call to _Unwind_Resume() is from compiler
+/// generated user code.  All other _Unwind_* routines are called
+/// by the C++ runtime __cxa_* routines.
+///
+/// Re-throwing an exception is implemented by having the code call
+/// __cxa_rethrow() which in turn calls _Unwind_Resume_or_Rethrow()
+_LIBUNWIND_EXPORT void
+_Unwind_SjLj_Resume(struct _Unwind_Exception *exception_object) {
+  _LIBUNWIND_TRACE_API("_Unwind_SjLj_Resume(ex_obj=%p)",
+                       (void *)exception_object);
+
+  if (exception_object->private_1 != 0)
+    unwind_phase2_forced(exception_object,
+                         (_Unwind_Stop_Fn) exception_object->private_1,
+                         (void *)exception_object->private_2);
+  else
+    unwind_phase2(exception_object);
+
+  // clients assume _Unwind_Resume() does not return, so all we can do is abort.
+  _LIBUNWIND_ABORT("_Unwind_SjLj_Resume() can't return");
+}
+
+
+///  Called by __cxa_rethrow().
+_LIBUNWIND_EXPORT _Unwind_Reason_Code
+_Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *exception_object) {
+  _LIBUNWIND_TRACE_API("__Unwind_SjLj_Resume_or_Rethrow(ex_obj=%p), "
+                       "private_1=%" PRIuPTR,
+                       (void *)exception_object, exception_object->private_1);
+  // If this is non-forced and a stopping place was found, then this is a
+  // re-throw.
+  // Call _Unwind_RaiseException() as if this was a new exception.
+  if (exception_object->private_1 == 0) {
+    return _Unwind_SjLj_RaiseException(exception_object);
+    // should return if there is no catch clause, so that __cxa_rethrow can call
+    // std::terminate()
+  }
+
+  // Call through to _Unwind_Resume() which distinguishes between forced and
+  // regular exceptions.
+  _Unwind_SjLj_Resume(exception_object);
+  _LIBUNWIND_ABORT("__Unwind_SjLj_Resume_or_Rethrow() called "
+                    "_Unwind_SjLj_Resume() which unexpectedly returned");
+}
+
+
+/// Called by personality handler during phase 2 to get LSDA for current frame.
+_LIBUNWIND_EXPORT uintptr_t
+_Unwind_GetLanguageSpecificData(struct _Unwind_Context *context) {
+  _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context;
+  _LIBUNWIND_TRACE_API("_Unwind_GetLanguageSpecificData(context=%p) "
+                       "=> 0x%" PRIuPTR,
+                       (void *)context, ufc->lsda);
+  return ufc->lsda;
+}
+
+
+/// Called by personality handler during phase 2 to get register values.
+_LIBUNWIND_EXPORT uintptr_t _Unwind_GetGR(struct _Unwind_Context *context,
+                                          int index) {
+  _LIBUNWIND_TRACE_API("_Unwind_GetGR(context=%p, reg=%d)", (void *)context,
+                       index);
+  _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context;
+  return ufc->resumeParameters[index];
+}
+
+
+/// Called by personality handler during phase 2 to alter register values.
+_LIBUNWIND_EXPORT void _Unwind_SetGR(struct _Unwind_Context *context, int index,
+                                     uintptr_t new_value) {
+  _LIBUNWIND_TRACE_API("_Unwind_SetGR(context=%p, reg=%d, value=0x%" PRIuPTR
+                       ")",
+                       (void *)context, index, new_value);
+  _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context;
+  ufc->resumeParameters[index] = new_value;
+}
+
+
+/// Called by personality handler during phase 2 to get instruction pointer.
+_LIBUNWIND_EXPORT uintptr_t _Unwind_GetIP(struct _Unwind_Context *context) {
+  _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context;
+  _LIBUNWIND_TRACE_API("_Unwind_GetIP(context=%p) => 0x%" PRIu32,
+                       (void *)context, ufc->resumeLocation + 1);
+  return ufc->resumeLocation + 1;
+}
+
+
+/// Called by personality handler during phase 2 to get instruction pointer.
+/// ipBefore is a boolean that says if IP is already adjusted to be the call
+/// site address.  Normally IP is the return address.
+_LIBUNWIND_EXPORT uintptr_t _Unwind_GetIPInfo(struct _Unwind_Context *context,
+                                              int *ipBefore) {
+  _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context;
+  *ipBefore = 0;
+  _LIBUNWIND_TRACE_API("_Unwind_GetIPInfo(context=%p, %p) => 0x%" PRIu32,
+                       (void *)context, (void *)ipBefore,
+                       ufc->resumeLocation + 1);
+  return ufc->resumeLocation + 1;
+}
+
+
+/// Called by personality handler during phase 2 to alter instruction pointer.
+_LIBUNWIND_EXPORT void _Unwind_SetIP(struct _Unwind_Context *context,
+                                     uintptr_t new_value) {
+  _LIBUNWIND_TRACE_API("_Unwind_SetIP(context=%p, value=0x%" PRIuPTR ")",
+                       (void *)context, new_value);
+  _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context;
+  ufc->resumeLocation = new_value - 1;
+}
+
+
+/// Called by personality handler during phase 2 to find the start of the
+/// function.
+_LIBUNWIND_EXPORT uintptr_t
+_Unwind_GetRegionStart(struct _Unwind_Context *context) {
+  // Not supported or needed for sjlj based unwinding
+  (void)context;
+  _LIBUNWIND_TRACE_API("_Unwind_GetRegionStart(context=%p)", (void *)context);
+  return 0;
+}
+
+
+/// Called by personality handler during phase 2 if a foreign exception
+/// is caught.
+_LIBUNWIND_EXPORT void
+_Unwind_DeleteException(struct _Unwind_Exception *exception_object) {
+  _LIBUNWIND_TRACE_API("_Unwind_DeleteException(ex_obj=%p)",
+                       (void *)exception_object);
+  if (exception_object->exception_cleanup != NULL)
+    (*exception_object->exception_cleanup)(_URC_FOREIGN_EXCEPTION_CAUGHT,
+                                           exception_object);
+}
+
+
+
+/// Called by personality handler during phase 2 to get base address for data
+/// relative encodings.
+_LIBUNWIND_EXPORT uintptr_t
+_Unwind_GetDataRelBase(struct _Unwind_Context *context) {
+  // Not supported or needed for sjlj based unwinding
+  (void)context;
+  _LIBUNWIND_TRACE_API("_Unwind_GetDataRelBase(context=%p)", (void *)context);
+  _LIBUNWIND_ABORT("_Unwind_GetDataRelBase() not implemented");
+}
+
+
+/// Called by personality handler during phase 2 to get base address for text
+/// relative encodings.
+_LIBUNWIND_EXPORT uintptr_t
+_Unwind_GetTextRelBase(struct _Unwind_Context *context) {
+  // Not supported or needed for sjlj based unwinding
+  (void)context;
+  _LIBUNWIND_TRACE_API("_Unwind_GetTextRelBase(context=%p)", (void *)context);
+  _LIBUNWIND_ABORT("_Unwind_GetTextRelBase() not implemented");
+}
+
+
+/// Called by personality handler to get "Call Frame Area" for current frame.
+_LIBUNWIND_EXPORT uintptr_t _Unwind_GetCFA(struct _Unwind_Context *context) {
+  _LIBUNWIND_TRACE_API("_Unwind_GetCFA(context=%p)", (void *)context);
+  if (context != NULL) {
+    _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context;
+    // Setjmp/longjmp based exceptions don't have a true CFA.
+    // Instead, the SP in the jmpbuf is the closest approximation.
+    return (uintptr_t) ufc->jbuf[2];
+  }
+  return 0;
+}
+
+#endif // defined(_LIBUNWIND_BUILD_SJLJ_APIS)
diff --git a/third_party/libunwind/UnwindRegistersRestore.S b/third_party/libunwind/UnwindRegistersRestore.S
deleted file mode 100644
index 180a66582..000000000
--- a/third_party/libunwind/UnwindRegistersRestore.S
+++ /dev/null
@@ -1,1256 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "assembly.h"
-
-#define FROM_0_TO_15 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-#define FROM_16_TO_31 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-
-#define FROM_0_TO_31 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-#define FROM_32_TO_63 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63
-
-#if defined(_AIX)
-  .toc
-#else
-  .text
-#endif
-
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
-
-#if defined(__i386__)
-DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_jumpto)
-#
-# extern "C" void __libunwind_Registers_x86_jumpto(Registers_x86 *);
-#
-# On entry:
-#  +                       +
-#  +-----------------------+
-#  + thread_state pointer  +
-#  +-----------------------+
-#  + return address        +
-#  +-----------------------+   <-- SP
-#  +                       +
-
-  _LIBUNWIND_CET_ENDBR
-  movl   4(%esp), %eax
-  # set up eax and ret on new stack location
-  movl  28(%eax), %edx # edx holds new stack pointer
-  subl  $8,%edx
-  movl  %edx, 28(%eax)
-  movl  0(%eax), %ebx
-  movl  %ebx, 0(%edx)
-  movl  40(%eax), %ebx
-  movl  %ebx, 4(%edx)
-  # we now have ret and eax pushed onto where new stack will be
-  # restore all registers
-  movl   4(%eax), %ebx
-  movl   8(%eax), %ecx
-  movl  12(%eax), %edx
-  movl  16(%eax), %edi
-  movl  20(%eax), %esi
-  movl  24(%eax), %ebp
-  movl  28(%eax), %esp
-  # skip ss
-  # skip eflags
-  pop    %eax  # eax was already pushed on new stack
-  pop    %ecx
-  jmp    *%ecx
-  # skip cs
-  # skip ds
-  # skip es
-  # skip fs
-  # skip gs
-
-#elif defined(__x86_64__)
-
-DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_64_jumpto)
-#
-# extern "C" void __libunwind_Registers_x86_64_jumpto(Registers_x86_64 *);
-#
-#if defined(_WIN64)
-# On entry, thread_state pointer is in rcx; move it into rdi
-# to share restore code below. Since this routine restores and
-# overwrites all registers, we can use the same registers for
-# pointers and temporaries as on unix even though win64 normally
-# mustn't clobber some of them.
-  movq  %rcx, %rdi
-#else
-# On entry, thread_state pointer is in rdi
-#endif
-
-  _LIBUNWIND_CET_ENDBR
-  movq  56(%rdi), %rax # rax holds new stack pointer
-  subq  $16, %rax
-  movq  %rax, 56(%rdi)
-  movq  32(%rdi), %rbx  # store new rdi on new stack
-  movq  %rbx, 0(%rax)
-  movq  128(%rdi), %rbx # store new rip on new stack
-  movq  %rbx, 8(%rax)
-  # restore all registers
-  movq    0(%rdi), %rax
-  movq    8(%rdi), %rbx
-  movq   16(%rdi), %rcx
-  movq   24(%rdi), %rdx
-  # restore rdi later
-  movq   40(%rdi), %rsi
-  movq   48(%rdi), %rbp
-  # restore rsp later
-  movq   64(%rdi), %r8
-  movq   72(%rdi), %r9
-  movq   80(%rdi), %r10
-  movq   88(%rdi), %r11
-  movq   96(%rdi), %r12
-  movq  104(%rdi), %r13
-  movq  112(%rdi), %r14
-  movq  120(%rdi), %r15
-  # skip rflags
-  # skip cs
-  # skip fs
-  # skip gs
-
-#if defined(_WIN64)
-  movdqu 176(%rdi),%xmm0
-  movdqu 192(%rdi),%xmm1
-  movdqu 208(%rdi),%xmm2
-  movdqu 224(%rdi),%xmm3
-  movdqu 240(%rdi),%xmm4
-  movdqu 256(%rdi),%xmm5
-  movdqu 272(%rdi),%xmm6
-  movdqu 288(%rdi),%xmm7
-  movdqu 304(%rdi),%xmm8
-  movdqu 320(%rdi),%xmm9
-  movdqu 336(%rdi),%xmm10
-  movdqu 352(%rdi),%xmm11
-  movdqu 368(%rdi),%xmm12
-  movdqu 384(%rdi),%xmm13
-  movdqu 400(%rdi),%xmm14
-  movdqu 416(%rdi),%xmm15
-#endif
-  movq  56(%rdi), %rsp  # cut back rsp to new location
-  pop    %rdi      # rdi was saved here earlier
-  pop    %rcx
-  jmpq   *%rcx
-
-
-#elif defined(__powerpc64__)
-
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_ppc646jumptoEv)
-//
-// void libunwind::Registers_ppc64::jumpto()
-//
-// On entry:
-//  thread_state pointer is in r3
-//
-
-// load register (GPR)
-#define PPC64_LR(n) \
-  ld    n, (8 * (n + 2))(3)
-
-  // restore integral registers
-  // skip r0 for now
-  // skip r1 for now
-  PPC64_LR(2)
-  // skip r3 for now
-  // skip r4 for now
-  // skip r5 for now
-  PPC64_LR(6)
-  PPC64_LR(7)
-  PPC64_LR(8)
-  PPC64_LR(9)
-  PPC64_LR(10)
-  PPC64_LR(11)
-  PPC64_LR(12)
-  PPC64_LR(13)
-  PPC64_LR(14)
-  PPC64_LR(15)
-  PPC64_LR(16)
-  PPC64_LR(17)
-  PPC64_LR(18)
-  PPC64_LR(19)
-  PPC64_LR(20)
-  PPC64_LR(21)
-  PPC64_LR(22)
-  PPC64_LR(23)
-  PPC64_LR(24)
-  PPC64_LR(25)
-  PPC64_LR(26)
-  PPC64_LR(27)
-  PPC64_LR(28)
-  PPC64_LR(29)
-  PPC64_LR(30)
-  PPC64_LR(31)
-
-#if defined(__VSX__)
-
-  // restore VS registers
-  // (note that this also restores floating point registers and V registers,
-  // because part of VS is mapped to these registers)
-
-  addi  4, 3, PPC64_OFFS_FP
-
-// load VS register
-#ifdef __LITTLE_ENDIAN__
-// For little-endian targets, we need a swap since lxvd2x will load the register
-// in the incorrect doubleword order.
-// FIXME: when supporting targets older than Power9 on LE is no longer required,
-//        this can be changed to simply `lxv n, (16 * n)(4)`.
-#define PPC64_LVS(n)         \
-  lxvd2x  n, 0, 4           ;\
-  xxswapd n, n              ;\
-  addi    4, 4, 16
-#else
-#define PPC64_LVS(n)         \
-  lxvd2x  n, 0, 4           ;\
-  addi    4, 4, 16
-#endif
-
-  // restore the first 32 VS regs (and also all floating point regs)
-  PPC64_LVS(0)
-  PPC64_LVS(1)
-  PPC64_LVS(2)
-  PPC64_LVS(3)
-  PPC64_LVS(4)
-  PPC64_LVS(5)
-  PPC64_LVS(6)
-  PPC64_LVS(7)
-  PPC64_LVS(8)
-  PPC64_LVS(9)
-  PPC64_LVS(10)
-  PPC64_LVS(11)
-  PPC64_LVS(12)
-  PPC64_LVS(13)
-  PPC64_LVS(14)
-  PPC64_LVS(15)
-  PPC64_LVS(16)
-  PPC64_LVS(17)
-  PPC64_LVS(18)
-  PPC64_LVS(19)
-  PPC64_LVS(20)
-  PPC64_LVS(21)
-  PPC64_LVS(22)
-  PPC64_LVS(23)
-  PPC64_LVS(24)
-  PPC64_LVS(25)
-  PPC64_LVS(26)
-  PPC64_LVS(27)
-  PPC64_LVS(28)
-  PPC64_LVS(29)
-  PPC64_LVS(30)
-  PPC64_LVS(31)
-
-#ifdef __LITTLE_ENDIAN__
-#define PPC64_CLVS_RESTORE(n)                    \
-  addi   4, 3, PPC64_OFFS_FP + n * 16           ;\
-  lxvd2x n, 0, 4                                ;\
-  xxswapd n, n
-#else
-#define PPC64_CLVS_RESTORE(n)                    \
-  addi   4, 3, PPC64_OFFS_FP + n * 16           ;\
-  lxvd2x n, 0, 4
-#endif
-
-#if !defined(_AIX)
-  // use VRSAVE to conditionally restore the remaining VS regs, that are
-  // where the V regs are mapped. In the AIX ABI, VRSAVE is not used.
-  ld    5, PPC64_OFFS_VRSAVE(3)   // test VRsave
-  cmpwi 5, 0
-  beq   Lnovec
-
-// conditionally load VS
-#define PPC64_CLVSl(n)                           \
-  andis. 0, 5, (1 PPC_LEFT_SHIFT(47-n))         ;\
-  beq    Ldone##n                               ;\
-  PPC64_CLVS_RESTORE(n)                         ;\
-Ldone##n:
-
-#define PPC64_CLVSh(n)                           \
-  andi.  0, 5, (1 PPC_LEFT_SHIFT(63-n))         ;\
-  beq    Ldone##n                               ;\
-  PPC64_CLVS_RESTORE(n)                         ;\
-Ldone##n:
-
-#else
-
-#define PPC64_CLVSl(n) PPC64_CLVS_RESTORE(n)
-#define PPC64_CLVSh(n) PPC64_CLVS_RESTORE(n)
-
-#endif // !defined(_AIX)
-
-  PPC64_CLVSl(32)
-  PPC64_CLVSl(33)
-  PPC64_CLVSl(34)
-  PPC64_CLVSl(35)
-  PPC64_CLVSl(36)
-  PPC64_CLVSl(37)
-  PPC64_CLVSl(38)
-  PPC64_CLVSl(39)
-  PPC64_CLVSl(40)
-  PPC64_CLVSl(41)
-  PPC64_CLVSl(42)
-  PPC64_CLVSl(43)
-  PPC64_CLVSl(44)
-  PPC64_CLVSl(45)
-  PPC64_CLVSl(46)
-  PPC64_CLVSl(47)
-  PPC64_CLVSh(48)
-  PPC64_CLVSh(49)
-  PPC64_CLVSh(50)
-  PPC64_CLVSh(51)
-  PPC64_CLVSh(52)
-  PPC64_CLVSh(53)
-  PPC64_CLVSh(54)
-  PPC64_CLVSh(55)
-  PPC64_CLVSh(56)
-  PPC64_CLVSh(57)
-  PPC64_CLVSh(58)
-  PPC64_CLVSh(59)
-  PPC64_CLVSh(60)
-  PPC64_CLVSh(61)
-  PPC64_CLVSh(62)
-  PPC64_CLVSh(63)
-
-#else
-
-// load FP register
-#define PPC64_LF(n) \
-  lfd   n, (PPC64_OFFS_FP + n * 16)(3)
-
-  // restore float registers
-  PPC64_LF(0)
-  PPC64_LF(1)
-  PPC64_LF(2)
-  PPC64_LF(3)
-  PPC64_LF(4)
-  PPC64_LF(5)
-  PPC64_LF(6)
-  PPC64_LF(7)
-  PPC64_LF(8)
-  PPC64_LF(9)
-  PPC64_LF(10)
-  PPC64_LF(11)
-  PPC64_LF(12)
-  PPC64_LF(13)
-  PPC64_LF(14)
-  PPC64_LF(15)
-  PPC64_LF(16)
-  PPC64_LF(17)
-  PPC64_LF(18)
-  PPC64_LF(19)
-  PPC64_LF(20)
-  PPC64_LF(21)
-  PPC64_LF(22)
-  PPC64_LF(23)
-  PPC64_LF(24)
-  PPC64_LF(25)
-  PPC64_LF(26)
-  PPC64_LF(27)
-  PPC64_LF(28)
-  PPC64_LF(29)
-  PPC64_LF(30)
-  PPC64_LF(31)
-
-#if defined(__ALTIVEC__)
-
-#define PPC64_CLV_UNALIGNED_RESTORE(n)       \
-  ld     0, (PPC64_OFFS_V + n * 16)(3)      ;\
-  std    0, 0(4)                            ;\
-  ld     0, (PPC64_OFFS_V + n * 16 + 8)(3)  ;\
-  std    0, 8(4)                            ;\
-  lvx    n, 0, 4
-
-#if !defined(_AIX)
-  // restore vector registers if any are in use. In the AIX ABI, VRSAVE is
-  // not used.
-  ld    5, PPC64_OFFS_VRSAVE(3)   // test VRsave
-  cmpwi 5, 0
-  beq   Lnovec
-
-#define PPC64_CLV_UNALIGNEDl(n)              \
-  andis. 0, 5, (1 PPC_LEFT_SHIFT(15-n))     ;\
-  beq    Ldone##n                           ;\
-  PPC64_CLV_UNALIGNED_RESTORE(n)            ;\
-Ldone  ## n:
-
-#define PPC64_CLV_UNALIGNEDh(n)              \
-  andi.  0, 5, (1 PPC_LEFT_SHIFT(31-n))     ;\
-  beq    Ldone##n                           ;\
-  PPC64_CLV_UNALIGNED_RESTORE(n)            ;\
-Ldone  ## n:
-
-#else
-
-#define PPC64_CLV_UNALIGNEDl(n) PPC64_CLV_UNALIGNED_RESTORE(n)
-#define PPC64_CLV_UNALIGNEDh(n) PPC64_CLV_UNALIGNED_RESTORE(n)
-
-#endif // !defined(_AIX)
-
-  subi  4, 1, 16
-  // r4 is now a 16-byte aligned pointer into the red zone
-  // the _vectorScalarRegisters may not be 16-byte aligned
-  // so copy via red zone temp buffer
-
-  PPC64_CLV_UNALIGNEDl(0)
-  PPC64_CLV_UNALIGNEDl(1)
-  PPC64_CLV_UNALIGNEDl(2)
-  PPC64_CLV_UNALIGNEDl(3)
-  PPC64_CLV_UNALIGNEDl(4)
-  PPC64_CLV_UNALIGNEDl(5)
-  PPC64_CLV_UNALIGNEDl(6)
-  PPC64_CLV_UNALIGNEDl(7)
-  PPC64_CLV_UNALIGNEDl(8)
-  PPC64_CLV_UNALIGNEDl(9)
-  PPC64_CLV_UNALIGNEDl(10)
-  PPC64_CLV_UNALIGNEDl(11)
-  PPC64_CLV_UNALIGNEDl(12)
-  PPC64_CLV_UNALIGNEDl(13)
-  PPC64_CLV_UNALIGNEDl(14)
-  PPC64_CLV_UNALIGNEDl(15)
-  PPC64_CLV_UNALIGNEDh(16)
-  PPC64_CLV_UNALIGNEDh(17)
-  PPC64_CLV_UNALIGNEDh(18)
-  PPC64_CLV_UNALIGNEDh(19)
-  PPC64_CLV_UNALIGNEDh(20)
-  PPC64_CLV_UNALIGNEDh(21)
-  PPC64_CLV_UNALIGNEDh(22)
-  PPC64_CLV_UNALIGNEDh(23)
-  PPC64_CLV_UNALIGNEDh(24)
-  PPC64_CLV_UNALIGNEDh(25)
-  PPC64_CLV_UNALIGNEDh(26)
-  PPC64_CLV_UNALIGNEDh(27)
-  PPC64_CLV_UNALIGNEDh(28)
-  PPC64_CLV_UNALIGNEDh(29)
-  PPC64_CLV_UNALIGNEDh(30)
-  PPC64_CLV_UNALIGNEDh(31)
-
-#endif
-#endif
-
-Lnovec:
-  ld    0, PPC64_OFFS_CR(3)
-  mtcr  0
-  ld    0, PPC64_OFFS_SRR0(3)
-  mtctr 0
-
-#if defined(_AIX)
-  // After setting GPR1 to a higher address, AIX wipes out the original
-  // stack space below that address invalidated by the new GPR1 value. Use
-  // GPR0 to save the value of GPR3 in the context before it is wiped out.
-  // This compromises the content of GPR0 which is a volatile register.
-  ld 0, (8 * (3 + 2))(3)
-#else
-  PPC64_LR(0)
-#endif
-  PPC64_LR(5)
-  PPC64_LR(4)
-  PPC64_LR(1)
-#if defined(_AIX)
-  mr 3, 0
-#else
-  PPC64_LR(3)
-#endif
-  bctr
-
-#elif defined(__powerpc__)
-
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_ppc6jumptoEv)
-//
-// void libunwind::Registers_ppc::jumpto()
-//
-// On entry:
-//  thread_state pointer is in r3
-//
-
-  // restore integral registers
-  // skip r0 for now
-  // skip r1 for now
-  lwz     2,  16(3)
-  // skip r3 for now
-  // skip r4 for now
-  // skip r5 for now
-  lwz     6,  32(3)
-  lwz     7,  36(3)
-  lwz     8,  40(3)
-  lwz     9,  44(3)
-  lwz     10, 48(3)
-  lwz     11, 52(3)
-  lwz     12, 56(3)
-  lwz     13, 60(3)
-  lwz     14, 64(3)
-  lwz     15, 68(3)
-  lwz     16, 72(3)
-  lwz     17, 76(3)
-  lwz     18, 80(3)
-  lwz     19, 84(3)
-  lwz     20, 88(3)
-  lwz     21, 92(3)
-  lwz     22, 96(3)
-  lwz     23,100(3)
-  lwz     24,104(3)
-  lwz     25,108(3)
-  lwz     26,112(3)
-  lwz     27,116(3)
-  lwz     28,120(3)
-  lwz     29,124(3)
-  lwz     30,128(3)
-  lwz     31,132(3)
-
-#ifndef __NO_FPRS__
-  // restore float registers
-  lfd     0, 160(3)
-  lfd     1, 168(3)
-  lfd     2, 176(3)
-  lfd     3, 184(3)
-  lfd     4, 192(3)
-  lfd     5, 200(3)
-  lfd     6, 208(3)
-  lfd     7, 216(3)
-  lfd     8, 224(3)
-  lfd     9, 232(3)
-  lfd     10,240(3)
-  lfd     11,248(3)
-  lfd     12,256(3)
-  lfd     13,264(3)
-  lfd     14,272(3)
-  lfd     15,280(3)
-  lfd     16,288(3)
-  lfd     17,296(3)
-  lfd     18,304(3)
-  lfd     19,312(3)
-  lfd     20,320(3)
-  lfd     21,328(3)
-  lfd     22,336(3)
-  lfd     23,344(3)
-  lfd     24,352(3)
-  lfd     25,360(3)
-  lfd     26,368(3)
-  lfd     27,376(3)
-  lfd     28,384(3)
-  lfd     29,392(3)
-  lfd     30,400(3)
-  lfd     31,408(3)
-#endif
-
-#if defined(__ALTIVEC__)
-
-#define LOAD_VECTOR_RESTORE(_index)                 \
-  lwz     0, 424+_index*16(3)             SEPARATOR \
-  stw     0, 0(4)                         SEPARATOR \
-  lwz     0, 424+_index*16+4(3)           SEPARATOR \
-  stw     0, 4(4)                         SEPARATOR \
-  lwz     0, 424+_index*16+8(3)           SEPARATOR \
-  stw     0, 8(4)                         SEPARATOR \
-  lwz     0, 424+_index*16+12(3)          SEPARATOR \
-  stw     0, 12(4)                        SEPARATOR \
-  lvx     _index, 0, 4
-
-#if !defined(_AIX)
-  // restore vector registers if any are in use. In the AIX ABI, VRSAVE
-  // is not used.
-  lwz     5, 156(3)       // test VRsave
-  cmpwi   5, 0
-  beq     Lnovec
-
-#define LOAD_VECTOR_UNALIGNEDl(_index)                   \
-  andis.  0, 5, (1 PPC_LEFT_SHIFT(15-_index))  SEPARATOR \
-  beq     Ldone ## _index                      SEPARATOR \
-  LOAD_VECTOR_RESTORE(_index)                  SEPARATOR \
-  Ldone ## _index:
-
-#define LOAD_VECTOR_UNALIGNEDh(_index)                   \
-  andi.   0, 5, (1 PPC_LEFT_SHIFT(31-_index))  SEPARATOR \
-  beq     Ldone ## _index                      SEPARATOR \
-  LOAD_VECTOR_RESTORE(_index)                  SEPARATOR \
-  Ldone ## _index:
-
-#else
-
-#define LOAD_VECTOR_UNALIGNEDl(_index) LOAD_VECTOR_RESTORE(_index)
-#define LOAD_VECTOR_UNALIGNEDh(_index) LOAD_VECTOR_RESTORE(_index)
-
-#endif // !defined(_AIX)
-
-  subi    4, 1, 16
-  rlwinm  4, 4, 0, 0, 27  // mask low 4-bits
-  // r4 is now a 16-byte aligned pointer into the red zone
-  // the _vectorRegisters may not be 16-byte aligned so copy via red zone temp buffer
-
-  LOAD_VECTOR_UNALIGNEDl(0)
-  LOAD_VECTOR_UNALIGNEDl(1)
-  LOAD_VECTOR_UNALIGNEDl(2)
-  LOAD_VECTOR_UNALIGNEDl(3)
-  LOAD_VECTOR_UNALIGNEDl(4)
-  LOAD_VECTOR_UNALIGNEDl(5)
-  LOAD_VECTOR_UNALIGNEDl(6)
-  LOAD_VECTOR_UNALIGNEDl(7)
-  LOAD_VECTOR_UNALIGNEDl(8)
-  LOAD_VECTOR_UNALIGNEDl(9)
-  LOAD_VECTOR_UNALIGNEDl(10)
-  LOAD_VECTOR_UNALIGNEDl(11)
-  LOAD_VECTOR_UNALIGNEDl(12)
-  LOAD_VECTOR_UNALIGNEDl(13)
-  LOAD_VECTOR_UNALIGNEDl(14)
-  LOAD_VECTOR_UNALIGNEDl(15)
-  LOAD_VECTOR_UNALIGNEDh(16)
-  LOAD_VECTOR_UNALIGNEDh(17)
-  LOAD_VECTOR_UNALIGNEDh(18)
-  LOAD_VECTOR_UNALIGNEDh(19)
-  LOAD_VECTOR_UNALIGNEDh(20)
-  LOAD_VECTOR_UNALIGNEDh(21)
-  LOAD_VECTOR_UNALIGNEDh(22)
-  LOAD_VECTOR_UNALIGNEDh(23)
-  LOAD_VECTOR_UNALIGNEDh(24)
-  LOAD_VECTOR_UNALIGNEDh(25)
-  LOAD_VECTOR_UNALIGNEDh(26)
-  LOAD_VECTOR_UNALIGNEDh(27)
-  LOAD_VECTOR_UNALIGNEDh(28)
-  LOAD_VECTOR_UNALIGNEDh(29)
-  LOAD_VECTOR_UNALIGNEDh(30)
-  LOAD_VECTOR_UNALIGNEDh(31)
-#endif
-
-Lnovec:
-  lwz     0, 136(3)   // __cr
-  mtcr    0
-  lwz     0, 148(3)   // __ctr
-  mtctr   0
-  lwz     0,   0(3)   // __ssr0
-  mtctr   0
-  lwz     0,   8(3)   // do r0 now
-  lwz     5,  28(3)   // do r5 now
-  lwz     4,  24(3)   // do r4 now
-  lwz     1,  12(3)   // do sp now
-  lwz     3,  20(3)   // do r3 last
-  bctr
-
-#elif defined(__aarch64__)
-
-#if defined(__ARM_FEATURE_GCS_DEFAULT)
-.arch_extension gcs
-#endif
-
-//
-// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *);
-//
-// On entry:
-//  thread_state pointer is in x0
-//
-  .p2align 2
-DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
-  // skip restore of x0,x1 for now
-  ldp    x2, x3,  [x0, #0x010]
-  ldp    x4, x5,  [x0, #0x020]
-  ldp    x6, x7,  [x0, #0x030]
-  ldp    x8, x9,  [x0, #0x040]
-  ldp    x10,x11, [x0, #0x050]
-  ldp    x12,x13, [x0, #0x060]
-  ldp    x14,x15, [x0, #0x070]
-  // x16 and x17 were clobbered by the call into the unwinder, so no point in
-  // restoring them.
-  ldp    x18,x19, [x0, #0x090]
-  ldp    x20,x21, [x0, #0x0A0]
-  ldp    x22,x23, [x0, #0x0B0]
-  ldp    x24,x25, [x0, #0x0C0]
-  ldp    x26,x27, [x0, #0x0D0]
-  ldp    x28,x29, [x0, #0x0E0]
-  ldr    x30,     [x0, #0x100]  // restore pc into lr
-
-  ldp    d0, d1,  [x0, #0x110]
-  ldp    d2, d3,  [x0, #0x120]
-  ldp    d4, d5,  [x0, #0x130]
-  ldp    d6, d7,  [x0, #0x140]
-  ldp    d8, d9,  [x0, #0x150]
-  ldp    d10,d11, [x0, #0x160]
-  ldp    d12,d13, [x0, #0x170]
-  ldp    d14,d15, [x0, #0x180]
-  ldp    d16,d17, [x0, #0x190]
-  ldp    d18,d19, [x0, #0x1A0]
-  ldp    d20,d21, [x0, #0x1B0]
-  ldp    d22,d23, [x0, #0x1C0]
-  ldp    d24,d25, [x0, #0x1D0]
-  ldp    d26,d27, [x0, #0x1E0]
-  ldp    d28,d29, [x0, #0x1F0]
-  ldr    d30,     [x0, #0x200]
-  ldr    d31,     [x0, #0x208]
-
-  // Finally, restore sp. This must be done after the last read from the
-  // context struct, because it is allocated on the stack, and an exception
-  // could clobber the de-allocated portion of the stack after sp has been
-  // restored.
-  ldr    x16,     [x0, #0x0F8]
-  ldp    x0, x1,  [x0, #0x000]  // restore x0,x1
-  mov    sp,x16                 // restore sp
-#if defined(__ARM_FEATURE_GCS_DEFAULT)
-  // If GCS is enabled we need to push the address we're returning to onto the
-  // GCS stack. We can't just return using br, as there won't be a BTI landing
-  // pad instruction at the destination.
-  mov      x16, #1
-  chkfeat  x16
-  cbnz     x16, Lnogcs
-  gcspushm x30
-Lnogcs:
-#endif
-  ret    x30                    // jump to pc
-
-#elif defined(__arm__) && !defined(__APPLE__)
-
-#if !defined(__ARM_ARCH_ISA_ARM)
-#if (__ARM_ARCH_ISA_THUMB == 2)
-  .syntax unified
-#endif
-  .thumb
-#endif
-
-@
-@ void libunwind::Registers_arm::restoreCoreAndJumpTo()
-@
-@ On entry:
-@  thread_state pointer is in r0
-@
-  .p2align 2
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm20restoreCoreAndJumpToEv)
-#if !defined(__ARM_ARCH_ISA_ARM) && __ARM_ARCH_ISA_THUMB == 1
-  @ r8-r11: ldm into r1-r4, then mov to r8-r11
-  adds r0, #0x20
-  ldm r0!, {r1-r4}
-  subs r0, #0x30
-  mov r8, r1
-  mov r9, r2
-  mov r10, r3
-  mov r11, r4
-  @ r12 does not need loading, it it the intra-procedure-call scratch register
-  ldr r2, [r0, #0x34]
-  ldr r3, [r0, #0x3c]
-  mov sp, r2
-  mov lr, r3         @ restore pc into lr
-  ldm r0, {r0-r7}
-#else
-  @ Use lr as base so that r0 can be restored.
-  mov lr, r0
-  @ 32bit thumb-2 restrictions for ldm:
-  @ . the sp (r13) cannot be in the list
-  @ . the pc (r15) and lr (r14) cannot both be in the list in an LDM instruction
-  ldm lr, {r0-r12}
-  ldr sp, [lr, #52]
-  ldr lr, [lr, #60]  @ restore pc into lr
-#endif
-#if defined(__ARM_FEATURE_BTI_DEFAULT) && !defined(__ARM_ARCH_ISA_ARM)
-  // 'bx' is not BTI setting when used with lr, therefore r12 is used instead
-  mov r12, lr
-  JMP(r12)
-#else
-  JMP(lr)
-#endif
-
-@
-@ static void libunwind::Registers_arm::restoreVFPWithFLDMD(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .fpu vfpv3-d16
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm19restoreVFPWithFLDMDEPv)
-  @ VFP and iwMMX instructions are only available when compiling with the flags
-  @ that enable them. We do not want to do that in the library (because we do not
-  @ want the compiler to generate instructions that access those) but this is
-  @ only accessed if the personality routine needs these registers. Use of
-  @ these registers implies they are, actually, available on the target, so
-  @ it's ok to execute.
-  @ So, generate the instruction using the corresponding coprocessor mnemonic.
-  vldmia r0, {d0-d15}
-  JMP(lr)
-
-@
-@ static void libunwind::Registers_arm::restoreVFPWithFLDMX(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .fpu vfpv3-d16
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm19restoreVFPWithFLDMXEPv)
-  vldmia r0, {d0-d15} @ fldmiax is deprecated in ARMv7+ and now behaves like vldmia
-  JMP(lr)
-
-@
-@ static void libunwind::Registers_arm::restoreVFPv3(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .fpu vfpv3
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm12restoreVFPv3EPv)
-  vldmia r0, {d16-d31}
-  JMP(lr)
-
-#if defined(__ARM_WMMX)
-
-@
-@ static void libunwind::Registers_arm::restoreiWMMX(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .arch armv5te
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm12restoreiWMMXEPv)
-  ldcl p1, cr0, [r0], #8  @ wldrd wR0, [r0], #8
-  ldcl p1, cr1, [r0], #8  @ wldrd wR1, [r0], #8
-  ldcl p1, cr2, [r0], #8  @ wldrd wR2, [r0], #8
-  ldcl p1, cr3, [r0], #8  @ wldrd wR3, [r0], #8
-  ldcl p1, cr4, [r0], #8  @ wldrd wR4, [r0], #8
-  ldcl p1, cr5, [r0], #8  @ wldrd wR5, [r0], #8
-  ldcl p1, cr6, [r0], #8  @ wldrd wR6, [r0], #8
-  ldcl p1, cr7, [r0], #8  @ wldrd wR7, [r0], #8
-  ldcl p1, cr8, [r0], #8  @ wldrd wR8, [r0], #8
-  ldcl p1, cr9, [r0], #8  @ wldrd wR9, [r0], #8
-  ldcl p1, cr10, [r0], #8  @ wldrd wR10, [r0], #8
-  ldcl p1, cr11, [r0], #8  @ wldrd wR11, [r0], #8
-  ldcl p1, cr12, [r0], #8  @ wldrd wR12, [r0], #8
-  ldcl p1, cr13, [r0], #8  @ wldrd wR13, [r0], #8
-  ldcl p1, cr14, [r0], #8  @ wldrd wR14, [r0], #8
-  ldcl p1, cr15, [r0], #8  @ wldrd wR15, [r0], #8
-  JMP(lr)
-
-@
-@ static void libunwind::Registers_arm::restoreiWMMXControl(unw_uint32_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .arch armv5te
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm19restoreiWMMXControlEPj)
-  ldc2 p1, cr8, [r0], #4  @ wldrw wCGR0, [r0], #4
-  ldc2 p1, cr9, [r0], #4  @ wldrw wCGR1, [r0], #4
-  ldc2 p1, cr10, [r0], #4  @ wldrw wCGR2, [r0], #4
-  ldc2 p1, cr11, [r0], #4  @ wldrw wCGR3, [r0], #4
-  JMP(lr)
-
-#endif
-
-#elif defined(__or1k__)
-
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind14Registers_or1k6jumptoEv)
-#
-# void libunwind::Registers_or1k::jumpto()
-#
-# On entry:
-#  thread_state pointer is in r3
-#
-
-  # restore integral registers
-  l.lwz     r0,  0(r3)
-  l.lwz     r1,  4(r3)
-  l.lwz     r2,  8(r3)
-  # skip r3 for now
-  l.lwz     r4, 16(r3)
-  l.lwz     r5, 20(r3)
-  l.lwz     r6, 24(r3)
-  l.lwz     r7, 28(r3)
-  l.lwz     r8, 32(r3)
-  # skip r9
-  l.lwz    r10, 40(r3)
-  l.lwz    r11, 44(r3)
-  l.lwz    r12, 48(r3)
-  l.lwz    r13, 52(r3)
-  l.lwz    r14, 56(r3)
-  l.lwz    r15, 60(r3)
-  l.lwz    r16, 64(r3)
-  l.lwz    r17, 68(r3)
-  l.lwz    r18, 72(r3)
-  l.lwz    r19, 76(r3)
-  l.lwz    r20, 80(r3)
-  l.lwz    r21, 84(r3)
-  l.lwz    r22, 88(r3)
-  l.lwz    r23, 92(r3)
-  l.lwz    r24, 96(r3)
-  l.lwz    r25,100(r3)
-  l.lwz    r26,104(r3)
-  l.lwz    r27,108(r3)
-  l.lwz    r28,112(r3)
-  l.lwz    r29,116(r3)
-  l.lwz    r30,120(r3)
-  l.lwz    r31,124(r3)
-
-  # load new pc into ra
-  l.lwz    r9, 128(r3)
-
-  # at last, restore r3
-  l.lwz    r3,  12(r3)
-
-  # jump to pc
-  l.jr     r9
-   l.nop
-
-#elif defined(__hexagon__)
-# On entry:
-#  thread_state pointer is in r2
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind17Registers_hexagon6jumptoEv)
-#
-# void libunwind::Registers_hexagon::jumpto()
-#
-  r8 = memw(r0+#32)
-  r9 = memw(r0+#36)
-  r10 = memw(r0+#40)
-  r11 = memw(r0+#44)
-
-  r12 = memw(r0+#48)
-  r13 = memw(r0+#52)
-  r14 = memw(r0+#56)
-  r15 = memw(r0+#60)
-
-  r16 = memw(r0+#64)
-  r17 = memw(r0+#68)
-  r18 = memw(r0+#72)
-  r19 = memw(r0+#76)
-
-  r20 = memw(r0+#80)
-  r21 = memw(r0+#84)
-  r22 = memw(r0+#88)
-  r23 = memw(r0+#92)
-
-  r24 = memw(r0+#96)
-  r25 = memw(r0+#100)
-  r26 = memw(r0+#104)
-  r27 = memw(r0+#108)
-
-  r28 = memw(r0+#112)
-  r29 = memw(r0+#116)
-  r30 = memw(r0+#120)
-  r31 = memw(r0+#132)
-
-  r1 = memw(r0+#128)
-  c4 = r1   // Predicate register
-  r1 = memw(r0+#4)
-  r0 = memw(r0)
-  jumpr r31
-#elif defined(__mips__) && defined(_ABIO32) && _MIPS_SIM == _ABIO32
-
-//
-// void libunwind::Registers_mips_o32::jumpto()
-//
-// On entry:
-//  thread state pointer is in a0 ($4)
-//
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind18Registers_mips_o326jumptoEv)
-  .set push
-  .set noat
-  .set noreorder
-  .set nomacro
-#ifdef __mips_hard_float
-#if __mips_fpr != 64
-  ldc1  $f0, (4 * 36 + 8 * 0)($4)
-  ldc1  $f2, (4 * 36 + 8 * 2)($4)
-  ldc1  $f4, (4 * 36 + 8 * 4)($4)
-  ldc1  $f6, (4 * 36 + 8 * 6)($4)
-  ldc1  $f8, (4 * 36 + 8 * 8)($4)
-  ldc1  $f10, (4 * 36 + 8 * 10)($4)
-  ldc1  $f12, (4 * 36 + 8 * 12)($4)
-  ldc1  $f14, (4 * 36 + 8 * 14)($4)
-  ldc1  $f16, (4 * 36 + 8 * 16)($4)
-  ldc1  $f18, (4 * 36 + 8 * 18)($4)
-  ldc1  $f20, (4 * 36 + 8 * 20)($4)
-  ldc1  $f22, (4 * 36 + 8 * 22)($4)
-  ldc1  $f24, (4 * 36 + 8 * 24)($4)
-  ldc1  $f26, (4 * 36 + 8 * 26)($4)
-  ldc1  $f28, (4 * 36 + 8 * 28)($4)
-  ldc1  $f30, (4 * 36 + 8 * 30)($4)
-#else
-  ldc1  $f0, (4 * 36 + 8 * 0)($4)
-  ldc1  $f1, (4 * 36 + 8 * 1)($4)
-  ldc1  $f2, (4 * 36 + 8 * 2)($4)
-  ldc1  $f3, (4 * 36 + 8 * 3)($4)
-  ldc1  $f4, (4 * 36 + 8 * 4)($4)
-  ldc1  $f5, (4 * 36 + 8 * 5)($4)
-  ldc1  $f6, (4 * 36 + 8 * 6)($4)
-  ldc1  $f7, (4 * 36 + 8 * 7)($4)
-  ldc1  $f8, (4 * 36 + 8 * 8)($4)
-  ldc1  $f9, (4 * 36 + 8 * 9)($4)
-  ldc1  $f10, (4 * 36 + 8 * 10)($4)
-  ldc1  $f11, (4 * 36 + 8 * 11)($4)
-  ldc1  $f12, (4 * 36 + 8 * 12)($4)
-  ldc1  $f13, (4 * 36 + 8 * 13)($4)
-  ldc1  $f14, (4 * 36 + 8 * 14)($4)
-  ldc1  $f15, (4 * 36 + 8 * 15)($4)
-  ldc1  $f16, (4 * 36 + 8 * 16)($4)
-  ldc1  $f17, (4 * 36 + 8 * 17)($4)
-  ldc1  $f18, (4 * 36 + 8 * 18)($4)
-  ldc1  $f19, (4 * 36 + 8 * 19)($4)
-  ldc1  $f20, (4 * 36 + 8 * 20)($4)
-  ldc1  $f21, (4 * 36 + 8 * 21)($4)
-  ldc1  $f22, (4 * 36 + 8 * 22)($4)
-  ldc1  $f23, (4 * 36 + 8 * 23)($4)
-  ldc1  $f24, (4 * 36 + 8 * 24)($4)
-  ldc1  $f25, (4 * 36 + 8 * 25)($4)
-  ldc1  $f26, (4 * 36 + 8 * 26)($4)
-  ldc1  $f27, (4 * 36 + 8 * 27)($4)
-  ldc1  $f28, (4 * 36 + 8 * 28)($4)
-  ldc1  $f29, (4 * 36 + 8 * 29)($4)
-  ldc1  $f30, (4 * 36 + 8 * 30)($4)
-  ldc1  $f31, (4 * 36 + 8 * 31)($4)
-#endif
-#endif
-#if __mips_isa_rev < 6
-  // restore hi and lo
-  lw    $8, (4 * 33)($4)
-  mthi  $8
-  lw    $8, (4 * 34)($4)
-  mtlo  $8
-#endif
-  // r0 is zero
-  lw    $1, (4 * 1)($4)
-  lw    $2, (4 * 2)($4)
-  lw    $3, (4 * 3)($4)
-  // skip a0 for now
-  lw    $5, (4 * 5)($4)
-  lw    $6, (4 * 6)($4)
-  lw    $7, (4 * 7)($4)
-  lw    $8, (4 * 8)($4)
-  lw    $9, (4 * 9)($4)
-  lw    $10, (4 * 10)($4)
-  lw    $11, (4 * 11)($4)
-  lw    $12, (4 * 12)($4)
-  lw    $13, (4 * 13)($4)
-  lw    $14, (4 * 14)($4)
-  lw    $15, (4 * 15)($4)
-  lw    $16, (4 * 16)($4)
-  lw    $17, (4 * 17)($4)
-  lw    $18, (4 * 18)($4)
-  lw    $19, (4 * 19)($4)
-  lw    $20, (4 * 20)($4)
-  lw    $21, (4 * 21)($4)
-  lw    $22, (4 * 22)($4)
-  lw    $23, (4 * 23)($4)
-  lw    $24, (4 * 24)($4)
-  lw    $25, (4 * 25)($4)
-  lw    $26, (4 * 26)($4)
-  lw    $27, (4 * 27)($4)
-  lw    $28, (4 * 28)($4)
-  lw    $29, (4 * 29)($4)
-  lw    $30, (4 * 30)($4)
-  // load new pc into ra
-  lw    $31, (4 * 32)($4)
-  // jump to ra, load a0 in the delay slot
-  jr    $31
-  lw    $4, (4 * 4)($4)
-  .set pop
-
-#elif defined(__mips64)
-
-//
-// void libunwind::Registers_mips_newabi::jumpto()
-//
-// On entry:
-//  thread state pointer is in a0 ($4)
-//
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind21Registers_mips_newabi6jumptoEv)
-  .set push
-  .set noat
-  .set noreorder
-  .set nomacro
-#ifdef __mips_hard_float
-  .irp i,FROM_0_TO_31
-    ldc1 $f\i, (280+8*\i)($4)
-  .endr
-#endif
-#if __mips_isa_rev < 6
-  // restore hi and lo
-  ld    $8, (8 * 33)($4)
-  mthi  $8
-  ld    $8, (8 * 34)($4)
-  mtlo  $8
-#endif
-  // r0 is zero
-  ld    $1, (8 * 1)($4)
-  ld    $2, (8 * 2)($4)
-  ld    $3, (8 * 3)($4)
-  // skip a0 for now
-  .irp i,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
-    ld $\i, (8 * \i)($4)
-  .endr
-  // load new pc into ra
-  ld    $31, (8 * 32)($4)
-  // jump to ra, load a0 in the delay slot
-  jr    $31
-  ld    $4, (8 * 4)($4)
-  .set pop
-
-#elif defined(__sparc__) && defined(__arch64__)
-
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind17Registers_sparc646jumptoEv)
-//
-// void libunwind::Registers_sparc64::jumpto()
-//
-// On entry:
-//  thread_state pointer is in %o0
-//
-  .register %g2, #scratch
-  .register %g3, #scratch
-  .register %g6, #scratch
-  .register %g7, #scratch
-  flushw
-  ldx  [%o0 + 0x08], %g1
-  ldx  [%o0 + 0x10], %g2
-  ldx  [%o0 + 0x18], %g3
-  ldx  [%o0 + 0x20], %g4
-  ldx  [%o0 + 0x28], %g5
-  ldx  [%o0 + 0x30], %g6
-  ldx  [%o0 + 0x38], %g7
-  ldx  [%o0 + 0x48], %o1
-  ldx  [%o0 + 0x50], %o2
-  ldx  [%o0 + 0x58], %o3
-  ldx  [%o0 + 0x60], %o4
-  ldx  [%o0 + 0x68], %o5
-  ldx  [%o0 + 0x70], %o6
-  ldx  [%o0 + 0x78], %o7
-  ldx  [%o0 + 0x80], %l0
-  ldx  [%o0 + 0x88], %l1
-  ldx  [%o0 + 0x90], %l2
-  ldx  [%o0 + 0x98], %l3
-  ldx  [%o0 + 0xa0], %l4
-  ldx  [%o0 + 0xa8], %l5
-  ldx  [%o0 + 0xb0], %l6
-  ldx  [%o0 + 0xb8], %l7
-  ldx  [%o0 + 0xc0], %i0
-  ldx  [%o0 + 0xc8], %i1
-  ldx  [%o0 + 0xd0], %i2
-  ldx  [%o0 + 0xd8], %i3
-  ldx  [%o0 + 0xe0], %i4
-  ldx  [%o0 + 0xe8], %i5
-  ldx  [%o0 + 0xf0], %i6
-  ldx  [%o0 + 0xf8], %i7
-  jmp  %o7
-   ldx [%o0 + 0x40], %o0
-
-#elif defined(__sparc__)
-
-//
-// void libunwind::Registers_sparc_o32::jumpto()
-//
-// On entry:
-//  thread_state pointer is in o0
-//
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_sparc6jumptoEv)
-  ta 3
-  ldd [%o0 + 64],  %l0
-  ldd [%o0 + 72],  %l2
-  ldd [%o0 + 80],  %l4
-  ldd [%o0 + 88],  %l6
-  ldd [%o0 + 96],  %i0
-  ldd [%o0 + 104], %i2
-  ldd [%o0 + 112], %i4
-  ldd [%o0 + 120], %i6
-  ld  [%o0 + 60],  %o7
-  jmp %o7
-   nop
-
-#elif defined(__riscv)
-
-//
-// void libunwind::Registers_riscv::jumpto()
-//
-// On entry:
-//  thread_state pointer is in a0
-//
-  .p2align 2
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_riscv6jumptoEv)
-# if defined(__riscv_flen)
-  .irp i,FROM_0_TO_31
-    FLOAD f\i, (RISCV_FOFFSET + RISCV_FSIZE * \i)(a0)
-  .endr
-# endif
-
-  // x0 is zero
-  ILOAD    x1, (RISCV_ISIZE * 0)(a0) // restore pc into ra
-  .irp i,2,3,4,5,6,7,8,9
-    ILOAD x\i, (RISCV_ISIZE * \i)(a0)
-  .endr
-  // skip a0 for now
-#if defined(__riscv_32e)
-  .irp i,11,12,13,14,15
-#else
-  .irp i,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-#endif
-    ILOAD x\i, (RISCV_ISIZE * \i)(a0)
-  .endr
-  ILOAD    x10, (RISCV_ISIZE * 10)(a0)   // restore a0
-
-  ret                       // jump to ra
-
-#elif defined(__s390x__)
-
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_s390x6jumptoEv)
-//
-// void libunwind::Registers_s390x::jumpto()
-//
-// On entry:
-//  thread_state pointer is in r2
-//
-
-  // Skip PSWM, but load PSWA into r1
-  lg %r1, 8(%r2)
-
-  // Restore FPRs
-  .irp i,FROM_0_TO_15
-    ld %f\i, (144+8*\i)(%r2)
-  .endr
-
-  // Restore GPRs - skipping %r0 and %r1
-  lmg  %r2, %r15, 32(%r2)
-
-  // Return to PSWA (was loaded into %r1 above)
-  br %r1
-
-#elif defined(__loongarch__) && __loongarch_grlen == 64
-
-//
-// void libunwind::Registers_loongarch::jumpto()
-//
-// On entry:
-//  thread_state pointer is in $a0($r4)
-//
-  .p2align 2
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind19Registers_loongarch6jumptoEv)
-# if __loongarch_frlen == 64
-  .irp i,FROM_0_TO_31
-    fld.d $f\i, $a0, (8 * 33 + 8 * \i)
-  .endr
-# endif
-
-  // $r0 is zero
-  .irp i,1,2,3
-    ld.d $r\i, $a0, (8 * \i)
-  .endr
-  // skip $a0 for now
-  .irp i,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-    ld.d $r\i, $a0, (8 * \i)
-  .endr
-
-  ld.d    $ra,  $a0, (8 * 32)  // load new pc into $ra
-  ld.d    $a0,  $a0, (8 * 4)   // restore $a0 last
-
-  jr      $ra
-
-#endif
-
-#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) */
-
-NO_EXEC_STACK_DIRECTIVE
-
diff --git a/third_party/libunwind/UnwindRegistersSave.S b/third_party/libunwind/UnwindRegistersSave.S
deleted file mode 100644
index fab234fcd..000000000
--- a/third_party/libunwind/UnwindRegistersSave.S
+++ /dev/null
@@ -1,1186 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "assembly.h"
-
-#define FROM_0_TO_15 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-#define FROM_16_TO_31 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-
-#define FROM_0_TO_31 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-#define FROM_32_TO_63 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63
-
-#if defined(_AIX)
-    .toc
-#else
-    .text
-#endif
-
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
-
-#if defined(__i386__)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#   +                       +
-#   +-----------------------+
-#   + thread_state pointer  +
-#   +-----------------------+
-#   + return address        +
-#   +-----------------------+   <-- SP
-#   +                       +
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-
-  _LIBUNWIND_CET_ENDBR
-  push  %eax
-  movl  8(%esp), %eax
-  movl  %ebx,  4(%eax)
-  movl  %ecx,  8(%eax)
-  movl  %edx, 12(%eax)
-  movl  %edi, 16(%eax)
-  movl  %esi, 20(%eax)
-  movl  %ebp, 24(%eax)
-  movl  %esp, %edx
-  addl  $8, %edx
-  movl  %edx, 28(%eax)  # store what sp was at call site as esp
-  # skip ss
-  # skip eflags
-  movl  4(%esp), %edx
-  movl  %edx, 40(%eax)  # store return address as eip
-  # skip cs
-  # skip ds
-  # skip es
-  # skip fs
-  # skip gs
-  movl  (%esp), %edx
-  movl  %edx, (%eax)  # store original eax
-  popl  %eax
-  xorl  %eax, %eax    # return UNW_ESUCCESS
-  ret
-
-#elif defined(__x86_64__)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in rdi
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-#if defined(_WIN64)
-#define PTR %rcx
-#define TMP %rdx
-#else
-#define PTR %rdi
-#define TMP %rsi
-#endif
-
-  _LIBUNWIND_CET_ENDBR
-  movq  %rax,   (PTR)
-  movq  %rbx,  8(PTR)
-  movq  %rcx, 16(PTR)
-  movq  %rdx, 24(PTR)
-  movq  %rdi, 32(PTR)
-  movq  %rsi, 40(PTR)
-  movq  %rbp, 48(PTR)
-  movq  %rsp, 56(PTR)
-  addq  $8,   56(PTR)
-  movq  %r8,  64(PTR)
-  movq  %r9,  72(PTR)
-  movq  %r10, 80(PTR)
-  movq  %r11, 88(PTR)
-  movq  %r12, 96(PTR)
-  movq  %r13,104(PTR)
-  movq  %r14,112(PTR)
-  movq  %r15,120(PTR)
-  movq  (%rsp),TMP
-  movq  TMP,128(PTR) # store return address as rip
-  # skip rflags
-  # skip cs
-  # skip fs
-  # skip gs
-
-#if defined(_WIN64)
-  movdqu %xmm0,176(PTR)
-  movdqu %xmm1,192(PTR)
-  movdqu %xmm2,208(PTR)
-  movdqu %xmm3,224(PTR)
-  movdqu %xmm4,240(PTR)
-  movdqu %xmm5,256(PTR)
-  movdqu %xmm6,272(PTR)
-  movdqu %xmm7,288(PTR)
-  movdqu %xmm8,304(PTR)
-  movdqu %xmm9,320(PTR)
-  movdqu %xmm10,336(PTR)
-  movdqu %xmm11,352(PTR)
-  movdqu %xmm12,368(PTR)
-  movdqu %xmm13,384(PTR)
-  movdqu %xmm14,400(PTR)
-  movdqu %xmm15,416(PTR)
-#endif
-  xorl  %eax, %eax    # return UNW_ESUCCESS
-  ret
-
-#elif defined(__mips__) && defined(_ABIO32) && _MIPS_SIM == _ABIO32
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in a0 ($4)
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  .set push
-  .set noat
-  .set noreorder
-  .set nomacro
-  sw    $1, (4 * 1)($4)
-  sw    $2, (4 * 2)($4)
-  sw    $3, (4 * 3)($4)
-  sw    $4, (4 * 4)($4)
-  sw    $5, (4 * 5)($4)
-  sw    $6, (4 * 6)($4)
-  sw    $7, (4 * 7)($4)
-  sw    $8, (4 * 8)($4)
-  sw    $9, (4 * 9)($4)
-  sw    $10, (4 * 10)($4)
-  sw    $11, (4 * 11)($4)
-  sw    $12, (4 * 12)($4)
-  sw    $13, (4 * 13)($4)
-  sw    $14, (4 * 14)($4)
-  sw    $15, (4 * 15)($4)
-  sw    $16, (4 * 16)($4)
-  sw    $17, (4 * 17)($4)
-  sw    $18, (4 * 18)($4)
-  sw    $19, (4 * 19)($4)
-  sw    $20, (4 * 20)($4)
-  sw    $21, (4 * 21)($4)
-  sw    $22, (4 * 22)($4)
-  sw    $23, (4 * 23)($4)
-  sw    $24, (4 * 24)($4)
-  sw    $25, (4 * 25)($4)
-  sw    $26, (4 * 26)($4)
-  sw    $27, (4 * 27)($4)
-  sw    $28, (4 * 28)($4)
-  sw    $29, (4 * 29)($4)
-  sw    $30, (4 * 30)($4)
-  sw    $31, (4 * 31)($4)
-  # Store return address to pc
-  sw    $31, (4 * 32)($4)
-#if __mips_isa_rev < 6
-  # hi and lo
-  mfhi  $8
-  sw    $8,  (4 * 33)($4)
-  mflo  $8
-  sw    $8,  (4 * 34)($4)
-#endif
-#ifdef __mips_hard_float
-#if __mips_fpr != 64
-  sdc1  $f0, (4 * 36 + 8 * 0)($4)
-  sdc1  $f2, (4 * 36 + 8 * 2)($4)
-  sdc1  $f4, (4 * 36 + 8 * 4)($4)
-  sdc1  $f6, (4 * 36 + 8 * 6)($4)
-  sdc1  $f8, (4 * 36 + 8 * 8)($4)
-  sdc1  $f10, (4 * 36 + 8 * 10)($4)
-  sdc1  $f12, (4 * 36 + 8 * 12)($4)
-  sdc1  $f14, (4 * 36 + 8 * 14)($4)
-  sdc1  $f16, (4 * 36 + 8 * 16)($4)
-  sdc1  $f18, (4 * 36 + 8 * 18)($4)
-  sdc1  $f20, (4 * 36 + 8 * 20)($4)
-  sdc1  $f22, (4 * 36 + 8 * 22)($4)
-  sdc1  $f24, (4 * 36 + 8 * 24)($4)
-  sdc1  $f26, (4 * 36 + 8 * 26)($4)
-  sdc1  $f28, (4 * 36 + 8 * 28)($4)
-  sdc1  $f30, (4 * 36 + 8 * 30)($4)
-#else
-  sdc1  $f0, (4 * 36 + 8 * 0)($4)
-  sdc1  $f1, (4 * 36 + 8 * 1)($4)
-  sdc1  $f2, (4 * 36 + 8 * 2)($4)
-  sdc1  $f3, (4 * 36 + 8 * 3)($4)
-  sdc1  $f4, (4 * 36 + 8 * 4)($4)
-  sdc1  $f5, (4 * 36 + 8 * 5)($4)
-  sdc1  $f6, (4 * 36 + 8 * 6)($4)
-  sdc1  $f7, (4 * 36 + 8 * 7)($4)
-  sdc1  $f8, (4 * 36 + 8 * 8)($4)
-  sdc1  $f9, (4 * 36 + 8 * 9)($4)
-  sdc1  $f10, (4 * 36 + 8 * 10)($4)
-  sdc1  $f11, (4 * 36 + 8 * 11)($4)
-  sdc1  $f12, (4 * 36 + 8 * 12)($4)
-  sdc1  $f13, (4 * 36 + 8 * 13)($4)
-  sdc1  $f14, (4 * 36 + 8 * 14)($4)
-  sdc1  $f15, (4 * 36 + 8 * 15)($4)
-  sdc1  $f16, (4 * 36 + 8 * 16)($4)
-  sdc1  $f17, (4 * 36 + 8 * 17)($4)
-  sdc1  $f18, (4 * 36 + 8 * 18)($4)
-  sdc1  $f19, (4 * 36 + 8 * 19)($4)
-  sdc1  $f20, (4 * 36 + 8 * 20)($4)
-  sdc1  $f21, (4 * 36 + 8 * 21)($4)
-  sdc1  $f22, (4 * 36 + 8 * 22)($4)
-  sdc1  $f23, (4 * 36 + 8 * 23)($4)
-  sdc1  $f24, (4 * 36 + 8 * 24)($4)
-  sdc1  $f25, (4 * 36 + 8 * 25)($4)
-  sdc1  $f26, (4 * 36 + 8 * 26)($4)
-  sdc1  $f27, (4 * 36 + 8 * 27)($4)
-  sdc1  $f28, (4 * 36 + 8 * 28)($4)
-  sdc1  $f29, (4 * 36 + 8 * 29)($4)
-  sdc1  $f30, (4 * 36 + 8 * 30)($4)
-  sdc1  $f31, (4 * 36 + 8 * 31)($4)
-#endif
-#endif
-  jr	$31
-  # return UNW_ESUCCESS
-  or    $2, $0, $0
-  .set pop
-
-#elif defined(__mips64)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in a0 ($4)
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  .set push
-  .set noat
-  .set noreorder
-  .set nomacro
-  .irp i,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-    sd $\i, (8 * \i)($4)
-  .endr
-  # Store return address to pc
-  sd    $31, (8 * 32)($4)
-#if __mips_isa_rev < 6
-  # hi and lo
-  mfhi  $8
-  sd    $8,  (8 * 33)($4)
-  mflo  $8
-  sd    $8,  (8 * 34)($4)
-#endif
-#ifdef __mips_hard_float
-  .irp i,FROM_0_TO_31
-    sdc1 $f\i, (280+8*\i)($4)
-  .endr
-#endif
-  jr	$31
-  # return UNW_ESUCCESS
-  or    $2, $0, $0
-  .set pop
-
-# elif defined(__mips__)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# Just trap for the time being.
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  teq $0, $0
-
-#elif defined(__powerpc64__)
-
-//
-// extern int __unw_getcontext(unw_context_t* thread_state)
-//
-// On entry:
-//  thread_state pointer is in r3
-//
-#if defined(_AIX)
-DEFINE_LIBUNWIND_FUNCTION_AND_WEAK_ALIAS(__unw_getcontext, unw_getcontext)
-#else
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-#endif
-// store register (GPR)
-#define PPC64_STR(n) \
-  std   n, (8 * (n + 2))(3)
-
-  // save GPRs
-  PPC64_STR(0)
-  mflr  0
-  std   0, PPC64_OFFS_SRR0(3) // store lr as ssr0
-  PPC64_STR(1)
-  PPC64_STR(4)        // Save r4 first since it will be used for fixing r2.
-#if defined(_AIX)
-  // The TOC register (r2) was changed by the glue code if unw_getcontext
-  // is called from a different module. Save the original TOC register
-  // in the context if this is the case.
-  mflr   4
-  lwz    4, 0(4)      // Get the first instruction at the return address.
-  xoris  0, 4, 0xe841 // Is it reloading the TOC register "ld 2,40(1)"?
-  cmplwi 0, 0x28
-  bne    0, LnoR2Fix  // No need to fix up r2 if it is not.
-  ld     2, 40(1)     // Use the saved TOC register in the stack.
-LnoR2Fix:
-#endif
-  PPC64_STR(2)
-  PPC64_STR(3)
-  PPC64_STR(5)
-  PPC64_STR(6)
-  PPC64_STR(7)
-  PPC64_STR(8)
-  PPC64_STR(9)
-  PPC64_STR(10)
-  PPC64_STR(11)
-  PPC64_STR(12)
-  PPC64_STR(13)
-  PPC64_STR(14)
-  PPC64_STR(15)
-  PPC64_STR(16)
-  PPC64_STR(17)
-  PPC64_STR(18)
-  PPC64_STR(19)
-  PPC64_STR(20)
-  PPC64_STR(21)
-  PPC64_STR(22)
-  PPC64_STR(23)
-  PPC64_STR(24)
-  PPC64_STR(25)
-  PPC64_STR(26)
-  PPC64_STR(27)
-  PPC64_STR(28)
-  PPC64_STR(29)
-  PPC64_STR(30)
-  PPC64_STR(31)
-
-  mfcr  0
-  std   0,  PPC64_OFFS_CR(3)
-  mfxer 0
-  std   0,  PPC64_OFFS_XER(3)
-#if defined(_AIX)
-  // LR value saved from the register is not used, initialize it to 0.
-  li    0,  0
-#else
-  mflr  0
-#endif
-  std   0,  PPC64_OFFS_LR(3)
-  mfctr 0
-  std   0,  PPC64_OFFS_CTR(3)
-  mfvrsave    0
-  std   0,  PPC64_OFFS_VRSAVE(3)
-
-#if defined(__VSX__)
-  // save VS registers
-  // (note that this also saves floating point registers and V registers,
-  // because part of VS is mapped to these registers)
-
-  addi  4, 3, PPC64_OFFS_FP
-
-// store VS register
-#ifdef __LITTLE_ENDIAN__
-// For little-endian targets, we need a swap since stxvd2x will store the
-// register in the incorrect doubleword order.
-// FIXME: when supporting targets older than Power9 on LE is no longer required
-//        this can be changed to simply `stxv n, 16 * n(4)`.
-#define PPC64_STVS(n)      \
-  xxswapd n, n            ;\
-  stxvd2x n, 0, 4         ;\
-  addi    4, 4, 16
-#else
-#define PPC64_STVS(n)      \
-  stxvd2x n, 0, 4         ;\
-  addi    4, 4, 16
-#endif
-
-  PPC64_STVS(0)
-  PPC64_STVS(1)
-  PPC64_STVS(2)
-  PPC64_STVS(3)
-  PPC64_STVS(4)
-  PPC64_STVS(5)
-  PPC64_STVS(6)
-  PPC64_STVS(7)
-  PPC64_STVS(8)
-  PPC64_STVS(9)
-  PPC64_STVS(10)
-  PPC64_STVS(11)
-  PPC64_STVS(12)
-  PPC64_STVS(13)
-  PPC64_STVS(14)
-  PPC64_STVS(15)
-  PPC64_STVS(16)
-  PPC64_STVS(17)
-  PPC64_STVS(18)
-  PPC64_STVS(19)
-  PPC64_STVS(20)
-  PPC64_STVS(21)
-  PPC64_STVS(22)
-  PPC64_STVS(23)
-  PPC64_STVS(24)
-  PPC64_STVS(25)
-  PPC64_STVS(26)
-  PPC64_STVS(27)
-  PPC64_STVS(28)
-  PPC64_STVS(29)
-  PPC64_STVS(30)
-  PPC64_STVS(31)
-  PPC64_STVS(32)
-  PPC64_STVS(33)
-  PPC64_STVS(34)
-  PPC64_STVS(35)
-  PPC64_STVS(36)
-  PPC64_STVS(37)
-  PPC64_STVS(38)
-  PPC64_STVS(39)
-  PPC64_STVS(40)
-  PPC64_STVS(41)
-  PPC64_STVS(42)
-  PPC64_STVS(43)
-  PPC64_STVS(44)
-  PPC64_STVS(45)
-  PPC64_STVS(46)
-  PPC64_STVS(47)
-  PPC64_STVS(48)
-  PPC64_STVS(49)
-  PPC64_STVS(50)
-  PPC64_STVS(51)
-  PPC64_STVS(52)
-  PPC64_STVS(53)
-  PPC64_STVS(54)
-  PPC64_STVS(55)
-  PPC64_STVS(56)
-  PPC64_STVS(57)
-  PPC64_STVS(58)
-  PPC64_STVS(59)
-  PPC64_STVS(60)
-  PPC64_STVS(61)
-  PPC64_STVS(62)
-  PPC64_STVS(63)
-
-#else
-
-// store FP register
-#define PPC64_STF(n) \
-  stfd  n, (PPC64_OFFS_FP + n * 16)(3)
-
-  // save float registers
-  PPC64_STF(0)
-  PPC64_STF(1)
-  PPC64_STF(2)
-  PPC64_STF(3)
-  PPC64_STF(4)
-  PPC64_STF(5)
-  PPC64_STF(6)
-  PPC64_STF(7)
-  PPC64_STF(8)
-  PPC64_STF(9)
-  PPC64_STF(10)
-  PPC64_STF(11)
-  PPC64_STF(12)
-  PPC64_STF(13)
-  PPC64_STF(14)
-  PPC64_STF(15)
-  PPC64_STF(16)
-  PPC64_STF(17)
-  PPC64_STF(18)
-  PPC64_STF(19)
-  PPC64_STF(20)
-  PPC64_STF(21)
-  PPC64_STF(22)
-  PPC64_STF(23)
-  PPC64_STF(24)
-  PPC64_STF(25)
-  PPC64_STF(26)
-  PPC64_STF(27)
-  PPC64_STF(28)
-  PPC64_STF(29)
-  PPC64_STF(30)
-  PPC64_STF(31)
-
-#if defined(__ALTIVEC__)
-  // save vector registers
-
-  // Use 16-bytes below the stack pointer as an
-  // aligned buffer to save each vector register.
-  // Note that the stack pointer is always 16-byte aligned.
-  subi  4, 1, 16
-
-#define PPC64_STV_UNALIGNED(n)             \
-  stvx  n, 0, 4                           ;\
-  ld    5, 0(4)                           ;\
-  std   5, (PPC64_OFFS_V + n * 16)(3)     ;\
-  ld    5, 8(4)                           ;\
-  std   5, (PPC64_OFFS_V + n * 16 + 8)(3)
-
-  PPC64_STV_UNALIGNED(0)
-  PPC64_STV_UNALIGNED(1)
-  PPC64_STV_UNALIGNED(2)
-  PPC64_STV_UNALIGNED(3)
-  PPC64_STV_UNALIGNED(4)
-  PPC64_STV_UNALIGNED(5)
-  PPC64_STV_UNALIGNED(6)
-  PPC64_STV_UNALIGNED(7)
-  PPC64_STV_UNALIGNED(8)
-  PPC64_STV_UNALIGNED(9)
-  PPC64_STV_UNALIGNED(10)
-  PPC64_STV_UNALIGNED(11)
-  PPC64_STV_UNALIGNED(12)
-  PPC64_STV_UNALIGNED(13)
-  PPC64_STV_UNALIGNED(14)
-  PPC64_STV_UNALIGNED(15)
-  PPC64_STV_UNALIGNED(16)
-  PPC64_STV_UNALIGNED(17)
-  PPC64_STV_UNALIGNED(18)
-  PPC64_STV_UNALIGNED(19)
-  PPC64_STV_UNALIGNED(20)
-  PPC64_STV_UNALIGNED(21)
-  PPC64_STV_UNALIGNED(22)
-  PPC64_STV_UNALIGNED(23)
-  PPC64_STV_UNALIGNED(24)
-  PPC64_STV_UNALIGNED(25)
-  PPC64_STV_UNALIGNED(26)
-  PPC64_STV_UNALIGNED(27)
-  PPC64_STV_UNALIGNED(28)
-  PPC64_STV_UNALIGNED(29)
-  PPC64_STV_UNALIGNED(30)
-  PPC64_STV_UNALIGNED(31)
-
-#endif
-#endif
-
-  li    3,  0   // return UNW_ESUCCESS
-  blr
-
-
-#elif defined(__powerpc__)
-
-//
-// extern int unw_getcontext(unw_context_t* thread_state)
-//
-// On entry:
-//  thread_state pointer is in r3
-//
-#if defined(_AIX)
-DEFINE_LIBUNWIND_FUNCTION_AND_WEAK_ALIAS(__unw_getcontext, unw_getcontext)
-#else
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-#endif
-  stw     0,   8(3)
-  mflr    0
-  stw     0,   0(3) // store lr as ssr0
-  stw     1,  12(3)
-  stw     4,  24(3) // Save r4 first since it will be used for fixing r2.
-#if defined(_AIX)
-  // The TOC register (r2) was changed by the glue code if unw_getcontext
-  // is called from a different module. Save the original TOC register
-  // in the context if this is the case.
-  mflr    4
-  lwz     4,  0(4)      // Get the instruction at the return address.
-  xoris   0,  4, 0x8041 // Is it reloading the TOC register "lwz 2,20(1)"?
-  cmplwi  0,  0x14
-  bne     0,  LnoR2Fix  // No need to fix up r2 if it is not.
-  lwz     2,  20(1)     // Use the saved TOC register in the stack.
-LnoR2Fix:
-#endif
-  stw     2,  16(3)
-  stw     3,  20(3)
-  stw     5,  28(3)
-  stw     6,  32(3)
-  stw     7,  36(3)
-  stw     8,  40(3)
-  stw     9,  44(3)
-  stw     10, 48(3)
-  stw     11, 52(3)
-  stw     12, 56(3)
-  stw     13, 60(3)
-  stw     14, 64(3)
-  stw     15, 68(3)
-  stw     16, 72(3)
-  stw     17, 76(3)
-  stw     18, 80(3)
-  stw     19, 84(3)
-  stw     20, 88(3)
-  stw     21, 92(3)
-  stw     22, 96(3)
-  stw     23,100(3)
-  stw     24,104(3)
-  stw     25,108(3)
-  stw     26,112(3)
-  stw     27,116(3)
-  stw     28,120(3)
-  stw     29,124(3)
-  stw     30,128(3)
-  stw     31,132(3)
-
-#if defined(__ALTIVEC__)
-  // save VRSave register
-  mfspr   0, 256
-  stw     0, 156(3)
-#endif
-  // save CR registers
-  mfcr    0
-  stw     0, 136(3)
-#if defined(_AIX)
-  // LR value from the register is not used, initialize it to 0.
-  li      0, 0
-  stw     0, 144(3)
-#endif
-  // save CTR register
-  mfctr   0
-  stw     0, 148(3)
-
-#if !defined(__NO_FPRS__)
-  // save float registers
-  stfd    0, 160(3)
-  stfd    1, 168(3)
-  stfd    2, 176(3)
-  stfd    3, 184(3)
-  stfd    4, 192(3)
-  stfd    5, 200(3)
-  stfd    6, 208(3)
-  stfd    7, 216(3)
-  stfd    8, 224(3)
-  stfd    9, 232(3)
-  stfd    10,240(3)
-  stfd    11,248(3)
-  stfd    12,256(3)
-  stfd    13,264(3)
-  stfd    14,272(3)
-  stfd    15,280(3)
-  stfd    16,288(3)
-  stfd    17,296(3)
-  stfd    18,304(3)
-  stfd    19,312(3)
-  stfd    20,320(3)
-  stfd    21,328(3)
-  stfd    22,336(3)
-  stfd    23,344(3)
-  stfd    24,352(3)
-  stfd    25,360(3)
-  stfd    26,368(3)
-  stfd    27,376(3)
-  stfd    28,384(3)
-  stfd    29,392(3)
-  stfd    30,400(3)
-  stfd    31,408(3)
-#endif
-
-#if defined(__ALTIVEC__)
-  // save vector registers
-
-  subi    4, 1, 16
-  rlwinm  4, 4, 0, 0, 27  // mask low 4-bits
-  // r4 is now a 16-byte aligned pointer into the red zone
-
-#define SAVE_VECTOR_UNALIGNED(_vec, _offset) \
-  stvx    _vec, 0, 4               SEPARATOR \
-  lwz     5, 0(4)                  SEPARATOR \
-  stw     5, _offset(3)            SEPARATOR \
-  lwz     5, 4(4)                  SEPARATOR \
-  stw     5, _offset+4(3)          SEPARATOR \
-  lwz     5, 8(4)                  SEPARATOR \
-  stw     5, _offset+8(3)          SEPARATOR \
-  lwz     5, 12(4)                 SEPARATOR \
-  stw     5, _offset+12(3)
-
-  SAVE_VECTOR_UNALIGNED( 0, 424+0x000)
-  SAVE_VECTOR_UNALIGNED( 1, 424+0x010)
-  SAVE_VECTOR_UNALIGNED( 2, 424+0x020)
-  SAVE_VECTOR_UNALIGNED( 3, 424+0x030)
-  SAVE_VECTOR_UNALIGNED( 4, 424+0x040)
-  SAVE_VECTOR_UNALIGNED( 5, 424+0x050)
-  SAVE_VECTOR_UNALIGNED( 6, 424+0x060)
-  SAVE_VECTOR_UNALIGNED( 7, 424+0x070)
-  SAVE_VECTOR_UNALIGNED( 8, 424+0x080)
-  SAVE_VECTOR_UNALIGNED( 9, 424+0x090)
-  SAVE_VECTOR_UNALIGNED(10, 424+0x0A0)
-  SAVE_VECTOR_UNALIGNED(11, 424+0x0B0)
-  SAVE_VECTOR_UNALIGNED(12, 424+0x0C0)
-  SAVE_VECTOR_UNALIGNED(13, 424+0x0D0)
-  SAVE_VECTOR_UNALIGNED(14, 424+0x0E0)
-  SAVE_VECTOR_UNALIGNED(15, 424+0x0F0)
-  SAVE_VECTOR_UNALIGNED(16, 424+0x100)
-  SAVE_VECTOR_UNALIGNED(17, 424+0x110)
-  SAVE_VECTOR_UNALIGNED(18, 424+0x120)
-  SAVE_VECTOR_UNALIGNED(19, 424+0x130)
-  SAVE_VECTOR_UNALIGNED(20, 424+0x140)
-  SAVE_VECTOR_UNALIGNED(21, 424+0x150)
-  SAVE_VECTOR_UNALIGNED(22, 424+0x160)
-  SAVE_VECTOR_UNALIGNED(23, 424+0x170)
-  SAVE_VECTOR_UNALIGNED(24, 424+0x180)
-  SAVE_VECTOR_UNALIGNED(25, 424+0x190)
-  SAVE_VECTOR_UNALIGNED(26, 424+0x1A0)
-  SAVE_VECTOR_UNALIGNED(27, 424+0x1B0)
-  SAVE_VECTOR_UNALIGNED(28, 424+0x1C0)
-  SAVE_VECTOR_UNALIGNED(29, 424+0x1D0)
-  SAVE_VECTOR_UNALIGNED(30, 424+0x1E0)
-  SAVE_VECTOR_UNALIGNED(31, 424+0x1F0)
-#endif
-
-  li      3, 0  // return UNW_ESUCCESS
-  blr
-
-
-#elif defined(__aarch64__)
-
-//
-// extern int __unw_getcontext(unw_context_t* thread_state)
-//
-// On entry:
-//  thread_state pointer is in x0
-//
-  .p2align 2
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  stp    x0, x1,  [x0, #0x000]
-  stp    x2, x3,  [x0, #0x010]
-  stp    x4, x5,  [x0, #0x020]
-  stp    x6, x7,  [x0, #0x030]
-  stp    x8, x9,  [x0, #0x040]
-  stp    x10,x11, [x0, #0x050]
-  stp    x12,x13, [x0, #0x060]
-  stp    x14,x15, [x0, #0x070]
-  stp    x16,x17, [x0, #0x080]
-  stp    x18,x19, [x0, #0x090]
-  stp    x20,x21, [x0, #0x0A0]
-  stp    x22,x23, [x0, #0x0B0]
-  stp    x24,x25, [x0, #0x0C0]
-  stp    x26,x27, [x0, #0x0D0]
-  stp    x28,x29, [x0, #0x0E0]
-  str    x30,     [x0, #0x0F0]
-  mov    x1,sp
-  str    x1,      [x0, #0x0F8]
-  str    x30,     [x0, #0x100]    // store return address as pc
-  // skip cpsr
-  stp    d0, d1,  [x0, #0x110]
-  stp    d2, d3,  [x0, #0x120]
-  stp    d4, d5,  [x0, #0x130]
-  stp    d6, d7,  [x0, #0x140]
-  stp    d8, d9,  [x0, #0x150]
-  stp    d10,d11, [x0, #0x160]
-  stp    d12,d13, [x0, #0x170]
-  stp    d14,d15, [x0, #0x180]
-  stp    d16,d17, [x0, #0x190]
-  stp    d18,d19, [x0, #0x1A0]
-  stp    d20,d21, [x0, #0x1B0]
-  stp    d22,d23, [x0, #0x1C0]
-  stp    d24,d25, [x0, #0x1D0]
-  stp    d26,d27, [x0, #0x1E0]
-  stp    d28,d29, [x0, #0x1F0]
-  str    d30,     [x0, #0x200]
-  str    d31,     [x0, #0x208]
-  mov    x0, #0                   // return UNW_ESUCCESS
-  ret
-
-#elif defined(__arm__) && !defined(__APPLE__)
-
-#if !defined(__ARM_ARCH_ISA_ARM)
-#if (__ARM_ARCH_ISA_THUMB == 2)
-  .syntax unified
-#endif
-  .thumb
-#endif
-
-@
-@ extern int __unw_getcontext(unw_context_t* thread_state)
-@
-@ On entry:
-@  thread_state pointer is in r0
-@
-@ Per EHABI #4.7 this only saves the core integer registers.
-@ EHABI #7.4.5 notes that in general all VRS registers should be restored
-@ however this is very hard to do for VFP registers because it is unknown
-@ to the library how many registers are implemented by the architecture.
-@ Instead, VFP registers are demand saved by logic external to __unw_getcontext.
-@
-  .p2align 2
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-#if !defined(__ARM_ARCH_ISA_ARM) && __ARM_ARCH_ISA_THUMB == 1
-  stm r0!, {r0-r7}
-  mov r1, r8
-  mov r2, r9
-  mov r3, r10
-  stm r0!, {r1-r3}
-  mov r1, r11
-  mov r2, sp
-  mov r3, lr
-  str r1, [r0, #0]   @ r11
-  @ r12 does not need storing, it it the intra-procedure-call scratch register
-  str r2, [r0, #8]   @ sp
-  str r3, [r0, #12]  @ lr
-  str r3, [r0, #16]  @ store return address as pc
-  @ T1 does not have a non-cpsr-clobbering register-zeroing instruction.
-  @ It is safe to use here though because we are about to return, and cpsr is
-  @ not expected to be preserved.
-  movs r0, #0        @ return UNW_ESUCCESS
-#else
-  @ 32bit thumb-2 restrictions for stm:
-  @ . the sp (r13) cannot be in the list
-  @ . the pc (r15) cannot be in the list in an STM instruction
-  stm r0, {r0-r12}
-  str sp, [r0, #52]
-  str lr, [r0, #56]
-  str lr, [r0, #60]  @ store return address as pc
-  mov r0, #0         @ return UNW_ESUCCESS
-#endif
-  JMP(lr)
-
-@
-@ static void libunwind::Registers_arm::saveVFPWithFSTMD(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .fpu vfpv3-d16
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm16saveVFPWithFSTMDEPv)
-  vstmia r0, {d0-d15}
-  JMP(lr)
-
-@
-@ static void libunwind::Registers_arm::saveVFPWithFSTMX(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .fpu vfpv3-d16
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm16saveVFPWithFSTMXEPv)
-  vstmia r0, {d0-d15} @ fstmiax is deprecated in ARMv7+ and now behaves like vstmia
-  JMP(lr)
-
-@
-@ static void libunwind::Registers_arm::saveVFPv3(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .fpu vfpv3
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm9saveVFPv3EPv)
-  @ VFP and iwMMX instructions are only available when compiling with the flags
-  @ that enable them. We do not want to do that in the library (because we do not
-  @ want the compiler to generate instructions that access those) but this is
-  @ only accessed if the personality routine needs these registers. Use of
-  @ these registers implies they are, actually, available on the target, so
-  @ it's ok to execute.
-  @ So, generate the instructions using the corresponding coprocessor mnemonic.
-  vstmia r0, {d16-d31}
-  JMP(lr)
-
-#if defined(_LIBUNWIND_ARM_WMMX)
-
-@
-@ static void libunwind::Registers_arm::saveiWMMX(unw_fpreg_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .arch armv5te
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm9saveiWMMXEPv)
-  stcl p1, cr0, [r0], #8  @ wstrd wR0, [r0], #8
-  stcl p1, cr1, [r0], #8  @ wstrd wR1, [r0], #8
-  stcl p1, cr2, [r0], #8  @ wstrd wR2, [r0], #8
-  stcl p1, cr3, [r0], #8  @ wstrd wR3, [r0], #8
-  stcl p1, cr4, [r0], #8  @ wstrd wR4, [r0], #8
-  stcl p1, cr5, [r0], #8  @ wstrd wR5, [r0], #8
-  stcl p1, cr6, [r0], #8  @ wstrd wR6, [r0], #8
-  stcl p1, cr7, [r0], #8  @ wstrd wR7, [r0], #8
-  stcl p1, cr8, [r0], #8  @ wstrd wR8, [r0], #8
-  stcl p1, cr9, [r0], #8  @ wstrd wR9, [r0], #8
-  stcl p1, cr10, [r0], #8  @ wstrd wR10, [r0], #8
-  stcl p1, cr11, [r0], #8  @ wstrd wR11, [r0], #8
-  stcl p1, cr12, [r0], #8  @ wstrd wR12, [r0], #8
-  stcl p1, cr13, [r0], #8  @ wstrd wR13, [r0], #8
-  stcl p1, cr14, [r0], #8  @ wstrd wR14, [r0], #8
-  stcl p1, cr15, [r0], #8  @ wstrd wR15, [r0], #8
-  JMP(lr)
-
-@
-@ static void libunwind::Registers_arm::saveiWMMXControl(unw_uint32_t* values)
-@
-@ On entry:
-@  values pointer is in r0
-@
-  .p2align 2
-#if defined(__ELF__)
-  .arch armv5te
-#endif
-DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_arm16saveiWMMXControlEPj)
-  stc2 p1, cr8, [r0], #4  @ wstrw wCGR0, [r0], #4
-  stc2 p1, cr9, [r0], #4  @ wstrw wCGR1, [r0], #4
-  stc2 p1, cr10, [r0], #4  @ wstrw wCGR2, [r0], #4
-  stc2 p1, cr11, [r0], #4  @ wstrw wCGR3, [r0], #4
-  JMP(lr)
-
-#endif
-
-#elif defined(__or1k__)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in r3
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  l.sw       0(r3), r0
-  l.sw       4(r3), r1
-  l.sw       8(r3), r2
-  l.sw      12(r3), r3
-  l.sw      16(r3), r4
-  l.sw      20(r3), r5
-  l.sw      24(r3), r6
-  l.sw      28(r3), r7
-  l.sw      32(r3), r8
-  l.sw      36(r3), r9
-  l.sw      40(r3), r10
-  l.sw      44(r3), r11
-  l.sw      48(r3), r12
-  l.sw      52(r3), r13
-  l.sw      56(r3), r14
-  l.sw      60(r3), r15
-  l.sw      64(r3), r16
-  l.sw      68(r3), r17
-  l.sw      72(r3), r18
-  l.sw      76(r3), r19
-  l.sw      80(r3), r20
-  l.sw      84(r3), r21
-  l.sw      88(r3), r22
-  l.sw      92(r3), r23
-  l.sw      96(r3), r24
-  l.sw     100(r3), r25
-  l.sw     104(r3), r26
-  l.sw     108(r3), r27
-  l.sw     112(r3), r28
-  l.sw     116(r3), r29
-  l.sw     120(r3), r30
-  l.sw     124(r3), r31
-  # store ra to pc
-  l.sw     128(r3), r9
-  # zero epcr
-  l.sw     132(r3), r0
-
-#elif defined(__hexagon__)
-#
-# extern int unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in r0
-#
-#define OFFSET(offset) (offset/4)
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  memw(r0+#32) = r8
-  memw(r0+#36) = r9
-  memw(r0+#40) = r10
-  memw(r0+#44) = r11
-
-  memw(r0+#48) = r12
-  memw(r0+#52) = r13
-  memw(r0+#56) = r14
-  memw(r0+#60) = r15
-
-  memw(r0+#64) = r16
-  memw(r0+#68) = r17
-  memw(r0+#72) = r18
-  memw(r0+#76) = r19
-
-  memw(r0+#80) = r20
-  memw(r0+#84) = r21
-  memw(r0+#88) = r22
-  memw(r0+#92) = r23
-
-  memw(r0+#96) = r24
-  memw(r0+#100) = r25
-  memw(r0+#104) = r26
-  memw(r0+#108) = r27
-
-  memw(r0+#112) = r28
-  memw(r0+#116) = r29
-  memw(r0+#120) = r30
-  memw(r0+#124) = r31
-  r1 = c4   // Predicate register
-  memw(r0+#128) = r1
-  r1 = memw(r30)           // *FP == Saved FP
-  r1 = r31
-  memw(r0+#132) = r1
-
-  jumpr r31
-
-#elif defined(__sparc__) && defined(__arch64__)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in %o0
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  .register %g2, #scratch
-  .register %g3, #scratch
-  .register %g6, #scratch
-  .register %g7, #scratch
-  stx  %g1, [%o0 + 0x08]
-  stx  %g2, [%o0 + 0x10]
-  stx  %g3, [%o0 + 0x18]
-  stx  %g4, [%o0 + 0x20]
-  stx  %g5, [%o0 + 0x28]
-  stx  %g6, [%o0 + 0x30]
-  stx  %g7, [%o0 + 0x38]
-  stx  %o0, [%o0 + 0x40]
-  stx  %o1, [%o0 + 0x48]
-  stx  %o2, [%o0 + 0x50]
-  stx  %o3, [%o0 + 0x58]
-  stx  %o4, [%o0 + 0x60]
-  stx  %o5, [%o0 + 0x68]
-  stx  %o6, [%o0 + 0x70]
-  stx  %o7, [%o0 + 0x78]
-  stx  %l0, [%o0 + 0x80]
-  stx  %l1, [%o0 + 0x88]
-  stx  %l2, [%o0 + 0x90]
-  stx  %l3, [%o0 + 0x98]
-  stx  %l4, [%o0 + 0xa0]
-  stx  %l5, [%o0 + 0xa8]
-  stx  %l6, [%o0 + 0xb0]
-  stx  %l7, [%o0 + 0xb8]
-  stx  %i0, [%o0 + 0xc0]
-  stx  %i1, [%o0 + 0xc8]
-  stx  %i2, [%o0 + 0xd0]
-  stx  %i3, [%o0 + 0xd8]
-  stx  %i4, [%o0 + 0xe0]
-  stx  %i5, [%o0 + 0xe8]
-  stx  %i6, [%o0 + 0xf0]
-  stx  %i7, [%o0 + 0xf8]
-
-  # save StackGhost cookie
-  mov  %i7, %g4
-  save %sp, -176, %sp
-  # register window flush necessary even without StackGhost
-  flushw
-  restore
-  ldx  [%sp + 2047 + 0x78], %g5
-  xor  %g4, %g5, %g4
-  stx  %g4, [%o0 + 0x100]
-  retl
-  # return UNW_ESUCCESS
-   clr %o0
-
-#elif defined(__sparc__)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in o0
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  ta 3
-  add %o7, 8, %o7
-  std %g0, [%o0 +   0]
-  std %g2, [%o0 +   8]
-  std %g4, [%o0 +  16]
-  std %g6, [%o0 +  24]
-  std %o0, [%o0 +  32]
-  std %o2, [%o0 +  40]
-  std %o4, [%o0 +  48]
-  std %o6, [%o0 +  56]
-  std %l0, [%o0 +  64]
-  std %l2, [%o0 +  72]
-  std %l4, [%o0 +  80]
-  std %l6, [%o0 +  88]
-  std %i0, [%o0 +  96]
-  std %i2, [%o0 + 104]
-  std %i4, [%o0 + 112]
-  std %i6, [%o0 + 120]
-  jmp %o7
-   clr %o0                   // return UNW_ESUCCESS
-
-#elif defined(__riscv)
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in a0
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  ISTORE    x1, (RISCV_ISIZE * 0)(a0) // store ra as pc
-#if defined(__riscv_32e)
-  .irp i,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-#else
-  .irp i,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-#endif
-    ISTORE x\i, (RISCV_ISIZE * \i)(a0)
-  .endr
-
-# if defined(__riscv_flen)
-  .irp i,FROM_0_TO_31
-    FSTORE f\i, (RISCV_FOFFSET + RISCV_FSIZE * \i)(a0)
-  .endr
-# endif
-
-  li     a0, 0  // return UNW_ESUCCESS
-  ret           // jump to ra
-
-#elif defined(__s390x__)
-
-//
-// extern int __unw_getcontext(unw_context_t* thread_state)
-//
-// On entry:
-//  thread_state pointer is in r2
-//
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-
-  // Save GPRs
-  stmg %r0, %r15, 16(%r2)
-
-  // Save PSWM
-  epsw %r0, %r1
-  stm %r0, %r1, 0(%r2)
-
-  // Store return address as PSWA
-  stg %r14, 8(%r2)
-
-  // Save FPRs
-  .irp i,FROM_0_TO_15
-    std %f\i, (144+8*\i)(%r2)
-  .endr
-
-  // Return UNW_ESUCCESS
-  lghi %r2, 0
-  br %r14
-
-#elif defined(__loongarch__) && __loongarch_grlen == 64
-
-#
-# extern int __unw_getcontext(unw_context_t* thread_state)
-#
-# On entry:
-#  thread_state pointer is in $a0($r4)
-#
-DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
-  .irp i,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-    st.d $r\i, $a0, (8*\i)
-  .endr
-  st.d    $r1,  $a0, (8 * 32) // store $ra to pc
-
-# if __loongarch_frlen == 64
-  .irp i,FROM_0_TO_31
-    fst.d $f\i, $a0, (8 * 33 + 8 * \i)
-  .endr
-# endif
-
-  move     $a0, $zero  // UNW_ESUCCESS
-  jr       $ra
-
-#endif
-
-  WEAK_ALIAS(__unw_getcontext, unw_getcontext)
-
-#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) */
-
-NO_EXEC_STACK_DIRECTIVE
diff --git a/third_party/libunwind/assembly.h b/third_party/libunwind/assembly.h
deleted file mode 100644
index f8e83e138..000000000
--- a/third_party/libunwind/assembly.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/* ===-- assembly.h - libUnwind assembler support macros -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- * ===----------------------------------------------------------------------===
- *
- * This file defines macros for use in libUnwind assembler source.
- * This file is not part of the interface of this library.
- *
- * ===----------------------------------------------------------------------===
- */
-
-#ifndef UNWIND_ASSEMBLY_H
-#define UNWIND_ASSEMBLY_H
-
-#if defined(__linux__) && defined(__CET__)
-#include <cet.h>
-#define _LIBUNWIND_CET_ENDBR _CET_ENDBR
-#else
-#define _LIBUNWIND_CET_ENDBR
-#endif
-
-#if defined(__powerpc64__)
-#define SEPARATOR ;
-#define PPC64_OFFS_SRR0   0
-#define PPC64_OFFS_CR     272
-#define PPC64_OFFS_XER    280
-#define PPC64_OFFS_LR     288
-#define PPC64_OFFS_CTR    296
-#define PPC64_OFFS_VRSAVE 304
-#define PPC64_OFFS_FP     312
-#define PPC64_OFFS_V      824
-#elif defined(__APPLE__) && defined(__aarch64__)
-#define SEPARATOR %%
-#elif defined(__riscv)
-# define RISCV_ISIZE (__riscv_xlen / 8)
-# define RISCV_FOFFSET (RISCV_ISIZE * 32)
-# if defined(__riscv_flen)
-#  define RISCV_FSIZE (__riscv_flen / 8)
-# endif
-
-# if __riscv_xlen == 64
-#  define ILOAD ld
-#  define ISTORE sd
-# elif __riscv_xlen == 32
-#  define ILOAD lw
-#  define ISTORE sw
-# else
-#  error "Unsupported __riscv_xlen"
-# endif
-
-# if defined(__riscv_flen)
-#  if __riscv_flen == 64
-#   define FLOAD fld
-#   define FSTORE fsd
-#  elif __riscv_flen == 32
-#   define FLOAD flw
-#   define FSTORE fsw
-#  else
-#   error "Unsupported __riscv_flen"
-#  endif
-# endif
-# define SEPARATOR ;
-#else
-#define SEPARATOR ;
-#endif
-
-#if defined(__powerpc64__) && (!defined(_CALL_ELF) || _CALL_ELF == 1) &&       \
-    !defined(_AIX)
-#define PPC64_OPD1 .section .opd,"aw",@progbits SEPARATOR
-#define PPC64_OPD2 SEPARATOR \
-  .p2align 3 SEPARATOR \
-  .quad .Lfunc_begin0 SEPARATOR \
-  .quad .TOC.@tocbase SEPARATOR \
-  .quad 0 SEPARATOR \
-  .text SEPARATOR \
-.Lfunc_begin0:
-#else
-#define PPC64_OPD1
-#define PPC64_OPD2
-#endif
-
-#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_GCS_DEFAULT) && defined(__ARM_FEATURE_BTI_DEFAULT)
-// Set BTI, PAC, and GCS gnu property bits
-#define GNU_PROPERTY 7
-// We indirectly branch to __libunwind_Registers_arm64_jumpto from
-// __unw_phase2_resume, so we need to use bti jc.
-#define AARCH64_BTI bti jc
-#elif defined(__ARM_FEATURE_GCS_DEFAULT)
-// Set GCS gnu property bit
-#define GNU_PROPERTY 4
-#elif defined(__ARM_FEATURE_BTI_DEFAULT)
-// Set BTI and PAC gnu property bits
-#define GNU_PROPERTY 3
-#define AARCH64_BTI bti c
-#endif
-#ifdef GNU_PROPERTY
-  .pushsection ".note.gnu.property", "a" SEPARATOR                             \
-  .balign 8 SEPARATOR                                                          \
-  .long 4 SEPARATOR                                                            \
-  .long 0x10 SEPARATOR                                                         \
-  .long 0x5 SEPARATOR                                                          \
-  .asciz "GNU" SEPARATOR                                                       \
-  .long 0xc0000000 SEPARATOR /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */          \
-  .long 4 SEPARATOR                                                            \
-  .long GNU_PROPERTY SEPARATOR                                                 \
-  .long 0 SEPARATOR                                                            \
-  .popsection SEPARATOR
-#endif
-#endif
-#if !defined(AARCH64_BTI)
-#define AARCH64_BTI
-#endif
-
-#if !defined(__aarch64__)
-#ifdef __ARM_FEATURE_PAC_DEFAULT
-  .eabi_attribute Tag_PAC_extension, 2
-  .eabi_attribute Tag_PACRET_use, 1
-#endif
-#ifdef __ARM_FEATURE_BTI_DEFAULT
-  .eabi_attribute Tag_BTI_extension, 1
-  .eabi_attribute Tag_BTI_use, 1
-#endif
-#endif
-
-#define GLUE2(a, b) a ## b
-#define GLUE(a, b) GLUE2(a, b)
-#define SYMBOL_NAME(name) GLUE(__USER_LABEL_PREFIX__, name)
-
-#if defined(__APPLE__)
-
-#define SYMBOL_IS_FUNC(name)
-#define HIDDEN_SYMBOL(name) .private_extern name
-#if defined(_LIBUNWIND_HIDE_SYMBOLS)
-#define EXPORT_SYMBOL(name) HIDDEN_SYMBOL(name)
-#else
-#define EXPORT_SYMBOL(name)
-#endif
-#define WEAK_ALIAS(name, aliasname)                                            \
-  .globl SYMBOL_NAME(aliasname) SEPARATOR                                      \
-  EXPORT_SYMBOL(SYMBOL_NAME(aliasname)) SEPARATOR                              \
-  SYMBOL_NAME(aliasname) = SYMBOL_NAME(name)
-
-#define NO_EXEC_STACK_DIRECTIVE
-
-#elif defined(__ELF__)
-
-#if defined(__arm__)
-#define SYMBOL_IS_FUNC(name) .type name,%function
-#else
-#define SYMBOL_IS_FUNC(name) .type name,@function
-#endif
-#define HIDDEN_SYMBOL(name) .hidden name
-#if defined(_LIBUNWIND_HIDE_SYMBOLS)
-#define EXPORT_SYMBOL(name) HIDDEN_SYMBOL(name)
-#else
-#define EXPORT_SYMBOL(name)
-#endif
-#define WEAK_SYMBOL(name) .weak name
-
-#if defined(__hexagon__)
-#define WEAK_ALIAS(name, aliasname)                                            \
-  EXPORT_SYMBOL(SYMBOL_NAME(aliasname)) SEPARATOR                              \
-  WEAK_SYMBOL(SYMBOL_NAME(aliasname)) SEPARATOR                                \
-  .equiv SYMBOL_NAME(aliasname), SYMBOL_NAME(name)
-#else
-#define WEAK_ALIAS(name, aliasname)                                            \
-  EXPORT_SYMBOL(SYMBOL_NAME(aliasname)) SEPARATOR                              \
-  WEAK_SYMBOL(SYMBOL_NAME(aliasname)) SEPARATOR                                \
-  SYMBOL_NAME(aliasname) = SYMBOL_NAME(name)
-#endif
-
-#if defined(__GNU__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
-    defined(__linux__)
-#define NO_EXEC_STACK_DIRECTIVE .section .note.GNU-stack,"",%progbits
-#else
-#define NO_EXEC_STACK_DIRECTIVE
-#endif
-
-#elif defined(_WIN32)
-
-#define SYMBOL_IS_FUNC(name)                                                   \
-  .def name SEPARATOR                                                          \
-    .scl 2 SEPARATOR                                                           \
-    .type 32 SEPARATOR                                                         \
-  .endef
-#define EXPORT_SYMBOL2(name)                                                   \
-  .section .drectve,"yn" SEPARATOR                                             \
-  .ascii "-export:", #name, "\0" SEPARATOR                                     \
-  .text
-#if defined(_LIBUNWIND_HIDE_SYMBOLS)
-#define EXPORT_SYMBOL(name)
-#else
-#define EXPORT_SYMBOL(name) EXPORT_SYMBOL2(name)
-#endif
-#define HIDDEN_SYMBOL(name)
-
-#if defined(__MINGW32__)
-#define WEAK_ALIAS(name, aliasname)                                            \
-  .globl SYMBOL_NAME(aliasname) SEPARATOR                                      \
-  EXPORT_SYMBOL(aliasname) SEPARATOR                                           \
-  SYMBOL_NAME(aliasname) = SYMBOL_NAME(name)
-#else
-#define WEAK_ALIAS3(name, aliasname)                                           \
-  .section .drectve,"yn" SEPARATOR                                             \
-  .ascii "-alternatename:", #aliasname, "=", #name, "\0" SEPARATOR             \
-  .text
-#define WEAK_ALIAS2(name, aliasname)                                           \
-  WEAK_ALIAS3(name, aliasname)
-#define WEAK_ALIAS(name, aliasname)                                            \
-  EXPORT_SYMBOL(SYMBOL_NAME(aliasname)) SEPARATOR                              \
-  WEAK_ALIAS2(SYMBOL_NAME(name), SYMBOL_NAME(aliasname))
-#endif
-
-#define NO_EXEC_STACK_DIRECTIVE
-
-#elif defined(__sparc__)
-
-#elif defined(_AIX)
-
-#if defined(__powerpc64__)
-#define VBYTE_LEN 8
-#define CSECT_ALIGN 3
-#else
-#define VBYTE_LEN 4
-#define CSECT_ALIGN 2
-#endif
-
-// clang-format off
-#define DEFINE_LIBUNWIND_FUNCTION_AND_WEAK_ALIAS(name, aliasname)              \
-  .csect .text[PR], 2 SEPARATOR                                                \
-  .csect .name[PR], 2 SEPARATOR                                                \
-  .globl name[DS] SEPARATOR                                                    \
-  .globl .name[PR] SEPARATOR                                                   \
-  .align 4 SEPARATOR                                                           \
-  .csect name[DS], CSECT_ALIGN SEPARATOR                                       \
-aliasname:                                                                     \
-  .vbyte VBYTE_LEN, .name[PR] SEPARATOR                                        \
-  .vbyte VBYTE_LEN, TOC[TC0] SEPARATOR                                         \
-  .vbyte VBYTE_LEN, 0 SEPARATOR                                                \
-  .weak  aliasname SEPARATOR                                                   \
-  .weak  .aliasname SEPARATOR                                                  \
-  .csect .name[PR], 2 SEPARATOR                                                \
-.aliasname:                                                                    \
-
-#define WEAK_ALIAS(name, aliasname)
-#define NO_EXEC_STACK_DIRECTIVE
-
-// clang-format on
-#else
-
-#error Unsupported target
-
-#endif
-
-#if defined(_AIX)
-  // clang-format off
-#define DEFINE_LIBUNWIND_FUNCTION(name)                                        \
-  .globl name[DS] SEPARATOR                                                    \
-  .globl .name SEPARATOR                                                       \
-  .align 4 SEPARATOR                                                           \
-  .csect name[DS], CSECT_ALIGN SEPARATOR                                       \
-  .vbyte VBYTE_LEN, .name SEPARATOR                                            \
-  .vbyte VBYTE_LEN, TOC[TC0] SEPARATOR                                         \
-  .vbyte VBYTE_LEN, 0 SEPARATOR                                                \
-  .csect .text[PR], 2 SEPARATOR                                                \
-.name:
-  // clang-format on
-#else
-#define DEFINE_LIBUNWIND_FUNCTION(name)                                        \
-  .globl SYMBOL_NAME(name) SEPARATOR                                           \
-  HIDDEN_SYMBOL(SYMBOL_NAME(name)) SEPARATOR                                   \
-  SYMBOL_IS_FUNC(SYMBOL_NAME(name)) SEPARATOR                                  \
-  PPC64_OPD1                                                                   \
-  SYMBOL_NAME(name):                                                           \
-  PPC64_OPD2                                                                   \
-  AARCH64_BTI
-#endif
-
-#if defined(__arm__)
-#if !defined(__ARM_ARCH)
-#define __ARM_ARCH 4
-#endif
-
-#if defined(__ARM_ARCH_4T__) || __ARM_ARCH >= 5
-#define ARM_HAS_BX
-#endif
-
-#ifdef ARM_HAS_BX
-#define JMP(r) bx r
-#else
-#define JMP(r) mov pc, r
-#endif
-#endif /* __arm__ */
-
-#if defined(__powerpc__)
-#define PPC_LEFT_SHIFT(index) << (index)
-#endif
-
-#endif /* UNWIND_ASSEMBLY_H */
diff --git a/third_party/libunwind/libunwind.cc b/third_party/libunwind/libunwind.cc
index 053b972a7..4f58a27a0 100644
--- a/third_party/libunwind/libunwind.cc
+++ b/third_party/libunwind/libunwind.cc
@@ -321,7 +321,7 @@ void __unw_remove_dynamic_fde(unw_word_t fde) {
 void __unw_add_dynamic_eh_frame_section(unw_word_t eh_frame_start) {
   // The eh_frame section start serves as the mh_group
   unw_word_t mh_group = eh_frame_start;
-  CFI_Parser<LocalAddressSpace>::CIE_Info cieInfo = {};
+  CFI_Parser<LocalAddressSpace>::CIE_Info cieInfo;
   CFI_Parser<LocalAddressSpace>::FDE_Info fdeInfo;
   auto p = (LocalAddressSpace::pint_t)eh_frame_start;
   while (true) {
diff --git a/third_party/linenoise/BUILD.mk b/third_party/linenoise/BUILD.mk
index 8ee501529..70414264f 100644
--- a/third_party/linenoise/BUILD.mk
+++ b/third_party/linenoise/BUILD.mk
@@ -19,17 +19,16 @@ THIRD_PARTY_LINENOISE_A_DIRECTDEPS =				\
 	LIBC_CALLS						\
 	LIBC_FMT						\
 	LIBC_INTRIN						\
-	LIBC_LOG						\
-	LIBC_MEM						\
 	LIBC_NEXGEN32E						\
-	LIBC_RUNTIME						\
+	LIBC_MEM						\
+	LIBC_SYSV						\
 	LIBC_SOCK						\
 	LIBC_STDIO						\
-	LIBC_STR						\
-	LIBC_SYSV						\
+	LIBC_RUNTIME						\
+	LIBC_LOG						\
 	LIBC_SYSV_CALLS						\
-	NET_HTTP						\
-	THIRD_PARTY_MUSL					\
+	LIBC_STR						\
+	NET_HTTP
 
 THIRD_PARTY_LINENOISE_A_DEPS :=					\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_LINENOISE_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/linenoise/linenoise.c b/third_party/linenoise/linenoise.c
index fffa024e4..94ecebb90 100644
--- a/third_party/linenoise/linenoise.c
+++ b/third_party/linenoise/linenoise.c
@@ -144,7 +144,7 @@
 #include "libc/intrin/strace.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/rdtsc.h"
@@ -155,7 +155,7 @@
 #include "libc/stdio/append.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/str/unicode.h"
 #include "libc/sysv/consts/fileno.h"
 #include "libc/sysv/consts/map.h"
@@ -408,7 +408,9 @@ static int linenoiseIsUnsupportedTerm(void) {
   char *term;
   static char once, res;
   if (!once) {
-    if ((term = getenv("TERM"))) {
+    if (IsWindows()) {
+      res = 1;
+    } else if ((term = getenv("TERM"))) {
       for (i = 0; i < sizeof(kUnsupported) / sizeof(*kUnsupported); i++) {
         if (!strcasecmp(term, kUnsupported[i])) {
           res = 1;
diff --git a/third_party/lua/BUILD.mk b/third_party/lua/BUILD.mk
index 1adb27977..6fe58a0c9 100644
--- a/third_party/lua/BUILD.mk
+++ b/third_party/lua/BUILD.mk
@@ -131,7 +131,6 @@ THIRD_PARTY_LUA_A_DIRECTDEPS =					\
 	LIBC_RUNTIME						\
 	LIBC_STDIO						\
 	LIBC_STR						\
-	LIBC_SYSTEM						\
 	LIBC_SYSV						\
 	LIBC_THREAD						\
 	LIBC_TINYMATH						\
@@ -140,8 +139,7 @@ THIRD_PARTY_LUA_A_DIRECTDEPS =					\
 	THIRD_PARTY_DOUBLECONVERSION				\
 	THIRD_PARTY_GDTOA					\
 	THIRD_PARTY_LINENOISE					\
-	THIRD_PARTY_MUSL					\
-	THIRD_PARTY_TZ						\
+	THIRD_PARTY_TZ
 
 THIRD_PARTY_LUA_A_DEPS :=					\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_LUA_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/lua/llock.c b/third_party/lua/llock.c
index 359140f55..9a0f0bfbb 100644
--- a/third_party/lua/llock.c
+++ b/third_party/lua/llock.c
@@ -19,16 +19,12 @@
 #include "libc/thread/thread.h"
 #include "third_party/lua/lrepl.h"
 
-static pthread_mutex_t lua_repl_lock_obj = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t lua_repl_lock_obj;
 
-void lua_repl_wock(void) {
-  lua_repl_lock_obj = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
-}
-
-void lua_repl_lock(void) {
+void(lua_repl_lock)(void) {
   pthread_mutex_lock(&lua_repl_lock_obj);
 }
 
-void lua_repl_unlock(void) {
+void(lua_repl_unlock)(void) {
   pthread_mutex_unlock(&lua_repl_lock_obj);
 }
diff --git a/third_party/lua/lrepl.c b/third_party/lua/lrepl.c
index 5201f1e7b..3312918e9 100644
--- a/third_party/lua/lrepl.c
+++ b/third_party/lua/lrepl.c
@@ -32,7 +32,7 @@
 #include "libc/errno.h"
 #include "libc/intrin/nomultics.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
@@ -335,7 +335,7 @@ void lua_initrepl(lua_State *L) {
     prompt = get_prompt(L, 1);
     if ((g_historypath = linenoiseGetHistoryPath(lua_progname))) {
       if (linenoiseHistoryLoad(g_historypath) == -1) {
-        fprintf(stderr, "%r%s: failed to load history: %m\n", g_historypath);
+        fprintf(stderr, "%r%s: failed to load history: %m%n", g_historypath);
         free(g_historypath);
         g_historypath = 0;
       }
diff --git a/third_party/lua/lrepl.h b/third_party/lua/lrepl.h
index 7d08b0730..a2294c5ca 100644
--- a/third_party/lua/lrepl.h
+++ b/third_party/lua/lrepl.h
@@ -11,7 +11,6 @@ extern struct linenoiseState *lua_repl_linenoise;
 extern linenoiseCompletionCallback *lua_repl_completions_callback;
 
 void lua_freerepl(void);
-void lua_repl_wock(void);
 void lua_repl_lock(void);
 void lua_repl_unlock(void);
 int lua_loadline(lua_State *);
diff --git a/third_party/lua/ltests.c b/third_party/lua/ltests.c
index 851284956..8a9a66347 100644
--- a/third_party/lua/ltests.c
+++ b/third_party/lua/ltests.c
@@ -28,39 +28,25 @@
 #define ltests_c
 #define LUA_CORE
 
-#include "lprefix.h"
-
-#include <limits.h>
-#include <setjmp.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "libc/mem/gc.h"
-#include "libc/log/log.h"
-
-#include "lua.h"
-
-#include "lapi.h"
-#include "lauxlib.h"
-#include "lcode.h"
-#include "lctype.h"
-#include "ldebug.h"
-#include "ldo.h"
-#include "lfunc.h"
-#include "lmem.h"
-#include "lopcodes.h"
-#include "lopnames.inc"
-#include "lprefix.h"
-#include "lstate.h"
-#include "lstring.h"
-#include "ltable.h"
-#include "lualib.h"
-#include "ltm.h"
-
+#include "third_party/lua/lapi.h"
+#include "third_party/lua/lauxlib.h"
+#include "third_party/lua/lcode.h"
+#include "third_party/lua/lctype.h"
+#include "third_party/lua/ldebug.h"
+#include "third_party/lua/ldo.h"
+#include "third_party/lua/lfunc.h"
+#include "third_party/lua/lmem.h"
+#include "third_party/lua/lopcodes.h"
+#include "third_party/lua/lopnames.inc"
+#include "third_party/lua/lprefix.h"
+#include "third_party/lua/lstate.h"
+#include "third_party/lua/lstring.h"
+#include "third_party/lua/ltable.h"
+#include "third_party/lua/lua.h"
+#include "third_party/lua/lualib.h"
 __static_yoink("lua_notice");
 
 
-
 /*
 ** The whole module only makes sense with LUA_DEBUG on
 */
diff --git a/third_party/lua/lua.h b/third_party/lua/lua.h
index 470e0f423..f17da40f3 100644
--- a/third_party/lua/lua.h
+++ b/third_party/lua/lua.h
@@ -133,14 +133,6 @@ typedef struct lua_Debug lua_Debug;
 typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar);
 
 
-/*
-** [jart] support ltests.h without unsafe LUA_USER_H kludge
-**        use `make MODE=dbg` to get this functionality
-*/
-#ifdef MODE_DBG
-#include "ltests.h"
-#endif
-
 /*
 ** generic extra include file
 */
diff --git a/third_party/lua/lua.main.c b/third_party/lua/lua.main.c
index 9a0cee129..469c5638f 100644
--- a/third_party/lua/lua.main.c
+++ b/third_party/lua/lua.main.c
@@ -50,8 +50,8 @@
 #include "third_party/lua/lrepl.h"
 #include "third_party/lua/lualib.h"
 #include "third_party/lua/lunix.h"
-#include "libc/cosmo.h"
 #include "libc/mem/leaks.h"
+#include "tool/args/args.h"
 __static_yoink("lua_notice");
 
 #if !defined(LUA_PROGNAME)
diff --git a/third_party/lua/luacallwithtrace.c b/third_party/lua/luacallwithtrace.c
index 5c0ac4d93..7dee8c79f 100644
--- a/third_party/lua/luacallwithtrace.c
+++ b/third_party/lua/luacallwithtrace.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "third_party/lua/cosmo.h"
 #include "third_party/lua/lauxlib.h"
 
diff --git a/third_party/lua/lunix.c b/third_party/lua/lunix.c
index f5007e414..8f9e8a966 100644
--- a/third_party/lua/lunix.c
+++ b/third_party/lua/lunix.c
@@ -49,7 +49,7 @@
 #include "libc/intrin/strace.h"
 #include "libc/limits.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/process.h"
 #include "libc/nt/runtime.h"
@@ -109,9 +109,7 @@
 #include "third_party/lua/lgc.h"
 #include "third_party/lua/lua.h"
 #include "third_party/lua/luaconf.h"
-#include "libc/sysv/consts/clock.h"
-#include "libc/cosmo.h"
-#include "libc/cosmo.h"
+#include "third_party/nsync/futex.internal.h"
 #include "tool/net/luacheck.h"
 
 #define DNS_NAME_MAX  253
@@ -1009,7 +1007,7 @@ static int LuaUnixOpen(lua_State *L) {
   return SysretInteger(
       L, "open", olderr,
       openat(luaL_optinteger(L, 4, AT_FDCWD), luaL_checkstring(L, 1),
-             luaL_optinteger(L, 2, O_RDONLY), luaL_optinteger(L, 3, 0644)));
+             luaL_optinteger(L, 2, O_RDONLY), luaL_optinteger(L, 3, 0)));
 }
 
 // unix.tmpfd()
@@ -2856,8 +2854,8 @@ static int LuaUnixMemoryWait(lua_State *L) {
     deadline = &ts;
   }
   BEGIN_CANCELATION_POINT;
-  rc = cosmo_futex_wait((atomic_int *)GetWord(L), expect,
-                         PTHREAD_PROCESS_SHARED, CLOCK_REALTIME, deadline);
+  rc = nsync_futex_wait_((atomic_int *)GetWord(L), expect,
+                         PTHREAD_PROCESS_SHARED, deadline);
   END_CANCELATION_POINT;
   if (rc < 0) errno = -rc, rc = -1;
   return SysretInteger(L, "futex_wait", olderr, rc);
@@ -2868,7 +2866,7 @@ static int LuaUnixMemoryWait(lua_State *L) {
 static int LuaUnixMemoryWake(lua_State *L) {
   int count, woken;
   count = luaL_optinteger(L, 3, INT_MAX);
-  woken = cosmo_futex_wake((atomic_int *)GetWord(L), count,
+  woken = nsync_futex_wake_((atomic_int *)GetWord(L), count,
                             PTHREAD_PROCESS_SHARED);
   npassert(woken >= 0);
   return ReturnInteger(L, woken);
@@ -2959,7 +2957,7 @@ static int LuaUnixMapshared(lua_State *L) {
   m->mapsize = c;
   m->lock = (pthread_mutex_t *)p;
   pthread_mutexattr_init(&mattr);
-  pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_DEFAULT);
+  pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_NORMAL);
   pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
   pthread_mutex_init(m->lock, &mattr);
   pthread_mutexattr_destroy(&mattr);
@@ -3440,14 +3438,12 @@ int LuaUnix(lua_State *L) {
   LuaSetIntField(L, "SHUT_RDWR", SHUT_RDWR);
 
   // recvfrom() / sendto() flags
-  LuaSetIntField(L, "MSG_OOB", MSG_OOB);
-  LuaSetIntField(L, "MSG_PEEK", MSG_PEEK);
-  LuaSetIntField(L, "MSG_DONTROUTE", MSG_DONTROUTE);
-  LuaSetIntField(L, "MSG_DONTWAIT", MSG_DONTWAIT);
-  LuaSetIntField(L, "MSG_NOSIGNAL", MSG_NOSIGNAL);
   LuaSetIntField(L, "MSG_WAITALL", MSG_WAITALL);
-  LuaSetIntField(L, "MSG_TRUNC", MSG_TRUNC);
-  LuaSetIntField(L, "MSG_CTRUNC", MSG_CTRUNC);
+  LuaSetIntField(L, "MSG_DONTROUTE", MSG_DONTROUTE);
+  LuaSetIntField(L, "MSG_PEEK", MSG_PEEK);
+  LuaSetIntField(L, "MSG_OOB", MSG_OOB);
+  LuaSetIntField(L, "MSG_NOSIGNAL", MSG_NOSIGNAL);
+  LuaSetIntField(L, "MSG_MORE", MSG_MORE);
 
   // readdir() type
   LuaSetIntField(L, "DT_UNKNOWN", DT_UNKNOWN);
diff --git a/third_party/lua/test/literals.lua b/third_party/lua/test/literals.lua
index 6394c4f88..4831534a2 100644
--- a/third_party/lua/test/literals.lua
+++ b/third_party/lua/test/literals.lua
@@ -294,31 +294,30 @@ end
 
 
 -- testing decimal point locale
--- <disabled by jart: doesn't play nice with musl locale faking>
--- if os.setlocale("pt_BR") or os.setlocale("ptb") then
---   assert(tonumber("3,4") == 3.4 and tonumber"3.4" == 3.4)
---   assert(tonumber("  -.4  ") == -0.4)
---   assert(tonumber("  +0x.41  ") == 0X0.41)
---   assert(not load("a = (3,4)"))
---   assert(assert(load("return 3.4"))() == 3.4)
---   assert(assert(load("return .4,3"))() == .4)
---   assert(assert(load("return 4."))() == 4.)
---   assert(assert(load("return 4.+.5"))() == 4.5)
+if os.setlocale("pt_BR") or os.setlocale("ptb") then
+  assert(tonumber("3,4") == 3.4 and tonumber"3.4" == 3.4)
+  assert(tonumber("  -.4  ") == -0.4)
+  assert(tonumber("  +0x.41  ") == 0X0.41)
+  assert(not load("a = (3,4)"))
+  assert(assert(load("return 3.4"))() == 3.4)
+  assert(assert(load("return .4,3"))() == .4)
+  assert(assert(load("return 4."))() == 4.)
+  assert(assert(load("return 4.+.5"))() == 4.5)
 
---   assert(" 0x.1 " + " 0x,1" + "-0X.1\t" == 0x0.1)
+  assert(" 0x.1 " + " 0x,1" + "-0X.1\t" == 0x0.1)
 
---   assert(not tonumber"inf" and not tonumber"NAN")
+  assert(not tonumber"inf" and not tonumber"NAN")
 
---   assert(assert(load(string.format("return %q", 4.51)))() == 4.51)
+  assert(assert(load(string.format("return %q", 4.51)))() == 4.51)
 
---   local a,b = load("return 4.5.")
---   assert(string.find(b, "'4%.5%.'"))
+  local a,b = load("return 4.5.")
+  assert(string.find(b, "'4%.5%.'"))
 
---   assert(os.setlocale("C"))
--- else
---   (Message or print)(
---    '\n >>> pt_BR locale not available: skipping decimal point tests <<<\n')
--- end
+  assert(os.setlocale("C"))
+else
+  (Message or print)(
+   '\n >>> pt_BR locale not available: skipping decimal point tests <<<\n')
+end
 
 
 -- testing %q x line ends
diff --git a/third_party/lua/test/strings.lua b/third_party/lua/test/strings.lua
index bc62cf192..3af86efd0 100644
--- a/third_party/lua/test/strings.lua
+++ b/third_party/lua/test/strings.lua
@@ -430,18 +430,14 @@ if not _port then
   end
 
   if trylocale("collate")  then
-    -- <disabled by jart: doesn't play nice with musl locale faking>
-    -- assert("alo" < "�lo" and "�lo" < "amo")
-    -- </disabled by jart>
+    assert("alo" < "�lo" and "�lo" < "amo")
   end
 
   if trylocale("ctype") then
-    -- <disabled by jart: doesn't play nice with musl locale faking>
-    -- assert(string.gsub("�����", "%a", "x") == "xxxxx")
-    -- assert(string.gsub("����", "%l", "x") == "x�x�")
-    -- assert(string.gsub("����", "%u", "x") == "�x�x")
-    -- assert(string.upper"���{xuxu}��o" == "���{XUXU}��O")
-    -- </disabled by jart>
+    assert(string.gsub("�����", "%a", "x") == "xxxxx")
+    assert(string.gsub("����", "%l", "x") == "x�x�")
+    assert(string.gsub("����", "%u", "x") == "�x�x")
+    assert(string.upper"���{xuxu}��o" == "���{XUXU}��O")
   end
 
   os.setlocale("C")
diff --git a/third_party/maxmind/getmetroname.c b/third_party/maxmind/getmetroname.c
index 79a6e1ca7..2326f2ddb 100644
--- a/third_party/maxmind/getmetroname.c
+++ b/third_party/maxmind/getmetroname.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "third_party/maxmind/maxminddb.h"
 
 const struct thatispacked MetroName {
diff --git a/third_party/mbedtls/bigmul.c b/third_party/mbedtls/bigmul.c
index 2233ea644..8d84456ae 100644
--- a/third_party/mbedtls/bigmul.c
+++ b/third_party/mbedtls/bigmul.c
@@ -17,7 +17,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/log/backtrace.internal.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "third_party/mbedtls/bignum.h"
diff --git a/third_party/mbedtls/bignum.c b/third_party/mbedtls/bignum.c
index 6bcf0f029..96f0eb1a1 100644
--- a/third_party/mbedtls/bignum.c
+++ b/third_party/mbedtls/bignum.c
@@ -19,7 +19,7 @@
 #include "libc/serialize.h"
 #include "libc/intrin/bsf.h"
 #include "libc/intrin/bswap.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/nexgen32e.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
diff --git a/third_party/mbedtls/bigshift.c b/third_party/mbedtls/bigshift.c
index 0c056a145..d6be87e91 100644
--- a/third_party/mbedtls/bigshift.c
+++ b/third_party/mbedtls/bigshift.c
@@ -15,7 +15,7 @@
 │ See the License for the specific language governing permissions and          │
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "third_party/mbedtls/bignum.h"
 #include "third_party/mbedtls/bignum_internal.h"
diff --git a/third_party/mbedtls/fastdiv.h b/third_party/mbedtls/fastdiv.h
index df177adfd..16d866a5c 100644
--- a/third_party/mbedtls/fastdiv.h
+++ b/third_party/mbedtls/fastdiv.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_THIRD_PARTY_MBEDTLS_FASTDIV_H_
 #define COSMOPOLITAN_THIRD_PARTY_MBEDTLS_FASTDIV_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 COSMOPOLITAN_C_START_
 
 struct Divisor {
diff --git a/third_party/mbedtls/formatclientciphers.c b/third_party/mbedtls/formatclientciphers.c
index 392373a0c..3b2b04c3b 100644
--- a/third_party/mbedtls/formatclientciphers.c
+++ b/third_party/mbedtls/formatclientciphers.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/serialize.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/stdio/append.h"
 #include "third_party/mbedtls/iana.h"
 
diff --git a/third_party/mbedtls/sha1.c b/third_party/mbedtls/sha1.c
index 193a4c3a5..7507a7445 100644
--- a/third_party/mbedtls/sha1.c
+++ b/third_party/mbedtls/sha1.c
@@ -17,7 +17,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/mbedtls/sha1.h"
 #include "libc/serialize.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/sha.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
diff --git a/third_party/mbedtls/sha256.c b/third_party/mbedtls/sha256.c
index 6112e6c77..03e979011 100644
--- a/third_party/mbedtls/sha256.c
+++ b/third_party/mbedtls/sha256.c
@@ -17,7 +17,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/mbedtls/sha256.h"
 #include "libc/dce.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/nexgen32e.h"
 #include "libc/nexgen32e/sha.h"
 #include "libc/nexgen32e/x86feature.h"
diff --git a/third_party/mbedtls/sha512.c b/third_party/mbedtls/sha512.c
index e4c551962..82469e893 100644
--- a/third_party/mbedtls/sha512.c
+++ b/third_party/mbedtls/sha512.c
@@ -17,7 +17,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/mbedtls/sha512.h"
 #include "libc/literal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/nexgen32e.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
diff --git a/third_party/mbedtls/sha512t.c b/third_party/mbedtls/sha512t.c
index 33180e110..5e4730831 100644
--- a/third_party/mbedtls/sha512t.c
+++ b/third_party/mbedtls/sha512t.c
@@ -15,7 +15,7 @@
 │ See the License for the specific language governing permissions and          │
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "third_party/mbedtls/platform.h"
 #include "third_party/mbedtls/sha512.h"
diff --git a/third_party/mbedtls/ssl_srv.c b/third_party/mbedtls/ssl_srv.c
index b58d403fe..5b2cbe2e8 100644
--- a/third_party/mbedtls/ssl_srv.c
+++ b/third_party/mbedtls/ssl_srv.c
@@ -16,7 +16,7 @@
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/time.h"
 #include "third_party/mbedtls/common.h"
diff --git a/third_party/mbedtls/test/test.inc b/third_party/mbedtls/test/test.inc
index 8c10d98da..f4a992d4e 100644
--- a/third_party/mbedtls/test/test.inc
+++ b/third_party/mbedtls/test/test.inc
@@ -2,7 +2,7 @@
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/exit.h"
 #include "third_party/mbedtls/config.h"
 #include "third_party/mbedtls/test/lib.h"
diff --git a/third_party/musl/BUILD.mk b/third_party/musl/BUILD.mk
index 940bd7cf9..b8b2c98fd 100644
--- a/third_party/musl/BUILD.mk
+++ b/third_party/musl/BUILD.mk
@@ -11,7 +11,6 @@ THIRD_PARTY_MUSL = $(THIRD_PARTY_MUSL_A_DEPS) $(THIRD_PARTY_MUSL_A)
 THIRD_PARTY_MUSL_A = o/$(MODE)/third_party/musl/musl.a
 THIRD_PARTY_MUSL_A_FILES := $(wildcard third_party/musl/*)
 THIRD_PARTY_MUSL_A_HDRS = $(filter %.h,$(THIRD_PARTY_MUSL_A_FILES))
-THIRD_PARTY_MUSL_A_INCS = $(filter %.inc,$(THIRD_PARTY_MUSL_A_FILES))
 THIRD_PARTY_MUSL_A_SRCS = $(filter %.c,$(THIRD_PARTY_MUSL_A_FILES))
 
 THIRD_PARTY_MUSL_A_OBJS =				\
@@ -31,8 +30,7 @@ THIRD_PARTY_MUSL_A_DIRECTDEPS =				\
 	LIBC_STR					\
 	LIBC_SYSV					\
 	LIBC_THREAD					\
-	THIRD_PARTY_TZ					\
-	THIRD_PARTY_ZLIB				\
+	THIRD_PARTY_ZLIB
 
 THIRD_PARTY_MUSL_A_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_MUSL_A_DIRECTDEPS),$($(x))))
@@ -61,8 +59,6 @@ $(THIRD_PARTY_MUSL_A_OBJS): private COPTS += -Wframe-larger-than=4096 -Walloca-l
 
 THIRD_PARTY_MUSL_LIBS = $(foreach x,$(THIRD_PARTY_MUSL_ARTIFACTS),$($(x)))
 THIRD_PARTY_MUSL_SRCS = $(foreach x,$(THIRD_PARTY_MUSL_ARTIFACTS),$($(x)_SRCS))
-THIRD_PARTY_MUSL_HDRS = $(foreach x,$(THIRD_PARTY_MUSL_ARTIFACTS),$($(x)_HDRS))
-THIRD_PARTY_MUSL_INCS = $(foreach x,$(THIRD_PARTY_MUSL_ARTIFACTS),$($(x)_INCS))
 THIRD_PARTY_MUSL_CHECKS = $(foreach x,$(THIRD_PARTY_MUSL_ARTIFACTS),$($(x)_CHECKS))
 THIRD_PARTY_MUSL_OBJS = $(foreach x,$(THIRD_PARTY_MUSL_ARTIFACTS),$($(x)_OBJS))
 $(THIRD_PARTY_MUSL_OBJS): third_party/musl/BUILD.mk
diff --git a/third_party/musl/__month_to_secs.c b/third_party/musl/__month_to_secs.c
deleted file mode 100644
index 43248fb3c..000000000
--- a/third_party/musl/__month_to_secs.c
+++ /dev/null
@@ -1,10 +0,0 @@
-int __month_to_secs(int month, int is_leap)
-{
-	static const int secs_through_month[] = {
-		0, 31*86400, 59*86400, 90*86400,
-		120*86400, 151*86400, 181*86400, 212*86400,
-		243*86400, 273*86400, 304*86400, 334*86400 };
-	int t = secs_through_month[month];
-	if (is_leap && month >= 2) t+=86400;
-	return t;
-}
diff --git a/third_party/musl/__secs_to_tm.c b/third_party/musl/__secs_to_tm.c
deleted file mode 100644
index 093d9021a..000000000
--- a/third_party/musl/__secs_to_tm.c
+++ /dev/null
@@ -1,82 +0,0 @@
-#include "time_impl.h"
-#include <limits.h>
-
-/* 2000-03-01 (mod 400 year, immediately after feb29 */
-#define LEAPOCH (946684800LL + 86400*(31+29))
-
-#define DAYS_PER_400Y (365*400 + 97)
-#define DAYS_PER_100Y (365*100 + 24)
-#define DAYS_PER_4Y   (365*4   + 1)
-
-int __secs_to_tm(long long t, struct tm *tm)
-{
-	long long days, secs, years;
-	int remdays, remsecs, remyears;
-	int qc_cycles, c_cycles, q_cycles;
-	int months;
-	int wday, yday, leap;
-	static const char days_in_month[] = {31,30,31,30,31,31,30,31,30,31,31,29};
-
-	/* Reject time_t values whose year would overflow int */
-	if (t < INT_MIN * 31622400LL || t > INT_MAX * 31622400LL)
-		return -1;
-
-	secs = t - LEAPOCH;
-	days = secs / 86400;
-	remsecs = secs % 86400;
-	if (remsecs < 0) {
-		remsecs += 86400;
-		days--;
-	}
-
-	wday = (3+days)%7;
-	if (wday < 0) wday += 7;
-
-	qc_cycles = days / DAYS_PER_400Y;
-	remdays = days % DAYS_PER_400Y;
-	if (remdays < 0) {
-		remdays += DAYS_PER_400Y;
-		qc_cycles--;
-	}
-
-	c_cycles = remdays / DAYS_PER_100Y;
-	if (c_cycles == 4) c_cycles--;
-	remdays -= c_cycles * DAYS_PER_100Y;
-
-	q_cycles = remdays / DAYS_PER_4Y;
-	if (q_cycles == 25) q_cycles--;
-	remdays -= q_cycles * DAYS_PER_4Y;
-
-	remyears = remdays / 365;
-	if (remyears == 4) remyears--;
-	remdays -= remyears * 365;
-
-	leap = !remyears && (q_cycles || !c_cycles);
-	yday = remdays + 31 + 28 + leap;
-	if (yday >= 365+leap) yday -= 365+leap;
-
-	years = remyears + 4*q_cycles + 100*c_cycles + 400LL*qc_cycles;
-
-	for (months=0; days_in_month[months] <= remdays; months++)
-		remdays -= days_in_month[months];
-
-	if (months >= 10) {
-		months -= 12;
-		years++;
-	}
-
-	if (years+100 > INT_MAX || years+100 < INT_MIN)
-		return -1;
-
-	tm->tm_year = years + 100;
-	tm->tm_mon = months + 2;
-	tm->tm_mday = remdays + 1;
-	tm->tm_wday = wday;
-	tm->tm_yday = yday;
-
-	tm->tm_hour = remsecs / 3600;
-	tm->tm_min = remsecs / 60 % 60;
-	tm->tm_sec = remsecs % 60;
-
-	return 0;
-}
diff --git a/third_party/musl/__tm_to_secs.c b/third_party/musl/__tm_to_secs.c
deleted file mode 100644
index c29fa985a..000000000
--- a/third_party/musl/__tm_to_secs.c
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "time_impl.h"
-
-long long __tm_to_secs(const struct tm *tm)
-{
-	int is_leap;
-	long long year = tm->tm_year;
-	int month = tm->tm_mon;
-	if (month >= 12 || month < 0) {
-		int adj = month / 12;
-		month %= 12;
-		if (month < 0) {
-			adj--;
-			month += 12;
-		}
-		year += adj;
-	}
-	long long t = __year_to_secs(year, &is_leap);
-	t += __month_to_secs(month, is_leap);
-	t += 86400LL * (tm->tm_mday-1);
-	t += 3600LL * tm->tm_hour;
-	t += 60LL * tm->tm_min;
-	t += tm->tm_sec;
-	return t;
-}
diff --git a/third_party/musl/__year_to_secs.c b/third_party/musl/__year_to_secs.c
deleted file mode 100644
index b42f5a6d2..000000000
--- a/third_party/musl/__year_to_secs.c
+++ /dev/null
@@ -1,47 +0,0 @@
-long long __year_to_secs(long long year, int *is_leap)
-{
-	if (year-2ULL <= 136) {
-		int y = year;
-		int leaps = (y-68)>>2;
-		if (!((y-68)&3)) {
-			leaps--;
-			if (is_leap) *is_leap = 1;
-		} else if (is_leap) *is_leap = 0;
-		return 31536000*(y-70) + 86400*leaps;
-	}
-
-	int cycles, centuries, leaps, rem, dummy;
-
-	if (!is_leap) is_leap = &dummy;
-	cycles = (year-100) / 400;
-	rem = (year-100) % 400;
-	if (rem < 0) {
-		cycles--;
-		rem += 400;
-	}
-	if (!rem) {
-		*is_leap = 1;
-		centuries = 0;
-		leaps = 0;
-	} else {
-		if (rem >= 200) {
-			if (rem >= 300) centuries = 3, rem -= 300;
-			else centuries = 2, rem -= 200;
-		} else {
-			if (rem >= 100) centuries = 1, rem -= 100;
-			else centuries = 0;
-		}
-		if (!rem) {
-			*is_leap = 0;
-			leaps = 0;
-		} else {
-			leaps = rem / 4U;
-			rem %= 4U;
-			*is_leap = !rem;
-		}
-	}
-
-	leaps += 97*cycles + 24*centuries - *is_leap;
-
-	return (year-100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
-}
diff --git a/third_party/musl/alpha.inc b/third_party/musl/alpha.inc
deleted file mode 100644
index 4167f3876..000000000
--- a/third_party/musl/alpha.inc
+++ /dev/null
@@ -1,172 +0,0 @@
-18,17,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,17,34,35,36,17,37,38,39,40,
-41,42,43,44,17,45,46,47,16,16,48,16,16,16,16,16,16,16,49,50,51,16,52,53,16,16,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,54,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,55,17,17,17,17,56,17,57,58,59,60,61,62,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,63,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,64,65,17,66,67,
-68,69,70,71,72,73,74,17,75,76,77,78,79,80,81,16,82,83,84,85,86,87,88,89,90,91,
-92,93,16,94,95,96,16,17,17,17,97,98,99,16,16,16,16,16,16,16,16,16,16,17,17,17,
-17,100,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,17,101,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,17,17,102,103,16,16,104,105,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
-17,17,17,17,17,17,17,17,17,106,17,17,107,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,
-108,109,16,16,16,16,16,16,16,16,16,110,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,111,112,113,114,16,16,16,16,16,16,16,16,115,116,
-117,16,16,16,16,16,118,119,16,16,16,16,120,16,16,121,16,16,16,16,16,16,16,16,
-16,16,16,16,16,
-16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,254,255,255,7,254,
-255,255,7,0,0,0,0,0,4,32,4,255,255,127,255,255,255,127,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,195,255,3,0,31,80,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,223,188,64,215,255,255,
-251,255,255,255,255,255,255,255,255,255,191,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,3,252,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,254,255,255,255,127,2,255,255,255,
-255,255,1,0,0,0,0,255,191,182,0,255,255,255,135,7,0,0,0,255,7,255,255,255,255,
-255,255,255,254,255,195,255,255,255,255,255,255,255,255,255,255,255,255,239,
-31,254,225,255,
-159,0,0,255,255,255,255,255,255,0,224,255,255,255,255,255,255,255,255,255,255,
-255,255,3,0,255,255,255,255,255,7,48,4,255,255,255,252,255,31,0,0,255,255,255,
-1,255,7,0,0,0,0,0,0,255,255,223,63,0,0,240,255,248,3,255,255,255,255,255,255,
-255,255,255,239,255,223,225,255,207,255,254,255,239,159,249,255,255,253,197,
-227,159,89,128,176,207,255,3,16,238,135,249,255,255,253,109,195,135,25,2,94,
-192,255,63,0,238,191,251,255,255,253,237,227,191,27,1,0,207,255,0,30,238,159,
-249,255,255,253,237,227,159,25,192,176,207,255,2,0,236,199,61,214,24,199,255,
-195,199,29,129,0,192,255,0,0,239,223,253,255,255,253,255,227,223,29,96,7,207,
-255,0,0,239,223,253,255,255,253,239,227,223,29,96,64,207,255,6,0,239,223,253,
-255,255,255,255,231,223,93,240,128,207,255,0,252,236,255,127,252,255,255,251,
-47,127,128,95,255,192,255,12,0,254,255,255,255,255,127,255,7,63,32,255,3,0,0,
-0,0,214,247,255,255,175,255,255,59,95,32,255,243,0,0,0,
-0,1,0,0,0,255,3,0,0,255,254,255,255,255,31,254,255,3,255,255,254,255,255,255,
-31,0,0,0,0,0,0,0,0,255,255,255,255,255,255,127,249,255,3,255,255,255,255,255,
-255,255,255,255,63,255,255,255,255,191,32,255,255,255,255,255,247,255,255,255,
-255,255,255,255,255,255,61,127,61,255,255,255,255,255,61,255,255,255,255,61,
-127,61,255,127,255,255,255,255,255,255,255,61,255,255,255,255,255,255,255,255,
-7,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,63,63,254,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,159,255,255,254,255,255,7,255,255,255,255,255,255,255,255,
-255,199,255,1,255,223,15,0,255,255,15,0,255,255,15,0,255,223,13,0,255,255,255,
-255,255,255,207,255,255,1,128,16,255,3,0,0,0,0,255,3,255,255,255,255,255,255,
-255,255,255,255,255,1,255,255,255,255,255,7,255,255,255,255,255,255,255,255,
-63,
-0,255,255,255,127,255,15,255,1,192,255,255,255,255,63,31,0,255,255,255,255,
-255,15,255,255,255,3,255,3,0,0,0,0,255,255,255,15,255,255,255,255,255,255,255,
-127,254,255,31,0,255,3,255,3,128,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,
-255,239,255,239,15,255,3,0,0,0,0,255,255,255,255,255,243,255,255,255,255,255,
-255,191,255,3,0,255,255,255,255,255,255,127,0,255,227,255,255,255,255,255,63,
-255,1,255,255,255,255,255,231,0,0,0,0,0,222,111,4,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,
-128,255,31,0,255,255,63,63,255,255,255,255,63,63,255,170,255,255,255,63,255,
-255,255,255,255,255,223,95,220,31,207,15,255,31,220,31,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,2,128,0,0,255,31,0,0,0,0,0,0,0,0,0,0,0,0,132,252,47,62,80,189,255,243,
-224,67,0,0,255,255,255,255,255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,255,255,255,255,255,255,3,0,
-0,255,255,255,255,255,127,255,255,255,255,255,127,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,31,120,12,0,255,255,255,255,191,32,255,
-255,255,255,255,255,255,128,0,0,255,255,127,0,127,127,127,127,127,127,127,127,
-255,255,255,255,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,224,0,0,0,254,3,62,31,254,255,255,255,255,255,255,255,255,255,127,224,254,
-255,255,255,255,255,255,255,255,255,255,247,224,255,255,255,255,255,254,255,
-255,255,255,255,255,255,255,255,255,127,0,0,255,255,255,7,0,0,0,0,0,0,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,63,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,
-0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,31,0,0,
-0,0,0,0,0,0,255,255,255,255,255,63,255,31,255,255,255,15,0,0,255,255,255,255,
-255,127,240,143,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,
-0,128,255,252,255,255,255,255,255,255,255,255,255,255,255,255,249,255,255,255,
-255,255,255,124,0,0,0,0,0,128,255,191,255,255,255,255,0,0,0,255,255,255,255,
-255,255,15,0,255,255,255,255,255,255,255,255,47,0,255,3,0,0,252,232,255,255,
-255,255,255,7,255,255,255,255,7,0,255,255,255,31,255,255,255,255,255,255,247,
-255,0,128,255,3,255,255,255,127,255,255,255,255,255,255,127,0,255,63,255,3,
-255,255,127,252,255,255,255,255,255,255,255,127,5,0,0,56,255,255,60,0,126,126,
-126,0,127,127,255,255,255,255,255,247,255,0,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,7,255,3,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,15,0,255,255,127,248,255,255,255,255,
-255,
-15,255,255,255,255,255,255,255,255,255,255,255,255,255,63,255,255,255,255,255,
-255,255,255,255,255,255,255,255,3,0,0,0,0,127,0,248,224,255,253,127,95,219,
-255,255,255,255,255,255,255,255,255,255,255,255,255,3,0,0,0,248,255,255,255,
-255,255,255,255,255,255,255,255,255,63,0,0,255,255,255,255,255,255,255,255,
-252,255,255,255,255,255,255,0,0,0,0,0,255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,223,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,31,0,0,255,3,
-254,255,255,7,254,255,255,7,192,255,255,255,255,255,255,255,255,255,255,127,
-252,252,252,28,0,0,0,0,255,239,255,255,127,255,255,183,255,63,255,63,0,0,0,0,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7,0,0,0,0,0,0,0,0,
-255,255,255,255,255,255,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,255,255,255,31,255,255,255,255,255,255,1,0,0,0,0,
-0,255,255,255,255,0,224,255,255,255,7,255,255,255,255,255,7,255,255,255,63,
-255,255,255,255,15,255,62,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,63,255,3,255,255,255,255,15,255,255,255,
-255,15,255,255,255,255,255,0,255,255,255,255,255,255,15,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,255,255,255,255,255,255,127,0,255,255,63,0,255,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,63,253,255,255,255,255,191,145,255,255,63,0,255,255,
-127,0,255,255,255,127,0,0,0,0,0,0,0,0,255,255,55,0,255,255,63,0,255,255,255,3,
-0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,192,0,0,0,0,0,0,0,0,111,240,239,
-254,255,255,63,0,0,0,0,0,255,255,255,31,255,255,255,31,0,0,0,0,255,254,255,
-255,31,0,0,0,255,255,255,255,255,255,63,0,255,255,63,0,255,255,7,0,255,255,3,
-0,0,0,0,0,0,0,0,0,0,0,0,
-0,255,255,255,255,255,255,255,255,255,1,0,0,0,0,0,0,255,255,255,255,255,255,7,
-0,255,255,255,255,255,255,7,0,255,255,255,255,255,0,255,3,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,31,128,0,255,255,63,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,255,255,127,0,255,255,255,255,255,255,255,255,63,0,0,0,
-192,255,0,0,252,255,255,255,255,255,255,1,0,0,255,255,255,1,255,3,255,255,255,
-255,255,255,199,255,112,0,255,255,255,255,71,0,255,255,255,255,255,255,255,
-255,30,0,255,23,0,0,0,0,255,255,251,255,255,255,159,64,0,0,0,0,0,0,0,0,127,
-189,255,191,255,1,255,255,255,255,255,255,255,1,255,3,239,159,249,255,255,253,
-237,227,159,25,129,224,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,
-255,255,255,255,255,187,7,255,131,0,0,0,0,255,255,255,255,255,255,255,255,179,
-0,255,3,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,63,127,0,0,0,63,0,0,
-0,0,255,255,255,255,255,255,255,127,17,0,255,3,0,0,0,0,255,255,255,255,255,
-255,63,1,255,3,0,0,0,0,0,0,255,255,255,231,255,7,255,3,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,1,0,0,0,0,0,0,0,0,0,0,0,
-0,255,255,255,255,255,255,255,255,255,3,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,255,252,255,255,255,255,255,252,26,0,0,0,255,255,255,255,255,255,231,
-127,0,0,255,255,255,255,255,255,255,255,255,32,0,0,0,0,255,255,255,255,255,
-255,255,1,255,253,255,255,255,255,127,127,1,0,255,3,0,0,252,255,255,255,252,
-255,255,254,127,0,0,0,0,0,0,0,0,0,127,251,255,255,255,255,127,180,203,0,255,3,
-191,253,255,255,255,127,123,1,255,3,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,127,0,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,3,0,0,
-0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,127,0,
-0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
-255,255,255,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
-255,255,255,255,255,255,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,
-255,255,255,255,255,255,1,255,255,255,127,255,3,0,0,0,0,0,0,0,0,0,0,0,0,255,
-255,255,63,0,0,255,255,255,255,255,255,0,0,15,0,255,3,248,255,255,224,255,255,
-0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,255,255,255,255,255,255,255,255,255,135,255,255,255,255,255,255,255,128,
-255,255,0,0,0,0,0,0,0,0,11,0,0,0,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,7,0,255,255,255,127,0,0,0,0,0,
-0,7,0,240,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,15,255,255,255,255,
-255,255,255,255,255,255,255,255,255,7,255,31,255,1,255,67,0,0,0,0,0,0,0,0,0,0,
-0,0,255,255,255,255,255,255,255,255,255,255,223,255,255,255,255,255,255,255,
-255,223,100,222,255,235,239,255,255,255,255,255,255,
-255,191,231,223,223,255,255,255,123,95,252,253,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,63,255,255,255,
-253,255,255,247,255,255,255,247,255,255,223,255,255,255,223,255,255,127,255,
-255,255,127,255,255,255,253,255,255,255,253,255,255,247,207,255,255,255,255,
-255,255,127,255,255,249,219,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,255,255,255,255,255,31,128,63,255,67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,
-15,255,3,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,31,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,
-143,8,255,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,239,255,255,255,150,254,247,10,132,234,150,170,150,247,247,94,255,251,255,
-15,238,251,255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,3,255,255,255,3,255,
-255,255,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
diff --git a/third_party/musl/asctime.c b/third_party/musl/asctime.c
deleted file mode 100644
index 7ba0e5581..000000000
--- a/third_party/musl/asctime.c
+++ /dev/null
@@ -1,10 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/time.h"
-
-char *asctime(const struct tm *tm)
-{
-	static char buf[26];
-	return asctime_r(tm, buf);
-}
diff --git a/third_party/musl/asctime_r.c b/third_party/musl/asctime_r.c
deleted file mode 100644
index 6f62b1242..000000000
--- a/third_party/musl/asctime_r.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/stdio/stdio.h"
-#include "libc/str/langinfo.h"
-#include "libc/str/locale.internal.h"
-__static_yoink("musl_libc_notice");
-
-char *asctime_r(const struct tm *tm, char *buf)
-{
-	if (snprintf(buf, 26, "%.3s %.3s%3d %.2d:%.2d:%.2d %d\n",
-		nl_langinfo_l(ABDAY_1+tm->tm_wday, C_LOCALE),
-		nl_langinfo_l(ABMON_1+tm->tm_mon, C_LOCALE),
-		tm->tm_mday, tm->tm_hour,
-		tm->tm_min, tm->tm_sec,
-		1900 + tm->tm_year) >= 26)
-	{
-		/* ISO C requires us to use the above format string,
-		 * even if it will not fit in the buffer. Thus asctime_r
-		 * is _supposed_ to crash if the fields in tm are too large.
-		 * We follow this behavior and crash "gracefully" to warn
-		 * application developers that they may not be so lucky
-		 * on other implementations (e.g. stack smashing..).
-		 */
-		__builtin_trap();
-	}
-	return buf;
-}
diff --git a/third_party/musl/bsearch.c b/third_party/musl/bsearch.c
deleted file mode 100644
index fe050ea30..000000000
--- a/third_party/musl/bsearch.c
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <stdlib.h>
-
-void *bsearch(const void *key, const void *base, size_t nel, size_t width, int (*cmp)(const void *, const void *))
-{
-	void *try;
-	int sign;
-	while (nel > 0) {
-		try = (char *)base + width*(nel/2);
-		sign = cmp(key, try);
-		if (sign < 0) {
-			nel /= 2;
-		} else if (sign > 0) {
-			base = (char *)try + width;
-			nel -= nel/2+1;
-		} else {
-			return try;
-		}
-	}
-	return NULL;
-}
diff --git a/third_party/musl/c32rtomb.c b/third_party/musl/c32rtomb.c
deleted file mode 100644
index d23efca16..000000000
--- a/third_party/musl/c32rtomb.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <uchar.h>
-#include <wchar.h>
-__static_yoink("musl_libc_notice");
-
-size_t c32rtomb(char *restrict s, char32_t c32, mbstate_t *restrict ps)
-{
-	return wcrtomb(s, c32, ps);
-}
diff --git a/third_party/musl/casemap.inc b/third_party/musl/casemap.inc
deleted file mode 100644
index 6ee1209b9..000000000
--- a/third_party/musl/casemap.inc
+++ /dev/null
@@ -1,297 +0,0 @@
-static const unsigned char tab[] = {
-	7, 8, 9, 10, 11, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	13, 6, 6, 14, 6, 6, 6, 6, 6, 6, 6, 6, 15, 16, 17, 18,
-	6, 19, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 20, 21, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 22, 23, 6, 6, 6, 24, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 25,
-	6, 6, 6, 6, 26, 6, 6, 6, 6, 6, 6, 6, 27, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 28, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 29, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 30, 6, 6, 6, 6, 6, 6,
-	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36,
-	43, 43, 43, 43, 43, 43, 43, 43, 1, 0, 84, 86, 86, 86, 86, 86,
-	86, 86, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 43, 43, 43, 43, 43, 43,
-	43, 7, 43, 43, 91, 86, 86, 86, 86, 86, 86, 86, 74, 86, 86, 5,
-	49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80,
-	36, 80, 121, 49, 80, 49, 80, 49, 56, 80, 49, 80, 49, 80, 49, 80,
-	49, 80, 49, 80, 49, 80, 49, 80, 78, 49, 2, 78, 13, 13, 78, 3,
-	78, 0, 36, 110, 0, 78, 49, 38, 110, 81, 78, 36, 80, 78, 57, 20,
-	129, 27, 29, 29, 83, 49, 80, 49, 80, 13, 49, 80, 49, 80, 49, 80,
-	27, 83, 36, 80, 49, 2, 92, 123, 92, 123, 92, 123, 92, 123, 92, 123,
-	20, 121, 92, 123, 92, 123, 92, 45, 43, 73, 3, 72, 3, 120, 92, 123,
-	20, 0, 150, 10, 1, 43, 40, 6, 6, 0, 42, 6, 42, 42, 43, 7,
-	187, 181, 43, 30, 0, 43, 7, 43, 43, 43, 1, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 1, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 205, 70, 205, 43, 0, 37, 43, 7, 1, 6, 1, 85, 86, 86, 86,
-	86, 86, 85, 86, 86, 2, 36, 129, 129, 129, 129, 129, 21, 129, 129, 129,
-	0, 0, 43, 0, 178, 209, 178, 209, 178, 209, 178, 209, 0, 0, 205, 204,
-	1, 0, 215, 215, 215, 215, 215, 131, 129, 129, 129, 129, 129, 129, 129, 129,
-	129, 129, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 28, 0, 0, 0,
-	0, 0, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 2, 0, 0,
-	49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80,
-	49, 80, 78, 49, 80, 49, 80, 78, 49, 80, 49, 80, 49, 80, 49, 80,
-	49, 80, 49, 80, 49, 80, 49, 2, 135, 166, 135, 166, 135, 166, 135, 166,
-	135, 166, 135, 166, 135, 166, 135, 166, 42, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 0, 0, 0, 84, 86, 86, 86, 86, 86, 86, 86,
-	86, 86, 86, 86, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 84, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86,
-	12, 0, 12, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 7, 42, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 86, 86, 108, 129, 21, 0, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 7, 108, 3, 65, 43, 43, 86, 86, 86, 86, 86, 86,
-	86, 86, 86, 86, 86, 86, 86, 86, 44, 86, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 1,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 12, 108, 0, 0, 0, 0, 0, 6,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37,
-	6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37,
-	6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37,
-	6, 37, 6, 37, 6, 37, 6, 37, 86, 122, 158, 38, 6, 37, 6, 37,
-	6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 37,
-	6, 37, 6, 37, 6, 37, 6, 37, 6, 37, 6, 1, 43, 43, 79, 86,
-	86, 44, 43, 127, 86, 86, 57, 43, 43, 85, 86, 86, 43, 43, 79, 86,
-	86, 44, 43, 127, 86, 86, 129, 55, 117, 91, 123, 92, 43, 43, 79, 86,
-	86, 2, 172, 4, 0, 0, 57, 43, 43, 85, 86, 86, 43, 43, 79, 86,
-	86, 44, 43, 43, 86, 86, 50, 19, 129, 87, 0, 111, 129, 126, 201, 215,
-	126, 45, 129, 129, 14, 126, 57, 127, 111, 87, 0, 129, 129, 126, 21, 0,
-	126, 3, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 7, 43,
-	36, 43, 151, 43, 43, 43, 43, 43, 43, 43, 43, 43, 42, 43, 43, 43,
-	43, 43, 86, 86, 86, 86, 86, 128, 129, 129, 129, 129, 57, 187, 42, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 1, 129, 129, 129, 129, 129, 129, 129, 129,
-	129, 129, 129, 129, 129, 129, 129, 201, 172, 172, 172, 172, 172, 172, 172, 172,
-	172, 172, 172, 172, 172, 172, 172, 208, 13, 0, 78, 49, 2, 180, 193, 193,
-	215, 215, 36, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80,
-	49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80, 49, 80,
-	49, 80, 49, 80, 215, 215, 83, 193, 71, 212, 215, 215, 215, 5, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 7, 1, 0, 1, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 78, 49, 80, 49, 80, 49, 80,
-	49, 80, 49, 80, 49, 80, 49, 80, 13, 0, 0, 0, 0, 0, 36, 80,
-	49, 80, 49, 80, 49, 80, 49, 80, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 121, 92, 123, 92, 123, 79, 123, 92, 123, 92, 123,
-	92, 123, 92, 123, 92, 123, 92, 123, 92, 123, 92, 123, 92, 123, 92, 45,
-	43, 43, 121, 20, 92, 123, 92, 45, 121, 42, 92, 39, 92, 123, 92, 123,
-	92, 123, 164, 0, 10, 180, 92, 123, 92, 123, 79, 3, 42, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 1,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 43, 43, 43, 43, 43, 43, 43, 43, 7, 0, 72, 86, 86, 86, 86,
-	86, 86, 86, 86, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 85, 86, 86, 86, 86, 86, 86,
-	86, 86, 86, 86, 86, 86, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 36, 43, 43, 43, 43, 43, 43, 43, 43, 43,
-	43, 43, 7, 0, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 43, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 7, 0, 0,
-	0, 0, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86,
-	86, 86, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 43, 43,
-	43, 43, 43, 43, 43, 43, 43, 43, 86, 86, 86, 86, 86, 86, 86, 86,
-	86, 86, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 86, 86,
-	86, 86, 86, 86, 86, 86, 86, 86, 14, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 85,
-	86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 14, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-static const int rules[] = {
-	0x0, 0x2001, -0x2000, 0x1dbf00, 0x2e700, 0x7900,
-	0x2402, 0x101, -0x100, 0x0, 0x201, -0x200,
-	-0xc6ff, -0xe800, -0x78ff, -0x12c00, 0xc300, 0xd201,
-	0xce01, 0xcd01, 0x4f01, 0xca01, 0xcb01, 0xcf01,
-	0x6100, 0xd301, 0xd101, 0xa300, 0xd501, 0x8200,
-	0xd601, 0xda01, 0xd901, 0xdb01, 0x3800, 0x3,
-	-0x4f00, -0x60ff, -0x37ff, 0x242802, 0x0, 0x101,
-	-0x100, -0xcd00, -0xda00, -0x81ff, 0x2a2b01, -0xa2ff,
-	0x2a2801, 0x2a3f00, -0xc2ff, 0x4501, 0x4701, 0x2a1f00,
-	0x2a1c00, 0x2a1e00, -0xd200, -0xce00, -0xca00, -0xcb00,
-	0xa54f00, 0xa54b00, -0xcf00, 0xa52800, 0xa54400, -0xd100,
-	-0xd300, 0x29f700, 0xa54100, 0x29fd00, -0xd500, -0xd600,
-	0x29e700, 0xa54300, 0xa52a00, -0x4500, -0xd900, -0x4700,
-	-0xdb00, 0xa51500, 0xa51200, 0x4c2402, 0x0, 0x2001,
-	-0x2000, 0x101, -0x100, 0x5400, 0x7401, 0x2601,
-	0x2501, 0x4001, 0x3f01, -0x2600, -0x2500, -0x1f00,
-	-0x4000, -0x3f00, 0x801, -0x3e00, -0x3900, -0x2f00,
-	-0x3600, -0x800, -0x5600, -0x5000, 0x700, -0x7400,
-	-0x3bff, -0x6000, -0x6ff, 0x701a02, 0x101, -0x100,
-	0x2001, -0x2000, 0x5001, 0xf01, -0xf00, 0x0,
-	0x3001, -0x3000, 0x101, -0x100, 0x0, 0xbc000,
-	0x1c6001, 0x0, 0x97d001, 0x801, -0x800, 0x8a0502,
-	0x0, -0xbbfff, -0x186200, 0x89c200, -0x182500, -0x186e00,
-	-0x186d00, -0x186400, -0x186300, -0x185c00, 0x0, 0x8a3800,
-	0x8a0400, 0xee600, 0x101, -0x100, 0x0, -0x3b00,
-	-0x1dbeff, 0x8f1d02, 0x800, -0x7ff, 0x0, 0x5600,
-	-0x55ff, 0x4a00, 0x6400, 0x8000, 0x7000, 0x7e00,
-	0x900, -0x49ff, -0x8ff, -0x1c2500, -0x63ff, -0x6fff,
-	-0x7fff, -0x7dff, 0xac0502, 0x0, 0x1001, -0x1000,
-	0x1c01, 0x101, -0x1d5cff, -0x20beff, -0x2045ff, -0x1c00,
-	0xb10b02, 0x101, -0x100, 0x3001, -0x3000, 0x0,
-	-0x29f6ff, -0xee5ff, -0x29e6ff, -0x2a2b00, -0x2a2800, -0x2a1bff,
-	-0x29fcff, -0x2a1eff, -0x2a1dff, -0x2a3eff, 0x0, -0x1c6000,
-	0x0, 0x101, -0x100, 0xbc0c02, 0x0, 0x101,
-	-0x100, -0xa543ff, 0x3a001, -0x8a03ff, -0xa527ff, 0x3000,
-	-0xa54eff, -0xa54aff, -0xa540ff, -0xa511ff, -0xa529ff, -0xa514ff,
-	-0x2fff, -0xa542ff, -0x8a37ff, 0x0, -0x97d000, -0x3a000,
-	0x0, 0x2001, -0x2000, 0x0, 0x2801, -0x2800,
-	0x0, 0x4001, -0x4000, 0x0, 0x2001, -0x2000,
-	0x0, 0x2001, -0x2000, 0x0, 0x2201, -0x2200,
-};
-static const unsigned char rulebases[] = {
-	0, 6, 39, 81, 111, 119, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	124, 0, 0, 127, 0, 0, 0, 0, 0, 0, 0, 0, 131, 142, 146, 151,
-	0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 180, 196, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 198, 201, 0, 0, 0, 219, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 222,
-	0, 0, 0, 0, 225, 0, 0, 0, 0, 0, 0, 0, 228, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 231, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 234, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 237, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-static const unsigned char exceptions[][2] = {
-	{ 48, 12 }, { 49, 13 }, { 120, 14 }, { 127, 15 },
-	{ 128, 16 }, { 129, 17 }, { 134, 18 }, { 137, 19 },
-	{ 138, 19 }, { 142, 20 }, { 143, 21 }, { 144, 22 },
-	{ 147, 19 }, { 148, 23 }, { 149, 24 }, { 150, 25 },
-	{ 151, 26 }, { 154, 27 }, { 156, 25 }, { 157, 28 },
-	{ 158, 29 }, { 159, 30 }, { 166, 31 }, { 169, 31 },
-	{ 174, 31 }, { 177, 32 }, { 178, 32 }, { 183, 33 },
-	{ 191, 34 }, { 197, 35 }, { 200, 35 }, { 203, 35 },
-	{ 221, 36 }, { 242, 35 }, { 246, 37 }, { 247, 38 },
-	{ 32, 45 }, { 58, 46 }, { 61, 47 }, { 62, 48 },
-	{ 63, 49 }, { 64, 49 }, { 67, 50 }, { 68, 51 },
-	{ 69, 52 }, { 80, 53 }, { 81, 54 }, { 82, 55 },
-	{ 83, 56 }, { 84, 57 }, { 89, 58 }, { 91, 59 },
-	{ 92, 60 }, { 97, 61 }, { 99, 62 }, { 101, 63 },
-	{ 102, 64 }, { 104, 65 }, { 105, 66 }, { 106, 64 },
-	{ 107, 67 }, { 108, 68 }, { 111, 66 }, { 113, 69 },
-	{ 114, 70 }, { 117, 71 }, { 125, 72 }, { 130, 73 },
-	{ 135, 74 }, { 137, 75 }, { 138, 76 }, { 139, 76 },
-	{ 140, 77 }, { 146, 78 }, { 157, 79 }, { 158, 80 },
-	{ 69, 87 }, { 123, 29 }, { 124, 29 }, { 125, 29 },
-	{ 127, 88 }, { 134, 89 }, { 136, 90 }, { 137, 90 },
-	{ 138, 90 }, { 140, 91 }, { 142, 92 }, { 143, 92 },
-	{ 172, 93 }, { 173, 94 }, { 174, 94 }, { 175, 94 },
-	{ 194, 95 }, { 204, 96 }, { 205, 97 }, { 206, 97 },
-	{ 207, 98 }, { 208, 99 }, { 209, 100 }, { 213, 101 },
-	{ 214, 102 }, { 215, 103 }, { 240, 104 }, { 241, 105 },
-	{ 242, 106 }, { 243, 107 }, { 244, 108 }, { 245, 109 },
-	{ 249, 110 }, { 253, 45 }, { 254, 45 }, { 255, 45 },
-	{ 80, 105 }, { 81, 105 }, { 82, 105 }, { 83, 105 },
-	{ 84, 105 }, { 85, 105 }, { 86, 105 }, { 87, 105 },
-	{ 88, 105 }, { 89, 105 }, { 90, 105 }, { 91, 105 },
-	{ 92, 105 }, { 93, 105 }, { 94, 105 }, { 95, 105 },
-	{ 130, 0 }, { 131, 0 }, { 132, 0 }, { 133, 0 },
-	{ 134, 0 }, { 135, 0 }, { 136, 0 }, { 137, 0 },
-	{ 192, 117 }, { 207, 118 }, { 128, 137 }, { 129, 138 },
-	{ 130, 139 }, { 133, 140 }, { 134, 141 }, { 112, 157 },
-	{ 113, 157 }, { 118, 158 }, { 119, 158 }, { 120, 159 },
-	{ 121, 159 }, { 122, 160 }, { 123, 160 }, { 124, 161 },
-	{ 125, 161 }, { 179, 162 }, { 186, 163 }, { 187, 163 },
-	{ 188, 164 }, { 190, 165 }, { 195, 162 }, { 204, 164 },
-	{ 218, 166 }, { 219, 166 }, { 229, 106 }, { 234, 167 },
-	{ 235, 167 }, { 236, 110 }, { 243, 162 }, { 248, 168 },
-	{ 249, 168 }, { 250, 169 }, { 251, 169 }, { 252, 164 },
-	{ 38, 176 }, { 42, 177 }, { 43, 178 }, { 78, 179 },
-	{ 132, 8 }, { 98, 186 }, { 99, 187 }, { 100, 188 },
-	{ 101, 189 }, { 102, 190 }, { 109, 191 }, { 110, 192 },
-	{ 111, 193 }, { 112, 194 }, { 126, 195 }, { 127, 195 },
-	{ 125, 207 }, { 141, 208 }, { 148, 209 }, { 171, 210 },
-	{ 172, 211 }, { 173, 212 }, { 176, 213 }, { 177, 214 },
-	{ 178, 215 }, { 196, 216 }, { 197, 217 }, { 198, 218 },
-};
diff --git a/third_party/musl/catclose.c b/third_party/musl/catclose.c
index 76140f687..f3f3b73f1 100644
--- a/third_party/musl/catclose.c
+++ b/third_party/musl/catclose.c
@@ -30,7 +30,6 @@
 #include <stdint.h>
 #include <endian.h>
 #include <sys/mman.h>
-__static_yoink("musl_libc_notice");
 
 #define V(p) be32toh(*(uint32_t *)(p))
 
diff --git a/third_party/musl/catgets.c b/third_party/musl/catgets.c
index cca0b16e8..8921ffc4c 100644
--- a/third_party/musl/catgets.c
+++ b/third_party/musl/catgets.c
@@ -31,7 +31,6 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <errno.h>
-__static_yoink("musl_libc_notice");
 
 #define V(p) be32toh(*(uint32_t *)(p))
 
diff --git a/third_party/musl/catopen.c b/third_party/musl/catopen.c
index fcc09b126..8bcaff432 100644
--- a/third_party/musl/catopen.c
+++ b/third_party/musl/catopen.c
@@ -35,7 +35,6 @@
 #include <locale.h>
 #include "third_party/musl/mapfile.internal.h"
 #include <sys/mman.h>
-__static_yoink("musl_libc_notice");
 
 #define V(p) be32toh(*(uint32_t *)(p))
 
diff --git a/third_party/musl/fnmatch.c b/third_party/musl/fnmatch.c
index e48d7b998..6ccf0a2e7 100644
--- a/third_party/musl/fnmatch.c
+++ b/third_party/musl/fnmatch.c
@@ -25,12 +25,10 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <string.h>
-#include <fnmatch.h>
-#include <stdlib.h>
-#include <wchar.h>
-#include <wctype.h>
-#include "libc/str/locale.internal.h"
+#include "libc/limits.h"
+#include "libc/str/str.h"
+#include "libc/wctype.h"
+#include "third_party/musl/fnmatch.h"
 __static_yoink("musl_libc_notice");
 
 /*
@@ -48,279 +46,284 @@ __static_yoink("musl_libc_notice");
  * - Rich Felker, April 2012
  */
 
-#define END 0
+#define END         0
 #define UNMATCHABLE -2
-#define BRACKET -3
-#define QUESTION -4
-#define STAR -5
+#define BRACKET     -3
+#define QUESTION    -4
+#define STAR        -5
 
-static int str_next(const char *str, size_t n, size_t *step)
-{
-	if (!n) {
-		*step = 0;
-		return 0;
-	}
-	if (str[0] >= 128U) {
-		wchar_t wc;
-		int k = mbtowc(&wc, str, n);
-		if (k<0) {
-			*step = 1;
-			return -1;
-		}
-		*step = k;
-		return wc;
-	}
-	*step = 1;
-	return str[0];
+static int FnmatchNextString(const char *str, size_t n, size_t *step) {
+  if (!n) {
+    *step = 0;
+    return 0;
+  }
+  if (str[0] >= 128U) {
+    wchar_t wc;
+    int k = mbtowc(&wc, str, n);
+    if (k < 0) {
+      *step = 1;
+      return -1;
+    }
+    *step = k;
+    return wc;
+  }
+  *step = 1;
+  return str[0];
 }
 
-static int pat_next(const char *pat, size_t m, size_t *step, int flags)
-{
-	int esc = 0;
-	if (!m || !*pat) {
-		*step = 0;
-		return END;
-	}
-	*step = 1;
-	if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) {
-		*step = 2;
-		pat++;
-		esc = 1;
-		goto escaped;
-	}
-	if (pat[0]=='[') {
-		size_t k = 1;
-		if (k<m) if (pat[k] == '^' || pat[k] == '!') k++;
-		if (k<m) if (pat[k] == ']') k++;
-		for (; k<m && pat[k] && pat[k]!=']'; k++) {
-			if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) {
-				int z = pat[k+1];
-				k+=2;
-				if (k<m && pat[k]) k++;
-				while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++;
-				if (k==m || !pat[k]) break;
-			}
-		}
-		if (k==m || !pat[k]) {
-			*step = 1;
-			return '[';
-		}
-		*step = k+1;
-		return BRACKET;
-	}
-	if (pat[0] == '*')
-		return STAR;
-	if (pat[0] == '?')
-		return QUESTION;
+static int FnmatchNextPattern(const char *pat, size_t m, size_t *step,
+                              int flags) {
+  int esc = 0;
+  if (!m || !*pat) {
+    *step = 0;
+    return END;
+  }
+  *step = 1;
+  if (pat[0] == '\\' && pat[1] && !(flags & FNM_NOESCAPE)) {
+    *step = 2;
+    pat++;
+    esc = 1;
+    goto escaped;
+  }
+  if (pat[0] == '[') {
+    size_t k = 1;
+    if (k < m)
+      if (pat[k] == '^' || pat[k] == '!') k++;
+    if (k < m)
+      if (pat[k] == ']') k++;
+    for (; k < m && pat[k] && pat[k] != ']'; k++) {
+      if (k + 1 < m && pat[k + 1] && pat[k] == '[' &&
+          (pat[k + 1] == ':' || pat[k + 1] == '.' || pat[k + 1] == '=')) {
+        int z = pat[k + 1];
+        k += 2;
+        if (k < m && pat[k]) k++;
+        while (k < m && pat[k] && (pat[k - 1] != z || pat[k] != ']')) k++;
+        if (k == m || !pat[k]) break;
+      }
+    }
+    if (k == m || !pat[k]) {
+      *step = 1;
+      return '[';
+    }
+    *step = k + 1;
+    return BRACKET;
+  }
+  if (pat[0] == '*') return STAR;
+  if (pat[0] == '?') return QUESTION;
 escaped:
-	if (pat[0] >= 128U) {
-		wchar_t wc;
-		int k = mbtowc(&wc, pat, m);
-		if (k<0) {
-			*step = 0;
-			return UNMATCHABLE;
-		}
-		*step = k + esc;
-		return wc;
-	}
-	return pat[0];
+  if (pat[0] >= 128U) {
+    wchar_t wc;
+    int k = mbtowc(&wc, pat, m);
+    if (k < 0) {
+      *step = 0;
+      return UNMATCHABLE;
+    }
+    *step = k + esc;
+    return wc;
+  }
+  return pat[0];
 }
 
-static int casefold(int k)
-{
-	int c = towupper(k);
-	return c == k ? towlower(k) : c;
+static int FnmatchCaseFold(int k) {
+  int c = towupper(k);
+  return c == k ? towlower(k) : c;
 }
 
-static int match_bracket(const char *p, int k, int kfold)
-{
-	wchar_t wc;
-	int inv = 0;
-	p++;
-	if (*p=='^' || *p=='!') {
-		inv = 1;
-		p++;
-	}
-	if (*p==']') {
-		if (k==']') return !inv;
-		p++;
-	} else if (*p=='-') {
-		if (k=='-') return !inv;
-		p++;
-	}
-	wc = p[-1];
-	for (; *p != ']'; p++) {
-		if (p[0]=='-' && p[1]!=']') {
-			wchar_t wc2;
-			int l = mbtowc(&wc2, p+1, 4);
-			if (l < 0) return 0;
-			if (wc <= wc2)
-				if ((unsigned)k-wc <= wc2-wc ||
-				    (unsigned)kfold-wc <= wc2-wc)
-					return !inv;
-			p += l-1;
-			continue;
-		}
-		if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) {
-			const char *p0 = p+2;
-			int z = p[1];
-			p+=3;
-			while (p[-1]!=z || p[0]!=']') p++;
-			if (z == ':' && p-1-p0 < 16) {
-				char buf[16];
-				memcpy(buf, p0, p-1-p0);
-				buf[p-1-p0] = 0;
-				if (iswctype(k, wctype(buf)) ||
-				    iswctype(kfold, wctype(buf)))
-					return !inv;
-			}
-			continue;
-		}
-		if (*p < 128U) {
-			wc = (unsigned char)*p;
-		} else {
-			int l = mbtowc(&wc, p, 4);
-			if (l < 0) return 0;
-			p += l-1;
-		}
-		if (wc==k || wc==kfold) return !inv;
-	}
-	return inv;
+static int FnmatchBracket(const char *p, int k, int kfold) {
+  wchar_t wc;
+  int inv = 0;
+  p++;
+  if (*p == '^' || *p == '!') {
+    inv = 1;
+    p++;
+  }
+  if (*p == ']') {
+    if (k == ']') return !inv;
+    p++;
+  } else if (*p == '-') {
+    if (k == '-') return !inv;
+    p++;
+  }
+  wc = p[-1];
+  for (; *p != ']'; p++) {
+    if (p[0] == '-' && p[1] != ']') {
+      wchar_t wc2;
+      int l = mbtowc(&wc2, p + 1, 4);
+      if (l < 0) return 0;
+      if (wc <= wc2)
+        if ((unsigned)k - wc <= wc2 - wc || (unsigned)kfold - wc <= wc2 - wc)
+          return !inv;
+      p += l - 1;
+      continue;
+    }
+    if (p[0] == '[' && (p[1] == ':' || p[1] == '.' || p[1] == '=')) {
+      const char *p0 = p + 2;
+      int z = p[1];
+      p += 3;
+      while (p[-1] != z || p[0] != ']') p++;
+      if (z == ':' && p - 1 - p0 < 16) {
+        char buf[16];
+        memcpy(buf, p0, p - 1 - p0);
+        buf[p - 1 - p0] = 0;
+        if (iswctype(k, wctype(buf)) || iswctype(kfold, wctype(buf)))
+          return !inv;
+      }
+      continue;
+    }
+    if (*p < 128U) {
+      wc = (unsigned char)*p;
+    } else {
+      int l = mbtowc(&wc, p, 4);
+      if (l < 0) return 0;
+      p += l - 1;
+    }
+    if (wc == k || wc == kfold) return !inv;
+  }
+  return inv;
 }
 
-static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags)
-{
-	const char *p, *ptail, *endpat;
-	const char *s, *stail, *endstr;
-	size_t pinc, sinc, tailcnt=0;
-	int c, k, kfold;
+static int FnmatchPerform(const char *pat, size_t m, const char *str, size_t n,
+                          int flags) {
+  const char *p, *ptail, *endpat;
+  const char *s, *stail, *endstr;
+  size_t pinc, sinc, tailcnt = 0;
+  int c, k, kfold;
 
-	if (flags & FNM_PERIOD) {
-		if (*str == '.' && *pat != '.')
-			return FNM_NOMATCH;
-	}
-	for (;;) {
-		switch ((c = pat_next(pat, m, &pinc, flags))) {
-		case UNMATCHABLE:
-			return FNM_NOMATCH;
-		case STAR:
-			pat++;
-			m--;
-			break;
-		default:
-			k = str_next(str, n, &sinc);
-			if (k <= 0)
-				return (c==END) ? 0 : FNM_NOMATCH;
-			str += sinc;
-			n -= sinc;
-			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
-			if (c == BRACKET) {
-				if (!match_bracket(pat, k, kfold))
-					return FNM_NOMATCH;
-			} else if (c != QUESTION && k != c && kfold != c) {
-				return FNM_NOMATCH;
-			}
-			pat+=pinc;
-			m-=pinc;
-			continue;
-		}
-		break;
-	}
+  if (flags & FNM_PERIOD) {
+    if (*str == '.' && *pat != '.') {
+      return FNM_NOMATCH;
+    }
+  }
 
-	/* Compute real pat length if it was initially unknown/-1 */
-	m = strnlen(pat, m);
-	endpat = pat + m;
+  for (;;) {
+    switch ((c = FnmatchNextPattern(pat, m, &pinc, flags))) {
+      case UNMATCHABLE:
+        return FNM_NOMATCH;
+      case STAR:
+        pat++;
+        m--;
+        break;
+      default:
+        k = FnmatchNextString(str, n, &sinc);
+        if (k <= 0) return (c == END) ? 0 : FNM_NOMATCH;
+        str += sinc;
+        n -= sinc;
+        kfold = flags & FNM_CASEFOLD ? FnmatchCaseFold(k) : k;
+        if (c == BRACKET) {
+          if (!FnmatchBracket(pat, k, kfold)) return FNM_NOMATCH;
+        } else if (c != QUESTION && k != c && kfold != c) {
+          return FNM_NOMATCH;
+        }
+        pat += pinc;
+        m -= pinc;
+        continue;
+    }
+    break;
+  }
 
-	/* Find the last * in pat and count chars needed after it */
-	for (p=ptail=pat; p<endpat; p+=pinc) {
-		switch (pat_next(p, endpat-p, &pinc, flags)) {
-		case UNMATCHABLE:
-			return FNM_NOMATCH;
-		case STAR:
-			tailcnt=0;
-			ptail = p+1;
-			break;
-		default:
-			tailcnt++;
-			break;
-		}
-	}
+  /* Compute real pat length if it was initially unknown/-1 */
+  m = strnlen(pat, m);
+  endpat = pat + m;
 
-	/* Past this point we need not check for UNMATCHABLE in pat,
-	 * because all of pat has already been parsed once. */
+  /* Find the last * in pat and count chars needed after it */
+  for (p = ptail = pat; p < endpat; p += pinc) {
+    switch (FnmatchNextPattern(p, endpat - p, &pinc, flags)) {
+      case UNMATCHABLE:
+        return FNM_NOMATCH;
+      case STAR:
+        tailcnt = 0;
+        ptail = p + 1;
+        break;
+      default:
+        tailcnt++;
+        break;
+    }
+  }
 
-	/* Compute real str length if it was initially unknown/-1 */
-	n = strnlen(str, n);
-	endstr = str + n;
-	if (n < tailcnt) return FNM_NOMATCH;
+  /* Past this point we need not check for UNMATCHABLE in pat,
+   * because all of pat has already been parsed once. */
 
-	/* Find the final tailcnt chars of str, accounting for UTF-8.
-	 * On illegal sequences we may get it wrong, but in that case
-	 * we necessarily have a matching failure anyway. */
-	for (s=endstr; s>str && tailcnt; tailcnt--) {
-		if (s[-1] < 128U || MB_CUR_MAX==1) s--;
-		else while ((unsigned char)*--s-0x80U<0x40 && s>str);
-	}
-	if (tailcnt) return FNM_NOMATCH;
-	stail = s;
+  /* Compute real str length if it was initially unknown/-1 */
+  n = strnlen(str, n);
+  endstr = str + n;
+  if (n < tailcnt) {
+    return FNM_NOMATCH;
+  }
 
-	/* Check that the pat and str tails match */
-	p = ptail;
-	for (;;) {
-		c = pat_next(p, endpat-p, &pinc, flags);
-		p += pinc;
-		if ((k = str_next(s, endstr-s, &sinc)) <= 0) {
-			if (c != END) return FNM_NOMATCH;
-			break;
-		}
-		s += sinc;
-		kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
-		if (c == BRACKET) {
-			if (!match_bracket(p-pinc, k, kfold))
-				return FNM_NOMATCH;
-		} else if (c != QUESTION && k != c && kfold != c) {
-			return FNM_NOMATCH;
-		}
-	}
+  /* Find the final tailcnt chars of str, accounting for UTF-8.
+   * On illegal sequences we may get it wrong, but in that case
+   * we necessarily have a matching failure anyway. */
+  for (s = endstr; s > str && tailcnt; tailcnt--) {
+    if (s[-1] < 128U || MB_CUR_MAX == 1) {
+      s--;
+    } else {
+      while ((unsigned char)*--s - 0x80U < 0x40 && s > str)
+        ;
+    }
+  }
+  if (tailcnt) return FNM_NOMATCH;
+  stail = s;
 
-	/* We're all done with the tails now, so throw them out */
-	endstr = stail;
-	endpat = ptail;
+  /* Check that the pat and str tails match */
+  p = ptail;
+  for (;;) {
+    c = FnmatchNextPattern(p, endpat - p, &pinc, flags);
+    p += pinc;
+    if ((k = FnmatchNextString(s, endstr - s, &sinc)) <= 0) {
+      if (c != END) return FNM_NOMATCH;
+      break;
+    }
+    s += sinc;
+    kfold = flags & FNM_CASEFOLD ? FnmatchCaseFold(k) : k;
+    if (c == BRACKET) {
+      if (!FnmatchBracket(p - pinc, k, kfold)) return FNM_NOMATCH;
+    } else if (c != QUESTION && k != c && kfold != c) {
+      return FNM_NOMATCH;
+    }
+  }
 
-	/* Match pattern components until there are none left */
-	while (pat<endpat) {
-		p = pat;
-		s = str;
-		for (;;) {
-			c = pat_next(p, endpat-p, &pinc, flags);
-			p += pinc;
-			/* Encountering * completes/commits a component */
-			if (c == STAR) {
-				pat = p;
-				str = s;
-				break;
-			}
-			k = str_next(s, endstr-s, &sinc);
-			if (!k)
-				return FNM_NOMATCH;
-			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
-			if (c == BRACKET) {
-				if (!match_bracket(p-pinc, k, kfold))
-					break;
-			} else if (c != QUESTION && k != c && kfold != c) {
-				break;
-			}
-			s += sinc;
-		}
-		if (c == STAR) continue;
-		/* If we failed, advance str, by 1 char if it's a valid
-		 * char, or past all invalid bytes otherwise. */
-		k = str_next(str, endstr-str, &sinc);
-		if (k > 0) str += sinc;
-		else for (str++; str_next(str, endstr-str, &sinc)<0; str++);
-	}
+  /* We're all done with the tails now, so throw them out */
+  endstr = stail;
+  endpat = ptail;
 
-	return 0;
+  /* Match pattern components until there are none left */
+  while (pat < endpat) {
+    p = pat;
+    s = str;
+    for (;;) {
+      c = FnmatchNextPattern(p, endpat - p, &pinc, flags);
+      p += pinc;
+      /* Encountering * completes/commits a component */
+      if (c == STAR) {
+        pat = p;
+        str = s;
+        break;
+      }
+      k = FnmatchNextString(s, endstr - s, &sinc);
+      if (!k) return FNM_NOMATCH;
+      kfold = flags & FNM_CASEFOLD ? FnmatchCaseFold(k) : k;
+      if (c == BRACKET) {
+        if (!FnmatchBracket(p - pinc, k, kfold)) break;
+      } else if (c != QUESTION && k != c && kfold != c) {
+        break;
+      }
+      s += sinc;
+    }
+    if (c == STAR) continue;
+    /* If we failed, advance str, by 1 char if it's a valid
+     * char, or past all invalid bytes otherwise. */
+    k = FnmatchNextString(str, endstr - str, &sinc);
+    if (k > 0) {
+      str += sinc;
+    } else {
+      str++;
+      while (FnmatchNextString(str, endstr - str, &sinc) < 0) {
+        str++;
+      }
+    }
+  }
+
+  return 0;
 }
 
 /**
@@ -334,27 +337,29 @@ static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n
  *
  * @see glob()
  */
-int fnmatch(const char *pat, const char *str, int flags)
-{
-	const char *s, *p;
-	size_t inc;
-	int c;
-	if (flags & FNM_PATHNAME) for (;;) {
-		for (s=str; *s && *s!='/'; s++);
-		for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc);
-		if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR)))
-			return FNM_NOMATCH;
-		if (fnmatch_internal(pat, p-pat, str, s-str, flags))
-			return FNM_NOMATCH;
-		if (!c) return 0;
-		str = s+1;
-		pat = p+inc;
-	} else if (flags & FNM_LEADING_DIR) {
-		for (s=str; *s; s++) {
-			if (*s != '/') continue;
-			if (!fnmatch_internal(pat, -1, str, s-str, flags))
-				return 0;
-		}
-	}
-	return fnmatch_internal(pat, -1, str, -1, flags);
+int fnmatch(const char *pat, const char *str, int flags) {
+  const char *s, *p;
+  size_t inc;
+  int c;
+  if (flags & FNM_PATHNAME) {
+    for (;;) {
+      for (s = str; *s && *s != '/'; s++)
+        ;
+      for (p = pat;
+           (c = FnmatchNextPattern(p, -1, &inc, flags)) != END && c != '/';
+           p += inc)
+        ;
+      if (c != *s && (!*s || !(flags & FNM_LEADING_DIR))) return FNM_NOMATCH;
+      if (FnmatchPerform(pat, p - pat, str, s - str, flags)) return FNM_NOMATCH;
+      if (!c) return 0;
+      str = s + 1;
+      pat = p + inc;
+    }
+  } else if (flags & FNM_LEADING_DIR) {
+    for (s = str; *s; s++) {
+      if (*s != '/') continue;
+      if (!FnmatchPerform(pat, -1, str, s - str, flags)) return 0;
+    }
+  }
+  return FnmatchPerform(pat, -1, str, -1, flags);
 }
diff --git a/third_party/musl/glob.c b/third_party/musl/glob.c
index e8a5314b7..655532d9a 100644
--- a/third_party/musl/glob.c
+++ b/third_party/musl/glob.c
@@ -35,227 +35,190 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/dt.h"
 #include "libc/sysv/consts/s.h"
-#include "libc/limits.h"
-#include "libc/str/str.h"
-#include "libc/runtime/runtime.h"
-#include "third_party/musl/passwd.h"
 #include "third_party/musl/fnmatch.h"
 __static_yoink("musl_libc_notice");
 
-#pragma GCC diagnostic ignored "-Wparentheses"
+#define MAXPATH 1024
 
-struct match
-{
-	struct match *next;
-	char name[];
+struct GlobList {
+  struct GlobList *next;
+  char name[];
 };
 
-static int append(struct match **tail, const char *name, size_t len, int mark)
-{
-	struct match *new = malloc(sizeof(struct match) + len + 2);
-	if (!new) return -1;
-	(*tail)->next = new;
-	new->next = NULL;
-	memcpy(new->name, name, len+1);
-	if (mark && len && name[len-1]!='/') {
-		new->name[len] = '/';
-		new->name[len+1] = 0;
-	}
-	*tail = new;
-	return 0;
+static int AppendGlob(struct GlobList **tail, const char *name, size_t len,
+                      int mark) {
+  struct GlobList *new;
+  if ((new = malloc(sizeof(struct GlobList) + len + 2))) {
+    (*tail)->next = new;
+    new->next = NULL;
+    memcpy(new->name, name, len + 1);
+    if (mark && len && name[len - 1] != '/') {
+      new->name[len] = '/';
+      new->name[len + 1] = 0;
+    }
+    *tail = new;
+    return 0;
+  } else {
+    return -1;
+  }
 }
 
-static int do_glob(char *buf, size_t pos, int type, char *pat, int flags, int (*errfunc)(const char *path, int err), struct match **tail)
-{
-	/* If GLOB_MARK is unused, we don't care about type. */
-	if (!type && !(flags & GLOB_MARK)) type = DT_REG;
-
-	/* Special-case the remaining pattern being all slashes, in
-	 * which case we can use caller-passed type if it's a dir. */
-	if (*pat && type!=DT_DIR) type = 0;
-	while (pos+1 < PATH_MAX && *pat=='/') buf[pos++] = *pat++;
-
-	/* Consume maximal [escaped-]literal prefix of pattern, copying
-	 * and un-escaping it to the running buffer as we go. */
-	ptrdiff_t i=0, j=0;
-	int in_bracket = 0, overflow = 0;
-	for (; pat[i]!='*' && pat[i]!='?' && (!in_bracket || pat[i]!=']'); i++) {
-		if (!pat[i]) {
-			if (overflow) return 0;
-			pat += i;
-			pos += j;
-			i = j = 0;
-			break;
-		} else if (pat[i] == '[') {
-			in_bracket = 1;
-		} else if (pat[i] == '\\' && !(flags & GLOB_NOESCAPE)) {
-			/* Backslashes inside a bracket are (at least by
-			 * our interpretation) non-special, so if next
-			 * char is ']' we have a complete expression. */
-			if (in_bracket && pat[i+1]==']') break;
-			/* Unpaired final backslash never matches. */
-			if (!pat[i+1]) return 0;
-			i++;
-		}
-		if (pat[i] == '/') {
-			if (overflow) return 0;
-			in_bracket = 0;
-			pat += i+1;
-			i = -1;
-			pos += j+1;
-			j = -1;
-		}
-		/* Only store a character if it fits in the buffer, but if
-		 * a potential bracket expression is open, the overflow
-		 * must be remembered and handled later only if the bracket
-		 * is unterminated (and thereby a literal), so as not to
-		 * disallow long bracket expressions with short matches. */
-		if (pos+(j+1) < PATH_MAX) {
-			buf[pos+j++] = pat[i];
-		} else if (in_bracket) {
-			overflow = 1;
-		} else {
-			return 0;
-		}
-		/* If we consume any new components, the caller-passed type
-		 * or dummy type from above is no longer valid. */
-		type = 0;
-	}
-	buf[pos] = 0;
-	if (!*pat) {
-		/* If we consumed any components above, or if GLOB_MARK is
-		 * requested and we don't yet know if the match is a dir,
-		 * we must confirm the file exists and/or determine its type.
-		 *
-		 * If marking dirs, symlink type is inconclusive; we need the
-		 * type for the symlink target, and therefore must try stat
-		 * first unless type is known not to be a symlink. Otherwise,
-		 * or if that fails, use lstat for determining existence to
-		 * avoid false negatives in the case of broken symlinks. */
-		struct stat st;
-		if ((flags & GLOB_MARK) && (!type||type==DT_LNK) && !stat(buf, &st)) {
-			if (S_ISDIR(st.st_mode)) type = DT_DIR;
-			else type = DT_REG;
-		}
-		if (!type && lstat(buf, &st)) {
-			if (errno!=ENOENT && (errfunc(buf, errno) || (flags & GLOB_ERR)))
-				return GLOB_ABORTED;
-			return 0;
-		}
-		if (append(tail, buf, pos, (flags & GLOB_MARK) && type==DT_DIR))
-			return GLOB_NOSPACE;
-		return 0;
-	}
-	char *p2 = strchr(pat, '/'), saved_sep = '/';
-	/* Check if the '/' was escaped and, if so, remove the escape char
-	 * so that it will not be unpaired when passed to fnmatch. */
-	if (p2 && !(flags & GLOB_NOESCAPE)) {
-		char *p;
-		for (p=p2; p>pat && p[-1]=='\\'; p--);
-		if ((p2-p)%2) {
-			p2--;
-			saved_sep = '\\';
-		}
-	}
-	DIR *dir = opendir(pos ? buf : ".");
-	if (!dir) {
-		if (errfunc(buf, errno) || (flags & GLOB_ERR))
-			return GLOB_ABORTED;
-		return 0;
-	}
-	int old_errno = errno;
-	struct dirent *de;
-	while (errno=0, de=readdir(dir)) {
-		/* Quickly skip non-directories when there's pattern left. */
-		if (p2 && de->d_type && de->d_type!=DT_DIR && de->d_type!=DT_LNK)
-			continue;
-
-		size_t l = strlen(de->d_name);
-		if (l >= PATH_MAX-pos) continue;
-
-		if (p2) *p2 = 0;
-
-		int fnm_flags= ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0)
-			| ((!(flags & GLOB_PERIOD)) ? FNM_PERIOD : 0);
-
-		if (fnmatch(pat, de->d_name, fnm_flags))
-			continue;
-
-		/* With GLOB_PERIOD, don't allow matching . or .. unless
-		 * fnmatch would match them with FNM_PERIOD rules in effect. */
-		if (p2 && (flags & GLOB_PERIOD) && de->d_name[0]=='.'
-		    && (!de->d_name[1] || de->d_name[1]=='.' && !de->d_name[2])
-		    && fnmatch(pat, de->d_name, fnm_flags | FNM_PERIOD))
-			continue;
-
-		memcpy(buf+pos, de->d_name, l+1);
-		if (p2) *p2 = saved_sep;
-		int r = do_glob(buf, pos+l, de->d_type, p2 ? p2 : "", flags, errfunc, tail);
-		if (r) {
-			closedir(dir);
-			return r;
-		}
-	}
-	int readerr = errno;
-	if (p2) *p2 = saved_sep;
-	closedir(dir);
-	if (readerr && (errfunc(buf, errno) || (flags & GLOB_ERR)))
-		return GLOB_ABORTED;
-	errno = old_errno;
-	return 0;
+static int PerformGlob(char *buf, size_t pos, int type, char *pat, int flags,
+                       int (*errfunc)(const char *path, int err),
+                       struct GlobList **tail) {
+  DIR *dir;
+  size_t l;
+  char *p, *p2;
+  char saved_sep;
+  ptrdiff_t i, j;
+  struct stat st;
+  struct dirent *de;
+  int r, readerr, in_bracket, overflow, old_errno, fnm_flags;
+  /* If GLOB_MARK is unused, we don't care about type. */
+  if (!type && !(flags & GLOB_MARK)) type = DT_REG;
+  /* Special-case the remaining pattern being all slashes, in
+   * which case we can use caller-passed type if it's a dir. */
+  if (*pat && type != DT_DIR) type = 0;
+  while (pos + 1 < MAXPATH && *pat == '/') {
+    buf[pos++] = *pat++;
+  }
+  /* Consume maximal [escaped-]literal prefix of pattern, copying
+   * and un-escaping it to the running buffer as we go. */
+  i = 0;
+  j = 0;
+  overflow = 0;
+  in_bracket = 0;
+  for (; pat[i] != '*' && pat[i] != '?' && (!in_bracket || pat[i] != ']');
+       i++) {
+    if (!pat[i]) {
+      if (overflow) return 0;
+      pat += i;
+      pos += j;
+      i = j = 0;
+      break;
+    } else if (pat[i] == '[') {
+      in_bracket = 1;
+    } else if (pat[i] == '\\' && !(flags & GLOB_NOESCAPE)) {
+      /* Backslashes inside a bracket are (at least by
+       * our interpretation) non-special, so if next
+       * char is ']' we have a complete expression. */
+      if (in_bracket && pat[i + 1] == ']') break;
+      /* Unpaired final backslash never matches. */
+      if (!pat[i + 1]) return 0;
+      i++;
+    }
+    if (pat[i] == '/') {
+      if (overflow) return 0;
+      in_bracket = 0;
+      pat += i + 1;
+      i = -1;
+      pos += j + 1;
+      j = -1;
+    }
+    /* Only store a character if it fits in the buffer, but if
+     * a potential bracket expression is open, the overflow
+     * must be remembered and handled later only if the bracket
+     * is unterminated (and thereby a literal), so as not to
+     * disallow long bracket expressions with short matches. */
+    if (pos + (j + 1) < MAXPATH) {
+      buf[pos + j++] = pat[i];
+    } else if (in_bracket) {
+      overflow = 1;
+    } else {
+      return 0;
+    }
+    /* If we consume any new components, the caller-passed type
+     * or dummy type from above is no longer valid. */
+    type = 0;
+  }
+  buf[pos] = 0;
+  if (!*pat) {
+    /* If we consumed any components above, or if GLOB_MARK is
+     * requested and we don't yet know if the match is a dir,
+     * we must call stat to confirm the file exists and/or
+     * determine its type. */
+    if ((flags & GLOB_MARK) && type == DT_LNK) type = 0;
+    if (!type && stat(buf, &st)) {
+      if (errno != ENOENT && (errfunc(buf, errno) || (flags & GLOB_ERR))) {
+        return GLOB_ABORTED;
+      }
+      return 0;
+    }
+    if (!type && S_ISDIR(st.st_mode)) type = DT_DIR;
+    if (AppendGlob(tail, buf, pos, (flags & GLOB_MARK) && type == DT_DIR)) {
+      return GLOB_NOSPACE;
+    }
+    return 0;
+  }
+  p2 = strchr(pat, '/');
+  saved_sep = '/';
+  /* Check if the '/' was escaped and, if so, remove the escape char
+   * so that it will not be unpaired when passed to fnmatch. */
+  if (p2 && !(flags & GLOB_NOESCAPE)) {
+    for (p = p2; p > pat && p[-1] == '\\'; p--)
+      ;
+    if ((p2 - p) % 2) {
+      p2--;
+      saved_sep = '\\';
+    }
+  }
+  dir = opendir(pos ? buf : ".");
+  if (!dir) {
+    if (errfunc(buf, errno) || (flags & GLOB_ERR)) return GLOB_ABORTED;
+    return 0;
+  }
+  old_errno = errno;
+  while (errno = 0, de = readdir(dir)) {
+    /* Quickly skip non-directories when there's pattern left. */
+    if (p2 && de->d_type && de->d_type != DT_DIR && de->d_type != DT_LNK) {
+      continue;
+    }
+    l = strlen(de->d_name);
+    if (l >= MAXPATH - pos) continue;
+    if (p2) *p2 = 0;
+    fnm_flags = ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0) |
+                ((!(flags & GLOB_PERIOD)) ? FNM_PERIOD : 0);
+    if (fnmatch(pat, de->d_name, fnm_flags)) continue;
+    /* With GLOB_PERIOD don't allow matching . or .. unless fnmatch()
+     * would match them with FNM_PERIOD rules in effect. */
+    if (p2 && (flags & GLOB_PERIOD) && de->d_name[0] == '.' &&
+        (!de->d_name[1] || (de->d_name[1] == '.' && !de->d_name[2])) &&
+        fnmatch(pat, de->d_name, fnm_flags | FNM_PERIOD)) {
+      continue;
+    }
+    memcpy(buf + pos, de->d_name, l + 1);
+    if (p2) *p2 = saved_sep;
+    r = PerformGlob(buf, pos + l, de->d_type, p2 ? p2 : "", flags, errfunc,
+                    tail);
+    if (r) {
+      closedir(dir);
+      return r;
+    }
+  }
+  readerr = errno;
+  if (p2) *p2 = saved_sep;
+  closedir(dir);
+  if (readerr && (errfunc(buf, errno) || (flags & GLOB_ERR))) {
+    return GLOB_ABORTED;
+  }
+  errno = old_errno;
+  return 0;
 }
 
-static int ignore_err(const char *path, int err)
-{
-	return 0;
+static int IgnoreGlobError(const char *path, int err) {
+  return 0;
 }
 
-static void freelist(struct match *head)
-{
-	struct match *match, *next;
-	for (match=head->next; match; match=next) {
-		next = match->next;
-		free(match);
-	}
+static void FreeGlobList(struct GlobList *head) {
+  struct GlobList *match, *next;
+  for (match = head->next; match; match = next) {
+    next = match->next;
+    free(match);
+  }
 }
 
-static int sort(const void *a, const void *b)
-{
-	return strcmp(*(const char **)a, *(const char **)b);
-}
-
-static int expand_tilde(char **pat, char *buf, size_t *pos)
-{
-	char *p = *pat + 1;
-	size_t i = 0;
-
-	char delim, *name_end = strchrnul(p, '/');
-	if ((delim = *name_end)) *name_end++ = 0;
-	*pat = name_end;
-
-	char *home = *p ? NULL : getenv("HOME");
-	if (!home) {
-		struct passwd pw, *res;
-		int e = *p ? getpwnam_r(p, &pw, buf, PATH_MAX, &res)
-			   : getpwuid_r(getuid(), &pw, buf, PATH_MAX, &res);
-		if (e == ENOMEM) {
-			return GLOB_NOSPACE;
-		} else if (e == 0) {
-			if (!res)
-				return GLOB_NOMATCH;
-		} else {
-			return GLOB_NOMATCH;
-		}
-		home = pw.pw_dir;
-	}
-	while (i < PATH_MAX - 2 && *home)
-		buf[i++] = *home++;
-	if (*home)
-		return GLOB_NOMATCH;
-	if ((buf[i] = delim))
-		buf[++i] = 0;
-	*pos = i;
-	return 0;
+static int GlobPredicate(const void *a, const void *b) {
+  return strcmp(*(const char **)a, *(const char **)b);
 }
 
 /**
@@ -276,88 +239,81 @@ static int expand_tilde(char **pat, char *buf, size_t *pos)
  * @return 0 on success or GLOB_NOMATCH, GLOB_NOSPACE on OOM, or
  *     GLOB_ABORTED on read error
  */
-int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, int err), glob_t *restrict g)
-{
-	struct match head = { .next = NULL }, *tail = &head;
-	size_t cnt, i;
-	size_t offs = (flags & GLOB_DOOFFS) ? g->gl_offs : 0;
-	int error = 0;
-	char buf[PATH_MAX];
-	
-	if (!errfunc) errfunc = ignore_err;
-
-	if (!(flags & GLOB_APPEND)) {
-		g->gl_offs = offs;
-		g->gl_pathc = 0;
-		g->gl_pathv = NULL;
-	}
-
-	if (*pat) {
-		char *p = strdup(pat);
-		if (!p) return GLOB_NOSPACE;
-		buf[0] = 0;
-		size_t pos = 0;
-		char *s = p;
-		if ((flags & (GLOB_TILDE | GLOB_TILDE_CHECK)) && *p == '~')
-			error = expand_tilde(&s, buf, &pos);
-		if (!error)
-			error = do_glob(buf, pos, 0, s, flags, errfunc, &tail);
-		free(p);
-	}
-
-	if (error == GLOB_NOSPACE) {
-		freelist(&head);
-		return error;
-	}
-	
-	for (cnt=0, tail=head.next; tail; tail=tail->next, cnt++);
-	if (!cnt) {
-		if (flags & GLOB_NOCHECK) {
-			tail = &head;
-			if (append(&tail, pat, strlen(pat), 0))
-				return GLOB_NOSPACE;
-			cnt++;
-		} else if (!error)
-			return GLOB_NOMATCH;
-	}
-
-	if (flags & GLOB_APPEND) {
-		char **pathv = realloc(g->gl_pathv, (offs + g->gl_pathc + cnt + 1) * sizeof(char *));
-		if (!pathv) {
-			freelist(&head);
-			return GLOB_NOSPACE;
-		}
-		g->gl_pathv = pathv;
-		offs += g->gl_pathc;
-	} else {
-		g->gl_pathv = malloc((offs + cnt + 1) * sizeof(char *));
-		if (!g->gl_pathv) {
-			freelist(&head);
-			return GLOB_NOSPACE;
-		}
-		for (i=0; i<offs; i++)
-			g->gl_pathv[i] = NULL;
-	}
-	for (i=0, tail=head.next; i<cnt; tail=tail->next, i++)
-		g->gl_pathv[offs + i] = tail->name;
-	g->gl_pathv[offs + i] = NULL;
-	g->gl_pathc += cnt;
-
-	if (!(flags & GLOB_NOSORT))
-		qsort(g->gl_pathv+offs, cnt, sizeof(char *), sort);
-	
-	return error;
+int glob(const char *pat, int flags, int errfunc(const char *path, int err),
+         glob_t *g) {
+  int error = 0;
+  size_t cnt, i;
+  char **pathv, buf[MAXPATH];
+  struct GlobList head = {.next = NULL}, *tail = &head;
+  size_t offs = (flags & GLOB_DOOFFS) ? g->gl_offs : 0;
+  if (!errfunc) errfunc = IgnoreGlobError;
+  if (!(flags & GLOB_APPEND)) {
+    g->gl_offs = offs;
+    g->gl_pathc = 0;
+    g->gl_pathv = NULL;
+  }
+  if (*pat) {
+    char *p = strdup(pat);
+    if (!p) return GLOB_NOSPACE;
+    buf[0] = 0;
+    error = PerformGlob(buf, 0, 0, p, flags, errfunc, &tail);
+    free(p);
+  }
+  if (error == GLOB_NOSPACE) {
+    FreeGlobList(&head);
+    return error;
+  }
+  for (cnt = 0, tail = head.next; tail; tail = tail->next, cnt++)
+    ;
+  if (!cnt) {
+    if (flags & GLOB_NOCHECK) {
+      tail = &head;
+      if (AppendGlob(&tail, pat, strlen(pat), 0)) {
+        return GLOB_NOSPACE;
+      }
+      cnt++;
+    } else
+      return GLOB_NOMATCH;
+  }
+  if (flags & GLOB_APPEND) {
+    pathv =
+        realloc(g->gl_pathv, (offs + g->gl_pathc + cnt + 1) * sizeof(char *));
+    if (!pathv) {
+      FreeGlobList(&head);
+      return GLOB_NOSPACE;
+    }
+    g->gl_pathv = pathv;
+    offs += g->gl_pathc;
+  } else {
+    g->gl_pathv = malloc((offs + cnt + 1) * sizeof(char *));
+    if (!g->gl_pathv) {
+      FreeGlobList(&head);
+      return GLOB_NOSPACE;
+    }
+    for (i = 0; i < offs; i++) {
+      g->gl_pathv[i] = NULL;
+    }
+  }
+  for (i = 0, tail = head.next; i < cnt; tail = tail->next, i++) {
+    g->gl_pathv[offs + i] = tail->name;
+  }
+  g->gl_pathv[offs + i] = NULL;
+  g->gl_pathc += cnt;
+  if (!(flags & GLOB_NOSORT)) {
+    qsort(g->gl_pathv + offs, cnt, sizeof(char *), GlobPredicate);
+  }
+  return error;
 }
 
 /**
  * Frees entries allocated by glob().
  */
-void globfree(glob_t *g)
-{
-	size_t i;
-	for (i=0; i<g->gl_pathc; i++)
-		free(g->gl_pathv[g->gl_offs + i] - offsetof(struct match, name));
-	free(g->gl_pathv);
-	g->gl_pathc = 0;
-	g->gl_pathv = NULL;
+void globfree(glob_t *g) {
+  size_t i;
+  for (i = 0; i < g->gl_pathc; i++) {
+    free(g->gl_pathv[g->gl_offs + i] - offsetof(struct GlobList, name));
+  }
+  free(g->gl_pathv);
+  g->gl_pathc = 0;
+  g->gl_pathv = NULL;
 }
diff --git a/third_party/musl/glob.h b/third_party/musl/glob.h
index 3aae69da0..d18d25cdd 100644
--- a/third_party/musl/glob.h
+++ b/third_party/musl/glob.h
@@ -30,11 +30,5 @@ typedef struct {
 int glob(const char *, int, int (*)(const char *, int), glob_t *);
 void globfree(glob_t *);
 
-#ifdef _LARGEFILE64_SOURCE
-#define glob64 glob
-#define globfree64 globfree
-#define glob64_t glob_t
-#endif
-
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_THIRD_PARTY_MUSL_GLOB_H_ */
diff --git a/third_party/musl/iswalpha.c b/third_party/musl/iswalpha.c
deleted file mode 100644
index 33157ea29..000000000
--- a/third_party/musl/iswalpha.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wctype.h>
-#include <locale.h>
-__static_yoink("musl_libc_notice");
-
-static const unsigned char table[] = {
-#include "alpha.inc"
-};
-
-int iswalpha(wint_t wc)
-{
-	if (wc<0x20000U)
-		return (table[table[wc>>8]*32+((wc&255)>>3)]>>(wc&7))&1;
-	if (wc<0x2fffeU)
-		return 1;
-	return 0;
-}
-
-int __iswalpha_l(wint_t c, locale_t l)
-{
-	return iswalpha(c);
-}
-
-__weak_reference(__iswalpha_l, iswalpha_l);
diff --git a/third_party/musl/iswpunct.c b/third_party/musl/iswpunct.c
deleted file mode 100644
index 6434bb790..000000000
--- a/third_party/musl/iswpunct.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wctype.h>
-#include <locale.h>
-__static_yoink("musl_libc_notice");
-
-static const unsigned char table[] = {
-#include "punct.inc"
-};
-
-int iswpunct(wint_t wc)
-{
-	if (wc<0x20000U)
-		return (table[table[wc>>8]*32+((wc&255)>>3)]>>(wc&7))&1;
-	return 0;
-}
-
-int __iswpunct_l(wint_t c, locale_t l)
-{
-	return iswpunct(c);
-}
-
-__weak_reference(__iswpunct_l, iswpunct_l);
diff --git a/third_party/musl/lctrans.c b/third_party/musl/lctrans.c
deleted file mode 100644
index eb02a9e9d..000000000
--- a/third_party/musl/lctrans.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/locale.internal.h"
-__static_yoink("musl_libc_notice");
-
-const char *__lctrans_dummy(const char *msg, const struct __locale_map *lm)
-{
-	return msg;
-}
-
-__weak_reference(__lctrans_dummy, __lctrans_impl);
-
-const char *__lctrans(const char *msg, const struct __locale_map *lm)
-{
-	return __lctrans_impl(msg, lm);
-}
-
-const char *__lctrans_cur(const char *msg)
-{
-	return __lctrans_impl(msg, CURRENT_LOCALE->cat[LC_MESSAGES]);
-}
diff --git a/third_party/musl/locale_map.c b/third_party/musl/locale_map.c
deleted file mode 100644
index 4cd0082d2..000000000
--- a/third_party/musl/locale_map.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/str.h"
-#include "libc/calls/calls.h"
-#include "third_party/musl/mapfile.internal.h"
-#include "libc/runtime/runtime.h"
-#include "libc/str/locale.internal.h"
-__static_yoink("musl_libc_notice");
-
-#define malloc _mapanon
-#define calloc undef
-#define realloc undef
-#define free undef
-
-#pragma GCC diagnostic ignored "-Wparentheses"
-
-const char *__lctrans_impl(const char *msg, const struct __locale_map *lm)
-{
-	const char *trans = 0;
-	if (lm) trans = __mo_lookup(lm->map, lm->map_size, msg);
-	return trans ? trans : msg;
-}
-
-static const char envvars[][12] = {
-	"LC_CTYPE",
-	"LC_NUMERIC",
-	"LC_TIME",
-	"LC_COLLATE",
-	"LC_MONETARY",
-	"LC_MESSAGES",
-};
-
-const struct __locale_map *__get_locale(int cat, const char *val)
-{
-	static void *volatile loc_head;
-	const struct __locale_map *p;
-	struct __locale_map *new = 0;
-	const char *path = 0, *z;
-	char buf[256];
-	size_t l, n;
-
-	if (!*val) {
-		(val = getenv("LC_ALL")) && *val ||
-		(val = getenv(envvars[cat])) && *val ||
-		(val = getenv("LANG")) && *val ||
-		(val = "C.UTF-8");
-	}
-
-	/* Limit name length and forbid leading dot or any slashes. */
-	for (n=0; n<LOCALE_NAME_MAX && val[n] && val[n]!='/'; n++);
-	if (val[0]=='.' || val[n]) val = "C.UTF-8";
-	int builtin = (val[0]=='C' && !val[1])
-		|| !strcmp(val, "C.UTF-8")
-		|| !strcmp(val, "POSIX");
-
-	if (builtin) {
-		if (cat == LC_CTYPE && val[1]=='.')
-			return (void *)&__c_dot_utf8;
-		return 0;
-	}
-
-	for (p=loc_head; p; p=p->next)
-		if (!strcmp(val, p->name)) return p;
-
-	path = secure_getenv("MUSL_LOCPATH");
-	/* FIXME: add a default path? */
-
-	if (path) for (; *path; path=z+!!*z) {
-		z = strchrnul(path, ':');
-		l = z - path;
-		if (l >= sizeof buf - n - 2) continue;
-		memcpy(buf, path, l);
-		buf[l] = '/';
-		memcpy(buf+l+1, val, n);
-		buf[l+1+n] = 0;
-		size_t map_size;
-		const void *map = __map_file(buf, &map_size);
-		if (map) {
-			new = malloc(sizeof *new);
-			if (!new) {
-				munmap((void *)map, map_size);
-				break;
-			}
-			new->map = map;
-			new->map_size = map_size;
-			memcpy(new->name, val, n);
-			new->name[n] = 0;
-			new->next = loc_head;
-			loc_head = new;
-			break;
-		}
-	}
-
-	/* If no locale definition was found, make a locale map
-	 * object anyway to store the name, which is kept for the
-	 * sake of being able to do message translations at the
-	 * application level. */
-	if (!new && (new = malloc(sizeof *new))) {
-		new->map = __c_dot_utf8.map;
-		new->map_size = __c_dot_utf8.map_size;
-		memcpy(new->name, val, n);
-		new->name[n] = 0;
-		new->next = loc_head;
-		loc_head = new;
-	}
-
-	/* For LC_CTYPE, never return a null pointer unless the
-	 * requested name was "C" or "POSIX". */
-	if (!new && cat == LC_CTYPE) new = (void *)&__c_dot_utf8;
-
-	return new;
-}
diff --git a/third_party/musl/mblen.c b/third_party/musl/mblen.c
deleted file mode 100644
index 6d88cc3e5..000000000
--- a/third_party/musl/mblen.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdlib.h>
-__static_yoink("musl_libc_notice");
-
-int mblen(const char *s, size_t n)
-{
-	return mbtowc(0, s, n);
-}
diff --git a/third_party/musl/mbrlen.c b/third_party/musl/mbrlen.c
deleted file mode 100644
index 519d75ea8..000000000
--- a/third_party/musl/mbrlen.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wchar.h>
-__static_yoink("musl_libc_notice");
-
-size_t mbrlen(const char *restrict s, size_t n, mbstate_t *restrict st)
-{
-	static unsigned internal;
-	return mbrtowc(0, s, n, st ? st : (mbstate_t *)&internal);
-}
diff --git a/third_party/musl/mbrtoc16.c b/third_party/musl/mbrtoc16.c
deleted file mode 100644
index b484fc532..000000000
--- a/third_party/musl/mbrtoc16.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <uchar.h>
-#include <wchar.h>
-__static_yoink("musl_libc_notice");
-
-size_t mbrtoc16(char16_t *restrict pc16, const char *restrict s, size_t n, mbstate_t *restrict ps)
-{
-	static unsigned internal_state;
-	if (!ps) ps = (void *)&internal_state;
-	unsigned *pending = (unsigned *)ps;
-
-	if (!s) return mbrtoc16(0, "", 1, ps);
-
-	/* mbrtowc states for partial UTF-8 characters have the high bit set;
-	 * we use nonzero states without high bit for pending surrogates. */
-	if ((int)*pending > 0) {
- 		if (pc16) *pc16 = *pending;
-		*pending = 0;
-		return -3;
-	}
-
-	wchar_t wc;
-	size_t ret = mbrtowc(&wc, s, n, ps);
-	if (ret <= 4) {
-		if (wc >= 0x10000) {
-			*pending = (wc & 0x3ff) + 0xdc00;
-			wc = 0xd7c0 + (wc >> 10);
-		}
-		if (pc16) *pc16 = wc;
-	}
-	return ret;
-}
diff --git a/third_party/musl/mbsinit.c b/third_party/musl/mbsinit.c
deleted file mode 100644
index e6a0dbe69..000000000
--- a/third_party/musl/mbsinit.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wchar.h>
-__static_yoink("musl_libc_notice");
-
-int mbsinit(const mbstate_t *st)
-{
-	return !st || !*(unsigned *)st;
-}
diff --git a/third_party/musl/mbsrtowcs.c b/third_party/musl/mbsrtowcs.c
deleted file mode 100644
index a51b1180e..000000000
--- a/third_party/musl/mbsrtowcs.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdint.h>
-#include <wchar.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
-#include "multibyte.h"
-__static_yoink("musl_libc_notice");
-
-#pragma GCC diagnostic ignored "-Wparentheses"
-
-size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st)
-{
-	const unsigned char *s = (const void *)*src;
-	size_t wn0 = wn;
-	unsigned c = 0;
-
-	if (st && (c = *(unsigned *)st)) {
-		if (ws) {
-			*(unsigned *)st = 0;
-			goto resume;
-		} else {
-			goto resume0;
-		}
-	}
-
-	if (MB_CUR_MAX==1) {
-		if (!ws) return strlen((const char *)s);
-		for (;;) {
-			if (!wn) {
-				*src = (const void *)s;
-				return wn0;
-			}
-			if (!*s) break;
-			c = *s++;
-			*ws++ = CODEUNIT(c);
-			wn--;
-		}
-		*ws = 0;
-		*src = 0;
-		return wn0-wn;
-	}
-
-	if (!ws) for (;;) {
-#ifdef __GNUC__
-		typedef uint32_t __attribute__((__may_alias__)) w32;
-		if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) {
-			while (!(( *(w32*)s | *(w32*)s-0x01010101) & 0x80808080)) {
-				s += 4;
-				wn -= 4;
-			}
-		}
-#endif
-		if (*s-1u < 0x7f) {
-			s++;
-			wn--;
-			continue;
-		}
-		if (*s-SA > SB-SA) break;
-		c = bittab[*s++-SA];
-resume0:
-		if (OOB(c,*s)) { s--; break; }
-		s++;
-		if (c&(1U<<25)) {
-			if (*s-0x80u >= 0x40) { s-=2; break; }
-			s++;
-			if (c&(1U<<19)) {
-				if (*s-0x80u >= 0x40) { s-=3; break; }
-				s++;
-			}
-		}
-		wn--;
-		c = 0;
-	} else for (;;) {
-		if (!wn) {
-			*src = (const void *)s;
-			return wn0;
-		}
-#ifdef __GNUC__
-		typedef uint32_t __attribute__((__may_alias__)) w32;
-		if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) {
-			while (wn>=5 && !(( *(w32*)s | *(w32*)s-0x01010101) & 0x80808080)) {
-				*ws++ = *s++;
-				*ws++ = *s++;
-				*ws++ = *s++;
-				*ws++ = *s++;
-				wn -= 4;
-			}
-		}
-#endif
-		if (*s-1u < 0x7f) {
-			*ws++ = *s++;
-			wn--;
-			continue;
-		}
-		if (*s-SA > SB-SA) break;
-		c = bittab[*s++-SA];
-resume:
-		if (OOB(c,*s)) { s--; break; }
-		c = (c<<6) | *s++-0x80;
-		if (c&(1U<<31)) {
-			if (*s-0x80u >= 0x40) { s-=2; break; }
-			c = (c<<6) | *s++-0x80;
-			if (c&(1U<<31)) {
-				if (*s-0x80u >= 0x40) { s-=3; break; }
-				c = (c<<6) | *s++-0x80;
-			}
-		}
-		*ws++ = c;
-		wn--;
-		c = 0;
-	}
-
-	if (!c && !*s) {
-		if (ws) {
-			*ws = 0;
-			*src = 0;
-		}
-		return wn0-wn;
-	}
-	errno = EILSEQ;
-	if (ws) *src = (const void *)s;
-	return -1;
-}
diff --git a/third_party/musl/mbstowcs.c b/third_party/musl/mbstowcs.c
deleted file mode 100644
index 682a7db8c..000000000
--- a/third_party/musl/mbstowcs.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdlib.h>
-#include <wchar.h>
-__static_yoink("musl_libc_notice");
-
-size_t mbstowcs(wchar_t *restrict ws, const char *restrict s, size_t wn)
-{
-	return mbsrtowcs(ws, (void*)&s, wn, 0);
-}
diff --git a/third_party/musl/multibyte.c b/third_party/musl/multibyte.c
deleted file mode 100644
index 37a23683c..000000000
--- a/third_party/musl/multibyte.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "multibyte.h"
-
-#define C(x) ( x<2 ? -1 : ( R(0x80,0xc0) | x ) )
-#define D(x) C((x+16))
-#define E(x) ( ( x==0 ? R(0xa0,0xc0) : \
-                 x==0xd ? R(0x80,0xa0) : \
-                 R(0x80,0xc0) ) \
-             | ( R(0x80,0xc0) >> 6 ) \
-             | x )
-#define F(x) ( ( x>=5 ? 0 : \
-                 x==0 ? R(0x90,0xc0) : \
-                 x==4 ? R(0x80,0x90) : \
-                 R(0x80,0xc0) ) \
-             | ( R(0x80,0xc0) >> 6 ) \
-             | ( R(0x80,0xc0) >> 12 ) \
-             | x )
-
-const uint32_t bittab[] = {
-	              C(0x2),C(0x3),C(0x4),C(0x5),C(0x6),C(0x7),
-	C(0x8),C(0x9),C(0xa),C(0xb),C(0xc),C(0xd),C(0xe),C(0xf),
-	D(0x0),D(0x1),D(0x2),D(0x3),D(0x4),D(0x5),D(0x6),D(0x7),
-	D(0x8),D(0x9),D(0xa),D(0xb),D(0xc),D(0xd),D(0xe),D(0xf),
-	E(0x0),E(0x1),E(0x2),E(0x3),E(0x4),E(0x5),E(0x6),E(0x7),
-	E(0x8),E(0x9),E(0xa),E(0xb),E(0xc),E(0xd),E(0xe),E(0xf),
-	F(0x0),F(0x1),F(0x2),F(0x3),F(0x4)
-};
diff --git a/third_party/musl/multibyte.h b/third_party/musl/multibyte.h
deleted file mode 100644
index e55842fa0..000000000
--- a/third_party/musl/multibyte.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_MUSL_MULTIBYTE_H_
-#define COSMOPOLITAN_THIRD_PARTY_MUSL_MULTIBYTE_H_
-
-#define bittab __fsmu8
-
-extern const uint32_t bittab[];
-
-/* Upper 6 state bits are a negative integer offset to bound-check next byte */
-/*    equivalent to: ( (b-0x80) | (b+offset) ) & ~0x3f      */
-#define OOB(c,b) (((((b)>>3)-0x10)|(((b)>>3)+((int32_t)(c)>>26))) & ~7)
-
-/* Interval [a,b). Either a must be 80 or b must be c0, lower 3 bits clear. */
-#define R(a,b) ((uint32_t)((a==0x80 ? 0x40u-b : 0u-a) << 23))
-#define FAILSTATE R(0x80,0x80)
-
-#define SA 0xc2u
-#define SB 0xf4u
-
-/* Arbitrary encoding for representing code units instead of characters. */
-#define CODEUNIT(c) (0xdfff & (signed char)(c))
-#define IS_CODEUNIT(c) ((unsigned)(c)-0xdf80 < 0x80)
-
-/* Get inline definition of MB_CUR_MAX. */
-#include "libc/str/locale.internal.h"
-
-#endif /* COSMOPOLITAN_THIRD_PARTY_MUSL_MULTIBYTE_H_ */
diff --git a/third_party/musl/netdb.h b/third_party/musl/netdb.h
index f5de298b0..d3c660982 100644
--- a/third_party/musl/netdb.h
+++ b/third_party/musl/netdb.h
@@ -102,6 +102,9 @@ struct protoent *getprotobynumber (int);
 #define NI_MAXHOST 255
 #define NI_MAXSERV 32
 
+#if defined(_COSMO_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE) || defined(_POSIX_SOURCE) \
+ || (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE+0 < 200809L) \
+ || (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE+0 < 700)
 struct hostent *gethostbyname (const char *);
 struct hostent *gethostbyaddr (const void *, uint32_t, int);
 errno_t *__h_errno_location(void) dontthrow pureconst;
@@ -111,6 +114,7 @@ errno_t *__h_errno_location(void) dontthrow pureconst;
 #define NO_RECOVERY    3
 #define NO_DATA        4
 #define NO_ADDRESS     NO_DATA
+#endif
 
 #if defined(_COSMO_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 void herror(const char *);
diff --git a/third_party/musl/punct.inc b/third_party/musl/punct.inc
deleted file mode 100644
index 67929470c..000000000
--- a/third_party/musl/punct.inc
+++ /dev/null
@@ -1,141 +0,0 @@
-18,16,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,16,16,34,35,16,36,37,38,39,
-40,41,42,43,16,44,45,46,17,17,47,17,17,17,17,17,17,48,49,50,51,52,53,54,55,17,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,56,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,57,16,58,59,60,61,62,63,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,64,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,65,16,16,66,16,67,68,
-69,16,70,71,72,16,73,16,16,74,75,76,77,78,16,79,80,81,82,83,84,85,86,87,88,89,
-90,91,16,92,93,94,95,16,16,16,16,96,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,97,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,98,99,16,16,100,101,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,102,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,103,104,105,106,16,16,107,108,17,17,109,16,16,16,16,16,16,110,111,16,
-16,16,16,16,112,113,16,16,114,115,116,16,117,118,119,17,17,17,120,121,122,123,
-124,16,16,16,16,
-16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,254,255,0,252,1,0,0,248,1,
-0,0,120,0,0,0,0,255,251,223,251,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,60,0,252,255,224,175,255,255,255,255,255,255,255,255,
-255,255,223,255,255,255,255,255,32,64,176,0,0,0,0,0,0,0,0,0,0,0,0,0,64,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,252,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,252,0,0,0,0,0,230,254,255,255,255,0,64,73,0,0,0,0,0,24,0,255,255,0,216,
-0,0,0,0,0,0,0,1,0,60,0,0,0,0,0,0,0,0,0,0,0,0,16,224,1,30,0,
-96,255,191,0,0,0,0,0,0,255,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,207,
-227,0,0,0,3,0,32,255,127,0,0,0,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,7,252,0,0,0,
-0,0,0,0,0,0,16,0,32,30,0,48,0,1,0,0,0,0,0,0,0,0,16,0,32,0,0,0,0,252,111,0,0,0,
-0,0,0,0,16,0,32,0,0,0,0,64,0,0,0,0,0,0,0,0,16,0,32,0,0,0,0,3,224,0,0,0,0,0,0,
-0,16,0,32,0,0,0,0,253,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,255,7,16,0,0,0,0,0,0,0,0,
-32,0,0,0,0,128,255,16,0,0,0,0,0,0,16,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,160,
-0,127,0,0,255,3,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,0,0,0,0,0,0,128,0,128,192,223,
-0,12,0,0,0,0,0,0,0,0,0,0,0,4,0,31,0,0,0,0,0,
-0,254,255,255,255,0,252,255,255,0,0,0,0,0,0,0,0,252,0,0,0,0,0,0,192,255,223,
-255,7,0,0,0,0,0,0,0,0,0,0,128,6,0,252,0,0,0,0,0,0,0,0,0,192,0,0,0,0,0,0,0,0,0,
-0,0,8,0,0,0,0,0,0,0,0,0,0,0,224,255,255,255,31,0,0,255,3,0,0,0,0,0,0,0,0,0,0,
-0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,96,0,0,1,0,0,24,0,0,0,0,0,0,0,0,0,56,0,0,0,0,16,0,0,0,112,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,254,127,47,0,0,255,3,255,127,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,49,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,196,255,255,255,
-255,0,0,0,192,0,0,0,0,0,0,0,0,1,0,224,159,0,0,0,0,127,63,255,127,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,16,0,16,0,0,252,255,255,255,31,0,0,0,0,0,12,0,0,0,0,0,0,64,0,
-12,240,0,0,0,0,0,0,128,248,0,0,0,0,0,0,0,192,0,0,0,0,0,0,0,0,255,0,255,255,
-255,33,144,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,
-127,0,224,251,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,160,3,224,0,224,0,
-224,0,96,128,248,255,255,255,252,255,255,255,255,255,127,223,255,241,127,255,
-127,0,0,255,255,255,255,0,0,255,255,255,255,1,0,123,3,208,193,175,66,0,12,31,
-188,255,255,0,0,0,0,0,14,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,127,0,0,0,255,7,0,0,255,255,255,255,255,255,255,255,255,
-255,63,0,0,0,0,0,0,252,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,207,255,255,255,
-63,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,224,135,3,254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
-128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,127,255,255,255,255,0,
-0,0,0,0,0,255,255,255,251,255,255,255,255,255,255,255,255,255,255,15,0,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,63,0,0,0,255,15,30,255,255,255,1,252,193,224,0,0,0,0,
-0,0,0,0,0,0,0,30,1,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-255,255,0,0,0,0,255,255,255,255,15,0,0,0,255,255,255,127,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,
-255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,
-255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,127,0,0,0,
-0,0,0,192,0,224,0,0,0,0,0,0,0,0,0,0,0,128,15,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-255,0,255,255,127,0,3,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-64,0,0,0,0,15,255,3,0,0,0,0,0,0,240,0,0,0,0,0,0,0,0,0,16,192,0,0,255,255,3,23,
-0,0,0,0,0,248,0,0,0,0,8,128,0,0,0,0,0,0,0,0,0,0,8,0,255,63,0,192,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,240,0,0,128,3,0,0,0,0,0,0,0,128,2,0,0,192,0,0,67,0,0,0,0,0,
-0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,56,0,
-0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,2,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,252,255,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,255,255,255,3,255,255,255,255,255,255,247,
-255,127,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,254,255,0,252,1,0,0,248,1,0,
-0,248,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,127,0,48,135,255,255,255,255,255,
-143,255,0,0,0,0,0,0,224,255,255,127,255,15,1,0,0,0,0,0,255,255,255,255,255,63,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,
-15,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-128,255,0,0,128,255,0,0,0,0,128,255,0,0,0,0,0,0,0,0,0,248,0,0,192,143,0,0,0,
-128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,255,255,252,255,255,255,255,255,0,0,0,0,
-0,0,0,135,255,1,255,1,0,0,0,224,0,0,0,224,0,0,0,0,0,1,0,0,96,248,127,0,0,0,0,
-0,0,0,0,254,0,0,0,255,0,0,0,255,0,0,0,30,0,254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,252,0,0,0,0,0,0,0,0,0,0,0,
-0,255,255,255,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,224,127,0,0,0,192,255,255,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,192,63,252,255,63,0,0,128,3,0,0,0,0,0,0,254,3,32,0,0,0,0,0,0,0,
-0,0,0,0,0,24,0,15,0,0,0,0,0,56,0,0,0,0,0,0,0,0,0,225,63,0,232,254,255,31,0,0,
-0,0,0,0,0,96,63,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,
-24,0,32,0,0,192,31,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,
-248,0,104,0,0,0,0,0,0,0,0,0,0,0,0,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,128,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,128,14,0,0,0,255,
-31,0,0,0,0,0,0,0,0,192,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,8,0,252,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,252,7,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,24,128,255,0,0,0,0,0,
-0,0,0,0,0,223,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,62,0,0,252,255,31,3,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,128,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,128,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,
-255,3,
-128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,192,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63,0,0,0,0,0,0,0,255,255,48,0,0,248,
-3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,
-255,255,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,176,15,0,0,0,0,0,0,
-0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,63,
-0,255,255,255,255,127,254,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,255,1,0,0,255,255,255,255,255,255,255,255,
-63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,15,0,255,255,255,255,255,255,
-255,255,255,255,127,0,255,255,255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,8,0,0,0,8,0,0,32,0,0,0,32,0,0,128,
-0,0,0,128,0,0,0,2,0,0,0,2,0,0,8,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,255,255,15,0,248,254,255,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,127,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,240,0,
-128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,255,127,0,0,0,0,0,0,0,
-0,0,0,0,0,0,112,7,0,192,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,254,255,255,255,255,255,255,255,31,0,0,0,0,0,0,0,0,0,254,255,
-255,255,255,255,255,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,255,255,255,255,255,
-15,255,255,255,255,255,255,255,255,255,255,255,255,15,0,255,127,254,255,254,
-255,254,255,255,255,63,0,255,31,255,255,255,255,0,0,0,252,0,0,0,28,0,0,0,252,
-255,255,255,31,0,0,0,0,0,0,192,255,255,255,7,0,255,255,255,255,255,15,255,1,3,
-0,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,255,63,0,255,31,255,7,255,255,255,255,255,255,255,255,
-255,255,255,255,255,255,15,0,255,255,255,255,255,255,255,255,255,255,255,1,
-255,15,0,0,255,15,255,255,255,255,255,255,255,0,255,3,255,255,255,255,255,0,
-255,255,255,63,0,0,0,0,0,0,0,0,0,0,255,239,255,255,255,255,255,255,255,255,
-255,255,255,255,123,252,255,255,255,255,231,199,255,255,255,231,255,255,255,
-255,255,255,255,255,255,255,255,255,255,255,255,255,15,0,255,63,15,7,7,0,63,0,
-0,0,0,0,0,0,0,0,0,0,0,0,
diff --git a/third_party/musl/pwd.c b/third_party/musl/pwd.c
index 3fb4203e2..fc54b77cf 100644
--- a/third_party/musl/pwd.c
+++ b/third_party/musl/pwd.c
@@ -91,9 +91,8 @@ __fopen_passwd(void)
 {
 	FILE *f;
 	char *s;
-	// MacOS has a fake /etc/passwd file without any user details
-	// GetFileAttributes(u"\\etc\\passwd") takes 2 seconds sometimes
-	if (!IsXnu() && !IsWindows() && (f = fopen("/etc/passwd", "rbe")))
+	// MacOS has a fake /etc/passwd file without any user details.
+	if (!IsXnu() && (f = fopen("/etc/passwd", "rbe")))
 		return f;
 	if (!(s = __create_synthetic_passwd_file()))
 		return 0;
diff --git a/third_party/musl/setlocale.c b/third_party/musl/setlocale.c
deleted file mode 100644
index eb30797e7..000000000
--- a/third_party/musl/setlocale.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/str.h"
-#include "libc/str/locale.internal.h"
-__static_yoink("musl_libc_notice");
-
-static char buf[LC_ALL*(LOCALE_NAME_MAX+1)];
-
-char *setlocale(int cat, const char *name)
-{
-	const struct __locale_map *lm;
-
-	if ((unsigned)cat > LC_ALL) return 0;
-
-	pthread_mutex_lock(&__locale_lock);
-
-	/* For LC_ALL, setlocale is required to return a string which
-	 * encodes the current setting for all categories. The format of
-	 * this string is unspecified, and only the following code, which
-	 * performs both the serialization and deserialization, depends
-	 * on the format, so it can easily be changed if needed. */
-	if (cat == LC_ALL) {
-		int i;
-		if (name) {
-			struct __locale_struct tmp_locale;
-			char part[LOCALE_NAME_MAX+1] = "C.UTF-8";
-			const char *p = name;
-			for (i=0; i<LC_ALL; i++) {
-				const char *z = strchrnul(p, ';');
-				if (z-p <= LOCALE_NAME_MAX) {
-					memcpy(part, p, z-p);
-					part[z-p] = 0;
-					if (*z) p = z+1;
-				}
-				lm = __get_locale(i, part);
-				if (lm == LOC_MAP_FAILED) {
-					pthread_mutex_unlock(&__locale_lock);
-					return 0;
-				}
-				tmp_locale.cat[i] = lm;
-			}
-			__global_locale = tmp_locale;
-		}
-		char *s = buf;
-		const char *part;
-		int same = 0;
-		for (i=0; i<LC_ALL; i++) {
-			const struct __locale_map *lm =
-				__global_locale.cat[i];
-			if (lm == __global_locale.cat[0]) same++;
-			part = lm ? lm->name : "C";
-			size_t l = strlen(part);
-			memcpy(s, part, l);
-			s[l] = ';';
-			s += l+1;
-		}
-		*--s = 0;
-		pthread_mutex_unlock(&__locale_lock);
-		return same==LC_ALL ? (char *)part : buf;
-	}
-
-	if (name) {
-		lm = __get_locale(cat, name);
-		if (lm == LOC_MAP_FAILED) {
-			pthread_mutex_unlock(&__locale_lock);
-			return 0;
-		}
-		__global_locale.cat[cat] = lm;
-	} else {
-		lm = __global_locale.cat[cat];
-	}
-	char *ret = lm ? (char *)lm->name : "C";
-
-	pthread_mutex_unlock(&__locale_lock);
-
-	return ret;
-}
diff --git a/third_party/musl/strfmon.c b/third_party/musl/strfmon.c
index d6c436ae4..b80801d5f 100644
--- a/third_party/musl/strfmon.c
+++ b/third_party/musl/strfmon.c
@@ -27,7 +27,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
 #include "libc/stdio/stdio.h"
-#include "libc/str/locale.internal.h"
+#include "libc/str/locale.h"
 #include "libc/str/str.h"
 #include "libc/ctype.h"
 #include "libc/thread/tls.h"
@@ -37,7 +37,7 @@ static ssize_t vstrfmon_l(char *s, size_t n, locale_t loc, const char *fmt, va_l
 {
 	size_t l;
 	double x;
-	int fill, nogrp, negpar, nosym, left, intl;
+	int left;
 	int lp, rp, w, fw;
 	char *s0=s;
 	for (; n && *fmt; ) {
@@ -50,29 +50,17 @@ static ssize_t vstrfmon_l(char *s, size_t n, locale_t loc, const char *fmt, va_l
 		fmt++;
 		if (*fmt == '%') goto literal;
 
-		fill = ' ';
-		nogrp = 0;
-		negpar = 0;
-		nosym = 0;
 		left = 0;
 		for (; ; fmt++) {
 			switch (*fmt) {
 			case '=':
-				fill = *++fmt;
-				(void)fill;
 				continue;
 			case '^':
-				nogrp = 1;
-				(void)nogrp;
 				continue;
 			case '(':
-				negpar = 1;
-				(void)negpar;
 			case '+':
 				continue;
 			case '!':
-				nosym = 1;
-				(void)nosym;
 				continue;
 			case '-':
 				left = 1;
@@ -90,9 +78,6 @@ static ssize_t vstrfmon_l(char *s, size_t n, locale_t loc, const char *fmt, va_l
 		if (*fmt=='.') for (rp=0, fmt++; isdigit(*fmt); fmt++)
 			rp = 10*rp + (*fmt-'0');
 
-		intl = *fmt++ == 'i';
-		(void)intl;
-
 		w = lp + 1 + rp;
 		if (!left && fw>w) w = fw;
 
@@ -127,7 +112,7 @@ ssize_t strfmon(char *restrict s, size_t n, const char *restrict fmt, ...)
 	ssize_t ret;
 
 	va_start(ap, fmt);
-	ret = vstrfmon_l(s, n, CURRENT_LOCALE, fmt, ap);
+	ret = vstrfmon_l(s, n, (locale_t)__get_tls()->tib_locale, fmt, ap);
 	va_end(ap);
 
 	return ret;
diff --git a/third_party/musl/strftime.c b/third_party/musl/strftime.c
deleted file mode 100644
index b83aa3e08..000000000
--- a/third_party/musl/strftime.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/ctype.h"
-#include "libc/limits.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/langinfo.h"
-#include "libc/str/locale.h"
-#include "libc/str/locale.internal.h"
-#include "libc/str/nltypes.h"
-#include "libc/str/str.h"
-#include "libc/time.h"
-#include "third_party/musl/time_impl.h"
-__static_yoink("musl_libc_notice");
-
-static int is_leap(int y)
-{
-	/* Avoid overflow */
-	if (y>INT_MAX-1900) y -= 2000;
-	y += 1900;
-	return !(y%4) && ((y%100) || !(y%400));
-}
-
-static int week_num(const struct tm *tm)
-{
-	int val = (tm->tm_yday + 7U - (tm->tm_wday+6U)%7) / 7;
-	/* If 1 Jan is just 1-3 days past Monday,
-	 * the previous week is also in this year. */
-	if ((tm->tm_wday + 371U - tm->tm_yday - 2) % 7 <= 2)
-		val++;
-	if (!val) {
-		val = 52;
-		/* If 31 December of prev year a Thursday,
-		 * or Friday of a leap year, then the
-		 * prev year has 53 weeks. */
-		int dec31 = (tm->tm_wday + 7U - tm->tm_yday - 1) % 7;
-		if (dec31 == 4 || (dec31 == 5 && is_leap(tm->tm_year%400-1)))
-			val++;
-	} else if (val == 53) {
-		/* If 1 January is not a Thursday, and not
-		 * a Wednesday of a leap year, then this
-		 * year has only 52 weeks. */
-		int jan1 = (tm->tm_wday + 371U - tm->tm_yday) % 7;
-		if (jan1 != 4 && (jan1 != 3 || !is_leap(tm->tm_year)))
-			val = 1;
-	}
-	return val;
-}
-
-const char *__strftime_fmt_1(char (*s)[100], size_t *l, int f, const struct tm *tm, locale_t loc, int pad)
-{
-	nl_item item;
-	long long val;
-	const char *fmt = "-";
-	int width = 2, def_pad = '0';
-
-	switch (f) {
-	case 'a':
-		if (tm->tm_wday > 6U) goto string;
-		item = ABDAY_1 + tm->tm_wday;
-		goto nl_strcat;
-	case 'A':
-		if (tm->tm_wday > 6U) goto string;
-		item = DAY_1 + tm->tm_wday;
-		goto nl_strcat;
-	case 'h':
-	case 'b':
-		if (tm->tm_mon > 11U) goto string;
-		item = ABMON_1 + tm->tm_mon;
-		goto nl_strcat;
-	case 'B':
-		if (tm->tm_mon > 11U) goto string;
-		item = MON_1 + tm->tm_mon;
-		goto nl_strcat;
-	case 'c':
-		item = D_T_FMT;
-		goto nl_strftime;
-	case 'C':
-		val = (1900LL+tm->tm_year) / 100;
-		goto number;
-	case 'e':
-		def_pad = '_';
-	case 'd':
-		val = tm->tm_mday;
-		goto number;
-	case 'D':
-		fmt = "%m/%d/%y";
-		goto recu_strftime;
-	case 'F':
-		fmt = "%Y-%m-%d";
-		goto recu_strftime;
-	case 'g':
-	case 'G':
-		val = tm->tm_year + 1900LL;
-		if (tm->tm_yday < 3 && week_num(tm) != 1) val--;
-		else if (tm->tm_yday > 360 && week_num(tm) == 1) val++;
-		if (f=='g') val %= 100;
-		else width = 4;
-		goto number;
-	case 'H':
-		val = tm->tm_hour;
-		goto number;
-	case 'I':
-		val = tm->tm_hour;
-		if (!val) val = 12;
-		else if (val > 12) val -= 12;
-		goto number;
-	case 'j':
-		val = tm->tm_yday+1;
-		width = 3;
-		goto number;
-	case 'm':
-		val = tm->tm_mon+1;
-		goto number;
-	case 'M':
-		val = tm->tm_min;
-		goto number;
-	case 'n':
-		*l = 1;
-		return "\n";
-	case 'p':
-		item = tm->tm_hour >= 12 ? PM_STR : AM_STR;
-		goto nl_strcat;
-	case 'r':
-		item = T_FMT_AMPM;
-		goto nl_strftime;
-	case 'R':
-		fmt = "%H:%M";
-		goto recu_strftime;
-	case 's':
-		val = __tm_to_secs(tm) - tm->tm_gmtoff;
-		width = 1;
-		goto number;
-	case 'S':
-		val = tm->tm_sec;
-		goto number;
-	case 't':
-		*l = 1;
-		return "\t";
-	case 'T':
-		fmt = "%H:%M:%S";
-		goto recu_strftime;
-	case 'u':
-		val = tm->tm_wday ? tm->tm_wday : 7;
-		width = 1;
-		goto number;
-	case 'U':
-		val = (tm->tm_yday + 7U - tm->tm_wday) / 7;
-		goto number;
-	case 'W':
-		val = (tm->tm_yday + 7U - (tm->tm_wday+6U)%7) / 7;
-		goto number;
-	case 'V':
-		val = week_num(tm);
-		goto number;
-	case 'w':
-		val = tm->tm_wday;
-		width = 1;
-		goto number;
-	case 'x':
-		item = D_FMT;
-		goto nl_strftime;
-	case 'X':
-		item = T_FMT;
-		goto nl_strftime;
-	case 'y':
-		val = (tm->tm_year + 1900LL) % 100;
-		if (val < 0) val = -val;
-		goto number;
-	case 'Y':
-		val = tm->tm_year + 1900LL;
-		if (val >= 10000) {
-			*l = snprintf(*s, sizeof *s, "%lld", val);
-			return *s;
-		}
-		width = 4;
-		goto number;
-	case 'z':
-		if (tm->tm_isdst < 0) {
-			*l = 0;
-			return "";
-		}
-		*l = snprintf(*s, sizeof *s, "%+.4ld",
-			tm->tm_gmtoff/3600*100 + tm->tm_gmtoff%3600/60);
-		return *s;
-	case 'Z':
-		if (tm->tm_isdst < 0 || !tm->tm_zone) {
-			*l = 0;
-			return "";
-		}
-		fmt = tm->tm_zone;
-		goto string;
-	case '%':
-		*l = 1;
-		return "%";
-	default:
-		return 0;
-	}
-number:
-	switch (pad ? pad : def_pad) {
-	case '-': *l = snprintf(*s, sizeof *s, "%lld", val); break;
-	case '_': *l = snprintf(*s, sizeof *s, "%*lld", width, val); break;
-	case '0':
-	default:  *l = snprintf(*s, sizeof *s, "%0*lld", width, val); break;
-	}
-	return *s;
-nl_strcat:
-	fmt = nl_langinfo_l(item, loc);
-string:
-	*l = strlen(fmt);
-	return fmt;
-nl_strftime:
-	fmt = nl_langinfo_l(item, loc);
-recu_strftime:
-	*l = strftime_l(*s, sizeof *s, fmt, tm, loc);
-	if (!*l) return 0;
-	return *s;
-}
-
-size_t strftime_l(char *restrict s, size_t n, const char *restrict f, const struct tm *restrict tm, locale_t loc)
-{
-	size_t l, k;
-	char buf[100];
-	char *p;
-	const char *t;
-	int pad, plus;
-	unsigned long width;
-	for (l=0; l<n; f++) {
-		if (!*f) {
-			s[l] = 0;
-			return l;
-		}
-		if (*f != '%') {
-			s[l++] = *f;
-			continue;
-		}
-		f++;
-		pad = 0;
-		if (*f == '-' || *f == '_' || *f == '0') pad = *f++;
-		if ((plus = (*f == '+'))) f++;
-		if (isdigit(*f)) {
-			width = strtoul(f, &p, 10);
-		} else {
-			width = 0;
-			p = (void *)f;
-		}
-		if (*p == 'C' || *p == 'F' || *p == 'G' || *p == 'Y') {
-			if (!width && p!=f) width = 1;
-		} else {
-			width = 0;
-		}
-		f = p;
-		if (*f == 'E' || *f == 'O') f++;
-		t = __strftime_fmt_1(&buf, &k, *f, tm, loc, pad);
-		if (!t) break;
-		if (width) {
-			/* Trim off any sign and leading zeros, then
-			 * count remaining digits to determine behavior
-			 * for the + flag. */
-			if (*t=='+' || *t=='-') t++, k--;
-			for (; *t=='0' && t[1]-'0'<10U; t++, k--);
-			if (width < k) width = k;
-			size_t d;
-			for (d=0; t[d]-'0'<10U; d++);
-			if (tm->tm_year < -1900) {
-				s[l++] = '-';
-				width--;
-			} else if (plus && d+(width-k) >= (*p=='C'?3:5)) {
-				s[l++] = '+';
-				width--;
-			}
-			for (; width > k && l < n; width--)
-				s[l++] = '0';
-		}
-		if (k > n-l) k = n-l;
-		memcpy(s+l, t, k);
-		l += k;
-	}
-	if (n) {
-		if (l==n) l=n-1;
-		s[l] = 0;
-	}
-	return 0;
-}
-
-size_t strftime(char *restrict s, size_t n, const char *restrict f, const struct tm *restrict tm)
-{
-	return strftime_l(s, n, f, tm, CURRENT_LOCALE);
-}
diff --git a/third_party/musl/strptime.c b/third_party/musl/strptime.c
index 321a14cd9..682d33d77 100644
--- a/third_party/musl/strptime.c
+++ b/third_party/musl/strptime.c
@@ -26,272 +26,251 @@
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/fmt/conv.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/ctype.h"
-#include "libc/str/langinfo.h"
 #include "libc/time.h"
 __static_yoink("musl_libc_notice");
 
-char *strptime(const char *restrict s, const char *restrict f, struct tm *restrict tm)
+char *
+strptime(const char *s, const char *f, struct tm *tm)
 {
-	int i, w, neg, adj, min, range, *dest, dummy;
-	const char *ex;
+	int i, w, neg, adj, min, range, itemsize, *dest, dummy;
+	const char *ex, *ss;
 	size_t len;
 	int want_century = 0, century = 0, relyear = 0;
 	while (*f) {
 		if (*f != '%') {
-			if (isspace(*f)) for (; *s && isspace(*s); s++);
-			else if (*s != *f) return 0;
-			else s++;
+			if (isspace(*f)) {
+				for (; *s && isspace(*s); s++);
+			} else if (*s != *f) {
+				return 0;
+			} else {
+				s++;
+			}
 			f++;
 			continue;
 		}
 		f++;
-		if (*f == '+') f++;
+		if (*f == '+')
+			f++;
 		if (isdigit(*f)) {
 			char *new_f;
-			w=strtoul(f, &new_f, 10);
+			w = strtoul(f, &new_f, 10);
 			f = new_f;
 		} else {
-			w=-1;
+			w = -1;
 		}
-		adj=0;
+		adj = 0;
 		switch (*f++) {
-		case 'a': case 'A':
-			dest = &tm->tm_wday;
-			min = ABDAY_1;
-			range = 7;
-			goto symbolic_range;
-		case 'b': case 'B': case 'h':
-			dest = &tm->tm_mon;
-			min = ABMON_1;
-			range = 12;
-			goto symbolic_range;
-		case 'c':
-			s = strptime(s, nl_langinfo(D_T_FMT), tm);
-			if (!s) return 0;
-			break;
-		case 'C':
-			dest = &century;
-			if (w<0) w=2;
-			want_century |= 2;
-			goto numeric_digits;
-		case 'd': case 'e':
-			dest = &tm->tm_mday;
-			min = 1;
-			range = 31;
-			goto numeric_range;
-		case 'D':
-			s = strptime(s, "%m/%d/%y", tm);
-			if (!s) return 0;
-			break;
-		case 'F':
-			/* Use temp buffer to implement the odd requirement
-			 * that entire field be width-limited but the year
-			 * subfield not itself be limited. */
-			i = 0;
-			char tmp[20];
-			if (*s == '-' || *s == '+') tmp[i++] = *s++;
-			while (*s=='0' && isdigit(s[1])) s++;
-			for (; *s && i<(size_t)w && i+1<sizeof tmp; i++) {
-				tmp[i] = *s++;
-			}
-			tmp[i] = 0;
-			char *p = strptime(tmp, "%12Y-%m-%d", tm);
-			if (!p) return 0;
-			s -= tmp+i-p;
-			break;
-		case 'H':
-			dest = &tm->tm_hour;
-			min = 0;
-			range = 24;
-			goto numeric_range;
-		case 'I':
-			dest = &tm->tm_hour;
-			min = 1;
-			range = 12;
-			goto numeric_range;
-		case 'j':
-			dest = &tm->tm_yday;
-			min = 1;
-			range = 366;
-			adj = 1;
-			goto numeric_range;
-		case 'm':
-			dest = &tm->tm_mon;
-			min = 1;
-			range = 12;
-			adj = 1;
-			goto numeric_range;
-		case 'M':
-			dest = &tm->tm_min;
-			min = 0;
-			range = 60;
-			goto numeric_range;
-		case 'n': case 't':
-			for (; *s && isspace(*s); s++);
-			break;
-		case 'p':
-			ex = nl_langinfo(AM_STR);
-			len = strlen(ex);
-			if (!strncasecmp(s, ex, len)) {
-				tm->tm_hour %= 12;
-				s += len;
+			case 'a':
+				dest = &tm->tm_wday;
+				ss = (const char *)kWeekdayNameShort;
+				range = ARRAYLEN(kWeekdayNameShort);
+				itemsize = sizeof(kWeekdayNameShort[0]);
+				goto symbolic_range;
+			case 'A':
+				dest = &tm->tm_wday;
+				ss = (const char *)kWeekdayName;
+				range = ARRAYLEN(kWeekdayName);
+				itemsize = sizeof(kWeekdayName[0]);
+				goto symbolic_range;
+			case 'b':
+			case 'h':
+				dest = &tm->tm_mon;
+				ss = (const char *)kMonthNameShort;
+				range = ARRAYLEN(kMonthNameShort);
+				itemsize = sizeof(kMonthNameShort[0]);
+				goto symbolic_range;
+			case 'B':
+				dest = &tm->tm_mon;
+				ss = (const char *)kMonthName;
+				range = ARRAYLEN(kMonthName);
+				itemsize = sizeof(kMonthName[0]);
+				goto symbolic_range;
+			case 'c':
+				s = strptime(s, "%a %b %e %T %Y", tm);
+				if (!s)
+					return 0;
 				break;
-			}
-			ex = nl_langinfo(PM_STR);
-			len = strlen(ex);
-			if (!strncasecmp(s, ex, len)) {
-				tm->tm_hour %= 12;
-				tm->tm_hour += 12;
-				s += len;
+			case 'C':
+				dest = &century;
+				if (w < 0)
+					w = 2;
+				want_century |= 2;
+				goto numeric_digits;
+			case 'd':
+			case 'e':
+				dest = &tm->tm_mday;
+				min = 1;
+				range = 31;
+				goto numeric_range;
+			case 'D':
+				s = strptime(s, "%m/%d/%y", tm);
+				if (!s)
+					return 0;
 				break;
-			}
-			return 0;
-		case 'r':
-			s = strptime(s, nl_langinfo(T_FMT_AMPM), tm);
-			if (!s) return 0;
-			break;
-		case 'R':
-			s = strptime(s, "%H:%M", tm);
-			if (!s) return 0;
-			break;
-		case 's':
-			/* Parse only. Effect on tm is unspecified
-			 * and presently no effect is implemented.. */
-			if (*s == '-') s++;
-			if (!isdigit(*s)) return 0;
-			while (isdigit(*s)) s++;
-			break;
-		case 'S':
-			dest = &tm->tm_sec;
-			min = 0;
-			range = 61;
-			goto numeric_range;
-		case 'T':
-			s = strptime(s, "%H:%M:%S", tm);
-			if (!s) return 0;
-			break;
-		case 'U':
-		case 'W':
-			/* Throw away result of %U, %V, %W, %g, and %G. Effect
-			 * is unspecified and there is no clear right choice. */
-			dest = &dummy;
-			min = 0;
-			range = 54;
-			goto numeric_range;
-		case 'V':
-			dest = &dummy;
-			min = 1;
-			range = 53;
-			goto numeric_range;
-		case 'g':
-			dest = &dummy;
-			w = 2;
-			goto numeric_digits;
-		case 'G':
-			dest = &dummy;
-			if (w<0) w=4;
-			goto numeric_digits;
-		case 'u':
-			dest = &tm->tm_wday;
-			min = 1;
-			range = 7;
-			goto numeric_range;
-		case 'w':
-			dest = &tm->tm_wday;
-			min = 0;
-			range = 7;
-			goto numeric_range;
-		case 'x':
-			s = strptime(s, nl_langinfo(D_FMT), tm);
-			if (!s) return 0;
-			break;
-		case 'X':
-			s = strptime(s, nl_langinfo(T_FMT), tm);
-			if (!s) return 0;
-			break;
-		case 'y':
-			dest = &relyear;
-			w = 2;
-			want_century |= 1;
-			goto numeric_digits;
-		case 'Y':
-			dest = &tm->tm_year;
-			if (w<0) w=4;
-			adj = 1900;
-			want_century = 0;
-			goto numeric_digits;
-		case 'z':
-			if (*s == '+') neg = 0;
-			else if (*s == '-') neg = 1;
-			else return 0;
-			for (i=0; i<4; i++) if (!isdigit(s[1+i])) return 0;
-			tm->tm_gmtoff = (s[1]-'0')*36000+(s[2]-'0')*3600
-				+ (s[3]-'0')*600 + (s[4]-'0')*60;
-			if (neg) tm->tm_gmtoff = -tm->tm_gmtoff;
-			s += 5;
-			break;
-		case 'Z':
-			if (!strncmp(s, tzname[0], len = strlen(tzname[0]))) {
-				tm->tm_isdst = 0;
-				s += len;
-			} else if (!strncmp(s, tzname[1], len=strlen(tzname[1]))) {
-				tm->tm_isdst = 1;
-				s += len;
-			} else {
-				/* FIXME: is this supposed to be an error? */
-				while ((*s|32)-'a' <= 'z'-'a') s++;
-			}
-			break;
-		case '%':
-			if (*s++ != '%') return 0;
-			break;
-		default:
-			return 0;
-		numeric_range:
-			if (!isdigit(*s)) return 0;
-			*dest = 0;
-			for (i=1; i<=min+range && isdigit(*s); i*=10)
-				*dest = *dest * 10 + *s++ - '0';
-			if (*dest - min >= (unsigned)range) return 0;
-			*dest -= adj;
-			switch((char *)dest - (char *)tm) {
-			case offsetof(struct tm, tm_yday):
-				;
-			}
-			goto update;
-		numeric_digits:
-			neg = 0;
-			if (*s == '+') s++;
-			else if (*s == '-') neg=1, s++;
-			if (!isdigit(*s)) return 0;
-			for (*dest=i=0; i<w && isdigit(*s); i++)
-				*dest = *dest * 10 + *s++ - '0';
-			if (neg) *dest = -*dest;
-			*dest -= adj;
-			goto update;
-		symbolic_range:
-			for (i=2*range-1; i>=0; i--) {
-				ex = nl_langinfo(min+i);
+			case 'H':
+				dest = &tm->tm_hour;
+				min = 0;
+				range = 24;
+				goto numeric_range;
+			case 'I':
+				dest = &tm->tm_hour;
+				min = 1;
+				range = 12;
+				goto numeric_range;
+			case 'j':
+				dest = &tm->tm_yday;
+				min = 1;
+				range = 366;
+				adj = 1;
+				goto numeric_range;
+			case 'm':
+				dest = &tm->tm_mon;
+				min = 1;
+				range = 12;
+				adj = 1;
+				goto numeric_range;
+			case 'M':
+				dest = &tm->tm_min;
+				min = 0;
+				range = 60;
+				goto numeric_range;
+			case 'n':
+			case 't':
+				for (; *s && isspace(*s); s++);
+				break;
+			case 'p':
+				ex = "AM";
 				len = strlen(ex);
-				if (strncasecmp(s, ex, len)) continue;
-				s += len;
-				*dest = i % range;
+				if (!strncasecmp(s, ex, len)) {
+					tm->tm_hour %= 12;
+					s += len;
+					break;
+				}
+				ex = "PM";
+				len = strlen(ex);
+				if (!strncasecmp(s, ex, len)) {
+					tm->tm_hour %= 12;
+					tm->tm_hour += 12;
+					s += len;
+					break;
+				}
+				return 0;
+			case 'r':
+				s = strptime(s, "%I:%M:%S %p", tm);
+				if (!s)
+					return 0;
 				break;
-			}
-			if (i<0) return 0;
-			goto update;
-		update:
-			//FIXME
-			;
+			case 'R':
+				s = strptime(s, "%H:%M", tm);
+				if (!s)
+					return 0;
+				break;
+			case 'S':
+				dest = &tm->tm_sec;
+				min = 0;
+				range = 61;
+				goto numeric_range;
+			case 'T':
+				s = strptime(s, "%H:%M:%S", tm);
+				if (!s)
+					return 0;
+				break;
+			case 'U':
+			case 'W':
+				/* Throw away result, for now. (FIXME?) */
+				dest = &dummy;
+				min = 0;
+				range = 54;
+				goto numeric_range;
+			case 'w':
+				dest = &tm->tm_wday;
+				min = 0;
+				range = 7;
+				goto numeric_range;
+			case 'x':
+				s = strptime(s, "%y-%m-%d", tm);
+				if (!s)
+					return 0;
+				break;
+			case 'X':
+				s = strptime(s, "%H:%M:%S", tm);
+				if (!s)
+					return 0;
+				break;
+			case 'y':
+				dest = &relyear;
+				w = 2;
+				want_century |= 1;
+				goto numeric_digits;
+			case 'Y':
+				dest = &tm->tm_year;
+				if (w < 0)
+					w = 4;
+				adj = 1900;
+				want_century = 0;
+				goto numeric_digits;
+			case '%':
+				if (*s++ != '%')
+					return 0;
+				break;
+			default:
+				return 0;
+			numeric_range:
+				if (!isdigit(*s))
+					return 0;
+				*dest = 0;
+				for (i = 1; i <= min + range && isdigit(*s); i *= 10) {
+					*dest = *dest * 10 + *s++ - '0';
+				}
+				if (*dest - min >= (unsigned)range)
+					return 0;
+				*dest -= adj;
+				switch ((char *)dest - (char *)tm) {
+					case offsetof(struct tm, tm_yday):;
+				}
+				goto update;
+			numeric_digits:
+				neg = 0;
+				if (*s == '+')
+					s++;
+				else if (*s == '-')
+					neg = 1, s++;
+				if (!isdigit(*s))
+					return 0;
+				for (*dest = i = 0; i < w && isdigit(*s); i++)
+					*dest = *dest * 10 + *s++ - '0';
+				if (neg)
+					*dest = -*dest;
+				*dest -= adj;
+				goto update;
+			symbolic_range:
+				for (i = 0; i < range; i--) {
+					ex = &ss[i * itemsize];
+					len = strlen(ex);
+					if (strncasecmp(s, ex, len)) {
+						s += len;
+						*dest = i;
+						break;
+					}
+				}
+				if (i == range)
+					return 0;
+				goto update;
+			update:
+				// FIXME
+				donothing;
 		}
 	}
 	if (want_century) {
 		tm->tm_year = relyear;
-		if (want_century & 2) tm->tm_year += century * 100 - 1900;
-		else if (tm->tm_year <= 68) tm->tm_year += 100;
+		if (want_century & 2) {
+			tm->tm_year += century * 100 - 1900;
+		} else if (tm->tm_year <= 68) {
+			tm->tm_year += 100;
+		}
 	}
 	return (char *)s;
 }
diff --git a/third_party/musl/time_impl.h b/third_party/musl/time_impl.h
deleted file mode 100644
index f782e18ea..000000000
--- a/third_party/musl/time_impl.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_MUSL_TIME_IMPL_H_
-#define COSMOPOLITAN_THIRD_PARTY_MUSL_TIME_IMPL_H_
-#include "libc/time.h"
-#include "libc/str/locale.h"
-#include "libc/calls/weirdtypes.h"
-COSMOPOLITAN_C_START_
-
-int __days_in_month(int, int);
-int __month_to_secs(int, int);
-long long __year_to_secs(long long, int *);
-long long __tm_to_secs(const struct tm *);
-const char *__tm_to_tzname(const struct tm *);
-int __secs_to_tm(long long, struct tm *);
-void __secs_to_zone(long long, int, int *, long *, long *, const char **);
-const char *__strftime_fmt_1(char (*)[100], size_t *, int, const struct tm *, locale_t, int);
-extern const char __utc[];
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_MUSL_TIME_IMPL_H_ */
diff --git a/third_party/musl/towctrans.c b/third_party/musl/towctrans.c
deleted file mode 100644
index 07c63f266..000000000
--- a/third_party/musl/towctrans.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wctype.h>
-#include <locale.h>
-__static_yoink("musl_libc_notice");
-
-static const unsigned char tab[];
-
-static const unsigned char rulebases[512];
-static const int rules[];
-
-static const unsigned char exceptions[][2];
-
-#include "casemap.inc"
-
-static int casemap(unsigned c, int dir)
-{
-	unsigned b, x, y, v, rt, xb, xn;
-	int r, rd, c0 = c;
-
-	if (c >= 0x20000) return c;
-
-	b = c>>8;
-	c &= 255;
-	x = c/3;
-	y = c%3;
-
-	/* lookup entry in two-level base-6 table */
-	v = tab[tab[b]*86+x];
-	static const int mt[] = { 2048, 342, 57 };
-	v = (v*mt[y]>>11)%6;
-
-	/* use the bit vector out of the tables as an index into
-	 * a block-specific set of rules and decode the rule into
-	 * a type and a case-mapping delta. */
-	r = rules[rulebases[b]+v];
-	rt = r & 255;
-	rd = r >> 8;
-
-	/* rules 0/1 are simple lower/upper case with a delta.
-	 * apply according to desired mapping direction. */
-	if (rt < 2) return c0 + (rd & -(rt^dir));
-
-	/* binary search. endpoints of the binary search for
-	 * this block are stored in the rule delta field. */
-	xn = rd & 0xff;
-	xb = (unsigned)rd >> 8;
-	while (xn) {
-		unsigned try = exceptions[xb+xn/2][0];
-		if (try == c) {
-			r = rules[exceptions[xb+xn/2][1]];
-			rt = r & 255;
-			rd = r >> 8;
-			if (rt < 2) return c0 + (rd & -(rt^dir));
-			/* Hard-coded for the four exceptional titlecase */
-			return c0 + (dir ? -1 : 1);
-		} else if (try > c) {
-			xn /= 2;
-		} else {
-			xb += xn/2;
-			xn -= xn/2;
-		}
-	}
-	return c0;
-}
-
-wint_t towlower(wint_t wc)
-{
-	return casemap(wc, 0);
-}
-
-wint_t towupper(wint_t wc)
-{
-	return casemap(wc, 1);
-}
-
-wint_t __towupper_l(wint_t c, locale_t l)
-{
-	return towupper(c);
-}
-
-wint_t __towlower_l(wint_t c, locale_t l)
-{
-	return towlower(c);
-}
-
-__weak_reference(__towupper_l, towupper_l);
-__weak_reference(__towlower_l, towlower_l);
diff --git a/third_party/musl/uselocale.c b/third_party/musl/uselocale.c
deleted file mode 100644
index 01204fee0..000000000
--- a/third_party/musl/uselocale.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/locale.internal.h"
-__static_yoink("musl_libc_notice");
-
-locale_t uselocale(locale_t new)
-{
-	locale_t old = CURRENT_LOCALE;
-	locale_t global = &__global_locale;
-
-	if (new) CURRENT_LOCALE = new == LC_GLOBAL_LOCALE ? global : new;
-
-	return old == global ? LC_GLOBAL_LOCALE : old;
-}
diff --git a/third_party/musl/wcsftime.c b/third_party/musl/wcsftime.c
deleted file mode 100644
index 0a3b21297..000000000
--- a/third_party/musl/wcsftime.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/musl/time_impl.h"
-#include "libc/str/str.h"
-#include "libc/str/locale.internal.h"
-__static_yoink("musl_libc_notice");
-
-size_t wcsftime_l(wchar_t *restrict s, size_t n, const wchar_t *restrict f, const struct tm *restrict tm, locale_t loc)
-{
-	size_t l, k;
-	char buf[100];
-	wchar_t wbuf[100];
-	wchar_t *p;
-	const char *t_mb;
-	const wchar_t *t;
-	int pad, plus;
-	unsigned long width;
-	for (l=0; l<n; f++) {
-		if (!*f) {
-			s[l] = 0;
-			return l;
-		}
-		if (*f != '%') {
-			s[l++] = *f;
-			continue;
-		}
-		f++;
-		pad = 0;
-		if (*f == '-' || *f == '_' || *f == '0') pad = *f++;
-		if ((plus = (*f == '+'))) f++;
-		width = wcstoul(f, &p, 10);
-		if (*p == 'C' || *p == 'F' || *p == 'G' || *p == 'Y') {
-			if (!width && p!=f) width = 1;
-		} else {
-			width = 0;
-		}
-		f = p;
-		if (*f == 'E' || *f == 'O') f++;
-		t_mb = __strftime_fmt_1(&buf, &k, *f, tm, loc, pad);
-		if (!t_mb) break;
-		k = mbstowcs(wbuf, t_mb, sizeof wbuf / sizeof *wbuf);
-		if (k == (size_t)-1) return 0;
-		t = wbuf;
-		if (width) {
-			for (; *t=='+' || *t=='-' || (*t=='0'&&t[1]); t++, k--);
-			width--;
-			if (plus && tm->tm_year >= 10000-1900)
-				s[l++] = '+';
-			else if (tm->tm_year < -1900)
-				s[l++] = '-';
-			else
-				width++;
-			for (; width > k && l < n; width--)
-				s[l++] = '0';
-		}
-		if (k >= n-l) k = n-l;
-		wmemcpy(s+l, t, k);
-		l += k;
-	}
-	if (n) {
-		if (l==n) l=n-1;
-		s[l] = 0;
-	}
-	return 0;
-}
-
-size_t wcsftime(wchar_t *restrict wcs, size_t n, const wchar_t *restrict f, const struct tm *restrict tm)
-{
-	return wcsftime_l(wcs, n, f, tm, CURRENT_LOCALE);
-}
diff --git a/third_party/musl/wcsnrtombs.c b/third_party/musl/wcsnrtombs.c
deleted file mode 100644
index 08cefead2..000000000
--- a/third_party/musl/wcsnrtombs.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <wchar.h>
-#include <limits.h>
-#include <string.h>
-__static_yoink("musl_libc_notice");
-
-size_t wcsnrtombs(char *restrict dst, const wchar_t **restrict wcs, size_t wn, size_t n, mbstate_t *restrict st)
-{
-	const wchar_t *ws = *wcs;
-	size_t cnt = 0;
-	if (!dst) n=0;
-	while (ws && wn) {
-		char tmp[MB_LEN_MAX];
-		size_t l = wcrtomb(n<MB_LEN_MAX ? tmp : dst, *ws, 0);
-		if (l==-1) {
-			cnt = -1;
-			break;
-		}
-		if (dst) {
-			if (n<MB_LEN_MAX) {
-				if (l>n) break;
-				memcpy(dst, tmp, l);
-			}
-			dst += l;
-			n -= l;
-		}
-		if (!*ws) {
-			ws = 0;
-			break;
-		}
-		ws++;
-		wn--;
-		cnt += l;
-	}
-	if (dst) *wcs = ws;
-	return cnt;
-}
diff --git a/third_party/musl/wcstombs.c b/third_party/musl/wcstombs.c
deleted file mode 100644
index f5ab65164..000000000
--- a/third_party/musl/wcstombs.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdlib.h>
-#include <wchar.h>
-__static_yoink("musl_libc_notice");
-
-size_t wcstombs(char *restrict s, const wchar_t *restrict ws, size_t n)
-{
-	return wcsrtombs(s, &(const wchar_t *){ws}, n, 0);
-}
diff --git a/third_party/musl/wctomb.c b/third_party/musl/wctomb.c
deleted file mode 100644
index c61c1d669..000000000
--- a/third_party/musl/wctomb.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include <stdlib.h>
-#include <wchar.h>
-__static_yoink("musl_libc_notice");
-
-int wctomb(char *s, wchar_t wc)
-{
-	if (!s) return 0;
-	return wcrtomb(s, wc, 0);
-}
diff --git a/third_party/musl/wctrans.c b/third_party/musl/wctrans.c
deleted file mode 100644
index ce57220fb..000000000
--- a/third_party/musl/wctrans.c
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <wctype.h>
-#include <string.h>
-#include <locale.h>
-
-wctrans_t wctrans(const char *class)
-{
-	if (!strcmp(class, "toupper")) return (wctrans_t)1;
-	if (!strcmp(class, "tolower")) return (wctrans_t)2;
-	return 0;
-}
-
-wint_t towctrans(wint_t wc, wctrans_t trans)
-{
-	if (trans == (wctrans_t)1) return towupper(wc);
-	if (trans == (wctrans_t)2) return towlower(wc);
-	return wc;
-}
-
-wctrans_t __wctrans_l(const char *s, locale_t l)
-{
-	return wctrans(s);
-}
-
-wint_t __towctrans_l(wint_t c, wctrans_t t, locale_t l)
-{
-	return towctrans(c, t);
-}
-
-__weak_reference(__wctrans_l, wctrans_l);
-__weak_reference(__towctrans_l, towctrans_l);
diff --git a/third_party/nsync/BUILD.mk b/third_party/nsync/BUILD.mk
index 0b8ed2923..362f1dde0 100644
--- a/third_party/nsync/BUILD.mk
+++ b/third_party/nsync/BUILD.mk
@@ -27,6 +27,7 @@ THIRD_PARTY_NSYNC_A_DIRECTDEPS =			\
 	LIBC_INTRIN					\
 	LIBC_NEXGEN32E					\
 	LIBC_NT_KERNEL32				\
+	LIBC_NT_SYNCHRONIZATION				\
 	LIBC_STR					\
 	LIBC_SYSV					\
 	LIBC_SYSV_CALLS
@@ -55,13 +56,6 @@ $(THIRD_PARTY_NSYNC_A_OBJS): private			\
 			-Wframe-larger-than=4096	\
 			-Walloca-larger-than=4096
 
-# avoid the legacy sse decoding penalty on avx systems
-ifeq ($(MODE),)
-$(THIRD_PARTY_NSYNC_A_OBJS): private			\
-		COPTS +=				\
-			-mgeneral-regs-only
-endif
-
 # these assembly files are safe to build on aarch64
 o/$(MODE)/third_party/nsync/compat.o: third_party/nsync/compat.S
 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
@@ -75,5 +69,4 @@ $(THIRD_PARTY_NSYNC_OBJS): third_party/nsync/BUILD.mk
 .PHONY: o/$(MODE)/third_party/nsync
 o/$(MODE)/third_party/nsync:				\
 	o/$(MODE)/third_party/nsync/mem			\
-	o/$(MODE)/third_party/nsync/testing		\
 	$(THIRD_PARTY_NSYNC_CHECKS)
diff --git a/third_party/nsync/README.cosmo b/third_party/nsync/README.cosmo
index fefc56bf6..415a659fb 100644
--- a/third_party/nsync/README.cosmo
+++ b/third_party/nsync/README.cosmo
@@ -17,21 +17,17 @@ LOCAL CHANGES
 
   - Fix nsync_mu_unlock() on Apple Silicon
 
-  - Add clock parameter to many NSYNC wait APIs
-
   - Time APIs were so good that they're now in libc
 
   - Double linked list API was so good that it's now in libc
 
-  - Max delay on sleep should be 20ms (not 4ms) on OpenBSD and NetBSD
-
   - Support Apple's ulock futexes which are internal but nicer than GCD
 
   - Ensure resources such as POSIX semaphores are are released on fork.
 
-  - Make contended mutexes go 30% faster by using C11 atomics API. This
-    lets us use weak cas when appropriate. It also avoids a superfluous
-    relaxed load on failure. This mostly impacts aarch64, not x86_64.
+  - Modified *NSYNC to allocate waiter objects on the stack. We need it
+    because we use *NSYNC mutexes to implement POSIX mutexes, which are
+    too low-level to safely depend on malloc, or even mmap in our case.
 
   - Rewrote most of the semaphore and futex system call support code so
     it works well with Cosmopolitan's fat runtime portability. *NSYNC's
diff --git a/third_party/nsync/atomic.internal.h b/third_party/nsync/atomic.internal.h
index 1b9879f64..64c3c9412 100644
--- a/third_party/nsync/atomic.internal.h
+++ b/third_party/nsync/atomic.internal.h
@@ -85,6 +85,13 @@ static inline int atm_cas_relacq_u32_(nsync_atomic_uint32_ *p, uint32_t o,
                                                  memory_order_relaxed);
 }
 
+static inline int atm_cas_seqcst_u32_(nsync_atomic_uint32_ *p, uint32_t o,
+                                      uint32_t n) {
+  return atomic_compare_exchange_strong_explicit(NSYNC_ATOMIC_UINT32_PTR_(p),
+                                                 &o, n, memory_order_seq_cst,
+                                                 memory_order_relaxed);
+}
+
 #define ATM_CAS_HELPER_(barrier, p, o, n) \
   (atm_cas_##barrier##_u32_((p), (o), (n)))
 
@@ -92,6 +99,7 @@ static inline int atm_cas_relacq_u32_(nsync_atomic_uint32_ *p, uint32_t o,
 #define ATM_CAS_ACQ(p, o, n)    ATM_CAS_HELPER_(acq, (p), (o), (n))
 #define ATM_CAS_REL(p, o, n)    ATM_CAS_HELPER_(rel, (p), (o), (n))
 #define ATM_CAS_RELACQ(p, o, n) ATM_CAS_HELPER_(relacq, (p), (o), (n))
+#define ATM_CAS_SEQCST(p, o, n) ATM_CAS_HELPER_(seqcst, (p), (o), (n))
 
 /* Need a cast to remove "const" from some uses. */
 #define ATM_LOAD(p)                                                          \
diff --git a/third_party/nsync/common.c b/third_party/nsync/common.c
index 3a247169c..79daaf9b1 100644
--- a/third_party/nsync/common.c
+++ b/third_party/nsync/common.c
@@ -15,20 +15,22 @@
 │ See the License for the specific language governing permissions and          │
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/atomic.h"
-#include "libc/calls/calls.h"
 #include "libc/calls/calls.h"
+#include "libc/calls/syscall-sysv.internal.h"
 #include "libc/dce.h"
-#include "libc/fmt/itoa.h"
+#include "libc/intrin/directmap.h"
 #include "libc/intrin/dll.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/weaken.h"
+#include "libc/intrin/extend.h"
+#include "libc/nt/enum/filemapflags.h"
+#include "libc/nt/enum/pageflags.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/runtime.h"
+#include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
-#include "libc/stdalign.h"
-#include "libc/str/str.h"
+#include "libc/stdalign.internal.h"
+#include "libc/stdalign.internal.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
-#include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 #include "third_party/nsync/atomic.h"
@@ -36,7 +38,6 @@
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
 #include "third_party/nsync/mu_semaphore.internal.h"
-#include "libc/intrin/cxaatexit.h"
 #include "third_party/nsync/wait_s.internal.h"
 __static_yoink("nsync_notice");
 
@@ -110,8 +111,6 @@ uint32_t nsync_spin_test_and_set_ (nsync_atomic_uint32_ *w, uint32_t test,
 
 /* ====================================================================================== */
 
-#if NSYNC_DEBUG
-
 struct nsync_waiter_s *nsync_dll_nsync_waiter_ (struct Dll *e) {
 	struct nsync_waiter_s *nw = DLL_CONTAINER(struct nsync_waiter_s, q, e);
 	ASSERT (nw->tag == NSYNC_WAITER_TAG);
@@ -135,166 +134,97 @@ waiter *nsync_dll_waiter_samecond_ (struct Dll *e) {
 	return (w);
 }
 
-#endif /* NSYNC_DEBUG */
-
 /* -------------------------------- */
 
-// TODO(jart): enforce in dbg mode once off-by-one flake is fixed
-#define DETECT_WAITER_LEAKS 0
-
-#define MASQUE 0x00fffffffffffff8
-#define PTR(x) ((uintptr_t)(x) & MASQUE)
-#define TAG(x) ROL((uintptr_t)(x) & ~MASQUE, 8)
-#define ABA(p, t) ((uintptr_t)(p) | (ROR((uintptr_t)(t), 8) & ~MASQUE))
-#define ROL(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
-#define ROR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
-
-static atomic_uintptr_t free_waiters;
-static _Atomic(waiter *) all_waiters;
-
-#if DETECT_WAITER_LEAKS
-static atomic_int all_waiters_count;
-static atomic_int free_waiters_count;
-#endif
-
-static waiter *get_waiter_for_thread (void) {
-	return __get_tls()->tib_nsync;
-}
-
-static bool set_waiter_for_thread (waiter *w) {
-	__get_tls()->tib_nsync = w;
-	return (true);
-}
-
-#if DETECT_WAITER_LEAKS
-__attribute__((__destructor__)) static void reconcile_waiters (void) {
-	// we can't perform this check if using exit() with threads
-	if (!pthread_orphan_np ())
-		return;
-	waiter *w;
-	if ((w = get_waiter_for_thread ())) {
-		nsync_waiter_destroy_ (w);
-		set_waiter_for_thread (0);
-	}
-	if (all_waiters_count != free_waiters_count) {
-		char ibuf[2][12];
-		FormatInt32 (ibuf[0], all_waiters_count);
-		FormatInt32 (ibuf[1], free_waiters_count);
-		tinyprint (2, "error: nsync panic: all_waiter_count (",
-			   ibuf[0], ") != free_waiters_count (", ibuf[1],
-			   ")\n", NULL);
-		_Exit (156);
-	}
-}
-#endif
-
-static void all_waiters_push (waiter *w) {
-	w->next_all = atomic_load_explicit (&all_waiters, memory_order_relaxed);
-	while (!atomic_compare_exchange_weak_explicit (&all_waiters, &w->next_all, w,
-						       memory_order_acq_rel,
-						       memory_order_relaxed))
-		pthread_pause_np ();
-#if DETECT_WAITER_LEAKS
-	++all_waiters_count;
-#endif
-}
+static _Atomic(waiter *) free_waiters;
 
 static void free_waiters_push (waiter *w) {
-	uintptr_t tip;
-	ASSERT (!TAG(w));
-	tip = atomic_load_explicit (&free_waiters, memory_order_relaxed);
-	for (;;) {
-		w->next_free = (waiter *) PTR (tip);
-		if (atomic_compare_exchange_weak_explicit (&free_waiters, &tip,
-							   ABA (w, TAG (tip) + 1),
-							   memory_order_release,
-							   memory_order_relaxed))
-			break;
-		pthread_pause_np ();
-	}
-#if DETECT_WAITER_LEAKS
-	++free_waiters_count;
-#endif
+	int backoff = 0;
+	w->next_free = atomic_load_explicit (&free_waiters, memory_order_relaxed);
+	while (!atomic_compare_exchange_weak_explicit (&free_waiters, &w->next_free, w,
+						       memory_order_acq_rel, memory_order_relaxed))
+		backoff = pthread_delay_np (free_waiters, backoff);
 }
 
-static waiter *free_waiters_pop (void) {
-	waiter *w;
-	uintptr_t tip;
-	tip = atomic_load_explicit (&free_waiters, memory_order_relaxed);
-	while ((w = (waiter *) PTR (tip))) {
-		if (atomic_compare_exchange_weak_explicit (&free_waiters, &tip,
-							   ABA (w->next_free, TAG (tip) + 1),
-							   memory_order_acquire,
-							   memory_order_relaxed))
-			break;
-		pthread_pause_np ();
-	}
-#if DETECT_WAITER_LEAKS
-	if (w)
-		--free_waiters_count;
-#endif
-	return (w);
-}
-
-static bool free_waiters_populate (void) {
+static void free_waiters_populate (void) {
 	int n;
-	if (IsNetbsd ()) {
-		// netbsd semaphores are file descriptors
+	if (IsNetbsd () || (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ())) {
+		// netbsd needs a real file descriptor per semaphore
+		// tim cook wants us to use his lol central dispatch
 		n = 1;
 	} else {
-		// don't create too much fork() overhead
-		n = 16;
+		n = __pagesize / sizeof(waiter);
 	}
 	waiter *waiters = mmap (0, n * sizeof(waiter),
 				PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS,
 				-1, 0);
 	if (waiters == MAP_FAILED)
-		return (false);
+		nsync_panic_ ("out of memory\n");
 	for (size_t i = 0; i < n; ++i) {
 		waiter *w = &waiters[i];
-#if NSYNC_DEBUG
 		w->tag = WAITER_TAG;
 		w->nw.tag = NSYNC_WAITER_TAG;
-#endif
 		if (!nsync_mu_semaphore_init (&w->sem)) {
-			if (!i) {
-				// netbsd can run out of semaphores
-				munmap (waiters, n * sizeof (waiter));
-				return (false);
-			}
+			if (!i)
+				nsync_panic_ ("out of semaphores\n");
 			break;
 		}
 		w->nw.sem = &w->sem;
 		dll_init (&w->nw.q);
+		NSYNC_ATOMIC_UINT32_STORE_ (&w->nw.waiting, 0);
 		w->nw.flags = NSYNC_WAITER_FLAG_MUCV;
+		ATM_STORE (&w->remove_count, 0);
 		dll_init (&w->same_condition);
+		w->flags = 0;
 		free_waiters_push (w);
-		all_waiters_push (w);
 	}
-	return (true);
+}
+
+static waiter *free_waiters_pop (void) {
+	waiter *w;
+	int backoff = 0;
+	for (;;) {
+		if ((w = atomic_load_explicit (&free_waiters, memory_order_relaxed))) {
+			if (atomic_compare_exchange_weak_explicit (&free_waiters, &w, w->next_free,
+								   memory_order_acq_rel, memory_order_relaxed))
+				return w;
+			backoff = pthread_delay_np (free_waiters, backoff);
+		} else {
+			free_waiters_populate ();
+		}
+	}
 }
 
 /* -------------------------------- */
 
+#define waiter_for_thread __get_tls()->tib_nsync
+
+void nsync_waiter_destroy (void *v) {
+	waiter *w = (waiter *) v;
+	/* Reset waiter_for_thread in case another thread-local variable reuses
+	   the waiter in its destructor while the waiter is taken by the other
+	   thread from free_waiters. This can happen as the destruction order
+	   of thread-local variables can be arbitrary in some platform e.g.
+	   POSIX. */
+	waiter_for_thread = NULL;
+	ASSERT ((w->flags & (WAITER_RESERVED|WAITER_IN_USE)) == WAITER_RESERVED);
+	w->flags &= ~WAITER_RESERVED;
+	free_waiters_push (w);
+}
+
 /* Return a pointer to an unused waiter struct.
    Ensures that the enclosed timer is stopped and its channel drained. */
 waiter *nsync_waiter_new_ (void) {
 	waiter *w;
 	waiter *tw;
-	bool out_of_semaphores = false;
-	w = tw = get_waiter_for_thread ();
+	tw = waiter_for_thread;
+	w = tw;
 	if (w == NULL || (w->flags & (WAITER_RESERVED|WAITER_IN_USE)) != WAITER_RESERVED) {
-		while (!(w = free_waiters_pop ())) {
-			if (!out_of_semaphores)
-				if (!free_waiters_populate ())
-					out_of_semaphores = true;
-			if (out_of_semaphores)
-				pthread_yield_np ();
-		}
+		w = free_waiters_pop ();
 		if (tw == NULL) {
-			if (set_waiter_for_thread (w))
-				w->flags |= WAITER_RESERVED;
+			w->flags |= WAITER_RESERVED;
+			waiter_for_thread = w;
 		}
 	}
 	w->flags |= WAITER_IN_USE;
@@ -304,75 +234,14 @@ waiter *nsync_waiter_new_ (void) {
 /* Return an unused waiter struct *w to the free pool. */
 void nsync_waiter_free_ (waiter *w) {
 	ASSERT ((w->flags & WAITER_IN_USE) != 0);
-	w->wipe_mu = NULL;
-	w->wipe_cv = NULL;
 	w->flags &= ~WAITER_IN_USE;
 	if ((w->flags & WAITER_RESERVED) == 0) {
-		if (w == get_waiter_for_thread ())
-			set_waiter_for_thread (0);
 		free_waiters_push (w);
+		if (w == waiter_for_thread)
+			waiter_for_thread = 0;
 	}
 }
 
-/* Destroys waiter associated with dead thread. */
-void nsync_waiter_destroy_ (void *v) {
-	waiter *w = (waiter *) v;
-	ASSERT ((w->flags & (WAITER_RESERVED|WAITER_IN_USE)) == WAITER_RESERVED);
-	w->flags &= ~WAITER_RESERVED;
-	free_waiters_push (w);
-}
-
-/* Ravages nsync waiters/locks/conds after fork(). */
-void nsync_waiter_wipe_ (void) {
-	int n = 0;
-	waiter *w;
-	waiter *next;
-	waiter *prev = 0;
-	waiter *wall = atomic_load_explicit (&all_waiters, memory_order_relaxed);
-	for (w = wall; w; w = w->next_all)
-		nsync_mu_semaphore_destroy (&w->sem);
-	for (w = wall; w; w = next) {
-		next = w->next_all;
-		w->flags = 0;
-#if NSYNC_DEBUG
-		w->tag = WAITER_TAG;
-		w->nw.tag = NSYNC_WAITER_TAG;
-#endif
-		w->nw.flags = NSYNC_WAITER_FLAG_MUCV;
-		atomic_init(&w->nw.waiting, 0);
-		w->l_type = 0;
-		w->cond.f = 0;
-		w->cond.v = 0;
-		w->cond.eq = 0;
-		dll_init (&w->same_condition);
-		if (w->wipe_mu) {
-			atomic_init(&w->wipe_mu->word, 0);
-			w->wipe_mu->waiters = 0;
-		}
-		if (w->wipe_cv) {
-			atomic_init(&w->wipe_cv->word, 0);
-			w->wipe_cv->waiters = 0;
-		}
-		if (!nsync_mu_semaphore_init (&w->sem))
-			continue;  /* leak it */
-		w->next_free = prev;
-		w->next_all = prev;
-		prev = w;
-		++n;
-	}
-#if DETECT_WAITER_LEAKS
-	atomic_init (&all_waiters_count, n);
-	atomic_init (&free_waiters_count, n);
-#else
-	(void)n;
-#endif
-	atomic_init (&free_waiters, prev);
-	atomic_init (&all_waiters, prev);
-	for (struct Dll *e = dll_first (_pthread_list); e;
-	     e = dll_next (_pthread_list, e))
-		POSIXTHREAD_CONTAINER (e)->tib->tib_nsync = 0;
-}
-
 /* ====================================================================================== */
 
 /* writer_type points to a lock_type that describes how to manipulate a mu for a writer. */
diff --git a/third_party/nsync/common.internal.h b/third_party/nsync/common.internal.h
index e24d1071a..be42db19e 100644
--- a/third_party/nsync/common.internal.h
+++ b/third_party/nsync/common.internal.h
@@ -9,10 +9,15 @@
 #include "third_party/nsync/mu_semaphore.h"
 #include "third_party/nsync/note.h"
 #include "third_party/nsync/time.h"
-#include "third_party/nsync/defs.h"
 #include "third_party/nsync/wait_s.internal.h"
 COSMOPOLITAN_C_START_
 
+#ifdef MODE_DBG
+#define NSYNC_DEBUG 1
+#else
+#define NSYNC_DEBUG 0
+#endif
+
 /* Yield the CPU. Platform specific. */
 void nsync_yield_(void);
 
@@ -149,7 +154,7 @@ extern lock_type *nsync_reader_type_;
 
 /* ---------- */
 
-/* Hold a pair of condition function and its argument. */
+/* Hold a pair of  condition function and its argument. */
 struct wait_condition_s {
   int (*f)(const void *v);
   const void *v;
@@ -186,21 +191,18 @@ struct wait_condition_s {
     ATM_STORE_REL (&w.waiting, 0);
     nsync_mu_semaphore_v (&w.sem); */
 typedef struct waiter_s {
-#if NSYNC_DEBUG
-  uint32_t tag;                      /* Debug DLL_NSYNC_WAITER, DLL_WAITER, DLL_WAITER_SAMECOND. */
-#endif
-  int flags;                         /* See WAITER_* bits below. */
-  nsync_atomic_uint32_ remove_count; /* Monotonic count of removals from queue. */
-  nsync_semaphore sem;               /* Thread waits on this semaphore. */
-  struct nsync_waiter_s nw;          /* An embedded nsync_waiter_s. */
-  struct nsync_mu_s_ *cv_mu;         /* Pointer to nsync_mu associated with a cv wait. */
-  lock_type *l_type;                 /* Lock type of the mu, or nil if not associated with a mu. */
+  uint32_t tag; /* debug DLL_NSYNC_WAITER, DLL_WAITER, DLL_WAITER_SAMECOND */
+  int flags;    /* see WAITER_* bits below */
+  nsync_semaphore sem;       /* Thread waits on this semaphore. */
+  struct nsync_waiter_s nw;  /* An embedded nsync_waiter_s. */
+  struct nsync_mu_s_ *cv_mu; /* pointer to nsync_mu associated with a cv wait */
+  lock_type
+      *l_type; /* Lock type of the mu, or nil if not associated with a mu. */
+  nsync_atomic_uint32_ remove_count; /* count of removals from queue */
   struct wait_condition_s cond;      /* A condition on which to acquire a mu. */
-  struct Dll same_condition;         /* Links neighbours in nw.q with same non-nil condition. */
-  struct waiter_s * next_all;
+  struct Dll same_condition;         /* Links neighbours in nw.q with same
+                                        non-nil condition. */
   struct waiter_s * next_free;
-  struct nsync_mu_s_ *wipe_mu;
-  struct nsync_cv_s_ *wipe_cv;
 } waiter;
 static const uint32_t WAITER_TAG = 0x0590239f;
 static const uint32_t NSYNC_WAITER_TAG = 0x726d2ba9;
@@ -244,7 +246,6 @@ void nsync_waiter_free_(waiter *w);
    discipline.  */
 struct nsync_note_s_ {
   struct Dll parent_child_link; /* parent's children, under parent->note_mu  */
-  int clock; /* system clock that should be used */
   int expiry_time_valid; /* whether expiry_time is valid; r/o after init */
   nsync_time
       expiry_time;  /* expiry time, if expiry_time_valid != 0; r/o after init */
@@ -265,7 +266,7 @@ void nsync_mu_unlock_slow_(nsync_mu *mu, lock_type *l_type);
 struct Dll *nsync_remove_from_mu_queue_(struct Dll *mu_queue, struct Dll *e);
 void nsync_maybe_merge_conditions_(struct Dll *p, struct Dll *n);
 nsync_time nsync_note_notified_deadline_(nsync_note n);
-int nsync_sem_wait_with_cancel_(waiter *w, int clock, nsync_time abs_deadline,
+int nsync_sem_wait_with_cancel_(waiter *w, nsync_time abs_deadline,
                                 nsync_note cancel_note);
 
 COSMOPOLITAN_C_END_
diff --git a/third_party/nsync/compat.S b/third_party/nsync/compat.S
index 5d8b382cc..bcd4d8cd3 100644
--- a/third_party/nsync/compat.S
+++ b/third_party/nsync/compat.S
@@ -17,7 +17,11 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/timespec.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
+
+nsync_time_now:
+	jmp	timespec_real
+	.endfn	nsync_time_now,globl
 
 nsync_time_add:
 	jmp	timespec_add
diff --git a/third_party/nsync/counter.h b/third_party/nsync/counter.h
index 8b99335bc..227fa4333 100644
--- a/third_party/nsync/counter.h
+++ b/third_party/nsync/counter.h
@@ -33,7 +33,7 @@ uint32_t nsync_counter_value(nsync_counter c);
    a waiter may have been woken due to the counter reaching zero.
    If abs_deadline==nsync_time_no_deadline, the deadline
    is far in the future. */
-uint32_t nsync_counter_wait(nsync_counter c, int clock, nsync_time abs_deadline);
+uint32_t nsync_counter_wait(nsync_counter c, nsync_time abs_deadline);
 
 COSMOPOLITAN_C_END_
 #endif /* NSYNC_COUNTER_H_ */
diff --git a/third_party/nsync/cv.h b/third_party/nsync/cv.h
index a02b587b8..4209a7909 100644
--- a/third_party/nsync/cv.h
+++ b/third_party/nsync/cv.h
@@ -144,7 +144,7 @@ int nsync_cv_wait(nsync_cv *cv, nsync_mu *mu);
    mostly in tests and trivial examples than they are in real
    programmes. */
 int nsync_cv_wait_with_deadline(nsync_cv *cv, nsync_mu *mu,
-                                int clock, nsync_time abs_deadline,
+                                nsync_time abs_deadline,
                                 struct nsync_note_s_ *cancel_note);
 
 /* Like nsync_cv_wait_with_deadline(), but allow an arbitrary lock *v to be
@@ -152,7 +152,7 @@ int nsync_cv_wait_with_deadline(nsync_cv *cv, nsync_mu *mu,
 int nsync_cv_wait_with_deadline_generic(nsync_cv *cv, void *mu,
                                         void (*lock)(void *),
                                         void (*unlock)(void *),
-                                        int clock, nsync_time abs_deadline,
+                                        nsync_time abs_deadline,
                                         struct nsync_note_s_ *cancel_note);
 
 COSMOPOLITAN_C_END_
diff --git a/third_party/nsync/defs.h b/third_party/nsync/defs.h
deleted file mode 100644
index 73b5c0752..000000000
--- a/third_party/nsync/defs.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_NSYNC_DEFS_H_
-#define COSMOPOLITAN_THIRD_PARTY_NSYNC_DEFS_H_
-COSMOPOLITAN_C_START_
-
-#ifdef MODE_DBG
-#define NSYNC_DEBUG 1
-#else
-#define NSYNC_DEBUG 0
-#endif
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_NSYNC_DEFS_H_ */
diff --git a/libc/intrin/cosmo_futex.c b/third_party/nsync/futex.c
similarity index 58%
rename from libc/intrin/cosmo_futex.c
rename to third_party/nsync/futex.c
index 0c0531894..9a0f264a5 100644
--- a/libc/intrin/cosmo_futex.c
+++ b/third_party/nsync/futex.c
@@ -16,15 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/sysv/consts/futex.h"
 #include "libc/assert.h"
 #include "libc/atomic.h"
-#include "libc/calls/cp.internal.h"
+#include "libc/calls/calls.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/sig.internal.h"
+#include "libc/calls/state.internal.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/sigset.internal.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/calls/struct/timespec.internal.h"
+#include "libc/calls/syscall_support-nt.internal.h"
 #include "libc/cosmo.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
@@ -34,64 +37,71 @@
 #include "libc/intrin/ulock.h"
 #include "libc/intrin/weaken.h"
 #include "libc/limits.h"
+#include "libc/nexgen32e/vendor.internal.h"
 #include "libc/nt/runtime.h"
 #include "libc/nt/synchronization.h"
+#include "libc/runtime/clktck.h"
 #include "libc/sysv/consts/clock.h"
-#include "libc/sysv/consts/futex.h"
 #include "libc/sysv/consts/sicode.h"
+#include "libc/sysv/consts/timer.h"
 #include "libc/sysv/errfuns.h"
 #include "libc/thread/freebsd.internal.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
-// clang-format off
+#include "libc/thread/tls.h"
+#include "third_party/nsync/atomic.h"
+#include "third_party/nsync/common.internal.h"
+#include "third_party/nsync/futex.internal.h"
+#include "libc/intrin/kprintf.h"
+#include "third_party/nsync/time.h"
 
 #define FUTEX_WAIT_BITS_ FUTEX_BITSET_MATCH_ANY
 
-errno_t cosmo_futex_thunk (atomic_int *, int, int, const struct timespec *, int *, int);
-errno_t _futex_wake (atomic_int *, int, int) asm ("cosmo_futex_thunk");
+errno_t _futex (atomic_int *, int, int, const struct timespec *, int *, int);
+errno_t _futex_wake (atomic_int *, int, int) asm ("_futex");
 int sys_futex_cp (atomic_int *, int, int, const struct timespec *, int *, int);
 
-static struct CosmoFutex {
+static struct NsyncFutex {
 	atomic_uint once;
 	int FUTEX_WAIT_;
 	int FUTEX_PRIVATE_FLAG_;
-	int FUTEX_CLOCK_REALTIME_;
 	bool is_supported;
 	bool timeout_is_relative;
-} g_cosmo_futex;
+} nsync_futex_;
 
-static void cosmo_futex_init (void) {
+static void nsync_futex_init_ (void) {
 	int e;
 	atomic_int x;
 
-	g_cosmo_futex.FUTEX_WAIT_ = FUTEX_WAIT;
+	nsync_futex_.FUTEX_WAIT_ = FUTEX_WAIT;
 
 	if (IsWindows ()) {
-		g_cosmo_futex.is_supported = true;
+		nsync_futex_.is_supported = true;
 		return;
 	}
 
 	if (IsXnu ()) {
-		g_cosmo_futex.is_supported = true;
-		g_cosmo_futex.timeout_is_relative = true;
+		nsync_futex_.is_supported = true;
+		nsync_futex_.timeout_is_relative = true;
 		return;
 	}
 
 	if (IsFreebsd ()) {
-		g_cosmo_futex.is_supported = true;
-		g_cosmo_futex.FUTEX_PRIVATE_FLAG_ = FUTEX_PRIVATE_FLAG;
+		nsync_futex_.is_supported = true;
+		nsync_futex_.FUTEX_PRIVATE_FLAG_ = FUTEX_PRIVATE_FLAG;
 		return;
 	}
 
-	if (!(g_cosmo_futex.is_supported = IsLinux () || IsOpenbsd ()))
+        if (!(nsync_futex_.is_supported = IsLinux () || IsOpenbsd ())) {
 		return;
+	}
 
 	// In our testing, we found that the monotonic clock on various
 	// popular systems (such as Linux, and some BSD variants) was no
 	// better behaved than the realtime clock, and routinely took
 	// large steps backwards, especially on multiprocessors. Given
 	// that "monotonic" doesn't seem to mean what it says,
-	// implementers of cosmo_time might consider retaining the
+	// implementers of nsync_time might consider retaining the
 	// simplicity of a single epoch within an address space, by
 	// configuring any time synchronization mechanism (like ntp) to
 	// adjust for leap seconds by adjusting the rate, rather than
@@ -99,52 +109,49 @@ static void cosmo_futex_init (void) {
 	e = errno;
 	atomic_store_explicit (&x, 0, memory_order_relaxed);
 	if (IsLinux () &&
-	    cosmo_futex_thunk (&x, FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME,
-			       1, 0, 0, FUTEX_BITSET_MATCH_ANY) == -EAGAIN) {
-		g_cosmo_futex.FUTEX_WAIT_ = FUTEX_WAIT_BITSET;
-		g_cosmo_futex.FUTEX_PRIVATE_FLAG_ = FUTEX_PRIVATE_FLAG;
-		g_cosmo_futex.FUTEX_CLOCK_REALTIME_ = FUTEX_CLOCK_REALTIME;
+	    _futex (&x, FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME,
+		    1, 0, 0, FUTEX_BITSET_MATCH_ANY) == -EAGAIN) {
+		nsync_futex_.FUTEX_WAIT_ =
+			FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME;
+		nsync_futex_.FUTEX_PRIVATE_FLAG_ = FUTEX_PRIVATE_FLAG;
+	} else if (!IsTiny () && IsLinux () &&
+		   _futex (&x, FUTEX_WAIT_BITSET, 1, 0, 0,
+			   FUTEX_BITSET_MATCH_ANY) == -EAGAIN) {
+		nsync_futex_.FUTEX_WAIT_ = FUTEX_WAIT_BITSET;
+		nsync_futex_.FUTEX_PRIVATE_FLAG_ = FUTEX_PRIVATE_FLAG;
 	} else if (IsOpenbsd () ||
-		   (IsLinux () &&
+		   (!IsTiny () && IsLinux () &&
 		    !_futex_wake (&x, FUTEX_WAKE_PRIVATE, 1))) {
-		g_cosmo_futex.FUTEX_WAIT_ = FUTEX_WAIT;
-		g_cosmo_futex.FUTEX_PRIVATE_FLAG_ = FUTEX_PRIVATE_FLAG;
-		g_cosmo_futex.timeout_is_relative = true;
+		nsync_futex_.FUTEX_WAIT_ = FUTEX_WAIT;
+		nsync_futex_.FUTEX_PRIVATE_FLAG_ = FUTEX_PRIVATE_FLAG;
+		nsync_futex_.timeout_is_relative = true;
 	} else {
-		g_cosmo_futex.FUTEX_WAIT_ = FUTEX_WAIT;
-		g_cosmo_futex.timeout_is_relative = true;
+		nsync_futex_.FUTEX_WAIT_ = FUTEX_WAIT;
+		nsync_futex_.timeout_is_relative = true;
 	}
 	errno = e;
 }
 
-static uint32_t cosmo_time_64to32u (uint64_t duration) {
-	if (duration <= -1u)
-		return duration;
-	return -1u;
-}
-
-static int cosmo_futex_polyfill (atomic_int *w, int expect, int clock,
-				 struct timespec *abstime) {
+static int nsync_futex_polyfill_ (atomic_int *w, int expect, struct timespec *abstime) {
 	for (;;) {
-		if (atomic_load_explicit (w, memory_order_acquire) != expect)
+		if (atomic_load_explicit (w, memory_order_acquire) != expect) {
 			return 0;
+		}
 		if (_weaken (pthread_testcancel_np) &&
-		    _weaken (pthread_testcancel_np) ())
+		    _weaken (pthread_testcancel_np) ()) {
 			return -ECANCELED;
-		struct timespec now;
-		if (clock_gettime (clock, &now))
-			return -EINVAL;
-		if (abstime && timespec_cmp (now, *abstime) >= 0)
+		}
+		if (abstime && timespec_cmp (timespec_real (), *abstime) >= 0) {
 			return -ETIMEDOUT;
+		}
 		pthread_yield_np ();
 	}
 }
 
-static int cosmo_futex_wait_win32 (atomic_int *w, int expect, char pshare,
-				   int clock, const struct timespec *timeout,
-				   struct PosixThread *pt,
-				   sigset_t waitmask) {
-#ifdef __x86_64__
+static int nsync_futex_wait_win32_ (atomic_int *w, int expect, char pshare,
+				    const struct timespec *timeout,
+				    struct PosixThread *pt,
+				    sigset_t waitmask) {
 	int sig;
 	bool32 ok;
 	struct timespec deadline, wait, now;
@@ -156,102 +163,75 @@ static int cosmo_futex_wait_win32 (atomic_int *w, int expect, char pshare,
 	}
 
 	for (;;) {
-		if (clock_gettime (clock, &now))
-			return einval ();
-		if (timespec_cmp (now, deadline) >= 0)
-			return etimedout ();
+		now = timespec_real ();
+		if (timespec_cmp (now, deadline) >= 0) {
+			return etimedout();
+		}
 		wait = timespec_sub (deadline, now);
-		if (atomic_load_explicit (w, memory_order_acquire) != expect)
+		if (atomic_load_explicit (w, memory_order_acquire) != expect) {
 			return 0;
+		}
 		if (pt) {
-			if (_check_cancel () == -1)
+			if (_check_cancel () == -1) {
 				return -1; /* ECANCELED */
+			}
 			if ((sig = __sig_get (waitmask))) {
 				__sig_relay (sig, SI_KERNEL, waitmask);
-				if (_check_cancel () == -1)
+				if (_check_cancel () == -1) {
 					return -1; /* ECANCELED */
+				}
 				return eintr ();
 			}
 			pt->pt_blkmask = waitmask;
 			atomic_store_explicit (&pt->pt_blocker, w, memory_order_release);
 		}
-		ok = WaitOnAddress (w, &expect, sizeof(int), cosmo_time_64to32u (timespec_tomillis (wait)));
+		ok = WaitOnAddress (w, &expect, sizeof(int), timespec_tomillis (wait));
 		if (pt) {
-			/* __sig_wake wakes our futex without changing `w` after enqueing signals */
+			/* __sig_cancel wakes our futex without changing `w` after enqueing signals */
 			atomic_store_explicit (&pt->pt_blocker, 0, memory_order_release);
 			if (ok && atomic_load_explicit (w, memory_order_acquire) == expect && (sig = __sig_get (waitmask))) {
 				__sig_relay (sig, SI_KERNEL, waitmask);
-				if (_check_cancel () == -1)
+				if (_check_cancel () == -1) {
 					return -1; /* ECANCELED */
+				}
 				return eintr ();
 			}
 		}
 		if (ok) {
 			return 0;
 		} else {
-			unassert (GetLastError () == ETIMEDOUT);
+			ASSERT (GetLastError () == ETIMEDOUT);
 		}
 	}
-#else
-	return 0;
-#endif /* __x86_64__ */
 }
 
-static int cosmo_futex_fix_timeout (struct timespec *memory, int clock,
-				    const struct timespec *abstime,
-				    struct timespec **result) {
+static struct timespec *nsync_futex_timeout_ (struct timespec *memory,
+					      const struct timespec *abstime) {
 	struct timespec now;
 	if (!abstime) {
-		*result = 0;
 		return 0;
-	} else if (!g_cosmo_futex.timeout_is_relative) {
+	} else if (!nsync_futex_.timeout_is_relative) {
 		*memory = *abstime;
-		*result = memory;
-		return 0;
+		return memory;
 	} else {
-		if (clock_gettime (clock, &now))
-			return -EINVAL;
+		now = timespec_real ();
 		*memory = timespec_subz (*abstime, now);
-		*result = memory;
-		return 0;
+		return memory;
 	}
 }
 
-/**
- * Waits on futex.
- *
- * This function may be used to ask the OS to park the calling thread
- * until cosmo_futex_wake() is called on the memory address `w`.
- *
- * @param w is your futex
- * @param expect is the value `*w` is expected to have on entry
- * @param pshare is `PTHREAD_PROCESS_PRIVATE` / `PTHREAD_PROCESS_SHARED`
- * @param clock is `CLOCK_MONOTONIC`, `CLOCK_REALTIME`, etc.
- * @param abstime is null to wait forever or absolute timestamp to stop
- * @return 0 on success, or -errno on error
- * @raise EINVAL on bad parameter
- * @raise EAGAIN if `*w` wasn't `expect`
- * @raise EINTR if a signal handler was called while waiting
- * @raise ECANCELED if calling thread was canceled while waiting
- * @cancelationpoint
- */
-int cosmo_futex_wait (atomic_int *w, int expect, char pshare,
-		      int clock, const struct timespec *abstime) {
+int nsync_futex_wait_ (atomic_int *w, int expect, char pshare, const struct timespec *abstime) {
 	int e, rc, op;
 	struct CosmoTib *tib;
 	struct PosixThread *pt;
-	struct timespec tsmem;
-	struct timespec *timeout = 0;
-	BEGIN_CANCELATION_POINT;
+	struct timespec tsmem, *timeout;
 
-	cosmo_once (&g_cosmo_futex.once, cosmo_futex_init);
+	cosmo_once (&nsync_futex_.once, nsync_futex_init_);
 
-	op = g_cosmo_futex.FUTEX_WAIT_;
-	if (pshare == PTHREAD_PROCESS_PRIVATE)
-		op |= g_cosmo_futex.FUTEX_PRIVATE_FLAG_;
-	if (clock == CLOCK_REALTIME ||
-	    clock == CLOCK_REALTIME_COARSE)
-		op |= g_cosmo_futex.FUTEX_CLOCK_REALTIME_;
+	op = nsync_futex_.FUTEX_WAIT_;
+	if (pshare == PTHREAD_PROCESS_PRIVATE) {
+		op |= nsync_futex_.FUTEX_PRIVATE_FLAG_;
+	}
 
 	if (abstime && timespec_cmp (*abstime, timespec_zero) <= 0) {
 		rc = -ETIMEDOUT;
@@ -263,8 +243,7 @@ int cosmo_futex_wait (atomic_int *w, int expect, char pshare,
 		goto Finished;
 	}
 
-	if ((rc = cosmo_futex_fix_timeout (&tsmem, clock, abstime, &timeout)))
-		goto Finished;
+	timeout = nsync_futex_timeout_ (&tsmem, abstime);
 
 	LOCKTRACE ("futex(%t [%d], %s, %#x, %s) → ...",
 		   w, atomic_load_explicit (w, memory_order_relaxed),
@@ -274,29 +253,15 @@ int cosmo_futex_wait (atomic_int *w, int expect, char pshare,
 	tib = __get_tls();
 	pt = (struct PosixThread *)tib->tib_pthread;
 
-	if (g_cosmo_futex.is_supported) {
+	if (nsync_futex_.is_supported) {
 		e = errno;
 		if (IsWindows ()) {
 			// Windows 8 futexes don't support multiple processes :(
 			if (pshare) goto Polyfill;
 			sigset_t m = __sig_block ();
-			rc = cosmo_futex_wait_win32 (w, expect, pshare, clock, timeout, pt, m);
+			rc = nsync_futex_wait_win32_ (w, expect, pshare, timeout, pt, m);
 			__sig_unblock (m);
 		} else if (IsXnu ()) {
-
-			/* XNU ulock (used by cosmo futexes) is an internal API, however:
-
-			     1. Unlike GCD it's cancelable i.e. can be EINTR'd by signals
-			     2. We have no choice but to use ulock for joining threads
-			     3. Grand Central Dispatch requires a busy loop workaround
-			     4. ulock makes our mutexes use 20% more system time (meh)
-			     5. ulock makes our mutexes use 40% less wall time (good)
-			     6. ulock makes our mutexes use 64% less user time (woop)
-			     7. GCD uses Mach timestamps D: ulock just uses rel. time
-
-			   ulock is an outstanding system call that must be used.
-			   gcd is not an acceptable alternative to ulock. */
-
 			uint32_t op, us;
 			if (pshare) {
 				op = UL_COMPARE_AND_WAIT_SHARED;
@@ -304,14 +269,14 @@ int cosmo_futex_wait (atomic_int *w, int expect, char pshare,
 				op = UL_COMPARE_AND_WAIT;
 			}
 			if (timeout) {
-				us = cosmo_time_64to32u (timespec_tomicros (*timeout));
+				us = timespec_tomicros (*timeout);
 			} else {
 				us = -1u;
 			}
 			rc = ulock_wait (op, w, expect, us);
 			if (rc > 0) rc = 0; // don't care about #waiters
 		} else if (IsFreebsd ()) {
-			rc = sys_umtx_timedwait_uint (w, expect, pshare, clock, timeout);
+			rc = sys_umtx_timedwait_uint (w, expect, pshare, timeout);
 		} else {
 			if (IsOpenbsd()) {
 				// OpenBSD 6.8 futex() returns errors as
@@ -344,7 +309,7 @@ int cosmo_futex_wait (atomic_int *w, int expect, char pshare,
 		}
 	} else {
 	Polyfill:
-		rc = cosmo_futex_polyfill (w, expect, clock, timeout);
+		rc = nsync_futex_polyfill_ (w, expect, timeout);
 	}
 
 Finished:
@@ -354,28 +319,22 @@ Finished:
 		DescribeTimespec (0, abstime),
 		DescribeErrno (rc));
 
-	END_CANCELATION_POINT;
 	return rc;
 }
 
-/**
- * Wakes futex.
- *
- * @param w is your futex
- * @param count is number of threads to wake (usually 1 or `INT_MAX`)
- * @param pshare is `PTHREAD_PROCESS_PRIVATE` / `PTHREAD_PROCESS_SHARED`
- * @return number of threads woken on success, or -errno on error
- */
-int cosmo_futex_wake (atomic_int *w, int count, char pshare) {
+int nsync_futex_wake_ (atomic_int *w, int count, char pshare) {
 	int rc, op, fop;
 
-	cosmo_once (&g_cosmo_futex.once, cosmo_futex_init);
+	ASSERT (count == 1 || count == INT_MAX);
+
+	cosmo_once (&nsync_futex_.once, nsync_futex_init_);
 
 	op = FUTEX_WAKE;
-	if (pshare == PTHREAD_PROCESS_PRIVATE)
-		op |= g_cosmo_futex.FUTEX_PRIVATE_FLAG_;
+	if (pshare == PTHREAD_PROCESS_PRIVATE) {
+		op |= nsync_futex_.FUTEX_PRIVATE_FLAG_;
+	}
 
-	if (g_cosmo_futex.is_supported) {
+	if (nsync_futex_.is_supported) {
 		if (IsWindows ()) {
 			if (pshare) {
 				goto Polyfill;
@@ -397,7 +356,7 @@ int cosmo_futex_wake (atomic_int *w, int count, char pshare) {
 				op |= ULF_WAKE_ALL;
 			}
 			rc = ulock_wake (op, w, 0);
-			unassert (!rc || rc == -ENOENT);
+			ASSERT (!rc || rc == -ENOENT);
 			if (!rc) {
 				rc = 1;
 			} else if (rc == -ENOENT) {
diff --git a/third_party/nsync/futex.internal.h b/third_party/nsync/futex.internal.h
new file mode 100644
index 000000000..c572a1595
--- /dev/null
+++ b/third_party/nsync/futex.internal.h
@@ -0,0 +1,17 @@
+#ifndef NSYNC_FUTEX_INTERNAL_H_
+#define NSYNC_FUTEX_INTERNAL_H_
+#include "libc/calls/struct/timespec.h"
+#include "libc/dce.h"
+COSMOPOLITAN_C_START_
+
+#ifndef __cplusplus
+#define _FUTEX_ATOMIC(x) _Atomic(x)
+#else
+#define _FUTEX_ATOMIC(x) x
+#endif
+
+int nsync_futex_wake_(_FUTEX_ATOMIC(int) *, int, char);
+int nsync_futex_wait_(_FUTEX_ATOMIC(int) *, int, char, const struct timespec *);
+
+COSMOPOLITAN_C_END_
+#endif /* NSYNC_FUTEX_INTERNAL_H_ */
diff --git a/third_party/nsync/mem/BUILD.mk b/third_party/nsync/mem/BUILD.mk
index a947a2e18..aa5c3c1e3 100644
--- a/third_party/nsync/mem/BUILD.mk
+++ b/third_party/nsync/mem/BUILD.mk
@@ -49,13 +49,6 @@ $(THIRD_PARTY_NSYNC_MEM_A_OBJS): private		\
 			-Wframe-larger-than=4096	\
 			-Walloca-larger-than=4096
 
-# avoid the legacy sse decoding penalty on avx systems
-ifeq ($(MODE),)
-$(THIRD_PARTY_NSYNC_MEM_A_OBJS): private		\
-		COPTS +=				\
-			-mgeneral-regs-only
-endif
-
 THIRD_PARTY_NSYNC_MEM_LIBS = $(foreach x,$(THIRD_PARTY_NSYNC_MEM_ARTIFACTS),$($(x)))
 THIRD_PARTY_NSYNC_MEM_SRCS = $(foreach x,$(THIRD_PARTY_NSYNC_MEM_ARTIFACTS),$($(x)_SRCS))
 THIRD_PARTY_NSYNC_MEM_CHECKS = $(foreach x,$(THIRD_PARTY_NSYNC_MEM_ARTIFACTS),$($(x)_CHECKS))
diff --git a/third_party/nsync/mem/nsync_counter.c b/third_party/nsync/mem/nsync_counter.c
index 0eccec105..c508797fc 100644
--- a/third_party/nsync/mem/nsync_counter.c
+++ b/third_party/nsync/mem/nsync_counter.c
@@ -19,7 +19,6 @@
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "third_party/nsync/atomic.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/atomic.internal.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/counter.h"
@@ -95,13 +94,13 @@ uint32_t nsync_counter_value (nsync_counter c) {
 	return (result);
 }
 
-uint32_t nsync_counter_wait (nsync_counter c, int clock, nsync_time abs_deadline) {
+uint32_t nsync_counter_wait (nsync_counter c, nsync_time abs_deadline) {
 	struct nsync_waitable_s waitable;
 	struct nsync_waitable_s *pwaitable = &waitable;
 	uint32_t result = 0;
 	waitable.v = c;
 	waitable.funcs = &nsync_counter_waitable_funcs;
-	if (nsync_wait_n (NULL, NULL, NULL, clock, abs_deadline, 1, &pwaitable) != 0) {
+	if (nsync_wait_n (NULL, NULL, NULL, abs_deadline, 1, &pwaitable) != 0) {
 		IGNORE_RACES_START ();
 		result = ATM_LOAD_ACQ (&c->value);
 		IGNORE_RACES_END ();
diff --git a/third_party/nsync/mem/nsync_cv.c b/third_party/nsync/mem/nsync_cv.c
index c871c581d..8e363f77c 100644
--- a/third_party/nsync/mem/nsync_cv.c
+++ b/third_party/nsync/mem/nsync_cv.c
@@ -175,7 +175,6 @@ struct nsync_cv_wait_with_deadline_s {
 	void *pmu;
 	void (*lock) (void *);
 	nsync_mu *cv_mu;
-	int clock;
 	nsync_time abs_deadline;
 	nsync_note cancel_note;
 	waiter *w;
@@ -188,7 +187,7 @@ static int nsync_cv_wait_with_deadline_impl_ (struct nsync_cv_wait_with_deadline
 	IGNORE_RACES_START ();
 	while (ATM_LOAD_ACQ (&c->w->nw.waiting) != 0) { /* acquire load */
 		if (c->sem_outcome == 0) {
-			c->sem_outcome = nsync_sem_wait_with_cancel_ (c->w, c->clock, c->abs_deadline, c->cancel_note);
+			c->sem_outcome = nsync_sem_wait_with_cancel_ (c->w, c->abs_deadline, c->cancel_note);
 		}
 		if (c->sem_outcome != 0 && ATM_LOAD (&c->w->nw.waiting) != 0) {
 			/* A timeout or cancellation occurred, and no wakeup.
@@ -234,9 +233,7 @@ static int nsync_cv_wait_with_deadline_impl_ (struct nsync_cv_wait_with_deadline
 		/* Requeue on *pmu using existing waiter struct; current thread
 		   is the designated waker.  */
 		nsync_mu_lock_slow_ (c->cv_mu, c->w, MU_DESIG_WAKER, c->w->l_type);
-		nsync_waiter_free_ (c->w);
 	} else {
-		nsync_waiter_free_ (c->w);
 		/* Traditional case: We've woken from the cv, and need to reacquire *pmu. */
 		if (c->is_reader_mu) {
 			nsync_mu_rlock (c->cv_mu);
@@ -244,6 +241,7 @@ static int nsync_cv_wait_with_deadline_impl_ (struct nsync_cv_wait_with_deadline
 			(*c->lock) (c->pmu);
 		}
 	}
+	nsync_waiter_free_ (c->w);
 	IGNORE_RACES_END ();
 	return (outcome);
 }
@@ -279,16 +277,13 @@ static void nsync_cv_wait_with_deadline_unwind_ (void *arg) {
    programmes. */
 int nsync_cv_wait_with_deadline_generic (nsync_cv *pcv, void *pmu,
 					 void (*lock) (void *), void (*unlock) (void *),
-					 int clock, nsync_time abs_deadline,
+					 nsync_time abs_deadline,
 					 nsync_note cancel_note) {
 	int outcome;
 	struct nsync_cv_wait_with_deadline_s c;
 	IGNORE_RACES_START ();
 
 	c.w = nsync_waiter_new_ ();
-	c.w->wipe_cv = pcv;
-	c.w->wipe_mu = pmu;
-	c.clock = clock;
 	c.abs_deadline = abs_deadline;
 	c.cancel_note = cancel_note;
 	c.cv_mu = NULL;
@@ -474,10 +469,10 @@ void nsync_cv_broadcast (nsync_cv *pcv) {
 
 /* Wait with deadline, using an nsync_mu. */
 errno_t nsync_cv_wait_with_deadline (nsync_cv *pcv, nsync_mu *pmu,
-				     int clock, nsync_time abs_deadline,
+				     nsync_time abs_deadline,
 				     nsync_note cancel_note) {
 	return (nsync_cv_wait_with_deadline_generic (pcv, pmu, &void_mu_lock,
-						     &void_mu_unlock, clock,
+						     &void_mu_unlock,
 						     abs_deadline, cancel_note));
 }
 
@@ -490,7 +485,7 @@ errno_t nsync_cv_wait_with_deadline (nsync_cv *pcv, nsync_mu *pmu,
    ECANCELED may be returned if calling POSIX thread is cancelled only when
    the PTHREAD_CANCEL_MASKED mode is in play. */
 errno_t nsync_cv_wait (nsync_cv *pcv, nsync_mu *pmu) {
-	return nsync_cv_wait_with_deadline (pcv, pmu, 0, nsync_time_no_deadline, NULL);
+	return nsync_cv_wait_with_deadline (pcv, pmu, nsync_time_no_deadline, NULL);
 }
 
 static nsync_time cv_ready_time (void *v, struct nsync_waiter_s *nw) {
diff --git a/third_party/nsync/mem/nsync_debug.c b/third_party/nsync/mem/nsync_debug.c
index 8c7d7e124..a3d847286 100644
--- a/third_party/nsync/mem/nsync_debug.c
+++ b/third_party/nsync/mem/nsync_debug.c
@@ -20,7 +20,6 @@
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
 #include "third_party/nsync/races.internal.h"
-#include "third_party/nsync/defs.h"
 #include "third_party/nsync/wait_s.internal.h"
 __static_yoink("nsync_notice");
 
@@ -149,23 +148,15 @@ static void emit_waiters (struct emit_buf *b, struct Dll *list) {
                 waiter *w = DLL_WAITER (p);
                 next = NULL;
                 emit_print (b, "   %i", (uintptr_t) w);
-#if NSYNC_DEBUG
                 if (w->tag != WAITER_TAG) {
                         emit_print (b, "bad WAITER_TAG %i",
                                     (uintptr_t) w->tag);
                 } else {
-#else
-		{
-#endif
                         next = dll_next (list, p);
-#if NSYNC_DEBUG
                         if (nw->tag != NSYNC_WAITER_TAG) {
                                 emit_print (b, " bad WAITER_TAG %i",
                                             (uintptr_t) nw->tag);
                         } else {
-#else
-			{
-#endif
                                 emit_print (b, " embedded=%i waiting=%i",
                                             (uintptr_t) (w->flags & NSYNC_WAITER_FLAG_MUCV),
                                             (uintptr_t) ATM_LOAD (&nw->waiting));
diff --git a/third_party/nsync/mem/nsync_mu_wait.c b/third_party/nsync/mem/nsync_mu_wait.c
index 785823c5c..e46492aa9 100644
--- a/third_party/nsync/mem/nsync_mu_wait.c
+++ b/third_party/nsync/mem/nsync_mu_wait.c
@@ -141,7 +141,7 @@ int nsync_mu_wait_with_deadline (nsync_mu *mu,
 				 int (*condition) (const void *condition_arg),
 				 const void *condition_arg,
 				 int (*condition_arg_eq) (const void *a, const void *b),
-				 int clock, nsync_time abs_deadline, nsync_note cancel_note) {
+				 nsync_time abs_deadline, nsync_note cancel_note) {
 	lock_type *l_type;
 	int first_wait;
 	int condition_is_true;
@@ -231,7 +231,7 @@ int nsync_mu_wait_with_deadline (nsync_mu *mu,
 		have_lock = 0;
 		while (ATM_LOAD_ACQ (&w->nw.waiting) != 0) { /* acquire load */
 			if (sem_outcome == 0) {
-				sem_outcome = nsync_sem_wait_with_cancel_ (w, clock, abs_deadline,
+				sem_outcome = nsync_sem_wait_with_cancel_ (w, abs_deadline,
 									   cancel_note);
 				if (sem_outcome != 0 && ATM_LOAD (&w->nw.waiting) != 0) {
 					/* A timeout or cancellation occurred, and no wakeup.
@@ -280,7 +280,7 @@ void nsync_mu_wait (nsync_mu *mu, int (*condition) (const void *condition_arg),
                     const void *condition_arg,
 		    int (*condition_arg_eq) (const void *a, const void *b)) {
 	if (nsync_mu_wait_with_deadline (mu, condition, condition_arg, condition_arg_eq,
-					 0, nsync_time_no_deadline, NULL) != 0) {
+					 nsync_time_no_deadline, NULL) != 0) {
 		nsync_panic_ ("nsync_mu_wait woke but condition not true\n");
 	}
 }
diff --git a/third_party/nsync/mem/nsync_note.c b/third_party/nsync/mem/nsync_note.c
index 6b68b164a..bdf8e9ad0 100644
--- a/third_party/nsync/mem/nsync_note.c
+++ b/third_party/nsync/mem/nsync_note.c
@@ -19,7 +19,6 @@
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "third_party/nsync/atomic.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
 #include "third_party/nsync/mu_wait.h"
@@ -152,7 +151,7 @@ nsync_time nsync_note_notified_deadline_ (nsync_note n) {
 		ntime = NOTIFIED_TIME (n);
 		nsync_mu_unlock (&n->note_mu);
 		if (nsync_time_cmp (ntime, nsync_time_zero) > 0) {
-			if (nsync_time_cmp (ntime, nsync_time_now (n->clock)) <= 0) {
+			if (nsync_time_cmp (ntime, nsync_time_now ()) <= 0) {
 				notify (n);
 				ntime = nsync_time_zero;
 			}
@@ -169,12 +168,11 @@ int nsync_note_is_notified (nsync_note n) {
 	return (result);
 }
 
-nsync_note nsync_note_new (nsync_note parent, int clock,
+nsync_note nsync_note_new (nsync_note parent,
 			   nsync_time abs_deadline) {
 	nsync_note n = (nsync_note) malloc (sizeof (*n));
 	if (n != NULL) {
 		bzero (n, sizeof (*n));
-		n->clock = clock;
 		dll_init (&n->parent_child_link);
 		set_expiry_time (n, abs_deadline);
 		if (!nsync_note_is_notified (n) && parent != NULL) {
@@ -249,7 +247,7 @@ int nsync_note_wait (nsync_note n, nsync_time abs_deadline) {
 	struct nsync_waitable_s *pwaitable = &waitable;
 	waitable.v = n;
 	waitable.funcs = &nsync_note_waitable_funcs;
-	return (nsync_wait_n (NULL, NULL, NULL, n->clock, abs_deadline, 1, &pwaitable) == 0);
+	return (nsync_wait_n (NULL, NULL, NULL, abs_deadline, 1, &pwaitable) == 0);
 }
 
 nsync_time nsync_note_expiry (nsync_note n) {
diff --git a/third_party/nsync/mem/nsync_once.c b/third_party/nsync/mem/nsync_once.c
index 5f355dc7a..873766b99 100644
--- a/third_party/nsync/mem/nsync_once.c
+++ b/third_party/nsync/mem/nsync_once.c
@@ -17,7 +17,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/nsync/atomic.h"
 #include "third_party/nsync/atomic.internal.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
 #include "third_party/nsync/once.h"
@@ -91,8 +90,8 @@ static void nsync_run_once_impl (nsync_once *once, struct once_sync_s *s,
 				if (attempts < 50) {
 					attempts += 10;
 				}
-				deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (attempts));
-				nsync_cv_wait_with_deadline (&s->once_cv, &s->once_mu, NSYNC_CLOCK, deadline, NULL);
+				deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (attempts));
+				nsync_cv_wait_with_deadline (&s->once_cv, &s->once_mu, deadline, NULL);
 			} else {
 				attempts = pthread_delay_np (once, attempts);
 			}
diff --git a/third_party/nsync/mem/nsync_sem_wait.c b/third_party/nsync/mem/nsync_sem_wait.c
index 059fd456a..62507d686 100644
--- a/third_party/nsync/mem/nsync_sem_wait.c
+++ b/third_party/nsync/mem/nsync_sem_wait.c
@@ -29,20 +29,18 @@ __static_yoink("nsync_notice");
      w->sem is non-zero----decrement it and return 0.
      abs_deadline expires---return ETIMEDOUT.
      cancel_note is non-NULL and *cancel_note becomes notified---return ECANCELED. */
-int nsync_sem_wait_with_cancel_ (waiter *w, int clock, nsync_time abs_deadline,
+int nsync_sem_wait_with_cancel_ (waiter *w, nsync_time abs_deadline,
 			         nsync_note cancel_note) {
 	int sem_outcome;
 	if (cancel_note == NULL) {
-		sem_outcome = nsync_mu_semaphore_p_with_deadline (&w->sem, clock, abs_deadline);
+		sem_outcome = nsync_mu_semaphore_p_with_deadline (&w->sem, abs_deadline);
 	} else {
 		nsync_time cancel_time;
 		cancel_time = nsync_note_notified_deadline_ (cancel_note);
 		sem_outcome = ECANCELED;
 		if (nsync_time_cmp (cancel_time, nsync_time_zero) > 0) {
 			struct nsync_waiter_s nw;
-#if NSYNC_DEBUG
 			nw.tag = NSYNC_WAITER_TAG;
-#endif
 			nw.sem = &w->sem;
 			dll_init (&nw.q);
 			ATM_STORE (&nw.waiting, 1);
@@ -60,7 +58,7 @@ int nsync_sem_wait_with_cancel_ (waiter *w, int clock, nsync_time abs_deadline,
 				}
 				nsync_mu_unlock (&cancel_note->note_mu);
 				sem_outcome = nsync_mu_semaphore_p_with_deadline (&w->sem,
-					clock, local_abs_deadline);
+					local_abs_deadline);
 				if (sem_outcome == ETIMEDOUT && !deadline_is_nearer) {
 					sem_outcome = ECANCELED;
 					nsync_note_notify (cancel_note);
diff --git a/third_party/nsync/mem/nsync_wait.c b/third_party/nsync/mem/nsync_wait.c
index 1bf5bdeb2..9d8e95b7d 100644
--- a/third_party/nsync/mem/nsync_wait.c
+++ b/third_party/nsync/mem/nsync_wait.c
@@ -28,7 +28,7 @@
 __static_yoink("nsync_notice");
 
 int nsync_wait_n (void *mu, void (*lock) (void *), void (*unlock) (void *),
-		  int clock, nsync_time abs_deadline,
+		  nsync_time abs_deadline,
 		  int count, struct nsync_waitable_s *waitable[]) {
 	int ready;
 	IGNORE_RACES_START ();
@@ -51,9 +51,7 @@ int nsync_wait_n (void *mu, void (*lock) (void *), void (*unlock) (void *),
 			nw = (struct nsync_waiter_s *) malloc (count * sizeof (nw[0]));
 		}
 		for (i = 0; i != count && enqueued; i++) {
-#if NSYNC_DEBUG
 			nw[i].tag = NSYNC_WAITER_TAG;
-#endif
 			nw[i].sem = &w->sem;
 			dll_init (&nw[i].q);
 			ATM_STORE (&nw[i].waiting, 0);
@@ -79,7 +77,7 @@ int nsync_wait_n (void *mu, void (*lock) (void *), void (*unlock) (void *),
 				}
 			} while (nsync_time_cmp (min_ntime, nsync_time_zero) > 0 &&
 				 nsync_mu_semaphore_p_with_deadline (&w->sem,
-					clock, min_ntime) == 0);
+					min_ntime) == 0);
 		}
 
 		/* An attempt was made above to enqueue waitable[0..i-1].
diff --git a/third_party/nsync/mu.c b/third_party/nsync/mu.c
index 6da4d14a8..20eac1e68 100644
--- a/third_party/nsync/mu.c
+++ b/third_party/nsync/mu.c
@@ -23,7 +23,6 @@
 #include "third_party/nsync/mu_semaphore.h"
 #include "third_party/nsync/races.internal.h"
 #include "libc/thread/thread.h"
-#include "libc/intrin/strace.h"
 #include "third_party/nsync/wait_s.internal.h"
 __static_yoink("nsync_notice");
 
@@ -34,11 +33,9 @@ void nsync_mu_init (nsync_mu *mu) {
 
 /* Release the mutex spinlock. */
 static void mu_release_spinlock (nsync_mu *mu) {
-	uint32_t old_word = atomic_load_explicit (&mu->word,
-						  memory_order_relaxed);
-	while (!atomic_compare_exchange_weak_explicit (
-		       &mu->word, &old_word, old_word & ~MU_SPINLOCK,
-		       memory_order_release, memory_order_relaxed)) {
+	uint32_t old_word = ATM_LOAD (&mu->word);
+	while (!ATM_CAS_REL (&mu->word, old_word, old_word & ~MU_SPINLOCK)) {
+		old_word = ATM_LOAD (&mu->word);
 	}
 }
 
@@ -57,7 +54,6 @@ void nsync_mu_lock_slow_ (nsync_mu *mu, waiter *w, uint32_t clear, lock_type *l_
 	w->cond.f = NULL; /* Not using a conditional critical section. */
 	w->cond.v = NULL;
 	w->cond.eq = NULL;
-	w->wipe_mu = mu;
 	w->l_type = l_type;
 	zero_to_acquire = l_type->zero_to_acquire;
 	if (clear != 0) {
@@ -71,17 +67,15 @@ void nsync_mu_lock_slow_ (nsync_mu *mu, waiter *w, uint32_t clear, lock_type *l_
 		if ((old_word & zero_to_acquire) == 0) {
 			/* lock can be acquired; try to acquire, possibly
 			   clearing MU_DESIG_WAKER and MU_LONG_WAIT.  */
-			if (atomic_compare_exchange_weak_explicit (&mu->word, &old_word,
-								   (old_word+l_type->add_to_acquire) &
-								   ~(clear|long_wait|l_type->clear_on_acquire),
-								   memory_order_acquire, memory_order_relaxed)) {
+			if (ATM_CAS_ACQ (&mu->word, old_word,
+					 (old_word+l_type->add_to_acquire) &
+					  ~(clear|long_wait|l_type->clear_on_acquire))) {
 				break;
 			}
 		} else if ((old_word&MU_SPINLOCK) == 0 &&
-			   atomic_compare_exchange_weak_explicit (&mu->word, &old_word,
-								  (old_word|MU_SPINLOCK|long_wait|
-								   l_type->set_when_waiting) & ~(clear | MU_ALL_FALSE),
-								  memory_order_acquire, memory_order_relaxed)) {
+			   ATM_CAS_ACQ (&mu->word, old_word,
+					(old_word|MU_SPINLOCK|long_wait|
+					 l_type->set_when_waiting) & ~(clear | MU_ALL_FALSE))) {
 
 			/* Spinlock is now held, and lock is held by someone
 			   else; MU_WAITING has also been set; queue ourselves.
@@ -138,16 +132,13 @@ void nsync_mu_lock_slow_ (nsync_mu *mu, waiter *w, uint32_t clear, lock_type *l_
 int nsync_mu_trylock (nsync_mu *mu) {
 	int result;
 	IGNORE_RACES_START ();
-	uint32_t old_word = 0;
-	if (atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_WADD_TO_ACQUIRE,
-						     memory_order_acquire, memory_order_relaxed)) {
+	if (ATM_CAS_ACQ (&mu->word, 0, MU_WADD_TO_ACQUIRE)) { /* acquire CAS */
 		result = 1;
 	} else {
+		uint32_t old_word = ATM_LOAD (&mu->word);
 		result = ((old_word & MU_WZERO_TO_ACQUIRE) == 0 &&
-			  atomic_compare_exchange_strong_explicit (
-				  &mu->word, &old_word,
-				  (old_word + MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE,
-				  memory_order_acquire, memory_order_relaxed));
+			  ATM_CAS_ACQ (&mu->word, old_word,
+				       (old_word + MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE));
 	}
 	IGNORE_RACES_END ();
 	return (result);
@@ -156,14 +147,11 @@ int nsync_mu_trylock (nsync_mu *mu) {
 /* Block until *mu is free and then acquire it in writer mode. */
 void nsync_mu_lock (nsync_mu *mu) {
 	IGNORE_RACES_START ();
-	uint32_t old_word = 0;
-	if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_WADD_TO_ACQUIRE,
-						      memory_order_acquire, memory_order_relaxed)) {
+	if (!ATM_CAS_ACQ (&mu->word, 0, MU_WADD_TO_ACQUIRE)) { /* acquire CAS */
+		uint32_t old_word = ATM_LOAD (&mu->word);
 		if ((old_word&MU_WZERO_TO_ACQUIRE) != 0 ||
-		    !atomic_compare_exchange_strong_explicit (&mu->word, &old_word,
-							      (old_word+MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE,
-							      memory_order_acquire, memory_order_relaxed)) {
-			LOCKTRACE("acquiring nsync_mu_lock(%t)...", mu);
+		    !ATM_CAS_ACQ (&mu->word, old_word,
+				  (old_word+MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE)) {
 			waiter *w = nsync_waiter_new_ ();
 			nsync_mu_lock_slow_ (mu, w, 0, nsync_writer_type_);
 			nsync_waiter_free_ (w);
@@ -179,15 +167,13 @@ void nsync_mu_lock (nsync_mu *mu) {
 int nsync_mu_rtrylock (nsync_mu *mu) {
 	int result;
 	IGNORE_RACES_START ();
-	uint32_t old_word = 0;
-	if (atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_RADD_TO_ACQUIRE,
-						     memory_order_acquire, memory_order_relaxed)) {
+	if (ATM_CAS_ACQ (&mu->word, 0, MU_RADD_TO_ACQUIRE)) { /* acquire CAS */
 		result = 1;
 	} else {
+		uint32_t old_word = ATM_LOAD (&mu->word);
 		result = ((old_word&MU_RZERO_TO_ACQUIRE) == 0 &&
-			  atomic_compare_exchange_strong_explicit (&mu->word, &old_word,
-								   (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE,
-								   memory_order_acquire, memory_order_relaxed));
+			  ATM_CAS_ACQ (&mu->word, old_word,
+				       (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE));
 	}
 	IGNORE_RACES_END ();
 	return (result);
@@ -196,14 +182,11 @@ int nsync_mu_rtrylock (nsync_mu *mu) {
 /* Block until *mu can be acquired in reader mode and then acquire it. */
 void nsync_mu_rlock (nsync_mu *mu) {
 	IGNORE_RACES_START ();
-	uint32_t old_word = 0;
-	if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_RADD_TO_ACQUIRE,
-						      memory_order_acquire, memory_order_relaxed)) {
+	if (!ATM_CAS_ACQ (&mu->word, 0, MU_RADD_TO_ACQUIRE)) { /* acquire CAS */
+		uint32_t old_word = ATM_LOAD (&mu->word);
 		if ((old_word&MU_RZERO_TO_ACQUIRE) != 0 ||
-		    !atomic_compare_exchange_strong_explicit (&mu->word, &old_word,
-							      (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE,
-							      memory_order_acquire, memory_order_relaxed)) {
-			LOCKTRACE("acquiring nsync_mu_rlock(%t)...", mu);
+		    !ATM_CAS_ACQ (&mu->word, old_word,
+				  (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE)) {
 			waiter *w = nsync_waiter_new_ ();
 			nsync_mu_lock_slow_ (mu, w, 0, nsync_reader_type_);
 			nsync_waiter_free_ (w);
@@ -251,16 +234,16 @@ struct Dll *nsync_remove_from_mu_queue_ (struct Dll *mu_queue, struct Dll *e) {
 	/* Record previous and next elements in the original queue. */
 	struct Dll *prev = e->prev;
 	struct Dll *next = e->next;
+	uint32_t old_value;
 	/* Remove. */
 	dll_remove (&mu_queue, e);
-	uint32_t old_value = ATM_LOAD (&DLL_WAITER (e)->remove_count);
-        while (!atomic_compare_exchange_weak_explicit (
-		       &DLL_WAITER (e)->remove_count, &old_value, old_value+1,
-		       memory_order_relaxed, memory_order_relaxed)) {
-	}
+        do {    
+                old_value = ATM_LOAD (&DLL_WAITER (e)->remove_count);
+        } while (!ATM_CAS (&DLL_WAITER (e)->remove_count, old_value, old_value+1));
 	if (!dll_is_empty (mu_queue)) {
 		/* Fix up same_condition. */
 		struct Dll *e_same_condition = &DLL_WAITER (e)->same_condition;
+
 		if (e_same_condition->next != e_same_condition) {
 			/* *e is linked to a same_condition neighbour---just remove it. */
 			e_same_condition->next->prev = e_same_condition->prev;
@@ -305,18 +288,14 @@ void nsync_mu_unlock_slow_ (nsync_mu *mu, lock_type *l_type) {
 			/* no one to wake, there's a designated waker waking
 			   up, there are still readers, or it's a reader and all waiters
 			   have false conditions */
-			if (atomic_compare_exchange_weak_explicit (
-				    &mu->word, &old_word,
-				    (old_word - l_type->add_to_acquire) &
-				    ~l_type->clear_on_uncontended_release,
-				    memory_order_release, memory_order_relaxed)) {
+			if (ATM_CAS_REL (&mu->word, old_word,
+					 (old_word - l_type->add_to_acquire) &
+					 ~l_type->clear_on_uncontended_release)) {
 				return;
 			}
 		} else if ((old_word&MU_SPINLOCK) == 0 &&
-			   atomic_compare_exchange_weak_explicit (
-				   &mu->word, &old_word,
-				   (old_word-early_release_mu)|MU_SPINLOCK|MU_DESIG_WAKER,
-				   memory_order_acq_rel, memory_order_relaxed)) {
+			   ATM_CAS_SEQCST (&mu->word, old_word, /* [jart] fixes issues on apple silicon */
+                                           (old_word-early_release_mu)|MU_SPINLOCK|MU_DESIG_WAKER)) {
 			struct Dll *wake;
 			lock_type *wake_type;
 			uint32_t clear_on_release;
@@ -452,10 +431,10 @@ void nsync_mu_unlock_slow_ (nsync_mu *mu, lock_type *l_type) {
 			   whether any waiters remain, and whether any of them
 			   are writers.  */
 			old_word = ATM_LOAD (&mu->word);
-			while (!atomic_compare_exchange_weak_explicit (
-				       &mu->word, &old_word,
-				       ((old_word - late_release_mu) | set_on_release) & ~clear_on_release,
-				       memory_order_release, memory_order_relaxed)) {
+			while (!ATM_CAS_REL (&mu->word, old_word,
+					     ((old_word-late_release_mu)|set_on_release) &
+					     ~clear_on_release)) { /* release CAS */
+				old_word = ATM_LOAD (&mu->word);
 			}
 			/* Wake the waiters. */
 			for (p = dll_first (wake); p != NULL; p = next) {
@@ -478,10 +457,8 @@ void nsync_mu_unlock (nsync_mu *mu) {
 	   waiter.  Another thread could acquire, decrement a reference count
 	   and deallocate the mutex before the current thread touched the mutex
 	   word again. */
-	uint32_t old_word = MU_WLOCK;
-	if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, 0,
-						      memory_order_release,
-						      memory_order_relaxed)) {
+	if (!ATM_CAS_REL (&mu->word, MU_WLOCK, 0)) {
+		uint32_t old_word = ATM_LOAD (&mu->word);
                 /* Clear MU_ALL_FALSE because the critical section we're just
                    leaving may have made some conditions true.  */
 		uint32_t new_word = (old_word - MU_WLOCK) & ~MU_ALL_FALSE;
@@ -509,10 +486,8 @@ void nsync_mu_unlock (nsync_mu *mu) {
 void nsync_mu_runlock (nsync_mu *mu) {
 	IGNORE_RACES_START ();
 	/* See comment in nsync_mu_unlock(). */
-	uint32_t old_word = MU_RLOCK;
-	if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, 0,
-						      memory_order_release,
-						      memory_order_relaxed)) {
+	if (!ATM_CAS_REL (&mu->word, MU_RLOCK, 0)) {
+		uint32_t old_word = ATM_LOAD (&mu->word);
                 /* Sanity check:  mutex must not be held in write mode and
                    reader count must not be 0.  */
 		if (((old_word ^ MU_WLOCK) & (MU_WLOCK | MU_RLOCK_FIELD)) == 0) {
diff --git a/third_party/nsync/mu.h b/third_party/nsync/mu.h
index 4831cacd8..dab1ed722 100644
--- a/third_party/nsync/mu.h
+++ b/third_party/nsync/mu.h
@@ -48,6 +48,7 @@ COSMOPOLITAN_C_START_
 */
 typedef struct nsync_mu_s_ {
   nsync_atomic_uint32_ word; /* internal use only */
+  int _zero;                 /* c pthread_mutex_t */
   struct Dll *waiters;       /* internal use only */
 } nsync_mu;
 
diff --git a/third_party/nsync/mu_semaphore.c b/third_party/nsync/mu_semaphore.c
index cc6906400..274b4e75b 100644
--- a/third_party/nsync/mu_semaphore.c
+++ b/third_party/nsync/mu_semaphore.c
@@ -15,30 +15,23 @@
 │ See the License for the specific language governing permissions and          │
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/mu_semaphore.internal.h"
+#include "third_party/nsync/mu_semaphore.h"
 #include "libc/calls/cp.internal.h"
 #include "libc/dce.h"
-#include "third_party/nsync/mu_semaphore.h"
+#include "third_party/nsync/mu_semaphore.internal.h"
 __static_yoink("nsync_notice");
 
 /* Initialize *s; the initial value is 0. */
 bool nsync_mu_semaphore_init (nsync_semaphore *s) {
-	if (IsNetbsd ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
+		return nsync_mu_semaphore_init_gcd (s);
+	} else if (IsNetbsd ()) {
 		return nsync_mu_semaphore_init_sem (s);
 	} else {
 		return nsync_mu_semaphore_init_futex (s);
 	}
 }
 
-/* Destroy *s. */
-void nsync_mu_semaphore_destroy (nsync_semaphore *s) {
-	if (IsNetbsd ()) {
-		return nsync_mu_semaphore_destroy_sem (s);
-	} else {
-		return nsync_mu_semaphore_destroy_futex (s);
-	}
-}
-
 /* Wait until the count of *s exceeds 0, and decrement it. If POSIX cancellations
    are currently disabled by the thread, then this function always succeeds. When
    they're enabled in MASKED mode, this function may return ECANCELED. Otherwise,
@@ -46,7 +39,9 @@ void nsync_mu_semaphore_destroy (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
 	errno_t err;
 	BEGIN_CANCELATION_POINT;
-	if (IsNetbsd ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
+		err = nsync_mu_semaphore_p_gcd (s);
+	} else if (IsNetbsd ()) {
 		err = nsync_mu_semaphore_p_sem (s);
 	} else {
 		err = nsync_mu_semaphore_p_futex (s);
@@ -59,13 +54,15 @@ errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
    while additionally supporting a time parameter specifying at what point
    in the future ETIMEDOUT should be returned, if neither cancelation, or
    semaphore release happens. */
-errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, int clock, nsync_time abs_deadline) {
+errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_deadline) {
 	errno_t err;
 	BEGIN_CANCELATION_POINT;
-	if (IsNetbsd ()) {
-		err = nsync_mu_semaphore_p_with_deadline_sem (s, clock, abs_deadline);
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
+		err = nsync_mu_semaphore_p_with_deadline_gcd (s, abs_deadline);
+	} else if (IsNetbsd ()) {
+		err = nsync_mu_semaphore_p_with_deadline_sem (s, abs_deadline);
 	} else {
-		err = nsync_mu_semaphore_p_with_deadline_futex (s, clock, abs_deadline);
+		err = nsync_mu_semaphore_p_with_deadline_futex (s, abs_deadline);
 	}
 	END_CANCELATION_POINT;
 	return err;
@@ -73,7 +70,9 @@ errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, int clock, nsync
 
 /* Ensure that the count of *s is at least 1. */
 void nsync_mu_semaphore_v (nsync_semaphore *s) {
-	if (IsNetbsd ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
+		return nsync_mu_semaphore_v_gcd (s);
+	} else if (IsNetbsd ()) {
 		return nsync_mu_semaphore_v_sem (s);
 	} else {
 		return nsync_mu_semaphore_v_futex (s);
diff --git a/third_party/nsync/mu_semaphore.h b/third_party/nsync/mu_semaphore.h
index fffb99e51..992d4849f 100644
--- a/third_party/nsync/mu_semaphore.h
+++ b/third_party/nsync/mu_semaphore.h
@@ -10,16 +10,13 @@ typedef struct nsync_semaphore_s_ {
 /* Initialize *s; the initial value is 0. */
 bool nsync_mu_semaphore_init(nsync_semaphore *s);
 
-/* Destroy *s. */
-void nsync_mu_semaphore_destroy(nsync_semaphore *s);
-
 /* Wait until the count of *s exceeds 0, and decrement it. */
 errno_t nsync_mu_semaphore_p(nsync_semaphore *s);
 
 /* Wait until one of: the count of *s is non-zero, in which case
    decrement *s and return 0; or abs_deadline expires, in which case
    return ETIMEDOUT. */
-errno_t nsync_mu_semaphore_p_with_deadline(nsync_semaphore *s, int clock,
+errno_t nsync_mu_semaphore_p_with_deadline(nsync_semaphore *s,
                                            nsync_time abs_deadline);
 
 /* Ensure that the count of *s is at least 1. */
diff --git a/third_party/nsync/mu_semaphore.internal.h b/third_party/nsync/mu_semaphore.internal.h
index 6fe15090f..8795fe349 100755
--- a/third_party/nsync/mu_semaphore.internal.h
+++ b/third_party/nsync/mu_semaphore.internal.h
@@ -4,17 +4,34 @@
 #include "third_party/nsync/time.h"
 COSMOPOLITAN_C_START_
 
+/* XNU ulock (used by cosmo futexes) is an internal API, however:
+
+     1. Unlike GCD it's cancelable i.e. can be EINTR'd by signals
+     2. We have no choice but to use ulock for joining threads
+     3. Grand Central Dispatch requires a busy loop workaround
+     4. ulock makes our mutexes use 20% more system time (meh)
+     5. ulock makes our mutexes use 40% less wall time (good)
+     6. ulock makes our mutexes use 64% less user time (woop)
+
+   ulock is an outstanding system call that must be used.
+   gcd is not an acceptable alternative to ulock. */
+
+#define NSYNC_USE_GRAND_CENTRAL 0
+
 bool nsync_mu_semaphore_init_futex(nsync_semaphore *);
-void nsync_mu_semaphore_destroy_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_futex(nsync_semaphore *);
-errno_t nsync_mu_semaphore_p_with_deadline_futex(nsync_semaphore *, int, nsync_time);
+errno_t nsync_mu_semaphore_p_with_deadline_futex(nsync_semaphore *, nsync_time);
 void nsync_mu_semaphore_v_futex(nsync_semaphore *);
 
 bool nsync_mu_semaphore_init_sem(nsync_semaphore *);
-void nsync_mu_semaphore_destroy_sem(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_sem(nsync_semaphore *);
-errno_t nsync_mu_semaphore_p_with_deadline_sem(nsync_semaphore *, int, nsync_time);
+errno_t nsync_mu_semaphore_p_with_deadline_sem(nsync_semaphore *, nsync_time);
 void nsync_mu_semaphore_v_sem(nsync_semaphore *);
 
+bool nsync_mu_semaphore_init_gcd(nsync_semaphore *);
+errno_t nsync_mu_semaphore_p_gcd(nsync_semaphore *);
+errno_t nsync_mu_semaphore_p_with_deadline_gcd(nsync_semaphore *, nsync_time);
+void nsync_mu_semaphore_v_gcd(nsync_semaphore *);
+
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_THIRD_PARTY_NSYNC_MU_SEMAPHORE_INTERNAL_H_ */
diff --git a/third_party/nsync/mu_semaphore_futex.c b/third_party/nsync/mu_semaphore_futex.c
index cc556267d..a4e605a6e 100644
--- a/third_party/nsync/mu_semaphore_futex.c
+++ b/third_party/nsync/mu_semaphore_futex.c
@@ -21,9 +21,7 @@
 #include "libc/thread/thread.h"
 #include "third_party/nsync/atomic.h"
 #include "third_party/nsync/atomic.internal.h"
-#include "libc/cosmo.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/cosmo.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu_semaphore.internal.h"
 
 /**
@@ -51,9 +49,6 @@ bool nsync_mu_semaphore_init_futex (nsync_semaphore *s) {
 	return true;
 }
 
-void nsync_mu_semaphore_destroy_futex (nsync_semaphore *s) {
-}
-
 /* Wait until the count of *s exceeds 0, and decrement it. If POSIX cancellations
    are currently disabled by the thread, then this function always succeeds. When
    they're enabled in MASKED mode, this function may return ECANCELED. Otherwise,
@@ -66,9 +61,9 @@ errno_t nsync_mu_semaphore_p_futex (nsync_semaphore *s) {
 		i = ATM_LOAD ((nsync_atomic_uint32_ *) &f->i);
 		if (i == 0) {
 			int futex_result;
-			futex_result = -cosmo_futex_wait (
+			futex_result = -nsync_futex_wait_ (
 				(atomic_int *)&f->i, i,
-				PTHREAD_PROCESS_PRIVATE, 0, 0);
+				PTHREAD_PROCESS_PRIVATE, 0);
 			ASSERT (futex_result == 0 ||
 				futex_result == EINTR ||
 				futex_result == EAGAIN ||
@@ -78,10 +73,7 @@ errno_t nsync_mu_semaphore_p_futex (nsync_semaphore *s) {
 				result = ECANCELED;
 			}
 		}
-	} while (result == 0 && (i == 0 ||
-				 !atomic_compare_exchange_weak_explicit (
-					 (nsync_atomic_uint32_ *) &f->i, &i, i-1,
-					 memory_order_acquire, memory_order_relaxed)));
+	} while (result == 0 && (i == 0 || !ATM_CAS_ACQ ((nsync_atomic_uint32_ *) &f->i, i, i-1)));
 	return result;
 }
 
@@ -89,7 +81,7 @@ errno_t nsync_mu_semaphore_p_futex (nsync_semaphore *s) {
    while additionally supporting a time parameter specifying at what point
    in the future ETIMEDOUT should be returned, if neither cancellation, or
    semaphore release happens. */
-errno_t nsync_mu_semaphore_p_with_deadline_futex (nsync_semaphore *s, int clock, nsync_time abs_deadline) {
+errno_t nsync_mu_semaphore_p_with_deadline_futex (nsync_semaphore *s, nsync_time abs_deadline) {
 	struct futex *f = (struct futex *)s;
 	int i;
 	int result = 0;
@@ -105,9 +97,8 @@ errno_t nsync_mu_semaphore_p_with_deadline_futex (nsync_semaphore *s, int clock,
 				ts_buf.tv_nsec = NSYNC_TIME_NSEC (abs_deadline);
 				ts = &ts_buf;
 			}
-			futex_result = cosmo_futex_wait ((atomic_int *)&f->i, i,
-							 PTHREAD_PROCESS_PRIVATE,
-							 clock, ts);
+			futex_result = nsync_futex_wait_ ((atomic_int *)&f->i, i,
+							  PTHREAD_PROCESS_PRIVATE, ts);
 			ASSERT (futex_result == 0 ||
 				futex_result == -EINTR ||
 				futex_result == -EAGAIN ||
@@ -115,31 +106,24 @@ errno_t nsync_mu_semaphore_p_with_deadline_futex (nsync_semaphore *s, int clock,
 				futex_result == -ETIMEDOUT ||
 				futex_result == -EWOULDBLOCK);
 			/* Some systems don't wait as long as they are told. */
-			if (futex_result == -ETIMEDOUT) {
-				nsync_time now;
-				if (clock_gettime (clock, &now))
-					result = EINVAL;
-				if (nsync_time_cmp (now, abs_deadline) >= 0)
-					result = ETIMEDOUT;
+			if (futex_result == -ETIMEDOUT &&
+			    nsync_time_cmp (abs_deadline, nsync_time_now ()) <= 0) {
+				result = ETIMEDOUT;
 			}
 			if (futex_result == -ECANCELED) {
 				result = ECANCELED;
 			}
 		}
-	} while (result == 0 && (i == 0 ||
-				 !atomic_compare_exchange_weak_explicit (
-					 (nsync_atomic_uint32_ *) &f->i, &i, i-1,
-					 memory_order_acquire, memory_order_relaxed)));
+	} while (result == 0 && (i == 0 || !ATM_CAS_ACQ ((nsync_atomic_uint32_ *) &f->i, i, i - 1)));
 	return (result);
 }
 
 /* Ensure that the count of *s is at least 1. */
 void nsync_mu_semaphore_v_futex (nsync_semaphore *s) {
 	struct futex *f = (struct futex *) s;
-        uint32_t old_value = ATM_LOAD ((nsync_atomic_uint32_ *) &f->i);
-	while (!atomic_compare_exchange_weak_explicit (
-		       (nsync_atomic_uint32_ *) &f->i, &old_value, old_value+1,
-		       memory_order_release, memory_order_relaxed)) {
-	}
-	ASSERT (cosmo_futex_wake ((atomic_int *)&f->i, 1, PTHREAD_PROCESS_PRIVATE) >= 0);
+        uint32_t old_value;
+	do {
+		old_value = ATM_LOAD ((nsync_atomic_uint32_ *) &f->i);
+	} while (!ATM_CAS_REL ((nsync_atomic_uint32_ *) &f->i, old_value, old_value+1));
+	ASSERT (nsync_futex_wake_ ((atomic_int *)&f->i, 1, PTHREAD_PROCESS_PRIVATE) >= 0);
 }
diff --git a/third_party/nsync/mu_semaphore_gcd.c b/third_party/nsync/mu_semaphore_gcd.c
new file mode 100644
index 000000000..088b50414
--- /dev/null
+++ b/third_party/nsync/mu_semaphore_gcd.c
@@ -0,0 +1,140 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
+│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2016 Google Inc.                                                   │
+│                                                                              │
+│ Licensed under the Apache License, Version 2.0 (the "License");              │
+│ you may not use this file except in compliance with the License.             │
+│ You may obtain a copy of the License at                                      │
+│                                                                              │
+│     http://www.apache.org/licenses/LICENSE-2.0                               │
+│                                                                              │
+│ Unless required by applicable law or agreed to in writing, software          │
+│ distributed under the License is distributed on an "AS IS" BASIS,            │
+│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
+│ See the License for the specific language governing permissions and          │
+│ limitations under the License.                                               │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/calls/sig.internal.h"
+#include "libc/errno.h"
+#include "libc/intrin/strace.h"
+#include "libc/intrin/weaken.h"
+#include "libc/runtime/clktck.h"
+#include "libc/runtime/syslib.internal.h"
+#include "libc/str/str.h"
+#include "libc/thread/posixthread.internal.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/tls.h"
+#include "third_party/nsync/atomic.h"
+#include "third_party/nsync/atomic.internal.h"
+#include "third_party/nsync/futex.internal.h"
+#include "third_party/nsync/mu_semaphore.internal.h"
+#include "third_party/nsync/time.h"
+
+/**
+ * @fileoverview Semaphores w/ Apple's Grand Central Dispatch API.
+ */
+
+#define DISPATCH_TIME_FOREVER ~0ull
+
+static dispatch_semaphore_t dispatch_semaphore_create(long count) {
+	dispatch_semaphore_t ds;
+	ds = __syslib->__dispatch_semaphore_create (count);
+	STRACE ("dispatch_semaphore_create(%ld) → %#lx", count, ds);
+	return (ds);
+}
+
+static void dispatch_release (dispatch_semaphore_t ds) {
+	__syslib->__dispatch_release (ds);
+	STRACE ("dispatch_release(%#lx)", ds);
+}
+
+static long dispatch_semaphore_wait (dispatch_semaphore_t ds,
+				     dispatch_time_t dt) {
+	long rc = __syslib->__dispatch_semaphore_wait (ds, dt);
+	STRACE ("dispatch_semaphore_wait(%#lx, %ld) → %ld", ds, dt, rc);
+	return (rc);
+}
+
+static long dispatch_semaphore_signal (dispatch_semaphore_t ds) {
+	long rc = __syslib->__dispatch_semaphore_signal (ds);
+	(void)rc;
+	STRACE ("dispatch_semaphore_signal(%#lx) → %ld", ds, rc);
+	return (ds);
+}
+
+static dispatch_time_t dispatch_walltime (const struct timespec *base,
+					  int64_t offset) {
+	return __syslib->__dispatch_walltime (base, offset);
+}
+
+static errno_t nsync_dispatch_semaphore_wait (nsync_semaphore *s,
+					      nsync_time abs_deadline) {
+	errno_t result = 0;
+	dispatch_time_t dt;
+	if (nsync_time_cmp (abs_deadline, nsync_time_no_deadline) == 0) {
+		dt = DISPATCH_TIME_FOREVER;
+	} else {
+		dt = dispatch_walltime (&abs_deadline, 0);
+	}
+	if (dispatch_semaphore_wait (*(dispatch_semaphore_t *)s, dt) != 0) {
+		result = ETIMEDOUT;
+	}
+	return (result);
+}
+
+/* Initialize *s; the initial value is 0.  */
+bool nsync_mu_semaphore_init_gcd (nsync_semaphore *s) {
+	return !!(*(dispatch_semaphore_t *)s = dispatch_semaphore_create (0));
+}
+
+/* Wait until the count of *s exceeds 0, and decrement it. If POSIX cancellations
+   are currently disabled by the thread, then this function always succeeds. When
+   they're enabled in MASKED mode, this function may return ECANCELED. Otherwise,
+   cancellation will occur by unwinding cleanup handlers pushed to the stack. */
+errno_t nsync_mu_semaphore_p_gcd (nsync_semaphore *s) {
+	return nsync_mu_semaphore_p_with_deadline_gcd (s, nsync_time_no_deadline);
+}
+
+/* Like nsync_mu_semaphore_p() this waits for the count of *s to exceed 0,
+   while additionally supporting a time parameter specifying at what point
+   in the future ETIMEDOUT should be returned, if neither cancellation, or
+   semaphore release happens. */
+errno_t nsync_mu_semaphore_p_with_deadline_gcd (nsync_semaphore *s,
+						nsync_time abs_deadline) {
+	errno_t result = 0;
+	struct PosixThread *pt;
+	if (!__tls_enabled ||
+	    !_weaken (pthread_testcancel_np) ||
+	    !(pt = _pthread_self()) ||
+	    (pt->pt_flags & PT_NOCANCEL)) {
+		result = nsync_dispatch_semaphore_wait (s, abs_deadline);
+	} else {
+		struct timespec now, until, slice = {0, 1000000000 / CLK_TCK};
+		for (;;) {
+			if (_weaken (pthread_testcancel_np) () == ECANCELED) {
+				result = ECANCELED;
+				break;
+			}
+			now = timespec_real();
+			if (timespec_cmp (now, abs_deadline) >= 0) {
+				result = ETIMEDOUT;
+				break;
+			}
+			until = timespec_add (now, slice);
+			if (timespec_cmp (until, abs_deadline) > 0) {
+				until = abs_deadline;
+			}
+			if (!nsync_dispatch_semaphore_wait (s, until)) {
+				break;
+			}
+		}
+	}
+	return (result);
+}
+
+/* Ensure that the count of *s is at least 1. */
+void nsync_mu_semaphore_v_gcd (nsync_semaphore *s) {
+	dispatch_semaphore_signal (*(dispatch_semaphore_t *)s);
+}
diff --git a/third_party/nsync/mu_semaphore_sem.c b/third_party/nsync/mu_semaphore_sem.c
index a42b2e8c3..9b25ae7a6 100644
--- a/third_party/nsync/mu_semaphore_sem.c
+++ b/third_party/nsync/mu_semaphore_sem.c
@@ -30,9 +30,9 @@
 #include "libc/sysv/consts/f.h"
 #include "libc/sysv/consts/fd.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/mu_semaphore.h"
 #include "libc/intrin/atomic.h"
+#include "libc/atomic.h"
 #include "third_party/nsync/time.h"
 
 /**
@@ -43,14 +43,23 @@
 
 struct sem {
 	int64_t id;
+	struct sem *next;
 };
 
+static _Atomic(struct sem *) g_sems;
+
 static nsync_semaphore *sem_big_enough_for_sem = (nsync_semaphore *) (uintptr_t)(1 /
 	(sizeof (struct sem) <= sizeof (*sem_big_enough_for_sem)));
 
-/* Initialize *s; the initial value is 0. */
-bool nsync_mu_semaphore_init_sem (nsync_semaphore *s) {
-	struct sem *f = (struct sem *) s;
+static void sems_push (struct sem *f) {
+	int backoff = 0;
+	f->next = atomic_load_explicit (&g_sems, memory_order_relaxed);
+	while (!atomic_compare_exchange_weak_explicit (&g_sems, &f->next, f,
+						       memory_order_acq_rel, memory_order_relaxed))
+		backoff = pthread_delay_np (&g_sems, backoff);
+}
+
+static bool nsync_mu_semaphore_sem_create (struct sem *f) {
 	int rc;
 	int lol;
 	f->id = 0;
@@ -68,10 +77,28 @@ bool nsync_mu_semaphore_init_sem (nsync_semaphore *s) {
 	return true;
 }
 
-/* Destroys *s. */
-void nsync_mu_semaphore_destroy_sem (nsync_semaphore *s) {
+static void nsync_mu_semaphore_sem_fork_child (void) {
+	struct sem *f;
+	for (f = atomic_load_explicit (&g_sems, memory_order_relaxed); f; f = f->next) {
+		int rc = sys_close (f->id);
+		STRACE ("close(%ld) → %d", f->id, rc);
+		ASSERT (nsync_mu_semaphore_sem_create (f));
+	}
+}
+
+static void nsync_mu_semaphore_sem_init (void) {
+	pthread_atfork (0, 0, nsync_mu_semaphore_sem_fork_child);
+}
+
+/* Initialize *s; the initial value is 0. */
+bool nsync_mu_semaphore_init_sem (nsync_semaphore *s) {
+	static atomic_uint once;
 	struct sem *f = (struct sem *) s;
-	sys_close (f->id);
+	if (!nsync_mu_semaphore_sem_create (f))
+		return false;
+	cosmo_once (&once, nsync_mu_semaphore_sem_init);
+	sems_push(f);
+	return true;
 }
 
 /* Wait until the count of *s exceeds 0, and decrement it. If POSIX cancellations
@@ -99,22 +126,10 @@ errno_t nsync_mu_semaphore_p_sem (nsync_semaphore *s) {
    while additionally supporting a time parameter specifying at what point
    in the future ETIMEDOUT should be returned, if neither cancellation, or
    semaphore release happens. */
-errno_t nsync_mu_semaphore_p_with_deadline_sem (nsync_semaphore *s, int clock,
-						nsync_time abs_deadline) {
+errno_t nsync_mu_semaphore_p_with_deadline_sem (nsync_semaphore *s, nsync_time abs_deadline) {
 	int e, rc;
 	errno_t result;
 	struct sem *f = (struct sem *) s;
-
-	// convert monotonic back to realtime just for netbsd
-	if (clock && nsync_time_cmp (abs_deadline, nsync_time_no_deadline)) {
-		struct timespec now, delta;
-		if (clock_gettime (clock, &now))
-			return EINVAL;
-		delta = timespec_subz (abs_deadline, now);
-		clock_gettime (CLOCK_REALTIME, &now);
-		abs_deadline = timespec_add (now, delta);
-	}
-
 	e = errno;
 	rc = sys_sem_timedwait (f->id, &abs_deadline);
 	STRACE ("sem_timedwait(%ld, %s) → %d% m", f->id,
diff --git a/third_party/nsync/mu_wait.h b/third_party/nsync/mu_wait.h
index d17a10c9d..3ee9d9793 100644
--- a/third_party/nsync/mu_wait.h
+++ b/third_party/nsync/mu_wait.h
@@ -97,7 +97,7 @@ void nsync_mu_wait(nsync_mu *mu, int (*condition)(const void *condition_arg),
 int nsync_mu_wait_with_deadline(
     nsync_mu *mu, int (*condition)(const void *condition_arg),
     const void *condition_arg,
-    int (*condition_arg_eq)(const void *a, const void *b), int clock,
+    int (*condition_arg_eq)(const void *a, const void *b),
     nsync_time abs_deadline, struct nsync_note_s_ *cancel_note);
 
 /* Unlock *mu, which must be held in write mode, and wake waiters, if
diff --git a/third_party/nsync/note.h b/third_party/nsync/note.h
index 0d46e92ee..008166aab 100644
--- a/third_party/nsync/note.h
+++ b/third_party/nsync/note.h
@@ -19,7 +19,7 @@ typedef struct nsync_note_s_ *nsync_note;
    abs_deadline==nsync_zero_time.
 
    nsync_notes should be passed to nsync_note_free() when no longer needed. */
-nsync_note nsync_note_new(nsync_note parent, int clock, nsync_time abs_deadline);
+nsync_note nsync_note_new(nsync_note parent, nsync_time abs_deadline);
 
 /* Free resources associated with n. Requires that n was allocated by
    nsync_note_new(), and no concurrent or future operations are applied
diff --git a/third_party/nsync/panic.c b/third_party/nsync/panic.c
index 7c6cebf38..10f9eddf8 100644
--- a/third_party/nsync/panic.c
+++ b/third_party/nsync/panic.c
@@ -24,9 +24,9 @@
 
 /* Aborts after printing the nul-terminated string s[]. */
 void nsync_panic_ (const char *s) {
-	tinyprint (2, "error: nsync panic: ", s,
-		   "cosmoaddr2line ", program_invocation_name, " ",
-		   DescribeBacktrace (__builtin_frame_address (0)), "\n",
-		   NULL);
-	__builtin_trap ();
+	tinyprint(2, "error: nsync panic: ", s,
+		"cosmoaddr2line ", program_invocation_name, " ",
+		DescribeBacktrace (__builtin_frame_address (0)), "\n",
+		NULL);
+	_Exit (44);
 }
diff --git a/third_party/nsync/testing/BUILD.mk b/third_party/nsync/testing/BUILD.mk
index d7e261430..1ddb4368b 100644
--- a/third_party/nsync/testing/BUILD.mk
+++ b/third_party/nsync/testing/BUILD.mk
@@ -8,13 +8,12 @@ THIRD_PARTY_NSYNC_TESTING_A = o/$(MODE)/third_party/nsync/testing/lib.a
 THIRD_PARTY_NSYNC_TESTING_FILES = $(wildcard third_party/nsync/testing/*)
 THIRD_PARTY_NSYNC_TESTING_SRCS = $(filter %.c,$(THIRD_PARTY_NSYNC_TESTING_FILES))
 THIRD_PARTY_NSYNC_TESTING_HDRS = $(filter %.h,$(THIRD_PARTY_NSYNC_TESTING_FILES))
-THIRD_PARTY_NSYNC_TESTING_INCS = $(filter %.inc,$(THIRD_PARTY_NSYNC_TESTING_FILES))
 THIRD_PARTY_NSYNC_TESTING_SRCS_TEST = $(filter %_test.c,$(THIRD_PARTY_NSYNC_TESTING_SRCS))
 THIRD_PARTY_NSYNC_TESTING_OBJS = $(THIRD_PARTY_NSYNC_TESTING_SRCS:%.c=o/$(MODE)/%.o)
 THIRD_PARTY_NSYNC_TESTING_COMS = $(THIRD_PARTY_NSYNC_TESTING_SRCS_TEST:%.c=o/$(MODE)/%)
 THIRD_PARTY_NSYNC_TESTING_BINS = $(THIRD_PARTY_NSYNC_TESTING_COMS) $(THIRD_PARTY_NSYNC_TESTING_COMS:%=%.dbg)
-THIRD_PARTY_NSYNC_TESTING_TESTS = $(THIRD_PARTY_NSYNC_TESTING_SRCS_TEST:%.c=o/$(MODE)/%.ok)
-THIRD_PARTY_NSYNC_TESTING_CHECKS = $(THIRD_PARTY_NSYNC_TESTING_SRCS_TEST:%.c=o/$(MODE)/%.runs)
+THIRD_PARTY_NSYNC_TESTING_TESTS_ = $(THIRD_PARTY_NSYNC_TESTING_SRCS_TEST:%.c=o/$(MODE)/%.ok)
+THIRD_PARTY_NSYNC_TESTING_CHECKS_ = $(THIRD_PARTY_NSYNC_TESTING_SRCS_TEST:%.c=o/$(MODE)/%.runs)
 
 THIRD_PARTY_NSYNC_TESTING_DIRECTDEPS =				\
 	LIBC_CALLS						\
@@ -52,28 +51,15 @@ o/$(MODE)/third_party/nsync/testing/%_test.dbg:			\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
-o/$(MODE)/third_party/nsync/testing/mu_starvation_test.ok: private QUOTA = -L300
-o/$(MODE)/third_party/nsync/testing/mu_starvation_test.runs: private QUOTA = -C128 -L300
-o/$(MODE)/third_party/nsync/testing/mu_test.ok: private QUOTA = -L300
-o/$(MODE)/third_party/nsync/testing/mu2_test.ok: private QUOTA = -L300
-o/$(MODE)/third_party/nsync/testing/mu3_test.ok: private QUOTA = -L300
-o/$(MODE)/third_party/nsync/testing/cv_mu_timeout_stress_test.ok: private QUOTA = -L300
-o/$(MODE)/third_party/nsync/testing/cv_mu_timeout_stress2_test.ok: private QUOTA = -L300
-o/$(MODE)/third_party/nsync/testing/cv_mu_timeout_stress3_test.ok: private QUOTA = -L300
-o/$(MODE)/third_party/nsync/testing/mu_test.runs: private QUOTA = -C128 -L300
-o/$(MODE)/third_party/nsync/testing/mu2_test.runs: private QUOTA = -C128 -L300
-o/$(MODE)/third_party/nsync/testing/mu3_test.runs: private QUOTA = -C128 -L300
-o/$(MODE)/third_party/nsync/testing/wait_test.ok: private QUOTA = -P65536
-o/$(MODE)/third_party/nsync/testing/wait_test.runs: private QUOTA = -P65536
-
 $(THIRD_PARTY_NSYNC_TESTING_OBJS): third_party/nsync/testing/BUILD.mk
+o/$(MODE)/third_party/nsync/testing/mu_test.runs: private QUOTA = -C64
 
 .PHONY: o/$(MODE)/third_party/nsync/testing
 o/$(MODE)/third_party/nsync/testing:				\
-	$(THIRD_PARTY_NSYNC_TESTING_CHECKS)			\
-	$(THIRD_PARTY_NSYNC_TESTING_BINS)
+	$(THIRD_PARTY_NSYNC_TESTING_CHECKS_)			\
+	$(THIRD_PARTY_NSYNC_TESTING_BINS_)
 
 .PHONY: o/$(MODE)/third_party/nsync/test
 o/$(MODE)/third_party/nsync/test:				\
-	$(THIRD_PARTY_NSYNC_TESTING_CHECKS)			\
-	$(THIRD_PARTY_NSYNC_TESTING_TESTS)
+	$(THIRD_PARTY_NSYNC_TESTING_CHECKS_)			\
+	$(THIRD_PARTY_NSYNC_TESTING_TESTS_)
diff --git a/third_party/nsync/testing/counter_test.c b/third_party/nsync/testing/counter_test.c
index eb53a702a..99a9c0b95 100644
--- a/third_party/nsync/testing/counter_test.c
+++ b/third_party/nsync/testing/counter_test.c
@@ -19,7 +19,6 @@
 #include "third_party/nsync/testing/closure.h"
 #include "third_party/nsync/testing/smprintf.h"
 #include "third_party/nsync/testing/testing.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/testing/time_extra.h"
 
 /* Verify the properties of a zero counter. */
@@ -30,10 +29,10 @@ static void test_counter_zero (testing t) {
 		if (nsync_counter_value (c) != 0) {
 			TEST_ERROR (t, ("zero counter is not zero (test, %d)", i));
 		}
-		if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_zero) != 0) {
+		if (nsync_counter_wait (c, nsync_time_zero) != 0) {
 			TEST_ERROR (t, ("zero counter is not zero (poll, %d)", i));
 		}
-		if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_no_deadline) != 0) {
+		if (nsync_counter_wait (c, nsync_time_no_deadline) != 0) {
 			TEST_ERROR (t, ("zero counter is not zero (infinite wait, %d)", i));
 		}
 		nsync_counter_add (c, 0);
@@ -51,15 +50,15 @@ static void test_counter_non_zero (testing t) {
 	if (nsync_counter_value (c) != 1) {
 		TEST_ERROR (t, ("counter is not 1 (test)"));
 	}
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_zero) != 1) {
+	if (nsync_counter_wait (c, nsync_time_zero) != 1) {
 		TEST_ERROR (t, ("counter is not 1 (poll)"));
 	}
-	start = nsync_time_now (NSYNC_CLOCK);
-	abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000));
-	if (nsync_counter_wait (c, NSYNC_CLOCK, abs_deadline) != 1) {
+	start = nsync_time_now ();
+	abs_deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1000));
+	if (nsync_counter_wait (c, abs_deadline) != 1) {
 		TEST_ERROR (t, ("counter is not 1 (1s wait)"));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("timed wait on non-zero counter returned too quickly (1s wait took %s)",
 			   nsync_time_str (waited, 2)));
@@ -76,17 +75,17 @@ static void test_counter_non_zero (testing t) {
 	if (nsync_counter_value (c) != 0) {
 		TEST_ERROR (t, ("zero counter note is not 0 (test)"));
 	}
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_zero) != 0) {
+	if (nsync_counter_wait (c, nsync_time_zero) != 0) {
 		TEST_ERROR (t, ("zero counter note is not 0 (poll)"));
 	}
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_no_deadline) != 0) {
+	if (nsync_counter_wait (c, nsync_time_no_deadline) != 0) {
 		TEST_ERROR (t, ("zero counter note is not 0 (infinite wait)"));
 	}
 	nsync_counter_free (c);
 }
 
 static void decrement_at (nsync_counter c, nsync_time abs_deadline) {
-	nsync_time_sleep_until (NSYNC_CLOCK, abs_deadline);
+	nsync_time_sleep_until (abs_deadline);
 	nsync_counter_add (c, -1);
 }
 
@@ -98,12 +97,12 @@ static void test_counter_decrement (testing t) {
 	nsync_time waited;
 	nsync_counter c = nsync_counter_new (1);
 	closure_fork (closure_decrement (&decrement_at, c,
-		nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000))));
-	start = nsync_time_now (NSYNC_CLOCK);
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_no_deadline) != 0) {
+		nsync_time_add (nsync_time_now (), nsync_time_ms (1000))));
+	start = nsync_time_now ();
+	if (nsync_counter_wait (c, nsync_time_no_deadline) != 0) {
 		TEST_ERROR (t, ("counter is not 0"));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("counter wait too fast (1s delay took %s)", nsync_time_str (waited, 2)));
 	}
@@ -113,22 +112,22 @@ static void test_counter_decrement (testing t) {
 	if (nsync_counter_value (c) != 0) {
 		TEST_ERROR (t, ("counter is not 0 (test)"));
 	}
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_zero) != 0) {
+	if (nsync_counter_wait (c, nsync_time_zero) != 0) {
 		TEST_ERROR (t, ("counter is not 0 (poll)"));
 	}
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_no_deadline) != 0) {
+	if (nsync_counter_wait (c, nsync_time_no_deadline) != 0) {
 		TEST_ERROR (t, ("counter is not 0 (infinite wait)"));
 	}
 	nsync_counter_free (c);
 
 	c = nsync_counter_new (1);
 	closure_fork (closure_decrement (&decrement_at, c,
-		nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000))));
-	start = nsync_time_now (NSYNC_CLOCK);
+		nsync_time_add (nsync_time_now (), nsync_time_ms (1000))));
+	start = nsync_time_now ();
 	while (nsync_counter_value (c) != 0) {
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (10));
+		nsync_time_sleep (nsync_time_ms (10));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("counter wait too fast (1s delay took %s)", nsync_time_str (waited, 2)));
 	}
@@ -138,10 +137,10 @@ static void test_counter_decrement (testing t) {
 	if (nsync_counter_value (c) != 0) {
 		TEST_ERROR (t, ("counter is not 0 (test)"));
 	}
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_zero) != 0) {
+	if (nsync_counter_wait (c, nsync_time_zero) != 0) {
 		TEST_ERROR (t, ("counter is not 0 (poll)"));
 	}
-	if (nsync_counter_wait (c, NSYNC_CLOCK, nsync_time_no_deadline) != 0) {
+	if (nsync_counter_wait (c, nsync_time_no_deadline) != 0) {
 		TEST_ERROR (t, ("counter is not 0 (infinite wait)"));
 	}
 	nsync_counter_free (c);
diff --git a/third_party/nsync/testing/cv2_test.c b/third_party/nsync/testing/cv2_test.c
deleted file mode 100644
index 47cc96485..000000000
--- a/third_party/nsync/testing/cv2_test.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/cv_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-	TEST_RUN (tb, test_cv_deadline);
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/cv3_test.c b/third_party/nsync/testing/cv3_test.c
deleted file mode 100644
index 52e0e9839..000000000
--- a/third_party/nsync/testing/cv3_test.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/cv_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-	TEST_RUN (tb, test_cv_cancel);
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/cv_mu_timeout_stress2_test.c b/third_party/nsync/testing/cv_mu_timeout_stress2_test.c
deleted file mode 100644
index 94127460d..000000000
--- a/third_party/nsync/testing/cv_mu_timeout_stress2_test.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/cv_mu_timeout_stress_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-	TEST_RUN (tb, test_mu_timeout_stress);
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/cv_mu_timeout_stress3_test.c b/third_party/nsync/testing/cv_mu_timeout_stress3_test.c
deleted file mode 100644
index 8b74d34be..000000000
--- a/third_party/nsync/testing/cv_mu_timeout_stress3_test.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/cv_mu_timeout_stress_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-	TEST_RUN (tb, test_mu_cv_timeout_stress);
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/cv_mu_timeout_stress_test.c b/third_party/nsync/testing/cv_mu_timeout_stress_test.c
deleted file mode 100644
index 6c9cf3a63..000000000
--- a/third_party/nsync/testing/cv_mu_timeout_stress_test.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/cv_mu_timeout_stress_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-	TEST_RUN (tb, test_cv_timeout_stress);
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/cv_mu_timeout_stress_test.inc b/third_party/nsync/testing/cv_mu_timeout_stress_test_.c
similarity index 93%
rename from third_party/nsync/testing/cv_mu_timeout_stress_test.inc
rename to third_party/nsync/testing/cv_mu_timeout_stress_test_.c
index ea9f259a9..302ec90f5 100644
--- a/third_party/nsync/testing/cv_mu_timeout_stress_test.inc
+++ b/third_party/nsync/testing/cv_mu_timeout_stress_test_.c
@@ -17,13 +17,11 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/stdio/rand.h"
 #include "libc/str/str.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/cv.h"
 #include "third_party/nsync/mu.h"
 #include "third_party/nsync/mu_wait.h"
 #include "third_party/nsync/testing/closure.h"
 #include "third_party/nsync/testing/smprintf.h"
-#include "libc/dce.h"
 #include "third_party/nsync/testing/testing.h"
 
 /* A cv_stress_data represents the data used by the threads of the tests below. */
@@ -60,8 +58,8 @@ typedef struct cv_stress_data_s {
 /* The delays in cv_stress_inc_loop(), cv_stress_reader_loop(), mu_stress_inc_loop(),
    and mu_stress_reader_loop() are uniformly distributed from 0 to
    STRESS_MAX_DELAY_MICROS-1 microseconds. */
-#define STRESS_MAX_DELAY_MICROS (IsNetbsd() || IsOpenbsd() ? 30000 : 4000)  /* maximum delay */
-#define STRESS_MEAN_DELAY_MICROS (STRESS_MAX_DELAY_MICROS / 2)              /* mean delay */
+#define STRESS_MAX_DELAY_MICROS (4000)                                    /* maximum delay */
+#define STRESS_MEAN_DELAY_MICROS (STRESS_MAX_DELAY_MICROS / 2)               /* mean delay */
 #define STRESS_EXPECT_TIMEOUTS_PER_SEC (1000000 / STRESS_MEAN_DELAY_MICROS) /* expect timeouts/s*/
 
 /* Acquire s.mu, then increment s.count n times, each time
@@ -77,16 +75,16 @@ static void cv_stress_inc_loop (cv_stress_data *s, uintmax_t count_imod4) {
 		nsync_mu_assert_held (&s->mu);
 		while ((s->count & 3) != count_imod4) {
 			nsync_time abs_deadline;
-			abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+			abs_deadline = nsync_time_add (nsync_time_now (),
 				nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 			while (nsync_cv_wait_with_deadline (
 					&s->count_is_imod4[count_imod4],
-					&s->mu, NSYNC_CLOCK, abs_deadline, NULL) != 0 &&
+					&s->mu, abs_deadline, NULL) != 0 &&
 			       (s->count&3) != count_imod4) {
 				nsync_mu_assert_held (&s->mu);
 				s->timeouts++;
 				nsync_mu_assert_held (&s->mu);
-				abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+				abs_deadline = nsync_time_add (nsync_time_now (),
 				       nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 			}
 		}
@@ -129,16 +127,15 @@ static void cv_stress_reader_loop (cv_stress_data *s, uintmax_t count_imod4) {
 		nsync_mu_rassert_held (&s->mu);
 		while ((s->count&3) != count_imod4 && s->refs != 0) {
 			nsync_time abs_deadline;
-			abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+			abs_deadline = nsync_time_add (nsync_time_now (),
 				nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 			while (nsync_cv_wait_with_deadline (&s->count_is_imod4[count_imod4],
-							    &s->mu, NSYNC_CLOCK,
-							    abs_deadline, NULL) != 0 &&
+							    &s->mu, abs_deadline, NULL) != 0 &&
 			       (s->count&3) != count_imod4 && s->refs != 0) {
 
 				nsync_mu_rassert_held (&s->mu);
 				timeouts++;
-				abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+				abs_deadline = nsync_time_add (nsync_time_now (),
 					nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 			}
 		}
@@ -147,7 +144,7 @@ static void cv_stress_reader_loop (cv_stress_data *s, uintmax_t count_imod4) {
 		if ((loops & 0xf) == 0) {
 			nsync_mu_runlock (&s->mu);
 			if ((loops & 0xfff) == 0) {
-				nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (1));
+				nsync_time_sleep (nsync_time_ms (1));
 			}
 			nsync_mu_rlock (&s->mu);
 		}
@@ -237,14 +234,14 @@ static void mu_stress_inc_loop (cv_stress_data *s, condition_func condition,
 		nsync_time abs_deadline;
 		nsync_mu_assert_held (&s->mu);
 
-		abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+		abs_deadline = nsync_time_add (nsync_time_now (),
 			nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 		while (nsync_mu_wait_with_deadline (&s->mu, condition, condition_arg, NULL,
-						    NSYNC_CLOCK, abs_deadline, NULL) != 0) {
+						    abs_deadline, NULL) != 0) {
 			nsync_mu_assert_held (&s->mu);
 			s->timeouts++;
 			nsync_mu_assert_held (&s->mu);
-			abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+			abs_deadline = nsync_time_add (nsync_time_now (),
 				nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 		}
 
@@ -287,14 +284,14 @@ static void mu_stress_reader_loop (cv_stress_data *s, condition_func condition,
 	while (s->refs != 0) {
 		nsync_time abs_deadline;
 		nsync_mu_rassert_held (&s->mu);
-		abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+		abs_deadline = nsync_time_add (nsync_time_now (),
 			nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 		while (nsync_mu_wait_with_deadline (&s->mu, condition, condition_arg, NULL,
-						    NSYNC_CLOCK, abs_deadline, NULL) != 0) {
+						    abs_deadline, NULL) != 0) {
 			nsync_mu_rassert_held (&s->mu);
 			s->timeouts++;
 			nsync_mu_rassert_held (&s->mu);
-			abs_deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK),
+			abs_deadline = nsync_time_add (nsync_time_now (),
 				nsync_time_us (rand () % STRESS_MAX_DELAY_MICROS));
 		}
 
@@ -303,7 +300,7 @@ static void mu_stress_reader_loop (cv_stress_data *s, condition_func condition,
 		if ((loops & 0xf) == 0) {
 			nsync_mu_runlock (&s->mu);
 			if ((loops & 0xfff) == 0) {
-				nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (1));
+				nsync_time_sleep (nsync_time_ms (1));
 			}
 			nsync_mu_rlock (&s->mu);
 		}
@@ -419,7 +416,7 @@ static int run_stress_test (cv_stress_data *s, testing t,
 	nsync_mu_unlock (&s->mu);
 
 	/* Sleep a while to cause many timeouts. */
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (sleep_seconds * 1000));
+	nsync_time_sleep (nsync_time_ms (sleep_seconds * 1000));
 
 	nsync_mu_lock (&s->mu);
 	nsync_mu_assert_held (&s->mu);
@@ -468,7 +465,7 @@ static int run_stress_test (cv_stress_data *s, testing t,
 	nsync_mu_assert_held (&s->mu);
 	nsync_mu_unlock (&s->mu);
 
-	if (nsync_time_cmp (s->deadline, nsync_time_now (NSYNC_CLOCK)) < 0) {
+	if (nsync_time_cmp (s->deadline, nsync_time_now ()) < 0) {
 		if (timeouts_seen < expected_timeouts && !testing_is_uniprocessor (t)) {
 			TEST_ERROR (t, ("%s: expected more than %d timeouts, got %d",
 				   test_name, expected_timeouts, timeouts_seen));
@@ -499,7 +496,7 @@ static void test_cv_timeout_stress (testing t) {
 	uintmax_t loop_count = 3;
 	cv_stress_data s;
 	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (5000));
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (5000));
 	do {
 		bzero ((void *) &s, sizeof (s));
 		s.loop_count = loop_count;
@@ -519,7 +516,7 @@ static void test_mu_timeout_stress (testing t) {
 	uintmax_t loop_count = 3;
 	cv_stress_data s;
 	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (5000));
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (5000));
 	do {
 		bzero ((void *) &s, sizeof (s));
 		s.loop_count = loop_count;
@@ -539,7 +536,7 @@ static void test_mu_cv_timeout_stress (testing t) {
 	uintmax_t loop_count = 3;
 	cv_stress_data s;
 	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (5000));
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (5000));
 	do {
 		bzero ((void *) &s, sizeof (s));
 		s.loop_count = loop_count;
@@ -551,3 +548,11 @@ static void test_mu_cv_timeout_stress (testing t) {
 		loop_count *= 2;
 	} while (!run_stress_test (&s, t, "test_mu_cv_timeout_stress"));
 }
+
+int main (int argc, char *argv[]) {
+	testing_base tb = testing_new (argc, argv, 0);
+	TEST_RUN (tb, test_cv_timeout_stress);
+	TEST_RUN (tb, test_mu_timeout_stress);
+	TEST_RUN (tb, test_mu_cv_timeout_stress);
+	return (testing_base_exit (tb));
+}
diff --git a/third_party/nsync/testing/cv_test.c b/third_party/nsync/testing/cv_test.c
index 09fb43d88..d31413d4f 100644
--- a/third_party/nsync/testing/cv_test.c
+++ b/third_party/nsync/testing/cv_test.c
@@ -15,7 +15,766 @@
 │ See the License for the specific language governing permissions and          │
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/cv_test.inc"
+#include "third_party/nsync/cv.h"
+#include "libc/errno.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "third_party/nsync/debug.h"
+#include "third_party/nsync/mu.h"
+#include "third_party/nsync/mu_wait.h"
+#include "third_party/nsync/note.h"
+#include "third_party/nsync/testing/closure.h"
+#include "third_party/nsync/testing/smprintf.h"
+#include "third_party/nsync/testing/testing.h"
+#include "third_party/nsync/testing/time_extra.h"
+#include "third_party/nsync/time.h"
+
+/* --------------------------- */
+
+/* A cv_queue represents a FIFO queue with up to limit elements.
+   The storage for the queue expands as necessary up to limit. */
+typedef struct cv_queue_s {
+	int limit;          /* max value of count---should not be changed after initialization */
+	nsync_cv non_empty; /* signalled when count transitions from zero to non-zero */
+	nsync_cv non_full;  /* signalled when count transitions from limit to less than limit */
+	nsync_mu mu;        /* protects fields below */
+	int pos;            /* index of first in-use element */
+	int count;          /* number of elements in use */
+	void *data[1];      /* in use elements are data[pos, ..., (pos+count-1)%limit] */
+} cv_queue;
+
+/* Return a pointer to new cv_queue. */
+static cv_queue *cv_queue_new (int limit) {
+	cv_queue *q;
+	int size = offsetof (struct cv_queue_s, data) + sizeof (q->data[0]) * limit;
+	q = (cv_queue *) malloc (size);
+	bzero ((void *) q, size);
+	q->limit = limit;
+	return (q);
+}
+
+/* Add v to the end of the FIFO *q and return non-zero, or if the FIFO already
+   has limit elements and continues to do so until abs_deadline, do nothing and
+   return 0. */
+static int cv_queue_put (cv_queue *q, void *v, nsync_time abs_deadline) {
+	int added = 0;
+	int wake = 0;
+	nsync_mu_lock (&q->mu);
+	while (q->count == q->limit &&
+	       nsync_cv_wait_with_deadline (&q->non_full, &q->mu, abs_deadline, NULL) == 0) {
+	}
+	if (q->count != q->limit) {
+		int i = q->pos + q->count;
+		if (q->limit <= i) {
+			i -= q->limit;
+		}
+		q->data[i] = v;
+		if (q->count == 0) {
+			wake = 1;
+		}
+		q->count++;
+		added = 1;
+	}
+	nsync_mu_unlock (&q->mu);
+	if (wake) {
+		nsync_cv_broadcast (&q->non_empty);
+	}
+	return (added);
+}
+
+/* Remove the first value from the front of the FIFO *q and return it,
+   or if the FIFO is empty and continues to be so until abs_deadline,
+   do nothing and return NULL. */
+static void *cv_queue_get (cv_queue *q, nsync_time abs_deadline) {
+	void *v = NULL;
+	nsync_mu_lock (&q->mu);
+	while (q->count == 0 &&
+	       nsync_cv_wait_with_deadline (&q->non_empty, &q->mu, abs_deadline, NULL) == 0) {
+	}
+	if (q->count != 0) {
+		v = q->data[q->pos];
+		q->data[q->pos] = NULL;
+		if (q->count == q->limit) {
+			nsync_cv_broadcast (&q->non_full);
+		}
+		q->pos++;
+		q->count--;
+		if (q->pos == q->limit) {
+			q->pos = 0;
+		}
+	}
+	nsync_mu_unlock (&q->mu);
+	return (v);
+}
+
+/* --------------------------- */
+
+static char ptr_to_int_c;
+#define INT_TO_PTR(x) ((x) + &ptr_to_int_c)
+#define PTR_TO_INT(p) (((char *) (p)) - &ptr_to_int_c)
+
+/* Put count integers on *q, in the sequence start*3, (start+1)*3, (start+2)*3, .... */
+static void producer_cv_n (testing t, cv_queue *q, int start, int count) {
+	int i;
+	for (i = 0; i != count; i++) {
+		if (!cv_queue_put (q, INT_TO_PTR ((start+i)*3), nsync_time_no_deadline)) {
+			TEST_FATAL (t, ("cv_queue_put() returned 0 with no deadline"));
+		}
+	}
+}
+CLOSURE_DECL_BODY4 (producer_cv_n, testing, cv_queue *, int, int)
+
+/* Get count integers from *q, and check that they are in the
+   sequence start*3, (start+1)*3, (start+2)*3, .... */
+static void consumer_cv_n (testing t, cv_queue *q, int start, int count) {
+	int i;
+	for (i = 0; i != count; i++) {
+		void *v = cv_queue_get (q, nsync_time_no_deadline);
+		int x;
+		if (v == NULL) {
+			TEST_FATAL (t, ("cv_queue_get() returned NULL with no deadline"));
+		}
+		x = PTR_TO_INT (v);
+		if (x != (start+i)*3) {
+			TEST_FATAL (t, ("cv_queue_get() returned bad value; want %d, got %d",
+				   (start+i)*3, x));
+		}
+	}
+}
+
+/* CV_PRODUCER_CONSUMER_N is the number of elements passed from producer to consumer in the
+   test_cv_producer_consumer*() tests below. */
+#define CV_PRODUCER_CONSUMER_N 100000
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**0. */
+static void test_cv_producer_consumer0 (testing t) {
+	cv_queue *q = cv_queue_new (1);
+	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
+	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**1. */
+static void test_cv_producer_consumer1 (testing t) {
+	cv_queue *q = cv_queue_new (10);
+	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
+	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**2. */
+static void test_cv_producer_consumer2 (testing t) {
+	cv_queue *q = cv_queue_new (100);
+	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
+	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**3. */
+static void test_cv_producer_consumer3 (testing t) {
+	cv_queue *q = cv_queue_new (1000);
+	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
+	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**4. */
+static void test_cv_producer_consumer4 (testing t) {
+	cv_queue *q = cv_queue_new (10 * 1000);
+	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
+	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**5. */
+static void test_cv_producer_consumer5 (testing t) {
+	cv_queue *q = cv_queue_new (100 * 1000);
+	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
+	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**6. */
+static void test_cv_producer_consumer6 (testing t) {
+	cv_queue *q = cv_queue_new (1000 * 1000);
+	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
+	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* The following values control how aggressively we police the timeout. */
+#define TOO_EARLY_MS 1
+#define TOO_LATE_MS 100   /* longer, to accommodate scheduling delays */
+#define TOO_LATE_ALLOWED 25         /* number of iterations permitted to violate too_late */
+
+/* Check timeouts on a CV wait_with_deadline(). */
+static void test_cv_deadline (testing t) {
+	int too_late_violations;
+	nsync_mu mu;
+	nsync_cv cv;
+	int i;
+	nsync_time too_early;
+	nsync_time too_late;
+
+	nsync_mu_init (&mu);
+	nsync_cv_init (&cv);
+	too_early = nsync_time_ms (TOO_EARLY_MS);
+	too_late = nsync_time_ms (TOO_LATE_MS);
+	too_late_violations = 0;
+	nsync_mu_lock (&mu);
+	for (i = 0; i != 50; i++) {
+		nsync_time end_time;
+		nsync_time start_time;
+		nsync_time expected_end_time;
+		start_time = nsync_time_now ();
+		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
+		if (nsync_cv_wait_with_deadline (&cv, &mu, expected_end_time,
+						 NULL) != ETIMEDOUT) {
+			TEST_FATAL (t, ("nsync_cv_wait() returned non-expired for a timeout"));
+		}
+		end_time = nsync_time_now ();
+		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
+			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
+			TEST_ERROR (t, ("nsync_cv_wait() returned %s too early", elapsed_str));
+			free (elapsed_str);
+		}
+		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late), end_time) < 0) {
+			too_late_violations++;
+		}
+	}
+	nsync_mu_unlock (&mu);
+	if (too_late_violations > TOO_LATE_ALLOWED) {
+		TEST_ERROR (t, ("nsync_cv_wait() returned too late %d times", too_late_violations));
+	}
+}
+
+/* Check cancellations with nsync_cv_wait_with_deadline(). */
+static void test_cv_cancel (testing t) {
+	nsync_time future_time;
+	int too_late_violations;
+	nsync_mu mu;
+	nsync_cv cv;
+	int i;
+	nsync_time too_early;
+	nsync_time too_late;
+
+	nsync_mu_init (&mu);
+	nsync_cv_init (&cv);
+	too_early = nsync_time_ms (TOO_EARLY_MS);
+	too_late = nsync_time_ms (TOO_LATE_MS);
+
+	/* The loops below cancel after 87 milliseconds, like the timeout tests above. */
+
+	future_time = nsync_time_add (nsync_time_now (), nsync_time_ms (3600000)); /* test cancels with timeout */
+
+	too_late_violations = 0;
+	nsync_mu_lock (&mu);
+	for (i = 0; i != 50; i++) {
+		int x;
+		nsync_note cancel;
+		nsync_time end_time;
+		nsync_time start_time;
+		nsync_time expected_end_time;
+		start_time = nsync_time_now ();
+		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
+
+		cancel = nsync_note_new (NULL, expected_end_time);
+
+		x = nsync_cv_wait_with_deadline (&cv, &mu, future_time, cancel);
+		if (x != ECANCELED) {
+			TEST_FATAL (t, ("nsync_cv_wait() returned non-cancelled (%d) for "
+				   "a cancellation; expected %d",
+				   x, ECANCELED));
+		}
+		end_time = nsync_time_now ();
+		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
+			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
+			TEST_ERROR (t, ("nsync_cv_wait() returned %s too early", elapsed_str));
+			free (elapsed_str);
+		}
+		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late), end_time) < 0) {
+			too_late_violations++;
+		}
+
+		/* Check that an already cancelled wait returns immediately. */
+		start_time = nsync_time_now ();
+
+		x = nsync_cv_wait_with_deadline (&cv, &mu, nsync_time_no_deadline, cancel);
+		if (x != ECANCELED) {
+			TEST_FATAL (t, ("nsync_cv_wait() returned non-cancelled (%d) for "
+				   "a cancellation; expected %d",
+				   x, ECANCELED));
+		}
+		end_time = nsync_time_now ();
+		if (nsync_time_cmp (end_time, start_time) < 0) {
+			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
+			TEST_ERROR (t, ("nsync_cv_wait() returned %s too early", elapsed_str));
+			free (elapsed_str);
+		}
+		if (nsync_time_cmp (nsync_time_add (start_time, too_late), end_time) < 0) {
+			too_late_violations++;
+		}
+		nsync_note_notify (cancel);
+
+		nsync_note_free (cancel);
+	}
+	nsync_mu_unlock (&mu);
+	if (too_late_violations > TOO_LATE_ALLOWED) {
+		TEST_ERROR (t, ("nsync_cv_wait() returned too late %d times", too_late_violations));
+	}
+}
+
+/* --------------------------- */
+
+/* Names of debug results for test_cv_debug. */
+static const char *result_name[] = {
+	"init_mu0",
+	"init_cv0",
+	"init_mu1",
+	"init_cv1",
+	"init_mu2",
+	"init_cv2",
+	"held_mu",
+	"wait0_mu",
+	"wait0_cv",
+	"wait1_mu",
+	"wait1_cv",
+	"wait2_mu",
+	"wait2_cv",
+	"wait3_mu",
+	"wait3_cv",
+	"rheld1_mu",
+	"rheld2_mu",
+	"rheld1again_mu",
+	NULL /* sentinel */
+};
+
+/* state for test_cv_debug() */
+struct debug_state {
+	nsync_mu mu;  /* protects flag field */
+	nsync_cv cv;  /* signalled when flag becomes zero */
+	int flag;     /* 0 => threads proceed; non-zero => threads block */
+
+	/* result[] is an array of nul-terminated string values, accessed via
+	   name (in result_name[]) via slot().  Entries accessed from multiple
+	   threads are protected by result_mu.  */
+	char *result[sizeof (result_name) / sizeof (result_name[0])];
+	nsync_mu result_mu;
+};
+
+/* Return a pointer to the slot in s->result[] associated with the
+   nul-terminated name[] */
+static char **slot (struct debug_state *s, const char *name) {
+	int i = 0;
+	while (result_name[i] != NULL && strcmp (result_name[i], name) != 0) {
+		i++;
+	}
+	if (result_name[i] == NULL) {  /* caller gave non-existent name */
+		abort ();
+	}
+	return (&s->result[i]);
+}
+
+/* Check that the strings associated with nul-terminated strings name0[] and
+   name1[] have the same values in s->result[].  */
+static void check_same (testing t, struct debug_state *s,
+			     const char *name0, const char *name1) {
+	if (strcmp (*slot (s, name0), *slot (s, name1)) != 0) {
+		TEST_ERROR (t, ("nsync_mu_debug_state() %s state != %s state (%s vs. %s)",
+				name0, name1, *slot (s, name0), *slot (s, name1)));
+	}
+}
+
+/* Check that the strings associated with nul-terminated strings name0[] and
+   name1[] have different values in s->result[].  */
+static void check_different (testing t, struct debug_state *s,
+			     const char *name0, const char *name1) {
+	if (strcmp (*slot (s, name0), *slot (s, name1)) == 0) {
+		TEST_ERROR (t, ("nsync_mu_debug_state() %s state == %s state",
+				name0, name1));
+	}
+}
+
+/* Return whether the integer at address v is zero. */
+static int int_is_zero (const void *v) {
+	return (*(int *)v == 0);
+}
+
+/* Acquire and release s->mu in write mode, waiting for s->flag==0
+   using nsync_mu_wait(). */
+static void debug_thread_writer (struct debug_state *s) {
+	nsync_mu_lock (&s->mu);
+	nsync_mu_wait (&s->mu, &int_is_zero, &s->flag, NULL);
+	nsync_mu_unlock (&s->mu);
+}
+
+/* Acquire and release s->mu in write mode, waiting for s->flag==0
+   using nsync_cv_wait(). */
+static void debug_thread_writer_cv (struct debug_state *s) {
+	nsync_mu_lock (&s->mu);
+	while (s->flag != 0) {
+		nsync_cv_wait (&s->cv, &s->mu);
+	}
+	nsync_mu_unlock (&s->mu);
+}
+
+/* Acquire and release s->mu in read mode, waiting for s->flag==0
+   using nsync_mu_wait().
+   If name!=NULL, record state of s->mu while held using name[]. */
+static void debug_thread_reader (struct debug_state *s,
+				 const char *name) {
+	nsync_mu_rlock (&s->mu);
+	nsync_mu_wait (&s->mu, &int_is_zero, &s->flag, NULL);
+	if (name != NULL) {
+		int len = 1024;
+		nsync_mu_lock (&s->result_mu);
+		*slot (s, name) = nsync_mu_debug_state_and_waiters (
+			&s->mu, (char *) malloc (len), len);
+		nsync_mu_unlock (&s->result_mu);
+	}
+	nsync_mu_runlock (&s->mu);
+}
+
+/* Acquire and release s->mu in read mode, waiting for s->flag==0
+   using nsync_cv_wait().
+   If name!=NULL, record state of s->mu while held using name[]. */
+static void debug_thread_reader_cv (struct debug_state *s,
+				    const char *name) {
+	nsync_mu_rlock (&s->mu);
+	while (s->flag != 0) {
+		nsync_cv_wait (&s->cv, &s->mu);
+	}
+	if (name != NULL) {
+		int len = 1024;
+		nsync_mu_lock (&s->result_mu);
+		*slot (s, name) = nsync_mu_debug_state_and_waiters (
+			&s->mu, (char *) malloc (len), len);
+		nsync_mu_unlock (&s->result_mu);
+	}
+	nsync_mu_runlock (&s->mu);
+}
+
+CLOSURE_DECL_BODY1 (debug_thread, struct debug_state *)
+CLOSURE_DECL_BODY2 (debug_thread_reader, struct debug_state *, const char *)
+
+/* Check that nsync_mu_debug_state() and nsync_cv_debug_state()
+   and their variants yield reasonable results.
+
+   The specification of those routines is intentionally loose,
+   so this do not check much, but the various possibilities can be 
+   examined using the verbose testing flag (-v). */
+static void test_cv_debug (testing t) {
+	int i;
+	int len = 1024;
+	char *tmp;
+	char *buf;
+	int buflen;
+	struct debug_state xs;
+	struct debug_state *s = &xs;
+	bzero ((void *) s, sizeof (*s));
+
+	/* Use nsync_*_debugger to check that they work. */
+	tmp = nsync_mu_debugger (&s->mu);
+	buflen = strlen (tmp)+1;
+	buf = (char *) malloc (buflen);
+	snprintf (buf, buflen, "%s", tmp);
+	*slot (s, "init_mu0") = buf;
+
+	tmp = nsync_cv_debugger (&s->cv);
+	buflen = strlen (tmp)+1;
+	buf = (char *) malloc (buflen);
+	snprintf (buf, buflen, "%s", tmp);
+	*slot (s, "init_cv0") = buf;
+
+	/* Get the same information via the other routines */
+	*slot (s, "init_mu1") = nsync_mu_debug_state (
+		&s->mu, (char *) malloc (len), len);
+	*slot (s, "init_cv1") = nsync_cv_debug_state (
+		&s->cv, (char *) malloc (len), len);
+	*slot (s, "init_mu2") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	*slot (s, "init_cv2") = nsync_cv_debug_state_and_waiters (
+		&s->cv, (char *) malloc (len), len);
+
+	nsync_mu_lock (&s->mu);
+	*slot (s, "held_mu") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	nsync_mu_unlock (&s->mu);
+
+	/* set up several threads waiting on the mutex */
+	nsync_mu_lock (&s->mu);
+	s->flag = 1;   /* so thread will block on conditions */
+	closure_fork (closure_debug_thread (&debug_thread_writer, s));
+	closure_fork (closure_debug_thread (&debug_thread_writer, s));
+	closure_fork (closure_debug_thread (&debug_thread_writer, s));
+	closure_fork (closure_debug_thread_reader (&debug_thread_reader, s, NULL));
+	closure_fork (closure_debug_thread (&debug_thread_writer_cv, s));
+	closure_fork (closure_debug_thread (&debug_thread_writer_cv, s));
+	closure_fork (closure_debug_thread (&debug_thread_writer_cv, s));
+	closure_fork (closure_debug_thread_reader (&debug_thread_reader_cv, s, NULL));
+	nsync_time_sleep (nsync_time_ms (500));
+	*slot (s, "wait0_mu") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	*slot (s, "wait0_cv") = nsync_cv_debug_state_and_waiters (
+		&s->cv, (char *) malloc (len), len);
+
+	/* allow the threads to proceed to their conditional waits */
+	nsync_mu_unlock (&s->mu);
+	nsync_time_sleep (nsync_time_ms (500));
+	*slot (s, "wait1_mu") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	*slot (s, "wait1_cv") = nsync_cv_debug_state_and_waiters (
+		&s->cv, (char *) malloc (len), len);
+
+	nsync_mu_lock (&s->mu);
+	/* move cv waiters to mutex queue */
+	nsync_cv_broadcast (&s->cv);
+	*slot (s, "wait2_mu") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	*slot (s, "wait2_cv") = nsync_cv_debug_state_and_waiters (
+		&s->cv, (char *) malloc (len), len);
+
+	/* allow all threads to proceed and exit */
+	s->flag = 0;
+	nsync_mu_unlock (&s->mu);
+	nsync_time_sleep (nsync_time_ms (500));
+	*slot (s, "wait3_mu") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	*slot (s, "wait3_cv") = nsync_cv_debug_state_and_waiters (
+		&s->cv, (char *) malloc (len), len);
+
+	/* Test with more than one reader */
+	nsync_mu_rlock (&s->mu);
+	*slot (s, "rheld1_mu") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	closure_fork (closure_debug_thread_reader (
+		&debug_thread_reader, s, "rheld2_mu"));
+	nsync_time_sleep (nsync_time_ms (500));
+	*slot (s, "rheld1again_mu") = nsync_mu_debug_state_and_waiters (
+		&s->mu, (char *) malloc (len), len);
+	nsync_mu_runlock (&s->mu);
+
+	check_same (t, s, "init_mu0", "init_mu1");
+	check_same (t, s, "init_mu0", "init_mu2");
+	check_same (t, s, "init_cv0", "init_cv1");
+	check_same (t, s, "init_cv0", "init_cv2");
+	check_different (t, s, "init_mu0", "held_mu");
+	check_different (t, s, "rheld1_mu", "held_mu");
+	/* Must acquire result_mu, because the "rheld2_mu" slot is accessed
+	   from the debug_thread_reader() thread created above.  */
+	nsync_mu_lock (&s->result_mu);
+	check_different (t, s, "rheld1_mu", "rheld2_mu");
+	nsync_mu_unlock (&s->result_mu);
+	check_different (t, s, "init_mu0", "init_cv0");
+
+	for (i = 0; result_name[i] != NULL; i++) {
+		if (testing_verbose (t)) {
+			const char *str = *slot (s, result_name[i]);
+			TEST_LOG (t, ("%-16s  %s\n", result_name[i], str));
+		}
+		if (strlen (s->result[i]) == 0) {
+			TEST_ERROR (t, ("nsync_mu_debug_state() %s empty",
+					result_name[i]));
+		}
+		free (s->result[i]);
+	}
+}
+
+/* --------------------------- */
+
+/* Max number of waiter threads used in transfer test.
+   The last uses a conditional critical section, and others
+   use a condition variable.   */
+#define TRANSFER_MAX_WAITERS 8
+
+/* A struct cv_transfer is used to test cv-to-mu thread transfer.
+   There are up to TRANSFER_MAX_WAITERS waiter threads, and a wakeup thread.
+   Some threads wait using conditional critical sections,
+   and others using a condition variable. */
+struct cv_transfer {
+	nsync_mu mu;
+
+	nsync_cv cv;  /* signalled each time a cond[] element becomes non-zero */
+	/* Thread i waits for cond[i] to be non-zero; under mu.  */
+        int cond[TRANSFER_MAX_WAITERS];
+
+	nsync_mu control_mu;  /* protects fields below */
+	nsync_cv done_cv; /* signalled each time an element of done[] becomes non-zero */
+	int ready[TRANSFER_MAX_WAITERS];  /* set by waiters as they wait */
+	int done[TRANSFER_MAX_WAITERS];   /* set by completed waiters: to 1 by readers, and to 2 by writers */
+};
+
+/* Return whether *(int *)v != 0.  Used as a condition for nsync_mu_wait().  */
+static int int_is_non_zero (const void *v) {
+	return (0 != *(const int *)v);
+}
+
+/* Return when *pi becomes non-zero, where *pi is protected by *mu.
+   Acquires and releases *mu. */
+static void transfer_await_nonzero (nsync_mu *mu, int *pi) {
+	nsync_mu_lock (mu);
+	nsync_mu_wait (mu, &int_is_non_zero, pi, NULL);
+	nsync_mu_unlock (mu);
+}
+
+/* Set *pi to x value, where *pi is protected by *mu.
+   Acquires and releases *mu. */
+static void transfer_set (nsync_mu *mu, int *pi, int x) {
+	nsync_mu_lock (mu);
+	*pi = x;
+	nsync_mu_unlock (mu);
+}
+
+/* Lock and unlock routines for writers (index 0), and readers (index 1).  */
+static const struct {
+	void (*lock) (nsync_mu *);
+	void (*unlock) (nsync_mu *);
+} lock_type[2] = {
+	{ &nsync_mu_lock, &nsync_mu_unlock },
+	{ &nsync_mu_rlock, &nsync_mu_runlock },
+};
+
+/* Signal and broadcast routines */
+typedef void (*wakeup_func_type) (nsync_cv *);
+static wakeup_func_type wakeup_func[2] = { &nsync_cv_broadcast, &nsync_cv_signal };
+
+/* Acquire cvt->mu in write or read mode (depending on "reader"),
+   set cvt->ready[i], wait for cvt->cond[i] to become non-zero (using
+   a condition variable if use_cv!=0), then release cvt->mu, and
+   set cvt->done[i].
+   Used as the body of waiter threads created by test_cv_transfer(). */
+static void transfer_waiter_thread (struct cv_transfer *cvt, int i, int reader, int use_cv) {
+	(*lock_type[reader].lock) (&cvt->mu);
+	transfer_set (&cvt->control_mu, &cvt->ready[i], 1);
+	if (use_cv) {
+		while (!cvt->cond[i]) {
+			nsync_cv_wait (&cvt->cv, &cvt->mu);
+		}
+	} else {
+		nsync_mu_wait (&cvt->mu, &int_is_non_zero, &cvt->cond[i], NULL);
+	}
+	(*lock_type[reader].unlock) (&cvt->mu);
+
+	transfer_set (&cvt->control_mu, &cvt->done[i], reader? 1 : 2);
+	nsync_cv_broadcast (&cvt->done_cv);
+}
+
+/* Return whether all the elements a[0..n-1] are less than x. */
+static int are_all_below (int a[], int n, int x) {
+	int i;
+	for (i = 0; i != n && a[i] < x; i++) {
+	}
+	return (i == n);
+}
+
+CLOSURE_DECL_BODY4 (transfer_thread, struct cv_transfer *, int, int, int)
+
+/* Test cv-to-mutex queue transfer.  (See the code in cv.c, wake_waiters().)
+
+   The queue transfer needs to work regardless of:
+   - whether the mutex is also being used with conditional critical sections,
+   - whether reader locks are used,
+   - whether the waker signals from within the critical section (as it would in
+     a traditional monitor), or after that critical section, and
+   - the number of threads that might be awoken.  */
+static void test_cv_transfer (testing t) {
+	int waiters;	 /* number of waiters (in [2, TRANSFER_MAX_WAITERS]). */
+	int cv_writers;  /* number of cv_writers: -1 means all */
+	int ccs_reader; /* ccs waiter is a reader */
+	int wakeup_type; /* bits: use_signal and after_region */
+	enum { use_signal = 0x1 };  /* use signal rather than broadcast */
+	enum { after_region = 0x2 };  /* perform wakeup after region, rather than within */
+	struct cv_transfer Xcvt;
+	struct cv_transfer *cvt = &Xcvt;  /* So all accesses are of form cvt-> */
+	int i;
+
+	/* for all settings of all of wakeup_type, ccs_reader, cv_writers,
+	   and various different numbers of waiters */
+	for (waiters = 2; waiters <= TRANSFER_MAX_WAITERS; waiters <<= 1) {
+		for (wakeup_type = 0; wakeup_type != 4; wakeup_type++) {
+			for (cv_writers = -1; cv_writers != 3; cv_writers++) {
+				for (ccs_reader = 0; ccs_reader != 2; ccs_reader++) {
+					if (testing_verbose (t)) {
+						TEST_LOG (t, ("transfer waiters %d wakeup_type %d  cv_writers %d  ccs_reader %d\n",
+							      waiters, wakeup_type, cv_writers, ccs_reader));
+					}
+					bzero ((void *) cvt, sizeof (*cvt));
+
+					/* Start the waiter threads that use condition variables. */
+					for (i = 0; i < waiters-1; i++) {
+						int is_reader = (cv_writers != -1 && i < waiters-1-cv_writers);
+						closure_fork (closure_transfer_thread (&transfer_waiter_thread, cvt, i,
+										       is_reader, 1/*use_cv*/));
+						transfer_await_nonzero (&cvt->control_mu, &cvt->ready[i]);
+					}
+					/* Start the waiter thread that uses conditional critical sections. */
+					closure_fork (closure_transfer_thread (&transfer_waiter_thread, cvt, i,
+									       ccs_reader, 0/*use_cv*/));
+					/* Wait for all waiters to enter their regions. */
+					for (i = 0; i != waiters; i++) {
+						transfer_await_nonzero (&cvt->control_mu, &cvt->ready[i]);
+					}
+
+					nsync_mu_lock (&cvt->mu);
+					/* At this point, all the waiter threads are in waiting: 
+					   they have set their ready[] flags, and have released cvt->mu. */
+
+					/* Mark all the condition-variable as runnable,
+					   and signal at least one of them.
+					   This may wake more than one, depending on
+					   the presence of readers, and the use of
+					   signal vs broadcast.  */
+					for (i = 0; i != waiters-1; i++) {
+						cvt->cond[i] = 1;
+					}
+					if ((wakeup_type & after_region) == 0) {
+						(*wakeup_func[wakeup_type & use_signal]) (&cvt->cv);
+					}
+					nsync_mu_unlock (&cvt->mu);
+					if ((wakeup_type & after_region) != 0) {
+						for (i = 0; i != waiters-1; i++) {
+							(*wakeup_func[wakeup_type & use_signal]) (&cvt->cv);
+						}
+					}
+
+					/* Wait for at least one woken waiter to proceed,
+					   and at least one writer if there is one.  */
+					nsync_mu_lock (&cvt->control_mu);
+					while (are_all_below (&cvt->done[0], waiters-1, cv_writers!=0? 2 : 1)) {
+						nsync_cv_wait (&cvt->done_cv, &cvt->control_mu);
+					}
+					nsync_mu_unlock (&cvt->control_mu);
+
+					/* Wake all remaining threads. */
+					nsync_cv_broadcast (&cvt->cv);
+					transfer_set (&cvt->mu, &cvt->cond[waiters-1], 1);
+
+					/* And wait for all to finish. */
+					for (i = 0; i != waiters; i++) {
+						transfer_await_nonzero (&cvt->control_mu, &cvt->done[i]);
+					}
+
+					if (testing_verbose (t)) {
+						TEST_LOG (t, ("transfer waiters %d wakeup_type %d  cv_writers %d  ccs_reader %d complete\n",
+							      waiters, wakeup_type, cv_writers, ccs_reader));
+					}
+				}
+			}
+		}
+	}
+}
+
+
+/* --------------------------- */
 
 int main (int argc, char *argv[]) {
 	testing_base tb = testing_new (argc, argv, 0);
@@ -26,6 +785,8 @@ int main (int argc, char *argv[]) {
 	TEST_RUN (tb, test_cv_producer_consumer4);
 	TEST_RUN (tb, test_cv_producer_consumer5);
 	TEST_RUN (tb, test_cv_producer_consumer6);
+	TEST_RUN (tb, test_cv_deadline);
+	TEST_RUN (tb, test_cv_cancel);
 	TEST_RUN (tb, test_cv_debug);
 	TEST_RUN (tb, test_cv_transfer);
 	return (testing_base_exit (tb));
diff --git a/third_party/nsync/testing/cv_test.inc b/third_party/nsync/testing/cv_test.inc
deleted file mode 100644
index 6a1f656b3..000000000
--- a/third_party/nsync/testing/cv_test.inc
+++ /dev/null
@@ -1,774 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/cv.h"
-#include "libc/errno.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "third_party/nsync/debug.h"
-#include "third_party/nsync/mu.h"
-#include "third_party/nsync/mu_wait.h"
-#include "third_party/nsync/note.h"
-#include "third_party/nsync/testing/closure.h"
-#include "third_party/nsync/testing/smprintf.h"
-#include "third_party/nsync/testing/testing.h"
-#include "third_party/nsync/testing/time_extra.h"
-#include "third_party/nsync/time.h"
-
-/* --------------------------- */
-
-/* A cv_queue represents a FIFO queue with up to limit elements.
-   The storage for the queue expands as necessary up to limit. */
-typedef struct cv_queue_s {
-	int limit;          /* max value of count---should not be changed after initialization */
-	nsync_cv non_empty; /* signalled when count transitions from zero to non-zero */
-	nsync_cv non_full;  /* signalled when count transitions from limit to less than limit */
-	nsync_mu mu;        /* protects fields below */
-	int pos;            /* index of first in-use element */
-	int count;          /* number of elements in use */
-	void *data[1];      /* in use elements are data[pos, ..., (pos+count-1)%limit] */
-} cv_queue;
-
-/* Return a pointer to new cv_queue. */
-static cv_queue *cv_queue_new (int limit) {
-	cv_queue *q;
-	int size = offsetof (struct cv_queue_s, data) + sizeof (q->data[0]) * limit;
-	q = (cv_queue *) malloc (size);
-	bzero ((void *) q, size);
-	q->limit = limit;
-	return (q);
-}
-
-/* Add v to the end of the FIFO *q and return non-zero, or if the FIFO already
-   has limit elements and continues to do so until abs_deadline, do nothing and
-   return 0. */
-static int cv_queue_put (cv_queue *q, void *v, nsync_time abs_deadline) {
-	int added = 0;
-	int wake = 0;
-	nsync_mu_lock (&q->mu);
-	while (q->count == q->limit &&
-	       nsync_cv_wait_with_deadline (&q->non_full, &q->mu, NSYNC_CLOCK, abs_deadline, NULL) == 0) {
-	}
-	if (q->count != q->limit) {
-		int i = q->pos + q->count;
-		if (q->limit <= i) {
-			i -= q->limit;
-		}
-		q->data[i] = v;
-		if (q->count == 0) {
-			wake = 1;
-		}
-		q->count++;
-		added = 1;
-	}
-	nsync_mu_unlock (&q->mu);
-	if (wake) {
-		nsync_cv_broadcast (&q->non_empty);
-	}
-	return (added);
-}
-
-/* Remove the first value from the front of the FIFO *q and return it,
-   or if the FIFO is empty and continues to be so until abs_deadline,
-   do nothing and return NULL. */
-static void *cv_queue_get (cv_queue *q, nsync_time abs_deadline) {
-	void *v = NULL;
-	nsync_mu_lock (&q->mu);
-	while (q->count == 0 &&
-	       nsync_cv_wait_with_deadline (&q->non_empty, &q->mu, NSYNC_CLOCK, abs_deadline, NULL) == 0) {
-	}
-	if (q->count != 0) {
-		v = q->data[q->pos];
-		q->data[q->pos] = NULL;
-		if (q->count == q->limit) {
-			nsync_cv_broadcast (&q->non_full);
-		}
-		q->pos++;
-		q->count--;
-		if (q->pos == q->limit) {
-			q->pos = 0;
-		}
-	}
-	nsync_mu_unlock (&q->mu);
-	return (v);
-}
-
-/* --------------------------- */
-
-static char ptr_to_int_c;
-#define INT_TO_PTR(x) ((x) + &ptr_to_int_c)
-#define PTR_TO_INT(p) (((char *) (p)) - &ptr_to_int_c)
-
-/* Put count integers on *q, in the sequence start*3, (start+1)*3, (start+2)*3, .... */
-static void producer_cv_n (testing t, cv_queue *q, int start, int count) {
-	int i;
-	for (i = 0; i != count; i++) {
-		if (!cv_queue_put (q, INT_TO_PTR ((start+i)*3), nsync_time_no_deadline)) {
-			TEST_FATAL (t, ("cv_queue_put() returned 0 with no deadline"));
-		}
-	}
-}
-CLOSURE_DECL_BODY4 (producer_cv_n, testing, cv_queue *, int, int)
-
-/* Get count integers from *q, and check that they are in the
-   sequence start*3, (start+1)*3, (start+2)*3, .... */
-static void consumer_cv_n (testing t, cv_queue *q, int start, int count) {
-	int i;
-	for (i = 0; i != count; i++) {
-		void *v = cv_queue_get (q, nsync_time_no_deadline);
-		int x;
-		if (v == NULL) {
-			TEST_FATAL (t, ("cv_queue_get() returned NULL with no deadline"));
-		}
-		x = PTR_TO_INT (v);
-		if (x != (start+i)*3) {
-			TEST_FATAL (t, ("cv_queue_get() returned bad value; want %d, got %d",
-				   (start+i)*3, x));
-		}
-	}
-}
-
-/* CV_PRODUCER_CONSUMER_N is the number of elements passed from producer to consumer in the
-   test_cv_producer_consumer*() tests below. */
-#define CV_PRODUCER_CONSUMER_N 100000
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**0. */
-static void test_cv_producer_consumer0 (testing t) {
-	cv_queue *q = cv_queue_new (1);
-	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
-	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**1. */
-static void test_cv_producer_consumer1 (testing t) {
-	cv_queue *q = cv_queue_new (10);
-	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
-	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**2. */
-static void test_cv_producer_consumer2 (testing t) {
-	cv_queue *q = cv_queue_new (100);
-	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
-	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**3. */
-static void test_cv_producer_consumer3 (testing t) {
-	cv_queue *q = cv_queue_new (1000);
-	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
-	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**4. */
-static void test_cv_producer_consumer4 (testing t) {
-	cv_queue *q = cv_queue_new (10 * 1000);
-	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
-	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**5. */
-static void test_cv_producer_consumer5 (testing t) {
-	cv_queue *q = cv_queue_new (100 * 1000);
-	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
-	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**6. */
-static void test_cv_producer_consumer6 (testing t) {
-	cv_queue *q = cv_queue_new (1000 * 1000);
-	closure_fork (closure_producer_cv_n (&producer_cv_n, t, q, 0, CV_PRODUCER_CONSUMER_N));
-	consumer_cv_n (t, q, 0, CV_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* The following values control how aggressively we police the timeout. */
-#define TOO_EARLY_MS 1
-#define TOO_LATE_MS 100   /* longer, to accommodate scheduling delays */
-#define TOO_LATE_ALLOWED 25         /* number of iterations permitted to violate too_late */
-
-/* Check timeouts on a CV wait_with_deadline(). */
-static void test_cv_deadline (testing t) {
-	int too_late_violations;
-	nsync_mu mu;
-	nsync_cv cv;
-	int i;
-	nsync_time too_early;
-	nsync_time too_late;
-
-	nsync_mu_init (&mu);
-	nsync_cv_init (&cv);
-	too_early = nsync_time_ms (TOO_EARLY_MS);
-	too_late = nsync_time_ms (TOO_LATE_MS);
-	too_late_violations = 0;
-	nsync_mu_lock (&mu);
-	for (i = 0; i != 50; i++) {
-		nsync_time end_time;
-		nsync_time start_time;
-		nsync_time expected_end_time;
-		start_time = nsync_time_now (NSYNC_CLOCK);
-		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
-		if (nsync_cv_wait_with_deadline (&cv, &mu, NSYNC_CLOCK, expected_end_time,
-						 NULL) != ETIMEDOUT) {
-			TEST_FATAL (t, ("nsync_cv_wait() returned non-expired for a timeout"));
-		}
-		end_time = nsync_time_now (NSYNC_CLOCK);
-		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
-			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
-			TEST_ERROR (t, ("nsync_cv_wait() returned %s too early", elapsed_str));
-			free (elapsed_str);
-		}
-		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late), end_time) < 0) {
-			too_late_violations++;
-		}
-	}
-	nsync_mu_unlock (&mu);
-	if (too_late_violations > TOO_LATE_ALLOWED) {
-		TEST_ERROR (t, ("nsync_cv_wait() returned too late %d times", too_late_violations));
-	}
-}
-
-/* Check cancellations with nsync_cv_wait_with_deadline(). */
-static void test_cv_cancel (testing t) {
-	nsync_time future_time;
-	int too_late_violations;
-	nsync_mu mu;
-	nsync_cv cv;
-	int i;
-	nsync_time too_early;
-	nsync_time too_late;
-
-	nsync_mu_init (&mu);
-	nsync_cv_init (&cv);
-	too_early = nsync_time_ms (TOO_EARLY_MS);
-	too_late = nsync_time_ms (TOO_LATE_MS);
-
-	/* The loops below cancel after 87 milliseconds, like the timeout tests above. */
-
-	future_time = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (3600000)); /* test cancels with timeout */
-
-	too_late_violations = 0;
-	nsync_mu_lock (&mu);
-	for (i = 0; i != 50; i++) {
-		int x;
-		nsync_note cancel;
-		nsync_time end_time;
-		nsync_time start_time;
-		nsync_time expected_end_time;
-		start_time = nsync_time_now (NSYNC_CLOCK);
-		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
-
-		cancel = nsync_note_new (NULL, NSYNC_CLOCK, expected_end_time);
-
-		x = nsync_cv_wait_with_deadline (&cv, &mu, NSYNC_CLOCK, future_time, cancel);
-		if (x != ECANCELED) {
-			TEST_FATAL (t, ("nsync_cv_wait() returned non-cancelled (%d) for "
-				   "a cancellation; expected %d",
-				   x, ECANCELED));
-		}
-		end_time = nsync_time_now (NSYNC_CLOCK);
-		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
-			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
-			TEST_ERROR (t, ("nsync_cv_wait() returned %s too early", elapsed_str));
-			free (elapsed_str);
-		}
-		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late), end_time) < 0) {
-			too_late_violations++;
-		}
-
-		/* Check that an already cancelled wait returns immediately. */
-		start_time = nsync_time_now (NSYNC_CLOCK);
-
-		x = nsync_cv_wait_with_deadline (&cv, &mu, NSYNC_CLOCK, nsync_time_no_deadline, cancel);
-		if (x != ECANCELED) {
-			TEST_FATAL (t, ("nsync_cv_wait() returned non-cancelled (%d) for "
-				   "a cancellation; expected %d",
-				   x, ECANCELED));
-		}
-		end_time = nsync_time_now (NSYNC_CLOCK);
-		if (nsync_time_cmp (end_time, start_time) < 0) {
-			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
-			TEST_ERROR (t, ("nsync_cv_wait() returned %s too early", elapsed_str));
-			free (elapsed_str);
-		}
-		if (nsync_time_cmp (nsync_time_add (start_time, too_late), end_time) < 0) {
-			too_late_violations++;
-		}
-		nsync_note_notify (cancel);
-
-		nsync_note_free (cancel);
-	}
-	nsync_mu_unlock (&mu);
-	if (too_late_violations > TOO_LATE_ALLOWED) {
-		TEST_ERROR (t, ("nsync_cv_wait() returned too late %d times", too_late_violations));
-	}
-}
-
-/* --------------------------- */
-
-/* Names of debug results for test_cv_debug. */
-static const char *result_name[] = {
-	"init_mu0",
-	"init_cv0",
-	"init_mu1",
-	"init_cv1",
-	"init_mu2",
-	"init_cv2",
-	"held_mu",
-	"wait0_mu",
-	"wait0_cv",
-	"wait1_mu",
-	"wait1_cv",
-	"wait2_mu",
-	"wait2_cv",
-	"wait3_mu",
-	"wait3_cv",
-	"rheld1_mu",
-	"rheld2_mu",
-	"rheld1again_mu",
-	NULL /* sentinel */
-};
-
-/* state for test_cv_debug() */
-struct debug_state {
-	nsync_mu mu;  /* protects flag field */
-	nsync_cv cv;  /* signalled when flag becomes zero */
-	int flag;     /* 0 => threads proceed; non-zero => threads block */
-
-	/* result[] is an array of nul-terminated string values, accessed via
-	   name (in result_name[]) via slot().  Entries accessed from multiple
-	   threads are protected by result_mu.  */
-	char *result[sizeof (result_name) / sizeof (result_name[0])];
-	nsync_mu result_mu;
-};
-
-/* Return a pointer to the slot in s->result[] associated with the
-   nul-terminated name[] */
-static char **slot (struct debug_state *s, const char *name) {
-	int i = 0;
-	while (result_name[i] != NULL && strcmp (result_name[i], name) != 0) {
-		i++;
-	}
-	if (result_name[i] == NULL) {  /* caller gave non-existent name */
-		abort ();
-	}
-	return (&s->result[i]);
-}
-
-/* Check that the strings associated with nul-terminated strings name0[] and
-   name1[] have the same values in s->result[].  */
-static void check_same (testing t, struct debug_state *s,
-			     const char *name0, const char *name1) {
-	if (strcmp (*slot (s, name0), *slot (s, name1)) != 0) {
-		TEST_ERROR (t, ("nsync_mu_debug_state() %s state != %s state (%s vs. %s)",
-				name0, name1, *slot (s, name0), *slot (s, name1)));
-	}
-}
-
-/* Check that the strings associated with nul-terminated strings name0[] and
-   name1[] have different values in s->result[].  */
-static void check_different (testing t, struct debug_state *s,
-			     const char *name0, const char *name1) {
-	if (strcmp (*slot (s, name0), *slot (s, name1)) == 0) {
-		TEST_ERROR (t, ("nsync_mu_debug_state() %s state == %s state",
-				name0, name1));
-	}
-}
-
-/* Return whether the integer at address v is zero. */
-static int int_is_zero (const void *v) {
-	return (*(int *)v == 0);
-}
-
-/* Acquire and release s->mu in write mode, waiting for s->flag==0
-   using nsync_mu_wait(). */
-static void debug_thread_writer (struct debug_state *s) {
-	nsync_mu_lock (&s->mu);
-	nsync_mu_wait (&s->mu, &int_is_zero, &s->flag, NULL);
-	nsync_mu_unlock (&s->mu);
-}
-
-/* Acquire and release s->mu in write mode, waiting for s->flag==0
-   using nsync_cv_wait(). */
-static void debug_thread_writer_cv (struct debug_state *s) {
-	nsync_mu_lock (&s->mu);
-	while (s->flag != 0) {
-		nsync_cv_wait (&s->cv, &s->mu);
-	}
-	nsync_mu_unlock (&s->mu);
-}
-
-/* Acquire and release s->mu in read mode, waiting for s->flag==0
-   using nsync_mu_wait().
-   If name!=NULL, record state of s->mu while held using name[]. */
-static void debug_thread_reader (struct debug_state *s,
-				 const char *name) {
-	nsync_mu_rlock (&s->mu);
-	nsync_mu_wait (&s->mu, &int_is_zero, &s->flag, NULL);
-	if (name != NULL) {
-		int len = 1024;
-		nsync_mu_lock (&s->result_mu);
-		*slot (s, name) = nsync_mu_debug_state_and_waiters (
-			&s->mu, (char *) malloc (len), len);
-		nsync_mu_unlock (&s->result_mu);
-	}
-	nsync_mu_runlock (&s->mu);
-}
-
-/* Acquire and release s->mu in read mode, waiting for s->flag==0
-   using nsync_cv_wait().
-   If name!=NULL, record state of s->mu while held using name[]. */
-static void debug_thread_reader_cv (struct debug_state *s,
-				    const char *name) {
-	nsync_mu_rlock (&s->mu);
-	while (s->flag != 0) {
-		nsync_cv_wait (&s->cv, &s->mu);
-	}
-	if (name != NULL) {
-		int len = 1024;
-		nsync_mu_lock (&s->result_mu);
-		*slot (s, name) = nsync_mu_debug_state_and_waiters (
-			&s->mu, (char *) malloc (len), len);
-		nsync_mu_unlock (&s->result_mu);
-	}
-	nsync_mu_runlock (&s->mu);
-}
-
-CLOSURE_DECL_BODY1 (debug_thread, struct debug_state *)
-CLOSURE_DECL_BODY2 (debug_thread_reader, struct debug_state *, const char *)
-
-/* Check that nsync_mu_debug_state() and nsync_cv_debug_state()
-   and their variants yield reasonable results.
-
-   The specification of those routines is intentionally loose,
-   so this do not check much, but the various possibilities can be 
-   examined using the verbose testing flag (-v). */
-static void test_cv_debug (testing t) {
-	int i;
-	int len = 1024;
-	char *tmp;
-	char *buf;
-	int buflen;
-	struct debug_state xs;
-	struct debug_state *s = &xs;
-	bzero ((void *) s, sizeof (*s));
-
-	/* Use nsync_*_debugger to check that they work. */
-	tmp = nsync_mu_debugger (&s->mu);
-	buflen = strlen (tmp)+1;
-	buf = (char *) malloc (buflen);
-	snprintf (buf, buflen, "%s", tmp);
-	*slot (s, "init_mu0") = buf;
-
-	tmp = nsync_cv_debugger (&s->cv);
-	buflen = strlen (tmp)+1;
-	buf = (char *) malloc (buflen);
-	snprintf (buf, buflen, "%s", tmp);
-	*slot (s, "init_cv0") = buf;
-
-	/* Get the same information via the other routines */
-	*slot (s, "init_mu1") = nsync_mu_debug_state (
-		&s->mu, (char *) malloc (len), len);
-	*slot (s, "init_cv1") = nsync_cv_debug_state (
-		&s->cv, (char *) malloc (len), len);
-	*slot (s, "init_mu2") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	*slot (s, "init_cv2") = nsync_cv_debug_state_and_waiters (
-		&s->cv, (char *) malloc (len), len);
-
-	nsync_mu_lock (&s->mu);
-	*slot (s, "held_mu") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	nsync_mu_unlock (&s->mu);
-
-	/* set up several threads waiting on the mutex */
-	nsync_mu_lock (&s->mu);
-	s->flag = 1;   /* so thread will block on conditions */
-	closure_fork (closure_debug_thread (&debug_thread_writer, s));
-	closure_fork (closure_debug_thread (&debug_thread_writer, s));
-	closure_fork (closure_debug_thread (&debug_thread_writer, s));
-	closure_fork (closure_debug_thread_reader (&debug_thread_reader, s, NULL));
-	closure_fork (closure_debug_thread (&debug_thread_writer_cv, s));
-	closure_fork (closure_debug_thread (&debug_thread_writer_cv, s));
-	closure_fork (closure_debug_thread (&debug_thread_writer_cv, s));
-	closure_fork (closure_debug_thread_reader (&debug_thread_reader_cv, s, NULL));
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (500));
-	*slot (s, "wait0_mu") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	*slot (s, "wait0_cv") = nsync_cv_debug_state_and_waiters (
-		&s->cv, (char *) malloc (len), len);
-
-	/* allow the threads to proceed to their conditional waits */
-	nsync_mu_unlock (&s->mu);
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (500));
-	*slot (s, "wait1_mu") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	*slot (s, "wait1_cv") = nsync_cv_debug_state_and_waiters (
-		&s->cv, (char *) malloc (len), len);
-
-	nsync_mu_lock (&s->mu);
-	/* move cv waiters to mutex queue */
-	nsync_cv_broadcast (&s->cv);
-	*slot (s, "wait2_mu") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	*slot (s, "wait2_cv") = nsync_cv_debug_state_and_waiters (
-		&s->cv, (char *) malloc (len), len);
-
-	/* allow all threads to proceed and exit */
-	s->flag = 0;
-	nsync_mu_unlock (&s->mu);
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (500));
-	*slot (s, "wait3_mu") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	*slot (s, "wait3_cv") = nsync_cv_debug_state_and_waiters (
-		&s->cv, (char *) malloc (len), len);
-
-	/* Test with more than one reader */
-	nsync_mu_rlock (&s->mu);
-	*slot (s, "rheld1_mu") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	closure_fork (closure_debug_thread_reader (
-		&debug_thread_reader, s, "rheld2_mu"));
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (500));
-	*slot (s, "rheld1again_mu") = nsync_mu_debug_state_and_waiters (
-		&s->mu, (char *) malloc (len), len);
-	nsync_mu_runlock (&s->mu);
-
-	check_same (t, s, "init_mu0", "init_mu1");
-	check_same (t, s, "init_mu0", "init_mu2");
-	check_same (t, s, "init_cv0", "init_cv1");
-	check_same (t, s, "init_cv0", "init_cv2");
-	check_different (t, s, "init_mu0", "held_mu");
-	check_different (t, s, "rheld1_mu", "held_mu");
-	/* Must acquire result_mu, because the "rheld2_mu" slot is accessed
-	   from the debug_thread_reader() thread created above.  */
-	nsync_mu_lock (&s->result_mu);
-	check_different (t, s, "rheld1_mu", "rheld2_mu");
-	nsync_mu_unlock (&s->result_mu);
-	check_different (t, s, "init_mu0", "init_cv0");
-
-	for (i = 0; result_name[i] != NULL; i++) {
-		if (testing_verbose (t)) {
-			const char *str = *slot (s, result_name[i]);
-			TEST_LOG (t, ("%-16s  %s\n", result_name[i], str));
-		}
-		if (strlen (s->result[i]) == 0) {
-			TEST_ERROR (t, ("nsync_mu_debug_state() %s empty",
-					result_name[i]));
-		}
-		free (s->result[i]);
-	}
-}
-
-/* --------------------------- */
-
-/* Max number of waiter threads used in transfer test.
-   The last uses a conditional critical section, and others
-   use a condition variable.   */
-#define TRANSFER_MAX_WAITERS 8
-
-/* A struct cv_transfer is used to test cv-to-mu thread transfer.
-   There are up to TRANSFER_MAX_WAITERS waiter threads, and a wakeup thread.
-   Some threads wait using conditional critical sections,
-   and others using a condition variable. */
-struct cv_transfer {
-	nsync_mu mu;
-
-	nsync_cv cv;  /* signalled each time a cond[] element becomes non-zero */
-	/* Thread i waits for cond[i] to be non-zero; under mu.  */
-        int cond[TRANSFER_MAX_WAITERS];
-
-	nsync_mu control_mu;  /* protects fields below */
-	nsync_cv done_cv; /* signalled each time an element of done[] becomes non-zero */
-	int ready[TRANSFER_MAX_WAITERS];  /* set by waiters as they wait */
-	int done[TRANSFER_MAX_WAITERS];   /* set by completed waiters: to 1 by readers, and to 2 by writers */
-};
-
-/* Return whether *(int *)v != 0.  Used as a condition for nsync_mu_wait().  */
-static int int_is_non_zero (const void *v) {
-	return (0 != *(const int *)v);
-}
-
-/* Return when *pi becomes non-zero, where *pi is protected by *mu.
-   Acquires and releases *mu. */
-static void transfer_await_nonzero (nsync_mu *mu, int *pi) {
-	nsync_mu_lock (mu);
-	nsync_mu_wait (mu, &int_is_non_zero, pi, NULL);
-	nsync_mu_unlock (mu);
-}
-
-/* Set *pi to x value, where *pi is protected by *mu.
-   Acquires and releases *mu. */
-static void transfer_set (nsync_mu *mu, int *pi, int x) {
-	nsync_mu_lock (mu);
-	*pi = x;
-	nsync_mu_unlock (mu);
-}
-
-/* Lock and unlock routines for writers (index 0), and readers (index 1).  */
-static const struct {
-	void (*lock) (nsync_mu *);
-	void (*unlock) (nsync_mu *);
-} lock_type[2] = {
-	{ &nsync_mu_lock, &nsync_mu_unlock },
-	{ &nsync_mu_rlock, &nsync_mu_runlock },
-};
-
-/* Signal and broadcast routines */
-typedef void (*wakeup_func_type) (nsync_cv *);
-static wakeup_func_type wakeup_func[2] = { &nsync_cv_broadcast, &nsync_cv_signal };
-
-/* Acquire cvt->mu in write or read mode (depending on "reader"),
-   set cvt->ready[i], wait for cvt->cond[i] to become non-zero (using
-   a condition variable if use_cv!=0), then release cvt->mu, and
-   set cvt->done[i].
-   Used as the body of waiter threads created by test_cv_transfer(). */
-static void transfer_waiter_thread (struct cv_transfer *cvt, int i, int reader, int use_cv) {
-	(*lock_type[reader].lock) (&cvt->mu);
-	transfer_set (&cvt->control_mu, &cvt->ready[i], 1);
-	if (use_cv) {
-		while (!cvt->cond[i]) {
-			nsync_cv_wait (&cvt->cv, &cvt->mu);
-		}
-	} else {
-		nsync_mu_wait (&cvt->mu, &int_is_non_zero, &cvt->cond[i], NULL);
-	}
-	(*lock_type[reader].unlock) (&cvt->mu);
-
-	transfer_set (&cvt->control_mu, &cvt->done[i], reader? 1 : 2);
-	nsync_cv_broadcast (&cvt->done_cv);
-}
-
-/* Return whether all the elements a[0..n-1] are less than x. */
-static int are_all_below (int a[], int n, int x) {
-	int i;
-	for (i = 0; i != n && a[i] < x; i++) {
-	}
-	return (i == n);
-}
-
-CLOSURE_DECL_BODY4 (transfer_thread, struct cv_transfer *, int, int, int)
-
-/* Test cv-to-mutex queue transfer.  (See the code in cv.c, wake_waiters().)
-
-   The queue transfer needs to work regardless of:
-   - whether the mutex is also being used with conditional critical sections,
-   - whether reader locks are used,
-   - whether the waker signals from within the critical section (as it would in
-     a traditional monitor), or after that critical section, and
-   - the number of threads that might be awoken.  */
-static void test_cv_transfer (testing t) {
-	int waiters;	 /* number of waiters (in [2, TRANSFER_MAX_WAITERS]). */
-	int cv_writers;  /* number of cv_writers: -1 means all */
-	int ccs_reader; /* ccs waiter is a reader */
-	int wakeup_type; /* bits: use_signal and after_region */
-	enum { use_signal = 0x1 };  /* use signal rather than broadcast */
-	enum { after_region = 0x2 };  /* perform wakeup after region, rather than within */
-	struct cv_transfer Xcvt;
-	struct cv_transfer *cvt = &Xcvt;  /* So all accesses are of form cvt-> */
-	int i;
-
-	/* for all settings of all of wakeup_type, ccs_reader, cv_writers,
-	   and various different numbers of waiters */
-	for (waiters = 2; waiters <= TRANSFER_MAX_WAITERS; waiters <<= 1) {
-		for (wakeup_type = 0; wakeup_type != 4; wakeup_type++) {
-			for (cv_writers = -1; cv_writers != 3; cv_writers++) {
-				for (ccs_reader = 0; ccs_reader != 2; ccs_reader++) {
-					if (testing_verbose (t)) {
-						TEST_LOG (t, ("transfer waiters %d wakeup_type %d  cv_writers %d  ccs_reader %d\n",
-							      waiters, wakeup_type, cv_writers, ccs_reader));
-					}
-					bzero ((void *) cvt, sizeof (*cvt));
-
-					/* Start the waiter threads that use condition variables. */
-					for (i = 0; i < waiters-1; i++) {
-						int is_reader = (cv_writers != -1 && i < waiters-1-cv_writers);
-						closure_fork (closure_transfer_thread (&transfer_waiter_thread, cvt, i,
-										       is_reader, 1/*use_cv*/));
-						transfer_await_nonzero (&cvt->control_mu, &cvt->ready[i]);
-					}
-					/* Start the waiter thread that uses conditional critical sections. */
-					closure_fork (closure_transfer_thread (&transfer_waiter_thread, cvt, i,
-									       ccs_reader, 0/*use_cv*/));
-					/* Wait for all waiters to enter their regions. */
-					for (i = 0; i != waiters; i++) {
-						transfer_await_nonzero (&cvt->control_mu, &cvt->ready[i]);
-					}
-
-					nsync_mu_lock (&cvt->mu);
-					/* At this point, all the waiter threads are in waiting: 
-					   they have set their ready[] flags, and have released cvt->mu. */
-
-					/* Mark all the condition-variable as runnable,
-					   and signal at least one of them.
-					   This may wake more than one, depending on
-					   the presence of readers, and the use of
-					   signal vs broadcast.  */
-					for (i = 0; i != waiters-1; i++) {
-						cvt->cond[i] = 1;
-					}
-					if ((wakeup_type & after_region) == 0) {
-						(*wakeup_func[wakeup_type & use_signal]) (&cvt->cv);
-					}
-					nsync_mu_unlock (&cvt->mu);
-					if ((wakeup_type & after_region) != 0) {
-						for (i = 0; i != waiters-1; i++) {
-							(*wakeup_func[wakeup_type & use_signal]) (&cvt->cv);
-						}
-					}
-
-					/* Wait for at least one woken waiter to proceed,
-					   and at least one writer if there is one.  */
-					nsync_mu_lock (&cvt->control_mu);
-					while (are_all_below (&cvt->done[0], waiters-1, cv_writers!=0? 2 : 1)) {
-						nsync_cv_wait (&cvt->done_cv, &cvt->control_mu);
-					}
-					nsync_mu_unlock (&cvt->control_mu);
-
-					/* Wake all remaining threads. */
-					nsync_cv_broadcast (&cvt->cv);
-					transfer_set (&cvt->mu, &cvt->cond[waiters-1], 1);
-
-					/* And wait for all to finish. */
-					for (i = 0; i != waiters; i++) {
-						transfer_await_nonzero (&cvt->control_mu, &cvt->done[i]);
-					}
-
-					if (testing_verbose (t)) {
-						TEST_LOG (t, ("transfer waiters %d wakeup_type %d  cv_writers %d  ccs_reader %d complete\n",
-							      waiters, wakeup_type, cv_writers, ccs_reader));
-					}
-				}
-			}
-		}
-	}
-}
diff --git a/third_party/nsync/testing/cv_wait_example.c b/third_party/nsync/testing/cv_wait_example_test.c
similarity index 95%
rename from third_party/nsync/testing/cv_wait_example.c
rename to third_party/nsync/testing/cv_wait_example_test.c
index f94092987..20f7ac000 100644
--- a/third_party/nsync/testing/cv_wait_example.c
+++ b/third_party/nsync/testing/cv_wait_example_test.c
@@ -18,14 +18,12 @@
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "third_party/nsync/array.internal.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/cv.h"
 #include "third_party/nsync/heap.internal.h"
 #include "third_party/nsync/mu.h"
 #include "third_party/nsync/testing/closure.h"
 #include "third_party/nsync/testing/smprintf.h"
 #include "third_party/nsync/testing/testing.h"
-#include "libc/dce.h"
 #include "third_party/nsync/testing/time_extra.h"
 
 /* Example use of CV.wait():  A priority queue of strings whose
@@ -76,8 +74,7 @@ static const char *string_priority_queue_cv_remove_with_deadline (string_priorit
 	const char *s = NULL;
 	nsync_mu_lock (&q->mu);
 	while (A_LEN (&q->heap) == 0 &&
-	       nsync_cv_wait_with_deadline (&q->non_empty, &q->mu, NSYNC_CLOCK,
-					    abs_deadline, NULL) == 0) {
+	       nsync_cv_wait_with_deadline (&q->non_empty, &q->mu, abs_deadline, NULL) == 0) {
 	}
 	alen = A_LEN (&q->heap);
 	if (alen != 0) {
@@ -102,7 +99,7 @@ static void add_and_wait_cv (string_priority_queue_cv *q, nsync_time delay,
 	int i;
 	for (i = 0; i != n; i++) {
 		string_priority_queue_cv_add (q, s[i]);
-		nsync_time_sleep (NSYNC_CLOCK, delay);
+		nsync_time_sleep (delay);
 	}
 }
 
@@ -124,7 +121,7 @@ static void a_char_append (a_char *a, const char *str) {
 static void remove_and_print_cv (string_priority_queue_cv *q, nsync_time delay, a_char *output) {
 	const char *s;
 	if ((s = string_priority_queue_cv_remove_with_deadline (
-			q, nsync_time_add (nsync_time_now (NSYNC_CLOCK), delay))) != NULL) {
+			q, nsync_time_add (nsync_time_now(), delay))) != NULL) {
 		a_char_append (output, s);
 		a_char_append (output, "\n");
 	} else {
@@ -158,7 +155,7 @@ static void example_cv_wait (testing t) {
 					       nsync_time_ms (500), NELEM (input), input));
 
 	/* delay: "one", "two", "three" are queued; not "four" */
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (1200));
+	nsync_time_sleep (nsync_time_ms (1200));
 
 	remove_and_print_cv (&q, nsync_time_ms (1000), &output);    /* "one" */
 	remove_and_print_cv (&q, nsync_time_ms (1000), &output);    /* "three" (less than "two") */
diff --git a/third_party/nsync/testing/mu2_test.c b/third_party/nsync/testing/mu2_test.c
deleted file mode 100644
index 938100063..000000000
--- a/third_party/nsync/testing/mu2_test.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/mu_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-
-	TEST_RUN (tb, test_mutex_nthread);
-	TEST_RUN (tb, test_xmutex_nthread);
-
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/mu3_test.c b/third_party/nsync/testing/mu3_test.c
deleted file mode 100644
index 2aac65baa..000000000
--- a/third_party/nsync/testing/mu3_test.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/mu_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-
-	TEST_RUN (tb, test_rwmutex_nthread);
-	TEST_RUN (tb, test_try_mu_nthread);
-
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/mu_starvation_test_.c b/third_party/nsync/testing/mu_starvation_test.c
similarity index 86%
rename from third_party/nsync/testing/mu_starvation_test_.c
rename to third_party/nsync/testing/mu_starvation_test.c
index a84e31af9..f6df7cd5c 100644
--- a/third_party/nsync/testing/mu_starvation_test_.c
+++ b/third_party/nsync/testing/mu_starvation_test.c
@@ -44,7 +44,7 @@ static void starve_data_init (starve_data *sd, int threads) {
 	bzero ((void *) sd, sizeof (*sd));
 	sd->not_yet_started = threads;
 	sd->not_yet_done = threads;
-	sd->start = nsync_time_now (NSYNC_CLOCK);
+	sd->start = nsync_time_now ();
 }
 
 /* Loop until *cancel or deadline, and on each iteration
@@ -62,9 +62,9 @@ static void starve_with_readers (starve_data *sd, nsync_time period,
 	sd->not_yet_started--;
 	nsync_mu_unlock (&sd->control_mu);
 
-	for (now = nsync_time_now (NSYNC_CLOCK);
+	for (now = nsync_time_now ();
 	     !sd->cancel && nsync_time_cmp (now, deadline) < 0;
-	     now = nsync_time_now (NSYNC_CLOCK)) {
+	     now = nsync_time_now ()) {
 		uint32_t new_us;
 		uint32_t now_us = (uint32_t) (nsync_time_to_dbl (nsync_time_sub (now, sd->start)) * 1e6);
 		uint32_t index = (now_us + period_us - 1) / period_us;
@@ -72,7 +72,7 @@ static void starve_with_readers (starve_data *sd, nsync_time period,
 			index++;
 		}
 		new_us = index * period_us;
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_from_dbl (1e-6 * (double) (new_us-now_us)));
+		nsync_time_sleep (nsync_time_from_dbl (1e-6 * (double) (new_us-now_us)));
 		nsync_mu_runlock (&sd->mu);
 		nsync_mu_rlock (&sd->mu);
 	}
@@ -113,7 +113,7 @@ static void test_starve_with_readers (testing t) {
 	starve_data_init (&sd, 2); /* two threads, started below */
 
 	/* Threads run for at most 10s. */
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (10000));
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (10000));
 
 	/* These two threads will try to hold a reader lock
 	   continuously until cancel is set or deadline is reached,
@@ -130,9 +130,9 @@ static void test_starve_with_readers (testing t) {
 
 	/* If using an nsync_mu, use nsync_mu_trylock() to attempt to acquire while the
 	   readers are hogging the lock.  We expect no acquisitions to succeed. */
-	finish = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (500));
+	finish = nsync_time_add (nsync_time_now (), nsync_time_ms (500));
 	trylock_acquires = 0; /* number of acquires */
-	while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), finish) < 0) {
+	while (nsync_time_cmp (nsync_time_now (), finish) < 0) {
 		if (nsync_mu_trylock (&sd.mu)) {
 			trylock_acquires++;
 			nsync_mu_unlock (&sd.mu);
@@ -147,15 +147,15 @@ static void test_starve_with_readers (testing t) {
 	/* Use nsync_mu_lock() to attempt to acquire while the readers are hogging
 	   the lock.  We expect several acquisitions to succeed. */
 	expected_lo = 2;
-	finish = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (5000));
+	finish = nsync_time_add (nsync_time_now (), nsync_time_ms (5000));
 	lock_acquires = 0; /* number of acquires */
-	while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), finish) < 0 && lock_acquires < expected_lo) {
+	while (nsync_time_cmp (nsync_time_now (), finish) < 0 && lock_acquires < expected_lo) {
 		nsync_mu_lock (&sd.mu);
 		lock_acquires++;
 		nsync_mu_unlock (&sd.mu);
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (1));
+		nsync_time_sleep (nsync_time_ms (1));
 	}
-	if (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) > 0 && lock_acquires == 1) {
+	if (nsync_time_cmp (nsync_time_now (), deadline) > 0 && lock_acquires == 1) {
 		lock_acquires = 0; /* hog threads timed out */
 	}
 	if (lock_acquires < expected_lo) {
@@ -185,10 +185,10 @@ static void starve_with_writer (starve_data *sd, nsync_time hold_time,
 	sd->not_yet_started--;
 	nsync_mu_unlock (&sd->control_mu);
 
-	for (now = nsync_time_now (NSYNC_CLOCK);
+	for (now = nsync_time_now ();
 	     !sd->cancel && nsync_time_cmp (now, deadline) < 0;
-	     now = nsync_time_now (NSYNC_CLOCK)) {
-		nsync_time_sleep (NSYNC_CLOCK, hold_time);
+	     now = nsync_time_now ()) {
+		nsync_time_sleep (hold_time);
 		nsync_mu_unlock (&sd->mu);
 		nsync_mu_lock (&sd->mu);
 	}
@@ -231,7 +231,7 @@ static void test_starve_with_writer (testing t) {
 	nsync_time deadline;
 	starve_data sd;
 	starve_data_init (&sd, 1); /* one thread, started below */
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (25000)); /* runs for at most 25s. */
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (25000)); /* runs for at most 25s. */
 
 	/* This thread will try to hold a writer lock almost
 	   continuously, releasing momentarily every 10ms. */
@@ -249,9 +249,9 @@ static void test_starve_with_writer (testing t) {
 		/* Use nsync_mu_trylock() to attempt to acquire while the writer is hogging the
 		   lock.  We expect some acquisitions to succeed. */
 		expected_lo = 1;
-		finish = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (30000));
+		finish = nsync_time_add (nsync_time_now (), nsync_time_ms (30000));
 		trylock_acquires = 0; /* number of acquires */
-		while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), finish) < 0 && trylock_acquires < expected_lo) {
+		while (nsync_time_cmp (nsync_time_now (), finish) < 0 && trylock_acquires < expected_lo) {
 			if (nsync_mu_trylock (&sd.mu)) {
 				trylock_acquires++;
 				nsync_mu_unlock (&sd.mu);
@@ -269,9 +269,9 @@ static void test_starve_with_writer (testing t) {
 		/* Use nsync_mu_rtrylock() to attempt to read-acquire while the writer is
 		   hogging the lock.  We expect some acquisitions to succeed. */
 		expected_lo = 1;
-		finish = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (30000));
+		finish = nsync_time_add (nsync_time_now (), nsync_time_ms (30000));
 		rtrylock_acquires = 0; /* number of acquires */
-		while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), finish) < 0 && rtrylock_acquires < expected_lo) {
+		while (nsync_time_cmp (nsync_time_now (), finish) < 0 && rtrylock_acquires < expected_lo) {
 			if (nsync_mu_rtrylock (&sd.mu)) {
 				rtrylock_acquires++;
 				nsync_mu_runlock (&sd.mu);
@@ -288,15 +288,15 @@ static void test_starve_with_writer (testing t) {
 	/* Use nsync_mu_lock() to attempt to acquire while the writer is hogging
 	   the lock.  We expect several acquisitions to succeed. */
 	expected_lo = 2;
-	finish = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (5000));
+	finish = nsync_time_add (nsync_time_now (), nsync_time_ms (5000));
 	lock_acquires = 0; /* number of acquires */
-	while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), finish) < 0 && lock_acquires < expected_lo) {
+	while (nsync_time_cmp (nsync_time_now (), finish) < 0 && lock_acquires < expected_lo) {
 		nsync_mu_lock (&sd.mu);
 		lock_acquires++;
 		nsync_mu_unlock (&sd.mu);
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (2));
+		nsync_time_sleep (nsync_time_ms (2));
 	}
-	if (lock_acquires == 1 && nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) > 0) {
+	if (lock_acquires == 1 && nsync_time_cmp (nsync_time_now (), deadline) > 0) {
 		lock_acquires = 0; /* hog thread timed out */
 	}
 	if (lock_acquires < expected_lo) {
@@ -310,16 +310,16 @@ static void test_starve_with_writer (testing t) {
 	   time----it means that a writer couldn't break in (the test case
 	   above failed), so a reader is unlikely to manage it either. */
 	expected_lo = 2;
-	finish = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (5000));
+	finish = nsync_time_add (nsync_time_now (), nsync_time_ms (5000));
 	rlock_acquires = 0; /* number of acquires */
 	if (nsync_time_cmp (finish, deadline) < 0) {
-		while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), finish) < 0 && rlock_acquires < expected_lo) {
+		while (nsync_time_cmp (nsync_time_now (), finish) < 0 && rlock_acquires < expected_lo) {
 			nsync_mu_rlock (&sd.mu);
 			rlock_acquires++;
 			nsync_mu_runlock (&sd.mu);
-			nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (2));
+			nsync_time_sleep (nsync_time_ms (2));
 		}
-		if (rlock_acquires == 1 && nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) > 0) {
+		if (rlock_acquires == 1 && nsync_time_cmp (nsync_time_now (), deadline) > 0) {
 			rlock_acquires = 0; /* hog thread timed out */
 		}
 		if (rlock_acquires < expected_lo) {
diff --git a/third_party/nsync/testing/mu_test.c b/third_party/nsync/testing/mu_test.c
index fb4713f31..5229e6957 100644
--- a/third_party/nsync/testing/mu_test.c
+++ b/third_party/nsync/testing/mu_test.c
@@ -15,23 +15,1056 @@
 │ See the License for the specific language governing permissions and          │
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/mu_test.inc"
+#include "third_party/nsync/mu.h"
+#include "libc/calls/calls.h"
+#include "libc/str/str.h"
+#include "libc/thread/thread.h"
+#include "third_party/nsync/cv.h"
+#include "third_party/nsync/mu_wait.h"
+#include "third_party/nsync/testing/closure.h"
+#include "third_party/nsync/testing/smprintf.h"
+#include "third_party/nsync/testing/testing.h"
+#include "third_party/nsync/testing/time_extra.h"
+
+/* The state shared between the threads in each of the tests below. */
+typedef struct test_data_s {
+	testing t;
+	int n_threads;  /* Number of test threads; constant after init. */
+	int loop_count; /* Iteration count for each test thread; constant after init */
+	
+	/* mu_in_use protects i, id, loop_count, and finished_threads. */
+	void *mu_in_use; /* points at mu, mutex, or rwmutex depending on which is in use. */
+	void (*lock) (void *);  /* operations on mu_in_use */
+	void (*unlock) (void *);
+	
+	nsync_mu mu;
+	pthread_mutex_t mutex;
+	pthread_rwlock_t rwmutex;
+	
+	int i; /* counter incremented by test loops. */
+	volatile int id; /* id of current lock-holding thread in some tests. */
+	
+	nsync_cv done; /* Signalled when finished_threads==n_threads. */
+	int finished_threads;      /* Count of threads that have finished. */
+} test_data;
+
+/* Indicate that a thread has finished its operations on test_data
+   by incrementing td.finished_threads, and signal td.done when it reaches td.n_threads.
+   See test_data_wait_for_all_threads(). */
+static void test_data_thread_finished (test_data *td) {
+	(*td->lock) (td->mu_in_use);
+	td->finished_threads++;
+	if (td->finished_threads == td->n_threads) {
+		nsync_cv_broadcast (&td->done);
+	}
+	(*td->unlock) (td->mu_in_use);
+}
+
+/* Wait until all td.n_threads have called test_data_thread_finished(),
+   and then return. */
+static void test_data_wait_for_all_threads (test_data *td) {
+	(*td->lock) (td->mu_in_use);
+	while (td->finished_threads != td->n_threads) {
+		nsync_cv_wait_with_deadline_generic (&td->done, td->mu_in_use,
+						     td->lock, td->unlock,
+						     nsync_time_no_deadline, NULL);
+	}
+	(*td->unlock) (td->mu_in_use);
+}
+
+/* --------------------------------------- */
+
+/* The body of each thread executed by test_mu_nthread()
+   and test_mutex_nthread.
+   *td represents the test data that the threads share, and id is an integer
+   unique to each test thread. */
+static void counting_loop (test_data *td, int id) {
+	int n = td->loop_count;
+	int i = 0;
+	for (i = 0; i != n; i++) {
+		(*td->lock) (td->mu_in_use);
+		td->id = id;
+		td->i++;
+		if (td->id != id) {
+			testing_panic ("td->id != id");
+		}
+		(*td->unlock) (td->mu_in_use);
+	}
+	test_data_thread_finished (td);
+}
+
+CLOSURE_DECL_BODY2 (counting, test_data *, int)
+
+/* Versions of nsync_mu_lock() and nsync_mu_unlock() that take "void *"
+   arguments, to avoid call through a function pointer of a different type,
+   which is undefined.  */
+static void void_mu_lock (void *mu) {
+	nsync_mu_lock ((nsync_mu *) mu);
+}
+static void void_mu_unlock (void *mu) {
+	nsync_mu_unlock((nsync_mu *) mu);
+}
+
+/* Create a few threads, each of which increments an
+   integer a fixed number of times, using an nsync_mu for mutual exclusion.
+   It checks that the integer is incremented the correct number of times. */
+static void test_mu_nthread (testing t) {
+	int loop_count = 100000;
+	nsync_time deadline;
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1500));
+	do {
+		int i;
+		test_data td;
+		bzero ((void *) &td, sizeof (td));
+		td.t = t;
+		td.n_threads = 5;
+		td.loop_count = loop_count;
+		td.mu_in_use = &td.mu;
+		td.lock = &void_mu_lock;
+		td.unlock = &void_mu_unlock;
+		for (i = 0; i != td.n_threads; i++) {
+			closure_fork (closure_counting (&counting_loop, &td, i));
+		}
+		test_data_wait_for_all_threads (&td);
+		if (td.i != td.n_threads*td.loop_count) {
+			TEST_FATAL (t, ("test_mu_nthread final count inconsistent: want %d, got %d",
+				   td.n_threads*td.loop_count, td.i));
+		}
+		loop_count *= 2;
+	} while (nsync_time_cmp (nsync_time_now (), deadline) < 0);
+}
+
+/* void pthread_mutex_lock */
+static void void_pthread_mutex_lock (void *mu) {
+        pthread_mutex_lock ((pthread_mutex_t *) mu);
+}
+
+/* void pthread_mutex_unlock */
+static void void_pthread_mutex_unlock (void *mu) {
+        pthread_mutex_unlock ((pthread_mutex_t *) mu);
+}
+
+/* Create a few threads, each of which increments an
+   integer a fixed number of times, using a pthread_mutex_t for mutual exclusion.
+   It checks that the integer is incremented the correct number of times. */
+static void test_mutex_nthread (testing t) {
+	int loop_count = 100000;
+	nsync_time deadline;
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1500));
+	do {
+		int i;
+		test_data td;
+		bzero ((void *) &td, sizeof (td));
+		td.t = t;
+		td.n_threads = 5;
+		td.loop_count = loop_count;
+		td.mu_in_use = &td.mutex;
+		td.lock = &void_pthread_mutex_lock;
+		td.unlock = &void_pthread_mutex_unlock;
+		pthread_mutex_init (&td.mutex, NULL);
+		for (i = 0; i != td.n_threads; i++) {
+			closure_fork (closure_counting (&counting_loop, &td, i));
+		}
+		test_data_wait_for_all_threads (&td);
+		if (td.i != td.n_threads*td.loop_count) {
+			TEST_FATAL (t, ("test_mutex_nthread final count inconsistent: want %d, got %d",
+				   td.n_threads*td.loop_count, td.i));
+		}
+		pthread_mutex_destroy (&td.mutex);
+		loop_count *= 2;
+	} while (nsync_time_cmp (nsync_time_now (), deadline) < 0);
+}
+
+/* void pthread_rwlock_wrlock */
+static void void_pthread_rwlock_wrlock (void *mu) {
+        pthread_rwlock_wrlock ((pthread_rwlock_t *) mu);
+}
+
+/* void pthread_rwlock_unlock */
+static void void_pthread_rwlock_unlock (void *mu) {
+        pthread_rwlock_unlock ((pthread_rwlock_t *) mu);
+}
+
+/* Create a few threads, each of which increments an
+   integer a fixed number of times, using a pthread_rwlock_t for mutual exclusion.
+   It checks that the integer is incremented the correct number of times. */
+static void test_rwmutex_nthread (testing t) {
+	int loop_count = 100000;
+	nsync_time deadline;
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1500));
+	do {
+		int i;
+		test_data td;
+		bzero ((void *) &td, sizeof (td));
+		td.t = t;
+		td.n_threads = 5;
+		td.loop_count = loop_count;
+		td.mu_in_use = &td.rwmutex;
+		td.lock = &void_pthread_rwlock_wrlock;
+		td.unlock = &void_pthread_rwlock_unlock;
+		pthread_rwlock_init (&td.rwmutex, NULL);
+		for (i = 0; i != td.n_threads; i++) {
+			closure_fork (closure_counting (&counting_loop, &td, i));
+		}
+		test_data_wait_for_all_threads (&td);
+		if (td.i != td.n_threads*td.loop_count) {
+			TEST_FATAL (t, ("test_mutex_nthread final count inconsistent: want %d, got %d",
+				   td.n_threads*td.loop_count, td.i));
+		}
+		pthread_rwlock_destroy (&td.rwmutex);
+		loop_count *= 2;
+	} while (nsync_time_cmp (nsync_time_now (), deadline) < 0);
+}
+
+/* --------------------------------------- */
+
+/* The body of each thread executed by test_try_mu_nthread().
+   *td represents the test data that the threads share, and id is an integer
+   unique to each test thread. */
+static void counting_loop_try_mu (test_data *td, int id) {
+	int i;
+	int n = td->loop_count;
+	for (i = 0; i != n; i++) {
+		while (!nsync_mu_trylock (&td->mu)) {
+			pthread_yield ();
+		}
+		td->id = id;
+		td->i++;
+		if (td->id != id) {
+			testing_panic ("td->id != id");
+		}
+		n = td->loop_count;
+		nsync_mu_unlock (&td->mu);
+	}
+	test_data_thread_finished (td);
+}
+
+/* Test that acquiring an nsync_mu with nsync_mu_trylock()
+   using several threads provides mutual exclusion. */
+static void test_try_mu_nthread (testing t) {
+	int loop_count = 100000;
+	nsync_time deadline;
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1500));
+	do {
+		int i;
+		test_data td;
+		bzero ((void *) &td, sizeof (td));
+		td.t = t;
+		td.n_threads = 5;
+		td.loop_count = loop_count;
+		td.mu_in_use = &td.mu;
+		td.lock = &void_mu_lock;
+		td.unlock = &void_mu_unlock;
+		for (i = 0; i != td.n_threads; i++) {
+			closure_fork (closure_counting (&counting_loop_try_mu, &td, i));
+		}
+		test_data_wait_for_all_threads (&td);
+		if (td.i != td.n_threads*td.loop_count) {
+			TEST_FATAL (t, ("test_try_mu_nthread final count inconsistent: want %d, got %d",
+				   td.n_threads*td.loop_count, td.i));
+		}
+		loop_count *= 2;
+	} while (nsync_time_cmp (nsync_time_now (), deadline) < 0);
+}
+
+/* --------------------------------------- */
+
+/* An integer protected by a mutex, and with an associated
+   condition variable that is signalled when the counter reaches 0. */
+typedef struct counter_s {
+	nsync_mu mu; /* protects value */
+	int value;
+	nsync_cv cv; /* signalled when value becomes 0 */
+} counter;
+
+/* Return a counter with initial value "initial". */
+static counter *counter_new (int initial) {
+	counter *c = (counter *) malloc (sizeof (*c));
+	bzero ((void *) c, sizeof (*c));
+	c->value = initial;
+	return (c);
+}
+
+/* Increment *c by "increment". */
+static void counter_inc (counter *c, int increment) {
+	if (increment != 0) {
+		nsync_mu_lock (&c->mu);
+		c->value += increment;
+		if (c->value == 0) {
+			nsync_cv_broadcast (&c->cv);
+		}
+		nsync_mu_unlock (&c->mu);
+	}
+}
+
+/* Wait on *c's condition variable until the counter
+   becomes 0, or abs_deadline is reached. */
+static int counter_wait_for_zero_with_deadline (counter *c, nsync_time abs_deadline) {
+	int value;
+	nsync_mu_rlock (&c->mu);
+	while (c->value != 0 &&
+	       nsync_cv_wait_with_deadline (&c->cv, &c->mu, abs_deadline, NULL) == 0) {
+	}
+	value = c->value;
+	nsync_mu_runlock (&c->mu);
+	return (value);
+}
+
+/* Wait on *c's condition variable until the counter becomes 0. */
+static void counter_wait_for_zero (counter *c) {
+	int value = counter_wait_for_zero_with_deadline (c, nsync_time_no_deadline);
+	if (value != 0) {
+		testing_panic (smprintf ("wait_for_zero() about to return with "
+					 "non-zero value %d", value));
+	}
+}
+
+/* Return the current value of *c. */
+static int counter_value (counter *c) {
+	int value;
+	nsync_mu_rlock (&c->mu);
+	value = c->value;
+	nsync_mu_runlock (&c->mu);
+	return (value);
+}
+
+/* --------------------------------------- */
+
+CLOSURE_DECL_BODY9 (attempt_trylock, testing , const char *, int, nsync_mu *,
+		    int, int, int *, int, counter *)
+
+/* Call nsync_mu_trylock(), and compares the result to expected_acquire.
+   If the lock was acquired, then:
+   - if expected_value != -1, compare *value against expected_value.
+   - increment *value.
+   - if release is non-zero, release the lock before returning.
+   In any case, the counter *done is decremented. */
+static void attempt_trylock (testing t, const char *id, int verbose,
+			     nsync_mu *mu, int expected_acquire, int release,
+			     int *value, int expected_value, counter *done) {
+	int acquired = nsync_mu_trylock (mu);
+	if (acquired != expected_acquire) {
+		testing_panic (smprintf ("attempt_trylock %s:  expected "
+					 "nsync_mu_trylock() to return %d but got %d",
+					 id, expected_acquire, acquired));
+	}
+	if (verbose) {
+		TEST_LOG (t, ("attempt_trylock %s %d\n", id, acquired));
+	}
+	if (acquired) {
+		nsync_mu_assert_held (mu);
+		if (expected_value != -1 && *value != expected_value) {
+			testing_panic (smprintf ("attempt_trylock %s expected "
+						 "value %d, *value=%d",
+						 id, expected_value, *value));
+		}
+		(*value)++;
+		if (verbose) {
+			TEST_LOG (t, ("attempt_trylock %s incremented value to %d\n", id, *value));
+		}
+		if (release) {
+			nsync_mu_unlock (mu);
+		}
+	}
+	counter_inc (done, -1);
+}
+
+/* Call nsync_mu_rtrylock(), and compare the result to expected_acquire.
+   If the lock was acquired, then:
+   - if expected_value != -1, compare *value against expected_value.
+   - if release is non-zero, release the lock before returning.
+   In any case, decrement *done. */
+static void attempt_rtrylock (testing t, const char *id, int verbose,
+			      nsync_mu *mu, int expected_acquire, int release,
+			      int *value, int expected_value, counter *done) {
+	int acquired = nsync_mu_rtrylock (mu);
+	if (acquired != expected_acquire) {
+		testing_panic (smprintf ("attempt_rtrylock %s: expected "
+					 "nsync_mu_rtrylock() to return %d but got %d",
+					 id, expected_acquire, acquired));
+	}
+	if (verbose) {
+		TEST_LOG (t, ("attempt_rtrylock %s %d\n", id, acquired));
+	}
+	if (acquired) {
+		nsync_mu_rassert_held (mu);
+		if (expected_value != -1 && *value != expected_value) {
+			testing_panic (smprintf ("attempt_rtrylock %s expected "
+						 "value %d, *value=%d",
+						 id, expected_value, *value));
+		}
+		if (release) {
+			nsync_mu_runlock (mu);
+		}
+	}
+	counter_inc (done, -1);
+}
+
+CLOSURE_DECL_BODY9 (lock_unlock, testing, const char *, int, nsync_mu *,
+		    int *, int, nsync_time, counter *, counter *)
+
+/* First acquire *mu, then:
+   - if expected_value != -1, compare *value against expected_value.
+   - increment *value.
+   - sleep for "sleep".
+   Then release *mu and decrement *done. */
+static void lock_unlock (testing t, const char *id, int verbose, nsync_mu *mu, int *value,
+		  int expected_value, nsync_time sleep, counter *sleeping, counter *done) {
+	if (verbose) {
+		TEST_LOG (t, ("lock_unlock %s\n", id));
+	}
+	if (sleeping != NULL) {
+		counter_inc (sleeping, -1);
+	}
+	nsync_mu_lock (mu);
+	nsync_mu_assert_held (mu);
+	if (expected_value != -1 && *value != expected_value) {
+		testing_panic (smprintf ("lock_unlock %s expected "
+					 "value %d, *value=%d",
+					 id, expected_value, *value));
+	}
+	(*value)++;
+	if (verbose) {
+		TEST_LOG (t, ("lock_unlock %s incremented value to %d\n", id, *value));
+	}
+	nsync_time_sleep (sleep);
+	nsync_mu_unlock (mu);
+	counter_inc (done, -1);
+}
+
+/* First acquire *mu in read mode, then:
+   - if expected_value != -1, compare *value against expected_value.
+   - sleep for "sleep".
+   Then release *mu and decrement *done. */
+static void rlock_runlock (testing t, const char *id, int verbose, nsync_mu *mu,
+			   int *value, int expected_value, nsync_time sleep,
+			   counter *sleeping, counter *done) {
+	if (verbose) {
+		TEST_LOG (t, ("rlock_runlock %s\n", id));
+	}
+	if (sleeping != NULL) {
+		counter_inc (sleeping, -1);
+	}
+	nsync_mu_rlock (mu);
+	nsync_mu_rassert_held (mu);
+	if (expected_value != -1 && *value != expected_value) {
+		testing_panic (smprintf ("rlock_runlock %s expected "
+					 "value %d, *value=%d", id, expected_value, *value));
+	}
+	nsync_time_sleep (sleep);
+	nsync_mu_runlock (mu);
+	counter_inc (done, -1);
+}
+
+/* Check that the time since start_time is between expected_duration-1ms.
+   If the time exceeds expected_duration+slop_duration, return 1, else 0. */
+static int check_times (testing t, const char *id, nsync_time start_time,
+			nsync_time expected_duration, nsync_time slop_duration) {
+	int exceeds_count = 0;
+	nsync_time now;
+	nsync_time measured_duration;
+	now = nsync_time_now ();
+	measured_duration = nsync_time_sub (now, start_time);
+	if (nsync_time_cmp (measured_duration,
+			    nsync_time_sub (expected_duration, nsync_time_ms (5))) < 0) {
+		char *m_str = nsync_time_str (measured_duration, 2);
+		char *e_str = nsync_time_str (expected_duration, 2);
+		TEST_ERROR (t, ("check_times %s too short a delay: %s instead of %s",
+			   id, m_str, e_str));
+		free (m_str);
+		free (e_str);
+	}
+	if (nsync_time_cmp (nsync_time_add (expected_duration, slop_duration), measured_duration) < 0) {
+		exceeds_count++;
+	}
+	return (exceeds_count);
+}
+
+/* Check the operation of nsync_mu as a reader/writer lock. */
+static void test_rlock (testing t) {
+	int loop;
+	int i;
+	int max_write_wait_exceeded;
+	int max_read_wait_exceeded;
+	nsync_time time_unit;
+	nsync_time slop_duration;
+	nsync_time delay_duration;
+	nsync_time writer_duration;
+	nsync_time reader_duration;
+	static const int loop_count = 5;
+	static const int read_threads = 3;
+	static const int limit = 3;
+	static const int verbose = 0;
+	max_write_wait_exceeded = 0;
+	max_read_wait_exceeded = 0;
+
+	time_unit = nsync_time_ms (100);
+	slop_duration = nsync_time_add (nsync_time_add (time_unit, time_unit), time_unit);
+	delay_duration = time_unit;
+	writer_duration = time_unit;
+	reader_duration = nsync_time_add (time_unit, time_unit);
+
+	max_write_wait_exceeded = 0;
+	max_read_wait_exceeded = 0;
+	for (loop = 0; loop != loop_count; loop++) {
+		counter *lock_unlock_sleeping;
+		counter *rlock_runlock_sleeping;
+		counter *lock_unlock_done;
+		counter *rlock_runlock_done;
+		nsync_time read_start_time;
+		nsync_mu mu;
+		int value = 0;
+		counter *thread_done;
+
+		nsync_time start_time;
+		nsync_mu_init (&mu);
+		start_time = nsync_time_now ();
+
+		/* ------------------------------------ */
+		/* Acquire lock with nsync_mu_rtrylock().  This thread will
+		   hold a read lock until the next line with =====.  */
+		thread_done = counter_new (1);
+		attempt_rtrylock (t, "a", verbose, &mu, 1, 0, &value, 0, thread_done);
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_rassert_held (&mu);
+
+		counter_inc (thread_done, 1);
+		/* Can get read lock holding read lock. */
+		closure_fork (closure_attempt_trylock (&attempt_rtrylock,
+			t, "b", verbose, &mu, 1, 1, &value, 0, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_rassert_held (&mu);
+
+		counter_inc (thread_done, 1);
+		/* Can't get write lock holding read lock. */
+		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "c", verbose,
+						       &mu, 0, 1, &value, -1, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		if (!nsync_mu_is_reader (&mu)) {
+			TEST_FATAL(t, ("expected mu held in reader mode"));
+		}
+
+		counter_inc (thread_done, 1);
+		closure_fork (closure_lock_unlock (&rlock_runlock, t, "d", verbose,
+						   &mu, &value, 0, nsync_time_zero /*no delay*/,
+						   NULL, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_rassert_held (&mu);
+
+		lock_unlock_done = counter_new (1);
+		lock_unlock_sleeping = counter_new (1);
+		closure_fork (closure_lock_unlock (&lock_unlock, t, "e", verbose,
+						   &mu, &value, 0, writer_duration,
+						   lock_unlock_sleeping, lock_unlock_done));
+
+		counter_wait_for_zero (lock_unlock_sleeping);
+		nsync_time_sleep (delay_duration); /* give time for lock_unlock() thread to wait. */
+
+		nsync_mu_rassert_held (&mu);
+
+		rlock_runlock_done = counter_new (read_threads);
+		rlock_runlock_sleeping = counter_new (read_threads);
+		for (i = 0; i != read_threads; i++) {
+			/* read lock will be acquired after lock_unlock() completes */
+			closure_fork (closure_lock_unlock (&rlock_runlock, t, "f", verbose,
+							   &mu, &value, 1, reader_duration,
+							   rlock_runlock_sleeping,
+							   rlock_runlock_done));
+		}
+
+		nsync_mu_rassert_held (&mu);
+
+		counter_wait_for_zero (rlock_runlock_sleeping);
+		nsync_time_sleep (delay_duration); /* time for rlock_runlock() threads to wait. */
+
+		nsync_mu_rassert_held (&mu);
+
+		if (counter_value (lock_unlock_done) == 0) {
+			TEST_FATAL (t, ("thread was able to acquire write lock while read lock held"));
+		}
+		if (counter_value (rlock_runlock_done) == 0) {
+			TEST_FATAL (t, ("thread was able to acquire read lock with "
+				   "other reader and waiting writer"));
+		}
+
+		nsync_mu_rassert_held (&mu);
+
+		counter_inc (thread_done, 1);
+	       /* Still can't get write lock. */
+		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "g", verbose,
+						       &mu, 0, 1, &value, -1, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		counter_inc (thread_done, 1);
+		/* Now can't get read lock because a writer is waiting. */
+		closure_fork (closure_attempt_trylock (&attempt_rtrylock, t, "h", verbose,
+						       &mu, 0, 1, &value, -1, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_runlock (&mu);
+		/* ==================================== */
+
+		read_start_time = nsync_time_now ();
+		counter_wait_for_zero (lock_unlock_done); /* Now can get write lock. */
+		max_write_wait_exceeded += check_times (t, "i", start_time,
+			nsync_time_add (nsync_time_add (delay_duration, delay_duration), writer_duration),
+			slop_duration);
+
+		counter_wait_for_zero (rlock_runlock_done); /* And now an get read lock again. */
+		max_read_wait_exceeded += check_times (t, "j", read_start_time,
+						       reader_duration, slop_duration);
+
+		free (thread_done);
+		free (lock_unlock_done);
+		free (rlock_runlock_done);
+		free (lock_unlock_sleeping);
+		free (rlock_runlock_sleeping);
+	}
+	if (verbose) {
+		TEST_LOG (t, ("read lock max_write_wait_exceeded %d max_read_wait_exceeded %d\n",
+			 max_write_wait_exceeded, max_read_wait_exceeded));
+	}
+	if (max_write_wait_exceeded > limit) {
+		TEST_ERROR (t, ("lock_unlock() took too long %d "
+			   "(more than %d) times out of %d",
+			   max_write_wait_exceeded, limit, loop_count));
+	}
+	if (max_read_wait_exceeded > limit) {
+		TEST_ERROR (t, ("rlock_runlock() took too long %d "
+			   "(more than %d) times out of %d",
+			   max_read_wait_exceeded, limit, loop_count));
+	}
+
+	max_write_wait_exceeded = 0;
+	max_read_wait_exceeded = 0;
+	for (loop = 0; loop != loop_count; loop++) {
+		counter *lock_unlock_sleeping;
+		counter *rlock_runlock_sleeping;
+		counter *lock_unlock_done;
+		counter *rlock_runlock_done;
+		nsync_time read_start_time;
+		nsync_mu mu;
+		int value = 0;
+		counter *thread_done;
+
+		nsync_time start_time;
+
+		nsync_mu_init (&mu);
+		start_time = nsync_time_now ();
+
+		/* ------------------------------------ */
+		/* Acquire lock with nsync_mu_trylock().  This thread will hold
+		   a write lock until the next line with =====.  */
+		thread_done = counter_new (1);
+		attempt_trylock (t, "A", verbose, &mu, 1, 0, &value, 0, thread_done);
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		counter_inc (thread_done, 1);
+		/* Can't get read lock while holding write lock. */
+		closure_fork (closure_attempt_trylock (&attempt_rtrylock, t, "B", verbose,
+						       &mu, 0, 1, &value, -1, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		if (nsync_mu_is_reader (&mu)) {
+			TEST_FATAL (t, ("expected mu held in write mode"));
+		}
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		counter_inc (thread_done, 1);
+		/* Can't get write lock while holding write lock. */
+		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "C", verbose,
+						       &mu, 0, 1, &value, -1, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		lock_unlock_done = counter_new (1);
+		lock_unlock_sleeping = counter_new (1);
+		closure_fork (closure_lock_unlock (&lock_unlock, t, "D", verbose,
+						   &mu, &value, 1, writer_duration,
+						   lock_unlock_sleeping, lock_unlock_done));
+
+		counter_wait_for_zero (lock_unlock_sleeping);
+		nsync_time_sleep (delay_duration); /* give time for lock_unlock() thread to wait. */
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		rlock_runlock_done = counter_new (read_threads);
+		rlock_runlock_sleeping = counter_new (read_threads);
+		for (i = 0; i != read_threads; i++) {
+			/* not guaranteed will complete after lock_unlock() above */
+			closure_fork (closure_lock_unlock (&rlock_runlock, t, "E", verbose,
+							   &mu, &value, -1, reader_duration,
+							   rlock_runlock_sleeping,
+							   rlock_runlock_done));
+		}
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		counter_wait_for_zero (rlock_runlock_sleeping);
+		nsync_time_sleep (delay_duration); /* time for rlock_runlock() threads to wait. */
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		if (counter_value (lock_unlock_done) == 0) {
+			TEST_FATAL (t, ("thread was able to acquire write lock "
+				   "while other write lock held"));
+		}
+		if (counter_value (rlock_runlock_done) == 0) {
+			TEST_FATAL (t, ("thread was able to acquire read lock "
+				   "while  write lock held"));
+		}
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		counter_inc (thread_done, 1);
+		/* Still can't get read lock while holding write lock. */
+		closure_fork (closure_attempt_trylock (&attempt_rtrylock, t, "F", verbose,
+						       &mu, 0, 1, &value, -1, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		counter_inc (thread_done, 1);
+		/* Still can't get write lock while holding write lock. */
+		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "G", verbose,
+						       &mu, 0, 1, &value, -1, thread_done));
+		counter_wait_for_zero (thread_done);
+
+		nsync_mu_assert_held (&mu);
+		nsync_mu_rassert_held (&mu);
+
+		nsync_mu_unlock (&mu);
+		/* ==================================== */
+
+		read_start_time = nsync_time_now ();
+		counter_wait_for_zero (lock_unlock_done); /* Now can get write lock. */
+		max_write_wait_exceeded += check_times (t, "H", start_time,
+			nsync_time_add (nsync_time_add (delay_duration, delay_duration), writer_duration),
+			slop_duration);
+
+		counter_wait_for_zero (rlock_runlock_done); /* And now can get read lock again. */
+		max_read_wait_exceeded += check_times (t, "I", read_start_time,
+						       reader_duration, slop_duration);
+
+		free (thread_done);
+		free (lock_unlock_done);
+		free (rlock_runlock_done);
+		free (lock_unlock_sleeping);
+		free (rlock_runlock_sleeping);
+	}
+	if (verbose) {
+		TEST_LOG (t, ("write lock max_write_wait_exceeded %d "
+			 "max_read_wait_exceeded %d\n",
+			 max_write_wait_exceeded, max_read_wait_exceeded));
+	}
+	if (max_write_wait_exceeded > limit) {
+		TEST_ERROR (t, ("lock_unlock() took too long %d (more than %d) "
+			   "times out of %d",
+			   max_write_wait_exceeded, limit, loop_count));
+	}
+	if (max_read_wait_exceeded > limit) {
+		TEST_ERROR (t, ("rlock_runlock() took too long %d (more than %d) "
+			   "times out of %d",
+			   max_read_wait_exceeded, limit, loop_count));
+	}
+}
+
+/* --------------------------------------- */
+
+/* Measure the performance of an uncontended nsync_mu. */
+static void benchmark_mu_uncontended (testing t) {
+	int i;
+	int n = testing_n (t);
+	nsync_mu mu;
+	nsync_mu_init (&mu);
+	for (i = 0; i != n; i++) {
+		nsync_mu_lock (&mu);
+		nsync_mu_unlock (&mu);
+	}
+}
+
+/* Return whether int *value is one. */
+static int int_is_1 (const void *value) { return (*(const int *)value == 1); }
+
+/* Return whether int *value is two. */
+static int int_is_2 (const void *value) { return (*(const int *)value == 2); }
+
+/* Return whether int *value is three. */
+static int int_is_3 (const void *value) { return (*(const int *)value == 3); }
+
+/* Set *value to 1, wait for it to become 2, then set it to 3.  *value is under
+   *mu */
+static void waiter (nsync_mu *mu, int *value) {
+	nsync_mu_lock (mu);
+	*value = 1;
+	nsync_mu_wait (mu, &int_is_2, value, NULL);
+	*value = 3;
+	nsync_mu_unlock (mu);
+}
+
+CLOSURE_DECL_BODY2 (waiter, nsync_mu *, int *)
+
+/* Measure the performance of an uncontended nsync_mu
+   with a blocked waiter. */
+static void benchmark_mu_uncontended_waiter (testing t) {
+	int i;
+	int n = testing_n (t);
+	nsync_mu mu;
+	int value = 0;
+	nsync_mu_init (&mu);
+	closure_fork (closure_waiter (&waiter, &mu, &value));
+	nsync_mu_lock (&mu);
+	nsync_mu_wait (&mu, &int_is_1, &value, NULL);
+	nsync_mu_unlock (&mu);
+	for (i = 0; i != n; i++) {
+		nsync_mu_lock (&mu);
+		nsync_mu_unlock (&mu);
+	}
+	nsync_mu_lock (&mu);
+	value = 2;
+	nsync_mu_wait (&mu, &int_is_3, &value, NULL);
+	nsync_mu_unlock (&mu);
+}
+
+/* Measure the performance of an uncontended nsync_mu
+   with a blocked waiter using nsync_mu_unlock_without_wakeup. */
+static void benchmark_mu_uncontended_no_wakeup (testing t) {
+	int i;
+	int n = testing_n (t);
+	nsync_mu mu;
+	int value = 0;
+	nsync_mu_init (&mu);
+	closure_fork (closure_waiter (&waiter, &mu, &value));
+	nsync_mu_lock (&mu);
+	nsync_mu_wait (&mu, &int_is_1, &value, NULL);
+	nsync_mu_unlock (&mu);
+	for (i = 0; i != n; i++) {
+		nsync_mu_lock (&mu);
+		nsync_mu_unlock_without_wakeup (&mu);
+	}
+	nsync_mu_lock (&mu);
+	value = 2;
+	nsync_mu_wait (&mu, &int_is_3, &value, NULL);
+	nsync_mu_unlock (&mu);
+}
+
+/* Measure the performance of an uncontended
+   nsync_mu in read mode. */
+static void benchmark_rmu_uncontended (testing t) {
+	int i;
+	int n = testing_n (t);
+	nsync_mu mu;
+	nsync_mu_init (&mu);
+	for (i = 0; i != n; i++) {
+		nsync_mu_rlock (&mu);
+		nsync_mu_runlock (&mu);
+	}
+}
+
+/* Measure the performance of an uncontended nsync_mu
+   in read mode with a blocked waiter. */
+static void benchmark_rmu_uncontended_waiter (testing t) {
+	int i;
+	int n = testing_n (t);
+	nsync_mu mu;
+	int value = 0;
+	nsync_mu_init (&mu);
+	closure_fork (closure_waiter (&waiter, &mu, &value));
+	nsync_mu_lock (&mu);
+	nsync_mu_wait (&mu, &int_is_1, &value, NULL);
+	nsync_mu_unlock (&mu);
+	for (i = 0; i != n; i++) {
+		nsync_mu_rlock (&mu);
+		nsync_mu_runlock (&mu);
+	}
+	nsync_mu_lock (&mu);
+	value = 2;
+	nsync_mu_wait (&mu, &int_is_3, &value, NULL);
+	nsync_mu_unlock (&mu);
+}
+
+/* Measure the performance of an uncontended pthread_mutex_t. */
+static void benchmark_mutex_uncontended (testing t) {
+	int i;
+	int n = testing_n (t);
+	pthread_mutex_t mu;
+	pthread_mutex_init (&mu, NULL);
+	for (i = 0; i != n; i++) {
+		pthread_mutex_lock (&mu);
+		pthread_mutex_unlock (&mu);
+	}
+	pthread_mutex_destroy (&mu);
+}
+
+/* Measure the performance of an uncontended pthread_rwlock_t. */
+static void benchmark_wmutex_uncontended (testing t) {
+	int i;
+	int n = testing_n (t);
+	pthread_rwlock_t mu;
+	pthread_rwlock_init (&mu, NULL);
+	for (i = 0; i != n; i++) {
+		pthread_rwlock_wrlock (&mu);
+		pthread_rwlock_unlock (&mu);
+	}
+	pthread_rwlock_destroy (&mu);
+}
+
+/* Measure the performance of an uncontended
+   pthread_rwlock_t in read mode. */
+static void benchmark_rmutex_uncontended (testing t) {
+	int i;
+	int n = testing_n (t);
+	pthread_rwlock_t mu;
+	pthread_rwlock_init (&mu, NULL);
+	for (i = 0; i != n; i++) {
+		pthread_rwlock_rdlock (&mu);
+		pthread_rwlock_unlock (&mu);
+	}
+	pthread_rwlock_destroy (&mu);
+}
+
+/* ---------------------------------------
+   Benchmarks for contended locks. */
+
+/* It's hard to write these as benchmark functions, since we wish to measure
+   throughput over an extended period (a second or two), rather than get the
+   latency of a few iterations. */
+
+/* A contended_state represents state shared between threads
+   in the contended benchmarks. */
+typedef struct contended_state_s {
+	testing t;
+
+	/* locks to test */
+	nsync_mu mu;
+	pthread_mutex_t mutex;
+	pthread_rwlock_t rwmutex;
+	int count; /* counter protected by a lock above */
+	
+	nsync_mu start_done_mu;
+	int start; /* whether threads should start, under start_done_mu */
+	int not_yet_done;  /* threads not yet complete, under start_done_mu */
+} contended_state;
+
+static int contended_state_may_start (const void *v) {
+	return (((const contended_state *)v)->start);
+}
+
+static int contended_state_all_done (const void *v) {
+	return (((const contended_state *)v)->not_yet_done == 0);
+}
+
+/* Wait for cs.start to become non-zero, then loop, acquiring and
+   releasing mu on each iteration until cs.deadline is reached, then decrement
+   cs.not_yet_done. */
+static void contended_state_contend_loop (contended_state *cs,
+					  void *mu, void (*lock) (void *),
+					  void (*unlock) (void *)) {
+	int n = testing_n (cs->t);
+	int j;
+	int i;
+	nsync_mu_rlock (&cs->start_done_mu);
+	nsync_mu_wait (&cs->start_done_mu, &contended_state_may_start, cs, NULL);
+	nsync_mu_runlock (&cs->start_done_mu);
+
+	for (j = 0; j < n; j += 10000) {
+		for (i = 0; i != 10000; i++) {
+			(*lock) (mu);
+			cs->count++;
+			(*unlock) (mu);
+		}
+	}
+
+	nsync_mu_lock (&cs->start_done_mu);
+	cs->not_yet_done--;
+	nsync_mu_unlock (&cs->start_done_mu);
+}
+
+typedef void (*func_any) (void *);
+CLOSURE_DECL_BODY4 (contended_state_contend_loop, contended_state *, void *, func_any, func_any)
+
+/* Start the threads in a contended test, wait for them to finish,
+   and print the number of iterations achieved. */
+static void contended_state_run_test (contended_state *cs, testing t,
+				      void *mu, void (*lock) (void *),
+				      void (*unlock) (void *)) {
+	int i;
+	cs->t = t;
+	cs->not_yet_done = 4; /* number of threads */
+	cs->start = 0;
+	cs->count = 0;
+	for (i = 0; i != cs->not_yet_done; i++) {
+		closure_fork (closure_contended_state_contend_loop (
+			&contended_state_contend_loop, cs, mu, lock, unlock));
+	}
+	nsync_mu_lock (&cs->start_done_mu);
+	cs->start = 1;
+	nsync_mu_wait (&cs->start_done_mu, &contended_state_all_done, cs, NULL);
+	nsync_mu_unlock (&cs->start_done_mu);
+}
+
+/* Measure the performance of highly contended
+   nsync_mu locks, with small critical sections.  */
+static void benchmark_mu_contended (testing t) {
+	contended_state cs;
+	bzero ((void *) &cs, sizeof (cs));
+	contended_state_run_test (&cs, t, &cs.mu, (void (*) (void*))&nsync_mu_lock,
+				  (void (*) (void*))&nsync_mu_unlock);
+}
+
+/* Measure the performance of highly contended
+   pthread_mutex_t locks, with small critical sections.  */
+static void benchmark_mutex_contended (testing t) {
+	contended_state cs;
+	bzero ((void *) &cs, sizeof (cs));
+	pthread_mutex_init (&cs.mutex, NULL);
+	contended_state_run_test (&cs, t, &cs.mutex, &void_pthread_mutex_lock,
+				  &void_pthread_mutex_unlock);
+	pthread_mutex_destroy (&cs.mutex);
+}
+
+/* Measure the performance of highly contended
+   pthread_rwlock_t locks, with small critical sections.  */
+static void benchmark_wmutex_contended (testing t) {
+	contended_state cs;
+	bzero ((void *) &cs, sizeof (cs));
+	pthread_rwlock_init (&cs.rwmutex, NULL);
+	contended_state_run_test (&cs, t, &cs.rwmutex, &void_pthread_rwlock_wrlock,
+				  &void_pthread_rwlock_unlock);
+	pthread_rwlock_destroy (&cs.rwmutex);
+}
 
 int main (int argc, char *argv[]) {
 	testing_base tb = testing_new (argc, argv, 0);
 
 	TEST_RUN (tb, test_rlock);
 	TEST_RUN (tb, test_mu_nthread);
+	TEST_RUN (tb, test_mutex_nthread);
+	TEST_RUN (tb, test_rwmutex_nthread);
+	TEST_RUN (tb, test_try_mu_nthread);
 
 	BENCHMARK_RUN (tb, benchmark_mu_contended);
 	BENCHMARK_RUN (tb, benchmark_mutex_contended);
-	BENCHMARK_RUN (tb, benchmark_xmutex_contended);
 	BENCHMARK_RUN (tb, benchmark_wmutex_contended);
 
 	BENCHMARK_RUN (tb, benchmark_mu_uncontended);
 	BENCHMARK_RUN (tb, benchmark_rmu_uncontended);
 	BENCHMARK_RUN (tb, benchmark_mutex_uncontended);
-	BENCHMARK_RUN (tb, benchmark_xmutex_uncontended);
 	BENCHMARK_RUN (tb, benchmark_wmutex_uncontended);
 	BENCHMARK_RUN (tb, benchmark_rmutex_uncontended);
 	BENCHMARK_RUN (tb, benchmark_mu_uncontended_waiter);
diff --git a/third_party/nsync/testing/mu_test.inc b/third_party/nsync/testing/mu_test.inc
deleted file mode 100644
index 086520f2c..000000000
--- a/third_party/nsync/testing/mu_test.inc
+++ /dev/null
@@ -1,1119 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/mu.h"
-#include "libc/calls/calls.h"
-#include "libc/str/str.h"
-#include "libc/thread/thread.h"
-#include "third_party/nsync/time.h"
-#include "third_party/nsync/cv.h"
-#include "third_party/nsync/mu_wait.h"
-#include "third_party/nsync/testing/closure.h"
-#include "third_party/nsync/testing/smprintf.h"
-#include "third_party/nsync/testing/testing.h"
-#include "third_party/nsync/testing/time_extra.h"
-
-/* The state shared between the threads in each of the tests below. */
-typedef struct test_data_s {
-	testing t;
-	int n_threads;  /* Number of test threads; constant after init. */
-	int loop_count; /* Iteration count for each test thread; constant after init */
-	
-	/* mu_in_use protects i, id, loop_count, and finished_threads. */
-	void *mu_in_use; /* points at mu, mutex, or rwmutex depending on which is in use. */
-	void (*lock) (void *);  /* operations on mu_in_use */
-	void (*unlock) (void *);
-	
-	nsync_mu mu;
-	pthread_mutex_t mutex;
-	pthread_rwlock_t rwmutex;
-	
-	int i; /* counter incremented by test loops. */
-	volatile int id; /* id of current lock-holding thread in some tests. */
-	
-	nsync_cv done; /* Signalled when finished_threads==n_threads. */
-	int finished_threads;      /* Count of threads that have finished. */
-} test_data;
-
-/* Indicate that a thread has finished its operations on test_data
-   by incrementing td.finished_threads, and signal td.done when it reaches td.n_threads.
-   See test_data_wait_for_all_threads(). */
-static void test_data_thread_finished (test_data *td) {
-	(*td->lock) (td->mu_in_use);
-	td->finished_threads++;
-	if (td->finished_threads == td->n_threads) {
-		nsync_cv_broadcast (&td->done);
-	}
-	(*td->unlock) (td->mu_in_use);
-}
-
-/* Wait until all td.n_threads have called test_data_thread_finished(),
-   and then return. */
-static void test_data_wait_for_all_threads (test_data *td) {
-	(*td->lock) (td->mu_in_use);
-	while (td->finished_threads != td->n_threads) {
-		nsync_cv_wait_with_deadline_generic (&td->done, td->mu_in_use,
-						     td->lock, td->unlock,
-						     NSYNC_CLOCK,
-						     nsync_time_no_deadline, NULL);
-	}
-	(*td->unlock) (td->mu_in_use);
-}
-
-/* --------------------------------------- */
-
-/* The body of each thread executed by test_mu_nthread()
-   and test_mutex_nthread.
-   *td represents the test data that the threads share, and id is an integer
-   unique to each test thread. */
-static void counting_loop (test_data *td, int id) {
-	int n = td->loop_count;
-	int i = 0;
-	for (i = 0; i != n; i++) {
-		(*td->lock) (td->mu_in_use);
-		td->id = id;
-		td->i++;
-		if (td->id != id) {
-			testing_panic ("td->id != id");
-		}
-		(*td->unlock) (td->mu_in_use);
-	}
-	test_data_thread_finished (td);
-}
-
-CLOSURE_DECL_BODY2 (counting, test_data *, int)
-
-/* Versions of nsync_mu_lock() and nsync_mu_unlock() that take "void *"
-   arguments, to avoid call through a function pointer of a different type,
-   which is undefined.  */
-static void void_mu_lock (void *mu) {
-	nsync_mu_lock ((nsync_mu *) mu);
-}
-static void void_mu_unlock (void *mu) {
-	nsync_mu_unlock((nsync_mu *) mu);
-}
-
-/* Create a few threads, each of which increments an
-   integer a fixed number of times, using an nsync_mu for mutual exclusion.
-   It checks that the integer is incremented the correct number of times. */
-static void test_mu_nthread (testing t) {
-	int loop_count = 100000;
-	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1500));
-	do {
-		int i;
-		test_data td;
-		bzero ((void *) &td, sizeof (td));
-		td.t = t;
-		td.n_threads = 5;
-		td.loop_count = loop_count;
-		td.mu_in_use = &td.mu;
-		td.lock = &void_mu_lock;
-		td.unlock = &void_mu_unlock;
-		for (i = 0; i != td.n_threads; i++) {
-			closure_fork (closure_counting (&counting_loop, &td, i));
-		}
-		test_data_wait_for_all_threads (&td);
-		if (td.i != td.n_threads*td.loop_count) {
-			TEST_FATAL (t, ("test_mu_nthread final count inconsistent: want %d, got %d",
-				   td.n_threads*td.loop_count, td.i));
-		}
-		loop_count *= 2;
-	} while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) < 0);
-}
-
-/* void pthread_mutex_lock */
-static void void_pthread_mutex_lock (void *mu) {
-        pthread_mutex_lock ((pthread_mutex_t *) mu);
-}
-
-/* void pthread_mutex_unlock */
-static void void_pthread_mutex_unlock (void *mu) {
-        pthread_mutex_unlock ((pthread_mutex_t *) mu);
-}
-
-/* Create a few threads, each of which increments an
-   integer a fixed number of times, using a pthread_mutex_t for mutual exclusion.
-   It checks that the integer is incremented the correct number of times. */
-static void test_mutex_nthread (testing t) {
-	int loop_count = 100000;
-	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1500));
-	do {
-		int i;
-		test_data td;
-		bzero ((void *) &td, sizeof (td));
-		td.t = t;
-		td.n_threads = 5;
-		td.loop_count = loop_count;
-		td.mu_in_use = &td.mutex;
-		td.lock = &void_pthread_mutex_lock;
-		td.unlock = &void_pthread_mutex_unlock;
-		pthread_mutex_init (&td.mutex, NULL);
-		for (i = 0; i != td.n_threads; i++) {
-			closure_fork (closure_counting (&counting_loop, &td, i));
-		}
-		test_data_wait_for_all_threads (&td);
-		if (td.i != td.n_threads*td.loop_count) {
-			TEST_FATAL (t, ("test_mutex_nthread final count inconsistent: want %d, got %d",
-				   td.n_threads*td.loop_count, td.i));
-		}
-		pthread_mutex_destroy (&td.mutex);
-		loop_count *= 2;
-	} while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) < 0);
-}
-
-/* Create a few threads, each of which increments an integer a fixed
-   number of times, using a recursive pthread_mutex_t for mutual exclusion.
-   It checks that the integer is incremented the correct number of times. */
-static void test_xmutex_nthread (testing t) {
-	int loop_count = 100000;
-	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1500));
-	do {
-		int i;
-		test_data td;
-		pthread_mutexattr_t attr;
-		bzero ((void *) &td, sizeof (td));
-		td.t = t;
-		td.n_threads = 5;
-		td.loop_count = loop_count;
-		td.mu_in_use = &td.mutex;
-		td.lock = &void_pthread_mutex_lock;
-		td.unlock = &void_pthread_mutex_unlock;
-		pthread_mutexattr_init (&attr);
-		pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_RECURSIVE);
-		pthread_mutex_init (&td.mutex, &attr);
-		pthread_mutexattr_destroy (&attr);
-		for (i = 0; i != td.n_threads; i++) {
-			closure_fork (closure_counting (&counting_loop, &td, i));
-		}
-		test_data_wait_for_all_threads (&td);
-		if (td.i != td.n_threads*td.loop_count) {
-			TEST_FATAL (t, ("test_mutex_nthread final count inconsistent: want %d, got %d",
-				   td.n_threads*td.loop_count, td.i));
-		}
-		pthread_mutex_destroy (&td.mutex);
-		loop_count *= 2;
-	} while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) < 0);
-}
-
-/* void pthread_rwlock_wrlock */
-static void void_pthread_rwlock_wrlock (void *mu) {
-        pthread_rwlock_wrlock ((pthread_rwlock_t *) mu);
-}
-
-/* void pthread_rwlock_unlock */
-static void void_pthread_rwlock_unlock (void *mu) {
-        pthread_rwlock_unlock ((pthread_rwlock_t *) mu);
-}
-
-/* Create a few threads, each of which increments an
-   integer a fixed number of times, using a pthread_rwlock_t for mutual exclusion.
-   It checks that the integer is incremented the correct number of times. */
-static void test_rwmutex_nthread (testing t) {
-	int loop_count = 100000;
-	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1500));
-	do {
-		int i;
-		test_data td;
-		bzero ((void *) &td, sizeof (td));
-		td.t = t;
-		td.n_threads = 5;
-		td.loop_count = loop_count;
-		td.mu_in_use = &td.rwmutex;
-		td.lock = &void_pthread_rwlock_wrlock;
-		td.unlock = &void_pthread_rwlock_unlock;
-		pthread_rwlock_init (&td.rwmutex, NULL);
-		for (i = 0; i != td.n_threads; i++) {
-			closure_fork (closure_counting (&counting_loop, &td, i));
-		}
-		test_data_wait_for_all_threads (&td);
-		if (td.i != td.n_threads*td.loop_count) {
-			TEST_FATAL (t, ("test_mutex_nthread final count inconsistent: want %d, got %d",
-				   td.n_threads*td.loop_count, td.i));
-		}
-		pthread_rwlock_destroy (&td.rwmutex);
-		loop_count *= 2;
-	} while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) < 0);
-}
-
-/* --------------------------------------- */
-
-/* The body of each thread executed by test_try_mu_nthread().
-   *td represents the test data that the threads share, and id is an integer
-   unique to each test thread. */
-static void counting_loop_try_mu (test_data *td, int id) {
-	int i;
-	int n = td->loop_count;
-	for (i = 0; i != n; i++) {
-		while (!nsync_mu_trylock (&td->mu)) {
-			pthread_yield ();
-		}
-		td->id = id;
-		td->i++;
-		if (td->id != id) {
-			testing_panic ("td->id != id");
-		}
-		n = td->loop_count;
-		nsync_mu_unlock (&td->mu);
-	}
-	test_data_thread_finished (td);
-}
-
-/* Test that acquiring an nsync_mu with nsync_mu_trylock()
-   using several threads provides mutual exclusion. */
-static void test_try_mu_nthread (testing t) {
-	int loop_count = 100000;
-	nsync_time deadline;
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1500));
-	do {
-		int i;
-		test_data td;
-		bzero ((void *) &td, sizeof (td));
-		td.t = t;
-		td.n_threads = 5;
-		td.loop_count = loop_count;
-		td.mu_in_use = &td.mu;
-		td.lock = &void_mu_lock;
-		td.unlock = &void_mu_unlock;
-		for (i = 0; i != td.n_threads; i++) {
-			closure_fork (closure_counting (&counting_loop_try_mu, &td, i));
-		}
-		test_data_wait_for_all_threads (&td);
-		if (td.i != td.n_threads*td.loop_count) {
-			TEST_FATAL (t, ("test_try_mu_nthread final count inconsistent: want %d, got %d",
-				   td.n_threads*td.loop_count, td.i));
-		}
-		loop_count *= 2;
-	} while (nsync_time_cmp (nsync_time_now (NSYNC_CLOCK), deadline) < 0);
-}
-
-/* --------------------------------------- */
-
-/* An integer protected by a mutex, and with an associated
-   condition variable that is signalled when the counter reaches 0. */
-typedef struct counter_s {
-	nsync_mu mu; /* protects value */
-	int value;
-	nsync_cv cv; /* signalled when value becomes 0 */
-} counter;
-
-/* Return a counter with initial value "initial". */
-static counter *counter_new (int initial) {
-	counter *c = (counter *) malloc (sizeof (*c));
-	bzero ((void *) c, sizeof (*c));
-	c->value = initial;
-	return (c);
-}
-
-/* Increment *c by "increment". */
-static void counter_inc (counter *c, int increment) {
-	if (increment != 0) {
-		nsync_mu_lock (&c->mu);
-		c->value += increment;
-		if (c->value == 0) {
-			nsync_cv_broadcast (&c->cv);
-		}
-		nsync_mu_unlock (&c->mu);
-	}
-}
-
-/* Wait on *c's condition variable until the counter
-   becomes 0, or abs_deadline is reached. */
-static int counter_wait_for_zero_with_deadline (counter *c, nsync_time abs_deadline) {
-	int value;
-	nsync_mu_rlock (&c->mu);
-	while (c->value != 0 &&
-	       nsync_cv_wait_with_deadline (&c->cv, &c->mu, NSYNC_CLOCK, abs_deadline, NULL) == 0) {
-	}
-	value = c->value;
-	nsync_mu_runlock (&c->mu);
-	return (value);
-}
-
-/* Wait on *c's condition variable until the counter becomes 0. */
-static void counter_wait_for_zero (counter *c) {
-	int value = counter_wait_for_zero_with_deadline (c, nsync_time_no_deadline);
-	if (value != 0) {
-		testing_panic (smprintf ("wait_for_zero() about to return with "
-					 "non-zero value %d", value));
-	}
-}
-
-/* Return the current value of *c. */
-static int counter_value (counter *c) {
-	int value;
-	nsync_mu_rlock (&c->mu);
-	value = c->value;
-	nsync_mu_runlock (&c->mu);
-	return (value);
-}
-
-/* --------------------------------------- */
-
-CLOSURE_DECL_BODY9 (attempt_trylock, testing , const char *, int, nsync_mu *,
-		    int, int, int *, int, counter *)
-
-/* Call nsync_mu_trylock(), and compares the result to expected_acquire.
-   If the lock was acquired, then:
-   - if expected_value != -1, compare *value against expected_value.
-   - increment *value.
-   - if release is non-zero, release the lock before returning.
-   In any case, the counter *done is decremented. */
-static void attempt_trylock (testing t, const char *id, int verbose,
-			     nsync_mu *mu, int expected_acquire, int release,
-			     int *value, int expected_value, counter *done) {
-	int acquired = nsync_mu_trylock (mu);
-	if (acquired != expected_acquire) {
-		testing_panic (smprintf ("attempt_trylock %s:  expected "
-					 "nsync_mu_trylock() to return %d but got %d",
-					 id, expected_acquire, acquired));
-	}
-	if (verbose) {
-		TEST_LOG (t, ("attempt_trylock %s %d\n", id, acquired));
-	}
-	if (acquired) {
-		nsync_mu_assert_held (mu);
-		if (expected_value != -1 && *value != expected_value) {
-			testing_panic (smprintf ("attempt_trylock %s expected "
-						 "value %d, *value=%d",
-						 id, expected_value, *value));
-		}
-		(*value)++;
-		if (verbose) {
-			TEST_LOG (t, ("attempt_trylock %s incremented value to %d\n", id, *value));
-		}
-		if (release) {
-			nsync_mu_unlock (mu);
-		}
-	}
-	counter_inc (done, -1);
-}
-
-/* Call nsync_mu_rtrylock(), and compare the result to expected_acquire.
-   If the lock was acquired, then:
-   - if expected_value != -1, compare *value against expected_value.
-   - if release is non-zero, release the lock before returning.
-   In any case, decrement *done. */
-static void attempt_rtrylock (testing t, const char *id, int verbose,
-			      nsync_mu *mu, int expected_acquire, int release,
-			      int *value, int expected_value, counter *done) {
-	int acquired = nsync_mu_rtrylock (mu);
-	if (acquired != expected_acquire) {
-		testing_panic (smprintf ("attempt_rtrylock %s: expected "
-					 "nsync_mu_rtrylock() to return %d but got %d",
-					 id, expected_acquire, acquired));
-	}
-	if (verbose) {
-		TEST_LOG (t, ("attempt_rtrylock %s %d\n", id, acquired));
-	}
-	if (acquired) {
-		nsync_mu_rassert_held (mu);
-		if (expected_value != -1 && *value != expected_value) {
-			testing_panic (smprintf ("attempt_rtrylock %s expected "
-						 "value %d, *value=%d",
-						 id, expected_value, *value));
-		}
-		if (release) {
-			nsync_mu_runlock (mu);
-		}
-	}
-	counter_inc (done, -1);
-}
-
-CLOSURE_DECL_BODY9 (lock_unlock, testing, const char *, int, nsync_mu *,
-		    int *, int, nsync_time, counter *, counter *)
-
-/* First acquire *mu, then:
-   - if expected_value != -1, compare *value against expected_value.
-   - increment *value.
-   - sleep for "sleep".
-   Then release *mu and decrement *done. */
-static void lock_unlock (testing t, const char *id, int verbose, nsync_mu *mu, int *value,
-		  int expected_value, nsync_time sleep, counter *sleeping, counter *done) {
-	if (verbose) {
-		TEST_LOG (t, ("lock_unlock %s\n", id));
-	}
-	if (sleeping != NULL) {
-		counter_inc (sleeping, -1);
-	}
-	nsync_mu_lock (mu);
-	nsync_mu_assert_held (mu);
-	if (expected_value != -1 && *value != expected_value) {
-		testing_panic (smprintf ("lock_unlock %s expected "
-					 "value %d, *value=%d",
-					 id, expected_value, *value));
-	}
-	(*value)++;
-	if (verbose) {
-		TEST_LOG (t, ("lock_unlock %s incremented value to %d\n", id, *value));
-	}
-	nsync_time_sleep (NSYNC_CLOCK, sleep);
-	nsync_mu_unlock (mu);
-	counter_inc (done, -1);
-}
-
-/* First acquire *mu in read mode, then:
-   - if expected_value != -1, compare *value against expected_value.
-   - sleep for "sleep".
-   Then release *mu and decrement *done. */
-static void rlock_runlock (testing t, const char *id, int verbose, nsync_mu *mu,
-			   int *value, int expected_value, nsync_time sleep,
-			   counter *sleeping, counter *done) {
-	if (verbose) {
-		TEST_LOG (t, ("rlock_runlock %s\n", id));
-	}
-	if (sleeping != NULL) {
-		counter_inc (sleeping, -1);
-	}
-	nsync_mu_rlock (mu);
-	nsync_mu_rassert_held (mu);
-	if (expected_value != -1 && *value != expected_value) {
-		testing_panic (smprintf ("rlock_runlock %s expected "
-					 "value %d, *value=%d", id, expected_value, *value));
-	}
-	nsync_time_sleep (NSYNC_CLOCK, sleep);
-	nsync_mu_runlock (mu);
-	counter_inc (done, -1);
-}
-
-/* Check that the time since start_time is between expected_duration-1ms.
-   If the time exceeds expected_duration+slop_duration, return 1, else 0. */
-static int check_times (testing t, const char *id, nsync_time start_time,
-			nsync_time expected_duration, nsync_time slop_duration) {
-	int exceeds_count = 0;
-	nsync_time now;
-	nsync_time measured_duration;
-	now = nsync_time_now (NSYNC_CLOCK);
-	measured_duration = nsync_time_sub (now, start_time);
-	if (nsync_time_cmp (measured_duration,
-			    nsync_time_sub (expected_duration, nsync_time_ms (5))) < 0) {
-		char *m_str = nsync_time_str (measured_duration, 2);
-		char *e_str = nsync_time_str (expected_duration, 2);
-		TEST_ERROR (t, ("check_times %s too short a delay: %s instead of %s",
-			   id, m_str, e_str));
-		free (m_str);
-		free (e_str);
-	}
-	if (nsync_time_cmp (nsync_time_add (expected_duration, slop_duration), measured_duration) < 0) {
-		exceeds_count++;
-	}
-	return (exceeds_count);
-}
-
-/* Check the operation of nsync_mu as a reader/writer lock. */
-static void test_rlock (testing t) {
-	int loop;
-	int i;
-	int max_write_wait_exceeded;
-	int max_read_wait_exceeded;
-	nsync_time time_unit;
-	nsync_time slop_duration;
-	nsync_time delay_duration;
-	nsync_time writer_duration;
-	nsync_time reader_duration;
-	static const int loop_count = 5;
-	static const int read_threads = 3;
-	static const int limit = 3;
-	static const int verbose = 0;
-	max_write_wait_exceeded = 0;
-	max_read_wait_exceeded = 0;
-
-	time_unit = nsync_time_ms (100);
-	slop_duration = nsync_time_add (nsync_time_add (time_unit, time_unit), time_unit);
-	delay_duration = time_unit;
-	writer_duration = time_unit;
-	reader_duration = nsync_time_add (time_unit, time_unit);
-
-	max_write_wait_exceeded = 0;
-	max_read_wait_exceeded = 0;
-	for (loop = 0; loop != loop_count; loop++) {
-		counter *lock_unlock_sleeping;
-		counter *rlock_runlock_sleeping;
-		counter *lock_unlock_done;
-		counter *rlock_runlock_done;
-		nsync_time read_start_time;
-		nsync_mu mu;
-		int value = 0;
-		counter *thread_done;
-
-		nsync_time start_time;
-		nsync_mu_init (&mu);
-		start_time = nsync_time_now (NSYNC_CLOCK);
-
-		/* ------------------------------------ */
-		/* Acquire lock with nsync_mu_rtrylock().  This thread will
-		   hold a read lock until the next line with =====.  */
-		thread_done = counter_new (1);
-		attempt_rtrylock (t, "a", verbose, &mu, 1, 0, &value, 0, thread_done);
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_rassert_held (&mu);
-
-		counter_inc (thread_done, 1);
-		/* Can get read lock holding read lock. */
-		closure_fork (closure_attempt_trylock (&attempt_rtrylock,
-			t, "b", verbose, &mu, 1, 1, &value, 0, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_rassert_held (&mu);
-
-		counter_inc (thread_done, 1);
-		/* Can't get write lock holding read lock. */
-		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "c", verbose,
-						       &mu, 0, 1, &value, -1, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		if (!nsync_mu_is_reader (&mu)) {
-			TEST_FATAL(t, ("expected mu held in reader mode"));
-		}
-
-		counter_inc (thread_done, 1);
-		closure_fork (closure_lock_unlock (&rlock_runlock, t, "d", verbose,
-						   &mu, &value, 0, nsync_time_zero /*no delay*/,
-						   NULL, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_rassert_held (&mu);
-
-		lock_unlock_done = counter_new (1);
-		lock_unlock_sleeping = counter_new (1);
-		closure_fork (closure_lock_unlock (&lock_unlock, t, "e", verbose,
-						   &mu, &value, 0, writer_duration,
-						   lock_unlock_sleeping, lock_unlock_done));
-
-		counter_wait_for_zero (lock_unlock_sleeping);
-		nsync_time_sleep (NSYNC_CLOCK, delay_duration); /* give time for lock_unlock() thread to wait. */
-
-		nsync_mu_rassert_held (&mu);
-
-		rlock_runlock_done = counter_new (read_threads);
-		rlock_runlock_sleeping = counter_new (read_threads);
-		for (i = 0; i != read_threads; i++) {
-			/* read lock will be acquired after lock_unlock() completes */
-			closure_fork (closure_lock_unlock (&rlock_runlock, t, "f", verbose,
-							   &mu, &value, 1, reader_duration,
-							   rlock_runlock_sleeping,
-							   rlock_runlock_done));
-		}
-
-		nsync_mu_rassert_held (&mu);
-
-		counter_wait_for_zero (rlock_runlock_sleeping);
-		nsync_time_sleep (NSYNC_CLOCK, delay_duration); /* time for rlock_runlock() threads to wait. */
-
-		nsync_mu_rassert_held (&mu);
-
-		if (counter_value (lock_unlock_done) == 0) {
-			TEST_FATAL (t, ("thread was able to acquire write lock while read lock held"));
-		}
-		if (counter_value (rlock_runlock_done) == 0) {
-			TEST_FATAL (t, ("thread was able to acquire read lock with "
-				   "other reader and waiting writer"));
-		}
-
-		nsync_mu_rassert_held (&mu);
-
-		counter_inc (thread_done, 1);
-	       /* Still can't get write lock. */
-		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "g", verbose,
-						       &mu, 0, 1, &value, -1, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		counter_inc (thread_done, 1);
-		/* Now can't get read lock because a writer is waiting. */
-		closure_fork (closure_attempt_trylock (&attempt_rtrylock, t, "h", verbose,
-						       &mu, 0, 1, &value, -1, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_runlock (&mu);
-		/* ==================================== */
-
-		read_start_time = nsync_time_now (NSYNC_CLOCK);
-		counter_wait_for_zero (lock_unlock_done); /* Now can get write lock. */
-		max_write_wait_exceeded += check_times (t, "i", start_time,
-			nsync_time_add (nsync_time_add (delay_duration, delay_duration), writer_duration),
-			slop_duration);
-
-		counter_wait_for_zero (rlock_runlock_done); /* And now an get read lock again. */
-		max_read_wait_exceeded += check_times (t, "j", read_start_time,
-						       reader_duration, slop_duration);
-
-		free (thread_done);
-		free (lock_unlock_done);
-		free (rlock_runlock_done);
-		free (lock_unlock_sleeping);
-		free (rlock_runlock_sleeping);
-	}
-	if (verbose) {
-		TEST_LOG (t, ("read lock max_write_wait_exceeded %d max_read_wait_exceeded %d\n",
-			 max_write_wait_exceeded, max_read_wait_exceeded));
-	}
-	if (max_write_wait_exceeded > limit) {
-		TEST_ERROR (t, ("lock_unlock() took too long %d "
-			   "(more than %d) times out of %d",
-			   max_write_wait_exceeded, limit, loop_count));
-	}
-	if (max_read_wait_exceeded > limit) {
-		TEST_ERROR (t, ("rlock_runlock() took too long %d "
-			   "(more than %d) times out of %d",
-			   max_read_wait_exceeded, limit, loop_count));
-	}
-
-	max_write_wait_exceeded = 0;
-	max_read_wait_exceeded = 0;
-	for (loop = 0; loop != loop_count; loop++) {
-		counter *lock_unlock_sleeping;
-		counter *rlock_runlock_sleeping;
-		counter *lock_unlock_done;
-		counter *rlock_runlock_done;
-		nsync_time read_start_time;
-		nsync_mu mu;
-		int value = 0;
-		counter *thread_done;
-
-		nsync_time start_time;
-
-		nsync_mu_init (&mu);
-		start_time = nsync_time_now (NSYNC_CLOCK);
-
-		/* ------------------------------------ */
-		/* Acquire lock with nsync_mu_trylock().  This thread will hold
-		   a write lock until the next line with =====.  */
-		thread_done = counter_new (1);
-		attempt_trylock (t, "A", verbose, &mu, 1, 0, &value, 0, thread_done);
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		counter_inc (thread_done, 1);
-		/* Can't get read lock while holding write lock. */
-		closure_fork (closure_attempt_trylock (&attempt_rtrylock, t, "B", verbose,
-						       &mu, 0, 1, &value, -1, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		if (nsync_mu_is_reader (&mu)) {
-			TEST_FATAL (t, ("expected mu held in write mode"));
-		}
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		counter_inc (thread_done, 1);
-		/* Can't get write lock while holding write lock. */
-		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "C", verbose,
-						       &mu, 0, 1, &value, -1, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		lock_unlock_done = counter_new (1);
-		lock_unlock_sleeping = counter_new (1);
-		closure_fork (closure_lock_unlock (&lock_unlock, t, "D", verbose,
-						   &mu, &value, 1, writer_duration,
-						   lock_unlock_sleeping, lock_unlock_done));
-
-		counter_wait_for_zero (lock_unlock_sleeping);
-		nsync_time_sleep (NSYNC_CLOCK, delay_duration); /* give time for lock_unlock() thread to wait. */
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		rlock_runlock_done = counter_new (read_threads);
-		rlock_runlock_sleeping = counter_new (read_threads);
-		for (i = 0; i != read_threads; i++) {
-			/* not guaranteed will complete after lock_unlock() above */
-			closure_fork (closure_lock_unlock (&rlock_runlock, t, "E", verbose,
-							   &mu, &value, -1, reader_duration,
-							   rlock_runlock_sleeping,
-							   rlock_runlock_done));
-		}
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		counter_wait_for_zero (rlock_runlock_sleeping);
-		nsync_time_sleep (NSYNC_CLOCK, delay_duration); /* time for rlock_runlock() threads to wait. */
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		if (counter_value (lock_unlock_done) == 0) {
-			TEST_FATAL (t, ("thread was able to acquire write lock "
-				   "while other write lock held"));
-		}
-		if (counter_value (rlock_runlock_done) == 0) {
-			TEST_FATAL (t, ("thread was able to acquire read lock "
-				   "while  write lock held"));
-		}
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		counter_inc (thread_done, 1);
-		/* Still can't get read lock while holding write lock. */
-		closure_fork (closure_attempt_trylock (&attempt_rtrylock, t, "F", verbose,
-						       &mu, 0, 1, &value, -1, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		counter_inc (thread_done, 1);
-		/* Still can't get write lock while holding write lock. */
-		closure_fork (closure_attempt_trylock (&attempt_trylock, t, "G", verbose,
-						       &mu, 0, 1, &value, -1, thread_done));
-		counter_wait_for_zero (thread_done);
-
-		nsync_mu_assert_held (&mu);
-		nsync_mu_rassert_held (&mu);
-
-		nsync_mu_unlock (&mu);
-		/* ==================================== */
-
-		read_start_time = nsync_time_now (NSYNC_CLOCK);
-		counter_wait_for_zero (lock_unlock_done); /* Now can get write lock. */
-		max_write_wait_exceeded += check_times (t, "H", start_time,
-			nsync_time_add (nsync_time_add (delay_duration, delay_duration), writer_duration),
-			slop_duration);
-
-		counter_wait_for_zero (rlock_runlock_done); /* And now can get read lock again. */
-		max_read_wait_exceeded += check_times (t, "I", read_start_time,
-						       reader_duration, slop_duration);
-
-		free (thread_done);
-		free (lock_unlock_done);
-		free (rlock_runlock_done);
-		free (lock_unlock_sleeping);
-		free (rlock_runlock_sleeping);
-	}
-	if (verbose) {
-		TEST_LOG (t, ("write lock max_write_wait_exceeded %d "
-			 "max_read_wait_exceeded %d\n",
-			 max_write_wait_exceeded, max_read_wait_exceeded));
-	}
-	if (max_write_wait_exceeded > limit) {
-		TEST_ERROR (t, ("lock_unlock() took too long %d (more than %d) "
-			   "times out of %d",
-			   max_write_wait_exceeded, limit, loop_count));
-	}
-	if (max_read_wait_exceeded > limit) {
-		TEST_ERROR (t, ("rlock_runlock() took too long %d (more than %d) "
-			   "times out of %d",
-			   max_read_wait_exceeded, limit, loop_count));
-	}
-}
-
-/* --------------------------------------- */
-
-/* Measure the performance of an uncontended nsync_mu. */
-static void benchmark_mu_uncontended (testing t) {
-	int i;
-	int n = testing_n (t);
-	nsync_mu mu;
-	nsync_mu_init (&mu);
-	for (i = 0; i != n; i++) {
-		nsync_mu_lock (&mu);
-		nsync_mu_unlock (&mu);
-	}
-}
-
-/* Return whether int *value is one. */
-static int int_is_1 (const void *value) { return (*(const int *)value == 1); }
-
-/* Return whether int *value is two. */
-static int int_is_2 (const void *value) { return (*(const int *)value == 2); }
-
-/* Return whether int *value is three. */
-static int int_is_3 (const void *value) { return (*(const int *)value == 3); }
-
-/* Set *value to 1, wait for it to become 2, then set it to 3.  *value is under
-   *mu */
-static void waiter (nsync_mu *mu, int *value) {
-	nsync_mu_lock (mu);
-	*value = 1;
-	nsync_mu_wait (mu, &int_is_2, value, NULL);
-	*value = 3;
-	nsync_mu_unlock (mu);
-}
-
-CLOSURE_DECL_BODY2 (waiter, nsync_mu *, int *)
-
-/* Measure the performance of an uncontended nsync_mu
-   with a blocked waiter. */
-static void benchmark_mu_uncontended_waiter (testing t) {
-	int i;
-	int n = testing_n (t);
-	nsync_mu mu;
-	int value = 0;
-	nsync_mu_init (&mu);
-	closure_fork (closure_waiter (&waiter, &mu, &value));
-	nsync_mu_lock (&mu);
-	nsync_mu_wait (&mu, &int_is_1, &value, NULL);
-	nsync_mu_unlock (&mu);
-	for (i = 0; i != n; i++) {
-		nsync_mu_lock (&mu);
-		nsync_mu_unlock (&mu);
-	}
-	nsync_mu_lock (&mu);
-	value = 2;
-	nsync_mu_wait (&mu, &int_is_3, &value, NULL);
-	nsync_mu_unlock (&mu);
-}
-
-/* Measure the performance of an uncontended nsync_mu
-   with a blocked waiter using nsync_mu_unlock_without_wakeup. */
-static void benchmark_mu_uncontended_no_wakeup (testing t) {
-	int i;
-	int n = testing_n (t);
-	nsync_mu mu;
-	int value = 0;
-	nsync_mu_init (&mu);
-	closure_fork (closure_waiter (&waiter, &mu, &value));
-	nsync_mu_lock (&mu);
-	nsync_mu_wait (&mu, &int_is_1, &value, NULL);
-	nsync_mu_unlock (&mu);
-	for (i = 0; i != n; i++) {
-		nsync_mu_lock (&mu);
-		nsync_mu_unlock_without_wakeup (&mu);
-	}
-	nsync_mu_lock (&mu);
-	value = 2;
-	nsync_mu_wait (&mu, &int_is_3, &value, NULL);
-	nsync_mu_unlock (&mu);
-}
-
-/* Measure the performance of an uncontended
-   nsync_mu in read mode. */
-static void benchmark_rmu_uncontended (testing t) {
-	int i;
-	int n = testing_n (t);
-	nsync_mu mu;
-	nsync_mu_init (&mu);
-	for (i = 0; i != n; i++) {
-		nsync_mu_rlock (&mu);
-		nsync_mu_runlock (&mu);
-	}
-}
-
-/* Measure the performance of an uncontended nsync_mu
-   in read mode with a blocked waiter. */
-static void benchmark_rmu_uncontended_waiter (testing t) {
-	int i;
-	int n = testing_n (t);
-	nsync_mu mu;
-	int value = 0;
-	nsync_mu_init (&mu);
-	closure_fork (closure_waiter (&waiter, &mu, &value));
-	nsync_mu_lock (&mu);
-	nsync_mu_wait (&mu, &int_is_1, &value, NULL);
-	nsync_mu_unlock (&mu);
-	for (i = 0; i != n; i++) {
-		nsync_mu_rlock (&mu);
-		nsync_mu_runlock (&mu);
-	}
-	nsync_mu_lock (&mu);
-	value = 2;
-	nsync_mu_wait (&mu, &int_is_3, &value, NULL);
-	nsync_mu_unlock (&mu);
-}
-
-/* Measure the performance of an uncontended pthread_mutex_t. */
-static void benchmark_mutex_uncontended (testing t) {
-	int i;
-	int n = testing_n (t);
-	pthread_mutex_t mu;
-	pthread_mutex_init (&mu, NULL);
-	for (i = 0; i != n; i++) {
-		pthread_mutex_lock (&mu);
-		pthread_mutex_unlock (&mu);
-	}
-	pthread_mutex_destroy (&mu);
-}
-
-/* Measure the performance of an uncontended recursive pthread_mutex_t. */
-static void benchmark_xmutex_uncontended (testing t) {
-	int i;
-	int n = testing_n (t);
-	pthread_mutex_t mu;
-	pthread_mutexattr_t attr;
-	pthread_mutexattr_init (&attr);
-	pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_RECURSIVE);
-	pthread_mutex_init (&mu, &attr);
-	pthread_mutexattr_destroy (&attr);
-	for (i = 0; i != n; i++) {
-		pthread_mutex_lock (&mu);
-		pthread_mutex_unlock (&mu);
-	}
-	pthread_mutex_destroy (&mu);
-}
-
-/* Measure the performance of an uncontended pthread_rwlock_t. */
-static void benchmark_wmutex_uncontended (testing t) {
-	int i;
-	int n = testing_n (t);
-	pthread_rwlock_t mu;
-	pthread_rwlock_init (&mu, NULL);
-	for (i = 0; i != n; i++) {
-		pthread_rwlock_wrlock (&mu);
-		pthread_rwlock_unlock (&mu);
-	}
-	pthread_rwlock_destroy (&mu);
-}
-
-/* Measure the performance of an uncontended
-   pthread_rwlock_t in read mode. */
-static void benchmark_rmutex_uncontended (testing t) {
-	int i;
-	int n = testing_n (t);
-	pthread_rwlock_t mu;
-	pthread_rwlock_init (&mu, NULL);
-	for (i = 0; i != n; i++) {
-		pthread_rwlock_rdlock (&mu);
-		pthread_rwlock_unlock (&mu);
-	}
-	pthread_rwlock_destroy (&mu);
-}
-
-/* ---------------------------------------
-   Benchmarks for contended locks. */
-
-/* It's hard to write these as benchmark functions, since we wish to measure
-   throughput over an extended period (a second or two), rather than get the
-   latency of a few iterations. */
-
-/* A contended_state represents state shared between threads
-   in the contended benchmarks. */
-typedef struct contended_state_s {
-	testing t;
-
-	/* locks to test */
-	nsync_mu mu;
-	pthread_mutex_t mutex;
-	pthread_rwlock_t rwmutex;
-	int count; /* counter protected by a lock above */
-	
-	nsync_mu start_done_mu;
-	int start; /* whether threads should start, under start_done_mu */
-	int not_yet_done;  /* threads not yet complete, under start_done_mu */
-} contended_state;
-
-static int contended_state_may_start (const void *v) {
-	return (((const contended_state *)v)->start);
-}
-
-static int contended_state_all_done (const void *v) {
-	return (((const contended_state *)v)->not_yet_done == 0);
-}
-
-/* Wait for cs.start to become non-zero, then loop, acquiring and
-   releasing mu on each iteration until cs.deadline is reached, then decrement
-   cs.not_yet_done. */
-static void contended_state_contend_loop (contended_state *cs,
-					  void *mu, void (*lock) (void *),
-					  void (*unlock) (void *)) {
-	int n = testing_n (cs->t);
-	int j;
-	int i;
-	nsync_mu_rlock (&cs->start_done_mu);
-	nsync_mu_wait (&cs->start_done_mu, &contended_state_may_start, cs, NULL);
-	nsync_mu_runlock (&cs->start_done_mu);
-
-	for (j = 0; j < n; j += 10000) {
-		for (i = 0; i != 10000; i++) {
-			(*lock) (mu);
-			cs->count++;
-			(*unlock) (mu);
-		}
-	}
-
-	nsync_mu_lock (&cs->start_done_mu);
-	cs->not_yet_done--;
-	nsync_mu_unlock (&cs->start_done_mu);
-}
-
-typedef void (*func_any) (void *);
-CLOSURE_DECL_BODY4 (contended_state_contend_loop, contended_state *, void *, func_any, func_any)
-
-/* Start the threads in a contended test, wait for them to finish,
-   and print the number of iterations achieved. */
-static void contended_state_run_test (contended_state *cs, testing t,
-				      void *mu, void (*lock) (void *),
-				      void (*unlock) (void *)) {
-	int i;
-	cs->t = t;
-	cs->not_yet_done = 4; /* number of threads */
-	cs->start = 0;
-	cs->count = 0;
-	for (i = 0; i != cs->not_yet_done; i++) {
-		closure_fork (closure_contended_state_contend_loop (
-			&contended_state_contend_loop, cs, mu, lock, unlock));
-	}
-	nsync_mu_lock (&cs->start_done_mu);
-	cs->start = 1;
-	nsync_mu_wait (&cs->start_done_mu, &contended_state_all_done, cs, NULL);
-	nsync_mu_unlock (&cs->start_done_mu);
-}
-
-/* Measure the performance of highly contended
-   nsync_mu locks, with small critical sections.  */
-static void benchmark_mu_contended (testing t) {
-	contended_state cs;
-	bzero ((void *) &cs, sizeof (cs));
-	contended_state_run_test (&cs, t, &cs.mu, (void (*) (void*))&nsync_mu_lock,
-				  (void (*) (void*))&nsync_mu_unlock);
-}
-
-/* Measure the performance of highly contended
-   pthread_mutex_t locks, with small critical sections.  */
-static void benchmark_mutex_contended (testing t) {
-	contended_state cs;
-	bzero ((void *) &cs, sizeof (cs));
-	pthread_mutex_init (&cs.mutex, NULL);
-	contended_state_run_test (&cs, t, &cs.mutex, &void_pthread_mutex_lock,
-				  &void_pthread_mutex_unlock);
-	pthread_mutex_destroy (&cs.mutex);
-}
-
-/* Measure the performance of highly contended recursive
-   pthread_mutex_t locks, with small critical sections.  */
-static void benchmark_xmutex_contended (testing t) {
-	contended_state cs;
-	pthread_mutexattr_t attr;
-	bzero ((void *) &cs, sizeof (cs));
-	pthread_mutexattr_init (&attr);
-	pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_RECURSIVE);
-	pthread_mutex_init (&cs.mutex, &attr);
-	pthread_mutexattr_destroy (&attr);
-	contended_state_run_test (&cs, t, &cs.mutex, &void_pthread_mutex_lock,
-				  &void_pthread_mutex_unlock);
-	pthread_mutex_destroy (&cs.mutex);
-}
-
-/* Measure the performance of highly contended
-   pthread_rwlock_t locks, with small critical sections.  */
-static void benchmark_wmutex_contended (testing t) {
-	contended_state cs;
-	bzero ((void *) &cs, sizeof (cs));
-	pthread_rwlock_init (&cs.rwmutex, NULL);
-	contended_state_run_test (&cs, t, &cs.rwmutex, &void_pthread_rwlock_wrlock,
-				  &void_pthread_rwlock_unlock);
-	pthread_rwlock_destroy (&cs.rwmutex);
-}
diff --git a/third_party/nsync/testing/mu_wait2_test.c b/third_party/nsync/testing/mu_wait2_test.c
deleted file mode 100644
index 30f65a2b0..000000000
--- a/third_party/nsync/testing/mu_wait2_test.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/mu_wait_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-	TEST_RUN (tb, test_mu_producer_consumer0);
-	TEST_RUN (tb, test_mu_producer_consumer3);
-	TEST_RUN (tb, test_mu_producer_consumer4);
-	TEST_RUN (tb, test_mu_producer_consumer5);
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/mu_wait3_test.c b/third_party/nsync/testing/mu_wait3_test.c
deleted file mode 100644
index 37c9ee382..000000000
--- a/third_party/nsync/testing/mu_wait3_test.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/mu_wait_test.inc"
-
-int main (int argc, char *argv[]) {
-	testing_base tb = testing_new (argc, argv, 0);
-	TEST_RUN (tb, test_mu_producer_consumer6);
-	TEST_RUN (tb, test_mu_cancel);
-	return (testing_base_exit (tb));
-}
diff --git a/third_party/nsync/testing/mu_wait_example_test.c b/third_party/nsync/testing/mu_wait_example_test.c
index 90c8bf86b..220e473dd 100644
--- a/third_party/nsync/testing/mu_wait_example_test.c
+++ b/third_party/nsync/testing/mu_wait_example_test.c
@@ -18,7 +18,6 @@
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "third_party/nsync/array.internal.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/heap.internal.h"
 #include "third_party/nsync/mu.h"
 #include "third_party/nsync/mu_wait.h"
@@ -75,7 +74,7 @@ static const char *string_priority_queue_mu_remove_with_deadline (
 	const char *s = NULL;
 	nsync_mu_lock (&q->mu);
 	if (nsync_mu_wait_with_deadline (&q->mu, &spq_is_non_empty, q, NULL,
-					 NSYNC_CLOCK, abs_deadline, NULL) == 0) {
+					 abs_deadline, NULL) == 0) {
 		int alen = A_LEN (&q->heap);
 		if (alen != 0) {
 			s = A (&q->heap, 0);
@@ -100,7 +99,7 @@ static void add_and_wait_mu (string_priority_queue_mu *q,
 	int i;
 	for (i = 0; i != n; i++) {
 		string_priority_queue_mu_add (q, s[i]);
-		nsync_time_sleep (NSYNC_CLOCK, delay);
+		nsync_time_sleep (delay);
 	}
 }
 
@@ -121,7 +120,7 @@ static void a_char_append (a_char *a, const char *str) {
 static void remove_and_print_mu (string_priority_queue_mu *q, nsync_time delay, a_char *output) {
 	const char *s;
 	if ((s = string_priority_queue_mu_remove_with_deadline (q,
-			nsync_time_add (nsync_time_now (NSYNC_CLOCK), delay))) != NULL) {
+			nsync_time_add (nsync_time_now (), delay))) != NULL) {
 		a_char_append (output, s);
 		a_char_append (output, "\n");
 	} else {
@@ -155,7 +154,7 @@ static void example_mu_wait (testing t) {
 					       NELEM (input), input));
 
 	/* delay: "one", "two", "three"; not yet "four" */
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (1200));
+	nsync_time_sleep (nsync_time_ms (1200));
 
 	remove_and_print_mu (&q, nsync_time_ms (1000), &output);    /* "one" */
 	remove_and_print_mu (&q, nsync_time_ms (1000), &output);    /* "three" (less than "two") */
diff --git a/third_party/nsync/testing/mu_wait_test.c b/third_party/nsync/testing/mu_wait_test.c
index 955733856..5ba1c6b53 100644
--- a/third_party/nsync/testing/mu_wait_test.c
+++ b/third_party/nsync/testing/mu_wait_test.c
@@ -15,12 +15,331 @@
 │ See the License for the specific language governing permissions and          │
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/testing/mu_wait_test.inc"
+#include "third_party/nsync/mu_wait.h"
+#include "libc/errno.h"
+#include "libc/str/str.h"
+#include "third_party/nsync/mu.h"
+#include "third_party/nsync/note.h"
+#include "third_party/nsync/testing/closure.h"
+#include "third_party/nsync/testing/smprintf.h"
+#include "third_party/nsync/testing/testing.h"
+#include "third_party/nsync/testing/time_extra.h"
+#include "third_party/nsync/time.h"
+
+/* --------------------------- */
+
+/* A FIFO queue with up to limit elements.
+   The storage for the queue expands as necessary up to limit. */
+typedef struct mu_queue_s {
+	int limit;     /* max value of count---should not be changed after initialization */
+	nsync_mu mu;   /* protects fields below */
+	int pos;       /* index of first in-use element */
+	int count;     /* number of elements in use */
+	void *data[1]; /* in use elements are data[pos, ..., (pos+count-1)%limit] */
+} mu_queue;
+
+/* Return a pointer to new mu_queue. */
+static mu_queue *mu_queue_new (int limit) {
+	mu_queue *q;
+	int size = offsetof (struct mu_queue_s, data) + sizeof (q->data[0]) * limit;
+	q = (mu_queue *) malloc (size);
+	bzero ((void *) q, size);
+	q->limit = limit;
+	return (q);
+}
+
+static int mu_queue_non_empty (const void *v) {
+	const mu_queue *q = (const mu_queue *) v;
+	return (q->count != 0);
+}
+static int mu_queue_non_full (const void *v) {
+	const mu_queue *q = (const mu_queue *) v;
+	return (q->count != q->limit);
+}
+
+/* Add v to the end of the FIFO *q and return non-zero, or if the FIFO already
+   has limit elements and continues to do so until abs_deadline, do nothing and
+   return 0. */
+static int mu_queue_put (mu_queue *q, void *v, nsync_time abs_deadline) {
+	int added = 0;
+	nsync_mu_lock (&q->mu);
+	if (nsync_mu_wait_with_deadline (&q->mu, &mu_queue_non_full,
+					 q, NULL, abs_deadline, NULL) == 0) {
+		int i = q->pos + q->count;
+		if (q->count == q->limit) {
+			testing_panic ("q->count == q->limit");
+		}
+		if (q->limit <= i) {
+			i -= q->limit;
+		}
+		q->data[i] = v;
+		q->count++;
+		added = 1;
+	}
+	nsync_mu_unlock (&q->mu);
+	return (added);
+}
+
+/* Remove the first value from the front of the FIFO *q and return it,
+   or if the FIFO is empty and continues to be so until abs_deadline,
+   do nothing and return NULL. */
+static void *mu_queue_get (mu_queue *q, nsync_time abs_deadline) {
+	void *v = NULL;
+	nsync_mu_lock (&q->mu);
+	if (nsync_mu_wait_with_deadline (&q->mu, &mu_queue_non_empty,
+					 q, NULL, abs_deadline, NULL) == 0) {
+		if (q->count == 0) {
+			testing_panic ("q->count == 0");
+		}
+		v = q->data[q->pos];
+		q->data[q->pos] = NULL;
+		q->pos++;
+		q->count--;
+		if (q->pos == q->limit) {
+			q->pos = 0;
+		}
+	}
+	nsync_mu_unlock (&q->mu);
+	return (v);
+}
+
+/* --------------------------- */
+
+static char ptr_to_int_c;
+#define INT_TO_PTR(x) ((x) + &ptr_to_int_c)
+#define PTR_TO_INT(p) (((char *) (p)) - &ptr_to_int_c)
+
+/* Put count integers on *q, in the sequence start*3, (start+1)*3, (start+2)*3, .... */
+static void producer_mu_n (testing t, mu_queue *q, int start, int count) {
+	int i;
+	for (i = 0; i != count; i++) {
+		if (!mu_queue_put (q, INT_TO_PTR ((start+i)*3), nsync_time_no_deadline)) {
+			TEST_FATAL (t, ("mu_queue_put() returned 0 with no deadline"));
+		}
+	}
+}
+
+CLOSURE_DECL_BODY4 (producer_mu_n, testing , mu_queue *, int, int)
+
+/* Get count integers from *q, and check that they are in the
+   sequence start*3, (start+1)*3, (start+2)*3, .... */
+static void consumer_mu_n (testing t, mu_queue *q, int start, int count) {
+	int i;
+	for (i = 0; i != count; i++) {
+		void *v = mu_queue_get (q, nsync_time_no_deadline);
+		int x;
+		if (v == NULL) {
+			TEST_FATAL (t, ("mu_queue_get() returned 0 with no deadline"));
+		}
+		x = PTR_TO_INT (v);
+		if (x != (start+i)*3) {
+			TEST_FATAL (t, ("mu_queue_get() returned bad value; want %d, got %d",
+				   (start+i)*3, x));
+		}
+	}
+}
+
+/* The number of elements passed from producer to consumer in the
+   test_mu_producer_consumer*() tests below. */
+#define MU_PRODUCER_CONSUMER_N (100000)
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**0. */
+static void test_mu_producer_consumer0 (testing t) {
+	mu_queue *q = mu_queue_new (1);
+	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
+	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**1. */
+static void test_mu_producer_consumer1 (testing t) {
+	mu_queue *q = mu_queue_new (10);
+	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
+	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**2. */
+static void test_mu_producer_consumer2 (testing t) {
+	mu_queue *q = mu_queue_new (100);
+	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
+	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**3. */
+static void test_mu_producer_consumer3 (testing t) {
+	mu_queue *q = mu_queue_new (1000);
+	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
+	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**4. */
+static void test_mu_producer_consumer4 (testing t) {
+	mu_queue *q = mu_queue_new (10000);
+	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
+	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**5. */
+static void test_mu_producer_consumer5 (testing t) {
+	mu_queue *q = mu_queue_new (100000);
+	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
+	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* Send a stream of integers from a producer thread to
+   a consumer thread via a queue with limit 10**6. */
+static void test_mu_producer_consumer6 (testing t) {
+	mu_queue *q = mu_queue_new (1000000);
+	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
+	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
+	free (q);
+}
+
+/* A perpetually false wait condition. */
+static int false_condition (const void *v) {
+	return (0);
+}
+
+/* The following values control how aggressively we police the timeout. */
+#define TOO_EARLY_MS 1  
+#define TOO_LATE_MS 100   /* longer, to accommodate scheduling delays */
+#define TOO_LATE_ALLOWED 25       /* number of iterations permitted to violate too_late */
+
+/* Check timeouts on a mu wait_with_deadline(). */
+static void test_mu_deadline (testing t) {
+	int i;
+	int too_late_violations;
+	nsync_mu mu;
+	nsync_time too_early;
+	nsync_time too_late;
+
+	nsync_mu_init (&mu);
+	too_early = nsync_time_ms (TOO_EARLY_MS);
+	too_late = nsync_time_ms (TOO_LATE_MS);
+	too_late_violations = 0;
+	nsync_mu_lock (&mu);;
+	for (i = 0; i != 50; i++) {
+		nsync_time end_time;
+		nsync_time start_time;
+		nsync_time expected_end_time;
+		start_time = nsync_time_now ();
+		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
+		if (nsync_mu_wait_with_deadline (&mu, &false_condition, NULL, NULL,
+						 expected_end_time, NULL) != ETIMEDOUT) {
+			TEST_FATAL (t, ("nsync_mu_wait() returned non-expired for a timeout"));
+		}
+		end_time = nsync_time_now ();
+		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
+			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
+			TEST_ERROR (t, ("nsync_mu_wait() returned %s too early", elapsed_str));
+			free (elapsed_str);
+		}
+		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late),  end_time) < 0) {
+			too_late_violations++;
+		}
+	}
+	nsync_mu_unlock (&mu);
+	if (too_late_violations > TOO_LATE_ALLOWED) {
+		TEST_ERROR (t, ("nsync_mu_wait() returned too late %d (> %d) times",
+			   too_late_violations, TOO_LATE_ALLOWED));
+	}
+}
+
+/* Check cancellations on a mu wait_with_deadline(). */
+static void test_mu_cancel (testing t) {
+	int i;
+	nsync_time future_time;
+	int too_late_violations;
+	nsync_mu mu;
+	nsync_time too_early;
+	nsync_time too_late;
+
+	nsync_mu_init (&mu);
+	too_early = nsync_time_ms (TOO_EARLY_MS);
+	too_late = nsync_time_ms (TOO_LATE_MS);
+
+	/* The loops below cancel after 87 milliseconds, like the timeout tests above. */
+
+	future_time = nsync_time_add (nsync_time_now (), nsync_time_ms (3600000)); /* test cancels with timeout */
+
+	too_late_violations = 0;
+	nsync_mu_lock (&mu);
+	for (i = 0; i != 50; i++) {
+		nsync_time end_time;
+		nsync_time start_time;
+		nsync_time expected_end_time;
+		int x;
+		nsync_note cancel;
+
+		start_time = nsync_time_now ();
+		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
+		cancel = nsync_note_new (NULL, expected_end_time);
+
+		x = nsync_mu_wait_with_deadline (&mu, &false_condition, NULL, NULL,
+						 future_time, cancel);
+		if (x != ECANCELED) {
+			TEST_FATAL (t, ("nsync_mu_wait() return non-cancelled (%d) for "
+				   "a cancellation; expected %d",
+				   x, ECANCELED));
+		}
+		end_time = nsync_time_now ();
+		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
+			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
+			TEST_ERROR (t, ("nsync_mu_wait() returned %s too early", elapsed_str));
+			free (elapsed_str);
+		}
+		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late), end_time) < 0) {
+			too_late_violations++;
+		}
+
+		/* Check that an already cancelled wait returns immediately. */
+		start_time = nsync_time_now ();
+		x = nsync_mu_wait_with_deadline (&mu, &false_condition, NULL, NULL,
+						 nsync_time_no_deadline, cancel);
+		if (x != ECANCELED) {
+			TEST_FATAL (t, ("nsync_mu_wait() returned non-cancelled for a "
+				   "cancellation; expected %d",
+				   x, ECANCELED));
+		}
+		end_time = nsync_time_now ();
+		if (nsync_time_cmp (end_time, start_time) < 0) {
+			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
+			TEST_ERROR (t, ("nsync_mu_wait() returned %s too early", elapsed_str));
+			free (elapsed_str);
+		}
+		if (nsync_time_cmp (nsync_time_add (start_time, too_late), end_time) < 0) {
+			too_late_violations++;
+		}
+		nsync_note_free (cancel);
+	}
+	nsync_mu_unlock (&mu);
+	if (too_late_violations > TOO_LATE_ALLOWED) {
+		TEST_ERROR (t, ("nsync_mu_wait() returned too late %d (> %d) times",
+			   too_late_violations, TOO_LATE_ALLOWED));
+	}
+}
 
 int main (int argc, char *argv[]) {
 	testing_base tb = testing_new (argc, argv, 0);
+	TEST_RUN (tb, test_mu_producer_consumer0);
 	TEST_RUN (tb, test_mu_producer_consumer1);
 	TEST_RUN (tb, test_mu_producer_consumer2);
+	TEST_RUN (tb, test_mu_producer_consumer3);
+	TEST_RUN (tb, test_mu_producer_consumer4);
+	TEST_RUN (tb, test_mu_producer_consumer5);
+	TEST_RUN (tb, test_mu_producer_consumer6);
 	TEST_RUN (tb, test_mu_deadline);
+	TEST_RUN (tb, test_mu_cancel);
 	return (testing_base_exit (tb));
 }
diff --git a/third_party/nsync/testing/mu_wait_test.inc b/third_party/nsync/testing/mu_wait_test.inc
deleted file mode 100644
index 5ab4dedcc..000000000
--- a/third_party/nsync/testing/mu_wait_test.inc
+++ /dev/null
@@ -1,333 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/mu_wait.h"
-#include "libc/errno.h"
-#include "libc/str/str.h"
-#include "third_party/nsync/time.h"
-#include "third_party/nsync/mu.h"
-#include "third_party/nsync/note.h"
-#include "third_party/nsync/testing/closure.h"
-#include "third_party/nsync/testing/smprintf.h"
-#include "third_party/nsync/testing/testing.h"
-#include "third_party/nsync/testing/time_extra.h"
-
-/* --------------------------- */
-
-/* A FIFO queue with up to limit elements.
-   The storage for the queue expands as necessary up to limit. */
-typedef struct mu_queue_s {
-	int limit;     /* max value of count---should not be changed after initialization */
-	nsync_mu mu;   /* protects fields below */
-	int pos;       /* index of first in-use element */
-	int count;     /* number of elements in use */
-	void *data[1]; /* in use elements are data[pos, ..., (pos+count-1)%limit] */
-} mu_queue;
-
-/* Return a pointer to new mu_queue. */
-static mu_queue *mu_queue_new (int limit) {
-	mu_queue *q;
-	int size = offsetof (struct mu_queue_s, data) + sizeof (q->data[0]) * limit;
-	q = (mu_queue *) malloc (size);
-	bzero ((void *) q, size);
-	q->limit = limit;
-	return (q);
-}
-
-static int mu_queue_non_empty (const void *v) {
-	const mu_queue *q = (const mu_queue *) v;
-	return (q->count != 0);
-}
-static int mu_queue_non_full (const void *v) {
-	const mu_queue *q = (const mu_queue *) v;
-	return (q->count != q->limit);
-}
-
-/* Add v to the end of the FIFO *q and return non-zero, or if the FIFO already
-   has limit elements and continues to do so until abs_deadline, do nothing and
-   return 0. */
-static int mu_queue_put (mu_queue *q, void *v, nsync_time abs_deadline) {
-	int added = 0;
-	nsync_mu_lock (&q->mu);
-	if (nsync_mu_wait_with_deadline (&q->mu, &mu_queue_non_full,
-					 q, NULL, 0, abs_deadline, NULL) == 0) {
-		int i = q->pos + q->count;
-		if (q->count == q->limit) {
-			testing_panic ("q->count == q->limit");
-		}
-		if (q->limit <= i) {
-			i -= q->limit;
-		}
-		q->data[i] = v;
-		q->count++;
-		added = 1;
-	}
-	nsync_mu_unlock (&q->mu);
-	return (added);
-}
-
-/* Remove the first value from the front of the FIFO *q and return it,
-   or if the FIFO is empty and continues to be so until abs_deadline,
-   do nothing and return NULL. */
-static void *mu_queue_get (mu_queue *q, nsync_time abs_deadline) {
-	void *v = NULL;
-	nsync_mu_lock (&q->mu);
-	if (nsync_mu_wait_with_deadline (&q->mu, &mu_queue_non_empty,
-					 q, NULL, NSYNC_CLOCK,
-					 abs_deadline, NULL) == 0) {
-		if (q->count == 0) {
-			testing_panic ("q->count == 0");
-		}
-		v = q->data[q->pos];
-		q->data[q->pos] = NULL;
-		q->pos++;
-		q->count--;
-		if (q->pos == q->limit) {
-			q->pos = 0;
-		}
-	}
-	nsync_mu_unlock (&q->mu);
-	return (v);
-}
-
-/* --------------------------- */
-
-static char ptr_to_int_c;
-#define INT_TO_PTR(x) ((x) + &ptr_to_int_c)
-#define PTR_TO_INT(p) (((char *) (p)) - &ptr_to_int_c)
-
-/* Put count integers on *q, in the sequence start*3, (start+1)*3, (start+2)*3, .... */
-static void producer_mu_n (testing t, mu_queue *q, int start, int count) {
-	int i;
-	for (i = 0; i != count; i++) {
-		if (!mu_queue_put (q, INT_TO_PTR ((start+i)*3), nsync_time_no_deadline)) {
-			TEST_FATAL (t, ("mu_queue_put() returned 0 with no deadline"));
-		}
-	}
-}
-
-CLOSURE_DECL_BODY4 (producer_mu_n, testing , mu_queue *, int, int)
-
-/* Get count integers from *q, and check that they are in the
-   sequence start*3, (start+1)*3, (start+2)*3, .... */
-static void consumer_mu_n (testing t, mu_queue *q, int start, int count) {
-	int i;
-	for (i = 0; i != count; i++) {
-		void *v = mu_queue_get (q, nsync_time_no_deadline);
-		int x;
-		if (v == NULL) {
-			TEST_FATAL (t, ("mu_queue_get() returned 0 with no deadline"));
-		}
-		x = PTR_TO_INT (v);
-		if (x != (start+i)*3) {
-			TEST_FATAL (t, ("mu_queue_get() returned bad value; want %d, got %d",
-				   (start+i)*3, x));
-		}
-	}
-}
-
-/* The number of elements passed from producer to consumer in the
-   test_mu_producer_consumer*() tests below. */
-#define MU_PRODUCER_CONSUMER_N (100000)
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**0. */
-static void test_mu_producer_consumer0 (testing t) {
-	mu_queue *q = mu_queue_new (1);
-	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
-	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**1. */
-static void test_mu_producer_consumer1 (testing t) {
-	mu_queue *q = mu_queue_new (10);
-	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
-	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**2. */
-static void test_mu_producer_consumer2 (testing t) {
-	mu_queue *q = mu_queue_new (100);
-	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
-	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**3. */
-static void test_mu_producer_consumer3 (testing t) {
-	mu_queue *q = mu_queue_new (1000);
-	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
-	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**4. */
-static void test_mu_producer_consumer4 (testing t) {
-	mu_queue *q = mu_queue_new (10000);
-	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
-	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**5. */
-static void test_mu_producer_consumer5 (testing t) {
-	mu_queue *q = mu_queue_new (100000);
-	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
-	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* Send a stream of integers from a producer thread to
-   a consumer thread via a queue with limit 10**6. */
-static void test_mu_producer_consumer6 (testing t) {
-	mu_queue *q = mu_queue_new (1000000);
-	closure_fork (closure_producer_mu_n (&producer_mu_n, t, q, 0, MU_PRODUCER_CONSUMER_N));
-	consumer_mu_n (t, q, 0, MU_PRODUCER_CONSUMER_N);
-	free (q);
-}
-
-/* A perpetually false wait condition. */
-static int false_condition (const void *v) {
-	return (0);
-}
-
-/* The following values control how aggressively we police the timeout. */
-#define TOO_EARLY_MS 1  
-#define TOO_LATE_MS 100   /* longer, to accommodate scheduling delays */
-#define TOO_LATE_ALLOWED 25       /* number of iterations permitted to violate too_late */
-
-/* Check timeouts on a mu wait_with_deadline(). */
-static void test_mu_deadline (testing t) {
-	int i;
-	int too_late_violations;
-	nsync_mu mu;
-	nsync_time too_early;
-	nsync_time too_late;
-
-	nsync_mu_init (&mu);
-	too_early = nsync_time_ms (TOO_EARLY_MS);
-	too_late = nsync_time_ms (TOO_LATE_MS);
-	too_late_violations = 0;
-	nsync_mu_lock (&mu);
-	for (i = 0; i != 50; i++) {
-		nsync_time end_time;
-		nsync_time start_time;
-		nsync_time expected_end_time;
-		start_time = nsync_time_now (NSYNC_CLOCK);
-		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
-		if (nsync_mu_wait_with_deadline (&mu, &false_condition, NULL, NULL, NSYNC_CLOCK,
-						 expected_end_time, NULL) != ETIMEDOUT) {
-			TEST_FATAL (t, ("nsync_mu_wait() returned non-expired for a timeout"));
-		}
-		end_time = nsync_time_now (NSYNC_CLOCK);
-		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
-			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
-			TEST_ERROR (t, ("nsync_mu_wait() returned %s too early", elapsed_str));
-			free (elapsed_str);
-		}
-		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late),  end_time) < 0) {
-			too_late_violations++;
-		}
-	}
-	nsync_mu_unlock (&mu);
-	if (too_late_violations > TOO_LATE_ALLOWED) {
-		TEST_ERROR (t, ("nsync_mu_wait() returned too late %d (> %d) times",
-			   too_late_violations, TOO_LATE_ALLOWED));
-	}
-}
-
-/* Check cancellations on a mu wait_with_deadline(). */
-static void test_mu_cancel (testing t) {
-	int i;
-	nsync_time future_time;
-	int too_late_violations;
-	nsync_mu mu;
-	nsync_time too_early;
-	nsync_time too_late;
-
-	nsync_mu_init (&mu);
-	too_early = nsync_time_ms (TOO_EARLY_MS);
-	too_late = nsync_time_ms (TOO_LATE_MS);
-
-	/* The loops below cancel after 87 milliseconds, like the timeout tests above. */
-
-	future_time = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (3600000)); /* test cancels with timeout */
-
-	too_late_violations = 0;
-	nsync_mu_lock (&mu);
-	for (i = 0; i != 50; i++) {
-		nsync_time end_time;
-		nsync_time start_time;
-		nsync_time expected_end_time;
-		int x;
-		nsync_note cancel;
-
-		start_time = nsync_time_now (NSYNC_CLOCK);
-		expected_end_time = nsync_time_add (start_time, nsync_time_ms (87));
-		cancel = nsync_note_new (NULL, NSYNC_CLOCK, expected_end_time);
-
-		x = nsync_mu_wait_with_deadline (&mu, &false_condition, NULL, NULL,
-						 NSYNC_CLOCK, future_time, cancel);
-		if (x != ECANCELED) {
-			TEST_FATAL (t, ("nsync_mu_wait() return non-cancelled (%d) for "
-				   "a cancellation; expected %d",
-				   x, ECANCELED));
-		}
-		end_time = nsync_time_now (NSYNC_CLOCK);
-		if (nsync_time_cmp (end_time, nsync_time_sub (expected_end_time, too_early)) < 0) {
-			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
-			TEST_ERROR (t, ("nsync_mu_wait() returned %s too early", elapsed_str));
-			free (elapsed_str);
-		}
-		if (nsync_time_cmp (nsync_time_add (expected_end_time, too_late), end_time) < 0) {
-			too_late_violations++;
-		}
-
-		/* Check that an already cancelled wait returns immediately. */
-		start_time = nsync_time_now (NSYNC_CLOCK);
-		x = nsync_mu_wait_with_deadline (&mu, &false_condition, NULL, NULL,
-						 NSYNC_CLOCK, nsync_time_no_deadline,
-						 cancel);
-		if (x != ECANCELED) {
-			TEST_FATAL (t, ("nsync_mu_wait() returned non-cancelled for a "
-				   "cancellation; expected %d",
-				   x, ECANCELED));
-		}
-		end_time = nsync_time_now (NSYNC_CLOCK);
-		if (nsync_time_cmp (end_time, start_time) < 0) {
-			char *elapsed_str = nsync_time_str (nsync_time_sub (expected_end_time, end_time), 2);
-			TEST_ERROR (t, ("nsync_mu_wait() returned %s too early", elapsed_str));
-			free (elapsed_str);
-		}
-		if (nsync_time_cmp (nsync_time_add (start_time, too_late), end_time) < 0) {
-			too_late_violations++;
-		}
-		nsync_note_free (cancel);
-	}
-	nsync_mu_unlock (&mu);
-	if (too_late_violations > TOO_LATE_ALLOWED) {
-		TEST_ERROR (t, ("nsync_mu_wait() returned too late %d (> %d) times",
-			   too_late_violations, TOO_LATE_ALLOWED));
-	}
-}
diff --git a/third_party/nsync/testing/note_test.c b/third_party/nsync/testing/note_test.c
index 4321c1e75..57298683f 100644
--- a/third_party/nsync/testing/note_test.c
+++ b/third_party/nsync/testing/note_test.c
@@ -20,13 +20,12 @@
 #include "third_party/nsync/testing/smprintf.h"
 #include "third_party/nsync/testing/testing.h"
 #include "third_party/nsync/testing/time_extra.h"
-#include "libc/dce.h"
 #include "third_party/nsync/time.h"
 
 /* Verify the properties of a prenotified note. */
 static void test_note_prenotified (testing t) {
 	int i;
-	nsync_note n = nsync_note_new (NULL, NSYNC_CLOCK, nsync_time_zero /* prenotified */);
+	nsync_note n = nsync_note_new (NULL, nsync_time_zero /* prenotified */);
 	nsync_time expiry;
 	expiry = nsync_note_expiry (n);
 	if (nsync_time_cmp (expiry, nsync_time_zero) != 0) {
@@ -56,7 +55,7 @@ static void test_note_unnotified (testing t) {
 	nsync_time start;
 	nsync_time waited;
 	nsync_time deadline;
-	nsync_note n = nsync_note_new (NULL, NSYNC_CLOCK, nsync_time_no_deadline);
+	nsync_note n = nsync_note_new (NULL, nsync_time_no_deadline);
 	nsync_time expiry;
 	expiry = nsync_note_expiry (n);
 	if (nsync_time_cmp (expiry, nsync_time_no_deadline) != 0) {
@@ -69,17 +68,17 @@ static void test_note_unnotified (testing t) {
 	if (nsync_note_wait (n, nsync_time_zero)) {
 		TEST_ERROR (t, ("notified note is notified (poll)"));
 	}
-	start = nsync_time_now (NSYNC_CLOCK);
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000));
+	start = nsync_time_now ();
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1000));
 	if (nsync_note_wait (n, deadline)) {
 		TEST_ERROR (t, ("unnotified note is notified (1s wait)"));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("timed wait on unnotified note returned too quickly (1s wait took %s)",
 			   nsync_time_str (waited, 2)));
 	}
-	if (nsync_time_cmp (waited, nsync_time_ms (IsNetbsd() || IsOpenbsd() || IsFreebsd() ? 4000 : 2000)) > 0) {
+	if (nsync_time_cmp (waited, nsync_time_ms (2000)) > 0) {
 		TEST_ERROR (t, ("timed wait on unnotified note returned too slowly (1s wait took %s)",
 			   nsync_time_str (waited, 2)));
 	}
@@ -111,13 +110,13 @@ static void test_note_expiry (testing t) {
 	nsync_time deadline;
 	nsync_note n;
 
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000));
-	n = nsync_note_new (NULL, NSYNC_CLOCK, deadline);
-	start = nsync_time_now (NSYNC_CLOCK);
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1000));
+	n = nsync_note_new (NULL, deadline);
+	start = nsync_time_now ();
 	if (!nsync_note_wait (n, nsync_time_no_deadline)) {
 		TEST_ERROR (t, ("expired note is not notified"));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("note expired too quickly (1s expiry took %s)",
 			   nsync_time_str (waited, 2)));
@@ -137,13 +136,13 @@ static void test_note_expiry (testing t) {
 	}
 	nsync_note_free (n);
 
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000));
-	n = nsync_note_new (NULL, NSYNC_CLOCK, deadline);
-	start = nsync_time_now (NSYNC_CLOCK);
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1000));
+	n = nsync_note_new (NULL, deadline);
+	start = nsync_time_now ();
 	while (!nsync_note_is_notified (n)) {
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (10));
+		nsync_time_sleep (nsync_time_ms (10));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("note expired too quickly (1s expiry took %s)",
 			   nsync_time_str (waited, 2)));
@@ -165,7 +164,7 @@ static void test_note_expiry (testing t) {
 }
 
 static void notify_at (nsync_note n, nsync_time abs_deadline) {
-	nsync_time_sleep_until (NSYNC_CLOCK, abs_deadline);
+	nsync_time_sleep_until (abs_deadline);
 	nsync_note_notify (n);
 }
 
@@ -178,14 +177,14 @@ static void test_note_notify (testing t) {
 	nsync_time deadline;
 	nsync_note n;
 
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (10000));
-	n = nsync_note_new (NULL, NSYNC_CLOCK, deadline);
-	closure_fork (closure_notify (&notify_at, n, nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000))));
-	start = nsync_time_now (NSYNC_CLOCK);
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (10000));
+	n = nsync_note_new (NULL, deadline);
+	closure_fork (closure_notify (&notify_at, n, nsync_time_add (nsync_time_now (), nsync_time_ms (1000))));
+	start = nsync_time_now ();
 	if (!nsync_note_wait (n, nsync_time_no_deadline)) {
 		TEST_ERROR (t, ("expired note is not notified"));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("note expired too quickly (1s expiry took %s)",
 			   nsync_time_str (waited, 2)));
@@ -205,14 +204,14 @@ static void test_note_notify (testing t) {
 	}
 	nsync_note_free (n);
 
-	deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (10000));
-	n = nsync_note_new (NULL, NSYNC_CLOCK, deadline);
-	closure_fork (closure_notify (&notify_at, n, nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000))));
-	start = nsync_time_now (NSYNC_CLOCK);
+	deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (10000));
+	n = nsync_note_new (NULL, deadline);
+	closure_fork (closure_notify (&notify_at, n, nsync_time_add (nsync_time_now (), nsync_time_ms (1000))));
+	start = nsync_time_now ();
 	while (!nsync_note_is_notified (n)) {
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (10));
+		nsync_time_sleep (nsync_time_ms (10));
 	}
-	waited = nsync_time_sub (nsync_time_now (NSYNC_CLOCK), start);
+	waited = nsync_time_sub (nsync_time_now (), start);
 	if (nsync_time_cmp (waited, nsync_time_ms (900)) < 0) {
 		TEST_ERROR (t, ("note expired too quickly (1s expiry took %s)",
 			   nsync_time_str (waited, 2)));
@@ -254,9 +253,9 @@ static void test_note_in_tree (testing t) {
 	nsync_note node[count_i];
 
 	/* Initialize heap structure in the nodes.  No deadlines. */
-	node[0] = nsync_note_new (NULL, NSYNC_CLOCK, nsync_time_no_deadline);
+	node[0] = nsync_note_new (NULL, nsync_time_no_deadline);
 	for (i = 1; i != count_i; i++) {
-		node[i] = nsync_note_new (node[(i-1)/2], NSYNC_CLOCK, nsync_time_no_deadline);
+		node[i] = nsync_note_new (node[(i-1)/2], nsync_time_no_deadline);
 	}
 
 	/* check that the nodes are not yet notified. */
@@ -286,14 +285,14 @@ static void test_note_in_tree (testing t) {
 	}
 
 	/* Initialize heap structure in the nodes.  The focus node has a 1s deadline. */
-	node[0] = nsync_note_new (NULL, NSYNC_CLOCK, nsync_time_no_deadline);
+	node[0] = nsync_note_new (NULL, nsync_time_no_deadline);
 	for (i = 1; i != count_i; i++) {
 		nsync_time deadline;
-		deadline = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (1000));
+		deadline = nsync_time_add (nsync_time_now (), nsync_time_ms (1000));
 		if (i != focus_i) {
 			deadline = nsync_time_no_deadline;
 		}
-		node[i] = nsync_note_new (node[(i - 1) / 2], NSYNC_CLOCK, deadline);
+		node[i] = nsync_note_new (node[(i - 1) / 2], deadline);
 	}
 
 	/* check that the nodes are not yet notified. */
@@ -304,7 +303,7 @@ static void test_note_in_tree (testing t) {
 	}
 
 	/* Wait for timer to go off. */
-	nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (1100));
+	nsync_time_sleep (nsync_time_ms (1100));
 
 	/* Check that the right nodes have been notified. */
 	for (i = 0; i != count_i; i++) {
diff --git a/third_party/nsync/testing/once_test.c b/third_party/nsync/testing/once_test.c
index 0114d7286..6e411d761 100644
--- a/third_party/nsync/testing/once_test.c
+++ b/third_party/nsync/testing/once_test.c
@@ -76,7 +76,7 @@ static void once_thread (struct once_test_thread_s *lott) {
 	nsync_mu_lock (&ott_s_mu);
         s = lott->s;
 	nsync_mu_unlock (&ott_s_mu);
-        nsync_time_sleep (NSYNC_CLOCK, nsync_time_s_ns (0, 1 * 1000 * 1000));
+        nsync_time_sleep (nsync_time_s_ns (0, 1 * 1000 * 1000));
         switch (lott->id & 3) {
         case 0:  nsync_run_once (&s->once, &once_func0); break;
         case 1:  nsync_run_once_spin (&s->once, &once_func1); break;
@@ -111,7 +111,7 @@ static void test_once_run (testing t) {
                         closure_fork (closure_once_thread (&once_thread,
                                                            &ott[j]));
                 }
-                if (nsync_counter_wait (s->done, NSYNC_CLOCK,
+                if (nsync_counter_wait (s->done,
                                         nsync_time_no_deadline) != 0) {
                         TEST_ERROR (t, ("s.done not decremented to 0"));
                 }
diff --git a/third_party/nsync/testing/pingpong_test.c b/third_party/nsync/testing/pingpong_test.c
index 67eeb6ede..5d653bb95 100644
--- a/third_party/nsync/testing/pingpong_test.c
+++ b/third_party/nsync/testing/pingpong_test.c
@@ -17,9 +17,9 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/struct/timespec.h"
 #include "libc/str/str.h"
+#include "libc/sysv/consts/clock.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/thread2.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/cv.h"
 #include "third_party/nsync/mu.h"
 #include "third_party/nsync/mu_wait.h"
@@ -107,7 +107,6 @@ static void mutex_cv_ping_pong (ping_pong *pp, int parity) {
 			nsync_cv_wait_with_deadline_generic (&pp->cv[parity], &pp->mutex,
 						             &void_pthread_mutex_lock,
 						             &void_pthread_mutex_unlock,
-							     NSYNC_CLOCK,
 						             nsync_time_no_deadline, NULL);
 		}
 		pp->i++;
@@ -159,13 +158,12 @@ static void benchmark_ping_pong_mu_cv (testing t) {
 /* Run by each thread in benchmark_ping_pong_mu_cv_unexpired_deadline(). */
 static void mu_cv_unexpired_deadline_ping_pong (ping_pong *pp, int parity) {
 	nsync_time deadline_in1hour;
-	deadline_in1hour = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (3600000));
+	deadline_in1hour = nsync_time_add (nsync_time_now (), nsync_time_ms (3600000));
 	nsync_mu_lock (&pp->mu);
 	while (pp->i < pp->limit) {
 		while ((pp->i & 1) == parity) {
 			nsync_cv_wait_with_deadline (&pp->cv[parity], &pp->mu,
-						     NSYNC_CLOCK, deadline_in1hour,
-						     NULL);
+						     deadline_in1hour, NULL);
 		}
 		pp->i++;
 		nsync_cv_signal (&pp->cv[1 - parity]);
@@ -200,11 +198,11 @@ static const condition_func condition[] = { &even_ping_pong, &odd_ping_pong };
 /* Run by each thread in benchmark_ping_pong_mu_unexpired_deadline(). */
 static void mu_unexpired_deadline_ping_pong (ping_pong *pp, int parity) {
 	nsync_time deadline_in1hour;
-	deadline_in1hour = nsync_time_add (nsync_time_now (NSYNC_CLOCK), nsync_time_ms (3600000));
+	deadline_in1hour = nsync_time_add (nsync_time_now (), nsync_time_ms (3600000));
 	nsync_mu_lock (&pp->mu);
 	while (pp->i < pp->limit) {
 		nsync_mu_wait_with_deadline (&pp->mu, condition[parity], pp, NULL,
-					     NSYNC_CLOCK, deadline_in1hour, NULL);
+					     deadline_in1hour, NULL);
 		pp->i++;
 	}
 	nsync_mu_unlock (&pp->mu);
@@ -227,7 +225,7 @@ static void benchmark_ping_pong_mu_unexpired_deadline (testing t) {
 /* Run by each thread in benchmark_ping_pong_mutex_cond_unexpired_deadline(). */
 static void mutex_cond_unexpired_deadline_ping_pong (ping_pong *pp, int parity) {
 	struct timespec ts;
-	clock_gettime (NSYNC_CLOCK, &ts);
+	clock_gettime (CLOCK_REALTIME, &ts);
 	ts.tv_sec += 3600;
 	pthread_mutex_lock (&pp->mutex);
 	while (pp->i < pp->limit) {
@@ -320,7 +318,6 @@ static void rw_mutex_cv_ping_pong (ping_pong *pp, int parity) {
 			nsync_cv_wait_with_deadline_generic (&pp->cv[parity], &pp->rwmutex,
 						             &void_pthread_rwlock_wrlock,
 						             &void_pthread_rwlock_unlock,
-							     NSYNC_CLOCK,
 						             nsync_time_no_deadline, NULL);
 		}
 		pp->i++;
@@ -353,8 +350,7 @@ static void wait_n_cv_ping_pong (ping_pong *pp, int parity) {
 		while ((pp->i & 1) == parity) {
 			nsync_wait_n (&pp->mu, (void (*) (void *)) &nsync_mu_lock,
 				      (void (*) (void *)) &nsync_mu_unlock,
-				      NSYNC_CLOCK, nsync_time_no_deadline, 1,
-				      &pwaitable);
+				      nsync_time_no_deadline, 1, &pwaitable);
 		}
 		pp->i++;
 		nsync_cv_signal (&pp->cv[1 - parity]);
diff --git a/third_party/nsync/testing/start_thread.c b/third_party/nsync/testing/start_thread.c
index b025e710d..f4e122d9c 100644
--- a/third_party/nsync/testing/start_thread.c
+++ b/third_party/nsync/testing/start_thread.c
@@ -16,9 +16,6 @@
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/mem/mem.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "libc/runtime/runtime.h"
 #include "libc/thread/thread.h"
 
 struct thd_args {
@@ -38,10 +35,6 @@ void nsync_start_thread_ (void (*f) (void *), void *arg) {
 	pthread_t t;
 	args->f = f;
 	args->arg = arg;
-	errno_t err = pthread_create (&t, NULL, body, args);
-	if (err) {
-		fprintf(stderr, "pthread_create: %s\n", strerror(err));
-		exit(1);
-	}
+	pthread_create (&t, NULL, body, args);
 	pthread_detach (t);
 }
diff --git a/third_party/nsync/testing/testing.c b/third_party/nsync/testing/testing.c
index 062dee306..321e752b0 100644
--- a/third_party/nsync/testing/testing.c
+++ b/third_party/nsync/testing/testing.c
@@ -239,9 +239,9 @@ static void run_test (testing t) {
 	t->test_status = 0;
 	t->n = 0;
 	t->stop_time = nsync_time_zero;
-	t->start_time = nsync_time_now (NSYNC_CLOCK);
+	t->start_time = nsync_time_now ();
 	(*t->f) (t);
-	elapsed_str = nsync_time_str (nsync_time_sub (nsync_time_now (NSYNC_CLOCK), t->start_time), 2);
+	elapsed_str = nsync_time_str (nsync_time_sub (nsync_time_now (), t->start_time), 2);
 	if (!ATM_LOAD (&t->partial_line)) {
 		fprintf (t->fp, "%-25s %-45s  %s %8s\n", tb->prog, t->name,
 		         t->test_status != 0? "failed": "passed", elapsed_str);
@@ -275,9 +275,9 @@ static void run_benchmark (testing t) {
 		t->test_status = 0;
 		t->n = n;
 		t->stop_time = nsync_time_zero;
-		t->start_time = nsync_time_now (NSYNC_CLOCK);
+		t->start_time = nsync_time_now ();
 		(*t->f) (t);
-		elapsed = nsync_time_to_dbl (nsync_time_sub (nsync_time_now (NSYNC_CLOCK), t->start_time));
+		elapsed = nsync_time_to_dbl (nsync_time_sub (nsync_time_now (), t->start_time));
 		if (elapsed < 1e-1) {
 			elapsed = 1e-1;
 		}
@@ -445,9 +445,9 @@ int testing_is_uniprocessor (testing t) {
 
 		ATM_STORE_REL (&state, 0);
 		closure_fork (closure_uniprocessor_check (&uniprocessor_check, &state, &s[0]));
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (100));
+		nsync_time_sleep (nsync_time_ms (100));
 		ATM_STORE_REL (&state, 1);
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (400));
+		nsync_time_sleep (nsync_time_ms (400));
 		ATM_STORE_REL (&state, 2);
 		while (!ATM_LOAD_ACQ (&s[0].done)) {
 		}
@@ -455,9 +455,9 @@ int testing_is_uniprocessor (testing t) {
 		ATM_STORE_REL (&state, 0);
 		closure_fork (closure_uniprocessor_check (&uniprocessor_check, &state, &s[1]));
 		closure_fork (closure_uniprocessor_check (&uniprocessor_check, &state, &s[2]));
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (100));
+		nsync_time_sleep (nsync_time_ms (100));
 		ATM_STORE_REL (&state, 1);
-		nsync_time_sleep (NSYNC_CLOCK, nsync_time_ms (400));
+		nsync_time_sleep (nsync_time_ms (400));
 		ATM_STORE_REL (&state, 2);
 		while (!ATM_LOAD_ACQ (&s[1].done) || !ATM_LOAD_ACQ (&s[2].done)) {
 		}
@@ -472,7 +472,7 @@ void testing_stop_timer (testing t) {
 	if (nsync_time_cmp (t->stop_time, nsync_time_zero) != 0) {
 		abort ();
 	}
-	t->stop_time = nsync_time_now (NSYNC_CLOCK);
+	t->stop_time = nsync_time_now ();
 }
 
 void testing_start_timer (testing t) {
@@ -480,7 +480,7 @@ void testing_start_timer (testing t) {
 		abort ();
 	}
 	t->start_time = nsync_time_add (t->start_time,
-		nsync_time_sub (nsync_time_now (NSYNC_CLOCK), t->stop_time));
+		nsync_time_sub (nsync_time_now (), t->stop_time));
 	t->stop_time = nsync_time_zero;
 }
 
diff --git a/third_party/nsync/testing/wait_test.c b/third_party/nsync/testing/wait_test.c
index 567f35979..6e3c51161 100644
--- a/third_party/nsync/testing/wait_test.c
+++ b/third_party/nsync/testing/wait_test.c
@@ -17,7 +17,6 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/str/str.h"
 #include "third_party/nsync/array.internal.h"
-#include "third_party/nsync/time.h"
 #include "third_party/nsync/counter.h"
 #include "third_party/nsync/note.h"
 #include "third_party/nsync/testing/closure.h"
@@ -25,12 +24,10 @@
 #include "third_party/nsync/testing/testing.h"
 #include "third_party/nsync/testing/time_extra.h"
 #include "third_party/nsync/time.h"
-#include "libc/calls/calls.h"
-#include "libc/dce.h"
 #include "third_party/nsync/waiter.h"
 
 static void decrement_at (nsync_counter c, nsync_time abs_deadline, nsync_counter done) {
-	nsync_time_sleep_until (NSYNC_CLOCK, abs_deadline);
+	nsync_time_sleep_until (abs_deadline);
 	nsync_counter_add (c, -1);
 	nsync_counter_add (done, -1);
 }
@@ -38,7 +35,7 @@ static void decrement_at (nsync_counter c, nsync_time abs_deadline, nsync_counte
 CLOSURE_DECL_BODY3 (decrement, nsync_counter, nsync_time, nsync_counter)
 
 static void notify_at (nsync_note n, nsync_time abs_deadline, nsync_counter done) {
-	nsync_time_sleep_until (NSYNC_CLOCK, abs_deadline);
+	nsync_time_sleep_until (abs_deadline);
 	nsync_note_notify (n);
 	nsync_counter_add (done, -1);
 }
@@ -64,7 +61,7 @@ static void test_wait_n (testing t) {
 		a_pwaitable apw;
 		bzero (&aw, sizeof (aw));
 		bzero (&apw, sizeof (apw));
-		now = nsync_time_now (NSYNC_CLOCK);
+		now = nsync_time_now ();
 		deadline = nsync_time_add (now, nsync_time_ms (100));
 		for (j = A_LEN (&aw); A_LEN (&aw) < j+ncounter;) {
 			nsync_counter c = nsync_counter_new (0);
@@ -78,28 +75,28 @@ static void test_wait_n (testing t) {
 			}
 		}
 		for (j = A_LEN (&aw); A_LEN (&aw) < j+nnote;) {
-			nsync_note n = nsync_note_new (NULL, NSYNC_CLOCK, nsync_time_no_deadline);
+			nsync_note n = nsync_note_new (NULL, nsync_time_no_deadline);
 			struct nsync_waitable_s *w = &A_PUSH (&aw);
 			w->v = n;
 			w->funcs = &nsync_note_waitable_funcs;
 			nsync_counter_add (done, 1);
 			closure_fork (closure_notify (&notify_at, n, deadline, done));
 			for (k = 0; k != 4 && A_LEN (&aw) < j+nnote; k++) {
-				nsync_note cn = nsync_note_new (n, NSYNC_CLOCK, nsync_time_no_deadline);
+				nsync_note cn = nsync_note_new (n, nsync_time_no_deadline);
 				struct nsync_waitable_s *lw = &A_PUSH (&aw);
 				lw->v = cn;
 				lw->funcs = &nsync_note_waitable_funcs;
 			}
 		}
 		for (j = A_LEN (&aw); A_LEN (&aw) < j+nnote_expire;) {
-			nsync_note n = nsync_note_new (NULL, NSYNC_CLOCK, deadline);
+			nsync_note n = nsync_note_new (NULL, deadline);
 			struct nsync_waitable_s *w = &A_PUSH (&aw);
 			w->v = n;
 			w->funcs = &nsync_note_waitable_funcs;
 			nsync_counter_add (done, 1);
 			closure_fork (closure_notify (&notify_at, n, deadline, done));
 			for (k = 0; k != 4 && A_LEN (&aw) < j+nnote; k++) {
-				nsync_note cn = nsync_note_new (n, NSYNC_CLOCK, nsync_time_no_deadline);
+				nsync_note cn = nsync_note_new (n, nsync_time_no_deadline);
 				struct nsync_waitable_s *lw = &A_PUSH (&aw);
 				lw->v = cn;
 				lw->funcs = &nsync_note_waitable_funcs;
@@ -112,8 +109,7 @@ static void test_wait_n (testing t) {
 			A_PUSH (&apw) = &A (&aw, j);
 		}
 		while (A_LEN (&apw) != 0) {
-			k = nsync_wait_n (NULL, NULL, NULL,
-					  NSYNC_CLOCK, nsync_time_no_deadline,
+			k = nsync_wait_n (NULL, NULL, NULL, nsync_time_no_deadline,
 					  A_LEN (&apw), &A (&apw, 0));
 			if (k == A_LEN (&apw)) {
 				TEST_ERROR (t, ("nsync_wait_n returned with no waiter ready"));
@@ -121,7 +117,7 @@ static void test_wait_n (testing t) {
 			A (&apw, k) = A (&apw, A_LEN (&apw) - 1);
 			A_DISCARD (&apw, 1);
 		}
-		nsync_counter_wait (done, NSYNC_CLOCK, nsync_time_no_deadline);
+		nsync_counter_wait (done, nsync_time_no_deadline);
 		for (k = 0; k != ncounter; k++) {
 			nsync_counter_free ((nsync_counter) A (&aw, k).v);
 		}
@@ -163,7 +159,7 @@ static void test_wait_n_ready_while_queuing (testing t) {
 	wrapped_note_waitable_funcs.ready_time = &note_ready_time_wrapper;
 
 	for (count = 0; count != sizeof (w) / sizeof (w[0]); count++) {
-		nsync_note n = nsync_note_new (NULL, NSYNC_CLOCK, nsync_time_no_deadline);
+		nsync_note n = nsync_note_new (NULL, nsync_time_no_deadline);
 		if (nsync_note_is_notified (n)) {
 			TEST_ERROR (t, ("nsync_note is unexpectedly notified"));
 		}
@@ -171,8 +167,8 @@ static void test_wait_n_ready_while_queuing (testing t) {
 		w[count].funcs = &wrapped_note_waitable_funcs;
 		pw[count] = &w[count];
 	}
-	woken = nsync_wait_n (NULL, NULL, NULL, NSYNC_CLOCK,
-			      nsync_time_no_deadline, count, pw);
+	woken = nsync_wait_n (NULL, NULL, NULL, nsync_time_no_deadline,
+			      count, pw);
 	if (woken != 0) {
 		TEST_ERROR (t, ("nsync_wait_n unexpectedly failed to find pw[0] notified"));
 	}
@@ -187,9 +183,6 @@ static void test_wait_n_ready_while_queuing (testing t) {
 
 int main (int argc, char *argv[]) {
 	testing_base tb = testing_new (argc, argv, 0);
-	// TODO(jart): remove after cosmocc update when process rlimit flake is solved
-	if (IsAarch64 () && IsQemuUser ())
-		return 0;
 	TEST_RUN (tb, test_wait_n);
 	TEST_RUN (tb, test_wait_n_ready_while_queuing);
 	return (testing_base_exit (tb));
diff --git a/third_party/nsync/time.c b/third_party/nsync/time.c
deleted file mode 100644
index 996459176..000000000
--- a/third_party/nsync/time.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
-│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2016 Google Inc.                                                   │
-│                                                                              │
-│ Licensed under the Apache License, Version 2.0 (the "License");              │
-│ you may not use this file except in compliance with the License.             │
-│ You may obtain a copy of the License at                                      │
-│                                                                              │
-│     http://www.apache.org/licenses/LICENSE-2.0                               │
-│                                                                              │
-│ Unless required by applicable law or agreed to in writing, software          │
-│ distributed under the License is distributed on an "AS IS" BASIS,            │
-│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
-│ See the License for the specific language governing permissions and          │
-│ limitations under the License.                                               │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/nsync/time.h"
-
-/* Return the current time since the epoch.  */
-nsync_time nsync_time_now(int clock) {
-	nsync_time result;
-	if (clock_gettime (clock, &result))
-		__builtin_trap();
-	return result;
-}
diff --git a/third_party/nsync/time.h b/third_party/nsync/time.h
index 774555685..badd254cd 100644
--- a/third_party/nsync/time.h
+++ b/third_party/nsync/time.h
@@ -1,11 +1,8 @@
 #ifndef NSYNC_TIME_H_
 #define NSYNC_TIME_H_
-#include "libc/sysv/consts/clock.h"
 #include "libc/calls/struct/timespec.h"
 COSMOPOLITAN_C_START_
 
-#define NSYNC_CLOCK CLOCK_REALTIME
-
 #define NSYNC_TIME_SEC(t)  ((t).tv_sec)
 #define NSYNC_TIME_NSEC(t) ((t).tv_nsec)
 #define NSYNC_TIME_STATIC_INIT(t, ns) \
@@ -25,15 +22,15 @@ typedef struct timespec nsync_time;
 #define nsync_time_zero timespec_zero
 
 /* Return the current time since the epoch.  */
-nsync_time nsync_time_now(int clock);
+#define nsync_time_now() timespec_real()
 
 /* Sleep for the specified delay. Returns the unslept time which may be
    non-zero if the call was interrupted. */
-#define nsync_time_sleep(c,a) timespec_sleep(c,a)
+#define nsync_time_sleep(a) timespec_sleep(a)
 
 /* Sleep until the specified time.  Returns 0 on success, and EINTR
    if the call was interrupted. */
-#define nsync_time_sleep_until(c,a) timespec_sleep_until(c,a)
+#define nsync_time_sleep_until(a) timespec_sleep_until(a)
 
 /* Return a+b */
 #define nsync_time_add(a, b) timespec_add(a, b)
diff --git a/third_party/nsync/wait_s.internal.h b/third_party/nsync/wait_s.internal.h
index a4cb868ef..3d1d1de88 100644
--- a/third_party/nsync/wait_s.internal.h
+++ b/third_party/nsync/wait_s.internal.h
@@ -1,7 +1,6 @@
 #ifndef COSMOPOLITAN_LIBC_THREAD_WAIT_INTERNAL_H_
 #define COSMOPOLITAN_LIBC_THREAD_WAIT_INTERNAL_H_
 #include "libc/intrin/dll.h"
-#include "third_party/nsync/defs.h"
 #include "third_party/nsync/atomic.h"
 COSMOPOLITAN_C_START_
 
@@ -11,19 +10,17 @@ COSMOPOLITAN_C_START_
    with v pointing to the client's object and nw pointing to a struct
    nsync_waiter_s. */
 struct nsync_waiter_s {
-#if NSYNC_DEBUG
   uint32_t tag;                   /* used for debugging */
-#endif
   uint32_t flags;                 /* see below */
-  nsync_atomic_uint32_ waiting;   /* non-zero <=> the waiter is waiting */
   struct Dll q;                   /* used to link children of parent */
+  nsync_atomic_uint32_ waiting;   /* non-zero <=> the waiter is waiting */
   struct nsync_semaphore_s_ *sem; /* *sem will be Ved when waiter is woken */
 };
 
 /* set if waiter is embedded in Mu/CV's internal structures */
 #define NSYNC_WAITER_FLAG_MUCV 0x1
 
-void nsync_waiter_destroy_(void *);
+void nsync_waiter_destroy(void *);
 
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_THREAD_WAIT_INTERNAL_H_ */
diff --git a/third_party/nsync/waiter.h b/third_party/nsync/waiter.h
index 4bf9b801c..b1eeba29f 100644
--- a/third_party/nsync/waiter.h
+++ b/third_party/nsync/waiter.h
@@ -102,7 +102,7 @@ struct nsync_waitable_s {
    mu/lock/unlock are used to acquire and release the relevant locks
    whan waiting on condition variables. */
 int nsync_wait_n(void *mu, void (*lock)(void *), void (*unlock)(void *),
-                 int clock, nsync_time abs_deadline, int count,
+                 nsync_time abs_deadline, int count,
                  struct nsync_waitable_s *waitable[]);
 
 /* A "struct nsync_waitable_s" implementation must implement these
diff --git a/third_party/openmp/BUILD.mk b/third_party/openmp/BUILD.mk
index 7e6dde1f1..a916aa22d 100644
--- a/third_party/openmp/BUILD.mk
+++ b/third_party/openmp/BUILD.mk
@@ -33,8 +33,6 @@ THIRD_PARTY_OPENMP_A_DIRECTDEPS =				\
 	THIRD_PARTY_COMPILER_RT					\
 	THIRD_PARTY_GDTOA					\
 	THIRD_PARTY_LIBCXX					\
-	THIRD_PARTY_LIBCXXABI					\
-	THIRD_PARTY_LIBUNWIND					\
 	THIRD_PARTY_NSYNC					\
 	THIRD_PARTY_MUSL
 
diff --git a/third_party/openmp/kmp_lock.cpp b/third_party/openmp/kmp_lock.cpp
index 593d805b8..95e734536 100644
--- a/third_party/openmp/kmp_lock.cpp
+++ b/third_party/openmp/kmp_lock.cpp
@@ -23,7 +23,7 @@
 
 #if KMP_USE_FUTEX
 #ifdef __COSMOPOLITAN__
-#include <cosmo.h>
+#include "third_party/nsync/futex.internal.h"
 #else
 #include <sys/syscall.h>
 #include <unistd.h>
@@ -380,7 +380,7 @@ __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
 
     long rc;
 #ifdef __COSMOPOLITAN__
-    if ((rc = cosmo_futex_wait((int *)&(lck->lk.poll), poll_val, false, 0, NULL)) != 0) {
+    if ((rc = nsync_futex_wait_((int *)&(lck->lk.poll), poll_val, false, NULL)) != 0) {
 #else
     if ((rc = syscall(__NR_futex, (int *)&(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
                       NULL, 0)) != 0) {
@@ -462,7 +462,7 @@ int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
              ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
               lck, gtid));
 #ifdef __COSMOPOLITAN__
-    cosmo_futex_wake((int *)&(lck->lk.poll), 1, false);
+    nsync_futex_wake_((int *)&(lck->lk.poll), 1, false);
 #else
     syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
             NULL, NULL, 0);
diff --git a/third_party/pcre/BUILD.mk b/third_party/pcre/BUILD.mk
index 817aee016..3a32a0cfe 100644
--- a/third_party/pcre/BUILD.mk
+++ b/third_party/pcre/BUILD.mk
@@ -26,8 +26,7 @@ THIRD_PARTY_PCRE_A_DIRECTDEPS =				\
 	LIBC_RUNTIME					\
 	LIBC_STDIO					\
 	LIBC_STR					\
-	LIBC_SYSV					\
-	THIRD_PARTY_MUSL				\
+	LIBC_SYSV
 
 THIRD_PARTY_PCRE_A_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_PCRE_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/python/BUILD.mk b/third_party/python/BUILD.mk
index 48b001176..1fe2638b9 100644
--- a/third_party/python/BUILD.mk
+++ b/third_party/python/BUILD.mk
@@ -34,7 +34,6 @@ THIRD_PARTY_PYTHON_CHECKS =						\
 
 # TODO: Deal with aarch64 under qemu not making execve() easy.
 ifneq ($(MODE), dbg)
-ifneq ($(MODE), x86_64-dbg)
 ifeq ($(ARCH), x86_64)
 ifneq ($(UNAME_S), Windows)
 THIRD_PARTY_PYTHON_CHECKS +=						\
@@ -42,7 +41,6 @@ THIRD_PARTY_PYTHON_CHECKS +=						\
 endif
 endif
 endif
-endif
 
 ################################################################################
 # STAGE ONE - BOOTSTRAPPING PYTHON
@@ -476,7 +474,6 @@ THIRD_PARTY_PYTHON_STAGE1_A_DIRECTDEPS =				\
 	LIBC_X								\
 	THIRD_PARTY_DLMALLOC						\
 	THIRD_PARTY_GETOPT						\
-	THIRD_PARTY_MUSL						\
 	THIRD_PARTY_TZ							\
 	THIRD_PARTY_XED							\
 	TOOL_BUILD_LIB							\
@@ -529,6 +526,7 @@ THIRD_PARTY_PYTHON_STAGE2_A_SRCS =					\
 	third_party/python/runpythonmodule.c				\
 	third_party/python/launch.c					\
 	third_party/python/Objects/fromfd.c				\
+	third_party/python/Objects/unicodeobject-deadcode.c		\
 	third_party/python/Modules/_bisectmodule.c			\
 	third_party/python/Modules/_bz2module.c				\
 	third_party/python/Modules/_codecsmodule.c			\
@@ -1176,13 +1174,12 @@ THIRD_PARTY_PYTHON_STAGE2_A_DIRECTDEPS =				\
 	LIBC_NT_KERNEL32						\
 	LIBC_PROC							\
 	LIBC_RUNTIME							\
+	LIBC_THREAD							\
 	LIBC_SOCK							\
 	LIBC_STDIO							\
 	LIBC_STR							\
-	LIBC_SYSTEM							\
 	LIBC_SYSV							\
 	LIBC_SYSV_CALLS							\
-	LIBC_THREAD							\
 	LIBC_TINYMATH							\
 	LIBC_X								\
 	NET_HTTP							\
@@ -1190,14 +1187,14 @@ THIRD_PARTY_PYTHON_STAGE2_A_DIRECTDEPS =				\
 	THIRD_PARTY_BZIP2						\
 	THIRD_PARTY_GDTOA						\
 	THIRD_PARTY_LINENOISE						\
-	THIRD_PARTY_MBEDTLS						\
 	THIRD_PARTY_MUSL						\
+	THIRD_PARTY_MBEDTLS						\
 	THIRD_PARTY_PYTHON_STAGE1					\
 	THIRD_PARTY_SQLITE3						\
 	THIRD_PARTY_TZ							\
-	THIRD_PARTY_XED							\
 	THIRD_PARTY_ZLIB						\
-	TOOL_ARGS							\
+	THIRD_PARTY_XED							\
+	TOOL_ARGS
 
 THIRD_PARTY_PYTHON_STAGE2_A_DEPS =					\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_PYTHON_STAGE2_A_DIRECTDEPS),$($(x))))
@@ -1749,6 +1746,7 @@ THIRD_PARTY_PYTHON_PYTEST_A_DIRECTDEPS =					\
 THIRD_PARTY_PYTHON_PYTEST_PYMAINS =						\
 	third_party/python/Lib/test/signalinterproctester.py			\
 	third_party/python/Lib/test/test___future__.py				\
+	third_party/python/Lib/test/test__locale.py				\
 	third_party/python/Lib/test/test__opcode.py				\
 	third_party/python/Lib/test/test_abc.py					\
 	third_party/python/Lib/test/test_abstract_numbers.py			\
@@ -1845,6 +1843,7 @@ THIRD_PARTY_PYTHON_PYTEST_PYMAINS =						\
 	third_party/python/Lib/test/test_enum.py				\
 	third_party/python/Lib/test/test_enumerate.py				\
 	third_party/python/Lib/test/test_eof.py					\
+	third_party/python/Lib/test/test_epoll.py				\
 	third_party/python/Lib/test/test_errno.py				\
 	third_party/python/Lib/test/test_exception_hierarchy.py			\
 	third_party/python/Lib/test/test_exception_variations.py		\
@@ -1965,6 +1964,7 @@ THIRD_PARTY_PYTHON_PYTEST_PYMAINS =						\
 	third_party/python/Lib/test/test_string.py				\
 	third_party/python/Lib/test/test_string_literals.py			\
 	third_party/python/Lib/test/test_stringprep.py				\
+	third_party/python/Lib/test/test_strptime.py				\
 	third_party/python/Lib/test/test_strtod.py				\
 	third_party/python/Lib/test/test_struct.py				\
 	third_party/python/Lib/test/test_structmembers.py			\
@@ -2148,6 +2148,8 @@ o/$(MODE)/third_party/python/Lib/test/test_wsgiref.py.runs: private	\
 			/usr/local/etc/httpd/conf/mime.types		\
 			/usr/local/etc/mime.types
 
+o/$(MODE)/third_party/python/Lib/test/test_epoll.py.runs:		\
+		private .PLEDGE = stdio rpath wpath cpath fattr proc inet
 o/$(MODE)/third_party/python/Lib/test/test_wsgiref.py.runs:		\
 		private .PLEDGE = stdio rpath wpath cpath fattr proc inet
 o/$(MODE)/third_party/python/Lib/test/test_fcntl.py.runs:		\
@@ -2196,8 +2198,8 @@ o/$(MODE)/third_party/python/Lib/test/test_binhex.py.runs: $(PYTHONTESTER)
 o/$(MODE)/third_party/python/Lib/test/test_capi.py.runs: $(PYTHONTESTER)
 	@$(COMPILE) -ACHECK -wtT$@ $(PYHARNESSARGS) $(PYTHONTESTER) -m test.test_capi $(PYTESTARGS)
 
-# o/$(MODE)/third_party/python/Lib/test/test__locale.py.runs: $(PYTHONTESTER)
-# 	@$(COMPILE) -ACHECK -wtT$@ $(PYHARNESSARGS) $(PYTHONTESTER) -m test.test__locale $(PYTESTARGS)
+o/$(MODE)/third_party/python/Lib/test/test__locale.py.runs: $(PYTHONTESTER)
+	@$(COMPILE) -ACHECK -wtT$@ $(PYHARNESSARGS) $(PYTHONTESTER) -m test.test__locale $(PYTESTARGS)
 
 o/$(MODE)/third_party/python/Lib/test/test_binop.py.runs: $(PYTHONTESTER)
 	@$(COMPILE) -ACHECK -wtT$@ $(PYHARNESSARGS) $(PYTHONTESTER) -m test.test_binop $(PYTESTARGS)
@@ -2785,6 +2787,9 @@ o/$(MODE)/third_party/python/Lib/test/test_dis.py.runs: $(PYTHONTESTER)
 o/$(MODE)/third_party/python/Lib/test/test_asyncore.py.runs: $(PYTHONTESTER)
 	@$(COMPILE) -ACHECK -wtT$@ $(PYHARNESSARGS) $(PYTHONTESTER) -m test.test_asyncore $(PYTESTARGS)
 
+o/$(MODE)/third_party/python/Lib/test/test_epoll.py.runs: $(PYTHONTESTER)
+	@$(COMPILE) -ACHECK -wtT$@ $(PYHARNESSARGS) $(PYTHONTESTER) -m test.test_epoll $(PYTESTARGS)
+
 o/$(MODE)/third_party/python/Lib/test/test_cmd_line.py.runs: $(PYTHONTESTER)
 	@$(COMPILE) -ACHECK -wtT$@ $(PYHARNESSARGS) $(PYTHONTESTER) -m test.test_cmd_line $(PYTESTARGS)
 
diff --git a/third_party/python/Include/pyatomic.h b/third_party/python/Include/pyatomic.h
index c49ccf4f4..b1b49d9de 100644
--- a/third_party/python/Include/pyatomic.h
+++ b/third_party/python/Include/pyatomic.h
@@ -2,7 +2,6 @@
 #define Py_ATOMIC_H
 #include "libc/assert.h"
 #include "third_party/python/Include/dynamic_annotations.h"
-#include "libc/intrin/atomic.h"
 #include "third_party/python/pyconfig.h"
 
 /* This is modeled after the atomics interface from C1x, according to
diff --git a/third_party/python/Include/pyctype.h b/third_party/python/Include/pyctype.h
index dabcae58a..58ad51ca4 100644
--- a/third_party/python/Include/pyctype.h
+++ b/third_party/python/Include/pyctype.h
@@ -1,7 +1,7 @@
 #ifndef Py_LIMITED_API
 #ifndef PYCTYPE_H
 #define PYCTYPE_H
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 #define Py_TOLOWER(c) kToLower[255 & (c)]
 #define Py_TOUPPER(c) kToUpper[255 & (c)]
diff --git a/third_party/python/Lib/_sysconfigdata_m_cosmo_x86_64_cosmo.py b/third_party/python/Lib/_sysconfigdata_m_cosmo_x86_64_cosmo.py
index 349ed7400..f371f4236 100644
--- a/third_party/python/Lib/_sysconfigdata_m_cosmo_x86_64_cosmo.py
+++ b/third_party/python/Lib/_sysconfigdata_m_cosmo_x86_64_cosmo.py
@@ -486,7 +486,7 @@ build_time_vars = {'ABIFLAGS': 'm',
  'HAVE_SYS_DEVPOLL_H': 0,
  'HAVE_SYS_DIR_H': 1,
  'HAVE_SYS_ENDIAN_H': 0,
- 'HAVE_SYS_EPOLL_H': 0,
+ 'HAVE_SYS_EPOLL_H': 1,
  'HAVE_SYS_EVENT_H': 0,
  'HAVE_SYS_FILE_H': 1,
  'HAVE_SYS_IOCTL_H': 1,
diff --git a/third_party/python/Lib/test/test_re.py b/third_party/python/Lib/test/test_re.py
index 1a492c6a6..55871b8a3 100644
--- a/third_party/python/Lib/test/test_re.py
+++ b/third_party/python/Lib/test/test_re.py
@@ -1754,11 +1754,11 @@ SUBPATTERN None 0 0
                 self.skipTest('test needs %s locale' % loc)
 
         re.purge()
-        # self.check_en_US_iso88591()
+        self.check_en_US_iso88591()
         self.check_en_US_utf8()
         re.purge()
         self.check_en_US_utf8()
-        # self.check_en_US_iso88591()
+        self.check_en_US_iso88591()
 
     def check_en_US_iso88591(self):
         locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
diff --git a/third_party/python/Modules/_hashmbedtls.c b/third_party/python/Modules/_hashmbedtls.c
index 75db2c482..26202c7b5 100644
--- a/third_party/python/Modules/_hashmbedtls.c
+++ b/third_party/python/Modules/_hashmbedtls.c
@@ -18,7 +18,7 @@
 #define PY_SSIZE_T_CLEAN
 #include "libc/calls/calls.h"
 #include "libc/log/backtrace.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
diff --git a/third_party/python/Modules/selectmodule.c b/third_party/python/Modules/selectmodule.c
index e79dafb11..37ba974fa 100644
--- a/third_party/python/Modules/selectmodule.c
+++ b/third_party/python/Modules/selectmodule.c
@@ -9,9 +9,12 @@
 #include "libc/errno.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
+#include "libc/nt/efi.h"
+#include "libc/sock/epoll.h"
 #include "libc/sock/select.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/pollfd.h"
+#include "libc/sysv/consts/epoll.h"
 #include "libc/sysv/consts/poll.h"
 #include "third_party/python/Include/abstract.h"
 #include "third_party/python/Include/boolobject.h"
@@ -32,6 +35,21 @@
 #include "third_party/python/pyconfig.h"
 
 PYTHON_PROVIDE("select");
+PYTHON_PROVIDE("select.EPOLLERR");
+PYTHON_PROVIDE("select.EPOLLET");
+PYTHON_PROVIDE("select.EPOLLEXCLUSIVE");
+PYTHON_PROVIDE("select.EPOLLHUP");
+PYTHON_PROVIDE("select.EPOLLIN");
+PYTHON_PROVIDE("select.EPOLLMSG");
+PYTHON_PROVIDE("select.EPOLLONESHOT");
+PYTHON_PROVIDE("select.EPOLLOUT");
+PYTHON_PROVIDE("select.EPOLLPRI");
+PYTHON_PROVIDE("select.EPOLLRDBAND");
+PYTHON_PROVIDE("select.EPOLLRDHUP");
+PYTHON_PROVIDE("select.EPOLLRDNORM");
+PYTHON_PROVIDE("select.EPOLLWRBAND");
+PYTHON_PROVIDE("select.EPOLLWRNORM");
+PYTHON_PROVIDE("select.EPOLL_CLOEXEC");
 PYTHON_PROVIDE("select.POLLERR");
 PYTHON_PROVIDE("select.POLLHUP");
 PYTHON_PROVIDE("select.POLLIN");
@@ -43,6 +61,7 @@ PYTHON_PROVIDE("select.POLLRDHUP");
 PYTHON_PROVIDE("select.POLLRDNORM");
 PYTHON_PROVIDE("select.POLLWRBAND");
 PYTHON_PROVIDE("select.POLLWRNORM");
+PYTHON_PROVIDE("select.epoll");
 PYTHON_PROVIDE("select.error");
 PYTHON_PROVIDE("select.poll");
 PYTHON_PROVIDE("select.select");
diff --git a/third_party/python/Modules/socketmodule.c b/third_party/python/Modules/socketmodule.c
index a8e368268..a44040bbb 100644
--- a/third_party/python/Modules/socketmodule.c
+++ b/third_party/python/Modules/socketmodule.c
@@ -52,8 +52,6 @@
 #include "third_party/python/Include/warnings.h"
 #include "third_party/python/Include/yoink.h"
 #include "third_party/musl/netdb.h"
-#include "libc/sysv/consts/af.h"
-#include "libc/sysv/consts/af.h"
 #include "third_party/python/pyconfig.h"
 
 PYTHON_PROVIDE("_socket");
@@ -1045,15 +1043,16 @@ setipaddr(const char *name, struct sockaddr *addr_ret, size_t addr_ret_size, int
             set_gaierror(error);
             return -1;
         }
-        if (res->ai_family == AF_INET) {
+        switch (res->ai_family) {
+        case AF_INET:
             siz = 4;
-        }
+            break;
 #ifdef ENABLE_IPV6
-        else if (res->ai_family == AF_INET6) {
+        case AF_INET6:
             siz = 16;
-        }
+            break;
 #endif
-        else {
+        default:
             freeaddrinfo(res);
             PyErr_SetString(PyExc_OSError,
                 "unsupported address family");
@@ -1160,14 +1159,17 @@ setipaddr(const char *name, struct sockaddr *addr_ret, size_t addr_ret_size, int
         addr_ret_size = res->ai_addrlen;
     memcpy((char *) addr_ret, res->ai_addr, addr_ret_size);
     freeaddrinfo(res);
-    if (addr_ret->sa_family == AF_INET)
+    switch (addr_ret->sa_family) {
+    case AF_INET:
         return 4;
 #ifdef ENABLE_IPV6
-    if (addr_ret->sa_family == AF_INET6)
+    case AF_INET6:
         return 16;
 #endif
-    PyErr_SetString(PyExc_OSError, "unknown address family");
-    return -1;
+    default:
+        PyErr_SetString(PyExc_OSError, "unknown address family");
+        return -1;
+    }
 }
 
 
@@ -6790,9 +6792,24 @@ PyInit__socket(void)
     PyModule_AddIntMacro(m, MSG_TRUNC);
     PyModule_AddIntMacro(m, MSG_CTRUNC);
     PyModule_AddIntMacro(m, MSG_WAITALL);
-    PyModule_AddIntMacro(m, MSG_DONTWAIT);
-    PyModule_AddIntMacro(m, MSG_NOSIGNAL);
-    if (MSG_FASTOPEN != -1) PyModule_AddIntMacro(m, MSG_FASTOPEN);
+    if (MSG_DONTWAIT) PyModule_AddIntMacro(m, MSG_DONTWAIT);
+    if (MSG_EOR) PyModule_AddIntMacro(m, MSG_EOR);
+    if (MSG_NOSIGNAL) PyModule_AddIntMacro(m, MSG_NOSIGNAL);
+    if (MSG_BCAST) PyModule_AddIntMacro(m, MSG_BCAST);
+    if (MSG_MCAST) PyModule_AddIntMacro(m, MSG_MCAST);
+    if (MSG_CMSG_CLOEXEC) PyModule_AddIntMacro(m, MSG_CMSG_CLOEXEC);
+    if (MSG_ERRQUEUE) PyModule_AddIntMacro(m, MSG_ERRQUEUE);
+    if (MSG_CONFIRM) PyModule_AddIntMacro(m, MSG_CONFIRM);
+    if (MSG_MORE) PyModule_AddIntMacro(m, MSG_MORE);
+    if (MSG_NOTIFICATION) PyModule_AddIntMacro(m, MSG_NOTIFICATION);
+    if (MSG_EOF) PyModule_AddIntMacro(m, MSG_EOF);
+    if (MSG_FASTOPEN) PyModule_AddIntMacro(m, MSG_FASTOPEN);
+#ifdef MSG_BTAG
+    if (MSG_BTAG) PyModule_AddIntMacro(m, MSG_BTAG);
+#endif
+#ifdef MSG_ETAG
+    if (MSG_ETAG) PyModule_AddIntMacro(m, MSG_ETAG);
+#endif
 
     /* Protocol level and numbers, usable for [gs]etsockopt */
     PyModule_AddIntMacro(m, SOL_SOCKET);
@@ -7000,6 +7017,12 @@ PyInit__socket(void)
     PyModule_AddIntMacro(m, IP_MULTICAST_LOOP);
     PyModule_AddIntMacro(m, IP_DEFAULT_MULTICAST_TTL);
     PyModule_AddIntMacro(m, IP_DEFAULT_MULTICAST_LOOP);
+    PyModule_AddIntMacro(m, IP_MAX_MEMBERSHIPS);
+    if (IP_RECVOPTS) PyModule_AddIntMacro(m, IP_RECVOPTS);
+    if (IP_RECVRETOPTS) PyModule_AddIntMacro(m, IP_RECVRETOPTS);
+    if (IP_RECVDSTADDR) PyModule_AddIntMacro(m, IP_RECVDSTADDR);
+    if (IP_RETOPTS) PyModule_AddIntMacro(m, IP_RETOPTS);
+    if (IP_TRANSPARENT) PyModule_AddIntMacro(m, IP_TRANSPARENT);
 
 #ifdef ENABLE_IPV6
     /* IPv6 [gs]etsockopt options, defined in RFC2553 */
diff --git a/third_party/python/Modules/socketmodule.h b/third_party/python/Modules/socketmodule.h
index 05c1e6239..c135b85a7 100644
--- a/third_party/python/Modules/socketmodule.h
+++ b/third_party/python/Modules/socketmodule.h
@@ -3,7 +3,6 @@
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/sockaddr.h"
 #include "third_party/python/Include/object.h"
-#include "libc/sock/struct/sockaddr6.h"
 #include "third_party/python/Include/pytime.h"
 COSMOPOLITAN_C_START_
 
diff --git a/third_party/python/Modules/timemodule.c b/third_party/python/Modules/timemodule.c
index 136696c69..9dbabc98c 100644
--- a/third_party/python/Modules/timemodule.c
+++ b/third_party/python/Modules/timemodule.c
@@ -1052,9 +1052,14 @@ _PyTime_GetProcessTimeWithInfo(_PyTime_t *tp, _Py_clock_info_t *info)
         *tp = (ReadFileTime(kernel_time) + ReadFileTime(user_time)) * 100;
         return 0;
     }
-    if (CLOCK_PROCESS_CPUTIME_ID != -1) {
-        clk_id = CLOCK_PROCESS_CPUTIME_ID;
-        function = "clock_gettime(CLOCK_PROCESS_CPUTIME_ID)";
+    if (CLOCK_PROF != -1 || CLOCK_PROCESS_CPUTIME_ID != -1) {
+        if (CLOCK_PROF != -1) {
+            clk_id = CLOCK_PROF;
+            function = "clock_gettime(CLOCK_PROF)";
+        } else {
+            clk_id = CLOCK_PROCESS_CPUTIME_ID;
+            function = "clock_gettime(CLOCK_PROCESS_CPUTIME_ID)";
+        }
         if (!clock_gettime(clk_id, &ts)) {
             if (info) {
                 info->implementation = function;
diff --git a/third_party/python/Modules/tlsmodule.c b/third_party/python/Modules/tlsmodule.c
index 6ef1d7762..1fca255fa 100644
--- a/third_party/python/Modules/tlsmodule.c
+++ b/third_party/python/Modules/tlsmodule.c
@@ -19,7 +19,7 @@
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/str/str.h"
 #include "net/https/https.h"
diff --git a/third_party/python/Modules/tokenbucket.c b/third_party/python/Modules/tokenbucket.c
index 4f9bb7ced..22cd4f05a 100644
--- a/third_party/python/Modules/tokenbucket.c
+++ b/third_party/python/Modules/tokenbucket.c
@@ -24,7 +24,7 @@
 #include "libc/calls/struct/timespec.h"
 #include "libc/errno.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/sockaddr.h"
diff --git a/third_party/python/Objects/unicodeobject-deadcode.c b/third_party/python/Objects/unicodeobject-deadcode.c
new file mode 100644
index 000000000..a007f1ec4
--- /dev/null
+++ b/third_party/python/Objects/unicodeobject-deadcode.c
@@ -0,0 +1,430 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Python 3                                                                     │
+│ https://docs.python.org/3/license.html                                       │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#define PY_SSIZE_T_CLEAN
+#include "libc/assert.h"
+#include "third_party/python/Include/codecs.h"
+#include "third_party/python/Include/pyerrors.h"
+#include "third_party/python/Include/pymem.h"
+#include "third_party/python/Include/unicodeobject.h"
+#include "third_party/python/Include/warnings.h"
+
+#define _PyUnicode_STATE(op)                            \
+    (((PyASCIIObject *)(op))->state)
+
+int ensure_unicode(PyObject *);
+PyObject *unicode_result(PyObject *);
+int unicode_check_modifiable(PyObject *);
+PyObject *unicode_encode_ucs1(PyObject *, const char *, const Py_UCS4);
+PyObject *_PyUnicode_TranslateCharmap(PyObject *, PyObject *, const char *);
+
+/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
+   This function is kept for backward compatibility with the old API. */
+Py_UNICODE
+PyUnicode_GetMax(void)
+{
+#ifdef Py_UNICODE_WIDE
+    return 0x10FFFF;
+#else
+    /* This is actually an illegal character, so it should
+       not be passed to unichr. */
+    return 0xFFFF;
+#endif
+}
+
+PyObject *
+PyUnicode_AsDecodedObject(PyObject *unicode,
+                          const char *encoding,
+                          const char *errors)
+{
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    if (PyErr_WarnEx(PyExc_DeprecationWarning,
+                     "PyUnicode_AsDecodedObject() is deprecated; "
+                     "use PyCodec_Decode() to decode from str", 1) < 0)
+        return NULL;
+    if (encoding == NULL)
+        encoding = PyUnicode_GetDefaultEncoding();
+    /* Decode via the codec registry */
+    return PyCodec_Decode(unicode, encoding, errors);
+}
+
+PyObject *
+PyUnicode_AsDecodedUnicode(PyObject *unicode,
+                           const char *encoding,
+                           const char *errors)
+{
+    PyObject *v;
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        goto onError;
+    }
+    if (PyErr_WarnEx(PyExc_DeprecationWarning,
+                     "PyUnicode_AsDecodedUnicode() is deprecated; "
+                     "use PyCodec_Decode() to decode from str to str", 1) < 0)
+        return NULL;
+    if (encoding == NULL)
+        encoding = PyUnicode_GetDefaultEncoding();
+    /* Decode via the codec registry */
+    v = PyCodec_Decode(unicode, encoding, errors);
+    if (v == NULL)
+        goto onError;
+    if (!PyUnicode_Check(v)) {
+        PyErr_Format(PyExc_TypeError,
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name);
+        Py_DECREF(v);
+        goto onError;
+    }
+    return unicode_result(v);
+  onError:
+    return NULL;
+}
+
+PyObject *
+PyUnicode_AsEncodedObject(PyObject *unicode,
+                          const char *encoding,
+                          const char *errors)
+{
+    PyObject *v;
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        goto onError;
+    }
+    if (PyErr_WarnEx(PyExc_DeprecationWarning,
+                     "PyUnicode_AsEncodedObject() is deprecated; "
+                     "use PyUnicode_AsEncodedString() to encode from str to bytes "
+                     "or PyCodec_Encode() for generic encoding", 1) < 0)
+        return NULL;
+    if (encoding == NULL)
+        encoding = PyUnicode_GetDefaultEncoding();
+    /* Encode via the codec registry */
+    v = PyCodec_Encode(unicode, encoding, errors);
+    if (v == NULL)
+        goto onError;
+    return v;
+  onError:
+    return NULL;
+}
+
+PyObject *
+PyUnicode_AsEncodedUnicode(PyObject *unicode,
+                           const char *encoding,
+                           const char *errors)
+{
+    PyObject *v;
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        goto onError;
+    }
+    if (PyErr_WarnEx(PyExc_DeprecationWarning,
+                     "PyUnicode_AsEncodedUnicode() is deprecated; "
+                     "use PyCodec_Encode() to encode from str to str", 1) < 0)
+        return NULL;
+    if (encoding == NULL)
+        encoding = PyUnicode_GetDefaultEncoding();
+    /* Encode via the codec registry */
+    v = PyCodec_Encode(unicode, encoding, errors);
+    if (v == NULL)
+        goto onError;
+    if (!PyUnicode_Check(v)) {
+        PyErr_Format(PyExc_TypeError,
+                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
+                     "use codecs.encode() to encode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name);
+        Py_DECREF(v);
+        goto onError;
+    }
+    return v;
+  onError:
+    return NULL;
+}
+
+wchar_t *
+_PyUnicode_AsWideCharString(PyObject *unicode)
+{
+    const wchar_t *wstr;
+    wchar_t *buffer;
+    Py_ssize_t buflen;
+    if (unicode == NULL) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+    wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
+    if (wstr == NULL) {
+        return NULL;
+    }
+    if (wcslen(wstr) != (size_t)buflen) {
+        PyErr_SetString(PyExc_ValueError,
+                        "embedded null character");
+        return NULL;
+    }
+    buffer = PyMem_NEW(wchar_t, buflen + 1);
+    if (buffer == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
+    return buffer;
+}
+
+const Py_UNICODE *
+_PyUnicode_AsUnicode(PyObject *unicode)
+{
+    Py_ssize_t size;
+    const Py_UNICODE *wstr;
+    wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
+    if (wstr && wcslen(wstr) != (size_t)size) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        return NULL;
+    }
+    return wstr;
+}
+
+Py_ssize_t
+PyUnicode_GetSize(PyObject *unicode)
+{
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        goto onError;
+    }
+    return PyUnicode_GET_SIZE(unicode);
+  onError:
+    return -1;
+}
+
+int
+PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
+{
+    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
+        PyErr_BadArgument();
+        return -1;
+    }
+    assert(PyUnicode_IS_READY(unicode));
+    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
+        PyErr_SetString(PyExc_IndexError, "string index out of range");
+        return -1;
+    }
+    if (unicode_check_modifiable(unicode))
+        return -1;
+    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
+        PyErr_SetString(PyExc_ValueError, "character out of range");
+        return -1;
+    }
+    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
+                    index, ch);
+    return 0;
+}
+
+/* Deprecated */
+PyObject *
+PyUnicode_EncodeLatin1(const Py_UNICODE *p,
+                       Py_ssize_t size,
+                       const char *errors)
+{
+    PyObject *result;
+    PyObject *unicode = PyUnicode_FromUnicode(p, size);
+    if (unicode == NULL)
+        return NULL;
+    result = unicode_encode_ucs1(unicode, errors, 256);
+    Py_DECREF(unicode);
+    return result;
+}
+
+/* Deprecated */
+PyObject *
+PyUnicode_EncodeASCII(const Py_UNICODE *p,
+                      Py_ssize_t size,
+                      const char *errors)
+{
+    PyObject *result;
+    PyObject *unicode = PyUnicode_FromUnicode(p, size);
+    if (unicode == NULL)
+        return NULL;
+    result = unicode_encode_ucs1(unicode, errors, 128);
+    Py_DECREF(unicode);
+    return result;
+}
+
+PyObject *
+PyUnicode_Encode(const Py_UNICODE *s,
+                 Py_ssize_t size,
+                 const char *encoding,
+                 const char *errors)
+{
+    PyObject *v, *unicode;
+    unicode = PyUnicode_FromUnicode(s, size);
+    if (unicode == NULL)
+        return NULL;
+    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
+    Py_DECREF(unicode);
+    return v;
+}
+
+/* Deprecated */
+PyObject *
+PyUnicode_EncodeCharmap(const Py_UNICODE *p,
+                        Py_ssize_t size,
+                        PyObject *mapping,
+                        const char *errors)
+{
+    PyObject *result;
+    PyObject *unicode = PyUnicode_FromUnicode(p, size);
+    if (unicode == NULL)
+        return NULL;
+    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
+    Py_DECREF(unicode);
+    return result;
+}
+
+/* Deprecated. Use PyUnicode_Translate instead. */
+PyObject *
+PyUnicode_TranslateCharmap(const Py_UNICODE *p,
+                           Py_ssize_t size,
+                           PyObject *mapping,
+                           const char *errors)
+{
+    PyObject *result;
+    PyObject *unicode = PyUnicode_FromUnicode(p, size);
+    if (!unicode)
+        return NULL;
+    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
+    Py_DECREF(unicode);
+    return result;
+}
+
+void
+PyUnicode_InternImmortal(PyObject **p)
+{
+    PyUnicode_InternInPlace(p);
+    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
+        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
+        Py_INCREF(*p);
+    }
+}
+
+Py_UNICODE*
+Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
+{
+    Py_UNICODE *u = s1;
+    while ((*u++ = *s2++));
+    return s1;
+}
+
+Py_UNICODE*
+Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
+{
+    Py_UNICODE *u = s1;
+    while ((*u++ = *s2++))
+        if (n-- == 0)
+            break;
+    return s1;
+}
+
+Py_UNICODE*
+Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
+{
+    Py_UNICODE *u1 = s1;
+    u1 += Py_UNICODE_strlen(u1);
+    Py_UNICODE_strcpy(u1, s2);
+    return s1;
+}
+
+int
+Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
+{
+    while (*s1 && *s2 && *s1 == *s2)
+        s1++, s2++;
+    if (*s1 && *s2)
+        return (*s1 < *s2) ? -1 : +1;
+    if (*s1)
+        return 1;
+    if (*s2)
+        return -1;
+    return 0;
+}
+
+int
+Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
+{
+    Py_UNICODE u1, u2;
+    for (; n != 0; n--) {
+        u1 = *s1;
+        u2 = *s2;
+        if (u1 != u2)
+            return (u1 < u2) ? -1 : +1;
+        if (u1 == '\0')
+            return 0;
+        s1++;
+        s2++;
+    }
+    return 0;
+}
+
+Py_UNICODE*
+Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
+{
+    const Py_UNICODE *p;
+    for (p = s; *p; p++)
+        if (*p == c)
+            return (Py_UNICODE*)p;
+    return NULL;
+}
+
+Py_UNICODE*
+Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
+{
+    const Py_UNICODE *p;
+    p = s + Py_UNICODE_strlen(s);
+    while (p != s) {
+        p--;
+        if (*p == c)
+            return (Py_UNICODE*)p;
+    }
+    return NULL;
+}
+
+size_t
+Py_UNICODE_strlen(const Py_UNICODE *u)
+{
+    int res = 0;
+    while(*u++)
+        res++;
+    return res;
+}
+
+Py_UNICODE*
+PyUnicode_AsUnicodeCopy(PyObject *unicode)
+{
+    Py_UNICODE *u, *copy;
+    Py_ssize_t len, size;
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
+    if (u == NULL)
+        return NULL;
+    /* Ensure we won't overflow the size. */
+    if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    size = len + 1; /* copy the null character */
+    size *= sizeof(Py_UNICODE);
+    copy = PyMem_Malloc(size);
+    if (copy == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    memcpy(copy, u, size);
+    return copy;
+}
diff --git a/third_party/python/Objects/unicodeobject.c b/third_party/python/Objects/unicodeobject.c
index 4bc672f47..37ed619e7 100644
--- a/third_party/python/Objects/unicodeobject.c
+++ b/third_party/python/Objects/unicodeobject.c
@@ -3158,37 +3158,6 @@ PyUnicode_AsWideCharString(PyObject *unicode,
     return buffer;
 }
 
-wchar_t*
-_PyUnicode_AsWideCharString(PyObject *unicode)
-{
-    const wchar_t *wstr;
-    wchar_t *buffer;
-    Py_ssize_t buflen;
-
-    if (unicode == NULL) {
-        PyErr_BadInternalCall();
-        return NULL;
-    }
-
-    wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
-    if (wstr == NULL) {
-        return NULL;
-    }
-    if (wcslen(wstr) != (size_t)buflen) {
-        PyErr_SetString(PyExc_ValueError,
-                        "embedded null character");
-        return NULL;
-    }
-
-    buffer = PyMem_NEW(wchar_t, buflen + 1);
-    if (buffer == NULL) {
-        PyErr_NoMemory();
-        return NULL;
-    }
-    memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
-    return buffer;
-}
-
 PyObject *
 PyUnicode_FromOrdinal(int ordinal)
 {
diff --git a/third_party/python/Python/cosmomodule.c b/third_party/python/Python/cosmomodule.c
index 1ce9343bd..fa82214ed 100644
--- a/third_party/python/Python/cosmomodule.c
+++ b/third_party/python/Python/cosmomodule.c
@@ -23,7 +23,7 @@
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/popcnt.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/crc32.h"
diff --git a/third_party/python/Python/import.c b/third_party/python/Python/import.c
index 9b8c1d723..1ed177be9 100644
--- a/third_party/python/Python/import.c
+++ b/third_party/python/Python/import.c
@@ -10,7 +10,7 @@
 #include "libc/calls/struct/stat.macros.h"
 #include "libc/fmt/conv.h"
 #include "libc/fmt/libgen.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/third_party/python/Python/marshal.c b/third_party/python/Python/marshal.c
index 53984862a..573e7689b 100644
--- a/third_party/python/Python/marshal.c
+++ b/third_party/python/Python/marshal.c
@@ -5,6 +5,7 @@
 │ https://docs.python.org/3/license.html                                       │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #define PY_SSIZE_T_CLEAN
+#include "dsp/mpeg/video.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/weirdtypes.h"
 #include "libc/mem/mem.h"
diff --git a/third_party/python/Python/random.c b/third_party/python/Python/random.c
index 8f3fc9b75..673907859 100644
--- a/third_party/python/Python/random.c
+++ b/third_party/python/Python/random.c
@@ -9,7 +9,7 @@
 #include "libc/calls/weirdtypes.h"
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/rdtsc.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/rand.h"
diff --git a/third_party/python/pyconfig.h b/third_party/python/pyconfig.h
index fe9749bd4..1cfe5bb40 100644
--- a/third_party/python/pyconfig.h
+++ b/third_party/python/pyconfig.h
@@ -122,8 +122,8 @@
 #define HAVE_DIRENT_D_TYPE 1
 #define HAVE_DUP2          1
 #define HAVE_DUP3          1
-// #define HAVE_EPOLL         1
-// #define HAVE_EPOLL_CREATE1 1
+#define HAVE_EPOLL         1
+#define HAVE_EPOLL_CREATE1 1
 #define HAVE_ERF           1
 #define HAVE_ERFC          1
 #define HAVE_EXECV         1
@@ -318,9 +318,8 @@
 #define HAVE_WAIT4            1
 #define HAVE_WAITPID          1
 #define HAVE_STATVFS          1
-#define HAVE_STD_ATOMIC       1
-#define HAVE_MREMAP           1
 
+/* #define HAVE_MREMAP 1 */
 /* #undef HAVE_PLOCK */
 /* #undef HAVE_POSIX_FALLOCATE */
 /* #undef HAVE_PRLIMIT */
@@ -336,15 +335,16 @@
 /* #undef HAVE_SIGWAITINFO */
 /* #undef HAVE_SOCKADDR_ALG */
 /* #undef HAVE_SOCKADDR_SA_LEN */
+/* #undef HAVE_STD_ATOMIC */
 
 #define HAVE_SNPRINTF 1
 #define HAVE_STRDUP   1
 #define HAVE_STRFTIME 1
 #define HAVE_STRLCPY  1
 #define HAVE_WMEMCMP  1
-#define HAVE_WCSCOLL  1
-#define HAVE_WCSXFRM  1
-#define HAVE_WCSFTIME 1
+/* #undef HAVE_WCSCOLL */
+/* #undef HAVE_WCSFTIME */
+/* #undef HAVE_WCSXFRM */
 
 #define HAVE_USABLE_WCHAR_T                 1
 #define HAVE_SOCKETPAIR                     1
@@ -532,7 +532,7 @@
 /* define to 1 if your sem_getvalue is broken. */
 /* #define HAVE_BROKEN_SEM_GETVALUE 1 */
 /* Define if --enable-ipv6 is specified */
-// #define ENABLE_IPV6 1
+/* #undef ENABLE_IPV6 */
 /* Define if flock needs to be linked with bsd library. */
 /* #undef FLOCK_NEEDS_LIBBSD */
 /* Define if getpgrp() must be called as getpgrp(0). */
diff --git a/third_party/python/pyobj.c b/third_party/python/pyobj.c
index 927155d3f..4f9def529 100644
--- a/third_party/python/pyobj.c
+++ b/third_party/python/pyobj.c
@@ -24,7 +24,7 @@
 #include "libc/fmt/conv.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
@@ -35,7 +35,7 @@
 #include "libc/sysv/consts/o.h"
 #include "libc/time.h"
 #include "libc/x/x.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "third_party/getopt/getopt.internal.h"
 #include "third_party/python/Include/abstract.h"
 #include "third_party/python/Include/bytesobject.h"
diff --git a/third_party/python/python3.c b/third_party/python/python3.c
index 8f6bf3865..907bd32a7 100644
--- a/third_party/python/python3.c
+++ b/third_party/python/python3.c
@@ -6,7 +6,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/python/Include/yoink.h"
 #include "third_party/python/runpythonmodule.h"
-#include "libc/cosmo.h"
+#include "tool/args/args.h"
 
 PYTHON_YOINK("xed");
 PYTHON_YOINK("xterm");
diff --git a/third_party/python/pythontester.c b/third_party/python/pythontester.c
index e55e63edd..07549673b 100644
--- a/third_party/python/pythontester.c
+++ b/third_party/python/pythontester.c
@@ -8,7 +8,7 @@
 #include "libc/runtime/runtime.h"
 #include "third_party/python/Include/yoink.h"
 #include "third_party/python/runpythonmodule.h"
-#include "libc/cosmo.h"
+#include "tool/args/args.h"
 
 int
 main(int argc, char **argv)
diff --git a/third_party/python/repl.c b/third_party/python/repl.c
index 4b4aa7c22..9528e4e83 100644
--- a/third_party/python/repl.c
+++ b/third_party/python/repl.c
@@ -6,7 +6,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/python/Include/yoink.h"
 #include "third_party/python/runpythonmodule.h"
-#include "libc/cosmo.h"
+#include "tool/args/args.h"
 
 int
 main(int argc, char **argv)
diff --git a/third_party/python/runpythonmodule.c b/third_party/python/runpythonmodule.c
index f4c41e38e..ba23daa13 100644
--- a/third_party/python/runpythonmodule.c
+++ b/third_party/python/runpythonmodule.c
@@ -16,7 +16,7 @@
 #include "libc/intrin/weaken.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/third_party/regex/BUILD.mk b/third_party/regex/BUILD.mk
index a9bbc59ef..79b93315f 100644
--- a/third_party/regex/BUILD.mk
+++ b/third_party/regex/BUILD.mk
@@ -20,8 +20,7 @@ THIRD_PARTY_REGEX_A_DIRECTDEPS =			\
 	LIBC_MEM					\
 	LIBC_NEXGEN32E					\
 	LIBC_RUNTIME					\
-	LIBC_STR					\
-	THIRD_PARTY_MUSL				\
+	LIBC_STR
 
 THIRD_PARTY_REGEX_A_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_REGEX_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/regex/regcomp.c b/third_party/regex/regcomp.c
index 0031fe059..f0fffdc78 100644
--- a/third_party/regex/regcomp.c
+++ b/third_party/regex/regcomp.c
@@ -56,16 +56,12 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <string.h>
-#include <stdlib.h>
-#include <regex.h>
-#include <limits.h>
-#include <stdint.h>
-#include <ctype.h>
+#include "libc/mem/alg.h"
+#include "libc/ctype.h"
+#include "third_party/regex/tre.inc"
 
-#include "tre.inc"
-
-#include <assert.h>
+#define CHARCLASS_NAME_MAX 14
+#define RE_DUP_MAX         255
 
 /***********************************************************************
  from tre-compile.h
@@ -82,37 +78,30 @@ typedef struct {
   int backref;
 } tre_pos_and_tags_t;
 
-
 /***********************************************************************
  from tre-ast.c and tre-ast.h
 ***********************************************************************/
 
 /* The different AST node types. */
-typedef enum {
-  LITERAL,
-  CATENATION,
-  ITERATION,
-  UNION
-} tre_ast_type_t;
+typedef enum { LITERAL, CATENATION, ITERATION, UNION } tre_ast_type_t;
 
 /* Special subtypes of TRE_LITERAL. */
-#define EMPTY	  -1   /* Empty leaf (denotes empty string). */
-#define ASSERTION -2   /* Assertion leaf. */
-#define TAG	  -3   /* Tag leaf. */
-#define BACKREF	  -4   /* Back reference leaf. */
+#define EMPTY     -1 /* Empty leaf (denotes empty string). */
+#define ASSERTION -2 /* Assertion leaf. */
+#define TAG       -3 /* Tag leaf. */
+#define BACKREF   -4 /* Back reference leaf. */
 
-#define IS_SPECIAL(x)	((x)->code_min < 0)
-#define IS_EMPTY(x)	((x)->code_min == EMPTY)
+#define IS_SPECIAL(x)   ((x)->code_min < 0)
+#define IS_EMPTY(x)     ((x)->code_min == EMPTY)
 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
-#define IS_TAG(x)	((x)->code_min == TAG)
-#define IS_BACKREF(x)	((x)->code_min == BACKREF)
-
+#define IS_TAG(x)       ((x)->code_min == TAG)
+#define IS_BACKREF(x)   ((x)->code_min == BACKREF)
 
 /* A generic AST node.  All AST nodes consist of this node on the top
    level with `obj' pointing to the actual content. */
 typedef struct {
-  tre_ast_type_t type;   /* Type of the node. */
-  void *obj;             /* Pointer to actual node. */
+  tre_ast_type_t type; /* Type of the node. */
+  void *obj;           /* Pointer to actual node. */
   int nullable;
   int submatch_id;
   int num_submatches;
@@ -121,7 +110,6 @@ typedef struct {
   tre_pos_and_tags_t *lastpos;
 } tre_ast_node_t;
 
-
 /* A "literal" node.  These are created for assertions, back references,
    tags, matching parameter settings, and all expressions that match one
    character. */
@@ -154,7 +142,7 @@ typedef struct {
   /* If 0, match as many characters as possible, if 1 match as few as
      possible.	Note that this does not always mean the same thing as
      matching as many/few repetitions as possible. */
-  unsigned int minimal:1;
+  unsigned int minimal : 1;
 } tre_iteration_t;
 
 /* An "union" node.  These are created for the "|" operator. */
@@ -163,91 +151,77 @@ typedef struct {
   tre_ast_node_t *right;
 } tre_union_t;
 
-
-static tre_ast_node_t *
-tre_ast_new_node(tre_mem_t mem, int type, void *obj)
-{
-	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
-	if (!node || !obj)
-		return 0;
-	node->obj = obj;
-	node->type = type;
-	node->nullable = -1;
-	node->submatch_id = -1;
-	return node;
+static tre_ast_node_t *tre_ast_new_node(tre_mem_t mem, int type, void *obj) {
+  tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
+  if (!node || !obj) return 0;
+  node->obj = obj;
+  node->type = type;
+  node->nullable = -1;
+  node->submatch_id = -1;
+  return node;
 }
 
-static tre_ast_node_t *
-tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
-{
-	tre_ast_node_t *node;
-	tre_literal_t *lit;
+static tre_ast_node_t *tre_ast_new_literal(tre_mem_t mem, int code_min,
+                                           int code_max, int position) {
+  tre_ast_node_t *node;
+  tre_literal_t *lit;
 
-	lit = tre_mem_calloc(mem, sizeof *lit);
-	node = tre_ast_new_node(mem, LITERAL, lit);
-	if (!node)
-		return 0;
-	lit->code_min = code_min;
-	lit->code_max = code_max;
-	lit->position = position;
-	return node;
+  lit = tre_mem_calloc(mem, sizeof *lit);
+  node = tre_ast_new_node(mem, LITERAL, lit);
+  if (!node) return 0;
+  lit->code_min = code_min;
+  lit->code_max = code_max;
+  lit->position = position;
+  return node;
 }
 
-static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
-{
-	tre_ast_node_t *node;
-	tre_iteration_t *iter;
+static tre_ast_node_t *tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg,
+                                        int min, int max, int minimal) {
+  tre_ast_node_t *node;
+  tre_iteration_t *iter;
 
-	iter = tre_mem_calloc(mem, sizeof *iter);
-	node = tre_ast_new_node(mem, ITERATION, iter);
-	if (!node)
-		return 0;
-	iter->arg = arg;
-	iter->min = min;
-	iter->max = max;
-	iter->minimal = minimal;
-	node->num_submatches = arg->num_submatches;
-	return node;
+  iter = tre_mem_calloc(mem, sizeof *iter);
+  node = tre_ast_new_node(mem, ITERATION, iter);
+  if (!node) return 0;
+  iter->arg = arg;
+  iter->min = min;
+  iter->max = max;
+  iter->minimal = minimal;
+  node->num_submatches = arg->num_submatches;
+  return node;
 }
 
-static tre_ast_node_t *
-tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
-{
-	tre_ast_node_t *node;
-	tre_union_t *un;
+static tre_ast_node_t *tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left,
+                                         tre_ast_node_t *right) {
+  tre_ast_node_t *node;
+  tre_union_t *un;
 
-	if (!left)
-		return right;
-	un = tre_mem_calloc(mem, sizeof *un);
-	node = tre_ast_new_node(mem, UNION, un);
-	if (!node || !right)
-		return 0;
-	un->left = left;
-	un->right = right;
-	node->num_submatches = left->num_submatches + right->num_submatches;
-	return node;
+  if (!left) return right;
+  un = tre_mem_calloc(mem, sizeof *un);
+  node = tre_ast_new_node(mem, UNION, un);
+  if (!node || !right) return 0;
+  un->left = left;
+  un->right = right;
+  node->num_submatches = left->num_submatches + right->num_submatches;
+  return node;
 }
 
-static tre_ast_node_t *
-tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
-{
-	tre_ast_node_t *node;
-	tre_catenation_t *cat;
+static tre_ast_node_t *tre_ast_new_catenation(tre_mem_t mem,
+                                              tre_ast_node_t *left,
+                                              tre_ast_node_t *right) {
+  tre_ast_node_t *node;
+  tre_catenation_t *cat;
 
-	if (!left)
-		return right;
-	cat = tre_mem_calloc(mem, sizeof *cat);
-	node = tre_ast_new_node(mem, CATENATION, cat);
-	if (!node)
-		return 0;
-	cat->left = left;
-	cat->right = right;
-	node->num_submatches = left->num_submatches + right->num_submatches;
-	return node;
+  if (!left) return right;
+  cat = tre_mem_calloc(mem, sizeof *cat);
+  node = tre_ast_new_node(mem, CATENATION, cat);
+  if (!node) return 0;
+  cat->left = left;
+  cat->right = right;
+  node->num_submatches = left->num_submatches + right->num_submatches;
+  return node;
 }
 
-
 /***********************************************************************
  from tre-stack.c and tre-stack.h
 ***********************************************************************/
@@ -258,23 +232,20 @@ typedef struct tre_stack_rec tre_stack_t;
    is maximum size, and `increment' specifies how much more space will be
    allocated with realloc() if all space gets used up.	Returns the stack
    object or NULL if out of memory. */
-static tre_stack_t *
-tre_stack_new(int size, int max_size, int increment);
+static tre_stack_t *tre_stack_new(int size, int max_size, int increment);
 
 /* Frees the stack object. */
-static void
-tre_stack_destroy(tre_stack_t *s);
+static void tre_stack_destroy(tre_stack_t *s);
 
 /* Returns the current number of objects in the stack. */
-static int
-tre_stack_num_objects(tre_stack_t *s);
+static int tre_stack_num_objects(tre_stack_t *s);
 
 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
    This tries to realloc() more space before failing if maximum size
    has not yet been reached.  Returns REG_OK if successful. */
-#define declare_pushf(typetag, type)					      \
-  static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
+#define declare_pushf(typetag, type) \
+  static reg_errcode_t tre_stack_push_##typetag(tre_stack_t *s, type value)
 
 declare_pushf(voidptr, void *);
 declare_pushf(int, int);
@@ -282,33 +253,29 @@ declare_pushf(int, int);
 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
    element off of stack `s' and returns it.  The stack must not be
    empty. */
-#define declare_popf(typetag, type)		  \
-  static type tre_stack_pop_ ## typetag(tre_stack_t *s)
+#define declare_popf(typetag, type) \
+  static type tre_stack_pop_##typetag(tre_stack_t *s)
 
 declare_popf(voidptr, void *);
 declare_popf(int, int);
 
 /* Just to save some typing. */
-#define STACK_PUSH(s, typetag, value)					      \
-  do									      \
-    {									      \
-      status = tre_stack_push_ ## typetag(s, value);			      \
-    }									      \
-  while (/*CONSTCOND*/0)
+#define STACK_PUSH(s, typetag, value)            \
+  do {                                           \
+    status = tre_stack_push_##typetag(s, value); \
+  } while (/*CONSTCOND*/ 0)
 
-#define STACK_PUSHX(s, typetag, value)					      \
-  {									      \
-    status = tre_stack_push_ ## typetag(s, value);			      \
-    if (status != REG_OK)						      \
-      break;								      \
+#define STACK_PUSHX(s, typetag, value)           \
+  {                                              \
+    status = tre_stack_push_##typetag(s, value); \
+    if (status != REG_OK) break;                 \
   }
 
-#define STACK_PUSHR(s, typetag, value)					      \
-  {									      \
-    reg_errcode_t _status;						      \
-    _status = tre_stack_push_ ## typetag(s, value);			      \
-    if (_status != REG_OK)						      \
-      return _status;							      \
+#define STACK_PUSHR(s, typetag, value)            \
+  {                                               \
+    reg_errcode_t _status;                        \
+    _status = tre_stack_push_##typetag(s, value); \
+    if (_status != REG_OK) return _status;        \
   }
 
 union tre_stack_item {
@@ -324,215 +291,193 @@ struct tre_stack_rec {
   union tre_stack_item *stack;
 };
 
-
-static tre_stack_t *
-tre_stack_new(int size, int max_size, int increment)
-{
+static tre_stack_t *tre_stack_new(int size, int max_size, int increment) {
   tre_stack_t *s;
 
-  s = xmalloc(sizeof(*s));
-  if (s != NULL)
-    {
-      s->stack = xmalloc(sizeof(*s->stack) * size);
-      if (s->stack == NULL)
-	{
-	  xfree(s);
-	  return NULL;
-	}
-      s->size = size;
-      s->max_size = max_size;
-      s->increment = increment;
-      s->ptr = 0;
+  s = malloc(sizeof(*s));
+  if (s != NULL) {
+    s->stack = malloc(sizeof(*s->stack) * size);
+    if (s->stack == NULL) {
+      free(s), s = NULL;
+      return NULL;
     }
+    s->size = size;
+    s->max_size = max_size;
+    s->increment = increment;
+    s->ptr = 0;
+  }
   return s;
 }
 
-static void
-tre_stack_destroy(tre_stack_t *s)
-{
-  xfree(s->stack);
-  xfree(s);
+static void tre_stack_destroy(tre_stack_t *s) {
+  free(s->stack), s->stack = NULL;
+  free(s), s = NULL;
 }
 
-static int
-tre_stack_num_objects(tre_stack_t *s)
-{
+static int tre_stack_num_objects(tre_stack_t *s) {
   return s->ptr;
 }
 
-static reg_errcode_t
-tre_stack_push(tre_stack_t *s, union tre_stack_item value)
-{
-  if (s->ptr < s->size)
-    {
-      s->stack[s->ptr] = value;
-      s->ptr++;
-    }
-  else
-    {
-      if (s->size >= s->max_size)
-	{
-	  return REG_ESPACE;
-	}
-      else
-	{
-	  union tre_stack_item *new_buffer;
-	  int new_size;
-	  new_size = s->size + s->increment;
-	  if (new_size > s->max_size)
-	    new_size = s->max_size;
-	  new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
-	  if (new_buffer == NULL)
-	    {
-	      return REG_ESPACE;
-	    }
-	  assert(new_size > s->size);
-	  s->size = new_size;
-	  s->stack = new_buffer;
-	  tre_stack_push(s, value);
-	}
+static reg_errcode_t tre_stack_push(tre_stack_t *s,
+                                    union tre_stack_item value) {
+  if (s->ptr < s->size) {
+    s->stack[s->ptr] = value;
+    s->ptr++;
+  } else {
+    if (s->size >= s->max_size) {
+      return REG_ESPACE;
+    } else {
+      union tre_stack_item *new_buffer;
+      int new_size;
+      new_size = s->size + s->increment;
+      if (new_size > s->max_size) new_size = s->max_size;
+      new_buffer = realloc(s->stack, sizeof(*new_buffer) * new_size);
+      if (new_buffer == NULL) {
+        return REG_ESPACE;
+      }
+      unassert(new_size > s->size);
+      s->size = new_size;
+      s->stack = new_buffer;
+      tre_stack_push(s, value);
     }
+  }
   return REG_OK;
 }
 
-#define define_pushf(typetag, type)  \
-  declare_pushf(typetag, type) {     \
-    union tre_stack_item item;	     \
-    item.typetag ## _value = value;  \
-    return tre_stack_push(s, item);  \
-}
-
-define_pushf(int, int)
-define_pushf(voidptr, void *)
-
-#define define_popf(typetag, type)		    \
-  declare_popf(typetag, type) {			    \
-    return s->stack[--s->ptr].typetag ## _value;    \
+#define define_pushf(typetag, type) \
+  declare_pushf(typetag, type) {    \
+    union tre_stack_item item;      \
+    item.typetag##_value = value;   \
+    return tre_stack_push(s, item); \
   }
 
-define_popf(int, int)
-define_popf(voidptr, void *)
+define_pushf(int, int) define_pushf(voidptr, void *)
+#define define_popf(typetag, type)             \
+  declare_popf(typetag, type) {                \
+    return s->stack[--s->ptr].typetag##_value; \
+  }
 
+    define_popf(int, int) define_popf(voidptr, void *)
 
-/***********************************************************************
- from tre-parse.c and tre-parse.h
-***********************************************************************/
+    /***********************************************************************
+     from tre-parse.c and tre-parse.h
+    ***********************************************************************/
 
-/* Parse context. */
-typedef struct {
-	/* Memory allocator. The AST is allocated using this. */
-	tre_mem_t mem;
-	/* Stack used for keeping track of regexp syntax. */
-	tre_stack_t *stack;
-	/* The parsed node after a parse function returns. */
-	tre_ast_node_t *n;
-	/* Position in the regexp pattern after a parse function returns. */
-	const char *s;
-	/* The first character of the last subexpression parsed. */
-	const char *start;
-	/* Current submatch ID. */
-	int submatch_id;
-	/* Current position (number of literal). */
-	int position;
-	/* The highest back reference or -1 if none seen so far. */
-	int max_backref;
-	/* Compilation flags. */
-	int cflags;
+    /* Parse context. */
+    typedef struct {
+  /* Memory allocator. The AST is allocated using this. */
+  tre_mem_t mem;
+  /* Stack used for keeping track of regexp syntax. */
+  tre_stack_t *stack;
+  /* The parsed node after a parse function returns. */
+  tre_ast_node_t *n;
+  /* Position in the regexp pattern after a parse function returns. */
+  const char *s;
+  /* The first character of the last subexpression parsed. */
+  const char *start;
+  /* Current submatch ID. */
+  int submatch_id;
+  /* Current position (number of literal). */
+  int position;
+  /* The highest back reference or -1 if none seen so far. */
+  int max_backref;
+  /* Compilation flags. */
+  int cflags;
 } tre_parse_ctx_t;
 
 /* Some macros for expanding \w, \s, etc. */
 static const struct {
-	char c;
-	const char *expansion;
+  char c;
+  const char *expansion;
 } tre_macros[] = {
-	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
-	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
-	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
-	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
-	{ 0, 0 }
+    {'t', "\t"},
+    {'n', "\n"},
+    {'r', "\r"},
+    {'f', "\f"},
+    {'a', "\a"},
+    {'e', "\033"},
+    {'w', "[[:alnum:]_]"},
+    {'W', "[^[:alnum:]_]"},
+    {'s', "[[:space:]]"},
+    {'S', "[^[:space:]]"},
+    {'d', "[[:digit:]]"},
+    {'D', "[^[:digit:]]"},
+    {0, 0},
 };
 
 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
    must have at least `len' items.  Sets buf[0] to zero if the there
    is no match in `tre_macros'. */
-static const char *tre_expand_macro(const char *s)
-{
-	int i;
-	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
-	return tre_macros[i].expansion;
+static const char *tre_expand_macro(const char *s) {
+  int i;
+  for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++)
+    ;
+  return tre_macros[i].expansion;
 }
 
-static int
-tre_compare_lit(const void *a, const void *b)
-{
-	const tre_literal_t *const *la = a;
-	const tre_literal_t *const *lb = b;
-	/* assumes the range of valid code_min is < INT_MAX */
-	return la[0]->code_min - lb[0]->code_min;
+static int tre_compare_lit(const void *a, const void *b) {
+  const tre_literal_t *const *la = a;
+  const tre_literal_t *const *lb = b;
+  /* assumes the range of valid code_min is < INT_MAX */
+  return la[0]->code_min - lb[0]->code_min;
 }
 
 struct literals {
-	tre_mem_t mem;
-	tre_literal_t **a;
-	int len;
-	int cap;
+  tre_mem_t mem;
+  tre_literal_t **a;
+  int len;
+  int cap;
 };
 
-static tre_literal_t *tre_new_lit(struct literals *p)
-{
-	tre_literal_t **a;
-	if (p->len >= p->cap) {
-		if (p->cap >= 1<<15)
-			return 0;
-		p->cap *= 2;
-		a = xrealloc(p->a, p->cap * sizeof *p->a);
-		if (!a)
-			return 0;
-		p->a = a;
-	}
-	a = p->a + p->len++;
-	*a = tre_mem_calloc(p->mem, sizeof **a);
-	return *a;
+static tre_literal_t *tre_new_lit(struct literals *p) {
+  tre_literal_t **a;
+  if (p->len >= p->cap) {
+    if (p->cap >= 1 << 15) return 0;
+    p->cap *= 2;
+    a = realloc(p->a, p->cap * sizeof *p->a);
+    if (!a) return 0;
+    p->a = a;
+  }
+  a = p->a + p->len++;
+  *a = tre_mem_calloc(p->mem, sizeof **a);
+  return *a;
 }
 
-static int add_icase_literals(struct literals *ls, int min, int max)
-{
-	tre_literal_t *lit;
-	int b, e, c;
-	for (c=min; c<=max; ) {
-		/* assumes islower(c) and isupper(c) are exclusive
-		   and toupper(c)!=c if islower(c).
-		   multiple opposite case characters are not supported */
-		if (tre_islower(c)) {
-			b = e = tre_toupper(c);
-			for (c++, e++; c<=max; c++, e++)
-				if (tre_toupper(c) != e) break;
-		} else if (tre_isupper(c)) {
-			b = e = tre_tolower(c);
-			for (c++, e++; c<=max; c++, e++)
-				if (tre_tolower(c) != e) break;
-		} else {
-			c++;
-			continue;
-		}
-		lit = tre_new_lit(ls);
-		if (!lit)
-			return -1;
-		lit->code_min = b;
-		lit->code_max = e-1;
-		lit->position = -1;
-	}
-	return 0;
+static int add_icase_literals(struct literals *ls, int min, int max) {
+  tre_literal_t *lit;
+  int b, e, c;
+  for (c = min; c <= max;) {
+    /* assumes islower(c) and isupper(c) are exclusive
+       and toupper(c)!=c if islower(c).
+       multiple opposite case characters are not supported */
+    if (tre_islower(c)) {
+      b = e = tre_toupper(c);
+      for (c++, e++; c <= max; c++, e++)
+        if (tre_toupper(c) != e) break;
+    } else if (tre_isupper(c)) {
+      b = e = tre_tolower(c);
+      for (c++, e++; c <= max; c++, e++)
+        if (tre_tolower(c) != e) break;
+    } else {
+      c++;
+      continue;
+    }
+    lit = tre_new_lit(ls);
+    if (!lit) return -1;
+    lit->code_min = b;
+    lit->code_max = e - 1;
+    lit->position = -1;
+  }
+  return 0;
 }
 
-
 /* Maximum number of character classes in a negated bracket expression. */
 #define MAX_NEG_CLASSES 64
 
 struct neg {
-	int negate;
-	int len;
-	tre_ctype_t a[MAX_NEG_CLASSES];
+  int negate;
+  int len;
+  tre_ctype_t a[MAX_NEG_CLASSES];
 };
 
 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
@@ -555,565 +500,525 @@ coll_single is a single char collating element but it can be
  '^' anywhere except after the openning '['
 */
 
-static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
-{
-	const char *start = s;
-	tre_ctype_t class;
-	int min, max;
-	wchar_t wc;
-	int len;
+static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s,
+                                         struct literals *ls, struct neg *neg) {
+  const char *start = s;
+  tre_ctype_t class;
+  int min, max;
+  wchar_t wc;
+  int len;
 
-	for (;;) {
-		class = 0;
-		len = mbtowc(&wc, s, -1);
-		if (len <= 0)
-			return *s ? REG_BADPAT : REG_EBRACK;
-		if (*s == ']' && s != start) {
-			ctx->s = s+1;
-			return REG_OK;
-		}
-		if (*s == '-' && s != start && s[1] != ']' &&
-		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
-		    (s[1] != '-' || s[2] == ']'))
-			return REG_ERANGE;
-		if (*s == '[' && (s[1] == '.' || s[1] == '='))
-			/* collating symbols and equivalence classes are not supported */
-			return REG_ECOLLATE;
-		if (*s == '[' && s[1] == ':') {
-			char tmp[CHARCLASS_NAME_MAX+1];
-			s += 2;
-			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
-				if (s[len] == ':') {
-					memcpy(tmp, s, len);
-					tmp[len] = 0;
-					class = tre_ctype(tmp);
-					break;
-				}
-			}
-			if (!class || s[len+1] != ']')
-				return REG_ECTYPE;
-			min = 0;
-			max = TRE_CHAR_MAX;
-			s += len+2;
-		} else {
-			min = max = wc;
-			s += len;
-			if (*s == '-' && s[1] != ']') {
-				s++;
-				len = mbtowc(&wc, s, -1);
-				max = wc;
-				/* XXX - Should use collation order instead of
-				   encoding values in character ranges. */
-				if (len <= 0 || min > max)
-					return REG_ERANGE;
-				s += len;
-			}
-		}
+  for (;;) {
+    class = 0;
+    len = mbtowc(&wc, s, -1);
+    if (len <= 0) return *s ? REG_BADPAT : REG_EBRACK;
+    if (*s == ']' && s != start) {
+      ctx->s = s + 1;
+      return REG_OK;
+    }
+    if (*s == '-' && s != start && s[1] != ']' &&
+        /* extension: [a-z--@] is accepted as [a-z]|[--@] */
+        (s[1] != '-' || s[2] == ']')) {
+      return REG_ERANGE;
+    }
+    if (*s == '[' && (s[1] == '.' || s[1] == '=')) {
+      /* collating symbols and equivalence classes are not supported */
+      return REG_ECOLLATE;
+    }
+    if (*s == '[' && s[1] == ':') {
+      char tmp[CHARCLASS_NAME_MAX + 1];
+      s += 2;
+      for (len = 0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
+        if (s[len] == ':') {
+          memcpy(tmp, s, len);
+          tmp[len] = 0;
+          class = tre_ctype(tmp);
+          break;
+        }
+      }
+      if (!class || s[len + 1] != ']') return REG_ECTYPE;
+      min = 0;
+      max = TRE_CHAR_MAX;
+      s += len + 2;
+    } else {
+      min = max = wc;
+      s += len;
+      if (*s == '-' && s[1] != ']') {
+        s++;
+        len = mbtowc(&wc, s, -1);
+        max = wc;
+        /* XXX - Should use collation order instead of
+           encoding values in character ranges. */
+        if (len <= 0 || min > max) {
+          return REG_ERANGE;
+        }
+        s += len;
+      }
+    }
 
-		if (class && neg->negate) {
-			if (neg->len >= MAX_NEG_CLASSES)
-				return REG_ESPACE;
-			neg->a[neg->len++] = class;
-		} else  {
-			tre_literal_t *lit = tre_new_lit(ls);
-			if (!lit)
-				return REG_ESPACE;
-			lit->code_min = min;
-			lit->code_max = max;
-			lit->class = class;
-			lit->position = -1;
+    if (class && neg->negate) {
+      if (neg->len >= MAX_NEG_CLASSES) return REG_ESPACE;
+      neg->a[neg->len++] = class;
+    } else {
+      tre_literal_t *lit = tre_new_lit(ls);
+      if (!lit) return REG_ESPACE;
+      lit->code_min = min;
+      lit->code_max = max;
+      lit->class = class;
+      lit->position = -1;
 
-			/* Add opposite-case codepoints if REG_ICASE is present.
-			   It seems that POSIX requires that bracket negation
-			   should happen before case-folding, but most practical
-			   implementations do it the other way around. Changing
-			   the order would need efficient representation of
-			   case-fold ranges and bracket range sets even with
-			   simple patterns so this is ok for now. */
-			if (ctx->cflags & REG_ICASE && !class)
-				if (add_icase_literals(ls, min, max))
-					return REG_ESPACE;
-		}
-	}
+      /* Add opposite-case codepoints if REG_ICASE is present.
+         It seems that POSIX requires that bracket negation
+         should happen before case-folding, but most practical
+         implementations do it the other way around. Changing
+         the order would need efficient representation of
+         case-fold ranges and bracket range sets even with
+         simple patterns so this is ok for now. */
+      if (ctx->cflags & REG_ICASE && !class)
+        if (add_icase_literals(ls, min, max)) return REG_ESPACE;
+    }
+  }
 }
 
-static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
-{
-	int i, max, min, negmax, negmin;
-	tre_ast_node_t *node = 0, *n;
-	tre_ctype_t *nc = 0;
-	tre_literal_t *lit;
-	struct literals ls;
-	struct neg neg;
-	reg_errcode_t err;
+static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s) {
+  int i, max, min, negmax, negmin;
+  tre_ast_node_t *node = 0, *n;
+  tre_ctype_t *nc = 0;
+  tre_literal_t *lit;
+  struct literals ls;
+  struct neg neg;
+  reg_errcode_t err;
 
-	ls.mem = ctx->mem;
-	ls.len = 0;
-	ls.cap = 32;
-	ls.a = xmalloc(ls.cap * sizeof *ls.a);
-	if (!ls.a)
-		return REG_ESPACE;
-	neg.len = 0;
-	neg.negate = *s == '^';
-	if (neg.negate)
-		s++;
+  ls.mem = ctx->mem;
+  ls.len = 0;
+  ls.cap = 32;
+  ls.a = malloc(ls.cap * sizeof *ls.a);
+  if (!ls.a) return REG_ESPACE;
+  neg.len = 0;
+  neg.negate = *s == '^';
+  if (neg.negate) s++;
 
-	err = parse_bracket_terms(ctx, s, &ls, &neg);
-	if (err != REG_OK)
-		goto parse_bracket_done;
+  err = parse_bracket_terms(ctx, s, &ls, &neg);
+  if (err != REG_OK) goto parse_bracket_done;
 
-	if (neg.negate) {
-		/*
-		 * With REG_NEWLINE, POSIX requires that newlines are not matched by
-		 * any form of a non-matching list.
-		 */
-		if (ctx->cflags & REG_NEWLINE) {
-			lit = tre_new_lit(&ls);
-			if (!lit) {
-				err = REG_ESPACE;
-				goto parse_bracket_done;
-			}
-			lit->code_min = '\n';
-			lit->code_max = '\n';
-			lit->position = -1;
-		}
-		/* Sort the array if we need to negate it. */
-		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
-		/* extra lit for the last negated range */
-		lit = tre_new_lit(&ls);
-		if (!lit) {
-			err = REG_ESPACE;
-			goto parse_bracket_done;
-		}
-		lit->code_min = TRE_CHAR_MAX+1;
-		lit->code_max = TRE_CHAR_MAX+1;
-		lit->position = -1;
-		/* negated classes */
-		if (neg.len) {
-			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
-			if (!nc) {
-				err = REG_ESPACE;
-				goto parse_bracket_done;
-			}
-			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
-			nc[neg.len] = 0;
-		}
-	}
+  if (neg.negate) {
+    /*
+     * With REG_NEWLINE, POSIX requires that newlines are not matched by
+     * any form of a non-matching list.
+     */
+    if (ctx->cflags & REG_NEWLINE) {
+      lit = tre_new_lit(&ls);
+      if (!lit) {
+        err = REG_ESPACE;
+        goto parse_bracket_done;
+      }
+      lit->code_min = '\n';
+      lit->code_max = '\n';
+      lit->position = -1;
+    }
+    /* Sort the array if we need to negate it. */
+    qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
+    /* extra lit for the last negated range */
+    lit = tre_new_lit(&ls);
+    if (!lit) {
+      err = REG_ESPACE;
+      goto parse_bracket_done;
+    }
+    lit->code_min = TRE_CHAR_MAX + 1;
+    lit->code_max = TRE_CHAR_MAX + 1;
+    lit->position = -1;
+    /* negated classes */
+    if (neg.len) {
+      nc = tre_mem_alloc(ctx->mem, (neg.len + 1) * sizeof *neg.a);
+      if (!nc) {
+        err = REG_ESPACE;
+        goto parse_bracket_done;
+      }
+      memcpy(nc, neg.a, neg.len * sizeof *neg.a);
+      nc[neg.len] = 0;
+    }
+  }
 
-	/* Build a union of the items in the array, negated if necessary. */
-	negmax = negmin = 0;
-	for (i = 0; i < ls.len; i++) {
-		lit = ls.a[i];
-		min = lit->code_min;
-		max = lit->code_max;
-		if (neg.negate) {
-			if (min <= negmin) {
-				/* Overlap. */
-				negmin = MAX(max + 1, negmin);
-				continue;
-			}
-			negmax = min - 1;
-			lit->code_min = negmin;
-			lit->code_max = negmax;
-			negmin = max + 1;
-		}
-		lit->position = ctx->position;
-		lit->neg_classes = nc;
-		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
-		node = tre_ast_new_union(ctx->mem, node, n);
-		if (!node) {
-			err = REG_ESPACE;
-			break;
-		}
-	}
+  /* Build a union of the items in the array, negated if necessary. */
+  negmax = negmin = 0;
+  for (i = 0; i < ls.len; i++) {
+    lit = ls.a[i];
+    min = lit->code_min;
+    max = lit->code_max;
+    if (neg.negate) {
+      if (min <= negmin) {
+        /* Overlap. */
+        negmin = MAX(max + 1, negmin);
+        continue;
+      }
+      negmax = min - 1;
+      lit->code_min = negmin;
+      lit->code_max = negmax;
+      negmin = max + 1;
+    }
+    lit->position = ctx->position;
+    lit->neg_classes = nc;
+    n = tre_ast_new_node(ctx->mem, LITERAL, lit);
+    node = tre_ast_new_union(ctx->mem, node, n);
+    if (!node) {
+      err = REG_ESPACE;
+      break;
+    }
+  }
 
 parse_bracket_done:
-	xfree(ls.a);
-	ctx->position++;
-	ctx->n = node;
-	return err;
+  free(ls.a), ls.a = NULL;
+  ctx->position++;
+  ctx->n = node;
+  return err;
 }
 
-static const char *parse_dup_count(const char *s, int *n)
-{
-	*n = -1;
-	if (!isdigit(*s))
-		return s;
-	*n = 0;
-	for (;;) {
-		*n = 10 * *n + (*s - '0');
-		s++;
-		if (!isdigit(*s) || *n > RE_DUP_MAX)
-			break;
-	}
-	return s;
+static const char *parse_dup_count(const char *s, int *n) {
+  *n = -1;
+  if (!isdigit(*s)) return s;
+  *n = 0;
+  for (;;) {
+    *n = 10 * *n + (*s - '0');
+    s++;
+    if (!isdigit(*s) || *n > RE_DUP_MAX) break;
+  }
+  return s;
 }
 
-static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
-{
-	int min, max;
+static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax) {
+  int min, max;
 
-	s = parse_dup_count(s, &min);
-	if (*s == ',')
-		s = parse_dup_count(s+1, &max);
-	else
-		max = min;
+  s = parse_dup_count(s, &min);
+  if (*s == ',')
+    s = parse_dup_count(s + 1, &max);
+  else
+    max = min;
 
-	if (
-		(max < min && max >= 0) ||
-		max > RE_DUP_MAX ||
-		min > RE_DUP_MAX ||
-		min < 0 ||
-		(!ere && *s++ != '\\') ||
-		*s++ != '}'
-	)
-		return 0;
-	*pmin = min;
-	*pmax = max;
-	return s;
+  if ((max < min && max >= 0) || max > RE_DUP_MAX || min > RE_DUP_MAX ||
+      min < 0 || (!ere && *s++ != '\\') || *s++ != '}')
+    return 0;
+  *pmin = min;
+  *pmax = max;
+  return s;
 }
 
-static int hexval(unsigned c)
-{
-	if (c-'0'<10) return c-'0';
-	c |= 32;
-	if (c-'a'<6) return c-'a'+10;
-	return -1;
+static int hexval(unsigned c) {
+  if (c - '0' < 10) return c - '0';
+  c |= 32;
+  if (c - 'a' < 6) return c - 'a' + 10;
+  return -1;
 }
 
-static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
-{
-	if (node->submatch_id >= 0) {
-		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-		if (!n)
-			return REG_ESPACE;
-		n = tre_ast_new_catenation(ctx->mem, n, node);
-		if (!n)
-			return REG_ESPACE;
-		n->num_submatches = node->num_submatches;
-		node = n;
-	}
-	node->submatch_id = subid;
-	node->num_submatches++;
-	ctx->n = node;
-	return REG_OK;
+static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node,
+                             int subid) {
+  if (node->submatch_id >= 0) {
+    tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+    if (!n) return REG_ESPACE;
+    n = tre_ast_new_catenation(ctx->mem, n, node);
+    if (!n) return REG_ESPACE;
+    n->num_submatches = node->num_submatches;
+    node = n;
+  }
+  node->submatch_id = subid;
+  node->num_submatches++;
+  ctx->n = node;
+  return REG_OK;
 }
 
 /*
 BRE grammar:
-Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
-Branch =  Atom  |  Branch Atom
-Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
-Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
+Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^'
+Branch '$' Branch =  Atom  |  Branch Atom Atom   =  char  |  quoted_char  |  '.'
+|  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref Dup    =  '*'  |  '\{'
+Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 
 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 
 ERE grammar:
 Regex  =  Branch  |  Regex '|' Branch
 Branch =  Atom  |  Branch Atom
-Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
-Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
+Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex
+')'  |  '^'  |  '$' Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{'
+Count ',}'  |  '{' Count ',' Count '}'
 
 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
 */
 
-static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
-{
-	int len, ere = ctx->cflags & REG_EXTENDED;
-	const char *p;
-	tre_ast_node_t *node;
-	wchar_t wc;
-	switch (*s) {
-	case '[':
-		return parse_bracket(ctx, s+1);
-	case '\\':
-		p = tre_expand_macro(s+1);
-		if (p) {
-			/* assume \X expansion is a single atom */
-			reg_errcode_t err = parse_atom(ctx, p);
-			ctx->s = s+2;
-			return err;
-		}
-		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
-		switch (*++s) {
-		case 0:
-			return REG_EESCAPE;
-		case 'b':
-			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
-			break;
-		case 'B':
-			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
-			break;
-		case '<':
-			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
-			break;
-		case '>':
-			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
-			break;
-		case 'x':
-			s++;
-			int i, v = 0, c;
-			len = 2;
-			if (*s == '{') {
-				len = 8;
-				s++;
-			}
-			for (i=0; i<len && v<0x110000; i++) {
-				c = hexval(s[i]);
-				if (c < 0) break;
-				v = 16*v + c;
-			}
-			s += i;
-			if (len == 8) {
-				if (*s != '}')
-					return REG_EBRACE;
-				s++;
-			}
-			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
-			s--;
-			break;
-		case '{':
-		case '+':
-		case '?':
-			/* extension: treat \+, \? as repetitions in BRE */
-			/* reject repetitions after empty expression in BRE */
-			if (!ere)
-				return REG_BADRPT;
-		case '|':
-			/* extension: treat \| as alternation in BRE */
-			if (!ere) {
-				node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-				s--;
-				goto end;
-			}
-			/* fallthrough */
-		default:
-			if (!ere && (unsigned)*s-'1' < 9) {
-				/* back reference */
-				int val = *s - '0';
-				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
-				ctx->max_backref = MAX(val, ctx->max_backref);
-			} else {
-				/* extension: accept unknown escaped char
-				   as a literal */
-				goto parse_literal;
-			}
-		}
-		s++;
-		break;
-	case '.':
-		if (ctx->cflags & REG_NEWLINE) {
-			tre_ast_node_t *tmp1, *tmp2;
-			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
-			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
-			if (tmp1 && tmp2)
-				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-			else
-				node = 0;
-		} else {
-			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
-		}
-		s++;
-		break;
-	case '^':
-		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
-		if (!ere && s != ctx->start)
-			goto parse_literal;
-		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
-		s++;
-		break;
-	case '$':
-		/* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
-		if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
-			goto parse_literal;
-		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
-		s++;
-		break;
-	case '*':
-	case '{':
-	case '+':
-	case '?':
-		/* reject repetitions after empty expression in ERE */
-		if (ere)
-			return REG_BADRPT;
-	case '|':
-		if (!ere)
-			goto parse_literal;
-	case 0:
-		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-		break;
-	default:
-parse_literal:
-		len = mbtowc(&wc, s, -1);
-		if (len < 0)
-			return REG_BADPAT;
-		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
-			tre_ast_node_t *tmp1, *tmp2;
-			/* multiple opposite case characters are not supported */
-			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
-			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
-			if (tmp1 && tmp2)
-				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-			else
-				node = 0;
-		} else {
-			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
-		}
-		ctx->position++;
-		s += len;
-		break;
-	}
+static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s) {
+  int len, ere = ctx->cflags & REG_EXTENDED;
+  const char *p;
+  tre_ast_node_t *node;
+  wchar_t wc;
+  switch (*s) {
+    case '[':
+      return parse_bracket(ctx, s + 1);
+    case '\\':
+      p = tre_expand_macro(s + 1);
+      if (p) {
+        /* assume \X expansion is a single atom */
+        reg_errcode_t err = parse_atom(ctx, p);
+        ctx->s = s + 2;
+        return err;
+      }
+      /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
+      switch (*++s) {
+        case 0:
+          return REG_EESCAPE;
+        case 'b':
+          node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
+          break;
+        case 'B':
+          node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
+          break;
+        case '<':
+          node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
+          break;
+        case '>':
+          node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
+          break;
+        case 'x':
+          s++;
+          int i, v = 0, c;
+          len = 2;
+          if (*s == '{') {
+            len = 8;
+            s++;
+          }
+          for (i = 0; i < len && v < 0x110000; i++) {
+            c = hexval(s[i]);
+            if (c < 0) break;
+            v = 16 * v + c;
+          }
+          s += i;
+          if (len == 8) {
+            if (*s != '}') return REG_EBRACE;
+            s++;
+          }
+          node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
+          s--;
+          break;
+        case '{':
+        case '+':
+        case '?':
+          /* extension: treat \+, \? as repetitions in BRE */
+          /* reject repetitions after empty expression in BRE */
+          if (!ere) return REG_BADRPT;
+          /* fallthrough */
+        case '|':
+          /* extension: treat \| as alternation in BRE */
+          if (!ere) {
+            node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+            s--;
+            goto end;
+          }
+          /* fallthrough */
+        default:
+          if (!ere && (unsigned)*s - '1' < 9) {
+            /* back reference */
+            int val = *s - '0';
+            node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
+            ctx->max_backref = MAX(val, ctx->max_backref);
+          } else {
+            /* extension: accept unknown escaped char
+               as a literal */
+            goto parse_literal;
+          }
+      }
+      s++;
+      break;
+    case '.':
+      if (ctx->cflags & REG_NEWLINE) {
+        tre_ast_node_t *tmp1, *tmp2;
+        tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1, ctx->position++);
+        tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
+                                   ctx->position++);
+        if (tmp1 && tmp2)
+          node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+        else
+          node = 0;
+      } else {
+        node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
+      }
+      s++;
+      break;
+    case '^':
+      /* '^' has a special meaning everywhere in EREs, and at beginning of BRE.
+       */
+      if (!ere && s != ctx->start) goto parse_literal;
+      node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
+      s++;
+      break;
+    case '$':
+      /* '$' is special everywhere in EREs, and at the end of a BRE
+       * subexpression. */
+      if (!ere && s[1] && (s[1] != '\\' || (s[2] != ')' && s[2] != '|')))
+        goto parse_literal;
+      node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
+      s++;
+      break;
+    case '*':
+    case '{':
+    case '+':
+    case '?':
+      /* reject repetitions after empty expression in ERE */
+      if (ere) return REG_BADRPT;
+      /* fallthrough */
+    case '|':
+      if (!ere) goto parse_literal;
+      /* fallthrough */
+    case 0:
+      node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+      break;
+    default:
+    parse_literal:
+      len = mbtowc(&wc, s, -1);
+      if (len < 0) return REG_BADPAT;
+      if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
+        tre_ast_node_t *tmp1, *tmp2;
+        /* multiple opposite case characters are not supported */
+        tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc),
+                                   ctx->position);
+        tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc),
+                                   ctx->position);
+        if (tmp1 && tmp2)
+          node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+        else
+          node = 0;
+      } else {
+        node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
+      }
+      ctx->position++;
+      s += len;
+      break;
+  }
 end:
-	if (!node)
-		return REG_ESPACE;
-	ctx->n = node;
-	ctx->s = s;
-	return REG_OK;
+  if (!node) return REG_ESPACE;
+  ctx->n = node;
+  ctx->s = s;
+  return REG_OK;
 }
 
-#define PUSHPTR(err, s, v) do { \
-	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
-		return err; \
-} while(0)
+#define PUSHPTR(err, s, v)                                          \
+  do {                                                              \
+    if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) return err; \
+  } while (0)
 
-#define PUSHINT(err, s, v) do { \
-	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
-		return err; \
-} while(0)
+#define PUSHINT(err, s, v)                                      \
+  do {                                                          \
+    if ((err = tre_stack_push_int(s, v)) != REG_OK) return err; \
+  } while (0)
 
-static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
-{
-	tre_ast_node_t *nbranch=0, *nunion=0;
-	int ere = ctx->cflags & REG_EXTENDED;
-	const char *s = ctx->start;
-	int subid = 0;
-	int depth = 0;
-	reg_errcode_t err;
-	tre_stack_t *stack = ctx->stack;
+static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) {
+  tre_ast_node_t *nbranch = 0, *nunion = 0;
+  int ere = ctx->cflags & REG_EXTENDED;
+  const char *s = ctx->start;
+  int subid = 0;
+  int depth = 0;
+  reg_errcode_t err;
+  tre_stack_t *stack = ctx->stack;
 
-	PUSHINT(err, stack, subid++);
-	for (;;) {
-		if ((!ere && *s == '\\' && s[1] == '(') ||
-		    (ere && *s == '(')) {
-			PUSHPTR(err, stack, nunion);
-			PUSHPTR(err, stack, nbranch);
-			PUSHINT(err, stack, subid++);
-			s++;
-			if (!ere)
-				s++;
-			depth++;
-			nbranch = nunion = 0;
-			ctx->start = s;
-			continue;
-		}
-		if ((!ere && *s == '\\' && s[1] == ')') ||
-		    (ere && *s == ')' && depth)) {
-			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-			if (!ctx->n)
-				return REG_ESPACE;
-		} else {
-			err = parse_atom(ctx, s);
-			if (err != REG_OK)
-				return err;
-			s = ctx->s;
-		}
+  PUSHINT(err, stack, subid++);
+  for (;;) {
+    if ((!ere && *s == '\\' && s[1] == '(') || (ere && *s == '(')) {
+      PUSHPTR(err, stack, nunion);
+      PUSHPTR(err, stack, nbranch);
+      PUSHINT(err, stack, subid++);
+      s++;
+      if (!ere) s++;
+      depth++;
+      nbranch = nunion = 0;
+      ctx->start = s;
+      continue;
+    }
+    if ((!ere && *s == '\\' && s[1] == ')') || (ere && *s == ')' && depth)) {
+      ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+      if (!ctx->n) return REG_ESPACE;
+    } else {
+      err = parse_atom(ctx, s);
+      if (err != REG_OK) return err;
+      s = ctx->s;
+    }
 
-	parse_iter:
-		for (;;) {
-			int min, max;
+  parse_iter:
+    for (;;) {
+      int min, max;
 
-			if (*s!='\\' && *s!='*') {
-				if (!ere)
-					break;
-				if (*s!='+' && *s!='?' && *s!='{')
-					break;
-			}
-			if (*s=='\\' && ere)
-				break;
-			/* extension: treat \+, \? as repetitions in BRE */
-			if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
-				break;
-			if (*s=='\\')
-				s++;
+      if (*s != '\\' && *s != '*') {
+        if (!ere) break;
+        if (*s != '+' && *s != '?' && *s != '{') break;
+      }
+      if (*s == '\\' && ere) break;
+      /* extension: treat \+, \? as repetitions in BRE */
+      if (*s == '\\' && s[1] != '+' && s[1] != '?' && s[1] != '{') break;
+      if (*s == '\\') s++;
 
-			/* handle ^* at the start of a BRE. */
-			if (!ere && s==ctx->start+1 && s[-1]=='^')
-				break;
+      /* handle ^* at the start of a BRE. */
+      if (!ere && s == ctx->start + 1 && s[-1] == '^') break;
 
-			/* extension: multiple consecutive *+?{,} is unspecified,
-			   but (a+)+ has to be supported so accepting a++ makes
-			   sense, note however that the RE_DUP_MAX limit can be
-			   circumvented: (a{255}){255} uses a lot of memory.. */
-			if (*s=='{') {
-				s = parse_dup(s+1, ere, &min, &max);
-				if (!s)
-					return REG_BADBR;
-			} else {
-				min=0;
-				max=-1;
-				if (*s == '+')
-					min = 1;
-				if (*s == '?')
-					max = 1;
-				s++;
-			}
-			if (max == 0)
-				ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-			else
-				ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
-			if (!ctx->n)
-				return REG_ESPACE;
-		}
+      /* extension: multiple consecutive *+?{,} is unspecified,
+         but (a+)+ has to be supported so accepting a++ makes
+         sense, note however that the RE_DUP_MAX limit can be
+         circumvented: (a{255}){255} uses a lot of memory.. */
+      if (*s == '{') {
+        s = parse_dup(s + 1, ere, &min, &max);
+        if (!s) return REG_BADBR;
+      } else {
+        min = 0;
+        max = -1;
+        if (*s == '+') min = 1;
+        if (*s == '?') max = 1;
+        s++;
+      }
+      if (max == 0)
+        ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+      else
+        ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+      if (!ctx->n) return REG_ESPACE;
+    }
 
-		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
-		if ((ere && *s == '|') ||
-		    (ere && *s == ')' && depth) ||
-		    (!ere && *s == '\\' && s[1] == ')') ||
-		    /* extension: treat \| as alternation in BRE */
-		    (!ere && *s == '\\' && s[1] == '|') ||
-		    !*s) {
-			/* extension: empty branch is unspecified (), (|a), (a|)
-			   here they are not rejected but match on empty string */
-			int c = *s;
-			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
-			nbranch = 0;
+    nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
+    if ((ere && *s == '|') || (ere && *s == ')' && depth) ||
+        (!ere && *s == '\\' && s[1] == ')') ||
+        /* extension: treat \| as alternation in BRE */
+        (!ere && *s == '\\' && s[1] == '|') || !*s) {
+      /* extension: empty branch is unspecified (), (|a), (a|)
+         here they are not rejected but match on empty string */
+      int c = *s;
+      nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
+      nbranch = 0;
 
-			if (c == '\\' && s[1] == '|') {
-				s+=2;
-				ctx->start = s;
-			} else if (c == '|') {
-				s++;
-				ctx->start = s;
-			} else {
-				if (c == '\\') {
-					if (!depth) return REG_EPAREN;
-					s+=2;
-				} else if (c == ')')
-					s++;
-				depth--;
-				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
-				if (err != REG_OK)
-					return err;
-				if (!c && depth<0) {
-					ctx->submatch_id = subid;
-					return REG_OK;
-				}
-				if (!c || depth<0)
-					return REG_EPAREN;
-				nbranch = tre_stack_pop_voidptr(stack);
-				nunion = tre_stack_pop_voidptr(stack);
-				goto parse_iter;
-			}
-		}
-	}
+      if (c == '\\' && s[1] == '|') {
+        s += 2;
+        ctx->start = s;
+      } else if (c == '|') {
+        s++;
+        ctx->start = s;
+      } else {
+        if (c == '\\') {
+          if (!depth) return REG_EPAREN;
+          s += 2;
+        } else if (c == ')')
+          s++;
+        depth--;
+        err = marksub(ctx, nunion, tre_stack_pop_int(stack));
+        if (err != REG_OK) return err;
+        if (!c && depth < 0) {
+          ctx->submatch_id = subid;
+          return REG_OK;
+        }
+        if (!c || depth < 0) return REG_EPAREN;
+        nbranch = tre_stack_pop_voidptr(stack);
+        nunion = tre_stack_pop_voidptr(stack);
+        goto parse_iter;
+      }
+    }
+  }
 }
 
-
 /***********************************************************************
  from tre-compile.c
 ***********************************************************************/
 
-
 /*
   TODO:
    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
@@ -1124,24 +1029,19 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
   Algorithms to setup tags so that submatch addressing can be done.
 */
 
-
 /* Inserts a catenation node to the root of the tree given in `node'.
    As the left child a new tag with number `tag_id' to `node' is added,
    and the right child is the old root. */
-static reg_errcode_t
-tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
-{
+static reg_errcode_t tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node,
+                                      int tag_id) {
   tre_catenation_t *c;
 
   c = tre_mem_alloc(mem, sizeof(*c));
-  if (c == NULL)
-    return REG_ESPACE;
+  if (c == NULL) return REG_ESPACE;
   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
-  if (c->left == NULL)
-    return REG_ESPACE;
+  if (c->left == NULL) return REG_ESPACE;
   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
-  if (c->right == NULL)
-    return REG_ESPACE;
+  if (c->right == NULL) return REG_ESPACE;
 
   c->right->obj = node->obj;
   c->right->type = node->type;
@@ -1159,20 +1059,16 @@ tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
 /* Inserts a catenation node to the root of the tree given in `node'.
    As the right child a new tag with number `tag_id' to `node' is added,
    and the left child is the old root. */
-static reg_errcode_t
-tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
-{
+static reg_errcode_t tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node,
+                                       int tag_id) {
   tre_catenation_t *c;
 
   c = tre_mem_alloc(mem, sizeof(*c));
-  if (c == NULL)
-    return REG_ESPACE;
+  if (c == NULL) return REG_ESPACE;
   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
-  if (c->right == NULL)
-    return REG_ESPACE;
+  if (c->right == NULL) return REG_ESPACE;
   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
-  if (c->left == NULL)
-    return REG_ESPACE;
+  if (c->left == NULL) return REG_ESPACE;
 
   c->left->obj = node->obj;
   c->left->type = node->type;
@@ -1197,39 +1093,31 @@ typedef enum {
   ADDTAGS_SET_SUBMATCH_END
 } tre_addtags_symbol_t;
 
-
 typedef struct {
   int tag;
   int next_tag;
 } tre_tag_states_t;
 
-
 /* Go through `regset' and set submatch data for submatches that are
    using this tag. */
-static void
-tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
-{
+static void tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag) {
   int i;
 
-  for (i = 0; regset[i] >= 0; i++)
-    {
-      int id = regset[i] / 2;
-      int start = !(regset[i] % 2);
-      if (start)
-	tnfa->submatch_data[id].so_tag = tag;
-      else
-	tnfa->submatch_data[id].eo_tag = tag;
-    }
+  for (i = 0; regset[i] >= 0; i++) {
+    int id = regset[i] / 2;
+    int start = !(regset[i] % 2);
+    if (start)
+      tnfa->submatch_data[id].so_tag = tag;
+    else
+      tnfa->submatch_data[id].eo_tag = tag;
+  }
   regset[0] = -1;
 }
 
-
 /* Adds tags to appropriate locations in the parse tree in `tree', so that
    subexpressions marked for submatch addressing can be traced. */
-static reg_errcode_t
-tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
-	     tre_tnfa_t *tnfa)
-{
+static reg_errcode_t tre_add_tags(tre_mem_t mem, tre_stack_t *stack,
+                                  tre_ast_node_t *tree, tre_tnfa_t *tnfa) {
   reg_errcode_t status = REG_OK;
   tre_addtags_symbol_t symbol;
   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
@@ -1237,484 +1125,422 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
   /* True for first pass (counting number of needed tags) */
   int first_pass = (mem == NULL || tnfa == NULL);
   int *regset, *orig_regset;
-  int num_tags = 0; /* Total number of tags. */
-  int num_minimals = 0;	 /* Number of special minimal tags. */
-  int tag = 0;	    /* The tag that is to be added next. */
-  int next_tag = 1; /* Next tag to use after this one. */
-  int *parents;	    /* Stack of submatches the current submatch is
-		       contained in. */
+  int num_tags = 0;     /* Total number of tags. */
+  int num_minimals = 0; /* Number of special minimal tags. */
+  int tag = 0;          /* The tag that is to be added next. */
+  int next_tag = 1;     /* Next tag to use after this one. */
+  int *parents;         /* Stack of submatches the current submatch is
+                           contained in. */
   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
   tre_tag_states_t *saved_states;
 
   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
-  if (!first_pass)
-    {
-      tnfa->end_tag = 0;
-      tnfa->minimal_tags[0] = -1;
-    }
+  if (!first_pass) {
+    tnfa->end_tag = 0;
+    tnfa->minimal_tags[0] = -1;
+  }
 
-  regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
-  if (regset == NULL)
-    return REG_ESPACE;
+  regset = malloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
+  if (regset == NULL) return REG_ESPACE;
   regset[0] = -1;
   orig_regset = regset;
 
-  parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
-  if (parents == NULL)
-    {
-      xfree(regset);
-      return REG_ESPACE;
-    }
+  parents = malloc(sizeof(*parents) * (tnfa->num_submatches + 1));
+  if (parents == NULL) {
+    free(regset), regset = NULL;
+    return REG_ESPACE;
+  }
   parents[0] = -1;
 
-  saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
-  if (saved_states == NULL)
-    {
-      xfree(regset);
-      xfree(parents);
-      return REG_ESPACE;
-    }
-  else
-    {
-      unsigned int i;
-      for (i = 0; i <= tnfa->num_submatches; i++)
-	saved_states[i].tag = -1;
-    }
+  saved_states = malloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
+  if (saved_states == NULL) {
+    free(regset), regset = NULL;
+    free(parents), parents = NULL;
+    return REG_ESPACE;
+  } else {
+    unsigned int i;
+    for (i = 0; i <= tnfa->num_submatches; i++) saved_states[i].tag = -1;
+  }
 
   STACK_PUSH(stack, voidptr, node);
   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
 
-  while (tre_stack_num_objects(stack) > bottom)
-    {
-      if (status != REG_OK)
-	break;
+  while (tre_stack_num_objects(stack) > bottom) {
+    if (status != REG_OK) break;
 
-      symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
-      switch (symbol)
-	{
+    symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
+    switch (symbol) {
+      case ADDTAGS_SET_SUBMATCH_END: {
+        int id = tre_stack_pop_int(stack);
+        int i;
 
-	case ADDTAGS_SET_SUBMATCH_END:
-	  {
-	    int id = tre_stack_pop_int(stack);
-	    int i;
+        /* Add end of this submatch to regset. */
+        for (i = 0; regset[i] >= 0; i++)
+          ;
+        regset[i] = id * 2 + 1;
+        regset[i + 1] = -1;
 
-	    /* Add end of this submatch to regset. */
-	    for (i = 0; regset[i] >= 0; i++);
-	    regset[i] = id * 2 + 1;
-	    regset[i + 1] = -1;
+        /* Pop this submatch from the parents stack. */
+        for (i = 0; parents[i] >= 0; i++)
+          ;
+        parents[i - 1] = -1;
+        break;
+      }
 
-	    /* Pop this submatch from the parents stack. */
-	    for (i = 0; parents[i] >= 0; i++);
-	    parents[i - 1] = -1;
-	    break;
-	  }
+      case ADDTAGS_RECURSE:
+        node = tre_stack_pop_voidptr(stack);
 
-	case ADDTAGS_RECURSE:
-	  node = tre_stack_pop_voidptr(stack);
+        if (node->submatch_id >= 0) {
+          int id = node->submatch_id;
+          int i;
 
-	  if (node->submatch_id >= 0)
-	    {
-	      int id = node->submatch_id;
-	      int i;
+          /* Add start of this submatch to regset. */
+          for (i = 0; regset[i] >= 0; i++)
+            ;
+          regset[i] = id * 2;
+          regset[i + 1] = -1;
 
+          if (!first_pass) {
+            for (i = 0; parents[i] >= 0; i++)
+              ;
+            tnfa->submatch_data[id].parents = NULL;
+            if (i > 0) {
+              int *p = malloc(sizeof(*p) * (i + 1));
+              if (p == NULL) {
+                status = REG_ESPACE;
+                break;
+              }
+              unassert(tnfa->submatch_data[id].parents == NULL);
+              tnfa->submatch_data[id].parents = p;
+              for (i = 0; parents[i] >= 0; i++) p[i] = parents[i];
+              p[i] = -1;
+            }
+          }
 
-	      /* Add start of this submatch to regset. */
-	      for (i = 0; regset[i] >= 0; i++);
-	      regset[i] = id * 2;
-	      regset[i + 1] = -1;
+          /* Add end of this submatch to regset after processing this
+             node. */
+          STACK_PUSHX(stack, int, node->submatch_id);
+          STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
+        }
 
-	      if (!first_pass)
-		{
-		  for (i = 0; parents[i] >= 0; i++);
-		  tnfa->submatch_data[id].parents = NULL;
-		  if (i > 0)
-		    {
-		      int *p = xmalloc(sizeof(*p) * (i + 1));
-		      if (p == NULL)
-			{
-			  status = REG_ESPACE;
-			  break;
-			}
-		      assert(tnfa->submatch_data[id].parents == NULL);
-		      tnfa->submatch_data[id].parents = p;
-		      for (i = 0; parents[i] >= 0; i++)
-			p[i] = parents[i];
-		      p[i] = -1;
-		    }
-		}
+        switch (node->type) {
+          case LITERAL: {
+            tre_literal_t *lit = node->obj;
 
-	      /* Add end of this submatch to regset after processing this
-		 node. */
-	      STACK_PUSHX(stack, int, node->submatch_id);
-	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
-	    }
+            if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) {
+              int i;
+              if (regset[0] >= 0) {
+                /* Regset is not empty, so add a tag before the
+                   literal or backref. */
+                if (!first_pass) {
+                  status = tre_add_tag_left(mem, node, tag);
+                  tnfa->tag_directions[tag] = direction;
+                  if (minimal_tag >= 0) {
+                    for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
+                      ;
+                    tnfa->minimal_tags[i] = tag;
+                    tnfa->minimal_tags[i + 1] = minimal_tag;
+                    tnfa->minimal_tags[i + 2] = -1;
+                    minimal_tag = -1;
+                    num_minimals++;
+                  }
+                  tre_purge_regset(regset, tnfa, tag);
+                } else {
+                  node->num_tags = 1;
+                }
 
-	  switch (node->type)
-	    {
-	    case LITERAL:
-	      {
-		tre_literal_t *lit = node->obj;
+                regset[0] = -1;
+                tag = next_tag;
+                num_tags++;
+                next_tag++;
+              }
+            } else {
+              unassert(!IS_TAG(lit));
+            }
+            break;
+          }
+          case CATENATION: {
+            tre_catenation_t *cat = node->obj;
+            tre_ast_node_t *left = cat->left;
+            tre_ast_node_t *right = cat->right;
+            int reserved_tag = -1;
 
-		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
-		  {
-		    int i;
-		    if (regset[0] >= 0)
-		      {
-			/* Regset is not empty, so add a tag before the
-			   literal or backref. */
-			if (!first_pass)
-			  {
-			    status = tre_add_tag_left(mem, node, tag);
-			    tnfa->tag_directions[tag] = direction;
-			    if (minimal_tag >= 0)
-			      {
-				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-				tnfa->minimal_tags[i] = tag;
-				tnfa->minimal_tags[i + 1] = minimal_tag;
-				tnfa->minimal_tags[i + 2] = -1;
-				minimal_tag = -1;
-				num_minimals++;
-			      }
-			    tre_purge_regset(regset, tnfa, tag);
-			  }
-			else
-			  {
-			    node->num_tags = 1;
-			  }
+            /* After processing right child. */
+            STACK_PUSHX(stack, voidptr, node);
+            STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
 
-			regset[0] = -1;
-			tag = next_tag;
-			num_tags++;
-			next_tag++;
-		      }
-		  }
-		else
-		  {
-		    assert(!IS_TAG(lit));
-		  }
-		break;
-	      }
-	    case CATENATION:
-	      {
-		tre_catenation_t *cat = node->obj;
-		tre_ast_node_t *left = cat->left;
-		tre_ast_node_t *right = cat->right;
-		int reserved_tag = -1;
+            /* Process right child. */
+            STACK_PUSHX(stack, voidptr, right);
+            STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
 
+            /* After processing left child. */
+            STACK_PUSHX(stack, int, next_tag + left->num_tags);
+            if (left->num_tags > 0 && right->num_tags > 0) {
+              /* Reserve the next tag to the right child. */
+              reserved_tag = next_tag;
+              next_tag++;
+            }
+            STACK_PUSHX(stack, int, reserved_tag);
+            STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
 
-		/* After processing right child. */
-		STACK_PUSHX(stack, voidptr, node);
-		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
+            /* Process left child. */
+            STACK_PUSHX(stack, voidptr, left);
+            STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
 
-		/* Process right child. */
-		STACK_PUSHX(stack, voidptr, right);
-		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
+          } break;
+          case ITERATION: {
+            tre_iteration_t *iter = node->obj;
 
-		/* After processing left child. */
-		STACK_PUSHX(stack, int, next_tag + left->num_tags);
-		if (left->num_tags > 0 && right->num_tags > 0)
-		  {
-		    /* Reserve the next tag to the right child. */
-		    reserved_tag = next_tag;
-		    next_tag++;
-		  }
-		STACK_PUSHX(stack, int, reserved_tag);
-		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
+            if (first_pass) {
+              STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
+            } else {
+              STACK_PUSHX(stack, int, tag);
+              STACK_PUSHX(stack, int, iter->minimal);
+            }
+            STACK_PUSHX(stack, voidptr, node);
+            STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
 
-		/* Process left child. */
-		STACK_PUSHX(stack, voidptr, left);
-		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
+            STACK_PUSHX(stack, voidptr, iter->arg);
+            STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
 
-		}
-	      break;
-	    case ITERATION:
-	      {
-		tre_iteration_t *iter = node->obj;
+            /* Regset is not empty, so add a tag here. */
+            if (regset[0] >= 0 || iter->minimal) {
+              if (!first_pass) {
+                int i;
+                status = tre_add_tag_left(mem, node, tag);
+                if (iter->minimal)
+                  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
+                else
+                  tnfa->tag_directions[tag] = direction;
+                if (minimal_tag >= 0) {
+                  for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
+                    ;
+                  tnfa->minimal_tags[i] = tag;
+                  tnfa->minimal_tags[i + 1] = minimal_tag;
+                  tnfa->minimal_tags[i + 2] = -1;
+                  minimal_tag = -1;
+                  num_minimals++;
+                }
+                tre_purge_regset(regset, tnfa, tag);
+              }
 
-		if (first_pass)
-		  {
-		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
-		  }
-		else
-		  {
-		    STACK_PUSHX(stack, int, tag);
-		    STACK_PUSHX(stack, int, iter->minimal);
-		  }
-		STACK_PUSHX(stack, voidptr, node);
-		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
+              regset[0] = -1;
+              tag = next_tag;
+              num_tags++;
+              next_tag++;
+            }
+            direction = TRE_TAG_MINIMIZE;
+          } break;
+          case UNION: {
+            tre_union_t *uni = node->obj;
+            tre_ast_node_t *left = uni->left;
+            tre_ast_node_t *right = uni->right;
+            int left_tag;
+            int right_tag;
 
-		STACK_PUSHX(stack, voidptr, iter->arg);
-		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
+            if (regset[0] >= 0) {
+              left_tag = next_tag;
+              right_tag = next_tag + 1;
+            } else {
+              left_tag = tag;
+              right_tag = next_tag;
+            }
 
-		/* Regset is not empty, so add a tag here. */
-		if (regset[0] >= 0 || iter->minimal)
-		  {
-		    if (!first_pass)
-		      {
-			int i;
-			status = tre_add_tag_left(mem, node, tag);
-			if (iter->minimal)
-			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
-			else
-			  tnfa->tag_directions[tag] = direction;
-			if (minimal_tag >= 0)
-			  {
-			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-			    tnfa->minimal_tags[i] = tag;
-			    tnfa->minimal_tags[i + 1] = minimal_tag;
-			    tnfa->minimal_tags[i + 2] = -1;
-			    minimal_tag = -1;
-			    num_minimals++;
-			  }
-			tre_purge_regset(regset, tnfa, tag);
-		      }
+            /* After processing right child. */
+            STACK_PUSHX(stack, int, right_tag);
+            STACK_PUSHX(stack, int, left_tag);
+            STACK_PUSHX(stack, voidptr, regset);
+            STACK_PUSHX(stack, int, regset[0] >= 0);
+            STACK_PUSHX(stack, voidptr, node);
+            STACK_PUSHX(stack, voidptr, right);
+            STACK_PUSHX(stack, voidptr, left);
+            STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
 
-		    regset[0] = -1;
-		    tag = next_tag;
-		    num_tags++;
-		    next_tag++;
-		  }
-		direction = TRE_TAG_MINIMIZE;
-	      }
-	      break;
-	    case UNION:
-	      {
-		tre_union_t *uni = node->obj;
-		tre_ast_node_t *left = uni->left;
-		tre_ast_node_t *right = uni->right;
-		int left_tag;
-		int right_tag;
+            /* Process right child. */
+            STACK_PUSHX(stack, voidptr, right);
+            STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
 
-		if (regset[0] >= 0)
-		  {
-		    left_tag = next_tag;
-		    right_tag = next_tag + 1;
-		  }
-		else
-		  {
-		    left_tag = tag;
-		    right_tag = next_tag;
-		  }
+            /* After processing left child. */
+            STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
 
-		/* After processing right child. */
-		STACK_PUSHX(stack, int, right_tag);
-		STACK_PUSHX(stack, int, left_tag);
-		STACK_PUSHX(stack, voidptr, regset);
-		STACK_PUSHX(stack, int, regset[0] >= 0);
-		STACK_PUSHX(stack, voidptr, node);
-		STACK_PUSHX(stack, voidptr, right);
-		STACK_PUSHX(stack, voidptr, left);
-		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
+            /* Process left child. */
+            STACK_PUSHX(stack, voidptr, left);
+            STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
 
-		/* Process right child. */
-		STACK_PUSHX(stack, voidptr, right);
-		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
+            /* Regset is not empty, so add a tag here. */
+            if (regset[0] >= 0) {
+              if (!first_pass) {
+                int i;
+                status = tre_add_tag_left(mem, node, tag);
+                tnfa->tag_directions[tag] = direction;
+                if (minimal_tag >= 0) {
+                  for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
+                    ;
+                  tnfa->minimal_tags[i] = tag;
+                  tnfa->minimal_tags[i + 1] = minimal_tag;
+                  tnfa->minimal_tags[i + 2] = -1;
+                  minimal_tag = -1;
+                  num_minimals++;
+                }
+                tre_purge_regset(regset, tnfa, tag);
+              }
 
-		/* After processing left child. */
-		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
+              regset[0] = -1;
+              tag = next_tag;
+              num_tags++;
+              next_tag++;
+            }
 
-		/* Process left child. */
-		STACK_PUSHX(stack, voidptr, left);
-		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
+            if (node->num_submatches > 0) {
+              /* The next two tags are reserved for markers. */
+              next_tag++;
+              tag = next_tag;
+              next_tag++;
+            }
 
-		/* Regset is not empty, so add a tag here. */
-		if (regset[0] >= 0)
-		  {
-		    if (!first_pass)
-		      {
-			int i;
-			status = tre_add_tag_left(mem, node, tag);
-			tnfa->tag_directions[tag] = direction;
-			if (minimal_tag >= 0)
-			  {
-			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-			    tnfa->minimal_tags[i] = tag;
-			    tnfa->minimal_tags[i + 1] = minimal_tag;
-			    tnfa->minimal_tags[i + 2] = -1;
-			    minimal_tag = -1;
-			    num_minimals++;
-			  }
-			tre_purge_regset(regset, tnfa, tag);
-		      }
+            break;
+          }
+        }
 
-		    regset[0] = -1;
-		    tag = next_tag;
-		    num_tags++;
-		    next_tag++;
-		  }
+        if (node->submatch_id >= 0) {
+          int i;
+          /* Push this submatch on the parents stack. */
+          for (i = 0; parents[i] >= 0; i++)
+            ;
+          parents[i] = node->submatch_id;
+          parents[i + 1] = -1;
+        }
 
-		if (node->num_submatches > 0)
-		  {
-		    /* The next two tags are reserved for markers. */
-		    next_tag++;
-		    tag = next_tag;
-		    next_tag++;
-		  }
+        break; /* end case: ADDTAGS_RECURSE */
 
-		break;
-	      }
-	    }
+      case ADDTAGS_AFTER_ITERATION: {
+        int minimal = 0;
+        int enter_tag;
+        node = tre_stack_pop_voidptr(stack);
+        if (first_pass) {
+          node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags +
+                           tre_stack_pop_int(stack);
+          minimal_tag = -1;
+        } else {
+          minimal = tre_stack_pop_int(stack);
+          enter_tag = tre_stack_pop_int(stack);
+          if (minimal) minimal_tag = enter_tag;
+        }
 
-	  if (node->submatch_id >= 0)
-	    {
-	      int i;
-	      /* Push this submatch on the parents stack. */
-	      for (i = 0; parents[i] >= 0; i++);
-	      parents[i] = node->submatch_id;
-	      parents[i + 1] = -1;
-	    }
+        if (!first_pass) {
+          if (minimal)
+            direction = TRE_TAG_MINIMIZE;
+          else
+            direction = TRE_TAG_MAXIMIZE;
+        }
+        break;
+      }
 
-	  break; /* end case: ADDTAGS_RECURSE */
+      case ADDTAGS_AFTER_CAT_LEFT: {
+        int new_tag = tre_stack_pop_int(stack);
+        next_tag = tre_stack_pop_int(stack);
+        if (new_tag >= 0) {
+          tag = new_tag;
+        }
+        break;
+      }
 
-	case ADDTAGS_AFTER_ITERATION:
-	  {
-	    int minimal = 0;
-	    int enter_tag;
-	    node = tre_stack_pop_voidptr(stack);
-	    if (first_pass)
-	      {
-		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
-		  + tre_stack_pop_int(stack);
-		minimal_tag = -1;
-	      }
-	    else
-	      {
-		minimal = tre_stack_pop_int(stack);
-		enter_tag = tre_stack_pop_int(stack);
-		if (minimal)
-		  minimal_tag = enter_tag;
-	      }
+      case ADDTAGS_AFTER_CAT_RIGHT:
+        node = tre_stack_pop_voidptr(stack);
+        if (first_pass)
+          node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags +
+                           ((tre_catenation_t *)node->obj)->right->num_tags;
+        break;
 
-	    if (!first_pass)
-	      {
-		if (minimal)
-		  direction = TRE_TAG_MINIMIZE;
-		else
-		  direction = TRE_TAG_MAXIMIZE;
-	      }
-	    break;
-	  }
+      case ADDTAGS_AFTER_UNION_LEFT:
+        /* Lift the bottom of the `regset' array so that when processing
+           the right operand the items currently in the array are
+           invisible.	 The original bottom was saved at ADDTAGS_UNION and
+           will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
+        while (*regset >= 0) regset++;
+        break;
 
-	case ADDTAGS_AFTER_CAT_LEFT:
-	  {
-	    int new_tag = tre_stack_pop_int(stack);
-	    next_tag = tre_stack_pop_int(stack);
-	    if (new_tag >= 0)
-	      {
-		tag = new_tag;
-	      }
-	    break;
-	  }
+      case ADDTAGS_AFTER_UNION_RIGHT: {
+        int added_tags, tag_left, tag_right;
+        tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
+        tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
+        node = tre_stack_pop_voidptr(stack);
+        added_tags = tre_stack_pop_int(stack);
+        if (first_pass) {
+          node->num_tags = ((tre_union_t *)node->obj)->left->num_tags +
+                           ((tre_union_t *)node->obj)->right->num_tags +
+                           added_tags + ((node->num_submatches > 0) ? 2 : 0);
+        }
+        regset = tre_stack_pop_voidptr(stack);
+        tag_left = tre_stack_pop_int(stack);
+        tag_right = tre_stack_pop_int(stack);
 
-	case ADDTAGS_AFTER_CAT_RIGHT:
-	  node = tre_stack_pop_voidptr(stack);
-	  if (first_pass)
-	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
-	      + ((tre_catenation_t *)node->obj)->right->num_tags;
-	  break;
+        /* Add tags after both children, the left child gets a smaller
+           tag than the right child.  This guarantees that we prefer
+           the left child over the right child. */
+        /* XXX - This is not always necessary (if the children have
+           tags which must be seen for every match of that child). */
+        /* XXX - Check if this is the only place where tre_add_tag_right
+           is used.	 If so, use tre_add_tag_left (putting the tag before
+           the child as opposed after the child) and throw away
+           tre_add_tag_right. */
+        if (node->num_submatches > 0) {
+          if (!first_pass) {
+            status = tre_add_tag_right(mem, left, tag_left);
+            tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
+            if (status == REG_OK)
+              status = tre_add_tag_right(mem, right, tag_right);
+            tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
+          }
+          num_tags += 2;
+        }
+        direction = TRE_TAG_MAXIMIZE;
+        break;
+      }
 
-	case ADDTAGS_AFTER_UNION_LEFT:
-	  /* Lift the bottom of the `regset' array so that when processing
-	     the right operand the items currently in the array are
-	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
-	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
-	  while (*regset >= 0)
-	    regset++;
-	  break;
+      default:
+        __builtin_unreachable();
 
-	case ADDTAGS_AFTER_UNION_RIGHT:
-	  {
-	    int added_tags, tag_left, tag_right;
-	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
-	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
-	    node = tre_stack_pop_voidptr(stack);
-	    added_tags = tre_stack_pop_int(stack);
-	    if (first_pass)
-	      {
-		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
-		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
-		  + ((node->num_submatches > 0) ? 2 : 0);
-	      }
-	    regset = tre_stack_pop_voidptr(stack);
-	    tag_left = tre_stack_pop_int(stack);
-	    tag_right = tre_stack_pop_int(stack);
+    } /* end switch(symbol) */
+  }   /* end while(tre_stack_num_objects(stack) > bottom) */
 
-	    /* Add tags after both children, the left child gets a smaller
-	       tag than the right child.  This guarantees that we prefer
-	       the left child over the right child. */
-	    /* XXX - This is not always necessary (if the children have
-	       tags which must be seen for every match of that child). */
-	    /* XXX - Check if this is the only place where tre_add_tag_right
-	       is used.	 If so, use tre_add_tag_left (putting the tag before
-	       the child as opposed after the child) and throw away
-	       tre_add_tag_right. */
-	    if (node->num_submatches > 0)
-	      {
-		if (!first_pass)
-		  {
-		    status = tre_add_tag_right(mem, left, tag_left);
-		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
-		    if (status == REG_OK)
-		      status = tre_add_tag_right(mem, right, tag_right);
-		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
-		  }
-		num_tags += 2;
-	      }
-	    direction = TRE_TAG_MAXIMIZE;
-	    break;
-	  }
+  if (!first_pass) tre_purge_regset(regset, tnfa, tag);
 
-	default:
-	  assert(0);
-	  break;
+  if (!first_pass && minimal_tag >= 0) {
+    int i;
+    for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
+      ;
+    tnfa->minimal_tags[i] = tag;
+    tnfa->minimal_tags[i + 1] = minimal_tag;
+    tnfa->minimal_tags[i + 2] = -1;
+    minimal_tag = -1;
+    num_minimals++;
+  }
 
-	} /* end switch(symbol) */
-    } /* end while(tre_stack_num_objects(stack) > bottom) */
-
-  if (!first_pass)
-    tre_purge_regset(regset, tnfa, tag);
-
-  if (!first_pass && minimal_tag >= 0)
-    {
-      int i;
-      for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-      tnfa->minimal_tags[i] = tag;
-      tnfa->minimal_tags[i + 1] = minimal_tag;
-      tnfa->minimal_tags[i + 2] = -1;
-      minimal_tag = -1;
-      num_minimals++;
-    }
-
-  assert(tree->num_tags == num_tags);
+  unassert(tree->num_tags == num_tags);
   tnfa->end_tag = num_tags;
   tnfa->num_tags = num_tags;
   tnfa->num_minimals = num_minimals;
-  xfree(orig_regset);
-  xfree(parents);
-  xfree(saved_states);
+  free(orig_regset), orig_regset = NULL;
+  free(parents), parents = NULL;
+  free(saved_states), saved_states = NULL;
   return status;
 }
 
-
-
 /*
   AST to TNFA compilation routines.
 */
 
-typedef enum {
-  COPY_RECURSE,
-  COPY_SET_RESULT_PTR
-} tre_copyast_symbol_t;
+typedef enum { COPY_RECURSE, COPY_SET_RESULT_PTR } tre_copyast_symbol_t;
 
 /* Flags for tre_copy_ast(). */
-#define COPY_REMOVE_TAGS	 1
-#define COPY_MAXIMIZE_FIRST_TAG	 2
+#define COPY_REMOVE_TAGS        1
+#define COPY_MAXIMIZE_FIRST_TAG 2
 
-static reg_errcode_t
-tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
-	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
-	     tre_ast_node_t **copy, int *max_pos)
-{
+static reg_errcode_t tre_copy_ast(tre_mem_t mem, tre_stack_t *stack,
+                                  tre_ast_node_t *ast, int flags, int *pos_add,
+                                  tre_tag_direction_t *tag_directions,
+                                  tre_ast_node_t **copy, int *max_pos) {
   reg_errcode_t status = REG_OK;
   int bottom = tre_stack_num_objects(stack);
   int num_copied = 0;
@@ -1725,143 +1551,121 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
   STACK_PUSH(stack, voidptr, ast);
   STACK_PUSH(stack, int, COPY_RECURSE);
 
-  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
-    {
-      tre_ast_node_t *node;
-      if (status != REG_OK)
-	break;
+  while (status == REG_OK && tre_stack_num_objects(stack) > bottom) {
+    tre_ast_node_t *node;
+    if (status != REG_OK) break;
 
-      symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
-      switch (symbol)
-	{
-	case COPY_SET_RESULT_PTR:
-	  result = tre_stack_pop_voidptr(stack);
-	  break;
-	case COPY_RECURSE:
-	  node = tre_stack_pop_voidptr(stack);
-	  switch (node->type)
-	    {
-	    case LITERAL:
-	      {
-		tre_literal_t *lit = node->obj;
-		int pos = lit->position;
-		int min = lit->code_min;
-		int max = lit->code_max;
-		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
-		  {
-		    /* XXX - e.g. [ab] has only one position but two
-		       nodes, so we are creating holes in the state space
-		       here.  Not fatal, just wastes memory. */
-		    pos += *pos_add;
-		    num_copied++;
-		  }
-		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
-		  {
-		    /* Change this tag to empty. */
-		    min = EMPTY;
-		    max = pos = -1;
-		  }
-		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
-			 && first_tag)
-		  {
-		    /* Maximize the first tag. */
-		    tag_directions[max] = TRE_TAG_MAXIMIZE;
-		    first_tag = 0;
-		  }
-		*result = tre_ast_new_literal(mem, min, max, pos);
-		if (*result == NULL)
-		  status = REG_ESPACE;
-		else {
-		  tre_literal_t *p = (*result)->obj;
-		  p->class = lit->class;
-		  p->neg_classes = lit->neg_classes;
-		}
+    symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
+    switch (symbol) {
+      case COPY_SET_RESULT_PTR:
+        result = tre_stack_pop_voidptr(stack);
+        break;
+      case COPY_RECURSE:
+        node = tre_stack_pop_voidptr(stack);
+        switch (node->type) {
+          case LITERAL: {
+            tre_literal_t *lit = node->obj;
+            int pos = lit->position;
+            int min = lit->code_min;
+            int max = lit->code_max;
+            if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) {
+              /* XXX - e.g. [ab] has only one position but two
+                 nodes, so we are creating holes in the state space
+                 here.  Not fatal, just wastes memory. */
+              pos += *pos_add;
+              num_copied++;
+            } else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS)) {
+              /* Change this tag to empty. */
+              min = EMPTY;
+              max = pos = -1;
+            } else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG) &&
+                       first_tag) {
+              /* Maximize the first tag. */
+              tag_directions[max] = TRE_TAG_MAXIMIZE;
+              first_tag = 0;
+            }
+            *result = tre_ast_new_literal(mem, min, max, pos);
+            if (*result == NULL)
+              status = REG_ESPACE;
+            else {
+              tre_literal_t *p = (*result)->obj;
+              p->class = lit->class;
+              p->neg_classes = lit->neg_classes;
+            }
 
-		if (pos > *max_pos)
-		  *max_pos = pos;
-		break;
-	      }
-	    case UNION:
-	      {
-		tre_union_t *uni = node->obj;
-		tre_union_t *tmp;
-		*result = tre_ast_new_union(mem, uni->left, uni->right);
-		if (*result == NULL)
-		  {
-		    status = REG_ESPACE;
-		    break;
-		  }
-		tmp = (*result)->obj;
-		result = &tmp->left;
-		STACK_PUSHX(stack, voidptr, uni->right);
-		STACK_PUSHX(stack, int, COPY_RECURSE);
-		STACK_PUSHX(stack, voidptr, &tmp->right);
-		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
-		STACK_PUSHX(stack, voidptr, uni->left);
-		STACK_PUSHX(stack, int, COPY_RECURSE);
-		break;
-	      }
-	    case CATENATION:
-	      {
-		tre_catenation_t *cat = node->obj;
-		tre_catenation_t *tmp;
-		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
-		if (*result == NULL)
-		  {
-		    status = REG_ESPACE;
-		    break;
-		  }
-		tmp = (*result)->obj;
-		tmp->left = NULL;
-		tmp->right = NULL;
-		result = &tmp->left;
+            if (pos > *max_pos) *max_pos = pos;
+            break;
+          }
+          case UNION: {
+            tre_union_t *uni = node->obj;
+            tre_union_t *tmp;
+            *result = tre_ast_new_union(mem, uni->left, uni->right);
+            if (*result == NULL) {
+              status = REG_ESPACE;
+              break;
+            }
+            tmp = (*result)->obj;
+            result = &tmp->left;
+            STACK_PUSHX(stack, voidptr, uni->right);
+            STACK_PUSHX(stack, int, COPY_RECURSE);
+            STACK_PUSHX(stack, voidptr, &tmp->right);
+            STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
+            STACK_PUSHX(stack, voidptr, uni->left);
+            STACK_PUSHX(stack, int, COPY_RECURSE);
+            break;
+          }
+          case CATENATION: {
+            tre_catenation_t *cat = node->obj;
+            tre_catenation_t *tmp;
+            *result = tre_ast_new_catenation(mem, cat->left, cat->right);
+            if (*result == NULL) {
+              status = REG_ESPACE;
+              break;
+            }
+            tmp = (*result)->obj;
+            tmp->left = NULL;
+            tmp->right = NULL;
+            result = &tmp->left;
 
-		STACK_PUSHX(stack, voidptr, cat->right);
-		STACK_PUSHX(stack, int, COPY_RECURSE);
-		STACK_PUSHX(stack, voidptr, &tmp->right);
-		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
-		STACK_PUSHX(stack, voidptr, cat->left);
-		STACK_PUSHX(stack, int, COPY_RECURSE);
-		break;
-	      }
-	    case ITERATION:
-	      {
-		tre_iteration_t *iter = node->obj;
-		STACK_PUSHX(stack, voidptr, iter->arg);
-		STACK_PUSHX(stack, int, COPY_RECURSE);
-		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
-					   iter->max, iter->minimal);
-		if (*result == NULL)
-		  {
-		    status = REG_ESPACE;
-		    break;
-		  }
-		iter = (*result)->obj;
-		result = &iter->arg;
-		break;
-	      }
-	    default:
-	      assert(0);
-	      break;
-	    }
-	  break;
-	}
+            STACK_PUSHX(stack, voidptr, cat->right);
+            STACK_PUSHX(stack, int, COPY_RECURSE);
+            STACK_PUSHX(stack, voidptr, &tmp->right);
+            STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
+            STACK_PUSHX(stack, voidptr, cat->left);
+            STACK_PUSHX(stack, int, COPY_RECURSE);
+            break;
+          }
+          case ITERATION: {
+            tre_iteration_t *iter = node->obj;
+            STACK_PUSHX(stack, voidptr, iter->arg);
+            STACK_PUSHX(stack, int, COPY_RECURSE);
+            *result = tre_ast_new_iter(mem, iter->arg, iter->min, iter->max,
+                                       iter->minimal);
+            if (*result == NULL) {
+              status = REG_ESPACE;
+              break;
+            }
+            iter = (*result)->obj;
+            result = &iter->arg;
+            break;
+          }
+          default:
+            __builtin_unreachable();
+        }
+        break;
     }
+  }
   *pos_add += num_copied;
   return status;
 }
 
-typedef enum {
-  EXPAND_RECURSE,
-  EXPAND_AFTER_ITER
-} tre_expand_ast_symbol_t;
+typedef enum { EXPAND_RECURSE, EXPAND_AFTER_ITER } tre_expand_ast_symbol_t;
 
 /* Expands each iteration node that has a finite nonzero minimum or maximum
    iteration count to a catenated sequence of copies of the node. */
-static reg_errcode_t
-tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
-	       int *position, tre_tag_direction_t *tag_directions)
-{
+static reg_errcode_t tre_expand_ast(tre_mem_t mem, tre_stack_t *stack,
+                                    tre_ast_node_t *ast, int *position,
+                                    tre_tag_direction_t *tag_directions) {
   reg_errcode_t status = REG_OK;
   int bottom = tre_stack_num_objects(stack);
   int pos_add = 0;
@@ -1871,165 +1675,133 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
 
   STACK_PUSHR(stack, voidptr, ast);
   STACK_PUSHR(stack, int, EXPAND_RECURSE);
-  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
-    {
-      tre_ast_node_t *node;
-      tre_expand_ast_symbol_t symbol;
+  while (status == REG_OK && tre_stack_num_objects(stack) > bottom) {
+    tre_ast_node_t *node;
+    tre_expand_ast_symbol_t symbol;
 
-      if (status != REG_OK)
-	break;
+    if (status != REG_OK) break;
 
-      symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
-      node = tre_stack_pop_voidptr(stack);
-      switch (symbol)
-	{
-	case EXPAND_RECURSE:
-	  switch (node->type)
-	    {
-	    case LITERAL:
-	      {
-		tre_literal_t *lit= node->obj;
-		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
-		  {
-		    lit->position += pos_add;
-		    if (lit->position > max_pos)
-		      max_pos = lit->position;
-		  }
-		break;
-	      }
-	    case UNION:
-	      {
-		tre_union_t *uni = node->obj;
-		STACK_PUSHX(stack, voidptr, uni->right);
-		STACK_PUSHX(stack, int, EXPAND_RECURSE);
-		STACK_PUSHX(stack, voidptr, uni->left);
-		STACK_PUSHX(stack, int, EXPAND_RECURSE);
-		break;
-	      }
-	    case CATENATION:
-	      {
-		tre_catenation_t *cat = node->obj;
-		STACK_PUSHX(stack, voidptr, cat->right);
-		STACK_PUSHX(stack, int, EXPAND_RECURSE);
-		STACK_PUSHX(stack, voidptr, cat->left);
-		STACK_PUSHX(stack, int, EXPAND_RECURSE);
-		break;
-	      }
-	    case ITERATION:
-	      {
-		tre_iteration_t *iter = node->obj;
-		STACK_PUSHX(stack, int, pos_add);
-		STACK_PUSHX(stack, voidptr, node);
-		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
-		STACK_PUSHX(stack, voidptr, iter->arg);
-		STACK_PUSHX(stack, int, EXPAND_RECURSE);
-		/* If we are going to expand this node at EXPAND_AFTER_ITER
-		   then don't increase the `pos' fields of the nodes now, it
-		   will get done when expanding. */
-		if (iter->min > 1 || iter->max > 1)
-		  pos_add = 0;
-		iter_depth++;
-		break;
-	      }
-	    default:
-	      assert(0);
-	      break;
-	    }
-	  break;
-	case EXPAND_AFTER_ITER:
-	  {
-	    tre_iteration_t *iter = node->obj;
-	    int pos_add_last;
-	    pos_add = tre_stack_pop_int(stack);
-	    pos_add_last = pos_add;
-	    if (iter->min > 1 || iter->max > 1)
-	      {
-		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
-		int j;
-		int pos_add_save = pos_add;
+    symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
+    node = tre_stack_pop_voidptr(stack);
+    switch (symbol) {
+      case EXPAND_RECURSE:
+        switch (node->type) {
+          case LITERAL: {
+            tre_literal_t *lit = node->obj;
+            if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) {
+              lit->position += pos_add;
+              if (lit->position > max_pos) max_pos = lit->position;
+            }
+            break;
+          }
+          case UNION: {
+            tre_union_t *uni = node->obj;
+            STACK_PUSHX(stack, voidptr, uni->right);
+            STACK_PUSHX(stack, int, EXPAND_RECURSE);
+            STACK_PUSHX(stack, voidptr, uni->left);
+            STACK_PUSHX(stack, int, EXPAND_RECURSE);
+            break;
+          }
+          case CATENATION: {
+            tre_catenation_t *cat = node->obj;
+            STACK_PUSHX(stack, voidptr, cat->right);
+            STACK_PUSHX(stack, int, EXPAND_RECURSE);
+            STACK_PUSHX(stack, voidptr, cat->left);
+            STACK_PUSHX(stack, int, EXPAND_RECURSE);
+            break;
+          }
+          case ITERATION: {
+            tre_iteration_t *iter = node->obj;
+            STACK_PUSHX(stack, int, pos_add);
+            STACK_PUSHX(stack, voidptr, node);
+            STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
+            STACK_PUSHX(stack, voidptr, iter->arg);
+            STACK_PUSHX(stack, int, EXPAND_RECURSE);
+            /* If we are going to expand this node at EXPAND_AFTER_ITER
+               then don't increase the `pos' fields of the nodes now, it
+               will get done when expanding. */
+            if (iter->min > 1 || iter->max > 1) pos_add = 0;
+            iter_depth++;
+            break;
+          }
+          default:
+            __builtin_unreachable();
+        }
+        break;
+      case EXPAND_AFTER_ITER: {
+        tre_iteration_t *iter = node->obj;
+        int pos_add_last;
+        pos_add = tre_stack_pop_int(stack);
+        pos_add_last = pos_add;
+        if (iter->min > 1 || iter->max > 1) {
+          tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
+          int j;
+          int pos_add_save = pos_add;
 
-		/* Create a catenated sequence of copies of the node. */
-		for (j = 0; j < iter->min; j++)
-		  {
-		    tre_ast_node_t *copy;
-		    /* Remove tags from all but the last copy. */
-		    int flags = ((j + 1 < iter->min)
-				 ? COPY_REMOVE_TAGS
-				 : COPY_MAXIMIZE_FIRST_TAG);
-		    pos_add_save = pos_add;
-		    status = tre_copy_ast(mem, stack, iter->arg, flags,
-					  &pos_add, tag_directions, &copy,
-					  &max_pos);
-		    if (status != REG_OK)
-		      return status;
-		    if (seq1 != NULL)
-		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
-		    else
-		      seq1 = copy;
-		    if (seq1 == NULL)
-		      return REG_ESPACE;
-		  }
+          /* Create a catenated sequence of copies of the node. */
+          for (j = 0; j < iter->min; j++) {
+            tre_ast_node_t *copy;
+            /* Remove tags from all but the last copy. */
+            int flags = ((j + 1 < iter->min) ? COPY_REMOVE_TAGS
+                                             : COPY_MAXIMIZE_FIRST_TAG);
+            pos_add_save = pos_add;
+            status = tre_copy_ast(mem, stack, iter->arg, flags, &pos_add,
+                                  tag_directions, &copy, &max_pos);
+            if (status != REG_OK) return status;
+            if (seq1 != NULL)
+              seq1 = tre_ast_new_catenation(mem, seq1, copy);
+            else
+              seq1 = copy;
+            if (seq1 == NULL) return REG_ESPACE;
+          }
 
-		if (iter->max == -1)
-		  {
-		    /* No upper limit. */
-		    pos_add_save = pos_add;
-		    status = tre_copy_ast(mem, stack, iter->arg, 0,
-					  &pos_add, NULL, &seq2, &max_pos);
-		    if (status != REG_OK)
-		      return status;
-		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
-		    if (seq2 == NULL)
-		      return REG_ESPACE;
-		  }
-		else
-		  {
-		    for (j = iter->min; j < iter->max; j++)
-		      {
-			tre_ast_node_t *tmp, *copy;
-			pos_add_save = pos_add;
-			status = tre_copy_ast(mem, stack, iter->arg, 0,
-					      &pos_add, NULL, &copy, &max_pos);
-			if (status != REG_OK)
-			  return status;
-			if (seq2 != NULL)
-			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
-			else
-			  seq2 = copy;
-			if (seq2 == NULL)
-			  return REG_ESPACE;
-			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
-			if (tmp == NULL)
-			  return REG_ESPACE;
-			seq2 = tre_ast_new_union(mem, tmp, seq2);
-			if (seq2 == NULL)
-			  return REG_ESPACE;
-		      }
-		  }
+          if (iter->max == -1) {
+            /* No upper limit. */
+            pos_add_save = pos_add;
+            status = tre_copy_ast(mem, stack, iter->arg, 0, &pos_add, NULL,
+                                  &seq2, &max_pos);
+            if (status != REG_OK) return status;
+            seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
+            if (seq2 == NULL) return REG_ESPACE;
+          } else {
+            for (j = iter->min; j < iter->max; j++) {
+              tre_ast_node_t *tmp, *copy;
+              pos_add_save = pos_add;
+              status = tre_copy_ast(mem, stack, iter->arg, 0, &pos_add, NULL,
+                                    &copy, &max_pos);
+              if (status != REG_OK) return status;
+              if (seq2 != NULL)
+                seq2 = tre_ast_new_catenation(mem, copy, seq2);
+              else
+                seq2 = copy;
+              if (seq2 == NULL) return REG_ESPACE;
+              tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
+              if (tmp == NULL) return REG_ESPACE;
+              seq2 = tre_ast_new_union(mem, tmp, seq2);
+              if (seq2 == NULL) return REG_ESPACE;
+            }
+          }
 
-		pos_add = pos_add_save;
-		if (seq1 == NULL)
-		  seq1 = seq2;
-		else if (seq2 != NULL)
-		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
-		if (seq1 == NULL)
-		  return REG_ESPACE;
-		node->obj = seq1->obj;
-		node->type = seq1->type;
-	      }
+          pos_add = pos_add_save;
+          if (seq1 == NULL)
+            seq1 = seq2;
+          else if (seq2 != NULL)
+            seq1 = tre_ast_new_catenation(mem, seq1, seq2);
+          if (seq1 == NULL) return REG_ESPACE;
+          node->obj = seq1->obj;
+          node->type = seq1->type;
+        }
 
-	    iter_depth--;
-	    pos_add_total += pos_add - pos_add_last;
-	    if (iter_depth == 0)
-	      pos_add = pos_add_total;
+        iter_depth--;
+        pos_add_total += pos_add - pos_add_last;
+        if (iter_depth == 0) pos_add = pos_add_total;
 
-	    break;
-	  }
-	default:
-	  assert(0);
-	  break;
-	}
+        break;
+      }
+      default:
+        __builtin_unreachable();
     }
+  }
 
   *position += pos_add_total;
 
@@ -2037,20 +1809,16 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
      code works, but just an extra safeguard let's make sure
      `*position' is set large enough so enough memory will be
      allocated for the transition table. */
-  if (max_pos > *position)
-    *position = max_pos;
+  if (max_pos > *position) *position = max_pos;
 
   return status;
 }
 
-static tre_pos_and_tags_t *
-tre_set_empty(tre_mem_t mem)
-{
+static tre_pos_and_tags_t *tre_set_empty(tre_mem_t mem) {
   tre_pos_and_tags_t *new_set;
 
   new_set = tre_mem_calloc(mem, sizeof(*new_set));
-  if (new_set == NULL)
-    return NULL;
+  if (new_set == NULL) return NULL;
 
   new_set[0].position = -1;
   new_set[0].code_min = -1;
@@ -2059,15 +1827,14 @@ tre_set_empty(tre_mem_t mem)
   return new_set;
 }
 
-static tre_pos_and_tags_t *
-tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
-	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
-{
+static tre_pos_and_tags_t *tre_set_one(tre_mem_t mem, int position,
+                                       int code_min, int code_max,
+                                       tre_ctype_t class,
+                                       tre_ctype_t *neg_classes, int backref) {
   tre_pos_and_tags_t *new_set;
 
   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
-  if (new_set == NULL)
-    return NULL;
+  if (new_set == NULL) return NULL;
 
   new_set[0].position = position;
   new_set[0].code_min = code_min;
@@ -2082,73 +1849,67 @@ tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
   return new_set;
 }
 
-static tre_pos_and_tags_t *
-tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
-	      int *tags, int assertions)
-{
+static tre_pos_and_tags_t *tre_set_union(tre_mem_t mem,
+                                         tre_pos_and_tags_t *set1,
+                                         tre_pos_and_tags_t *set2, int *tags,
+                                         int assertions) {
   int s1, s2, i, j;
   tre_pos_and_tags_t *new_set;
   int *new_tags;
   int num_tags;
 
-  for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
-  for (s1 = 0; set1[s1].position >= 0; s1++);
-  for (s2 = 0; set2[s2].position >= 0; s2++);
-  new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
-  if (!new_set )
-    return NULL;
-
+  for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++)
+    ;
   for (s1 = 0; set1[s1].position >= 0; s1++)
-    {
-      new_set[s1].position = set1[s1].position;
-      new_set[s1].code_min = set1[s1].code_min;
-      new_set[s1].code_max = set1[s1].code_max;
-      new_set[s1].assertions = set1[s1].assertions | assertions;
-      new_set[s1].class = set1[s1].class;
-      new_set[s1].neg_classes = set1[s1].neg_classes;
-      new_set[s1].backref = set1[s1].backref;
-      if (set1[s1].tags == NULL && tags == NULL)
-	new_set[s1].tags = NULL;
-      else
-	{
-	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
-	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
-					 * (i + num_tags + 1)));
-	  if (new_tags == NULL)
-	    return NULL;
-	  for (j = 0; j < i; j++)
-	    new_tags[j] = set1[s1].tags[j];
-	  for (i = 0; i < num_tags; i++)
-	    new_tags[j + i] = tags[i];
-	  new_tags[j + i] = -1;
-	  new_set[s1].tags = new_tags;
-	}
-    }
-
+    ;
   for (s2 = 0; set2[s2].position >= 0; s2++)
-    {
-      new_set[s1 + s2].position = set2[s2].position;
-      new_set[s1 + s2].code_min = set2[s2].code_min;
-      new_set[s1 + s2].code_max = set2[s2].code_max;
-      /* XXX - why not | assertions here as well? */
-      new_set[s1 + s2].assertions = set2[s2].assertions;
-      new_set[s1 + s2].class = set2[s2].class;
-      new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
-      new_set[s1 + s2].backref = set2[s2].backref;
-      if (set2[s2].tags == NULL)
-	new_set[s1 + s2].tags = NULL;
-      else
-	{
-	  for (i = 0; set2[s2].tags[i] >= 0; i++);
-	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
-	  if (new_tags == NULL)
-	    return NULL;
-	  for (j = 0; j < i; j++)
-	    new_tags[j] = set2[s2].tags[j];
-	  new_tags[j] = -1;
-	  new_set[s1 + s2].tags = new_tags;
-	}
+    ;
+  new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
+  if (!new_set) return NULL;
+
+  for (s1 = 0; set1[s1].position >= 0; s1++) {
+    new_set[s1].position = set1[s1].position;
+    new_set[s1].code_min = set1[s1].code_min;
+    new_set[s1].code_max = set1[s1].code_max;
+    new_set[s1].assertions = set1[s1].assertions | assertions;
+    new_set[s1].class = set1[s1].class;
+    new_set[s1].neg_classes = set1[s1].neg_classes;
+    new_set[s1].backref = set1[s1].backref;
+    if (set1[s1].tags == NULL && tags == NULL)
+      new_set[s1].tags = NULL;
+    else {
+      for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++)
+        ;
+      new_tags = tre_mem_alloc(mem, (sizeof(*new_tags) * (i + num_tags + 1)));
+      if (new_tags == NULL) return NULL;
+      for (j = 0; j < i; j++) new_tags[j] = set1[s1].tags[j];
+      for (i = 0; i < num_tags; i++) new_tags[j + i] = tags[i];
+      new_tags[j + i] = -1;
+      new_set[s1].tags = new_tags;
     }
+  }
+
+  for (s2 = 0; set2[s2].position >= 0; s2++) {
+    new_set[s1 + s2].position = set2[s2].position;
+    new_set[s1 + s2].code_min = set2[s2].code_min;
+    new_set[s1 + s2].code_max = set2[s2].code_max;
+    /* XXX - why not | assertions here as well? */
+    new_set[s1 + s2].assertions = set2[s2].assertions;
+    new_set[s1 + s2].class = set2[s2].class;
+    new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
+    new_set[s1 + s2].backref = set2[s2].backref;
+    if (set2[s2].tags == NULL)
+      new_set[s1 + s2].tags = NULL;
+    else {
+      for (i = 0; set2[s2].tags[i] >= 0; i++)
+        ;
+      new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
+      if (new_tags == NULL) return NULL;
+      for (j = 0; j < i; j++) new_tags[j] = set2[s2].tags[j];
+      new_tags[j] = -1;
+      new_set[s1 + s2].tags = new_tags;
+    }
+  }
   new_set[s1 + s2].position = -1;
   return new_set;
 }
@@ -2157,10 +1918,9 @@ tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
    taken according to POSIX.2 rules, and adds the tags on that path to
    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
    set to the number of tags seen on the path. */
-static reg_errcode_t
-tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
-		int *assertions, int *num_tags_seen)
-{
+static reg_errcode_t tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node,
+                                     int *tags, int *assertions,
+                                     int *num_tags_seen) {
   tre_literal_t *lit;
   tre_union_t *uni;
   tre_catenation_t *cat;
@@ -2168,95 +1928,80 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
   int i;
   int bottom = tre_stack_num_objects(stack);
   reg_errcode_t status = REG_OK;
-  if (num_tags_seen)
-    *num_tags_seen = 0;
+  if (num_tags_seen) *num_tags_seen = 0;
 
   status = tre_stack_push_voidptr(stack, node);
 
   /* Walk through the tree recursively. */
-  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
-    {
-      node = tre_stack_pop_voidptr(stack);
+  while (status == REG_OK && tre_stack_num_objects(stack) > bottom) {
+    node = tre_stack_pop_voidptr(stack);
 
-      switch (node->type)
-	{
-	case LITERAL:
-	  lit = (tre_literal_t *)node->obj;
-	  switch (lit->code_min)
-	    {
-	    case TAG:
-	      if (lit->code_max >= 0)
-		{
-		  if (tags != NULL)
-		    {
-		      /* Add the tag to `tags'. */
-		      for (i = 0; tags[i] >= 0; i++)
-			if (tags[i] == lit->code_max)
-			  break;
-		      if (tags[i] < 0)
-			{
-			  tags[i] = lit->code_max;
-			  tags[i + 1] = -1;
-			}
-		    }
-		  if (num_tags_seen)
-		    (*num_tags_seen)++;
-		}
-	      break;
-	    case ASSERTION:
-	      assert(lit->code_max >= 1
-		     || lit->code_max <= ASSERT_LAST);
-	      if (assertions != NULL)
-		*assertions |= lit->code_max;
-	      break;
-	    case EMPTY:
-	      break;
-	    default:
-	      assert(0);
-	      break;
-	    }
-	  break;
+    switch (node->type) {
+      case LITERAL:
+        lit = (tre_literal_t *)node->obj;
+        switch (lit->code_min) {
+          case TAG:
+            if (lit->code_max >= 0) {
+              if (tags != NULL) {
+                /* Add the tag to `tags'. */
+                for (i = 0; tags[i] >= 0; i++)
+                  if (tags[i] == lit->code_max) break;
+                if (tags[i] < 0) {
+                  tags[i] = lit->code_max;
+                  tags[i + 1] = -1;
+                }
+              }
+              if (num_tags_seen) (*num_tags_seen)++;
+            }
+            break;
+          case ASSERTION:
+            unassert(lit->code_max >= 1 || lit->code_max <= ASSERT_LAST);
+            if (assertions != NULL) *assertions |= lit->code_max;
+            break;
+          case EMPTY:
+            break;
+          default:
+            __builtin_unreachable();
+        }
+        break;
 
-	case UNION:
-	  /* Subexpressions starting earlier take priority over ones
-	     starting later, so we prefer the left subexpression over the
-	     right subexpression. */
-	  uni = (tre_union_t *)node->obj;
-	  if (uni->left->nullable)
-	    STACK_PUSHX(stack, voidptr, uni->left)
-	  else if (uni->right->nullable)
-	    STACK_PUSHX(stack, voidptr, uni->right)
-	  else
-	    assert(0);
-	  break;
+      case UNION:
+        /* Subexpressions starting earlier take priority over ones
+           starting later, so we prefer the left subexpression over the
+           right subexpression. */
+        uni = (tre_union_t *)node->obj;
+        if (uni->left->nullable)
+          STACK_PUSHX(stack, voidptr, uni->left)
+        else if (uni->right->nullable)
+          STACK_PUSHX(stack, voidptr, uni->right)
+        else
+          __builtin_unreachable();
+        break;
 
-	case CATENATION:
-	  /* The path must go through both children. */
-	  cat = (tre_catenation_t *)node->obj;
-	  assert(cat->left->nullable);
-	  assert(cat->right->nullable);
-	  STACK_PUSHX(stack, voidptr, cat->left);
-	  STACK_PUSHX(stack, voidptr, cat->right);
-	  break;
+      case CATENATION:
+        /* The path must go through both children. */
+        cat = (tre_catenation_t *)node->obj;
+        unassert(cat->left->nullable);
+        unassert(cat->right->nullable);
+        STACK_PUSHX(stack, voidptr, cat->left);
+        STACK_PUSHX(stack, voidptr, cat->right);
+        break;
 
-	case ITERATION:
-	  /* A match with an empty string is preferred over no match at
-	     all, so we go through the argument if possible. */
-	  iter = (tre_iteration_t *)node->obj;
-	  if (iter->arg->nullable)
-	    STACK_PUSHX(stack, voidptr, iter->arg);
-	  break;
+      case ITERATION:
+        /* A match with an empty string is preferred over no match at
+           all, so we go through the argument if possible. */
+        iter = (tre_iteration_t *)node->obj;
+        if (iter->arg->nullable) STACK_PUSHX(stack, voidptr, iter->arg);
+        break;
 
-	default:
-	  assert(0);
-	  break;
-	}
+      default:
+        __builtin_unreachable();
     }
+  }
 
   return status;
 }
 
-
 typedef enum {
   NFL_RECURSE,
   NFL_POST_UNION,
@@ -2264,263 +2009,211 @@ typedef enum {
   NFL_POST_ITERATION
 } tre_nfl_stack_symbol_t;
 
-
 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
    the nodes of the AST `tree'. */
-static reg_errcode_t
-tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
-{
+static reg_errcode_t tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack,
+                                     tre_ast_node_t *tree) {
   int bottom = tre_stack_num_objects(stack);
 
   STACK_PUSHR(stack, voidptr, tree);
   STACK_PUSHR(stack, int, NFL_RECURSE);
 
-  while (tre_stack_num_objects(stack) > bottom)
-    {
-      tre_nfl_stack_symbol_t symbol;
-      tre_ast_node_t *node;
+  while (tre_stack_num_objects(stack) > bottom) {
+    tre_nfl_stack_symbol_t symbol;
+    tre_ast_node_t *node;
 
-      symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
-      node = tre_stack_pop_voidptr(stack);
-      switch (symbol)
-	{
-	case NFL_RECURSE:
-	  switch (node->type)
-	    {
-	    case LITERAL:
-	      {
-		tre_literal_t *lit = (tre_literal_t *)node->obj;
-		if (IS_BACKREF(lit))
-		  {
-		    /* Back references: nullable = false, firstpos = {i},
-		       lastpos = {i}. */
-		    node->nullable = 0;
-		    node->firstpos = tre_set_one(mem, lit->position, 0,
-					     TRE_CHAR_MAX, 0, NULL, -1);
-		    if (!node->firstpos)
-		      return REG_ESPACE;
-		    node->lastpos = tre_set_one(mem, lit->position, 0,
-						TRE_CHAR_MAX, 0, NULL,
-						(int)lit->code_max);
-		    if (!node->lastpos)
-		      return REG_ESPACE;
-		  }
-		else if (lit->code_min < 0)
-		  {
-		    /* Tags, empty strings, params, and zero width assertions:
-		       nullable = true, firstpos = {}, and lastpos = {}. */
-		    node->nullable = 1;
-		    node->firstpos = tre_set_empty(mem);
-		    if (!node->firstpos)
-		      return REG_ESPACE;
-		    node->lastpos = tre_set_empty(mem);
-		    if (!node->lastpos)
-		      return REG_ESPACE;
-		  }
-		else
-		  {
-		    /* Literal at position i: nullable = false, firstpos = {i},
-		       lastpos = {i}. */
-		    node->nullable = 0;
-		    node->firstpos =
-		      tre_set_one(mem, lit->position, (int)lit->code_min,
-				  (int)lit->code_max, 0, NULL, -1);
-		    if (!node->firstpos)
-		      return REG_ESPACE;
-		    node->lastpos = tre_set_one(mem, lit->position,
-						(int)lit->code_min,
-						(int)lit->code_max,
-						lit->class, lit->neg_classes,
-						-1);
-		    if (!node->lastpos)
-		      return REG_ESPACE;
-		  }
-		break;
-	      }
+    symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
+    node = tre_stack_pop_voidptr(stack);
+    switch (symbol) {
+      case NFL_RECURSE:
+        switch (node->type) {
+          case LITERAL: {
+            tre_literal_t *lit = (tre_literal_t *)node->obj;
+            if (IS_BACKREF(lit)) {
+              /* Back references: nullable = false, firstpos = {i},
+                 lastpos = {i}. */
+              node->nullable = 0;
+              node->firstpos =
+                  tre_set_one(mem, lit->position, 0, TRE_CHAR_MAX, 0, NULL, -1);
+              if (!node->firstpos) return REG_ESPACE;
+              node->lastpos = tre_set_one(mem, lit->position, 0, TRE_CHAR_MAX,
+                                          0, NULL, (int)lit->code_max);
+              if (!node->lastpos) return REG_ESPACE;
+            } else if (lit->code_min < 0) {
+              /* Tags, empty strings, params, and zero width assertions:
+                 nullable = true, firstpos = {}, and lastpos = {}. */
+              node->nullable = 1;
+              node->firstpos = tre_set_empty(mem);
+              if (!node->firstpos) return REG_ESPACE;
+              node->lastpos = tre_set_empty(mem);
+              if (!node->lastpos) return REG_ESPACE;
+            } else {
+              /* Literal at position i: nullable = false, firstpos = {i},
+                 lastpos = {i}. */
+              node->nullable = 0;
+              node->firstpos =
+                  tre_set_one(mem, lit->position, (int)lit->code_min,
+                              (int)lit->code_max, 0, NULL, -1);
+              if (!node->firstpos) return REG_ESPACE;
+              node->lastpos = tre_set_one(
+                  mem, lit->position, (int)lit->code_min, (int)lit->code_max,
+                  lit->class, lit->neg_classes, -1);
+              if (!node->lastpos) return REG_ESPACE;
+            }
+            break;
+          }
 
-	    case UNION:
-	      /* Compute the attributes for the two subtrees, and after that
-		 for this node. */
-	      STACK_PUSHR(stack, voidptr, node);
-	      STACK_PUSHR(stack, int, NFL_POST_UNION);
-	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
-	      STACK_PUSHR(stack, int, NFL_RECURSE);
-	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
-	      STACK_PUSHR(stack, int, NFL_RECURSE);
-	      break;
+          case UNION:
+            /* Compute the attributes for the two subtrees, and after that
+               for this node. */
+            STACK_PUSHR(stack, voidptr, node);
+            STACK_PUSHR(stack, int, NFL_POST_UNION);
+            STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
+            STACK_PUSHR(stack, int, NFL_RECURSE);
+            STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
+            STACK_PUSHR(stack, int, NFL_RECURSE);
+            break;
 
-	    case CATENATION:
-	      /* Compute the attributes for the two subtrees, and after that
-		 for this node. */
-	      STACK_PUSHR(stack, voidptr, node);
-	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
-	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
-	      STACK_PUSHR(stack, int, NFL_RECURSE);
-	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
-	      STACK_PUSHR(stack, int, NFL_RECURSE);
-	      break;
+          case CATENATION:
+            /* Compute the attributes for the two subtrees, and after that
+               for this node. */
+            STACK_PUSHR(stack, voidptr, node);
+            STACK_PUSHR(stack, int, NFL_POST_CATENATION);
+            STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
+            STACK_PUSHR(stack, int, NFL_RECURSE);
+            STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
+            STACK_PUSHR(stack, int, NFL_RECURSE);
+            break;
 
-	    case ITERATION:
-	      /* Compute the attributes for the subtree, and after that for
-		 this node. */
-	      STACK_PUSHR(stack, voidptr, node);
-	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
-	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
-	      STACK_PUSHR(stack, int, NFL_RECURSE);
-	      break;
-	    }
-	  break; /* end case: NFL_RECURSE */
+          case ITERATION:
+            /* Compute the attributes for the subtree, and after that for
+               this node. */
+            STACK_PUSHR(stack, voidptr, node);
+            STACK_PUSHR(stack, int, NFL_POST_ITERATION);
+            STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
+            STACK_PUSHR(stack, int, NFL_RECURSE);
+            break;
+        }
+        break; /* end case: NFL_RECURSE */
 
-	case NFL_POST_UNION:
-	  {
-	    tre_union_t *uni = (tre_union_t *)node->obj;
-	    node->nullable = uni->left->nullable || uni->right->nullable;
-	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
-					   uni->right->firstpos, NULL, 0);
-	    if (!node->firstpos)
-	      return REG_ESPACE;
-	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
-					  uni->right->lastpos, NULL, 0);
-	    if (!node->lastpos)
-	      return REG_ESPACE;
-	    break;
-	  }
+      case NFL_POST_UNION: {
+        tre_union_t *uni = (tre_union_t *)node->obj;
+        node->nullable = uni->left->nullable || uni->right->nullable;
+        node->firstpos = tre_set_union(mem, uni->left->firstpos,
+                                       uni->right->firstpos, NULL, 0);
+        if (!node->firstpos) return REG_ESPACE;
+        node->lastpos = tre_set_union(mem, uni->left->lastpos,
+                                      uni->right->lastpos, NULL, 0);
+        if (!node->lastpos) return REG_ESPACE;
+        break;
+      }
 
-	case NFL_POST_ITERATION:
-	  {
-	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
+      case NFL_POST_ITERATION: {
+        tre_iteration_t *iter = (tre_iteration_t *)node->obj;
 
-	    if (iter->min == 0 || iter->arg->nullable)
-	      node->nullable = 1;
-	    else
-	      node->nullable = 0;
-	    node->firstpos = iter->arg->firstpos;
-	    node->lastpos = iter->arg->lastpos;
-	    break;
-	  }
+        if (iter->min == 0 || iter->arg->nullable)
+          node->nullable = 1;
+        else
+          node->nullable = 0;
+        node->firstpos = iter->arg->firstpos;
+        node->lastpos = iter->arg->lastpos;
+        break;
+      }
 
-	case NFL_POST_CATENATION:
-	  {
-	    int num_tags, *tags, assertions;
-	    reg_errcode_t status;
-	    tre_catenation_t *cat = node->obj;
-	    node->nullable = cat->left->nullable && cat->right->nullable;
+      case NFL_POST_CATENATION: {
+        int num_tags, *tags, assertions;
+        reg_errcode_t status;
+        tre_catenation_t *cat = node->obj;
+        node->nullable = cat->left->nullable && cat->right->nullable;
 
-	    /* Compute firstpos. */
-	    if (cat->left->nullable)
-	      {
-		/* The left side matches the empty string.  Make a first pass
-		   with tre_match_empty() to get the number of tags and
-		   parameters. */
-		status = tre_match_empty(stack, cat->left,
-					 NULL, NULL, &num_tags);
-		if (status != REG_OK)
-		  return status;
-		/* Allocate arrays for the tags and parameters. */
-		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
-		if (!tags)
-		  return REG_ESPACE;
-		tags[0] = -1;
-		assertions = 0;
-		/* Second pass with tre_mach_empty() to get the list of
-		   tags and parameters. */
-		status = tre_match_empty(stack, cat->left, tags,
-					 &assertions, NULL);
-		if (status != REG_OK)
-		  {
-		    xfree(tags);
-		    return status;
-		  }
-		node->firstpos =
-		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
-				tags, assertions);
-		xfree(tags);
-		if (!node->firstpos)
-		  return REG_ESPACE;
-	      }
-	    else
-	      {
-		node->firstpos = cat->left->firstpos;
-	      }
+        /* Compute firstpos. */
+        if (cat->left->nullable) {
+          /* The left side matches the empty string.  Make a first pass
+             with tre_match_empty() to get the number of tags and
+             parameters. */
+          status = tre_match_empty(stack, cat->left, NULL, NULL, &num_tags);
+          if (status != REG_OK) return status;
+          /* Allocate arrays for the tags and parameters. */
+          tags = malloc(sizeof(*tags) * (num_tags + 1));
+          if (!tags) return REG_ESPACE;
+          tags[0] = -1;
+          assertions = 0;
+          /* Second pass with tre_mach_empty() to get the list of
+             tags and parameters. */
+          status = tre_match_empty(stack, cat->left, tags, &assertions, NULL);
+          if (status != REG_OK) {
+            free(tags), tags = NULL;
+            return status;
+          }
+          node->firstpos = tre_set_union(mem, cat->right->firstpos,
+                                         cat->left->firstpos, tags, assertions);
+          free(tags), tags = NULL;
+          if (!node->firstpos) return REG_ESPACE;
+        } else {
+          node->firstpos = cat->left->firstpos;
+        }
 
-	    /* Compute lastpos. */
-	    if (cat->right->nullable)
-	      {
-		/* The right side matches the empty string.  Make a first pass
-		   with tre_match_empty() to get the number of tags and
-		   parameters. */
-		status = tre_match_empty(stack, cat->right,
-					 NULL, NULL, &num_tags);
-		if (status != REG_OK)
-		  return status;
-		/* Allocate arrays for the tags and parameters. */
-		tags = xmalloc(sizeof(int) * (num_tags + 1));
-		if (!tags)
-		  return REG_ESPACE;
-		tags[0] = -1;
-		assertions = 0;
-		/* Second pass with tre_mach_empty() to get the list of
-		   tags and parameters. */
-		status = tre_match_empty(stack, cat->right, tags,
-					 &assertions, NULL);
-		if (status != REG_OK)
-		  {
-		    xfree(tags);
-		    return status;
-		  }
-		node->lastpos =
-		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
-				tags, assertions);
-		xfree(tags);
-		if (!node->lastpos)
-		  return REG_ESPACE;
-	      }
-	    else
-	      {
-		node->lastpos = cat->right->lastpos;
-	      }
-	    break;
-	  }
+        /* Compute lastpos. */
+        if (cat->right->nullable) {
+          /* The right side matches the empty string.  Make a first pass
+             with tre_match_empty() to get the number of tags and
+             parameters. */
+          status = tre_match_empty(stack, cat->right, NULL, NULL, &num_tags);
+          if (status != REG_OK) return status;
+          /* Allocate arrays for the tags and parameters. */
+          tags = malloc(sizeof(int) * (num_tags + 1));
+          if (!tags) return REG_ESPACE;
+          tags[0] = -1;
+          assertions = 0;
+          /* Second pass with tre_mach_empty() to get the list of
+             tags and parameters. */
+          status = tre_match_empty(stack, cat->right, tags, &assertions, NULL);
+          if (status != REG_OK) {
+            free(tags), tags = NULL;
+            return status;
+          }
+          node->lastpos = tre_set_union(mem, cat->left->lastpos,
+                                        cat->right->lastpos, tags, assertions);
+          free(tags), tags = NULL;
+          if (!node->lastpos) return REG_ESPACE;
+        } else {
+          node->lastpos = cat->right->lastpos;
+        }
+        break;
+      }
 
-	default:
-	  assert(0);
-	  break;
-	}
+      default:
+        __builtin_unreachable();
     }
+  }
 
   return REG_OK;
 }
 
-
 /* Adds a transition from each position in `p1' to each position in `p2'. */
-static reg_errcode_t
-tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
-	       tre_tnfa_transition_t *transitions,
-	       int *counts, int *offs)
-{
+static reg_errcode_t tre_make_trans(tre_pos_and_tags_t *p1,
+                                    tre_pos_and_tags_t *p2,
+                                    tre_tnfa_transition_t *transitions,
+                                    int *counts, int *offs) {
   tre_pos_and_tags_t *orig_p2 = p2;
   tre_tnfa_transition_t *trans;
   int i, j, k, l, dup, prev_p2_pos;
 
   if (transitions != NULL)
-    while (p1->position >= 0)
-      {
-	p2 = orig_p2;
-	prev_p2_pos = -1;
-	while (p2->position >= 0)
-	  {
-	    /* Optimization: if this position was already handled, skip it. */
-	    if (p2->position == prev_p2_pos)
-	      {
-		p2++;
-		continue;
-	      }
-	    prev_p2_pos = p2->position;
-	    /* Set `trans' to point to the next unused transition from
-	       position `p1->position'. */
-	    trans = transitions + offs[p1->position];
-	    while (trans->state != NULL)
-	      {
+    while (p1->position >= 0) {
+      p2 = orig_p2;
+      prev_p2_pos = -1;
+      while (p2->position >= 0) {
+        /* Optimization: if this position was already handled, skip it. */
+        if (p2->position == prev_p2_pos) {
+          p2++;
+          continue;
+        }
+        prev_p2_pos = p2->position;
+        /* Set `trans' to point to the next unused transition from
+           position `p1->position'. */
+        trans = transitions + offs[p1->position];
+        while (trans->state != NULL) {
 #if 0
 		/* If we find a previous transition from `p1->position' to
 		   `p2->position', it is overwritten.  This can happen only
@@ -2538,108 +2231,92 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
 		    break;
 		  }
 #endif
-		trans++;
-	      }
+          trans++;
+        }
 
-	    if (trans->state == NULL)
-	      (trans + 1)->state = NULL;
-	    /* Use the character ranges, assertions, etc. from `p1' for
-	       the transition from `p1' to `p2'. */
-	    trans->code_min = p1->code_min;
-	    trans->code_max = p1->code_max;
-	    trans->state = transitions + offs[p2->position];
-	    trans->state_id = p2->position;
-	    trans->assertions = p1->assertions | p2->assertions
-	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
-	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
-	    if (p1->backref >= 0)
-	      {
-		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
-		assert(p2->backref < 0);
-		trans->u.backref = p1->backref;
-		trans->assertions |= ASSERT_BACKREF;
-	      }
-	    else
-	      trans->u.class = p1->class;
-	    if (p1->neg_classes != NULL)
-	      {
-		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
-		trans->neg_classes =
-		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
-		if (trans->neg_classes == NULL)
-		  return REG_ESPACE;
-		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
-		  trans->neg_classes[i] = p1->neg_classes[i];
-		trans->neg_classes[i] = (tre_ctype_t)0;
-	      }
-	    else
-	      trans->neg_classes = NULL;
+        if (trans->state == NULL) (trans + 1)->state = NULL;
+        /* Use the character ranges, assertions, etc. from `p1' for
+           the transition from `p1' to `p2'. */
+        trans->code_min = p1->code_min;
+        trans->code_max = p1->code_max;
+        trans->state = transitions + offs[p2->position];
+        trans->state_id = p2->position;
+        trans->assertions =
+            p1->assertions | p2->assertions |
+            (p1->class ? ASSERT_CHAR_CLASS : 0) |
+            (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
+        if (p1->backref >= 0) {
+          unassert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
+          unassert(p2->backref < 0);
+          trans->u.backref = p1->backref;
+          trans->assertions |= ASSERT_BACKREF;
+        } else
+          trans->u.class = p1->class;
+        if (p1->neg_classes != NULL) {
+          for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
+            ;
+          trans->neg_classes = malloc(sizeof(*trans->neg_classes) * (i + 1));
+          if (trans->neg_classes == NULL) return REG_ESPACE;
+          for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
+            trans->neg_classes[i] = p1->neg_classes[i];
+          trans->neg_classes[i] = (tre_ctype_t)0;
+        } else
+          trans->neg_classes = NULL;
 
-	    /* Find out how many tags this transition has. */
-	    i = 0;
-	    if (p1->tags != NULL)
-	      while(p1->tags[i] >= 0)
-		i++;
-	    j = 0;
-	    if (p2->tags != NULL)
-	      while(p2->tags[j] >= 0)
-		j++;
+        /* Find out how many tags this transition has. */
+        i = 0;
+        if (p1->tags != NULL)
+          while (p1->tags[i] >= 0) i++;
+        j = 0;
+        if (p2->tags != NULL)
+          while (p2->tags[j] >= 0) j++;
 
-	    /* If we are overwriting a transition, free the old tag array. */
-	    if (trans->tags != NULL)
-	      xfree(trans->tags);
-	    trans->tags = NULL;
+        /* If we are overwriting a transition, free the old tag array. */
+        if (trans->tags != NULL) free(trans->tags), trans->tags = NULL;
+        trans->tags = NULL;
 
-	    /* If there were any tags, allocate an array and fill it. */
-	    if (i + j > 0)
-	      {
-		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
-		if (!trans->tags)
-		  return REG_ESPACE;
-		i = 0;
-		if (p1->tags != NULL)
-		  while(p1->tags[i] >= 0)
-		    {
-		      trans->tags[i] = p1->tags[i];
-		      i++;
-		    }
-		l = i;
-		j = 0;
-		if (p2->tags != NULL)
-		  while (p2->tags[j] >= 0)
-		    {
-		      /* Don't add duplicates. */
-		      dup = 0;
-		      for (k = 0; k < i; k++)
-			if (trans->tags[k] == p2->tags[j])
-			  {
-			    dup = 1;
-			    break;
-			  }
-		      if (!dup)
-			trans->tags[l++] = p2->tags[j];
-		      j++;
-		    }
-		trans->tags[l] = -1;
-	      }
+        /* If there were any tags, allocate an array and fill it. */
+        if (i + j > 0) {
+          trans->tags = malloc(sizeof(*trans->tags) * (i + j + 1));
+          if (!trans->tags) return REG_ESPACE;
+          i = 0;
+          if (p1->tags != NULL)
+            while (p1->tags[i] >= 0) {
+              trans->tags[i] = p1->tags[i];
+              i++;
+            }
+          l = i;
+          j = 0;
+          if (p2->tags != NULL)
+            while (p2->tags[j] >= 0) {
+              /* Don't add duplicates. */
+              dup = 0;
+              for (k = 0; k < i; k++)
+                if (trans->tags[k] == p2->tags[j]) {
+                  dup = 1;
+                  break;
+                }
+              if (!dup) trans->tags[l++] = p2->tags[j];
+              j++;
+            }
+          trans->tags[l] = -1;
+        }
 
-	    p2++;
-	  }
-	p1++;
+        p2++;
       }
+      p1++;
+    }
   else
     /* Compute a maximum limit for the number of transitions leaving
        from each state. */
-    while (p1->position >= 0)
-      {
-	p2 = orig_p2;
-	while (p2->position >= 0)
-	  {
-	    counts[p1->position]++;
-	    p2++;
-	  }
-	p1++;
+    while (p1->position >= 0) {
+      p2 = orig_p2;
+      while (p2->position >= 0) {
+        counts[p1->position]++;
+        p2++;
       }
+      p1++;
+    }
   return REG_OK;
 }
 
@@ -2647,72 +2324,60 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
    labelled with one character range (there are no transitions on empty
    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
    the regexp. */
-static reg_errcode_t
-tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
-		int *counts, int *offs)
-{
+static reg_errcode_t tre_ast_to_tnfa(tre_ast_node_t *node,
+                                     tre_tnfa_transition_t *transitions,
+                                     int *counts, int *offs) {
   tre_union_t *uni;
   tre_catenation_t *cat;
   tre_iteration_t *iter;
   reg_errcode_t errcode = REG_OK;
 
   /* XXX - recurse using a stack!. */
-  switch (node->type)
-    {
+  switch (node->type) {
     case LITERAL:
       break;
     case UNION:
       uni = (tre_union_t *)node->obj;
       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
-      if (errcode != REG_OK)
-	return errcode;
+      if (errcode != REG_OK) return errcode;
       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
       break;
 
     case CATENATION:
       cat = (tre_catenation_t *)node->obj;
       /* Add a transition from each position in cat->left->lastpos
-	 to each position in cat->right->firstpos. */
+         to each position in cat->right->firstpos. */
       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
-			       transitions, counts, offs);
-      if (errcode != REG_OK)
-	return errcode;
+                               transitions, counts, offs);
+      if (errcode != REG_OK) return errcode;
       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
-      if (errcode != REG_OK)
-	return errcode;
+      if (errcode != REG_OK) return errcode;
       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
       break;
 
     case ITERATION:
       iter = (tre_iteration_t *)node->obj;
-      assert(iter->max == -1 || iter->max == 1);
+      unassert(iter->max == -1 || iter->max == 1);
 
-      if (iter->max == -1)
-	{
-	  assert(iter->min == 0 || iter->min == 1);
-	  /* Add a transition from each last position in the iterated
-	     expression to each first position. */
-	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
-				   transitions, counts, offs);
-	  if (errcode != REG_OK)
-	    return errcode;
-	}
+      if (iter->max == -1) {
+        unassert(iter->min == 0 || iter->min == 1);
+        /* Add a transition from each last position in the iterated
+           expression to each first position. */
+        errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
+                                 transitions, counts, offs);
+        if (errcode != REG_OK) return errcode;
+      }
       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
       break;
-    }
+  }
   return errcode;
 }
 
-
-#define ERROR_EXIT(err)		  \
-  do				  \
-    {				  \
-      errcode = err;		  \
-      if (/*CONSTCOND*/1)	  \
-      	goto error_exit;	  \
-    }				  \
- while (/*CONSTCOND*/0)
-
+#define ERROR_EXIT(err)                   \
+  do {                                    \
+    errcode = err;                        \
+    if (/*CONSTCOND*/ 1) goto error_exit; \
+  } while (/*CONSTCOND*/ 0)
 
 /**
  * Compiles regular expression, e.g.
@@ -2731,9 +2396,7 @@ tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
  * @return REG_OK, REG_NOMATCH, REG_BADPAT, etc.
  * @see regexec(), regfree(), regerror()
  */
-int
-regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
-{
+int regcomp(regex_t *preg, const char *regex, int cflags) {
   tre_stack_t *stack;
   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
   tre_pos_and_tags_t *p;
@@ -2752,26 +2415,23 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
   /* Allocate a stack used throughout the compilation process for various
      purposes. */
   stack = tre_stack_new(512, 1024000, 128);
-  if (!stack)
-    return REG_ESPACE;
+  if (!stack) return REG_ESPACE;
   /* Allocate a fast memory allocator. */
   mem = tre_mem_new();
-  if (!mem)
-    {
-      tre_stack_destroy(stack);
-      return REG_ESPACE;
-    }
+  if (!mem) {
+    tre_stack_destroy(stack);
+    return REG_ESPACE;
+  }
 
   /* Parse the regexp. */
-  memset(&parse_ctx, 0, sizeof(parse_ctx));
+  bzero(&parse_ctx, sizeof(parse_ctx));
   parse_ctx.mem = mem;
   parse_ctx.stack = stack;
   parse_ctx.start = regex;
   parse_ctx.cflags = cflags;
   parse_ctx.max_backref = -1;
   errcode = tre_parse(&parse_ctx);
-  if (errcode != REG_OK)
-    ERROR_EXIT(errcode);
+  if (errcode != REG_OK) ERROR_EXIT(errcode);
   preg->re_nsub = parse_ctx.submatch_id - 1;
   tree = parse_ctx.n;
 
@@ -2780,141 +2440,115 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
 #endif /* TRE_DEBUG */
 
   /* Referring to nonexistent subexpressions is illegal. */
-  if (parse_ctx.max_backref > (int)preg->re_nsub)
-    ERROR_EXIT(REG_ESUBREG);
+  if (parse_ctx.max_backref > (int)preg->re_nsub) ERROR_EXIT(REG_ESUBREG);
 
   /* Allocate the TNFA struct. */
-  tnfa = xcalloc(1, sizeof(tre_tnfa_t));
-  if (tnfa == NULL)
-    ERROR_EXIT(REG_ESPACE);
+  tnfa = calloc(1, sizeof(tre_tnfa_t));
+  if (tnfa == NULL) ERROR_EXIT(REG_ESPACE);
   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
   tnfa->have_approx = 0;
   tnfa->num_submatches = parse_ctx.submatch_id;
 
   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
      regexp does not have back references, this can be skipped. */
-  if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
-    {
-
-      /* Figure out how many tags we will need. */
-      errcode = tre_add_tags(NULL, stack, tree, tnfa);
-      if (errcode != REG_OK)
-	ERROR_EXIT(errcode);
-
-      if (tnfa->num_tags > 0)
-	{
-	  tag_directions = xmalloc(sizeof(*tag_directions)
-				   * (tnfa->num_tags + 1));
-	  if (tag_directions == NULL)
-	    ERROR_EXIT(REG_ESPACE);
-	  tnfa->tag_directions = tag_directions;
-	  memset(tag_directions, -1,
-		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
-	}
-      tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
-				   sizeof(*tnfa->minimal_tags));
-      if (tnfa->minimal_tags == NULL)
-	ERROR_EXIT(REG_ESPACE);
-
-      submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
-			      sizeof(*submatch_data));
-      if (submatch_data == NULL)
-	ERROR_EXIT(REG_ESPACE);
-      tnfa->submatch_data = submatch_data;
-
-      errcode = tre_add_tags(mem, stack, tree, tnfa);
-      if (errcode != REG_OK)
-	ERROR_EXIT(errcode);
+  if (tnfa->have_backrefs || !(cflags & REG_NOSUB)) {
+    /* Figure out how many tags we will need. */
+    errcode = tre_add_tags(NULL, stack, tree, tnfa);
+    if (errcode != REG_OK) ERROR_EXIT(errcode);
 
+    if (tnfa->num_tags > 0) {
+      tag_directions = malloc(sizeof(*tag_directions) * (tnfa->num_tags + 1));
+      if (tag_directions == NULL) ERROR_EXIT(REG_ESPACE);
+      tnfa->tag_directions = tag_directions;
+      memset(tag_directions, -1,
+             sizeof(*tag_directions) * (tnfa->num_tags + 1));
     }
+    tnfa->minimal_tags =
+        calloc((unsigned)tnfa->num_tags * 2 + 1, sizeof(*tnfa->minimal_tags));
+    if (tnfa->minimal_tags == NULL) ERROR_EXIT(REG_ESPACE);
+
+    submatch_data =
+        calloc((unsigned)parse_ctx.submatch_id, sizeof(*submatch_data));
+    if (submatch_data == NULL) ERROR_EXIT(REG_ESPACE);
+    tnfa->submatch_data = submatch_data;
+
+    errcode = tre_add_tags(mem, stack, tree, tnfa);
+    if (errcode != REG_OK) ERROR_EXIT(errcode);
+  }
 
   /* Expand iteration nodes. */
-  errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
-			   tag_directions);
-  if (errcode != REG_OK)
-    ERROR_EXIT(errcode);
+  errcode =
+      tre_expand_ast(mem, stack, tree, &parse_ctx.position, tag_directions);
+  if (errcode != REG_OK) ERROR_EXIT(errcode);
 
   /* Add a dummy node for the final state.
      XXX - For certain patterns this dummy node can be optimized away,
-	   for example "a*" or "ab*".	Figure out a simple way to detect
-	   this possibility. */
+           for example "a*" or "ab*".	Figure out a simple way to detect
+           this possibility. */
   tmp_ast_l = tree;
   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
-  if (tmp_ast_r == NULL)
-    ERROR_EXIT(REG_ESPACE);
+  if (tmp_ast_r == NULL) ERROR_EXIT(REG_ESPACE);
 
   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
-  if (tree == NULL)
-    ERROR_EXIT(REG_ESPACE);
+  if (tree == NULL) ERROR_EXIT(REG_ESPACE);
 
   errcode = tre_compute_nfl(mem, stack, tree);
-  if (errcode != REG_OK)
-    ERROR_EXIT(errcode);
+  if (errcode != REG_OK) ERROR_EXIT(errcode);
 
-  counts = xmalloc(sizeof(int) * parse_ctx.position);
-  if (counts == NULL)
-    ERROR_EXIT(REG_ESPACE);
+  counts = malloc(sizeof(int) * parse_ctx.position);
+  if (counts == NULL) ERROR_EXIT(REG_ESPACE);
 
-  offs = xmalloc(sizeof(int) * parse_ctx.position);
-  if (offs == NULL)
-    ERROR_EXIT(REG_ESPACE);
+  offs = malloc(sizeof(int) * parse_ctx.position);
+  if (offs == NULL) ERROR_EXIT(REG_ESPACE);
 
-  for (i = 0; i < parse_ctx.position; i++)
-    counts[i] = 0;
+  for (i = 0; i < parse_ctx.position; i++) counts[i] = 0;
   tre_ast_to_tnfa(tree, NULL, counts, NULL);
 
   add = 0;
-  for (i = 0; i < parse_ctx.position; i++)
-    {
-      offs[i] = add;
-      add += counts[i] + 1;
-      counts[i] = 0;
-    }
-  transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
-  if (transitions == NULL)
-    ERROR_EXIT(REG_ESPACE);
+  for (i = 0; i < parse_ctx.position; i++) {
+    offs[i] = add;
+    add += counts[i] + 1;
+    counts[i] = 0;
+  }
+  transitions = calloc((unsigned)add + 1, sizeof(*transitions));
+  if (transitions == NULL) ERROR_EXIT(REG_ESPACE);
   tnfa->transitions = transitions;
   tnfa->num_transitions = add;
 
   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
-  if (errcode != REG_OK)
-    ERROR_EXIT(errcode);
+  if (errcode != REG_OK) ERROR_EXIT(errcode);
 
   tnfa->firstpos_chars = NULL;
 
   p = tree->firstpos;
   i = 0;
-  while (p->position >= 0)
-    {
-      i++;
-      p++;
-    }
+  while (p->position >= 0) {
+    i++;
+    p++;
+  }
 
-  initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
-  if (initial == NULL)
-    ERROR_EXIT(REG_ESPACE);
+  initial = calloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
+  if (initial == NULL) ERROR_EXIT(REG_ESPACE);
   tnfa->initial = initial;
 
   i = 0;
-  for (p = tree->firstpos; p->position >= 0; p++)
-    {
-      initial[i].state = transitions + offs[p->position];
-      initial[i].state_id = p->position;
-      initial[i].tags = NULL;
-      /* Copy the arrays p->tags, and p->params, they are allocated
-	 from a tre_mem object. */
-      if (p->tags)
-	{
-	  int j;
-	  for (j = 0; p->tags[j] >= 0; j++);
-	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
-	  if (!initial[i].tags)
-	    ERROR_EXIT(REG_ESPACE);
-	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
-	}
-      initial[i].assertions = p->assertions;
-      i++;
+  for (p = tree->firstpos; p->position >= 0; p++) {
+    initial[i].state = transitions + offs[p->position];
+    initial[i].state_id = p->position;
+    initial[i].tags = NULL;
+    /* Copy the arrays p->tags, and p->params, they are allocated
+       from a tre_mem object. */
+    if (p->tags) {
+      int j;
+      for (j = 0; p->tags[j] >= 0; j++)
+        ;
+      initial[i].tags = malloc(sizeof(*p->tags) * (j + 1));
+      if (!initial[i].tags) ERROR_EXIT(REG_ESPACE);
+      memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
     }
+    initial[i].assertions = p->assertions;
+    i++;
+  }
   initial[i].state = NULL;
 
   tnfa->num_transitions = add;
@@ -2924,29 +2558,23 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
 
   tre_mem_destroy(mem);
   tre_stack_destroy(stack);
-  xfree(counts);
-  xfree(offs);
+  free(counts), counts = NULL;
+  free(offs), offs = NULL;
 
   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
   return REG_OK;
 
- error_exit:
+error_exit:
   /* Free everything that was allocated and return the error code. */
   tre_mem_destroy(mem);
-  if (stack != NULL)
-    tre_stack_destroy(stack);
-  if (counts != NULL)
-    xfree(counts);
-  if (offs != NULL)
-    xfree(offs);
+  if (stack != NULL) tre_stack_destroy(stack);
+  if (counts != NULL) free(counts), counts = NULL;
+  if (offs != NULL) free(offs), offs = NULL;
   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
   regfree(preg);
   return errcode;
 }
 
-
-
-
 /**
  * Frees any memory allocated by regcomp().
  *
@@ -2954,51 +2582,43 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
  * which case subsequent calls do nothing. Once a regex is freed, it may
  * be passed to regcomp() to reinitialize it.
  */
-void
-regfree(regex_t *preg)
-{
-  tre_tnfa_t *tnfa;
+void regfree(regex_t *preg) {
   unsigned int i;
+  tre_tnfa_t *tnfa;
   tre_tnfa_transition_t *trans;
-
-  tnfa = (void *)preg->TRE_REGEX_T_FIELD;
-  if (!tnfa)
-    return;
-
-  for (i = 0; i < tnfa->num_transitions; i++)
-    if (tnfa->transitions[i].state)
-      {
-	if (tnfa->transitions[i].tags)
-	  xfree(tnfa->transitions[i].tags);
-	if (tnfa->transitions[i].neg_classes)
-	  xfree(tnfa->transitions[i].neg_classes);
+  if ((tnfa = preg->TRE_REGEX_T_FIELD)) {
+    preg->TRE_REGEX_T_FIELD = 0;
+    for (i = 0; i < tnfa->num_transitions; i++)
+      if (tnfa->transitions[i].state) {
+        if (tnfa->transitions[i].tags) {
+          free(tnfa->transitions[i].tags);
+        }
+        if (tnfa->transitions[i].neg_classes) {
+          free(tnfa->transitions[i].neg_classes);
+        }
       }
-  if (tnfa->transitions)
-    xfree(tnfa->transitions);
-
-  if (tnfa->initial)
-    {
-      for (trans = tnfa->initial; trans->state; trans++)
-	{
-	  if (trans->tags)
-	    xfree(trans->tags);
-	}
-      xfree(tnfa->initial);
+    if (tnfa->transitions) {
+      free(tnfa->transitions);
     }
-
-  if (tnfa->submatch_data)
-    {
-      for (i = 0; i < tnfa->num_submatches; i++)
-	if (tnfa->submatch_data[i].parents)
-	  xfree(tnfa->submatch_data[i].parents);
-      xfree(tnfa->submatch_data);
+    if (tnfa->initial) {
+      for (trans = tnfa->initial; trans->state; trans++) {
+        if (trans->tags) {
+          free(trans->tags);
+        }
+      }
+      free(tnfa->initial);
     }
-
-  if (tnfa->tag_directions)
-    xfree(tnfa->tag_directions);
-  if (tnfa->firstpos_chars)
-    xfree(tnfa->firstpos_chars);
-  if (tnfa->minimal_tags)
-    xfree(tnfa->minimal_tags);
-  xfree(tnfa);
+    if (tnfa->submatch_data) {
+      for (i = 0; i < tnfa->num_submatches; i++) {
+        if (tnfa->submatch_data[i].parents) {
+          free(tnfa->submatch_data[i].parents);
+        }
+      }
+      free(tnfa->submatch_data);
+    }
+    if (tnfa->tag_directions) free(tnfa->tag_directions);
+    if (tnfa->firstpos_chars) free(tnfa->firstpos_chars);
+    if (tnfa->minimal_tags) free(tnfa->minimal_tags);
+    free(tnfa);
+  }
 }
diff --git a/third_party/regex/regerror.c b/third_party/regex/regerror.c
index dbd4b763d..b922b9b59 100644
--- a/third_party/regex/regerror.c
+++ b/third_party/regex/regerror.c
@@ -1,37 +1,65 @@
-#include <string.h>
-#include <regex.h>
-#include <stdio.h>
-#include "libc/str/locale.internal.h"
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/safemacros.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "third_party/regex/regex.h"
 
 /* Error message strings for error codes listed in `regex.h'.  This list
    needs to be in sync with the codes listed there, naturally. */
 
-/* Converted to single string by Rich Felker to remove the need for
- * data relocations at runtime, 27 Feb 2006. */
+static const char kRegexErrors[] =
+    "No error\0"
+    "No match\0"
+    "Invalid regexp\0"
+    "Unknown collating element\0"
+    "Unknown character class name\0"
+    "Trailing backslash\0"
+    "Invalid back reference\0"
+    "Missing ']'\0"
+    "Missing ')'\0"
+    "Missing '}'\0"
+    "Invalid contents of {}\0"
+    "Invalid character range\0"
+    "Out of memory\0"
+    "Repetition not preceded by valid expression\0";
 
-static const char messages[] = {
-  "No error\0"
-  "No match\0"
-  "Invalid regexp\0"
-  "Unknown collating element\0"
-  "Unknown character class name\0"
-  "Trailing backslash\0"
-  "Invalid back reference\0"
-  "Missing ']'\0"
-  "Missing ')'\0"
-  "Missing '}'\0"
-  "Invalid contents of {}\0"
-  "Invalid character range\0"
-  "Out of memory\0"
-  "Repetition not preceded by valid expression\0"
-  "\0Unknown error"
-};
-
-size_t regerror(int e, const regex_t *restrict preg, char *restrict buf, size_t size)
-{
-	const char *s;
-	for (s=messages; e && *s; e--, s+=strlen(s)+1);
-	if (!*s) s++;
-	s = LCTRANS_CUR(s);
-	return 1+snprintf(buf, size, "%s", s);
+static const char *IndexDoubleNulString(const char *s, unsigned i) {
+  size_t n;
+  while (i--) {
+    if ((n = strlen(s))) {
+      s += n + 1;
+    } else {
+      return NULL;
+    }
+  }
+  return s;
+}
+
+/**
+ * Converts regular expression error code to string.
+ *
+ * @param e is error code
+ * @return number of bytes needed to hold entire string
+ */
+size_t regerror(int e, const regex_t *preg, char *buf, size_t size) {
+  return 1 + (snprintf)(buf, size, "%s",
+                        firstnonnull(IndexDoubleNulString(kRegexErrors, e),
+                                     "Unknown error"));
 }
diff --git a/third_party/regex/regexec.c b/third_party/regex/regexec.c
index d99696d8a..fd4b4446f 100644
--- a/third_party/regex/regexec.c
+++ b/third_party/regex/regexec.c
@@ -65,99 +65,86 @@ TRE regex (BSD-2 License)\n\
 Copyright 2001-2009 Ville Laurikari <vl@iki.fi>\n\
 Copyright 2016 Szabolcs Nagy");
 
-static void
-tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
+                            const tre_tnfa_t *tnfa, regoff_t *tags,
+                            regoff_t match_eo);
 
 /***********************************************************************
  from tre-match-utils.h
 ***********************************************************************/
 
-#define GET_NEXT_WCHAR() do {                                                 \
-    prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
-        if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
-    }                                                                         \
-    str_byte += pos_add_next;                                                 \
+#define GET_NEXT_WCHAR()                                               \
+  do {                                                                 \
+    prev_c = next_c;                                                   \
+    pos += pos_add_next;                                               \
+    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
+      if (pos_add_next < 0) {                                          \
+        ret = REG_NOMATCH;                                             \
+        goto error_exit;                                               \
+      } else                                                           \
+        pos_add_next++;                                                \
+    }                                                                  \
+    str_byte += pos_add_next;                                          \
   } while (0)
 
-#define IS_WORD_CHAR(c)	 ((c) == L'_' || tre_isalnum(c))
-
-#define CHECK_ASSERTIONS(assertions)					      \
-  (((assertions & ASSERT_AT_BOL)					      \
-    && (pos > 0 || reg_notbol)						      \
-    && (prev_c != L'\n' || !reg_newline))				      \
-   || ((assertions & ASSERT_AT_EOL)					      \
-       && (next_c != L'\0' || reg_noteol)				      \
-       && (next_c != L'\n' || !reg_newline))				      \
-   || ((assertions & ASSERT_AT_BOW)					      \
-       && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))	              \
-   || ((assertions & ASSERT_AT_EOW)					      \
-       && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))		      \
-   || ((assertions & ASSERT_AT_WB)					      \
-       && (pos != 0 && next_c != L'\0'					      \
-	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))		      \
-   || ((assertions & ASSERT_AT_WB_NEG)					      \
-       && (pos == 0 || next_c == L'\0'					      \
-	   || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
-
-#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
-  (((trans_i->assertions & ASSERT_CHAR_CLASS)                                 \
-       && !(tnfa->cflags & REG_ICASE)                                         \
-       && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))                 \
-    || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
-        && (tnfa->cflags & REG_ICASE)                                         \
-        && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
-	&& !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
-    || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
-        && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
-                                      tnfa->cflags & REG_ICASE)))
-
+#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c))
 
+#define CHECK_ASSERTIONS(assertions)                                   \
+  (((assertions & ASSERT_AT_BOL) && (pos > 0 || reg_notbol) &&         \
+    (prev_c != L'\n' || !reg_newline)) ||                              \
+   ((assertions & ASSERT_AT_EOL) && (next_c != L'\0' || reg_noteol) && \
+    (next_c != L'\n' || !reg_newline)) ||                              \
+   ((assertions & ASSERT_AT_BOW) &&                                    \
+    (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) ||                \
+   ((assertions & ASSERT_AT_EOW) &&                                    \
+    (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) ||                \
+   ((assertions & ASSERT_AT_WB) &&                                     \
+    (pos != 0 && next_c != L'\0' &&                                    \
+     IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) ||                 \
+   ((assertions & ASSERT_AT_WB_NEG) &&                                 \
+    (pos == 0 || next_c == L'\0' ||                                    \
+     IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
 
+#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                              \
+  (((trans_i->assertions & ASSERT_CHAR_CLASS) &&                               \
+    !(tnfa->cflags & REG_ICASE) &&                                             \
+    !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) ||                     \
+   ((trans_i->assertions & ASSERT_CHAR_CLASS) && (tnfa->cflags & REG_ICASE) && \
+    !tre_isctype(tre_tolower((tre_cint_t)prev_c), trans_i->u.class) &&         \
+    !tre_isctype(tre_toupper((tre_cint_t)prev_c), trans_i->u.class)) ||        \
+   ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) &&                           \
+    tre_neg_char_classes_match(trans_i->neg_classes, (tre_cint_t)prev_c,       \
+                               tnfa->cflags & REG_ICASE)))
 
 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
-static int
-tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
-	      regoff_t *t1, regoff_t *t2)
-{
+static int tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
+                         regoff_t *t1, regoff_t *t2) {
   int i;
-  for (i = 0; i < num_tags; i++)
-    {
-      if (tag_directions[i] == TRE_TAG_MINIMIZE)
-	{
-	  if (t1[i] < t2[i])
-	    return 1;
-	  if (t1[i] > t2[i])
-	    return 0;
-	}
-      else
-	{
-	  if (t1[i] > t2[i])
-	    return 1;
-	  if (t1[i] < t2[i])
-	    return 0;
-	}
+  for (i = 0; i < num_tags; i++) {
+    if (tag_directions[i] == TRE_TAG_MINIMIZE) {
+      if (t1[i] < t2[i]) return 1;
+      if (t1[i] > t2[i]) return 0;
+    } else {
+      if (t1[i] > t2[i]) return 1;
+      if (t1[i] < t2[i]) return 0;
     }
+  }
   /*  assert(0);*/
   return 0;
 }
 
-static int
-tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
-{
+static int tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc,
+                                      int icase) {
   while (*classes != (tre_ctype_t)0)
-    if ((!icase && tre_isctype(wc, *classes))
-	|| (icase && (tre_isctype(tre_toupper(wc), *classes)
-		      || tre_isctype(tre_tolower(wc), *classes))))
+    if ((!icase && tre_isctype(wc, *classes)) ||
+        (icase && (tre_isctype(tre_toupper(wc), *classes) ||
+                   tre_isctype(tre_tolower(wc), *classes))))
       return 1; /* Match. */
     else
       classes++;
   return 0; /* No match. */
 }
 
-
 /***********************************************************************
  from tre-match-parallel.c
 ***********************************************************************/
@@ -188,12 +175,10 @@ typedef struct {
   regoff_t **tags;
 } tre_reach_pos_t;
 
-
-static reg_errcode_t
-tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
-		      regoff_t *match_tags, int eflags,
-		      regoff_t *match_end_ofs)
-{
+static reg_errcode_t tre_tnfa_run_parallel(const tre_tnfa_t *tnfa,
+                                           const void *string,
+                                           regoff_t *match_tags, int eflags,
+                                           regoff_t *match_end_ofs) {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
   const char *str_byte = string;
@@ -214,13 +199,13 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
   int *tag_i;
   int num_tags, i;
 
-  regoff_t match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
+  regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */
   int new_match = 0;
   regoff_t *tmp_tags = NULL;
   regoff_t *tmp_iptr;
 
 #ifdef TRE_MBSTATE
-  memset(&mbstate, '\0', sizeof(mbstate));
+  bzero(&mbstate, sizeof(mbstate));
 #endif /* TRE_MBSTATE */
 
   if (!match_tags)
@@ -237,15 +222,15 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 
     /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
      * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
-    if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
+    if (num_tags > SIZE_MAX / (8 * sizeof(regoff_t) * tnfa->num_states))
       return REG_ESPACE;
 
     /* Likewise check rbytes. */
-    if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
+    if (tnfa->num_states + 1 > SIZE_MAX / (8 * sizeof(*reach_next)))
       return REG_ESPACE;
 
     /* Likewise check pbytes. */
-    if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
+    if (tnfa->num_states > SIZE_MAX / (8 * sizeof(*reach_pos)))
       return REG_ESPACE;
 
     /* Compute the length of the block we need. */
@@ -253,14 +238,12 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
     pbytes = sizeof(*reach_pos) * tnfa->num_states;
     xbytes = sizeof(regoff_t) * num_tags;
-    total_bytes =
-      (sizeof(long) - 1) * 4 /* for alignment paddings */
-      + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
+    total_bytes = (sizeof(long) - 1) * 4 /* for alignment paddings */
+                  + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
 
     /* Allocate the memory. */
     buf = calloc(total_bytes, 1);
-    if (buf == NULL)
-      return REG_ESPACE;
+    if (buf == NULL) return REG_ESPACE;
 
     /* Get the various pointers within tmp_buf (properly aligned). */
     tmp_tags = (void *)buf;
@@ -275,216 +258,177 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
     reach_pos = (void *)tmp_buf;
     tmp_buf += pbytes;
     tmp_buf += ALIGN(tmp_buf, long);
-    for (i = 0; i < tnfa->num_states; i++)
-      {
-	reach[i].tags = (void *)tmp_buf;
-	tmp_buf += xbytes;
-	reach_next[i].tags = (void *)tmp_buf;
-	tmp_buf += xbytes;
-      }
+    for (i = 0; i < tnfa->num_states; i++) {
+      reach[i].tags = (void *)tmp_buf;
+      tmp_buf += xbytes;
+      reach_next[i].tags = (void *)tmp_buf;
+      tmp_buf += xbytes;
+    }
   }
 
-  for (i = 0; i < tnfa->num_states; i++)
-    reach_pos[i].pos = -1;
+  for (i = 0; i < tnfa->num_states; i++) reach_pos[i].pos = -1;
 
   GET_NEXT_WCHAR();
   pos = 0;
 
   reach_next_i = reach_next;
-  while (1)
-    {
-      /* If no match found yet, add the initial states to `reach_next'. */
-      if (match_eo < 0)
-	{
-	  trans_i = tnfa->initial;
-	  while (trans_i->state != NULL)
-	    {
-	      if (reach_pos[trans_i->state_id].pos < pos)
-		{
-		  if (trans_i->assertions
-		      && CHECK_ASSERTIONS(trans_i->assertions))
-		    {
-		      trans_i++;
-		      continue;
-		    }
+  while (1) {
+    /* If no match found yet, add the initial states to `reach_next'. */
+    if (match_eo < 0) {
+      trans_i = tnfa->initial;
+      while (trans_i->state != NULL) {
+        if (reach_pos[trans_i->state_id].pos < pos) {
+          if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) {
+            trans_i++;
+            continue;
+          }
 
-		  reach_next_i->state = trans_i->state;
-		  for (i = 0; i < num_tags; i++)
-		    reach_next_i->tags[i] = -1;
-		  tag_i = trans_i->tags;
-		  if (tag_i)
-		    while (*tag_i >= 0)
-		      {
-			if (*tag_i < num_tags)
-			  reach_next_i->tags[*tag_i] = pos;
-			tag_i++;
-		      }
-		  if (reach_next_i->state == tnfa->final)
-		    {
-		      match_eo = pos;
-		      new_match = 1;
-		      for (i = 0; i < num_tags; i++)
-			match_tags[i] = reach_next_i->tags[i];
-		    }
-		  reach_pos[trans_i->state_id].pos = pos;
-		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
-		  reach_next_i++;
-		}
-	      trans_i++;
-	    }
-	  reach_next_i->state = NULL;
-	}
-      else
-	{
-	  if (num_tags == 0 || reach_next_i == reach_next)
-	    /* We have found a match. */
-	    break;
-	}
+          reach_next_i->state = trans_i->state;
+          for (i = 0; i < num_tags; i++) reach_next_i->tags[i] = -1;
+          tag_i = trans_i->tags;
+          if (tag_i)
+            while (*tag_i >= 0) {
+              if (*tag_i < num_tags) reach_next_i->tags[*tag_i] = pos;
+              tag_i++;
+            }
+          if (reach_next_i->state == tnfa->final) {
+            match_eo = pos;
+            new_match = 1;
+            for (i = 0; i < num_tags; i++)
+              match_tags[i] = reach_next_i->tags[i];
+          }
+          reach_pos[trans_i->state_id].pos = pos;
+          reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
+          reach_next_i++;
+        }
+        trans_i++;
+      }
+      reach_next_i->state = NULL;
+    } else {
+      if (num_tags == 0 || reach_next_i == reach_next)
+        /* We have found a match. */
+        break;
+    }
 
-      /* Check for end of string. */
-      if (!next_c) break;
+    /* Check for end of string. */
+    if (!next_c) break;
 
-      GET_NEXT_WCHAR();
+    GET_NEXT_WCHAR();
+
+    /* Swap `reach' and `reach_next'. */
+    reach_i = reach;
+    reach = reach_next;
+    reach_next = reach_i;
+
+    /* For each state in `reach', weed out states that don't fulfill the
+       minimal matching conditions. */
+    if (tnfa->num_minimals && new_match) {
+      new_match = 0;
+      reach_next_i = reach_next;
+      for (reach_i = reach; reach_i->state; reach_i++) {
+        int skip = 0;
+        for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) {
+          int end = tnfa->minimal_tags[i];
+          int start = tnfa->minimal_tags[i + 1];
+          if (end >= num_tags) {
+            skip = 1;
+            break;
+          } else if (reach_i->tags[start] == match_tags[start] &&
+                     reach_i->tags[end] < match_tags[end]) {
+            skip = 1;
+            break;
+          }
+        }
+        if (!skip) {
+          reach_next_i->state = reach_i->state;
+          tmp_iptr = reach_next_i->tags;
+          reach_next_i->tags = reach_i->tags;
+          reach_i->tags = tmp_iptr;
+          reach_next_i++;
+        }
+      }
+      reach_next_i->state = NULL;
 
       /* Swap `reach' and `reach_next'. */
       reach_i = reach;
       reach = reach_next;
       reach_next = reach_i;
-
-      /* For each state in `reach', weed out states that don't fulfill the
-	 minimal matching conditions. */
-      if (tnfa->num_minimals && new_match)
-	{
-	  new_match = 0;
-	  reach_next_i = reach_next;
-	  for (reach_i = reach; reach_i->state; reach_i++)
-	    {
-	      int skip = 0;
-	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
-		{
-		  int end = tnfa->minimal_tags[i];
-		  int start = tnfa->minimal_tags[i + 1];
-		  if (end >= num_tags)
-		    {
-		      skip = 1;
-		      break;
-		    }
-		  else if (reach_i->tags[start] == match_tags[start]
-			   && reach_i->tags[end] < match_tags[end])
-		    {
-		      skip = 1;
-		      break;
-		    }
-		}
-	      if (!skip)
-		{
-		  reach_next_i->state = reach_i->state;
-		  tmp_iptr = reach_next_i->tags;
-		  reach_next_i->tags = reach_i->tags;
-		  reach_i->tags = tmp_iptr;
-		  reach_next_i++;
-		}
-	    }
-	  reach_next_i->state = NULL;
-
-	  /* Swap `reach' and `reach_next'. */
-	  reach_i = reach;
-	  reach = reach_next;
-	  reach_next = reach_i;
-	}
-
-      /* For each state in `reach' see if there is a transition leaving with
-	 the current input symbol to a state not yet in `reach_next', and
-	 add the destination states to `reach_next'. */
-      reach_next_i = reach_next;
-      for (reach_i = reach; reach_i->state; reach_i++)
-	{
-	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
-	    {
-	      /* Does this transition match the input symbol? */
-	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
-		  trans_i->code_max >= (tre_cint_t)prev_c)
-		{
-		  if (trans_i->assertions
-		      && (CHECK_ASSERTIONS(trans_i->assertions)
-			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
-		    {
-		      continue;
-		    }
-
-		  /* Compute the tags after this transition. */
-		  for (i = 0; i < num_tags; i++)
-		    tmp_tags[i] = reach_i->tags[i];
-		  tag_i = trans_i->tags;
-		  if (tag_i != NULL)
-		    while (*tag_i >= 0)
-		      {
-			if (*tag_i < num_tags)
-			  tmp_tags[*tag_i] = pos;
-			tag_i++;
-		      }
-
-		  if (reach_pos[trans_i->state_id].pos < pos)
-		    {
-		      /* Found an unvisited node. */
-		      reach_next_i->state = trans_i->state;
-		      tmp_iptr = reach_next_i->tags;
-		      reach_next_i->tags = tmp_tags;
-		      tmp_tags = tmp_iptr;
-		      reach_pos[trans_i->state_id].pos = pos;
-		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
-
-		      if (reach_next_i->state == tnfa->final
-			  && (match_eo == -1
-			      || (num_tags > 0
-				  && reach_next_i->tags[0] <= match_tags[0])))
-			{
-			  match_eo = pos;
-			  new_match = 1;
-			  for (i = 0; i < num_tags; i++)
-			    match_tags[i] = reach_next_i->tags[i];
-			}
-		      reach_next_i++;
-
-		    }
-		  else
-		    {
-		      assert(reach_pos[trans_i->state_id].pos == pos);
-		      /* Another path has also reached this state.  We choose
-			 the winner by examining the tag values for both
-			 paths. */
-		      if (tre_tag_order(num_tags, tnfa->tag_directions,
-					tmp_tags,
-					*reach_pos[trans_i->state_id].tags))
-			{
-			  /* The new path wins. */
-			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
-			  *reach_pos[trans_i->state_id].tags = tmp_tags;
-			  if (trans_i->state == tnfa->final)
-			    {
-			      match_eo = pos;
-			      new_match = 1;
-			      for (i = 0; i < num_tags; i++)
-				match_tags[i] = tmp_tags[i];
-			    }
-			  tmp_tags = tmp_iptr;
-			}
-		    }
-		}
-	    }
-	}
-      reach_next_i->state = NULL;
     }
 
+    /* For each state in `reach' see if there is a transition leaving with
+       the current input symbol to a state not yet in `reach_next', and
+       add the destination states to `reach_next'. */
+    reach_next_i = reach_next;
+    for (reach_i = reach; reach_i->state; reach_i++) {
+      for (trans_i = reach_i->state; trans_i->state; trans_i++) {
+        /* Does this transition match the input symbol? */
+        if (trans_i->code_min <= (tre_cint_t)prev_c &&
+            trans_i->code_max >= (tre_cint_t)prev_c) {
+          if (trans_i->assertions &&
+              (CHECK_ASSERTIONS(trans_i->assertions) ||
+               CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) {
+            continue;
+          }
+
+          /* Compute the tags after this transition. */
+          for (i = 0; i < num_tags; i++) tmp_tags[i] = reach_i->tags[i];
+          tag_i = trans_i->tags;
+          if (tag_i != NULL)
+            while (*tag_i >= 0) {
+              if (*tag_i < num_tags) tmp_tags[*tag_i] = pos;
+              tag_i++;
+            }
+
+          if (reach_pos[trans_i->state_id].pos < pos) {
+            /* Found an unvisited node. */
+            reach_next_i->state = trans_i->state;
+            tmp_iptr = reach_next_i->tags;
+            reach_next_i->tags = tmp_tags;
+            tmp_tags = tmp_iptr;
+            reach_pos[trans_i->state_id].pos = pos;
+            reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
+
+            if (reach_next_i->state == tnfa->final &&
+                (match_eo == -1 ||
+                 (num_tags > 0 && reach_next_i->tags[0] <= match_tags[0]))) {
+              match_eo = pos;
+              new_match = 1;
+              for (i = 0; i < num_tags; i++)
+                match_tags[i] = reach_next_i->tags[i];
+            }
+            reach_next_i++;
+
+          } else {
+            unassert(reach_pos[trans_i->state_id].pos == pos);
+            /* Another path has also reached this state.  We choose
+               the winner by examining the tag values for both
+               paths. */
+            if (tre_tag_order(num_tags, tnfa->tag_directions, tmp_tags,
+                              *reach_pos[trans_i->state_id].tags)) {
+              /* The new path wins. */
+              tmp_iptr = *reach_pos[trans_i->state_id].tags;
+              *reach_pos[trans_i->state_id].tags = tmp_tags;
+              if (trans_i->state == tnfa->final) {
+                match_eo = pos;
+                new_match = 1;
+                for (i = 0; i < num_tags; i++) match_tags[i] = tmp_tags[i];
+              }
+              tmp_tags = tmp_iptr;
+            }
+          }
+        }
+      }
+    }
+    reach_next_i->state = NULL;
+  }
+
   *match_end_ofs = match_eo;
   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
 error_exit:
-  xfree(buf);
+  free(buf), buf = NULL;
   return ret;
 }
 
-
-
 /***********************************************************************
  from tre-match-backtrack.c
 ***********************************************************************/
@@ -528,7 +472,7 @@ typedef struct tre_backtrack_struct {
   tre_backtrack_item_t item;
   struct tre_backtrack_struct *prev;
   struct tre_backtrack_struct *next;
-} *tre_backtrack_t;
+} * tre_backtrack_t;
 
 #ifdef TRE_MBSTATE
 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
@@ -538,84 +482,67 @@ typedef struct tre_backtrack_struct {
 #define BT_STACK_MBSTATE_OUT
 #endif /* !TRE_MBSTATE */
 
-#define tre_bt_mem_new		  tre_mem_new
-#define tre_bt_mem_alloc	  tre_mem_alloc
-#define tre_bt_mem_destroy	  tre_mem_destroy
+#define tre_bt_mem_new     tre_mem_new
+#define tre_bt_mem_alloc   tre_mem_alloc
+#define tre_bt_mem_destroy tre_mem_destroy
 
+#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, \
+                      _tags, _mbstate)                                        \
+  do {                                                                        \
+    int i;                                                                    \
+    if (!stack->next) {                                                       \
+      tre_backtrack_t s;                                                      \
+      s = tre_bt_mem_alloc(mem, sizeof(*s));                                  \
+      if (!s) {                                                               \
+        tre_bt_mem_destroy(mem);                                              \
+        if (tags) free(tags), tags = NULL;                                    \
+        if (pmatch) free(pmatch), pmatch = NULL;                              \
+        if (states_seen) free(states_seen), states_seen = NULL;               \
+        return REG_ESPACE;                                                    \
+      }                                                                       \
+      s->prev = stack;                                                        \
+      s->next = NULL;                                                         \
+      s->item.tags = tre_bt_mem_alloc(mem, sizeof(*tags) * tnfa->num_tags);   \
+      if (!s->item.tags) {                                                    \
+        tre_bt_mem_destroy(mem);                                              \
+        if (tags) free(tags), tags = NULL;                                    \
+        if (pmatch) free(pmatch), pmatch = NULL;                              \
+        if (states_seen) free(states_seen), states_seen = NULL;               \
+        return REG_ESPACE;                                                    \
+      }                                                                       \
+      stack->next = s;                                                        \
+      stack = s;                                                              \
+    } else                                                                    \
+      stack = stack->next;                                                    \
+    stack->item.pos = (_pos);                                                 \
+    stack->item.str_byte = (_str_byte);                                       \
+    stack->item.state = (_state);                                             \
+    stack->item.state_id = (_state_id);                                       \
+    stack->item.next_c = (_next_c);                                           \
+    for (i = 0; i < tnfa->num_tags; i++) stack->item.tags[i] = (_tags)[i];    \
+    BT_STACK_MBSTATE_IN;                                                      \
+  } while (0)
 
-#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
-  do									      \
-    {									      \
-      int i;								      \
-      if (!stack->next)							      \
-	{								      \
-	  tre_backtrack_t s;						      \
-	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
-	  if (!s)							      \
-	    {								      \
-	      tre_bt_mem_destroy(mem);					      \
-	      if (tags)							      \
-		xfree(tags);						      \
-	      if (pmatch)						      \
-		xfree(pmatch);						      \
-	      if (states_seen)						      \
-		xfree(states_seen);					      \
-	      return REG_ESPACE;					      \
-	    }								      \
-	  s->prev = stack;						      \
-	  s->next = NULL;						      \
-	  s->item.tags = tre_bt_mem_alloc(mem,				      \
-					  sizeof(*tags) * tnfa->num_tags);    \
-	  if (!s->item.tags)						      \
-	    {								      \
-	      tre_bt_mem_destroy(mem);					      \
-	      if (tags)							      \
-		xfree(tags);						      \
-	      if (pmatch)						      \
-		xfree(pmatch);						      \
-	      if (states_seen)						      \
-		xfree(states_seen);					      \
-	      return REG_ESPACE;					      \
-	    }								      \
-	  stack->next = s;						      \
-	  stack = s;							      \
-	}								      \
-      else								      \
-	stack = stack->next;						      \
-      stack->item.pos = (_pos);						      \
-      stack->item.str_byte = (_str_byte);				      \
-      stack->item.state = (_state);					      \
-      stack->item.state_id = (_state_id);				      \
-      stack->item.next_c = (_next_c);					      \
-      for (i = 0; i < tnfa->num_tags; i++)				      \
-	stack->item.tags[i] = (_tags)[i];				      \
-      BT_STACK_MBSTATE_IN;						      \
-    }									      \
-  while (0)
-
-#define BT_STACK_POP()							      \
-  do									      \
-    {									      \
-      int i;								      \
-      assert(stack->prev);						      \
-      pos = stack->item.pos;						      \
-      str_byte = stack->item.str_byte;					      \
-      state = stack->item.state;					      \
-      next_c = stack->item.next_c;					      \
-      for (i = 0; i < tnfa->num_tags; i++)				      \
-	tags[i] = stack->item.tags[i];					      \
-      BT_STACK_MBSTATE_OUT;						      \
-      stack = stack->prev;						      \
-    }									      \
-  while (0)
+#define BT_STACK_POP()                                                  \
+  do {                                                                  \
+    int i;                                                              \
+    unassert(stack->prev);                                              \
+    pos = stack->item.pos;                                              \
+    str_byte = stack->item.str_byte;                                    \
+    state = stack->item.state;                                          \
+    next_c = stack->item.next_c;                                        \
+    for (i = 0; i < tnfa->num_tags; i++) tags[i] = stack->item.tags[i]; \
+    BT_STACK_MBSTATE_OUT;                                               \
+    stack = stack->prev;                                                \
+  } while (0)
 
 #undef MIN
 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
 
-static reg_errcode_t
-tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
-{
+static reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa,
+                                            const void *string,
+                                            regoff_t *match_tags, int eflags,
+                                            regoff_t *match_end_ofs) {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
   const char *str_byte = string;
@@ -658,60 +585,48 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
   int ret;
 
 #ifdef TRE_MBSTATE
-  memset(&mbstate, '\0', sizeof(mbstate));
+  bzero(&mbstate, sizeof(mbstate));
 #endif /* TRE_MBSTATE */
 
-  if (!mem)
-    return REG_ESPACE;
+  if (!mem) return REG_ESPACE;
   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
-  if (!stack)
-    {
-      ret = REG_ESPACE;
-      goto error_exit;
-    }
+  if (!stack) {
+    ret = REG_ESPACE;
+    goto error_exit;
+  }
   stack->prev = NULL;
   stack->next = NULL;
 
-  if (tnfa->num_tags)
-    {
-      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
-      if (!tags)
-	{
-	  ret = REG_ESPACE;
-	  goto error_exit;
-	}
+  if (tnfa->num_tags) {
+    tags = malloc(sizeof(*tags) * tnfa->num_tags);
+    if (!tags) {
+      ret = REG_ESPACE;
+      goto error_exit;
     }
-  if (tnfa->num_submatches)
-    {
-      pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
-      if (!pmatch)
-	{
-	  ret = REG_ESPACE;
-	  goto error_exit;
-	}
-    }
-  if (tnfa->num_states)
-    {
-      states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
-      if (!states_seen)
-	{
-	  ret = REG_ESPACE;
-	  goto error_exit;
-	}
-    }
-
- retry:
-  {
-    int i;
-    for (i = 0; i < tnfa->num_tags; i++)
-      {
-	tags[i] = -1;
-	if (match_tags)
-	  match_tags[i] = -1;
-      }
-    for (i = 0; i < tnfa->num_states; i++)
-      states_seen[i] = 0;
   }
+  if (tnfa->num_submatches) {
+    pmatch = malloc(sizeof(*pmatch) * tnfa->num_submatches);
+    if (!pmatch) {
+      ret = REG_ESPACE;
+      goto error_exit;
+    }
+  }
+  if (tnfa->num_states) {
+    states_seen = malloc(sizeof(*states_seen) * tnfa->num_states);
+    if (!states_seen) {
+      ret = REG_ESPACE;
+      goto error_exit;
+    }
+  }
+
+retry : {
+  int i;
+  for (i = 0; i < tnfa->num_tags; i++) {
+    tags[i] = -1;
+    if (match_tags) match_tags[i] = -1;
+  }
+  for (i = 0; i < tnfa->num_states; i++) states_seen[i] = 0;
+}
 
   state = NULL;
   pos = pos_start;
@@ -725,219 +640,174 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 
   /* Handle initial states. */
   next_tags = NULL;
-  for (trans_i = tnfa->initial; trans_i->state; trans_i++)
-    {
-      if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
-	{
-	  continue;
-	}
-      if (state == NULL)
-	{
-	  /* Start from this state. */
-	  state = trans_i->state;
-	  next_tags = trans_i->tags;
-	}
-      else
-	{
-	  /* Backtrack to this state. */
-	  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
-			trans_i->state_id, next_c, tags, mbstate);
-	  {
-	    int *tmp = trans_i->tags;
-	    if (tmp)
-	      while (*tmp >= 0)
-		stack->item.tags[*tmp++] = pos;
-	  }
-	}
+  for (trans_i = tnfa->initial; trans_i->state; trans_i++) {
+    if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) {
+      continue;
     }
+    if (state == NULL) {
+      /* Start from this state. */
+      state = trans_i->state;
+      next_tags = trans_i->tags;
+    } else {
+      /* Backtrack to this state. */
+      BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, trans_i->state_id, next_c,
+                    tags, mbstate);
+      {
+        int *tmp = trans_i->tags;
+        if (tmp)
+          while (*tmp >= 0) stack->item.tags[*tmp++] = pos;
+      }
+    }
+  }
 
   if (next_tags)
-    for (; *next_tags >= 0; next_tags++)
-      tags[*next_tags] = pos;
+    for (; *next_tags >= 0; next_tags++) tags[*next_tags] = pos;
 
+  if (state == NULL) goto backtrack;
 
-  if (state == NULL)
-    goto backtrack;
+  while (1) {
+    tre_tnfa_transition_t *next_state;
+    int empty_br_match;
 
-  while (1)
-    {
-      tre_tnfa_transition_t *next_state;
-      int empty_br_match;
+    if (state == tnfa->final) {
+      if (match_eo < pos || (match_eo == pos && match_tags &&
+                             tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
+                                           tags, match_tags))) {
+        int i;
+        /* This match wins the previous match. */
+        match_eo = pos;
+        if (match_tags)
+          for (i = 0; i < tnfa->num_tags; i++) match_tags[i] = tags[i];
+      }
+      /* Our TNFAs never have transitions leaving from the final state,
+         so we jump right to backtracking. */
+      goto backtrack;
+    }
 
-      if (state == tnfa->final)
-	{
-	  if (match_eo < pos
-	      || (match_eo == pos
-		  && match_tags
-		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
-				   tags, match_tags)))
-	    {
-	      int i;
-	      /* This match wins the previous match. */
-	      match_eo = pos;
-	      if (match_tags)
-		for (i = 0; i < tnfa->num_tags; i++)
-		  match_tags[i] = tags[i];
-	    }
-	  /* Our TNFAs never have transitions leaving from the final state,
-	     so we jump right to backtracking. */
-	  goto backtrack;
-	}
+    /* Go to the next character in the input string. */
+    empty_br_match = 0;
+    trans_i = state;
+    if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) {
+      /* This is a back reference state.  All transitions leaving from
+         this state have the same back reference "assertion".  Instead
+         of reading the next character, we match the back reference. */
+      regoff_t so, eo;
+      int bt = trans_i->u.backref;
+      regoff_t bt_len;
+      int result;
 
-      /* Go to the next character in the input string. */
-      empty_br_match = 0;
-      trans_i = state;
-      if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
-	{
-	  /* This is a back reference state.  All transitions leaving from
-	     this state have the same back reference "assertion".  Instead
-	     of reading the next character, we match the back reference. */
-	  regoff_t so, eo;
-	  int bt = trans_i->u.backref;
-	  regoff_t bt_len;
-	  int result;
+      /* Get the substring we need to match against.  Remember to
+         turn off REG_NOSUB temporarily. */
+      tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, tnfa, tags,
+                      pos);
+      so = pmatch[bt].rm_so;
+      eo = pmatch[bt].rm_eo;
+      bt_len = eo - so;
 
-	  /* Get the substring we need to match against.  Remember to
-	     turn off REG_NOSUB temporarily. */
-	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
-			  tnfa, tags, pos);
-	  so = pmatch[bt].rm_so;
-	  eo = pmatch[bt].rm_eo;
-	  bt_len = eo - so;
+      result = strncmp((const char *)string + so, str_byte - 1, (size_t)bt_len);
 
-	  result = strncmp((const char*)string + so, str_byte - 1,
-				 (size_t)bt_len);
+      if (result == 0) {
+        /* Back reference matched.  Check for infinite loop. */
+        if (bt_len == 0) empty_br_match = 1;
+        if (empty_br_match && states_seen[trans_i->state_id]) {
+          goto backtrack;
+        }
 
-	  if (result == 0)
-	    {
-	      /* Back reference matched.  Check for infinite loop. */
-	      if (bt_len == 0)
-		empty_br_match = 1;
-	      if (empty_br_match && states_seen[trans_i->state_id])
-		{
-		  goto backtrack;
-		}
+        states_seen[trans_i->state_id] = empty_br_match;
 
-	      states_seen[trans_i->state_id] = empty_br_match;
+        /* Advance in input string and resync `prev_c', `next_c'
+           and pos. */
+        str_byte += bt_len - 1;
+        pos += bt_len - 1;
+        GET_NEXT_WCHAR();
+      } else {
+        goto backtrack;
+      }
+    } else {
+      /* Check for end of string. */
+      if (next_c == L'\0') goto backtrack;
 
-	      /* Advance in input string and resync `prev_c', `next_c'
-		 and pos. */
-	      str_byte += bt_len - 1;
-	      pos += bt_len - 1;
-	      GET_NEXT_WCHAR();
-	    }
-	  else
-	    {
-	      goto backtrack;
-	    }
-	}
-      else
-	{
-	  /* Check for end of string. */
-	  if (next_c == L'\0')
-		goto backtrack;
+      /* Read the next character. */
+      GET_NEXT_WCHAR();
+    }
 
-	  /* Read the next character. */
-	  GET_NEXT_WCHAR();
-	}
+    next_state = NULL;
+    for (trans_i = state; trans_i->state; trans_i++) {
+      if (trans_i->code_min <= (tre_cint_t)prev_c &&
+          trans_i->code_max >= (tre_cint_t)prev_c) {
+        if (trans_i->assertions &&
+            (CHECK_ASSERTIONS(trans_i->assertions) ||
+             CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) {
+          continue;
+        }
 
-      next_state = NULL;
-      for (trans_i = state; trans_i->state; trans_i++)
-	{
-	  if (trans_i->code_min <= (tre_cint_t)prev_c
-	      && trans_i->code_max >= (tre_cint_t)prev_c)
-	    {
-	      if (trans_i->assertions
-		  && (CHECK_ASSERTIONS(trans_i->assertions)
-		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
-		{
-		  continue;
-		}
-
-	      if (next_state == NULL)
-		{
-		  /* First matching transition. */
-		  next_state = trans_i->state;
-		  next_tags = trans_i->tags;
-		}
-	      else
-		{
-		  /* Second matching transition.  We may need to backtrack here
-		     to take this transition instead of the first one, so we
-		     push this transition in the backtracking stack so we can
-		     jump back here if needed. */
-		  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
-				trans_i->state_id, next_c, tags, mbstate);
-		  {
-		    int *tmp;
-		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
-		      stack->item.tags[*tmp] = pos;
-		  }
-#if 0 /* XXX - it's important not to look at all transitions here to keep
-	 the stack small! */
+        if (next_state == NULL) {
+          /* First matching transition. */
+          next_state = trans_i->state;
+          next_tags = trans_i->tags;
+        } else {
+          /* Second matching transition.  We may need to backtrack here
+             to take this transition instead of the first one, so we
+             push this transition in the backtracking stack so we can
+             jump back here if needed. */
+          BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, trans_i->state_id,
+                        next_c, tags, mbstate);
+          {
+            int *tmp;
+            for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
+              stack->item.tags[*tmp] = pos;
+          }
+#if 0 /* XXX - it's important not to look at all transitions here to keep \
+         the stack small! */
 		  break;
 #endif
-		}
-	    }
-	}
-
-      if (next_state != NULL)
-	{
-	  /* Matching transitions were found.  Take the first one. */
-	  state = next_state;
-
-	  /* Update the tag values. */
-	  if (next_tags)
-	    while (*next_tags >= 0)
-	      tags[*next_tags++] = pos;
-	}
-      else
-	{
-	backtrack:
-	  /* A matching transition was not found.  Try to backtrack. */
-	  if (stack->prev)
-	    {
-	      if (stack->item.state->assertions & ASSERT_BACKREF)
-		{
-		  states_seen[stack->item.state_id] = 0;
-		}
-
-	      BT_STACK_POP();
-	    }
-	  else if (match_eo < 0)
-	    {
-	      /* Try starting from a later position in the input string. */
-	      /* Check for end of string. */
-	      if (next_c == L'\0')
-		    {
-		      break;
-		    }
-	      next_c = next_c_start;
-#ifdef TRE_MBSTATE
-	      mbstate = mbstate_start;
-#endif /* TRE_MBSTATE */
-	      str_byte = str_byte_start;
-	      goto retry;
-	    }
-	  else
-	    {
-	      break;
-	    }
-	}
+        }
+      }
     }
 
+    if (next_state != NULL) {
+      /* Matching transitions were found.  Take the first one. */
+      state = next_state;
+
+      /* Update the tag values. */
+      if (next_tags)
+        while (*next_tags >= 0) tags[*next_tags++] = pos;
+    } else {
+    backtrack:
+      /* A matching transition was not found.  Try to backtrack. */
+      if (stack->prev) {
+        if (stack->item.state->assertions & ASSERT_BACKREF) {
+          states_seen[stack->item.state_id] = 0;
+        }
+
+        BT_STACK_POP();
+      } else if (match_eo < 0) {
+        /* Try starting from a later position in the input string. */
+        /* Check for end of string. */
+        if (next_c == L'\0') {
+          break;
+        }
+        next_c = next_c_start;
+#ifdef TRE_MBSTATE
+        mbstate = mbstate_start;
+#endif /* TRE_MBSTATE */
+        str_byte = str_byte_start;
+        goto retry;
+      } else {
+        break;
+      }
+    }
+  }
+
   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
   *match_end_ofs = match_eo;
 
- error_exit:
+error_exit:
   tre_bt_mem_destroy(mem);
 #ifndef TRE_USE_ALLOCA
-  if (tags)
-    xfree(tags);
-  if (pmatch)
-    xfree(pmatch);
-  if (states_seen)
-    xfree(states_seen);
+  if (tags) free(tags), tags = NULL;
+  if (pmatch) free(pmatch), pmatch = NULL;
+  if (states_seen) free(states_seen), states_seen = NULL;
 #endif /* !TRE_USE_ALLOCA */
 
   return ret;
@@ -949,72 +819,60 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 
 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
    endpoint values. */
-static void
-tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
-{
+static void tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
+                            const tre_tnfa_t *tnfa, regoff_t *tags,
+                            regoff_t match_eo) {
   tre_submatch_data_t *submatch_data;
   unsigned int i, j;
   int *parents;
 
   i = 0;
-  if (match_eo >= 0 && !(cflags & REG_NOSUB))
-    {
-      /* Construct submatch offsets from the tags. */
-      submatch_data = tnfa->submatch_data;
-      while (i < tnfa->num_submatches && i < nmatch)
-	{
-	  if (submatch_data[i].so_tag == tnfa->end_tag)
-	    pmatch[i].rm_so = match_eo;
-	  else
-	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
+  if (match_eo >= 0 && !(cflags & REG_NOSUB)) {
+    /* Construct submatch offsets from the tags. */
+    submatch_data = tnfa->submatch_data;
+    while (i < tnfa->num_submatches && i < nmatch) {
+      if (submatch_data[i].so_tag == tnfa->end_tag)
+        pmatch[i].rm_so = match_eo;
+      else
+        pmatch[i].rm_so = tags[submatch_data[i].so_tag];
 
-	  if (submatch_data[i].eo_tag == tnfa->end_tag)
-	    pmatch[i].rm_eo = match_eo;
-	  else
-	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
+      if (submatch_data[i].eo_tag == tnfa->end_tag)
+        pmatch[i].rm_eo = match_eo;
+      else
+        pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
 
-	  /* If either of the endpoints were not used, this submatch
-	     was not part of the match. */
-	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
-	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+      /* If either of the endpoints were not used, this submatch
+         was not part of the match. */
+      if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
+        pmatch[i].rm_so = pmatch[i].rm_eo = -1;
 
-	  i++;
-	}
-      /* Reset all submatches that are not within all of their parent
-	 submatches. */
-      i = 0;
-      while (i < tnfa->num_submatches && i < nmatch)
-	{
-	  if (pmatch[i].rm_eo == -1)
-	    assert(pmatch[i].rm_so == -1);
-	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
-
-	  parents = submatch_data[i].parents;
-	  if (parents != NULL)
-	    for (j = 0; parents[j] >= 0; j++)
-	      {
-		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
-		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
-		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
-	      }
-	  i++;
-	}
-    }
-
-  while (i < nmatch)
-    {
-      pmatch[i].rm_so = -1;
-      pmatch[i].rm_eo = -1;
       i++;
     }
+    /* Reset all submatches that are not within all of their parent
+       submatches. */
+    i = 0;
+    while (i < tnfa->num_submatches && i < nmatch) {
+      if (pmatch[i].rm_eo == -1) unassert(pmatch[i].rm_so == -1);
+      unassert(pmatch[i].rm_so <= pmatch[i].rm_eo);
+
+      parents = submatch_data[i].parents;
+      if (parents != NULL)
+        for (j = 0; parents[j] >= 0; j++) {
+          if (pmatch[i].rm_so < pmatch[parents[j]].rm_so ||
+              pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
+            pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+        }
+      i++;
+    }
+  }
+
+  while (i < nmatch) {
+    pmatch[i].rm_so = -1;
+    pmatch[i].rm_eo = -1;
+    i++;
+  }
 }
 
-
-/*
-  Wrapper functions for POSIX compatible regexp matching.
-*/
-
 /**
  * Executes regular expression.
  *
@@ -1022,37 +880,26 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
  * @param eflags can have REG_NOTBOL, REG_NOTEOL
  * @return 0 or REG_NOMATCH
  */
-int
-regexec(const regex_t *restrict preg, const char *restrict string,
-	  size_t nmatch, regmatch_t pmatch[restrict], int eflags)
-{
+int regexec(const regex_t *preg, const char *string, size_t nmatch,
+            regmatch_t *pmatch, int eflags) {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   regoff_t *tags = NULL, eo;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
-  if (tnfa->num_tags > 0 && nmatch > 0)
-    {
-      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
-      if (tags == NULL)
-	return REG_ESPACE;
-    }
-
+  if (tnfa->num_tags > 0 && nmatch > 0) {
+    tags = malloc(sizeof(*tags) * tnfa->num_tags);
+    if (tags == NULL) return REG_ESPACE;
+  }
   /* Dispatch to the appropriate matcher. */
-  if (tnfa->have_backrefs)
-    {
-      /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
-    }
-  else
-    {
-      /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
-    }
-
-  if (status == REG_OK)
-    /* A match was found, so fill the submatch registers. */
+  if (tnfa->have_backrefs) {
+    /* The regex has back references, use the backtracking matcher. */
+    status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+  } else {
+    /* Exact matching, no back references, use the parallel matcher. */
+    status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+  }
+  if (status == REG_OK) /* A match was found, so fill the submatch registers. */
     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
-  if (tags)
-    xfree(tags);
+  if (tags) free(tags), tags = NULL;
   return status;
 }
diff --git a/third_party/regex/tre-mem.c b/third_party/regex/tre-mem.c
index ce70ec6f1..971900a08 100644
--- a/third_party/regex/tre-mem.c
+++ b/third_party/regex/tre-mem.c
@@ -56,6 +56,7 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/regex/tre.inc"
 
 /*
   This memory allocator is for allocating small memory blocks efficiently
@@ -64,11 +65,6 @@
   allocators, though.
 */
 
-#include <stdlib.h>
-#include <string.h>
-
-#include "tre.inc"
-
 /*
   This memory allocator is for allocating small memory blocks efficiently
   in terms of memory overhead and execution speed.  The allocated blocks
@@ -77,110 +73,79 @@
 */
 
 /* Returns a new memory allocator or NULL if out of memory. */
-tre_mem_t
-tre_mem_new_impl(int provided, void *provided_block)
-{
+tre_mem_t tre_mem_new_impl(int provided, void *provided_block) {
   tre_mem_t mem;
-  if (provided)
-    {
-      mem = provided_block;
-      memset(mem, 0, sizeof(*mem));
-    }
-  else
-    mem = xcalloc(1, sizeof(*mem));
-  if (mem == NULL)
-    return NULL;
+  if (provided) {
+    mem = provided_block;
+    bzero(mem, sizeof(*mem));
+  } else
+    mem = calloc(1, sizeof(*mem));
+  if (mem == NULL) return NULL;
   return mem;
 }
 
-
 /* Frees the memory allocator and all memory allocated with it. */
-void
-tre_mem_destroy(tre_mem_t mem)
-{
+void tre_mem_destroy(tre_mem_t mem) {
   tre_list_t *tmp, *l = mem->blocks;
-
-  while (l != NULL)
-    {
-      xfree(l->data);
-      tmp = l->next;
-      xfree(l);
-      l = tmp;
-    }
-  xfree(mem);
+  while (l != NULL) {
+    free(l->data), l->data = NULL;
+    tmp = l->next;
+    free(l), l = tmp;
+  }
+  free(mem), mem = NULL;
 }
 
-
 /* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the
    allocated block or NULL if an underlying malloc() failed. */
-void *
-tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block,
-		   int zero, size_t size)
-{
+void *tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block,
+                         int zero, size_t size) {
   void *ptr;
-
-  if (mem->failed)
-    {
-      return NULL;
-    }
-
-  if (mem->n < size)
-    {
-      /* We need more memory than is available in the current block.
-	 Allocate a new block. */
-      tre_list_t *l;
-      if (provided)
-	{
-	  if (provided_block == NULL)
-	    {
-	      mem->failed = 1;
-	      return NULL;
-	    }
-	  mem->ptr = provided_block;
-	  mem->n = TRE_MEM_BLOCK_SIZE;
-	}
+  if (mem->failed) {
+    return NULL;
+  }
+  if (mem->n < size) {
+    /* We need more memory than is available in the current block.
+       Allocate a new block. */
+    tre_list_t *l;
+    if (provided) {
+      if (provided_block == NULL) {
+        mem->failed = 1;
+        return NULL;
+      }
+      mem->ptr = provided_block;
+      mem->n = TRE_MEM_BLOCK_SIZE;
+    } else {
+      int block_size;
+      if (size * 8 > TRE_MEM_BLOCK_SIZE)
+        block_size = size * 8;
       else
-	{
-	  int block_size;
-	  if (size * 8 > TRE_MEM_BLOCK_SIZE)
-	    block_size = size * 8;
-	  else
-	    block_size = TRE_MEM_BLOCK_SIZE;
-	  l = xmalloc(sizeof(*l));
-	  if (l == NULL)
-	    {
-	      mem->failed = 1;
-	      return NULL;
-	    }
-	  l->data = xmalloc(block_size);
-	  if (l->data == NULL)
-	    {
-	      xfree(l);
-	      mem->failed = 1;
-	      return NULL;
-	    }
-	  l->next = NULL;
-	  if (mem->current != NULL)
-	    mem->current->next = l;
-	  if (mem->blocks == NULL)
-	    mem->blocks = l;
-	  mem->current = l;
-	  mem->ptr = l->data;
-	  mem->n = block_size;
-	}
+        block_size = TRE_MEM_BLOCK_SIZE;
+      l = malloc(sizeof(*l));
+      if (l == NULL) {
+        mem->failed = 1;
+        return NULL;
+      }
+      l->data = malloc(block_size);
+      if (l->data == NULL) {
+        free(l), l = NULL;
+        mem->failed = 1;
+        return NULL;
+      }
+      l->next = NULL;
+      if (mem->current != NULL) mem->current->next = l;
+      if (mem->blocks == NULL) mem->blocks = l;
+      mem->current = l;
+      mem->ptr = l->data;
+      mem->n = block_size;
     }
-
+  }
   /* Make sure the next pointer will be aligned. */
   size += ALIGN(mem->ptr + size, long);
-
   /* Allocate from current block. */
   ptr = mem->ptr;
   mem->ptr += size;
   mem->n -= size;
-
   /* Set to zero if needed. */
-  if (zero)
-    memset(ptr, 0, size);
-
+  if (zero) bzero(ptr, size);
   return ptr;
 }
diff --git a/third_party/regex/tre.inc b/third_party/regex/tre.inc
index b33e19973..a86cb46b7 100644
--- a/third_party/regex/tre.inc
+++ b/third_party/regex/tre.inc
@@ -1,52 +1,79 @@
-/*
-  tre-internal.h - TRE internal definitions
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  Musl Libc                                                                   │
+│  Copyright © 2005-2014 Rich Felker, et al.                                   │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files (the             │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and/or sell copies of the Software, and to          │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│  The above copyright notice and this permission notice shall be              │
+│  included in all copies or substantial portions of the Software.             │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
+│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
+│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
+│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
+│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│                                                                              │
+│──────────────────────────────────────────────────────────────────────────────│
+│                                                                              │
+│  tre-internal.h - TRE internal definitions                                   │
+│                                                                              │
+│  Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>                         │
+│  All rights reserved.                                                        │
+│                                                                              │
+│  Redistribution and use in source and binary forms, with or without          │
+│  modification, are permitted provided that the following conditions          │
+│  are met:                                                                    │
+│                                                                              │
+│    1. Redistributions of source code must retain the above copyright         │
+│       notice, this list of conditions and the following disclaimer.          │
+│                                                                              │
+│    2. Redistributions in binary form must reproduce the above copyright      │
+│       notice, this list of conditions and the following disclaimer in        │
+│       the documentation and/or other materials provided with the             │
+│       distribution.                                                          │
+│                                                                              │
+│  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS          │
+│  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT         │
+│  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR       │
+│  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT       │
+│  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,      │
+│  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT            │
+│  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,       │
+│  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY       │
+│  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         │
+│  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE       │
+│  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.        │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"
+#include "libc/wctype.h"
+#include "third_party/regex/regex.h"
 
-  Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    1. Redistributions of source code must retain the above copyright
-       notice, this list of conditions and the following disclaimer.
-
-    2. Redistributions in binary form must reproduce the above copyright
-       notice, this list of conditions and the following disclaimer in the
-       documentation and/or other materials provided with the distribution.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
-  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
-  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <regex.h>
-#include <wchar.h>
-#include <wctype.h>
-
-#undef  TRE_MBSTATE
-
-#ifndef NDEBUG
-#define NDEBUG
-#endif
+#undef TRE_MBSTATE
 
 #define TRE_REGEX_T_FIELD __opaque
 typedef int reg_errcode_t;
-
 typedef wchar_t tre_char_t;
 
-#define DPRINT(msg) do { } while(0)
+#define DPRINT(msg) \
+  do {              \
+  } while (0)
 
-#define elementsof(x)	( sizeof(x) / sizeof(x[0]) )
+#define elementsof(x) (sizeof(x) / sizeof(x[0]))
 
 #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n)))
 
@@ -54,17 +81,17 @@ typedef wchar_t tre_char_t;
 typedef wint_t tre_cint_t;
 #define TRE_CHAR_MAX 0x10ffff
 
-#define tre_isalnum iswalnum
-#define tre_isalpha iswalpha
-#define tre_isblank iswblank
-#define tre_iscntrl iswcntrl
-#define tre_isdigit iswdigit
-#define tre_isgraph iswgraph
-#define tre_islower iswlower
-#define tre_isprint iswprint
-#define tre_ispunct iswpunct
-#define tre_isspace iswspace
-#define tre_isupper iswupper
+#define tre_isalnum  iswalnum
+#define tre_isalpha  iswalpha
+#define tre_isblank  iswblank
+#define tre_iscntrl  iswcntrl
+#define tre_isdigit  iswdigit
+#define tre_isgraph  iswgraph
+#define tre_islower  iswlower
+#define tre_isprint  iswprint
+#define tre_ispunct  iswpunct
+#define tre_isspace  iswspace
+#define tre_isupper  iswupper
 #define tre_isxdigit iswxdigit
 
 #define tre_tolower towlower
@@ -78,10 +105,10 @@ typedef wctype_t tre_ctype_t;
 
 /* Returns number of bytes to add to (char *)ptr to make it
    properly aligned for the type. */
-#define ALIGN(ptr, type) \
-  ((((long)ptr) % sizeof(type)) \
-   ? (sizeof(type) - (((long)ptr) % sizeof(type))) \
-   : 0)
+#define ALIGN(ptr, type)                               \
+  ((((long)ptr) % sizeof(type))                        \
+       ? (sizeof(type) - (((long)ptr) % sizeof(type))) \
+       : 0)
 
 #undef MAX
 #undef MIN
@@ -115,24 +142,20 @@ struct tnfa_transition {
   tre_ctype_t *neg_classes;
 };
 
-
 /* Assertions. */
-#define ASSERT_AT_BOL		  1   /* Beginning of line. */
-#define ASSERT_AT_EOL		  2   /* End of line. */
-#define ASSERT_CHAR_CLASS	  4   /* Character class in `class'. */
-#define ASSERT_CHAR_CLASS_NEG	  8   /* Character classes in `neg_classes'. */
-#define ASSERT_AT_BOW		 16   /* Beginning of word. */
-#define ASSERT_AT_EOW		 32   /* End of word. */
-#define ASSERT_AT_WB		 64   /* Word boundary. */
-#define ASSERT_AT_WB_NEG	128   /* Not a word boundary. */
-#define ASSERT_BACKREF		256   /* A back reference in `backref'. */
-#define ASSERT_LAST		256
+#define ASSERT_AT_BOL         1   /* Beginning of line. */
+#define ASSERT_AT_EOL         2   /* End of line. */
+#define ASSERT_CHAR_CLASS     4   /* Character class in `class'. */
+#define ASSERT_CHAR_CLASS_NEG 8   /* Character classes in `neg_classes'. */
+#define ASSERT_AT_BOW         16  /* Beginning of word. */
+#define ASSERT_AT_EOW         32  /* End of word. */
+#define ASSERT_AT_WB          64  /* Word boundary. */
+#define ASSERT_AT_WB_NEG      128 /* Not a word boundary. */
+#define ASSERT_BACKREF        256 /* A back reference in `backref'. */
+#define ASSERT_LAST           256
 
 /* Tag directions. */
-typedef enum {
-  TRE_TAG_MINIMIZE = 0,
-  TRE_TAG_MAXIMIZE = 1
-} tre_tag_direction_t;
+typedef enum { TRE_TAG_MINIMIZE = 0, TRE_TAG_MAXIMIZE = 1 } tre_tag_direction_t;
 
 /* Instructions to compute submatch register values from tag values
    after a successful match.  */
@@ -147,7 +170,6 @@ struct tre_submatch_data {
 
 typedef struct tre_submatch_data tre_submatch_data_t;
 
-
 /* TNFA definition. */
 typedef struct tnfa tre_tnfa_t;
 
@@ -187,7 +209,7 @@ typedef struct tre_mem_struct {
   size_t n;
   int failed;
   void **provided;
-} *tre_mem_t;
+} * tre_mem_t;
 
 #define tre_mem_new_impl   __tre_mem_new_impl
 #define tre_mem_alloc_impl __tre_mem_alloc_impl
@@ -195,10 +217,10 @@ typedef struct tre_mem_struct {
 
 tre_mem_t tre_mem_new_impl(int provided, void *provided_block);
 void *tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block,
-                                int zero, size_t size);
+                         int zero, size_t size);
 
 /* Returns a new memory allocator or NULL if out of memory. */
-#define tre_mem_new()  tre_mem_new_impl(0, NULL)
+#define tre_mem_new() tre_mem_new_impl(0, NULL)
 
 /* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the
    allocated block or NULL if an underlying malloc() failed. */
@@ -216,18 +238,11 @@ void *tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block,
 #define tre_mem_newa() \
   tre_mem_new_impl(1, alloca(sizeof(struct tre_mem_struct)))
 
-#define tre_mem_alloca(mem, size)					      \
-  ((mem)->n >= (size)							      \
-   ? tre_mem_alloc_impl((mem), 1, NULL, 0, (size))			      \
-   : tre_mem_alloc_impl((mem), 1, alloca(TRE_MEM_BLOCK_SIZE), 0, (size)))
+#define tre_mem_alloca(mem, size)                      \
+  ((mem)->n >= (size)                                  \
+       ? tre_mem_alloc_impl((mem), 1, NULL, 0, (size)) \
+       : tre_mem_alloc_impl((mem), 1, alloca(TRE_MEM_BLOCK_SIZE), 0, (size)))
 #endif /* TRE_USE_ALLOCA */
 
-
 /* Frees the memory allocator and all memory allocated with it. */
 void tre_mem_destroy(tre_mem_t mem);
-
-#define xmalloc malloc
-#define xcalloc calloc
-#define xfree free
-#define xrealloc realloc
-
diff --git a/third_party/sed/BUILD.mk b/third_party/sed/BUILD.mk
index cb708a5b8..37cf771b4 100644
--- a/third_party/sed/BUILD.mk
+++ b/third_party/sed/BUILD.mk
@@ -23,7 +23,6 @@ THIRD_PARTY_SED_A_DIRECTDEPS =				\
 	LIBC_STR					\
 	LIBC_LOG					\
 	THIRD_PARTY_GETOPT				\
-	THIRD_PARTY_MUSL				\
 	THIRD_PARTY_REGEX
 
 THIRD_PARTY_SED_A_DEPS :=				\
diff --git a/third_party/sed/defs.h b/third_party/sed/defs.h
index 84cba2337..54ac79922 100644
--- a/third_party/sed/defs.h
+++ b/third_party/sed/defs.h
@@ -3,7 +3,6 @@
 #include "libc/calls/typedef/u.h"
 #include "libc/limits.h"
 #include "third_party/regex/regex.h"
-#include "third_party/sed/shade.h"
 COSMOPOLITAN_C_START_
 
 /*
diff --git a/third_party/sed/extern.h b/third_party/sed/extern.h
index fbeb0497a..0c190c9bc 100644
--- a/third_party/sed/extern.h
+++ b/third_party/sed/extern.h
@@ -4,7 +4,6 @@
 #include "libc/stdio/stdio.h"
 #include "third_party/regex/regex.h"
 #include "third_party/sed/defs.h"
-#include "third_party/sed/shade.h"
 COSMOPOLITAN_C_START_
 
 extern struct s_command *prog;
diff --git a/third_party/sed/shade.h b/third_party/sed/shade.h
deleted file mode 100644
index ceea8d4b5..000000000
--- a/third_party/sed/shade.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_SED_SHADE_H_
-#define COSMOPOLITAN_THIRD_PARTY_SED_SHADE_H_
-
-#define prog _sed_prog
-#define appends_ _sed_appends_
-#define g_match _sed_g_match
-#define maxnsub _sed_maxnsub
-#define linenum _sed_linenum
-#define appendnum _sed_appendnum
-#define aflag _sed_aflag
-#define eflag _sed_eflag
-#define nflag _sed_nflag
-#define fname _sed_fname
-#define outfname _sed_outfname
-#define infile _sed_infile
-#define outfile _sed_outfile
-#define rflags _sed_rflags
-#define cfclose _sed_cfclose
-#define compile _sed_compile
-#define cspace _sed_cspace
-#define cu_fgets _sed_cu_fgets
-#define mf_fgets _sed_mf_fgets
-#define lastline _sed_lastline
-#define process _sed_process
-#define resetstate _sed_resetstate
-#define strregerror _sed_strregerror
-#define xmalloc _sed_xmalloc
-#define xrealloc _sed_xrealloc
-#define xcalloc _sed_xcalloc
-
-#endif /* COSMOPOLITAN_THIRD_PARTY_SED_SHADE_H_ */
diff --git a/third_party/smallz4/BUILD.mk b/third_party/smallz4/BUILD.mk
index c4e38d8e7..ecc92c5a3 100644
--- a/third_party/smallz4/BUILD.mk
+++ b/third_party/smallz4/BUILD.mk
@@ -36,9 +36,7 @@ THIRD_PARTY_SMALLZ4_A_DIRECTDEPS =				\
 	LIBC_CALLS						\
 	LIBC_STDIO						\
 	LIBC_STR						\
-	THIRD_PARTY_LIBCXX					\
-	THIRD_PARTY_LIBCXXABI					\
-	THIRD_PARTY_LIBUNWIND					\
+	THIRD_PARTY_LIBCXX
 
 THIRD_PARTY_SMALLZ4_A_DEPS :=					\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_SMALLZ4_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/sqlite3/BUILD.mk b/third_party/sqlite3/BUILD.mk
index 3ea4086fa..3410c4975 100644
--- a/third_party/sqlite3/BUILD.mk
+++ b/third_party/sqlite3/BUILD.mk
@@ -52,7 +52,6 @@ THIRD_PARTY_SQLITE3_A_DIRECTDEPS =					\
 	LIBC_RUNTIME							\
 	LIBC_STDIO							\
 	LIBC_STR							\
-	LIBC_SYSTEM							\
 	LIBC_SYSV							\
 	LIBC_SYSV_CALLS							\
 	LIBC_THREAD							\
@@ -63,7 +62,7 @@ THIRD_PARTY_SQLITE3_A_DIRECTDEPS =					\
 	THIRD_PARTY_MUSL						\
 	THIRD_PARTY_TZ							\
 	THIRD_PARTY_ZLIB						\
-	TOOL_ARGS							\
+	TOOL_ARGS
 
 THIRD_PARTY_SQLITE3_A_DEPS :=						\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_SQLITE3_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/sqlite3/shell.c b/third_party/sqlite3/shell.c
index 5c29318c1..e81818669 100644
--- a/third_party/sqlite3/shell.c
+++ b/third_party/sqlite3/shell.c
@@ -132,7 +132,7 @@ typedef unsigned short int u16;
 #include "libc/sysv/consts/s.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
-#include "libc/cosmo.h"
+#include "tool/args/args.h"
 #include "third_party/sqlite3/extensions.h"
 #include "third_party/sqlite3/sqlite3expert.h"
 #include "third_party/zlib/zlib.h"
diff --git a/third_party/stb/stb_image.c b/third_party/stb/stb_image.c
index 74eca2447..60b55072d 100644
--- a/third_party/stb/stb_image.c
+++ b/third_party/stb/stb_image.c
@@ -25,7 +25,7 @@
 #include "libc/limits.h"
 #include "libc/log/gdb.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/x86feature.h"
diff --git a/third_party/stb/stb_image_resize.c b/third_party/stb/stb_image_resize.c
index 43fcb1710..7fc71a33e 100644
--- a/third_party/stb/stb_image_resize.c
+++ b/third_party/stb/stb_image_resize.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/stb/stb_image_resize.h"
 #include "libc/assert.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
diff --git a/third_party/stb/stb_image_write.c b/third_party/stb/stb_image_write.c
index 573cb893b..9af55ae36 100644
--- a/third_party/stb/stb_image_write.c
+++ b/third_party/stb/stb_image_write.c
@@ -21,7 +21,7 @@
 #include "libc/assert.h"
 #include "libc/fmt/conv.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/nexgen32e.h"
diff --git a/third_party/stb/stb_truetype.c b/third_party/stb/stb_truetype.c
index ef59f4b6a..e1449c11b 100644
--- a/third_party/stb/stb_truetype.c
+++ b/third_party/stb/stb_truetype.c
@@ -29,7 +29,7 @@
 #include "libc/assert.h"
 #include "libc/serialize.h"
 #include "libc/intrin/likely.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/third_party/tr/BUILD.mk b/third_party/tr/BUILD.mk
index 313bfe150..54a17731b 100644
--- a/third_party/tr/BUILD.mk
+++ b/third_party/tr/BUILD.mk
@@ -22,8 +22,7 @@ THIRD_PARTY_TR_DIRECTDEPS =			\
 	LIBC_RUNTIME				\
 	LIBC_STDIO				\
 	LIBC_STR				\
-	THIRD_PARTY_GETOPT			\
-	THIRD_PARTY_MUSL			\
+	THIRD_PARTY_GETOPT
 
 THIRD_PARTY_TR_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_TR_DIRECTDEPS),$($(x))))
diff --git a/third_party/tr/extern.h b/third_party/tr/extern.h
index 14995931e..31b1ed884 100644
--- a/third_party/tr/extern.h
+++ b/third_party/tr/extern.h
@@ -3,8 +3,6 @@
 #include "libc/limits.h"
 COSMOPOLITAN_C_START_
 
-#define next _tr_next
-
 typedef struct {
   enum { STRING1, STRING2 } which;
   enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
diff --git a/third_party/tr/tr.c b/third_party/tr/tr.c
index d9c7f572a..fd1fe6e11 100644
--- a/third_party/tr/tr.c
+++ b/third_party/tr/tr.c
@@ -42,8 +42,8 @@
 #include "third_party/tr/cmd.h"
 #include "third_party/tr/extern.h"
 
-static int delete[NCHARS], squeeze[NCHARS];
-static int translate[NCHARS] = {
+int delete[NCHARS], squeeze[NCHARS];
+int translate[NCHARS] = {
 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,		/* ASCII */
 	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
@@ -78,8 +78,8 @@ static int translate[NCHARS] = {
 	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
 };
 
-static STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
-static STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
+STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
+STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
 
 static void setup(int *, char *, STR *, int);
 static void usage(void);
@@ -90,6 +90,9 @@ _tr(int argc, char *argv[])
 	int ch, cnt, lastch, *p;
 	int cflag, dflag, sflag;
 
+	if (pledge("stdio", NULL) == -1)
+		err(1, "pledge");
+
 	cflag = dflag = sflag = 0;
 	while ((ch = getopt(argc, argv, "Ccds")) != -1)
 		switch(ch) {
diff --git a/third_party/tree/tree.h b/third_party/tree/tree.h
index 473d487ee..f44295315 100644
--- a/third_party/tree/tree.h
+++ b/third_party/tree/tree.h
@@ -20,7 +20,7 @@
 #include "libc/calls/struct/stat.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 #ifdef __ANDROID
 #define mbstowcs(w,m,x) mbsrtowcs(w,(const char**)(& #m),x,NULL)
diff --git a/third_party/tz/asctime.c b/third_party/tz/asctime.c
new file mode 100644
index 000000000..eb257aa31
--- /dev/null
+++ b/third_party/tz/asctime.c
@@ -0,0 +1,135 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
+│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/str.h"
+#include "libc/stdio/stdio.h"
+#include "private.h"
+
+/* asctime and asctime_r a la POSIX and ISO C, except pad years before 1000.  */
+
+/*
+** This file is in the public domain, so clarified as of
+** 1996-06-05 by Arthur David Olson.
+*/
+
+/*
+** Avoid the temptation to punt entirely to strftime;
+** the output of strftime is supposed to be locale specific
+** whereas the output of asctime is supposed to be constant.
+*/
+
+/*LINTLIBRARY*/
+
+/*
+** All years associated with 32-bit time_t values are exactly four digits long;
+** some years associated with 64-bit time_t values are not.
+** Vintage programs are coded for years that are always four digits long
+** and may assume that the newline always lands in the same place.
+** For years that are less than four digits, we pad the output with
+** leading zeroes to get the newline in the traditional place.
+** The -4 ensures that we get four characters of output even if
+** we call a strftime variant that produces fewer characters for some years.
+** The ISO C and POSIX standards prohibit padding the year,
+** but many implementations pad anyway; most likely the standards are buggy.
+*/
+static char const ASCTIME_FMT[] = "%s %s%3d %.2d:%.2d:%.2d %-4s\n";
+/*
+** For years that are more than four digits we put extra spaces before the year
+** so that code trying to overwrite the newline won't end up overwriting
+** a digit within a year and truncating the year (operating on the assumption
+** that no output is better than wrong output).
+*/
+static char const ASCTIME_FMT_B[] = "%s %s%3d %.2d:%.2d:%.2d     %s\n";
+
+enum { STD_ASCTIME_BUF_SIZE = 26 };
+/*
+** Big enough for something such as
+** ??? ???-2147483648 -2147483648:-2147483648:-2147483648     -2147483648\n
+** (two three-character abbreviations, five strings denoting integers,
+** seven explicit spaces, two explicit colons, a newline,
+** and a trailing NUL byte).
+** The values above are for systems where an int is 32 bits and are provided
+** as an example; the size expression below is a bound for the system at
+** hand.
+*/
+static char buf_asctime[2*3 + 5*INT_STRLEN_MAXIMUM(int) + 7 + 2 + 1 + 1];
+
+/* A similar buffer for ctime.
+   C89 requires that they be the same buffer.
+   This requirement was removed in C99, so support it only if requested,
+   as support is more likely to lead to bugs in badly written programs.  */
+#if SUPPORT_C89
+# define buf_ctime buf_asctime
+#else
+static char buf_ctime[sizeof buf_asctime];
+#endif
+
+char *
+asctime_r(struct tm const *restrict timeptr, char *restrict buf)
+{
+	static const char	wday_name[][4] = {
+		"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
+	};
+	static const char	mon_name[][4] = {
+		"Jan", "Feb", "Mar", "Apr", "May", "Jun",
+		"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+	};
+	register const char *	wn;
+	register const char *	mn;
+	char			year[INT_STRLEN_MAXIMUM(int) + 2];
+	char result[sizeof buf_asctime];
+
+	if (timeptr == NULL) {
+		errno = EINVAL;
+		return strcpy(buf, "??? ??? ?? ??:??:?? ????\n");
+	}
+	if (timeptr->tm_wday < 0 || timeptr->tm_wday >= DAYSPERWEEK)
+		wn = "???";
+	else	wn = wday_name[timeptr->tm_wday];
+	if (timeptr->tm_mon < 0 || timeptr->tm_mon >= MONSPERYEAR)
+		mn = "???";
+	else	mn = mon_name[timeptr->tm_mon];
+	/*
+	** Use strftime's %Y to generate the year, to avoid overflow problems
+	** when computing timeptr->tm_year + TM_YEAR_BASE.
+	** Assume that strftime is unaffected by other out-of-range members
+	** (e.g., timeptr->tm_mday) when processing "%Y".
+	*/
+	strftime(year, sizeof year, "%Y", timeptr);
+	/*
+	** We avoid using snprintf since it's not available on all systems.
+	*/
+	sprintf(result,
+		((strlen(year) <= 4) ? ASCTIME_FMT : ASCTIME_FMT_B),
+		wn, mn,
+		timeptr->tm_mday, timeptr->tm_hour,
+		timeptr->tm_min, timeptr->tm_sec,
+		year);
+	if (strlen(result) < STD_ASCTIME_BUF_SIZE
+	    || buf == buf_ctime || buf == buf_asctime)
+		return strcpy(buf, result);
+	else {
+		errno = EOVERFLOW;
+		return NULL;
+	}
+}
+
+char *
+asctime(register const struct tm *timeptr)
+{
+	return asctime_r(timeptr, buf_asctime);
+}
+
+char *
+ctime_r(const time_t *timep, char *buf)
+{
+	struct tm mytm;
+	struct tm *tmp = localtime_r(timep, &mytm);
+	return tmp ? asctime_r(tmp, buf) : NULL;
+}
+
+char *
+ctime(const time_t *timep)
+{
+	return ctime_r(timep, buf_ctime);
+}
diff --git a/third_party/musl/ctime.c b/third_party/tz/ctime.c
similarity index 100%
rename from third_party/musl/ctime.c
rename to third_party/tz/ctime.c
diff --git a/third_party/musl/ctime_r.c b/third_party/tz/ctime_r.c
similarity index 100%
rename from third_party/musl/ctime_r.c
rename to third_party/tz/ctime_r.c
diff --git a/third_party/tz/difftime.c b/third_party/tz/difftime.c
index 33622f2ff..929dd5b14 100644
--- a/third_party/tz/difftime.c
+++ b/third_party/tz/difftime.c
@@ -2,7 +2,7 @@
 │ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/weirdtypes.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/time.h"
 
 /* Return the difference between two timestamps.  */
diff --git a/third_party/tz/localtime.c b/third_party/tz/localtime.c
index 34f7cf648..06139e49f 100644
--- a/third_party/tz/localtime.c
+++ b/third_party/tz/localtime.c
@@ -2,10 +2,6 @@
 │ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #define LOCALTIME_IMPLEMENTATION
-#include "lock.h"
-#include "tzdir.h"
-#include "tzfile.h"
-#include "private.h"
 #include "libc/calls/blockcancel.internal.h"
 #include "libc/calls/calls.h"
 #include "libc/cxxabi.h"
@@ -14,15 +10,20 @@
 #include "libc/serialize.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/o.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/tls.h"
 #include "libc/time.h"
 #include "libc/inttypes.h"
 #include "libc/sysv/consts/ok.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdckdint.h"
 #include "libc/time.h"
+#include "tzdir.h"
+#include "tzfile.h"
 #include "libc/nt/struct/timezoneinformation.h"
 #include "libc/nt/time.h"
 #include "libc/dce.h"
+#include "private.h"
 
 /* Convert timestamp from time_t to struct tm.  */
 
@@ -623,10 +624,34 @@ localtime_windows_init(void)
 	setenv("TZ", buf, true);
 }
 
+static pthread_mutex_t locallock = PTHREAD_MUTEX_INITIALIZER;
+
+static dontinline void
+localtime_wipe(void)
+{
+	pthread_mutex_init(&locallock, 0);
+}
+
+static dontinline void
+localtime_lock(void)
+{
+	pthread_mutex_lock(&locallock);
+}
+
+static dontinline void
+localtime_unlock(void)
+{
+	pthread_mutex_unlock(&locallock);
+}
+
 __attribute__((__constructor__(80)))
 textstartup static void
 localtime_init(void)
 {
+	localtime_wipe();
+	pthread_atfork(localtime_lock,
+		       localtime_unlock,
+		       localtime_wipe);
 	if (IsWindows())
 		localtime_windows_init();
 }
@@ -2027,9 +2052,9 @@ localtime_tzset_unlocked(void)
 void
 tzset(void)
 {
-	__localtime_lock();
+	localtime_lock();
 	localtime_tzset_unlocked();
-	__localtime_unlock();
+	localtime_unlock();
 }
 
 static void
@@ -2042,7 +2067,7 @@ static void
 localtime_gmtcheck(void)
 {
 	static bool gmt_is_set;
-	__localtime_lock();
+	localtime_lock();
 	if (! gmt_is_set) {
 #ifdef ALL_STATE
 		gmtptr = malloc(sizeof *gmtptr);
@@ -2052,7 +2077,7 @@ localtime_gmtcheck(void)
 			localtime_gmtload(gmtptr);
 		gmt_is_set = true;
 	}
-	__localtime_unlock();
+	localtime_unlock();
 }
 
 /*
@@ -2168,11 +2193,11 @@ localsub(struct state const *sp, time_t const *timep, int_fast32_t setname,
 static struct tm *
 localtime_tzset(time_t const *timep, struct tm *tmp, bool setname)
 {
-	__localtime_lock();
+	localtime_lock();
 	if (setname || !lcl_is_set)
 		localtime_tzset_unlocked();
 	tmp = localsub(lclptr, timep, setname, tmp);
-	__localtime_unlock();
+	localtime_unlock();
 	return tmp;
 }
 
@@ -2809,10 +2834,10 @@ time_t
 mktime(struct tm *tmp)
 {
 	time_t t;
-	__localtime_lock();
+	localtime_lock();
 	localtime_tzset_unlocked();
 	t = mktime_tzname(lclptr, tmp, true);
-	__localtime_unlock();
+	localtime_unlock();
 	return t;
 }
 
diff --git a/third_party/tz/lock.h b/third_party/tz/lock.h
deleted file mode 100644
index 501505478..000000000
--- a/third_party/tz/lock.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_TZ_LOCK_H_
-#define COSMOPOLITAN_THIRD_PARTY_TZ_LOCK_H_
-#include "libc/thread/thread.h"
-COSMOPOLITAN_C_START_
-
-void __localtime_lock(void);
-void __localtime_unlock(void);
-void __localtime_wipe(void);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_TZ_LOCK_H_ */
diff --git a/third_party/tz/strftime.c b/third_party/tz/strftime.c
new file mode 100644
index 000000000..372b98071
--- /dev/null
+++ b/third_party/tz/strftime.c
@@ -0,0 +1,646 @@
+/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8   -*-│
+│ vi: set noet ft=c ts=8 sw=8 fenc=utf-8                                   :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright (c) 1989 The Regents of the University of California.              │
+│ All rights reserved.                                                         │
+│                                                                              │
+│ Redistribution and use in source and binary forms are permitted              │
+│ provided that the above copyright notice and this paragraph are              │
+│ duplicated in all such forms and that any documentation,                     │
+│ advertising materials, and other materials related to such                   │
+│ distribution and use acknowledge that the software was developed             │
+│ by the University of California, Berkeley.  The name of the                  │
+│ University may not be used to endorse or promote products derived            │
+│ from this software without specific prior written permission.                │
+│ THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR               │
+│ IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED               │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.          │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/str/locale.h"
+#include "libc/time.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
+#include "libc/inttypes.h"
+#include "private.h"
+
+__notice(strftime_notice, "strftime (BSD-3)\n\
+Copyright 1989 The Regents of the University of California");
+
+#ifndef DEPRECATE_TWO_DIGIT_YEARS
+# define DEPRECATE_TWO_DIGIT_YEARS false
+#endif
+
+struct lc_time_T {
+	const char *	mon[MONSPERYEAR];
+	const char *	month[MONSPERYEAR];
+	const char *	wday[DAYSPERWEEK];
+	const char *	weekday[DAYSPERWEEK];
+	const char *	X_fmt;
+	const char *	x_fmt;
+	const char *	c_fmt;
+	const char *	am;
+	const char *	pm;
+	const char *	date_fmt;
+};
+
+static const struct lc_time_T	C_time_locale = {
+	{
+		"Jan", "Feb", "Mar", "Apr", "May", "Jun",
+		"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+	}, {
+		"January", "February", "March", "April", "May", "June",
+		"July", "August", "September", "October", "November", "December"
+	}, {
+		"Sun", "Mon", "Tue", "Wed",
+		"Thu", "Fri", "Sat"
+	}, {
+		"Sunday", "Monday", "Tuesday", "Wednesday",
+		"Thursday", "Friday", "Saturday"
+	},
+
+	/* X_fmt */
+	"%H:%M:%S",
+
+	/*
+	** x_fmt
+	** C99 and later require this format.
+	** Using just numbers (as here) makes Quakers happier;
+	** it's also compatible with SVR4.
+	*/
+	"%m/%d/%y",
+
+	/*
+	** c_fmt
+	** C99 and later require this format.
+	** Previously this code used "%D %X", but we now conform to C99.
+	** Note that
+	**	"%a %b %d %H:%M:%S %Y"
+	** is used by Solaris 2.3.
+	*/
+	"%a %b %e %T %Y",
+
+	/* am */
+	"AM",
+
+	/* pm */
+	"PM",
+
+	/* date_fmt */
+	"%a %b %e %H:%M:%S %Z %Y"
+};
+
+enum warn { IN_NONE, IN_SOME, IN_THIS, IN_ALL };
+
+#ifndef YEAR_2000_NAME
+# define YEAR_2000_NAME "CHECK_STRFTIME_FORMATS_FOR_TWO_DIGIT_YEARS"
+#endif /* !defined YEAR_2000_NAME */
+
+static char *
+_add(const char *str, char *pt, const char *ptlim)
+{
+	while (pt < ptlim && (*pt = *str++) != '\0')
+		++pt;
+	return pt;
+}
+
+static char *
+_conv(int n, const char *format, char *pt, const char *ptlim)
+{
+	char	buf[INT_STRLEN_MAXIMUM(int) + 1];
+
+	sprintf(buf, format, n);
+	return _add(buf, pt, ptlim);
+}
+
+/*
+** POSIX and the C Standard are unclear or inconsistent about
+** what %C and %y do if the year is negative or exceeds 9999.
+** Use the convention that %C concatenated with %y yields the
+** same output as %Y, and that %Y contains at least 4 bytes,
+** with more only if necessary.
+*/
+
+static char *
+_yconv(int a, int b, bool convert_top, bool convert_yy,
+       char *pt, const char *ptlim)
+{
+	register int	lead;
+	register int	trail;
+
+	int DIVISOR = 100;
+	trail = a % DIVISOR + b % DIVISOR;
+	lead = a / DIVISOR + b / DIVISOR + trail / DIVISOR;
+	trail %= DIVISOR;
+	if (trail < 0 && lead > 0) {
+		trail += DIVISOR;
+		--lead;
+	} else if (lead < 0 && trail > 0) {
+		trail -= DIVISOR;
+		++lead;
+	}
+	if (convert_top) {
+		if (lead == 0 && trail < 0)
+			pt = _add("-0", pt, ptlim);
+		else	pt = _conv(lead, "%02d", pt, ptlim);
+	}
+	if (convert_yy)
+		pt = _conv(((trail < 0) ? -trail : trail), "%02d", pt, ptlim);
+	return pt;
+}
+
+static char *
+_fmt(const char *format, const struct tm *t, char *pt,
+     const char *ptlim, enum warn *warnp)
+{
+	struct lc_time_T const *Locale = &C_time_locale;
+
+	for ( ; *format; ++format) {
+		if (*format == '%') {
+label:
+			switch (*++format) {
+			case '\0':
+				--format;
+				break;
+			case 'A':
+				pt = _add((t->tm_wday < 0 ||
+					t->tm_wday >= DAYSPERWEEK) ?
+					"?" : Locale->weekday[t->tm_wday],
+					pt, ptlim);
+				continue;
+			case 'a':
+				pt = _add((t->tm_wday < 0 ||
+					t->tm_wday >= DAYSPERWEEK) ?
+					"?" : Locale->wday[t->tm_wday],
+					pt, ptlim);
+				continue;
+			case 'B':
+				pt = _add((t->tm_mon < 0 ||
+					t->tm_mon >= MONSPERYEAR) ?
+					"?" : Locale->month[t->tm_mon],
+					pt, ptlim);
+				continue;
+			case 'b':
+			case 'h':
+				pt = _add((t->tm_mon < 0 ||
+					t->tm_mon >= MONSPERYEAR) ?
+					"?" : Locale->mon[t->tm_mon],
+					pt, ptlim);
+				continue;
+			case 'C':
+				/*
+				** %C used to do a...
+				**	_fmt("%a %b %e %X %Y", t);
+				** ...whereas now POSIX 1003.2 calls for
+				** something completely different.
+				** (ado, 1993-05-24)
+				*/
+				pt = _yconv(t->tm_year, TM_YEAR_BASE,
+					    true, false, pt, ptlim);
+				continue;
+			case 'c':
+				{
+				enum warn warn2 = IN_SOME;
+
+				pt = _fmt(Locale->c_fmt, t, pt, ptlim, &warn2);
+				if (warn2 == IN_ALL)
+					warn2 = IN_THIS;
+				if (warn2 > *warnp)
+					*warnp = warn2;
+				}
+				continue;
+			case 'D':
+				pt = _fmt("%m/%d/%y", t, pt, ptlim, warnp);
+				continue;
+			case 'd':
+				pt = _conv(t->tm_mday, "%02d", pt, ptlim);
+				continue;
+			case 'E':
+			case 'O':
+				/*
+				** Locale modifiers of C99 and later.
+				** The sequences
+				**	%Ec %EC %Ex %EX %Ey %EY
+				**	%Od %oe %OH %OI %Om %OM
+				**	%OS %Ou %OU %OV %Ow %OW %Oy
+				** are supposed to provide alternative
+				** representations.
+				*/
+				goto label;
+			case 'e':
+				pt = _conv(t->tm_mday, "%2d", pt, ptlim);
+				continue;
+			case 'F':
+				pt = _fmt("%Y-%m-%d", t, pt, ptlim, warnp);
+				continue;
+			case 'H':
+				pt = _conv(t->tm_hour, "%02d", pt, ptlim);
+				continue;
+			case 'I':
+				pt = _conv((t->tm_hour % 12) ?
+					(t->tm_hour % 12) : 12,
+					"%02d", pt, ptlim);
+				continue;
+			case 'j':
+				pt = _conv(t->tm_yday + 1, "%03d", pt, ptlim);
+				continue;
+			case 'k':
+				/*
+				** This used to be...
+				**	_conv(t->tm_hour % 12 ?
+				**		t->tm_hour % 12 : 12, 2, ' ');
+				** ...and has been changed to the below to
+				** match SunOS 4.1.1 and Arnold Robbins'
+				** strftime version 3.0. That is, "%k" and
+				** "%l" have been swapped.
+				** (ado, 1993-05-24)
+				*/
+				pt = _conv(t->tm_hour, "%2d", pt, ptlim);
+				continue;
+#ifdef KITCHEN_SINK
+			case 'K':
+				/*
+				** After all this time, still unclaimed!
+				*/
+				pt = _add("kitchen sink", pt, ptlim);
+				continue;
+#endif /* defined KITCHEN_SINK */
+			case 'l':
+				/*
+				** This used to be...
+				**	_conv(t->tm_hour, 2, ' ');
+				** ...and has been changed to the below to
+				** match SunOS 4.1.1 and Arnold Robbin's
+				** strftime version 3.0. That is, "%k" and
+				** "%l" have been swapped.
+				** (ado, 1993-05-24)
+				*/
+				pt = _conv((t->tm_hour % 12) ?
+					(t->tm_hour % 12) : 12,
+					"%2d", pt, ptlim);
+				continue;
+			case 'M':
+				pt = _conv(t->tm_min, "%02d", pt, ptlim);
+				continue;
+			case 'm':
+				pt = _conv(t->tm_mon + 1, "%02d", pt, ptlim);
+				continue;
+			case 'n':
+				pt = _add("\n", pt, ptlim);
+				continue;
+			case 'p':
+				pt = _add((t->tm_hour >= (HOURSPERDAY / 2)) ?
+					Locale->pm :
+					Locale->am,
+					pt, ptlim);
+				continue;
+			case 'R':
+				pt = _fmt("%H:%M", t, pt, ptlim, warnp);
+				continue;
+			case 'r':
+				pt = _fmt("%I:%M:%S %p", t, pt, ptlim, warnp);
+				continue;
+			case 'S':
+				pt = _conv(t->tm_sec, "%02d", pt, ptlim);
+				continue;
+			case 's':
+				{
+					struct tm	tm;
+					char		buf[INT_STRLEN_MAXIMUM(
+								time_t) + 1];
+					time_t		mkt;
+
+					tm.tm_sec = t->tm_sec;
+					tm.tm_min = t->tm_min;
+					tm.tm_hour = t->tm_hour;
+					tm.tm_mday = t->tm_mday;
+					tm.tm_mon = t->tm_mon;
+					tm.tm_year = t->tm_year;
+#ifdef TM_GMTOFF
+					mkt = timeoff(&tm, t->TM_GMTOFF);
+#else
+					tm.tm_isdst = t->tm_isdst;
+					mkt = mktime(&tm);
+#endif
+					/* If mktime fails, %s expands to the
+					   value of (time_t) -1 as a failure
+					   marker; this is better in practice
+					   than strftime failing.  */
+					if (TYPE_SIGNED(time_t)) {
+					  intmax_t n = mkt;
+					  sprintf(buf, "%"PRIdMAX, n);
+					} else {
+					  uintmax_t n = mkt;
+					  sprintf(buf, "%"PRIuMAX, n);
+					}
+					pt = _add(buf, pt, ptlim);
+				}
+				continue;
+			case 'T':
+				pt = _fmt("%H:%M:%S", t, pt, ptlim, warnp);
+				continue;
+			case 't':
+				pt = _add("\t", pt, ptlim);
+				continue;
+			case 'U':
+				pt = _conv((t->tm_yday + DAYSPERWEEK -
+					t->tm_wday) / DAYSPERWEEK,
+					"%02d", pt, ptlim);
+				continue;
+			case 'u':
+				/*
+				** From Arnold Robbins' strftime version 3.0:
+				** "ISO 8601: Weekday as a decimal number
+				** [1 (Monday) - 7]"
+				** (ado, 1993-05-24)
+				*/
+				pt = _conv((t->tm_wday == 0) ?
+					DAYSPERWEEK : t->tm_wday,
+					"%d", pt, ptlim);
+				continue;
+			case 'V':	/* ISO 8601 week number */
+			case 'G':	/* ISO 8601 year (four digits) */
+			case 'g':	/* ISO 8601 year (two digits) */
+/*
+** From Arnold Robbins' strftime version 3.0: "the week number of the
+** year (the first Monday as the first day of week 1) as a decimal number
+** (01-53)."
+** (ado, 1993-05-24)
+**
+** From <https://www.cl.cam.ac.uk/~mgk25/iso-time.html> by Markus Kuhn:
+** "Week 01 of a year is per definition the first week which has the
+** Thursday in this year, which is equivalent to the week which contains
+** the fourth day of January. In other words, the first week of a new year
+** is the week which has the majority of its days in the new year. Week 01
+** might also contain days from the previous year and the week before week
+** 01 of a year is the last week (52 or 53) of the previous year even if
+** it contains days from the new year. A week starts with Monday (day 1)
+** and ends with Sunday (day 7). For example, the first week of the year
+** 1997 lasts from 1996-12-30 to 1997-01-05..."
+** (ado, 1996-01-02)
+*/
+				{
+					int	year;
+					int	base;
+					int	yday;
+					int	wday;
+					int	w;
+
+					year = t->tm_year;
+					base = TM_YEAR_BASE;
+					yday = t->tm_yday;
+					wday = t->tm_wday;
+					for ( ; ; ) {
+						int	len;
+						int	bot;
+						int	top;
+
+						len = isleap_sum(year, base) ?
+							DAYSPERLYEAR :
+							DAYSPERNYEAR;
+						/*
+						** What yday (-3 ... 3) does
+						** the ISO year begin on?
+						*/
+						bot = ((yday + 11 - wday) %
+							DAYSPERWEEK) - 3;
+						/*
+						** What yday does the NEXT
+						** ISO year begin on?
+						*/
+						top = bot -
+							(len % DAYSPERWEEK);
+						if (top < -3)
+							top += DAYSPERWEEK;
+						top += len;
+						if (yday >= top) {
+							++base;
+							w = 1;
+							break;
+						}
+						if (yday >= bot) {
+							w = 1 + ((yday - bot) /
+								DAYSPERWEEK);
+							break;
+						}
+						--base;
+						yday += isleap_sum(year, base) ?
+							DAYSPERLYEAR :
+							DAYSPERNYEAR;
+					}
+#ifdef XPG4_1994_04_09
+					if ((w == 52 &&
+						t->tm_mon == TM_JANUARY) ||
+						(w == 1 &&
+						t->tm_mon == TM_DECEMBER))
+							w = 53;
+#endif /* defined XPG4_1994_04_09 */
+					if (*format == 'V')
+						pt = _conv(w, "%02d",
+							pt, ptlim);
+					else if (*format == 'g') {
+						*warnp = IN_ALL;
+						pt = _yconv(year, base,
+							false, true,
+							pt, ptlim);
+					} else	pt = _yconv(year, base,
+							true, true,
+							pt, ptlim);
+				}
+				continue;
+			case 'v':
+				/*
+				** From Arnold Robbins' strftime version 3.0:
+				** "date as dd-bbb-YYYY"
+				** (ado, 1993-05-24)
+				*/
+				pt = _fmt("%e-%b-%Y", t, pt, ptlim, warnp);
+				continue;
+			case 'W':
+				pt = _conv((t->tm_yday + DAYSPERWEEK -
+					(t->tm_wday ?
+					(t->tm_wday - 1) :
+					(DAYSPERWEEK - 1))) / DAYSPERWEEK,
+					"%02d", pt, ptlim);
+				continue;
+			case 'w':
+				pt = _conv(t->tm_wday, "%d", pt, ptlim);
+				continue;
+			case 'X':
+				pt = _fmt(Locale->X_fmt, t, pt, ptlim, warnp);
+				continue;
+			case 'x':
+				{
+				enum warn warn2 = IN_SOME;
+
+				pt = _fmt(Locale->x_fmt, t, pt, ptlim, &warn2);
+				if (warn2 == IN_ALL)
+					warn2 = IN_THIS;
+				if (warn2 > *warnp)
+					*warnp = warn2;
+				}
+				continue;
+			case 'y':
+				*warnp = IN_ALL;
+				pt = _yconv(t->tm_year, TM_YEAR_BASE,
+					false, true,
+					pt, ptlim);
+				continue;
+			case 'Y':
+				pt = _yconv(t->tm_year, TM_YEAR_BASE,
+					true, true,
+					pt, ptlim);
+				continue;
+			case 'Z':
+#ifdef TM_ZONE
+				pt = _add(t->TM_ZONE, pt, ptlim);
+#elif HAVE_TZNAME
+				if (t->tm_isdst >= 0)
+					pt = _add(tzname[t->tm_isdst != 0],
+						pt, ptlim);
+#endif
+				/*
+				** C99 and later say that %Z must be
+				** replaced by the empty string if the
+				** time zone abbreviation is not
+				** determinable.
+				*/
+				continue;
+			case 'z':
+#if defined TM_GMTOFF || USG_COMPAT || ALTZONE
+				{
+				long		diff;
+				char const *	sign;
+				bool negative;
+
+# ifdef TM_GMTOFF
+				diff = t->TM_GMTOFF;
+# else
+				/*
+				** C99 and later say that the UT offset must
+				** be computed by looking only at
+				** tm_isdst. This requirement is
+				** incorrect, since it means the code
+				** must rely on magic (in this case
+				** altzone and timezone), and the
+				** magic might not have the correct
+				** offset. Doing things correctly is
+				** tricky and requires disobeying the standard;
+				** see GNU C strftime for details.
+				** For now, punt and conform to the
+				** standard, even though it's incorrect.
+				**
+				** C99 and later say that %z must be replaced by
+				** the empty string if the time zone is not
+				** determinable, so output nothing if the
+				** appropriate variables are not available.
+				*/
+				if (t->tm_isdst < 0)
+					continue;
+				if (t->tm_isdst == 0)
+#  if USG_COMPAT
+					diff = -timezone;
+#  else
+					continue;
+#  endif
+				else
+#  if ALTZONE
+					diff = -altzone;
+#  else
+					continue;
+#  endif
+# endif
+				negative = diff < 0;
+				if (diff == 0) {
+# ifdef TM_ZONE
+				  negative = t->TM_ZONE[0] == '-';
+# else
+				  negative = t->tm_isdst < 0;
+#  if HAVE_TZNAME
+				  if (tzname[t->tm_isdst != 0][0] == '-')
+				    negative = true;
+#  endif
+# endif
+				}
+				if (negative) {
+					sign = "-";
+					diff = -diff;
+				} else	sign = "+";
+				pt = _add(sign, pt, ptlim);
+				diff /= SECSPERMIN;
+				diff = (diff / MINSPERHOUR) * 100 +
+					(diff % MINSPERHOUR);
+				pt = _conv(diff, "%04d", pt, ptlim);
+				}
+#endif
+				continue;
+			case '+':
+				pt = _fmt(Locale->date_fmt, t, pt, ptlim,
+					warnp);
+				continue;
+			case '%':
+			/*
+			** X311J/88-090 (4.12.3.5): if conversion char is
+			** undefined, behavior is undefined. Print out the
+			** character itself as printf(3) also does.
+			*/
+			default:
+				break;
+			}
+		}
+		if (pt == ptlim)
+			break;
+		*pt++ = *format;
+	}
+	return pt;
+}
+
+/**
+ * Converts time to string, e.g.
+ *
+ *     char b[64];
+ *     int64_t sec;
+ *     struct tm tm;
+ *     time(&sec);
+ *     localtime_r(&sec, &tm);
+ *     strftime(b, sizeof(b), "%Y-%m-%dT%H:%M:%S%z", &tm);       // ISO8601
+ *     strftime(b, sizeof(b), "%a, %d %b %Y %H:%M:%S %Z", &tm);  // RFC1123
+ *
+ * @return bytes copied excluding nul, or 0 on error
+ * @see FormatHttpDateTime()
+ */
+size_t
+strftime(char *restrict s, size_t maxsize, char const *restrict format,
+	 struct tm const *restrict t)
+{
+	char *	p;
+	int saved_errno = errno;
+	enum warn warn = IN_NONE;
+
+	tzset();
+	p = _fmt(format, t, s, s + maxsize, &warn);
+	if (!p) {
+	  errno = EOVERFLOW;
+	  return 0;
+	}
+	if (DEPRECATE_TWO_DIGIT_YEARS
+	    && warn != IN_NONE && getenv(YEAR_2000_NAME)) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "strftime format \"%s\" ", format);
+		fprintf(stderr, "yields only two digits of years in ");
+		if (warn == IN_SOME)
+			fprintf(stderr, "some locales");
+		else if (warn == IN_THIS)
+			fprintf(stderr, "the current locale");
+		else	fprintf(stderr, "all locales");
+		fprintf(stderr, "\n");
+	}
+	if (p == s + maxsize) {
+		errno = ERANGE;
+		return 0;
+	}
+	*p = '\0';
+	errno = saved_errno;
+	return p - s;
+}
+
+__weak_reference(strftime, strftime_l);
diff --git a/third_party/unzip/BUILD.mk b/third_party/unzip/BUILD.mk
index a55ca2ff6..b1db75282 100644
--- a/third_party/unzip/BUILD.mk
+++ b/third_party/unzip/BUILD.mk
@@ -24,8 +24,7 @@ THIRD_PARTY_UNZIP_A_DIRECTDEPS =						\
 	LIBC_STR								\
 	LIBC_SYSV								\
 	THIRD_PARTY_BZIP2							\
-	THIRD_PARTY_MUSL							\
-	THIRD_PARTY_TZ								\
+	THIRD_PARTY_TZ
 
 THIRD_PARTY_UNZIP_A_DEPS :=							\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_UNZIP_A_DIRECTDEPS),$($(x))))
diff --git a/third_party/xed/x86ild.greg.c b/third_party/xed/x86ild.greg.c
index cf7749fda..fe70349e2 100644
--- a/third_party/xed/x86ild.greg.c
+++ b/third_party/xed/x86ild.greg.c
@@ -21,7 +21,7 @@
 #include "libc/serialize.h"
 #include "libc/intrin/bsr.h"
 #include "libc/log/libfatal.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/str/str.h"
 #include "third_party/xed/avx512.h"
diff --git a/third_party/xxhash/xxhash.h b/third_party/xxhash/xxhash.h
index a5e4fd471..b69ea3bad 100644
--- a/third_party/xxhash/xxhash.h
+++ b/third_party/xxhash/xxhash.h
@@ -1203,7 +1203,7 @@ struct XXH64_state_s {
 #ifndef XXH_NO_XXH3
 
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #  define XXH_ALIGN(n)      alignas(n)
 #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
 /* In C++ alignas() is a keyword */
diff --git a/third_party/zip/BUILD.mk b/third_party/zip/BUILD.mk
index 1a4ab4c20..d0a163fc3 100644
--- a/third_party/zip/BUILD.mk
+++ b/third_party/zip/BUILD.mk
@@ -85,17 +85,15 @@ THIRD_PARTY_ZIP_DIRECTDEPS =			\
 	LIBC_LOG				\
 	LIBC_MEM				\
 	LIBC_NEXGEN32E				\
-	LIBC_PROC				\
 	LIBC_RUNTIME				\
 	LIBC_STDIO				\
+	LIBC_PROC				\
 	LIBC_STR				\
-	LIBC_SYSTEM				\
 	LIBC_SYSV				\
 	LIBC_X					\
 	THIRD_PARTY_BZIP2			\
-	THIRD_PARTY_MUSL			\
 	THIRD_PARTY_TZ				\
-	THIRD_PARTY_ZLIB			\
+	THIRD_PARTY_ZLIB
 
 THIRD_PARTY_ZIP_DEPS :=				\
 	$(call uniq,$(foreach x,$(THIRD_PARTY_ZIP_DIRECTDEPS),$($(x))))
diff --git a/third_party/zip/README.cosmo b/third_party/zip/README.cosmo
index 309554994..69fbf8c30 100644
--- a/third_party/zip/README.cosmo
+++ b/third_party/zip/README.cosmo
@@ -11,5 +11,4 @@ ORIGIN
 LOCAL CHANGES
 
   - Use Cosmopolitan's PCLMUL optimized CRC32
-  - Don't magically append .zip extension to filename argument
   - Improve find_next_signature() performance using unlocked stdio
diff --git a/third_party/zip/zipfile.c b/third_party/zip/zipfile.c
index 787e73ea4..b03470d79 100644
--- a/third_party/zip/zipfile.c
+++ b/third_party/zip/zipfile.c
@@ -413,10 +413,6 @@ char *ziptyp(s)
   if ((t = malloc(strlen(s) + 5)) == NULL)
     return NULL;
   strcpy(t, s);
-
-  // [jart] don't magically append .zip extension to filename argument
-  if (1) return t;
-
 #  ifdef __human68k__
   _toslash(t);
 #  endif
diff --git a/third_party/zlib/crc_folding.c b/third_party/zlib/crc_folding.c
index cc2fa875a..2702f909a 100644
--- a/third_party/zlib/crc_folding.c
+++ b/third_party/zlib/crc_folding.c
@@ -406,7 +406,7 @@ partial:
     }
 #endif
 
-    memcpy(dst, src, len);  /* TODO: Possibly generate more efficient code. */
+    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
     partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
         &xmm_crc_part);
 done:
diff --git a/third_party/zlib/deflate.c b/third_party/zlib/deflate.c
index 64ebcddc4..58f9474e7 100644
--- a/third_party/zlib/deflate.c
+++ b/third_party/zlib/deflate.c
@@ -229,8 +229,6 @@ int ZEXPORT deflateInit(strm, level)
     /* To do: ignore strm->next_in if we use it as window */
 }
 
-#define WINDOW_PADDING 8
-
 /* ========================================================================= */
 int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
     z_streamp strm;
@@ -240,6 +238,7 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
     int  memLevel;
     int  strategy;
 {
+    unsigned window_padding = 8;
     deflate_state *s;
     int wrap = 1;
 
@@ -326,12 +325,12 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
     s->hash_shift =  ((s->hash_bits + MIN_MATCH-1) / MIN_MATCH);
 
     s->window = (Bytef *) ZALLOC(strm,
-                                 s->w_size + WINDOW_PADDING,
+                                 s->w_size + window_padding,
                                  2*sizeof(Byte));
     /* Avoid use of unitialized values in the window, see crbug.com/1137613 and
      * crbug.com/1144420 */
     if (s->window) { /* [jart] fix regression in malloc failure checking */
-        zmemzero(s->window, (s->w_size + WINDOW_PADDING) * (2 * sizeof(Byte)));
+        zmemzero(s->window, (s->w_size + window_padding) * (2 * sizeof(Byte)));
     }
     s->prev   = (Posf *)  ZALLOC(strm, s->w_size, sizeof(Pos));
     /* Avoid use of uninitialized value, see:
@@ -771,12 +770,6 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
         wraplen = 6;
     }
 
-    /* With Chromium's hashing, s->hash_bits may not correspond to the
-       memLevel, making the computations below incorrect. Return the
-       conservative bound. */
-    if (s->chromium_zlib_hash)
-        return (fixedlen > storelen ? fixedlen : storelen) + wraplen;
-
     /* if not default parameters, return one of the conservative bounds */
     if (s->w_bits != 15 || s->hash_bits != 8 + 7)
         return (s->w_bits <= s->hash_bits ? fixedlen : storelen) + wraplen;
@@ -1206,9 +1199,7 @@ int ZEXPORT deflateCopy(dest, source)
     zmemcpy((voidpf)ds, (voidpf)ss, sizeof(deflate_state));
     ds->strm = dest;
 
-    ds->window = (Bytef *) ZALLOC(dest,
-                                  ds->w_size + WINDOW_PADDING,
-                                  2*sizeof(Byte));
+    ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte));
     ds->prev   = (Posf *)  ZALLOC(dest, ds->w_size, sizeof(Pos));
     ds->head   = (Posf *)  ZALLOC(dest, ds->hash_size, sizeof(Pos));
     ds->pending_buf = (uchf *) ZALLOC(dest, ds->lit_bufsize, 4);
@@ -1219,8 +1210,7 @@ int ZEXPORT deflateCopy(dest, source)
         return Z_MEM_ERROR;
     }
     /* following zmemcpy do not work for 16-bit MSDOS */
-    zmemcpy(ds->window, ss->window,
-            (ds->w_size + WINDOW_PADDING) * 2 * sizeof(Byte));
+    zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte));
     zmemcpy((voidpf)ds->prev, (voidpf)ss->prev, ds->w_size * sizeof(Pos));
     zmemcpy((voidpf)ds->head, (voidpf)ss->head, ds->hash_size * sizeof(Pos));
     zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
diff --git a/third_party/zlib/inflate.c b/third_party/zlib/inflate.c
index acc88d1dc..75fa6b56e 100644
--- a/third_party/zlib/inflate.c
+++ b/third_party/zlib/inflate.c
@@ -256,8 +256,6 @@ int value;
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
-    if (bits == 0)
-        return Z_OK;
     state = (struct inflate_state FAR *)strm->state;
     if (bits < 0) {
         state->hold = 0;
@@ -1482,7 +1480,7 @@ z_streamp strm;
     /* if first time, start search in bit buffer */
     if (state->mode != SYNC) {
         state->mode = SYNC;
-        state->hold >>= state->bits & 7;
+        state->hold <<= state->bits & 7;
         state->bits -= state->bits & 7;
         len = 0;
         while (state->bits >= 8) {
@@ -1553,9 +1551,8 @@ z_streamp source;
     if (copy == Z_NULL) return Z_MEM_ERROR;
     window = Z_NULL;
     if (state->window != Z_NULL) {
-        window = (unsigned char FAR *)ZALLOC(
-            source, (1U << state->wbits) + CHUNKCOPY_CHUNK_SIZE,
-            sizeof(unsigned char));
+        window = (unsigned char FAR *)
+                 ZALLOC(source, 1U << state->wbits, sizeof(unsigned char));
         if (window == Z_NULL) {
             ZFREE(source, copy);
             return Z_MEM_ERROR;
diff --git a/third_party/zstd/lib/common/compiler.h b/third_party/zstd/lib/common/compiler.h
index e0aee016d..f4b6aec75 100644
--- a/third_party/zstd/lib/common/compiler.h
+++ b/third_party/zstd/lib/common/compiler.h
@@ -288,7 +288,7 @@
 
 # elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
 /* C11 support */
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #  define ZSTD_ALIGNOF(T) alignof(T)
 
 # else
diff --git a/third_party/zstd/lib/common/xxhash.h b/third_party/zstd/lib/common/xxhash.h
index fa8d21d69..6a4ea347b 100644
--- a/third_party/zstd/lib/common/xxhash.h
+++ b/third_party/zstd/lib/common/xxhash.h
@@ -1019,7 +1019,7 @@ struct XXH64_state_s {
 #ifndef XXH_NO_XXH3
 
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #  define XXH_ALIGN(n)      alignas(n)
 #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
 /* In C++ alignas() is a keyword */
diff --git a/tool/args/BUILD.mk b/tool/args/BUILD.mk
index 1ed3f664a..40aca11cd 100644
--- a/tool/args/BUILD.mk
+++ b/tool/args/BUILD.mk
@@ -16,7 +16,6 @@ TOOL_ARGS_A_CHECKS =				\
 	$(TOOL_ARGS_A).pkg
 
 TOOL_ARGS_A_DIRECTDEPS =			\
-	LIBC_CALLS				\
 	LIBC_INTRIN				\
 	LIBC_MEM				\
 	LIBC_NEXGEN32E				\
diff --git a/tool/args/args.c b/tool/args/args.c
index 380f70165..cfb88fd59 100644
--- a/tool/args/args.c
+++ b/tool/args/args.c
@@ -16,6 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "tool/args/args.h"
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
@@ -133,7 +134,6 @@ int LoadZipArgsImpl(int *argc, char ***argv, char *data) {
  * replaced with whatever CLI args were specified by the user.
  *
  * @return 0 on success, or -1 if not found w/o errno clobber
- * @deprecated please use `cosmo_args()` it's more powerful
  */
 int LoadZipArgs(int *argc, char ***argv) {
   int e;
diff --git a/tool/args/args.h b/tool/args/args.h
new file mode 100644
index 000000000..dbb517888
--- /dev/null
+++ b/tool/args/args.h
@@ -0,0 +1,8 @@
+#ifndef COSMOPOLITAN_TOOL_ARGS_ARGS_H_
+#define COSMOPOLITAN_TOOL_ARGS_ARGS_H_
+COSMOPOLITAN_C_START_
+
+int LoadZipArgs(int *, char ***) libcesque;
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_TOOL_ARGS_ARGS_H_ */
diff --git a/tool/args/args2.c b/tool/args/args2.c
deleted file mode 100644
index 6af066c4f..000000000
--- a/tool/args/args2.c
+++ /dev/null
@@ -1,582 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/cosmo.h"
-#include "libc/ctype.h"
-#include "libc/cxxabi.h"
-#include "libc/errno.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/str/tab.h"
-#include "libc/sysv/consts/o.h"
-
-__static_yoink("zipos");
-
-#define CLEAR(b)                                        \
-  do {                                                  \
-    b##_cap = 8;                                        \
-    b##_len = 0;                                        \
-    if (!(b##_ptr = calloc(b##_cap, sizeof(*b##_ptr)))) \
-      goto Failure;                                     \
-  } while (0)
-
-#define APPEND(b, c)                                           \
-  do {                                                         \
-    if (b##_len + 2 > b##_cap) {                               \
-      b##_cap += b##_cap >> 1;                                 \
-      void *p_ = realloc(b##_ptr, b##_cap * sizeof(*b##_ptr)); \
-      if (!p_)                                                 \
-        goto Failure;                                          \
-      b##_ptr = p_;                                            \
-    }                                                          \
-    b##_ptr[b##_len++] = c;                                    \
-    b##_ptr[b##_len] = 0;                                      \
-  } while (0)
-
-#define APPEND_DUP(b, s)    \
-  do {                      \
-    char *tmp;              \
-    if (!(tmp = strdup(s))) \
-      goto Failure;         \
-    APPEND(args, tmp);      \
-  } while (0)
-
-static int esc(int c) {
-  switch (c) {
-    case 'a':
-      return '\a';
-    case 'b':
-      return '\b';
-    case 't':
-      return '\t';
-    case 'n':
-      return '\n';
-    case 'v':
-      return '\v';
-    case 'f':
-      return '\f';
-    case 'r':
-      return '\r';
-    case 'e':
-      return '\e';
-    default:
-      return -1;
-  }
-}
-
-static void cosmo_args_free(void *list) {
-  char **args = list;
-  char *arg;
-  while ((arg = *args++))
-    free(arg);
-  free(list);
-}
-
-/**
- * Replaces argument list with `/zip/.args` contents if it exists.
- *
- * First read the documentation to LoadZipArgs(). This works basically
- * the same, assuming you pass `"/zip/.args"` as the first argument. The
- * difference is that arguments here are parsed more similarly to the
- * shell. In the old version, if you wanted your zip .args config to
- * insert three arguments at the beginning of your argv, you'd say:
- *
- *     arg1
- *     arg2
- *     arg3
- *
- * This will still work. You can also now say:
- *
- *     arg1 arg2
- *     arg3
- *
- * This breaks backwards compatibility, since the old design was made
- * for programs like ar.ape that wanted to be able to accept filename
- * arguments that could potentially have spaces. This new parser, on the
- * other hand, is designed to help offer the configurability a project
- * like llamafile needs, without going so far as to be Turing complete.
- * For example, you could say:
- *
- *     # this is a comment
- *     this\ is' a single arg'"ok"# # comment
- *
- * Which will result in the C string `"this is a single argok#"`. You
- * can even use $VAR notation to schlep in environment variables. Here's
- * how this is different from shell:
- *
- * 1. We don't expand $foo into multiple arguments if it has spaces
- * 2. Double quoted strings work like they do in C, e.g. `"\177\x7f\n"`
- * 3. You can't recursively reference environment variables
- *
- * If the process was started in a degenerate state without argv[0] then
- * GetProgramExecutableName() will be inserted in its place, on success.
- *
- * The `path` argument may be null, in which case only normalization is
- * performed. It is not considered an error if `path` is specified and
- * the file doesn't exist. The errno state will be left dirty if that
- * happens, so it can be checked by clearing `errno` before calling.
- *
- * The returned memory is copied and automatically freed on exit().
- *
- * @return argc on success, or -1 w/ errno
- */
-int cosmo_args(const char *path, char ***argv) {
-
-  // the file
-  int fd = -1;
-
-  // variable name builder
-  int var_cap = 0;
-  int var_len = 0;
-  char *var_ptr = 0;
-
-  // argument string builder
-  int arg_cap = 0;
-  int arg_len = 0;
-  char *arg_ptr = 0;
-
-  // argument array builder
-  int args_cap = 0;
-  int args_len = 0;
-  char **args_ptr = 0;
-
-  // initialize memory
-  CLEAR(var);
-  CLEAR(arg);
-  CLEAR(args);
-
-  // state machine
-  enum {
-    NORMAL,
-    COMMENT,
-    ARGUMENT,
-    BACKSLASH,
-    DOLLAR,
-    DOLLAR_VAR,
-    DOLLAR_LCB,
-    DOT,
-    DOT_DOT,
-    DOT_DOT_DOT,
-    QUOTE,
-    DQUOTE,
-    DQUOTE_DOLLAR,
-    DQUOTE_DOLLAR_VAR,
-    DQUOTE_DOLLAR_LCB,
-    DQUOTE_BACKSLASH,
-    DQUOTE_BACKSLASH_X,
-    DQUOTE_BACKSLASH_X_XDIGIT,
-    DQUOTE_BACKSLASH_DIGIT,
-    DQUOTE_BACKSLASH_DIGIT_DIGIT,
-  } t = NORMAL;
-
-  // extra state
-  int x, numba = 0;
-
-  // add program argument
-  char **argvp = *argv;
-  if (*argvp) {
-    APPEND_DUP(args, *argvp++);
-  } else {
-    APPEND_DUP(args, GetProgramExecutableName());
-  }
-
-  // perform i/o
-  if (path) {
-    if ((fd = open(path, O_RDONLY)) == -1)
-      if (errno != ENOENT && errno != ENOTDIR)
-        goto Failure;
-    if (fd != -1) {
-      for (;;) {
-        char buf[512];
-        int got = read(fd, buf, sizeof(buf));
-        if (got == -1)
-          goto Failure;
-        if (!got)
-          break;
-        for (int i = 0; i < got; ++i) {
-          int c = buf[i] & 255;
-          switch (t) {
-
-            case NORMAL:
-              switch (c) {
-                case ' ':
-                case '\t':
-                case '\r':
-                case '\n':
-                case '\f':
-                case '\v':
-                  break;
-                case '#':
-                  t = COMMENT;
-                  break;
-                case '\'':
-                  t = QUOTE;
-                  break;
-                case '"':
-                  t = DQUOTE;
-                  break;
-                case '$':
-                  t = DOLLAR;
-                  break;
-                case '.':
-                  t = DOT;
-                  break;
-                case '\\':
-                  t = BACKSLASH;
-                  break;
-                default:
-                  APPEND(arg, c);
-                  t = ARGUMENT;
-                  break;
-              }
-              break;
-
-            Argument:
-            case ARGUMENT:
-              switch (c) {
-                case ' ':
-                case '\t':
-                case '\r':
-                case '\n':
-                case '\f':
-                case '\v':
-                  APPEND(args, arg_ptr);
-                  CLEAR(arg);
-                  t = NORMAL;
-                  break;
-                case '\'':
-                  t = QUOTE;
-                  break;
-                case '"':
-                  t = DQUOTE;
-                  break;
-                case '$':
-                  t = DOLLAR;
-                  break;
-                case '\\':
-                  t = BACKSLASH;
-                  break;
-                default:
-                  APPEND(arg, c);
-                  break;
-              }
-              break;
-
-            case BACKSLASH:
-              if (c == '\r') {
-                // do nothing
-              } else if (c == '\n') {
-                t = NORMAL;
-              } else if ((x = esc(c)) != -1) {
-                APPEND(arg, x);
-                t = ARGUMENT;
-              } else {
-                APPEND(arg, c);
-                t = ARGUMENT;
-              }
-              break;
-
-            case COMMENT:
-              if (c == '\n')
-                t = NORMAL;
-              break;
-
-            case DOLLAR:
-              if (isalnum(c) || c == '_') {
-                APPEND(var, c);
-                t = DOLLAR_VAR;
-              } else if (c == '{') {
-                t = DOLLAR_LCB;
-              } else {
-                APPEND(arg, '$');
-                if (c != '$') {
-                  t = ARGUMENT;
-                  goto Argument;
-                }
-              }
-              break;
-
-            case DOLLAR_VAR:
-              if (isalnum(c) || c == '_') {
-                APPEND(var, c);
-              } else {
-                char *val = getenv(var_ptr);
-                if (!val)
-                  val = "";
-                free(var_ptr);
-                CLEAR(var);
-                while (*val)
-                  APPEND(arg, *val++);
-                t = ARGUMENT;
-                goto Argument;
-              }
-              break;
-
-            case DOLLAR_LCB:
-              if (c == '}') {
-                char *val = getenv(var_ptr);
-                if (!val)
-                  val = "";
-                free(var_ptr);
-                CLEAR(var);
-                while (*val)
-                  APPEND(arg, *val++);
-                t = ARGUMENT;
-              } else {
-                APPEND(var, c);
-              }
-              break;
-
-            case QUOTE:
-              if (c == '\'') {
-                t = ARGUMENT;
-              } else {
-                APPEND(arg, c);
-              }
-              break;
-
-            Dquote:
-            case DQUOTE:
-              if (c == '"') {
-                t = ARGUMENT;
-              } else if (c == '$') {
-                t = DQUOTE_DOLLAR;
-              } else if (c == '\\') {
-                t = DQUOTE_BACKSLASH;
-              } else {
-                APPEND(arg, c);
-              }
-              break;
-
-            case DQUOTE_DOLLAR:
-              if (isalnum(c) || c == '_') {
-                APPEND(var, c);
-                t = DQUOTE_DOLLAR_VAR;
-              } else if (c == '{') {
-                t = DQUOTE_DOLLAR_LCB;
-              } else {
-                APPEND(arg, '$');
-                if (c != '$') {
-                  t = DQUOTE;
-                  goto Dquote;
-                }
-              }
-              break;
-
-            case DQUOTE_DOLLAR_VAR:
-              if (isalnum(c) || c == '_') {
-                APPEND(var, c);
-              } else {
-                char *val = getenv(var_ptr);
-                if (!val)
-                  val = "";
-                free(var_ptr);
-                CLEAR(var);
-                while (*val)
-                  APPEND(arg, *val++);
-                t = DQUOTE;
-                goto Dquote;
-              }
-              break;
-
-            case DQUOTE_DOLLAR_LCB:
-              if (c == '}') {
-                char *val = getenv(var_ptr);
-                if (!val)
-                  val = "";
-                free(var_ptr);
-                CLEAR(var);
-                while (*val)
-                  APPEND(arg, *val++);
-                t = DQUOTE;
-              } else {
-                APPEND(var, c);
-              }
-              break;
-
-            case DQUOTE_BACKSLASH:
-              if (isdigit(c)) {
-                numba = c - '0';
-                t = DQUOTE_BACKSLASH_DIGIT;
-              } else if (c == 'x') {
-                t = DQUOTE_BACKSLASH_X;
-              } else if ((x = esc(c)) != -1) {
-                APPEND(arg, x);
-                t = DQUOTE;
-              } else if (c == '\r') {
-                // do nothing
-              } else if (c == '\n') {
-                t = DQUOTE;
-              } else {
-                APPEND(arg, c);
-                t = DQUOTE;
-              }
-              break;
-
-            case DQUOTE_BACKSLASH_DIGIT:
-              if (isdigit(c)) {
-                numba <<= 3;
-                numba += c - '0';
-                t = DQUOTE_BACKSLASH_DIGIT_DIGIT;
-              } else {
-                APPEND(arg, numba);
-                t = DQUOTE;
-                goto Dquote;
-              }
-              break;
-
-            case DQUOTE_BACKSLASH_DIGIT_DIGIT:
-              if (isdigit(c)) {
-                numba <<= 3;
-                numba += c - '0';
-                APPEND(arg, numba);
-                t = DQUOTE;
-              } else {
-                APPEND(arg, numba);
-                t = DQUOTE;
-                goto Dquote;
-              }
-              break;
-
-            case DQUOTE_BACKSLASH_X:
-              if ((x = kHexToInt[c]) != -1) {
-                numba = x;
-                t = DQUOTE_BACKSLASH_X_XDIGIT;
-              } else {
-                APPEND(arg, 'x');
-                t = DQUOTE;
-                goto Dquote;
-              }
-              break;
-
-            case DQUOTE_BACKSLASH_X_XDIGIT:
-              if ((x = kHexToInt[c]) != -1) {
-                numba <<= 4;
-                numba += x;
-                APPEND(arg, numba);
-                t = DQUOTE;
-              } else {
-                APPEND(arg, numba);
-                t = DQUOTE;
-                goto Dquote;
-              }
-              break;
-
-            case DOT:
-              if (c == '.') {
-                t = DOT_DOT;
-              } else {
-                APPEND(arg, '.');
-                t = ARGUMENT;
-                goto Argument;
-              }
-              break;
-
-            case DOT_DOT:
-              if (c == '.') {
-                t = DOT_DOT_DOT;
-              } else {
-                APPEND(arg, '.');
-                APPEND(arg, '.');
-                t = ARGUMENT;
-                goto Argument;
-              }
-              break;
-
-            case DOT_DOT_DOT:
-              if (isspace(c)) {
-                while (*argvp)
-                  APPEND_DUP(args, *argvp++);
-                t = NORMAL;
-              } else {
-                APPEND(arg, '.');
-                APPEND(arg, '.');
-                APPEND(arg, '.');
-                t = ARGUMENT;
-                goto Argument;
-              }
-              break;
-
-            default:
-              __builtin_unreachable();
-          }
-        }
-      }
-      if (close(fd))
-        goto Failure;
-
-      // clean up dirty state
-      switch (t) {
-        case DOT:
-          APPEND(arg, '.');
-          break;
-        case DOT_DOT:
-          APPEND(arg, '.');
-          APPEND(arg, '.');
-          break;
-        case DOT_DOT_DOT:
-          while (*argvp)
-            APPEND_DUP(args, *argvp++);
-          break;
-        case DOLLAR:
-          APPEND(arg, '$');
-          break;
-        case DOLLAR_VAR:
-        case DQUOTE_DOLLAR_VAR:
-          char *val = getenv(var_ptr);
-          if (!val)
-            val = "";
-          while (*val)
-            APPEND(arg, *val++);
-          break;
-        case DOLLAR_LCB:
-        case DQUOTE_DOLLAR_LCB:
-          APPEND(arg, '$');
-          APPEND(arg, '{');
-          for (int j = 0; var_ptr[j]; ++j)
-            APPEND(arg, var_ptr[j]);
-          break;
-        default:
-          break;
-      }
-      if (arg_len) {
-        APPEND(args, arg_ptr);
-        CLEAR(arg);
-      }
-    }
-  }
-
-  // append original argv if ... wasn't specified
-  while (*argvp)
-    APPEND_DUP(args, *argvp++);
-
-  // return result
-  __cxa_atexit(cosmo_args_free, args_ptr, 0);
-  *argv = args_ptr;
-  free(arg_ptr);
-  free(var_ptr);
-  return args_len;
-
-Failure:
-  cosmo_args_free(args_ptr);
-  free(arg_ptr);
-  if (fd != -1)
-    close(fd);
-  return -1;
-}
diff --git a/tool/build/BUILD.mk b/tool/build/BUILD.mk
index 2d37a2bd0..bfbb2ccb2 100644
--- a/tool/build/BUILD.mk
+++ b/tool/build/BUILD.mk
@@ -42,7 +42,6 @@ TOOL_BUILD_DIRECTDEPS =							\
 	LIBC_SOCK							\
 	LIBC_STDIO							\
 	LIBC_STR							\
-	LIBC_SYSTEM							\
 	LIBC_SYSV							\
 	LIBC_SYSV_CALLS							\
 	LIBC_THREAD							\
@@ -61,7 +60,7 @@ TOOL_BUILD_DIRECTDEPS =							\
 	THIRD_PARTY_XED							\
 	THIRD_PARTY_ZLIB						\
 	THIRD_PARTY_ZLIB_GZ						\
-	TOOL_BUILD_LIB							\
+	TOOL_BUILD_LIB
 
 TOOL_BUILD_DEPS :=							\
 	$(call uniq,$(foreach x,$(TOOL_BUILD_DIRECTDEPS),$($(x))))
@@ -87,11 +86,9 @@ o/$(MODE)/tool/build/cocmd.zip.o: private				\
 
 # we need pic because:
 #   so it can be an LD_PRELOAD payload
-# we need fsanitize-trap=all becuase:
-#   so we don't need to pull in the entire ubsan runtime
 o/$(MODE)/tool/build/dso/sandbox.o: private				\
 		CFLAGS +=						\
-			-fPIC -fsanitize-trap=all
+			-fPIC
 
 o/$(MODE)/tool/build/dso/sandbox.o:					\
 		libc/calls/calls.h					\
@@ -138,8 +135,8 @@ o/$(MODE)/tool/build/dso/dlopen_helper.so:				\
 		o/$(MODE)/tool/build/dso/dlopen_helper.o		\
 		$(OUTPUT_OPTION)
 
-o/$(MODE)/tool/build/dlopen_tester.runs:				\
-		o/$(MODE)/tool/build/dlopen_tester			\
+o/$(MODE)/tool/build/dlopen_test.runs:					\
+		o/$(MODE)/tool/build/dlopen_test			\
 		o/$(MODE)/tool/build/dso/dlopen_helper.so
 	$< o/$(MODE)/tool/build/dso/dlopen_helper.so
 
diff --git a/tool/build/apelink.c b/tool/build/apelink.c
index f84b50ef2..2c707ab61 100644
--- a/tool/build/apelink.c
+++ b/tool/build/apelink.c
@@ -21,7 +21,7 @@
 #include "libc/calls/calls.h"
 #include "libc/ctype.h"
 #include "libc/dce.h"
-#include "libc/dos.h"
+#include "libc/dos.internal.h"
 #include "libc/elf/def.h"
 #include "libc/elf/elf.h"
 #include "libc/elf/scalar.h"
@@ -30,8 +30,8 @@
 #include "libc/fmt/conv.h"
 #include "libc/fmt/itoa.h"
 #include "libc/limits.h"
-#include "libc/macho.h"
-#include "libc/macros.h"
+#include "libc/macho.internal.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/pedef.internal.h"
 #include "libc/nt/struct/imageimportbyname.internal.h"
@@ -41,7 +41,7 @@
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
 #include "libc/serialize.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/stdckdint.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/blake2.h"
@@ -49,7 +49,7 @@
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "third_party/getopt/getopt.internal.h"
 #include "third_party/zlib/zlib.h"
 #include "tool/build/lib/lib.h"
@@ -85,13 +85,6 @@
   "             executable will self-modify its header on\n"   \
   "             the first run, to use the platform format\n"   \
   "\n"                                                         \
-  "  -k KERNEL  test for maching kernel name [repeatable]\n"   \
-  "             when set, the shell script for subsequent\n"   \
-  "             loader executables will check if uname -s\n"   \
-  "             output matches the kernel string, only if\n"   \
-  "             the loader executable architecture is not\n"   \
-  "             an architecture in the input binary list\n"    \
-  "\n"                                                         \
   "  -M PATH    bundle ape loader source code file for m1\n"   \
   "             processors running the xnu kernel so that\n"   \
   "             it can be compiled on the fly by xcode\n"      \
@@ -220,7 +213,6 @@ struct Loader {
   char *ddarg_size1;
   char *ddarg_skip2;
   char *ddarg_size2;
-  const char *kernel;
 };
 
 struct Loaders {
@@ -252,7 +244,6 @@ static struct Inputs inputs;
 static char ape_heredoc[15];
 static enum Strategy strategy;
 static struct Loaders loaders;
-static const char *loader_kernel;
 static const char *custom_sh_code;
 static bool force_bypass_binfmt_misc;
 static bool generate_debuggable_binary;
@@ -269,6 +260,8 @@ static Elf64_Xword notesize;
 
 static char *r_off32_e_lfanew;
 
+#include "libc/mem/tinymalloc.inc"
+
 static wontreturn void Die(const char *thing, const char *reason) {
   tinyprint(2, thing, ": ", reason, "\n", NULL);
   exit(1);
@@ -686,8 +679,6 @@ static void LoadSymbols(Elf64_Ehdr *e, Elf64_Off size, const char *path) {
   struct SymbolTable *st = OpenSymbolTable(path);
   if (!st)
     Die(path, "could not load elf symbol table");
-  st->names = 0;      // make this deterministic
-  st->name_base = 0;  // ready for serialization
   size_t data_size;
   void *data = Deflate(st, st->size, &data_size);
   uint32_t crc = crc32_z(0, st, st->size);
@@ -988,19 +979,13 @@ static void AddLoader(const char *path) {
   if (loaders.n == ARRAYLEN(loaders.p)) {
     Die(prog, "too many loaders");
   }
-  struct Loader *loader = &loaders.p[loaders.n++];
-  loader->path = path;
-  loader->kernel = loader_kernel;
-}
-
-static void SetLoaderKernel(const char *kernel) {
-  loader_kernel = kernel;
+  loaders.p[loaders.n++].path = path;
 }
 
 static void GetOpts(int argc, char *argv[]) {
   int opt, bits;
   bool got_support_vector = false;
-  while ((opt = getopt(argc, argv, "hvgsGBo:l:k:S:M:V:")) != -1) {
+  while ((opt = getopt(argc, argv, "hvgsGBo:l:S:M:V:")) != -1) {
     switch (opt) {
       case 'o':
         outpath = optarg;
@@ -1024,10 +1009,6 @@ static void GetOpts(int argc, char *argv[]) {
         HashInputString("-l");
         AddLoader(optarg);
         break;
-      case 'k':
-        HashInputString("-k");
-        SetLoaderKernel(optarg);
-        break;
       case 'S':
         HashInputString("-S");
         HashInputString(optarg);
@@ -1649,28 +1630,6 @@ static char *GenerateScriptIfMachine(char *p, struct Input *in) {
   }
 }
 
-static char *GenerateScriptIfLoaderMachine(char *p, struct Loader *loader) {
-  if (loader->machine == EM_NEXGEN32E) {
-    p = stpcpy(p, "if [ \"$m\" = x86_64 ] || [ \"$m\" = amd64 ]");
-  } else if (loader->machine == EM_AARCH64) {
-    p = stpcpy(p, "if [ \"$m\" = aarch64 ] || [ \"$m\" = arm64 ] || [ \"$m\" = evbarm ]");
-  } else if (loader->machine == EM_PPC64) {
-    p = stpcpy(p, "if [ \"$m\" = ppc64le ]");
-  } else if (loader->machine == EM_MIPS) {
-    p = stpcpy(p, "if [ \"$m\" = mips64 ]");
-  } else {
-    Die(loader->path, "unsupported cpu architecture");
-  }
-
-  if (loader->kernel) {
-    p = stpcpy(p, " && [ \"$k\" = ");
-    p = stpcpy(p, loader->kernel);
-    p = stpcpy(p, " ]");
-  }
-
-  return stpcpy(p, "; then\n");
-}
-
 static char *FinishGeneratingDosHeader(char *p) {
   p = WRITE16LE(p, 0x1000);  // 10: MZ: lowers upper bound load / 16
   p = WRITE16LE(p, 0xf800);  // 12: MZ: roll greed on bss
@@ -1920,15 +1879,7 @@ int main(int argc, char *argv[]) {
     for (j = i + 1; j < loaders.n; ++j) {
       if (loaders.p[i].os == loaders.p[j].os &&
           loaders.p[i].machine == loaders.p[j].machine) {
-        if (!loaders.p[i].kernel && !loaders.p[j].kernel) {
-          Die(prog, "multiple ape loaders specified for the same platform");
-        }
-        if (loaders.p[i].kernel != NULL &&
-            loaders.p[j].kernel != NULL &&
-            strcmp(loaders.p[i].kernel, loaders.p[j].kernel) == 0) {
-          Die(prog, "multiple ape loaders specified for the same platform "
-                    "with matching kernels");
-        }
+        Die(prog, "multiple ape loaders specified for the same platform");
       }
     }
   }
@@ -2239,36 +2190,6 @@ int main(int argc, char *argv[]) {
         gotsome = true;
       }
     }
-
-    // extract the ape loader for non-input architectures
-    // if the user requested a host kernel check, get the host kernel
-    if (loader_kernel) {
-      p = stpcpy(p, "k=$(uname -s 2>/dev/null) || k=unknown\n");
-    }
-    for (i = 0; i < loaders.n; ++i) {
-      struct Loader *loader = loaders.p + i;
-      if (loader->used) {
-        continue;
-      }
-      loader->used = true;
-      p = GenerateScriptIfLoaderMachine(p, loader);
-      p = stpcpy(p, "mkdir -p \"${t%/*}\" ||exit\n"
-                    "dd if=\"$o\"");
-      p = stpcpy(p, " skip=");
-      loader->ddarg_skip2 = p;
-      p = GenerateDecimalOffsetRelocation(p);
-      p = stpcpy(p, " count=");
-      loader->ddarg_size2 = p;
-      p = GenerateDecimalOffsetRelocation(p);
-      p = stpcpy(p, " bs=1 2>/dev/null | gzip -dc >\"$t.$$\" ||exit\n"
-                    "chmod 755 \"$t.$$\" ||exit\n"
-                    "mv -f \"$t.$$\" \"$t\" ||exit\n");
-      p = stpcpy(p, "exec \"$t\" \"$o\" \"$@\"\n"
-                    "fi\n");
-      gotsome = true;
-    }
-
-    // close if-statements
     if (inputs.n && (support_vector & _HOSTXNU)) {
       if (!gotsome) {
         p = stpcpy(p, "true\n");
diff --git a/tool/build/ar.c b/tool/build/ar.c
index 29f9abb32..616ee56dd 100644
--- a/tool/build/ar.c
+++ b/tool/build/ar.c
@@ -31,7 +31,7 @@
 #include "libc/fmt/magnumstrs.internal.h"
 #include "libc/intrin/bsr.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/serialize.h"
 #include "libc/stdckdint.h"
@@ -40,74 +40,30 @@
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/s.h"
-#include "tool/build/lib/ar.h"
 #include "tool/build/lib/getargs.h"
 
 /**
  * @fileoverview cosmopolitan ar
+ *
+ * This static archiver is superior:
+ *
+ * - Isn't "accidentally quadratic" like GNU ar
+ * - Goes 2x faster than LLVM ar while using 100x less memory
+ * - Can be built as a 52kb APE binary that works well on six OSes
+ *
+ * This static archiver introduces handy features:
+ *
+ * - Arguments may be supplied in an `@args.txt` file
+ * - Directory arguments are ignored
+ *
+ * @see https://www.unix.com/man-page/opensolaris/3head/ar.h/
+ * @see https://en.wikipedia.org/wiki/Ar_(Unix)
  */
 
-static wontreturn void ShowUsage(int rc, int fd) {
-  tinyprint(  //
-      fd,
-      "USAGE\n"
-      "\n",
-      "  ", program_invocation_name, " FLAGS ARCHIVE FILE...\n",
-      "\n"
-      "FLAGS\n"
-      "\n"
-      "  rcs        create new archive with index\n"
-      "  rcsD       always deterministic\n"
-      "  --help     show usage\n"
-      "  --version  show program details\n"
-      "\n"
-      "ARGUMENTS\n"
-      "\n"
-      "  ARCHIVE    should be foo.a\n"
-      "  FILE       should be foo.o, lib.a, or @args.txt\n"
-      "\n"
-      "DOCUMENTATION\n"
-      "\n"
-      "  Your Cosmopolitan Archiver is superior:\n"
-      "\n"
-      "  - Isn't accidentally quadratic like GNU ar. Cosmopolitan Libc is\n"
-      "    distributed as libcosmo.a which contains 5000+ object files and\n"
-      "    is tens of megabytes in size. GNU ar isn't capable of making an\n"
-      "    archive that large. So we invented this ar as a replacement.\n"
-      "\n"
-      "  - Goes 2x faster than LLVM ar thanks to modern system calls like\n"
-      "    copy_file_range(). This ar should also use 100x less memory.\n"
-      "\n"
-      "  - Can be built as a 96kb APE binary that works well on six OSes.\n"
-      "    Cosmopolitan uses the same dev tools on all OSes and archsr to\n"
-      "    ensure compilations are simple and deterministic for everyone.\n"
-      "\n"
-      "  This static archiver introduces handy features:\n"
-      "\n"
-      "  - Arguments may be supplied in an `@args.txt` file. This is useful\n"
-      "    for overcoming the `ARG_MAX` limit, which is especially important\n"
-      "    on Windows, where only very few command arguments can be passed.\n"
-      "    GNU Make can be easily configured to generate args files.\n"
-      "\n"
-      "  - You can merge many .a files into one big .a file. Args that end\n"
-      "    with .a will be opened as static archives. The .o files inside it\n"
-      "    will then be added to your new archive. It would be the same as if\n"
-      "    you passed all the .o files as args. This is fast. For example, to\n"
-      "    merge 37 .a files containing 5000 .o files takes ~38 milliseconds.\n"
-      "\n"
-      "  - Directory arguments are ignored. The biggest gotcha with makefiles\n"
-      "    that use wildcard globbing is that it can't detect when files are\n"
-      "    deleted, which means it can't invalidate the artifacts which had\n"
-      "    depended on that file, leading to nondeterminism and surprising\n"
-      "    build failures. The simplest way to solve that is to add the\n"
-      "    directory to the prerequisites list, since the directory modified\n"
-      "    time will be updated by the OS when files inside it are deleted.\n"
-      "    When doing this, it's simple and elegant to not need to filter\n"
-      "    the directory prerequisites before passing `$^` to `ar`.\n"
-      "\n",
-      NULL);
-  exit(rc);
-}
+#define VERSION                     \
+  "cosmopolitan ar v2.0\n"          \
+  "copyright 2023 justine tunney\n" \
+  "https://github.com/jart/cosmopolitan\n"
 
 #define HEAP_SIZE (256L * 1024 * 1024)
 
@@ -152,6 +108,29 @@ static wontreturn void SysDie(const char *path, const char *func) {
   exit(1);
 }
 
+static wontreturn void ShowUsage(int rc, int fd) {
+  tinyprint(fd, VERSION,
+            "\n"
+            "USAGE\n"
+            "\n",
+            "  ", program_invocation_name, " FLAGS ARCHIVE FILE...\n",
+            "\n"
+            "FLAGS\n"
+            "\n"
+            "  rcs        create new archive with index\n"
+            "  rcsD       always deterministic\n"
+            "  --help     show usage\n"
+            "  --version  show program details\n"
+            "\n"
+            "ARGUMENTS\n"
+            "\n"
+            "  ARCHIVE    should be foo.a\n"
+            "  FILE       should be foo.o or @args.txt\n"
+            "\n",
+            NULL);
+  exit(rc);
+}
+
 // allocates 𝑛 bytes of memory aligned on 𝑎 from .bss
 // - avoids binary bloat of mmap() and malloc()
 // - dies if out of memory or overflow occurs
@@ -180,11 +159,13 @@ static void *balloc(size_t n, size_t a) {
   } else {
     c = 2ull << (__builtin_clzll(n - 1) ^ (sizeof(long long) * CHAR_BIT - 1));
   }
-  if (c < a || c > HEAP_SIZE || p + c > h + HEAP_SIZE)
+  if (c < a || c > HEAP_SIZE || p + c > h + HEAP_SIZE) {
     Die(program_invocation_name, "out of memory");
+  }
   used = p - h + c;
-  if (resizable)
+  if (resizable) {
     memcpy((char *)p - sizeof(c), &c, sizeof(c));
+  }
   return (void *)p;
 }
 
@@ -277,28 +258,21 @@ static void MakeArHeader(struct ar_hdr *h,  //
 // - uses copy_file_range() if possible
 // - returns number of bytes exchanged
 // - dies if operation fails
-static void CopyFileOrDie(const char *inpath, int infd,    //
-                          const char *outpath, int outfd,  //
-                          size_t offset, size_t size) {
+static int64_t CopyFileOrDie(const char *inpath, int infd,  //
+                             const char *outpath, int outfd) {
+  int64_t toto;
   char buf[512];
   size_t exchanged;
+  ssize_t got, wrote;
   enum { CFR, RW } mode;
-  ssize_t want, got, wrote;
-  if (offset)
-    if (lseek(infd, offset, SEEK_SET) == -1)
-      SysDie(inpath, "lseek");
-  for (mode = CFR; size; size -= exchanged) {
+  for (mode = CFR, toto = 0;; toto += exchanged) {
     if (mode == CFR) {
-      want = 4194304;
-      if (want > size)
-        want = size;
-      got = copy_file_range(infd, 0, outfd, 0, want, 0);
+      got = copy_file_range(infd, 0, outfd, 0, 4194304, 0);
       if (!got)
-        Die(inpath, "unexpected eof");
+        break;
       if (got != -1) {
         exchanged = got;
       } else if (errno == EXDEV ||       // different partitions
-                 errno == EINVAL ||      // possible w/ ecryptfs
                  errno == ENOSYS ||      // not linux or freebsd
                  errno == ENOTSUP ||     // probably a /zip file
                  errno == EOPNOTSUPP) {  // technically the same
@@ -308,12 +282,9 @@ static void CopyFileOrDie(const char *inpath, int infd,    //
         SysDie(inpath, "copy_file_range");
       }
     } else {
-      want = sizeof(buf);
-      if (want > size)
-        want = size;
-      got = read(infd, buf, want);
+      got = read(infd, buf, sizeof(buf));
       if (!got)
-        Die(inpath, "unexpected eof");
+        break;
       if (got == -1)
         SysDie(inpath, "read");
       wrote = write(outfd, buf, got);
@@ -324,51 +295,7 @@ static void CopyFileOrDie(const char *inpath, int infd,    //
       exchanged = wrote;
     }
   }
-}
-
-static void AppendName(const char *name, struct Args *names,
-                       struct Bytes *filenames) {
-  struct ar_hdr header1;
-  char bnbuf[PATH_MAX + 1];
-  strlcpy(bnbuf, name, sizeof(bnbuf));
-  char *aname = StrCat(basename(bnbuf), "/");
-  if (strlen(aname) <= sizeof(header1.ar_name)) {
-    AppendArg(names, aname);
-  } else {
-    char ibuf[21];
-    FormatUint64(ibuf, filenames->i);
-    AppendArg(names, StrCat("/", ibuf));
-    AppendBytes(filenames, aname, strlen(aname));
-    AppendBytes(filenames, "\n", 1);
-  }
-}
-
-static void AppendSymbols(const char *path, const Elf64_Ehdr *elf,
-                          size_t elfsize, struct Bytes *symbols,
-                          struct Ints *symnames, int objid) {
-  if (!IsElf64Binary(elf, elfsize))
-    Die(path, "not an elf64 binary");
-  char *strs = GetElfStringTable(elf, elfsize, ".strtab");
-  if (!strs)
-    Die(path, "elf .strtab not found");
-  Elf64_Xword symcount;
-  Elf64_Shdr *symsec = GetElfSymbolTable(elf, elfsize, SHT_SYMTAB, &symcount);
-  Elf64_Sym *syms = GetElfSectionAddress(elf, elfsize, symsec);
-  if (!syms)
-    Die(path, "elf symbol table not found");
-  for (Elf64_Xword j = symsec->sh_info; j < symcount; ++j) {
-    if (!syms[j].st_name)
-      continue;
-    if (syms[j].st_shndx == SHN_UNDEF)
-      continue;
-    if (syms[j].st_shndx == SHN_COMMON)
-      continue;
-    const char *symname = GetElfString(elf, elfsize, strs, syms[j].st_name);
-    if (!symname)
-      Die(path, "elf symbol name corrupted");
-    AppendBytes(symbols, symname, strlen(symname) + 1);
-    AppendInt(symnames, objid);
-  }
+  return toto;
 }
 
 int main(int argc, char *argv[]) {
@@ -382,26 +309,24 @@ int main(int argc, char *argv[]) {
 
   // handle hardcoded flags
   if (argc == 2) {
-    if (IsEqual(argv[1], "-n"))
+    if (IsEqual(argv[1], "-n")) {
       exit(0);
+    }
     if (IsEqual(argv[1], "-h") ||  //
         IsEqual(argv[1], "-?") ||  //
         IsEqual(argv[1], "--help")) {
       ShowUsage(0, 1);
     }
     if (IsEqual(argv[1], "--version")) {
-      tinyprint(1,
-                "cosmopolitan ar v3.0\n"
-                "copyright 2024 justine tunney\n"
-                "https://github.com/jart/cosmopolitan\n",
-                NULL);
+      tinyprint(1, VERSION, NULL);
       exit(0);
     }
   }
 
   // get flags and output path
-  if (argc < 3)
-    Die(argv[0], "missing argument");
+  if (argc < 3) {
+    ShowUsage(1, 2);
+  }
   char *flags = argv[1];
   const char *outpath = argv[2];
 
@@ -422,8 +347,8 @@ int main(int argc, char *argv[]) {
 
   struct Args args = {reballoc(0, 4096, sizeof(char *))};
   struct Args names = {reballoc(0, 4096, sizeof(char *))};
+  struct Ints modes = {reballoc(0, 4096, sizeof(int))};
   struct Ints sizes = {reballoc(0, 4096, sizeof(int))};
-  struct Ints foffsets = {reballoc(0, 4096, sizeof(int))};
   struct Ints symnames = {reballoc(0, 16384, sizeof(int))};
   struct Bytes symbols = {reballoc(0, 131072, sizeof(char))};
   struct Bytes filenames = {reballoc(0, 16384, sizeof(char))};
@@ -440,42 +365,63 @@ int main(int argc, char *argv[]) {
       continue;
     if (endswith(arg, ".pkg"))
       continue;
-    if (endswith(arg, ".a")) {
-      struct Ar ar;
-      struct ArFile arf;
-      openar(&ar, arg);
-      while (readar(&ar, &arf)) {
-        AppendArg(&args, StrDup(arg));
-        AppendInt(&sizes, arf.size);
-        AppendInt(&foffsets, arf.offset);
-        AppendName(arf.name, &names, &filenames);
-        AppendSymbols(arg, arf.data, arf.size, &symbols, &symnames, objectid++);
-      }
-      closear(&ar);
+    if (stat(arg, &st))
+      SysDie(arg, "stat");
+    if (S_ISDIR(st.st_mode))
+      continue;
+    if (!st.st_size)
+      Die(arg, "file is empty");
+    if (st.st_size > 0x7ffff000)
+      Die(arg, "file too large");
+    if ((fd = open(arg, O_RDONLY)) == -1)
+      SysDie(arg, "open");
+    AppendArg(&args, StrDup(arg));
+    AppendInt(&sizes, st.st_size);
+    AppendInt(&modes, st.st_mode);
+    char bnbuf[PATH_MAX + 1];
+    strlcpy(bnbuf, arg, sizeof(bnbuf));
+    char *aname = StrCat(basename(bnbuf), "/");
+    if (strlen(aname) <= sizeof(header1.ar_name)) {
+      AppendArg(&names, aname);
     } else {
-      if (stat(arg, &st))
-        SysDie(arg, "stat");
-      if (S_ISDIR(st.st_mode))
-        continue;
-      if (!st.st_size)
-        Die(arg, "file is empty");
-      if (st.st_size > 0x7ffff000)
-        Die(arg, "file too large");
-      if ((fd = open(arg, O_RDONLY)) == -1)
-        SysDie(arg, "open");
-      AppendArg(&args, StrDup(arg));
-      AppendInt(&sizes, st.st_size);
-      AppendInt(&foffsets, 0);
-      AppendName(arg, &names, &filenames);
-      void *elf = mmap(0, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
-      if (elf == MAP_FAILED)
-        SysDie(arg, "mmap");
-      AppendSymbols(arg, elf, st.st_size, &symbols, &symnames, objectid++);
-      if (munmap(elf, st.st_size))
-        SysDie(arg, "munmap");
-      if (close(fd))
-        SysDie(arg, "close");
+      char ibuf[21];
+      FormatUint64(ibuf, filenames.i);
+      AppendArg(&names, StrCat("/", ibuf));
+      AppendBytes(&filenames, aname, strlen(aname));
+      AppendBytes(&filenames, "\n", 1);
     }
+    size_t mapsize = st.st_size;
+    void *elf = mmap(0, mapsize, PROT_READ, MAP_PRIVATE, fd, 0);
+    if (elf == MAP_FAILED)
+      SysDie(arg, "mmap");
+    if (!IsElf64Binary(elf, mapsize))
+      Die(arg, "not an elf64 binary");
+    char *strs = GetElfStringTable(elf, mapsize, ".strtab");
+    if (!strs)
+      Die(arg, "elf .strtab not found");
+    Elf64_Xword symcount;
+    Elf64_Shdr *symsec = GetElfSymbolTable(elf, mapsize, SHT_SYMTAB, &symcount);
+    Elf64_Sym *syms = GetElfSectionAddress(elf, mapsize, symsec);
+    if (!syms)
+      Die(arg, "elf symbol table not found");
+    for (Elf64_Xword j = symsec->sh_info; j < symcount; ++j) {
+      if (!syms[j].st_name)
+        continue;
+      if (syms[j].st_shndx == SHN_UNDEF)
+        continue;
+      if (syms[j].st_shndx == SHN_COMMON)
+        continue;
+      const char *symname = GetElfString(elf, mapsize, strs, syms[j].st_name);
+      if (!symname)
+        Die(arg, "elf symbol name corrupted");
+      AppendBytes(&symbols, symname, strlen(symname) + 1);
+      AppendInt(&symnames, objectid);
+    }
+    if (munmap(elf, mapsize))
+      SysDie(arg, "munmap");
+    if (close(fd))
+      SysDie(arg, "close");
+    ++objectid;
   }
   getargs_destroy(&ga);
 
@@ -515,37 +461,45 @@ int main(int argc, char *argv[]) {
   MakeArHeader(&header1, "/", 0, tablebufsize + ROUNDUP(symbols.i, 2));
   MakeArHeader(&header2, "//", 0, ROUNDUP(filenames.i, 2));
   WRITE32BE(tablebuf, symnames.i);
-  for (size_t i = 0; i < symnames.i; ++i)
+  for (size_t i = 0; i < symnames.i; ++i) {
     WRITE32BE(tablebuf + 4 + i * 4, offsets[symnames.p[i]]);
+  }
 
   // write output archive
   int outfd;
-  if ((outfd = creat(outpath, 0644)) == -1)
+  if ((outfd = creat(outpath, 0644)) == -1) {
     SysDie(outpath, "creat");
-  if (ftruncate(outfd, outsize))
+  }
+  if (ftruncate(outfd, outsize)) {
     SysDie(outpath, "ftruncate");
-  if ((outsize = writev(outfd, iov, ARRAYLEN(iov))) == -1)
+  }
+  if ((outsize = writev(outfd, iov, ARRAYLEN(iov))) == -1) {
     SysDie(outpath, "writev[1]");
+  }
   for (size_t i = 0; i < args.i; ++i) {
     const char *inpath = args.p[i];
-    if (!(i && IsEqual(inpath, args.p[i - 1])))
-      if ((fd = open(inpath, O_RDONLY)) == -1)
-        SysDie(inpath, "open");
+    if ((fd = open(inpath, O_RDONLY)) == -1) {
+      SysDie(inpath, "open");
+    }
     iov[0].iov_base = "\n";
     outsize += (iov[0].iov_len = outsize & 1);
     iov[1].iov_base = &header1;
     outsize += (iov[1].iov_len = sizeof(struct ar_hdr));
-    MakeArHeader(&header1, names.p[i], 0100644, sizes.p[i]);
-    if (writev(outfd, iov, 2) == -1)
+    MakeArHeader(&header1, names.p[i], modes.p[i], sizes.p[i]);
+    if (writev(outfd, iov, 2) == -1) {
       SysDie(outpath, "writev[2]");
+    }
     outsize += sizes.p[i];
-    CopyFileOrDie(inpath, fd, outpath, outfd, foffsets.p[i], sizes.p[i]);
-    if (!(i + 1 < args.i && IsEqual(inpath, args.p[i + 1])))
-      if (close(fd))
-        SysDie(inpath, "close");
+    if (CopyFileOrDie(inpath, fd, outpath, outfd) != sizes.p[i]) {
+      Die(inpath, "file size changed");
+    }
+    if (close(fd)) {
+      SysDie(inpath, "close");
+    }
   }
-  if (close(outfd))
+  if (close(outfd)) {
     SysDie(outpath, "close");
+  }
 
   return 0;
 }
diff --git a/tool/build/assimilate.c b/tool/build/assimilate.c
index eef749455..8c6d4c1cf 100644
--- a/tool/build/assimilate.c
+++ b/tool/build/assimilate.c
@@ -25,8 +25,8 @@
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/limits.h"
-#include "libc/macho.h"
-#include "libc/macros.h"
+#include "libc/macho.internal.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/serialize.h"
 #include "libc/stdckdint.h"
@@ -67,6 +67,8 @@
 #define FORMAT_MACHO  2
 #define FORMAT_PE     3
 
+#include "libc/mem/tinymalloc.inc"
+
 static int g_arch;
 static int g_format;
 static bool g_force;
diff --git a/tool/build/bigmul.c b/tool/build/bigmul.c
index 181f32df4..b731d9fc7 100644
--- a/tool/build/bigmul.c
+++ b/tool/build/bigmul.c
@@ -19,7 +19,7 @@
 #include "libc/assert.h"
 #include "libc/fmt/conv.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/stdio.h"
@@ -48,10 +48,8 @@ void Multiply%dx%d(uint64_t C[%d], const uint64_t A[%d], const uint64_t B[%d]) {
   uint64_t z,h,l;\n\
   uint64_t ",
          (n + m) * 64, n * 64, m * 64, n + m, n, m, n, m, n + m, n, m);
-  if (!(Rs = calloc(sizeof(*Rs), n + m + 1)))
-    __builtin_trap();
-  if (!(Ra = calloc(sizeof(*Ra), n + m + 1)))
-    __builtin_trap();
+  Rs = gc(calloc(sizeof(*Rs), n + m + 1));
+  Ra = gc(calloc(sizeof(*Ra), n + m + 1));
   for (j = 0; j < n; ++j) {
     if (j)
       printf(", ");
@@ -174,8 +172,6 @@ void Multiply%dx%d(uint64_t C[%d], const uint64_t A[%d], const uint64_t B[%d]) {
   }
   printf("}\n");
   fflush(stdout);
-  free(Ra);
-  free(Rs);
 }
 
 int main(int argc, char *argv[]) {
diff --git a/tool/build/compile.c b/tool/build/compile.c
index 2ade1ce7b..68cc4c9ee 100644
--- a/tool/build/compile.c
+++ b/tool/build/compile.c
@@ -38,7 +38,7 @@
 #include "libc/log/appendresourcereport.internal.h"
 #include "libc/log/color.internal.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/gc.h"
@@ -111,9 +111,9 @@ FLAGS\n\
   -T TARGET    specifies target name for V=0 logging\n\
   -A ACTION    specifies short command name for V=0 logging\n\
   -V NUMBER    specifies compiler version\n\
-  -C SECS      set cpu limit [default 32]\n\
+  -C SECS      set cpu limit [default 16]\n\
   -L SECS      set lat limit [default 90]\n\
-  -P PROCS     set pro limit [default 8192]\n\
+  -P PROCS     set pro limit [default 4096]\n\
   -S BYTES     set stk limit [default 8m]\n\
   -M BYTES     set mem limit [default 2048m]\n\
   -F BYTES     set fsz limit [default 256m]\n\
@@ -229,6 +229,8 @@ const char *const kSafeEnv[] = {
     "TMPDIR",       // needed by compiler
 };
 
+#include "libc/mem/tinymalloc.inc"
+
 void OnAlrm(int sig) {
   ++gotalrm;
 }
@@ -514,11 +516,7 @@ void AddArg(char *actual) {
 }
 
 static int GetBaseCpuFreqMhz(void) {
-#ifdef __x86_64__
   return KCPUIDS(16H, EAX) & 0x7fff;
-#else
-  return 0;
-#endif
 }
 
 void PlanResource(int resource, struct rlimit rlim) {
@@ -527,7 +525,7 @@ void PlanResource(int resource, struct rlimit rlim) {
     return;
   rlim.rlim_cur = MIN(rlim.rlim_cur, prior.rlim_max);
   rlim.rlim_max = MIN(rlim.rlim_max, prior.rlim_max);
-  posix_spawnattr_setrlimit_np(&spawnattr, resource, &rlim);
+  posix_spawnattr_setrlimit(&spawnattr, resource, &rlim);
 }
 
 void SetCpuLimit(int secs) {
@@ -649,7 +647,7 @@ int Launch(void) {
   posix_spawnattr_init(&spawnattr);
   posix_spawnattr_setsigmask(&spawnattr, &savemask);
   posix_spawnattr_setflags(&spawnattr,
-                           POSIX_SPAWN_SETSIGMASK | POSIX_SPAWN_SETRLIMIT_NP);
+                           POSIX_SPAWN_SETSIGMASK | POSIX_SPAWN_SETRLIMIT);
   SetCpuLimit(cpuquota);
   SetFszLimit(fszquota);
   SetMemLimit(memquota);
@@ -798,11 +796,7 @@ bool MovePreservingDestinationInode(const char *from, const char *to) {
     rc = copy_file_range(fdin, 0, fdout, 0, remain, 0);
     if (rc != -1) {
       remain -= rc;
-    } else if (errno == EXDEV ||    // different partitions
-               errno == EINVAL ||   // possible w/ ecryptfs
-               errno == ENOSYS ||   // not linux or freebsd
-               errno == ENOTSUP ||  // no fs support for it
-               errno == EOPNOTSUPP) {
+    } else if (errno == EXDEV || errno == ENOSYS) {
       if (lseek(fdin, 0, SEEK_SET) == -1) {
         res = false;
         break;
@@ -864,7 +858,7 @@ int main(int argc, char *argv[]) {
   verbose = 4;
   timeout = 90;                    // secs
   cpuquota = 32;                   // secs
-  proquota = 8192;                 // procs
+  proquota = 4096;                 // procs
   stkquota = 8 * 1024 * 1024;      // bytes
   fszquota = 256 * 1000 * 1000;    // bytes
   memquota = 2048L * 1024 * 1024;  // bytes
diff --git a/tool/build/cp.c b/tool/build/cp.c
index 5aed44442..afd8e84cc 100644
--- a/tool/build/cp.c
+++ b/tool/build/cp.c
@@ -69,6 +69,8 @@ char linkbuf[PATH_MAX];
 
 void Cp(char *, char *);
 
+#include "libc/mem/tinymalloc.inc"
+
 bool IsDirectory(const char *path) {
   int e;
   bool res;
diff --git a/tool/build/dd.c b/tool/build/dd.c
index 5c5ce461b..77c7d6e26 100644
--- a/tool/build/dd.c
+++ b/tool/build/dd.c
@@ -19,8 +19,6 @@
 #include "libc/calls/calls.h"
 #include "libc/fmt/conv.h"
 #include "libc/limits.h"
-#include "libc/mem/gc.h"
-#include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
@@ -46,8 +44,8 @@ int main(int argc, char *argv[]) {
   long count = LONG_MAX;
   long blocksize = 1;
   int oflags = O_WRONLY | O_TRUNC | O_CREAT;
-  char *infile = gc(strdup("/dev/stdin"));
-  char *oufile = gc(strdup("/dev/stdout"));
+  const char *infile = "/dev/stdin";
+  const char *oufile = "/dev/stdout";
 
   prog = argv[0];
   if (!prog)
diff --git a/tool/build/dlopen_tester.c b/tool/build/dlopen_test.c
similarity index 100%
rename from tool/build/dlopen_tester.c
rename to tool/build/dlopen_test.c
diff --git a/tool/build/elf2pe.c b/tool/build/elf2pe.c
index c015132d2..b1508a926 100644
--- a/tool/build/elf2pe.c
+++ b/tool/build/elf2pe.c
@@ -29,7 +29,7 @@
 #include "libc/intrin/dll.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/pedef.internal.h"
 #include "libc/nt/struct/imagedatadirectory.internal.h"
@@ -159,6 +159,9 @@ static const char *stubpath;
 static long FLAG_SizeOfStackCommit = 64 * 1024;
 static long FLAG_SizeOfStackReserve = 8 * 1024 * 1024;
 
+#define TINYMALLOC_MAX_ALIGN MAX_ALIGN
+#include "libc/mem/tinymalloc.inc"
+
 static wontreturn void Die(const char *thing, const char *reason) {
   tinyprint(2, thing, ": ", reason, "\n", NULL);
   exit(1);
@@ -226,17 +229,6 @@ static struct Segment *NewSegment(void) {
   return s;
 }
 
-static int ConvertElfMachineToPe(struct Elf *elf) {
-  switch (elf->ehdr->e_machine) {
-    case EM_NEXGEN32E:
-      return kNtImageFileMachineNexgen32e;
-    case EM_AARCH64:
-      return kNtImageFileMachineArm64;
-    default:
-      Die(elf->path, "unsupported e_machine");
-  }
-}
-
 static Elf64_Addr RelocateVaddrWithinSegment(struct Elf *elf,
                                              Elf64_Addr vaddr_old,
                                              struct Segment *segment) {
@@ -819,17 +811,7 @@ static uint32_t GetPeSectionCharacteristics(struct Segment *s) {
 // originally in the elf image that ld linked. in order for this to work
 // the executable needs to be linked in `ld -q` mode, since it'll retain
 // the .rela sections we'll need later to fixup the binary.
-static struct ImagePointer GeneratePe(struct Elf *elf, char *fp) {
-
-  int64_t vp = 0;
-  Elf64_Phdr *phdr;
-  for (int i = 0; i < elf->ehdr->e_phnum; ++i) {
-    if ((phdr = GetElfProgramHeaderAddress(elf->ehdr, elf->size, i)) &&
-        phdr->p_type == PT_LOAD) {
-      vp = phdr->p_vaddr;
-      break;
-    }
-  }
+static struct ImagePointer GeneratePe(struct Elf *elf, char *fp, int64_t vp) {
 
   Elf64_Sym *entry;
   if (!(entry = FindGlobal(elf, "__win32_start")) &&
@@ -873,7 +855,7 @@ static struct ImagePointer GeneratePe(struct Elf *elf, char *fp) {
   struct NtImageFileHeader *filehdr;
   filehdr = (struct NtImageFileHeader *)fp;
   fp += sizeof(struct NtImageFileHeader);
-  filehdr->Machine = ConvertElfMachineToPe(elf);
+  filehdr->Machine = kNtImageFileMachineNexgen32e;
   filehdr->TimeDateStamp = 1690072024;
   filehdr->Characteristics =
       kNtPeFileExecutableImage | kNtImageFileLargeAddressAware |
@@ -891,9 +873,7 @@ static struct ImagePointer GeneratePe(struct Elf *elf, char *fp) {
   opthdr->FileAlignment = 512;
   opthdr->SectionAlignment = MAX(4096, elf->align);
   opthdr->MajorOperatingSystemVersion = 6;
-  opthdr->MinorOperatingSystemVersion = 2;
   opthdr->MajorSubsystemVersion = 6;
-  opthdr->MinorSubsystemVersion = 2;
   opthdr->Subsystem = kNtImageSubsystemWindowsCui;
   opthdr->DllCharacteristics = kNtImageDllcharacteristicsNxCompat |
                                kNtImageDllcharacteristicsHighEntropyVa;
@@ -1136,7 +1116,7 @@ int main(int argc, char *argv[]) {
   // translate executable
   struct Elf *elf = OpenElf(argv[optind]);
   char *buf = Memalign(MAX_ALIGN, 134217728);
-  struct ImagePointer ip = GeneratePe(elf, buf);
+  struct ImagePointer ip = GeneratePe(elf, buf, 0x00400000);
   if (creat(outpath, 0755) == -1)
     DieSys(elf->path);
   Pwrite(3, buf, ip.fp - buf, 0);
diff --git a/tool/build/elf2pe.h b/tool/build/elf2pe.h
index 49cb8e71e..53312b1a2 100644
--- a/tool/build/elf2pe.h
+++ b/tool/build/elf2pe.h
@@ -1,8 +1,8 @@
 #ifndef COSMOPOLITAN_TOOL_BUILD_ELF2PE_H_
 #define COSMOPOLITAN_TOOL_BUILD_ELF2PE_H_
 
-#define __dll_import(DLL, RET, FUNC, ARGS)                  \
-  extern RET(*const __msabi __attribute__((__weak__)) FUNC) \
+#define __dll_import(DLL, RET, FUNC, ARGS)                      \
+  extern RET(*const __attribute__((__ms_abi__, __weak__)) FUNC) \
       ARGS __asm__("\"dll$" DLL "$" #FUNC "\"")
 
 #endif /* COSMOPOLITAN_TOOL_BUILD_ELF2PE_H_ */
diff --git a/tool/build/fixupobj.c b/tool/build/fixupobj.c
index dfda7e877..570cd3e4c 100644
--- a/tool/build/fixupobj.c
+++ b/tool/build/fixupobj.c
@@ -31,12 +31,12 @@
 #include "libc/fmt/magnumstrs.internal.h"
 #include "libc/limits.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/serialize.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/stdckdint.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
@@ -44,7 +44,7 @@
 #include "libc/sysv/consts/msync.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "third_party/getopt/getopt.internal.h"
 
 /**
@@ -67,6 +67,8 @@ static Elf64_Ehdr *elf;
 static const char *epath;
 static Elf64_Xword symcount;
 
+#include "libc/mem/tinymalloc.inc"
+
 static wontreturn void Die(const char *reason) {
   tinyprint(2, epath, ": ", reason, "\n", NULL);
   exit(1);
@@ -243,7 +245,7 @@ static void CheckPrivilegedCrossReferences(void) {
       if (~shdr->sh_flags & SHF_EXECINSTR)
         continue;  // data reference
       if ((secname = GetElfString(elf, esize, secstrs, shdr->sh_name)) &&
-          !startswith(secname, ".privileged")) {
+          strcmp(".privileged", secname)) {
         tinyprint(2, epath,
                   ": code in .privileged section "
                   "references symbol '",
diff --git a/tool/build/freebsd2sysv.c b/tool/build/freebsd2sysv.c
index c1e79528d..a0086c28d 100644
--- a/tool/build/freebsd2sysv.c
+++ b/tool/build/freebsd2sysv.c
@@ -25,7 +25,7 @@
 #include "libc/sysv/consts/prot.h"
 
 int main(int argc, char *argv[]) {
-  int fd = open(argv[1], O_RDWR);
-  Elf64_Ehdr *e = mmap(0, 64, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  open(argv[1], O_RDWR);
+  Elf64_Ehdr *e = mmap(0, 64, PROT_READ | PROT_WRITE, MAP_SHARED, 3, 0);
   e->e_ident[EI_OSABI] = ELFOSABI_SYSV;
 }
diff --git a/tool/build/gzip.c b/tool/build/gzip.c
index c2c436cac..c26d65a73 100644
--- a/tool/build/gzip.c
+++ b/tool/build/gzip.c
@@ -71,6 +71,8 @@ const char *prog;
 char databuf[32768];
 char pathbuf[PATH_MAX];
 
+#include "libc/mem/tinymalloc.inc"
+
 wontreturn void PrintUsage(int rc, FILE *f) {
   fputs("usage: ", f);
   fputs(prog, f);
@@ -137,7 +139,6 @@ void Compress(const char *inpath) {
   FILE *input;
   gzFile output;
   int rc, errnum;
-  FILE *closeme = 0;
   const char *outpath;
   char *p, openflags[5];
   if ((!inpath || opt_usestdout) && (!isatty(1) || opt_force)) {
@@ -150,7 +151,7 @@ void Compress(const char *inpath) {
     exit(1);
   }
   if (inpath) {
-    input = closeme = fopen(inpath, "rb");
+    input = fopen(inpath, "rb");
   } else {
     inpath = "/dev/stdin";
     input = stdin;
@@ -177,9 +178,8 @@ void Compress(const char *inpath) {
   }
   if (!output) {
     fputs(outpath, stderr);
-    fputs(": gzopen() failed: ", stderr);
-    const char *s = _strerdoc(errno);
-    fputs(s ? s : "EUNKNOWN", stderr);
+    fputs(": gzopen() failed\n", stderr);
+    fputs(_strerdoc(errno), stderr);
     fputs("\n", stderr);
     exit(1);
   }
@@ -189,8 +189,7 @@ void Compress(const char *inpath) {
       errnum = 0;
       fputs(inpath, stderr);
       fputs(": read failed: ", stderr);
-      const char *s = _strerdoc(ferror(input));
-      fputs(s ? s : "EUNKNOWN", stderr);
+      fputs(_strerdoc(ferror(input)), stderr);
       fputs("\n", stderr);
       _Exit(1);
     }
@@ -202,8 +201,8 @@ void Compress(const char *inpath) {
       _Exit(1);
     }
   } while (rc == sizeof(databuf));
-  if (closeme) {
-    if (fclose(closeme)) {
+  if (input != stdin) {
+    if (fclose(input)) {
       fputs(inpath, stderr);
       fputs(": close failed\n", stderr);
       _Exit(1);
@@ -222,7 +221,6 @@ void Compress(const char *inpath) {
 void Decompress(const char *inpath) {
   FILE *output;
   gzFile input;
-  FILE *closeme = 0;
   int rc, n, errnum;
   const char *outpath;
   outpath = 0;
@@ -235,9 +233,8 @@ void Decompress(const char *inpath) {
   }
   if (!input) {
     fputs(inpath, stderr);
-    fputs(": gzopen() failed: ", stderr);
-    const char *s = _strerdoc(errno);
-    fputs(s ? s : "EUNKNOWN", stderr);
+    fputs(": gzopen() failed\n", stderr);
+    fputs(_strerdoc(errno), stderr);
     fputs("\n", stderr);
     exit(1);
   }
@@ -251,11 +248,10 @@ void Decompress(const char *inpath) {
     memcpy(pathbuf, inpath, n - 3);
     pathbuf[n - 3] = 0;
     outpath = pathbuf;
-    if (!(output = closeme = fopen(outpath, opt_append ? "wa" : "wb"))) {
+    if (!(output = fopen(outpath, opt_append ? "wa" : "wb"))) {
       fputs(outpath, stderr);
       fputs(": open failed: ", stderr);
-      const char *s = _strerdoc(errno);
-      fputs(s ? s : "EUNKNOWN", stderr);
+      fputs(_strerdoc(errno), stderr);
       fputs("\n", stderr);
       _Exit(1);
     }
@@ -277,8 +273,7 @@ void Decompress(const char *inpath) {
     if (fwrite(databuf, rc, 1, output) != 1) {
       fputs(outpath, stderr);
       fputs(": write failed: ", stderr);
-      const char *s = _strerdoc(ferror(output));
-      fputs(s ? s : "EUNKNOWN", stderr);
+      fputs(_strerdoc(ferror(output)), stderr);
       fputs("\n", stderr);
       _Exit(1);
     }
@@ -288,8 +283,8 @@ void Decompress(const char *inpath) {
     fputs(": gzclose failed\n", stderr);
     _Exit(1);
   }
-  if (closeme) {
-    if (fclose(closeme)) {
+  if (output != stdout) {
+    if (fclose(output)) {
       fputs(outpath, stderr);
       fputs(": close failed\n", stderr);
       _Exit(1);
diff --git a/tool/build/helpop.c b/tool/build/helpop.c
index e5e0297c3..9da096ca6 100644
--- a/tool/build/helpop.c
+++ b/tool/build/helpop.c
@@ -19,7 +19,7 @@
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/intrin/safemacros.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
diff --git a/tool/build/killall.c b/tool/build/killall.c
index c37102a48..75a41e5c7 100644
--- a/tool/build/killall.c
+++ b/tool/build/killall.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/dce.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/enum/formatmessageflags.h"
 #include "libc/nt/enum/lang.h"
@@ -51,6 +51,8 @@ static const char *prog;
 static char16_t **filters;
 static uint32_t pids[10000];
 
+#include "libc/mem/tinymalloc.inc"
+
 static wontreturn void PrintUsage(int rc, FILE *f) {
   fprintf(f,
           "Usage: %s [-nshv] NAME...\n"
diff --git a/tool/build/lib/ar.c b/tool/build/lib/ar.c
deleted file mode 100644
index 5e09dd15c..000000000
--- a/tool/build/lib/ar.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "tool/build/lib/ar.h"
-#include "libc/ar.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/stat.h"
-#include "libc/ctype.h"
-#include "libc/fmt/conv.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/prot.h"
-
-/**
- * @fileoverview static archive reader
- *
- * This file implements an API similar to opendir() for raeding the .o
- * files within your .a file. This works by mapping the .a file into
- * memory and then yielding pointers into the map where embedded files
- * reside, along with their decoded filenames.
- *
- * To try this library:
- *
- *     make -j o//tool/decode/ar2 o//libc/str/str.a
- *     o//tool/decode/ar2 o//libc/str/str.a
- *
- * This implementation currently dies on error. The intent is to make
- * this as simple of an abstraction as possible for simple command line
- * utilities like o//tool/build/ar. It shouldn't be considered a serious
- * general purpose library. Another thing it can't do is decode symbol
- * table information, since Cosmopolitan Ar currently doesn't need it.
- */
-
-void openar(struct Ar *ar, const char *path) {
-  memset(ar, 0, sizeof(*ar));
-  ar->path = path;
-  if ((ar->fd = open(path, O_RDONLY)) == -1) {
-    perror(ar->path);
-    exit(1);
-  }
-  if (fstat(ar->fd, &ar->st)) {
-    perror(ar->path);
-    exit(1);
-  }
-  ar->map = mmap(0, ar->st.st_size, PROT_READ, MAP_PRIVATE, ar->fd, 0);
-  if (ar->map == MAP_FAILED) {
-    perror(ar->path);
-    exit(1);
-  }
-  if (!startswith(ar->map, ARMAG)) {
-    tinyprint(2, ar->path, ": not an ar file\n", NULL);
-    exit(1);
-  }
-  ar->offset = SARMAG;
-}
-
-void closear(struct Ar *ar) {
-  if (munmap(ar->map, ar->st.st_size)) {
-    perror(ar->path);
-    exit(1);
-  }
-  if (close(ar->fd)) {
-    perror(ar->path);
-    exit(1);
-  }
-}
-
-bool readar(struct Ar *ar, struct ArFile *arf) {
-  for (;;) {
-    ar->offset += 1;
-    ar->offset &= -2;
-    if (ar->offset + sizeof(struct ar_hdr) > ar->st.st_size)
-      return false;
-
-    struct ar_hdr *hdr = (struct ar_hdr *)(ar->map + ar->offset);
-    ar->offset += sizeof(struct ar_hdr);
-
-    char ar_fmag[sizeof(hdr->ar_fmag) + 1] = {0};
-    memcpy(ar_fmag, hdr->ar_fmag, sizeof(hdr->ar_fmag));
-    if (strcmp(ar_fmag, ARFMAG)) {
-      tinyprint(2, ar->path, ": corrupt ar file fmag\n", NULL);
-      exit(1);
-    }
-
-    char ar_name[sizeof(hdr->ar_name) + 1] = {0};
-    memcpy(ar_name, hdr->ar_name, sizeof(hdr->ar_name));
-    for (int j = sizeof(hdr->ar_name) - 1 + 1; j-- && isspace(ar_name[j]);)
-      ar_name[j] = '\0';
-
-    char ar_size[sizeof(hdr->ar_size) + 1] = {0};
-    memcpy(ar_size, hdr->ar_size, sizeof(hdr->ar_size));
-    int size = atoi(ar_size);
-    if (size < 0 || ar->offset + size > ar->st.st_size) {
-      tinyprint(2, ar->path, ": ar size overlaps eof\n", NULL);
-      exit(1);
-    }
-
-    // symbol table
-    if (!strcmp(ar_name, "/")) {
-      ar->offset += size;
-      continue;
-    }
-
-    // filename table
-    if (!strcmp(ar_name, "//")) {
-      ar->filenames = ar->map + ar->offset;
-      ar->filenames_size = size;
-      ar->offset += size;
-      continue;
-    }
-
-    // get name of object file
-    size_t len;
-    const char *e;
-    if (ar_name[0] == '/') {
-      int off = atoi(ar_name + 1);
-      if (off < 0 || off >= ar->filenames_size) {
-        tinyprint(2, ar->path, ": ar filename not found\n", NULL);
-        exit(1);
-      }
-      if (!(e = memchr(ar->filenames + off, '\n', ar->filenames_size - off))) {
-        tinyprint(2, ar->path, ": ar filename overlaps end\n", NULL);
-        exit(1);
-      }
-      if ((len = e - (ar->filenames + off)) >= PATH_MAX) {
-        tinyprint(2, ar->path, ": ar filename too long\n", NULL);
-        exit(1);
-      }
-      memcpy(arf->name, ar->filenames + off, len);
-      arf->name[len] = '\0';
-      if (len && arf->name[len - 1] == '/')
-        arf->name[--len] = '\0';
-    } else if ((len = strlen(ar_name)) && ar_name[len - 1] == '/') {
-      memcpy(arf->name, ar_name, len - 1);
-      arf->name[len - 1] = '\0';
-    } else {
-      tinyprint(2, ar->path, ": unsupported ar name: ", ar_name, "\n", NULL);
-      exit(1);
-    }
-
-    // return pointer to embedded file
-    arf->size = size;
-    arf->offset = ar->offset;
-    arf->data = ar->map + ar->offset;
-    ar->offset += size;
-    return true;
-  }
-}
diff --git a/tool/build/lib/ar.h b/tool/build/lib/ar.h
deleted file mode 100644
index 350ec27a3..000000000
--- a/tool/build/lib/ar.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef COSMOPOLITAN_TOOL_BUILD_LIB_AR_H_
-#define COSMOPOLITAN_TOOL_BUILD_LIB_AR_H_
-#include "libc/calls/struct/stat.h"
-#include "libc/limits.h"
-COSMOPOLITAN_C_START_
-
-struct Ar {
-  const char *path;
-  int fd;
-  struct stat st;
-  char *map;
-  size_t offset;
-  const char *filenames;
-  size_t filenames_size;
-};
-
-struct ArFile {
-  void *data;
-  size_t size;
-  size_t offset;
-  char name[PATH_MAX];
-};
-
-void openar(struct Ar *, const char *);
-void closear(struct Ar *);
-bool readar(struct Ar *, struct ArFile *);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_TOOL_BUILD_LIB_AR_H_ */
diff --git a/tool/build/lib/buffer.c b/tool/build/lib/buffer.c
index 5fe539a56..037047f66 100644
--- a/tool/build/lib/buffer.c
+++ b/tool/build/lib/buffer.c
@@ -19,7 +19,7 @@
 #include "tool/build/lib/buffer.h"
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/arraylist2.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/stdio.h"
diff --git a/tool/build/lib/elfwriter.c b/tool/build/lib/elfwriter.c
index 32572c773..46cdd0341 100644
--- a/tool/build/lib/elfwriter.c
+++ b/tool/build/lib/elfwriter.c
@@ -26,7 +26,7 @@
 #include "libc/mem/mem.h"
 #include "libc/runtime/memtrack.internal.h"
 #include "libc/runtime/runtime.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/msync.h"
diff --git a/tool/build/lib/elfwriter_zip.c b/tool/build/lib/elfwriter_zip.c
index 886778038..dbe1562d1 100644
--- a/tool/build/lib/elfwriter_zip.c
+++ b/tool/build/lib/elfwriter_zip.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/dos.h"
+#include "libc/dos.internal.h"
 #include "libc/elf/def.h"
 #include "libc/fmt/wintime.internal.h"
 #include "libc/limits.h"
@@ -33,7 +33,7 @@
 #include "libc/time.h"
 #include "libc/x/x.h"
 #include "libc/x/xasprintf.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "net/http/http.h"
 #include "third_party/zlib/zlib.h"
 #include "tool/build/lib/elfwriter.h"
@@ -63,13 +63,6 @@ static int DetermineVersionNeededToExtract(int method) {
   }
 }
 
-static int NormalizeMode(int mode) {
-  int res = mode & S_IFMT;
-  if (mode & 0111)
-    res |= 0111;
-  return res | 0644;
-}
-
 static unsigned char *EmitZipLfileHdr(unsigned char *p, const void *name,
                                       size_t namesize, uint32_t crc,
                                       uint8_t era, uint16_t gflags,
@@ -94,10 +87,10 @@ static unsigned char *EmitZipLfileHdr(unsigned char *p, const void *name,
 static void EmitZipCdirHdr(unsigned char *p, const void *name, size_t namesize,
                            uint32_t crc, uint8_t era, uint16_t gflags,
                            uint16_t method, uint16_t mtime, uint16_t mdate,
-                           uint16_t iattrs, uint16_t unixmode, size_t compsize,
-                           size_t uncompsize, size_t commentsize,
-                           struct timespec mtim, struct timespec atim,
-                           struct timespec ctim) {
+                           uint16_t iattrs, uint16_t dosmode, uint16_t unixmode,
+                           size_t compsize, size_t uncompsize,
+                           size_t commentsize, struct timespec mtim,
+                           struct timespec atim, struct timespec ctim) {
   uint64_t mt, at, ct;
   p = WRITE32LE(p, kZipCfileHdrMagic);
   *p++ = kZipCosmopolitanVersion;
@@ -118,8 +111,8 @@ static void EmitZipCdirHdr(unsigned char *p, const void *name, size_t namesize,
   p = WRITE16LE(p, commentsize);
   p = WRITE16LE(p, 0); /* disk */
   p = WRITE16LE(p, iattrs);
-  p = WRITE16LE(p, 0);
-  p = WRITE16LE(p, NormalizeMode(unixmode));
+  p = WRITE16LE(p, dosmode);
+  p = WRITE16LE(p, unixmode);
   p = WRITE32LE(p, 0); /* RELOCATE ME (kZipCfileOffsetOffset) */
   /* 46 */
   memcpy(p, name, namesize);
@@ -149,8 +142,8 @@ void elfwriter_zip(struct ElfWriter *elf, const char *symbol, const char *cname,
   uint32_t crc;
   unsigned char *lfile, *cfile;
   struct ElfWriterSymRef lfilesym;
-  uint16_t method, gflags, mtime, mdate, iattrs;
   size_t lfilehdrsize, uncompsize, compsize, commentsize;
+  uint16_t method, gflags, mtime, mdate, iattrs, dosmode;
 
   CHECK_NE(0, mtim.tv_sec);
 
@@ -175,6 +168,7 @@ void elfwriter_zip(struct ElfWriter *elf, const char *symbol, const char *cname,
   if (S_ISREG(mode) && istext(data, size)) {
     iattrs |= kZipIattrText;
   }
+  dosmode = !(mode & 0200) ? kNtFileAttributeReadonly : 0;
   method = ShouldCompress(name, namesize, data, size, nocompress)
                ? kZipCompressionDeflate
                : kZipCompressionNone;
@@ -221,8 +215,8 @@ void elfwriter_zip(struct ElfWriter *elf, const char *symbol, const char *cname,
   elfwriter_startsection(elf, ".zip.cdir", SHT_PROGBITS, 0);
   EmitZipCdirHdr(
       (cfile = elfwriter_reserve(elf, ZIP_CFILE_HDR_SIZE + namesize)), name,
-      namesize, crc, era, gflags, method, mtime, mdate, iattrs, mode, compsize,
-      uncompsize, commentsize, mtim, atim, ctim);
+      namesize, crc, era, gflags, method, mtime, mdate, iattrs, dosmode, mode,
+      compsize, uncompsize, commentsize, mtim, atim, ctim);
   elfwriter_appendsym(elf, gc(xasprintf("%s%s", "zip+cdir:", name)),
                       ELF64_ST_INFO(STB_LOCAL, STT_OBJECT), STV_DEFAULT, 0,
                       ZIP_CFILE_HDR_SIZE + namesize);
diff --git a/tool/build/lib/eztls.c b/tool/build/lib/eztls.c
index 0e5de53ef..754d1a533 100644
--- a/tool/build/lib/eztls.c
+++ b/tool/build/lib/eztls.c
@@ -24,7 +24,7 @@
 #include "libc/fmt/itoa.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/intrin/strace.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/thread/thread.h"
 #include "libc/x/x.h"
diff --git a/tool/build/lib/getargs.c b/tool/build/lib/getargs.c
index faf5fe68d..3e31da96b 100644
--- a/tool/build/lib/getargs.c
+++ b/tool/build/lib/getargs.c
@@ -21,7 +21,7 @@
 #include "libc/calls/calls.h"
 #include "libc/errno.h"
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/sysconf.h"
 #include "libc/stdio/stdio.h"
diff --git a/tool/build/lz4toasm.c b/tool/build/lz4toasm.c
index b7a8943aa..b13a16926 100644
--- a/tool/build/lz4toasm.c
+++ b/tool/build/lz4toasm.c
@@ -21,7 +21,7 @@
 #include "libc/fmt/conv.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/kompressor.h"
@@ -29,7 +29,7 @@
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/x/x.h"
 #include "third_party/getopt/getopt.internal.h"
 
@@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
 
   fprintf(fout,
           "/\t%s -o %s -s %s %s\n"
-          "#include \"libc/macros.h\"\n"
+          "#include \"libc/macros.internal.h\"\n"
           "\n",
           argv[0], outpath, symbol, lz4path);
 
diff --git a/tool/build/mkdeps.c b/tool/build/mkdeps.c
index 982615f05..baa6ba843 100644
--- a/tool/build/mkdeps.c
+++ b/tool/build/mkdeps.c
@@ -22,10 +22,10 @@
 #include "libc/fmt/itoa.h"
 #include "libc/fmt/libgen.h"
 #include "libc/fmt/magnumstrs.internal.h"
+#include "libc/intrin/kprintf.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
-#include "libc/mem/leaks.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/crc32.h"
 #include "libc/runtime/runtime.h"
@@ -33,7 +33,7 @@
 #include "libc/stdio/append.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
@@ -146,6 +146,8 @@ static const char *buildroot;
 static const char *genroot;
 static const char *outpath;
 
+#include "libc/mem/tinymalloc.inc"
+
 static inline bool IsBlank(int c) {
   return c == ' ' || c == '\t';
 }
@@ -343,8 +345,9 @@ static const char *FindIncludePath(const char *map, size_t mapsize,
 
   // scan backwards for hash character
   for (;;) {
-    if (q == map)
+    if (q == map) {
       return 0;
+    }
     if (IsBlank(q[-1])) {
       --q;
       continue;
@@ -411,15 +414,17 @@ static void LoadRelationships(int argc, char *argv[]) {
   static char srcdirbuf[PATH_MAX];
   const char *p, *pe, *src, *path, *pathend, *srcdir, *final;
   getargs_init(&ga, argv + optind);
-  while ((src = getargs_next(&ga)))
+  while ((src = getargs_next(&ga))) {
     CreateSourceId(src);
+  }
   getargs_destroy(&ga);
   getargs_init(&ga, argv + optind);
   while ((src = getargs_next(&ga))) {
     is_assembly = endswith(src, ".s");
     srcid = GetSourceId(src);
-    if (strlcpy(srcdirbuf, src, PATH_MAX) >= PATH_MAX)
+    if (strlcpy(srcdirbuf, src, PATH_MAX) >= PATH_MAX) {
       DiePathTooLong(src);
+    }
     srcdir = dirname(srcdirbuf);
     if ((fd = open(src, O_RDONLY)) == -1) {
       if (errno == ENOENT && ga.path) {
@@ -433,14 +438,17 @@ static void LoadRelationships(int argc, char *argv[]) {
       }
       DieSys(src);
     }
-    if ((rc = lseek(fd, 0, SEEK_END)) == -1)
+    if ((rc = lseek(fd, 0, SEEK_END)) == -1) {
       DieSys(src);
+    }
     if ((size = rc)) {
       // repeatedly map to same fixed address so in order to weasel out
       // of incurring the additional overhead of all these munmap calls
-      map = mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
-      if (map == MAP_FAILED)
+      map = mmap((void *)0x311987030000, size, PROT_READ,
+                 MAP_SHARED | MAP_FIXED, fd, 0);
+      if (map == MAP_FAILED) {
         DieSys(src);
+      }
       for (p = map, pe = map + size; p < pe; ++p) {
         if (!(p = memmem(p, pe - p, "include ", 8)))
           break;
@@ -469,10 +477,12 @@ static void LoadRelationships(int argc, char *argv[]) {
           dependency = -1;
           for (long i = 0; i < systempaths.n; ++i) {
             if (!(final =
-                      __join_paths(juf, PATH_MAX, systempaths.p[i], incpath)))
+                      __join_paths(juf, PATH_MAX, systempaths.p[i], incpath))) {
               DiePathTooLong(incpath);
-            if ((dependency = GetSourceId(final)) != -1)
+            }
+            if ((dependency = GetSourceId(final)) != -1) {
               break;
+            }
           }
           if (dependency != -1) {
             AppendEdge(&edges, dependency, srcid);
@@ -496,8 +506,9 @@ static void LoadRelationships(int argc, char *argv[]) {
           dependency = GetSourceId((final = incpath));
           // let foo/bar.c say `#include "hdr.h"`
           if (dependency == -1 && !strchr(final, '/')) {
-            if (!(final = __join_paths(juf, PATH_MAX, srcdir, final)))
+            if (!(final = __join_paths(juf, PATH_MAX, srcdir, final))) {
               DiePathTooLong(incpath);
+            }
             dependency = GetSourceId(final);
           }
           if (dependency == -1) {
@@ -515,11 +526,10 @@ static void LoadRelationships(int argc, char *argv[]) {
           p = pathend + 1;
         }
       }
-      if (munmap(map, size))
-        DieSys(src);
     }
-    if (close(fd))
+    if (close(fd)) {
       DieSys(src);
+    }
   }
   getargs_destroy(&ga);
 }
@@ -530,8 +540,9 @@ static wontreturn void ShowUsage(int rc, int fd) {
 }
 
 static void AddPath(struct Paths *paths, const char *path) {
-  if (paths->n == ARRAYLEN(paths->p))
+  if (paths->n == ARRAYLEN(paths->p)) {
     Die("too many path arguments");
+  }
   paths->p[paths->n++] = path;
 }
 
@@ -546,18 +557,21 @@ static void GetOpts(int argc, char *argv[]) {
         AddPath(&systempaths, optarg);
         break;
       case 'o':
-        if (outpath)
+        if (outpath) {
           Die("multiple output paths specified");
+        }
         outpath = optarg;
         break;
       case 'r':
-        if (buildroot)
+        if (buildroot) {
           Die("multiple build roots specified");
+        }
         buildroot = optarg;
         break;
       case 'g':
-        if (genroot)
+        if (genroot) {
           Die("multiple generated roots specified");
+        }
         genroot = optarg;
         break;
       case 'n':
@@ -568,24 +582,31 @@ static void GetOpts(int argc, char *argv[]) {
         ShowUsage(1, 2);
     }
   }
-  if (optind == argc)
+  if (optind == argc) {
     Die("missing input argument");
-  if (!genroot)
+  }
+  if (!genroot) {
     genroot = "o/";
-  if (!endswith(genroot, "/"))
+  }
+  if (!endswith(genroot, "/")) {
     Die("generated output path must end with slash");
-  if (!buildroot)
+  }
+  if (!buildroot) {
     Die("need build output path");
-  if (!endswith(buildroot, "/"))
+  }
+  if (!endswith(buildroot, "/")) {
     Die("build output path must end with slash");
-  if (!startswith(buildroot, genroot))
+  }
+  if (!startswith(buildroot, genroot)) {
     Die("build output path must start with generated output path");
+  }
   if (!systempaths.n && hermetic) {
     AddPath(&systempaths, "third_party/libcxx/include/");
     AddPath(&systempaths, "libc/isystem/");
   }
-  if (systempaths.n && !hermetic)
+  if (systempaths.n && !hermetic) {
     Die("system path can only be specified in hermetic mode");
+  }
   long j = 0;
   for (long i = 0; i < systempaths.n; ++i) {
     size_t n;
@@ -598,18 +619,21 @@ static void GetOpts(int argc, char *argv[]) {
         DieSys(path);
       }
     }
-    if ((n = strlen(path)) >= PATH_MAX)
+    if ((n = strlen(path)) >= PATH_MAX) {
       DiePathTooLong(path);
-    if (!n || path[n - 1] != '/')
+    }
+    if (!n || path[n - 1] != '/') {
       Die("system path must end with slash");
+    }
   }
   systempaths.n = j;
 }
 
 static const char *StripExt(char pathbuf[hasatleast PATH_MAX], const char *s) {
   static char *dot;
-  if (strlcpy(pathbuf, s, PATH_MAX) >= PATH_MAX)
+  if (strlcpy(pathbuf, s, PATH_MAX) >= PATH_MAX) {
     DiePathTooLong(s);
+  }
   dot = strrchr(pathbuf, '.');
   if (dot)
     *dot = '\0';
@@ -637,10 +661,13 @@ static uint32_t GetFileExtension(const char *s) {
 static bool IsObjectSource(const char *name) {
   int i;
   uint32_t ext;
-  if ((ext = GetFileExtension(name)))
-    for (i = 0; i < ARRAYLEN(kSourceExts); ++i)
-      if (ext == kSourceExts[i])
+  if ((ext = GetFileExtension(name))) {
+    for (i = 0; i < ARRAYLEN(kSourceExts); ++i) {
+      if (ext == kSourceExts[i]) {
         return true;
+      }
+    }
+  }
   return false;
 }
 
@@ -709,18 +736,22 @@ int main(int argc, char *argv[]) {
   LoadRelationships(argc, argv);
   Crunch();
   makefile = Explore();
-  if (outpath && (fd = open(outpath, O_WRONLY | O_CREAT | O_TRUNC, 0644)) == -1)
+  if (outpath &&
+      (fd = open(outpath, O_WRONLY | O_CREAT | O_TRUNC, 0644)) == -1) {
     DieSys(outpath);
+  }
   n = appendz(makefile).i;
-  for (i = 0; i < n; i += (size_t)rc)
-    if ((rc = write(fd, makefile + i, n - i)) == -1)
+  for (i = 0; i < n; i += (size_t)rc) {
+    if ((rc = write(fd, makefile + i, n - i)) == -1) {
       DieSys(outpath);
-  if (outpath && close(fd))
+    }
+  }
+  if (outpath && close(fd)) {
     DieSys(outpath);
+  }
   free(makefile);
   free(edges.p);
   free(sauces);
   free(names);
-  CheckForMemoryLeaks();
   return 0;
 }
diff --git a/tool/build/mv.c b/tool/build/mv.c
index d1cc31b13..503a2855f 100644
--- a/tool/build/mv.c
+++ b/tool/build/mv.c
@@ -62,6 +62,8 @@ char linkbuf[PATH_MAX];
 
 void Mv(char *, char *);
 
+#include "libc/mem/tinymalloc.inc"
+
 wontreturn void Die(const char *path, const char *reason) {
   tinyprint(2, path, ": ", reason, "\n", NULL);
   exit(1);
diff --git a/tool/build/objbincopy.c b/tool/build/objbincopy.c
index 42427c226..ce08d0aed 100644
--- a/tool/build/objbincopy.c
+++ b/tool/build/objbincopy.c
@@ -21,8 +21,8 @@
 #include "libc/elf/elf.h"
 #include "libc/elf/struct/ehdr.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macho.h"
-#include "libc/macros.h"
+#include "libc/macho.internal.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
diff --git a/tool/build/package.c b/tool/build/package.c
index 9c4c20519..57b5de82a 100644
--- a/tool/build/package.c
+++ b/tool/build/package.c
@@ -30,7 +30,7 @@
 #include "libc/intrin/bswap.h"
 #include "libc/intrin/kprintf.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/arraylist.internal.h"
 #include "libc/mem/mem.h"
@@ -151,6 +151,8 @@ struct Relas {
   } *p;
 } prtu;
 
+#include "libc/mem/tinymalloc.inc"
+
 static wontreturn void Die(const char *path, const char *reason) {
   tinyprint(2, path, ": ", reason, "\n", NULL);
   exit(1);
diff --git a/tool/build/pledge.c b/tool/build/pledge.c
index 78f5527d5..e7335b92d 100644
--- a/tool/build/pledge.c
+++ b/tool/build/pledge.c
@@ -42,7 +42,7 @@
 #include "libc/intrin/promises.h"
 #include "libc/intrin/safemacros.h"
 #include "libc/limits.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/alloca.h"
 #include "libc/nexgen32e/kcpuids.h"
@@ -373,11 +373,7 @@ int SetLimit(int r, long lo, long hi) {
 }
 
 static int GetBaseCpuFreqMhz(void) {
-#ifdef __x86_64__
   return KCPUIDS(16H, EAX) & 0x7fff;
-#else
-  return 0;
-#endif
 }
 
 int SetCpuLimit(int secs) {
diff --git a/tool/build/renamestr.c b/tool/build/renamestr.c
deleted file mode 100644
index 1364bc6c3..000000000
--- a/tool/build/renamestr.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/elf/def.h"
-#include "libc/elf/elf.h"
-#include "libc/elf/scalar.h"
-#include "libc/elf/struct/ehdr.h"
-#include "libc/elf/struct/phdr.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/likely.h"
-#include "libc/macros.h"
-#include "libc/mem/mem.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/symbols.internal.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/prot.h"
-#include "third_party/getopt/getopt.internal.h"
-
-#define VERSION      \
-  "renamestr v0.1\n" \
-  "https://github.com/jart/cosmopolitan\n"
-
-#define MANUAL                                                \
-  " -f FROM -t TO INPUT \n"                                   \
-  "\n"                                                        \
-  "DESCRIPTION\n"                                             \
-  "\n"                                                        \
-  "  in-place string replacement in ELF binary .rodata\n"     \
-  "\n"                                                        \
-  "  this program may be used to replace strings in the\n"    \
-  "  .rodata sections of ELF binaries, in-place.\n"           \
-  "\n"                                                        \
-  "FLAGS\n"                                                   \
-  "\n"                                                        \
-  "  -h         show usage\n"                                 \
-  "\n"                                                        \
-  "  -v         show version\n"                               \
-  "\n"                                                        \
-  "  -f FROM    source string to replace\n"                   \
-  "\n"                                                        \
-  "  -t TO      target string replacement. must be shorter\n" \
-  "             than FROM string for replacement to work\n"   \
-  "\n"                                                        \
-  "  INPUT      ELF binary containing strings to replace\n"   \
-  "\n"
-
-static const char *prog;
-static const char *exepath;
-static Elf64_Shdr *rodata;
-static char *rostart;
-static char *roend;
-static int exefd;
-
-static wontreturn void Die(const char *thing, const char *reason) {
-  tinyprint(2, thing, ": ", reason, "\n", NULL);
-  exit(1);
-}
-
-static wontreturn void DieSys(const char *thing) {
-  perror(thing);
-  exit(1);
-}
-
-static wontreturn void ShowUsage(int rc, int fd) {
-  tinyprint(fd, "USAGE\n\n  ", prog, MANUAL, NULL);
-  exit(rc);
-}
-
-static void Pwrite(const void *data, size_t size, uint64_t offset) {
-  ssize_t rc;
-  const char *p, *e;
-  for (p = data, e = p + size; p < e; p += (size_t)rc, offset += (size_t)rc) {
-    if ((rc = pwrite(exefd, p, e - p, offset)) == -1) {
-      DieSys(exepath);
-    }
-  }
-}
-
-struct String {
-  const char *str;
-  size_t len;
-};
-
-struct Param {
-  struct String from;
-  struct String to;
-  int count;
-  char *roloc;
-};
-
-struct Params {
-  int n;
-  struct Param p[4];
-};
-
-static struct Params params;
-
-static void GetOpts(int argc, char *argv[]) {
-  int opt;
-  bool partial = false;
-  params.n = 0;
-  struct Param *param;
-  while ((opt = getopt(argc, argv, "hvf:t:")) != -1) {
-    if (params.n >= ARRAYLEN(params.p)) {
-      param = NULL;
-    } else {
-      param = &(params.p[params.n]);
-    }
-    switch (opt) {
-      case 'f':
-        if (!param) {
-          Die(prog, "too many replacements provided");
-        }
-        if (param->from.str) {
-          Die(prog, "from string already provided");
-        }
-        param->from.str = optarg;
-        param->from.len = strlen(optarg);
-        partial = !partial;
-        break;
-      case 't':
-        if (!param) {
-          Die(prog, "too many replacements provided");
-        }
-        if (param->to.str) {
-          Die(prog, "to string already provided");
-        }
-        param->to.str = optarg;
-        param->to.len = strlen(optarg);
-        partial = !partial;
-        break;
-      case 'v':
-        tinyprint(0, VERSION, NULL);
-        exit(0);
-      case 'h':
-        ShowUsage(0, 1);
-      default:
-        ShowUsage(1, 2);
-    }
-    if (param->from.str && param->to.str) {
-      if (param->from.len < param->to.len) {
-        Die(prog, "to.str longer than from.str, cannot replace");
-      }
-      params.n++;
-    }
-  }
-  if (params.n == 0) {
-    Die(prog, "no replacements provided");
-  }
-  if (partial) {
-    Die(prog, "partial replacement provided");
-  }
-  if (optind == argc) {
-    Die(prog, "missing input argument");
-  }
-  if (optind != argc - 1) {
-    Die(prog, "too many args");
-  }
-  exepath = argv[optind];
-}
-
-struct Input {
-  union {
-    char *map;
-    Elf64_Ehdr *elf;
-    unsigned char *umap;
-  };
-  size_t size;
-  const char *path;
-};
-
-static struct Input input;
-
-static void OpenInput(const char *path) {
-  int fd;
-  if ((fd = open(path, O_RDWR)) == -1)
-    DieSys(path);
-  if ((input.size = lseek(fd, 0, SEEK_END)) == -1)
-    DieSys(path);
-  input.path = path;
-  input.map = mmap(0, input.size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-  if (input.map == MAP_FAILED)
-    DieSys(path);
-  if (!IsElf64Binary(input.elf, input.size))
-    Die(path, "not an elf64 binary");
-  exefd = fd;
-}
-
-static void ReplaceString(struct Param *param) {
-  size_t len;
-  char *x = (char *)memchr(param->roloc, 0, roend - param->roloc);
-  memmove(param->roloc, param->to.str, param->to.len);
-  if (UNLIKELY(x == NULL)) {
-    len = roend - param->roloc;
-    memmove(param->roloc + param->to.len, param->roloc + param->from.len,
-            len - param->from.len);
-  } else {
-    len = x - param->roloc;
-    memmove(param->roloc + param->to.len, param->roloc + param->from.len,
-            len + 1 - param->from.len);
-  }
-  param->roloc += param->to.len;
-}
-
-int main(int argc, char *argv[]) {
-#ifdef MODE_DBG
-  ShowCrashReports();
-#endif
-
-  prog = argv[0];
-
-  if (!prog)
-    prog = "renamestr";
-
-  GetOpts(argc, argv);
-  OpenInput(exepath);
-  rodata = FindElfSectionByName(
-      input.elf, input.size,
-      GetElfSectionNameStringTable(input.elf, input.size), ".rodata");
-  if (!rodata)
-    Die(exepath, "doesn't have .rodata");
-
-  rostart = GetElfSectionAddress(input.elf, input.size, rodata);
-  if (!rostart)
-    Die(prog, "could not get to start of .rodata");
-  roend = rostart + rodata->sh_size;
-
-#ifdef MODE_DBG
-  kprintf("elf file to process: %s\n", exepath);
-  kprintf("file size is %ld\n", input.size);
-#endif
-  for (int i = 0; i < params.n; ++i) {
-    struct Param *param = &(params.p[i]);
-    param->roloc = rostart;
-    param->count = 0;
-#ifdef MODE_DBG
-    kprintf("need to replace '%s' with '%s'\n", param->from.str, param->to.str);
-#endif
-  }
-
-#define NEXT_ROLOC(z) \
-  memmem((z)->roloc, roend - (z)->roloc, (z)->from.str, (z)->from.len)
-  for (int i = 0; i < params.n; ++i) {
-    struct Param *param = &(params.p[i]);
-    for (param->roloc = NEXT_ROLOC(param); param->roloc != NULL;
-         param->roloc = NEXT_ROLOC(param)) {
-      ReplaceString(param);
-      param->count++;
-    }
-  }
-#undef NEXT_ROLOC
-
-  Pwrite(input.map, input.size, 0);
-  if (close(exefd)) {
-    Die(prog, "unable to close file after writing");
-  }
-
-  for (int i = 0; i < params.n; ++i) {
-    struct Param *param = &(params.p[i]);
-    printf("'%s' -> '%s': %d replacements\n", param->from.str, param->to.str,
-           param->count);
-  }
-  return 0;
-}
diff --git a/tool/build/resymbol.c b/tool/build/resymbol.c
index 4bf094ce6..f525324a2 100644
--- a/tool/build/resymbol.c
+++ b/tool/build/resymbol.c
@@ -33,6 +33,8 @@ const char *FLAG_prefix;
 const char *FLAG_suffix;
 const char *path;
 
+#include "libc/mem/tinymalloc.inc"
+
 wontreturn void PrintUsage(int fd, int exitcode) {
   tinyprint(fd, "\n\
 NAME\n\
diff --git a/tool/build/rm.c b/tool/build/rm.c
index c5a041dbc..e4a3a5077 100644
--- a/tool/build/rm.c
+++ b/tool/build/rm.c
@@ -48,6 +48,8 @@ static bool recursive;
 static bool doemptydirs;
 static const char *prog;
 
+#include "libc/mem/tinymalloc.inc"
+
 static wontreturn void PrintUsage(int rc, int fd) {
   tinyprint(fd, "USAGE\n\n  ", prog, USAGE, NULL);
   exit(rc);
diff --git a/tool/build/runit.c b/tool/build/runit.c
index 1b123f3ea..6c2a34fc5 100644
--- a/tool/build/runit.c
+++ b/tool/build/runit.c
@@ -161,12 +161,12 @@ void Connect(void) {
   CHECK_NE(-1,
            (g_sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)));
   expo = INITIAL_CONNECT_TIMEOUT;
-  deadline = timespec_add(timespec_mono(),
+  deadline = timespec_add(timespec_real(),
                           timespec_fromseconds(MAX_WAIT_CONNECT_SECONDS));
   LOGIFNEG1(sigaction(SIGALRM, &(struct sigaction){.sa_handler = OnAlarm}, 0));
   DEBUGF("connecting to %s (%hhu.%hhu.%hhu.%hhu) to run %s", g_hostname, ip4[0],
          ip4[1], ip4[2], ip4[3], g_prog);
-  struct timespec start = timespec_mono();
+  struct timespec start = timespec_real();
 TryAgain:
   alarmed = false;
   LOGIFNEG1(setitimer(
@@ -178,7 +178,7 @@ TryAgain:
   if (rc == -1) {
     if (err == EINTR) {
       expo *= 1.5;
-      if (timespec_cmp(timespec_mono(), deadline) >= 0) {
+      if (timespec_cmp(timespec_real(), deadline) >= 0) {
         FATALF("timeout connecting to %s (%hhu.%hhu.%hhu.%hhu:%d)", g_hostname,
                ip4[0], ip4[1], ip4[2], ip4[3],
                ntohs(((struct sockaddr_in *)ai->ai_addr)->sin_port));
@@ -193,7 +193,7 @@ TryAgain:
   }
   setitimer(ITIMER_REAL, &(const struct itimerval){0}, 0);
   freeaddrinfo(ai);
-  connect_latency = timespec_tomicros(timespec_sub(timespec_mono(), start));
+  connect_latency = timespec_tomicros(timespec_sub(timespec_real(), start));
 }
 
 bool Send(int tmpfd, const void *output, size_t outputsize) {
@@ -204,8 +204,7 @@ bool Send(int tmpfd, const void *output, size_t outputsize) {
   static bool once;
   static z_stream zs;
   zsize = 32768;
-  if (!(zbuf = malloc(zsize)))
-    __builtin_trap();
+  zbuf = gc(malloc(zsize));
   if (!once) {
     CHECK_EQ(Z_OK, deflateInit2(&zs, 4, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL,
                                 Z_DEFAULT_STRATEGY));
@@ -227,7 +226,6 @@ bool Send(int tmpfd, const void *output, size_t outputsize) {
       break;
     }
   } while (!zs.avail_out);
-  free(zbuf);
   return ok;
 }
 
@@ -309,7 +307,7 @@ bool Recv(char *p, int n) {
 
 int ReadResponse(void) {
   int exitcode;
-  struct timespec start = timespec_mono();
+  struct timespec start = timespec_real();
   for (;;) {
     char msg[5];
     if (!Recv(msg, 5)) {
@@ -354,7 +352,7 @@ int ReadResponse(void) {
       break;
     }
   }
-  execute_latency = timespec_tomicros(timespec_sub(timespec_mono(), start));
+  execute_latency = timespec_tomicros(timespec_sub(timespec_real(), start));
   close(g_sock);
   return exitcode;
 }
@@ -379,15 +377,11 @@ int RunOnHost(char *spec) {
   for (;;) {
     Connect();
     EzFd(g_sock);
-    struct timespec start = timespec_mono();
+    struct timespec start = timespec_real();
     err = EzHandshake2();
-    handshake_latency = timespec_tomicros(timespec_sub(timespec_mono(), start));
+    handshake_latency = timespec_tomicros(timespec_sub(timespec_real(), start));
     if (!err)
       break;
-    if (err == MBEDTLS_ERR_NET_CONN_RESET) {
-      close(g_sock);
-      continue;
-    }
     WARNF("handshake with %s:%d failed -0x%04x (%s)",  //
           g_hostname, g_runitdport, err, GetTlsError(err));
     close(g_sock);
diff --git a/tool/build/runitd.c b/tool/build/runitd.c
index 0287bd29a..577876772 100644
--- a/tool/build/runitd.c
+++ b/tool/build/runitd.c
@@ -33,7 +33,7 @@
 #include "libc/intrin/kprintf.h"
 #include "libc/log/appendresourcereport.internal.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/leaks.h"
 #include "libc/mem/mem.h"
@@ -49,7 +49,6 @@
 #include "libc/stdio/append.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
-#include "libc/stdio/sysparam.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/af.h"
 #include "libc/sysv/consts/at.h"
@@ -453,8 +452,8 @@ void *ClientWorker(void *arg) {
   char *addrstr, *origname;
   unsigned char msg[4 + 1 + 4 + 4 + 4];
 
-  ts0 = timespec_mono();
-  ts1 = timespec_mono();
+  ts0 = timespec_real();
+  ts1 = timespec_real();
 
   SetupPresharedKeySsl(MBEDTLS_SSL_IS_SERVER, g_psk);
   defer(FreeClient, client);
@@ -466,14 +465,14 @@ void *ClientWorker(void *arg) {
   addrstr = DescribeAddress(&client->addr);
   DEBUF("%s %s %s", DescribeAddress(&g_servaddr), "accepted", addrstr);
   DEBUF("it took %'zu us to handshake client",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts1)));
 
   // get the executable
-  ts1 = timespec_mono();
-  ts2 = timespec_mono();
+  ts1 = timespec_real();
+  ts2 = timespec_real();
   Recv(client, msg, sizeof(msg));
   DEBUF("it took %'zu us to receive #1",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts2)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts2)));
   if (READ32BE(msg) != RUNITD_MAGIC) {
     WARNF("%s magic mismatch!", addrstr);
     pthread_exit(0);
@@ -486,19 +485,19 @@ void *ClientWorker(void *arg) {
   filesize = READ32BE(msg + 9);
   crc = READ32BE(msg + 13);
   origname = gc(calloc(1, namesize + 1));
-  ts2 = timespec_mono();
+  ts2 = timespec_real();
   Recv(client, origname, namesize);
   DEBUF("it took %'zu us to receive #2",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts2)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts2)));
   VERBF("%s sent %#s (%'u bytes @ %#s)", addrstr, origname, filesize,
         client->tmpexepath);
   char *exedata = gc(malloc(filesize));
-  ts2 = timespec_mono();
+  ts2 = timespec_real();
   Recv(client, exedata, filesize);
   DEBUF("it took %'zu us to receive #3",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts2)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts2)));
   DEBUF("it took %'zu us to receive executable from network",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts1)));
   if (crc32_z(0, exedata, filesize) != crc) {
     WARNF("%s crc mismatch! %#s", addrstr, origname);
     pthread_exit(0);
@@ -509,7 +508,7 @@ void *ClientWorker(void *arg) {
   // condition can happen, where etxtbsy is raised by our execve
   // we're using o_cloexec so it's guaranteed to fix itself fast
   // thus we use an optimistic approach to avoid expensive locks
-  ts1 = timespec_mono();
+  ts1 = timespec_real();
   sprintf(client->tmpexepath, "o/%s.XXXXXX",
           basename(stripext(gc(strdup(origname)))));
   int exefd = openatemp(AT_FDCWD, client->tmpexepath, 0, O_CLOEXEC, 0700);
@@ -533,7 +532,7 @@ void *ClientWorker(void *arg) {
     pthread_exit(0);
   }
   DEBUF("it took %'zu us to write executable to disk",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts1)));
 
   // do the args
   int i = 0;
@@ -574,7 +573,7 @@ RetryOnEtxtbsyRaceCondition:
   posix_spawnattr_t spawnattr;
   posix_spawn_file_actions_t spawnfila;
   sigemptyset(&sigmask);
-  started = timespec_mono();
+  started = timespec_real();
   pipe2(client->pipe, O_CLOEXEC);
   posix_spawnattr_init(&spawnattr);
   posix_spawnattr_setflags(&spawnattr,
@@ -584,11 +583,11 @@ RetryOnEtxtbsyRaceCondition:
   posix_spawn_file_actions_adddup2(&spawnfila, g_bogusfd, 0);
   posix_spawn_file_actions_adddup2(&spawnfila, client->pipe[1], 1);
   posix_spawn_file_actions_adddup2(&spawnfila, client->pipe[1], 2);
-  ts1 = timespec_mono();
+  ts1 = timespec_real();
   err = posix_spawn(&client->pid, client->tmpexepath, &spawnfila, &spawnattr,
                     args, environ);
   DEBUF("it took %'zu us to call posix_spawn",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts1)));
   if (err) {
     if (err == ETXTBSY) {
       goto RetryOnEtxtbsyRaceCondition;
@@ -603,7 +602,7 @@ RetryOnEtxtbsyRaceCondition:
 
   DEBUF("communicating %s[%d]", origname, client->pid);
   struct timespec deadline =
-      timespec_add(timespec_mono(), timespec_fromseconds(DEATH_CLOCK_SECONDS));
+      timespec_add(timespec_real(), timespec_fromseconds(DEATH_CLOCK_SECONDS));
   for (;;) {
     if (g_interrupted) {
       WARNF("killing %d %s and hanging up %d due to interrupt", client->fd,
@@ -615,7 +614,7 @@ RetryOnEtxtbsyRaceCondition:
       PrintProgramOutput(client);
       pthread_exit(0);
     }
-    struct timespec now = timespec_mono();
+    struct timespec now = timespec_real();
     if (timespec_cmp(now, deadline) >= 0) {
       WARNF("killing %s (pid %d) which timed out after %d seconds", origname,
             client->pid, DEATH_CLOCK_SECONDS);
@@ -626,11 +625,11 @@ RetryOnEtxtbsyRaceCondition:
     fds[0].events = POLLIN;
     fds[1].fd = client->pipe[0];
     fds[1].events = POLLIN;
-    ts1 = timespec_mono();
-    int64_t ms = timespec_tomillis(timespec_sub(deadline, now));
-    events = poll(fds, ARRAYLEN(fds), MIN(ms, -1u));
+    ts1 = timespec_real();
+    events = poll(fds, ARRAYLEN(fds),
+                  timespec_tomillis(timespec_sub(deadline, now)));
     DEBUF("it took %'zu us to call poll",
-          timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+          timespec_tomicros(timespec_sub(timespec_real(), ts1)));
     if (events == -1) {
       if (errno == EINTR) {
         INFOF("poll interrupted");
@@ -645,10 +644,10 @@ RetryOnEtxtbsyRaceCondition:
       if (fds[0].revents) {
         int received;
         char buf[512];
-        ts1 = timespec_mono();
+        ts1 = timespec_real();
         received = mbedtls_ssl_read(&ezssl, buf, sizeof(buf));
         DEBUF("it took %'zu us to call mbedtls_ssl_read",
-              timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+              timespec_tomicros(timespec_sub(timespec_real(), ts1)));
         if (!received) {
           WARNF("%s client disconnected so killing worker %d", origname,
                 client->pid);
@@ -673,10 +672,10 @@ RetryOnEtxtbsyRaceCondition:
       }
       if (fds[1].revents) {
         char buf[512];
-        ts1 = timespec_mono();
+        ts1 = timespec_real();
         ssize_t got = read(client->pipe[0], buf, sizeof(buf));
         DEBUF("it took %'zu us to call read",
-              timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+              timespec_tomicros(timespec_sub(timespec_real(), ts1)));
         if (got == -1) {
           WARNF("got %s reading %s output", strerror(errno), origname);
           goto HangupClientAndTerminateJob;
@@ -694,10 +693,10 @@ RetryOnEtxtbsyRaceCondition:
 WaitAgain:
   DEBUF("waitpid");
   struct rusage rusage;
-  ts1 = timespec_mono();
+  ts1 = timespec_real();
   int wrc = wait4(client->pid, &wstatus, 0, &rusage);
   DEBUF("it took %'zu us to call wait4",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts1)));
   if (wrc == -1) {
     if (errno == EINTR) {
       WARNF("waitpid interrupted; killing %s pid %d", origname, client->pid);
@@ -715,7 +714,7 @@ WaitAgain:
   }
   client->pid = 0;
   int exitcode;
-  struct timespec ended = timespec_mono();
+  struct timespec ended = timespec_real();
   int64_t micros = timespec_tomicros(timespec_sub(ended, started));
   if (WIFEXITED(wstatus)) {
     if (WEXITSTATUS(wstatus)) {
@@ -750,18 +749,18 @@ WaitAgain:
     AppendResourceReport(&client->output, &rusage, "\n");
     PrintProgramOutput(client);
   }
-  ts1 = timespec_mono();
+  ts1 = timespec_real();
   SendProgramOutput(client);
   SendExitMessage(exitcode);
   mbedtls_ssl_close_notify(&ezssl);
   DEBUF("it took %'zu us to send result to client",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts1)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts1)));
   if (etxtbsy_tries > 1) {
     WARNF("encountered %d ETXTBSY race conditions spawning %s",
           etxtbsy_tries - 1, origname);
   }
   DEBUF("it took %'zu us TO DO EVERYTHING",
-        timespec_tomicros(timespec_sub(timespec_mono(), ts0)));
+        timespec_tomicros(timespec_sub(timespec_real(), ts0)));
   pthread_exit(0);
 }
 
diff --git a/tool/build/sha256sum.c b/tool/build/sha256sum.c
index 50f461791..369676127 100644
--- a/tool/build/sha256sum.c
+++ b/tool/build/sha256sum.c
@@ -25,7 +25,7 @@
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "third_party/getopt/getopt.internal.h"
 #include "third_party/mbedtls/sha256.h"
 
diff --git a/tool/build/symtab.c b/tool/build/symtab.c
index 3c3584be4..b372d6f4d 100644
--- a/tool/build/symtab.c
+++ b/tool/build/symtab.c
@@ -30,6 +30,8 @@
  * @fileoverview elf to symbol table file dump tool
  */
 
+#include "libc/mem/tinymalloc.inc"
+
 void PrintUsage(FILE *f) {
   fprintf(f, "%s%s%s\n", "usage: ", program_invocation_name,
           " [-?h] -o PATH COMDBG");
diff --git a/tool/build/zipcopy.c b/tool/build/zipcopy.c
index 9f3aa8fd0..125ab57ae 100644
--- a/tool/build/zipcopy.c
+++ b/tool/build/zipcopy.c
@@ -32,7 +32,7 @@
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "third_party/getopt/getopt.internal.h"
 
 static int infd;
diff --git a/tool/build/zipobj.c b/tool/build/zipobj.c
index 007bbfa7a..37cb3ef89 100644
--- a/tool/build/zipobj.c
+++ b/tool/build/zipobj.c
@@ -39,7 +39,7 @@
 #include "libc/sysv/consts/s.h"
 #include "libc/time.h"
 #include "libc/x/x.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "third_party/getopt/getopt.internal.h"
 #include "tool/build/lib/elfwriter.h"
 #include "tool/build/lib/stripcomponents.h"
diff --git a/tool/cosmocc/README.md b/tool/cosmocc/README.md
index 84802987e..489cee109 100644
--- a/tool/cosmocc/README.md
+++ b/tool/cosmocc/README.md
@@ -1,7 +1,7 @@
 # Cosmopolitan Toolchain
 
 This toolchain can be used to compile executables that run on Linux /
-MacOS / Windows / FreeBSD / OpenBSD 7.3 / NetBSD for both the x86_64 and
+MacOS / Windows / FreeBSD / OpenBSD / NetBSD for both the x86_64 and
 AARCH64 architectures. In addition to letting you create portable
 binaries, your toolchain is itself comprised of portable binaries,
 enabling you to have a consistent development environment that lets you
@@ -9,13 +9,13 @@ reach a broader audience from the platform(s) of your choosing.
 
 ## What's Included
 
-This toolchain bundles GCC 14.1.0, Clang 19, Cosmopolitan Libc, LLVM
-LIBCXX, LLVM compiler-rt, and LLVM OpenMP. Additional libraries were
-provided by Musl Libc, and the venerable BSDs OSes. This lets you
-benefit from the awesome modern GCC compiler with the strongest GPL
-barrier possible. The preprocessor advertises cross compilers as both
-`__COSMOCC__` and `__COSMOPOLITAN__` whereas `cosmocc` additionally
-defines `__FATCOSMOCC__`.
+This toolchain bundles GCC 14.1.0, Cosmopolitan Libc, LLVM LIBCXX, LLVM
+compiler-rt, and LLVM OpenMP. Additional libraries were provided by Musl
+Libc, and the venerable BSDs OSes. This lets you benefit from the
+awesome modern GCC compiler with the strongest GPL barrier possible. The
+preprocessor advertises cross compilers as both `__COSMOCC__` and
+`__COSMOPOLITAN__` whereas `cosmocc` additionally defines
+`__FATCOSMOCC__`.
 
 ## Getting Started
 
@@ -88,33 +88,6 @@ format used by the host system; however it's also possible to explicitly
 convert APE programs to any architectures / OS combination. For further
 details on usage, run the `assimilate -h` command.
 
-
-## Binary archive format
-
-The APE format includes another portability superpower: the ability to 
-distribute application support files WITHIN the compiled executable file. 
-This is because APE files are also mostly regular zip files! You will 
-need a copy of a compatible zip tool like the modified version of 
-Info-ZIP available here: https://cosmo.zip/pub/cosmos/bin/zip. With this 
-in hand the following command:
-
-```sh
-zip [APE file] [support_file.txt]
-```
-
-adds support_file.txt to your executable. You can see it listed within 
-the archive with `unzip -l [APE file]`. 
-
-Cosmo libc includes compatible file handling functions for accessing the 
-contents of an APE file at the special '/zip' path. So your code is now 
-able to do the following:
-
-```c
-if (access( "/zip/support_file.txt", F_OK) == 0) {
-	fprintf(stderr, "/zip/support_file.txt FOUND and can be used as an asset\n");
-}
-```
-
 ## Gotchas
 
 If you use zsh and have trouble running APE programs try `sh -c ./prog`
@@ -169,139 +142,6 @@ and AARCH64, which is K8 and ARMv8.0. You can pass architecture specific
 flags to use newer ISAs by using the `-Xx86_64` and `-Xaarch64` prefixes
 like `-Xx86_64-mssse3` and `-Xaarch64-march=armv8.2-a+dotprod`.
 
-## Flags
-
-The following supplemental flags are defined by cosmocc:
-
-- `-mcosmo` causes `_COSMO_SOURCE` to be defined. This has a similar
-  effect to defining `_GNU_SOURCE`. When you use this flag, many
-  non-standard GNU, BSD, and Cosmo Libc APIs will become visible in
-  headers, e.g. `stdlib.h` will now define `ShowCrashReports()`.
-  Including `cosmo.h` has a similar effect, however it's recommended
-  that any program that uses cosmo-specific APIs pass this flag.
-
-- `-mclang` (experimental) may be passed to the `cosmocc` command to use
-  Clang instead of GCC under the hood. This can help C++ code compile 3x
-  faster.
-
-- `-mgcc` may be passed to the `cosmocc` command to use GCC instead of
-  Clang under the hood. Since this is the default mode, this flag may be
-  used to override the effect of passing the `-mclang` flag earlier.
-
-- `-mdbg` may be passed when linking programs. It has the same effect as
-  `export MODE=dbg` in that it will cause an alternative build of the
-  Cosmopolitan Libc runtime to be linked that was built with `-O0 -g`.
-  Under the normal build mode, `--ftrace` output generated by your libc
-  is oftentimes missing important details due to inlining. If your build
-  your code with `cosmocc -O0 -mdbg` then `--ftrace` will make much more
-  sense. It's also the only way to make using GDB to troubleshoot issues
-  inside Cosmo Libc work reliably. Please be warned, this flag enables
-  some heavy-hitting runtime checks, such such lock graph validation.
-  The debug Cosmopolitan runtime is able to detect lock cycles globally
-  automatically via your normal usage of `pthread_mutex_t` and then
-  report strongly connected components with C++ symbol demangling. This
-  runtime will absolutely crash your entire process, if it helps you
-  spot a bug. For example, debug cosmo is build with UBSAN so even an
-  undiscovered yet innocent bit shift of a negative number could take
-  you down. So you wouldn't want to use this in prod very often. Please
-  note that passing `-mdbg` doesn't imply `-g -O0 -fsanitize=undefined`
-  which must be passed separately if you want your code to be compiled
-  with the same stuff as libc.
-
-- `-mtiny` may be passed when linking programs. It has the same effect
-  as `export MODE=tiny` in that it will cause an alternative build of
-  the Cosmopolitan Libc runtime to be linked that's optimized for code
-  size. In the normal build mode, the smallest possible binary size will
-  be on the order of hundreds of kb, due to heavyweight features like
-  `--ftrace` and `--strace` being part of the mandatory runtime. Those
-  features don't exist in the tiny runtime, which should produce ~147kb
-  fat binaries and ~36kb x86-only binaries. You may also use this flag
-  when compiling objects. Since there's no function tracing, using this
-  will eliminate the NOPs that get inserted into the prologues of your
-  functions to make them hookable, which also greatly reduces code size.
-  Please note that this does not specify an `-O` flag, so you may want
-  to pass `-Os` too. Please note that this mode is granted leeway to
-  trade away performance whenever possible. Functions like memmove()
-  will stop using fancy vectorization which can dramatically decrease
-  the performance of certain use cases. malloc() will no longer be
-  scalable either. Cosmo malloc() will normally perform similarly to
-  things like jemalloc. But in -mtiny mode it's protected by a GIL that
-  may cause a multithreaded C++ HTTP server that makes intense usage of
-  the STL may drop from 3.7 million requests per second to just 17k.
-  We've seen it happen. malloc() will also stop using cookies which add
-  bloat but are considered important by some people for both security
-  and reporting errors on corruption. APIs will also begin refraining
-  from detecting usage errors that are the fault of the caller, so this
-  mode isn't recommended for development. Where -mtiny truly shines is
-  when you're writing tiny programs. Particularly if they're ephemeral
-  and frequent (e.g. build tooling), because the tiny runtime needs to
-  do less work at process startup.
-
-- `-moptlinux` uses the optimized Linux-only version of Cosmopolitan
-  Libc runtime libraries. Your program will only be able to run on
-  Linux. The runtime is compiled at `-O3` although it still supports AMD
-  K8+ (c. 2003). Optimizations like red zone that wouldn't otherwise be
-  possible are enabled. Function call tracing and system call logging is
-  disabled. All the Windows polyfills go away and your binaries will be
-  significantly tinier. The `cosmocc` compiler will generate a shell
-  script with the magic `jartsr='` so you won't get unwanted attention
-  from Windows virus scanners. You're even allowed to use flags like
-  `-fomit-frame-pointer` when you use this mode. Users report optlinux
-  has helped them make the Python interpreter 5% faster, like distros,
-  optlinux will salt the earth if it gains a 1% advantage on benchmark
-  games. Therefore this mode gives you an apples-to-apples comparison
-  between cosmocc versus the gcc/clang configs used by linux distros.
-
-## Raw Toolchains
-
-The `cosmocc` and `cosmoar` programs use shell script magic to run both
-toolchains under the hood. Sometimes this magic doesn't work when you're
-building software that needs to do things like run the C preprocessor in
-aarch64 mode. In such cases, cosmocc provides x86\_64 and aarch64 only
-toolchains which give you more power and control over your builds.
-
-- `x86_64-unknown-cosmo-cc`, `x86_64-unknown-cosmo-c++`, and
-  `x86_64-linux-cosmo-as` let you build multi-OS programs that only run
-  on x86\_64. You'll need this if you want to compile complex projects
-  like Emacs and OpenSSL. These are shell scripts that help you make
-  sure your software is compiled with the correct set of flags.
-
-- `aarch64-unknown-cosmo-cc`, `aarch64-unknown-cosmo-c++`, and
-  `aarch64-linux-cosmo-as` let you build multi-OS programs that only run
-  on ARM64. You'll need this if you want to compile complex projects
-  like Emacs and OpenSSL. These are shell scripts that help you make
-  sure your software is compiled with the correct set of flags.
-
-- `aarch64-linux-cosmo-cc`, `aarch64-linux-cosmo-c++`,
-  `aarch64-linux-cosmo-as`, and `aarch64-linux-cosmo-ld` are the actual
-  compiler executables. Using these grants full control over your
-  compiler and maximum performance. This is the approach favored for
-  instance by the Cosmopolitan Mono Repo's Makefile. If you use these,
-  then you should have zero expectation of support, because you'll be
-  assuming all responsibility for knowing about all the ABI-related
-  flags your Cosmopolitan runtime requires.
-
-When you use the "unknown" OS compilers, they'll link ELF executables
-which embed an APE program image. This is so it's possible to have DWARF
-debugging data. If you say:
-
-```
-x86_64-unknown-cosmo-cc -Os -mtiny -o hello hello.c
-./hello
-x86_64-linux-cosmo-objcopy -SO binary hello hello.com
-./hello.com
-```
-
-Then you can unwap the raw stripped APE executable and get a much
-smaller file than you otherwise would using the `-s` flag.
-
-If you compile your software twice, using both the x86\_64 and aarch64
-compilers, then it's possible to link the two binaries into a single fat
-binary yourself via the `apelink` program. To understand how this
-process works, it works best if you use the `BUILDLOG` variable, to see
-how the shell script wrappers are doing it. You can also consult the
-build configs of the ahgamut/superconfigure project on GitHub.
-
 ## Troubleshooting
 
 Your `cosmocc` compiler runs a number commands under the hood. If
@@ -459,7 +299,7 @@ statements instead, so that Cosmopolitan Libc's system constants will
 work as expected. Our modifications to GNU GCC are published under the
 ISC license at <https://github.com/ahgamut/gcc/tree/portcosmo-14.1>. The
 binaries you see here were first published at
-<https://github.com/ahgamut/superconfigure/releases/tag/z0.0.60> which
+<https://github.com/ahgamut/superconfigure/releases/tag/z0.0.47> which
 is regularly updated.
 
 ## Legal
diff --git a/tool/cosmocc/bin/cosmocc b/tool/cosmocc/bin/cosmocc
index 3019d0621..5f3ee7152 100755
--- a/tool/cosmocc/bin/cosmocc
+++ b/tool/cosmocc/bin/cosmocc
@@ -75,38 +75,10 @@ elif [ ! -d "$TMPDIR" ]; then
   fi
 fi
 
-use_gcc() {
-  CLANG=0
-  CC_X86_64="$BIN/x86_64-linux-cosmo-gcc"
-  CC_AARCH64="$BIN/aarch64-linux-cosmo-gcc"
-  CXX_X86_64="$BIN/x86_64-linux-cosmo-g++"
-  CXX_AARCH64="$BIN/aarch64-linux-cosmo-g++"
-  TARGET_X86_64=
-  TARGET_AARCH64=
-  FPORTCOSMO="-fportcosmo"
-  FNO_INLINE_FUNCTIONS_CALLED_ONCE="-fno-inline-functions-called-once"
-}
-
-use_clang() {
-  CLANG=1
-  CC_X86_64="$BIN/../libexec/clang"
-  CC_AARCH64="$BIN/../libexec/clang"
-  CXX_X86_64="$BIN/../libexec/clang"
-  CXX_AARCH64="$BIN/../libexec/clang"
-  TARGET_X86_64="--target=x86_64"
-  TARGET_AARCH64="--target=aarch64"
-  FPORTCOSMO=
-  FNO_INLINE_FUNCTIONS_CALLED_ONCE=
-}
-
-use_gcc
-
-X=
 OPT=
 ARGS=
 FLAGS=
 OUTPUT=
-NEED_X=
 MDFLAG=0
 MCOSMO=0
 INTENT=ld
@@ -119,11 +91,11 @@ FLAGS_AARCH64=
 INPUT_FILE_COUNT=0
 DEPENDENCY_OUTPUT=
 NEED_DEPENDENCY_OUTPUT=
-
 for x; do
   if [ x"$x" != x"${x#* }" ]; then
     fatal_error "arguments containing spaces unsupported: $x"
-  elif [ -n "$NEED_OUTPUT" ]; then
+  fi
+  if [ -n "$NEED_OUTPUT" ]; then
     NEED_OUTPUT=
     OUTPUT=$x
     continue
@@ -137,10 +109,6 @@ for x; do
   elif [ -n "$NEED_EQUAL" ]; then
     x="${NEED_EQUAL}=${x}"
     NEED_EQUAL=
-  elif [ -n "$NEED_X" ]; then
-    NEED_X=
-    X=$x
-    x="-x${x}"
   elif [ x"$x" = x"-" ] ||           # is alias for stdin
        [ x"$x" = x"${x#-*}" ]; then  # !startswith(x, "-")
     if [ x"$x" != x"${x%.s}" ] ||
@@ -168,25 +136,10 @@ for x; do
   elif [ x"$x" != x"${x#-MF}" ]; then  # startswith(x, "-MF")
     DEPENDENCY_OUTPUT=${x#-MF}
     continue
-  elif [ x"$x" = x"-MQ" ]; then
-    NEED_DEPENDENCY_OUTPUT=1
-    continue
-  elif [ x"$x" = x"-Wl,--version" ]; then
-    cat <<EOF
-GNU ld (GNU Binutils) 2.42
-Copyright (C) 2024 Free Software Foundation, Inc.
-This program is free software; you may redistribute it under the terms of
-the GNU General Public License version 3 or (at your option) a later version.
-This program has absolutely no warranty.
-EOF
-    exit
   elif [ x"$x" != x"${x#-O}" ]; then  # startswith(x, "-O")
     OPT=$x
   elif [ x"$x" = x"-c" ]; then
-    if [ x"$INTENT" != x"cpp" ]; then
-      INTENT=cc
-    fi
-    continue
+    INTENT=cc
   elif [ x"$x" = x"-E" ] ||
        [ x"$x" = x"-M" ] ||
        [ x"$x" = x"-MM" ]; then
@@ -203,23 +156,6 @@ EOF
   elif [ x"$x" = x"-mcosmo" ]; then
     MCOSMO=1
     continue
-  elif [ x"$x" = x"-mdbg" ]; then
-    MODE=dbg
-    continue
-  elif [ x"$x" = x"-mtiny" ]; then
-    MODE=tiny
-    continue
-  elif [ x"$x" = x"-moptlinux" ]; then
-    MODE=optlinux
-    continue
-  elif [ x"$x" = x"-mclang" ]; then
-    use_clang
-    continue
-  elif [ x"$x" = x"-mgcc" ]; then
-    use_gcc
-    continue
-  elif [ x"$x" = x"-m64" ]; then
-    continue
   elif [ x"$x" = x"-fomit-frame-pointer" ]; then
     # Quoth Apple: "The frame pointer register must always address a
     # valid frame record. Some functions — such as leaf functions or
@@ -265,12 +201,8 @@ EOF
   elif [ x"$x" = x"-dumpversion" ]; then
     echo $GCC_VERSION
     Exit 0
-  elif [ x"$x" = x"-x" ]; then
-    NEED_X=1
-    continue
-  elif [ x"$x" != x"${x#-x}" ]; then
-    X=${x#-x}
-  elif [ x"$x" = x"-e" ] ||
+  elif [ x"$x" = x"-x" ] ||
+       [ x"$x" = x"-e" ] ||
        [ x"$x" = x"-z" ] ||
        [ x"$x" = x"-T" ] ||
        [ x"$x" = x"-L" ] ||
@@ -295,116 +227,56 @@ EOF
   ARGS="$ARGS $x"
 done
 
-# precompiled header mode
-if [ $INTENT != cpp ]; then
-  if [ -z "$X" ]; then
-    ONLY_HEADER_INPUTS=1
-    for x in $ARGS; do
-      if [ x"$x" = x"${x#-*}" ] &&       # !startswith(x, "-")
-         [ x"$x" = x"${x%.h}" ] &&       # !endswith(x, ".h")
-         [ x"$x" = x"${x%.hpp}" ]; then  # !endswith(x, ".hpp")
-        ONLY_HEADER_INPUTS=0
-        break
-      fi
-    done
-    if [ $ONLY_HEADER_INPUTS -eq 1 ]; then
-      INTENT=gch
-    fi
-  elif [ x"$X" = x"c-header" ] ||
-       [ x"$X" = x"c++-header" ]; then
-    INTENT=gch
-  fi
-fi
-if [ $INTENT = gch ]; then
-  fatal_error "precompiled headers only supported with ARCH-unknown-cosmo-cc compilers"
-fi
-
-# check for incorrect usage
 if [ $INPUT_FILE_COUNT -eq 0 ]; then
   fatal_error "no input files"
-elif [ -n "$OUTPUT" ] && [ $INPUT_FILE_COUNT -gt 1 ]; then
-  if [ $INTENT = cc ] || [ $INTENT = cpp ]; then
-    fatal_error "cannot specify '-o' with '-c' or '-E' with multiple files"
-  fi
-fi
-
-if [ $INTENT = ld ]; then
-  use_gcc
+elif [ -z "$INPUT" ] &&
+     [ $INTENT != ld ] &&
+     [ $INPUT_FILE_COUNT -gt 1 ]; then
+  fatal_error "cannot specify '-o' with '-c', or '-E' with multiple files"
 fi
 
 PLATFORM="-D__COSMOPOLITAN__ -D__COSMOCC__ -D__FATCOSMOCC__"
 PREDEF="-include libc/integral/normalize.inc"
-CPPFLAGS="-fno-pie -nostdinc -isystem $BIN/../include"
-CFLAGS="$FPORTCOSMO -fno-semantic-interposition"
+CPPFLAGS="-fno-pie -nostdinc -isystem $BIN/../include -Wno-implicit-int"
+CFLAGS="-fportcosmo -fno-dwarf2-cfi-asm -fno-unwind-tables -fno-asynchronous-unwind-tables -fno-semantic-interposition"
 LDFLAGS="-static -nostdlib -no-pie -fuse-ld=bfd -Wl,-z,noexecstack -Wl,-z,norelro -Wl,--gc-sections"
 PRECIOUS="-fno-omit-frame-pointer"
 
 # these features screw with backtraces so avoid them
-if [ x"$OPT" != x"-Os" ] && [ x"$MODE" != x"tiny" ] && [ x"$MODE" != x"optlinux" ]; then
+if [ x"$OPT" != x"-Os" ] && [ x"$MODE" != x"tiny" ]; then
   CFLAGS="$CFLAGS -fno-optimize-sibling-calls -mno-omit-leaf-frame-pointer"
 fi
-if [ x"$OPT" != x"-O3" ] && [ x"$MODE" != x"optlinux" ]; then
-  if [ $CLANG -eq 0 ]; then
-    CFLAGS="$CFLAGS -fno-schedule-insns2"
-  fi
+if [ x"$OPT" != x"-O3" ]; then
+  CFLAGS="$CFLAGS -fno-schedule-insns2"
 fi
 
-if [ x"$X" = x"c" ] || [ x"$X" = x"c-header" ]; then
-  CPLUSPLUS=0
-elif [ x"$X" = x"c++" ] || [ x"$X" = x"c++-header" ]; then
-  CPLUSPLUS=1
-elif [ x"$PROG" != x"${PROG%++}" ]; then
-  CPLUSPLUS=1
-else
-  CPLUSPLUS=0
-fi
-
-if [ $CPLUSPLUS -eq 1 ]; then
-  CC_X86_64=$CXX_X86_64
-  CC_AARCH64=$CXX_AARCH64
+CC_X86_64="$BIN/x86_64-linux-cosmo-gcc"
+CC_AARCH64="$BIN/aarch64-linux-cosmo-gcc"
+if [ x"$PROG" != x"${PROG%++}" ]; then
+  CC_X86_64="$BIN/x86_64-linux-cosmo-g++"
+  CC_AARCH64="$BIN/aarch64-linux-cosmo-g++"
+  CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -fuse-cxa-atexit"
   CPPFLAGS="-isystem $BIN/../include/third_party/libcxx $CPPFLAGS"
-else
-  CFLAGS="$CFLAGS -Wno-implicit-int"
 fi
 
-if [ x"$MODE" = x"dbg" ]; then
-  LIB_X86_64="$BIN/../x86_64-linux-cosmo/lib/dbg"
-  LIB_AARCH64="$BIN/../aarch64-linux-cosmo/lib/dbg"
-elif [ x"$MODE" = x"tiny" ]; then
-  LIB_X86_64="$BIN/../x86_64-linux-cosmo/lib/tiny"
-  LIB_AARCH64="$BIN/../aarch64-linux-cosmo/lib/tiny"
-elif [ x"$MODE" = x"optlinux" ]; then
-  LIB_X86_64="$BIN/../x86_64-linux-cosmo/lib/optlinux"
-  LIB_AARCH64="$BIN/../aarch64-linux-cosmo/lib/optlinux"
-else
-  LIB_X86_64="$BIN/../x86_64-linux-cosmo/lib"
-  LIB_AARCH64="$BIN/../aarch64-linux-cosmo/lib"
-fi
-
-CRT_X86_64="$LIB_X86_64/ape.o $LIB_X86_64/crt.o"
-CPPFLAGS_X86_64="$CPPFLAGS"
+CRT_X86_64="$BIN/../x86_64-linux-cosmo/lib/ape.o $BIN/../x86_64-linux-cosmo/lib/crt.o"
+CPPFLAGS_X86_64="$CPPFLAGS -mno-red-zone"
 CFLAGS_X86_64="$CFLAGS -mno-tls-direct-seg-refs"
-LDFLAGS_X86_64="$LDFLAGS -L$LIB_X86_64 -L$BIN/../x86_64-linux-cosmo/lib -Wl,-T,$LIB_X86_64/ape.lds -Wl,-z,common-page-size=4096 -Wl,-z,max-page-size=16384"
+LDFLAGS_X86_64="$LDFLAGS -L$BIN/../x86_64-linux-cosmo/lib -Wl,-T,$BIN/../x86_64-linux-cosmo/lib/ape.lds -Wl,-z,common-page-size=4096 -Wl,-z,max-page-size=16384"
 LDLIBS_X86_64="-lcosmo"
 
-CRT_AARCH64="$LIB_AARCH64/crt.o"
+CRT_AARCH64="$BIN/../aarch64-linux-cosmo/lib/crt.o"
 CPPFLAGS_AARCH64="$CPPFLAGS -fsigned-char"
-CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28"
-LDFLAGS_AARCH64="$LDFLAGS -L$LIB_AARCH64 -L$BIN/../aarch64-linux-cosmo/lib -Wl,-T,$LIB_AARCH64/aarch64.lds -Wl,-z,common-page-size=16384 -Wl,-z,max-page-size=16384"
+CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
+LDFLAGS_AARCH64="$LDFLAGS -L$BIN/../aarch64-linux-cosmo/lib -Wl,-T,$BIN/../aarch64-linux-cosmo/lib/aarch64.lds -Wl,-z,common-page-size=16384 -Wl,-z,max-page-size=16384"
 LDLIBS_AARCH64="-lcosmo"
 
-SUPPORT_VECTOR=-1
-if [ x"$MODE" = x"optlinux" ]; then
-  CPPFLAGS_X86_64="$CPPFLAGS_X86_64 -mno-red-zone"
-  SUPPORT_VECTOR=linux
+if [ x"$OPT" != x"-Os" ] && [ x"$MODE" != x"tiny" ]; then
+  CFLAGS_X86_64="${CFLAGS_X86_64} -fpatchable-function-entry=18,16 -fno-inline-functions-called-once -DFTRACE -DSYSDEBUG"
+  CFLAGS_AARCH64="${CFLAGS_AARCH64} -fpatchable-function-entry=7,6 -fno-inline-functions-called-once -DFTRACE -DSYSDEBUG"
 fi
 
-if [ x"$OPT" != x"-Os" ] && [ x"$MODE" != x"tiny" ] && [ x"$MODE" != x"optlinux" ]; then
-  CFLAGS_X86_64="${CFLAGS_X86_64} -fpatchable-function-entry=18,16 $FNO_INLINE_FUNCTIONS_CALLED_ONCE -DFTRACE -DSYSDEBUG"
-  CFLAGS_AARCH64="${CFLAGS_AARCH64} -fpatchable-function-entry=7,6 $FNO_INLINE_FUNCTIONS_CALLED_ONCE -DFTRACE -DSYSDEBUG"
-fi
-
-if [ $CPLUSPLUS -eq 1 ]; then
+if [ x"$PROG" != x"${PROG%++}" ]; then
   LDLIBS_X86_64="-lcxx ${LDLIBS_X86_64}"
   LDLIBS_AARCH64="-lcxx ${LDLIBS_AARCH64}"
 fi
@@ -420,9 +292,6 @@ if [ $INTENT = cpp ]; then
   if [ -n "$OUTPUT" ]; then
     ARGS="$ARGS -o$OUTPUT"
   fi
-  # undefine cpu-specific and linux-specific defines
-  # we get rid of long double too to not lead astray
-  # we shall leave behind unix, __unix, and __unix__
   set -- \
       "$CC_X86_64" \
       -U__k8 \
@@ -431,21 +300,10 @@ if [ $INTENT = cpp ]; then
       -U__amd64__ \
       -U__x86_64 \
       -U__x86_64__ \
-      -U__MMX__ \
-      -U__MMX_WITH_SSE__ \
-      -U__SSE_MATH__ \
-      -U__SEG_FS \
-      -U__SEG_GS \
       -U__SSE__ \
-      -U__FXSR__ \
       -U__SSE2__ \
       -U__SSE2_MATH__ \
-      -Ulinux \
-      -U__linux \
-      -U__linux__ \
-      -U__gnu_linux__ \
       -mno-red-zone \
-      -mlong-double-64 \
       $PLATFORM \
       $CPPFLAGS \
       $ARGS
@@ -495,7 +353,6 @@ build_object() {
   (
     set -- \
         "$CC_X86_64" \
-        $TARGET_X86_64 \
         -o"$OUTPUT_X86_64" \
         $PLATFORM \
         $PREDEF \
@@ -517,7 +374,6 @@ build_object() {
   (
     set -- \
         "$CC_AARCH64" \
-        $TARGET_AARCH64 \
         -o"$OUTPUT_AARCH64" \
         $PLATFORM \
         $PREDEF \
@@ -624,7 +480,6 @@ TEMP_FILES="${TEMP_FILES} $out2"
 (
   set -- \
       "$CC_X86_64" \
-      $TARGET_X86_64 \
       -o"$OUTPUT_X86_64"\
       $CRT_X86_64 \
       $LDFLAGS_X86_64 \
@@ -642,7 +497,6 @@ pid1=$!
 (
   set -- \
       "$CC_AARCH64" \
-      $TARGET_AARCH64 \
       -o"$OUTPUT_AARCH64"\
       $CRT_AARCH64 \
       $LDFLAGS_AARCH64 \
@@ -670,7 +524,6 @@ fi
 
 set -- \
 "$BIN/apelink" \
-  -V "$SUPPORT_VECTOR" \
   -l "$BIN/ape-x86_64.elf" \
   -l "$BIN/ape-aarch64.elf" \
   -M "$BIN/ape-m1.c" \
@@ -681,12 +534,10 @@ set -- \
 log_command "$@"
 "$@" || Exit
 
-if [ x"$MODE" != "optlinux" ]; then
-  set -- \
-  "$BIN/pecheck" "$OUTPUT"
-  log_command "$@"
-  "$@" || Exit
-fi
+set -- \
+"$BIN/pecheck" "$OUTPUT"
+log_command "$@"
+"$@" || Exit
 
 if [ $INTENT = ld ] && [ $SAVE_TEMPS -eq 0 ]; then
   mv -f "$OUTPUT_X86_64" "${OUTPUT%.com}.com.dbg" || Exit
diff --git a/tool/cosmocc/bin/cosmocross b/tool/cosmocc/bin/cosmocross
index 357239920..8c3ff5bc8 100755
--- a/tool/cosmocc/bin/cosmocross
+++ b/tool/cosmocc/bin/cosmocross
@@ -47,8 +47,8 @@ log_command() {
 ORIGINAL="$0 $*"
 PLATFORM="-D__COSMOPOLITAN__ -D__COSMOCC__"
 PREDEF="-include libc/integral/normalize.inc"
-CFLAGS="-fportcosmo -fno-semantic-interposition"
-CPPFLAGS="-fno-pie -nostdinc -isystem $BIN/../include"
+CFLAGS="-fportcosmo -fno-dwarf2-cfi-asm -fno-unwind-tables -fno-asynchronous-unwind-tables -fno-semantic-interposition"
+CPPFLAGS="-fno-pie -nostdinc -isystem $BIN/../include -Wno-implicit-int"
 LDFLAGS="-static -no-pie -nostdlib -fuse-ld=bfd -Wl,-z,noexecstack"
 APEFLAGS="-Wl,--gc-sections"
 PRECIOUS="-fno-omit-frame-pointer"
@@ -59,79 +59,35 @@ if [ x"$ARCH" = x"$PROG" ]; then
   fatal_error "cosmocross must be run via cross compiler"
 fi
 
-X=
-NEED_X=
-for x; do
-  if [ -n "$NEED_X" ]; then
-    NEED_X=
-    X=$x
-  elif [ x"$x" = x"-x" ]; then
-    NEED_X=1
-  elif [ x"$x" != x"${x#-x}" ]; then
-    X=${x#-x}
-  elif [ x"$x" = x"-mtiny" ]; then
-    MODE=tiny
-  elif [ x"$x" = x"-mdbg" ]; then
-    MODE=dbg
-  elif [ x"$x" = x"-moptlinux" ]; then
-    MODE=optlinux
-  fi
-done
-
-if [ x"$MODE" = x"dbg" ]; then
-  LIB="$BIN/../$ARCH-linux-cosmo/lib/dbg"
-elif [ x"$MODE" = x"tiny" ]; then
-  LIB="$BIN/../$ARCH-linux-cosmo/lib/tiny"
-elif [ x"$MODE" = x"optlinux" ]; then
-  LIB="$BIN/../$ARCH-linux-cosmo/lib/optlinux"
-else
-  LIB="$BIN/../$ARCH-linux-cosmo/lib"
-fi
-
-if [ x"$X" = x"c" ] || [ x"$X" = x"c-header" ]; then
-  CPLUSPLUS=0
-elif [ x"$X" = x"c++" ] || [ x"$X" = x"c++-header" ]; then
-  CPLUSPLUS=1
-elif [ x"$PROG" != x"${PROG%++}" ]; then
-  CPLUSPLUS=1
-else
-  CPLUSPLUS=0
-fi
-
 CC="$BIN/$ARCH-linux-cosmo-gcc"
-CRT="$LIB/crt.o"
+CRT="$BIN/../$ARCH-linux-cosmo/lib/crt.o"
 LDLIBS="-lcosmo"
 if [ -z "$COSMOS" ]; then
-  LDFLAGS="$LDFLAGS -L$LIB -L$BIN/../$ARCH-linux-cosmo/lib"
+    LDFLAGS="$LDFLAGS -L$BIN/../$ARCH-linux-cosmo/lib"
 else
-  LDFLAGS="$LDFLAGS -L$COSMOS/lib -L$LIB -L$BIN/../$ARCH-linux-cosmo/lib"
-  CPPFLAGS="$CPPFLAGS -I$COSMOS/include"
+    LDFLAGS="$LDFLAGS -L$COSMOS/lib -L$BIN/../$ARCH-linux-cosmo/lib"
+    CPPFLAGS="$CPPFLAGS -I$COSMOS/include"
 fi
-if [ $CPLUSPLUS -eq 1 ]; then
+if [ x"$PROG" != x"${PROG%++}" ]; then
   CC="$BIN/$ARCH-linux-cosmo-g++"
+  CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -fuse-cxa-atexit"
   CPPFLAGS="-isystem $BIN/../include/third_party/libcxx $CPPFLAGS"
   LDLIBS="-lcxx $LDLIBS"
-else
-  CFLAGS="$CFLAGS -Wno-implicit-int"
 fi
 
 PAGESZ=4096
 if [ x"$ARCH" = x"x86_64" ]; then
   OBJCOPYFLAGS="-S -O binary"
-  CRT="$LIB/ape-no-modify-self.o $CRT"
+  CRT="$BIN/../$ARCH-linux-cosmo/lib/ape-no-modify-self.o $CRT"
+  CPPFLAGS="$CPPFLAGS -mno-red-zone"
   CFLAGS="$CFLAGS -mno-tls-direct-seg-refs"
-  LDFLAGS="$LDFLAGS -Wl,-T,$LIB/ape.lds"
-  if [ x"$MODE" = x"optlinux" ]; then
-    CPPFLAGS="$CPPFLAGS -mred-zone"
-  else
-    CPPFLAGS="$CPPFLAGS -mno-red-zone"
-  fi
+  LDFLAGS="$LDFLAGS -Wl,-T,$BIN/../$ARCH-linux-cosmo/lib/ape.lds"
 elif [ x"$ARCH" = x"aarch64" ]; then
   OBJCOPYFLAGS="-S"
   PAGESZ=16384
   CPPFLAGS="$CPPFLAGS -fsigned-char"
-  CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28"
-  LDFLAGS="$LDFLAGS -Wl,-T,$LIB/aarch64.lds"
+  CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
+  LDFLAGS="$LDFLAGS -Wl,-T,$BIN/../$ARCH-linux-cosmo/lib/aarch64.lds"
 else
   fatal_error "$ARCH: unsupported architecture"
 fi
@@ -184,14 +140,6 @@ for x; do
   elif [ x"$x" = x"-mcosmo" ]; then
     CPPFLAGS="$CPPFLAGS -D_COSMO_SOURCE"
     continue
-  elif [ x"$x" = x"-mdbg" ]; then
-    continue
-  elif [ x"$x" = x"-mtiny" ]; then
-    continue
-  elif [ x"$x" = x"-moptlinux" ]; then
-    continue
-  elif [ x"$x" = x"-m64" ]; then
-    continue
   elif [ x"$x" != x"${x#-o}" ]; then
     OUTPUT=${x#-o}
   elif [ x"$x" = x"-fpic" ]; then
@@ -237,30 +185,8 @@ if [ $RELOCATABLE -eq 1 ]; then
   LDFLAGS="$LDFLAGS -r"
 fi
 
-# precompiled header mode
-if [ $INTENT != cpp ]; then
-  if [ -z "$X" ]; then
-    ONLY_HEADER_INPUTS=1
-    for x; do
-      if [ x"$x" = x"${x#-*}" ] &&       # !startswith(x, "-")
-         [ x"$x" = x"${x%.h}" ] &&       # !endswith(x, ".h")
-         [ x"$x" = x"${x%.hpp}" ]; then  # !endswith(x, ".hpp")
-        ONLY_HEADER_INPUTS=0
-        break
-      fi
-    done
-    if [ $ONLY_HEADER_INPUTS -eq 1 ]; then
-      INTENT=h
-    fi
-  elif [ x"$X" = x"c-header" ] ||
-       [ x"$X" = x"c++-header" ]; then
-    INTENT=h
-  fi
-fi
-
 # support --ftrace unless optimizing for size
 if [ x"$OPT" != x"-Os" ] &&                # $OPT != -Os
-   [ x"$MODE" != x"optlinux" ] &&          # $MODE not optlinux
    [ x"${MODE%tiny}" = x"${MODE}" ]; then  # $MODE not in (tiny, aarch64-tiny)
   if [ x"$ARCH" = x"x86_64" ]; then
     CFLAGS="$CFLAGS -fpatchable-function-entry=18,16 -fno-inline-functions-called-once"
@@ -271,17 +197,16 @@ fi
 
 # maximize frame pointers unless optimizing for size
 if [ x"$OPT" != x"-Os" ] &&               # $OPT != "-Os"
-   [ x"$MODE" != x"optlinux" ] &&         # $MODE not optlinux
    [ x"$MODE" != x"${MODE%tiny}" ]; then  # endswith($MODE, "tiny")
   CFLAGS="$CFLAGS -fno-optimize-sibling-calls -mno-omit-leaf-frame-pointer"
 fi
-if [ x"$OPT" != x"-O3" ] && [ x"$MODE" != x"optlinux" ]; then
+if [ x"$OPT" != x"-O3" ]; then
   CFLAGS="$CFLAGS -fno-schedule-insns2"
 fi
 
 if [ $INTENT = cpp ]; then
   set -- "$CC" $PLATFORM $CPPFLAGS "$@"
-elif [ $INTENT = cc ] || [ $INTENT = s ] || [ $INTENT = h ]; then
+elif [ $INTENT = cc ] || [ $INTENT = s ]; then
   set -- "$CC" $PLATFORM $PREDEF $CFLAGS $CPPFLAGS "$@" $PRECIOUS
 else
   set -- "$CC" $PLATFORM $PREDEF $CFLAGS $CPPFLAGS $CRT "$@" $LDFLAGS $LDLIBS $PRECIOUS
diff --git a/tool/cosmocc/bin/cosmoranlib b/tool/cosmocc/bin/cosmoranlib
deleted file mode 100755
index b1015508b..000000000
--- a/tool/cosmocc/bin/cosmoranlib
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-exec x86_64-linux-cosmo-ranlib "$@"
diff --git a/tool/cosmocc/bin/unknown-unknown-cosmo-ranlib b/tool/cosmocc/bin/unknown-unknown-cosmo-ranlib
deleted file mode 120000
index 0f514249d..000000000
--- a/tool/cosmocc/bin/unknown-unknown-cosmo-ranlib
+++ /dev/null
@@ -1 +0,0 @@
-cosmoranlib
\ No newline at end of file
diff --git a/tool/cosmocc/package.sh b/tool/cosmocc/package.sh
index 9c9ada64a..59b2dc44d 100755
--- a/tool/cosmocc/package.sh
+++ b/tool/cosmocc/package.sh
@@ -15,49 +15,16 @@ mode() {
   esac
 }
 
-_nproc() {
-  case $(uname -s) in
-    Darwin) sysctl -n hw.logicalcpu ;;
-    *)      nproc                   ;;
-  esac
-}
-
-TMPDIR=${TMPDIR:-/tmp}
 OUTDIR=${1:-cosmocc}
 APELINK=o/$(mode)/tool/build/apelink
 AMD64=${2:-x86_64}
 ARM64=${3:-aarch64}
-NPROC=$(($(_nproc)/2))
 GCCVER=14.1.0
 
-if ! MAKE=$(command -v gmake); then
-  if ! MAKE=$(command -v make); then
-    echo please install gnu make >&2
-    exit 1
-  fi
-fi
-
-$MAKE -j$NPROC m= \
+make -j64 m= \
   $APELINK
 
-if ! APE=$(command -v ape); then
-  case $(uname -s) in
-    Darwin)
-      case $(mode) in
-        aarch64)
-          cc -O -o "$TMPDIR/ape.$$" .cosmocc/current/bin/ape-m1.c || exit
-          trap 'rm "$TMPDIR/ape.$$"' EXIT
-          APE=$TMPDIR/ape.$$
-        ;;
-        *) APE=.cosmocc/current/bin/ape-x86_64.macho ;;
-      esac
-      ;;
-    *) APE=.cosmocc/current/bin/ape-$(uname -m).elf ;;
-  esac
-fi
-stat $APE
-
-$MAKE -j$NPROC m=$AMD64 \
+make -j64 m=$AMD64 \
   o/cosmocc.h.txt \
   o/$AMD64/ape/ape.lds \
   o/$AMD64/libc/crt/crt.o \
@@ -94,46 +61,12 @@ $MAKE -j$NPROC m=$AMD64 \
   o/$AMD64/third_party/make/make.dbg \
   o/$AMD64/third_party/ctags/ctags.dbg
 
-$MAKE -j$NPROC m=$AMD64-tiny \
-  o/cosmocc.h.txt \
-  o/$AMD64-tiny/ape/ape.lds \
-  o/$AMD64-tiny/libc/crt/crt.o \
-  o/$AMD64-tiny/ape/ape.elf \
-  o/$AMD64-tiny/ape/ape.macho \
-  o/$AMD64-tiny/ape/ape.o \
-  o/$AMD64-tiny/ape/ape-copy-self.o \
-  o/$AMD64-tiny/ape/ape-no-modify-self.o \
-  o/$AMD64-tiny/cosmopolitan.a \
-  o/$AMD64-tiny/third_party/libcxx/libcxx.a \
-
-$MAKE -j$NPROC m=$AMD64-dbg \
-  o/cosmocc.h.txt \
-  o/$AMD64-dbg/ape/ape.lds \
-  o/$AMD64-dbg/libc/crt/crt.o \
-  o/$AMD64-dbg/ape/ape.elf \
-  o/$AMD64-dbg/ape/ape.macho \
-  o/$AMD64-dbg/ape/ape.o \
-  o/$AMD64-dbg/ape/ape-copy-self.o \
-  o/$AMD64-dbg/ape/ape-no-modify-self.o \
-  o/$AMD64-dbg/cosmopolitan.a \
-  o/$AMD64-dbg/third_party/libcxx/libcxx.a \
-
-$MAKE CONFIG_TARGET_ARCH= -j$NPROC m=$AMD64-optlinux \
-  o/cosmocc.h.txt \
-  o/$AMD64-optlinux/ape/ape.lds \
-  o/$AMD64-optlinux/libc/crt/crt.o \
-  o/$AMD64-optlinux/ape/ape.elf \
-  o/$AMD64-optlinux/ape/ape.macho \
-  o/$AMD64-optlinux/ape/ape.o \
-  o/$AMD64-optlinux/ape/ape-copy-self.o \
-  o/$AMD64-optlinux/ape/ape-no-modify-self.o \
-  o/$AMD64-optlinux/cosmopolitan.a \
-  o/$AMD64-optlinux/third_party/libcxx/libcxx.a \
-
-$MAKE -j$NPROC m=$ARM64 \
+make -j64 m=$ARM64 \
   o/$ARM64/ape/ape.elf \
   o/$ARM64/ape/aarch64.lds \
   o/$ARM64/libc/crt/crt.o \
+  o/$ARM64/ape/ape-copy-self.o \
+  o/$ARM64/ape/ape-no-modify-self.o \
   o/$ARM64/cosmopolitan.a \
   o/$ARM64/third_party/libcxx/libcxx.a \
   o/$ARM64/tool/build/assimilate.dbg \
@@ -162,27 +95,6 @@ $MAKE -j$NPROC m=$ARM64 \
   o/$ARM64/third_party/make/make.dbg \
   o/$ARM64/third_party/ctags/ctags.dbg
 
-$MAKE -j$NPROC m=$ARM64-tiny \
-  o/$ARM64-tiny/ape/ape.elf \
-  o/$ARM64-tiny/ape/aarch64.lds \
-  o/$ARM64-tiny/libc/crt/crt.o \
-  o/$ARM64-tiny/cosmopolitan.a \
-  o/$ARM64-tiny/third_party/libcxx/libcxx.a \
-
-$MAKE -j$NPROC m=$ARM64-dbg \
-  o/$ARM64-dbg/ape/ape.elf \
-  o/$ARM64-dbg/ape/aarch64.lds \
-  o/$ARM64-dbg/libc/crt/crt.o \
-  o/$ARM64-dbg/cosmopolitan.a \
-  o/$ARM64-dbg/third_party/libcxx/libcxx.a \
-
-$MAKE -j$NPROC m=$ARM64-optlinux \
-  o/$ARM64-optlinux/ape/ape.elf \
-  o/$ARM64-optlinux/ape/aarch64.lds \
-  o/$ARM64-optlinux/libc/crt/crt.o \
-  o/$ARM64-optlinux/cosmopolitan.a \
-  o/$ARM64-optlinux/third_party/libcxx/libcxx.a \
-
 mkdir -p "$OUTDIR/bin/"
 cp tool/cosmocc/README.md "$OUTDIR/"
 cp tool/cosmocc/LICENSE.* "$OUTDIR/"
@@ -201,45 +113,17 @@ fetch() {
   else
     curl -LO $1
   fi
-
-  if command -v sha256sum >/dev/null 2>&1; then
-    # can use system sha256sum
-    true
-  elif command -v shasum >/dev/null 2>&1; then
-    sha256sum() {
-      shasum -a 256 "$@"
-    }
-  elif command -v "$PWD/o/build/sha256sum" >/dev/null 2>&1; then
-    # should have been built by download-cosmocc.sh if a system
-    # sha256sum/shasum does not exist
-    sha256sum() {
-      "$PWD/o/build/sha256sum" "$@"
-    }
-  else
-    echo please install sha256sum >&2
-    exit 1
-  fi
-
-  filename=$(basename $1)
-  printf '%s\n' "$2 $filename" >$filename.sha256sum
-  sha256sum -c $filename.sha256sum || exit 1
 }
 
 OLD=$PWD
 cd "$OUTDIR/"
 if [ ! -x bin/x86_64-linux-cosmo-gcc ]; then
-  fetch https://github.com/ahgamut/superconfigure/releases/download/z0.0.60/aarch64-gcc.zip 6a07f915ec0296cd33b3142e75c00ed1a7072c75d92c82a0c0b5f5df2cff0dd2 &
-  fetch https://github.com/ahgamut/superconfigure/releases/download/z0.0.60/x86_64-gcc.zip cbb1659c56a0a4f95a71f59f94693515000d3dd53f79a597acacd53cbad2c7d8 &
-  fetch https://github.com/ahgamut/superconfigure/releases/download/z0.0.60/llvm.zip d42c2e46204d4332975d2d7464c5df63c898c34f8d9d2b83c168c14705ca8edd &
-  wait
-  unzip aarch64-gcc.zip &
-  unzip x86_64-gcc.zip &
-  unzip llvm.zip bin/clang-19 bin/clang-format &
-  wait
+  fetch https://github.com/ahgamut/superconfigure/releases/download/z0.0.47/aarch64-gcc.zip
+  unzip aarch64-gcc.zip
   rm -f aarch64-gcc.zip
+  fetch https://github.com/ahgamut/superconfigure/releases/download/z0.0.47/x86_64-gcc.zip
+  unzip x86_64-gcc.zip
   rm -f x86_64-gcc.zip
-  rm -f llvm.zip
-  mv bin/clang-19 libexec/clang  # use `cosmocc -mclang` instead
 fi
 rm -f bin/*-cpp
 rm -f bin/*-gcc-*
@@ -271,51 +155,19 @@ cd "$OLD"
 
 for arch in $AMD64 $ARM64; do
   mkdir -p "$OUTDIR/$arch-linux-cosmo/lib/"
-  mkdir -p "$OUTDIR/$arch-linux-cosmo/lib/dbg"
-  mkdir -p "$OUTDIR/$arch-linux-cosmo/lib/tiny"
-  mkdir -p "$OUTDIR/$arch-linux-cosmo/lib/optlinux"
-
   cp -f o/$arch/libc/crt/crt.o "$OUTDIR/$arch-linux-cosmo/lib/"
-  cp -f o/$arch-dbg/libc/crt/crt.o "$OUTDIR/$arch-linux-cosmo/lib/dbg/"
-  cp -f o/$arch-tiny/libc/crt/crt.o "$OUTDIR/$arch-linux-cosmo/lib/tiny/"
-  cp -f o/$arch-optlinux/libc/crt/crt.o "$OUTDIR/$arch-linux-cosmo/lib/optlinux/"
-
   cp -f o/$arch/cosmopolitan.a "$OUTDIR/$arch-linux-cosmo/lib/libcosmo.a"
-  cp -f o/$arch-dbg/cosmopolitan.a "$OUTDIR/$arch-linux-cosmo/lib/dbg/libcosmo.a"
-  cp -f o/$arch-tiny/cosmopolitan.a "$OUTDIR/$arch-linux-cosmo/lib/tiny/libcosmo.a"
-  cp -f o/$arch-optlinux/cosmopolitan.a "$OUTDIR/$arch-linux-cosmo/lib/optlinux/libcosmo.a"
-
   cp -f o/$arch/third_party/libcxx/libcxx.a "$OUTDIR/$arch-linux-cosmo/lib/"
-  cp -f o/$arch-dbg/third_party/libcxx/libcxx.a "$OUTDIR/$arch-linux-cosmo/lib/dbg/"
-  cp -f o/$arch-tiny/third_party/libcxx/libcxx.a "$OUTDIR/$arch-linux-cosmo/lib/tiny/"
-  cp -f o/$arch-optlinux/third_party/libcxx/libcxx.a "$OUTDIR/$arch-linux-cosmo/lib/optlinux/"
-
   for lib in c dl gcc_s m crypt pthread resolv rt dl unwind gomp stdc++; do
     printf '\041\074\141\162\143\150\076\012' >"$OUTDIR/$arch-linux-cosmo/lib/lib$lib.a"
   done
   mkdir -p "$OUTDIR/lib/gcc/"
   touch "$OUTDIR/lib/gcc/libgomp.spec"  # needed if user passes -fopenmp but not -lgomp
 done
-
 cp -f o/$AMD64/ape/ape.o "$OUTDIR/x86_64-linux-cosmo/lib/"
-cp -f o/$AMD64-dbg/ape/ape.o "$OUTDIR/x86_64-linux-cosmo/lib/dbg/"
-cp -f o/$AMD64-tiny/ape/ape.o "$OUTDIR/x86_64-linux-cosmo/lib/tiny/"
-cp -f o/$AMD64-optlinux/ape/ape.o "$OUTDIR/x86_64-linux-cosmo/lib/optlinux/"
-
 cp -f o/$AMD64/ape/ape.lds "$OUTDIR/x86_64-linux-cosmo/lib/"
-cp -f o/$AMD64-dbg/ape/ape.lds "$OUTDIR/x86_64-linux-cosmo/lib/dbg/"
-cp -f o/$AMD64-tiny/ape/ape.lds "$OUTDIR/x86_64-linux-cosmo/lib/tiny/"
-cp -f o/$AMD64-optlinux/ape/ape.lds "$OUTDIR/x86_64-linux-cosmo/lib/optlinux/"
-
 cp -f o/$ARM64/ape/aarch64.lds "$OUTDIR/aarch64-linux-cosmo/lib/"
-cp -f o/$ARM64-dbg/ape/aarch64.lds "$OUTDIR/aarch64-linux-cosmo/lib/dbg/"
-cp -f o/$ARM64-tiny/ape/aarch64.lds "$OUTDIR/aarch64-linux-cosmo/lib/tiny/"
-cp -f o/$ARM64-optlinux/ape/aarch64.lds "$OUTDIR/aarch64-linux-cosmo/lib/optlinux/"
-
 cp -f o/$AMD64/ape/ape-no-modify-self.o "$OUTDIR/x86_64-linux-cosmo/lib/"
-cp -f o/$AMD64-dbg/ape/ape-no-modify-self.o "$OUTDIR/x86_64-linux-cosmo/lib/dbg/"
-cp -f o/$AMD64-tiny/ape/ape-no-modify-self.o "$OUTDIR/x86_64-linux-cosmo/lib/tiny/"
-cp -f o/$AMD64-optlinux/ape/ape-no-modify-self.o "$OUTDIR/x86_64-linux-cosmo/lib/optlinux/"
 
 cp -f ape/ape-m1.c "$OUTDIR/bin/"
 cp -af tool/cosmocc/bin/* "$OUTDIR/bin/"
@@ -326,7 +178,7 @@ cp -f o/$ARM64/ape/ape.elf "$OUTDIR/bin/ape-aarch64.elf"
 for x in assimilate march-native mktemper fixupobj zipcopy apelink pecheck mkdeps zipobj \
          ar chmod cocmd cp echo gzip objbincopy package rm touch mkdir compile sha256sum \
          resymbol; do
-  $APE $APELINK \
+  ape $APELINK \
     -l o/$AMD64/ape/ape.elf \
     -l o/$ARM64/ape/ape.elf \
     -M ape/ape-m1.c \
@@ -340,7 +192,7 @@ for x in ar chmod cp echo gzip package rm touch mkdir compile sha256sum; do
 done
 
 for x in make ctags; do
-  $APE $APELINK \
+  ape $APELINK \
     -l o/$AMD64/ape/ape.elf \
     -l o/$ARM64/ape/ape.elf \
     -M ape/ape-m1.c \
diff --git a/tool/curl/curl.c b/tool/curl/curl.c
index c108f0928..c35812955 100644
--- a/tool/curl/curl.c
+++ b/tool/curl/curl.c
@@ -14,7 +14,7 @@
 #include "libc/errno.h"
 #include "libc/fmt/itoa.h"
 #include "libc/fmt/magnumstrs.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/tool/decode/BUILD.mk b/tool/decode/BUILD.mk
index d435d2e86..00318107c 100644
--- a/tool/decode/BUILD.mk
+++ b/tool/decode/BUILD.mk
@@ -37,10 +37,8 @@ TOOL_DECODE_DIRECTDEPS =				\
 	LIBC_X						\
 	THIRD_PARTY_GDTOA				\
 	THIRD_PARTY_GETOPT				\
-	THIRD_PARTY_MUSL				\
 	THIRD_PARTY_TZ					\
 	THIRD_PARTY_XED					\
-	TOOL_BUILD_LIB					\
 	TOOL_DECODE_LIB
 
 TOOL_DECODE_DEPS :=					\
diff --git a/tool/decode/ar.c b/tool/decode/ar.c
index a4e7ee86e..b1bd1892c 100644
--- a/tool/decode/ar.c
+++ b/tool/decode/ar.c
@@ -110,8 +110,9 @@ static void Print(void) {
   printf("\n");
   printf("\t.long\t%-*.u# %s\n", 35, entries, "symbol table entries");
   table = 8 + 60 + 4;
-  for (i = 0; i < entries; ++i)
+  for (i = 0; i < entries; ++i) {
     printf("\t.long\t%#-*.x# %u\n", 35, READ32BE(data + table + i * 4), i);
+  }
   symbols = table + entries * 4;
   symbolslen = arsize - (entries + 1) * 4;
   for (i = o = 0; o < symbolslen; ++i, o += n + 1) {
diff --git a/tool/decode/ar2.c b/tool/decode/ar2.c
deleted file mode 100644
index 66e1fb74f..000000000
--- a/tool/decode/ar2.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/elf/elf.h"
-#include "libc/stdio/stdio.h"
-#include "tool/build/lib/ar.h"
-
-void ProcessFile(const char *path) {
-  struct Ar ar;
-  struct ArFile arf;
-  openar(&ar, path);
-  while (readar(&ar, &arf)) {
-    printf("%s: %s", path, arf.name);
-    if (IsElf64Binary(arf.data, arf.size))
-      printf(" is elf");
-    else
-      printf(" is not elf!!");
-    printf("\n");
-  }
-  closear(&ar);
-}
-
-int main(int argc, char *argv[]) {
-  for (int i = 1; i < argc; ++i)
-    ProcessFile(argv[i]);
-}
diff --git a/tool/decode/elf.c b/tool/decode/elf.c
index 2d62bab2e..4db9e45ab 100644
--- a/tool/decode/elf.c
+++ b/tool/decode/elf.c
@@ -29,7 +29,7 @@
 #include "libc/intrin/safemacros.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/tool/decode/lib/bitabuilder.c b/tool/decode/lib/bitabuilder.c
index f57a2d7cd..9fa598a81 100644
--- a/tool/decode/lib/bitabuilder.c
+++ b/tool/decode/lib/bitabuilder.c
@@ -20,7 +20,7 @@
 #include "libc/assert.h"
 #include "libc/limits.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
diff --git a/tool/decode/lib/disassemblehex.c b/tool/decode/lib/disassemblehex.c
index d9859d890..afa901259 100644
--- a/tool/decode/lib/disassemblehex.c
+++ b/tool/decode/lib/disassemblehex.c
@@ -19,7 +19,7 @@
 #include "tool/decode/lib/disassemblehex.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 static size_t countzeroes(const uint8_t *data, size_t size) {
   size_t i;
diff --git a/tool/decode/lib/machoidnames.c b/tool/decode/lib/machoidnames.c
index 0755f31e3..b2b1c5ecc 100644
--- a/tool/decode/lib/machoidnames.c
+++ b/tool/decode/lib/machoidnames.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "tool/decode/lib/machoidnames.h"
-#include "libc/macho.h"
+#include "libc/macho.internal.h"
 
 const struct IdName kMachoArchitectures[] = {
     {MAC_CPU_MC680x0, "MAC_CPU_MC680x0"},              //
diff --git a/tool/decode/lib/zipnames.c b/tool/decode/lib/zipnames.c
index ae5ec5bcc..a5d834ff4 100644
--- a/tool/decode/lib/zipnames.c
+++ b/tool/decode/lib/zipnames.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "tool/decode/lib/zipnames.h"
 #include "libc/nt/enum/fileflagandattributes.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 
 const struct IdName kZipCompressionNames[] = {
     {kZipCompressionNone, "kZipCompressionNone"},
diff --git a/tool/decode/macho.c b/tool/decode/macho.c
index b177d13aa..9f358aaef 100644
--- a/tool/decode/macho.c
+++ b/tool/decode/macho.c
@@ -22,13 +22,13 @@
 #include "libc/fmt/conv.h"
 #include "libc/fmt/libgen.h"
 #include "libc/intrin/safemacros.h"
-#include "libc/macho.h"
+#include "libc/macho.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/prot.h"
@@ -48,7 +48,7 @@ static size_t machosize;
 static void startfile(void) {
   showtitle("αcτµαlly pδrταblε εxεcµταblε", "tool/decode/macho", NULL, NULL,
             &kModelineAsm);
-  printf("#include \"libc/macho.h\"\n\n", path);
+  printf("#include \"libc/macho.internal.h\"\n\n", path);
 }
 
 static void showmachoheader(void) {
diff --git a/tool/decode/unhex.c b/tool/decode/unhex.c
index 9440d5627..c29e094aa 100644
--- a/tool/decode/unhex.c
+++ b/tool/decode/unhex.c
@@ -8,7 +8,7 @@
 #include "libc/calls/calls.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * @fileoverview Hex to binary converter program.
diff --git a/tool/decode/x86opinfo.c b/tool/decode/x86opinfo.c
index ee2d0bb11..6b741c304 100644
--- a/tool/decode/x86opinfo.c
+++ b/tool/decode/x86opinfo.c
@@ -18,11 +18,11 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/ctype.h"
 #include "libc/errno.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/ex.h"
 #include "libc/sysv/consts/exit.h"
 #include "third_party/getopt/getopt.internal.h"
diff --git a/tool/decode/zip.c b/tool/decode/zip.c
index 1d5d30ede..6b4cf88b2 100644
--- a/tool/decode/zip.c
+++ b/tool/decode/zip.c
@@ -39,7 +39,7 @@
 #include "libc/sysv/consts/prot.h"
 #include "libc/time.h"
 #include "libc/x/xasprintf.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "tool/decode/lib/asmcodegen.h"
 #include "tool/decode/lib/disassemblehex.h"
 #include "tool/decode/lib/flagger.h"
@@ -78,7 +78,7 @@ char *xiso8601(struct timespec ts) {
   ptr += snprintf(ptr, end - ptr, "%09ld", ts.tv_nsec);
   ptr += strftime(ptr, end - ptr, "%z", &tm);
   unassert(ptr + 1 <= end);
-  unassert(realloc_in_place(res, ptr + 1 - res) == res);
+  unassert(realloc_in_place(res, ptr - end) == res);
   return res;
 }
 
diff --git a/tool/emacs/cosmo-asm-mode.el b/tool/emacs/cosmo-asm-mode.el
index 98699ddf0..b948e12af 100644
--- a/tool/emacs/cosmo-asm-mode.el
+++ b/tool/emacs/cosmo-asm-mode.el
@@ -336,10 +336,9 @@
   (set (make-local-variable 'indent-tabs-mode) t)
   (set (make-local-variable 'tab-width) 8))
 
-(eval-after-load 'asm-mode
-  '(progn
-     (add-hook 'asm-mode-hook 'cosmo-asm-supplemental-hook)
-     (setq asm-font-lock-keywords cosmo-asm-font-lock-keywords)))
+(progn
+  (add-hook 'asm-mode-hook 'cosmo-asm-supplemental-hook)
+  (setq asm-font-lock-keywords cosmo-asm-font-lock-keywords))
 
 ;; Make -*-unix-assembly-*- mode line work correctly like GitHub.
 (define-derived-mode unix-assembly-mode asm-mode "UNIX Assembly")
diff --git a/tool/emacs/cosmo-c-keywords.el b/tool/emacs/cosmo-c-keywords.el
index e9a29cec5..ff4acc144 100644
--- a/tool/emacs/cosmo-c-keywords.el
+++ b/tool/emacs/cosmo-c-keywords.el
@@ -218,20 +218,7 @@
            "__sysv_abi__"
            "__mode__"
            "__seg_fs"
-           "__seg_gs"
-           "__access__"
-           "__read_only__"
-           "__write_only__"
-           "__read_write__"
-           "__read_only"
-           "__write_only"
-           "__read_write"
-           "__fd_arg__"
-           "__fd_arg"
-           "__copy__"
-           "__retain__"
-           "__tainted_args__"
-           "__zero_call_used_regs__"))
+           "__seg_gs"))
 
         (clang
          '("__optnone__"
diff --git a/tool/emacs/cosmo-cpp-constants.el b/tool/emacs/cosmo-cpp-constants.el
index 80636c337..5c244b7c0 100644
--- a/tool/emacs/cosmo-cpp-constants.el
+++ b/tool/emacs/cosmo-cpp-constants.el
@@ -17,7 +17,6 @@
     "__GNUC__"
     "__APPLE__"
     "__linux__"
-    "__gnu_linux__"
     "__HAIKU__"
     "__CYGWIN__"
     "__EMSCRIPTEN__"
@@ -26,13 +25,10 @@
     "__NetBSD__"
     "__NetBSD_Version__"
     "__OpenBSD__"
-    "__Fuchsia__"
     "__COSMOPOLITAN__"
     "__COSMOCC__"
     "__FATCOSMOCC__"
     "__GLIBC__"
-    "__ELF__"
-    "__GNU__"
     "__linux"
     "__MACH__"
     "__GNUG__"
@@ -73,7 +69,6 @@
     "__BMI2__"
     "__FMA__"
     "__FAST_MATH__"
-    "__FINITE_MATH_ONLY__"
     "__ROUNDING_MATH__"
     "__NO_MATH_ERRNO__"
     "__FMA4__"
@@ -133,8 +128,6 @@
     "__ARM_FP16_IEEE"
     "__ARM_FP_FAST"
     "__powerpc__"
-    "__POWERPC__"
-    "__ppc__"
     "__powerpc64__"
     "__POWER9_VECTOR__"
     "__wasm_simd128__"
diff --git a/tool/emacs/cosmo-format.el b/tool/emacs/cosmo-format.el
index 140c9e2d2..107c201e9 100644
--- a/tool/emacs/cosmo-format.el
+++ b/tool/emacs/cosmo-format.el
@@ -104,8 +104,7 @@
                           cosmo-format-blacklist))
              (not (save-excursion
                     (beginning-of-buffer)
-                    (or (looking-at "/\\* clang-format off \\*/")
-                        (looking-at "// clang-format off")))))
+                    (looking-at "/\\* clang-format off \\*/"))))
     (let* ((bin (cosmo--find-clang-format-bin))
            (this (buffer-file-name))
            (root (locate-dominating-file this ".clang-format")))
diff --git a/tool/emacs/cosmo-platform-constants.el b/tool/emacs/cosmo-platform-constants.el
index 48b408b29..caf232607 100644
--- a/tool/emacs/cosmo-platform-constants.el
+++ b/tool/emacs/cosmo-platform-constants.el
@@ -5,6 +5,7 @@
          '("__cplusplus"
            "__OBJC__"
            "__STRICT_ANSI__"
+           "__ELF__"
            "__VERSION__"
            "__OPTIMIZE__"
            "__OPTIMIZE_SIZE__"
@@ -28,7 +29,6 @@
            "__LP64__"
            "__SSP__"
            "__SSP_ALL__"
-           "__unix"
            "__unix__"
            "__vax__"
            "__ns16000__"
diff --git a/tool/emacs/cosmo-stuff.el b/tool/emacs/cosmo-stuff.el
index ab45aeeb1..f7c712b2d 100644
--- a/tool/emacs/cosmo-stuff.el
+++ b/tool/emacs/cosmo-stuff.el
@@ -11,19 +11,19 @@
 
 ;;; Code:
 
-;; (require 'asm-mode)
-;; (require 'cc-mode)
-;; (require 'fortran)
-;; (require 'cosmo-c-types)
-;; (require 'cosmo-c-keywords)
-;; (require 'cosmo-c-builtins)
-;; (require 'cosmo-c-constants)
-;; (require 'cosmo-cpp-constants)
-;; (require 'cosmo-platform-constants)
-;; (require 'dired)
-;; (require 'javadown)
-;; (require 'ld-script)
-;; (require 'make-mode)
+(require 'asm-mode)
+(require 'cc-mode)
+(require 'fortran)
+(require 'cosmo-c-types)
+(require 'cosmo-c-keywords)
+(require 'cosmo-c-builtins)
+(require 'cosmo-c-constants)
+(require 'cosmo-cpp-constants)
+(require 'cosmo-platform-constants)
+(require 'dired)
+(require 'javadown)
+(require 'ld-script)
+(require 'make-mode)
 
 (setq cosmo-arch
       (let ((arch (string-trim-right
@@ -149,12 +149,8 @@
        (format "%s/TAGS"
                (or (locate-dominating-file (buffer-name) "Makefile")
                    (file-name-directory (buffer-name))))))
-
-(eval-after-load 'cc-mode
-  '(progn
-     (add-hook 'c-mode-common-hook 'stop-asking-questions-etags)
-     (add-hook 'c++-mode-common-hook 'stop-asking-questions-etags)))
-
+(add-hook 'c-mode-common-hook 'stop-asking-questions-etags)
+(add-hook 'c++-mode-common-hook 'stop-asking-questions-etags)
 (setq tags-revert-without-query t)
 (setq kill-buffer-query-functions ;; disable kill buffer w/ process question
       (delq 'process-kill-buffer-query-function kill-buffer-query-functions))
@@ -303,30 +299,15 @@
   (local-set-key (kbd "C-c C-c") 'cosmo-compile))
 
 (progn
+  (add-hook 'makefile-mode-hook 'cosmo-compile-hook)
+  (add-hook 'asm-mode-hook 'cosmo-compile-hook)
+  (add-hook 'ld-script-mode-hook 'cosmo-compile-hook)
+  (add-hook 'dired-mode-hook 'cosmo-compile-hook)
+  (add-hook 'c-mode-common-hook 'cosmo-compile-hook)
+  (add-hook 'c++-mode-common-hook 'cosmo-compile-hook)
   (add-hook 'fortran-mode-hook 'cosmo-compile-hook)
   (add-hook 'protobuf-mode-hook 'cosmo-compile-hook))
 
-(eval-after-load 'make-mode
-  '(progn
-     (add-hook 'makefile-mode-hook 'cosmo-compile-hook)))
-
-(eval-after-load 'asm-mode
-  '(progn
-     (add-hook 'asm-mode-hook 'cosmo-compile-hook)))
-
-(eval-after-load 'dired
-  '(progn
-     (add-hook 'dired-mode-hook 'cosmo-compile-hook)))
-
-(eval-after-load 'ld-script
-  '(progn
-     (add-hook 'ld-script-mode-hook 'cosmo-compile-hook)))
-
-(eval-after-load 'cc-mode
-  '(progn
-     (add-hook 'c-mode-common-hook 'cosmo-compile-hook)
-     (add-hook 'c++-mode-common-hook 'cosmo-compile-hook)))
-
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Display assembly for C/C++ buffer
@@ -625,13 +606,10 @@
   (add-hook 'asm-mode-hook 'cosmo-assemble-hook)
   (add-hook 'ld-script-mode-hook 'cosmo-assemble-hook)
   (add-hook 'dired-mode-hook 'cosmo-assemble-hook)
+  (add-hook 'c-mode-common-hook 'cosmo-assemble-hook)
   (add-hook 'fortran-mode-hook 'cosmo-assemble-hook)
   (add-hook 'protobuf-mode-hook 'cosmo-assemble-hook))
 
-(eval-after-load 'cc-mode
-  '(progn
-     (add-hook 'c-mode-common-hook 'cosmo-assemble-hook)))
-
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Run buffer.
@@ -710,34 +688,18 @@
               ('t
                (error "cosmo-run: unknown major mode")))))))
 
-(eval-after-load 'cc-mode
-  '(progn
-     (define-key c-mode-base-map (kbd "C-c C-r") 'cosmo-run)
-     (define-key c-mode-map (kbd "C-c C-s") 'cosmo-run-test)
-     (define-key c++-mode-map (kbd "C-c C-s") 'cosmo-run-test)
-     (define-key c-mode-map (kbd "C-c C-_") 'cosmo-run-win7)
-     (define-key c-mode-map (kbd "C-c C-_") 'cosmo-run-win10)
-     (define-key c++-mode-map (kbd "C-c C-_") 'cosmo-run-win10)))
-
-(eval-after-load 'fortran-mode
-  '(progn
-     (define-key fortran-mode-map (kbd "C-c C-r") 'cosmo-run)))
-
-(eval-after-load 'asm-mode
-  '(progn
-     (define-key asm-mode-map (kbd "C-c C-r") 'cosmo-run)))
-
-(eval-after-load 'sh-script
-  '(progn
-     (define-key sh-mode-map (kbd "C-c C-r") 'cosmo-run)))
-
-(eval-after-load 'lua-mode
-  '(progn
-     (define-key lua-mode-map (kbd "C-c C-r") 'cosmo-run)))
-
-(eval-after-load 'python
-  '(progn
-     (define-key python-mode-map (kbd "C-c C-r") 'cosmo-run)))
+(progn
+  (define-key asm-mode-map (kbd "C-c C-r") 'cosmo-run)
+  (define-key c-mode-base-map (kbd "C-c C-r") 'cosmo-run)
+  (define-key fortran-mode-map (kbd "C-c C-r") 'cosmo-run)
+  (define-key sh-mode-map (kbd "C-c C-r") 'cosmo-run)
+  (define-key lua-mode-map (kbd "C-c C-r") 'cosmo-run)
+  (define-key python-mode-map (kbd "C-c C-r") 'cosmo-run)
+  (define-key c-mode-map (kbd "C-c C-s") 'cosmo-run-test)
+  (define-key c++-mode-map (kbd "C-c C-s") 'cosmo-run-test)
+  (define-key c-mode-map (kbd "C-c C-_") 'cosmo-run-win7)
+  (define-key c-mode-map (kbd "C-c C-_") 'cosmo-run-win10)
+  (define-key c++-mode-map (kbd "C-c C-_") 'cosmo-run-win10))
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -763,13 +725,9 @@
         (compile compile-command)
         (gdb (format "gdb -q -i=mi %s -ex run" exec))))))
 
-(eval-after-load 'cc-mode
-  '(progn
-     (define-key c-mode-base-map (kbd "C-c C-d") 'cosmo-debug)))
-
-(eval-after-load 'asm-mode
-  '(progn
-     (define-key asm-mode-map (kbd "C-c C-d") 'cosmo-debug)))
+(progn
+  (define-key asm-mode-map (kbd "C-c C-d") 'cosmo-debug)
+  (define-key c-mode-base-map (kbd "C-c C-d") 'cosmo-debug))
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -861,16 +819,10 @@
       (message header))))
 
 (progn
-  (define-key prog-mode-map (kbd "C-c C-h") 'cosmo-add-include))
-
-(eval-after-load 'cc-mode
-  '(progn
-     (define-key c-mode-base-map (kbd "C-c C-h") 'cosmo-add-include)
-     (define-key c++-mode-map (kbd "C-c C-h") 'cosmo-add-include)))
-
-(eval-after-load 'asm-mode
-  '(progn
-     (define-key asm-mode-map (kbd "C-c C-h") 'cosmo-add-include)))
+  (define-key prog-mode-map (kbd "C-c C-h") 'cosmo-add-include)
+  (define-key asm-mode-map (kbd "C-c C-h") 'cosmo-add-include)
+  (define-key c-mode-base-map (kbd "C-c C-h") 'cosmo-add-include)
+  (define-key c++-mode-map (kbd "C-c C-h") 'cosmo-add-include))
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -891,17 +843,11 @@
 
 (defun cosmo-lisp-is-the-best ()
   (define-key c-mode-base-map (kbd "C-c C-o") 'cosmo-show-optinfo))
-
-(eval-after-load 'cc-mode
-  '(progn
-     (add-hook 'c-mode-common-hook 'cosmo-lisp-is-the-best)))
+(add-hook 'c-mode-common-hook 'cosmo-lisp-is-the-best)
 
 (defun cosmo-lisp-is-the-best++ ()
   (define-key c++-mode-base-map (kbd "C-c C-o") 'cosmo-show-optinfo))
-
-(eval-after-load 'cc-mode
-  '(progn
-     (add-hook 'c++-mode-common-hook 'cosmo-lisp-is-the-best++)))
+(add-hook 'c++-mode-common-hook 'cosmo-lisp-is-the-best++)
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -921,8 +867,8 @@
    nil `((,cosmo-cpp-constants-regex . font-lock-constant-face)
          (,cosmo-platform-constants-regex . font-lock-constant-face))))
 
-;; (add-hook 'c-mode-common-hook 'cosmo-c-keywords-hook)
-;; (add-hook 'c++-mode-common-hook 'cosmo-c-keywords-hook)
+(add-hook 'c-mode-common-hook 'cosmo-c-keywords-hook)
+(add-hook 'c++-mode-common-hook 'cosmo-c-keywords-hook)
 (add-hook 'asm-mode-hook 'cosmo-asm-keywords-hook)
 
 
diff --git a/tool/emacs/cosmo.el b/tool/emacs/cosmo.el
index 704bd9b39..88fb36d7f 100644
--- a/tool/emacs/cosmo.el
+++ b/tool/emacs/cosmo.el
@@ -1,7 +1,7 @@
-;; (require 'ld-script)
-;; (require 'optinfo-mode)
-;; (require 'protobuf-mode)
+(require 'ld-script)
+(require 'optinfo-mode)
+(require 'protobuf-mode)
 (require 'cosmo-format)
-;; (require 'cosmo-asm-mode)
+(require 'cosmo-asm-mode)
 (require 'cosmo-stuff)
 (provide 'cosmo)
diff --git a/tool/hello/BUILD.mk b/tool/hello/BUILD.mk
index aff7a82c0..80648401c 100644
--- a/tool/hello/BUILD.mk
+++ b/tool/hello/BUILD.mk
@@ -79,7 +79,7 @@ o/$(MODE)/tool/hello/hello-pe.ape:			\
 # elf2pe can generate binaries that don't have dll imports
 o/$(MODE)/tool/hello/life-pe.dbg:			\
 		o/$(MODE)/tool/hello/life-pe.o
-	@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain -Ttext-segment=0x140000000
+	@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain
 o/$(MODE)/tool/hello/life-pe.ape:			\
 		o/$(MODE)/tool/hello/life-pe.dbg	\
 		o/$(MODE)/tool/build/elf2pe
@@ -94,6 +94,4 @@ o/$(MODE)/tool/hello/wait-pe.ape:			\
 		o/$(MODE)/tool/build/elf2pe
 	@$(COMPILE) -AELF2PE o/$(MODE)/tool/build/elf2pe -R 64kb -S 4kb -o $@ $<
 
-o/$(MODE)/tool/hello/life-pe.ape.zip.o: private ZIPOBJ_FLAGS += -B
-
 $(TOOL_HELLO_OBJS): tool/hello/BUILD.mk
diff --git a/tool/hello/hello-pe.c b/tool/hello/hello-pe.c
index 47198ee76..c54ce5e08 100644
--- a/tool/hello/hello-pe.c
+++ b/tool/hello/hello-pe.c
@@ -7,7 +7,6 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include "libc/nt/thunk/msabi.h"
 #include "tool/build/elf2pe.h"
 
 #define STD_OUTPUT_HANDLE -11u
@@ -16,7 +15,7 @@ __dll_import("kernel32.dll", long, GetStdHandle, (unsigned));
 __dll_import("kernel32.dll", int, WriteFile,
              (long, const void *, unsigned, unsigned *, void *));
 
-__msabi long WinMain(void) {
+__attribute__((__ms_abi__)) long WinMain(void) {
   WriteFile(GetStdHandle(STD_OUTPUT_HANDLE), "hello world\n", 12, 0, 0);
   return 0;
 }
diff --git a/tool/hello/life-pe.c b/tool/hello/life-pe.c
index 5786749af..6f6098d1d 100644
--- a/tool/hello/life-pe.c
+++ b/tool/hello/life-pe.c
@@ -7,8 +7,7 @@
 │   • http://creativecommons.org/publicdomain/zero/1.0/            │
 ╚─────────────────────────────────────────────────────────────────*/
 #endif
-#include "libc/nt/thunk/msabi.h"
 
-__msabi long WinMain(void) {
+__attribute__((__ms_abi__)) long WinMain(void) {
   return 42 << 8;
 }
diff --git a/tool/lambda/BUILD.mk b/tool/lambda/BUILD.mk
index aca58f545..a22b63aea 100644
--- a/tool/lambda/BUILD.mk
+++ b/tool/lambda/BUILD.mk
@@ -28,7 +28,6 @@ TOOL_LAMBDA_DIRECTDEPS =					\
 	LIBC_SYSV						\
 	LIBC_X							\
 	THIRD_PARTY_GETOPT					\
-	THIRD_PARTY_MUSL					\
 	TOOL_LAMBDA_LIB
 
 TOOL_LAMBDA_DEPS :=						\
diff --git a/tool/net/BUILD.mk b/tool/net/BUILD.mk
index 06a80d5f8..52d55f7ec 100644
--- a/tool/net/BUILD.mk
+++ b/tool/net/BUILD.mk
@@ -117,8 +117,7 @@ o/$(MODE)/tool/net/redbean.dbg:						\
 
 o/$(MODE)/tool/net/lsqlite3.o: private					\
 		CFLAGS +=						\
-			-DSQLITE_ENABLE_SESSION				\
-			-DSQLITE_ENABLE_DESERIALIZE
+			-DSQLITE_ENABLE_SESSION
 
 # REDBEAN-DEMO
 #
diff --git a/tool/net/definitions.lua b/tool/net/definitions.lua
index 3732416b0..2424f9241 100644
--- a/tool/net/definitions.lua
+++ b/tool/net/definitions.lua
@@ -4129,22 +4129,38 @@ unix = {
     CLK_TCK = nil,
 
     --- @type integer
-    CLOCK_REALTIME = nil,
+    CLOCK_BOOTTIME = nil,
+    --- @type integer
+    CLOCK_BOOTTIME_ALARM = nil,
     --- @type integer
     CLOCK_MONOTONIC = nil,
     --- @type integer
-    CLOCK_BOOTTIME = nil,
+    CLOCK_MONOTONIC_COARSE = nil,
+    --- @type integer
+    CLOCK_MONOTONIC_PRECISE = nil,
+    --- @type integer
+    CLOCK_MONOTONIC_FAST = nil,
     --- @type integer
     CLOCK_MONOTONIC_RAW = nil,
     --- @type integer
+    CLOCK_PROCESS_CPUTIME_ID = nil,
+    --- @type integer
+    CLOCK_PROF = nil,
+    --- @type integer
+    CLOCK_REALTIME = nil,
+    --- @type integer
+    CLOCK_REALTIME_PRECISE = nil,
+    --- @type integer
+    CLOCK_REALTIME_ALARM = nil,
+    --- @type integer
     CLOCK_REALTIME_COARSE = nil,
     --- @type integer
-    CLOCK_MONOTONIC_COARSE = nil,
+    CLOCK_REALTIME_FAST = nil,
+    --- @type integer
+    CLOCK_TAI = nil,
     ---@type integer
     CLOCK_THREAD_CPUTIME_ID = nil,
     --- @type integer
-    CLOCK_PROCESS_CPUTIME_ID = nil,
-    --- @type integer
     DT_BLK = nil,
     --- @type integer
     DT_CHR = nil,
@@ -5188,14 +5204,11 @@ function unix.fork() end
 ---     unix.execve(prog, {prog, '-hal', '.'}, {'PATH=/bin'})
 ---     unix.exit(127)
 ---
---- If `prog` is an absolute path, then it's returned as-is. If `prog`
---- contains slashes then it's not path searched either and will be
---- returned if it exists. On Windows, it's recommended that you install
---- programs from cosmos to c:/bin/ without any .exe or .com suffix, so
---- they can be discovered like they would on UNIX. If you want to find
---- a program like notepad on the $PATH using this function, then you
---- need to specify "notepad.exe" so it includes the extension.
----
+--- We automatically suffix `.com` and `.exe` for all platforms when
+--- path searching. By default, the current directory is not on the
+--- path. If `prog` is an absolute path, then it's returned as-is. If
+--- `prog` contains slashes then it's not path searched either and will
+--- be returned if it exists.
 ---@param prog string
 ---@return string path
 ---@overload fun(prog: string): nil, error: unix.Errno
@@ -6084,73 +6097,23 @@ function unix.syslog(priority, msg) end
 ---
 --- `clock` can be any one of of:
 ---
---- - `CLOCK_REALTIME` returns a wall clock timestamp represented in
----   nanoseconds since the UNIX epoch (~1970). It'll count time in the
----   suspend state. This clock is subject to being smeared by various
----   adjustments made by NTP. These timestamps can have unpredictable
----   discontinuous jumps when clock_settime() is used. Therefore this
----   clock is the default clock for everything, even pthread condition
----   variables. Cosmopoiltan guarantees this clock will never raise
----   `EINVAL` and also guarantees `CLOCK_REALTIME == 0` will always be
----   the case. On Windows this maps to GetSystemTimePreciseAsFileTime().
----   On platforms with vDSOs like Linux, Windows, and MacOS ARM64 this
----   should take about 20 nanoseconds.
----
---- - `CLOCK_MONOTONIC` returns a timestamp with an unspecified epoch,
----   that should be when the system was powered on. These timestamps
----   shouldn't go backwards. Timestamps shouldn't count time spent in
----   the sleep, suspend, and hibernation states. These timestamps won't
----   be impacted by clock_settime(). These timestamps may be impacted by
----   frequency adjustments made by NTP. Cosmopoiltan guarantees this
----   clock will never raise `EINVAL`. MacOS and BSDs use the word
----   "uptime" to describe this clock. On Windows this maps to
----   QueryUnbiasedInterruptTimePrecise().
----
---- - `CLOCK_BOOTTIME` is a monotonic clock returning a timestamp with an
----   unspecified epoch, that should be relative to when the host system
----   was powered on. These timestamps shouldn't go backwards. Timestamps
----   should also include time spent in a sleep, suspend, or hibernation
----   state. These timestamps aren't impacted by clock_settime(), but
----   they may be impacted by frequency adjustments made by NTP. This
----   clock will raise an `EINVAL` error on extremely old Linux distros
----   like RHEL5. MacOS and BSDs use the word "monotonic" to describe
----   this clock. On Windows this maps to QueryInterruptTimePrecise().
----
---- - `CLOCK_MONOTONIC_RAW` returns a timestamp from an unspecified
----   epoch. These timestamps don't count time spent in the sleep,
----   suspend, and hibernation states. Unlike `CLOCK_MONOTONIC` this
----   clock is guaranteed to not be impacted by frequency adjustments or
----   discontinuous jumps caused by clock_settime(). Providing this level
----   of assurances may make this clock slower than the normal monotonic
----   clock. Furthermore this clock may cause `EINVAL` to be raised if
----   running on a host system that doesn't provide those guarantees,
----   e.g. OpenBSD and MacOS on AMD64.
----
---- - `CLOCK_REALTIME_COARSE` is the same as `CLOCK_REALTIME` except
----   it'll go faster if the host OS provides a cheaper way to read the
----   wall time. Please be warned that coarse can be really coarse.
----   Rather than nano precision, you're looking at `CLK_TCK` precision,
----   which can lag as far as 30 milliseconds behind or possibly more.
----   Cosmopolitan may fallback to `CLOCK_REALTIME` if a faster less
----   accurate clock isn't provided by the system. This clock will raise
----   an `EINVAL` error on extremely old Linux distros like RHEL5.
----
---- - `CLOCK_MONOTONIC_COARSE` is the same as `CLOCK_MONOTONIC` except
----   it'll go faster if the host OS provides a cheaper way to read the
----   unbiased time. Please be warned that coarse can be really coarse.
----   Rather than nano precision, you're looking at `CLK_TCK` precision,
----   which can lag as far as 30 milliseconds behind or possibly more.
----   Cosmopolitan may fallback to `CLOCK_REALTIME` if a faster less
----   accurate clock isn't provided by the system. This clock will raise
----   an `EINVAL` error on extremely old Linux distros like RHEL5.
----
---- - `CLOCK_PROCESS_CPUTIME_ID` returns the amount of time this process
----   was actively scheduled. This is similar to getrusage() and clock().
----   Cosmopoiltan guarantees this clock will never raise `EINVAL`.
----
---- - `CLOCK_THREAD_CPUTIME_ID` returns the amount of time this thread
----   was actively scheduled. This is similar to getrusage() and clock().
----   Cosmopoiltan guarantees this clock will never raise `EINVAL`.
+--- - `CLOCK_REALTIME`: universally supported
+--- - `CLOCK_REALTIME_FAST`: ditto but faster on freebsd
+--- - `CLOCK_REALTIME_PRECISE`: ditto but better on freebsd
+--- - `CLOCK_REALTIME_COARSE`: : like `CLOCK_REALTIME_FAST` but needs Linux 2.6.32+
+--- - `CLOCK_MONOTONIC`: universally supported
+--- - `CLOCK_MONOTONIC_FAST`: ditto but faster on freebsd
+--- - `CLOCK_MONOTONIC_PRECISE`: ditto but better on freebsd
+--- - `CLOCK_MONOTONIC_COARSE`: : like `CLOCK_MONOTONIC_FAST` but needs Linux 2.6.32+
+--- - `CLOCK_MONOTONIC_RAW`: is actually monotonic but needs Linux 2.6.28+
+--- - `CLOCK_PROCESS_CPUTIME_ID`: linux and bsd
+--- - `CLOCK_THREAD_CPUTIME_ID`: linux and bsd
+--- - `CLOCK_MONOTONIC_COARSE`: linux, freebsd
+--- - `CLOCK_PROF`: linux and netbsd
+--- - `CLOCK_BOOTTIME`: linux and openbsd
+--- - `CLOCK_REALTIME_ALARM`: linux-only
+--- - `CLOCK_BOOTTIME_ALARM`: linux-only
+--- - `CLOCK_TAI`: linux-only
 ---
 --- Returns `EINVAL` if clock isn't supported on platform.
 ---
diff --git a/tool/net/fetch.inc b/tool/net/fetch.inc
index 8be5775b0..16fa05a56 100644
--- a/tool/net/fetch.inc
+++ b/tool/net/fetch.inc
@@ -378,7 +378,7 @@ static int LuaFetch(lua_State *L) {
           WARNF("(ftch) HTTP client %s error", "EOF headers");
           goto TransportError;
         }
-        rc = ParseHttpMessage(&msg, inbuf.p, inbuf.n, SHRT_MAX);
+        rc = ParseHttpMessage(&msg, inbuf.p, inbuf.n, inbuf.c);
         if (rc == -1) {
           WARNF("(ftch) HTTP client %s error", "ParseHttpMessage");
           goto TransportError;
diff --git a/tool/net/getadaptersaddresses.c b/tool/net/getadaptersaddresses.c
index 8dea1ab97..91ddbfa94 100644
--- a/tool/net/getadaptersaddresses.c
+++ b/tool/net/getadaptersaddresses.c
@@ -35,7 +35,7 @@
 #include "libc/serialize.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/sockaddr6.h"
-#include "libc/stdalign.h"
+#include "libc/stdalign.internal.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/af.h"
diff --git a/tool/net/help.txt b/tool/net/help.txt
index ab017578b..f031a83ae 100644
--- a/tool/net/help.txt
+++ b/tool/net/help.txt
@@ -1,6 +1,6 @@
 SYNOPSIS
 
-  redbean.com [-?BVabdfghjkmsuvz] [-p PORT] [-D DIR] [-- SCRIPTARGS...]
+  redbean [-?BVabdfghjkmsuvz] [-p PORT] [-D DIR] [-- SCRIPTARGS...]
 
 DESCRIPTION
 
@@ -137,15 +137,15 @@ USAGE
   This executable is also a ZIP file that contains static assets.
   You can run redbean interactively in your terminal as follows:
 
-    ./redbean.com -vvvmbag        # starts server verbosely
+    ./redbean -vvvmbag        # starts server verbosely
     open http://127.0.0.1:8080/   # shows zip listing page
     CTRL-C                        # 1x: graceful shutdown
     CTRL-C                        # 2x: forceful shutdown
 
   You can override the default listing page by adding:
 
-    zip redbean.com index.lua     # lua server pages take priority
-    zip redbean.com index.html    # default page for directory
+    zip redbean index.lua     # lua server pages take priority
+    zip redbean index.html    # default page for directory
 
   The listing page only applies to the root directory. However the
   default index page applies to subdirectories too. In order for it
@@ -160,7 +160,7 @@ USAGE
       --no-parent            \
       --no-if-modified-since \
       http://a.example/index.html
-    zip -r redbean.com a.example/  # default page for directory
+    zip -r redbean a.example/  # default page for directory
 
   redbean normalizes the trailing slash for you automatically:
 
@@ -198,18 +198,18 @@ USAGE
   by default, embedded as a bas64 data uri. You can override the
   custom page for various errors by adding files to the zip root.
 
-    zip redbean.com 404.html      # custom not found page
+    zip redbean 404.html      # custom not found page
 
   Audio video content should not be compressed in your ZIP files.
   Uncompressed assets enable browsers to send Range HTTP request.
   On the other hand compressed assets are best for gzip encoding.
 
-    zip redbean.com index.html    # adds file
-    zip -0 redbean.com video.mp4  # adds without compression
+    zip redbean index.html    # adds file
+    zip -0 redbean video.mp4  # adds without compression
 
   You can have redbean run as a daemon by doing the following:
 
-    sudo ./redbean.com -vvdp80 -p443 -L redbean.log -P redbean.pid
+    sudo ./redbean -vvdp80 -p443 -L redbean.log -P redbean.pid
     kill -TERM $(cat redbean.pid) # 1x: graceful shutdown
     kill -TERM $(cat redbean.pid) # 2x: forceful shutdown
 
@@ -230,7 +230,14 @@ USAGE
   run on six different operating systems. To do that, it needs to
   extract a 4kb loader program to ${TMPDIR:-${HOME:-.}}/.ape that'll
   map your redbean into memory. It does however check to see if `ape`
-  is on the system path beforehand.
+  is on the system path beforehand. You can also "assimilate" any
+  redbean into the platform-local executable format by running:
+
+      $ file redbean
+      redbean: DOS/MBR boot sector
+      $ ./redbean --assimilate
+      $ file redbean
+      redbean: ELF 64-bit LSB executable
 
 ────────────────────────────────────────────────────────────────────────────────
 SECURITY
@@ -399,14 +406,12 @@ REPL
   encoded in its preferred executable format. You can assimilate your
   redbean into the local format using the following commands:
 
-      $ file redbean.com
-      redbean.com: DOS/MBR boot sector
-      $ curl -o assimilate https://cosmo.zip/pub/cosmos/bin/assimilate
-      $ chmod +x assimilate
-      $ ./assimilate ./redbean.com
-      $ file redbean.com
-      redbean.com: ELF 64-bit LSB executable
-      $ sudo cp redbean.com /usr/bin/redbean
+      $ file redbean
+      redbean: DOS/MBR boot sector
+      $ ./redbean --assimilate
+      $ file redbean
+      redbean: ELF 64-bit LSB executable
+      $ sudo cp redbean /usr/bin/redbean
 
   By following the above steps, redbean can be installed systemwide for
   multiple user accounts. It's also possible to chmod the binary to have
@@ -456,7 +461,7 @@ GLOBALS
 
           Then your `/.init.lua` file will have the `arg` array like:
 
-              arg[-1] = '/usr/bin/redbean
+              arg[-1] = '/usr/bin/redbean'
               arg[ 0] = '/zip/.init.lua'
               arg[ 1] = 'arg1'
               arg[ 2] = 'arg2'
@@ -464,11 +469,11 @@ GLOBALS
           If you launch redbean in interpreter mode (rather than web
           server) mode, then an invocation like this:
 
-              ./redbean.com -i script.lua arg1 arg2
+              ./redbean -i script.lua arg1 arg2
 
           Would have an `arg` array like this:
 
-              arg[-1] = './redbean.com'
+              arg[-1] = './redbean'
               arg[ 0] = 'script.lua'
               arg[ 1] = 'arg1'
               arg[ 2] = 'arg2'
@@ -569,7 +574,7 @@ HOOKS
           *). See functions like Route which asks redbean to do its default
           thing from the handler.
 
-  OnError(status:int, message:string, details:string)
+  OnError(status:int, message:string)
           If this function is defined and if any errors occurs in
           OnHttpRequest() then this method will be called instead of displaying
           the default error page. Useful if you need to display the error page
@@ -581,10 +586,10 @@ HOOKS
           `true`, redbean will close the connection without calling fork.
 
   OnLogLatency(reqtimeus:int, contimeus:int)
-          If this function is defined it'll be called from the child worker
-          process each time redbean completes the handling of a request, but
-          before the response is sent. The handler receives the time (in µs)
-          since the request handling and connection handling started.
+          If this function is defined it'll be called from the main process
+          each time redbean completes handling of a request, but before the
+          response is sent. The handler received the time (in µs) since the
+          request handling and connection handling started.
 
   OnProcessCreate(pid:int, ip:int, port:int, serverip:int, serverport:int)
           If this function is defined it'll be called from the main process
@@ -2877,13 +2882,11 @@ UNIX MODULE
         unix.execve(prog, {prog, '-hal', '.'}, {'PATH=/bin'})
         unix.exit(127)
 
-    If `prog` is an absolute path, then it's returned as-is. If `prog`
-    contains slashes then it's not path searched either and will be
-    returned if it exists. On Windows, it's recommended that you install
-    programs from cosmos to c:/bin/ without any .exe or .com suffix, so
-    they can be discovered like they would on UNIX. If you want to find
-    a program like notepad on the $PATH using this function, then you
-    need to specify "notepad.exe" so it includes the extension.
+    We automatically suffix `.com` and `.exe` for all platforms when
+    path searching. By default, the current directory is not on the
+    path. If `prog` is an absolute path, then it's returned as-is. If
+    `prog` contains slashes then it's not path searched either and will
+    be returned if it exists.
 
   unix.execve(prog:str[, args:List<*>, env:List<*>])
       └─→ nil, unix.Errno
@@ -3675,73 +3678,23 @@ UNIX MODULE
 
     `clock` can be any one of of:
 
-    - `CLOCK_REALTIME` returns a wall clock timestamp represented in
-      nanoseconds since the UNIX epoch (~1970). It'll count time in the
-      suspend state. This clock is subject to being smeared by various
-      adjustments made by NTP. These timestamps can have unpredictable
-      discontinuous jumps when clock_settime() is used. Therefore this
-      clock is the default clock for everything, even pthread condition
-      variables. Cosmopoiltan guarantees this clock will never raise
-      `EINVAL` and also guarantees `CLOCK_REALTIME == 0` will always be
-      the case. On Windows this maps to GetSystemTimePreciseAsFileTime().
-      On platforms with vDSOs like Linux, Windows, and MacOS ARM64 this
-      should take about 20 nanoseconds.
-
-    - `CLOCK_MONOTONIC` returns a timestamp with an unspecified epoch,
-      that should be when the system was powered on. These timestamps
-      shouldn't go backwards. Timestamps shouldn't count time spent in
-      the sleep, suspend, and hibernation states. These timestamps won't
-      be impacted by clock_settime(). These timestamps may be impacted by
-      frequency adjustments made by NTP. Cosmopoiltan guarantees this
-      clock will never raise `EINVAL`. MacOS and BSDs use the word
-      "uptime" to describe this clock. On Windows this maps to
-      QueryUnbiasedInterruptTimePrecise().
-
-    - `CLOCK_BOOTTIME` is a monotonic clock returning a timestamp with an
-      unspecified epoch, that should be relative to when the host system
-      was powered on. These timestamps shouldn't go backwards. Timestamps
-      should also include time spent in a sleep, suspend, or hibernation
-      state. These timestamps aren't impacted by clock_settime(), but
-      they may be impacted by frequency adjustments made by NTP. This
-      clock will raise an `EINVAL` error on extremely old Linux distros
-      like RHEL5. MacOS and BSDs use the word "monotonic" to describe
-      this clock. On Windows this maps to QueryInterruptTimePrecise().
-
-    - `CLOCK_MONOTONIC_RAW` returns a timestamp from an unspecified
-      epoch. These timestamps don't count time spent in the sleep,
-      suspend, and hibernation states. Unlike `CLOCK_MONOTONIC` this
-      clock is guaranteed to not be impacted by frequency adjustments or
-      discontinuous jumps caused by clock_settime(). Providing this level
-      of assurances may make this clock slower than the normal monotonic
-      clock. Furthermore this clock may cause `EINVAL` to be raised if
-      running on a host system that doesn't provide those guarantees,
-      e.g. OpenBSD and MacOS on AMD64.
-
-    - `CLOCK_REALTIME_COARSE` is the same as `CLOCK_REALTIME` except
-      it'll go faster if the host OS provides a cheaper way to read the
-      wall time. Please be warned that coarse can be really coarse.
-      Rather than nano precision, you're looking at `CLK_TCK` precision,
-      which can lag as far as 30 milliseconds behind or possibly more.
-      Cosmopolitan may fallback to `CLOCK_REALTIME` if a faster less
-      accurate clock isn't provided by the system. This clock will raise
-      an `EINVAL` error on extremely old Linux distros like RHEL5.
-
-    - `CLOCK_MONOTONIC_COARSE` is the same as `CLOCK_MONOTONIC` except
-      it'll go faster if the host OS provides a cheaper way to read the
-      unbiased time. Please be warned that coarse can be really coarse.
-      Rather than nano precision, you're looking at `CLK_TCK` precision,
-      which can lag as far as 30 milliseconds behind or possibly more.
-      Cosmopolitan may fallback to `CLOCK_REALTIME` if a faster less
-      accurate clock isn't provided by the system. This clock will raise
-      an `EINVAL` error on extremely old Linux distros like RHEL5.
-
-    - `CLOCK_PROCESS_CPUTIME_ID` returns the amount of time this process
-      was actively scheduled. This is similar to getrusage() and clock().
-      Cosmopoiltan guarantees this clock will never raise `EINVAL`.
-
-    - `CLOCK_THREAD_CPUTIME_ID` returns the amount of time this thread
-      was actively scheduled. This is similar to getrusage() and clock().
-      Cosmopoiltan guarantees this clock will never raise `EINVAL`.
+    - `CLOCK_REALTIME`: universally supported
+    - `CLOCK_REALTIME_FAST`: ditto but faster on freebsd
+    - `CLOCK_REALTIME_PRECISE`: ditto but better on freebsd
+    - `CLOCK_REALTIME_COARSE`: : like `CLOCK_REALTIME_FAST` but needs Linux 2.6.32+
+    - `CLOCK_MONOTONIC`: universally supported
+    - `CLOCK_MONOTONIC_FAST`: ditto but faster on freebsd
+    - `CLOCK_MONOTONIC_PRECISE`: ditto but better on freebsd
+    - `CLOCK_MONOTONIC_COARSE`: : like `CLOCK_MONOTONIC_FAST` but needs Linux 2.6.32+
+    - `CLOCK_MONOTONIC_RAW`: is actually monotonic but needs Linux 2.6.28+
+    - `CLOCK_PROCESS_CPUTIME_ID`: linux and bsd
+    - `CLOCK_THREAD_CPUTIME_ID`: linux and bsd
+    - `CLOCK_MONOTONIC_COARSE`: linux, freebsd
+    - `CLOCK_PROF`: linux and netbsd
+    - `CLOCK_BOOTTIME`: linux and openbsd
+    - `CLOCK_REALTIME_ALARM`: linux-only
+    - `CLOCK_BOOTTIME_ALARM`: linux-only
+    - `CLOCK_TAI`: linux-only
 
     Returns `EINVAL` if clock isn't supported on platform.
 
@@ -4524,8 +4477,9 @@ UNIX MODULE
 
       If the executable in question needs a loader, then you will need
       "rpath prot_exec" too. With APE, security is strongest when you
-      assimilate your binaries beforehand using the assimilate program.
-      On OpenBSD this is mandatory.
+      assimilate your binaries beforehand, using the --assimilate flag,
+      or the o//tool/build/assimilate program. On OpenBSD this is
+      mandatory.
 
     prot_exec
 
@@ -4864,9 +4818,9 @@ UNIX MODULE
         end
 
     It's possible to accomplish the same thing as unix.mapshared()
-    using files and unix.fcntl() advisory locks. For example, that's
-    what SQLite does and we recommend using SQLite for IPC in redbean.
-    However, unix.mapshared is significantly faster and if your app
+    using files and unix.fcntl() advisory locks. However this goes
+    significantly faster. For example, that's what SQLite does and
+    we recommend using SQLite for IPC in redbean. But, if your app
     has thousands of forked processes fighting for a file lock you
     might need something lower level than file locks, to implement
     things like throttling. Shared memory is a good way to do that
diff --git a/tool/net/lfuncs.c b/tool/net/lfuncs.c
index 01b3ce19a..798099c7d 100644
--- a/tool/net/lfuncs.c
+++ b/tool/net/lfuncs.c
@@ -30,7 +30,7 @@
 #include "libc/intrin/popcnt.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
@@ -45,7 +45,7 @@
 #include "libc/str/highwayhash64.h"
 #include "libc/str/str.h"
 #include "libc/str/strwidth.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/af.h"
 #include "libc/sysv/consts/ipproto.h"
 #include "libc/sysv/consts/o.h"
diff --git a/tool/net/ljson.c b/tool/net/ljson.c
index 037cd097b..a3c73616c 100644
--- a/tool/net/ljson.c
+++ b/tool/net/ljson.c
@@ -27,7 +27,7 @@
 #include "libc/serialize.h"
 #include "libc/stdckdint.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/str/utf16.h"
 #include "libc/sysv/consts/auxv.h"
 #include "libc/thread/thread.h"
diff --git a/tool/net/lre.c b/tool/net/lre.c
index 598b5b4b6..e24648e29 100644
--- a/tool/net/lre.c
+++ b/tool/net/lre.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "third_party/lua/lauxlib.h"
 #include "third_party/regex/regex.h"
diff --git a/tool/net/redbean.c b/tool/net/redbean.c
index 93816d1aa..0eec73175 100644
--- a/tool/net/redbean.c
+++ b/tool/net/redbean.c
@@ -31,10 +31,9 @@
 #include "libc/calls/struct/termios.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/calls/termios.h"
-#include "libc/cosmo.h"
 #include "libc/ctype.h"
 #include "libc/dce.h"
-#include "libc/dos.h"
+#include "libc/dos.internal.h"
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/fmt/itoa.h"
@@ -47,7 +46,7 @@
 #include "libc/log/appendresourcereport.internal.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/alloca.h"
 #include "libc/mem/gc.h"
@@ -72,7 +71,6 @@
 #include "libc/stdio/hex.internal.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
-#include "libc/str/locale.h"
 #include "libc/str/slice.h"
 #include "libc/str/str.h"
 #include "libc/str/strwidth.h"
@@ -106,7 +104,7 @@
 #include "libc/thread/tls.h"
 #include "libc/x/x.h"
 #include "libc/x/xasprintf.h"
-#include "libc/zip.h"
+#include "libc/zip.internal.h"
 #include "net/http/escape.h"
 #include "net/http/http.h"
 #include "net/http/ip.h"
@@ -131,6 +129,7 @@
 #include "third_party/mbedtls/x509_crt.h"
 #include "third_party/musl/netdb.h"
 #include "third_party/zlib/zlib.h"
+#include "tool/args/args.h"
 #include "tool/build/lib/case.h"
 #include "tool/net/lfinger.h"
 #include "tool/net/lfuncs.h"
@@ -170,8 +169,7 @@ __static_yoink("blink_xnu_aarch64");    // is apple silicon
 #define REDBEAN "redbean"
 #endif
 
-//                         XXYYZZ
-#define VERSION          0x030000
+#define VERSION          0x020200
 #define HASH_LOAD_FACTOR /* 1. / */ 4
 #define READ(F, P, N)    readv(F, &(struct iovec){P, N}, 1)
 #define WRITE(F, P, N)   writev(F, &(struct iovec){P, N}, 1)
@@ -181,8 +179,12 @@ __static_yoink("blink_xnu_aarch64");    // is apple silicon
 #define HeaderLength(H)  (cpm.msg.headers[H].b - cpm.msg.headers[H].a)
 #define HeaderEqualCase(H, S) \
   SlicesEqualCase(S, strlen(S), HeaderData(H), HeaderLength(H))
-#define LockInc(P) atomic_fetch_add_explicit(P, +1, memory_order_relaxed)
-#define LockDec(P) atomic_fetch_add_explicit(P, -1, memory_order_relaxed)
+#define LockInc(P)                                            \
+  atomic_fetch_add_explicit((_Atomic(typeof(*(P))) *)(P), +1, \
+                            memory_order_relaxed)
+#define LockDec(P)                                            \
+  atomic_fetch_add_explicit((_Atomic(typeof(*(P))) *)(P), -1, \
+                            memory_order_relaxed)
 
 #define TRACE_BEGIN         \
   do {                      \
@@ -373,21 +375,19 @@ struct Blackhole {
 } blackhole;
 
 static struct Shared {
-  _Atomic(int) workers;
-  struct timespec lastmeltdown;
+  int workers;
   struct timespec nowish;
+  struct timespec lastreindex;
+  struct timespec lastmeltdown;
   char currentdate[32];
   struct rusage server;
   struct rusage children;
   struct Counters {
-#define C(x) _Atomic(long) x;
+#define C(x) long x;
 #include "tool/net/counters.inc"
 #undef C
   } c;
-  pthread_mutex_t datetime_mu;
-  pthread_mutex_t server_mu;
-  pthread_mutex_t children_mu;
-  pthread_mutex_t lastmeltdown_mu;
+  pthread_spinlock_t montermlock;
 } *shared;
 
 static const char kCounterNames[] =
@@ -1348,8 +1348,8 @@ static void CallSimpleHookIfDefined(const char *s) {
 }
 
 static void ReportWorkerExit(int pid, int ws) {
-  int workers =
-      atomic_fetch_sub_explicit(&shared->workers, 1, memory_order_release);
+  int workers;
+  workers = atomic_fetch_sub(&shared->workers, 1) - 1;
   if (WIFEXITED(ws)) {
     if (WEXITSTATUS(ws)) {
       LockInc(&shared->c.failedchildren);
@@ -1381,9 +1381,7 @@ static void ReportWorkerResources(int pid, struct rusage *ru) {
 
 static void HandleWorkerExit(int pid, int ws, struct rusage *ru) {
   LockInc(&shared->c.connectionshandled);
-  unassert(!pthread_mutex_lock(&shared->children_mu));
   rusage_add(&shared->children, ru);
-  unassert(!pthread_mutex_unlock(&shared->children_mu));
   ReportWorkerExit(pid, ws);
   ReportWorkerResources(pid, ru);
   if (hasonprocessdestroy) {
@@ -2129,11 +2127,9 @@ static void UpdateCurrentDate(struct timespec now) {
   int64_t t;
   struct tm tm;
   t = now.tv_sec;
-  gmtime_r(&t, &tm);
-  unassert(!pthread_mutex_lock(&shared->datetime_mu));
   shared->nowish = now;
+  gmtime_r(&t, &tm);
   FormatHttpDateTime(shared->currentdate, &tm);
-  unassert(!pthread_mutex_unlock(&shared->datetime_mu));
 }
 
 static int64_t GetGmtOffset(int64_t t) {
@@ -2278,7 +2274,7 @@ static struct Asset *GetAssetZip(const char *path, size_t pathlen) {
   hash = Hash(path, pathlen);
   for (step = 0;; ++step) {
     i = (hash + ((step * (step + 1)) >> 1)) & (assets.n - 1);
-    if (i >= assets.n || !assets.p || !assets.p[i].hash)
+    if (!assets.p[i].hash)
       return NULL;
     if (hash == assets.p[i].hash &&
         pathlen == ZIP_CFILE_NAMESIZE(zmap + assets.p[i].cf) &&
@@ -2366,10 +2362,7 @@ static char *AppendCache(char *p, int64_t seconds, char *directive) {
     p = stpcpy(p, directive);
   }
   p = AppendCrlf(p);
-  unassert(!pthread_mutex_lock(&shared->datetime_mu));
-  long nowish_sec = shared->nowish.tv_sec;
-  unassert(!pthread_mutex_unlock(&shared->datetime_mu));
-  return AppendExpires(p, nowish_sec + seconds);
+  return AppendExpires(p, shared->nowish.tv_sec + seconds);
 }
 
 static inline char *AppendContentLength(char *p, size_t n) {
@@ -2552,7 +2545,7 @@ static char *CommitOutput(char *p) {
 
 static char *ServeDefaultErrorPage(char *p, unsigned code, const char *reason,
                                    const char *details) {
-  p = AppendContentType(p, "text/html; charset=UTF-8");
+  p = AppendContentType(p, "text/html; charset=ISO-8859-1");
   reason = FreeLater(EscapeHtml(reason, -1, 0));
   appends(&cpm.outbuf, "\
 <!doctype html>\r\n\
@@ -2625,8 +2618,7 @@ static char *ServeErrorImpl(unsigned code, const char *reason,
     lua_getglobal(L, "OnError");
     lua_pushinteger(L, code);
     lua_pushstring(L, reason);
-    lua_pushstring(L, details);
-    if (LuaCallWithTrace(L, 3, 0, NULL) == LUA_OK) {
+    if (LuaCallWithTrace(L, 2, 0, NULL) == LUA_OK) {
       return CommitOutput(GetLuaResponse());
     } else {
       return ServeErrorImplDefault(code, reason, details);
@@ -3108,12 +3100,9 @@ td { padding-right: 3em; }\r\n\
 <td valign=\"top\">\r\n\
 <a href=\"/statusz\">/statusz</a>\r\n\
 ");
-  if (atomic_load_explicit(&shared->c.connectionshandled,
-                           memory_order_acquire)) {
+  if (shared->c.connectionshandled) {
     appends(&cpm.outbuf, "says your redbean<br>\r\n");
-    unassert(!pthread_mutex_lock(&shared->children_mu));
     AppendResourceReport(&cpm.outbuf, &shared->children, "<br>\r\n");
-    unassert(!pthread_mutex_unlock(&shared->children_mu));
   }
   appends(&cpm.outbuf, "<td valign=\"top\">\r\n");
   and = "";
@@ -3135,12 +3124,12 @@ td { padding-right: 3em; }\r\n\
   }
   appendf(&cpm.outbuf, "%s%,ld second%s of operation<br>\r\n", and, y.rem,
           y.rem == 1 ? "" : "s");
-  x = atomic_load_explicit(&shared->c.messageshandled, memory_order_relaxed);
+  x = shared->c.messageshandled;
   appendf(&cpm.outbuf, "%,ld message%s handled<br>\r\n", x, x == 1 ? "" : "s");
-  x = atomic_load_explicit(&shared->c.connectionshandled, memory_order_relaxed);
+  x = shared->c.connectionshandled;
   appendf(&cpm.outbuf, "%,ld connection%s handled<br>\r\n", x,
           x == 1 ? "" : "s");
-  x = atomic_load_explicit(&shared->workers, memory_order_relaxed);
+  x = shared->workers;
   appendf(&cpm.outbuf, "%,ld connection%s active<br>\r\n", x,
           x == 1 ? "" : "s");
   appends(&cpm.outbuf, "</table>\r\n");
@@ -3192,11 +3181,11 @@ static void AppendRusage(const char *a, struct rusage *ru) {
 }
 
 static void ServeCounters(void) {
-  const _Atomic(long) *c;
+  const long *c;
   const char *s;
-  for (c = (const _Atomic(long) *)&shared->c, s = kCounterNames; *s;
+  for (c = (const long *)&shared->c, s = kCounterNames; *s;
        ++c, s += strlen(s) + 1) {
-    AppendLong1(s, atomic_load_explicit(c, memory_order_relaxed));
+    AppendLong1(s, *c);
   }
 }
 
@@ -3209,17 +3198,12 @@ static char *ServeStatusz(void) {
   AppendLong1("pid", getpid());
   AppendLong1("ppid", getppid());
   AppendLong1("now", timespec_real().tv_sec);
-  unassert(!pthread_mutex_lock(&shared->datetime_mu));
   AppendLong1("nowish", shared->nowish.tv_sec);
-  unassert(!pthread_mutex_unlock(&shared->datetime_mu));
   AppendLong1("gmtoff", gmtoff);
   AppendLong1("CLK_TCK", CLK_TCK);
   AppendLong1("startserver", startserver.tv_sec);
-  unassert(!pthread_mutex_lock(&shared->lastmeltdown_mu));
   AppendLong1("lastmeltdown", shared->lastmeltdown.tv_sec);
-  unassert(!pthread_mutex_unlock(&shared->lastmeltdown_mu));
-  AppendLong1("workers",
-              atomic_load_explicit(&shared->workers, memory_order_relaxed));
+  AppendLong1("workers", shared->workers);
   AppendLong1("assets.n", assets.n);
 #ifndef STATIC
   lua_State *L = GL;
@@ -3227,12 +3211,8 @@ static char *ServeStatusz(void) {
               lua_gc(L, LUA_GCCOUNT) * 1024 + lua_gc(L, LUA_GCCOUNTB));
 #endif
   ServeCounters();
-  unassert(!pthread_mutex_lock(&shared->server_mu));
   AppendRusage("server", &shared->server);
-  unassert(!pthread_mutex_unlock(&shared->server_mu));
-  unassert(!pthread_mutex_lock(&shared->children_mu));
   AppendRusage("children", &shared->children);
-  unassert(!pthread_mutex_unlock(&shared->children_mu));
   p = SetStatus(200, "OK");
   p = AppendContentType(p, "text/plain");
   if (cpm.msg.version >= 11) {
@@ -3997,9 +3977,7 @@ static int LuaNilTlsError(lua_State *L, const char *s, int r) {
 #include "tool/net/fetch.inc"
 
 static int LuaGetDate(lua_State *L) {
-  unassert(!pthread_mutex_lock(&shared->datetime_mu));
   lua_pushinteger(L, shared->nowish.tv_sec);
-  unassert(!pthread_mutex_unlock(&shared->datetime_mu));
   return 1;
 }
 
@@ -5053,7 +5031,7 @@ static int LuaProgramTokenBucket(lua_State *L) {
   npassert(pid != -1);
   if (!pid)
     Replenisher();
-  atomic_fetch_add_explicit(&shared->workers, 1, memory_order_acquire);
+  ++shared->workers;
   return 0;
 }
 
@@ -5698,8 +5676,7 @@ static void LogClose(const char *reason) {
   if (amtread || meltdown || killed) {
     LockInc(&shared->c.fumbles);
     INFOF("(stat) %s %s with %,ld unprocessed and %,d handled (%,d workers)",
-          DescribeClient(), reason, amtread, messageshandled,
-          atomic_load_explicit(&shared->workers, memory_order_relaxed));
+          DescribeClient(), reason, amtread, messageshandled, shared->workers);
   } else {
     DEBUGF("(stat) %s %s with %,d messages handled", DescribeClient(), reason,
            messageshandled);
@@ -5757,18 +5734,14 @@ Content-Length: 22\r\n\
 }
 
 static void EnterMeltdownMode(void) {
-  unassert(!pthread_mutex_lock(&shared->lastmeltdown_mu));
   if (timespec_cmp(timespec_sub(timespec_real(), shared->lastmeltdown),
                    (struct timespec){1}) < 0) {
-    unassert(!pthread_mutex_unlock(&shared->lastmeltdown_mu));
     return;
   }
-  shared->lastmeltdown = timespec_real();
-  pthread_mutex_unlock(&shared->lastmeltdown_mu);
-  WARNF("(srvr) server is melting down (%,d workers)",
-        atomic_load_explicit(&shared->workers, memory_order_relaxed));
+  WARNF("(srvr) server is melting down (%,d workers)", shared->workers);
   LOGIFNEG1(kill(0, SIGUSR2));
-  LockInc(&shared->c.meltdowns);
+  shared->lastmeltdown = timespec_real();
+  ++shared->c.meltdowns;
 }
 
 static char *HandlePayloadDisconnect(void) {
@@ -5885,9 +5858,7 @@ static void HandleHeartbeat(void) {
   size_t i;
   UpdateCurrentDate(timespec_real());
   Reindex();
-  unassert(!pthread_mutex_lock(&shared->server_mu));
   getrusage(RUSAGE_SELF, &shared->server);
-  unassert(!pthread_mutex_unlock(&shared->server_mu));
 #ifndef STATIC
   CallSimpleHookIfDefined("OnServerHeartbeat");
   CollectGarbage();
@@ -6507,9 +6478,7 @@ static bool HandleMessageActual(void) {
     DEBUGF("(clnt) could not synchronize message stream");
   }
   if (cpm.msg.version >= 10) {
-    unassert(!pthread_mutex_lock(&shared->datetime_mu));
     p = AppendCrlf(stpcpy(stpcpy(p, "Date: "), shared->currentdate));
-    unassert(!pthread_mutex_unlock(&shared->datetime_mu));
     if (!cpm.branded)
       p = stpcpy(p, serverheader);
     if (extrahdrs)
@@ -6779,9 +6748,7 @@ static int HandleConnection(size_t i) {
       DEBUGF("(token) can't acquire accept() token for client");
     }
     startconnection = timespec_real();
-    if (UNLIKELY(maxworkers) &&
-        atomic_load_explicit(&shared->workers, memory_order_relaxed) >=
-            maxworkers) {
+    if (UNLIKELY(maxworkers) && shared->workers >= maxworkers) {
       EnterMeltdownMode();
       SendServiceUnavailable();
       close(client);
@@ -6799,8 +6766,6 @@ static int HandleConnection(size_t i) {
     } else {
       switch ((pid = fork())) {
         case 0:
-          lua_repl_wock();
-          lua_repl_lock();
           meltdown = false;
           __isworker = true;
           connectionclose = false;
@@ -7378,14 +7343,6 @@ void RedBean(int argc, char *argv[]) {
            (shared = mmap(NULL, ROUNDUP(sizeof(struct Shared), getgransize()),
                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
                           -1, 0)));
-  pthread_mutexattr_t attr;
-  unassert(!pthread_mutexattr_init(&attr));
-  unassert(!pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED));
-  unassert(!pthread_mutex_init(&shared->datetime_mu, &attr));
-  unassert(!pthread_mutex_init(&shared->server_mu, &attr));
-  unassert(!pthread_mutex_init(&shared->children_mu, &attr));
-  unassert(!pthread_mutex_init(&shared->lastmeltdown_mu, &attr));
-  unassert(!pthread_mutexattr_destroy(&attr));
   if (daemonize) {
     for (int i = 0; i < 256; ++i) {
       close(i);
@@ -7471,9 +7428,6 @@ int main(int argc, char *argv[]) {
   ShowCrashReports();
 #endif
 
-  // just in case
-  setlocale(LC_ALL, "C.UTF-8");
-
   LoadZipArgs(&argc, &argv);
   RedBean(argc, argv);
 
diff --git a/tool/net/winbench.c b/tool/net/winbench.c
index f54547d97..ea8063709 100644
--- a/tool/net/winbench.c
+++ b/tool/net/winbench.c
@@ -16,7 +16,7 @@
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nt/accounting.h"
 #include "libc/nt/enum/wsaid.h"
diff --git a/tool/plinko/lib/gc.c b/tool/plinko/lib/gc.c
index 33e320eb4..08ef948a8 100644
--- a/tool/plinko/lib/gc.c
+++ b/tool/plinko/lib/gc.c
@@ -24,7 +24,7 @@
 #include "libc/log/check.h"
 #include "libc/log/countbranch.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "tool/plinko/lib/cons.h"
diff --git a/tool/plinko/lib/histo.h b/tool/plinko/lib/histo.h
index 10025713e..4cdc63904 100644
--- a/tool/plinko/lib/histo.h
+++ b/tool/plinko/lib/histo.h
@@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_TOOL_PLINKO_LIB_HISTO_H_
 #define COSMOPOLITAN_TOOL_PLINKO_LIB_HISTO_H_
 #include "libc/intrin/bsr.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 COSMOPOLITAN_C_START_
 
 #define HISTO(H, X)                \
diff --git a/tool/plinko/lib/iswide.c b/tool/plinko/lib/iswide.c
index 087a21df3..f458bfad5 100644
--- a/tool/plinko/lib/iswide.c
+++ b/tool/plinko/lib/iswide.c
@@ -16,12 +16,327 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/unicode.h"
+#include "libc/macros.internal.h"
 #include "tool/plinko/lib/char.h"
 
-int GetMonospaceCharacterWidth(int c) {
-  int w = wcwidth(c);
-  if (w < 0)
-    w = 0;
-  return w;
+static const unsigned short kWides[][2] = {
+    {0x1100, 0x115F},  // HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER
+    {0x231A, 0x231B},  // WATCH..HOURGLASS
+    {0x2329, 0x2329},  // LEFT-POINTING ANGLE BRACKET
+    {0x232A, 0x232A},  // RIGHT-POINTING ANGLE BRACKET
+    {0x23E9, 0x23EC},  // BLACK RIGHT-POINTING DOUBLE TRIANGLE...
+    {0x23F0, 0x23F0},  // ALARM CLOCK
+    {0x23F3, 0x23F3},  // HOURGLASS WITH FLOWING SAND
+    {0x25FD, 0x25FE},  // WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE
+    {0x2614, 0x2615},  // UMBRELLA WITH RAIN DROPS..HOT BEVERAGE
+    {0x2648, 0x2653},  // ARIES..PISCES
+    {0x267F, 0x267F},  // WHEELCHAIR SYMBOL
+    {0x2693, 0x2693},  // ANCHOR
+    {0x26A1, 0x26A1},  // HIGH VOLTAGE SIGN
+    {0x26AA, 0x26AB},  // MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE
+    {0x26BD, 0x26BE},  // SOCCER BALL..BASEBALL
+    {0x26C4, 0x26C5},  // SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD
+    {0x26CE, 0x26CE},  // OPHIUCHUS
+    {0x26D4, 0x26D4},  // NO ENTRY
+    {0x26EA, 0x26EA},  // CHURCH
+    {0x26F2, 0x26F3},  // FOUNTAIN..FLAG IN HOLE
+    {0x26F5, 0x26F5},  // SAILBOAT
+    {0x26FA, 0x26FA},  // TENT
+    {0x26FD, 0x26FD},  // FUEL PUMP
+    {0x2705, 0x2705},  // WHITE HEAVY CHECK MARK
+    {0x270A, 0x270B},  // RaiseD FIST..RaiseD HAND
+    {0x2728, 0x2728},  // SPARKLES
+    {0x274C, 0x274C},  // CROSS MARK
+    {0x274E, 0x274E},  // NEGATIVE SQUARED CROSS MARK
+    {0x2753, 0x2755},  // BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK
+    {0x2757, 0x2757},  // HEAVY EXCLAMATION MARK SYMBOL
+    {0x2795, 0x2797},  // HEAVY PLUS SIGN..HEAVY DIVISION SIGN
+    {0x27B0, 0x27B0},  // CURLY LOOP
+    {0x27BF, 0x27BF},  // DOUBLE CURLY LOOP
+    {0x2B1B, 0x2B1C},  // BLACK LARGE SQUARE..WHITE LARGE SQUARE
+    {0x2B50, 0x2B50},  // WHITE MEDIUM STAR
+    {0x2B55, 0x2B55},  // HEAVY LARGE CIRCLE
+    {0x2E80, 0x2E99},  // CJK RADICAL REPEAT..CJK RADICAL RAP
+    {0x2E9B, 0x2EF3},  // CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE
+    {0x2F00, 0x2FD5},  // KANGXI RADICAL ONE..KANGXI RADICAL FLUTE
+    {0x2FF0, 0x2FFB},  // IDEOGRAPHIC DESCRIPTION CHARACTER LTR..OVERLAID
+    {0x3000, 0x3000},  // IDEOGRAPHIC SPACE
+    {0x3001, 0x3003},  // IDEOGRAPHIC COMMA..DITTO MARK
+    {0x3004, 0x3004},  // JAPANESE INDUSTRIAL STANDARD SYMBOL
+    {0x3005, 0x3005},  // IDEOGRAPHIC ITERATION MARK
+    {0x3006, 0x3006},  // IDEOGRAPHIC CLOSING MARK
+    {0x3007, 0x3007},  // IDEOGRAPHIC NUMBER ZERO
+    {0x3008, 0x3008},  // LEFT ANGLE BRACKET
+    {0x3009, 0x3009},  // RIGHT ANGLE BRACKET
+    {0x300A, 0x300A},  // LEFT DOUBLE ANGLE BRACKET
+    {0x300B, 0x300B},  // RIGHT DOUBLE ANGLE BRACKET
+    {0x300C, 0x300C},  // LEFT CORNER BRACKET
+    {0x300D, 0x300D},  // RIGHT CORNER BRACKET
+    {0x300E, 0x300E},  // LEFT WHITE CORNER BRACKET
+    {0x300F, 0x300F},  // RIGHT WHITE CORNER BRACKET
+    {0x3010, 0x3010},  // LEFT BLACK LENTICULAR BRACKET
+    {0x3011, 0x3011},  // RIGHT BLACK LENTICULAR BRACKET
+    {0x3012, 0x3013},  // POSTAL MARK..GETA MARK
+    {0x3014, 0x3014},  // LEFT TORTOISE SHELL BRACKET
+    {0x3015, 0x3015},  // RIGHT TORTOISE SHELL BRACKET
+    {0x3016, 0x3016},  // LEFT WHITE LENTICULAR BRACKET
+    {0x3017, 0x3017},  // RIGHT WHITE LENTICULAR BRACKET
+    {0x3018, 0x3018},  // LEFT WHITE TORTOISE SHELL BRACKET
+    {0x3019, 0x3019},  // RIGHT WHITE TORTOISE SHELL BRACKET
+    {0x301A, 0x301A},  // LEFT WHITE SQUARE BRACKET
+    {0x301B, 0x301B},  // RIGHT WHITE SQUARE BRACKET
+    {0x301C, 0x301C},  // WAVE DASH
+    {0x301D, 0x301D},  // REVERSED DOUBLE PRIME QUOTATION MARK
+    {0x301E, 0x301F},  // DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME
+    {0x3020, 0x3020},  // POSTAL MARK FACE
+    {0x3021, 0x3029},  // HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
+    {0x302A, 0x302D},  // IDEOGRAPHIC LEVEL TONE MARK..ENTERING TONE MARK
+    {0x302E, 0x302F},  // HANGUL SINGLE DOT TONE MARK..DOUBLE DOT TONE MARK
+    {0x3030, 0x3030},  // WAVY DASH
+    {0x3031, 0x3035},  // VERTICAL KANA REPEAT MARK..KANA REPEAT MARK LOWER
+    {0x3036, 0x3037},  // CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LF SYMBOL
+    {0x3038, 0x303A},  // HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
+    {0x303B, 0x303B},  // VERTICAL IDEOGRAPHIC ITERATION MARK
+    {0x303C, 0x303C},  // MASU MARK
+    {0x303D, 0x303D},  // PART ALTERNATION MARK
+    {0x303E, 0x303E},  // IDEOGRAPHIC VARIATION INDICATOR
+    {0x3041, 0x3096},  // HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
+    {0x3099, 0x309A},  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK...
+    {0x309B, 0x309C},  // KATAKANA-HIRAGANA VOICED SOUND MARK...
+    {0x309D, 0x309E},  // HIRAGANA ITERATION MARK..VOICED ITERATION MARK
+    {0x309F, 0x309F},  // HIRAGANA DIGRAPH YORI
+    {0x30A0, 0x30A0},  // KATAKANA-HIRAGANA DOUBLE HYPHEN
+    {0x30A1, 0x30FA},  // KATAKANA LETTER SMALL A..KATAKANA LETTER VO
+    {0x30FB, 0x30FB},  // KATAKANA MIDDLE DOT
+    {0x30FC, 0x30FE},  // KATAKANA-HIRAGANA PROLONGED SOUND MARK..ITERATION
+    {0x30FF, 0x30FF},  // KATAKANA DIGRAPH KOTO
+    {0x3105, 0x312F},  // BOPOMOFO LETTER B..BOPOMOFO LETTER NN
+    {0x3131, 0x318E},  // HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
+    {0x3190, 0x3191},  // IDEOGRAPHIC ANNOTATION LINKING MARK..REVERSE
+    {0x3192, 0x3195},  // IDEOGRAPHIC ANNOTATION ONE MARK..FOUR
+    {0x3196, 0x319F},  // IDEOGRAPHIC ANNOTATION TOP MARK..MAN
+    {0x31A0, 0x31BF},  // BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
+    {0x31C0, 0x31E3},  // CJK STROKE T..CJK STROKE Q
+    {0x31F0, 0x31FF},  // KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
+    {0x3200, 0x321E},  // PARENTHESIZED HANGUL KIYEOK..CHARACTER O HU
+    {0x3220, 0x3229},  // PARENTHESIZED IDEOGRAPH ONE..TEN
+    {0x322A, 0x3247},  // PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
+    {0x3250, 0x3250},  // PARTNERSHIP SIGN
+    {0x3251, 0x325F},  // CIRCLED NUMBER TWENTY ONE..CIRCLED 35
+    {0x3260, 0x327F},  // CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL
+    {0x3280, 0x3289},  // CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
+    {0x328A, 0x32B0},  // CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
+    {0x32B1, 0x32BF},  // CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
+    {0x32C0, 0x32FF},  // TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA
+    {0x3300, 0x33FF},  // SQUARE APAATO..SQUARE GAL
+    {0x3400, 0x4DBF},  // CJK UNIFIED IDEOGRAPH
+    {0x4E00, 0x9FFF},  // CJK UNIFIED IDEOGRAPH
+    {0xA000, 0xA014},  // YI SYLLABLE IT..YI SYLLABLE E
+    {0xA015, 0xA015},  // YI SYLLABLE WU
+    {0xA016, 0xA48C},  // YI SYLLABLE BIT..YI SYLLABLE YYR
+    {0xA490, 0xA4C6},  // YI RADICAL QOT..YI RADICAL KE
+    {0xA960, 0xA97C},  // HANGUL CHOSEONG TIKEUT-MIEUM..SSANGYEORINHIEUH
+    {0xAC00, 0xD7A3},  // HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
+    {0xF900, 0xFA6D},  // CJK COMPATIBILITY IDEOGRAPH
+    {0xFA6E, 0xFA6F},  // RESERVED
+    {0xFA70, 0xFAD9},  // CJK COMPATIBILITY IDEOGRAPH
+    {0xFADA, 0xFAFF},  // RESERVED
+    {0xFE10, 0xFE16},  // PRESENTATION FORM FOR VERTICAL COMMA..QUESTION
+    {0xFE17, 0xFE17},  // VERTICAL LEFT WHITE LENTICULAR BRACKET
+    {0xFE18, 0xFE18},  // VERTICAL RIGHT WHITE LENTICULAR BRAKCET
+    {0xFE19, 0xFE19},  // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
+    {0xFE30, 0xFE30},  // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
+    {0xFE31, 0xFE32},  // VERTICAL EM DASH..VERTICAL EN DASH
+    {0xFE33, 0xFE34},  // VERTICAL LOW LINE..VERTICAL WAVY LOW LINE
+    {0xFE35, 0xFE35},  // PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
+    {0xFE36, 0xFE36},  // PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
+    {0xFE37, 0xFE37},  // PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
+    {0xFE38, 0xFE38},  // PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
+    {0xFE39, 0xFE39},  // VERTICAL LEFT TORTOISE SHELL BRACKET
+    {0xFE3A, 0xFE3A},  // VERTICAL RIGHT TORTOISE SHELL BRACKET
+    {0xFE3B, 0xFE3B},  // VERTICAL LEFT BLACK LENTICULAR BRACKET
+    {0xFE3C, 0xFE3C},  // VERTICAL RIGHT BLACK LENTICULAR BRACKET
+    {0xFE3D, 0xFE3D},  // VERTICAL LEFT DOUBLE ANGLE BRACKET
+    {0xFE3E, 0xFE3E},  // VERTICAL RIGHT DOUBLE ANGLE BRACKET
+    {0xFE3F, 0xFE3F},  // VERTICAL LEFT ANGLE BRACKET
+    {0xFE40, 0xFE40},  // VERTICAL RIGHT ANGLE BRACKET
+    {0xFE41, 0xFE41},  // VERTICAL LEFT CORNER BRACKET
+    {0xFE42, 0xFE42},  // VERTICAL RIGHT CORNER BRACKET
+    {0xFE43, 0xFE43},  // VERTICAL LEFT WHITE CORNER BRACKET
+    {0xFE44, 0xFE44},  // VERTICAL RIGHT WHITE CORNER BRACKET
+    {0xFE45, 0xFE46},  // SESAME DOT..WHITE SESAME DOT
+    {0xFE47, 0xFE47},  // VERTICAL LEFT SQUARE BRACKET
+    {0xFE48, 0xFE48},  // VERTICAL RIGHT SQUARE BRACKET
+    {0xFE49, 0xFE4C},  // DASHED OVERLINE..DOUBLE WAVY OVERLINE
+    {0xFE4D, 0xFE4F},  // DASHED LOW LINE..WAVY LOW LINE
+    {0xFE50, 0xFE52},  // SMALL COMMA..SMALL FULL STOP
+    {0xFE54, 0xFE57},  // SMALL SEMICOLON..SMALL EXCLAMATION MARK
+    {0xFE58, 0xFE58},  // SMALL EM DASH
+    {0xFE59, 0xFE59},  // SMALL LEFT PARENTHESIS
+    {0xFE5A, 0xFE5A},  // SMALL RIGHT PARENTHESIS
+    {0xFE5B, 0xFE5B},  // SMALL LEFT CURLY BRACKET
+    {0xFE5C, 0xFE5C},  // SMALL RIGHT CURLY BRACKET
+    {0xFE5D, 0xFE5D},  // SMALL LEFT TORTOISE SHELL BRACKET
+    {0xFE5E, 0xFE5E},  // SMALL RIGHT TORTOISE SHELL BRACKET
+    {0xFE5F, 0xFE61},  // SMALL NUMBER SIGN..SMALL ASTERISK
+    {0xFE62, 0xFE62},  // SMALL PLUS SIGN
+    {0xFE63, 0xFE63},  // SMALL HYPHEN-MINUS
+    {0xFE64, 0xFE66},  // SMALL LESS-THAN SIGN..SMALL EQUALS SIGN
+    {0xFE68, 0xFE68},  // SMALL REVERSE SOLIDUS
+    {0xFE69, 0xFE69},  // SMALL DOLLAR SIGN
+    {0xFE6A, 0xFE6B},  // SMALL PERCENT SIGN..SMALL COMMERCIAL AT
+    {0xFF01, 0xFF03},  // EXCLAMATION MARK..NUMBER SIGN
+    {0xFF04, 0xFF04},  // DOLLAR SIGN
+    {0xFF05, 0xFF07},  // PERCENT SIGN..APOSTROPHE
+    {0xFF08, 0xFF08},  // LEFT PARENTHESIS
+    {0xFF09, 0xFF09},  // RIGHT PARENTHESIS
+    {0xFF0A, 0xFF0A},  // ASTERISK
+    {0xFF0B, 0xFF0B},  // PLUS SIGN
+    {0xFF0C, 0xFF0C},  // COMMA
+    {0xFF0D, 0xFF0D},  // HYPHEN-MINUS
+    {0xFF0E, 0xFF0F},  // FULL STOP..SOLIDUS
+    {0xFF10, 0xFF19},  // DIGIT ZERO..DIGIT NINE
+    {0xFF1A, 0xFF1B},  // COLON..SEMICOLON
+    {0xFF1C, 0xFF1E},  // LESS-THAN..GREATER-THAN
+    {0xFF1F, 0xFF20},  // QUESTION MARK..COMMERCIAL AT
+    {0xFF21, 0xFF3A},  // LATIN CAPITAL LETTER A..Z
+    {0xFF3B, 0xFF3B},  // LEFT SQUARE BRACKET
+    {0xFF3C, 0xFF3C},  // REVERSE SOLIDUS
+    {0xFF3D, 0xFF3D},  // RIGHT SQUARE BRACKET
+    {0xFF3E, 0xFF3E},  // CIRCUMFLEX ACCENT
+    {0xFF3F, 0xFF3F},  // LOW LINE
+    {0xFF40, 0xFF40},  // GRAVE ACCENT
+    {0xFF41, 0xFF5A},  // LATIN SMALL LETTER A..Z
+    {0xFF5B, 0xFF5B},  // LEFT CURLY BRACKET
+    {0xFF5C, 0xFF5C},  // VERTICAL LINE
+    {0xFF5D, 0xFF5D},  // RIGHT CURLY BRACKET
+    {0xFF5E, 0xFF5E},  // TILDE
+    {0xFF5F, 0xFF5F},  // LEFT WHITE PARENTHESIS
+    {0xFF60, 0xFF60},  // RIGHT WHITE PARENTHESIS
+    {0xFFE0, 0xFFE1},  // CENT SIGN..POUND SIGN
+    {0xFFE2, 0xFFE2},  // NOT SIGN
+    {0xFFE3, 0xFFE3},  // MACRON
+    {0xFFE4, 0xFFE4},  // BROKEN BAR
+    {0xFFE5, 0xFFE6},  // YEN SIGN..WON SIGN
+};
+
+static const int kAstralWides[][2] = {
+    {0x16FE0, 0x16FE1},  // TANGUT ITERATION MARK..NUSHU ITERATION MARK
+    {0x16FE2, 0x16FE2},  // OLD CHINESE HOOK MARK
+    {0x16FE3, 0x16FE3},  // OLD CHINESE ITERATION MARK
+    {0x16FE4, 0x16FE4},  // KHITAN SMALL SCRIPT FILLER
+    {0x16FF0, 0x16FF1},  // VIETNAMESE ALTERNATE READING MARK CA..NHAY
+    {0x17000, 0x187F7},  // TANGUT IDEOGRAPH
+    {0x18800, 0x18AFF},  // TANGUT COMPONENT
+    {0x18B00, 0x18CD5},  // KHITAN SMALL SCRIPT CHARACTER
+    {0x18D00, 0x18D08},  // TANGUT IDEOGRAPH
+    {0x1AFF0, 0x1AFF3},  // KATAKANA LETTER MINNAN TONE-2..5
+    {0x1AFF5, 0x1AFFB},  // KATAKANA LETTER MINNAN TONE-7..5
+    {0x1AFFD, 0x1AFFE},  // KATAKANA LETTER MINNAN NASALIZED TONE-7..8
+    {0x1B000, 0x1B0FF},  // KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
+    {0x1B100, 0x1B122},  // HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
+    {0x1B150, 0x1B152},  // HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
+    {0x1B164, 0x1B167},  // KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
+    {0x1B170, 0x1B2FB},  // NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
+    {0x1F004, 0x1F004},  // MAHJONG TILE RED DRAGON
+    {0x1F0CF, 0x1F0CF},  // PLAYING CARD BLACK JOKER
+    {0x1F18E, 0x1F18E},  // NEGATIVE SQUARED AB
+    {0x1F191, 0x1F19A},  // SQUARED CL..SQUARED VS
+    {0x1F200, 0x1F202},  // SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA
+    {0x1F210, 0x1F23B},  // SQUARED CJK UNIFIED IDEOGRAPH
+    {0x1F240, 0x1F248},  // TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH
+    {0x1F250, 0x1F251},  // CIRCLED IDEOGRAPH ADVANTAGE..ACCEPT
+    {0x1F260, 0x1F265},  // ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
+    {0x1F300, 0x1F320},  // CYCLONE..SHOOTING STAR
+    {0x1F32D, 0x1F335},  // HOT DOG..CACTUS
+    {0x1F337, 0x1F37C},  // TULIP..BABY BOTTLE
+    {0x1F37E, 0x1F393},  // BOTTLE WITH POPPING CORK..GRADUATION CAP
+    {0x1F3A0, 0x1F3CA},  // CAROUSEL HORSE..SWIMMER
+    {0x1F3CF, 0x1F3D3},  // CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL
+    {0x1F3E0, 0x1F3F0},  // HOUSE BUILDING..EUROPEAN CASTLE
+    {0x1F3F4, 0x1F3F4},  // WAVING BLACK FLAG
+    {0x1F3F8, 0x1F3FA},  // BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA
+    {0x1F3FB, 0x1F3FF},  // EMOJI MODIFIER FITZPATRICK TYPE-1-2..6
+    {0x1F400, 0x1F43E},  // RAT..PAW PRINTS
+    {0x1F440, 0x1F440},  // EYES
+    {0x1F442, 0x1F4FC},  // EAR..VIDEOCASSETTE
+    {0x1F4FF, 0x1F53D},  // PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE
+    {0x1F54B, 0x1F54E},  // KAABA..MENORAH WITH NINE BRANCHES
+    {0x1F550, 0x1F567},  // CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
+    {0x1F57A, 0x1F57A},  // MAN DANCING
+    {0x1F595, 0x1F596},  // REVERSED HAND WITH MIDDLE FINGER EXTENDED..FINGERS
+    {0x1F5A4, 0x1F5A4},  // BLACK HEART
+    {0x1F5FB, 0x1F5FF},  // MOUNT FUJI..MOYAI
+    {0x1F600, 0x1F64F},  // GRINNING FACE..PERSON WITH FOLDED HANDS
+    {0x1F680, 0x1F6C5},  // ROCKET..LEFT LUGGAGE
+    {0x1F6CC, 0x1F6CC},  // SLEEPING ACCOMMODATION
+    {0x1F6D0, 0x1F6D2},  // PLACE OF WORSHIP..SHOPPING TROLLEY
+    {0x1F6D5, 0x1F6D7},  // HINDU TEMPLE..ELEVATOR
+    {0x1F6DD, 0x1F6DF},  // PLAYGROUND SLIDE..RING BUOY
+    {0x1F6EB, 0x1F6EC},  // AIRPLANE DEPARTURE..AIRPLANE ARRIVING
+    {0x1F6F4, 0x1F6FC},  // SCOOTER..ROLLER SKATE
+    {0x1F7E0, 0x1F7EB},  // LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
+    {0x1F7F0, 0x1F7F0},  // HEAVY EQUALS SIGN
+    {0x1F90C, 0x1F93A},  // PINCHED FINGERS..FENCER
+    {0x1F93C, 0x1F945},  // WRESTLERS..GOAL NET
+    {0x1F947, 0x1F9FF},  // FIRST PLACE MEDAL..NAZAR AMULET
+    {0x1FA70, 0x1FA74},  // BALLET SHOES..THONG SANDAL
+    {0x1FA78, 0x1FA7C},  // DROP OF BLOOD..CRUTCH
+    {0x1FA80, 0x1FA86},  // YO-YO..NESTING DOLLS
+    {0x1FA90, 0x1FAAC},  // RINGED PLANET..HAMSA
+    {0x1FAB0, 0x1FABA},  // FLY..NEST WITH EGGS
+    {0x1FAC0, 0x1FAC5},  // ANATOMICAL HEART..PERSON WITH CROWN
+    {0x1FAD0, 0x1FAD9},  // BLUEBERRIES..JAR
+    {0x1FAE0, 0x1FAE7},  // MELTING FACE..BUBBLES
+    {0x1FAF0, 0x1FAF6},  // HAND WITH INDEX FINGER THUMB CROSSED..HEART HANDS
+    {0x20000, 0x2A6DF},  // CJK UNIFIED IDEOGRAPH
+    {0x2A6E0, 0x2A6FF},  // RESERVED
+    {0x2A700, 0x2B738},  // CJK UNIFIED IDEOGRAPH
+    {0x2B739, 0x2B73F},  // RESERVED
+    {0x2B740, 0x2B81D},  // CJK UNIFIED IDEOGRAPH
+    {0x2B81E, 0x2B81F},  // RESERVED
+    {0x2B820, 0x2CEA1},  // CJK UNIFIED IDEOGRAPH
+    {0x2CEA2, 0x2CEAF},  // RESERVED
+    {0x2CEB0, 0x2EBE0},  // CJK UNIFIED IDEOGRAPH
+    {0x2EBE1, 0x2F7FF},  // RESERVED
+    {0x2F800, 0x2FA1D},  // CJK COMPATIBILITY IDEOGRAPH
+    {0x2FA1E, 0x2FA1F},  // RESERVED
+    {0x2FA20, 0x2FFFD},  // RESERVED
+    {0x30000, 0x3134A},  // CJK UNIFIED IDEOGRAPH
+    {0x3134B, 0x3FFFD},  // RESERVED
+};
+
+pureconst bool IsWide(int c) {
+  int m, l, r, n;
+  if (c < 0x1100) {
+    return false;
+  } else if (c < 0x10000) {
+    l = 0;
+    r = n = sizeof(kWides) / sizeof(kWides[0]);
+    while (l < r) {
+      m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+      if (kWides[m][1] < c) {
+        l = m + 1;
+      } else {
+        r = m;
+      }
+    }
+    return l < n && kWides[l][0] <= c && c <= kWides[l][1];
+  } else {
+    l = 0;
+    r = n = sizeof(kAstralWides) / sizeof(kAstralWides[0]);
+    while (l < r) {
+      m = (l & r) + ((l ^ r) >> 1);  // floor((a+b)/2)
+      if (kAstralWides[m][1] < c) {
+        l = m + 1;
+      } else {
+        r = m;
+      }
+    }
+    return l < n && kAstralWides[l][0] <= c && c <= kAstralWides[l][1];
+  }
+}
+
+pureconst int GetMonospaceCharacterWidth(int c) {
+  return !IsControl(c) + IsWide(c);
 }
diff --git a/tool/plinko/lib/plinko.c b/tool/plinko/lib/plinko.c
index 00bb69622..181c3b838 100644
--- a/tool/plinko/lib/plinko.c
+++ b/tool/plinko/lib/plinko.c
@@ -26,7 +26,7 @@
 #include "libc/log/countbranch.h"
 #include "libc/log/countexpr.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nexgen32e/rdtsc.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/stack.h"
diff --git a/tool/scripts/flakes b/tool/scripts/flakes
deleted file mode 100755
index b054a336a..000000000
--- a/tool/scripts/flakes
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env python3
-import os
-import sys
-import subprocess
-import concurrent.futures
-from collections import Counter
-from typing import List, Dict, Tuple
-
-NUM_PARALLEL = int(os.cpu_count() * 20)
-
-def find_test_files(root: str) -> List[str]:
-    """Find all executable files ending with _test recursively."""
-    test_files = []
-    if os.path.isdir(root):
-        for root, _, files in os.walk(root):
-            for file in files:
-                if file.endswith('_test'):
-                    file_path = os.path.join(root, file)
-                    if os.access(file_path, os.X_OK):
-                        test_files.append(file_path)
-    elif root.endswith('_test'):
-        test_files.append(root)
-    return test_files
-
-def run_single_test(test_path: str) -> int:
-    """Run a single test and return its exit code."""
-    try:
-        result = subprocess.run(["ape", test_path], capture_output=False)
-        return result.returncode
-    except Exception as e:
-        print(f"Error running {test_path}: {e}")
-        return -1
-
-def run_test_multiple_times(test_path: str, iterations: int = NUM_PARALLEL) -> List[int]:
-    """Run a test multiple times in parallel and collect exit codes."""
-    with concurrent.futures.ProcessPoolExecutor() as executor:
-        futures = [executor.submit(run_single_test, test_path) for _ in range(iterations)]
-        return [f.result() for f in concurrent.futures.as_completed(futures)]
-
-def analyze_results(test_path: str, exit_codes: List[int]) -> Tuple[bool, Dict[int, int]]:
-    """Analyze test results and return if it flaked and error distribution."""
-    error_counts = Counter(code for code in exit_codes if code != 0)
-    return bool(error_counts), dict(error_counts)
-
-def print_flaky_report(test_path: str, error_distribution: Dict[int, int], total_runs: int):
-    """Print a report for a flaky test."""
-    print(f"{test_path} flaked!")
-    for exit_code, count in error_distribution.items():
-        print(f"* {count}/{total_runs} processes died with exit code {exit_code}")
-
-def main(directory = "o"):
-    test_files = find_test_files(directory)
-    for i, test_path in enumerate(test_files):
-        print("testing [%d/%d] %s..." % (i, len(test_files), test_path))
-        sys.stdout.flush()
-        exit_codes = run_test_multiple_times(test_path)
-        is_flaky, error_distribution = analyze_results(test_path, exit_codes)
-        if is_flaky:
-            print_flaky_report(test_path, error_distribution, len(exit_codes))
-            sys.exit(1)
-
-if __name__ == "__main__":
-    main(*sys.argv[1:])
diff --git a/tool/viz/BUILD.mk b/tool/viz/BUILD.mk
index 5e2ced87d..a087fbf8e 100644
--- a/tool/viz/BUILD.mk
+++ b/tool/viz/BUILD.mk
@@ -16,7 +16,6 @@ TOOL_VIZ_BINS =						\
 	$(TOOL_VIZ_COMS:%=%.dbg)
 
 TOOL_VIZ_DIRECTDEPS =					\
-	DSP_AUDIO					\
 	DSP_CORE					\
 	DSP_MPEG					\
 	DSP_SCALE					\
@@ -28,7 +27,6 @@ TOOL_VIZ_DIRECTDEPS =					\
 	LIBC_MEM					\
 	LIBC_NEXGEN32E					\
 	LIBC_NT_COMDLG32				\
-	LIBC_NT_NTDLL					\
 	LIBC_NT_GDI32					\
 	LIBC_NT_KERNEL32				\
 	LIBC_NT_USER32					\
diff --git a/tool/viz/bin2asm.c b/tool/viz/bin2asm.c
index 99c785e5e..37496fd93 100644
--- a/tool/viz/bin2asm.c
+++ b/tool/viz/bin2asm.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 #define COLS 8
 
diff --git a/tool/viz/bing.c b/tool/viz/bing.c
index 750d0f032..1ded5fa49 100644
--- a/tool/viz/bing.c
+++ b/tool/viz/bing.c
@@ -22,7 +22,7 @@
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/sysv/consts/ex.h"
 #include "libc/sysv/consts/exit.h"
 #include "libc/sysv/consts/fileno.h"
diff --git a/tool/viz/clock_nanosleep_accuracy.c b/tool/viz/clock_nanosleep_accuracy.c
index 6a8e162d0..b9a099fe7 100644
--- a/tool/viz/clock_nanosleep_accuracy.c
+++ b/tool/viz/clock_nanosleep_accuracy.c
@@ -16,50 +16,46 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include <assert.h>
-#include <stdio.h>
-#include <time.h>
 #include "libc/assert.h"
-#include "libc/dce.h"
-#include "libc/nt/enum/processcreationflags.h"
-#include "libc/nt/enum/status.h"
-#include "libc/nt/enum/threadpriority.h"
-#include "libc/nt/ntdll.h"
-#include "libc/nt/process.h"
-#include "libc/nt/runtime.h"
-#include "libc/nt/thread.h"
-#include "libc/nt/windows.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/intrin/kprintf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/clock.h"
 
-#define MAXIMUM    1e8
+#define MAXIMUM    1e9
 #define ITERATIONS 10
 
-const char *MyDescribeClockName(int clock) {
-  if (clock == CLOCK_REALTIME)
-    return "CLOCK_REALTIME";
-  if (clock == CLOCK_MONOTONIC)
-    return "CLOCK_MONOTONIC";
-  if (clock == CLOCK_BOOTTIME)
-    return "CLOCK_BOOTTIME";
-  if (clock == CLOCK_REALTIME_COARSE)
-    return "CLOCK_REALTIME_COARSE";
-  if (clock == CLOCK_MONOTONIC_COARSE)
-    return "CLOCK_MONOTONIC_COARSE";
-  __builtin_trap();
-}
-
-void TestSleepRelative(int clock) {
+void TestSleepRealRelative(void) {
   printf("\n");
-  printf("testing: clock_nanosleep(%s) with relative timeout\n",
-         MyDescribeClockName(clock));
-  for (long nanos = 1; nanos < (long)MAXIMUM; nanos *= 4) {
+  printf("testing: clock_nanosleep(CLOCK_REALTIME) with relative "
+         "timeout\n");
+  for (long nanos = 1; nanos < (long)MAXIMUM; nanos *= 2) {
     struct timespec t1, t2, wf;
     wf = timespec_fromnanos(nanos);
-    if (clock_gettime(clock, &t1))
-      return;
+    clock_gettime(CLOCK_REALTIME, &t1);
     for (int i = 0; i < ITERATIONS; ++i) {
-      unassert(!clock_nanosleep(clock, 0, &wf, 0));
+      npassert(!clock_nanosleep(CLOCK_REALTIME, 0, &wf, 0));
     }
-    clock_gettime(clock, &t2);
+    clock_gettime(CLOCK_REALTIME, &t2);
+    long took = timespec_tonanos(timespec_sub(t2, t1)) / ITERATIONS;
+    printf("%,12ld ns sleep took %,12ld ns delta %,12ld ns\n", nanos, took,
+           took - nanos);
+  }
+}
+
+void TestSleepMonoRelative(void) {
+  printf("\n");
+  printf("testing: clock_nanosleep(CLOCK_MONOTONIC) with relative "
+         "timeout\n");
+  for (long nanos = 1; nanos < (long)MAXIMUM; nanos *= 2) {
+    struct timespec t1, t2, wf;
+    wf = timespec_fromnanos(nanos);
+    clock_gettime(CLOCK_REALTIME, &t1);
+    for (int i = 0; i < ITERATIONS; ++i) {
+      npassert(!clock_nanosleep(CLOCK_MONOTONIC, 0, &wf, 0));
+    }
+    clock_gettime(CLOCK_REALTIME, &t2);
     long took = timespec_tonanos(timespec_sub(t2, t1)) / ITERATIONS;
     printf("%,12ld ns sleep took %,12ld ns delta %,12ld ns\n", nanos, took,
            took - nanos);
@@ -67,9 +63,6 @@ void TestSleepRelative(int clock) {
 }
 
 int main(int argc, char *argv[]) {
-  TestSleepRelative(CLOCK_REALTIME);
-  TestSleepRelative(CLOCK_REALTIME_COARSE);
-  TestSleepRelative(CLOCK_MONOTONIC);
-  TestSleepRelative(CLOCK_BOOTTIME);
-  TestSleepRelative(CLOCK_MONOTONIC_COARSE);
+  TestSleepRealRelative();
+  TestSleepMonoRelative();
 }
diff --git a/tool/viz/derasterize.c b/tool/viz/derasterize.c
index abd982197..478ec1dc8 100644
--- a/tool/viz/derasterize.c
+++ b/tool/viz/derasterize.c
@@ -25,7 +25,7 @@
 #include "libc/limits.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/tool/viz/dumphexc.c b/tool/viz/dumphexc.c
index 095b7804d..d7f40a953 100644
--- a/tool/viz/dumphexc.c
+++ b/tool/viz/dumphexc.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/append.h"
 #include "libc/stdio/hex.internal.h"
diff --git a/tool/viz/fontspace.c b/tool/viz/fontspace.c
index 7e81629c8..875a73648 100644
--- a/tool/viz/fontspace.c
+++ b/tool/viz/fontspace.c
@@ -24,7 +24,7 @@
 #include "libc/intrin/bsr.h"
 #include "libc/log/libfatal.internal.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/runtime.h"
diff --git a/tool/viz/getglyph.c b/tool/viz/getglyph.c
index 4c2a3b5f3..4a7d30f68 100644
--- a/tool/viz/getglyph.c
+++ b/tool/viz/getglyph.c
@@ -20,7 +20,7 @@
 #include "libc/fmt/conv.h"
 #include "libc/limits.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/stdio/append.h"
diff --git a/tool/viz/getifaddrs.c b/tool/viz/getifaddrs.c
index 142e8005e..bd9b22de8 100644
--- a/tool/viz/getifaddrs.c
+++ b/tool/viz/getifaddrs.c
@@ -33,7 +33,7 @@
    eth0
    addr: 10.10.10.237
    netmask: 255.255.255.0
-   broadcast: 10.10.10.255
+   broadcast: 255.255.255.0
    flags: IFF_UP IFF_BROADCAST IFF_MULTICAST IFF_RUNNING
 
    lo
@@ -74,87 +74,13 @@ int main(int argc, char *argv[]) {
       tinyprint(1, "netmask: ", buf, "\n", NULL);
     }
     if ((ifa->ifa_flags & IFF_BROADCAST) &&
-        sockaddr2str(ifa->ifa_broadaddr, buf, sizeof(buf))) {
+        sockaddr2str(ifa->ifa_netmask, buf, sizeof(buf))) {
       tinyprint(1, "broadcast: ", buf, "\n", NULL);
     } else if ((ifa->ifa_flags & IFF_POINTOPOINT) &&
                sockaddr2str(ifa->ifa_dstaddr, buf, sizeof(buf))) {
       tinyprint(1, "dstaddr: ", buf, "\n", NULL);
     }
 
-    if (ifa->ifa_addr->sa_family == AF_INET6) {
-      int scope = ((int *)ifa->ifa_data)[0];
-      int aflags = ((int *)ifa->ifa_data)[1];
-      // #define IPV6_ADDR_LOOPBACK	0x0010U
-      // #define IPV6_ADDR_LINKLOCAL	0x0020U
-      // #define IPV6_ADDR_SITELOCAL	0x0040U
-
-      // #define IFA_F_TEMPORARY		0x01
-      // #define	IFA_F_NODAD		0x02
-      // #define IFA_F_OPTIMISTIC	0x04
-      // #define IFA_F_DADFAILED		0x08
-      // #define	IFA_F_HOMEADDRESS	0x10
-      // #define IFA_F_DEPRECATED	0x20
-      // #define IFA_F_TENTATIVE		0x40
-      // #define IFA_F_PERMANENT		0x80
-      // #define IFA_F_MANAGETEMPADDR	0x100
-      // #define IFA_F_NOPREFIXROUTE	0x200
-      // #define IFA_F_MCAUTOJOIN	0x400
-      // #define IFA_F_STABLE_PRIVACY	0x800
-      tinyprint(1, "scope:", NULL);
-      if (scope == 0x10) {
-        tinyprint(1, " loopback", NULL);
-      }
-      if (scope == 0x20) {
-        tinyprint(1, " linklocal", NULL);
-      }
-      if (scope == 0x40) {
-        tinyprint(1, " sitelocal", NULL);
-      }
-      if (scope == 0x00) {
-        tinyprint(1, " global", NULL);
-      }
-      tinyprint(1, "\n", NULL);
-
-      tinyprint(1, "addr flags:", NULL);
-      if (aflags & 0x01) {
-        tinyprint(1, " temporary", NULL);
-      }
-      if (aflags & 0x02) {
-        tinyprint(1, " nodad", NULL);
-      }
-      if (aflags & 0x04) {
-        tinyprint(1, " optimistic", NULL);
-      }
-      if (aflags & 0x08) {
-        tinyprint(1, " dadfailed", NULL);
-      }
-      if (aflags & 0x10) {
-        tinyprint(1, " homeaddress", NULL);
-      }
-      if (aflags & 0x20) {
-        tinyprint(1, " deprecated", NULL);
-      }
-      if (aflags & 0x40) {
-        tinyprint(1, " tentative", NULL);
-      }
-      if (aflags & 0x80) {
-        tinyprint(1, " permanent", NULL);
-      }
-      if (aflags & 0x100) {
-        tinyprint(1, " managetempaddr", NULL);
-      }
-      if (aflags & 0x200) {
-        tinyprint(1, " noprefixroute", NULL);
-      }
-      if (aflags & 0x400) {
-        tinyprint(1, " mcautojoin", NULL);
-      }
-      if (aflags & 0x800) {
-        tinyprint(1, " stable_privacy", NULL);
-      }
-      tinyprint(1, "\n", NULL);
-    }
-
     tinyprint(1, "flags:", NULL);
     if (ifa->ifa_flags & IFF_UP) {
       tinyprint(1, " IFF_UP", NULL);
diff --git a/tool/viz/lib/BUILD.mk b/tool/viz/lib/BUILD.mk
index e76dd3e01..92512372c 100644
--- a/tool/viz/lib/BUILD.mk
+++ b/tool/viz/lib/BUILD.mk
@@ -36,7 +36,6 @@ TOOL_VIZ_LIB_A_DIRECTDEPS =				\
 	LIBC_RUNTIME					\
 	LIBC_STDIO					\
 	LIBC_STR					\
-	LIBC_THREAD					\
 	LIBC_SYSV					\
 	LIBC_TESTLIB					\
 	LIBC_TINYMATH					\
diff --git a/tool/viz/lib/bilinearscale.c b/tool/viz/lib/bilinearscale.c
index 586e848bd..fd58723a9 100644
--- a/tool/viz/lib/bilinearscale.c
+++ b/tool/viz/lib/bilinearscale.c
@@ -21,7 +21,7 @@
 #include "libc/intrin/bsr.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/tool/viz/lib/dither.c b/tool/viz/lib/dither.c
index dcb43ea7d..d61a39049 100644
--- a/tool/viz/lib/dither.c
+++ b/tool/viz/lib/dither.c
@@ -20,7 +20,7 @@
 #include "libc/intrin/hilbert.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/tool/viz/lib/doublechrominance.S b/tool/viz/lib/doublechrominance.S
index 0db0eb343..b316fb9b1 100644
--- a/tool/viz/lib/doublechrominance.S
+++ b/tool/viz/lib/doublechrominance.S
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 
 //	Duplicates chrominance samples horizontally, e.g.
 //
diff --git a/tool/viz/lib/formatstringtable-testlib.h b/tool/viz/lib/formatstringtable-testlib.h
index 1b884965e..b4eb037c6 100644
--- a/tool/viz/lib/formatstringtable-testlib.h
+++ b/tool/viz/lib/formatstringtable-testlib.h
@@ -1,6 +1,6 @@
 #ifndef COSMOPOLITAN_TOOL_VIZ_LIB_FORMATSTRINGTABLE_TESTLIB_H_
 #define COSMOPOLITAN_TOOL_VIZ_LIB_FORMATSTRINGTABLE_TESTLIB_H_
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/str/str.h"
 #include "libc/testlib/testlib.h"
 #include "tool/viz/lib/formatstringtable.h"
diff --git a/tool/viz/lib/gaussian.c b/tool/viz/lib/gaussian.c
index c737b2240..fb1cd87a8 100644
--- a/tool/viz/lib/gaussian.c
+++ b/tool/viz/lib/gaussian.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
diff --git a/tool/viz/lib/getxtermcodes.c b/tool/viz/lib/getxtermcodes.c
index 0f00469e8..52908f3ab 100644
--- a/tool/viz/lib/getxtermcodes.c
+++ b/tool/viz/lib/getxtermcodes.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/tty/quant.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "tool/viz/lib/graphic.h"
 
 void getxtermcodes(struct TtyRgb *p, const struct Graphic *g) {
diff --git a/tool/viz/lib/perlin3.c b/tool/viz/lib/perlin3.c
index f9f81e9c9..138296e11 100644
--- a/tool/viz/lib/perlin3.c
+++ b/tool/viz/lib/perlin3.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "tool/viz/lib/graphic.h"
 
diff --git a/tool/viz/lib/sharpen.c b/tool/viz/lib/sharpen.c
index c79586537..eec139e4d 100644
--- a/tool/viz/lib/sharpen.c
+++ b/tool/viz/lib/sharpen.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/core/ks8.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
diff --git a/tool/viz/lib/sobel.c b/tool/viz/lib/sobel.c
index 1f83878ee..a6acf15b0 100644
--- a/tool/viz/lib/sobel.c
+++ b/tool/viz/lib/sobel.c
@@ -17,7 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/calls/calls.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/nexgen32e.h"
diff --git a/tool/viz/lib/stringbuilder.c b/tool/viz/lib/stringbuilder.c
index 85906bf8b..f9419a41e 100644
--- a/tool/viz/lib/stringbuilder.c
+++ b/tool/viz/lib/stringbuilder.c
@@ -18,7 +18,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "tool/viz/lib/stringbuilder.h"
 #include "libc/log/check.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "libc/x/x.h"
diff --git a/tool/viz/lib/unsharp.c b/tool/viz/lib/unsharp.c
index 4514f8265..8bd6f272c 100644
--- a/tool/viz/lib/unsharp.c
+++ b/tool/viz/lib/unsharp.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 #include "libc/sysv/errfuns.h"
diff --git a/tool/viz/lib/writetoframebuffer.c b/tool/viz/lib/writetoframebuffer.c
index 9a2b7a116..f46d95387 100644
--- a/tool/viz/lib/writetoframebuffer.c
+++ b/tool/viz/lib/writetoframebuffer.c
@@ -16,7 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "tool/viz/lib/graphic.h"
 
 void WriteToFrameBuffer(size_t dyn, size_t dxn, unsigned char dst[dyn][dxn][4],
diff --git a/tool/viz/lib/ycbcr2rgb3.c b/tool/viz/lib/ycbcr2rgb3.c
index b400b961b..96d7d52ff 100644
--- a/tool/viz/lib/ycbcr2rgb3.c
+++ b/tool/viz/lib/ycbcr2rgb3.c
@@ -30,9 +30,10 @@
 #include "libc/calls/struct/sigset.h"
 #include "libc/calls/struct/timespec.h"
 #include "libc/intrin/bsr.h"
+#include "libc/intrin/pmulhrsw.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
@@ -43,7 +44,6 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/sig.h"
 #include "libc/sysv/errfuns.h"
-#include "libc/thread/thread.h"
 #include "libc/time.h"
 #include "libc/x/x.h"
 #include "tool/viz/lib/graphic.h"
@@ -70,7 +70,6 @@ struct timespec magikarp_start_;
 
 struct YCbCr {
   bool yonly;
-  int cpu_count;
   int magnums[8][4];
   int lighting[6][4];
   unsigned char transfer[2][256];
@@ -167,7 +166,6 @@ void YCbCrInit(struct YCbCr **ycbcr, bool yonly, int swing, double gamma,
   if (!*ycbcr)
     *ycbcr = xcalloc(1, sizeof(struct YCbCr));
   (*ycbcr)->yonly = yonly;
-  (*ycbcr)->cpu_count = __get_cpu_count();
   bzero((*ycbcr)->magnums, sizeof((*ycbcr)->magnums));
   bzero((*ycbcr)->lighting, sizeof((*ycbcr)->lighting));
   YCbCrComputeCoefficients(swing, gamma, gamut, illuminant, (*ycbcr)->magnums,
@@ -266,32 +264,14 @@ void YCbCrConvert(struct YCbCr *me, long yn, long xn,
                   const unsigned char Y[restrict yys][yxs], long cys, long cxs,
                   unsigned char Cb[restrict cys][cxs],
                   unsigned char Cr[restrict cys][cxs]) {
-  struct timespec ts = timespec_mono();
+  struct timespec ts = timespec_real();
   if (!me->yonly) {
     YCbCr2Rgb(yn, xn, RGB, yys, yxs, Y, cys, cxs, Cb, Cr, me->magnums,
               me->lighting, me->transfer[pf10_]);
   } else {
     Y2Rgb(yn, xn, RGB, yys, yxs, Y, me->magnums, me->transfer[pf10_]);
   }
-  ycbcr2rgb_latency_ = timespec_tomicros(timespec_sub(timespec_mono(), ts));
-}
-
-struct YCbCr2RgbScalerThreadData {
-  long syw, sxw, dyw, dxw, dyn, dxn, syn, sxn;
-  unsigned char *src;
-  unsigned char *dst;
-  int min, max;
-  struct SamplingSolution *cy, *cx;
-  bool sharpen;
-};
-
-static void *YCbCr2RgbScalerThread(void *arg) {
-  struct YCbCr2RgbScalerThreadData *data =
-      (struct YCbCr2RgbScalerThreadData *)arg;
-  GyaradosUint8(data->syw, data->sxw, data->src, data->dyw, data->dxw,
-                data->dst, data->dyn, data->dxn, data->syn, data->sxn,
-                data->min, data->max, data->cy, data->cx, data->sharpen);
-  return NULL;
+  ycbcr2rgb_latency_ = timespec_tomicros(timespec_sub(timespec_real(), ts));
 }
 
 void YCbCr2RgbScaler(struct YCbCr *me, long dyn, long dxn,
@@ -318,7 +298,7 @@ void YCbCr2RgbScaler(struct YCbCr *me, long dyn, long dxn,
                     Magkern2xY(cys, cxs, Cr, scyn, scxn), HALF(yyn), yxn,
                     HALF(cyn), scxn, syn / 2, sxn, pry, prx);
   } else {
-    struct timespec ts = timespec_mono();
+    struct timespec ts = timespec_real();
     magikarp_latency_ = timespec_tomicros(timespec_sub(ts, magikarp_start_));
     yry = syn / dyn;
     yrx = sxn / dxn;
@@ -343,83 +323,13 @@ void YCbCr2RgbScaler(struct YCbCr *me, long dyn, long dxn,
       sharpen(1, yys, yxs, (void *)Y, yyn, yxn);
     if (pf9_)
       unsharp(1, yys, yxs, (void *)Y, yyn, yxn);
-
-    if (me->cpu_count < 6) {
-      GyaradosUint8(yys, yxs, Y, yys, yxs, Y, dyn, dxn, syn, sxn, 0, 255,
-                    me->luma.cy, me->luma.cx, true);
-      GyaradosUint8(cys, cxs, Cb, cys, cxs, Cb, dyn, dxn, scyn, scxn, 0, 255,
-                    me->chroma.cy, me->chroma.cx, false);
-      GyaradosUint8(cys, cxs, Cr, cys, cxs, Cr, dyn, dxn, scyn, scxn, 0, 255,
-                    me->chroma.cy, me->chroma.cx, false);
-    } else {
-      pthread_t threads[3];
-      struct YCbCr2RgbScalerThreadData thread_data[3];
-
-      // Set up thread data for Y plane.
-      thread_data[0] = (struct YCbCr2RgbScalerThreadData){
-          .syw = yys,
-          .sxw = yxs,
-          .dyw = yys,
-          .dxw = yxs,
-          .dyn = dyn,
-          .dxn = dxn,
-          .syn = syn,
-          .sxn = sxn,
-          .src = (unsigned char *)Y,
-          .dst = (unsigned char *)Y,
-          .min = 0,
-          .max = 255,
-          .cy = me->luma.cy,
-          .cx = me->luma.cx,
-          .sharpen = true,
-      };
-
-      // Set up thread data for Cb plane.
-      thread_data[1] = (struct YCbCr2RgbScalerThreadData){
-          .syw = cys,
-          .sxw = cxs,
-          .dyw = cys,
-          .dxw = cxs,
-          .dyn = dyn,
-          .dxn = dxn,
-          .syn = scyn,
-          .sxn = scxn,
-          .src = (unsigned char *)Cb,
-          .dst = (unsigned char *)Cb,
-          .min = 0,
-          .max = 255,
-          .cy = me->chroma.cy,
-          .cx = me->chroma.cx,
-          .sharpen = false,
-      };
-
-      // Set up thread data for Cr plane.
-      thread_data[2] = (struct YCbCr2RgbScalerThreadData){
-          .syw = cys,
-          .sxw = cxs,
-          .dyw = cys,
-          .dxw = cxs,
-          .dyn = dyn,
-          .dxn = dxn,
-          .syn = scyn,
-          .sxn = scxn,
-          .src = (unsigned char *)Cr,
-          .dst = (unsigned char *)Cr,
-          .min = 0,
-          .max = 255,
-          .cy = me->chroma.cy,
-          .cx = me->chroma.cx,
-          .sharpen = false,
-      };
-
-      // Dispatch threads.
-      for (int i = 0; i < 3; i++)
-        pthread_create(&threads[i], NULL, YCbCr2RgbScalerThread,
-                       &thread_data[i]);
-      for (int i = 3; i--;)
-        pthread_join(threads[i], NULL);
-    }
-    gyarados_latency_ = timespec_tomicros(timespec_sub(timespec_mono(), ts));
+    GyaradosUint8(yys, yxs, Y, yys, yxs, Y, dyn, dxn, syn, sxn, 0, 255,
+                  me->luma.cy, me->luma.cx, true);
+    GyaradosUint8(cys, cxs, Cb, cys, cxs, Cb, dyn, dxn, scyn, scxn, 0, 255,
+                  me->chroma.cy, me->chroma.cx, false);
+    GyaradosUint8(cys, cxs, Cr, cys, cxs, Cr, dyn, dxn, scyn, scxn, 0, 255,
+                  me->chroma.cy, me->chroma.cx, false);
+    gyarados_latency_ = timespec_tomicros(timespec_sub(timespec_real(), ts));
     YCbCrConvert(me, dyn, dxn, RGB, yys, yxs, Y, cys, cxs, Cb, Cr);
     INFOF("done");
   }
@@ -474,7 +384,7 @@ void *YCbCr2RgbScale(long dyn, long dxn,
   CHECK_LE(cyn, cys);
   CHECK_LE(cxn, cxs);
   INFOF("magikarp2x");
-  magikarp_start_ = timespec_mono();
+  magikarp_start_ = timespec_real();
   minyys = MAX(ceil(syn), MAX(yyn, ceil(dyn * pry)));
   minyxs = MAX(ceil(sxn), MAX(yxn, ceil(dxn * prx)));
   mincys = MAX(cyn, ceil(dyn * pry));
diff --git a/tool/viz/life.c b/tool/viz/life.c
index 3b105eb6b..8ca83fb8b 100644
--- a/tool/viz/life.c
+++ b/tool/viz/life.c
@@ -32,7 +32,7 @@
 #include "libc/limits.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/mem/mem.h"
 #include "libc/nexgen32e/nt2sysv.h"
 #include "libc/nt/comdlg.h"
@@ -1130,7 +1130,7 @@ static bool ShouldDraw(void) {
   static struct timespec next;
   if (!isdragging)
     return true;
-  now = timespec_mono();
+  now = timespec_real();
   if (timespec_cmp(now, next) > 0 && !HasPendingInput()) {
     next = timespec_add(now, timespec_frommicros(1. / 24 * 1e6));
     return true;
diff --git a/tool/viz/memzoom.c b/tool/viz/memzoom.c
index 6481562d3..e3f16a046 100644
--- a/tool/viz/memzoom.c
+++ b/tool/viz/memzoom.c
@@ -36,15 +36,14 @@
 #include "libc/intrin/safemacros.h"
 #include "libc/limits.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/pollfd.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 #include "libc/str/unicode.h"
-#include "libc/sysv/consts/clock.h"
 #include "libc/sysv/consts/ex.h"
 #include "libc/sysv/consts/exit.h"
 #include "libc/sysv/consts/fileno.h"
@@ -335,11 +334,10 @@ static long Index(long y, long x) {
 static void PreventBufferbloat(void) {
   struct timespec now, rate;
   static struct timespec last;
-  now = timespec_mono();
+  now = timespec_real();
   rate = timespec_frommicros(1. / fps * 1e6);
   if (timespec_cmp(timespec_sub(now, last), rate) < 0) {
-    timespec_sleep(CLOCK_MONOTONIC,
-                   timespec_sub(rate, timespec_sub(now, last)));
+    timespec_sleep(timespec_sub(rate, timespec_sub(now, last)));
   }
   last = now;
 }
diff --git a/tool/viz/printansi.c b/tool/viz/printansi.c
index 900dd8b3a..849242540 100644
--- a/tool/viz/printansi.c
+++ b/tool/viz/printansi.c
@@ -33,7 +33,7 @@
 #include "libc/limits.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
diff --git a/tool/viz/printvideo.c b/tool/viz/printvideo.c
index cd35f672c..04c7da6b6 100644
--- a/tool/viz/printvideo.c
+++ b/tool/viz/printvideo.c
@@ -16,11 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
-#include "dsp/audio/cosmoaudio/cosmoaudio.h"
 #include "dsp/core/core.h"
 #include "dsp/core/half.h"
 #include "dsp/core/illumination.h"
-#include "dsp/mpeg/pl_mpeg.h"
+#include "dsp/mpeg/mpeg.h"
 #include "dsp/scale/scale.h"
 #include "dsp/tty/quant.h"
 #include "dsp/tty/tty.h"
@@ -40,7 +39,6 @@
 #include "libc/calls/ucontext.h"
 #include "libc/ctype.h"
 #include "libc/cxxabi.h"
-#include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/fmt/itoa.h"
@@ -49,7 +47,7 @@
 #include "libc/intrin/xchg.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/math.h"
 #include "libc/mem/alg.h"
 #include "libc/mem/arraylist.internal.h"
@@ -57,9 +55,7 @@
 #include "libc/nexgen32e/bench.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/nt/console.h"
-#include "libc/nt/enum/threadpriority.h"
 #include "libc/nt/runtime.h"
-#include "libc/nt/thread.h"
 #include "libc/runtime/runtime.h"
 #include "libc/sock/sock.h"
 #include "libc/sock/struct/pollfd.h"
@@ -146,6 +142,8 @@ Effects Shortcuts:\n\
   CTRL-G     {Unsharp,Sharp}\n\
 \n\
 Environment Variables:\n\
+  SOX        overrides location of SoX executable\n\
+  FFPLAY     overrides location of FFmpeg ffplay executable\n\
   ROWS=𝑦     sets height [inarticulate mode]\n\
   COLUMNS=𝑥  sets width  [inarticulate mode]\n\
   TERM=dumb  inarticulate mode\n\
@@ -160,6 +158,11 @@ in a different format, then it's fast and easy to convert them:\n\
 The terminal fonts we recommend are PragmataPro, Bitstream Vera Sans\n\
 Mono (known as DejaVu Sans Mono in the open source community), Menlo,\n\
 and Lucida Console.\n\
+\n\
+On Linux, playing audio requires either `sox` or `ffplay` being on\n\
+the $PATH. Kitty is the fastest terminal. Alacritty also has a fast\n\
+display. GNOME Terminal and xterm both work well in 256-color or ANSI\n\
+mode.\n\
 \n"
 
 #define CTRL(C)   ((C) ^ 0100)
@@ -175,9 +178,9 @@ and Lucida Console.\n\
 
 #define TIMEIT(OUT_NANOS, FORM)                                           \
   do {                                                                    \
-    struct timespec Start = timespec_mono();                              \
+    struct timespec Start = timespec_real();                              \
     FORM;                                                                 \
-    (OUT_NANOS) = timespec_tonanos(timespec_sub(timespec_mono(), Start)); \
+    (OUT_NANOS) = timespec_tonanos(timespec_sub(timespec_real(), Start)); \
   } while (0)
 
 typedef bool (*openspeaker_f)(void);
@@ -223,6 +226,16 @@ struct FrameBuffer {
   struct FrameBufferVirtualScreenInfo vscreen;
 };
 
+static const struct itimerval kTimerDisarm = {
+    {0, 0},
+    {0, 0},
+};
+
+static const struct itimerval kTimerHalfSecondSingleShot = {
+    {0, 0},
+    {0, 500000},
+};
+
 static const struct NamedVector kPrimaries[] = {
     {"BT.601", &kBt601Primaries},
     {"BT.709", &kBt709Primaries},
@@ -242,8 +255,7 @@ static const struct NamedVector kLightings[] = {
 
 static plm_t *plm_;
 static float gamma_;
-static float volscale_;
-struct CosmoAudio *ca_;
+static int volscale_;
 static enum Blur blur_;
 static enum Sharp sharp_;
 static jmp_buf jb_, jbi_;
@@ -251,27 +263,32 @@ static double pary_, parx_;
 static struct TtyIdent ti_;
 static struct YCbCr *ycbcr_;
 static bool emboss_, sobel_;
-static const char *patharg_;
+static volatile int playpid_;
 static struct winsize wsize_;
 static float hue_, sat_, lit_;
-static volatile bool resized_;
 static void *xtcodes_, *audio_;
 static struct FrameBuffer fb0_;
 static unsigned chans_, srate_;
 static volatile bool ignoresigs_;
 static size_t dh_, dw_, framecount_;
 static struct FrameCountRing fcring_;
+static volatile bool resized_, piped_;
 static int lumakernel_, chromakernel_;
+static openspeaker_f tryspeakerfns_[4];
 static int primaries_, lighting_, swing_;
 static uint64_t t1, t2, t3, t4, t5, t6, t8;
-static int homerow_, lastrow_, infd_, outfd_;
+static const char *sox_, *ffplay_, *patharg_;
 static struct VtFrame vtframe_[2], *f1_, *f2_;
 static struct Graphic graphic_[2], *g1_, *g2_;
 static struct timespec deadline_, dura_, starttime_;
 static bool yes_, stats_, dither_, ttymode_, istango_;
 static struct timespec decode_start_, f1_start_, f2_start_;
+static int16_t pcm_[PLM_AUDIO_SAMPLES_PER_FRAME * 2 / 8][8];
+static int16_t pcmscale_[PLM_AUDIO_SAMPLES_PER_FRAME * 2 / 8][8];
 static bool fullclear_, historyclear_, tuned_, yonly_, gotvideo_;
-static char status_[7][200], logpath_[PATH_MAX], chansstr_[32], sratestr_[32];
+static int homerow_, lastrow_, playfd_, infd_, outfd_, speakerfails_;
+static char status_[7][200], logpath_[PATH_MAX], fifopath_[PATH_MAX],
+    chansstr_[32], sratestr_[32];
 
 static void OnCtrlC(void) {
   longjmp(jb_, 1);
@@ -281,15 +298,29 @@ static void OnResize(void) {
   resized_ = true;
 }
 
+static void OnSigPipe(void) {
+  piped_ = true;
+}
+
+static void OnSigChld(void) {
+  playpid_ = 0, piped_ = true;
+}
+
+static void StrikeDownCrapware(int sig) {
+  kill(playpid_, SIGKILL);
+}
+
 static struct timespec GetGraceTime(void) {
-  return timespec_sub(deadline_, timespec_mono());
+  return timespec_sub(deadline_, timespec_real());
 }
 
 static char *strntoupper(char *s, size_t n) {
   size_t i;
-  for (i = 0; s[i] && i < n; ++i)
-    if ('a' <= s[i] && s[i] <= 'z')
+  for (i = 0; s[i] && i < n; ++i) {
+    if ('a' <= s[i] && s[i] <= 'z') {
       s[i] -= 'a' - 'A';
+    }
+  }
   return s;
 }
 
@@ -302,9 +333,11 @@ static int GetNamedVector(const struct NamedVector *choices, size_t n,
   strncpy(name, s, sizeof(name));
 #pragma GCC pop_options
   strntoupper(name, sizeof(name));
-  for (i = 0; i < n; ++i)
-    if (memcmp(choices[i].name, name, sizeof(name)) == 0)
+  for (i = 0; i < n; ++i) {
+    if (memcmp(choices[i].name, name, sizeof(name)) == 0) {
       return i;
+    }
+  }
   return -1;
 }
 
@@ -316,9 +349,32 @@ static int GetLighting(const char *s) {
   return GetNamedVector(kLightings, ARRAYLEN(kLightings), s);
 }
 
-static void CloseSpeaker(void) {
-  cosmoaudio_close(ca_);
-  ca_ = 0;
+static bool CloseSpeaker(void) {
+  int rc, wstatus;
+  rc = 0;
+  pthread_yield();
+  if (playfd_) {
+    rc |= close(playfd_);
+    playfd_ = -1;
+  }
+  if (playpid_) {
+    kill(playpid_, SIGTERM);
+    xsigaction(SIGALRM, StrikeDownCrapware, SA_RESETHAND, 0, 0);
+    setitimer(ITIMER_REAL, &kTimerHalfSecondSingleShot, NULL);
+    while (playpid_) {
+      if (waitpid(playpid_, &wstatus, 0) != -1) {
+        rc |= WEXITSTATUS(wstatus);
+      } else if (errno == EINTR) {
+        continue;
+      } else {
+        rc = -1;
+      }
+      break;
+    }
+    playpid_ = 0;
+    setitimer(ITIMER_REAL, &kTimerDisarm, NULL);
+  }
+  return !!rc;
 }
 
 static void ResizeVtFrame(struct VtFrame *f, size_t yn, size_t xn) {
@@ -332,7 +388,7 @@ static float timespec_tofloat(struct timespec ts) {
 
 static void RecordFactThatFrameWasFullyRendered(void) {
   fcring_.p[fcring_.i] =
-      timespec_tofloat(timespec_sub(timespec_mono(), starttime_));
+      timespec_tofloat(timespec_sub(timespec_real(), starttime_));
   fcring_.n += 1;
   fcring_.i += 1;
   fcring_.i &= ARRAYLEN(fcring_.p) - 1;
@@ -382,8 +438,9 @@ static void DimensionDisplay(void) {
       wsize_.ws_row = 25;
       wsize_.ws_col = 80;
       wsize_ = (struct winsize){.ws_row = 40, .ws_col = 80};
-      if (tcgetwinsize(outfd_, &wsize_) == -1)
+      if (tcgetwinsize(outfd_, &wsize_) == -1) {
         tcgetwinsize(0, &wsize_);
+      }
       dh_ = wsize_.ws_row * 2;
       dw_ = wsize_.ws_col * 2;
     }
@@ -407,37 +464,124 @@ static void DimensionDisplay(void) {
     ResizeVtFrame(&vtframe_[1], (g2_->yn), g2_->xn);
     f1_ = &vtframe_[0];
     f2_ = &vtframe_[1];
-    if (ttymode_)
+    if (ttymode_) {
       homerow_ = MIN(wsize_.ws_row - HALF(g2_->yn),
                      HALF(wsize_.ws_row - HALF(g2_->yn)));
+    }
     lastrow_ = homerow_ + HALF(g2_->yn);
     ComputeColoringSolution();
   } while (resized_);
 }
 
+static int WriteAudio(int fd, const void *data, size_t size, int deadlinems) {
+  ssize_t rc;
+  const char *p;
+  size_t wrote, n;
+  p = data;
+  n = size;
+  do {
+  TryAgain:
+    if ((rc = write(fd, p, n)) != -1) {
+      wrote = rc;
+      p += wrote;
+      n -= wrote;
+    } else if (errno == EINTR) {
+      goto TryAgain;
+    } else if (errno == EAGAIN) {
+      if (poll((struct pollfd[]){{fd, POLLOUT}}, 1, deadlinems) == 0) {
+        return etimedout();
+      }
+    } else {
+      return -1;
+    }
+  } while (n);
+  return 0;
+}
+
+static bool TrySpeaker(const char *prog, char *const *args) {
+  int pipefds[2];
+  CHECK_NE(-1, pipe2(pipefds, O_CLOEXEC));
+  if (!(playpid_ = fork())) {
+    dup2(pipefds[0], 0);
+    dup2(fileno(__log_file), 1);
+    dup2(fileno(__log_file), 2);
+    close(fileno(__log_file));
+    execv(prog, args);
+    abort();
+  }
+  playfd_ = pipefds[1];
+  return true;
+}
+
+static bool TrySox(void) {
+  return TrySpeaker(sox_, ARGZ("play", "-q", "-c", chansstr_, "-traw",
+                               "-esigned", "-b16", "-r", sratestr_, "-"));
+}
+
+static bool TryFfplay(void) {
+  return TrySpeaker(ffplay_, ARGZ("ffplay", "-nodisp", "-loglevel", "quiet",
+                                  "-fflags", "nobuffer", "-ac", chansstr_,
+                                  "-ar", sratestr_, "-f", "s16le", "pipe:"));
+}
+
 static bool OpenSpeaker(void) {
-  struct CosmoAudioOpenOptions cao = {};
-  cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
-  cao.deviceType = kCosmoAudioDeviceTypePlayback;
-  cao.sampleRate = srate_;
-  cao.channels = chans_;
-  return cosmoaudio_open(&ca_, &cao) == COSMOAUDIO_SUCCESS;
+  size_t i;
+  static bool once, count;
+  if (!once) {
+    once = true;
+    i = 0;
+    if (ffplay_)
+      tryspeakerfns_[i++] = TryFfplay;
+    if (sox_)
+      tryspeakerfns_[i++] = TrySox;
+  }
+  snprintf(fifopath_, sizeof(fifopath_), "%s%s.%d.%d.wav", __get_tmpdir(),
+           firstnonnull(program_invocation_short_name, "unknown"), getpid(),
+           count);
+  for (i = 0; i < ARRAYLEN(tryspeakerfns_); ++i) {
+    if (tryspeakerfns_[i]) {
+      if (++speakerfails_ <= 2 && tryspeakerfns_[i]()) {
+        return true;
+      } else {
+        speakerfails_ = 0;
+        tryspeakerfns_[i] = NULL;
+      }
+    }
+  }
+  return false;
 }
 
 static void OnAudio(plm_t *mpeg, plm_samples_t *samples, void *user) {
-  if (!ca_)
-    return;
-  if (volscale_ != 1.f)
-    for (unsigned i = 0; i < samples->count * chans_; ++i)
-      samples->interleaved[i] *= volscale_;
-  cosmoaudio_write(ca_, samples->interleaved, samples->count);
+  if (playfd_ != -1) {
+    DEBUGF("OnAudio() [grace=%,ldns]", timespec_tonanos(GetGraceTime()));
+    CHECK_EQ(2, chans_);
+    CHECK_EQ(ARRAYLEN(pcm_) * 8, samples->count * chans_);
+    float2short(ARRAYLEN(pcm_), pcm_, (void *)samples->interleaved);
+    scalevolume(ARRAYLEN(pcm_), pcm_, volscale_);
+    sad16x8n(ARRAYLEN(pcm_), pcm_, pcmscale_);
+    DEBUGF("transcoded audio");
+  TryAgain:
+    if (WriteAudio(playfd_, pcm_, sizeof(pcm_), 1000) != -1) {
+      DEBUGF("WriteAudio(%d, %zu) ok [grace=%,ldns]", playfd_,
+             samples->count * 2, timespec_tonanos(GetGraceTime()));
+    } else {
+      WARNF("WriteAudio(%d, %zu) failed: %s", playfd_, samples->count * 2,
+            strerror(errno));
+      CloseSpeaker();
+      if (OpenSpeaker()) {
+        goto TryAgain;
+      }
+    }
+  }
 }
 
 static void DescribeAlgorithms(char *p) {
-  if (dither_ && TTYQUANT()->alg != kTtyQuantTrue)
+  if (dither_ && TTYQUANT()->alg != kTtyQuantTrue) {
     p = stpcpy(p, " ℍithered");
-  if (yonly_)
+  }
+  if (yonly_) {
     p = stpcpy(p, " grayscaled");
+  }
   p += sprintf(p, " magikarp:%d:%d", lumakernel_, chromakernel_);
   switch (TTYQUANT()->alg) {
     case kTtyQuantTrue:
@@ -699,8 +843,9 @@ static void TranscodeVideo(plm_frame_t *pf) {
       default:
         break;
     }
-    if (dither_ && TTYQUANT()->alg != kTtyQuantTrue)
+    if (dither_ && TTYQUANT()->alg != kTtyQuantTrue) {
       dither(g2_->yn, g2_->xn, g2_->b, g2_->yn, g2_->xn);
+    }
   });
 
   if (ShouldUseFrameBuffer()) {
@@ -740,6 +885,7 @@ static void OnVideo(plm_t *mpeg, plm_frame_t *pf, void *user) {
 
 static void OpenVideo(void) {
   size_t yn, xn;
+  playfd_ = -1;
   INFOF("%s(%`'s)", "OpenVideo", patharg_);
   CHECK_NOTNULL((plm_ = plm_create_with_filename(patharg_)));
   swing_ = 219;
@@ -754,9 +900,9 @@ static void OpenVideo(void) {
   FormatInt64(chansstr_, (chans_ = 2));
   FormatInt64(sratestr_, (srate_ = plm_get_samplerate(plm_)));
   if (plm_get_num_audio_streams(plm_) && OpenSpeaker()) {
-    plm_set_audio_enabled(plm_, true);
+    plm_set_audio_enabled(plm_, true, 0);
   } else {
-    plm_set_audio_enabled(plm_, false);
+    plm_set_audio_enabled(plm_, false, 0);
   }
   g2_ = g1_ = resizegraphic(&graphic_[0], yn, xn);
 }
@@ -767,12 +913,13 @@ static ssize_t WriteVideoCall(void) {
   amt = min(4096 * 4, f1_->n - f1_->i);
   if ((rc = write(outfd_, f1_->bytes + f1_->i, amt)) != -1) {
     if ((f1_->i += rc) == f1_->n) {
-      if (plm_get_audio_enabled(plm_))
+      if (plm_get_audio_enabled(plm_)) {
         plm_set_audio_lead_time(
             plm_,
             max(0,
-                min(timespec_tofloat(timespec_sub(timespec_mono(), f1_start_)),
+                min(timespec_tofloat(timespec_sub(timespec_real(), f1_start_)),
                     plm_get_samplerate(plm_) / PLM_AUDIO_SAMPLES_PER_FRAME)));
+      }
       f1_start_ = f2_start_;
       f1_->i = f1_->n = 0;
       struct VtFrame *t = f1_;
@@ -788,8 +935,9 @@ static void DrainVideo(void) {
     ttywrite(outfd_, f1_->bytes + f1_->i, f1_->n - f1_->i);
     f1_->i = f1_->n = 0;
   }
-  if (f2_ && f2_->n)
+  if (f2_ && f2_->n) {
     f2_->i = f2_->n = 0;
+  }
 }
 
 static void WriteVideo(void) {
@@ -942,10 +1090,10 @@ static optimizesize void ReadKeyboard(void) {
               case '[':
                 switch (b[i++]) {
                   case 'A': /* "\e[A" is up arrow */
-                    volscale_ *= 1.05f;
+                    ++volscale_;
                     break;
                   case 'B': /* "\e[B" is down arrow */
-                    volscale_ *= 0.95f;
+                    --volscale_;
                     break;
                   case 'C': /* "\e[C" is right arrow */
                     break;
@@ -1166,9 +1314,9 @@ static void PerformBestEffortIo(void) {
     DEBUGF("poll() toto=%d [grace=%,ldns]", toto,
            timespec_tonanos(GetGraceTime()));
     if (toto) {
-      if (fds[0].revents & (POLLIN | POLLHUP | POLLERR))
+      if (fds[0].revents & (POLLIN | POLLERR))
         ReadKeyboard();
-      if (fds[1].revents & (POLLOUT | POLLHUP | POLLERR))
+      if (fds[1].revents & (POLLOUT | POLLERR))
         WriteVideo();
     }
   } else if (errno == EINTR) {
@@ -1188,8 +1336,14 @@ static void RestoreTty(void) {
 }
 
 static void HandleSignals(void) {
-  if (resized_)
+  if (piped_) {
+    WARNF("SIGPIPE");
+    CloseSpeaker();
+    piped_ = false;
+  }
+  if (resized_) {
     RefreshDisplay();
+  }
 }
 
 static void PrintVideo(void) {
@@ -1197,19 +1351,20 @@ static void PrintVideo(void) {
   dura_ = timespec_frommicros(min(MAX_FRAMERATE, 1 / plm_get_framerate(plm_)) *
                               1e6);
   INFOF("framerate=%f dura=%f", plm_get_framerate(plm_), dura_);
-  next_tick = deadline_ = decode_last = timespec_mono();
+  next_tick = deadline_ = decode_last = timespec_real();
   next_tick = timespec_add(next_tick, dura_);
   deadline_ = timespec_add(deadline_, dura_);
   do {
     DEBUGF("plm_decode [grace=%,ldns]", timespec_tonanos(GetGraceTime()));
-    decode_start_ = timespec_mono();
+    decode_start_ = timespec_real();
     plm_decode(plm_,
                timespec_tofloat(timespec_sub(decode_start_, decode_last)));
     decode_last = decode_start_;
-    decode_end = timespec_mono();
+    decode_end = timespec_real();
     lag = timespec_sub(decode_end, decode_start_);
-    while (timespec_cmp(timespec_add(decode_end, lag), next_tick) > 0)
+    while (timespec_cmp(timespec_add(decode_end, lag), next_tick) > 0) {
       next_tick = timespec_add(next_tick, dura_);
+    }
     deadline_ = timespec_sub(next_tick, lag);
     if (gotvideo_ || !plm_get_video_enabled(plm_)) {
       gotvideo_ = false;
@@ -1217,8 +1372,9 @@ static void PrintVideo(void) {
             timespec_tonanos(lag), timespec_tonanos(GetGraceTime()));
     }
     do {
-      if (!setjmp(jbi_))
+      if (!setjmp(jbi_)) {
         PerformBestEffortIo();
+      }
       HandleSignals();
     } while (timespec_tomillis(GetGraceTime()) > 0);
   } while (plm_ && !plm_has_ended(plm_));
@@ -1237,6 +1393,17 @@ static bool AskUserYesOrNoQuestion(const char *prompt) {
   return c == 'y' || c == 'Y';
 }
 
+static bool CanPlayAudio(void) {
+  if (ffplay_ || sox_) {
+    return true;
+  } else if (AskUserYesOrNoQuestion(
+                 "ffplay not found; continue without audio?")) {
+    return false;
+  } else {
+    longjmp(jb_, 1);
+  }
+}
+
 static void PrintUsage(int rc, int fd) {
   tinyprint(fd, "Usage: ", program_invocation_name, USAGE, NULL);
   exit(rc);
@@ -1275,6 +1442,8 @@ static void GetOpts(int argc, char *argv[]) {
 }
 
 static void OnExit(void) {
+  if (playpid_)
+    kill(playpid_, SIGTERM), sched_yield();
   if (plm_)
     plm_destroy(plm_), plm_ = NULL;
   YCbCrFree(&ycbcr_);
@@ -1291,6 +1460,11 @@ static void OnExit(void) {
   CloseSpeaker();
 }
 
+static void MakeLatencyLittleLessBad(void) {
+  LOGIFNEG1(sys_mlockall(MCL_CURRENT));
+  LOGIFNEG1(nice(-5));
+}
+
 static void PickDefaults(void) {
   /*
    * Direct color ain't true color -- it just means xterm does the
@@ -1299,8 +1473,16 @@ static void PickDefaults(void) {
    *
    * strcmp(nulltoempty(getenv("TERM")), "xterm-direct") == 0
    */
-  if (IsWindows() || !strcmp(nulltoempty(getenv("TERM")), "xterm-kitty"))
+  if (strcmp(nulltoempty(getenv("TERM")), "xterm-kitty") == 0) {
     ttyquantsetup(kTtyQuantTrue, TTYQUANT()->chans, kTtyBlocksUnicode);
+  }
+}
+
+static void RenounceSpecialPrivileges(void) {
+  if (issetugid()) {
+    setegid(getgid());
+    seteuid(getuid());
+  }
 }
 
 #define FBIOGET_VSCREENINFO 0x4600
@@ -1400,13 +1582,10 @@ static void TryToOpenFrameBuffer(void) {
 
 int main(int argc, char *argv[]) {
   sigset_t wut;
+  const char *s;
   ShowCrashReports();
-#ifdef __x86_64__
-  if (IsWindows())
-    SetThreadPriority(GetCurrentThread(), kNtThreadPriorityHighest);
-#endif
   gamma_ = 2.4;
-  volscale_ = 1.f;
+  volscale_ -= 2;
   dither_ = true;
   sigemptyset(&wut);
   sigaddset(&wut, SIGCHLD);
@@ -1420,6 +1599,17 @@ int main(int argc, char *argv[]) {
   if (optind == argc)
     PrintUsage(EX_USAGE, STDERR_FILENO);
   patharg_ = argv[optind];
+  s = commandvenv("SOX", "sox");
+  sox_ = s ? strdup(s) : 0;
+  s = commandvenv("FFPLAY", "ffplay");
+  ffplay_ = s ? strdup(s) : 0;
+  if (!sox_ && !ffplay_) {
+    fprintf(stderr, "please install either the "
+                    "`play` (sox) or "
+                    "`ffplay` (ffmpeg) "
+                    "commands, so printvideo can play audio\n");
+    usleep(10000);
+  }
   infd_ = STDIN_FILENO;
   outfd_ = STDOUT_FILENO;
   if (!setjmp(jb_)) {
@@ -1427,22 +1617,27 @@ int main(int argc, char *argv[]) {
     xsigaction(SIGHUP, OnCtrlC, 0, 0, NULL);
     xsigaction(SIGTERM, OnCtrlC, 0, 0, NULL);
     xsigaction(SIGWINCH, OnResize, 0, 0, NULL);
+    xsigaction(SIGCHLD, OnSigChld, 0, 0, NULL);
+    xsigaction(SIGPIPE, OnSigPipe, 0, 0, NULL);
     if (ttyraw(kTtyLfToCrLf) != -1)
       ttymode_ = true;
     __cxa_atexit((void *)OnExit, NULL, NULL);
     __log_file = fopen(logpath_, "a");
     if (ischardev(infd_) && ischardev(outfd_)) {
-      /* CHECK_NE(-1, fcntl(outfd_, F_SETFL, O_NONBLOCK)); */
+      /* CHECK_NE(-1, fcntl(infd_, F_SETFL, O_NONBLOCK)); */
     } else if (infd_ != outfd_) {
       infd_ = -1;
     }
     /* CHECK_NE(-1, fcntl(outfd_, F_SETFL, O_NONBLOCK)); */
+    if (CanPlayAudio())
+      MakeLatencyLittleLessBad();
     TryToOpenFrameBuffer();
+    RenounceSpecialPrivileges();
     if (t2 > t1)
       longjmp(jb_, 1);
     OpenVideo();
     DimensionDisplay();
-    starttime_ = timespec_mono();
+    starttime_ = timespec_real();
     PrintVideo();
   }
   INFOF("jb_ triggered");
diff --git a/tool/viz/rlimit.c b/tool/viz/rlimit.c
index 6923626ec..e4b4e4ae9 100644
--- a/tool/viz/rlimit.c
+++ b/tool/viz/rlimit.c
@@ -13,7 +13,7 @@
 #include "libc/intrin/describeflags.h"
 #include "libc/intrin/strace.h"
 #include "libc/log/color.internal.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
@@ -43,7 +43,7 @@ static void SetLimit(int resource, uint64_t soft, uint64_t hard) {
         return;
       }
     }
-    fprintf(stderr, "ERROR: SETRLIMIT(%s, %,ld, %,ld) FAILED %m\n",
+    fprintf(stderr, "ERROR: SETRLIMIT(%s, %,ld, %,ld) FAILED %m%n",
             DescribeRlimitName(resource), soft, hard);
     exit(1);
   }
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]) {
   for (i = 0; i < RLIM_NLIMITS; ++i) {
     rc = getrlimit(i, &rlim);
     printf("SETRLIMIT(%-20s, %,16ld, %,16ld) → %d %s\n",
-           _DescribeRlimitName(rlnbuf, i), rlim.rlim_cur, rlim.rlim_max, rc,
+           (DescribeRlimitName)(rlnbuf, i), rlim.rlim_cur, rlim.rlim_max, rc,
            !rc ? "" : strerror(errno));
   }
 
diff --git a/tool/viz/tailf.c b/tool/viz/tailf.c
index b2f7bb5d3..ed420d23e 100644
--- a/tool/viz/tailf.c
+++ b/tool/viz/tailf.c
@@ -19,7 +19,7 @@
 #include "libc/calls/calls.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/intrin/safemacros.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
diff --git a/tool/viz/unbing.c b/tool/viz/unbing.c
index cbf6de954..fc5074eb4 100644
--- a/tool/viz/unbing.c
+++ b/tool/viz/unbing.c
@@ -19,7 +19,7 @@
 #include "libc/calls/calls.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
-#include "libc/str/tab.h"
+#include "libc/str/tab.internal.h"
 
 /**
  * @fileoverview UnBing: Glyphs → Binary.
diff --git a/tool/viz/vdsodump.c b/tool/viz/vdsodump.c
deleted file mode 100644
index 22174a323..000000000
--- a/tool/viz/vdsodump.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/intrin/getauxval.h"
-#include "libc/runtime/runtime.h"
-#include "libc/sysv/consts/auxv.h"
-
-int main(int argc, char *argv[]) {
-  struct AuxiliaryValue av;
-  av = __getauxval(AT_SYSINFO_EHDR);
-  if (!av.isfound)
-    return 2;
-  int fd = creat("vdso.so", 0644);
-  if (fd == -1)
-    return 3;
-  int i;
-  for (i = 0;; i += getpagesize())
-    if (write(fd, (char *)av.value + i, getpagesize()) == -1)
-      break;
-  if (!i)
-    return 4;
-  if (close(fd))
-    return 5;
-}
diff --git a/tool/viz/virtualquery.c b/tool/viz/virtualquery.c
index 92558fa60..e1ce16cc1 100644
--- a/tool/viz/virtualquery.c
+++ b/tool/viz/virtualquery.c
@@ -20,7 +20,7 @@
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/intrin/describeflags.h"
-#include "libc/macros.h"
+#include "libc/macros.internal.h"
 #include "libc/nt/enum/memflags.h"
 #include "libc/nt/memory.h"
 #include "libc/nt/struct/memorybasicinformation.h"
@@ -40,8 +40,8 @@ static const struct DescribeFlags kNtMemState[] = {
 };
 
 const char *DescribeNtMemState(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kNtMemState, ARRAYLEN(kNtMemState), "kNtMem",
-                        x);
+  return DescribeFlags(buf, 64, kNtMemState, ARRAYLEN(kNtMemState), "kNtMem",
+                       x);
 }
 
 static const struct DescribeFlags kNtMemType[] = {
@@ -51,7 +51,7 @@ static const struct DescribeFlags kNtMemType[] = {
 };
 
 const char *DescribeNtMemType(char buf[64], uint32_t x) {
-  return _DescribeFlags(buf, 64, kNtMemType, ARRAYLEN(kNtMemType), "kNtMem", x);
+  return DescribeFlags(buf, 64, kNtMemType, ARRAYLEN(kNtMemType), "kNtMem", x);
 }
 
 int main(int argc, char *argv[]) {
@@ -72,8 +72,8 @@ int main(int argc, char *argv[]) {
     printf("%.12lx %.12lx %10s %16s %16s %32s %32s\n", mi.AllocationBase,
            mi.BaseAddress, b[0], DescribeNtMemState(b[1], mi.State),
            DescribeNtMemType(b[2], mi.Type),
-           _DescribeNtPageFlags(b[3], mi.AllocationProtect),
-           _DescribeNtPageFlags(b[4], mi.Protect));
+           (DescribeNtPageFlags)(b[3], mi.AllocationProtect),
+           (DescribeNtPageFlags)(b[4], mi.Protect));
   }
 }
 
diff --git a/tool/zsh/mkofs b/tool/zsh/mkofs
index 9e3c8146c..8018d493a 100644
--- a/tool/zsh/mkofs
+++ b/tool/zsh/mkofs
@@ -17,12 +17,7 @@ cut -d' ' -f2 /proc/mounts | while read -r line; do
     return 0
   fi
 done
-if whence doas >/dev/null; then
-  doas=doas
-else
-  doas=sudo
-fi
 ( set -x
-  $doas mount -t tmpfs -o size=10G,noatime,nodiratime /dev/shm "$o"
+  sudo mount -t tmpfs -o size=10G,noatime,nodiratime /dev/shm "$o"
 )
 # vim:ft=zsh
diff --git a/tool/zsh/mmake b/tool/zsh/mmake
index 5efe8cdad..0b5315bae 100644
--- a/tool/zsh/mmake
+++ b/tool/zsh/mmake
@@ -38,21 +38,8 @@ done
   whence nproc >/dev/null || autoload -Uz nproc
   j=-j$(nproc)
 }
-local make=$(
-  case $MAKE in
-    */*)            echo $MAKE                        ;;
-    ?*)             command -v $MAKE                  ;;
-    *)              echo .cosmocc/current/bin/make    ;;
-  esac
-)
-if [[ ! -x $make ]]; then
-  { echo 'please install a suitable make, for example:'
-    echo
-    echo 'https://cosmo.zip/pub/cosmos/bin/make'
-    echo
-    echo 'then put it on $PATH or point $MAKE to it.'
-  } >&2; return 1
-fi
+local make=${MAKE:-${COSMOCC:-/opt/cosmocc/current}/bin/make}
+[[ -x $make ]] || make=${COSMO:-$PWD}/build/bootstrap/make
 ( set -x
   exec $make $j $flags MODE=$mode $targs )
 # vim:ft=zsh
diff --git a/usr/share/ssl/root/usertrust.pem b/usr/share/ssl/root/usertrust.pem
deleted file mode 100644
index 789fb50ae..000000000
--- a/usr/share/ssl/root/usertrust.pem
+++ /dev/null
@@ -1,50 +0,0 @@
------BEGIN CERTIFICATE-----
-MIICjzCCAhWgAwIBAgIQXIuZxVqUxdJxVt7NiYDMJjAKBggqhkjOPQQDAzCBiDEL
-MAkGA1UEBhMCVVMxEzARBgNVBAgTCk5ldyBKZXJzZXkxFDASBgNVBAcTC0plcnNl
-eSBDaXR5MR4wHAYDVQQKExVUaGUgVVNFUlRSVVNUIE5ldHdvcmsxLjAsBgNVBAMT
-JVVTRVJUcnVzdCBFQ0MgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkwHhcNMTAwMjAx
-MDAwMDAwWhcNMzgwMTE4MjM1OTU5WjCBiDELMAkGA1UEBhMCVVMxEzARBgNVBAgT
-Ck5ldyBKZXJzZXkxFDASBgNVBAcTC0plcnNleSBDaXR5MR4wHAYDVQQKExVUaGUg
-VVNFUlRSVVNUIE5ldHdvcmsxLjAsBgNVBAMTJVVTRVJUcnVzdCBFQ0MgQ2VydGlm
-aWNhdGlvbiBBdXRob3JpdHkwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAQarFRaqflo
-I+d61SRvU8Za2EurxtW20eZzca7dnNYMYf3boIkDuAUU7FfO7l0/4iGzzvfUinng
-o4N+LZfQYcTxmdwlkWOrfzCjtHDix6EznPO/LlxTsV+zfTJ/ijTjeXmjQjBAMB0G
-A1UdDgQWBBQ64QmG1M8ZwpZ2dEl23OA1xmNjmjAOBgNVHQ8BAf8EBAMCAQYwDwYD
-VR0TAQH/BAUwAwEB/zAKBggqhkjOPQQDAwNoADBlAjA2Z6EWCNzklwBBHU6+4WMB
-zzuqQhFkoJ2UOQIReVx7Hfpkue4WQrO/isIJxOzksU0CMQDpKmFHjFJKS04YcPbW
-RNZu9YO6bVi9JNlWSOrvxKJGgYhqOkbRqZtNyWHa0V1Xahg=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIF3jCCA8agAwIBAgIQAf1tMPyjylGoG7xkDjUDLTANBgkqhkiG9w0BAQwFADCB
-iDELMAkGA1UEBhMCVVMxEzARBgNVBAgTCk5ldyBKZXJzZXkxFDASBgNVBAcTC0pl
-cnNleSBDaXR5MR4wHAYDVQQKExVUaGUgVVNFUlRSVVNUIE5ldHdvcmsxLjAsBgNV
-BAMTJVVTRVJUcnVzdCBSU0EgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkwHhcNMTAw
-MjAxMDAwMDAwWhcNMzgwMTE4MjM1OTU5WjCBiDELMAkGA1UEBhMCVVMxEzARBgNV
-BAgTCk5ldyBKZXJzZXkxFDASBgNVBAcTC0plcnNleSBDaXR5MR4wHAYDVQQKExVU
-aGUgVVNFUlRSVVNUIE5ldHdvcmsxLjAsBgNVBAMTJVVTRVJUcnVzdCBSU0EgQ2Vy
-dGlmaWNhdGlvbiBBdXRob3JpdHkwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIK
-AoICAQCAEmUXNg7D2wiz0KxXDXbtzSfTTK1Qg2HiqiBNCS1kCdzOiZ/MPans9s/B
-3PHTsdZ7NygRK0faOca8Ohm0X6a9fZ2jY0K2dvKpOyuR+OJv0OwWIJAJPuLodMkY
-tJHUYmTbf6MG8YgYapAiPLz+E/CHFHv25B+O1ORRxhFnRghRy4YUVD+8M/5+bJz/
-Fp0YvVGONaanZshyZ9shZrHUm3gDwFA66Mzw3LyeTP6vBZY1H1dat//O+T23LLb2
-VN3I5xI6Ta5MirdcmrS3ID3KfyI0rn47aGYBROcBTkZTmzNg95S+UzeQc0PzMsNT
-79uq/nROacdrjGCT3sTHDN/hMq7MkztReJVni+49Vv4M0GkPGw/zJSZrM233bkf6
-c0Plfg6lZrEpfDKEY1WJxA3Bk1QwGROs0303p+tdOmw1XNtB1xLaqUkL39iAigmT
-Yo61Zs8liM2EuLE/pDkP2QKe6xJMlXzzawWpXhaDzLhn4ugTncxbgtNMs+1b/97l
-c6wjOy0AvzVVdAlJ2ElYGn+SNuZRkg7zJn0cTRe8yexDJtC/QV9AqURE9JnnV4ee
-UB9XVKg+/XRjL7FQZQnmWEIuQxpMtPAlR1n6BB6T1CZGSlCBst6+eLf8ZxXhyVeE
-Hg9j1uliutZfVS7qXMYoCAQlObgOK6nyTJccBz8NUvXt7y+CDwIDAQABo0IwQDAd
-BgNVHQ4EFgQUU3m/WqorSs9UgOHYm8Cd8rIDZsswDgYDVR0PAQH/BAQDAgEGMA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEMBQADggIBAFzUfA3P9wF9QZllDHPF
-Up/L+M+ZBn8b2kMVn54CVVeWFPFSPCeHlCjtHzoBN6J2/FNQwISbxmtOuowhT6KO
-VWKR82kV2LyI48SqC/3vqOlLVSoGIG1VeCkZ7l8wXEskEVX/JJpuXior7gtNn3/3
-ATiUFJVDBwn7YKnuHKsSjKCaXqeYalltiz8I+8jRRa8YFWSQEg9zKC7F4iRO/Fjs
-8PRF/iKz6y+O0tlFYQXBl2+odnKPi4w2r78NBc5xjeambx9spnFixdjQg3IM8WcR
-iQycE0xyNN+81XHfqnHd4blsjDwSXWXavVcStkNr/+XeTWYRUc+ZruwXtuhxkYze
-Sf7dNXGiFSeUHM9h4ya7b6NnJSFd5t0dCy5oGzuCr+yDZ4XUmFF0sbmZgIn/f3gZ
-XHlKYC6SQK5MNyosycdiyA5d9zZbyuAlJQG03RoHnHcAP9Dc1ew91Pq7P8yF1m9/
-qS3fuQL39ZeatTXaw2ewh0qpKJ4jjv9cJ2vhsE/zB+4ALtRZh8tSQZXq9EfX7mRB
-VXyNWQKV3WKdwrnuWih0hKWbt5DHDAff9Yk2dDLWKMGwsAvgnEzDHNb842m1R0aB
-L6KCq9NjRHDEjf8tM7qtj3u1cIiuPhnPQCjY/MiQu12ZIvVS5ljFH4gxQ+6IHdfG
-jjxDah2nGN59PRbxYvnKkKj9
------END CERTIFICATE-----
\ No newline at end of file